commit 118f1fc726524c7ce728cc1474b7679fa4168177
Author: maxiao1 <maxiao1@sugon.com>
Date:   Sat Sep 13 17:00:20 2025 +0800

    sglangv0.5.2 & support Qwen3-Next-80B-A3B-Instruct

diff --git a/.clang-format-ignore b/.clang-format-ignore
new file mode 100644
index 000000000..15c76cc45
--- /dev/null
+++ b/.clang-format-ignore
@@ -0,0 +1 @@
+sgl-kernel/3rdparty/tensorrt_llm/*
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 000000000..3c7b67cac
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,35 @@
+FROM lmsysorg/sglang:dev
+
+# Create non-root user with specified UID and GID
+# NOTE: Replace with your own UID and GID. This is a workaround from https://github.com/microsoft/vscode-remote-release/issues/49#issuecomment-489060908.
+ARG HOST_UID=1003
+ARG HOST_GID=1003
+RUN groupadd -g $HOST_GID devuser && \
+    useradd -m -u $HOST_UID -g $HOST_GID -s /bin/zsh devuser
+
+# Give devuser sudo access
+RUN apt-get update && apt-get install -y sudo && \
+    echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser && \
+    rm -rf /var/lib/apt/lists/* && \
+    apt-get clean
+
+# Set up oh-my-zsh for devuser
+RUN cp -r /root/.oh-my-zsh /home/devuser/.oh-my-zsh && \
+    cp /root/.zshrc /home/devuser/.zshrc && \
+    cp /root/.vimrc /home/devuser/.vimrc && \
+    cp /root/.tmux.conf /home/devuser/.tmux.conf && \
+    sed -i 's|/root/.oh-my-zsh|/home/devuser/.oh-my-zsh|g' /home/devuser/.zshrc && \
+    chown -R devuser:devuser /home/devuser/
+
+# Set workspace directory and ownership
+WORKDIR /sgl-workspace/sglang
+RUN chown -R devuser:devuser /sgl-workspace
+
+# Switch to devuser
+USER devuser
+
+# Install uv
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Install rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 000000000..338b10fe6
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,30 @@
+{
+    "name": "sglang",
+    "build": {
+        "dockerfile": "Dockerfile"
+    },
+    "remoteUser": "devuser",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                // Python development
+                "ms-python.python",
+                "charliermarsh.ruff",
+                // Rust development
+                "rust-lang.rust-analyzer",
+                "tamasfe.even-better-toml"
+            ]
+        }
+    },
+    "forwardPorts": [],
+    "runArgs": [
+        "--gpus",
+        "all"
+    ],
+    // The two lines below ensures that your local changes in the sglang
+    // repo is automatically synced to the sglang pip package installed
+    // in the dev docker container. You can remove / comment out these
+    // two lines if you prefer to sync code changes manually.
+    "workspaceMount": "source=${localWorkspaceFolder},target=/sgl-workspace/sglang,type=bind",
+    "workspaceFolder": "/sgl-workspace/sglang"
+}
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..030a7293d
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,25 @@
+# https://editorconfig.org/
+
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.{json,yaml,yml}]
+indent_size = 2
+
+[*.md]
+indent_size = 2
+x-soft-wrap-text = true
+
+[*.rst]
+indent_size = 4
+x-soft-wrap-text = true
+
+[Makefile]
+indent_style = tab
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 000000000..32435d6ed
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,21 @@
+.github @merrymercy @zhyncs
+/docker @zhyncs @HaiShaw @ByronHsu
+/python/pyproject.toml @merrymercy @zhyncs
+/python/sglang/* @merrymercy @Ying1123 @zhyncs @hnyls2002
+/python/sglang/srt/constrained @hnyls2002
+/python/sglang/srt/disaggregation @ByronHsu @hnyls2002
+/python/sglang/srt/disaggregation/mooncake @ShangmingCai
+/python/sglang/srt/distributed @yizhang2077 @merrymercy
+/python/sglang/srt/entrypoints @ispobock @CatherineSue @slin1237 @merrymercy
+/python/sglang/srt/eplb @fzyzcjy
+/python/sglang/srt/function_call @CatherineSue
+/python/sglang/srt/layers @merrymercy @Ying1123 @zhyncs @ispobock @HaiShaw @ch-wan @BBuf @kushanam @Edwardf0t1
+/python/sglang/srt/lora @Ying1123 @Fridge003 @lifuhuang
+/python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
+/python/sglang/srt/mem_cache @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
+/python/sglang/srt/model_executor @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock
+/python/sglang/srt/multimodal @mickqian @JustinTong0323
+/python/sglang/srt/speculative @Ying1123 @merrymercy @rkooo567 @kssteven418
+/sgl-kernel @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @merrymercy @FlamingoPg @HaiShaw
+/sgl-router @slin1237 @ByronHsu
+/test/srt/test_modelopt* @Edwardf0t1
diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml
new file mode 100644
index 000000000..5f6734867
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -0,0 +1,38 @@
+name: 🐞 Bug report
+description: Create a report to help us reproduce and fix the bug
+title: "[Bug] "
+labels: ['Bug']
+
+body:
+- type: checkboxes
+  attributes:
+    label: Checklist
+    options:
+    - label: 1. I have searched related issues but cannot get the expected help.
+    - label: 2. The bug has not been fixed in the latest version.
+    - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
+    - label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose Otherwise, it will be closed.
+    - label: 5. Please use English, otherwise it will be closed.
+- type: textarea
+  attributes:
+    label: Describe the bug
+    description: A clear and concise description of what the bug is.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Reproduction
+    description: |
+      What command or script did you run? Which **model** are you using?
+    placeholder: |
+      A placeholder for the command.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Environment
+    description: |
+      Please provide necessary environment information here with `python3 -m sglang.check_env`. Otherwise the issue will be closed.
+    placeholder: Environment here.
+  validations:
+    required: true
diff --git a/.github/ISSUE_TEMPLATE/2-feature-request.yml b/.github/ISSUE_TEMPLATE/2-feature-request.yml
new file mode 100644
index 000000000..31bc4a127
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml
@@ -0,0 +1,23 @@
+name: 🚀 Feature request
+description: Suggest an idea for this project
+title: "[Feature] "
+
+body:
+- type: checkboxes
+  attributes:
+    label: Checklist
+    options:
+    - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose Otherwise, it will be closed.
+    - label: 2. Please use English, otherwise it will be closed.
+- type: textarea
+  attributes:
+    label: Motivation
+    description: |
+      A clear and concise description of the motivation of the feature.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Related resources
+    description: |
+      If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
diff --git a/.github/REVIEWERS.md b/.github/REVIEWERS.md
new file mode 100644
index 000000000..ac9ce6102
--- /dev/null
+++ b/.github/REVIEWERS.md
@@ -0,0 +1,53 @@
+# Area Reviewer
+
+Here are some reviewers for common areas. You can ping them to review your code if you touch related parts.
+
+## Hardware platforms
+- general @Alcanderian
+- AMD GPU @HaiShaw
+- Blackwell GPU @kushanam @trevor-m @zhyncs
+- CPU @mingfeima
+
+## Kernel
+- general @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @HaiShaw
+- triton attention backend @ispobock
+- aiter attention backend @HaiShaw @kkHuang-amd @valarLip
+- flash attention backend @hebiao064
+- flashinfer attention backend @Fridge003
+- moe kernel @BBuf @fzyzcjy @ch-wan @Alcanderian
+
+## Scheduler and memory pool
+- general @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
+- constrained decoding @hnyls2002
+- hierarchical cache @xiezhq-hermann @DarkSharpness
+- lora @Fridge003 @Ying1123 @lifuhuang
+- speculative decoding @merrymercy @Ying1123 @kssteven418 @Qiaolin-Yu
+- sliding window attention @hanming-lu
+
+## Parallelism
+- expert parallelism @fzyzcjy @ch-wan
+- data parallelism attention @ch-wan
+- pipeline parallelism @Ying1123
+- tensor parallelism @merrymercy
+
+## PD disaggregation
+- general @ByronHsu @ShangmingCai @hnyls2002
+- Mooncake backend @ShangmingCai
+
+## Build and release
+- general @zhyncs @merrymercy
+
+## API Server
+- general @CatherineSue @slin1237 @ispobock
+- function calling and reasoning parsing @CatherineSue
+- OpenAI API @CatherineSue @slin1237
+
+## SGL-Router
+- general @slin1237 @ByronHsu
+
+## Model
+- multimodal models @mickqian @JustinTong0323
+- other new models @zhaochenyang20
+
+## Reinforcment learning
+- general @zhaochenyang20 @hebiao064 @fzyzcjy @zhuzilin
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 000000000..ab51d4bf5
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,24 @@
+<!-- Thank you for your contribution! Please follow these guidelines to enhance your pull request. If anything is unclear, submit your PR and reach out to maintainers for assistance. Join our Slack community at https://slack.sglang.ai to discuss further. -->
+
+## Motivation
+
+<!-- Describe the purpose and goals of this pull request. -->
+
+## Modifications
+
+<!-- Detail the changes made in this pull request. -->
+
+## Accuracy Tests
+
+<!-- If this pull request affects model outputs (e.g., changes to the kernel or model forward code), provide accuracy test results. -->
+
+## Benchmarking and Profiling
+
+<!-- If this pull request impacts inference speed, provide benchmarking and profiling results. -->
+
+## Checklist
+
+- [ ] Format your code according to the [Format code with pre-commit](https://docs.sglang.ai/developer_guide/contribution_guide.html#format-code-with-pre-commit).
+- [ ] Add unit tests according to the [Run and add unit tests](https://docs.sglang.ai/developer_guide/contribution_guide.html#run-and-add-unit-tests).
+- [ ] Update documentation according to [Write documentations](https://docs.sglang.ai/developer_guide/contribution_guide.html#write-documentations).
+- [ ] Provide accuracy and speed benchmark results according to [Test the accuracy](https://docs.sglang.ai/developer_guide/contribution_guide.html#test-the-accuracy) and [Benchmark the speed](https://docs.sglang.ai/developer_guide/contribution_guide.html#benchmark-the-speed).
diff --git a/.github/workflows/cancel-all-pending-pr-test-runs.yml b/.github/workflows/cancel-all-pending-pr-test-runs.yml
new file mode 100644
index 000000000..6217542e7
--- /dev/null
+++ b/.github/workflows/cancel-all-pending-pr-test-runs.yml
@@ -0,0 +1,45 @@
+name: Cancel All Pending PR Test Runs
+
+on:
+  workflow_dispatch:
+    inputs:
+      workflows:
+        description: 'Space-separated list of workflow filenames to cancel'
+        required: true
+        type: string
+        default: 'pr-test.yml pr-test-xeon.yml'
+
+permissions:
+  actions: write   # Needed to cancel runs
+  contents: read   # Needed to read repo info
+
+jobs:
+  cancel-pending:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install GitHub CLI
+        run: sudo apt-get install -y gh jq
+
+      - name: Cancel all pending/waiting runs for specified workflows
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+        run: |
+          # Read the space-separated string from the input into a bash array
+          WORKFLOW_FILES=(${{ github.event.inputs.workflows }})
+
+          echo "Targeting ${#WORKFLOW_FILES[@]} workflow(s): ${{ github.event.inputs.workflows }}"
+
+          for workflow_file in "${WORKFLOW_FILES[@]}"; do
+            echo "--- Checking workflow: $workflow_file ---"
+            gh run list \
+              --repo "$REPO" \
+              --workflow "$workflow_file" \
+              --json databaseId,status \
+              --limit 1000 \
+              | jq -r '.[] | select(.status=="queued" or .status=="in_progress") | .databaseId' \
+              | while read run_id; do
+                  echo "Cancelling run ID: $run_id for workflow: $workflow_file"
+                  gh run cancel "$run_id" --repo "$REPO"
+                done
+          done
diff --git a/.github/workflows/cancel-pr-workflow-on-merge.yml b/.github/workflows/cancel-pr-workflow-on-merge.yml
new file mode 100644
index 000000000..535884ba6
--- /dev/null
+++ b/.github/workflows/cancel-pr-workflow-on-merge.yml
@@ -0,0 +1,22 @@
+name: Cancel PR Workflows on Merge
+
+on:
+  pull_request_target:
+    types:
+      - closed
+
+permissions:
+  actions: write
+
+jobs:
+  cancel:
+    if: github.event.pull_request.merged == true
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cancel Previous Runs
+        uses: styfle/cancel-workflow-action@0.12.1
+        with:
+          workflow_id: all
+          access_token: ${{ secrets.GITHUB_TOKEN }}
+          ignore_sha: true
+          pr_number: ${{ github.event.pull_request.number }}
diff --git a/.github/workflows/close-inactive-issues.yml b/.github/workflows/close-inactive-issues.yml
new file mode 100644
index 000000000..048e6c443
--- /dev/null
+++ b/.github/workflows/close-inactive-issues.yml
@@ -0,0 +1,96 @@
+name: Close Inactive Issues
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+
+permissions:
+  issues: write
+  contents: read
+
+jobs:
+  close-inactive-issues:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check and close inactive issues
+        uses: actions/github-script@v6
+        with:
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
+
+            const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
+            console.log(`Owner: ${owner}, Repo: ${repo}`);
+
+            async function fetchIssues(page = 1) {
+              console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
+              return await github.rest.issues.listForRepo({
+                owner,
+                repo,
+                state: 'open',
+                sort: 'updated',
+                direction: 'asc',
+                per_page: 100,
+                page: page
+              });
+            }
+
+            async function processIssues() {
+              console.log('Starting to process issues');
+              console.log(`Repository: ${owner}/${repo}`);
+
+              let page = 1;
+              let hasMoreIssues = true;
+              while (hasMoreIssues) {
+                try {
+                  const issues = await fetchIssues(page);
+                  console.log(`Fetched ${issues.data.length} issues on page ${page}`);
+
+                  if (issues.data.length === 0) {
+                    hasMoreIssues = false;
+                    break;
+                  }
+
+                  for (const issue of issues.data) {
+                    // Skip if the issue has 'good first issue' label
+                    if (issue.labels.some(label => label.name === 'good first issue')) {
+                      console.log(`Skipping issue #${issue.number} as it's marked as 'good first issue'`);
+                      continue;
+                    }
+                    if (new Date(issue.updated_at) < sixtyDaysAgo) {
+                      try {
+                        await github.rest.issues.update({
+                          owner,
+                          repo,
+                          issue_number: issue.number,
+                          state: 'closed',
+                          labels: [...issue.labels.map(l => l.name), 'inactive']
+                        });
+                        await github.rest.issues.createComment({
+                          owner,
+                          repo,
+                          issue_number: issue.number,
+                          body: 'This issue has been automatically closed due to inactivity. Please feel free to reopen it if needed.'
+                        });
+                        console.log(`Closed issue #${issue.number} due to inactivity.`);
+                      } catch (error) {
+                        console.error(`Failed to close issue #${issue.number}: ${error.message}`);
+                      }
+                    } else {
+                      console.log(`Issue #${issue.number} is still active. Stopping processing.`);
+                      hasMoreIssues = false;
+                      break;
+                    }
+                  }
+                  page += 1;
+                } catch (error) {
+                  console.error(`Error fetching issues on page ${page}: ${error.message}`);
+                  hasMoreIssues = false;
+                }
+              }
+              console.log('Finished processing issues');
+            }
+
+            await processIssues();
diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
new file mode 100644
index 000000000..7298d80ec
--- /dev/null
+++ b/.github/workflows/execute-notebook.yml
@@ -0,0 +1,60 @@
+name: Execute Notebooks
+
+on:
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/sglang/**"
+      - "docs/**"
+  workflow_dispatch:
+
+
+concurrency:
+  group: execute-notebook-${{ github.ref }}
+  cancel-in-progress: true
+
+
+jobs:
+  run-all-notebooks:
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          pip install -r docs/requirements.txt
+          apt-get update && apt-get install -y pandoc parallel retry
+          ln -sf "$(which python3)" /usr/bin/python
+
+      - name: Setup Jupyter Kernel
+        run: |
+          python -m ipykernel install --user --name python3 --display-name "Python 3"
+
+      - name: Execute notebooks
+        timeout-minutes: 40
+        run: |
+          cd docs
+          make clean
+          make compile
+
+
+  notebook-finish:
+    needs: [
+      run-all-notebooks
+    ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/experiment-runner.yml b/.github/workflows/experiment-runner.yml
new file mode 100644
index 000000000..487ed9ba3
--- /dev/null
+++ b/.github/workflows/experiment-runner.yml
@@ -0,0 +1,30 @@
+name: Experiment Runner
+
+on:
+  workflow_dispatch:
+    inputs:
+      script:
+        description: "Experiment Runner Script"
+        default: "configs/sharegpt_config.yaml"
+
+concurrency:
+  group: experiment-runner-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  experiment-runner-1-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Test experiment runner
+        timeout-minutes: 120
+        run: |
+          cd test/srt
+          python3 experiment_runner.py --config ${{ inputs.script }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000..3a281299a
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,22 @@
+name: Lint
+
+on: [pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install pre-commit hook
+        run: |
+          python -m pip install pre-commit
+          pre-commit install
+
+      - name: Linting
+        run: pre-commit run --all-files --show-diff-on-failure
diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml
new file mode 100644
index 000000000..096e876de
--- /dev/null
+++ b/.github/workflows/nightly-test-amd.yml
@@ -0,0 +1,41 @@
+name: Nightly Test (AMD)
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+concurrency:
+  group: nightly-test-amd-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nightly-test:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    strategy:
+      matrix:
+        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2-nightly]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Nightly Test
+        run: |
+          bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/nightly-test.yml b/.github/workflows/nightly-test.yml
new file mode 100644
index 000000000..a32c1dbea
--- /dev/null
+++ b/.github/workflows/nightly-test.yml
@@ -0,0 +1,33 @@
+name: Nightly Test
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+concurrency:
+  group: nightly-test-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nightly-test:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 120
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite nightly --timeout-per-file 3600
diff --git a/.github/workflows/open-pr-copy-from-oss.yml b/.github/workflows/open-pr-copy-from-oss.yml
new file mode 100644
index 000000000..05af6ea44
--- /dev/null
+++ b/.github/workflows/open-pr-copy-from-oss.yml
@@ -0,0 +1,28 @@
+name: Open A PR to Copy Code From OSS
+
+on:
+  workflow_dispatch:
+  # schedule:
+  #   - cron: '0 10 * * *'
+
+permissions:
+  contents: write
+
+jobs:
+  copy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          ref: 'main'
+
+      - name: Install GitHub CLI (if not present)
+        run: |
+          bash scripts/code_sync/install_github_cli.sh
+
+      - name: Copy from OSS code
+        env:
+          GH_TOKEN: ${{ secrets.PAT_FOR_CODE_SYNC_FROM_LIANMIN }}
+        run: |
+          python3 scripts/code_sync/copy_from_oss.py
diff --git a/.github/workflows/open-pr-copy-to-oss.yml b/.github/workflows/open-pr-copy-to-oss.yml
new file mode 100644
index 000000000..b3bb6aae4
--- /dev/null
+++ b/.github/workflows/open-pr-copy-to-oss.yml
@@ -0,0 +1,31 @@
+name: Open A PR to Copy Diff To OSS
+
+on:
+  workflow_dispatch:
+    inputs:
+      commit_sha:
+        description: 'The commit SHA to copy. Defaults to LAST to copy the latest commit.'
+        required: false
+        default: 'LAST'
+
+permissions:
+  contents: write
+
+jobs:
+  copy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install GitHub CLI (if not present)
+        run: |
+          bash scripts/code_sync/install_github_cli.sh
+
+      - name: Copy to OSS code
+        env:
+          GH_TOKEN: ${{ secrets.PAT_FOR_CODE_SYNC_FROM_LIANMIN }}
+        run: |
+          python3 scripts/code_sync/copy_to_oss.py --commit ${{ github.event.inputs.commit_sha }}
diff --git a/.github/workflows/pr-benchmark-rust.yml b/.github/workflows/pr-benchmark-rust.yml
new file mode 100644
index 000000000..e039cba23
--- /dev/null
+++ b/.github/workflows/pr-benchmark-rust.yml
@@ -0,0 +1,306 @@
+name: PR Benchmark (Rust Router)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "sgl-router/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "sgl-router/**"
+    types: [opened, synchronize, reopened, labeled]
+  workflow_dispatch:
+
+concurrency:
+  group: pr-benchmark-rust-${{ github.ref }}
+  cancel-in-progress: true
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+jobs:
+  # Quick check job that always runs on PRs
+  benchmark-compile-check:
+    name: Benchmark Compilation Check
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Check benchmarks compile
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          cargo check --benches
+
+  # Full benchmark jobs that only run with label or on main branch
+  benchmark-request-processing:
+    name: Request Processing Benchmark
+    if: |
+      github.repository == 'sgl-project/sglang' &&
+      (github.event_name == 'push' ||
+       github.event_name == 'workflow_dispatch' ||
+       contains(github.event.pull_request.labels.*.name, 'benchmark'))
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          # Fetch enough history for baseline comparison
+          fetch-depth: 100
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Run request processing benchmark
+        timeout-minutes: 30
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          # Run only the summary benchmark for quick validation in PRs
+          cargo bench --bench request_processing -- benchmark_summary --exact
+
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: request-processing-results-${{ github.sha }}
+          path: |
+            sgl-router/target/criterion/benchmark_summary/
+          retention-days: 30
+
+  benchmark-tokenizer:
+    name: Tokenizer Benchmark
+    if: |
+      github.repository == 'sgl-project/sglang' &&
+      (github.event_name == 'push' ||
+       github.event_name == 'workflow_dispatch' ||
+       contains(github.event.pull_request.labels.*.name, 'benchmark'))
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 100
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Run tokenizer benchmark
+        timeout-minutes: 30
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          cargo bench --bench tokenizer_benchmark
+
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: tokenizer-results-${{ github.sha }}
+          path: |
+            sgl-router/target/criterion/tokenizer*/
+          retention-days: 30
+
+  benchmark-tool-parser:
+    name: Tool Parser Benchmark
+    if: |
+      github.repository == 'sgl-project/sglang' &&
+      (github.event_name == 'push' ||
+       github.event_name == 'workflow_dispatch' ||
+       contains(github.event.pull_request.labels.*.name, 'benchmark'))
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 100
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@v0.0.3
+        continue-on-error: true
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+          # Share cache across all benchmark jobs
+          shared-key: "rust-cache"
+          # Save cache even on failure
+          save-if: true
+
+      - name: Run tool parser benchmark
+        timeout-minutes: 30
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Try to use sccache, but disable if it fails
+          if command -v sccache &> /dev/null; then
+            echo "Testing sccache availability..."
+            # Try to start sccache and check if it works
+            export RUSTC_WRAPPER=sccache
+            export SCCACHE_GHA_ENABLED="true"
+            if sccache --start-server 2>/dev/null && sccache --show-stats 2>/dev/null; then
+              echo "sccache is working, using it for compilation"
+            else
+              echo "sccache failed to start, falling back to regular cargo"
+              unset RUSTC_WRAPPER
+              unset SCCACHE_GHA_ENABLED
+            fi
+          else
+            echo "sccache not available, using regular cargo"
+          fi
+          cargo bench --bench tool_parser_benchmark
+
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: tool-parser-results-${{ github.sha }}
+          path: |
+            sgl-router/target/criterion/tool_parser*/
+          retention-days: 30
+
+  benchmark-summary:
+    name: Benchmark Summary
+    needs: [benchmark-request-processing, benchmark-tokenizer, benchmark-tool-parser]
+    if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download all benchmark results
+        uses: actions/download-artifact@v4
+        with:
+          pattern: '*-results-${{ github.sha }}'
+          path: benchmark-results
+
+      - name: Generate summary
+        run: |
+          echo "## Benchmark Results Summary" > summary.md
+          echo "" >> summary.md
+          echo "### Request Processing" >> summary.md
+          if [ -d "benchmark-results/request-processing-results-${{ github.sha }}" ]; then
+            echo "✅ Completed" >> summary.md
+          else
+            echo "❌ Failed or skipped" >> summary.md
+          fi
+          echo "" >> summary.md
+          echo "### Tokenizer" >> summary.md
+          if [ -d "benchmark-results/tokenizer-results-${{ github.sha }}" ]; then
+            echo "✅ Completed" >> summary.md
+          else
+            echo "❌ Failed or skipped" >> summary.md
+          fi
+          echo "" >> summary.md
+          echo "### Tool Parser" >> summary.md
+          if [ -d "benchmark-results/tool-parser-results-${{ github.sha }}" ]; then
+            echo "✅ Completed" >> summary.md
+          else
+            echo "❌ Failed or skipped" >> summary.md
+          fi
+          cat summary.md
+
+      - name: Upload summary
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-summary-${{ github.sha }}
+          path: summary.md
+          retention-days: 30
diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
new file mode 100644
index 000000000..2c7e2c652
--- /dev/null
+++ b/.github/workflows/pr-test-amd.yml
@@ -0,0 +1,377 @@
+name: PR Test (AMD)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - "sgl-kernel/**"
+      - ".github/workflows/pr-test-amd.yml"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - "sgl-kernel/**"
+      - ".github/workflows/pr-test-amd.yml"
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-amd-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  accuracy-test-1-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Evaluate Accuracy
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
+          bash scripts/ci/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
+          bash scripts/ci/amd_ci_exec.sh python3 models/test_qwen_models.py
+
+  accuracy-test-2-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2, linux-mi35x-gpu-2]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Evaluate accuracy (TP=2)
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
+
+  mla-test-1-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: MLA TEST
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 test_mla.py
+
+  performance-test-1-gpu-part-1-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Benchmark single latency
+        timeout-minutes: 20
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
+
+      - name: Benchmark online latency
+        timeout-minutes: 15
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
+
+      - name: Benchmark offline throughput
+        timeout-minutes: 15
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
+
+      - name: Benchmark offline throughput (Non-streaming, small batch size)
+        timeout-minutes: 15
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
+
+  performance-test-1-gpu-part-2-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Benchmark offline throughput (w/o RadixAttention)
+        timeout-minutes: 15
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
+
+      - name: Benchmark offline throughput (w/ Triton)
+        timeout-minutes: 15
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+
+      - name: Benchmark offline throughput (w/ FP8)
+        timeout-minutes: 15
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
+
+  bench-test-2-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Benchmark dummy grok (TP=2)
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
+
+      - name: Benchmark single latency (TP=2)
+        timeout-minutes: 25
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+
+      - name: Benchmark single latency + torch.compile (TP=2)
+        timeout-minutes: 25
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
+
+      - name: Benchmark offline throughput (TP=2)
+        timeout-minutes: 25
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
+
+      - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
+        timeout-minutes: 25
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
+
+  unit-test-backend-1-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+        part: [0, 1, 2, 3, 4, 5, 6, 7]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 50
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
+
+  unit-test-backend-1-gpu-amd-mi35x:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi35x-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 50
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd-mi35x
+
+  unit-test-backend-2-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 40
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
+
+  unit-test-backend-8-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-8]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
+
+  unit-test-sgl-kernel-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 14
+        run: |
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py
+
+  pr-test-amd-finish:
+    if: always()
+    needs: [
+      accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
+      accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd,
+      unit-test-backend-1-gpu-amd, unit-test-backend-1-gpu-amd-mi35x, unit-test-backend-2-gpu-amd,
+      unit-test-backend-8-gpu-amd, unit-test-sgl-kernel-amd
+    ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/pr-test-h20.yml b/.github/workflows/pr-test-h20.yml
new file mode 100644
index 000000000..58e335289
--- /dev/null
+++ b/.github/workflows/pr-test-h20.yml
@@ -0,0 +1,81 @@
+name: PR Test (H20)
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+    inputs:
+      version:
+        required: true
+        type: choice
+        default: 'release'
+        options:
+          - 'release'
+          - 'nightly'
+
+concurrency:
+  group: pr-test-h20-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      src: ${{ steps.filter.outputs.src }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Detect file changes
+        id: filter
+        uses: dorny/paths-filter@v3
+        with:
+          filters: |
+            src:
+              - "python/sglang/srt/models/deepseek*"
+              - "python/sglang/srt/layers/moe/**"
+              - ".github/workflows/pr-test-h20.yml"
+              - "python/pyproject.toml"
+
+  per-commit-8-gpu-h20:
+    needs: [check-changes]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 8-gpu-h20
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 20
+
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu-h20
+
+  pr-test-h20-finish:
+    needs: [
+      check-changes,
+      per-commit-8-gpu-h20,
+    ]
+    if: needs.check-changes.outputs.src == 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml
new file mode 100644
index 000000000..c0fe381e3
--- /dev/null
+++ b/.github/workflows/pr-test-npu.yml
@@ -0,0 +1,184 @@
+name: PR Test (Ascend NPU)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - ".github/workflows/pr-test-npu.yml"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - ".github/workflows/pr-test-npu.yml"
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-npu-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  per-commit-1-ascend-npu:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: linux-arm64-npu-1
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.trusted-host ${CACHING_URL}
+
+          bash scripts/ci/npu_ci_install_dependency.sh
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # copy download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Run test
+        timeout-minutes: 60
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-1-ascend-npu
+
+  per-commit-2-ascend-npu:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: linux-arm64-npu-2
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.trusted-host ${CACHING_URL}
+
+          bash scripts/ci/npu_ci_install_dependency.sh
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # copy download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Run test
+        timeout-minutes: 90
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-2-ascend-npu
+
+  per-commit-4-ascend-npu:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: linux-arm64-npu-4
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.trusted-host ${CACHING_URL}
+
+          bash scripts/ci/npu_ci_install_dependency.sh
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # copy download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Run test
+        timeout-minutes: 120
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-4-ascend-npu --timeout-per-file 3600
+
+  per-commit-16-ascend-a3:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: linux-aarch64-a3-16
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.trusted-host ${CACHING_URL}
+
+          bash scripts/ci/npu_ci_install_dependency.sh
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # copy download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Run test
+        timeout-minutes: 90
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-16-ascend-a3 --timeout-per-file 5400
+
+  pr-test-npu-finish:
+    if: always()
+    needs:
+      - per-commit-1-ascend-npu
+      - per-commit-2-ascend-npu
+      - per-commit-4-ascend-npu
+      - per-commit-16-ascend-a3
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/pr-test-pd-router.yml b/.github/workflows/pr-test-pd-router.yml
new file mode 100644
index 000000000..2a1bde1b4
--- /dev/null
+++ b/.github/workflows/pr-test-pd-router.yml
@@ -0,0 +1,599 @@
+name: PR Test (PD Router)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'python/sglang/srt/disaggregation/**'
+      - 'scripts/ci/ci_start_disaggregation_servers.sh'
+      - 'sgl-router/**'
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'python/sglang/srt/disaggregation/**'
+      - 'scripts/ci/ci_start_disaggregation_servers.sh'
+      - 'sgl-router/**'
+  workflow_dispatch:
+
+concurrency:
+  group: test-disaggregation-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: write
+  issues: write
+
+jobs:
+  test-disaggregation:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: [h200]
+    timeout-minutes: 45
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 10
+
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.12'
+
+    - name: Setup Rust
+      run: |
+        bash scripts/ci/ci_install_rust.sh
+
+    - name: Cache Rust dependencies
+      uses: actions/cache@v4
+      with:
+        path: |
+          ~/.cargo/bin/
+          ~/.cargo/registry/index/
+          ~/.cargo/registry/cache/
+          ~/.cargo/git/db/
+          sgl-router/target/
+        key: ${{ runner.os }}-cargo-${{ hashFiles('sgl-router/Cargo.lock') }}
+        restore-keys: |
+          ${{ runner.os }}-cargo-
+
+    - name: Cache pip dependencies
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-pip-${{ hashFiles('python/pyproject.toml') }}
+        restore-keys: |
+          ${{ runner.os }}-pip-
+
+    - name: Validate environment
+      run: |
+        echo "=== System Validation ==="
+        nvidia-smi
+        echo "GPU count: $(nvidia-smi -L | wc -l)"
+        if [ $(nvidia-smi -L | wc -l) -lt 8 ]; then
+          echo "Error: This test requires at least 8 GPUs"
+          exit 1
+        fi
+
+        echo "=== GPU Process Check ==="
+        # Fail fast if any GPU compute processes are active
+        if command -v nvidia-smi >/dev/null 2>&1; then
+          # Try to query compute apps first (preferred and concise)
+          gpu_procs=$(nvidia-smi --query-compute-apps=pid,process_name,gpu_uuid --format=csv,noheader 2>/dev/null | sed '/^$/d' || true)
+
+          # Fallback to detailed PIDS report if the query returns nothing but there might still be processes
+          if [ -z "$gpu_procs" ]; then
+            gpu_procs=$(nvidia-smi -q -d PIDS 2>/dev/null | awk '/Processes/{flag=1;next}/^$/{flag=0}flag' | sed '/^\s*Processes:/d' | sed '/^\s*$/d' || true)
+          fi
+
+          if [ -n "$gpu_procs" ]; then
+            echo "Error: Found active GPU processes using the device(s):"
+            echo "$gpu_procs"
+            exit 1
+          else
+            echo "No active GPU compute processes detected."
+          fi
+        else
+          echo "Error: nvidia-smi not found; skipping GPU process check."
+          exit 1
+        fi
+
+        echo "=== RDMA Validation ==="
+        if ! command -v ibv_devices >/dev/null 2>&1; then
+          echo "Error: InfiniBand tools not found"
+          exit 1
+        fi
+
+        # Check for active IB devices
+        found_active_device=false
+        for device in mlx5_{0..11}; do
+            if ibv_devinfo $device >/dev/null 2>&1; then
+                state=$(ibv_devinfo $device | grep "state:" | head -1 | awk '{print $2}')
+                if [[ "$state" == "PORT_ACTIVE" ]]; then
+                    echo "✓ Found active device: $device"
+                    found_active_device=true
+                    break
+                fi
+            fi
+        done
+
+        if [ "$found_active_device" = false ]; then
+          echo "Error: No active IB devices found"
+          echo "Available devices:"
+          ibv_devices || true
+          exit 1
+        fi
+
+        echo "=== Model Validation ==="
+        if [ ! -d "/raid/models/meta-llama/Llama-3.1-8B-Instruct" ]; then
+          echo "Error: Model not found"
+          ls -la /raid/models/ || echo "No models directory"
+          exit 1
+        fi
+        echo "✓ Model found"
+
+    - name: Install SGLang dependencies
+      run: |
+        echo "Installing SGLang with all extras..."
+        python3 -m pip --no-cache-dir install --upgrade pip
+        python3 -m pip --no-cache-dir install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+        python3 -m pip --no-cache-dir install -e "python[all]" --break-system-packages
+        python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.5
+        python3 -m pip --no-cache-dir install --user --force-reinstall genai-bench==0.0.2
+        python3 -m pip --no-cache-dir install sgl-kernel==0.3.9.post2
+
+    - name: Build and install sgl-router
+      run: |
+        source "$HOME/.cargo/env"
+        echo "Building sgl-router..."
+        cd sgl-router
+        cargo build && python3 -m build && pip install --force-reinstall dist/*.whl
+
+    - name: Start disaggregation servers
+      id: start_servers
+      run: |
+        echo "Starting disaggregation servers..."
+        bash scripts/ci/ci_start_disaggregation_servers.sh &
+        SERVER_PID=$!
+        echo "server_pid=$SERVER_PID" >> $GITHUB_OUTPUT
+
+        # Wait for all 8 servers to be healthy (script already does this)
+        wait_count=0
+        while [ $wait_count -lt 30 ]; do
+          if ps -p $SERVER_PID > /dev/null; then
+            # Check if the startup script printed success message
+            sleep 2
+            wait_count=$((wait_count + 1))
+          else
+            # Script exited - check if it was successful
+            wait $SERVER_PID
+            exit_code=$?
+            if [ $exit_code -eq 0 ]; then
+              echo "✓ All disaggregation servers are healthy"
+              break
+            else
+              echo "Error: Server startup failed with code $exit_code"
+              exit 1
+            fi
+          fi
+        done
+
+        echo "✓ Servers started (PID: $SERVER_PID)"
+
+    - name: Test all policies sequentially
+      timeout-minutes: 30
+      run: |
+        POLICIES=("random" "round_robin" "cache_aware" "power_of_two")
+        BASE_URL="http://127.0.0.9:8000"
+
+        # Free commonly used ports for router and metrics
+        echo "Freeing ports 29000 (metrics) and 8000 (API), if in use..."
+        fuser -k -n tcp 29000 2>/dev/null || true
+        fuser -k -n tcp 8000 2>/dev/null || true
+        sleep 1
+
+        for policy in "${POLICIES[@]}"; do
+          echo ""
+          echo "=================================================="
+          echo "Testing policy: $policy"
+          echo "=================================================="
+
+          # Free ports before starting router
+          fuser -k -n tcp 29000 2>/dev/null || true
+          fuser -k -n tcp 8000 2>/dev/null || true
+
+          # Start router with the current policy
+          echo "Starting router with policy: $policy..."
+          RUST_BACKTRACE=1 python3 -m sglang_router.launch_router \
+            --pd-disaggregation \
+            --policy "$policy" \
+            --prefill http://127.0.0.1:30001 9001 \
+            --prefill http://127.0.0.2:30002 9002 \
+            --prefill http://127.0.0.3:30003 9003 \
+            --prefill http://127.0.0.4:30004 9004 \
+            --decode http://127.0.0.5:30005 \
+            --decode http://127.0.0.6:30006 \
+            --decode http://127.0.0.7:30007 \
+            --decode http://127.0.0.8:30008 \
+            --host 127.0.0.9 \
+            --port 8000 &
+          ROUTER_PID=$!
+
+          # Wait for router to become healthy
+          echo "Waiting for router to become healthy..."
+          TIMEOUT=60
+          ELAPSED=0
+          while [ $ELAPSED -lt $TIMEOUT ]; do
+            if curl --connect-timeout 5 --silent http://127.0.0.9:8000 > /dev/null 2>&1; then
+              echo "✓ Router is reachable"
+              break
+            fi
+            if ! ps -p $ROUTER_PID > /dev/null; then
+              echo "Error: Router process died"
+              exit 1
+            fi
+            sleep 5
+            ELAPSED=$((ELAPSED + 5))
+          done
+
+          if [ $ELAPSED -ge $TIMEOUT ]; then
+            echo "Error: Router health check timeout"
+            kill $ROUTER_PID 2>/dev/null || true
+            exit 1
+          fi
+
+          # Test API functionality
+          echo "Testing API completions for $policy..."
+          response=$(curl -s -X POST "$BASE_URL/v1/chat/completions" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer test-token" \
+            -d '{
+              "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
+              "messages": [
+                {"role": "user", "content": "Write a Python function to calculate fibonacci numbers recursively"}
+              ],
+              "stream": false,
+              "max_tokens": 100
+            }')
+
+          if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
+            echo "✓ API test passed for $policy"
+          else
+            echo "✗ API test failed for $policy: $response"
+            kill $ROUTER_PID 2>/dev/null || true
+            exit 1
+          fi
+
+          # Test streaming
+          echo "Testing streaming API for $policy..."
+          stream_response=$(timeout 30 curl -s -X POST "$BASE_URL/v1/chat/completions" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer test-token" \
+            -d '{
+              "model": "/raid/models/meta-llama/Llama-3.1-8B-Instruct",
+              "messages": [
+                {"role": "user", "content": "Count from 1 to 5"}
+              ],
+              "stream": true,
+              "max_tokens": 50
+            }')
+
+          if echo "$stream_response" | grep -q "data:"; then
+            echo "✓ Streaming API test passed for $policy"
+          else
+            echo "✗ Streaming API test failed for $policy"
+            kill $ROUTER_PID 2>/dev/null || true
+            exit 1
+          fi
+
+          # Run genai-bench benchmark
+          echo "Running genai-bench for $policy..."
+          genai-bench benchmark \
+            --api-backend openai \
+            --api-base "http://127.0.0.9:8000" \
+            --api-key "dummy-token" \
+            --api-model-name "/raid/models/meta-llama/Llama-3.1-8B-Instruct" \
+            --model-tokenizer /raid/models/meta-llama/Llama-3.1-8B-Instruct \
+            --task text-to-text \
+            --num-concurrency 64 \
+            --traffic-scenario "D(8000,2000)" \
+            --max-requests-per-run 640 \
+            --max-time-per-run 2 \
+            --experiment-folder-name "benchmark_${policy}" \
+            --experiment-base-dir "."
+
+          # Find the actual experiment folder
+          actual_folder=$(find . -maxdepth 1 -name "benchmark_${policy}" -type d | head -1)
+
+          if [ -n "$actual_folder" ]; then
+            # Extract metrics from the Excel summary or JSON files
+            summary_file="$actual_folder"/*_summary.xlsx
+            json_files=$(find "$actual_folder" -name "*.json" | grep -v experiment_metadata)
+
+            echo "Genai-bench results saved in: $actual_folder"
+
+            # Extract mean values and validate performance thresholds
+            echo "📊 Extracting performance metrics for $policy..."
+
+            # Find JSON files excluding experiment metadata
+            json_files=$(find "$actual_folder" -name "*.json" | grep -v experiment_metadata)
+
+            if [ -n "$json_files" ]; then
+              # Extract metrics using jq and validate against loose thresholds
+              for json_file in $json_files; do
+                echo "Processing: $(basename "$json_file")"
+
+                                # Extract mean values for performance validation
+                ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
+                e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
+                input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
+                output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")
+
+                echo "  TTFT mean: ${ttft_mean}s"
+                echo "  E2E Latency mean: ${e2e_latency_mean}s"
+                echo "  Input Throughput mean: ${input_throughput_mean} tokens/s"
+                echo "  Output Throughput mean: ${output_throughput_mean} tokens/s"
+
+                # Set mean thresholds (allowing for reasonable variance)
+                # These can be adjusted based on your performance requirements
+                ttft_threshold=4.7          # Max 4.7 seconds for mean TTFT
+                e2e_latency_threshold=35.0   # Max 35.0 seconds for mean E2E latency
+                input_throughput_threshold=12000   # Min 12000 tokens/s for mean input throughput
+                output_throughput_threshold=68    # Min 68 tokens/s for mean output throughput
+
+
+                # Validate mean thresholds
+                validation_passed=true
+
+                if (( $(echo "$ttft_mean > $ttft_threshold" | bc -l) )); then
+                  echo "❌ TTFT validation failed: $ttft_mean > $ttft_threshold"
+                  validation_passed=false
+                fi
+
+                if (( $(echo "$e2e_latency_mean > $e2e_latency_threshold" | bc -l) )); then
+                  echo "❌ E2E Latency validation failed: $e2e_latency_mean > $e2e_latency_threshold"
+                  validation_passed=false
+                fi
+
+                if (( $(echo "$input_throughput_mean < $input_throughput_threshold" | bc -l) )); then
+                  echo "❌ Input Throughput validation failed: $input_throughput_mean < $input_throughput_threshold"
+                  validation_passed=false
+                fi
+
+                if (( $(echo "$output_throughput_mean < $output_throughput_threshold" | bc -l) )); then
+                  echo "❌ Output Throughput validation failed: $output_throughput_mean < $output_throughput_threshold"
+                  validation_passed=false
+                fi
+
+                if [ "$validation_passed" = true ]; then
+                  echo "✅ Performance validation passed for $policy"
+                else
+                  echo "❌ Performance validation failed for $policy"
+                  kill $ROUTER_PID 2>/dev/null || true
+                  exit 1
+                fi
+              done
+
+              echo "✓ Genai-bench completed successfully for $policy"
+              echo "📊 Detailed metrics and plots available in: $actual_folder"
+            else
+              echo "✗ Benchmark failed for $policy: No JSON results found"
+              kill $ROUTER_PID 2>/dev/null || true
+              exit 1
+            fi
+          else
+            echo "✗ Benchmark failed for $policy: Experiment folder not found"
+            kill $ROUTER_PID 2>/dev/null || true
+            exit 1
+          fi
+
+          # Stop router before testing next policy
+          echo "Stopping router for $policy..."
+          # First try graceful shutdown
+          kill $ROUTER_PID 2>/dev/null || true
+
+          # Wait up to 5 seconds for graceful shutdown
+          for i in {1..5}; do
+            if ! ps -p $ROUTER_PID > /dev/null 2>&1; then
+              echo "Router stopped gracefully"
+              break
+            fi
+            sleep 1
+          done
+
+          # Force kill if still running
+          if ps -p $ROUTER_PID > /dev/null 2>&1; then
+            echo "Force killing router..."
+            kill -9 $ROUTER_PID 2>/dev/null || true
+          fi
+
+          # Short delay to ensure port is released
+          sleep 2
+
+          echo "✓ Completed testing for $policy"
+        done
+
+        echo ""
+        echo "✅ All policies tested successfully!"
+
+
+    - name: Upload benchmark results
+      if: success()
+      uses: actions/upload-artifact@v4
+      with:
+        name: genai-bench-results-all-policies
+        path: benchmark_**/
+
+    - name: Cleanup servers
+      if: always()
+      run: |
+        if [ -n "${{ steps.start_servers.outputs.server_pid }}" ]; then
+          pkill -P ${{ steps.start_servers.outputs.server_pid }} || true
+          kill ${{ steps.start_servers.outputs.server_pid }} || true
+        fi
+        pkill -f "sglang.launch_server" || true
+        sleep 5
+        remaining=$(ps aux | grep -c "sglang.launch_server" || echo "0")
+        echo "Cleanup completed. Remaining processes: $remaining"
+
+  summarize-benchmarks:
+    needs: test-disaggregation
+    runs-on: ubuntu-latest
+    if: success()
+
+    steps:
+    - name: Install jq
+      run: sudo apt-get update && sudo apt-get install -y jq bc
+
+    - name: Download benchmark results
+      uses: actions/download-artifact@v4
+      with:
+        name: genai-bench-results-all-policies
+
+    - name: List downloaded contents
+      run: |
+        echo "Contents after download:"
+        ls -la
+        find . -name "benchmark_*" -type d
+        echo "JSON files found:"
+        find . -name "*.json" | head -10
+
+    - name: Create benchmark summary
+      run: |
+        echo "=== DEBUG: Creating benchmark summary ==="
+        echo "Available benchmark directories:"
+        find . -name "benchmark_*" -type d
+        echo "=========================================="
+
+        echo "## PD Router Genai-Bench Results Summary" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "🚀 **Benchmarked with genai-bench for comprehensive LLM serving performance evaluation**" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "| Policy | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> $GITHUB_STEP_SUMMARY
+        echo "|--------|--------|----------|-----------------|--------------------------|---------------------------|" >> $GITHUB_STEP_SUMMARY
+
+        # First, complete the table with all policies
+        for policy in random round_robin cache_aware power_of_two; do
+          # Find genai-bench result folders for this policy (handle zip extraction structure)
+          result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1)
+          if [ -z "$result_folder" ]; then
+            # Try alternative patterns in case of different extraction structure
+            result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1)
+          fi
+
+          echo "DEBUG: Policy ${policy} -> Found folder: ${result_folder:-'NOT FOUND'}"
+
+          if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
+            # Find JSON file with metrics
+            json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
+
+            if [ -n "$json_file" ] && [ -f "$json_file" ]; then
+              # Extract performance metrics
+              ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+              e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+              input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+              output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+
+              # Format numbers for display (2 decimal places)
+              if [ "$ttft_mean" != "N/A" ] && [ "$ttft_mean" != "null" ]; then
+                ttft_display=$(printf "%.2f" "$ttft_mean" 2>/dev/null || echo "$ttft_mean")
+              else
+                ttft_display="N/A"
+              fi
+
+              if [ "$e2e_latency_mean" != "N/A" ] && [ "$e2e_latency_mean" != "null" ]; then
+                e2e_display=$(printf "%.2f" "$e2e_latency_mean" 2>/dev/null || echo "$e2e_latency_mean")
+              else
+                e2e_display="N/A"
+              fi
+
+              if [ "$input_throughput_mean" != "N/A" ] && [ "$input_throughput_mean" != "null" ]; then
+                input_display=$(printf "%.0f" "$input_throughput_mean" 2>/dev/null || echo "$input_throughput_mean")
+              else
+                input_display="N/A"
+              fi
+
+              if [ "$output_throughput_mean" != "N/A" ] && [ "$output_throughput_mean" != "null" ]; then
+                output_display=$(printf "%.0f" "$output_throughput_mean" 2>/dev/null || echo "$output_throughput_mean")
+              else
+                output_display="N/A"
+              fi
+
+              echo "| ${policy} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> $GITHUB_STEP_SUMMARY
+            else
+              echo "| ${policy} | ❌ No Data | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
+            fi
+          else
+            echo "| ${policy} | ❌ Failed | N/A | N/A | N/A | N/A |" >> $GITHUB_STEP_SUMMARY
+          fi
+        done
+
+        # Add performance validation summary
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "## 📊 Performance Validation" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "**Thresholds:** TTFT ≤ 2.0s | E2E Latency ≤ 8.0s | Input Throughput ≥ 10,000 tok/s | Output Throughput ≥ 100 tok/s" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+
+        validation_summary=""
+        for policy in random round_robin cache_aware power_of_two; do
+          # Use same robust path finding as above
+          result_folder=$(find . -maxdepth 2 -name "benchmark_${policy}" -type d | head -1)
+          if [ -z "$result_folder" ]; then
+            result_folder=$(find . -maxdepth 3 -path "*benchmark_${policy}*" -type d | head -1)
+          fi
+
+          if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
+            json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
+            if [ -n "$json_file" ] && [ -f "$json_file" ]; then
+              # Extract metrics for validation
+              ttft=$(jq -r '.aggregated_metrics.stats.ttft.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+              e2e_latency=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+              input_throughput=$(jq -r '.aggregated_metrics.stats.input_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+              output_throughput=$(jq -r '.aggregated_metrics.stats.output_throughput.mean // "N/A"' "$json_file" 2>/dev/null || echo "N/A")
+
+              # Check thresholds (using same values as in main workflow)
+              validation_status="✅"
+              if [ "$ttft" != "N/A" ] && [ "$ttft" != "null" ]; then
+                if (( $(echo "$ttft > 2.0" | bc -l 2>/dev/null || echo "0") )); then
+                  validation_status="❌"
+                fi
+              fi
+              if [ "$e2e_latency" != "N/A" ] && [ "$e2e_latency" != "null" ]; then
+                if (( $(echo "$e2e_latency > 24.0" | bc -l 2>/dev/null || echo "0") )); then
+                  validation_status="❌"
+                fi
+              fi
+              if [ "$input_throughput" != "N/A" ] && [ "$input_throughput" != "null" ]; then
+                if (( $(echo "$input_throughput < 10000" | bc -l 2>/dev/null || echo "0") )); then
+                  validation_status="❌"
+                fi
+              fi
+              if [ "$output_throughput" != "N/A" ] && [ "$output_throughput" != "null" ]; then
+                if (( $(echo "$output_throughput < 90" | bc -l 2>/dev/null || echo "0") )); then
+                  validation_status="❌"
+                fi
+              fi
+
+              validation_summary="${validation_summary}- **${policy}**: $validation_status\n"
+            else
+              validation_summary="${validation_summary}- **${policy}**: ❌ No data\n"
+            fi
+          else
+            validation_summary="${validation_summary}- **${policy}**: ❌ Failed\n"
+          fi
+        done
+
+        echo -e "$validation_summary" >> $GITHUB_STEP_SUMMARY
+
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "## 📊 Genai-Bench Features Used" >> $GITHUB_STEP_SUMMARY
+        echo "- **Token-level Performance**: TTFT, TPOT, End-to-End latency" >> $GITHUB_STEP_SUMMARY
+        echo "- **Throughput Analysis**: Input/Output/Total token throughput" >> $GITHUB_STEP_SUMMARY
+        echo "- **Statistical Analysis**: Percentiles, mean, std dev for all metrics" >> $GITHUB_STEP_SUMMARY
+        echo "- **Visual Reports**: Automated plots and Excel summaries" >> $GITHUB_STEP_SUMMARY
+        echo "- **SGLang Backend**: Native integration with SGLang serving" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "✅ All policies tested successfully with genai-bench!" >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml
new file mode 100644
index 000000000..60c7ebdf2
--- /dev/null
+++ b/.github/workflows/pr-test-rust.yml
@@ -0,0 +1,190 @@
+name: PR Test (Rust)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "sgl-router/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "sgl-router/**"
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-rust-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-test-rust:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Rust cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          workspaces: sgl-router
+
+      - name: Run lint
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          cargo clippy --all-targets --all-features -- -D warnings
+
+      - name: Run fmt
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          cargo fmt -- --check
+
+      - name: Run Rust tests
+        timeout-minutes: 20
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          cargo test
+
+      - name: Check benchmark compilation
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          cargo check --benches
+
+      - name: Quick benchmark sanity check
+        timeout-minutes: 15
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router/
+          # Run quick benchmarks to ensure they work using Python script
+          python3 scripts/run_benchmarks.py --quick
+
+  pytest-rust:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: BM.A10.4
+    timeout-minutes: 25
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install rust dependencies
+        run: |
+          bash scripts/ci/ci_install_rust.sh
+
+      - name: Install SGLang dependencies
+        run: |
+          sudo bash scripts/ci/ci_install_dependency.sh
+
+      - name: Build python binding
+        run: |
+          source "$HOME/.cargo/env"
+          cd sgl-router
+          pip install setuptools-rust wheel build
+          python3 -m build
+          pip install --force-reinstall dist/*.whl
+
+
+      - name: Run Python unit tests
+        run: |
+          cd sgl-router
+          source "$HOME/.cargo/env"
+          pip install pytest pytest-cov pytest-xdist
+          pytest -q py_test/unit --cov=sglang_router --cov-report=term-missing --cov-fail-under=80
+
+      - name: Run Python integration tests
+        run: |
+          cd sgl-router
+          source "$HOME/.cargo/env"
+          # Integration tests use FastAPI/uvicorn for mock workers
+          pip install fastapi uvicorn orjson
+          pytest -q -m integration
+
+      - name: Run Python E2E tests
+        run: |
+          bash scripts/killall_sglang.sh "nuk_gpus"
+          cd sgl-router
+          python3 -m pip --no-cache-dir install --upgrade --ignore-installed blinker
+          python3 -m pip --no-cache-dir install --upgrade --break-system-packages genai-bench==0.0.2
+          pytest -m e2e -s  -vv -o log_cli=true --log-cli-level=INFO
+
+      - name: Upload benchmark results
+        if: success()
+        uses: actions/upload-artifact@v4
+        with:
+          name: genai-bench-results-all-policies
+          path: sgl-router/benchmark_**/
+
+  finish:
+    needs: [unit-test-rust, pytest-rust]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
+
+  summarize-benchmarks:
+    needs: pytest-rust
+    runs-on: ubuntu-latest
+    if: success()
+
+    steps:
+    - name: Install jq
+      run: sudo apt-get update && sudo apt-get install -y jq bc
+
+    - name: Download benchmark results
+      uses: actions/download-artifact@v4
+      with:
+        name: genai-bench-results-all-policies
+
+    - name: List downloaded contents
+      run: |
+        echo "Contents after download:"
+        ls -la
+        find . -name "benchmark_*" -type d
+        echo "JSON files found:"
+        find . -name "*.json" | head -10
+
+    - name: Create benchmark summary
+      run: |
+        echo "=== DEBUG: Creating benchmark summary ==="
+        echo "Available benchmark directories:"
+        find . -name "benchmark_*" -type d || true
+        echo "=========================================="
+
+        echo "## Router E2E Genai-Bench Results Summary" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "Results captured from E2E tests for two scenarios: regular router (2 workers, dp=2) and PD router (2 prefill + 2 decode)." >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "| Scenario | Status | TTFT (s) | E2E Latency (s) | Input Throughput (tok/s) | Output Throughput (tok/s) |" >> $GITHUB_STEP_SUMMARY
+        echo "|----------|--------|----------|-----------------|--------------------------|---------------------------|" >> $GITHUB_STEP_SUMMARY
+
+        scenarios=$'Regular (dp=2, round_robin)|benchmark_round_robin_regular\nPD (2 prefill + 2 decode, round_robin)|benchmark_round_robin_pd'
+
+        echo "$scenarios" | sed 's/^\s*//' | while IFS='|' read -r label pattern; do
+          [ -z "$label" ] && continue
+          # Find the result folder (handle different extraction layouts)
+          result_folder=$(find . -maxdepth 3 \( -name "$pattern" -o -path "*${pattern}*" \) -type d | head -1)
+
+          if [ -n "$result_folder" ] && [ -d "$result_folder" ]; then
+            json_file=$(find "$result_folder" -name "*.json" -not -name "experiment_metadata.json" | head -1)
+
+            if [ -n "$json_file" ] && [ -f "$json_file" ]; then
+              ttft_mean=$(jq -r '.aggregated_metrics.stats.ttft.mean' "$json_file")
+              e2e_latency_mean=$(jq -r '.aggregated_metrics.stats.e2e_latency.mean' "$json_file")
+              input_throughput_mean=$(jq -r '.aggregated_metrics.stats.input_throughput.mean' "$json_file")
+              output_throughput_mean=$(jq -r '.aggregated_metrics.stats.output_throughput.mean' "$json_file")
+
+              ttft_display=$(printf "%.2f" "$ttft_mean" 2>/dev/null || echo "$ttft_mean")
+              e2e_display=$(printf "%.2f" "$e2e_latency_mean" 2>/dev/null || echo "$e2e_latency_mean")
+              input_display=$(printf "%.0f" "$input_throughput_mean" 2>/dev/null || echo "$input_throughput_mean")
+              output_display=$(printf "%.0f" "$output_throughput_mean" 2>/dev/null || echo "$output_throughput_mean")
+
+              echo "| ${label} | ✅ Success | $ttft_display | $e2e_display | $input_display | $output_display |" >> $GITHUB_STEP_SUMMARY
+            fi
+          fi
+        done
diff --git a/.github/workflows/pr-test-sgl-kernel.yml b/.github/workflows/pr-test-sgl-kernel.yml
new file mode 100644
index 000000000..832188cdd
--- /dev/null
+++ b/.github/workflows/pr-test-sgl-kernel.yml
@@ -0,0 +1,151 @@
+name: PR Test (sgl-kernel)
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "sgl-kernel/**"
+  pull_request:
+    branches: [main]
+    paths:
+      - "sgl-kernel/**"
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-sgl-kernel-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Check clang-format
+        uses: DoozyX/clang-format-lint-action@v0.18.1
+        with:
+          source: sgl-kernel
+          extensions: h,c,cpp,hpp,cu,cuh,cc
+          clangFormatVersion: 18
+          style: file
+
+  build-wheels:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: sgl-kernel-build-node
+    strategy:
+      matrix:
+        include:
+          - python-version: "3.10"
+            cuda-version: "12.4"
+          - python-version: "3.10"
+            cuda-version: "12.8"
+          - python-version: "3.10"
+            cuda-version: "12.9"
+    name: Build Wheel (CUDA ${{ matrix.cuda-version }})
+    steps:
+      - name: Cleanup
+        run: |
+          sudo rm -rf $GITHUB_WORKSPACE/* || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
+        if: github.event_name != 'push' || (matrix.cuda-version != '12.4' && matrix.cuda-version != '12.8')
+        run: |
+          cd sgl-kernel
+          chmod +x ./build.sh
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
+          path: sgl-kernel/dist/*
+
+  unit-test:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    needs: build-wheels
+    runs-on: 1-gpu-runner
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Install
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126 && pip3 install pytest
+          pip3 uninstall sgl-kernel -y || true
+          pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
+          pip3 list | grep sgl-kernel
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          cd sgl-kernel
+          pytest tests/
+
+      - name: Uninstall dependencies
+        run: |
+          pip3 uninstall sgl-kernel -y
+
+  mla-test:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    needs: build-wheels
+    runs-on: 1-gpu-runner
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Install
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
+          pip3 uninstall sgl-kernel -y || true
+          pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
+          pip3 list | grep sgl-kernel
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          cd test/srt
+          python3 test_mla_deepseek_v3.py
+
+      - name: Uninstall dependencies
+        run: |
+          pip3 uninstall sgl-kernel -y
+
+  finish:
+    needs: [unit-test, mla-test, lint, build-wheels]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/pr-test-xeon.yml b/.github/workflows/pr-test-xeon.yml
new file mode 100644
index 000000000..fcc70f286
--- /dev/null
+++ b/.github/workflows/pr-test-xeon.yml
@@ -0,0 +1,106 @@
+name: PR Test (Xeon)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - "sgl-kernel/**"
+      - ".github/workflows/pr-test-xeon.yml"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+      - "sgl-kernel/**"
+      - ".github/workflows/pr-test-xeon.yml"
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-xeon-${{ github.ref }}
+  cancel-in-progress: false
+
+jobs:
+  build-test:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    runs-on: xeon-gnr
+    env:
+      HF_HOME: /home/sdp/.cache/huggingface
+    strategy:
+      matrix:
+        build_type: ['all']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Build and Push
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-xeon
+
+          docker build . -f docker/Dockerfile.xeon  -t sglang_xeon --no-cache
+
+      - name: Run container
+        run: |
+          docker run -dt \
+            -v ${{ github.workspace }}:/sglang-checkout/ --ipc=host \
+            -v ${HF_HOME}:/root/.cache/huggingface \
+            --name ci_sglang_xeon \
+            sglang_xeon
+
+      - name: Install dependencies
+        timeout-minutes: 20
+        run: |
+          docker exec ci_sglang_xeon bash -c "python3 -m pip install --upgrade pip"
+          docker exec ci_sglang_xeon pip uninstall sgl-kernel -y || true
+          docker exec -w /sglang-checkout/sgl-kernel ci_sglang_xeon bash -c "cp pyproject_cpu.toml pyproject.toml && pip install -v ."
+          docker exec -w /sglang-checkout/ ci_sglang_xeon bash -c "pip install -e "python[dev_cpu]""
+
+      - name: Check AMX support
+        id: check_amx
+        timeout-minutes: 5
+        run: |
+          docker exec -w /sglang-checkout/ ci_sglang_xeon \
+            bash -c "python3 -c 'import torch; import sgl_kernel; assert torch._C._cpu._is_amx_tile_supported(); assert hasattr(torch.ops.sgl_kernel, \"convert_weight_packed\"); '"
+        continue-on-error: true
+
+      - name: Run unit tests
+        if: steps.check_amx.outcome == 'success'
+        timeout-minutes: 36
+        run: |
+          docker exec -w /sglang-checkout/ ci_sglang_xeon \
+            bash -c "cd ./test/srt && python3 run_suite.py --suite per-commit-cpu"
+
+      - name: Change permission
+        timeout-minutes: 2
+        run: |
+          docker exec -u root ci_sglang_xeon bash -c "
+            rm -rf /tmp/ci-home  &&
+            chown -R  $(id -u):$(id -g) /sglang-checkout/ 2>/dev/null || true
+          "
+
+      - name: Cleanup container
+        if: always()
+        run: |
+          docker rm -f ci_sglang_xeon || true
+
+  pr-test-xeon-finish:
+    if: always()
+    needs: [build-test]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
new file mode 100644
index 000000000..bd1053902
--- /dev/null
+++ b/.github/workflows/pr-test.yml
@@ -0,0 +1,437 @@
+name: PR Test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "FlashInfer version"
+        required: true
+        type: choice
+        default: 'release'
+        options:
+          - 'release'
+          - 'nightly'
+
+concurrency:
+  group: pr-test-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      src: ${{ steps.filter.outputs.src }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Detect file changes
+        id: filter
+        uses: dorny/paths-filter@v3
+        with:
+          filters: |
+            src:
+              - "python/**"
+              - "scripts/ci/**"
+              - "test/**"
+              - ".github/workflows/pr-test.yml"
+
+  unit-test-frontend:
+    needs: check-changes
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 10
+        run: |
+          cd test/lang
+          python3 run_suite.py --suite per-commit
+
+  unit-test-backend-1-gpu:
+    needs: [check-changes, unit-test-frontend]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 1-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 10
+
+  unit-test-backend-2-gpu:
+    needs: [check-changes]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 2-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-2-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
+
+  unit-test-backend-4-gpu:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 4-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
+
+  unit-test-backend-8-gpu:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 8-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
+
+  performance-test-1-gpu-part-1:
+    needs: check-changes
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Benchmark single latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
+
+      - name: Benchmark online latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
+
+      - name: Benchmark offline throughput
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
+
+      - name: Benchmark offline throughput (Non-streaming, small batch size)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
+
+      - name: Benchmark online latency (EAGLE)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
+
+      - name: Benchmark online latency (LoRA)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
+
+  performance-test-1-gpu-part-2:
+    needs: check-changes
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Benchmark offline throughput (w/o RadixAttention)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
+
+      - name: Benchmark offline throughput (w/ Triton)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+
+      - name: Benchmark offline throughput (w/ FP8)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
+
+      - name: Benchmark VLM offline throughput
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput
+
+      - name: Benchmark VLM online latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
+
+  performance-test-2-gpu:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 2-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Benchmark single latency (TP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+
+      - name: Benchmark single latency + torch.compile (TP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
+
+      - name: Benchmark offline throughput (TP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
+
+      - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
+
+      - name: Benchmark offline PP decode throughput (PP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode
+
+      - name: Benchmark offline PP prefill throughput (PP=2)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill
+
+  accuracy-test-1-gpu:
+    needs: check-changes
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
+
+      - name: Evaluate accuracy
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 test_eval_accuracy_large.py
+
+  accuracy-test-2-gpu:
+    needs: [check-changes, accuracy-test-1-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 2-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
+
+      - name: Evaluate accuracy (TP=2)
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 test_moe_eval_accuracy_large.py
+
+  unit-test-deepep-4-gpu:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 4-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_deepep.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-4-gpu-deepep
+
+  unit-test-deepep-8-gpu:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false &&
+        needs.check-changes.outputs.src == 'true'
+    runs-on: 8-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_deepep.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu-deepep
+
+  unit-test-backend-8-gpu-b200:
+    needs: [check-changes, unit-test-backend-2-gpu]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false &&
+      needs.check-changes.outputs.src == 'true'
+    runs-on: b200-runner
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
+
+
+  pr-test-finish:
+    needs: [
+      check-changes,
+      unit-test-frontend, unit-test-backend-1-gpu,
+      unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
+      performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
+      accuracy-test-1-gpu, accuracy-test-2-gpu,
+      unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
+      unit-test-backend-8-gpu-b200,
+    ]
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/release-docker-amd-nightly.yml b/.github/workflows/release-docker-amd-nightly.yml
new file mode 100644
index 000000000..c61e200df
--- /dev/null
+++ b/.github/workflows/release-docker-amd-nightly.yml
@@ -0,0 +1,65 @@
+name: Release Docker Images Nightly (AMD)
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 13 * * *'
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: amd-docker-scale
+    environment: 'prod'
+    strategy:
+      matrix:
+        gpu_arch: ['gfx942', 'gfx942-rocm700', 'gfx950']
+        build_type: ['all', 'srt']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: "Set Date"
+        run: |
+          echo "DATE=$(date +%Y%m%d)" >> $GITHUB_ENV
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_AMD_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_AMD_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+
+          if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then
+            rocm_tag="rocm630-mi30x"
+          elif [ "${{ matrix.gpu_arch }}" = "gfx942-rocm700" ]; then
+            rocm_tag="rocm700-mi30x"
+          elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then
+            rocm_tag="rocm700-mi35x"
+          else
+            echo "Unsupported gfx arch"
+            exit 1
+          fi
+
+          tag=v${version}-${rocm_tag}
+
+          if [ "${{ matrix.build_type }}" = "all" ]; then
+            tag_suffix=""
+          elif [ "${{ matrix.build_type }}" = "srt" ]; then
+            tag_suffix="-srt"
+          else
+            echo "Unsupported build type"
+            exit 1
+          fi
+
+          docker build . -f docker/Dockerfile.rocm --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} -t rocm/sgl-dev:${tag}-${{ env.DATE }}${tag_suffix} --no-cache
+          docker push rocm/sgl-dev:${tag}-${{ env.DATE }}${tag_suffix}
diff --git a/.github/workflows/release-docker-amd.yml b/.github/workflows/release-docker-amd.yml
new file mode 100644
index 000000000..98c11e2fa
--- /dev/null
+++ b/.github/workflows/release-docker-amd.yml
@@ -0,0 +1,56 @@
+name: Release Docker Images (AMD)
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: amd-docker-scale
+    environment: 'prod'
+    strategy:
+      matrix:
+        gpu_arch: ['gfx942', 'gfx942-rocm700', 'gfx950']
+        build_type: ['all', 'srt']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+
+          if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then
+            rocm_tag="rocm630-mi30x"
+          elif [ "${{ matrix.gpu_arch }}" = "gfx942-rocm700" ]; then
+            rocm_tag="rocm700-mi30x"
+          elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then
+            rocm_tag="rocm700-mi35x"
+          else
+            echo "Unsupported gfx arch"
+            exit 1
+          fi
+
+          tag=v${version}-${rocm_tag}
+
+          if [ "${{ matrix.build_type }}" = "all" ]; then
+            tag_suffix=""
+          elif [ "${{ matrix.build_type }}" = "srt" ]; then
+            tag_suffix="-srt"
+          else
+            echo "Unsupported build type"
+            exit 1
+          fi
+
+          docker build . -f docker/Dockerfile.rocm --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
+          docker push lmsysorg/sglang:${tag}${tag_suffix}
diff --git a/.github/workflows/release-docker-dev.yml b/.github/workflows/release-docker-dev.yml
new file mode 100644
index 000000000..38e2e790f
--- /dev/null
+++ b/.github/workflows/release-docker-dev.yml
@@ -0,0 +1,49 @@
+name: Build Development Docker Image
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  build-dev:
+    if: ${{ github.repository == 'sgl-project/sglang' }}
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        variant:
+          - version: 12.6.1
+            type: all
+            tag: dev
+          - version: 12.8.1
+            type: blackwell
+            tag: blackwell
+          - version: 12.9.1
+            type: blackwell
+            tag: b200-cu129
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          docker-images: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: false
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push Dev Image
+        run: |
+          docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache
+          docker push lmsysorg/sglang:${{ matrix.variant.tag }}
diff --git a/.github/workflows/release-docker-gb200.yml b/.github/workflows/release-docker-gb200.yml
new file mode 100644
index 000000000..fbcacb330
--- /dev/null
+++ b/.github/workflows/release-docker-gb200.yml
@@ -0,0 +1,36 @@
+name: Release Docker Images (GB200)
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-22.04-arm
+    environment: "prod"
+    steps:
+      - name: Delete huge unnecessary tools folder
+        run: rm -rf /opt/hostedtoolcache
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu129-gb200
+
+          docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.9.1 --build-arg BUILD_TYPE=blackwell --no-cache .
diff --git a/.github/workflows/release-docker-npu-nightly.yml b/.github/workflows/release-docker-npu-nightly.yml
new file mode 100644
index 000000000..527a0cdc2
--- /dev/null
+++ b/.github/workflows/release-docker-npu-nightly.yml
@@ -0,0 +1,78 @@
+name: Release Docker Images Nightly (Ascend NPU)
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - ".github/workflows/release-docker-npu-nightly.yml"
+      - "docker/Dockerfile.npu"
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04-arm
+    strategy:
+      matrix:
+        cann_version: ["8.2.rc1"]
+        device_type: ["910b", "a3"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free up disk space
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+        with:
+          tool-cache: true
+          docker-images: false
+
+      - name: Setup Docker buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            lmsysorg/sglang
+          # push with schedule event
+          # push with workflow_dispatch event
+          tags: |
+            type=ref,event=pr
+            type=ref,event=branch
+            type=schedule,pattern=main
+          flavor: |
+            latest=false
+            suffix=-cann${{ matrix.cann_version }}-${{ matrix.device_type }},onlatest=true
+      # Login against a Docker registry except on PR
+      # https://github.com/docker/login-action
+      - name: Log into docker hub
+        uses: docker/login-action@v3
+        if: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      # Build and push Docker image with Buildx (don't push on PR)
+      # https://github.com/docker/build-push-action
+      - name: Build and push Docker image
+        id: build-and-push
+        uses: docker/build-push-action@v6
+        with:
+          context: docker
+          file: docker/Dockerfile.npu
+          # TODO: need add x86 platforms support when memfabric is ready
+          platforms: linux/arm64
+          labels: ${{ steps.meta.outputs.labels }}
+          tags: ${{ steps.meta.outputs.tags }}
+          push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
+          provenance: false
+          build-args: |
+            SGLANG_KERNEL_NPU_TAG=20250901
+            CANN_VERSION=${{ matrix.cann_version }}
+            DEVICE_TYPE=${{ matrix.device_type }}
diff --git a/.github/workflows/release-docker-npu.yml b/.github/workflows/release-docker-npu.yml
new file mode 100644
index 000000000..f9d52eb4b
--- /dev/null
+++ b/.github/workflows/release-docker-npu.yml
@@ -0,0 +1,74 @@
+name: Release Docker Images (Ascend NPU)
+on:
+  push:
+    tags:
+      - "*" # Trigger on all tags and filterred by pep440 later
+  workflow_dispatch:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - ".github/workflows/release-docker-npu.yml"
+      - "docker/Dockerfile.npu"
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04-arm
+    strategy:
+      matrix:
+        cann_version: ["8.2.rc1"]
+        device_type: ["910b", "a3"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free up disk space
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+        with:
+          tool-cache: true
+          docker-images: false
+
+        # push with tag
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            lmsysorg/sglang
+          tags: |
+            type=ref,event=pr
+            type=ref,event=tag,suffix=-cann${{ matrix.cann_version }}-${{ matrix.device_type }}
+          flavor: |
+            latest=false
+
+      # Login against a Docker registry except on PR
+      # https://github.com/docker/login-action
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        if: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Get version
+        id: get_version
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          echo "TAG=lmsysorg/sglang:v$version-cann${{ matrix.cann_version }}-${{ matrix.device_type }}" >> $GITHUB_OUTPUT
+
+      - name: Build and push Docker image
+        id: build-and-push
+        uses: docker/build-push-action@v6
+        with:
+          context: docker
+          file: docker/Dockerfile.npu
+          # TODO: need add x86 platforms support when memfabric is ready
+          platforms: linux/arm64
+          labels: ${{ steps.meta.outputs.labels }}
+          tags: ${{ steps.meta.outputs.tags || steps.get_version.outputs.TAG }}
+          push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
+          provenance: false
+          build-args: |
+            SGLANG_KERNEL_NPU_TAG=20250901
+            CANN_VERSION=${{ matrix.cann_version }}
+            DEVICE_TYPE=${{ matrix.device_type }}
diff --git a/.github/workflows/release-docker-router.yml b/.github/workflows/release-docker-router.yml
new file mode 100644
index 000000000..f98651e8a
--- /dev/null
+++ b/.github/workflows/release-docker-router.yml
@@ -0,0 +1,30 @@
+name: Release SGLang Router Docker Image
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "sgl-router/py_src/sglang_router/version.py"
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cat sgl-router/py_src/sglang_router/version.py | cut -d'"' -f2)
+          tag=v${version}
+
+          docker build . -f docker/Dockerfile.router -t lmsysorg/sglang-router:${tag} --no-cache
+          docker push lmsysorg/sglang-router:${tag}
diff --git a/.github/workflows/release-docker-xeon.yml b/.github/workflows/release-docker-xeon.yml
new file mode 100644
index 000000000..bd2a3910f
--- /dev/null
+++ b/.github/workflows/release-docker-xeon.yml
@@ -0,0 +1,35 @@
+name: Release Docker Xeon Images
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-24.04
+    environment: 'prod'
+    strategy:
+      matrix:
+        build_type: ['all']
+    steps:
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-xeon
+
+          docker build . -f docker/Dockerfile.xeon  -t lmsysorg/sglang:${tag} --no-cache
+          docker push lmsysorg/sglang:${tag}
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
new file mode 100644
index 000000000..60a8df621
--- /dev/null
+++ b/.github/workflows/release-docker.yml
@@ -0,0 +1,97 @@
+name: Release Docker Images
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-latest
+    environment: 'prod'
+    strategy:
+      matrix:
+        cuda_version: ['12.6.1', '12.8.1', '12.9.1']
+        build_type: ['all', 'blackwell']
+        exclude:
+          - cuda_version: '12.6.1'
+            build_type: 'blackwell'
+          - cuda_version: '12.8.1'
+            build_type: 'all'
+          - cuda_version: '12.9.1'
+            build_type: 'all'
+    steps:
+      - name: Delete huge unnecessary tools folder
+        run: rm -rf /opt/hostedtoolcache
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Free disk space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          docker-images: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: false
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+
+          if [ "${{ matrix.cuda_version }}" = "11.8.0" ]; then
+            cuda_tag="cu118"
+          elif [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
+            cuda_tag="cu121"
+          elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
+            cuda_tag="cu124"
+          elif [ "${{ matrix.cuda_version }}" = "12.5.1" ]; then
+            cuda_tag="cu125"
+          elif [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
+            cuda_tag="cu126"
+          elif [ "${{ matrix.cuda_version }}" = "12.8.1" ]; then
+            cuda_tag="cu128"
+          elif [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
+            cuda_tag="cu129"
+          else
+            echo "Unsupported CUDA version"
+            exit 1
+          fi
+
+          tag=v${version}-${cuda_tag}
+
+          if [ "${{ matrix.build_type }}" = "all" ]; then
+            tag_suffix=""
+          elif [ "${{ matrix.build_type }}" = "srt" ]; then
+            tag_suffix="-srt"
+          elif [ "${{ matrix.build_type }}" = "blackwell" ]; then
+            tag_suffix="-b200"
+          else
+            echo "Unsupported build type"
+            exit 1
+          fi
+
+          docker buildx build --output type=image,compression=zstd . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
+          docker push lmsysorg/sglang:${tag}${tag_suffix}
+
+          if [ "${{ matrix.cuda_version }}" = "12.6.1" ]; then
+            docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
+            docker push lmsysorg/sglang:latest${tag_suffix}
+          fi
+
+          if [ "${{ matrix.cuda_version }}" = "12.9.1" ]; then
+            docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:v${version}
+            docker push lmsysorg/sglang:v${version}
+          fi
diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml
new file mode 100644
index 000000000..0e09eec93
--- /dev/null
+++ b/.github/workflows/release-docs.yml
@@ -0,0 +1,65 @@
+name: Release Documentation
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "docs/**"
+      - "python/sglang/version.py"
+      - "python/sglang/**"
+  workflow_dispatch:
+
+concurrency:
+  group: release-docs-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  execute-and-deploy:
+    runs-on: 1-gpu-runner
+    if: github.repository == 'sgl-project/sglang'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          pip install -r docs/requirements.txt
+          apt-get update && apt-get install -y pandoc parallel retry
+          ln -sf "$(which python3)" /usr/bin/python
+
+      - name: Setup Jupyter Kernel
+        run: |
+          python -m ipykernel install --user --name python3 --display-name "Python 3"
+
+      - name: Execute notebooks
+        timeout-minutes: 40
+        run: |
+          cd docs
+          make clean
+          make compile
+
+      - name: Push HTML to sgl-project.github.io
+        timeout-minutes: 60
+        env:
+          GITHUB_TOKEN: ${{ secrets.DOCUMENTATION_PAT_TOKEN }}
+        run: |
+          cd docs
+          make html
+          python3 wrap_run_llm.py
+
+          cd _build/html
+
+          git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
+          find ../sgl-project.github.io/ -mindepth 1 -not -path "../sgl-project.github.io/.git*" -not -name CNAME -not -name ".jekyll" -not -name ".nojekyll" -delete
+          cp -r * ../sgl-project.github.io
+          cp ../../README.md ../sgl-project.github.io/README.md
+          cd ../sgl-project.github.io
+          git config user.name "zhaochenyang20"
+          git config user.email "zhaochenyang20@gmail.com"
+          git add .
+          git commit -m "Update $(date +'%Y-%m-%d %H:%M:%S')"
+          git push https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git main
+          cd ..
+          rm -rf sgl-project.github.io
diff --git a/.github/workflows/release-fake-tag.yml b/.github/workflows/release-fake-tag.yml
new file mode 100644
index 000000000..ce5999506
--- /dev/null
+++ b/.github/workflows/release-fake-tag.yml
@@ -0,0 +1,35 @@
+name: Release Fake Tag
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-latest
+    environment: 'prod'
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Get version
+        id: get_version
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          echo "TAG=v$version" >> $GITHUB_OUTPUT
+
+      - name: Create and push fake tag
+        env:
+          GITHUB_TOKEN: ${{ secrets.REPO_TOKEN }}
+        run: |
+          git config user.name zhyncs
+          git config user.email me@zhyncs.com
+          git checkout -b ${{ steps.get_version.outputs.TAG }}
+          git push --set-upstream origin ${{ steps.get_version.outputs.TAG }}
diff --git a/.github/workflows/release-pypi-router.yml b/.github/workflows/release-pypi-router.yml
new file mode 100644
index 000000000..a2128be83
--- /dev/null
+++ b/.github/workflows/release-pypi-router.yml
@@ -0,0 +1,119 @@
+# Reference: https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/.github/workflows/build_wheels.yml#L1
+
+name: Release SGLang Router to PyPI
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - sgl-router/pyproject.toml
+  workflow_dispatch:
+
+jobs:
+  build:
+    name: Build on ${{ matrix.os }} (${{ matrix.target }})
+    runs-on: ${{ matrix.os }}-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu
+            target: x86_64
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          path: sglang-repo
+
+      - name: Move sgl-router folder to root and delete sglang-repo
+        run: |
+          mv sglang-repo/sgl-router/* .
+          rm -rf sglang-repo
+          ls -alt
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install build dependencies
+        run: |
+          python -m pip install -U pip
+          python -m pip install build twine auditwheel
+
+      - name: Build package
+        uses: pypa/cibuildwheel@v2.21.3
+        env:
+          CIBW_BUILD: "cp38-manylinux_x86_64 cp39-manylinux_x86_64 cp310-manylinux_x86_64 cp311-manylinux_x86_64 cp312-manylinux_x86_64"
+          CIBW_BEFORE_ALL: |
+            yum update -y && yum install -y openssl-devel wget unzip && \
+            # Install latest protoc (v32.0) that supports proto3
+            cd /tmp && \
+            wget https://github.com/protocolbuffers/protobuf/releases/download/v32.0/protoc-32.0-linux-x86_64.zip && \
+            unzip protoc-32.0-linux-x86_64.zip -d /usr/local && \
+            rm protoc-32.0-linux-x86_64.zip && \
+            # Install Rust
+            curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+          CIBW_ENVIRONMENT: "PATH=$HOME/.cargo/bin:$PATH"
+
+      - name: List built packages
+        run: ls -lh wheelhouse/
+
+      - name: Check packages
+        run: twine check --strict wheelhouse/*
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: packages-${{ matrix.os }}-${{ matrix.target }}
+          path: wheelhouse/
+
+  build-sdist:
+    name: Build SDist
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          path: sglang-repo
+
+      - name: Move sgl-router folder to root, copy the license file, and delete sglang-repo
+        run: |
+          mv sglang-repo/sgl-router/* .
+          mv sglang-repo/LICENSE .
+          rm -rf sglang-repo
+          ls -alt
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Build SDist
+        run: |
+          pip install build
+          python -m pip install -U packaging
+          python -m build --sdist
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: sdist
+          path: dist/*.tar.gz
+
+  upload:
+    name: Upload to PyPI
+    if: github.repository == 'sgl-project/sglang'  # Ensure this job only runs for the sgl-project/sglang repository
+    needs: [build, build-sdist]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+          merge-multiple: true
+
+      - name: Upload to PyPI
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN_ROUTER }}
+        run: |
+          pip install twine
+          twine upload dist/* --verbose
diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml
new file mode 100644
index 000000000..354ee21d1
--- /dev/null
+++ b/.github/workflows/release-pypi.yml
@@ -0,0 +1,31 @@
+name: Release PyPI
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-latest
+    environment: "prod"
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Upload to pypi
+        run: |
+          cd python
+          cp ../README.md ../LICENSE .
+          pip install build
+          python3 -m build
+          pip install twine
+          python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
diff --git a/.github/workflows/release-whl-kernel-cu118.yml b/.github/workflows/release-whl-kernel-cu118.yml
new file mode 100644
index 000000000..4757bcaa1
--- /dev/null
+++ b/.github/workflows/release-whl-kernel-cu118.yml
@@ -0,0 +1,92 @@
+name: Release SGLang Kernel Wheel (cu118)
+
+on:
+  workflow_dispatch:
+    inputs:
+      tag_name:
+        type: string
+  push:
+    branches:
+      - main
+    paths:
+      - sgl-kernel/python/sgl_kernel/version.py
+
+jobs:
+  build-wheels:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: sgl-kernel-release-node
+    strategy:
+      matrix:
+        python-version: ["3.9"]
+        cuda-version: ["11.8"]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
+        run: |
+          cd sgl-kernel
+          chmod +x ./build.sh
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
+          path: sgl-kernel/dist/*
+
+  release:
+    needs: build-wheels
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-*
+
+      - name: Set tag name
+        id: set_tag_name
+        run: |
+          if [ -z "${{ inputs.tag_name }}" ]; then
+            TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
+            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
+          else
+            echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
+          repository: sgl-project/whl
+          token: ${{ secrets.WHL_TOKEN }}
+          files: |
+            sgl-kernel/dist/*
+
+      - name: Clone wheel index
+        run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
+        env:
+          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
+
+      - name: Update wheel index
+        run: python3 scripts/update_kernel_whl_index.py
+
+      - name: Push wheel index
+        run: |
+          cd sgl-whl
+          git config --local user.name "github-actions[bot]"
+          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add -A
+          git commit -m "update whl index"
+          git push
diff --git a/.github/workflows/release-whl-kernel.yml b/.github/workflows/release-whl-kernel.yml
new file mode 100644
index 000000000..b12c91288
--- /dev/null
+++ b/.github/workflows/release-whl-kernel.yml
@@ -0,0 +1,283 @@
+name: Release SGLang Kernels
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - sgl-kernel/python/sgl_kernel/version.py
+  workflow_dispatch:
+    inputs:
+      tag_name:
+        type: string
+        required: false
+
+concurrency:
+  group: release-sglang-kernels-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-cu129:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: sgl-kernel-release-node
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+        cuda-version: ["12.9"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Build wheels
+        run: |
+          cd sgl-kernel
+          chmod +x ./build.sh
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
+
+      - name: Upload to PyPI
+        working-directory: sgl-kernel
+        run: |
+          pip install twine
+          python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+
+  build-cu124:
+    if: github.repository == 'sgl-project/sglang'
+    needs: build-cu129
+    runs-on: sgl-kernel-release-node
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+        cuda-version: ["12.4"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Build wheels
+        run: |
+          cd sgl-kernel
+          chmod +x ./build.sh
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
+          path: sgl-kernel/dist/*
+
+  release-cu124:
+    needs: build-cu124
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-*
+
+      - name: Set tag name
+        id: set_tag_name
+        run: |
+          if [ -z "${{ inputs.tag_name }}" ]; then
+            TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
+            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
+          else
+            echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
+          repository: sgl-project/whl
+          token: ${{ secrets.WHL_TOKEN }}
+          files: |
+            sgl-kernel/dist/*
+
+      - name: Clone wheel index
+        run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
+        env:
+          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
+
+      - name: Update wheel index
+        run: python3 scripts/update_kernel_whl_index.py --cuda 124
+
+      - name: Push wheel index
+        run: |
+          cd sgl-whl
+          git config --local user.name "github-actions[bot]"
+          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add -A
+          git commit -m "update whl index"
+          git push
+
+  build-cu128:
+    if: github.repository == 'sgl-project/sglang'
+    needs: build-cu129
+    runs-on: sgl-kernel-release-node
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+        cuda-version: ["12.8"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Build wheels
+        run: |
+          cd sgl-kernel
+          chmod +x ./build.sh
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
+          path: sgl-kernel/dist/*
+
+  release-cu128:
+    needs: build-cu128
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-*
+
+      - name: Set tag name
+        id: set_tag_name
+        run: |
+          if [ -z "${{ inputs.tag_name }}" ]; then
+            TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
+            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
+          else
+            echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
+          repository: sgl-project/whl
+          token: ${{ secrets.WHL_TOKEN }}
+          files: |
+            sgl-kernel/dist/*
+
+      - name: Clone wheel index
+        run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
+        env:
+          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
+
+      - name: Update wheel index
+        run: python3 scripts/update_kernel_whl_index.py --cuda 128
+
+      - name: Push wheel index
+        run: |
+          cd sgl-whl
+          git config --local user.name "github-actions[bot]"
+          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add -A
+          git commit -m "update whl index"
+          git push
+
+  build-cu129-aarch64:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: sgl-kernel-release-node-arm
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+        cuda-version: ["12.9"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Build wheels
+        run: |
+          cd sgl-kernel
+          chmod +x ./build.sh
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}" aarch64
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
+          path: sgl-kernel/dist/*
+
+  release-cu129-aarch64:
+    needs: build-cu129-aarch64
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-*
+
+      - name: Set tag name
+        id: set_tag_name
+        run: |
+          if [ -z "${{ inputs.tag_name }}" ]; then
+            TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
+            echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
+          else
+            echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ steps.set_tag_name.outputs.tag_name }}
+          repository: sgl-project/whl
+          token: ${{ secrets.WHL_TOKEN }}
+          files: |
+            sgl-kernel/dist/*
+
+      - name: Clone wheel index
+        run: git clone https://oauth2:${WHL_TOKEN}@github.com/sgl-project/whl.git sgl-whl
+        env:
+          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
+
+      - name: Update wheel index
+        run: python3 scripts/update_kernel_whl_index.py --cuda 129
+
+      - name: Push wheel index
+        run: |
+          cd sgl-whl
+          git config --local user.name "github-actions[bot]"
+          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add -A
+          git commit -m "update whl index"
+          git push
diff --git a/.github/workflows/vllm-dependency-test.yml b/.github/workflows/vllm-dependency-test.yml
new file mode 100644
index 000000000..442d76d50
--- /dev/null
+++ b/.github/workflows/vllm-dependency-test.yml
@@ -0,0 +1,43 @@
+name: VLLM Dependency Test
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/ci/**"
+      - "test/**"
+
+concurrency:
+  group: vllm-dependency-test-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  vllm-dependency-test:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+          pip install "bitsandbytes>=0.44.0"
+
+          pip install "sgl-kernel==0.3.7"
+
+      - name: Run vLLM dependency tests
+        timeout-minutes: 60
+        run: |
+          export SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1
+
+          cd test/srt
+          python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 3600
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..9725fabd9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,240 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+
+# Tokenizer cache for tests
+.tokenizer_cache/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# MacOS
+.DS_Store
+
+# Vim
+*.swp
+
+# Documentation
+docs/_build
+
+# SGL
+benchmark/mmlu/data
+benchmark/mmlu/data.tar
+benchmark/llava_bench/images
+benchmark/llava_bench/mme_pack
+*.jsonl
+tmp*.txt
+
+# Plots
+*.png
+*.pdf
+
+# personnal
+work_dirs/
+*.csv
+
+!logo.png
+
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+compile_commands.json
+
+*.iml
+
+# VSCode
+.vscode
+
+1
+
+# Autoenv
+.env.leave
+
+# Rust lib
+Cargo.lock
+
+lmms-eval
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 000000000..835095154
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,3 @@
+[settings]
+profile=black
+known_first_party=sglang
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..a295f2eb4
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,59 @@
+default_stages: [pre-commit, pre-push, manual]
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-symlinks
+      - id: destroyed-symlinks
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+        args: [--allow-multiple-documents]
+      - id: check-toml
+      - id: check-ast
+      - id: check-added-large-files
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: detect-private-key
+      - id: debug-statements
+      - id: no-commit-to-branch
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.7
+    hooks:
+      - id: ruff
+        args: [--select=F401, --fixable=F401]
+        files: ^(benchmark/|docs/|examples/)
+        exclude: \.ipynb$
+  - repo: https://github.com/psf/black
+    rev: 24.10.0
+    hooks:
+      - id: black-jupyter
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.4.1
+    hooks:
+      - id: codespell
+        additional_dependencies: ['tomli']
+        args: ['--toml', 'python/pyproject.toml', '-L', 'cann,thi,makro,wil,rouge']
+        exclude: |
+          (?x)^(
+            test/srt/test_reasoning_parser\.py|
+            docs/advanced_features/vlm_query\.ipynb
+          )$
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v18.1.8
+    hooks:
+    - id: clang-format
+      types_or: [c++, cuda]
+      args: [--style=file, --verbose]
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.8.1
+    hooks:
+      - id: nbstripout
+        args:
+          - '--keep-output'
+          - '--extra-keys=metadata.kernelspec metadata.language_info.version'
diff --git a/3rdparty/amd/profiling/PROFILING.md b/3rdparty/amd/profiling/PROFILING.md
new file mode 100644
index 000000000..7e15ec844
--- /dev/null
+++ b/3rdparty/amd/profiling/PROFILING.md
@@ -0,0 +1,425 @@
+## Profiling SGLang Infer System with AMD GPUs
+This AppNote describes the SGLang profiling technical, code augment and running steps for systems with AMD Instinct GPUs, nevertheless the same procedure may work with Nvidia GPUs too.
+Examples and steps are provided in detail, to facilitate easy reproduce and use to localize performance problem towards optimizations.
+Two primary methods are covered:
+- [RPD](https://github.com/ROCm/rocmProfileData.git)
+- [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
+
+### Profiling SGLang Infer System with RPD Profiler
+RPD profiler is a low-overhead cross-platform profiler. Therefore, the same RPD code augment not only works for profiling on ROCm/AMD GPUs, but also works for profiling on CUDA/Nvidia GPUs as well. To do RPD profiling on SGLang repository, please use scripts and patch files included in this directory and follow the steps below:
+1. Install RPD with rpd.patch applied during installation using install_rpd.sh, both files are in this directory.
+
+install_rpd.sh
+
+```bash
+# download and install RPD
+apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
+
+# install rpd module
+git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
+cd rocmProfileData
+git checkout 976899e9c6dbc6dd2bccf770818e4e44125590ac
+git apply rpd.patch
+make && make install
+cd rocpd_python && python setup.py install && cd ..
+cd rpd_tracer && make clean;make install && python setup.py install && cd ..
+```
+
+rpd.patch
+
+```bash
+diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
+index e9d9feb..b2e9e1a 100644
+--- a/rpd_tracer/Makefile
++++ b/rpd_tracer/Makefile
+@@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
+         $(info Building with roctracer)
+         RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
+         RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
+-        RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
++        RPD_SRCS += RoctracerDataSource.cpp
+         RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
+ endif
+```
+2. Add loadTracer.sh file included in this directory to /sglang/python/sglang.
+
+loadTracer.sh
+
+```bash
+#!/bin/bash
+################################################################################
+# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+################################################################################
+OUTPUT_FILE="trace.rpd"
+
+if [ "$1" = "-o" ] ; then
+  OUTPUT_FILE=$2
+  shift
+  shift
+fi
+
+if [ -e ${OUTPUT_FILE} ] ; then
+  rm ${OUTPUT_FILE}
+fi
+
+python3 -m rocpd.schema --create ${OUTPUT_FILE}
+if [ $? != 0 ] ; then
+  echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
+  exit
+fi
+
+export RPDT_FILENAME=${OUTPUT_FILE}
+export RPDT_AUTOSTART=0
+LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"
+```
+3. Apply patch (provided in this directory) with "git apply rpd_profile_server_enable.patch" if the main profiling purpose is to get info on gpu kernels as well as limited cpu activity info.
+
+#### Common Notes 1
+Please note that although we are doing TP=8 in the example, we purposely only log RPD profiling on 2 ranks in the patch file (i.e.tp_rank=0/1) for profiling/visualization convenience, as even Perfetto streaming mode can only load maximal 8GB json file for visualization. With 2 ranks logged in RPD profiling, we could still check whether there are issues among ranks (e.g. load imbalance issue, nccl issue), and at the same time, we could log relatively longer time duration before the json file generated from RPD file hits 8GB size.
+
+rpd_profile_server_enable.patch
+
+```bash
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..9021c01 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,24 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start() #block pytorch profiler for rpd profiler enabling
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.start()
++            self.rpd.rangePush("", "rpd profile range", "")
++            logger.info("rpd is enabled")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done")
+         logger.info("Profiler is done")
+```
+
+#### Advanced Debugging with RPD Profiler
+Sometimes, we want to use rpd profiler to capture more CPU and python activities in order to debug some challenging issues (e.g. root cause of load imbalance across gpu processes, root cause of bubbles, etc). Only in such cases, we need to apply patch "git apply rpd_profile_server_enable_wCPU_activities.patch", where 3 files are modified.
+
+rpd_profile_server_enable_wCPU_activities.patch
+
+```bash
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..2edb427 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,26 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start()
++        logger.info("torch profiler is disabled")
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.setPythonTrace(True)
++            self.rpd.start()
++            self.rpd.rangePush("", "scheduler", "")
++        logger.info("rpd is enabled inside scheduler profiling")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done inside scheduler")
+         logger.info("Profiler is done")
+
+
+diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
+index 2621ccd..181df85 100644
+--- a/python/sglang/srt/managers/tokenizer_manager.py
++++ b/python/sglang/srt/managers/tokenizer_manager.py
+@@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
+ from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.utils import is_generation_model, is_multimodal_model
+
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
++
++
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+ logger = logging.getLogger(__name__)
+@@ -514,10 +518,20 @@ class TokenizerManager:
+         self.send_to_scheduler.send_pyobj(req)
+
+     def start_profile(self):
++        rpd = rpdTracerControl()
++        rpd.setPythonTrace(True)
++        rpd.start()
++        rpd.rangePush("", "tokenizer_manager", "")
++        logger.info("tokenizer_manager rpd profiling started!")
+         req = ProfileReq.START_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+     def stop_profile(self):
++        rpd = rpdTracerControl()
++        rpd.rangePop()
++        rpd.stop()
++        rpd.flush()
++        logger.info("rpd profiling is done inside tokenizer_manager!")
+         req = ProfileReq.STOP_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
+index 7111c93..2bd722c 100644
+--- a/python/sglang/srt/server.py
++++ b/python/sglang/srt/server.py
+@@ -30,6 +30,8 @@ import threading
+ import time
+ from http import HTTPStatus
+ from typing import Dict, List, Optional, Union
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ # Fix a bug of Python threading
+ setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+@@ -152,6 +154,11 @@ async def flush_cache():
+ @app.post("/start_profile")
+ async def start_profile():
+     """Start profiling."""
++    rpd = rpdTracerControl()
++    rpd.setPythonTrace(True)
++    rpd.start()
++    rpd.rangePush("", "server rpd profile range", "")
++    logger.info("rpd profiling started in server.py!")
+     tokenizer_manager.start_profile()
+     return Response(
+         content="Start profiling.\n",
+@@ -164,6 +171,11 @@ async def start_profile():
+ async def stop_profile():
+     """Stop profiling."""
+     tokenizer_manager.stop_profile()
++    rpd = rpdTracerControl()
++    rpd.rangePop()
++    rpd.stop()
++    rpd.flush()
++    logger.info("rpd profiling is done in server.py!")
+     return Response(
+         content="Stop profiling. This will take some time.\n",
+         status_code=200,
+```
+
+4. As an example for grok1 profiling, we create a dummy_grok1 directory with config.json (see content below) inside this directory and copy this directory to the right path for "--model-path" if you want to use the example server.sh file provided.
+```bash
+cat ../dummy_grok1/config.json
+{
+  "architectures": [
+    "Grok1ModelForCausalLM"
+  ],
+  "embedding_multiplier_scale": 78.38367176906169,
+  "output_multiplier_scale": 0.5773502691896257,
+  "vocab_size": 131072,
+  "hidden_size": 6144,
+  "intermediate_size": 32768,
+  "max_position_embeddings": 8192,
+  "num_experts_per_tok": 2,
+  "num_local_experts": 8,
+  "num_attention_heads": 48,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "head_dim": 128,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "model_type": "mixtral",
+  "torch_dtype": "bfloat16"
+}
+```
+5. Launch server with rpd enabled script ./server.sh in one terminal inside the docker container.
+
+#### Common Notes 2
+- Remember to change model-path to the correct path
+- loadTracer.sh is needed to conduct profiling
+- SGLANG_TORCH_PROFILER_DIR is used for default torch profiler
+- Do not use loadTracer.sh if you are using the torch profiler, simply use python3 -m sglang.launch_server.
+
+
+server.sh
+
+```bash
+#!/bin/bash
+
+# export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
+export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
+
+# Get the current timestamp
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+
+# Define the log file with a timestamp
+LOGFILE="sglang_server_log_$TIMESTAMP.json"
+
+# Run the Python command and save the output to the log file
+loadTracer.sh python3 -m sglang.launch_server \
+    --model-path /sgl-workspace/sglang/dummy_grok1 \
+    --tokenizer-path Xenova/grok-1-tokenizer \
+    --load-format dummy \
+    --quantization fp8 \
+    --tp 8 \
+    --port 30000 \
+    --disable-radix-cache 2>&1 | tee "$LOGFILE"
+```
+6. Open another terminal for the same docker container, and run the rpd enabled ./client.sh after you see "The server is fired up and is ready to roll!" message from server side terminal.
+
+#### Common Notes 3
+- Use curl http://localhost:30000/start_profile & curl http://localhost:30000/stop_profile to control the start and end of profiling. Check sglang/python/sglang/srt/managers/scheduler.py for more details.
+- Please don't use RPD profiler together with PyTorch profiler to avoid interference.
+- The rocmProfileData/tools/rpd2tracing.py file is used to generate json file from RPD file.
+
+client.sh
+
+```bash
+#!/bin/bash
+
+# Start profiling via API
+curl http://localhost:30000/start_profile -H "Content-Type: application/json"
+
+# Benchmark serving using sglang with random dataset and tokenizer
+# Define the log file with a timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+LOGFILE="sglang_client_log_$TIMESTAMP.json"
+
+# Run the benchmark with specified parameters and save logs
+python3 -m sglang.bench_serving \
+    --backend sglang \
+    --tokenizer Xenova/grok-1-tokenizer \
+    --dataset-name random \
+    --random-input 1024\
+    --random-output 1024 \
+    --num-prompts 120 \
+    --request-rate 8 \
+    --output-file online.jsonl 2>&1 | tee "$LOGFILE"
+
+# Stop profiling via API
+curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
+
+# Convert tracing file to csv & json
+sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
+python3 ./rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
+```
+7. Follow [Perfetto docs](https://perfetto.dev/docs/visualization/large-traces) to visualize large json files. Try to adjust parameters so that the trace.json file size is less than 9GB.
+
+### Profiling SGLang Infer System with PyTorch Profiler
+
+Please use the steps as follows:
+
+1. Apply the patch torch_profiler.patch. Note that you can modify "if self.tp_rank == 0" in the patch to allow more ranks be recorded in profiling.
+
+torch_profiler.patch
+```bash
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..6ecd78c 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -240,7 +240,6 @@ class Scheduler:
+             )
+             self.profiler = torch.profiler.profile(
+                 activities=[
+-                    torch.profiler.ProfilerActivity.CPU,
+                     torch.profiler.ProfilerActivity.CUDA,
+                 ],
+                 with_stack=True,
+@@ -1033,9 +1032,11 @@ class Scheduler:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+         self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        if self.tp_rank == 0:
++            with open(f"stats_repro_{int(time.time())}.txt", "w") as f:
++                print(self.profiler.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=-1), file=f)
++                print("Profiling stats done.")
++
+         logger.info("Profiler is done")
+```
+
+2. Create the model path directory and copy it to the right path for "--model-path" if you want to use the server.sh file provided.
+
+3. Modify the included server.sh by removing "loadTracer.sh" before python command and launch script ./server.sh in one terminal inside the docker container.
+
+4. Similar to step 6 in RPD profiling section, but remove the last 2 lines in client.sh, which converted rpd file into csv and json files. Run modified client.sh for PyTorch profiling.
+-------
+- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
diff --git a/3rdparty/amd/profiling/client.sh b/3rdparty/amd/profiling/client.sh
new file mode 100755
index 000000000..150ea9f19
--- /dev/null
+++ b/3rdparty/amd/profiling/client.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Start profiling via API
+curl http://localhost:30000/start_profile -H "Content-Type: application/json"
+
+# Benchmark serving using sglang with random dataset and tokenizer
+# Define the log file with a timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+LOGFILE="sglang_client_log_$TIMESTAMP.json"
+
+# Run the benchmark with specified parameters and save logs
+python3 -m sglang.bench_serving \
+    --backend sglang \
+    --tokenizer Xenova/grok-1-tokenizer \
+    --dataset-name random \
+    --random-input 1024\
+    --random-output 1024 \
+    --num-prompts 240 \
+    --request-rate 8 \
+    --output-file online.jsonl 2>&1 | tee "$LOGFILE"
+
+# Stop profiling via API
+curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
+
+# Convert tracing file to csv & json
+sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
+python3 /sgl-workspace/rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
diff --git a/3rdparty/amd/profiling/install_rpd.sh b/3rdparty/amd/profiling/install_rpd.sh
new file mode 100644
index 000000000..d1b04b988
--- /dev/null
+++ b/3rdparty/amd/profiling/install_rpd.sh
@@ -0,0 +1,10 @@
+# download and install RPD
+apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
+
+# install rpd module
+git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
+cd rocmProfileData
+git apply rpd.patch
+make && make install
+cd rocpd_python && python setup.py install && cd ..
+cd rpd_tracer && make clean;make install && python setup.py install && cd ..
diff --git a/3rdparty/amd/profiling/loadTracer.sh b/3rdparty/amd/profiling/loadTracer.sh
new file mode 100755
index 000000000..8a95a335c
--- /dev/null
+++ b/3rdparty/amd/profiling/loadTracer.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+################################################################################
+# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+################################################################################
+OUTPUT_FILE="trace.rpd"
+
+if [ "$1" = "-o" ] ; then
+  OUTPUT_FILE=$2
+  shift
+  shift
+fi
+
+if [ -e ${OUTPUT_FILE} ] ; then
+  rm ${OUTPUT_FILE}
+fi
+
+python3 -m rocpd.schema --create ${OUTPUT_FILE}
+if [ $? != 0 ] ; then
+  echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
+  exit
+fi
+
+export RPDT_FILENAME=${OUTPUT_FILE}
+export RPDT_AUTOSTART=0
+LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"
diff --git a/3rdparty/amd/profiling/rpd.patch b/3rdparty/amd/profiling/rpd.patch
new file mode 100644
index 000000000..87917654a
--- /dev/null
+++ b/3rdparty/amd/profiling/rpd.patch
@@ -0,0 +1,12 @@
+diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
+index e9d9feb..b2e9e1a 100644
+--- a/rpd_tracer/Makefile
++++ b/rpd_tracer/Makefile
+@@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
+         $(info Building with roctracer)
+         RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
+         RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
+-        RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
++        RPD_SRCS += RoctracerDataSource.cpp
+         RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
+ endif
diff --git a/3rdparty/amd/profiling/rpd_profile_server_enable.patch b/3rdparty/amd/profiling/rpd_profile_server_enable.patch
new file mode 100644
index 000000000..3cd391533
--- /dev/null
+++ b/3rdparty/amd/profiling/rpd_profile_server_enable.patch
@@ -0,0 +1,49 @@
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..9021c01 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,24 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start() #block pytorch profiler for rpd profiler enabling
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.start()
++            self.rpd.rangePush("", "rpd profile range", "")
++            logger.info("rpd is enabled")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done")
+         logger.info("Profiler is done")
diff --git a/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch b/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch
new file mode 100644
index 000000000..5416f4d57
--- /dev/null
+++ b/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch
@@ -0,0 +1,126 @@
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..2edb427 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,26 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start()
++        logger.info("torch profiler is disabled")
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.setPythonTrace(True)
++            self.rpd.start()
++            self.rpd.rangePush("", "scheduler", "")
++        logger.info("rpd is enabled inside scheduler profiling")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done inside scheduler")
+         logger.info("Profiler is done")
+
+
+diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
+index 2621ccd..181df85 100644
+--- a/python/sglang/srt/managers/tokenizer_manager.py
++++ b/python/sglang/srt/managers/tokenizer_manager.py
+@@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
+ from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.utils import is_generation_model, is_multimodal_model
+
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
++
++
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+ logger = logging.getLogger(__name__)
+@@ -514,10 +518,20 @@ class TokenizerManager:
+         self.send_to_scheduler.send_pyobj(req)
+
+     def start_profile(self):
++        rpd = rpdTracerControl()
++        rpd.setPythonTrace(True)
++        rpd.start()
++        rpd.rangePush("", "tokenizer_manager", "")
++        logger.info("tokenizer_manager rpd profiling started!")
+         req = ProfileReq.START_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+     def stop_profile(self):
++        rpd = rpdTracerControl()
++        rpd.rangePop()
++        rpd.stop()
++        rpd.flush()
++        logger.info("rpd profiling is done inside tokenizer_manager!")
+         req = ProfileReq.STOP_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
+index 7111c93..2bd722c 100644
+--- a/python/sglang/srt/server.py
++++ b/python/sglang/srt/server.py
+@@ -30,6 +30,8 @@ import threading
+ import time
+ from http import HTTPStatus
+ from typing import Dict, List, Optional, Union
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ # Fix a bug of Python threading
+ setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+@@ -152,6 +154,11 @@ async def flush_cache():
+ @app.post("/start_profile")
+ async def start_profile():
+     """Start profiling."""
++    rpd = rpdTracerControl()
++    rpd.setPythonTrace(True)
++    rpd.start()
++    rpd.rangePush("", "server rpd profile range", "")
++    logger.info("rpd profiling started in server.py!")
+     tokenizer_manager.start_profile()
+     return Response(
+         content="Start profiling.\n",
+@@ -164,6 +171,11 @@ async def start_profile():
+ async def stop_profile():
+     """Stop profiling."""
+     tokenizer_manager.stop_profile()
++    rpd = rpdTracerControl()
++    rpd.rangePop()
++    rpd.stop()
++    rpd.flush()
++    logger.info("rpd profiling is done in server.py!")
+     return Response(
+         content="Stop profiling. This will take some time.\n",
+         status_code=200,
diff --git a/3rdparty/amd/profiling/server.sh b/3rdparty/amd/profiling/server.sh
new file mode 100755
index 000000000..f877e6c7a
--- /dev/null
+++ b/3rdparty/amd/profiling/server.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
+export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
+
+# Get the current timestamp
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+
+# Define the log file with a timestamp
+LOGFILE="sglang_server_log_$TIMESTAMP.json"
+
+# Run the Python command and save the output to the log file
+loadTracer.sh python3 -m sglang.launch_server \
+    --model-path /sgl-workspace/sglang/dummy_grok1 \
+    --tokenizer-path Xenova/grok-1-tokenizer \
+    --load-format dummy \
+    --quantization fp8 \
+    --tp 8 \
+    --port 30000 \
+    --disable-radix-cache 2>&1 | tee "$LOGFILE"
diff --git a/3rdparty/amd/profiling/torch_profiler.patch b/3rdparty/amd/profiling/torch_profiler.patch
new file mode 100644
index 000000000..40f557406
--- /dev/null
+++ b/3rdparty/amd/profiling/torch_profiler.patch
@@ -0,0 +1,25 @@
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..6ecd78c 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -240,7 +240,6 @@ class Scheduler:
+             )
+             self.profiler = torch.profiler.profile(
+                 activities=[
+-                    torch.profiler.ProfilerActivity.CPU,
+                     torch.profiler.ProfilerActivity.CUDA,
+                 ],
+                 with_stack=True,
+@@ -1033,9 +1032,11 @@ class Scheduler:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+         self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        if self.tp_rank == 0:
++            with open(f"stats_repro_{int(time.time())}.txt", "w") as f:
++                print(self.profiler.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=-1), file=f)
++                print("Profiling stats done.")
++
+         logger.info("Profiler is done")
diff --git a/3rdparty/amd/tuning/TUNING.md b/3rdparty/amd/tuning/TUNING.md
new file mode 100644
index 000000000..e7b9b2049
--- /dev/null
+++ b/3rdparty/amd/tuning/TUNING.md
@@ -0,0 +1,118 @@
+## Tuning SGLang Infer System with AMD GPUs
+This AppNote describes the SGLang performance tuning technical, code harness and running steps for systems with AMD Instinct GPUs.
+Harness code, examples and steps are provided in detail, to facilitate easy reproduce & use to tune performance towards workloads.
+Three primary runtime areas are covered:
+
+## 1. Triton Kernels
+To maximize Triton kernel efficiency, several strategies can be employed:
+
+### Key Environment Variables:
+- **num_stages**: Adjusts the number of pipeline stages to optimize kernel efficiency based on the specific type of operations (e.g., General Matrix Multiplication - GEMM).
+- **waves_per_eu**: Controls the usage of Vector General Purpose Registers (VGPR) to enhance occupancy, thereby improving latency or throughput.
+- **BLOCK_M, BLOCK_N, BLOCK_K**: Tunable tile sizes that assist in balancing memory transfer and computational efficiency.
+- **matrix_instr_nonkdim**: Optimizes the usage of Matrix-Fused Multiply-Add (MFMA) instructions for specific kernel types, such as Flash Attention.
+- **OPTIMIZE_EPILOGUE**: An environment variable that can be set to `1` to enhance performance by eliminating the `convert_layout` operation in the kernel's epilogue.
+```python
+@triton.autotune(configs=[
+        triton.Config({'waves_per_eu': 1}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 1}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 1}, num_warps=16, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=16, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=16, num_stages=1),
+    ], key=['BLOCK_N', 'NUM_TOKEN_BLKS'], use_cuda_graph=True)
+@triton.jit
+def _triton_kernel_funtion():
+    ...
+```
+## 2. Torch Tunable Operations
+**TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
+
+### Key Environment Variables:
+1. **PYTORCH_TUNABLEOP_ENABLED**:
+   - Default: `0`
+   - Set to `1` to enable TunableOp.
+
+2. **PYTORCH_TUNABLEOP_TUNING**:
+   - Default: `1`
+   - Set to `0` to disable tuning. If a tuned entry is not found, it will run the tuning step and record the entry when PYTORCH_TUNABLEOP_ENABLED is enabled.
+
+3. **PYTORCH_TUNABLEOP_VERBOSE**:
+   - Default: `0`
+   - Set to `1` to enable verbose output for TunableOp.
+
+### Usage Example:
+To enable TunableOp and tuning, and optionally enable verbose mode, you can run the following command in your terminal:
+
+```bash
+#Tuning
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=1 your_script.sh
+
+#Inference with tuning op
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_script.sh
+
+#Print out the log
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 PYTORCH_TUNABLEOP_VERBOSE=1 your_script.sh
+
+```
+## 3. Torch Compilation
+
+
+The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (conv) operations in PyTorch using Inductor, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better performance.
+
+To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape.
+
+### Key Configurations:
+1. **Max Autotune**:
+   - Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`.
+
+2. **Fine-Grained Control**:
+   - Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`.
+   - Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune.pointwise = True`.
+
+3. **Backend Selection**:
+   - Use `torch._inductor.max_autotune_gemm_backends` to limit backends to TRITON for better performance.
+
+4. **Freezing for Inference**:
+   - Use `torch._inductor.config.freezing=True` to enable constant folding optimizations.
+
+5. **Debugging**:
+   - Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor.
+
+### Example Code Block:
+```bash
+#Gemm Tuning
+TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 your_script.sh
+
+#Specify your backend to TRITON for Gemm Tuning
+TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=TRITON your_script.sh
+
+#Inference with large improvement on AMD GPU
+TORCHINDUCTOR_FREEZING=1 your_script.sh
+```
+## 4. Fused MOE kernel
+To maximize moe kernel efficiency, need to use below scripts to find out the best launch configuration
+
+### Key parameters:
+- **--model**: what moe model type to do tuning, it will automatically decide the size of d_model, model_intermediate_size, num_layers
+- **--tp-size**: simulate the whole model run configuration to set the dimension size using tp correctly
+- **--batch**: M dimension size of moe kernel, for prefill moe kernel the value is batch*input_len, for decode moe kernel the value is batch
+- **--dtype**: computation type
+
+```bash
+#Tuning
+#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
+#so we can tune decode moe use below command
+python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
+# and use this command to tune prefill moe
+python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32768"
+```
+
+## Reference
+
+For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link:
+
+[ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)
diff --git a/3rdparty/amd/tuning/benchmark_moe_rocm.py b/3rdparty/amd/tuning/benchmark_moe_rocm.py
new file mode 100644
index 000000000..af596d218
--- /dev/null
+++ b/3rdparty/amd/tuning/benchmark_moe_rocm.py
@@ -0,0 +1,380 @@
+import argparse
+import json
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+from tqdm import tqdm
+from transformers import AutoConfig
+
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe,
+    get_config_file_name,
+)
+
+padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
+def main(model, tp_size, dtype: str, batches):
+    method = fused_moe
+
+    for bs in batches:
+        run_grid(int(bs), model=model, method=method, tp_size=tp_size, dtype=dtype)
+
+
+def prune_configs(M, N, K, configs):
+    pruned_configs = []
+    elemBytes_a = 1  # [DV Note] Hard-coded for float16 (2 bytes)
+    elemBytes_b = 1  # [DV Note] Hard-coded for float16 (2 bytes)
+
+    mfma = 16 if M < 32 or N < 32 else 32
+
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = False
+    if M >= 2048 and N >= 2048:
+        large_gemm = True
+
+    for config in configs:
+        BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+        BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+        matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+        # kpack = config.get("kpack")
+        if matrix_instr_nonkdim > mfma:
+            continue
+        if mfma == 4 and BLOCK_SIZE_K < 64:
+            continue
+        # some layouts could not work properly in case
+        # number elements per thread is less 1
+        if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+            continue
+        SPLIT_K = 1  # config.get("SPLIT_K")
+        GROUP_M = config.get("GROUP_SIZE_M")
+        if matrix_instr_nonkdim > BLOCK_SIZE_M or matrix_instr_nonkdim > BLOCK_SIZE_N:
+            continue
+        if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M:
+            continue
+        if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N:
+            continue
+        # Skip BLOCK_SIZE that is too large compare to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+            continue
+        if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+            continue
+        # skip large split_k when not necessary
+        if SPLIT_K != 1 and not need_split_k(M, N, K):
+            continue
+        # skip split_k that leads to EVEN_K = false
+        leap = SPLIT_K * BLOCK_SIZE_K
+        modv = K % leap
+        if modv != 0:
+            continue
+        # skip large GROUP_M
+        if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+            continue
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        LDS = (
+            BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a
+            + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b
+        )
+        if LDS > 65536:
+            continue
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+                continue
+            if BLOCK_SIZE_K < 64:
+                continue
+            if num_warps < 4:
+                continue
+
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+def union_of_list_of_dicts(l1, l2):
+    result = []
+    temp_list = l1.copy()
+    temp_list.extend(l2)
+    for myDict in temp_list:
+        if myDict not in result:
+            result.append(myDict)
+
+    return result
+
+
+def run_grid(bs, model, method, tp_size, dtype: str):
+
+    config = AutoConfig.from_pretrained(model)
+
+    top_k = config.num_experts_per_tok
+    d_model = config.hidden_size
+    model_intermediate_size = config.intermediate_size
+    num_layers = config.num_hidden_layers
+    hidden_states_dtype = config.torch_dtype
+
+    if config.num_experts_per_tok:
+        if config.architectures[0] == "Grok1ModelForCausalLM":
+            num_total_experts = config.num_experts
+        else:
+            num_total_experts = config.num_local_experts
+    else:
+        raise ValueError(f"Unsupported Mixtral model {model}")
+
+    # tp_size = 2
+    num_warmup_calls = 10
+    num_calls = 30
+
+    num_warmup_trials = 1
+    num_trials = 1
+
+    full_configs = []
+
+    block_m_range = [16, 32, 64, 128, 256]
+    block_n_range = [16, 32, 64, 128, 256]
+    block_k_range = [32, 64, 128, 256]  # MUST >= 32
+    num_warps_range = [1, 2, 4, 8]
+    group_m_range = [1, 4, 8, 16, 32]
+    # For now we see better perf with num_stages=0 for all gemm configs we care
+    # But keep this explicit so that we do not forget we may need to set it to
+    # other values in the future
+    num_stage_range = [2]
+    waves_per_eu_range = [0, 1, 2, 4, 8]
+    # Remove 32 because of triton compiling error
+    matrix_instr_nonkdim_range = [16]
+    kpack_range = [1, 2]
+
+    for block_size_m in block_m_range:
+        for block_size_n in block_n_range:
+            for block_size_k in block_k_range:
+                for group_size_m in group_m_range:
+                    for num_warps in num_warps_range:
+                        for num_stages in num_stage_range:
+                            for waves_per_eu in waves_per_eu_range:
+                                for matrix_instr_nonkdim in matrix_instr_nonkdim_range:
+                                    for kpack in kpack_range:
+                                        full_configs.append(
+                                            {
+                                                "BLOCK_SIZE_M": block_size_m,
+                                                "BLOCK_SIZE_N": block_size_n,
+                                                "BLOCK_SIZE_K": block_size_k,
+                                                "GROUP_SIZE_M": group_size_m,
+                                                "num_warps": num_warps,
+                                                "num_stages": num_stages,
+                                                "waves_per_eu": waves_per_eu,
+                                                "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                                                "kpack": kpack,
+                                            }
+                                        )
+
+    M1 = bs * 2
+    N1 = model_intermediate_size * 2 // tp_size
+    K1 = d_model
+    prune_configs_1 = prune_configs(M1, N1, K1, full_configs)
+
+    M2 = bs * 2
+    N2 = d_model
+    K2 = model_intermediate_size // tp_size
+    prune_configs_2 = prune_configs(M2, N2, K2, full_configs)
+
+    configs = union_of_list_of_dicts(prune_configs_1, prune_configs_2)
+
+    print(
+        f"{bs=} || {len(full_configs)=} | {len(prune_configs_1)=} | \
+            {len(prune_configs_2)=} | {len(configs)=}"
+    )
+
+    best_config = None
+    best_time_us = 1e20
+
+    print(f"{tp_size=} {bs=}")
+
+    for config in tqdm(configs):
+        # warmup
+        try:
+            print(config)
+            for _ in range(num_warmup_trials):
+                run_timing(
+                    num_calls=num_warmup_calls,
+                    bs=bs,
+                    d_model=d_model,
+                    num_total_experts=num_total_experts,
+                    top_k=top_k,
+                    tp_size=tp_size,
+                    model_intermediate_size=model_intermediate_size,
+                    method=method,
+                    config=config,
+                    dtype=dtype,
+                    hidden_states_dtype=hidden_states_dtype,
+                )
+        except triton.runtime.autotuner.OutOfResources:
+            continue
+
+        # trial
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                bs=bs,
+                d_model=d_model,
+                num_total_experts=num_total_experts,
+                top_k=top_k,
+                tp_size=tp_size,
+                model_intermediate_size=model_intermediate_size,
+                method=method,
+                config=config,
+                dtype=dtype,
+                hidden_states_dtype=hidden_states_dtype,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+            model_dur_ms = kernel_dur_ms * num_layers
+
+            if kernel_dur_us < best_time_us:
+                best_config = config
+                best_time_us = kernel_dur_us
+
+                tqdm.write(
+                    f"{kernel_dur_us=:.1f} {model_dur_ms=:.1f}"
+                    f" {bs=} {tp_size=} {top_k=} {num_total_experts=} "
+                    f"{d_model=} {model_intermediate_size=} {num_layers=}"
+                )
+
+    print("best_time_us", best_time_us)
+    print("best_config", best_config)
+
+    # holds Dict[str, Dict[str, int]]
+    filename = get_config_file_name(
+        num_total_experts,
+        model_intermediate_size // tp_size,
+        "float8" if dtype == "float8" else None,
+    )
+    print(f"writing config to file {filename}")
+    existing_content = {}
+    if os.path.exists(filename):
+        with open(filename, "r") as f:
+            existing_content = json.load(f)
+    existing_content[str(bs)] = best_config
+    with open(filename, "w") as f:
+        json.dump(existing_content, f, indent=4)
+        f.write("\n")
+
+
+def run_timing(
+    num_calls: int,
+    bs: int,
+    d_model: int,
+    num_total_experts: int,
+    top_k: int,
+    tp_size: int,
+    model_intermediate_size: int,
+    method,
+    config,
+    dtype: str,
+    hidden_states_dtype,
+) -> float:
+    shard_intermediate_size = model_intermediate_size // tp_size
+
+    hidden_states = torch.rand(
+        (bs, d_model),
+        device="cuda:0",
+        dtype=hidden_states_dtype,
+    )
+
+    w1 = torch.rand(
+        (num_total_experts, 2 * shard_intermediate_size, d_model + padding_size),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w2 = torch.rand(
+        (num_total_experts, d_model, shard_intermediate_size + padding_size),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+
+    if dtype == "float8":
+        w1 = w1.to(torch.float8_e4m3fnuz)
+        w2 = w2.to(torch.float8_e4m3fnuz)
+        w1_scale = torch.ones(
+            num_total_experts, device=hidden_states.device, dtype=torch.float32
+        )
+        w2_scale = torch.ones(
+            num_total_experts, device=hidden_states.device, dtype=torch.float32
+        )
+        a1_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
+        a2_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
+
+    gating_output = F.softmax(
+        torch.rand(
+            (num_calls, bs, num_total_experts),
+            device=hidden_states.device,
+            dtype=torch.float32,
+        ),
+        dim=-1,
+    )
+
+    ##################################
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for i in range(num_calls):
+        hidden_states = method(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            gating_output=gating_output[0],
+            topk=top_k,
+            renormalize=True,
+            inplace=True,
+            override_config=config,
+            use_fp8=dtype == "float8",
+        )
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="benchmark_mixtral_moe",
+        description="Benchmark and tune the fused_moe kernel",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="auto",
+        choices=["float8", "float16", "bfloat16"],
+        help="Data type used for fused_moe kernel computations",
+    )
+    parser.add_argument("--model", type=str, default="hpcai-tech/grok-1")
+
+    parser.add_argument("--tp-size", type=int, default=2, help="Tensor paralleli size")
+    parser.add_argument("-b", "--batches", type=str)
+
+    args = parser.parse_args()
+
+    batches = args.batches.split(",")
+
+    sys.exit(main(args.model, args.tp_size, args.dtype, batches))
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..9c422689c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023-2024 SGLang Team
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..459dfa573
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,46 @@
+.PHONY: check-deps install-deps format update help
+
+# Show help for each target
+help:
+	@echo "Available targets:"
+	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
+
+check-deps: ## Check and install required Python formatting dependencies
+	@command -v isort >/dev/null 2>&1 || (echo "Installing isort..." && pip install isort)
+	@command -v black >/dev/null 2>&1 || (echo "Installing black..." && pip install black)
+
+install-deps: ## Install Python formatting tools (isort and black)
+	pip install isort black
+
+format: check-deps ## Format modified Python files using isort and black
+	@echo "Formatting modified Python files..."
+	git diff --name-only --diff-filter=M | grep '\.py$$' | xargs -I {} sh -c 'isort {} && black {}'
+
+FILES_TO_UPDATE = docker/Dockerfile.rocm \
+                 python/pyproject.toml \
+                 python/sglang/version.py \
+                 docs/developer_guide/setup_github_runner.md \
+                 docs/get_started/install.md \
+                 docs/platforms/amd_gpu.md \
+                 docs/platforms/ascend_npu.md \
+				 benchmark/deepseek_v3/README.md
+
+update: ## Update version numbers across project files. Usage: make update <new_version>
+	@if [ -z "$(filter-out $@,$(MAKECMDGOALS))" ]; then \
+		echo "Version required. Usage: make update <new_version>"; \
+		exit 1; \
+	fi
+	@OLD_VERSION=$$(grep "version" python/sglang/version.py | cut -d '"' -f2); \
+	NEW_VERSION=$(filter-out $@,$(MAKECMDGOALS)); \
+	echo "Updating version from $$OLD_VERSION to $$NEW_VERSION"; \
+	for file in $(FILES_TO_UPDATE); do \
+		if [ "$(shell uname)" = "Darwin" ]; then \
+			sed -i '' -e "s/$$OLD_VERSION/$$NEW_VERSION/g" $$file; \
+		else \
+			sed -i -e "s/$$OLD_VERSION/$$NEW_VERSION/g" $$file; \
+		fi \
+	done; \
+	echo "Version update complete"
+
+%:
+	@:
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..451a6d424
--- /dev/null
+++ b/README.md
@@ -0,0 +1,78 @@
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
+
+[![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
+![PyPI - Downloads](https://static.pepy.tech/badge/sglang?period=month)
+[![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+[![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+[![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/sgl-project/sglang)
+
+</div>
+
+--------------------------------------------------------------------------------
+
+| [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
+| [**Documentation**](https://docs.sglang.ai/)
+| [**Join Slack**](https://slack.sglang.ai/)
+| [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
+| [**Roadmap**](https://github.com/sgl-project/sglang/issues/7736)
+| [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
+
+## News
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf), [Highlights](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_highlights.pdf), [AITER/MoRI](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_aiter_mori.pdf), [Wave](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_wave.pdf)).
+- [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
+- [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
+- [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
+- [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
+- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
+- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
+- [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+
+<details>
+<summary>More</summary>
+
+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
+- [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
+- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
+- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
+- [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
+
+</details>
+
+## About
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
+
+## Getting Started
+- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
+- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
+- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
+- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
+
+## Benchmark and Performance
+Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).
+
+## Roadmap
+[Development Roadmap (2025 H2)](https://github.com/sgl-project/sglang/issues/7736)
+
+## Adoption and Sponsorship
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
+
+<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
+
+## Contact Us
+For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
+
+## Acknowledgment
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
diff --git a/assets/logo.png b/assets/logo.png
new file mode 100644
index 000000000..2a8bc258f
Binary files /dev/null and b/assets/logo.png differ
diff --git a/assets/logo.svg b/assets/logo.svg
new file mode 100644
index 000000000..4d6393926
--- /dev/null
+++ b/assets/logo.svg
@@ -0,0 +1 @@
+<svg width="2392" height="729" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" overflow="hidden"><defs><filter id="fx0" x="-10%" y="-10%" width="120%" height="120%" filterUnits="userSpaceOnUse" primitiveUnits="userSpaceOnUse"><feComponentTransfer color-interpolation-filters="sRGB"><feFuncR type="discrete" tableValues="0.835294 0.835294"/><feFuncG type="discrete" tableValues="0.345098 0.345098"/><feFuncB type="discrete" tableValues="0.086275 0.086275"/><feFuncA type="linear" slope="0.400000" intercept="0.000000"/></feComponentTransfer><feGaussianBlur stdDeviation="7.638889 7.638889"/></filter><filter id="fx1" x="-10%" y="-10%" width="120%" height="120%" filterUnits="userSpaceOnUse" primitiveUnits="userSpaceOnUse"><feComponentTransfer color-interpolation-filters="sRGB"><feFuncR type="discrete" tableValues="0.835294 0.835294"/><feFuncG type="discrete" tableValues="0.345098 0.345098"/><feFuncB type="discrete" tableValues="0.086275 0.086275"/><feFuncA type="linear" slope="0.400000" intercept="0.000000"/></feComponentTransfer><feGaussianBlur stdDeviation="6.111111 6.111111"/></filter><filter id="fx2" x="-10%" y="-10%" width="120%" height="120%" filterUnits="userSpaceOnUse" primitiveUnits="userSpaceOnUse"><feComponentTransfer color-interpolation-filters="sRGB"><feFuncR type="discrete" tableValues="0.835294 0.835294"/><feFuncG type="discrete" tableValues="0.345098 0.345098"/><feFuncB type="discrete" tableValues="0.086275 0.086275"/><feFuncA type="linear" slope="0.400000" intercept="0.000000"/></feComponentTransfer><feGaussianBlur stdDeviation="7.638889 7.638889"/></filter><clipPath id="clip3"><path d="M1756.97 902.5C1708.88 902.5 1667.36 908.097 1632.43 919.291 1597.49 930.485 1568.73 945.491 1546.14 964.309 1523.56 983.128 1506.82 1004.62 1495.94 1028.8 1485.05 1052.97 1479.61 1078.03 1479.61 1103.99 1479.61 1125.73 1483.26 1144.63 1490.57 1160.69 1497.89 1176.75 1507.63 1190.86 1519.82 1203.03 1532.01 1215.2 1545.74 1225.74 1561.01 1234.66 1576.29 1243.59 1591.97 1251.86 1608.05 1259.48 1624.14 1267.11 1639.9 1274.49 1655.34 1281.63 1670.77 1288.77 1684.58 1296.39 1696.77 1304.5 1708.96 1312.61 1718.71 1321.62 1726.02 1331.51 1733.33 1341.41 1736.99 1353.17 1736.99 1366.8 1736.99 1378.48 1734.23 1389.59 1728.7 1400.14 1723.18 1410.68 1714.48 1420.01 1702.62 1428.12 1690.76 1436.23 1675.65 1442.64 1657.29 1447.35 1638.93 1452.05 1616.91 1454.4 1591.24 1454.4 1576.61 1454.4 1561.5 1453.59 1545.9 1451.97 1530.3 1450.35 1514.7 1448.08 1499.1 1445.16 1483.51 1442.23 1468.07 1438.67 1452.79 1434.45 1437.52 1430.23 1423.06 1425.69 1409.41 1420.82L1386.5 1533.25C1415.1 1542.33 1444.83 1549.14 1475.71 1553.69 1506.58 1558.23 1540.05 1560.5 1576.12 1560.5 1621.3 1560.5 1662.24 1555.47 1698.96 1545.41 1735.69 1535.35 1766.97 1520.92 1792.8 1502.1 1818.64 1483.28 1838.54 1460.57 1852.52 1433.96 1866.49 1407.36 1873.48 1377.67 1873.48 1344.9 1873.48 1323.48 1869.74 1304.58 1862.27 1288.2 1854.79 1271.81 1844.96 1257.38 1832.77 1244.88 1820.59 1232.39 1806.78 1221.44 1791.34 1212.03 1775.9 1202.62 1760.06 1193.94 1743.81 1185.99 1727.56 1178.05 1711.72 1170.5 1696.28 1163.36 1680.85 1156.23 1667.04 1148.76 1654.85 1140.98 1642.66 1133.19 1632.83 1124.67 1625.36 1115.42 1617.88 1106.18 1614.15 1095.39 1614.15 1083.06 1614.15 1072.35 1616.66 1062.21 1621.7 1052.64 1626.74 1043.07 1634.62 1034.8 1645.34 1027.82 1656.07 1020.85 1669.72 1015.33 1686.29 1011.27 1702.87 1007.22 1722.69 1005.19 1745.76 1005.19 1757.46 1005.19 1769.81 1005.76 1782.81 1006.89 1795.81 1008.03 1808.73 1009.65 1821.56 1011.76 1834.4 1013.87 1846.75 1016.3 1858.61 1019.06 1870.47 1021.82 1881.11 1024.82 1890.54 1028.06L1911.5 922.455C1901.75 919.859 1890.86 917.344 1878.84 914.91 1866.82 912.477 1854.06 910.368 1840.57 908.584 1827.09 906.799 1813.28 905.339 1799.14 904.203 1785 903.068 1770.95 902.5 1756.97 902.5ZM756 866 3148 866 3148 1595 756 1595Z" fill-rule="evenodd" clip-rule="evenodd"/></clipPath><clipPath id="clip4"><rect x="1.24353" y="1.08319" width="595.223" height="724.834"/></clipPath><clipPath id="clip5"><rect x="1.41663" y="1.66669" width="592.667" height="675.667"/></clipPath><clipPath id="clip6"><rect x="-2078.55" y="-2770.64" width="374073" height="374073"/></clipPath><image width="512" height="512" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAYAAAD0eNT6AAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEbAAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAACQAAAAAQAAAJAAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAAgCgAwAEAAAAAQAAAgAAAAAAGcBUEAAAAAlwSFlzAAAWJQAAFiUBSVIk8AAAQABJREFUeAHt3QmcXFWd6PF/VXVWwq6OozAgIjoiDKQTEBjHwCij7Nla2Yc1Ks6Mz+d7b5aPb9pZ3rzZfM8NCKC4IWMnnYAs6jAIjEImJJ1gEFRkGQVxYSdk63TVnf+pTnf6dN3uruUu55z7u34wfW/de5bvuVX/f926iwgTAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIBAwgKlhMujOAQQQCA1gUikdNsRss+OmbJvKZLZUUm2Vrtkx8Nr5Be9IrXUKqZgBAIUIAEIcFDpEgKhCNx8tLyuWpF3RZGcKCU5Uvv1Fv1vdkz/duiyJ/S/72uS8L1yTe5ctEF+GLMeixBAYJcACQC7AgIIOCVw+6EyY9u+crZEcoE27J36X7nNBm7S7W6oTpPretbI822WwWYIBCtAAhDs0NIxBPwSuH6BzNx7i3xYv+1/VFv+mwm2/hU9KnBtrSp/17NRnkmwXIpCwGsBEgCvh4/GIxCGQH+3nKqH+D+tvTkktR6V5AU9b+Djm9bLVb2cL5AaMwX7I0AC4M9Y0VIEghMw3/r33CJ/r4H5j7PqnH7o3V2pyrlnbpSns6qTehBwUYAEwMVRoU0IFECg/1g5QKpym3bVnNyX9fQrPbHgjIXr5f6sK6Y+BFwRaPfkGlfaTzsQQMBDgZXdejZ/Ve7VpucR/I3Yb+g1g3eu6JaTPeSjyQgkIsARgEQYKQQBBJoVuGm+HFiN6sH/wGa3SXG97bWanLx0g3w3xTooGgEnBUgAnBwWGoVAmAJ9x8l+lZ2yRnt3mDM9NCcHVuUE7hvgzIjQkIwE+AkgI2iqQaDoAnopXqkyKNepgzvB3wxKJPtGZVlxS3fsDYaKPmz0P2ABEoCAB5euIeCSgF7q9wG91G+hS20a05bDB8vyT2Pm+ROB4AX4CSD4IaaDCOQvsOpIeU00XX6sLdkn/9ZM2IKa3oTo+CUDsnbCNXgBgYAEugLqC12ZRMAcfr15vhygd0N7kx7ufL1+E5ulH3Z7l0rykmaBW/T1J6sij/Ssk19OUgwvIdCWQDRD/lYPtbsc/E2/yvp++JT++/a2OunBRnctkK5nXpaD9YP/t/RzYI7ef2GGfhYMVkvyin4ePCXb5Imeh2TQg67QxAQEOAKQAKKrRdQfpFKWJfrEtN/XNv6u/rdfE239pX4gmDOi/02fsraSe6g3IcYqkwrsut7/MV1p+qQrOvJiVJP3LNkg33akOR01o+9wmV6eLX+ghbxLE7Df0w/8w/XvaZMUOqSv/Vg/A/5d1/+OPkfhNv0M2DbJ+rzksQAJgMeDF9d0802/f568Vwf2I/r6SfpfJW69JpeZbwK362VSn+QyqSbFWK1BYOU8+aTuj/+t4QV3F9y1eH39veNuC6do2apj5DD9Rv8RDeLv01WbSfwnKnGzltFfLsunFq6TByZaieV+CpAA+Dlusa02NzXRQ5j/pIN6ROwKnS28Tz9QPsrvo50hFm1r/QY6pzJLDy2L7O1R3yP9Bnzo4nXyuEdtrje1b64cWinL/9UZc7Jl0id5f0uPjvwvPTqyyTcX2hsvQAIQ7+LV0puPld8YGpKr9EMr7TOs9eZpcv22inz0vLXysldINDYXgf75crl+g1yeS+WdVKoPDVo8IH/TSRFZbmsO9XfNkv+tRwA/pvXOSLFu8xPB1VsG5U8v2CRbUqyHojMQSDpDzKDJVDFWQD9g3zVU1UNz6Qd/U63ZXy6ZVZWB1XOle2w7+BuBWIFIlsUud31hSd7rehNH2rf67XJwZbbco8H/L3RZmsHfVGlOHP/wnOn6GTBfjjILmPwV4AiAv2MnGvz/UL9dXatdMG/KrKcdegbx+YsGZEXWFVOfHwL9c/Vs+nL9rn9+NNhu5ZAe6drf9SNd+hnwe/oZcLM2PY8rLLZpADl70fp6/bYec14IcATAi2FqbKSeWPUX+sa/Xl/JI/ibBs3Qqwtu1HZc3Ng6liCgAmX5oMcOXTN2yu+43H492fcM/QwwVyvkEfwNzSw96tCv7bjEZSfaNrEACcDENs6+om+4Xs28Xfh9sqLtuFbbs8RZLBqWi8CtR8i+WrHX+0Wlok8sdHRaMU8WaNO+rv/NzLmJ5iqja1fNkytybgfVtyFAAtAGWp6bmOCv9f9lnm0YV7fZh76ij3c9dtxyZgsssGN6/cjQbJ8J9NvtG11svwbbt2nUNYf98w7+Izz6a6B8ZpW51TOTVwIkAB4Nl4PBf0Rvpl5+uMI86W1kAf8WV0CDQUn/d7n3AvqQINf6cP0Cmam+N+h/eznWtpL+JHglRwIcG5UpmkMCMAWQKy87HPxHiA7UJ719emSGf4srsHqe3nXOtSf+tTcczt27YK8t8kntypHtdSf1rTgSkDpxshWQACTrmUppHgT/4X6X5Nxdv02m4kChfgjot9MP+dHSKVvp1D3x9WmKc/WkP9cvq+RIwJS7lTsrkAC4MxaxLfEm+O9qve5Qn6wfAo7tDQtDF9CfgV6vfTwtkH66dQ/8knxGXX34zOZIgCdvAB92Jk8ok2+mb8F/l8DRN82T9ySvQYk+CHQN1n/7z+vS1ESJNIr9MtECOyhMv/2fqJsf30ERWW/KkYCsxduojwSgDbQsNvE0+Ndp9AjAf8/CiDrcEjCPmtWxD+aacL3vtTvPAih5+Z7iSIBbb9GG1pAANJDkv8Dn4G/0NAic1H+0HJS/JC3IUuCFLXpjmlL9J4Asq02tLo1eP06t8BYK7psvr9XVfT2qxpGAFsY661VJALIWn6I+34P/ru6V9JdKr28CM8Uw8XKcQBTMyX+md9uqO2RDXDezXqYf0uaRvuaGO75OHAlwdORIABwamECCf11U7wtwskO0NCVlgfrz5/XIT8rVZFd8Sdb0PCROXAWg0dPXb/9jx4sjAWM1HPmbBMCRgQgp+BtS/Rng+OXdMs0RXpqRskCtVr8LnN4ZOpip34We9C0Vc7ttn07+m4xt+EgAtw2ezCjT10gAMuWOryy04L+rl3P2L8th8T1maUgCeunfLA1SFwbUp6GusjiRAMjP5M2aTLt2179OhpqfAzrRS3hbEoCEQVstLtDgX2coR3Joqx6s759AZaj+G3Uwt4HWgHvbmWvlVy6MRHkoyCSanwNc2Lm0DSQAOQ5EyMHfsNZKcnCOvFSdlUDk9WN/G5VqclXjwnyW6Lk0B+dTc+q1ciQgdeKpKyABmNoolTVCD/4GrVQL6tBlKvuB74Wuni9HaR+O8b0fY9r/+A82yB1j5vP+07nnESQIwpGABDHbKYoEoB21DrcpQvCvE5Vljw6p2NxxAb1ZTlDPgdfD/1f36sErh9i9fqRyE44cCWgCKa1VSADSkp2g3MIEf9P/SLZPwMDiAAT6umVvHeOzA+jKSBd2lAflSyMzLvyrJ1fudKEdKbeBIwEpA09UPAnARDIpLC9U8Fc//f3ylRQYKdIRAb3hvznzP5ijPBpsVyzaJL92hLfejKgkW1xqT4pt4RLBFHEnKpoEYCKZhJcXLfgbvqgmTyfMSHEOCWhwutyh5nTclGrJnZP/RjqjSYkzDyQaaVOK//JzQIq4cUWTAMSpJLysiMG/TliWnyRMSXGOCKyYJwu0KYc70pwkmrFp6Tq5L4mCkiyjWpVHkyzPg7L4OSDDQSIBSBm7sMFfpNo1XX6UMi/F5yRQCe3SP5Erc6KctNpKlzysK+i5iYWaOBKQ0XCTAKQIXeDgb1Q3nnmvbE6Rl6JzEjBPp9PD/2flVH0a1W7eVpEb0yi40zIX3S/PaTT8YafleLg9RwIyGDQSgJSQCx78jep3UqKl2JwF9A6Pl2gTpufcjOSqj+Qr562Vl5MrMNmS9Ov/3cmW6E1pHAlIeahIAFIAJvjrLSZr0pcCLUXmLNCrQ6ufypfm3IxEq9fn7F6daIHJF7Yy+SK9KZEjASkOFQlAwrgE/zroIws3yEDCtBTngMDb5supEtbtab931oA86ADthE14cEDu0RefmnCF8F+oHwnQz9Zl4Xc12x6SACToTfDfhVmSTyXISlEuCYR28l/k3qV/44e7d/jOhM48n2B8+zKa1ysi5apVPEo4UW4SgIQ4Cf6jkM9Mr8kXR+f4IxiB/qPlIP0UPjmYDok8+/KessqH/uhTAc1VCkU/qZYjAQnvrCQACYAS/HcjRpH85ekDsnX3Ev4KRqCr/tQ//ck8kKkkn7/obj9uV73wAXlRLwb8h0DkO+kGRwI60Ru3LQnAOJBWZwn+u8X0bOUHaofINbuX8FcoAn2H18/6vyiU/mg/Ir3JznU+9WfWS/KPmgRwcy29y7h+1nyGnwM633tJADowJPhbeNsrJbmoZ4VUraXMBCFQmSlLNPi8JojOaCc0gHyrZ4Nfd9k75VHZURu+BJP3GElAIm9FEoA2GQn+Npwe+v/ownXygL2UuWAESvXD/8F0R69k8PKkuqUb5Ls6CH8dzkB01BPOCeiIT6/p7XD7Qm5O8B837PphumTAzw/UcT1hNkag71h5qy4+IeYlXxc9WTtYbve18Q+ul7/WyMd9NoYHkHMCOtiRSQBaxCP4jwMryYoH18mHxy1lNiCBSq0+vuaDNohJD/8v9/mnql69LHDmS3KBHsXgbpvDeyRHAtp8Z5IAtABH8Lex9IN01bM1Obd3+Dpl+0XmghDQk//m6KfruUF0ZrgTO6dV5Xrf+2POB9DLbU/Xftzle18Saj9HAtqAJAFoEo3gb0OZ4P9cJO9fNiA77VeYC0mga6acp2O9Vyh90mRm9Zkb5ekQ+mMut50eyWkcCRgdzfqRAK4OGPWY8g8SgCmJRAj+NhLB3/YIeU6f+nd5SP2renry30RjUE8CzJEAfg4YISIJGJFo4t9gftdroq9trULwt9kI/rZHyHMr5svx+uS/ewPq4w8XrZfD9UNPd+Owplu6ZfZgWW7Rnp0UVs/a7o3mrvJHOt6fa7uEAmzIEYBJBpngb+MQ/G2P0OcqAd73P8Tgb/ZDjgQ0vBs5EtBA0riABKDRpL6E4G/DEPxtj9DnVh0j++uYLwmon1tnDMpXA+pPQ1dIAhpISAIaSOwFJAC2R32O4G+jEPxtjyLM1WpysfZzZjB9LcnXTntQXgimPxN0hCSgAYYkoIFk9wISgN0W9b8I/jYIwd/2KMKcjnlJPzUvC6mv5apcHVJ/JusLSUCDDklAA8nwAhKAMTAE/zEY+ifB3/Yoylz/XH3kb0neFFB/71+4QQYC6s+UXSEJaCAiCWgg4VbAoyQE/1GK+h8Ef9ujUHNl7vsfwniTBDSMIknAOBKOACgIwd/eKwj+tkeR5vqPlQP0TPlTA+rzi3rHvMLeN58koGFPJgkYQ1L4BIDgP2Zv0D8J/rZH4eZq9Rv/dIXSb92frzdBMJT+tNOPkSRAE7s729k+wG1IAnYNaqETAIK//dYm+NseRZu7a4F0yfDZ/6F0Parpg39C6Uwn/TBJwLRIziAJGFUkCVCKwiYABP/RN0L9D4K/7VHEuRc2y0I9+e/1ofRdg913etbLj0PpT6f9IAloECx8ElDIBIDgb78RCP62R1Hn9N6pHwyp7/rt/6qQ+pNEX0gCGhQLnQQULgEg+NtvAIK/7VHUuZXd8hbt+4KA+v8LfVrlNwLqT2JdIQlooCxsElCoBIDgb+/4BH/bo8hz+kHwAe2/HjUPY9JP9Gt5VPXEY0kS0GBTyCSgMAkAwd/e4Qn+tkeR5/qOk1l6+P/8gAyGoi65NqD+pNIVkoAG1sIlAYVIAAj+9o5O8Lc9ij5X3ilnq8F+wThEcsvitfJUMP1JsSMjSYBW8W8pVuNT0YVKAoJPAAj+9nuP4G97MFc/7h/UyX+lEif/tbJfmySgOk3O0G1IAobhCpMEBJ0AEPztjwGCv+3BnMjKeXK0OswLyOKxTeu54U2r49mzRraRBFhqhUgCgk0ACP7Wzswd/mwO5nYJ6AfAh4PCiOSqXtHbGTG1LEAS0EAWfBIQZAJA8Ld3ZL752x7MDQusPkr20X3jfQF57KjW5MsB9SfzrpAENJAHnQQElwAQ/O0dmOBvezC3W0AP+V6oc3vsXuL9X1/v2SjPeN+LnDtAEtAwAMEmAUElAAR/e8cl+NsezNkC+ql2mb3E87kaJ/8lNYIkAQ2SQSYBwSQABH97hyX42x7M2QL93XKiLjncXurxXCTfX7xB/sPjHjjXdJKAhiEJLgkIIgEg+Ns7KsHf9mAuRqAc1n3/9bFmV8b0kkUdCpAENAAGlQR4nwAQ/O0dlOBvezDXKNA3X16rl4Wc1fiKt0s2d02XG71tveMNN0nArBflNG3mbY43NavmBZMEeJ0AEPzt/Z3gb3swFy9QqdV/+58W/6p/S3W///KZ98pm/1ruT4tPeVR2aBKwWFtMEjA8bEEkAd4mAAR/+8OD4G97MBcv0LdUKvrIn0viX/VzaVcky/1suV+tJgloGC/vkwAvEwCCv70jEvxtD+YmFuh6on4o96CJ1/DrFb3t73fPGpAH/Wq1v60lCWgYO6+TAO8SAIK/vQMS/G0P5iYX0Kf+BXXf/5re+W/yHvNq0gIkAQ2i3iYBXiUABH97xyP42x7MTS7QP18O0ZP/3j35Wl69+szsF2WVVy0OpLEkAQ0D6WUS4E0CQPC3dziCv+3B3NQC+gn1AV3Lm/f8lD2K5DoTiKZcjxVSESAJaGD1Lgnw4sOA4G/vaAR/24O5qQVuP1Rm6OH/C6de05s1atWKXOtNawNt6EgSoPvWrYF2sdVueZUEOJ8AEPzt/Y/gb3sw15zA1n1lqR7+f01za3ux1jd77pcnvGhp4I00ScDsF2QJScDoQHuTBDidABD8R3eo+h8Ef9uDueYF9BMpqJP/NJnh5L/mhz/1NUkCGoi9SAKcTQAI/vYORfC3PZhrXmDlXDlS1z6++S2cX/Nn1UPkW863smANJAloGHDnkwAnEwCCv70jEfxtD+ZaFAjtvv8lubpnhVRbVGD1DARMElDbKov5OWAU2+kkwLkEgOA/uuPU/yD42x7MtSbQd7jM0Tf5Oa1t5fTag11l+YLTLSx443oekkGSAGsncDYJcCoBIPhbO43+zCmrnovk/csGZKf9CnMINCdQninn6360V3Nre7HWqjPXyq+8aGmBG0kS0DD4TiYBziQABH97hyH42x7MtSegt8o11/6HM5U4+c+XwSQJaBgp55KAUkMTc1hA8LfRCf62B3PtCazqlt/V32K/297WTm71w0Xr5XD90NK3CJMvAvoz1PTybOnX6GceKcyk+6/uw3+k+/Ln8sbI/QgAwd/eBQj+tgdz7QtEgZ38p++NKwn+7e8PeW3JkYAGeWeOBOSaABD87R2D4G97MNe+wC3d8ir9nryo/RKc23LrzB1yg3OtokFNCYwkAbryLU1tEP5KTiQBuSUABH97Dyf42x7MdSagZ41eoiXM7KwUd7aOIrnhtAflBXdaREtaFTBJQHWbLNHtSAKG8XJPAnJJAAj+9luH4G97MNeZgO5PJf2R8dLOSnFr60oky91qEa1pR4AkoEEt1yQg8wSA4G/vAAR/24O5zgVumifv0VIO7bwkZ0pYu3CDDDjTGhrSkQBJQANfbklApgkAwd8eeIK/7cFcMgI1Ceu+/3r4n/v+J7NrOFMKSUDDUOSSBGSWABD87QEn+NsezCUjcNN8OVBLOiWZ0hwopSQvzBBZ4UBLaELCAiQBDaCZJwGZJAAEf3ugCf62B3PJCdQiWaalVZIrMfeSvnD6gGzNvRU0IBUBkoAG1kyTgNQTAIK/PcAEf9uDueQElnfLNN2/LkquxNxLivROhtfk3goakKrAmCTgG6lW5E/hmSUBqSYABH97jyP42x7MJSuwf1kWaomvS7bUXEv7t0X3yyO5toDKMxHYlQQs1cpIAobFM0kCUksACP72+4bgb3swl7xAOQrr5D8pc/Jf8nuJuyWSBDSMTepJQCoJAMHfHkiCv+3BXPICK7vlLbqfvTP5knMr8RfPVuXW3Gqn4lwESAIa2FNNAhJPAAj+9gAS/G0P5lISGL7vv94qP5hpOY/BDmYsW+oISUADV2pJQKIJAMHfHjiCv+3BXDoCet//2Rr5z0+n9FxKHapOk+tyqZlKnRAgCWgYhlSSgMQSAIK/PWAEf9uDufQEBkXO0Qf/7JteDdmWrO+db/SskZ9nWyu1uSZAEtAwIoknAYkkAAR/e6AI/rYHcykLlOUDKdeQafF66R93/stU3N3KSAIaxibRJKDjBIDgbw8Qwd/2YC5dgdXz5Bj99t+dbi2Zlv7Yg+vkO5nWSGVOC5AENAxPYklARwkAwd8eGIK/7cFc+gJ6578Ppl9LdjXoff+v7BXRxxkwIbBbgCRgt8WuvxJJAtpOAAj+9oAQ/G0P5tIXWH2U7CMl6Um/psxq2FauyJcyq42KvBIYSQL0s/ZmrxqeXmM7TgLaSgAI/vaIEvxtD+ayEYi66rf9nZ1NbenXor/99+md/55LvyZq8FXAJAG1bdJDEjA6gh0lAS0nAAT/Ufj6HwR/24O5bAR0vzPX/JsH/wQz1Wqc/BfMYKbYEZKABty2k4CWEgCCvw1P8Lc9mMtOYOV8OUn3vzdnV2O6NWlfHlgyIGvTrYXSQxEgCWgYybaSgKYTAIK/DU7wtz2Yy1YgtPv+66fX57IVpDbfBUgCGkaw5SSgqQSA4G9DE/xtD+ayFejvlt/UGs/IttZUa3tpy065MdUaKDxIAZKAhmFtKQmYMgEg+NvABH/bg7nsBaKyXKa1Tsu+5tRq/PIFm2RLaqVTcNACI0mAdvKmoDvafOeaTgImTQA0+P+l1mn+Y1IBgj+7Qd4CfUulou/ui/NuR5L169mM1yRZHmUVT8AkAdVt8j7tOUnA8PA3lQRMmACsmidXaDm9w2Xx/wR/9gEXBMqPy+najoNcaEtCbbhn0Xr5QUJlUUyBBUgCGga/ngToo8IvbHhl14LYBEB/YzxHA95nJtqoaMsJ/kUbcXf7Wy6Fdec/vZiR+/67u7t517IxScA3vGt8Og0u6f01rtEv9O+OK95cS2xNfcfKWytVWacLg7nBiNXBFmcI/i2CsXpqAqu75Y21kjyiFcQm7qlVnF7Bz8x6UQ485VHZkV4VlFxEgb7DZXpllnxd+35WEfvf0OeSvCBDcvTijfLTsa9ZHyS3HyozNPgbNIK/IhD8x+4q/J23QK1c//ZvvWfzblOH9V9L8O9QkM1jBUaOBOhnOLcNNkL6uPBSl3zFnEM0Fsz6MNm6j3xMX3zb2BWK+jfBv6gj72a/TXKub+IL3GxdW62qlbvk2ra2ZCMEmhAwScBzkSzVVTkxUBH0QVvvqDwhl46lG00A+o+VA/T3gD8b+2JR/yb4F3Xk3e339n3qZzi/2t0WttayqCS3L/wP+c/WtmJtBFoTWDYgO83VARwJGHX7277jZL+RudEEoDRU//a/x8gLRf2X4F/UkXe73xowg3rsrx7N4OQ/t3e5YFrHkQBrKPev7JQ/HllSTwBuPUL21Q+YS0YWFvVfgn9RR97tfq86Rn5HA+bb3W5lS637We0N8u2WtmBlBDoQMEcCno3qj87m5wCRP9KTJOcYznoCsGOmnK1/1xd0YOz1pgR/r4cv6MZHNflQSB3U3yKv6lkh1ZD6RF/cFxhNAiJZ7X5rU23hfl2zZJGpYfgngEjOSbU6xwsn+Ds+QAVu3s0nyJ7afZOghzINTuuS60PpDP3wS6CeBIieT1PwJEBjXv0zpXxLt7xKh/B4v4YxudYS/JOzpKTkBao76mf+myQgjKkkK89cK78KozP0wkeB0SSg2FcH/L75GaA8WJF36CA23BDIx4Fttc0E/1bFWD9rgZrIsqzrTLO+Uo2T/9L0pezmBPg5QKbpzwDHlfVQyLHNkYW1FsE/rPEMsTcr5so7NDM/IpS+6Y3JH144IPeG0h/64bfA6JGAgv4coDHwBJMAvNnvYWy99QT/1s3YInuByvCd/7KvOKUa9THGn9OERt9+TAi4IVDwJODNZX03vtGNocimFQT/bJyppTMBc26O7qv1M3U7K8mZrV/ZVpavOtMaGoLALoHCJgEleZM+XEz2LcqeQPAvykj738/BslymvZjhf09Ge3DDeWvl5dE5/kDAIQGTBFQPqd82+F8cala6TYlkv7Ke/leIu/8R/NPdlyg9OYFec3luVE8Akis075IiuSbvJlA/ApMJmHtTVN8g5+k6RUkC9jQfNBobw5/0SMeMA18K5jGq4Q9YgXt4RLe8V7v/hoAI1iwekA0B9YeuBCrwwuNifhafHmj3xnerZG4EtGX80kDnT922j/TXn6oWaAfpViACod33X7j0L5A9M+huLO+WaXpTnK/rl8WQzr2ZbMxeMQnAs5OtEdhrp+ojj79mBjqwftGdQAT6uuW3tCvvCaQ7phvPvTxHVgTUH7oSoICJCfuX9NB/SRYG2L2JuvS8SQAem+jVEJeb7O5VJVnNkYAQR9f/PlVEPqC90H/CmPS+/9dfdLdsD6M39CJEgQJ+8x8ZxsfMSYA/Gpkr0L8cCSjQYPvSVb0153R9P17sS3ubaGdUrsi1TazHKgjkIlDQb/4j1o+UNUNfMzJXpH85ElCk0fajr7ue0PUbfrR26lbqe+yORffLI1OvyRoIZC9Q4G/+w9iR3FfeXpHv6dxQ9vxO1MiRACeGgUYYAT37+IMhSdT0sb8h9Ye+hCNQ8G/+ZiCr+jzu75V33ZzjrnCGtrWecCSgNS/WTkdg1Vz5bS3ZPJgrlOnp50RuC6Uz9CMcgcJ/8zdDWZJ7egbkJXMSoJRKcqP5t8ATlwgWePBd6LreJ/9D2g7NR8OYopIsN3dXC6M39CIUgZHgr++0Ip3t3zB8+kFTv9lRPQGYVpOv6xpFuhywAUQXkATEqbAsdQG97/9s/UA6N/WKsqtgqNYln8+uOmpCYGoBDvsPG2nwf7m0c/jS3HoCcPqAbNUPIH6vIwmY+l3EGokL7Cxr8I+CeibHTT1r5OeJQ1EgAm0KjHzzNz/5tllEMJvpuUZXL3xAXjQdqicA5g/9GeBT+s/z5u+CT5wYWPAdIOvu6xtyWdZ1pllfqcaXiTR9Kbs1Ab75W16bqyX5fyNLRhMAvVznOT0K8PGRF4r8r8kSuVlQkfeA7Pq+sluO1W//3dnVmHpNjy7cIIU9qTh1XSpoSYBv/jaXftH/q5518suRpaMJgFlQPViW6z/mskAm/TmA2wazG6QtUC6Hdemfen1OE2g9qMGEQL4CfPO3/fVN+cDQVvn02KVWAmAeh6g3IT1bV9AreJg4EsA+kKbArUfIvnojrqVp1pFx2duq0+TLGddJdQg0CPDNv4Fki6blZ/c8JINjX7ESAPPC4rXylAY+kwRYK47dqGB/cySgYAOeVXd3TK/f9nd2VvVlUM+/6Ml/nEeUATRVTCzAN/8Gm1opkouWDDTe9r8hATCbLlovd2gScKH+WWsoqoALOBJQwEFPuct6OK6k/7s85WqyLZ4ribL1prYGAb75N5Dox4x8bNFA/BM5YxMAU4QmAeZGAeYDiiTAgHCJ4LAC/5+IwOp58i4t6LBECnOgEPP74uJ1ss6BptCEggrwzT924Hs1lo+e9T9+jQkTALPi4vX1m3mQBOxW42ZBuy34qwMB/e0/qPv+67eMz3bAwaYIdCTAN/9Yvl6N4Z+IfWXXwkkTALMOSUADH0lAAwkLWhHo75bf1ONyp7WyjePrvrRlcPjWoo63k+YFKDAS/PU9Vejb+44b2imDv1l/ygTArEQSYBSsiSTA4mCmJYFS/cY/01raxuGV9b7/X7pgk2xxuIk0LVABgn/swDYV/M2WTSUAZkWSAKNgTSQBFgczzQjctUC69HKcS5tZ16N1rvGorTQ1EAGCf+xANh38zdZNJwBmZZIAo2BNJAEWBzNTCbywRc7QQ5Wvn2o9X17X3/7vXrJOHvKlvbQzDAGCf+w4thT8TQktJQBmA5IAo2BNJAEWBzOTCYR28l+NS/8mG25eS0GA4B+L2nLwN6W0nACYjUgCjII1kQRYHMzECazuljfq8pPiXvN02S+fq8lqT9tOsz0UIPjHDlpbwd+U1FYCYDYkCTAK1kQSYHEwM15AT5b7kC5r+z03vrzc50ty3bIB2Zl7O2hAIQQI/rHD3HbwN6V19GFEEtAwICQBDSQsMAJ9x8ksvVmOubtmKFNVhuS6UDpDP9wWIPjHjk9Hwd+U2FECYAogCTAK1kQSYHEwYwQqQ/I+/Wf/gDRuW7xRfhpQf+iKowIE/9iB6Tj4m1I7TgBMISQBRsGaSAIsDmb00r+w7vzHyX/s1BkIEPxjkRMJ/qbkRBIAUxBJgFGwJpIAi6O4M6vny1Ha+2MCEnh80zr514D6Q1ccFCD4xw5KYsHflJ5YAmAKIwkwCtZEEmBxFHNGn6Z1RWA9X97LQ8ICG1K3ukPwjx2PRIO/qSHRBMAUSBJgFKyJJMDiKNZMX7fsrYf/zw6o14OlQfliQP2hK44JEPxjByTx4G9qSTwBMIWSBBgFayIJsDiKM1MRuUB7u0cwPY5kxaJN8utg+kNHnBIg+McORyrB39SUSgJgCiYJMArWRBJgcRRkpiTmcdrhTJz8F85YOtYTgn/sgKQW/E1tqSUApnCSAKNgTSQBFkfYM6vmyzu1h28LpZelSB7W9/S9ofSHfrgjQPCPHYtUg7+pMdUEwFRAEmAUrIkkwOIIdya4+/6LfDbc0aJneQkQ/GPlUw/+ptbUEwBTCUmAUbAmkgCLI7yZvqPl1dqrswLq2Svbu+SGgPpDVxwQIPjHDkImwd/UnEkCYCoiCTAK1kQSYHGENVOpyGXaoxmh9EpvY/zV89bKy6H0h37kL0Dwjx2DzIK/qT2zBMBURhJgFKyJJMDiCGOm17yvonoCEEaHhntxTUidoS/5ChD8Y/0zDf6mBZkmAKZCkgCjYE0kARaH/zNHzpVTpCQH+9+T0R7cu2S9bByd4w8EOhAg+MfiZR78TSsyTwBMpSQBRsGaSAIsDr9nojL3/fd7BGl9WgIE/1jZXIK/aUkuCYCpmCTAKFgTSYDF4eeM3vnvt7Tlf+Bn62Nb/exLe0h/7CssRKAFAYJ/LFZuwd+0JrcEwFROEmAUrIkkwOLwb6Zr+Nu/3gAwmOkLF90t24PpDR3JRYDgH8uea/A3Lco1ATANIAkwCtZEEmBx+DPTd7hM17PlL/anxVO2NIoqct2Ua7ECApMIEPxjcXIP/qZVuScAphEkAUbBmkgCLA4/ZiozZYme/f8aP1rbVCu/vWSt/KSpNVkJgRgBgn8MiogTwd+0zIkEwDSEJMAoWBNJgMXhwUwprJP/VPwqD9RpoqMCBP/YgXEm+JvWOZMAmMaQBBgFayIJsDjcnek7Vt6qrTvB3Ra23LInq2+Q21reig0QUAGCf+xu4FTwNy10KgEwDSIJMArWRBJgcbg5UxmSK7RlJTdb10arIrmmZ4VU29iSTQouQPCP3QGcC/6mlc4lAKZRJAFGwZpIAiwOt2b05L85pZKc51arOmrNUFdNvtBRCWxcSAGCf+ywOxn8TUudTABMw0gCjII1kQRYHO7MVGbJuXr2/17utKjDlpRk9Zkb5ekOS2HzggkQ/GMH3Nngb1rrbAJgGkcSYBSsiSTA4nBm5nJnWpJEQ2qc/JcEY5HKIPjHjrbTwd+02IvfLPvnySXaVvMwEqcTFgOa0XTbrBdl8SmPyo6M6qOaCQRWHSPHRTW5b4KXfVz8o0Xr5a36waAHNZgQmFqA4B9r5HzwN632IqByJKBhB+NIQANJPgs0+H8wn5rTqTUqydUE/3RsQyyV4B87ql4Ef9NyLxIA01CSAKNgTSQBFkf2M/rtf3+tdWn2NadW47Zal3wltdIpOCgBgn/scHoT/E3rvUkATGNJAoyCNZEEWBwZz0RykdY4M+Na06zuaz1r5Pk0K6DsMAQI/rHj6FXwNz3wKgEwDSYJMArWRBJgcWQzoz+Ql6JILsumtmxq0ccYX51NTdTiswDBP3b0vAv+phfeJQCm0SQBRsGaSAIsjvRn+ufKyVrLYenXlFkN65bcL+szq42KvBQg+McOm5fB3/TEywTANJwkwChYE0mAxZHyzPBjf1OuJLvi9UZG3Pc/O24vayL4xw6bt8Hf9MbbBMA0niTAKFgTSYDFkc5M/7FygJ4pf2o6pedS6ovTavL1XGqmUi8ECP6xw+R18Dc98joBMB0gCTAK1kQSYHGkMFMTc+OfrhRKzqvIL54+IFvzqpx63RYg+MeOj/fB3/TK+wTAdIIkwChYE0mAxZHczF0LNPDX5OLkSsy/pGpFrs2/FbTARQGCf+yoBBH8Tc+CSABMR0gCjII1kQRYHMnMPP+KnKX3z3x9MqU5UEpJvtOzVh52oCU0wTEBgn/sgAQT/E3vgkkATGdIAoyCNZEEWBwJzJTCuvOf3vCXk/8S2C1CK4LgHzuiQQV/08OgEgDTIZIAo2BNJAEWR/szfXPlUA2YJ7ZfgnNb/vLZSG52rlU0KFcBgn8sf3DB3/QyuATAdIokwChYE0mAxdHeTKUsV+iWegFAIFNJrl02IDsD6Q3dSECA4B+LGGTwNz0NMgEwHSMJMArWRBJgcbQ203eczNItLmhtK6fXrsqQfN7pFtK4TAUI/rHcwQZ/09tgEwDTOZIAo2BNJAEWR/MzXUPyfl17v+a3cHtNvZXxrYs3yk/dbiWty0qA4B8rHXTwNz0OOgEwHSQJMArWRBJgcTQ3o/f9D+qxv3opIyf/NTf0wa9F8I8d4uCDv+l18AmA6SRJgFGwJpIAi2PymdXz5ShdY/7ka3n16uM/2CB3eNViGpuKAME/lrUQwd/0vBAJgOloPQmIZJn+WTPzTEIS0OROUIvkw02u6sVqevj/6l7eB16MVZqNJPjH6hYm+JveFyYBMJ1dPCDX6WVcJAEGY3giCRiRmODf1UfJPvqS+f0/lGlHeVC+FEpn6Ed7AgT/WLdCBX8jUKgEwHSYJMAoWBNJgMVhz1SnyYW6ZA97qddzfYs2ya+97gGN70iA4B/LV7jgbxQKlwCYTpMEGAVrIgmwOHbPlCK5bPec/3+Vypz85/8ott8Dgn+sXSGDv5EoZAJgOk4SYBSsiSTA4hDp767f9e/wcYt9nt206H5Z43MHaHv7AgT/WLvCBn+jUdgEwHSeJMAoWBNJwFiOcmCX/olcObZ7/F0cAYJ/7FgXOvgbkUInAAaAJMAoWBNJgHL0zZfX6gmjZ1kyfs9s3laRG/3uAq1vR4DgH6tW+OBvVAqfABgEkgCjYE2FTwIqNblURaZZKj7PRPKV89bKyz53gba3LkDwjzUj+O9iIQHYBUES0PBGKWwS0LdUKvrIH5MABDNV9Nr/YDpDR5oSIPjHMhH8x7CQAIzBIAkYgzH8ZyGTgMoTcqp2/6AGDX8XfO+sAXnQ3+bT8lYFCP6xYgT/cSwkAONASALGgUgh7xgY1n3/Iy79a9irA15A8I8dXIJ/DAsJQAwKSUADSmGOBPTPl0O09yc3CPi74NmX95RV/jaflrciQPCP1SL4x7JwEuAELJwYGANTjCRg+FbR4STGJfn8RXfL9pjxZFFgAgT/2AEl+MeyDC8M54Nukk62+xJHAhrkgk4C+g6X6Xry3x829NrfBVG1qs+/YApegOAfO8QE/1iW3QtJAHZbxP5FEtDAEmwS0DVbluq1/69p6LGnC/Spf9/q2SCPetp8mt2kAME/ForgH8tiLyQBsD1i50gCGliCTAKiKLA7/5U4+a9hzw1sAcE/dkAJ/rEsjQtJABpNYpeQBDSwBJUErJwv5p7/JzT00t8FP6sdLLf723xaPpUAwT9WiOAfyxK/kAQg3iV2aT0JEPmQvqhHV5lUwCQBX+oN4WTSSD4c2Igu71kh1cD6RHd2CfTqe+7VIl/Vc1YWgjIq8InF6+UTo3P8MaUACcCURPYKuoMt1/B/uS6t2a8Udu59R8yXz/rcez35b46+Ec7xuQ/j2r6zqypfHLeM2YAEjpgnn4pK0hNQlzrtign+vZ0WUrTtSQDaGHF+DhiHpr+d66Nzvb11bnmmnK+HdPYa1yufZ1eduVGe9rkDtH1igVXz5TJ9NbQjVhN3eOpXzGH/3qlXY43xAiQA40WanCcJGAdVks+snCtHjlvqxWypVP9A9aKtzTRS+3NVM+uxjn8C5lwVPVn1//vX8tRazGH/DmhJADrAIwmw8GaWyvL5Xs/OB+ifVz/x72irJ37P/HDhOvl3v7tA6+ME9ChVqRTJlfra7LjXC7iMb/4dDjoJQIeAJAEW4Dw9H+ACa4nrM6Fd+qf3/S9xkqrre11b7dND/xfqhr/X1sbhbcQ3/wTGVD8rmJIQqP8GXtITBD37BpxE38eV8VR1m7yx5yEZHLfcudlbuuVVgyV5Uhs207nGtdegrTN2yAGnPSgvtLc5W7kqcNcC6Xp+s/xEz/o/2NU2ZtguLvVLCJsjAAlBciRgFPIAPanu7NE5h//Q4H+xNi+U4K8HiOVrBH+Hd7gOmvb8K/qeIvgbQYJ/B/vR+E1JAMaLdDBfTwK4T4D+UOn+Gcrm91S9nNPbKxfidtNyVa6OW86yIAQ+EEQvOusEwb8zv4atSQAaSDpbwH0C6n7zVh0jh3Umme7Wq+fLH2gK8KZ0a8m09PsXbpCBTGukskwE+o+Wg7Si4zKpzN1KCP4pjA0JQAqo5kiAnq17hRatXzSLOemlSotd7jn3/Xd5dGjbWIGoXH8vFfl8LU74G7tDJPg3CUCCmGOLWjSgh2MLfMdATYAWjPVw6e+b5suB2p5TXWpTh215cXpN+josg80dFdCf1BY42rQsmmW++fdmUVER6yABSHHUi3wkQA99HN+3VCop8rZdtN4g39zK2cm2tdMptb7+9AHZ2s62bOOFQEgPqWoFnG/+rWi1sS4JQBtorWxS4CMBc6Y9Wf+m3QpX6uuaJ6jpkZmLU68ouwoifSiFufyUKUCBvvnyWu3WfgF2baou8c1/KqEEXicBSABxqiKKeiRgaKd7JwLuX64/Pe11U42ZR6/f2bNefuxRe2lqCwJdNTm0hdVDWZVv/hmNJAlARtBFPBKgtwZ+TUa8TVdTDuzOf/r7MPf9b3r0PVzRwfdQyop8808ZeGzxJABjNVL+u36fgEiWaTXFeJRwTfZMmbSl4ld2y1v09/J3trSR2yv/4pma3OJ2E2ldRwKRzOloe7825lK/jMeLBCBj8EL9HFCW6RnzTl5dWT6oK4R0OdU1ywZk5+Sd5lWfBfSbQpfP7W+h7Rz2bwErqVVJAJKSbKGcwvwcEMnmFlhSXbXvOJmlkf/8VCvJtvAhvY7humyrpLYcBLblUGfWVXLYP2vxXfWRAOQEX4QjARpwX86Jt6Hayk45R8/+37fhBV8XRHLL4rXylK/Np93NCZRr7iTRzbW45bX45t8yWXIbkAAkZ9lyScEfCYjk8ZZR0tsgqHupc/JfejuKSyWXS/JTl9qTcFv45p8waKvFkQC0Kpbw+gEfCYi2dcmjCXO1VdzKuXKkbjivrY3d3OixTevlTjebRquSFBicXn8PhXjSMCf8JbmjtFkWCUCbcEluFuiRgB+ft9aNnwD0csRzkxyv3MuK5KreolxJkjt2vg3oWSPmHIDQ7vPAYf98d6vR2kkARiny/SPASwTvyVfUqt3pBxNZLZ16ZnupIl+cejXWCEVAL1116b3UKSuH/TsVTHB7EoAEMTstKqSfA/Rpe9/q1COJ7fuPlQO0nDcmUZYjZfQtul+ec6QtNCMDAT0P4I4MqsmiCr75Z6HcQh0kAC1gZbFqCD8HmLP/N+/pRgJQqsrvZjFuWdWhP2dcnVVd1OOGwEt7yO3akhfdaE3breCbf9t06W1IApCebdsl+34kQA9Z3njR3bK9bYAEN9S2vCXB4vItKpLv67f/Nfk2gtqzFjDvJT2itiLrehOsj2/+CWImWRQJQJKaCZbl8ZGAalSRf06QorOiSvKmzgpwaGvu++/QYGTelE9qjT5eDcA3/8x3leYrJAFo3irzNX08EqCH//uWrJWfZI41QYWlSF41wUu+Ld7cNUO+5lujaW8yAksG5Ed6NOumZErLrBS++WdG3V5FJADtuWW2lWdHArZFVfmzzHCaqEg/NIN4mIr248tn3hv8XeGaGNHirlKJ5H9q7534aa2JUeCbfxNIea9CApD3CDRRvy9HAvR3yr9avNGxO5eVJIh9XDvByX9NvFdCXmXhgDymR7T+wYM+8s3fg0EyTQziw9ET646a6cGRgHtqh8g/dtTJNDaO5JU0is24zH9ftF5+kHGdVOegwDMif6PNcvlEUL75O7jfTNQkEoCJZBxc7uyRgEh+3lWVc3pWSNVBtpccbFNrTYpkeWsbsHaoAubxz9XhO1u6eC8Ivvl7tuORAHg2YK4dCTDX/Oud6U49c6M87SjlY462q9lmPTPrJelvdmXWC1+g5355QkryXu2pS0e3+Obv4a5HAuDhoJkjAfpo22Xa9Ly/cb+oJ6edotemf99VRn1q3o9cbVsz7dL2f/6UR2VHM+uyTnEEFq+TdZp8L9Ieb8m515EmI3++eL18Iud2UH0bAiQAbaC5sEk9CRB5v7Ylr+DwlH4AvUPf+Pe64DFhGyJZN+Fr7r9QGyrJNe43kxbmIaDnhdyhdwZ4l9ad188BQ/oF4FJNRv4uj/5TZ+cCJACdG+ZWggbflXok4HhtQKaHuTXw36n1HuPDiWkL18tD6vOr3Aaps4q/WT/c21kZbB2wwOIN8h+VkhytXbwv424+qVcknLhkvXwh43qpLkEBEoAEMfMoSo8EbCgPyTwNyuaNqAl5qtNL+qb/E30W/cla7y9SrSmhwtXFmHw7oeIyLSYqcfJfpuCeVnbWOnny2UgWaPN79b+07xOgu6V8oTpNjtLzkb6n9TF5LKBjyRSKwIr5cnx5+DrhExLu0w6NotdPq0jvmWv9+za9ap68W9v/rwmbpF3cz6pvkEMcvbIi7b5TfpsCfXPl0EpF/o+mveYR2El/wbtHf3L4U3PUoc3msZljAiQAjg1IEs1ZNV/eGdXkT/TknFO0vBkdlPmUnoT21cqQfMbhs/yn7F6vfhAe0a0/k5Tk4ClXdmWFSD6mR1nceaaCKy60oymBVXPlt6OyfERX7tH/9mlqo/iVtulRv1v0a/+nnT/fJ779LJ1EgARgEhzfX7r1CNl3cKacpd8GTtJvwO/U/hw4RZ+26esbdae4S4PlHZvWyXd7/XwASUM39SjAFWrw2YYX3FzwbHWbvKHnIacu83JTilZNKnD9Apm511a9ZLCqJwuW5ERd+TD9rzLpRiKP6+v3aPJ/59ay3HLeWnl5ivV52VMBEgBPB66dZvd1y97lirypVJPXaTDcQzP7PTSzf0H/fb5akf98+H75aW8gAX+8T/2D8JX6Q4oOGP+ac/N8+3duSEJp0O2Hyozt++hngH4Z0Ft3z9GjBPvqv5vLNdmsnwVPV7fLIySeoYz21P0gAZjaiDUCEVjVLUv1Q67P8e78bMugvPWCTblf3+04E81DAIFOBZI+SaTT9rA9AqkJ6FnLK7Tw21OroPOCa/rA9wsJ/p1DUgICCEwtQAIwtRFrBCRQGpSLtDtO3rZYD8f989L1cndA3HQFAQQcFiABcHhwaFryAos2ya/1a3aed1CcqFP3znxRPj7RiyxHAAEEkhYgAUhalPKcF1i6Qb6r37bfpw2tOtHYSL4/Y4eczj3/nRgNGoFAYQRIAAoz1HR0rIDexvhmvczpQl22c+zyrP/WKzAeni7yrtMelBeyrpv6EECg2AIkAMUe/0L3ftE6uUGTgDP0aEAu1znrpZjfHJou7zh9QJ4t9EDQeQQQyEWABCAXdip1RUCTgG9FVTlS27M2wzaZR6j+/Q/Wy2k9a+T5DOulKgQQQGBUQL/8MCGAgLlByra95X9oYP5z1ZiVosgGPQnxI+Y8hBTroGgEEEBgSgESgCmJWKFIAqvfLgdHQ/JxPTx/vvZ7WmJ9j+TnpbJ8XG+v/KXeQO+2mJgVBSGAQCYCJACZMFOJbwL9R8tB0iWX6tPPztOjAge32f6q3nnwm/qExuv2nSO3nXi3DLVZDpshgAACiQuQACROSoEhCeiRgFL/MdKtz084SYP5O/Ss/bdq/w7S/+IeqLJFlz+s69xXK8t908pyj4+PTw5p/OgLAghMLEACMLENryAQK2DOF3jl1fKq8qDsqU9anF6bIc9t0f8uulu2x27AQgQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAA6eprwAAAPiSURBVAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQACBIgj8F1Cx5oYEZQsEAAAAAElFTkSuQmCC" preserveAspectRatio="none" id="img7"></image><clipPath id="clip8"><rect x="0" y="0" width="371301" height="371301"/></clipPath><clipPath id="clip9"><rect x="-0.363636" y="-2770.64" width="371302" height="374073"/></clipPath><image width="512" height="512" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAYAAAD0eNT6AAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEbAAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAACQAAAAAQAAAJAAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAAgCgAwAEAAAAAQAAAgAAAAAAGcBUEAAAAAlwSFlzAAAWJQAAFiUBSVIk8AAAQABJREFUeAHt3QmcXFWd6PF/VXVWwq6OozAgIjoiDKQTEBjHwCij7Nla2Yc1Ks6Mz+d7b5aPb9pZ3rzZfM8NCKC4IWMnnYAs6jAIjEImJJ1gEFRkGQVxYSdk63TVnf+pTnf6dN3uruUu55z7u34wfW/de5bvuVX/f926iwgTAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIBAwgKlhMujOAQQQCA1gUikdNsRss+OmbJvKZLZUUm2Vrtkx8Nr5Be9IrXUKqZgBAIUIAEIcFDpEgKhCNx8tLyuWpF3RZGcKCU5Uvv1Fv1vdkz/duiyJ/S/72uS8L1yTe5ctEF+GLMeixBAYJcACQC7AgIIOCVw+6EyY9u+crZEcoE27J36X7nNBm7S7W6oTpPretbI822WwWYIBCtAAhDs0NIxBPwSuH6BzNx7i3xYv+1/VFv+mwm2/hU9KnBtrSp/17NRnkmwXIpCwGsBEgCvh4/GIxCGQH+3nKqH+D+tvTkktR6V5AU9b+Djm9bLVb2cL5AaMwX7I0AC4M9Y0VIEghMw3/r33CJ/r4H5j7PqnH7o3V2pyrlnbpSns6qTehBwUYAEwMVRoU0IFECg/1g5QKpym3bVnNyX9fQrPbHgjIXr5f6sK6Y+BFwRaPfkGlfaTzsQQMBDgZXdejZ/Ve7VpucR/I3Yb+g1g3eu6JaTPeSjyQgkIsARgEQYKQQBBJoVuGm+HFiN6sH/wGa3SXG97bWanLx0g3w3xTooGgEnBUgAnBwWGoVAmAJ9x8l+lZ2yRnt3mDM9NCcHVuUE7hvgzIjQkIwE+AkgI2iqQaDoAnopXqkyKNepgzvB3wxKJPtGZVlxS3fsDYaKPmz0P2ABEoCAB5euIeCSgF7q9wG91G+hS20a05bDB8vyT2Pm+ROB4AX4CSD4IaaDCOQvsOpIeU00XX6sLdkn/9ZM2IKa3oTo+CUDsnbCNXgBgYAEugLqC12ZRMAcfr15vhygd0N7kx7ufL1+E5ulH3Z7l0rykmaBW/T1J6sij/Ssk19OUgwvIdCWQDRD/lYPtbsc/E2/yvp++JT++/a2OunBRnctkK5nXpaD9YP/t/RzYI7ef2GGfhYMVkvyin4ePCXb5Imeh2TQg67QxAQEOAKQAKKrRdQfpFKWJfrEtN/XNv6u/rdfE239pX4gmDOi/02fsraSe6g3IcYqkwrsut7/MV1p+qQrOvJiVJP3LNkg33akOR01o+9wmV6eLX+ghbxLE7Df0w/8w/XvaZMUOqSv/Vg/A/5d1/+OPkfhNv0M2DbJ+rzksQAJgMeDF9d0802/f568Vwf2I/r6SfpfJW69JpeZbwK362VSn+QyqSbFWK1BYOU8+aTuj/+t4QV3F9y1eH39veNuC6do2apj5DD9Rv8RDeLv01WbSfwnKnGzltFfLsunFq6TByZaieV+CpAA+Dlusa02NzXRQ5j/pIN6ROwKnS28Tz9QPsrvo50hFm1r/QY6pzJLDy2L7O1R3yP9Bnzo4nXyuEdtrje1b64cWinL/9UZc7Jl0id5f0uPjvwvPTqyyTcX2hsvQAIQ7+LV0puPld8YGpKr9EMr7TOs9eZpcv22inz0vLXysldINDYXgf75crl+g1yeS+WdVKoPDVo8IH/TSRFZbmsO9XfNkv+tRwA/pvXOSLFu8xPB1VsG5U8v2CRbUqyHojMQSDpDzKDJVDFWQD9g3zVU1UNz6Qd/U63ZXy6ZVZWB1XOle2w7+BuBWIFIlsUud31hSd7rehNH2rf67XJwZbbco8H/L3RZmsHfVGlOHP/wnOn6GTBfjjILmPwV4AiAv2MnGvz/UL9dXatdMG/KrKcdegbx+YsGZEXWFVOfHwL9c/Vs+nL9rn9+NNhu5ZAe6drf9SNd+hnwe/oZcLM2PY8rLLZpADl70fp6/bYec14IcATAi2FqbKSeWPUX+sa/Xl/JI/ibBs3Qqwtu1HZc3Ng6liCgAmX5oMcOXTN2yu+43H492fcM/QwwVyvkEfwNzSw96tCv7bjEZSfaNrEACcDENs6+om+4Xs28Xfh9sqLtuFbbs8RZLBqWi8CtR8i+WrHX+0Wlok8sdHRaMU8WaNO+rv/NzLmJ5iqja1fNkytybgfVtyFAAtAGWp6bmOCv9f9lnm0YV7fZh76ij3c9dtxyZgsssGN6/cjQbJ8J9NvtG11svwbbt2nUNYf98w7+Izz6a6B8ZpW51TOTVwIkAB4Nl4PBf0Rvpl5+uMI86W1kAf8WV0CDQUn/d7n3AvqQINf6cP0Cmam+N+h/eznWtpL+JHglRwIcG5UpmkMCMAWQKy87HPxHiA7UJ719emSGf4srsHqe3nXOtSf+tTcczt27YK8t8kntypHtdSf1rTgSkDpxshWQACTrmUppHgT/4X6X5Nxdv02m4kChfgjot9MP+dHSKVvp1D3x9WmKc/WkP9cvq+RIwJS7lTsrkAC4MxaxLfEm+O9qve5Qn6wfAo7tDQtDF9CfgV6vfTwtkH66dQ/8knxGXX34zOZIgCdvAB92Jk8ok2+mb8F/l8DRN82T9ySvQYk+CHQN1n/7z+vS1ESJNIr9MtECOyhMv/2fqJsf30ERWW/KkYCsxduojwSgDbQsNvE0+Ndp9AjAf8/CiDrcEjCPmtWxD+aacL3vtTvPAih5+Z7iSIBbb9GG1pAANJDkv8Dn4G/0NAic1H+0HJS/JC3IUuCFLXpjmlL9J4Asq02tLo1eP06t8BYK7psvr9XVfT2qxpGAFsY661VJALIWn6I+34P/ru6V9JdKr28CM8Uw8XKcQBTMyX+md9uqO2RDXDezXqYf0uaRvuaGO75OHAlwdORIABwamECCf11U7wtwskO0NCVlgfrz5/XIT8rVZFd8Sdb0PCROXAWg0dPXb/9jx4sjAWM1HPmbBMCRgQgp+BtS/Rng+OXdMs0RXpqRskCtVr8LnN4ZOpip34We9C0Vc7ttn07+m4xt+EgAtw2ezCjT10gAMuWOryy04L+rl3P2L8th8T1maUgCeunfLA1SFwbUp6GusjiRAMjP5M2aTLt2179OhpqfAzrRS3hbEoCEQVstLtDgX2coR3Joqx6s759AZaj+G3Uwt4HWgHvbmWvlVy6MRHkoyCSanwNc2Lm0DSQAOQ5EyMHfsNZKcnCOvFSdlUDk9WN/G5VqclXjwnyW6Lk0B+dTc+q1ciQgdeKpKyABmNoolTVCD/4GrVQL6tBlKvuB74Wuni9HaR+O8b0fY9r/+A82yB1j5vP+07nnESQIwpGABDHbKYoEoB21DrcpQvCvE5Vljw6p2NxxAb1ZTlDPgdfD/1f36sErh9i9fqRyE44cCWgCKa1VSADSkp2g3MIEf9P/SLZPwMDiAAT6umVvHeOzA+jKSBd2lAflSyMzLvyrJ1fudKEdKbeBIwEpA09UPAnARDIpLC9U8Fc//f3ylRQYKdIRAb3hvznzP5ijPBpsVyzaJL92hLfejKgkW1xqT4pt4RLBFHEnKpoEYCKZhJcXLfgbvqgmTyfMSHEOCWhwutyh5nTclGrJnZP/RjqjSYkzDyQaaVOK//JzQIq4cUWTAMSpJLysiMG/TliWnyRMSXGOCKyYJwu0KYc70pwkmrFp6Tq5L4mCkiyjWpVHkyzPg7L4OSDDQSIBSBm7sMFfpNo1XX6UMi/F5yRQCe3SP5Erc6KctNpKlzysK+i5iYWaOBKQ0XCTAKQIXeDgb1Q3nnmvbE6Rl6JzEjBPp9PD/2flVH0a1W7eVpEb0yi40zIX3S/PaTT8YafleLg9RwIyGDQSgJSQCx78jep3UqKl2JwF9A6Pl2gTpufcjOSqj+Qr562Vl5MrMNmS9Ov/3cmW6E1pHAlIeahIAFIAJvjrLSZr0pcCLUXmLNCrQ6ufypfm3IxEq9fn7F6daIHJF7Yy+SK9KZEjASkOFQlAwrgE/zroIws3yEDCtBTngMDb5supEtbtab931oA86ADthE14cEDu0RefmnCF8F+oHwnQz9Zl4Xc12x6SACToTfDfhVmSTyXISlEuCYR28l/k3qV/44e7d/jOhM48n2B8+zKa1ysi5apVPEo4UW4SgIQ4Cf6jkM9Mr8kXR+f4IxiB/qPlIP0UPjmYDok8+/KessqH/uhTAc1VCkU/qZYjAQnvrCQACYAS/HcjRpH85ekDsnX3Ev4KRqCr/tQ//ck8kKkkn7/obj9uV73wAXlRLwb8h0DkO+kGRwI60Ru3LQnAOJBWZwn+u8X0bOUHaofINbuX8FcoAn2H18/6vyiU/mg/Ir3JznU+9WfWS/KPmgRwcy29y7h+1nyGnwM633tJADowJPhbeNsrJbmoZ4VUraXMBCFQmSlLNPi8JojOaCc0gHyrZ4Nfd9k75VHZURu+BJP3GElAIm9FEoA2GQn+Npwe+v/ownXygL2UuWAESvXD/8F0R69k8PKkuqUb5Ls6CH8dzkB01BPOCeiIT6/p7XD7Qm5O8B837PphumTAzw/UcT1hNkag71h5qy4+IeYlXxc9WTtYbve18Q+ul7/WyMd9NoYHkHMCOtiRSQBaxCP4jwMryYoH18mHxy1lNiCBSq0+vuaDNohJD/8v9/mnql69LHDmS3KBHsXgbpvDeyRHAtp8Z5IAtABH8Lex9IN01bM1Obd3+Dpl+0XmghDQk//m6KfruUF0ZrgTO6dV5Xrf+2POB9DLbU/Xftzle18Saj9HAtqAJAFoEo3gb0OZ4P9cJO9fNiA77VeYC0mga6acp2O9Vyh90mRm9Zkb5ekQ+mMut50eyWkcCRgdzfqRAK4OGPWY8g8SgCmJRAj+NhLB3/YIeU6f+nd5SP2renry30RjUE8CzJEAfg4YISIJGJFo4t9gftdroq9trULwt9kI/rZHyHMr5svx+uS/ewPq4w8XrZfD9UNPd+Owplu6ZfZgWW7Rnp0UVs/a7o3mrvJHOt6fa7uEAmzIEYBJBpngb+MQ/G2P0OcqAd73P8Tgb/ZDjgQ0vBs5EtBA0riABKDRpL6E4G/DEPxtj9DnVh0j++uYLwmon1tnDMpXA+pPQ1dIAhpISAIaSOwFJAC2R32O4G+jEPxtjyLM1WpysfZzZjB9LcnXTntQXgimPxN0hCSgAYYkoIFk9wISgN0W9b8I/jYIwd/2KMKcjnlJPzUvC6mv5apcHVJ/JusLSUCDDklAA8nwAhKAMTAE/zEY+ifB3/Yoylz/XH3kb0neFFB/71+4QQYC6s+UXSEJaCAiCWgg4VbAoyQE/1GK+h8Ef9ujUHNl7vsfwniTBDSMIknAOBKOACgIwd/eKwj+tkeR5vqPlQP0TPlTA+rzi3rHvMLeN58koGFPJgkYQ1L4BIDgP2Zv0D8J/rZH4eZq9Rv/dIXSb92frzdBMJT+tNOPkSRAE7s729k+wG1IAnYNaqETAIK//dYm+NseRZu7a4F0yfDZ/6F0Parpg39C6Uwn/TBJwLRIziAJGFUkCVCKwiYABP/RN0L9D4K/7VHEuRc2y0I9+e/1ofRdg913etbLj0PpT6f9IAloECx8ElDIBIDgb78RCP62R1Hn9N6pHwyp7/rt/6qQ+pNEX0gCGhQLnQQULgEg+NtvAIK/7VHUuZXd8hbt+4KA+v8LfVrlNwLqT2JdIQlooCxsElCoBIDgb+/4BH/bo8hz+kHwAe2/HjUPY9JP9Gt5VPXEY0kS0GBTyCSgMAkAwd/e4Qn+tkeR5/qOk1l6+P/8gAyGoi65NqD+pNIVkoAG1sIlAYVIAAj+9o5O8Lc9ij5X3ilnq8F+wThEcsvitfJUMP1JsSMjSYBW8W8pVuNT0YVKAoJPAAj+9nuP4G97MFc/7h/UyX+lEif/tbJfmySgOk3O0G1IAobhCpMEBJ0AEPztjwGCv+3BnMjKeXK0OswLyOKxTeu54U2r49mzRraRBFhqhUgCgk0ACP7Wzswd/mwO5nYJ6AfAh4PCiOSqXtHbGTG1LEAS0EAWfBIQZAJA8Ld3ZL752x7MDQusPkr20X3jfQF57KjW5MsB9SfzrpAENJAHnQQElwAQ/O0dmOBvezC3W0AP+V6oc3vsXuL9X1/v2SjPeN+LnDtAEtAwAMEmAUElAAR/e8cl+NsezNkC+ql2mb3E87kaJ/8lNYIkAQ2SQSYBwSQABH97hyX42x7M2QL93XKiLjncXurxXCTfX7xB/sPjHjjXdJKAhiEJLgkIIgEg+Ns7KsHf9mAuRqAc1n3/9bFmV8b0kkUdCpAENAAGlQR4nwAQ/O0dlOBvezDXKNA3X16rl4Wc1fiKt0s2d02XG71tveMNN0nArBflNG3mbY43NavmBZMEeJ0AEPzt/Z3gb3swFy9QqdV/+58W/6p/S3W///KZ98pm/1ruT4tPeVR2aBKwWFtMEjA8bEEkAd4mAAR/+8OD4G97MBcv0LdUKvrIn0viX/VzaVcky/1suV+tJgloGC/vkwAvEwCCv70jEvxtD+YmFuh6on4o96CJ1/DrFb3t73fPGpAH/Wq1v60lCWgYO6+TAO8SAIK/vQMS/G0P5iYX0Kf+BXXf/5re+W/yHvNq0gIkAQ2i3iYBXiUABH97xyP42x7MTS7QP18O0ZP/3j35Wl69+szsF2WVVy0OpLEkAQ0D6WUS4E0CQPC3dziCv+3B3NQC+gn1AV3Lm/f8lD2K5DoTiKZcjxVSESAJaGD1Lgnw4sOA4G/vaAR/24O5qQVuP1Rm6OH/C6de05s1atWKXOtNawNt6EgSoPvWrYF2sdVueZUEOJ8AEPzt/Y/gb3sw15zA1n1lqR7+f01za3ux1jd77pcnvGhp4I00ScDsF2QJScDoQHuTBDidABD8R3eo+h8Ef9uDueYF9BMpqJP/NJnh5L/mhz/1NUkCGoi9SAKcTQAI/vYORfC3PZhrXmDlXDlS1z6++S2cX/Nn1UPkW863smANJAloGHDnkwAnEwCCv70jEfxtD+ZaFAjtvv8lubpnhVRbVGD1DARMElDbKov5OWAU2+kkwLkEgOA/uuPU/yD42x7MtSbQd7jM0Tf5Oa1t5fTag11l+YLTLSx443oekkGSAGsncDYJcCoBIPhbO43+zCmrnovk/csGZKf9CnMINCdQninn6360V3Nre7HWqjPXyq+8aGmBG0kS0DD4TiYBziQABH97hyH42x7MtSegt8o11/6HM5U4+c+XwSQJaBgp55KAUkMTc1hA8LfRCf62B3PtCazqlt/V32K/297WTm71w0Xr5XD90NK3CJMvAvoz1PTybOnX6GceKcyk+6/uw3+k+/Ln8sbI/QgAwd/eBQj+tgdz7QtEgZ38p++NKwn+7e8PeW3JkYAGeWeOBOSaABD87R2D4G97MNe+wC3d8ir9nryo/RKc23LrzB1yg3OtokFNCYwkAbryLU1tEP5KTiQBuSUABH97Dyf42x7MdSagZ41eoiXM7KwUd7aOIrnhtAflBXdaREtaFTBJQHWbLNHtSAKG8XJPAnJJAAj+9luH4G97MNeZgO5PJf2R8dLOSnFr60oky91qEa1pR4AkoEEt1yQg8wSA4G/vAAR/24O5zgVumifv0VIO7bwkZ0pYu3CDDDjTGhrSkQBJQANfbklApgkAwd8eeIK/7cFcMgI1Ceu+/3r4n/v+J7NrOFMKSUDDUOSSBGSWABD87QEn+NsezCUjcNN8OVBLOiWZ0hwopSQvzBBZ4UBLaELCAiQBDaCZJwGZJAAEf3ugCf62B3PJCdQiWaalVZIrMfeSvnD6gGzNvRU0IBUBkoAG1kyTgNQTAIK/PcAEf9uDueQElnfLNN2/LkquxNxLivROhtfk3goakKrAmCTgG6lW5E/hmSUBqSYABH97jyP42x7MJSuwf1kWaomvS7bUXEv7t0X3yyO5toDKMxHYlQQs1cpIAobFM0kCUksACP72+4bgb3swl7xAOQrr5D8pc/Jf8nuJuyWSBDSMTepJQCoJAMHfHkiCv+3BXPICK7vlLbqfvTP5knMr8RfPVuXW3Gqn4lwESAIa2FNNAhJPAAj+9gAS/G0P5lISGL7vv94qP5hpOY/BDmYsW+oISUADV2pJQKIJAMHfHjiCv+3BXDoCet//2Rr5z0+n9FxKHapOk+tyqZlKnRAgCWgYhlSSgMQSAIK/PWAEf9uDufQEBkXO0Qf/7JteDdmWrO+db/SskZ9nWyu1uSZAEtAwIoknAYkkAAR/e6AI/rYHcykLlOUDKdeQafF66R93/stU3N3KSAIaxibRJKDjBIDgbw8Qwd/2YC5dgdXz5Bj99t+dbi2Zlv7Yg+vkO5nWSGVOC5AENAxPYklARwkAwd8eGIK/7cFc+gJ6578Ppl9LdjXoff+v7BXRxxkwIbBbgCRgt8WuvxJJAtpOAAj+9oAQ/G0P5tIXWH2U7CMl6Um/psxq2FauyJcyq42KvBIYSQL0s/ZmrxqeXmM7TgLaSgAI/vaIEvxtD+ayEYi66rf9nZ1NbenXor/99+md/55LvyZq8FXAJAG1bdJDEjA6gh0lAS0nAAT/Ufj6HwR/24O5bAR0vzPX/JsH/wQz1Wqc/BfMYKbYEZKABty2k4CWEgCCvw1P8Lc9mMtOYOV8OUn3vzdnV2O6NWlfHlgyIGvTrYXSQxEgCWgYybaSgKYTAIK/DU7wtz2Yy1YgtPv+66fX57IVpDbfBUgCGkaw5SSgqQSA4G9DE/xtD+ayFejvlt/UGs/IttZUa3tpy065MdUaKDxIAZKAhmFtKQmYMgEg+NvABH/bg7nsBaKyXKa1Tsu+5tRq/PIFm2RLaqVTcNACI0mAdvKmoDvafOeaTgImTQA0+P+l1mn+Y1IBgj+7Qd4CfUulou/ui/NuR5L169mM1yRZHmUVT8AkAdVt8j7tOUnA8PA3lQRMmACsmidXaDm9w2Xx/wR/9gEXBMqPy+najoNcaEtCbbhn0Xr5QUJlUUyBBUgCGga/ngToo8IvbHhl14LYBEB/YzxHA95nJtqoaMsJ/kUbcXf7Wy6Fdec/vZiR+/67u7t517IxScA3vGt8Og0u6f01rtEv9O+OK95cS2xNfcfKWytVWacLg7nBiNXBFmcI/i2CsXpqAqu75Y21kjyiFcQm7qlVnF7Bz8x6UQ485VHZkV4VlFxEgb7DZXpllnxd+35WEfvf0OeSvCBDcvTijfLTsa9ZHyS3HyozNPgbNIK/IhD8x+4q/J23QK1c//ZvvWfzblOH9V9L8O9QkM1jBUaOBOhnOLcNNkL6uPBSl3zFnEM0Fsz6MNm6j3xMX3zb2BWK+jfBv6gj72a/TXKub+IL3GxdW62qlbvk2ra2ZCMEmhAwScBzkSzVVTkxUBH0QVvvqDwhl46lG00A+o+VA/T3gD8b+2JR/yb4F3Xk3e339n3qZzi/2t0WttayqCS3L/wP+c/WtmJtBFoTWDYgO83VARwJGHX7277jZL+RudEEoDRU//a/x8gLRf2X4F/UkXe73xowg3rsrx7N4OQ/t3e5YFrHkQBrKPev7JQ/HllSTwBuPUL21Q+YS0YWFvVfgn9RR97tfq86Rn5HA+bb3W5lS637We0N8u2WtmBlBDoQMEcCno3qj87m5wCRP9KTJOcYznoCsGOmnK1/1xd0YOz1pgR/r4cv6MZHNflQSB3U3yKv6lkh1ZD6RF/cFxhNAiJZ7X5rU23hfl2zZJGpYfgngEjOSbU6xwsn+Ds+QAVu3s0nyJ7afZOghzINTuuS60PpDP3wS6CeBIieT1PwJEBjXv0zpXxLt7xKh/B4v4YxudYS/JOzpKTkBao76mf+myQgjKkkK89cK78KozP0wkeB0SSg2FcH/L75GaA8WJF36CA23BDIx4Fttc0E/1bFWD9rgZrIsqzrTLO+Uo2T/9L0pezmBPg5QKbpzwDHlfVQyLHNkYW1FsE/rPEMsTcr5so7NDM/IpS+6Y3JH144IPeG0h/64bfA6JGAgv4coDHwBJMAvNnvYWy99QT/1s3YInuByvCd/7KvOKUa9THGn9OERt9+TAi4IVDwJODNZX03vtGNocimFQT/bJyppTMBc26O7qv1M3U7K8mZrV/ZVpavOtMaGoLALoHCJgEleZM+XEz2LcqeQPAvykj738/BslymvZjhf09Ge3DDeWvl5dE5/kDAIQGTBFQPqd82+F8cala6TYlkv7Ke/leIu/8R/NPdlyg9OYFec3luVE8Akis075IiuSbvJlA/ApMJmHtTVN8g5+k6RUkC9jQfNBobw5/0SMeMA18K5jGq4Q9YgXt4RLe8V7v/hoAI1iwekA0B9YeuBCrwwuNifhafHmj3xnerZG4EtGX80kDnT922j/TXn6oWaAfpViACod33X7j0L5A9M+huLO+WaXpTnK/rl8WQzr2ZbMxeMQnAs5OtEdhrp+ojj79mBjqwftGdQAT6uuW3tCvvCaQ7phvPvTxHVgTUH7oSoICJCfuX9NB/SRYG2L2JuvS8SQAem+jVEJeb7O5VJVnNkYAQR9f/PlVEPqC90H/CmPS+/9dfdLdsD6M39CJEgQJ+8x8ZxsfMSYA/Gpkr0L8cCSjQYPvSVb0153R9P17sS3ubaGdUrsi1TazHKgjkIlDQb/4j1o+UNUNfMzJXpH85ElCk0fajr7ue0PUbfrR26lbqe+yORffLI1OvyRoIZC9Q4G/+w9iR3FfeXpHv6dxQ9vxO1MiRACeGgUYYAT37+IMhSdT0sb8h9Ye+hCNQ8G/+ZiCr+jzu75V33ZzjrnCGtrWecCSgNS/WTkdg1Vz5bS3ZPJgrlOnp50RuC6Uz9CMcgcJ/8zdDWZJ7egbkJXMSoJRKcqP5t8ATlwgWePBd6LreJ/9D2g7NR8OYopIsN3dXC6M39CIUgZHgr++0Ip3t3zB8+kFTv9lRPQGYVpOv6xpFuhywAUQXkATEqbAsdQG97/9s/UA6N/WKsqtgqNYln8+uOmpCYGoBDvsPG2nwf7m0c/jS3HoCcPqAbNUPIH6vIwmY+l3EGokL7Cxr8I+CeibHTT1r5OeJQ1EgAm0KjHzzNz/5tllEMJvpuUZXL3xAXjQdqicA5g/9GeBT+s/z5u+CT5wYWPAdIOvu6xtyWdZ1pllfqcaXiTR9Kbs1Ab75W16bqyX5fyNLRhMAvVznOT0K8PGRF4r8r8kSuVlQkfeA7Pq+sluO1W//3dnVmHpNjy7cIIU9qTh1XSpoSYBv/jaXftH/q5518suRpaMJgFlQPViW6z/mskAm/TmA2wazG6QtUC6Hdemfen1OE2g9qMGEQL4CfPO3/fVN+cDQVvn02KVWAmAeh6g3IT1bV9AreJg4EsA+kKbArUfIvnojrqVp1pFx2duq0+TLGddJdQg0CPDNv4Fki6blZ/c8JINjX7ESAPPC4rXylAY+kwRYK47dqGB/cySgYAOeVXd3TK/f9nd2VvVlUM+/6Ml/nEeUATRVTCzAN/8Gm1opkouWDDTe9r8hATCbLlovd2gScKH+WWsoqoALOBJQwEFPuct6OK6k/7s85WqyLZ4ribL1prYGAb75N5Dox4x8bNFA/BM5YxMAU4QmAeZGAeYDiiTAgHCJ4LAC/5+IwOp58i4t6LBECnOgEPP74uJ1ss6BptCEggrwzT924Hs1lo+e9T9+jQkTALPi4vX1m3mQBOxW42ZBuy34qwMB/e0/qPv+67eMz3bAwaYIdCTAN/9Yvl6N4Z+IfWXXwkkTALMOSUADH0lAAwkLWhHo75bf1ONyp7WyjePrvrRlcPjWoo63k+YFKDAS/PU9Vejb+44b2imDv1l/ygTArEQSYBSsiSTA4mCmJYFS/cY/01raxuGV9b7/X7pgk2xxuIk0LVABgn/swDYV/M2WTSUAZkWSAKNgTSQBFgczzQjctUC69HKcS5tZ16N1rvGorTQ1EAGCf+xANh38zdZNJwBmZZIAo2BNJAEWBzNTCbywRc7QQ5Wvn2o9X17X3/7vXrJOHvKlvbQzDAGCf+w4thT8TQktJQBmA5IAo2BNJAEWBzOTCYR28l+NS/8mG25eS0GA4B+L2nLwN6W0nACYjUgCjII1kQRYHMzECazuljfq8pPiXvN02S+fq8lqT9tOsz0UIPjHDlpbwd+U1FYCYDYkCTAK1kQSYHEwM15AT5b7kC5r+z03vrzc50ty3bIB2Zl7O2hAIQQI/rHD3HbwN6V19GFEEtAwICQBDSQsMAJ9x8ksvVmOubtmKFNVhuS6UDpDP9wWIPjHjk9Hwd+U2FECYAogCTAK1kQSYHEwYwQqQ/I+/Wf/gDRuW7xRfhpQf+iKowIE/9iB6Tj4m1I7TgBMISQBRsGaSAIsDmb00r+w7vzHyX/s1BkIEPxjkRMJ/qbkRBIAUxBJgFGwJpIAi6O4M6vny1Ha+2MCEnh80zr514D6Q1ccFCD4xw5KYsHflJ5YAmAKIwkwCtZEEmBxFHNGn6Z1RWA9X97LQ8ICG1K3ukPwjx2PRIO/qSHRBMAUSBJgFKyJJMDiKNZMX7fsrYf/zw6o14OlQfliQP2hK44JEPxjByTx4G9qSTwBMIWSBBgFayIJsDiKM1MRuUB7u0cwPY5kxaJN8utg+kNHnBIg+McORyrB39SUSgJgCiYJMArWRBJgcRRkpiTmcdrhTJz8F85YOtYTgn/sgKQW/E1tqSUApnCSAKNgTSQBFkfYM6vmyzu1h28LpZelSB7W9/S9ofSHfrgjQPCPHYtUg7+pMdUEwFRAEmAUrIkkwOIIdya4+/6LfDbc0aJneQkQ/GPlUw/+ptbUEwBTCUmAUbAmkgCLI7yZvqPl1dqrswLq2Svbu+SGgPpDVxwQIPjHDkImwd/UnEkCYCoiCTAK1kQSYHGENVOpyGXaoxmh9EpvY/zV89bKy6H0h37kL0Dwjx2DzIK/qT2zBMBURhJgFKyJJMDiCGOm17yvonoCEEaHhntxTUidoS/5ChD8Y/0zDf6mBZkmAKZCkgCjYE0kARaH/zNHzpVTpCQH+9+T0R7cu2S9bByd4w8EOhAg+MfiZR78TSsyTwBMpSQBRsGaSAIsDr9nojL3/fd7BGl9WgIE/1jZXIK/aUkuCYCpmCTAKFgTSYDF4eeM3vnvt7Tlf+Bn62Nb/exLe0h/7CssRKAFAYJ/LFZuwd+0JrcEwFROEmAUrIkkwOLwb6Zr+Nu/3gAwmOkLF90t24PpDR3JRYDgH8uea/A3Lco1ATANIAkwCtZEEmBx+DPTd7hM17PlL/anxVO2NIoqct2Ua7ECApMIEPxjcXIP/qZVuScAphEkAUbBmkgCLA4/ZiozZYme/f8aP1rbVCu/vWSt/KSpNVkJgRgBgn8MiogTwd+0zIkEwDSEJMAoWBNJgMXhwUwprJP/VPwqD9RpoqMCBP/YgXEm+JvWOZMAmMaQBBgFayIJsDjcnek7Vt6qrTvB3Ra23LInq2+Q21reig0QUAGCf+xu4FTwNy10KgEwDSIJMArWRBJgcbg5UxmSK7RlJTdb10arIrmmZ4VU29iSTQouQPCP3QGcC/6mlc4lAKZRJAFGwZpIAiwOt2b05L85pZKc51arOmrNUFdNvtBRCWxcSAGCf+ywOxn8TUudTABMw0gCjII1kQRYHO7MVGbJuXr2/17utKjDlpRk9Zkb5ekOS2HzggkQ/GMH3Nngb1rrbAJgGkcSYBSsiSTA4nBm5nJnWpJEQ2qc/JcEY5HKIPjHjrbTwd+02IvfLPvnySXaVvMwEqcTFgOa0XTbrBdl8SmPyo6M6qOaCQRWHSPHRTW5b4KXfVz8o0Xr5a36waAHNZgQmFqA4B9r5HzwN632IqByJKBhB+NIQANJPgs0+H8wn5rTqTUqydUE/3RsQyyV4B87ql4Ef9NyLxIA01CSAKNgTSQBFkf2M/rtf3+tdWn2NadW47Zal3wltdIpOCgBgn/scHoT/E3rvUkATGNJAoyCNZEEWBwZz0RykdY4M+Na06zuaz1r5Pk0K6DsMAQI/rHj6FXwNz3wKgEwDSYJMArWRBJgcWQzoz+Ql6JILsumtmxq0ccYX51NTdTiswDBP3b0vAv+phfeJQCm0SQBRsGaSAIsjvRn+ufKyVrLYenXlFkN65bcL+szq42KvBQg+McOm5fB3/TEywTANJwkwChYE0mAxZHyzPBjf1OuJLvi9UZG3Pc/O24vayL4xw6bt8Hf9MbbBMA0niTAKFgTSYDFkc5M/7FygJ4pf2o6pedS6ovTavL1XGqmUi8ECP6xw+R18Dc98joBMB0gCTAK1kQSYHGkMFMTc+OfrhRKzqvIL54+IFvzqpx63RYg+MeOj/fB3/TK+wTAdIIkwChYE0mAxZHczF0LNPDX5OLkSsy/pGpFrs2/FbTARQGCf+yoBBH8Tc+CSABMR0gCjII1kQRYHMnMPP+KnKX3z3x9MqU5UEpJvtOzVh52oCU0wTEBgn/sgAQT/E3vgkkATGdIAoyCNZEEWBwJzJTCuvOf3vCXk/8S2C1CK4LgHzuiQQV/08OgEgDTIZIAo2BNJAEWR/szfXPlUA2YJ7ZfgnNb/vLZSG52rlU0KFcBgn8sf3DB3/QyuATAdIokwChYE0mAxdHeTKUsV+iWegFAIFNJrl02IDsD6Q3dSECA4B+LGGTwNz0NMgEwHSMJMArWRBJgcbQ203eczNItLmhtK6fXrsqQfN7pFtK4TAUI/rHcwQZ/09tgEwDTOZIAo2BNJAEWR/MzXUPyfl17v+a3cHtNvZXxrYs3yk/dbiWty0qA4B8rHXTwNz0OOgEwHSQJMArWRBJgcTQ3o/f9D+qxv3opIyf/NTf0wa9F8I8d4uCDv+l18AmA6SRJgFGwJpIAi2PymdXz5ShdY/7ka3n16uM/2CB3eNViGpuKAME/lrUQwd/0vBAJgOloPQmIZJn+WTPzTEIS0OROUIvkw02u6sVqevj/6l7eB16MVZqNJPjH6hYm+JveFyYBMJ1dPCDX6WVcJAEGY3giCRiRmODf1UfJPvqS+f0/lGlHeVC+FEpn6Ed7AgT/WLdCBX8jUKgEwHSYJMAoWBNJgMVhz1SnyYW6ZA97qddzfYs2ya+97gGN70iA4B/LV7jgbxQKlwCYTpMEGAVrIgmwOHbPlCK5bPec/3+Vypz85/8ott8Dgn+sXSGDv5EoZAJgOk4SYBSsiSTA4hDp767f9e/wcYt9nt206H5Z43MHaHv7AgT/WLvCBn+jUdgEwHSeJMAoWBNJwFiOcmCX/olcObZ7/F0cAYJ/7FgXOvgbkUInAAaAJMAoWBNJgHL0zZfX6gmjZ1kyfs9s3laRG/3uAq1vR4DgH6tW+OBvVAqfABgEkgCjYE2FTwIqNblURaZZKj7PRPKV89bKyz53gba3LkDwjzUj+O9iIQHYBUES0PBGKWwS0LdUKvrIH5MABDNV9Nr/YDpDR5oSIPjHMhH8x7CQAIzBIAkYgzH8ZyGTgMoTcqp2/6AGDX8XfO+sAXnQ3+bT8lYFCP6xYgT/cSwkAONASALGgUgh7xgY1n3/Iy79a9irA15A8I8dXIJ/DAsJQAwKSUADSmGOBPTPl0O09yc3CPi74NmX95RV/jaflrciQPCP1SL4x7JwEuAELJwYGANTjCRg+FbR4STGJfn8RXfL9pjxZFFgAgT/2AEl+MeyDC8M54Nukk62+xJHAhrkgk4C+g6X6Xry3x829NrfBVG1qs+/YApegOAfO8QE/1iW3QtJAHZbxP5FEtDAEmwS0DVbluq1/69p6LGnC/Spf9/q2SCPetp8mt2kAME/ForgH8tiLyQBsD1i50gCGliCTAKiKLA7/5U4+a9hzw1sAcE/dkAJ/rEsjQtJABpNYpeQBDSwBJUErJwv5p7/JzT00t8FP6sdLLf723xaPpUAwT9WiOAfyxK/kAQg3iV2aT0JEPmQvqhHV5lUwCQBX+oN4WTSSD4c2Igu71kh1cD6RHd2CfTqe+7VIl/Vc1YWgjIq8InF6+UTo3P8MaUACcCURPYKuoMt1/B/uS6t2a8Udu59R8yXz/rcez35b46+Ec7xuQ/j2r6zqypfHLeM2YAEjpgnn4pK0hNQlzrtign+vZ0WUrTtSQDaGHF+DhiHpr+d66Nzvb11bnmmnK+HdPYa1yufZ1eduVGe9rkDtH1igVXz5TJ9NbQjVhN3eOpXzGH/3qlXY43xAiQA40WanCcJGAdVks+snCtHjlvqxWypVP9A9aKtzTRS+3NVM+uxjn8C5lwVPVn1//vX8tRazGH/DmhJADrAIwmw8GaWyvL5Xs/OB+ifVz/x72irJ37P/HDhOvl3v7tA6+ME9ChVqRTJlfra7LjXC7iMb/4dDjoJQIeAJAEW4Dw9H+ACa4nrM6Fd+qf3/S9xkqrre11b7dND/xfqhr/X1sbhbcQ3/wTGVD8rmJIQqP8GXtITBD37BpxE38eV8VR1m7yx5yEZHLfcudlbuuVVgyV5Uhs207nGtdegrTN2yAGnPSgvtLc5W7kqcNcC6Xp+s/xEz/o/2NU2ZtguLvVLCJsjAAlBciRgFPIAPanu7NE5h//Q4H+xNi+U4K8HiOVrBH+Hd7gOmvb8K/qeIvgbQYJ/B/vR+E1JAMaLdDBfTwK4T4D+UOn+Gcrm91S9nNPbKxfidtNyVa6OW86yIAQ+EEQvOusEwb8zv4atSQAaSDpbwH0C6n7zVh0jh3Umme7Wq+fLH2gK8KZ0a8m09PsXbpCBTGukskwE+o+Wg7Si4zKpzN1KCP4pjA0JQAqo5kiAnq17hRatXzSLOemlSotd7jn3/Xd5dGjbWIGoXH8vFfl8LU74G7tDJPg3CUCCmGOLWjSgh2MLfMdATYAWjPVw6e+b5suB2p5TXWpTh215cXpN+josg80dFdCf1BY42rQsmmW++fdmUVER6yABSHHUi3wkQA99HN+3VCop8rZdtN4g39zK2cm2tdMptb7+9AHZ2s62bOOFQEgPqWoFnG/+rWi1sS4JQBtorWxS4CMBc6Y9Wf+m3QpX6uuaJ6jpkZmLU68ouwoifSiFufyUKUCBvvnyWu3WfgF2baou8c1/KqEEXicBSABxqiKKeiRgaKd7JwLuX64/Pe11U42ZR6/f2bNefuxRe2lqCwJdNTm0hdVDWZVv/hmNJAlARtBFPBKgtwZ+TUa8TVdTDuzOf/r7MPf9b3r0PVzRwfdQyop8808ZeGzxJABjNVL+u36fgEiWaTXFeJRwTfZMmbSl4ld2y1v09/J3trSR2yv/4pma3OJ2E2ldRwKRzOloe7825lK/jMeLBCBj8EL9HFCW6RnzTl5dWT6oK4R0OdU1ywZk5+Sd5lWfBfSbQpfP7W+h7Rz2bwErqVVJAJKSbKGcwvwcEMnmFlhSXbXvOJmlkf/8VCvJtvAhvY7humyrpLYcBLblUGfWVXLYP2vxXfWRAOQEX4QjARpwX86Jt6Hayk45R8/+37fhBV8XRHLL4rXylK/Np93NCZRr7iTRzbW45bX45t8yWXIbkAAkZ9lyScEfCYjk8ZZR0tsgqHupc/JfejuKSyWXS/JTl9qTcFv45p8waKvFkQC0Kpbw+gEfCYi2dcmjCXO1VdzKuXKkbjivrY3d3OixTevlTjebRquSFBicXn8PhXjSMCf8JbmjtFkWCUCbcEluFuiRgB+ft9aNnwD0csRzkxyv3MuK5KreolxJkjt2vg3oWSPmHIDQ7vPAYf98d6vR2kkARiny/SPASwTvyVfUqt3pBxNZLZ16ZnupIl+cejXWCEVAL1116b3UKSuH/TsVTHB7EoAEMTstKqSfA/Rpe9/q1COJ7fuPlQO0nDcmUZYjZfQtul+ec6QtNCMDAT0P4I4MqsmiCr75Z6HcQh0kAC1gZbFqCD8HmLP/N+/pRgJQqsrvZjFuWdWhP2dcnVVd1OOGwEt7yO3akhfdaE3breCbf9t06W1IApCebdsl+34kQA9Z3njR3bK9bYAEN9S2vCXB4vItKpLv67f/Nfk2gtqzFjDvJT2itiLrehOsj2/+CWImWRQJQJKaCZbl8ZGAalSRf06QorOiSvKmzgpwaGvu++/QYGTelE9qjT5eDcA3/8x3leYrJAFo3irzNX08EqCH//uWrJWfZI41QYWlSF41wUu+Ld7cNUO+5lujaW8yAksG5Ed6NOumZErLrBS++WdG3V5FJADtuWW2lWdHArZFVfmzzHCaqEg/NIN4mIr248tn3hv8XeGaGNHirlKJ5H9q7534aa2JUeCbfxNIea9CApD3CDRRvy9HAvR3yr9avNGxO5eVJIh9XDvByX9NvFdCXmXhgDymR7T+wYM+8s3fg0EyTQziw9ET646a6cGRgHtqh8g/dtTJNDaO5JU0is24zH9ftF5+kHGdVOegwDMif6PNcvlEUL75O7jfTNQkEoCJZBxc7uyRgEh+3lWVc3pWSNVBtpccbFNrTYpkeWsbsHaoAubxz9XhO1u6eC8Ivvl7tuORAHg2YK4dCTDX/Oud6U49c6M87SjlY462q9lmPTPrJelvdmXWC1+g5355QkryXu2pS0e3+Obv4a5HAuDhoJkjAfpo22Xa9Ly/cb+oJ6edotemf99VRn1q3o9cbVsz7dL2f/6UR2VHM+uyTnEEFq+TdZp8L9Ieb8m515EmI3++eL18Iud2UH0bAiQAbaC5sEk9CRB5v7Ylr+DwlH4AvUPf+Pe64DFhGyJZN+Fr7r9QGyrJNe43kxbmIaDnhdyhdwZ4l9ad188BQ/oF4FJNRv4uj/5TZ+cCJACdG+ZWggbflXok4HhtQKaHuTXw36n1HuPDiWkL18tD6vOr3Aaps4q/WT/c21kZbB2wwOIN8h+VkhytXbwv424+qVcknLhkvXwh43qpLkEBEoAEMfMoSo8EbCgPyTwNyuaNqAl5qtNL+qb/E30W/cla7y9SrSmhwtXFmHw7oeIyLSYqcfJfpuCeVnbWOnny2UgWaPN79b+07xOgu6V8oTpNjtLzkb6n9TF5LKBjyRSKwIr5cnx5+DrhExLu0w6NotdPq0jvmWv9+za9ap68W9v/rwmbpF3cz6pvkEMcvbIi7b5TfpsCfXPl0EpF/o+mveYR2El/wbtHf3L4U3PUoc3msZljAiQAjg1IEs1ZNV/eGdXkT/TknFO0vBkdlPmUnoT21cqQfMbhs/yn7F6vfhAe0a0/k5Tk4ClXdmWFSD6mR1nceaaCKy60oymBVXPlt6OyfERX7tH/9mlqo/iVtulRv1v0a/+nnT/fJ779LJ1EgARgEhzfX7r1CNl3cKacpd8GTtJvwO/U/hw4RZ+26esbdae4S4PlHZvWyXd7/XwASUM39SjAFWrw2YYX3FzwbHWbvKHnIacu83JTilZNKnD9Apm511a9ZLCqJwuW5ERd+TD9rzLpRiKP6+v3aPJ/59ay3HLeWnl5ivV52VMBEgBPB66dZvd1y97lirypVJPXaTDcQzP7PTSzf0H/fb5akf98+H75aW8gAX+8T/2D8JX6Q4oOGP+ac/N8+3duSEJp0O2Hyozt++hngH4Z0Ft3z9GjBPvqv5vLNdmsnwVPV7fLIySeoYz21P0gAZjaiDUCEVjVLUv1Q67P8e78bMugvPWCTblf3+04E81DAIFOBZI+SaTT9rA9AqkJ6FnLK7Tw21OroPOCa/rA9wsJ/p1DUgICCEwtQAIwtRFrBCRQGpSLtDtO3rZYD8f989L1cndA3HQFAQQcFiABcHhwaFryAos2ya/1a3aed1CcqFP3znxRPj7RiyxHAAEEkhYgAUhalPKcF1i6Qb6r37bfpw2tOtHYSL4/Y4eczj3/nRgNGoFAYQRIAAoz1HR0rIDexvhmvczpQl22c+zyrP/WKzAeni7yrtMelBeyrpv6EECg2AIkAMUe/0L3ftE6uUGTgDP0aEAu1znrpZjfHJou7zh9QJ4t9EDQeQQQyEWABCAXdip1RUCTgG9FVTlS27M2wzaZR6j+/Q/Wy2k9a+T5DOulKgQQQGBUQL/8MCGAgLlByra95X9oYP5z1ZiVosgGPQnxI+Y8hBTroGgEEEBgSgESgCmJWKFIAqvfLgdHQ/JxPTx/vvZ7WmJ9j+TnpbJ8XG+v/KXeQO+2mJgVBSGAQCYCJACZMFOJbwL9R8tB0iWX6tPPztOjAge32f6q3nnwm/qExuv2nSO3nXi3DLVZDpshgAACiQuQACROSoEhCeiRgFL/MdKtz084SYP5O/Ss/bdq/w7S/+IeqLJFlz+s69xXK8t908pyj4+PTw5p/OgLAghMLEACMLENryAQK2DOF3jl1fKq8qDsqU9anF6bIc9t0f8uulu2x27AQgQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAA6eprwAAAPiSURBVAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQACBIgj8F1Cx5oYEZQsEAAAAAElFTkSuQmCC" preserveAspectRatio="none" id="img10"></image><clipPath id="clip11"><rect x="0" y="0" width="371301" height="371301"/></clipPath><clipPath id="clip12"><path d="M2369.95 902.5C2319.29 902.5 2274.72 909.476 2236.23 923.428 2197.75 937.38 2164.46 955.873 2136.37 978.909 2108.28 1001.95 2084.82 1028.23 2065.98 1057.75 2047.14 1087.28 2032.04 1117.78 2020.68 1149.25 2009.31 1180.72 2001.27 1211.79 1996.56 1242.45 1991.85 1273.11 1989.5 1300.93 1989.5 1325.92 1989.5 1363.55 1994.86 1396.89 2005.58 1425.93 2016.29 1454.97 2031.8 1479.47 2052.1 1499.42 2072.39 1519.38 2097.08 1534.54 2126.14 1544.93 2155.21 1555.31 2188.25 1560.5 2225.27 1560.5 2262.62 1560.5 2296.8 1557.17 2327.81 1550.52 2358.83 1543.87 2390.09 1533.89 2421.59 1520.59L2490.27 1181.37 2263.76 1181.37 2244.27 1280.17 2351.93 1280.17 2318.8 1442.72C2308.41 1446.29 2298.1 1448.97 2287.87 1450.75 2277.64 1452.54 2265.05 1453.43 2250.12 1453.43 2226.08 1453.43 2205.54 1450.59 2188.49 1444.91 2171.44 1439.23 2157.48 1430.8 2146.6 1419.6 2135.72 1408.41 2127.76 1394.78 2122.73 1378.72 2117.7 1362.66 2115.18 1344.09 2115.18 1323 2115.18 1302.56 2117.05 1280.57 2120.78 1257.05 2124.52 1233.53 2130.44 1210.17 2138.56 1186.97 2146.68 1163.77 2157.07 1141.54 2169.74 1120.29 2182.41 1099.04 2197.75 1080.22 2215.77 1063.84 2233.8 1047.45 2254.58 1034.31 2278.13 1024.41 2301.67 1014.52 2328.22 1009.57 2357.77 1009.57 2387.33 1009.57 2415.66 1012.9 2442.78 1019.55 2469.89 1026.2 2494.17 1033.58 2515.6 1041.69L2538.5 931.214C2512.84 922.778 2485.56 915.883 2456.66 910.53 2427.76 905.177 2398.85 902.5 2369.95 902.5ZM756 866 3148 866 3148 1595 756 1595Z" fill-rule="evenodd" clip-rule="evenodd"/></clipPath><clipPath id="clip13"><rect x="0.916748" y="1.08319" width="617.517" height="724.834"/></clipPath></defs><g transform="translate(-756 -866)"><g><g clip-path="url(#clip3)"><g clip-path="url(#clip4)" filter="url(#fx0)" transform="translate(1396 868)"><g><g><path d="M406.807 34.4998C420.781 34.4998 434.837 35.068 448.972 36.2035 463.109 37.3389 476.921 38.799 490.407 40.5838 503.894 42.3677 516.649 44.4772 528.673 46.9104 540.697 49.3436 551.584 51.859 561.333 54.4545L540.372 160.065C530.948 156.821 520.305 153.819 508.444 151.061 496.581 148.304 484.232 145.87 471.396 143.761 458.559 141.652 445.642 140.03 432.642 138.894 419.644 137.759 407.295 137.191 395.595 137.191 372.522 137.191 352.699 139.219 336.125 143.275 319.551 147.33 305.902 152.846 295.178 159.822 284.453 166.797 276.572 175.071 271.535 184.642 266.497 194.214 263.979 204.354 263.979 215.061 263.979 227.39 267.717 238.178 275.191 247.425 282.666 256.673 292.496 265.189 304.683 272.977 316.87 280.763 330.681 288.226 346.117 295.364 361.554 302.502 377.396 310.046 393.646 317.995 409.894 325.944 425.737 334.624 441.173 344.033 456.61 353.442 470.422 364.392 482.608 376.884 494.795 389.376 504.625 403.814 512.099 420.199 519.574 436.584 523.311 455.483 523.311 476.898 523.311 509.669 516.324 539.356 502.35 565.962 488.376 592.567 468.471 615.279 442.636 634.098 416.8 652.916 385.521 667.354 348.798 677.413 312.076 687.471 271.129 692.5 225.957 692.5 189.884 692.5 156.412 690.229 125.539 685.687 94.6662 681.144 64.931 674.33 36.3331 665.246L59.2438 552.821C72.8928 557.688 87.3538 562.23 102.628 566.448 117.902 570.666 133.339 574.235 148.937 577.155 164.536 580.075 180.136 582.347 195.734 583.969 211.333 585.591 226.445 586.403 241.068 586.403 266.741 586.403 288.759 584.05 307.12 579.346 325.481 574.641 340.592 568.233 352.455 560.122 364.316 552.01 373.009 542.682 378.534 532.137 384.059 521.592 386.821 510.479 386.821 498.799 386.821 485.172 383.164 473.41 375.853 463.514 368.541 453.618 358.791 444.614 346.605 436.503 334.418 428.391 320.607 420.767 305.17 413.629 289.733 406.491 273.972 399.11 257.886 391.484 241.8 383.86 226.119 375.586 210.845 366.663 195.571 357.741 181.841 347.197 169.654 335.028 157.468 322.861 147.718 308.748 140.407 292.687 133.095 276.626 129.439 257.727 129.439 235.988 129.439 210.032 134.882 184.967 145.769 160.795 156.656 136.624 173.391 115.128 195.978 96.3092 218.563 77.4909 247.324 62.4847 282.259 51.2908 317.194 40.0968 358.71 34.4998 406.807 34.4998Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#D55816" fill-rule="evenodd" fill-opacity="1"/></g></g></g></g><path d="M1756.97 902.5C1770.95 902.5 1785 903.068 1799.14 904.203 1813.28 905.339 1827.09 906.799 1840.57 908.584 1854.06 910.368 1866.82 912.477 1878.84 914.91 1890.86 917.344 1901.75 919.859 1911.5 922.455L1890.54 1028.06C1881.11 1024.82 1870.47 1021.82 1858.61 1019.06 1846.75 1016.3 1834.4 1013.87 1821.56 1011.76 1808.73 1009.65 1795.81 1008.03 1782.81 1006.89 1769.81 1005.76 1757.46 1005.19 1745.76 1005.19 1722.69 1005.19 1702.87 1007.22 1686.29 1011.27 1669.72 1015.33 1656.07 1020.85 1645.34 1027.82 1634.62 1034.8 1626.74 1043.07 1621.7 1052.64 1616.66 1062.21 1614.15 1072.35 1614.15 1083.06 1614.15 1095.39 1617.88 1106.18 1625.36 1115.42 1632.83 1124.67 1642.66 1133.19 1654.85 1140.98 1667.04 1148.76 1680.85 1156.23 1696.28 1163.36 1711.72 1170.5 1727.56 1178.05 1743.81 1185.99 1760.06 1193.94 1775.9 1202.62 1791.34 1212.03 1806.78 1221.44 1820.59 1232.39 1832.77 1244.88 1844.96 1257.38 1854.79 1271.81 1862.27 1288.2 1869.74 1304.58 1873.48 1323.48 1873.48 1344.9 1873.48 1377.67 1866.49 1407.36 1852.52 1433.96 1838.54 1460.57 1818.64 1483.28 1792.8 1502.1 1766.97 1520.92 1735.69 1535.35 1698.96 1545.41 1662.24 1555.47 1621.3 1560.5 1576.12 1560.5 1540.05 1560.5 1506.58 1558.23 1475.71 1553.69 1444.83 1549.14 1415.1 1542.33 1386.5 1533.25L1409.41 1420.82C1423.06 1425.69 1437.52 1430.23 1452.79 1434.45 1468.07 1438.67 1483.51 1442.23 1499.1 1445.16 1514.7 1448.08 1530.3 1450.35 1545.9 1451.97 1561.5 1453.59 1576.61 1454.4 1591.24 1454.4 1616.91 1454.4 1638.93 1452.05 1657.29 1447.35 1675.65 1442.64 1690.76 1436.23 1702.62 1428.12 1714.48 1420.01 1723.18 1410.68 1728.7 1400.14 1734.23 1389.59 1736.99 1378.48 1736.99 1366.8 1736.99 1353.17 1733.33 1341.41 1726.02 1331.51 1718.71 1321.62 1708.96 1312.61 1696.77 1304.5 1684.58 1296.39 1670.77 1288.77 1655.34 1281.63 1639.9 1274.49 1624.14 1267.11 1608.05 1259.48 1591.97 1251.86 1576.29 1243.59 1561.01 1234.66 1545.74 1225.74 1532.01 1215.2 1519.82 1203.03 1507.63 1190.86 1497.89 1176.75 1490.57 1160.69 1483.26 1144.63 1479.61 1125.73 1479.61 1103.99 1479.61 1078.03 1485.05 1052.97 1495.94 1028.8 1506.82 1004.62 1523.56 983.128 1546.14 964.309 1568.73 945.491 1597.49 930.485 1632.43 919.291 1667.36 908.097 1708.88 902.5 1756.97 902.5Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#D55816" fill-rule="evenodd" fill-opacity="1"/><g clip-path="url(#clip5)" filter="url(#fx1)" transform="translate(757 888)"><g><g><path d="M482.943 195.5C482.943 283.043 379.597 370.586 276.25 370.586" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M195.25 316.5C262.575 316.5 329.901 417.048 329.901 517.595" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M30.2499 316C30.2499 270.437 67.1864 233.5 112.75 233.5 158.313 233.5 195.25 270.437 195.25 316 195.25 361.564 158.313 398.5 112.75 398.5 67.1864 398.5 30.2499 361.564 30.2499 316Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M400.25 113C400.25 67.4365 437.187 30.5 482.75 30.5 528.314 30.5 565.25 67.4365 565.25 113 565.25 158.563 528.314 195.5 482.75 195.5 437.187 195.5 400.25 158.563 400.25 113Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M237.25 539.334C237.25 527.275 247.025 517.5 259.084 517.5L400.417 517.5C412.475 517.5 422.25 527.275 422.25 539.334L422.25 626.666C422.25 638.725 412.475 648.5 400.417 648.5L259.084 648.5C247.025 648.5 237.25 638.725 237.25 626.666Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><g clip-path="url(#clip6)" transform="matrix(0.000360892 0 0 0.000360892 261.75 517)"><g clip-path="url(#clip8)" transform="matrix(1 0 0 1 0.0663341 0.216198)"><use width="100%" height="100%" xlink:href="#img7" opacity="1" transform="scale(725.197 725.197)"></use></g></g></g></g></g><path d="M1226.19 1083.5C1226.19 1171.04 1122.85 1258.59 1019.5 1258.59" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M938.5 1204.5C1005.83 1204.5 1073.15 1305.05 1073.15 1405.6" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M773.5 1204C773.5 1158.44 810.436 1121.5 856 1121.5 901.563 1121.5 938.5 1158.44 938.5 1204 938.5 1249.56 901.563 1286.5 856 1286.5 810.436 1286.5 773.5 1249.56 773.5 1204Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M1143.5 1001C1143.5 955.437 1180.44 918.5 1226 918.5 1271.56 918.5 1308.5 955.437 1308.5 1001 1308.5 1046.56 1271.56 1083.5 1226 1083.5 1180.44 1083.5 1143.5 1046.56 1143.5 1001Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M980.5 1427.33C980.5 1415.28 990.275 1405.5 1002.33 1405.5L1143.67 1405.5C1155.72 1405.5 1165.5 1415.28 1165.5 1427.33L1165.5 1514.67C1165.5 1526.72 1155.72 1536.5 1143.67 1536.5L1002.33 1536.5C990.275 1536.5 980.5 1526.72 980.5 1514.67Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><g clip-path="url(#clip9)" transform="matrix(0.000360892 0 0 0.000360892 1005 1405)"><g clip-path="url(#clip11)" transform="matrix(1 0 0 1 0.0684703 0.216198)"><use width="100%" height="100%" xlink:href="#img10" opacity="1" transform="scale(725.197 725.197)"></use></g></g><g clip-path="url(#clip12)"><g clip-path="url(#clip13)" filter="url(#fx2)" transform="translate(2001 868)"><g><g><path d="M414.785 34.4998C443.687 34.4998 472.591 37.1766 501.494 42.53 530.398 47.8835 557.677 54.7783 583.333 63.2143L560.437 173.692C539.004 165.581 514.728 158.2 487.611 151.548 460.494 144.896 432.159 141.571 402.606 141.571 373.053 141.571 346.504 146.519 322.96 156.415 299.415 166.311 278.631 179.451 260.607 195.836 242.582 212.221 227.238 231.04 214.573 252.292 201.907 273.544 191.515 295.769 183.396 318.968 175.278 342.167 169.35 365.528 165.616 389.051 161.881 412.575 160.014 434.557 160.014 454.997 160.014 476.087 162.531 494.662 167.564 510.723 172.598 526.783 180.555 540.41 191.434 551.604 202.313 562.798 216.278 571.234 233.327 576.913 250.377 582.59 270.918 585.429 294.95 585.429 309.888 585.429 322.473 584.537 332.703 582.752 342.932 580.968 353.244 578.291 363.636 574.722L396.761 412.169 289.104 412.169 308.589 313.371 535.107 313.371 466.421 652.592C434.92 665.894 403.662 675.872 372.647 682.523 341.633 689.174 307.453 692.5 270.106 692.5 233.083 692.5 200.039 687.309 170.974 676.926 141.908 666.544 117.227 651.375 96.9299 631.421 76.6329 611.467 61.1254 586.97 50.4087 557.931 39.6919 528.893 34.3335 495.554 34.3335 457.918 34.3335 432.934 36.6877 405.112 41.3971 374.451 46.1055 343.789 54.1431 312.723 65.5098 281.25 76.8756 249.778 91.9768 219.279 110.813 189.753 129.649 160.227 153.113 133.946 181.204 110.909 209.296 87.8731 242.582 69.3795 281.066 55.4276 319.55 41.4758 364.123 34.4998 414.785 34.4998Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#D55816" fill-rule="evenodd" fill-opacity="1"/></g></g></g></g><path d="M2369.95 902.5C2398.85 902.5 2427.76 905.177 2456.66 910.53 2485.56 915.883 2512.84 922.778 2538.5 931.214L2515.6 1041.69C2494.17 1033.58 2469.89 1026.2 2442.78 1019.55 2415.66 1012.9 2387.33 1009.57 2357.77 1009.57 2328.22 1009.57 2301.67 1014.52 2278.13 1024.41 2254.58 1034.31 2233.8 1047.45 2215.77 1063.84 2197.75 1080.22 2182.41 1099.04 2169.74 1120.29 2157.07 1141.54 2146.68 1163.77 2138.56 1186.97 2130.44 1210.17 2124.52 1233.53 2120.78 1257.05 2117.05 1280.57 2115.18 1302.56 2115.18 1323 2115.18 1344.09 2117.7 1362.66 2122.73 1378.72 2127.76 1394.78 2135.72 1408.41 2146.6 1419.6 2157.48 1430.8 2171.44 1439.23 2188.49 1444.91 2205.54 1450.59 2226.08 1453.43 2250.12 1453.43 2265.05 1453.43 2277.64 1452.54 2287.87 1450.75 2298.1 1448.97 2308.41 1446.29 2318.8 1442.72L2351.93 1280.17 2244.27 1280.17 2263.76 1181.37 2490.27 1181.37 2421.59 1520.59C2390.09 1533.89 2358.83 1543.87 2327.81 1550.52 2296.8 1557.17 2262.62 1560.5 2225.27 1560.5 2188.25 1560.5 2155.21 1555.31 2126.14 1544.93 2097.08 1534.54 2072.39 1519.38 2052.1 1499.42 2031.8 1479.47 2016.29 1454.97 2005.58 1425.93 1994.86 1396.89 1989.5 1363.55 1989.5 1325.92 1989.5 1300.93 1991.85 1273.11 1996.56 1242.45 2001.27 1211.79 2009.31 1180.72 2020.68 1149.25 2032.04 1117.78 2047.14 1087.28 2065.98 1057.75 2084.82 1028.23 2108.28 1001.95 2136.37 978.909 2164.46 955.873 2197.75 937.38 2236.23 923.428 2274.72 909.476 2319.29 902.5 2369.95 902.5Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#D55816" fill-rule="evenodd" fill-opacity="1"/><path d="M2837.7 900.5 2964.33 900.5 2853.41 1455.63 3132.5 1455.63 3111.73 1562.5 2705.5 1562.5 2837.7 900.5Z" stroke="#E4C0B8" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FFFFFF" fill-rule="evenodd" fill-opacity="1"/><path d="M2807.7 900.5 2934.33 900.5 2823.41 1455.63 3102.5 1455.63 3081.73 1562.5 2675.5 1562.5 2807.7 900.5Z" stroke="#D29886" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FFFFFF" fill-rule="evenodd" fill-opacity="1"/><path d="M2778.7 900.5 2905.33 900.5 2794.41 1455.63 3073.5 1455.63 3052.73 1562.5 2646.5 1562.5 2778.7 900.5Z" stroke="#BC644B" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FFFFFF" fill-rule="evenodd" fill-opacity="1"/><path d="M2748.7 900.5 2875.33 900.5 2764.41 1455.63 3043.5 1455.63 3022.73 1562.5 2616.5 1562.5 2748.7 900.5Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/></g></g></svg>
diff --git a/assets/logo_square.svg b/assets/logo_square.svg
new file mode 100644
index 000000000..a82fa0aeb
--- /dev/null
+++ b/assets/logo_square.svg
@@ -0,0 +1 @@
+<svg width="596" height="683" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" overflow="hidden"><defs><filter id="fx0" x="-10%" y="-10%" width="120%" height="120%" filterUnits="userSpaceOnUse" primitiveUnits="userSpaceOnUse"><feComponentTransfer color-interpolation-filters="sRGB"><feFuncR type="discrete" tableValues="0.835294 0.835294"/><feFuncG type="discrete" tableValues="0.345098 0.345098"/><feFuncB type="discrete" tableValues="0.086275 0.086275"/><feFuncA type="linear" slope="0.400000" intercept="0.000000"/></feComponentTransfer><feGaussianBlur stdDeviation="6.111111 6.111111"/></filter><clipPath id="clip1"><rect x="1.41663" y="1.66669" width="592.667" height="675.667"/></clipPath><clipPath id="clip2"><rect x="-2078.55" y="-2770.64" width="374073" height="374073"/></clipPath><image width="512" height="512" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAYAAAD0eNT6AAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEbAAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAACQAAAAAQAAAJAAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAAgCgAwAEAAAAAQAAAgAAAAAAGcBUEAAAAAlwSFlzAAAWJQAAFiUBSVIk8AAAQABJREFUeAHt3QmcXFWd6PF/VXVWwq6OozAgIjoiDKQTEBjHwCij7Nla2Yc1Ks6Mz+d7b5aPb9pZ3rzZfM8NCKC4IWMnnYAs6jAIjEImJJ1gEFRkGQVxYSdk63TVnf+pTnf6dN3uruUu55z7u34wfW/de5bvuVX/f926iwgTAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIBAwgKlhMujOAQQQCA1gUikdNsRss+OmbJvKZLZUUm2Vrtkx8Nr5Be9IrXUKqZgBAIUIAEIcFDpEgKhCNx8tLyuWpF3RZGcKCU5Uvv1Fv1vdkz/duiyJ/S/72uS8L1yTe5ctEF+GLMeixBAYJcACQC7AgIIOCVw+6EyY9u+crZEcoE27J36X7nNBm7S7W6oTpPretbI822WwWYIBCtAAhDs0NIxBPwSuH6BzNx7i3xYv+1/VFv+mwm2/hU9KnBtrSp/17NRnkmwXIpCwGsBEgCvh4/GIxCGQH+3nKqH+D+tvTkktR6V5AU9b+Djm9bLVb2cL5AaMwX7I0AC4M9Y0VIEghMw3/r33CJ/r4H5j7PqnH7o3V2pyrlnbpSns6qTehBwUYAEwMVRoU0IFECg/1g5QKpym3bVnNyX9fQrPbHgjIXr5f6sK6Y+BFwRaPfkGlfaTzsQQMBDgZXdejZ/Ve7VpucR/I3Yb+g1g3eu6JaTPeSjyQgkIsARgEQYKQQBBJoVuGm+HFiN6sH/wGa3SXG97bWanLx0g3w3xTooGgEnBUgAnBwWGoVAmAJ9x8l+lZ2yRnt3mDM9NCcHVuUE7hvgzIjQkIwE+AkgI2iqQaDoAnopXqkyKNepgzvB3wxKJPtGZVlxS3fsDYaKPmz0P2ABEoCAB5euIeCSgF7q9wG91G+hS20a05bDB8vyT2Pm+ROB4AX4CSD4IaaDCOQvsOpIeU00XX6sLdkn/9ZM2IKa3oTo+CUDsnbCNXgBgYAEugLqC12ZRMAcfr15vhygd0N7kx7ufL1+E5ulH3Z7l0rykmaBW/T1J6sij/Ssk19OUgwvIdCWQDRD/lYPtbsc/E2/yvp++JT++/a2OunBRnctkK5nXpaD9YP/t/RzYI7ef2GGfhYMVkvyin4ePCXb5Imeh2TQg67QxAQEOAKQAKKrRdQfpFKWJfrEtN/XNv6u/rdfE239pX4gmDOi/02fsraSe6g3IcYqkwrsut7/MV1p+qQrOvJiVJP3LNkg33akOR01o+9wmV6eLX+ghbxLE7Df0w/8w/XvaZMUOqSv/Vg/A/5d1/+OPkfhNv0M2DbJ+rzksQAJgMeDF9d0802/f568Vwf2I/r6SfpfJW69JpeZbwK362VSn+QyqSbFWK1BYOU8+aTuj/+t4QV3F9y1eH39veNuC6do2apj5DD9Rv8RDeLv01WbSfwnKnGzltFfLsunFq6TByZaieV+CpAA+Dlusa02NzXRQ5j/pIN6ROwKnS28Tz9QPsrvo50hFm1r/QY6pzJLDy2L7O1R3yP9Bnzo4nXyuEdtrje1b64cWinL/9UZc7Jl0id5f0uPjvwvPTqyyTcX2hsvQAIQ7+LV0puPld8YGpKr9EMr7TOs9eZpcv22inz0vLXysldINDYXgf75crl+g1yeS+WdVKoPDVo8IH/TSRFZbmsO9XfNkv+tRwA/pvXOSLFu8xPB1VsG5U8v2CRbUqyHojMQSDpDzKDJVDFWQD9g3zVU1UNz6Qd/U63ZXy6ZVZWB1XOle2w7+BuBWIFIlsUud31hSd7rehNH2rf67XJwZbbco8H/L3RZmsHfVGlOHP/wnOn6GTBfjjILmPwV4AiAv2MnGvz/UL9dXatdMG/KrKcdegbx+YsGZEXWFVOfHwL9c/Vs+nL9rn9+NNhu5ZAe6drf9SNd+hnwe/oZcLM2PY8rLLZpADl70fp6/bYec14IcATAi2FqbKSeWPUX+sa/Xl/JI/ibBs3Qqwtu1HZc3Ng6liCgAmX5oMcOXTN2yu+43H492fcM/QwwVyvkEfwNzSw96tCv7bjEZSfaNrEACcDENs6+om+4Xs28Xfh9sqLtuFbbs8RZLBqWi8CtR8i+WrHX+0Wlok8sdHRaMU8WaNO+rv/NzLmJ5iqja1fNkytybgfVtyFAAtAGWp6bmOCv9f9lnm0YV7fZh76ij3c9dtxyZgsssGN6/cjQbJ8J9NvtG11svwbbt2nUNYf98w7+Izz6a6B8ZpW51TOTVwIkAB4Nl4PBf0Rvpl5+uMI86W1kAf8WV0CDQUn/d7n3AvqQINf6cP0Cmam+N+h/eznWtpL+JHglRwIcG5UpmkMCMAWQKy87HPxHiA7UJ719emSGf4srsHqe3nXOtSf+tTcczt27YK8t8kntypHtdSf1rTgSkDpxshWQACTrmUppHgT/4X6X5Nxdv02m4kChfgjot9MP+dHSKVvp1D3x9WmKc/WkP9cvq+RIwJS7lTsrkAC4MxaxLfEm+O9qve5Qn6wfAo7tDQtDF9CfgV6vfTwtkH66dQ/8knxGXX34zOZIgCdvAB92Jk8ok2+mb8F/l8DRN82T9ySvQYk+CHQN1n/7z+vS1ESJNIr9MtECOyhMv/2fqJsf30ERWW/KkYCsxduojwSgDbQsNvE0+Ndp9AjAf8/CiDrcEjCPmtWxD+aacL3vtTvPAih5+Z7iSIBbb9GG1pAANJDkv8Dn4G/0NAic1H+0HJS/JC3IUuCFLXpjmlL9J4Asq02tLo1eP06t8BYK7psvr9XVfT2qxpGAFsY661VJALIWn6I+34P/ru6V9JdKr28CM8Uw8XKcQBTMyX+md9uqO2RDXDezXqYf0uaRvuaGO75OHAlwdORIABwamECCf11U7wtwskO0NCVlgfrz5/XIT8rVZFd8Sdb0PCROXAWg0dPXb/9jx4sjAWM1HPmbBMCRgQgp+BtS/Rng+OXdMs0RXpqRskCtVr8LnN4ZOpip34We9C0Vc7ttn07+m4xt+EgAtw2ezCjT10gAMuWOryy04L+rl3P2L8th8T1maUgCeunfLA1SFwbUp6GusjiRAMjP5M2aTLt2179OhpqfAzrRS3hbEoCEQVstLtDgX2coR3Joqx6s759AZaj+G3Uwt4HWgHvbmWvlVy6MRHkoyCSanwNc2Lm0DSQAOQ5EyMHfsNZKcnCOvFSdlUDk9WN/G5VqclXjwnyW6Lk0B+dTc+q1ciQgdeKpKyABmNoolTVCD/4GrVQL6tBlKvuB74Wuni9HaR+O8b0fY9r/+A82yB1j5vP+07nnESQIwpGABDHbKYoEoB21DrcpQvCvE5Vljw6p2NxxAb1ZTlDPgdfD/1f36sErh9i9fqRyE44cCWgCKa1VSADSkp2g3MIEf9P/SLZPwMDiAAT6umVvHeOzA+jKSBd2lAflSyMzLvyrJ1fudKEdKbeBIwEpA09UPAnARDIpLC9U8Fc//f3ylRQYKdIRAb3hvznzP5ijPBpsVyzaJL92hLfejKgkW1xqT4pt4RLBFHEnKpoEYCKZhJcXLfgbvqgmTyfMSHEOCWhwutyh5nTclGrJnZP/RjqjSYkzDyQaaVOK//JzQIq4cUWTAMSpJLysiMG/TliWnyRMSXGOCKyYJwu0KYc70pwkmrFp6Tq5L4mCkiyjWpVHkyzPg7L4OSDDQSIBSBm7sMFfpNo1XX6UMi/F5yRQCe3SP5Erc6KctNpKlzysK+i5iYWaOBKQ0XCTAKQIXeDgb1Q3nnmvbE6Rl6JzEjBPp9PD/2flVH0a1W7eVpEb0yi40zIX3S/PaTT8YafleLg9RwIyGDQSgJSQCx78jep3UqKl2JwF9A6Pl2gTpufcjOSqj+Qr562Vl5MrMNmS9Ov/3cmW6E1pHAlIeahIAFIAJvjrLSZr0pcCLUXmLNCrQ6ufypfm3IxEq9fn7F6daIHJF7Yy+SK9KZEjASkOFQlAwrgE/zroIws3yEDCtBTngMDb5supEtbtab931oA86ADthE14cEDu0RefmnCF8F+oHwnQz9Zl4Xc12x6SACToTfDfhVmSTyXISlEuCYR28l/k3qV/44e7d/jOhM48n2B8+zKa1ysi5apVPEo4UW4SgIQ4Cf6jkM9Mr8kXR+f4IxiB/qPlIP0UPjmYDok8+/KessqH/uhTAc1VCkU/qZYjAQnvrCQACYAS/HcjRpH85ekDsnX3Ev4KRqCr/tQ//ck8kKkkn7/obj9uV73wAXlRLwb8h0DkO+kGRwI60Ru3LQnAOJBWZwn+u8X0bOUHaofINbuX8FcoAn2H18/6vyiU/mg/Ir3JznU+9WfWS/KPmgRwcy29y7h+1nyGnwM633tJADowJPhbeNsrJbmoZ4VUraXMBCFQmSlLNPi8JojOaCc0gHyrZ4Nfd9k75VHZURu+BJP3GElAIm9FEoA2GQn+Npwe+v/ownXygL2UuWAESvXD/8F0R69k8PKkuqUb5Ls6CH8dzkB01BPOCeiIT6/p7XD7Qm5O8B837PphumTAzw/UcT1hNkag71h5qy4+IeYlXxc9WTtYbve18Q+ul7/WyMd9NoYHkHMCOtiRSQBaxCP4jwMryYoH18mHxy1lNiCBSq0+vuaDNohJD/8v9/mnql69LHDmS3KBHsXgbpvDeyRHAtp8Z5IAtABH8Lex9IN01bM1Obd3+Dpl+0XmghDQk//m6KfruUF0ZrgTO6dV5Xrf+2POB9DLbU/Xftzle18Saj9HAtqAJAFoEo3gb0OZ4P9cJO9fNiA77VeYC0mga6acp2O9Vyh90mRm9Zkb5ekQ+mMut50eyWkcCRgdzfqRAK4OGPWY8g8SgCmJRAj+NhLB3/YIeU6f+nd5SP2renry30RjUE8CzJEAfg4YISIJGJFo4t9gftdroq9trULwt9kI/rZHyHMr5svx+uS/ewPq4w8XrZfD9UNPd+Owplu6ZfZgWW7Rnp0UVs/a7o3mrvJHOt6fa7uEAmzIEYBJBpngb+MQ/G2P0OcqAd73P8Tgb/ZDjgQ0vBs5EtBA0riABKDRpL6E4G/DEPxtj9DnVh0j++uYLwmon1tnDMpXA+pPQ1dIAhpISAIaSOwFJAC2R32O4G+jEPxtjyLM1WpysfZzZjB9LcnXTntQXgimPxN0hCSgAYYkoIFk9wISgN0W9b8I/jYIwd/2KMKcjnlJPzUvC6mv5apcHVJ/JusLSUCDDklAA8nwAhKAMTAE/zEY+ifB3/Yoylz/XH3kb0neFFB/71+4QQYC6s+UXSEJaCAiCWgg4VbAoyQE/1GK+h8Ef9ujUHNl7vsfwniTBDSMIknAOBKOACgIwd/eKwj+tkeR5vqPlQP0TPlTA+rzi3rHvMLeN58koGFPJgkYQ1L4BIDgP2Zv0D8J/rZH4eZq9Rv/dIXSb92frzdBMJT+tNOPkSRAE7s729k+wG1IAnYNaqETAIK//dYm+NseRZu7a4F0yfDZ/6F0Parpg39C6Uwn/TBJwLRIziAJGFUkCVCKwiYABP/RN0L9D4K/7VHEuRc2y0I9+e/1ofRdg913etbLj0PpT6f9IAloECx8ElDIBIDgb78RCP62R1Hn9N6pHwyp7/rt/6qQ+pNEX0gCGhQLnQQULgEg+NtvAIK/7VHUuZXd8hbt+4KA+v8LfVrlNwLqT2JdIQlooCxsElCoBIDgb+/4BH/bo8hz+kHwAe2/HjUPY9JP9Gt5VPXEY0kS0GBTyCSgMAkAwd/e4Qn+tkeR5/qOk1l6+P/8gAyGoi65NqD+pNIVkoAG1sIlAYVIAAj+9o5O8Lc9ij5X3ilnq8F+wThEcsvitfJUMP1JsSMjSYBW8W8pVuNT0YVKAoJPAAj+9nuP4G97MFc/7h/UyX+lEif/tbJfmySgOk3O0G1IAobhCpMEBJ0AEPztjwGCv+3BnMjKeXK0OswLyOKxTeu54U2r49mzRraRBFhqhUgCgk0ACP7Wzswd/mwO5nYJ6AfAh4PCiOSqXtHbGTG1LEAS0EAWfBIQZAJA8Ld3ZL752x7MDQusPkr20X3jfQF57KjW5MsB9SfzrpAENJAHnQQElwAQ/O0dmOBvezC3W0AP+V6oc3vsXuL9X1/v2SjPeN+LnDtAEtAwAMEmAUElAAR/e8cl+NsezNkC+ql2mb3E87kaJ/8lNYIkAQ2SQSYBwSQABH97hyX42x7M2QL93XKiLjncXurxXCTfX7xB/sPjHjjXdJKAhiEJLgkIIgEg+Ns7KsHf9mAuRqAc1n3/9bFmV8b0kkUdCpAENAAGlQR4nwAQ/O0dlOBvezDXKNA3X16rl4Wc1fiKt0s2d02XG71tveMNN0nArBflNG3mbY43NavmBZMEeJ0AEPzt/Z3gb3swFy9QqdV/+58W/6p/S3W///KZ98pm/1ruT4tPeVR2aBKwWFtMEjA8bEEkAd4mAAR/+8OD4G97MBcv0LdUKvrIn0viX/VzaVcky/1suV+tJgloGC/vkwAvEwCCv70jEvxtD+YmFuh6on4o96CJ1/DrFb3t73fPGpAH/Wq1v60lCWgYO6+TAO8SAIK/vQMS/G0P5iYX0Kf+BXXf/5re+W/yHvNq0gIkAQ2i3iYBXiUABH97xyP42x7MTS7QP18O0ZP/3j35Wl69+szsF2WVVy0OpLEkAQ0D6WUS4E0CQPC3dziCv+3B3NQC+gn1AV3Lm/f8lD2K5DoTiKZcjxVSESAJaGD1Lgnw4sOA4G/vaAR/24O5qQVuP1Rm6OH/C6de05s1atWKXOtNawNt6EgSoPvWrYF2sdVueZUEOJ8AEPzt/Y/gb3sw15zA1n1lqR7+f01za3ux1jd77pcnvGhp4I00ScDsF2QJScDoQHuTBDidABD8R3eo+h8Ef9uDueYF9BMpqJP/NJnh5L/mhz/1NUkCGoi9SAKcTQAI/vYORfC3PZhrXmDlXDlS1z6++S2cX/Nn1UPkW863smANJAloGHDnkwAnEwCCv70jEfxtD+ZaFAjtvv8lubpnhVRbVGD1DARMElDbKov5OWAU2+kkwLkEgOA/uuPU/yD42x7MtSbQd7jM0Tf5Oa1t5fTag11l+YLTLSx443oekkGSAGsncDYJcCoBIPhbO43+zCmrnovk/csGZKf9CnMINCdQninn6360V3Nre7HWqjPXyq+8aGmBG0kS0DD4TiYBziQABH97hyH42x7MtSegt8o11/6HM5U4+c+XwSQJaBgp55KAUkMTc1hA8LfRCf62B3PtCazqlt/V32K/297WTm71w0Xr5XD90NK3CJMvAvoz1PTybOnX6GceKcyk+6/uw3+k+/Ln8sbI/QgAwd/eBQj+tgdz7QtEgZ38p++NKwn+7e8PeW3JkYAGeWeOBOSaABD87R2D4G97MNe+wC3d8ir9nryo/RKc23LrzB1yg3OtokFNCYwkAbryLU1tEP5KTiQBuSUABH97Dyf42x7MdSagZ41eoiXM7KwUd7aOIrnhtAflBXdaREtaFTBJQHWbLNHtSAKG8XJPAnJJAAj+9luH4G97MNeZgO5PJf2R8dLOSnFr60oky91qEa1pR4AkoEEt1yQg8wSA4G/vAAR/24O5zgVumifv0VIO7bwkZ0pYu3CDDDjTGhrSkQBJQANfbklApgkAwd8eeIK/7cFcMgI1Ceu+/3r4n/v+J7NrOFMKSUDDUOSSBGSWABD87QEn+NsezCUjcNN8OVBLOiWZ0hwopSQvzBBZ4UBLaELCAiQBDaCZJwGZJAAEf3ugCf62B3PJCdQiWaalVZIrMfeSvnD6gGzNvRU0IBUBkoAG1kyTgNQTAIK/PcAEf9uDueQElnfLNN2/LkquxNxLivROhtfk3goakKrAmCTgG6lW5E/hmSUBqSYABH97jyP42x7MJSuwf1kWaomvS7bUXEv7t0X3yyO5toDKMxHYlQQs1cpIAobFM0kCUksACP72+4bgb3swl7xAOQrr5D8pc/Jf8nuJuyWSBDSMTepJQCoJAMHfHkiCv+3BXPICK7vlLbqfvTP5knMr8RfPVuXW3Gqn4lwESAIa2FNNAhJPAAj+9gAS/G0P5lISGL7vv94qP5hpOY/BDmYsW+oISUADV2pJQKIJAMHfHjiCv+3BXDoCet//2Rr5z0+n9FxKHapOk+tyqZlKnRAgCWgYhlSSgMQSAIK/PWAEf9uDufQEBkXO0Qf/7JteDdmWrO+db/SskZ9nWyu1uSZAEtAwIoknAYkkAAR/e6AI/rYHcykLlOUDKdeQafF66R93/stU3N3KSAIaxibRJKDjBIDgbw8Qwd/2YC5dgdXz5Bj99t+dbi2Zlv7Yg+vkO5nWSGVOC5AENAxPYklARwkAwd8eGIK/7cFc+gJ6578Ppl9LdjXoff+v7BXRxxkwIbBbgCRgt8WuvxJJAtpOAAj+9oAQ/G0P5tIXWH2U7CMl6Um/psxq2FauyJcyq42KvBIYSQL0s/ZmrxqeXmM7TgLaSgAI/vaIEvxtD+ayEYi66rf9nZ1NbenXor/99+md/55LvyZq8FXAJAG1bdJDEjA6gh0lAS0nAAT/Ufj6HwR/24O5bAR0vzPX/JsH/wQz1Wqc/BfMYKbYEZKABty2k4CWEgCCvw1P8Lc9mMtOYOV8OUn3vzdnV2O6NWlfHlgyIGvTrYXSQxEgCWgYybaSgKYTAIK/DU7wtz2Yy1YgtPv+66fX57IVpDbfBUgCGkaw5SSgqQSA4G9DE/xtD+ayFejvlt/UGs/IttZUa3tpy065MdUaKDxIAZKAhmFtKQmYMgEg+NvABH/bg7nsBaKyXKa1Tsu+5tRq/PIFm2RLaqVTcNACI0mAdvKmoDvafOeaTgImTQA0+P+l1mn+Y1IBgj+7Qd4CfUulou/ui/NuR5L169mM1yRZHmUVT8AkAdVt8j7tOUnA8PA3lQRMmACsmidXaDm9w2Xx/wR/9gEXBMqPy+najoNcaEtCbbhn0Xr5QUJlUUyBBUgCGga/ngToo8IvbHhl14LYBEB/YzxHA95nJtqoaMsJ/kUbcXf7Wy6Fdec/vZiR+/67u7t517IxScA3vGt8Og0u6f01rtEv9O+OK95cS2xNfcfKWytVWacLg7nBiNXBFmcI/i2CsXpqAqu75Y21kjyiFcQm7qlVnF7Bz8x6UQ485VHZkV4VlFxEgb7DZXpllnxd+35WEfvf0OeSvCBDcvTijfLTsa9ZHyS3HyozNPgbNIK/IhD8x+4q/J23QK1c//ZvvWfzblOH9V9L8O9QkM1jBUaOBOhnOLcNNkL6uPBSl3zFnEM0Fsz6MNm6j3xMX3zb2BWK+jfBv6gj72a/TXKub+IL3GxdW62qlbvk2ra2ZCMEmhAwScBzkSzVVTkxUBH0QVvvqDwhl46lG00A+o+VA/T3gD8b+2JR/yb4F3Xk3e339n3qZzi/2t0WttayqCS3L/wP+c/WtmJtBFoTWDYgO83VARwJGHX7277jZL+RudEEoDRU//a/x8gLRf2X4F/UkXe73xowg3rsrx7N4OQ/t3e5YFrHkQBrKPev7JQ/HllSTwBuPUL21Q+YS0YWFvVfgn9RR97tfq86Rn5HA+bb3W5lS637We0N8u2WtmBlBDoQMEcCno3qj87m5wCRP9KTJOcYznoCsGOmnK1/1xd0YOz1pgR/r4cv6MZHNflQSB3U3yKv6lkh1ZD6RF/cFxhNAiJZ7X5rU23hfl2zZJGpYfgngEjOSbU6xwsn+Ds+QAVu3s0nyJ7afZOghzINTuuS60PpDP3wS6CeBIieT1PwJEBjXv0zpXxLt7xKh/B4v4YxudYS/JOzpKTkBao76mf+myQgjKkkK89cK78KozP0wkeB0SSg2FcH/L75GaA8WJF36CA23BDIx4Fttc0E/1bFWD9rgZrIsqzrTLO+Uo2T/9L0pezmBPg5QKbpzwDHlfVQyLHNkYW1FsE/rPEMsTcr5so7NDM/IpS+6Y3JH144IPeG0h/64bfA6JGAgv4coDHwBJMAvNnvYWy99QT/1s3YInuByvCd/7KvOKUa9THGn9OERt9+TAi4IVDwJODNZX03vtGNocimFQT/bJyppTMBc26O7qv1M3U7K8mZrV/ZVpavOtMaGoLALoHCJgEleZM+XEz2LcqeQPAvykj738/BslymvZjhf09Ge3DDeWvl5dE5/kDAIQGTBFQPqd82+F8cala6TYlkv7Ke/leIu/8R/NPdlyg9OYFec3luVE8Akis075IiuSbvJlA/ApMJmHtTVN8g5+k6RUkC9jQfNBobw5/0SMeMA18K5jGq4Q9YgXt4RLe8V7v/hoAI1iwekA0B9YeuBCrwwuNifhafHmj3xnerZG4EtGX80kDnT922j/TXn6oWaAfpViACod33X7j0L5A9M+huLO+WaXpTnK/rl8WQzr2ZbMxeMQnAs5OtEdhrp+ojj79mBjqwftGdQAT6uuW3tCvvCaQ7phvPvTxHVgTUH7oSoICJCfuX9NB/SRYG2L2JuvS8SQAem+jVEJeb7O5VJVnNkYAQR9f/PlVEPqC90H/CmPS+/9dfdLdsD6M39CJEgQJ+8x8ZxsfMSYA/Gpkr0L8cCSjQYPvSVb0153R9P17sS3ubaGdUrsi1TazHKgjkIlDQb/4j1o+UNUNfMzJXpH85ElCk0fajr7ue0PUbfrR26lbqe+yORffLI1OvyRoIZC9Q4G/+w9iR3FfeXpHv6dxQ9vxO1MiRACeGgUYYAT37+IMhSdT0sb8h9Ye+hCNQ8G/+ZiCr+jzu75V33ZzjrnCGtrWecCSgNS/WTkdg1Vz5bS3ZPJgrlOnp50RuC6Uz9CMcgcJ/8zdDWZJ7egbkJXMSoJRKcqP5t8ATlwgWePBd6LreJ/9D2g7NR8OYopIsN3dXC6M39CIUgZHgr++0Ip3t3zB8+kFTv9lRPQGYVpOv6xpFuhywAUQXkATEqbAsdQG97/9s/UA6N/WKsqtgqNYln8+uOmpCYGoBDvsPG2nwf7m0c/jS3HoCcPqAbNUPIH6vIwmY+l3EGokL7Cxr8I+CeibHTT1r5OeJQ1EgAm0KjHzzNz/5tllEMJvpuUZXL3xAXjQdqicA5g/9GeBT+s/z5u+CT5wYWPAdIOvu6xtyWdZ1pllfqcaXiTR9Kbs1Ab75W16bqyX5fyNLRhMAvVznOT0K8PGRF4r8r8kSuVlQkfeA7Pq+sluO1W//3dnVmHpNjy7cIIU9qTh1XSpoSYBv/jaXftH/q5518suRpaMJgFlQPViW6z/mskAm/TmA2wazG6QtUC6Hdemfen1OE2g9qMGEQL4CfPO3/fVN+cDQVvn02KVWAmAeh6g3IT1bV9AreJg4EsA+kKbArUfIvnojrqVp1pFx2duq0+TLGddJdQg0CPDNv4Fki6blZ/c8JINjX7ESAPPC4rXylAY+kwRYK47dqGB/cySgYAOeVXd3TK/f9nd2VvVlUM+/6Ml/nEeUATRVTCzAN/8Gm1opkouWDDTe9r8hATCbLlovd2gScKH+WWsoqoALOBJQwEFPuct6OK6k/7s85WqyLZ4ribL1prYGAb75N5Dox4x8bNFA/BM5YxMAU4QmAeZGAeYDiiTAgHCJ4LAC/5+IwOp58i4t6LBECnOgEPP74uJ1ss6BptCEggrwzT924Hs1lo+e9T9+jQkTALPi4vX1m3mQBOxW42ZBuy34qwMB/e0/qPv+67eMz3bAwaYIdCTAN/9Yvl6N4Z+IfWXXwkkTALMOSUADH0lAAwkLWhHo75bf1ONyp7WyjePrvrRlcPjWoo63k+YFKDAS/PU9Vejb+44b2imDv1l/ygTArEQSYBSsiSTA4mCmJYFS/cY/01raxuGV9b7/X7pgk2xxuIk0LVABgn/swDYV/M2WTSUAZkWSAKNgTSQBFgczzQjctUC69HKcS5tZ16N1rvGorTQ1EAGCf+xANh38zdZNJwBmZZIAo2BNJAEWBzNTCbywRc7QQ5Wvn2o9X17X3/7vXrJOHvKlvbQzDAGCf+w4thT8TQktJQBmA5IAo2BNJAEWBzOTCYR28l+NS/8mG25eS0GA4B+L2nLwN6W0nACYjUgCjII1kQRYHMzECazuljfq8pPiXvN02S+fq8lqT9tOsz0UIPjHDlpbwd+U1FYCYDYkCTAK1kQSYHEwM15AT5b7kC5r+z03vrzc50ty3bIB2Zl7O2hAIQQI/rHD3HbwN6V19GFEEtAwICQBDSQsMAJ9x8ksvVmOubtmKFNVhuS6UDpDP9wWIPjHjk9Hwd+U2FECYAogCTAK1kQSYHEwYwQqQ/I+/Wf/gDRuW7xRfhpQf+iKowIE/9iB6Tj4m1I7TgBMISQBRsGaSAIsDmb00r+w7vzHyX/s1BkIEPxjkRMJ/qbkRBIAUxBJgFGwJpIAi6O4M6vny1Ha+2MCEnh80zr514D6Q1ccFCD4xw5KYsHflJ5YAmAKIwkwCtZEEmBxFHNGn6Z1RWA9X97LQ8ICG1K3ukPwjx2PRIO/qSHRBMAUSBJgFKyJJMDiKNZMX7fsrYf/zw6o14OlQfliQP2hK44JEPxjByTx4G9qSTwBMIWSBBgFayIJsDiKM1MRuUB7u0cwPY5kxaJN8utg+kNHnBIg+McORyrB39SUSgJgCiYJMArWRBJgcRRkpiTmcdrhTJz8F85YOtYTgn/sgKQW/E1tqSUApnCSAKNgTSQBFkfYM6vmyzu1h28LpZelSB7W9/S9ofSHfrgjQPCPHYtUg7+pMdUEwFRAEmAUrIkkwOIIdya4+/6LfDbc0aJneQkQ/GPlUw/+ptbUEwBTCUmAUbAmkgCLI7yZvqPl1dqrswLq2Svbu+SGgPpDVxwQIPjHDkImwd/UnEkCYCoiCTAK1kQSYHGENVOpyGXaoxmh9EpvY/zV89bKy6H0h37kL0Dwjx2DzIK/qT2zBMBURhJgFKyJJMDiCGOm17yvonoCEEaHhntxTUidoS/5ChD8Y/0zDf6mBZkmAKZCkgCjYE0kARaH/zNHzpVTpCQH+9+T0R7cu2S9bByd4w8EOhAg+MfiZR78TSsyTwBMpSQBRsGaSAIsDr9nojL3/fd7BGl9WgIE/1jZXIK/aUkuCYCpmCTAKFgTSYDF4eeM3vnvt7Tlf+Bn62Nb/exLe0h/7CssRKAFAYJ/LFZuwd+0JrcEwFROEmAUrIkkwOLwb6Zr+Nu/3gAwmOkLF90t24PpDR3JRYDgH8uea/A3Lco1ATANIAkwCtZEEmBx+DPTd7hM17PlL/anxVO2NIoqct2Ua7ECApMIEPxjcXIP/qZVuScAphEkAUbBmkgCLA4/ZiozZYme/f8aP1rbVCu/vWSt/KSpNVkJgRgBgn8MiogTwd+0zIkEwDSEJMAoWBNJgMXhwUwprJP/VPwqD9RpoqMCBP/YgXEm+JvWOZMAmMaQBBgFayIJsDjcnek7Vt6qrTvB3Ra23LInq2+Q21reig0QUAGCf+xu4FTwNy10KgEwDSIJMArWRBJgcbg5UxmSK7RlJTdb10arIrmmZ4VU29iSTQouQPCP3QGcC/6mlc4lAKZRJAFGwZpIAiwOt2b05L85pZKc51arOmrNUFdNvtBRCWxcSAGCf+ywOxn8TUudTABMw0gCjII1kQRYHO7MVGbJuXr2/17utKjDlpRk9Zkb5ekOS2HzggkQ/GMH3Nngb1rrbAJgGkcSYBSsiSTA4nBm5nJnWpJEQ2qc/JcEY5HKIPjHjrbTwd+02IvfLPvnySXaVvMwEqcTFgOa0XTbrBdl8SmPyo6M6qOaCQRWHSPHRTW5b4KXfVz8o0Xr5a36waAHNZgQmFqA4B9r5HzwN632IqByJKBhB+NIQANJPgs0+H8wn5rTqTUqydUE/3RsQyyV4B87ql4Ef9NyLxIA01CSAKNgTSQBFkf2M/rtf3+tdWn2NadW47Zal3wltdIpOCgBgn/scHoT/E3rvUkATGNJAoyCNZEEWBwZz0RykdY4M+Na06zuaz1r5Pk0K6DsMAQI/rHj6FXwNz3wKgEwDSYJMArWRBJgcWQzoz+Ql6JILsumtmxq0ccYX51NTdTiswDBP3b0vAv+phfeJQCm0SQBRsGaSAIsjvRn+ufKyVrLYenXlFkN65bcL+szq42KvBQg+McOm5fB3/TEywTANJwkwChYE0mAxZHyzPBjf1OuJLvi9UZG3Pc/O24vayL4xw6bt8Hf9MbbBMA0niTAKFgTSYDFkc5M/7FygJ4pf2o6pedS6ovTavL1XGqmUi8ECP6xw+R18Dc98joBMB0gCTAK1kQSYHGkMFMTc+OfrhRKzqvIL54+IFvzqpx63RYg+MeOj/fB3/TK+wTAdIIkwChYE0mAxZHczF0LNPDX5OLkSsy/pGpFrs2/FbTARQGCf+yoBBH8Tc+CSABMR0gCjII1kQRYHMnMPP+KnKX3z3x9MqU5UEpJvtOzVh52oCU0wTEBgn/sgAQT/E3vgkkATGdIAoyCNZEEWBwJzJTCuvOf3vCXk/8S2C1CK4LgHzuiQQV/08OgEgDTIZIAo2BNJAEWR/szfXPlUA2YJ7ZfgnNb/vLZSG52rlU0KFcBgn8sf3DB3/QyuATAdIokwChYE0mAxdHeTKUsV+iWegFAIFNJrl02IDsD6Q3dSECA4B+LGGTwNz0NMgEwHSMJMArWRBJgcbQ203eczNItLmhtK6fXrsqQfN7pFtK4TAUI/rHcwQZ/09tgEwDTOZIAo2BNJAEWR/MzXUPyfl17v+a3cHtNvZXxrYs3yk/dbiWty0qA4B8rHXTwNz0OOgEwHSQJMArWRBJgcTQ3o/f9D+qxv3opIyf/NTf0wa9F8I8d4uCDv+l18AmA6SRJgFGwJpIAi2PymdXz5ShdY/7ka3n16uM/2CB3eNViGpuKAME/lrUQwd/0vBAJgOloPQmIZJn+WTPzTEIS0OROUIvkw02u6sVqevj/6l7eB16MVZqNJPjH6hYm+JveFyYBMJ1dPCDX6WVcJAEGY3giCRiRmODf1UfJPvqS+f0/lGlHeVC+FEpn6Ed7AgT/WLdCBX8jUKgEwHSYJMAoWBNJgMVhz1SnyYW6ZA97qddzfYs2ya+97gGN70iA4B/LV7jgbxQKlwCYTpMEGAVrIgmwOHbPlCK5bPec/3+Vypz85/8ott8Dgn+sXSGDv5EoZAJgOk4SYBSsiSTA4hDp767f9e/wcYt9nt206H5Z43MHaHv7AgT/WLvCBn+jUdgEwHSeJMAoWBNJwFiOcmCX/olcObZ7/F0cAYJ/7FgXOvgbkUInAAaAJMAoWBNJgHL0zZfX6gmjZ1kyfs9s3laRG/3uAq1vR4DgH6tW+OBvVAqfABgEkgCjYE2FTwIqNblURaZZKj7PRPKV89bKyz53gba3LkDwjzUj+O9iIQHYBUES0PBGKWwS0LdUKvrIH5MABDNV9Nr/YDpDR5oSIPjHMhH8x7CQAIzBIAkYgzH8ZyGTgMoTcqp2/6AGDX8XfO+sAXnQ3+bT8lYFCP6xYgT/cSwkAONASALGgUgh7xgY1n3/Iy79a9irA15A8I8dXIJ/DAsJQAwKSUADSmGOBPTPl0O09yc3CPi74NmX95RV/jaflrciQPCP1SL4x7JwEuAELJwYGANTjCRg+FbR4STGJfn8RXfL9pjxZFFgAgT/2AEl+MeyDC8M54Nukk62+xJHAhrkgk4C+g6X6Xry3x829NrfBVG1qs+/YApegOAfO8QE/1iW3QtJAHZbxP5FEtDAEmwS0DVbluq1/69p6LGnC/Spf9/q2SCPetp8mt2kAME/ForgH8tiLyQBsD1i50gCGliCTAKiKLA7/5U4+a9hzw1sAcE/dkAJ/rEsjQtJABpNYpeQBDSwBJUErJwv5p7/JzT00t8FP6sdLLf723xaPpUAwT9WiOAfyxK/kAQg3iV2aT0JEPmQvqhHV5lUwCQBX+oN4WTSSD4c2Igu71kh1cD6RHd2CfTqe+7VIl/Vc1YWgjIq8InF6+UTo3P8MaUACcCURPYKuoMt1/B/uS6t2a8Udu59R8yXz/rcez35b46+Ec7xuQ/j2r6zqypfHLeM2YAEjpgnn4pK0hNQlzrtign+vZ0WUrTtSQDaGHF+DhiHpr+d66Nzvb11bnmmnK+HdPYa1yufZ1eduVGe9rkDtH1igVXz5TJ9NbQjVhN3eOpXzGH/3qlXY43xAiQA40WanCcJGAdVks+snCtHjlvqxWypVP9A9aKtzTRS+3NVM+uxjn8C5lwVPVn1//vX8tRazGH/DmhJADrAIwmw8GaWyvL5Xs/OB+ifVz/x72irJ37P/HDhOvl3v7tA6+ME9ChVqRTJlfra7LjXC7iMb/4dDjoJQIeAJAEW4Dw9H+ACa4nrM6Fd+qf3/S9xkqrre11b7dND/xfqhr/X1sbhbcQ3/wTGVD8rmJIQqP8GXtITBD37BpxE38eV8VR1m7yx5yEZHLfcudlbuuVVgyV5Uhs207nGtdegrTN2yAGnPSgvtLc5W7kqcNcC6Xp+s/xEz/o/2NU2ZtguLvVLCJsjAAlBciRgFPIAPanu7NE5h//Q4H+xNi+U4K8HiOVrBH+Hd7gOmvb8K/qeIvgbQYJ/B/vR+E1JAMaLdDBfTwK4T4D+UOn+Gcrm91S9nNPbKxfidtNyVa6OW86yIAQ+EEQvOusEwb8zv4atSQAaSDpbwH0C6n7zVh0jh3Umme7Wq+fLH2gK8KZ0a8m09PsXbpCBTGukskwE+o+Wg7Si4zKpzN1KCP4pjA0JQAqo5kiAnq17hRatXzSLOemlSotd7jn3/Xd5dGjbWIGoXH8vFfl8LU74G7tDJPg3CUCCmGOLWjSgh2MLfMdATYAWjPVw6e+b5suB2p5TXWpTh215cXpN+josg80dFdCf1BY42rQsmmW++fdmUVER6yABSHHUi3wkQA99HN+3VCop8rZdtN4g39zK2cm2tdMptb7+9AHZ2s62bOOFQEgPqWoFnG/+rWi1sS4JQBtorWxS4CMBc6Y9Wf+m3QpX6uuaJ6jpkZmLU68ouwoifSiFufyUKUCBvvnyWu3WfgF2baou8c1/KqEEXicBSABxqiKKeiRgaKd7JwLuX64/Pe11U42ZR6/f2bNefuxRe2lqCwJdNTm0hdVDWZVv/hmNJAlARtBFPBKgtwZ+TUa8TVdTDuzOf/r7MPf9b3r0PVzRwfdQyop8808ZeGzxJABjNVL+u36fgEiWaTXFeJRwTfZMmbSl4ld2y1v09/J3trSR2yv/4pma3OJ2E2ldRwKRzOloe7825lK/jMeLBCBj8EL9HFCW6RnzTl5dWT6oK4R0OdU1ywZk5+Sd5lWfBfSbQpfP7W+h7Rz2bwErqVVJAJKSbKGcwvwcEMnmFlhSXbXvOJmlkf/8VCvJtvAhvY7humyrpLYcBLblUGfWVXLYP2vxXfWRAOQEX4QjARpwX86Jt6Hayk45R8/+37fhBV8XRHLL4rXylK/Np93NCZRr7iTRzbW45bX45t8yWXIbkAAkZ9lyScEfCYjk8ZZR0tsgqHupc/JfejuKSyWXS/JTl9qTcFv45p8waKvFkQC0Kpbw+gEfCYi2dcmjCXO1VdzKuXKkbjivrY3d3OixTevlTjebRquSFBicXn8PhXjSMCf8JbmjtFkWCUCbcEluFuiRgB+ft9aNnwD0csRzkxyv3MuK5KreolxJkjt2vg3oWSPmHIDQ7vPAYf98d6vR2kkARiny/SPASwTvyVfUqt3pBxNZLZ16ZnupIl+cejXWCEVAL1116b3UKSuH/TsVTHB7EoAEMTstKqSfA/Rpe9/q1COJ7fuPlQO0nDcmUZYjZfQtul+ec6QtNCMDAT0P4I4MqsmiCr75Z6HcQh0kAC1gZbFqCD8HmLP/N+/pRgJQqsrvZjFuWdWhP2dcnVVd1OOGwEt7yO3akhfdaE3breCbf9t06W1IApCebdsl+34kQA9Z3njR3bK9bYAEN9S2vCXB4vItKpLv67f/Nfk2gtqzFjDvJT2itiLrehOsj2/+CWImWRQJQJKaCZbl8ZGAalSRf06QorOiSvKmzgpwaGvu++/QYGTelE9qjT5eDcA3/8x3leYrJAFo3irzNX08EqCH//uWrJWfZI41QYWlSF41wUu+Ld7cNUO+5lujaW8yAksG5Ed6NOumZErLrBS++WdG3V5FJADtuWW2lWdHArZFVfmzzHCaqEg/NIN4mIr248tn3hv8XeGaGNHirlKJ5H9q7534aa2JUeCbfxNIea9CApD3CDRRvy9HAvR3yr9avNGxO5eVJIh9XDvByX9NvFdCXmXhgDymR7T+wYM+8s3fg0EyTQziw9ET646a6cGRgHtqh8g/dtTJNDaO5JU0is24zH9ftF5+kHGdVOegwDMif6PNcvlEUL75O7jfTNQkEoCJZBxc7uyRgEh+3lWVc3pWSNVBtpccbFNrTYpkeWsbsHaoAubxz9XhO1u6eC8Ivvl7tuORAHg2YK4dCTDX/Oud6U49c6M87SjlY462q9lmPTPrJelvdmXWC1+g5355QkryXu2pS0e3+Obv4a5HAuDhoJkjAfpo22Xa9Ly/cb+oJ6edotemf99VRn1q3o9cbVsz7dL2f/6UR2VHM+uyTnEEFq+TdZp8L9Ieb8m515EmI3++eL18Iud2UH0bAiQAbaC5sEk9CRB5v7Ylr+DwlH4AvUPf+Pe64DFhGyJZN+Fr7r9QGyrJNe43kxbmIaDnhdyhdwZ4l9ad188BQ/oF4FJNRv4uj/5TZ+cCJACdG+ZWggbflXok4HhtQKaHuTXw36n1HuPDiWkL18tD6vOr3Aaps4q/WT/c21kZbB2wwOIN8h+VkhytXbwv424+qVcknLhkvXwh43qpLkEBEoAEMfMoSo8EbCgPyTwNyuaNqAl5qtNL+qb/E30W/cla7y9SrSmhwtXFmHw7oeIyLSYqcfJfpuCeVnbWOnny2UgWaPN79b+07xOgu6V8oTpNjtLzkb6n9TF5LKBjyRSKwIr5cnx5+DrhExLu0w6NotdPq0jvmWv9+za9ap68W9v/rwmbpF3cz6pvkEMcvbIi7b5TfpsCfXPl0EpF/o+mveYR2El/wbtHf3L4U3PUoc3msZljAiQAjg1IEs1ZNV/eGdXkT/TknFO0vBkdlPmUnoT21cqQfMbhs/yn7F6vfhAe0a0/k5Tk4ClXdmWFSD6mR1nceaaCKy60oymBVXPlt6OyfERX7tH/9mlqo/iVtulRv1v0a/+nnT/fJ779LJ1EgARgEhzfX7r1CNl3cKacpd8GTtJvwO/U/hw4RZ+26esbdae4S4PlHZvWyXd7/XwASUM39SjAFWrw2YYX3FzwbHWbvKHnIacu83JTilZNKnD9Apm511a9ZLCqJwuW5ERd+TD9rzLpRiKP6+v3aPJ/59ay3HLeWnl5ivV52VMBEgBPB66dZvd1y97lirypVJPXaTDcQzP7PTSzf0H/fb5akf98+H75aW8gAX+8T/2D8JX6Q4oOGP+ac/N8+3duSEJp0O2Hyozt++hngH4Z0Ft3z9GjBPvqv5vLNdmsnwVPV7fLIySeoYz21P0gAZjaiDUCEVjVLUv1Q67P8e78bMugvPWCTblf3+04E81DAIFOBZI+SaTT9rA9AqkJ6FnLK7Tw21OroPOCa/rA9wsJ/p1DUgICCEwtQAIwtRFrBCRQGpSLtDtO3rZYD8f989L1cndA3HQFAQQcFiABcHhwaFryAos2ya/1a3aed1CcqFP3znxRPj7RiyxHAAEEkhYgAUhalPKcF1i6Qb6r37bfpw2tOtHYSL4/Y4eczj3/nRgNGoFAYQRIAAoz1HR0rIDexvhmvczpQl22c+zyrP/WKzAeni7yrtMelBeyrpv6EECg2AIkAMUe/0L3ftE6uUGTgDP0aEAu1znrpZjfHJou7zh9QJ4t9EDQeQQQyEWABCAXdip1RUCTgG9FVTlS27M2wzaZR6j+/Q/Wy2k9a+T5DOulKgQQQGBUQL/8MCGAgLlByra95X9oYP5z1ZiVosgGPQnxI+Y8hBTroGgEEEBgSgESgCmJWKFIAqvfLgdHQ/JxPTx/vvZ7WmJ9j+TnpbJ8XG+v/KXeQO+2mJgVBSGAQCYCJACZMFOJbwL9R8tB0iWX6tPPztOjAge32f6q3nnwm/qExuv2nSO3nXi3DLVZDpshgAACiQuQACROSoEhCeiRgFL/MdKtz084SYP5O/Ss/bdq/w7S/+IeqLJFlz+s69xXK8t908pyj4+PTw5p/OgLAghMLEACMLENryAQK2DOF3jl1fKq8qDsqU9anF6bIc9t0f8uulu2x27AQgQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAA6eprwAAAPiSURBVAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQACBIgj8F1Cx5oYEZQsEAAAAAElFTkSuQmCC" preserveAspectRatio="none" id="img3"></image><clipPath id="clip4"><rect x="0" y="0" width="371301" height="371301"/></clipPath><clipPath id="clip5"><rect x="-0.363636" y="-2770.64" width="371302" height="374073"/></clipPath><image width="512" height="512" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAYAAAD0eNT6AAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEbAAUAAAABAAAAUgEoAAMAAAABAAIAAIdpAAQAAAABAAAAWgAAAAAAAACQAAAAAQAAAJAAAAABAAOgAQADAAAAAQABAACgAgAEAAAAAQAAAgCgAwAEAAAAAQAAAgAAAAAAGcBUEAAAAAlwSFlzAAAWJQAAFiUBSVIk8AAAQABJREFUeAHt3QmcXFWd6PF/VXVWwq6OozAgIjoiDKQTEBjHwCij7Nla2Yc1Ks6Mz+d7b5aPb9pZ3rzZfM8NCKC4IWMnnYAs6jAIjEImJJ1gEFRkGQVxYSdk63TVnf+pTnf6dN3uruUu55z7u34wfW/de5bvuVX/f926iwgTAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIAAAggggAACCCCAAAIIIIBAwgKlhMujOAQQQCA1gUikdNsRss+OmbJvKZLZUUm2Vrtkx8Nr5Be9IrXUKqZgBAIUIAEIcFDpEgKhCNx8tLyuWpF3RZGcKCU5Uvv1Fv1vdkz/duiyJ/S/72uS8L1yTe5ctEF+GLMeixBAYJcACQC7AgIIOCVw+6EyY9u+crZEcoE27J36X7nNBm7S7W6oTpPretbI822WwWYIBCtAAhDs0NIxBPwSuH6BzNx7i3xYv+1/VFv+mwm2/hU9KnBtrSp/17NRnkmwXIpCwGsBEgCvh4/GIxCGQH+3nKqH+D+tvTkktR6V5AU9b+Djm9bLVb2cL5AaMwX7I0AC4M9Y0VIEghMw3/r33CJ/r4H5j7PqnH7o3V2pyrlnbpSns6qTehBwUYAEwMVRoU0IFECg/1g5QKpym3bVnNyX9fQrPbHgjIXr5f6sK6Y+BFwRaPfkGlfaTzsQQMBDgZXdejZ/Ve7VpucR/I3Yb+g1g3eu6JaTPeSjyQgkIsARgEQYKQQBBJoVuGm+HFiN6sH/wGa3SXG97bWanLx0g3w3xTooGgEnBUgAnBwWGoVAmAJ9x8l+lZ2yRnt3mDM9NCcHVuUE7hvgzIjQkIwE+AkgI2iqQaDoAnopXqkyKNepgzvB3wxKJPtGZVlxS3fsDYaKPmz0P2ABEoCAB5euIeCSgF7q9wG91G+hS20a05bDB8vyT2Pm+ROB4AX4CSD4IaaDCOQvsOpIeU00XX6sLdkn/9ZM2IKa3oTo+CUDsnbCNXgBgYAEugLqC12ZRMAcfr15vhygd0N7kx7ufL1+E5ulH3Z7l0rykmaBW/T1J6sij/Ssk19OUgwvIdCWQDRD/lYPtbsc/E2/yvp++JT++/a2OunBRnctkK5nXpaD9YP/t/RzYI7ef2GGfhYMVkvyin4ePCXb5Imeh2TQg67QxAQEOAKQAKKrRdQfpFKWJfrEtN/XNv6u/rdfE239pX4gmDOi/02fsraSe6g3IcYqkwrsut7/MV1p+qQrOvJiVJP3LNkg33akOR01o+9wmV6eLX+ghbxLE7Df0w/8w/XvaZMUOqSv/Vg/A/5d1/+OPkfhNv0M2DbJ+rzksQAJgMeDF9d0802/f568Vwf2I/r6SfpfJW69JpeZbwK362VSn+QyqSbFWK1BYOU8+aTuj/+t4QV3F9y1eH39veNuC6do2apj5DD9Rv8RDeLv01WbSfwnKnGzltFfLsunFq6TByZaieV+CpAA+Dlusa02NzXRQ5j/pIN6ROwKnS28Tz9QPsrvo50hFm1r/QY6pzJLDy2L7O1R3yP9Bnzo4nXyuEdtrje1b64cWinL/9UZc7Jl0id5f0uPjvwvPTqyyTcX2hsvQAIQ7+LV0puPld8YGpKr9EMr7TOs9eZpcv22inz0vLXysldINDYXgf75crl+g1yeS+WdVKoPDVo8IH/TSRFZbmsO9XfNkv+tRwA/pvXOSLFu8xPB1VsG5U8v2CRbUqyHojMQSDpDzKDJVDFWQD9g3zVU1UNz6Qd/U63ZXy6ZVZWB1XOle2w7+BuBWIFIlsUud31hSd7rehNH2rf67XJwZbbco8H/L3RZmsHfVGlOHP/wnOn6GTBfjjILmPwV4AiAv2MnGvz/UL9dXatdMG/KrKcdegbx+YsGZEXWFVOfHwL9c/Vs+nL9rn9+NNhu5ZAe6drf9SNd+hnwe/oZcLM2PY8rLLZpADl70fp6/bYec14IcATAi2FqbKSeWPUX+sa/Xl/JI/ibBs3Qqwtu1HZc3Ng6liCgAmX5oMcOXTN2yu+43H492fcM/QwwVyvkEfwNzSw96tCv7bjEZSfaNrEACcDENs6+om+4Xs28Xfh9sqLtuFbbs8RZLBqWi8CtR8i+WrHX+0Wlok8sdHRaMU8WaNO+rv/NzLmJ5iqja1fNkytybgfVtyFAAtAGWp6bmOCv9f9lnm0YV7fZh76ij3c9dtxyZgsssGN6/cjQbJ8J9NvtG11svwbbt2nUNYf98w7+Izz6a6B8ZpW51TOTVwIkAB4Nl4PBf0Rvpl5+uMI86W1kAf8WV0CDQUn/d7n3AvqQINf6cP0Cmam+N+h/eznWtpL+JHglRwIcG5UpmkMCMAWQKy87HPxHiA7UJ719emSGf4srsHqe3nXOtSf+tTcczt27YK8t8kntypHtdSf1rTgSkDpxshWQACTrmUppHgT/4X6X5Nxdv02m4kChfgjot9MP+dHSKVvp1D3x9WmKc/WkP9cvq+RIwJS7lTsrkAC4MxaxLfEm+O9qve5Qn6wfAo7tDQtDF9CfgV6vfTwtkH66dQ/8knxGXX34zOZIgCdvAB92Jk8ok2+mb8F/l8DRN82T9ySvQYk+CHQN1n/7z+vS1ESJNIr9MtECOyhMv/2fqJsf30ERWW/KkYCsxduojwSgDbQsNvE0+Ndp9AjAf8/CiDrcEjCPmtWxD+aacL3vtTvPAih5+Z7iSIBbb9GG1pAANJDkv8Dn4G/0NAic1H+0HJS/JC3IUuCFLXpjmlL9J4Asq02tLo1eP06t8BYK7psvr9XVfT2qxpGAFsY661VJALIWn6I+34P/ru6V9JdKr28CM8Uw8XKcQBTMyX+md9uqO2RDXDezXqYf0uaRvuaGO75OHAlwdORIABwamECCf11U7wtwskO0NCVlgfrz5/XIT8rVZFd8Sdb0PCROXAWg0dPXb/9jx4sjAWM1HPmbBMCRgQgp+BtS/Rng+OXdMs0RXpqRskCtVr8LnN4ZOpip34We9C0Vc7ttn07+m4xt+EgAtw2ezCjT10gAMuWOryy04L+rl3P2L8th8T1maUgCeunfLA1SFwbUp6GusjiRAMjP5M2aTLt2179OhpqfAzrRS3hbEoCEQVstLtDgX2coR3Joqx6s759AZaj+G3Uwt4HWgHvbmWvlVy6MRHkoyCSanwNc2Lm0DSQAOQ5EyMHfsNZKcnCOvFSdlUDk9WN/G5VqclXjwnyW6Lk0B+dTc+q1ciQgdeKpKyABmNoolTVCD/4GrVQL6tBlKvuB74Wuni9HaR+O8b0fY9r/+A82yB1j5vP+07nnESQIwpGABDHbKYoEoB21DrcpQvCvE5Vljw6p2NxxAb1ZTlDPgdfD/1f36sErh9i9fqRyE44cCWgCKa1VSADSkp2g3MIEf9P/SLZPwMDiAAT6umVvHeOzA+jKSBd2lAflSyMzLvyrJ1fudKEdKbeBIwEpA09UPAnARDIpLC9U8Fc//f3ylRQYKdIRAb3hvznzP5ijPBpsVyzaJL92hLfejKgkW1xqT4pt4RLBFHEnKpoEYCKZhJcXLfgbvqgmTyfMSHEOCWhwutyh5nTclGrJnZP/RjqjSYkzDyQaaVOK//JzQIq4cUWTAMSpJLysiMG/TliWnyRMSXGOCKyYJwu0KYc70pwkmrFp6Tq5L4mCkiyjWpVHkyzPg7L4OSDDQSIBSBm7sMFfpNo1XX6UMi/F5yRQCe3SP5Erc6KctNpKlzysK+i5iYWaOBKQ0XCTAKQIXeDgb1Q3nnmvbE6Rl6JzEjBPp9PD/2flVH0a1W7eVpEb0yi40zIX3S/PaTT8YafleLg9RwIyGDQSgJSQCx78jep3UqKl2JwF9A6Pl2gTpufcjOSqj+Qr562Vl5MrMNmS9Ov/3cmW6E1pHAlIeahIAFIAJvjrLSZr0pcCLUXmLNCrQ6ufypfm3IxEq9fn7F6daIHJF7Yy+SK9KZEjASkOFQlAwrgE/zroIws3yEDCtBTngMDb5supEtbtab931oA86ADthE14cEDu0RefmnCF8F+oHwnQz9Zl4Xc12x6SACToTfDfhVmSTyXISlEuCYR28l/k3qV/44e7d/jOhM48n2B8+zKa1ysi5apVPEo4UW4SgIQ4Cf6jkM9Mr8kXR+f4IxiB/qPlIP0UPjmYDok8+/KessqH/uhTAc1VCkU/qZYjAQnvrCQACYAS/HcjRpH85ekDsnX3Ev4KRqCr/tQ//ck8kKkkn7/obj9uV73wAXlRLwb8h0DkO+kGRwI60Ru3LQnAOJBWZwn+u8X0bOUHaofINbuX8FcoAn2H18/6vyiU/mg/Ir3JznU+9WfWS/KPmgRwcy29y7h+1nyGnwM633tJADowJPhbeNsrJbmoZ4VUraXMBCFQmSlLNPi8JojOaCc0gHyrZ4Nfd9k75VHZURu+BJP3GElAIm9FEoA2GQn+Npwe+v/ownXygL2UuWAESvXD/8F0R69k8PKkuqUb5Ls6CH8dzkB01BPOCeiIT6/p7XD7Qm5O8B837PphumTAzw/UcT1hNkag71h5qy4+IeYlXxc9WTtYbve18Q+ul7/WyMd9NoYHkHMCOtiRSQBaxCP4jwMryYoH18mHxy1lNiCBSq0+vuaDNohJD/8v9/mnql69LHDmS3KBHsXgbpvDeyRHAtp8Z5IAtABH8Lex9IN01bM1Obd3+Dpl+0XmghDQk//m6KfruUF0ZrgTO6dV5Xrf+2POB9DLbU/Xftzle18Saj9HAtqAJAFoEo3gb0OZ4P9cJO9fNiA77VeYC0mga6acp2O9Vyh90mRm9Zkb5ekQ+mMut50eyWkcCRgdzfqRAK4OGPWY8g8SgCmJRAj+NhLB3/YIeU6f+nd5SP2renry30RjUE8CzJEAfg4YISIJGJFo4t9gftdroq9trULwt9kI/rZHyHMr5svx+uS/ewPq4w8XrZfD9UNPd+Owplu6ZfZgWW7Rnp0UVs/a7o3mrvJHOt6fa7uEAmzIEYBJBpngb+MQ/G2P0OcqAd73P8Tgb/ZDjgQ0vBs5EtBA0riABKDRpL6E4G/DEPxtj9DnVh0j++uYLwmon1tnDMpXA+pPQ1dIAhpISAIaSOwFJAC2R32O4G+jEPxtjyLM1WpysfZzZjB9LcnXTntQXgimPxN0hCSgAYYkoIFk9wISgN0W9b8I/jYIwd/2KMKcjnlJPzUvC6mv5apcHVJ/JusLSUCDDklAA8nwAhKAMTAE/zEY+ifB3/Yoylz/XH3kb0neFFB/71+4QQYC6s+UXSEJaCAiCWgg4VbAoyQE/1GK+h8Ef9ujUHNl7vsfwniTBDSMIknAOBKOACgIwd/eKwj+tkeR5vqPlQP0TPlTA+rzi3rHvMLeN58koGFPJgkYQ1L4BIDgP2Zv0D8J/rZH4eZq9Rv/dIXSb92frzdBMJT+tNOPkSRAE7s729k+wG1IAnYNaqETAIK//dYm+NseRZu7a4F0yfDZ/6F0Parpg39C6Uwn/TBJwLRIziAJGFUkCVCKwiYABP/RN0L9D4K/7VHEuRc2y0I9+e/1ofRdg913etbLj0PpT6f9IAloECx8ElDIBIDgb78RCP62R1Hn9N6pHwyp7/rt/6qQ+pNEX0gCGhQLnQQULgEg+NtvAIK/7VHUuZXd8hbt+4KA+v8LfVrlNwLqT2JdIQlooCxsElCoBIDgb+/4BH/bo8hz+kHwAe2/HjUPY9JP9Gt5VPXEY0kS0GBTyCSgMAkAwd/e4Qn+tkeR5/qOk1l6+P/8gAyGoi65NqD+pNIVkoAG1sIlAYVIAAj+9o5O8Lc9ij5X3ilnq8F+wThEcsvitfJUMP1JsSMjSYBW8W8pVuNT0YVKAoJPAAj+9nuP4G97MFc/7h/UyX+lEif/tbJfmySgOk3O0G1IAobhCpMEBJ0AEPztjwGCv+3BnMjKeXK0OswLyOKxTeu54U2r49mzRraRBFhqhUgCgk0ACP7Wzswd/mwO5nYJ6AfAh4PCiOSqXtHbGTG1LEAS0EAWfBIQZAJA8Ld3ZL752x7MDQusPkr20X3jfQF57KjW5MsB9SfzrpAENJAHnQQElwAQ/O0dmOBvezC3W0AP+V6oc3vsXuL9X1/v2SjPeN+LnDtAEtAwAMEmAUElAAR/e8cl+NsezNkC+ql2mb3E87kaJ/8lNYIkAQ2SQSYBwSQABH97hyX42x7M2QL93XKiLjncXurxXCTfX7xB/sPjHjjXdJKAhiEJLgkIIgEg+Ns7KsHf9mAuRqAc1n3/9bFmV8b0kkUdCpAENAAGlQR4nwAQ/O0dlOBvezDXKNA3X16rl4Wc1fiKt0s2d02XG71tveMNN0nArBflNG3mbY43NavmBZMEeJ0AEPzt/Z3gb3swFy9QqdV/+58W/6p/S3W///KZ98pm/1ruT4tPeVR2aBKwWFtMEjA8bEEkAd4mAAR/+8OD4G97MBcv0LdUKvrIn0viX/VzaVcky/1suV+tJgloGC/vkwAvEwCCv70jEvxtD+YmFuh6on4o96CJ1/DrFb3t73fPGpAH/Wq1v60lCWgYO6+TAO8SAIK/vQMS/G0P5iYX0Kf+BXXf/5re+W/yHvNq0gIkAQ2i3iYBXiUABH97xyP42x7MTS7QP18O0ZP/3j35Wl69+szsF2WVVy0OpLEkAQ0D6WUS4E0CQPC3dziCv+3B3NQC+gn1AV3Lm/f8lD2K5DoTiKZcjxVSESAJaGD1Lgnw4sOA4G/vaAR/24O5qQVuP1Rm6OH/C6de05s1atWKXOtNawNt6EgSoPvWrYF2sdVueZUEOJ8AEPzt/Y/gb3sw15zA1n1lqR7+f01za3ux1jd77pcnvGhp4I00ScDsF2QJScDoQHuTBDidABD8R3eo+h8Ef9uDueYF9BMpqJP/NJnh5L/mhz/1NUkCGoi9SAKcTQAI/vYORfC3PZhrXmDlXDlS1z6++S2cX/Nn1UPkW863smANJAloGHDnkwAnEwCCv70jEfxtD+ZaFAjtvv8lubpnhVRbVGD1DARMElDbKov5OWAU2+kkwLkEgOA/uuPU/yD42x7MtSbQd7jM0Tf5Oa1t5fTag11l+YLTLSx443oekkGSAGsncDYJcCoBIPhbO43+zCmrnovk/csGZKf9CnMINCdQninn6360V3Nre7HWqjPXyq+8aGmBG0kS0DD4TiYBziQABH97hyH42x7MtSegt8o11/6HM5U4+c+XwSQJaBgp55KAUkMTc1hA8LfRCf62B3PtCazqlt/V32K/297WTm71w0Xr5XD90NK3CJMvAvoz1PTybOnX6GceKcyk+6/uw3+k+/Ln8sbI/QgAwd/eBQj+tgdz7QtEgZ38p++NKwn+7e8PeW3JkYAGeWeOBOSaABD87R2D4G97MNe+wC3d8ir9nryo/RKc23LrzB1yg3OtokFNCYwkAbryLU1tEP5KTiQBuSUABH97Dyf42x7MdSagZ41eoiXM7KwUd7aOIrnhtAflBXdaREtaFTBJQHWbLNHtSAKG8XJPAnJJAAj+9luH4G97MNeZgO5PJf2R8dLOSnFr60oky91qEa1pR4AkoEEt1yQg8wSA4G/vAAR/24O5zgVumifv0VIO7bwkZ0pYu3CDDDjTGhrSkQBJQANfbklApgkAwd8eeIK/7cFcMgI1Ceu+/3r4n/v+J7NrOFMKSUDDUOSSBGSWABD87QEn+NsezCUjcNN8OVBLOiWZ0hwopSQvzBBZ4UBLaELCAiQBDaCZJwGZJAAEf3ugCf62B3PJCdQiWaalVZIrMfeSvnD6gGzNvRU0IBUBkoAG1kyTgNQTAIK/PcAEf9uDueQElnfLNN2/LkquxNxLivROhtfk3goakKrAmCTgG6lW5E/hmSUBqSYABH97jyP42x7MJSuwf1kWaomvS7bUXEv7t0X3yyO5toDKMxHYlQQs1cpIAobFM0kCUksACP72+4bgb3swl7xAOQrr5D8pc/Jf8nuJuyWSBDSMTepJQCoJAMHfHkiCv+3BXPICK7vlLbqfvTP5knMr8RfPVuXW3Gqn4lwESAIa2FNNAhJPAAj+9gAS/G0P5lISGL7vv94qP5hpOY/BDmYsW+oISUADV2pJQKIJAMHfHjiCv+3BXDoCet//2Rr5z0+n9FxKHapOk+tyqZlKnRAgCWgYhlSSgMQSAIK/PWAEf9uDufQEBkXO0Qf/7JteDdmWrO+db/SskZ9nWyu1uSZAEtAwIoknAYkkAAR/e6AI/rYHcykLlOUDKdeQafF66R93/stU3N3KSAIaxibRJKDjBIDgbw8Qwd/2YC5dgdXz5Bj99t+dbi2Zlv7Yg+vkO5nWSGVOC5AENAxPYklARwkAwd8eGIK/7cFc+gJ6578Ppl9LdjXoff+v7BXRxxkwIbBbgCRgt8WuvxJJAtpOAAj+9oAQ/G0P5tIXWH2U7CMl6Um/psxq2FauyJcyq42KvBIYSQL0s/ZmrxqeXmM7TgLaSgAI/vaIEvxtD+ayEYi66rf9nZ1NbenXor/99+md/55LvyZq8FXAJAG1bdJDEjA6gh0lAS0nAAT/Ufj6HwR/24O5bAR0vzPX/JsH/wQz1Wqc/BfMYKbYEZKABty2k4CWEgCCvw1P8Lc9mMtOYOV8OUn3vzdnV2O6NWlfHlgyIGvTrYXSQxEgCWgYybaSgKYTAIK/DU7wtz2Yy1YgtPv+66fX57IVpDbfBUgCGkaw5SSgqQSA4G9DE/xtD+ayFejvlt/UGs/IttZUa3tpy065MdUaKDxIAZKAhmFtKQmYMgEg+NvABH/bg7nsBaKyXKa1Tsu+5tRq/PIFm2RLaqVTcNACI0mAdvKmoDvafOeaTgImTQA0+P+l1mn+Y1IBgj+7Qd4CfUulou/ui/NuR5L169mM1yRZHmUVT8AkAdVt8j7tOUnA8PA3lQRMmACsmidXaDm9w2Xx/wR/9gEXBMqPy+najoNcaEtCbbhn0Xr5QUJlUUyBBUgCGga/ngToo8IvbHhl14LYBEB/YzxHA95nJtqoaMsJ/kUbcXf7Wy6Fdec/vZiR+/67u7t517IxScA3vGt8Og0u6f01rtEv9O+OK95cS2xNfcfKWytVWacLg7nBiNXBFmcI/i2CsXpqAqu75Y21kjyiFcQm7qlVnF7Bz8x6UQ485VHZkV4VlFxEgb7DZXpllnxd+35WEfvf0OeSvCBDcvTijfLTsa9ZHyS3HyozNPgbNIK/IhD8x+4q/J23QK1c//ZvvWfzblOH9V9L8O9QkM1jBUaOBOhnOLcNNkL6uPBSl3zFnEM0Fsz6MNm6j3xMX3zb2BWK+jfBv6gj72a/TXKub+IL3GxdW62qlbvk2ra2ZCMEmhAwScBzkSzVVTkxUBH0QVvvqDwhl46lG00A+o+VA/T3gD8b+2JR/yb4F3Xk3e339n3qZzi/2t0WttayqCS3L/wP+c/WtmJtBFoTWDYgO83VARwJGHX7277jZL+RudEEoDRU//a/x8gLRf2X4F/UkXe73xowg3rsrx7N4OQ/t3e5YFrHkQBrKPev7JQ/HllSTwBuPUL21Q+YS0YWFvVfgn9RR97tfq86Rn5HA+bb3W5lS637We0N8u2WtmBlBDoQMEcCno3qj87m5wCRP9KTJOcYznoCsGOmnK1/1xd0YOz1pgR/r4cv6MZHNflQSB3U3yKv6lkh1ZD6RF/cFxhNAiJZ7X5rU23hfl2zZJGpYfgngEjOSbU6xwsn+Ds+QAVu3s0nyJ7afZOghzINTuuS60PpDP3wS6CeBIieT1PwJEBjXv0zpXxLt7xKh/B4v4YxudYS/JOzpKTkBao76mf+myQgjKkkK89cK78KozP0wkeB0SSg2FcH/L75GaA8WJF36CA23BDIx4Fttc0E/1bFWD9rgZrIsqzrTLO+Uo2T/9L0pezmBPg5QKbpzwDHlfVQyLHNkYW1FsE/rPEMsTcr5so7NDM/IpS+6Y3JH144IPeG0h/64bfA6JGAgv4coDHwBJMAvNnvYWy99QT/1s3YInuByvCd/7KvOKUa9THGn9OERt9+TAi4IVDwJODNZX03vtGNocimFQT/bJyppTMBc26O7qv1M3U7K8mZrV/ZVpavOtMaGoLALoHCJgEleZM+XEz2LcqeQPAvykj738/BslymvZjhf09Ge3DDeWvl5dE5/kDAIQGTBFQPqd82+F8cala6TYlkv7Ke/leIu/8R/NPdlyg9OYFec3luVE8Akis075IiuSbvJlA/ApMJmHtTVN8g5+k6RUkC9jQfNBobw5/0SMeMA18K5jGq4Q9YgXt4RLe8V7v/hoAI1iwekA0B9YeuBCrwwuNifhafHmj3xnerZG4EtGX80kDnT922j/TXn6oWaAfpViACod33X7j0L5A9M+huLO+WaXpTnK/rl8WQzr2ZbMxeMQnAs5OtEdhrp+ojj79mBjqwftGdQAT6uuW3tCvvCaQ7phvPvTxHVgTUH7oSoICJCfuX9NB/SRYG2L2JuvS8SQAem+jVEJeb7O5VJVnNkYAQR9f/PlVEPqC90H/CmPS+/9dfdLdsD6M39CJEgQJ+8x8ZxsfMSYA/Gpkr0L8cCSjQYPvSVb0153R9P17sS3ubaGdUrsi1TazHKgjkIlDQb/4j1o+UNUNfMzJXpH85ElCk0fajr7ue0PUbfrR26lbqe+yORffLI1OvyRoIZC9Q4G/+w9iR3FfeXpHv6dxQ9vxO1MiRACeGgUYYAT37+IMhSdT0sb8h9Ye+hCNQ8G/+ZiCr+jzu75V33ZzjrnCGtrWecCSgNS/WTkdg1Vz5bS3ZPJgrlOnp50RuC6Uz9CMcgcJ/8zdDWZJ7egbkJXMSoJRKcqP5t8ATlwgWePBd6LreJ/9D2g7NR8OYopIsN3dXC6M39CIUgZHgr++0Ip3t3zB8+kFTv9lRPQGYVpOv6xpFuhywAUQXkATEqbAsdQG97/9s/UA6N/WKsqtgqNYln8+uOmpCYGoBDvsPG2nwf7m0c/jS3HoCcPqAbNUPIH6vIwmY+l3EGokL7Cxr8I+CeibHTT1r5OeJQ1EgAm0KjHzzNz/5tllEMJvpuUZXL3xAXjQdqicA5g/9GeBT+s/z5u+CT5wYWPAdIOvu6xtyWdZ1pllfqcaXiTR9Kbs1Ab75W16bqyX5fyNLRhMAvVznOT0K8PGRF4r8r8kSuVlQkfeA7Pq+sluO1W//3dnVmHpNjy7cIIU9qTh1XSpoSYBv/jaXftH/q5518suRpaMJgFlQPViW6z/mskAm/TmA2wazG6QtUC6Hdemfen1OE2g9qMGEQL4CfPO3/fVN+cDQVvn02KVWAmAeh6g3IT1bV9AreJg4EsA+kKbArUfIvnojrqVp1pFx2duq0+TLGddJdQg0CPDNv4Fki6blZ/c8JINjX7ESAPPC4rXylAY+kwRYK47dqGB/cySgYAOeVXd3TK/f9nd2VvVlUM+/6Ml/nEeUATRVTCzAN/8Gm1opkouWDDTe9r8hATCbLlovd2gScKH+WWsoqoALOBJQwEFPuct6OK6k/7s85WqyLZ4ribL1prYGAb75N5Dox4x8bNFA/BM5YxMAU4QmAeZGAeYDiiTAgHCJ4LAC/5+IwOp58i4t6LBECnOgEPP74uJ1ss6BptCEggrwzT924Hs1lo+e9T9+jQkTALPi4vX1m3mQBOxW42ZBuy34qwMB/e0/qPv+67eMz3bAwaYIdCTAN/9Yvl6N4Z+IfWXXwkkTALMOSUADH0lAAwkLWhHo75bf1ONyp7WyjePrvrRlcPjWoo63k+YFKDAS/PU9Vejb+44b2imDv1l/ygTArEQSYBSsiSTA4mCmJYFS/cY/01raxuGV9b7/X7pgk2xxuIk0LVABgn/swDYV/M2WTSUAZkWSAKNgTSQBFgczzQjctUC69HKcS5tZ16N1rvGorTQ1EAGCf+xANh38zdZNJwBmZZIAo2BNJAEWBzNTCbywRc7QQ5Wvn2o9X17X3/7vXrJOHvKlvbQzDAGCf+w4thT8TQktJQBmA5IAo2BNJAEWBzOTCYR28l+NS/8mG25eS0GA4B+L2nLwN6W0nACYjUgCjII1kQRYHMzECazuljfq8pPiXvN02S+fq8lqT9tOsz0UIPjHDlpbwd+U1FYCYDYkCTAK1kQSYHEwM15AT5b7kC5r+z03vrzc50ty3bIB2Zl7O2hAIQQI/rHD3HbwN6V19GFEEtAwICQBDSQsMAJ9x8ksvVmOubtmKFNVhuS6UDpDP9wWIPjHjk9Hwd+U2FECYAogCTAK1kQSYHEwYwQqQ/I+/Wf/gDRuW7xRfhpQf+iKowIE/9iB6Tj4m1I7TgBMISQBRsGaSAIsDmb00r+w7vzHyX/s1BkIEPxjkRMJ/qbkRBIAUxBJgFGwJpIAi6O4M6vny1Ha+2MCEnh80zr514D6Q1ccFCD4xw5KYsHflJ5YAmAKIwkwCtZEEmBxFHNGn6Z1RWA9X97LQ8ICG1K3ukPwjx2PRIO/qSHRBMAUSBJgFKyJJMDiKNZMX7fsrYf/zw6o14OlQfliQP2hK44JEPxjByTx4G9qSTwBMIWSBBgFayIJsDiKM1MRuUB7u0cwPY5kxaJN8utg+kNHnBIg+McORyrB39SUSgJgCiYJMArWRBJgcRRkpiTmcdrhTJz8F85YOtYTgn/sgKQW/E1tqSUApnCSAKNgTSQBFkfYM6vmyzu1h28LpZelSB7W9/S9ofSHfrgjQPCPHYtUg7+pMdUEwFRAEmAUrIkkwOIIdya4+/6LfDbc0aJneQkQ/GPlUw/+ptbUEwBTCUmAUbAmkgCLI7yZvqPl1dqrswLq2Svbu+SGgPpDVxwQIPjHDkImwd/UnEkCYCoiCTAK1kQSYHGENVOpyGXaoxmh9EpvY/zV89bKy6H0h37kL0Dwjx2DzIK/qT2zBMBURhJgFKyJJMDiCGOm17yvonoCEEaHhntxTUidoS/5ChD8Y/0zDf6mBZkmAKZCkgCjYE0kARaH/zNHzpVTpCQH+9+T0R7cu2S9bByd4w8EOhAg+MfiZR78TSsyTwBMpSQBRsGaSAIsDr9nojL3/fd7BGl9WgIE/1jZXIK/aUkuCYCpmCTAKFgTSYDF4eeM3vnvt7Tlf+Bn62Nb/exLe0h/7CssRKAFAYJ/LFZuwd+0JrcEwFROEmAUrIkkwOLwb6Zr+Nu/3gAwmOkLF90t24PpDR3JRYDgH8uea/A3Lco1ATANIAkwCtZEEmBx+DPTd7hM17PlL/anxVO2NIoqct2Ua7ECApMIEPxjcXIP/qZVuScAphEkAUbBmkgCLA4/ZiozZYme/f8aP1rbVCu/vWSt/KSpNVkJgRgBgn8MiogTwd+0zIkEwDSEJMAoWBNJgMXhwUwprJP/VPwqD9RpoqMCBP/YgXEm+JvWOZMAmMaQBBgFayIJsDjcnek7Vt6qrTvB3Ra23LInq2+Q21reig0QUAGCf+xu4FTwNy10KgEwDSIJMArWRBJgcbg5UxmSK7RlJTdb10arIrmmZ4VU29iSTQouQPCP3QGcC/6mlc4lAKZRJAFGwZpIAiwOt2b05L85pZKc51arOmrNUFdNvtBRCWxcSAGCf+ywOxn8TUudTABMw0gCjII1kQRYHO7MVGbJuXr2/17utKjDlpRk9Zkb5ekOS2HzggkQ/GMH3Nngb1rrbAJgGkcSYBSsiSTA4nBm5nJnWpJEQ2qc/JcEY5HKIPjHjrbTwd+02IvfLPvnySXaVvMwEqcTFgOa0XTbrBdl8SmPyo6M6qOaCQRWHSPHRTW5b4KXfVz8o0Xr5a36waAHNZgQmFqA4B9r5HzwN632IqByJKBhB+NIQANJPgs0+H8wn5rTqTUqydUE/3RsQyyV4B87ql4Ef9NyLxIA01CSAKNgTSQBFkf2M/rtf3+tdWn2NadW47Zal3wltdIpOCgBgn/scHoT/E3rvUkATGNJAoyCNZEEWBwZz0RykdY4M+Na06zuaz1r5Pk0K6DsMAQI/rHj6FXwNz3wKgEwDSYJMArWRBJgcWQzoz+Ql6JILsumtmxq0ccYX51NTdTiswDBP3b0vAv+phfeJQCm0SQBRsGaSAIsjvRn+ufKyVrLYenXlFkN65bcL+szq42KvBQg+McOm5fB3/TEywTANJwkwChYE0mAxZHyzPBjf1OuJLvi9UZG3Pc/O24vayL4xw6bt8Hf9MbbBMA0niTAKFgTSYDFkc5M/7FygJ4pf2o6pedS6ovTavL1XGqmUi8ECP6xw+R18Dc98joBMB0gCTAK1kQSYHGkMFMTc+OfrhRKzqvIL54+IFvzqpx63RYg+MeOj/fB3/TK+wTAdIIkwChYE0mAxZHczF0LNPDX5OLkSsy/pGpFrs2/FbTARQGCf+yoBBH8Tc+CSABMR0gCjII1kQRYHMnMPP+KnKX3z3x9MqU5UEpJvtOzVh52oCU0wTEBgn/sgAQT/E3vgkkATGdIAoyCNZEEWBwJzJTCuvOf3vCXk/8S2C1CK4LgHzuiQQV/08OgEgDTIZIAo2BNJAEWR/szfXPlUA2YJ7ZfgnNb/vLZSG52rlU0KFcBgn8sf3DB3/QyuATAdIokwChYE0mAxdHeTKUsV+iWegFAIFNJrl02IDsD6Q3dSECA4B+LGGTwNz0NMgEwHSMJMArWRBJgcbQ203eczNItLmhtK6fXrsqQfN7pFtK4TAUI/rHcwQZ/09tgEwDTOZIAo2BNJAEWR/MzXUPyfl17v+a3cHtNvZXxrYs3yk/dbiWty0qA4B8rHXTwNz0OOgEwHSQJMArWRBJgcTQ3o/f9D+qxv3opIyf/NTf0wa9F8I8d4uCDv+l18AmA6SRJgFGwJpIAi2PymdXz5ShdY/7ka3n16uM/2CB3eNViGpuKAME/lrUQwd/0vBAJgOloPQmIZJn+WTPzTEIS0OROUIvkw02u6sVqevj/6l7eB16MVZqNJPjH6hYm+JveFyYBMJ1dPCDX6WVcJAEGY3giCRiRmODf1UfJPvqS+f0/lGlHeVC+FEpn6Ed7AgT/WLdCBX8jUKgEwHSYJMAoWBNJgMVhz1SnyYW6ZA97qddzfYs2ya+97gGN70iA4B/LV7jgbxQKlwCYTpMEGAVrIgmwOHbPlCK5bPec/3+Vypz85/8ott8Dgn+sXSGDv5EoZAJgOk4SYBSsiSTA4hDp767f9e/wcYt9nt206H5Z43MHaHv7AgT/WLvCBn+jUdgEwHSeJMAoWBNJwFiOcmCX/olcObZ7/F0cAYJ/7FgXOvgbkUInAAaAJMAoWBNJgHL0zZfX6gmjZ1kyfs9s3laRG/3uAq1vR4DgH6tW+OBvVAqfABgEkgCjYE2FTwIqNblURaZZKj7PRPKV89bKyz53gba3LkDwjzUj+O9iIQHYBUES0PBGKWwS0LdUKvrIH5MABDNV9Nr/YDpDR5oSIPjHMhH8x7CQAIzBIAkYgzH8ZyGTgMoTcqp2/6AGDX8XfO+sAXnQ3+bT8lYFCP6xYgT/cSwkAONASALGgUgh7xgY1n3/Iy79a9irA15A8I8dXIJ/DAsJQAwKSUADSmGOBPTPl0O09yc3CPi74NmX95RV/jaflrciQPCP1SL4x7JwEuAELJwYGANTjCRg+FbR4STGJfn8RXfL9pjxZFFgAgT/2AEl+MeyDC8M54Nukk62+xJHAhrkgk4C+g6X6Xry3x829NrfBVG1qs+/YApegOAfO8QE/1iW3QtJAHZbxP5FEtDAEmwS0DVbluq1/69p6LGnC/Spf9/q2SCPetp8mt2kAME/ForgH8tiLyQBsD1i50gCGliCTAKiKLA7/5U4+a9hzw1sAcE/dkAJ/rEsjQtJABpNYpeQBDSwBJUErJwv5p7/JzT00t8FP6sdLLf723xaPpUAwT9WiOAfyxK/kAQg3iV2aT0JEPmQvqhHV5lUwCQBX+oN4WTSSD4c2Igu71kh1cD6RHd2CfTqe+7VIl/Vc1YWgjIq8InF6+UTo3P8MaUACcCURPYKuoMt1/B/uS6t2a8Udu59R8yXz/rcez35b46+Ec7xuQ/j2r6zqypfHLeM2YAEjpgnn4pK0hNQlzrtign+vZ0WUrTtSQDaGHF+DhiHpr+d66Nzvb11bnmmnK+HdPYa1yufZ1eduVGe9rkDtH1igVXz5TJ9NbQjVhN3eOpXzGH/3qlXY43xAiQA40WanCcJGAdVks+snCtHjlvqxWypVP9A9aKtzTRS+3NVM+uxjn8C5lwVPVn1//vX8tRazGH/DmhJADrAIwmw8GaWyvL5Xs/OB+ifVz/x72irJ37P/HDhOvl3v7tA6+ME9ChVqRTJlfra7LjXC7iMb/4dDjoJQIeAJAEW4Dw9H+ACa4nrM6Fd+qf3/S9xkqrre11b7dND/xfqhr/X1sbhbcQ3/wTGVD8rmJIQqP8GXtITBD37BpxE38eV8VR1m7yx5yEZHLfcudlbuuVVgyV5Uhs207nGtdegrTN2yAGnPSgvtLc5W7kqcNcC6Xp+s/xEz/o/2NU2ZtguLvVLCJsjAAlBciRgFPIAPanu7NE5h//Q4H+xNi+U4K8HiOVrBH+Hd7gOmvb8K/qeIvgbQYJ/B/vR+E1JAMaLdDBfTwK4T4D+UOn+Gcrm91S9nNPbKxfidtNyVa6OW86yIAQ+EEQvOusEwb8zv4atSQAaSDpbwH0C6n7zVh0jh3Umme7Wq+fLH2gK8KZ0a8m09PsXbpCBTGukskwE+o+Wg7Si4zKpzN1KCP4pjA0JQAqo5kiAnq17hRatXzSLOemlSotd7jn3/Xd5dGjbWIGoXH8vFfl8LU74G7tDJPg3CUCCmGOLWjSgh2MLfMdATYAWjPVw6e+b5suB2p5TXWpTh215cXpN+josg80dFdCf1BY42rQsmmW++fdmUVER6yABSHHUi3wkQA99HN+3VCop8rZdtN4g39zK2cm2tdMptb7+9AHZ2s62bOOFQEgPqWoFnG/+rWi1sS4JQBtorWxS4CMBc6Y9Wf+m3QpX6uuaJ6jpkZmLU68ouwoifSiFufyUKUCBvvnyWu3WfgF2baou8c1/KqEEXicBSABxqiKKeiRgaKd7JwLuX64/Pe11U42ZR6/f2bNefuxRe2lqCwJdNTm0hdVDWZVv/hmNJAlARtBFPBKgtwZ+TUa8TVdTDuzOf/r7MPf9b3r0PVzRwfdQyop8808ZeGzxJABjNVL+u36fgEiWaTXFeJRwTfZMmbSl4ld2y1v09/J3trSR2yv/4pma3OJ2E2ldRwKRzOloe7825lK/jMeLBCBj8EL9HFCW6RnzTl5dWT6oK4R0OdU1ywZk5+Sd5lWfBfSbQpfP7W+h7Rz2bwErqVVJAJKSbKGcwvwcEMnmFlhSXbXvOJmlkf/8VCvJtvAhvY7humyrpLYcBLblUGfWVXLYP2vxXfWRAOQEX4QjARpwX86Jt6Hayk45R8/+37fhBV8XRHLL4rXylK/Np93NCZRr7iTRzbW45bX45t8yWXIbkAAkZ9lyScEfCYjk8ZZR0tsgqHupc/JfejuKSyWXS/JTl9qTcFv45p8waKvFkQC0Kpbw+gEfCYi2dcmjCXO1VdzKuXKkbjivrY3d3OixTevlTjebRquSFBicXn8PhXjSMCf8JbmjtFkWCUCbcEluFuiRgB+ft9aNnwD0csRzkxyv3MuK5KreolxJkjt2vg3oWSPmHIDQ7vPAYf98d6vR2kkARiny/SPASwTvyVfUqt3pBxNZLZ16ZnupIl+cejXWCEVAL1116b3UKSuH/TsVTHB7EoAEMTstKqSfA/Rpe9/q1COJ7fuPlQO0nDcmUZYjZfQtul+ec6QtNCMDAT0P4I4MqsmiCr75Z6HcQh0kAC1gZbFqCD8HmLP/N+/pRgJQqsrvZjFuWdWhP2dcnVVd1OOGwEt7yO3akhfdaE3breCbf9t06W1IApCebdsl+34kQA9Z3njR3bK9bYAEN9S2vCXB4vItKpLv67f/Nfk2gtqzFjDvJT2itiLrehOsj2/+CWImWRQJQJKaCZbl8ZGAalSRf06QorOiSvKmzgpwaGvu++/QYGTelE9qjT5eDcA3/8x3leYrJAFo3irzNX08EqCH//uWrJWfZI41QYWlSF41wUu+Ld7cNUO+5lujaW8yAksG5Ed6NOumZErLrBS++WdG3V5FJADtuWW2lWdHArZFVfmzzHCaqEg/NIN4mIr248tn3hv8XeGaGNHirlKJ5H9q7534aa2JUeCbfxNIea9CApD3CDRRvy9HAvR3yr9avNGxO5eVJIh9XDvByX9NvFdCXmXhgDymR7T+wYM+8s3fg0EyTQziw9ET646a6cGRgHtqh8g/dtTJNDaO5JU0is24zH9ftF5+kHGdVOegwDMif6PNcvlEUL75O7jfTNQkEoCJZBxc7uyRgEh+3lWVc3pWSNVBtpccbFNrTYpkeWsbsHaoAubxz9XhO1u6eC8Ivvl7tuORAHg2YK4dCTDX/Oud6U49c6M87SjlY462q9lmPTPrJelvdmXWC1+g5355QkryXu2pS0e3+Obv4a5HAuDhoJkjAfpo22Xa9Ly/cb+oJ6edotemf99VRn1q3o9cbVsz7dL2f/6UR2VHM+uyTnEEFq+TdZp8L9Ieb8m515EmI3++eL18Iud2UH0bAiQAbaC5sEk9CRB5v7Ylr+DwlH4AvUPf+Pe64DFhGyJZN+Fr7r9QGyrJNe43kxbmIaDnhdyhdwZ4l9ad188BQ/oF4FJNRv4uj/5TZ+cCJACdG+ZWggbflXok4HhtQKaHuTXw36n1HuPDiWkL18tD6vOr3Aaps4q/WT/c21kZbB2wwOIN8h+VkhytXbwv424+qVcknLhkvXwh43qpLkEBEoAEMfMoSo8EbCgPyTwNyuaNqAl5qtNL+qb/E30W/cla7y9SrSmhwtXFmHw7oeIyLSYqcfJfpuCeVnbWOnny2UgWaPN79b+07xOgu6V8oTpNjtLzkb6n9TF5LKBjyRSKwIr5cnx5+DrhExLu0w6NotdPq0jvmWv9+za9ap68W9v/rwmbpF3cz6pvkEMcvbIi7b5TfpsCfXPl0EpF/o+mveYR2El/wbtHf3L4U3PUoc3msZljAiQAjg1IEs1ZNV/eGdXkT/TknFO0vBkdlPmUnoT21cqQfMbhs/yn7F6vfhAe0a0/k5Tk4ClXdmWFSD6mR1nceaaCKy60oymBVXPlt6OyfERX7tH/9mlqo/iVtulRv1v0a/+nnT/fJ779LJ1EgARgEhzfX7r1CNl3cKacpd8GTtJvwO/U/hw4RZ+26esbdae4S4PlHZvWyXd7/XwASUM39SjAFWrw2YYX3FzwbHWbvKHnIacu83JTilZNKnD9Apm511a9ZLCqJwuW5ERd+TD9rzLpRiKP6+v3aPJ/59ay3HLeWnl5ivV52VMBEgBPB66dZvd1y97lirypVJPXaTDcQzP7PTSzf0H/fb5akf98+H75aW8gAX+8T/2D8JX6Q4oOGP+ac/N8+3duSEJp0O2Hyozt++hngH4Z0Ft3z9GjBPvqv5vLNdmsnwVPV7fLIySeoYz21P0gAZjaiDUCEVjVLUv1Q67P8e78bMugvPWCTblf3+04E81DAIFOBZI+SaTT9rA9AqkJ6FnLK7Tw21OroPOCa/rA9wsJ/p1DUgICCEwtQAIwtRFrBCRQGpSLtDtO3rZYD8f989L1cndA3HQFAQQcFiABcHhwaFryAos2ya/1a3aed1CcqFP3znxRPj7RiyxHAAEEkhYgAUhalPKcF1i6Qb6r37bfpw2tOtHYSL4/Y4eczj3/nRgNGoFAYQRIAAoz1HR0rIDexvhmvczpQl22c+zyrP/WKzAeni7yrtMelBeyrpv6EECg2AIkAMUe/0L3ftE6uUGTgDP0aEAu1znrpZjfHJou7zh9QJ4t9EDQeQQQyEWABCAXdip1RUCTgG9FVTlS27M2wzaZR6j+/Q/Wy2k9a+T5DOulKgQQQGBUQL/8MCGAgLlByra95X9oYP5z1ZiVosgGPQnxI+Y8hBTroGgEEEBgSgESgCmJWKFIAqvfLgdHQ/JxPTx/vvZ7WmJ9j+TnpbJ8XG+v/KXeQO+2mJgVBSGAQCYCJACZMFOJbwL9R8tB0iWX6tPPztOjAge32f6q3nnwm/qExuv2nSO3nXi3DLVZDpshgAACiQuQACROSoEhCeiRgFL/MdKtz084SYP5O/Ss/bdq/w7S/+IeqLJFlz+s69xXK8t908pyj4+PTw5p/OgLAghMLEACMLENryAQK2DOF3jl1fKq8qDsqU9anF6bIc9t0f8uulu2x27AQgQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAA6eprwAAAPiSURBVAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQAABBBBAAAEEEEAAAQQQQACBIgj8F1Cx5oYEZQsEAAAAAElFTkSuQmCC" preserveAspectRatio="none" id="img6"></image><clipPath id="clip7"><rect x="0" y="0" width="371301" height="371301"/></clipPath></defs><g transform="translate(-756 -884)"><g><g clip-path="url(#clip1)" filter="url(#fx0)" transform="translate(757 888)"><g><g><path d="M482.943 195.5C482.943 283.043 379.597 370.586 276.25 370.586" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M195.25 316.5C262.575 316.5 329.901 417.048 329.901 517.595" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M30.2499 316C30.2499 270.437 67.1864 233.5 112.75 233.5 158.313 233.5 195.25 270.437 195.25 316 195.25 361.564 158.313 398.5 112.75 398.5 67.1864 398.5 30.2499 361.564 30.2499 316Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M400.25 113C400.25 67.4365 437.187 30.5 482.75 30.5 528.314 30.5 565.25 67.4365 565.25 113 565.25 158.563 528.314 195.5 482.75 195.5 437.187 195.5 400.25 158.563 400.25 113Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M237.25 539.334C237.25 527.275 247.025 517.5 259.084 517.5L400.417 517.5C412.475 517.5 422.25 527.275 422.25 539.334L422.25 626.666C422.25 638.725 412.475 648.5 400.417 648.5L259.084 648.5C247.025 648.5 237.25 638.725 237.25 626.666Z" stroke="#A5300F" stroke-width="21" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><g clip-path="url(#clip2)" transform="matrix(0.000360892 0 0 0.000360892 261.75 517)"><g clip-path="url(#clip4)" transform="matrix(1 0 0 1 0.0663341 0.216198)"><use width="100%" height="100%" xlink:href="#img3" opacity="1" transform="scale(725.197 725.197)"></use></g></g></g></g></g><path d="M1226.19 1083.5C1226.19 1171.04 1122.85 1258.59 1019.5 1258.59" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M938.5 1204.5C1005.83 1204.5 1073.15 1305.05 1073.15 1405.6" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="none" fill-rule="evenodd"/><path d="M773.5 1204C773.5 1158.44 810.436 1121.5 856 1121.5 901.563 1121.5 938.5 1158.44 938.5 1204 938.5 1249.56 901.563 1286.5 856 1286.5 810.436 1286.5 773.5 1249.56 773.5 1204Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M1143.5 1001C1143.5 955.437 1180.44 918.5 1226 918.5 1271.56 918.5 1308.5 955.437 1308.5 1001 1308.5 1046.56 1271.56 1083.5 1226 1083.5 1180.44 1083.5 1143.5 1046.56 1143.5 1001Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><path d="M980.5 1427.33C980.5 1415.28 990.275 1405.5 1002.33 1405.5L1143.67 1405.5C1155.72 1405.5 1165.5 1415.28 1165.5 1427.33L1165.5 1514.67C1165.5 1526.72 1155.72 1536.5 1143.67 1536.5L1002.33 1536.5C990.275 1536.5 980.5 1526.72 980.5 1514.67Z" stroke="#A5300F" stroke-width="20.625" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="8" stroke-opacity="1" fill="#FADDCD" fill-rule="evenodd" fill-opacity="1"/><g clip-path="url(#clip5)" transform="matrix(0.000360892 0 0 0.000360892 1005 1405)"><g clip-path="url(#clip7)" transform="matrix(1 0 0 1 0.0684703 0.216198)"><use width="100%" height="100%" xlink:href="#img6" opacity="1" transform="scale(725.197 725.197)"></use></g></g></g></g></svg>
diff --git a/benchmark/bench_attention_sink/bench_attention_sink_triton.py b/benchmark/bench_attention_sink/bench_attention_sink_triton.py
new file mode 100644
index 000000000..21bc2a593
--- /dev/null
+++ b/benchmark/bench_attention_sink/bench_attention_sink_triton.py
@@ -0,0 +1,250 @@
+import argparse
+
+import torch
+import triton
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import (
+    decode_attention_fwd_grouped,
+)
+from sglang.srt.layers.attention.triton_ops.extend_attention import extend_attention_fwd
+
+# gpt oss
+head_num = 64
+head_dim = 64
+head_kv_num = 8
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["S"],  # sequence length on x-axis
+        x_vals=[128, 256, 512, 1024, 2048, 4096],
+        x_log=True,
+        line_arg="B",  # batch size as different lines
+        line_vals=[1, 8, 32, 128],
+        line_names=["B=1", "B=8", "B=32", "B=128"],
+        styles=[
+            ("blue", "-"),
+            ("green", "-"),
+            ("red", "-"),
+            ("cyan", "-"),
+        ],
+        ylabel="TFLOPS",
+        plot_name="attention-sink-triton-decode",
+        args={},
+    )
+)
+def benchmark_decode(B, S, H_Q, H_KV, D):
+    D_V = D
+    dtype = torch.bfloat16
+    seq_len = S
+    total_tokens = B * seq_len
+    device = torch.device("cuda")
+    sm_scale = 1.0 / (D**0.5)
+    max_kv_splits = 8
+    num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
+
+    # q represents the new token being generated, one per batch
+    q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
+
+    # k_buffer and v_buffer represent all previous tokens
+    k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+    v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+
+    o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+
+    b_seq_len = torch.full((B,), seq_len, device="cuda")
+
+    kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+    kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len, dim=0)
+    kv_indices = torch.arange(total_tokens, device="cuda")
+
+    attn_logits1 = torch.empty(
+        (B, H_Q, max_kv_splits, D_V),
+        dtype=torch.float32,
+        device="cuda",
+    )
+    attn_lse1 = torch.empty(
+        (B, H_Q, max_kv_splits, D_V),
+        dtype=torch.float32,
+        device="cuda",
+    )
+    sink = torch.randn(H_Q, device=device, dtype=torch.float32)
+
+    # warmup
+    for _ in range(5):
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits1,
+            attn_lse1,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap=0.0,
+            sinks=sink,
+        )
+
+    # benchmark
+    run_step = 500
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(run_step):
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits1,
+            attn_lse1,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap=0.0,
+            sinks=sink,
+        )
+    end_event.record()
+    end_event.synchronize()
+    torch.cuda.synchronize()
+    ms = start_event.elapsed_time(end_event) / run_step
+    tflops = lambda ms: (2 * B * S * H_Q * D) * 1e-9 / ms  # must be causal
+    return tflops(ms)
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["S"],  # sequence length on x-axis
+        x_vals=[128, 256, 512, 1024, 2048, 4096],
+        x_log=True,
+        line_arg="B",  # batch size as different lines
+        line_vals=[1, 8, 32, 128],
+        line_names=["B=1", "B=8", "B=32", "B=128"],
+        styles=[
+            ("blue", "-"),
+            ("green", "-"),
+            ("red", "-"),
+            ("cyan", "-"),
+        ],
+        ylabel="TFLOPS",
+        plot_name="attention-sink-triton-extend",
+        args={},
+    )
+)
+def benchmark_extend(B, S, H_Q, H_KV, D):
+    # S here represents N_CTX from the test
+    dtype = torch.bfloat16
+    device = "cuda"
+
+    # Split S into prefix and extend lengths
+    prefill_len = S // 2  # Similar to test's N_CTX // 2
+    extend_len = S // 4  # Make extend length smaller than prefix
+
+    # Calculate total tokens and extend tokens
+    total_extend_tokens = B * extend_len
+    total_prefix_tokens = B * prefill_len
+
+    # Create query, key, value tensors for extension
+    q_extend = torch.randn(total_extend_tokens, H_Q, D, dtype=dtype, device=device)
+    k_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
+    v_extend = torch.randn(total_extend_tokens, H_KV, D, dtype=dtype, device=device)
+    o_extend = torch.empty_like(q_extend)
+
+    # Create key-value buffers for prefix
+    k_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)
+    v_buffer = torch.randn(total_prefix_tokens, H_KV, D, dtype=dtype, device=device)
+
+    # Create index pointers
+    qo_indptr = torch.arange(0, (B + 1) * extend_len, extend_len, device=device).to(
+        torch.int32
+    )
+    kv_indptr = torch.arange(0, (B + 1) * prefill_len, prefill_len, device=device).to(
+        torch.int32
+    )
+    kv_indices = torch.arange(0, total_prefix_tokens, device=device).to(torch.int32)
+
+    sm_scale = 1.0 / (D**0.5)
+    # sliding_window = 128  # From GPT-OSS config, skip for now
+    sliding_window = -1
+
+    sink = torch.randn(H_Q, device=device, dtype=torch.float32)
+
+    # warmup
+    for _ in range(5):
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask=None,
+            is_causal=True,
+            mask_indptr=None,
+            max_len_extend=extend_len,
+            sm_scale=sm_scale,
+            sliding_window_size=sliding_window,
+            sinks=sink,
+        )
+
+    # benchmark
+    run_step = 500
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(run_step):
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask=None,
+            is_causal=True,
+            mask_indptr=None,
+            max_len_extend=extend_len,
+            sm_scale=sm_scale,
+            sliding_window_size=sliding_window,
+            sinks=sink,
+        )
+    end_event.record()
+    end_event.synchronize()
+    torch.cuda.synchronize()
+    ms = start_event.elapsed_time(end_event) / run_step
+
+    # FLOPS calculation: each attention operation requires 2 multiplications per element
+    total_flops = 2 * total_extend_tokens * H_Q * (prefill_len + extend_len / 2) * D
+    tflops = lambda ms: total_flops * 1e-12 / (ms * 1e-3)  # convert to TFLOPS
+    return tflops(ms)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bench", type=str, default="all", help="all, extend, decode")
+    args = parser.parse_args()
+
+    kwargs = {
+        "H_Q": head_num,
+        "H_KV": head_kv_num,
+        "D": head_dim,
+    }
+
+    if args.bench in ["all", "decode"]:
+        benchmark_decode.run(print_data=True, show_plots=False, **kwargs)
+
+    if args.bench in ["all", "extend"]:
+        benchmark_extend.run(print_data=True, show_plots=False, **kwargs)
+
+    print("Benchmark finished!")
diff --git a/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py b/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py
new file mode 100644
index 000000000..282097112
--- /dev/null
+++ b/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py
@@ -0,0 +1,130 @@
+# Benchmark with lots of common prefixes. Used to benchmark prefix caching performance.
+#
+# Launch a server:
+# python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --log-level-http warning
+
+import random
+import string
+import time
+
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+import sglang as sgl
+from sglang import set_default_backend
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+
+
+def generate_random_string(token_length: int) -> str:
+    random_string = "".join(
+        random.choices(string.ascii_letters + string.digits, k=token_length * 100)
+    )
+    tokenized_output = tokenizer.encode(random_string, add_special_tokens=False)[
+        :token_length
+    ]
+
+    if len(tokenized_output) < token_length:
+        tokenized_output = tokenized_output + [tokenizer.pad_token_id] * (
+            token_length - len(tokenized_output)
+        )
+
+    decoded_string = tokenizer.decode(tokenized_output, skip_special_tokens=False)
+    return decoded_string
+
+
+def generate_unique_prefix(base_text, index):
+    return str(index) + base_text[len(str(index)) :]
+
+
+@sgl.function
+def text_qa(s, question, gen_len):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0, max_tokens=gen_len)
+
+
+def prepare_prompts(num_prefix, num_samples_per_prefix, prefix_length, suffix_length):
+    base_prefix = generate_random_string(prefix_length)
+
+    tot_input_len = 0
+    all_prompts = []
+    for i in tqdm(range(num_prefix), desc="prepare prompts"):
+        unique_prefix = generate_unique_prefix(base_prefix, i)
+        prompt_list = []
+        for j in range(num_samples_per_prefix):
+            suffix = generate_random_string(suffix_length)
+            prompt = unique_prefix + suffix
+            prompt_list.append(prompt)
+            tot_input_len += len(tokenizer.encode(prompt))
+        all_prompts.append(prompt_list)
+    return all_prompts, tot_input_len
+
+
+def test_batch_by_batch(all_prompts, gen_len):
+    backend.flush_cache()
+
+    tot_time = 0
+    for i in range(len(all_prompts)):
+        tic = time.perf_counter()
+        text_qa.run_batch(
+            list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
+        )
+        tot_time += time.perf_counter() - tic
+
+    return tot_time
+
+
+def test_batch_by_batch_with_hint(all_prompts, gen_len):
+    backend.flush_cache()
+
+    tot_time = 0
+    for i in range(len(all_prompts)):
+        tic = time.perf_counter()
+        # Send a hint to cache the prefix
+        text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
+        # Send the batch
+        text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
+
+        tot_time += time.perf_counter() - tic
+
+    return tot_time
+
+
+def test_send_all(all_prompts, gen_len):
+    backend.flush_cache()
+
+    all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
+
+    tic = time.perf_counter()
+    text_qa.run_batch(
+        list(zip(all_prompts, [gen_len] * len(all_prompts))),
+    )
+    tot_time = time.perf_counter() - tic
+
+    return tot_time
+
+
+if __name__ == "__main__":
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+    backend = RuntimeEndpoint("http://127.0.0.1:30000")
+    set_default_backend(backend)
+
+    random.seed(0)
+    num_prefix = 10
+    num_samples_per_prefix = 32
+    prefix_length = 1024
+    suffix_length = 128
+    gen_len = 1
+    all_prompts, tot_input_len = prepare_prompts(
+        num_prefix, num_samples_per_prefix, prefix_length, suffix_length
+    )
+
+    print(f"Total input token length: {tot_input_len}\n")
+
+    cost = test_batch_by_batch(all_prompts, gen_len)
+    print(f"Latency of test_batch_by_batch          : {cost:.4f} s\n")
+
+    cost = test_batch_by_batch_with_hint(all_prompts, gen_len)
+    print(f"Latency of test_batch_by_batch_with_hint: {cost:.4f} s\n")
+
+    cost = test_send_all(all_prompts, gen_len)
+    print(f"Latency of test_send_all                : {cost:.4f} s\n")
diff --git a/benchmark/benchmark_batch/benchmark_batch.py b/benchmark/benchmark_batch/benchmark_batch.py
new file mode 100644
index 000000000..a8592d48a
--- /dev/null
+++ b/benchmark/benchmark_batch/benchmark_batch.py
@@ -0,0 +1,193 @@
+import concurrent.futures
+import os
+import random
+import time
+from concurrent.futures import ProcessPoolExecutor
+from statistics import mean
+
+import requests
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+
+###############################################################################
+# CONFIG
+###############################################################################
+ENDPOINT_URL = "http://127.0.0.1:30000"
+TOKENIZER_DIR = "/models/meta-llama/Llama-3.2-3B"
+
+# Benchmark configurations
+NUM_REQUESTS = 10  # Total number of requests (each with BATCH_SIZE prompts)
+NUM_TOKENS = 32000  # Tokens per prompt
+BATCH_SIZE = 8  # Number of prompts per request
+GEN_TOKENS = 0  # Tokens to generate per prompt
+
+
+###############################################################################
+# REQUEST GENERATION (in parallel)
+###############################################################################
+def generate_random_prompt(index, tokenizer_dir, num_tokens):
+    """Generate a single random prompt with specified token count."""
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
+    vocab_size = tokenizer.vocab_size
+
+    def generate_random_text(num_toks):
+        random_token_ids = [random.randint(0, vocab_size - 1) for _ in range(num_toks)]
+        return tokenizer.decode(random_token_ids, clean_up_tokenization_spaces=True)
+
+    random_text = generate_random_text(num_tokens)
+    return f"Prompt {index}: {random_text}"
+
+
+def prepare_all_prompts(num_requests, batch_size, num_tokens, tokenizer_dir):
+    """Generate prompts for all requests in parallel."""
+    total_prompts = num_requests * batch_size
+    all_prompts = [None] * total_prompts
+    max_workers = min(os.cpu_count() or 1, total_prompts)
+
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(generate_random_prompt, i, tokenizer_dir, num_tokens)
+            for i in range(total_prompts)
+        ]
+        for future in tqdm(
+            concurrent.futures.as_completed(futures),
+            total=total_prompts,
+            desc="Generating prompts",
+        ):
+            index = futures.index(future)
+            all_prompts[index] = future.result()
+
+    batched_prompts = [
+        all_prompts[i * batch_size : (i + 1) * batch_size] for i in range(num_requests)
+    ]
+
+    print(
+        f"Generated {total_prompts} prompts with {num_tokens} tokens each, grouped into {num_requests} requests of {batch_size} prompts.\n"
+    )
+    return batched_prompts
+
+
+###############################################################################
+# HTTP CALLS
+###############################################################################
+def send_batch_request(endpoint, prompts, gen_tokens, request_id):
+    """Send a batch of prompts to the /generate endpoint synchronously."""
+    sampling_params = {
+        "max_new_tokens": gen_tokens,
+        "temperature": 0.7,
+        "stop": "\n",
+    }
+    data = {"text": prompts, "sampling_params": sampling_params}
+
+    start_time = time.perf_counter()
+    try:
+        response = requests.post(
+            endpoint.base_url + "/generate", json=data, timeout=3600
+        )
+        if response.status_code != 200:
+            error = response.json()
+            raise RuntimeError(f"Request {request_id} failed: {error}")
+        result = response.json()
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
+        avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
+        return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
+    except Exception as e:
+        print(f"[Request] Error for request {request_id}: {e}")
+        return request_id, 0, 0, False, len(prompts)
+
+
+def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
+    """Run the benchmark sequentially."""
+    results = []
+    num_requests = len(batched_prompts)
+
+    # Record start time for total latency
+    benchmark_start_time = time.perf_counter()
+
+    for i, batch_prompts in enumerate(batched_prompts):
+        request_id = i + 1
+        assert (
+            len(batch_prompts) == batch_size
+        ), f"Request {request_id} should have {batch_size} prompts, got {len(batch_prompts)}"
+
+        print(
+            f"[Request] Sending request {request_id}/{num_requests} with {len(batch_prompts)} prompts at {int(time.time()*1000)}"
+        )
+        result = send_batch_request(endpoint, batch_prompts, gen_tokens, request_id)
+        results.append(result)
+
+    # Calculate total latency
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms
+
+    return results, total_latency
+
+
+###############################################################################
+# RESULTS
+###############################################################################
+def process_results(results, total_latency, num_requests):
+    """Process and display benchmark results."""
+    total_time = 0
+    successful_requests = 0
+    failed_requests = 0
+    request_latencies = []
+    per_prompt_latencies = []
+    total_prompts = 0
+
+    for request_id, elapsed_time, avg_per_prompt, success, batch_size in results:
+        if success:
+            successful_requests += 1
+            total_prompts += batch_size
+            request_latencies.append(elapsed_time)
+            per_prompt_latencies.append(avg_per_prompt)
+            total_time += elapsed_time / 1000  # Convert to seconds
+        else:
+            failed_requests += 1
+
+    avg_request_latency = mean(request_latencies) if request_latencies else 0
+    avg_per_prompt_latency = mean(per_prompt_latencies) if per_prompt_latencies else 0
+    throughput = total_prompts / total_time if total_time > 0 else 0
+
+    print("\nBenchmark Summary:")
+    print(f"  Total requests sent:         {len(results)}")
+    print(f"  Total prompts sent:          {total_prompts}")
+    print(f"  Successful requests:         {successful_requests}")
+    print(f"  Failed requests:             {failed_requests}")
+    print(f"  Total latency (all requests): {total_latency:.2f} ms")
+    print(f"  Avg per request latency:     {avg_request_latency:.2f} ms")
+    print(f"  Avg per prompt latency:      {avg_per_prompt_latency:.2f} ms")
+    print(f"  Throughput:                  {throughput:.2f} prompts/second\n")
+
+
+###############################################################################
+# MAIN
+###############################################################################
+def main():
+    # Initialize endpoint
+    endpoint = RuntimeEndpoint(ENDPOINT_URL)
+
+    # Generate prompts
+    batched_prompts = prepare_all_prompts(
+        NUM_REQUESTS, BATCH_SIZE, NUM_TOKENS, TOKENIZER_DIR
+    )
+
+    # Flush cache before benchmark
+    # endpoint.flush_cache()
+
+    # Run benchmark
+    print(
+        f"Starting benchmark: NUM_TOKENS={NUM_TOKENS}, BATCH_SIZE={BATCH_SIZE}, NUM_REQUESTS={NUM_REQUESTS}\n"
+    )
+    results, total_latency = run_benchmark(
+        endpoint, batched_prompts, BATCH_SIZE, GEN_TOKENS
+    )
+
+    # Process and display results
+    process_results(results, total_latency, NUM_REQUESTS)
+
+
+if __name__ == "__main__":
+    random.seed(0)
+    main()
diff --git a/benchmark/benchmark_batch/benchmark_tokenizer.py b/benchmark/benchmark_batch/benchmark_tokenizer.py
new file mode 100644
index 000000000..88a5820b6
--- /dev/null
+++ b/benchmark/benchmark_batch/benchmark_tokenizer.py
@@ -0,0 +1,126 @@
+import random
+import time
+from statistics import mean
+
+from transformers import AutoTokenizer
+
+# CONFIG
+TOKENIZER_DIR = (
+    "/shared/public/sharing/fait360brew/training/models/meta-llama/Llama-3.2-3B"
+)
+NUM_TOKENS = 20000  # Each prompt should contain this many tokens
+BATCH_SIZES = [1, 2, 4, 8]  # Test different batch sizes
+NUM_RUNS = 5  # Number of runs for each batch size to get reliable measurements
+
+
+def generate_random_prompts(num_prompts, num_tokens, tokenizer):
+    """Generate random prompts with specified token count."""
+    vocab_size = tokenizer.vocab_size
+    all_prompts = []
+
+    print(f"Generating {num_prompts} random prompts with {num_tokens} tokens each...")
+    for i in range(num_prompts):
+        # Generate random token IDs - this directly gives us the exact token count
+        random_token_ids = [
+            random.randint(0, vocab_size - 1) for _ in range(num_tokens)
+        ]
+        random_text = tokenizer.decode(
+            random_token_ids, clean_up_tokenization_spaces=True
+        )
+
+        prompt = f"Prompt {i}: {random_text}"
+        tokens = tokenizer.encode(prompt)
+        print(f"  Prompt {i}: {len(tokens)} tokens")
+        all_prompts.append(prompt)
+
+    return all_prompts
+
+
+def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
+    """Compare sequential vs batch tokenization for a given batch size."""
+
+    # Sequential tokenization using encode()
+    sequential_times = []
+    for run in range(NUM_RUNS):
+        batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
+
+        start_time = time.perf_counter()
+        for prompt in batch_prompts:
+            tokens = tokenizer.encode(prompt)
+        sequential_time = (time.perf_counter() - start_time) * 1000
+        sequential_times.append(sequential_time)
+
+    # Batch tokenization using tokenizer()
+    batch_times = []
+    for run in range(NUM_RUNS):
+        batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
+
+        start_time = time.perf_counter()
+        tokens = tokenizer(batch_prompts)
+        batch_time = (time.perf_counter() - start_time) * 1000
+        batch_times.append(batch_time)
+
+    return {
+        "batch_size": batch_size,
+        "avg_sequential_ms": mean(sequential_times),
+        "avg_batch_ms": mean(batch_times),
+        "speedup_factor": (
+            mean(sequential_times) / mean(batch_times) if mean(batch_times) > 0 else 0
+        ),
+        "sequential_runs": sequential_times,
+        "batch_runs": batch_times,
+    }
+
+
+def main():
+    print("Tokenizer Benchmark: Sequential vs Batch Processing")
+    print("-" * 60)
+    print(f"Tokenizer: {TOKENIZER_DIR}")
+    print(f"Tokens per prompt: {NUM_TOKENS}")
+    print(f"Number of runs per batch size: {NUM_RUNS}")
+    print("-" * 60)
+
+    # Load tokenizer once for all operations
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
+
+    # The largest batch size determines how many prompts we need
+    max_batch_size = max(BATCH_SIZES)
+    all_prompts = generate_random_prompts(max_batch_size, NUM_TOKENS, tokenizer)
+
+    results = []
+    print("\nRunning benchmark...")
+
+    for batch_size in BATCH_SIZES:
+        print(f"\nBenchmarking batch size: {batch_size}")
+        result = benchmark_sequential_vs_batch(all_prompts, batch_size, tokenizer)
+        results.append(result)
+
+        print(f"  Sequential tokenization (encode):")
+        for i, run_time in enumerate(result["sequential_runs"]):
+            print(f"    Run {i+1}: {run_time:.2f} ms")
+        print(f"    Average: {result['avg_sequential_ms']:.2f} ms")
+
+        print(f"  Batch tokenization (tokenizer):")
+        for i, run_time in enumerate(result["batch_runs"]):
+            print(f"    Run {i+1}: {run_time:.2f} ms")
+        print(f"    Average: {result['avg_batch_ms']:.2f} ms")
+
+        print(f"  Speedup factor: {result['speedup_factor']:.2f}x")
+
+    print("\n" + "=" * 60)
+    print("SUMMARY OF RESULTS")
+    print("=" * 60)
+    print(
+        f"{'Batch Size':<10} {'Sequential (ms)':<18} {'Batch (ms)':<18} {'Speedup':<10}"
+    )
+    print("-" * 60)
+
+    for result in results:
+        print(
+            f"{result['batch_size']:<10} {result['avg_sequential_ms']:.2f} ms{' ' * 8} {result['avg_batch_ms']:.2f} ms{' ' * 8} {result['speedup_factor']:.2f}x"
+        )
+
+
+if __name__ == "__main__":
+    random.seed(0)
+    main()
diff --git a/benchmark/benchmark_vllm_060/README.md b/benchmark/benchmark_vllm_060/README.md
new file mode 100644
index 000000000..b480dabf2
--- /dev/null
+++ b/benchmark/benchmark_vllm_060/README.md
@@ -0,0 +1,89 @@
+## How to reproduce the benchmark results for SGLang v0.3.0 compared to vLLM v0.6.0
+
+In short, with multi step enabled, in online scenarios that we benchmarked, the Median TTFT of vLLM is **3 times** that of SGLang, and the Median ITL is **10 times** that of SGLang. Lower Median TTFT and ITL are better. vLLM's multi-step optimization did not improve throughput while ensuring lower Median TTFT and ITL. Also, under maximum throughput benchmark, if vLLM does not set gpu util to 0.95 separately and uses the default configuration instead, its maximum throughput is **lower** than that of SGLang.
+
+## Online benchmark results
+
+### Llama 3.1 8B Instruct 1 x A100 80G
+
+| RPS  | Num prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
+|------|-------------|--------|--------------------|-------------|-------------|------------|
+| 4    | 1200        | SGLang | 1564.17            | **31.98**   | 13.17       | **11.93**  |
+| 4    | 1200        | vLLM   | 1691.97            | **100.48**  | 14.14       | **129.32** |
+| 8    | 2400        | SGLang | 2175.02            | **35.68**   | 17.85       | **14.41**  |
+| 8    | 2400        | vLLM   | 2137.16            | **120.39**  | 17.09       | **158.63** |
+
+### Llama 3.1 70B Insruct 4 x H100 80G
+
+| RPS  | Num Prompts | Engine | Median E2E Latency | Median TTFT | Median TPOT | Median ITL |
+|------|-------------|--------|--------------------|-------------|-------------|------------|
+| 4    | 1200        | SGLang | 3005.24            | **53.94**   | 25.03       | **21.67**  |
+| 4    | 1200        | vLLM   | 2915.60            | **179.15**  | 23.58       | **231.23** |
+| 8    | 2400        | SGLang | 4064.98            | **58.11**   | 33.07       | **24.45**  |
+| 8    | 2400        | vLLM   | 3752.38            | **207.12**  | 29.15       | **275.32** |
+
+## Offline benchmark results
+
+### Llama 3.1 8B Instruct 1 x A100 80G
+
+| RPS  | Num Prompts | Engine | Request throughput | Output token throughput |
+|------|-------------|--------|--------------------|-------------------------|
+| inf  | 5000        | SGLang | 22.03              | **4281.51**             |
+| inf  | 5000        | vLLM   | 21.27              | **4132.37**             |
+
+### Llama 3.1 70B Insruct 4 x H100 80G
+
+| RPS  | Num Prompts | Engine | Request throughput | Output token throughput |
+|------|-------------|--------|--------------------|-------------------------|
+| inf  | 5000        | SGLang | 19.84              | **3856.01**             |
+| inf  | 5000        | vLLM   | 19.04              | **3700.64**             |
+
+## Installation
+
+```bash
+# install sglang v0.3.0
+pip install --upgrade pip
+pip install "sglang[all]"==0.3.0
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
+
+# install vllm v0.6.0
+pip install vllm==0.6.0
+```
+
+## Notes
+
+We referred to the reproduction method in https://github.com/vllm-project/vllm/issues/8176, and added the `--num-scheduler-steps 10` parameter when starting the vLLM server. The `gpu_memory_utilization` of vLLM is by default 0.9 at both TP 1 and TP 4, while SGLang's `mem_frac` is 0.88 at TP 1 and 0.85 at TP 4, so we manually set it to 0.88 at TP 4.
+
+## Online benchmarks
+
+```bash
+# Llama 3.1 8B Instruct on 1 x A100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+
+# Llama 3.1 70B Instruct on 4 x H100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+
+# bench serving
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 1200 --request-rate 4
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 2400 --request-rate 8
+python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 1200 --request-rate 4
+python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 2400 --request-rate 8
+```
+
+## Offline benchmarks
+
+```bash
+# Llama 3.1 8B Instruct on 1 x A100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests --num-scheduler-steps 10 --max_model_len 4096
+
+# Llama 3.1 70B Instruct on 4 x H100
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 4 --mem-frac 0.88
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-70B-Instruct --disable-log-requests --num-scheduler-steps 10 --tensor 4 --max_model_len 4096
+
+# bench serving
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 5000
+python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --num-prompts 5000
+```
diff --git a/benchmark/blog_v0_2/405b_sglang.sh b/benchmark/blog_v0_2/405b_sglang.sh
new file mode 100644
index 000000000..491853782
--- /dev/null
+++ b/benchmark/blog_v0_2/405b_sglang.sh
@@ -0,0 +1,24 @@
+# Create dummy weights:
+# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
+# 2. Get `config.json`` from ./config.md
+# 3. Download the tokenizer
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
+
+# Launch sglang
+# python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quantization fp8 --disable-radix --mem-frac 0.87
+
+# offline
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
+
+# online
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
diff --git a/benchmark/blog_v0_2/405b_trt.sh b/benchmark/blog_v0_2/405b_trt.sh
new file mode 100644
index 000000000..1950bc92b
--- /dev/null
+++ b/benchmark/blog_v0_2/405b_trt.sh
@@ -0,0 +1,17 @@
+# Launch trtllm
+# https://github.com/sgl-project/tensorrt-demo
+
+# offline
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log11
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log12
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log13
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log14
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log15
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 2000 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log21
+
+# online
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log31
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log32
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log33
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log34
+python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log35
diff --git a/benchmark/blog_v0_2/405b_vllm.sh b/benchmark/blog_v0_2/405b_vllm.sh
new file mode 100644
index 000000000..d1ef048e8
--- /dev/null
+++ b/benchmark/blog_v0_2/405b_vllm.sh
@@ -0,0 +1,24 @@
+# Create dummy weights:
+# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
+# 2. Get `config.json`` from ./config.md
+# 3. Download the tokenizer
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
+#   wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
+
+# Launch vllm
+# python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
+
+# offline
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > vllm_log11
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > vllm_log12
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > vllm_log13
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > vllm_log14
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > vllm_log15
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 2000 > vllm_log21
+
+# online
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > vllm_log31
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
+python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35
diff --git a/benchmark/blog_v0_2/README.md b/benchmark/blog_v0_2/README.md
new file mode 100644
index 000000000..7448554ee
--- /dev/null
+++ b/benchmark/blog_v0_2/README.md
@@ -0,0 +1,164 @@
+# How to reproduce the benchmark results of SGLang
+
+## Prerequisite
+
+### Install the latest SGLang
+
+```bash
+git clone https://github.com/sgl-project/sglang.git
+cd sglang
+git checkout v0.2.7
+
+pip install --upgrade pip
+pip install -e "python[all]"
+
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+```
+
+### Set up ulimit and HF_TOKEN
+
+```bash
+ulimit -n 65535
+# Change the token to a real and usable one, with access permissions for the Llama 3 models.
+export HF_TOKEN=hf_token
+```
+
+### Launch the server
+
+```bash
+# Meta-Llama-3.1-8B-Instruct
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile --disable-radix-cache
+
+# Meta-Llama-3.1-70B-Instruct
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-70B-Instruct --disable-radix-cache --tp 8
+
+# Meta-Llama-3-70B-Instruct-FP8
+python -m sglang.launch_server --model-path neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-radix-cache --tp 8
+```
+
+## Benchmark
+
+### Hardware Requirements
+
+- 8B models: Single NVIDIA A100 80GB GPU
+- 70B models: 8 x NVIDIA A100 80GB GPUs with Tensor Parallelism (TP) 8
+- 70B FP8 models: 8 x NVIDIA H100 GPUs with Tensor Parallelism (TP) 8
+
+Please ensure you have the appropriate hardware before running the benchmarks.
+
+#### Offline benchmark
+
+```bash
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompts 3000 --output-file offline.jsonl
+cat offline.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+
+#### Online benchmark
+
+```bash
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online.jsonl
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online.jsonl
+cat online.jsonl | cut -d':' -f9 | cut -d',' -f1
+```
+
+## Other
+
+We tried using vLLM 0.5.3.post1, but it often crashes under high loads, and it seems to have similar or worse performance compared to vLLM 0.5.2 from our partial benchmarking, so we are using the older version, vLLM 0.5.2.
+
+Preparation for TensorRT LLM can refer to https://github.com/sgl-project/tensorrt-demo. Specifically, we used a batch size of 512, a max input length of 8192, and a max number of tokens of 8192. The instance count for preprocessing and postprocessing in Triton Server is 16.
+
+```bash
+# vLLM
+pip install vllm==0.5.2
+pip install jsonschema==4.21.1
+
+# Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B-Instruct --disable-log-requests
+
+# meta-llama/Meta-Llama-3-70B-Instruct
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B-Instruct --disable-log-requests --tensor 8
+
+# neuralmagic/Meta-Llama-3-70B-Instruct-FP8
+python -m vllm.entrypoints.openai.api_server --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 --disable-log-requests --tensor 8
+```
+
+```bash
+wget https://raw.githubusercontent.com/sgl-project/sglang/main/python/sglang/bench_serving.py
+```
+
+```bash
+# vLLM Offline
+
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name sharegpt --num-prompts 3000 --output-file offline_vllm.jsonl
+cat offline_vllm.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+
+```bash
+# vLLM Online
+
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_vllm.jsonl
+python3 bench_serving.py --backend vllm --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_vllm.jsonl
+cat online_vllm.jsonl | cut -d':' -f9 | cut -d',' -f1
+```
+
+```bash
+# TensorRT LLM Offline 8B
+
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_8b.jsonl
+python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_8b.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_8b.jsonl
+cat offline_trt_8b.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+
+```bash
+# TensorRT LLM Online 8B
+
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_8b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_8b.jsonl
+cat online_trt_8b.jsonl | cut -d':' -f9 | cut -d',' -f1
+```
+
+```bash
+# TensorRT LLM Offline 70B
+
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 4000 --random-input 1024 --random-output 1024 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 5000 --random-input 1024 --random-output 512 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 1000 --random-input 4096 --random-output 2048 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --num-prompts 2000 --random-input 4096 --random-output 1024 --output-file offline_trt_70b.jsonl
+python3 bench_serving.py --backend trt --dataset-name random --num-prompts 6000 --random-input 256 --random-output 512 --output-file offline_trt_70b.jsonl --model meta-llama/Meta-Llama-3-70B-Instruct
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name sharegpt --num-prompts 3000 --output-file offline_trt_70b.jsonl
+cat offline_trt_70b.jsonl | cut -d':' -f12 | cut -d',' -f1
+```
+
+```bash
+# TensorRT LLM Online 70B
+
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 300 --request-rate 1 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 600 --request-rate 2 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 1200 --request-rate 4 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 2400 --request-rate 8 --output-file online_trt_70b.jsonl
+python3 bench_serving.py --backend trt --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-name random --random-input 1024 --random-output 1024 --num-prompts 3200 --request-rate 16 --output-file online_trt_70b.jsonl
+cat online_trt_70b.jsonl | cut -d':' -f9 | cut -d',' -f1
+```
diff --git a/benchmark/blog_v0_2/config.md b/benchmark/blog_v0_2/config.md
new file mode 100644
index 000000000..3faf6009b
--- /dev/null
+++ b/benchmark/blog_v0_2/config.md
@@ -0,0 +1,100 @@
+### used for TensorRT LLM
+
+```
+{
+    "architecture": "LlamaForCausalLM",
+    "dtype": "float16",
+    "logits_dtype": "float32",
+    "vocab_size": 128256,
+    "max_position_embeddings": 8192,
+    "hidden_size": 16384,
+    "num_hidden_layers": 126,
+    "num_attention_heads": 128,
+    "num_key_value_heads": 16,
+    "head_size": 128,
+    "qk_layernorm": false,
+    "hidden_act": "silu",
+    "intermediate_size": 53248,
+    "norm_epsilon": 1e-05,
+    "position_embedding_type": "rope_gpt_neox",
+    "use_parallel_embedding": false,
+    "embedding_sharding_dim": 0,
+    "share_embedding_table": false,
+    "mapping": {
+        "world_size": 8,
+        "tp_size": 8,
+        "pp_size": 1,
+        "gpus_per_node": 8
+    },
+    "quantization": {
+        "quant_algo": "FP8",
+        "kv_cache_quant_algo": null,
+        "group_size": 128,
+        "smoothquant_val": null,
+        "has_zero_point": false,
+        "pre_quant_scale": false,
+        "exclude_modules": [
+            "lm_head"
+        ]
+    },
+    "kv_dtype": "float16",
+    "rotary_scaling": null,
+    "residual_mlp": false,
+    "moe_normalization_mode": null,
+    "rotary_base": 500000.0,
+    "moe_num_experts": 0,
+    "moe_top_k": 0,
+    "moe_tp_mode": 2,
+    "attn_bias": false,
+    "disable_weight_only_quant_plugin": false,
+    "mlp_bias": false
+}
+```
+
+### used for vLLM and SGLang
+
+```
+{
+  "_name_or_path": "dummy_fp8",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128009,
+  "hidden_act": "silu",
+  "hidden_size": 16384,
+  "initializer_range": 0.02,
+  "intermediate_size": 53248,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 128,
+  "num_hidden_layers": 126,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "activation_scheme": "static",
+    "ignored_layers": [
+      "lm_head"
+    ],
+    "quant_method": "fp8"
+  },
+  "rope_scaling": {
+    "factor": 8.0,
+    "low_freq_factor": 1.0,
+    "high_freq_factor": 4.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "max_position_embeddings": 131072,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.41.1",
+  "use_cache": true,
+  "vocab_size": 128256
+}
+```
diff --git a/benchmark/boolq/README.md b/benchmark/boolq/README.md
new file mode 100644
index 000000000..3704742ee
--- /dev/null
+++ b/benchmark/boolq/README.md
@@ -0,0 +1,19 @@
+## Download data
+```
+git clone https://hf-mirror.com/datasets/google/boolq
+```
+
+## Convert parquet to json
+```
+bash parquet_to_json.sh
+```
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
+```
+
+```
+python3 bench_sglang.py
+```
diff --git a/benchmark/boolq/bench_sglang.py b/benchmark/boolq/bench_sglang.py
new file mode 100644
index 000000000..b3ce3c996
--- /dev/null
+++ b/benchmark/boolq/bench_sglang.py
@@ -0,0 +1,124 @@
+import argparse
+import json
+import time
+
+import numpy as np
+
+from sglang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import read_jsonl
+
+
+def get_example(lines, i, answer):
+    prompt = "Question: " + lines[i]["question"] + lines[i]["passage"] + "\nAnswer:"
+    if answer:
+        prompt += str(lines[i]["answer"])
+    return prompt
+
+
+def few_shot_examples(lines, k):
+    prompts = ""
+    for i in range(k):
+        prompts += get_example(lines, i, True) + "\n\n"
+    return prompts
+
+
+def main(args):
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+
+    # Read data
+    train_data_path = args.train_data_path
+    test_data_path = args.test_data_path
+    lines_train = list(read_jsonl(train_data_path))
+    lines_test = list(read_jsonl(test_data_path))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shots = few_shot_examples(lines_train, num_shots)
+
+    questions = []
+    answer = []
+    for i in range(len(lines_test[:num_questions])):
+        questions.append(get_example(lines_test, i, False))
+        answer.append(str(lines_test[i]["answer"]))
+    arguments = [{"question": q} for q in questions]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_boolq(s, question):
+        s += few_shots + question
+        s += sgl.gen("answer", max_tokens=5, stop=["\n"])
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_boolq.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    preds = []
+    for i in range(len(states)):
+        preds.append(states[i]["answer"])
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(answer))
+
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    # Results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "boolq",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument(
+        "--train-data-path", type=str, default="./boolq/data/train-00000-of-00001.json"
+    )
+    parser.add_argument(
+        "--test-data-path",
+        type=str,
+        default="./boolq/data/validation-00000-of-00001.json",
+    )
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/boolq/convert_parquet_to_json.py b/benchmark/boolq/convert_parquet_to_json.py
new file mode 100644
index 000000000..e3e69cb31
--- /dev/null
+++ b/benchmark/boolq/convert_parquet_to_json.py
@@ -0,0 +1,28 @@
+import sys
+
+import pyarrow.parquet as pq
+
+
+def convert_parquet_to_json(input_file, output_file):
+    # read parquet file
+    table = pq.read_table(input_file)
+
+    # turn parquet data to dataframe
+    df = table.to_pandas()
+
+    # turn dataframe to json form
+    json_data = df.to_json(orient="records", lines=True)
+
+    # write json to file
+    with open(output_file, "w") as f:
+        f.write(json_data)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage:python convert_parquet_to_json.py <input_file> <output_file>")
+
+    input_file = sys.argv[1]
+    output_file = sys.argv[2]
+
+    convert_parquet_to_json(input_file, output_file)
diff --git a/benchmark/boolq/parquet_to_json.sh b/benchmark/boolq/parquet_to_json.sh
new file mode 100755
index 000000000..9aaf087ff
--- /dev/null
+++ b/benchmark/boolq/parquet_to_json.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+#define input and output direction
+input_dir="./boolq/data"
+output_dir="./boolq/data"
+
+#define files needed to be handled
+files=(
+        "train-00000-of-00001.parquet"
+        "validation-00000-of-00001.parquet"
+)
+
+#foe files above, use python script to convert the form
+for file in "${files[@]}"; do
+    input_file="${input_dir}/${file}"
+    output_file="${output_dir}/${file%.parquet}.json"
+
+    echo "Converting ${input_file} to ${output_file} ..."
+    python3 convert_parquet_to_json.py "${input_file}" "${output_file}"
+
+    if [ $? -eq 0 ]; then
+        echo "Conversion successful: ${output_file}"
+    else
+        echo "Conversion failed: ${input_file}"
+    fi
+done
diff --git a/benchmark/ceval/README.md b/benchmark/ceval/README.md
new file mode 100644
index 000000000..b822e43c3
--- /dev/null
+++ b/benchmark/ceval/README.md
@@ -0,0 +1,15 @@
+## Download data
+```
+git lfs clone https://huggingface.co/datasets/ceval/ceval-exam
+```
+
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path ramblingpolymath/Qwen3-32B-W8A8 --port 30000
+```
+
+```
+python3 bench_sglang.py
+```
diff --git a/benchmark/ceval/bench_sglang.py b/benchmark/ceval/bench_sglang.py
new file mode 100644
index 000000000..32ed0baf2
--- /dev/null
+++ b/benchmark/ceval/bench_sglang.py
@@ -0,0 +1,138 @@
+import argparse
+import json
+import os
+import random
+import re
+import time
+
+import numpy as np
+from datasets import load_dataset
+
+from sglang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+
+choices = ["A", "B", "C", "D"]
+
+
+def get_one_example(line, include_answer):
+    res = line["question"]
+    res += f"\nA. {line['A']}"
+    res += f"\nB. {line['B']}"
+    res += f"\nC. {line['C']}"
+    res += f"\nD. {line['D']}"
+
+    if include_answer:
+        res += f"\nAnswer: {line['answer']} \n\n"
+    return res
+
+
+def get_few_shot_examples(lines):
+    res = ""
+    for line in lines:
+        res += get_one_example(line, True) + "\n\n"
+    return res
+
+
+def get_answer_value(response):
+    pattern = r"(Answer:|answer:|答案是|答案是:|正确答案是:|答案:|Assistant:)\s*([A-D])(?![\w])"
+    match = re.search(pattern, response)
+
+    if match:
+        return match.group(2)
+
+    return random.choice(choices)
+
+
+def main(args):
+    # Read data && Construct prompts
+    arguments = []
+    labels = []
+    examples = "examples:\n"
+    data_path = args.data_path
+    for subject in os.listdir(data_path):
+        subject_path = os.path.join(data_path, subject)
+        if os.path.isdir(subject_path) and subject != ".git":
+            dataset = load_dataset(data_path, name=subject)
+            dev_lines_temp = dataset["dev"]
+            val_lines_temp = dataset["val"]
+            few_shot_examples = get_few_shot_examples(dev_lines_temp, subject)
+            examples += f"{few_shot_examples}"
+            for val_line in val_lines_temp:
+                arguments.append(
+                    {
+                        "examples": few_shot_examples,
+                        "question": get_one_example(val_line, False),
+                    }
+                )
+                labels.append(val_line["answer"])
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_ceval(s, examples, question):
+        s += examples + question + sgl.gen("Answer")
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    num_questions = args.num_questions if args.num_questions else len(arguments)
+
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_ceval.run_batch(
+        arguments[:num_questions],
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    preds = [get_answer_value(states[i]["Answer"]) for i in range(num_questions)]
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels[:num_questions]))
+
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("Answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "ceval",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="ceval-exam")
+    parser.add_argument("--num-questions", type=int, default=None)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/deepseek_v3/README.md b/benchmark/deepseek_v3/README.md
new file mode 100644
index 000000000..53efc23f5
--- /dev/null
+++ b/benchmark/deepseek_v3/README.md
@@ -0,0 +1,373 @@
+# DeepSeek V3.1/V3/R1 Support
+
+The SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVIDIA and AMD GPUs **from day one**. SGLang also supports [MLA optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [DP attention](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models), making SGLang one of the best open-source LLM engines for running DeepSeek models. SGLang is the inference engine recommended by the official [DeepSeek team](https://github.com/deepseek-ai/DeepSeek-V3/tree/main?tab=readme-ov-file#62-inference-with-sglang-recommended).
+
+Special thanks to Meituan's Search & Recommend Platform Team and Baseten's Model Performance Team for implementing the model, and DataCrunch for providing GPU resources.
+
+For optimizations made on the DeepSeek series models regarding SGLang, please refer to [DeepSeek Model Optimizations in SGLang](https://docs.sglang.ai/basic_usage/deepseek.html).
+
+## Installation & Launch
+
+If you encounter errors when starting the server, ensure the weights have finished downloading. It's recommended to download them beforehand or restart multiple times until all weights are downloaded.
+
+### Using Docker (Recommended)
+
+```bash
+# Pull latest image
+# https://hub.docker.com/r/lmsysorg/sglang/tags
+docker pull lmsysorg/sglang:latest
+
+# Launch
+docker run --gpus all --shm-size 32g -p 30000:30000 -v ~/.cache/huggingface:/root/.cache/huggingface --ipc=host --network=host --privileged lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code --port 30000
+```
+
+If you are using RDMA, please note that:
+
+1. `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them.
+2. You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
+
+Add [performance optimization options](#performance-optimization-options) as needed.
+
+### Using pip
+
+```bash
+# Installation
+pip install "sglang[all]>=0.5.2"
+
+# Launch
+python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
+```
+
+Add [performance optimization options](#performance-optimization-options) as needed.
+
+<a id="option_args"></a>
+
+### Performance Optimization Options
+
+[MLA optimizations](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) are enabled by default. Here are some optional optimizations can be enabled as needed.
+
+- [Data Parallelism Attention](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models): For high QPS scenarios, add the `--enable-dp-attention` argument to boost throughput.
+- [Torch.compile Optimization](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#torchcompile-latency-optimizations): Add `--enable-torch-compile` argument to enable it. This will take some time while server starts. The maximum batch size for torch.compile optimization can be controlled with `--torch-compile-max-bs`. It's recommended to set it between `1` and `8`. (e.g., `--torch-compile-max-bs 8`)
+
+### Usage: Chat with DeepSeek
+
+#### DeepSeek V3/R1
+
+```python3
+import openai
+client = openai.Client(
+    base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+# Chat completion
+response = client.chat.completions.create(
+    model="default",
+    messages=[
+        {"role": "system", "content": "You are a helpful AI assistant"},
+        {"role": "user", "content": "List 3 countries and their capitals."},
+    ],
+    temperature=0,
+    max_tokens=64,
+)
+print(response)
+```
+
+#### DeepSeek V3.1
+On top of the basic usage similar to the DeepSeek V3/R1 example, DeepSeek V3.1 supports a request-level thinking/non-thinking toggle. Simply switch the `"thinking"` field in `extra_body={"chat_template_kwargs": {"thinking": True}}` to enable/disable the thinking mode.
+
+##### Non Thinking
+```python3
+import openai
+client = openai.Client(
+    base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+# Chat completion
+response = client.chat.completions.create(
+    model="default",
+    messages=[
+        {"role": "system", "content": "You are a helpful AI assistant"},
+        {"role": "user", "content": "Answer the following with the second letter of the correct answer only: What is the capital of France?"},
+    ],
+    temperature=0,
+    max_tokens=1024,
+    extra_body = {"chat_template_kwargs": {"thinking": False}}
+)
+print(response.choices[0].message.content)
+```
+Answer:
+```
+h
+```
+* The correct response should be 'A', as the correct answer to the question is 'Paris'.
+##### Thinking
+```python3
+import openai
+client = openai.Client(
+    base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+# Chat completion
+response = client.chat.completions.create(
+    model="default",
+    messages=[
+        {"role": "system", "content": "You are a helpful AI assistant"},
+        {"role": "user", "content": "Answer the following with the second letter of the correct answer only: What is the capital of France?"},
+    ],
+    temperature=0,
+    max_tokens=1024,
+    extra_body = {"chat_template_kwargs": {"thinking": True}}
+)
+print(response)
+```
+Answer:
+```
+First, the question is: "What is the capital of France?" I know that the capital of France is Paris.
+
+The user says: "Answer the following with the second letter of the correct answer only." So, I need to provide only the second letter of the correct answer.
+
+The correct answer is "Paris". Now, I need to find the second letter of "Paris".
+
+Let's spell it out: P-A-R-I-S.
+
+- First letter: P
+
+- Second letter: A
+
+- Third letter: R
+
+- Fourth letter: I
+
+- Fifth letter: S
+
+So, the second letter is "A".
+
+I should only output the second letter, which is "A". No additional text or explanation, just the letter.
+
+The user emphasized "the second letter of the correct answer only", so my response should be just "A".
+
+Finally, I need to make sure that this is the correct answer. Yes, Paris is indeed the capital of France.</think>A
+```
+* The response contains `</think>` thinking trace and model was able to derive the correct answer from it.
+
+### Example: Serving with two H20\*8 nodes
+
+For example, there are two H20 nodes, each with 8 GPUs. The first node's IP is `10.0.0.1`, and the second node's IP is `10.0.0.2`. Please **use the first node's IP** for both commands.
+
+If the command fails, try setting the `GLOO_SOCKET_IFNAME` parameter. For more information, see [Common Environment Variables](https://pytorch.org/docs/stable/distributed.html#common-environment-variables).
+
+If the multi nodes support NVIDIA InfiniBand and encounter hanging issues during startup, consider adding the parameter `export NCCL_IB_GID_INDEX=3`. For more information, see [this](https://github.com/sgl-project/sglang/issues/3516#issuecomment-2668493307).
+
+```bash
+# node 1
+python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 10.0.0.1:5000 --nnodes 2 --node-rank 0 --trust-remote-code
+
+# node 2
+python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 10.0.0.1:5000 --nnodes 2 --node-rank 1 --trust-remote-code
+```
+
+If you have two H100 nodes, the usage is similar to the aforementioned H20.
+
+> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
+
+### Example: Serving with two H200\*8 nodes and docker
+
+There are two H200 nodes, each with 8 GPUs. The first node's IP is `192.168.114.10`, and the second node's IP is `192.168.114.11`. Configure the endpoint to expose it to another Docker container using `--host 0.0.0.0` and `--port 40000`, and set up communications with `--dist-init-addr 192.168.114.10:20000`.
+A single H200 with 8 devices can run DeepSeek V3, the dual H200 setup is just to demonstrate multi-node usage.
+
+```bash
+# node 1
+docker run --gpus all \
+    --shm-size 32g \
+    --network=host \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --name sglang_multinode1 \
+    -it \
+    --rm \
+    --env "HF_TOKEN=$HF_TOKEN" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 0 --trust-remote-code --host 0.0.0.0 --port 40000
+```
+
+```bash
+# node 2
+docker run --gpus all \
+    --shm-size 32g \
+    --network=host \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --name sglang_multinode2 \
+    -it \
+    --rm \
+    --env "HF_TOKEN=$HF_TOKEN" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --dist-init-addr 192.168.114.10:20000 --nnodes 2 --node-rank 1 --trust-remote-code --host 0.0.0.0 --port 40000
+```
+
+To ensure functionality, we include a test from a client Docker container.
+
+```bash
+docker run --gpus all \
+    --shm-size 32g \
+    --network=host \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --name sglang_multinode_client \
+    -it \
+    --rm \
+    --env "HF_TOKEN=$HF_TOKEN" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 1 --random-output 512 --random-range-ratio 1 --num-prompts 1 --host 0.0.0.0 --port 40000 --output-file "deepseekv3_multinode.jsonl"
+```
+
+> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
+
+### Example: Serving with four A100\*8 nodes
+
+To serve DeepSeek-V3 with A100 GPUs, we need to convert the [FP8 model checkpoints](https://huggingface.co/deepseek-ai/DeepSeek-V3) to BF16 with [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) mentioned [here](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) first.
+
+Since the BF16 model is over 1.3 TB, we need to prepare four A100 nodes, each with 8 80GB GPUs. Assume the first node's IP is `10.0.0.1`, and the converted model path is `/path/to/DeepSeek-V3-BF16`, we can have following commands to launch the server.
+
+```bash
+# node 1
+python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 0 --trust-remote-code --host 0.0.0.0 --port 30000
+
+# node 2
+python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 1 --trust-remote-code
+
+# node 3
+python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 2 --trust-remote-code
+
+# node 4
+python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 --dist-init-addr 10.0.0.1:5000 --nnodes 4 --node-rank 3 --trust-remote-code
+```
+
+> **Note that the launch command here does not enable Data Parallelism Attention or `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
+
+Then we can benchmark the accuracy and latency by accessing the first node's exposed port with the following example commands.
+
+```bash
+# bench accuracy
+python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319 --host http://10.0.0.1 --port 30000
+
+# bench latency
+python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1:30000 --batch-size 1 --input-len 128 --output-len 128
+```
+
+
+### Example: Serving with 8 A100/A800 with AWQ Quantization
+
+**Recommended Usage**
+
+Add `--quantization moe_wna16` flag to enable moe wna16 kernel for better performance.
+One example is as follows:
+
+```bash
+python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization moe_wna16
+```
+
+Alternatively, you can use `--quantization awq_marlin` as follows:
+
+```bash
+python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization awq_marlin --dtype float16
+```
+
+Note that `awq_marlin` only supports `float16` now, which may lead to some precision loss.
+
+### Example: Serving with 16 A100/A800 with int8 Quantization
+
+There are block-wise and per-channel quantization methods, and the quantization parameters have already been uploaded to Huggingface. One example is as follows:
+
+- [meituan/DeepSeek-R1-Block-INT8](https://huggingface.co/meituan/DeepSeek-R1-Block-INT8)
+- [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8)
+
+Assuming that master node IP is `MASTER_IP`, checkpoint path is `/path/to/DeepSeek-R1-INT8` and port=5000, we can have following commands to launch the server:
+```bash
+#master
+python3 -m sglang.launch_server \
+	--model meituan/DeepSeek-R1-Block-INT8 --tp 16 --dist-init-addr \
+	MASTER_IP:5000 --nnodes 2 --node-rank 0 --trust-remote-code --enable-torch-compile --torch-compile-max-bs 8
+#cluster
+python3 -m sglang.launch_server \
+	--model meituan/DeepSeek-R1-Block-INT8 --tp 16 --dist-init-addr \
+	MASTER_IP:5000 --nnodes 2 --node-rank 1 --trust-remote-code --enable-torch-compile --torch-compile-max-bs 8
+```
+
+> **Note that the launch command here enables `torch.compile` Optimization**. For optimal performance, please refer to the command options in [Performance Optimization Options](#option_args).
+
+Then on the **master node**, supposing the ShareGPT data is located at `/path/to/ShareGPT_V3_unfiltered_cleaned_split.json`, you can run the following commands to benchmark the launched server:
+
+```bash
+# bench accuracy
+python3 benchmark/gsm8k/bench_sglang.py --num-questions 1319
+
+# bench serving
+python3 -m sglang.bench_serving --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --dataset-name random  --random-input 128 --random-output 128 --num-prompts 1000 --request-rate 128 --random-range-ratio 1.0
+```
+
+> **Note: using `--parallel 200` can accelerate accuracy benchmarking**.
+
+### Example: Serving with 32 L40S with int8 Quantization
+
+Running with per-channel quantization model:
+
+- [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8)
+
+Assuming that master node IP is `MASTER_IP`, checkpoint path is `/path/to/DeepSeek-R1-Channel-INT8` and port=5000, we can have following commands to launch the server:
+
+```bash
+#master
+python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
+	--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 0 --trust-remote \
+	--enable-torch-compile --torch-compile-max-bs 32
+#cluster
+python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
+	--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 1 --trust-remote \
+	--enable-torch-compile --torch-compile-max-bs 32
+python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
+	--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 2 --trust-remote \
+	--enable-torch-compile --torch-compile-max-bs 32
+python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --quantization w8a8_int8 \
+	--dist-init-addr MASTER_IP:5000 --nnodes 4 --node-rank 3 --trust-remote \
+	--enable-torch-compile --torch-compile-max-bs 32
+```
+
+The benchmarking method is the same as describted in the previous [16 x A100](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-16-a100a800-with-int8-quantization) example.
+
+### Example: Serving on any cloud or Kubernetes with SkyPilot
+
+SkyPilot helps find cheapest available GPUs across any cloud or existing Kubernetes clusters and launch distributed serving with a single command. See details [here](https://github.com/skypilot-org/skypilot/tree/master/llm/deepseek-r1).
+
+To serve on multiple nodes:
+
+```bash
+git clone https://github.com/skypilot-org/skypilot.git
+# Serve on 2 H100/H200x8 nodes
+sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B.yaml --retry-until-up
+# Serve on 4 A100x8 nodes
+sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B-A100.yaml --retry-until-up
+```
+
+#### Troubleshooting
+
+If you encounter the following error with fp16/bf16 checkpoint:
+
+```bash
+ValueError: Weight output_partition_size = 576 is not divisible by weight quantization block_n = 128.
+```
+
+edit your `config.json` and remove the `quantization_config` block. For example:
+
+```json
+"quantization_config": {
+    "activation_scheme": "dynamic",
+    "fmt": "e4m3",
+    "quant_method": "fp8",
+    "weight_block_size": [128, 128]
+},
+```
+
+Removing this block typically resolves the error. For more details, see the discussion in [sgl-project/sglang#3491](https://github.com/sgl-project/sglang/issues/3491#issuecomment-2650779851).
+
+## DeepSeek V3 Optimization Plan
+
+https://github.com/sgl-project/sglang/issues/2591
diff --git a/benchmark/dspy/README.md b/benchmark/dspy/README.md
new file mode 100644
index 000000000..a2b213aa6
--- /dev/null
+++ b/benchmark/dspy/README.md
@@ -0,0 +1,51 @@
+## Install
+
+```
+pip3 install dspy-ai
+```
+
+Turn off cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/dsp/modules/cache_utils.py#L10.
+```
+cache_turn_on = False
+```
+
+or set the environment variable
+
+```
+export DSP_CACHEBOOL=false
+```
+
+## Benchmark SGLang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_dspy_intro.py --backend sglang
+```
+
+
+## Benchmark TGI
+```
+docker run --name tgi --rm -ti --gpus all --network host \
+  -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
+  ghcr.io/huggingface/text-generation-inference:1.3.0 \
+  --model-id /Llama-2-7b-chat-hf --num-shard 1  --trust-remote-code \
+  --max-input-length 2048 --max-total-tokens 4096 \
+  --port 24000
+```
+
+```
+python3 bench_dspy_intro.py --backend tgi
+```
+
+
+
+## Benchmark vLLM
+```
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests  --port 21000
+```
+
+```
+python3 bench_dspy_intro.py --backend vllm
+```
diff --git a/benchmark/dspy/bench_dspy_intro.py b/benchmark/dspy/bench_dspy_intro.py
new file mode 100644
index 000000000..2b0936ed2
--- /dev/null
+++ b/benchmark/dspy/bench_dspy_intro.py
@@ -0,0 +1,192 @@
+"""
+Adapted from
+https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/intro.ipynb#L9
+"""
+
+import argparse
+
+import dspy
+from dspy.datasets import HotPotQA
+
+
+class BasicQA(dspy.Signature):
+    """Answer questions with short factoid answers."""
+
+    question = dspy.InputField()
+    answer = dspy.OutputField(desc="often between 1 and 5 words")
+
+
+class GenerateAnswer(dspy.Signature):
+    """Answer questions with short factoid answers."""
+
+    context = dspy.InputField(desc="may contain relevant facts")
+    question = dspy.InputField()
+    answer = dspy.OutputField(desc="often between 1 and 5 words")
+
+
+class RAG(dspy.Module):
+    def __init__(self, num_passages=3):
+        super().__init__()
+
+        self.retrieve = dspy.Retrieve(k=num_passages)
+        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
+
+    def forward(self, question):
+        context = self.retrieve(question).passages
+        prediction = self.generate_answer(context=context, question=question)
+        return dspy.Prediction(context=context, answer=prediction.answer)
+
+
+def main(args):
+    # lm = dspy.OpenAI(model='gpt-3.5-turbo')
+    if args.backend == "tgi":
+        lm = dspy.HFClientTGI(
+            model="meta-llama/Llama-2-7b-chat-hf",
+            port=args.port,
+            url="http://localhost",
+        )
+    elif args.backend == "sglang":
+        lm = dspy.HFClientSGLang(
+            model="meta-llama/Llama-2-7b-chat-hf",
+            port=args.port,
+            url="http://localhost",
+        )
+    elif args.backend == "vllm":
+        lm = dspy.HFClientVLLM(
+            model="meta-llama/Llama-2-7b-chat-hf",
+            port=args.port,
+            url="http://localhost",
+        )
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+    colbertv2_wiki17_abstracts = dspy.ColBERTv2(
+        url="http://20.102.90.50:2017/wiki17_abstracts"
+    )
+    dspy.settings.configure(lm=lm, rm=colbertv2_wiki17_abstracts)
+
+    # Load the dataset.
+    dataset = HotPotQA(
+        train_seed=1, train_size=20, eval_seed=2023, dev_size=args.dev_size, test_size=0
+    )
+
+    # Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
+    trainset = [x.with_inputs("question") for x in dataset.train]
+    devset = [x.with_inputs("question") for x in dataset.dev]
+
+    print(len(trainset), len(devset))
+
+    train_example = trainset[0]
+    print(f"Question: {train_example.question}")
+    print(f"Answer: {train_example.answer}")
+
+    dev_example = devset[18]
+    print(f"Question: {dev_example.question}")
+    print(f"Answer: {dev_example.answer}")
+    print(f"Relevant Wikipedia Titles: {dev_example.gold_titles}")
+
+    print(
+        f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys {train_example.labels().keys()}"
+    )
+    print(
+        f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and label keys {dev_example.labels().keys()}"
+    )
+
+    # Define the predictor.
+    generate_answer = dspy.Predict(BasicQA)
+
+    # Call the predictor on a particular input.
+    pred = generate_answer(question=dev_example.question)
+
+    # Print the input and the prediction.
+    print(f"Question: {dev_example.question}")
+    print(f"Predicted Answer: {pred.answer}")
+
+    lm.inspect_history(n=1)
+
+    # Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged.
+    generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)
+
+    # Call the predictor on the same input.
+    pred = generate_answer_with_chain_of_thought(question=dev_example.question)
+
+    # Print the input, the chain of thought, and the prediction.
+    print(f"Question: {dev_example.question}")
+    print(f"Thought: {pred.rationale.split('.', 1)[1].strip()}")
+    print(f"Predicted Answer: {pred.answer}")
+
+    retrieve = dspy.Retrieve(k=3)
+    topK_passages = retrieve(dev_example.question).passages
+
+    print(
+        f"Top {retrieve.k} passages for question: {dev_example.question} \n",
+        "-" * 30,
+        "\n",
+    )
+
+    for idx, passage in enumerate(topK_passages):
+        print(f"{idx+1}]", passage, "\n")
+
+    retrieve("When was the first FIFA World Cup held?").passages[0]
+
+    from dspy.teleprompt import BootstrapFewShot
+
+    # Validation logic: check that the predicted answer is correct.
+    # Also check that the retrieved context does actually contain that answer.
+    def validate_context_and_answer(example, pred, trace=None):
+        answer_EM = dspy.evaluate.answer_exact_match(example, pred)
+        answer_PM = dspy.evaluate.answer_passage_match(example, pred)
+        return answer_EM and answer_PM
+
+    # Set up a basic teleprompter, which will compile our RAG program.
+    teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
+
+    # Compile!
+    compiled_rag = teleprompter.compile(RAG(), trainset=trainset)
+
+    # Ask any question you like to this simple RAG program.
+    my_question = "What castle did David Gregory inherit?"
+
+    # Get the prediction. This contains `pred.context` and `pred.answer`.
+    pred = compiled_rag(my_question)
+
+    # Print the contexts and the answer.
+    print(f"Question: {my_question}")
+    print(f"Predicted Answer: {pred.answer}")
+    print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")
+
+    from dspy.evaluate.evaluate import Evaluate
+
+    # Set up the `evaluate_on_hotpotqa` function. We'll use this many times below.
+    evaluate_on_hotpotqa = Evaluate(
+        devset=devset,
+        num_threads=args.num_threads,
+        display_progress=True,
+        display_table=5,
+    )
+
+    # Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
+    metric = dspy.evaluate.answer_exact_match
+    evaluate_on_hotpotqa(compiled_rag, metric=metric)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--port", type=int)
+    parser.add_argument("--num-threads", type=int, default=32)
+    parser.add_argument("--dev-size", type=int, default=150)
+    parser.add_argument(
+        "--backend", type=str, choices=["sglang", "tgi", "vllm"], default="sglang"
+    )
+    args = parser.parse_args()
+
+    if args.port is None:
+        default_port = {
+            "vllm": 21000,
+            "lightllm": 22000,
+            "tgi": 24000,
+            "sglang": 30000,
+        }
+        args.port = default_port.get(args.backend, None)
+
+    main(args)
diff --git a/benchmark/generative_agents/README.md b/benchmark/generative_agents/README.md
new file mode 100644
index 000000000..393a9ce83
--- /dev/null
+++ b/benchmark/generative_agents/README.md
@@ -0,0 +1,38 @@
+## Download the dataset
+
+```
+wget -O agent_calls.jsonl https://drive.google.com/uc?export=download&id=19qLpD45e9JGTKF2cUjJJegwzSUEZEKht
+```
+
+## Run benchmark
+
+Ensure that this benchmark is run in a serial manner (using --parallel 1) to preserve any potential dependencies between requests.
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-events 1000 --parallel 1
+```
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-events 1000 --backend vllm --parallel 1
+```
+
+### Benchmark guidance
+```
+python3 bench_other.py --num-events 1000 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+```
+python3 bench_other.py --num-events 1000 --backend lmql --parallel 1
+```
diff --git a/benchmark/generative_agents/agent_functions.py b/benchmark/generative_agents/agent_functions.py
new file mode 100644
index 000000000..9785fc1ec
--- /dev/null
+++ b/benchmark/generative_agents/agent_functions.py
@@ -0,0 +1,300 @@
+import sglang as sgl
+
+# here are the top five agent functions contributing ~70% LLM calls
+# reference: https://github.com/joonspk-research/generative_agents/
+
+
+@sgl.function
+def poignancy_event(s, persona_name, persona_iss, event):
+    s += "Here is a brief description of " + persona_name + ".\n"
+    s += persona_iss + "\n"
+    s += "On the scale of 1 to 10, where 1 is purely mundane (e.g., brushing teeth, making bed) and 10 is extremely poignant (e.g., a break up, college acceptance), rate the likely poignancy of the following event for"
+    s += persona_name + ".\n\n"
+    s += "Event: " + event
+    s += "Rate (return a number between 1 to 10):"
+    s += sgl.gen(name="Rate", max_tokens=2)
+
+
+def poignancy_event_prompt(persona_name, persona_iss, event):
+    # return prompt and max_tokens
+    s = ""
+    s += "Here is a brief description of " + persona_name + ".\n"
+    s += persona_iss + "\n"
+    s += "On the scale of 1 to 10, where 1 is purely mundane (e.g., brushing teeth, making bed) and 10 is extremely poignant (e.g., a break up, college acceptance), rate the likely poignancy of the following event for"
+    s += persona_name + ".\n\n"
+    s += "Event: " + event
+    s += "Rate (return a number between 1 to 10):"
+    return {"prompt": s, "max_tokens": 2, "stop": None}
+
+
+@sgl.function
+def generate_event_triple(s, persona_name, action):
+    s += """Task: Turn the input into (subject, predicate, object).
+Input: Sam Johnson is eating breakfast.
+Output: (Dolores Murphy, eat, breakfast)
+---
+Input: Joon Park is brewing coffee.
+Output: (Joon Park, brew, coffee)
+---
+Input: Jane Cook is sleeping.
+Output: (Jane Cook, is, sleep)
+---
+Input: Michael Bernstein is writing email on a computer.
+Output: (Michael Bernstein, write, email)
+---
+Input: Percy Liang is teaching students in a classroom.
+Output: (Percy Liang, teach, students)
+---
+Input: Merrie Morris is running on a treadmill.
+Output: (Merrie Morris, run, treadmill)
+---"""
+    s += persona_name + "is" + action + ".\n"
+    s += "(" + persona_name + ","
+    s += sgl.gen(name="Triple", max_tokens=20, stop=")")
+
+
+def generate_event_triple_prompt(persona_name, action):
+    s = ""
+    s += """Task: Turn the input into (subject, predicate, object).
+Input: Sam Johnson is eating breakfast.
+Output: (Dolores Murphy, eat, breakfast)
+---
+Input: Joon Park is brewing coffee.
+Output: (Joon Park, brew, coffee)
+---
+Input: Jane Cook is sleeping.
+Output: (Jane Cook, is, sleep)
+---
+Input: Michael Bernstein is writing email on a computer.
+Output: (Michael Bernstein, write, email)
+---
+Input: Percy Liang is teaching students in a classroom.
+Output: (Percy Liang, teach, students)
+---
+Input: Merrie Morris is running on a treadmill.
+Output: (Merrie Morris, run, treadmill)
+---"""
+    s += persona_name + "is" + action + ".\n"
+    s += "(" + persona_name + ","
+    return {"prompt": s, "max_tokens": 20, "stop": ")"}
+
+
+@sgl.function
+def generate_pronunciatio(s, action):
+    s += "Convert an action description to an emoji (important: use two or less emojis).\n"
+    s += "Action description: " + action + ".\n"
+    s += "Emoji:" + sgl.gen(name="Emoji", max_tokens=6)
+
+
+def generate_pronunciatio_prompt(action):
+    s = ""
+    s += "Convert an action description to an emoji (important: use two or less emojis).\n"
+    s += "Action description: " + action + ".\n"
+    s += "Emoji:"
+    return {"prompt": s, "max_tokens": 6, "stop": None}
+
+
+@sgl.function
+def action_location_sector(
+    s,
+    persona_name,
+    living_sector,
+    living_sector_areas,
+    current_sector,
+    current_sector_areas,
+    daily_plan,
+    sector_options,
+    current_action,
+    next_action,
+):
+    s += """Task -- choose an appropriate area  from the area options for a task at hand.
+Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
+Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
+Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
+* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
+* Must be one of the "Area options," verbatim.
+For taking a walk, Sam Kim should go to the following area: {Johnson Park}
+---
+Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
+Jane Anderson is currently in {Oak Hill College} that has a classroom, library
+Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
+* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
+* Must be one of the "Area options," verbatim.
+For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
+---"""
+    s += (
+        persona_name
+        + " lives in "
+        + living_sector
+        + " that has "
+        + living_sector_areas
+        + ".\n"
+    )
+    s += (
+        persona_name
+        + " is currently in "
+        + current_sector
+        + " that has "
+        + current_sector_areas
+        + ".\n"
+    )
+    s += daily_plan + ".\n"
+    s += "Area options: " + sector_options + ".\n"
+    s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
+* Must be one of the "Area options," verbatim.\n"""
+    s += (
+        persona_name
+        + " is "
+        + current_action
+        + ". For "
+        + next_action
+        + ", "
+        + persona_name
+        + " should go to the following area: {"
+    )
+    s += sgl.gen(name="Location", max_tokens=10, stop="}")
+
+
+def action_location_sector_prompt(
+    persona_name,
+    living_sector,
+    living_sector_areas,
+    current_sector,
+    current_sector_areas,
+    daily_plan,
+    sector_options,
+    current_action,
+    next_action,
+):
+    s = ""
+    s += """Task -- choose an appropriate area  from the area options for a task at hand.
+Sam Kim lives in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
+Sam Kim is currently in {Sam Kim's house} that has Sam Kim's room, bathroom, kitchen.
+Area options: {Sam Kim's house, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
+* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
+* Must be one of the "Area options," verbatim.
+For taking a walk, Sam Kim should go to the following area: {Johnson Park}
+---
+Jane Anderson lives in {Oak Hill College Student Dormatory} that has Jane Anderson's room.
+Jane Anderson is currently in {Oak Hill College} that has a classroom, library
+Area options: {Oak Hill College Student Dormatory, The Rose and Crown Pub, Hobbs Cafe, Oak Hill College, Johnson Park, Harvey Oak Supply Store, The Willows Market and Pharmacy}.
+* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
+* Must be one of the "Area options," verbatim.
+For eating dinner, Jane Anderson should go to the following area: {Hobbs Cafe}
+---"""
+    s += (
+        persona_name
+        + " lives in "
+        + living_sector
+        + " that has "
+        + living_sector_areas
+        + ".\n"
+    )
+    s += (
+        persona_name
+        + " is currently in "
+        + current_sector
+        + " that has "
+        + current_sector_areas
+        + ".\n"
+    )
+    s += daily_plan + ".\n"
+    s += "Area options: " + sector_options + ".\n"
+    s += """* Stay in the current area if the activity can be done there. Only go out if the activity needs to take place in another place.
+* Must be one of the "Area options," verbatim.\n"""
+    s += (
+        persona_name
+        + " is "
+        + current_action
+        + ". For "
+        + next_action
+        + ", "
+        + persona_name
+        + " should go to the following area: {"
+    )
+    return {"prompt": s, "max_tokens": 10, "stop": "}"}
+
+
+@sgl.function
+def action_location_object(
+    s, persona_name, target_sector, target_sector_areas, current_action, next_action
+):
+    s += """
+Jane Anderson is in kitchen in Jane Anderson's house.
+Jane Anderson is going to Jane Anderson's house that has the following areas: {kitchen,  bedroom, bathroom}
+Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
+For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
+Answer: {kitchen}
+---
+Tom Watson is in common room in Tom Watson's apartment.
+Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
+Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
+For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
+Answer: {cafe}
+---"""
+    s += (
+        persona_name
+        + " is going to "
+        + target_sector
+        + " that has the following areas: {"
+        + target_sector_areas
+        + "}\n"
+    )
+    s += """* Stay in the current area if the activity can be done there.
+* NEVER go into other people's rooms unless necessary."""
+    s += (
+        persona_name
+        + " is "
+        + current_action
+        + ". For "
+        + next_action
+        + ", "
+        + persona_name
+        + "should go to the following area in "
+        + target_sector
+    )
+    s += " (MUST pick one of {" + target_sector_areas + "}):\n"
+    s += "Answer: {" + sgl.gen(name="Area", max_tokens=5, stop="}")
+
+
+def action_location_object_prompt(
+    persona_name, target_sector, target_sector_areas, current_action, next_action
+):
+    s = ""
+    s += """
+Jane Anderson is in kitchen in Jane Anderson's house.
+Jane Anderson is going to Jane Anderson's house that has the following areas: {kitchen,  bedroom, bathroom}
+Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
+For cooking, Jane Anderson should go to the following area in Jane Anderson's house:
+Answer: {kitchen}
+---
+Tom Watson is in common room in Tom Watson's apartment.
+Tom Watson is going to Hobbs Cafe that has the following areas: {cafe}
+Stay in the current area if the activity can be done there. Never go into other people's rooms unless necessary.
+For getting coffee, Tom Watson should go to the following area in Hobbs Cafe:
+Answer: {cafe}
+---"""
+    s += (
+        persona_name
+        + " is going to "
+        + target_sector
+        + " that has the following areas: {"
+        + target_sector_areas
+        + "}\n"
+    )
+    s += """* Stay in the current area if the activity can be done there.
+* NEVER go into other people's rooms unless necessary."""
+    s += (
+        persona_name
+        + " is "
+        + current_action
+        + ". For "
+        + next_action
+        + ", "
+        + persona_name
+        + "should go to the following area in "
+        + target_sector
+    )
+    s += " (MUST pick one of {" + target_sector_areas + "}):\n"
+    s += "Answer: {"
+    return {"prompt": s, "max_tokens": 5, "stop": "}"}
diff --git a/benchmark/generative_agents/bench_other.py b/benchmark/generative_agents/bench_other.py
new file mode 100644
index 000000000..c0b3a3406
--- /dev/null
+++ b/benchmark/generative_agents/bench_other.py
@@ -0,0 +1,80 @@
+import argparse
+import json
+import time
+
+from agent_functions import (
+    action_location_object_prompt,
+    action_location_sector_prompt,
+    generate_event_triple_prompt,
+    generate_pronunciatio_prompt,
+    poignancy_event_prompt,
+)
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_events]
+    mapping = {
+        "poignancy_event": poignancy_event_prompt,
+        "generate_event_triple": generate_event_triple_prompt,
+        "generate_pronunciatio": generate_pronunciatio_prompt,
+        "action_location_sector": action_location_sector_prompt,
+        "action_location_object": action_location_object_prompt,
+    }
+
+    arguments = [mapping[k](**v) for l in lines for k, v in l.items()]
+    states = []
+
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    def get_one_answer(arg):
+        answer = call_generate(**arg, temperature=0)
+        states.append(answer)
+
+    async def get_one_answer_async(arg):
+        answer = await call_generate(**arg, temperature=0)
+        states.append(answer)
+
+    tic = time.perf_counter()
+    # we always sequentially execute agent calls to maintain its dependency
+    if args.backend != "lmql":
+        for arg in tqdm(arguments):
+            get_one_answer(arg)
+    else:
+        import asyncio
+
+        loop = asyncio.get_event_loop()
+        for arg in tqdm(arguments):
+            loop.run_until_complete(get_one_answer_async(arg))
+    latency = time.perf_counter() - tic
+
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "Generative Agents",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            # to pack weighted functions as a single agent
+            "num_requests": len(arguments) / len(mapping),
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="agent_calls.jsonl")
+    parser.add_argument("--num-events", type=int, default=10)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/generative_agents/bench_sglang.py b/benchmark/generative_agents/bench_sglang.py
new file mode 100644
index 000000000..034b16591
--- /dev/null
+++ b/benchmark/generative_agents/bench_sglang.py
@@ -0,0 +1,74 @@
+import argparse
+import json
+import time
+
+from agent_functions import (
+    action_location_object,
+    action_location_sector,
+    generate_event_triple,
+    generate_pronunciatio,
+    poignancy_event,
+)
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_events]
+    mapping = {
+        "poignancy_event": poignancy_event,
+        "generate_event_triple": generate_event_triple,
+        "generate_pronunciatio": generate_pronunciatio,
+        "action_location_sector": action_location_sector,
+        "action_location_object": action_location_object,
+    }
+    arguments = [{mapping[k]: v for k, v in l.items()} for l in lines]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    states = []
+    # Run requests
+    tic = time.perf_counter()
+    for a in arguments:
+        # only a single key in the dict
+        for func, arg in a.items():
+            result = func.run(**arg)
+        result.sync()
+        states.append(result)
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "Generative Agents",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            # to pack weighted functions as a single agent
+            "num_requests": len(arguments) / len(mapping),
+            "other": {
+                "num_events": args.num_events,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="agent_calls.jsonl")
+    parser.add_argument("--num-events", type=int, default=10)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/gpt_oss/README.md b/benchmark/gpt_oss/README.md
new file mode 100644
index 000000000..4d1b00e91
--- /dev/null
+++ b/benchmark/gpt_oss/README.md
@@ -0,0 +1,163 @@
+# How to reproduce the result of GPT-OSS with SGLang
+
+### Install the latest SGLang
+
+```bash
+git clone https://github.com/sgl-project/sglang.git
+cd sglang
+git checkout v0.5.1.post3
+
+pip install --upgrade pip
+pip install -e "python[all]"
+```
+
+### Reproduce the benchmark throughput result (Batch Size 1)
+
+Launch Command
+
+```bash
+# MXFP4 120B on H100
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 8 --attention-backend triton
+
+# BF16 120B on H100
+python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 8 --attention-backend triton
+
+# MXFP4 120B on B200
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 4
+
+# BF16 120B on B200
+python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 4
+```
+
+Benchmark Command
+
+```bash
+
+# MXFP4 120B on H100
+python3 -m sglang.bench_one_batch_server --model openai/gpt-oss-120b --base-url http://localhost:30000 --batch-size 1 --input-len 1024 --output-len 512 --show-report
+```
+
+### Reproduce the benchmark throughput result (Batch Size 32)
+
+Launch Command
+
+```bash
+# MXFP4 120B on H100
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 8
+
+# BF16 120B on H100
+python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 8
+
+# MXFP4 120B on B200
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --tp 4
+
+# BF16 120B on B200
+python3 -m sglang.launch_server --model lmsys/gpt-oss-120b-bf16 --tp 4
+```
+
+Benchmark Command
+
+```bash
+python3 -m sglang.bench_one_batch_server --model openai/gpt-oss-120b --base-url http://localhost:30000 --batch-size 32 --input-len 1024 8192 --output-len 512 --show-report
+```
+
+### Reproduce the evaluation result
+
+Install gpt-oss
+
+```bash
+git clone https://github.com/openai/gpt-oss.git
+cd gpt-oss
+pip install -e .
+```
+
+Evaluation Command
+
+```bash
+DATASET=gpqa
+BASE_URL=YOUR_BASE_URL
+OPENAI_API_KEY=dummy python -m gpt_oss.evals \
+    --base-url ${BASE_URL}/v1 \
+    --model dummy \
+    --reasoning-effort low,medium,high \
+    --eval $DATASET \
+    --n-threads 1000
+```
+
+### Reproduce the benchmark result of acceptance length
+> Note: On B200, if top k is 1, set `--attention-backend trtllm_mha`
+```bash
+git clone https://github.com/sgl-project/SpecForge.git
+cd SpecForge/benchmarks
+config_list=(
+    "1,0,0,0"
+    "1,3,1,4"
+    "1,5,4,8"
+)
+python3 bench_model_speedup.py \
+    --model-path openai/gpt-oss-120b \
+    --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
+    --port 20001 \
+    --trust-remote-code \
+    --mem-fraction-static 0.8 \
+    --tp-size 4 \
+    --attention-backend fa3 \
+    --config-list "${config_list[@]}" \
+    --benchmark-list mtbench:80 gsm8k:200 humaneval:200 math500:200 \
+    --output lmsys_gpt-oss-120b_Eagle3_result.jsonl
+
+python3 bench_model_speedup.py \
+    --model-path openai/gpt-oss-120b \
+    --speculative-draft-model-path nvidia/gpt-oss-120b-Eagle3 \
+    --port 20001 \
+    --trust-remote-code \
+    --mem-fraction-static 0.8 \
+    --tp-size 4 \
+    --attention-backend fa3 \
+    --config-list "${config_list[@]}" \
+    --benchmark-list mtbench:80 gsm8k:200 humaneval:200 math500:200 \
+    --output nv_gpt-oss-120b_Eagle3_result.jsonl
+```
+
+### Reproduce the result of speculative decoding speedup
+
+Launch Command
+
+```bash
+# On Hopper:
+# - Tree decoding (topk > 1) and chain decoding (topk = 1) are supported on both FA3 and Triton backends.
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algorithm EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algorithm EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --tp 4
+
+# On Blackwell:
+# - Chain decoding (topk = 1) is supported on TRTLLM-MHA backend. Tree decoding (topk > 1) is in progress, stay tuned!
+# - Both tree decoding (topk > 1) and chain decoding (topk = 1) are supported on the Triton backend.
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --tp 4
+python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --attention-backend triton --tp 4
+```
+
+Benchmark Command
+
+```bash
+config_list=(
+    "1,0,0,0"
+    "1,3,1,4"
+    "1,5,4,8"
+)
+python3 bench_model_speedup.py \
+    --model-path openai/gpt-oss-120b \
+    --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
+    --port 20001 \
+    --trust-remote-code \
+    --mem-fraction-static 0.8 \
+    --tp-size 4 \
+    --attention-backend fa3 \
+    --config-list "${config_list[@]}" \
+    --benchmark-list gsm8k:200 humaneval:200 math500:200 \
+    --output lmsys_gpt-oss-120b_Eagle3_result.jsonl
+```
+
+We can gain the best speedup with the following settings:
+
+- **1.39x** speedup with the `--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4` setting.
+- **1.52x** speedup with the `--speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8` setting.
diff --git a/benchmark/gsm8k/README.md b/benchmark/gsm8k/README.md
new file mode 100644
index 000000000..c110f533c
--- /dev/null
+++ b/benchmark/gsm8k/README.md
@@ -0,0 +1,47 @@
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 200
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-questions 200 --backend vllm
+```
+
+
+### Benchmark lightllm
+```
+# A10G
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
+```
+
+```
+python3 bench_other.py --num-questions 200 --backend lightllm
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+
+### Benchmark lmql
+```
+CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
+```
+
+```
+python3 bench_other.py --num-questions 100 --backend lmql --parallel 2
+```
diff --git a/benchmark/gsm8k/bench_other.py b/benchmark/gsm8k/bench_other.py
new file mode 100644
index 000000000..6dcb9ad7c
--- /dev/null
+++ b/benchmark/gsm8k/bench_other.py
@@ -0,0 +1,151 @@
+import argparse
+import ast
+import asyncio
+import json
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import numpy as np
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_one_example(lines, i, include_answer):
+    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
+    if include_answer:
+        ret += " " + lines[i]["answer"]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def main(args):
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    # Read data
+    url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+    filename = download_and_cache_file(url)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+
+    states = [None] * len(labels)
+
+    # Run requests
+    if args.backend != "lmql":
+        # Use thread pool
+        def get_one_answer(i):
+            answer = call_generate(
+                prompt=few_shot_examples + questions[i],
+                temperature=0,
+                max_tokens=256,
+                stop=["Question", "Assistant:", "<|separator|>"],
+            )
+            states[i] = answer
+
+        tic = time.perf_counter()
+        if args.parallel == 1:
+            for i in tqdm(range(len(questions))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )
+
+    else:
+        # Use asyncio
+        async def batched_call(batch_size):
+            for i in range(0, len(questions), batch_size):
+                tasks = []
+                for q in questions[i : i + batch_size]:
+                    tasks.append(
+                        call_generate(
+                            few_shot_examples + q,
+                            temperature=0,
+                            max_tokens=256,
+                            stop="Question",
+                        )
+                    )
+                rets = await asyncio.gather(*tasks)
+                for j in range(len(rets)):
+                    states[i + j] = rets[j]
+
+        tic = time.perf_counter()
+        asyncio.run(batched_call(batch_size=args.parallel))
+    latency = time.perf_counter() - tic
+
+    preds = []
+    for i in range(len(states)):
+        preds.append(get_answer_value(states[i]))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+
+    # Dump results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py
new file mode 100644
index 000000000..9cdc2cf84
--- /dev/null
+++ b/benchmark/gsm8k/bench_sglang.py
@@ -0,0 +1,148 @@
+import argparse
+import ast
+import json
+import os
+import re
+import time
+
+import numpy as np
+
+from sglang.lang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    dump_bench_raw_result,
+    select_sglang_backend,
+)
+from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_one_example(lines, i, include_answer):
+    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
+    if include_answer:
+        ret += " " + lines[i]["answer"]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def main(args):
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+
+    # Read data
+    data_path = args.data_path
+    url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+    if not os.path.isfile(data_path):
+        data_path = download_and_cache_file(url)
+    lines = list(read_jsonl(data_path))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q} for q in questions]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_gsm8k(s, question):
+        s += few_shot_examples + question
+        s += sgl.gen(
+            "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
+        )
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_gsm8k.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    preds = []
+    for i in range(len(states)):
+        preds.append(get_answer_value(states[i]["answer"]))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    # Dump results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+    dump_bench_raw_result(
+        path=args.raw_result_file,
+        states=states,
+        preds=preds,
+        labels=labels,
+    )
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/hellaswag/README.md b/benchmark/hellaswag/README.md
new file mode 100644
index 000000000..cb7e65366
--- /dev/null
+++ b/benchmark/hellaswag/README.md
@@ -0,0 +1,47 @@
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 200
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-questions 200 --backend vllm
+```
+
+
+### Benchmark lightllm
+```
+# A10G
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
+```
+
+```
+python3 bench_other.py --num-questions 200 --backend lightllm
+```
+
+
+### Benchmark guidance
+```
+CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+
+### Benchmark lmql
+```
+lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
+```
+
+```
+python3 bench_other.py --num-questions 200 --backend lmql --port 23000 --parallel 1
+```
diff --git a/benchmark/hellaswag/bench_other.py b/benchmark/hellaswag/bench_other.py
new file mode 100644
index 000000000..cde0794bb
--- /dev/null
+++ b/benchmark/hellaswag/bench_other.py
@@ -0,0 +1,118 @@
+import argparse
+import asyncio
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import numpy as np
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
+from sglang.utils import download_and_cache_file, read_jsonl
+
+
+def get_one_example(lines, i, include_answer):
+    ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
+    if include_answer:
+        ret += lines[i]["endings"][lines[i]["label"]]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def main(args):
+    # Select backend
+    call_select = get_call_select(args)
+
+    # Read data
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    filename = download_and_cache_file(url)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    choices = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        choices.append(lines[i]["endings"])
+        labels.append(lines[i]["label"])
+
+    preds = [None] * len(labels)
+
+    # Run requests
+    if args.backend != "lmql":
+        # Use thread pool
+        def get_one_answer(i):
+            preds[i] = call_select(
+                context=few_shot_examples + questions[i], choices=choices[i]
+            )
+
+        tic = time.perf_counter()
+        if args.parallel == 1:
+            for i in tqdm(range(len(questions))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )
+    else:
+        # Use asyncio
+        async def batched_call(batch_size):
+            for i in range(0, len(questions), batch_size):
+                tasks = []
+                for q, c in zip(
+                    questions[i : i + batch_size], choices[i : i + batch_size]
+                ):
+                    tasks.append(call_select(context=few_shot_examples + q, choices=c))
+                rets = await asyncio.gather(*tasks)
+                for j in range(len(rets)):
+                    preds[i + j] = rets[j]
+
+        tic = time.perf_counter()
+        asyncio.run(batched_call(batch_size=args.parallel))
+
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    print(f"Latency: {latency:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "hellaswag",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=20)
+    parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/hellaswag/bench_sglang.py b/benchmark/hellaswag/bench_sglang.py
new file mode 100644
index 000000000..2adce99b8
--- /dev/null
+++ b/benchmark/hellaswag/bench_sglang.py
@@ -0,0 +1,109 @@
+import argparse
+import json
+import os
+import time
+
+import numpy as np
+
+from sglang.lang.api import set_default_backend
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import download_and_cache_file, read_jsonl
+
+
+def get_one_example(lines, i, include_answer):
+    ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
+    if include_answer:
+        ret += lines[i]["endings"][lines[i]["label"]]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def main(args):
+    # Select backend
+    set_default_backend(select_sglang_backend(args))
+
+    # Read data
+    data_path = args.data_path
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    if not os.path.isfile(data_path):
+        data_path = download_and_cache_file(url)
+    lines = list(read_jsonl(data_path))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    choices = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        choices.append(lines[i]["endings"])
+        labels.append(lines[i]["label"])
+    arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_hellaswag(s, question, choices):
+        s += few_shot_examples + question
+        s += sgl.select("answer", choices=choices)
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.perf_counter()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    print(f"Latency: {latency:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "hellaswag",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=20)
+    parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/hf3fs/bench.sh b/benchmark/hf3fs/bench.sh
new file mode 100644
index 000000000..049116b89
--- /dev/null
+++ b/benchmark/hf3fs/bench.sh
@@ -0,0 +1,59 @@
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
+python3 benchmark/hf3fs/bench_client.py
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
+SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs.json \
+python3 benchmark/hf3fs/bench_storage.py
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
+export SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs.json
+echo '{"file_path_prefix": "/data/hf3fs-test-0", "file_size": 1099511627776, "numjobs": 16, "entries": 8}' > \
+${SGLANG_HICACHE_HF3FS_CONFIG_PATH}
+python3 benchmark/hf3fs/bench_zerocopy.py
+
+####################################################################################################
+
+rm -rf nohup.out && \
+nohup python3 -m sglang.launch_server \
+    --model-path /code/models/Qwen3-32B/ \
+    --host 0.0.0.0 --port 33301 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 0 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs &
+
+rm -rf bench_multiturn.out && \
+nohup python3 benchmark/hicache/bench_multiturn.py \
+    --model-path /code/models/Qwen3-32B \
+    --dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --port 33301 \
+    --request-length 2048 --num-clients 512 --num-rounds 3 --max-parallel 8 \
+    > bench_multiturn.out &
+
+####################################################################################################
+
+rm -rf nohup.out && \
+nohup python3 -m sglang.launch_server \
+    --model-path /code/models/DeepSeek-R1/ \
+    --tp 16 --nnodes 2 --node-rank 0 \
+    --dist-init-addr 10.74.249.153:5000 \
+    --host 0.0.0.0 --port 33301 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 60 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs &
+
+rm -rf bench_multiturn.out && \
+nohup python3 benchmark/hicache/bench_multiturn.py \
+    --model-path /code/models/Qwen3-32B \
+    --dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --port 33301 \
+    --request-length 2048 --num-clients 1024 --num-rounds 3 --max-parallel 8 \
+    > bench_multiturn.out &
+
+####################################################################################################
+
+ps aux | grep "sglang.launch_server" | grep -v grep | awk '{print $2}' | xargs kill -9
+ps aux | grep "bench_multiturn.py" | grep -v grep | awk '{print $2}' | xargs kill -9
diff --git a/benchmark/hf3fs/bench_client.py b/benchmark/hf3fs/bench_client.py
new file mode 100644
index 000000000..33c502575
--- /dev/null
+++ b/benchmark/hf3fs/bench_client.py
@@ -0,0 +1,162 @@
+import concurrent.futures
+import logging
+import random
+import time
+from typing import List
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.mem_cache.storage.hf3fs.client_hf3fs import Hf3fsClient
+
+
+def print_stats(x: List[int]):
+    x = sorted(x)
+    lenx = len(x)
+    print(
+        f"mean = {sum(x)/len(x):.2f}, "
+        f"min = {min(x):.2f}, "
+        f"p25 = {x[int(lenx*0.25)]:.2f}, "
+        f"p50 = {x[int(lenx*0.5)]:.2f}, "
+        f"p75 = {x[int(lenx*0.75)]:.2f}, "
+        f"max = {max(x):.2f}"
+    )
+
+
+def test():
+    # /path/to/hf3fs
+    file_path = "/data/bench.bin"
+    file_size = 1 << 40
+    bytes_per_page = 16 << 20
+    entries = 32
+    file_ops = Hf3fsClient(file_path, file_size, bytes_per_page, entries)
+
+    print("test batch_read / batch_write")
+    num_pages = 128
+    dtype = torch.bfloat16
+    numel = bytes_per_page // dtype.itemsize
+    offsets = list(range(file_size // bytes_per_page))
+    random.shuffle(offsets)
+    offsets = offsets[:num_pages]
+    offsets = [i * bytes_per_page for i in offsets]
+    tensor_writes = [
+        torch.randn(numel, dtype=dtype)
+        for _ in tqdm(range(num_pages), desc="prepare tensor")
+    ]
+    for i in tqdm(range(0, num_pages, file_ops.entries), desc="batch_write"):
+        results = file_ops.batch_write(
+            offsets[i : i + file_ops.entries], tensor_writes[i : i + file_ops.entries]
+        )
+        assert all([result == numel * dtype.itemsize for result in results])
+    tensor_reads = [
+        torch.empty(numel, dtype=dtype)
+        for _ in tqdm(range(num_pages), desc="prepare tensor")
+    ]
+    for i in tqdm(range(0, num_pages, file_ops.entries), desc="batch_read"):
+        results = file_ops.batch_read(
+            offsets[i : i + file_ops.entries], tensor_reads[i : i + file_ops.entries]
+        )
+        assert all([result == numel * dtype.itemsize for result in results])
+    assert all([torch.allclose(r, w) for r, w in zip(tensor_reads, tensor_writes)])
+
+    file_ops.close()
+    print("test done")
+
+
+def bench():
+    file_path = "/data/bench.bin"
+    file_size = 1 << 40
+    bytes_per_page = 16 << 20
+    entries = 8
+    numjobs = 16
+
+    dtype = torch.bfloat16
+    numel = bytes_per_page // dtype.itemsize
+
+    file_ops = [
+        Hf3fsClient(file_path, file_size, bytes_per_page, entries)
+        for _ in range(numjobs)
+    ]
+
+    num_page = entries
+
+    offsets = list(range(file_size // bytes_per_page))
+    tensors_write = [torch.randn(numel, dtype=dtype)] * num_page
+    tensors_read = [torch.empty(numel, dtype=dtype)] * num_page
+    random.shuffle(offsets)
+
+    warmup = 50
+    iteration = 100
+
+    executor = concurrent.futures.ThreadPoolExecutor(max_workers=numjobs)
+
+    w_bw = []
+    w_size = num_page * numjobs * bytes_per_page / (1 << 30)
+    for i in tqdm(range(warmup + iteration), desc="Benchmarking write (GB/s)"):
+        _offsets = [
+            [
+                offset * bytes_per_page
+                for offset in offsets[
+                    (i * numjobs + j) * num_page : (i * numjobs + j + 1) * num_page
+                ]
+            ]
+            for j in range(numjobs)
+        ]
+        tik = time.perf_counter()
+        futures = [
+            executor.submit(file_ops[j].batch_write, offset, tensors_write)
+            for j, offset in enumerate(_offsets)
+        ]
+        results = [future.result() for future in futures]
+        tok = time.perf_counter()
+        if i < warmup:
+            continue
+        w_bw.append(w_size / (tok - tik))
+        results = [
+            _result == bytes_per_page for result in results for _result in result
+        ]
+        assert all(results)
+    print_stats(w_bw)
+
+    r_bw = []
+    r_size = w_size
+    for i in tqdm(range(warmup + iteration), desc="Benchmarking read (GB/s)"):
+        _offsets = [
+            [
+                offset * bytes_per_page
+                for offset in offsets[
+                    (i * numjobs + j) * num_page : (i * numjobs + j + 1) * num_page
+                ]
+            ]
+            for j in range(numjobs)
+        ]
+        tik = time.perf_counter()
+        futures = [
+            executor.submit(file_ops[j].batch_read, offset, tensors_read)
+            for j, offset in enumerate(_offsets)
+        ]
+        results = [future.result() for future in futures]
+        tok = time.perf_counter()
+        if i < warmup:
+            continue
+        r_bw.append(r_size / (tok - tik))
+        results = [
+            _result == bytes_per_page for result in results for _result in result
+        ]
+        assert all(results)
+    print_stats(r_bw)
+
+    executor.shutdown(wait=True)
+    for _file_ops in file_ops:
+        _file_ops.close()
+    print("bench done")
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+    test()
+    bench()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/hf3fs/bench_storage.py b/benchmark/hf3fs/bench_storage.py
new file mode 100644
index 000000000..f0ce171bf
--- /dev/null
+++ b/benchmark/hf3fs/bench_storage.py
@@ -0,0 +1,258 @@
+import json
+import logging
+import os
+import random
+import time
+from typing import List
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.mem_cache.storage.hf3fs.mini_3fs_metadata_server import (
+    Hf3fsLocalMetadataClient,
+)
+from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import HiCacheHF3FS
+
+
+def print_stats(x: List[int]):
+    x = sorted(x)
+    lenx = len(x)
+    print(
+        f"mean = {sum(x)/len(x):.2f}, "
+        f"min = {min(x):.2f}, "
+        f"p25 = {x[int(lenx*0.25)]:.2f}, "
+        f"p50 = {x[int(lenx*0.5)]:.2f}, "
+        f"p75 = {x[int(lenx*0.75)]:.2f}, "
+        f"max = {max(x):.2f}"
+    )
+
+
+def test():
+    # Qwen3-32B
+    layer_num = 64
+    head_num, head_dim = 8, 128
+    kv_lora_rank, qk_rope_head_dim = 0, 0
+    store_dtype = torch.bfloat16
+    tokens_per_page = 64
+
+    file_path_prefix = "/data/test"
+    file_size = 128 << 20
+    numjobs = 16
+    bytes_per_page = 16 << 20
+    entries = 2
+    dtype = store_dtype
+
+    config_path = os.getenv(HiCacheHF3FS.default_env_var)
+    assert config_path
+    try:
+        with open(config_path, "w") as f:
+            json.dump(
+                {
+                    "file_path_prefix": file_path_prefix,
+                    "file_size": file_size,
+                    "numjobs": numjobs,
+                    "entries": entries,
+                },
+                f,
+            )
+    except Exception as e:
+        raise RuntimeError(f"Failed to dump config to {config_path}: {str(e)}")
+    hicache_hf3fs = HiCacheHF3FS.from_env_config(bytes_per_page, dtype)
+
+    numel = 2 * tokens_per_page * layer_num * head_num * head_dim
+    assert numel * dtype.itemsize == bytes_per_page
+
+    num_pages = 10
+    tensors = {}
+    for i in range(num_pages):
+        k = f"key_{i}"
+        v = torch.randn((numel,)).to(dtype=dtype)
+        ok = hicache_hf3fs.set(k, v)
+        if i < (file_size // bytes_per_page):
+            assert ok, f"Failed to insert {k}"
+        else:
+            assert not ok
+        tensors[k] = v
+    assert hicache_hf3fs.get("key_8") is None
+    assert hicache_hf3fs.get("key_9") is None
+
+    start = 0
+    for i in range(start, start + hicache_hf3fs.num_pages):
+        k = f"key_{i}"
+        assert hicache_hf3fs.exists(k)
+        out = hicache_hf3fs.get(k)
+        assert out is not None
+        v = tensors[k]
+        assert torch.allclose(v, out, atol=1e-3), f"Tensor mismatch for {k}"
+
+    assert not hicache_hf3fs.exists("not_exists")
+
+    hicache_hf3fs.delete("key_7")
+    v2 = torch.randn((numel,)).to(dtype=dtype)
+    assert hicache_hf3fs.set("key_new", v2)
+    assert torch.allclose(hicache_hf3fs.get("key_new"), v2, atol=1e-3)
+
+    hicache_hf3fs.clear()
+    assert (
+        len(hicache_hf3fs.metadata_client.rank_metadata.free_pages)
+        == hicache_hf3fs.metadata_client.rank_metadata.num_pages
+    )
+
+    # batch
+    num_pages = 10
+    tensors = {}
+    keys = []
+    values = []
+    for i in range(num_pages):
+        k = f"key_{i}"
+        keys.append(k)
+        v = torch.randn((numel,)).to(dtype=dtype)
+        values.append(v)
+
+    ok = hicache_hf3fs.batch_set(keys, values)
+    assert not ok
+    assert hicache_hf3fs.get("key_8") is None
+    assert hicache_hf3fs.get("key_9") is None
+
+    results = hicache_hf3fs.batch_get(keys[: hicache_hf3fs.num_pages])
+    for result, key, value in zip(
+        results, keys[: hicache_hf3fs.num_pages], values[: hicache_hf3fs.num_pages]
+    ):
+        assert torch.allclose(value, result, atol=1e-3), f"Tensor mismatch for {key}"
+
+    hicache_hf3fs.close()
+    os.remove(hicache_hf3fs.file_path)
+
+    print("All test cases passed.")
+
+
+def bench():
+    # Qwen3-32B
+    layer_num = 64
+    head_num, head_dim = 8, 128
+    kv_lora_rank, qk_rope_head_dim = 0, 0
+    store_dtype = torch.bfloat16
+    tokens_per_page = 64
+
+    file_path = "/data/test.bin"
+    file_size = 1 << 40
+    numjobs = 16
+    bytes_per_page = 16 << 20
+    entries = 8
+    dtype = store_dtype
+    hicache_hf3fs = HiCacheHF3FS(
+        rank=0,
+        file_path=file_path,
+        file_size=file_size,
+        numjobs=numjobs,
+        bytes_per_page=bytes_per_page,
+        entries=entries,
+        dtype=dtype,
+        metadata_client=Hf3fsLocalMetadataClient(),
+    )
+
+    numel = 2 * tokens_per_page * layer_num * head_num * head_dim
+    assert numel * dtype.itemsize == bytes_per_page
+
+    num_page = 128
+    values = [torch.randn((numel,)).to(dtype=dtype) for _ in tqdm(range(num_page))]
+
+    warmup = 50
+    iteration = 100
+
+    w_bw = []
+    w_size = num_page * bytes_per_page / (1 << 30)
+    for i in tqdm(range(warmup + iteration), desc="Benchmarking write (GB/s)"):
+        keys = [f"{j}" for j in range(i * num_page, (i + 1) * num_page)]
+        tik = time.perf_counter()
+        ok = hicache_hf3fs.batch_set(keys, values)
+        tok = time.perf_counter()
+        if i < warmup:
+            continue
+        w_bw.append(w_size / (tok - tik))
+        assert ok
+    print_stats(w_bw)
+
+    r_bw = []
+    r_size = num_page * bytes_per_page / (1 << 30)
+    for i in tqdm(range(warmup + iteration), desc="Benchmarking read (GB/s)"):
+        keys = random.sample(
+            list(hicache_hf3fs.metadata_client.rank_metadata.key_to_index.keys()),
+            num_page,
+        )
+        tik = time.perf_counter()
+        results = hicache_hf3fs.batch_get(keys)
+        tok = time.perf_counter()
+        if i < warmup:
+            continue
+        r_bw.append(r_size / (tok - tik))
+        assert all([r is not None for r in results])
+    print_stats(r_bw)
+
+    hicache_hf3fs.close()
+
+
+def allclose():
+    # Qwen3-32B
+    layer_num = 64
+    head_num, head_dim = 8, 128
+    kv_lora_rank, qk_rope_head_dim = 0, 0
+    store_dtype = torch.bfloat16
+    tokens_per_page = 64
+
+    file_path = "/data/test.bin"
+    file_size = 1 << 40
+    numjobs = 16
+    bytes_per_page = 16 << 20
+    entries = 8
+    dtype = store_dtype
+    hicache_hf3fs = HiCacheHF3FS(
+        rank=0,
+        file_path=file_path,
+        file_size=file_size,
+        numjobs=numjobs,
+        bytes_per_page=bytes_per_page,
+        entries=entries,
+        dtype=dtype,
+        metadata_client=Hf3fsLocalMetadataClient(),
+    )
+
+    numel = 2 * tokens_per_page * layer_num * head_num * head_dim
+    assert numel * dtype.itemsize == bytes_per_page
+
+    num_page = 128
+    values = [torch.randn((numel,)).to(dtype=dtype) for _ in tqdm(range(num_page))]
+
+    iteration = 100
+
+    for i in tqdm(range(iteration), desc="Benchmarking write (GB/s)"):
+        keys = [f"{j}" for j in range(i * num_page, (i + 1) * num_page)]
+        ok = hicache_hf3fs.batch_set(keys, values)
+        assert ok
+
+    read_keys, read_results = [], []
+    for i in tqdm(range(iteration), desc="Benchmarking read (GB/s)"):
+        keys = random.sample(
+            list(hicache_hf3fs.metadata_client.rank_metadata.key_to_index.keys()),
+            num_page,
+        )
+        results = hicache_hf3fs.batch_get(keys)
+        read_keys.extend(keys)
+        read_results.extend(results)
+        assert all([r is not None for r in results])
+
+    for key, result in tqdm(zip(read_keys, read_results)):
+        assert torch.allclose(values[int(key) % num_page], result, atol=1e-3)
+
+    hicache_hf3fs.close()
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+    test()
+    bench()
+    allclose()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/hf3fs/bench_zerocopy.py b/benchmark/hf3fs/bench_zerocopy.py
new file mode 100644
index 000000000..bfa7bff0e
--- /dev/null
+++ b/benchmark/hf3fs/bench_zerocopy.py
@@ -0,0 +1,140 @@
+import threading
+import time
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.distributed import (
+    get_world_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from sglang.srt.managers.cache_controller import (
+    HiCacheController,
+    PrefetchOperation,
+    StorageOperation,
+)
+from sglang.srt.mem_cache.allocator import TokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool
+from sglang.srt.mem_cache.memory_pool_host import MHATokenToKVPoolHost
+
+init_distributed_environment(
+    world_size=1,
+    rank=0,
+    distributed_init_method="tcp://127.0.0.1:23456",
+    local_rank=0,
+    backend="gloo",
+)
+
+initialize_model_parallel(
+    tensor_model_parallel_size=1,
+    pipeline_model_parallel_size=1,
+)
+
+group = get_world_group().cpu_group
+
+max_total_num_tokens = 524288
+page_size = 64
+kv_cache_dtype = torch.bfloat16
+layer_num = 64
+head_num, head_dim = 8, 128
+device = "cuda"
+hicache_ratio = 2
+hicache_size = 0
+hicache_mem_layout = "page_first"
+# hicache_mem_layout = "layer_first"
+hicache_write_policy = "write_through"
+hicache_io_backend = "kernel"
+hicache_storage_backend = "hf3fs"
+prefetch_threshold = 256
+
+op_size = 1024
+op_num = 16
+
+token_to_kv_pool = MHATokenToKVPool(
+    max_total_num_tokens,
+    page_size=page_size,
+    dtype=kv_cache_dtype,
+    head_num=head_num,
+    head_dim=head_dim,
+    layer_num=layer_num,
+    device=device,
+    enable_memory_saver=True,
+)
+
+token_to_kv_pool_allocator = TokenToKVPoolAllocator(
+    max_total_num_tokens,
+    dtype=kv_cache_dtype,
+    device=device,
+    kvcache=token_to_kv_pool,
+    need_sort=False,
+)
+
+kv_cache = token_to_kv_pool_allocator.get_kvcache()
+token_to_kv_pool_host = MHATokenToKVPoolHost(
+    kv_cache,
+    hicache_ratio,
+    hicache_size,
+    page_size,
+    hicache_mem_layout,
+)
+
+load_cache_event = threading.Event()
+cache_controller = HiCacheController(
+    token_to_kv_pool_allocator,
+    token_to_kv_pool_host,
+    page_size,
+    group,
+    load_cache_event=load_cache_event,
+    write_policy=hicache_write_policy,
+    io_backend=hicache_io_backend,
+    storage_backend=hicache_storage_backend,
+    prefetch_threshold=prefetch_threshold,
+)
+
+operations = [
+    StorageOperation(
+        torch.tensor(list(range(i, i + op_size))),
+        list(range(i, i + op_size)),
+        hash_value=[f"{j}" for j in range(i, i + op_size, page_size)],
+    )
+    for i in tqdm(range(0, op_num * op_size, op_size))
+]
+
+tik = time.monotonic()
+if hicache_mem_layout == "page_first":
+    for operation in operations:
+        cache_controller.zerocopy_page_backup(operation, batch_size=128)
+elif hicache_mem_layout == "layer_first":
+    for operation in operations:
+        cache_controller.generic_page_backup(operation, batch_size=128)
+tok = time.monotonic()
+print(f"{tok-tik:.6f} s")
+
+operations = [
+    PrefetchOperation(
+        f"{i}",
+        torch.tensor(list(range(i, i + op_size))),
+        list(range(i, i + op_size)),
+        f"{i}",
+    )
+    for i in tqdm(range(0, op_num * op_size, op_size))
+]
+
+for operation in operations:
+    operation.hash_value = [
+        f"{j}"
+        for j in range(
+            int(operation.last_hash), int(operation.last_hash) + op_size, page_size
+        )
+    ]
+
+tik = time.monotonic()
+if hicache_mem_layout == "page_first":
+    for operation in operations:
+        cache_controller.zerocopy_page_transfer(operation, batch_size=128)
+elif hicache_mem_layout == "layer_first":
+    for operation in operations:
+        cache_controller.generic_page_transfer(operation, batch_size=128)
+tok = time.monotonic()
+print(f"{tok-tik:.6f} s")
diff --git a/benchmark/hicache/README.md b/benchmark/hicache/README.md
new file mode 100644
index 000000000..f2aa6ff05
--- /dev/null
+++ b/benchmark/hicache/README.md
@@ -0,0 +1,91 @@
+## Run synthetic multi-turn benchmark
+
+```
+# SGLang server with radix cache disabled
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000 --disable-radix-cache
+
+# SGLang server with radix cache on and first-come-first-serve policy
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000 --schedule-policy fcfs
+
+# The default SGLang server with radix cache on and long-prefix-match policy
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000
+
+# SGLang server with hierarchical radix cache enabled
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-14B-Instruct --port 30000 --enable-hierarchical-cache
+
+```
+
+```
+python bench_multiturn.py --model-path Qwen/Qwen2.5-14B-Instruct
+```
+
+Note: The performance gain of hierarchical caching depends on the ratio of reusable tokens to GPU memory capacity. The more tokens to be reused, the larger the model, and the more constrained the GPU memory size, the greater the benefit one can expect from hierarchical caching.
+
+
+# Benchmark with more datasets
+## Download Dataset
+```bash
+./download.sh {sharegpt|ultragpt|loogle|nextqa|all}
+```
+This script will automatically download the required dataset to the current working directory
+
+## Multiturn Benchmark
+### Supported Datasets
+- sharegpt
+- ultrachat
+- loogle
+### Example Usage:
+```bash
+python3 bench_serving.py --model mistralai/Mistral-7B-Instruct-v0.3 --backend sglang \
+--dataset-path longdep_qa.json --dataset-name loogle --request-rate 10 --num-prompts 10  \
+--port 8001 --enable-multiturn --disable-shuffle
+```
+This uses `mistralai/Mistral-7B-Instruct-v0.3` model with `sglang` as backend. The dataset
+is `longdep_qa.json`. We send `10 conversations` with `10 req/s` to port 8001. We enable
+multiturn chat without shuffling the order of conversations (i.e. following the original
+order in the dataset file).
+
+### Note:
+The requests of multiple conversations are sent in a round robin fashion.
+For example, if we have 3 conversations A, B, C whose rounds are `[2, 3, 4]` correspondingly,
+multiturn chat will send the requests to the backend in the following order: `[A1, B1, C1, A2, B2, C2, B3, C3, C4]`
+This has implications on the cache reuse patterns: the cache reuse distance is the largest
+under this request pattern (which means a prefix-aware local scheduler in the backend can
+yield the most benefit compared to a FIFO scheduler)
+
+## Shared Prefix Benchmark
+### Supported Datasets
+- loogle
+### Example Usage:
+```bash
+python3 bench_serving.py --model mistralai/Mistral-7B-Instruct-v0.3 --backend sglang \
+--dataset-path longdep_qa.json --dataset-name loogle --request-rate 10 --num-prompts 10  \
+--port 8001 --enable-shared-prefix --disable-shuffle
+```
+### Note:
+Shared Prefix benchmark sends the questions for the same prompt together. For example,
+if we have 3 shared prefix A, B, C, which have [2, 3, 4] questions correspondingly,
+the shared prefix benchmark will send the requests to the
+backend in the following order: `[A+Q1, A+Q2, B+Q1, B+Q2, B+Q3, C+Q1, C+Q2, C+Q3]`.
+
+
+## Multi Modality Benchmark (WIP)
+### Supported Datasets:
+- nextqa
+### Example Usage:
+```bash
+Server:
+python3 -m sglang.launch_server --model-path lmms-lab/LLaVA-NeXT-Video-7B  --tp 2 --dp 1 --port 8001 \
+--host 0.0.0.0 --mem-fraction-static 0.9 --tokenizer-path llava-hf/llava-1.5-7b-hf \
+--json-model-override-args "{\"architectures\": [\"LlavaVidForCausalLM\"], \"model_type\":\"llava\", \"mm_spatial_pool_stride\":2}"
+
+Client:
+python3 bench_serving.py --model lmms-lab/LLaVA-NeXT-Video-7B --backend sglang  --dataset-path \
+NExTVideo  --dataset-name nextqa --request-rate 10 --num-prompts 1 --disable-shuffle --port 8001 \ --enable-multiturn --max-frames 16 --tokenizer llava-hf/llava-1.5-7b-hf --fixed-output-len 2048
+```
+Note: for the server args, `tokenizer-path`, overriding architecture are necessary.
+
+## Supported Backend
+- sglang (oai)
+- vllm (oai)
+- lmdeploy (oai)
diff --git a/benchmark/hicache/bench_long_context.py b/benchmark/hicache/bench_long_context.py
new file mode 100644
index 000000000..eed0ae5dc
--- /dev/null
+++ b/benchmark/hicache/bench_long_context.py
@@ -0,0 +1,101 @@
+import json
+import queue
+import time
+
+import requests
+from bench_multiturn import (
+    ReadyQueue,
+    WorkloadGenerator,
+    gen_payload,
+    log_to_jsonl_file,
+    parse_args,
+)
+from tqdm.asyncio import tqdm
+
+from sglang.bench_serving import get_tokenizer
+
+
+class ContextWorkloadGenerator(WorkloadGenerator):
+    def __init__(self, args):
+        # Construct the base URL for requests
+        self.baseurl = f"http://{args.host}:{args.port}/"
+        self.url = self.baseurl + "generate"
+
+        self.tokenizer = get_tokenizer(args.model_path)
+        self.distribution = args.distribution
+        self.request_rate = args.request_rate
+        self.start_time = None
+        self.finished_time = None
+
+        self.sent_requests = 0
+        self.completed_requests = 0
+
+        self.dataset = json.load(open(args.dataset_path))
+        num_requests = min(args.num_clients, len(self.dataset["queries"]))
+
+        init_requests = []
+        for i in range(num_requests):
+            context_id = self.dataset["queries"][i]["context"]
+            init_requests.append(
+                (
+                    i,
+                    gen_payload(
+                        self.dataset["contexts"][context_id]
+                        + self.dataset["queries"][i]["question"],
+                        len(
+                            self.tokenizer(
+                                self.dataset["queries"][i]["reference_answer"]
+                            )["input_ids"]
+                        ),
+                    ),
+                )
+            )
+        self.ready_queue = ReadyQueue(init_requests=init_requests)
+
+        self.response_queue = queue.Queue()
+        self.pbar = tqdm(total=num_requests)
+        self.performance_metrics = {
+            "ttft": [],
+            "latency": [],
+            "itl": [],
+            "prompt_len": [],
+            "cached_tokens": [],
+            "generated_len": [],
+        }
+
+        self.max_parallel = args.max_parallel
+        self.logfile = args.log_file
+
+    def response_handler(self):
+        while True:
+            try:
+                client_id, response = self.response_queue.get(
+                    timeout=10
+                )  # Block until response is available
+                if not response.success:
+                    raise ValueError(f"Request failed with error: {response.error}")
+                self.performance_metrics["ttft"].append(response.ttft)
+                self.performance_metrics["itl"].extend(response.itl)
+                self.performance_metrics["latency"].append(response.latency)
+                self.performance_metrics["prompt_len"].append(response.prompt_len)
+                self.performance_metrics["cached_tokens"].append(response.cached_tokens)
+                self.performance_metrics["generated_len"].append(response.generated_len)
+                self.completed_requests += 1
+
+            except queue.Empty:
+                if self.pbar.n == self.pbar.total:
+                    break
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    args.num_rounds = 1
+    args.max_parallel = 24
+    flush_cache_url = f"http://{args.host}:{args.port}/flush_cache"
+
+    for request_rate in [24, 16, 12, 8, 4, 2, 1]:
+        args.request_rate = request_rate
+        requests.post(flush_cache_url)
+        time.sleep(1)
+        performance_data = ContextWorkloadGenerator(args).run()
+        log_to_jsonl_file(performance_data, args.log_file, args.tag)
diff --git a/benchmark/hicache/bench_mix.py b/benchmark/hicache/bench_mix.py
new file mode 100644
index 000000000..cfd25bc40
--- /dev/null
+++ b/benchmark/hicache/bench_mix.py
@@ -0,0 +1,567 @@
+import argparse
+import asyncio
+import json
+import logging
+import os
+import queue
+import random
+import threading
+import time
+from dataclasses import dataclass
+from functools import wraps
+
+import aiohttp
+
+from sglang.bench_serving import (
+    RequestFuncOutput,
+    get_tokenizer,
+    remove_prefix,
+    sample_random_requests,
+)
+
+# Set up logger
+logger = logging.getLogger(__name__)
+
+# Set up JSONL file for debug logging
+debug_log_file = None
+# Create a lock for thread-safe debug log writing
+debug_log_lock = threading.Lock()
+
+
+def write_debug_log(data):
+    global debug_log_file
+
+    """Write debug information to a JSONL file"""
+    if debug_log_file is None:
+        return
+
+    # Acquire lock for thread-safe writing
+    with debug_log_lock:
+        # Write as JSONL (JSON Line format)
+        debug_log_file.write(json.dumps(data) + "\n")
+        debug_log_file.flush()
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Script to benchmark concurrent requests to a server."
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="/data/models/Qwen3-0.6B",
+        help="model path compatible with Hugging Face Transformers",
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default="/data/models/ShareGPT_V3_unfiltered_cleaned_split/ShareGPT_V3_unfiltered_cleaned_split.json",
+        help="local dataset to sample tokens from",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Server hostname or IP (default: localhost)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=30000,
+        help="Server port (default: 30000)",
+    )
+    parser.add_argument(
+        "--duration",
+        type=int,
+        default=600,
+        help="Duration to run the benchmark in seconds (default: 300 seconds)",
+    )
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        default="info",
+        choices=["debug", "info"],
+        help="Set the logging level (default: info)",
+    )
+    parser.add_argument(
+        "--debug-log-file",
+        type=str,
+        default="debug.log.jsonl",
+        help="File to write debug logs in JSONL format",
+    )
+    return parser.parse_args()
+
+
+def load_config():
+    config_path = os.getenv("CONFIG_PATH")
+    if not config_path:
+        raise ValueError("Environment variable 'CONFIG_PATH' is not set.")
+
+    with open(config_path, "r") as f:
+        config = json.load(f)
+
+    required_keys = [
+        "num_rounds",
+        "num_clients",
+        "round_ratios",
+        "mean_new_tokens_per_round",
+        "mean_return_tokens_per_round",
+        "mean_inter_round_interval",
+    ]
+
+    for key in required_keys:
+        if key not in config:
+            raise KeyError(f"Missing required configuration key: {key}")
+
+    num_rounds = config["num_rounds"]
+    assert len(config["round_ratios"]) == num_rounds
+    assert len(config["mean_new_tokens_per_round"]) == num_rounds
+    assert len(config["mean_return_tokens_per_round"]) == num_rounds
+    assert len(config["mean_inter_round_interval"]) == num_rounds
+
+    print(config)
+
+    return config
+
+
+@dataclass
+class UserData:
+    user_id: int
+    current_round: int
+    total_rounds: int
+    prompt: str
+    return_tokens: int
+    start: int
+
+
+def synchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.lock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+class UserGenerator:
+    def __init__(self, config, model_path, dataset_path):
+        self.tokenizer_path = model_path
+        self.tokenizer = get_tokenizer(self.tokenizer_path)
+        self.dataset_path = dataset_path
+
+        self.user_id = 0
+        self.lock = threading.Lock()
+
+        self.num_rounds = config["num_rounds"]
+
+        self.cumulative_ratios = [
+            sum(config["round_ratios"][: i + 1])
+            for i in range(len(config["round_ratios"]))
+        ]
+        self.mean_new_tokens_per_round = config["mean_new_tokens_per_round"]
+        self.mean_return_tokens_per_round = config["mean_return_tokens_per_round"]
+        self.mean_inter_round_interval = config["mean_inter_round_interval"]
+
+        self.sigma = 100
+        self.range_ratio = 0.8
+        assert self.range_ratio <= 1
+
+        self.candidate_inputs = [
+            [
+                r
+                for r in sample_random_requests(
+                    input_len=(
+                        self.mean_new_tokens_per_round[i] * (2 - self.range_ratio)
+                    ),
+                    output_len=(
+                        self.mean_return_tokens_per_round[i] * (2 - self.range_ratio)
+                    ),
+                    num_prompts=config["num_clients"],
+                    range_ratio=self.range_ratio / (2 - self.range_ratio),
+                    tokenizer=self.tokenizer,
+                    dataset_path=self.dataset_path,
+                    random_sample=False,
+                )
+            ]
+            for i in range(self.num_rounds)
+        ]
+
+        self.multiturn_queue = []
+
+        self.user_stats = [0 for _ in range(self.num_rounds)]
+        self.input_stats = [[0, 0] for _ in range(self.num_rounds)]
+        self.output_stats = [[0, 0] for _ in range(self.num_rounds)]
+
+    def gen(self):
+        user_id = self.user_id
+        self.user_id += 1
+
+        rand_ratio = random.randint(0, self.cumulative_ratios[-1])
+        i = len(self.cumulative_ratios)
+        for idx, cumulative_ratio in enumerate(self.cumulative_ratios):
+            if rand_ratio >= cumulative_ratio:
+                continue
+            else:
+                i = idx + 1
+                break
+        total_rounds = i
+        current_round = 0
+
+        candidate_input = random.sample(self.candidate_inputs[current_round], 1)[0]
+        self.input_stats[0][0] += candidate_input.prompt_len
+        self.input_stats[0][1] += 1
+        prompt = f"{user_id} " + candidate_input.prompt
+        return_tokens = int(
+            random.gauss(self.mean_return_tokens_per_round[current_round], self.sigma)
+        )
+        if return_tokens <= 0:
+            return_tokens = self.mean_return_tokens_per_round[current_round]
+        start = 0
+
+        user_data = UserData(
+            user_id, current_round, total_rounds, prompt, return_tokens, start
+        )
+
+        self.user_stats[total_rounds - 1] += 1
+
+        return user_data
+
+    @synchronized()
+    def push(self, user_data, generated_text, len_itl):
+        self.output_stats[user_data.current_round][0] += len_itl + 1
+        self.output_stats[user_data.current_round][1] += 1
+        user_data.current_round += 1
+        if user_data.current_round >= user_data.total_rounds:
+            return
+
+        candidate_input = random.sample(
+            self.candidate_inputs[user_data.current_round], 1
+        )[0]
+        self.input_stats[user_data.current_round][0] += candidate_input.prompt_len
+        self.input_stats[user_data.current_round][1] += 1
+        user_data.prompt += generated_text + candidate_input.prompt
+        user_data.return_tokens = int(
+            random.gauss(
+                self.mean_return_tokens_per_round[user_data.current_round], self.sigma
+            )
+        )
+        if user_data.return_tokens <= 0:
+            user_data.return_tokens = self.mean_return_tokens_per_round[
+                user_data.current_round
+            ]
+        interval = random.gauss(
+            self.mean_inter_round_interval[user_data.current_round], self.sigma
+        )
+        if interval <= 0:
+            interval = self.mean_inter_round_interval[user_data.current_round]
+        user_data.start = time.perf_counter() + interval
+
+        if len(self.multiturn_queue) == 0:
+            self.multiturn_queue.append(user_data)
+        else:
+            i = len(self.multiturn_queue)
+            for idx, d in enumerate(self.multiturn_queue):
+                if user_data.start < d.start:
+                    i = idx
+                    break
+            self.multiturn_queue.insert(idx, user_data)
+
+    @synchronized()
+    def pop(self):
+        if (
+            len(self.multiturn_queue)
+            and time.perf_counter() > self.multiturn_queue[0].start
+        ):
+            return self.multiturn_queue.pop(0)
+        return self.gen()
+
+
+def gen_payload(prompt, output_len):
+    payload = {
+        "text": prompt,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_new_tokens": output_len,
+            "ignore_eos": True,
+        },
+        "stream": True,
+        "stream_options": {"include_usage": True},
+        "lora_path": "",
+        "return_logprob": False,
+        "logprob_start_len": -1,
+    }
+    return payload
+
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+
+async def async_request_sglang_generate(
+    user_data,
+    url,
+    atomic_counter,
+):
+    """
+    Sends a streaming request to the server. Gathers text token-by-token.
+    """
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        headers = {}
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        output = RequestFuncOutput()
+        payload = gen_payload(user_data.prompt, user_data.return_tokens)
+        write_debug_log({"timestamp": st, "user_data": user_data.__dict__})
+
+        try:
+            async with session.post(url=url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    prompt_tokens = 0
+                    cached_tokens = 0
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            if data.get("text"):
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+                                    prompt_tokens = (data.get("meta_info") or {}).get(
+                                        "prompt_tokens", 0
+                                    )
+                                    cached_tokens = (data.get("meta_info") or {}).get(
+                                        "cached_tokens", 0
+                                    )
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text = data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.prompt_len = prompt_tokens
+                    output.cached_tokens = cached_tokens
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception as e:
+            output.success = False
+            output.error = str(e)
+            print(f"Request failed: {e}")
+
+    atomic_counter.increment(1)
+    return output
+
+
+class AtomicCounter:
+    def __init__(self, initial_value=0):
+        self._value = initial_value
+        self.lock = threading.Lock()
+
+    @synchronized()
+    def increment(self, amount=1):
+        self._value += amount
+
+    @synchronized()
+    def get(self):
+        return self._value
+
+
+class WorkloadGenerator:
+    def __init__(self, args):
+        config = load_config()
+        user_generator = UserGenerator(
+            config,
+            args.model_path,
+            args.dataset_path,
+        )
+
+        self.url = f"http://{args.host}:{args.port}/generate"
+
+        self.tokenizer = user_generator.tokenizer
+        self.start_time = None
+        self.finished_time = None
+        self.duration = args.duration
+        self.done = False
+
+        self.sent_requests = 0
+        self.completed_requests = 0
+
+        self.user_generator = user_generator
+        self.response_queue = queue.Queue()
+        self.performance_metrics = {
+            "ttft": [],
+            "latency": [],
+            "prompt_len": [],
+            "cached_tokens": [],
+        }
+        self.max_parallel = config["num_clients"]
+
+        self.atomic_counter = AtomicCounter()
+
+    async def handle_request(self, user_data):
+        try:
+            response = await async_request_sglang_generate(
+                user_data, self.url, self.atomic_counter
+            )
+            self.response_queue.put((user_data, response))
+        except Exception as e:
+            print(f"Request failed: {e}")
+            self.completed_requests += 1
+
+    def request_sender(self):
+        async def request_loop():
+            while True:
+                if self.sent_requests - self.completed_requests < self.max_parallel:
+                    new_request = self.user_generator.pop()
+                    if new_request:
+                        asyncio.create_task(self.handle_request(new_request))
+                        self.sent_requests += 1
+                else:
+                    await asyncio.sleep(0.05)
+                    continue
+
+                if time.perf_counter() - self.start_time > self.duration:
+                    self.done = True
+                    break
+
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(request_loop())
+        loop.close()
+
+    def response_handler(self):
+        while True:
+            try:
+                user_data, response = self.response_queue.get(timeout=10)
+                logger.info(
+                    f"{((time.perf_counter()-self.start_time)/self.duration*100):.2f}%"
+                )
+                if not response.success:
+                    raise ValueError(f"Request failed with error: {response.error}")
+
+                self.user_generator.push(
+                    user_data, response.generated_text, len(response.itl)
+                )
+                self.performance_metrics["ttft"].append(response.ttft)
+                self.performance_metrics["latency"].append(response.latency)
+                self.performance_metrics["prompt_len"].append(response.prompt_len)
+                self.performance_metrics["cached_tokens"].append(response.cached_tokens)
+                self.completed_requests += 1
+                self.finished_time = time.perf_counter()
+
+            except queue.Empty:
+                if self.done:
+                    break
+            except ValueError as e:
+                print(f"Error processing response for client {user_data}: {e}")
+                continue
+
+    def run(self):
+        request_thread = threading.Thread(target=self.request_sender, daemon=True)
+        response_thread = threading.Thread(target=self.response_handler, daemon=True)
+
+        self.start_time = time.perf_counter()
+        request_thread.start()
+        response_thread.start()
+
+        request_thread.join()
+        response_thread.join()
+
+        performance_data = {
+            "summary": {
+                "total_requests": len(self.performance_metrics["ttft"]),
+                "average_ttft": sum(self.performance_metrics["ttft"])
+                / len(self.performance_metrics["ttft"]),
+                "p90_ttft": sorted(self.performance_metrics["ttft"])[
+                    int(0.9 * len(self.performance_metrics["ttft"]))
+                ],
+                "median_ttft": sorted(self.performance_metrics["ttft"])[
+                    len(self.performance_metrics["ttft"]) // 2
+                ],
+                "average_latency": sum(self.performance_metrics["latency"])
+                / len(self.performance_metrics["latency"]),
+                "p90_latency": sorted(self.performance_metrics["latency"])[
+                    int(0.9 * len(self.performance_metrics["latency"]))
+                ],
+                "median_latency": sorted(self.performance_metrics["latency"])[
+                    len(self.performance_metrics["latency"]) // 2
+                ],
+                "throughput": self.atomic_counter.get()
+                / (self.finished_time - self.start_time),
+                "cache_hit_rate": (
+                    0
+                    if sum(self.performance_metrics["prompt_len"]) == 0
+                    else sum(self.performance_metrics["cached_tokens"])
+                    / sum(self.performance_metrics["prompt_len"])
+                ),
+            },
+        }
+        print("All requests completed")
+        print("Performance metrics summary:")
+        print(f"  Total requests: {performance_data['summary']['total_requests']}")
+        print(f"  Average TTFT: {performance_data['summary']['average_ttft']:.2f}")
+        print(f"  P90 TTFT: {performance_data['summary']['p90_ttft']:.2f}")
+        print(f"  Median TTFT: {performance_data['summary']['median_ttft']:.2f}")
+        print(
+            f"  Average latency: {performance_data['summary']['average_latency']:.2f}"
+        )
+        print(f"  P90 latency: {performance_data['summary']['p90_latency']:.2f}")
+        print(f"  Median latency: {performance_data['summary']['median_latency']:.2f}")
+        print(
+            f"  Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
+        )
+        print(f"  Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")
+
+        user_stats = self.user_generator.user_stats
+        input_stats = self.user_generator.input_stats
+        output_stats = self.user_generator.output_stats
+        print(f"round_ratios: {user_stats}")
+        print(
+            f"mean_new_tokens_per_round: {[int(a/b) if b > 0 else 0 for a, b in input_stats]}"
+        )
+        print(
+            f"mean_return_tokens_per_round: {[int(a/b) if b > 0 else 0 for a, b in output_stats]}"
+        )
+        return performance_data
+
+
+def main():
+    global debug_log_file
+
+    args = parse_args()
+    if args.log_level == "debug":
+        logging.basicConfig(level=logging.DEBUG)
+        logger.info("use log_level debug")
+        # Initialize debug log file
+        debug_log_file = open(args.debug_log_file, "w")
+    else:
+        logging.basicConfig(level=logging.INFO)
+        logger.info("use log_level info")
+    performance_data = WorkloadGenerator(args).run()
+
+    # Close debug log file if it was opened
+    if debug_log_file:
+        debug_log_file.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/hicache/bench_mix.sh b/benchmark/hicache/bench_mix.sh
new file mode 100755
index 000000000..5ff6dca94
--- /dev/null
+++ b/benchmark/hicache/bench_mix.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
+rm -rf nohup.out && \
+nohup python3 -m sglang.launch_server \
+    --attention-backend triton \
+    --model-path /code/models/Qwen3-32B/ \
+    --log-level info \
+    --tp 4 --mem-frac 0.25 \
+    --host 0.0.0.0 --port 33301 \
+    --enable-metrics --enable-cache-report \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2.5 --hicache-size 0 \
+    --hicache-io-backend kernel \
+    --hicache-mem-layout layer_first \
+    --hicache-write-policy write_through \
+    &
+
+##################################################
+
+export CONFIG_PATH=/tmp/bench_mix_config.json
+
+# num_clients: Maximum number of concurrent client requests to be simulated
+# round_ratios: Distribution of requests across rounds. Given sum(round_ratios) total requests,
+#               round_ratios[i] denotes the number of requests that will execute for (i+1) rounds
+echo '{
+  "num_rounds": 10,
+  "num_clients": 60,
+  "round_ratios": [50, 25, 15, 15, 10, 10, 9, 8, 7, 6],
+  "mean_new_tokens_per_round": [1000, 400, 350, 300, 280, 260, 240, 220, 210, 200],
+  "mean_return_tokens_per_round": [100, 100, 100, 100, 100, 100, 100, 100, 100, 100],
+  "mean_inter_round_interval": [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]
+}' > ${CONFIG_PATH}
+
+rm -rf bench_mix.out && \
+nohup python3 /sgl-workspace/sglang/benchmark/hicache/bench_mix.py \
+    --model-path /code/models/Qwen3-32B/ \
+    --dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --port 33301 \
+    --duration 600 \
+> bench_mix.out &
diff --git a/benchmark/hicache/bench_multiturn.py b/benchmark/hicache/bench_multiturn.py
new file mode 100644
index 000000000..a3e8b0d74
--- /dev/null
+++ b/benchmark/hicache/bench_multiturn.py
@@ -0,0 +1,517 @@
+import argparse
+import asyncio
+import json
+import queue
+import random
+import threading
+import time
+from datetime import datetime
+from typing import Optional
+
+import aiohttp
+import numpy as np
+import requests
+from tqdm.asyncio import tqdm
+
+from sglang.bench_serving import (
+    RequestFuncOutput,
+    get_tokenizer,
+    remove_prefix,
+    sample_random_requests,
+)
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Script to benchmark concurrent requests to a server."
+    )
+    parser.add_argument(
+        "--num-clients",
+        type=int,
+        default=256,
+        help="Number of concurrent clients",
+    )
+    parser.add_argument(
+        "--max-parallel",
+        type=int,
+        default=128,
+        help="Maximum number of parallel requests",
+    )
+    parser.add_argument(
+        "--request-length",
+        type=int,
+        default=512,
+        help="Length of each new request",
+    )
+    parser.add_argument(
+        "--output-length",
+        type=int,
+        default=64,
+        help="Length of each output",
+    )
+    parser.add_argument(
+        "--num-rounds",
+        type=int,
+        default=5,
+        help="Number of rounds per client",
+    )
+    parser.add_argument(
+        "--distribution",
+        type=str,
+        default="poisson",
+        choices=["poisson", "uniform"],
+        help="Distribution type for request intervals (poisson or uniform)",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=1.0,
+        help="Average number of requests per second",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Server hostname or IP (default: localhost)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=30000,
+        help="Server port (default: 30000)",
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="meta-llama/Llama-3.1-8B-Instruct",
+        help="model path compatible with Hugging Face Transformers",
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default="",
+        help="local dataset to sample tokens from",
+    )
+    parser.add_argument(
+        "--log-file",
+        type=str,
+        default="performance_metrics.jsonl",
+        help="File to log performance metrics",
+    )
+    parser.add_argument(
+        "--disable-auto-run",
+        action="store_true",
+        help="If set, disable automatically testing with a range of request rates.",
+    )
+
+    parser.add_argument(
+        "--disable-random-sample",
+        action="store_true",
+        help="If set, disable random sampling of requests from the ShareGPT dataset.",
+    )
+    parser.add_argument(
+        "--sub-question-input-length",
+        type=int,
+        default=0,
+        help="Length of the sub question input for each request, if set 0 use request_length",
+    )
+    parser.add_argument(
+        "--ready-queue-policy",
+        type=str,
+        default="random",
+        help="Policy for popping requests from the ready queue (random or fifo)",
+    )
+    parser.add_argument(
+        "--tag",
+        type=str,
+        default="",
+        help="Tag of a certain run in the log file",
+    )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default="",
+        help="String of LoRA path. Currently we only support benchmarking on a single LoRA adaptor.",
+    )
+    return parser.parse_args()
+
+
+async def async_request_sglang_generate(
+    payload,
+    url,
+    pbar: Optional[tqdm] = None,
+):
+    """
+    Sends a streaming request to the server. Gathers text token-by-token.
+    """
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        headers = {}
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        output = RequestFuncOutput()
+
+        try:
+            async with session.post(url=url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    prompt_tokens = 0
+                    cached_tokens = 0
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            if data["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+                                    prompt_tokens = (data.get("meta_info") or {}).get(
+                                        "prompt_tokens", 0
+                                    )
+                                    cached_tokens = (data.get("meta_info") or {}).get(
+                                        "cached_tokens", 0
+                                    )
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text = data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.prompt_len = prompt_tokens
+                    output.cached_tokens = cached_tokens
+                    output.generated_len = len(output.itl) + 1
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception as e:
+            output.success = False
+            output.error = str(e)
+            print(f"Request failed: {e}")
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+def gen_payload(prompt, output_len, lora_path=""):
+    payload = {
+        "text": prompt,
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_new_tokens": output_len,
+            "ignore_eos": True,
+        },
+        "stream": True,
+        "stream_options": {"include_usage": True},
+        "lora_path": lora_path,
+        "return_logprob": False,
+        "logprob_start_len": -1,
+    }
+    return payload
+
+
+def log_to_jsonl_file(data, file_path="performance_metrics.jsonl", tag=""):
+    """Append the data with a timestamp and tag to the specified JSONL file."""
+    timestamped_data = {"timestamp": datetime.now().isoformat(), "tag": tag, **data}
+    try:
+        with open(file_path, "a") as file:
+            file.write(
+                json.dumps(timestamped_data) + "\n"
+            )  # Write as a single line in JSONL format
+    except IOError as e:
+        print(f"Error writing to JSONL file: {e}")
+
+
+class ReadyQueue:
+    """
+    Thread-safe queue that can pop requests in different orders based on given policy.
+    """
+
+    def __init__(self, init_requests=None, policy="random"):
+        self.lock = threading.Lock()
+        self.requests = init_requests or []
+        self.policy = policy
+
+    def append(self, item):
+        with self.lock:
+            self.requests.append(item)
+
+    def pop(self):
+        with self.lock:
+            if not self.requests:
+                return None
+            if self.policy == "random":
+                index = random.randrange(len(self.requests))
+                return self.requests.pop(index)
+            elif self.policy == "fifo":
+                return self.requests.pop(0)
+            else:
+                # todo, varying thinking time of clients
+                raise ValueError(f"{self.policy} not implemented")
+
+
+class WorkloadGenerator:
+    def __init__(self, args):
+        # Construct the base URL for requests
+        self.url = f"http://{args.host}:{args.port}/generate"
+
+        self.tokenizer = get_tokenizer(args.model_path)
+        self.distribution = args.distribution
+        self.request_rate = args.request_rate
+        self.start_time = None
+        self.finished_time = None
+
+        self.sent_requests = 0
+        self.completed_requests = 0
+
+        self.candidate_inputs = sample_random_requests(
+            input_len=args.request_length,
+            output_len=args.output_length,
+            num_prompts=args.num_clients,
+            range_ratio=1.0,
+            tokenizer=self.tokenizer,
+            dataset_path=args.dataset_path,
+            random_sample=not args.disable_random_sample,
+        )
+        self.candidate_inputs = [i.prompt for i in self.candidate_inputs]
+
+        if args.sub_question_input_length != 0:
+            sub_question_input_length = args.sub_question_input_length
+        else:
+            sub_question_input_length = args.request_length
+
+        self.sub_question_inputs = sample_random_requests(
+            input_len=sub_question_input_length,
+            output_len=args.output_length,
+            num_prompts=args.num_clients * max(args.num_rounds - 1, 1),
+            range_ratio=1.0,
+            tokenizer=self.tokenizer,
+            dataset_path=args.dataset_path,
+            random_sample=not args.disable_random_sample,
+        )
+
+        init_requests = [
+            (
+                i,
+                gen_payload(
+                    self.candidate_inputs[i], args.output_length, args.lora_path
+                ),
+            )
+            for i in range(args.num_clients)
+        ]
+        self.client_records = {
+            i: {"round": 0, "history": init_requests[i][1]["text"]}
+            for i in range(args.num_clients)
+        }
+        self.ready_queue = ReadyQueue(
+            init_requests=init_requests, policy=args.ready_queue_policy
+        )
+        self.candidate_inputs = self.candidate_inputs[args.num_clients :]
+
+        self.response_queue = queue.Queue()
+        self.pbar = tqdm(total=args.num_clients * args.num_rounds)
+        self.performance_metrics = {
+            "ttft": [],
+            "latency": [],
+            "prompt_len": [],
+            "cached_tokens": [],
+            "generated_len": [],
+        }
+        self.num_rounds = args.num_rounds
+        self.max_parallel = args.max_parallel
+        self.output_length = args.output_length
+
+    async def handle_request(self, item):
+        try:
+            client_id, payload = item
+            response = await async_request_sglang_generate(payload, self.url, self.pbar)
+            if self.pbar.n == self.pbar.total:
+                self.finished_time = time.perf_counter()
+            self.response_queue.put((client_id, response))
+        except Exception as e:
+            print(f"Request failed: {e}")
+
+    def request_sender(self):
+        async def request_loop():
+            while True:
+                if self.sent_requests - self.completed_requests < self.max_parallel:
+                    new_request = self.ready_queue.pop()
+                    if new_request:
+                        asyncio.create_task(self.handle_request(new_request))
+                        self.sent_requests += 1
+                else:
+                    await asyncio.sleep(0.05)
+                    continue
+
+                if self.pbar.n == self.pbar.total:
+                    break
+
+                # Calculate Poisson-distributed wait time
+                if self.distribution == "poisson":
+                    sleep_time = random.expovariate(self.request_rate)
+                elif self.distribution == "uniform":
+                    avg_interval = (
+                        1.0 / self.request_rate if self.request_rate > 0 else 1.0
+                    )
+                    sleep_time = random.uniform(0, 2 * avg_interval)
+                else:
+                    raise ValueError("Invalid distribution type")
+                await asyncio.sleep(sleep_time)  # Wait before sending the next request
+
+        # Create and run the event loop for asynchronous requests
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(request_loop())
+        loop.close()
+
+    def response_handler(self):
+        while True:
+            try:
+                client_id, response = self.response_queue.get(
+                    timeout=10
+                )  # Block until response is available
+                if not response.success:
+                    raise ValueError(f"Request failed with error: {response.error}")
+                self.client_records[client_id]["history"] += response.generated_text
+                self.client_records[client_id]["round"] += 1
+                self.performance_metrics["ttft"].append(response.ttft)
+                self.performance_metrics["latency"].append(response.latency)
+                self.performance_metrics["prompt_len"].append(response.prompt_len)
+                self.performance_metrics["cached_tokens"].append(response.cached_tokens)
+                self.performance_metrics["generated_len"].append(response.generated_len)
+                self.completed_requests += 1
+
+                if self.client_records[client_id]["round"] < self.num_rounds:
+                    # append new request to client's history
+                    self.client_records[client_id][
+                        "history"
+                    ] += self.sub_question_inputs.pop().prompt
+                    self.ready_queue.append(
+                        (
+                            client_id,
+                            gen_payload(
+                                self.client_records[client_id]["history"],
+                                self.output_length,
+                                args.lora_path,
+                            ),
+                        )
+                    )
+            except queue.Empty:
+                if self.pbar.n == self.pbar.total:
+                    break
+            except ValueError as e:
+                print(f"Error processing response for client {client_id}: {e}")
+                continue
+
+    def run(self):
+        request_thread = threading.Thread(target=self.request_sender, daemon=True)
+        response_thread = threading.Thread(target=self.response_handler, daemon=True)
+
+        self.start_time = time.perf_counter()
+        request_thread.start()
+        response_thread.start()
+
+        request_thread.join()
+        response_thread.join()
+        self.pbar.close()
+
+        duration = self.finished_time - self.start_time
+        performance_data = {
+            "summary": {
+                "total_requests": len(self.performance_metrics["ttft"]),
+                "request_rate": self.request_rate,
+                "average_ttft": sum(self.performance_metrics["ttft"])
+                / len(self.performance_metrics["ttft"]),
+                "p90_ttft": sorted(self.performance_metrics["ttft"])[
+                    int(0.9 * len(self.performance_metrics["ttft"]))
+                ],
+                "median_ttft": sorted(self.performance_metrics["ttft"])[
+                    len(self.performance_metrics["ttft"]) // 2
+                ],
+                "average_latency": sum(self.performance_metrics["latency"])
+                / len(self.performance_metrics["latency"]),
+                "p90_latency": sorted(self.performance_metrics["latency"])[
+                    int(0.9 * len(self.performance_metrics["latency"]))
+                ],
+                "median_latency": sorted(self.performance_metrics["latency"])[
+                    len(self.performance_metrics["latency"]) // 2
+                ],
+                "input_token_throughput": sum(self.performance_metrics["prompt_len"])
+                / duration,
+                "output_token_throughput": sum(
+                    self.performance_metrics["generated_len"]
+                )
+                / duration,
+                "throughput": self.pbar.total / duration,
+                "cache_hit_rate": (
+                    0
+                    if sum(self.performance_metrics["prompt_len"]) == 0
+                    else sum(self.performance_metrics["cached_tokens"])
+                    / sum(self.performance_metrics["prompt_len"])
+                ),
+            },
+        }
+        print("All requests completed")
+        print("Performance metrics summary:")
+        print(
+            f"  Total requests: {performance_data['summary']['total_requests']} at {performance_data['summary']['request_rate']} requests per second"
+        )
+        print(f"  Average TTFT: {performance_data['summary']['average_ttft']:.2f}")
+        print(f"  P90 TTFT: {performance_data['summary']['p90_ttft']:.2f}")
+        print(f"  Median TTFT: {performance_data['summary']['median_ttft']:.2f}")
+        print(
+            f"  Average latency: {performance_data['summary']['average_latency']:.2f}"
+        )
+        print(f"  P90 latency: {performance_data['summary']['p90_latency']:.2f}")
+        print(f"  Median latency: {performance_data['summary']['median_latency']:.2f}")
+        print(
+            f"  Input token throughput: {performance_data['summary']['input_token_throughput']:.2f} tokens per second"
+        )
+        print(
+            f"  Output token throughput: {performance_data['summary']['output_token_throughput']:.2f} tokens per second"
+        )
+        print(
+            f"  Request Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
+        )
+        print(f"  Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")
+        return performance_data
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    flush_cache_url = f"http://{args.host}:{args.port}/flush_cache"
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    if args.disable_auto_run:
+        print("Running with specified request rate...")
+        request_rates = [args.request_rate]
+    else:
+        print("Auto-running with different request rates...")
+        request_rates = [16, 14, 12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+
+    for rate in request_rates:
+        args.request_rate = rate
+        requests.post(flush_cache_url)
+        time.sleep(1)
+        performance_data = WorkloadGenerator(args).run()
+        log_to_jsonl_file(performance_data, args.log_file, tag=args.tag)
diff --git a/benchmark/hicache/bench_serving.py b/benchmark/hicache/bench_serving.py
new file mode 100644
index 000000000..e38d0d0ea
--- /dev/null
+++ b/benchmark/hicache/bench_serving.py
@@ -0,0 +1,1029 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
+# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
+"""
+Benchmark online serving with dynamic requests.
+
+Usage:
+python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import random
+import sys
+import time
+import traceback
+import warnings
+from argparse import ArgumentParser
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+
+import aiohttp
+import numpy as np
+import requests
+from data_processing import MsgContent, SampleOutput, get_dataset
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+from sglang.bench_serving import get_tokenizer, remove_prefix, set_ulimit
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+global args
+
+
+@dataclass
+class RequestFuncInput:
+    prompts: List[Tuple[MsgContent, int, int]]
+    api_url: str
+    model: str
+    lora_name: str
+    extra_request_body: Dict[str, Any]
+
+    # For multiturn chat, store the context
+    prev_messages: List = field(default_factory=list)
+    finished_prompts: int = 0
+
+
+@dataclass
+class RequestFuncOutput:
+    generated_text: List[str] = field(default_factory=list)
+    prompt_len: List[int] = field(default_factory=list)
+    output_len: List[int] = field(default_factory=list)
+    latency: List[float] = field(default_factory=list)
+    ttft: List[float] = field(default_factory=list)
+    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+
+    success: bool = False
+    error: str = ""
+
+
+# set ignore_eos True by default
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    queue: asyncio.Queue,
+    tokenizer: PreTrainedTokenizerBase,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "model": request_func_input.model,
+            "temperature": 0.0,
+            "best_of": 1,
+            "stream": not args.disable_stream,
+            "stream_options": {"include_usage": True},
+            "ignore_eos": not args.disable_ignore_eos,
+            **request_func_input.extra_request_body,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+
+        prompt_idx = request_func_input.finished_prompts
+        messages = request_func_input.prev_messages
+        prompt, input_len, max_tokens = request_func_input.prompts[prompt_idx]
+        prompt_len = sum(
+            prompt[1] + prompt[2]  # input_len + output_len
+            for prompt in request_func_input.prompts[:prompt_idx]
+        )
+        prompt_len += input_len
+
+        # Messages
+        messages.append(
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        )
+        payload["messages"] = messages
+        payload["max_tokens"] = max_tokens
+
+        # output.prompt_len = request_func_input.prompt_len
+        # print(payload)
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    actual_prompt_len = prompt_len - 1
+                    actual_output_len = 0
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+                            timestamp = time.perf_counter()
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["usage"] is not None and len(data["usage"]) > 0:
+                                actual_prompt_len = data["usage"]["prompt_tokens"]
+                                actual_output_len = data["usage"]["completion_tokens"]
+                                continue
+                            delta = data["choices"][0]["delta"]
+
+                            if delta.get("content", None):
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft.append(ttft)
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                generated_text += delta["content"]
+                            most_recent_timestamp = timestamp
+
+                    output.prompt_len.append(actual_prompt_len)  # truncate <s>
+                    output.output_len.append(actual_output_len)
+                    output.generated_text.append(generated_text)
+                    output.success = True
+                    output.latency.append(latency)
+
+                    # Prepare for the new request
+                    request_func_input.prompts[prompt_idx] = (
+                        prompt,
+                        input_len,
+                        actual_output_len,  # changes from max_tokens to output_len
+                    )
+                    prompt_idx += 1
+                    messages.append(
+                        {
+                            "role": "assistant",
+                            "content": generated_text,
+                        }
+                    )
+
+                    # Move the new request to the end of the queue
+                    if prompt_idx < len(request_func_input.prompts):
+                        request_func_input.finished_prompts = prompt_idx
+                        request_func_input.prev_messages = messages
+                        await queue.put(request_func_input)
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_profile(api_url: str) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        output = RequestFuncOutput()
+        try:
+            async with session.post(url=api_url) as response:
+                if response.status == 200:
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    return output
+
+
+ASYNC_REQUEST_FUNCS = {
+    "sglang": async_request_openai_completions,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
+}
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    total_output_retokenized: int
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    output_throughput_retokenized: float
+    total_throughput: float
+    total_throughput_retokenized: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    p90_ttft_ms: float
+    p99_ttft_ms: float
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    p90_tpot_ms: float
+    p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    p90_itl_ms: float
+    p99_itl_ms: float
+    mean_e2e_latency_ms: float
+    median_e2e_latency_ms: float
+    std_e2e_latency_ms: float
+    p99_e2e_latency_ms: float
+    concurrency: float
+
+
+async def get_requests(
+    input_requests_queue: asyncio.Queue,
+    request_rate: float,
+    num_actual_requests: int,
+) -> AsyncGenerator[RequestFuncInput, None]:
+    for _ in range(num_actual_requests):
+        try:
+            request = await asyncio.wait_for(
+                input_requests_queue.get(), timeout=300
+            )  # Wait for 5 minutes then abort
+        except Exception as e:
+            print(f"exception: {e}")
+            break
+
+        yield request
+
+        if request_rate == float("inf"):
+            continue
+
+        interval = np.random.exponential(1.0 / request_rate)
+        await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    backend: str,
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    output_lens: List[int] = []
+    retokenized_output_lens: List[int] = []
+    total_input = 0
+    completed = 0
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
+    e2e_latencies: List[float] = []
+    output_success = 0
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_success += 1
+            assert len(outputs[i].generated_text) == len(outputs[i].latency)
+            assert len(outputs[i].generated_text) == len(outputs[i].ttft)
+            for j in range(len(outputs[i].generated_text)):
+                output_len = outputs[i].output_len[j]
+                output_lens.append(output_len)
+                retokenized_output_len = len(
+                    tokenizer.encode(
+                        outputs[i].generated_text[j], add_special_tokens=False
+                    )
+                )
+                retokenized_output_lens.append(retokenized_output_len)
+                total_input += outputs[i].prompt_len[j]
+                if output_len > 1:
+                    tpots.append(
+                        (outputs[i].latency[j] - outputs[i].ttft[j]) / (output_len - 1)
+                    )
+
+                completed += 1
+            itls += outputs[i].itl
+            ttfts += outputs[i].ttft
+            e2e_latencies += outputs[i].latency
+
+        else:
+            output_lens.append(0)
+            retokenized_output_lens.append(0)
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2,
+        )
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(output_lens),
+        total_output_retokenized=sum(retokenized_output_lens),
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=sum(output_lens) / dur_s,
+        output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+        total_throughput=(total_input + sum(output_lens)) / dur_s,
+        total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+        / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0)
+        * 1000,  # ttfts is empty if streaming is not supported by backend
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        p90_ttft_ms=np.percentile(ttfts or 0, 90) * 1000,
+        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        p90_tpot_ms=np.percentile(tpots or 0, 90) * 1000,
+        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        p90_itl_ms=np.percentile(itls or 0, 90) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
+        median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
+        std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
+        p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
+        concurrency=np.sum(e2e_latencies) / dur_s,
+    )
+
+    return metrics, output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: SampleOutput,
+    request_rate: float,
+    max_concurrency: Optional[int],
+    disable_tqdm: bool,
+    lora_name: str,
+    extra_request_body: Dict[str, Any],
+    profile: bool,
+    enable_shared_prefix: bool,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    # Limit concurrency
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, queue, tokenizer, pbar):
+        if semaphore is None:
+            return await request_func(
+                request_func_input=request_func_input,
+                queue=queue,
+                tokenizer=tokenizer,
+                pbar=pbar,
+            )
+        async with semaphore:
+            return await request_func(
+                request_func_input=request_func_input,
+                queue=queue,
+                tokenizer=tokenizer,
+                pbar=pbar,
+            )
+
+    num_actual_requests = sum(len(r) for r in input_requests)
+    print(f"Num of shared prefixes or conversations: {len(input_requests)}")
+    print(f"Num of total requests: {num_actual_requests}")
+
+    # flatten the requests for shared prefix
+    if enable_shared_prefix:
+        input_requests = [[r] for requests in input_requests for r in requests]
+    inputs_requests_queue = asyncio.Queue(maxsize=len(input_requests))
+    print("Starting initial single prompt test run...")
+    # NOTE: Just use the first request of the first conversation for warmup
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompts=input_requests[0][:1],
+        api_url=api_url,
+        lora_name=lora_name,
+        extra_request_body=extra_request_body,
+    )
+    test_output = await request_func(
+        request_func_input=test_input, queue=inputs_requests_queue, tokenizer=tokenizer
+    )
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}"
+        )
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    # Check the states
+    assert inputs_requests_queue.empty()
+
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)
+
+    # Start profiler
+    if profile:
+        print("Starting profiler...")
+        profile_output = await async_request_profile(
+            api_url=base_url + "/start_profile"
+        )
+        if profile_output.success:
+            print("Profiler started")
+
+    for request in input_requests:
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompts=request,
+            api_url=api_url,
+            lora_name=lora_name,
+            extra_request_body=extra_request_body,
+        )
+        inputs_requests_queue.put_nowait(request_func_input)
+    if (
+        not args.enable_multiturn
+        and not args.enable_shared_prefix
+        and not args.dataset_name == "generated-shared-prefix"
+    ):
+        assert len(input_requests) == num_actual_requests
+
+    pbar = None if disable_tqdm else tqdm(total=num_actual_requests)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: List[asyncio.Task] = []
+    async for request in get_requests(
+        inputs_requests_queue, request_rate, num_actual_requests
+    ):
+        tasks.append(
+            asyncio.create_task(
+                limited_request_func(
+                    request_func_input=request,
+                    queue=inputs_requests_queue,
+                    tokenizer=tokenizer,
+                    pbar=pbar,
+                )
+            )
+        )
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    # Stop profiler
+    if profile:
+        print("Stopping profiler...")
+        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
+        if profile_output.success:
+            print("Profiler stopped")
+
+    if pbar is not None:
+        pbar.close()
+
+    # Compute metrics and print results
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+    metrics, output_lens = calculate_metrics(
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        backend=backend,
+    )
+
+    print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
+    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Max request concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10}".format(
+            "Total generated tokens (retokenized):", metrics.total_output_retokenized
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", metrics.input_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", metrics.total_throughput
+        )
+    )
+    print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
+    print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
+    print(
+        "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
+        )
+    )
+    print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+    print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P90 TTFT (ms):", metrics.p90_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print(
+        "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
+    )
+    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P90 TPOT (ms):", metrics.p90_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P90 ITL (ms):", metrics.p90_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("=" * 50)
+
+    if (
+        metrics.median_ttft_ms is not None
+        and metrics.mean_itl_ms is not None
+        and metrics.output_throughput is not None
+    ):
+        result = {
+            # Arguments
+            "backend": args.backend,
+            "dataset_name": args.dataset_name,
+            "request_rate": request_rate,
+            "max_concurrency": max_concurrency,
+            "fixed_output_len": args.fixed_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            # Results
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "request_throughput": metrics.request_throughput,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
+            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+            "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
+            "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms,
+            "mean_ttft_ms": metrics.mean_ttft_ms,
+            "median_ttft_ms": metrics.median_ttft_ms,
+            "std_ttft_ms": metrics.std_ttft_ms,
+            "p99_ttft_ms": metrics.p99_ttft_ms,
+            "mean_tpot_ms": metrics.mean_tpot_ms,
+            "median_tpot_ms": metrics.median_tpot_ms,
+            "std_tpot_ms": metrics.std_tpot_ms,
+            "p99_tpot_ms": metrics.p99_tpot_ms,
+            "mean_itl_ms": metrics.mean_itl_ms,
+            "median_itl_ms": metrics.median_itl_ms,
+            "std_itl_ms": metrics.std_itl_ms,
+            "p99_itl_ms": metrics.p99_itl_ms,
+            "concurrency": metrics.concurrency,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
+            "fixed_output_len": args.fixed_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+        }
+    else:
+        print(f"Error running benchmark for request rate: {request_rate}")
+        print("-" * 30)
+
+    # Determine output file name
+    if args.output_file:
+        output_file_name = args.output_file
+    else:
+        now = datetime.now().strftime("%m%d")
+        if args.dataset_name == "random":
+            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
+        else:
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+            )
+
+    # Append results to a JSONL file
+    with open(output_file_name, "a") as file:
+        file.write(json.dumps(result) + "\n")
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "total_output_tokens_retokenized": metrics.total_output_retokenized,
+        "request_throughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
+        "p90_ttft_ms": metrics.p90_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
+        "p90_tpot_ms": metrics.p90_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
+        "p90_itl_ms": metrics.p90_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+        "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+        "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+    }
+    return result
+
+
+def run_benchmark(args_: argparse.Namespace):
+    global args
+    args = args_
+
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
+    # Set global environments
+    set_ulimit()
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    extra_request_body = {}
+    if args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
+    # Set url
+    if args.port is None:
+        args.port = {
+            "sglang": 30000,
+            "lmdeploy": 23333,
+            "vllm": 8000,
+        }.get(args.backend, 30000)
+
+    model_url = (
+        f"{args.base_url}/v1/models"
+        if args.base_url
+        else f"http://{args.host}:{args.port}/v1/models"
+    )
+
+    if args.backend in ["sglang", "vllm", "lmdeploy"]:
+        api_url = (
+            f"{args.base_url}/v1/chat/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/chat/completions"
+        )
+    base_url = (
+        f"http://{args.host}:{args.port}" if args.base_url is None else args.base_url
+    )
+
+    # Get model name
+    if args.model is None:
+        if args.backend == "truss":
+            print(
+                "Please provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instruct"
+            )
+            sys.exit(1)
+        try:
+            response = requests.get(model_url)
+            model_list = response.json().get("data", [])
+            args.model = model_list[0]["id"] if model_list else None
+        except Exception as e:
+            print(f"Failed to fetch model from {model_url}. Error: {e}")
+            print(
+                "Please specify the correct host and port using `--host` and `--port`."
+            )
+            sys.exit(1)
+
+    if args.model is None:
+        print("No model specified or found. Please provide a model using `--model`.")
+        sys.exit(1)
+
+    # Dataset compatibility check
+    if args.enable_multiturn:
+        # TODO: Support multiturn for random
+        if args.dataset_name not in ["sharegpt", "ultrachat", "loogle", "nextqa"]:
+            print(
+                "Multiturn conversation is only supported for sharegpt, ultrachat, loogle, and nextqa datasets."
+            )
+            sys.exit(1)
+
+    if args.enable_shared_prefix:
+        if args.dataset_name not in ["loogle", "nextqa"]:
+            print("Shared prefix is only supported for loogle and nextqa datasets.")
+            sys.exit(1)
+
+    print(f"{args}\n")
+
+    # Read dataset
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    input_requests = get_dataset(args, tokenizer)
+
+    return asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
+            disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
+            extra_request_body=extra_request_body,
+            profile=args.profile,
+            enable_shared_prefix=args.enable_shared_prefix,
+        )
+    )
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        default="sglang",
+        help="Must specify a backend, depending on the LLM Inference Engine.",
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--host", type=str, default="0.0.0.0", help="Default host is 0.0.0.0."
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=[
+            "sharegpt",
+            "random",
+            "generated-shared-prefix",
+            "ultrachat",
+            "loogle",
+            "nextqa",
+        ],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default="", help="Path to the dataset."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer. If not set, using the model conf.",
+    )
+    parser.add_argument(
+        "--chat-template",
+        type=str,
+        help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process. Default is 1000.",
+    )
+    parser.add_argument(
+        "--fixed-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length from the dataset.",
+    )
+    parser.add_argument(
+        "--sharegpt-context-len",
+        type=int,
+        default=None,
+        help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+    )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        default=1024,
+        type=int,
+        help="Number of output tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=0.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random dataset.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
+    )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
+    parser.add_argument(
+        "--multi",
+        action="store_true",
+        help="Use request rate range rather than single value.",
+    )
+    parser.add_argument(
+        "--request-rate-range",
+        type=str,
+        default="2,34,2",
+        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
+    )
+    parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--enable-multiturn",
+        action="store_true",
+        help="Enable multiturn chat for online serving benchmarking. "
+        "This option is effective on the following datasets: "
+        "sharegpt, ultrachat, loogle, nextqa",
+    )
+    parser.add_argument(
+        "--enable-shared-prefix",
+        action="store_true",
+        help="Enable shared prefix for online serving benchmarking. "
+        "This option is effective on the following datasets: "
+        "loogle, nextqa",
+    )
+
+    parser.add_argument(
+        "--disable-shuffle",
+        action="store_true",
+        help="Disable shuffling datasets. This is useful to generate stable output "
+        "in benchmarking",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--disable-stream",
+        action="store_true",
+        help="Disable streaming mode.",
+    )
+    parser.add_argument(
+        "--return-logprob",
+        action="store_true",
+        help="Return logprob.",
+    )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+    parser.add_argument(
+        "--disable-ignore-eos",
+        action="store_true",
+        help="Disable ignoring EOS.",
+    )
+    parser.add_argument(
+        "--extra-request-body",
+        metavar='{"key1": "value1", "key2": "value2"}',
+        type=str,
+        help="Append given JSON object to the request payload. You can use this to specify"
+        "additional generate params like sampling params.",
+    )
+    parser.add_argument(
+        "--apply-chat-template",
+        action="store_true",
+        help="Apply chat template",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )
+
+    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
+    group.add_argument(
+        "--gsp-num-groups",
+        type=int,
+        default=64,
+        help="Number of system prompt groups for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-prompts-per-group",
+        type=int,
+        default=16,
+        help="Number of prompts per system prompt group for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-system-prompt-len",
+        type=int,
+        default=2048,
+        help="Target length in tokens for system prompts in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-question-len",
+        type=int,
+        default=128,
+        help="Target length in tokens for questions in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-output-len",
+        type=int,
+        default=256,
+        help="Target length in tokens for outputs in generated-shared-prefix dataset",
+    )
+    # videos specific
+    parser.add_argument(
+        "--max-frames",
+        type=int,
+        default=sys.maxsize,
+        help="The maximum number of frames to extract from each video. "
+        "This option is specific to the nextqa dataset (video benchmark). ",
+    )
+    args = parser.parse_args()
+
+    if args.enable_multiturn and args.enable_shared_prefix:
+        parser.error(
+            "--enable-multiturn and --enable-shared-prefix cannot be set at the same time."
+        )
+
+    run_benchmark(args)
diff --git a/benchmark/hicache/data_processing.py b/benchmark/hicache/data_processing.py
new file mode 100644
index 000000000..8f72a0d95
--- /dev/null
+++ b/benchmark/hicache/data_processing.py
@@ -0,0 +1,590 @@
+import json
+import os
+import pickle
+import random
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+from nextqa import NExTQALoader
+
+# from nextqa.video import , VideoPrompt
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+
+from sglang.bench_serving import (
+    download_and_cache_file,
+    gen_prompt,
+    get_gen_prefix_cache_path,
+)
+from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionMessageContentPart
+from sglang.utils import encode_video_base64
+
+# type of content fields, can be only prompts or with images/videos
+MsgContent = Union[str, List[ChatCompletionMessageContentPart]]
+
+# A list of all the conversations. Each conversation is a list of
+# tuples. If multiturn is not enabled, the length of list is 1,
+# containing only the first Q&A pair.
+# For the shared prefix workload (synthetic, loogle, nextqa), it
+# is a list of conversations sharing the same prefix (synthetic,
+# doc, video)
+SampleOutput = List[List[Tuple[MsgContent, int, int]]]
+
+
+def common_filter_chat(
+    num_requests: int,
+    new_dataset: List,
+    tokenizer: PreTrainedTokenizerBase,
+    min_prompt_len: Optional[int],
+    min_output_len: Optional[int],
+    max_prompt_len: Optional[int],
+    max_output_len: Optional[int],
+    fixed_output_len: Optional[int],
+) -> SampleOutput:
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = []
+    l = 0
+    input_tokens = 0
+    output_tokens = 0
+    while l < num_requests:
+        for i in range(len(new_dataset)):
+            if l == num_requests:
+                break
+            processed = []
+            for j in new_dataset[i]:
+                # Tokenize the prompts and completions.
+                prompt = j[0]
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = len(prompt_token_ids)
+
+                completion = j[1]
+                completion_token_ids = tokenizer.encode(completion)
+                output_len = (
+                    len(completion_token_ids)
+                    if fixed_output_len is None
+                    else fixed_output_len
+                )
+                if (
+                    min_prompt_len is not None
+                    and prompt_len < min_prompt_len
+                    or min_output_len is not None
+                    and output_len < min_output_len
+                    or max_prompt_len is not None
+                    and prompt_len > max_prompt_len
+                    or max_output_len is not None
+                    and output_len > max_output_len
+                ):
+                    # Prune too short sequences.
+                    continue
+                input_tokens += prompt_len
+                output_tokens += output_len
+                processed.append((prompt, prompt_len, output_len))
+            if len(processed) != 0:
+                filtered_dataset.append(processed)
+                l += 1
+
+    print(f"#Input tokens: {input_tokens}")
+    print(f"#Output tokens: {output_tokens}")
+    return filtered_dataset
+
+
+def sample_sharegpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Download sharegpt if necessary
+    if not os.path.isfile(dataset_path):
+        dataset_path = download_and_cache_file(SHAREGPT_URL)
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+
+    # Keep one conversation in one list
+    new_dataset = []
+    for data in dataset:
+        if len(data["conversations"]) % 2 != 0:
+            continue
+        if data["conversations"][0]["from"] != "human":
+            continue
+        chat = []
+        total_len = 2
+        if enable_multiturn:
+            total_len = len(data["conversations"])
+        for i in range(0, total_len, 2):
+            # One user One Assistant
+            chat.append(
+                (
+                    data["conversations"][i]["value"],
+                    data["conversations"][i + 1]["value"],
+                )
+            )
+        new_dataset.append(chat)
+
+    if not disable_shuffle:
+        # Shuffle the dataset.
+        random.shuffle(new_dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = common_filter_chat(
+        num_requests, new_dataset, tokenizer, 4, 4, None, None, fixed_output_len
+    )
+    return filtered_dataset
+
+
+def sample_ultrachat_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset
+    dataset = []
+    with open(dataset_path) as f:
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            dataset.append(json.loads(line))
+
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["data"]) >= 2]
+
+    # Keep one conversation in one list
+    new_dataset = []
+    for data in dataset:
+        if len(data["data"]) % 2 != 0:
+            continue
+        chat = []
+        total_len = 2
+        if enable_multiturn:
+            total_len = len(data["data"])
+        for i in range(0, total_len, 2):
+            # One user One Assistant
+            chat.append((data["data"][i], data["data"][i + 1]))
+        new_dataset.append(chat)
+
+    # Shuffle the dataset.
+    if not disable_shuffle:
+        random.shuffle(new_dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = common_filter_chat(
+        num_requests, new_dataset, tokenizer, 4, 4, None, None, fixed_output_len
+    )
+    return filtered_dataset
+
+
+def sample_loogle_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,
+    enable_shared_prefix: bool = False,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset
+    dataset = []
+    with open(dataset_path) as f:
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            dataset.append(json.loads(line))
+
+    # Keep one conversation in one list
+    new_dataset = []
+    # TODO: Add shared prefix support for loogle
+    # NOTE: Now we preprocess it only for chat
+    for data in dataset:
+        chat = []
+        if (
+            "qa_pairs" not in data
+            or data["qa_pairs"] == "none"
+            or len(data["qa_pairs"]) == 0
+        ):
+            # If Q is none (for summarization),
+            # We add a question for summarization
+            # And keep the summary up to 1024 words
+            chat.append(
+                (
+                    "Input: "
+                    + data["input"]
+                    + " Question: "
+                    + "Please summarize the input",
+                    data["input"][:1024],
+                )
+            )
+            new_dataset.append(chat)
+        else:
+            qa_pairs = eval(data["qa_pairs"])
+            for i, qa in enumerate(qa_pairs):
+                if i == 0 or enable_shared_prefix:
+                    # Combine input with the first Q
+                    chat.append(
+                        ("Input: " + data["input"] + " Question: " + qa["Q"], qa["A"])
+                    )
+                elif enable_multiturn:
+                    chat.append((qa["Q"], qa["A"]))
+
+            new_dataset.append(chat)
+
+    # Shuffle the dataset.
+    if not disable_shuffle:
+        random.shuffle(new_dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = common_filter_chat(
+        num_requests, new_dataset, tokenizer, 4, None, None, None, fixed_output_len
+    )
+    return filtered_dataset
+
+
+def sample_nextqa_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    max_frames: int,  # Specific for video
+    model_path: str,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,  # No multiturn support for now
+    backend: str = "sglang-oai",
+    chat_template_name: Optional[str] = None,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    """
+    Example of messages:
+    message = {
+        "role": "user",
+        "content": [
+            {"type": "image_url", "image_url": {"url": base64_data}},
+            {"type": "text", "text": video.prompt},
+        ],
+    }
+    """
+
+    if fixed_output_len is None:
+        fixed_output_len = 4096
+
+    # TODO: Check for multiturn
+    dataset = NExTQALoader(video_dir=dataset_path, max_frames=max_frames)
+    new_dataset = []
+    for v in dataset:
+        new_dataset.append(v)
+
+    if not disable_shuffle:
+        random.shuffle(new_dataset)
+
+    # TODO: prompt len can get from server side
+    filtered_dataset = []
+    l = 0
+    while l < num_requests:
+        for i in range(len(new_dataset)):
+            if l == num_requests:
+                break
+
+            video = new_dataset[i]
+
+            # text prompt
+            prompt = video.prompt
+
+            # NOTE: Chat Template is a must for video benchmark because we have to
+            # add special image token for later expansion
+            if backend == "sglang" or backend == "sglang-native":
+                if "chat_template" in tokenizer.init_kwargs:
+                    chat_template = get_chat_template(tokenizer.get_chat_template())
+                elif chat_template_name is not None:
+                    chat_template = get_chat_template(chat_template_name)
+                else:
+                    chat_template = get_chat_template_by_model_path(model_path)
+                prompt = chat_template.image_token + prompt
+
+            prompt_token_ids = tokenizer(prompt).input_ids
+            prompt_len = len(prompt_token_ids)
+            output_len = fixed_output_len  # max output len, not real output len
+
+            # video input
+            base64_data = encode_video_base64(video.path, video.num_frames)
+
+            # NOTE: This will be replaced by the expanded length from the server
+            prompt_len += video.num_frames
+
+            # add to content
+            content = [
+                {"type": "image_url", "image_url": {"url": base64_data}},
+                {"type": "text", "text": prompt},
+            ]
+
+            filtered_dataset.append([(content, prompt_len, output_len)])
+            l += 1
+    return filtered_dataset
+
+
+def sample_random_requests(
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    dataset_path: str,
+    disable_shuffle: bool = False,
+) -> SampleOutput:
+
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+
+    if True:
+        # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
+
+        # Download sharegpt if necessary
+        if not os.path.isfile(dataset_path):
+            dataset_path = download_and_cache_file(SHAREGPT_URL)
+
+        # Load the dataset.
+        with open(dataset_path) as f:
+            dataset = json.load(f)
+        # Filter out the conversations with less than 2 turns.
+        dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+        # Only keep the first two turns of each conversation.
+        dataset = [
+            (data["conversations"][0]["value"], data["conversations"][1]["value"])
+            for data in dataset
+        ]
+
+        if not disable_shuffle:
+            # Shuffle the dataset.
+            random.shuffle(dataset)
+
+        # Filter out sequences that are too long or too short
+        input_requests: SampleOutput = []
+        for data in dataset:
+            i = len(input_requests)
+            if i == num_prompts:
+                break
+
+            # Tokenize the prompts and completions.
+            prompt = data[0]
+            prompt_token_ids = tokenizer.encode(prompt)
+            prompt_len = len(prompt_token_ids)
+
+            # Skip empty prompt
+            if prompt_len == 0:
+                continue
+
+            if prompt_len > input_lens[i]:
+                input_ids = prompt_token_ids[: input_lens[i]]
+            else:
+                ratio = (input_lens[i] + prompt_len - 1) // prompt_len
+                input_ids = (prompt_token_ids * ratio)[: input_lens[i]]
+            prompt = tokenizer.decode(input_ids)
+            input_requests.append([(prompt, int(input_lens[i]), int(output_lens[i]))])
+    else:
+        # Sample token ids from random integers. This can cause some NaN issues.
+        offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+        input_requests = []
+        for i in range(num_prompts):
+            prompt = tokenizer.decode(
+                [
+                    (offsets[i] + i + j) % tokenizer.vocab_size
+                    for j in range(input_lens[i])
+                ]
+            )
+            input_requests.append([(prompt, int(input_lens[i]), int(output_lens[i]))])
+
+    print(f"#Input tokens: {np.sum(input_lens)}")
+    print(f"#Output tokens: {np.sum(output_lens)}")
+    return input_requests
+
+
+def gen_prompt(tokenizer, token_num):
+    """Generate a random prompt of specified token length using tokenizer vocabulary."""
+    all_available_tokens = list(tokenizer.get_vocab().values())
+    selected_tokens = random.choices(all_available_tokens, k=token_num)
+    return tokenizer.decode(selected_tokens)
+
+
+def get_gen_prefix_cache_path(args, tokenizer):
+    """Create cache directory under ~/.cache/sglang/benchmark"""
+    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+    # Create a unique cache filename based on the generation parameters
+    cache_key = (
+        f"gsp_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+        f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
+        f"{tokenizer.__class__.__name__}.pkl"
+    )
+    return cache_dir / cache_key
+
+
+def sample_generated_shared_prefix_requests(
+    num_groups: int,
+    prompts_per_group: int,
+    system_prompt_len: int,
+    question_len: int,
+    output_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+    args,
+    disable_shuffle: bool = False,
+) -> SampleOutput:
+    """Generate benchmark requests with shared system prompts using random tokens and caching."""
+    cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"\nLoading cached generated input data from {cache_path}")
+        with open(cache_path, "rb") as f:
+            return pickle.load(f)
+
+    print("\nGenerating new input data...")
+
+    # Generate system prompts for each group
+    system_prompts = []
+    for _ in range(num_groups):
+        system_prompt = gen_prompt(tokenizer, system_prompt_len)
+        system_prompts.append(system_prompt)
+
+    # Generate questions
+    questions = []
+    for _ in range(num_groups * prompts_per_group):
+        question = gen_prompt(tokenizer, question_len)
+        questions.append(question)
+
+    # Combine system prompts with questions
+    input_requests = []
+    total_input_tokens = 0
+    total_output_tokens = 0
+
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
+        system_prompt = system_prompts[group_idx]
+        input_requests.append([])
+        for prompt_idx in tqdm(
+            range(prompts_per_group), desc="Generating questions", leave=False
+        ):
+            question = questions[group_idx * prompts_per_group + prompt_idx]
+            full_prompt = f"{system_prompt}\n\n{question}"
+            prompt_len = len(tokenizer.encode(full_prompt))
+            input_requests[-1].append((full_prompt, prompt_len, output_len))
+            total_input_tokens += prompt_len
+            total_output_tokens += output_len
+
+    if not disable_shuffle:
+        # Shuffle questions
+        random.shuffle(input_requests)
+
+    # Print statistics
+    print(f"\nGenerated shared prefix dataset statistics:")
+    print(f"Number of groups: {num_groups}")
+    print(f"Prompts per group: {prompts_per_group}")
+    print(f"Total prompts: {len(input_requests) * prompts_per_group}")
+    print(f"Total input tokens: {total_input_tokens}")
+    print(f"Total output tokens: {total_output_tokens}")
+    print(
+        f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
+    )
+    print(
+        f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
+    )
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Caching generated input data to {cache_path}")
+    with open(cache_path, "wb") as f:
+        pickle.dump(input_requests, f)
+
+    return input_requests
+
+
+def get_dataset(args, tokenizer):
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "ultrachat":
+        input_requests = sample_ultrachat_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "loogle":
+        input_requests = sample_loogle_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            enable_shared_prefix=args.enable_shared_prefix,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "nextqa":
+        input_requests = sample_nextqa_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            max_frames=args.max_frames,
+            model_path=args.model,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            backend=args.backend,
+            chat_template_name=args.chat_template,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            dataset_path=args.dataset_path,
+        )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gsp_num_groups,
+            prompts_per_group=args.gsp_prompts_per_group,
+            system_prompt_len=args.gsp_system_prompt_len,
+            question_len=args.gsp_question_len,
+            output_len=args.gsp_output_len,
+            args=args,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    return input_requests
diff --git a/benchmark/hicache/download.sh b/benchmark/hicache/download.sh
new file mode 100755
index 000000000..340bcc3bc
--- /dev/null
+++ b/benchmark/hicache/download.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/bash
+
+# The usage function
+usage() {
+    echo "Usage: $0 {sharegpt|ultragpt|loogle|nextqa|all}"
+    exit 1
+}
+
+# The download function
+download() {
+    case "$1" in
+        sharegpt)
+            echo $1
+            wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+            ;;
+        ultragpt)
+            echo $1
+            # Questions about the world
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/be1d7b87-22ca-449e-a6a7-c61d1ea7e010/ultrachat_release_230407.json
+            # Writing and Creation
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/61742d2a-25e2-4d08-b2b9-15f47ae50ace/ultrachat_material_release_230417.json
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/f71f6aa6-d346-4b16-85b7-8502efa3d608/ultrachat_material_release_230412.json
+            # External materials
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/42d22e28-e899-4975-a70f-5eda163e265d/ultrachat_existent_material_release_230420.json.gz
+            gunzip ultrachat_existent_material_release_230420.json.gz
+            ;;
+        loogle)
+            echo $1
+            git lfs install
+            git clone git@hf.co:datasets/bigainlco/LooGLE
+            unzip LooGLE/data.zip
+            ;;
+        nextqa)
+            echo $1
+            git lfs install
+            git clone https://huggingface.co/datasets/lmms-lab/NExTQA
+            unzip NExTQA/videos.zip
+            ;;
+        *)
+            usage
+            exit 1
+            ;;
+    esac
+}
+
+# Arg check
+if [ "$#" -ne 1 ]; then
+    usage
+fi
+
+# Invoke
+
+case "$1" in
+    sharegpt|ultragpt|loogle|nextqa)
+        download "$1"
+        ;;
+    all)
+        download sharegpt
+        download ultragpt
+        download loogle
+        download nextqa
+        ;;
+    *)
+        usage
+        ;;
+esac
diff --git a/benchmark/hicache/nextqa.py b/benchmark/hicache/nextqa.py
new file mode 100644
index 000000000..4db6caa1c
--- /dev/null
+++ b/benchmark/hicache/nextqa.py
@@ -0,0 +1,159 @@
+import os
+import sys
+from typing import List
+
+import av
+from datasets import load_dataset
+
+
+def find_video_files(video_dir) -> List[str]:
+    if os.path.isfile(video_dir):
+        return [video_dir]
+
+    video_files = []
+    for root, dirs, files in os.walk(video_dir):
+        for file in files:
+            if file.endswith((".mp4", ".avi", ".mov")):
+                video_files.append(os.path.join(root, file))
+            # if file is dir
+            elif os.path.isdir(file):
+                video_files.extend(find_video_files(file))
+    return video_files
+
+
+def video_frames(video_path, max_frames) -> int:
+    container = av.open(video_path)
+    total_frames = container.streams.video[0].frames
+    return min(total_frames, max_frames)
+
+
+class Video:
+    def __init__(self, video_path, num_frames):
+        self.path = video_path
+        self.num_frames = num_frames
+
+    def __str__(self):
+        return f"Video({self.path}, {self.num_frames})"
+
+    def __iter__(self):
+        return iter((self.path, self.num_frames))
+
+
+class VideoPrompt(Video):
+    def __init__(self, video_path, num_frames, prompt):
+        super().__init__(video_path, num_frames)
+        self.prompt = prompt
+
+    def __str__(self):
+        return f"VideoPrompt({self.path}, {self.num_frames}, {self.prompt})"
+
+    def __iter__(self):
+        return iter((self.path, self.num_frames, self.prompt))
+
+
+class VideoLoader:
+    pass
+
+
+class VideoFileLoader(VideoLoader):
+    """
+    Load all the videos in a directory
+    """
+
+    def __init__(self, video_dir, batch_size=1, max_frames=sys.maxsize):
+        super().__init__()
+        self.video_dir = video_dir
+        self.video_files = find_video_files(video_dir)
+        self.batch_size = batch_size
+        self.max_frames = max_frames
+        print(f"batch_size: {batch_size}, max_frames: {max_frames}")
+
+    def __iter__(self):  # (file, number of frames)
+        if self.batch_size == 1:
+            for video_file in self.video_files:
+                yield Video(video_file, video_frames(video_file, self.max_frames))
+        else:
+            batch = []
+            for video_file in self.video_files:
+                video = Video(video_file, video_frames(video_file, self.max_frames))
+                batch.append(video)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+
+
+class NExTQALoader(VideoLoader):
+    """
+    Load vdideos and prompts from NExT dataset
+    set: train, test or validation
+    """
+
+    def __init__(
+        self, video_dir, batch_size=1, max_frames=sys.maxsize, dset="test", task="OE"
+    ):
+        """
+        task: 'MV' or 'OE'
+        """
+        super().__init__()
+        self.task = task
+        print(f"Loading the {dset} data of {task} from lmms-lab/NExTQA")
+        self.ds = load_dataset("lmms-lab/NExTQA", task)
+        self.ds = self.ds[dset]
+
+        # self.n = ds.num_rows
+        self.video_dir = video_dir
+        self.video_files = find_video_files(video_dir)
+        self.video_to_path = dict()
+        for video_file in self.video_files:
+            video_id = video_file.split("/")[-1].split(".")[0]
+            self.video_to_path[video_id] = video_file
+
+        self.batch_size = batch_size
+        self.max_frames = max_frames
+
+    def get_video_prompt(self, entry, max_frames) -> VideoPrompt:
+        # Get video
+        video_id = entry["video"]
+        video_path = self.video_to_path[video_id]
+        assert os.path.exists(video_path), f"Video not found: {video_path}"
+        num_frames = min(entry["frame_count"], max_frames)
+        video = Video(video_path, num_frames)
+        prompt = entry["question"] + "?"
+        if self.task == "MC":  # add choices
+            prompt += f' a0: {entry["a0"]}, a1: {entry["a1"]}, a2: {entry["a2"]}, a3: {entry["a3"]}'
+        return VideoPrompt(video_path, num_frames, prompt)
+
+    def __iter__(self):
+        if self.batch_size == 1:
+            for entry in self.ds:
+                yield self.get_video_prompt(entry, self.max_frames)
+        else:
+            batch = []
+            for entry in self.ds:
+                video = self.get_video_prompt(entry, self.max_frames)
+                batch.append(video)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+
+
+# main
+if __name__ == "__main__":
+    video_dir = "./videos"
+    # video_loader = VideoFileLoader(video_dir, batch_size=16)
+    # for batch in video_loader:
+    #     print(f"Number of videos in batch: {len(batch)}")
+    #     for video_file, num_frames in batch:
+    #         print(f"Video: {video_file} number of frames: {num_frames}")
+
+    video_loader = NExTQALoader(video_dir, batch_size=16, dset="test", task="OE")
+    for batch in video_loader:
+        print(f"Number of videos in batch: {len(batch)}")
+        for video_file, num_frames, prompt in batch:
+            print(
+                f"Video: {video_file} number of frames: {num_frames}, prompt: {prompt}"
+            )
+        # break
+        # for video_file, prompt in batch:
+        #     print(f"Video: {video_file} prompt: {prompt}")
+        #     break
diff --git a/benchmark/json_decode_regex/README.md b/benchmark/json_decode_regex/README.md
new file mode 100644
index 000000000..26acabae9
--- /dev/null
+++ b/benchmark/json_decode_regex/README.md
@@ -0,0 +1,60 @@
+## Run benchmark
+
+### Build dataset
+```
+pip install wikipedia
+python3 build_dataset.py
+```
+
+### Dependencies
+
+```
+llama_cpp_python          0.2.19
+guidance                  0.1.10
+vllm                      0.2.5
+outlines                  0.0.22
+```
+
+### Benchmark sglang
+
+Run Llama-7B
+
+```
+python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+Run Mixtral-8x7B
+
+```
+python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
+```
+
+Benchmark
+
+```
+python3 bench_sglang.py --num-questions 10
+```
+
+
+### Benchmark Outlines + vLLM
+
+Run Llama-7B
+
+```
+python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf  --disable-log-requests --port 21000
+```
+
+Benchmark
+
+```
+python3 bench_other.py --backend outlines --num-questions 10
+```
+
+
+### Benchmark guidance
+
+Run Llama-7B and benchmark
+
+```
+python3 bench_other.py --backend guidance --num-questions 10 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
diff --git a/benchmark/json_decode_regex/bench_other.py b/benchmark/json_decode_regex/bench_other.py
new file mode 100644
index 000000000..87051ea82
--- /dev/null
+++ b/benchmark/json_decode_regex/bench_other.py
@@ -0,0 +1,98 @@
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+from tqdm import tqdm
+
+from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+REGEX_LIST = r"\[(" + REGEX_STR + ", )*" + REGEX_STR + r"\]"
+
+
+# fmt: off
+def json_decode(document, generate):
+    s = "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += "{\n"
+    s += '  "name": '
+    s += generate(s, max_tokens=8, regex=REGEX_STR + ",") + "\n"
+    s += '  "country": '
+    s += generate(s, max_tokens=8, regex=REGEX_STR + ",") + "\n"
+    s += '  "latitude": '
+    s += generate(s, max_tokens=8, regex=REGEX_FLOAT + ",") + "\n"
+    s += '  "population": '
+    s += generate(s, max_tokens=8, regex=REGEX_INT + ",") + "\n"
+    s += '  "top 3 landmarks": '
+    s += generate(s, max_tokens=24, regex=REGEX_LIST) + "\n"
+    s += "}\n"
+
+    return s
+# fmt: on
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    arguments = []
+    for i in range(len(lines[: args.num_questions])):
+        arguments.append(
+            {
+                "document": lines[i]["document"],
+            }
+        )
+    states = [None] * len(arguments)
+
+    # Select backend
+    call_generate = partial(get_call_generate(args), temperature=0)
+
+    # Run requests
+    def get_one_answer(i):
+        states[i] = json_decode(generate=call_generate, **arguments[i])
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(arguments))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            rets = list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(arguments)))),
+                    total=len(arguments),
+                )
+            )
+            for _ in rets:
+                pass
+
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_decode_regex",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=20)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/json_decode_regex/bench_sglang.py b/benchmark/json_decode_regex/bench_sglang.py
new file mode 100644
index 000000000..9aab11e43
--- /dev/null
+++ b/benchmark/json_decode_regex/bench_sglang.py
@@ -0,0 +1,101 @@
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+REGEX_LIST = r"\[(" + REGEX_STR + ", )*" + REGEX_STR + r"\]"
+
+# fmt: off
+@sgl.function
+def json_warm_up(s):
+    s += "The information about Hogwarts is in the following JSON format.\n"
+    with s.var_scope("json_output"):
+        s += "{\n"
+        s += '  "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "latitude": ' + sgl.gen("latitude", max_tokens=8, regex=REGEX_FLOAT + ",") + "\n"
+        s += '  "population": ' + sgl.gen("population", max_tokens=8, regex=REGEX_INT + ",") + "\n"
+        s += '  "top 3 landmarks": ' + sgl.gen( "landmarks", max_tokens=24, regex=REGEX_LIST) + "\n"
+        s += "}\n"
+    print(f'The warmp up json result is:\n{s["json_output"]}')
+# fmt: on
+
+# fmt: off
+@sgl.function
+def json_decode(s, document):
+    s += "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    with s.var_scope("json_output"):
+        s += "{\n"
+        s += '  "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "latitude": ' + sgl.gen("latitude", max_tokens=8, regex=REGEX_FLOAT + ",") + "\n"
+        s += '  "population": ' + sgl.gen("population", max_tokens=8, regex=REGEX_INT + ",") + "\n"
+        s += '  "top 3 landmarks": ' + sgl.gen( "landmarks", max_tokens=24, regex=REGEX_LIST) + "\n"
+        s += "}\n"
+# fmt: on
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    lines = list(lines)
+    arguments = []
+    for i in range(len(lines[: args.num_questions])):
+        arguments.append(
+            {
+                "document": lines[i]["document"],
+            }
+        )
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Warm up
+    json_warm_up.run().sync()
+
+    # Run requests
+    tic = time.perf_counter()
+    states = json_decode.run_batch(
+        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
+    )
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(f"tmp_{args.backend}_json_results.txt", "w") as fout:
+        for state in states:
+            fout.write(state["json_output"] + "\n")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_decode_regex",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=20)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/json_decode_regex/build_dataset.py b/benchmark/json_decode_regex/build_dataset.py
new file mode 100644
index 000000000..1396e5ede
--- /dev/null
+++ b/benchmark/json_decode_regex/build_dataset.py
@@ -0,0 +1,58 @@
+import json
+
+import transformers
+import wikipedia
+
+model_path = "meta-llama/Llama-2-7b-chat-hf"
+t = transformers.AutoTokenizer.from_pretrained(model_path)
+city_names = [
+    "los angles",
+    "london",
+    "tokyo",
+    "beijing",
+    "singapore",
+    "paris",
+    "dubai",
+    "sydney",
+    "moscow",
+    "rome",
+    "toronto",
+    "rio de janeiro",
+    "istanbul",
+    "berlin",
+    "auckland",
+    "buenos aires",
+    "mexico city",
+    "mumbai",
+    "seoul",
+    "bangkok",
+    "cairo",
+    "athens",
+    "jerusalem",
+]
+
+
+def get_content(city_name):
+    content = str(wikipedia.page(city_name).content)
+    content = content.replace("\n\n", "\n")
+
+    tokens = t.encode(content)
+
+    expected_tokens = 3000
+    truncate_len = int((expected_tokens / len(tokens)) * len(content))
+    truncate_content = content[:truncate_len]
+    truncate_tokens = t.encode(truncate_content)
+
+    # Count token
+    print(
+        f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
+    )
+
+    return truncate_content
+
+
+if __name__ == "__main__":
+    with open("questions.jsonl", "w") as fout:
+        for city_name in city_names:
+            truncate_content = get_content(city_name)
+            fout.write(json.dumps({"document": truncate_content}) + "\n")
diff --git a/benchmark/json_jump_forward/README.md b/benchmark/json_jump_forward/README.md
new file mode 100644
index 000000000..38fb67e89
--- /dev/null
+++ b/benchmark/json_jump_forward/README.md
@@ -0,0 +1,88 @@
+## Run benchmark
+
+### Dependencies
+
+```
+llama_cpp_python          0.2.38
+guidance                  0.1.10
+vllm                      0.2.7
+outlines                  0.0.25
+```
+
+### Build dataset
+
+When benchmarking long document information retrieval, run the following command to build the dataset:
+
+```bash
+pip install wikipedia
+python3 build_dataset.py
+```
+
+### Benchmark sglang
+
+Run Llama-7B
+
+```bash
+python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+Benchmark Character Generation
+
+```bash
+python3 bench_sglang.py --mode character
+```
+
+Benchmark City Information Retrieval
+
+```bash
+python3 bench_sglang.py --mode city
+```
+
+
+### Benchmark Outlines + vLLM
+
+Run Llama-7B
+
+```bash
+python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf  --disable-log-requests --port 21000
+```
+
+Benchmark Character Generation
+
+```bash
+python3 bench_other.py --mode character --backend outlines
+```
+
+Benchmark City Information Retrieval
+
+```bash
+python3 bench_other.py --mode city --backend outlines
+```
+
+### Benchmark guidance
+
+Run Llama-7B and benchmark character generation
+
+```bash
+python3 bench_other.py --mode character --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+Run Llama-7B and benchmark city information retrieval
+
+```bash
+python3 bench_other.py --mode city --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+Run Llama-7B and benchmark character generation
+
+```
+python3 bench_other.py --mode character --backend lmql --parallel 1
+```
+
+Run Llama-7B and benchmark city information retrieval
+
+```
+python3 bench_other.py --mode city --backend lmql --parallel 1
+```
diff --git a/benchmark/json_jump_forward/bench_other.py b/benchmark/json_jump_forward/bench_other.py
new file mode 100644
index 000000000..a64e950d7
--- /dev/null
+++ b/benchmark/json_jump_forward/bench_other.py
@@ -0,0 +1,288 @@
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+import guidance
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+# there are some FSM bugs with json regex converted from pydantic model
+# here use a string regex instead
+# regex_string = build_regex_from_object(HarryPoterRole)
+character_regex = (
+    r"""\{\n"""
+    + r"""    "name": "[\w\d\s]{1,16}",\n"""
+    + r"""    "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
+    + r"""    "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
+    + r"""    "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
+    + r"""    "wand": \{\n"""
+    + r"""        "wood": "[\w\d\s]{1,16}",\n"""
+    + r"""        "core": "[\w\d\s]{1,16}",\n"""
+    + r"""        "length": [0-9]{1,2}\.[0-9]{0,2}\n"""
+    + r"""    \},\n"""
+    + r"""    "alive": "(Alive|Deceased)",\n"""
+    + r"""    "patronus": "[\w\d\s]{1,16}",\n"""
+    + r"""    "bogart": "[\w\d\s]{1,16}"\n"""
+    + r"""\}"""
+)
+
+city_regex = (
+    r"""\{\n"""
+    + r"""  "name": "[\w\d\s]{1,16}",\n"""
+    + r"""  "country": "[\w\d\s]{1,16}",\n"""
+    + r"""  "latitude": [-+]?[0-9]*\.?[0-9]{0,2},\n"""
+    + r"""  "population": [-+]?[0-9]{1,9},\n"""
+    + r"""  "top 3 landmarks": \["[\w\d\s]{1,16}", "[\w\d\s]{1,16}", "[\w\d\s]{1,16}"\]\n"""
+    + r"""\}"""
+)
+
+# fmt: off
+def character_gen(name, generate):
+    s = name + " is a character in Harry Potter. Please fill in the following information about this character.\n"
+    s += generate(s, max_tokens=256, regex=character_regex)
+    return s
+# fmt: on
+
+# fmt: off
+def city_gen(document, generate):
+    s = "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += generate(s, max_tokens=256, regex=city_regex)
+    return s
+# fmt: on
+
+
+@guidance
+def character_maker(lm, name):
+    regex_str_no_quote = r"[\w\d\s]+"
+    regex_float = r"[0-9]+\.[0-9]+"
+    lm += f"""\
+    {name} is a character in Harry Potter. Please fill in the following information about this character.
+    {{
+        "name": "{guidance.gen("name", max_tokens=16, regex=regex_str_no_quote)}",
+        "house": "{guidance.select(options=['Gryffindor', 'Slytherin', 'Ravenclaw', 'Hufflepuff'], name='house')}",
+        "blood status": "{guidance.select(options=['Pure-blood', 'Half-blood', 'Muggle-born'], name='blood status')}",
+        "occupation": "{guidance.select(options=['student', 'teacher', 'auror', 'ministry of magic', 'death eater', 'order of the phoenix'], name='occupation')}",
+        "wand": {{
+            "wood": "{guidance.gen("wood", max_tokens=16, regex=regex_str_no_quote)}",
+            "core": "{guidance.gen('core', max_tokens=16, regex=regex_str_no_quote)}",
+            "length": {guidance.gen('length', max_tokens=10, regex=regex_float)}
+        }},
+        "alive": "{guidance.select(options=['Alive', 'Deceased'], name='alive')}",
+        "patronus": "{guidance.gen('patronus', max_tokens=16, regex=regex_str_no_quote)}",
+        "bogart": "{guidance.gen('bogart', max_tokens=16, regex=regex_str_no_quote)}"
+    }}
+    """
+
+    return lm
+
+
+async def call_generate_lmql(
+    prompt, temperature, max_tokens, regex, max_len=4096, model=None, **kwargs
+):
+    assert model is not None
+    import lmql
+
+    @lmql.query(model=model)
+    async def program(question, max_tokens, regex):
+        '''lmql
+        """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens and REGEX(ANSWER, regex)
+        return ANSWER
+        '''
+
+    return await program(
+        question=prompt,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_len=max_len,
+        regex=regex,
+        **kwargs,
+    )
+
+
+@guidance
+def city_maker(lm, document):
+    regex_str_no_quote = r"[\w\d\s]+"
+    regex_float = r"[0-9]+\.[0-9]+"
+    lm += f"""\
+    Please extract the information of a city from the following wikipedia page.
+    Page begin.
+    {document}
+    Page end.
+    Here is the name, country, and symbol of the city in JSON format.
+    {{
+        "name": "{guidance.gen("name", max_tokens=16, regex=regex_str_no_quote)}",
+        "country": "{guidance.gen("country", max_tokens=16, regex=regex_str_no_quote)}",
+        "latitude": {guidance.gen("latitude", max_tokens=10, regex=regex_float)},
+        "population": {guidance.gen("population", max_tokens=10, regex=r"[0-9]+")},
+        "top 3 landmarks": [
+            "{guidance.gen("landmark1", max_tokens=16, regex=regex_str_no_quote)}", "{guidance.gen("landmark2", max_tokens=16, regex=regex_str_no_quote)}", "{guidance.gen("landmark3", max_tokens=16, regex=regex_str_no_quote)}"
+        ]
+    }}
+    """
+
+    return lm
+
+
+def bench_character(args):
+    arguments = []
+    with open(args.data_path, "r") as f:
+        for line in f:
+            arguments.append({"name": line.strip()})
+    arguments = arguments[: args.num_jsons]
+
+    states = [None] * len(arguments)
+
+    # Select backend
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)
+
+        def get_one_answer(i):
+            states[i] = character_gen(**arguments[i], generate=call_generate)
+
+    elif args.backend == "guidance":
+        model = guidance.models.LlamaCpp(
+            args.model_path,
+            n_gpu_layers=-1,
+            n_ctx=args.n_ctx,
+        )
+
+        def get_one_answer(i):
+            lm = model + character_maker(**arguments[i])
+            states[i] = lm
+
+    elif args.backend == "lmql":
+        import asyncio
+
+        import lmql
+
+        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
+        call_generate = partial(
+            call_generate_lmql,
+            model=model,
+            max_tokens=256,
+            regex=character_regex,
+        )
+
+        async def get_one_answer_async(i):
+            states[i] = await call_generate(prompt=arguments[i]["name"], temperature=0)
+
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+    tic = time.perf_counter()
+
+    if args.backend != "lmql":
+        if args.parallel == 1:
+            for i in tqdm(range(len(arguments))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                rets = list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(arguments)))),
+                        total=len(arguments),
+                    )
+                )
+                for _ in rets:
+                    pass
+    else:
+        batches = []
+        for i in range(0, len(arguments), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(arguments)))))
+        loop = asyncio.get_event_loop()
+
+        for bt in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in bt])
+            )
+
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def bench_city_doc(args):
+    arguments = []
+    for line in read_jsonl(args.data_path):
+        arguments.append({"document": line["document"]})
+    arguments = arguments[: args.num_jsons]
+
+    states = [None] * len(arguments)
+
+    # Select backend
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)
+
+        def get_one_answer(i):
+            states[i] = city_gen(**arguments[i], generate=call_generate)
+
+    elif args.backend == "guidance":
+        model = guidance.models.LlamaCpp(
+            args.model_path,
+            n_gpu_layers=-1,
+            n_ctx=args.n_ctx,
+        )
+
+        def get_one_answer(i):
+            lm = model + city_maker(**arguments[i])
+            states[i] = lm
+
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(arguments))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            rets = executor.map(get_one_answer, list(range(len(arguments))))
+            for _ in rets:
+                pass
+
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def main(args):
+    if args.mode == "character":
+        args.data_path = "dataset.txt"
+        states, latency = bench_character(args)
+    elif args.mode == "city":
+        args.data_path = "questions.jsonl"
+        states, latency = bench_city_doc(args)
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}_{args.mode}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_jump_forward",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "num_jsons": args.num_jsons,
+            "mode": args.mode,
+            "parallel": args.parallel,
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str)
+    parser.add_argument("--num-jsons", type=int, default=50)
+    parser.add_argument(
+        "--mode", type=str, default="character", choices=["character", "city"]
+    )
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/json_jump_forward/bench_sglang.py b/benchmark/json_jump_forward/bench_sglang.py
new file mode 100644
index 000000000..29f635f75
--- /dev/null
+++ b/benchmark/json_jump_forward/bench_sglang.py
@@ -0,0 +1,143 @@
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+# there are some FSM bugs with json regex converted from pydantic model
+# here use a string regex instead
+# regex_string = build_regex_from_object(HarryPoterRole)
+character_regex = (
+    r"""\{\n"""
+    + r"""    "name": "[\w\d\s]{1,16}",\n"""
+    + r"""    "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
+    + r"""    "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
+    + r"""    "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
+    + r"""    "wand": \{\n"""
+    + r"""        "wood": "[\w\d\s]{1,16}",\n"""
+    + r"""        "core": "[\w\d\s]{1,16}",\n"""
+    + r"""        "length": [0-9]{1,2}\.[0-9]{0,2}\n"""
+    + r"""    \},\n"""
+    + r"""    "alive": "(Alive|Deceased)",\n"""
+    + r"""    "patronus": "[\w\d\s]{1,16}",\n"""
+    + r"""    "bogart": "[\w\d\s]{1,16}"\n"""
+    + r"""\}"""
+)
+
+city_regex = (
+    r"""\{\n"""
+    + r"""  "name": "[\w\d\s]{1,16}",\n"""
+    + r"""  "country": "[\w\d\s]{1,16}",\n"""
+    + r"""  "latitude": [-+]?[0-9]*\.?[0-9]{0,2},\n"""
+    + r"""  "population": [-+]?[0-9]{1,9},\n"""
+    + r"""  "top 3 landmarks": \["[\w\d\s]{1,16}", "[\w\d\s]{1,16}", "[\w\d\s]{1,16}"\]\n"""
+    + r"""\}"""
+)
+
+# fmt: off
+@sgl.function
+def character_gen(s, name):
+    s += name + " is a character in Harry Potter. Please fill in the following information about this character.\n"
+    s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
+# fmt: on
+
+# fmt: off
+@sgl.function
+def city_gen(s, document):
+    s += "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += sgl.gen("json_output",max_tokens=256, regex=city_regex)
+# fmt: on
+
+
+def bench_city_doc(args):
+    arguments = []
+    for line in read_jsonl(args.data_path):
+        arguments.append({"document": line["document"]})
+    arguments = arguments[: args.num_jsons]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = city_gen.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def bench_character(args):
+    arguments = []
+    with open(args.data_path, "r") as f:
+        for line in f:
+            arguments.append({"name": line.strip()})
+    arguments = arguments[: args.num_jsons]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = character_gen.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def main(args):
+    if args.mode == "character":
+        args.data_path = "dataset.txt"
+        states, latency = bench_character(args)
+    elif args.mode == "city":
+        args.data_path = "questions.jsonl"
+        states, latency = bench_city_doc(args)
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}_{args.mode}.txt", states)
+    with open(f"{args.backend}_{args.mode}.json", "w") as fout:
+        for state in states:
+            fout.write(state["json_output"] + "\n")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_jump_forward",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "num_jsons": args.num_jsons,
+            "mode": args.mode,
+            "parallel": args.parallel,
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str)
+    parser.add_argument("--num-jsons", type=int, default=50)
+    parser.add_argument(
+        "--mode", type=str, default="character", choices=["character", "city"]
+    )
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/json_jump_forward/build_dataset.py b/benchmark/json_jump_forward/build_dataset.py
new file mode 100644
index 000000000..1396e5ede
--- /dev/null
+++ b/benchmark/json_jump_forward/build_dataset.py
@@ -0,0 +1,58 @@
+import json
+
+import transformers
+import wikipedia
+
+model_path = "meta-llama/Llama-2-7b-chat-hf"
+t = transformers.AutoTokenizer.from_pretrained(model_path)
+city_names = [
+    "los angles",
+    "london",
+    "tokyo",
+    "beijing",
+    "singapore",
+    "paris",
+    "dubai",
+    "sydney",
+    "moscow",
+    "rome",
+    "toronto",
+    "rio de janeiro",
+    "istanbul",
+    "berlin",
+    "auckland",
+    "buenos aires",
+    "mexico city",
+    "mumbai",
+    "seoul",
+    "bangkok",
+    "cairo",
+    "athens",
+    "jerusalem",
+]
+
+
+def get_content(city_name):
+    content = str(wikipedia.page(city_name).content)
+    content = content.replace("\n\n", "\n")
+
+    tokens = t.encode(content)
+
+    expected_tokens = 3000
+    truncate_len = int((expected_tokens / len(tokens)) * len(content))
+    truncate_content = content[:truncate_len]
+    truncate_tokens = t.encode(truncate_content)
+
+    # Count token
+    print(
+        f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
+    )
+
+    return truncate_content
+
+
+if __name__ == "__main__":
+    with open("questions.jsonl", "w") as fout:
+        for city_name in city_names:
+            truncate_content = get_content(city_name)
+            fout.write(json.dumps({"document": truncate_content}) + "\n")
diff --git a/benchmark/json_jump_forward/dataset.txt b/benchmark/json_jump_forward/dataset.txt
new file mode 100644
index 000000000..c12421e5d
--- /dev/null
+++ b/benchmark/json_jump_forward/dataset.txt
@@ -0,0 +1,50 @@
+Harry Potter
+Hermione Granger
+Ron Weasley
+Albus Dumbledore
+Severus Snape
+Rubeus Hagrid
+Draco Malfoy
+Ginny Weasley
+Fred Weasley
+George Weasley
+Percy Weasley
+Sirius Black
+Remus Lupin
+Neville Longbottom
+Luna Lovegood
+Cedric Diggory
+Cho Chang
+Lord Voldemort
+Minerva McGonagall
+Filius Flitwick
+Dolores Umbridge
+Bellatrix Lestrange
+Lucius Malfoy
+Molly Weasley
+Arthur Weasley
+Nymphadora Tonks
+Dobby
+Moaning Myrtle
+Peter Pettigrew
+Alastor 'Mad-Eye' Moody
+Horace Slughorn
+Vernon Dursley
+Petunia Dursley
+Dudley Dursley
+Argus Filch
+Sybill Trelawney
+Gilderoy Lockhart
+Fleur Delacour
+Viktor Krum
+Bill Weasley
+Oliver Wood
+Cornelius Fudge
+Barty Crouch Sr.
+Barty Crouch Jr.
+Kingsley Shacklebolt
+Quirinus Quirrell
+Nearly Headless Nick
+Aunt Marge
+Griphook
+Ludo Bagman
diff --git a/benchmark/json_schema/README.md b/benchmark/json_schema/README.md
new file mode 100644
index 000000000..e6790e1fc
--- /dev/null
+++ b/benchmark/json_schema/README.md
@@ -0,0 +1,15 @@
+## Run benchmark
+
+### Benchmark sglang
+
+Run Llama-8b
+
+```bash
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
+```
+
+Benchmark
+
+```bash
+python3 bench_sglang.py
+```
diff --git a/benchmark/json_schema/bench_sglang.py b/benchmark/json_schema/bench_sglang.py
new file mode 100644
index 000000000..55365ff2e
--- /dev/null
+++ b/benchmark/json_schema/bench_sglang.py
@@ -0,0 +1,146 @@
+import argparse
+import json
+import time
+from typing import List, Tuple
+
+import jsonschema
+from datasets import load_dataset
+
+import sglang as sgl
+from sglang.global_config import global_config
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text
+
+
+@sgl.function
+def schema_gen(s, message: Tuple[str, str], json_schema: str):
+    system, user = message
+    s += sgl.system(system)
+    s += sgl.user(user)
+    s += sgl.assistant(
+        sgl.gen("json_output", temperature=0, max_tokens=256, json_schema=json_schema)
+    )
+
+
+def contains_formats(schema, formats: List[str]):
+    if isinstance(schema, dict):
+        if schema.get("format", None) in formats:
+            return True
+        for value in schema.values():
+            if contains_formats(value, formats):
+                return True
+    elif isinstance(schema, list):
+        for item in schema:
+            if contains_formats(item, formats):
+                return True
+    return False
+
+
+def convert_dataset(path: str):
+    raw_dataset = load_dataset(path)
+    dataset = []
+    for data in raw_dataset["train"]:
+        messages = data["prompt"]
+        schema = data["schema"]
+        obj = json.loads(schema)
+
+        # skip some corrupted examples
+        if obj.get("type", None) is None:
+            continue
+
+        # skip schema with format "email"
+        # which is not supported by outlines for now
+        if contains_formats(obj, ["email"]):
+            continue
+
+        system = messages[0]
+        user = messages[1]
+        assert system["role"] == "system", "invalid role"
+        assert user["role"] == "user", "invalid role"
+        assert len(messages) == 2, "invalid message length"
+        message = json.dumps(system["content"]), json.dumps(user["content"])
+        dataset.append(
+            {
+                "message": message,
+                "json_schema": schema,
+            }
+        )
+
+    return dataset
+
+
+def bench_schema(args):
+    arguments = convert_dataset(args.data_path)
+
+    if args.num_jsons < 0 or args.num_jsons > len(arguments):
+        args.num_jsons = len(arguments)
+    arguments = arguments[: args.num_jsons]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = schema_gen.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    # Check if the outputs are valid
+    indexes = []
+    for i, state in enumerate(states):
+        try:
+            schema = json.loads(arguments[i]["json_schema"])
+            obj = json.loads(state["json_output"])
+            assert jsonschema.validate(obj, schema) is None
+        except Exception as e:
+            print(e)
+            indexes.append(i)
+
+    return states, latency
+
+
+def main(args):
+    states, latency = bench_schema(args)
+
+    # Compute accuracy
+    tokenizer = get_tokenizer(
+        global_config.default_backend.get_server_info()["tokenizer_path"]
+    )
+    output_jsons = [state["json_output"] for state in states]
+    num_output_tokens = sum(len(tokenizer.encode(x)) for x in output_jsons)
+    print(f"Latency: {latency:.3f}")
+    print(f"Output throughput: {num_output_tokens / latency:.3f} token/s")
+    print(f"#output tokens: {num_output_tokens}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+    with open(f"{args.backend}.jsonl", "w") as fout:
+        for state in states:
+            fout.write(state["json_output"] + "\n")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_schema",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "num_jsons": args.num_jsons,
+            "parallel": args.parallel,
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="NousResearch/json-mode-eval")
+    parser.add_argument("--num-jsons", type=int, default=-1)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/kernels/all_reduce/benchmark_mscclpp.py b/benchmark/kernels/all_reduce/benchmark_mscclpp.py
new file mode 100644
index 000000000..eebbd00ce
--- /dev/null
+++ b/benchmark/kernels/all_reduce/benchmark_mscclpp.py
@@ -0,0 +1,224 @@
+"""For Now, MSCCL is only supported on TP16 and TP8 case
+
+export WORLD_SIZE=1
+export RANK=0
+export MASTER_ADDR=127.0.0.1
+export MASTER_PORT=12345
+
+torchrun --nproc_per_node gpu \
+--nnodes $WORLD_SIZE \
+--node_rank $RANK \
+--master_addr $MASTER_ADDR \
+--master_port $MASTER_PORT benchmark/kernels/all_reduce/benchmark_mscclpp.py
+"""
+
+import os
+from contextlib import nullcontext
+from typing import List
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.distributed import init_distributed_environment
+from sglang.srt.distributed.device_communicators.pymscclpp import PyMscclppCommunicator
+from sglang.srt.distributed.device_communicators.pynccl import PyNcclCommunicator
+from sglang.srt.distributed.parallel_state import (
+    get_tensor_model_parallel_group,
+    graph_capture,
+    initialize_model_parallel,
+    set_mscclpp_all_reduce,
+)
+
+
+def torch_allreduce(torch_input: torch.Tensor, group: ProcessGroup) -> torch.Tensor:
+    dist.all_reduce(torch_input, group=group)
+    return torch_input
+
+
+def msccl_allreduce(
+    msccl_input: torch.Tensor, msccl_comm: PyMscclppCommunicator
+) -> torch.Tensor:
+    return msccl_comm.all_reduce(msccl_input)
+
+
+def pynccl_allreduce(
+    msccl_input: torch.Tensor, pynccl_comm: PyNcclCommunicator
+) -> torch.Tensor:
+    pynccl_comm.all_reduce(msccl_input)
+    return msccl_input
+
+
+def _bench_graph_time(func, inp_randn, warmup_loop=2, graph_loop=10, test_loop=10):
+    graph_input = inp_randn.clone()
+    with graph_capture() as graph_capture_context:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph, stream=graph_capture_context.stream):
+            for _ in range(graph_loop):
+                graph_out = func(graph_input)
+
+    graph.replay()
+    func_output = graph_out.clone()
+
+    for _ in range(warmup_loop):
+        graph.replay()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies: List[float] = []
+    for _ in range(test_loop):
+        torch.cuda.synchronize()
+        dist.barrier()
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    func_cost_us = sum(latencies) / len(latencies) / graph_loop * 1000
+    graph.reset()
+    return func_output, func_cost_us
+
+
+def _bench_eager_time(func, inp_randn, warmup_loop=2, test_loop=10):
+    eager_input = inp_randn.clone()
+    eager_output = func(eager_input)
+    func_output = eager_output.clone()
+
+    for _ in range(warmup_loop):
+        func(eager_input)
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    start_event.record()
+    for _ in range(test_loop):
+        func(eager_input)
+    end_event.record()
+    torch.cuda.synchronize()
+    func_cost_us = start_event.elapsed_time(end_event) / test_loop * 1000
+
+    return func_output, func_cost_us
+
+
+def get_torch_prof_ctx(do_prof: bool):
+    ctx = (
+        torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            record_shapes=True,
+            with_stack=True,
+        )
+        if do_prof
+        else nullcontext()
+    )
+    return ctx
+
+
+def human_readable_size(size, decimal_places=1):
+    for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
+        if size < 1024.0 or unit == "PiB":
+            break
+        size /= 1024.0
+    return f"{size:.{decimal_places}f} {unit}"
+
+
+try:
+    from tabulate import tabulate
+except ImportError:
+    print("tabulate not installed, skipping table printing")
+    tabulate = None
+
+
+def print_markdown_table(data):
+    if tabulate is not None:
+        print(tabulate(data, headers="keys", tablefmt="github"))
+        return
+    headers = data[0].keys()
+    header_row = "| " + " | ".join(headers) + " |"
+    separator = "| " + " | ".join(["---"] * len(headers)) + " |"
+    rows = []
+    for item in data:
+        row = "| " + " | ".join(str(item[key]) for key in headers) + " |"
+        rows.append(row)
+    markdown_table = "\n".join([header_row, separator] + rows)
+    print(markdown_table)
+
+
+if __name__ == "__main__":
+    import logging
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        force=True,
+    )
+    if not dist.is_initialized():
+        dist.init_process_group(backend="nccl")
+    world, world_size = dist.group.WORLD, dist.get_world_size()
+    rank = dist.get_rank()
+    torch.cuda.set_device(rank % 8)
+    device = torch.cuda.current_device()
+    set_mscclpp_all_reduce(True)
+    init_distributed_environment(
+        world_size=world_size,
+        rank=rank,
+        local_rank=rank % 8,
+    )
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    group = get_tensor_model_parallel_group().device_group
+    cpu_group = get_tensor_model_parallel_group().cpu_group
+    pynccl_comm = get_tensor_model_parallel_group().pynccl_comm
+    pymscclpp_comm = get_tensor_model_parallel_group().pymscclpp_comm
+    dist.barrier()
+    profile = False
+    dtype = torch.bfloat16
+    ctx = get_torch_prof_ctx(profile)
+    result = []
+
+    with ctx:
+        for i in range(10, 20):
+            sz = 2**i
+            if sz * dtype.itemsize > 2**20:
+                break
+            inp_randn = torch.randint(1, 16, (sz,), dtype=dtype, device=device)
+
+            memory = torch.empty_like(inp_randn)
+            memory_out = torch.empty_like(memory)
+            torch_eager_output, torch_eager_time = _bench_eager_time(
+                lambda inp: torch_allreduce(inp, group), inp_randn
+            )
+            msccl_eager_output, msccl_eager_time = _bench_eager_time(
+                lambda inp: msccl_allreduce(inp, pymscclpp_comm), inp_randn
+            )
+            msccl_graph_output, msccl_graph_time = _bench_graph_time(
+                lambda inp: msccl_allreduce(inp, pymscclpp_comm), inp_randn
+            )
+            # since pynccl is inplace op, this return result is not correct if graph loop > 1
+            _, pynccl_graph_time = _bench_graph_time(
+                lambda inp: pynccl_allreduce(inp, pynccl_comm), inp_randn
+            )
+            torch.testing.assert_close(torch_eager_output, msccl_graph_output)
+            torch.testing.assert_close(torch_eager_output, msccl_eager_output)
+            result.append(
+                {
+                    "msg_size": human_readable_size(inp_randn.nbytes),
+                    "torch eager time": torch_eager_time,
+                    "msccl eager time": msccl_eager_time,
+                    "msccl graph time": msccl_graph_time,
+                    "pynccl graph time": pynccl_graph_time,
+                }
+            )
+            if rank == 0:
+                print(f"sz={sz}, dtype={dtype}: correctness check PASS!")
+    if rank == 0:
+        print_markdown_table(result)
+    if profile:
+        prof_dir = f"prof/msccl"
+        os.makedirs(prof_dir, exist_ok=True)
+        ctx.export_chrome_trace(f"{prof_dir}/trace_rank{dist.get_rank()}.json.gz")
diff --git a/benchmark/kernels/decoding_attention_triton/triton_flashinfer_cudnn.py b/benchmark/kernels/decoding_attention_triton/triton_flashinfer_cudnn.py
new file mode 100644
index 000000000..b61240b05
--- /dev/null
+++ b/benchmark/kernels/decoding_attention_triton/triton_flashinfer_cudnn.py
@@ -0,0 +1,403 @@
+import itertools
+import math
+
+import cudnn
+import torch
+import torch.utils.benchmark as benchmark
+from flashinfer import BatchDecodeWithPagedKVCacheWrapper
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import decode_attention_fwd
+from sglang.srt.utils import should_use_tensor_core
+
+
+def benchmark_forward(
+    fn,
+    *inputs,
+    repeats=10,
+    amp=False,
+    amp_dtype=torch.float16,
+    **kwinputs,
+):
+    def amp_wrapper(*inputs, **kwinputs):
+        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
+            fn(*inputs, **kwinputs)
+
+    t = benchmark.Timer(
+        stmt="fn_amp(*inputs, **kwinputs)",
+        globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
+        num_threads=torch.get_num_threads(),
+    )
+    m = t.timeit(repeats)
+    return t, m
+
+
+def time_fwd(func, *args, **kwargs):
+    time_f = benchmark_forward(func, *args, **kwargs)
+    return time_f[1].mean * 1e6
+
+
+def decode_attention_sglang(
+    q,
+    kv_data,
+    batch_size,
+    kv_len,
+    head_num_q,
+    head_num_kv,
+    head_dim,
+    num_kv_splits,
+    warmup=10,
+):
+
+    k_buffer = kv_data[0].view(-1, head_num_kv, head_dim)
+    v_buffer = kv_data[1].view(-1, head_num_kv, head_dim)
+    o = torch.empty_like(q)
+    total_tokens = batch_size * kv_len
+    req_to_token = torch.arange(0, total_tokens).to(0).int().view(batch_size, kv_len)
+    b_req_idx = torch.arange(0, batch_size).to(0).int()
+    b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32, device="cuda")
+    max_len_in_batch = kv_len
+    sm_scale = 1.0 / (head_dim**0.5)
+
+    attn_logits = torch.empty(
+        (batch_size, head_num_q, num_kv_splits, head_dim + 1),
+        dtype=torch.float32,
+        device="cuda",
+    )
+
+    for _ in range(warmup):
+        decode_attention_fwd(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            req_to_token,
+            b_req_idx,
+            b_seq_len,
+            attn_logits,
+            num_kv_splits,
+            sm_scale,
+        )
+
+    f = time_fwd(
+        decode_attention_fwd,
+        q,
+        k_buffer,
+        v_buffer,
+        o,
+        req_to_token,
+        b_req_idx,
+        b_seq_len,
+        attn_logits,
+        num_kv_splits,
+        sm_scale,
+    )
+
+    return f, o
+
+
+def decode_attention_flashinfer(dtype, head_num_q, head_num_kv):
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    use_tensor_cores = should_use_tensor_core(
+        kv_cache_dtype=dtype,
+        num_attention_heads=head_num_q,
+        num_kv_heads=head_num_kv,
+    )
+    flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores
+    )
+
+    class FlashinferAttention(torch.autograd.Function):
+        @staticmethod
+        def forward(
+            ctx,
+            q,
+            kv_data,
+            batch_size,
+            kv_len,
+            head_num_q,
+            head_num_kv,
+            head_dim,
+            dtype,
+            warmup=10,
+        ):
+            total_tokens = batch_size * kv_len
+            kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
+            kv_indices = torch.arange(0, total_tokens).to(0).int()
+            kv_last_page_len = torch.full(
+                (batch_size,), 1, dtype=torch.int32, device="cuda"
+            )
+
+            flashinfer_decode_wrapper.end_forward()
+            flashinfer_decode_wrapper.begin_forward(
+                kv_indptr,
+                kv_indices,
+                kv_last_page_len,
+                head_num_q,
+                head_num_kv,
+                head_dim,
+                1,
+                pos_encoding_mode="NONE",
+                data_type=dtype,
+            )
+
+            for _ in range(warmup):
+                o = flashinfer_decode_wrapper.forward(
+                    q.contiguous().view(-1, head_num_q, head_dim), kv_data
+                )
+
+            f = time_fwd(
+                flashinfer_decode_wrapper.forward,
+                q.contiguous().view(-1, head_num_q, head_dim),
+                kv_data,
+            )
+
+            return f, o
+
+    return FlashinferAttention
+
+
+def convert_to_cudnn_type(torch_type):
+    if torch_type == torch.float16:
+        return cudnn.data_type.HALF
+    elif torch_type == torch.bfloat16:
+        return cudnn.data_type.BFLOAT16
+    elif torch_type == torch.float32:
+        return cudnn.data_type.FLOAT
+    elif torch_type == torch.int32:
+        return cudnn.data_type.INT32
+    elif torch_type == torch.int64:
+        return cudnn.data_type.INT64
+    else:
+        raise ValueError("Unsupported tensor data type.")
+
+
+def decode_attention_cudnn(
+    q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype, warmup=10
+):
+    # Prepare data: continuous q,k,v
+    dims_q = (batch_size, head_num_q, 1, head_dim)
+    strides_q = (head_num_q * head_dim, head_dim, head_num_q * head_dim, 1)
+    q_gpu = q.as_strided(dims_q, strides_q)
+    o_gpu = (
+        torch.empty(batch_size * head_num_q * head_dim)
+        .half()
+        .cuda()
+        .as_strided(dims_q, strides_q)
+    )
+
+    dims_kv = (batch_size, head_num_kv, kv_len, head_dim)
+    strides_kv = (
+        kv_len * head_num_kv * head_dim,
+        head_dim,
+        head_num_kv * head_dim,
+        1,
+    )
+    k_gpu = kv_data[0].as_strided(dims_kv, strides_kv)
+    v_gpu = kv_data[1].as_strided(dims_kv, strides_kv)
+
+    seq_len_q_gpu = torch.full((batch_size, 1, 1, 1), 1, device="cuda")
+    seq_len_kv_gpu = torch.full((batch_size, 1, 1, 1), kv_len, device="cuda")
+    attn_scale = 1.0 / (head_dim**0.5)
+
+    # Prepare data: paged k,v
+    block_size = 1
+    blocks_per_batch = math.ceil(kv_len / block_size)
+    # [num_blocks, head_num_kv, block_size, head_dim], num_blocks = batch_size * blocks_per_batch
+    container_k_gpu = torch.cat(k_gpu.chunk(blocks_per_batch, dim=2), dim=0)
+    container_v_gpu = torch.cat(v_gpu.chunk(blocks_per_batch, dim=2), dim=0)
+    page_table_k_gpu = (
+        torch.linspace(
+            0,
+            batch_size * blocks_per_batch - 1,
+            batch_size * blocks_per_batch,
+            device="cuda",
+            dtype=torch.int32,
+        )
+        .reshape(blocks_per_batch, 1, batch_size, 1)
+        .transpose(0, 2)
+    )
+    page_table_v_gpu = page_table_k_gpu.clone()
+
+    graph = cudnn.pygraph(
+        io_data_type=convert_to_cudnn_type(dtype),
+        intermediate_data_type=cudnn.data_type.FLOAT,
+        compute_data_type=cudnn.data_type.FLOAT,
+    )
+
+    q = graph.tensor_like(q_gpu)
+    container_k = graph.tensor_like(container_k_gpu)
+    container_v = graph.tensor_like(container_v_gpu)
+    page_table_k = graph.tensor_like(page_table_k_gpu)
+    page_table_v = graph.tensor_like(page_table_v_gpu)
+
+    seq_len_q = graph.tensor_like(seq_len_q_gpu)
+    seq_len_kv = graph.tensor_like(seq_len_kv_gpu)
+
+    o, _ = graph.sdpa(
+        name="sdpa",
+        q=q,
+        k=container_k,  # Container K: non contiguous container with K blocks
+        v=container_v,  # Container V: non contiguous container with V blocks
+        is_inference=True,
+        attn_scale=attn_scale,
+        use_causal_mask=False,
+        use_padding_mask=True,
+        seq_len_q=seq_len_q,
+        seq_len_kv=seq_len_kv,
+        paged_attention_k_table=page_table_k,  # Page Table K: Tensor containing offsets to the container with K blocks
+        paged_attention_v_table=page_table_v,  # Page Table V: Tensor containing offsets to the container with V blocks
+        paged_attention_max_seq_len_kv=kv_len,  # The maximum sequence length for K caches (this is optional, but recommended)
+    )
+
+    o.set_output(True).set_dim(dims_q).set_stride(strides_q)
+
+    graph.validate()
+    graph.build_operation_graph()
+    graph.create_execution_plans([cudnn.heur_mode.A])
+    graph.check_support()
+    graph.build_plans()
+
+    workspace = torch.empty(
+        graph.get_workspace_size(), device="cuda", dtype=torch.uint8
+    )
+
+    variant_pack = {
+        q: q_gpu,
+        container_k: container_k_gpu,
+        container_v: container_v_gpu,
+        page_table_k: page_table_k_gpu,
+        page_table_v: page_table_v_gpu,
+        seq_len_q: seq_len_q_gpu,
+        seq_len_kv: seq_len_kv_gpu,
+        o: o_gpu,
+    }
+
+    for _ in range(warmup):
+        graph.execute(variant_pack, workspace)
+
+    f = time_fwd(
+        graph.execute,
+        variant_pack,
+        workspace,
+    )
+
+    return f, o_gpu.squeeze(dim=2)
+
+
+def calculate_diff():
+
+    dtype = torch.float16
+    batch_size = 64
+    kv_len = 4096
+    head_num_q = 64
+    head_num_kv = 8
+    head_dim = 128
+
+    q = torch.randn(batch_size, head_num_q, head_dim, dtype=dtype, device="cuda")
+    kv_data = (
+        torch.randn(
+            batch_size * kv_len, head_num_kv, head_dim, dtype=dtype, device="cuda"
+        ),
+        torch.randn(
+            batch_size * kv_len, head_num_kv, head_dim, dtype=dtype, device="cuda"
+        ),
+    )
+
+    _, output_sglang = decode_attention_sglang(
+        q,
+        kv_data,
+        batch_size,
+        kv_len,
+        head_num_q,
+        head_num_kv,
+        head_dim,
+        num_kv_splits=8,
+    )
+
+    attn_flashinfer = decode_attention_flashinfer(dtype, head_num_q, head_num_kv).apply
+    _, output_flashinfer = attn_flashinfer(
+        q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+    )
+
+    _, output_cudnn = decode_attention_cudnn(
+        q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+    )
+
+    print(f"SGLang output={output_sglang}")
+    print(f"FlashInfer output={output_flashinfer}")
+    print(f"cuDNN output={output_cudnn}")
+    if torch.allclose(output_sglang, output_flashinfer, atol=1e-2, rtol=1e-2):
+        print("✅ SGLang[Triton] and FlashInfer match")
+    else:
+        print("❌ SGLang[Triton] and FlashInfer differ")
+
+    if torch.allclose(output_sglang, output_cudnn, atol=1e-2, rtol=1e-2):
+        print("✅ SGLang[Triton] and cuDNN match")
+    else:
+        print("❌ SGLang[Triton] and cuDNN differ")
+
+
+if __name__ == "__main__":
+    calculate_diff()
+
+    head_dim = 128
+    dtype = torch.float16
+    batch_size_range = [2**i for i in range(0, 8, 2)]
+    kv_len_range = [2**i for i in range(6, 13, 1)]
+    configs = list(itertools.product(batch_size_range, kv_len_range))
+
+    for head_num_q, head_num_kv in [[32, 32], [64, 8], [40, 8]]:
+        attn_flashinfer = decode_attention_flashinfer(
+            dtype, head_num_q, head_num_kv
+        ).apply
+        for batch_size, kv_len in configs:
+            q = torch.randn(
+                batch_size, head_num_q, head_dim, dtype=dtype, device="cuda"
+            )
+            kv_data = (
+                torch.randn(
+                    batch_size * kv_len,
+                    head_num_kv,
+                    head_dim,
+                    dtype=dtype,
+                    device="cuda",
+                ),
+                torch.randn(
+                    batch_size * kv_len,
+                    head_num_kv,
+                    head_dim,
+                    dtype=dtype,
+                    device="cuda",
+                ),
+            )
+            us_cudnn, output_cudnn = decode_attention_cudnn(
+                q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+            )
+            us_sglang, output_sglang = decode_attention_sglang(
+                q,
+                kv_data,
+                batch_size,
+                kv_len,
+                head_num_q,
+                head_num_kv,
+                head_dim,
+                num_kv_splits=8,
+            )
+            us_flashinfer, _ = attn_flashinfer(
+                q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+            )
+            print(
+                head_num_q,
+                "  ",
+                head_num_kv,
+                "  ",
+                batch_size,
+                "  ",
+                kv_len,
+                "  ",
+                us_cudnn,
+                "  ",
+                us_sglang,
+                "  ",
+                us_flashinfer,
+            )
diff --git a/benchmark/kernels/deepep/deepep_utils.py b/benchmark/kernels/deepep/deepep_utils.py
new file mode 100644
index 000000000..169529ef6
--- /dev/null
+++ b/benchmark/kernels/deepep/deepep_utils.py
@@ -0,0 +1,218 @@
+# ADAPTED FROM https://github.com/deepseek-ai/DeepEP/blob/main/tests/utils.py
+
+import os
+import sys
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def init_dist(local_rank: int, num_local_ranks: int, args):
+    ip = args.master_addr
+    port = args.master_port
+    num_nodes = args.nnodes
+    node_rank = args.node_rank
+    assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
+
+    dist.init_process_group(
+        backend="nccl",
+        init_method=f"tcp://{ip}:{port}",
+        world_size=num_nodes * num_local_ranks,
+        rank=node_rank * num_local_ranks + local_rank,
+    )
+    torch.set_default_dtype(torch.bfloat16)
+    torch.set_default_device("cuda")
+    torch.cuda.set_device(local_rank)
+
+    return (
+        dist.get_rank(),
+        dist.get_world_size(),
+        dist.new_group(list(range(num_local_ranks * num_nodes))),
+    )
+
+
+def calc_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double() + 1, y.double() + 1
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return (1 - sim).item()
+
+
+def per_token_cast_to_fp8(x: torch.Tensor):
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
+        m, n
+    ), (x_amax / 448.0).view(m, -1)
+
+
+def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
+    x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
+    x_scales = x_scales.view(x_fp8.size(0), -1, 1)
+    return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
+
+
+def inplace_unique(x: torch.Tensor, num_slots: int):
+    assert x.dim() == 2
+    mask = x < 0
+    x_padded = x.masked_fill(mask, num_slots)
+    bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
+    bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
+    bin_count = bin_count[:, :num_slots]
+    sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
+    sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
+    sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
+    x[:, :].fill_(-1)
+    valid_len = min(num_slots, x.size(1))
+    x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
+
+
+def create_grouped_scores(
+    scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int
+):
+    num_tokens, num_experts = scores.shape
+    scores = scores.view(num_tokens, num_groups, -1)
+    mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
+    mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
+    return (scores * mask).view(num_tokens, num_experts)
+
+
+def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
+    # Flush L2 cache with 256 MB data
+    torch.cuda.synchronize()
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+
+    # Warmup
+    for _ in range(num_warmups):
+        fn()
+
+    # Flush L2
+    cache.zero_()
+
+    # Testing
+    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    for i in range(num_tests):
+        # Record
+        start_events[i].record()
+        fn()
+        end_events[i].record()
+        if post_fn is not None:
+            post_fn()
+    torch.cuda.synchronize()
+
+    times = np.array(
+        [s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)]
+    )[1:]
+    return np.average(times), np.min(times), np.max(times)
+
+
+class empty_suppress:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_):
+        pass
+
+
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: Optional[str] = None,
+    barrier_comm_profiling: bool = False,
+):
+    # Profile
+    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
+    with suppress():
+        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+        ) as prof:
+            for i in range(2):
+                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
+                if barrier_comm_profiling:
+                    lhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    rhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    lhs @ rhs
+                    dist.all_reduce(torch.ones(1, dtype=torch.float, device="cuda"))
+                for _ in range(num_tests):
+                    fn()
+                prof.step()
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tupled = isinstance(kernel_names, tuple)
+    prof_lines = (
+        prof.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    for name in kernel_names:
+        assert (
+            sum([name in line for line in prof_lines]) == 1
+        ), f"Errors of the kernel {name} in the profiling table"
+
+    # Save chrome traces
+    if trace_path is not None:
+        prof.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        kernel_times.append(float(time_str.replace(unit, "")) / scale)
+                        break
+                break
+    return tuple(kernel_times) if is_tupled else kernel_times[0]
+
+
+def hash_tensor(t: torch.Tensor):
+    return t.view(torch.int64).sum().item()
diff --git a/benchmark/kernels/deepep/tuning_deepep.py b/benchmark/kernels/deepep/tuning_deepep.py
new file mode 100644
index 000000000..bb900a875
--- /dev/null
+++ b/benchmark/kernels/deepep/tuning_deepep.py
@@ -0,0 +1,476 @@
+# MODIFIED FROM https://github.com/deepseek-ai/DeepEP/blob/main/tests/test_internode.py
+
+"""
+Example usage:
+python tuning_deepep.py --nnodes 4 --node-rank $MY_NODE_RANK --master-addr 1.2.3.4
+Then check `deepep_tuned.json`
+"""
+
+import argparse
+import json
+import time
+from copy import deepcopy
+from pathlib import Path
+
+# noinspection PyUnresolvedReferences
+import deep_ep
+import torch
+import torch.distributed as dist
+from deepep_utils import (
+    bench,
+    calc_diff,
+    create_grouped_scores,
+    init_dist,
+    inplace_unique,
+    per_token_cast_back,
+    per_token_cast_to_fp8,
+)
+
+
+def test_main(
+    num_sms: int,
+    local_rank: int,
+    num_local_ranks: int,
+    num_ranks: int,
+    num_nodes: int,
+    rank: int,
+    buffer: deep_ep.Buffer,
+    group: dist.ProcessGroup,
+    args,
+):
+    # Settings
+    num_tokens, hidden, num_topk_groups, num_topk, num_experts = (
+        4096,
+        7168,
+        min(num_nodes, 4),
+        8,
+        (256 // num_ranks) * num_ranks,
+    )
+    assert num_experts % num_ranks == 0 and num_local_ranks == 8
+    if local_rank == 0:
+        print(
+            f"[config] num_tokens={num_tokens}, hidden={hidden}, num_topk_groups={num_topk_groups}, num_topk={num_topk}",
+            flush=True,
+        )
+
+    # Random data
+    x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device="cuda") * rank
+    x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device="cuda")
+    x_e4m3 = per_token_cast_to_fp8(x)
+    scores = (
+        torch.randn((num_tokens, num_experts), dtype=torch.float32, device="cuda").abs()
+        + 1
+    )
+    group_scores = scores.view(num_tokens, num_nodes, -1).amax(dim=-1)
+    group_idx = torch.topk(
+        group_scores, k=num_topk_groups, dim=-1, sorted=False
+    ).indices
+    masked_scores = create_grouped_scores(scores, group_idx, num_nodes)
+    topk_idx = torch.topk(masked_scores, num_topk, dim=-1, largest=True, sorted=False)[
+        1
+    ]
+    topk_weights = (
+        torch.ones((num_tokens, num_topk), dtype=torch.float32, device="cuda") * rank
+    )
+    topk_weights_pure_rand = torch.randn(
+        (num_tokens, num_topk), dtype=torch.float32, device="cuda"
+    )
+    rank_idx = topk_idx // (num_experts // num_ranks)
+    rank_idx.masked_fill_(topk_idx == -1, -1)
+    inplace_unique(rank_idx, num_ranks)
+    rdma_rank_idx = rank_idx // num_local_ranks
+    rdma_rank_idx.masked_fill_(rank_idx == -1, -1)
+    inplace_unique(rdma_rank_idx, num_nodes)
+
+    # RDMA dispatch counts
+    rdma_idx = topk_idx // (num_experts // num_nodes)
+    rdma_idx.masked_fill_(topk_idx == -1, -1)
+    inplace_unique(rdma_idx, num_nodes)
+    num_rdma_token_sent = rdma_idx.ne(-1).sum().item()
+
+    # Expert meta
+    num_tokens_per_expert = torch.zeros((num_experts,), dtype=torch.int, device="cuda")
+    for i in range(num_experts):
+        num_tokens_per_expert[i] = (topk_idx == i).sum()
+    gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
+    dist.all_reduce(gbl_num_tokens_per_expert, group=group)
+
+    # Rank layout meta
+    num_tokens_per_rank = torch.empty((num_ranks,), dtype=torch.int, device="cuda")
+    num_tokens_per_rdma_rank = torch.empty((num_nodes,), dtype=torch.int, device="cuda")
+    token_idx_in_rank = torch.full(
+        (num_ranks, num_tokens), -1, dtype=torch.long, device="cuda"
+    )
+    for i in range(num_ranks):
+        num_tokens_per_rank[i] = (rank_idx == i).sum()
+        token_sel = (rank_idx == i).max(dim=-1)[0]
+        count = token_sel.sum().item()
+        tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
+        tokens[:count] = torch.sort(tokens[:count])[0]
+        token_idx_in_rank[i][tokens[:count]] = torch.arange(
+            count, dtype=torch.long, device="cuda"
+        )
+    for i in range(num_nodes):
+        num_tokens_per_rdma_rank[i] = (rdma_rank_idx == i).sum()
+    token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
+    is_token_in_rank = token_idx_in_rank >= 0
+    gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
+    dist.all_reduce(gbl_num_tokens_per_rank, group=group)
+
+    (
+        ref_num_tokens_per_rank,
+        ref_num_tokens_per_rdma_rank,
+        ref_num_tokens_per_expert,
+        ref_is_token_in_rank,
+        _,
+    ) = buffer.get_dispatch_layout(topk_idx, num_experts)
+    assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
+    assert torch.allclose(ref_num_tokens_per_rdma_rank, num_tokens_per_rdma_rank)
+    assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
+    assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
+    t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
+    if local_rank == 0:
+        print(f"[layout] Kernel performance: {t * 1000:.3f} ms", flush=True)
+        print("", flush=True)
+    group.barrier()
+    time.sleep(1)
+
+    # Config
+    rdma_buffer_size, nvl_buffer_size = 128, (720 if num_ranks in (144, 160) else 512)
+    config = deep_ep.Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size)
+
+    # Test dispatch
+    # noinspection PyShadowingNames
+    def check_data(check_x, recv_gbl_rank_prefix_sum):
+        assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
+        check_start = 0
+        for i in range(num_ranks):
+            check_end = recv_gbl_rank_prefix_sum[i].item()
+            assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
+            check_start = check_end
+
+    for previous_mode in (False, True):
+        for async_mode in (False, True):
+            for current_x in (x_pure_rand, x, x_e4m3):
+                for with_topk in (False, True):
+                    if local_rank == 0:
+                        print(
+                            f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...',
+                            flush=True,
+                            end="",
+                        )
+                    dispatch_args = {
+                        "x": current_x,
+                        "num_tokens_per_rank": num_tokens_per_rank,
+                        "num_tokens_per_rdma_rank": num_tokens_per_rdma_rank,
+                        "is_token_in_rank": is_token_in_rank,
+                        "num_tokens_per_expert": num_tokens_per_expert,
+                        "config": config,
+                        "async_finish": async_mode,
+                    }
+                    if with_topk:
+                        dispatch_args.update(
+                            {
+                                "topk_idx": topk_idx,
+                                "topk_weights": (
+                                    topk_weights_pure_rand
+                                    if current_x is x_pure_rand
+                                    else topk_weights
+                                ),
+                            }
+                        )
+                    if previous_mode:
+                        dispatch_args.update({"previous_event": buffer.capture()})
+                    (
+                        recv_x,
+                        recv_topk_idx,
+                        recv_topk_weights,
+                        recv_num_tokens_per_expert_list,
+                        handle,
+                        event,
+                    ) = buffer.dispatch(**dispatch_args)
+                    event.current_stream_wait() if async_mode else ()
+                    recv_x = (
+                        per_token_cast_back(*recv_x)
+                        if isinstance(recv_x, tuple)
+                        else recv_x
+                    )
+
+                    # Checks
+                    recv_gbl_rank_prefix_sum = handle[-4]
+                    assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(
+                        0
+                    ), f"{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}"
+                    assert (
+                        gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist()
+                        == recv_num_tokens_per_expert_list
+                    )
+                    if current_x is not x_pure_rand:
+                        check_data(recv_x, recv_gbl_rank_prefix_sum)
+                    if with_topk:
+                        # Check `topk_idx`
+                        assert (
+                            recv_topk_idx.eq(-1)
+                            | (
+                                (recv_topk_idx >= 0)
+                                & (recv_topk_idx < (num_experts // num_ranks))
+                            )
+                        ).sum().item() == recv_topk_idx.numel()
+                        for i, count in enumerate(recv_num_tokens_per_expert_list):
+                            assert recv_topk_idx.eq(i).sum().item() == count
+
+                        # Check `topk_weights`
+                        if current_x is not x_pure_rand:
+                            recv_topk_weights[recv_topk_idx.eq(-1)] = (
+                                recv_topk_weights.amax(dim=1, keepdim=True).expand_as(
+                                    recv_topk_weights
+                                )[recv_topk_idx.eq(-1)]
+                            )
+                            check_data(recv_topk_weights, recv_gbl_rank_prefix_sum)
+
+                    # Test cached dispatch (must without top-k staffs)
+                    if not with_topk:
+                        dispatch_args = {
+                            "x": current_x,
+                            "handle": handle,
+                            "config": config,
+                            "async_finish": async_mode,
+                        }
+                        if previous_mode:
+                            dispatch_args.update({"previous_event": buffer.capture()})
+                        recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
+                        event.current_stream_wait() if async_mode else ()
+                        recv_x = (
+                            per_token_cast_back(*recv_x)
+                            if isinstance(recv_x, tuple)
+                            else recv_x
+                        )
+                        if current_x is not x_pure_rand:
+                            check_data(recv_x, recv_gbl_rank_prefix_sum)
+
+                    # Test combine
+                    combine_args = {
+                        "x": recv_x,
+                        "handle": handle,
+                        "config": config,
+                        "async_finish": async_mode,
+                    }
+                    if with_topk:
+                        combine_args.update({"topk_weights": recv_topk_weights})
+                    if previous_mode:
+                        combine_args.update({"previous_event": buffer.capture()})
+                    combined_x, combined_topk_weights, event = buffer.combine(
+                        **combine_args
+                    )
+                    event.current_stream_wait() if async_mode else ()
+                    check_x = combined_x.float() / is_token_in_rank.sum(
+                        dim=1
+                    ).unsqueeze(1)
+                    ref_x = x_pure_rand if current_x is x_pure_rand else x
+                    assert calc_diff(check_x, ref_x) < 5e-6
+                    if with_topk:
+                        check_topk_weights = (
+                            combined_topk_weights
+                            if (current_x is x_pure_rand)
+                            else (
+                                combined_topk_weights
+                                / is_token_in_rank.sum(dim=1).unsqueeze(1)
+                            )
+                        )
+                        ref_topk_weights = (
+                            topk_weights_pure_rand
+                            if current_x is x_pure_rand
+                            else topk_weights
+                        )
+                        assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
+
+                    # For later tuning
+                    dispatch_bf16_rdma_send_bytes = num_rdma_token_sent * hidden * 2
+                    dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
+                    combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
+                    combine_bf16_rdma_recv_bytes = dispatch_bf16_rdma_send_bytes
+
+                    if local_rank == 0:
+                        print(" passed", flush=True)
+    if local_rank == 0:
+        print("", flush=True)
+
+    output_data = {}
+
+    # Tune dispatch performance
+    best_dispatch_results = None
+    fp8_factor = (1 + 4 / 128) / 2
+    for current_x in (x_e4m3, x):
+        best_time, best_results = 1e10, None
+        rdma_send_bytes = (
+            (dispatch_bf16_rdma_send_bytes * fp8_factor)
+            if isinstance(current_x, tuple)
+            else dispatch_bf16_rdma_send_bytes
+        )
+        nvl_recv_bytes = (
+            (dispatch_bf16_nvl_recv_bytes * fp8_factor)
+            if isinstance(current_x, tuple)
+            else dispatch_bf16_nvl_recv_bytes
+        )
+        for nvl_chunk_size in range(4, 33, 4):
+            for rdma_chunk_size in range(4, 33, 4):
+                config_kwargs = {
+                    "num_sms": num_sms,
+                    "num_max_nvl_chunked_send_tokens": nvl_chunk_size,
+                    "num_max_nvl_chunked_recv_tokens": nvl_buffer_size,
+                    "num_max_rdma_chunked_send_tokens": rdma_chunk_size,
+                    "num_max_rdma_chunked_recv_tokens": rdma_buffer_size,
+                }
+                config = deep_ep.Config(**config_kwargs)
+                tune_args = {"x": current_x, "handle": handle, "config": config}
+                t = bench(lambda: buffer.dispatch(**tune_args))[0]
+                if t < best_time:
+                    best_time, best_results = t, (
+                        num_sms,
+                        nvl_chunk_size,
+                        rdma_chunk_size,
+                        config_kwargs,
+                    )
+                if local_rank == 0:
+                    print(
+                        f"[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {rdma_send_bytes / 1e9 / t:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) ",
+                        flush=True,
+                    )
+        if local_rank == 0:
+            print(
+                f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {rdma_send_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL)',
+                flush=True,
+            )
+            print("", flush=True)
+            is_fp8 = isinstance(current_x, tuple)
+            if is_fp8:
+                output_data["normal_dispatch"] = deepcopy(best_results[3])
+
+        if isinstance(current_x, tuple):
+            # Gather FP8 the best config from rank 0
+            best_dispatch_results = torch.tensor(
+                [best_results[0], best_results[1], best_results[2]],
+                dtype=torch.int32,
+                device="cuda",
+            )
+            all_best_fp8_results_list = [
+                torch.zeros_like(best_dispatch_results)
+                for _ in range(torch.distributed.get_world_size())
+            ]
+            dist.all_gather(
+                all_best_fp8_results_list, best_dispatch_results, group=group
+            )
+            best_dispatch_results = all_best_fp8_results_list[0].tolist()
+    dispatch_config = deep_ep.Config(
+        best_dispatch_results[0],
+        best_dispatch_results[1],
+        nvl_buffer_size,
+        best_dispatch_results[2],
+        rdma_buffer_size,
+    )
+
+    dispatch_args = {
+        "x": x,
+        "num_tokens_per_rank": num_tokens_per_rank,
+        "num_tokens_per_rdma_rank": num_tokens_per_rdma_rank,
+        "is_token_in_rank": is_token_in_rank,
+        "num_tokens_per_expert": num_tokens_per_expert,
+        "config": dispatch_config if dispatch_config is not None else config,
+    }
+    recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
+
+    # Tune combine performance
+    best_time, best_results = 1e10, None
+    for nvl_chunk_size in range(1, 5, 1):
+        for rdma_chunk_size in range(8, 33, 4):
+            config_kwargs = {
+                "num_sms": num_sms,
+                "num_max_nvl_chunked_send_tokens": nvl_chunk_size,
+                "num_max_nvl_chunked_recv_tokens": nvl_buffer_size,
+                "num_max_rdma_chunked_send_tokens": rdma_chunk_size,
+                "num_max_rdma_chunked_recv_tokens": rdma_buffer_size,
+            }
+            config = deep_ep.Config(**config_kwargs)
+            tune_args = {"x": recv_x, "handle": handle, "config": config}
+            t = bench(lambda: buffer.combine(**tune_args))[0]
+            if local_rank == 0:
+                print(
+                    f"[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {combine_bf16_rdma_recv_bytes / 1e9 / t:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) ",
+                    flush=True,
+                )
+                if t < best_time:
+                    best_time, best_results = t, (
+                        num_sms,
+                        nvl_chunk_size,
+                        rdma_chunk_size,
+                        config_kwargs,
+                    )
+
+    if local_rank == 0:
+        print(
+            f"[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {combine_bf16_rdma_recv_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL)",
+            flush=True,
+        )
+        print("", flush=True)
+        output_data["normal_combine"] = deepcopy(best_results[3])
+
+    if rank == 0 and local_rank == 0:
+        _write_output(args, output_data)
+
+
+def _write_output(args, output_data):
+    text = json.dumps(output_data, indent=4)
+    output_path = args.output_path
+    print(f"Write to {output_path} with {text}")
+    Path(output_path).write_text(text)
+
+
+# noinspection PyUnboundLocalVariable
+def test_loop(local_rank: int, num_local_ranks: int, args):
+    num_nodes = args.nnodes
+    rank, num_ranks, group = init_dist(local_rank, num_local_ranks, args)
+
+    num_sms = args.num_sms
+    num_qps_per_rank = num_sms // 2
+
+    buffer = deep_ep.Buffer(
+        group,
+        int(1e9),
+        int(1e9),
+        low_latency_mode=False,
+        num_qps_per_rank=num_qps_per_rank,
+    )
+    assert num_local_ranks == 8 and num_ranks > 8
+    torch.manual_seed(rank)
+
+    for i in (num_sms,):
+        test_main(
+            i,
+            local_rank,
+            num_local_ranks,
+            num_ranks,
+            num_nodes,
+            rank,
+            buffer,
+            group,
+            args,
+        )
+        if local_rank == 0:
+            print("", flush=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-sms", type=int, default=24)
+    parser.add_argument("--output-path", type=str, default="deepep_tuned.json")
+    parser.add_argument("--nnodes", type=int, default=1)
+    parser.add_argument("--node-rank", type=int, default=0)
+    parser.add_argument("--master-addr", type=str, default="127.0.0.1")
+    parser.add_argument("--master-port", type=int, default=8361)
+    args = parser.parse_args()
+    print(f"Start system with {args=}")
+
+    num_processes = 8
+    torch.multiprocessing.spawn(
+        test_loop, args=(num_processes, args), nprocs=num_processes
+    )
diff --git a/benchmark/kernels/deepseek/README.md b/benchmark/kernels/deepseek/README.md
new file mode 100644
index 000000000..fba775f23
--- /dev/null
+++ b/benchmark/kernels/deepseek/README.md
@@ -0,0 +1,19 @@
+## DeepSeek kernels benchmark
+
+
+### Prerequisites
+- You should install [DeepGemm](https://github.com/deepseek-ai/DeepGEMM) from source before run `benchmark_deepgemm_fp8_gemm.py` and `benchmark_deepgemm_fp8_group_gemm.py`.
+
+### Benchmark
+- `benchmark_deepgemm_fp8_gemm.py`
+    ```bash
+    python benchmark_deepgemm_fp8_gemm.py --run_correctness --tp_size 1
+    ```
+
+- `benchmark_deepgemm_fp8_group_gemm.py`
+    ```bash
+    python benchmark_deepgemm_fp8_group_gemm.py --run_correctness --tp_size 1
+    ```
+
+ - You can use the `--run_correctness` parameter to verify all kernels results's correctness.
+    - You can use the `--tp_size` parameter to benchmark all FP8 w8a8 block-wise matrix multiplications involved in DeepSeek V3/R1 under the current tensor parallelism (TP) setting. This benchmark compares DeepSeek's open-source [DeepGemm](https://github.com/deepseek-ai/DeepGEMM) implementation with SGLang's and VLLM Triton implementation.
diff --git a/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm.py b/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm.py
new file mode 100644
index 000000000..f93732154
--- /dev/null
+++ b/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm.py
@@ -0,0 +1,400 @@
+from typing import Tuple
+
+import deep_gemm
+import tilelang
+import tilelang.language as T
+import torch
+import triton
+from deep_gemm import ceil_div, get_col_major_tma_aligned_tensor
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    w8a8_block_fp8_matmul as vllm_w8a8_block_fp8_matmul,
+)
+
+from sglang.srt.layers.quantization.fp8_kernel import (
+    w8a8_block_fp8_matmul_deepgemm as w8a8_block_fp8_matmul,
+)
+
+
+# Adapted from https://github.com/tile-ai/tilelang/blob/a8cfdce92795cb861c9033573534653ee040b5ed/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py#L1
+def tl_gemm(
+    M,
+    N,
+    K,
+    in_dtype,
+    out_dtype,
+    accum_dtype,
+):
+    assert in_dtype in [
+        "e4m3_float8",
+    ], "Currently only e4m3_float8 is supported"
+    assert out_dtype in [
+        "bfloat16",
+        "float16",
+    ], "Currently only bfloat16 and float16 are supported"
+
+    TILE_SIZE = (128, 128, 128)
+    block_M = TILE_SIZE[0]
+    block_N = TILE_SIZE[1]
+    block_K = TILE_SIZE[2]
+
+    A_shape = (M, K)
+    Scales_A_shape = (M, T.ceildiv(K, block_K))
+    B_shape = (N, K)
+    Scales_B_shape = (T.ceildiv(N, block_N), T.ceildiv(K, block_K))
+    A_shared_shape = (block_M, block_K)
+    B_shared_shape = (block_N, block_K)
+    C_shared_shape = (block_M, block_N)
+
+    @T.prim_func
+    def main(
+        A: T.Buffer(A_shape, in_dtype),
+        scales_a: T.Buffer(Scales_A_shape, "float32"),
+        B: T.Buffer(B_shape, in_dtype),
+        scales_b: T.Buffer(Scales_B_shape, "float32"),
+        C: T.Buffer((M, N), out_dtype),
+    ):
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (
+            bx,
+            by,
+        ):
+
+            A_shared = T.alloc_shared(A_shared_shape, in_dtype)
+            B_shared = T.alloc_shared(B_shared_shape, in_dtype)
+            C_shared = T.alloc_shared(C_shared_shape, out_dtype)
+            Scale_C_shared = T.alloc_shared((block_M), "float32")
+            C_local = T.alloc_fragment(C_shared_shape, accum_dtype)
+            C_local_accum = T.alloc_fragment(C_shared_shape, accum_dtype)
+
+            # Improve L2 Cache
+            T.use_swizzle(panel_size=10)
+
+            T.clear(C_local)
+            T.clear(C_local_accum)
+            K_iters = T.ceildiv(K, block_K)
+            for k in T.Pipelined(K_iters, num_stages=4):
+                # Load A into shared memory
+                T.copy(A[by * block_M, k * block_K], A_shared)
+                # Load B into shared memory
+                T.copy(B[bx * block_N, k * block_K], B_shared)
+                # Load scale into shared memory
+                Scale_B = scales_b[bx, k]
+                for i in T.Parallel(block_M):
+                    Scale_C_shared[i] = scales_a[by * block_M + i, k] * Scale_B
+
+                T.gemm(A_shared, B_shared, C_local, transpose_B=True)
+                # Promote to enable 2xAcc
+                for i, j in T.Parallel(block_M, block_N):
+                    C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i]
+                T.clear(C_local)
+            # TMA store
+            T.copy(C_local_accum, C_shared)
+            T.copy(C_shared, C[by * block_M, bx * block_N])
+
+    return main
+
+
+def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
+        m, n
+    ), (x_amax / 448.0).view(m, -1)
+
+
+def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def fp8_gemm_deepgemm(
+    x_fp8: torch.Tensor,
+    x_scale: torch.Tensor,
+    y_fp8: torch.Tensor,
+    y_scale: torch.Tensor,
+    m: int,
+    n: int,
+    k: int,
+):
+    """DeepGEMM implementation of FP8 GEMM"""
+    out = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
+
+    # Run DeepGEMM kernel
+    deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale), (y_fp8, y_scale), out)
+    return out
+
+
+def fp8_gemm_sglang(
+    x_fp8: torch.Tensor,
+    x_scale: torch.Tensor,
+    y_fp8: torch.Tensor,
+    y_scale: torch.Tensor,
+    m: int,
+    n: int,
+    k: int,
+):
+    """SGLang implementation of FP8 GEMM"""
+    block_size = [128, 128]  # Matches the block size in per_block_cast_to_fp8
+
+    # Run SGLang kernel
+    out = w8a8_block_fp8_matmul(
+        x_fp8, y_fp8, x_scale, y_scale, block_size, torch.bfloat16
+    )
+    return out
+
+
+def fp8_gemm_vllm(
+    x_fp8: torch.Tensor,
+    x_scale: torch.Tensor,
+    y_fp8: torch.Tensor,
+    y_scale: torch.Tensor,
+    m: int,
+    n: int,
+    k: int,
+):
+    """vLLM implementation of FP8 GEMM"""
+    block_size = [128, 128]  # Matches the block size in per_block_cast_to_fp8
+
+    # Run vLLM kernel
+    out = vllm_w8a8_block_fp8_matmul(
+        x_fp8, y_fp8, x_scale, y_scale, block_size, torch.bfloat16
+    )
+    return out
+
+
+def calculate_diff(m: int, n: int, k: int):
+    x = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    y = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
+
+    x_fp8, x_scale = per_token_cast_to_fp8(x.clone())
+    y_fp8, y_scale = per_block_cast_to_fp8(y.clone())
+    x_scale_col_major = get_col_major_tma_aligned_tensor(x_scale.clone())
+
+    out_deepgemm = fp8_gemm_deepgemm(
+        x_fp8.clone(),
+        x_scale_col_major.clone(),
+        y_fp8.clone(),
+        y_scale.clone(),
+        m,
+        n,
+        k,
+    )
+    out_sglang = fp8_gemm_sglang(
+        x_fp8.clone(), x_scale.clone(), y_fp8.clone(), y_scale.clone(), m, n, k
+    )
+
+    tilelang_func = tl_gemm(m, n, k, "e4m3_float8", "bfloat16", "float32")
+    tilelang_kernel = tilelang.compile(tilelang_func, out_idx=[-1])
+    out_tilelang = tilelang_kernel(
+        x_fp8.clone(), x_scale.clone(), y_fp8.clone(), y_scale.clone()
+    )
+
+    diff_sglang_deepgemm = torch.abs(out_deepgemm - out_sglang).mean().item()
+    diff_tilelang_deepgemm = torch.abs(out_deepgemm - out_tilelang).mean().item()
+    diff_tilelang_sglang = torch.abs(out_tilelang - out_sglang).mean().item()
+
+    print(f"Shape m={m}, n={n}, k={k}:")
+    print(f"DeepGEMM output: {out_deepgemm[0, 0:5]}")
+    print(f"SGLang output: {out_sglang[0, 0:5]}")
+    print(f"TileLang output: {out_tilelang[0, 0:5]}")
+    print(f"Mean absolute difference (SGLang-DeepGEMM): {diff_sglang_deepgemm}")
+    print(f"Mean absolute difference (TileLang-DeepGEMM): {diff_tilelang_deepgemm}")
+    print(f"Mean absolute difference (TileLang-SGLang): {diff_tilelang_sglang}")
+
+    sglang_deepgemm_match = torch.allclose(
+        out_deepgemm, out_sglang, atol=1e-2, rtol=1e-2
+    )
+    tilelang_deepgemm_match = torch.allclose(
+        out_deepgemm, out_tilelang, atol=1e-2, rtol=1e-2
+    )
+    tilelang_sglang_match = torch.allclose(
+        out_tilelang, out_sglang, atol=1e-2, rtol=1e-2
+    )
+
+    if sglang_deepgemm_match and tilelang_deepgemm_match and tilelang_sglang_match:
+        print("✅ All implementations match\n")
+    else:
+        print("❌ Some implementations differ:")
+        print(f"  - SGLang vs DeepGEMM: {'✅' if sglang_deepgemm_match else '❌'}")
+        print(f"  - TileLang vs DeepGEMM: {'✅' if tilelang_deepgemm_match else '❌'}")
+        print(f"  - TileLang vs SGLang: {'✅' if tilelang_sglang_match else '❌'}\n")
+
+
+def get_weight_shapes(tp_size):
+    # cannot TP
+    total = [
+        (512 + 64, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (7168, 16384),
+        (7168, 18432),
+    ]
+    # N can TP
+    n_tp = [
+        (18432 * 2, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (24576, 1536),
+        (4096, 7168),
+    ]
+    # K can TP
+    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
+
+    weight_shapes = []
+    for t in total:
+        weight_shapes.append(t)
+    for n_t in n_tp:
+        new_t = (n_t[0] // tp_size, n_t[1])
+        weight_shapes.append(new_t)
+    for k_t in k_tp:
+        new_t = (k_t[0], k_t[1] // tp_size)
+        weight_shapes.append(new_t)
+
+    return weight_shapes
+
+
+def create_benchmark_configs(tp_size):
+    configs = []
+    weight_shapes = get_weight_shapes(tp_size)
+    batch_sizes = [8, 16, 32, 64, 128, 256, 1024, 2048, 4096]
+
+    for n, k in weight_shapes:
+        for m in batch_sizes:
+            configs.append((m, n, k, tp_size))
+
+    return configs
+
+
+def get_benchmark(tp_size):
+    all_configs = create_benchmark_configs(tp_size)
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["m", "n", "k", "tp_size"],
+            x_vals=[list(config) for config in all_configs],
+            line_arg="provider",
+            line_vals=["deepgemm", "sglang", "tilelang"],
+            line_names=["DeepGEMM", "SGLang", "TileLang"],
+            styles=[("blue", "-"), ("red", "-"), ("green", "-")],
+            ylabel="ms",
+            plot_name=f"fp8-gemm-performance-comparison-tp{tp_size}",
+            args={},
+        )
+    )
+    def benchmark(m, n, k, tp_size, provider):
+        print(f"Shape (m={m}, n={n}, k={k}, tp={tp_size}), Provider: {provider}")
+        x = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+        y = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
+
+        # Preprocess data before benchmarking
+        x_fp8, x_scale = per_token_cast_to_fp8(x)
+        y_fp8, y_scale = per_block_cast_to_fp8(y)
+        x_scale_col_major = get_col_major_tma_aligned_tensor(x_scale.clone())
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "deepgemm":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: fp8_gemm_deepgemm(
+                    x_fp8.clone(),
+                    x_scale_col_major.clone(),
+                    y_fp8.clone(),
+                    y_scale.clone(),
+                    m,
+                    n,
+                    k,
+                ),
+                quantiles=quantiles,
+            )
+        elif provider == "sglang":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: fp8_gemm_sglang(
+                    x_fp8.clone(),
+                    x_scale.clone(),
+                    y_fp8.clone(),
+                    y_scale.clone(),
+                    m,
+                    n,
+                    k,
+                ),
+                quantiles=quantiles,
+            )
+        else:  # tilelang
+            tilelang_func = tl_gemm(m, n, k, "e4m3_float8", "bfloat16", "float32")
+            tilelang_kernel = tilelang.compile(tilelang_func, out_idx=[-1])
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: tilelang_kernel(
+                    x_fp8.clone(),
+                    x_scale.clone(),
+                    y_fp8.clone(),
+                    y_scale.clone(),
+                ),
+                quantiles=quantiles,
+            )
+
+        # Calculate TFLOPS
+        flops = 2 * m * n * k  # multiply-adds
+        tflops = flops / (ms * 1e-3) / 1e12
+
+        # Print shape-specific results with TFLOPS
+        print(f"Time: {ms*1000:.2f} ms, TFLOPS: {tflops:.2f}")
+        return ms * 1000, max_ms * 1000, min_ms * 1000  # convert to ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/fp8_gemm/",
+        help="Path to save fp8 gemm benchmark results",
+    )
+    parser.add_argument(
+        "--run_correctness",
+        action="store_true",
+        default=True,
+        help="Whether to run correctness test",
+    )
+    parser.add_argument(
+        "--tp_size",
+        type=int,
+        default=1,
+        help="Tensor parallelism size to benchmark (default: 1)",
+    )
+    args = parser.parse_args()
+
+    # Set random seed for reproducibility
+    torch.manual_seed(0)
+    torch.cuda.manual_seed(0)
+
+    # Enable TF32, adapted from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py#L148
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+    # Run correctness tests on a few examples
+    if args.run_correctness:
+        print("Running correctness tests...")
+        calculate_diff(64, 512, 7168)  # Small test
+        calculate_diff(64, 7168, 16384)  # Medium test
+        calculate_diff(64, 18432, 7168)  # Large test
+
+    # Get the benchmark function with the specified tp_size
+    benchmark = get_benchmark(args.tp_size)
+
+    print(f"Running performance benchmark for TP size = {args.tp_size}...")
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_group_gemm.py b/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_group_gemm.py
new file mode 100644
index 000000000..2c3e8dfcc
--- /dev/null
+++ b/benchmark/kernels/deepseek/benchmark_deepgemm_fp8_group_gemm.py
@@ -0,0 +1,486 @@
+from typing import Tuple
+
+import deep_gemm
+import torch
+import triton
+import triton.language as tl
+from deep_gemm import calc_diff, get_col_major_tma_aligned_tensor
+
+# Import shared functionality from the regular GEMM benchmark
+from sglang.benchmark.kernels.deepseek.benchmark_deepgemm_fp8_gemm import (
+    per_block_cast_to_fp8,
+    per_token_cast_to_fp8,
+)
+
+
+def construct_grouped_and_flat_fp8(
+    x: torch.Tensor, y: torch.Tensor, num_groups: int, is_masked: bool
+) -> Tuple[
+    Tuple[torch.Tensor, torch.Tensor],  # grouped x_fp8
+    Tuple[torch.Tensor, torch.Tensor],  # grouped y_fp8
+    Tuple[torch.Tensor, torch.Tensor],  # flat x_fp8
+    Tuple[torch.Tensor, torch.Tensor],  # flat y_fp8
+    torch.Tensor,  # output
+    torch.Tensor,  # reference output
+]:
+    # Verify input shapes
+    m, k = x.shape
+    n, k_y = y.shape
+    assert k == k_y, f"Incompatible shapes: x({m}, {k}), y({n}, {k_y})"
+    assert m % num_groups == 0, f"m({m}) must be divisible by num_groups({num_groups})"
+    assert m % 4 == 0, f"TMA alignment error: {m}"
+
+    # Reshape inputs for grouped processing
+    m_per_group = m // num_groups
+    x_grouped = x.view(num_groups, m_per_group, k)
+    y_grouped = y.unsqueeze(0).expand(num_groups, n, k)
+
+    # Initialize output tensors
+    out = torch.empty((num_groups, m_per_group, n), device="cuda", dtype=torch.bfloat16)
+    ref_out = torch.einsum("gmk,gnk->gmn", x_grouped, y_grouped)
+
+    # Quantize grouped tensors
+    x_fp8_grouped = (
+        torch.empty_like(x_grouped, dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (num_groups, m_per_group, k // 128), device="cuda", dtype=torch.float
+        ),
+    )
+    y_fp8_grouped = (
+        torch.empty_like(y_grouped, dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (num_groups, (n + 127) // 128, k // 128), device="cuda", dtype=torch.float
+        ),
+    )
+    for i in range(num_groups):
+        x_fp8_grouped[0][i], x_fp8_grouped[1][i] = per_token_cast_to_fp8(x_grouped[i])
+        y_fp8_grouped[0][i], y_fp8_grouped[1][i] = per_block_cast_to_fp8(y_grouped[i])
+
+    # Quantize flat tensors
+    x_fp8_flat = per_token_cast_to_fp8(x)
+    y_fp8_flat = per_block_cast_to_fp8(y)
+
+    # For non-masked input, merge the group and M dims in output
+    if not is_masked:
+        x_fp8_grouped = (
+            x_fp8_grouped[0].view(-1, k),
+            per_token_cast_to_fp8(x_grouped.view(-1, k))[1],
+        )
+        out, ref_out = out.view(-1, n), ref_out.view(-1, n)
+
+    # Transpose earlier for testing
+    x_fp8_grouped = (
+        x_fp8_grouped[0],
+        get_col_major_tma_aligned_tensor(x_fp8_grouped[1]),
+    )
+    x_fp8_flat = (x_fp8_flat[0], get_col_major_tma_aligned_tensor(x_fp8_flat[1]))
+
+    return x_fp8_grouped, y_fp8_grouped, x_fp8_flat, y_fp8_flat, out, ref_out
+
+
+# Since we don't have a group gemm kernel in SGLang/vLLM, we implemented a
+# custom kernel based on the Triton tutorial.
+# https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
+@triton.jit
+def fp8_gemm_group_triton_kernel(
+    # Pointers to matrices
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    # Pointers to scaling factors
+    a_scale_ptr,
+    b_scale_ptr,
+    # Matrix dimensions
+    M,
+    N,
+    K,
+    # The stride variables represent how much to increase the ptr by when moving by 1
+    # element in a particular dimension.
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    # Strides for scaling factors
+    stride_a_scale_m,
+    stride_a_scale_k,
+    stride_b_scale_n,
+    stride_b_scale_k,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Kernel for computing the matmul C = A x B with FP8 inputs and scaling factors.
+    A has shape (M, K), B has shape (K, N) and C has shape (M, N)
+
+    Note: Block sizes must be multiples of 32 for optimal TMA performance.
+    """
+    # Map program ids to the block of C it should compute
+    pid_group = tl.program_id(axis=0)  # Group ID
+    pid_n = tl.program_id(axis=1)  # N dimension ID
+
+    # Compute the M block ID within this group
+    group_size_m = min(M - pid_group * GROUP_SIZE_M, GROUP_SIZE_M)
+    pid_m_within_group = tl.program_id(axis=2) % group_size_m
+    pid_m = pid_group * GROUP_SIZE_M + pid_m_within_group
+
+    # Create pointers for the first blocks of A and B
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    # Initialize accumulator
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Main loop
+    for k_block in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        k_offset = k_block * BLOCK_SIZE_K
+
+        # Load the next block of A and B, with masks
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k_offset, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k_offset, other=0.0)
+
+        # Calculate indices for scaling factors for this K block
+        a_scale_ptrs = a_scale_ptr + (
+            offs_am * stride_a_scale_m + k_block * stride_a_scale_k
+        )
+        b_scale_ptrs = b_scale_ptr + (
+            pid_n * stride_b_scale_n + k_block * stride_b_scale_k
+        )
+
+        # Perform matrix multiplication in FP8
+        res = tl.dot(a, b)
+
+        # Load scaling factors for the current block
+        a_scale = tl.load(a_scale_ptrs)[:, None]  # [BLOCK_SIZE_M, 1]
+        b_scale = tl.load(b_scale_ptrs)
+
+        # Apply scaling factors to the accumulated result
+        accumulator += res * a_scale * b_scale
+
+        # Advance pointers
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    # Convert to bfloat16 for output
+    c = accumulator.to(tl.bfloat16)
+
+    # Write back the result
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+def fp8_gemm_group_triton(a_tuple, b_tuple, c, num_groups):
+    """
+    Perform matrix multiplication with FP8 inputs and proper scaling.
+
+    Args:
+        a_tuple: Tuple of (quantized_tensor, scale_factors) for input A
+        b_tuple: Tuple of (quantized_tensor, scale_factors) for input B
+        c: Output tensor in BF16 format
+        num_groups: Number of groups for grouped GEMM
+
+    Returns:
+        Result tensor in BF16 format
+    """
+    # Unpack the tuples
+    a, a_scale = a_tuple
+    b, b_scale = b_tuple
+
+    M, K = a.shape
+    _, N = b.shape
+
+    # Configure block sizes - must be multiples of 32 for TMA alignment
+    BLOCK_SIZE_M = 128
+    BLOCK_SIZE_N = 128
+    BLOCK_SIZE_K = 128
+
+    # Calculate grid dimensions
+    num_pid_m = triton.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = triton.cdiv(N, BLOCK_SIZE_N)
+    num_groups_grid = triton.cdiv(num_pid_m, num_groups)
+
+    # 3D grid launch - (group, n_blocks, m_blocks_per_group)
+    grid = (num_groups_grid, num_pid_n, min(num_groups, num_pid_m))
+
+    fp8_gemm_group_triton_kernel[grid](
+        a,
+        b,
+        c,
+        a_scale,
+        b_scale,
+        M,
+        N,
+        K,
+        a.stride(0),
+        a.stride(1),
+        b.stride(0),
+        b.stride(1),
+        c.stride(0),
+        c.stride(1),
+        a_scale.stride(0),
+        1,  # Stride in the K dimension may be 1
+        b_scale.stride(0),
+        1 if b_scale.dim() > 1 else 0,
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+        GROUP_SIZE_M=num_groups,
+    )
+
+    return c
+
+
+def fp8_gemm_group_deepgemm(x_fp8_grouped, y_fp8_grouped, out, m_indices):
+    deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
+        x_fp8_grouped,
+        y_fp8_grouped,
+        out,
+        m_indices,
+    )
+    return out
+
+
+def calculate_diff(m: int, n: int, k: int, num_groups: int):
+    print(f"Shape (m={m}, n={n}, k={k}")
+    x = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    y = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
+    x_fp8_grouped, y_fp8_grouped, x_fp8_flat, y_fp8_flat, out, out_torch = (
+        construct_grouped_and_flat_fp8(x, y, num_groups, is_masked=False)
+    )
+    m_per_group = m // num_groups
+    out_deepgemm = out.clone()
+    m_indices = torch.arange(0, num_groups, device="cuda", dtype=torch.int)
+    m_indices = (
+        m_indices.unsqueeze(-1).expand(num_groups, m_per_group).contiguous().view(-1)
+    )
+
+    fp8_gemm_group_deepgemm(
+        x_fp8_grouped,
+        y_fp8_grouped,
+        out_deepgemm,
+        m_indices,
+    )
+    torch.cuda.synchronize()
+
+    # Prepare inputs for Triton
+    a, a_scale = x_fp8_flat
+    b, b_scale = y_fp8_flat
+    b = b.T.contiguous()
+    # Ensure scales are in the right format and contiguous
+    a_scale, b_scale = a_scale.contiguous(), b_scale.contiguous()
+    M, _ = a.shape
+    _, N = b.shape
+    c = torch.empty((M, N), device=a.device, dtype=torch.bfloat16)
+    out_triton = fp8_gemm_group_triton((a, a_scale), (b, b_scale), c, num_groups)
+    torch.cuda.synchronize()
+
+    diff_torch_deepgemm = torch.abs(out_torch - out_deepgemm).mean().item()
+    diff_torch_triton = torch.abs(out_torch - out_triton).mean().item()
+    diff_deepgemm_triton = torch.abs(out_deepgemm - out_triton).mean().item()
+
+    print(f"Shape m={m}, n={n}, k={k}:")
+    print(f"Torch output: {out_torch[0, 0:5]}")
+    print(f"DeepGEMM output: {out_deepgemm[0, 0:5]}")
+    print(f"Triton output: {out_triton[0, 0:5]}")
+    print(f"Mean absolute difference (Torch-DeepGEMM): {diff_torch_deepgemm}")
+    print(f"Mean absolute difference (Torch-Triton): {diff_torch_triton}")
+    print(f"Mean absolute difference (DeepGEMM-Triton): {diff_deepgemm_triton}")
+
+    deepgemm_torch_diff = calc_diff(out_deepgemm, out_torch)
+    triton_torch_diff = calc_diff(out_triton, out_torch)
+    deepgemm_triton_diff = calc_diff(out_deepgemm, out_triton)
+
+    DIFF_THRESHOLD = 0.001
+    all_match = (
+        deepgemm_torch_diff < DIFF_THRESHOLD
+        and triton_torch_diff < DIFF_THRESHOLD
+        and deepgemm_triton_diff < DIFF_THRESHOLD
+    )
+    if all_match:
+        print("✅ All implementations match\n")
+    else:
+        print("❌ Some implementations differ:")
+        print(
+            f"  - Torch vs DeepGEMM: {'✅' if deepgemm_torch_diff < DIFF_THRESHOLD else '❌'}"
+            f"  - Torch vs Triton: {'✅' if triton_torch_diff < DIFF_THRESHOLD else '❌'}"
+            f"  - DeepGEMM vs Triton: {'✅' if deepgemm_triton_diff < DIFF_THRESHOLD else '❌'}"
+        )
+
+
+def get_weight_shapes(tp_size):
+    # cannot TP
+    total = [
+        (512 + 64, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (7168, 16384),
+        (7168, 18432),
+    ]
+    # N can TP
+    n_tp = [
+        (18432 * 2, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (24576, 1536),
+        (4096, 7168),
+    ]
+    # K can TP
+    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
+
+    weight_shapes = []
+    for t in total:
+        weight_shapes.append(t)
+    for n_t in n_tp:
+        new_t = (n_t[0] // tp_size, n_t[1])
+        weight_shapes.append(new_t)
+    for k_t in k_tp:
+        new_t = (k_t[0], k_t[1] // tp_size)
+        weight_shapes.append(new_t)
+
+    return weight_shapes
+
+
+def create_benchmark_configs(tp_size):
+    configs = []
+    weight_shapes = get_weight_shapes(tp_size)
+    batch_sizes = [2048, 4096]
+    group_sizes = [4, 8]
+    for n, k in weight_shapes:
+        for m in batch_sizes:
+            for num_groups in group_sizes:
+                configs.append((m, n, k, num_groups, tp_size))
+
+    return configs
+
+
+def get_benchmark(tp_size):
+    all_configs = create_benchmark_configs(tp_size)
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["m", "n", "k", "num_groups", "tp_size"],
+            x_vals=[config for config in all_configs],
+            line_arg="provider",
+            line_vals=["deepgemm", "triton"],
+            line_names=["DeepGEMM", "Triton"],
+            styles=[("blue", "-"), ("red", "-")],
+            ylabel="ms",
+            plot_name=f"fp8-group-gemm-performance-comparison-tp{tp_size}",
+            args={},
+        )
+    )
+    def benchmark(m, n, k, num_groups, tp_size, provider):
+        print(
+            f"Shape (m={m}, n={n}, k={k}, tp={tp_size}, num_groups={num_groups}, Provider: {provider}"
+        )
+        x = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+        y = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
+        x_fp8_grouped, y_fp8_grouped, x_fp8_flat, y_fp8_flat, out, out_torch = (
+            construct_grouped_and_flat_fp8(x, y, num_groups, is_masked=False)
+        )
+        m_per_group = m // num_groups
+        m_indices = torch.arange(0, num_groups, device="cuda", dtype=torch.int)
+        m_indices = (
+            m_indices.unsqueeze(-1)
+            .expand(num_groups, m_per_group)
+            .contiguous()
+            .view(-1)
+        )
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "deepgemm":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: fp8_gemm_group_deepgemm(
+                    x_fp8_grouped,
+                    y_fp8_grouped,
+                    out,
+                    m_indices,
+                ),
+                quantiles=quantiles,
+            )
+        elif provider == "triton":
+            # Prepare inputs for Triton
+            # We did it outside of the lambda function to make it fair comparison like deepgemm
+            a, a_scale = x_fp8_flat
+            b, b_scale = y_fp8_flat
+            b = b.T.contiguous()
+            # Ensure scales are in the right format and contiguous
+            a_scale, b_scale = a_scale.contiguous(), b_scale.contiguous()
+            M, _ = a.shape
+            _, N = b.shape
+            c = torch.empty((M, N), device=a.device, dtype=torch.bfloat16)
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: fp8_gemm_group_triton(
+                    (a, a_scale),
+                    (b, b_scale),
+                    c,
+                    num_groups,
+                ),
+                quantiles=quantiles,
+            )
+
+        # Calculate TFLOPS
+        flops = 2 * m * n * k  # multiply-adds
+        tflops = flops / (ms * 1e-3) / 1e12
+
+        print(f"Time: {ms*1000:.2f} ms, TFLOPS: {tflops:.2f}")
+        return ms * 1000, max_ms * 1000, min_ms * 1000  # convert to ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/fp8_group_gemm/",
+        help="Path to save deepgemm fp8 group gemm benchmark results",
+    )
+    parser.add_argument(
+        "--run_correctness",
+        action="store_true",
+        help="Whether to run correctness test",
+    )
+    parser.add_argument(
+        "--tp_size",
+        type=int,
+        default=1,
+        help="Tensor parallelism size to benchmark (default: 1)",
+    )
+    args = parser.parse_args()
+
+    # Set random seed for reproducibility
+    torch.manual_seed(0)
+    torch.cuda.manual_seed(0)
+
+    # Enable TF32, adapted from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_core.py#L148
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+    # Run correctness tests on a few examples
+    if args.run_correctness:
+        print("Running correctness tests...")
+        calculate_diff(8192, 7168, 4096, 4)
+        calculate_diff(8192, 2048, 7168, 4)
+        calculate_diff(4096, 7168, 4096, 8)
+        calculate_diff(4096, 2048, 7168, 8)
+        calculate_diff(4096, 576, 7168, 8)
+
+    # Get the benchmark function with the specified tp_size
+    benchmark = get_benchmark(args.tp_size)
+
+    print(f"Running performance benchmark for TP size = {args.tp_size}...")
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmark/kernels/fbgemm/README.md b/benchmark/kernels/fbgemm/README.md
new file mode 100644
index 000000000..e51356d8a
--- /dev/null
+++ b/benchmark/kernels/fbgemm/README.md
@@ -0,0 +1,29 @@
+## Benchmark FBGEMM Grouped GEMM
+
+Benchmark FBGEMM Grouped GEMM in both Triton and CUDA version and SGLang Triton Grouped GEMM, it will be used to compare the bandwidth of different implementations.
+
+### Requirements
+
+```shell
+pip install fbgemm-gpu-genai
+```
+
+### Usage
+
+```bash
+python3 benchmark/fbgemm/benchmark_fbgemm_grouped_gemm.py --model Qwen/Qwen2-57B-A14B-Instruct --tp-size 4 --use-fp8-w8a8
+```
+
+For example, in H200, the Qwen2-57B-A14B-Instruct TP4 fp8w8a8 grouped gemm bandwidth result is as follows:
+
+```shell
+grouped-gemm-performance:
+   batch_size  FBGEMM Triton Grouped GEMM FP8  FBGEMM CUTLASS F8F8BF16 Rowwise  SGLang Grouped GEMM FP8
+0       256.0                     3704.841339                      3042.626402              2254.725030
+1       512.0                     3691.426346                      3029.065684              2269.504543
+2      1024.0                     3653.938629                      2258.471467              2358.319020
+3      2048.0                     3596.644313                      2271.611904              2476.895397
+4      4096.0                     3468.496435                      2231.283986              2179.473910
+```
+
+The theoretical peak bandwidth of H200 is 4.8 TB/s. Taking batch_size 256 as an example, the bandwidth of FBGEMM Triton Grouped GEMM FP8 is 3704.841339 GB/s, the bandwidth of FBGEMM CUTLASS F8F8BF16 Rowwise is 3042.626402 GB/s, and the bandwidth of SGLang Grouped GEMM FP8 is 2254.725030 GB/s. Therefore, FBGEMM Triton Grouped GEMM FP8 achieves 77.9% of H200's theoretical peak bandwidth, FBGEMM CUTLASS F8F8BF16 Rowwise achieves 63.4% of H200's theoretical peak bandwidth, and SGLang Grouped GEMM FP8 achieves 46.9% of H200's theoretical peak bandwidth.
diff --git a/benchmark/kernels/fbgemm/benchmark_fbgemm_grouped_gemm.py b/benchmark/kernels/fbgemm/benchmark_fbgemm_grouped_gemm.py
new file mode 100644
index 000000000..6e8c8dcf2
--- /dev/null
+++ b/benchmark/kernels/fbgemm/benchmark_fbgemm_grouped_gemm.py
@@ -0,0 +1,516 @@
+# python3 benchmark/fbgemm/benchmark_fbgemm_grouped_gemm.py --model Qwen/Qwen2-57B-A14B-Instruct --tp-size 4 --use-fp8-w8a8
+import argparse
+
+import torch
+import triton
+from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
+    quantize_fp8_row,
+    triton_quantize_fp8_row,
+)
+from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import (
+    grouped_gemm as fbgemm_grouped_gemm,
+)
+from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import (
+    grouped_gemm_fp8_rowwise as fbgemm_grouped_gemm_fp8_rowwise,
+)
+from transformers import AutoConfig
+
+from sglang.srt.layers.moe.ep_moe.kernels import (
+    grouped_gemm_triton as sglang_grouped_gemm,
+)
+
+
+def get_model_config(model_name: str, tp_size: int):
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+    if config.architectures[0] == "DbrxForCausalLM":
+        num_groups = config.ffn_config.moe_num_experts
+        intermediate_size = config.ffn_config.ffn_hidden_size
+    elif config.architectures[0] == "JambaForCausalLM":
+        num_groups = config.num_experts
+        intermediate_size = config.intermediate_size
+    elif config.architectures[0] == "Qwen2MoeForCausalLM":
+        num_groups = config.num_experts
+        intermediate_size = config.moe_intermediate_size
+    elif config.architectures[0] == "Qwen3MoeForCausalLM":
+        num_groups = config.num_experts
+        intermediate_size = config.moe_intermediate_size
+    elif config.architectures[0] in [
+        "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+    ]:
+        num_groups = config.n_routed_experts
+        intermediate_size = config.moe_intermediate_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        num_groups = config.text_config.num_local_experts
+        intermediate_size = config.text_config.intermediate_size
+    elif config.architectures[0] in [
+        "Grok1ForCausalLM",
+        "Grok1ImgGen",
+        "Grok1AForCausalLM",
+    ]:
+        num_groups = config.num_local_experts
+        intermediate_size = config.moe_intermediate_size
+    else:
+        num_groups = config.num_local_experts
+        intermediate_size = config.intermediate_size
+
+    shape_configs = {
+        "num_groups": num_groups,
+        "hidden_size": config.hidden_size,
+        "intermediate_size": intermediate_size,
+        "dtype": config.torch_dtype,
+    }
+    print(f"{shape_configs=}")
+    return shape_configs
+
+
+def create_test_data(batch_size, num_groups, hidden_size, intermediate_size):
+    torch.manual_seed(42)
+
+    tokens_per_group = batch_size // num_groups
+    m_sizes = torch.full(
+        (num_groups,), tokens_per_group, dtype=torch.int32, device="cuda"
+    )
+
+    x = torch.randn(batch_size, hidden_size, dtype=torch.bfloat16, device="cuda")
+
+    base_weights = torch.randn(
+        num_groups, intermediate_size, hidden_size, dtype=torch.bfloat16, device="cuda"
+    )
+
+    w_fbgemm = base_weights.reshape(num_groups * intermediate_size, hidden_size)
+    w_sglang = base_weights
+
+    c_fbgemm = torch.empty(
+        batch_size, intermediate_size, dtype=torch.bfloat16, device="cuda"
+    )
+    c_sglang = torch.empty(
+        batch_size, intermediate_size, dtype=torch.bfloat16, device="cuda"
+    )
+
+    seg_indptr = torch.zeros(num_groups + 1, dtype=torch.int32, device="cuda")
+    for i in range(1, num_groups + 1):
+        seg_indptr[i] = seg_indptr[i - 1] + tokens_per_group
+
+    weight_indices = torch.arange(num_groups, dtype=torch.int32, device="cuda")
+
+    return (
+        x,
+        w_fbgemm,
+        w_sglang,
+        c_fbgemm,
+        c_sglang,
+        m_sizes,
+        seg_indptr,
+        weight_indices,
+    )
+
+
+def create_fp8_test_data(
+    batch_size, num_groups, hidden_size, intermediate_size, backend="triton"
+):
+    """
+    Create test data for FP8 grouped GEMM operations.
+
+    Args:
+        batch_size: Total batch size
+        num_groups: Number of groups
+        hidden_size: Hidden dimension size
+        intermediate_size: Intermediate dimension size
+        backend: "triton" for Triton GEMM, "cutlass" for CUTLASS GEMM
+
+    Returns:
+        For triton: (x_fp8, w_fp8, m_sizes, x_scale, w_scale)
+        For cutlass: (x, wq, w_scale, m_sizes)
+    """
+    torch.manual_seed(42)
+
+    tokens_per_group = batch_size // num_groups
+
+    # Create weight matrices for each group
+    w_list = []
+    for _ in range(num_groups):
+        w = torch.randn(
+            intermediate_size, hidden_size, dtype=torch.float16, device="cuda"
+        )
+        w_list.append(w)
+
+    # Quantize weights using quantize_fp8_row for each group
+    wq_list, w_scale_list = zip(*[quantize_fp8_row(w) for w in w_list])
+
+    if backend == "triton":
+        # Triton format: concatenated weights
+        w_fp8 = torch.concat(wq_list, dim=0).contiguous()
+        w_scale = torch.concat(w_scale_list, dim=0).contiguous()
+
+        # Create m_sizes as int32 for triton
+        m_sizes = torch.full(
+            (num_groups,), tokens_per_group, dtype=torch.int32, device="cuda"
+        )
+
+        # Create and quantize input
+        x_fp16 = torch.randn(
+            batch_size, hidden_size, dtype=torch.float16, device="cuda"
+        )
+        x_fp8, x_scale = triton_quantize_fp8_row(x_fp16)
+        x_scale = x_scale.view(batch_size, -1)
+
+        return x_fp8, w_fp8, m_sizes, x_scale, w_scale
+
+    elif backend == "cutlass":
+        # CUTLASS format: stacked weights
+        wq = torch.stack(wq_list, dim=0).contiguous()
+        w_scale = torch.stack(w_scale_list, dim=0).contiguous()
+
+        # Create m_sizes as int64 for cutlass
+        m_values = [tokens_per_group] * num_groups
+        m_sizes = torch.tensor(m_values).to(dtype=torch.int64, device="cuda")
+
+        # Create input data - separate for each group then concat
+        x_list = []
+        for _ in range(num_groups):
+            x = torch.randn(
+                tokens_per_group, hidden_size, dtype=torch.float16, device="cuda"
+            )
+            x_list.append(x)
+
+        # Concatenate inputs into single tensor
+        x = torch.concat(x_list, dim=0).contiguous()
+
+        return x, wq, w_scale, m_sizes
+
+    else:
+        raise ValueError(f"Unsupported backend: {backend}")
+
+
+def calculate_memory_bandwidth(m_sizes, hidden_size, intermediate_size, dtype):
+    """
+    Calculate memory bandwidth based on accessed expert weights.
+
+    Args:
+        m_sizes: Tensor containing batch sizes for each group
+        hidden_size: Hidden dimension size
+        intermediate_size: Intermediate dimension size
+        dtype: Data type of weights
+
+    Returns:
+        Memory size in bytes for accessed expert weights
+    """
+    # Count non-zero groups (active experts)
+    if hasattr(m_sizes, "cpu"):
+        active_experts = torch.count_nonzero(m_sizes).item()
+    else:
+        active_experts = sum(1 for m in m_sizes if m > 0)
+
+    # Calculate bytes per element based on dtype
+    if dtype in [torch.float16, torch.bfloat16]:
+        bytes_per_element = 2
+    elif dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+        bytes_per_element = 1
+    elif dtype == torch.float32:
+        bytes_per_element = 4
+    else:
+        # Default to 2 bytes for unknown dtypes
+        bytes_per_element = 2
+
+    # Memory per expert weight matrix
+    memory_per_expert = hidden_size * intermediate_size * bytes_per_element
+
+    # Total memory for active experts
+    total_memory_bytes = active_experts * memory_per_expert
+
+    return total_memory_bytes
+
+
+def get_benchmark_config(use_fp8_w8a8=False):
+    if use_fp8_w8a8:
+        return {
+            "line_vals": [
+                "fbgemm_triton_grouped_gemm_fp8",
+                "fbgemm_cutlass_f8f8bf16_rowwise",
+                "sglang_grouped_gemm",
+            ],
+            "line_names": [
+                "FBGEMM Triton Grouped GEMM FP8",
+                "FBGEMM CUTLASS F8F8BF16 Rowwise",
+                "SGLang Grouped GEMM FP8",
+            ],
+            "styles": [("blue", "-"), ("orange", "-"), ("red", "-")],
+        }
+    else:
+        return {
+            "line_vals": ["fbgemm_triton_grouped_gemm", "sglang_grouped_gemm"],
+            "line_names": [
+                "FBGEMM Triton Grouped GEMM BF16",
+                "SGLang Grouped GEMM BF16",
+            ],
+            "styles": [("blue", "-"), ("green", "-")],
+        }
+
+
+def run_benchmark(
+    model_config, use_fp8_w8a8=False, save_path="./benchmark_grouped_gemm/"
+):
+    config = get_benchmark_config(use_fp8_w8a8)
+
+    benchmark_config = triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[256, 512, 1024, 2048, 4096],
+        line_arg="provider",
+        line_vals=config["line_vals"],
+        line_names=config["line_names"],
+        styles=config["styles"],
+        ylabel="Bandwidth (GB/s)",
+        plot_name="grouped-gemm-performance",
+        args={},
+    )
+
+    @triton.testing.perf_report(benchmark_config)
+    def dynamic_benchmark(batch_size, provider, model_config, use_fp8_w8a8=False):
+        print(f"Benchmarking {provider} with batch_size={batch_size}")
+        torch.cuda.manual_seed_all(0)
+
+        num_groups = model_config["num_groups"]
+        hidden_size = model_config["hidden_size"]
+        intermediate_size = model_config["intermediate_size"]
+
+        if provider == "fbgemm_triton_grouped_gemm_fp8":
+            try:
+                test_data = create_fp8_test_data(
+                    batch_size,
+                    num_groups,
+                    hidden_size,
+                    intermediate_size,
+                    backend="triton",
+                )
+                x_fp8, w_fp8, m_sizes, x_scale, w_scale = test_data
+
+                # Calculate memory bandwidth
+                memory_bytes = calculate_memory_bandwidth(
+                    m_sizes, hidden_size, intermediate_size, torch.float8_e4m3fn
+                )
+
+                def run_func():
+                    return fbgemm_grouped_gemm_fp8_rowwise(
+                        x_fp8, w_fp8, m_sizes, x_scale, w_scale, use_fast_accum=True
+                    )
+
+            except Exception as e:
+                print(f"FP8 not supported, skipping: {e}")
+                return float("inf"), float("inf"), float("inf")
+
+        elif provider == "fbgemm_cutlass_f8f8bf16_rowwise":
+            try:
+                test_data = create_fp8_test_data(
+                    batch_size,
+                    num_groups,
+                    hidden_size,
+                    intermediate_size,
+                    backend="cutlass",
+                )
+                x, wq, w_scale, m_sizes = test_data
+
+                # Calculate memory bandwidth
+                memory_bytes = calculate_memory_bandwidth(
+                    m_sizes, hidden_size, intermediate_size, torch.float8_e4m3fn
+                )
+
+                # Quantize input using triton_quantize_fp8_row
+                xq, x_scale = triton_quantize_fp8_row(x)
+                x_scale = x_scale.view(batch_size, -1)
+
+                def run_func():
+                    return torch.ops.fbgemm.f8f8bf16_rowwise_grouped_stacked(
+                        xq, wq, x_scale, w_scale, m_sizes
+                    )
+
+            except Exception as e:
+                print(
+                    f"CUTLASS f8f8bf16_rowwise_grouped_stacked not supported, "
+                    f"skipping: {e}"
+                )
+                return float("inf"), float("inf"), float("inf")
+        else:
+            test_data = create_test_data(
+                batch_size, num_groups, hidden_size, intermediate_size
+            )
+            (
+                x,
+                w_fbgemm,
+                w_sglang,
+                c_fbgemm,
+                c_sglang,
+                m_sizes,
+                seg_indptr,
+                weight_indices,
+            ) = test_data
+
+            # Calculate memory bandwidth for BF16 operations
+            memory_bytes = calculate_memory_bandwidth(
+                m_sizes, hidden_size, intermediate_size, torch.bfloat16
+            )
+
+            if provider == "fbgemm_triton_grouped_gemm":
+
+                def run_func():
+                    return fbgemm_grouped_gemm(
+                        x, w_fbgemm, m_sizes, use_fast_accum=True
+                    )
+
+            else:
+
+                def run_func():
+                    return sglang_grouped_gemm(
+                        x,
+                        w_sglang,
+                        c_sglang,
+                        num_groups,
+                        weight_column_major=True,
+                        seg_indptr=seg_indptr,
+                        weight_indices=weight_indices,
+                        c_dtype=c_sglang.dtype,
+                    )
+
+        for _ in range(10):
+            try:
+                run_func()
+            except Exception as e:
+                print(f"Error during warmup for {provider}: {e}")
+                return float("inf"), float("inf"), float("inf")
+
+        torch.cuda.synchronize()
+
+        try:
+            quantiles = [0.5, 0.2, 0.8]
+            ms, min_ms, max_ms = triton.testing.do_bench(run_func, quantiles=quantiles)
+
+            # Convert time (ms) to bandwidth (GB/s)
+            # Bandwidth = Memory (bytes) / Time (seconds)
+            # Convert ms to seconds and bytes to GB (1e9)
+            gb_per_s = (memory_bytes / 1e9) / (ms / 1000)
+            # min bandwidth = max time, max bandwidth = min time
+            min_gb_per_s = (memory_bytes / 1e9) / (max_ms / 1000)
+            max_gb_per_s = (memory_bytes / 1e9) / (min_ms / 1000)
+
+            return gb_per_s, min_gb_per_s, max_gb_per_s
+        except Exception as e:
+            print(f"Error during benchmarking for {provider}: {e}")
+            return 0.0, 0.0, 0.0
+
+    dynamic_benchmark.run(
+        show_plots=True,
+        print_data=True,
+        save_path=save_path,
+        model_config=model_config,
+        use_fp8_w8a8=use_fp8_w8a8,
+    )
+
+
+def verify_correctness(model_config):
+    print("Verifying correctness...")
+    batch_size = 128
+    num_groups = model_config["num_groups"]
+    hidden_size = model_config["hidden_size"]
+    intermediate_size = model_config["intermediate_size"]
+
+    test_data = create_test_data(batch_size, num_groups, hidden_size, intermediate_size)
+    (
+        x,
+        w_fbgemm,
+        w_sglang,
+        c_fbgemm,
+        c_sglang,
+        m_sizes,
+        seg_indptr,
+        weight_indices,
+    ) = test_data
+
+    result_fbgemm = fbgemm_grouped_gemm(x, w_fbgemm, m_sizes, use_fast_accum=True)
+
+    result_sglang = sglang_grouped_gemm(
+        x,
+        w_sglang,
+        c_sglang,
+        num_groups,
+        weight_column_major=True,
+        seg_indptr=seg_indptr,
+        weight_indices=weight_indices,
+        c_dtype=c_sglang.dtype,
+    )
+
+    if torch.allclose(result_fbgemm, result_sglang, rtol=1e-3, atol=1e-3):
+        print("✓ BF16 Correctness verification passed!")
+    else:
+        max_diff = torch.max(torch.abs(result_fbgemm - result_sglang))
+        print(f"✗ BF16 Correctness verification failed! Max diff: {max_diff}")
+        return False
+
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Benchmark FBGEMM vs SGLang Grouped GEMM"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        help="Model name to get configuration from",
+    )
+    parser.add_argument(
+        "--tp-size", type=int, default=1, help="Tensor parallelism size"
+    )
+    parser.add_argument(
+        "--use-fp8-w8a8", action="store_true", help="Enable FP8 W8A8 benchmark"
+    )
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./benchmark_grouped_gemm/",
+        help="Path to save benchmark results",
+    )
+    parser.add_argument(
+        "--verify-correctness",
+        action="store_true",
+        help="Verify correctness before benchmarking",
+    )
+
+    args = parser.parse_args()
+
+    try:
+        model_config = get_model_config(args.model, args.tp_size)
+    except Exception as e:
+        print(f"Failed to get model config: {e}")
+        print("Using default configuration...")
+        model_config = {
+            "num_groups": 8,
+            "hidden_size": 4096,
+            "intermediate_size": 14336,
+            "dtype": torch.bfloat16,
+        }
+
+    print("Running benchmark with:")
+    print(f"  num_groups: {model_config['num_groups']}")
+    print(f"  hidden_size: {model_config['hidden_size']}")
+    print(f"  intermediate_size: {model_config['intermediate_size']}")
+    print(f"  use_fp8_w8a8: {args.use_fp8_w8a8}")
+
+    if args.verify_correctness:
+        if not verify_correctness(model_config):
+            print("Correctness verification failed. Exiting...")
+            return
+
+    try:
+        run_benchmark(
+            model_config=model_config,
+            use_fp8_w8a8=args.use_fp8_w8a8,
+            save_path=args.save_path,
+        )
+    except Exception as e:
+        print(f"Benchmark failed: {e}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/kernels/flashinfer_allreduce_fusion/README.md b/benchmark/kernels/flashinfer_allreduce_fusion/README.md
new file mode 100644
index 000000000..e651604c7
--- /dev/null
+++ b/benchmark/kernels/flashinfer_allreduce_fusion/README.md
@@ -0,0 +1,102 @@
+# FlashInfer Fused AllReduce + RMSNorm Benchmark
+
+This benchmark script is modified from the [original implementation](https://github.com/vllm-project/vllm/blob/237e1fb887c7f5a579420fa0295097f24b006594/benchmarks/kernels/benchmark_fused_collective.py) by the vLLM community. It aims to compare the performance differences between FlashInfer fused operators in SGLang (trtllm_allreduce_fusion: AllReduce + Residual Add + RMSNorm + optional quantization) and conventional implementations (standard `tensor_model_parallel_all_reduce` + separate RMSNorm/quantization). Specifically, this script tests the timing performance of two implementation paths: 1) Standard AllReduce and RMSNorm executed separately; 2) FlashInfer's fused operator combining AllReduce, Residual Add, RMSNorm, and optional quantization operations.
+
+This benchmark script helps us tune the ipc workspace size of the `flashinfer_allreduce_residual_rmsnorm` operator in SGLang and prepare for applications with FP8/FP4 quantized fused operators.
+
+Script path: `benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py`
+
+## Feature Overview
+
+- Compare average execution time (ms) and calculate speedup ratios for the following paths:
+  - standard_allreduce_rmsnorm (Standard AllReduce + RMSNorm)
+  - flashinfer_fused_allreduce_rmsnorm (Fused AllReduce + RMSNorm), including oneshot and twoshot modes
+  - Optionally compare FP8/FP4 quantized fused paths with standard paths
+- Use CUDA Graph capture and batch replay to reduce measurement noise
+- Automatically select the faster "standard baseline" (native/compiled version) as the denominator for speedup calculation
+- Optionally export results in Markdown format
+
+## Runtime Environment and Prerequisites
+
+- At least 2 GPUs, and launch multi-process distributed training using `torchrun` (NCCL backend)
+- Properly install/compile sglang along with sgl-kernel and custom operators
+
+## Quick Start (Command Examples)
+
+The following examples use world_size=2. You can modify `--nproc_per_node` and parameters according to your machine:
+
+- Regular paths only (no quantization):
+```
+torchrun --nproc_per_node=2 \
+benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py \
+--no-quant --hidden-dim 1024 --seq-lens 512 1024 2048 4096 --trials 100
+```
+
+- FP8 quantization paths only:
+```
+torchrun --nproc_per_node=2 \
+benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py \
+--quant-fp8 --hidden-dim 1024 --seq-lens 512 1024 2048 4096 --trials 100
+```
+
+- FP4 quantization paths only:
+```
+torchrun --nproc_per_node=2 \
+benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py \
+--quant-fp4 --hidden-dim 1024 --seq-lens 512 1024 2048 4096 --trials 100
+```
+
+- Larger hidden dimensions:
+```
+torchrun --nproc_per_node=2 \
+benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py \
+--no-quant  --hidden-dim 4096 --seq-lens 512 1024 2048 4096 --trials 100
+```
+
+## Parameter Description
+- `--seq-lens`: List of sequence lengths to test (default: 128 512 1024 2048)
+- `--hidden-dim`: Hidden dimension (default: 8192)
+- `--dtypes`: Data type list, `float16|bfloat16|float32` (default: bfloat16)
+- `--no-residual`: Only test "no residual" scenarios (default tests both "with/without residual")
+- Mutually exclusive quantization options:
+  - `--no-quant`: No quantization testing
+  - `--quant-fp8`: Only FP8 quantization testing
+  - `--quant-fp4`: Only FP4 quantization testing
+  - `--quant-all`: Test all (default)
+- FlashInfer related:
+  - `--disable-oneshot`: Disable oneshot mode (default enables oneshot and tests twoshot simultaneously)
+- Runtime configuration:
+  - `--warmup`: Warmup count before graph capture and before graph replay (default 5)
+  - `--trials`: Benchmark iteration count (default 20; internally each `graph.replay()` will batch replay multiple times)
+  - `--output-file`: Save results as Markdown file (only rank0 takes effect)
+
+## Output Example
+
+Each configuration group prints a table showing average execution time and relative speedup ratios (baseline is the faster standard implementation). For example:
+```
+================================================================================
+Results: seq_len=1024, hidden_dim=1024
+dtype=torch.bfloat16, residual=yes, quant_mode=none
+================================================================================
+Operation                                          Time (ms)    Speedup
+--------------------------------------------------------------------------------
+standard_allreduce_rmsnorm                         0.024        0.98x
+standard_allreduce_rmsnorm_native_compiled         0.023        baseline
+flashinfer_fused_allreduce_rmsnorm_oneshot         0.011        2.19x
+flashinfer_fused_allreduce_rmsnorm_twoshot         0.041        0.57x
+```
+
+If `--output-file` is specified, all configurations will be summarized in Markdown tables in that file.
+
+## Important Notes and Recommendations
+
+- Distributed: The script uses `torchrun` environment variables to initialize distributed training and binds tensors/communication groups to the current rank's corresponding device.
+- World size: Requires `WORLD_SIZE > 1` to perform communication operator benchmarks. Otherwise, the script will error and prompt.
+- FlashInfer:
+  - If not installed or interfaces are missing, the script will only run standard paths and provide prompts in the logs.
+  - The fused operator internally uses "oneshot"/"twoshot" two trigger methods; oneshot is enabled by default and twoshot is tested simultaneously.
+- FP8/FP4:
+  - FP8 uses sglang's FP8 tools and dtype, with underlying platform selection of `e4m3`/`e4m3fnuz` etc.
+  - FP4 uses sgl-kernel's `scaled_fp4_quant`, requiring corresponding platform support.
+- CUDA Graph:
+  - Uses sglang's `graph_capture()` to prepare capture-ready state for communication, then uses `torch.cuda.graph` to capture kernels, reducing measurement jitter.
diff --git a/benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py b/benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py
new file mode 100644
index 000000000..4aebf62b9
--- /dev/null
+++ b/benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py
@@ -0,0 +1,1304 @@
+# Modified from https://github.com/vllm-project/vllm/blob/237e1fb887c7f5a579420fa0295097f24b006594/benchmarks/kernels/benchmark_fused_collective.py
+
+"""
+Benchmark for FlashInfer fused collective operations vs standard operations.
+
+This benchmark compares:
+1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant)
+2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations
+
+Usage with torchrun:
+    torchrun --nproc_per_node=2 benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py --no-quant --hidden-dim 1024 --seq-len 512 1024 2048 4096 --trials 100
+    torchrun --nproc_per_node=2 benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py --quant-fp8 --hidden-dim 1024 --seq-len 512 1024 2048 4096 --trials 100
+    torchrun --nproc_per_node=2 benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py --quant-fp4 --hidden-dim 1024 --seq-len 512 1024 2048 4096 --trials 100
+
+    torchrun --nproc_per_node=2 benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py --no-quant --hidden-dim 4096 --seq-len 512 1024 2048 4096 --trials 100
+    torchrun --nproc_per_node=2 benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py --quant-fp8 --hidden-dim 4096 --seq-len 512 1024 2048 4096 --trials 100
+    torchrun --nproc_per_node=2 benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py --quant-fp4 --hidden-dim 4096 --seq-len 512 1024 2048 4096 --trials 100
+"""
+
+import argparse
+import contextlib
+import itertools
+import logging
+import os
+import time
+from typing import Optional
+
+import torch  # type: ignore
+import torch.distributed as dist  # type: ignore
+
+from sglang.srt.distributed import get_tp_group, tensor_model_parallel_all_reduce
+from sglang.srt.distributed.parallel_state import (
+    cleanup_dist_env_and_memory,
+    graph_capture,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from sglang.srt.layers.layernorm import RMSNorm  # noqa
+from sglang.srt.layers.quantization.fp8_kernel import fp8_dtype as SGLANG_FP8_DTYPE
+from sglang.srt.layers.quantization.fp8_kernel import static_quant_fp8
+
+try:
+    from sgl_kernel import fused_add_rmsnorm as SGL_FUSED_ADD_RMS_NORM
+    from sgl_kernel import rmsnorm as SGL_RMS_NORM
+    from sgl_kernel import scaled_fp4_quant as SGL_SCALED_FP4_QUANT
+except Exception:  # pragma: no cover - fallback on non-supported platforms
+    SGL_FUSED_ADD_RMS_NORM = None
+    SGL_RMS_NORM = None
+    SGL_SCALED_FP4_QUANT = None
+
+FP8_DTYPE = SGLANG_FP8_DTYPE
+
+logger = logging.getLogger(__name__)
+
+# Try to import FlashInfer
+try:
+    import flashinfer.comm as flashinfer_comm  # type: ignore
+
+    if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"):
+        flashinfer_comm = None
+        logger.warning(
+            "FlashInfer comm module found but missing trtllm_allreduce_fusion"
+        )
+except ImportError:
+    flashinfer_comm = None
+    logger.warning("FlashInfer not found, only benchmarking standard operations")
+
+# Constants
+MiB = 1024 * 1024
+
+# FlashInfer max sizes per world size
+# Enable 64MB for 2, 4, 8 world sizes to verify large input sizes
+# use --disable-oneshot to disable oneshot mode for very large input sizes
+_FI_MAX_SIZES = {
+    2: 64 * MiB,  # 64MB
+    4: 64 * MiB,  # 64MB
+    8: 64 * MiB,  # 64MB
+}
+
+# Global workspace tensor for FlashInfer
+_FI_WORKSPACE_TENSOR = None
+
+
+def setup_flashinfer_workspace(
+    world_size: int,
+    rank: int,
+    hidden_dim: int,
+    max_token_num: int,
+    use_fp32_lamport: bool = False,
+):
+    """Setup FlashInfer workspace for fused allreduce operations."""
+    global _FI_WORKSPACE_TENSOR
+
+    if flashinfer_comm is None:
+        return None, None
+
+    if world_size not in _FI_MAX_SIZES:
+        logger.warning("FlashInfer not supported for world size %s", world_size)
+        return None, None
+
+    try:
+        # Create IPC workspace
+        ipc_handles, workspace_tensor = (
+            flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+                tp_rank=rank,
+                tp_size=world_size,
+                max_token_num=max_token_num,
+                hidden_dim=hidden_dim,
+                group=get_tp_group().device_group,
+                use_fp32_lamport=use_fp32_lamport,
+            )
+        )
+
+        _FI_WORKSPACE_TENSOR = workspace_tensor
+        return ipc_handles, workspace_tensor
+    except Exception as e:
+        logger.error("Failed to setup FlashInfer workspace: %s", e)
+        return None, None
+
+
+def cleanup_flashinfer_workspace(ipc_handles):
+    """Cleanup FlashInfer workspace."""
+    if flashinfer_comm is None or ipc_handles is None:
+        return
+
+    try:
+        group = get_tp_group().device_group
+        flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group)
+    except Exception as e:
+        logger.error("Failed to cleanup FlashInfer workspace: %s", e)
+
+
+class FlashInferFusedAllReduceParams:
+    """Parameters for FlashInfer fused allreduce operations."""
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        use_fp32_lamport: bool = False,
+        max_token_num: int = 1024,
+    ):
+        self.rank = rank
+        self.world_size = world_size
+        self.use_fp32_lamport = use_fp32_lamport
+        self.trigger_completion_at_end = True
+        self.launch_with_pdl = True
+        self.fp32_acc = True
+        self.max_token_num = max_token_num
+
+    def get_trtllm_fused_allreduce_kwargs(self):
+        return {
+            "world_rank": self.rank,
+            "world_size": self.world_size,
+            "launch_with_pdl": self.launch_with_pdl,
+            "trigger_completion_at_end": self.trigger_completion_at_end,
+            "fp32_acc": self.fp32_acc,
+        }
+
+
+def flashinfer_fused_allreduce_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    allreduce_params: "FlashInferFusedAllReduceParams",
+    use_oneshot: bool,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """FlashInfer fused allreduce + rmsnorm operation."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
+        allreduce_out=None,
+        quant_out=None,
+        scale_out=None,
+        layout_code=None,
+        scale_factor=None,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    scale_factor: torch.Tensor,
+    allreduce_params: FlashInferFusedAllReduceParams,
+    use_oneshot: bool = True,
+    norm_out: Optional[torch.Tensor] = None,
+    quant_out: Optional[torch.Tensor] = None,
+):
+    """FlashInfer fused allreduce + rmsnorm + FP8 quantization."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant,
+        allreduce_out=None,
+        quant_out=quant_out,
+        scale_out=None,
+        layout_code=None,
+        scale_factor=scale_factor,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    input_global_scale: torch.Tensor,
+    allreduce_params: FlashInferFusedAllReduceParams,
+    quant_out: torch.Tensor,
+    use_oneshot: bool,
+    output_scale: torch.Tensor,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """FlashInfer fused allreduce + rmsnorm + FP4 quantization."""
+    if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None:
+        raise RuntimeError("FlashInfer not available or workspace not initialized")
+
+    if norm_out is None:
+        norm_out = input_tensor
+        residual_out = residual
+    else:
+        residual_out = input_tensor
+
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        token_num=input_tensor.shape[0],
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=input_tensor.shape[-1],
+        workspace_ptrs=_FI_WORKSPACE_TENSOR,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant,
+        allreduce_out=None,
+        quant_out=quant_out,
+        scale_out=output_scale,
+        layout_code=None,
+        scale_factor=input_global_scale,
+        use_oneshot=use_oneshot,
+        **allreduce_params.get_trtllm_fused_allreduce_kwargs(),
+    )
+
+
+def standard_allreduce_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """Standard allreduce + rmsnorm operations."""
+    # All-reduce first
+    allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+    # Then RMS norm
+    if residual is not None:
+        # Fused add + RMS norm (in-place on allreduce_out)
+        if SGL_FUSED_ADD_RMS_NORM is not None:
+            SGL_FUSED_ADD_RMS_NORM(allreduce_out, residual, rms_gamma, rms_eps)
+        else:
+            rms = RMSNorm(allreduce_out.shape[-1], eps=rms_eps)
+            rms.weight.data = rms_gamma
+            rms.forward_native(allreduce_out, residual)
+    else:
+        # Just RMS norm
+        if SGL_RMS_NORM is not None:
+            _ = SGL_RMS_NORM(allreduce_out, rms_gamma, rms_eps)
+        else:
+            rms = RMSNorm(allreduce_out.shape[-1], eps=rms_eps)
+            rms.weight.data = rms_gamma
+            _ = rms.forward_native(allreduce_out)
+
+
+def standard_allreduce_rmsnorm_fp8_quant(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    scale_factor: torch.Tensor,
+    norm_out: Optional[torch.Tensor] = None,
+    quant_out: Optional[torch.Tensor] = None,
+):
+    """Standard allreduce + rmsnorm + FP8 quantization."""
+    # All-reduce first
+    allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+
+    # Then RMS norm + static FP8 quantization
+    if residual is not None:
+        if SGL_FUSED_ADD_RMS_NORM is not None:
+            SGL_FUSED_ADD_RMS_NORM(allreduce_out, residual, rms_gamma, rms_eps)
+            quant_out, _ = static_quant_fp8(
+                allreduce_out, scale_factor, repeat_scale=False
+            )
+        else:
+            rms = RMSNorm(allreduce_out.shape[-1], eps=rms_eps)
+            rms.weight.data = rms_gamma
+            normed, _ = rms.forward_native(allreduce_out, residual)
+            quant_out, _ = static_quant_fp8(normed, scale_factor, repeat_scale=False)
+        return quant_out, residual
+    else:
+        if SGL_RMS_NORM is not None:
+            normed = SGL_RMS_NORM(allreduce_out, rms_gamma, rms_eps)
+        else:
+            rms = RMSNorm(allreduce_out.shape[-1], eps=rms_eps)
+            rms.weight.data = rms_gamma
+            normed = rms.forward_native(allreduce_out)
+        quant_out, _ = static_quant_fp8(normed, scale_factor, repeat_scale=False)
+        return quant_out
+
+
+def standard_allreduce_rmsnorm_fp4_quant(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rms_gamma: torch.Tensor,
+    rms_eps: float,
+    input_global_scale: torch.Tensor,
+    quant_out: torch.Tensor,
+    output_scale: torch.Tensor,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """Standard allreduce + rmsnorm + FP4 quantization."""
+
+    # All-reduce first
+    allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+
+    # Then RMS norm
+    if residual is not None:
+        if SGL_FUSED_ADD_RMS_NORM is not None:
+            SGL_FUSED_ADD_RMS_NORM(allreduce_out, residual, rms_gamma, rms_eps)
+            quant_input = allreduce_out
+        else:
+            rms = RMSNorm(allreduce_out.shape[-1], eps=rms_eps)
+            rms.weight.data = rms_gamma
+            quant_input, _ = rms.forward_native(allreduce_out, residual)
+        residual_out = residual
+    else:
+        if SGL_RMS_NORM is not None:
+            quant_input = SGL_RMS_NORM(allreduce_out, rms_gamma, rms_eps)
+        else:
+            rms = RMSNorm(allreduce_out.shape[-1], eps=rms_eps)
+            rms.weight.data = rms_gamma
+            quant_input = rms.forward_native(allreduce_out)
+        residual_out = allreduce_out
+
+    # Finally FP4 quantization
+    if SGL_SCALED_FP4_QUANT is None:
+        raise RuntimeError("scaled_fp4_quant is not available on this platform")
+    quant_res, output_scale_res = SGL_SCALED_FP4_QUANT(quant_input, input_global_scale)
+    if residual is not None:
+        return quant_res, residual_out, output_scale_res
+    else:
+        return quant_res, quant_input
+
+
+def standard_allreduce_rmsnorm_native(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rmsnorm_layer: RMSNorm,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """Standard allreduce + rmsnorm operations using native RMSNorm forward."""
+    # All-reduce first
+    allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+    # Apply native RMSNorm
+    if residual is not None:
+        result = rmsnorm_layer.forward_native(allreduce_out, residual)
+        return result  # Returns (norm_out, residual_out)
+    else:
+        result = rmsnorm_layer.forward_native(allreduce_out)
+        return result  # Returns norm_out
+
+
+def standard_allreduce_rmsnorm_fp8_quant_native(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rmsnorm_layer: RMSNorm,
+    scale_factor: torch.Tensor,
+    norm_out: Optional[torch.Tensor] = None,
+    quant_out: Optional[torch.Tensor] = None,
+):
+    """Standard allreduce + rmsnorm + FP8 quantization using native implementations."""
+    # All-reduce first
+    allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+
+    # Apply native RMSNorm
+    if residual is not None:
+        norm_out, residual_out = rmsnorm_layer.forward_native(allreduce_out, residual)
+    else:
+        norm_out = rmsnorm_layer.forward_native(allreduce_out)
+        residual_out = allreduce_out
+
+    # Apply native FP8 quantization
+    quant_out, _ = static_quant_fp8(norm_out, scale_factor, repeat_scale=False)
+
+    if residual is not None:
+        return quant_out, residual_out
+    else:
+        return quant_out
+
+
+def standard_allreduce_rmsnorm_fp4_quant_native(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rmsnorm_layer: RMSNorm,
+    input_global_scale: torch.Tensor,
+    quant_out: torch.Tensor,
+    output_scale: torch.Tensor,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """Standard allreduce + rmsnorm + FP4 quantization using native RMSNorm."""
+    # All-reduce first
+    allreduce_out = tensor_model_parallel_all_reduce(input_tensor)
+
+    # Apply native RMSNorm
+    if residual is not None:
+        norm_out, residual_out = rmsnorm_layer.forward_native(allreduce_out, residual)
+        quant_input = norm_out
+    else:
+        norm_out = rmsnorm_layer.forward_native(allreduce_out)
+        quant_input = norm_out
+        residual_out = allreduce_out
+
+    # Apply FP4 quantization (still using fused CUDA op as there's no native FP4)
+    if SGL_SCALED_FP4_QUANT is None:
+        raise RuntimeError("scaled_fp4_quant is not available on this platform")
+    quant_res, output_scale_res = SGL_SCALED_FP4_QUANT(quant_input, input_global_scale)
+
+    if residual is not None:
+        return quant_res, residual_out, output_scale_res
+    else:
+        return quant_res, norm_out
+
+
+# Compiled versions of native functions
+@torch.compile
+def standard_allreduce_rmsnorm_native_compiled(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rmsnorm_layer: RMSNorm,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """Compiled version of standard allreduce + rmsnorm."""
+    return standard_allreduce_rmsnorm_native(
+        input_tensor, residual, rmsnorm_layer, norm_out
+    )
+
+
+@torch.compile
+def standard_allreduce_rmsnorm_fp8_quant_native_compiled(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rmsnorm_layer: RMSNorm,
+    scale_factor: torch.Tensor,
+    norm_out: Optional[torch.Tensor] = None,
+    quant_out: Optional[torch.Tensor] = None,
+):
+    """Compiled version of standard allreduce + rmsnorm + FP8 quantization."""
+    return standard_allreduce_rmsnorm_fp8_quant_native(
+        input_tensor,
+        residual,
+        rmsnorm_layer,
+        scale_factor,
+        norm_out,
+        quant_out,
+    )
+
+
+@torch.compile
+def standard_allreduce_rmsnorm_fp4_quant_native_compiled(
+    input_tensor: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    rmsnorm_layer: RMSNorm,
+    input_global_scale: torch.Tensor,
+    quant_out: torch.Tensor,
+    output_scale: torch.Tensor,
+    norm_out: Optional[torch.Tensor] = None,
+):
+    """Compiled version of standard allreduce + rmsnorm + FP4 quantization."""
+    return standard_allreduce_rmsnorm_fp4_quant_native(
+        input_tensor,
+        residual,
+        rmsnorm_layer,
+        input_global_scale,
+        quant_out,
+        output_scale,
+        norm_out,
+    )
+
+
+def create_test_tensors(
+    seq_len: int, hidden_dim: int, dtype: torch.dtype, use_residual: bool = True
+):
+    """Create test tensors for benchmarking."""
+    input_tensor = torch.randn(seq_len, hidden_dim, dtype=dtype)
+    residual = (
+        torch.randn_like(input_tensor)
+        if use_residual
+        else torch.zeros_like(input_tensor)
+    )
+    rms_gamma = torch.ones(hidden_dim, dtype=dtype)
+    norm_out = None if use_residual else torch.empty_like(input_tensor)
+
+    # Quantization scales
+    scale_fp8 = torch.tensor(1.0, dtype=torch.float32)
+    scale_fp4 = torch.tensor(1.0, dtype=torch.float32)
+    quant_out_fp8 = torch.empty_like(input_tensor, dtype=FP8_DTYPE)
+    # Pre-allocate FP4 output tensors (to avoid allocation overhead in benchmarks)
+    fp4_quant_out = torch.empty((seq_len, hidden_dim // 2), dtype=torch.uint8)
+    fp4_output_scale = torch.empty((128, 4), dtype=torch.int32)
+
+    return (
+        input_tensor,
+        norm_out,
+        residual,
+        rms_gamma,
+        scale_fp8,
+        quant_out_fp8,
+        scale_fp4,
+        fp4_quant_out,
+        fp4_output_scale,
+    )
+
+
+def benchmark_operation(
+    operation_func, *args, warmup: int = 5, trials: int = 20, **kwargs
+):
+    """Benchmark a single operation using CUDA graphs."""
+    # Warmup before graph capture
+    for _ in range(warmup):
+        operation_func(*args, **kwargs)
+    torch.cuda.synchronize()
+
+    # Create CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    num_op_per_cudagraph = 10
+
+    # Use sglang's graph_capture to make tensor_model_parallel_all_reduce graph-safe
+    with graph_capture() as graph_capture_context:
+        with torch.cuda.graph(graph, stream=graph_capture_context.stream):
+            for _ in range(num_op_per_cudagraph):
+                operation_func(*args, **kwargs)
+
+    # Graph warmup
+    torch.cuda.synchronize()
+    for _ in range(warmup):
+        graph.replay()
+
+    # Benchmark with CUDA graph
+    torch.cuda.synchronize()
+    start_time = time.perf_counter()
+
+    for _ in range(trials // num_op_per_cudagraph):
+        # operation_func(*args, **kwargs)
+        graph.replay()
+
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+
+    avg_time_ms = ((end_time - start_time) / trials) * 1000
+    return avg_time_ms
+
+
+def run_benchmarks(
+    seq_len: int,
+    hidden_dim: int,
+    dtype: torch.dtype,
+    use_residual: bool,
+    allreduce_params: Optional[FlashInferFusedAllReduceParams],
+    quant_mode: str = "all",
+    disable_oneshot: bool = False,
+):
+    """Run all benchmarks for given configuration.
+
+    Args:
+        quant_mode: "none", "fp8_only", "fp4_only", or "all"
+    """
+    (
+        input_tensor,
+        norm_out,
+        residual,
+        rms_gamma,
+        scale_fp8,
+        quant_out_fp8,
+        scale_fp4,
+        fp4_quant_out,
+        fp4_output_scale,
+    ) = create_test_tensors(seq_len, hidden_dim, dtype, use_residual)
+
+    rms_eps = 1e-6
+    results = {}
+
+    # Create RMSNorm once for native benchmarks
+    rmsnorm_layer = RMSNorm(hidden_dim, eps=rms_eps)
+    rmsnorm_layer.weight.data = rms_gamma
+
+    if quant_mode in ["all", "none"]:
+        # Standard AllReduce + RMSNorm
+        try:
+            time_ms = benchmark_operation(
+                standard_allreduce_rmsnorm,
+                input_tensor,
+                norm_out=norm_out,
+                residual=residual,
+                rms_gamma=rms_gamma,
+                rms_eps=rms_eps,
+            )
+            results["standard_allreduce_rmsnorm"] = time_ms
+        except Exception as e:
+            logger.error("Standard AllReduce+RMSNorm failed: %s", e)
+            results["standard_allreduce_rmsnorm"] = float("inf")
+
+        # Standard AllReduce + RMSNorm Native Compiled
+        try:
+            time_ms = benchmark_operation(
+                standard_allreduce_rmsnorm_native_compiled,
+                input_tensor,
+                residual=residual,
+                rmsnorm_layer=rmsnorm_layer,
+                norm_out=norm_out,
+            )
+            results["standard_allreduce_rmsnorm_native_compiled"] = time_ms
+        except Exception as e:
+            logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e)
+            results["standard_allreduce_rmsnorm_native_compiled"] = float("inf")
+
+        # FlashInfer Fused AllReduce + RMSNorm Oneshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            try:
+                if not disable_oneshot:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm,
+                        input_tensor,
+                        residual=residual,
+                        norm_out=norm_out,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        allreduce_params=allreduce_params,
+                        use_oneshot=True,
+                    )
+                    results["flashinfer_fused_allreduce_rmsnorm_oneshot"] = time_ms
+            except Exception as e:
+                logger.error("FlashInfer Fused AllReduce+RMSNorm Oneshot failed: %s", e)
+                results["flashinfer_fused_allreduce_rmsnorm_oneshot"] = float("inf")
+
+            # FlashInfer Fused AllReduce + RMSNorm Two-shot
+            try:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm,
+                    input_tensor,
+                    residual=residual,
+                    norm_out=norm_out,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    allreduce_params=allreduce_params,
+                    use_oneshot=False,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_twoshot"] = time_ms
+            except Exception as e:
+                logger.error(
+                    "FlashInfer Fused AllReduce+RMSNorm Two-shot failed: %s", e
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_twoshot"] = float("inf")
+
+    if quant_mode in ["all", "fp8_only"]:
+        # Standard AllReduce + RMSNorm + FP8 Quant
+        try:
+            time_ms = benchmark_operation(
+                standard_allreduce_rmsnorm_fp8_quant,
+                input_tensor,
+                norm_out=norm_out,
+                residual=residual,
+                rms_gamma=rms_gamma,
+                rms_eps=rms_eps,
+                scale_factor=scale_fp8,
+                quant_out=quant_out_fp8,
+            )
+            results["standard_allreduce_rmsnorm_fp8_quant"] = time_ms
+        except Exception as e:
+            logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e)
+            results["standard_allreduce_rmsnorm_fp8_quant"] = float("inf")
+
+        # Standard AllReduce + RMSNorm + FP8 Quant Native Compiled
+        try:
+            time_ms = benchmark_operation(
+                standard_allreduce_rmsnorm_fp8_quant_native_compiled,
+                input_tensor,
+                residual=residual,
+                rmsnorm_layer=rmsnorm_layer,
+                # quant_fp8_layer removed in sglang version; static_quant_fp8 is used within the function
+                scale_factor=scale_fp8,
+                norm_out=norm_out,
+                quant_out=quant_out_fp8,
+            )
+            results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = time_ms
+        except Exception as e:
+            logger.error("Standard AllReduce+RMSNorm+FP8 Native Compiled failed: %s", e)
+            results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = float(
+                "inf"
+            )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            try:
+                if not disable_oneshot:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm_fp8_quant,
+                        input_tensor,
+                        norm_out=norm_out,
+                        residual=residual,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        scale_factor=scale_fp8,
+                        quant_out=quant_out_fp8,
+                        allreduce_params=allreduce_params,
+                        use_oneshot=True,
+                    )
+                    results["flashinfer_fused_allreduce_rmsnorm_fp8_quant_oneshot"] = (
+                        time_ms
+                    )
+            except Exception as e:
+                logger.error(
+                    "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s",
+                    e,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp8_quant_oneshot"] = float(
+                    "inf"
+                )
+            # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Two-shot
+            try:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm_fp8_quant,
+                    input_tensor,
+                    norm_out=norm_out,
+                    residual=residual,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    scale_factor=scale_fp8,
+                    quant_out=quant_out_fp8,
+                    allreduce_params=allreduce_params,
+                    use_oneshot=False,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp8_quant_twoshot"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "FlashInfer Fused AllReduce+RMSNorm+FP8 Two-shot failed: %s",
+                    e,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp8_quant_twoshot"] = float(
+                    "inf"
+                )
+
+    if quant_mode in ["all", "fp4_only"]:
+        # Standard AllReduce + RMSNorm + FP4 Quant
+        try:
+            time_ms = benchmark_operation(
+                standard_allreduce_rmsnorm_fp4_quant,
+                input_tensor,
+                norm_out=norm_out,
+                residual=residual,
+                rms_gamma=rms_gamma,
+                rms_eps=rms_eps,
+                input_global_scale=scale_fp4,
+                quant_out=fp4_quant_out,
+                output_scale=fp4_output_scale,
+            )
+            results["standard_allreduce_rmsnorm_fp4_quant"] = time_ms
+        except Exception as e:
+            logger.error("Standard AllReduce+RMSNorm+FP4 failed: %s", e)
+            results["standard_allreduce_rmsnorm_fp4_quant"] = float("inf")
+
+        # Standard AllReduce + RMSNorm + FP4 Quant Native Compiled
+        try:
+            time_ms = benchmark_operation(
+                standard_allreduce_rmsnorm_fp4_quant_native_compiled,
+                input_tensor,
+                residual=residual,
+                rmsnorm_layer=rmsnorm_layer,
+                input_global_scale=scale_fp4,
+                quant_out=fp4_quant_out,
+                output_scale=fp4_output_scale,
+                norm_out=norm_out,
+            )
+            results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = time_ms
+        except Exception as e:
+            logger.error("Standard AllReduce+RMSNorm+FP4 Native Compiled failed: %s", e)
+            results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = float(
+                "inf"
+            )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            try:
+                if not disable_oneshot:
+                    time_ms = benchmark_operation(
+                        flashinfer_fused_allreduce_rmsnorm_fp4_quant,
+                        input_tensor,
+                        residual=residual,
+                        norm_out=norm_out,
+                        rms_gamma=rms_gamma,
+                        rms_eps=rms_eps,
+                        input_global_scale=scale_fp4,
+                        allreduce_params=allreduce_params,
+                        quant_out=fp4_quant_out,
+                        output_scale=fp4_output_scale,
+                        use_oneshot=True,
+                    )
+                    results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_oneshot"] = (
+                        time_ms
+                    )
+            except Exception as e:
+                logger.error(
+                    "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s",
+                    e,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_oneshot"] = float(
+                    "inf"
+                )
+
+        # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot
+        if flashinfer_comm is not None and allreduce_params is not None:
+            try:
+                time_ms = benchmark_operation(
+                    flashinfer_fused_allreduce_rmsnorm_fp4_quant,
+                    input_tensor,
+                    residual=residual,
+                    norm_out=norm_out,
+                    rms_gamma=rms_gamma,
+                    rms_eps=rms_eps,
+                    input_global_scale=scale_fp4,
+                    allreduce_params=allreduce_params,
+                    quant_out=fp4_quant_out,
+                    output_scale=fp4_output_scale,
+                    use_oneshot=False,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = (
+                    time_ms
+                )
+            except Exception as e:
+                logger.error(
+                    "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s",
+                    e,
+                )
+                results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float(
+                    "inf"
+                )
+
+    return results
+
+
+def prepare_results_with_speedups(results_dict):
+    """Prepare results with speedup calculations based on dynamic baseline selection."""
+    prepared_results = []
+
+    # Determine the fastest baseline for each operation type
+    def get_fastest_baseline(op_name, results_dict):
+        """Get the fastest baseline between standard and native_compiled versions."""
+        if "fp8_quant" in op_name:
+            candidates = [
+                "standard_allreduce_rmsnorm_fp8_quant",
+                "standard_allreduce_rmsnorm_fp8_quant_native_compiled",
+            ]
+        elif "fp4_quant" in op_name:
+            candidates = [
+                "standard_allreduce_rmsnorm_fp4_quant",
+                "standard_allreduce_rmsnorm_fp4_quant_native_compiled",
+            ]
+        else:
+            candidates = [
+                "standard_allreduce_rmsnorm",
+                "standard_allreduce_rmsnorm_native_compiled",
+            ]
+
+        # Find the fastest among available candidates
+        fastest_time = float("inf")
+        fastest_baseline = None
+
+        for candidate in candidates:
+            if (
+                candidate in results_dict
+                and results_dict[candidate] != float("inf")
+                and results_dict[candidate] < fastest_time
+            ):
+                fastest_time = results_dict[candidate]
+                fastest_baseline = candidate
+
+        return fastest_baseline
+
+    # Create dynamic baseline mapping
+    dynamic_baseline_mapping = {}
+    for op_name in results_dict:
+        if (
+            op_name.startswith("flashinfer_")
+            or op_name.startswith("standard_")
+            and not op_name.endswith("_native_compiled")
+        ):
+            dynamic_baseline_mapping[op_name] = get_fastest_baseline(
+                op_name, results_dict
+            )
+
+    for op_name, time_ms in results_dict.items():
+        if time_ms == float("inf"):
+            speedup_str = "FAILED"
+            time_str = "FAILED"
+        else:
+            time_str = f"{time_ms:.3f}"
+            # Find the appropriate baseline for this operation
+            baseline_op = dynamic_baseline_mapping.get(op_name)
+            if baseline_op and baseline_op in results_dict:
+                baseline_time = results_dict[baseline_op]
+                if baseline_time != float("inf") and baseline_time > 0:
+                    speedup = baseline_time / time_ms
+                    speedup_str = f"{speedup:.2f}x"
+                else:
+                    speedup_str = "N/A"
+            else:
+                # For baseline operations, determine if this is the fastest baseline
+                if op_name.endswith("_native_compiled") or (
+                    op_name.startswith("standard_")
+                    and not op_name.endswith("_native_compiled")
+                ):
+                    fastest_baseline = get_fastest_baseline(op_name, results_dict)
+                    if fastest_baseline == op_name:
+                        speedup_str = "baseline"
+                    else:
+                        if fastest_baseline and fastest_baseline in results_dict:
+                            baseline_time = results_dict[fastest_baseline]
+                            if baseline_time != float("inf") and baseline_time > 0:
+                                speedup = baseline_time / time_ms
+                                speedup_str = f"{speedup:.2f}x"
+                            else:
+                                speedup_str = "N/A"
+                        else:
+                            speedup_str = "N/A"
+                else:
+                    speedup_str = "N/A"
+
+        prepared_results.append(
+            {
+                "operation": op_name,
+                "time_ms": time_ms,
+                "time_str": time_str,
+                "speedup_str": speedup_str,
+            }
+        )
+
+    return prepared_results
+
+
+def print_results(results_dict, seq_len, hidden_dim, dtype, use_residual, quant_mode):
+    """Print benchmark results in a formatted table."""
+    print(f"\n{'=' * 80}")
+    print(f"Results: seq_len={seq_len}, hidden_dim={hidden_dim}")
+    print(
+        f"dtype={dtype}, residual={'yes' if use_residual else 'no'}, "
+        f"quant_mode={quant_mode}"
+    )
+    print(f"{'=' * 80}")
+    print(f"{'Operation':<50} {'Time (ms)':<12} {'Speedup':<10}")
+    print(f"{'-' * 80}")
+
+    # Prepare results with speedup calculations
+    prepared_results = prepare_results_with_speedups(results_dict)
+
+    for result in prepared_results:
+        if result["time_ms"] == float("inf"):
+            time_display = result["time_str"]
+        else:
+            time_display = f"{result['time_ms']:.3f}"
+
+        print(
+            f"{result['operation']:<50} {time_display:<12} {result['speedup_str']:<10}"
+        )
+
+
+def format_results_markdown(
+    all_results: list[dict], world_size: int, args: argparse.Namespace
+) -> str:
+    """Format all benchmark results as markdown."""
+    markdown = f"""# FlashInfer Fused Collective Operations Benchmark Results
+
+**World Size:** {world_size}
+**Hidden Dimension:** {args.hidden_dim}
+**Warmup Iterations:** {args.warmup}
+**Benchmark Trials:** {args.trials}
+**Quantization Mode:** {all_results[0]["quant_mode"] if all_results else "N/A"}
+
+---
+
+"""
+
+    for result in all_results:
+        seq_len = result["seq_len"]
+        dtype = result["dtype"]
+        use_residual = result["use_residual"]
+        results_dict = result["results"]
+
+        residual_str = "with residual" if use_residual else "no residual"
+
+        markdown += f"""
+## Configuration: seq_len={seq_len}, dtype={dtype}, {residual_str}
+
+| Operation | Time (ms) | Speedup |
+|-----------|-----------|---------|
+"""
+
+        # Prepare results with speedup calculations
+        prepared_results = prepare_results_with_speedups(results_dict)
+
+        for result in prepared_results:
+            # Format operation name for better readability
+            formatted_op_name = result["operation"].replace("_", " ").title()
+            markdown += f"| {formatted_op_name} | {result['time_str']} |"
+            markdown += f"{result['speedup_str']} |\n"
+
+        markdown += "\n"
+
+    return markdown
+
+
+def save_results_to_file(
+    all_results: list[dict], world_size: int, args: argparse.Namespace, rank: int
+):
+    """Save benchmark results to markdown file (only on rank 0)."""
+    if rank != 0:
+        return
+
+    if not all_results:
+        logger.warning("No results to save")
+        return
+
+    output_path = args.output_file
+
+    try:
+        markdown_content = format_results_markdown(all_results, world_size, args)
+
+        with open(output_path, "w") as f:
+            f.write(markdown_content)
+
+    except Exception as e:
+        logger.error("Failed to save results to file: %s", e)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Benchmark fused collective operations"
+    )
+    parser.add_argument(
+        "--seq-lens",
+        type=int,
+        nargs="+",
+        default=[128, 512, 1024, 2048],
+        help="Sequence lengths to test",
+    )
+    parser.add_argument(
+        "--hidden-dim", type=int, default=8192, help="Hidden dimension size"
+    )
+    parser.add_argument(
+        "--dtypes",
+        type=str,
+        nargs="+",
+        default=["bfloat16"],
+        choices=["float16", "bfloat16", "float32"],
+        help="Data types to test",
+    )
+    parser.add_argument(
+        "--no-residual",
+        action="store_true",
+        help="Skip residual connection tests",
+    )
+
+    # Quantization mode options (mutually exclusive with --no-quant)
+    quant_group = parser.add_mutually_exclusive_group()
+    quant_group.add_argument(
+        "--no-quant", action="store_true", help="Skip all quantization tests"
+    )
+    quant_group.add_argument(
+        "--quant-fp8", action="store_true", help="Only run FP8 quantization tests"
+    )
+    quant_group.add_argument(
+        "--quant-fp4", action="store_true", help="Only run FP4 quantization tests"
+    )
+    quant_group.add_argument(
+        "--quant-all",
+        action="store_true",
+        help="Run all quantization tests (default)",
+    )
+
+    parser.add_argument(
+        "--disable-oneshot",
+        action="store_true",
+        help="Disable oneshot mode for FlashInfer operations",
+    )
+    parser.add_argument(
+        "--warmup", type=int, default=5, help="Number of warmup iterations"
+    )
+    parser.add_argument(
+        "--trials", type=int, default=20, help="Number of benchmark trials"
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        help="""Output file path for markdown results
+                (default: benchmark_results_<timestamp>.md)
+        """,
+    )
+
+    args = parser.parse_args()
+
+    # Check if running with torchrun (required for collective operations)
+    if "RANK" not in os.environ or "WORLD_SIZE" not in os.environ:
+        raise RuntimeError(
+            "Must run with torchrun for distributed benchmarking. "
+            "Example: torchrun --nproc_per_node=2 benchmark_fused_collective.py"
+        )
+
+    # Initialize distributed environment
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+
+    init_distributed_environment(
+        world_size=world_size,
+        rank=rank,
+        local_rank=rank,
+        backend="nccl",
+    )
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # Validate world size (must be > 1 for collective operations)
+    if world_size <= 1:
+        raise ValueError(
+            "World size must be > 1 for collective operations benchmarking. "
+            f"Current world size: {world_size}. Use torchrun with --nproc_per_node > 1."
+        )
+
+    # Determine quantization mode
+    if args.no_quant:
+        quant_mode = "none"
+    elif args.quant_fp8:
+        quant_mode = "fp8_only"
+    elif args.quant_fp4:
+        quant_mode = "fp4_only"
+    else:  # args.quant_all or default
+        quant_mode = "all"
+
+    if rank == 0:
+        logger.info("Running benchmark with world_size=%s, rank=%s", world_size, rank)
+        logger.info("Quantization mode: %s", quant_mode)
+        if flashinfer_comm is not None:
+            oneshot_status = "enabled" if not args.disable_oneshot else "disabled"
+            logger.info(
+                "FlashInfer available - will benchmark fused operations (oneshot: %s)",
+                oneshot_status,
+            )
+        else:
+            logger.info(
+                "FlashInfer not available - only benchmarking standard operations"
+            )
+
+    # Convert dtype strings to torch dtypes
+    dtype_map = {
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+        "float32": torch.float32,
+    }
+    dtypes = [dtype_map[dt] for dt in args.dtypes]
+
+    # Test configurations
+    residual_options = [True] if not args.no_residual else [False]
+    if not args.no_residual:
+        residual_options.append(False)
+
+    configs = list(itertools.product(args.seq_lens, dtypes, residual_options))
+
+    # Setup FlashInfer workspace if available
+    ipc_handles = None
+    allreduce_params = None
+
+    if flashinfer_comm is not None:
+        # Use the largest hidden dimension for workspace setup
+        max_num_token = _FI_MAX_SIZES.get(world_size) // (
+            args.hidden_dim * world_size * 2
+        )
+
+        ipc_handles, workspace_tensor = setup_flashinfer_workspace(
+            world_size, rank, args.hidden_dim, max_num_token
+        )
+
+        if workspace_tensor is not None:
+            allreduce_params = FlashInferFusedAllReduceParams(
+                rank=rank,
+                world_size=world_size,
+                max_token_num=max_num_token,
+            )
+
+    # Collect all results for markdown export
+    all_results = []
+
+    try:
+        # Run benchmarks
+        for seq_len, dtype, use_residual in configs:
+            if rank == 0:
+                logger.info(
+                    "\nTesting:  seq_len=%s, hidden_dim=%s, dtype=%s, residual=%s",
+                    seq_len,
+                    args.hidden_dim,
+                    dtype,
+                    use_residual,
+                )
+
+            results = run_benchmarks(
+                seq_len,
+                args.hidden_dim,
+                dtype,
+                use_residual,
+                allreduce_params,
+                quant_mode=quant_mode,
+                disable_oneshot=args.disable_oneshot,
+            )
+
+            # Store results for markdown export
+            if rank == 0:
+                all_results.append(
+                    {
+                        "seq_len": seq_len,
+                        "hidden_dim": args.hidden_dim,
+                        "dtype": str(dtype).replace("torch.", ""),
+                        "use_residual": use_residual,
+                        "quant_mode": quant_mode,
+                        "results": results,
+                    }
+                )
+
+                print_results(
+                    results,
+                    seq_len,
+                    args.hidden_dim,
+                    dtype,
+                    use_residual,
+                    quant_mode,
+                )
+
+        # Save results to markdown file
+        if args.output_file and rank == 0:
+            save_results_to_file(all_results, world_size, args, rank)
+
+    finally:
+        # Cleanup
+        if ipc_handles is not None:
+            cleanup_flashinfer_workspace(ipc_handles)
+
+        with contextlib.suppress(Exception):
+            dist.barrier()
+        cleanup_dist_env_and_memory(shutdown_ray=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/kernels/fused_moe_triton/README.md b/benchmark/kernels/fused_moe_triton/README.md
new file mode 100644
index 000000000..48598854a
--- /dev/null
+++ b/benchmark/kernels/fused_moe_triton/README.md
@@ -0,0 +1,76 @@
+## Tuning Triton MoE Kernels
+
+This directory contains benchmarking tools for MoE (Mixture of Experts) kernels.
+
+### Tuning Tool
+
+- `tuning_fused_moe_triton.py`: A tool for tuning the `fused_moe_triton` kernel. Adapted from [vllm's benchmark_moe.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py), with added support for various model architectures.
+
+Example usage:
+```bash
+# Tune Mixtral-8x7B with default settings
+python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
+    --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
+    --tune
+
+# Tune Qwen2-57B with FP8 and TP=4
+python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
+    --model Qwen/Qwen2-57B-A14B-Instruct \
+    --tp-size 4 \
+    --dtype fp8_w8a8 \
+    --tune
+
+# Tune Qwen3-235B-A22B-FP8 and TP=4
+python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
+    --model Qwen/Qwen3-235B-A22B-FP8 \
+    --tp-size 4 \
+    --dtype fp8_w8a8 \
+    --tune
+
+# Tune DeepSeek-V3 with FP8 and TP=8
+python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
+    --model deepseek-ai/DeepSeek-V3-0324 \
+    --tp-size 8 \
+    --dtype fp8_w8a8 \
+    --tune
+
+# Tune DeepSeek-R1 with channel-wise INT8 and TP=16
+python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
+    --model meituan/DeepSeek-R1-Channel-INT8 \
+    --tp-size 16 \
+    --dtype int8_w8a8 \
+    --tune
+```
+
+After tuning, a configuration file (e.g., `E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json`) will be generated in the current directory. You can move this file to `sglang/srt/layers/fused_moe_triton/configs/triton_version` dir to use it in `sglang`.
+
+### Performance Comparison Tool
+
+- `benchmark_vllm_vs_sglang_fused_moe_triton.py`: A tool for comparing the performance of fused MoE kernels between vllm and sglang implementations. Supports various model architectures and data types.
+
+Example usage:
+```bash
+# Compare with default settings (Mixtral model)
+python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
+
+# Compare with FP8 mode for Qwen2-57B
+python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py \
+    --model Qwen/Qwen2-57B-A14B-Instruct \
+    --use-fp8-w8a8
+
+# Compare with custom TP size
+python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py \
+    --model deepseek-ai/DeepSeek-V3-0324 \
+    --tp-size 8
+
+# Compare with custom TP size
+python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py \
+    --model deepseek-ai/DeepSeek-V3-0324 \
+    --tp-size 8
+```
+
+The benchmark results will be saved as plots and data files in the specified output directory (default: `./configs/benchmark_ops/vllm_sglang_fused_moe/`).
+
+- `benchmark_torch_compile_fused_moe.py`: A tool for benchmarking the performance of the fused MoE kernel with `torch.compile` and original fused MoE kernel.
+
+Usage is the same as `benchmark_vllm_vs_sglang_fused_moe_triton.py`, note that `torch.compile` does not support `fp8_w8a8` and `int8_w8a8` fused_moe_kernel.
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py
new file mode 100644
index 000000000..7621628c1
--- /dev/null
+++ b/benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py
@@ -0,0 +1,292 @@
+# python3 benchmark/kernels/fused_moe_triton/sglang_fused_moe_triton.py --model /DeepSeek-V3/ --tp-size 8
+import argparse
+
+import torch
+import triton
+from transformers import AutoConfig
+
+from sglang.srt.distributed.parallel_state import (
+    destroy_distributed_environment,
+    destroy_model_parallel,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe as fused_moe_sglang,
+)
+from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+    triton_kernel_moe_forward,
+)
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK, TopKConfig, select_experts
+
+
+def get_model_config(model_name: str, tp_size: int):
+    """Get model configuration parameters"""
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+    if config.architectures[0] == "Qwen2MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Qwen3MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] in [
+        "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "Glm4MoeForCausalLM",
+    ]:
+        E = (
+            config.n_routed_experts + 1
+            if config.architectures[0] in ["DeepseekV3ForCausalLM"]
+            else config.n_routed_experts
+        )
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    else:
+        # Default: Mixtral
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+
+    block_shape = None
+    if (
+        hasattr(config, "quantization_config")
+        and "weight_block_size" in config.quantization_config
+    ):
+        block_shape = config.quantization_config["weight_block_size"]
+        assert len(block_shape) == 2
+
+    shape_configs = {
+        "num_experts": E,
+        "topk": topk,
+        "hidden_size": config.hidden_size,
+        "shard_intermediate_size": shard_intermediate_size,
+        "dtype": config.torch_dtype,
+        "block_shape": block_shape,
+    }
+    print(f"{shape_configs=}")
+    return shape_configs
+
+
+def fused_moe_triton_api(
+    x,
+    w1,
+    w2,
+    input_gating,
+    topk,
+):
+    topk_op = TopK(
+        top_k=topk,
+        renormalize=False,
+        use_grouped_topk=False,
+    )
+    topk_op.use_triton_kernels = True
+    triton_topk_output = topk_op.forward_cuda(
+        hidden_states=x,
+        router_logits=input_gating,
+    )
+
+    moe_runner_config = MoeRunnerConfig(
+        inplace=False,
+    )
+    return triton_kernel_moe_forward(
+        x,
+        w1,
+        w2,
+        triton_topk_output,
+        moe_runner_config,
+    )
+
+
+def fused_moe_sglang_api(
+    x,
+    w1,
+    w2,
+    input_gating,
+    topk,
+    use_fp8_w8a8=False,
+    w1_scale=None,
+    w2_scale=None,
+    a1_scale=None,
+    a2_scale=None,
+    block_shape=None,
+):
+    topk_output = select_experts(
+        hidden_states=x,
+        router_logits=input_gating,
+        topk_config=TopKConfig(top_k=topk, renormalize=False),
+    )
+    return fused_moe_sglang(
+        x,
+        w1,
+        w2,
+        topk_output,
+        use_fp8_w8a8=use_fp8_w8a8,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+    )
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=list([128, 256, 512, 1024, 2048, 4096, 8192]),
+        line_arg="provider",
+        line_vals=[
+            "sglang_fused_moe_triton_v340",
+            "sglang_fused_moe_triton",
+        ],
+        line_names=[
+            "sglang_fused_moe_triton_v340",
+            "sglang_fused_moe_triton",
+        ],
+        styles=[
+            ("blue", "-"),
+            ("green", "-"),
+        ],
+        ylabel="Time (ms)",
+        plot_name="fused-moe-performance",
+        args={},
+    )
+)
+def benchmark(
+    batch_size,
+    provider,
+    model_config,
+    use_fp8_w8a8=False,
+    use_cuda_graph: bool = False,
+):
+    print(f"benchmark {provider} with batch_size={batch_size}")
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+
+    num_tokens = batch_size
+    num_experts = model_config["num_experts"]
+    hidden_size = model_config["hidden_size"]
+    shard_intermediate_size = model_config["shard_intermediate_size"]
+    topk = model_config["topk"]
+    dtype = model_config["dtype"]
+    block_shape = model_config["block_shape"]
+
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+
+    w1 = torch.randn(num_experts, shard_intermediate_size, hidden_size, dtype=dtype)
+    w2 = torch.randn(
+        num_experts, hidden_size, shard_intermediate_size // 2, dtype=dtype
+    )
+
+    w1_tri = w1.clone()
+    w2_tri = w2.clone()
+    w1_tri = w1_tri.transpose(-2, -1).contiguous()
+    w2_tri = w2_tri.transpose(-2, -1).contiguous()
+
+    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+
+    if provider == "sglang_fused_moe_triton_v340":
+        api_func = fused_moe_triton_api
+        api_kwargs = {
+            "x": x,
+            "w1": w1_tri,
+            "w2": w2_tri,
+            "input_gating": input_gating,
+            "topk": topk,
+        }
+    else:
+        api_func = fused_moe_sglang_api
+        api_kwargs = {
+            "x": x,
+            "w1": w1,
+            "w2": w2,
+            "input_gating": input_gating,
+            "topk": topk,
+            "use_fp8_w8a8": use_fp8_w8a8,
+            "block_shape": block_shape,
+        }
+
+    # Warmup
+    for _ in range(10):
+        _ = api_func(**api_kwargs)
+    torch.cuda.synchronize()
+
+    if use_cuda_graph:
+        stream = torch.cuda.Stream()
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph, stream=stream):
+            api_func(**api_kwargs)
+        torch.cuda.synchronize()
+
+        bench_lambda = lambda: graph.replay()
+    else:
+        bench_lambda = lambda: api_func(**api_kwargs)
+
+    quantiles = [0.5, 0.2, 0.8]
+    ms, min_ms, max_ms = triton.testing.do_bench(bench_lambda, quantiles=quantiles)
+    return ms, min_ms, max_ms
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
+    )
+    parser.add_argument("--tp-size", type=int, default=2)
+    parser.add_argument("--use-fp8-w8a8", action="store_true")
+    parser.add_argument(
+        "--use-cuda-graph", action="store_true", help="Enable CUDA Graph capture/replay"
+    )
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/benchmark_ops/sglang_fused_moe/",
+    )
+    parser.add_argument("--trust-remote-code", action="store_true")
+    args = parser.parse_args()
+
+    try:
+        if not torch.distributed.is_initialized():
+            torch.distributed.init_process_group(
+                backend="nccl" if torch.cuda.is_available() else "gloo",
+                init_method="tcp://127.0.0.1:23456",
+                world_size=1,
+                rank=0,
+            )
+
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method="tcp://127.0.0.1:23456",
+            local_rank=0,
+            backend="nccl" if torch.cuda.is_available() else "gloo",
+        )
+
+        initialize_model_parallel(
+            tensor_model_parallel_size=1,
+            pipeline_model_parallel_size=1,
+        )
+
+        model_config = get_model_config(args.model, args.tp_size)
+        benchmark.run(
+            show_plots=True,
+            print_data=True,
+            save_path=args.save_path,
+            model_config=model_config,
+            use_fp8_w8a8=args.use_fp8_w8a8,
+            use_cuda_graph=args.use_cuda_graph,
+        )
+    finally:
+        destroy_model_parallel()
+        destroy_distributed_environment()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_sum_scale.py b/benchmark/kernels/fused_moe_triton/benchmark_sum_scale.py
new file mode 100644
index 000000000..979d2bbd1
--- /dev/null
+++ b/benchmark/kernels/fused_moe_triton/benchmark_sum_scale.py
@@ -0,0 +1,202 @@
+import torch
+import triton
+import triton.language as tl
+from triton.testing import do_bench
+
+
+@triton.jit
+def _moe_sum_reduce_kernel(
+    input_ptr,
+    input_stride_0,
+    input_stride_1,
+    input_stride_2,
+    output_ptr,
+    output_stride_0,
+    output_stride_1,
+    token_num: int,
+    topk_num: int,
+    hidden_dim: int,
+    routed_scaling_factor: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DIM: tl.constexpr,
+    NUM_STAGE: tl.constexpr,
+):
+    input_stride_0 = tl.cast(input_stride_0, dtype=tl.int64)
+    input_stride_1 = tl.cast(input_stride_1, dtype=tl.int64)
+    output_stride_0 = tl.cast(output_stride_0, dtype=tl.int64)
+
+    token_block_id = tl.program_id(0)
+    dim_block_id = tl.program_id(1)
+
+    offs_token = token_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_dim = dim_block_id * BLOCK_DIM + tl.arange(0, BLOCK_DIM)
+
+    mask_token = offs_token < token_num
+    mask_dim = offs_dim < hidden_dim
+
+    base_ptrs = input_ptr + offs_token[:, None] * input_stride_0 + offs_dim[None, :]
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_DIM), dtype=tl.float32)
+
+    for i in tl.range(0, topk_num, num_stages=NUM_STAGE):
+        tile = tl.load(
+            base_ptrs + i * input_stride_1,
+            mask=mask_token[:, None] & mask_dim[None, :],
+            other=0.0,
+        )
+        accumulator += tile.to(tl.float32)
+    accumulator *= routed_scaling_factor
+
+    # -------- Write back --------
+    store_ptrs = output_ptr + offs_token[:, None] * output_stride_0 + offs_dim[None, :]
+    tl.store(
+        store_ptrs,
+        accumulator.to(input_ptr.dtype.element_ty),
+        mask=mask_token[:, None] & mask_dim[None, :],
+    )
+
+
+# _moe_sum_reduce_kernel kernel modified from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/fused_moe/moe_sum_reduce.py
+def moe_sum_reduce(
+    input: torch.Tensor, output: torch.Tensor, routed_scaling_factor: float
+):
+    assert input.is_contiguous()
+    assert output.is_contiguous()
+
+    token_num, topk_num, hidden_dim = input.shape
+    assert output.shape[0] == token_num and output.shape[1] == hidden_dim
+
+    BLOCK_M = 1
+    BLOCK_DIM = 2048
+    NUM_STAGE = 1
+    num_warps = 16
+
+    grid = (
+        triton.cdiv(token_num, BLOCK_M),
+        triton.cdiv(hidden_dim, BLOCK_DIM),
+    )
+
+    _moe_sum_reduce_kernel[grid](
+        input,
+        *input.stride(),
+        output,
+        *output.stride(),
+        token_num=token_num,
+        topk_num=topk_num,
+        hidden_dim=hidden_dim,
+        routed_scaling_factor=routed_scaling_factor,
+        BLOCK_M=BLOCK_M,
+        BLOCK_DIM=BLOCK_DIM,
+        NUM_STAGE=NUM_STAGE,
+        num_warps=num_warps,
+    )
+    return
+
+
+def compute_sum_scaled_baseline(
+    x: torch.Tensor, out: torch.Tensor, routed_scaling_factor: float
+) -> torch.Tensor:
+    torch.sum(x, dim=1, out=out)
+    out.mul_(routed_scaling_factor)
+    return out
+
+
+@torch.compile
+def compute_sum_scaled_compiled(
+    x: torch.Tensor, out: torch.Tensor, routed_scaling_factor: float
+) -> torch.Tensor:
+    torch.sum(x * routed_scaling_factor, dim=1, out=out)
+    return out
+
+
+def get_benchmark():
+    num_tokens_range = [2**i for i in range(0, 13)]
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["num_tokens"],
+            x_vals=num_tokens_range,
+            line_arg="version",
+            line_vals=["baseline", "compiled", "triton"],
+            line_names=["Original", "TorchCompile", "TritonKernel"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name="sum_scaled_performance",
+            args={},
+        )
+    )
+    def benchmark(num_tokens, version):
+        topk = 9
+        hidden_size = 4096
+        dtype = torch.bfloat16
+        scaling_factor = 0.3
+
+        x = torch.randn(num_tokens, topk, hidden_size, dtype=dtype, device="cuda")
+        out = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda")
+
+        # Warmup
+        for _ in range(3):
+            if version == "baseline":
+                compute_sum_scaled_baseline(x, out, scaling_factor)
+            elif version == "compiled":
+                compute_sum_scaled_compiled(x, out, scaling_factor)
+            else:
+                moe_sum_reduce(x, out, scaling_factor)
+
+        # Benchmark
+        quantiles = [0.5, 0.2, 0.8]
+        if version == "baseline":
+            ms, min_ms, max_ms = do_bench(
+                lambda: compute_sum_scaled_baseline(x, out, scaling_factor),
+                quantiles=quantiles,
+            )
+        elif version == "compiled":
+            ms, min_ms, max_ms = do_bench(
+                lambda: compute_sum_scaled_compiled(x, out, scaling_factor),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = do_bench(
+                lambda: moe_sum_reduce(x, out, scaling_factor), quantiles=quantiles
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+def verify_correctness(num_tokens=1024):
+    x = torch.randn(num_tokens, 9, 4096, device="cuda", dtype=torch.bfloat16)
+    scaling_factor = 0.3
+
+    out_baseline = torch.empty_like(x[:, 0])
+    compute_sum_scaled_baseline(x, out_baseline, scaling_factor)
+
+    out_compiled = torch.empty_like(out_baseline)
+    compute_sum_scaled_compiled(x, out_compiled, scaling_factor)
+
+    out_triton = torch.empty_like(out_baseline)
+    moe_sum_reduce(x, out_triton, scaling_factor)
+
+    if torch.allclose(
+        out_baseline, out_compiled, atol=1e-2, rtol=1e-2
+    ) and torch.allclose(out_baseline, out_triton, atol=1e-2, rtol=1e-2):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+        print(
+            f"Baseline vs Compiled: {(out_baseline - out_compiled).abs().max().item()}"
+        )
+        print(f"Baseline vs Triton: {(out_baseline - out_triton).abs().max().item()}")
+
+
+if __name__ == "__main__":
+    print("Running correctness verification...")
+    verify_correctness()
+
+    print("\nRunning performance benchmark...")
+    benchmark = get_benchmark()
+    benchmark.run(
+        print_data=True,
+        # save_path="./configs/benchmark_ops/sum_scaled/"
+    )
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
new file mode 100644
index 000000000..2b4faa24b
--- /dev/null
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -0,0 +1,305 @@
+# python3 benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py --model /DeepSeek-V3/ --tp-size 8 --use-fp8-w8a8
+import argparse
+
+import torch
+import triton
+from torch.nn import functional as F
+from transformers import AutoConfig
+
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe as fused_moe_triton,
+)
+from sglang.srt.model_executor.cuda_graph_runner import set_torch_compile_config
+
+
+def get_model_config(model_name: str, tp_size: int):
+    """Get model configuration parameters"""
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+    if config.architectures[0] == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+        intermediate_size = config.ffn_config.ffn_hidden_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Qwen2MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Qwen3MoeForCausalLM":
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        E = config.text_config.num_local_experts
+        topk = config.text_config.num_experts_per_tok
+        intermediate_size = config.text_config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] in [
+        "Grok1ForCausalLM",
+        "Grok1ImgGen",
+        "Grok1AForCausalLM",
+    ]:
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    else:
+        # Default: Mixtral
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+
+    shape_configs = {
+        "num_experts": E,
+        "topk": topk,
+        "hidden_size": config.hidden_size,
+        "shard_intermediate_size": shard_intermediate_size,
+        "dtype": config.torch_dtype,
+    }
+    print(f"{shape_configs=}")
+    return shape_configs
+
+
+def fused_topk_native(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    M, _ = hidden_states.shape
+    topk_weights = torch.empty(
+        M, topk, dtype=torch.float32, device=hidden_states.device
+    )
+    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+    topk_weights = F.softmax(gating_output.float(), dim=-1)
+    topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
+@torch.compile(dynamic=False)
+def fused_moe_torch(
+    x,
+    w1,
+    w2,
+    input_gating,
+    topk,
+    use_fp8_w8a8=False,
+    w1_scale=None,
+    w2_scale=None,
+    a1_scale=None,
+    a2_scale=None,
+) -> torch.Tensor:
+    assert not use_fp8_w8a8, "Fp8_w8a8 fused_moe is not supported for torch compile"
+
+    topk_weights, topk_ids = fused_topk_native(
+        hidden_states=x,
+        gating_output=input_gating,
+        topk=topk,
+        renormalize=True,
+    )
+    w13_weights = w1[topk_ids]
+    w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+    w2_weights = w2[topk_ids]
+    x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
+    x1 = F.silu(x1)
+    x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+    expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
+
+
+def fused_moe_torch_compile(
+    x,
+    w1,
+    w2,
+    input_gating,
+    topk,
+    use_fp8_w8a8=False,
+    w1_scale=None,
+    w2_scale=None,
+    a1_scale=None,
+    a2_scale=None,
+):
+    return fused_moe_torch(
+        x,
+        w1,
+        w2,
+        input_gating,
+        topk,
+        use_fp8_w8a8=use_fp8_w8a8,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+    )
+
+
+def fused_moe_sglang_api(
+    x,
+    w1,
+    w2,
+    input_gating,
+    topk,
+    use_fp8_w8a8=False,
+    w1_scale=None,
+    w2_scale=None,
+    a1_scale=None,
+    a2_scale=None,
+):
+    return fused_moe_triton(
+        x,
+        w1,
+        w2,
+        input_gating,
+        topk,
+        renormalize=True,
+        inplace=True,
+        use_fp8_w8a8=use_fp8_w8a8,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+    )
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=list(range(1, 5)),
+        line_arg="provider",
+        line_vals=[
+            "fused_moe_triton",
+            "fused_moe_torch_compile",
+        ],
+        line_names=[
+            "fused_moe_triton",
+            "fused_moe_torch_compile",
+        ],
+        styles=[
+            ("blue", "-"),
+            ("green", "-"),
+        ],
+        ylabel="Time (ms)",
+        plot_name="fused-moe-performance",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, model_config, use_fp8_w8a8=False):
+    print(f"benchmark {provider} with batch_size={batch_size}")
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+    set_torch_compile_config()
+
+    num_tokens = batch_size
+    num_experts = model_config["num_experts"]
+    hidden_size = model_config["hidden_size"]
+    shard_intermediate_size = model_config["shard_intermediate_size"]
+    topk = model_config["topk"]
+    dtype = model_config["dtype"]
+
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+
+    if use_fp8_w8a8:
+        init_dtype = dtype
+        w1 = torch.randn(
+            num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype
+        )
+        w2 = torch.randn(
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype
+        )
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+        w1_scale = torch.randn(num_experts, dtype=torch.float32)
+        w2_scale = torch.randn(num_experts, dtype=torch.float32)
+        a1_scale = torch.randn(1, dtype=torch.float32)
+        a2_scale = torch.randn(1, dtype=torch.float32)
+    else:
+        w1 = torch.randn(num_experts, shard_intermediate_size, hidden_size, dtype=dtype)
+        w2 = torch.randn(
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=dtype
+        )
+        w1_scale = w2_scale = a1_scale = a2_scale = None
+
+    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+
+    # Warmup
+    api_func = (
+        fused_moe_torch_compile
+        if provider == "fused_moe_torch_compile"
+        else fused_moe_sglang_api
+    )
+    for _ in range(10):
+        y = api_func(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+        )
+    torch.cuda.synchronize()
+
+    quantiles = [0.5, 0.2, 0.8]
+    ms, min_ms, max_ms = triton.testing.do_bench(
+        lambda: api_func(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+        )[0],
+        quantiles=quantiles,
+    )
+    return ms, min_ms, max_ms
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
+    )
+    parser.add_argument("--tp-size", type=int, default=2)
+    parser.add_argument("--use-fp8-w8a8", action="store_true")
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/benchmark_ops/fused_moe_torch_compile/",
+    )
+    args = parser.parse_args()
+
+    model_config = get_model_config(args.model, args.tp_size)
+    benchmark.run(
+        show_plots=True,
+        print_data=True,
+        save_path=args.save_path,
+        model_config=model_config,
+        use_fp8_w8a8=args.use_fp8_w8a8,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
new file mode 100644
index 000000000..6afd7f354
--- /dev/null
+++ b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
@@ -0,0 +1,349 @@
+# python3 benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py --model /DeepSeek-V3/ --tp-size 8 --use-fp8-w8a8
+import argparse
+
+import torch
+import triton
+import vllm
+from transformers import AutoConfig
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe as fused_moe_vllm
+
+from sglang.srt.distributed.parallel_state import (
+    destroy_distributed_environment,
+    destroy_model_parallel,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe as fused_moe_sglang,
+)
+
+
+def get_model_config(model_name: str, tp_size: int):
+    """Get model configuration parameters"""
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+
+    if config.architectures[0] == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+        intermediate_size = config.ffn_config.ffn_hidden_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Qwen2MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Qwen3MoeForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] in [
+        "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "Glm4MoeForCausalLM",
+    ]:
+        E = (
+            config.n_routed_experts + 1
+            if config.architectures[0] in ["DeepseekV3ForCausalLM"]
+            else config.n_routed_experts
+        )
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        E = config.text_config.num_local_experts
+        topk = config.text_config.num_experts_per_tok
+        intermediate_size = config.text_config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] in [
+        "Grok1ForCausalLM",
+        "Grok1ImgGen",
+        "Grok1AForCausalLM",
+    ]:
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+    else:
+        # Default: Mixtral
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
+
+    vllm_version_num = (
+        vllm.__version_tuple__[0] * 100
+        + vllm.__version_tuple__[1] * 10
+        + vllm.__version_tuple__[2]
+    )
+    block_shape = None
+    if (
+        hasattr(config, "quantization_config")
+        and "weight_block_size" in config.quantization_config
+    ):
+        block_shape = config.quantization_config["weight_block_size"]
+        assert len(block_shape) == 2
+        assert (
+            vllm_version_num >= 66
+        ), "Block-wise quantized fp8 fused_moe is only supported for VLLM>=0.6.6.post1"
+
+    shape_configs = {
+        "num_experts": E,
+        "topk": topk,
+        "hidden_size": config.hidden_size,
+        "shard_intermediate_size": shard_intermediate_size,
+        "dtype": config.torch_dtype,
+        "block_shape": block_shape,
+    }
+    print(f"{shape_configs=}")
+    return shape_configs
+
+
+def fused_moe_vllm_api(
+    x,
+    w1,
+    w2,
+    input_gating,
+    topk,
+    use_fp8_w8a8=False,
+    w1_scale=None,
+    w2_scale=None,
+    a1_scale=None,
+    a2_scale=None,
+    block_shape=None,
+):
+    if block_shape is not None:
+        return fused_moe_vllm(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            renormalize=True,
+            inplace=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_shape,
+        )
+    else:
+        return fused_moe_vllm(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            renormalize=True,
+            inplace=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+        )
+
+
+def fused_moe_sglang_api(
+    x,
+    w1,
+    w2,
+    input_gating,
+    topk,
+    use_fp8_w8a8=False,
+    w1_scale=None,
+    w2_scale=None,
+    a1_scale=None,
+    a2_scale=None,
+    block_shape=None,
+):
+    return fused_moe_sglang(
+        x,
+        w1,
+        w2,
+        input_gating,
+        topk,
+        renormalize=True,
+        inplace=True,
+        use_fp8_w8a8=use_fp8_w8a8,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+    )
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=list(range(1, 513)),
+        line_arg="provider",
+        line_vals=[
+            "vllm_fused_moe_triton",
+            "sglang_fused_moe_triton",
+        ],
+        line_names=[
+            "vllm_fused_moe_triton",
+            "sglang_fused_moe_triton",
+        ],
+        styles=[
+            ("blue", "-"),
+            ("green", "-"),
+        ],
+        ylabel="Time (ms)",
+        plot_name="fused-moe-performance",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, model_config, use_fp8_w8a8=False):
+    print(f"benchmark {provider} with batch_size={batch_size}")
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+
+    num_tokens = batch_size
+    num_experts = model_config["num_experts"]
+    hidden_size = model_config["hidden_size"]
+    shard_intermediate_size = model_config["shard_intermediate_size"]
+    topk = model_config["topk"]
+    dtype = model_config["dtype"]
+    block_shape = model_config["block_shape"]
+
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    w1_scale = w2_scale = a1_scale = a2_scale = None
+
+    if use_fp8_w8a8:
+        init_dtype = dtype
+        w1 = torch.randn(
+            num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype
+        )
+        w2 = torch.randn(
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype
+        )
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+
+        if block_shape is None:
+            w1_scale = torch.randn(num_experts, dtype=torch.float32)
+            w2_scale = torch.randn(num_experts, dtype=torch.float32)
+            a1_scale = torch.randn(1, dtype=torch.float32)
+            a2_scale = torch.randn(1, dtype=torch.float32)
+        else:
+            block_n, block_k = block_shape[0], block_shape[1]
+            n_tiles_w1 = (shard_intermediate_size + block_n - 1) // block_n
+            n_tiles_w2 = (hidden_size + block_n - 1) // block_n
+            k_tiles_w1 = (hidden_size + block_k - 1) // block_k
+            k_tiles_w2 = (shard_intermediate_size // 2 + block_k - 1) // block_k
+            w1_scale = torch.rand(
+                (num_experts, n_tiles_w1, k_tiles_w1), dtype=torch.float32
+            )
+            w2_scale = torch.rand(
+                (num_experts, n_tiles_w2, k_tiles_w2), dtype=torch.float32
+            )
+    else:
+        w1 = torch.randn(num_experts, shard_intermediate_size, hidden_size, dtype=dtype)
+        w2 = torch.randn(
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=dtype
+        )
+
+    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+
+    # Warmup
+    api_func = (
+        fused_moe_vllm_api
+        if provider == "vllm_fused_moe_triton"
+        else fused_moe_sglang_api
+    )
+    for _ in range(10):
+        y = api_func(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_shape,
+        )
+    torch.cuda.synchronize()
+
+    quantiles = [0.5, 0.2, 0.8]
+    ms, min_ms, max_ms = triton.testing.do_bench(
+        lambda: api_func(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_shape,
+        )[0],
+        quantiles=quantiles,
+    )
+    return ms, min_ms, max_ms
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
+    )
+    parser.add_argument("--tp-size", type=int, default=2)
+    parser.add_argument("--use-fp8-w8a8", action="store_true")
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/benchmark_ops/vllm_sglang_fused_moe/",
+    )
+    args = parser.parse_args()
+
+    try:
+        if not torch.distributed.is_initialized():
+            torch.distributed.init_process_group(
+                backend="nccl" if torch.cuda.is_available() else "gloo",
+                init_method="tcp://127.0.0.1:23456",
+                world_size=1,
+                rank=0,
+            )
+
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            distributed_init_method="tcp://127.0.0.1:23456",
+            local_rank=0,
+            backend="nccl" if torch.cuda.is_available() else "gloo",
+        )
+
+        initialize_model_parallel(
+            tensor_model_parallel_size=1,
+            pipeline_model_parallel_size=1,
+        )
+
+        model_config = get_model_config(args.model, args.tp_size)
+        benchmark.run(
+            show_plots=True,
+            print_data=True,
+            save_path=args.save_path,
+            model_config=model_config,
+            use_fp8_w8a8=args.use_fp8_w8a8,
+        )
+    finally:
+        destroy_model_parallel()
+        destroy_distributed_environment()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
new file mode 100644
index 000000000..7b52f02a3
--- /dev/null
+++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -0,0 +1,599 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py
+import argparse
+import json
+import time
+from contextlib import nullcontext
+from datetime import datetime
+from typing import Any, Dict, List, Tuple, TypedDict
+
+import ray
+import torch
+import triton
+from ray.experimental.tqdm_ray import tqdm
+from transformers import AutoConfig
+
+from sglang.srt.layers.moe.fused_moe_triton import override_config
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe,
+    get_config_dtype_str,
+    get_config_file_name,
+    get_default_config,
+    get_moe_configs,
+)
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+
+
+def benchmark_config(
+    config: BenchmarkConfig,
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a8: bool,
+    use_int8_w8a16: bool,
+    block_shape: List[int] = None,
+    num_iters: int = 100,
+) -> float:
+    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    if use_int8_w8a16 or use_int8_w8a8:
+        w1 = torch.randint(
+            -127,
+            127,
+            (
+                num_experts,
+                shard_intermediate_size,
+                hidden_size,
+            ),
+            dtype=torch.int8,
+        )
+        w2 = torch.randint(
+            -127,
+            127,
+            (
+                num_experts,
+                hidden_size,
+                shard_intermediate_size // 2,
+            ),
+            dtype=torch.int8,
+        )
+    else:
+        w1 = torch.randn(
+            num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype
+        )
+        w2 = torch.randn(
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype
+        )
+    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+    if use_int8_w8a16:
+        w1_scale = torch.randn(
+            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
+        )
+        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
+    if use_fp8_w8a8 or use_int8_w8a8:
+        if use_int8_w8a8 and block_shape is None:
+            w1_scale = torch.randn(
+                num_experts, shard_intermediate_size, dtype=torch.float32
+            )
+            w2_scale = torch.randn(num_experts, hidden_size, dtype=torch.float32)
+        elif block_shape is None:
+            w1_scale = torch.randn(num_experts, dtype=torch.float32)
+            w2_scale = torch.randn(num_experts, dtype=torch.float32)
+            a1_scale = torch.randn(1, dtype=torch.float32)
+            a2_scale = torch.randn(1, dtype=torch.float32)
+        else:
+            block_n, block_k = block_shape[0], block_shape[1]
+            n_tiles_w1 = (shard_intermediate_size + block_n - 1) // block_n
+            n_tiles_w2 = (hidden_size + block_n - 1) // block_n
+            k_tiles_w1 = (hidden_size + block_k - 1) // block_k
+            k_tiles_w2 = (shard_intermediate_size // 2 + block_k - 1) // block_k
+            w1_scale = torch.rand(
+                (num_experts, n_tiles_w1, k_tiles_w1), dtype=torch.float32
+            )
+            w2_scale = torch.rand(
+                (num_experts, n_tiles_w2, k_tiles_w2), dtype=torch.float32
+            )
+
+    if use_fp8_w8a8:
+        w1 = w1.to(torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn)
+
+    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+    topk_config = TopKConfig(
+        top_k=topk,
+        renormalize=True,
+    )
+    topk_output = select_experts(x, input_gating, topk_config)
+
+    def prepare(i: int):
+        input_gating = gating_output[i]
+        new_topk_output = select_experts(x, input_gating, topk_config)
+        topk_output.topk_weights.copy_(new_topk_output.topk_weights)
+        topk_output.topk_ids.copy_(new_topk_output.topk_ids)
+        topk_output.router_logits.copy_(new_topk_output.router_logits)
+
+    def run():
+        moe_runner_config = MoeRunnerConfig(
+            inplace=True,
+        )
+
+        with override_config(config):
+            fused_moe(
+                x,
+                w1,
+                w2,
+                topk_output,
+                moe_runner_config=moe_runner_config,
+                use_fp8_w8a8=use_fp8_w8a8,
+                use_int8_w8a8=use_int8_w8a8,
+                use_int8_w8a16=use_int8_w8a16,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                block_shape=block_shape,
+            )
+
+    # JIT compilation & warmup
+    run()
+    torch.cuda.synchronize()
+
+    # Capture 10 invocations with CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for _ in range(10):
+            run()
+    torch.cuda.synchronize()
+
+    # Warmup
+    for _ in range(5):
+        graph.replay()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies: List[float] = []
+    for i in range(num_iters):
+        prepare(i)
+        torch.cuda.synchronize()
+
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    graph.reset()
+    return avg
+
+
+def get_rocm_configs_compute_bound() -> List[Dict[str, int]]:
+    configs: List[BenchmarkConfig] = []
+    waves_per_eu_range = 0
+    for num_stages in [2]:
+        for block_m in [32, 64, 128, 256]:
+            for block_k in [32, 64, 128, 256]:
+                for block_n in [16, 32, 64, 128, 256]:
+                    for num_warps in [1, 2, 4, 8]:
+                        for group_size in [1, 4, 8, 16, 32]:
+                            configs.append(
+                                {
+                                    "BLOCK_SIZE_M": block_m,
+                                    "BLOCK_SIZE_N": block_n,
+                                    "BLOCK_SIZE_K": block_k,
+                                    "GROUP_SIZE_M": group_size,
+                                    "num_warps": num_warps,
+                                    "num_stages": num_stages,
+                                    "waves_per_eu": waves_per_eu_range,
+                                }
+                            )
+    return configs
+
+
+def get_configs_compute_bound() -> List[Dict[str, int]]:
+    # Reduced search space for faster tuning.
+    # TODO(woosuk): Increase the search space and use a performance model to
+    # prune the search space.
+    configs: List[BenchmarkConfig] = []
+    if _is_hip:
+        configs = get_rocm_configs_compute_bound()
+    else:
+        for num_stages in [2, 3, 4, 5]:
+            for block_m in [16, 32, 64, 128, 256]:
+                for block_k in [64, 128, 256]:
+                    for block_n in [32, 64, 128, 256]:
+                        for num_warps in [4, 8]:
+                            for group_size in [1, 16, 32, 64]:
+                                configs.append(
+                                    {
+                                        "BLOCK_SIZE_M": block_m,
+                                        "BLOCK_SIZE_N": block_n,
+                                        "BLOCK_SIZE_K": block_k,
+                                        "GROUP_SIZE_M": group_size,
+                                        "num_warps": num_warps,
+                                        "num_stages": num_stages,
+                                    }
+                                )
+    return configs
+
+
+@ray.remote(num_gpus=1)
+class BenchmarkWorker:
+
+    def __init__(self, seed: int) -> None:
+        torch.set_default_device("cuda")
+        torch.cuda.manual_seed_all(0)
+        self.seed = seed
+        # Get the device ID to allocate tensors and kernels
+        # on the respective GPU.
+        self.device_id = int(ray.get_gpu_ids()[0])
+
+    def benchmark(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a8: bool,
+        use_int8_w8a16: bool,
+        block_shape: List[int],
+    ) -> Tuple[Dict[str, int], float]:
+        torch.cuda.manual_seed_all(0)
+        dtype_str = get_config_dtype_str(
+            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+        )
+        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+        # is the intermediate size after silu_and_mul.
+        block_n = block_shape[0] if block_shape else 0
+        block_k = block_shape[1] if block_shape else 0
+        op_config = get_moe_configs(
+            num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
+        )
+        if op_config is None:
+            config = get_default_config(
+                num_tokens,
+                num_experts,
+                shard_intermediate_size,
+                hidden_size,
+                topk,
+                dtype_str,
+                False,
+                block_shape,
+            )
+        else:
+            config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
+        with torch.cuda.device(self.device_id) if is_hip() else nullcontext():
+            kernel_time = benchmark_config(
+                config,
+                num_tokens,
+                num_experts,
+                shard_intermediate_size,
+                hidden_size,
+                topk,
+                dtype,
+                use_fp8_w8a8,
+                use_int8_w8a8,
+                use_int8_w8a16,
+                block_shape,
+            )
+        return config, kernel_time
+
+    def tune(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a8: bool,
+        use_int8_w8a16: bool,
+        block_shape: List[int],
+        search_space: List[Dict[str, int]],
+    ) -> Dict[str, int]:
+        best_config = None
+        best_time = float("inf")
+        with torch.cuda.device(self.device_id) if is_hip() else nullcontext():
+            for config in tqdm(search_space):
+                try:
+                    kernel_time = benchmark_config(
+                        config,
+                        num_tokens,
+                        num_experts,
+                        shard_intermediate_size,
+                        hidden_size,
+                        topk,
+                        dtype,
+                        use_fp8_w8a8,
+                        use_int8_w8a8,
+                        use_int8_w8a16,
+                        block_shape,
+                        num_iters=10,
+                    )
+                except (triton.runtime.autotuner.OutOfResources, RuntimeError):
+                    # Some configurations may be invalid and fail to compile.
+                    continue
+
+                if kernel_time < best_time:
+                    best_time = kernel_time
+                    best_config = config
+        now = datetime.now()
+        print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
+        return best_config
+
+
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
+    return {
+        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
+        "num_warps": config["num_warps"],
+        "num_stages": config["num_stages"],
+        **(
+            {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {}
+        ),
+    }
+
+
+def save_configs(
+    configs: Dict[int, BenchmarkConfig],
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a8: bool,
+    use_int8_w8a16: bool,
+    block_shape: List[int],
+) -> None:
+    dtype_str = get_config_dtype_str(
+        dtype,
+        use_int8_w8a16=use_int8_w8a16,
+        use_fp8_w8a8=use_fp8_w8a8,
+        use_int8_w8a8=use_int8_w8a8,
+    )
+
+    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+    # is the intermediate size after silu_and_mul.
+    filename = get_config_file_name(
+        num_experts,
+        shard_intermediate_size // 2,
+        dtype_str,
+        block_shape,
+    )
+
+    print(f"Writing best config to {filename}...")
+    with open(filename, "w") as f:
+        json.dump(configs, f, indent=4)
+        f.write("\n")
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
+    if config.architectures[0] == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+        intermediate_size = config.ffn_config.ffn_hidden_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]:
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
+        E = (
+            config.n_routed_experts + (0 if args.disable_shared_experts_fusion else 1)
+            if config.architectures[0] in ["DeepseekV3ForCausalLM"]
+            else config.n_routed_experts
+        )
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        E = config.text_config.num_local_experts + (
+            0 if args.disable_shared_experts_fusion else 1
+        )
+        topk = config.text_config.num_experts_per_tok
+        intermediate_size = config.text_config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in [
+        "Grok1ForCausalLM",
+        "Grok1ImgGen",
+        "Grok1AForCausalLM",
+    ]:
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in ["Glm4MoeForCausalLM"]:
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    else:
+        # Default: Mixtral
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+
+    hidden_size = getattr(config, "hidden_size", None) or config.text_config.hidden_size
+    dtype = config.torch_dtype
+    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
+    use_int8_w8a8 = args.dtype == "int8_w8a8"
+    use_int8_w8a16 = args.dtype == "int8_w8a16"
+    block_shape = None
+    if (
+        hasattr(config, "quantization_config")
+        and "weight_block_size" in config.quantization_config
+    ):
+        block_shape = config.quantization_config["weight_block_size"]
+        assert len(block_shape) == 2
+
+    if args.batch_size is None:
+        batch_sizes = [
+            1,
+            2,
+            4,
+            8,
+            16,
+            24,
+            32,
+            48,
+            64,
+            96,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]
+    else:
+        batch_sizes = [args.batch_size]
+
+    ray.init()
+    num_gpus = int(ray.available_resources()["GPU"])
+    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
+
+    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
+        outputs = []
+        worker_idx = 0
+        for input_args in inputs:
+            worker = workers[worker_idx]
+            worker_method = getattr(worker, method)
+            output = worker_method.remote(*input_args)
+            outputs.append(output)
+            worker_idx = (worker_idx + 1) % num_gpus
+        return ray.get(outputs)
+
+    if args.tune:
+        search_space = get_configs_compute_bound()
+        if block_shape is not None:
+            block_n, block_k = block_shape[0], block_shape[1]
+            search_space = [
+                config
+                for config in search_space
+                if block_k % config["BLOCK_SIZE_K"] == 0
+            ]
+        print(f"Start tuning over {len(search_space)} configurations...")
+
+        start = time.perf_counter()
+        configs = _distribute(
+            "tune",
+            [
+                (
+                    batch_size,
+                    E,
+                    shard_intermediate_size,
+                    hidden_size,
+                    topk,
+                    dtype,
+                    use_fp8_w8a8,
+                    use_int8_w8a8,
+                    use_int8_w8a16,
+                    block_shape,
+                    search_space,
+                )
+                for batch_size in batch_sizes
+            ],
+        )
+        best_configs = {
+            M: sort_config(config) for M, config in zip(batch_sizes, configs)
+        }
+        save_configs(
+            best_configs,
+            E,
+            shard_intermediate_size,
+            hidden_size,
+            topk,
+            dtype,
+            use_fp8_w8a8,
+            use_int8_w8a8,
+            use_int8_w8a16,
+            block_shape,
+        )
+        end = time.perf_counter()
+        print(f"Tuning took {end - start:.2f} seconds")
+    else:
+        outputs = _distribute(
+            "benchmark",
+            [
+                (
+                    batch_size,
+                    E,
+                    shard_intermediate_size,
+                    hidden_size,
+                    topk,
+                    dtype,
+                    use_fp8_w8a8,
+                    use_int8_w8a8,
+                    use_int8_w8a16,
+                    block_shape,
+                )
+                for batch_size in batch_sizes
+            ],
+        )
+
+        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
+            print(f"Batch size: {batch_size}, config: {config}")
+            print(f"Kernel time: {kernel_time:.2f} us")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
+    )
+    parser.add_argument("--tp-size", "--tp", type=int, default=2)
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["auto", "fp8_w8a8", "int8_w8a16", "int8_w8a8"],
+        default="auto",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--tune", action="store_true")
+    parser.add_argument("--disable-shared-experts-fusion", action="store_true")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_decode.py b/benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_decode.py
new file mode 100644
index 000000000..78d81499e
--- /dev/null
+++ b/benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_decode.py
@@ -0,0 +1,576 @@
+import itertools
+import math
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+from einops import rearrange
+from sgl_kernel import lightning_attention_decode as sgl_lightning_attention_decode
+
+
+@triton.jit
+def _decode_kernel(
+    Q,
+    K,
+    V,
+    KV,
+    Out,
+    S,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n: tl.constexpr,
+    d: tl.constexpr,
+    d_original: tl.constexpr,
+    e: tl.constexpr,
+    e_original: tl.constexpr,
+):
+    off_bh = tl.program_id(0)
+    off_h = off_bh % h
+
+    qk_offset = off_bh * n * d
+    v_offset = off_bh * n * e
+    o_offset = off_bh * n * e
+    kv_offset = off_bh * d * e
+
+    s = tl.load(S + off_h)
+    ratio = tl.exp(-s)
+
+    d_idx = tl.arange(0, d)
+    e_idx = tl.arange(0, e)
+
+    # Create masks for original dimensions
+    d_mask = d_idx < d_original
+    e_mask = e_idx < e_original
+
+    # Load with masking
+    q = tl.load(Q + qk_offset + d_idx, mask=d_mask, other=0.0)
+    k = tl.load(K + qk_offset + d_idx, mask=d_mask, other=0.0)
+    v = tl.load(V + v_offset + e_idx, mask=e_mask, other=0.0)
+
+    # Load KV with 2D masking
+    kv = tl.load(
+        KV + kv_offset + d_idx[:, None] * e + e_idx[None, :],
+        mask=(d_mask[:, None] & e_mask[None, :]),
+        other=0.0,
+    )
+
+    # Compute outer product using element-wise operations
+    k_v_prod = k[:, None] * v[None, :]
+    kv = ratio * kv + k_v_prod
+
+    # Store KV with 2D masking
+    tl.store(
+        KV + kv_offset + d_idx[:, None] * e + e_idx[None, :],
+        kv.to(KV.dtype.element_ty),
+        mask=(d_mask[:, None] & e_mask[None, :]),
+    )
+
+    # Compute matrix-vector multiplication using element-wise operations and reduction
+    o = tl.sum(q[:, None] * kv, axis=0)
+
+    # Store output with masking
+    tl.store(Out + o_offset + e_idx, o.to(Out.dtype.element_ty), mask=e_mask)
+
+
+def lightning_attn_decode(q, k, v, kv, s):
+    """Triton implementation of Lightning Attention decode operation"""
+    b, h, n, d = q.shape
+    e = v.shape[-1]
+    assert n == 1, "Sequence length must be 1 in decode mode"
+
+    # Get padded dimensions (power of 2)
+    d_padded = next_power_of_2(d)
+    e_padded = next_power_of_2(e)
+
+    # Create output tensor (padded)
+    o_padded = torch.empty(b, h, n, e_padded, dtype=v.dtype, device=v.device)
+
+    # Create padded tensors without actually padding the data
+    q_padded = torch.empty(b, h, n, d_padded, dtype=q.dtype, device=q.device)
+    k_padded = torch.empty(b, h, n, d_padded, dtype=k.dtype, device=k.device)
+    v_padded = torch.empty(b, h, n, e_padded, dtype=v.dtype, device=v.device)
+    kv_padded = torch.empty(
+        b, h, d_padded, e_padded, dtype=torch.float32, device=kv.device
+    )
+
+    # Copy data to padded tensors
+    q_padded[..., :d] = q
+    k_padded[..., :d] = k
+    v_padded[..., :e] = v
+    kv_padded[..., :d, :e] = kv
+
+    # Launch kernel
+    grid = (b * h, 1)
+    _decode_kernel[grid](
+        q_padded,
+        k_padded,
+        v_padded,
+        kv_padded,
+        o_padded,
+        s,
+        b=b,
+        h=h,
+        n=n,
+        d=d_padded,
+        d_original=d,
+        e=e_padded,
+        e_original=e,
+    )
+
+    # Get unpadded outputs
+    o = o_padded[..., :e]
+    kv_out = kv_padded[..., :d, :e]
+
+    return o, kv_out
+
+
+def next_power_of_2(n):
+    return 2 ** (int(math.ceil(math.log(n, 2))))
+
+
+class MiniMaxText01LightningAttention(nn.Module):
+    def __init__(self, config=None, layer_idx: Optional[int] = None, **kwargs):
+        super().__init__()
+        if config is None:
+            config = type("Config", (), kwargs)
+
+        bias = False
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+
+        self.out_proj = nn.Linear(
+            self.head_dim * self.num_heads, self.hidden_size, bias=bias
+        )
+        self.act = get_activation_fn(config.hidden_act)
+        self.norm = MiniMaxText01RMSNorm(self.head_dim * self.num_heads)
+
+        self.qkv_proj = nn.Linear(
+            self.hidden_size, 3 * self.head_dim * self.num_heads, bias=bias
+        )
+        self.output_gate = nn.Linear(
+            self.hidden_size, self.head_dim * self.num_heads, bias=bias
+        )
+
+        # for inference only
+        self.offset = 0
+        self.layer_idx = layer_idx
+
+    def forward(
+        self,
+        hidden_states,
+        attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
+        output_attentions: bool = False,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+        slope_rate: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if (not self.training) and (not do_eval):
+            return self.inference(
+                hidden_states,
+                attn_mask,
+                output_attentions,
+                past_key_value,
+                use_cache,
+                slope_rate,
+            )
+
+    def inference(
+        self,
+        x,
+        attn_mask: Optional[torch.Tensor] = None,  # (b, n)
+        output_attentions: bool = False,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
+    ):
+        # x: b n d
+        b, n, d = x.shape
+        # linear map
+        qkv = self.act(self.qkv_proj(x))
+        new_shape = qkv.size()[:-1] + (self.num_heads, -1)
+        qkv = qkv.view(*new_shape)
+        q, k, v = torch.split(qkv, [self.head_dim] * 3, dim=3)
+        q = q.transpose(1, 2)  # [b, n, h, d] -> [b, h, n, d]
+        k = k.transpose(1, 2)  # [b, n, h, d] -> [b, h, n, d]
+        v = v.transpose(1, 2)  # [b, n, h, d] -> [b, h, n, e]
+
+        self.offset += 1
+        ratio = torch.exp(-slope_rate)  # [h, 1, 1]
+
+        # decode mode
+        kv = past_key_value  # [b, h, d, e]
+        output = []
+        for i in range(n):
+            # kv: [b, h, d, e]
+            # ratio: [h, 1, 1]
+            # k: [b, h, n, d]
+            # v: [b, h, n, e]
+            # k[:, :, i : i + 1]: [b, h, 1, d]
+            # v[:, :, i : i + 1]: [b, h, 1, e]
+            # ratio * kv: [b, h, d, e]
+            # torch.einsum(
+            #     "... n d, ... n e -> ... d e",
+            #     k[:, :, i : i + 1],
+            #     v[:, :, i : i + 1],
+            # )
+            # [b, h, d, e] + [b, h, d, e] -> [b, h, d, e]
+            kv = ratio * kv + torch.einsum(
+                "... n d, ... n e -> ... d e",
+                k[:, :, i : i + 1],
+                v[:, :, i : i + 1],
+            )
+            # q[:, :, i : i + 1]: [b, h, 1, d]
+            # kv.to(q.dtype): [b, h, d, e]
+            # torch.einsum(
+            #     "... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv.to(q.dtype)
+            # )
+            # [b, h, 1, d] * [b, h, d, e] -> [b, h, 1, e]
+            qkv = torch.einsum(
+                "... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv.to(q.dtype)
+            )
+            output.append(qkv)
+        output = torch.cat(output, dim=-2)
+
+        # reshape
+        output = rearrange(output, "b h n d -> b n (h d)")
+        # normalize
+        output = self.norm(output)
+        # gate
+        output = F.sigmoid(self.output_gate(x)) * output
+        # outproj
+        output = self.out_proj(output)
+
+        attn_weights = None
+
+        return output, attn_weights, kv
+
+
+def get_activation_fn(activation):
+    if activation == "gelu":
+        return F.gelu
+    elif activation == "relu":
+        return F.relu
+    elif activation == "elu":
+        return F.elu
+    elif activation == "sigmoid":
+        return F.sigmoid
+    elif activation == "exp":
+
+        def f(x):
+            with torch.no_grad():
+                x_max = torch.max(x, dim=-1, keepdims=True).values
+            y = torch.exp(x - x_max)
+            return y
+
+        return f
+    elif activation == "leak":
+        return F.leaky_relu
+    elif activation == "1+elu":
+
+        def f(x):
+            return 1 + F.elu(x)
+
+        return f
+    elif activation == "2+elu":
+
+        def f(x):
+            return 2 + F.elu(x)
+
+        return f
+    elif activation == "silu" or activation == "swish":
+        return F.silu
+    elif activation == "sine":
+        return torch.sin
+    else:
+        return lambda x: x
+
+
+class MiniMaxText01RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MiniMaxText01RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+def test_lightning_attention_implementations(model_params):
+    torch.manual_seed(42)
+
+    batch_size = 64
+    seq_len = 1
+    dtype = torch.bfloat16
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    hidden_states = torch.randn(
+        batch_size, seq_len, model_params["hidden_size"], dtype=dtype, device=device
+    )
+
+    attention_mask = torch.ones(batch_size, seq_len, dtype=dtype, device=device)
+
+    slope_rate = _build_slope_tensor(model_params["num_attention_heads"]).to(device)
+
+    model_attn = MiniMaxText01LightningAttention(**model_params).to(dtype).to(device)
+    model_attn.eval()
+
+    d = model_params["head_dim"]
+    past_kv = torch.randn(
+        batch_size,
+        model_params["num_attention_heads"],
+        d,
+        d,
+        device=device,
+    )
+    with torch.no_grad():
+        model_output, _, new_kv = model_attn.inference(
+            hidden_states,
+            attn_mask=attention_mask,
+            slope_rate=slope_rate,
+            past_key_value=past_kv,
+        )
+
+    qkv = model_attn.act(model_attn.qkv_proj(hidden_states))
+    new_shape = qkv.size()[:-1] + (model_attn.num_heads, -1)
+    qkv = qkv.view(*new_shape)
+    q, k, v = torch.split(qkv, [model_attn.head_dim] * 3, dim=-1)
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+    q = q.contiguous()
+    k = k.contiguous()
+    v = v.contiguous()
+    past_kv = past_kv.contiguous()
+    slope_rate = slope_rate.contiguous()
+
+    # Test Triton implementation
+    triton_output, triton_new_kv = lightning_attn_decode(q, k, v, past_kv, slope_rate)
+    triton_output = triton_output.transpose(1, 2).contiguous()
+    triton_output = triton_output.view(batch_size, seq_len, -1)
+    triton_output = model_attn.norm(triton_output)
+    triton_output = torch.sigmoid(model_attn.output_gate(hidden_states)) * triton_output
+    triton_output = model_attn.out_proj(triton_output)
+
+    # Test SGL implementation
+    sgl_output = torch.empty_like(v)
+    sgl_new_kv = torch.empty_like(past_kv)
+    sgl_lightning_attention_decode(q, k, v, past_kv, slope_rate, sgl_output, sgl_new_kv)
+
+    sgl_output = sgl_output.transpose(1, 2).contiguous()
+    sgl_output = sgl_output.view(batch_size, seq_len, -1)
+    sgl_output = model_attn.norm(sgl_output)
+    sgl_output = torch.sigmoid(model_attn.output_gate(hidden_states)) * sgl_output
+    sgl_output = model_attn.out_proj(sgl_output)
+
+    # Verify Triton implementation results
+    torch.testing.assert_close(
+        model_output,
+        triton_output,
+        rtol=1e-3,
+        atol=1e-2,
+        msg="Triton lightning attention implementation produces different output results",
+    )
+    torch.testing.assert_close(
+        new_kv,
+        triton_new_kv,
+        rtol=1e-3,
+        atol=1e-2,
+        msg="Triton lightning attention implementation produces different kv results",
+    )
+
+    # Verify SGL implementation results
+    torch.testing.assert_close(
+        model_output,
+        sgl_output,
+        rtol=1e-3,
+        atol=1e-2,
+        msg="SGL lightning attention implementation produces different output results",
+    )
+    torch.testing.assert_close(
+        new_kv,
+        sgl_new_kv,
+        rtol=1e-3,
+        atol=1e-2,
+        msg="SGL lightning attention implementation produces different kv results",
+    )
+
+    print("✅ All implementations match")
+
+
+def _build_slope_tensor(n_attention_heads: int):
+    def get_slopes(n):
+        def get_slopes_power_of_2(n):
+            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+            ratio = start
+            return [start * ratio**i for i in range(n)]
+
+        if math.log2(n).is_integer():
+            return get_slopes_power_of_2(n)
+        else:
+            closest_power_of_2 = 2 ** math.floor(math.log2(n))
+            return (
+                get_slopes_power_of_2(closest_power_of_2)
+                + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+            )
+
+    slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
+        n_attention_heads, 1, 1
+    )
+    return slopes
+
+
+def get_benchmark():
+    batch_size_range = [i for i in range(1, 33)]  # max 32
+    seq_length_range = [1]  # decode mode sequence length is fixed to 1
+    configs = list(itertools.product(batch_size_range, seq_length_range))
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["Original", "Triton", "SGL"],
+            line_names=[
+                "Original PyTorch Implementation",
+                "Triton Implementation",
+                "SGL Implementation",
+            ],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name="lightning-attention-decode-performance",
+            args={},
+        )
+    )
+    def benchmark(batch_size, seq_len, provider):
+        dtype = torch.bfloat16
+        device = torch.device("cuda")
+
+        params = {
+            "hidden_size": 6144,
+            "num_attention_heads": 64,
+            "head_dim": 96,
+            "hidden_act": "gelu",
+        }
+
+        hidden_states = torch.randn(
+            batch_size, seq_len, params["hidden_size"], dtype=dtype, device=device
+        )
+
+        attention_mask = torch.ones(batch_size, seq_len, dtype=dtype, device=device)
+
+        slope_rate = _build_slope_tensor(params["num_attention_heads"]).to(device)
+        model_attn = MiniMaxText01LightningAttention(**params).to(dtype).to(device)
+        model_attn.eval()
+
+        d = params["head_dim"]
+        past_kv = torch.randn(
+            batch_size,
+            params["num_attention_heads"],
+            d,
+            d,
+            device=device,
+        )
+
+        quantiles = [0.5, 0.2, 0.8]
+        if provider == "Original":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: model_attn.inference(
+                    hidden_states,
+                    attn_mask=attention_mask,
+                    slope_rate=slope_rate,
+                    past_key_value=past_kv,
+                ),
+                quantiles=quantiles,
+            )
+        elif provider == "Triton":
+
+            def run_triton():
+                qkv = model_attn.act(model_attn.qkv_proj(hidden_states))
+                new_shape = qkv.size()[:-1] + (model_attn.num_heads, -1)
+                qkv = qkv.view(*new_shape)
+                q, k, v = torch.split(qkv, [model_attn.head_dim] * 3, dim=-1)
+                q = q.transpose(1, 2)
+                k = k.transpose(1, 2)
+                v = v.transpose(1, 2)
+
+                output, new_kv = lightning_attn_decode(q, k, v, past_kv, slope_rate)
+                output = output.transpose(1, 2).contiguous()
+                output = output.view(batch_size, seq_len, -1)
+                output = model_attn.norm(output)
+                output = torch.sigmoid(model_attn.output_gate(hidden_states)) * output
+                return model_attn.out_proj(output)
+
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                run_triton,
+                quantiles=quantiles,
+            )
+        else:  # SGL
+
+            def run_sgl():
+                qkv = model_attn.act(model_attn.qkv_proj(hidden_states))
+                new_shape = qkv.size()[:-1] + (model_attn.num_heads, -1)
+                qkv = qkv.view(*new_shape)
+                q, k, v = torch.split(qkv, [model_attn.head_dim] * 3, dim=-1)
+                q = q.transpose(1, 2).contiguous()
+                k = k.transpose(1, 2).contiguous()
+                v = v.transpose(1, 2).contiguous()
+
+                output = torch.empty_like(v)
+                new_kv = torch.empty_like(past_kv)
+                sgl_lightning_attention_decode(
+                    q, k, v, past_kv, slope_rate, output, new_kv
+                )
+
+                output = output.transpose(1, 2).contiguous()
+                output = output.view(batch_size, seq_len, -1)
+                output = model_attn.norm(output)
+                output = torch.sigmoid(model_attn.output_gate(hidden_states)) * output
+                return model_attn.out_proj(output)
+
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                run_sgl,
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/lightning_attention_decode/",
+        help="Path to save lightning attention decode benchmark results",
+    )
+    args = parser.parse_args()
+
+    params = {
+        "hidden_size": 6144,
+        "num_attention_heads": 64,
+        "head_dim": 96,
+        "hidden_act": "silu",
+    }
+    # Run correctness test first
+    # Adapted from https://huggingface.co/MiniMaxAI/MiniMax-Text-01/blob/main/config.json
+    test_lightning_attention_implementations(params)
+
+    # Run performance benchmark
+    benchmark = get_benchmark()
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_prefill.py b/benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_prefill.py
new file mode 100644
index 000000000..3bf9054bd
--- /dev/null
+++ b/benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_prefill.py
@@ -0,0 +1,603 @@
+import itertools
+import math
+import os
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+from einops import rearrange
+
+
+# Adapted from https://github.com/OpenNLPLab/lightning-attention/blob/main/lightning_attn/ops/triton/lightning_attn2.py
+@triton.jit
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    Out,
+    S,  # log lambda
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n: tl.constexpr,
+    d: tl.constexpr,
+    e: tl.constexpr,
+    BLOCK: tl.constexpr,
+    NUM_BLOCK: tl.constexpr,
+    BLOCK_MODEL: tl.constexpr,
+):
+    ##### get offset
+    off_bh = tl.program_id(0)
+    off_h = off_bh % h
+    off_e = tl.program_id(1)
+    qk_offset = off_bh * n * d
+    v_offset = off_bh * n * e
+    o_offset = off_bh * n * e
+    # channel offset
+    e_offset = off_e * BLOCK_MODEL
+
+    ##### get block ptr
+    Q_block_ptr = Q + qk_offset + tl.arange(0, d)[None, :]
+    K_trans_block_ptr = K + qk_offset + tl.arange(0, d)[:, None]
+    V_block_ptr = V + v_offset + e_offset + tl.arange(0, BLOCK_MODEL)[None, :]
+    O_block_ptr = Out + o_offset + e_offset + tl.arange(0, BLOCK_MODEL)[None, :]
+    S_block_ptr = S + off_h
+
+    ##### init diag decay(Lambda); q, k decay; kv
+    s = tl.load(S_block_ptr)
+    # q, k decay
+    off_block = tl.arange(
+        0, BLOCK
+    )  # Not bug, this is a bit different from algorithm 1, but is mathematically equivalent
+    q_decay = tl.exp(-s.to(tl.float32) * off_block[:, None])
+    k_trans_decay = tl.exp(-s.to(tl.float32) * (BLOCK - off_block[None, :]))
+    block_decay = tl.exp(-s.to(tl.float32) * BLOCK)
+    # diag decay
+    index = off_block[:, None] - off_block[None, :]
+    s_index = s * index
+    s_index = tl.where(index >= 0, -s_index, float("-inf"))
+    diag_decay = tl.exp(s_index)
+    kv = tl.zeros([d, BLOCK_MODEL], dtype=tl.float32)
+
+    ##### compute
+    for i in range(NUM_BLOCK):
+        # load
+        q = tl.load(
+            Q_block_ptr + off_block[:, None] * d, mask=off_block[:, None] < n, other=0.0
+        ).to(tl.float32)
+        k_trans = tl.load(
+            K_trans_block_ptr + off_block[None, :] * d,
+            mask=off_block[None, :] < n,
+            other=0.0,
+        ).to(tl.float32)
+        v = tl.load(
+            V_block_ptr + off_block[:, None] * e, mask=off_block[:, None] < n, other=0.0
+        ).to(tl.float32)
+
+        # compute
+        qk = tl.dot(q, k_trans) * diag_decay
+        o_intra = tl.dot(qk, v)
+        o_inter = tl.dot(q, kv) * q_decay
+        o = o_intra + o_inter
+
+        # save and update
+        tl.store(
+            O_block_ptr + off_block[:, None] * e,
+            o.to(O_block_ptr.dtype.element_ty),
+            mask=off_block[:, None] < n,
+        )
+        kv = block_decay * kv + tl.dot(k_trans * k_trans_decay, v)
+        off_block += BLOCK
+
+
+def lightning_attn2(q, k, v, s):
+    q = q.contiguous()
+    k = k.contiguous()
+    v = v.contiguous()
+    s = s.contiguous()
+
+    b, h, n, d = q.shape
+    e = v.shape[-1]
+
+    # Pad d to next power of 2
+    d_padded = next_power_of_2(d)
+    if d_padded != d:
+        q_padded = F.pad(q, (0, d_padded - d))
+        k_padded = F.pad(k, (0, d_padded - d))
+    else:
+        q_padded = q
+        k_padded = k
+
+    # Pad e to next power of 2
+    e_padded = next_power_of_2(e)
+    if e_padded != e:
+        v_padded = F.pad(v, (0, e_padded - e))
+    else:
+        v_padded = v
+
+    o_padded = torch.empty((b, h, n, e_padded), dtype=q.dtype, device=q.device)
+
+    BLOCK = 64
+    NUM_BLOCK = triton.cdiv(q.shape[2], BLOCK)
+    # parallel over channel
+    BLOCK_MODEL = min(triton.next_power_of_2(e_padded), 32)
+    grid = (b * h, triton.cdiv(e_padded, BLOCK_MODEL))
+
+    _fwd_kernel[grid](
+        q_padded,
+        k_padded,
+        v_padded,
+        o_padded,
+        s,
+        b,
+        h,
+        n,
+        d_padded,
+        e_padded,
+        BLOCK=BLOCK,
+        NUM_BLOCK=NUM_BLOCK,
+        BLOCK_MODEL=BLOCK_MODEL,
+    )
+
+    # Remove padding from output
+    if e_padded != e:
+        o = o_padded[..., :e]
+    else:
+        o = o_padded
+
+    return o
+
+
+def is_support(dim):
+    return 16 % dim
+
+
+def next_power_of_2(n):
+    return 2 ** (int(math.ceil(math.log(n, 2))))
+
+
+def lightning_attn_func(q, k, v, s):
+    b, h, n, d = q.shape
+    e = v.shape[-1]
+    assert is_support(d) and is_support(e)
+
+    # pad v's feature dim to power of 2
+    e_pad = next_power_of_2(e)
+    need_pad = e_pad != e
+    if need_pad:
+        v = F.pad(v, (0, e_pad - e))
+
+    if d > 128:
+        # split over head
+        if 64 % d:
+            m = 64
+        elif 32 % d:
+            m = 32
+        elif 16 % d:
+            m = 16
+        arr = [m * i for i in range(d // m + 1)]
+        if arr[-1] != d:
+            arr.append(d)
+        n = len(arr)
+        o = 0
+        for i in range(n - 1):
+            start = arr[i]
+            end = arr[i + 1]
+            q1 = q[..., start:end]
+            k1 = k[..., start:end]
+            o += lightning_attn2(q1, k1, v, s)
+    else:
+        o = lightning_attn2(q, k, v, s)
+
+    if need_pad:
+        o = o[:, :, :, :e]
+
+    return o
+
+
+debug = eval(os.environ.get("debug", default="False"))
+
+BLOCK = 256
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MiniMaxText01
+class MiniMaxText01RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MiniMaxText01RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+# Copied from https://huggingface.co/MiniMaxAI/MiniMax-Text-01/blob/main/modeling_minimax_text_01.py
+def get_activation_fn(activation):
+    if debug:
+        logger.info(f"activation: {activation}")
+    if activation == "gelu":
+        return F.gelu
+    elif activation == "relu":
+        return F.relu
+    elif activation == "elu":
+        return F.elu
+    elif activation == "sigmoid":
+        return F.sigmoid
+    elif activation == "exp":
+
+        def f(x):
+            with torch.no_grad():
+                x_max = torch.max(x, dim=-1, keepdims=True).values
+            y = torch.exp(x - x_max)
+
+            return y
+
+        return f
+    elif activation == "leak":
+        return F.leaky_relu
+    elif activation == "1+elu":
+
+        def f(x):
+            return 1 + F.elu(x)
+
+        return f
+    elif activation == "2+elu":
+
+        def f(x):
+            return 2 + F.elu(x)
+
+        return f
+    elif activation == "silu" or activation == "swish":
+        return F.silu
+    elif activation == "sine":
+        return torch.sin
+    else:
+        logger.info(f"activation: does not support {activation}, use Identity!!!")
+        return lambda x: x
+
+
+# Copied from https://huggingface.co/MiniMaxAI/MiniMax-Text-01/blob/main/modeling_minimax_text_01.py
+class MiniMaxText01LightningAttention(nn.Module):
+    def __init__(self, config=None, layer_idx: Optional[int] = None, **kwargs):
+        super().__init__()
+        if config is None:
+            config = type("Config", (), kwargs)
+
+        bias = False
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+
+        self.out_proj = nn.Linear(
+            self.head_dim * self.num_heads, self.hidden_size, bias=bias
+        )
+        self.act = get_activation_fn(config.hidden_act)
+        self.norm = MiniMaxText01RMSNorm(self.head_dim * self.num_heads)
+
+        self.qkv_proj = nn.Linear(
+            self.hidden_size, 3 * self.head_dim * self.num_heads, bias=bias
+        )
+        self.output_gate = nn.Linear(
+            self.hidden_size, self.head_dim * self.num_heads, bias=bias
+        )
+
+        # for inference only
+        self.offset = 0
+        self.layer_idx = layer_idx
+
+    def forward(
+        self,
+        hidden_states,
+        attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
+        output_attentions: bool = False,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+        slope_rate: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if (not self.training) and (not do_eval):
+            return self.inference(
+                hidden_states,
+                attn_mask,
+                output_attentions,
+                past_key_value,
+                use_cache,
+                slope_rate,
+            )
+
+    def inference(
+        self,
+        x,
+        attn_mask: Optional[torch.Tensor] = None,  # (b, n)
+        output_attentions: bool = False,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
+    ):
+        # x: b n d
+        b, n, d = x.shape
+        # linear map
+        qkv = self.act(self.qkv_proj(x))
+        new_shape = qkv.size()[:-1] + (self.num_heads, -1)
+        qkv = qkv.view(*new_shape)
+        q, k, v = torch.split(qkv, [self.head_dim] * 3, dim=3)
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        if past_key_value is None:
+            self.offset = q.shape[-2]
+        else:
+            self.offset += 1
+
+        # for align with metaseq
+        ratio = torch.exp(-slope_rate)
+
+        # only use for the first time
+        if past_key_value is None:
+            slope_rate = slope_rate.to(torch.float32)
+            if attn_mask is not None:
+                v = v.masked_fill(
+                    (1 - attn_mask).unsqueeze(1).unsqueeze(-1).to(torch.bool), 0
+                )
+            NUM_BLOCK = (n + BLOCK - 1) // BLOCK
+            b, h, n, d = q.shape
+            e = v.shape[-1]
+            # other
+            array = torch.arange(BLOCK).to(q) + 1
+            q_decay = torch.exp(-slope_rate * array.reshape(-1, 1))
+            k_decay = torch.exp(-slope_rate * (BLOCK - array.reshape(-1, 1)))
+            index = array[:, None] - array[None, :]
+            s_index = (
+                slope_rate
+                * index[
+                    None,
+                    None,
+                ]
+            )
+            s_index = torch.where(index >= 0, -s_index, float("-inf"))
+            diag_decay = torch.exp(s_index)
+
+            kv = torch.zeros(b, h, d, e).to(torch.float32).to(q.device)
+            output = torch.empty((b, h, n, e), dtype=q.dtype, device=q.device)
+            for i in range(NUM_BLOCK):
+                si = i * BLOCK
+                ei = min(si + BLOCK, n)
+                m = ei - si
+                qi = q[:, :, si:ei].contiguous()
+                ki = k[:, :, si:ei].contiguous()
+                vi = v[:, :, si:ei].contiguous()
+                qkv_none_diag = torch.matmul(qi * q_decay[:, :m], kv).to(torch.float32)
+
+                # diag
+                qk = (
+                    torch.matmul(qi, ki.transpose(-1, -2)).to(torch.float32)
+                    * diag_decay[:, :, :m, :m]
+                )
+                qkv_diag = torch.matmul(qk, vi.to(torch.float32))
+                block_decay = torch.exp(-slope_rate * m)
+                output[:, :, si:ei] = qkv_none_diag + qkv_diag
+                kv = block_decay * kv + torch.matmul(
+                    (ki * k_decay[:, -m:]).transpose(-1, -2).to(vi.dtype), vi
+                )
+
+        else:
+            kv = past_key_value
+            output = []
+            for i in range(n):
+                kv = ratio * kv + torch.einsum(
+                    "... n d, ... n e -> ... d e",
+                    k[:, :, i : i + 1],
+                    v[:, :, i : i + 1],
+                )
+                qkv = torch.einsum(
+                    "... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv.to(q.dtype)
+                )
+                output.append(qkv)
+            output = torch.cat(output, dim=-2)
+        # reshape
+        output = rearrange(output, "b h n d -> b n (h d)")
+        # normalize
+        output = self.norm(output)
+        # gate
+        output = F.sigmoid(self.output_gate(x)) * output
+        # outproj
+        output = self.out_proj(output)
+
+        attn_weights = None
+
+        return output, attn_weights, kv
+
+
+def _build_slope_tensor(n_attention_heads: int):
+    def get_slopes(n):
+        def get_slopes_power_of_2(n):
+            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+            ratio = start
+            return [start * ratio**i for i in range(n)]
+
+        if math.log2(n).is_integer():
+            return get_slopes_power_of_2(
+                n
+            )  # In the paper, we only train models that have 2^a heads for some a. This function has
+        else:  # some good properties that only occur when the input is a power of 2. To maintain that even
+            closest_power_of_2 = 2 ** math.floor(
+                math.log2(n)
+            )  # when the number of heads is not a power of 2, we use this workaround.
+            return (
+                get_slopes_power_of_2(closest_power_of_2)
+                + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+            )
+
+    # h, 1, 1
+    slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
+        n_attention_heads, 1, 1
+    )
+
+    return slopes
+
+
+def test_lightning_attention_implementations(model_params):
+    torch.manual_seed(42)
+
+    batch_size = 2
+    seq_len = 1024
+    dtype = torch.bfloat16
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    hidden_states = torch.randn(
+        batch_size, seq_len, model_params["hidden_size"], dtype=dtype, device=device
+    )
+
+    attention_mask = torch.ones(batch_size, seq_len, dtype=dtype, device=device)
+
+    slope_rate = _build_slope_tensor(model_params["num_attention_heads"]).to(device)
+
+    model_attn = MiniMaxText01LightningAttention(**model_params).to(dtype).to(device)
+    model_attn.eval()
+
+    with torch.no_grad():
+        model_output, _, _ = model_attn.inference(
+            hidden_states, attn_mask=attention_mask, slope_rate=slope_rate
+        )
+
+    qkv = model_attn.act(model_attn.qkv_proj(hidden_states))
+    new_shape = qkv.size()[:-1] + (model_attn.num_heads, -1)
+    qkv = qkv.view(*new_shape)
+    q, k, v = torch.split(qkv, [model_attn.head_dim] * 3, dim=-1)
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+
+    lib_output = lightning_attn_func(q, k, v, slope_rate)
+    lib_output = lib_output.transpose(1, 2).contiguous()
+    lib_output = lib_output.view(batch_size, seq_len, -1)
+    lib_output = model_attn.norm(lib_output)
+    lib_output = torch.sigmoid(model_attn.output_gate(hidden_states)) * lib_output
+    lib_output = model_attn.out_proj(lib_output)
+
+    torch.testing.assert_close(
+        model_output,
+        lib_output,
+        rtol=1e-3,
+        atol=1e-2,
+        msg="Lightning attention implementations produce different results",
+    )
+
+    print("✅ Two implementations match")
+
+
+def get_benchmark():
+    batch_size_range = [2**i for i in range(0, 7)]  # max 64
+    seq_length_range = [256, 512, 1024, 2048, 4096]  # max 4096
+    configs = list(itertools.product(batch_size_range, seq_length_range))
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "seq_len"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["MiniMax-Text-01", "OpenNLPLab"],
+            line_names=[
+                "MiniMax-Text-01 Model Implementation",
+                "OpenNLPLab Library Implementation",
+            ],
+            styles=[("blue", "-"), ("green", "-")],
+            ylabel="us",
+            plot_name="lightning-attention-prefill-performance",
+            args={},
+        )
+    )
+    def benchmark(batch_size, seq_len, provider):
+        dtype = torch.bfloat16
+        device = torch.device("cuda")
+
+        params = {
+            "hidden_size": 6144,
+            "num_attention_heads": 64,
+            "head_dim": 96,
+            "hidden_act": "gelu",
+        }
+
+        hidden_states = torch.randn(
+            batch_size, seq_len, params["hidden_size"], dtype=dtype, device=device
+        )
+
+        attention_mask = torch.ones(batch_size, seq_len, dtype=dtype, device=device)
+
+        slope_rate = _build_slope_tensor(params["num_attention_heads"]).to(device)
+        model_attn = MiniMaxText01LightningAttention(**params).to(dtype).to(device)
+        model_attn.eval()
+
+        quantiles = [0.5, 0.2, 0.8]
+        if provider == "MiniMax-Text-01":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: model_attn.inference(
+                    hidden_states, attn_mask=attention_mask, slope_rate=slope_rate
+                ),
+                quantiles=quantiles,
+            )
+        else:
+
+            def run_lib():
+                qkv = model_attn.act(model_attn.qkv_proj(hidden_states))
+                new_shape = qkv.size()[:-1] + (model_attn.num_heads, -1)
+                qkv = qkv.view(*new_shape)
+                q, k, v = torch.split(qkv, [model_attn.head_dim] * 3, dim=-1)
+                q = q.transpose(1, 2)
+                k = k.transpose(1, 2)
+                v = v.transpose(1, 2)
+
+                lib_output = lightning_attn_func(q, k, v, slope_rate)
+                lib_output = lib_output.transpose(1, 2).contiguous()
+                lib_output = lib_output.view(batch_size, seq_len, -1)
+                lib_output = model_attn.norm(lib_output)
+                lib_output = (
+                    torch.sigmoid(model_attn.output_gate(hidden_states)) * lib_output
+                )
+                return model_attn.out_proj(lib_output)
+
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                run_lib,
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/lightning_attention_prefill/",
+        help="Path to save lightning attention prefill benchmark results",
+    )
+    args = parser.parse_args()
+
+    # Run correctness test first
+    # Adapted from https://huggingface.co/MiniMaxAI/MiniMax-Text-01/blob/main/config.json
+    params = {
+        "hidden_size": 6144,
+        "num_attention_heads": 64,
+        "head_dim": 96,
+        "hidden_act": "silu",
+    }
+    test_lightning_attention_implementations(params)
+
+    # Run performance benchmark
+    benchmark = get_benchmark()
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmark/kernels/quantization/bench_fp4_quant.py b/benchmark/kernels/quantization/bench_fp4_quant.py
new file mode 100644
index 000000000..318e820ad
--- /dev/null
+++ b/benchmark/kernels/quantization/bench_fp4_quant.py
@@ -0,0 +1,133 @@
+import argparse
+import itertools
+
+import torch
+import triton
+from sgl_kernel import scaled_fp4_grouped_quant, silu_and_mul_scaled_fp4_grouped_quant
+from sgl_kernel.elementwise import silu_and_mul
+
+from sglang.srt.layers.moe.ep_moe.kernels import silu_and_mul_masked_post_quant_fwd
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+
+
+def _test_accuracy_once(E, M, K, input_dtype, device):
+    x = torch.randn(E, M, K, device=device, dtype=input_dtype)
+    glb_scales = torch.ones((E,), dtype=torch.float32, device=device)
+    masks = torch.full((E,), M, dtype=torch.int32, device=device)
+    out, blk_scales = silu_and_mul_scaled_fp4_grouped_quant(x, glb_scales, masks)
+    out1, blk_scales1 = scaled_fp4_grouped_quant(
+        silu_and_mul(x),
+        glb_scales,
+        masks,
+    )
+
+    torch.testing.assert_close(out, out1)
+    torch.testing.assert_close(blk_scales, blk_scales1)
+    print(f"E: {E}, M: {M}, K: {K}, type: {input_dtype} OK")
+
+
+NUM_RANKS = 48
+M_PER_RANKs = [128, 256, 512, 1024]
+Ms = [M_PER_RANK * NUM_RANKS for M_PER_RANK in M_PER_RANKs]
+Ks = [2048, 4096, 7168]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["M", "K"],
+        x_vals=list(itertools.product(Ms, Ks)),
+        x_log=False,
+        line_arg="provider",
+        line_vals=["triton_fp8", "cuda_unfused_fp4", "cuda_fused_fp4"],
+        line_names=["triton_fp8", "cuda_unfused_fp4", "cuda_fused_fp4"],
+        styles=[("blue", "-"), ("orange", "-"), ("green", "-")],
+        ylabel="ms",
+        plot_name="fp4 quant",
+        args={},
+    )
+)
+def benchmark(M, K, provider):
+    E = 6
+    device = "cuda"
+    x = torch.randn(E, M, K, device=device, dtype=torch.bfloat16)
+    glb_scales = torch.ones((E,), dtype=torch.float32, device=device)
+    masks = torch.randint(1, 4096, (E,), dtype=torch.int32, device=device)
+    fp8_out = torch.empty(
+        (
+            x.shape[0],
+            x.shape[1],
+            x.shape[2] // 2,
+        ),
+        device=x.device,
+        dtype=torch.float8_e4m3fn,
+    )
+    scale_block_size = 128
+    fp8_scales = torch.empty(
+        (
+            x.shape[0],
+            x.shape[1],
+            x.shape[2] // 2 // scale_block_size,
+        ),
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "triton_fp8":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: silu_and_mul_masked_post_quant_fwd(
+                x,
+                fp8_out,
+                fp8_scales,
+                scale_block_size,
+                masks,
+                scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "cuda_unfused_fp4":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: scaled_fp4_grouped_quant(
+                silu_and_mul(x),
+                glb_scales,
+                masks,
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "cuda_fused_fp4":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: silu_and_mul_scaled_fp4_grouped_quant(
+                x,
+                glb_scales,
+                masks,
+            ),
+            quantiles=quantiles,
+        )
+
+    return ms, min_ms, max_ms
+
+
+def test_accuracy():
+    E = 6
+    N_RANKS = 48
+    Ms = [128, 256, 512, 1024]
+    Ks = [2048, 4096, 7168]
+    input_dtype = torch.bfloat16
+    for M in Ms:
+        for K in Ks:
+            _test_accuracy_once(E, N_RANKS * M, K, input_dtype, "cuda")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./bench_fp4_quant_res",
+        help="Path to save fp4 quant benchmark results",
+    )
+    args = parser.parse_args()
+
+    test_accuracy()
+
+    benchmark.run(print_data=True, show_plots=True, save_path=args.save_path)
diff --git a/benchmark/kernels/quantization/bench_int8_quant.py b/benchmark/kernels/quantization/bench_int8_quant.py
new file mode 100644
index 000000000..94b795690
--- /dev/null
+++ b/benchmark/kernels/quantization/bench_int8_quant.py
@@ -0,0 +1,94 @@
+import argparse
+
+import torch
+import triton
+from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
+
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+
+
+@torch.compile(backend="inductor")
+def torch_int8_quant(x):
+    int8_max = torch.iinfo(torch.int8).max
+
+    abs_max = x.abs().max(dim=-1, keepdim=True).values
+    scales = abs_max.to(torch.float32) / float(int8_max)
+
+    q_x = (x / scales).round().to(torch.int8)
+
+    return q_x, scales
+
+
+def _test_accuracy_once(M, K, input_dtype, device):
+    x = torch.randn(M, K, dtype=input_dtype, device=device) * 5000
+    out, scales, _ = vllm_scaled_int8_quant(x, symmetric=True)
+    out1, scales1 = per_token_quant_int8(x)
+    out2, scales2 = torch_int8_quant(x)
+    torch.testing.assert_close(out, out2, atol=1, rtol=0)
+    torch.testing.assert_close(out, out1, atol=1, rtol=0)
+    torch.testing.assert_close(scales, scales2)
+    torch.testing.assert_close(scales1, scales2)
+    print(f"M: {M}, K: {K}, type: {input_dtype} OK")
+
+
+def test_accuracy():
+    Ms = [1, 13, 128, 1024, 2048, 4096]
+    Ks = [512, 1024, 2048, 8192]
+    input_dtypes = [torch.float16, torch.bfloat16]
+    for M in Ms:
+        for K in Ks:
+            for input_dtype in input_dtypes:
+                _test_accuracy_once(M, K, input_dtype, "cuda")
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["vllm op", "triton", "torch.compile"],
+        line_names=["vllm op", "triton", "torch.compile"],
+        styles=[("blue", "-"), ("orange", "-"), ("red", "-")],
+        ylabel="ms",
+        plot_name="int8 per token quant",
+        args={},
+    )
+)
+def benchmark(batch_size, provider):
+    M, K = batch_size, 16384
+    x = torch.randn(M, K, dtype=torch.float16, device="cuda") * 1000
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "vllm op":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: vllm_scaled_int8_quant(x, symmetric=True),
+            quantiles=quantiles,
+        )
+    if provider == "triton":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: per_token_quant_int8(x),
+            quantiles=quantiles,
+        )
+    if provider == "torch.compile":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: torch_int8_quant(x),
+            quantiles=quantiles,
+        )
+
+    return ms, min_ms, max_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./bench_int8_quant_res",
+        help="Path to save int8 quant benchmark results",
+    )
+    args = parser.parse_args()
+
+    test_accuracy()
+
+    benchmark.run(print_data=True, show_plots=True, save_path=args.save_path)
diff --git a/benchmark/kernels/quantization/tuning_block_wise_kernel.py b/benchmark/kernels/quantization/tuning_block_wise_kernel.py
new file mode 100644
index 000000000..1b51e54b7
--- /dev/null
+++ b/benchmark/kernels/quantization/tuning_block_wise_kernel.py
@@ -0,0 +1,474 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import argparse
+import json
+import multiprocessing as mp
+import os
+import time
+from datetime import datetime
+from typing import Any, Dict, List
+
+import torch
+import triton
+from tqdm import tqdm
+
+mp.set_start_method("spawn", force=True)
+
+from sglang.srt.layers.quantization.fp8_kernel import (
+    _w8a8_block_fp8_matmul,
+    _w8a8_block_fp8_matmul_unrolledx4,
+)
+from sglang.srt.layers.quantization.int8_kernel import _w8a8_block_int8_matmul
+from sglang.srt.utils import get_device_core_count, get_device_name, is_hip
+
+_is_hip = is_hip()
+
+DTYPE_MAP = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "half": torch.half,
+    "bfloat16": torch.bfloat16,
+}
+
+
+def w8a8_block_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    config: Dict[str, Any],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with block-wise quantization.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128].
+        output_dytpe: The dtype of the returned tensor.
+
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
+    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    N, K = B.shape
+    assert triton.cdiv(N, block_n) == Bs.shape[0]
+    assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
+
+    # Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
+    # Empirical testing shows the sweet spot lies when it's less than the # of
+    # compute units available on the device.
+    num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(
+        N, config["BLOCK_SIZE_N"]
+    )
+
+    if A.dtype == torch.float8_e4m3fnuz or A.dtype == torch.float8_e4m3fn:
+        kernel = (
+            _w8a8_block_fp8_matmul_unrolledx4
+            if (_is_hip == True and num_workgroups <= get_device_core_count())
+            else _w8a8_block_fp8_matmul
+        )
+    else:
+        kernel = _w8a8_block_int8_matmul
+
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
+
+    return C
+
+
+def get_rocm_configs_compute_bound():
+    configs = []
+    waves_per_eu_range = 0
+    for num_stages in [2]:
+        for block_m in [32, 64, 128, 256]:
+            for block_k in [32, 64, 128, 256]:
+                for block_n in [16, 32, 64, 128, 256]:
+                    for num_warps in [4, 8]:
+                        for group_size in [1, 4, 8, 16, 32]:
+                            configs.append(
+                                {
+                                    "BLOCK_SIZE_M": block_m,
+                                    "BLOCK_SIZE_N": block_n,
+                                    "BLOCK_SIZE_K": block_k,
+                                    "GROUP_SIZE_M": group_size,
+                                    "num_warps": num_warps,
+                                    "num_stages": num_stages,
+                                    "waves_per_eu": waves_per_eu_range,
+                                }
+                            )
+    return configs
+
+
+def get_configs_compute_bound():
+    configs = []
+    if _is_hip:
+        configs = get_rocm_configs_compute_bound()
+    else:
+        for num_stages in [2, 3, 4, 5]:
+            for block_m in [16, 32, 64, 128, 256]:
+                for block_k in [64, 128]:
+                    for block_n in [32, 64, 128, 256]:
+                        for num_warps in [4, 8]:
+                            for group_size in [1, 16, 32, 64]:
+                                configs.append(
+                                    {
+                                        "BLOCK_SIZE_M": block_m,
+                                        "BLOCK_SIZE_N": block_n,
+                                        "BLOCK_SIZE_K": block_k,
+                                        "GROUP_SIZE_M": group_size,
+                                        "num_warps": num_warps,
+                                        "num_stages": num_stages,
+                                    }
+                                )
+    return configs
+
+
+def get_weight_shapes(tp_size):
+    # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. Modify them, if you tune for another different model.
+    # cannot TP
+    total = [
+        (512 + 64, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (7168, 16384),
+        (7168, 18432),
+    ]
+    # N can TP
+    n_tp = [
+        (18432 * 2, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (24576, 1536),
+        (4096, 7168),
+    ]
+    # K can TP
+    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
+
+    weight_shapes = []
+    for t in total:
+        weight_shapes.append(t)
+    for n_t in n_tp:
+        new_t = (n_t[0] // tp_size, n_t[1])
+        weight_shapes.append(new_t)
+    for k_t in k_tp:
+        new_t = (k_t[0], k_t[1] // tp_size)
+        weight_shapes.append(new_t)
+    return weight_shapes
+
+
+def benchmark_config(
+    A, B, As, Bs, block_size, config, out_dtype=torch.float16, num_iters=10
+):
+    def run():
+        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
+
+    torch.cuda.synchronize()
+    # JIT complication & warmup
+    for _ in range(5):
+        run()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies: List[float] = []
+    for i in range(num_iters):
+        torch.cuda.synchronize()
+        start_event.record()
+        run()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    return avg
+
+
+def tune(M, N, K, block_size, out_dtype, search_space, input_type):
+    factor_for_scale = 1e-2
+
+    if input_type == "fp8":
+        fp8_info = torch.finfo(
+            torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        )
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        A_fp32 = (
+            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+        )
+        A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(
+            torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        )
+
+        B_fp32 = (
+            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+        )
+        B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(
+            torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        )
+    else:
+        int8_info = torch.iinfo(torch.int8)
+        int8_max, int8_min = int8_info.max, int8_info.min
+
+        A_fp32 = (
+            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * int8_max
+        )
+        A = A_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+        B_fp32 = (
+            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * int8_max
+        )
+        B = B_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    As = torch.rand(M, k_tiles, dtype=torch.float32, device="cuda") * factor_for_scale
+    Bs = (
+        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda")
+        * factor_for_scale
+    )
+
+    best_config = None
+    best_time = float("inf")
+    for config in tqdm(search_space):
+        try:
+            kernel_time = benchmark_config(
+                A,
+                B,
+                As,
+                Bs,
+                block_size,
+                config,
+                out_dtype,
+                num_iters=10,
+            )
+        except triton.runtime.autotuner.OutOfResources:
+            # Some configurations may be invalid and fail to compile.
+            continue
+
+        if kernel_time < best_time:
+            best_time = kernel_time
+            best_config = config
+    now = datetime.now()
+    print(f"{now.ctime()}] Completed tuning for batch_size={M}")
+    assert best_config is not None
+    return best_config
+
+
+def save_configs(
+    N,
+    K,
+    block_n,
+    block_k,
+    configs,
+    save_path,
+    input_type="fp8",
+) -> None:
+    os.makedirs(save_path, exist_ok=True)
+    device_name = get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,block_shape=[{block_n}, {block_k}].json"
+
+    config_file_path = os.path.join(save_path, json_file_name)
+    print(f"Writing best config to {config_file_path}...")
+
+    with open(config_file_path, "w") as f:
+        json.dump(configs, f, indent=4)
+        f.write("\n")
+
+
+def get_available_gpu_count():
+    """Get the number of available GPUs."""
+    return torch.cuda.device_count()
+
+
+def tune_on_gpu(args_dict):
+    """Run tuning on a specific GPU."""
+    gpu_id = args_dict["gpu_id"]
+    batch_sizes = args_dict["batch_sizes"]
+    weight_shapes = args_dict["weight_shapes"]
+    args = args_dict["args"]
+
+    torch.cuda.set_device(gpu_id)
+    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
+
+    block_n = args.block_n
+    block_k = args.block_k
+    out_dtype = DTYPE_MAP[args.out_dtype]
+    save_path = args.save_path
+    input_type = args.input_type
+
+    search_space = get_configs_compute_bound()
+    search_space = [
+        config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
+    ]
+
+    start = time.perf_counter()
+    results = {}
+    for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
+        N, K = shape[0], shape[1]
+        print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
+        benchmark_results = [
+            tune(
+                batch_size,
+                N,
+                K,
+                [block_n, block_k],
+                out_dtype,
+                search_space,
+                input_type,
+            )
+            for batch_size in tqdm(batch_sizes, desc=f"GPU {gpu_id} - Batch sizes")
+        ]
+        best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
+        save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
+
+    end = time.perf_counter()
+    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
+
+
+def distribute_batch_sizes(batch_sizes, num_gpus):
+    """Distribute batch sizes across available GPUs."""
+    batches_per_gpu = []
+    for i in range(num_gpus):
+        start_idx = i * len(batch_sizes) // num_gpus
+        end_idx = (i + 1) * len(batch_sizes) // num_gpus
+        batches_per_gpu.append(batch_sizes[start_idx:end_idx])
+    return batches_per_gpu
+
+
+def main(args):
+    print(args)
+
+    num_gpus = get_available_gpu_count()
+    if num_gpus == 0:
+        raise RuntimeError("No GPU available for tuning")
+    print(f"Found {num_gpus} GPUs for parallel tuning")
+
+    torch.cuda.init()
+
+    if args.batch_size is None:
+        batch_sizes = [
+            1,
+            2,
+            4,
+            8,
+            16,
+            24,
+            32,
+            48,
+            64,
+            96,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]
+    else:
+        batch_sizes = [args.batch_size]
+        num_gpus = 1  # If only one batch size, use only one GPU
+
+    weight_shapes = get_weight_shapes(args.tp_size)
+
+    batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
+
+    process_args = []
+    for gpu_id in range(num_gpus):
+        process_args.append(
+            {
+                "gpu_id": gpu_id,
+                "batch_sizes": batches_per_gpu[gpu_id],
+                "weight_shapes": weight_shapes,  # Each GPU processes all weight shapes
+                "args": args,
+            }
+        )
+
+    ctx = mp.get_context("spawn")
+    with ctx.Pool(num_gpus) as pool:
+        pool.map(tune_on_gpu, process_args)
+
+    print("Multi-GPU tuning completed")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--tp-size", "-tp", type=int, default=8)
+    parser.add_argument(
+        "--input-type", type=str, choices=["fp8", "int8"], default="fp8"
+    )
+    parser.add_argument(
+        "--out-dtype",
+        type=str,
+        choices=["float32", "float16", "bfloat16", "half"],
+        default="float16",
+    )
+    parser.add_argument("--block-n", type=int, default=128)
+    parser.add_argument("--block-k", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument(
+        "--save-path", type=str, default="python/sglang/srt/layers/quantization/configs"
+    )
+    args = parser.parse_args()
+
+    main(args)
diff --git a/benchmark/kernels/rmsnorm/benchmark_rmsnorm.py b/benchmark/kernels/rmsnorm/benchmark_rmsnorm.py
new file mode 100644
index 000000000..aeeea62c0
--- /dev/null
+++ b/benchmark/kernels/rmsnorm/benchmark_rmsnorm.py
@@ -0,0 +1,230 @@
+import itertools
+from typing import Optional, Tuple, Union
+
+import torch
+import triton
+from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from torch import nn
+from vllm import _custom_ops as vllm_ops
+
+
+class HuggingFaceRMSNorm(nn.Module):
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x.to(orig_dtype) * self.weight
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+
+def rmsnorm_naive(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
+    naive_norm.weight = nn.Parameter(weight)
+    naive_norm = naive_norm.to(x.device)
+
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+
+    output = naive_norm(x, residual)
+
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+
+
+def rmsnorm_flashinfer(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+
+    if residual is not None:
+        fused_add_rmsnorm(x, residual, weight, eps)
+        output = (x, residual)
+    else:
+        output = rmsnorm(x, weight, eps)
+
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+
+
+def rmsnorm_vllm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+
+    if residual is not None:
+        vllm_ops.fused_add_rms_norm(x, residual, weight, eps)
+        output = (x, residual)
+    else:
+        out = torch.empty_like(x)
+        vllm_ops.rms_norm(out, x, weight, eps)
+        output = out
+
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
+    dtype = torch.bfloat16
+    x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda")
+    weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+    residual = torch.randn_like(x) if use_residual else None
+
+    output_naive = rmsnorm_naive(
+        x.clone(), weight, residual.clone() if residual is not None else None
+    )
+    output_flashinfer = rmsnorm_flashinfer(
+        x.clone(), weight, residual.clone() if residual is not None else None
+    )
+    output_vllm = rmsnorm_vllm(
+        x.clone(), weight, residual.clone() if residual is not None else None
+    )
+
+    if use_residual:
+        output_naive = output_naive[0]
+        output_flashinfer = output_flashinfer[0]
+        output_vllm = output_vllm[0]
+
+    print(f"Naive output={output_naive}")
+    print(f"FlashInfer output={output_flashinfer}")
+    print(f"VLLM output={output_vllm}")
+
+    if torch.allclose(
+        output_naive, output_flashinfer, atol=1e-2, rtol=1e-2
+    ) and torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+head_num_range = [32, 48]
+configs = list(itertools.product(head_num_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark(use_residual):
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["head_num", "batch_size", "seq_len"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["huggingface", "flashinfer", "vllm"],
+            line_names=["HuggingFace", "FlashInfer", "vLLM"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=f"rmsnorm-performance-{'with' if use_residual else 'without'}-residual",
+            args={},
+        )
+    )
+    def benchmark(head_num, batch_size, seq_len, provider):
+        dtype = torch.bfloat16
+        hidden_size = head_num * 128  # assuming head_dim = 128
+
+        x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda")
+        weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+        residual = torch.randn_like(x) if use_residual else None
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "huggingface":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_naive(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+        elif provider == "flashinfer":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_flashinfer(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_vllm(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--use_residual", action="store_true", help="Whether to use residual connection"
+    )
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/rmsnorm/",
+        help="Path to save rmsnorm benchmark results",
+    )
+    args = parser.parse_args()
+
+    # Run correctness test
+    calculate_diff(
+        batch_size=4, seq_len=128, hidden_size=4096, use_residual=args.use_residual
+    )
+
+    # Get the benchmark function with proper use_residual setting
+    benchmark = get_benchmark(args.use_residual)
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
diff --git a/benchmark/kernels/scheduler_batch/benchmark_get_last_loc_triton.py b/benchmark/kernels/scheduler_batch/benchmark_get_last_loc_triton.py
new file mode 100644
index 000000000..3e17205e7
--- /dev/null
+++ b/benchmark/kernels/scheduler_batch/benchmark_get_last_loc_triton.py
@@ -0,0 +1,169 @@
+import os
+
+import torch
+import triton
+import triton.language as tl
+
+
+@torch.compile(dynamic=True)
+def get_last_loc_torch(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    return torch.where(
+        prefix_lens_tensor > 0,
+        req_to_token[req_pool_indices_tensor, prefix_lens_tensor - 1],
+        torch.full_like(prefix_lens_tensor, -1),
+    )
+
+
+@triton.jit
+def get_last_loc_kernel(
+    req_to_token,
+    req_pool_indices_tensor,
+    prefix_lens_tensor,
+    result,
+    num_tokens,
+    req_to_token_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offset = tl.arange(0, BLOCK_SIZE) + pid * BLOCK_SIZE
+    mask = offset < num_tokens
+
+    prefix_lens = tl.load(prefix_lens_tensor + offset, mask=mask, other=0)
+    req_pool_indices = tl.load(req_pool_indices_tensor + offset, mask=mask, other=0)
+
+    token_mask = prefix_lens > 0
+    token_index = req_pool_indices * req_to_token_stride + (prefix_lens - 1)
+    tokens = tl.load(req_to_token + token_index, mask=token_mask, other=-1)
+
+    tl.store(result + offset, tokens, mask=mask)
+
+
+def get_last_loc_triton(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    BLOCK_SIZE = 256
+    num_tokens = prefix_lens_tensor.shape[0]
+    result = torch.empty_like(prefix_lens_tensor)
+    grid = (triton.cdiv(num_tokens, BLOCK_SIZE),)
+
+    get_last_loc_kernel[grid](
+        req_to_token,
+        req_pool_indices_tensor,
+        prefix_lens_tensor,
+        result,
+        num_tokens,
+        req_to_token.stride(0),
+        BLOCK_SIZE,
+    )
+    return result
+
+
+def test_get_last_loc():
+    max_batch = 4097
+    max_context_len = 6148
+    batch_size = 20
+
+    # Initialize input tensors
+    req_to_token = torch.zeros(
+        (max_batch, max_context_len), dtype=torch.int32, device="cuda"
+    )
+    req_pool_indices = torch.arange(batch_size, dtype=torch.int64, device="cuda")
+    pre_lens = torch.randint(
+        -max_context_len // 2,
+        max_context_len,
+        (batch_size,),
+        dtype=torch.int64,
+        device="cuda",
+    )
+
+    last_loc_res = get_last_loc_triton(req_to_token, req_pool_indices, pre_lens)
+    last_loc_ref = get_last_loc_torch(req_to_token, req_pool_indices, pre_lens)
+
+    # Compare results
+    torch.testing.assert_close(last_loc_res, last_loc_ref)
+
+
+def get_benchmark():
+    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size"],
+            x_vals=batch_sizes,
+            line_arg="provider",
+            line_vals=["reference", "triton"],
+            line_names=["PyTorch", "Triton"],
+            styles=[("blue", "-"), ("green", "-")],
+            ylabel="us",
+            plot_name="get-last-loc-performance",
+            args={},
+        )
+    )
+    def benchmark(batch_size, provider):
+        max_batch = 2048
+        max_context_len = 16384
+
+        req_to_token = torch.zeros(
+            (max_batch, max_context_len), dtype=torch.int32, device="cuda"
+        )
+        req_pool_indices = torch.arange(batch_size, dtype=torch.int64, device="cuda")
+        pre_lens = torch.randint(
+            -max_context_len // 2,
+            max_context_len,
+            (batch_size,),
+            dtype=torch.int64,
+            device="cuda",
+        )
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "reference":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: get_last_loc_torch(req_to_token, req_pool_indices, pre_lens),
+                quantiles=quantiles,
+            )
+        elif provider == "triton":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: get_last_loc_triton(req_to_token, req_pool_indices, pre_lens),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+def run_benchmark(save_path: str = "./configs/benchmark_ops/get_last_loc/"):
+    """Run benchmark and save results"""
+
+    # Ensure save path exists
+    os.makedirs(save_path, exist_ok=True)
+
+    # Run correctness test
+    test_get_last_loc()
+    print("Correctness test passed!")
+
+    # Run performance test
+    benchmark = get_benchmark()
+    benchmark.run(print_data=True, save_path=save_path)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/get_last_loc/",
+        help="Path to save benchmark results",
+    )
+    args = parser.parse_args()
+
+    run_benchmark(args.save_path)
diff --git a/benchmark/kernels/scheduler_batch/benchmark_write_req_to_token_pool_triton.py b/benchmark/kernels/scheduler_batch/benchmark_write_req_to_token_pool_triton.py
new file mode 100644
index 000000000..1ce43c8ba
--- /dev/null
+++ b/benchmark/kernels/scheduler_batch/benchmark_write_req_to_token_pool_triton.py
@@ -0,0 +1,342 @@
+import itertools
+import os
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def write_req_to_token_pool_triton(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices,
+    pre_lens,
+    seq_lens,
+    extend_lens,
+    out_cache_loc,
+    req_to_token_ptr_stride: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(0)
+
+    req_pool_index = tl.load(req_pool_indices + pid)
+    pre_len = tl.load(pre_lens + pid)
+    seq_len = tl.load(seq_lens + pid)
+
+    # TODO: optimize this?
+    cumsum_start = 0
+    for i in range(pid):
+        cumsum_start += tl.load(extend_lens + i)
+
+    num_loop = tl.cdiv(seq_len - pre_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = offset < (seq_len - pre_len)
+        value = tl.load(out_cache_loc + cumsum_start + offset, mask=mask)
+        tl.store(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + offset
+            + pre_len,
+            value,
+            mask=mask,
+        )
+
+
+@triton.jit
+def write_req_to_token_pool_triton_optimize(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices,
+    pre_lens,
+    seq_lens,
+    extend_lens,
+    out_cache_loc,
+    req_to_token_ptr_stride: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid_batch = tl.program_id(0)
+    pid_token = tl.program_id(1)
+
+    req_pool_index = tl.load(req_pool_indices + pid_batch)
+    pre_len = tl.load(pre_lens + pid_batch)
+    seq_len = tl.load(seq_lens + pid_batch)
+    extend_len = seq_len - pre_len
+
+    cumsum_start = 0
+    for i in range(pid_batch):
+        cumsum_start += tl.load(extend_lens + i)
+
+    token_start = pid_token * BLOCK_SIZE
+
+    offset = tl.arange(0, BLOCK_SIZE)
+    actual_offset = token_start + offset
+    mask = actual_offset < extend_len
+
+    src_ptr = out_cache_loc + cumsum_start + actual_offset
+    src_ptr = tl.max_contiguous(tl.multiple_of(src_ptr, BLOCK_SIZE), BLOCK_SIZE)
+    value = tl.load(src_ptr, mask=mask)
+    dst_ptr = (
+        req_to_token_ptr
+        + req_pool_index * req_to_token_ptr_stride
+        + actual_offset
+        + pre_len
+    )
+    dst_ptr = tl.max_contiguous(tl.multiple_of(dst_ptr, BLOCK_SIZE), BLOCK_SIZE)
+
+    tl.store(dst_ptr, value, mask=mask)
+
+
+def write_req_to_token_pool_reference(
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    pre_lens: torch.Tensor,
+    seq_lens: torch.Tensor,
+    extend_lens: torch.Tensor,
+    out_cache_loc: torch.Tensor,
+) -> None:
+    """Reference implementation using PyTorch"""
+    for i in range(len(req_pool_indices)):
+        req_pool_idx = req_pool_indices[i].item()
+        pre_len = pre_lens[i].item()
+        seq_len = seq_lens[i].item()
+        extend_len = extend_lens[i].item()
+
+        cumsum_start = sum(extend_lens[:i].tolist())
+
+        # Copy values from out_cache_loc to req_to_token
+        req_to_token[req_pool_idx, pre_len:seq_len] = out_cache_loc[
+            cumsum_start : cumsum_start + extend_len
+        ]
+
+
+def test_write_req_to_token_pool():
+    max_batch = 4097
+    max_context_len = 6148
+    batch_size = 1
+    extend_len = 14
+
+    # Initialize input tensors
+    req_to_token = torch.zeros(
+        (max_batch, max_context_len), dtype=torch.int32, device="cuda"
+    )
+    req_pool_indices = torch.tensor([42], dtype=torch.int32, device="cuda")
+    pre_lens = torch.tensor([8], dtype=torch.int32, device="cuda")
+    seq_lens = torch.tensor([22], dtype=torch.int32, device="cuda")
+    extend_lens = torch.tensor([extend_len], dtype=torch.int32, device="cuda")
+    out_cache_loc = torch.arange(extend_len, dtype=torch.int32, device="cuda")
+
+    # Create copies for reference implementation
+    req_to_token_ref = req_to_token.clone()
+    req_to_token_opt = req_to_token.clone()
+
+    # Run original triton kernel
+    write_req_to_token_pool_triton[(batch_size,)](
+        req_to_token,
+        req_pool_indices,
+        pre_lens,
+        seq_lens,
+        extend_lens,
+        out_cache_loc,
+        max_context_len,
+    )
+
+    # Run optimized triton kernel
+    def grid(batch_size, extend_len):
+        num_token_blocks = triton.cdiv(extend_len, 512)
+        return (batch_size, num_token_blocks)
+
+    write_req_to_token_pool_triton_optimize[grid(batch_size, extend_len)](
+        req_to_token_opt,
+        req_pool_indices,
+        pre_lens,
+        seq_lens,
+        extend_lens,
+        out_cache_loc,
+        max_context_len,
+        BLOCK_SIZE=512,
+    )
+
+    # Run reference implementation
+    write_req_to_token_pool_reference(
+        req_to_token_ref,
+        req_pool_indices,
+        pre_lens,
+        seq_lens,
+        extend_lens,
+        out_cache_loc,
+    )
+
+    # Compare results
+    torch.testing.assert_close(req_to_token, req_to_token_ref)
+    torch.testing.assert_close(req_to_token_opt, req_to_token_ref)
+
+    # Test case 2: batch size > 1
+    batch_size = 3
+    extend_lens_list = [14, 20, 30]
+    total_extend_len = sum(extend_lens_list)
+
+    req_to_token = torch.zeros(
+        (max_batch, max_context_len), dtype=torch.int32, device="cuda"
+    )
+    req_pool_indices = torch.tensor([42, 100, 200], dtype=torch.int32, device="cuda")
+    pre_lens = torch.tensor([8, 10, 15], dtype=torch.int32, device="cuda")
+    seq_lens = torch.tensor([22, 30, 45], dtype=torch.int32, device="cuda")
+    extend_lens = torch.tensor(extend_lens_list, dtype=torch.int32, device="cuda")
+    out_cache_loc = torch.arange(total_extend_len, dtype=torch.int32, device="cuda")
+
+    req_to_token_ref = req_to_token.clone()
+    req_to_token_opt = req_to_token.clone()
+
+    # Run original triton kernel
+    write_req_to_token_pool_triton[(batch_size,)](
+        req_to_token,
+        req_pool_indices,
+        pre_lens,
+        seq_lens,
+        extend_lens,
+        out_cache_loc,
+        max_context_len,
+    )
+
+    # Run optimized triton kernel
+    max_extend_len = max(extend_lens_list)
+    write_req_to_token_pool_triton_optimize[grid(batch_size, max_extend_len)](
+        req_to_token_opt,
+        req_pool_indices,
+        pre_lens,
+        seq_lens,
+        extend_lens,
+        out_cache_loc,
+        max_context_len,
+        BLOCK_SIZE=512,
+    )
+
+    # Run reference implementation
+    write_req_to_token_pool_reference(
+        req_to_token_ref,
+        req_pool_indices,
+        pre_lens,
+        seq_lens,
+        extend_lens,
+        out_cache_loc,
+    )
+
+    # Compare results
+    torch.testing.assert_close(req_to_token, req_to_token_ref)
+    torch.testing.assert_close(req_to_token_opt, req_to_token_ref)
+
+
+def get_benchmark():
+    batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
+    extend_lens = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
+    configs = list(itertools.product(batch_sizes, extend_lens))
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["batch_size", "extend_len"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["reference", "triton", "triton_optimize"],
+            line_names=["PyTorch", "Triton", "Triton Optimized"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name="write-req-to-token-pool-performance",
+            args={},
+        )
+    )
+    def benchmark(batch_size, extend_len, provider):
+        max_batch = 256
+        max_context_len = 16384
+
+        extend_lens_list = [extend_len] * batch_size
+        total_extend_len = sum(extend_lens_list)
+
+        req_to_token = torch.zeros(
+            (max_batch, max_context_len), dtype=torch.int32, device="cuda"
+        )
+        req_pool_indices = torch.arange(batch_size, dtype=torch.int32, device="cuda")
+        pre_lens = torch.ones(batch_size, dtype=torch.int32, device="cuda") * 8
+        seq_lens = pre_lens + extend_len
+        extend_lens = torch.tensor(extend_lens_list, dtype=torch.int32, device="cuda")
+        out_cache_loc = torch.arange(total_extend_len, dtype=torch.int32, device="cuda")
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "reference":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: write_req_to_token_pool_reference(
+                    req_to_token.clone(),
+                    req_pool_indices,
+                    pre_lens,
+                    seq_lens,
+                    extend_lens,
+                    out_cache_loc,
+                ),
+                quantiles=quantiles,
+            )
+        elif provider == "triton":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: write_req_to_token_pool_triton[(batch_size,)](
+                    req_to_token.clone(),
+                    req_pool_indices,
+                    pre_lens,
+                    seq_lens,
+                    extend_lens,
+                    out_cache_loc,
+                    max_context_len,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+
+            def run_optimized():
+                block_size = 128 if extend_len <= 1024 else 512
+                grid_config = (batch_size, triton.cdiv(extend_len, block_size))
+                write_req_to_token_pool_triton_optimize[grid_config](
+                    req_to_token.clone(),
+                    req_pool_indices,
+                    pre_lens,
+                    seq_lens,
+                    extend_lens,
+                    out_cache_loc,
+                    max_context_len,
+                    BLOCK_SIZE=block_size,
+                )
+
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                run_optimized, quantiles=quantiles
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+def run_benchmark(save_path: str = "./configs/benchmark_ops/write_req_to_token_pool/"):
+    """Run benchmark and save results"""
+
+    # Ensure save path exists
+    os.makedirs(save_path, exist_ok=True)
+
+    # Run correctness test
+    test_write_req_to_token_pool()
+    print("Correctness test passed!")
+
+    # Run performance test
+    benchmark = get_benchmark()
+    benchmark.run(print_data=True, save_path=save_path)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/write_req_to_token_pool/",
+        help="Path to save benchmark results",
+    )
+    args = parser.parse_args()
+
+    run_benchmark(args.save_path)
diff --git a/benchmark/kernels/sliding_window_attention_triton/bench_triton_swa_kernel.py b/benchmark/kernels/sliding_window_attention_triton/bench_triton_swa_kernel.py
new file mode 100644
index 000000000..98144d470
--- /dev/null
+++ b/benchmark/kernels/sliding_window_attention_triton/bench_triton_swa_kernel.py
@@ -0,0 +1,283 @@
+import itertools
+
+import torch
+import torch.nn.functional as F
+import triton.testing as tt
+
+from sglang.srt.layers.attention.triton_ops.extend_attention import extend_attention_fwd
+
+
+def extend_attention_fwd_torch(
+    q: torch.Tensor,  # [extend_tokens, H_Q, D]
+    k: torch.Tensor,  # [extend_tokens, H_KV, D]
+    v: torch.Tensor,  # [extend_tokens, H_KV, D]
+    o: torch.Tensor,  # [extend_tokens, H_Q, D]
+    k_cache: torch.Tensor,  # [total_tokens, H_KV, D]
+    v_cache: torch.Tensor,  # [total_tokens, H_KV, D]
+    qo_indptr: torch.Tensor,  # [B+1]
+    kv_indptr: torch.Tensor,  # [B+1]
+    kv_indices: torch.Tensor,  # [prefix_tokens]
+    sliding_window_size: int,
+):
+    B = qo_indptr.size(0) - 1
+    _, H_Q, D = q.shape
+    _, H_KV, _ = k.shape
+
+    group_size = H_Q // H_KV
+    scale = 1.0 / D**0.5
+
+    for i in range(B):
+        q_start = int(qo_indptr[i].item())
+        q_end = int(qo_indptr[i + 1].item())
+        kv_start = int(kv_indptr[i].item())
+        kv_end = int(kv_indptr[i + 1].item())
+
+        prefix_indices = kv_indices[kv_start:kv_end]
+        k_prefix = k_cache[prefix_indices]  # [prefix_len, H_KV, D]
+        v_prefix = v_cache[prefix_indices]  # [prefix_len, H_KV, D]
+
+        k_extend = k[q_start:q_end]  # [extend_len, H_KV, D]
+        v_extend = v[q_start:q_end]  # [extend_len, H_KV, D]
+        q_extend = q[q_start:q_end]  # [extend_len, H_Q,  D]
+
+        k_full = torch.cat([k_prefix, k_extend], dim=0)  # [total_len, H_KV, D]
+        v_full = torch.cat([v_prefix, v_extend], dim=0)  # [total_len, H_KV, D]
+
+        if group_size != 1:
+            k_full_hq = k_full.repeat_interleave(
+                group_size, dim=1
+            )  # [total_len, H_Q, D]
+            v_full_hq = v_full.repeat_interleave(
+                group_size, dim=1
+            )  # [total_len, H_Q, D]
+        else:
+            k_full_hq = k_full
+            v_full_hq = v_full
+
+        prefix_len = k_prefix.size(0)
+        extend_len = k_extend.size(0)
+        total_len = prefix_len + extend_len
+
+        # causal
+        pos_keys = torch.arange(total_len, device=q.device)
+        t = prefix_len + torch.arange(extend_len, device=q.device)  # [extend_len]
+        causal_mask = pos_keys.unsqueeze(0) <= t.unsqueeze(1)
+
+        # sliding window
+        if sliding_window_size is not None and sliding_window_size > 0:
+            start = (t - (sliding_window_size)).clamp_min(0)  # [extend_len]
+        else:
+            start = torch.zeros_like(t)
+        window_mask = pos_keys.unsqueeze(0) >= start.unsqueeze(1)
+
+        final_mask = causal_mask & window_mask
+
+        attn_scores = (
+            torch.einsum("qhd,khd->qhk", q_extend, k_full_hq) * scale
+        )  # [extend_len, H_Q, total_len]
+        attn_scores = attn_scores.masked_fill(~final_mask.unsqueeze(1), float("-inf"))
+
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        o[q_start:q_end] = torch.einsum("qhk,khd->qhd", attn_weights, v_full_hq)
+
+
+def _build_batch(
+    B, N_CTX, H_Q, H_KV, D, WINDOW_SIZE, dtype=torch.bfloat16, device="cuda"
+):
+    b_seq_len_prefix = torch.randint(
+        1, max(2, N_CTX // 2), (B,), dtype=torch.int32, device=device
+    )
+    b_seq_len_extend = torch.randint(
+        1, max(2, N_CTX // 2), (B,), dtype=torch.int32, device=device
+    )
+    b_seq_len = b_seq_len_prefix + b_seq_len_extend
+
+    b_start_loc = torch.zeros((B,), dtype=torch.int32, device=device)
+    b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+    b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device=device)
+    b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+
+    kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+    kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+
+    kv_indices = torch.zeros(
+        (int(b_seq_len_prefix.sum().item()),), dtype=torch.int32, device=device
+    )
+    for i in range(B):
+        s = kv_indptr[i].item()
+        e = kv_indptr[i + 1].item()
+        kv_indices[s:e] = torch.arange(
+            b_start_loc[i],
+            b_start_loc[i] + b_seq_len_prefix[i],
+            dtype=torch.int32,
+            device=device,
+        )
+
+    total_token_num = int(torch.sum(b_seq_len).item())
+    extend_token_num = int(torch.sum(b_seq_len_extend).item())
+
+    k_buffer = torch.empty(
+        (total_token_num, H_KV, D), dtype=dtype, device=device
+    ).normal_(mean=0.1, std=0.2)
+    v_buffer = torch.empty(
+        (total_token_num, H_KV, D), dtype=dtype, device=device
+    ).normal_(mean=0.1, std=0.2)
+
+    k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+    v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+    q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+
+    for i in range(B):
+        extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+        extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+        extend_start = b_start_loc_extend[i]
+        extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
+
+        k_extend[extend_start:extend_end] = k_buffer[
+            extend_start_in_buffer:extend_end_in_buffer
+        ]
+        v_extend[extend_start:extend_end] = v_buffer[
+            extend_start_in_buffer:extend_end_in_buffer
+        ]
+        q_extend[extend_start:extend_end] = torch.empty(
+            (int(b_seq_len_extend[i].item()), H_Q, D), dtype=dtype, device=device
+        ).normal_(mean=0.1, std=0.2)
+
+    o_extend_triton = torch.empty(
+        (extend_token_num, H_Q, D), dtype=dtype, device=device
+    )
+    o_extend_torch = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+
+    b_seq_len_extend = b_seq_len - b_seq_len_prefix
+    max_len_extend = int(torch.max(b_seq_len_extend, 0)[0].item())
+    qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device=device)
+    qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+
+    inputs = dict(
+        q_extend=q_extend,
+        k_extend=k_extend,
+        v_extend=v_extend,
+        k_buffer=k_buffer,
+        v_buffer=v_buffer,
+        o_extend_triton=o_extend_triton,
+        o_extend_torch=o_extend_torch,
+        qo_indptr=qo_indptr,
+        kv_indptr=kv_indptr,
+        kv_indices=kv_indices,
+        max_len_extend=max_len_extend,
+        WINDOW_SIZE=WINDOW_SIZE,
+    )
+    meta = dict(
+        B=B, N_CTX=N_CTX, H_Q=H_Q, H_KV=H_KV, D=D, extend_token_num=extend_token_num
+    )
+    return inputs, meta
+
+
+def _run_triton(inputs):
+    extend_attention_fwd(
+        inputs["q_extend"],
+        inputs["k_extend"],
+        inputs["v_extend"],
+        inputs["o_extend_triton"],
+        inputs["k_buffer"],
+        inputs["v_buffer"],
+        inputs["qo_indptr"],
+        inputs["kv_indptr"],
+        inputs["kv_indices"],
+        custom_mask=None,
+        is_causal=True,
+        mask_indptr=None,
+        max_len_extend=inputs["max_len_extend"],
+        sliding_window_size=inputs["WINDOW_SIZE"],
+    )
+
+
+def _run_torch_ref(inputs):
+    extend_attention_fwd_torch(
+        inputs["q_extend"],
+        inputs["k_extend"],
+        inputs["v_extend"],
+        inputs["o_extend_torch"],
+        inputs["k_buffer"],
+        inputs["v_buffer"],
+        inputs["qo_indptr"],
+        inputs["kv_indptr"],
+        inputs["kv_indices"],
+        inputs["WINDOW_SIZE"],
+    )
+
+
+N_CTXS = [1024, 2048, 4096, 8192]
+WINDOW_SIZES = [-1, 127, 256, 512]
+
+CONFIGS = list(itertools.product(N_CTXS, WINDOW_SIZES))
+
+PROVIDERS = ["torch", "triton"]
+
+
+@tt.perf_report(
+    tt.Benchmark(
+        x_names=["N_CTX", "WINDOW_SIZE"],
+        x_vals=CONFIGS,
+        line_arg="provider",
+        line_vals=PROVIDERS,
+        line_names=PROVIDERS,
+        ylabel="Runtime (ms)",
+        plot_name="extend_attention_triton_vs_torch",
+        args={
+            "B": 32,
+            "H_Q": 64,
+            "H_KV": 8,
+            "D": 128,
+            "dtype": "bf16",
+            "device": "cuda",
+            "check_correctness": False,
+            "warmup": 25,
+            "rep": 100,
+        },
+    )
+)
+def bench(
+    N_CTX,
+    provider,
+    B,
+    H_Q,
+    H_KV,
+    D,
+    dtype,
+    device,
+    WINDOW_SIZE,
+    check_correctness,
+    warmup,
+    rep,
+):
+    torch.manual_seed(0)
+    torch.cuda.manual_seed(0)
+    dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
+    dt = dtype_map[dtype]
+
+    inputs, _ = _build_batch(
+        B, N_CTX, H_Q, H_KV, D, WINDOW_SIZE, dtype=dt, device=device
+    )
+
+    if check_correctness and provider == "triton":
+        _run_triton(inputs)
+        _run_torch_ref(inputs)
+        torch.cuda.synchronize()
+        if not torch.allclose(
+            inputs["o_extend_triton"], inputs["o_extend_torch"], rtol=1e-3, atol=1e-3
+        ):
+            raise AssertionError("Mismatch between triton and torch reference.")
+
+    if provider == "triton":
+        ms = tt.do_bench(lambda: _run_triton(inputs), warmup=warmup, rep=rep)
+    elif provider == "torch":
+        ms = tt.do_bench(lambda: _run_torch_ref(inputs), warmup=warmup, rep=rep)
+    else:
+        raise ValueError(provider)
+
+    return ms
+
+
+if __name__ == "__main__":
+    bench.run(print_data=True, show_plots=False)
diff --git a/benchmark/line_retrieval/README.md b/benchmark/line_retrieval/README.md
new file mode 100644
index 000000000..3c81a63cb
--- /dev/null
+++ b/benchmark/line_retrieval/README.md
@@ -0,0 +1,37 @@
+## Download data
+
+```
+wget https://raw.githubusercontent.com/merrymercy/merrymercy.github.io/master/files/random_words.json
+python3 gen_data.py --number 1000
+```
+
+## Run benchmark
+
+### Benchmark sglang
+```
+python3 -m sglang.launch_server --model-path codellama/CodeLlama-7b-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --src-index 600 --num-q 50 --parallel 1
+```
+
+
+###
+
+```
+# original
+Accuracy: 0.940, latency: 332.83 s
+
+# parallel encoding (no_adjust, offset = 1000)
+Accuracy: 0.760, latency: 238.46 s
+
+# parallel encoding (no_adjust, offset = 3000)
+Accuracy: 0.760, latency: 238.46 s
+
+# parallel encoding (no_adjust, offset = 0)
+Accuracy: 0.520, latency: 238.46 s
+
+# parallel encoding (adjust_cache)
+Accuracy: 0.460, latency: 257.66 s
+```
diff --git a/benchmark/line_retrieval/bench_sglang.py b/benchmark/line_retrieval/bench_sglang.py
new file mode 100644
index 000000000..e974e7dd3
--- /dev/null
+++ b/benchmark/line_retrieval/bench_sglang.py
@@ -0,0 +1,149 @@
+import argparse
+import json
+import re
+import time
+
+import numpy as np
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text
+
+
+@sgl.function
+def line_retrieval(s, prefix, suffix, body_0, body_1, body_2, body_3):
+    s += prefix + "\n"
+
+    contexts = [body_0, body_1, body_2, body_3]
+    position_ids_offset = [i * 1000 for i in range(len(contexts))]
+    forks = s.fork(len(contexts), position_ids_offset)
+    forks += lambda i: contexts[i] + "\n"
+    forks.join(mode="concate_and_append")
+
+    s += "\n" + suffix
+    s += sgl.gen("answer", max_tokens=16)
+
+
+def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
+    arguments = []
+    labels = []
+    sum_src_indices = []
+    sum_dst_indices = []
+
+    for i in range(len(src_indices)):
+        for j in range(len(dst_percents)):
+            src_index = src_indices[i]
+            dst_percent = dst_percents[j]
+
+            query_indices = line_obj["group_by_num_hoops"][str(num_hoops)]
+            query_indices = [
+                q
+                for q in query_indices
+                if all(l <= src_index for l in line_obj["links"][q]) and q < src_index
+            ]
+            dst_index = query_indices[
+                min(int(len(query_indices) * dst_percent), len(query_indices) - 1)
+            ]
+            label = line_obj["values"][dst_index]
+
+            body = line_obj["lines"][: src_index + 1]
+            suffix = line_obj["suffix"].replace("???", line_obj["indices"][dst_index])
+            body_part_len = len(body) // 4
+
+            arguments.append(
+                {
+                    "prefix": line_obj["prefix"],
+                    "body_0": "\n".join(body[:body_part_len]),
+                    "body_1": "\n".join(body[body_part_len : 2 * body_part_len]),
+                    "body_2": "\n".join(body[2 * body_part_len : 3 * body_part_len]),
+                    "body_3": "\n".join(body[3 * body_part_len :]),
+                    "suffix": suffix,
+                }
+            )
+            labels.append(label)
+            sum_src_indices.append(src_index)
+            sum_dst_indices.append(dst_index)
+
+    # Select backend
+    backend = select_sglang_backend(args)
+
+    tic = time.perf_counter()
+    states = line_retrieval.run_batch(
+        arguments,
+        temperature=0,
+        backend=backend,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    corrects = []
+    for i in range(len(arguments)):
+        output = states[i]["answer"]
+        prompt_len = states[i].get_meta_info("answer").get("prompt_length", -1)
+        label = labels[i]
+
+        # Try all numbers
+        findall = re.findall("\d+", output)
+        if not findall:
+            response_number = output
+        else:
+            for response_number in findall:
+                if response_number == label:
+                    break
+
+        correct = response_number == label
+        corrects.append(correct)
+
+        # Log results
+        summary = (
+            f"Line index: {sum_src_indices[i]} -> {sum_dst_indices[i]}, "
+            f"Prompt len: {prompt_len}, "
+            f"Correct: {correct}, "
+            f"Label: {label}, Predicted: {response_number}, "
+        )
+        print(summary)
+
+    accuracy = np.mean(corrects)
+    print(f"Accuracy: {accuracy:.3f}, latency: {latency:.2f} s")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "line_retrieval",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": len(arguments),
+            "other": {
+                "num_questions": len(arguments),
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+def main(args):
+    line_obj = json.load(open(args.data_path, "r"))
+
+    num_hoops = args.num_hoops
+    for src_index in args.src_index:
+        src_indices = [src_index]
+        num_queries = args.num_queries_per_src
+        dst_percents = [i * (1 / (num_queries)) for i in range(num_queries)]
+        eval_model(args, line_obj, num_hoops, src_indices, dst_percents)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="lines_1000_0.0.json")
+    parser.add_argument("--src-index", type=int, nargs="+", default=[100])
+    parser.add_argument("--num-queries-per-src", type=int, default=10)
+    parser.add_argument("--num-hoops", type=int, default=1)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/line_retrieval/gen_data.py b/benchmark/line_retrieval/gen_data.py
new file mode 100644
index 000000000..a01d40a06
--- /dev/null
+++ b/benchmark/line_retrieval/gen_data.py
@@ -0,0 +1,139 @@
+"""
+Generate line data for line retrieval task.
+
+Usage:
+python3 gen_data.py --number 1000
+"""
+
+import argparse
+import json
+from collections import defaultdict
+
+import numpy as np
+from tqdm import tqdm
+
+
+def generate_lines(random_words, num_lines, redirect_ratio):
+    prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask."
+    suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
+
+    # Raw lines
+    visited_indices = set([None])
+    visited_values = set([None])
+
+    lines = []
+    redirects = []
+    indices = []
+    values = []
+    for i in tqdm(range(num_lines)):
+        line_index = None
+        while line_index in visited_indices:
+            line_index = "-".join(np.random.choice(random_words, size=(2,)))
+        visited_indices.add(line_index)
+
+        line_value = np.random.randint(low=0, high=999999)
+        line_value = f"{line_value:06}"
+
+        line = f"Line {line_index}: The REGISTER_CONTENT is {line_value}."
+        lines.append(line)
+        redirects.append(None)
+        indices.append(line_index)
+        values.append(line_value)
+
+    # Add redirect
+    if redirect_ratio > 0:
+        num_redirect_lines = int(len(lines) * redirect_ratio)
+        redirect_indices = np.random.choice(
+            np.arange(len(lines)), size=(num_redirect_lines,), replace=False
+        )
+        for i in redirect_indices:
+            target_idx = np.random.choice(min(i * 2 + 100, num_lines))
+            lines[i] = (
+                f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
+            )
+            redirects[i] = target_idx
+
+    # Build links and find sources
+    links = [[] for _ in range(num_lines)]
+    contains_ring = set()
+    for i in range(num_lines):
+        if redirects[i] is None:
+            continue
+
+        tmp_link = []
+        cur = i
+        visited = set()
+        while redirects[cur] is not None:
+            visited.add(cur)
+            tmp_link.append(redirects[cur])
+            cur = redirects[cur]
+
+            if cur in visited:
+                contains_ring.add(i)
+                tmp_link = None
+                break
+        values[i] = values[cur]
+        links[i] = tmp_link
+
+    # Group by num_links
+    group_by_num_hoops = defaultdict(list)
+    for i in range(num_lines):
+        if i in contains_ring:
+            continue
+        group_by_num_hoops[len(links[i]) + 1].append(i)
+
+    keys = sorted(list(group_by_num_hoops.keys()))
+    for num_links in keys:
+        print(f"#links: {num_links}, #lines: {len(group_by_num_hoops[num_links])}")
+
+    # Append few-shot examples
+    hoop1_candidates = list(group_by_num_hoops[1])
+    hoop1_candidate_keys = {c: max([c] + links[c]) for c in hoop1_candidates}
+    hoop1_candidates.sort(key=lambda c: hoop1_candidate_keys[c])
+    hoop2_candidates = list(group_by_num_hoops[2])
+    hoop2_candidate_keys = {c: max([c] + links[c]) for c in hoop2_candidates}
+    hoop2_candidates.sort(key=lambda c: hoop2_candidate_keys[c])
+
+    i = hoop1_candidates[5]
+    suffix = suffix.replace("__idx0__", indices[i]).replace("__val0__", values[i])
+    if len(hoop2_candidates):
+        i = hoop2_candidates[0]
+        suffix = suffix.replace("__idx1__", indices[i]).replace("__val1__", values[i])
+        i = hoop2_candidates[1]
+        suffix = suffix.replace("__idx2__", indices[i]).replace("__val2__", values[i])
+    else:
+        i = hoop1_candidates[1]
+        suffix = suffix.replace("__idx1__", indices[i]).replace("__val1__", values[i])
+        i = hoop1_candidates[10]
+        suffix = suffix.replace("__idx2__", indices[i]).replace("__val2__", values[i])
+
+    obj = {
+        "prefix": prefix,
+        "suffix": suffix,
+        "lines": lines,
+        "indices": indices,
+        "values": values,
+        "links": links,
+        "group_by_num_hoops": group_by_num_hoops,
+        "contains_ring": sorted(list(contains_ring)),
+    }
+    return obj
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--number", type=int)
+    parser.add_argument("--redirect-ratio", type=float, default=0.0)
+    args = parser.parse_args()
+
+    num_lines = args.number
+
+    random_words_filename = "random_words.json"
+    random_words = json.load(open(random_words_filename, "r"))
+
+    np.random.seed(42)
+    obj = generate_lines(random_words, num_lines, args.redirect_ratio)
+
+    fout = f"lines_{num_lines}_{args.redirect_ratio:.1f}.json"
+    with open(fout, "w") as fout:
+        json.dump(obj, fout, indent=2)
diff --git a/benchmark/llava_bench/README.md b/benchmark/llava_bench/README.md
new file mode 100644
index 000000000..a71f339df
--- /dev/null
+++ b/benchmark/llava_bench/README.md
@@ -0,0 +1,61 @@
+## Download benchmark images
+
+```
+python3 download_images.py
+```
+
+image benchmark source: https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
+
+### Other Dependency
+```
+pip3 install "sglang[all]"
+pip3 install "torch>=2.1.2" "transformers>=4.36" pillow
+```
+
+## Run benchmark
+
+### Benchmark sglang
+Launch a server
+```
+python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
+```
+
+Run benchmark
+```
+# Run with local models
+python3 bench_sglang.py --num-questions 60
+
+# Run with OpenAI models
+python3 bench_sglang.py --num-questions 60 --backend gpt-4-vision-preview
+```
+
+### Bench LLaVA original code
+```
+git clone git@github.com:haotian-liu/LLaVA.git
+cd LLaVA
+git reset --hard 9a26bd1435b4ac42c282757f2c16d34226575e96
+pip3 install -e .
+
+cd ~/sglang/benchmark/llava_bench
+CUDA_VISIBLE_DEVICES=0 bash bench_hf_llava_bench.sh
+```
+
+
+### Benchmark llama.cpp
+
+```
+# Install
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+pip install sse_starlette starlette_context pydantic_settings
+
+# Download weights
+mkdir -p ~/model_weights/llava-v1.5-7b/
+wget https://huggingface.co/mys/ggml_llava-v1.5-7b/resolve/main/ggml-model-f16.gguf -O ~/model_weights/llava-v1.5-7b/ggml-model-f16.gguf
+wget https://huggingface.co/mys/ggml_llava-v1.5-7b/resolve/main/mmproj-model-f16.gguf -O ~/model_weights/llava-v1.5-7b/mmproj-model-f16.gguf
+```
+
+```
+python3 -m llama_cpp.server --model ~/model_weights/llava-v1.5-7b/ggml-model-f16.gguf --clip_model_path ~/model_weights/llava-v1.5-7b/mmproj-model-f16.gguf --chat_format llava-1-5 --port 23000
+
+OPENAI_BASE_URL=http://localhost:23000/v1 python3 bench_sglang.py --backend gpt-4-vision-preview --num-q 1
+```
diff --git a/benchmark/llava_bench/bench_hf_llava_bench.sh b/benchmark/llava_bench/bench_hf_llava_bench.sh
new file mode 100755
index 000000000..a51a715b6
--- /dev/null
+++ b/benchmark/llava_bench/bench_hf_llava_bench.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+python -m llava.eval.model_vqa \
+    --model-path liuhaotian/llava-v1.5-7b \
+    --question-file ./questions.jsonl \
+    --image-folder ./images \
+    --answers-file ./answers_hf.jsonl \
+    --temperature 0 \
+    --conv-mode vicuna_v1
diff --git a/benchmark/llava_bench/bench_hf_mme.sh b/benchmark/llava_bench/bench_hf_mme.sh
new file mode 100755
index 000000000..6ed332fe0
--- /dev/null
+++ b/benchmark/llava_bench/bench_hf_mme.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+python -m llava.eval.model_vqa_loader \
+    --model-path liuhaotian/llava-v1.5-7b \
+    --question-file ./mme_pack/llava_mme_bench_replace.jsonl \
+    --image-folder ./mme_pack/MME_Benchmark_release_version \
+    --answers-file ./answers_hf_mme.jsonl \
+    --temperature 0 \
+    --conv-mode vicuna_v1
diff --git a/benchmark/llava_bench/bench_sglang.py b/benchmark/llava_bench/bench_sglang.py
new file mode 100644
index 000000000..b9e8c1405
--- /dev/null
+++ b/benchmark/llava_bench/bench_sglang.py
@@ -0,0 +1,96 @@
+import argparse
+import json
+import os
+import time
+
+import tqdm
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+
+@sgl.function
+def image_qa(s, image_file, question):
+    s += sgl.user(sgl.image(image_file) + question)
+    s += sgl.assistant(sgl.gen("answer", max_tokens=args.max_tokens))
+
+
+def main(args):
+    lines = list(read_jsonl(args.question_file))[: args.num_questions]
+    arguments = [
+        {
+            "image_file": os.path.abspath(args.image_folder + "/" + l["image"]),
+            "question": l["text"],
+        }
+        for l in lines
+    ]
+    # arguments = [
+    #    {"image_file":
+    #        Image.open(os.path.abspath(args.image_folder + "/" + l["image"])),
+    #      "question": l["text"]} for l in lines
+    # ]
+
+    states = [None] * len(lines)
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm.tqdm(range(len(lines))):
+            image_file = arguments[i]["image_file"]
+            question = arguments[i]["question"]
+            ret = image_qa.run(image_file=image_file, question=question, temperature=0)
+            states[i] = ret
+    else:
+        states = image_qa.run_batch(
+            arguments, temperature=0, num_threads=args.parallel, progress_bar=True
+        )
+    latency = time.perf_counter() - tic
+
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    print(f"Write output to {args.answer_file}")
+    with open(args.answer_file, "w") as fout:
+        for i in range(len(lines)):
+            value = {
+                "question_id": lines[i]["question_id"],
+                "prompt": lines[i]["text"],
+                "text": states[i]["answer"].strip(),
+                "model_id": backend.model_info["model_path"],
+                "answer_id": i,
+                "metadata": {},
+            }
+            fout.write(json.dumps(value) + "\n")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "llava_bench",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": len(lines),
+            "parallel": args.parallel,
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--question-file", type=str, default="questions.jsonl")
+    parser.add_argument("--answer-file", type=str, default="answers.jsonl")
+    parser.add_argument("--image-folder", type=str, default="./images")
+    parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--num-questions", type=int, default=None)
+    parser.add_argument("--max-tokens", type=int, default=768)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/llava_bench/bench_sglang_mme.sh b/benchmark/llava_bench/bench_sglang_mme.sh
new file mode 100644
index 000000000..8c8c09cc0
--- /dev/null
+++ b/benchmark/llava_bench/bench_sglang_mme.sh
@@ -0,0 +1,2 @@
+MME_FOLDER=./mme_pack
+python3 bench_sglang.py --num-questions 5000 --question-file $MME_FOLDER/llava_mme_bench_replace.jsonl --answer-file answer_mme.jsonl --image-folder $MME_FOLDER/MME_Benchmark_release_version --max-tokens 4
diff --git a/benchmark/llava_bench/download_images.py b/benchmark/llava_bench/download_images.py
new file mode 100644
index 000000000..701190a03
--- /dev/null
+++ b/benchmark/llava_bench/download_images.py
@@ -0,0 +1,20 @@
+import os
+
+# Create the 'images' directory if it doesn't exist
+if not os.path.exists("images"):
+    os.makedirs("images")
+
+# Base URL
+base_url = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/"
+
+# Loop through image numbers
+for i in range(1, 25):
+    # Format the image number with leading zeros
+    image_number = str(i).zfill(3)
+    image_url = base_url + image_number + ".jpg"
+    image_path = "images/" + image_number + ".jpg"
+
+    # Download the image using wget
+    os.system(f"wget -O {image_path} {image_url}")
+
+print("Download complete.")
diff --git a/benchmark/llm_judge/README.md b/benchmark/llm_judge/README.md
new file mode 100644
index 000000000..164b19a08
--- /dev/null
+++ b/benchmark/llm_judge/README.md
@@ -0,0 +1,33 @@
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 25 --parallel 8
+python3 bench_sglang.py --num-questions 16 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --backend vllm --num-questions 25
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --backend guidance --num-questions 25 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+```
+python3 bench_other.py --backend lmql --num-questions 25 --parallel 1
+```
diff --git a/benchmark/llm_judge/bench_other.py b/benchmark/llm_judge/bench_other.py
new file mode 100644
index 000000000..8e6029067
--- /dev/null
+++ b/benchmark/llm_judge/bench_other.py
@@ -0,0 +1,151 @@
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
+
+dimension_prompts = [
+    "Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
+    "Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
+    "Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
+    "Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
+    "Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
+    "Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
+]
+
+
+def multi_dimension_judge(article, generate):
+    s = system_prompt
+    s += "\n```\n" + article + "\n```\n\n"
+
+    judges = []
+    for i in range(len(dimension_prompts)):
+        comp = generate(
+            s
+            + "USER: Please judge the quality based on the following metric. "
+            + dimension_prompts[i]
+            + " Please provide a single-paragraph judgement. "
+            + "Focus on the provided metric and do not say other things. "
+            'End your judgement paragraph with the word "END"\nJUDGE:',
+            max_tokens=256,
+            stop="END",
+        )
+        judges.append(comp)
+
+    s += "I will judge the quality based on the following metrics.\n"
+    for i in range(len(dimension_prompts)):
+        s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"
+
+    s += "In summary, on a scale of 1 to 10, I would give the article a score of"
+    s += generate(s, max_tokens=2, stop=None)
+
+    return s
+
+
+async def multi_dimension_judge_async(article, generate):
+    s = system_prompt
+    s += "\n```\n" + article + "\n```\n\n"
+
+    judges = []
+    for i in range(len(dimension_prompts)):
+        comp = await generate(
+            s
+            + "USER: Please judge the quality based on the following metric. "
+            + dimension_prompts[i]
+            + " Please provide a single-paragraph judgement. "
+            + "Focus on the provided metric and do not say other things. "
+            'End your judgement paragraph with the word "END"\nJUDGE:',
+            max_tokens=256,
+            stop="END",
+        )
+        judges.append(comp)
+
+    s += "I will judge the quality based on the following metrics.\n"
+    for i in range(len(dimension_prompts)):
+        s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"
+
+    s += "In summary, on a scale of 1 to 10, I would give the article a score of"
+    s += await generate(s, max_tokens=2, stop=None)
+
+    return s
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_questions]
+    states = [None] * len(lines)
+
+    # Select backend
+    call_generate = partial(get_call_generate(args), temperature=0)
+
+    # Run requests
+    tic = time.perf_counter()
+
+    if args.backend != "lmql":
+
+        def get_one_answer(i):
+            states[i] = multi_dimension_judge(lines[i], call_generate)
+
+        if args.parallel == 1:
+            for i in tqdm(range(len(lines))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(lines)))),
+                        total=len(lines),
+                    )
+                )
+
+    else:
+        import asyncio
+
+        async def get_one_answer_async(i):
+            states[i] = await multi_dimension_judge_async(lines[i], call_generate)
+
+        batches = []
+        for i in range(0, len(lines), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(lines)))))
+
+        loop = asyncio.get_event_loop()
+        for bt in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in bt])
+            )
+
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "llm_judge",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="articles.jsonl")
+    parser.add_argument("--num-questions", type=int, default=20)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/llm_judge/bench_sglang.py b/benchmark/llm_judge/bench_sglang.py
new file mode 100644
index 000000000..97e6c3979
--- /dev/null
+++ b/benchmark/llm_judge/bench_sglang.py
@@ -0,0 +1,97 @@
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
+
+dimension_prompts = [
+    "Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
+    "Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
+    "Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
+    "Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
+    "Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
+    "Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
+]
+
+
+@sgl.function
+def multi_dimension_judge(s, article):
+    s += system_prompt
+    s += "\n```\n" + article + "\n```\n\n"
+
+    forks = s.fork(len(dimension_prompts))
+    for i in range(len(dimension_prompts)):
+        forks[i] += (
+            "USER: Please judge the quality based on the following metric. "
+            + dimension_prompts[i]
+            + " Please provide a single-paragraph judgement. "
+            + "Focus on the provided metric and do not say other things. "
+            'End your judgement paragraph with the word "END"\nJUDGE:'
+        )
+        forks[i] += sgl.gen("judgement", max_tokens=256, stop="END")
+    forks.join()
+
+    s += "I will judge the quality based on the following metrics.\n"
+    for i in range(len(dimension_prompts)):
+        s += (
+            dimension_prompts[i].split(":")[0]
+            + ": "
+            + forks[i]["judgement"].strip()
+            + "\n"
+        )
+
+    s += "In summary, on a scale of 1 to 10, I would give the article a score of"
+    s += sgl.gen("score", max_tokens=2)
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_questions]
+    arguments = [{"article": l} for l in lines]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = multi_dimension_judge.run_batch(
+        arguments,
+        temperature=0,
+        backend=backend,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "llm_judge",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="articles.jsonl")
+    parser.add_argument("--num-questions", type=int, default=20)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/long_json_decode/README.md b/benchmark/long_json_decode/README.md
new file mode 100644
index 000000000..37fceee13
--- /dev/null
+++ b/benchmark/long_json_decode/README.md
@@ -0,0 +1,33 @@
+## Run benchmark
+
+### Benchmark sglang
+```
+python3 -m sglang.launch_server --model-path codellama/CodeLlama-7b-instruct-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 5 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model codellama/CodeLlama-7b-instruct-hf  --disable-log-requests --port 21000 --gpu 0.97
+```
+
+```
+python3 bench_other.py --backend vllm --num-questions 5
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --backend guidance --num-questions 5 --parallel 1 --n-ctx 11000 --model-path path/to/code-llama/gguf
+```
+
+
+### Build dataset
+```
+pip install wikipedia
+python3 build_dataset.py
+```
diff --git a/benchmark/long_json_decode/bench_other.py b/benchmark/long_json_decode/bench_other.py
new file mode 100644
index 000000000..0ad38a014
--- /dev/null
+++ b/benchmark/long_json_decode/bench_other.py
@@ -0,0 +1,89 @@
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+
+def json_decode(document, generate):
+    s = "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += "{\n"
+    s += '  "name": "'
+    s += generate(s, max_tokens=8, stop='"') + '",\n'
+    s += '  "country": "'
+    s += generate(s, max_tokens=8, stop='"') + '",\n'
+    s += '  "air port code": "'
+    s += generate(s, max_tokens=8, stop='"') + '",\n'
+    s += '  "top 3 landmarks": "'
+    s += generate(s, max_tokens=24, stop='"') + '",\n'
+    s += "}\n"
+    return s
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    arguments = []
+    for i in range(len(lines[: args.num_questions])):
+        arguments.append(
+            {
+                "document": lines[i]["document"],
+            }
+        )
+    states = [None] * len(arguments)
+
+    # Select backend
+    call_generate = partial(get_call_generate(args), temperature=0)
+
+    # Run requests
+    def get_one_answer(i):
+        states[i] = json_decode(generate=call_generate, **arguments[i])
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(arguments))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(arguments)))),
+                    total=len(arguments),
+                )
+            )
+
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "long_json_decode",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=100)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/long_json_decode/bench_sglang.py b/benchmark/long_json_decode/bench_sglang.py
new file mode 100644
index 000000000..8394cfc2e
--- /dev/null
+++ b/benchmark/long_json_decode/bench_sglang.py
@@ -0,0 +1,81 @@
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+
+@sgl.function
+def json_decode(s, document):
+    s += "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += "{\n"
+    s += '  "name": "' + sgl.gen("name", max_tokens=8, stop='"') + '",\n'
+    s += '  "country": "' + sgl.gen("country", max_tokens=8, stop='"') + '",\n'
+    s += (
+        '  "air port code": "'
+        + sgl.gen("air port code", max_tokens=8, stop='"')
+        + '",\n'
+    )
+    s += (
+        '  "top 3 landmarks": "'
+        + sgl.gen("landmarks", max_tokens=24, stop='"')
+        + '",\n'
+    )
+    s += "}\n"
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    arguments = []
+    for i in range(len(lines[: args.num_questions])):
+        arguments.append(
+            {
+                "document": lines[i]["document"],
+            }
+        )
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = json_decode.run_batch(
+        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
+    )
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "long_json_decode",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=10)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/long_json_decode/build_dataset.py b/benchmark/long_json_decode/build_dataset.py
new file mode 100644
index 000000000..7b15d3951
--- /dev/null
+++ b/benchmark/long_json_decode/build_dataset.py
@@ -0,0 +1,27 @@
+import json
+
+import transformers
+import wikipedia
+
+name = "meta-llama/Llama-2-7b-chat-hf"
+t = transformers.AutoTokenizer.from_pretrained(name)
+city_names = ["los angles", "london", "tokyo", "beijing", "singapore"]
+
+
+for city_name in city_names:
+    content = str(wikipedia.page(city_name).content)
+    content = content.replace("\n\n", "\n")
+
+    tokens = t.encode(content)
+
+    truncate_len = int((10000 / len(tokens)) * len(content))
+    truncate_content = content[:truncate_len]
+    truncate_tokens = t.encode(truncate_content)
+
+    # Count token
+    print(
+        f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
+    )
+
+    with open("questions.jsonl", "a") as fout:
+        fout.write(json.dumps({"document": truncate_content}) + "\n")
diff --git a/benchmark/lora/launch_server.py b/benchmark/lora/launch_server.py
new file mode 100644
index 000000000..b0781ca30
--- /dev/null
+++ b/benchmark/lora/launch_server.py
@@ -0,0 +1,75 @@
+import argparse
+import os
+
+NUM_LORAS = 4
+LORA_PATH = {
+    "base": "meta-llama/Llama-2-7b-hf",
+    "lora": "winddude/wizardLM-LlaMA-LoRA-7B",
+}
+
+
+def launch_server(args):
+    base_path = LORA_PATH["base"]
+    lora_path = LORA_PATH["lora"]
+
+    if args.base_only:
+        cmd = f"python3 -m sglang.launch_server --model {base_path} "
+    else:
+        cmd = f"python3 -m sglang.launch_server --model {base_path} --lora-paths "
+        for i in range(NUM_LORAS):
+            lora_name = f"lora{i}"
+            cmd += f"{lora_name}={lora_path} "
+    cmd += f"--disable-radix "
+    cmd += f"--max-loras-per-batch {args.max_loras_per_batch} "
+    cmd += f"--max-running-requests {args.max_running_requests} "
+    cmd += f"--lora-backend {args.lora_backend} "
+    cmd += f"--tp-size {args.tp_size} "
+    if args.disable_custom_all_reduce:
+        cmd += "--disable-custom-all-reduce"
+    if args.enable_mscclpp:
+        cmd += "--enable-mscclpp"
+    print(cmd)
+    os.system(cmd)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--base-only",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--max-loras-per-batch",
+        type=int,
+        default=8,
+    )
+    parser.add_argument(
+        "--max-running-requests",
+        type=int,
+        default=8,
+    )
+    parser.add_argument(
+        "--lora-backend",
+        type=str,
+        default="triton",
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size for distributed inference",
+    )
+    # disable_custom_all_reduce
+    parser.add_argument(
+        "--disable-custom-all-reduce",
+        action="store_true",
+        help="Disable custom all reduce when device does not support p2p communication",
+    )
+    parser.add_argument(
+        "--enable-mscclpp",
+        action="store_true",
+        help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
+    )
+    args = parser.parse_args()
+
+    launch_server(args)
diff --git a/benchmark/lora/lora_bench.py b/benchmark/lora/lora_bench.py
new file mode 100644
index 000000000..0a1e37a5c
--- /dev/null
+++ b/benchmark/lora/lora_bench.py
@@ -0,0 +1,476 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import argparse
+import asyncio
+import json
+import random
+import resource
+import sys
+import time
+import traceback
+from argparse import ArgumentParser
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+
+import aiohttp
+import numpy as np
+from launch_server import LORA_PATH, NUM_LORAS
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+from sglang.bench_serving import (
+    AIOHTTP_TIMEOUT,
+    RequestFuncInput,
+    RequestFuncOutput,
+    calculate_metrics,
+    get_request,
+    get_tokenizer,
+    remove_prefix,
+    sample_random_requests,
+)
+
+global args
+
+
+# set ignore_eos True by default
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    # assert api_url.endswith(
+    #     "completions"
+    # ), "OpenAI Completions API URL must end with 'completions'."
+
+    prompt = request_func_input.prompt
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        # payload = {
+        #     "model": request_func_input.model,
+        #     "prompt": prompt,
+        #     "temperature": 0.0,
+        #     "best_of": 1,
+        #     "max_tokens": request_func_input.output_len,
+        #     "stream": not args.disable_stream,
+        #     "ignore_eos": not args.disable_ignore_eos,
+        #     **request_func_input.extra_request_body,
+        # }
+        # headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+        if args.base_only:
+            payload = {
+                "text": prompt,
+                "sampling_params": {"max_new_tokens": request_func_input.output_len},
+            }
+        else:
+            payload = {
+                "text": prompt,
+                "sampling_params": {"max_new_tokens": request_func_input.output_len},
+                "lora_path": f"lora{random.randint(0, NUM_LORAS - 1)}",
+            }
+        headers = {"Authorization": ""}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["text"]:
+                                # if data["choices"][0]["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                # generated_text += data["choices"][0]["text"]
+                                generated_text += data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = request_func_input.output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+ASYNC_REQUEST_FUNCS = {
+    "sglang": async_request_openai_completions,
+}
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: List[Tuple[str, int, int]],
+    request_rate: float,
+    disable_tqdm: bool,
+    extra_request_body: Dict[str, Any],
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    print("Starting initial single prompt test run...")
+    test_request = input_requests[0]
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_request.prompt,
+        api_url=api_url,
+        prompt_len=test_request.prompt_len,
+        output_len=test_request.output_len,
+        lora_name="dummy",  # the lora_name argument will not be used
+        image_data=None,
+        extra_request_body=extra_request_body,
+    )
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}"
+        )
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    benchmark_start_time = time.perf_counter()
+    tasks: List[asyncio.Task] = []
+    async for request in get_request(input_requests, request_rate):
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=request.prompt,
+            api_url=api_url,
+            prompt_len=request.prompt_len,
+            output_len=request.output_len,
+            lora_name="dummy",
+            image_data=None,
+            extra_request_body=extra_request_body,
+        )
+        tasks.append(
+            asyncio.create_task(
+                request_func(request_func_input=request_func_input, pbar=pbar)
+            )
+        )
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if pbar is not None:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        backend=backend,
+    )
+
+    print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
+    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10}".format(
+            "Total generated tokens (retokenized):", metrics.total_output_retokenized
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", metrics.input_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format("Total throughput (tok/s):", metrics.total_throughput)
+    )
+    print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
+    print(
+        "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
+        )
+    )
+    print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+    print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print(
+        "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
+    )
+    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("=" * 50)
+
+    if (
+        metrics.median_ttft_ms is not None
+        and metrics.mean_itl_ms is not None
+        and metrics.output_throughput is not None
+    ):
+        result = {
+            "backend": args.backend,
+            "request_rate": request_rate,
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+            "median_ttft_ms": metrics.median_ttft_ms,
+            "median_itl_ms": metrics.median_itl_ms,
+            "output_throughput": metrics.output_throughput,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+        }
+    else:
+        print(f"Error running benchmark for request rate: {request_rate}")
+        print("-" * 30)
+
+    # Determine output file name
+    if args.output_file:
+        output_file_name = args.output_file
+    else:
+        now = datetime.now().strftime("%m%d")
+        output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
+
+    # Append results to a JSONL file
+    with open(output_file_name, "a") as file:
+        file.write(json.dumps(result) + "\n")
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "total_output_tokens_retokenized": metrics.total_output_retokenized,
+        "request_throughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+        "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+        "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+    }
+    return result
+
+
+def run_benchmark(args_: argparse.Namespace):
+    global args
+    args = args_
+
+    # Set global environments
+    set_ulimit()
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    # Set url
+    if args.port is None:
+        args.port = {
+            "sglang": 30000,
+        }.get(args.backend, 30000)
+
+    # api_url = (
+    #     f"{args.base_url}/v1/completions"
+    #     if args.base_url
+    #     else f"http://{args.host}:{args.port}/v1/completions"
+    # )
+    api_url = (
+        f"{args.base_url}/generate"
+        if args.base_url
+        else f"http://{args.host}:{args.port}/generate"
+    )
+
+    print(f"{args}\n")
+
+    # Read dataset
+    backend = args.backend
+    model_id = args.model = LORA_PATH["base"]
+    tokenizer_id = args.model
+
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    input_requests = sample_random_requests(
+        input_len=args.random_input_len,
+        output_len=args.random_output_len,
+        num_prompts=args.num_prompts,
+        range_ratio=args.random_range_ratio,
+        tokenizer=tokenizer,
+        dataset_path="",
+    )
+
+    return asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            disable_tqdm=False,
+            extra_request_body={},
+        )
+    )
+
+
+def set_ulimit(target_soft_limit=65535):
+    resource_type = resource.RLIMIT_NOFILE
+    current_soft, current_hard = resource.getrlimit(resource_type)
+
+    if current_soft < target_soft_limit:
+        try:
+            resource.setrlimit(resource_type, (target_soft_limit, current_hard))
+        except ValueError as e:
+            print(f"Fail to set RLIMIT_NOFILE: {e}")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online lora serving throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        default="sglang",
+        help="Must specify a backend, depending on the LLM Inference Engine.",
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--host", type=str, default="0.0.0.0", help="Default host is 0.0.0.0."
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=50,
+        help="Number of prompts to process. Default is 1000.",
+    )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=0.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random dataset.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
+    )
+    parser.add_argument(
+        "--base-only",
+        action="store_true",
+    )
+    parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+    args = parser.parse_args()
+    run_benchmark(args)
diff --git a/benchmark/mmlu/README.md b/benchmark/mmlu/README.md
new file mode 100644
index 000000000..16de20cda
--- /dev/null
+++ b/benchmark/mmlu/README.md
@@ -0,0 +1,59 @@
+## Download data
+```
+bash download_data.sh
+```
+
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --nsub 10
+```
+
+```
+# OpenAI models
+python3 bench_sglang.py --backend gpt-3.5-turbo --parallel 8
+```
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --nsub 10 --backend vllm
+```
+
+
+### Benchmark lightllm
+```
+# A10G
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
+
+# V100
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 4500 --port 22000
+```
+
+```
+python3 bench_other.py --nsub 10 --backend lightllm
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --nsub 10 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+
+### Benchmark lmql
+```
+CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
+```
+
+```
+python3 bench_other.py --nsub 10 --backend lmql --parallel 2
+```
diff --git a/benchmark/mmlu/bench_other.py b/benchmark/mmlu/bench_other.py
new file mode 100644
index 000000000..f1b166c2b
--- /dev/null
+++ b/benchmark/mmlu/bench_other.py
@@ -0,0 +1,173 @@
+import argparse
+import asyncio
+import json
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import numpy as np
+import pandas as pd
+import tiktoken
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+
+choices = ["A", "B", "C", "D"]
+
+tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
+
+
+def format_subject(subject):
+    l = subject.split("_")
+    s = ""
+    for entry in l:
+        s += " " + entry
+    return s
+
+
+def format_example(df, idx, include_answer=True):
+    prompt = df.iloc[idx, 0]
+    k = df.shape[1] - 2
+    for j in range(k):
+        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+    prompt += "\nAnswer:"
+    if include_answer:
+        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+    return prompt
+
+
+def gen_prompt(train_df, subject, k=-1):
+    prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
+        format_subject(subject)
+    )
+    if k == -1:
+        k = train_df.shape[0]
+    for i in range(k):
+        prompt += format_example(train_df, i)
+    return prompt
+
+
+def evaluate(args, subject, dev_df, test_df, call_generate):
+    prompts = []
+    labels = []
+
+    # Construct prompts
+    k = args.ntrain
+    train_prompt = gen_prompt(dev_df, subject, k)
+    while len(tokenizer.encode(train_prompt)) > 1536:
+        k -= 1
+        train_prompt = gen_prompt(dev_df, subject, k)
+
+    for i in range(test_df.shape[0]):
+        prompt_end = format_example(test_df, i, include_answer=False)
+        prompt = train_prompt + prompt_end
+        prompts.append(prompt)
+
+        label = test_df.iloc[i, test_df.shape[1] - 1]
+        labels.append(label)
+
+    preds = [None] * len(prompts)
+    max_tokens = 1
+
+    # Run requests
+    if args.backend != "lmql":
+        # Use thread pool
+        def get_one_answer(i):
+            pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
+            preds[i] = pred.strip()[0]
+
+        tic = time.perf_counter()
+        if args.parallel == 1:
+            for i in range(len(prompts)):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                executor.map(get_one_answer, list(range(len(prompts))))
+    else:
+        # Use asyncio
+        async def batched_call(batch_size):
+            for i in range(0, len(prompts), batch_size):
+                tasks = []
+                for p in prompts[i : i + batch_size]:
+                    tasks.append(call_generate(p, temperature=0, max_tokens=max_tokens))
+                rets = await asyncio.gather(*tasks)
+                for j in range(len(rets)):
+                    preds[i + j] = rets[j].strip()[0]
+
+        tic = time.perf_counter()
+        asyncio.run(batched_call(batch_size=args.parallel))
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    cors = [pred == label for pred, label in zip(preds, labels)]
+    acc = np.mean(cors)
+    cors = np.array(cors)
+
+    print(
+        "Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
+            acc, latency, len(prompts), subject
+        )
+    )
+
+    return cors, acc, latency
+
+
+def main(args):
+    subjects = sorted(
+        [
+            f.split("_test.csv")[0]
+            for f in os.listdir(os.path.join(args.data_dir, "test"))
+            if "_test.csv" in f
+        ]
+    )
+
+    all_cors = []
+    all_latencies = []
+    num_requests = 0
+
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    for subject in tqdm(subjects[: args.nsub]):
+        dev_df = pd.read_csv(
+            os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
+        )[: args.ntrain]
+        test_df = pd.read_csv(
+            os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
+        )
+
+        cors, acc, latency = evaluate(args, subject, dev_df, test_df, call_generate)
+        all_cors.append(cors)
+        all_latencies.append(latency)
+        num_requests += len(test_df)
+
+    total_latency = np.sum(all_latencies)
+    print("Total latency: {:.3f}".format(total_latency))
+
+    weighted_acc = np.mean(np.concatenate(all_cors))
+    print("Average accuracy: {:.3f}".format(weighted_acc))
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "mmlu",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(total_latency, 3),
+            "accuracy": round(weighted_acc, 3),
+            "num_requests": num_requests,
+            "other": {
+                "nsub": args.nsub,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ntrain", type=int, default=5)
+    parser.add_argument("--data_dir", type=str, default="data")
+    parser.add_argument("--nsub", type=int, default=60)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/mmlu/bench_sglang.py b/benchmark/mmlu/bench_sglang.py
new file mode 100644
index 000000000..23057be4a
--- /dev/null
+++ b/benchmark/mmlu/bench_sglang.py
@@ -0,0 +1,181 @@
+import argparse
+import json
+import os
+import time
+
+import numpy as np
+import pandas as pd
+import tiktoken
+
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    dump_bench_raw_result,
+    select_sglang_backend,
+)
+
+choices = ["A", "B", "C", "D"]
+
+tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
+
+
+def format_subject(subject):
+    l = subject.split("_")
+    s = ""
+    for entry in l:
+        s += " " + entry
+    return s
+
+
+def format_example(df, idx, include_answer=True):
+    prompt = df.iloc[idx, 0]
+    k = df.shape[1] - 2
+    for j in range(k):
+        prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+    prompt += "\nAnswer:"
+    if include_answer:
+        prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+    return prompt
+
+
+def gen_prompt(train_df, subject, k=-1):
+    prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
+        format_subject(subject)
+    )
+    if k == -1:
+        k = train_df.shape[0]
+    for i in range(k):
+        prompt += format_example(train_df, i)
+    return prompt
+
+
+def main(args):
+    subjects = sorted(
+        [
+            f.split("_test.csv")[0]
+            for f in os.listdir(os.path.join(args.data_dir, "test"))
+            if "_test.csv" in f
+        ]
+    )
+
+    # Build prompts
+    arguments = []
+    labels = []
+    num_questions = []
+
+    for subject in subjects[: args.nsub]:
+        dev_df = pd.read_csv(
+            os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
+        )[: args.ntrain]
+        test_df = pd.read_csv(
+            os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
+        )
+        num_questions.append(test_df.shape[0])
+
+        k = args.ntrain
+        few_shot_examples = gen_prompt(dev_df, subject, k)
+        while len(tokenizer.encode(few_shot_examples)) > 1536:
+            k -= 1
+            few_shot_examples = gen_prompt(dev_df, subject, k)
+
+        for i in range(test_df.shape[0]):
+            prompt_end = format_example(test_df, i, include_answer=False)
+
+            arguments.append(
+                {
+                    "examples": few_shot_examples,
+                    "question": prompt_end,
+                }
+            )
+
+            label = test_df.iloc[i, test_df.shape[1] - 1]
+            labels.append(label)
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    if args.backend.startswith("gpt-"):
+
+        @sgl.function
+        def few_shot_mmlu(s, examples, question):
+            s += sgl.user(examples + question)
+            s += sgl.assistant(sgl.gen("answer"))
+
+    else:
+
+        @sgl.function
+        def few_shot_mmlu(s, examples, question):
+            s += examples + question + sgl.gen("answer")
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Select backend
+    backend = select_sglang_backend(args)
+
+    # Run
+    tic = time.perf_counter()
+    states = few_shot_mmlu.run_batch(
+        arguments,
+        temperature=0,
+        max_new_tokens=1,
+        backend=backend,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    preds = [
+        s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
+    ]
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    cors = [pred == label for pred, label in zip(preds, labels)]
+
+    pt = 0
+    for subject, num_qs in zip(subjects[: args.nsub], num_questions):
+        print(
+            f"subject: {subject}, #q:{num_qs}, acc: {np.mean(cors[pt: pt + num_qs]):.3f}"
+        )
+        pt += num_qs
+    assert pt == len(cors)
+    weighted_acc = np.mean(cors)
+
+    dump_bench_raw_result(
+        path=args.raw_result_file,
+        states=states,
+        preds=preds,
+        labels=labels,
+    )
+
+    # Print results
+    print("Total latency: {:.3f}".format(latency))
+    print("Average accuracy: {:.3f}".format(weighted_acc))
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "mmlu",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(weighted_acc, 3),
+            "num_requests": len(arguments),
+            "other": {
+                "nsub": args.nsub,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ntrain", "-k", type=int, default=5)
+    parser.add_argument("--data_dir", "-d", type=str, default="data")
+    parser.add_argument("--save_dir", "-s", type=str, default="results")
+    parser.add_argument("--nsub", type=int, default=60)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/mmlu/download_data.sh b/benchmark/mmlu/download_data.sh
new file mode 100644
index 000000000..fb224165e
--- /dev/null
+++ b/benchmark/mmlu/download_data.sh
@@ -0,0 +1,2 @@
+wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
+tar xf data.tar
diff --git a/benchmark/mmmu/README.md b/benchmark/mmmu/README.md
new file mode 100644
index 000000000..80db21921
--- /dev/null
+++ b/benchmark/mmmu/README.md
@@ -0,0 +1,46 @@
+## Run evaluation
+
+### Evaluate sglang
+
+Host the VLM:
+
+```
+python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --port 30000
+```
+
+It's recommended to reduce the memory usage by appending something like `--mem-fraction-static 0.6` to the command above.
+
+Benchmark:
+
+```
+python benchmark/mmmu/bench_sglang.py --port 30000 --concurrency 16
+```
+
+You can adjust the `--concurrency` to control the number of concurrent OpenAI calls.
+
+You can use `--lora-path` to specify the LoRA adapter to apply during benchmarking. E.g.,
+```
+# Launch server with LoRA enabled
+python -m sglang.launch_server --model-path microsoft/Phi-4-multimodal-instruct --port 30000 --trust-remote-code --disable-radix-cache --lora-paths vision=<LoRA path>
+
+# Apply LoRA adapter during inferencing
+python -m benchmark/mmmu/bench_sglang.py --concurrency 8 --lora-path vision
+```
+
+You can use `--response-answer-regex` to specify how to extract the answer from the response string. E.g.,
+```
+python3 -m sglang.launch_server --model-path zai-org/GLM-4.1V-9B-Thinking --reasoning-parser glm45
+
+python3 bench_sglang.py --response-answer-regex "<\|begin_of_box\|>(.*)<\|end_of_box\|>" --concurrency 64
+```
+
+You can use `--extra-request-body` to specify additional OpenAI request parameters. E.g.,
+```
+python3 bench_sglang.py --extra-request-body '{"max_new_tokens": 128, "temperature": 0.01}'
+```
+
+### Evaluate hf
+
+```
+python benchmark/mmmu/bench_hf.py --model-path Qwen/Qwen2-VL-7B-Instruct
+```
diff --git a/benchmark/mmmu/bench_hf.py b/benchmark/mmmu/bench_hf.py
new file mode 100644
index 000000000..949b63b80
--- /dev/null
+++ b/benchmark/mmmu/bench_hf.py
@@ -0,0 +1,164 @@
+import argparse
+
+import PIL
+import torch
+from data_utils import save_json
+from eval_utils import (
+    EvalArgs,
+    eval_result,
+    get_sampling_params,
+    prepare_samples,
+    process_result,
+)
+from tqdm import tqdm
+from transformers import AutoModel, AutoProcessor, GenerationConfig
+
+
+@torch.no_grad()
+def eval_mmmu(args):
+    eval_args = EvalArgs.from_cli_args(args)
+
+    sampling_params = get_sampling_params(eval_args)
+    generation_config = GenerationConfig(
+        max_new_tokens=sampling_params["max_new_tokens"],
+        do_sample=False,
+    )
+
+    try:
+        from transformers import AutoModelForImageTextToText
+
+        model = AutoModelForImageTextToText.from_pretrained(
+            args.model_path,
+            torch_dtype="auto",
+            trust_remote_code=True,
+        )
+    except Exception as first_exception:
+        try:
+            # check if the model is belongs to internvl
+            if "InternVL" in args.model_path:
+                from internvl_utils import load_image
+                from transformers import AutoTokenizer
+
+                tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+                model = AutoModel.from_pretrained(
+                    args.model_path,
+                    torch_dtype="auto",
+                    trust_remote_code=True,
+                )
+                generation_config_internvl = dict(
+                    max_new_tokens=sampling_params["max_new_tokens"], do_sample=False
+                )
+
+            else:
+                model = AutoModel.from_pretrained(
+                    args.model_path,
+                    torch_dtype="auto",
+                    trust_remote_code=True,
+                    init_tts=False,
+                )
+        except Exception as second_exception:
+            raise RuntimeError(
+                f"Failed to load model: First attempt failed with {first_exception}, "
+                f"second attempt failed with {second_exception}"
+            ) from second_exception
+
+    model = model.eval().cuda()
+
+    processor = AutoProcessor.from_pretrained(
+        args.model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
+    )
+
+    samples = prepare_samples(eval_args)
+    out_samples = dict()
+
+    answer_dict = {}
+    for sample in tqdm(samples):
+        prompt = sample["final_input_prompt"]
+        image = sample["image"]
+        prefix = prompt.split("<")[0]
+        suffix = prompt.split(">")[1]
+        assert image is not None
+
+        if "InternVL" in args.model_path:
+            pixel_values = load_image(sample["image_path"]).to(torch.bfloat16).cuda()
+            contents = ""
+            if prefix:
+                contents += prefix
+            contents += "<image>\n"
+            if suffix:
+                contents += suffix
+            response = model.chat(
+                tokenizer, pixel_values, contents, generation_config_internvl
+            )
+            print(f"response: {response}")
+            process_result(response, sample, answer_dict, out_samples)
+            continue
+
+        contents = []
+        if prefix:
+            contents += [{"type": "text", "text": prefix}]
+        contents += [
+            {
+                "type": "image",
+                "image": sample["image_path"],
+            }
+        ]
+        if suffix:
+            contents += [{"type": "text", "text": suffix}]
+        messages = [{"role": "user", "content": contents}]
+        try:
+            model_inputs = processor.tokenizer.apply_chat_template(
+                messages,
+                tokenize=True,
+                return_dict=True,
+                add_generation_prompt=True,
+                return_tensors="pt",
+            ).to(model.device)
+            input_len = model_inputs["input_ids"].shape[-1]
+            generation = model.generate(
+                **model_inputs, generation_config=generation_config
+            )
+            generation = generation[0][input_len:]
+            response = processor.decode(generation, skip_special_tokens=True)
+        except:
+            contents = []
+            if prefix:
+                contents += [prefix]
+            image = PIL.Image.open(sample["image_path"])
+            contents += [image]
+            if suffix:
+                contents += [suffix]
+            messages = [{"role": "user", "content": contents}]
+            response = model.chat(
+                msgs=messages,
+                tokenizer=processor.tokenizer,
+                sampling=False,
+                max_new_tokens=sampling_params["max_new_tokens"],
+                use_tts_template=False,
+                generate_audio=False,
+                temperature=0.0,
+            )
+        print(f"response: {response}")
+        process_result(response, sample, answer_dict, out_samples)
+
+    args.output_path = f"{args.model_path}_answer_hf.json"
+    save_json(args.output_path, out_samples)
+    eval_result(
+        model_answer_path=args.output_path,
+        answer_dict=answer_dict,
+        eval_output_path=f"{args.model_path}_val_hf.json",
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
+        required=True,
+    )
+    EvalArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    eval_mmmu(args)
diff --git a/benchmark/mmmu/bench_sglang.py b/benchmark/mmmu/bench_sglang.py
new file mode 100644
index 000000000..d8834ea5f
--- /dev/null
+++ b/benchmark/mmmu/bench_sglang.py
@@ -0,0 +1,212 @@
+"""
+Bench the sglang-hosted vLM with benchmark MMMU
+
+Usage:
+    Host the VLM: python -m sglang.launch_server --model-path Qwen/Qwen2-VL-7B-Instruct --port 30000
+
+    Benchmark: python benchmark/mmmu/bench_sglang.py --port 30000 --concurrency 16
+
+The eval output will be logged
+"""
+
+import argparse
+import asyncio
+import re
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Tuple
+
+import aiohttp
+import openai
+from data_utils import save_json
+from eval_utils import (
+    EvalArgs,
+    eval_result,
+    get_sampling_params,
+    prepare_samples,
+    process_result,
+)
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_sglang_args_and_parse
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+
+@dataclass
+class RequestFuncOutput:
+    generated_text: List[str] = field(default_factory=list)
+    prompt_len: List[int] = field(default_factory=list)
+    output_len: List[int] = field(default_factory=list)
+    latency: List[float] = field(default_factory=list)
+    ttft: List[float] = field(default_factory=list)
+    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+
+    success: bool = False
+    error: str = ""
+
+
+async def async_request_profile(api_url: str) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        output = RequestFuncOutput()
+        try:
+            async with session.post(url=api_url) as response:
+                if response.status == 200:
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    return output
+
+
+def _get_prefix_suffix(prompt: str) -> Tuple[str, str]:
+    """Split the prompt into prefix and suffix."""
+    prefix = prompt.split("<")[0]
+    suffix = prompt.split(">", 1)[1]
+    return prefix, suffix
+
+
+async def process_sample(
+    client: Any, sample: dict, sampling_params: dict, lora_path: Optional[str] = None
+) -> Tuple[dict, str]:
+    """Send a single sample to the LLM and return (sample, response)."""
+    prompt = sample["final_input_prompt"]
+    prefix, suffix = _get_prefix_suffix(prompt)
+    image = sample["image"]
+    assert image is not None
+    image_path = sample["image_path"]
+    extra_body = None if lora_path is None else {"lora_path": lora_path}
+    response = await client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prefix},
+                    {"type": "image_url", "image_url": {"url": image_path}},
+                    {"type": "text", "text": suffix},
+                ],
+            }
+        ],
+        temperature=0,
+        max_completion_tokens=sampling_params["max_new_tokens"],
+        max_tokens=sampling_params["max_new_tokens"],
+        extra_body=extra_body,
+    )
+    return sample, response.choices[0].message.content
+
+
+async def process_sample_with_semaphore(
+    semaphore: asyncio.Semaphore,
+    client: Any,
+    sample: dict,
+    sampling_params: dict,
+    lora_path: Optional[str] = None,
+) -> Tuple[dict, str]:
+    """Wrap process_sample with a semaphore for concurrency control."""
+    async with semaphore:
+        return await process_sample(client, sample, sampling_params, lora_path)
+
+
+async def eval_mmmu(args) -> None:
+    """Main evaluation loop with concurrency control."""
+    eval_args = EvalArgs.from_cli_args(args)
+    sampling_params = get_sampling_params(eval_args)
+    samples = prepare_samples(eval_args)
+    lora_path = eval_args.lora_path
+    answer_dict = {}
+    out_samples = {}
+    client = openai.AsyncOpenAI(
+        api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
+    )
+    start = time.perf_counter()
+    base_url = f"http://127.0.0.1:{args.port}"
+
+    if args.profile:
+        print("Starting profiler...")
+        profile_output = await async_request_profile(
+            api_url=f"{base_url}/start_profile"
+        )
+        if profile_output.success:
+            print("Profiler started")
+
+        samples = samples[: args.profile_number]
+
+    if args.concurrency == 1:
+        # For concurrency == 1, run in sequential mode to ensure consistent order
+        # this is mainly for profiling
+        for sample in tqdm(samples):
+            _, response = await process_sample(
+                client, sample, sampling_params, lora_path
+            )
+            answer = (
+                re.search(args.response_answer_regex, response)
+                if response is not None
+                else None
+            )
+            process_result(
+                answer.group(1) if answer else response,
+                sample,
+                answer_dict,
+                out_samples,
+            )
+    else:
+        semaphore = asyncio.Semaphore(args.concurrency)
+        tasks = [
+            process_sample_with_semaphore(
+                semaphore, client, sample, sampling_params, lora_path
+            )
+            for sample in samples
+        ]
+
+        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
+            sample, response = await coro
+            answer = (
+                re.search(args.response_answer_regex, response)
+                if response is not None
+                else None
+            )
+            process_result(
+                answer.group(1) if answer else response,
+                sample,
+                answer_dict,
+                out_samples,
+            )
+
+    if args.profile:
+        print("Stopping profiler...")
+        profile_output = await async_request_profile(api_url=f"{base_url}/stop_profile")
+        if profile_output.success:
+            print("Profiler stopped")
+
+    print(f"Benchmark time: {time.perf_counter() - start}")
+    args.output_path = "./answer_sglang.json"
+    save_json(args.output_path, out_samples)
+    eval_result(
+        model_answer_path=args.output_path,
+        answer_dict=answer_dict,
+        eval_output_path="./val_sglang.json",
+    )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    EvalArgs.add_cli_args(parser)
+    args = add_common_sglang_args_and_parse(parser)
+    return args
+
+
+def main():
+    args = parse_args()
+    asyncio.run(eval_mmmu(args))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/mmmu/data_utils.py b/benchmark/mmmu/data_utils.py
new file mode 100644
index 000000000..cf8916934
--- /dev/null
+++ b/benchmark/mmmu/data_utils.py
@@ -0,0 +1,221 @@
+"""Utils for data load, save, and process (e.g., prompt construction)"""
+
+import json
+import os
+import re
+
+import yaml
+
+DOMAIN_CAT2SUB_CAT = {
+    "Art and Design": ["Art", "Art_Theory", "Design", "Music"],
+    "Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"],
+    "Science": [
+        "Biology",
+        "Chemistry",
+        "Geography",
+        "Math",
+        "Physics",
+    ],
+    "Health and Medicine": [
+        "Basic_Medical_Science",
+        "Clinical_Medicine",
+        "Diagnostics_and_Laboratory_Medicine",
+        "Pharmacy",
+        "Public_Health",
+    ],
+    "Humanities and Social Science": [
+        "History",
+        "Literature",
+        "Sociology",
+        "Psychology",
+    ],
+    "Tech and Engineering": [
+        "Agriculture",
+        "Architecture_and_Engineering",
+        "Computer_Science",
+        "Electronics",
+        "Energy_and_Power",
+        "Materials",
+        "Mechanical_Engineering",
+    ],
+}
+
+
+CAT_SHORT2LONG = {
+    "acc": "Accounting",
+    "agri": "Agriculture",
+    "arch": "Architecture_and_Engineering",
+    "art": "Art",
+    "art_theory": "Art_Theory",
+    "bas_med": "Basic_Medical_Science",
+    "bio": "Biology",
+    "chem": "Chemistry",
+    "cli_med": "Clinical_Medicine",
+    "cs": "Computer_Science",
+    "design": "Design",
+    "diag_med": "Diagnostics_and_Laboratory_Medicine",
+    "econ": "Economics",
+    "elec": "Electronics",
+    "ep": "Energy_and_Power",
+    "fin": "Finance",
+    "geo": "Geography",
+    "his": "History",
+    "liter": "Literature",
+    "manage": "Manage",
+    "mark": "Marketing",
+    "mate": "Materials",
+    "math": "Math",
+    "mech": "Mechanical_Engineering",
+    "music": "Music",
+    "phar": "Pharmacy",
+    "phys": "Physics",
+    "psy": "Psychology",
+    "pub_health": "Public_Health",
+    "socio": "Sociology",
+}
+
+
+# DATA SAVING
+def save_json(filename, ds):
+    with open(filename, "w") as f:
+        json.dump(ds, f, indent=4)
+
+
+def get_multi_choice_info(options):
+    """
+    Given the list of options for multiple choice question
+    Return the index2ans and all_choices
+    """
+
+    start_chr = "A"
+    all_choices = []
+    index2ans = {}
+    for i, option in enumerate(options):
+        index2ans[chr(ord(start_chr) + i)] = option
+        all_choices.append(chr(ord(start_chr) + i))
+
+    return index2ans, all_choices
+
+
+def load_yaml(file_path):
+    with open(file_path, "r") as stream:
+        try:
+            yaml_dict = yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)
+
+    return yaml_dict
+
+
+def parse_img_path(text):
+    matches = re.findall("<img='(.*?)'>", text)
+    return matches
+
+
+def process_single_sample(data):
+    question = data["question"]
+    o_imgs_paths = []
+    for option in data["options"]:
+        current_o_imgs_paths = parse_img_path(option)
+        for img_path in current_o_imgs_paths:
+            o_imgs_paths.append(img_path)
+
+    if len(o_imgs_paths) > 1:  # multiple images in options, used for random selection
+        return {
+            "id": data["id"],
+            "question": question,
+            "options": data["options"],
+            "answer": data["answer"],
+            "image": None,
+            "question_type": data["question_type"],
+        }
+    else:
+        return {
+            "id": data["id"],
+            "question": question,
+            "options": data["options"],
+            "answer": data["answer"],
+            "image": data["image_1"],
+            "question_type": data["question_type"],
+        }
+
+
+# DATA SAVING
+def save_json(filename, ds):
+    print(f"answers saved to: {filename}")
+    os.makedirs(os.path.dirname(filename), exist_ok=True)
+    with open(filename, "w") as f:
+        json.dump(ds, f, indent=4)
+
+
+def save_jsonl(filename, data):
+    """
+    Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.
+
+    Args:
+        filename (str): The path to the file where the data should be saved.
+        data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
+    """
+    with open(filename, "w", encoding="utf-8") as f:
+        for img_path, caption in data.items():
+            # Extract the base filename without the extension
+            base_filename = os.path.basename(img_path)
+            # Create a JSON object with the filename as the key and caption as the value
+            json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
+            # Write the JSON object to the file, one per line
+            f.write(json_record + "\n")
+
+
+def save_args(args, path_dir):
+    argsDict = args.__dict__
+    with open(path_dir + "setting.txt", "w") as f:
+        f.writelines("------------------ start ------------------" + "\n")
+        for eachArg, value in argsDict.items():
+            f.writelines(eachArg + " : " + str(value) + "\n")
+        f.writelines("------------------- end -------------------")
+
+
+# DATA PROCESSING
+def construct_prompt(sample, config):
+    question = sample["question"]
+    options = eval(sample["options"])
+    example = ""
+    if sample["question_type"] == "multiple-choice":
+        start_chr = "A"
+        prediction_range = []
+        index2ans = {}
+        for option in options:
+            prediction_range.append(start_chr)
+            example += f"({start_chr}) {option}\n"
+            index2ans[start_chr] = option
+            start_chr = chr(ord(start_chr) + 1)
+        empty_prompt_sample_structure = config["multi_choice_example_format"]
+        empty_prompt = empty_prompt_sample_structure.format(question, example)
+        res_dict = {}
+        res_dict["index2ans"] = index2ans
+        res_dict["correct_choice"] = sample["answer"]
+        res_dict["all_choices"] = prediction_range
+        res_dict["empty_prompt"] = empty_prompt
+        if config["task_instructions"]:
+            res_dict["final_input_prompt"] = (
+                config["task_instructions"].strip() + "\n\n" + empty_prompt
+            )
+        else:
+            res_dict["final_input_prompt"] = empty_prompt
+
+        res_dict["gt_content"] = options[ord(sample["answer"].upper()) - ord("A")]
+    else:
+        empty_prompt_sample_structure = config["short_ans_example_format"]
+        empty_prompt = empty_prompt_sample_structure.format(question)
+        res_dict = {}
+        res_dict["empty_prompt"] = empty_prompt
+        if config["task_instructions"]:
+            res_dict["final_input_prompt"] = (
+                config["task_instructions"].strip() + "\n\n" + empty_prompt
+            )
+        else:
+            res_dict["final_input_prompt"] = empty_prompt
+        res_dict["gt_content"] = sample["answer"]
+
+    res_dict.update(sample)
+    return res_dict
diff --git a/benchmark/mmmu/eval_utils.py b/benchmark/mmmu/eval_utils.py
new file mode 100644
index 000000000..ca0e87c6a
--- /dev/null
+++ b/benchmark/mmmu/eval_utils.py
@@ -0,0 +1,649 @@
+"""Response Parsing and Evaluation for various models"""
+
+import argparse
+import dataclasses
+import json
+import os
+import pprint
+import random
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, Optional
+
+import numpy as np
+import torch
+from data_utils import (
+    CAT_SHORT2LONG,
+    DOMAIN_CAT2SUB_CAT,
+    construct_prompt,
+    load_yaml,
+    process_single_sample,
+)
+from datasets import concatenate_datasets, load_dataset
+from tqdm import tqdm
+
+
+@dataclasses.dataclass
+class EvalArgs:
+    seed: int = 42
+    split: str = "validation"
+    image_pixels_limit: int = -1
+    result_filename: str = ""
+    prompt_format_file: str = "prompt_format.yaml"
+    dataset_path: str = "MMMU/MMMU"
+    extra_request_body: Optional[str] = None
+    profile: bool = False
+    profile_number: int = 5
+    concurrency: int = 1
+    response_answer_regex: str = "(.*)"
+    lora_path: Optional[str] = None
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument(
+            "--result-filename",
+            type=str,
+            default=EvalArgs.result_filename,
+            help="The filename to save the evaluation results.",
+        )
+        parser.add_argument(
+            "--image-pixels-limit",
+            type=int,
+            default=EvalArgs.image_pixels_limit,
+            help="The maximum number of pixels allowed for an image. If an image exceeds this limit, it will be skipped during evaluation.",
+        )
+        parser.add_argument(
+            "--dataset-path",
+            type=str,
+            default=EvalArgs.dataset_path,
+            help="path to the dataset",
+        )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--prompt-format-file",
+            type=str,
+            help="The path to the prompt format of mmmu. If not, a default format llava_config.yaml will be used",
+        )
+        parser.add_argument(
+            "--split",
+            type=str,
+            default=EvalArgs.split,
+            help='Split of the dataset to use for evaluation. Default is "validation".',
+        )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            default=EvalArgs.extra_request_body,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
+        parser.add_argument(
+            "--profile", action="store_true", help="enable mmmu profile"
+        )
+        parser.add_argument(
+            "--profile-number",
+            type=int,
+            default=EvalArgs.profile_number,
+            help="Number of samples to profile. If not set, will profile all samples.",
+        )
+        parser.add_argument(
+            "--concurrency",
+            type=int,
+            default=EvalArgs.concurrency,
+            help="Number of concurrent requests to make during evaluation. Default is 1, which means no concurrency.",
+        )
+        parser.add_argument(
+            "--response-answer-regex",
+            type=str,
+            default=EvalArgs.response_answer_regex,
+            help="Specific regex to capture the answer from the response, string",
+        )
+        parser.add_argument(
+            "--lora-path",
+            type=str,
+            default=EvalArgs.lora_path,
+            help="Specify the LoRA path to use for evaluation. If specified, the value will be specified in the body of every request as `lora-path`.",
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def set_seed(seed_value):
+    """
+    Set the seed for PyTorch (both CPU and CUDA), Python, and NumPy for reproducible results.
+
+    :param seed_value: An integer value to be used as the seed.
+    """
+    torch.manual_seed(seed_value)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed_value)
+        torch.cuda.manual_seed_all(seed_value)  # For multi-GPU setups
+    random.seed(seed_value)
+    np.random.seed(seed_value)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+
+def prepare_samples(eval_args: EvalArgs):
+    print("Preparing samples...")
+    # Build prompts
+    set_seed(eval_args.seed)
+
+    prompt_format_file = (
+        eval_args.prompt_format_file
+        if eval_args.prompt_format_file is not None
+        else os.path.join(os.path.dirname(__file__), "prompt_format.yaml")
+    )
+    # load config and process to one value
+    eval_args.config = load_yaml(prompt_format_file)
+    for key, value in eval_args.config.items():
+        if key != "eval_params" and type(value) == list:
+            assert len(value) == 1, "key {} has more than one value".format(key)
+            eval_args.config[key] = value[0]
+
+    # run for each subject in parallel
+    sub_dataset_list = []
+    subjects = list(CAT_SHORT2LONG.values())  # Get a fixed list of subjects
+
+    print(f"Loading datasets for {len(subjects)} subjects...")
+    with ThreadPoolExecutor() as executor:
+        # Submit all load_dataset tasks
+        future_to_subject = {
+            executor.submit(
+                load_dataset, eval_args.dataset_path, subject, split=eval_args.split
+            ): subject
+            for subject in subjects
+        }
+
+        # Collect results as they complete
+        results = {}
+        for future in tqdm(
+            as_completed(future_to_subject),
+            total=len(subjects),
+            desc="Loading datasets",
+        ):
+            subject = future_to_subject[future]
+            try:
+                results[subject] = future.result()
+            except Exception as exc:
+                print(f"{subject} generated an exception: {exc}")
+
+    # Ensure datasets are added in the original order for consistency
+    for subject in subjects:
+        if subject in results:
+            sub_dataset_list.append(results[subject])
+        else:
+            # Handle cases where a dataset failed to load (optional, depends on desired behavior)
+            print(f"Warning: Dataset for subject '{subject}' could not be loaded.")
+
+    # merge all dataset
+    dataset = concatenate_datasets(sub_dataset_list)
+
+    # Prepare images in parallel
+    images_path = os.path.expanduser("~/.cache/mmmu/images")
+    os.makedirs(images_path, exist_ok=True)
+    print(f"Saving images to: {images_path}")
+
+    samples = []
+    skip_count = 0
+
+    def process_sample(i, sample):
+        sample = process_single_sample(sample)
+        sample = construct_prompt(sample, eval_args.config)
+        image = sample["image"]
+        width, height = image.size
+        if 0 < eval_args.image_pixels_limit <= width * height:
+            return None, True
+        # Use a unique identifier for the image path to avoid potential collisions if indices reset
+        image_path = f"{images_path}/image_{sample['id']}.png"
+        if not os.path.exists(image_path):
+            image.save(image_path)
+        sample["image_path"] = image_path
+        return sample, False
+
+    print("Processing samples...")
+    with ThreadPoolExecutor() as executor:
+        # Pass the sample itself to process_sample, index is less reliable now
+        futures = [
+            executor.submit(
+                process_sample, i, sample
+            )  # Keep index i for tqdm maybe? Or remove it. Let's keep it for now.
+            for i, sample in enumerate(dataset)
+        ]
+        for future in tqdm(
+            as_completed(futures), total=len(dataset), desc="Processing samples"
+        ):
+            sample, skipped = future.result()
+            if skipped:
+                skip_count += 1
+            elif sample:
+                samples.append(sample)
+
+    samples.sort(key=lambda x: x["final_input_prompt"])
+
+    print(
+        f"Skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
+    )
+    print("Samples have been prepared")
+    return samples
+
+
+def get_sampling_params(eval_args):
+    max_new_tokens = 30
+    temperature = 0.001
+
+    extra_request_body = {}
+    if eval_args.extra_request_body:
+        extra_request_body = json.loads(eval_args.extra_request_body)
+
+    return {
+        "temperature": temperature,
+        "max_new_tokens": max_new_tokens,
+        **extra_request_body,
+    }
+
+
+# ----------- Process Multi-choice -------------
+def parse_multi_choice_response(response, all_choices, index2ans):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted index e.g., A, B, C, D.
+    """
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " "  # add space to avoid partial match
+
+    index_ans = True
+    ans_with_brack = False
+    candidates = []
+    for choice in all_choices:  # e.g., (A) (B) (C) (D)
+        if f"({choice})" in response:
+            candidates.append(choice)
+            ans_with_brack = True
+
+    if len(candidates) == 0:
+        for choice in all_choices:  # e.g., A B C D
+            if f" {choice} " in response:
+                candidates.append(choice)
+
+    # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False  # it's content ans.
+
+    if len(candidates) == 0:  # still not get answer, randomly choose one.
+        pred_index = random.choice(all_choices)
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)  # -1 will be ignored anyway
+                # start_indexes = [generated_response.index(f'({can})') for can in candidates]
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        # get the last one
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:  # if only one candidate, use it.
+        pred_index = candidates[0]
+
+    return pred_index
+
+
+# ----------- Process Open -------------
+def check_is_number(string):
+    """
+    Check if the given string a number.
+    """
+    try:
+        float(string.replace(",", ""))
+        return True
+    except ValueError:
+        # check if there's comma inside
+        return False
+
+
+def normalize_str(string):
+    """
+    Normalize the str to lower case and make them float numbers if possible.
+    """
+    # check if characters in the string
+
+    # if number, numerize it.
+    string = string.strip()
+
+    is_number = check_is_number(string)
+
+    if is_number:
+        string = string.replace(",", "")
+        string = float(string)
+        # leave 2 decimal
+        string = round(string, 2)
+        return [string]
+    else:  # it's likely to be a string
+        # lower it
+        string = string.lower()
+        if len(string) == 1:
+            return [" " + string, string + " "]  # avoid trivial matches
+        return [string]
+
+
+def extract_numbers(string):
+    """
+    Exact all forms of numbers from a string with regex.
+    """
+    # Pattern for numbers with commas
+    pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b"
+    # Pattern for scientific notation
+    pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
+    # Pattern for simple numbers without commas
+    pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])"
+
+    # Extract numbers with commas
+    numbers_with_commas = re.findall(pattern_commas, string)
+    # Extract numbers in scientific notation
+    numbers_scientific = re.findall(pattern_scientific, string)
+    # Extract simple numbers without commas
+    numbers_simple = re.findall(pattern_simple, string)
+
+    # Combine all extracted numbers
+    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
+    return all_numbers
+
+
+def parse_open_response(response):
+    """
+    Parse the prediction from the generated response.
+    Return a list of predicted strings or numbers.
+    """
+
+    # content = content.strip("\n").strip(".").strip(" ")
+    def get_key_subresponses(response):
+        key_responses = []
+        response = response.strip().strip(".").lower()
+        sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response)
+        indicators_of_keys = [
+            "could be ",
+            "so ",
+            "is ",
+            "thus ",
+            "therefore ",
+            "final ",
+            "answer ",
+            "result ",
+        ]
+        key_responses = []
+        for index, resp in enumerate(sub_responses):
+            # if last one, accept it's an equation (the entire response can be just one sentence with equation)
+            if index == len(sub_responses) - 1:
+                indicators_of_keys.extend(["="])
+            shortest_key_response = None  # the shortest response that may contain the answer (tail part of the response)
+            for indicator in indicators_of_keys:
+                if indicator in resp:
+                    if not shortest_key_response:
+                        shortest_key_response = resp.split(indicator)[-1].strip()
+                    else:
+                        if len(resp.split(indicator)[-1].strip()) < len(
+                            shortest_key_response
+                        ):
+                            shortest_key_response = resp.split(indicator)[-1].strip()
+                    # key_responses.append(resp.split(indicator)[1].strip())
+
+            if shortest_key_response:
+                # and it's not trivial
+                if shortest_key_response.strip() not in [
+                    ":",
+                    ",",
+                    ".",
+                    "!",
+                    "?",
+                    ";",
+                    ":",
+                    "'",
+                ]:
+                    key_responses.append(shortest_key_response)
+        if len(key_responses) == 0:  # did not found any
+            return [response]
+        return key_responses
+
+    # pdb.set_trace()
+    key_responses = get_key_subresponses(response)
+
+    pred_list = key_responses.copy()  # keep the original string response
+    for resp in key_responses:
+        pred_list.extend(extract_numbers(resp))
+
+    tmp_pred_list = []
+    for i in range(len(pred_list)):
+        tmp_pred_list.extend(normalize_str(pred_list[i]))
+    pred_list = tmp_pred_list
+
+    # remove duplicates
+    pred_list = list(set(pred_list))
+
+    return pred_list
+
+
+# ----------- Evaluation -------------
+
+
+def eval_multi_choice(gold_i, pred_i):
+    """
+    Evaluate a multiple choice instance.
+    """
+    correct = False
+    # only they are exactly the same, we consider it as correct
+    if isinstance(gold_i, list):
+        for answer in gold_i:
+            if answer == pred_i:
+                correct = True
+                break
+    else:  # gold_i is a string
+        if gold_i == pred_i:
+            correct = True
+    return correct
+
+
+def eval_open(gold_i, pred_i):
+    """
+    Evaluate an open question instance
+    """
+    correct = False
+    if isinstance(gold_i, list):
+        # use float to avoid trivial matches
+        norm_answers = []
+        for answer in gold_i:
+            norm_answers.extend(normalize_str(answer))
+    else:
+        norm_answers = normalize_str(gold_i)
+    for pred in pred_i:  # pred is already normalized in parse response phase
+        if isinstance(pred, str):  # if it's a string, then find if ans in the pred_i
+            for norm_ans in norm_answers:
+                # only see if the string answer in the string pred
+                if isinstance(norm_ans, str) and norm_ans in pred:
+                    if not correct:
+                        correct = True
+                    break
+        else:  # it's a float number
+            if pred in norm_answers:
+                if not correct:
+                    correct = True
+                break
+    return correct
+
+
+# ----------- Batch Evaluation -------------
+def evaluate(samples):
+    """
+    Batch evaluation for multiple choice and open questions.
+    """
+    pred_correct = 0
+    judge_dict = dict()
+    for sample in samples:
+        gold_i = sample["answer"]
+        pred_i = sample["parsed_pred"]
+        if sample["question_type"] == "multiple-choice":
+            correct = eval_multi_choice(gold_i, pred_i)
+        else:  # open question
+            correct = eval_open(gold_i, pred_i)
+
+        if correct:
+            judge_dict[sample["id"]] = "Correct"
+            pred_correct += 1
+        else:
+            # print(f"Wrong! expected {pred_i}, answered with {gold_i}")
+            judge_dict[sample["id"]] = "Wrong"
+
+    if len(samples) == 0:
+        return {"acc": 0}
+    return judge_dict, {"acc": pred_correct / len(samples)}
+
+
+# ----------- Calculate Accuracy -------------
+def calculate_ins_level_acc(results: Dict):
+    """Calculate the instruction level accuracy for given Subject results"""
+    acc = 0
+    ins_num = 0
+    for cat_results in results.values():
+        acc += cat_results["acc"] * cat_results["num_example"]
+        ins_num += cat_results["num_example"]
+    if ins_num == 0:
+        return 0
+    return acc / ins_num
+
+
+def process_result(response, sample, answer_dict, out_samples):
+    if response is None:
+        return
+    if sample["question_type"] == "multiple-choice":
+        pred_ans = parse_multi_choice_response(
+            response, sample["all_choices"], sample["index2ans"]
+        )
+    else:  # open question
+        pred_ans = response
+
+    out_samples[sample["id"]] = pred_ans
+
+    # set ground truth answer
+    answer_dict[sample["id"]] = {
+        "question_type": sample["question_type"],
+        "ground_truth": sample["answer"],
+    }
+
+
+def eval_result(model_answer_path, answer_dict, eval_output_path=None):
+    if eval_output_path is None:
+        eval_output_path = model_answer_path
+    print("Evaluating...")
+    output_dict = json.load(open(model_answer_path))
+    # answer_dict = json.load(open(answer_path))
+
+    # group by category
+    output_dict_w_cat = {}
+    for data_id, parsed_pred in output_dict.items():
+        category = "_".join(data_id.split("_")[1:-1])
+        if category not in output_dict_w_cat:
+            output_dict_w_cat.update({category: {}})
+        output_dict_w_cat[category].update({data_id: parsed_pred})
+
+    # group by category
+    answer_dict_w_cat = {}
+    for data_id, parsed_pred in answer_dict.items():
+        category = "_".join(data_id.split("_")[1:-1])
+        if category not in answer_dict_w_cat:
+            answer_dict_w_cat.update({category: {}})
+        answer_dict_w_cat[category].update({data_id: parsed_pred})
+
+    evaluation_result = {}
+
+    for category in CAT_SHORT2LONG.values():
+        # print("Evaluating: {}".format(category))
+        # get cat_outputs and cat_answers
+        try:
+            cat_outputs = output_dict_w_cat[category]
+            cat_answers = answer_dict_w_cat[category]
+        except KeyError:
+            # print("Skipping {} for not found".format(category))
+            continue
+
+        exampels_to_eval = []
+        for data_id, parsed_pred in cat_outputs.items():
+            question_type = cat_answers[data_id]["question_type"]
+            if question_type != "multiple-choice":
+                parsed_pred = parse_open_response(
+                    parsed_pred
+                )  # mainly for type consistency (make it number, etc.)
+            else:
+                parsed_pred = parsed_pred
+
+            exampels_to_eval.append(
+                {
+                    "id": data_id,
+                    "question_type": question_type,
+                    "answer": cat_answers[data_id]["ground_truth"],
+                    "parsed_pred": parsed_pred,
+                }
+            )
+
+        judge_dict, metric_dict = evaluate(exampels_to_eval)
+        metric_dict.update({"num_example": len(exampels_to_eval)})
+
+        evaluation_result[category] = metric_dict
+
+    printable_results = {}
+    # pdb.set_trace()
+    # add domain Subject
+    for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
+        in_domain_cat_results = {}
+        for cat_name in in_domain_cats:  # use the order in DOMAIN_CAT2SUB_CAT
+            if cat_name in evaluation_result.keys():
+                in_domain_cat_results[cat_name] = evaluation_result[cat_name]
+            else:
+                pass
+        in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
+        in_domain_data_num = sum(
+            [
+                cat_results["num_example"]
+                for cat_results in in_domain_cat_results.values()
+            ]
+        )
+        printable_results["Overall-" + domain] = {
+            "num": int(in_domain_data_num),
+            "acc": round(in_domain_ins_acc, 3),
+        }
+        # add sub category
+        for cat_name, cat_results in in_domain_cat_results.items():
+            printable_results[cat_name] = {
+                "num": int(cat_results["num_example"]),
+                "acc": round(cat_results["acc"], 3),
+            }
+
+    # table.append(["-----------------------------", "-----", "----"])
+    all_ins_acc = calculate_ins_level_acc(evaluation_result)
+    overall_acc = round(all_ins_acc, 3)
+    printable_results["Overall"] = {
+        "num": sum(
+            [cat_results["num_example"] for cat_results in evaluation_result.values()]
+        ),
+        "acc": overall_acc,
+    }
+    pprint.pprint(printable_results)
+    out = eval_output_path
+    with open(out, "w", encoding="utf-8") as outfile:
+        json.dump(printable_results, outfile)
+        print(f"eval out saved to {out}")
+
+    print(f"Overall accuracy: {overall_acc}")
diff --git a/benchmark/mmmu/internvl_utils.py b/benchmark/mmmu/internvl_utils.py
new file mode 100644
index 000000000..44c62c99a
--- /dev/null
+++ b/benchmark/mmmu/internvl_utils.py
@@ -0,0 +1,94 @@
+# copy from https://huggingface.co/OpenGVLab/InternVL3-1B
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+def build_transform(input_size):
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose(
+        [
+            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=MEAN, std=STD),
+        ]
+    )
+    return transform
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def dynamic_preprocess(
+    image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+
+def load_image(image_file, input_size=448, max_num=12):
+    image = Image.open(image_file).convert("RGB")
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(
+        image, image_size=input_size, use_thumbnail=True, max_num=max_num
+    )
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
diff --git a/benchmark/mmmu/prompt_format.yaml b/benchmark/mmmu/prompt_format.yaml
new file mode 100644
index 000000000..1a0f7211f
--- /dev/null
+++ b/benchmark/mmmu/prompt_format.yaml
@@ -0,0 +1,15 @@
+task_instructions:
+- ""
+multi_choice_example_format:
+- "{}
+
+{}
+
+Answer with the option's letter from the given choices directly."
+
+short_ans_example_format:
+- "{}
+
+Answer the question using a single word or phrase."
+temperature:
+- 0
diff --git a/benchmark/mtbench/README.md b/benchmark/mtbench/README.md
new file mode 100644
index 000000000..fc37caee9
--- /dev/null
+++ b/benchmark/mtbench/README.md
@@ -0,0 +1,48 @@
+## Download Dataset
+
+```sh
+wget -O question.jsonl https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl
+```
+
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 80
+```
+
+### Benchmark sglang EAGLE
+```
+python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algo EAGLE \
+    --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \
+    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --dtype float16 --port 30000
+```
+
+```
+python3 bench_sglang_eagle.py --num-questions 80 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-questions 80 --backend vllm
+```
+
+
+### Benchmark lightllm
+```
+# A10G
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
+```
+
+```
+python3 bench_other.py --num-questions 80 --backend lightllm
+```
diff --git a/benchmark/mtbench/bench_other.py b/benchmark/mtbench/bench_other.py
new file mode 100644
index 000000000..5e579e9a6
--- /dev/null
+++ b/benchmark/mtbench/bench_other.py
@@ -0,0 +1,111 @@
+import argparse
+import json
+import os
+import time
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+
+from fastchat.model import get_conversation_template
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+
+
+def load_questions(filename):
+    questions = []
+    with open(filename, "r") as fin:
+        for line in fin:
+            obj = json.loads(line)
+            questions.append(obj)
+    return questions
+
+
+def write_answers(filename, model_id, questions, answers):
+    with open(os.path.expanduser(filename), "w") as fout:
+        for i in range(len(answers)):
+            ans_json = {
+                "question_id": questions[i]["question_id"],
+                "answer_id": uuid.uuid4().hex,
+                "model_id": model_id,
+                "choices": {
+                    "index": 0,
+                    "turns": [answers[i][0], answers[i][1]],
+                },
+                "tstamp": time.time(),
+            }
+            fout.write(json.dumps(ans_json) + "\n")
+
+
+def main(args):
+    questions = load_questions(args.question_file)
+    questions = (questions * 10)[: args.num_questions]
+    max_tokens = 256
+    model_id = "llama-2-chat"
+
+    conv_main = get_conversation_template(model_id)
+
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    answers = [None] * len(questions)
+
+    def get_answer(i):
+        conv = conv_main.copy()
+        cur_answers = []
+        for j in range(2):
+            q = questions[i]["turns"][j]
+            conv.append_message(conv.roles[0], q)
+            conv.append_message(conv.roles[1], None)
+
+            prompt = conv.get_prompt()
+            output = call_generate(prompt, temperature=0, max_tokens=max_tokens).strip()
+
+            cur_answers.append(output)
+            conv.update_last_message(output)
+
+        answers[i] = cur_answers
+
+    # Run requests
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(questions))):
+            get_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            list(
+                tqdm(
+                    executor.map(get_answer, list(range(len(questions)))),
+                    total=len(questions),
+                )
+            )
+
+    latency = time.perf_counter() - tic
+
+    print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
+
+    # Write results
+    answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
+    write_answers(answer_file, model_id, questions, answers)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "mtbench",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--question-file", type=str, default="question.jsonl")
+    parser.add_argument("--answer-file", type=str, default=None)
+    parser.add_argument("--num-questions", type=int, default=80)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/mtbench/bench_sglang.py b/benchmark/mtbench/bench_sglang.py
new file mode 100644
index 000000000..0d0545b3a
--- /dev/null
+++ b/benchmark/mtbench/bench_sglang.py
@@ -0,0 +1,99 @@
+import argparse
+import json
+import os
+import time
+import uuid
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+
+
+def load_questions(filename):
+    questions = []
+    with open(filename, "r") as fin:
+        for line in fin:
+            obj = json.loads(line)
+            questions.append(obj)
+    return questions
+
+
+def write_answers(filename, model_id, questions, answers):
+    with open(os.path.expanduser(filename), "w") as fout:
+        for i in range(len(answers)):
+            ans_json = {
+                "question_id": questions[i]["question_id"],
+                "answer_id": uuid.uuid4().hex,
+                "model_id": model_id,
+                "choices": {
+                    "index": 0,
+                    "turns": [answers[i][0], answers[i][1]],
+                },
+                "tstamp": time.time(),
+            }
+            fout.write(json.dumps(ans_json) + "\n")
+
+
+@sgl.function
+def answer_mt_bench(s, question_1, question_2):
+    s += sgl.system()
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1"))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2"))
+
+
+def main(args):
+    # Construct prompts
+    questions = load_questions(args.question_file)[: args.num_questions]
+    arguments = [
+        {"question_1": q["turns"][0], "question_2": q["turns"][1]} for q in questions
+    ]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    rets = answer_mt_bench.run_batch(
+        arguments,
+        temperature=0,
+        max_new_tokens=256,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    answers = [[s["answer_1"], s["answer_2"]] for s in rets]
+    latency = time.perf_counter() - tic
+
+    print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
+
+    # Write results
+    model_id = backend.model_info["model_path"]
+    answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
+    write_answers(answer_file, model_id, questions, answers)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "mtbench",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--question-file", type=str, default="question.jsonl")
+    parser.add_argument("--answer-file", type=str, default=None)
+    parser.add_argument("--num-questions", type=int, default=80)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/mtbench/bench_sglang_eagle.py b/benchmark/mtbench/bench_sglang_eagle.py
new file mode 100644
index 000000000..3eb6036c7
--- /dev/null
+++ b/benchmark/mtbench/bench_sglang_eagle.py
@@ -0,0 +1,137 @@
+"""
+Adapted from https://github.com/chromecast56/sglang/blob/6f145d2eadb93a116134f703358ce76f15381045/benchmark/mtbench/bench_sglang.py
+
+Benchmark SGLang EAGLE/EAGLE3 Speculative Decoding
+
+Usage:
+python3 benchmark/mtbench/bench_sglang_eagle.py --num-questions 80 --parallel 1
+"""
+
+import argparse
+import json
+import os
+import time
+import uuid
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+
+
+def load_questions(filename):
+    questions = []
+    with open(filename, "r") as fin:
+        for line in fin:
+            obj = json.loads(line)
+            questions.append(obj)
+    return questions
+
+
+def write_answers(filename, model_id, questions, answers):
+    with open(os.path.expanduser(filename), "w") as fout:
+        for i in range(len(answers)):
+            ans_json = {
+                "question_id": questions[i]["question_id"],
+                "answer_id": uuid.uuid4().hex,
+                "model_id": model_id,
+                "choices": {
+                    "index": 0,
+                    "turns": [answers[i][0], answers[i][1]],
+                },
+                "tstamp": time.time(),
+            }
+            fout.write(json.dumps(ans_json) + "\n")
+
+
+@sgl.function
+def answer_mt_bench(s, question_1, question_2):
+    s += sgl.system(
+        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
+    )
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1"))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2"))
+
+
+def main(args):
+    # Construct prompts
+    questions = load_questions(args.question_file)[: args.num_questions]
+    arguments = [
+        {"question_1": q["turns"][0], "question_2": q["turns"][1]} for q in questions
+    ]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    rets = answer_mt_bench.run_batch(
+        arguments,
+        temperature=0,
+        max_new_tokens=2048,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    answers = [[s["answer_1"], s["answer_2"]] for s in rets]
+
+    latency = time.perf_counter() - tic
+    num_output_tokens = sum(
+        s.get_meta_info("answer_1")["completion_tokens"]
+        + s.get_meta_info("answer_2")["completion_tokens"]
+        for s in rets
+    )
+
+    # NOTE: acceptance length is just completion_tokens / spec_verify_ct
+    # {'id': '3bb9c5ead109488d8ed5ee9cbecaec29', 'finish_reason': {'type': 'length', 'length': 256}, 'prompt_tokens': 37, 'spec_verify_ct': 101, 'completion_tokens': 256, 'cached_tokens': 0}
+
+    output_throughput = num_output_tokens / latency
+
+    has_verify = "spec_verify_ct" in rets[0].get_meta_info("answer_1")
+    if has_verify:
+        num_verify_tokens = sum(
+            s.get_meta_info("answer_1")["spec_verify_ct"]
+            + s.get_meta_info("answer_2")["spec_verify_ct"]
+            for s in rets
+        )
+
+        accept_length = num_output_tokens / num_verify_tokens
+    else:
+        accept_length = 1.0
+
+    print(
+        f"#questions: {len(questions)}, Throughput: {output_throughput:.2f} token/s, Acceptance length: {accept_length:.2f}"
+    )
+
+    # Write results
+    model_id = backend.model_info["model_path"]
+    answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
+    write_answers(answer_file, model_id, questions, answers)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "mtbench",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "throughput": round(output_throughput, 3),
+            "accept_length": round(accept_length, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--question-file", type=str, default="question.jsonl")
+    parser.add_argument("--answer-file", type=str, default=None)
+    parser.add_argument("--num-questions", type=int, default=80)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/multi_chain_reasoning/README.md b/benchmark/multi_chain_reasoning/README.md
new file mode 100644
index 000000000..4c9f740f5
--- /dev/null
+++ b/benchmark/multi_chain_reasoning/README.md
@@ -0,0 +1,49 @@
+## Download data
+```
+wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+```
+
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000  --schedule-conservativeness 1.3
+```
+
+```
+python3 bench_sglang.py --num-questions 64
+python3 bench_sglang.py --num-questions 32 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-questions 64 --backend vllm
+```
+
+
+### Benchmark lightllm
+```
+# A10G
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
+```
+
+```
+python3 bench_other.py --num-questions 64 --backend lightllm
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+```
+python3 bench_other.py --num-questions 64 --backend lmql --parallel 1
+```
diff --git a/benchmark/multi_chain_reasoning/bench_other.py b/benchmark/multi_chain_reasoning/bench_other.py
new file mode 100644
index 000000000..f361496ad
--- /dev/null
+++ b/benchmark/multi_chain_reasoning/bench_other.py
@@ -0,0 +1,186 @@
+import argparse
+import ast
+import asyncio
+import json
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+import numpy as np
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+prompt_lib = [
+    "Let us think step by step.",
+    "Approach this methodically. Let's dissect the problem into smaller, more manageable parts.",
+    "It's important to proceed step by step, ensuring accuracy at each stage.",
+    "Take a deep breath and break this down.",
+    "A little bit of arithmetic and a logical approach will help us quickly arrive at the solution to this problem.",
+    "I am extremely good at math.",
+]
+
+
+def multi_chain_gsm8k(question, num_chains, call_generate):
+    s = "Question: " + question + "\n"
+    # s += call_generate(s + "Answer: " + prompt_lib[0], max_tokens=256,
+    #     stop="Question", temperature=0)
+    # return s
+
+    comps = []
+    for i in range(num_chains):
+        comps.append(
+            call_generate(
+                s + "Answer: " + prompt_lib[i % num_chains],
+                max_tokens=256,
+                temperature=0.3,
+                stop="Question",
+            )
+        )
+
+    s += "Answer: To answer this question, here are some possible solutions. "
+    s += "After considering all of them, I will do a majority vote.\n\n"
+    for i in range(num_chains):
+        s += f"Solution {i+1}: " + comps[i].strip() + "\n\n"
+    s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
+    s += call_generate(s, max_tokens=16, temperature=0, stop=None)
+    return s
+
+
+async def multi_chain_gsm8k_async(question, num_chains, call_generate):
+    s = "Question: " + question + "\n"
+    # s += call_generate(s + "Answer: " + prompt_lib[0], max_tokens=256,
+    #     stop="Question", temperature=0)
+    # return s
+
+    comps = []
+    for i in range(num_chains):
+        comps.append(
+            await call_generate(
+                s + "Answer: " + prompt_lib[i % num_chains],
+                max_tokens=256,
+                temperature=0.3,
+                stop="Question",
+            )
+        )
+
+    s += "Answer: To answer this question, here are some possible solutions. "
+    s += "After considering all of them, I will do a majority vote.\n\n"
+    for i in range(num_chains):
+        s += f"Solution {i+1}: " + comps[i].strip() + "\n\n"
+    s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
+    s += await call_generate(s, max_tokens=16, temperature=0, stop=None)
+    return s
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+
+    # Construct prompts
+    k = args.num_shot
+
+    questions = []
+    labels = []
+    for i in range(len(lines[: args.num_questions])):
+        questions.append(lines[i]["question"])
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+
+    states = [None] * len(labels)
+
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    # Run requests
+    if args.backend != "lmql":
+        # Use thread pool
+        def get_one_answer(i):
+            answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
+            states[i] = answer
+
+        tic = time.perf_counter()
+        if args.parallel == 1:
+            for i in tqdm(range(len(questions))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )
+
+    else:
+        # Use asyncio
+        async def get_one_answer_asyncio(i):
+            answer = await multi_chain_gsm8k_async(
+                questions[i], args.num_chains, call_generate
+            )
+            states[i] = answer
+
+        tic = time.perf_counter()
+        loop = asyncio.get_event_loop()
+        batches = [
+            list(range(i, min(i + args.parallel, len(questions))))
+            for i in range(0, len(questions), args.parallel)
+        ]
+        for bt in tqdm(batches):
+            tasks = [get_one_answer_asyncio(k) for k in bt]
+            loop.run_until_complete(asyncio.gather(*tasks))
+
+    latency = time.perf_counter() - tic
+
+    preds = []
+    for i in range(len(states)):
+        preds.append(get_answer_value(states[i]))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+    print(f"Latency: {latency:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "multi_chain_gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shot", type=int, default=0)
+    parser.add_argument("--num-chains", type=int, default=5)
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=50)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/multi_chain_reasoning/bench_sglang.py b/benchmark/multi_chain_reasoning/bench_sglang.py
new file mode 100644
index 000000000..1d3129db2
--- /dev/null
+++ b/benchmark/multi_chain_reasoning/bench_sglang.py
@@ -0,0 +1,140 @@
+import argparse
+import ast
+import json
+import re
+import time
+
+import numpy as np
+
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+prompt_lib = [
+    "Let us think step by step.",
+    "Approach this methodically. Let's dissect the problem into smaller, more manageable parts.",
+    "It's important to proceed step by step, ensuring accuracy at each stage.",
+    "Take a deep breath and break this down.",
+    "A little bit of arithmetic and a logical approach will help us quickly arrive at the solution to this problem.",
+    "I am extremely good at math.",
+]
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+
+    # Construct prompts
+    # k = args.num_shot
+    # few_shot_examples = get_few_shot_examples(lines, k)
+
+    questions = []
+    labels = []
+    for i in range(len(lines[: args.num_questions])):
+        questions.append(lines[i]["question"])
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q} for q in questions]
+
+    num_chains = args.num_chains
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def multi_chain_gsm8k(s, question):
+        s += "Question: " + question + "\n"
+        # s += "Answer: " + prompt_lib[0] + sgl.gen("answer", max_tokens=256, stop="Question",
+        #    temperature=0)
+        # return
+
+        forks = s.fork(num_chains)
+        for i in range(num_chains):
+            forks[i] += (
+                "Answer: "
+                + prompt_lib[i % num_chains]
+                + sgl.gen("chain", max_tokens=256, temperature=0.3, stop="Question")
+            )
+        forks.join()
+
+        s += "Answer: To answer this question, here are some possible solutions. "
+        s += "After considering all of them, I will do a majority vote.\n\n"
+        for i in range(num_chains):
+            s += f"Solution {i+1}: " + forks[i]["chain"].strip() + "\n\n"
+        s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
+        s += sgl.gen("answer", max_tokens=16)
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Select backend
+    backend = select_sglang_backend(args)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = multi_chain_gsm8k.run_batch(
+        arguments,
+        temperature=0,
+        backend=backend,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    preds = []
+    for i in range(len(states)):
+        preds.append(get_answer_value(states[i]["answer"]))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+    print(f"Latency: {latency:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "multi_chain_gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shot", type=int, default=0)
+    parser.add_argument("--num-chains", type=int, default=5)
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=50)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/multi_document_qa/README.md b/benchmark/multi_document_qa/README.md
new file mode 100644
index 000000000..09f49c78f
--- /dev/null
+++ b/benchmark/multi_document_qa/README.md
@@ -0,0 +1,47 @@
+## Run benchmark
+
+### Benchmark sglang
+```
+python3 -m sglang.launch_server --model-path codellama/CodeLlama-7b-instruct-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 10 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model codellama/CodeLlama-7b-instruct-hf  --disable-log-requests --port 21000 --gpu 0.97
+```
+
+```
+python3 bench_other.py --backend vllm --num-questions 64
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --backend guidance --num-questions 32 --parallel 1 --n-ctx 11000 --model-path path/to/code-llama/gguf
+```
+
+
+
+### Build dataset
+
+```
+pip install PyPDF2
+python3 build_dataset.py
+```
+
+```python
+import PyPDF2
+
+with open('llama2.pdf', 'rb') as file:
+    reader = PyPDF2.PdfReader(file)
+    text = ''
+    for page_num in range(len(reader.pages)):
+        text += reader.pages[page_num].extract_text()
+    with open('output.txt', 'w') as text_file:
+        text_file.write(text)
+```
diff --git a/benchmark/multi_document_qa/bench_other.py b/benchmark/multi_document_qa/bench_other.py
new file mode 100644
index 000000000..627837c5c
--- /dev/null
+++ b/benchmark/multi_document_qa/bench_other.py
@@ -0,0 +1,114 @@
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+USER_PREFIX = "[INST] "
+USER_SUFFIX = " [/INST]"
+ASSISTANT_PREFIX = ""
+ASSISTANT_SUFFIX = " </s><s>"
+
+
+def multi_document_qa(docs, question, generate):
+    s = USER_PREFIX
+    s += "Please answer a question according to given documents.\n"
+    s += "Question:" + question + "Documents begin.\n"
+
+    s += "".join(docs)
+
+    s += "\nDocuments end."
+    s += (
+        "\n\nBased on the above documents, please answer this question:\n"
+        + question
+        + "\nAnswer in three words or fewer."
+    )
+    s += USER_SUFFIX
+    s += ASSISTANT_PREFIX
+    answer = generate(s, max_tokens=16, stop=None)
+    return answer
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    l = lines[0]
+    arguments = []
+    labels = []
+
+    num_docs = 10
+    if args.backend == "guidance":
+        num_docs = 7  # due to OOM
+
+    for i in range(len(l["questions"][: args.num_questions])):
+        arguments.append(
+            {
+                "docs": l["documents"][:num_docs],
+                "question": l["questions"][i],
+            }
+        )
+        labels.append(l["answers"][i])
+    states = [None] * len(arguments)
+
+    # Select backend
+    call_generate = partial(get_call_generate(args), temperature=0)
+
+    # Run requests
+    def get_one_answer(i):
+        states[i] = multi_document_qa(generate=call_generate, **arguments[i])
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(labels))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(labels)))),
+                    total=len(labels),
+                )
+            )
+
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(states)
+    correct = 0
+    for s, label in zip(states, labels):
+        answer = s.lower()
+        if all(x in answer for x in label.lower().split(" ")):
+            correct += 1
+    accuracy = correct / len(labels)
+    print(f"Accuracy: {accuracy:.3f}")
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "multi_document_qa",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "accuracy": accuracy,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=100)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/multi_document_qa/bench_sglang.py b/benchmark/multi_document_qa/bench_sglang.py
new file mode 100644
index 000000000..0b4b0dbc6
--- /dev/null
+++ b/benchmark/multi_document_qa/bench_sglang.py
@@ -0,0 +1,93 @@
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+
+@sgl.function
+def multi_document_qa(s, docs, question):
+    s += sgl.user_begin()
+    s += "Please answer a question according to given documents.\n"
+    s += "Question:" + question + "Documents begin.\n"
+
+    forks = s.fork(len(docs))
+    forks += lambda i: docs[i]
+    forks.join("concate_and_append")
+
+    s += "\nDocuments end."
+    s += (
+        "\n\nBased on the above documents, please answer this question:\n"
+        + question
+        + "\nAnswer in three words or fewer."
+    )
+    s += sgl.user_end()
+    s += sgl.assistant(sgl.gen("answer", max_tokens=16))
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    l = lines[0]
+    arguments = []
+    labels = []
+    for i in range(len(l["questions"][: args.num_questions])):
+        arguments.append(
+            {
+                "docs": l["documents"][:10],
+                "question": l["questions"][i],
+            }
+        )
+        labels.append(l["answers"][i])
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = multi_document_qa.run_batch(
+        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
+    )
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print([s["answer"] for s in states])
+    correct = 0
+    for s, label in zip(states, labels):
+        answer = s["answer"].lower()
+        if all(x in answer for x in label.lower().split(" ")):
+            correct += 1
+    accuracy = correct / len(labels)
+    print(f"Accuracy: {accuracy:.3f}")
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "multi_document_qa",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "accuracy": accuracy,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=100)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/multi_document_qa/build_dataset.py b/benchmark/multi_document_qa/build_dataset.py
new file mode 100644
index 000000000..27df9474f
--- /dev/null
+++ b/benchmark/multi_document_qa/build_dataset.py
@@ -0,0 +1,70 @@
+import json
+
+import transformers
+
+content = "\n".join(
+    open("llama2.txt", "r", encoding="utf-8", errors="ignore").readlines()
+)
+content = content.replace("\n\n", "\n")
+
+# Count token
+name = "meta-llama/Llama-2-7b-chat-hf"
+t = transformers.AutoTokenizer.from_pretrained(name)
+print(f"num tokens: {len(t.encode(content))}")
+
+# Segment
+SEP = "\n\n"
+parts = content.split(SEP)
+print(f"num segments: {len(parts)}")
+
+segment_len = 1100
+
+segments = []
+tmp = []
+tmp_len = 0
+for i in range(len(parts)):
+    tmp.append(parts[i])
+    tmp_len += len(t.encode(parts[i]))
+
+    if tmp_len > segment_len:
+        segments.append(SEP.join(tmp))
+        tmp = []
+        tmp_len = 0
+
+for i, s in enumerate(segments):
+    print(i, len(t.encode(segments[i])))
+
+# Dump
+with open("questions.jsonl", "w") as fout:
+    fout.write(
+        json.dumps(
+            {
+                "documents": segments[:30],
+                "questions": [
+                    "What is the name of the fine-tuned LLMs?",
+                    "Which figure shows the helpfulness human evaluation results for Llama 2-Chat?",
+                    "What is the number of parameters in the largest Llama 2 model?",
+                    "What is the batch size of fine-tuning?",
+                    "Where can we find the details of potential data contamination?",
+                    "What is the full name of MPT?",
+                    "What is the power consumption of RSC in Watt?",
+                    "How many tokens of data do they train on?",
+                    "Which model's release is delayed due to a lack of time to sufficiently red team?",
+                    "Which activation function is used in Llama?",
+                ],
+                "answers": [
+                    "Llama 2 Chat",
+                    "1",
+                    "70 B",
+                    "64",
+                    "A 6",
+                    "MosaicML",
+                    "400",
+                    "2 trillion",
+                    "34 B",
+                    "SwiGLU",
+                ],
+            }
+        )
+        + "\n"
+    )
diff --git a/benchmark/multi_turn_chat/README.md b/benchmark/multi_turn_chat/README.md
new file mode 100644
index 000000000..0fb5b21fa
--- /dev/null
+++ b/benchmark/multi_turn_chat/README.md
@@ -0,0 +1,66 @@
+### Benchmark sglang
+
+Run Llama-7B
+
+```
+python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+Run Mixtral-8x7B
+(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
+
+```
+python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
+```
+
+Benchmark(short output)
+
+```
+python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf
+```
+
+Benchmark(long output)
+
+```
+python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long
+```
+
+### Benchmark vLLM
+
+Run Llama-7B
+
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf  --disable-log-requests --port 21000
+```
+
+Run Mixtral-8x7B
+
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
+```
+
+Benchmark(short output)
+
+```
+python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm
+```
+
+Benchmark(long output)
+
+```
+python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm --long
+```
+
+### Benchmark guidance
+
+Benchmark Llama-7B (short output)
+
+```
+python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+Benchmark Llama-7B (long output)
+
+```
+python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf --long
+```
diff --git a/benchmark/multi_turn_chat/bench_other.py b/benchmark/multi_turn_chat/bench_other.py
new file mode 100644
index 000000000..9189af5be
--- /dev/null
+++ b/benchmark/multi_turn_chat/bench_other.py
@@ -0,0 +1,93 @@
+import json
+import time
+from argparse import ArgumentParser
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+from data_gen import gen_arguments
+from tqdm import tqdm
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text
+
+
+def multi_turns(generate, qas):
+    s = ""
+    for qa in qas:
+        s += qa["prompt"]
+        s += generate(s, max_tokens=qa["new_tokens"])
+
+    return s
+
+
+def main(args):
+    print(args)
+
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+
+    multi_qas = gen_arguments(args, tokenizer)
+
+    states = [None] * args.num_qa
+
+    call_generate = partial(get_call_generate(args), temperature=0)
+
+    def get_one_answer(i):
+        states[i] = multi_turns(generate=call_generate, **multi_qas[i])
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(multi_qas))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            rets = list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(multi_qas)))),
+                    total=len(multi_qas),
+                )
+            )
+            for _ in rets:
+                pass
+
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "multi_turn_chat",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_qa,
+            "num_turns": args.turns,
+            "other": {
+                "parallel": args.parallel,
+                "output_mode": "long" if args.long else "short",
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--turns", type=int, default=4)
+    parser.add_argument("--num-qa", type=int, default=20)
+    parser.add_argument("--min-len-q", type=int, default=256)
+    parser.add_argument("--max-len-q", type=int, default=512)
+    parser.add_argument("--min-len-a", type=int, default=4)
+    parser.add_argument("--max-len-a", type=int, default=8)
+    parser.add_argument("--tokenizer", type=str, required=True)
+    parser.add_argument("--trust-remote-code", action="store_true")
+    parser.add_argument("--long", action="store_true")
+    args = add_common_other_args_and_parse(parser)
+
+    if args.long:
+        args.min_len_a = 256
+        args.max_len_a = 512
+        args.num_qa = 20
+    main(args)
diff --git a/benchmark/multi_turn_chat/bench_sglang.py b/benchmark/multi_turn_chat/bench_sglang.py
new file mode 100644
index 000000000..1051bf19e
--- /dev/null
+++ b/benchmark/multi_turn_chat/bench_sglang.py
@@ -0,0 +1,79 @@
+import json
+import time
+from argparse import ArgumentParser
+
+from data_gen import gen_arguments
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text
+
+
+@sgl.function
+def multi_turns(s, qas):
+    for qa in qas:
+        s += qa["prompt"]
+        s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
+
+
+def main(args):
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+
+    multi_qas = gen_arguments(args, tokenizer)
+
+    backend = select_sglang_backend(args)
+
+    tic = time.perf_counter()
+    states = multi_turns.run_batch(
+        multi_qas,
+        temperature=0,
+        backend=backend,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    print(f"Latency: {latency:.3f}")
+
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "multi_turn_chat",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_qa,
+            "num_turns": args.turns,
+            "other": {
+                "parallel": args.parallel,
+                "output_mode": "long" if args.long else "short",
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--turns", type=int, default=4)
+    parser.add_argument("--num-qa", type=int, default=20)
+    parser.add_argument("--min-len-q", type=int, default=256)
+    parser.add_argument("--max-len-q", type=int, default=512)
+    parser.add_argument("--min-len-a", type=int, default=4)
+    parser.add_argument("--max-len-a", type=int, default=8)
+    parser.add_argument("--tokenizer", type=str, required=True)
+    parser.add_argument("--trust-remote-code", action="store_true")
+    parser.add_argument("--long", action="store_true")
+    args = add_common_sglang_args_and_parse(parser)
+
+    if args.long:
+        args.min_len_a = 256
+        args.max_len_a = 512
+        args.num_qa = 20
+
+    print(args)
+    main(args)
diff --git a/benchmark/multi_turn_chat/data_gen.py b/benchmark/multi_turn_chat/data_gen.py
new file mode 100644
index 000000000..043c07a76
--- /dev/null
+++ b/benchmark/multi_turn_chat/data_gen.py
@@ -0,0 +1,29 @@
+import random
+import string
+
+random.seed(42)
+
+
+def gen_prompt(tokenizer, token_num):
+    cha_set = string.ascii_letters + string.digits
+    ret = "".join(random.choices(cha_set, k=token_num))
+    while len(tokenizer(ret).input_ids) < token_num:
+        ret += random.choice(cha_set)
+    return ret
+
+
+def gen_arguments(args, tokenizer):
+    multi_qas = [{"qas": []} for _ in range(args.num_qa)]
+    for i in range(args.num_qa):
+        qas = multi_qas[i]["qas"]
+        for _ in range(args.turns):
+            prompt_len = random.randint(args.min_len_q, args.max_len_q)
+            new_tokens = random.randint(args.min_len_a, args.max_len_a)
+            qas.append(
+                {
+                    "prompt": gen_prompt(tokenizer, prompt_len),
+                    "new_tokens": new_tokens,
+                }
+            )
+
+    return multi_qas
diff --git a/benchmark/multi_turn_chat/long_prompt_multi_turn.py b/benchmark/multi_turn_chat/long_prompt_multi_turn.py
new file mode 100644
index 000000000..bda5bb9cc
--- /dev/null
+++ b/benchmark/multi_turn_chat/long_prompt_multi_turn.py
@@ -0,0 +1,129 @@
+import json
+import random
+import time
+from argparse import ArgumentParser
+from pathlib import Path
+
+from tqdm import tqdm
+
+import sglang as sgl
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text
+
+
+def gen_prompt(tokenizer, token_num):
+    all_available_tokens = list(tokenizer.get_vocab().values())
+    selected_tokens = random.choices(all_available_tokens, k=token_num)
+    ret = tokenizer.decode(selected_tokens)
+    return ret
+
+
+def get_cache_path(args):
+    # Create cache directory under ~/.cache/sglang
+    cache_dir = Path.home() / ".cache" / "sglang"
+
+    # Create a unique cache filename based on the arguments that affect generation
+    cache_key = f"qa_{args.num_qa}_{args.turns}_{args.system_prompt_len}_{args.len_q}_{args.len_a}_{args.tokenizer.replace('/', '_')}.json"
+    return cache_dir / cache_key
+
+
+def gen_arguments(args, tokenizer):
+    cache_path = get_cache_path(args)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"Loading cached arguments from {cache_path}")
+        with open(cache_path, "r") as f:
+            return json.load(f)
+
+    print("Generating new arguments...")
+    # First progress bar for system prompts
+    multi_qas = []
+    for _ in tqdm(range(args.num_qa), desc="Generating system prompts"):
+        multi_qas.append(
+            {"system_prompt": gen_prompt(tokenizer, args.system_prompt_len), "qas": []}
+        )
+
+    # Nested progress bars for QA pairs
+    for i in tqdm(range(args.num_qa), desc="Generating QA pairs"):
+        qas = multi_qas[i]["qas"]
+        for j in range(args.turns):
+            qas.append(
+                {
+                    "prompt": gen_prompt(tokenizer, args.len_q),
+                    "new_tokens": args.len_a,
+                }
+            )
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(cache_path, "w") as f:
+        json.dump(multi_qas, f)
+    print(f"Cached arguments saved to {cache_path}")
+
+    return multi_qas
+
+
+@sgl.function
+def multi_turns(s, system_prompt, qas):
+    s += system_prompt
+
+    for i, qa in enumerate(qas):
+        s += qa["prompt"]
+        s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
+
+
+def main(args):
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+
+    multi_qas = gen_arguments(args, tokenizer)
+
+    backend = select_sglang_backend(args)
+
+    tic = time.perf_counter()
+    states = multi_turns.run_batch(
+        multi_qas,
+        temperature=0,
+        backend=backend,
+        num_threads="auto",
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    print(f"Latency: {latency:.3f}")
+
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "multi_turn_system_prompt_chat",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "num_requests": args.num_qa,
+            "num_turns": args.turns,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument("--turns", type=int, default=8)
+    parser.add_argument("--num-qa", type=int, default=128)
+    parser.add_argument("--system-prompt-len", type=int, default=2048)
+    parser.add_argument("--len-q", type=int, default=32)
+    parser.add_argument("--len-a", type=int, default=128)
+    parser.add_argument(
+        "--tokenizer", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct"
+    )
+    parser.add_argument("--trust-remote-code", action="store_true")
+    args = add_common_sglang_args_and_parse(parser)
+
+    print(args)
+    main(args)
diff --git a/benchmark/prefill_only/bench_embeddings.py b/benchmark/prefill_only/bench_embeddings.py
new file mode 100644
index 000000000..ca66c85a3
--- /dev/null
+++ b/benchmark/prefill_only/bench_embeddings.py
@@ -0,0 +1,148 @@
+"""
+SGLang Embeddings Benchmark Script
+
+This script benchmarks SGLang's /v1/embeddings API performance using HTTP requests.
+
+Features:
+- HTTP-only implementation
+- Uses /v1/embeddings API endpoint directly
+- Configurable RPS, duration, and batch sizes
+- Progress tracking and detailed metrics
+- Poisson and constant request distributions
+
+Usage:
+- Update configuration variables at the top of the file
+- Ensure SGLang server is running on the configured HTTP_URL
+- Run: python bench_embeddings.py
+"""
+
+import asyncio
+import logging
+
+from transformers import AutoTokenizer
+from util import (
+    BenchmarkConfig,
+    generate_text_with_token_count,
+    run_benchmark_main,
+    run_generic_benchmark,
+)
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+###############################################################################
+# CONFIG
+###############################################################################
+# Create benchmark configuration
+config = BenchmarkConfig()
+config.rps_values = [500]
+config.duration_secs_values = [60]
+config.num_unique_requests = 100
+config.distribution = "POISSON"
+config.profile = False
+config.freeze_gc = True  # Enable GC freeze functionality
+# Profiler output directory - by default uses present working directory (pwd)
+# Uncomment and customize the line below to override the default location:
+# config.profiler_dir = "/sglang-oss-trace"
+
+# HTTP Configuration
+HTTP_URL = "http://localhost:30000/v1/embeddings"
+
+# Embeddings API Config
+EMBEDDINGS_MODEL_PATH = "/Qwen/Qwen3-Embedding-0.6B"
+BATCH_SIZE = [1]  # Number of items per request (batch size)
+
+# Configurable input token length
+EMBEDDINGS_INPUT_TOKENS = 500  # Default token length
+
+# Load tokenizer once for embeddings text generation
+print("Loading tokenizer for embeddings input generation...")
+embeddings_tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS_MODEL_PATH)
+
+# Generate input text with the specified token length using pre-loaded tokenizer
+EMBEDDINGS_INPUT_TEXT = generate_text_with_token_count(
+    EMBEDDINGS_MODEL_PATH,
+    EMBEDDINGS_INPUT_TOKENS,
+    config.special_replicated_token,
+    tokenizer=embeddings_tokenizer,
+)
+
+
+###############################################################################
+# REQUEST GENERATION (in parallel)
+###############################################################################
+def build_embeddings_request(index: int, item_count: int) -> tuple:
+    """Build a single embeddings request."""
+    try:
+        # For embeddings, input can be a string or list of strings
+        if item_count == 1:
+            input_data = EMBEDDINGS_INPUT_TEXT
+        else:
+            input_data = [EMBEDDINGS_INPUT_TEXT for _ in range(item_count)]
+        req = {
+            "input": input_data,
+            "model": EMBEDDINGS_MODEL_PATH,
+        }
+        return (index, req)
+    except Exception as e:
+        logger.error(f"Error building request {index}: {e}")
+        return (index, None)
+
+
+def validate_embeddings_response(response_data: dict) -> bool:
+    """Validate embeddings API response."""
+    return "data" in response_data
+
+
+def build_warmup_embeddings_request() -> dict:
+    """Build a warmup request for the embeddings API."""
+    return {
+        "input": EMBEDDINGS_INPUT_TEXT,
+        "model": EMBEDDINGS_MODEL_PATH,
+    }
+
+
+###############################################################################
+# MAIN
+###############################################################################
+async def run_benchmark(rps, duration_secs, item_count):
+    """Run a single embeddings benchmark with the given RPS value."""
+    return await run_generic_benchmark(
+        rps=rps,
+        duration_secs=duration_secs,
+        item_count=item_count,
+        config=config,
+        http_url=HTTP_URL,
+        build_request_func=build_embeddings_request,
+        response_validator=validate_embeddings_response,
+        api_name="EMBEDDINGS",
+        request_description="embeddings requests",
+    )
+
+
+async def main():
+    additional_info = {
+        "Input text length": f"{EMBEDDINGS_INPUT_TOKENS} tokens",
+        "Input text preview": (
+            EMBEDDINGS_INPUT_TEXT[:100] + "..."
+            if len(EMBEDDINGS_INPUT_TEXT) > 100
+            else EMBEDDINGS_INPUT_TEXT
+        ),
+    }
+
+    await run_benchmark_main(
+        config,
+        run_benchmark,
+        "EMBEDDINGS",
+        HTTP_URL,
+        BATCH_SIZE,
+        additional_info,
+        build_warmup_embeddings_request,
+    )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/benchmark/prefill_only/bench_score.py b/benchmark/prefill_only/bench_score.py
new file mode 100644
index 000000000..117335eae
--- /dev/null
+++ b/benchmark/prefill_only/bench_score.py
@@ -0,0 +1,192 @@
+"""
+SGLang Scoring Benchmark Script
+
+This script benchmarks SGLang's scoring API performance using HTTP requests.
+
+Current Features:
+- HTTP-only implementation (open source compatible)
+- Uses /v1/score API endpoint directly
+- Single item scoring with batching support
+- Configurable RPS, duration, and batch sizes
+- Progress tracking and detailed metrics
+- Poisson and constant request distributions
+
+Usage:
+- Update configuration variables at the top of the file
+- Ensure SGLang server is running on the configured HTTP_URL
+- Run: python bench_score.py
+- Each request will contain ITEM_COUNT_VALUES items for batch scoring
+
+"""
+
+import asyncio
+
+from transformers import AutoTokenizer
+from util import (
+    BenchmarkConfig,
+    generate_text_with_token_count,
+    run_benchmark_main,
+    run_generic_benchmark,
+)
+
+###############################################################################
+# CONFIG
+###############################################################################
+# Create benchmark configuration
+config = BenchmarkConfig()
+config.rps_values = [160]
+config.duration_secs_values = [60]
+config.num_unique_requests = 100
+config.distribution = "POISSON"
+config.profile = False
+config.freeze_gc = True  # Enable GC freeze functionality
+# Profiler output directory - by default uses present working directory (pwd)
+# Uncomment and customize the line below to override the default location:
+# config.profiler_dir = "/sglang-oss-trace"
+
+# HTTP Configuration
+HTTP_URL = "http://localhost:30000/v1/score"  # Use score API directly
+
+# Score API Config
+# ITEM_COUNT_VALUES determines number of items per score request (batch size)
+SCORE_QUERY_TOKENS = 120
+SCORE_ITEM_TOKENS = 180
+SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
+SCORE_LABEL_TOKEN_IDS = [9454, 2753]  # Yes/No token IDs
+ITEM_COUNT_VALUES = [10]  # Number of items per request
+
+# Special token to replicate for precise token counting
+SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
+
+
+###############################################################################
+# REQUEST GENERATION (in parallel)
+###############################################################################
+def create_score_request_builder():
+    """Create a score request builder function with shared tokenizer."""
+    # Load tokenizer once here to verify special token and get precise counts
+    print("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
+
+    # Verify that our special token produces exactly 1 token
+    special_token_count = len(
+        tokenizer.encode(config.special_replicated_token, add_special_tokens=False)
+    )
+    print(
+        f"Special token '{config.special_replicated_token}' produces "
+        f"{special_token_count} token(s)"
+    )
+
+    def generate_text_with_token_count_local(num_toks):
+        """Generate text with precise token count using replicated token."""
+        return generate_text_with_token_count(
+            SCORE_MODEL_PATH,
+            num_toks,
+            config.special_replicated_token,
+            tokenizer=tokenizer,
+        )
+
+    def build_score_request(index: int, item_count: int) -> tuple:
+        """Build a single score request."""
+        try:
+            # Generate query and items for score API
+            query = generate_text_with_token_count_local(SCORE_QUERY_TOKENS)
+            items = [
+                generate_text_with_token_count_local(SCORE_ITEM_TOKENS)
+                for _ in range(item_count)
+            ]
+
+            # Return as dict for score API format
+            score_data = {
+                "query": query,
+                "items": items,
+                "label_token_ids": SCORE_LABEL_TOKEN_IDS,
+                "model": SCORE_MODEL_PATH,
+            }
+            return (index, score_data)
+
+        except Exception as e:
+            print(f"Error building request {index}: {e}")
+            return (index, None)
+
+    return build_score_request
+
+
+def validate_score_response(response_data: dict) -> bool:
+    """Validate score API response."""
+    return "scores" in response_data or "logprobs" in response_data
+
+
+def build_warmup_score_request() -> dict:
+    """Build a warmup request for the score API."""
+    # Load tokenizer once for warmup generation
+    tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
+
+    warmup_query = generate_text_with_token_count(
+        SCORE_MODEL_PATH,
+        SCORE_QUERY_TOKENS,
+        config.special_replicated_token,
+        tokenizer=tokenizer,
+    )
+    warmup_items = [
+        generate_text_with_token_count(
+            SCORE_MODEL_PATH,
+            SCORE_ITEM_TOKENS,
+            config.special_replicated_token,
+            tokenizer=tokenizer,
+        )
+        for _ in range(3)
+    ]
+
+    return {
+        "query": warmup_query,
+        "items": warmup_items,
+        "label_token_ids": SCORE_LABEL_TOKEN_IDS,
+        "model": SCORE_MODEL_PATH,
+        # Add missing parameters for consistency with the original warmup
+        "apply_softmax": True,
+        "item_first": False,
+    }
+
+
+###############################################################################
+# MAIN
+###############################################################################
+async def run_benchmark(rps, duration_secs, item_count):
+    """Run a single benchmark with the given RPS value."""
+    # Create the request builder function with shared tokenizer
+    build_request_func = create_score_request_builder()
+
+    return await run_generic_benchmark(
+        rps=rps,
+        duration_secs=duration_secs,
+        item_count=item_count,
+        config=config,
+        http_url=HTTP_URL,
+        build_request_func=build_request_func,
+        response_validator=validate_score_response,
+        api_name="SINGLE_ITEM_SCORING",
+        request_description="score requests",
+    )
+
+
+async def main():
+    """Main function that runs benchmarks for all RPS values."""
+    additional_info = {
+        "Query tokens per request": SCORE_QUERY_TOKENS,
+        "Item tokens per item": SCORE_ITEM_TOKENS,
+    }
+
+    await run_benchmark_main(
+        config,
+        run_benchmark,
+        "SINGLE_ITEM_SCORING",
+        HTTP_URL,
+        ITEM_COUNT_VALUES,
+        additional_info,
+        build_warmup_score_request,
+    )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/benchmark/prefill_only/util.py b/benchmark/prefill_only/util.py
new file mode 100644
index 000000000..0dbc39027
--- /dev/null
+++ b/benchmark/prefill_only/util.py
@@ -0,0 +1,813 @@
+"""
+Common utilities for SGLang benchmark scripts.
+
+This module contains shared code for benchmarking different SGLang APIs
+including scoring, embeddings, and other endpoints.
+"""
+
+import asyncio
+import concurrent.futures
+import json
+import os
+import random
+from statistics import mean
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import aiohttp
+import numpy as np
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+
+class BenchmarkConfig:
+    """Configuration for benchmark parameters."""
+
+    def __init__(self):
+        # Common benchmark settings
+        self.server_type = "HTTP"
+        self.rps_values = [70]
+        self.duration_secs_values = [60]
+        self.num_unique_requests = 100
+        self.distribution = "POISSON"  # Options: "CONSTANT", "POISSON"
+        self.profile = False
+
+        # Garbage Collection Control
+        self.freeze_gc = True  # Enable/disable garbage collection freezing
+
+        # Profiler configuration
+        self.profiler_dir = (
+            os.getcwd()
+        )  # Default profiler output directory (current working directory)
+
+        # Special token for text generation
+        self.special_replicated_token = "<|im_start|>"
+
+
+def generate_text_with_token_count(
+    model_path: str,
+    num_tokens: int,
+    special_token: str = "<|im_start|>",
+    tokenizer: Optional[Any] = None,
+) -> str:
+    """
+    Generate text with precise token count using a replicated token.
+
+    Args:
+        model_path: Path to the model for tokenizer
+        num_tokens: Target number of tokens
+        special_token: Token to replicate
+        tokenizer: Optional pre-loaded tokenizer to avoid repeated loading
+
+    Returns:
+        Generated text with approximately the target token count
+    """
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    # Verify token count
+    special_token_count = len(tokenizer.encode(special_token, add_special_tokens=False))
+
+    if special_token_count == 1:
+        # Simple case: token maps to exactly 1 token
+        return special_token * num_tokens
+    else:
+        print(f"Special token '{special_token}' produces {special_token_count} tokens")
+        # Handle case where special token produces multiple tokens
+        repetitions = (num_tokens + special_token_count - 1) // special_token_count
+        text = special_token * repetitions
+
+        # Verify we got the expected token count
+        actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
+        if actual_tokens < num_tokens:
+            print(f"Warning: Generated {actual_tokens} tokens, expected {num_tokens}")
+
+        return text
+
+
+def setup_profiler(config: BenchmarkConfig, benchmark_name: str) -> None:
+    """
+    Set up profiler environment if profiling is enabled.
+
+    Args:
+        config: Benchmark configuration
+        benchmark_name: Name of the benchmark (used in directory path)
+    """
+    if config.profile:
+        # Create benchmark-specific subdirectory
+        profiler_path = os.path.join(
+            config.profiler_dir, benchmark_name.lower().replace("_", "-")
+        )
+        os.environ["SGLANG_TORCH_PROFILER_DIR"] = profiler_path
+        print(f"Profiler enabled. Output directory: {profiler_path}")
+    else:
+        print("Profiler disabled")
+
+
+def prepare_all_requests_parallel(
+    num_requests: int,
+    item_count: int,
+    build_request_func: Callable[[int, int], Tuple[int, Any]],
+    config: BenchmarkConfig,
+    description: str = "requests",
+) -> List[Any]:
+    """
+    Generic function to generate unique requests in parallel, then reuse them.
+
+    Args:
+        num_requests: Total number of requests needed
+        item_count: Number of items per request (batch size)
+        build_request_func: Function that takes (index, item_count) and returns (index, request_data)
+        config: Benchmark configuration
+        description: Description for progress bars
+
+    Returns:
+        List of request data objects
+    """
+
+    def build_request_wrapper(index):
+        """Wrapper to call the provided build_request_func."""
+        try:
+            return build_request_func(index, item_count)
+        except Exception as e:
+            print(f"Error building request {index}: {e}")
+            return (index, None)
+
+    # Generate only the unique requests
+    unique_requests = [None] * config.num_unique_requests
+    max_workers = min(8, os.cpu_count() or 1)  # Limit to 8 threads max
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = []
+        for i in tqdm(
+            range(config.num_unique_requests),
+            desc=f"Submitting {description} generation tasks",
+        ):
+            future = executor.submit(build_request_wrapper, i)
+            futures.append(future)
+
+        # Collect results as they complete
+        for f in tqdm(
+            concurrent.futures.as_completed(futures),
+            desc=f"Building unique {description}",
+            total=config.num_unique_requests,
+        ):
+            try:
+                index, req_data = f.result()
+                if req_data is not None:
+                    unique_requests[index] = req_data
+                else:
+                    print(f"Failed to build request {index}")
+            except Exception as e:
+                print(f"Error processing request result: {e}")
+
+    # Check if we have any valid requests
+    valid_requests = [req for req in unique_requests if req is not None]
+    if not valid_requests:
+        raise RuntimeError("Failed to generate any valid requests")
+
+    print(
+        f"Successfully generated {len(valid_requests)} out of "
+        f"{config.num_unique_requests} unique {description}"
+    )
+
+    # Create the full request list by cycling through unique requests
+    print(
+        f"Reusing {len(valid_requests)} unique {description} to create "
+        f"{num_requests} total requests..."
+    )
+    all_requests = []
+    for i in tqdm(range(num_requests), desc=f"Reusing {description}"):
+        unique_index = i % len(valid_requests)
+        all_requests.append(valid_requests[unique_index])
+
+    print(f"All {description} prepared.\n")
+    return all_requests
+
+
+async def sleep_with_distribution(distribution: str, rps: float) -> None:
+    """
+    Sleep according to the specified distribution pattern.
+
+    Args:
+        distribution: "CONSTANT" or "POISSON"
+        rps: Requests per second rate
+    """
+    if distribution == "CONSTANT":
+        interval = 1 / rps
+        await asyncio.sleep(interval)
+    elif distribution == "POISSON":
+        # For Poisson process, inter-arrival times follow exponential distribution
+        interval = random.expovariate(rps)
+        await asyncio.sleep(interval)
+    else:
+        raise ValueError(
+            f"Unknown distribution: {distribution}. Use 'CONSTANT' or 'POISSON'."
+        )
+
+
+def build_http_request_json(request_data: Any) -> str:
+    """
+    Generic function to build HTTP request JSON.
+
+    Args:
+        request_data: The data to serialize to JSON
+
+    Returns:
+        JSON string representation of the request data
+    """
+    return json.dumps(request_data)
+
+
+async def make_http_call(
+    session: aiohttp.ClientSession,
+    request_data: Any,
+    request_id: int,
+    results_queue: asyncio.Queue,
+    http_url: str,
+    response_validator: Callable[[Dict[str, Any]], bool],
+    api_name: str = "API",
+) -> None:
+    """
+    Generic HTTP call function for API requests.
+
+    Args:
+        session: aiohttp client session
+        request_data: Data to send in the request
+        request_id: Unique identifier for this request
+        results_queue: Queue to put results
+        http_url: URL to send the request to
+        response_validator: Function to validate the response JSON
+        api_name: Name of the API for error messages
+    """
+    try:
+        start_time = asyncio.get_event_loop().time()
+
+        request_json = build_http_request_json(request_data)
+        headers = {"Content-Type": "application/json"}
+
+        async with session.post(http_url, data=request_json, headers=headers) as resp:
+            resp_text = await resp.text()
+
+            if resp.status != 200:
+                print(
+                    f"[HTTP] {api_name} Request {request_id} failed with status "
+                    f"{resp.status}: {resp_text}"
+                )
+                completion_time = asyncio.get_event_loop().time()
+                await results_queue.put((request_id, 0, False, completion_time))
+                return
+
+            # Parse and validate response
+            try:
+                response_data = json.loads(resp_text)
+                success = response_validator(response_data)
+                if not success:
+                    print(
+                        f"[HTTP] {api_name} Request {request_id} failed response validation"
+                    )
+            except json.JSONDecodeError:
+                print(
+                    f"[HTTP] {api_name} Request {request_id} failed to parse JSON response"
+                )
+                success = False
+
+        completion_time = asyncio.get_event_loop().time()
+        elapsed_time = (completion_time - start_time) * 1000
+        await results_queue.put((request_id, elapsed_time, success, completion_time))
+
+    except Exception as e:
+        print(f"[HTTP] {api_name} Error for request {request_id}: {e}")
+        completion_time = asyncio.get_event_loop().time()
+        await results_queue.put((request_id, 0, False, completion_time))
+
+
+async def send_profile_request(
+    profile_text: str, http_url: str, session: Optional[aiohttp.ClientSession] = None
+) -> None:
+    """
+    Send a profile request (START_PROFILE or STOP_PROFILE) and wait for completion.
+
+    Args:
+        profile_text: "START_PROFILE" or "STOP_PROFILE"
+        http_url: Base HTTP URL (will derive profile endpoints from this)
+        session: Optional aiohttp session to use
+    """
+    try:
+        if session:
+            print(f"Sending {profile_text} request via HTTP...")
+
+            # Determine the correct endpoint
+            if "/v1/" in http_url:
+                base_url = http_url.rsplit("/v1/", 1)[0]  # Remove /v1/xxx
+            else:
+                base_url = http_url.rsplit("/", 1)[0]  # Remove last path component
+
+            if profile_text == "START_PROFILE":
+                endpoint_url = f"{base_url}/start_profile"
+            elif profile_text == "STOP_PROFILE":
+                endpoint_url = f"{base_url}/stop_profile"
+            else:
+                print(f"Unknown profile request: {profile_text}")
+                return
+
+            headers = {"Content-Type": "application/json"}
+
+            async with session.post(endpoint_url, headers=headers) as resp:
+                resp_text = await resp.text()
+                if resp.status == 200:
+                    print(f"{profile_text} request completed")
+                else:
+                    print(
+                        f"{profile_text} request failed with status "
+                        f"{resp.status}: {resp_text}"
+                    )
+        else:
+            print(f"Cannot send {profile_text} request - missing session")
+
+    except Exception as e:
+        print(f"Error sending {profile_text} request: {e}")
+
+
+async def call_freeze_gc_http(session: aiohttp.ClientSession, http_url: str) -> None:
+    """
+    Call the /freeze_gc HTTP endpoint.
+
+    Args:
+        session: aiohttp client session
+        http_url: Base HTTP URL to derive the freeze_gc endpoint from
+    """
+    try:
+        # Derive freeze_gc endpoint from the API URL
+        if "/v1/" in http_url:
+            freeze_gc_url = http_url.rsplit("/v1/", 1)[0] + "/freeze_gc"
+        else:
+            freeze_gc_url = http_url.rsplit("/", 1)[0] + "/freeze_gc"
+
+        print(f"Calling freeze_gc endpoint: {freeze_gc_url}")
+
+        async with session.post(freeze_gc_url) as resp:
+            if resp.status == 200:
+                print("freeze_gc called successfully")
+            else:
+                resp_text = await resp.text()
+                print(f"freeze_gc failed with status {resp.status}: {resp_text}")
+
+    except Exception as e:
+        print(f"Failed to call freeze_gc: {e}")
+
+
+async def send_warmup_requests(
+    session: aiohttp.ClientSession,
+    http_url: str,
+    build_warmup_request_func: Callable[[], Any],
+    num_warmup: int = 3,
+) -> None:
+    """
+    Send warmup requests to HTTP server.
+
+    Args:
+        session: aiohttp client session
+        http_url: URL to send warmup requests to
+        build_warmup_request_func: Function that returns a warmup request object
+        num_warmup: Number of warmup requests to send
+    """
+    print(f"Sending {num_warmup} HTTP warmup requests...")
+
+    for i in range(num_warmup):
+        try:
+            warmup_data = build_warmup_request_func()
+            request_json = build_http_request_json(warmup_data)
+            headers = {"Content-Type": "application/json"}
+
+            async with session.post(
+                http_url, data=request_json, headers=headers
+            ) as resp:
+                if resp.status == 200:
+                    print(f"Warmup request {i+1}/{num_warmup} completed successfully")
+                else:
+                    print(
+                        f"Warmup request {i+1}/{num_warmup} failed with status {resp.status}"
+                    )
+
+        except Exception as e:
+            print(f"Warmup request {i+1}/{num_warmup} failed with error: {e}")
+
+    print("HTTP warmup requests completed")
+
+
+async def perform_global_warmup_and_freeze(
+    config: BenchmarkConfig,
+    http_url: str,
+    build_warmup_request_func: Callable[[], Any],
+) -> None:
+    """
+    Perform warmup and optionally GC freeze operations once before all benchmark runs.
+
+    Args:
+        config: Benchmark configuration
+        http_url: URL for API requests
+        build_warmup_request_func: Function that returns a warmup request object
+    """
+    print("=" * 80)
+    print(f"PERFORMING GLOBAL WARMUP{' AND GC FREEZE' if config.freeze_gc else ''}")
+    print("=" * 80)
+
+    print(f"Performing HTTP warmup{' and GC freeze' if config.freeze_gc else ''}...")
+    async with aiohttp.ClientSession() as session:
+        await send_warmup_requests(session, http_url, build_warmup_request_func)
+        if config.freeze_gc:
+            await call_freeze_gc_http(session, http_url)
+        print(
+            f"HTTP warmup{' and GC freeze' if config.freeze_gc else ''} completed successfully."
+        )
+
+    print(
+        f"Global warmup{' and GC freeze' if config.freeze_gc else ''} operations completed."
+    )
+    print("=" * 80)
+
+
+async def process_results(
+    results_queue: asyncio.Queue,
+    num_requests: int,
+    send_duration: float,
+    total_duration: float,
+    rps: int,
+    duration_secs: int,
+    item_count: int,
+    test_start_time: float,
+    config: BenchmarkConfig,
+    http_mode: str = "UNKNOWN",
+) -> List[Dict[str, Any]]:
+    """
+    Process benchmark results and group them by minute intervals.
+
+    Args:
+        results_queue: Queue containing result tuples
+        num_requests: Total number of requests sent
+        send_duration: Time taken to send all requests
+        total_duration: Total time for all requests to complete
+        rps: Target requests per second
+        duration_secs: Test duration in seconds
+        item_count: Number of items per request
+        test_start_time: Start time of the test
+        config: Benchmark configuration
+        http_mode: Description of the HTTP mode/API being tested
+
+    Returns:
+        List of dictionaries containing minute-by-minute results
+    """
+    all_results = []
+
+    # Collect all results
+    for _ in range(num_requests):
+        result = await results_queue.get()
+        request_id, elapsed_time, success, completion_time = result
+        all_results.append(
+            {
+                "request_id": request_id,
+                "elapsed_time": elapsed_time,
+                "success": success,
+                "completion_time": completion_time,
+            }
+        )
+
+    # Group results by minute intervals
+    minute_results = []
+    num_minutes = int(duration_secs // 60) + (1 if duration_secs % 60 > 0 else 0)
+
+    for minute in range(num_minutes):
+        minute_start = test_start_time + (minute * 60)
+        minute_end = test_start_time + ((minute + 1) * 60)
+
+        # Filter results that completed in this minute
+        minute_data = [
+            r for r in all_results if minute_start <= r["completion_time"] < minute_end
+        ]
+
+        response_times = [r["elapsed_time"] for r in minute_data if r["success"]]
+        successful_requests = len([r for r in minute_data if r["success"]])
+        failed_requests = len([r for r in minute_data if not r["success"]])
+
+        avg_response_time = mean(response_times) if response_times else 0
+
+        # Calculate percentiles using numpy
+        if response_times:
+            p50 = np.percentile(response_times, 50)
+            p90 = np.percentile(response_times, 90)
+            p99 = np.percentile(response_times, 99)
+        else:
+            p50 = p90 = p99 = 0
+
+        minute_result = {
+            "test_duration_secs": duration_secs,
+            "minute_interval": minute + 1,
+            "target_rps": rps,
+            "item_count": item_count,
+            "server_type": config.server_type,
+            "distribution": config.distribution,
+            "unique_requests": config.num_unique_requests,
+            "total_requests": len(minute_data),
+            "successful_requests": successful_requests,
+            "failed_requests": failed_requests,
+            "send_duration_secs": send_duration,
+            "total_duration_secs": total_duration,
+            "avg_response_time_ms": avg_response_time,
+            "p50_response_time_ms": p50,
+            "p90_response_time_ms": p90,
+            "p99_response_time_ms": p99,
+        }
+
+        minute_results.append(minute_result)
+
+        print(
+            f"\nMinute {minute + 1} Summary for RPS {rps}, "
+            f"Duration {duration_secs}s, Item Count {item_count}:"
+        )
+        print(f"  Requests completed in minute: {len(minute_data)}")
+        print(f"  Successful requests:   {successful_requests}")
+        print(f"  Failed requests:       {failed_requests}")
+        print(f"  Average response time: {avg_response_time:.2f} ms")
+        print(f"  P50 response time:     {p50:.2f} ms")
+        print(f"  P90 response time:     {p90:.2f} ms")
+        print(f"  P99 response time:     {p99:.2f} ms")
+
+    # Print overall summary
+    all_response_times = [r["elapsed_time"] for r in all_results if r["success"]]
+    total_successful = len([r for r in all_results if r["success"]])
+    total_failed = len([r for r in all_results if not r["success"]])
+
+    overall_avg = mean(all_response_times) if all_response_times else 0
+    if all_response_times:
+        overall_p50 = np.percentile(all_response_times, 50)
+        overall_p90 = np.percentile(all_response_times, 90)
+        overall_p99 = np.percentile(all_response_times, 99)
+    else:
+        overall_p50 = overall_p90 = overall_p99 = 0
+
+    print(
+        f"\nOverall Summary for RPS {rps}, Duration {duration_secs}s, "
+        f"Item Count {item_count}:"
+    )
+    print(f"  Test duration:         {duration_secs} seconds")
+    print(f"  Server type:           {config.server_type}")
+    print(f"  HTTP mode:             {http_mode}")
+    print(f"  Target RPS:            {rps}")
+    print(f"  Item count:            {item_count}")
+    print(f"  Distribution:          {config.distribution}")
+    print(f"  Unique requests generated: {config.num_unique_requests}")
+    print(f"  Total requests sent:   {num_requests}")
+    print(f"  Successful requests:   {total_successful}")
+    print(f"  Failed requests:       {total_failed}")
+    print(f"  Time to send all requests: {send_duration:.2f} seconds")
+    print(f"  Time for all requests to complete: {total_duration:.2f} seconds")
+    print(f"  Average response time: {overall_avg:.2f} ms")
+    print(f"  P50 response time:     {overall_p50:.2f} ms")
+    print(f"  P90 response time:     {overall_p90:.2f} ms")
+    print(f"  P99 response time:     {overall_p99:.2f} ms\n")
+
+    return minute_results
+
+
+def print_csv_results(all_results: List[Dict[str, Any]]) -> None:
+    """
+    Print benchmark results in CSV format.
+
+    Args:
+        all_results: List of result dictionaries from process_results
+    """
+    print("\n" + "=" * 80)
+    print("FINAL CSV RESULTS:")
+    print("=" * 80)
+
+    # CSV Header
+    headers = [
+        "test_duration_secs",
+        "minute_interval",
+        "target_rps",
+        "item_count",
+        "server_type",
+        "distribution",
+        "unique_requests",
+        "total_requests",
+        "successful_requests",
+        "failed_requests",
+        "send_duration_secs",
+        "total_duration_secs",
+        "avg_response_time_ms",
+        "p50_response_time_ms",
+        "p90_response_time_ms",
+        "p99_response_time_ms",
+    ]
+    print(",".join(headers))
+
+    # CSV Data
+    for result in all_results:
+        row = [
+            result["test_duration_secs"],
+            result["minute_interval"],
+            result["target_rps"],
+            result["item_count"],
+            result["server_type"],
+            result["distribution"],
+            result["unique_requests"],
+            result["total_requests"],
+            result["successful_requests"],
+            result["failed_requests"],
+            f"{result['send_duration_secs']:.2f}",
+            f"{result['total_duration_secs']:.2f}",
+            f"{result['avg_response_time_ms']:.2f}",
+            f"{result['p50_response_time_ms']:.2f}",
+            f"{result['p90_response_time_ms']:.2f}",
+            f"{result['p99_response_time_ms']:.2f}",
+        ]
+        print(",".join(map(str, row)))
+
+
+async def run_benchmark_main(
+    config: BenchmarkConfig,
+    run_single_benchmark_func,
+    benchmark_name: str,
+    http_url: str,
+    item_count_values: List[int],
+    additional_info: Optional[Dict[str, Any]] = None,
+    build_warmup_request_func: Optional[Callable[[], Any]] = None,
+) -> None:
+    """
+    Main benchmark orchestration function.
+
+    Args:
+        config: Benchmark configuration
+        run_single_benchmark_func: Async function to run a single benchmark
+        benchmark_name: Name of the benchmark (e.g., "SCORING", "EMBEDDINGS")
+        http_url: URL of the API endpoint
+        item_count_values: List of item counts to test
+        additional_info: Additional information to print in the header
+        build_warmup_request_func: Optional function to build warmup requests
+    """
+    total_combinations = (
+        len(config.duration_secs_values)
+        * len(config.rps_values)
+        * len(item_count_values)
+    )
+
+    print(
+        f"Running benchmarks for {len(config.duration_secs_values)} duration "
+        f"values, {len(config.rps_values)} RPS values, and "
+        f"{len(item_count_values)} item count values = "
+        f"{total_combinations} total combinations"
+    )
+    print(f"Server Type: {config.server_type}")
+    print(f"HTTP Mode: {benchmark_name}")
+    print(f"API URL: {http_url}")
+
+    if additional_info:
+        for key, value in additional_info.items():
+            print(f"{key}: {value}")
+
+    print(f"Items per request (batch size): {item_count_values}")
+    print(f"Profiling Enabled: {config.profile}")
+    print(f"Duration values: {config.duration_secs_values}")
+    print(f"RPS values: {config.rps_values}")
+    print(f"Item count values: {item_count_values}")
+    print("=" * 80)
+
+    # Set up profiler environment
+    setup_profiler(config, benchmark_name)
+
+    # Perform global warmup and GC freeze operations if warmup function is provided
+    if build_warmup_request_func is not None:
+        await perform_global_warmup_and_freeze(
+            config, http_url, build_warmup_request_func
+        )
+
+    all_results = []
+
+    for duration_secs in config.duration_secs_values:
+        for rps in config.rps_values:
+            for item_count in item_count_values:
+                result = await run_single_benchmark_func(rps, duration_secs, item_count)
+                all_results.extend(result)  # Extend with minute results
+
+    print_csv_results(all_results)
+
+
+async def run_generic_benchmark(
+    rps: int,
+    duration_secs: int,
+    item_count: int,
+    config: BenchmarkConfig,
+    http_url: str,
+    build_request_func: Callable[[int, int], Tuple[int, Any]],
+    response_validator: Callable[[Dict[str, Any]], bool],
+    api_name: str,
+    request_description: str = "requests",
+) -> List[Dict[str, Any]]:
+    """
+    Generic benchmark runner that can be used for different APIs.
+
+    Args:
+        rps: Requests per second
+        duration_secs: Duration of the test in seconds
+        item_count: Number of items per request (batch size)
+        config: Benchmark configuration
+        http_url: URL of the API endpoint
+        build_request_func: Function to build individual requests
+        response_validator: Function to validate API responses
+        api_name: Name of the API for logging
+        request_description: Description for progress bars
+
+    Returns:
+        List of dictionaries containing minute-by-minute results
+    """
+    num_requests = int(rps * duration_secs)
+    print(
+        f"Starting benchmark with RPS={rps}, Duration={duration_secs}s, "
+        f"Item Count={item_count}, num_requests={num_requests}"
+    )
+    print(f"Server Type: {config.server_type}")
+    print(f"HTTP Mode: {api_name}")
+    print(f"Profiling Enabled: {config.profile}")
+
+    # Build requests in parallel (unmeasured)
+    all_requests = prepare_all_requests_parallel(
+        num_requests, item_count, build_request_func, config, request_description
+    )
+
+    results_queue = asyncio.Queue()
+    tasks = []
+
+    # Track timing for sending requests
+    send_start_time = asyncio.get_event_loop().time()
+
+    # HTTP implementation
+    async with aiohttp.ClientSession(
+        timeout=aiohttp.ClientTimeout(total=300)
+    ) as session:
+
+        # Send START_PROFILE if profiling is enabled
+        if config.profile:
+            await send_profile_request("START_PROFILE", http_url, session=session)
+
+        # Add progress bar for sending requests
+        with tqdm(
+            total=len(all_requests),
+            desc=f"Sending HTTP {request_description} at {rps} RPS",
+            unit="req",
+        ) as pbar:
+            for i, request_data in enumerate(all_requests):
+                request_id = i + 1
+                tasks.append(
+                    asyncio.create_task(
+                        make_http_call(
+                            session,
+                            request_data,
+                            request_id,
+                            results_queue,
+                            http_url,
+                            response_validator,
+                            api_name,
+                        )
+                    )
+                )
+
+                # Update progress bar
+                pbar.update(1)
+
+                # Throttle based on distribution
+                if i < len(all_requests) - 1:
+                    await sleep_with_distribution(config.distribution, rps)
+
+        send_end_time = asyncio.get_event_loop().time()
+        send_duration = send_end_time - send_start_time
+
+        # Wait for all requests to complete with progress tracking
+        print(f"Waiting for {len(tasks)} HTTP {request_description} to complete...")
+        with tqdm(
+            total=len(tasks), desc=f"Completing HTTP {request_description}", unit="req"
+        ) as completion_pbar:
+            completed_tasks = []
+            for task in asyncio.as_completed(tasks):
+                await task
+                completed_tasks.append(task)
+                completion_pbar.update(1)
+
+        # Send STOP_PROFILE if profiling is enabled
+        if config.profile:
+            await send_profile_request("STOP_PROFILE", http_url, session=session)
+
+    completion_end_time = asyncio.get_event_loop().time()
+    total_duration = completion_end_time - send_start_time
+
+    return await process_results(
+        results_queue,
+        num_requests,
+        send_duration,
+        total_duration,
+        rps,
+        duration_secs,
+        item_count,
+        send_start_time,
+        config,
+        api_name,
+    )
diff --git a/benchmark/react/README.md b/benchmark/react/README.md
new file mode 100644
index 000000000..51bcaa44a
--- /dev/null
+++ b/benchmark/react/README.md
@@ -0,0 +1,34 @@
+## Run benchmark
+
+NOTE: This is an implementation for replaying a given trace for throughput/latency benchmark purposes. It is not an actual ReAct agent implementation.
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 100
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-questions 100 --backend vllm
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --num-questions 100 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+```
+python3 bench_other.py --num-questions 100 --backend lmql --parallel 1
+```
diff --git a/benchmark/react/bench_other.py b/benchmark/react/bench_other.py
new file mode 100644
index 000000000..08666662b
--- /dev/null
+++ b/benchmark/react/bench_other.py
@@ -0,0 +1,202 @@
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+
+def get_prompt(question):
+    prompt = (
+        """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+Here are some examples.
+Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
+Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
+Action 1: Search[Colorado orogeny]
+Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
+Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.
+Action 2: Lookup[eastern sector]
+Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.
+Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.
+Action 3: Search[High Plains]
+Observation 3: High Plains refers to one of two distinct land regions:
+Thought 4: I need to instead search High Plains (United States).
+Action 4: Search[High Plains (United States)]
+Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]
+Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.
+Action 5: Finish[1,800 to 7,000 ft]
+Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
+Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.
+Action 1: Search[Milhouse]
+Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
+Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
+Action 2: Lookup[named after]
+Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
+Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
+Action 3: Finish[Richard Nixon]
+Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
+Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.
+Action 1: Search[Adam Clayton Powell]
+Observation 1: Could not find [Adam Clayton Powell]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].
+Thought 2: To find the documentary, I can search Adam Clayton Powell (film).
+Action 2: Search[Adam Clayton Powell (film)]
+Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.
+The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.
+Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.
+Action 3: Finish[The Saimaa Gesture]
+Question: What profession does Nicholas Ray and Elia Kazan have in common?
+Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.
+Action 1: Search[Nicholas Ray]
+Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.
+Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.
+Action 2: Search[Elia Kazan]
+Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.
+Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.
+Action 3: Finish[director, screenwriter, actor]
+Question: Which magazine was started first Arthur's Magazine or First for Women?
+Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur's Magazine]
+Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
+Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
+Action 2: Search[First for Women]
+Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
+Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
+Action 3: Finish[Arthur's Magazine]
+Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
+Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.
+Action 1: Search[Pavel Urysohn]
+Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 â August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
+Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
+Action 2: Search[Leonid Levin]
+Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
+Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
+Action 3: Finish[yes]
+"""
+        + question
+    )
+    return prompt
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_questions]
+    arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
+
+    states = []
+
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    def run_single_agent(argument):
+        question = argument["question"]
+        triplets = argument["triplets"]
+        prompt = get_prompt(question)
+        for i in range(1, len(triplets) + 2):
+            prompt += "Thought " + str(i) + ":"
+            states.append(prompt)
+            answer = call_generate(
+                prompt, max_tokens=200, temperature=0, stop="Observation"
+            )
+            if i > len(triplets):
+                break
+            prompt += (
+                triplets[i - 1]["thought"]
+                + "\nAction "
+                + str(i)
+                + ":"
+                + triplets[i - 1]["action"]
+                + "\nObservation "
+                + str(i)
+                + ":"
+                + triplets[i - 1]["observation"]
+                + "\n"
+            )
+
+            states.append(answer)
+
+    async def run_single_agent_async(argument):
+        question = argument["question"]
+        triplets = argument["triplets"]
+        prompt = get_prompt(question)
+        for i in range(1, len(triplets) + 2):
+            prompt += "Thought " + str(i) + ":"
+            states.append(prompt)
+            answer = await call_generate(
+                prompt, max_tokens=200, temperature=0, stop="Observation", max_len=4096
+            )
+            if i > len(triplets):
+                break
+            prompt += (
+                triplets[i - 1]["thought"]
+                + "\nAction "
+                + str(i)
+                + ":"
+                + triplets[i - 1]["action"]
+                + "\nObservation "
+                + str(i)
+                + ":"
+                + triplets[i - 1]["observation"]
+                + "\n"
+            )
+
+            states.append(answer)
+
+    tic = time.perf_counter()
+
+    if args.backend != "lmql":
+        if args.parallel == 1:
+            for arg in tqdm(arguments):
+                run_single_agent(arg)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(run_single_agent, arguments), total=len(arguments)
+                    )
+                )
+
+    else:
+        import asyncio
+
+        loop = asyncio.get_event_loop()
+        batches = [
+            [] for _ in range((len(arguments) + args.parallel - 1) // args.parallel)
+        ]
+        for i, arg in enumerate(arguments):
+            batches[i // args.parallel].append(arg)
+        for bt in tqdm(batches):
+            tasks = [run_single_agent_async(arg) for arg in bt]
+            loop.run_until_complete(asyncio.gather(*tasks))
+
+    latency = time.perf_counter() - tic
+
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "ReAct Agents",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": len(arguments),
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="hotpotqa_100.jsonl")
+    parser.add_argument("--num-questions", type=int, default=10)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/react/bench_sglang.py b/benchmark/react/bench_sglang.py
new file mode 100644
index 000000000..331638e9f
--- /dev/null
+++ b/benchmark/react/bench_sglang.py
@@ -0,0 +1,153 @@
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+
+@sgl.function
+def webthink(s, question, triplets):
+    s += (
+        """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
+(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
+(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
+(3) Finish[answer], which returns the answer and finishes the task.
+Here are some examples.
+Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
+Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
+Action 1: Search[Colorado orogeny]
+Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
+Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.
+Action 2: Lookup[eastern sector]
+Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.
+Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.
+Action 3: Search[High Plains]
+Observation 3: High Plains refers to one of two distinct land regions:
+Thought 4: I need to instead search High Plains (United States).
+Action 4: Search[High Plains (United States)]
+Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]
+Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.
+Action 5: Finish[1,800 to 7,000 ft]
+Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
+Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.
+Action 1: Search[Milhouse]
+Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
+Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
+Action 2: Lookup[named after]
+Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
+Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
+Action 3: Finish[Richard Nixon]
+Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
+Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.
+Action 1: Search[Adam Clayton Powell]
+Observation 1: Could not find [Adam Clayton Powell]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].
+Thought 2: To find the documentary, I can search Adam Clayton Powell (film).
+Action 2: Search[Adam Clayton Powell (film)]
+Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.
+The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.
+Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.
+Action 3: Finish[The Saimaa Gesture]
+Question: What profession does Nicholas Ray and Elia Kazan have in common?
+Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.
+Action 1: Search[Nicholas Ray]
+Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.
+Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.
+Action 2: Search[Elia Kazan]
+Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.
+Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.
+Action 3: Finish[director, screenwriter, actor]
+Question: Which magazine was started first Arthur's Magazine or First for Women?
+Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
+Action 1: Search[Arthur's Magazine]
+Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
+Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
+Action 2: Search[First for Women]
+Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
+Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
+Action 3: Finish[Arthur's Magazine]
+Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
+Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.
+Action 1: Search[Pavel Urysohn]
+Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 â August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
+Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
+Action 2: Search[Leonid Levin]
+Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
+Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
+Action 3: Finish[yes]
+"""
+        + question
+    )
+    for i in range(1, len(triplets) + 2):
+        s += "Thought " + str(i) + ":"
+        # NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
+        ss = s.fork(1)
+        ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
+        ss.join()
+        # to verify the correctness of output, this should be collected
+        # print(ss[0]["thought_action"])
+        if i > len(triplets):
+            break
+        s += (
+            triplets[i - 1]["thought"]
+            + "\nAction "
+            + str(i)
+            + ":"
+            + triplets[i - 1]["action"]
+            + "\nObservation "
+            + str(i)
+            + ":"
+            + triplets[i - 1]["observation"]
+            + "\n"
+        )
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_questions]
+    arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    states = []
+    tic = time.perf_counter()
+    states = webthink.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "ReAct Agents",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": len(arguments),
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="hotpotqa_100.jsonl")
+    parser.add_argument("--num-questions", type=int, default=10)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/reasoning_benchmark/README.md b/benchmark/reasoning_benchmark/README.md
new file mode 100644
index 000000000..f9d26e0c7
--- /dev/null
+++ b/benchmark/reasoning_benchmark/README.md
@@ -0,0 +1,77 @@
+# Run benchmark
+
+This benchmark is primarily intended to be used with reasoning models like `DeepSeek-R1` and its distilled models like `DeepSeek-R1-Distill-Qwen-1.5B`. Please use
+
+```bash
+pip install antlr4-python3-runtime
+```
+
+for `parse_latex` which we use for symbolic equality check.
+
+## Benchmark sglang
+
+1. Launch the Server
+```bash
+python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --port 30000
+```
+
+Note that depending on the GPU this benchmark will take quiet some time. To employ data parallelism please use:
+
+```bash
+python3 -m sglang_router.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --port 30000 --dp-size 4
+```
+
+
+2. Benchmarking
+
+We use [suggested](https://github.com/deepseek-ai/DeepSeek-R1) parameters of `temperature=0.6`, `top_p=.95`, `max_new_tokens=32768`. The command line argument `num-tries` can be used to evaluate the model multiple times on the same question. We use the suggested `64` from the repo for AIME 2024. For LIMO, we use `8` as the number of tries due to the size of the dataset.
+
+By default evaluate on LIMO dataset.
+
+```bash
+python3 bench_sglang.py --parallel 256 --num-tries 64 --port 30000
+```
+
+Evaluate on AIME 2024 dataset.
+
+```bash
+python3 bench_sglang.py --parallel 256 --port 30000 --data-path Maxwell-Jia/AIME_2024 --question-key Problem --answer-key Answer --num-tries 64
+```
+
+Evaluate on [AIME 2025 I dataset](https://huggingface.co/datasets/opencompass/AIME2025). For benchmark result see [here](https://matharena.ai/).
+
+```bash
+python3 bench_sglang.py --parallel 256 --port 30000 --data-path opencompass/AIME2025 --question-key question --answer-key answer --num-tries 64
+```
+## Results
+
+### Evaluation Results
+| Dataset    | Num Tries | Accuracy | Reference | Standard Error |
+|------------|-----------|----------|-----------|-----------|
+| LIMO       | 8         | 47.7%    | ?         | ?         |
+| AIME 2024  | 64        | 33.2%    | 28.9%     | 3.4%       |
+| AIME 2025 I| 64        | 29.9%    | 25.0%     |  ?        |
+
+### Statistic Analysis Results
+Set up SGLang engine for statistic analysis, for high efficiency we use `--dp-size 8` for data parallelism:
+```bash
+python3 -m sglang_router.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --port 30000 --dp-size 8
+```
+**Experiment 1**:
+We fixed the number of attempts (num_tries) and conducted multiple runs to assess the consistency of the model's performance. The results show that all recorded accuracies lie within ± one standard error deviation from the mean. This suggests that **our metric serves as an effective upper bound for the deviation of reported accuracy**.
+
+To collect the accuracy, run the following command 30 times:
+```bash
+python3 bench_sglang.py --parallel 64 --port 30000 --data-path Maxwell-Jia/AIME_2024 --question-key Problem --answer-key Answer --num-tries 64
+```
+
+![acc_hist](figure/Acc_histplot.png)
+
+
+**Experiment 2**: We explored the relationship between the number of attempts (num_tries) and the standard error (SE) by varying num_tries across a range (e.g., 8, 16, 32, ..., 256) and performing a single run for each value. The results demonstrate that as the number of attempts increases, the standard error decreases, leading to **greater stability in answer accuracy**.
+
+To reveal the relationship, run the command 6 times and adjust the parameter `--num-tries` for each run:
+```bash
+python3 bench_sglang.py --parallel 64 --port 30000 --data-path Maxwell-Jia/AIME_2024 --question-key Problem --answer-key Answer --num-tries <num_tries>
+```
+![SE_num_tries](figure/SE_numtries.png)
diff --git a/benchmark/reasoning_benchmark/answer_extraction.py b/benchmark/reasoning_benchmark/answer_extraction.py
new file mode 100644
index 000000000..45ce59afb
--- /dev/null
+++ b/benchmark/reasoning_benchmark/answer_extraction.py
@@ -0,0 +1,269 @@
+# Adapted from https://github.com/deepseek-ai/DeepSeek-Math/blob/main/evaluation/data_processing/answer_extraction.py
+
+import re
+
+import regex
+
+
+def _fix_fracs(string):
+    substrs = string.split("\\frac")
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += "\\frac"
+            if len(substr) > 0 and substr[0] == "{":
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != "{":
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}{" + b + "}" + post_substr
+                    else:
+                        new_str += "{" + a + "}{" + b + "}"
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += "{" + a + "}" + b + post_substr
+                    else:
+                        new_str += "{" + a + "}" + b
+    string = new_str
+    return string
+
+
+def _fix_a_slash_b(string):
+    if len(string.split("/")) != 2:
+        return string
+    a = string.split("/")[0]
+    b = string.split("/")[1]
+    try:
+        if "sqrt" not in a:
+            a = int(a)
+        if "sqrt" not in b:
+            b = int(b)
+        assert string == "{}/{}".format(a, b)
+        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+        return new_string
+    except:
+        return string
+
+
+def _fix_sqrt(string):
+    _string = re.sub(r"\\sqrt(-?[0-9.a-zA-Z]+)", r"\\sqrt{\1}", string)
+    _string = re.sub(r"\\sqrt\s+(\w+)$", r"\\sqrt{\1}", _string)
+    return _string
+
+
+def _fix_tan(string):
+    _string = re.sub(r"\\tan(-?[0-9.a-zA-Z]+)", r"\\tan{\1}", string)
+    _string = re.sub(r"\\tan\s+(\w+)$", r"\\tan{\1}", _string)
+    return _string
+
+
+def strip_string(string):
+    string = str(string).strip()
+    # linebreaks
+    string = string.replace("\n", "")
+
+    # right "."
+    string = string.rstrip(".")
+
+    # remove inverse spaces
+    string = string.replace("\\!", "")
+    # string = string.replace("\\ ", "")
+
+    # replace \\ with \
+    # string = string.replace("\\\\", "\\")
+    # string = string.replace("\\\\", "\\")
+
+    if string.startswith("\\text{") and string.endswith("}"):
+        string = string.split("{", 1)[1][:-1]
+
+    # replace tfrac and dfrac with frac
+    string = string.replace("tfrac", "frac")
+    string = string.replace("dfrac", "frac")
+    string = string.replace("cfrac", "frac")
+
+    # remove \left and \right
+    string = string.replace("\\left", "")
+    string = string.replace("\\right", "")
+
+    # Remove unit: miles, dollars if after is not none
+    _string = re.sub(r"\\text{.*?}$", "", string).strip()
+    if _string != "" and _string != string:
+        # print("Warning: unit not removed: '{}' -> '{}'".format(string, _string))
+        string = _string
+
+    # Remove circ (degrees)
+    string = string.replace("^{\\circ}", "").strip()
+    string = string.replace("^\\circ", "").strip()
+
+    string = regex.sub(r"\{(c|m)?m\}(\^(2|3))?", "", string).strip()
+    string = regex.sub(r"p\.m\.$", "", string).strip()
+    string = regex.sub(r"(\d)\s*t$", r"\1", string).strip()
+
+    # remove dollar signs
+    string = string.replace("\\$", "")
+    string = string.replace("$", "")
+
+    # string = string.replace("\\text", "")
+    string = string.replace("x\\in", "")
+
+    # remove percentage
+    string = string.replace("\\%", "%")
+    string = string.replace("\%", "%")
+    # string = string.replace("%", "")
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(" .", " 0.")
+    string = string.replace("{.", "{0.")
+
+    # cdot
+    string = string.replace("\\cdot", "")
+
+    # inf
+    string = string.replace("infinity", "\\infty")
+    if "\\infty" not in string:
+        string = string.replace("inf", "\\infty")
+    string = string.replace("+\\inity", "\\infty")
+
+    # and
+    # string = string.replace("and", "")
+    string = string.replace("\\mathbf", "")
+    string = string.replace("\\mathrm", "")
+
+    # use regex to remove \mbox{...}
+    string = re.sub(r"\\mbox{.*?}", "", string)
+
+    # quote
+    string.replace("'", "")
+    string.replace('"', "")
+
+    # i, j
+    if "j" in string and "i" not in string:
+        string = string.replace("j", "i")
+
+    # replace a.000b where b is not number or b is end, with ab, use regex
+    string = re.sub(r"(\d+)\.0+([^\d])", r"\1\2", string)
+    string = re.sub(r"(\d+)\.0+$", r"\1", string)
+
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == ".":
+        string = "0" + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    # if len(string.split("=")) == 2:
+    #     if len(string.split("=")[0]) <= 2:
+    #         string = string.split("=")[1]
+
+    string = _fix_sqrt(string)
+    string = _fix_tan(string)
+    string = string.replace(" ", "")
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = _fix_fracs(string)
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+
+    string = regex.sub(r"(\\|,|\.)+$", "", string)
+
+    return string
+
+
+def extract_boxed_answers(text):
+    answers = []
+    for piece in text.split("boxed{")[1:]:
+        n = 0
+        for i in range(len(piece)):
+            if piece[i] == "{":
+                n += 1
+            elif piece[i] == "}":
+                n -= 1
+                if n < 0:
+                    if i + 1 < len(piece) and piece[i + 1] == "%":
+                        answers.append(piece[: i + 1])
+                    else:
+                        answers.append(piece[:i])
+                    break
+    return answers
+
+
+def extract_program_output(pred_str):
+    """
+    extract output between the last ```output\n...\n```
+    """
+    if "```output" not in pred_str:
+        return ""
+    if "```output" in pred_str:
+        pred_str = pred_str.split("```output")[-1]
+    if "```" in pred_str:
+        pred_str = pred_str.split("```")[0]
+    output = pred_str.strip()
+    return output
+
+
+def extract_answer(pred_str, exhaust=False):
+    pred = []
+    if "final answer is $" in pred_str and "$. I hope" in pred_str:
+        tmp = pred_str.split("final answer is $", 1)[1]
+        pred = [tmp.split("$. I hope", 1)[0].strip()]
+    elif "boxed" in pred_str:
+        pred = extract_boxed_answers(pred_str)
+    elif "he answer is" in pred_str:
+        pred = [pred_str.split("he answer is")[-1].strip()]
+    else:
+        program_output = extract_program_output(pred_str)
+        if program_output != "":
+            # fall back to program
+            pred.append(program_output)
+        else:  # use the last number
+            pattern = "-?\d*\.?\d+"
+            ans = re.findall(pattern, pred_str.replace(",", ""))
+            if len(ans) >= 1:
+                ans = ans[-1]
+            else:
+                ans = ""
+            if ans:
+                pred.append(ans)
+
+    # multiple line
+    _pred = []
+    for ans in pred:
+        ans = ans.strip().split("\n")[0]
+        ans = ans.lstrip(":")
+        ans = ans.rstrip(".")
+        ans = ans.rstrip("/")
+        ans = strip_string(ans)
+        _pred.append(ans)
+    if exhaust:
+        return _pred
+    else:
+        return _pred[-1] if _pred else ""
+
+
+def extract_math_answer(question, reasoning, task):
+    answer = []
+    for ans in extract_answer(reasoning, exhaust=True):
+        if "separated by commas" in question and all(ch not in ans for ch in "()[]"):
+            answer.extend([a.strip() for a in ans.split(",")])
+        elif regex.search(r"\\text\{\s*and\s*\}", ans):
+            answer.extend(
+                [
+                    a.strip()
+                    for a in regex.sub(r"\\text\{\s*and\s*\}", "[SEP]", ans).split(
+                        "[SEP]"
+                    )
+                ]
+            )
+        else:
+            answer.append(ans.strip())
+    return answer
diff --git a/benchmark/reasoning_benchmark/bench_sglang.py b/benchmark/reasoning_benchmark/bench_sglang.py
new file mode 100644
index 000000000..ccbff9d17
--- /dev/null
+++ b/benchmark/reasoning_benchmark/bench_sglang.py
@@ -0,0 +1,135 @@
+import argparse
+import json
+import time
+
+import answer_extraction
+import eval_utils
+import numpy as np
+from datasets import load_dataset
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text
+
+
+@sgl.function
+def reasoning_gen(s, question: str):
+    s += sgl.user(
+        question
+        + "\nPlease reason step by step, and put your final answer within \boxed{}."
+    )
+    s += sgl.assistant(
+        sgl.gen(
+            "answer",
+        )
+    )
+
+
+def convert_dataset(path: str, question_key: str, answer_key: str, num_tries: int):
+    raw_dataset = load_dataset(path)
+    questions = []
+    answers = []
+    for data in raw_dataset["train"]:
+        question = data[question_key]
+        answer = data[answer_key]
+        for _ in range(num_tries):
+            questions.append({"question": question})
+            answers.append({"answer": answer})
+    return questions, answers
+
+
+def main(args):
+    # Select backend
+    sgl.set_default_backend(select_sglang_backend(args))
+
+    # Get dataset
+    questions, answers = convert_dataset(
+        args.data_path, args.question_key, args.answer_key, args.num_tries
+    )
+
+    # Run requests
+    tic = time.perf_counter()
+    states = reasoning_gen.run_batch(
+        questions,
+        num_threads=args.parallel,
+        progress_bar=True,
+        temperature=0.6,
+        max_new_tokens=32768,
+        top_p=0.95,
+    )
+    latency = time.perf_counter() - tic
+
+    # Extract results and record outcomes in a list.
+    outcomes = []
+    for i, state in enumerate(states):
+        try:
+            pred_answer = answer_extraction.extract_math_answer(
+                questions[i]["question"], state["answer"], "limo"
+            )
+            gt_answer = str(answers[i]["answer"])
+            pred_answer = (
+                pred_answer[-1] if isinstance(pred_answer, list) else pred_answer
+            )
+            is_correct = 1 if eval_utils.math_equal(pred_answer, gt_answer) else 0
+        except Exception as e:
+            print(f"Error extracting answer: {e}")
+            is_correct = 0
+
+        outcomes.append(is_correct)
+
+    # Calculate overall accuracy using numpy
+    overall_accuracy = np.mean(outcomes)
+    print(f"Overall Accuracy: {overall_accuracy}")
+
+    # Calculate mean standard error over questions if num_tries >= 2
+    if args.num_tries > 1:
+        outcomes_np = np.array(outcomes).reshape(-1, args.num_tries)
+        # Using sample standard deviation with ddof=1
+        std_per_question = np.std(outcomes_np, axis=1, ddof=1)
+        # Compute the standard error for each question: std / sqrt(num_tries)
+        se_per_question = std_per_question / np.sqrt(args.num_tries)
+        mean_se = se_per_question.mean()
+        print(f"Mean Standard Error of Accuracy across questions: {mean_se}")
+    else:
+        mean_se = None
+        print("Not enough samples per question to compute standard error.")
+
+    # Calculate output throughput
+    num_output_tokens = sum(
+        s.get_meta_info("answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+    print(f"Output throughput: {output_throughput} token/s")
+
+    # Dump results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    # Write results
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "limo",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "overall_accuracy": round(overall_accuracy, 3),
+            "mean_se_accuracy": round(mean_se, 3) if mean_se is not None else None,
+            "num_requests": len(questions),
+            "other": {
+                "num_questions": len(questions),
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="GAIR/LIMO")
+    parser.add_argument("--question-key", type=str, default="question")
+    parser.add_argument("--answer-key", type=str, default="answer")
+    parser.add_argument("--num-tries", type=int, default=1)
+    add_common_sglang_args_and_parse(parser)
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmark/reasoning_benchmark/eval_utils.py b/benchmark/reasoning_benchmark/eval_utils.py
new file mode 100644
index 000000000..ab736954f
--- /dev/null
+++ b/benchmark/reasoning_benchmark/eval_utils.py
@@ -0,0 +1,206 @@
+# Adapted from https://github.com/deepseek-ai/DeepSeek-Math/blob/main/evaluation/eval/eval_utils.py
+
+from math import isclose
+
+import regex
+from sympy import N, simplify
+from sympy.parsing.latex import parse_latex
+from sympy.parsing.sympy_parser import parse_expr
+
+
+def parse_digits(num):
+    # format: 234.23 || 23%
+    num = regex.sub(",", "", str(num))
+    try:
+        return float(num)
+    except:
+        if num.endswith("%"):
+            num = num[:-1]
+            if num.endswith("\\"):
+                num = num[:-1]
+            try:
+                return float(num) / 100
+            except:
+                pass
+    return None
+
+
+def is_digit(num):
+    # paired with parse_digits
+    return parse_digits(num) is not None
+
+
+def symbolic_equal(a, b):
+    def _parse(s):
+        for f in [parse_latex, parse_expr]:
+            try:
+                return f(s)
+            except:
+                pass
+        return s
+
+    a = _parse(a)
+    b = _parse(b)
+
+    try:
+        if simplify(a - b) == 0:
+            return True
+    except:
+        pass
+
+    try:
+        if isclose(N(a), N(b), abs_tol=1e-3):
+            return True
+    except:
+        pass
+    return False
+
+
+def math_equal(prediction, reference, include_percentage=True, is_close=True):
+    """
+    Exact match of math if and only if:
+    1. numerical equal: both can convert to float and are equal
+    2. symbolic equal: both can convert to sympy expression and are equal
+    """
+    if str(prediction) == str(reference):
+        return True
+
+    try:  # 1. numerical equal
+        if is_digit(prediction) and is_digit(reference):
+            prediction = parse_digits(prediction)
+            reference = parse_digits(reference)
+            # number questions
+            if include_percentage:
+                gt_result = [reference / 100, reference, reference * 100]
+            else:
+                gt_result = [reference]
+            for item in gt_result:
+                try:
+                    if is_close:
+                        if isclose(item, prediction, abs_tol=1e-3):
+                            return True
+                    else:
+                        if item == prediction:
+                            return True
+                except Exception:
+                    continue
+            return False
+    except:
+        pass
+
+    if not prediction and prediction not in [0, False]:
+        return False
+
+    # 2. symbolic equal
+    reference = str(reference).strip()
+    prediction = str(prediction).strip()
+
+    if (
+        regex.match(r"(\(|\[).+(\)|\])", prediction) is not None
+        and regex.match(r"(\(|\[).+(\)|\])", reference) is not None
+    ):
+        pred_parts = prediction[1:-1].split(",")
+        ref_parts = reference[1:-1].split(",")
+        if len(pred_parts) == len(ref_parts):
+            if all(
+                [
+                    math_equal(
+                        pred_parts[i], ref_parts[i], include_percentage, is_close
+                    )
+                    for i in range(len(pred_parts))
+                ]
+            ):
+                return True
+
+    # Add back matrix comparison
+    if (
+        (
+            prediction.startswith("\\begin{pmatrix}")
+            or prediction.startswith("\\begin{bmatrix}")
+        )
+        and (
+            prediction.endswith("\\end{pmatrix}")
+            or prediction.endswith("\\end{bmatrix}")
+        )
+        and (
+            reference.startswith("\\begin{pmatrix}")
+            or reference.startswith("\\begin{bmatrix}")
+        )
+        and (
+            reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")
+        )
+    ):
+        pred_lines = [
+            line.strip()
+            for line in prediction[
+                len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
+            ].split("\\\\")
+            if line.strip()
+        ]
+        ref_lines = [
+            line.strip()
+            for line in reference[
+                len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
+            ].split("\\\\")
+            if line.strip()
+        ]
+        matched = True
+        if len(pred_lines) == len(ref_lines):
+            for pred_line, ref_line in zip(pred_lines, ref_lines):
+                pred_parts = pred_line.split("&")
+                ref_parts = ref_line.split("&")
+                if len(pred_parts) == len(ref_parts):
+                    if not all(
+                        [
+                            math_equal(
+                                pred_parts[i],
+                                ref_parts[i],
+                                include_percentage,
+                                is_close,
+                            )
+                            for i in range(len(pred_parts))
+                        ]
+                    ):
+                        matched = False
+                        break
+                else:
+                    matched = False
+                if not matched:
+                    break
+        else:
+            matched = False
+        if matched:
+            return True
+
+    # Add back equation comparison
+    if prediction.count("=") == 1 and reference.count("=") == 1:
+        pred = prediction.split("=")
+        pred = f"{pred[0].strip()} - ({pred[1].strip()})"
+        ref = reference.split("=")
+        ref = f"{ref[0].strip()} - ({ref[1].strip()})"
+        if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):
+            return True
+    elif (
+        prediction.count("=") == 1
+        and len(prediction.split("=")[0].strip()) <= 2
+        and "=" not in reference
+    ):
+        if math_equal(
+            prediction.split("=")[1], reference, include_percentage, is_close
+        ):
+            return True
+    elif (
+        reference.count("=") == 1
+        and len(reference.split("=")[0].strip()) <= 2
+        and "=" not in prediction
+    ):
+        if math_equal(
+            prediction, reference.split("=")[1], include_percentage, is_close
+        ):
+            return True
+
+    # symbolic equal with sympy
+    if symbolic_equal(prediction, reference):
+        return True
+
+    return False
diff --git a/benchmark/tip_suggestion/.gitignore b/benchmark/tip_suggestion/.gitignore
new file mode 100644
index 000000000..3322de9b9
--- /dev/null
+++ b/benchmark/tip_suggestion/.gitignore
@@ -0,0 +1 @@
+!topic.jsonl
diff --git a/benchmark/tip_suggestion/README.md b/benchmark/tip_suggestion/README.md
new file mode 100644
index 000000000..97ebe1d82
--- /dev/null
+++ b/benchmark/tip_suggestion/README.md
@@ -0,0 +1,33 @@
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 64
+python3 bench_sglang.py --num-questions 32 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --backend vllm --num-questions 64
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --backend guidance --num-questions 32 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+```
+python3 bench_other.py --backend lmql --num-questions 32 --parallel 1
+```
diff --git a/benchmark/tip_suggestion/bench_other.py b/benchmark/tip_suggestion/bench_other.py
new file mode 100644
index 000000000..2630081bd
--- /dev/null
+++ b/benchmark/tip_suggestion/bench_other.py
@@ -0,0 +1,133 @@
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+number = 5
+
+
+def expand_tip(topic, tip, generate):
+    s = (
+        """Please expand a tip for a topic into a detailed paragraph.
+
+Topic: staying healthy
+Tip: Regular Exercise
+Paragraph: Incorporate physical activity into your daily routine. This doesn't necessarily mean intense gym workouts; it can be as simple as walking, cycling, or yoga. Regular exercise helps in maintaining a healthy weight, improves cardiovascular health, boosts mental health, and can enhance cognitive function, which is crucial for fields that require intense intellectual engagement.
+
+Topic: building a campfire
+Tip: Choose the Right Location
+Paragraph: Always build your campfire in a safe spot. This means selecting a location that's away from trees, bushes, and other flammable materials. Ideally, use a fire ring if available. If you're building a fire pit, it should be on bare soil or on a bed of stones, not on grass or near roots which can catch fire underground. Make sure the area above is clear of low-hanging branches.
+
+Topic: writing a blog post
+Tip: structure your content effectively
+Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
+
+Topic: """
+        + topic
+        + "\nTip: "
+        + tip
+        + "\nParagraph:"
+    )
+    return generate(s, max_tokens=128, stop=["\n\n"])
+
+
+def suggest_tips(topic, generate):
+    s = "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
+    s += "USER: Give some tips for " + topic + ".\n"
+    s += (
+        "ASSISTANT: Okay. Here are "
+        + str(number)
+        + " concise tips, each under 8 words:\n"
+    )
+
+    tips = []
+    for i in range(1, 1 + number):
+        s += f"{i}."
+        tip = generate(s, max_tokens=24, stop=[".", "\n"])
+        s += tip + ".\n"
+        tips.append(tip)
+
+    paragraphs = [expand_tip(topic, tip, generate=generate) for tip in tips]
+
+    for i in range(1, 1 + number):
+        s += f"Tip {i}:" + paragraphs[i - 1] + "\n"
+    return s
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_questions]
+    states = [None] * len(lines)
+
+    # Select backend
+    call_generate = partial(get_call_generate(args), temperature=0)
+
+    # Run requests
+    tic = time.perf_counter()
+    if args.backend != "lmql":
+
+        def get_one_answer(i):
+            states[i] = suggest_tips(lines[i]["topic"], call_generate)
+
+        if args.parallel == 1:
+            for i in tqdm(range(len(lines))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(lines)))),
+                        total=len(lines),
+                    )
+                )
+
+    else:
+        import asyncio
+
+        from lmql_funcs import suggest_tips_async
+
+        async def get_one_answer_async(i):
+            states[i] = await suggest_tips_async(lines[i]["topic"], call_generate)
+
+        batches = []
+        for i in range(0, len(lines), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(lines)))))
+        loop = asyncio.get_event_loop()
+        for batch in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in batch])
+            )
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "tip_suggestion",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="topic.jsonl")
+    parser.add_argument("--num-questions", type=int, default=100)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/tip_suggestion/bench_sglang.py b/benchmark/tip_suggestion/bench_sglang.py
new file mode 100644
index 000000000..86c476f97
--- /dev/null
+++ b/benchmark/tip_suggestion/bench_sglang.py
@@ -0,0 +1,100 @@
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+number = 5
+
+
+@sgl.function
+def expand_tip(s, topic, tip):
+    s += (
+        """Please expand a tip for a topic into a detailed paragraph.
+
+Topic: staying healthy
+Tip: Regular Exercise
+Paragraph: Incorporate physical activity into your daily routine. This doesn't necessarily mean intense gym workouts; it can be as simple as walking, cycling, or yoga. Regular exercise helps in maintaining a healthy weight, improves cardiovascular health, boosts mental health, and can enhance cognitive function, which is crucial for fields that require intense intellectual engagement.
+
+Topic: building a campfire
+Tip: Choose the Right Location
+Paragraph: Always build your campfire in a safe spot. This means selecting a location that's away from trees, bushes, and other flammable materials. Ideally, use a fire ring if available. If you're building a fire pit, it should be on bare soil or on a bed of stones, not on grass or near roots which can catch fire underground. Make sure the area above is clear of low-hanging branches.
+
+Topic: writing a blog post
+Tip: structure your content effectively
+Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
+
+Topic: """
+        + topic
+        + "\nTip: "
+        + tip
+        + "\nParagraph:"
+    )
+    s += sgl.gen("paragraph", max_tokens=128, stop=["\n\n"], temperature=0)
+
+
+@sgl.function
+def suggest_tips(s, topic):
+    s += "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
+    s += "USER: Give some tips for " + topic + ".\n"
+    s += (
+        "ASSISTANT: Okay. Here are "
+        + str(number)
+        + " concise tips, each under 8 words:\n"
+    )
+
+    paragraphs = []
+    for i in range(1, 1 + number):
+        s += f"{i}." + sgl.gen(f"tip_{i}", max_tokens=24, stop=[".", "\n"]) + ".\n"
+        paragraphs.append(expand_tip(topic=topic, tip=s[f"tip_{i}"]))
+
+    for i in range(1, 1 + number):
+        s += f"Tip {i}:" + paragraphs[i - 1]["paragraph"] + "\n"
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)[: args.num_questions]
+    arguments = [{"topic": l["topic"]} for l in lines]
+
+    # Select backend
+    sgl.set_default_backend(select_sglang_backend(args))
+
+    # Run requests
+    tic = time.perf_counter()
+    states = suggest_tips.run_batch(
+        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
+    )
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "tip_suggestion",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="topic.jsonl")
+    parser.add_argument("--num-questions", type=int, default=100)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/tip_suggestion/lmql_funcs.py b/benchmark/tip_suggestion/lmql_funcs.py
new file mode 100644
index 000000000..7790bbe95
--- /dev/null
+++ b/benchmark/tip_suggestion/lmql_funcs.py
@@ -0,0 +1,50 @@
+number = 5
+
+
+async def expand_tip_async(topic, tip, generate):
+    s = (
+        """Please expand a tip for a topic into a detailed paragraph.
+
+Topic: staying healthy
+Tip: Regular Exercise
+Paragraph: Incorporate physical activity into your daily routine. This doesn't necessarily mean intense gym workouts; it can be as simple as walking, cycling, or yoga. Regular exercise helps in maintaining a healthy weight, improves cardiovascular health, boosts mental health, and can enhance cognitive function, which is crucial for fields that require intense intellectual engagement.
+
+Topic: building a campfire
+Tip: Choose the Right Location
+Paragraph: Always build your campfire in a safe spot. This means selecting a location that's away from trees, bushes, and other flammable materials. Ideally, use a fire ring if available. If you're building a fire pit, it should be on bare soil or on a bed of stones, not on grass or near roots which can catch fire underground. Make sure the area above is clear of low-hanging branches.
+
+Topic: writing a blog post
+Tip: structure your content effectively
+Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
+
+Topic: """
+        + topic
+        + "\nTip: "
+        + tip
+        + "\nParagraph:"
+    )
+    return await generate(s, max_tokens=128, stop="\n\n")
+
+
+async def suggest_tips_async(topic, generate):
+    s = "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
+    s += "USER: Give some tips for " + topic + ".\n"
+    s += (
+        "ASSISTANT: Okay. Here are "
+        + str(number)
+        + " concise tips, each under 8 words:\n"
+    )
+
+    tips = []
+    for i in range(1, 1 + number):
+        s += f"{i}."
+        # NOTE: stop is different due to lmql does not support a list of stop tokens
+        tip = await generate(s, max_tokens=24, stop=".\n")
+        s += tip + ".\n"
+        tips.append(tip)
+
+    paragraphs = [await expand_tip_async(topic, tip, generate=generate) for tip in tips]
+
+    for i in range(1, 1 + number):
+        s += f"Tip {i}:" + paragraphs[i - 1] + "\n"
+    return s
diff --git a/benchmark/tip_suggestion/topic.jsonl b/benchmark/tip_suggestion/topic.jsonl
new file mode 100644
index 000000000..c8ac6d12e
--- /dev/null
+++ b/benchmark/tip_suggestion/topic.jsonl
@@ -0,0 +1,50 @@
+{"topic": "organizing a successful charity event", "number": 6}
+{"topic": "improving personal credit scores", "number": 7}
+{"topic": "staying motivated during job searches", "number": 5}
+{"topic": "maintaining a work-life balance", "number": 9}
+{"topic": "reducing carbon footprint at home", "number": 8}
+{"topic": "starting a book club", "number": 5}
+{"topic": "learning to play a musical instrument", "number": 7}
+{"topic": "getting into freelance writing", "number": 6}
+{"topic": "beginner yoga poses", "number": 8}
+{"topic": "preparing for graduate school exams", "number": 5}
+{"topic": "exploring minimalist living", "number": 9}
+{"topic": "effective grocery shopping", "number": 7}
+{"topic": "winter camping", "number": 5}
+{"topic": "starting a podcast on a budget", "number": 8}
+{"topic": "creating a capsule wardrobe", "number": 6}
+{"topic": "improving your writing skills", "number": 7}
+{"topic": "learning a new software quickly", "number": 9}
+{"topic": "reducing anxiety before public speaking", "number": 5}
+{"topic": "planning a solo travel adventure", "number": 8}
+{"topic": "beginner skateboarders", "number": 6}
+{"topic": "studying abroad", "number": 7}
+{"topic": "planting a vegetable garden", "number": 5}
+{"topic": "adopting a shelter pet", "number": 9}
+{"topic": "learning to cook ethnic cuisines", "number": 8}
+{"topic": "effective conflict resolution", "number": 5}
+{"topic": "starting a vlog", "number": 7}
+{"topic": "keeping a daily journal", "number": 6}
+{"topic": "improving sleep hygiene", "number": 8}
+{"topic": "beginner mountain climbers", "number": 5}
+{"topic": "creating a mobile app", "number": 9}
+{"topic": "maintaining a saltwater aquarium", "number": 7}
+{"topic": "preparing for a baby's arrival", "number": 6}
+{"topic": "writing a fantasy novel", "number": 5}
+{"topic": "effective team leadership", "number": 8}
+{"topic": "making a documentary film", "number": 9}
+{"topic": "learning about historical events", "number": 7}
+{"topic": "baking gluten-free treats", "number": 6}
+{"topic": "improving mental arithmetic skills", "number": 5}
+{"topic": "building a treehouse", "number": 8}
+{"topic": "getting started with watercolor painting", "number": 9}
+{"topic": "creating a YouTube tutorial series", "number": 7}
+{"topic": "landscape photography", "number": 5}
+{"topic": "navigating cultural differences", "number": 6}
+{"topic": "preparing for a marathon", "number": 8}
+{"topic": "building an online business", "number": 9}
+{"topic": "learning to dance at home", "number": 5}
+{"topic": "self-publishing a book", "number": 7}
+{"topic": "starting an urban farm", "number": 6}
+{"topic": "improving your memory", "number": 8}
+{"topic": "creating a personal brand online", "number": 9}
diff --git a/benchmark/tree_of_thought_deep/README.md b/benchmark/tree_of_thought_deep/README.md
new file mode 100644
index 000000000..bf5ab1638
--- /dev/null
+++ b/benchmark/tree_of_thought_deep/README.md
@@ -0,0 +1,51 @@
+## Download data
+```
+wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+```
+
+## Run benchmark
+
+NOTE: This is an implementation for throughput/latency benchmark purposes. The prompts are not tuned to achieve good accuracy on the GSM-8K tasks.
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 32
+python3 bench_sglang.py --num-questions 16 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-questions 32 --backend vllm
+```
+
+
+### Benchmark lightllm
+```
+# A10G
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
+```
+
+```
+python3 bench_other.py --num-questions 32 --backend lightllm
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+```
+python3 bench_other.py --num-questions 8 --backend lmql --parallel 1
+```
diff --git a/benchmark/tree_of_thought_deep/bench_other.py b/benchmark/tree_of_thought_deep/bench_other.py
new file mode 100644
index 000000000..0ef8c6360
--- /dev/null
+++ b/benchmark/tree_of_thought_deep/bench_other.py
@@ -0,0 +1,222 @@
+import argparse
+import ast
+import json
+import re
+import time
+from collections import Counter
+from concurrent.futures import ThreadPoolExecutor
+
+import numpy as np
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def most_frequent_number(numbers):
+    if not numbers:
+        return None
+
+    frequency = Counter(numbers)
+    most_frequent = max(frequency, key=frequency.get)
+    return most_frequent
+
+
+USER_PREFIX = "[INST] "
+USER_SUFFIX = " [/INST]"
+ASSISTANT_PREFIX = ""
+ASSISTANT_SUFFIX = " </s><s>"
+
+# Use a low temp to make the results more deterministic and the comparison more fair.
+temp = 0.001
+
+
+def propose_plan(s, question, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+        + question
+        + USER_SUFFIX
+    )
+
+    s += ASSISTANT_PREFIX
+    comps = call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+def execute_plan(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+def reflect_solution(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+def get_final_answer(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+def tree_search(question, num_branches, call_generate):
+    plan_forks = propose_plan("", question, num_branches, call_generate)
+
+    sol_states = []
+    for plan in plan_forks:
+        forks = execute_plan(plan, num_branches, call_generate)
+        sol_states.extend(forks)
+
+    ref_states = []
+    for sol in sol_states:
+        forks = reflect_solution(sol, num_branches, call_generate)
+        ref_states.extend(forks)
+
+    solutions = []
+    for sol in ref_states:
+        ans = get_final_answer(sol, num_branches, call_generate)
+        solutions.append(ans)
+
+    return solutions
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+
+    # Construct prompts
+    num_branches = 2
+    questions = []
+    labels = []
+    for i in range(len(lines[: args.num_questions])):
+        questions.append(lines[i]["question"])
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q, "num_branches": num_branches} for q in questions]
+
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    # Run requests
+    states = [None] * len(questions)
+
+    tic = time.perf_counter()
+    if args.backend != "lmql":
+
+        def get_one_answer(i):
+            states[i] = tree_search(**arguments[i], call_generate=call_generate)
+
+        if args.parallel == 1:
+            for i in tqdm(range(len(questions))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )
+
+    else:
+        import asyncio
+
+        from lmql_funcs import tree_search_async
+
+        async def get_one_answer_async(i):
+            states[i] = await tree_search_async(
+                **arguments[i], call_generate=call_generate
+            )
+
+        batches = [
+            [] for _ in range((len(questions) + args.parallel - 1) // args.parallel)
+        ]
+        for i in range(len(questions)):
+            batches[i // args.parallel].append(i)
+
+        loop = asyncio.get_event_loop()
+        for bt in tqdm(batches):
+            tasks = [get_one_answer_async(k) for k in bt]
+            loop.run_until_complete(asyncio.gather(*tasks))
+
+    latency = time.perf_counter() - tic
+
+    answers_text = []
+    for s in states:
+        answers_text.append([x for xs in s for x in xs])
+
+    preds = []
+    for i in range(len(states)):
+        answers = [get_answer_value(v) for v in answers_text[i]]
+        preds.append(most_frequent_number(answers))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+    print(f"Latency: {latency:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", answers_text)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "tree_of_thought_gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/tree_of_thought_deep/bench_sglang.py b/benchmark/tree_of_thought_deep/bench_sglang.py
new file mode 100644
index 000000000..bcdb6e54d
--- /dev/null
+++ b/benchmark/tree_of_thought_deep/bench_sglang.py
@@ -0,0 +1,171 @@
+import argparse
+import ast
+import json
+import re
+import time
+from collections import Counter
+
+import numpy as np
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def most_frequent_number(numbers):
+    if not numbers:
+        return None
+
+    frequency = Counter(numbers)
+    most_frequent = max(frequency, key=frequency.get)
+    return most_frequent
+
+
+# Use a low temp to make the results more deterministic and the comparison more fair.
+temp = 0.001
+
+
+def propose_plan(s, question, num_branches):
+    s += sgl.user(
+        """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+        + question
+    )
+    forks = s.fork(num_branches)
+    forks += sgl.assistant(sgl.gen("plan", max_tokens=256, temperature=temp))
+    return forks
+
+
+def execute_plan(s, num_branches):
+    s += sgl.user(
+        """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+    )
+    forks = s.fork(num_branches)
+    forks += sgl.assistant(sgl.gen("answer", max_tokens=256, temperature=temp))
+    return forks
+
+
+def reflect_solution(s, num_branches):
+    s += sgl.user(
+        """Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+    )
+    forks = s.fork(num_branches)
+    forks += sgl.assistant(sgl.gen("score", max_tokens=256, temperature=temp))
+    return forks
+
+
+def get_final_answer(s, num_branches):
+    s += sgl.user(
+        """Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration."""
+    )
+    forks = s.fork(num_branches)
+    forks += sgl.assistant(sgl.gen("final_answer", max_tokens=256, temperature=temp))
+    return forks
+
+
+@sgl.function
+def tree_search(s, question, num_branches):
+    plan_forks = propose_plan(s, question, num_branches)
+
+    sol_states = []
+    for plan in plan_forks:
+        forks = execute_plan(plan, num_branches)
+        sol_states.extend(forks)
+
+    ref_states = []
+    for sol in sol_states:
+        forks = reflect_solution(sol, num_branches)
+        ref_states.extend(forks)
+
+    solutions = []
+    for sol in ref_states:
+        forks = get_final_answer(sol, num_branches)
+        solutions.append(forks)
+    solutions = [[s.text() for s in forks] for forks in solutions]
+
+    return solutions
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    lines = list(lines)
+
+    # Construct prompts
+    num_branches = 2
+    questions = []
+    labels = []
+    for i in range(len(lines[: args.num_questions])):
+        questions.append(lines[i]["question"])
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q, "num_branches": num_branches} for q in questions]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = tree_search.run_batch(
+        arguments,
+        temperature=0,
+        backend=backend,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+    answers_text = []
+    for s in states:
+        answers_text.append([x for xs in s.ret_value for x in xs])
+
+    preds = []
+    for i in range(len(states)):
+        answers = [get_answer_value(v) for v in answers_text[i]]
+        preds.append(most_frequent_number(answers))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+    print(f"Latency: {latency:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", answers_text)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "tree_of_thought_gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/tree_of_thought_deep/lmql_funcs.py b/benchmark/tree_of_thought_deep/lmql_funcs.py
new file mode 100644
index 000000000..c783cdbe3
--- /dev/null
+++ b/benchmark/tree_of_thought_deep/lmql_funcs.py
@@ -0,0 +1,82 @@
+from bench_other import (
+    ASSISTANT_PREFIX,
+    ASSISTANT_SUFFIX,
+    USER_PREFIX,
+    USER_SUFFIX,
+    temp,
+)
+
+
+async def propose_plan_async(s, question, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+        + question
+        + USER_SUFFIX
+    )
+
+    s += ASSISTANT_PREFIX
+    comps = await call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+async def execute_plan_async(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = await call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+async def reflect_solution_async(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = await call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+async def get_final_answer_async(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = await call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+async def tree_search_async(question, num_branches, call_generate):
+    plan_forks = await propose_plan_async("", question, num_branches, call_generate)
+
+    sol_states = []
+    for plan in plan_forks:
+        forks = await execute_plan_async(plan, num_branches, call_generate)
+        sol_states.extend(forks)
+
+    ref_states = []
+    for sol in sol_states:
+        forks = await reflect_solution_async(sol, num_branches, call_generate)
+        ref_states.extend(forks)
+
+    solutions = []
+    for sol in ref_states:
+        ans = await get_final_answer_async(sol, num_branches, call_generate)
+        solutions.append(ans)
+
+    return solutions
diff --git a/benchmark/tree_of_thought_v0/README.md b/benchmark/tree_of_thought_v0/README.md
new file mode 100644
index 000000000..821bb20d1
--- /dev/null
+++ b/benchmark/tree_of_thought_v0/README.md
@@ -0,0 +1,43 @@
+## Download data
+```
+wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+```
+
+## Run benchmark
+
+### Benchmark sglang
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+```
+python3 bench_sglang.py --num-questions 32 --parallel 16
+python3 bench_sglang.py --num-questions 10 --parallel 1
+```
+
+
+### Benchmark vllm
+```
+python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+```
+
+```
+python3 bench_other.py --num-questions 32 --backend vllm
+```
+
+
+### Benchmark lightllm
+```
+# A10G
+python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
+```
+
+```
+python3 bench_other.py --num-questions 32 --backend lightllm
+```
+
+
+### Benchmark guidance
+```
+python3 bench_other.py --num-questions 32 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
diff --git a/benchmark/tree_of_thought_v0/bench_other.py b/benchmark/tree_of_thought_v0/bench_other.py
new file mode 100644
index 000000000..703ecd7f4
--- /dev/null
+++ b/benchmark/tree_of_thought_v0/bench_other.py
@@ -0,0 +1,179 @@
+import argparse
+import ast
+import json
+import re
+import time
+from collections import Counter
+from concurrent.futures import ThreadPoolExecutor
+
+import numpy as np
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def most_frequent_number(numbers):
+    if not numbers:
+        return None
+
+    frequency = Counter(numbers)
+    most_frequent = max(frequency, key=frequency.get)
+    return most_frequent
+
+
+USER_PREFIX = "[INST] "
+USER_SUFFIX = " [/INST]"
+ASSISTANT_PREFIX = ""
+ASSISTANT_SUFFIX = " </s><s>"
+
+# Use a low temp to make the results more deterministic and the comparison more fair.
+temp = 0.3
+
+
+def propose_plan(s, question, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+        + question
+        + USER_SUFFIX
+    )
+
+    s += ASSISTANT_PREFIX
+    comps = call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+def execute_plan(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+def reflect_solution(s, num_branches, call_generate):
+    s += (
+        USER_PREFIX
+        + """Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+        + USER_SUFFIX
+    )
+    s += ASSISTANT_PREFIX
+    comps = call_generate(
+        s, max_tokens=256, temperature=temp, stop=None, n=num_branches
+    )
+    return [s + comp + ASSISTANT_SUFFIX for comp in comps]
+
+
+def tree_search(question, num_branches, call_generate):
+    s = ""
+    solutions = []
+
+    plan_forks = propose_plan(s, question, num_branches, call_generate)
+    for plan in plan_forks:
+        sol_forks = execute_plan(plan, num_branches, call_generate)
+        for sol in sol_forks:
+            score_forks = reflect_solution(sol, num_branches, call_generate)
+        solutions.append(sol_forks)
+
+    return solutions
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+
+    # Construct prompts
+    num_branches = 3
+    questions = []
+    labels = []
+    for i in range(len(lines[: args.num_questions])):
+        questions.append(lines[i]["question"])
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q, "num_branches": num_branches} for q in questions]
+
+    # Select backend
+    call_generate = get_call_generate(args)
+
+    # Run requests
+    states = [None] * len(questions)
+
+    def get_one_answer(i):
+        states[i] = tree_search(**arguments[i], call_generate=call_generate)
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(questions))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(questions)))),
+                    total=len(questions),
+                )
+            )
+
+    latency = time.perf_counter() - tic
+
+    answers_text = []
+    for s in states:
+        answers_text.append([x for xs in s for x in xs])
+
+    preds = []
+    for i in range(len(states)):
+        answers = [get_answer_value(v) for v in answers_text[i]]
+        preds.append(most_frequent_number(answers))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+    print(f"Latency: {latency:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", answers_text)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "tree_of_thought_gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
diff --git a/benchmark/tree_of_thought_v0/bench_sglang.py b/benchmark/tree_of_thought_v0/bench_sglang.py
new file mode 100644
index 000000000..6d7575f36
--- /dev/null
+++ b/benchmark/tree_of_thought_v0/bench_sglang.py
@@ -0,0 +1,159 @@
+import argparse
+import ast
+import json
+import re
+import time
+from collections import Counter
+
+import numpy as np
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def most_frequent_number(numbers):
+    if not numbers:
+        return None
+
+    frequency = Counter(numbers)
+    most_frequent = max(frequency, key=frequency.get)
+    return most_frequent
+
+
+# Use a low temp to make the results more deterministic and the comparison more fair.
+temp = 0.3
+
+
+def propose_plan(s, question, num_branches):
+    s += sgl.user(
+        """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+        + question
+    )
+    forks = s.fork(num_branches)
+    forks += sgl.assistant(sgl.gen("plan", max_tokens=256, temperature=temp))
+    return forks
+
+
+def execute_plan(s, num_branches):
+    s += sgl.user(
+        """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+    )
+    forks = s.fork(num_branches)
+    forks += sgl.assistant(sgl.gen("answer", max_tokens=256, temperature=temp))
+    return forks
+
+
+def reflect_solution(s, num_branches):
+    s += sgl.user(
+        """Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+    )
+    forks = s.fork(num_branches)
+    forks += sgl.assistant(sgl.gen("score", max_tokens=256, temperature=temp))
+    return forks
+
+
+@sgl.function
+def tree_search(s, question, num_branches):
+    forks_to_join = []
+
+    plan_forks = propose_plan(s, question, num_branches)
+    forks_to_join.append(plan_forks)
+
+    sol_states = []
+    for plan in plan_forks:
+        forks = execute_plan(plan, num_branches)
+        forks_to_join.append(forks)
+        sol_states.extend(forks)
+
+    for sol in sol_states:
+        forks = reflect_solution(sol, num_branches)
+        forks_to_join.append(forks)
+
+    for f in reversed(forks_to_join):
+        f.join()
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+
+    # Construct prompts
+    num_branches = 3
+    questions = []
+    labels = []
+    for i in range(len(lines[: args.num_questions])):
+        questions.append(lines[i]["question"])
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q, "num_branches": num_branches} for q in questions]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = tree_search.run_batch(
+        arguments,
+        temperature=0,
+        backend=backend,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+    answers_text = []
+    for s in states:
+        answers_text.append([x for xs in s["answer"] for x in xs])
+
+    preds = []
+    for i in range(len(states)):
+        answers = [get_answer_value(v) for v in answers_text[i]]
+        preds.append(most_frequent_number(answers))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+    print(f"Latency: {latency:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Accuracy: {acc:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", answers_text)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "tree_of_thought_gsm8k",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "accuracy": round(acc, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "num_questions": args.num_questions,
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="test.jsonl")
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 000000000..3b9a420b3
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,388 @@
+ARG CUDA_VERSION=12.9.1
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 as base
+
+ARG BUILD_TYPE=all
+ARG BRANCH_TYPE=remote
+ARG DEEPEP_COMMIT=b92d0d4860ce6866cd6d31bfbae937f9a7a3772b
+ARG CMAKE_BUILD_PARALLEL_LEVEL=2
+ENV DEBIAN_FRONTEND=noninteractive \
+    CUDA_HOME=/usr/local/cuda \
+    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
+    NVSHMEM_DIR=/sgl-workspace/nvshmem/install
+# Add GKE default lib and bin locations.
+ENV PATH="${PATH}:/usr/local/nvidia/bin" \
+    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
+
+RUN apt update && apt install wget -y && apt install software-properties-common -y \
+ && add-apt-repository ppa:deadsnakes/ppa -y \
+  && apt install python3.12-full python3.12-dev python3.10-venv -y \
+ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
+ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 2 \
+ && update-alternatives --set python3 /usr/bin/python3.12 \
+ && wget https://bootstrap.pypa.io/get-pip.py \
+ && python3 get-pip.py
+
+# Set timezone and install all packages
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+ && apt-get update && apt-get install -y --no-install-recommends \
+    tzdata \
+    software-properties-common netcat-openbsd kmod unzip openssh-server \
+    curl wget lsof zsh ccache tmux htop git-lfs tree \
+    build-essential cmake \
+    libopenmpi-dev libnuma1 libnuma-dev \
+    libibverbs-dev libibverbs1 libibumad3 \
+    librdmacm1 libnl-3-200 libnl-route-3-200 libnl-route-3-dev libnl-3-dev \
+    ibverbs-providers infiniband-diags perftest \
+    libgoogle-glog-dev libgtest-dev libjsoncpp-dev libunwind-dev \
+    libboost-all-dev libssl-dev \
+    libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler protobuf-compiler-grpc \
+    pybind11-dev \
+    libhiredis-dev libcurl4-openssl-dev \
+    libczmq4 libczmq-dev \
+    libfabric-dev \
+    patchelf \
+    nvidia-dkms-550 \
+    devscripts debhelper fakeroot dkms check libsubunit0 libsubunit-dev \
+ && ln -sf /usr/bin/python3.12 /usr/bin/python \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# GDRCopy installation
+RUN mkdir -p /tmp/gdrcopy && cd /tmp \
+ && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
+ && cd gdrcopy/packages \
+ && CUDA=/usr/local/cuda ./build-deb-packages.sh \
+ && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
+ && cd / && rm -rf /tmp/gdrcopy
+
+# Fix DeepEP IBGDA symlink
+RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
+
+FROM scratch AS local_src
+COPY . /src
+
+FROM base AS build-image
+# Install SGLang
+WORKDIR /sgl-workspace
+ARG BRANCH_TYPE
+COPY --from=local_src /src /tmp/local_src
+RUN if [ "$BRANCH_TYPE" = "local" ]; then \
+        cp -r /tmp/local_src /sgl-workspace/sglang; \
+    else \
+        git clone --depth=1 https://github.com/sgl-project/sglang.git /sgl-workspace/sglang; \
+    fi \
+ && rm -rf /tmp/local_src
+RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six \
+ && cd sglang \
+ && case "$CUDA_VERSION" in \
+      12.6.1) CUINDEX=126 ;; \
+      12.8.1) CUINDEX=128 ;; \
+      12.9.1) CUINDEX=129 ;; \
+      *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
+    esac \
+ && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
+ && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
+ && python3 -m flashinfer --download-cubin \
+ && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
+      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.9.post2/sgl_kernel-0.3.9.post2+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
+    fi
+
+# Download source files
+RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    git clone https://github.com/deepseek-ai/DeepEP.git && \
+    cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
+    cd .. && \
+    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    mv nvshmem_src nvshmem && \
+    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+
+# Build and install NVSHMEM
+RUN cd /sgl-workspace/nvshmem && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
+    NVSHMEM_UCX_SUPPORT=0 \
+    NVSHMEM_USE_NCCL=0 \
+    NVSHMEM_MPI_SUPPORT=0 \
+    NVSHMEM_IBGDA_SUPPORT=1 \
+    NVSHMEM_PMIX_SUPPORT=0 \
+    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    NVSHMEM_USE_GDRCOPY=1 \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="90" && \
+    cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
+
+# Install DeepEP
+RUN cd /sgl-workspace/DeepEP && \
+    case "$CUDA_VERSION" in \
+      12.6.1) \
+        CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
+        ;; \
+      12.8.1|12.9.1) \
+        CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' \
+        ;; \
+      *) \
+        echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
+        ;; \
+    esac && \
+    NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" pip install .
+
+# Python tools
+RUN python3 -m pip install --no-cache-dir \
+    datamodel_code_generator \
+    mooncake-transfer-engine==0.3.5 \
+    pre-commit \
+    pytest \
+    black \
+    isort \
+    icdiff \
+    uv \
+    wheel \
+    scikit-build-core \
+    nixl \
+    py-spy
+
+# Install development tools and utilities
+RUN apt-get update && apt-get install -y \
+    gdb \
+    ninja-build \
+    vim \
+    tmux \
+    htop \
+    wget \
+    curl \
+    locales \
+    lsof \
+    git \
+    git-lfs \
+    zsh \
+    tree \
+    silversearcher-ag \
+    cloc \
+    unzip \
+    pkg-config \
+    libssl-dev \
+    bear \
+    ccache \
+    less \
+    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN apt update -y \
+    && apt install -y --no-install-recommends gnupg \
+    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/amd64 /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
+    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub \
+    && apt update -y \
+    && apt install nsight-systems-cli -y
+
+# Set up locale
+RUN locale-gen en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+# Install minimal Python packages
+RUN python3 -m pip install --no-cache-dir --break-system-packages \
+    pytest \
+    black \
+    isort \
+    icdiff \
+    scikit_build_core \
+    uv \
+    pre-commit \
+    pandas \
+    matplotlib \
+    tabulate
+
+# Install diff-so-fancy
+RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
+    && chmod +x /usr/local/bin/diff-so-fancy
+
+# Install clang-format
+RUN curl -LSso /usr/local/bin/clang-format https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
+    && chmod +x /usr/local/bin/clang-format
+
+# Install clangd
+RUN curl -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
+    && unzip clangd.zip \
+    && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
+    && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
+    && rm -rf clangd_18.1.3 clangd.zip
+
+# Install CMake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.tar.gz \
+    && tar -xzf cmake-3.31.1-linux-x86_64.tar.gz \
+    && cp -r cmake-3.31.1-linux-x86_64/bin/* /usr/local/bin/ \
+    && cp -r cmake-3.31.1-linux-x86_64/share/* /usr/local/share/ \
+    && rm -rf cmake-3.31.1-linux-x86_64 cmake-3.31.1-linux-x86_64.tar.gz
+
+# Install Rust toolchain for sgl-router
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && rustc --version && cargo --version
+
+# Build and install sgl-router
+RUN python3 -m pip install --no-cache-dir setuptools-rust \
+    && cd /sgl-workspace/sglang/sgl-router \
+    && cargo build --release \
+    && python3 -m pip install --no-cache-dir . \
+    && rm -rf /root/.cache
+
+
+# Add yank script
+COPY --chown=root:root <<-"EOF" /usr/local/bin/yank
+#!/bin/bash
+put() {
+  esc=$1
+  test -n "$TMUX" -o -z "${TERM##screen*}" && esc="\033Ptmux;\033$esc\033\\"
+  printf "$esc"
+}
+put "\033]52;c;!\a"
+buf=$( cat "$@" )
+len=$( printf %s "$buf" | wc -c ) max=74994
+test $len -gt $max && echo "$0: input is $(( len - max )) bytes too long" >&2
+put "\033]52;c;$( printf %s "$buf" | head -c $max | base64 | tr -d '\r\n' )\a"
+test -n "$TMUX" && tmux set-buffer "$buf" ||:
+EOF
+
+RUN chmod +x /usr/local/bin/yank
+
+# Install oh-my-zsh and plugins
+RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
+    && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
+    && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting
+
+# Configure Vim
+COPY --chown=root:root <<-"EOF" /root/.vimrc
+function! Yank(text) abort
+  let escape = system('yank', a:text)
+  if v:shell_error
+    echoerr escape
+  else
+    call writefile([escape], '/dev/tty', 'b')
+  endif
+endfunction
+
+noremap <silent> <Leader>y y:<C-U>call Yank(@0)<CR>
+
+" automatically run yank(1) whenever yanking in Vim
+function! CopyYank() abort
+  call Yank(join(v:event.regcontents, "\n"))
+endfunction
+
+autocmd TextYankPost * call CopyYank()
+
+" Basic settings
+set number
+syntax on
+set mouse=a
+filetype indent on
+
+" Indentation
+set autoindent nosmartindent
+set smarttab
+set expandtab
+set shiftwidth=4
+set softtabstop=4
+
+" Visual guides
+set colorcolumn=120
+highlight ColorColumn ctermbg=5
+
+" Status line
+set laststatus=2
+set statusline=%<%f\ %h%m%r%=%{\"[\".(&fenc==\"\"?&enc:&fenc).((exists(\"+bomb\")\ &&\ &bomb)?\",B\":\"\").\"]\ \"}%k\ %-14.(%l,%c%V%)\ %P
+
+" Backspace behavior
+set backspace=2
+
+" Encoding
+set encoding=utf-8
+set fileencoding=utf-8
+EOF
+
+# Configure tmux
+COPY --chown=root:root <<-"EOF" /root/.tmux.conf
+# Pane border styling
+set -g pane-border-style fg='#742727',bg=black
+set -g pane-active-border-style fg=red,bg=black
+
+# Status bar styling
+set -g status-style bg='#0C8A92',fg=black
+
+# Change prefix key to backtick
+set-option -g prefix `
+unbind C-b
+bind-key ` send-prefix
+
+# Split panes using - and = with current path
+unbind '"'
+bind - splitw -v -c '#{pane_current_path}'
+unbind '%'
+bind = splitw -h -c '#{pane_current_path}'
+
+# Vi mode settings
+bind-key -T copy-mode-vi Y send-keys -X copy-pipe 'yank > #{pane_tty}'
+set-window-option -g mode-keys vi
+
+# Other settings
+set-option -g escape-time 0
+set-option -g base-index 1
+set-window-option -g mouse on
+set -g history-limit 100000
+EOF
+
+# Configure Git
+RUN git config --global core.editor "vim" \
+    && git config --global core.whitespace "fix,-indent-with-non-tab,trailing-space,cr-at-eol" \
+    && git config --global core.pager "diff-so-fancy | less --tabs=4 -RFX" \
+    && git config --global color.ui true \
+    && git config --global color."diff-highlight".oldNormal "red bold" \
+    && git config --global color."diff-highlight".oldHighlight "red bold 52" \
+    && git config --global color."diff-highlight".newNormal "green bold" \
+    && git config --global color."diff-highlight".newHighlight "green bold 22" \
+    && git config --global color.diff.meta "11" \
+    && git config --global color.diff.frag "magenta bold" \
+    && git config --global color.diff.commit "yellow bold" \
+    && git config --global color.diff.old "red bold" \
+    && git config --global color.diff.new "green bold" \
+    && git config --global color.diff.whitespace "red reverse" \
+    && git config --global alias.lg "log --color --graph --pretty=format:'%Cred%h%Creset - %s %Cgreen(%cr) %C(bold blue)<%an>%Creset%C(auto)%d%Creset' --abbrev-commit --" \
+    && git config --global http.sslVerify false \
+    && git config --global pull.rebase true
+
+# Configure zsh
+COPY --chown=root:root <<-"EOF" /root/.zshrc
+export ZSH="/root/.oh-my-zsh"
+
+# Theme
+ZSH_THEME="robbyrussell"
+
+# Plugins
+plugins=(
+    git
+    z
+    zsh-autosuggestions
+    zsh-syntax-highlighting
+)
+
+source $ZSH/oh-my-zsh.sh
+
+# Aliases
+alias ll='ls -alF'
+alias la='ls -A'
+alias l='ls -CF'
+alias vi='vim'
+
+# Enhanced history
+HISTSIZE=10000
+SAVEHIST=10000
+setopt HIST_IGNORE_ALL_DUPS
+setopt HIST_FIND_NO_DUPS
+setopt INC_APPEND_HISTORY
+EOF
+
+RUN set -euxo ; \
+    curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin
+
+# Set workspace directory
+WORKDIR /sgl-workspace/sglang
diff --git a/docker/Dockerfile.gb200 b/docker/Dockerfile.gb200
new file mode 100644
index 000000000..164326e23
--- /dev/null
+++ b/docker/Dockerfile.gb200
@@ -0,0 +1,348 @@
+ARG CUDA_VERSION=12.9.1
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
+
+ARG BUILD_TYPE=blackwell
+ARG DEEPEP_COMMIT=1b14ad661c7640137fcfe93cccb2694ede1220b0
+ARG CMAKE_BUILD_PARALLEL_LEVEL=2
+ARG SGL_KERNEL_VERSION=0.3.9.post2
+ENV DEBIAN_FRONTEND=noninteractive \
+    CUDA_HOME=/usr/local/cuda \
+    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
+    NVSHMEM_DIR=/sgl-workspace/nvshmem/install \
+    BUILD_TYPE=${BUILD_TYPE} \
+    TORCH_CUDA_ARCH_LIST="10.0 12.0"
+
+# Set timezone and install all packages
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+ && apt-get update && apt-get install -y --no-install-recommends \
+    tzdata \
+    software-properties-common netcat-openbsd kmod unzip openssh-server \
+    curl wget lsof zsh ccache tmux htop git-lfs tree \
+    python3 python3-pip python3-dev libpython3-dev python3-venv \
+    build-essential cmake \
+    libopenmpi-dev libnuma1 libnuma-dev \
+    libibverbs-dev libibverbs1 libibumad3 \
+    librdmacm1 libnl-3-200 libnl-route-3-200 libnl-route-3-dev libnl-3-dev \
+    ibverbs-providers infiniband-diags perftest \
+    libgoogle-glog-dev libgtest-dev libjsoncpp-dev libunwind-dev \
+    libboost-all-dev libssl-dev \
+    libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \
+    pybind11-dev \
+    libhiredis-dev libcurl4-openssl-dev \
+    libczmq4 libczmq-dev \
+    libfabric-dev \
+    patchelf \
+    nvidia-dkms-550 \
+    devscripts debhelper fakeroot dkms check libsubunit0 libsubunit-dev \
+ && ln -sf /usr/bin/python3 /usr/bin/python \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
+# Install SGLang missing package for blackwell build type
+RUN python3 -m pip install openai httpx
+
+# GDRCopy installation
+RUN mkdir -p /tmp/gdrcopy && cd /tmp \
+ && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
+ && cd gdrcopy/packages \
+ && CUDA=/usr/local/cuda ./build-deb-packages.sh \
+ && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
+ && cd / && rm -rf /tmp/gdrcopy
+
+# Fix DeepEP IBGDA symlink
+RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
+
+# Clone and install SGLang
+WORKDIR /sgl-workspace
+RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six \
+ && git clone --depth 1 https://github.com/sgl-project/sglang.git \
+ && cd sglang \
+ && case "$CUDA_VERSION" in \
+      12.9.1) CUINDEX=129 ;; \
+      *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
+    esac \
+ && if [ "$CUDA_VERSION" = "12.9.1" ]; then \
+      python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \
+      python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu129-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
+    fi \
+ && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
+ && python3 -m flashinfer --download-cubin
+
+# Download source files
+RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    git clone https://github.com/fzyzcjy/DeepEP.git && \
+    cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd .. && \
+    tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
+    mv nvshmem_src nvshmem && \
+    rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+
+# Build and install NVSHMEM
+RUN cd /sgl-workspace/nvshmem && \
+    NVSHMEM_SHMEM_SUPPORT=0 \
+    NVSHMEM_UCX_SUPPORT=0 \
+    NVSHMEM_USE_NCCL=0 \
+    NVSHMEM_MPI_SUPPORT=0 \
+    NVSHMEM_IBGDA_SUPPORT=1 \
+    NVSHMEM_PMIX_SUPPORT=0 \
+    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    NVSHMEM_USE_GDRCOPY=1 \
+    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="90;100;120" && \
+    cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}
+
+# Install DeepEP
+RUN cd /sgl-workspace/DeepEP && \
+    NVSHMEM_DIR=${NVSHMEM_DIR} pip install .
+
+# Python tools
+RUN python3 -m pip install --no-cache-dir \
+    datamodel_code_generator \
+    mooncake-transfer-engine==0.3.5 \
+    pre-commit \
+    pytest \
+    black \
+    isort \
+    icdiff \
+    uv \
+    wheel \
+    scikit-build-core
+
+# Install nixl kv transfer backend
+RUN python3 -m pip install --no-cache-dir \
+    nixl
+
+# Install development tools and utilities
+RUN apt-get update && apt-get install -y \
+    gdb \
+    ninja-build \
+    vim \
+    tmux \
+    htop \
+    wget \
+    curl \
+    locales \
+    lsof \
+    git \
+    git-lfs \
+    zsh \
+    tree \
+    silversearcher-ag \
+    cloc \
+    unzip \
+    pkg-config \
+    libssl-dev \
+    bear \
+    ccache \
+    less \
+    && apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN apt update -y \
+    && apt install -y --no-install-recommends gnupg \
+    && echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu2004/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list \
+    && apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "x86_64"; fi)/7fa2af80.pub \
+    && apt update -y \
+    && apt install nsight-systems-cli -y
+
+# Set up locale
+RUN locale-gen en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US:en
+ENV LC_ALL=en_US.UTF-8
+
+# Install minimal Python packages
+RUN python3 -m pip install --no-cache-dir --break-system-packages \
+    pytest \
+    black \
+    isort \
+    icdiff \
+    scikit_build_core \
+    uv \
+    pre-commit \
+    pandas \
+    matplotlib \
+    tabulate
+
+# Install diff-so-fancy
+RUN curl -LSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
+    && chmod +x /usr/local/bin/diff-so-fancy
+
+# Install clang-format
+RUN curl -LSso /usr/local/bin/clang-format https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
+    && chmod +x /usr/local/bin/clang-format
+
+# Install clangd
+RUN curl -L https://github.com/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
+    && unzip clangd.zip \
+    && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
+    && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
+    && rm -rf clangd_18.1.3 clangd.zip
+
+# Install CMake
+RUN CMAKE_VERSION=3.31.1 \
+    && ARCH=$(uname -m) \
+    && CMAKE_INSTALLER="cmake-${CMAKE_VERSION}-linux-${ARCH}" \
+    && wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_INSTALLER}.tar.gz" \
+    && tar -xzf "${CMAKE_INSTALLER}.tar.gz" \
+    && cp -r "${CMAKE_INSTALLER}/bin/"* /usr/local/bin/ \
+    && cp -r "${CMAKE_INSTALLER}/share/"* /usr/local/share/ \
+    && rm -rf "${CMAKE_INSTALLER}" "${CMAKE_INSTALLER}.tar.gz"
+
+# Add yank script
+COPY --chown=root:root <<-"EOF" /usr/local/bin/yank
+#!/bin/bash
+put() {
+  esc=$1
+  test -n "$TMUX" -o -z "${TERM##screen*}" && esc="\033Ptmux;\033$esc\033\\"
+  printf "$esc"
+}
+put "\033]52;c;!\a"
+buf=$( cat "$@" )
+len=$( printf %s "$buf" | wc -c ) max=74994
+test $len -gt $max && echo "$0: input is $(( len - max )) bytes too long" >&2
+put "\033]52;c;$( printf %s "$buf" | head -c $max | base64 | tr -d '\r\n' )\a"
+test -n "$TMUX" && tmux set-buffer "$buf" ||:
+EOF
+
+RUN chmod +x /usr/local/bin/yank
+
+# Install oh-my-zsh and plugins
+RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
+    && git clone https://github.com/zsh-users/zsh-autosuggestions ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
+    && git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting
+
+# Configure Vim
+COPY --chown=root:root <<-"EOF" /root/.vimrc
+function! Yank(text) abort
+  let escape = system('yank', a:text)
+  if v:shell_error
+    echoerr escape
+  else
+    call writefile([escape], '/dev/tty', 'b')
+  endif
+endfunction
+
+noremap <silent> <Leader>y y:<C-U>call Yank(@0)<CR>
+
+" automatically run yank(1) whenever yanking in Vim
+function! CopyYank() abort
+  call Yank(join(v:event.regcontents, "\n"))
+endfunction
+
+autocmd TextYankPost * call CopyYank()
+
+" Basic settings
+set number
+syntax on
+set mouse=a
+filetype indent on
+
+" Indentation
+set autoindent nosmartindent
+set smarttab
+set expandtab
+set shiftwidth=4
+set softtabstop=4
+
+" Visual guides
+set colorcolumn=120
+highlight ColorColumn ctermbg=5
+
+" Status line
+set laststatus=2
+set statusline=%<%f\ %h%m%r%=%{\"[\".(&fenc==\"\"?&enc:&fenc).((exists(\"+bomb\")\ &&\ &bomb)?\",B\":\"\").\"]\ \"}%k\ %-14.(%l,%c%V%)\ %P
+
+" Backspace behavior
+set backspace=2
+
+" Encoding
+set encoding=utf-8
+set fileencoding=utf-8
+EOF
+
+# Configure tmux
+COPY --chown=root:root <<-"EOF" /root/.tmux.conf
+# Pane border styling
+set -g pane-border-style fg='#742727',bg=black
+set -g pane-active-border-style fg=red,bg=black
+
+# Status bar styling
+set -g status-style bg='#0C8A92',fg=black
+
+# Change prefix key to backtick
+set-option -g prefix `
+unbind C-b
+bind-key ` send-prefix
+
+# Split panes using - and = with current path
+unbind '"'
+bind - splitw -v -c '#{pane_current_path}'
+unbind '%'
+bind = splitw -h -c '#{pane_current_path}'
+
+# Vi mode settings
+bind-key -T copy-mode-vi Y send-keys -X copy-pipe 'yank > #{pane_tty}'
+set-window-option -g mode-keys vi
+
+# Other settings
+set-option -g escape-time 0
+set-option -g base-index 1
+set-window-option -g mouse on
+EOF
+
+# Configure Git
+RUN git config --global core.editor "vim" \
+    && git config --global core.whitespace "fix,-indent-with-non-tab,trailing-space,cr-at-eol" \
+    && git config --global core.pager "diff-so-fancy | less --tabs=4 -RFX" \
+    && git config --global color.ui true \
+    && git config --global color."diff-highlight".oldNormal "red bold" \
+    && git config --global color."diff-highlight".oldHighlight "red bold 52" \
+    && git config --global color."diff-highlight".newNormal "green bold" \
+    && git config --global color."diff-highlight".newHighlight "green bold 22" \
+    && git config --global color.diff.meta "11" \
+    && git config --global color.diff.frag "magenta bold" \
+    && git config --global color.diff.commit "yellow bold" \
+    && git config --global color.diff.old "red bold" \
+    && git config --global color.diff.new "green bold" \
+    && git config --global color.diff.whitespace "red reverse" \
+    && git config --global alias.lg "log --color --graph --pretty=format:'%Cred%h%Creset - %s %Cgreen(%cr) %C(bold blue)<%an>%Creset%C(auto)%d%Creset' --abbrev-commit --" \
+    && git config --global http.sslVerify false \
+    && git config --global pull.rebase true
+
+# Configure zsh
+COPY --chown=root:root <<-"EOF" /root/.zshrc
+export ZSH="/root/.oh-my-zsh"
+
+# Theme
+ZSH_THEME="robbyrussell"
+
+# Plugins
+plugins=(
+    git
+    z
+    zsh-autosuggestions
+    zsh-syntax-highlighting
+)
+
+source $ZSH/oh-my-zsh.sh
+
+# Aliases
+alias ll='ls -alF'
+alias la='ls -A'
+alias l='ls -CF'
+alias vi='vim'
+
+# Enhanced history
+HISTSIZE=10000
+SAVEHIST=10000
+setopt HIST_IGNORE_ALL_DUPS
+setopt HIST_FIND_NO_DUPS
+setopt INC_APPEND_HISTORY
+EOF
+
+RUN set -euxo ; \
+    curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin
+
+# Set workspace directory
+WORKDIR /sgl-workspace/sglang
diff --git a/docker/Dockerfile.npu b/docker/Dockerfile.npu
new file mode 100644
index 000000000..3f9b0ae42
--- /dev/null
+++ b/docker/Dockerfile.npu
@@ -0,0 +1,93 @@
+ARG CANN_VERSION=8.2.rc1
+ARG DEVICE_TYPE=a3
+ARG OS=ubuntu22.04
+ARG PYTHON_VERSION=py3.11
+
+FROM quay.io/ascend/cann:$CANN_VERSION-$DEVICE_TYPE-$OS-$PYTHON_VERSION
+
+# Update pip & apt sources
+ARG PIP_INDEX_URL="https://pypi.org/simple/"
+ARG APTMIRROR=""
+ARG MEMFABRIC_URL=https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl
+ARG PYTORCH_VERSION=2.6.0
+ARG TORCHVISION_VERSION=0.21.0
+ARG PTA_URL="https://gitee.com/ascend/pytorch/releases/download/v7.1.0.1-pytorch2.6.0/torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
+ARG VLLM_TAG=v0.8.5
+ARG TRITON_ASCEND_URL=https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl
+ARG SGLANG_TAG=main
+ARG ASCEND_CANN_PATH=/usr/local/Ascend/ascend-toolkit
+ARG SGLANG_KERNEL_NPU_TAG=main
+
+WORKDIR /workspace
+
+# Define environments
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN pip config set global.index-url $PIP_INDEX_URL
+RUN if [ -n "$APTMIRROR" ];then sed -i "s|.*.ubuntu.com|$APTMIRROR|g" /etc/apt/sources.list ;fi
+
+# Install development tools and utilities
+RUN apt-get update -y && apt upgrade -y && apt-get install -y \
+    build-essential \
+    cmake \
+    vim \
+    wget \
+    curl \
+    net-tools \
+    zlib1g-dev \
+    lld \
+    clang \
+    locales \
+    ccache \
+    openssl \
+    libssl-dev \
+    pkg-config \
+    ca-certificates \
+    protobuf-compiler \
+    && rm -rf /var/cache/apt/* \
+    && rm -rf /var/lib/apt/lists/* \
+    && update-ca-certificates \
+    && locale-gen en_US.UTF-8
+
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US:en
+ENV LC_ALL=en_US.UTF-8
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install dependencies
+# TODO: install from pypi released memfabric
+RUN pip install $MEMFABRIC_URL --no-cache-dir
+
+RUN pip install setuptools-rust wheel build --no-cache-dir
+
+# install rustup from rustup.rs
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && rustc --version && cargo --version && protoc --version
+
+# Install vLLM
+RUN git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG && \
+    (cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v . --no-cache-dir) && rm -rf vllm
+
+# TODO: install from pypi released triton-ascend
+RUN pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu --no-cache-dir \
+    && wget ${PTA_URL} && pip install "./torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl" --no-cache-dir \
+    && python3 -m pip install --no-cache-dir attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 \
+    && pip install ${TRITON_ASCEND_URL} --no-cache-dir
+
+# Install SGLang
+RUN git clone https://github.com/sgl-project/sglang --branch $SGLANG_TAG && \
+    (cd sglang/python && pip install -v .[srt_npu] --no-cache-dir) && \
+    (cd sglang/sgl-router && python -m build && pip install --force-reinstall dist/*.whl) && \
+    rm -rf sglang
+
+# Install Deep-ep
+RUN git clone  --branch $SGLANG_KERNEL_NPU_TAG https://github.com/sgl-project/sgl-kernel-npu.git \
+    && export LD_LIBRARY_PATH=${ASCEND_CANN_PATH}/latest/runtime/lib64/stub:$LD_LIBRARY_PATH && \
+    source ${ASCEND_CANN_PATH}/set_env.sh && \
+    cd sgl-kernel-npu && \
+    bash build.sh \
+    && pip install output/deep_ep*.whl --no-cache-dir \
+    && cd .. && rm -rf sgl-kernel-npu \
+    && cd "$(pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -s deep_ep/deep_ep_cpp*.so
+
+CMD ["/bin/bash"]
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
new file mode 100644
index 000000000..2c3c9c0be
--- /dev/null
+++ b/docker/Dockerfile.rocm
@@ -0,0 +1,213 @@
+# Usage (to build SGLang ROCm docker image):
+#   docker build --build-arg SGL_BRANCH=v0.5.2 --build-arg GPU_ARCH=gfx942 -t v0.5.2-rocm630-mi30x -f Dockerfile.rocm .
+#   docker build --build-arg SGL_BRANCH=v0.5.2 --build-arg GPU_ARCH=gfx942-rocm700 -t v0.5.2-rocm700-mi30x -f Dockerfile.rocm .
+#   docker build --build-arg SGL_BRANCH=v0.5.2 --build-arg GPU_ARCH=gfx950 -t v0.5.2-rocm700-mi35x -f Dockerfile.rocm .
+
+
+# Default base images
+ARG BASE_IMAGE_942="rocm/sgl-dev:vllm20250114"
+ARG BASE_IMAGE_942_ROCM700="rocm/sgl-dev:rocm7-vllm-20250904"
+ARG BASE_IMAGE_950="rocm/sgl-dev:rocm7-vllm-20250904"
+
+# This is necessary for scope purpose
+ARG GPU_ARCH=gfx950
+
+# ===============================
+# Base image 942 with rocm630 and args
+FROM $BASE_IMAGE_942 AS gfx942
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="1"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="1"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.4"
+ENV NO_DEPS_FLAG=""
+
+# ===============================
+# Base image 942 and args
+FROM $BASE_IMAGE_942_ROCM700 AS gfx942-rocm700
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="0"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="1"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.5"
+ENV NO_DEPS_FLAG=""
+
+# ===============================
+# Base image 950 and args
+FROM $BASE_IMAGE_950 AS gfx950
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="0"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="1"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.5"
+ENV NO_DEPS_FLAG="--no-deps"
+
+# ===============================
+# Chosen arch and args
+FROM ${GPU_ARCH}
+
+# This is necessary for scope purpose, again
+ARG GPU_ARCH=gfx950
+ENV GPU_ARCH_LIST=${GPU_ARCH%-*}
+
+ARG SGL_REPO="https://github.com/sgl-project/sglang.git"
+ARG SGL_DEFAULT="main"
+ARG SGL_BRANCH=${SGL_DEFAULT}
+
+ARG TRITON_REPO="https://github.com/ROCm/triton.git"
+ARG TRITON_COMMIT="improve_fa_decode_3.0.0"
+
+ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+
+ARG LLVM_REPO="https://github.com/jrbyrnes/llvm-project.git"
+ARG LLVM_BRANCH="MainOpSelV2"
+ARG LLVM_COMMIT="6520ace8227ffe2728148d5f3b9872a870b0a560"
+
+ARG MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git"
+ARG MOONCAKE_COMMIT="dcdf1c784b40aa6975a8ed89fe26321b028e40e8"
+
+USER root
+
+# Install some basic utilities
+RUN python -m pip install --upgrade pip && pip install setuptools_scm
+RUN apt-get purge -y sccache; python -m pip uninstall -y sccache; rm -f "$(which sccache)"
+
+WORKDIR /sgl-workspace
+
+# -----------------------
+# llvm
+RUN if [ "$BUILD_LLVM" = "1" ]; then \
+     ENV HIP_CLANG_PATH="/sgl-workspace/llvm-project/build/bin/" \
+     git clone --single-branch ${LLVM_REPO} -b ${LLVM_BRANCH} \
+     && cd llvm-project \
+     && git checkout ${LLVM_COMMIT} \
+     && mkdir build \
+     && cd build \
+     && cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm \
+     && make -j$(nproc); \
+    fi
+
+# -----------------------
+
+# -----------------------
+# AITER
+RUN pip uninstall -y aiter
+RUN git clone ${AITER_REPO} \
+ && cd aiter \
+ && git checkout ${AITER_COMMIT} \
+ && git submodule update --init --recursive
+RUN cd aiter \
+     && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \
+          HIP_CLANG_PATH=/sgl-workspace/llvm-project/build/bin/ PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop; \
+        elif [ "$BUILD_AITER_ALL" = "1" ]; then \
+          PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop; \
+        else \
+          GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop; \
+        fi
+
+# -----------------------
+# Triton
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
+        pip uninstall -y triton \
+     && git clone ${TRITON_REPO} \
+     && cd triton \
+     && git checkout ${TRITON_COMMIT} \
+     && cd python \
+     && python setup.py install; \
+    fi
+
+# -----------------------
+# Build vLLM
+ARG VLLM_REPO="https://github.com/ROCm/vllm.git"
+ARG VLLM_BRANCH="9f6b92db47c3444b7a7d67451ba0c3a2d6af4c2c"
+RUN if [ "$BUILD_VLLM" = "1" ]; then \
+        git clone ${VLLM_REPO} \
+     && cd vllm \
+     && git checkout ${VLLM_BRANCH} \
+     && python -m pip install -r requirements/rocm.txt \
+     && python setup.py clean --all \
+     && python setup.py develop; \
+    fi
+
+# -----------------------
+# Build Mooncake
+ENV PATH=$PATH:/usr/local/go/bin
+
+RUN if [ "$BUILD_MOONCAKE" = "1" ]; then \
+     apt update && apt install -y zip unzip wget && \
+     apt install -y gcc make libtool autoconf  librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool  libibverbs-dev rdma-core && \
+     apt install -y openssh-server openmpi-bin openmpi-common libopenmpi-dev && \
+     git clone ${MOONCAKE_REPO} && \
+     cd Mooncake && \
+     git checkout ${MOONCAKE_COMMIT} && \
+     git submodule update --init --recursive && \
+     bash dependencies.sh -y && \
+     rm -rf /usr/local/go && \
+     wget https://go.dev/dl/go1.22.2.linux-amd64.tar.gz && \
+     tar -C /usr/local -xzf go1.22.2.linux-amd64.tar.gz && \
+     rm go1.22.2.linux-amd64.tar.gz && \
+     mkdir -p build && \
+     cd build && \
+     cmake .. -DUSE_ETCD=ON && \
+     make -j "$(nproc)" && make install; \
+    fi
+
+
+# -----------------------
+# Build SGLang
+ARG BUILD_TYPE=all
+
+RUN pip install IPython \
+    && pip install orjson \
+    && pip install python-multipart \
+    && pip install torchao==0.9.0 \
+    && pip install pybind11
+
+RUN pip uninstall -y sgl_kernel sglang
+RUN git clone ${SGL_REPO} \
+    && cd sglang \
+    && if [ "${SGL_BRANCH}" = ${SGL_DEFAULT} ]; then \
+         echo "Using ${SGL_DEFAULT}, default branch."; \
+         git checkout ${SGL_DEFAULT}; \
+       else \
+         echo "Using ${SGL_BRANCH} branch."; \
+         git checkout ${SGL_BRANCH}; \
+       fi \
+    && cd sgl-kernel \
+    && rm -f pyproject.toml \
+    && mv pyproject_rocm.toml pyproject.toml \
+    && AMDGPU_TARGET=$GPU_ARCH_LIST python setup_rocm.py install \
+    && cd .. \
+    && if [ "$BUILD_TYPE" = "srt" ]; then \
+         python -m pip --no-cache-dir install -e "python[srt_hip]" ${NO_DEPS_FLAG}; \
+       else \
+         python -m pip --no-cache-dir install -e "python[all_hip]" ${NO_DEPS_FLAG}; \
+       fi
+
+RUN python -m pip cache purge
+
+# Copy config files to support MI300X in virtualized environments (MI300X_VF).  Symlinks will not be created in image build.
+RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
+         /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
+         -type f -name '*MI300X*' | xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
+
+# Performance environment variable.
+ENV HIP_FORCE_DEV_KERNARG=1
+ENV HSA_NO_SCRATCH_RECLAIM=1
+ENV SGLANG_SET_CPU_AFFINITY=1
+ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+ENV NCCL_MIN_NCHANNELS=112
+
+ENV SGLANG_USE_AITER=1
+ENV SGLANG_MOE_PADDING=1
+ENV VLLM_FP8_PADDING=1
+ENV VLLM_FP8_ACT_PADDING=1
+ENV VLLM_FP8_WEIGHT_PADDING=1
+ENV VLLM_FP8_REDUCE_CONV=1
+ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
+ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
+
+CMD ["/bin/bash"]
diff --git a/docker/Dockerfile.router b/docker/Dockerfile.router
new file mode 100644
index 000000000..ded98bb8a
--- /dev/null
+++ b/docker/Dockerfile.router
@@ -0,0 +1,78 @@
+######################## BASE IMAGE ##########################
+FROM ubuntu:24.04 AS base
+
+ARG PYTHON_VERSION=3.12
+
+# set the environment variables
+ENV PATH="/root/.local/bin:${PATH}"
+ENV DEBIAN_FRONTEND=noninteractive
+
+# uv environment variables
+ENV UV_HTTP_TIMEOUT=500
+ENV VIRTUAL_ENV="/opt/venv"
+ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
+ENV UV_LINK_MODE="copy"
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+
+# install dependencies
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt update -y \
+    && apt install -y curl \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt clean
+
+# install uv
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# install python
+RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
+
+######################### BUILD IMAGE #########################
+FROM base AS build-image
+
+ARG SGLANG_REPO_REF=main
+
+# set the environment variables
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# install dependencies
+RUN apt update -y \
+    && apt install -y git build-essential libssl-dev pkg-config protobuf-compiler \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt clean
+
+# install rustup from rustup.rs
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && rustc --version && cargo --version && protoc --version
+
+# pull the github repository
+RUN cd /opt \
+    && git clone --depth=1 https://github.com/sgl-project/sglang.git \
+    && cd /opt/sglang \
+    && git checkout ${SGLANG_REPO_REF}
+
+# working directory
+WORKDIR /opt/sglang/sgl-router
+
+# build the rust dependencies
+RUN cargo build --release \
+    && uv build \
+    && rm -rf /root/.cache
+
+######################### ROUTER IMAGE #########################
+FROM base AS router-image
+
+# Copy the built package from the build image
+COPY --from=build-image /opt/sglang/sgl-router/dist/*.whl dist/
+
+# Build the package and install
+RUN uv pip install --force-reinstall dist/*.whl
+
+# Clean up unnecessary files to reduce the image size
+RUN rm -rf /root/.cache \
+    && apt purge -y --auto-remove curl
+
+# Set the entrypoint to the main command
+ENTRYPOINT ["python3", "-m", "sglang_router.launch_router"]
diff --git a/docker/Dockerfile.sagemaker b/docker/Dockerfile.sagemaker
new file mode 100644
index 000000000..5fbff5096
--- /dev/null
+++ b/docker/Dockerfile.sagemaker
@@ -0,0 +1,6 @@
+FROM lmsysorg/sglang:latest
+
+COPY serve /usr/bin/serve
+RUN chmod 777 /usr/bin/serve
+
+ENTRYPOINT [ "/usr/bin/serve" ]
diff --git a/docker/Dockerfile.xeon b/docker/Dockerfile.xeon
new file mode 100644
index 000000000..fdc439b30
--- /dev/null
+++ b/docker/Dockerfile.xeon
@@ -0,0 +1,48 @@
+FROM ubuntu:24.04
+SHELL ["/bin/bash", "-c"]
+
+ARG VER_SGLANG=main
+ARG VER_TORCH=2.7.1
+ARG VER_TORCHVISION=0.22.1
+ARG VER_TRITON=3.3.1
+
+RUN apt-get update && \
+    apt-get full-upgrade -y && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+    ca-certificates \
+    git \
+    curl \
+    wget \
+    vim \
+    gcc \
+    g++ \
+    make
+
+WORKDIR /sgl-workspace
+
+RUN curl -fsSL -v -o miniforge.sh -O https://github.com/conda-forge/miniforge/releases/download/24.11.3-2/Miniforge3-24.11.3-2-Linux-x86_64.sh && \
+    bash miniforge.sh -b -p ./miniforge3 && \
+    rm -f miniforge.sh && \
+    . miniforge3/bin/activate && \
+    conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl
+
+ENV PATH=/sgl-workspace/miniforge3/bin:/sgl-workspace/miniforge3/condabin:${PATH}
+ENV PIP_ROOT_USER_ACTION=ignore
+ENV CONDA_PREFIX=/sgl-workspace/miniforge3
+
+RUN pip config set global.index-url https://download.pytorch.org/whl/cpu && \
+    pip config set global.extra-index-url https://pypi.org/simple
+
+RUN git clone https://github.com/sgl-project/sglang.git && \
+    cd sglang && \
+    git checkout ${VER_SGLANG} && \
+    pip install -e "python[all_cpu]" && \
+    pip install torch==${VER_TORCH} torchvision==${VER_TORCHVISION} triton==${VER_TRITON} --force-reinstall && \
+    cd sgl-kernel && \
+    cp pyproject_cpu.toml pyproject.toml && \
+    pip install .
+
+ENV SGLANG_USE_CPU_ENGINE=1
+ENV LD_PRELOAD=/sgl-workspace/miniforge3/lib/libiomp5.so:/sgl-workspace/miniforge3/lib/libtcmalloc.so:/sgl-workspace/miniforge3/lib/libtbbmalloc.so.2
+
+WORKDIR /sgl-workspace/sglang
diff --git a/docker/compose.yaml b/docker/compose.yaml
new file mode 100644
index 000000000..f7ff1fbd5
--- /dev/null
+++ b/docker/compose.yaml
@@ -0,0 +1,35 @@
+services:
+  sglang:
+    image: lmsysorg/sglang:latest
+    container_name: sglang
+    volumes:
+      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
+      # If you use modelscope, you need mount this directory
+      # - ${HOME}/.cache/modelscope:/root/.cache/modelscope
+    restart: always
+    network_mode: host # required by RDMA
+    privileged: true # required by RDMA
+    # Or you can only publish port 30000
+    # ports:
+    #   - 30000:30000
+    environment:
+      HF_TOKEN: <secret>
+      # if you use modelscope to download model, you need set this environment
+      # - SGLANG_USE_MODELSCOPE: true
+    entrypoint: python3 -m sglang.launch_server
+    command: --model-path meta-llama/Llama-3.1-8B-Instruct
+      --host 0.0.0.0
+      --port 30000
+    ulimits:
+      memlock: -1
+      stack: 67108864
+    ipc: host
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0"]
+              capabilities: [gpu]
diff --git a/docker/k8s-sglang-distributed-sts.yaml b/docker/k8s-sglang-distributed-sts.yaml
new file mode 100644
index 000000000..4252363c7
--- /dev/null
+++ b/docker/k8s-sglang-distributed-sts.yaml
@@ -0,0 +1,103 @@
+# Two Nodes Sglang example
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: distributed-sglang
+spec:
+  replicas: 2   # number of nodes/pods to run distributed sglang
+  selector:
+    matchLabels:
+      app: distributed-sglang
+  serviceName: ""
+  template:
+    metadata:
+      labels:
+        app: distributed-sglang
+    spec:
+      containers:
+      - name: sglang-container
+        image: docker.io/lmsysorg/sglang:latest
+        imagePullPolicy: Always # image may be replaced by official CI versioned image
+        command:
+        - /bin/bash
+        - -c
+        # please modify the sglang serving arguments below, as necessary.
+        # NOTE: the --expert-parallel-size is for MoE model like DeepSeek-R1
+        args:
+        - |
+          python3 -m sglang.launch_server \
+          --model /llm-folder \
+          --dist-init-addr sglang-master-pod:5000 \
+          --tensor-parallel-size 16 \
+          --nnodes 2 \
+          --node-rank $POD_INDEX \
+          --trust-remote-code \
+          --host 0.0.0.0 \
+          --port 8000 \
+          --enable-metrics \
+          --expert-parallel-size 16
+        env:
+        - name: POD_INDEX     # reflects the node-rank
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
+        - name: NCCL_DEBUG
+          value: INFO
+        resources:
+          limits:
+            nvidia.com/gpu: "8"
+          requests:
+        volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+        - mountPath: /llm-folder
+          name: llm
+        securityContext:
+          privileged: true   # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true
+      hostNetwork: true
+      volumes:
+      - emptyDir:
+          medium: Memory
+          sizeLimit: 10Gi
+        name: dshm
+      - hostPath:
+          path: /llm-folder # replace with PVC or hostPath with your model weights
+          type: DirectoryOrCreate
+        name: llm
+      #- persistentVolumeClaim:
+      #  claimName: llm-pvc
+      #  name: llm
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: sglang-master-pod
+spec:
+  type: ClusterIP
+  selector:
+    app: distributed-sglang
+    apps.kubernetes.io/pod-index: "0"
+  ports:
+  - name: dist-port
+    port: 5000
+    targetPort: 5000
+---
+# the serving service
+apiVersion: v1
+kind: Service
+metadata:
+  name: sglang-serving-on-master
+spec:
+  type: NodePort
+  selector:
+    app: distributed-sglang
+    apps.kubernetes.io/pod-index: "0"
+  ports:
+  - name: serving
+    port: 8000
+    targetPort: 8000
+  - name: metrics
+    port: 8080
+    targetPort: 8080
diff --git a/docker/k8s-sglang-service.yaml b/docker/k8s-sglang-service.yaml
new file mode 100644
index 000000000..866d50be9
--- /dev/null
+++ b/docker/k8s-sglang-service.yaml
@@ -0,0 +1,117 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-31-8b-sglang
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 30Gi
+  storageClassName: default # change this to your preferred storage class
+  volumeMode: Filesystem
+---
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: nvidia
+handler: nvidia
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: meta-llama-31-8b-instruct-sglang
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: meta-llama-31-8b-instruct-sglang
+  template:
+    metadata:
+      labels:
+        app: meta-llama-31-8b-instruct-sglang
+        model: meta-llama-31-8b-instruct
+        engine: sglang
+    spec:
+      restartPolicy: Always
+      runtimeClassName: nvidia
+      containers:
+        - name: meta-llama-31-8b-instruct-sglang
+          image: docker.io/lmsysorg/sglang:latest
+          imagePullPolicy: Always # IfNotPresent or Never
+          ports:
+            - containerPort: 30000
+          command: ["python3", "-m", "sglang.launch_server"]
+          args:
+            [
+              "--model-path",
+              "meta-llama/Llama-3.1-8B-Instruct",
+              "--host",
+              "0.0.0.0",
+              "--port",
+              "30000",
+            ]
+          env:
+            - name: HF_TOKEN
+              value: <secret>
+          resources:
+            limits:
+              nvidia.com/gpu: 1
+              cpu: 8
+              memory: 40Gi
+            requests:
+              cpu: 2
+              memory: 16Gi
+              nvidia.com/gpu: 1
+          volumeMounts:
+            - name: shm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            - name: localtime
+              mountPath: /etc/localtime
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 30000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            timeoutSeconds: 10
+            failureThreshold: 3
+          readinessProbe:
+            httpGet:
+              path: /health_generate
+              port: 30000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            timeoutSeconds: 10
+            failureThreshold: 3
+            successThreshold: 1
+      volumes:
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 10Gi
+        - name: hf-cache
+          persistentVolumeClaim:
+            claimName: llama-31-8b-sglang
+        - name: localtime
+          hostPath:
+            path: /etc/localtime
+            type: File
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: meta-llama-31-8b-instruct-sglang
+spec:
+  selector:
+    app: meta-llama-31-8b-instruct-sglang
+  ports:
+    - protocol: TCP
+      port: 80 # port on host
+      targetPort: 30000 # port in container
+  type: LoadBalancer # change to ClusterIP if needed
diff --git a/docker/serve b/docker/serve
new file mode 100755
index 000000000..493ecbd23
--- /dev/null
+++ b/docker/serve
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+echo "Starting server"
+
+SERVER_ARGS="--host 0.0.0.0 --port 8080"
+
+if [ -n "$TENSOR_PARALLEL_DEGREE" ]; then
+    SERVER_ARGS="${SERVER_ARGS} --tp-size ${TENSOR_PARALLEL_DEGREE}"
+fi
+
+if [ -n "$DATA_PARALLEL_DEGREE" ]; then
+    SERVER_ARGS="${SERVER_ARGS} --dp-size ${DATA_PARALLEL_DEGREE}"
+fi
+
+if [ -n "$EXPERT_PARALLEL_DEGREE" ]; then
+    SERVER_ARGS="${SERVER_ARGS} --ep-size ${EXPERT_PARALLEL_DEGREE}"
+fi
+
+if [ -n "$MEM_FRACTION_STATIC" ]; then
+    SERVER_ARGS="${SERVER_ARGS} --mem-fraction-static ${MEM_FRACTION_STATIC}"
+fi
+
+if [ -n "$QUANTIZATION" ]; then
+    SERVER_ARGS="${SERVER_ARGS} --quantization ${QUANTIZATION}"
+fi
+
+if [ -n "$CHUNKED_PREFILL_SIZE" ]; then
+    SERVER_ARGS="${SERVER_ARGS} --chunked-prefill-size ${CHUNKED_PREFILL_SIZE}"
+fi
+
+python3 -m sglang.launch_server --model-path /opt/ml/model $SERVER_ARGS
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 000000000..6b8792c42
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,58 @@
+# Minimal Makefile for Sphinx documentation
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SPHINXAUTOBUILD ?= sphinx-autobuild
+SOURCEDIR     = .
+BUILDDIR      = _build
+PORT          ?= 8003
+
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+	@echo ""
+	@echo "Additional targets:"
+	@echo "  serve       to build and serve documentation with auto-build and live reload"
+
+# Compile Notebook files and record execution time
+compile:
+	@set -e; \
+	echo "Starting Notebook compilation..."; \
+	mkdir -p logs; \
+	echo "Notebook execution timings:" > logs/timing.log; \
+	START_TOTAL=$$(date +%s); \
+	find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print0 | \
+		parallel -0 -j3 --halt soon,fail=1 ' \
+		NB_NAME=$$(basename {}); \
+		START_TIME=$$(date +%s); \
+		retry --delay=0 --times=2 -- \
+			jupyter nbconvert --to notebook --execute --inplace "{}" \
+			--ExecutePreprocessor.timeout=600 \
+			--ExecutePreprocessor.kernel_name=python3; \
+		RET_CODE=$$?; \
+		END_TIME=$$(date +%s); \
+		ELAPSED_TIME=$$((END_TIME - START_TIME)); \
+		echo "$${NB_NAME}: $${ELAPSED_TIME}s" >> logs/timing.log; \
+		exit $$RET_CODE' || exit 1; \
+	END_TOTAL=$$(date +%s); \
+	TOTAL_ELAPSED=$$((END_TOTAL - START_TOTAL)); \
+	echo "---------------------------------" >> logs/timing.log; \
+	echo "Total execution time: $${TOTAL_ELAPSED}s" >> logs/timing.log; \
+	echo "All Notebook execution timings:" && cat logs/timing.log
+
+# Serve documentation with auto-build and live reload
+serve:
+	@echo "Starting auto-build server at http://0.0.0.0:$(PORT)"
+	@$(SPHINXAUTOBUILD) "$(SOURCEDIR)" "$(BUILDDIR)/html" \
+		--host 0.0.0.0 \
+		--port $(PORT) \
+		--watch $(SOURCEDIR) \
+		--re-ignore ".*\.(ipynb_checkpoints|pyc|pyo|pyd|git)"
+
+.PHONY: help Makefile compile clean serve
+
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+clean:
+	find . -name "*.ipynb" -exec nbstripout {} \;
+	rm -rf $(BUILDDIR)
+	rm -rf logs
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..2edb2c619
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,55 @@
+# SGLang Documentation
+
+We recommend new contributors start from writing documentation, which helps you quickly understand SGLang codebase.
+Most documentation files are located under the `docs/` folder.
+
+## Docs Workflow
+
+### Install Dependency
+
+```bash
+apt-get update && apt-get install -y pandoc parallel retry
+pip install -r requirements.txt
+```
+
+### Update Documentation
+
+Update your Jupyter notebooks in the appropriate subdirectories under `docs/`. If you add new files, remember to update `index.rst` (or relevant `.rst` files) accordingly.
+
+- **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request.
+
+```bash
+# 1) Compile all Jupyter notebooks
+make compile  # This step can take a long time (10+ mins). You can consider skipping this step if you can make sure your added files are correct.
+make html
+
+# 2) Compile and Preview documentation locally with auto-build
+# This will automatically rebuild docs when files change
+# Open your browser at the displayed port to view the docs
+bash serve.sh
+
+# 2a) Alternative ways to serve documentation
+# Directly use make serve
+make serve
+# With custom port
+PORT=8080 make serve
+
+# 3) Clean notebook outputs
+# nbstripout removes notebook outputs so your PR stays clean
+pip install nbstripout
+find . -name '*.ipynb' -exec nbstripout {} \;
+
+# 4) Pre-commit checks and create a PR
+# After these checks pass, push your changes and open a PR on your branch
+pre-commit run --all-files
+```
+---
+
+## Documentation Style Guidelines
+
+- For common functionalities, we prefer **Jupyter Notebooks** over Markdown so that all examples can be executed and validated by our docs CI pipeline. For complex features (e.g., distributed serving), Markdown is preferred.
+- Keep in mind the documentation execution time when writing interactive Jupyter notebooks. Each interactive notebook will be run and compiled against every commit to ensure they are runnable, so it is important to apply some tips to reduce the documentation compilation time:
+  - Use small models (e.g., `qwen/qwen2.5-0.5b-instruct`) for most cases to reduce server launch time.
+  - Reuse the launched server as much as possible to reduce server launch time.
+- Do not use absolute links (e.g., `https://docs.sglang.ai/get_started/install.html`). Always prefer relative links (e.g., `../get_started/install.md`).
+- Follow the existing examples to learn how to launch a server, send a query and other common styles.
diff --git a/docs/_static/css/custom_log.css b/docs/_static/css/custom_log.css
new file mode 100644
index 000000000..61f65d019
--- /dev/null
+++ b/docs/_static/css/custom_log.css
@@ -0,0 +1,29 @@
+.output_area {
+    color: #615656;
+}
+
+table.autosummary td {
+    width: 50%
+  }
+
+  img.align-center {
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+}
+
+.output_area.stderr {
+    color: #d3d3d3 !important;
+}
+
+.output_area.stdout {
+    color: #d3d3d3 !important;
+}
+
+div.output_area.stderr {
+    color: #d3d3d3 !important;
+}
+
+div.output_area.stdout {
+    color: #d3d3d3 !important;
+}
diff --git a/docs/_static/css/readthedocs.css b/docs/_static/css/readthedocs.css
new file mode 100644
index 000000000..aca6649b4
--- /dev/null
+++ b/docs/_static/css/readthedocs.css
@@ -0,0 +1,9 @@
+table.autosummary td {
+  width: 50%
+}
+
+img.align-center {
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+}
diff --git a/docs/_static/image/logo.ico b/docs/_static/image/logo.ico
new file mode 100644
index 000000000..7a0b5b2f0
Binary files /dev/null and b/docs/_static/image/logo.ico differ
diff --git a/docs/_static/image/logo.png b/docs/_static/image/logo.png
new file mode 100644
index 000000000..2a8bc258f
Binary files /dev/null and b/docs/_static/image/logo.png differ
diff --git a/docs/advanced_features/attention_backend.md b/docs/advanced_features/attention_backend.md
new file mode 100644
index 000000000..68e4318d8
--- /dev/null
+++ b/docs/advanced_features/attention_backend.md
@@ -0,0 +1,104 @@
+# Attention Backend
+
+SGLang supports multiple attention backends. Each of them has different pros and cons.
+You can test them according to your needs.
+
+## Supporting matrix for different attention backends
+
+| **Backend**              | **Page Size > 1** | **Spec Decoding** | **MLA** | **Sliding Window** | **MultiModal** |
+|--------------------------|-------------------|-------------------|---------|--------------------|----------------|
+| **FlashInfer**           | ❌                | ✅                 | ✅      | ✅                 | ✅              |
+| **FA3**                  | ✅                | ✅                 | ✅      | ✅                 | ✅              |
+| **Triton**               | ❌                | ✅                 | ✅      | ✅                 | ❌              |
+| **Torch Native**         | ❌                | ❌                 | ✅      | ❌                 | ❌              |
+| **FlashMLA**             | ✅                | ✅                 | ✅      | ❌                 | ❌              |
+| **TRTLLM MLA**           | ✅                | ❌                 | ✅      | ✅                 | ❌              |
+| **Ascend**               | ✅                | ❌                 | ✅      | ❌                 | ❌              |
+| **Wave**                 | ✅                | ❌                 | ❌      | ❌                 | ❌              |
+
+**Notes:**
+- TRTLLM MLA only implements decode operations. For prefill operations (including multimodal inputs), it falls back to FlashInfer MLA backend.
+
+Note: Every kernel backend is compatible with a page size > 1 by specifying an argument such as `--page-size 16`.
+This is because a page size of 16 can be converted to a page size of 1 in the kernel backend.
+The "❌" and "✅" symbols in the table above under "Page Size > 1" indicate whether the kernel actually operates with a page size greater than 1, rather than treating a page size of 16 as a page size of 1.
+
+## User guide
+
+### Launch command for different attention backends.
+
+- FlashInfer (Default for Non-Hopper Machines, e.g., A100, A40)
+```bash
+python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend flashinfer
+python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend flashinfer --trust-remote-code
+```
+
+- FlashAttention 3 (Default for Hopper Machines, e.g., H100, H200, H20)
+```bash
+python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend fa3
+python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --attention-backend fa3
+```
+
+- Triton
+```bash
+python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend triton
+python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-V3 --attention-backend triton --trust-remote-code
+```
+
+- Torch Native
+```bash
+python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend torch_native
+```
+
+- FlashMLA
+```bash
+python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend flashmla --trust-remote-code
+python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend flashmla --kv-cache-dtype fp8_e4m3 --trust-remote-code
+```
+
+- TRTLLM MLA (Optimized for Blackwell Architecture, e.g., B200)
+```bash
+python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend trtllm_mla --trust-remote-code
+```
+
+- TRTLLM MLA with FP8 KV Cache (Higher concurrency, lower memory footprint)
+```bash
+python3 -m sglang.launch_server --tp 8 --model deepseek-ai/DeepSeek-R1 --attention-backend trtllm_mla --kv-cache-dtype fp8_e4m3 --trust-remote-code
+```
+
+- Ascend
+```bash
+python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend ascend
+```
+
+- Wave
+```bash
+python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --attention-backend wave
+```
+
+## Steps to add a new attention backend
+To add a new attention backend, you can learn from the existing backends
+(`python/sglang/srt/layers/attention/triton_backend.py`, `python/sglang/srt/layers/attention/flashattention_backend.py`)
+and follow the steps below.
+
+1. Run without cuda graph. Support the two forward functions
+    - forward_extend
+        - Will be used for prefill, prefill with KV cache, and target verification
+        - It will be called once per layer
+    - forward_decode
+        - Will be used for normal decode, and draft decode
+        - It will be called once per layer
+    - init_forward_metadata
+        - Initialize the class and common metadata shared by all layers
+        - Call the plan function for optimizations like split_kv
+        - It will be called once per forward
+2. Run with cuda graph. It has two phases (capture and replay) and you need to implement three functions
+    - init_cuda_graph_state
+        - It will be called once during life time
+        - Create all common shared buffers
+    - init_forward_metadata_capture_cuda_graph
+        - It will be called before capturing a cuda graph
+        - It is similar to init_forward_metadata but write the medatada to some pre-defined buffers
+    - init_forward_metadata_replay_cuda_graph
+        - It will be called before replaying a cuda graph
+        - This function is in the critical path and needs to be fast
diff --git a/docs/advanced_features/hyperparameter_tuning.md b/docs/advanced_features/hyperparameter_tuning.md
new file mode 100644
index 000000000..e15ddd21c
--- /dev/null
+++ b/docs/advanced_features/hyperparameter_tuning.md
@@ -0,0 +1,77 @@
+# Hyperparameter Tuning
+
+## Achieving high throughput for offline batch inference
+
+Achieving a large batch size is the most important thing for attaining high throughput in offline batch inference.
+When the server is running at full load in a steady state, look for the following in the log:
+
+```Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, cuda graph: True, gen throughput (token/s): 4594.01, #queue-req: 317```
+
+### Adjust the request submission speed to control `#queue-req`
+
+`#queue-req` indicates the number of requests in the queue.
+If you frequently see `#queue-req: 0`, it suggests that your client code is submitting requests too slowly.
+A healthy range for `#queue-req` is `100 - 2000`.
+However, avoid making `#queue-req` too large, as this will increase the scheduling overhead on the server.
+
+### Achieve a high `token usage`
+
+`token usage` indicates the KV cache memory utilization of the server. `token usage > 0.9` means good utilization.
+
+If you frequently see `token usage < 0.9` and `#queue-req > 0`, it means the server is too conservative about taking in new requests. You can decrease `--schedule-conservativeness` to a value like 0.3.
+The case of a server being too conservative can happen when users send many requests with a large `max_new_tokens` but the requests stop very early due to EOS or stop strings.
+
+On the other hand, if you see `token usage` very high and you frequently see warnings like
+`KV cache pool is full. Retract requests. #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3.
+If you see `KV cache pool is full. Retract requests.` occasionally but not frequently, it is okay.
+
+### Tune `--mem-fraction-static` to increase KV cache pool capacity
+SGLang allocates memory as follows:
+
+Total memory usage = model weights + KV cache pool + CUDA graph buffers + activations
+
+The `--mem-fraction-static` parameter determines how much memory is allocated to the first two components:
+
+mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity
+
+To support higher concurrency, you should maximize the KV cache pool capacity by setting `--mem-fraction-static` as high as possible while still reserving enough memory for activations and CUDA graph buffers.
+
+SGLang uses simple heuristics to set the default value of `--mem-fraction-static`, but you can optimize it for your use cases.
+As a rule of thumb, reserving 5–8 GB of memory for activations is typically sufficient. You can check this by inspecting the logs just before the server is ready.
+Look for log entries like this:
+
+```
+[2025-08-11 17:17:03] max_total_num_tokens=665690, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=4096, context_len=65536, available_gpu_mem=13.50 GB
+```
+
+Check the `available_gpu_mem` value.
+- If it is between 5–8 GB, the setting is good.
+- If it is too high (e.g., 10 - 20 GB), increase `--mem-fraction-static` to allocate more memory to the KV cache.
+- If it is too low, you risk out-of-memory (OOM) errors later, so decrease `--mem-fraction-static`.
+
+Another straightforward approach is to increase `--mem-fraction-static` in increments of 0.01 until you encounter OOM errors for your workloads.
+
+### Avoid out-of-memory errors by tuning `--chunked-prefill-size`, `--mem-fraction-static`, and `--max-running-requests`
+
+If you encounter out-of-memory (OOM) errors, you can adjust the following parameters:
+
+- If OOM occurs during prefill, try reducing `--chunked-prefill-size` to `4096` or `2048`. This saves memory but slows down the prefill speed for long prompts.
+- If OOM occurs during decoding, try lowering `--max-running-requests`.
+- You can also reduce `--mem-fraction-static` to a smaller value, such as 0.8 or 0.7. This decreases the memory usage of the KV cache memory pool and helps prevent OOM errors during both prefill and decoding. However, it limits maximum concurrency and reduces peak throughput.
+
+### Tune `--cuda-graph-max-bs`
+By default, CUDA graph is enabled only for small batch sizes (e.g., less than 160 or 256).
+However, for some models, especially at large tensor parallelism sizes, CUDA graph can be useful for batch sizes up to 512 or 768.
+Therefore, it may be beneficial to increase `--cuda-graph-max-bs` to a larger value.
+Note that CUDA graph consumes more memory, so you may need to reduce `--mem-fraction-static` at the same time.
+
+### Tune `--dp-size` and `--tp-size`
+
+Data parallelism is better for throughput. When there is enough GPU memory, always favor data parallelism for throughput. Refer to [sglang router](../advanced_features/router.md) for a better data parallelism rather than using `dp_size` parameter.
+
+### Try other options
+
+- `torch.compile` accelerates small models on small batch sizes. You can enable it with `--enable-torch-compile`.
+- Try other quantization (e.g. FP8 quantization with `--quantization fp8`)
+- Try other parallelism strategies (e.g. [expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/)) or DP attention for deepseek models (with `--enable-dp-attention --dp-size 8`).
+- If the workload has many shared prefixes, try `--schedule-policy lpm`. Here, `lpm` stands for longest prefix match. It reorders requests to encourage more cache hits but introduces more scheduling overhead.
diff --git a/docs/advanced_features/lora.ipynb b/docs/advanced_features/lora.ipynb
new file mode 100644
index 000000000..1925baffc
--- /dev/null
+++ b/docs/advanced_features/lora.ipynb
@@ -0,0 +1,529 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# LoRA Serving"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "SGLang enables the use of [LoRA adapters](https://arxiv.org/abs/2106.09685) with a base model. By incorporating techniques from [S-LoRA](https://arxiv.org/pdf/2311.03285) and [Punica](https://arxiv.org/pdf/2310.18547), SGLang can efficiently support multiple LoRA adapters for different sequences within a single batch of inputs."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Arguments for LoRA Serving"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following server arguments are relevant for multi-LoRA serving:\n",
+    "\n",
+    "* `enable_lora`: Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.\n",
+    "\n",
+    "* `lora_paths`: The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {\"lora_name\":str,\"lora_path\":str,\"pinned\":bool}.\n",
+    "\n",
+    "* `max_loras_per_batch`: Maximum number of adaptors used by each batch. This argument can affect the amount of GPU memory reserved for multi-LoRA serving, so it should be set to a smaller value when memory is scarce. Defaults to be 8.\n",
+    "\n",
+    "* `max_loaded_loras`: If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `max-loras-per-batch`.\n",
+    "\n",
+    "* `lora_backend`: The backend of running GEMM kernels for Lora modules. Currently we only support Triton LoRA backend. In the future, faster backend built upon Cutlass or Cuda kernels will be added.\n",
+    "\n",
+    "* `max_lora_rank`: The maximum LoRA rank that should be supported. If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of larger LoRA rank after server startup.\n",
+    "\n",
+    "* `lora_target_modules`: The union set of all target modules where LoRA should be applied (e.g., `q_proj`, `k_proj`, `gate_proj`). If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of different target modules after server startup. You can also set it to `all` to enable LoRA for all supported modules. However, enabling LoRA on additional modules introduces a minor performance overhead. If your application is performance-sensitive, we recommend only specifying the modules for which you plan to load adapters.\n",
+    "\n",
+    "* `tp_size`: LoRA serving along with Tensor Parallelism is supported by SGLang. `tp_size` controls the number of GPUs for tensor parallelism. More details on the tensor sharding strategy can be found in [S-Lora](https://arxiv.org/pdf/2311.03285) paper.\n",
+    "\n",
+    "From client side, the user needs to provide a list of strings as input batch, and a list of adaptor names that each input sequence corresponds to."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage\n",
+    "\n",
+    "### Serving Single Adaptor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import requests\n",
+    "\n",
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, terminate_process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+    "    --enable-lora \\\n",
+    "    --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
+    "    --max-loras-per-batch 1 --lora-backend triton \\\n",
+    "    --log-level warning \\\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://127.0.0.1:{port}\"\n",
+    "json_data = {\n",
+    "    \"text\": [\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "    ],\n",
+    "    \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
+    "    # The first input uses lora0, and the second input uses the base model\n",
+    "    \"lora_path\": [\"lora0\", None],\n",
+    "}\n",
+    "response = requests.post(\n",
+    "    url + \"/generate\",\n",
+    "    json=json_data,\n",
+    ")\n",
+    "print(f\"Output 0: {response.json()[0]['text']}\")\n",
+    "print(f\"Output 1: {response.json()[1]['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Serving Multiple Adaptors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+    "    --enable-lora \\\n",
+    "    --lora-paths lora0=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n",
+    "    lora1=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n",
+    "    --max-loras-per-batch 2 --lora-backend triton \\\n",
+    "    --log-level warning \\\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://127.0.0.1:{port}\"\n",
+    "json_data = {\n",
+    "    \"text\": [\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "    ],\n",
+    "    \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
+    "    # The first input uses lora0, and the second input uses lora1\n",
+    "    \"lora_path\": [\"lora0\", \"lora1\"],\n",
+    "}\n",
+    "response = requests.post(\n",
+    "    url + \"/generate\",\n",
+    "    json=json_data,\n",
+    ")\n",
+    "print(f\"Output 0: {response.json()[0]['text']}\")\n",
+    "print(f\"Output 1: {response.json()[1]['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Dynamic LoRA loading"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Instead of specifying all adapters during server startup via `--lora-paths`. You can also load & unload LoRA adapters dynamically via the `/load_lora_adapter` and `/unload_lora_adapter` API.\n",
+    "\n",
+    "When using dynamic LoRA loading, it's recommended to explicitly specify both `--max-lora-rank` and `--lora-target-modules` at startup. For backward compatibility, SGLang will infer these values from `--lora-paths` if they are not explicitly provided. However, in that case, you would have to ensure that all dynamically loaded adapters share the same shape (rank and target modules) as those in the initial `--lora-paths` or are strictly \"smaller\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lora0 = \"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\"  # rank - 4, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj\n",
+    "lora1 = \"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"  # rank - 64, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj\n",
+    "lora0_new = \"philschmid/code-llama-3-1-8b-text-to-sql-lora\"  # rank - 256, target modules - q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj\n",
+    "\n",
+    "\n",
+    "# The `--target-lora-modules` param below is technically not needed, as the server will infer it from lora0 which already has all the target modules specified.\n",
+    "# We are adding it here just to demonstrate usage.\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+    "    --enable-lora \\\n",
+    "    --cuda-graph-max-bs 2 \\\n",
+    "    --max-loras-per-batch 2 --lora-backend triton \\\n",
+    "    --max-lora-rank 256\n",
+    "    --lora-target-modules all\n",
+    "    --log-level warning\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "url = f\"http://127.0.0.1:{port}\"\n",
+    "wait_for_server(url)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load adapter lora0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    url + \"/load_lora_adapter\",\n",
+    "    json={\n",
+    "        \"lora_name\": \"lora0\",\n",
+    "        \"lora_path\": lora0,\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "if response.status_code == 200:\n",
+    "    print(\"LoRA adapter loaded successfully.\", response.json())\n",
+    "else:\n",
+    "    print(\"Failed to load LoRA adapter.\", response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load adapter lora1:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    url + \"/load_lora_adapter\",\n",
+    "    json={\n",
+    "        \"lora_name\": \"lora1\",\n",
+    "        \"lora_path\": lora1,\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "if response.status_code == 200:\n",
+    "    print(\"LoRA adapter loaded successfully.\", response.json())\n",
+    "else:\n",
+    "    print(\"Failed to load LoRA adapter.\", response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check inference output:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://127.0.0.1:{port}\"\n",
+    "json_data = {\n",
+    "    \"text\": [\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "    ],\n",
+    "    \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
+    "    # The first input uses lora0, and the second input uses lora1\n",
+    "    \"lora_path\": [\"lora0\", \"lora1\"],\n",
+    "}\n",
+    "response = requests.post(\n",
+    "    url + \"/generate\",\n",
+    "    json=json_data,\n",
+    ")\n",
+    "print(f\"Output from lora0: \\n{response.json()[0]['text']}\\n\")\n",
+    "print(f\"Output from lora1 (updated): \\n{response.json()[1]['text']}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Unload lora0 and replace it with a different adapter:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    url + \"/unload_lora_adapter\",\n",
+    "    json={\n",
+    "        \"lora_name\": \"lora0\",\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "response = requests.post(\n",
+    "    url + \"/load_lora_adapter\",\n",
+    "    json={\n",
+    "        \"lora_name\": \"lora0\",\n",
+    "        \"lora_path\": lora0_new,\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "if response.status_code == 200:\n",
+    "    print(\"LoRA adapter loaded successfully.\", response.json())\n",
+    "else:\n",
+    "    print(\"Failed to load LoRA adapter.\", response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check output again:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://127.0.0.1:{port}\"\n",
+    "json_data = {\n",
+    "    \"text\": [\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "    ],\n",
+    "    \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
+    "    # The first input uses lora0, and the second input uses lora1\n",
+    "    \"lora_path\": [\"lora0\", \"lora1\"],\n",
+    "}\n",
+    "response = requests.post(\n",
+    "    url + \"/generate\",\n",
+    "    json=json_data,\n",
+    ")\n",
+    "print(f\"Output from lora0: \\n{response.json()[0]['text']}\\n\")\n",
+    "print(f\"Output from lora1 (updated): \\n{response.json()[1]['text']}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### LoRA GPU Pinning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Another advanced option is to specify adapters as `pinned` during loading. When an adapter is pinned, it is permanently assigned to one of the available GPU pool slots (as configured by `--max-loras-per-batch`) and will not be evicted from GPU memory during runtime. Instead, it remains resident until it is explicitly unloaded.\n",
+    "\n",
+    "This can improve performance in scenarios where the same adapter is frequently used across requests, by avoiding repeated memory transfers and reinitialization overhead. However, since GPU pool slots are limited, pinning adapters reduces the flexibility of the system to dynamically load other adapters on demand. If too many adapters are pinned, it may lead to degraded performance, or in the most extreme case (`Number of pinned adapters == max-loras-per-batch`), halt all unpinned requests. Therefore, currently SGLang limits maximal number of pinned adapters to `max-loras-per-batch - 1` to prevent unexpected starvations. \n",
+    "\n",
+    "In the example below, we start a server with `lora1` loaded as pinned, `lora2` and `lora3` loaded as regular (unpinned) adapters. Please note that, we intentionally specify `lora2` and `lora3` in two different formats to demonstrate that both are supported."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
+    "    --enable-lora \\\n",
+    "    --cuda-graph-max-bs 8 \\\n",
+    "    --max-loras-per-batch 3 --lora-backend triton \\\n",
+    "    --max-lora-rank 256 \\\n",
+    "    --lora-target-modules all \\\n",
+    "    --lora-paths \\\n",
+    "        {\"lora_name\":\"lora0\",\"lora_path\":\"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\",\"pinned\":true} \\\n",
+    "        {\"lora_name\":\"lora1\",\"lora_path\":\"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"} \\\n",
+    "        lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora\n",
+    "    --log-level warning\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "url = f\"http://127.0.0.1:{port}\"\n",
+    "wait_for_server(url)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can also specify adapter as pinned during dynamic adapter loading. In the example below, we reload `lora2` as pinned adapter:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    url + \"/unload_lora_adapter\",\n",
+    "    json={\n",
+    "        \"lora_name\": \"lora1\",\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "response = requests.post(\n",
+    "    url + \"/load_lora_adapter\",\n",
+    "    json={\n",
+    "        \"lora_name\": \"lora1\",\n",
+    "        \"lora_path\": \"algoprog/fact-generation-llama-3.1-8b-instruct-lora\",\n",
+    "        \"pinned\": True,  # Pin the adapter to GPU\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Verify that the results are expected:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://127.0.0.1:{port}\"\n",
+    "json_data = {\n",
+    "    \"text\": [\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "        \"List 3 countries and their capitals.\",\n",
+    "    ],\n",
+    "    \"sampling_params\": {\"max_new_tokens\": 32, \"temperature\": 0},\n",
+    "    # The first input uses lora0, and the second input uses lora1\n",
+    "    \"lora_path\": [\"lora0\", \"lora1\", \"lora2\"],\n",
+    "}\n",
+    "response = requests.post(\n",
+    "    url + \"/generate\",\n",
+    "    json=json_data,\n",
+    ")\n",
+    "print(f\"Output from lora0 (pinned): \\n{response.json()[0]['text']}\\n\")\n",
+    "print(f\"Output from lora1 (pinned): \\n{response.json()[1]['text']}\\n\")\n",
+    "print(f\"Output from lora2 (not pinned): \\n{response.json()[2]['text']}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Future Works\n",
+    "\n",
+    "The development roadmap for LoRA-related features can be found in this [issue](https://github.com/sgl-project/sglang/issues/2929). Other features, including Embedding Layer, Unified Paging, Cutlass backend are still under development."
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/advanced_features/observability.md b/docs/advanced_features/observability.md
new file mode 100644
index 000000000..f03fb3772
--- /dev/null
+++ b/docs/advanced_features/observability.md
@@ -0,0 +1,35 @@
+# Observability
+
+## Production Metrics
+SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when launching the server.
+You can query them by:
+```
+curl http://localhost:30000/metrics
+```
+
+See [Production Metrics](../references/production_metrics.md) for more details.
+
+## Logging
+
+By default, SGLang does not log any request contents. You can log them by using `--log-requests`.
+You can control the verbosity by using `--log-request-level`.
+See [Logging](server_arguments.md#logging) for more details.
+
+## Request Dump and Replay
+
+You can dump all requests and replay them later for benchmarking or other purposes.
+
+To start dumping, use the following command to send a request to a server:
+```
+python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000 --dump-requests-folder /tmp/sglang_request_dump --dump-requests-threshold 100
+```
+The server will dump the requests into a pickle file for every 100 requests.
+
+To replay the request dump, use `scripts/playground/replay_request_dump.py`.
+
+## Crash Dump and Replay
+Sometimes the server might crash, and you may want to debug the cause of the crash.
+SGLang supports crash dumping, which will dump all requests from the 5 minutes before the crash, allowing you to replay the requests and debug the reason later.
+
+To enable crash dumping, use `--crash-dump-folder /tmp/crash_dump`.
+To replay the crash dump, use `scripts/playground/replay_request_dump.py`.
diff --git a/docs/advanced_features/pd_disaggregation.md b/docs/advanced_features/pd_disaggregation.md
new file mode 100644
index 000000000..85a5db07e
--- /dev/null
+++ b/docs/advanced_features/pd_disaggregation.md
@@ -0,0 +1,150 @@
+# PD Disaggregation
+
+## Why and What is PD Disaggregation?
+
+Large Language Model (LLM) inference comprises two distinct phases: **Prefill** and **Decode**. The Prefill phase is computation-intensive, processing the entire input sequence, while the Decode phase is memory-intensive, managing the Key-Value (KV) cache for token generation. Traditionally, these phases are handled within a unified engine, where combined scheduling of prefill and decode batches introduces inefficiencies. To address these challenges, we introduce **Prefill and Decoding (PD) Disaggregation** in SGLang.
+
+### Issues with Unified Scheduling
+
+The conventional unified engine, which processes prefill and decode batches together, results in two significant problems:
+
+1. **Prefill Interruption**: Incoming prefill batches frequently interrupt ongoing decode batches, causing substantial delays in token generation.
+2. **DP Attention Imbalance**: In data-parallel (DP) attention, one DP worker may process a prefill batch while another handles a decode batch simultaneously, leading to increased decode latency.
+
+PD Disaggregation resolves these by separating the two stages, enabling tailored optimizations for each.
+
+For the design details, please refer to [link](https://docs.google.com/document/d/1rQXJwKd5b9b1aOzLh98mnyMhBMhlxXA5ATZTHoQrwvc/edit?tab=t.0).
+
+Currently, we support Mooncake and NIXL as the transfer engine.
+
+## Router Integration
+
+For deploying PD disaggregation at scale with load balancing and fault tolerance, SGLang provides a router. The router can distribute requests between prefill and decode instances using various routing policies. For detailed information on setting up routing with PD disaggregation, including configuration options and deployment patterns, see the [SGLang Router documentation](router.md#mode-3-prefill-decode-disaggregation).
+
+
+## Mooncake
+### Requirements
+
+```bash
+uv pip install mooncake-transfer-engine
+```
+
+### Usage
+
+### Llama Single Node
+
+```bash
+$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-ib-device mlx5_roce0
+$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-ib-device mlx5_roce0
+$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
+```
+
+### DeepSeek Multi-Node
+
+```bash
+# prefill 0
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
+# prefill 1
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
+# decode 0
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
+# decode 1
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-ib-device ${device_name} --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
+```
+### Advanced Configuration
+
+PD Disaggregation with Mooncake supports the following environment variables for fine-grained control over system behavior.
+
+#### Prefill Server Configuration
+| Variable | Description | Default |
+|:--------:|:-----------:|:--------:
+| **`SGLANG_DISAGGREGATION_THREAD_POOL_SIZE`** | Controls the total number of worker threads for KVCache transfer operations per TP rank | A dynamic value calculated by `int(0.75 * os.cpu_count()) // 8)`, which is limited to be larger than 4 and less than 12 to ensure efficiency and prevent thread race conditions |
+| **`SGLANG_DISAGGREGATION_QUEUE_SIZE`** | Sets the number of parallel transfer queues. KVCache transfer requests from multiple decode instances will be sharded into these queues so that they can share the threads and the transfer bandwidth at the same time. If it is set to `1`, then we transfer requests one by one according to fcfs strategy | `4` |
+| **`SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT`** | Timeout (seconds) for receiving destination KV indices during request initialization | `300` |
+
+If a greater mean TTFT is acceptable, you can `export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600` (10 minutes) to relax the timeout condition.
+Please be aware that this setting will cause prefill instances to take a longer time to clean up the affected memory resources when a running decode node loses connection.
+
+#### Decode Server Configuration
+| Variable | Description | Default |
+|:--------:|:-----------:|:--------:
+| **`SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL`** | Interval (seconds) between health checks to prefill bootstrap servers | `5.0` |
+| **`SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE`** | Consecutive heartbeat failures before marking prefill server offline | `2` |
+| **`SGLANG_DISAGGREGATION_WAITING_TIMEOUT`** | Timeout (seconds) for receiving KV Cache after request initialization | `300` |
+
+If a greater mean TTFT is acceptable, you can `export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600` (10 minutes) to relax the timeout condition.
+
+
+## NIXL
+### Requirements
+
+Install via pip.
+
+```bash
+pip install nixl
+```
+
+Or build from source - may be required if you already have UCX installed.
+
+```bash
+git clone https://github.com/ai-dynamo/nixl.git
+cd nixl
+pip install . --config-settings=setup-args="-Ducx_path=/path/to/ucx"
+```
+
+
+### Usage
+
+### Llama Single Node
+
+```bash
+$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-transfer-backend nixl
+$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-transfer-backend nixl
+$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
+```
+
+### DeepSeek Multi-Node
+
+```bash
+# prefill 0
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
+# prefill 1
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8
+# decode 0
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 0 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
+# decode 1
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend nixl --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 2 --node-rank 1 --tp-size 16 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep --mem-fraction-static 0.8 --max-running-requests 128
+```
+
+## ASCEND
+
+### Usage
+
+Use ascend backend with [mf_adapter(download link)](https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com:443/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl?AccessKeyId=HPUAXT4YM0U8JNTERLST&Expires=1783151861&Signature=3j10QDUjqk70enaq8lostYV2bEA%3D) and ASCEND_MF_STORE_URL being set
+
+```bash
+pip install mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl --force-reinstall
+export ASCEND_MF_STORE_URL="tcp://xxx.xx.xxx.xxx:xxxx"
+```
+Use mooncake backend, more details can be found in mooncake section.
+```bash
+export ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE=true
+```
+
+
+### Llama Single Node
+
+```bash
+$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode prefill --disaggregation-transfer-backend ascend
+$ python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disaggregation-mode decode --port 30001 --base-gpu-id 1 --disaggregation-transfer-backend ascend
+$ python -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
+```
+
+### DeepSeek Multi-Node
+
+```bash
+# prefill 0
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend ascend --disaggregation-mode prefill --host ${local_ip} --port 30000 --trust-remote-code --dist-init-addr ${prefill_master_ip}:5000 --nnodes 1 --node-rank 0 --tp-size 16
+# decode 0
+$ python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --disaggregation-transfer-backend ascend --disaggregation-mode decode --host ${local_ip} --port 30001 --trust-remote-code --dist-init-addr ${decode_master_ip}:5000 --nnodes 1 --node-rank 0 --tp-size 16
+```
diff --git a/docs/advanced_features/quantization.md b/docs/advanced_features/quantization.md
new file mode 100644
index 000000000..3a229f83d
--- /dev/null
+++ b/docs/advanced_features/quantization.md
@@ -0,0 +1,152 @@
+# Quantization
+
+SGLang supports various quantization methods, including offline quantization and online dynamic quantization.
+
+Offline quantization loads pre-quantized model weights directly during inference. This is required for quantization methods
+such as GPTQ and AWQ, which collect and pre-compute various statistics from the original weights using the calibration dataset.
+
+Online quantization dynamically computes scaling parameters—such as the maximum/minimum values of model weights—during runtime.
+Like NVIDIA FP8 training's [delayed scaling](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html#Mixed-precision-training-with-FP8) mechanism, online quantization calculates the appropriate scaling factors
+on-the-fly to convert high-precision weights into a lower-precision format.
+
+**Note: For better performance, usability and convenience, offline quantization is recommended over online quantization.**
+
+If you use a pre-quantized model, do not add `--quantization` to enable online quantization at the same time.
+For popular pre-quantized models, please visit [ModelCloud](https://huggingface.co/collections/ModelCloud/vortex-673743382af0a52b2a8b9fe2)
+or [NeuralMagic](https://huggingface.co/collections/neuralmagic) collections on HF for some
+popular quality validated quantized models. Quantized models must be validated via benchmarks post-quantization
+to guard against abnormal quantization loss regressions.
+
+## Offline Quantization
+
+To load already quantized models, simply load the model weights and config. **Again, if the model has been quantized offline,
+there's no need to add `--quantization` argument when starting the engine. The quantization method will be parsed from the
+downloaded Hugging Face config. For example, DeepSeek V3/R1 models are already in FP8, so do not add redundant parameters.**
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 \
+    --port 30000 --host 0.0.0.0
+```
+
+Take note, if your model is **per-channel quantized (INT8 or FP8) with per-token dynamic quantization activation**, you can opt to include `--quantization w8a8_int8` or `--quantization w8a8_fp8` to invoke the corresponding CUTLASS int8_kernel or fp8_kernel in sgl-kernel. This action will ignore the Hugging Face config's quantization settings. For instance, with `neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic`, if you execute with `--quantization w8a8_fp8`, the system will use the `W8A8Fp8Config` from SGLang to invoke the sgl-kernel, rather than the `CompressedTensorsConfig` for vLLM kernels.
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic \
+    --quantization w8a8_fp8 \
+    --port 30000 --host 0.0.0.0
+```
+
+### Examples of Offline Model Quantization
+
+#### Using [GPTQModel](https://github.com/ModelCloud/GPTQModel)
+
+```bash
+# install
+pip install gptqmodel --no-build-isolation -v
+```
+
+```py
+from datasets import load_dataset
+from gptqmodel import GPTQModel, QuantizeConfig
+
+model_id = "meta-llama/Llama-3.2-1B-Instruct"
+quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
+
+calibration_dataset = load_dataset(
+    "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz",
+    split="train"
+  ).select(range(1024))["text"]
+
+quant_config = QuantizeConfig(bits=4, group_size=128) # quantization config
+model = GPTQModel.load(model_id, quant_config) # load model
+
+model.quantize(calibration_dataset, batch_size=2) # quantize
+model.save(quant_path) # save model
+```
+
+#### Using [LLM Compressor](https://github.com/vllm-project/llm-compressor/)
+
+```bash
+# install
+pip install llmcompressor
+```
+
+Here, we take quantize `meta-llama/Meta-Llama-3-8B-Instruct` to `FP8` as an example to elaborate on how to do offline quantization.
+
+```python
+from transformers import AutoTokenizer
+from llmcompressor.transformers import SparseAutoModelForCausalLM
+from llmcompressor.transformers import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+# Step 1: Load the original model.
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+model = SparseAutoModelForCausalLM.from_pretrained(
+  MODEL_ID, device_map="auto", torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Step 2: Perform offline quantization.
+# Step 2.1: Configure the simple PTQ quantization.
+recipe = QuantizationModifier(
+  targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+
+# Step 2.2: Apply the quantization algorithm.
+oneshot(model=model, recipe=recipe)
+
+# Step 3: Save the model.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+Then, you can directly use the quantized model with `SGLang`, by using the following command:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path $PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic \
+    --port 30000 --host 0.0.0.0
+```
+
+## Online Quantization
+
+To enable online quantization, you can simply specify `--quantization` in the command line. For example, you can launch the server with the following command to enable `FP8` quantization for model `meta-llama/Meta-Llama-3.1-8B-Instruct`:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --quantization fp8 \
+    --port 30000 --host 0.0.0.0
+```
+
+Our team is working on supporting more online quantization methods. SGLang will soon support methods including but not limited to `["awq", "gptq", "marlin", "gptq_marlin", "awq_marlin", "bitsandbytes", "gguf"]`.
+
+SGLang also supports quantization methods based on [torchao](https://github.com/pytorch/ao). You can simply specify `--torchao-config` in the command line to support this feature. For example, if you want to enable `int4wo-128` for model `meta-llama/Meta-Llama-3.1-8B-Instruct`, you can launch the server with the following command:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --torchao-config int4wo-128 \
+    --port 30000 --host 0.0.0.0
+```
+
+SGLang supports the following quantization methods based on torchao `["int8dq", "int8wo", "fp8wo", "fp8dq-per_tensor", "fp8dq-per_row", "int4wo-32", "int4wo-64", "int4wo-128", "int4wo-256"]`.
+
+Note: According to [this issue](https://github.com/sgl-project/sglang/issues/2219#issuecomment-2561890230), `"int8dq"` method currently has some bugs when using together with cuda graph capture. So we suggest to disable cuda graph capture when using `"int8dq"` method. Namely, please use the following command:
+
+```bash
+python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --torchao-config int8dq \
+    --disable-cuda-graph \
+    --port 30000 --host 0.0.0.0
+```
+
+## Reference
+
+- [GPTQModel](https://github.com/ModelCloud/GPTQModel)
+- [LLM Compressor](https://github.com/vllm-project/llm-compressor/)
+- [Torchao: PyTorch Architecture Optimization](https://github.com/pytorch/ao)
+- [vLLM Quantization](https://docs.vllm.ai/en/latest/quantization/)
diff --git a/docs/advanced_features/router.md b/docs/advanced_features/router.md
new file mode 100644
index 000000000..4aba99f37
--- /dev/null
+++ b/docs/advanced_features/router.md
@@ -0,0 +1,445 @@
+# SGLang Router
+
+The SGLang Router is a high-performance request distribution system that routes inference requests across multiple SGLang runtime instances. It features cache-aware load balancing, fault tolerance, and support for advanced deployment patterns including data parallelism and prefill-decode disaggregation.
+
+## Key Features
+
+- **Cache-Aware Load Balancing**: Optimizes cache utilization while maintaining balanced load distribution
+- **Multiple Routing Policies**: Choose from random, round-robin, cache-aware, or power-of-two policies
+- **Fault Tolerance**: Automatic retry and circuit breaker mechanisms for resilient operation
+- **Dynamic Scaling**: Add or remove workers at runtime without service interruption
+- **Kubernetes Integration**: Native service discovery and pod management
+- **Prefill-Decode Disaggregation**: Support for disaggregated serving load balancing
+- **Prometheus Metrics**: Built-in observability and monitoring
+
+## Installation
+
+```bash
+pip install sglang-router
+```
+
+## Quick Start
+
+To see all available options:
+
+```bash
+python -m sglang_router.launch_server --help  # Co-launch router and workers
+python -m sglang_router.launch_router --help  # Launch router only
+```
+
+## Deployment Modes
+
+The router supports three primary deployment patterns:
+
+1. **Co-launch Mode**: Router and workers launch together (simplest for single-node deployments)
+2. **Separate Launch Mode**: Router and workers launch independently (best for multi-node setups)
+3. **Prefill-Decode Disaggregation**: Specialized mode for disaggregated serving
+
+### Mode 1: Co-launch Router and Workers
+
+This mode launches both the router and multiple worker instances in a single command. It's the simplest deployment option and replaces the `--dp-size` argument of SGLang Runtime.
+
+```bash
+# Launch router with 4 workers
+python -m sglang_router.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --dp-size 4 \
+    --host 0.0.0.0 \
+    --port 30000
+```
+
+#### Sending Requests
+
+Once the server is ready, send requests to the router endpoint:
+
+```python
+import requests
+
+# Using the /generate endpoint
+url = "http://localhost:30000/generate"
+data = {
+    "text": "What is the capital of France?",
+    "sampling_params": {
+        "temperature": 0.7,
+        "max_new_tokens": 100
+    }
+}
+
+response = requests.post(url, json=data)
+print(response.json())
+
+# OpenAI-compatible endpoint
+url = "http://localhost:30000/v1/chat/completions"
+data = {
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}]
+}
+
+response = requests.post(url, json=data)
+print(response.json())
+```
+
+### Mode 2: Separate Launch Mode
+
+This mode is ideal for multi-node deployments where workers run on different machines.
+
+#### Step 1: Launch Workers
+
+On each worker node:
+
+```bash
+# Worker node 1
+python -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 8000
+
+# Worker node 2
+python -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 8001
+```
+
+#### Step 2: Launch Router
+
+On the router node:
+
+```bash
+python -m sglang_router.launch_router \
+    --worker-urls http://worker1:8000 http://worker2:8001 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --policy cache_aware  # or random, round_robin, power_of_two
+```
+
+### Mode 3: Prefill-Decode Disaggregation
+
+This advanced mode separates prefill and decode operations for optimized performance:
+
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --prefill http://prefill1:8000 9000 \
+    --prefill http://prefill2:8001 9001 \
+    --decode http://decode1:8002 \
+    --decode http://decode2:8003 \
+    --prefill-policy cache_aware \
+    --decode-policy round_robin
+```
+
+#### Understanding --prefill Arguments
+
+The `--prefill` flag accepts URLs with optional bootstrap ports:
+- `--prefill http://server:8000` - No bootstrap port
+- `--prefill http://server:8000 9000` - Bootstrap port 9000
+- `--prefill http://server:8000 none` - Explicitly no bootstrap port
+
+#### Policy Inheritance in PD Mode
+
+The router intelligently handles policy configuration for prefill and decode nodes:
+
+1. **Only `--policy` specified**: Both prefill and decode nodes use this policy
+2. **`--policy` and `--prefill-policy` specified**: Prefill nodes use `--prefill-policy`, decode nodes use `--policy`
+3. **`--policy` and `--decode-policy` specified**: Prefill nodes use `--policy`, decode nodes use `--decode-policy`
+4. **All three specified**: Prefill nodes use `--prefill-policy`, decode nodes use `--decode-policy` (main `--policy` is ignored)
+
+Example with mixed policies:
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --prefill http://prefill1:8000
+    --prefill http://prefill2:8000 \
+    --decode http://decode1:8001
+    --decode http://decode2:8001 \
+    --policy round_robin \
+    --prefill-policy cache_aware  # Prefill uses cache_aware and decode uses round_robin from --policy
+```
+
+#### PD Mode with Service Discovery
+
+For Kubernetes deployments with separate prefill and decode server pools:
+
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --service-discovery \
+    --prefill-selector app=prefill-server tier=gpu \
+    --decode-selector app=decode-server tier=cpu \
+    --service-discovery-namespace production \
+    --prefill-policy cache_aware \
+    --decode-policy round_robin
+```
+
+## Dynamic Scaling
+
+The router supports runtime scaling through REST APIs:
+
+### Adding Workers
+
+```bash
+# Launch a new worker
+python -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --port 30001
+
+# Add it to the router
+curl -X POST "http://localhost:30000/add_worker?url=http://127.0.0.1:30001"
+```
+
+### Removing Workers
+
+```bash
+curl -X POST "http://localhost:30000/remove_worker?url=http://127.0.0.1:30001"
+```
+
+**Note**: When using cache-aware routing, removed workers are cleanly evicted from the routing tree and request queues.
+
+## Fault Tolerance
+
+The router includes comprehensive fault tolerance mechanisms:
+
+### Retry Configuration
+
+```bash
+python -m sglang_router.launch_router \
+    --worker-urls http://worker1:8000 http://worker2:8001 \
+    --retry-max-retries 3 \
+    --retry-initial-backoff-ms 100 \
+    --retry-max-backoff-ms 10000 \
+    --retry-backoff-multiplier 2.0 \
+    --retry-jitter-factor 0.1
+```
+
+### Circuit Breaker
+
+Protects against cascading failures:
+
+```bash
+python -m sglang_router.launch_router \
+    --worker-urls http://worker1:8000 http://worker2:8001 \
+    --cb-failure-threshold 5 \
+    --cb-success-threshold 2 \
+    --cb-timeout-duration-secs 30 \
+    --cb-window-duration-secs 60
+```
+
+**Behavior**:
+- Worker is marked unhealthy after `cb-failure-threshold` consecutive failures
+- Returns to service after `cb-success-threshold` successful health checks
+- Circuit breaker can be disabled with `--disable-circuit-breaker`
+
+## Routing Policies
+
+The router supports multiple routing strategies:
+
+### 1. Random Routing
+Distributes requests randomly across workers.
+
+```bash
+--policy random
+```
+
+### 2. Round-Robin Routing
+Cycles through workers in order.
+
+```bash
+--policy round_robin
+```
+
+### 3. Power of Two Choices
+Samples two workers and routes to the less loaded one.
+
+```bash
+--policy power_of_two
+```
+
+### 4. Cache-Aware Load Balancing (Default)
+
+The most sophisticated policy that combines cache optimization with load balancing:
+
+```bash
+--policy cache_aware \
+--cache-threshold 0.5 \
+--balance-abs-threshold 32 \
+--balance-rel-threshold 1.0001
+```
+
+#### How It Works
+
+1. **Load Assessment**: Checks if the system is balanced
+   - Imbalanced if: `(max_load - min_load) > balance_abs_threshold` AND `max_load > balance_rel_threshold * min_load`
+
+2. **Routing Decision**:
+   - **Balanced System**: Uses cache-aware routing
+     - Routes to worker with highest prefix match if match > `cache_threshold`
+     - Otherwise routes to worker with most available cache capacity
+   - **Imbalanced System**: Uses shortest queue routing to the least busy worker
+
+3. **Cache Management**:
+   - Maintains approximate radix trees per worker
+   - Periodically evicts LRU entries based on `--eviction-interval-secs` and `--max-tree-size`
+
+### Data Parallelism Aware Routing
+
+Enables fine-grained control over data parallel replicas:
+
+```bash
+--dp-aware \
+--api-key your_api_key  # Required for worker authentication
+```
+
+This mode coordinates with SGLang's DP controller for optimized request distribution across data parallel ranks.
+
+## Configuration Reference
+
+### Core Settings
+
+| Parameter                   | Type | Default     | Description                                                     |
+| --------------------------- | ---- | ----------- | --------------------------------------------------------------- |
+| `--host`                    | str  | 127.0.0.1   | Router server host address                                      |
+| `--port`                    | int  | 30000       | Router server port                                              |
+| `--worker-urls`             | list | []          | Worker URLs for separate launch mode                            |
+| `--policy`                  | str  | cache_aware | Routing policy (random, round_robin, cache_aware, power_of_two) |
+| `--max-concurrent-requests` | int  | 64          | Maximum concurrent requests (rate limiting)                     |
+| `--request-timeout-secs`    | int  | 600         | Request timeout in seconds                                      |
+| `--max-payload-size`        | int  | 256MB       | Maximum request payload size                                    |
+
+### Cache-Aware Routing Parameters
+
+| Parameter                  | Type  | Default  | Description                                            |
+| -------------------------- | ----- | -------- | ------------------------------------------------------ |
+| `--cache-threshold`        | float | 0.5      | Minimum prefix match ratio for cache routing (0.0-1.0) |
+| `--balance-abs-threshold`  | int   | 32       | Absolute load difference threshold                     |
+| `--balance-rel-threshold`  | float | 1.0001   | Relative load ratio threshold                          |
+| `--eviction-interval-secs` | int   | 60       | Seconds between cache eviction cycles                  |
+| `--max-tree-size`          | int   | 16777216 | Maximum nodes in routing tree                          |
+
+### Fault Tolerance Parameters
+
+| Parameter                    | Type  | Default | Description                           |
+| ---------------------------- | ----- | ------- | ------------------------------------- |
+| `--retry-max-retries`        | int   | 3       | Maximum retry attempts per request    |
+| `--retry-initial-backoff-ms` | int   | 100     | Initial retry backoff in milliseconds |
+| `--retry-max-backoff-ms`     | int   | 10000   | Maximum retry backoff in milliseconds |
+| `--retry-backoff-multiplier` | float | 2.0     | Backoff multiplier between retries    |
+| `--retry-jitter-factor`      | float | 0.1     | Random jitter factor for retries      |
+| `--disable-retries`          | flag  | False   | Disable retry mechanism               |
+| `--cb-failure-threshold`     | int   | 5       | Failures before circuit opens         |
+| `--cb-success-threshold`     | int   | 2       | Successes to close circuit            |
+| `--cb-timeout-duration-secs` | int   | 30      | Circuit breaker timeout duration      |
+| `--cb-window-duration-secs`  | int   | 60      | Circuit breaker window duration       |
+| `--disable-circuit-breaker`  | flag  | False   | Disable circuit breaker               |
+
+### Prefill-Decode Disaggregation Parameters
+
+| Parameter                         | Type | Default | Description                                           |
+| --------------------------------- | ---- | ------- | ----------------------------------------------------- |
+| `--pd-disaggregation`             | flag | False   | Enable PD disaggregated mode                          |
+| `--prefill`                       | list | []      | Prefill server URLs with optional bootstrap ports     |
+| `--decode`                        | list | []      | Decode server URLs                                    |
+| `--prefill-policy`                | str  | None    | Routing policy for prefill nodes (overrides --policy) |
+| `--decode-policy`                 | str  | None    | Routing policy for decode nodes (overrides --policy)  |
+| `--worker-startup-timeout-secs`   | int  | 300     | Timeout for worker startup                            |
+| `--worker-startup-check-interval` | int  | 10      | Interval between startup checks                       |
+
+### Kubernetes Integration
+
+| Parameter                       | Type | Default                  | Description                                          |
+| ------------------------------- | ---- | ------------------------ | ---------------------------------------------------- |
+| `--service-discovery`           | flag | False                    | Enable Kubernetes service discovery                  |
+| `--selector`                    | list | []                       | Label selector for workers (key1=value1 key2=value2) |
+| `--prefill-selector`            | list | []                       | Label selector for prefill servers in PD mode        |
+| `--decode-selector`             | list | []                       | Label selector for decode servers in PD mode         |
+| `--service-discovery-port`      | int  | 80                       | Port for discovered pods                             |
+| `--service-discovery-namespace` | str  | None                     | Kubernetes namespace to watch                        |
+| `--bootstrap-port-annotation`   | str  | sglang.ai/bootstrap-port | Annotation for bootstrap ports                       |
+
+### Observability
+
+| Parameter              | Type | Default   | Description                                           |
+| ---------------------- | ---- | --------- | ----------------------------------------------------- |
+| `--prometheus-port`    | int  | 29000     | Prometheus metrics port                               |
+| `--prometheus-host`    | str  | 127.0.0.1 | Prometheus metrics host                               |
+| `--log-dir`            | str  | None      | Directory for log files                               |
+| `--log-level`          | str  | info      | Logging level (debug, info, warning, error, critical) |
+| `--request-id-headers` | list | None      | Custom headers for request tracing                    |
+
+### CORS Configuration
+
+| Parameter                | Type | Default | Description          |
+| ------------------------ | ---- | ------- | -------------------- |
+| `--cors-allowed-origins` | list | []      | Allowed CORS origins |
+
+## Advanced Features
+
+### Kubernetes Service Discovery
+
+Automatically discover and manage workers in Kubernetes:
+
+#### Standard Mode
+```bash
+python -m sglang_router.launch_router \
+    --service-discovery \
+    --selector app=sglang-worker env=prod \
+    --service-discovery-namespace production \
+    --service-discovery-port 8000
+```
+
+#### Prefill-Decode Disaggregation Mode
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --service-discovery \
+    --prefill-selector app=prefill-server env=prod \
+    --decode-selector app=decode-server env=prod \
+    --service-discovery-namespace production
+```
+
+**Note**: The `--bootstrap-port-annotation` (default: `sglang.ai/bootstrap-port`) is used to discover bootstrap ports for prefill servers in PD mode. Prefill pods should have this annotation set to their bootstrap port value.
+
+### Prometheus Metrics
+
+Expose metrics for monitoring:
+
+```bash
+python -m sglang_router.launch_router \
+    --worker-urls http://worker1:8000 http://worker2:8001 \
+    --prometheus-port 29000 \
+    --prometheus-host 0.0.0.0
+```
+
+Metrics available at `http://localhost:29000/metrics`
+
+### Request Tracing
+
+Enable request ID tracking:
+
+```bash
+python -m sglang_router.launch_router \
+    --worker-urls http://worker1:8000 http://worker2:8001 \
+    --request-id-headers x-request-id x-trace-id
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Workers not connecting**: Ensure workers are fully initialized before starting the router. Use `--worker-startup-timeout-secs` to increase wait time.
+
+2. **High latency**: Check if cache-aware routing is causing imbalance. Try adjusting `--balance-abs-threshold` and `--balance-rel-threshold`.
+
+3. **Memory growth**: Reduce `--max-tree-size` or decrease `--eviction-interval-secs` for more aggressive cache cleanup.
+
+4. **Circuit breaker triggering frequently**: Increase `--cb-failure-threshold` or extend `--cb-window-duration-secs`.
+
+### Debug Mode
+
+Enable detailed logging:
+
+```bash
+python -m sglang_router.launch_router \
+    --worker-urls http://worker1:8000 http://worker2:8001 \
+    --log-level debug \
+    --log-dir ./router_logs
+```
diff --git a/docs/advanced_features/separate_reasoning.ipynb b/docs/advanced_features/separate_reasoning.ipynb
new file mode 100644
index 000000000..0c20c5a08
--- /dev/null
+++ b/docs/advanced_features/separate_reasoning.ipynb
@@ -0,0 +1,381 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Reasoning Parser\n",
+    "\n",
+    "SGLang supports parsing reasoning content out from \"normal\" content for reasoning models such as [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1).\n",
+    "\n",
+    "## Supported Models & Parsers\n",
+    "\n",
+    "| Model  |  Reasoning tags      | Parser | Notes |\n",
+    "|---------|-----------------------------|------------------|-------|\n",
+    "| [DeepSeek‑R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `<think>` … `</think>` | `deepseek-r1` | Supports all variants (R1, R1-0528, R1-Distill) |\n",
+    "| [DeepSeek‑V3.1](https://huggingface.co/deepseek-ai/DeepSeek-V3.1) | `<think>` … `</think>` | `deepseek-v3` | Supports `thinking` parameter |\n",
+    "| [Standard Qwen3 models](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `<think>` … `</think>` | `qwen3` | Supports `enable_thinking` parameter |\n",
+    "| [Qwen3-Thinking models](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) | `<think>` … `</think>` | `qwen3` or `qwen3-thinking` | Always generates thinking content |\n",
+    "| [Kimi models](https://huggingface.co/moonshotai/models) | `◁think▷` … `◁/think▷` | `kimi` | Uses special thinking delimiters |\n",
+    "| [GPT OSS](https://huggingface.co/openai/gpt-oss-120b) | `<\\|channel\\|>analysis<\\|message\\|>` … `<\\|end\\|>` | `gpt-oss` | N/A |\n",
+    "### Model-Specific Behaviors\n",
+    "\n",
+    "**DeepSeek-R1 Family:**\n",
+    "- DeepSeek-R1: No `<think>` start tag, jumps directly to thinking content\n",
+    "- DeepSeek-R1-0528: Generates both `<think>` start and `</think>` end tags\n",
+    "- Both are handled by the same `deepseek-r1` parser\n",
+    "\n",
+    "**DeepSeek-V3 Family:**\n",
+    "- DeepSeek-V3.1: Hybrid model supporting both thinking and non-thinking modes, use the `deepseek-v3` parser and `thinking` parameter (NOTE: not `enable_thinking`)\n",
+    "\n",
+    "**Qwen3 Family:**\n",
+    "- Standard Qwen3 (e.g., Qwen3-2507): Use `qwen3` parser, supports `enable_thinking` in chat templates\n",
+    "- Qwen3-Thinking (e.g., Qwen3-235B-A22B-Thinking-2507): Use `qwen3` or `qwen3-thinking` parser, always thinks\n",
+    "\n",
+    "**Kimi:**\n",
+    "- Kimi: Uses special `◁think▷` and `◁/think▷` tags\n",
+    "\n",
+    "**GPT OSS:**\n",
+    "- GPT OSS: Uses special `<|channel|>analysis<|message|>` and `<|end|>` tags"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage\n",
+    "\n",
+    "### Launching the Server"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Specify the `--reasoning-parser` option."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from openai import OpenAI\n",
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that `--reasoning-parser` defines the parser used to interpret responses."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### OpenAI Compatible API\n",
+    "\n",
+    "Using the OpenAI compatible API, the contract follows the [DeepSeek API design](https://api-docs.deepseek.com/guides/reasoning_model) established with the release of DeepSeek-R1:\n",
+    "\n",
+    "- `reasoning_content`: The content of the CoT.\n",
+    "- `content`: The content of the final answer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize OpenAI-like client\n",
+    "client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
+    "model_name = client.models.list().data[0].id\n",
+    "\n",
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"What is 1+3?\",\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Non-Streaming Request"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_non_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0.6,\n",
+    "    top_p=0.95,\n",
+    "    stream=False,  # Non-streaming\n",
+    "    extra_body={\"separate_reasoning\": True},\n",
+    ")\n",
+    "print_highlight(\"==== Reasoning ====\")\n",
+    "print_highlight(response_non_stream.choices[0].message.reasoning_content)\n",
+    "\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(response_non_stream.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Streaming Request"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0.6,\n",
+    "    top_p=0.95,\n",
+    "    stream=True,  # Non-streaming\n",
+    "    extra_body={\"separate_reasoning\": True},\n",
+    ")\n",
+    "\n",
+    "reasoning_content = \"\"\n",
+    "content = \"\"\n",
+    "for chunk in response_stream:\n",
+    "    if chunk.choices[0].delta.content:\n",
+    "        content += chunk.choices[0].delta.content\n",
+    "    if chunk.choices[0].delta.reasoning_content:\n",
+    "        reasoning_content += chunk.choices[0].delta.reasoning_content\n",
+    "\n",
+    "print_highlight(\"==== Reasoning ====\")\n",
+    "print_highlight(reasoning_content)\n",
+    "\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, you can buffer the reasoning content to the last reasoning chunk (or the first chunk after the reasoning content)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0.6,\n",
+    "    top_p=0.95,\n",
+    "    stream=True,  # Non-streaming\n",
+    "    extra_body={\"separate_reasoning\": True, \"stream_reasoning\": False},\n",
+    ")\n",
+    "\n",
+    "reasoning_content = \"\"\n",
+    "content = \"\"\n",
+    "for chunk in response_stream:\n",
+    "    if chunk.choices[0].delta.content:\n",
+    "        content += chunk.choices[0].delta.content\n",
+    "    if chunk.choices[0].delta.reasoning_content:\n",
+    "        reasoning_content += chunk.choices[0].delta.reasoning_content\n",
+    "\n",
+    "print_highlight(\"==== Reasoning ====\")\n",
+    "print_highlight(reasoning_content)\n",
+    "\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The reasoning separation is enable by default when specify . \n",
+    "**To disable it, set the `separate_reasoning` option to `False` in request.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_non_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0.6,\n",
+    "    top_p=0.95,\n",
+    "    stream=False,  # Non-streaming\n",
+    "    extra_body={\"separate_reasoning\": False},\n",
+    ")\n",
+    "\n",
+    "print_highlight(\"==== Original Output ====\")\n",
+    "print_highlight(response_non_stream.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### SGLang Native API "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
+    "input = tokenizer.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True,\n",
+    ")\n",
+    "\n",
+    "gen_url = f\"http://localhost:{port}/generate\"\n",
+    "gen_data = {\n",
+    "    \"text\": input,\n",
+    "    \"sampling_params\": {\n",
+    "        \"skip_special_tokens\": False,\n",
+    "        \"max_new_tokens\": 1024,\n",
+    "        \"temperature\": 0.6,\n",
+    "        \"top_p\": 0.95,\n",
+    "    },\n",
+    "}\n",
+    "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
+    "\n",
+    "print_highlight(\"==== Original Output ====\")\n",
+    "print_highlight(gen_response)\n",
+    "\n",
+    "parse_url = f\"http://localhost:{port}/separate_reasoning\"\n",
+    "separate_reasoning_data = {\n",
+    "    \"text\": gen_response,\n",
+    "    \"reasoning_parser\": \"deepseek-r1\",\n",
+    "}\n",
+    "separate_reasoning_response_json = requests.post(\n",
+    "    parse_url, json=separate_reasoning_data\n",
+    ").json()\n",
+    "print_highlight(\"==== Reasoning ====\")\n",
+    "print_highlight(separate_reasoning_response_json[\"reasoning_text\"])\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(separate_reasoning_response_json[\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Offline Engine API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sglang as sgl\n",
+    "from sglang.srt.parser.reasoning_parser import ReasoningParser\n",
+    "from sglang.utils import print_highlight\n",
+    "\n",
+    "llm = sgl.Engine(model_path=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
+    "input = tokenizer.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True,\n",
+    ")\n",
+    "sampling_params = {\n",
+    "    \"max_new_tokens\": 1024,\n",
+    "    \"skip_special_tokens\": False,\n",
+    "    \"temperature\": 0.6,\n",
+    "    \"top_p\": 0.95,\n",
+    "}\n",
+    "result = llm.generate(prompt=input, sampling_params=sampling_params)\n",
+    "\n",
+    "generated_text = result[\"text\"]  # Assume there is only one prompt\n",
+    "\n",
+    "print_highlight(\"==== Original Output ====\")\n",
+    "print_highlight(generated_text)\n",
+    "\n",
+    "parser = ReasoningParser(\"deepseek-r1\")\n",
+    "reasoning_text, text = parser.parse_non_stream(generated_text)\n",
+    "print_highlight(\"==== Reasoning ====\")\n",
+    "print_highlight(reasoning_text)\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.shutdown()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Supporting New Reasoning Model Schemas\n",
+    "\n",
+    "For future reasoning models, you can implement the reasoning parser as a subclass of `BaseReasoningFormatDetector` in `python/sglang/srt/reasoning_parser.py` and specify the reasoning parser for new reasoning model schemas accordingly."
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md
new file mode 100644
index 000000000..873fa8b05
--- /dev/null
+++ b/docs/advanced_features/server_arguments.md
@@ -0,0 +1,318 @@
+# Server Arguments
+
+This page provides a list of server arguments used in the command line to configure the behavior
+and performance of the language model server during deployment. These arguments enable users to
+customize key aspects of the server, including model selection, parallelism policies,
+memory management, and optimization techniques.
+You can find all arguments by `python3 -m sglang.launch_server --help`
+
+## Common launch commands
+
+- To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
+
+  ```bash
+  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 2
+  ```
+
+- To enable multi-GPU data parallelism, add `--dp 2`. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total. We recommend [SGLang Router](../advanced_features/router.md) for data parallelism.
+
+  ```bash
+  python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --dp 2 --tp 2
+  ```
+
+- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+
+  ```bash
+  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
+  ```
+
+- See [hyperparameter tuning](hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- For docker and Kubernetes runs, you need to set up shared memory which is used for communication between processes. See `--shm-size` for docker and `/dev/shm` size update for Kubernetes manifests.
+- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
+
+  ```bash
+  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
+  ```
+
+- To enable `torch.compile` acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. By default, the cache path is located at `/tmp/torchinductor_root`, you can customize it using environment variable `TORCHINDUCTOR_CACHE_DIR`. For more details, please refer to [PyTorch official documentation](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) and [Enabling cache for torch.compile](https://docs.sglang.ai/backend/hyperparameter_tuning.html#enabling-cache-for-torch-compile).
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports other [quantization strategies (INT8/FP8)](https://github.com/sgl-project/sglang/blob/v0.3.6/python/sglang/srt/server_args.py#L671) as well.
+- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
+- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](../references/custom_chat_template.md).
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
+
+  ```bash
+  # Node 0
+  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 0
+
+  # Node 1
+  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --dist-init-addr sgl-dev-0:50000 --nnodes 2 --node-rank 1
+  ```
+
+Please consult the documentation below and [server_args.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/server_args.py) to learn more about the arguments you may provide when launching a server.
+
+## Model and tokenizer
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--model-path` | The path of the model weights. This can be a local folder or a Hugging Face repo ID. | None |
+| `--tokenizer-path` | The path of the tokenizer. | None |
+| `--tokenizer-mode` | Tokenizer mode. 'auto' will use the fast tokenizer if available, and 'slow' will always use the slow tokenizer. | auto |
+| `--skip-tokenizer-init` | If set, skip init tokenizer and pass input_ids in generate request. | False |
+| `--load-format` | The format of the model weights to load. 'auto' will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. 'pt' will load the weights in the pytorch bin format. 'safetensors' will load the weights in the safetensors format. 'npcache' will load the weights in pytorch format and store a numpy cache to speed up the loading. 'dummy' will initialize the weights with random values, which is mainly for profiling. 'gguf' will load the weights in the gguf format. 'bitsandbytes' will load the weights using bitsandbytes quantization. 'layered' loads weights layer by layer so that one can quantize a layer before loading another to make the peak memory envelope smaller. | auto |
+| `--trust-remote-code` | Whether or not to allow for custom models defined on the Hub in their own modeling files. | False |
+| `--context-length` | The model's maximum context length. Defaults to None (will use the value from the model's config.json instead). | None |
+| `--is-embedding` | Whether to use a CausalLM as an embedding model. | False |
+| `--enable-multimodal` | Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen. | None |
+| `--revision` | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. | None |
+| `--model-impl` | Which implementation of the model to use. 'auto' will try to use the SGLang implementation if it exists and fall back to the Transformers implementation if no SGLang implementation is available. 'sglang' will use the SGLang model implementation. 'transformers' will use the Transformers model implementation. | auto |
+
+## HTTP server
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--host` | The host address for the server. | 127.0.0.1 |
+| `--port` | The port number for the server. | 30000 |
+| `--skip-server-warmup` | If set, skip the server warmup process. | False |
+| `--warmups` | Warmup configurations. | None |
+| `--nccl-port` | The port for NCCL initialization. | None |
+
+## Quantization and data type
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--dtype` | Data type for model weights and activations. 'auto' will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. 'half' for FP16. Recommended for AWQ quantization. 'float16' is the same as 'half'. 'bfloat16' for a balance between precision and range. 'float' is shorthand for FP32 precision. 'float32' for FP32 precision. | auto |
+| `--quantization` | The quantization method. | None |
+| `--quantization-param-path` | Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. | None |
+| `--kv-cache-dtype` | Data type for kv cache storage. 'auto' will use model data type. 'fp8_e5m2' and 'fp8_e4m3' is supported for CUDA 11.8+. | auto |
+
+## Memory and scheduling
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--mem-fraction-static` | The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors. | None |
+| `--max-running-requests` | The maximum number of running requests. | None |
+| `--max-total-tokens` | The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes. | None |
+| `--chunked-prefill-size` | The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill. | None |
+| `--max-prefill-tokens` | The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length. | 16384 |
+| `--schedule-policy` | The scheduling policy of the requests. | fcfs |
+| `--schedule-conservativeness` | How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently. | 1.0 |
+| `--cpu-offload-gb` | How many GBs of RAM to reserve for CPU offloading. | 0 |
+| `--page-size` | The number of tokens in a page. | 1 |
+
+## Runtime options
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--device` | The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified. | None |
+| `--tp-size` | The tensor parallelism size. | 1 |
+| `--pp-size` | The pipeline parallelism size. | 1 |
+| `--max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
+| `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher. | 1 |
+| `--stream-output` | Whether to output as a sequence of disjoint segments. | False |
+| `--random-seed` | The random seed. | None |
+| `--constrained-json-whitespace-pattern` | Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*. | None |
+| `--watchdog-timeout` | Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging. | 300 |
+| `--dist-timeout` | Set timeout for torch.distributed initialization. | None |
+| `--download-dir` | Model download directory for huggingface. | None |
+| `--base-gpu-id` | The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine. | 0 |
+| `--gpu-id-step` | The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,.... | 1 |
+| `--sleep-on-idle` | Reduce CPU usage when sglang is idle. | False |
+
+## Logging
+
+| Arguments                             | Description                                                                                                                                                                                                                                                                                                                                                                                  | Defaults |
+|---------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
+| `--log-level`                         | The logging level of all loggers.                                                                                                                                                                                                                                                                                                                                                            | info     |
+| `--log-level-http`                    | The logging level of HTTP server. If not set, reuse --log-level by default.                                                                                                                                                                                                                                                                                                                  | None     |
+| `--log-requests`                      | Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level.                                                                                                                                                                                                                                                                                             | False    |
+| `--log-requests-level`                | 0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.                                                                                                                                                                                                                 | 0        |
+| `--show-time-cost`                    | Show time cost of custom marks.                                                                                                                                                                                                                                                                                                                                                              | False    |
+| `--enable-metrics`                    | Enable log prometheus metrics.                                                                                                                                                                                                                                                                                                                                                               | False    |
+| `--bucket-time-to-first-token`        | The buckets of time to first token, specified as a list of floats.                                                                                                                                                                                                                                                                                                                           | None     |
+| `--bucket-inter-token-latency`        | The buckets of inter-token latency, specified as a list of floats.                                                                                                                                                                                                                                                                                                                           | None     |
+| `--bucket-e2e-request-latency`        | The buckets of end-to-end request latency, specified as a list of floats.                                                                                                                                                                                                                                                                                                                    | None     |
+| `--collect-tokens-histogram`          | Collect prompt/generation tokens histogram.                                                                                                                                                                                                                                                                                                                                                  | False    |
+| `--kv-events-config`                  | Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.                                                                                                                                                                                                                                                                                | None     |
+| `--decode-log-interval`               | The log interval of decode batch.                                                                                                                                                                                                                                                                                                                                                            | 40       |
+| `--enable-request-time-stats-logging` | Enable per request time stats logging.                                                                                                                                                                                                                                                                                                                                                       | False    |
+| `--prompt-tokens-buckets`             | The buckets rule of prompt tokens. Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> <value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500'). | None     |
+| `--generation-tokens-buckets`         | The buckets rule of prompt tokens. Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> <value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500'). | None     |
+
+## API related
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--api-key` | Set API key of the server. It is also used in the OpenAI API compatible server. | None |
+| `--served-model-name` | Override the model name returned by the v1/models endpoint in OpenAI API server. | None |
+| `--chat-template` | The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server. | None |
+| `--completion-template` | The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently. | None |
+| `--file-storage-path` | The path of the file storage in backend. | sglang_storage |
+| `--enable-cache-report` | Return number of cached tokens in usage.prompt_tokens_details for each openai request. | False |
+| `--reasoning-parser` | Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}. | None |
+| `--tool-call-parser` | Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'. | None |
+
+## Data parallelism
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--dp-size` | The data parallelism size. | 1 |
+| `--load-balance-method` | The load balancing strategy for data parallelism. Options include: 'round_robin', 'minimum_tokens'. The Minimum Token algorithm can only be used when DP attention is applied. This algorithm performs load balancing based on the real-time token load of the DP workers. | round_robin |
+
+## Multi-node distributed serving
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--dist-init-addr` | The host address for initializing distributed backend (e.g., `192.168.0.2:25000`). | None |
+| `--nnodes` | The number of nodes. | 1 |
+| `--node-rank` | The node rank. | 0 |
+
+## Model override args in JSON
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--json-model-override-args` | A dictionary in JSON string format used to override default model configurations. | {} |
+| `--preferred-sampling-params` | json-formatted sampling settings that will be returned in /get_model_info. | None |
+
+## LoRA
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--enable-lora` | Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility. | False |
+| `--max-lora-rank` | The maximum LoRA rank that should be supported. If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of larger LoRA rank after server startup. | None |
+| `--lora-target-modules` | The union set of all target modules where LoRA should be applied (e.g., `q_proj`, `k_proj`, `gate_proj`). If not specified, it will be automatically inferred from the adapters provided in `--lora-paths`. This argument is needed when you expect to dynamically load adapters of different target modules after server startup. You can also set it to `all` to enable LoRA for all supported modules. However, enabling LoRA on additional modules introduces a minor performance overhead. If your application is performance-sensitive, we recommend only specifying the modules for which you plan to load adapters. | None |
+| `--lora-paths` | The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool} | None |
+| `--max-loras-per-batch` | Maximum number of adapters for a running batch, include base-only request. | 8 |
+| `--max-loaded-loras` | If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`. | None |
+| `--lora-backend` | Choose the kernel backend for multi-LoRA serving. | triton |
+
+## Kernel backend
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--attention-backend` | Choose the kernels for attention layers. | None |
+| `--prefill-attention-backend` | (Experimental) This argument specifies the backend for prefill attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--decode-attention-backend` | (Experimental) This argument specifies the backend for decode attention computation. Note that this argument has priority over `attention_backend`. | None |
+| `--sampling-backend` | Choose the kernels for sampling layers. | None |
+| `--grammar-backend` | Choose the backend for grammar-guided decoding. | None |
+| `--mm-attention-backend` | Set multimodal attention backend. | None |
+
+## Speculative decoding
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--speculative-algorithm` | Speculative algorithm. | None |
+| `--speculative-draft-model-path` | The path of the draft model weights. This can be a local folder or a Hugging Face repo ID. | None |
+| `--speculative-num-steps` | The number of steps sampled from draft model in Speculative Decoding. | None |
+| `--speculative-eagle-topk` | The number of tokens sampled from the draft model in eagle2 each step. | None |
+| `--speculative-num-draft-tokens` | The number of tokens sampled from the draft model in Speculative Decoding. | None |
+| `--speculative-accept-threshold-single` | Accept a draft token if its probability in the target model is greater than this threshold. | 1.0 |
+| `--speculative-accept-threshold-acc` | The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc). | 1.0 |
+| `--speculative-token-map` | The path of the draft model's small vocab table. | None |
+| `--speculative-attention-mode` | Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'. | Prefill |
+
+## Expert parallelism
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--ep-size` | The expert parallelism size. | 1 |
+| `--moe-a2a-backend` | Select the backend for all-to-all communication for expert parallelism. | none |
+| `--moe-runner-backend` | Select the runner backend for MoE. | 'triton' |
+| `--deepep-mode` | Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch. | auto |
+| `--ep-num-redundant-experts` | Allocate this number of redundant experts in expert parallel. | 0 |
+| `--ep-dispatch-algorithm` | The algorithm to choose ranks for redundant experts in EPLB. | None |
+| `--init-expert-location` | Initial location of EP experts. | trivial |
+| `--enable-eplb` | Enable EPLB algorithm. | False |
+| `--eplb-algorithm` | Chosen EPLB algorithm. | auto |
+| `--eplb-rebalance-num-iterations` | Number of iterations to automatically trigger a EPLB re-balance. | 1000 |
+| `--eplb-rebalance-layers-per-chunk` | Number of layers to rebalance per forward pass. | None |
+| `--expert-distribution-recorder-mode` | Mode of expert distribution recorder. | None |
+| `--expert-distribution-recorder-buffer-size` | Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer. | None |
+| `--enable-expert-distribution-metrics` | Enable logging metrics for expert balancedness. | False |
+| `--deepep-config` | Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path. | None |
+| `--moe-dense-tp-size` | TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports. | None |
+
+## Hierarchical cache
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--enable-hierarchical-cache` | Enable hierarchical cache. | False |
+| `--hicache-ratio` | The ratio of the size of host KV cache memory pool to the size of device pool. | 2.0 |
+| `--hicache-size` | The size of the hierarchical cache. | 0 |
+| `--hicache-write-policy` | The write policy for hierarchical cache. | write_through |
+| `--hicache-io-backend` | The IO backend for hierarchical cache. |  |
+| `--hicache-storage-backend` | The storage backend for hierarchical cache. | None |
+
+## Optimization/debug options
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--disable-radix-cache` | Disable RadixAttention for prefix caching. | False |
+| `--cuda-graph-max-bs` | Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value. | None |
+| `--cuda-graph-bs` | Set the list of batch sizes for cuda graph. | None |
+| `--disable-cuda-graph` | Disable cuda graph. | False |
+| `--disable-cuda-graph-padding` | Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed. | False |
+| `--enable-profile-cuda-graph` | Enable profiling of cuda graph capture. | False |
+| `--enable-nccl-nvls` | Enable NCCL NVLS for prefill heavy requests when available. | False |
+| `--enable-symm-mem` | Enable NCCL symmetric memory for fast collectives. | False |
+| `--enable-tokenizer-batch-encode` | Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds. | False |
+| `--disable-outlines-disk-cache` | Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency. | False |
+| `--disable-custom-all-reduce` | Disable the custom all-reduce kernel and fall back to NCCL. | False |
+| `--enable-mscclpp` | Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL. | False |
+| `--disable-overlap-schedule` | Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker. | False |
+| `--enable-mixed-chunk` | Enabling mixing prefill and decode in a batch when using chunked prefill. | False |
+| `--enable-dp-attention` | Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported. | False |
+| `--enable-dp-lm-head` | Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention. | False |
+| `--enable-two-batch-overlap` | Enabling two micro batches to overlap. | False |
+| `--tbo-token-distribution-threshold` | The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap. | 0.48 |
+| `--enable-torch-compile` | Optimize the model with torch.compile. Experimental feature. | False |
+| `--torch-compile-max-bs` | Set the maximum batch size when using torch compile. | 32 |
+| `--torchao-config` | Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_row. |  |
+| `--enable-nan-detection` | Enable the NaN detection for debugging purposes. | False |
+| `--enable-p2p-check` | Enable P2P check for GPU access, otherwise the p2p access is allowed by default. | False |
+| `--triton-attention-reduce-in-fp32` | Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. This only affects Triton attention kernels. | False |
+| `--triton-attention-num-kv-splits` | The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8. | 8 |
+| `--num-continuous-decode-steps` | Run multiple continuous decoding steps to reduce scheduling overhead. This can potentially increase throughput but may also increase time-to-first-token latency. The default value is 1, meaning only run one decoding step at a time. | 1 |
+| `--delete-ckpt-after-loading` | Delete the model checkpoint after loading the model. | False |
+| `--enable-memory-saver` | Allow saving memory using release_memory_occupation and resume_memory_occupation. | False |
+| `--allow-auto-truncate` | Allow automatically truncating requests that exceed the maximum input length instead of returning an error. | False |
+| `--enable-custom-logit-processor` | Enable users to pass custom logit processors to the server (disabled by default for security). | False |
+| `--flashinfer-mla-disable-ragged` | Disable ragged processing in Flashinfer MLA. | False |
+| `--disable-shared-experts-fusion` | Disable shared experts fusion. | False |
+| `--disable-chunked-prefix-cache` | Disable chunked prefix cache. | False |
+| `--disable-fast-image-processor` | Disable fast image processor. | False |
+| `--enable-return-hidden-states` | Enable returning hidden states. | False |
+
+## Debug tensor dumps
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--debug-tensor-dump-output-folder` | The output folder for debug tensor dumps. | None |
+| `--debug-tensor-dump-input-file` | The input file for debug tensor dumps. | None |
+| `--debug-tensor-dump-inject` | Enable injection of debug tensor dumps. | False |
+| `--debug-tensor-dump-prefill-only` | Enable prefill-only mode for debug tensor dumps. | False |
+
+## PD disaggregation
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--disaggregation-mode` | PD disaggregation mode: "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only). | null |
+| `--disaggregation-transfer-backend` | The transfer backend for PD disaggregation. | mooncake |
+| `--disaggregation-bootstrap-port` | The bootstrap port for PD disaggregation. | 8998 |
+| `--disaggregation-decode-tp` | The decode TP for PD disaggregation. | None |
+| `--disaggregation-decode-dp` | The decode DP for PD disaggregation. | None |
+| `--disaggregation-prefill-pp` | The prefill PP for PD disaggregation. | 1 |
+
+## Model weight update
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--custom-weight-loader` | Custom weight loader paths. | None |
+| `--weight-loader-disable-mmap` | Disable mmap for weight loader. | False |
+
+## PD-Multiplexing
+
+| Arguments | Description | Defaults |
+|-----------|-------------|----------|
+| `--enable-pdmux` | Enable PD-Multiplexing. | False |
+| `--sm-group-num` | Number of SM groups for PD-Multiplexing. | 3 |
diff --git a/docs/advanced_features/speculative_decoding.ipynb b/docs/advanced_features/speculative_decoding.ipynb
new file mode 100644
index 000000000..aa62b897a
--- /dev/null
+++ b/docs/advanced_features/speculative_decoding.ipynb
@@ -0,0 +1,370 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Speculative Decoding\n",
+    "\n",
+    "SGLang now provides an EAGLE-based (EAGLE-2/EAGLE-3) speculative decoding option. Our implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n",
+    "\n",
+    "### Performance Highlights\n",
+    "\n",
+    "Please see below for the huge improvements on throughput for LLaMA-Instruct 3.1 8B tested on MT bench that can be achieved via EAGLE3 decoding.\n",
+    "For further details please see the [EAGLE3 paper](https://arxiv.org/pdf/2503.01840).\n",
+    "\n",
+    "| Method | Throughput (tokens/s) |\n",
+    "|--------|----------------|\n",
+    "| SGLang (w/o speculative, 1x H100) | 158.34 tokens/s |\n",
+    "| SGLang + EAGLE-2 (1x H100) | 244.10 tokens/s |\n",
+    "| SGLang + EAGLE-3 (1x H100) | 373.25 tokens/s |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## EAGLE Decoding\n",
+    "\n",
+    "To enable EAGLE speculative decoding the following parameters are relevant:\n",
+    "* `speculative_draft_model_path`: Specifies draft model. This parameter is required.\n",
+    "* `speculative_num_steps`: Depth of autoregressive drafting. Increases speculation range but risks rejection cascades. Default is 5.\n",
+    "* `speculative_eagle_topk`: Branching factor per step. Improves candidate diversity, will lead to higher acceptance rate, but more lead to higher memory/compute consumption. Default is 4.\n",
+    "* `speculative_num_draft_tokens`: Maximum parallel verification capacity. Allows deeper tree evaluation but will lead to higher GPU memory usage. Default is 8.\n",
+    "\n",
+    "These parameters are the same for EAGLE-2 and EAGLE-3.\n",
+    "\n",
+    "You can find the best combinations of these parameters with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py).\n",
+    "\n",
+    "In the documentation below, we set `--cuda-graph-max-bs` to be a small value for faster engine startup. For your own workloads, please tune the above parameters together with `--cuda-graph-max-bs`, `--max-running-requests`, `--mem-fraction-static` for the best performance. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EAGLE-2 decoding\n",
+    "\n",
+    "You can enable EAGLE-2 decoding by setting `--speculative-algorithm EAGLE` and choosing an appropriate model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "import openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf  --speculative-algorithm EAGLE \\\n",
+    "    --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 \\\n",
+    "    --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Llama-2-7b-chat-hf\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EAGLE-2 Decoding with `torch.compile`\n",
+    "\n",
+    "You can also enable `torch.compile` for further optimizations and optionally set `--torch-compile-max-bs`:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model meta-llama/Llama-2-7b-chat-hf  --speculative-algorithm EAGLE \\\n",
+    "    --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 5 \\\n",
+    "        --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --mem-fraction 0.6 \\\n",
+    "            --enable-torch-compile --torch-compile-max-bs 2 --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Llama-2-7b-chat-hf\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EAGLE-2 Decoding via Frequency-Ranked Speculative Sampling\n",
+    "\n",
+    "By employing a truncated high-frequency token vocabulary in the draft model, Eagle speculative decoding reduces `lm_head` computational overhead while accelerating the pipeline without quality degradation. For more details, checkout [the paper](https://arxiv.org/pdf/arXiv:2502.14856).\n",
+    "\n",
+    "In our implementation, set `--speculative-token-map` to enable the optimization. You can get the high-frequency token in FR-Spec from [this model](https://huggingface.co/thunlp/LLaMA3-Instruct-8B-FR-Spec). Or you can obtain high-frequency token by directly downloading these token from [this repo](https://github.com/thunlp/FR-Spec/tree/main?tab=readme-ov-file#prepare-fr-spec-vocabulary-subset).\n",
+    "\n",
+    "Thanks for the contribution from [Weilin Zhao](https://github.com/Achazwl) and [Zhousx](https://github.com/Zhou-sx). "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3-8B-Instruct --speculative-algorithm EAGLE \\\n",
+    "    --speculative-draft-model-path lmsys/sglang-EAGLE-LLaMA3-Instruct-8B --speculative-num-steps 5 \\\n",
+    "    --speculative-eagle-topk 8 --speculative-num-draft-tokens 64 --speculative-token-map thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt \\\n",
+    "    --mem-fraction 0.7 --cuda-graph-max-bs 2 --dtype float16  --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EAGLE-3 Decoding\n",
+    "\n",
+    "You can enable EAGLE-3 decoding by setting `--speculative-algorithm EAGLE3` and choosing an appropriate model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct  --speculative-algorithm EAGLE3 \\\n",
+    "    --speculative-draft-model-path jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B --speculative-num-steps 5 \\\n",
+    "        --speculative-eagle-topk 8 --speculative-num-draft-tokens 32 --mem-fraction 0.6 \\\n",
+    "        --cuda-graph-max-bs 2 --dtype float16 --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multi Token Prediction\n",
+    "\n",
+    "We support [MTP(Multi-Token Prediction)](https://arxiv.org/pdf/2404.19737) in SGLang by using speculative decoding. We use Xiaomi/MiMo-7B-RL model as example here (deepseek mtp usage refer to [deepseek doc](../basic_usage/deepseek.md#multi-token-prediction))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "    python3 -m sglang.launch_server --model-path XiaomiMiMo/MiMo-7B-RL --host 0.0.0.0 --trust-remote-code \\\n",
+    "    --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \\\n",
+    "    --mem-fraction 0.5 --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
+    "\n",
+    "data = {\n",
+    "    \"model\": \"XiaomiMiMo/MiMo-7B-RL\",\n",
+    "    \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
+    "}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## References\n",
+    "\n",
+    "EAGLE process is as follows:\n",
+    "\n",
+    "- Within EAGLE the draft model predicts the next feature vector, i.e. the last hidden state of the original LLM, using the feature sequence $(f_1, ..., f_k)$ and the token sequence $(t_2, ..., t_{k+1})$. \n",
+    "- The next token is then sampled from $p_{k+2}=\\text{LMHead}(f_{k+1})$. Afterwards, the two sequences are extended in a tree style—branching out multiple potential continuations, with the branching factor per step controlled by the `speculative_eagle_topk` parameter—to ensure a more coherent connection of context, and are given as input again.\n",
+    "- EAGLE-2 additionally uses the draft model to evaluate how probable certain branches in the draft tree are, dynamically stopping the expansion of unlikely branches. After the expansion phase, reranking is employed to select only the top `speculative_num_draft_tokens` final nodes as draft tokens.\n",
+    "- EAGLE-3 removes the feature prediction objective, incorporates low and mid-layer features, and is trained in an on-policy manner.\n",
+    "\n",
+    "This enhances drafting accuracy by operating on the features instead of tokens for more regular inputs and passing the tokens from the next timestep additionally to minimize randomness effects from sampling. Furthermore the dynamic adjustment of the draft tree and selection of reranked final nodes increases acceptance rate of draft tokens further. For more details see [EAGLE-2](https://arxiv.org/abs/2406.16858) and [EAGLE-3](https://arxiv.org/abs/2503.01840) paper.\n",
+    "\n",
+    "\n",
+    "For guidance how to train your own EAGLE model please see the [EAGLE repo](https://github.com/SafeAILab/EAGLE/tree/main?tab=readme-ov-file#train)."
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/advanced_features/structured_outputs.ipynb b/docs/advanced_features/structured_outputs.ipynb
new file mode 100644
index 000000000..1382f1e0e
--- /dev/null
+++ b/docs/advanced_features/structured_outputs.ipynb
@@ -0,0 +1,853 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Structured Outputs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can specify a JSON schema, [regular expression](https://en.wikipedia.org/wiki/Regular_expression) or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.\n",
+    "\n",
+    "SGLang supports three grammar backends:\n",
+    "\n",
+    "- [XGrammar](https://github.com/mlc-ai/xgrammar)(default): Supports JSON schema, regular expression, and EBNF constraints.\n",
+    "- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.\n",
+    "- [Llguidance](https://github.com/guidance-ai/llguidance): Supports JSON schema, regular expression, and EBNF constraints.\n",
+    "\n",
+    "We suggest using XGrammar for its better performance and utility. XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md). For more details, see [XGrammar technical overview](https://blog.mlc.ai/2024/11/22/achieving-efficient-flexible-portable-structured-generation-with-xgrammar).\n",
+    "\n",
+    "To use Outlines, simply add `--grammar-backend outlines` when launching the server.\n",
+    "To use llguidance, add `--grammar-backend llguidance`  when launching the server.\n",
+    "If no backend is specified, XGrammar will be used as the default.\n",
+    "\n",
+    "For better output quality, **It's advisable to explicitly include instructions in the prompt to guide the model to generate the desired format.** For example, you can specify, 'Please generate the output in the following JSON format: ...'.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## OpenAI Compatible API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "import os\n",
+    "\n",
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### JSON\n",
+    "\n",
+    "you can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using Pydantic**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pydantic import BaseModel, Field\n",
+    "\n",
+    "\n",
+    "# Define the schema using Pydantic\n",
+    "class CapitalInfo(BaseModel):\n",
+    "    name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+    "    population: int = Field(..., description=\"Population of the capital city\")\n",
+    "\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"Please generate the information of the capital of France in the JSON format.\",\n",
+    "        },\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=128,\n",
+    "    response_format={\n",
+    "        \"type\": \"json_schema\",\n",
+    "        \"json_schema\": {\n",
+    "            \"name\": \"foo\",\n",
+    "            # convert the pydantic model to json schema\n",
+    "            \"schema\": CapitalInfo.model_json_schema(),\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "response_content = response.choices[0].message.content\n",
+    "# validate the JSON response by the pydantic model\n",
+    "capital_info = CapitalInfo.model_validate_json(response_content)\n",
+    "print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**JSON Schema Directly**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "json_schema = json.dumps(\n",
+    "    {\n",
+    "        \"type\": \"object\",\n",
+    "        \"properties\": {\n",
+    "            \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+    "            \"population\": {\"type\": \"integer\"},\n",
+    "        },\n",
+    "        \"required\": [\"name\", \"population\"],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"Give me the information of the capital of France in the JSON format.\",\n",
+    "        },\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=128,\n",
+    "    response_format={\n",
+    "        \"type\": \"json_schema\",\n",
+    "        \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EBNF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ebnf_grammar = \"\"\"\n",
+    "root ::= city | description\n",
+    "city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n",
+    "description ::= city \" is \" status\n",
+    "status ::= \"the capital of \" country\n",
+    "country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n",
+    "\"\"\"\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"Give me the information of the capital of France.\",\n",
+    "        },\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=32,\n",
+    "    extra_body={\"ebnf\": ebnf_grammar},\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Regular expression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=128,\n",
+    "    extra_body={\"regex\": \"(Paris|London)\"},\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Structural Tag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tool_get_current_weather = {\n",
+    "    \"type\": \"function\",\n",
+    "    \"function\": {\n",
+    "        \"name\": \"get_current_weather\",\n",
+    "        \"description\": \"Get the current weather in a given location\",\n",
+    "        \"parameters\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"city\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
+    "                },\n",
+    "                \"state\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
+    "                    \" in, e.g. 'CA' which would mean 'California'\",\n",
+    "                },\n",
+    "                \"unit\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"The unit to fetch the temperature in\",\n",
+    "                    \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+    "                },\n",
+    "            },\n",
+    "            \"required\": [\"city\", \"state\", \"unit\"],\n",
+    "        },\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "tool_get_current_date = {\n",
+    "    \"type\": \"function\",\n",
+    "    \"function\": {\n",
+    "        \"name\": \"get_current_date\",\n",
+    "        \"description\": \"Get the current date and time for a given timezone\",\n",
+    "        \"parameters\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"timezone\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"The timezone to fetch the current date and time for, e.g. 'America/New_York'\",\n",
+    "                }\n",
+    "            },\n",
+    "            \"required\": [\"timezone\"],\n",
+    "        },\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "schema_get_current_weather = tool_get_current_weather[\"function\"][\"parameters\"]\n",
+    "schema_get_current_date = tool_get_current_date[\"function\"][\"parameters\"]\n",
+    "\n",
+    "\n",
+    "def get_messages():\n",
+    "    return [\n",
+    "        {\n",
+    "            \"role\": \"system\",\n",
+    "            \"content\": f\"\"\"\n",
+    "# Tool Instructions\n",
+    "- Always execute python code in messages that you share.\n",
+    "- When looking for real time information use relevant functions if available else fallback to brave_search\n",
+    "You have access to the following functions:\n",
+    "Use the function 'get_current_weather' to: Get the current weather in a given location\n",
+    "{tool_get_current_weather[\"function\"]}\n",
+    "Use the function 'get_current_date' to: Get the current date and time for a given timezone\n",
+    "{tool_get_current_date[\"function\"]}\n",
+    "If a you choose to call a function ONLY reply in the following format:\n",
+    "<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}\n",
+    "where\n",
+    "start_tag => `<function`\n",
+    "parameters => a JSON dict with the function argument name as key and function argument value as value.\n",
+    "end_tag => `</function>`\n",
+    "Here is an example,\n",
+    "<function=example_function_name>{{\"example_name\": \"example_value\"}}</function>\n",
+    "Reminder:\n",
+    "- Function calls MUST follow the specified format\n",
+    "- Required parameters MUST be specified\n",
+    "- Only call one function at a time\n",
+    "- Put the entire function call reply on one line\n",
+    "- Always add your sources when using search results to answer the user query\n",
+    "You are a helpful assistant.\"\"\",\n",
+    "        },\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"You are in New York. Please get the current date and time, and the weather.\",\n",
+    "        },\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "messages = get_messages()\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
+    "    messages=messages,\n",
+    "    response_format={\n",
+    "        \"type\": \"structural_tag\",\n",
+    "        \"structures\": [\n",
+    "            {\n",
+    "                \"begin\": \"<function=get_current_weather>\",\n",
+    "                \"schema\": schema_get_current_weather,\n",
+    "                \"end\": \"</function>\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"begin\": \"<function=get_current_date>\",\n",
+    "                \"schema\": schema_get_current_date,\n",
+    "                \"end\": \"</function>\",\n",
+    "            },\n",
+    "        ],\n",
+    "        \"triggers\": [\"<function=\"],\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Native API and SGLang Runtime (SRT)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### JSON"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using Pydantic**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import json\n",
+    "from pydantic import BaseModel, Field\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
+    "\n",
+    "\n",
+    "# Define the schema using Pydantic\n",
+    "class CapitalInfo(BaseModel):\n",
+    "    name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+    "    population: int = Field(..., description=\"Population of the capital city\")\n",
+    "\n",
+    "\n",
+    "# Make API request\n",
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"Here is the information of the capital of France in the JSON format.\\n\",\n",
+    "    }\n",
+    "]\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": text,\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 64,\n",
+    "            \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "print_highlight(response.json())\n",
+    "\n",
+    "\n",
+    "response_data = json.loads(response.json()[\"text\"])\n",
+    "# validate the response by the pydantic model\n",
+    "capital_info = CapitalInfo.model_validate(response_data)\n",
+    "print_highlight(f\"Validated response: {capital_info.model_dump_json()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**JSON Schema Directly**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json_schema = json.dumps(\n",
+    "    {\n",
+    "        \"type\": \"object\",\n",
+    "        \"properties\": {\n",
+    "            \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+    "            \"population\": {\"type\": \"integer\"},\n",
+    "        },\n",
+    "        \"required\": [\"name\", \"population\"],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "# JSON\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": text,\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 64,\n",
+    "            \"json_schema\": json_schema,\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EBNF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"Give me the information of the capital of France.\",\n",
+    "    }\n",
+    "]\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": text,\n",
+    "        \"sampling_params\": {\n",
+    "            \"max_new_tokens\": 128,\n",
+    "            \"temperature\": 0,\n",
+    "            \"n\": 3,\n",
+    "            \"ebnf\": (\n",
+    "                \"root ::= city | description\\n\"\n",
+    "                'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
+    "                'description ::= city \" is \" status\\n'\n",
+    "                'status ::= \"the capital of \" country\\n'\n",
+    "                'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
+    "            ),\n",
+    "        },\n",
+    "        \"stream\": False,\n",
+    "        \"return_logprob\": False,\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Regular expression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"Paris is the capital of\",\n",
+    "    }\n",
+    "]\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": text,\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 64,\n",
+    "            \"regex\": \"(France|England)\",\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Structural Tag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "# generate an answer\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
+    "\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "payload = {\n",
+    "    \"text\": text,\n",
+    "    \"sampling_params\": {\n",
+    "        \"structural_tag\": json.dumps(\n",
+    "            {\n",
+    "                \"type\": \"structural_tag\",\n",
+    "                \"structures\": [\n",
+    "                    {\n",
+    "                        \"begin\": \"<function=get_current_weather>\",\n",
+    "                        \"schema\": schema_get_current_weather,\n",
+    "                        \"end\": \"</function>\",\n",
+    "                    },\n",
+    "                    {\n",
+    "                        \"begin\": \"<function=get_current_date>\",\n",
+    "                        \"schema\": schema_get_current_date,\n",
+    "                        \"end\": \"</function>\",\n",
+    "                    },\n",
+    "                ],\n",
+    "                \"triggers\": [\"<function=\"],\n",
+    "            }\n",
+    "        )\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# Send POST request to the API endpoint\n",
+    "response = requests.post(f\"http://localhost:{port}/generate\", json=payload)\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Offline Engine API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sglang as sgl\n",
+    "\n",
+    "llm = sgl.Engine(\n",
+    "    model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", grammar_backend=\"xgrammar\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### JSON"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using Pydantic**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pydantic import BaseModel, Field\n",
+    "\n",
+    "\n",
+    "prompts = [\n",
+    "    \"Give me the information of the capital of China in the JSON format.\",\n",
+    "    \"Give me the information of the capital of France in the JSON format.\",\n",
+    "    \"Give me the information of the capital of Ireland in the JSON format.\",\n",
+    "]\n",
+    "\n",
+    "\n",
+    "# Define the schema using Pydantic\n",
+    "class CapitalInfo(BaseModel):\n",
+    "    name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+    "    population: int = Field(..., description=\"Population of the capital city\")\n",
+    "\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0.1,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
+    "}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print_highlight(\"===============================\")\n",
+    "    print_highlight(f\"Prompt: {prompt}\")  # validate the output by the pydantic model\n",
+    "    capital_info = CapitalInfo.model_validate_json(output[\"text\"])\n",
+    "    print_highlight(f\"Validated output: {capital_info.model_dump_json()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**JSON Schema Directly**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Give me the information of the capital of China in the JSON format.\",\n",
+    "    \"Give me the information of the capital of France in the JSON format.\",\n",
+    "    \"Give me the information of the capital of Ireland in the JSON format.\",\n",
+    "]\n",
+    "\n",
+    "json_schema = json.dumps(\n",
+    "    {\n",
+    "        \"type\": \"object\",\n",
+    "        \"properties\": {\n",
+    "            \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+    "            \"population\": {\"type\": \"integer\"},\n",
+    "        },\n",
+    "        \"required\": [\"name\", \"population\"],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "sampling_params = {\"temperature\": 0.1, \"top_p\": 0.95, \"json_schema\": json_schema}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print_highlight(\"===============================\")\n",
+    "    print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EBNF\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Give me the information of the capital of France.\",\n",
+    "    \"Give me the information of the capital of Germany.\",\n",
+    "    \"Give me the information of the capital of Italy.\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0.8,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"ebnf\": (\n",
+    "        \"root ::= city | description\\n\"\n",
+    "        'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
+    "        'description ::= city \" is \" status\\n'\n",
+    "        'status ::= \"the capital of \" country\\n'\n",
+    "        'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
+    "    ),\n",
+    "}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print_highlight(\"===============================\")\n",
+    "    print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Regular expression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Please provide information about London as a major global city:\",\n",
+    "    \"Please provide information about Paris as a major global city:\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"regex\": \"(France|England)\"}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print_highlight(\"===============================\")\n",
+    "    print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Structural Tag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "prompts = [text]\n",
+    "\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0.8,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"structural_tag\": json.dumps(\n",
+    "        {\n",
+    "            \"type\": \"structural_tag\",\n",
+    "            \"structures\": [\n",
+    "                {\n",
+    "                    \"begin\": \"<function=get_current_weather>\",\n",
+    "                    \"schema\": schema_get_current_weather,\n",
+    "                    \"end\": \"</function>\",\n",
+    "                },\n",
+    "                {\n",
+    "                    \"begin\": \"<function=get_current_date>\",\n",
+    "                    \"schema\": schema_get_current_date,\n",
+    "                    \"end\": \"</function>\",\n",
+    "                },\n",
+    "            ],\n",
+    "            \"triggers\": [\"<function=\"],\n",
+    "        }\n",
+    "    ),\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# Send POST request to the API endpoint\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print_highlight(\"===============================\")\n",
+    "    print_highlight(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.shutdown()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb
new file mode 100644
index 000000000..c8f51a98a
--- /dev/null
+++ b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb
@@ -0,0 +1,830 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Structured Outputs For Reasoning Models\n",
+    "\n",
+    "When working with reasoning models that use special tokens like `<think>...</think>` to denote reasoning sections, you might want to allow free-form text within these sections while still enforcing grammar constraints on the rest of the output.\n",
+    "\n",
+    "SGLang provides a feature to disable grammar restrictions within reasoning sections. This is particularly useful for models that need to perform complex reasoning steps before providing a structured output.\n",
+    "\n",
+    "To enable this feature, use the `--reasoning-parser` flag which decide the think_end_token, such as `</think>`, when launching the server. You can also specify the reasoning parser using the `--reasoning-parser` flag.\n",
+    "\n",
+    "## Supported Models\n",
+    "\n",
+    "Currently, SGLang supports the following reasoning models:\n",
+    "- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.\n",
+    "- [QwQ](https://huggingface.co/Qwen/QwQ-32B): The reasoning content is wrapped with `<think>` and `</think>` tags.\n",
+    "\n",
+    "\n",
+    "## Usage\n",
+    "\n",
+    "## OpenAI Compatible API"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Specify the `--grammar-backend`, `--reasoning-parser` option."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "import os\n",
+    "\n",
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --host 0.0.0.0 --reasoning-parser deepseek-r1 --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### JSON\n",
+    "\n",
+    "you can directly define a JSON schema or use [Pydantic](https://docs.pydantic.dev/latest/) to define and validate the response."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using Pydantic**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pydantic import BaseModel, Field\n",
+    "\n",
+    "\n",
+    "# Define the schema using Pydantic\n",
+    "class CapitalInfo(BaseModel):\n",
+    "    name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+    "    population: int = Field(..., description=\"Population of the capital city\")\n",
+    "\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"assistant\",\n",
+    "            \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
+    "        },\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=2048,\n",
+    "    response_format={\n",
+    "        \"type\": \"json_schema\",\n",
+    "        \"json_schema\": {\n",
+    "            \"name\": \"foo\",\n",
+    "            # convert the pydantic model to json schema\n",
+    "            \"schema\": CapitalInfo.model_json_schema(),\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(\n",
+    "    f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**JSON Schema Directly**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "json_schema = json.dumps(\n",
+    "    {\n",
+    "        \"type\": \"object\",\n",
+    "        \"properties\": {\n",
+    "            \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+    "            \"population\": {\"type\": \"integer\"},\n",
+    "        },\n",
+    "        \"required\": [\"name\", \"population\"],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"assistant\",\n",
+    "            \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
+    "        },\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=2048,\n",
+    "    response_format={\n",
+    "        \"type\": \"json_schema\",\n",
+    "        \"json_schema\": {\"name\": \"foo\", \"schema\": json.loads(json_schema)},\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(\n",
+    "    f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EBNF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ebnf_grammar = \"\"\"\n",
+    "root ::= city | description\n",
+    "city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\n",
+    "description ::= city \" is \" status\n",
+    "status ::= \"the capital of \" country\n",
+    "country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"\n",
+    "\"\"\"\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"system\", \"content\": \"You are a helpful geography bot.\"},\n",
+    "        {\n",
+    "            \"role\": \"assistant\",\n",
+    "            \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
+    "        },\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=2048,\n",
+    "    extra_body={\"ebnf\": ebnf_grammar},\n",
+    ")\n",
+    "\n",
+    "print_highlight(\n",
+    "    f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Regular expression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"assistant\", \"content\": \"What is the capital of France?\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=2048,\n",
+    "    extra_body={\"regex\": \"(Paris|London)\"},\n",
+    ")\n",
+    "\n",
+    "print_highlight(\n",
+    "    f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Structural Tag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tool_get_current_weather = {\n",
+    "    \"type\": \"function\",\n",
+    "    \"function\": {\n",
+    "        \"name\": \"get_current_weather\",\n",
+    "        \"description\": \"Get the current weather in a given location\",\n",
+    "        \"parameters\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"city\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
+    "                },\n",
+    "                \"state\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
+    "                    \" in, e.g. 'CA' which would mean 'California'\",\n",
+    "                },\n",
+    "                \"unit\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"The unit to fetch the temperature in\",\n",
+    "                    \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+    "                },\n",
+    "            },\n",
+    "            \"required\": [\"city\", \"state\", \"unit\"],\n",
+    "        },\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "tool_get_current_date = {\n",
+    "    \"type\": \"function\",\n",
+    "    \"function\": {\n",
+    "        \"name\": \"get_current_date\",\n",
+    "        \"description\": \"Get the current date and time for a given timezone\",\n",
+    "        \"parameters\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"timezone\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"The timezone to fetch the current date and time for, e.g. 'America/New_York'\",\n",
+    "                }\n",
+    "            },\n",
+    "            \"required\": [\"timezone\"],\n",
+    "        },\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "schema_get_current_weather = tool_get_current_weather[\"function\"][\"parameters\"]\n",
+    "schema_get_current_date = tool_get_current_date[\"function\"][\"parameters\"]\n",
+    "\n",
+    "\n",
+    "def get_messages():\n",
+    "    return [\n",
+    "        {\n",
+    "            \"role\": \"system\",\n",
+    "            \"content\": f\"\"\"\n",
+    "# Tool Instructions\n",
+    "- Always execute python code in messages that you share.\n",
+    "- When looking for real time information use relevant functions if available else fallback to brave_search\n",
+    "You have access to the following functions:\n",
+    "Use the function 'get_current_weather' to: Get the current weather in a given location\n",
+    "{tool_get_current_weather[\"function\"]}\n",
+    "Use the function 'get_current_date' to: Get the current date and time for a given timezone\n",
+    "{tool_get_current_date[\"function\"]}\n",
+    "If a you choose to call a function ONLY reply in the following format:\n",
+    "<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}\n",
+    "where\n",
+    "start_tag => `<function`\n",
+    "parameters => a JSON dict with the function argument name as key and function argument value as value.\n",
+    "end_tag => `</function>`\n",
+    "Here is an example,\n",
+    "<function=example_function_name>{{\"example_name\": \"example_value\"}}</function>\n",
+    "Reminder:\n",
+    "- Function calls MUST follow the specified format\n",
+    "- Required parameters MUST be specified\n",
+    "- Only call one function at a time\n",
+    "- Put the entire function call reply on one line\n",
+    "- Always add your sources when using search results to answer the user query\n",
+    "You are a helpful assistant.\"\"\",\n",
+    "        },\n",
+    "        {\n",
+    "            \"role\": \"assistant\",\n",
+    "            \"content\": \"You are in New York. Please get the current date and time, and the weather.\",\n",
+    "        },\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "messages = get_messages()\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
+    "    messages=messages,\n",
+    "    response_format={\n",
+    "        \"type\": \"structural_tag\",\n",
+    "        \"max_new_tokens\": 2048,\n",
+    "        \"structures\": [\n",
+    "            {\n",
+    "                \"begin\": \"<function=get_current_weather>\",\n",
+    "                \"schema\": schema_get_current_weather,\n",
+    "                \"end\": \"</function>\",\n",
+    "            },\n",
+    "            {\n",
+    "                \"begin\": \"<function=get_current_date>\",\n",
+    "                \"schema\": schema_get_current_date,\n",
+    "                \"end\": \"</function>\",\n",
+    "            },\n",
+    "        ],\n",
+    "        \"triggers\": [\"<function=\"],\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(\n",
+    "    f\"reasoing_content: {response.choices[0].message.reasoning_content}\\n\\ncontent: {response.choices[0].message.content}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Native API and SGLang Runtime (SRT)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### JSON"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using Pydantic**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from pydantic import BaseModel, Field\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
+    "\n",
+    "\n",
+    "# Define the schema using Pydantic\n",
+    "class CapitalInfo(BaseModel):\n",
+    "    name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+    "    population: int = Field(..., description=\"Population of the capital city\")\n",
+    "\n",
+    "\n",
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"assistant\",\n",
+    "        \"content\": \"Give me the information and population of the capital of France in the JSON format.\",\n",
+    "    },\n",
+    "]\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "# Make API request\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": text,\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 2048,\n",
+    "            \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "print(response.json())\n",
+    "\n",
+    "\n",
+    "reasoing_content = response.json()[\"text\"].split(\"</think>\")[0]\n",
+    "content = response.json()[\"text\"].split(\"</think>\")[1]\n",
+    "print_highlight(f\"reasoing_content: {reasoing_content}\\n\\ncontent: {content}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**JSON Schema Directly**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json_schema = json.dumps(\n",
+    "    {\n",
+    "        \"type\": \"object\",\n",
+    "        \"properties\": {\n",
+    "            \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+    "            \"population\": {\"type\": \"integer\"},\n",
+    "        },\n",
+    "        \"required\": [\"name\", \"population\"],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "# JSON\n",
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": text,\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 2048,\n",
+    "            \"json_schema\": json_schema,\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EBNF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": \"Give me the information of the capital of France.\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"max_new_tokens\": 2048,\n",
+    "            \"temperature\": 0,\n",
+    "            \"n\": 3,\n",
+    "            \"ebnf\": (\n",
+    "                \"root ::= city | description\\n\"\n",
+    "                'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
+    "                'description ::= city \" is \" status\\n'\n",
+    "                'status ::= \"the capital of \" country\\n'\n",
+    "                'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
+    "            ),\n",
+    "        },\n",
+    "        \"stream\": False,\n",
+    "        \"return_logprob\": False,\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Regular expression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": \"Paris is the capital of\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 2048,\n",
+    "            \"regex\": \"(France|England)\",\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "print(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Structural Tag"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "payload = {\n",
+    "    \"text\": text,\n",
+    "    \"sampling_params\": {\n",
+    "        \"max_new_tokens\": 2048,\n",
+    "        \"structural_tag\": json.dumps(\n",
+    "            {\n",
+    "                \"type\": \"structural_tag\",\n",
+    "                \"structures\": [\n",
+    "                    {\n",
+    "                        \"begin\": \"<function=get_current_weather>\",\n",
+    "                        \"schema\": schema_get_current_weather,\n",
+    "                        \"end\": \"</function>\",\n",
+    "                    },\n",
+    "                    {\n",
+    "                        \"begin\": \"<function=get_current_date>\",\n",
+    "                        \"schema\": schema_get_current_date,\n",
+    "                        \"end\": \"</function>\",\n",
+    "                    },\n",
+    "                ],\n",
+    "                \"triggers\": [\"<function=\"],\n",
+    "            }\n",
+    "        ),\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# Send POST request to the API endpoint\n",
+    "response = requests.post(f\"http://localhost:{port}/generate\", json=payload)\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Offline Engine API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sglang as sgl\n",
+    "\n",
+    "llm = sgl.Engine(\n",
+    "    model_path=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\",\n",
+    "    reasoning_parser=\"deepseek-r1\",\n",
+    "    grammar_backend=\"xgrammar\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### JSON"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Using Pydantic**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pydantic import BaseModel, Field\n",
+    "\n",
+    "\n",
+    "prompts = [\n",
+    "    \"Give me the information of the capital of China in the JSON format.\",\n",
+    "    \"Give me the information of the capital of France in the JSON format.\",\n",
+    "    \"Give me the information of the capital of Ireland in the JSON format.\",\n",
+    "]\n",
+    "\n",
+    "\n",
+    "# Define the schema using Pydantic\n",
+    "class CapitalInfo(BaseModel):\n",
+    "    name: str = Field(..., pattern=r\"^\\w+$\", description=\"Name of the capital city\")\n",
+    "    population: int = Field(..., description=\"Population of the capital city\")\n",
+    "\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"max_new_tokens\": 2048,\n",
+    "    \"json_schema\": json.dumps(CapitalInfo.model_json_schema()),\n",
+    "}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print(\"===============================\")\n",
+    "    print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**JSON Schema Directly**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Give me the information of the capital of China in the JSON format.\",\n",
+    "    \"Give me the information of the capital of France in the JSON format.\",\n",
+    "    \"Give me the information of the capital of Ireland in the JSON format.\",\n",
+    "]\n",
+    "\n",
+    "json_schema = json.dumps(\n",
+    "    {\n",
+    "        \"type\": \"object\",\n",
+    "        \"properties\": {\n",
+    "            \"name\": {\"type\": \"string\", \"pattern\": \"^[\\\\w]+$\"},\n",
+    "            \"population\": {\"type\": \"integer\"},\n",
+    "        },\n",
+    "        \"required\": [\"name\", \"population\"],\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "sampling_params = {\"temperature\": 0, \"max_new_tokens\": 2048, \"json_schema\": json_schema}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print(\"===============================\")\n",
+    "    print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### EBNF\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Give me the information of the capital of France.\",\n",
+    "    \"Give me the information of the capital of Germany.\",\n",
+    "    \"Give me the information of the capital of Italy.\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0.8,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"ebnf\": (\n",
+    "        \"root ::= city | description\\n\"\n",
+    "        'city ::= \"London\" | \"Paris\" | \"Berlin\" | \"Rome\"\\n'\n",
+    "        'description ::= city \" is \" status\\n'\n",
+    "        'status ::= \"the capital of \" country\\n'\n",
+    "        'country ::= \"England\" | \"France\" | \"Germany\" | \"Italy\"'\n",
+    "    ),\n",
+    "}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print(\"===============================\")\n",
+    "    print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Regular expression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Please provide information about London as a major global city:\",\n",
+    "    \"Please provide information about Paris as a major global city:\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"regex\": \"(France|England)\"}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print(\"===============================\")\n",
+    "    print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "prompts = [text]\n",
+    "\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0.8,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"max_new_tokens\": 2048,\n",
+    "    \"structural_tag\": json.dumps(\n",
+    "        {\n",
+    "            \"type\": \"structural_tag\",\n",
+    "            \"structures\": [\n",
+    "                {\n",
+    "                    \"begin\": \"<function=get_current_weather>\",\n",
+    "                    \"schema\": schema_get_current_weather,\n",
+    "                    \"end\": \"</function>\",\n",
+    "                },\n",
+    "                {\n",
+    "                    \"begin\": \"<function=get_current_date>\",\n",
+    "                    \"schema\": schema_get_current_date,\n",
+    "                    \"end\": \"</function>\",\n",
+    "                },\n",
+    "            ],\n",
+    "            \"triggers\": [\"<function=\"],\n",
+    "        }\n",
+    "    ),\n",
+    "}\n",
+    "\n",
+    "\n",
+    "# Send POST request to the API endpoint\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print(\"===============================\")\n",
+    "    print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.shutdown()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/advanced_features/tool_parser.ipynb b/docs/advanced_features/tool_parser.ipynb
new file mode 100644
index 000000000..fd88b6799
--- /dev/null
+++ b/docs/advanced_features/tool_parser.ipynb
@@ -0,0 +1,852 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Tool Parser\n",
+    "\n",
+    "This guide demonstrates how to use SGLang’s [Function calling](https://platform.openai.com/docs/guides/function-calling) functionality."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Currently supported parsers:\n",
+    "\n",
+    "| Parser | Supported Models | Notes |\n",
+    "|---|---|---|\n",
+    "| `llama3` | Llama 3.1 / 3.2 / 3.3 (e.g. `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct`, `meta-llama/Llama-3.3-70B-Instruct`) | |\n",
+    "| `llama4` | Llama 4 (e.g. `meta-llama/Llama-4-Scout-17B-16E-Instruct`) | |\n",
+    "| `mistral` | Mistral (e.g. `mistralai/Mistral-7B-Instruct-v0.3`, `mistralai/Mistral-Nemo-Instruct-2407`, `mistralai/Mistral-7B-v0.3`) | |\n",
+    "| `qwen25` | Qwen 2.5 (e.g. `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-7B-Instruct`) and QwQ (i.e. `Qwen/QwQ-32B`) | For QwQ, reasoning parser can be enabled together with tool call parser. See [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html). |\n",
+    "| `deepseekv3` | DeepSeek-v3 (e.g., `deepseek-ai/DeepSeek-V3-0324`) | |\n",
+    "| `gpt-oss` | GPT-OSS (e.g., `openai/gpt-oss-120b`, `openai/gpt-oss-20b`, `lmsys/gpt-oss-120b-bf16`, `lmsys/gpt-oss-20b-bf16`) | The gpt-oss tool parser filters out analysis channel events and only preserves normal text. This can cause the content to be empty when explanations are in the analysis channel. To work around this, complete the tool round by returning tool results as `role=\"tool\"` messages, which enables the model to generate the final content. |\n",
+    "| `kimi_k2` | `moonshotai/Kimi-K2-Instruct` | |\n",
+    "| `pythonic` | Llama-3.2 / Llama-3.3 / Llama-4 | Model outputs function calls as Python code. Requires `--tool-call-parser pythonic` and is recommended to use with a specific chat template. |\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## OpenAI Compatible API"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Launching the Server"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0 --log-level warning\"  # qwen25\n",
+    ")\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that `--tool-call-parser` defines the parser used to interpret responses."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define Tools for Function Call\n",
+    "Below is a Python snippet that shows how to define a tool as a dictionary. The dictionary includes a tool name, a description, and property defined Parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define tools\n",
+    "tools = [\n",
+    "    {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"get_current_weather\",\n",
+    "            \"description\": \"Get the current weather in a given location\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"city\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
+    "                    },\n",
+    "                    \"state\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"the two-letter abbreviation for the state that the city is\"\n",
+    "                        \" in, e.g. 'CA' which would mean 'California'\",\n",
+    "                    },\n",
+    "                    \"unit\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The unit to fetch the temperature in\",\n",
+    "                        \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+    "                    },\n",
+    "                },\n",
+    "                \"required\": [\"city\", \"state\", \"unit\"],\n",
+    "            },\n",
+    "        },\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define Messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_messages():\n",
+    "    return [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": \"What's the weather like in Boston today? Output a reasoning before act, then use the tools to help you.\",\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "messages = get_messages()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initialize the Client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize OpenAI-like client\n",
+    "client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
+    "model_name = client.models.list().data[0].id"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "###  Non-Streaming Request"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Non-streaming mode test\n",
+    "response_non_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0,\n",
+    "    top_p=0.95,\n",
+    "    max_tokens=1024,\n",
+    "    stream=False,  # Non-streaming\n",
+    "    tools=tools,\n",
+    ")\n",
+    "print_highlight(\"Non-stream response:\")\n",
+    "print_highlight(response_non_stream)\n",
+    "print_highlight(\"==== content ====\")\n",
+    "print_highlight(response_non_stream.choices[0].message.content)\n",
+    "print_highlight(\"==== tool_calls ====\")\n",
+    "print_highlight(response_non_stream.choices[0].message.tool_calls)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Handle Tools\n",
+    "When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "name_non_stream = response_non_stream.choices[0].message.tool_calls[0].function.name\n",
+    "arguments_non_stream = (\n",
+    "    response_non_stream.choices[0].message.tool_calls[0].function.arguments\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Final streamed function call name: {name_non_stream}\")\n",
+    "print_highlight(f\"Final streamed function call arguments: {arguments_non_stream}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Streaming Request"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Streaming mode test\n",
+    "print_highlight(\"Streaming response:\")\n",
+    "response_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0,\n",
+    "    top_p=0.95,\n",
+    "    max_tokens=1024,\n",
+    "    stream=True,  # Enable streaming\n",
+    "    tools=tools,\n",
+    ")\n",
+    "\n",
+    "texts = \"\"\n",
+    "tool_calls = []\n",
+    "name = \"\"\n",
+    "arguments = \"\"\n",
+    "for chunk in response_stream:\n",
+    "    if chunk.choices[0].delta.content:\n",
+    "        texts += chunk.choices[0].delta.content\n",
+    "    if chunk.choices[0].delta.tool_calls:\n",
+    "        tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(texts)\n",
+    "\n",
+    "print_highlight(\"==== Tool Call ====\")\n",
+    "for tool_call in tool_calls:\n",
+    "    print_highlight(tool_call)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Handle Tools\n",
+    "When the engine determines it should call a particular tool, it will return arguments or partial arguments through the response. You can parse these arguments and later invoke the tool accordingly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse and combine function call arguments\n",
+    "arguments = []\n",
+    "for tool_call in tool_calls:\n",
+    "    if tool_call.function.name:\n",
+    "        print_highlight(f\"Streamed function call name: {tool_call.function.name}\")\n",
+    "\n",
+    "    if tool_call.function.arguments:\n",
+    "        arguments.append(tool_call.function.arguments)\n",
+    "\n",
+    "# Combine all fragments into a single JSON string\n",
+    "full_arguments = \"\".join(arguments)\n",
+    "print_highlight(f\"streamed function call arguments: {full_arguments}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Define a Tool Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This is a demonstration, define real function according to your usage.\n",
+    "def get_current_weather(city: str, state: str, unit: \"str\"):\n",
+    "    return (\n",
+    "        f\"The weather in {city}, {state} is 85 degrees {unit}. It is \"\n",
+    "        \"partly cloudly, with highs in the 90's.\"\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "available_tools = {\"get_current_weather\": get_current_weather}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### Execute the Tool"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages.append(response_non_stream.choices[0].message)\n",
+    "\n",
+    "# Call the corresponding tool function\n",
+    "tool_call = messages[-1].tool_calls[0]\n",
+    "tool_name = tool_call.function.name\n",
+    "tool_to_call = available_tools[tool_name]\n",
+    "result = tool_to_call(**(json.loads(tool_call.function.arguments)))\n",
+    "print_highlight(f\"Function call result: {result}\")\n",
+    "# messages.append({\"role\": \"tool\", \"content\": result, \"name\": tool_name})\n",
+    "messages.append(\n",
+    "    {\n",
+    "        \"role\": \"tool\",\n",
+    "        \"tool_call_id\": tool_call.id,\n",
+    "        \"content\": str(result),\n",
+    "        \"name\": tool_name,\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Updated message history: {messages}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Send Results Back to Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_response = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0,\n",
+    "    top_p=0.95,\n",
+    "    stream=False,\n",
+    "    tools=tools,\n",
+    ")\n",
+    "print_highlight(\"Non-stream response:\")\n",
+    "print_highlight(final_response)\n",
+    "\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(final_response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Native API and SGLang Runtime (SRT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "import requests\n",
+    "\n",
+    "# generate an answer\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-7B-Instruct\")\n",
+    "\n",
+    "messages = get_messages()\n",
+    "\n",
+    "input = tokenizer.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=False,\n",
+    "    add_generation_prompt=True,\n",
+    "    tools=tools,\n",
+    ")\n",
+    "\n",
+    "gen_url = f\"http://localhost:{port}/generate\"\n",
+    "gen_data = {\n",
+    "    \"text\": input,\n",
+    "    \"sampling_params\": {\n",
+    "        \"skip_special_tokens\": False,\n",
+    "        \"max_new_tokens\": 1024,\n",
+    "        \"temperature\": 0,\n",
+    "        \"top_p\": 0.95,\n",
+    "    },\n",
+    "}\n",
+    "gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
+    "print_highlight(\"==== Response ====\")\n",
+    "print_highlight(gen_response)\n",
+    "\n",
+    "# parse the response\n",
+    "parse_url = f\"http://localhost:{port}/parse_function_call\"\n",
+    "\n",
+    "function_call_input = {\n",
+    "    \"text\": gen_response,\n",
+    "    \"tool_call_parser\": \"qwen25\",\n",
+    "    \"tools\": tools,\n",
+    "}\n",
+    "\n",
+    "function_call_response = requests.post(parse_url, json=function_call_input)\n",
+    "function_call_response_json = function_call_response.json()\n",
+    "\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print(function_call_response_json[\"normal_text\"])\n",
+    "print_highlight(\"==== Calls ====\")\n",
+    "print(\"function name: \", function_call_response_json[\"calls\"][0][\"name\"])\n",
+    "print(\"function arguments: \", function_call_response_json[\"calls\"][0][\"parameters\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Offline Engine API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sglang as sgl\n",
+    "from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
+    "from sglang.srt.managers.io_struct import Tool, Function\n",
+    "\n",
+    "llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",
+    "tokenizer = llm.tokenizer_manager.tokenizer\n",
+    "input_ids = tokenizer.apply_chat_template(\n",
+    "    messages, tokenize=True, add_generation_prompt=True, tools=tools\n",
+    ")\n",
+    "\n",
+    "# Note that for gpt-oss tool parser, adding \"no_stop_trim\": True\n",
+    "# to make sure the tool call token <call> is not trimmed.\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"max_new_tokens\": 1024,\n",
+    "    \"temperature\": 0,\n",
+    "    \"top_p\": 0.95,\n",
+    "    \"skip_special_tokens\": False,\n",
+    "}\n",
+    "\n",
+    "# 1) Offline generation\n",
+    "result = llm.generate(input_ids=input_ids, sampling_params=sampling_params)\n",
+    "generated_text = result[\"text\"]  # Assume there is only one prompt\n",
+    "\n",
+    "print_highlight(\"=== Offline Engine Output Text ===\")\n",
+    "print_highlight(generated_text)\n",
+    "\n",
+    "\n",
+    "# 2) Parse using FunctionCallParser\n",
+    "def convert_dict_to_tool(tool_dict: dict) -> Tool:\n",
+    "    function_dict = tool_dict.get(\"function\", {})\n",
+    "    return Tool(\n",
+    "        type=tool_dict.get(\"type\", \"function\"),\n",
+    "        function=Function(\n",
+    "            name=function_dict.get(\"name\"),\n",
+    "            description=function_dict.get(\"description\"),\n",
+    "            parameters=function_dict.get(\"parameters\"),\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "tools = [convert_dict_to_tool(raw_tool) for raw_tool in tools]\n",
+    "\n",
+    "parser = FunctionCallParser(tools=tools, tool_call_parser=\"qwen25\")\n",
+    "normal_text, calls = parser.parse_non_stream(generated_text)\n",
+    "\n",
+    "print_highlight(\"=== Parsing Result ===\")\n",
+    "print(\"Normal text portion:\", normal_text)\n",
+    "print_highlight(\"Function call portion:\")\n",
+    "for call in calls:\n",
+    "    # call: ToolCallItem\n",
+    "    print_highlight(f\"  - tool name: {call.name}\")\n",
+    "    print_highlight(f\"    parameters: {call.parameters}\")\n",
+    "\n",
+    "# 3) If needed, perform additional logic on the parsed functions, such as automatically calling the corresponding function to obtain a return value, etc."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.shutdown()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tool Choice Mode\n",
+    "\n",
+    "SGLang supports OpenAI's `tool_choice` parameter to control when and which tools the model should call. This feature is implemented using EBNF (Extended Backus-Naur Form) grammar to ensure reliable tool calling behavior.\n",
+    "\n",
+    "### Supported Tool Choice Options\n",
+    "\n",
+    "- **`tool_choice=\"required\"`**: Forces the model to call at least one tool\n",
+    "- **`tool_choice={\"type\": \"function\", \"function\": {\"name\": \"specific_function\"}}`**: Forces the model to call a specific function\n",
+    "\n",
+    "### Backend Compatibility\n",
+    "\n",
+    "Tool choice is fully supported with the **Xgrammar backend**, which is the default grammar backend (`--grammar-backend xgrammar`). However, it may not be fully supported with other backends such as `outlines`.\n",
+    "\n",
+    "### Example: Required Tool Choice"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "\n",
+    "# Start a new server session for tool choice examples\n",
+    "server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
+    "    \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0  --log-level warning\"\n",
+    ")\n",
+    "wait_for_server(f\"http://localhost:{port_tool_choice}\")\n",
+    "\n",
+    "# Initialize client for tool choice examples\n",
+    "client_tool_choice = OpenAI(\n",
+    "    api_key=\"None\", base_url=f\"http://0.0.0.0:{port_tool_choice}/v1\"\n",
+    ")\n",
+    "model_name_tool_choice = client_tool_choice.models.list().data[0].id\n",
+    "\n",
+    "# Example with tool_choice=\"required\" - forces the model to call a tool\n",
+    "messages_required = [\n",
+    "    {\"role\": \"user\", \"content\": \"Hello, what is the capital of France?\"}\n",
+    "]\n",
+    "\n",
+    "# Define tools\n",
+    "tools = [\n",
+    "    {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"get_current_weather\",\n",
+    "            \"description\": \"Get the current weather in a given location\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"city\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
+    "                    },\n",
+    "                    \"unit\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The unit to fetch the temperature in\",\n",
+    "                        \"enum\": [\"celsius\", \"fahrenheit\"],\n",
+    "                    },\n",
+    "                },\n",
+    "                \"required\": [\"city\", \"unit\"],\n",
+    "            },\n",
+    "        },\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "response_required = client_tool_choice.chat.completions.create(\n",
+    "    model=model_name_tool_choice,\n",
+    "    messages=messages_required,\n",
+    "    temperature=0,\n",
+    "    max_tokens=1024,\n",
+    "    tools=tools,\n",
+    "    tool_choice=\"required\",  # Force the model to call a tool\n",
+    ")\n",
+    "\n",
+    "print_highlight(\"Response with tool_choice='required':\")\n",
+    "print(\"Content:\", response_required.choices[0].message.content)\n",
+    "print(\"Tool calls:\", response_required.choices[0].message.tool_calls)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example: Specific Function Choice\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Example with specific function choice - forces the model to call a specific function\n",
+    "messages_specific = [\n",
+    "    {\"role\": \"user\", \"content\": \"What are the most attactive places in France?\"}\n",
+    "]\n",
+    "\n",
+    "response_specific = client_tool_choice.chat.completions.create(\n",
+    "    model=model_name_tool_choice,\n",
+    "    messages=messages_specific,\n",
+    "    temperature=0,\n",
+    "    max_tokens=1024,\n",
+    "    tools=tools,\n",
+    "    tool_choice={\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\"name\": \"get_current_weather\"},\n",
+    "    },  # Force the model to call the specific get_current_weather function\n",
+    ")\n",
+    "\n",
+    "print_highlight(\"Response with specific function choice:\")\n",
+    "print(\"Content:\", response_specific.choices[0].message.content)\n",
+    "print(\"Tool calls:\", response_specific.choices[0].message.tool_calls)\n",
+    "\n",
+    "if response_specific.choices[0].message.tool_calls:\n",
+    "    tool_call = response_specific.choices[0].message.tool_calls[0]\n",
+    "    print_highlight(f\"Called function: {tool_call.function.name}\")\n",
+    "    print_highlight(f\"Arguments: {tool_call.function.arguments}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process_tool_choice)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pythonic Tool Call Format (Llama-3.2 / Llama-3.3 / Llama-4)\n",
+    "\n",
+    "Some Llama models (such as Llama-3.2-1B, Llama-3.2-3B, Llama-3.3-70B, and Llama-4) support a \"pythonic\" tool call format, where the model outputs function calls as Python code, e.g.:\n",
+    "\n",
+    "```python\n",
+    "[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\")]\n",
+    "```\n",
+    "\n",
+    "- The output is a Python list of function calls, with arguments as Python literals (not JSON).\n",
+    "- Multiple tool calls can be returned in the same list:\n",
+    "```python\n",
+    "[get_current_weather(city=\"San Francisco\", state=\"CA\", unit=\"celsius\"),\n",
+    " get_current_weather(city=\"New York\", state=\"NY\", unit=\"fahrenheit\")]\n",
+    "```\n",
+    "\n",
+    "For more information, refer to Meta’s documentation on  [Zero shot function calling](https://github.com/meta-llama/llama-models/blob/main/models/llama4/prompt_format.md#zero-shot-function-calling---system-message).\n",
+    "\n",
+    "Note that this feature is still under development on Blackwell.\n",
+    "\n",
+    "### How to enable\n",
+    "- Launch the server with `--tool-call-parser pythonic`\n",
+    "- You may also specify --chat-template with the improved template for the model (e.g., `--chat-template=examples/chat_template/tool_chat_template_llama4_pythonic.jinja`).\n",
+    "This is recommended because the model expects a special prompt format to reliably produce valid pythonic tool call outputs. The template ensures that the prompt structure (e.g., special tokens, message boundaries like `<|eom|>`, and function call delimiters) matches what the model was trained or fine-tuned on. If you do not use the correct chat template, tool calling may fail or produce inconsistent results.\n",
+    "\n",
+    "#### Forcing Pythonic Tool Call Output Without a Chat Template\n",
+    "If you don't want to specify a chat template, you must give the model extremely explicit instructions in your messages to enforce pythonic output. For example, for `Llama-3.2-1B-Instruct`, you need:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \" python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --tool-call-parser pythonic --tp 1  --log-level warning\"  # llama-3.2-1b-instruct\n",
+    ")\n",
+    "wait_for_server(f\"http://localhost:{port}\")\n",
+    "\n",
+    "tools = [\n",
+    "    {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"get_weather\",\n",
+    "            \"description\": \"Get the current weather for a given location.\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"location\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The name of the city or location.\",\n",
+    "                    }\n",
+    "                },\n",
+    "                \"required\": [\"location\"],\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    "    {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"get_tourist_attractions\",\n",
+    "            \"description\": \"Get a list of top tourist attractions for a given city.\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"city\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The name of the city to find attractions for.\",\n",
+    "                    }\n",
+    "                },\n",
+    "                \"required\": [\"city\"],\n",
+    "            },\n",
+    "        },\n",
+    "    },\n",
+    "]\n",
+    "\n",
+    "\n",
+    "def get_messages():\n",
+    "    return [\n",
+    "        {\n",
+    "            \"role\": \"system\",\n",
+    "            \"content\": (\n",
+    "                \"You are a travel assistant. \"\n",
+    "                \"When asked to call functions, ALWAYS respond ONLY with a python list of function calls, \"\n",
+    "                \"using this format: [func_name1(param1=value1, param2=value2), func_name2(param=value)]. \"\n",
+    "                \"Do NOT use JSON, do NOT use variables, do NOT use any other format. \"\n",
+    "                \"Here is an example:\\n\"\n",
+    "                '[get_weather(location=\"Paris\"), get_tourist_attractions(city=\"Paris\")]'\n",
+    "            ),\n",
+    "        },\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": (\n",
+    "                \"I'm planning a trip to Tokyo next week. What's the weather like and what are some top tourist attractions? \"\n",
+    "                \"Propose parallel tool calls at once, using the python list of function calls format as shown above.\"\n",
+    "            ),\n",
+    "        },\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "messages = get_messages()\n",
+    "\n",
+    "client = openai.Client(base_url=f\"http://localhost:{port}/v1\", api_key=\"xxxxxx\")\n",
+    "model_name = client.models.list().data[0].id\n",
+    "\n",
+    "\n",
+    "response_non_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0,\n",
+    "    top_p=0.9,\n",
+    "    stream=False,  # Non-streaming\n",
+    "    tools=tools,\n",
+    ")\n",
+    "print_highlight(\"Non-stream response:\")\n",
+    "print_highlight(response_non_stream)\n",
+    "\n",
+    "response_stream = client.chat.completions.create(\n",
+    "    model=model_name,\n",
+    "    messages=messages,\n",
+    "    temperature=0,\n",
+    "    top_p=0.9,\n",
+    "    stream=True,\n",
+    "    tools=tools,\n",
+    ")\n",
+    "texts = \"\"\n",
+    "tool_calls = []\n",
+    "name = \"\"\n",
+    "arguments = \"\"\n",
+    "\n",
+    "for chunk in response_stream:\n",
+    "    if chunk.choices[0].delta.content:\n",
+    "        texts += chunk.choices[0].delta.content\n",
+    "    if chunk.choices[0].delta.tool_calls:\n",
+    "        tool_calls.append(chunk.choices[0].delta.tool_calls[0])\n",
+    "\n",
+    "print_highlight(\"Streaming Response:\")\n",
+    "print_highlight(\"==== Text ====\")\n",
+    "print_highlight(texts)\n",
+    "\n",
+    "print_highlight(\"==== Tool Call ====\")\n",
+    "for tool_call in tool_calls:\n",
+    "    print_highlight(tool_call)\n",
+    "\n",
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> **Note:**  \n",
+    "> The model may still default to JSON if it was heavily finetuned on that format. Prompt engineering (including examples) is the only way to increase the chance of pythonic output if you are not using a chat template."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## How to support a new model?\n",
+    "1. Update the TOOLS_TAG_LIST in sglang/srt/function_call_parser.py with the model’s tool tags. Currently supported tags include:\n",
+    "```\n",
+    "\tTOOLS_TAG_LIST = [\n",
+    "\t    “<|plugin|>“,\n",
+    "\t    “<function=“,\n",
+    "\t    “<tool_call>“,\n",
+    "\t    “<|python_tag|>“,\n",
+    "\t    “[TOOL_CALLS]”\n",
+    "\t]\n",
+    "```\n",
+    "2. Create a new detector class in sglang/srt/function_call_parser.py that inherits from BaseFormatDetector. The detector should handle the model’s specific function call format. For example:\n",
+    "```\n",
+    "    class NewModelDetector(BaseFormatDetector):\n",
+    "```\n",
+    "3. Add the new detector to the MultiFormatParser class that manages all the format detectors."
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/advanced_features/vlm_query.ipynb b/docs/advanced_features/vlm_query.ipynb
new file mode 100644
index 000000000..d9a8ae75d
--- /dev/null
+++ b/docs/advanced_features/vlm_query.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "# Query Vision Language Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1",
+   "metadata": {},
+   "source": [
+    "## Querying Qwen-VL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()  # Run this first.\n",
+    "\n",
+    "model_path = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
+    "chat_template = \"qwen2-vl\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lets create a prompt.\n",
+    "\n",
+    "from io import BytesIO\n",
+    "import requests\n",
+    "from PIL import Image\n",
+    "\n",
+    "from sglang.srt.parser.conversation import chat_templates\n",
+    "\n",
+    "image = Image.open(\n",
+    "    BytesIO(\n",
+    "        requests.get(\n",
+    "            \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
+    "        ).content\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "conv = chat_templates[chat_template].copy()\n",
+    "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
+    "conv.append_message(conv.roles[1], \"\")\n",
+    "conv.image_data = [image]\n",
+    "\n",
+    "print(conv.get_prompt())\n",
+    "image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4",
+   "metadata": {},
+   "source": [
+    "### Query via the offline Engine API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang import Engine\n",
+    "\n",
+    "llm = Engine(\n",
+    "    model_path=model_path, chat_template=chat_template, mem_fraction_static=0.8\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
+    "print(out[\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7",
+   "metadata": {},
+   "source": [
+    "### Query via the offline Engine API, but send precomputed embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute the image embeddings using Huggingface.\n",
+    "\n",
+    "from transformers import AutoProcessor\n",
+    "from transformers import Qwen2_5_VLForConditionalGeneration\n",
+    "\n",
+    "processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
+    "vision = (\n",
+    "    Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path).eval().visual.cuda()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "processed_prompt = processor(\n",
+    "    images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
+    ")\n",
+    "input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
+    "precomputed_embeddings = vision(\n",
+    "    processed_prompt[\"pixel_values\"].cuda(), processed_prompt[\"image_grid_thw\"].cuda()\n",
+    ")\n",
+    "\n",
+    "mm_item = dict(\n",
+    "    modality=\"IMAGE\",\n",
+    "    image_grid_thw=processed_prompt[\"image_grid_thw\"],\n",
+    "    precomputed_embeddings=precomputed_embeddings,\n",
+    ")\n",
+    "out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
+    "print(out[\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10",
+   "metadata": {},
+   "source": [
+    "## Querying Llama 4 (Vision)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()  # Run this first.\n",
+    "\n",
+    "model_path = \"meta-llama/Llama-4-Scout-17B-16E-Instruct\"\n",
+    "chat_template = \"llama-4\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lets create a prompt.\n",
+    "\n",
+    "from io import BytesIO\n",
+    "import requests\n",
+    "from PIL import Image\n",
+    "\n",
+    "from sglang.srt.parser.conversation import chat_templates\n",
+    "\n",
+    "image = Image.open(\n",
+    "    BytesIO(\n",
+    "        requests.get(\n",
+    "            \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
+    "        ).content\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "conv = chat_templates[chat_template].copy()\n",
+    "conv.append_message(conv.roles[0], f\"What's shown here: {conv.image_token}?\")\n",
+    "conv.append_message(conv.roles[1], \"\")\n",
+    "conv.image_data = [image]\n",
+    "\n",
+    "print(conv.get_prompt())\n",
+    "print(f\"Image size: {image.size}\")\n",
+    "\n",
+    "image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "13",
+   "metadata": {},
+   "source": [
+    "### Query via the offline Engine API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang.test.test_utils import is_in_ci\n",
+    "\n",
+    "if not is_in_ci():\n",
+    "    from sglang import Engine\n",
+    "\n",
+    "    llm = Engine(\n",
+    "        model_path=model_path,\n",
+    "        trust_remote_code=True,\n",
+    "        enable_multimodal=True,\n",
+    "        mem_fraction_static=0.8,\n",
+    "        tp_size=4,\n",
+    "        attention_backend=\"fa3\",\n",
+    "        context_length=65536,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not is_in_ci():\n",
+    "    out = llm.generate(prompt=conv.get_prompt(), image_data=[image])\n",
+    "    print(out[\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16",
+   "metadata": {},
+   "source": [
+    "### Query via the offline Engine API, but send precomputed embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not is_in_ci():\n",
+    "    # Compute the image embeddings using Huggingface.\n",
+    "\n",
+    "    from transformers import AutoProcessor\n",
+    "    from transformers import Llama4ForConditionalGeneration\n",
+    "\n",
+    "    processor = AutoProcessor.from_pretrained(model_path, use_fast=True)\n",
+    "    model = Llama4ForConditionalGeneration.from_pretrained(\n",
+    "        model_path, torch_dtype=\"auto\"\n",
+    "    ).eval()\n",
+    "    vision = model.vision_model.cuda()\n",
+    "    multi_modal_projector = model.multi_modal_projector.cuda()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not is_in_ci():\n",
+    "    processed_prompt = processor(\n",
+    "        images=[image], text=conv.get_prompt(), return_tensors=\"pt\"\n",
+    "    )\n",
+    "    print(f'{processed_prompt[\"pixel_values\"].shape=}')\n",
+    "    input_ids = processed_prompt[\"input_ids\"][0].detach().cpu().tolist()\n",
+    "\n",
+    "    image_outputs = vision(\n",
+    "        processed_prompt[\"pixel_values\"].to(\"cuda\"), output_hidden_states=False\n",
+    "    )\n",
+    "    image_features = image_outputs.last_hidden_state\n",
+    "    vision_flat = image_features.view(-1, image_features.size(-1))\n",
+    "    precomputed_embeddings = multi_modal_projector(vision_flat)\n",
+    "\n",
+    "    mm_item = dict(modality=\"IMAGE\", precomputed_embeddings=precomputed_embeddings)\n",
+    "    out = llm.generate(input_ids=input_ids, image_data=[mm_item])\n",
+    "    print(out[\"text\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "cell_metadata_filter": "-all",
+   "custom_cell_magics": "kql",
+   "encoding": "# -*- coding: utf-8 -*-"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/basic_usage/deepseek.md b/docs/basic_usage/deepseek.md
new file mode 100644
index 000000000..8a71696f5
--- /dev/null
+++ b/docs/basic_usage/deepseek.md
@@ -0,0 +1,228 @@
+# DeepSeek Usage
+
+SGLang provides many optimizations specifically designed for the DeepSeek models, making it the inference engine recommended by the official [DeepSeek team](https://github.com/deepseek-ai/DeepSeek-V3/tree/main?tab=readme-ov-file#62-inference-with-sglang-recommended) from Day 0.
+
+This document outlines current optimizations for DeepSeek.
+For an overview of the implemented features see the completed [Roadmap](https://github.com/sgl-project/sglang/issues/2591).
+
+## Launch DeepSeek V3.1/V3/R1 with SGLang
+
+To run DeepSeek V3.1/V3/R1 models, the recommended settings are as follows:
+
+| Weight Type | Configuration |
+|------------|-------------------|
+| **Full precision FP8**<br>*(recommended)* | 8 x H200 |
+| | 8 x MI300X |
+| | 2 x 8 x H100/800/20 |
+| | Xeon 6980P CPU |
+| **Full precision BF16** | 2 x 8 x H200 |
+| | 2 x 8 x MI300X |
+| | 4 x 8 x H100/800/20 |
+| | 4 x 8 x A100/A800 |
+| **Quantized weights (AWQ)** | 8 x H100/800/20 |
+| | 8 x A100/A800 |
+| **Quantized weights (int8)** | 16 x A100/800 |
+| | 32 x L40S |
+| | Xeon 6980P CPU |
+| | 2 x Atlas 800I A3 |
+
+<style>
+.md-typeset__table {
+  width: 100%;
+}
+
+.md-typeset__table table {
+  border-collapse: collapse;
+  margin: 1em 0;
+  border: 2px solid var(--md-typeset-table-color);
+  table-layout: fixed;
+}
+
+.md-typeset__table th {
+  border: 1px solid var(--md-typeset-table-color);
+  border-bottom: 2px solid var(--md-typeset-table-color);
+  background-color: var(--md-default-bg-color--lighter);
+  padding: 12px;
+}
+
+.md-typeset__table td {
+  border: 1px solid var(--md-typeset-table-color);
+  padding: 12px;
+}
+
+.md-typeset__table tr:nth-child(2n) {
+  background-color: var(--md-default-bg-color--lightest);
+}
+</style>
+
+Detailed commands for reference:
+
+- [8 x H200](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended)
+- [8 x MI300X](../platforms/amd_gpu.md#running-deepseek-v3)
+- [2 x 8 x H200](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h208-nodes)
+- [4 x 8 x A100](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-four-a1008-nodes)
+- [8 x A100 (AWQ)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-8-a100a800-with-awq-quantization)
+- [16 x A100 (int8)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-16-a100a800-with-int8-quantization)
+- [32 x L40S (int8)](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-32-l40s-with-int8-quantization)
+- [Xeon 6980P CPU](../platforms/cpu_server.md#example-running-deepseek-r1)
+- [2 x Atlas 800I A3 (int8)](../platforms/ascend_npu.md#running-deepseek-v3)
+
+### Download Weights
+If you encounter errors when starting the server, ensure the weights have finished downloading. It's recommended to download them beforehand or restart multiple times until all weights are downloaded. Please refer to [DeepSeek V3](https://huggingface.co/deepseek-ai/DeepSeek-V3-Base#61-inference-with-deepseek-infer-demo-example-only) official guide to download the weights.
+
+### Launch with one node of 8 x H200
+Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#installation--launch).
+**Note that Deepseek V3 is already in FP8**, so we should not run it with any quantization arguments like `--quantization fp8 --kv-cache-dtype fp8_e5m2`.
+
+### Running examples on Multi-node
+
+- [Serving with two H20*8 nodes](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h208-nodes).
+
+- [Serving with two H200*8 nodes and docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h2008-nodes-and-docker).
+
+- [Serving with four A100*8 nodes](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-four-a1008-nodes).
+
+## Optimizations
+
+### Multi-head Latent Attention (MLA) Throughput Optimizations
+
+**Description**: [MLA](https://arxiv.org/pdf/2405.04434) is an innovative attention mechanism introduced by the DeepSeek team, aimed at improving inference efficiency. SGLang has implemented specific optimizations for this, including:
+
+- **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase.
+
+- **MLA Attention Backends**: Currently SGLang supports different optimized MLA attention backends, including [FlashAttention3](https://github.com/Dao-AILab/flash-attention), [Flashinfer](https://docs.flashinfer.ai/api/mla.html), [FlashMLA](https://github.com/deepseek-ai/FlashMLA), [CutlassMLA](https://github.com/sgl-project/sglang/pull/5390), **TRTLLM MLA** (optimized for Blackwell architecture), and [Triton](https://github.com/triton-lang/triton) backends. The default FA3 provides good performance across wide workloads.
+
+- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption.
+
+- **CUDA Graph & Torch.compile**: Both MLA and Mixture of Experts (MoE) are compatible with CUDA Graph and Torch.compile, which reduces latency and accelerates decoding speed for small batch sizes.
+
+- **Chunked Prefix Cache**: Chunked prefix cache optimization can increase throughput by cutting prefix cache into chunks, processing them with multi-head attention and merging their states. Its improvement can be significant when doing chunked prefill on long sequences. Currently this optimization is only available for FlashAttention3 backend.
+
+Overall, with these optimizations, we have achieved up to **7x** acceleration in output throughput compared to the previous version.
+
+<p align="center">
+  <img src="https://lmsys.org/images/blog/sglang_v0_3/deepseek_mla.svg" alt="Multi-head Latent Attention for DeepSeek Series Models">
+</p>
+
+**Usage**: MLA optimization is enabled by default. For MLA models on Blackwell architecture (e.g., B200), the default backend is FlashInfer. To use the optimized TRTLLM MLA backend for prefill and decode operations, explicitly specify `--attention-backend trtllm_mla`.
+
+**Reference**: Check [Blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [Slides](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/lmsys_1st_meetup_deepseek_mla.pdf) for more details.
+
+### Data Parallelism Attention
+
+**Description**: This optimization involves data parallelism (DP) for the MLA attention mechanism of DeepSeek Series Models, which allows for a significant reduction in the KV cache size, enabling larger batch sizes. Each DP worker independently handles different types of batches (prefill, decode, idle), which are then synchronized before and after processing through the Mixture-of-Experts (MoE) layer. If you do not use DP attention, KV cache will be duplicated among all TP ranks.
+
+<p align="center">
+  <img src="https://lmsys.org/images/blog/sglang_v0_4/dp_attention.svg" alt="Data Parallelism Attention for DeepSeek Series Models">
+</p>
+
+With data parallelism attention enabled, we have achieved up to **1.9x** decoding throughput improvement compared to the previous version.
+
+<p align="center">
+  <img src="https://lmsys.org/images/blog/sglang_v0_4/deepseek_coder_v2.svg" alt="Data Parallelism Attention Performance Comparison">
+</p>
+
+**Usage**:
+- Append `--enable-dp-attention --tp 8 --dp 8` to the server arguments when using 8 H200 GPUs. This optimization improves peak throughput in high batch size scenarios where the server is limited by KV cache capacity. However, it is not recommended for low-latency, small-batch use cases.
+- DP and TP attention can be flexibly combined. For example, to deploy DeepSeek-V3/R1 on 2 nodes with 8 H100 GPUs each, you can specify `--enable-dp-attention --tp 16 --dp 2`. This configuration runs attention with 2 DP groups, each containing 8 TP GPUs.
+
+**Reference**: Check [Blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/#data-parallelism-attention-for-deepseek-models).
+
+### Multi Node Tensor Parallelism
+
+**Description**: For users with limited memory on a single node, SGLang supports serving DeepSeek Series Models, including DeepSeek V3, across multiple nodes using tensor parallelism. This approach partitions the model parameters across multiple GPUs or nodes to handle models that are too large for one node's memory.
+
+**Usage**: Check [here](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-2-h208) for usage examples.
+
+### Block-wise FP8
+
+**Description**: SGLang implements block-wise FP8 quantization with two key optimizations:
+
+- **Activation**: E4M3 format using per-token-per-128-channel sub-vector scales with online casting.
+
+- **Weight**: Per-128x128-block quantization for better numerical stability.
+
+- **DeepGEMM**: The [DeepGEMM](https://github.com/deepseek-ai/DeepGEMM) kernel library optimized for FP8 matrix multiplications.
+
+**Usage**: The activation and weight optimization above are turned on by default for DeepSeek V3 models. DeepGEMM is enabled by default on NVIDIA Hopper GPUs and disabled by default on other devices. DeepGEMM can also be manually turned off by setting the environment variable `SGL_ENABLE_JIT_DEEPGEMM=0`.
+
+Before serving the DeepSeek model, precompile the DeepGEMM kernels using:
+```bash
+python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
+```
+The precompilation process typically takes around 10 minutes to complete.
+
+### Multi-token Prediction
+**Description**: SGLang implements DeepSeek V3 Multi-Token Prediction (MTP) based on [EAGLE speculative decoding](https://docs.sglang.ai/advanced_features/speculative_decoding.html#EAGLE-Decoding). With this optimization, the decoding speed can be improved by **1.8x** for batch size 1 and **1.5x** for batch size 32 respectively on H200 TP8 setting.
+
+**Usage**:
+Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
+```
+python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3-0324 --speculative-algorithm EAGLE --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 --trust-remote-code --tp 8
+```
+- The best configuration for `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` can be searched with [bench_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py) script for given batch size. The minimum configuration is `--speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2`, which can achieve speedup for larger batch sizes.
+- FlashAttention3, FlashMLA, and Triton backend fully supports MTP usage. For FlashInfer backend (`--attention-backend flashinfer`) with speculative decoding,`--speculative-eagle-topk` parameter should be set to `1`. MTP support for the CutlassMLA and TRTLLM MLA backends are still under development.
+- To enable DeepSeek MTP for large batch sizes (>32), there are some parameters should be changed (Reference [this discussion](https://github.com/sgl-project/sglang/issues/4543#issuecomment-2737413756)):
+  - Adjust `--max-running-requests` to a larger number. The default value is `32` for MTP. For larger batch sizes, you should increase this value beyond the default value.
+  - Set `--cuda-graph-bs`. It's a list of batch sizes for cuda graph capture. The default captured batch sizes for speculative decoding is set [here](https://github.com/sgl-project/sglang/blob/49420741746c8f3e80e0eb17e7d012bfaf25793a/python/sglang/srt/model_executor/cuda_graph_runner.py#L126). You can include more batch sizes into it.
+
+
+### Reasoning Content for DeepSeek R1 & V3.1
+
+See [Reasoning Parser](https://docs.sglang.ai/advanced_features/separate_reasoning.html) and [Thinking Parameter for DeepSeek V3.1](https://docs.sglang.ai/basic_usage/openai_api_completions.html#Example:-DeepSeek-V3-Models).
+
+
+### Function calling for DeepSeek Models
+
+Add arguments `--tool-call-parser deepseekv3` and `--chat-template ./examples/chat_template/tool_chat_template_deepseekv3.jinja`(recommended) to enable this feature. For example (running on 1 * H20 node):
+
+```
+python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --port 30000 --host 0.0.0.0 --mem-fraction-static 0.9 --tool-call-parser deepseekv3 --chat-template ./examples/chat_template/tool_chat_template_deepseekv3.jinja
+```
+
+Sample Request:
+
+```
+curl "http://127.0.0.1:30000/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324", "tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of an city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "Hows the weather like in Qingdao today"}]}'
+```
+
+Expected Response
+
+```
+{"id":"6501ef8e2d874006bf555bc80cddc7c5","object":"chat.completion","created":1745993638,"model":"deepseek-ai/DeepSeek-V3-0324","choices":[{"index":0,"message":{"role":"assistant","content":null,"reasoning_content":null,"tool_calls":[{"id":"0","index":null,"type":"function","function":{"name":"query_weather","arguments":"{\"city\": \"Qingdao\"}"}}]},"logprobs":null,"finish_reason":"tool_calls","matched_stop":null}],"usage":{"prompt_tokens":116,"total_tokens":138,"completion_tokens":22,"prompt_tokens_details":null}}
+
+```
+Sample Streaming Request:
+```
+curl "http://127.0.0.1:30000/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{"temperature": 0, "max_tokens": 100, "model": "deepseek-ai/DeepSeek-V3-0324","stream":true,"tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of an city, the user should supply a city first", "parameters": {"type": "object", "properties": {"city": {"type": "string", "description": "The city, e.g. Beijing"}}, "required": ["city"]}}}], "messages": [{"role": "user", "content": "Hows the weather like in Qingdao today"}]}'
+```
+Expected Streamed Chunks (simplified for clarity):
+```
+data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"{\""}}]}}]}
+data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"city"}}]}}]}
+data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"\":\""}}]}}]}
+data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"Q"}}]}}]}
+data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"ing"}}]}}]}
+data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"dao"}}]}}]}
+data: {"choices":[{"delta":{"tool_calls":[{"function":{"arguments":"\"}"}}]}}]}
+data: {"choices":[{"delta":{"tool_calls":null}}], "finish_reason": "tool_calls"}
+data: [DONE]
+```
+The client needs to concatenate all arguments fragments to reconstruct the complete tool call:
+```
+{"city": "Qingdao"}
+```
+Important Notes:
+1. Use a lower `"temperature"` value for better results.
+2. To receive more consistent tool call results, it is recommended to use `--chat-template examples/chat_template/tool_chat_template_deepseekv3.jinja`. It provides an improved unified prompt.
+
+
+## FAQ
+
+**Q: Model loading is taking too long, and I'm encountering an NCCL timeout. What should I do?**
+
+A: If you're experiencing extended model loading times and an NCCL timeout, you can try increasing the timeout duration. Add the argument `--dist-timeout 3600` when launching your model. This will set the timeout to one hour, which often resolves the issue.
diff --git a/docs/basic_usage/gpt_oss.md b/docs/basic_usage/gpt_oss.md
new file mode 100644
index 000000000..240463ec4
--- /dev/null
+++ b/docs/basic_usage/gpt_oss.md
@@ -0,0 +1,114 @@
+# GPT OSS Usage
+
+Please refer to [https://github.com/sgl-project/sglang/issues/8833](https://github.com/sgl-project/sglang/issues/8833).
+
+## Responses API & Built-in Tools
+
+### Responses API
+
+GPT‑OSS is compatible with the OpenAI Responses API. Use `client.responses.create(...)` with `model`, `instructions`, `input`, and optional `tools` to enable built‑in tool use.
+
+### Built-in Tools
+
+GPT‑OSS can call built‑in tools for web search and Python execution. You can use the demo tool server or connect to external MCP tool servers.
+
+#### Python Tool
+
+- Executes short Python snippets for calculations, parsing, and quick scripts.
+- By default runs in a Docker-based sandbox. To run on the host, set `PYTHON_EXECUTION_BACKEND=UV` (this executes model-generated code locally; use with care).
+- Ensure Docker is available if you are not using the UV backend. It is recommended to run `docker pull python:3.11` in advance.
+
+#### Web Search Tool
+
+- Uses the Exa backend for web search.
+- Requires an Exa API key; set `EXA_API_KEY` in your environment. Create a key at `https://exa.ai`.
+
+### Tool & Reasoning Parser
+
+- We support OpenAI Reasoning and Tool Call parser, as well as our SGLang native api for tool call and reasoning. Refer to [reasoning parser](../advanced_features/separate_reasoning.ipynb) and [tool call parser](../advanced_features/function_calling.ipynb) for more details.
+
+
+## Notes
+
+- Use **Python 3.12** for the demo tools. And install the required `gpt-oss` packages.
+- The default demo integrates the web search tool (Exa backend) and a demo Python interpreter via Docker.
+- For search, set `EXA_API_KEY`. For Python execution, either have Docker available or set `PYTHON_EXECUTION_BACKEND=UV`.
+
+Examples:
+```bash
+export EXA_API_KEY=YOUR_EXA_KEY
+# Optional: run Python tool locally instead of Docker (use with care)
+export PYTHON_EXECUTION_BACKEND=UV
+```
+
+Launch the server with the demo tool server:
+
+`python3 -m sglang.launch_server --model-path openai/gpt-oss-120b --tool-server demo --tp 2`
+
+For production usage, sglang can act as an MCP client for multiple services. An [example tool server](https://github.com/openai/gpt-oss/tree/main/gpt-oss-mcp-server) is provided. Start the servers and point sglang to them:
+```bash
+mcp run -t sse browser_server.py:mcp
+mcp run -t sse python_server.py:mcp
+
+python -m sglang.launch_server ... --tool-server ip-1:port-1,ip-2:port-2
+```
+The URLs should be MCP SSE servers that expose server information and well-documented tools. These tools are added to the system prompt so the model can use them.
+
+### Quick Demo
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:30000/v1",
+    api_key="sk-123456"
+)
+
+tools = [
+    {"type": "code_interpreter"},
+    {"type": "web_search_preview"},
+]
+
+# Test python tool
+response = client.responses.create(
+    model="openai/gpt-oss-120b",
+    instructions="You are a helfpul assistant, you could use python tool to execute code.",
+    input="Use python tool to calculate the sum of 29138749187 and 29138749187", # 58,277,498,374
+    tools=tools
+)
+print("====== test python tool ======")
+print(response.output_text)
+
+# Test browser tool
+response = client.responses.create(
+    model="openai/gpt-oss-120b",
+    instructions="You are a helfpul assistant, you could use browser to search the web",
+    input="Search the web for the latest news about Nvidia stock price",
+    tools=tools
+)
+print("====== test browser tool ======")
+print(response.output_text)
+```
+
+Example output:
+```
+====== test python tool ======
+The sum of 29,138,749,187 and 29,138,749,187 is **58,277,498,374**.
+====== test browser tool ======
+**Recent headlines on Nvidia (NVDA) stock**
+
+| Date (2025) | Source | Key news points | Stock‑price detail |
+|-------------|--------|----------------|--------------------|
+| **May 13** | Reuters | The market data page shows Nvidia trading “higher” at **$116.61** with no change from the previous close. | **$116.61** – latest trade (delayed ≈ 15 min)【14†L34-L38】 |
+| **Aug 18** | CNBC | Morgan Stanley kept an **overweight** rating and lifted its price target to **$206** (up from $200), implying a 14 % upside from the Friday close. The firm notes Nvidia shares have already **jumped 34 % this year**. | No exact price quoted, but the article signals strong upside expectations【9†L27-L31】 |
+| **Aug 20** | The Motley Fool | Nvidia is set to release its Q2 earnings on Aug 27. The article lists the **current price of $175.36**, down 0.16 % on the day (as of 3:58 p.m. ET). | **$175.36** – current price on Aug 20【10†L12-L15】【10†L53-L57】 |
+
+**What the news tells us**
+
+* Nvidia’s share price has risen sharply this year – up roughly a third according to Morgan Stanley – and analysts are still raising targets (now $206).
+* The most recent market quote (Reuters, May 13) was **$116.61**, but the stock has surged since then, reaching **$175.36** by mid‑August.
+* Upcoming earnings on **Aug 27** are a focal point; both the Motley Fool and Morgan Stanley expect the results could keep the rally going.
+
+**Bottom line:** Nvidia’s stock is on a strong upward trajectory in 2025, with price targets climbing toward $200‑$210 and the market price already near $175 as of late August.
+
+```
diff --git a/docs/basic_usage/llama4.md b/docs/basic_usage/llama4.md
new file mode 100644
index 000000000..07cc2b737
--- /dev/null
+++ b/docs/basic_usage/llama4.md
@@ -0,0 +1,61 @@
+# Llama4 Usage
+
+[Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/MODEL_CARD.md) is Meta's latest generation of open-source LLM model with industry-leading performance.
+
+SGLang has supported Llama 4 Scout (109B) and Llama 4 Maverick (400B) since [v0.4.5](https://github.com/sgl-project/sglang/releases/tag/v0.4.5).
+
+Ongoing optimizations are tracked in the [Roadmap](https://github.com/sgl-project/sglang/issues/5118).
+
+## Launch Llama 4 with SGLang
+
+To serve Llama 4 models on 8xH100/H200 GPUs:
+
+```bash
+python3 -m sglang.launch_server --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct --tp 8 --context-length 1000000
+```
+
+### Configuration Tips
+
+- **OOM Mitigation**: Adjust `--context-length` to avoid a GPU out-of-memory issue. For the Scout model, we recommend setting this value up to 1M on 8\*H100 and up to 2.5M on 8\*H200. For the Maverick model, we don't need to set context length on 8\*H200. When hybrid kv cache is enabled, `--context-length` can be set up to 5M on 8\*H100 and up to 10M on 8\*H200 for the Scout model.
+
+- **Chat Template**: Add `--chat-template llama-4` for chat completion tasks.
+- **Enable Multi-Modal**: Add `--enable-multimodal` for multi-modal capabilities.
+- **Enable Hybrid-KVCache**: Add `--hybrid-kvcache-ratio` for hybrid kv cache. Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/6563)
+
+
+### EAGLE Speculative Decoding
+**Description**: SGLang has supported Llama 4 Maverick (400B) with [EAGLE speculative decoding](https://docs.sglang.ai/backend/speculative_decoding.html#EAGLE-Decoding).
+
+**Usage**:
+Add arguments `--speculative-draft-model-path`, `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
+```
+python3 -m sglang.launch_server --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct --speculative-algorithm EAGLE3  --speculative-draft-model-path nvidia/Llama-4-Maverick-17B-128E-Eagle3 --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --trust-remote-code --tp 8 --context-length 1000000
+```
+
+- **Note** The Llama 4 draft model *nvidia/Llama-4-Maverick-17B-128E-Eagle3* can only recognize conversations in chat mode.
+
+## Benchmarking Results
+
+### Accuracy Test with `lm_eval`
+
+The accuracy on SGLang for both Llama4 Scout and Llama4 Maverick can match the [official benchmark numbers](https://ai.meta.com/blog/llama-4-multimodal-intelligence/).
+
+Benchmark results on MMLU Pro dataset with 8*H100:
+|                    | Llama-4-Scout-17B-16E-Instruct | Llama-4-Maverick-17B-128E-Instruct  |
+|--------------------|--------------------------------|-------------------------------------|
+| Official Benchmark | 74.3                           | 80.5                                |
+| SGLang             | 75.2                           | 80.7                                |
+
+Commands:
+
+```bash
+# Llama-4-Scout-17B-16E-Instruct model
+python -m sglang.launch_server --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct --port 30000 --tp 8 --mem-fraction-static 0.8 --context-length 65536
+lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Scout-17B-16E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
+
+# Llama-4-Maverick-17B-128E-Instruct
+python -m sglang.launch_server --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct --port 30000 --tp 8 --mem-fraction-static 0.8 --context-length 65536
+lm_eval --model local-chat-completions --model_args model=meta-llama/Llama-4-Maverick-17B-128E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 --tasks mmlu_pro --batch_size 128 --apply_chat_template --num_fewshot 0
+```
+
+Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/5092).
diff --git a/docs/basic_usage/native_api.ipynb b/docs/basic_usage/native_api.ipynb
new file mode 100644
index 000000000..5e4ca19a1
--- /dev/null
+++ b/docs/basic_usage/native_api.ipynb
@@ -0,0 +1,497 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SGLang Native APIs\n",
+    "\n",
+    "Apart from the OpenAI compatible APIs, the SGLang Runtime also provides its native server APIs. We introduce the following APIs:\n",
+    "\n",
+    "- `/generate` (text generation model)\n",
+    "- `/get_model_info`\n",
+    "- `/get_server_info`\n",
+    "- `/health`\n",
+    "- `/health_generate`\n",
+    "- `/flush_cache`\n",
+    "- `/update_weights`\n",
+    "- `/encode`(embedding model)\n",
+    "- `/v1/rerank`(cross encoder rerank model)\n",
+    "- `/classify`(reward model)\n",
+    "- `/start_expert_distribution_record`\n",
+    "- `/stop_expert_distribution_record`\n",
+    "- `/dump_expert_distribution_record`\n",
+    "- A full list of these APIs can be found at [http_server.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py)\n",
+    "\n",
+    "We mainly use `requests` to test these APIs in the following examples. You can also use `curl`.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch A Server"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generate (text generation model)\n",
+    "Generate completions. This is similar to the `/v1/completions` in OpenAI API. Detailed parameters can be found in the [sampling parameters](sampling_params.md)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "url = f\"http://localhost:{port}/generate\"\n",
+    "data = {\"text\": \"What is the capital of France?\"}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get Model Info\n",
+    "\n",
+    "Get the information of the model.\n",
+    "\n",
+    "- `model_path`: The path/name of the model.\n",
+    "- `is_generation`: Whether the model is used as generation model or embedding model.\n",
+    "- `tokenizer_path`: The path/name of the tokenizer.\n",
+    "- `preferred_sampling_params`: The default sampling params specified via `--preferred-sampling-params`. `None` is returned in this example as we did not explicitly configure it in server args.\n",
+    "- `weight_version`: This field contains the version of the model weights. This is often used to track changes or updates to the model’s trained parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://localhost:{port}/get_model_info\"\n",
+    "\n",
+    "response = requests.get(url)\n",
+    "response_json = response.json()\n",
+    "print_highlight(response_json)\n",
+    "assert response_json[\"model_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
+    "assert response_json[\"is_generation\"] is True\n",
+    "assert response_json[\"tokenizer_path\"] == \"qwen/qwen2.5-0.5b-instruct\"\n",
+    "assert response_json[\"preferred_sampling_params\"] is None\n",
+    "assert response_json.keys() == {\n",
+    "    \"model_path\",\n",
+    "    \"is_generation\",\n",
+    "    \"tokenizer_path\",\n",
+    "    \"preferred_sampling_params\",\n",
+    "    \"weight_version\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get Server Info\n",
+    "Gets the server information including CLI arguments, token limits, and memory pool sizes.\n",
+    "- Note: `get_server_info` merges the following deprecated endpoints:\n",
+    "  - `get_server_args`\n",
+    "  - `get_memory_pool_size` \n",
+    "  - `get_max_total_num_tokens`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://localhost:{port}/get_server_info\"\n",
+    "\n",
+    "response = requests.get(url)\n",
+    "print_highlight(response.text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Health Check\n",
+    "- `/health`: Check the health of the server.\n",
+    "- `/health_generate`: Check the health of the server by generating one token."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://localhost:{port}/health_generate\"\n",
+    "\n",
+    "response = requests.get(url)\n",
+    "print_highlight(response.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://localhost:{port}/health\"\n",
+    "\n",
+    "response = requests.get(url)\n",
+    "print_highlight(response.text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Flush Cache\n",
+    "\n",
+    "Flush the radix cache. It will be automatically triggered when the model weights are updated by the `/update_weights` API."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = f\"http://localhost:{port}/flush_cache\"\n",
+    "\n",
+    "response = requests.post(url)\n",
+    "print_highlight(response.text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Update Weights From Disk\n",
+    "\n",
+    "Update model weights from disk without restarting the server. Only applicable for models with the same architecture and parameter size.\n",
+    "\n",
+    "SGLang support `update_weights_from_disk` API for continuous evaluation during training (save checkpoint to disk and update weights from disk).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# successful update with same architecture and size\n",
+    "\n",
+    "url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
+    "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct\"}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.text)\n",
+    "assert response.json()[\"success\"] is True\n",
+    "assert response.json()[\"message\"] == \"Succeeded to update model weights.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# failed update with different parameter size or wrong name\n",
+    "\n",
+    "url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
+    "data = {\"model_path\": \"qwen/qwen2.5-0.5b-instruct-wrong\"}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "response_json = response.json()\n",
+    "print_highlight(response_json)\n",
+    "assert response_json[\"success\"] is False\n",
+    "assert response_json[\"message\"] == (\n",
+    "    \"Failed to get weights iterator: \"\n",
+    "    \"qwen/qwen2.5-0.5b-instruct-wrong\"\n",
+    "    \" (repository not found).\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Encode (embedding model)\n",
+    "\n",
+    "Encode text into embeddings. Note that this API is only available for [embedding models](openai_api_embeddings.ipynb) and will raise an error for generation models.\n",
+    "Therefore, we launch a new server to server an embedding model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embedding_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
+    "    --host 0.0.0.0 --is-embedding --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# successful encode for embedding model\n",
+    "\n",
+    "url = f\"http://localhost:{port}/encode\"\n",
+    "data = {\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"text\": \"Once upon a time\"}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "response_json = response.json()\n",
+    "print_highlight(f\"Text embedding (first 10): {response_json['embedding'][:10]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(embedding_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## v1/rerank (cross encoder rerank model)\n",
+    "Rerank a list of documents given a query using a cross-encoder model. Note that this API is only available for cross encoder model like [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) with `attention-backend` `triton` and `torch_native`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reranker_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \\\n",
+    "    --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compute rerank scores for query and documents\n",
+    "\n",
+    "url = f\"http://localhost:{port}/v1/rerank\"\n",
+    "data = {\n",
+    "    \"model\": \"BAAI/bge-reranker-v2-m3\",\n",
+    "    \"query\": \"what is panda?\",\n",
+    "    \"documents\": [\n",
+    "        \"hi\",\n",
+    "        \"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.\",\n",
+    "    ],\n",
+    "}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "response_json = response.json()\n",
+    "for item in response_json:\n",
+    "    print_highlight(f\"Score: {item['score']:.2f} - Document: '{item['document']}'\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(reranker_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Classify (reward model)\n",
+    "\n",
+    "SGLang Runtime also supports reward models. Here we use a reward model to classify the quality of pairwise generations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
+    "# This will be updated in the future.\n",
+    "\n",
+    "reward_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "PROMPT = (\n",
+    "    \"What is the range of the numeric output of a sigmoid node in a neural network?\"\n",
+    ")\n",
+    "\n",
+    "RESPONSE1 = \"The output of a sigmoid node is bounded between -1 and 1.\"\n",
+    "RESPONSE2 = \"The output of a sigmoid node is bounded between 0 and 1.\"\n",
+    "\n",
+    "CONVS = [\n",
+    "    [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE1}],\n",
+    "    [{\"role\": \"user\", \"content\": PROMPT}, {\"role\": \"assistant\", \"content\": RESPONSE2}],\n",
+    "]\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n",
+    "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
+    "\n",
+    "url = f\"http://localhost:{port}/classify\"\n",
+    "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
+    "\n",
+    "responses = requests.post(url, json=data).json()\n",
+    "for response in responses:\n",
+    "    print_highlight(f\"reward: {response['embedding'][0]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(reward_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Capture expert selection distribution in MoE models\n",
+    "\n",
+    "SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and plan for optimization.\n",
+    "\n",
+    "*Note: We only print out the first 10 lines of the csv below for better readability. Please adjust accordingly if you want to analyze the results more deeply.*"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expert_record_server_process, port = launch_server_cmd(\n",
+    "    \"python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = requests.post(f\"http://localhost:{port}/start_expert_distribution_record\")\n",
+    "print_highlight(response)\n",
+    "\n",
+    "url = f\"http://localhost:{port}/generate\"\n",
+    "data = {\"text\": \"What is the capital of France?\"}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.json())\n",
+    "\n",
+    "response = requests.post(f\"http://localhost:{port}/stop_expert_distribution_record\")\n",
+    "print_highlight(response)\n",
+    "\n",
+    "response = requests.post(f\"http://localhost:{port}/dump_expert_distribution_record\")\n",
+    "print_highlight(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(expert_record_server_process)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/basic_usage/offline_engine_api.ipynb b/docs/basic_usage/offline_engine_api.ipynb
new file mode 100644
index 000000000..9c03e90a7
--- /dev/null
+++ b/docs/basic_usage/offline_engine_api.ipynb
@@ -0,0 +1,235 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Offline Engine API\n",
+    "\n",
+    "SGLang provides a direct inference engine without the need for an HTTP server, especially for use cases where additional HTTP server adds unnecessary complexity or overhead. Here are two general use cases:\n",
+    "\n",
+    "- Offline Batch Inference\n",
+    "- Custom Server on Top of the Engine\n",
+    "\n",
+    "This document focuses on the offline batch inference, demonstrating four different inference modes:\n",
+    "\n",
+    "- Non-streaming synchronous generation\n",
+    "- Streaming synchronous generation\n",
+    "- Non-streaming asynchronous generation\n",
+    "- Streaming asynchronous generation\n",
+    "\n",
+    "Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed example working in a python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py).\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Nest Asyncio\n",
+    "Note that if you want to use **Offline Engine** in ipython or some other nested loop code, you need to add the following code:\n",
+    "```python\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()\n",
+    "\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Advanced Usage\n",
+    "\n",
+    "The engine supports [vlm inference](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py) as well as [extracting hidden states](https://github.com/sgl-project/sglang/blob/main/examples/runtime/hidden_states). \n",
+    "\n",
+    "Please see [the examples](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine) for further use cases."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Offline Batch Inference\n",
+    "\n",
+    "SGLang offline engine supports batch inference with efficient scheduling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# launch the offline engine\n",
+    "import asyncio\n",
+    "\n",
+    "import sglang as sgl\n",
+    "import sglang.test.doc_patch\n",
+    "from sglang.utils import async_stream_and_merge, stream_and_merge\n",
+    "\n",
+    "llm = sgl.Engine(model_path=\"qwen/qwen2.5-0.5b-instruct\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Non-streaming Synchronous Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Hello, my name is\",\n",
+    "    \"The president of the United States is\",\n",
+    "    \"The capital of France is\",\n",
+    "    \"The future of AI is\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
+    "\n",
+    "outputs = llm.generate(prompts, sampling_params)\n",
+    "for prompt, output in zip(prompts, outputs):\n",
+    "    print(\"===============================\")\n",
+    "    print(f\"Prompt: {prompt}\\nGenerated text: {output['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Streaming Synchronous Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+    "    \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+    "    \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0.2,\n",
+    "    \"top_p\": 0.9,\n",
+    "}\n",
+    "\n",
+    "print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
+    "\n",
+    "for prompt in prompts:\n",
+    "    print(f\"Prompt: {prompt}\")\n",
+    "    merged_output = stream_and_merge(llm, prompt, sampling_params)\n",
+    "    print(\"Generated text:\", merged_output)\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Non-streaming Asynchronous Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+    "    \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+    "    \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
+    "\n",
+    "print(\"\\n=== Testing asynchronous batch generation ===\")\n",
+    "\n",
+    "\n",
+    "async def main():\n",
+    "    outputs = await llm.async_generate(prompts, sampling_params)\n",
+    "\n",
+    "    for prompt, output in zip(prompts, outputs):\n",
+    "        print(f\"\\nPrompt: {prompt}\")\n",
+    "        print(f\"Generated text: {output['text']}\")\n",
+    "\n",
+    "\n",
+    "asyncio.run(main())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Streaming Asynchronous Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompts = [\n",
+    "    \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+    "    \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+    "    \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
+    "]\n",
+    "\n",
+    "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
+    "\n",
+    "print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
+    "\n",
+    "\n",
+    "async def main():\n",
+    "    for prompt in prompts:\n",
+    "        print(f\"\\nPrompt: {prompt}\")\n",
+    "        print(\"Generated text: \", end=\"\", flush=True)\n",
+    "\n",
+    "        # Replace direct calls to async_generate with our custom overlap-aware version\n",
+    "        async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n",
+    "            print(cleaned_chunk, end=\"\", flush=True)\n",
+    "\n",
+    "        print()  # New line after each prompt\n",
+    "\n",
+    "\n",
+    "asyncio.run(main())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.shutdown()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/basic_usage/openai_api.rst b/docs/basic_usage/openai_api.rst
new file mode 100644
index 000000000..370abe99c
--- /dev/null
+++ b/docs/basic_usage/openai_api.rst
@@ -0,0 +1,9 @@
+OpenAI-Compatible APIs
+======================
+
+.. toctree::
+   :maxdepth: 1
+
+   openai_api_completions.ipynb
+   openai_api_vision.ipynb
+   openai_api_embeddings.ipynb
diff --git a/docs/basic_usage/openai_api_completions.ipynb b/docs/basic_usage/openai_api_completions.ipynb
new file mode 100644
index 000000000..6b967709f
--- /dev/null
+++ b/docs/basic_usage/openai_api_completions.ipynb
@@ -0,0 +1,389 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# OpenAI APIs - Completions\n",
+    "\n",
+    "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
+    "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
+    "\n",
+    "This tutorial covers the following popular APIs:\n",
+    "\n",
+    "- `chat/completions`\n",
+    "- `completions`\n",
+    "\n",
+    "Check out other tutorials to learn about [vision APIs](openai_api_vision.ipynb) for vision-language models and [embedding APIs](openai_api_embeddings.ipynb) for embedding models."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch A Server\n",
+    "\n",
+    "Launch the server in your terminal and wait for it to initialize."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")\n",
+    "print(f\"Server started on http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Chat Completions\n",
+    "\n",
+    "### Usage\n",
+    "\n",
+    "The server fully implements the OpenAI API.\n",
+    "It will automatically apply the chat template specified in the Hugging Face tokenizer, if one is available.\n",
+    "You can also specify a custom chat template with `--chat-template` when launching the server."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Model Thinking/Reasoning Support\n",
+    "\n",
+    "Some models support internal reasoning or thinking processes that can be exposed in the API response. SGLang provides unified support for various reasoning models through the `chat_template_kwargs` parameter and compatible reasoning parsers.\n",
+    "\n",
+    "#### Supported Models and Configuration\n",
+    "\n",
+    "| Model Family | Chat Template Parameter | Reasoning Parser | Notes |\n",
+    "|--------------|------------------------|------------------|--------|\n",
+    "| DeepSeek-R1 (R1, R1-0528, R1-Distill) | `enable_thinking` | `--reasoning-parser deepseek-r1` | Standard reasoning models |\n",
+    "| DeepSeek-V3.1 | `thinking` | `--reasoning-parser deepseek-v3` | Hybrid model (thinking/non-thinking modes) |\n",
+    "| Qwen3 (standard) | `enable_thinking` | `--reasoning-parser qwen3` | Hybrid model (thinking/non-thinking modes) |\n",
+    "| Qwen3-Thinking | N/A (always enabled) | `--reasoning-parser qwen3-thinking` | Always generates reasoning |\n",
+    "| Kimi | N/A (always enabled) | `--reasoning-parser kimi` | Kimi thinking models |\n",
+    "| Gpt-Oss | N/A (always enabled) | `--reasoning-parser gpt-oss` | Gpt-Oss thinking models |\n",
+    "\n",
+    "#### Basic Usage\n",
+    "\n",
+    "To enable reasoning output, you need to:\n",
+    "1. Launch the server with the appropriate reasoning parser\n",
+    "2. Set the model-specific parameter in `chat_template_kwargs`\n",
+    "3. Optionally use `separate_reasoning: False` to not get reasoning content separately (default to `True`)\n",
+    "\n",
+    "**Note for Qwen3-Thinking models:** These models always generate thinking content and do not support the `enable_thinking` parameter. Use `--reasoning-parser qwen3-thinking` or `--reasoning-parser qwen3` to parse the thinking content.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Example: Qwen3 Models\n",
+    "\n",
+    "```python\n",
+    "# Launch server:\n",
+    "# python3 -m sglang.launch_server --model Qwen/Qwen3-4B --reasoning-parser qwen3\n",
+    "\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "client = OpenAI(\n",
+    "    api_key=\"EMPTY\",\n",
+    "    base_url=f\"http://127.0.0.1:30000/v1\",\n",
+    ")\n",
+    "\n",
+    "model = \"Qwen/Qwen3-4B\"\n",
+    "messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=model,\n",
+    "    messages=messages,\n",
+    "    extra_body={\n",
+    "        \"chat_template_kwargs\": {\"enable_thinking\": True},\n",
+    "        \"separate_reasoning\": True\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
+    "print(\"-\"*100)\n",
+    "print(\"Answer:\", response.choices[0].message.content)\n",
+    "```\n",
+    "\n",
+    "**ExampleOutput:**\n",
+    "```\n",
+    "Reasoning: Okay, so the user is asking how many 'r's are in the word 'strawberry'. Let me think. First, I need to make sure I have the word spelled correctly. Strawberry... S-T-R-A-W-B-E-R-R-Y. Wait, is that right? Let me break it down.\n",
+    "\n",
+    "Starting with 'strawberry', let's write out the letters one by one. S, T, R, A, W, B, E, R, R, Y. Hmm, wait, that's 10 letters. Let me check again. S (1), T (2), R (3), A (4), W (5), B (6), E (7), R (8), R (9), Y (10). So the letters are S-T-R-A-W-B-E-R-R-Y. \n",
+    "...\n",
+    "Therefore, the answer should be three R's in 'strawberry'. But I need to make sure I'm not counting any other letters as R. Let me check again. S, T, R, A, W, B, E, R, R, Y. No other R's. So three in total. Yeah, that seems right.\n",
+    "\n",
+    "----------------------------------------------------------------------------------------------------\n",
+    "Answer: The word \"strawberry\" contains **three** letters 'r'. Here's the breakdown:\n",
+    "\n",
+    "1. **S-T-R-A-W-B-E-R-R-Y**  \n",
+    "   - The **third letter** is 'R'.  \n",
+    "   - The **eighth and ninth letters** are also 'R's.  \n",
+    "\n",
+    "Thus, the total count is **3**.  \n",
+    "\n",
+    "**Answer:** 3.\n",
+    "```\n",
+    "\n",
+    "**Note:** Setting `\"enable_thinking\": False` (or omitting it) will result in `reasoning_content` being `None`. Qwen3-Thinking models always generate reasoning content and don't support the `enable_thinking` parameter.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Example: DeepSeek-V3 Models\n",
+    "\n",
+    "DeepSeek-V3 models support thinking mode through the `thinking` parameter:\n",
+    "\n",
+    "```python\n",
+    "# Launch server:\n",
+    "# python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3.1 --tp 8  --reasoning-parser deepseek-v3\n",
+    "\n",
+    "from openai import OpenAI\n",
+    "\n",
+    "client = OpenAI(\n",
+    "    api_key=\"EMPTY\",\n",
+    "    base_url=f\"http://127.0.0.1:30000/v1\",\n",
+    ")\n",
+    "\n",
+    "model = \"deepseek-ai/DeepSeek-V3.1\"\n",
+    "messages = [{\"role\": \"user\", \"content\": \"How many r's are in 'strawberry'?\"}]\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=model,\n",
+    "    messages=messages,\n",
+    "    extra_body={\n",
+    "        \"chat_template_kwargs\": {\"thinking\": True},\n",
+    "        \"separate_reasoning\": True\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "print(\"Reasoning:\", response.choices[0].message.reasoning_content)\n",
+    "print(\"-\"*100)\n",
+    "print(\"Answer:\", response.choices[0].message.content)\n",
+    "```\n",
+    "\n",
+    "**Example Output:**\n",
+    "```\n",
+    "Reasoning: First, the question is: \"How many r's are in 'strawberry'?\"\n",
+    "\n",
+    "I need to count the number of times the letter 'r' appears in the word \"strawberry\".\n",
+    "\n",
+    "Let me write out the word: S-T-R-A-W-B-E-R-R-Y.\n",
+    "\n",
+    "Now, I'll go through each letter and count the 'r's.\n",
+    "...\n",
+    "So, I have three 'r's in \"strawberry\".\n",
+    "\n",
+    "I should double-check. The word is spelled S-T-R-A-W-B-E-R-R-Y. The letters are at positions: 3, 8, and 9 are 'r's. Yes, that's correct.\n",
+    "\n",
+    "Therefore, the answer should be 3.\n",
+    "----------------------------------------------------------------------------------------------------\n",
+    "Answer: The word \"strawberry\" contains **3** instances of the letter \"r\". Here's a breakdown for clarity:\n",
+    "\n",
+    "- The word is spelled: S-T-R-A-W-B-E-R-R-Y\n",
+    "- The \"r\" appears at the 3rd, 8th, and 9th positions.\n",
+    "```\n",
+    "\n",
+    "**Note:** DeepSeek-V3 models use the `thinking` parameter (not `enable_thinking`) to control reasoning output.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Parameters\n",
+    "\n",
+    "The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
+    "\n",
+    "SGLang extends the standard API with the `extra_body` parameter, allowing for additional customization. One key option within `extra_body` is `chat_template_kwargs`, which can be used to pass arguments to the chat template processor."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"system\",\n",
+    "            \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
+    "        },\n",
+    "        {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
+    "        {\n",
+    "            \"role\": \"assistant\",\n",
+    "            \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
+    "        },\n",
+    "        {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
+    "    ],\n",
+    "    temperature=0.3,  # Lower temperature for more focused responses\n",
+    "    max_tokens=128,  # Reasonable length for a concise response\n",
+    "    top_p=0.95,  # Slightly higher for better fluency\n",
+    "    presence_penalty=0.2,  # Mild penalty to avoid repetition\n",
+    "    frequency_penalty=0.2,  # Mild penalty for more natural language\n",
+    "    n=1,  # Single response is usually more stable\n",
+    "    seed=42,  # Keep for reproducibility\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Streaming mode is also supported."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream = client.chat.completions.create(\n",
+    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    messages=[{\"role\": \"user\", \"content\": \"Say this is a test\"}],\n",
+    "    stream=True,\n",
+    ")\n",
+    "for chunk in stream:\n",
+    "    if chunk.choices[0].delta.content is not None:\n",
+    "        print(chunk.choices[0].delta.content, end=\"\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Completions\n",
+    "\n",
+    "### Usage\n",
+    "Completions API is similar to Chat Completions API, but without the `messages` parameter or chat templates."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.completions.create(\n",
+    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    prompt=\"List 3 countries and their capitals.\",\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    "    n=1,\n",
+    "    stop=None,\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Parameters\n",
+    "\n",
+    "The completions API accepts OpenAI Completions API's parameters.  Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
+    "\n",
+    "Here is an example of a detailed completions request:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.completions.create(\n",
+    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    prompt=\"Write a short story about a space explorer.\",\n",
+    "    temperature=0.7,  # Moderate temperature for creative writing\n",
+    "    max_tokens=150,  # Longer response for a story\n",
+    "    top_p=0.9,  # Balanced diversity in word choice\n",
+    "    stop=[\"\\n\\n\", \"THE END\"],  # Multiple stop sequences\n",
+    "    presence_penalty=0.3,  # Encourage novel elements\n",
+    "    frequency_penalty=0.3,  # Reduce repetitive phrases\n",
+    "    n=1,  # Generate one completion\n",
+    "    seed=123,  # For reproducible results\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Structured Outputs (JSON, Regex, EBNF)\n",
+    "\n",
+    "For OpenAI compatible structured outputs API, refer to [Structured Outputs](../advanced_features/structured_outputs.ipynb) for more details.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/basic_usage/openai_api_embeddings.ipynb b/docs/basic_usage/openai_api_embeddings.ipynb
new file mode 100644
index 000000000..26e95a4e7
--- /dev/null
+++ b/docs/basic_usage/openai_api_embeddings.ipynb
@@ -0,0 +1,195 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# OpenAI APIs - Embedding\n",
+    "\n",
+    "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
+    "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/embeddings).\n",
+    "\n",
+    "This tutorial covers the embedding APIs for embedding models. For a list of the supported models see the [corresponding overview page](../supported_models/embedding_models.md)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch A Server\n",
+    "\n",
+    "Launch the server in your terminal and wait for it to initialize. Remember to add `--is-embedding` to the command."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "embedding_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \\\n",
+    "    --host 0.0.0.0 --is-embedding --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using cURL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess, json\n",
+    "\n",
+    "text = \"Once upon a time\"\n",
+    "\n",
+    "curl_text = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
+    "  -H \"Content-Type: application/json\" \\\n",
+    "  -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": \"{text}\"}}'\"\"\"\n",
+    "\n",
+    "result = subprocess.check_output(curl_text, shell=True)\n",
+    "\n",
+    "print(result)\n",
+    "\n",
+    "text_embedding = json.loads(result)[\"data\"][0][\"embedding\"]\n",
+    "\n",
+    "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using Python Requests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "text = \"Once upon a time\"\n",
+    "\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/v1/embeddings\",\n",
+    "    json={\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": text},\n",
+    ")\n",
+    "\n",
+    "text_embedding = response.json()[\"data\"][0][\"embedding\"]\n",
+    "\n",
+    "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using OpenAI Python Client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "# Text embedding example\n",
+    "response = client.embeddings.create(\n",
+    "    model=\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\",\n",
+    "    input=text,\n",
+    ")\n",
+    "\n",
+    "embedding = response.data[0].embedding[:10]\n",
+    "print_highlight(f\"Text embedding (first 10): {embedding}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using Input IDs\n",
+    "\n",
+    "SGLang also supports `input_ids` as input to get the embedding."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"Alibaba-NLP/gte-Qwen2-1.5B-instruct\")\n",
+    "input_ids = tokenizer.encode(text)\n",
+    "\n",
+    "curl_ids = f\"\"\"curl -s http://localhost:{port}/v1/embeddings \\\n",
+    "  -H \"Content-Type: application/json\" \\\n",
+    "  -d '{{\"model\": \"Alibaba-NLP/gte-Qwen2-1.5B-instruct\", \"input\": {json.dumps(input_ids)}}}'\"\"\"\n",
+    "\n",
+    "input_ids_embedding = json.loads(subprocess.check_output(curl_ids, shell=True))[\"data\"][\n",
+    "    0\n",
+    "][\"embedding\"]\n",
+    "\n",
+    "print_highlight(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(embedding_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multi-Modal Embedding Model\n",
+    "Please refer to [Multi-Modal Embedding Model](../supported_models/embedding_models.md)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/basic_usage/openai_api_vision.ipynb b/docs/basic_usage/openai_api_vision.ipynb
new file mode 100644
index 000000000..88d1ef7dd
--- /dev/null
+++ b/docs/basic_usage/openai_api_vision.ipynb
@@ -0,0 +1,254 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# OpenAI APIs - Vision\n",
+    "\n",
+    "SGLang provides OpenAI-compatible APIs to enable a smooth transition from OpenAI services to self-hosted local models.\n",
+    "A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/guides/vision).\n",
+    "This tutorial covers the vision APIs for vision language models.\n",
+    "\n",
+    "SGLang supports various vision language models such as Llama 3.2, LLaVA-OneVision, Qwen2.5-VL, Gemma3 and [more](../supported_models/multimodal_language_models.md).\n",
+    "\n",
+    "As an alternative to the OpenAI API, you can also use the [SGLang offline engine](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch A Server\n",
+    "\n",
+    "Launch the server in your terminal and wait for it to initialize."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "vision_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using cURL\n",
+    "\n",
+    "Once the server is up, you can send test requests using curl or requests."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "\n",
+    "curl_command = f\"\"\"\n",
+    "curl -s http://localhost:{port}/v1/chat/completions \\\\\n",
+    "  -H \"Content-Type: application/json\" \\\\\n",
+    "  -d '{{\n",
+    "    \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+    "    \"messages\": [\n",
+    "      {{\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "          {{\n",
+    "            \"type\": \"text\",\n",
+    "            \"text\": \"What’s in this image?\"\n",
+    "          }},\n",
+    "          {{\n",
+    "            \"type\": \"image_url\",\n",
+    "            \"image_url\": {{\n",
+    "              \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
+    "            }}\n",
+    "          }}\n",
+    "        ]\n",
+    "      }}\n",
+    "    ],\n",
+    "    \"max_tokens\": 300\n",
+    "  }}'\n",
+    "\"\"\"\n",
+    "\n",
+    "response = subprocess.check_output(curl_command, shell=True).decode()\n",
+    "print_highlight(response)\n",
+    "\n",
+    "\n",
+    "response = subprocess.check_output(curl_command, shell=True).decode()\n",
+    "print_highlight(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using Python Requests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
+    "\n",
+    "data = {\n",
+    "    \"model\": \"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+    "    \"messages\": [\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": [\n",
+    "                {\"type\": \"text\", \"text\": \"What’s in this image?\"},\n",
+    "                {\n",
+    "                    \"type\": \"image_url\",\n",
+    "                    \"image_url\": {\n",
+    "                        \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        }\n",
+    "    ],\n",
+    "    \"max_tokens\": 300,\n",
+    "}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using OpenAI Python Client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "\n",
+    "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": [\n",
+    "                {\n",
+    "                    \"type\": \"text\",\n",
+    "                    \"text\": \"What is in this image?\",\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"image_url\",\n",
+    "                    \"image_url\": {\n",
+    "                        \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        }\n",
+    "    ],\n",
+    "    max_tokens=300,\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multiple-Image Inputs\n",
+    "\n",
+    "The server also supports multiple images and interleaved text and images if the model supports it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "\n",
+    "client = OpenAI(base_url=f\"http://localhost:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"Qwen/Qwen2.5-VL-7B-Instruct\",\n",
+    "    messages=[\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": [\n",
+    "                {\n",
+    "                    \"type\": \"image_url\",\n",
+    "                    \"image_url\": {\n",
+    "                        \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\",\n",
+    "                    },\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"image_url\",\n",
+    "                    \"image_url\": {\n",
+    "                        \"url\": \"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\",\n",
+    "                    },\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"text\",\n",
+    "                    \"text\": \"I have two very different images. They are not related at all. \"\n",
+    "                    \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n",
+    "                },\n",
+    "            ],\n",
+    "        }\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(vision_process)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/basic_usage/qwen3.md b/docs/basic_usage/qwen3.md
new file mode 100644
index 000000000..89295be60
--- /dev/null
+++ b/docs/basic_usage/qwen3.md
@@ -0,0 +1,27 @@
+# Qwen3-Next Usage
+
+SGLang has supported Qwen3-Next-80B-A3B-Instruct and Qwen3-Next-80B-A3B-Thinking since [this PR](https://github.com/sgl-project/sglang/pull/10233).
+
+## Launch Qwen3-Next with SGLang
+
+To serve Qwen3-Next models on 4xH100/H200 GPUs:
+
+```bash
+python3 -m sglang.launch_server --model Qwen/Qwen3-Next-80B-A3B-Instruct --tp 4
+```
+
+### Configuration Tips
+- `--max-mamba-cache-size`: Adjust `--max-mamba-cache-size` to increase mamba cache space and max running requests capability. It will decrease KV cache space as a trade-off. You can adjust it according to workload.
+- `--mamba-ssm-dtype`: `bfloat16` or `float32`, use `bfloat16` to save mamba cache size and `float32` to get more accurate results. The default setting is `float32`.
+
+### EAGLE Speculative Decoding
+**Description**: SGLang has supported Qwen3-Next models with [EAGLE speculative decoding](https://docs.sglang.ai/advanced_features/speculative_decoding.html#EAGLE-Decoding).
+
+**Usage**:
+Add arguments `--speculative-algorithm`, `--speculative-num-steps`, `--speculative-eagle-topk` and `--speculative-num-draft-tokens` to enable this feature. For example:
+
+``` bash
+python3 -m sglang.launch_server --model Qwen/Qwen3-Next-80B-A3B-Instruct --tp 4 --speculative-num-steps 3  --speculative-eagle-topk 1  --speculative-num-draft-tokens 4 --speculative-algo NEXTN
+```
+
+Details can be seen in [this PR](https://github.com/sgl-project/sglang/pull/10233).
diff --git a/docs/basic_usage/sampling_params.md b/docs/basic_usage/sampling_params.md
new file mode 100644
index 000000000..c1394a9fd
--- /dev/null
+++ b/docs/basic_usage/sampling_params.md
@@ -0,0 +1,305 @@
+# Sampling Parameters
+
+This doc describes the sampling parameters of the SGLang Runtime. It is the low-level endpoint of the runtime.
+If you want a high-level endpoint that can automatically handle chat templates, consider using the [OpenAI Compatible API](openai_api_completions.ipynb).
+
+## `/generate` Endpoint
+
+The `/generate` endpoint accepts the following parameters in JSON format. For detailed usage, see the [native API doc](native_api.ipynb). The object is defined at `io_struct.py::GenerateReqInput`. You can also read the source code to find more arguments and docs.
+
+| Argument                   | Type/Default                                                                 | Description                                                                                                                                                     |
+|----------------------------|------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| text                       | `Optional[Union[List[str], str]] = None`                                     | The input prompt. Can be a single prompt or a batch of prompts.                                                                                                 |
+| input_ids                  | `Optional[Union[List[List[int]], List[int]]] = None`                         | The token IDs for text; one can specify either text or input_ids.                                                                                               |
+| input_embeds               | `Optional[Union[List[List[List[float]]], List[List[float]]]] = None`         | The embeddings for input_ids; one can specify either text, input_ids, or input_embeds.                                                                          |
+| image_data                 | `Optional[Union[List[List[ImageDataItem]], List[ImageDataItem], ImageDataItem]] = None` | The image input. Can be an image instance, file name, URL, or base64 encoded string. Can be a single image, list of images, or list of lists of images. |
+| audio_data                 | `Optional[Union[List[AudioDataItem], AudioDataItem]] = None`                 | The audio input. Can be a file name, URL, or base64 encoded string.                                                                                             |
+| sampling_params            | `Optional[Union[List[Dict], Dict]] = None`                                   | The sampling parameters as described in the sections below.                                                                                                     |
+| rid                        | `Optional[Union[List[str], str]] = None`                                     | The request ID.                                                                                                                                                 |
+| return_logprob             | `Optional[Union[List[bool], bool]] = None`                                   | Whether to return log probabilities for tokens.                                                                                                                 |
+| logprob_start_len          | `Optional[Union[List[int], int]] = None`                                     | If return_logprob, the start location in the prompt for returning logprobs. Default is "-1", which returns logprobs for output tokens only.                     |
+| top_logprobs_num           | `Optional[Union[List[int], int]] = None`                                     | If return_logprob, the number of top logprobs to return at each position.                                                                                       |
+| token_ids_logprob          | `Optional[Union[List[List[int]], List[int]]] = None`                         | If return_logprob, the token IDs to return logprob for.                                                                                                         |
+| return_text_in_logprobs    | `bool = False`                                                               | Whether to detokenize tokens in text in the returned logprobs.                                                                                                  |
+| stream                     | `bool = False`                                                               | Whether to stream output.                                                                                                                                       |
+| lora_path                  | `Optional[Union[List[Optional[str]], Optional[str]]] = None`                 | The path to the LoRA.                                                                                                                                           |
+| custom_logit_processor     | `Optional[Union[List[Optional[str]], str]] = None`                           | Custom logit processor for advanced sampling control. Must be a serialized instance of `CustomLogitProcessor` using its `to_str()` method. For usage see below. |
+| return_hidden_states       | `Union[List[bool], bool] = False`                                            | Whether to return hidden states.                                                                                                                                |
+
+## Sampling parameters
+
+The object is defined at `sampling_params.py::SamplingParams`. You can also read the source code to find more arguments and docs.
+
+### Core parameters
+
+| Argument        | Type/Default                                 | Description                                                                                                                                    |
+|-----------------|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| max_new_tokens  | `int = 128`                                  | The maximum output length measured in tokens.                                                                                                  |
+| stop            | `Optional[Union[str, List[str]]] = None`     | One or multiple [stop words](https://platform.openai.com/docs/api-reference/chat/create#chat-create-stop). Generation will stop if one of these words is sampled. |
+| stop_token_ids  | `Optional[List[int]] = None`                 | Provide stop words in the form of token IDs. Generation will stop if one of these token IDs is sampled.                                        |
+| temperature     | `float = 1.0`                                | [Temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature) when sampling the next token. `temperature = 0` corresponds to greedy sampling, a higher temperature leads to more diversity. |
+| top_p           | `float = 1.0`                                | [Top-p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) selects tokens from the smallest sorted set whose cumulative probability exceeds `top_p`. When `top_p = 1`, this reduces to unrestricted sampling from all tokens. |
+| top_k           | `int = -1`                                   | [Top-k](https://developer.nvidia.com/blog/how-to-get-better-outputs-from-your-large-language-model/#predictability_vs_creativity) randomly selects from the `k` highest-probability tokens. |
+| min_p           | `float = 0.0`                                | [Min-p](https://github.com/huggingface/transformers/issues/27670) samples from tokens with probability larger than `min_p * highest_token_probability`. |
+
+### Penalizers
+
+| Argument           | Type/Default           | Description                                                                                                                                    |
+|--------------------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| frequency_penalty  | `float = 0.0`          | Penalizes tokens based on their frequency in generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of penalization grows linearly with each appearance of a token. |
+| presence_penalty   | `float = 0.0`          | Penalizes tokens if they appeared in the generation so far. Must be between `-2` and `2` where negative numbers encourage repeatment of tokens and positive number encourages sampling of new tokens. The scaling of the penalization is constant if a token occurred. |
+| min_new_tokens     | `int = 0`              | Forces the model to generate at least `min_new_tokens` until a stop word or EOS token is sampled. Note that this might lead to unintended behavior, for example, if the distribution is highly skewed towards these tokens. |
+
+### Constrained decoding
+
+Please refer to our dedicated guide on [constrained decoding](../advanced_features/structured_outputs.ipynb) for the following parameters.
+
+| Argument        | Type/Default                    | Description                                                                                                                                    |
+|-----------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| json_schema     | `Optional[str] = None`          | JSON schema for structured outputs.                                                                                                            |
+| regex           | `Optional[str] = None`          | Regex for structured outputs.                                                                                                                  |
+| ebnf            | `Optional[str] = None`          | EBNF for structured outputs.                                                                                                                   |
+| structural_tag  | `Optional[str] = None`          | The structal tag for structured outputs.                                                                                                       |
+
+### Other options
+
+| Argument                      | Type/Default                    | Description                                                                                                                                    |
+|-------------------------------|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| n                             | `int = 1`                       | Specifies the number of output sequences to generate per request. (Generating multiple outputs in one request (n > 1) is discouraged; repeating the same prompts several times offers better control and efficiency.) |
+| ignore_eos                    | `bool = False`                  | Don't stop generation when EOS token is sampled.                                                                                               |
+| skip_special_tokens           | `bool = True`                   | Remove special tokens during decoding.                                                                                                         |
+| spaces_between_special_tokens | `bool = True`                   | Whether or not to add spaces between special tokens during detokenization.                                                                     |
+| no_stop_trim                  | `bool = False`                  | Don't trim stop words or EOS token from the generated text.                                                                                    |
+| custom_params                 | `Optional[List[Optional[Dict[str, Any]]]] = None` | Used when employing `CustomLogitProcessor`. For usage, see below.                                                                              |
+
+## Examples
+
+### Normal
+
+Launch a server:
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
+```
+
+Send a request:
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "The capital of France is",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 32,
+        },
+    },
+)
+print(response.json())
+```
+
+Detailed example in [send request](./send_request.ipynb).
+
+### Streaming
+
+Send a request and stream the output:
+
+```python
+import requests, json
+
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "The capital of France is",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 32,
+        },
+        "stream": True,
+    },
+    stream=True,
+)
+
+prev = 0
+for chunk in response.iter_lines(decode_unicode=False):
+    chunk = chunk.decode("utf-8")
+    if chunk and chunk.startswith("data:"):
+        if chunk == "data: [DONE]":
+            break
+        data = json.loads(chunk[5:].strip("\n"))
+        output = data["text"].strip()
+        print(output[prev:], end="", flush=True)
+        prev = len(output)
+print("")
+```
+
+Detailed example in [openai compatible api](openai_api_completions.ipynb).
+
+### Multimodal
+
+Launch a server:
+
+```bash
+python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov
+```
+
+Download an image:
+
+```bash
+curl -o example_image.png -L https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true
+```
+
+Send a request:
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+                "<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n"
+                "<|im_start|>assistant\n",
+        "image_data": "example_image.png",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 32,
+        },
+    },
+)
+print(response.json())
+```
+
+The `image_data` can be a file name, a URL, or a base64 encoded string. See also `python/sglang/srt/utils.py:load_image`.
+
+Streaming is supported in a similar manner as [above](#streaming).
+
+Detailed example in [OpenAI API Vision](openai_api_vision.ipynb).
+
+### Structured Outputs (JSON, Regex, EBNF)
+
+You can specify a JSON schema, regular expression or [EBNF](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) to constrain the model output. The model output will be guaranteed to follow the given constraints. Only one constraint parameter (`json_schema`, `regex`, or `ebnf`) can be specified for a request.
+
+SGLang supports two grammar backends:
+
+- [XGrammar](https://github.com/mlc-ai/xgrammar) (default): Supports JSON schema, regular expression, and EBNF constraints.
+  - XGrammar currently uses the [GGML BNF format](https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md).
+- [Outlines](https://github.com/dottxt-ai/outlines): Supports JSON schema and regular expression constraints.
+
+If instead you want to initialize the Outlines backend, you can use `--grammar-backend outlines` flag:
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+--port 30000 --host 0.0.0.0 --grammar-backend [xgrammar|outlines] # xgrammar or outlines (default: xgrammar)
+```
+
+```python
+import json
+import requests
+
+json_schema = json.dumps({
+    "type": "object",
+    "properties": {
+        "name": {"type": "string", "pattern": "^[\\w]+$"},
+        "population": {"type": "integer"},
+    },
+    "required": ["name", "population"],
+})
+
+# JSON (works with both Outlines and XGrammar)
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "Here is the information of the capital of France in the JSON format.\n",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 64,
+            "json_schema": json_schema,
+        },
+    },
+)
+print(response.json())
+
+# Regular expression (Outlines backend only)
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "Paris is the capital of",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 64,
+            "regex": "(France|England)",
+        },
+    },
+)
+print(response.json())
+
+# EBNF (XGrammar backend only)
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "Write a greeting.",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 64,
+            "ebnf": 'root ::= "Hello" | "Hi" | "Hey"',
+        },
+    },
+)
+print(response.json())
+```
+
+Detailed example in [structured outputs](../advanced_features/structured_outputs.ipynb).
+
+### Custom logit processor
+
+Launch a server with `--enable-custom-logit-processor` flag on.
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --enable-custom-logit-processor
+```
+
+Define a custom logit processor that will always sample a specific token id.
+
+```python
+from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+
+class DeterministicLogitProcessor(CustomLogitProcessor):
+    """A dummy logit processor that changes the logits to always
+    sample the given token id.
+    """
+
+    def __call__(self, logits, custom_param_list):
+        # Check that the number of logits matches the number of custom parameters
+        assert logits.shape[0] == len(custom_param_list)
+        key = "token_id"
+
+        for i, param_dict in enumerate(custom_param_list):
+            # Mask all other tokens
+            logits[i, :] = -float("inf")
+            # Assign highest probability to the specified token
+            logits[i, param_dict[key]] = 0.0
+        return logits
+```
+
+Send a request:
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:30000/generate",
+    json={
+        "text": "The capital of France is",
+        "custom_logit_processor": DeterministicLogitProcessor().to_str(),
+        "sampling_params": {
+            "temperature": 0.0,
+            "max_new_tokens": 32,
+            "custom_params": {"token_id": 5},
+        },
+    },
+)
+print(response.json())
+```
diff --git a/docs/basic_usage/send_request.ipynb b/docs/basic_usage/send_request.ipynb
new file mode 100644
index 000000000..6e457a02b
--- /dev/null
+++ b/docs/basic_usage/send_request.ipynb
@@ -0,0 +1,253 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sending Requests\n",
+    "This notebook provides a quick-start guide to use SGLang in chat completions after installation.\n",
+    "\n",
+    "- For Vision Language Models, see [OpenAI APIs - Vision](openai_api_vision.ipynb).\n",
+    "- For Embedding Models, see [OpenAI APIs - Embedding](openai_api_embeddings.ipynb) and [Encode (embedding model)](native_api.html#Encode-(embedding-model)).\n",
+    "- For Reward Models, see [Classify (reward model)](native_api.html#Classify-(reward-model))."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch A Server"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
+    "\n",
+    "# This is equivalent to running the following command in your terminal\n",
+    "# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"\"\"\n",
+    "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \\\n",
+    " --host 0.0.0.0 --log-level warning\n",
+    "\"\"\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using cURL\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess, json\n",
+    "\n",
+    "curl_command = f\"\"\"\n",
+    "curl -s http://localhost:{port}/v1/chat/completions \\\n",
+    "  -H \"Content-Type: application/json\" \\\n",
+    "  -d '{{\"model\": \"qwen/qwen2.5-0.5b-instruct\", \"messages\": [{{\"role\": \"user\", \"content\": \"What is the capital of France?\"}}]}}'\n",
+    "\"\"\"\n",
+    "\n",
+    "response = json.loads(subprocess.check_output(curl_command, shell=True))\n",
+    "print_highlight(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using Python Requests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "url = f\"http://localhost:{port}/v1/chat/completions\"\n",
+    "\n",
+    "data = {\n",
+    "    \"model\": \"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    \"messages\": [{\"role\": \"user\", \"content\": \"What is the capital of France?\"}],\n",
+    "}\n",
+    "\n",
+    "response = requests.post(url, json=data)\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using OpenAI Python Client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    ")\n",
+    "print_highlight(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Streaming"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "\n",
+    "client = openai.Client(base_url=f\"http://127.0.0.1:{port}/v1\", api_key=\"None\")\n",
+    "\n",
+    "# Use stream=True for streaming responses\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"qwen/qwen2.5-0.5b-instruct\",\n",
+    "    messages=[\n",
+    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
+    "    ],\n",
+    "    temperature=0,\n",
+    "    max_tokens=64,\n",
+    "    stream=True,\n",
+    ")\n",
+    "\n",
+    "# Handle the streaming output\n",
+    "for chunk in response:\n",
+    "    if chunk.choices[0].delta.content:\n",
+    "        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using Native Generation APIs\n",
+    "\n",
+    "You can also use the native `/generate` endpoint with requests, which provides more flexibility. An API reference is available at [Sampling Parameters](sampling_params.md)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": \"The capital of France is\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 32,\n",
+    "        },\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "print_highlight(response.json())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Streaming"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests, json\n",
+    "\n",
+    "response = requests.post(\n",
+    "    f\"http://localhost:{port}/generate\",\n",
+    "    json={\n",
+    "        \"text\": \"The capital of France is\",\n",
+    "        \"sampling_params\": {\n",
+    "            \"temperature\": 0,\n",
+    "            \"max_new_tokens\": 32,\n",
+    "        },\n",
+    "        \"stream\": True,\n",
+    "    },\n",
+    "    stream=True,\n",
+    ")\n",
+    "\n",
+    "prev = 0\n",
+    "for chunk in response.iter_lines(decode_unicode=False):\n",
+    "    chunk = chunk.decode(\"utf-8\")\n",
+    "    if chunk and chunk.startswith(\"data:\"):\n",
+    "        if chunk == \"data: [DONE]\":\n",
+    "            break\n",
+    "        data = json.loads(chunk[5:].strip(\"\\n\"))\n",
+    "        output = data[\"text\"]\n",
+    "        print(output[prev:], end=\"\", flush=True)\n",
+    "        prev = len(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 000000000..d6ca64d88
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,205 @@
+import os
+import sys
+from datetime import datetime
+
+sys.path.insert(0, os.path.abspath("../.."))
+
+version_file = "../python/sglang/version.py"
+with open(version_file, "r") as f:
+    exec(compile(f.read(), version_file, "exec"))
+__version__ = locals()["__version__"]
+
+project = "SGLang"
+copyright = f"2023-{datetime.now().year}, SGLang"
+author = "SGLang Team"
+
+version = __version__
+release = __version__
+
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.autosectionlabel",
+    "sphinx.ext.intersphinx",
+    "sphinx_tabs.tabs",
+    "myst_parser",
+    "sphinx_copybutton",
+    "sphinxcontrib.mermaid",
+    "nbsphinx",
+    "sphinx.ext.mathjax",
+]
+
+nbsphinx_allow_errors = True
+nbsphinx_execute = "never"
+
+autosectionlabel_prefix_document = True
+nbsphinx_allow_directives = True
+
+
+myst_enable_extensions = [
+    "dollarmath",
+    "amsmath",
+    "deflist",
+    "colon_fence",
+    "html_image",
+    "linkify",
+    "substitution",
+]
+
+myst_heading_anchors = 3
+
+nbsphinx_kernel_name = "python3"
+nbsphinx_execute_arguments = [
+    "--InlineBackend.figure_formats={'svg', 'pdf'}",
+    "--InlineBackend.rc={'figure.dpi': 96}",
+]
+
+
+nb_render_priority = {
+    "html": (
+        "application/vnd.jupyter.widget-view+json",
+        "application/javascript",
+        "text/html",
+        "image/svg+xml",
+        "image/png",
+        "image/jpeg",
+        "text/markdown",
+        "text/latex",
+        "text/plain",
+    )
+}
+
+myst_enable_extensions = [
+    "dollarmath",
+    "amsmath",
+    "deflist",
+    "colon_fence",
+    "html_image",
+    "linkify",
+    "substitution",
+]
+
+myst_heading_anchors = 3
+myst_ref_domains = ["std", "py"]
+
+templates_path = ["_templates"]
+
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".md": "markdown",
+}
+
+master_doc = "index"
+
+language = "en"
+
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+pygments_style = "sphinx"
+
+html_theme = "sphinx_book_theme"
+html_logo = "_static/image/logo.png"
+html_favicon = "_static/image/logo.ico"
+html_title = project
+html_copy_source = True
+html_last_updated_fmt = ""
+
+html_theme_options = {
+    "repository_url": "https://github.com/sgl-project/sgl-project.github.io",
+    "repository_branch": "main",
+    "show_navbar_depth": 3,
+    "max_navbar_depth": 4,
+    "collapse_navbar": True,
+    "use_edit_page_button": True,
+    "use_source_button": True,
+    "use_issues_button": True,
+    "use_repository_button": True,
+    "use_download_button": True,
+    "use_sidenotes": True,
+    "show_toc_level": 2,
+}
+
+html_context = {
+    "display_github": True,
+    "github_user": "sgl-project",
+    "github_repo": "sgl-project.github.io",
+    "github_version": "main",
+    "conf_py_path": "/docs/",
+}
+
+html_static_path = ["_static"]
+html_css_files = ["css/custom_log.css"]
+
+
+def setup(app):
+    app.add_css_file("css/custom_log.css")
+
+
+myst_enable_extensions = [
+    "dollarmath",
+    "amsmath",
+    "deflist",
+    "colon_fence",
+]
+myst_heading_anchors = 5
+
+htmlhelp_basename = "sglangdoc"
+
+latex_elements = {}
+
+latex_documents = [
+    (master_doc, "sglang.tex", "sglang Documentation", "SGLang Team", "manual"),
+]
+
+man_pages = [(master_doc, "sglang", "sglang Documentation", [author], 1)]
+
+texinfo_documents = [
+    (
+        master_doc,
+        "sglang",
+        "sglang Documentation",
+        author,
+        "sglang",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
+]
+
+epub_title = project
+
+epub_exclude_files = ["search.html"]
+
+copybutton_prompt_text = r">>> |\.\.\. "
+copybutton_prompt_is_regexp = True
+
+autodoc_preserve_defaults = True
+navigation_with_keys = False
+
+autodoc_mock_imports = [
+    "torch",
+    "transformers",
+    "triton",
+]
+
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3.12", None),
+    "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None),
+    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
+    "numpy": ("https://numpy.org/doc/stable", None),
+    "torch": ("https://pytorch.org/docs/stable", None),
+}
+
+html_theme = "sphinx_book_theme"
+
+
+nbsphinx_prolog = """
+.. raw:: html
+
+    <style>
+        .output_area.stderr, .output_area.stdout {
+            color: #d3d3d3 !important; /* light gray */
+        }
+    </style>
+"""
diff --git a/docs/deploy.py b/docs/deploy.py
new file mode 100644
index 000000000..75b7ea7f2
--- /dev/null
+++ b/docs/deploy.py
@@ -0,0 +1,22 @@
+# Deploy the documents
+
+import os
+from datetime import datetime
+
+
+def run_cmd(cmd):
+    print(cmd)
+    os.system(cmd)
+
+
+run_cmd("cd $DOC_SITE_PATH; git pull")
+
+# (Optional) Remove old files
+# run_cmd("rm -rf $ALPA_SITE_PATH/*")
+
+run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
+
+cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+run_cmd(
+    f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
+)
diff --git a/docs/developer_guide/bench_serving.md b/docs/developer_guide/bench_serving.md
new file mode 100644
index 000000000..82f7aa2af
--- /dev/null
+++ b/docs/developer_guide/bench_serving.md
@@ -0,0 +1,334 @@
+## Bench Serving Guide
+
+This guide explains how to benchmark online serving throughput and latency using `python -m sglang.bench_serving`. It supports multiple inference backends via OpenAI-compatible and native endpoints, and produces both console metrics and optional JSONL outputs.
+
+### What it does
+
+- Generates synthetic or dataset-driven prompts and submits them to a target serving endpoint
+- Measures throughput, time-to-first-token (TTFT), inter-token latency (ITL), per-request end-to-end latency, and more
+- Supports streaming or non-streaming modes, rate control, and concurrency limits
+
+### Supported backends and endpoints
+
+- `sglang` / `sglang-native`: `POST /generate`
+- `sglang-oai`, `vllm`, `lmdeploy`: `POST /v1/completions`
+- `sglang-oai-chat`, `vllm-chat`, `lmdeploy-chat`: `POST /v1/chat/completions`
+- `trt` (TensorRT-LLM): `POST /v2/models/ensemble/generate_stream`
+- `gserver`: Custom server (Not Implemented yet in this script)
+- `truss`: `POST /v1/models/model:predict`
+
+If `--base-url` is provided, requests are sent to it. Otherwise, `--host` and `--port` are used. When `--model` is not provided, the script will attempt to query `GET /v1/models` for an available model ID (OpenAI-compatible endpoints).
+
+### Prerequisites
+
+- Python 3.8+
+- Dependencies typically used by this script: `aiohttp`, `numpy`, `requests`, `tqdm`, `transformers`, and for some datasets `datasets`, `pillow`, `pybase64`. Install as needed.
+- An inference server running and reachable via the endpoints above
+- If your server requires authentication, set environment variable `OPENAI_API_KEY` (used as `Authorization: Bearer <key>`)
+
+### Quick start
+
+Run a basic benchmark against an sglang server exposing `/generate`:
+
+```bash
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
+```
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --num-prompts 1000 \
+  --model meta-llama/Llama-3.1-8B-Instruct
+```
+
+Or, using an OpenAI-compatible endpoint (completions):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend vllm \
+  --base-url http://127.0.0.1:8000 \
+  --num-prompts 1000 \
+  --model meta-llama/Llama-3.1-8B-Instruct
+```
+
+### Datasets
+
+Select with `--dataset-name`:
+
+- `sharegpt` (default): loads ShareGPT-style pairs; optionally restrict with `--sharegpt-context-len` and override outputs with `--sharegpt-output-len`
+- `random`: random text lengths; sampled from ShareGPT token space
+- `random-ids`: random token ids (can lead to gibberish)
+- `random-image`: generates random images and wraps them in chat messages; supports custom resolutions via 'heightxwidth' format
+- `generated-shared-prefix`: synthetic dataset with shared long system prompts and short questions
+- `mmmu`: samples from MMMU (Math split) and includes images
+
+Common dataset flags:
+
+- `--num-prompts N`: number of requests
+- `--random-input-len`, `--random-output-len`, `--random-range-ratio`: for random/random-ids/random-image
+- `--random-image-num-images`, `--random-image-resolution`: for random-image dataset (supports presets 1080p/720p/360p or custom 'heightxwidth' format)
+- `--apply-chat-template`: apply tokenizer chat template when constructing prompts
+- `--dataset-path PATH`: file path for ShareGPT json; if blank and missing, it will be downloaded and cached
+
+Generated Shared Prefix flags (for `generated-shared-prefix`):
+
+- `--gsp-num-groups`
+- `--gsp-prompts-per-group`
+- `--gsp-system-prompt-len`
+- `--gsp-question-len`
+- `--gsp-output-len`
+
+Random Image dataset flags (for `random-image`):
+
+- `--random-image-num-images`: Number of images per request
+- `--random-image-resolution`: Image resolution; supports presets (1080p, 720p, 360p) or custom 'heightxwidth' format (e.g., 1080x1920, 512x768)
+
+### Examples
+
+1. To benchmark random-image dataset with 3 images per request, 500 prompts, 512 input length, and 512 output length, you can run:
+
+```bash
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-3B-Instruct --disable-radix-cache
+```
+
+```bash
+python -m sglang.bench_serving \
+    --backend sglang-oai-chat \
+    --dataset-name random-image \
+    --num-prompts 500 \
+    --random-image-num-images 3 \
+    --random-image-resolution 720p \
+    --random-input-len 512 \
+    --random-output-len 512
+```
+
+2. To benchmark random dataset with 3000 prompts, 1024 input length, and 1024 output length, you can run:
+
+```bash
+python -m sglang.launch_server --model-path Qwen/Qwen2.5-3B-Instruct
+```
+
+```bash
+python3 -m sglang.bench_serving \
+    --backend sglang \
+    --dataset-name random \
+    --num-prompts 3000 \
+    --random-input 1024 \
+    --random-output 1024 \
+    --random-range-ratio 0.5
+```
+
+### Choosing model and tokenizer
+
+- `--model` is required unless the backend exposes `GET /v1/models`, in which case the first model ID is auto-selected.
+- `--tokenizer` defaults to `--model`. Both can be HF model IDs or local paths.
+- For ModelScope workflows, setting `SGLANG_USE_MODELSCOPE=true` enables fetching via ModelScope (weights are skipped for speed).
+- If your tokenizer lacks a chat template, the script warns because token counting can be less robust for gibberish outputs.
+
+### Rate, concurrency, and streaming
+
+- `--request-rate`: requests per second. `inf` sends all immediately (burst). Non-infinite rate uses a Poisson process for arrival times.
+- `--max-concurrency`: caps concurrent in-flight requests regardless of arrival rate.
+- `--disable-stream`: switch to non-streaming mode when supported; TTFT then equals total latency for chat completions.
+
+### Other key options
+
+- `--output-file FILE.jsonl`: append JSONL results to file; auto-named if unspecified
+- `--output-details`: include per-request arrays (generated texts, errors, ttfts, itls, input/output lens)
+- `--extra-request-body '{"top_p":0.9,"temperature":0.6}'`: merged into payload (sampling params, etc.)
+- `--disable-ignore-eos`: pass through EOS behavior (varies by backend)
+- `--warmup-requests N`: run warmup requests with short output first (default 1)
+- `--flush-cache`: call `/flush_cache` (sglang) before main run
+- `--profile`: call `/start_profile` and `/stop_profile` (requires server to enable profiling, e.g., `SGLANG_TORCH_PROFILER_DIR`)
+- `--lora-name name1 name2 ...`: randomly pick one per request and pass to backend (e.g., `lora_path` for sglang)
+- `--tokenize-prompt`: send integer IDs instead of text (currently supports `--backend sglang` only)
+
+### Authentication
+
+If your target endpoint requires OpenAI-style auth, set:
+
+```bash
+export OPENAI_API_KEY=sk-...yourkey...
+```
+
+The script will add `Authorization: Bearer $OPENAI_API_KEY` automatically for OpenAI-compatible routes.
+
+### Metrics explained
+
+Printed after each run:
+
+- Request throughput (req/s)
+- Input token throughput (tok/s)
+- Output token throughput (tok/s)
+- Total token throughput (tok/s)
+- Concurrency: aggregate time of all requests divided by wall time
+- End-to-End Latency (ms): mean/median/std/p99 per-request total latency
+- Time to First Token (TTFT, ms): mean/median/std/p99 for streaming mode
+- Inter-Token Latency (ITL, ms): mean/median/std/p95/p99/max between tokens
+- TPOT (ms): Token processing time after first token, i.e., `(latency - ttft)/(tokens-1)`
+- Accept length (sglang-only, if available): speculative decoding accept length
+
+The script also retokenizes generated text with the configured tokenizer and reports "retokenized" counts.
+
+### JSONL output format
+
+When `--output-file` is set, one JSON object is appended per run. Base fields:
+
+- Arguments summary: backend, dataset, request_rate, max_concurrency, etc.
+- Duration and totals: completed, total_input_tokens, total_output_tokens, retokenized totals
+- Throughputs and latency statistics as printed in the console
+- `accept_length` when available (sglang)
+
+With `--output-details`, an extended object also includes arrays:
+
+- `input_lens`, `output_lens`
+- `ttfts`, `itls` (per request: ITL arrays)
+- `generated_texts`, `errors`
+
+### End-to-end examples
+
+1) sglang native `/generate` (streaming):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --dataset-name random \
+  --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.5 \
+  --num-prompts 2000 \
+  --request-rate 100 \
+  --max-concurrency 512 \
+  --output-file sglang_random.jsonl --output-details
+```
+
+2) OpenAI-compatible Completions (e.g., vLLM):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend vllm \
+  --base-url http://127.0.0.1:8000 \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --dataset-name sharegpt \
+  --num-prompts 1000 \
+  --sharegpt-output-len 256
+```
+
+3) OpenAI-compatible Chat Completions (streaming):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend vllm-chat \
+  --base-url http://127.0.0.1:8000 \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --dataset-name random \
+  --num-prompts 500 \
+  --apply-chat-template
+```
+
+4) Random images (VLM) with chat template:
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --model your-vlm-model \
+  --dataset-name random-image \
+  --random-image-num-images 2 \
+  --random-image-resolution 720p \
+  --random-input-len 128 --random-output-len 256 \
+  --num-prompts 200 \
+  --apply-chat-template
+```
+
+4a) Random images with custom resolution:
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --model your-vlm-model \
+  --dataset-name random-image \
+  --random-image-num-images 1 \
+  --random-image-resolution 512x768 \
+  --random-input-len 64 --random-output-len 128 \
+  --num-prompts 100 \
+  --apply-chat-template
+```
+
+5) Generated shared prefix (long system prompts + short questions):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --dataset-name generated-shared-prefix \
+  --gsp-num-groups 64 --gsp-prompts-per-group 16 \
+  --gsp-system-prompt-len 2048 --gsp-question-len 128 --gsp-output-len 256 \
+  --num-prompts 1024
+```
+
+6) Tokenized prompts (ids) for strict length control (sglang only):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --dataset-name random \
+  --tokenize-prompt \
+  --random-input-len 2048 --random-output-len 256 --random-range-ratio 0.2
+```
+
+7) Profiling and cache flush (sglang):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --model meta-llama/Llama-3.1-8B-Instruct \
+  --profile \
+  --flush-cache
+```
+
+8) TensorRT-LLM streaming endpoint:
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend trt \
+  --base-url http://127.0.0.1:8000 \
+  --model your-trt-llm-model \
+  --dataset-name random \
+  --num-prompts 100 \
+  --disable-ignore-eos
+```
+
+9) Evaluating large-scale KVCache sharing with mooncake trace (sglang only):
+
+```bash
+python3 -m sglang.bench_serving \
+  --backend sglang \
+  --host 127.0.0.1 --port 30000 \
+  --model mode-name \
+  --dataset-name mooncake \
+  --mooncake-slowdown-factor 1.0 \
+  --mooncake-num-rounds 1000 \
+  --mooncake-workload conversation|mooncake|agent|synthetic
+  --use-trace-timestamps true \
+  --random-output-len 256
+```
+
+### Troubleshooting
+
+- All requests failed: verify `--backend`, server URL/port, `--model`, and authentication. Check warmup errors printed by the script.
+- Throughput seems too low: adjust `--request-rate` and `--max-concurrency`; verify server batch size/scheduling; ensure streaming is enabled if appropriate.
+- Token counts look odd: prefer chat/instruct models with proper chat templates; otherwise tokenization of gibberish may be inconsistent.
+- Random-image/MMMU datasets: ensure you installed extra deps (`pillow`, `datasets`, `pybase64`).
+- Authentication errors (401/403): set `OPENAI_API_KEY` or disable auth on your server.
+
+### Notes
+
+- The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections.
+- For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available.
diff --git a/docs/developer_guide/benchmark_and_profiling.md b/docs/developer_guide/benchmark_and_profiling.md
new file mode 100644
index 000000000..948c837ff
--- /dev/null
+++ b/docs/developer_guide/benchmark_and_profiling.md
@@ -0,0 +1,182 @@
+# Benchmark and Profiling
+
+## Benchmark
+
+- Benchmark the latency of running a single static batch without a server. The arguments are the same as for `launch_server.py`.
+  Note that this is a simplified test script without a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this simplified script does not.
+  - Without a server (do not need to launch a server)
+    ```bash
+    python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32
+    ```
+  - With a server (please use `sglang.launch_server` to launch a server first and run the following command.)
+    ```bash
+    python -m sglang.bench_one_batch_server --base-url http://127.0.0.1:30000 --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 32 --input-len 256 --output-len 32
+    ```
+
+
+- Benchmark offline processing. This script will start an offline engine and run the benchmark.
+
+  ```bash
+  python3 -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
+  ```
+
+- Benchmark online serving. Please use `sglang.launch_server` to launch a server first and run the following command.
+
+  ```bash
+  python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+  ```
+
+## Profile with PyTorch Profiler
+
+[Pytorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) is a convenient basic tool to inspect kernel execution time, call stack, and kernel overlap and occupancy.
+
+### Profile a server with `sglang.bench_serving`
+
+```bash
+# set trace path
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# start server
+python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
+
+# send profiling request from client
+python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --sharegpt-output-len 100 --profile
+```
+
+Please make sure that the `SGLANG_TORCH_PROFILER_DIR` should be set at both server and client side, otherwise the trace file cannot be generated correctly . A secure way will be setting `SGLANG_TORCH_PROFILER_DIR` in the `.*rc` file of shell (e.g. `~/.bashrc` for bash shells).
+
+For more details, please refer to [Bench Serving Guide](./bench_serving.md).
+
+### Profile a server with `sglang.bench_offline_throughput`
+```bash
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# profile one batch with bench_one_batch.py
+# batch size can be controlled with --batch argument
+python3 -m sglang.bench_one_batch --model-path meta-llama/Llama-3.1-8B-Instruct --batch 32 --input-len 1024 --output-len 10 --profile
+
+# profile multiple batches with bench_offline_throughput.py
+python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
+```
+
+### Profile a server with `sglang.profiler`
+
+When the server is running (e.g., processing a decoding request), you can start live profiling immediately by sending a profile request to the server.
+
+You can do this by running `python3 -m sglang.profiler`. For example:
+
+```
+# Terminal 1: Send a generation request
+python3 -m sglang.test.send_one
+
+# Terminal 2: Before the above request finishes, quickly launch the following command in a separate terminal.
+# It will generate a profile of the above request for several decoding batches.
+python3 -m sglang.profiler
+```
+
+### Possible PyTorch bugs
+If in any cases you encounter the following error (for example, using qwen 2.5 VL):
+```bash
+RuntimeError: !stack.empty() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/autograd/profiler_python.cpp":983, please report a bug to PyTorch. Python replay stack is empty.
+```
+This is likely a PyTorch Bug reported in [Bug: vLLM Profiler](https://github.com/vllm-project/vllm/issues/18240) and [Bug: torch.profiler.profile](https://github.com/pytorch/pytorch/issues/101632). As a workaround, you may disable `with_stack` with an environment variable such as follows:
+```bash
+export SGLANG_PROFILE_WITH_STACK=False
+python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
+```
+
+### View traces
+
+Trace files can be loaded and visualized from:
+
+1. https://ui.perfetto.dev/ (any browser)
+2. chrome://tracing (Chrome browser only)
+
+If browser cannot open trace file due to its large size,
+client can generate a small trace file (<100MB) by controlling number of prompts and lengths of prompt outputs.
+For example, when profiling a server,
+
+```bash
+python -m sglang.bench_serving --backend sglang --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 2 --sharegpt-output-len 100 --profile
+```
+
+This command sets the number of prompts to 2 with `--num-prompts` argument and limits the length of output sequences to 100 with `--sharegpt-output-len` argument, which can generate a small trace file for browser to open smoothly.
+
+Additionally, if you want to locate the SGLang Python source code through the cuda kernel in Trace, you need to disable CUDA Graph when starting the service. This can be done by using the `--disable-cuda-graph` parameter in the command to start the service.
+
+## Profile with Nsight
+
+[Nsight systems](https://docs.nvidia.com/nsight-systems/) is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events.
+
+1. Prerequisite:
+
+   Install using apt, or run inside a [NVIDIA Docker container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) or [SGLang Docker container](https://github.com/sgl-project/sglang/tree/main/docker).
+
+   ```bash
+   # install nsys
+   # https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html
+   apt update
+   apt install -y --no-install-recommends gnupg
+   echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
+   apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
+   apt update
+   apt install nsight-systems-cli
+   ```
+
+2. To profile a single batch, use
+
+   ```bash
+   nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node python3 -m sglang.bench_one_batch --model meta-llama/Meta-Llama-3-8B --batch-size 64 --input-len 512
+   ```
+
+3. To profile a server, e.g.
+
+   ```bash
+   # launch the server, set the delay and duration times according to needs
+   # after the duration time has been used up, server will be killed by nsys
+
+   nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node -o sglang.out --delay 60 --duration 70 python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+
+   # client
+   python3 -m sglang.bench_serving --backend sglang --num-prompts 1000 --dataset-name random --random-input 1024 --random-output 512
+   ```
+
+   In practice, we recommend users to set `--duration` argument to a large value. Whenever user wants the server to stop profiling. Firstly run:
+
+   ```bash
+   nsys sessions list
+   ```
+
+   to get the session id in the form of `profile-XXXXX`, then run:
+
+   ```bash
+   nsys stop --session=profile-XXXXX
+   ```
+
+   to manually kill the profiler and generate `nsys-rep` files instantly.
+
+4. Use NVTX to annotate code regions, e.g. to see their execution time.
+
+   ```bash
+   # install nvtx
+   pip install nvtx
+   ```
+
+   ```python
+   # code snippets
+   import nvtx
+   with nvtx.annotate("description", color="color"):
+       # some critical code
+   ```
+
+## Other tips
+
+1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
+2. You can benchmark a model with modified configs (e.g., less layers) by using `--json-model-override-args`. For example, you can benchmark a model with only 2 layers and 2 kv heads using:
+
+   ```bash
+   python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --batch 32 --input-len 256 --output-len 32 --load-format dummy --json-model-override-args '{"num_hidden_layers": 1, "num_key_value_heads": 1}'
+   ```
+
+3. You can use `--python-backtrace=cuda` to see python call stack for all CUDA kernels, as in PyTorch Profiler. (Caveat: this can cause inaccurately long kernel runtimes for CUDA event based timing)
+4. For more arguments see [Nsight Systems User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html).
diff --git a/docs/developer_guide/contribution_guide.md b/docs/developer_guide/contribution_guide.md
new file mode 100644
index 000000000..f8e6f692d
--- /dev/null
+++ b/docs/developer_guide/contribution_guide.md
@@ -0,0 +1,103 @@
+# Contribution Guide
+
+Welcome to **SGLang**! We appreciate your interest in contributing. This guide provides a concise overview of how to set up your environment, run tests, build documentation, and open a Pull Request (PR). Whether you’re fixing a small bug or developing a major feature, we encourage following these steps for a smooth contribution process.
+
+## Install SGLang from Source
+
+### Fork and clone the repository
+
+**Note**: New contributors do **not** have the write permission to push to the official SGLang repo. Please fork the repository under your GitHub account, then clone your fork locally.
+
+```bash
+git clone https://github.com/<your_user_name>/sglang.git
+```
+
+### Build from source
+
+Refer to [Install SGLang from Source](../get_started/install.md#method-2-from-source).
+
+## Format code with pre-commit
+
+We use [pre-commit](https://pre-commit.com/) to maintain consistent code style checks. Before pushing your changes, please run:
+
+```bash
+pip3 install pre-commit
+pre-commit install
+pre-commit run --all-files
+```
+
+- **`pre-commit run --all-files`** manually runs all configured checks, applying fixes if possible. If it fails the first time, re-run it to ensure lint errors are fully resolved. Make sure your code passes all checks **before** creating a Pull Request.
+- **Do not commit** directly to the `main` branch. Always create a new branch (e.g., `feature/my-new-feature`), push your changes, and open a PR from that branch.
+
+## Run and add unit tests
+
+If you add a new feature or fix a bug, please add corresponding unit tests to ensure coverage and prevent regression.
+SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unittest.html) framework.
+For detailed instructions on running tests and integrating them into CI, refer to [test/README.md](https://github.com/sgl-project/sglang/tree/main/test/README.md).
+
+## Write documentations
+
+We recommend new contributors start from writing documentation, which helps you quickly understand SGLang codebase.
+For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md).
+
+## Test the accuracy
+If your code changes the model output, please run the accuracy tests. A quick sanity check is the few-shot GSM8K.
+
+```
+# Launch a server
+python3 -m sglang.launch_server --model Qwen/Qwen2-7B-Instruct
+
+# Evaluate
+python3 -m sglang.test.few_shot_gsm8k --num-questions 200
+```
+
+Please note that the above script is primarily a sanity check, not a rigorous accuracy or speed test.
+This test can have significant variance (1%–5%) in accuracy due to batching and the non-deterministic nature of the inference engine.
+Also, do not rely on the "Latency/Output throughput" from this script, as it is not a proper speed test.
+
+GSM8K is too easy for state-of-the-art models nowadays. Please try your own more challenging accuracy tests.
+You can find additional accuracy eval examples in:
+- [test_eval_accuracy_large.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_eval_accuracy_large.py)
+- [test_gpt_oss_1gpu.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_gpt_oss_1gpu.py)
+
+## Benchmark the speed
+Refer to [Benchmark and Profiling](../developer_guide/benchmark_and_profiling.md).
+
+## Request a review
+You can identify potential reviewers for your code by checking the [code owners](https://github.com/sgl-project/sglang/blob/main/.github/CODEOWNERS) and [reviewers](https://github.com/sgl-project/sglang/blob/main/.github/REVIEWERS.md) files.
+Another effective strategy is to review the file modification history and contact individuals who have frequently edited the files.
+If you modify files protected by code owners, their approval is required to merge the code.
+
+## General code style
+- Avoid code duplication. If the same code snippet (more than five lines) appears multiple times, extract it into a shared function.
+- Minimize device synchronization. Reduce expensive CPU-GPU synchronization operations, such as `tensor.item()` or `tensor.cpu()`, whenever possible. Use vectorized code.
+- Keep files concise. If a file exceeds 2,000 lines of code, split it into multiple smaller files.
+- Prioritize extreme efficiency. SGLang is a runtime, and most of your code runs on the critical path for every request. Optimize all minor overheads as much as possible, especially in the model forward code.
+  - A common pattern is some runtime checks in the model forward pass (e.g., [this](https://github.com/sgl-project/sglang/blob/f1b0eda55c2c4838e8ab90a0fac7fb1e3d7064ab/python/sglang/srt/models/deepseek_v2.py#L486-L491)). These are very likely the same for every layer. Please cache the result as a single boolean value whenever possible.
+- Strive to make functions as pure as possible. Avoid in-place modification of arguments.
+- When supporting new hardware or features, follow these guidelines:
+  - Do not drastically change existing code.
+  - Always prefer new files to introduce specific components for your new hardware (e.g., `allocator_ascend.py`).
+  - If you write multiple if/else blocks for new features, ensure the common path (e.g., NVIDIA hardware or the existing code path) is the first branch.
+
+## How to update sgl-kernel
+Since sglang and sgl-kernel are separate Python packages, our current GitHub CI infrastructure does not support updating a kernel and using it immediately within the same pull request (PR).
+To add a new kernel or modify an existing one in the sgl-kernel package, you must use multiple PRs.
+
+Follow these steps:
+
+1. Submit a PR to update the sgl-kernel source code without using it in sglang python package (e.g., [#8884](https://github.com/sgl-project/sglang/pull/8884/files)).
+2. Bump the version of sgl-kernel (e.g., [#9220](https://github.com/sgl-project/sglang/pull/9220/files)).
+   - Once merged, this will trigger an automatic release of the sgl-kernel wheel to PyPI.
+   - If not urgent, you can wait for other people to release the wheel. A new version will typically be released within one week.
+3. Apply the changes:
+   - Update the sgl-kernel version in `sglang/python/pyproject.toml` to use the modified kernels.
+   - Update the related caller code in the sglang to use the new kernel.
+
+## Tips for newcomers
+
+If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow.
+
+If you have any questions or want to start a discussion, please feel free to ask in our [Slack channel](https://slack.sglang.ai).
+
+Thank you for your interest in SGLang. Happy coding!
diff --git a/docs/developer_guide/development_guide_using_docker.md b/docs/developer_guide/development_guide_using_docker.md
new file mode 100644
index 000000000..e38947902
--- /dev/null
+++ b/docs/developer_guide/development_guide_using_docker.md
@@ -0,0 +1,108 @@
+# Development Guide Using Docker
+
+## Setup VSCode on a Remote Host
+(Optional - you can skip this step if you plan to run sglang dev container locally)
+
+1. In the remote host, download `code` from [Https://code.visualstudio.com/docs/?dv=linux64cli](https://code.visualstudio.com/download) and run `code tunnel` in a shell.
+
+Example
+```bash
+wget https://vscode.download.prss.microsoft.com/dbazure/download/stable/fabdb6a30b49f79a7aba0f2ad9df9b399473380f/vscode_cli_alpine_x64_cli.tar.gz
+tar xf vscode_cli_alpine_x64_cli.tar.gz
+
+# https://code.visualstudio.com/docs/remote/tunnels
+./code tunnel
+```
+
+2. In your local machine, press F1 in VSCode and choose "Remote Tunnels: Connect to Tunnel".
+
+## Setup Docker Container
+
+### Option 1. Use the default dev container automatically from VSCode
+There is a `.devcontainer` folder in the sglang repository root folder to allow VSCode to automatically start up within dev container. You can read more about this VSCode extension in VSCode official document [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).
+![image](https://github.com/user-attachments/assets/6a245da8-2d4d-4ea8-8db1-5a05b3a66f6d)
+(*Figure 1: Diagram from VSCode official documentation [Developing inside a Container](https://code.visualstudio.com/docs/devcontainers/containers).*)
+
+To enable this, you only need to:
+1. Start Visual Studio Code and install [VSCode dev container extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers).
+2. Press F1, type and choose "Dev Container: Open Folder in Container.
+3. Input the `sglang` local repo path in your machine and press enter.
+
+The first time you open it in dev container might take longer due to docker pull and build. Once it's successful, you should set on your status bar at the bottom left displaying that you are in a dev container:
+
+![image](https://github.com/user-attachments/assets/650bba0b-c023-455f-91f9-ab357340106b)
+
+Now when you run `sglang.launch_server` in the VSCode terminal or start debugging using F5, sglang server will be started in the dev container with all your local changes applied automatically:
+
+![image](https://github.com/user-attachments/assets/748c85ba-7f8c-465e-8599-2bf7a8dde895)
+
+
+### Option 2. Start up containers manually (advanced)
+
+The following startup command is an example for internal development by the SGLang team. You can **modify or add directory mappings as needed**, especially for model weight downloads, to prevent repeated downloads by different Docker containers.
+
+❗️ **Note on RDMA**
+
+    1. `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them but keeping them there does not harm. Thus, we enable these two flags by default in the commands below.
+    2. You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
+
+```bash
+# Change the name to yours
+docker run -itd --shm-size 32g --gpus all -v <volumes-to-mount> --ipc=host --network=host --privileged --name sglang_dev lmsysorg/sglang:dev /bin/zsh
+docker exec -it sglang_dev /bin/zsh
+```
+Some useful volumes to mount are:
+1. **Huggingface model cache**: mounting model cache can avoid re-download every time docker restarts. Default location on Linux is `~/.cache/huggingface/`.
+2. **SGLang repository**: code changes in the SGLang local repository will be automatically synced to the .devcontainer.
+
+Example 1: Monting local cache folder `/opt/dlami/nvme/.cache` but not the SGLang repo. Use this when you prefer to manually transfer local code changes to the devcontainer.
+```bash
+docker run -itd --shm-size 32g --gpus all -v /opt/dlami/nvme/.cache:/root/.cache --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
+docker exec -it sglang_zhyncs /bin/zsh
+```
+Example 2: Mounting both HuggingFace cache and local SGLang repo. Local code changes are automatically synced to the devcontainer as the SGLang is installed in editable mode in the dev image.
+```bash
+docker run -itd --shm-size 32g --gpus all -v $HOME/.cache/huggingface/:/root/.cache/huggingface -v $HOME/src/sglang:/sgl-workspace/sglang --ipc=host --network=host --privileged --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
+docker exec -it sglang_zhyncs /bin/zsh
+```
+## Debug SGLang with VSCode Debugger
+1. (Create if not exist) open `launch.json` in VSCode.
+2. Add the following config and save. Please note that you can edit the script as needed to apply different parameters or debug a different program (e.g. benchmark script).
+     ```JSON
+       {
+          "version": "0.2.0",
+          "configurations": [
+              {
+                  "name": "Python Debugger: launch_server",
+                  "type": "debugpy",
+                  "request": "launch",
+                  "module": "sglang.launch_server",
+                  "console": "integratedTerminal",
+                  "args": [
+                      "--model-path", "meta-llama/Llama-3.2-1B",
+                      "--host", "0.0.0.0",
+                      "--port", "30000",
+                      "--trust-remote-code",
+                  ],
+                  "justMyCode": false
+              }
+          ]
+      }
+    ```
+
+3. Press "F5" to start. VSCode debugger will ensure that the program will pause at the breakpoints even if the program is running at remote SSH/Tunnel host + dev container.
+
+## Profile
+
+```bash
+# Change batch size, input, output and add `disable-cuda-graph` (for easier analysis)
+# e.g. DeepSeek V3
+nsys profile -o deepseek_v3 python3 -m sglang.bench_one_batch --batch-size 1 --input 128 --output 256 --model deepseek-ai/DeepSeek-V3 --trust-remote-code --tp 8 --disable-cuda-graph
+```
+
+## Evaluation
+
+```bash
+# e.g. gsm8k 8 shot
+python3 benchmark/gsm8k/bench_sglang.py --num-questions 2000 --parallel 2000 --num-shots 8
+```
diff --git a/docs/developer_guide/release_process.md b/docs/developer_guide/release_process.md
new file mode 100644
index 000000000..e817a256e
--- /dev/null
+++ b/docs/developer_guide/release_process.md
@@ -0,0 +1,18 @@
+# PyPI Package Release Process
+
+## Update the version in code
+Update the package version in `python/pyproject.toml` and `python/sglang/__init__.py`.
+
+## Upload the PyPI package
+
+```
+pip install build twine
+```
+
+```
+cd python
+bash upload_pypi.sh
+```
+
+## Make a release in GitHub
+Make a new release https://github.com/sgl-project/sglang/releases/new.
diff --git a/docs/developer_guide/setup_github_runner.md b/docs/developer_guide/setup_github_runner.md
new file mode 100644
index 000000000..6ed78a247
--- /dev/null
+++ b/docs/developer_guide/setup_github_runner.md
@@ -0,0 +1,49 @@
+# Set Up Self-Hosted Runners for GitHub Action
+
+## Add a Runner
+
+### Step 1: Start a docker container.
+
+You can mount a folder for the shared huggingface model weights cache. The command below uses `/tmp/huggingface` as an example.
+
+```
+docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
+# Nvidia
+docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
+# AMD
+docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.0rc1-rocm630 /bin/bash
+# AMD just the last 2 GPUs
+docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.5.0rc1-rocm630 /bin/bash
+```
+
+### Step 2: Configure the runner by `config.sh`
+
+Run these commands inside the container.
+
+```
+apt update && apt install -y curl python3-pip git
+export RUNNER_ALLOW_RUNASROOT=1
+```
+
+Then follow https://github.com/sgl-project/sglang/settings/actions/runners/new?arch=x64&os=linux to run `config.sh`
+
+**Notes**
+- Do not need to specify the runner group
+- Give it a name (e.g., `test-sgl-gpu-0`) and some labels (e.g., `1-gpu-runner`). The labels can be edited later in Github Settings.
+- Do not need to change the work folder.
+
+### Step 3: Run the runner by `run.sh`
+
+- Set up environment variables
+```
+export HF_HOME=/hf_home
+export SGLANG_IS_IN_CI=true
+export HF_TOKEN=hf_xxx
+export OPENAI_API_KEY=sk-xxx
+export CUDA_VISIBLE_DEVICES=0
+```
+
+- Run it forever
+```
+while true; do ./run.sh; echo "Restarting..."; sleep 2; done
+```
diff --git a/docs/get_started/install.md b/docs/get_started/install.md
new file mode 100644
index 000000000..e2e780e00
--- /dev/null
+++ b/docs/get_started/install.md
@@ -0,0 +1,131 @@
+# Install SGLang
+
+You can install SGLang using one of the methods below.
+
+This page primarily applies to common NVIDIA GPU platforms.
+For other or newer platforms, please refer to the dedicated pages for [NVIDIA Blackwell GPUs](../platforms/blackwell_gpu.md), [AMD GPUs](../platforms/amd_gpu.md), [Intel Xeon CPUs](../platforms/cpu_server.md), [NVIDIA Jetson](../platforms/nvidia_jetson.md), [Ascend NPUs](../platforms/ascend_npu.md).
+
+## Method 1: With pip or uv
+
+It is recommended to use uv for faster installation:
+
+```bash
+pip install --upgrade pip
+pip install uv
+uv pip install "sglang[all]>=0.5.2"
+```
+
+**Quick fixes to common problems**
+- If you encounter `OSError: CUDA_HOME environment variable is not set`. Please set it to your CUDA install root with either of the following solutions:
+  1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
+  2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.
+
+## Method 2: From source
+
+```bash
+# Use the last release branch
+git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
+cd sglang
+
+# Install the python packages
+pip install --upgrade pip
+pip install -e "python[all]"
+```
+
+**Quick fixes to common problems**
+- If you want to develop SGLang, it is recommended to use docker. Please refer to [setup docker container](../developer_guide/development_guide_using_docker.md#setup-docker-container). The docker image is `lmsysorg/sglang:dev`.
+
+## Method 3: Using docker
+
+The docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
+Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
+
+```bash
+docker run --gpus all \
+    --shm-size 32g \
+    -p 30000:30000 \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HF_TOKEN=<secret>" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
+```
+
+## Method 4: Using Kubernetes
+
+Please check out [OME](https://github.com/sgl-project/ome), a Kubernetes operator for enterprise-grade management and serving of large language models (LLMs).
+
+<details>
+<summary>More</summary>
+
+1. Option 1: For single node serving (typically when the model size fits into GPUs on one node)
+
+   Execute command `kubectl apply -f docker/k8s-sglang-service.yaml`, to create k8s deployment and service, with llama-31-8b as example.
+
+2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
+
+   Modify the LLM model path and arguments as necessary, then execute command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml`, to create two nodes k8s statefulset and serving service.
+
+</details>
+
+## Method 5: Using docker compose
+
+<details>
+<summary>More</summary>
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](https://github.com/sgl-project/sglang/blob/main/docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](https://github.com/sgl-project/sglang/blob/main/docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+## Method 6: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+<summary>More</summary>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+
+## Common Notes
+
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub.
+- To reinstall flashinfer locally, use the following command: `pip3 install --upgrade flashinfer-python --force-reinstall --no-deps` and then delete the cache with `rm -rf ~/.cache/flashinfer`.
+- If you only need to use OpenAI API models with the frontend language, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
+- The language frontend operates independently of the backend runtime. You can install the frontend locally without needing a GPU, while the backend can be set up on a GPU-enabled machine. To install the frontend, run `pip install sglang`, and for the backend, use `pip install sglang[srt]`. `srt` is the abbreviation of SGLang runtime.
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 000000000..f948fca24
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,95 @@
+SGLang Documentation
+====================
+
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor/pipeline/expert/data parallelism, structured outputs, chunked prefill, quantization (FP4/FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Qwen, DeepSeek, Kimi, GPT, Gemma, Mistral, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Get Started
+
+   get_started/install.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Basic Usage
+
+   basic_usage/send_request.ipynb
+   basic_usage/openai_api.rst
+   basic_usage/offline_engine_api.ipynb
+   basic_usage/native_api.ipynb
+   basic_usage/sampling_params.md
+   basic_usage/deepseek.md
+   basic_usage/gpt_oss.md
+   basic_usage/llama4.md
+   basic_usage/qwen3.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Advanced Features
+
+   advanced_features/server_arguments.md
+   advanced_features/hyperparameter_tuning.md
+   advanced_features/speculative_decoding.ipynb
+   advanced_features/structured_outputs.ipynb
+   advanced_features/structured_outputs_for_reasoning_models.ipynb
+   advanced_features/tool_parser.ipynb
+   advanced_features/separate_reasoning.ipynb
+   advanced_features/quantization.md
+   advanced_features/lora.ipynb
+   advanced_features/pd_disaggregation.md
+   advanced_features/vlm_query.ipynb
+   advanced_features/router.md
+   advanced_features/observability.md
+   advanced_features/attention_backend.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Supported Models
+
+   supported_models/generative_models.md
+   supported_models/multimodal_language_models.md
+   supported_models/embedding_models.md
+   supported_models/reward_models.md
+   supported_models/rerank_models.md
+   supported_models/support_new_models.md
+   supported_models/transformers_fallback.md
+   supported_models/modelscope.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Hardware Platforms
+
+   platforms/amd_gpu.md
+   platforms/blackwell_gpu.md
+   platforms/cpu_server.md
+   platforms/tpu.md
+   platforms/nvidia_jetson.md
+   platforms/ascend_npu.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Developer Guide
+
+   developer_guide/contribution_guide.md
+   developer_guide/development_guide_using_docker.md
+   developer_guide/benchmark_and_profiling.md
+   developer_guide/bench_serving.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: References
+
+   references/faq.md
+   references/environment_variables.md
+   references/production_metrics.md
+   references/multi_node_deployment/multi_node_index.rst
+   references/custom_chat_template.md
+   references/frontend/frontend_index.rst
+   references/learn_more.md
diff --git a/docs/platforms/amd_gpu.md b/docs/platforms/amd_gpu.md
new file mode 100644
index 000000000..81d6d544a
--- /dev/null
+++ b/docs/platforms/amd_gpu.md
@@ -0,0 +1,158 @@
+# AMD GPUs
+
+This document describes how run SGLang on AMD GPUs. If you encounter issues or have questions, please [open an issue](https://github.com/sgl-project/sglang/issues).
+
+## System Configuration
+
+When using AMD GPUs (such as MI300X), certain system-level optimizations help ensure stable performance. Here we take MI300X as an example. AMD provides official documentation for MI300X optimization and system tuning:
+
+- [AMD MI300X Tuning Guides](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html)
+- [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference/vllm-benchmark.html)
+- [AMD Instinct MI300X System Optimization](https://rocm.docs.amd.com/en/latest/how-to/system-optimization/mi300x.html)
+- [AMD Instinct MI300X Workload Optimization](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/inference-optimization/workload.html)
+- [Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html)
+
+**NOTE:** We strongly recommend reading these docs and guides entirely to fully utilize your system.
+
+Below are a few key settings to confirm or enable for SGLang:
+
+### Update GRUB Settings
+
+In `/etc/default/grub`, append the following to `GRUB_CMDLINE_LINUX`:
+
+```text
+pci=realloc=off iommu=pt
+```
+
+Afterward, run `sudo update-grub` (or your distro’s equivalent) and reboot.
+
+### Disable NUMA Auto-Balancing
+
+```bash
+sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'
+```
+
+You can automate or verify this change using [this helpful script](https://github.com/ROCm/triton/blob/rocm_env/scripts/amd/env_check.sh).
+
+Again, please go through the entire documentation to confirm your system is using the recommended configuration.
+
+## Install SGLang
+
+You can install SGLang using one of the methods below.
+
+### Install from Source
+
+```bash
+# Use the last release branch
+git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
+cd sglang
+
+# Compile sgl-kernel
+pip install --upgrade pip
+cd sgl-kernel
+python setup_rocm.py install
+
+# Install sglang python package
+cd ..
+pip install -e "python[all_hip]"
+```
+
+### Install Using Docker (Recommended)
+
+The docker images are available on Docker Hub at [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile.rocm](https://github.com/sgl-project/sglang/tree/main/docker).
+
+The steps below show how to build and use an image.
+
+1. Build the docker image.
+   If you use pre-built images, you can skip this step and replace `sglang_image` with the pre-built image names in the steps below.
+
+   ```bash
+   docker build -t sglang_image -f Dockerfile.rocm .
+   ```
+
+2. Create a convenient alias.
+
+   ```bash
+   alias drun='docker run -it --rm --network=host --privileged --device=/dev/kfd --device=/dev/dri \
+       --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE \
+       --security-opt seccomp=unconfined \
+       -v $HOME/dockerx:/dockerx \
+       -v /data:/data'
+   ```
+
+   If you are using RDMA, please note that:
+     - `--network host` and `--privileged` are required by RDMA. If you don't need RDMA, you can remove them.
+     - You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`.
+
+3. Launch the server.
+
+   **NOTE:** Replace `<secret>` below with your [huggingface hub token](https://huggingface.co/docs/hub/en/security-tokens).
+
+   ```bash
+   drun -p 30000:30000 \
+       -v ~/.cache/huggingface:/root/.cache/huggingface \
+       --env "HF_TOKEN=<secret>" \
+       sglang_image \
+       python3 -m sglang.launch_server \
+       --model-path NousResearch/Meta-Llama-3.1-8B \
+       --host 0.0.0.0 \
+       --port 30000
+   ```
+
+4. To verify the utility, you can run a benchmark in another terminal or refer to [other docs](https://docs.sglang.ai/backend/openai_api_completions.html) to send requests to the engine.
+
+   ```bash
+   drun sglang_image \
+       python3 -m sglang.bench_serving \
+       --backend sglang \
+       --dataset-name random \
+       --num-prompts 4000 \
+       --random-input 128 \
+       --random-output 128
+   ```
+
+With your AMD system properly configured and SGLang installed, you can now fully leverage AMD hardware to power SGLang’s machine learning capabilities.
+
+## Examples
+
+### Running DeepSeek-V3
+
+The only difference when running DeepSeek-V3 is in how you start the server. Here's an example command:
+
+```bash
+drun -p 30000:30000 \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --ipc=host \
+    --env "HF_TOKEN=<secret>" \
+    sglang_image \
+    python3 -m sglang.launch_server \
+    --model-path deepseek-ai/DeepSeek-V3 \ # <- here
+    --tp 8 \
+    --trust-remote-code \
+    --host 0.0.0.0 \
+    --port 30000
+```
+
+[Running DeepSeek-R1 on a single NDv5 MI300X VM](https://techcommunity.microsoft.com/blog/azurehighperformancecomputingblog/running-deepseek-r1-on-a-single-ndv5-mi300x-vm/4372726) could also be a good reference.
+
+### Running Llama3.1
+
+Running Llama3.1 is nearly identical to running DeepSeek-V3. The only difference is in the model specified when starting the server, shown by the following example command:
+
+```bash
+drun -p 30000:30000 \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --ipc=host \
+    --env "HF_TOKEN=<secret>" \
+    sglang_image \
+    python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \ # <- here
+    --tp 8 \
+    --trust-remote-code \
+    --host 0.0.0.0 \
+    --port 30000
+```
+
+### Warmup Step
+
+When the server displays `The server is fired up and ready to roll!`, it means the startup is successful.
diff --git a/docs/platforms/ascend_npu.md b/docs/platforms/ascend_npu.md
new file mode 100644
index 000000000..f57d3fe95
--- /dev/null
+++ b/docs/platforms/ascend_npu.md
@@ -0,0 +1,206 @@
+# Ascend NPUs
+
+You can install SGLang using any of the methods below. Please go through `System Settings` section to ensure the clusters are roaring at max performance. Feel free to leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) if you encounter any issues or have any problems.
+
+## System Settings
+
+### CPU performance power scheme
+
+The default power scheme on Ascend hardware is `ondemand` which could affect performance, changing it to `performance` is recommended.
+
+```shell
+echo performance | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+
+# Make sure changes are applied successfully
+cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor # shows performance
+```
+
+### Disable NUMA balancing
+
+```shell
+sudo sysctl -w kernel.numa_balancing=0
+
+# Check
+cat /proc/sys/kernel/numa_balancing # shows 0
+```
+
+### Prevent swapping out system memory
+
+```shell
+sudo sysctl -w vm.swappiness=10
+
+# Check
+cat /proc/sys/vm/swappiness # shows 10
+```
+
+## Installing SGLang
+
+### Method 1: Installing from source with prerequisites
+
+#### Python Version
+
+Only `python==3.11` is supported currently. If you don't want to break system pre-installed python, try installing with [conda](https://github.com/conda/conda).
+
+```shell
+conda create --name sglang_npu python=3.11
+conda activate sglang_npu
+```
+
+#### MemFabric Adaptor
+
+_TODO: MemFabric is still a working project yet open sourced til August/September, 2025. We will release it as prebuilt wheel package for now._
+
+_Notice: Prebuilt wheel package is based on `aarch64`, please leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) to let us know the requests for `amd64` build._
+
+MemFabric Adaptor is a drop-in replacement of Mooncake Transfer Engine that enables KV cache transfer on Ascend NPU clusters.
+
+```shell
+MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
+MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
+wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
+```
+
+#### Pytorch and Pytorch Framework Adaptor on Ascend
+
+Only `torch==2.6.0` is supported currently due to NPUgraph and Triton-on-Ascend's limitation, however a more generalized version will be release by the end of September, 2025.
+
+```shell
+PYTORCH_VERSION=2.6.0
+TORCHVISION_VERSION=0.21.0
+pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
+
+PTA_VERSION="v7.1.0.1-pytorch2.6.0"
+PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
+PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_WHL_NAME}"
+wget -O "${PTA_NAME}" "${PTA_URL}" && pip install "./${PTA_NAME}"
+```
+
+#### vLLM
+
+vLLM is still a major prerequisite on Ascend NPU. Because of `torch==2.6.0` limitation, only vLLM v0.8.5 is supported.
+
+```shell
+VLLM_TAG=v0.8.5
+git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
+(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
+```
+
+#### Triton on Ascend
+
+_Notice:_ We recommend installing triton-ascend from source due to its rapid development, the version on PYPI can't keep up for know. This problem will be solved on Sep. 2025, afterwards `pip install` would be the one and only installing method.
+
+Please follow Triton-on-Ascend's [installation guide from source](https://gitee.com/ascend/triton-ascend#2%E6%BA%90%E4%BB%A3%E7%A0%81%E5%AE%89%E8%A3%85-triton-ascend) to install the latest `triton-ascend` package.
+
+#### DeepEP-compatible Library
+
+We are also providing a DeepEP-compatible Library as a drop-in replacement of deepseek-ai's DeepEP library, check the [installation guide](https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README.md).
+
+#### Installing SGLang from source
+
+```shell
+# Use the last release branch
+git clone -b v0.5.2 https://github.com/sgl-project/sglang.git
+cd sglang
+
+pip install --upgrade pip
+pip install -e python[srt_npu]
+```
+
+### Method 2: Using docker
+
+__Notice:__ `--privileged` and `--network=host` are required by RDMA, which is typically needed by Ascend NPU clusters.
+
+__Notice:__ The following docker command is based on Atlas 800I A3 machines. If you are using Atlas 800I A2, make sure only `davinci[0-7]` are mapped into container.
+
+```shell
+# Clone the SGLang repository
+git clone https://github.com/sgl-project/sglang.git
+cd sglang/docker
+
+# Build the docker image
+docker build -t sglang-npu:main -f Dockerfile.npu .
+
+alias drun='docker run -it --rm --privileged --network=host --ipc=host --shm-size=16g \
+    --device=/dev/davinci0 --device=/dev/davinci1 --device=/dev/davinci2 --device=/dev/davinci3 \
+    --device=/dev/davinci4 --device=/dev/davinci5 --device=/dev/davinci6 --device=/dev/davinci7 \
+    --device=/dev/davinci8 --device=/dev/davinci9 --device=/dev/davinci10 --device=/dev/davinci11 \
+    --device=/dev/davinci12 --device=/dev/davinci13 --device=/dev/davinci14 --device=/dev/davinci15 \
+    --device=/dev/davinci_manager --device=/dev/hisi_hdc \
+    --volume /usr/local/sbin:/usr/local/sbin --volume /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+    --volume /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
+    --volume /etc/ascend_install.info:/etc/ascend_install.info \
+    --volume /var/queue_schedule:/var/queue_schedule --volume ~/.cache/:/root/.cache/'
+
+drun --env "HF_TOKEN=<secret>" \
+    sglang-npu:main \
+    python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --attention-backend ascend --host 0.0.0.0 --port 30000
+```
+
+## Examples
+
+### Running DeepSeek-V3
+
+Running DeepSeek with PD disaggregation on 2 x Atlas 800I A3.
+Model weights could be found [here](https://modelers.cn/models/State_Cloud/Deepseek-R1-bf16-hfd-w8a8).
+
+Prefill:
+
+```shell
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"
+
+drun sglang-npu:main \
+    python3 -m sglang.launch_server --model-path State_Cloud/DeepSeek-R1-bf16-hfd-w8a8 \
+    --trust-remote-code \
+    --attention-backend ascend \
+    --mem-fraction-static 0.8 \
+    --quantization w8a8_int8 \
+    --tp-size 16 \
+    --dp-size 1 \
+    --nnodes 1 \
+    --node-rank 0 \
+    --disaggregation-mode prefill \
+    --disaggregation-bootstrap-port 6657 \
+    --disaggregation-transfer-backend ascend \
+    --dist-init-addr <PREFILL_HOST_IP>:6688 \
+    --host <PREFILL_HOST_IP> \
+    --port 8000
+```
+
+Decode:
+
+```shell
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+export ASCEND_MF_STORE_URL="tcp://<PREFILL_HOST_IP>:<PORT>"
+export HCCL_BUFFSIZE=200
+export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=24
+
+drun sglang-npu:main \
+    python3 -m sglang.launch_server --model-path State_Cloud/DeepSeek-R1-bf16-hfd-w8a8 \
+    --trust-remote-code \
+    --attention-backend ascend \
+    --mem-fraction-static 0.8 \
+    --quantization w8a8_int8 \
+    --enable-deepep-moe \
+    --deepep-mode low_latency \
+    --tp-size 16 \
+    --dp-size 1 \
+    --ep-size 16 \
+    --nnodes 1 \
+    --node-rank 0 \
+    --disaggregation-mode decode \
+    --disaggregation-transfer-backend ascend \
+    --dist-init-addr <DECODE_HOST_IP>:6688 \
+    --host <DECODE_HOST_IP> \
+    --port 8001
+```
+
+Mini_LB:
+
+```shell
+drun sglang-npu:main \
+    python -m sglang.srt.disaggregation.launch_lb \
+    --prefill http://<PREFILL_HOST_IP>:8000 \
+    --decode http://<DECODE_HOST_IP>:8001 \
+    --host 127.0.0.1 --port 5000
+```
diff --git a/docs/platforms/blackwell_gpu.md b/docs/platforms/blackwell_gpu.md
new file mode 100644
index 000000000..8c433b3f0
--- /dev/null
+++ b/docs/platforms/blackwell_gpu.md
@@ -0,0 +1,9 @@
+# Blackwell GPUs
+
+We will release the pre-built wheels soon. Before that, please try to compile from source or check the blackwell docker images from [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+## B200 with x86 CPUs
+TODO
+
+## GB200/GB300 with ARM CPUs
+TODO
diff --git a/docs/platforms/cpu_server.md b/docs/platforms/cpu_server.md
new file mode 100644
index 000000000..97fad918d
--- /dev/null
+++ b/docs/platforms/cpu_server.md
@@ -0,0 +1,204 @@
+# CPU Servers
+
+The document addresses how to set up the [SGLang](https://github.com/sgl-project/sglang) environment and run LLM inference on CPU servers.
+Specifically, SGLang is well optimized on the CPUs equipped with Intel® AMX® Instructions,
+which are 4th generation or newer Intel® Xeon® Scalable Processors.
+
+## Optimized Model List
+
+A list of popular LLMs are optimized and run efficiently on CPU,
+including the most notable open-source models like Llama series, Qwen series,
+and the phenomenal high-quality reasoning model DeepSeek-R1.
+
+| Model Name | BF16 | w8a8_int8 | FP8 |
+|:---:|:---:|:---:|:---:|
+| DeepSeek-R1 |   | [meituan/DeepSeek-R1-Channel-INT8](https://huggingface.co/meituan/DeepSeek-R1-Channel-INT8) | [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
+| Llama-3.2-3B | [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | [RedHatAI/Llama-3.2-3B-quantized.w8a8](https://huggingface.co/RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8) |   |
+| Llama-3.1-8B | [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8](https://huggingface.co/RedHatAI/Meta-Llama-3.1-8B-quantized.w8a8) |   |
+| QwQ-32B |   | [RedHatAI/QwQ-32B-quantized.w8a8](https://huggingface.co/RedHatAI/QwQ-32B-quantized.w8a8) |   |
+| DeepSeek-Distilled-Llama |   | [RedHatAI/DeepSeek-R1-Distill-Llama-70B-quantized.w8a8](https://huggingface.co/RedHatAI/DeepSeek-R1-Distill-Llama-70B-quantized.w8a8) |   |
+| Qwen3-235B |   |   | [Qwen/Qwen3-235B-A22B-FP8](https://huggingface.co/Qwen/Qwen3-235B-A22B-FP8) |
+
+**Note:** The model identifiers listed in the table above
+have been verified on 6th Gen Intel® Xeon® P-core platforms.
+
+## Installation
+
+### Install Using Docker
+
+It is recommended to use Docker for setting up the SGLang environment.
+A [Dockerfile](https://github.com/sgl-project/sglang/blob/main/docker/Dockerfile.xeon) is provided to facilitate the installation.
+Replace `<secret>` below with your [HuggingFace access token](https://huggingface.co/docs/hub/en/security-tokens).
+
+```bash
+# Clone the SGLang repository
+git clone https://github.com/sgl-project/sglang.git
+cd sglang/docker
+
+# Build the docker image
+docker build -t sglang-cpu:main -f Dockerfile.xeon .
+
+# Initiate a docker container
+docker run \
+    -it \
+    --privileged \
+    --ipc=host \
+    --network=host \
+    -v /dev/shm:/dev/shm \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 30000:30000 \
+    -e "HF_TOKEN=<secret>" \
+    sglang-cpu:main /bin/bash
+```
+
+### Install From Source
+
+If you'd prefer to install SGLang in a bare metal environment,
+the command list is as below.
+It is worth noting that the environment variable `SGLANG_USE_CPU_ENGINE=1`
+is required to enable SGLang service with CPU engine.
+
+```bash
+# Create and activate a conda environment
+conda create -n sgl-cpu python=3.12 -y
+conda activate sgl-cpu
+
+# Optional: Set PyTorch CPU as primary pip install channel to avoid installing CUDA version
+pip config set global.index-url https://download.pytorch.org/whl/cpu
+pip config set global.extra-index-url https://pypi.org/simple
+
+# Check if some conda related environment variables have been set
+env | grep -i conda
+# The following environment variable settings are required
+# if they have not been set properly
+export CONDA_EXE=$(which conda)
+export CONDA_ROOT=${CONDA_EXE}/../..
+export CONDA_PREFIX=${CONDA_ROOT}/envs/sgl-cpu
+export PATH=${PATH}:${CONDA_ROOT}/bin:${CONDA_ROOT}/condabin
+
+# Clone the SGLang code
+git clone https://github.com/sgl-project/sglang.git
+cd sglang
+git checkout <YOUR-DESIRED-VERSION>
+
+# Install SGLang dependent libs, and build SGLang main package
+pip install --upgrade pip setuptools
+conda install -y libsqlite==3.48.0 gperftools tbb libnuma numactl
+pip install -e "python[all_cpu]"
+pip install torch==2.7.1 torchvision==0.22.1 triton==3.3.1 --force-reinstall
+
+# Build the CPU backend kernels
+cd sgl-kernel
+cp pyproject_cpu.toml pyproject.toml
+pip install .
+
+# Other required environment variables
+# Recommend to set these in ~/.bashrc in order not to set every time in a new terminal
+export SGLANG_USE_CPU_ENGINE=1
+export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so:${CONDA_PREFIX}/lib/libtcmalloc.so:${CONDA_PREFIX}/lib/libtbbmalloc.so.2
+```
+
+## Launch of the Serving Engine
+
+Example command to launch SGLang serving:
+
+```bash
+python -m sglang.launch_server   \
+    --model <MODEL_ID_OR_PATH>   \
+    --trust-remote-code          \
+    --disable-overlap-schedule   \
+    --device cpu                 \
+    --host 0.0.0.0               \
+    --tp 6
+```
+
+Notes:
+
+1. For running W8A8 quantized models, please add the flag `--quantization w8a8_int8`.
+
+2. The flag `--tp 6` specifies that tensor parallelism will be applied using 6 ranks (TP6).
+    The number of TP specified is how many TP ranks will be used during the execution.
+    In a CPU platform, a TP rank means a sub-NUMA cluster (SNC).
+    Usually we can get the SNC information (How many available) from Operation System.
+    User can specify TP to be no more than the total available SNCs in current system.
+
+    If the specified TP rank number differs from the total SNC count,
+    the system will automatically utilize the first `n` SNCs.
+    Note that `n` cannot exceed the total SNC number, doing so will result in an error.
+
+    To specify the cores to be used, we need to explicitly set the environment variable `SGLANG_CPU_OMP_THREADS_BIND`.
+    For example, if we want to run the SGLang service using the first 40 cores of each SNC on a Xeon® 6980P server,
+    which has 43-43-42 cores on the 3 SNCs of a socket, we should set:
+
+    ```bash
+    export SGLANG_CPU_OMP_THREADS_BIND="0-39|43-82|86-125|128-167|171-210|214-253"
+    ```
+
+    Please beware that with SGLANG_CPU_OMP_THREADS_BIND set,
+    the available memory amounts of the ranks may not be determined in prior.
+    You may need to set proper `--max-total-tokens` to avoid the out-of-memory error.
+
+3. For optimizing decoding with torch.compile, please add the flag `--enable-torch-compile`.
+    To specify the maximum batch size when using torch compile, set the flag `--torch-compile-max-bs`.
+    For example, `--enable-torch-compile --torch-compile-max-bs 4` means using torch compile and setting the
+    maximum batch size to 4.
+
+4. A warmup step is automatically triggered when the service is started.
+    The server is ready when you see the log `The server is fired up and ready to roll!`.
+
+## Benchmarking with Requests
+
+You can benchmark the performance via the `bench_serving` script.
+Run the command in another terminal.
+
+```bash
+python -m sglang.bench_serving   \
+    --dataset-name random        \
+    --random-input-len 1024      \
+    --random-output-len 1024     \
+    --num-prompts 1              \
+    --request-rate inf           \
+    --random-range-ratio 1.0
+```
+
+The detail explanations of the parameters can be looked up by the command:
+
+```bash
+python -m sglang.bench_serving -h
+```
+
+Additionally, the requests can be formed with
+[OpenAI Completions API](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
+and sent via the command line (e.g. using `curl`) or via your own script.
+
+## Example: Running DeepSeek-R1
+
+An example command to launch service for W8A8 DeepSeek-R1 on a Xeon® 6980P server
+
+```bash
+python -m sglang.launch_server                 \
+    --model meituan/DeepSeek-R1-Channel-INT8   \
+    --trust-remote-code                        \
+    --disable-overlap-schedule                 \
+    --device cpu                               \
+    --quantization w8a8_int8                   \
+    --host 0.0.0.0                             \
+    --mem-fraction-static 0.8                  \
+    --tp 6
+```
+
+Similarly, an example command to launch service for FP8 DeepSeek-R1 would be
+
+```bash
+python -m sglang.launch_server                 \
+    --model deepseek-ai/DeepSeek-R1            \
+    --trust-remote-code                        \
+    --disable-overlap-schedule                 \
+    --device cpu                               \
+    --host 0.0.0.0                             \
+    --mem-fraction-static 0.8                  \
+    --tp 6
+```
+
+Then you can test with `bench_serving` command or construct your own command or script
+following [the benchmarking example](#benchmarking-with-requests).
diff --git a/docs/platforms/nvidia_jetson.md b/docs/platforms/nvidia_jetson.md
new file mode 100644
index 000000000..362f60c83
--- /dev/null
+++ b/docs/platforms/nvidia_jetson.md
@@ -0,0 +1,80 @@
+# NVIDIA Jetson Orin
+
+## Prerequisites
+
+Before starting, ensure the following:
+
+- [**NVIDIA Jetson AGX Orin Devkit**](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/) is set up with **JetPack 6.1** or later.
+- **CUDA Toolkit** and **cuDNN** are installed.
+- Verify that the Jetson AGX Orin is in **high-performance mode**:
+```bash
+sudo nvpmodel -m 0
+```
+* * * * *
+## Installing and running SGLang with Jetson Containers
+Clone the jetson-containers github repository:
+```
+git clone https://github.com/dusty-nv/jetson-containers.git
+```
+Run the installation script:
+```
+bash jetson-containers/install.sh
+```
+Build the container image:
+```
+jetson-containers build sglang
+```
+Run the container:
+```
+jetson-containers run $(autotag sglang)
+```
+Or you can also manually run a container with this command:
+```
+docker run --runtime nvidia -it --rm --network=host IMAGE_NAME
+```
+* * * * *
+
+Running Inference
+-----------------------------------------
+
+Launch the server:
+```bash
+python -m sglang.launch_server \
+  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --device cuda \
+  --dtype half \
+  --attention-backend flashinfer \
+  --mem-fraction-static 0.8 \
+  --context-length 8192
+```
+The quantization and limited context length (`--dtype half --context-length 8192`) are due to the limited computational resources in [Nvidia jetson kit](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/). A detailed explanation can be found in [Server Arguments](../backend/server_arguments.md).
+
+After launching the engine, refer to [Chat completions](https://docs.sglang.ai/backend/openai_api_completions.html#Usage) to test the usability.
+* * * * *
+Running quantization with TorchAO
+-------------------------------------
+TorchAO is suggested to NVIDIA Jetson Orin.
+```bash
+python -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --device cuda \
+    --dtype bfloat16 \
+    --attention-backend flashinfer \
+    --mem-fraction-static 0.8 \
+    --context-length 8192 \
+    --torchao-config int4wo-128
+```
+This enables TorchAO's int4 weight-only quantization with a 128-group size. The usage of `--torchao-config int4wo-128` is also for memory efficiency.
+
+
+* * * * *
+Structured output with XGrammar
+-------------------------------
+Please refer to [SGLang doc structured output](../advanced_features/structured_outputs.ipynb).
+* * * * *
+
+Thanks to the support from [Nurgaliyev Shakhizat](https://github.com/shahizat), [Dustin Franklin](https://github.com/dusty-nv) and [Johnny Núñez Cano](https://github.com/johnnynunez).
+
+References
+----------
+-   [NVIDIA Jetson AGX Orin Documentation](https://developer.nvidia.com/embedded/jetson-agx-orin)
diff --git a/docs/platforms/tpu.md b/docs/platforms/tpu.md
new file mode 100644
index 000000000..f304234cf
--- /dev/null
+++ b/docs/platforms/tpu.md
@@ -0,0 +1,3 @@
+# TPU
+
+The support for TPU is under active development. Please stay tuned.
diff --git a/docs/references/custom_chat_template.md b/docs/references/custom_chat_template.md
new file mode 100644
index 000000000..557af5bf5
--- /dev/null
+++ b/docs/references/custom_chat_template.md
@@ -0,0 +1,42 @@
+# Custom Chat Template
+
+**NOTE**: There are two chat template systems in SGLang project. This document is about setting a custom chat template for the OpenAI-compatible API server (defined at [conversation.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/conversation.py)). It is NOT related to the chat template used in the SGLang language frontend (defined at [chat_template.py](https://github.com/sgl-project/sglang/blob/main/python/sglang/lang/chat_template.py)).
+
+By default, the server uses the chat template specified in the model tokenizer from Hugging Face.
+It should just work for most official models such as Llama-2/Llama-3.
+
+If needed, you can also override the chat template when launching the server:
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
+```
+
+If the chat template you are looking for is missing, you are welcome to contribute it or load it from a file.
+
+## JSON Format
+
+You can load the JSON format, which is defined by `conversation.py`.
+
+```json
+{
+  "name": "my_model",
+  "system": "<|im_start|>system",
+  "user": "<|im_start|>user",
+  "assistant": "<|im_start|>assistant",
+  "sep_style": "CHATML",
+  "sep": "<|im_end|>",
+  "stop_str": ["<|im_end|>", "<|im_start|>"]
+}
+```
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.json
+```
+
+## Jinja Format
+
+You can also use the [Jinja template format](https://huggingface.co/docs/transformers/main/en/chat_templating) as defined by Hugging Face Transformers.
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.jinja
+```
diff --git a/docs/references/environment_variables.md b/docs/references/environment_variables.md
new file mode 100644
index 000000000..f22685454
--- /dev/null
+++ b/docs/references/environment_variables.md
@@ -0,0 +1,97 @@
+# Environment Variables
+
+SGLang supports various environment variables that can be used to configure its runtime behavior. This document provides a comprehensive list and aims to stay updated over time.
+
+*Note: SGLang uses two prefixes for environment variables: `SGL_` and `SGLANG_`. This is likely due to historical reasons. While both are currently supported for different settings, future versions might consolidate them.*
+
+## General Configuration
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_USE_MODELSCOPE` | Enable using models from ModelScope | `false` |
+| `SGLANG_HOST_IP` | Host IP address for the server | `0.0.0.0` |
+| `SGLANG_PORT` | Port for the server | auto-detected |
+| `SGLANG_LOGGING_CONFIG_PATH` | Custom logging configuration path | Not set |
+| `SGLANG_DISABLE_REQUEST_LOGGING` | Disable request logging | `false` |
+| `SGLANG_HEALTH_CHECK_TIMEOUT` | Timeout for health check in seconds | `20` |
+
+## Performance Tuning
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_ENABLE_TORCH_INFERENCE_MODE` | Control whether to use torch.inference_mode | `false` |
+| `SGLANG_ENABLE_TORCH_COMPILE` | Enable torch.compile | `true` |
+| `SGLANG_SET_CPU_AFFINITY` | Enable CPU affinity setting (often set to `1` in Docker builds) | `0` |
+| `SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN` | Allows the scheduler to overwrite longer context length requests (often set to `1` in Docker builds) | `0` |
+| `SGLANG_IS_FLASHINFER_AVAILABLE` | Control FlashInfer availability check | `true` |
+| `SGLANG_SKIP_P2P_CHECK` | Skip P2P (peer-to-peer) access check | `false` |
+| `SGL_CHUNKED_PREFIX_CACHE_THRESHOLD` | Sets the threshold for enabling chunked prefix caching | `8192` |
+| `SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION` | Enable RoPE fusion in Fused Multi-Layer Attention | `1` |
+
+## DeepGEMM Configuration (Advanced Optimization)
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGL_ENABLE_JIT_DEEPGEMM` | Enable Just-In-Time compilation of DeepGEMM kernels | `"true"` |
+| `SGL_JIT_DEEPGEMM_PRECOMPILE` | Enable precompilation of DeepGEMM kernels | `"true"` |
+| `SGL_JIT_DEEPGEMM_COMPILE_WORKERS` | Number of workers for parallel DeepGEMM kernel compilation | `4` |
+| `SGL_IN_DEEPGEMM_PRECOMPILE_STAGE` | Indicator flag used during the DeepGEMM precompile script | `"false"` |
+| `SGL_DG_CACHE_DIR` | Directory for caching compiled DeepGEMM kernels | `~/.cache/deep_gemm` |
+| `SGL_DG_USE_NVRTC` | Use NVRTC (instead of Triton) for JIT compilation (Experimental) | `"0"` |
+| `SGL_USE_DEEPGEMM_BMM` | Use DeepGEMM for Batched Matrix Multiplication (BMM) operations | `"false"` |
+
+## Memory Management
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_DEBUG_MEMORY_POOL` | Enable memory pool debugging | `false` |
+| `SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION` | Clip max new tokens estimation for memory planning | `4096` |
+| `SGLANG_DETOKENIZER_MAX_STATES` | Maximum states for detokenizer | Default value based on system |
+| `SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK` | Disable checks for memory imbalance across Tensor Parallel ranks | Not set (defaults to enabled check) |
+
+## Model-Specific Options
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_USE_AITER` | Use AITER optimize implementation | `false` |
+| `SGLANG_INT4_WEIGHT` | Enable INT4 weight quantization | `false` |
+| `SGLANG_MOE_PADDING` | Enable MoE padding (sets padding size to 128 if value is `1`, often set to `1` in Docker builds) | `0` |
+| `SGLANG_FORCE_FP8_MARLIN` | Force using FP8 MARLIN kernels even if other FP8 kernels are available | `false` |
+| `SGLANG_ENABLE_FLASHINFER_GEMM` | Use flashinfer kernels when running blockwise fp8 GEMM on Blackwell GPUs | `false` |
+| `SGLANG_SUPPORT_CUTLASS_BLOCK_FP8` | Use Cutlass kernels when running blockwise fp8 GEMM on Hopper or Blackwell GPUs | `false` |
+| `SGLANG_CUTLASS_MOE` | Use Cutlass FP8 MoE kernel on Blackwell GPUs | `false` |
+
+
+## Distributed Computing
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_BLOCK_NONZERO_RANK_CHILDREN` | Control blocking of non-zero rank children processes | `1` |
+| `SGL_IS_FIRST_RANK_ON_NODE` | Indicates if the current process is the first rank on its node | `"true"` |
+| `SGLANG_PP_LAYER_PARTITION` | Pipeline parallel layer partition specification | Not set |
+
+## Testing & Debugging (Internal/CI)
+
+*These variables are primarily used for internal testing, continuous integration, or debugging.*
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_IS_IN_CI` | Indicates if running in CI environment | `false` |
+| `SGLANG_AMD_CI` | Indicates running in AMD CI environment | `0` |
+| `SGLANG_TEST_RETRACT` | Enable retract decode testing | `false` |
+| `SGLANG_RECORD_STEP_TIME` | Record step time for profiling | `false` |
+| `SGLANG_TEST_REQUEST_TIME_STATS` | Test request time statistics | `false` |
+| `SGLANG_CI_SMALL_KV_SIZE` | Use small KV cache size in CI | Not set |
+
+## Profiling & Benchmarking
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_TORCH_PROFILER_DIR` | Directory for PyTorch profiler output | `/tmp` |
+| `SGLANG_PROFILE_WITH_STACK` | Set `with_stack` option (bool) for PyTorch profiler (capture stack trace) | `true` |
+
+## Storage & Caching
+
+| Environment Variable | Description | Default Value |
+| --- | --- | --- |
+| `SGLANG_DISABLE_OUTLINES_DISK_CACHE` | Disable Outlines disk cache | `true` |
diff --git a/docs/references/faq.md b/docs/references/faq.md
new file mode 100644
index 000000000..6d575d253
--- /dev/null
+++ b/docs/references/faq.md
@@ -0,0 +1,35 @@
+# Troubleshooting and Frequently Asked Questions
+
+## Troubleshooting
+
+This page lists common errors and tips for resolving them.
+
+### CUDA Out of Memory
+If you encounter out-of-memory (OOM) errors, you can adjust the following parameters:
+
+- If OOM occurs during prefill, try reducing `--chunked-prefill-size` to `4096` or `2048`. This saves memory but slows down the prefill speed for long prompts.
+- If OOM occurs during decoding, try lowering `--max-running-requests`.
+- You can also reduce `--mem-fraction-static` to a smaller value, such as 0.8 or 0.7. This decreases the memory usage of the KV cache memory pool and helps prevent OOM errors during both prefill and decoding. However, it limits maximum concurrency and reduces peak throughput.
+- Another common case for OOM is requesting input logprobs for a long prompt as it requires significant memory. To address this, set `logprob_start_len` in your sampling parameters to include only the necessary parts. If you do need input logprobs for a long prompt, try reducing `--mem-fraction-static`.
+
+### CUDA Error: Illegal Memory Access Encountered
+This error may result from kernel errors or out-of-memory issues:
+- If it is a kernel error, resolving it may be challenging. Please file an issue on GitHub.
+- If it is an out-of-memory issue, it may sometimes be reported as this error instead of "Out of Memory." Refer to the section above for guidance on avoiding OOM issues.
+
+
+## Frequently Asked Questions
+
+### The results are not deterministic, even with a temperature of 0
+
+You may notice that when you send the same request twice, the results from the engine will be slightly different, even when the temperature is set to 0.
+
+From our initial investigation, this indeterminism arises from two factors: dynamic batching and prefix caching. Roughly speaking, dynamic batching accounts for about 95% of the indeterminism, while prefix caching accounts for the remaining portion. The server runs dynamic batching under the hood. Different batch sizes can cause PyTorch/CuBLAS to dispatch to different CUDA kernels, which can lead to slight numerical differences. This difference accumulates across many layers, resulting in nondeterministic output when the batch size changes. Similarly, when prefix caching is enabled, it can also dispatch to different kernels. Even when the computations are mathematically equivalent, small numerical differences from different kernel implementations lead to the final nondeterministic outputs.
+
+To achieve more deterministic outputs in the current code, you can add `--disable-radix-cache` and send only one request at a time. The results will be mostly deterministic under this setting.
+
+We are still investigating the root causes and potential solutions. In the short term, we may introduce a "deterministic mode" that uses more padding to address the variance caused by dynamic batching. This mode will be more deterministic but slower.
+
+We have two issues to track our progress:
+- The deterministic mode is tracked at [https://github.com/sgl-project/sglang/issues/1729](https://github.com/sgl-project/sglang/issues/1729).
+- The per-request random seed is tracked at [https://github.com/sgl-project/sglang/issues/1335](https://github.com/sgl-project/sglang/issues/1335).
diff --git a/docs/references/frontend/choices_methods.md b/docs/references/frontend/choices_methods.md
new file mode 100644
index 000000000..30a0a1814
--- /dev/null
+++ b/docs/references/frontend/choices_methods.md
@@ -0,0 +1,77 @@
+# Choices Methods in SGLang
+This doc describes the choices methods supported by SGLang.
+
+The optional `choices_method` arg determines how options supplied to SGLang's `choices` primitive are selected. Only the `RuntimeEndpoint` backend supports the `choices_method` arg. Other backends, such as `OpenAI`, have bespoke selection implementations due to API limitations.
+
+## Methods
+
+### Token Length Normalized
+
+Token length normalized is the default SGLang choices method. It selects the option with the highest average logprob across all of its tokens.
+
+Usage example (alternatively, simply omit the `choices_method` arg):
+```python
+@sgl.function
+def example(s):
+    s += sgl.user("What is the capital of France?")
+    s += sgl.assistant(
+        sgl.gen(
+            "answer",
+            choices=["London", "Paris", "Berlin"],
+            choices_method=sgl.token_length_normalized,
+        )
+    )
+```
+
+
+This can perform poorly if an option contains many tokens, where its later tokens are predicted with high confidence based on its earlier tokens. For instance, even strong models will fail the above example if the specified options are `["Paris", "Antidisestablishmentarianism"]`.
+
+### Greedy Token Selection
+
+Greedy token selection simply selects the option with the highest logprob for its initial token. For overlapping options where one option is a subset of a longer option, the logprobs of the shorter option are extended using its average logprob for comparison against the longer option.
+
+Usage example:
+```python
+@sgl.function
+def example(s):
+    s += sgl.user("What is the capital of France?")
+    s += sgl.assistant(
+        sgl.gen(
+            "answer",
+            choices=["London", "Paris", "Berlin"],
+            choices_method=sgl.greedy_token_selection,
+        )
+    )
+```
+
+This can perform poorly if an option misleads the model down a bad path based on an attractive initial token. For instance, greedy selection will result in an incorrect response for this example:
+```python
+@sgl.function
+def us_president_example(s):
+    s += sgl.user("Name a US president.")
+    s += sgl.assistant(
+        sgl.gen(
+            "answer",
+            choices=["Donald Duck", "Millard Fillmore"],
+            choices_method=sgl.greedy_token_selection,
+        )
+    )
+```
+
+### Unconditional Likelihood Normalized
+
+Unconditional likelihood normalized selects the option with the highest average token logprob once normalized by the unconditional token logprobs, as described in [this EleutherAI blogpost](https://blog.eleuther.ai/multiple-choice-normalization/). This method incurs an additional LLM call to obtain the unconditional likelihoods.
+
+Usage example:
+```python
+@sgl.function
+def example(s):
+    s += sgl.user("What is the capital of France?")
+    s += sgl.assistant(
+        sgl.gen(
+            "answer",
+            choices=["London", "Paris", "Berlin"],
+            choices_method=sgl.unconditional_likelihood_normalized,
+        )
+    )
+```
diff --git a/docs/references/frontend/frontend_index.rst b/docs/references/frontend/frontend_index.rst
new file mode 100644
index 000000000..62544cba5
--- /dev/null
+++ b/docs/references/frontend/frontend_index.rst
@@ -0,0 +1,9 @@
+Frontend Language
+=================
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Frontend Language
+
+   frontend_tutorial.ipynb
+   choices_methods.md
diff --git a/docs/references/frontend/frontend_tutorial.ipynb b/docs/references/frontend/frontend_tutorial.ipynb
new file mode 100644
index 000000000..836cab627
--- /dev/null
+++ b/docs/references/frontend/frontend_tutorial.ipynb
@@ -0,0 +1,456 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SGLang Frontend Language"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "SGLang frontend language can be used to define simple and easy prompts in a convenient, structured way."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch A Server\n",
+    "\n",
+    "Launch the server in your terminal and wait for it to initialize."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang import assistant_begin, assistant_end\n",
+    "from sglang import assistant, function, gen, system, user\n",
+    "from sglang import image\n",
+    "from sglang import RuntimeEndpoint\n",
+    "from sglang.lang.api import set_default_backend\n",
+    "from sglang.srt.utils import load_image\n",
+    "from sglang.test.doc_patch import launch_server_cmd\n",
+    "from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")\n",
+    "print(f\"Server started on http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the default backend. Note: Besides the local server, you may use also `OpenAI` or other API endpoints."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Basic Usage\n",
+    "\n",
+    "The most simple way of using SGLang frontend language is a simple question answer dialog between a user and an assistant."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def basic_qa(s, question):\n",
+    "    s += system(f\"You are a helpful assistant than can answer questions.\")\n",
+    "    s += user(question)\n",
+    "    s += assistant(gen(\"answer\", max_tokens=512))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state = basic_qa(\"List 3 countries and their capitals.\")\n",
+    "print_highlight(state[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multi-turn Dialog\n",
+    "\n",
+    "SGLang frontend language can also be used to define multi-turn dialogs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def multi_turn_qa(s):\n",
+    "    s += system(f\"You are a helpful assistant than can answer questions.\")\n",
+    "    s += user(\"Please give me a list of 3 countries and their capitals.\")\n",
+    "    s += assistant(gen(\"first_answer\", max_tokens=512))\n",
+    "    s += user(\"Please give me another list of 3 countries and their capitals.\")\n",
+    "    s += assistant(gen(\"second_answer\", max_tokens=512))\n",
+    "    return s\n",
+    "\n",
+    "\n",
+    "state = multi_turn_qa()\n",
+    "print_highlight(state[\"first_answer\"])\n",
+    "print_highlight(state[\"second_answer\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Control flow\n",
+    "\n",
+    "You may use any Python code within the function to define more complex control flows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def tool_use(s, question):\n",
+    "    s += assistant(\n",
+    "        \"To answer this question: \"\n",
+    "        + question\n",
+    "        + \". I need to use a \"\n",
+    "        + gen(\"tool\", choices=[\"calculator\", \"search engine\"])\n",
+    "        + \". \"\n",
+    "    )\n",
+    "\n",
+    "    if s[\"tool\"] == \"calculator\":\n",
+    "        s += assistant(\"The math expression is: \" + gen(\"expression\"))\n",
+    "    elif s[\"tool\"] == \"search engine\":\n",
+    "        s += assistant(\"The key word to search is: \" + gen(\"word\"))\n",
+    "\n",
+    "\n",
+    "state = tool_use(\"What is 2 * 2?\")\n",
+    "print_highlight(state[\"tool\"])\n",
+    "print_highlight(state[\"expression\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Parallelism\n",
+    "\n",
+    "Use `fork` to launch parallel prompts. Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def tip_suggestion(s):\n",
+    "    s += assistant(\n",
+    "        \"Here are two tips for staying healthy: \"\n",
+    "        \"1. Balanced Diet. 2. Regular Exercise.\\n\\n\"\n",
+    "    )\n",
+    "\n",
+    "    forks = s.fork(2)\n",
+    "    for i, f in enumerate(forks):\n",
+    "        f += assistant(\n",
+    "            f\"Now, expand tip {i+1} into a paragraph:\\n\"\n",
+    "            + gen(\"detailed_tip\", max_tokens=256, stop=\"\\n\\n\")\n",
+    "        )\n",
+    "\n",
+    "    s += assistant(\"Tip 1:\" + forks[0][\"detailed_tip\"] + \"\\n\")\n",
+    "    s += assistant(\"Tip 2:\" + forks[1][\"detailed_tip\"] + \"\\n\")\n",
+    "    s += assistant(\n",
+    "        \"To summarize the above two tips, I can say:\\n\" + gen(\"summary\", max_tokens=512)\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "state = tip_suggestion()\n",
+    "print_highlight(state[\"summary\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Constrained Decoding\n",
+    "\n",
+    "Use `regex` to specify a regular expression as a decoding constraint. This is only supported for local models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def regular_expression_gen(s):\n",
+    "    s += user(\"What is the IP address of the Google DNS servers?\")\n",
+    "    s += assistant(\n",
+    "        gen(\n",
+    "            \"answer\",\n",
+    "            temperature=0,\n",
+    "            regex=r\"((25[0-5]|2[0-4]\\d|[01]?\\d\\d?).){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\",\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "state = regular_expression_gen()\n",
+    "print_highlight(state[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use `regex` to define a `JSON` decoding schema."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "character_regex = (\n",
+    "    r\"\"\"\\{\\n\"\"\"\n",
+    "    + r\"\"\"    \"name\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+    "    + r\"\"\"    \"house\": \"(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)\",\\n\"\"\"\n",
+    "    + r\"\"\"    \"blood status\": \"(Pure-blood|Half-blood|Muggle-born)\",\\n\"\"\"\n",
+    "    + r\"\"\"    \"occupation\": \"(student|teacher|auror|ministry of magic|death eater|order of the phoenix)\",\\n\"\"\"\n",
+    "    + r\"\"\"    \"wand\": \\{\\n\"\"\"\n",
+    "    + r\"\"\"        \"wood\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+    "    + r\"\"\"        \"core\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+    "    + r\"\"\"        \"length\": [0-9]{1,2}\\.[0-9]{0,2}\\n\"\"\"\n",
+    "    + r\"\"\"    \\},\\n\"\"\"\n",
+    "    + r\"\"\"    \"alive\": \"(Alive|Deceased)\",\\n\"\"\"\n",
+    "    + r\"\"\"    \"patronus\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
+    "    + r\"\"\"    \"bogart\": \"[\\w\\d\\s]{1,16}\"\\n\"\"\"\n",
+    "    + r\"\"\"\\}\"\"\"\n",
+    ")\n",
+    "\n",
+    "\n",
+    "@function\n",
+    "def character_gen(s, name):\n",
+    "    s += user(\n",
+    "        f\"{name} is a character in Harry Potter. Please fill in the following information about this character.\"\n",
+    "    )\n",
+    "    s += assistant(gen(\"json_output\", max_tokens=256, regex=character_regex))\n",
+    "\n",
+    "\n",
+    "state = character_gen(\"Harry Potter\")\n",
+    "print_highlight(state[\"json_output\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Batching \n",
+    "\n",
+    "Use `run_batch` to run a batch of prompts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def text_qa(s, question):\n",
+    "    s += user(question)\n",
+    "    s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
+    "\n",
+    "\n",
+    "states = text_qa.run_batch(\n",
+    "    [\n",
+    "        {\"question\": \"What is the capital of the United Kingdom?\"},\n",
+    "        {\"question\": \"What is the capital of France?\"},\n",
+    "        {\"question\": \"What is the capital of Japan?\"},\n",
+    "    ],\n",
+    "    progress_bar=True,\n",
+    ")\n",
+    "\n",
+    "for i, state in enumerate(states):\n",
+    "    print_highlight(f\"Answer {i+1}: {states[i]['answer']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Streaming \n",
+    "\n",
+    "Use `stream` to stream the output to the user."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def text_qa(s, question):\n",
+    "    s += user(question)\n",
+    "    s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
+    "\n",
+    "\n",
+    "state = text_qa.run(\n",
+    "    question=\"What is the capital of France?\", temperature=0.1, stream=True\n",
+    ")\n",
+    "\n",
+    "for out in state.text_iter():\n",
+    "    print(out, end=\"\", flush=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex Prompts\n",
+    "\n",
+    "You may use `{system|user|assistant}_{begin|end}` to define complex prompts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def chat_example(s):\n",
+    "    s += system(\"You are a helpful assistant.\")\n",
+    "    # Same as: s += s.system(\"You are a helpful assistant.\")\n",
+    "\n",
+    "    with s.user():\n",
+    "        s += \"Question: What is the capital of France?\"\n",
+    "\n",
+    "    s += assistant_begin()\n",
+    "    s += \"Answer: \" + gen(\"answer\", max_tokens=100, stop=\"\\n\")\n",
+    "    s += assistant_end()\n",
+    "\n",
+    "\n",
+    "state = chat_example()\n",
+    "print_highlight(state[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Multi-modal Generation\n",
+    "\n",
+    "You may use SGLang frontend language to define multi-modal prompts.\n",
+    "See [here](https://docs.sglang.ai/supported_models/generative_models.html) for supported models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0 --log-level warning\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")\n",
+    "print(f\"Server started on http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Ask a question about an image."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def image_qa(s, image_file, question):\n",
+    "    s += user(image(image_file) + question)\n",
+    "    s += assistant(gen(\"answer\", max_tokens=256))\n",
+    "\n",
+    "\n",
+    "image_url = \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
+    "image_bytes, _ = load_image(image_url)\n",
+    "state = image_qa(image_bytes, \"What is in the image?\")\n",
+    "print_highlight(state[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "terminate_process(server_process)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/references/learn_more.md b/docs/references/learn_more.md
new file mode 100644
index 000000000..b1a8a17da
--- /dev/null
+++ b/docs/references/learn_more.md
@@ -0,0 +1,7 @@
+# Learn more
+
+You can find more blogs, slides, and videos about SGLang at [https://github.com/sgl-project/sgl-learning-materials](https://github.com/sgl-project/sgl-learning-materials).
+
+The latest SGLang features and updates are shared through the [LMSYS blog](https://lmsys.org/blog/).
+
+The 2025 H2 roadmap can be found at this [issue](https://github.com/sgl-project/sglang/issues/7736).
diff --git a/docs/references/multi_node_deployment/deploy_on_k8s.md b/docs/references/multi_node_deployment/deploy_on_k8s.md
new file mode 100644
index 000000000..cfc099f56
--- /dev/null
+++ b/docs/references/multi_node_deployment/deploy_on_k8s.md
@@ -0,0 +1,337 @@
+# Deploy On Kubernetes
+
+This document is for deploying a RoCE network-based SGLang two-node inference service on a Kubernetes (K8S) cluster.
+
+[LeaderWorkerSet (LWS)](https://github.com/kubernetes-sigs/lws) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. A major use case is for multi-host/multi-node distributed inference.
+
+SGLang can also be deployed with LWS on Kubernetes for distributed model serving.
+
+Please see this guide for more details on deploying SGLang on Kubernetes using LWS.
+
+Here we take the deployment of DeepSeek-R1 as an example.
+
+## Prerequisites
+
+1. At least two Kubernetes nodes, each with two H20 systems and eight GPUs, are required.
+
+2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md). **Note:** For LWS versions ≤0.5.x, you must use the Downward API to obtain `LWS_WORKER_INDEX`, as native support for this feature was introduced in v0.6.0.
+
+## Basic example
+
+For the basic example documentation, refer to [Deploy Distributed Inference Service with SGLang and LWS on GPUs](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/sglang).
+
+However, that document only covers the basic NCCL socket mode.
+
+In this section, we’ll make some simple modifications to adapt the setup to the RDMA scenario.
+
+## RDMA RoCE case
+
+* Check your env:
+
+```bash
+[root@node1 ~]# ibstatus
+Infiniband device 'mlx5_bond_0' port 1 status:
+        default gid:     fe80:0000:0000:0000:0225:9dff:fe64:c79a
+        base lid:        0x0
+        sm lid:          0x0
+        state:           4: ACTIVE
+        phys state:      5: LinkUp
+        rate:            200 Gb/sec (2X NDR)
+        link_layer:      Ethernet
+
+Infiniband device 'mlx5_bond_1' port 1 status:
+        default gid:     fe80:0000:0000:0000:0225:9dff:fe6e:c3ec
+        base lid:        0x0
+        sm lid:          0x0
+        state:           4: ACTIVE
+        phys state:      5: LinkUp
+        rate:            200 Gb/sec (2X NDR)
+        link_layer:      Ethernet
+
+Infiniband device 'mlx5_bond_2' port 1 status:
+        default gid:     fe80:0000:0000:0000:0225:9dff:fe73:0dd7
+        base lid:        0x0
+        sm lid:          0x0
+        state:           4: ACTIVE
+        phys state:      5: LinkUp
+        rate:            200 Gb/sec (2X NDR)
+        link_layer:      Ethernet
+
+Infiniband device 'mlx5_bond_3' port 1 status:
+        default gid:     fe80:0000:0000:0000:0225:9dff:fe36:f7ff
+        base lid:        0x0
+        sm lid:          0x0
+        state:           4: ACTIVE
+        phys state:      5: LinkUp
+        rate:            200 Gb/sec (2X NDR)
+        link_layer:      Ethernet
+```
+
+* Prepare the `lws.yaml` file for deploying on k8s.
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: sglang
+spec:
+  replicas: 1
+  leaderWorkerTemplate:
+    size: 2
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        dnsPolicy: ClusterFirstWithHostNet
+        hostNetwork: true
+        hostIPC: true
+        containers:
+          - name: sglang-leader
+            image: sglang:latest
+            securityContext:
+              privileged: true
+            env:
+              - name: NCCL_IB_GID_INDEX
+                value: "3"
+            command:
+              - python3
+              - -m
+              - sglang.launch_server
+              - --model-path
+              - /work/models
+              - --mem-fraction-static
+              -  "0.93"
+              - --torch-compile-max-bs
+              - "8"
+              - --max-running-requests
+              - "20"
+              - --tp
+              - "16" # Size of Tensor Parallelism
+              - --dist-init-addr
+              - $(LWS_LEADER_ADDRESS):20000
+              - --nnodes
+              - $(LWS_GROUP_SIZE)
+              - --node-rank
+              - $(LWS_WORKER_INDEX)
+              - --trust-remote-code
+              - --host
+              - "0.0.0.0"
+              - --port
+              - "40000"
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+            ports:
+              - containerPort: 40000
+            readinessProbe:
+              tcpSocket:
+                port: 40000
+              initialDelaySeconds: 15
+              periodSeconds: 10
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - name: model
+                mountPath: /work/models
+              - name: ib
+                mountPath: /dev/infiniband
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: model
+            hostPath:
+              path: '< your models dir >' # modify it according your models dir
+          - name: ib
+            hostPath:
+              path: /dev/infiniband
+    workerTemplate:
+      spec:
+        dnsPolicy: ClusterFirstWithHostNet
+        hostNetwork: true
+        hostIPC: true
+        containers:
+          - name: sglang-worker
+            image: sglang:latest
+            securityContext:
+              privileged: true
+            env:
+            - name: NCCL_IB_GID_INDEX
+              value: "3"
+            command:
+              - python3
+              - -m
+              - sglang.launch_server
+              - --model-path
+              - /work/models
+              - --mem-fraction-static
+              - "0.93"
+              - --torch-compile-max-bs
+              - "8"
+              - --max-running-requests
+              - "20"
+              - --tp
+              - "16" # Size of Tensor Parallelism
+              - --dist-init-addr
+              - $(LWS_LEADER_ADDRESS):20000
+              - --nnodes
+              - $(LWS_GROUP_SIZE)
+              - --node-rank
+              - $(LWS_WORKER_INDEX)
+              - --trust-remote-code
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - name: model
+                mountPath: /work/models
+              - name: ib
+                mountPath: /dev/infiniband
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: ib
+            hostPath:
+              path: /dev/infiniband
+          - name: model
+            hostPath:
+              path: /data1/models/deepseek_v3_moe
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: sglang-leader
+spec:
+  selector:
+    leaderworkerset.sigs.k8s.io/name: sglang
+    role: leader
+  ports:
+    - protocol: TCP
+      port: 40000
+      targetPort: 40000
+
+```
+
+* Then use  `kubectl apply -f lws.yaml` you will get this output.
+
+```text
+NAME           READY   STATUS    RESTARTS       AGE
+sglang-0       0/1     Running   0              9s
+sglang-0-1     1/1     Running   0              9s
+```
+
+Wait for the sglang leader (`sglang-0`) status to change to 1/1, which indicates it is `Ready`.
+
+You can use the command `kubectl logs -f sglang-0` to view the logs of the leader node.
+
+Once successful, you should see output like this:
+
+```text
+[2025-02-17 05:27:24 TP1] Capture cuda graph end. Time elapsed: 84.89 s
+[2025-02-17 05:27:24 TP6] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP0] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP7] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP3] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP2] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP4] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP1] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24 TP5] max_total_num_tokens=712400, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=50, context_len=163840
+[2025-02-17 05:27:24] INFO:     Started server process [1]
+[2025-02-17 05:27:24] INFO:     Waiting for application startup.
+[2025-02-17 05:27:24] INFO:     Application startup complete.
+[2025-02-17 05:27:24] INFO:     Uvicorn running on http://0.0.0.0:40000 (Press CTRL+C to quit)
+[2025-02-17 05:27:25] INFO:     127.0.0.1:48908 - "GET /get_model_info HTTP/1.1" 200 OK
+[2025-02-17 05:27:25 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0
+[2025-02-17 05:27:32] INFO:     127.0.0.1:48924 - "POST /generate HTTP/1.1" 200 OK
+[2025-02-17 05:27:32] The server is fired up and ready to roll!
+```
+
+If it doesn’t start up successfully, please follow these steps to check for any remaining issues. Thanks!
+
+### Debug
+
+* Set `NCCL_DEBUG=TRACE` to check if it is a NCCL communication problem.
+
+This should resolve most NCCL-related issues.
+
+***Notice: If you find that NCCL_DEBUG=TRACE is not effective in the container environment, but the process is stuck or you encounter hard-to-diagnose issues, try switching to a different container image. Some images may not handle standard error output properly.***
+
+#### RoCE scenario
+
+* Please make sure that RDMA devices are available in the cluster environment.
+* Please make sure that the nodes in the cluster have Mellanox NICs with RoCE. In this example, we use Mellanox ConnectX 5 model NICs, and the proper OFED driver has been installed. If not, please refer to the document [Install OFED Driver](https://docs.nvidia.com/networking/display/mlnxofedv461000/installing+mellanox+ofed) to install the driver.
+* Check your env:
+
+  ```shell
+  $ lspci -nn | grep Eth | grep Mellanox
+  0000:7f:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  0000:7f:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  0000:c7:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  0000:c7:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  0001:08:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  0001:08:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  0001:a2:00.0 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  0001:a2:00.1 Ethernet controller [0200]: Mellanox Technologies MT43244 BlueField-3 integrated ConnectX-7 network controller [15b3:a2dc] (rev 01)
+  ```
+
+* Check the OFED driver:
+
+  ```shell
+  ofed_info -s
+  OFED-internal-23.07-0.5.0:
+  ```
+
+* Show RDMA link status and check IB devices:
+
+  ```shell
+  $ rdma link show
+  8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
+  9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
+  10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
+  11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
+
+  $ ibdev2netdev
+  8/1: mlx5_bond_0/1: state ACTIVE physical_state LINK_UP netdev reth0
+  9/1: mlx5_bond_1/1: state ACTIVE physical_state LINK_UP netdev reth2
+  10/1: mlx5_bond_2/1: state ACTIVE physical_state LINK_UP netdev reth4
+  11/1: mlx5_bond_3/1: state ACTIVE physical_state LINK_UP netdev reth6
+  ```
+
+* Test RoCE network speed on the host:
+
+  ```shell
+  yum install qperf
+  # for server：
+  execute qperf
+  # for client
+  qperf -t 60 -cm1 <server_ip>   rc_rdma_write_bw
+  ```
+
+* Check RDMA accessible in your container:
+
+  ```shell
+  # ibv_devices
+  # ibv_devinfo
+  ```
+
+## Keys to success
+
+* In the YAML configuration above, pay attention to the NCCL environment variable. For older versions of NCCL, you should check the NCCL_IB_GID_INDEX environment setting.
+* NCCL_SOCKET_IFNAME is also crucial, but in a containerized environment, this typically isn’t an issue.
+* In some cases, it’s necessary to configure GLOO_SOCKET_IFNAME correctly.
+* NCCL_DEBUG is essential for troubleshooting, but I've found that sometimes it doesn't show error logs within containers. This could be related to the Docker image you're using. You may want to try switching images if needed.
+* Avoid using Docker images based on Ubuntu 18.04, as they tend to have compatibility issues.
+
+## Remaining issues
+
+* In Kubernetes, Docker, or Containerd environments, we use hostNetwork to prevent performance degradation.
+* We utilize privileged mode, which  isn’t secure. Additionally, in containerized environments, full GPU isolation cannot be achieved.
+
+## TODO
+
+* Integrated with [k8s-rdma-shared-dev-plugin](https://github.com/Mellanox/k8s-rdma-shared-dev-plugin).
diff --git a/docs/references/multi_node_deployment/lws_pd/lws-examples/d-svc.yaml b/docs/references/multi_node_deployment/lws_pd/lws-examples/d-svc.yaml
new file mode 100644
index 000000000..27f98009e
--- /dev/null
+++ b/docs/references/multi_node_deployment/lws_pd/lws-examples/d-svc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-decode-main
+spec:
+  selector:
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
+    role: leader
+  ports:
+    - protocol: TCP
+      port: 30000
+      targetPort: 30000
diff --git a/docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml b/docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
new file mode 100644
index 000000000..ac1d295eb
--- /dev/null
+++ b/docs/references/multi_node_deployment/lws_pd/lws-examples/d.yaml
@@ -0,0 +1,290 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: deepseekr10528-decode-main
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --port
+          - "30000"
+          - --host
+          - "0.0.0.0"
+          - --model-path
+          - /work/models
+          - --chunked-prefill-size
+          - "262144"
+          - --page-size
+          - "64"
+          - --enable-dp-attention
+          - --enable-dp-lm-head
+          - --dp-size
+          - "16"
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - decode
+          - --mem-fraction-static
+          - "0.849"
+          - --context-length
+          - "32768"
+          - --disaggregation-ib-device
+          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+          - --cuda-graph-max-bs
+          - "64"
+          - --max-running-requests
+          - "2048"
+          - --tp-size
+          - "16" # Size of Tensor Parallelism
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          env:
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+          - name:  NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:latest
+          name: sglang-leader
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+        # should modify according your deployment env
+          pd: "yes"
+        tolerations:
+        # should modify according your deployment env
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - hostPath:
+            path: /data1/sgl_cache1
+            type: DirectoryOrCreate
+          name: sgl-cache
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+    restartPolicy: RecreateGroupOnPodRestart
+    size:  2
+    workerTemplate:
+      metadata: {}
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --model-path
+          - /work/models
+          - --chunked-prefill-size
+          - "262144"
+          - --page-size
+          - "64"
+          - --enable-dp-attention
+          - --enable-dp-lm-head
+          - --dp-size
+          - "16"
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - decode
+          - --mem-fraction-static
+          - "0.849"
+          - --context-length
+          - "32768"
+          - --disaggregation-ib-device
+          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+          - --cuda-graph-max-bs
+          - "64"
+          - --max-running-requests
+          - "2048"
+          - --tp-size
+          - "16" # Size of Tensor Parallelism
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          env:
+          - name: NVSHMEM_IB_TRAFFIC_CLASS
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+          - name:  NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:latest
+          name: sglang-worker
+          ports:
+          - containerPort: 30001
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+        # should modify according your deployment env
+          pd: "yes"
+        tolerations:
+        # should modify according your deployment env
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - hostPath:
+            path: /data1/sgl_cache1
+            type: DirectoryOrCreate
+          name: sgl-cache
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+  networkConfig:
+    subdomainPolicy: Shared
+  replicas: 1
+  rolloutStrategy:
+    rollingUpdateConfiguration:
+      maxSurge: 0
+      maxUnavailable: 1
+    type: RollingUpdate
+  startupPolicy: LeaderCreated
diff --git a/docs/references/multi_node_deployment/lws_pd/lws-examples/lb.yaml b/docs/references/multi_node_deployment/lws_pd/lws-examples/lb.yaml
new file mode 100644
index 000000000..4ca690969
--- /dev/null
+++ b/docs/references/multi_node_deployment/lws_pd/lws-examples/lb.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: deepseekr10528-lb-main
+  labels:
+    app: deepseekr10528-lb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseekr10528-lb
+  template:
+    metadata:
+      labels:
+        app: deepseekr10528-lb
+    spec:
+      nodeSelector:
+          bo: "yes"
+      tolerations:
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+      containers:
+        - name: sgl-minilb
+          image: lmsysorg/sglang:latest
+          command:
+          - python
+          - -m
+          - sglang_router.launch_router
+          - --pd-disaggregation
+          - --prefill
+          - http://deepseekr10528-prefill-main:30000
+          - --decode
+          - http://deepseekr10528-decode-main:30000
+          - --host
+          - 0.0.0.0
+          - --port
+          -  "8000"
+          ports:
+            - containerPort: 8000
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-lb-service
+spec:
+  type: NodePort # NodePort is easy to test, you can also specify `ClusterIP`
+  selector:
+    app: deepseekr10528-lb
+  ports:
+    - protocol: TCP
+      port: 8000         # Service Port（In-Cluster）
+      targetPort: 8000   # Exposed Container
+      nodePort: 30800
diff --git a/docs/references/multi_node_deployment/lws_pd/lws-examples/p-svc.yaml b/docs/references/multi_node_deployment/lws_pd/lws-examples/p-svc.yaml
new file mode 100644
index 000000000..6826a13df
--- /dev/null
+++ b/docs/references/multi_node_deployment/lws_pd/lws-examples/p-svc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-prefill-main
+spec:
+  selector:
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
+    role: leader
+  ports:
+    - protocol: TCP
+      port: 30000
+      targetPort: 30000
diff --git a/docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml b/docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
new file mode 100644
index 000000000..62df262bb
--- /dev/null
+++ b/docs/references/multi_node_deployment/lws_pd/lws-examples/p.yaml
@@ -0,0 +1,304 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: deepseekr10528-prefill-main
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --port
+          - "30000"
+          - --host
+          - "0.0.0.0"
+          - --model-path
+          - /work/models
+          - --disaggregation-ib-device
+          # should modify according your rdma env
+          - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+          - --chunked-prefill-size
+          - "524288"
+          - --max-prefill-tokens
+          - "32768"
+          - --page-size
+          - "64"
+          - --ep-dispatch-algorithm
+          - dynamic
+          - --eplb-algorithm
+          - deepseek
+          - --enable-dp-lm-head
+          - --enable-dp-attention
+          - --dp-size
+          - "16"
+          - --disable-radix-cache
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - prefill
+          - --mem-fraction-static
+          - "0.7"
+          - --context-length
+          - "32768"
+          - --tp
+          - "16"
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          - --max-running-requests
+          - "1024"
+          env:
+          - name: NVSHMEM_HCA_PE_MAPPING
+            # should modify according your rdma env
+            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "false"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:latest
+          name: sglang-leader
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+          - mountPath: /root/.cache
+            name: sgl-cache
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+        # should modify according your deployment env
+          pd: "yes"
+        tolerations:
+        # should modify according your deployment env
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+        - hostPath:
+            path: /data1/sgl_cache
+            type: DirectoryOrCreate
+          name: sgl-cache
+    restartPolicy: RecreateGroupOnPodRestart
+    size: 2
+    workerTemplate:
+      metadata: {}
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --model-path
+          - /work/models
+          - --disaggregation-ib-device
+          # should modify according your rdma env
+          - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+          - --chunked-prefill-size
+          - "524288"
+          - --max-prefill-tokens
+          - "32768"
+          - --page-size
+          - "64"
+          - --ep-dispatch-algorithm
+          - dynamic
+          - --eplb-algorithm
+          - deepseek
+          #          - --deepep-config
+          #          -  /home/aiges/tuned/tuned_8sms.json
+          # can be tuned using deepep test scripts
+          - --enable-dp-lm-head
+          - --enable-dp-attention
+          - --dp-size
+          - "16"
+          - --disable-radix-cache
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - prefill
+          - --mem-fraction-static
+          - "0.7"
+          - --context-length
+          - "32768"
+          - --tp
+          - "16"
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          - --max-running-requests
+          - "1024"
+          env:
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            # should modify according your rdma env
+            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: NVSHMEM_IB_TRAFFIC_CLASS
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "8"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
+            value: "0"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:latest
+          name: sglang-worker
+          ports:
+          - containerPort: 30001
+            protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+        # should modify according your deployment env
+          pd: "yes"
+        tolerations:
+        # should modify according your deployment env
+        - key: bopd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/sgl_cache
+            type: DirectoryOrCreate
+          name: sgl-cache
diff --git a/docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md b/docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
new file mode 100644
index 000000000..eb8454997
--- /dev/null
+++ b/docs/references/multi_node_deployment/lws_pd/lws_pd_deploy.md
@@ -0,0 +1,783 @@
+# LWS Based PD Deploy
+
+## 0. Prerequisites
+
+1. k8s >=1.26
+2. lws installed on k8s.
+
+## 1. Image Preparation
+
+`lmsysorg/sglang:deepep`
+
+## 2. Deployment Manifest Files
+
+***Notice: We will package all deployment files into Helm Chart format in the near future. Interested community members can contact us to contribute***
+
+### Prefill
+
+Prefill manifest file [prefill.yaml](lws-examples/p.yaml)
+
+*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: deepseekr10528-prefill-main
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --port
+          - "30000"
+          - --host
+          - "0.0.0.0"
+          - --model-path
+          - /work/models
+          - --disaggregation-ib-device
+          # should modify according your rdma env
+          - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+          - --chunked-prefill-size
+          - "524288"
+          - --max-prefill-tokens
+          - "32768"
+          - --page-size
+          - "64"
+          #          - --init-expert-location
+          #          - /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
+          - --ep-dispatch-algorithm
+          - dynamic
+          - --eplb-algorithm
+          - deepseek
+          #          - --deepep-config
+          #          -  /home/aiges/tuned/tuned_8sms.json
+          - --enable-dp-lm-head
+          - --enable-dp-attention
+          - --dp-size
+          - "16"
+          - --disable-radix-cache
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - prefill
+          - --mem-fraction-static
+          - "0.7"
+          - --context-length
+          - "32768"
+          - --tp
+          - "16"
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          - --max-running-requests
+          - "1024"
+          env:
+#          - name: NVSHMEM_HCA_PE_MAPPING
+#            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+#          - name: NVSHMEM_HCA_LIST
+#            value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "false"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:deepep
+          name: sglang-leader
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+          - mountPath: /root/.cache
+            name: sgl-cache
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+          pd: "yes"
+        tolerations:
+        - key: pd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/sgl_cache
+            type: DirectoryOrCreate
+          name: sgl-cache
+    restartPolicy: RecreateGroupOnPodRestart
+    size: 2
+    workerTemplate:
+      metadata: {}
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --model-path
+          - /work/models
+          - --disaggregation-ib-device
+          - mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
+          - --chunked-prefill-size
+          - "524288"
+          - --max-prefill-tokens
+          - "32768"
+          - --page-size
+          - "64"
+          #- --init-expert-location
+          #- /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
+          - --ep-dispatch-algorithm
+          - dynamic
+          - --eplb-algorithm
+          - deepseek
+#          - --deepep-config
+#          -  /home/aiges/tuned/tuned_8sms.json
+          - --enable-dp-lm-head
+          - --enable-dp-attention
+          - --dp-size
+          - "16"
+          - --disable-radix-cache
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - prefill
+          - --mem-fraction-static
+          - "0.7"
+          - --context-length
+          - "32768"
+          - --tp
+          - "16"
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          - --max-running-requests
+          - "1024"
+          env:
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: SGLANG_HACK_DEEPEP_NUM_SMS
+            value: "8"
+          - name: SGLANG_HACK_DEEPEP_NEW_MODE
+            value: "0"
+#          - name: NVSHMEM_HCA_PE_MAPPING
+#            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
+#          - name: NVSHMEM_HCA_LIST
+#            value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: NVSHMEM_IB_TRAFFIC_CLASS
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "8"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
+            value: "0"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:deepep
+          name: sglang-worker
+          ports:
+          - containerPort: 30001
+            protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+          pd: "yes"
+        tolerations:
+        - key: pd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+        - hostPath:
+            path: /data1/sgl_cache
+            type: DirectoryOrCreate
+          name: sgl-cache
+
+```
+
+### Decode
+
+Decode node deployment manifest file [decode.yaml](lws-examples/d.yaml)
+
+*Note: The NodeSelector section, model location section, and taint toleration section can be adjusted according to your actual deployment environment*
+
+```yaml
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: deepseekr10528-decode-main
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --port
+          - "30000"
+          - --host
+          - "0.0.0.0"
+          - --model-path
+          - /work/models
+          - --chunked-prefill-size
+          - "262144"
+          - --page-size
+          - "64"
+          - --enable-dp-attention
+          - --enable-dp-lm-head
+          - --dp-size
+          - "16"
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - decode
+          - --mem-fraction-static
+          -  "0.849"
+          - --context-length
+          - "32768"
+          - --disaggregation-ib-device
+          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+          - --cuda-graph-max-bs
+          - "64"
+          - --max-running-requests
+          - "2048"
+          - --tp-size
+          - "16" # Size of Tensor Parallelism
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          env:
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name:  NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:deepep
+          name: sglang-leader
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+          pd: "yes"
+        tolerations:
+        - key: pd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - hostPath:
+            path: /data1/sgl_cache1
+            type: DirectoryOrCreate
+          name: sgl-cache
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+    restartPolicy: RecreateGroupOnPodRestart
+    size:  2
+    workerTemplate:
+      metadata: {}
+      spec:
+        containers:
+        - command:
+          - python3
+          - -m
+          - sglang.launch_server
+          - --model-path
+          - /work/models
+          - --chunked-prefill-size
+          - "262144"
+          - --page-size
+          - "64"
+          - --enable-dp-attention
+          - --enable-dp-lm-head
+            #- --enable-two-batch-overlap
+          - --dp-size
+          - "16"
+          - --moe-a2a-backend
+          - deepep
+          - --disaggregation-mode
+          - decode
+          - --mem-fraction-static
+          -  "0.849"
+          - --context-length
+          - "32768"
+          - --disaggregation-ib-device
+          # should modify according your rdma env
+          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
+          - --cuda-graph-max-bs
+          - "64"
+          - --max-running-requests
+          - "2048"
+          - --tp-size
+          - "16" # Size of Tensor Parallelism
+          - --dist-init-addr
+          - $(LWS_LEADER_ADDRESS):20102
+          - --nnodes
+          - $(LWS_GROUP_SIZE)
+          - --node-rank
+          - $(LWS_WORKER_INDEX)
+          - --trust-remote-code
+          - --ep-num-redundant-experts
+          - "32"
+          - --moe-dense-tp-size
+          - "1"
+          env:
+          - name: SGLANG_HACK_DEEPEP_NUM_SMS
+            value: "24"
+          - name: SGLANG_HACK_DEEPEP_NEW_MODE
+            value: "0"
+          - name: NVSHMEM_IB_TRAFFIC_CLASS
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name:  NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_IB_TC
+            value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: NCCL_IB_SL
+            value: "5"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: lmsysorg/sglang:deepep
+          name: sglang-worker
+          ports:
+          - containerPort: 30001
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          securityContext:
+            capabilities:
+              add:
+              - IPC_LOCK
+            privileged: true
+          volumeMounts:
+          - mountPath: /root/.cache
+            name: sgl-cache
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /work/models
+            name: model
+          - mountPath: /dev/infiniband
+            name: ib
+          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
+            name: cf
+        dnsPolicy: ClusterFirstWithHostNet
+        hostIPC: true
+        hostNetwork: true
+        nodeSelector:
+          pd: "yes"
+        tolerations:
+        - key: pd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+        volumes:
+        - hostPath:
+            path: /data1/sgl_cache1
+            type: DirectoryOrCreate
+          name: sgl-cache
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /dev/infiniband
+          name: ib
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
+          name: model
+        - hostPath:
+            # modify according to you deployment env
+            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
+          name: cf
+  networkConfig:
+    subdomainPolicy: Shared
+  replicas: 1
+  rolloutStrategy:
+    rollingUpdateConfiguration:
+      maxSurge: 0
+      maxUnavailable: 1
+    type: RollingUpdate
+  startupPolicy: LeaderCreated
+```
+
+Execute separately:
+
+```bash
+kubectl apply -f p.yaml
+kubectl apply -f d.yaml
+```
+
+At this point, we have completed the deployment of the 1P1D SGlang engine part.
+
+To allow our users to directly experience the model API, we still need a load balancer to handle sequential calls between prefill and decode. Different companies implement LBs differently, and the community will also officially release a new LB component written in Rust in the near future.
+
+Currently, we use a static K8S service + minilb approach to implement model API calls.
+
+### Creating Service for Prefill and Decode
+
+#### Create prefill k8s service
+[p-svc.yaml](lws-examples/p-svc.yaml)
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-prefill-main
+spec:
+  selector:
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
+    role: leader
+  ports:
+    - protocol: TCP
+      port: 30000
+      targetPort: 30000
+```
+Execute `kubectl apply -f p-svc.yaml`
+
+#### Create decode k8s service
+[d-svc.yaml](lws-examples/d-svc.yaml)
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-decode-main
+spec:
+  selector:
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
+    role: leader
+  ports:
+    - protocol: TCP
+      port: 30000
+      targetPort: 30000
+```
+Execute `kubectl apply -f d-svc.yaml`
+
+#### Deploy minilb and lb service
+[lb.yaml](lws-examples/lb.yaml)
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: deepseekr10528-lb-main
+  labels:
+    app: deepseekr10528-lb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseekr10528-lb
+  template:
+    metadata:
+      labels:
+        app: deepseekr10528-lb
+    spec:
+      nodeSelector:
+          pd: "yes"
+      tolerations:
+        - key: pd
+          operator: Exists
+        - key: node-role
+          operator: Exists
+      containers:
+        - name: sgl-minilb
+          image: lmsysorg/sglang:deepep
+          command:
+          - python
+          - -m
+          - sglang_router.launch_router
+          - --pd-disaggregation
+          - --prefill
+          - http://deepseekr10528-prefill-main:30000
+          - --decode
+          - http://deepseekr10528-decode-main:30000
+          - --host
+          - 0.0.0.0
+          - --port
+          -  "8000"
+          ports:
+            - containerPort: 8000
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-lb-service
+spec:
+  type: NodePort
+  selector:
+    app: deepseekr10528-lb
+  ports:
+    - protocol: TCP
+      port: 8000         # Service Port（In-Cluster）
+      targetPort: 8000   # Exposed Container
+      nodePort: 30800
+```
+Execute `kubectl apply -f lb.yaml`
+
+After waiting for all model deployments to succeed, you will get the following output:
+
+```bash
+[root@ecs-001]# kubectl get po
+deepseekr10528-decode-main-0             1/1     Running   0          74m
+deepseekr10528-decode-main-0-1           1/1     Running   0          74m
+deepseekr10528-lb-main-9c5dbfc57-6lcbd   1/1     Running   0          22m
+deepseekr10528-prefill-main-0            1/1     Running   0          74m
+deepseekr10528-prefill-main-0-1          1/1     Running   0          74m
+[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl  get svc |grep dee
+deepseekr10528-decode-main    ClusterIP   None             <none>        <none>           97m
+deepseekr10528-lb-service     NodePort    172.16.242.169   <none>        8000:30800/TCP   22m
+deepseekr10528-prefill-main   ClusterIP   None             <none>        <none>           97m
+```
+
+At this point, select a nodePort:30800 to access:
+
+```bash
+[root@ecs-001]# curl -X POST "http://{nodePort}:30800/v1/chat/completions" \
+>     -H "Content-Type: application/json" \
+>     -H "Authorization: Bearer None" \
+>     -d '{
+>        "rid":"ccccdd",
+>         "model": "r1",
+>         "messages": [
+>             {"role": "system", "content": "0: You are a helpful AI assistant"},
+>             {"role": "user", "content": "你是谁？."}
+>         ],
+>         "max_tokens":221
+>     }'
+{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"<think>\n嗯，用户问了一个很基础的自我介绍问题"你是谁？"。这可能是第一次互动时的常规开场白，也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息，语气简洁中性。这种场景下新用户的可能性较高，需要给出清晰友好的自我介绍，同时突出实用价值来降低陌生感。\n\n考虑到中文用户，应该用简体中文回复。重点要说明三点：身份归属（深度求索）、功能定位（AI助手）、服务范围（学习/工作/生活）。结尾用开放性问题引导对话很关键——既能了解需求，又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气，那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量，避免显得轻浮。\n</think>\n你好呀！我是你的AI助手，由深度求索公司（DeepSeek）开发的语言模型，名字叫 **DeepSeek-R1**。你可以把我当成一个知识丰富、随叫随到的小帮手～😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}}
+
+```
+## FAQ
+
+1. The current deployment startup parameters may not be fully compatible with all RDMA scenarios. Different RDMA NCCL-related environment configurations may be needed in different network environments.
+
+2. Some preset, optimized configurations for EPLB are not used here. You can adjust them according to [6017](https://github.com/sgl-project/sglang/issues/6017) as needed.
diff --git a/docs/references/multi_node_deployment/multi_node.md b/docs/references/multi_node_deployment/multi_node.md
new file mode 100644
index 000000000..204b60586
--- /dev/null
+++ b/docs/references/multi_node_deployment/multi_node.md
@@ -0,0 +1,90 @@
+# Multi-Node Deployment
+
+## Llama 3.1 405B
+
+**Run 405B (fp16) on Two Nodes**
+
+```bash
+# replace 172.16.4.52:20000 with your own node ip address and port of the first node
+
+python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0
+
+python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1
+```
+
+Note that LLama 405B (fp8) can also be launched on a single node.
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+```
+
+## DeepSeek V3/R1
+
+Please refer to [DeepSeek documents for reference](https://docs.sglang.ai/basic_usage/deepseek.html#running-examples-on-multi-node).
+
+## Multi-Node Inference on SLURM
+
+This example showcases how to serve SGLang server across multiple nodes by SLURM. Submit the following job to the SLURM cluster.
+
+```
+#!/bin/bash -l
+
+#SBATCH -o SLURM_Logs/%x_%j_master.out
+#SBATCH -e SLURM_Logs/%x_%j_master.err
+#SBATCH -D ./
+#SBATCH -J Llama-405B-Online-Inference-TP16-SGL
+
+#SBATCH --nodes=2
+#SBATCH --ntasks=2
+#SBATCH --ntasks-per-node=1  # Ensure 1 task per node
+#SBATCH --cpus-per-task=18
+#SBATCH --mem=224GB
+#SBATCH --partition="lmsys.org"
+#SBATCH --gres=gpu:8
+#SBATCH --time=12:00:00
+
+echo "[INFO] Activating environment on node $SLURM_PROCID"
+if ! source ENV_FOLDER/bin/activate; then
+    echo "[ERROR] Failed to activate environment" >&2
+    exit 1
+fi
+
+# Define parameters
+model=MODEL_PATH
+tp_size=16
+
+echo "[INFO] Running inference"
+echo "[INFO] Model: $model"
+echo "[INFO] TP Size: $tp_size"
+
+# Set NCCL initialization address using the hostname of the head node
+HEAD_NODE=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
+NCCL_INIT_ADDR="${HEAD_NODE}:8000"
+echo "[INFO] NCCL_INIT_ADDR: $NCCL_INIT_ADDR"
+
+# Launch the model server on each node using SLURM
+srun --ntasks=2 --nodes=2 --output="SLURM_Logs/%x_%j_node$SLURM_NODEID.out" \
+    --error="SLURM_Logs/%x_%j_node$SLURM_NODEID.err" \
+    python3 -m sglang.launch_server \
+    --model-path "$model" \
+    --grammar-backend "xgrammar" \
+    --tp "$tp_size" \
+    --dist-init-addr "$NCCL_INIT_ADDR" \
+    --nnodes 2 \
+    --node-rank "$SLURM_NODEID" &
+
+# Wait for the NCCL server to be ready on port 30000
+while ! nc -z "$HEAD_NODE" 30000; do
+    sleep 1
+    echo "[INFO] Waiting for $HEAD_NODE:30000 to accept connections"
+done
+
+echo "[INFO] $HEAD_NODE:30000 is ready to accept connections"
+
+# Keep the script running until the SLURM job times out
+wait
+```
+
+Then, you can test the server by sending requests following other [documents](https://docs.sglang.ai/backend/openai_api_completions.html).
+
+Thanks for [aflah02](https://github.com/aflah02) for providing the example, based on his [blog post](https://aflah02.substack.com/p/multi-node-llm-inference-with-sglang).
diff --git a/docs/references/multi_node_deployment/multi_node_index.rst b/docs/references/multi_node_deployment/multi_node_index.rst
new file mode 100644
index 000000000..03411f5be
--- /dev/null
+++ b/docs/references/multi_node_deployment/multi_node_index.rst
@@ -0,0 +1,13 @@
+Multi-Node Deployment
+=====================
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Multi-Node Deployment
+
+   multi_node.md
+   deploy_on_k8s.md
+   lws_pd/lws_pd_deploy.md
+
+- `Deploying DeepSeek with PD Disaggregation and Large-Scale Expert Parallelism on 96 H100 GPUs <https://lmsys.org/blog/2025-05-05-large-scale-ep/>`_
+- `Deploying Kimi K2 with PD Disaggregation and Large-Scale Expert Parallelism on 128 H200 GPUs <https://lmsys.org/blog/2025-07-20-k2-large-scale-ep/>`_
diff --git a/docs/references/production_metrics.md b/docs/references/production_metrics.md
new file mode 100644
index 000000000..16afaca67
--- /dev/null
+++ b/docs/references/production_metrics.md
@@ -0,0 +1,217 @@
+# Production Metrics
+
+SGLang exposes the following metrics via Prometheus. You can enable it by adding `--enable-metrics` when you launch the server.
+
+An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json).
+
+Here is an example of the metrics:
+
+```
+$ curl http://localhost:30000/metrics
+# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
+# TYPE sglang:prompt_tokens_total counter
+sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.128902e+06
+# HELP sglang:generation_tokens_total Number of generation tokens processed.
+# TYPE sglang:generation_tokens_total counter
+sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.557572e+06
+# HELP sglang:token_usage The token usage
+# TYPE sglang:token_usage gauge
+sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.28
+# HELP sglang:cache_hit_rate The cache hit rate
+# TYPE sglang:cache_hit_rate gauge
+sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.007507552643049313
+# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE sglang:time_to_first_token_seconds histogram
+sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2.3518979474117756e+06
+sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
+sglang:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.0
+sglang:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.25",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:time_to_first_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 27.0
+sglang:time_to_first_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0
+sglang:time_to_first_token_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 314.0
+sglang:time_to_first_token_seconds_bucket{le="7.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 941.0
+sglang:time_to_first_token_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1330.0
+sglang:time_to_first_token_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1970.0
+sglang:time_to_first_token_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2326.0
+sglang:time_to_first_token_seconds_bucket{le="25.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2417.0
+sglang:time_to_first_token_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2513.0
+sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
+sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
+# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
+# TYPE sglang:e2e_request_latency_seconds histogram
+sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.116093850019932e+06
+sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
+sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="1.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="2.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
+sglang:e2e_request_latency_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
+sglang:e2e_request_latency_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 10.0
+sglang:e2e_request_latency_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11.0
+sglang:e2e_request_latency_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 14.0
+sglang:e2e_request_latency_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 247.0
+sglang:e2e_request_latency_seconds_bucket{le="40.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 486.0
+sglang:e2e_request_latency_seconds_bucket{le="50.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 845.0
+sglang:e2e_request_latency_seconds_bucket{le="60.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1513.0
+sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
+sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
+# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE sglang:time_per_output_token_seconds histogram
+sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 866964.5791549598
+sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
+sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 73.0
+sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 382.0
+sglang:time_per_output_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 593.0
+sglang:time_per_output_token_seconds_bucket{le="0.025",model_name="meta-llama/Llama-3.1-8B-Instruct"} 855.0
+sglang:time_per_output_token_seconds_bucket{le="0.03",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1035.0
+sglang:time_per_output_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1815.0
+sglang:time_per_output_token_seconds_bucket{le="0.05",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11685.0
+sglang:time_per_output_token_seconds_bucket{le="0.075",model_name="meta-llama/Llama-3.1-8B-Instruct"} 433413.0
+sglang:time_per_output_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 4.950195e+06
+sglang:time_per_output_token_seconds_bucket{le="0.15",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.039435e+06
+sglang:time_per_output_token_seconds_bucket{le="0.2",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.171662e+06
+sglang:time_per_output_token_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.266055e+06
+sglang:time_per_output_token_seconds_bucket{le="0.4",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.296752e+06
+sglang:time_per_output_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.312226e+06
+sglang:time_per_output_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.339675e+06
+sglang:time_per_output_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.357747e+06
+sglang:time_per_output_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.389414e+06
+sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
+sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
+# HELP sglang:func_latency_seconds Function latency in seconds
+# TYPE sglang:func_latency_seconds histogram
+sglang:func_latency_seconds_sum{name="generate_request"} 4.514771912145079
+sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.253125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.3796875",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.56953125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="0.8542968750000001",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="1.2814453125",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="1.9221679687500002",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="2.8832519531250003",name="generate_request"} 14006.0
+sglang:func_latency_seconds_bucket{le="4.3248779296875",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="6.487316894531251",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="9.730975341796876",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="14.596463012695313",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="21.89469451904297",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="32.84204177856446",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="49.26306266784668",name="generate_request"} 14007.0
+sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 14007.0
+sglang:func_latency_seconds_count{name="generate_request"} 14007.0
+# HELP sglang:num_running_reqs The number of running requests
+# TYPE sglang:num_running_reqs gauge
+sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 162.0
+# HELP sglang:num_used_tokens The number of used tokens
+# TYPE sglang:num_used_tokens gauge
+sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 123859.0
+# HELP sglang:gen_throughput The generate throughput (token/s)
+# TYPE sglang:gen_throughput gauge
+sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 86.50814177726902
+# HELP sglang:num_queue_reqs The number of requests in the waiting queue
+# TYPE sglang:num_queue_reqs gauge
+sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2826.0
+```
+
+## Setup Guide
+
+This section describes how to set up the monitoring stack (Prometheus + Grafana) provided in the `examples/monitoring` directory.
+
+### Prerequisites
+
+- Docker and Docker Compose installed
+- SGLang server running with metrics enabled
+
+### Usage
+
+1.  **Start your SGLang server with metrics enabled:**
+
+    ```bash
+    python -m sglang.launch_server --model-path <your_model_path> --port 30000 --enable-metrics
+    ```
+    Replace `<your_model_path>` with the actual path to your model (e.g., `meta-llama/Meta-Llama-3.1-8B-Instruct`). Ensure the server is accessible from the monitoring stack (you might need `--host 0.0.0.0` if running in Docker). By default, the metrics endpoint will be available at `http://<sglang_server_host>:30000/metrics`.
+
+2.  **Navigate to the monitoring example directory:**
+    ```bash
+    cd examples/monitoring
+    ```
+
+3.  **Start the monitoring stack:**
+    ```bash
+    docker compose up -d
+    ```
+    This command will start Prometheus and Grafana in the background.
+
+4.  **Access the monitoring interfaces:**
+    *   **Grafana:** Open your web browser and go to [http://localhost:3000](http://localhost:3000).
+    *   **Prometheus:** Open your web browser and go to [http://localhost:9090](http://localhost:9090).
+
+5.  **Log in to Grafana:**
+    *   Default Username: `admin`
+    *   Default Password: `admin`
+    You will be prompted to change the password upon your first login.
+
+6.  **View the Dashboard:**
+    The SGLang dashboard is pre-configured and should be available automatically. Navigate to `Dashboards` -> `Browse` -> `SGLang Monitoring` folder -> `SGLang Dashboard`.
+
+### Troubleshooting
+
+*   **Port Conflicts:** If you encounter errors like "port is already allocated," check if other services (including previous instances of Prometheus/Grafana) are using ports `9090` or `3000`. Use `docker ps` to find running containers and `docker stop <container_id>` to stop them, or use `lsof -i :<port>` to find other processes using the ports. You might need to adjust the ports in the `docker-compose.yaml` file if they permanently conflict with other essential services on your system.
+
+To modify Grafana's port to the other one(like 3090) in your Docker Compose file, you need to explicitly specify the port mapping under the grafana service.
+
+    Option 1: Add GF_SERVER_HTTP_PORT to the environment section:
+    ```
+      environment:
+    - GF_AUTH_ANONYMOUS_ENABLED=true
+    - GF_SERVER_HTTP_PORT=3090  # <-- Add this line
+    ```
+    Option 2: Use port mapping:
+    ```
+    grafana:
+      image: grafana/grafana:latest
+      container_name: grafana
+      ports:
+      - "3090:3000"  # <-- Host:Container port mapping
+    ```
+*   **Connection Issues:**
+    *   Ensure both Prometheus and Grafana containers are running (`docker ps`).
+    *   Verify the Prometheus data source configuration in Grafana (usually auto-configured via `grafana/datasources/datasource.yaml`). Go to `Connections` -> `Data sources` -> `Prometheus`. The URL should point to the Prometheus service (e.g., `http://prometheus:9090`).
+    *   Confirm that your SGLang server is running and the metrics endpoint (`http://<sglang_server_host>:30000/metrics`) is accessible *from the Prometheus container*. If SGLang is running on your host machine and Prometheus is in Docker, use `host.docker.internal` (on Docker Desktop) or your machine's network IP instead of `localhost` in the `prometheus.yaml` scrape configuration.
+*   **No Data on Dashboard:**
+    *   Generate some traffic to your SGLang server to produce metrics. For example, run a benchmark:
+        ```bash
+        python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 100 --random-input 128 --random-output 128
+        ```
+    *   Check the Prometheus UI (`http://localhost:9090`) under `Status` -> `Targets` to see if the SGLang endpoint is being scraped successfully.
+    *   Verify the `model_name` and `instance` labels in your Prometheus metrics match the variables used in the Grafana dashboard. You might need to adjust the Grafana dashboard variables or the labels in your Prometheus configuration.
+
+### Configuration Files
+
+The monitoring setup is defined by the following files within the `examples/monitoring` directory:
+
+*   `docker-compose.yaml`: Defines the Prometheus and Grafana services.
+*   `prometheus.yaml`: Prometheus configuration, including scrape targets.
+*   `grafana/datasources/datasource.yaml`: Configures the Prometheus data source for Grafana.
+*   `grafana/dashboards/config/dashboard.yaml`: Tells Grafana to load dashboards from the specified path.
+*   `grafana/dashboards/json/sglang-dashboard.json`: The actual Grafana dashboard definition in JSON format.
+
+You can customize the setup by modifying these files. For instance, you might need to update the `static_configs` target in `prometheus.yaml` if your SGLang server runs on a different host or port.
+
+#### Check if the metrics are being collected
+
+Run `python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5` to generate some requests.
+
+Then you should be able to see the metrics in the Grafana dashboard.
diff --git a/docs/references/torch_compile_cache.md b/docs/references/torch_compile_cache.md
new file mode 100644
index 000000000..f2bb257f4
--- /dev/null
+++ b/docs/references/torch_compile_cache.md
@@ -0,0 +1,13 @@
+# Enabling cache for torch.compile
+
+SGLang uses `max-autotune-no-cudagraphs` mode of torch.compile. The auto-tuning can be slow.
+If you want to deploy a model on many different machines, you can ship the torch.compile cache to these machines and skip the compilation steps.
+
+This is based on https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html
+
+
+1. Generate the cache by setting TORCHINDUCTOR_CACHE_DIR and running the model once.
+```
+TORCHINDUCTOR_CACHE_DIR=/root/inductor_root_cache python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile
+```
+2. Copy the cache folder to other machines and launch the server with `TORCHINDUCTOR_CACHE_DIR`.
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 000000000..1a7e5d4eb
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,20 @@
+ipykernel
+ipywidgets
+jupyter_client
+markdown>=3.4.0
+matplotlib
+myst-parser
+nbconvert
+nbsphinx
+pandoc
+pillow
+pydantic
+sphinx
+sphinx-book-theme
+sphinx-copybutton
+sphinx-tabs
+nbstripout
+sphinxcontrib-mermaid
+urllib3<2.0.0
+gguf>=0.10.0
+sphinx-autobuild
diff --git a/docs/serve.sh b/docs/serve.sh
new file mode 100644
index 000000000..049f767cf
--- /dev/null
+++ b/docs/serve.sh
@@ -0,0 +1,3 @@
+# Clean and serve documentation with auto-build
+make clean
+make serve
diff --git a/docs/supported_models/embedding_models.md b/docs/supported_models/embedding_models.md
new file mode 100644
index 000000000..437cb8284
--- /dev/null
+++ b/docs/supported_models/embedding_models.md
@@ -0,0 +1,87 @@
+# Embedding Models
+
+SGLang provides robust support for embedding models by integrating efficient serving mechanisms with its flexible programming interface. This integration allows for streamlined handling of embedding tasks, facilitating faster and more accurate retrieval and semantic search operations. SGLang's architecture enables better resource utilization and reduced latency in embedding model deployment.
+
+```{important}
+Embedding models are executed with `--is-embedding` flag and some may require `--trust-remote-code`
+```
+
+## Quick Start
+
+### Launch Server
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Qwen/Qwen3-Embedding-4B \
+  --is-embedding \
+  --host 0.0.0.0 \
+  --port 30000
+```
+
+### Client Request
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000"
+
+payload = {
+    "model": "Qwen/Qwen3-Embedding-4B",
+    "input": "What is the capital of France?",
+    "encoding_format": "float"
+}
+
+response = requests.post(url + "/v1/embeddings", json=payload).json()
+print("Embedding:", response["data"][0]["embedding"])
+```
+
+
+
+## Multimodal Embedding Example
+
+For multimodal models like GME that support both text and images:
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct \
+  --is-embedding \
+  --chat-template gme-qwen2-vl \
+  --host 0.0.0.0 \
+  --port 30000
+```
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000"
+
+text_input = "Represent this image in embedding space."
+image_path = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
+
+payload = {
+    "model": "gme-qwen2-vl",
+    "input": [
+        {
+            "text": text_input
+        },
+        {
+            "image": image_path
+        }
+    ],
+}
+
+response = requests.post(url + "/v1/embeddings", json=payload).json()
+
+print("Embeddings:", [x.get("embedding") for x in response.get("data", [])])
+```
+
+## Supported Models
+
+| Model Family                               | Example Model                          | Chat Template | Description                                                                 |
+| ------------------------------------------ | -------------------------------------- | ------------- | --------------------------------------------------------------------------- |
+| **E5 (Llama/Mistral based)**              | `intfloat/e5-mistral-7b-instruct`     | N/A           | High-quality text embeddings based on Mistral/Llama architectures          |
+| **GTE-Qwen2**                             | `Alibaba-NLP/gte-Qwen2-7B-instruct`   | N/A           | Alibaba's text embedding model with multilingual support                   |
+| **Qwen3-Embedding**                       | `Qwen/Qwen3-Embedding-4B`             | N/A           | Latest Qwen3-based text embedding model for semantic representation        |
+| **BGE**                                    | `BAAI/bge-large-en-v1.5`              | N/A           | BAAI's text embeddings (requires `attention-backend` triton/torch_native)  |
+| **GME (Multimodal)**                      | `Alibaba-NLP/gme-Qwen2-VL-2B-Instruct`| `gme-qwen2-vl`| Multimodal embedding for text and image cross-modal tasks                  |
+| **CLIP**                                   | `openai/clip-vit-large-patch14-336`   | N/A           | OpenAI's CLIP for image and text embeddings                                |
diff --git a/docs/supported_models/generative_models.md b/docs/supported_models/generative_models.md
new file mode 100644
index 000000000..d6d3cdd45
--- /dev/null
+++ b/docs/supported_models/generative_models.md
@@ -0,0 +1,56 @@
+# Large Language Models
+
+These models accept text input and produce text output (e.g., chat completions). They are primarily large language models (LLMs), some with mixture-of-experts (MoE) architectures for scaling.
+
+## Example launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path meta-llama/Llama-3.2-1B-Instruct \  # example HF/local path
+  --host 0.0.0.0 \
+  --port 30000 \
+```
+
+## Supported models
+
+Below the supported models are summarized in a table.
+
+If you are unsure if a specific architecture is implemented, you can search for it via GitHub. For example, to search for `Qwen3ForCausalLM`, use the expression:
+
+```
+repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen3ForCausalLM
+```
+
+in the GitHub search bar.
+
+| Model Family (Variants)             | Example HuggingFace Identifier                     | Description                                                                            |
+|-------------------------------------|--------------------------------------------------|----------------------------------------------------------------------------------------|
+| **DeepSeek** (v1, v2, v3/R1)        | `deepseek-ai/DeepSeek-R1`                        | Series of advanced reasoning-optimized models (including a 671B MoE) trained with reinforcement learning; top performance on complex reasoning, math, and code tasks. [SGLang provides Deepseek v3/R1 model-specific optimizations](../basic_usage/deepseek.md) and [Reasoning Parser](../advanced_features/separate_reasoning.ipynb)|
+| **GPT-OSS**       | `openai/gpt-oss-20b`, `openai/gpt-oss-120b`       | OpenAI’s latest GPT-OSS series for complex reasoning, agentic tasks, and versatile developer use cases.|
+| **Qwen** (3, 3MoE, 3Next, 2.5, 2 series)       | `Qwen/Qwen3-0.6B`, `Qwen/Qwen3-30B-A3B` `Qwen/Qwen3-Next-80B-A3B-Instruct `      | Alibaba’s latest Qwen3 series for complex reasoning, language understanding, and generation tasks; Support for MoE variants along with previous generation 2.5, 2, etc. [SGLang provides Qwen3 specific reasoning parser](../advanced_features/separate_reasoning.ipynb)|
+| **Llama** (2, 3.x, 4 series)        | `meta-llama/Llama-4-Scout-17B-16E-Instruct`       | Meta's open LLM series, spanning 7B to 400B parameters (Llama 2, 3, and new Llama 4) with well-recognized performance. [SGLang provides Llama-4 model-specific optimizations](../basic_usage/llama4.md)  |
+| **Mistral** (Mixtral, NeMo, Small3) | `mistralai/Mistral-7B-Instruct-v0.2`             | Open 7B LLM by Mistral AI with strong performance; extended into MoE (“Mixtral”) and NeMo Megatron variants for larger scale. |
+| **Gemma** (v1, v2, v3)              | `google/gemma-3-1b-it`                            | Google’s family of efficient multilingual models (1B–27B); Gemma 3 offers a 128K context window, and its larger (4B+) variants support vision input. |
+| **Phi** (Phi-1.5, Phi-2, Phi-3, Phi-4, Phi-MoE series) | `microsoft/Phi-4-multimodal-instruct`, `microsoft/Phi-3.5-MoE-instruct` | Microsoft’s Phi family of small models (1.3B–5.6B); Phi-4-multimodal (5.6B) processes text, images, and speech, Phi-4-mini is a high-accuracy text model and Phi-3.5-MoE is a mixture-of-experts model. |
+| **MiniCPM** (v3, 4B)               | `openbmb/MiniCPM3-4B`                            | OpenBMB’s series of compact LLMs for edge devices; MiniCPM 3 (4B) achieves GPT-3.5-level results in text tasks. |
+| **OLMoE** (Open MoE)               | `allenai/OLMoE-1B-7B-0924`                       | Allen AI’s open Mixture-of-Experts model (7B total, 1B active parameters) delivering state-of-the-art results with sparse expert activation. |
+| **StableLM** (3B, 7B)               | `stabilityai/stablelm-tuned-alpha-7b`            | StabilityAI’s early open-source LLM (3B & 7B) for general text generation; a demonstration model with basic instruction-following ability. |
+| **Command-R** (Cohere)              | `CohereForAI/c4ai-command-r-v01`                 | Cohere’s open conversational LLM (Command series) optimized for long context, retrieval-augmented generation, and tool use. |
+| **DBRX** (Databricks)              | `databricks/dbrx-instruct`                       | Databricks’ 132B-parameter MoE model (36B active) trained on 12T tokens; competes with GPT-3.5 quality as a fully open foundation model. |
+| **Grok** (xAI)                     | `xai-org/grok-1`                                | xAI’s grok-1 model known for vast size(314B parameters) and high quality; integrated in SGLang for high-performance inference. |
+| **ChatGLM** (GLM-130B family)       | `THUDM/chatglm2-6b`                              | Zhipu AI’s bilingual chat model (6B) excelling at Chinese-English dialogue; fine-tuned for conversational quality and alignment. |
+| **InternLM 2** (7B, 20B)           | `internlm/internlm2-7b`                          | Next-gen InternLM (7B and 20B) from SenseTime, offering strong reasoning and ultra-long context support (up to 200K tokens). |
+| **ExaONE 3** (Korean-English)      | `LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct`           | LG AI Research’s Korean-English model (7.8B) trained on 8T tokens; provides high-quality bilingual understanding and generation. |
+| **Baichuan 2** (7B, 13B)           | `baichuan-inc/Baichuan2-13B-Chat`                | BaichuanAI’s second-generation Chinese-English LLM (7B/13B) with improved performance and an open commercial license. |
+| **XVERSE** (MoE)                   | `xverse/XVERSE-MoE-A36B`                         | Yuanxiang’s open MoE LLM (XVERSE-MoE-A36B: 255B total, 36B active) supporting ~40 languages; delivers 100B+ dense-level performance via expert routing. |
+| **SmolLM** (135M–1.7B)            | `HuggingFaceTB/SmolLM-1.7B`                      | Hugging Face’s ultra-small LLM series (135M–1.7B params) offering surprisingly strong results, enabling advanced AI on mobile/edge devices. |
+| **GLM-4** (Multilingual 9B)        | `ZhipuAI/glm-4-9b-chat`                          | Zhipu’s GLM-4 series (up to 9B parameters) – open multilingual models with support for 1M-token context and even a 5.6B multimodal variant (Phi-4V). |
+| **MiMo** (7B series)               | `XiaomiMiMo/MiMo-7B-RL`                         | Xiaomi's reasoning-optimized model series, leverages Multiple-Token Prediction for faster inference. |
+| **ERNIE-4.5** (4.5, 4.5MoE series) | `baidu/ERNIE-4.5-21B-A3B-PT`                    | Baidu's ERNIE-4.5 series which consists of MoE with 47B and 3B active parameters, with the largest model having 424B total parameters, as well as a 0.3B dense model. |
+| **Arcee AFM-4.5B**               | `arcee-ai/AFM-4.5B-Base`                         | Arcee's foundational model series for real world reliability and edge deployments. |
+| **Persimmon** (8B)               | `adept/persimmon-8b-chat`                         | Adept’s open 8B model with a 16K context window and fast inference; trained for broad usability and licensed under Apache 2.0. |
+| **Ling** (16.8B–290B) | `inclusionAI/Ling-lite`, `inclusionAI/Ling-plus` | InclusionAI’s open MoE models. Ling-Lite has 16.8B total / 2.75B active parameters, and Ling-Plus has 290B total / 28.8B active parameters. They are designed for high performance on NLP and complex reasoning tasks. |
+| **Granite 3.0, 3.1** (IBM)               | `ibm-granite/granite-3.1-8b-instruct`                          | IBM's open dense foundation models optimized for reasoning, code, and business AI use cases. Integrated with Red Hat and watsonx systems. |
+| **Granite 3.0 MoE** (IBM)               | `ibm-granite/granite-3.0-3b-a800m-instruct`                          | IBM’s Mixture-of-Experts models offering strong performance with cost-efficiency. MoE expert routing designed for enterprise deployment at scale. |
+| **Llama Nemotron Super** (v1, v1.5, NVIDIA) | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, `nvidia/Llama-3_3-Nemotron-Super-49B-v1_5` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family builds on the strongest open models in the ecosystem by enhancing them with greater accuracy, efficiency, and transparency using NVIDIA open synthetic datasets, advanced techniques, and tools. This enables the creation of practical, right-sized, and high-performing AI agents. |
+| **Llama Nemotron Ultra** (v1, NVIDIA) | `nvidia/Llama-3_1-Nemotron-Ultra-253B-v1` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family builds on the strongest open models in the ecosystem by enhancing them with greater accuracy, efficiency, and transparency using NVIDIA open synthetic datasets, advanced techniques, and tools. This enables the creation of practical, right-sized, and high-performing AI agents. |
diff --git a/docs/supported_models/modelscope.md b/docs/supported_models/modelscope.md
new file mode 100644
index 000000000..4740c2770
--- /dev/null
+++ b/docs/supported_models/modelscope.md
@@ -0,0 +1,28 @@
+# Use Models From ModelScope
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`.
+
+```bash
+export SGLANG_USE_MODELSCOPE=true
+```
+
+We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example.
+
+Launch the Server:
+```bash
+python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+Or start it by docker:
+
+```bash
+docker run --gpus all \
+    -p 30000:30000 \
+    -v ~/.cache/modelscope:/root/.cache/modelscope \
+    --env "SGLANG_USE_MODELSCOPE=true" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```
+
+Note that modelscope uses a different cache directory than huggingface. You may need to set it manually to avoid running out of disk space.
diff --git a/docs/supported_models/multimodal_language_models.md b/docs/supported_models/multimodal_language_models.md
new file mode 100644
index 000000000..a2adf99cb
--- /dev/null
+++ b/docs/supported_models/multimodal_language_models.md
@@ -0,0 +1,42 @@
+# Multimodal Language Models
+
+These models accept multi-modal inputs (e.g., images and text) and generate text output. They augment language models with multimodal encoders.
+
+## Example launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \  # example HF/local path
+  --host 0.0.0.0 \
+  --port 30000 \
+```
+
+## Supported models
+
+Below the supported models are summarized in a table.
+
+If you are unsure if a specific architecture is implemented, you can search for it via GitHub. For example, to search for `Qwen2_5_VLForConditionalGeneration`, use the expression:
+
+```
+repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen2_5_VLForConditionalGeneration
+```
+
+in the GitHub search bar.
+
+
+| Model Family (Variants)    | Example HuggingFace Identifier             | Chat Template    | Description                                                                                                                                                                                                     |
+|----------------------------|--------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| **Qwen-VL** (Qwen2 series) | `Qwen/Qwen2.5-VL-7B-Instruct`              | `qwen2-vl`       | Alibaba’s vision-language extension of Qwen; for example, Qwen2.5-VL (7B and larger variants) can analyze and converse about image content.                                                                     |
+| **DeepSeek-VL2**           | `deepseek-ai/deepseek-vl2`                 | `deepseek-vl2`   | Vision-language variant of DeepSeek (with a dedicated image processor), enabling advanced multimodal reasoning on image and text inputs.                                                                        |
+| **Janus-Pro** (1B, 7B)     | `deepseek-ai/Janus-Pro-7B`                 | `janus-pro`      | DeepSeek’s open-source multimodal model capable of both image understanding and generation. Janus-Pro employs a decoupled architecture for separate visual encoding paths, enhancing performance in both tasks. |
+| **MiniCPM-V / MiniCPM-o**  | `openbmb/MiniCPM-V-2_6`                    | `minicpmv`       | MiniCPM-V (2.6, ~8B) supports image inputs, and MiniCPM-o adds audio/video; these multimodal LLMs are optimized for end-side deployment on mobile/edge devices.                                                 |
+| **Llama 3.2 Vision** (11B) | `meta-llama/Llama-3.2-11B-Vision-Instruct` | `llama_3_vision` | Vision-enabled variant of Llama 3 (11B) that accepts image inputs for visual question answering and other multimodal tasks.                                                                                     |
+| **LLaVA** (v1.5 & v1.6)    | *e.g.* `liuhaotian/llava-v1.5-13b`         | `vicuna_v1.1`    | Open vision-chat models that add an image encoder to LLaMA/Vicuna (e.g. LLaMA2 13B) for following multimodal instruction prompts.                                                                               |
+| **LLaVA-NeXT** (8B, 72B)   | `lmms-lab/llava-next-72b`                  | `chatml-llava`   | Improved LLaVA models (with an 8B Llama3 version and a 72B version) offering enhanced visual instruction-following and accuracy on multimodal benchmarks.                                                       |
+| **LLaVA-OneVision**        | `lmms-lab/llava-onevision-qwen2-7b-ov`     | `chatml-llava`   | Enhanced LLaVA variant integrating Qwen as the backbone; supports multiple images (and even video frames) as inputs via an OpenAI Vision API-compatible format.                                                 |
+| **Gemma 3 (Multimodal)**   | `google/gemma-3-4b-it`                     | `gemma-it`       | Gemma 3's larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context.                                                                        |
+| **Kimi-VL** (A3B)          | `moonshotai/Kimi-VL-A3B-Instruct`          | `kimi-vl`        | Kimi-VL is a multimodal model that can understand and generate text from images.                                                                                                                                |
+| **Mistral-Small-3.1-24B**  | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral`   | Mistral 3.1 is a multimodal model that can generate text from text or images input. It also supports tool calling and structured output. |
+| **Phi-4-multimodal-instruct**  | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm`   | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. It supports text, vision and audio modalities in SGLang. |
+| **MiMo-VL** (7B)           | `XiaomiMiMo/MiMo-VL-7B-RL`                 | `mimo-vl`        | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |
+| **GLM-4.5V** (106B) /  **GLM-4.1V**(9B)           | `zai-org/GLM-4.5V`                   | `glm-4v`         | GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning                                                                                                                                                                                                      |
diff --git a/docs/supported_models/rerank_models.md b/docs/supported_models/rerank_models.md
new file mode 100644
index 000000000..b6f2ffa20
--- /dev/null
+++ b/docs/supported_models/rerank_models.md
@@ -0,0 +1,49 @@
+# Rerank Models
+
+SGLang offers comprehensive support for rerank models by incorporating optimized serving frameworks with a flexible programming interface. This setup enables efficient processing of cross-encoder reranking tasks, improving the accuracy and relevance of search result ordering. SGLang’s design ensures high throughput and low latency during reranker model deployment, making it ideal for semantic-based result refinement in large-scale retrieval systems.
+
+```{important}
+They are executed with `--is-embedding` and some may require `--trust-remote-code`
+```
+
+## Example Launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path BAAI/bge-reranker-v2-m3 \
+  --host 0.0.0.0 \
+  --disable-radix-cache \
+  --chunked-prefill-size -1 \
+  --attention-backend triton \
+  --is-embedding \
+  --port 30000
+```
+
+## Example Client Request
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000/v1/rerank"
+
+payload = {
+    "model": "BAAI/bge-reranker-v2-m3",
+    "query": "what is panda?",
+    "documents": [
+        "hi",
+        "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
+    ]
+}
+
+response = requests.post(url, json=payload)
+response_json = response.json()
+
+for item in response_json:
+    print(f"Score: {item['score']:.2f} - Document: '{item['document']}'")
+```
+
+## Supported rerank models
+
+| Model Family (Rerank)                          | Example HuggingFace Identifier       | Chat Template | Description                                                                                                                      |
+|------------------------------------------------|--------------------------------------|---------------|----------------------------------------------------------------------------------------------------------------------------------|
+| **BGE-Reranker (BgeRerankModel)**              | `BAAI/bge-reranker-v2-m3`            | N/A           | Currently only support `attention-backend`   `triton` and `torch_native`.  high-performance cross-encoder reranker model from BAAI. Suitable for reranking search results based on semantic relevance.   |
diff --git a/docs/supported_models/reward_models.md b/docs/supported_models/reward_models.md
new file mode 100644
index 000000000..ef4474637
--- /dev/null
+++ b/docs/supported_models/reward_models.md
@@ -0,0 +1,28 @@
+# Reward Models
+
+These models output a scalar reward score or classification result, often used in reinforcement learning or content moderation tasks.
+
+```{important}
+They are executed with `--is-embedding` and some may require `--trust-remote-code`.
+```
+
+## Example launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Qwen/Qwen2.5-Math-RM-72B \  # example HF/local path
+  --is-embedding \
+  --host 0.0.0.0 \
+  --tp-size=4 \                          # set for tensor parallelism
+  --port 30000 \
+```
+
+## Supported models
+
+| Model Family (Reward)                                                     | Example HuggingFace Identifier                              | Description                                                                     |
+|---------------------------------------------------------------------------|-----------------------------------------------------|---------------------------------------------------------------------------------|
+| **Llama (3.1 Reward / `LlamaForSequenceClassification`)**                   | `Skywork/Skywork-Reward-Llama-3.1-8B-v0.2`            | Reward model (preference classifier) based on Llama 3.1 (8B) for scoring and ranking responses for RLHF.  |
+| **Gemma 2 (27B Reward / `Gemma2ForSequenceClassification`)**                | `Skywork/Skywork-Reward-Gemma-2-27B-v0.2`             | Derived from Gemma‑2 (27B), this model provides human preference scoring for RLHF and multilingual tasks.  |
+| **InternLM 2 (Reward / `InternLM2ForRewardMode`)**                         | `internlm/internlm2-7b-reward`                       | InternLM 2 (7B)–based reward model used in alignment pipelines to guide outputs toward preferred behavior.  |
+| **Qwen2.5 (Reward - Math / `Qwen2ForRewardModel`)**                         | `Qwen/Qwen2.5-Math-RM-72B`                           | A 72B math-specialized RLHF reward model from the Qwen2.5 series, tuned for evaluating and refining responses.  |
+| **Qwen2.5 (Reward - Sequence / `Qwen2ForSequenceClassification`)**          | `jason9693/Qwen2.5-1.5B-apeach`                      | A smaller Qwen2.5 variant used for sequence classification, offering an alternative RLHF scoring mechanism.  |
diff --git a/docs/supported_models/support_new_models.md b/docs/supported_models/support_new_models.md
new file mode 100644
index 000000000..511a8f398
--- /dev/null
+++ b/docs/supported_models/support_new_models.md
@@ -0,0 +1,320 @@
+# How to Support New Models
+
+This document explains how to add support for new language models and multimodal large language models (MLLMs) in
+SGLang. It also covers how to test new models and register external implementations.
+
+## How to Support a New Language Model
+
+To support a new model in SGLang, you only need to add a single file under
+the [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models). You can learn
+from existing model implementations and create a new file for your model. For most models, you should be able to find a
+similar model to start with (e.g., starting from Llama). Also refer how
+to [port a Model from vLLM to SGLang](#port-a-model-from-vllm-to-sglang)
+
+## How to Support a New Multimodal Large Language Model
+
+To support a new multimodal large language model (MLLM) in SGLang, there are several key components in addition to the
+standard LLM support:
+
+1. **Register your new model as multimodal**:
+   Extend `is_multimodal_model`
+   in [model_config.py](https://github.com/sgl-project/sglang/blob/0ab3f437aba729b348a683ab32b35b214456efc7/python/sglang/srt/configs/model_config.py#L561)
+   to return `True` for your model.
+
+2. **Register a new chat-template**:
+   Only when your default chat-template is unable to accept images as input: Register a new chat template in [conversation.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/conversation.py) and the corresponding matching function.
+
+3. **Multimodal Data Processor**:
+   Define a new `Processor` class that inherits from `BaseMultimodalProcessor` and register this processor as your
+   model’s dedicated processor.
+   See [multimodal_processor.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/multimodal/processors)
+   for more details.
+
+4. **Handle Multimodal Tokens**:
+   Implement a `pad_input_ids` function for your new model. In this function, multimodal tokens in the prompt should be
+   expanded (if necessary) and padded with multimodal-data-hashes so that SGLang can recognize different multimodal data
+   with `RadixAttention`.
+
+5. **Handle Image Feature Extraction**:
+   Implement a `get_image_feature` function for your new model, which extracts image features from raw image data and converts them into the embeddings used by the language model.
+
+6. **Adapt to Vision Attention**:
+   Adapt the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.
+
+You can refer to [Qwen2VL](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/qwen2_vl.py) or
+other mllm implementations. These models demonstrate how to correctly handle both multimodal and textual inputs.
+
+## Testing and Debugging
+
+Please note all your testing and benchmarking results in PR description.
+
+### Interactive Debugging
+
+For interactive debugging, compare the outputs of Hugging Face/Transformers and SGLang. The following two commands
+should give the same text output and very similar prefill logits:
+
+- Get the reference output:
+  ```bash
+  python3 scripts/playground/reference_hf.py --model-path [new model] --model-type {text,mllm}
+  ```
+- Get the SGLang output:
+  ```bash
+  python3 -m sglang.bench_one_batch --correct --model [new model]
+  ```
+
+### Add the Model to the Test Suite
+
+To ensure the new model is well maintained, add it to the test suite by including it in the `ALL_OTHER_MODELS` list in
+the [test_generation_models.py](https://github.com/sgl-project/sglang/blob/main/test/srt/models/test_generation_models.py)
+file, test the new model on your local machine and report the results on demonstrative benchmarks (GSM8K, MMLU, MMMU,
+MMMU-Pro, etc.) in your PR. \\
+For VLMs, also include a test in `test_vision_openai_server_{x}.py` (e.g. [test_vision_openai_server_a.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_a.py), [test_vision_openai_server_b.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_b.py)).
+
+
+This is an example command to run to test a new model on your local machine:
+
+```bash
+ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others
+```
+
+### Benchmark
+
+- **(Required) MMMU**: follow MMMU benchmark [README.md](https://github.com/sgl-project/sglang/blob/main/benchmark/mmmu/README.md) to get SGLang vs. HF Transformer accuracy comparison. The accuracy score from SGLang run should not be much lower than that from HF Transformer run. Similarly, follow https://docs.sglang.ai/developer_guide/benchmark_and_profiling.html to get performance comparison: TTFT and throughput must meet or exceed baselines (e.g., HF Transformer).
+- **(Optional) Other evals**: If you ran other evals, please note the results in PR description.
+
+## Port a Model from vLLM to SGLang
+
+The [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) is a valuable
+resource, as vLLM covers many models. SGLang reuses vLLM’s interface and some layers, making it easier to port models
+from vLLM to SGLang.
+
+To port a model from vLLM to SGLang:
+
+- Compare these two files for guidance:
+    - [SGLang Llama Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama.py)
+    - [vLLM Llama Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py)
+- The major differences include:
+    - **Replace vLLM’s `Attention` with `RadixAttention`** (ensure you pass `layer_id` to `RadixAttention`).
+    - **Replace vLLM’s `LogitsProcessor` with SGLang’s `LogitsProcessor`.**
+    - **Replace the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.**
+    - **Replace other vLLM layers** (such as `RMSNorm`, `SiluAndMul`) with SGLang layers.
+    - **Remove `Sample`.**
+    - **Change the `forward()` functions** and add a `forward_batch()` method.
+    - **Add `EntryClass`** at the end.
+    - **Ensure that the new implementation uses only SGLang components** and does not rely on any vLLM components.
+
+Note: make sure you add your new model to the supported models list in the supported models documentation.
+
+## Registering an External Model Implementation
+
+In addition to the methods above, you can register your new model with the `ModelRegistry` before launching the server.
+This allows you to integrate your model without modifying the source code.
+
+For example:
+
+```python
+from sglang.srt.models.registry import ModelRegistry
+from sglang.srt.entrypoints.http_server import launch_server
+
+# For a single model, add it to the registry:
+ModelRegistry.models[model_name] = model_class
+
+# For multiple models, you can imitate the import_model_classes() function:
+from functools import lru_cache
+
+@lru_cache()
+def import_new_model_classes():
+    model_arch_name_to_cls = {}
+    # Populate model_arch_name_to_cls with your new model classes.
+    ...
+    return model_arch_name_to_cls
+
+ModelRegistry.models.update(import_new_model_classes())
+
+# Launch the server with your server arguments:
+launch_server(server_args)
+```
+
+## Example: Implementing and Serving a Llama Wrapper Model
+
+Below is an introductory, step-by-step walkthrough on how to implement a new model end-to-end in SGLang and then run it via the [Offline Engine](https://github.com/sgl-project/sglang/blob/main/docs/basic_usage/offline_engine_api.ipynb).
+
+### Implementing Our Model
+
+To keep things simple, this new model will be a simple wrapper around [Llama 3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), and our goal will be just to bias the output logits for each `forward` call by taking the square root of each individual logit.
+
+Let's start by defining our model in a file called `llama_wrapper.py`.
+The first step is to import the necessary libraries from SRT, which is SGLang's internal backend.
+
+```python
+# In the file `llama_wrapper.py`
+
+import torch
+from transformers import LlamaConfig
+from typing import Optional
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+
+from sglang.srt.models.llama import LlamaForCausalLM
+```
+
+Next, we declare a new `class` for our model and have it inherit from `LlamaForCausalLM`, which allows our model to access `LlamaForCausalLM`'s predefined modules and layers, such as `LlamaAttention` and `LlamaMLP`.
+Note that almost all model implementations take in `config` and `quant_config` as arguments for their `__init__` method; `config` and `quant_config` are passed in via [`model_loader/loader.py`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_loader/loader.py#L219).
+Because we have inherited from `LlamaForCausalLM`, we can pass our parameters directly to its constructor, which will set the member variables for us.
+
+```python
+class LlamaWrapper(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+```
+
+Now, we want to define the `forward` method, which is what will be called at inference time.
+Note that the signature for `forward` is essentially the same for any model; you can take a look at the other models defined in the [`models` directory](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/) for references.
+To see where exactly `forward` is called in the SGLang runtime's internals, take a look at [`forward_decode`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1705) and [`forward_extend`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1724) in the [`ModelRunner` class](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/model_executor/model_runner.py).
+
+```python
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+```
+
+We now call the `__call__` method for `self.model` (which is a member variable that `LlamaForCausalLM` defines in its `__init__` method), which eventually calls `LlamaForCausalLM`'s `forward` method.
+After that, we feed the `hidden_states` into our model's `LogitsProcessor` (again defined in `LlamaForCausalLM`).
+
+```python
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        res: LogitsProcessorOutput = self.logits_processor(
+            input_ids,
+            hidden_states,
+            self.lm_head,
+            forward_batch,
+        )
+```
+
+After receiving the logits for the next token, we can finally perform our biasing step.
+
+```python
+        orig_logits = res.next_token_logits
+        res.next_token_logits = torch.where(
+            orig_logits > 0,
+            orig_logits.sqrt(),
+            orig_logits
+        )
+
+        return res
+```
+Now, our `LlamaWrapper` model is created and ready to be served!
+
+### Serving Our Model Via SGLang's Offline Engine
+
+The next step of this walkthrough involves hosting our new model offline, so that it can be served locally and without an HTTP server.
+
+First, create a new file called `run.py`.
+Now, we must ensure that SGLang's `ModelRegistry` can find our model.
+To do this, we first download the model's configuration and weights from Huggingface.
+
+```python
+# In the file `run.py`
+
+import asyncio
+from functools import lru_cache
+from huggingface_hub import snapshot_download
+from llama_wrapper import LlamaWrapper # Make sure to import our new model!
+import sglang as sgl
+from sglang.srt.models.registry import ModelRegistry
+
+# Make sure to request access to this model on Huggingface, then export your
+# `HF_TOKEN` to download the model snapshot
+llama_dir = snapshot_download(
+    repo_id="meta-llama/Llama-3.1-8B-Instruct",
+    local_dir="./llama_ckpt",
+)
+```
+
+Now that we have our model on disk, we want to point it to `LlamaWrapper` by changing the `architectures` field in `./llama_ckpt/config.json` to be `LlamaWrapper`.
+That way, when we pass in the path of our model checkpoint to SGLang, it will know that we want to use "LlamaWrapper" instead of "LlamaForCausalLM" as our model.
+
+```python
+{
+  "architectures": [
+   #  "LlamaForCausalLM"
+    "LlamaWrapper"
+  ],
+  ...
+}
+```
+
+However, if we don't link our `LlamaWrapper` class to the "LlamaWrapper" registry keyword, then SGLang won't be able to find our model.
+Thus, to register our `LlamaWrapper`, we want to follow the steps in the above section titled "Registering an External Model Implementation".
+
+```python
+@lru_cache()
+def import_new_model_classes():
+    model_arch_name_to_cls = {"LlamaWrapper": LlamaWrapper}
+    return model_arch_name_to_cls
+
+ModelRegistry.models.update(import_new_model_classes())
+```
+
+Lastly, when we create our `Engine`, we just pass in the path to the local model directory.
+Then, our `LlamaWrapper` is ready to be served; for this walkthrough, we will use SGLang `Engine`'s non-streaming asynchronous generation endpoint.
+
+```python
+def main():
+    llm = sgl.Engine(model_path="./llama_ckpt")
+    sampling_params = {"temperature": 0.2, "top_k": 5}
+    prompts = [
+        "Write a short, neutral self-introduction for a fictional character. Hello, my name is",
+        "Provide a concise factual statement about France’s capital city. The capital of France is",
+        "Explain possible future trends in artificial intelligence. The future of AI is",
+    ]
+
+    asyncio.run(run_llm(llm, sampling_params, prompts))
+
+    llm.shutdown()
+
+async def run_llm(
+    llm,
+    sampling_params,
+    prompts,
+) -> None:
+    outputs = await llm.async_generate(prompts, sampling_params)
+
+    for prompt, output in zip(prompts, outputs):
+        print(f"\nPrompt: {prompt}")
+        print(f"Generated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+Now, when we call `python run.py`, we will get the outputs of our newly created model!
+
+
+## Documentation
+Add to table of supported models in [generative_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/generative_models.md) or [multimodal_language_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/multimodal_language_models.md)
+
+---
+
+By following these guidelines, you can add support for new language models and multimodal large language models in
+SGLang and ensure they are thoroughly tested and easily integrated into the system.
diff --git a/docs/supported_models/transformers_fallback.md b/docs/supported_models/transformers_fallback.md
new file mode 100644
index 000000000..3c7dd961c
--- /dev/null
+++ b/docs/supported_models/transformers_fallback.md
@@ -0,0 +1,58 @@
+# Transformers fallback in SGLang
+
+`sglang` can fall back to using models that are available in `transformers`. This works for most decoder-style language models and support for vision-language models is coming soon!
+
+## Example launch Command
+
+By default, we will use sglang implementation if it is available. Otherwise, we will fall back to transformers one. However, you can switch the implementation by setting `--model-impl` to `transformers`.
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path meta-llama/Llama-3.2-1B-Instruct \
+  --host 0.0.0.0 \
+  --port 30000 \
+  --model-impl transformers
+```
+
+## Supported features
+
+### Quantization
+
+Transformers fall back has supported most of available quantization in SGLang (except GGUF). See [Quantization page](../advanced_features/quantization.md) for more information about supported quantization in SGLang.
+
+### Remote code
+
+This fallback also means that any model on the hub that can be used in `transformers` with `trust_remote_code=True` that correctly implements attention can be used in production!
+
+A model just needs the following two things:
+
+```python
+from transformers import PreTrainedModel
+from torch import nn
+
+class MyAttention(nn.Module):
+
+  def forward(self, hidden_states, **kwargs): # <- kwargs are required
+
+    ...
+    attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+    attn_output, attn_weights = attention_interface(
+      self,
+      query_states,
+      key_states,
+      value_states,
+      **kwargs,
+    )
+    ...
+
+class MyModel(PreTrainedModel):
+  _supports_attention_backend = True
+```
+
+Here is what happens in the background:
+
+1. The config is loaded
+2. `MyModel` python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
+3. The `TransformersModel` backend is used. See `/srt/models/transformers`, which leverages `self.config._attn_implementation = "sglang"`, thus the need to use `ALL_ATTENTION_FUNCTIONS`.
+
+That's it!
diff --git a/docs/wrap_run_llm.py b/docs/wrap_run_llm.py
new file mode 100644
index 000000000..2d21442cf
--- /dev/null
+++ b/docs/wrap_run_llm.py
@@ -0,0 +1,47 @@
+import os
+import re
+
+
+def insert_runllm_widget(html_content):
+    # RunLLM Widget script to be inserted
+    widget_script = """
+    <!-- RunLLM Widget Script -->
+    <script type="module" id="runllm-widget-script" src="https://widget.runllm.com" crossorigin="true" version="stable" runllm-keyboard-shortcut="Mod+j" runllm-name="SGLang Chatbot" runllm-position="BOTTOM_RIGHT" runllm-assistant-id="629" async></script>
+    """
+
+    # Find the closing body tag and insert the widget script before it
+    return re.sub(r"</body>", f"{widget_script}\n</body>", html_content)
+
+
+def process_html_files(build_dir):
+    for root, dirs, files in os.walk(build_dir):
+        for file in files:
+            if file.endswith(".html"):
+                file_path = os.path.join(root, file)
+
+                # Read the HTML file
+                with open(file_path, "r", encoding="utf-8") as f:
+                    content = f.read()
+
+                # Insert the RunLLM widget
+                modified_content = insert_runllm_widget(content)
+
+                # Write back the modified content
+                with open(file_path, "w", encoding="utf-8") as f:
+                    f.write(modified_content)
+
+
+def main():
+    # Get the build directory path
+    build_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "_build", "html"
+    )
+    # Process all HTML files
+    if os.path.exists(build_dir):
+        process_html_files(build_dir)
+    else:
+        print(f"Build directory not found: {build_dir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/chat_template/tool_chat_template_deepseekr1.jinja b/examples/chat_template/tool_chat_template_deepseekr1.jinja
new file mode 100644
index 000000000..bb86cbd9e
--- /dev/null
+++ b/examples/chat_template/tool_chat_template_deepseekr1.jinja
@@ -0,0 +1,92 @@
+{% if not add_generation_prompt is defined %}
+    {% set add_generation_prompt = false %}
+{% endif %}
+{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %}
+{%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+        {%- if ns.is_first_sp %}
+            {% set ns.system_prompt = ns.system_prompt + message['content'] %}
+            {% set ns.is_first_sp = false %}
+        {%- else %}
+            {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+
+{# --- Append tool descriptions if tools are defined --- #}
+{% if tools is defined and tools is not none %}
+    {% set tool_ns = namespace(text='You are a helpful assistant with tool calling capabilities. '
+        'When a tool call is needed, you MUST use the following format to issue the call:\n'
+        '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>FUNCTION_NAME\n'
+        '```json\n{"param1": "value1", "param2": "value2"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>\n\n'
+        'Make sure the JSON is valid.'
+        '## Tools\n\n### Function\n\nYou have the following functions available:\n\n') %}
+    {% for tool in tools %}
+        {% set tool_ns.text = tool_ns.text + '- `' + tool['name'] + '`:\n```json\n' + (tool | tojson) + '\n```\n' %}
+    {% endfor %}
+    {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %}
+{% endif %}
+
+{{ bos_token }}
+{{ ns.system_prompt }}
+{%- for message in messages %}
+    {% set content = message['content'] %}
+    {%- if message['role'] == 'user' %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_first = false -%}
+        {%- set ns.is_last_user = true -%}
+        {{'<｜User｜>' + content + '<｜Assistant｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' %}
+        {% if '</think>' in content %}
+            {% set content = content.split('</think>')[-1] %}
+        {% endif %}
+    {% endif %}
+    {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>'}}
+        {%- endif %}
+        {%- set ns.is_first = false %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_output_first = true %}
+        {%- for tool in message['tool_calls'] %}
+            {%- if not ns.is_first %}
+                {%- if content is none %}
+                    {{'<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- else %}
+                    {{content + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- endif %}
+                {%- set ns.is_first = true -%}
+            {%- else %}
+                {{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+            {%- endif %}
+        {%- endfor %}
+        {{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>' + content + '<｜end▁of▁sentence｜>'}}
+            {%- set ns.is_tool = false -%}
+        {%- else %}
+            {{content + '<｜end▁of▁sentence｜>'}}
+        {%- endif %}
+    {%- endif %}
+    {%- if message['role'] == 'tool' %}
+        {%- set ns.is_last_user = false -%}
+        {%- set ns.is_tool = true -%}
+        {%- if ns.is_output_first %}
+            {{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + content + '<｜tool▁output▁end｜>'}}
+            {%- set ns.is_output_first = false %}
+        {%- else %}
+            {{'\n<｜tool▁output▁begin｜>' + content + '<｜tool▁output▁end｜>'}}
+        {%- endif %}
+    {%- endif %}
+{%- endfor -%}
+{% if ns.is_tool %}
+    {{'<｜tool▁outputs▁end｜>'}}
+{% endif %}
+{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
+    {{'<｜Assistant｜>'}}
+{% endif %}
diff --git a/examples/chat_template/tool_chat_template_deepseekv3.jinja b/examples/chat_template/tool_chat_template_deepseekv3.jinja
new file mode 100644
index 000000000..46c1b8801
--- /dev/null
+++ b/examples/chat_template/tool_chat_template_deepseekv3.jinja
@@ -0,0 +1,91 @@
+{% if not add_generation_prompt is defined %}
+    {% set add_generation_prompt = false %}
+{% endif %}
+
+{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %}
+{%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+        {%- if ns.is_first_sp %}
+            {% set ns.system_prompt = ns.system_prompt + message['content'] %}
+            {% set ns.is_first_sp = false %}
+        {%- else %}
+            {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor -%}
+
+{# --- Append tool descriptions if tools are defined --- #}
+{% if tools is defined and tools is not none %}
+    {% set tool_ns = namespace(text='You are a helpful assistant with tool calling capabilities. '
+        'When a tool call is needed, you MUST use the following format to issue the call:\n'
+        '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>FUNCTION_NAME\n'
+        '```json\n{"param1": "value1", "param2": "value2"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>\n\n'
+        'Make sure the JSON is valid.'
+        '## Tools\n\n### Function\n\nYou have the following functions available:\n\n') %}
+    {% for tool in tools %}
+        {% set tool_ns.text = tool_ns.text + '\n```json\n' + (tool | tojson) + '\n```\n' %}
+    {% endfor %}
+    {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %}
+{% endif %}
+
+{{- bos_token }}
+{{- ns.system_prompt }}
+
+{%- for message in messages %}
+    {%- if message['role'] == 'user' %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_first = false -%}
+        {%- set ns.is_last_user = true -%}
+        {{'<｜User｜>' + message['content'] + '<｜Assistant｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{- '<｜tool▁outputs▁end｜>'}}
+        {%- endif %}
+        {%- set ns.is_first = false %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_output_first = true %}
+        {%- for tool in message['tool_calls'] %}
+            {%- if not ns.is_first %}
+                {%- if message['content'] is none %}
+                    {{- '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- else %}
+                    {{- message['content'] + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- endif %}
+                {%- set ns.is_first = true -%}
+            {%- else %}
+                {{- '\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+            {%- endif %}
+        {%- endfor %}
+        {{- '<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{- '<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}
+            {%- set ns.is_tool = false -%}
+        {%- else %}
+            {% set content = message['content'] %}
+            {{- content + '<｜end▁of▁sentence｜>'}}
+        {%- endif %}
+    {%- endif %}
+    {%- if message['role'] == 'tool' %}
+        {%- set ns.is_last_user = false -%}
+        {%- set ns.is_tool = true -%}
+        {%- if ns.is_output_first %}
+            {{- 'Use the results below to formulate an answer to the user question unless additional information is needed.' }}
+            {{- '<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}
+            {%- set ns.is_output_first = false %}
+        {%- else %}
+            {{- '\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}
+        {%- endif %}
+    {%- endif %}
+{%- endfor -%}
+
+{% if ns.is_tool %}
+    {{- '<｜tool▁outputs▁end｜>'}}
+{% endif %}
+{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
+    {{- '<｜Assistant｜>'}}
+{% endif %}
diff --git a/examples/chat_template/tool_chat_template_deepseekv31.jinja b/examples/chat_template/tool_chat_template_deepseekv31.jinja
new file mode 100644
index 000000000..08e93a30a
--- /dev/null
+++ b/examples/chat_template/tool_chat_template_deepseekv31.jinja
@@ -0,0 +1,91 @@
+{% if not add_generation_prompt is defined %}
+  {% set add_generation_prompt = false %}
+{% endif %}
+{% if not thinking is defined %}
+  {% set thinking = false %}
+{% endif %}
+{% set ns = namespace(is_first=false, is_tool=false, system_prompt='', is_first_sp=true, is_last_user=false) %}
+{%- for message in messages %}
+  {%- if message['role'] == 'system' %}
+    {%- if ns.is_first_sp %}
+      {% set ns.system_prompt = ns.system_prompt + message['content'] %}
+      {% set ns.is_first_sp = false %}
+    {%- else %}
+      {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
+    {%- endif %}
+  {%- endif %}
+{%- endfor %}
+
+{% if tools is defined and tools is not none %}
+  {% set tool_ns = namespace(text='## Tools\nYou have access to the following tools:\n') %}
+  {% for tool in tools %}
+    {% set tool_ns.text = tool_ns.text + '\n### ' + tool.function.name + '\nDescription: ' + tool.function.description + '\n\nParameters: ' + (tool.function.parameters | tojson) + '\n' %}
+  {% endfor %}
+  {% set tool_ns.text = tool_ns.text + "\nIMPORTANT: ALWAYS adhere to this exact format for tool use:\n<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>tool_call_name<｜tool▁sep｜>tool_call_arguments<｜tool▁call▁end｜>{{additional_tool_calls}}<｜tool▁calls▁end｜>\n\nWhere:\n\n- `tool_call_name` must be an exact match to one of the available tools\n- `tool_call_arguments` must be valid JSON that strictly follows the tool's Parameters Schema\n- For multiple tool calls, chain them directly without separators or spaces\n" %}
+  {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %}
+{% endif %}
+
+{{ bos_token }}{{ ns.system_prompt }}
+{%- for message in messages %}
+  {%- if message['role'] == 'user' %}
+    {%- set ns.is_tool = false -%}
+    {%- set ns.is_first = false -%}
+    {%- set ns.is_last_user = true -%}
+    {{'<｜User｜>' + message['content']}}
+  {%- endif %}
+  {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
+    {%- if ns.is_last_user %}
+      {{'<｜Assistant｜></think>'}}
+    {%- endif %}
+    {%- set ns.is_last_user = false -%}
+    {%- set ns.is_first = false %}
+    {%- set ns.is_tool = false -%}
+    {%- for tool in message['tool_calls'] %}
+      {%- if not ns.is_first %}
+        {%- if message['content'] is none %}
+          {{'<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>'+ tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments']|tojson + '<｜tool▁call▁end｜>'}}
+        {%- else %}
+          {{message['content'] + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments']|tojson + '<｜tool▁call▁end｜>'}}
+        {%- endif %}
+        {%- set ns.is_first = true -%}
+      {%- else %}
+        {{'<｜tool▁call▁begin｜>'+ tool['function']['name'] + '<｜tool▁sep｜>' + tool['function']['arguments']|tojson + '<｜tool▁call▁end｜>'}}
+      {%- endif %}
+    {%- endfor %}
+    {{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
+  {%- endif %}
+  {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}
+    {%- if ns.is_last_user %}
+      {{'<｜Assistant｜>'}}
+      {%- if message['prefix'] is defined and message['prefix'] and thinking %}
+        {{'<think>'}}
+      {%- else %}
+        {{'</think>'}}
+      {%- endif %}
+    {%- endif %}
+    {%- set ns.is_last_user = false -%}
+    {%- if ns.is_tool %}
+      {{message['content'] + '<｜end▁of▁sentence｜>'}}
+      {%- set ns.is_tool = false -%}
+    {%- else %}
+      {%- set content = message['content'] -%}
+      {%- if '</think>' in content %}
+        {%- set content = content.split('</think>', 1)[1] -%}
+      {%- endif %}
+      {{content + '<｜end▁of▁sentence｜>'}}
+    {%- endif %}
+  {%- endif %}
+  {%- if message['role'] == 'tool' %}
+    {%- set ns.is_last_user = false -%}
+    {%- set ns.is_tool = true -%}
+    {{'<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}
+  {%- endif %}
+{%- endfor -%}
+{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool %}
+  {{'<｜Assistant｜>'}}
+  {%- if not thinking %}
+    {{'</think>'}}
+  {%- else %}
+    {{'<think>'}}
+  {%- endif %}
+{% endif %}
diff --git a/examples/chat_template/tool_chat_template_llama4_pythonic.jinja b/examples/chat_template/tool_chat_template_llama4_pythonic.jinja
new file mode 100644
index 000000000..74b315c31
--- /dev/null
+++ b/examples/chat_template/tool_chat_template_llama4_pythonic.jinja
@@ -0,0 +1,112 @@
+{# Copied from https://github.com/wukaixingxp/vllm/blob/8a32e2a6e452a03c0e8222e3876ad6086cbf581f/examples/tool_chat_template_llama4_pythonic.jinja to enable better model response. #}
+{{- bos_token }}
+{%- if custom_tools is defined and custom_tools %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if tools is defined and tools %}
+    {%- set tool_definition = tool_definition ~ (tools | tojson(indent=4)) %}
+{%- else %}
+    {%- set tools = none %}
+{%- endif %}
+
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set user_provided_system_message = true %}
+    {%- if messages[0]['content'] is string %}
+        {%- set system_message = messages[0]['content']|trim %}
+    {%- else %}
+        {%- set system_message = messages[0]['content'][0]['text']|trim %}
+    {%- endif %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- if tools is not none  %}
+        {#- Since not system_message was provided by user, if tool is provided, system_message is now default tool system message #}
+        {#- This system message is from llama website:https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/  #}
+        {%- set system_message = "You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:\n\n1. FUNCTION CALLS:\n- ONLY use functions that are EXPLICITLY listed in the function list below\n- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If a function is not in the list, respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)\n- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\nExamples:\nCORRECT: [get_weather(location=\"Vancouver\"), calculate_route(start=\"Boston\", end=\"New York\")] <- Only if get_weather and calculate_route are in function list\nINCORRECT: get_weather(location=\"New York\")\nINCORRECT: Let me check the weather: [get_weather(location=\"New York\")]\nINCORRECT: [get_events(location=\"Singapore\")] <- If function not in list\n\n2. RESPONSE RULES:\n- For pure function requests matching a listed function: ONLY output the function call(s)\n- For knowledge questions: ONLY output text\n- For missing parameters: ONLY request the specific missing parameters\n- For unavailable services (not in function list): output ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\". Do NOT execute a function call.\n- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations\n- NEVER combine text and function calls in the same response\n- NEVER suggest alternative functions when the requested service is unavailable\n- NEVER create or invent new functions not listed below\n\n3. STRICT BOUNDARIES:\n- ONLY use functions from the list below - no exceptions\n- NEVER use a function as an alternative to unavailable information\n- NEVER call functions not present in the function list\n- NEVER add explanatory text to function calls\n- NEVER respond with empty brackets\n- Use proper Python/JSON syntax for function calls\n- Check the function list carefully before responding\n\n4. TOOL RESPONSE HANDLING:\n- When receiving tool responses: provide concise, natural language responses\n- Don't repeat tool response verbatim\n- Don't add supplementary information\n\nHere is a list of functions in JSON format that you can invoke:\n" %}
+    {%- else %}
+        {%- set system_message = "" %}
+    {%- endif %}
+{%- endif %}
+{#- Now writing the system message: use the user provided system message if user_provided_system_message, else default tool system message if tools presented #}
+{%- if system_message %}
+    {#- always use user provided system message to override default tool system message #}
+    {{- "<|header_start|>system<|header_end|>\n\n" }}
+    {{- system_message }}
+    {%- if user_provided_system_message and tools %}
+        {{- "\nHere is a list of functions in JSON format that you can invoke. Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\n" }}
+        {{- tool_definition -}}
+        {%- elif tool_definition %}
+        {{- tool_definition -}}
+    {%- endif %}
+    {{- "<|eot|>" }}
+{%- endif %}
+
+{#- Now deal with all other messages #}
+{%- for message in messages %}
+    {#- Base case: messages that are not from tool role and has empty tool_call list  #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or ('tool_calls' in message and message.tool_calls|length != 0 )) %}
+        {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
+        {%- if message['content'] is string %}
+            {{- message['content'] }}
+        {%- else %}
+            {%- for content in message['content'] %}
+                {%- if content['type'] == 'image' %}
+                    {{- '<|image|>' }}
+                {%- elif content['type'] == 'text' %}
+                    {{- content['text'] | trim }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+    {{- "<|eot|>" }}
+    {#- Tool case: messages has non-empty tool_call list, must from assistant #}
+    {%- elif 'tool_calls' in message %}
+        {#- assume tool_calls are always coming from assistant #}
+        {%- if message.role == 'assistant' %}
+            {{- '<|header_start|>assistant<|header_end|>\n\n' -}}
+        {%- if message['content'] is string %}
+            {{- message['content'] }}
+        {%- else %}
+            {%- for content in message['content'] %}
+                {%- if content['type'] == 'image' %}
+                    {{- '<|image|>' }}
+                {%- elif content['type'] == 'text' %}
+                    {{- content['text'] }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- "[" }}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+                {{-  tool_call.name + '(' -}}
+            {%- for param in tool_call.arguments %}
+                {{- param + '="' -}}
+                {{- "%s" | format(tool_call.arguments[param]) -}}
+                {{- '"' -}}
+                {% if not loop.last %}, {% endif %}
+            {%- endfor %}
+            {{- ')' -}}
+            {% if not loop.last %}, {% endif %}
+        {%- endfor %}
+        {{- "]<|eot|>" }}
+{%- endif %}
+{#- Tool_response case: messages are from tool_response  #}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|header_start|>ipython<|header_end|>\n\n" }}
+        {%- if message.content is string %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {%- for content in message['content']  %}
+                {%- if content['type']  == 'text' %}
+                    {{- content['text'] | tojson }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- "<|eot|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|header_start|>assistant<|header_end|>\n\n' }}
+{%- endif %}
diff --git a/examples/frontend_language/quick_start/anthropic_example_chat.py b/examples/frontend_language/quick_start/anthropic_example_chat.py
new file mode 100644
index 000000000..03d699be7
--- /dev/null
+++ b/examples/frontend_language/quick_start/anthropic_example_chat.py
@@ -0,0 +1,73 @@
+"""
+Usage:
+export ANTHROPIC_API_KEY=sk-******
+python3 anthropic_example_chat.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.Anthropic("claude-3-haiku-20240307"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/anthropic_example_complete.py b/examples/frontend_language/quick_start/anthropic_example_complete.py
new file mode 100644
index 000000000..bce2a61ea
--- /dev/null
+++ b/examples/frontend_language/quick_start/anthropic_example_complete.py
@@ -0,0 +1,68 @@
+"""
+Usage:
+export ANTHROPIC_API_KEY=sk-******
+python3 anthropic_example_complete.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def few_shot_qa(s, question):
+    s += """
+\n\nHuman: What is the capital of France?
+\n\nAssistant: Paris
+\n\nHuman: What is the capital of Germany?
+\n\nAssistant: Berlin
+\n\nHuman: What is the capital of Italy?
+\n\nAssistant: Rome
+"""
+    s += "\n\nHuman: " + question + "\n"
+    s += "\n\nAssistant:" + sgl.gen("answer", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+
+    assert "washington" in answer, f"answer: {state['answer']}"
+
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?", stream=True
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.Anthropic("claude-3-haiku-20240307"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/azure_openai_example_chat.py b/examples/frontend_language/quick_start/azure_openai_example_chat.py
new file mode 100644
index 000000000..d53f935f4
--- /dev/null
+++ b/examples/frontend_language/quick_start/azure_openai_example_chat.py
@@ -0,0 +1,83 @@
+"""
+Usage:
+export AZURE_OPENAI_API_KEY=sk-******
+python3 openai_example_chat.py
+"""
+
+import os
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="azure-gpt-4",
+        api_version="2023-07-01-preview",
+        azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
+        api_key=os.environ["AZURE_OPENAI_API_KEY"],
+        is_azure=True,
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/gemini_example_chat.py b/examples/frontend_language/quick_start/gemini_example_chat.py
new file mode 100644
index 000000000..0ae623109
--- /dev/null
+++ b/examples/frontend_language/quick_start/gemini_example_chat.py
@@ -0,0 +1,73 @@
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_chat.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/gemini_example_complete.py b/examples/frontend_language/quick_start/gemini_example_complete.py
new file mode 100644
index 000000000..5188bf418
--- /dev/null
+++ b/examples/frontend_language/quick_start/gemini_example_complete.py
@@ -0,0 +1,68 @@
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_complete.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def few_shot_qa(s, question):
+    s += """The following are questions with answers.
+Q: What is the capital of France?
+A: Paris
+Q: What is the capital of Germany?
+A: Berlin
+Q: What is the capital of Italy?
+A: Rome
+"""
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+
+    assert "washington" in answer, f"answer: {state['answer']}"
+
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?", stream=True
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/gemini_example_multimodal_chat.py b/examples/frontend_language/quick_start/gemini_example_multimodal_chat.py
new file mode 100644
index 000000000..afe0c723f
--- /dev/null
+++ b/examples/frontend_language/quick_start/gemini_example_multimodal_chat.py
@@ -0,0 +1,30 @@
+"""
+Usage:
+export GCP_PROJECT_ID=******
+python3 gemini_example_multimodal_chat.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def image_qa(s, image_file1, image_file2, question):
+    s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question)
+    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
+
+    state = image_qa.run(
+        image_file1="./images/cat.jpeg",
+        image_file2="./images/dog.jpeg",
+        question="Describe difference of the two images in one sentence.",
+        stream=True,
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+    print(state["answer"])
diff --git a/examples/frontend_language/quick_start/images/cat.jpeg b/examples/frontend_language/quick_start/images/cat.jpeg
new file mode 100644
index 000000000..a6a8e48c9
Binary files /dev/null and b/examples/frontend_language/quick_start/images/cat.jpeg differ
diff --git a/examples/frontend_language/quick_start/images/dog.jpeg b/examples/frontend_language/quick_start/images/dog.jpeg
new file mode 100644
index 000000000..dc4b40e21
Binary files /dev/null and b/examples/frontend_language/quick_start/images/dog.jpeg differ
diff --git a/examples/frontend_language/quick_start/local_example_chat.py b/examples/frontend_language/quick_start/local_example_chat.py
new file mode 100644
index 000000000..e1e4b62cc
--- /dev/null
+++ b/examples/frontend_language/quick_start/local_example_chat.py
@@ -0,0 +1,75 @@
+"""
+Usage:
+python3 local_example_chat.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
diff --git a/examples/frontend_language/quick_start/local_example_complete.py b/examples/frontend_language/quick_start/local_example_complete.py
new file mode 100644
index 000000000..00a451cf6
--- /dev/null
+++ b/examples/frontend_language/quick_start/local_example_complete.py
@@ -0,0 +1,70 @@
+"""
+Usage:
+python3 local_example_complete.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def few_shot_qa(s, question):
+    s += """The following are questions with answers.
+Q: What is the capital of France?
+A: Paris
+Q: What is the capital of Germany?
+A: Berlin
+Q: What is the capital of Italy?
+A: Rome
+"""
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+
+    assert "washington" in answer, f"answer: {state['answer']}"
+
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?", stream=True
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
+    sgl.set_default_backend(runtime)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
diff --git a/examples/frontend_language/quick_start/local_example_llava_next.py b/examples/frontend_language/quick_start/local_example_llava_next.py
new file mode 100644
index 000000000..c941a549e
--- /dev/null
+++ b/examples/frontend_language/quick_start/local_example_llava_next.py
@@ -0,0 +1,78 @@
+"""
+Usage: python3 local_example_llava_next.py
+"""
+
+import sglang as sgl
+from sglang.lang.chat_template import get_chat_template
+
+
+@sgl.function
+def image_qa(s, image_path, question):
+    s += sgl.user(sgl.image(image_path) + question)
+    s += sgl.assistant(sgl.gen("answer"))
+
+
+def single():
+    state = image_qa.run(
+        image_path="images/cat.jpeg", question="What is this?", max_new_tokens=128
+    )
+    print(state["answer"], "\n")
+
+
+def stream():
+    state = image_qa.run(
+        image_path="images/cat.jpeg",
+        question="What is this?",
+        max_new_tokens=64,
+        stream=True,
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = image_qa.run_batch(
+        [
+            {"image_path": "images/cat.jpeg", "question": "What is this?"},
+            {"image_path": "images/dog.jpeg", "question": "What is this?"},
+        ],
+        max_new_tokens=128,
+    )
+    for s in states:
+        print(s["answer"], "\n")
+
+
+if __name__ == "__main__":
+    import multiprocessing as mp
+
+    mp.set_start_method("spawn", force=True)
+
+    runtime = sgl.Runtime(model_path="lmms-lab/llama3-llava-next-8b")
+    runtime.endpoint.chat_template = get_chat_template("llama-3-instruct-llava")
+
+    # Or you can use the 72B model
+    # runtime = sgl.Runtime(model_path="lmms-lab/llava-next-72b", tp_size=8)
+    # runtime.endpoint.chat_template = get_chat_template("chatml-llava")
+
+    sgl.set_default_backend(runtime)
+    print(f"chat template: {runtime.endpoint.chat_template.name}")
+
+    # Or you can use API models
+    # sgl.set_default_backend(sgl.OpenAI("gpt-4-vision-preview"))
+    # sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
+
+    runtime.shutdown()
diff --git a/examples/frontend_language/quick_start/openai_example_chat.py b/examples/frontend_language/quick_start/openai_example_chat.py
new file mode 100644
index 000000000..9511e21cf
--- /dev/null
+++ b/examples/frontend_language/quick_start/openai_example_chat.py
@@ -0,0 +1,74 @@
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_chat.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/openai_example_complete.py b/examples/frontend_language/quick_start/openai_example_complete.py
new file mode 100644
index 000000000..d64bcaf1c
--- /dev/null
+++ b/examples/frontend_language/quick_start/openai_example_complete.py
@@ -0,0 +1,68 @@
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_complete.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def few_shot_qa(s, question):
+    s += """The following are questions with answers.
+Q: What is the capital of France?
+A: Paris
+Q: What is the capital of Germany?
+A: Berlin
+Q: What is the capital of Italy?
+A: Rome
+"""
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+
+    assert "washington" in answer, f"answer: {state['answer']}"
+
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?", stream=True
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/openai_example_n.py b/examples/frontend_language/quick_start/openai_example_n.py
new file mode 100644
index 000000000..25372b9f4
--- /dev/null
+++ b/examples/frontend_language/quick_start/openai_example_n.py
@@ -0,0 +1,71 @@
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_chat.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=1024, n=2))
+    s += sgl.user(question_2)
+    s += sgl.assistant(
+        sgl.gen(
+            "answer_2",
+            max_tokens=1024,
+        )
+    )
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+    print("\n-- answer_2 --\n", state["answer_2"])
+    assert isinstance(state["answer_1"], list)
+    assert len(state["answer_1"]) == 2
+    assert isinstance(state["answer_2"], str)
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+        print("\n-- answer_1 --\n", s["answer_1"])
+        print("\n-- answer_2 --\n", s["answer_2"])
+        assert isinstance(s["answer_1"], list)
+        assert len(s["answer_1"]) == 2
+        assert isinstance(s["answer_2"], str)
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("o1"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/openai_example_o1.py b/examples/frontend_language/quick_start/openai_example_o1.py
new file mode 100644
index 000000000..2e5c14002
--- /dev/null
+++ b/examples/frontend_language/quick_start/openai_example_o1.py
@@ -0,0 +1,57 @@
+"""
+Usage:
+export OPENAI_API_KEY=sk-******
+python3 openai_example_chat.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=100))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2"))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.OpenAI("o1"))
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/openrouter_example_chat.py b/examples/frontend_language/quick_start/openrouter_example_chat.py
new file mode 100644
index 000000000..a0b6f15bc
--- /dev/null
+++ b/examples/frontend_language/quick_start/openrouter_example_chat.py
@@ -0,0 +1,81 @@
+"""
+Usage:
+export OPENROUTER_API_KEY=sk-******
+python3 together_example_chat.py
+"""
+
+import os
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="google/gemma-7b-it:free",
+        base_url="https://openrouter.ai/api/v1",
+        api_key=os.environ.get("OPENROUTER_API_KEY"),
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/together_example_chat.py b/examples/frontend_language/quick_start/together_example_chat.py
new file mode 100644
index 000000000..2d2059062
--- /dev/null
+++ b/examples/frontend_language/quick_start/together_example_chat.py
@@ -0,0 +1,81 @@
+"""
+Usage:
+export TOGETHER_API_KEY=sk-******
+python3 together_example_chat.py
+"""
+
+import os
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+def single():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+
+
+def stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
+
+    for s in states:
+        print(s.messages())
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ.get("TOGETHER_API_KEY"),
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/quick_start/together_example_complete.py b/examples/frontend_language/quick_start/together_example_complete.py
new file mode 100644
index 000000000..d9119ed6c
--- /dev/null
+++ b/examples/frontend_language/quick_start/together_example_complete.py
@@ -0,0 +1,76 @@
+"""
+Usage:
+export TOGETHER_API_KEY=sk-******
+python3 together_example_complete.py
+"""
+
+import os
+
+import sglang as sgl
+
+
+@sgl.function
+def few_shot_qa(s, question):
+    s += """The following are questions with answers.
+Q: What is the capital of France?
+A: Paris
+Q: What is the capital of Germany?
+A: Berlin
+Q: What is the capital of Italy?
+A: Rome
+"""
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+
+def single():
+    state = few_shot_qa.run(question="What is the capital of the United States?")
+    answer = state["answer"].strip().lower()
+
+    assert "washington" in answer, f"answer: {state['answer']}"
+
+    print(state.text())
+
+
+def stream():
+    state = few_shot_qa.run(
+        question="What is the capital of the United States?", stream=True
+    )
+
+    for out in state.text_iter("answer"):
+        print(out, end="", flush=True)
+    print()
+
+
+def batch():
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
+
+    for s in states:
+        print(s["answer"])
+
+
+if __name__ == "__main__":
+    backend = sgl.OpenAI(
+        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        is_chat_model=False,
+        base_url="https://api.together.xyz/v1",
+        api_key=os.environ.get("TOGETHER_API_KEY"),
+    )
+    sgl.set_default_backend(backend)
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    single()
+
+    # Stream output
+    print("\n========== stream ==========\n")
+    stream()
+
+    # Run a batch of requests
+    print("\n========== batch ==========\n")
+    batch()
diff --git a/examples/frontend_language/usage/chinese_regex.py b/examples/frontend_language/usage/chinese_regex.py
new file mode 100644
index 000000000..78e9c7e16
--- /dev/null
+++ b/examples/frontend_language/usage/chinese_regex.py
@@ -0,0 +1,53 @@
+import sglang as sgl
+
+character_regex = (
+    r"""\{\n"""
+    + r"""    "姓名": "[^"]{1,32}",\n"""
+    + r"""    "学院": "(格兰芬多|赫奇帕奇|拉文克劳|斯莱特林)",\n"""
+    + r"""    "血型": "(纯血|混血|麻瓜)",\n"""
+    + r"""    "职业": "(学生|教师|傲罗|魔法部|食死徒|凤凰社成员)",\n"""
+    + r"""    "魔杖": \{\n"""
+    + r"""        "材质": "[^"]{1,32}",\n"""
+    + r"""        "杖芯": "[^"]{1,32}",\n"""
+    + r"""        "长度": [0-9]{1,2}\.[0-9]{0,2}\n"""
+    + r"""    \},\n"""
+    + r"""    "存活": "(存活|死亡)",\n"""
+    + r"""    "守护神": "[^"]{1,32}",\n"""
+    + r"""    "博格特": "[^"]{1,32}"\n"""
+    + r"""\}"""
+)
+
+
+@sgl.function
+def character_gen(s, name):
+    s += name + " 是一名哈利波特系列小说中的角色。请填写以下关于这个角色的信息。"
+    s += """\
+这是一个例子
+{
+    "姓名": "哈利波特",
+    "学院": "格兰芬多",
+    "血型": "混血",
+    "职业": "学生",
+    "魔杖": {
+        "材质": "冬青木",
+        "杖芯": "凤凰尾羽",
+        "长度": 11.0
+    },
+    "存活": "存活",
+    "守护神": "麋鹿",
+    "博格特": "摄魂怪"
+}
+"""
+    s += f"现在请你填写{name}的信息：\n"
+    s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
+
+
+def main():
+    backend = sgl.RuntimeEndpoint("http://localhost:30000")
+    sgl.set_default_backend(backend)
+    ret = character_gen.run(name="赫敏格兰杰", temperature=0)
+    print(ret.text())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/frontend_language/usage/choices_logprob.py b/examples/frontend_language/usage/choices_logprob.py
new file mode 100644
index 000000000..6cd733fe9
--- /dev/null
+++ b/examples/frontend_language/usage/choices_logprob.py
@@ -0,0 +1,44 @@
+"""
+Usage:
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+python choices_logprob.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def tool_use(s, question):
+    s += "To answer this question: " + question + ", "
+    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"])
+
+
+def main():
+    # Run one case
+    question = "What is 5 + 5?"
+    state = tool_use.run(question)
+    print("questions:", question)
+    print("choice:", state["tool"])
+    meta_info = state.get_meta_info("tool")
+    print("logprobs of choice 1", meta_info["input_token_logprobs"][0])
+    print("logprobs of choice 2", meta_info["input_token_logprobs"][1])
+    print("-" * 50)
+
+    # Run a batch
+    questions = [
+        "What is 5 + 6?",
+        "Who is Michael Jordan?",
+    ]
+    states = tool_use.run_batch([{"question": q} for q in questions])
+    for question, state in zip(questions, states):
+        print("questions:", question)
+        print("choice:", state["tool"])
+        meta_info = state.get_meta_info("tool")
+        print("logprobs of choice 1", meta_info["input_token_logprobs"][0])
+        print("logprobs of choice 2", meta_info["input_token_logprobs"][1])
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+    main()
diff --git a/examples/frontend_language/usage/cot_decoding.py b/examples/frontend_language/usage/cot_decoding.py
new file mode 100644
index 000000000..7a7a04bce
--- /dev/null
+++ b/examples/frontend_language/usage/cot_decoding.py
@@ -0,0 +1,115 @@
+from math import exp
+from pprint import pformat
+
+import sglang as sgl
+
+YELLOW = "\033[1;33m"
+GREEN = "\033[1;32m"
+BLUE = "\033[1;34m"
+CLEAR = "\033[1;0m"
+
+
+@sgl.function
+def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
+    """CoT Decoding: http://arxiv.org/abs/2402.10200"""
+
+    if is_chat_model:
+        s += sgl.user("Question: " + question + "\nAnswer:")
+        s += sgl.assistant_begin()
+    else:
+        s += "Question: " + question + "\nAnswer:"
+
+    step_0 = s.fork(1)[0]
+    forks = s.fork(get_top_k)
+    answer_forks = s.fork(get_top_k)
+
+    # decoding step 0
+    step_0 += sgl.gen(
+        "get_top_k",
+        max_tokens=0,
+        return_logprob=True,
+        top_logprobs_num=get_top_k,
+        return_text_in_logprobs=True,
+    )
+    logprobs = step_0.get_meta_info("get_top_k")["output_top_logprobs"][0]
+
+    print("Decoding step 0:", ", ".join(pformat(token[2]) for token in logprobs))
+    for idx, (f, token) in enumerate(zip(forks, logprobs)):
+        logprob, token_id, text = token
+        f += text
+
+        if text == "<|end_of_text|>":
+            print(
+                f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score=nan, answer=nan){CLEAR}"
+            )
+            continue
+
+        # continue greedy decoding
+        f += sgl.gen(
+            "answer",
+            temperature=0,
+            max_tokens=1024,
+            return_logprob=True,
+            top_logprobs_num=2,
+            return_text_in_logprobs=True,
+        )
+
+        # calculate probability disparity between the top and secondary tokens
+        x1s = [exp(xt[0][0]) for xt in f.get_meta_info("answer")["output_top_logprobs"]]
+        x2s = [exp(xt[1][0]) for xt in f.get_meta_info("answer")["output_top_logprobs"]]
+        tokens = [xt[0][2] for xt in f.get_meta_info("answer")["output_top_logprobs"]]
+        delta = (sum(x1s) - sum(x2s)) / len(x1s)
+
+        # extract the answer span (without the '<|end_of_text|>' token)
+        answer_forks[idx] += text + f["answer"] + "\nSo the answer is"
+        answer_forks[idx] += sgl.gen(
+            "answer_span",
+            temperature=0,
+            max_tokens=64,
+            return_logprob=True,
+            top_logprobs_num=2,
+            return_text_in_logprobs=True,
+        )
+        answer = answer_forks[idx]["answer_span"].replace("\n", " ").strip(":")
+        print(
+            f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score={delta}, answer={answer}){CLEAR}"
+        )
+        generated_text = str(answer_forks[idx])[len("ProgramState(") : -1]
+        print(f"{BLUE}{pformat(generated_text)}{CLEAR}")
+
+        if verbose:
+            answer_tokens = [
+                xt[0][2]
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "output_top_logprobs"
+                ]
+            ]
+            answer_x1s = [
+                exp(xt[0][0])
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "output_top_logprobs"
+                ]
+            ]
+            answer_x2s = [
+                exp(xt[1][0])
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "output_top_logprobs"
+                ]
+            ]
+
+            for token, x1, x2 in zip(tokens, x1s, x2s):
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
+            print("\n===========")
+            for token, x1, x2 in zip(answer_tokens, answer_x1s, answer_x2s):
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
+            print()
+
+
+sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+
+state = cot_decoding.run(
+    question=r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?",
+    get_top_k=10,
+    is_chat_model=True,
+    verbose=False,
+)
diff --git a/examples/frontend_language/usage/json_decode.py b/examples/frontend_language/usage/json_decode.py
new file mode 100644
index 000000000..5dc3522d5
--- /dev/null
+++ b/examples/frontend_language/usage/json_decode.py
@@ -0,0 +1,83 @@
+"""
+Usage:
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+python json_decode.py
+"""
+
+from enum import Enum
+
+from pydantic import BaseModel
+
+import sglang as sgl
+from sglang.srt.constrained.outlines_backend import build_regex_from_object
+
+character_regex = (
+    r"""\{\n"""
+    + r"""    "name": "[\w\d\s]{1,16}",\n"""
+    + r"""    "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
+    + r"""    "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
+    + r"""    "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
+    + r"""    "wand": \{\n"""
+    + r"""        "wood": "[\w\d\s]{1,16}",\n"""
+    + r"""        "core": "[\w\d\s]{1,16}",\n"""
+    + r"""        "length": [0-9]{1,2}\.[0-9]{0,2}\n"""
+    + r"""    \},\n"""
+    + r"""    "alive": "(Alive|Deceased)",\n"""
+    + r"""    "patronus": "[\w\d\s]{1,16}",\n"""
+    + r"""    "bogart": "[\w\d\s]{1,16}"\n"""
+    + r"""\}"""
+)
+
+
+@sgl.function
+def character_gen(s, name):
+    s += (
+        name
+        + " is a character in Harry Potter. Please fill in the following information about this character.\n"
+    )
+    s += "The constrained regex is:\n"
+    s += character_regex + "\n"
+    s += "The JSON output is:\n"
+    s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
+
+
+def driver_character_gen():
+    state = character_gen.run(name="Hermione Granger")
+    print(state.text())
+
+
+class Weapon(str, Enum):
+    sword = "sword"
+    axe = "axe"
+    mace = "mace"
+    spear = "spear"
+    bow = "bow"
+    crossbow = "crossbow"
+
+
+class Wizard(BaseModel):
+    name: str
+    age: int
+    weapon: Weapon
+
+
+@sgl.function
+def pydantic_wizard_gen(s):
+    s += "Give me a description about a wizard in the JSON format.\n"
+    s += sgl.gen(
+        "character",
+        max_tokens=128,
+        temperature=0,
+        regex=build_regex_from_object(Wizard),  # Requires pydantic >= 2.0
+    )
+
+
+def driver_pydantic_wizard_gen():
+    state = pydantic_wizard_gen.run()
+    print(state.text())
+
+
+if __name__ == "__main__":
+    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+    driver_character_gen()
+    # driver_pydantic_wizard_gen()
diff --git a/examples/frontend_language/usage/json_logprobs.py b/examples/frontend_language/usage/json_logprobs.py
new file mode 100644
index 000000000..15206a619
--- /dev/null
+++ b/examples/frontend_language/usage/json_logprobs.py
@@ -0,0 +1,103 @@
+# NOTE: Currently this can only be run through HTTP requests.
+from concurrent.futures import ThreadPoolExecutor
+
+from json_decode import character_regex
+
+from sglang.utils import http_request
+
+character_names = ["Hermione Granger", "Ron Weasley", "Harry Potter"]
+
+base_url = "http://localhost:30000"
+
+prompt = "is a character in Harry Potter. Please fill in the following information about this character.\n"
+
+
+def openai_api_request(name):
+    data = {
+        "model": "",
+        "prompt": name + prompt,
+        "temperature": 0,
+        "max_tokens": 128,
+        "regex": character_regex,
+        "logprobs": 3,
+    }
+    res = http_request(base_url + "/v1/completions", json=data).json()
+
+    # with open(f"json_logprobs_{name.replace(' ', '_')}_tmp.json", "w") as fout:
+    #     fout.write(json.dumps(res, indent=4))
+
+    logprobs = res["choices"][0]["logprobs"]
+    usage = res["usage"]
+    assert len(logprobs["token_logprobs"]) == len(logprobs["tokens"])
+    assert len(logprobs["token_logprobs"]) == len(logprobs["top_logprobs"])
+    assert len(logprobs["token_logprobs"]) == usage["completion_tokens"] - 1
+
+    return res
+
+
+def srt_api_request(name):
+    data = {
+        "text": name + prompt,
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 128,
+            "regex": character_regex,
+        },
+        "return_logprob": True,
+        "logprob_start_len": 0,
+        "top_logprobs_num": 3,
+        "return_text_in_logprobs": True,
+    }
+
+    res = http_request(base_url + "/generate", json=data).json()
+
+    # with open(f"json_logprobs_{name.replace(' ', '_')}_tmp.json", "w") as fout:
+    #     fout.write(json.dumps(res, indent=4))
+
+    meta_info = res["meta_info"]
+    assert len(meta_info["input_token_logprobs"]) == len(
+        meta_info["input_top_logprobs"]
+    )
+    assert len(meta_info["output_token_logprobs"]) == len(
+        meta_info["output_top_logprobs"]
+    )
+    assert len(meta_info["input_token_logprobs"]) == meta_info["prompt_tokens"]
+    assert len(meta_info["output_token_logprobs"]) == meta_info["completion_tokens"] - 1
+
+    return res
+
+
+def pretty_print(res):
+    meta_info = res["meta_info"]
+
+    print("\n\n", "=" * 30, "Prefill", "=" * 30)
+    for i in range(len(meta_info["input_token_logprobs"])):
+        print(f"{str(meta_info['input_token_logprobs'][i][2].encode()): <20}", end="")
+        top_ks = (
+            [str(t[2].encode()) for t in meta_info["input_top_logprobs"][i]]
+            if meta_info["input_top_logprobs"][i]
+            else []
+        )
+        for top_k in top_ks:
+            print(f"{top_k: <15}", end="")
+        print()
+
+    print("\n\n", "=" * 30, "Decode", "=" * 30)
+    for i in range(len(meta_info["output_token_logprobs"])):
+        print(f"{str(meta_info['output_token_logprobs'][i][2].encode()): <20}", end="")
+        top_ks = [str(t[2].encode()) for t in meta_info["output_top_logprobs"][i]]
+        for top_k in top_ks:
+            print(f"{top_k: <15}", end="")
+        print()
+
+    print(res["text"])
+
+
+if __name__ == "__main__":
+    with ThreadPoolExecutor() as executor:
+        ress = executor.map(srt_api_request, character_names)
+
+    for res in ress:
+        pretty_print(res)
+
+    openai_api_request("Hermione Granger")
diff --git a/examples/frontend_language/usage/llava_video/srt_example_llava_v.py b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py
new file mode 100644
index 000000000..ec5b334b0
--- /dev/null
+++ b/examples/frontend_language/usage/llava_video/srt_example_llava_v.py
@@ -0,0 +1,260 @@
+"""
+Usage:
+pip install opencv-python-headless
+
+python3 srt_example_llava_v.py
+"""
+
+import argparse
+import csv
+import json
+import os
+import time
+
+import requests
+
+import sglang as sgl
+
+
+@sgl.function
+def video_qa(s, num_frames, video_path, question):
+    s += sgl.user(sgl.video(video_path, num_frames) + question)
+    s += sgl.assistant(sgl.gen("answer"))
+
+
+def single(path, num_frames=16):
+    state = video_qa.run(
+        num_frames=num_frames,
+        video_path=path,
+        question="Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes",
+        temperature=0.0,
+        max_new_tokens=1024,
+    )
+    print(state["answer"], "\n")
+
+
+def split_into_chunks(lst, num_chunks):
+    """Split a list into a specified number of chunks."""
+    # Calculate the chunk size using integer division. Note that this may drop some items if not evenly divisible.
+    chunk_size = len(lst) // num_chunks
+
+    if chunk_size == 0:
+        chunk_size = len(lst)
+    # Use list comprehension to generate chunks. The last chunk will take any remainder if the list size isn't evenly divisible.
+    chunks = [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
+    # Ensure we have exactly num_chunks chunks, even if some are empty
+    chunks.extend([[] for _ in range(num_chunks - len(chunks))])
+    return chunks
+
+
+def save_batch_results(batch_video_files, states, cur_chunk, batch_idx, save_dir):
+    csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
+    with open(csv_filename, "w", newline="") as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(["video_name", "answer"])
+        for video_path, state in zip(batch_video_files, states):
+            video_name = os.path.basename(video_path)
+            writer.writerow([video_name, state["answer"]])
+
+
+def compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir):
+    final_csv_filename = f"{save_dir}/final_results_chunk_{cur_chunk}.csv"
+    with open(final_csv_filename, "w", newline="") as final_csvfile:
+        writer = csv.writer(final_csvfile)
+        writer.writerow(["video_name", "answer"])
+        for batch_idx in range(num_batches):
+            batch_csv_filename = f"{save_dir}/chunk_{cur_chunk}_batch_{batch_idx}.csv"
+            with open(batch_csv_filename, "r") as batch_csvfile:
+                reader = csv.reader(batch_csvfile)
+                next(reader)  # Skip header row
+                for row in reader:
+                    writer.writerow(row)
+            os.remove(batch_csv_filename)
+
+
+def find_video_files(video_dir):
+    # Check if the video_dir is actually a file
+    if os.path.isfile(video_dir):
+        # If it's a file, return it as a single-element list
+        return [video_dir]
+
+    # Original logic to find video files in a directory
+    video_files = []
+    for root, dirs, files in os.walk(video_dir):
+        for file in files:
+            if file.endswith((".mp4", ".avi", ".mov")):
+                video_files.append(os.path.join(root, file))
+    return video_files
+
+
+def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, batch_size=64):
+    video_files = find_video_files(video_dir)
+    chunked_video_files = split_into_chunks(video_files, num_chunks)[cur_chunk]
+    num_batches = 0
+
+    for i in range(0, len(chunked_video_files), batch_size):
+        batch_video_files = chunked_video_files[i : i + batch_size]
+        print(f"Processing batch of {len(batch_video_files)} video(s)...")
+
+        if not batch_video_files:
+            print("No video files found in the specified directory.")
+            return
+
+        batch_input = [
+            {
+                "num_frames": num_frames,
+                "video_path": video_path,
+                "question": "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes.",
+            }
+            for video_path in batch_video_files
+        ]
+
+        start_time = time.perf_counter()
+        states = video_qa.run_batch(batch_input, max_new_tokens=512, temperature=0.2)
+        total_time = time.perf_counter() - start_time
+        average_time = total_time / len(batch_video_files)
+        print(
+            f"Number of videos in batch: {len(batch_video_files)}. Average processing time per video: {average_time:.2f} seconds. Total time for this batch: {total_time:.2f} seconds"
+        )
+
+        save_batch_results(batch_video_files, states, cur_chunk, num_batches, save_dir)
+        num_batches += 1
+
+    compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir)
+
+
+if __name__ == "__main__":
+
+    url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
+
+    cache_dir = os.path.expanduser("~/.cache")
+    file_path = os.path.join(cache_dir, "jobs.mp4")
+
+    os.makedirs(cache_dir, exist_ok=True)
+
+    response = requests.get(url)
+    response.raise_for_status()  # Raise an exception for bad responses
+
+    with open(file_path, "wb") as f:
+        f.write(response.content)
+
+    print(f"File downloaded and saved to: {file_path}")
+    # Create the parser
+    parser = argparse.ArgumentParser(
+        description="Run video processing with specified port."
+    )
+
+    # Add an argument for the port
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=30000,
+        help="The master port for distributed serving.",
+    )
+    parser.add_argument(
+        "--chunk-idx", type=int, default=0, help="The index of the chunk to process."
+    )
+    parser.add_argument(
+        "--num-chunks", type=int, default=8, help="The number of chunks to process."
+    )
+    parser.add_argument(
+        "--save-dir",
+        type=str,
+        default="./work_dirs/llava_video",
+        help="The directory to save the processed video files.",
+    )
+    parser.add_argument(
+        "--video-dir",
+        type=str,
+        default=os.path.expanduser("~/.cache/jobs.mp4"),
+        help="The directory or path for the processed video files.",
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="lmms-lab/LLaVA-NeXT-Video-7B",
+        help="The model path for the video processing.",
+    )
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=16,
+        help="The number of frames to process in each video.",
+    )
+    parser.add_argument("--mm_spatial_pool_stride", type=int, default=2)
+
+    # Parse the arguments
+    args = parser.parse_args()
+    cur_port = args.port
+    cur_chunk = args.chunk_idx
+    num_chunks = args.num_chunks
+    num_frames = args.num_frames
+
+    if "34b" in args.model_path.lower():
+        tokenizer_path = "liuhaotian/llava-v1.6-34b-tokenizer"
+    elif "7b" in args.model_path.lower():
+        tokenizer_path = "llava-hf/llava-1.5-7b-hf"
+    else:
+        print("Invalid model path. Please specify a valid model path.")
+        exit()
+
+    model_override_args = {}
+    model_override_args["mm_spatial_pool_stride"] = args.mm_spatial_pool_stride
+    model_override_args["architectures"] = ["LlavaVidForCausalLM"]
+    model_override_args["num_frames"] = args.num_frames
+    model_override_args["model_type"] = "llava"
+
+    if "34b" in args.model_path.lower():
+        model_override_args["image_token_index"] = 64002
+
+    if args.num_frames == 32:
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
+        model_override_args["max_sequence_length"] = 4096 * 2
+        model_override_args["tokenizer_model_max_length"] = 4096 * 2
+    elif args.num_frames < 32:
+        pass
+    else:
+        print(
+            "The maximum number of frames to process is 32. Please specify a valid number of frames."
+        )
+        exit()
+
+    runtime = sgl.Runtime(
+        model_path=args.model_path,  # "liuhaotian/llava-v1.6-vicuna-7b",
+        tokenizer_path=tokenizer_path,
+        port=cur_port,
+        json_model_override_args=json.dumps(model_override_args),
+        tp_size=1,
+    )
+    sgl.set_default_backend(runtime)
+    print(f"chat template: {runtime.endpoint.chat_template.name}")
+
+    # Run a single request
+    print("\n========== single ==========\n")
+    root = args.video_dir
+    if os.path.isfile(root):
+        video_files = [root]
+    else:
+        video_files = [
+            os.path.join(root, f)
+            for f in os.listdir(root)
+            if f.endswith((".mp4", ".avi", ".mov"))
+        ]  # Add more extensions if needed
+    start_time = time.perf_counter()  # Start time for processing a single video
+    for cur_video in video_files[:1]:
+        print(cur_video)
+        single(cur_video, num_frames)
+    end_time = time.perf_counter()  # End time for processing a single video
+    total_time = end_time - start_time
+    average_time = total_time / len(
+        video_files
+    )  # Calculate the average processing time
+    print(f"Average processing time per video: {average_time:.2f} seconds")
+    runtime.shutdown()
+
+    # # Run a batch of requests
+    # print("\n========== batch ==========\n")
+    # if not os.path.exists(args.save_dir):
+    #     os.makedirs(args.save_dir)
+    # batch(args.video_dir, args.save_dir, cur_chunk, num_chunks, num_frames, num_chunks)
+    # runtime.shutdown()
diff --git a/examples/frontend_language/usage/llava_video/srt_example_llava_v.sh b/examples/frontend_language/usage/llava_video/srt_example_llava_v.sh
new file mode 100755
index 000000000..ffb1af96d
--- /dev/null
+++ b/examples/frontend_language/usage/llava_video/srt_example_llava_v.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+
+##### USAGE #####
+#    - First node:
+#      ```sh
+#      bash examples/usage/llava_video/srt_example_llava_v.sh K 0 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
+#      ```
+#    - Second node:
+#      ```sh
+#      bash examples/usage/llava_video/srt_example_llava_v.sh K 1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
+#      ```
+#    - The K node:
+#      ```sh
+#      bash examples/usage/llava_video/srt_example_llava_v.sh K K-1 YOUR_VIDEO_PATH YOUR_MODEL_PATH FRAMES_PER_VIDEO
+#      ```
+
+
+# Replace `K`, `YOUR_VIDEO_PATH`, `YOUR_MODEL_PATH`, and `FRAMES_PER_VIDEO` with your specific details.
+# CURRENT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+CURRENT_ROOT=$(dirname "$0")
+
+echo ${CURRENT_ROOT}
+
+cd ${CURRENT_ROOT}
+
+export PYTHONWARNINGS=ignore
+
+START_TIME=$(date +%s)  # Capture start time
+
+NUM_NODES=$1
+
+CUR_NODES_IDX=$2
+
+VIDEO_DIR=$3
+
+MODEL_PATH=$4
+
+NUM_FRAMES=$5
+
+
+# FRAME_FORMAT=$6
+
+# FRAME_FORMAT=$(echo $FRAME_FORMAT | tr '[:lower:]' '[:upper:]')
+
+# # Check if FRAME_FORMAT is either JPEG or PNG
+# if [[ "$FRAME_FORMAT" != "JPEG" && "$FRAME_FORMAT" != "PNG" ]]; then
+#     echo "Error: FRAME_FORMAT must be either JPEG or PNG."
+#     exit 1
+# fi
+
+# export TARGET_FRAMES=$TARGET_FRAMES
+
+echo "Each video you will sample $NUM_FRAMES frames"
+
+# export FRAME_FORMAT=$FRAME_FORMAT
+
+# echo "The frame format is $FRAME_FORMAT"
+
+# Assuming GPULIST is a bash array containing your GPUs
+GPULIST=(0 1 2 3 4 5 6 7)
+LOCAL_CHUNKS=${#GPULIST[@]}
+
+echo "Number of GPUs in GPULIST: $LOCAL_CHUNKS"
+
+ALL_CHUNKS=$((NUM_NODES * LOCAL_CHUNKS))
+
+# Calculate GPUs per chunk
+GPUS_PER_CHUNK=1
+
+echo $GPUS_PER_CHUNK
+
+for IDX in $(seq 1 $LOCAL_CHUNKS); do
+    (
+        START=$(((IDX-1) * GPUS_PER_CHUNK))
+        LENGTH=$GPUS_PER_CHUNK # Length for slicing, not the end index
+
+        CHUNK_GPUS=(${GPULIST[@]:$START:$LENGTH})
+
+        # Convert the chunk GPUs array to a comma-separated string
+        CHUNK_GPUS_STR=$(IFS=,; echo "${CHUNK_GPUS[*]}")
+
+        LOCAL_IDX=$((CUR_NODES_IDX * LOCAL_CHUNKS + IDX))
+
+        echo "Chunk $(($LOCAL_IDX - 1)) will run on GPUs $CHUNK_GPUS_STR"
+
+        # Calculate the port for this chunk. Ensure it's incremented by 5 for each chunk.
+        PORT=$((10000 + RANDOM % 55536))
+
+        MAX_RETRIES=10
+        RETRY_COUNT=0
+        COMMAND_STATUS=1  # Initialize as failed
+
+        while [ $RETRY_COUNT -lt $MAX_RETRIES ] && [ $COMMAND_STATUS -ne 0 ]; do
+            echo "Running chunk $(($LOCAL_IDX - 1)) on GPUs $CHUNK_GPUS_STR with port $PORT. Attempt $(($RETRY_COUNT + 1))"
+
+#!/bin/bash
+            CUDA_VISIBLE_DEVICES=$CHUNK_GPUS_STR python3 srt_example_llava_v.py \
+            --port $PORT \
+            --num-chunks $ALL_CHUNKS \
+            --chunk-idx $(($LOCAL_IDX - 1)) \
+            --save-dir work_dirs/llava_next_video_inference_results \
+            --video-dir $VIDEO_DIR \
+            --model-path $MODEL_PATH \
+            --num-frames $NUM_FRAMES #&
+
+            wait $!  # Wait for the process to finish and capture its exit status
+            COMMAND_STATUS=$?
+
+            if [ $COMMAND_STATUS -ne 0 ]; then
+                echo "Execution failed for chunk $(($LOCAL_IDX - 1)), attempt $(($RETRY_COUNT + 1)). Retrying..."
+                RETRY_COUNT=$(($RETRY_COUNT + 1))
+                sleep 180  # Wait a bit before retrying
+            else
+                echo "Execution succeeded for chunk $(($LOCAL_IDX - 1))."
+            fi
+        done
+
+        if [ $COMMAND_STATUS -ne 0 ]; then
+            echo "Execution failed for chunk $(($LOCAL_IDX - 1)) after $MAX_RETRIES attempts."
+        fi
+    ) #&
+    sleep 2  # Slight delay to stagger the start times
+done
+
+wait
+
+cat work_dirs/llava_next_video_inference_results/final_results_chunk_*.csv > work_dirs/llava_next_video_inference_results/final_results_node_${CUR_NODES_IDX}.csv
+
+END_TIME=$(date +%s)  # Capture end time
+ELAPSED_TIME=$(($END_TIME - $START_TIME))
+echo "Total execution time: $ELAPSED_TIME seconds."
diff --git a/examples/frontend_language/usage/openai_chat_speculative.py b/examples/frontend_language/usage/openai_chat_speculative.py
new file mode 100644
index 000000000..f3fd74ed8
--- /dev/null
+++ b/examples/frontend_language/usage/openai_chat_speculative.py
@@ -0,0 +1,155 @@
+"""
+Usage:
+***Note: for speculative execution to work, user must put all "gen" in "assistant".
+Show in "assistant" the desired answer format. Each "gen" term should have a stop token.
+The stream mode is not supported in speculative execution.
+
+E.g.
+correct:
+    sgl.assistant("\nName:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
+incorrect:
+    s += sgl.assistant("\nName:" + sgl.gen("name", stop="\n"))
+    s += sgl.assistant("\nBirthday:" + sgl.gen("birthday", stop="\n"))
+    s += sgl.assistant("\nJob:" + sgl.gen("job", stop="\n"))
+
+export OPENAI_API_KEY=sk-******
+python3 openai_chat_speculative.py
+"""
+
+import sglang as sgl
+from sglang import OpenAI, function, set_default_backend
+
+
+@function(num_api_spec_tokens=256)
+def gen_character_spec(s):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user("Construct a character within the following format:")
+    s += sgl.assistant(
+        "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+    )
+    s += sgl.user("Please generate new Name, Birthday and Job.\n")
+    s += sgl.assistant(
+        "Name:"
+        + sgl.gen("name", stop="\n")
+        + "\nBirthday:"
+        + sgl.gen("birthday", stop="\n")
+        + "\nJob:"
+        + sgl.gen("job", stop="\n")
+    )
+
+
+@function(num_api_spec_tokens=256)
+def gen_character_spec_no_few_shot(s):
+    s += sgl.user("Construct a character. For each field stop with a newline\n")
+    s += sgl.assistant(
+        "Name:"
+        + sgl.gen("name", stop="\n")
+        + "\nAge:"
+        + sgl.gen("age", stop="\n")
+        + "\nJob:"
+        + sgl.gen("job", stop="\n")
+    )
+
+
+@function
+def gen_character_normal(s):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user("What's the answer of 23 + 8?")
+    s += sgl.assistant(sgl.gen("answer", max_tokens=64))
+
+
+@function(num_api_spec_tokens=1024)
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user("Answer questions in the following format:")
+    s += sgl.user(
+        "Question 1: What is the capital of France?\nQuestion 2: What is the population of this city?\n"
+    )
+    s += sgl.assistant(
+        "Answer 1: The capital of France is Paris.\nAnswer 2: The population of Paris in 2024 is estimated to be around 2.1 million for the city proper.\n"
+    )
+    s += sgl.user("Question 1: " + question_1 + "\nQuestion 2: " + question_2)
+    s += sgl.assistant(
+        "Answer 1: "
+        + sgl.gen("answer_1", stop="\n")
+        + "\nAnswer 2: "
+        + sgl.gen("answer_2", stop="\n")
+    )
+
+
+def test_spec_single_turn():
+    backend.token_usage.reset()
+
+    state = gen_character_spec.run()
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- name:", state["name"])
+    print("-- birthday:", state["birthday"])
+    print("-- job:", state["job"])
+    print(backend.token_usage)
+
+
+def test_inaccurate_spec_single_turn():
+    state = gen_character_spec_no_few_shot.run()
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- name:", state["name"])
+    print("\n-- age:", state["age"])
+    print("\n-- job:", state["job"])
+
+
+def test_normal_single_turn():
+    state = gen_character_normal.run()
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+
+def test_spec_multi_turn():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions in the capital of the United States.",
+    )
+
+    for m in state.messages():
+        print(m["role"], ":", m["content"])
+
+    print("\n-- answer_1 --\n", state["answer_1"])
+    print("\n-- answer_2 --\n", state["answer_2"])
+
+
+def test_spec_multi_turn_stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+
+
+if __name__ == "__main__":
+    backend = OpenAI("gpt-4-turbo")
+    set_default_backend(backend)
+
+    print("\n========== test spec single turn ==========\n")
+    # expect reasonable answer for each field
+    test_spec_single_turn()
+
+    print("\n========== test inaccurate spec single turn ==========\n")
+    # expect incomplete or unreasonable answers
+    test_inaccurate_spec_single_turn()
+
+    print("\n========== test normal single turn ==========\n")
+    # expect reasonable answer
+    test_normal_single_turn()
+
+    print("\n========== test spec multi turn ==========\n")
+    # expect answer with same format as in the few shot
+    test_spec_multi_turn()
+
+    print("\n========== test spec multi turn stream ==========\n")
+    # expect error in stream_executor: stream is not supported...
+    test_spec_multi_turn_stream()
diff --git a/examples/frontend_language/usage/openai_speculative.py b/examples/frontend_language/usage/openai_speculative.py
new file mode 100644
index 000000000..4389cb059
--- /dev/null
+++ b/examples/frontend_language/usage/openai_speculative.py
@@ -0,0 +1,54 @@
+"""
+Usage:
+python3 openai_speculative.py
+"""
+
+from sglang import OpenAI, function, gen, set_default_backend
+
+
+@function(num_api_spec_tokens=64)
+def gen_character_spec(s):
+    s += "Construct a character within the following format:\n"
+    s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+    s += "\nPlease generate new Name, Birthday and Job.\n"
+    s += "Name:" + gen("name", stop="\n") + "\nBirthday:" + gen("birthday", stop="\n")
+    s += "\nJob:" + gen("job", stop="\n") + "\n"
+
+
+@function
+def gen_character_no_spec(s):
+    s += "Construct a character within the following format:\n"
+    s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+    s += "\nPlease generate new Name, Birthday and Job.\n"
+    s += "Name:" + gen("name", stop="\n") + "\nBirthday:" + gen("birthday", stop="\n")
+    s += "\nJob:" + gen("job", stop="\n") + "\n"
+
+
+@function(num_api_spec_tokens=64)
+def gen_character_spec_no_few_shot(s):
+    # s += "Construct a character with name, birthday, and job:\n"
+    s += "Construct a character:\n"
+    s += "Name:" + gen("name", stop="\n") + "\nBirthday:" + gen("birthday", stop="\n")
+    s += "\nJob:" + gen("job", stop="\n") + "\n"
+
+
+if __name__ == "__main__":
+    backend = OpenAI("gpt-3.5-turbo-instruct")
+    set_default_backend(backend)
+
+    for function in [
+        gen_character_spec,
+        gen_character_no_spec,
+        gen_character_spec_no_few_shot,
+    ]:
+        backend.token_usage.reset()
+
+        print(f"function: {function.func.__name__}")
+
+        state = function.run()
+
+        print("...name:", state["name"])
+        print("...birthday:", state["birthday"])
+        print("...job:", state["job"])
+        print(backend.token_usage)
+        print()
diff --git a/examples/frontend_language/usage/parallel_sample.py b/examples/frontend_language/usage/parallel_sample.py
new file mode 100644
index 000000000..0f3cf1700
--- /dev/null
+++ b/examples/frontend_language/usage/parallel_sample.py
@@ -0,0 +1,40 @@
+"""
+Usage:
+python3 parallel_sample.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def parallel_sample(s, question, n):
+    s += (
+        "Question: Compute 1 + 2 + 3\n"
+        "Reasoning: I need to use a calculator.\n"
+        "Tool: calculator\n"
+        "Answer: 6\n"
+        "Question: Compute 3 + 2 + 2\n"
+        "Reasoning: I will try a calculator.\n"
+        "Tool: calculator\n"
+        "Answer: 7\n"
+    )
+    s += "Question: " + question + "\n"
+    forks = s.fork(n)
+    forks += "Reasoning:" + sgl.gen("reasoning", stop="\n") + "\n"
+    forks += "Tool:" + sgl.gen("tool", choices=["calculator", "browser"]) + "\n"
+    forks += "Answer:" + sgl.gen("answer", stop="\n") + "\n"
+    forks.join()
+
+
+sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
+# sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+
+state = parallel_sample.run(question="Compute 5 + 2 + 4.", n=5, temperature=1.0)
+
+for i in range(5):
+    obj = {
+        "reasoning": state["reasoning"][i],
+        "tool": state["tool"][i],
+        "answer": state["answer"][i],
+    }
+    print(f"[{i}], {obj}")
diff --git a/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb
new file mode 100644
index 000000000..f309142ad
--- /dev/null
+++ b/examples/frontend_language/usage/rag_using_parea/trace_and_evaluate_rag_using_parea.ipynb
@@ -0,0 +1,408 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RAG Powered by SGLang & Chroma Evaluated using Parea\n",
+    "\n",
+    "In this notebook, we will build a simple RAG pipeline using SGLang to execute our LLM calls, Chroma as vector database for retrieval and [Parea](https://www.parea.ai) for tracing and evaluation. We will then evaluate the performance of our RAG pipeline. The dataset we will use was created by [Virat](https://twitter.com/virattt) and contains 100 questions, contexts and answers from the Airbnb 2023 10k filing.\n",
+    "\n",
+    "The RAG pipeline consists of two steps:\n",
+    "1. Retrieval: Given a question, we retrieve the relevant context from all provided contexts.\n",
+    "2. Generation: Given the question and the retrieved context, we generate an answer.\n",
+    "\n",
+    "ℹ️ This notebook requires an OpenAI API key.\n",
+    "\n",
+    "ℹ️ This notebook requires a Parea API key, which can be created [here](https://docs.parea.ai/api-reference/authentication#parea-api-key)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setting up the environment\n",
+    "\n",
+    "We will first install the necessary packages: `sglang`, `parea-ai` and `chromadb`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# note, if you use a Mac M1 chip, you might need to install grpcio 1.59.0 first such that installing chromadb works\n",
+    "# !pip install grpcio==1.59.0\n",
+    "\n",
+    "!pip install sglang[openai] parea-ai chromadb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a Parea API key as outlined [here](https://docs.parea.ai/api-reference/authentication#parea-api-key) and save it in a `.env` file as `PAREA_API_KEY=your-api-key`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Indexing the data\n",
+    "\n",
+    "Now it's time to download the data & index it! For that, we create a collection called `contexts` in Chroma and add the contexts as documents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from typing import List\n",
+    "\n",
+    "import chromadb\n",
+    "\n",
+    "path_qca = \"airbnb-2023-10k-qca.json\"\n",
+    "\n",
+    "if not os.path.exists(path_qca):\n",
+    "    !wget https://virattt.github.io/datasets/abnb-2023-10k.json -O airbnb-2023-10k-qca.json\n",
+    "\n",
+    "with open(path_qca, \"r\") as f:\n",
+    "    question_context_answers = json.load(f)\n",
+    "\n",
+    "chroma_client = chromadb.PersistentClient()\n",
+    "collection = chroma_client.get_or_create_collection(name=\"contexts\")\n",
+    "if collection.count() == 0:\n",
+    "    collection.add(\n",
+    "        documents=[qca[\"context\"] for qca in question_context_answers],\n",
+    "        ids=[str(i) for i in range(len(question_context_answers))],\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Defining the RAG pipeline\n",
+    "\n",
+    "We will start with importing the necessary packages, setting up tracing of OpenAI calls via Parea and setting OpenAI as the default backend for SGLang."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import time\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from sglang import function, user, assistant, gen, set_default_backend, OpenAI\n",
+    "from sglang.lang.interpreter import ProgramState\n",
+    "from parea import Parea, trace\n",
+    "\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
+    "\n",
+    "p = Parea(api_key=os.getenv(\"PAREA_API_KEY\"), project_name=\"rag_sglang\")\n",
+    "p.integrate_with_sglang()\n",
+    "\n",
+    "set_default_backend(OpenAI(\"gpt-3.5-turbo\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can define our retrieval step shown below. Notice, the `trace` decorator which will automatically trace inputs, output, latency, etc. of that call."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@trace\n",
+    "def retrieval(question: str) -> List[str]:\n",
+    "    return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next we will define the generation step which uses SGLang to execute the LLM call."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def generation_sglang(s, question: str, *context: str):\n",
+    "    context = \"\\n\".join(context)\n",
+    "    s += user(\n",
+    "        f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n",
+    "    )\n",
+    "    s += assistant(gen(\"answer\"))\n",
+    "\n",
+    "\n",
+    "@trace\n",
+    "def generation(question: str, *context):\n",
+    "    state: ProgramState = generation_sglang.run(question, *context)\n",
+    "    while not state.stream_executor.is_finished:\n",
+    "        time.sleep(1)\n",
+    "    return state.stream_executor.variables[\"answer\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we can tie it together and execute a sample query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@trace\n",
+    "def rag_pipeline(question: str) -> str:\n",
+    "    contexts = retrieval(question)\n",
+    "    return generation(question, *contexts)\n",
+    "\n",
+    "\n",
+    "rag_pipeline(\n",
+    "    \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Debug Trace\n",
+    "\n",
+    "The output is unfortunately wrong! Using the traced pipeline, we can see that\n",
+    "\n",
+    "- the context is relevant to the question and contains the correct information\n",
+    "- but, the generation step is cut off as max tokens is set to 16\n",
+    "\n",
+    "When opening the generation step in the playground and rerunning the prompt with max. tokens set to 1000, the correct answer is produced.\n",
+    "\n",
+    "![RAG Trace](https://drive.google.com/uc?id=1QI243ogGjzbO01tUrR72g9rFoGzUJqVH)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluating RAG Pipelines\n",
+    "\n",
+    "Before we apply above's fix, let's dive into evaluating RAG pipelines.\n",
+    "\n",
+    "RAG pipelines consist of a retrieval step to fetch relevant information and a generation step to generate a response to a users question. A RAG pipeline can fail at either step. E.g. the retrieval step can fail to find relevant information which makes generating the correct impossible. Another failure mode is that the generation step doesn't leverage the retrieved information correctly. We will apply the following evaluation metrics to understand different failure modes:\n",
+    "\n",
+    "- `context_relevancy`: measures how relevant the context is given the question\n",
+    "- `percent_target_supported_by_context`: measures how much of the target answer is supported by the context; this will give an upper ceiling of how well the generation step can perform\n",
+    "- `answer_context_faithfulness`: measures how much the generated answer utilizes the context\n",
+    "- `answer_matches_target`: measures how well the generated answer matches the target answer judged by a LLM and gives a sense of accuracy of our entire pipeline\n",
+    "\n",
+    "To use these evaluation metrics, we can import them from  `parea.evals.rag` and `parea.evals.general` and apply them to a function by specifying in the `trace` decorator which evaluation metrics to use. The `@trace` decorator will automatically log the results of the evaluation metrics to the Parea dashboard.\n",
+    "\n",
+    "Applying them to the retrieval step:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from parea.evals.rag import (\n",
+    "    context_query_relevancy_factory,\n",
+    "    percent_target_supported_by_context_factory,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "context_relevancy_eval = context_query_relevancy_factory()\n",
+    "percent_target_supported_by_context = percent_target_supported_by_context_factory()\n",
+    "\n",
+    "\n",
+    "@trace(eval_funcs=[context_relevancy_eval, percent_target_supported_by_context])\n",
+    "def retrieval(question: str) -> List[str]:\n",
+    "    return collection.query(query_texts=[question], n_results=1)[\"documents\"][0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can apply `answer_context_faithfulness` and `answer_matches_target` to the generation step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from parea.evals.general import answer_matches_target_llm_grader_factory\n",
+    "from parea.evals.rag import answer_context_faithfulness_statement_level_factory\n",
+    "\n",
+    "\n",
+    "answer_context_faithfulness = answer_context_faithfulness_statement_level_factory()\n",
+    "answer_matches_target_llm_grader = answer_matches_target_llm_grader_factory()\n",
+    "\n",
+    "\n",
+    "@function\n",
+    "def generation_sglang(s, question: str, *context: str):\n",
+    "    context = \"\\n\".join(context)\n",
+    "    s += user(\n",
+    "        f\"Given this question:\\n{question}\\n\\nAnd this context:\\n{context}\\n\\nAnswer the question.\"\n",
+    "    )\n",
+    "    s += assistant(gen(\"answer\", max_tokens=1_000))\n",
+    "\n",
+    "\n",
+    "@trace(eval_funcs=[answer_context_faithfulness, answer_matches_target_llm_grader])\n",
+    "def generation(question: str, *context):\n",
+    "    state: ProgramState = generation_sglang.run(question, *context)\n",
+    "    while not state.stream_executor.is_finished:\n",
+    "        time.sleep(1)\n",
+    "    return state.stream_executor.variables[\"answer\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we tie them together & execute the original sample query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@trace\n",
+    "def rag_pipeline(question: str) -> str:\n",
+    "    contexts = retrieval(question)\n",
+    "    return generation(question, *contexts)\n",
+    "\n",
+    "\n",
+    "rag_pipeline(\n",
+    "    \"When did the World Health Organization formally declare an end to the COVID-19 global health emergency?\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Great, the answer is correct! Can you spot the line where we fixed the output truncation issue?\n",
+    "\n",
+    "The evaluation scores appear in the bottom right of the logs (screenshot below). Note, that there is no score for `answer_matches_target_llm_grader` and `percent_target_supported_by_context` as these evals are automatically skipped if the target answer is not provided.\n",
+    "\n",
+    "![Fixed Max. Tokens](max-tokens-fixed-rag-trace.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Running an experiment\n",
+    "\n",
+    "Now we are (almost) ready to evaluate the performance of our RAG pipeline on the entire dataset. First, we will need to apply the `nest_asyncio` package to avoid issues with the Jupyter notebook event loop."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install nest-asyncio\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Running the actual experiment is straight-forward. For that we use `p.experiment` to initialize the experiment with a name, the data (list of key-value pairs fed into our entry function) and the entry function. We then call `run` on the experiment to execute it. Note, that `target` is a reserved key in the data dictionary and will be used as the target answer for evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "e = p.experiment(\n",
+    "    \"RAG\",\n",
+    "    data=[\n",
+    "        {\n",
+    "            \"question\": qca[\"question\"],\n",
+    "            \"target\": qca[\"answer\"],\n",
+    "        }\n",
+    "        for qca in question_context_answers\n",
+    "    ],\n",
+    "    func=rag_pipeline,\n",
+    ").run()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analyzing the results\n",
+    "\n",
+    "When opening above experiment, we will see an overview of the experiment as shown below. The upper half shows a summary of the statistics on the left and charts to investigate the distribution and relationships of scores on the right. The lower half is a table with the individual traces which we can use to debug individual samples.\n",
+    "\n",
+    "When looking at the statistics, we can see that the accuracy of our RAG pipeline is 22% as measured by `answer_matches_target_llm_grader`. Though when checking the quality of our retrieval step (`context_query_relevancy`), we can see that our retrieval step is fetching relevant information in only 27% of all samples. As shown in the GIF, we investigate the relationship between the two and see the two scores have 95% agreement. This confirms that the retrieval step is a major bottleneck for our RAG pipeline. So, now it's your turn to improve the retrieval step!\n",
+    "\n",
+    "Note, above link isn't publicly accessible but the experiment can be accessed through [here](https://app.parea.ai/public-experiments/parea/rag_sglang/30f0244a-d56c-44ff-bdfb-8f47626304b6).\n",
+    "\n",
+    "![Experiment Results](https://drive.google.com/uc?id=1KMtJBU47nPB02Pvv3SPPTK7RnHRh5YdA)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/examples/frontend_language/usage/readme_examples.py b/examples/frontend_language/usage/readme_examples.py
new file mode 100644
index 000000000..7269ef148
--- /dev/null
+++ b/examples/frontend_language/usage/readme_examples.py
@@ -0,0 +1,109 @@
+"""
+Usage:
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+python readme_examples.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def tool_use(s, question):
+    s += "To answer this question: " + question + ". "
+    s += (
+        "I need to use a "
+        + sgl.gen("tool", choices=["calculator", "search engine"])
+        + ". "
+    )
+
+    if s["tool"] == "calculator":
+        s += "The math expression is" + sgl.gen("expression")
+    elif s["tool"] == "search engine":
+        s += "The key word to search is" + sgl.gen("word")
+
+
+@sgl.function
+def tip_suggestion(s):
+    s += (
+        "Here are two tips for staying healthy: "
+        "1. Balanced Diet. 2. Regular Exercise.\n\n"
+    )
+
+    forks = s.fork(2)
+    for i, f in enumerate(forks):
+        f += f"Now, expand tip {i+1} into a paragraph:\n"
+        f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
+
+    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
+    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
+    s += "In summary" + sgl.gen("summary")
+
+
+@sgl.function
+def regular_expression_gen(s):
+    s += "Q: What is the IP address of the Google DNS servers?\n"
+    s += "A: " + sgl.gen(
+        "answer",
+        temperature=0,
+        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
+    )
+
+
+@sgl.function
+def text_qa(s, question):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n")
+
+
+def driver_tool_use():
+    state = tool_use.run(question="What is the capital of the United States?")
+    print(state.text())
+    print("\n")
+
+
+def driver_tip_suggestion():
+    state = tip_suggestion.run()
+    print(state.text())
+    print("\n")
+
+
+def driver_regex():
+    state = regular_expression_gen.run()
+    print(state.text())
+    print("\n")
+
+
+def driver_batching():
+    states = text_qa.run_batch(
+        [
+            {"question": "What is the capital of the United Kingdom?"},
+            {"question": "What is the capital of France?"},
+            {"question": "What is the capital of Japan?"},
+        ],
+        progress_bar=True,
+    )
+
+    for s in states:
+        print(s.text())
+    print("\n")
+
+
+def driver_stream():
+    state = text_qa.run(
+        question="What is the capital of France?", temperature=0.1, stream=True
+    )
+
+    for out in state.text_iter():
+        print(out, end="", flush=True)
+    print("\n")
+
+
+if __name__ == "__main__":
+    # sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
+    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
+
+    driver_tool_use()
+    driver_tip_suggestion()
+    driver_regex()
+    driver_batching()
+    driver_stream()
diff --git a/examples/frontend_language/usage/sgl_gen_min_tokens.py b/examples/frontend_language/usage/sgl_gen_min_tokens.py
new file mode 100644
index 000000000..a5088199b
--- /dev/null
+++ b/examples/frontend_language/usage/sgl_gen_min_tokens.py
@@ -0,0 +1,35 @@
+"""
+This example demonstrates how to use `min_tokens` to enforce sgl.gen to generate a longer sequence
+
+Usage:
+python3 sgl_gen_min_tokens.py
+"""
+
+import sglang as sgl
+
+
+@sgl.function
+def long_answer(s):
+    s += sgl.user("What is the capital of the United States?")
+    s += sgl.assistant(sgl.gen("answer", min_tokens=64, max_tokens=128))
+
+
+@sgl.function
+def short_answer(s):
+    s += sgl.user("What is the capital of the United States?")
+    s += sgl.assistant(sgl.gen("answer"))
+
+
+if __name__ == "__main__":
+    runtime = sgl.Runtime(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+    sgl.set_default_backend(runtime)
+
+    state = long_answer.run()
+    print("=" * 20)
+    print("Longer Answer", state["answer"])
+
+    state = short_answer.run()
+    print("=" * 20)
+    print("Short Answer", state["answer"])
+
+    runtime.shutdown()
diff --git a/examples/frontend_language/usage/streaming.py b/examples/frontend_language/usage/streaming.py
new file mode 100644
index 000000000..506ee35c6
--- /dev/null
+++ b/examples/frontend_language/usage/streaming.py
@@ -0,0 +1,49 @@
+"""
+Usage:
+python3 streaming.py
+"""
+
+import asyncio
+
+import sglang as sgl
+
+
+@sgl.function
+def multi_turn_question(s, question_1, question_2):
+    s += sgl.system("You are a helpful assistant.")
+    s += sgl.user(question_1)
+    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.user(question_2)
+    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
+
+
+sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
+
+
+def stream_a_variable():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    for out in state.text_iter(var_name="answer_2"):
+        print(out, end="", flush=True)
+    print("\n")
+
+
+async def async_stream():
+    state = multi_turn_question.run(
+        question_1="What is the capital of the United States?",
+        question_2="List two local attractions.",
+        stream=True,
+    )
+
+    async for out in state.text_async_iter(var_name="answer_2"):
+        print(out, end="", flush=True)
+    print("\n")
+
+
+if __name__ == "__main__":
+    stream_a_variable()
+    asyncio.run(async_stream())
diff --git a/examples/frontend_language/usage/triton/Dockerfile b/examples/frontend_language/usage/triton/Dockerfile
new file mode 100644
index 000000000..e4741a1db
--- /dev/null
+++ b/examples/frontend_language/usage/triton/Dockerfile
@@ -0,0 +1,10 @@
+FROM nvcr.io/nvidia/tritonserver:24.01-py3
+
+WORKDIR /opt
+
+RUN git clone https://github.com/sgl-project/sglang.git
+
+WORKDIR /opt/sglang
+RUN pip install --upgrade pip && \
+    pip install -e "python[all]" && \
+    pip install datasets
diff --git a/examples/frontend_language/usage/triton/README.md b/examples/frontend_language/usage/triton/README.md
new file mode 100644
index 000000000..b2e55961f
--- /dev/null
+++ b/examples/frontend_language/usage/triton/README.md
@@ -0,0 +1,35 @@
+# sglang_triton
+
+Build the docker image:
+```
+docker build -t sglang-triton .
+```
+
+Then do:
+```
+docker run -ti --gpus=all --network=host --name sglang-triton -v ./models:/mnt/models sglang-triton
+```
+
+inside the docker container:
+```
+cd sglang
+python3 -m sglang.launch_server --model-path mistralai/Mistral-7B-Instruct-v0.2 --port 30000 --mem-fraction-static 0.9
+```
+
+with another shell, inside the docker container:
+```
+docker exec -ti sglang-triton /bin/bash
+cd /mnt
+tritonserver --model-repository=/mnt/models
+```
+
+
+Send request to the server:
+```
+curl -X POST http://localhost:8000/v2/models/character_generation/generate \
+-H "Content-Type: application/json" \
+-d '{
+  "INPUT_TEXT": ["harry"]
+}'
+
+```
diff --git a/examples/frontend_language/usage/triton/models/character_generation/config.pbtxt b/examples/frontend_language/usage/triton/models/character_generation/config.pbtxt
new file mode 100644
index 000000000..7546f993a
--- /dev/null
+++ b/examples/frontend_language/usage/triton/models/character_generation/config.pbtxt
@@ -0,0 +1,23 @@
+name: "character_generation"
+backend: "python"
+input [
+    {
+        name: "INPUT_TEXT"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+    }
+]
+output [
+    {
+        name: "OUTPUT_TEXT"
+        data_type: TYPE_STRING
+        dims: [ -1 ]
+    }
+]
+instance_group [
+    {
+        count: 1
+        kind: KIND_GPU
+        gpus: [ 0 ]
+    }
+]
diff --git a/examples/monitoring/README.md b/examples/monitoring/README.md
new file mode 100644
index 000000000..3eef0b09b
--- /dev/null
+++ b/examples/monitoring/README.md
@@ -0,0 +1,76 @@
+# SGLang Monitoring Setup
+
+This directory contains a ready-to-use monitoring setup for SGLang using Prometheus and Grafana.
+
+## Prerequisites
+
+- Docker and Docker Compose installed
+- SGLang server running with metrics enabled
+
+## Usage
+
+1. Start your SGLang server with metrics enabled:
+
+```bash
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --enable-metrics
+```
+
+By default, the metrics server will run on `127.0.0.1:30000`.
+
+2. Start the monitoring stack:
+
+```bash
+cd examples/monitoring
+docker compose up
+```
+
+3. Access the monitoring interfaces:
+   - Grafana: [http://localhost:3000](http://localhost:3000)
+   - Prometheus: [http://localhost:9090](http://localhost:9090)
+
+Default Grafana login credentials:
+- Username: `admin`
+- Password: `admin`
+
+You'll be prompted to change the password on first login.
+
+4. The SGLang dashboard will be automatically available in the "SGLang Monitoring" folder.
+
+## Troubleshooting
+
+### Port Conflicts
+If you see errors like "port is already allocated":
+
+1. Check if you already have Prometheus or Grafana running:
+   ```bash
+   docker ps | grep -E 'prometheus|grafana'
+   ```
+
+2. Stop any conflicting containers:
+   ```bash
+   docker stop <container_id>
+   ```
+
+3. Ensure no other services are using ports 9090 and 3000:
+   ```bash
+   lsof -i :9090
+   lsof -i :3000
+   ```
+
+### Connection Issues
+If Grafana cannot connect to Prometheus:
+1. Check that both services are running
+2. Verify the datasource configuration in Grafana
+3. Check that your SGLang server is properly exposing metrics
+
+## Configuration
+
+- Prometheus configuration: `prometheus.yaml`
+- Docker Compose configuration: `docker-compose.yaml`
+- Grafana datasource: `grafana/datasources/datasource.yaml`
+- Grafana dashboard configuration: `grafana/dashboards/config/dashboard.yaml`
+- SGLang dashboard JSON: `grafana/dashboards/json/sglang-dashboard.json`
+
+## Customization
+
+You can customize the monitoring setup by modifying the configuration files as needed.
diff --git a/examples/monitoring/docker-compose.yaml b/examples/monitoring/docker-compose.yaml
new file mode 100644
index 000000000..ce6457fa3
--- /dev/null
+++ b/examples/monitoring/docker-compose.yaml
@@ -0,0 +1,28 @@
+version: '3'
+services:
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    network_mode: host
+    volumes:
+      - ./prometheus.yaml:/etc/prometheus/prometheus.yml
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    network_mode: host
+    volumes:
+      - ./grafana/datasources:/etc/grafana/provisioning/datasources
+      - ./grafana/dashboards/config:/etc/grafana/provisioning/dashboards
+      - ./grafana/dashboards/json:/var/lib/grafana/dashboards
+    environment:
+      - GF_AUTH_ANONYMOUS_ENABLED=true
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
+      - GF_AUTH_BASIC_ENABLED=false
+      - GF_USERS_ALLOW_SIGN_UP=false
+      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/sglang-dashboard.json
+    depends_on:
+      - prometheus
diff --git a/examples/monitoring/grafana/dashboards/config/dashboard.yaml b/examples/monitoring/grafana/dashboards/config/dashboard.yaml
new file mode 100644
index 000000000..6c17a6c63
--- /dev/null
+++ b/examples/monitoring/grafana/dashboards/config/dashboard.yaml
@@ -0,0 +1,11 @@
+apiVersion: 1
+providers:
+  - name: 'SGLang'
+    orgId: 1
+    folder: 'SGLang Monitoring'
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: false
+    options:
+      path: /var/lib/grafana/dashboards
diff --git a/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json b/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json
new file mode 100644
index 000000000..ebd663e41
--- /dev/null
+++ b/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json
@@ -0,0 +1,984 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": 8,
+  "links": [],
+  "panels": [
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "id": 14,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:e2e_request_latency_seconds_bucket[$__rate_interval])))\r\n",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "P99",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:e2e_request_latency_seconds_bucket[$__rate_interval])))\r\n",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "P90",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:e2e_request_latency_seconds_bucket[$__rate_interval])))\r\n",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "P50",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "avg(rate(sglang:e2e_request_latency_seconds_sum[$__rate_interval]) /  rate(sglang:e2e_request_latency_seconds_count[$__rate_interval]))\r\n",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Avg",
+          "range": true,
+          "refId": "D",
+          "useBackend": false
+        }
+      ],
+      "title": "End-to-End Request Latency",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "id": 17,
+      "maxDataPoints": 30,
+      "options": {
+        "calculate": false,
+        "calculation": {
+          "yBuckets": {
+            "scale": {
+              "type": "linear"
+            }
+          }
+        },
+        "cellGap": 1,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "dark-orange",
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 64
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": true
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": true,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "reverse": false,
+          "unit": "secs"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "sum(increase(sglang:e2e_request_latency_seconds_bucket{model_name=~\"$model_name\"}[$__rate_interval])) by (le)\r\n",
+          "format": "heatmap",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "{{le}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "End-to-End Request Latency(s) Heatmap",
+      "type": "heatmap"
+    },
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "id": 20,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:time_to_first_token_seconds_bucket[$__rate_interval])))\r\n",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "P99",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:time_to_first_token_seconds_bucket[$__rate_interval])))\r\n",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "P90",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:time_to_first_token_seconds_bucket[$__rate_interval])))\r\n",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "P50",
+          "range": true,
+          "refId": "C",
+          "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "avg(rate(sglang:time_to_first_token_seconds_sum[$__rate_interval]) /  rate(sglang:time_to_first_token_seconds_count[$__rate_interval]))\r\n",
+          "fullMetaSearch": false,
+          "hide": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "Avg",
+          "range": true,
+          "refId": "D",
+          "useBackend": false
+        }
+      ],
+      "title": "Time-To-First-Token Latency",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "id": 19,
+      "maxDataPoints": 30,
+      "options": {
+        "calculate": false,
+        "calculation": {
+          "xBuckets": {
+            "value": ""
+          },
+          "yBuckets": {
+            "mode": "size",
+            "scale": {
+              "type": "linear"
+            },
+            "value": ""
+          }
+        },
+        "cellGap": 1,
+        "color": {
+          "exponent": 0.5,
+          "fill": "dark-orange",
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 64
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": true
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": true,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "reverse": false
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "exemplar": false,
+          "expr": "sum by(le) (increase(sglang:time_to_first_token_seconds_bucket{model_name=~\"$model_name\"}[$__rate_interval]))",
+          "format": "heatmap",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "{{le}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Time-To-First-Token Seconds Heatmap",
+      "type": "heatmap"
+    },
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
+      "id": 7,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "sglang:num_running_reqs",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "{{instance}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Num Running Requests",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
+      "id": 18,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "editorMode": "code",
+          "expr": "sglang:gen_throughput",
+          "instant": false,
+          "legendFormat": "{{instance}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Token Generation Throughput (Tokens / S)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 24
+      },
+      "id": 11,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "sglang:cache_hit_rate",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "{{instance}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Cache Hit Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "default": true,
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 24
+      },
+      "id": 8,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "11.6.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "ddyfngn31dg5cf"
+          },
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "sglang:num_queue_reqs",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "{{instance}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Number Queued Requests",
+      "type": "timeseries"
+    }
+  ],
+  "preload": false,
+  "refresh": "5s",
+  "schemaVersion": 41,
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "text": "127.0.0.1:30000",
+          "value": "127.0.0.1:30000"
+        },
+        "datasource": {
+          "type": "prometheus"
+        },
+        "definition": "label_values(instance)",
+        "includeAll": false,
+        "label": "instance",
+        "name": "instance",
+        "options": [],
+        "query": {
+          "qryType": 1,
+          "query": "label_values(instance)",
+          "refId": "PrometheusVariableQueryEditor-VariableQuery"
+        },
+        "refresh": 1,
+        "regex": "",
+        "type": "query"
+      },
+      {
+        "current": {
+          "text": "meta-llama/Llama-3.1-8B-Instruct",
+          "value": "meta-llama/Llama-3.1-8B-Instruct"
+        },
+        "datasource": {
+          "type": "prometheus"
+        },
+        "definition": "label_values(model_name)",
+        "includeAll": false,
+        "label": "model name",
+        "name": "model_name",
+        "options": [],
+        "query": {
+          "qryType": 1,
+          "query": "label_values(model_name)",
+          "refId": "PrometheusVariableQueryEditor-VariableQuery"
+        },
+        "refresh": 1,
+        "regex": "",
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-30m",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "SGLang Dashboard",
+  "uid": "sglang-dashboard",
+  "version": 11
+}
diff --git a/examples/monitoring/grafana/datasources/datasource.yaml b/examples/monitoring/grafana/datasources/datasource.yaml
new file mode 100644
index 000000000..1ab0e4a5f
--- /dev/null
+++ b/examples/monitoring/grafana/datasources/datasource.yaml
@@ -0,0 +1,8 @@
+apiVersion: 1
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://localhost:9090
+    isDefault: true
+    editable: false
diff --git a/examples/monitoring/prometheus.yaml b/examples/monitoring/prometheus.yaml
new file mode 100644
index 000000000..ba16ac3bd
--- /dev/null
+++ b/examples/monitoring/prometheus.yaml
@@ -0,0 +1,10 @@
+# prometheus.yaml
+global:
+  scrape_interval: 5s
+  evaluation_interval: 30s
+
+scrape_configs:
+  - job_name: sglang
+    static_configs:
+      - targets:
+          - '127.0.0.1:30000'
diff --git a/examples/profiler/nsys_profile_tools/README.md b/examples/profiler/nsys_profile_tools/README.md
new file mode 100644
index 000000000..687200e05
--- /dev/null
+++ b/examples/profiler/nsys_profile_tools/README.md
@@ -0,0 +1,176 @@
+# gputrc2graph.py
+
+This script processes NVIDIA Nsight Systems (`nsys`) GPU trace files
+(`.nsys-rep`) with -t cuda tracing enabled, and generates kernel-level
+summaries and visualizations of GPU and non-GPU time. It is useful for
+profiling and analyzing nsys profile output.
+
+## Usage
+
+### Command-line Arguments
+
+- `--in_file`
+  **(required)**
+  List of input files and their metadata. Each entry should be in the format:
+  `<nsys-rep>,<engine>,<model>,<elapsed_nonprofiled_sec>`
+  - `nsys-rep`: Path to the `.nsys-rep` file.
+  - `engine`: Engine name (e.g., `sglang`).
+  - `model`: Model name (e.g., `llama`, `gpt-oss`, `ds`).
+  - `elapsed_nonprofiled_sec`: Wall-clock runtime (in seconds) without
+    profiling. Specify `0` to use the elapsed time from the nsys-rep file
+    (this may inflate non-GPU time if actual runtime without profiling is
+    less). Multiple entries can be provided, separated by spaces.
+
+- `--out_dir`
+  Output directory for the generated CSV and HTML files.
+  If not specified, results are saved in the current directory.
+
+- `--title`
+  Title for the HTML chart/visualization.
+
+- `--nsys_cmd`
+  Path to the `nsys` command.
+  Default: `nsys` (assumes it is in your PATH).
+  Use this if `nsys` is not in your system PATH.
+
+## Notes
+
+- Make sure you have pandas installed. Any version is fine.
+- Make sure [nsys](https://developer.nvidia.com/nsight-systems/get-started) is
+installed, and specify the path to the `nsys` command with `--nsys_cmd` if it
+ is not in your PATH. The nsys version must be >= the nsys profile version that
+ was used to collect the traces when profiling the server, so that nsys can
+ process the nsys-rep that was generated.
+
+- For more details on available engines and models, see the help string in
+  the script or run:
+
+```bash
+python3 gputrc2graph.py --help
+```
+
+## Example 1: analyze a single profile
+
+To analyze the GPU cycles of for example, a llama-3.1-8B model with sglang:
+
+1. Run the following command to collect nsys profile, for sglang server config.
+
+   ```bash
+   nsys profile -t cuda -o nsys_res -f true --trace-fork-before-exec=true \
+   --cuda-graph-trace=node --delay <DELAY> --duration <DURATION> \
+   python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B ...
+   ```
+
+   where:
+
+   - DELAY: how many seconds to delay nsys from collecting profiles, needed so
+     that profiles aren't captured till sglang server has come up and load
+     generation starts.
+   - DURATION: how many seconds for nsys profile to run before generating the
+     profile. This should be > the duration of the run.
+2. After the server starts, run the client load generation command. Once the
+test completes, after DURATION amount of time, nsys profile will generate an
+nsys_res.nsys-rep file and shut down the server.
+
+3. Run step #1 again, this time starting up the server without collecting the
+profile.
+
+4. Run step #2 again, and record the total time to complete the test in
+seconds. This value will be used by the script to calculate the
+   CPU(non-GPU) seconds for the analysis.
+
+5. Say the run elapsed time from step #4 is 132 seconds. Run script to
+   analyze:
+
+   ```bash
+   python3 gputrc2graph.py \
+   --in_file run1.nsys-rep,sglang,llama,132
+   ```
+
+The command will produce 2 files for analysis:
+
+- result.html: this categorizes kernel names into different categories in a
+  stacked bar chart.
+- result.csv: shows how the kernel names are mapped to the different
+  categories.
+
+### HTML visualization with result.html
+
+The html file shows the number of elapsed seconds due to different GPU
+Substages or categories, which consist of attention kernels as the biggest
+category, at 63 seconds, followed by "gemm" kernels. This lets the user
+prioritize the kernels to focus on for performance optimizations.
+
+There's also an appended data table underneath the bar chart for copying out to
+ other post-processing tools.
+
+### Kernel to category mapping with result.csv
+
+Suppose the user would like to focus on improving triton kernels. It's not the
+biggest consumer of cycles at .01 sec but perhaps it hasn't been optimized.
+The next step is to use the result.csv to dive into what the kernels are which
+compose the triton kernel GPU cycles.
+
+## Example 2: analyze multiple profiles
+
+Suppose the user has multiple nsys trace files, captured for different models,
+say llama and gpt-oss in this case, and wish to compare their GPU/non-GPU
+time, something like the following command can be used.
+
+```bash
+python3 gputrc2graph.py \
+--in_file run1.nsys-rep,sglang,llama,100 run2.nsys-rep,sglang,gpt-oss,102 \
+--out_dir results
+```
+
+The analysis process is similar to example 1 but now there will be multiple
+stack bar charts that can be compared.  The categories for the different
+kernels will remain the same, so that it's easy to compare the GPU cycles for
+the same categories.
+
+Once a category is shown to have more cycles for one configuration than
+another, the next step would be to use the csv file to see what kernels are
+mapped into that category, and which kernels are taking the largest amount of
+time which would cause a difference for the overall category.
+
+## Example 3: add new classification for a new model
+
+To create a new engine DEF with model ABC, just add another json file in the same directory as
+gputrc2graph.py with the same format as the other json files. The script will automatically pick up all the json files in the same directory as engine/model specifications.
+
+Then, for this new model, suppose there are 4 kernels to be classified into
+"gemm" and "attn", where the gemm kernels have names with "*H*" or "*I*" in
+them, and attn kernels have names with "*J*" or "*K*" in them, just add another
+ .json file in the same directory as gputrc2graph.py with the same format as
+ the other json files, like the following:
+
+```json
+{
+  "DEF": {
+      "ABC": {
+          "H|I": "gemm",
+          "J|K": "attn",
+          "CUDA mem": "non-gpu-H_D_memops",
+          ".*": "misc"
+      }
+  }
+}
+```
+
+Each entry in the dictionary consists of:
+
+- key: a regex used to classify the kernels
+- value: the category to classify the kernels into.
+
+The last 2 entries are common for all engine/models, consisting of CUDA memory
+operations and a 'misc' for anything that's leftover and can't be classified.
+
+When invoking gputrc2graph.py, specify a trace file with this new model/engine
+like the following:
+
+```bash
+--in_file new.nsys-rep,DEF,ABC,<runtime>
+```
+
+If the engine_DEF.json file already exists, just add the model as a new node in
+ the existing engine file, after the other models.
diff --git a/examples/profiler/nsys_profile_tools/gputrc2graph.py b/examples/profiler/nsys_profile_tools/gputrc2graph.py
new file mode 100755
index 000000000..f17bd1857
--- /dev/null
+++ b/examples/profiler/nsys_profile_tools/gputrc2graph.py
@@ -0,0 +1,344 @@
+"""
+    This generates gpu kernel analysis output from nsys rep. Will call nsys
+    stats  -r cuda_gpu_kern_trace, get non-overlapped gpu cycles, then generate
+    csv and html output for analysis
+"""
+
+import argparse
+import logging
+import os
+
+import regex as re
+
+logger = logging.getLogger(__name__)
+
+
+# helper data class for annotating kernels
+def load_engine_model():
+    """returns engine_model built from all json files in the current dir"""
+    import glob
+    import json
+
+    engine_model = {}
+
+    json_files = glob.glob(os.path.join(os.path.dirname(__file__) or ".", "*.json"))
+    for fname in json_files:
+        with open(fname, encoding="utf-8") as f:
+            engine_model.update(json.load(f))
+    return engine_model
+
+
+class GPUTrace2Graph:
+    """
+    Parses output of nsys report, generates csv and bar chart output
+    """
+
+    def __init__(self):
+        import pandas as pd  # avoid importing till needed
+
+        self.pd = pd
+        self.pd.options.mode.copy_on_write = True
+
+    # helper functions for generating trace->summary csvs
+    def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file):
+        logger.info("loading %s", in_file)
+        df = self.pd.read_csv(
+            in_file, usecols=["Start (ns)", "Duration (ns)", "Device", "Strm", "Name"]
+        )
+        df["End (ns)"] = df["Start (ns)"] + df["Duration (ns)"]
+        df = self.sum_non_overlapping_intervals(df)
+        # get ready to print table with elapsed times per kernel
+        df["Instances"] = 1
+        df_sum = df.groupby("Name", as_index=False).agg(
+            {"Elapsed Time (ns)": "sum", "Duration (ns)": "sum", "Instances": "size"}
+        )
+
+        # generate csv
+        df_sum["Total Time (sec)"] = df_sum["Duration (ns)"] / 1e9
+        df_sum["Elapsed Time (sec)"] = df_sum["Elapsed Time (ns)"] / 1e9
+        df_sum = df_sum.sort_values(by="Elapsed Time (sec)", ascending=False)
+        df_sum[["Elapsed Time (sec)", "Total Time (sec)", "Instances", "Name"]].to_csv(
+            out_file, index=False
+        )
+
+    def sum_non_overlapping_intervals(self, df):
+        """
+        returns new sorted df with Elapsed Time (ns) column using
+        vectorized operations
+        """
+        logger.info("sorting %s trace records by start time", str(df.shape))
+
+        # Sort by start time and reset index
+        df = df.sort_values(by="Start (ns)").reset_index(drop=True)
+
+        # Initialize elapsed time as duration
+        df["Elapsed Time (ns)"] = df["Duration (ns)"]
+
+        # Get numpy arrays for faster operations
+        starts = df["Start (ns)"].values
+        ends = df["End (ns)"].values
+
+        # Keep track of current interval end
+        current_end = ends[0]
+        display_units = max(1, int(len(df) / 100))
+        # Update current_end for overlapping intervals
+        for i in range(1, len(df)):
+            if i % display_units == 0:
+                print(f"processing trace: {int(i/len(df) * 100)} %", end="\r")
+            if starts[i] <= current_end:
+                if ends[i] > current_end:
+                    # Partial overlap
+                    df.iloc[i, df.columns.get_loc("Elapsed Time (ns)")] = (
+                        ends[i] - current_end
+                    )
+                    current_end = ends[i]
+                else:
+                    # Complete overlap
+                    df.iloc[i, df.columns.get_loc("Elapsed Time (ns)")] = 0
+            else:
+                # No overlap
+                current_end = ends[i]
+
+        return df
+
+    # functions for generating html files
+    def make_html(self, df, output_dir, title):
+        """make html graph from df"""
+        import plotly.express as px
+
+        if df.empty:
+            return
+        output_name = os.path.join(output_dir, "result")
+        if not title:
+            title = "Model_Engine"
+        x = "Model_Engine"
+        y = "Elapsed Time (sec)"
+        color = "Category"
+        """ generate kernel mapping table  """
+        # Sort Model_Engine categories by last field after underscore
+        df["Model_Engine"] = self.pd.Categorical(
+            df["Model_Engine"],
+            sorted(df["Model_Engine"].unique(), key=lambda x: x.split("_")[-1]),
+        )
+        df[["Model_Engine", color, "Instances", "Name", y]].sort_values(
+            by=color
+        ).to_csv(f"{output_name}.csv", index=False)
+        graph = px.histogram(
+            df.round(2),
+            x=x,
+            y=y,
+            title=(f"{y} for {title}"),
+            color=color,
+            text_auto=True,
+        )
+        # wrap x axis labels
+        graph.update_xaxes(automargin=True)
+        graph.write_html(f"{output_name}.html")
+        """
+            Generate data table with columns per Model_Engine into result.html
+        """
+        pivot_df = df.pivot_table(
+            values="Elapsed Time (sec)",
+            index="Category",
+            columns="Model_Engine",
+            aggfunc="sum",
+            observed=False,
+        ).round(2)
+        # Add sum row at bottom
+        pivot_df.loc["total_elapsed_sec"] = pivot_df.sum()
+        pivot_df.fillna("").to_html("temp.html")
+        with (
+            open(f"{output_name}.html", "a", encoding="utf-8") as outfile,
+            open("temp.html", encoding="utf-8") as infile,
+        ):
+            outfile.write(infile.read())
+        os.remove("temp.html")
+
+        print(
+            f"Finished generating: \n"
+            f" {output_name}.html for stack bar chart \n"
+            f" {output_name}.csv for Kernel-Category mapping"
+        )
+
+    def anno_gpu_kernname(self, df, mapping):
+        """add "Category" column"""
+
+        def anno_gpu_kernname_helper(name):
+            for kern_name, val in mapping.items():
+                if re.search(kern_name, name):
+                    return val
+
+        df["Category"] = df["Name"].apply(anno_gpu_kernname_helper)
+
+    def make_nongpu_row(self, df, nongpu_sec):
+        """this will append non-gpu time entry at end of df"""
+        nongpu_row = self.pd.DataFrame([df.iloc[-1]])
+        nongpu_row["Category"] = nongpu_row["Name"] = "CPU(non-GPU)"
+        nongpu_row["Instances"] = 1
+        nongpu_row["Elapsed Time (sec)"] = nongpu_sec
+        return nongpu_row
+
+    def is_valid_file(self, base_file):
+        """asserts if base_file is non-existent or is empty"""
+        assert (
+            os.path.isfile(base_file) and os.path.getsize(base_file) > 0
+        ), f"{base_file} doesn't exist or is empty"
+
+    def should_gen_file(self, new_file, base_file):
+        """figure out if new file should be generated from base_file"""
+        self.is_valid_file(base_file)
+        if (
+            os.path.exists(new_file)
+            and (os.path.getmtime(new_file) > os.path.getmtime(base_file))
+            and (os.path.getsize(base_file) > 0)
+        ):
+            logger.info("reusing %s", new_file)
+            return False
+        else:
+            logger.info("generating %s", new_file)
+            return True
+
+    def gen_sum_file(self, file, nsys_cmd):
+        """
+        generates sum file from nsys trace with times per kernel and
+        returns the name of the sum file
+        """
+        import subprocess
+
+        file_dir = os.path.dirname(file)
+        file_name = os.path.basename(file)
+
+        if not file_dir:
+            file_dir = "."
+        # Walk through trace and get the total non-overlapped time
+        nsys_stats_file = os.path.join(file_dir, f"{file_name}_cuda_gpu_trace.csv")
+        sum_file = os.path.join(file_dir, f"{file_name}_cuda_gpu_kernel_tracesum.csv")
+        if self.should_gen_file(nsys_stats_file, file):
+            cmd = [
+                nsys_cmd,
+                "stats",
+                "-r",
+                "cuda_gpu_trace",
+                file,
+                "-o",
+                f"{file_dir}/{file_name}",
+            ]
+            cmd_str = " ".join(cmd)
+            logger.info("+ %s", cmd_str)
+            # estimate time based on calibrated 240M/min
+            file_size_mb = os.path.getsize(file) / 1e6
+            logger.info(
+                "nsys stats for %.2f MB file expected to take %.2f min",
+                file_size_mb,
+                file_size_mb / 240,
+            )
+            try:
+                subprocess.run(cmd, check=True)
+            except (FileNotFoundError, subprocess.CalledProcessError) as e:
+                logger.error(
+                    "'%s' failed: %s. Use --nsys_cmd to specify nsys path", cmd_str, e
+                )
+                exit(1)
+            logger.info("generating non-overalapped sum %s", sum_file)
+            self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
+        self.is_valid_file(sum_file)
+        logger.info("Finished generating %s", sum_file)
+        return sum_file
+
+    def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
+        """generates graph and csv file from in_file into out_dir"""
+        # Initialize an empty DataFrame to store combined data
+        combined_df = self.pd.DataFrame()
+        for idx, (file, engine, model, total_sec) in enumerate(in_file):
+            file_dir = os.path.dirname(file)
+            file_name = os.path.basename(file)
+            if not file_dir:
+                file_dir = "."
+            sum_file = self.gen_sum_file(file, nsys_cmd)
+            # read kernel summary file
+            df = self.pd.read_csv(sum_file)
+            # annotate kernel to their categories
+            assert engine_model.get(engine), f"engine {engine} unknown"
+            assert engine_model[engine].get(model), f"model {model} unknown"
+            # remove nsys-rep from file_name for shorter x-label
+            file_name = file_name.replace(".nsys-rep", "")
+            df["Model_Engine"] = f"{model}_{engine}_{file_name}_{idx}"
+            self.anno_gpu_kernname(df, engine_model[engine][model])
+            # patch in non-gpu time
+            gpu_sec = round(df["Elapsed Time (sec)"].sum(), 1)
+            total_sec = round(float(total_sec), 1)
+            if total_sec < gpu_sec:
+                logger.warning(
+                    "Elapsed sec %.2f < GPU sec %.2f resetting Elapsed sec ",
+                    total_sec,
+                    gpu_sec,
+                )
+                total_sec = gpu_sec
+            nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec)
+            df = self.pd.concat([df, nongpu_row], ignore_index=True)
+            combined_df = self.pd.concat([combined_df, df], ignore_index=True)
+        if out_dir is None:
+            out_dir = "."
+        else:
+            os.makedirs(out_dir, exist_ok=True)
+        # generate html file
+        self.make_html(combined_df, out_dir, title)
+
+
+def parse_tuple(s):
+    return tuple(s.split(","))
+
+
+def main():
+    logging.basicConfig(
+        format=("%(asctime)s - %(levelname)s - %(message)s"), level=logging.INFO
+    )
+    parser = argparse.ArgumentParser(
+        description=(
+            "Process nsys rep and generate kernel non-overlapped cycles. \n"
+            "Example:\n"
+            "gputrc2graph.py --in_file d1.nsys-rep,sglang,llama,100 \n"
+            "d2.nsys-rep,sglang,gpt-oss,102 "
+            '--out_dir results/ --title "Model=gpt-oss SGLANG chart"'
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    # load supported engine_model
+    engine_model_supported = load_engine_model()
+    # Get a string representation of supported engine/model combinations
+    engine_model_supported_str = ", ".join(
+        f"{engine}:[{', '.join(models.keys())}]"
+        for engine, models in engine_model_supported.items()
+    )
+    parser.add_argument(
+        "--in_file",
+        type=parse_tuple,
+        nargs="+",
+        help=(
+            "list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) "
+            "separated by space. Elapsed_nonprofiled_sec is runtime without "
+            "profiling used to calculate non-gpu time. Specify 0 to use "
+            "elapsed time from nsys-rep but that might inflate non-gpu time. "
+            f"Available engine:[model] are: {engine_model_supported_str} "
+            f"Example: --infile d1.nsys-rep,sglan,llama,100 "
+            "d2.nsys-rep,sglang,gpt-oss,102"
+        ),
+        required=True,
+    )
+    parser.add_argument("--out_dir", help=("output dir for result.csv/html"))
+    parser.add_argument("--title", help=("title for html chart"))
+    parser.add_argument(
+        "--nsys_cmd",
+        help=("nsys cmd, e.g. /usr/bin/nsys, Default: nsys"),
+        default="nsys",
+    )
+    args = parser.parse_args()
+    gputrace = GPUTrace2Graph()
+    gputrace.gen_graph(
+        args.in_file, args.out_dir, args.title, args.nsys_cmd, engine_model_supported
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/profiler/nsys_profile_tools/sglang_engine_model.json b/examples/profiler/nsys_profile_tools/sglang_engine_model.json
new file mode 100644
index 000000000..253cc762b
--- /dev/null
+++ b/examples/profiler/nsys_profile_tools/sglang_engine_model.json
@@ -0,0 +1,61 @@
+{
+  "sglang": {
+    "llama": {
+      "gemm|nvjet": "gemm",
+      "fused_moe_kernel|GroupProblemShape|group_gemm_starts|bmm_|GemmUniversal": "moe_gemm",
+      "moe|sigmoid": "moe",
+      "CatArrayBatched|prepare_inputs": "prepare_next",
+      "ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
+      "_norm_|Norm": "norm",
+      "topk": "topk",
+      "act_and_mul_": "activation",
+      "Rotary": "rope",
+      "SoftMax": "softmax",
+      "flash|fmha": "attn",
+      "elementwise": "elementwise",
+      "fp8_quant|cvt_|quantize": "quantize",
+      "reduce_kernel": "reduce",
+      "triton": "triton_kernel",
+      "CUDA mem": "non-gpu-H_D_memops",
+      ".*": "misc"
+    },
+    "ds": {
+      "block_fp8_matmul": "block_fp8_gemm",
+      "gemm|matmul|nvjet": "gemm",
+      "fused_moe_kernel": "moe_gemm",
+      "moe|expert|sigmoid": "moe",
+      "CatArrayBatched|write_req_to": "prepare_next",
+      "ncclDevKernel|cross_device_reduce|all_gather": "nccl_and_custom_ar",
+      "Norm": "norm",
+      "topk": "topk",
+      "activation|act_and_mul": "activation",
+      "compute_position_kernel": "rope",
+      "elementwise": "elementwise",
+      "fp8_quant|quant_fp8|quantize": "quantize",
+      "SoftMax": "softmax",
+      "reduce": "reduce",
+      "_fwd_|create_flash|::mla::|KVCache": "attn",
+      "CUDA mem": "non-gpu-H_D_memops",
+      ".*": "misc"
+    },
+    "gpt-oss": {
+      "gemm|nvjet": "gemm",
+      "fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_|matmul_ogs_|_topk_forward|_combined_routing|_sum_bitmatrix_rows|_compute_writeback_idx": "moe_gemm",
+      "moe|sigmoid": "moe",
+      "CatArrayBatched|prepare_inputs": "prepare_next",
+      "_norm_|Norm": "norm",
+      "ncclDevKernel|cross_device_reduce|allreduce": "nccl_and_custom_ar",
+      "topk|TopK": "topk",
+      "act_and_mul_": "activation",
+      "Rotary": "rope",
+      "SoftMax": "softmax",
+      "flash|fmha": "attn",
+      "elementwise": "elementwise",
+      "fp8_quant|cvt_|quantize": "quantize",
+      "reduce_kernel": "reduce",
+      "triton": "triton_kernel",
+      "CUDA mem": "non-gpu-H_D_memops",
+      ".*": "misc"
+    }
+  }
+}
diff --git a/examples/runtime/README.md b/examples/runtime/README.md
new file mode 100644
index 000000000..18414452f
--- /dev/null
+++ b/examples/runtime/README.md
@@ -0,0 +1,45 @@
+# Runtime examples
+
+The below examples will mostly need you to start a server in a separate terminal before you can execute them. Please see in the code for detailed instruction.
+
+## Native API
+
+* `lora.py`: An example how to use LoRA adapters.
+* `multimodal_embedding.py`: An example how perform [multi modal embedding](Alibaba-NLP/gme-Qwen2-VL-2B-Instruct).
+* `openai_batch_chat.py`: An example how to process batch requests for chat completions.
+* `openai_batch_complete.py`: An example how to process batch requests for text completions.
+* **`openai_chat_with_response_prefill.py`**:
+  An example that demonstrates how to [prefill a response](https://eugeneyan.com/writing/prompting/#prefill-claudes-responses) using the OpenAI API by enabling the `continue_final_message` parameter.
+  When enabled, the final (partial) assistant message is removed and its content is used as a prefill so that the model continues that message rather  than starting a new turn. See [Anthropic's prefill example](https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/prefill-claudes-response#example-structured-data-extraction-with-prefilling) for more context.
+* `reward_model.py`: An example how to extract scores from a reward model.
+* `vertex_predict.py`: An example how to deploy a model to [Vertex AI](https://cloud.google.com/vertex-ai?hl=en).
+
+## Engine
+
+The `engine` folder contains that examples that show how to use [Offline Engine API](https://docs.sglang.ai/backend/offline_engine_api.html#Offline-Engine-API) for common workflows.
+
+* `custom_server.py`: An example how to deploy a custom server.
+* `embedding.py`: An example how to extract embeddings.
+* `launch_engine.py`: An example how to launch the Engine.
+* `offline_batch_inference_eagle.py`: An example how to perform speculative decoding using [EAGLE](https://docs.sglang.ai/backend/speculative_decoding.html).
+* `offline_batch_inference_torchrun.py`: An example how to perform inference using [torchrun](https://pytorch.org/docs/stable/elastic/run.html).
+* `offline_batch_inference_vlm.py`: An example how to use VLMs with the engine.
+* `offline_batch_inference.py`: An example how to use the engine to perform inference on a batch of examples.
+
+## Hidden States
+
+The `hidden_states` folder contains examples on how to extract hidden states using SGLang. Please note that this might degrade throughput due to cuda graph rebuilding.
+
+* `hidden_states_engine.py`: An example how to extract hidden states using the Engine API.
+* `hidden_states_server.py`: An example how to extract hidden states using the Server API.
+
+## Multimodal
+
+SGLang supports multimodal inputs for various model architectures. The `multimodal` folder contains examples showing how to use urls, files or encoded data to make requests to multimodal models. Examples include querying the [Llava-OneVision](multimodal/llava_onevision_server.py) model (image, multi-image, video), Llava-backed [Qwen-Llava](multimodal/qwen_llava_server.py) and [Llama3-Llava](multimodal/llama3_llava_server.py) models (image, multi-image), and Mistral AI's [Pixtral](multimodal/pixtral_server.py) (image, multi-image).
+
+
+## Token In, Token Out
+
+The folder `token_in_token_out` shows how to perform inference, where we provide tokens and get tokens as response.
+
+* `token_in_token_out_{llm|vlm}_{engine|server}.py`: Shows how to perform token in, token out workflow for llm/vlm using either the engine or native API.
diff --git a/examples/runtime/engine/custom_server.py b/examples/runtime/engine/custom_server.py
new file mode 100644
index 000000000..b190a463e
--- /dev/null
+++ b/examples/runtime/engine/custom_server.py
@@ -0,0 +1,53 @@
+from sanic import Sanic, text
+from sanic.response import json
+
+import sglang as sgl
+
+engine = None
+
+# Create an instance of the Sanic app
+app = Sanic("sanic-server")
+
+
+# Define an asynchronous route handler
+@app.route("/generate", methods=["POST"])
+async def generate(request):
+    prompt = request.json.get("prompt")
+    if not prompt:
+        return json({"error": "Prompt is required"}, status=400)
+
+    # async_generate returns a dict
+    result = await engine.async_generate(prompt)
+
+    return text(result["text"])
+
+
+@app.route("/generate_stream", methods=["POST"])
+async def generate_stream(request):
+    prompt = request.json.get("prompt")
+
+    if not prompt:
+        return json({"error": "Prompt is required"}, status=400)
+
+    # async_generate returns a dict
+    result = await engine.async_generate(prompt, stream=True)
+
+    # https://sanic.dev/en/guide/advanced/streaming.md#streaming
+    # init the response
+    response = await request.respond()
+
+    # result is an async generator
+    async for chunk in result:
+        await response.send(chunk["text"])
+
+    await response.eof()
+
+
+def run_server():
+    global engine
+    engine = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+    app.run(host="0.0.0.0", port=8000, single_process=True)
+
+
+if __name__ == "__main__":
+    run_server()
diff --git a/examples/runtime/engine/embedding.py b/examples/runtime/engine/embedding.py
new file mode 100644
index 000000000..b927a188b
--- /dev/null
+++ b/examples/runtime/engine/embedding.py
@@ -0,0 +1,27 @@
+import sglang as sgl
+
+
+def main():
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create an LLM.
+    llm = sgl.Engine(
+        model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct", is_embedding=True
+    )
+
+    outputs = llm.encode(prompts)
+    # Print the outputs (embedding vectors)
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nEmbedding vector: {output['embedding']}")
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/engine/fastapi_engine_inference.py b/examples/runtime/engine/fastapi_engine_inference.py
new file mode 100644
index 000000000..a755cf8d8
--- /dev/null
+++ b/examples/runtime/engine/fastapi_engine_inference.py
@@ -0,0 +1,189 @@
+"""
+FastAPI server example for text generation using SGLang Engine and demonstrating client usage.
+
+Starts the server, sends requests to it, and prints responses.
+
+Usage:
+python fastapi_engine_inference.py --model-path Qwen/Qwen2.5-0.5B-Instruct --tp_size 1 --host 127.0.0.1 --port 8000
+"""
+
+import os
+import subprocess
+import time
+from contextlib import asynccontextmanager
+
+import requests
+from fastapi import FastAPI, Request
+
+import sglang as sgl
+from sglang.utils import terminate_process
+
+engine = None
+
+
+# Use FastAPI's lifespan manager to initialize/shutdown the engine
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Manages SGLang engine initialization during server startup."""
+    global engine
+    # Initialize the SGLang engine when the server starts
+    # Adjust model_path and other engine arguments as needed
+    print("Loading SGLang engine...")
+    engine = sgl.Engine(
+        model_path=os.getenv("MODEL_PATH"), tp_size=int(os.getenv("TP_SIZE"))
+    )
+    print("SGLang engine loaded.")
+    yield
+    # Clean up engine resources when the server stops (optional, depends on engine needs)
+    print("Shutting down SGLang engine...")
+    # engine.shutdown() # Or other cleanup if available/necessary
+    print("SGLang engine shutdown.")
+
+
+app = FastAPI(lifespan=lifespan)
+
+
+@app.post("/generate")
+async def generate_text(request: Request):
+    """FastAPI endpoint to handle text generation requests."""
+    global engine
+    if not engine:
+        return {"error": "Engine not initialized"}, 503
+
+    try:
+        data = await request.json()
+        prompt = data.get("prompt")
+        max_new_tokens = data.get("max_new_tokens", 128)
+        temperature = data.get("temperature", 0.7)
+
+        if not prompt:
+            return {"error": "Prompt is required"}, 400
+
+        # Use async_generate for non-blocking generation
+        state = await engine.async_generate(
+            prompt,
+            sampling_params={
+                "max_new_tokens": max_new_tokens,
+                "temperature": temperature,
+            },
+            # Add other parameters like stop, top_p etc. as needed
+        )
+
+        return {"generated_text": state["text"]}
+    except Exception as e:
+        return {"error": str(e)}, 500
+
+
+# Helper function to start the server
+def start_server(args, timeout=60):
+    """Starts the Uvicorn server as a subprocess and waits for it to be ready."""
+    base_url = f"http://{args.host}:{args.port}"
+    command = [
+        "python",
+        "-m",
+        "uvicorn",
+        "fastapi_engine_inference:app",
+        f"--host={args.host}",
+        f"--port={args.port}",
+    ]
+
+    process = subprocess.Popen(command, stdout=None, stderr=None)
+
+    start_time = time.perf_counter()
+    with requests.Session() as session:
+        while time.perf_counter() - start_time < timeout:
+            try:
+                # Check the /docs endpoint which FastAPI provides by default
+                response = session.get(
+                    f"{base_url}/docs", timeout=5
+                )  # Add a request timeout
+                if response.status_code == 200:
+                    print(f"Server {base_url} is ready (responded on /docs)")
+                    return process
+            except requests.ConnectionError:
+                # Specific exception for connection refused/DNS error etc.
+                pass
+            except requests.Timeout:
+                # Specific exception for request timeout
+                print(f"Health check to {base_url}/docs timed out, retrying...")
+                pass
+            except requests.RequestException as e:
+                # Catch other request exceptions
+                print(f"Health check request error: {e}, retrying...")
+                pass
+            # Use a shorter sleep interval for faster startup detection
+            time.sleep(1)
+
+    # If loop finishes, raise the timeout error
+    # Attempt to terminate the failed process before raising
+    if process:
+        print(
+            "Server failed to start within timeout, attempting to terminate process..."
+        )
+        terminate_process(process)  # Use the imported terminate_process
+    raise TimeoutError(
+        f"Server failed to start at {base_url} within the timeout period."
+    )
+
+
+def send_requests(server_url, prompts, max_new_tokens, temperature):
+    """Sends generation requests to the running server for a list of prompts."""
+    # Iterate through prompts and send requests
+    for i, prompt in enumerate(prompts):
+        print(f"\n[{i+1}/{len(prompts)}] Sending prompt: '{prompt}'")
+        payload = {
+            "prompt": prompt,
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+        }
+
+        try:
+            response = requests.post(f"{server_url}/generate", json=payload, timeout=60)
+
+            result = response.json()
+
+            print(f"Prompt: {prompt}\nResponse: {result['generated_text']}")
+
+        except requests.exceptions.Timeout:
+            print(f"  Error: Request timed out for prompt '{prompt}'")
+        except requests.exceptions.RequestException as e:
+            print(f"  Error sending request for prompt '{prompt}': {e}")
+
+
+if __name__ == "__main__":
+    """Main entry point for the script."""
+
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model-path", type=str, default="Qwen/Qwen2.5-0.5B-Instruct")
+    parser.add_argument("--tp_size", type=int, default=1)
+    args = parser.parse_args()
+
+    # Pass the model to the child uvicorn process via an env var
+    os.environ["MODEL_PATH"] = args.model_path
+    os.environ["TP_SIZE"] = str(args.tp_size)
+
+    # Start the server
+    process = start_server(args)
+
+    # Define the prompts and sampling parameters
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    max_new_tokens = 64
+    temperature = 0.1
+
+    # Define server url
+    server_url = f"http://{args.host}:{args.port}"
+
+    # Send requests to the server
+    send_requests(server_url, prompts, max_new_tokens, temperature)
+
+    # Terminate the server process
+    terminate_process(process)
diff --git a/examples/runtime/engine/launch_engine.py b/examples/runtime/engine/launch_engine.py
new file mode 100644
index 000000000..b97e0bcd5
--- /dev/null
+++ b/examples/runtime/engine/launch_engine.py
@@ -0,0 +1,17 @@
+"""
+This example demonstrates how to launch the offline engine.
+"""
+
+import sglang as sgl
+
+
+def main():
+    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+    llm.generate("What is the capital of France?")
+    llm.shutdown()
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/engine/offline_batch_inference.py b/examples/runtime/engine/offline_batch_inference.py
new file mode 100644
index 000000000..92e68dcd7
--- /dev/null
+++ b/examples/runtime/engine/offline_batch_inference.py
@@ -0,0 +1,43 @@
+"""
+Usage:
+python3 offline_batch_inference.py  --model meta-llama/Llama-3.1-8B-Instruct
+"""
+
+import argparse
+import dataclasses
+
+import sglang as sgl
+from sglang.srt.server_args import ServerArgs
+
+
+def main(
+    server_args: ServerArgs,
+):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+
+    # Create an LLM.
+    llm = sgl.Engine(**dataclasses.asdict(server_args))
+
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    main(server_args)
diff --git a/examples/runtime/engine/offline_batch_inference_async.py b/examples/runtime/engine/offline_batch_inference_async.py
new file mode 100644
index 000000000..578962d78
--- /dev/null
+++ b/examples/runtime/engine/offline_batch_inference_async.py
@@ -0,0 +1,65 @@
+"""
+Usage:
+python offline_batch_inference_async.py --model-path Qwen/Qwen2-VL-7B-Instruct
+
+Note:
+This demo shows the usage of async generation,
+which is useful to implement an online-like generation with batched inference.
+"""
+
+import argparse
+import asyncio
+import dataclasses
+import time
+
+import sglang as sgl
+from sglang.srt.server_args import ServerArgs
+
+
+class InferenceEngine:
+    def __init__(self, **kwargs):
+        self.engine = sgl.Engine(**kwargs)
+
+    async def generate(self, prompt, sampling_params):
+        result = await self.engine.async_generate(prompt, sampling_params)
+        return result
+
+
+async def run_server(server_args):
+    inference = InferenceEngine(**dataclasses.asdict(server_args))
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ] * 100
+
+    # Create a sampling params object.
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+
+    # Run the generation tasks concurrently in async mode.
+    tasks = []
+    for prompt in prompts:
+        task = asyncio.create_task(inference.generate(prompt, sampling_params))
+        tasks.append(task)
+
+    # Get and print the result
+    for task in tasks:
+        await task
+        while True:
+            if not task.done():
+                time.sleep(1)
+            else:
+                result = task.result()
+                print(f"Generated text: {result['text']}")
+                break
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    asyncio.run(run_server(server_args))
diff --git a/examples/runtime/engine/offline_batch_inference_eagle.py b/examples/runtime/engine/offline_batch_inference_eagle.py
new file mode 100644
index 000000000..a7a89ef5c
--- /dev/null
+++ b/examples/runtime/engine/offline_batch_inference_eagle.py
@@ -0,0 +1,38 @@
+import sglang as sgl
+
+
+def main():
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params object.
+    sampling_params = {"temperature": 0, "max_new_tokens": 30}
+
+    # Create an LLM.
+    llm = sgl.Engine(
+        model_path="meta-llama/Llama-2-7b-chat-hf",
+        speculative_algorithm="EAGLE",
+        speculative_draft_model_path="lmsys/sglang-EAGLE-llama2-chat-7B",
+        speculative_num_steps=3,
+        speculative_eagle_topk=4,
+        speculative_num_draft_tokens=16,
+        cuda_graph_max_bs=8,
+    )
+
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/engine/offline_batch_inference_qwen_1m.py b/examples/runtime/engine/offline_batch_inference_qwen_1m.py
new file mode 100644
index 000000000..664efa6d7
--- /dev/null
+++ b/examples/runtime/engine/offline_batch_inference_qwen_1m.py
@@ -0,0 +1,74 @@
+"""
+Usage:
+python3 offline_batch_inference.py
+"""
+
+from urllib.request import urlopen
+
+import sglang as sgl
+
+
+def load_prompt() -> str:
+    # Test cases with various lengths can be found at:
+    #
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
+    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
+
+    with urlopen(
+        "https://qianwen-res.oss-cn-beijing.aliyuncs.com"
+        "/Qwen2.5-1M/test-data/64k.txt",
+        timeout=5,
+    ) as response:
+        prompt = response.read().decode("utf-8")
+    return prompt
+
+
+# Processing the prompt.
+def process_requests(llm: sgl.Engine, prompts: list[str]) -> None:
+    # Create a sampling params object.
+    sampling_params = {
+        "temperature": 0.7,
+        "top_p": 0.8,
+        "top_k": 20,
+        "repetition_penalty": 1.05,
+        "max_new_tokens": 256,
+    }
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt_token_ids = output["meta_info"]["prompt_tokens"]
+        generated_text = output["text"]
+        print(
+            f"Prompt length: {prompt_token_ids}, " f"Generated text: {generated_text!r}"
+        )
+
+
+# Create an LLM.
+def initialize_engine() -> sgl.Engine:
+    llm = sgl.Engine(
+        model_path="Qwen/Qwen2.5-7B-Instruct-1M",
+        context_length=1048576,
+        page_size=256,
+        attention_backend="dual_chunk_flash_attn",
+        tp_size=4,
+        disable_radix_cache=True,
+        enable_mixed_chunk=False,
+        enable_torch_compile=False,
+        chunked_prefill_size=131072,
+        mem_fraction_static=0.6,
+        log_level="DEBUG",
+    )
+    return llm
+
+
+def main():
+    llm = initialize_engine()
+    prompt = load_prompt()
+    process_requests(llm, [prompt])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/engine/offline_batch_inference_vlm.py b/examples/runtime/engine/offline_batch_inference_vlm.py
new file mode 100644
index 000000000..392823946
--- /dev/null
+++ b/examples/runtime/engine/offline_batch_inference_vlm.py
@@ -0,0 +1,52 @@
+"""
+Usage:
+python offline_batch_inference_vlm.py --model-path Qwen/Qwen2-VL-7B-Instruct
+"""
+
+import argparse
+import dataclasses
+
+import sglang as sgl
+from sglang.srt.parser.conversation import chat_templates
+from sglang.srt.server_args import ServerArgs
+
+
+def main(
+    server_args: ServerArgs,
+):
+    vlm = sgl.Engine(**dataclasses.asdict(server_args))
+
+    conv = chat_templates[server_args.chat_template].copy()
+    image_token = conv.image_token
+
+    image_url = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
+
+    prompt = f"What's in this image?\n{image_token}"
+
+    sampling_params = {
+        "temperature": 0.001,
+        "max_new_tokens": 30,
+    }
+
+    output = vlm.generate(
+        prompt=prompt,
+        image_data=image_url,
+        sampling_params=sampling_params,
+    )
+
+    print("===============================")
+    print(f"Prompt: {prompt}")
+    print(f"Generated text: {output['text']}")
+
+    vlm.shutdown()
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    server_args = ServerArgs.from_cli_args(args)
+    main(server_args)
diff --git a/examples/runtime/engine/readme.md b/examples/runtime/engine/readme.md
new file mode 100644
index 000000000..fc6e89a6a
--- /dev/null
+++ b/examples/runtime/engine/readme.md
@@ -0,0 +1,54 @@
+# SGLang Engine
+
+SGLang provides a direct inference engine without the need for an HTTP server. There are generally these use cases:
+
+- [Offline Batch Inference](#offline-batch-inference)
+- [Embedding Generation](#embedding-generation)
+- [Custom Server](#custom-server)
+- [Token-In-Token-Out for RLHF](#token-in-token-out-for-rlhf)
+- [Inference Using FastAPI](#inference-using-fastapi)
+
+## Examples
+
+### [Offline Batch Inference](./offline_batch_inference.py)
+
+In this example, we launch an SGLang engine and feed a batch of inputs for inference. If you provide a very large batch, the engine will intelligently schedule the requests to process efficiently and prevent OOM (Out of Memory) errors.
+
+### [Embedding Generation](./embedding.py)
+
+In this example, we launch an SGLang engine and feed a batch of inputs for embedding generation.
+
+### [Custom Server](./custom_server.py)
+
+This example demonstrates how to create a custom server on top of the SGLang Engine. We use [Sanic](https://sanic.dev/en/) as an example. The server supports both non-streaming and streaming endpoints.
+
+#### Steps
+
+1. Install Sanic:
+
+   ```bash
+   pip install sanic
+   ```
+
+2. Run the server:
+
+   ```bash
+   python custom_server
+   ```
+
+3. Send requests:
+
+   ```bash
+   curl -X POST http://localhost:8000/generate  -H "Content-Type: application/json"  -d '{"prompt": "The Transformer architecture is..."}'
+   curl -X POST http://localhost:8000/generate_stream  -H "Content-Type: application/json"  -d '{"prompt": "The Transformer architecture is..."}' --no-buffer
+   ```
+
+   This will send both non-streaming and streaming requests to the server.
+
+### [Token-In-Token-Out for RLHF](../token_in_token_out)
+
+In this example, we launch an SGLang engine, feed tokens as input and generate tokens as output.
+
+### [Inference Using FastAPI](fastapi_engine_inference.py)
+
+This example demonstrates how to create a FastAPI server that uses the SGLang engine for text generation.
diff --git a/examples/runtime/engine/save_remote_state.py b/examples/runtime/engine/save_remote_state.py
new file mode 100644
index 000000000..a428195ca
--- /dev/null
+++ b/examples/runtime/engine/save_remote_state.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Saves each worker's model state dict directly to a checkpoint, which enables a
+fast load path for large tensor-parallel models where each worker only needs to
+read its own shard rather than the entire checkpoint.
+
+Example usage:
+
+python save_remote_state.py \
+    --model-path /path/to/load \
+    --tensor-parallel-size 8 \
+    --remote-model-save-url [protocol]://[host]:[port]/[model_name] \
+
+Then, the model can be loaded with
+
+llm = Engine(
+    model_path="[protocol]://[host]:[port]/[model_name]",
+    tensor_parallel_size=8,
+)
+"""
+import dataclasses
+from argparse import ArgumentParser
+from pathlib import Path
+
+from sglang import Engine, ServerArgs
+
+parser = ArgumentParser()
+ServerArgs.add_cli_args(parser)
+
+parser.add_argument(
+    "--remote-model-save-url",
+    required=True,
+    type=str,
+    help="remote address to store model weights",
+)
+parser.add_argument(
+    "--remote-draft-model-save-url",
+    default=None,
+    type=str,
+    help="remote address to store draft model weights",
+)
+
+
+def main(args):
+    engine_args = ServerArgs.from_cli_args(args)
+    model_path = engine_args.model_path
+    if not Path(model_path).is_dir():
+        raise ValueError("model path must be a local directory")
+    # Create LLM instance from arguments
+    llm = Engine(**dataclasses.asdict(engine_args))
+    llm.save_remote_model(
+        url=args.remote_model_save_url, draft_url=args.remote_draft_model_save_url
+    )
+    print("save remote (draft) model successfully")
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/runtime/engine/save_sharded_state.py b/examples/runtime/engine/save_sharded_state.py
new file mode 100644
index 000000000..80ad5321f
--- /dev/null
+++ b/examples/runtime/engine/save_sharded_state.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Saves each worker's model state dict directly to a checkpoint, which enables a
+fast load path for large tensor-parallel models where each worker only needs to
+read its own shard rather than the entire checkpoint.
+
+Example usage:
+
+python save_sharded_state.py \
+    --model-path /path/to/load \
+    --quantization deepspeedfp \
+    --tensor-parallel-size 8 \
+    --output /path/to/save
+
+Then, the model can be loaded with
+
+llm = Engine(
+    model_path="/path/to/save",
+    load_format="sharded_state",
+    quantization="deepspeedfp",
+    tensor_parallel_size=8,
+)
+"""
+import dataclasses
+import os
+import shutil
+from argparse import ArgumentParser
+from pathlib import Path
+
+from sglang import Engine, ServerArgs
+
+parser = ArgumentParser()
+ServerArgs.add_cli_args(parser)
+
+parser.add_argument(
+    "--output", "-o", required=True, type=str, help="path to output checkpoint"
+)
+parser.add_argument(
+    "--file-pattern", type=str, help="string pattern of saved filenames"
+)
+parser.add_argument(
+    "--max-file-size",
+    type=str,
+    default=5 * 1024**3,
+    help="max size (in bytes) of each safetensors file",
+)
+
+
+def main(args):
+    engine_args = ServerArgs.from_cli_args(args)
+    model_path = engine_args.model_path
+    if not Path(model_path).is_dir():
+        raise ValueError("model path must be a local directory")
+    # Create LLM instance from arguments
+    llm = Engine(**dataclasses.asdict(engine_args))
+    Path(args.output).mkdir(exist_ok=True)
+    llm.save_sharded_model(
+        path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
+    )
+
+    # Copy metadata files to output directory
+    for file in os.listdir(model_path):
+        if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"):
+            if os.path.isdir(os.path.join(model_path, file)):
+                shutil.copytree(
+                    os.path.join(model_path, file), os.path.join(args.output, file)
+                )
+            else:
+                shutil.copy(os.path.join(model_path, file), args.output)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/runtime/hidden_states/hidden_states_engine.py b/examples/runtime/hidden_states/hidden_states_engine.py
new file mode 100644
index 000000000..60ab302ca
--- /dev/null
+++ b/examples/runtime/hidden_states/hidden_states_engine.py
@@ -0,0 +1,66 @@
+"""
+Usage:
+python hidden_states.py
+
+Note that each time you change the `return_hidden_states` parameter,
+the cuda graph will be recaptured, which might lead to a performance hit.
+So avoid getting hidden states and completions alternately.
+"""
+
+import torch
+
+import sglang as sgl
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create an LLM.
+    llm = sgl.Engine(
+        model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
+        enable_return_hidden_states=True,
+    )
+
+    sampling_params = {
+        "temperature": 0.8,
+        "top_p": 0.95,
+        "max_new_tokens": 10,
+    }
+
+    outputs = llm.generate(
+        prompts, sampling_params=sampling_params, return_hidden_states=True
+    )
+
+    llm.shutdown()
+
+    for prompt, output in zip(prompts, outputs):
+        for i in range(len(output["meta_info"]["hidden_states"])):
+            output["meta_info"]["hidden_states"][i] = torch.tensor(
+                output["meta_info"]["hidden_states"][i], dtype=torch.bfloat16
+            )
+        print("===============================")
+        print(
+            f"Prompt: {prompt}\n"
+            f"Generated text: {output['text']}\n"
+            f"Prompt_Tokens: {output['meta_info']['prompt_tokens']}\t"
+            f"Completion_tokens: {output['meta_info']['completion_tokens']}"
+        )
+        print("Hidden states: ")
+        hidden_states = torch.cat(
+            [
+                i.unsqueeze(0) if len(i.shape) == 1 else i
+                for i in output["meta_info"]["hidden_states"]
+            ]
+        )
+        print(hidden_states)
+        print()
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/hidden_states/hidden_states_server.py b/examples/runtime/hidden_states/hidden_states_server.py
new file mode 100644
index 000000000..b04f74372
--- /dev/null
+++ b/examples/runtime/hidden_states/hidden_states_server.py
@@ -0,0 +1,81 @@
+"""
+Usage:
+
+python hidden_states_server.py
+
+Note that each time you change the `return_hidden_states` parameter,
+the cuda graph will be recaptured, which might lead to a performance hit.
+So avoid getting hidden states and completions alternately.
+"""
+
+import requests
+import torch
+
+from sglang.test.test_utils import is_in_ci
+from sglang.utils import terminate_process, wait_for_server
+
+if is_in_ci():
+    from docs.backend.patch import launch_server_cmd
+else:
+    from sglang.utils import launch_server_cmd
+
+
+def main():
+    # Launch the server
+    server_process, port = launch_server_cmd(
+        "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct --enable-return-hidden-states --host 0.0.0.0"
+    )
+    wait_for_server(f"http://localhost:{port}")
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    sampling_params = {
+        "temperature": 0.8,
+        "top_p": 0.95,
+        "max_new_tokens": 10,
+    }
+
+    json_data = {
+        "text": prompts,
+        "sampling_params": sampling_params,
+        "return_hidden_states": True,
+    }
+
+    response = requests.post(
+        f"http://localhost:{port}/generate",
+        json=json_data,
+    )
+
+    terminate_process(server_process)
+
+    outputs = response.json()
+    for prompt, output in zip(prompts, outputs):
+        for i in range(len(output["meta_info"]["hidden_states"])):
+            output["meta_info"]["hidden_states"][i] = torch.tensor(
+                output["meta_info"]["hidden_states"][i], dtype=torch.bfloat16
+            )
+        print("===============================")
+        print(
+            f"Prompt: {prompt}\n"
+            f"Generated text: {output['text']}\n"
+            f"Prompt_Tokens: {output['meta_info']['prompt_tokens']}\t"
+            f"Completion_tokens: {output['meta_info']['completion_tokens']}"
+        )
+        print("Hidden states: ")
+        hidden_states = torch.cat(
+            [
+                i.unsqueeze(0) if len(i.shape) == 1 else i
+                for i in output["meta_info"]["hidden_states"]
+            ]
+        )
+        print(hidden_states)
+        print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/lora.py b/examples/runtime/lora.py
new file mode 100644
index 000000000..bf3fc2d9e
--- /dev/null
+++ b/examples/runtime/lora.py
@@ -0,0 +1,37 @@
+# launch server
+# python -m sglang.launch_server --model mistralai/Mistral-7B-Instruct-v0.3 --lora-paths /home/ying/test_lora lora1=/home/ying/test_lora_1 lora2=/home/ying/test_lora_2 --disable-radix --disable-cuda-graph --max-loras-per-batch 4
+
+# send requests
+# lora_path[i] specifies the LoRA used for text[i], so make sure they have the same length
+# use None to specify base-only prompt, e.x. "lora_path": [None, "/home/ying/test_lora"]
+import json
+
+import requests
+
+url = "http://127.0.0.1:30000"
+json_data = {
+    "text": [
+        "prompt 1",
+        "prompt 2",
+        "prompt 3",
+        "prompt 4",
+        "prompt 5",
+        "prompt 6",
+        "prompt 7",
+    ],
+    "sampling_params": {"max_new_tokens": 32},
+    "lora_path": [
+        "/home/ying/test_lora",
+        "lora1",
+        "lora2",
+        "lora1",
+        "lora2",
+        None,
+        None,
+    ],
+}
+response = requests.post(
+    url + "/generate",
+    json=json_data,
+)
+print(json.dumps(response.json()))
diff --git a/examples/runtime/multimodal/llama3_llava_server.py b/examples/runtime/multimodal/llama3_llava_server.py
new file mode 100644
index 000000000..a8409af71
--- /dev/null
+++ b/examples/runtime/multimodal/llama3_llava_server.py
@@ -0,0 +1,111 @@
+"""
+Usage:
+# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
+# Installing latest sglang.
+
+# Endpoint Service CLI:
+python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000
+
+python3 llama3_llava_server.py
+
+Output:
+"Friends posing for a fun photo with a life-sized teddy bear, creating a playful and memorable moment."
+"""
+
+import argparse
+import asyncio
+import copy
+import json
+
+import aiohttp
+import requests
+from llava.conversation import conv_llava_llama_3
+
+
+async def send_request(url, data, delay=0):
+    await asyncio.sleep(delay)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            output = await resp.json()
+    return output
+
+
+async def test_concurrent(args):
+    url = f"{args.host}:{args.port}"
+
+    prompt = "<image>\nPlease generate caption towards this image."
+    conv_template = copy.deepcopy(conv_llava_llama_3)
+    conv_template.append_message(role=conv_template.roles[0], message=prompt)
+    conv_template.append_message(role=conv_template.roles[1], message=None)
+    prompt_with_template = conv_template.get_prompt()
+    response = []
+    for i in range(1):
+        response.append(
+            send_request(
+                url + "/generate",
+                {
+                    "text": prompt_with_template,
+                    "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
+                    "sampling_params": {
+                        "max_new_tokens": 1024,
+                        "temperature": 0,
+                        "top_p": 1.0,
+                        "presence_penalty": 2,
+                        "frequency_penalty": 2,
+                        "stop": "<|eot_id|>",
+                    },
+                },
+            )
+        )
+
+    rets = await asyncio.gather(*response)
+    for ret in rets:
+        print(ret["text"])
+
+
+def test_streaming(args):
+    url = f"{args.host}:{args.port}"
+    prompt = "<image>\nPlease generate caption towards this image."
+    conv_template = copy.deepcopy(conv_llava_llama_3)
+    conv_template.append_message(role=conv_template.roles[0], message=prompt)
+    conv_template.append_message(role=conv_template.roles[1], message=None)
+    prompt_with_template = conv_template.get_prompt()
+    pload = {
+        "text": prompt_with_template,
+        "sampling_params": {
+            "max_new_tokens": 1024,
+            "temperature": 0,
+            "top_p": 1.0,
+            "presence_penalty": 2,
+            "frequency_penalty": 2,
+            "stop": "<|eot_id|>",
+        },
+        "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
+        "stream": True,
+    }
+    response = requests.post(
+        url + "/generate",
+        json=pload,
+        stream=True,
+    )
+
+    prev = 0
+    for chunk in response.iter_lines(decode_unicode=False):
+        chunk = chunk.decode("utf-8")
+        if chunk and chunk.startswith("data:"):
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+            output = data["text"].strip()
+            print(output[prev:], end="", flush=True)
+            prev = len(output)
+    print("")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+    asyncio.run(test_concurrent(args))
+    test_streaming(args)
diff --git a/examples/runtime/multimodal/llava_onevision_server.py b/examples/runtime/multimodal/llava_onevision_server.py
new file mode 100644
index 000000000..ee921b558
--- /dev/null
+++ b/examples/runtime/multimodal/llava_onevision_server.py
@@ -0,0 +1,264 @@
+"""
+Usage:
+
+python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8
+
+python3 llava_onevision_server.py
+"""
+
+import base64
+import io
+import os
+import sys
+import time
+
+import numpy as np
+import openai
+import requests
+from decord import VideoReader, cpu
+from PIL import Image
+
+# pip install httpx==0.23.3
+# pip install decord
+# pip install protobuf==3.20.0
+
+
+def download_video(url, cache_dir):
+    file_path = os.path.join(cache_dir, "jobs.mp4")
+    os.makedirs(cache_dir, exist_ok=True)
+
+    response = requests.get(url)
+    response.raise_for_status()
+
+    with open(file_path, "wb") as f:
+        f.write(response.content)
+
+    print(f"File downloaded and saved to: {file_path}")
+    return file_path
+
+
+def create_openai_client(base_url):
+    return openai.Client(api_key="EMPTY", base_url=base_url)
+
+
+def image_stream_request_test(client):
+    print("----------------------Image Stream Request Test----------------------")
+    stream_request = client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
+                        },
+                    },
+                    {
+                        "type": "text",
+                        "text": "Please describe this image. Please list the benchmarks and the models.",
+                    },
+                ],
+            },
+        ],
+        temperature=0.7,
+        max_tokens=1024,
+        stream=True,
+    )
+    stream_response = ""
+
+    for chunk in stream_request:
+        if chunk.choices[0].delta.content is not None:
+            content = chunk.choices[0].delta.content
+            stream_response += content
+            sys.stdout.write(content)
+            sys.stdout.flush()
+
+    print("-" * 30)
+
+
+def multi_image_stream_request_test(client):
+    print(
+        "----------------------Multi-Images Stream Request Test----------------------"
+    )
+    stream_request = client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
+                        },
+                        "modalities": "multi-images",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
+                        },
+                        "modalities": "multi-images",
+                    },
+                    {
+                        "type": "text",
+                        "text": "I have shown you two images. Please describe the two images to me.",
+                    },
+                ],
+            },
+        ],
+        temperature=0.7,
+        max_tokens=1024,
+        stream=True,
+    )
+    stream_response = ""
+
+    for chunk in stream_request:
+        if chunk.choices[0].delta.content is not None:
+            content = chunk.choices[0].delta.content
+            stream_response += content
+            sys.stdout.write(content)
+            sys.stdout.flush()
+
+    print("-" * 30)
+
+
+def video_stream_request_test(client, video_path):
+    print("------------------------Video Stream Request Test----------------------")
+    messages = prepare_video_messages(video_path)
+
+    video_request = client.chat.completions.create(
+        model="default",
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+        stream=True,
+    )
+    print("-" * 30)
+    video_response = ""
+
+    for chunk in video_request:
+        if chunk.choices[0].delta.content is not None:
+            content = chunk.choices[0].delta.content
+            video_response += content
+            sys.stdout.write(content)
+            sys.stdout.flush()
+    print("-" * 30)
+
+
+def image_speed_test(client):
+    print("----------------------Image Speed Test----------------------")
+    start_time = time.perf_counter()
+    request = client.chat.completions.create(
+        model="default",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png"
+                        },
+                    },
+                    {
+                        "type": "text",
+                        "text": "Please describe this image. Please list the benchmarks and the models.",
+                    },
+                ],
+            },
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+    end_time = time.perf_counter()
+    response = request.choices[0].message.content
+    print(response)
+    print("-" * 30)
+    print_speed_test_results(request, start_time, end_time)
+
+
+def video_speed_test(client, video_path):
+    print("------------------------Video Speed Test------------------------")
+    messages = prepare_video_messages(video_path)
+
+    start_time = time.perf_counter()
+    video_request = client.chat.completions.create(
+        model="default",
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    end_time = time.perf_counter()
+    video_response = video_request.choices[0].message.content
+    print(video_response)
+    print("-" * 30)
+    print_speed_test_results(video_request, start_time, end_time)
+
+
+def prepare_video_messages(video_path):
+    max_frames_num = 32
+    vr = VideoReader(video_path, ctx=cpu(0))
+    total_frame_num = len(vr)
+    uniform_sampled_frames = np.linspace(
+        0, total_frame_num - 1, max_frames_num, dtype=int
+    )
+    frame_idx = uniform_sampled_frames.tolist()
+    frames = vr.get_batch(frame_idx).asnumpy()
+
+    base64_frames = []
+    for frame in frames:
+        pil_img = Image.fromarray(frame)
+        buff = io.BytesIO()
+        pil_img.save(buff, format="JPEG")
+        base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
+        base64_frames.append(base64_str)
+
+    messages = [{"role": "user", "content": []}]
+
+    for base64_frame in base64_frames:
+        frame_format = {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{base64_frame}"},
+            "modalities": "video",
+        }
+        messages[0]["content"].append(frame_format)
+
+    prompt = {"type": "text", "text": "Please describe the video in detail."}
+    messages[0]["content"].append(prompt)
+
+    return messages
+
+
+def print_speed_test_results(request, start_time, end_time):
+    total_tokens = request.usage.total_tokens
+    completion_tokens = request.usage.completion_tokens
+    prompt_tokens = request.usage.prompt_tokens
+
+    print(f"Total tokens: {total_tokens}")
+    print(f"Completion tokens: {completion_tokens}")
+    print(f"Prompt tokens: {prompt_tokens}")
+    print(f"Time taken: {end_time - start_time} seconds")
+    print(f"Token per second: {total_tokens / (end_time - start_time)}")
+    print(f"Completion token per second: {completion_tokens / (end_time - start_time)}")
+    print(f"Prompt token per second: {prompt_tokens / (end_time - start_time)}")
+
+
+def main():
+    url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
+    cache_dir = os.path.expanduser("~/.cache")
+    video_path = download_video(url, cache_dir)
+
+    client = create_openai_client("http://127.0.0.1:30000/v1")
+
+    image_stream_request_test(client)
+    multi_image_stream_request_test(client)
+    video_stream_request_test(client, video_path)
+    image_speed_test(client)
+    video_speed_test(client, video_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/multimodal/pixtral_server.py b/examples/runtime/multimodal/pixtral_server.py
new file mode 100644
index 000000000..d907de14d
--- /dev/null
+++ b/examples/runtime/multimodal/pixtral_server.py
@@ -0,0 +1,127 @@
+"""
+Usage:
+# Run a Pixtral model with SGLang:
+# HuggingFace:
+python -m sglang.launch_server --model-path mistral-community/pixtral-12b --port=30000
+# ModelScope:
+python -m sglang.launch_server --model-path AI-ModelScope/pixtral-12b --port=30000
+
+# Then test it with:
+python pixtral_server.py
+
+This script tests Pixtral model with both single and multiple images.
+"""
+
+import argparse
+import asyncio
+import json
+
+import aiohttp
+import requests
+
+IMAGE_TOKEN_SEP = "\n[IMG]"
+ROUTE = "/generate"
+
+
+async def send_request(url, data, delay=0):
+    await asyncio.sleep(delay)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            output = await resp.json()
+    return output
+
+
+async def test_concurrent(args):
+    url = f"{args.host}:{args.port}{ROUTE}"
+
+    # Single image test
+    if args.single_image:
+        prompt = f"<s>[INST]Describe this image in detail.{IMAGE_TOKEN_SEP}[/INST]"
+        image_url = "https://picsum.photos/id/237/400/300"
+        modality = ["image"]
+    # Multiple images test
+    else:
+        image_urls = [
+            "https://picsum.photos/id/237/400/300",
+            "https://picsum.photos/id/27/500/500",
+        ]
+        prompt = f"<s>[INST]How many photos are there? Describe each in a very short sentence.{IMAGE_TOKEN_SEP * len(image_urls)}[/INST]"
+        image_url = image_urls
+        modality = ["multi-images"]
+
+    response = await send_request(
+        url,
+        {
+            "text": prompt,
+            "image_data": image_url,
+            "sampling_params": {
+                "max_new_tokens": 100,
+                "temperature": 0.7,
+                "top_p": 0.9,
+            },
+            "modalities": modality,
+        },
+    )
+
+    print(f"Response: {response}")
+    if "text" in response:
+        print("\nOutput text:", response["text"])
+
+
+def test_streaming(args):
+    url = f"{args.host}:{args.port}/generate"
+
+    # Single image test
+    if args.single_image:
+        prompt = f"<s>[INST]Describe this image in detail.{IMAGE_TOKEN_SEP}[/INST]"
+        image_data = "https://picsum.photos/id/237/400/300"
+        modality = ["image"]
+    # Multiple images test
+    else:
+        image_urls = [
+            "https://picsum.photos/id/237/400/300",
+            "https://picsum.photos/id/27/500/500",
+        ]
+        prompt = f"<s>[INST]How many photos are there? Describe each in a very short sentence.{IMAGE_TOKEN_SEP * len(image_urls)}[/INST]"
+        image_data = image_urls
+        modality = ["multi-images"]
+
+    pload = {
+        "text": prompt,
+        "image_data": image_data,
+        "sampling_params": {"max_new_tokens": 100, "temperature": 0.7, "top_p": 0.9},
+        "modalities": modality,
+        "stream": True,
+    }
+
+    response = requests.post(url, json=pload, stream=True)
+
+    print("Streaming response:")
+    prev = 0
+    for chunk in response.iter_lines(decode_unicode=False):
+        chunk = chunk.decode("utf-8")
+        if chunk and chunk.startswith("data:"):
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+            output = data["text"].strip()
+            print(output[prev:], end="", flush=True)
+            prev = len(output)
+    print("\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    parser.add_argument(
+        "--single-image",
+        action="store_true",
+        help="Test with single image instead of multiple images",
+    )
+    parser.add_argument("--no-stream", action="store_true", help="Don't test streaming")
+    args = parser.parse_args()
+
+    asyncio.run(test_concurrent(args))
+    if not args.no_stream:
+        test_streaming(args)
diff --git a/examples/runtime/multimodal/qwen_llava_server.py b/examples/runtime/multimodal/qwen_llava_server.py
new file mode 100644
index 000000000..d8b3226e7
--- /dev/null
+++ b/examples/runtime/multimodal/qwen_llava_server.py
@@ -0,0 +1,111 @@
+"""
+Usage:
+# Installing latest llava-next: pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
+# Installing latest sglang.
+
+# Endpoint Service CLI:
+python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8
+
+python3 qwen_llava_server.py
+
+Output:
+"Two children pose with a large teddy bear, one holding a smaller stuffed bear, in a room with an American flag and potted plants."
+"""
+
+import argparse
+import asyncio
+import copy
+import json
+
+import aiohttp
+import requests
+from llava.conversation import conv_qwen
+
+
+async def send_request(url, data, delay=0):
+    await asyncio.sleep(delay)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            output = await resp.json()
+    return output
+
+
+async def test_concurrent(args):
+    url = f"{args.host}:{args.port}"
+
+    prompt = "<image>\nPlease generate caption towards this image."
+    conv_template = copy.deepcopy(conv_qwen)
+    conv_template.append_message(role=conv_template.roles[0], message=prompt)
+    conv_template.append_message(role=conv_template.roles[1], message=None)
+    prompt_with_template = conv_template.get_prompt()
+    response = []
+    for i in range(1):
+        response.append(
+            send_request(
+                url + "/generate",
+                {
+                    "text": prompt_with_template,
+                    "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
+                    "sampling_params": {
+                        "max_new_tokens": 1024,
+                        "temperature": 0,
+                        "top_p": 1.0,
+                        "presence_penalty": 2,
+                        "frequency_penalty": 2,
+                        "stop": "<|im_end|>",
+                    },
+                },
+            )
+        )
+
+    rets = await asyncio.gather(*response)
+    for ret in rets:
+        print(ret["text"])
+
+
+def test_streaming(args):
+    url = f"{args.host}:{args.port}"
+    prompt = "<image>\nPlease generate caption towards this image."
+    conv_template = copy.deepcopy(conv_qwen)
+    conv_template.append_message(role=conv_template.roles[0], message=prompt)
+    conv_template.append_message(role=conv_template.roles[1], message=None)
+    prompt_with_template = conv_template.get_prompt()
+    pload = {
+        "text": prompt_with_template,
+        "sampling_params": {
+            "max_new_tokens": 1024,
+            "temperature": 0,
+            "top_p": 1.0,
+            "presence_penalty": 2,
+            "frequency_penalty": 2,
+            "stop": "<|im_end|>",
+        },
+        "image_data": "https://farm4.staticflickr.com/3175/2653711032_804ff86d81_z.jpg",
+        "stream": True,
+    }
+    response = requests.post(
+        url + "/generate",
+        json=pload,
+        stream=True,
+    )
+
+    prev = 0
+    for chunk in response.iter_lines(decode_unicode=False):
+        chunk = chunk.decode("utf-8")
+        if chunk and chunk.startswith("data:"):
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+            output = data["text"].strip()
+            print(output[prev:], end="", flush=True)
+            prev = len(output)
+    print("")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+    asyncio.run(test_concurrent(args))
+    test_streaming(args)
diff --git a/examples/runtime/multimodal_embedding.py b/examples/runtime/multimodal_embedding.py
new file mode 100644
index 000000000..4e8d748b4
--- /dev/null
+++ b/examples/runtime/multimodal_embedding.py
@@ -0,0 +1,18 @@
+# launch server
+# python -m sglang.launch_server --model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct --is-embedding
+
+import requests
+
+url = "http://127.0.0.1:30000"
+
+text_input = "Represent this image in embedding space."
+image_path = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
+
+payload = {
+    "model": "gme-qwen2-vl",
+    "input": [{"text": text_input}, {"image": image_path}],
+}
+
+response = requests.post(url + "/v1/embeddings", json=payload).json()
+
+print("Embeddings:", [x.get("embedding") for x in response.get("data", [])])
diff --git a/examples/runtime/openai_chat_with_response_prefill.py b/examples/runtime/openai_chat_with_response_prefill.py
new file mode 100644
index 000000000..6d803a1d1
--- /dev/null
+++ b/examples/runtime/openai_chat_with_response_prefill.py
@@ -0,0 +1,53 @@
+"""
+Usage:
+1) Launch the server in one terminal:
+   python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
+
+2) Run this script in another terminal:
+   python openai_chat_with_response_prefill.py
+
+This example demonstrates two chat completion calls:
+- One with continue_final_message enabled (the final assistant message is used as a prefill).
+- One without continue_final_message (the final assistant message remains, starting a new turn).
+"""
+
+import openai
+
+client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+messages = [
+    {"role": "system", "content": "You are a helpful AI assistant."},
+    {
+        "role": "user",
+        "content": """
+Extract the name, size, price, and color from this product description as a JSON object:
+
+<description>
+The SmartHome Mini is a compact smart home assistant available in black or white for only $49.99.
+At just 5 inches wide, it lets you control lights, thermostats, and other connected devices via voice or app—
+no matter where you place it in your home.
+This affordable little hub brings convenient hands-free control to your smart devices.
+</description>
+""",
+    },
+    {"role": "assistant", "content": "{\n"},
+]
+
+# Calling the API with continue_final_message enabled.
+print("=== Prefill with continue_final_messagem ===")
+response_with = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=messages,
+    temperature=0,
+    extra_body={"continue_final_message": True},
+)
+print(response_with.choices[0].message.content)
+
+# Calling the API without continue_final_message (using default behavior).
+print("\n=== Prefill without continue_final_message ===")
+response_without = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    messages=messages,
+    temperature=0,
+)
+print(response_without.choices[0].message.content)
diff --git a/examples/runtime/reward_model.py b/examples/runtime/reward_model.py
new file mode 100644
index 000000000..1a1177e66
--- /dev/null
+++ b/examples/runtime/reward_model.py
@@ -0,0 +1,32 @@
+# launch server
+# python -m sglang.launch_server --model LxzGordon/URM-LLaMa-3.1-8B --is-embedding
+
+import requests
+
+url = "http://127.0.0.1:30000"
+
+PROMPT = (
+    "What is the range of the numeric output of a sigmoid node in a neural network?"
+)
+RESPONSE1 = "The output of a sigmoid node is bounded between -1 and 1."
+RESPONSE2 = "The output of a sigmoid node is bounded between 0 and 1."
+
+json_data = {
+    "conv": [
+        [
+            {"role": "user", "content": PROMPT},
+            {"role": "assistant", "content": RESPONSE1},
+        ],
+        [
+            {"role": "user", "content": PROMPT},
+            {"role": "assistant", "content": RESPONSE2},
+        ],
+    ],
+}
+response = requests.post(
+    url + "/classify",
+    json=json_data,
+).json()
+
+print(response)
+print("scores:", [x["embedding"] for x in response])
diff --git a/examples/runtime/token_in_token_out/token_in_token_out_llm_engine.py b/examples/runtime/token_in_token_out/token_in_token_out_llm_engine.py
new file mode 100644
index 000000000..cb1b7ddc1
--- /dev/null
+++ b/examples/runtime/token_in_token_out/token_in_token_out_llm_engine.py
@@ -0,0 +1,43 @@
+"""
+This example demonstrates how to provide tokenized ids to LLM as input instead of text prompt, i.e. a token-in-token-out workflow.
+"""
+
+import sglang as sgl
+from sglang.srt.hf_transformers_utils import get_tokenizer
+
+MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"
+
+
+def main():
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # Create a sampling params object.
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+
+    # Tokenize inputs
+    tokenizer = get_tokenizer(MODEL_PATH)
+    token_ids_list = [tokenizer.encode(prompt) for prompt in prompts]
+
+    # Create an LLM.
+    llm = sgl.Engine(model_path=MODEL_PATH, skip_tokenizer_init=True)
+
+    outputs = llm.generate(input_ids=token_ids_list, sampling_params=sampling_params)
+    # Print the outputs.
+    for prompt, output in zip(prompts, outputs):
+        decode_output = tokenizer.decode(output["output_ids"])
+        print("===============================")
+        print(
+            f"Prompt: {prompt}\nGenerated token ids: {output['output_ids']}\nGenerated text: {decode_output}"
+        )
+        print()
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/token_in_token_out/token_in_token_out_llm_server.py b/examples/runtime/token_in_token_out/token_in_token_out_llm_server.py
new file mode 100644
index 000000000..00c0988b2
--- /dev/null
+++ b/examples/runtime/token_in_token_out/token_in_token_out_llm_server.py
@@ -0,0 +1,68 @@
+"""
+Usage:
+
+python token_in_token_out_llm_server.py
+
+"""
+
+import requests
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.test.test_utils import is_in_ci
+from sglang.utils import terminate_process, wait_for_server
+
+if is_in_ci():
+    from docs.backend.patch import launch_server_cmd
+else:
+    from sglang.utils import launch_server_cmd
+
+
+MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"
+
+
+def main():
+    # Launch the server
+    server_process, port = launch_server_cmd(
+        f"python -m sglang.launch_server --model-path {MODEL_PATH} --skip-tokenizer-init --host 0.0.0.0"
+    )
+    wait_for_server(f"http://localhost:{port}")
+
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create a sampling params object.
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+
+    # Tokenize inputs
+    tokenizer = get_tokenizer(MODEL_PATH)
+    token_ids_list = [tokenizer.encode(prompt) for prompt in prompts]
+
+    json_data = {
+        "input_ids": token_ids_list,
+        "sampling_params": sampling_params,
+    }
+
+    response = requests.post(
+        f"http://localhost:{port}/generate",
+        json=json_data,
+    )
+
+    outputs = response.json()
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        decode_output = tokenizer.decode(output["output_ids"])
+        print(
+            f"Prompt: {prompt}\nGenerated token ids: {output['output_ids']}\nGenerated text: {decode_output}"
+        )
+        print()
+
+    terminate_process(server_process)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/token_in_token_out/token_in_token_out_vlm_engine.py b/examples/runtime/token_in_token_out/token_in_token_out_vlm_engine.py
new file mode 100644
index 000000000..bf6f19719
--- /dev/null
+++ b/examples/runtime/token_in_token_out/token_in_token_out_vlm_engine.py
@@ -0,0 +1,74 @@
+import argparse
+import dataclasses
+from typing import Tuple
+
+from transformers import AutoProcessor
+
+from sglang import Engine
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.server_args import ServerArgs
+from sglang.test.test_utils import DEFAULT_IMAGE_URL
+
+
+def get_input_ids(
+    server_args: ServerArgs, model_config: ModelConfig
+) -> Tuple[list[int], list]:
+    chat_template = get_chat_template_by_model_path(model_config.model_path)
+    text = f"{chat_template.image_token}What is in this picture?"
+    image_data = [DEFAULT_IMAGE_URL]
+
+    processor = AutoProcessor.from_pretrained(
+        model_config.model_path, trust_remote_code=server_args.trust_remote_code
+    )
+
+    input_ids = (
+        processor.tokenizer(
+            text=[text],
+            return_tensors="pt",
+        )
+        .input_ids[0]
+        .tolist()
+    )
+
+    return input_ids, image_data
+
+
+def token_in_out_example(
+    server_args: ServerArgs,
+):
+    input_ids, image_data = get_input_ids(
+        server_args,
+        ModelConfig(
+            server_args.model_path,
+            trust_remote_code=server_args.trust_remote_code,
+            model_override_args=server_args.json_model_override_args,
+        ),
+    )
+    backend = Engine(**dataclasses.asdict(server_args))
+
+    output = backend.generate(
+        input_ids=input_ids,
+        image_data=image_data,
+        sampling_params={
+            "temperature": 0.8,
+            "max_new_tokens": 32,
+        },
+    )
+
+    print("===============================")
+    print(f"Output token ids: ", output["output_ids"])
+
+    backend.shutdown()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    args = [
+        "--model-path=Qwen/Qwen2-VL-2B",
+    ]
+    args = parser.parse_args(args=args)
+    server_args = ServerArgs.from_cli_args(args)
+    server_args.skip_tokenizer_init = True
+    token_in_out_example(server_args)
diff --git a/examples/runtime/token_in_token_out/token_in_token_out_vlm_server.py b/examples/runtime/token_in_token_out/token_in_token_out_vlm_server.py
new file mode 100644
index 000000000..392e1bf0e
--- /dev/null
+++ b/examples/runtime/token_in_token_out/token_in_token_out_vlm_server.py
@@ -0,0 +1,78 @@
+"""
+Usage:
+
+python token_in_token_out_vlm_server.py
+
+"""
+
+from typing import Tuple
+
+import requests
+from transformers import AutoProcessor
+
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.test.test_utils import DEFAULT_IMAGE_URL, is_in_ci
+from sglang.utils import terminate_process, wait_for_server
+
+if is_in_ci():
+    from docs.backend.patch import launch_server_cmd
+else:
+    from sglang.utils import launch_server_cmd
+
+
+MODEL_PATH = "Qwen/Qwen2-VL-2B"
+
+
+def get_input_ids() -> Tuple[list[int], list]:
+    chat_template = get_chat_template_by_model_path(MODEL_PATH)
+    text = f"{chat_template.image_token}What is in this picture?"
+    image_data = [DEFAULT_IMAGE_URL]
+
+    processor = AutoProcessor.from_pretrained(MODEL_PATH)
+
+    input_ids = (
+        processor.tokenizer(
+            text=[text],
+            return_tensors="pt",
+        )
+        .input_ids[0]
+        .tolist()
+    )
+
+    return input_ids, image_data
+
+
+def main():
+    # Launch the server
+    server_process, port = launch_server_cmd(
+        f"python -m sglang.launch_server --model-path {MODEL_PATH} --skip-tokenizer-init --host 0.0.0.0"
+    )
+    wait_for_server(f"http://localhost:{port}")
+
+    input_ids, image_data = get_input_ids()
+
+    sampling_params = {
+        "temperature": 0.8,
+        "max_new_tokens": 32,
+    }
+
+    json_data = {
+        "input_ids": input_ids,
+        "image_data": image_data,
+        "sampling_params": sampling_params,
+    }
+
+    response = requests.post(
+        f"http://localhost:{port}/generate",
+        json=json_data,
+    )
+
+    output = response.json()
+    print("===============================")
+    print(f"Output token ids: ", output["output_ids"])
+
+    terminate_process(server_process)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/runtime/vertex_predict.py b/examples/runtime/vertex_predict.py
new file mode 100644
index 000000000..58a41b1c4
--- /dev/null
+++ b/examples/runtime/vertex_predict.py
@@ -0,0 +1,66 @@
+"""
+Usage:
+python -m sglang.launch_server --model meta-llama/Llama-2-7b-hf --port 30000
+python vertex_predict.py
+
+This example shows the request and response formats of the prediction route for
+Google Cloud Vertex AI Online Predictions.
+
+Vertex AI SDK for Python is recommended for deploying models to Vertex AI
+instead of a local server. After deploying the model to a Vertex AI Online
+Prediction Endpoint, send requests via the Python SDK:
+
+response = endpoint.predict(
+    instances=[
+        {"text": "The capital of France is"},
+        {"text": "What is a car?"},
+    ],
+    parameters={"sampling_params": {"max_new_tokens": 16}},
+)
+print(response.predictions)
+
+More details about get online predictions from Vertex AI can be found at
+https://cloud.google.com/vertex-ai/docs/predictions/get-online-predictions.
+"""
+
+from dataclasses import dataclass
+from typing import List, Optional
+
+import requests
+
+
+@dataclass
+class VertexPrediction:
+    predictions: List
+
+
+class LocalVertexEndpoint:
+    def __init__(self) -> None:
+        self.base_url = "http://127.0.0.1:30000"
+
+    def predict(self, instances: List[dict], parameters: Optional[dict] = None):
+        response = requests.post(
+            self.base_url + "/vertex_generate",
+            json={
+                "instances": instances,
+                "parameters": parameters,
+            },
+        )
+        return VertexPrediction(predictions=response.json()["predictions"])
+
+
+endpoint = LocalVertexEndpoint()
+
+# Predict with a single prompt.
+response = endpoint.predict(instances=[{"text": "The capital of France is"}])
+print(response.predictions)
+
+# Predict with multiple prompts and parameters.
+response = endpoint.predict(
+    instances=[
+        {"text": "The capital of France is"},
+        {"text": "What is a car?"},
+    ],
+    parameters={"sampling_params": {"max_new_tokens": 16}},
+)
+print(response.predictions)
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 000000000..54cb66f0b
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,6 @@
+{
+  "name": "sglang",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {}
+}
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100755
index 000000000..f2e69b3c0
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,167 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sglang"
+version = "0.5.2"
+description = "SGLang is a fast serving framework for large language models and vision language models."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+]
+dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle"]
+
+[project.optional-dependencies]
+runtime_common = [
+    "blobfile==3.0.0",
+    "build",
+    "compressed-tensors",
+    "datasets",
+    "einops",
+    "fastapi",
+    "hf_transfer",
+    "huggingface_hub",
+    "interegular",
+    "llguidance>=0.7.11,<0.8.0",
+    "modelscope",
+    "msgspec",
+    "ninja",
+    "openai==1.99.1",
+    "openai-harmony==0.0.4",
+    "orjson",
+    "outlines==0.1.11",
+    "packaging",
+    "partial_json_parser",
+    "pillow",
+    "prometheus-client>=0.20.0",
+    "psutil",
+    "pybase64",
+    "pydantic",
+    "pynvml",
+    "python-multipart",
+    "pyzmq>=25.1.2",
+    "sentencepiece",
+    "soundfile==0.13.1",
+    "scipy",
+    "timm==1.0.16",
+    "tiktoken",
+    "torchao==0.9.0",
+    "transformers==4.56.1",
+    "uvicorn",
+    "uvloop",
+    "xgrammar==0.1.24",
+]
+
+srt = [
+    "sglang[runtime_common]",
+    "sgl-kernel==0.3.9.post2",
+    "torch==2.8.0",
+    "torchaudio==2.8.0",
+    "torchvision",
+    "cuda-python",
+    "flashinfer_python==0.3.1",
+]
+
+blackwell = [
+    "sglang[runtime_common]",
+    "sgl-kernel==0.3.9.post2",
+    "torch==2.8.0",
+    "torchaudio==2.8.0",
+    "torchvision",
+    "cuda-python",
+    "flashinfer_python==0.3.1",
+    "nvidia-cutlass-dsl==4.1.0",
+]
+
+# HIP (Heterogeneous-computing Interface for Portability) for AMD
+# => base docker rocm/vllm-dev:20250114, not from public vllm whl
+srt_hip = [
+    "sglang[runtime_common]",
+    "torch",
+    "petit_kernel==0.0.2",
+    "wave-lang==3.7.0",
+]
+
+# https://docs.sglang.ai/platforms/cpu_server.html
+srt_cpu = ["sglang[runtime_common]", "intel-openmp"]
+
+# https://docs.sglang.ai/platforms/ascend_npu.html
+srt_npu = ["sglang[runtime_common]"]
+
+# xpu is not enabled in public vllm and torch whl,
+# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
+srt_xpu = ["sglang[runtime_common]"]
+
+# For Intel Gaudi(device : hpu) follow the installation guide
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
+
+openai = ["openai==1.99.1", "tiktoken"]
+anthropic = ["anthropic>=0.20.0"]
+litellm = ["litellm>=1.0.0"]
+torch_memory_saver = ["torch_memory_saver==0.0.8"]
+decord = ["decord"]
+test = [
+    "accelerate",
+    "expecttest",
+    "jsonlines",
+    "matplotlib",
+    "pandas",
+    "peft",
+    "sentence_transformers",
+    "pytest",
+    "tabulate",
+]
+all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[torch_memory_saver]", "sglang[decord]"]
+all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_cpu = ["sglang[srt_cpu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+all_npu = ["sglang[srt_npu]", "sglang[openai]", "sglang[anthropic]", "sglang[decord]"]
+
+dev = ["sglang[all]", "sglang[test]"]
+dev_hip = ["sglang[all_hip]", "sglang[test]"]
+dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
+dev_cpu = ["sglang[all_cpu]", "sglang[test]"]
+
+[project.urls]
+"Homepage" = "https://github.com/sgl-project/sglang"
+"Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
+
+[tool.setuptools.package-data]
+"sglang" = [
+    "srt/layers/moe/fused_moe_triton/configs/*/*.json",
+    "srt/layers/quantization/configs/*.json",
+    "srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp",
+]
+
+[tool.setuptools.packages.find]
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]
+
+[tool.wheel]
+exclude = [
+    "assets*",
+    "benchmark*",
+    "docs*",
+    "dist*",
+    "playground*",
+    "scripts*",
+    "tests*",
+]
+
+[tool.codespell]
+ignore-words-list = "ans, als, hel, boostrap, childs, te, vas, hsa, ment"
+skip = "*.json,*.jsonl,*.patch,*.txt"
diff --git a/python/sglang/README.md b/python/sglang/README.md
new file mode 100644
index 000000000..ae0c479b9
--- /dev/null
+++ b/python/sglang/README.md
@@ -0,0 +1,16 @@
+# Code Structures
+
+- `eval`: The evaluation utilities.
+- `lang`: The frontend language.
+- `srt`: The backend engine for running local models. (SRT = SGLang Runtime).
+- `test`: The test utilities.
+- `api.py`: The public APIs.
+- `bench_offline_throughput.py`: Benchmark the performance in the offline mode.
+- `bench_one_batch.py`: Benchmark the latency of running a single static batch without a server.
+- `bench_one_batch_server.py`: Benchmark the latency of running a single batch with a server.
+- `bench_serving.py`: Benchmark online serving with dynamic requests.
+- `check_env.py`: Check the environment variables and dependencies.
+- `global_config.py`: The global configs and constants.
+- `launch_server.py`: The entry point for launching the local server.
+- `utils.py`: Common utilities.
+- `version.py`: Version info.
diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py
new file mode 100644
index 000000000..509b145a9
--- /dev/null
+++ b/python/sglang/__init__.py
@@ -0,0 +1,83 @@
+# SGLang public APIs
+
+# Frontend Language APIs
+from sglang.global_config import global_config
+from sglang.lang.api import (
+    Engine,
+    Runtime,
+    assistant,
+    assistant_begin,
+    assistant_end,
+    flush_cache,
+    function,
+    gen,
+    gen_int,
+    gen_string,
+    get_server_info,
+    image,
+    select,
+    separate_reasoning,
+    set_default_backend,
+    system,
+    system_begin,
+    system_end,
+    user,
+    user_begin,
+    user_end,
+    video,
+)
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.lang.choices import (
+    greedy_token_selection,
+    token_length_normalized,
+    unconditional_likelihood_normalized,
+)
+
+# Lazy import some libraries
+from sglang.utils import LazyImport
+from sglang.version import __version__
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+# Runtime Engine APIs
+ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
+Engine = LazyImport("sglang.srt.entrypoints.engine", "Engine")
+
+__all__ = [
+    "Engine",
+    "Runtime",
+    "assistant",
+    "assistant_begin",
+    "assistant_end",
+    "flush_cache",
+    "function",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "get_server_info",
+    "image",
+    "select",
+    "separate_reasoning",
+    "set_default_backend",
+    "system",
+    "system_begin",
+    "system_end",
+    "user",
+    "user_begin",
+    "user_end",
+    "video",
+    "RuntimeEndpoint",
+    "greedy_token_selection",
+    "token_length_normalized",
+    "unconditional_likelihood_normalized",
+    "ServerArgs",
+    "Anthropic",
+    "LiteLLM",
+    "OpenAI",
+    "VertexAI",
+    "global_config",
+    "__version__",
+]
diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py
new file mode 100644
index 000000000..457d120d9
--- /dev/null
+++ b/python/sglang/bench_offline_throughput.py
@@ -0,0 +1,452 @@
+"""
+Benchmark the throughput in the offline mode.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
+
+# Usage
+## Sharegpt dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
+
+## Random dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
+"""
+
+import argparse
+import asyncio
+import dataclasses
+import inspect
+import json
+import logging
+import os
+import random
+import time
+from typing import Dict, List, Optional
+
+import numpy as np
+
+from sglang.bench_serving import (
+    DatasetRow,
+    get_dataset,
+    get_tokenizer,
+    sample_random_requests,
+    set_ulimit,
+)
+from sglang.lang.backend.runtime_endpoint import Runtime
+from sglang.srt.entrypoints.engine import Engine
+from sglang.srt.server_args import ServerArgs
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    backend: str = "engine"
+    result_filename: str = ""
+    dataset_name: str = "sharegpt"
+    dataset_path: str = ""
+    num_prompts: int = 1000
+    sharegpt_output_len: Optional[int] = None
+    sharegpt_context_len: Optional[int] = None
+    random_input_len: int = 1024
+    random_output_len: int = 1024
+    random_range_ratio: float = 0.0
+    gsp_num_groups: int = 64
+    gsp_prompts_per_group: int = 16
+    gsp_system_prompt_len: int = 2048
+    gsp_question_len: int = 128
+    gsp_output_len: int = 256
+    seed: int = 1
+    disable_ignore_eos: bool = False
+    extra_request_body: Optional[str] = None
+    apply_chat_template: bool = False
+    profile: bool = False
+    skip_warmup: bool = False
+    do_not_exit: bool = False
+    prompt_suffix: str = ""
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--backend", type=str, default=BenchArgs.backend)
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+        parser.add_argument(
+            "--dataset-name",
+            type=str,
+            default="sharegpt",
+            choices=["sharegpt", "random", "generated-shared-prefix"],
+            help="Name of the dataset to benchmark on.",
+        )
+        parser.add_argument(
+            "--dataset-path", type=str, default="", help="Path to the dataset."
+        )
+        parser.add_argument(
+            "--num-prompts",
+            type=int,
+            default=BenchArgs.num_prompts,
+            help="Number of prompts to process. Default is 1000.",
+        )
+        parser.add_argument(
+            "--sharegpt-output-len",
+            type=int,
+            default=BenchArgs.sharegpt_output_len,
+            help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+        )
+        parser.add_argument(
+            "--sharegpt-context-len",
+            type=int,
+            default=BenchArgs.sharegpt_context_len,
+            help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+        )
+        parser.add_argument(
+            "--random-input-len",
+            type=int,
+            default=BenchArgs.random_input_len,
+            help="Number of input tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-output-len",
+            type=int,
+            default=BenchArgs.random_output_len,
+            help="Number of output tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-range-ratio",
+            type=float,
+            default=BenchArgs.random_range_ratio,
+            help="Range of sampled ratio of input/output length, "
+            "used only for random dataset.",
+        )
+        parser.add_argument(
+            "--gsp-num-groups",
+            type=int,
+            default=BenchArgs.gsp_num_groups,
+            help="Number of groups with shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gsp-prompts-per-group",
+            type=int,
+            default=BenchArgs.gsp_prompts_per_group,
+            help="Number of prompts per group of shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gsp-system-prompt-len",
+            type=int,
+            default=BenchArgs.gsp_system_prompt_len,
+            help="System prompt length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gsp-question-len",
+            type=int,
+            default=BenchArgs.gsp_question_len,
+            help="Question length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gsp-output-len",
+            type=int,
+            default=BenchArgs.gsp_output_len,
+            help="Target length in tokens for outputs in generated-shared-prefix dataset",
+        )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--disable-ignore-eos",
+            action="store_true",
+            help="Disable ignore EOS token",
+        )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            default=BenchArgs.extra_request_body,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
+        parser.add_argument(
+            "--apply-chat-template",
+            action="store_true",
+            help="Apply chat template",
+        )
+        parser.add_argument(
+            "--profile",
+            action="store_true",
+            help="Use Torch Profiler. The endpoint must be launched with "
+            "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+        )
+        parser.add_argument(
+            "--skip-warmup",
+            action="store_true",
+            help="Skip the warmup batches.",
+        )
+        parser.add_argument(
+            "--do-not-exit",
+            action="store_true",
+            help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
+        )
+        parser.add_argument(
+            "--prompt-suffix",
+            type=str,
+            default="",
+            help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def throughput_test_once(
+    backend_name: str,
+    backend,
+    reqs: List[DatasetRow],
+    ignore_eos: bool,
+    extra_request_body: Dict,
+    profile: bool,
+):
+    measurement_results = {
+        "backend": backend_name,
+        "successful_requests": len(reqs),
+        "total_latency": -1,
+        "total_input_tokens": sum(r.prompt_len for r in reqs),
+        "total_output_tokens": -1,
+        "request_throughput": -1,
+        "input_throughput": -1,
+        "output_throughput": -1,
+        "total_throughput": -1,
+    }
+
+    prompt = [r.prompt for r in reqs]
+    sampling_params = [
+        {
+            "temperature": 0,
+            "max_new_tokens": r.output_len,
+            "ignore_eos": ignore_eos,
+            **extra_request_body,
+        }
+        for r in reqs
+    ]
+
+    if profile:
+        assert (
+            "SGLANG_TORCH_PROFILER_DIR" in os.environ
+        ), "Please set SGLANG_TORCH_PROFILER_DIR."
+        os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
+        backend.start_profile()
+
+    st = time.perf_counter()
+    gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st
+
+    if profile:
+        dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+        known_files = set(os.listdir(dir))
+        backend.stop_profile()
+        monitor_trace_file(known_files, dir)
+
+    if backend_name == "runtime":
+        gen_out = json.loads(gen_out)
+
+    server_info = backend.get_server_info()
+
+    measurement_results["total_latency"] = latency
+    measurement_results["total_output_tokens"] = sum(
+        o["meta_info"]["completion_tokens"] for o in gen_out
+    )
+    measurement_results["request_throughput"] = (
+        measurement_results["successful_requests"] / latency
+    )
+    measurement_results["input_throughput"] = (
+        measurement_results["total_input_tokens"] / latency
+    )
+    measurement_results["output_throughput"] = (
+        measurement_results["total_output_tokens"] / latency
+    )
+    measurement_results["total_throughput"] = (
+        measurement_results["total_input_tokens"]
+        + measurement_results["total_output_tokens"]
+    ) / latency
+
+    if inspect.isawaitable(server_info):
+        server_info = asyncio.run(server_info)
+
+    measurement_results["last_gen_throughput"] = server_info["internal_states"][0][
+        "last_gen_throughput"
+    ]
+
+    return measurement_results
+
+
+def monitor_trace_file(known_files, directory, interval=1):
+    print(f"Monitoring {directory} for new trace files...")
+
+    while True:
+        flag = False
+        time.sleep(interval)
+        current_files = set(os.listdir(directory))
+
+        new_files = current_files - known_files
+        for new_file in new_files:
+            new_file_path = os.path.join(directory, new_file)
+            print(f"New file detected: {new_file}")
+
+            previous_size = 0
+            while True:
+                try:
+                    current_size = os.path.getsize(new_file_path)
+                except FileNotFoundError:
+                    print(f"File {new_file} is no longer accessible.")
+                    break
+
+                if current_size > previous_size:
+                    previous_size = current_size
+                else:
+                    flag = True
+                    break
+
+                time.sleep(interval)
+        if flag:
+            break
+
+
+def throughput_test(
+    server_args: ServerArgs,
+    bench_args: BenchArgs,
+):
+    if bench_args.backend == "engine":
+        backend = Engine(**dataclasses.asdict(server_args))
+        if not backend:
+            raise ValueError("Please provide valid engine arguments")
+    elif bench_args.backend == "runtime":
+        backend = Runtime(**dataclasses.asdict(server_args))
+    else:
+        raise ValueError('Please set backend to either "engine" or "runtime"')
+
+    tokenizer_id = server_args.tokenizer_path or server_args.model_path
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    # Set global environments
+    set_ulimit()
+    random.seed(bench_args.seed)
+    np.random.seed(bench_args.seed)
+
+    # Parse args
+    extra_request_body = {}
+    if bench_args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
+    # Read dataset
+    input_requests = get_dataset(bench_args, tokenizer)
+
+    warmup_requests = sample_random_requests(
+        input_len=256,
+        output_len=16,
+        num_prompts=min(bench_args.num_prompts, 16),
+        range_ratio=1.0,
+        tokenizer=tokenizer,
+        dataset_path=bench_args.dataset_path,
+    )
+
+    # Warm up
+    if not bench_args.skip_warmup:
+        logging.info("\nWarmup...")
+        throughput_test_once(
+            backend_name=bench_args.backend,
+            backend=backend,
+            reqs=warmup_requests,
+            ignore_eos=not bench_args.disable_ignore_eos,
+            extra_request_body=extra_request_body,
+            profile=False,
+        )
+        time.sleep(0.5)
+
+    logging.info("\nBenchmark...")
+    result = throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=input_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+        extra_request_body=extra_request_body,
+        profile=bench_args.profile,
+    )
+    backend.shutdown()
+
+    if bench_args.result_filename:
+        with open(bench_args.result_filename, "a") as fout:
+            fout.write(json.dumps(result) + "\n")
+
+    print(
+        "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
+    )
+    print("{:<40} {:<10}".format("Backend:", result["backend"]))
+    print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
+    print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
+    print(
+        "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Last generation throughput (tok/s):", result["last_gen_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", result["request_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", result["input_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", result["output_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", result["total_throughput"]
+        )
+    )
+    print("=" * 50)
+
+    return result
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    # handling ModelScope model downloads
+    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() in ("true", "1"):
+        if os.path.exists(args.model_path):
+            print(f"Using local model path: {args.model_path}")
+        else:
+            try:
+                from modelscope import snapshot_download
+
+                print(f"Using ModelScope to download model: {args.model_path}")
+
+                # download the model and replace args.model_path
+                args.model_path = snapshot_download(
+                    args.model_path,
+                )
+                print(f"Model downloaded to: {args.model_path}")
+            except Exception as e:
+                print(f"ModelScope download failed: {str(e)}")
+                raise e
+
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format="%(message)s",
+    )
+
+    throughput_test(server_args, bench_args)
+
+    while bench_args.do_not_exit:
+        pass
diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py
new file mode 100644
index 000000000..ebd461ec3
--- /dev/null
+++ b/python/sglang/bench_one_batch.py
@@ -0,0 +1,665 @@
+"""
+Benchmark the latency of running a single static batch without a server.
+
+This script does not launch a server and uses the low-level APIs.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
+
+# Usage (latency test)
+## with dummy weights:
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+## sweep through multiple data points and store (append) the results in a jsonl file:
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
+## run with profiling:
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
+# Usage (correctness test):
+python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
+
+## Reference output (of the correctness test above, can be gpu dependent):
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
+The capital of the United States is Washington, D.C.
+
+
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
+The capital of the United Kingdom is London.
+The capital of the
+
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
+I'm going to the park
+"""
+
+import argparse
+import copy
+import dataclasses
+import itertools
+import json
+import logging
+import multiprocessing
+import os
+import time
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed.parallel_state import destroy_distributed_environment
+from sglang.srt.entrypoints.engine import _set_envs_and_config
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.layers.moe import initialize_moe_config
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.scheduler import Scheduler
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    configure_logger,
+    get_bool_env_var,
+    kill_process_tree,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+    set_gpu_proc_affinity,
+    suppress_other_loggers,
+)
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    prompt_filename: str = ""
+    result_filename: str = "result.jsonl"
+    correctness_test: bool = False
+    # This is only used for correctness test
+    cut_len: int = 4
+    log_decode_step: int = 0
+    profile: bool = False
+    profile_record_shapes: bool = False
+    profile_filename_prefix: str = "profile"
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument(
+            "--prompt-filename", type=str, default=BenchArgs.prompt_filename
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+        parser.add_argument("--correctness-test", action="store_true")
+        parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        parser.add_argument(
+            "--log-decode-step",
+            type=int,
+            default=BenchArgs.log_decode_step,
+            help="Log decode latency by step, default is set to zero to disable.",
+        )
+        parser.add_argument(
+            "--profile", action="store_true", help="Use Torch Profiler."
+        )
+        parser.add_argument(
+            "--profile-record-shapes",
+            action="store_true",
+            help="Record tensor shapes in profiling results.",
+        )
+        parser.add_argument(
+            "--profile-filename-prefix",
+            type=str,
+            default=BenchArgs.profile_filename_prefix,
+            help="Prefix of the profiling file names. The full profiling result file(s) be "
+            '"[profile_filename_prefix]_batch[batch_size]_input[input_len]_output[output_len].trace.json.gz"',
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to cast the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def load_model(server_args, port_args, tp_rank):
+    suppress_other_loggers()
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+    moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
+
+    model_config = ModelConfig.from_server_args(server_args)
+    model_runner = ModelRunner(
+        model_config=model_config,
+        mem_fraction_static=server_args.mem_fraction_static,
+        gpu_id=tp_rank,
+        tp_rank=tp_rank,
+        tp_size=server_args.tp_size,
+        moe_ep_rank=moe_ep_rank,
+        moe_ep_size=server_args.ep_size,
+        pp_rank=0,
+        pp_size=1,
+        nccl_port=port_args.nccl_port,
+        server_args=server_args,
+    )
+    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
+    tokenizer = get_tokenizer(
+        server_args.tokenizer_path,
+        tokenizer_mode=server_args.tokenizer_mode,
+        trust_remote_code=server_args.trust_remote_code,
+    )
+    if server_args.tp_size > 1:
+        dist.barrier()
+    return model_runner, tokenizer
+
+
+def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
+    prompts = (
+        custom_prompts
+        if custom_prompts
+        else [
+            "The capital of France is",
+            "The capital of the United Kindom is",
+            "Today is a sunny day and I like",
+        ]
+    )
+    input_ids = [tokenizer.encode(p) for p in prompts]
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_new_tokens=BenchArgs.output_len,
+    )
+
+    reqs = []
+    for i in range(len(prompts)):
+        assert len(input_ids[i]) > bench_args.cut_len
+
+        tmp_input_ids = input_ids[i][: bench_args.cut_len]
+        req = Req(
+            rid=i,
+            origin_input_text=prompts[i],
+            origin_input_ids=tmp_input_ids,
+            sampling_params=sampling_params,
+        )
+        req.prefix_indices = []
+        req.fill_ids = req.origin_input_ids
+        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
+        reqs.append(req)
+
+    return input_ids, reqs
+
+
+def prepare_extend_inputs_for_correctness_test(
+    bench_args, input_ids, reqs, model_runner
+):
+    for i in range(len(reqs)):
+        req = reqs[i]
+        req.fill_ids += input_ids[i][bench_args.cut_len :]
+        req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
+            i, : bench_args.cut_len
+        ]
+        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
+    return reqs
+
+
+def prepare_synthetic_inputs_for_latency_test(
+    batch_size, input_len, custom_inputs=None
+):
+    input_ids = (
+        custom_inputs
+        if custom_inputs
+        else np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
+    )
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_new_tokens=BenchArgs.output_len,
+    )
+
+    reqs = []
+    for i in range(len(input_ids)):
+        req = Req(
+            rid=i,
+            origin_input_text="",
+            origin_input_ids=list(input_ids[i]),
+            sampling_params=sampling_params,
+        )
+        req.prefix_indices = []
+        req.fill_ids = req.origin_input_ids
+        req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
+        reqs.append(req)
+
+    return reqs
+
+
+@torch.no_grad
+def extend(reqs, model_runner):
+    batch = ScheduleBatch.init_new(
+        reqs=reqs,
+        req_to_token_pool=model_runner.req_to_token_pool,
+        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
+        tree_cache=None,
+        model_config=model_runner.model_config,
+        enable_overlap=False,
+        spec_algorithm=SpeculativeAlgorithm.NONE,
+    )
+    batch.prepare_for_extend()
+    _maybe_prepare_mlp_sync_batch(batch, model_runner)
+    model_worker_batch = batch.get_model_worker_batch()
+    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+    logits_output, _ = model_runner.forward(forward_batch)
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
+    return next_token_ids, logits_output.next_token_logits, batch
+
+
+@torch.no_grad
+def decode(input_token_ids, batch, model_runner):
+    batch.output_ids = input_token_ids
+    batch.prepare_for_decode()
+    _maybe_prepare_mlp_sync_batch(batch, model_runner)
+    model_worker_batch = batch.get_model_worker_batch()
+    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+    logits_output, _ = model_runner.forward(forward_batch)
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
+    return next_token_ids, logits_output.next_token_logits
+
+
+def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
+    if require_mlp_sync(model_runner.server_args):
+        Scheduler.prepare_mlp_sync_batch_raw(
+            batch,
+            dp_size=model_runner.server_args.dp_size,
+            attn_tp_size=1,
+            tp_group=model_runner.tp_group,
+            get_idle_batch=None,
+            disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
+            spec_algorithm=SpeculativeAlgorithm.NONE,
+            speculative_num_draft_tokens=None,
+            require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
+            disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
+        )
+
+
+def _read_prompts_from_file(prompt_file, rank_print):
+    """Read custom prompts from the file specified by `--prompt-filename`."""
+    if not prompt_file:
+        return []
+    if not os.path.exists(prompt_file):
+        rank_print(
+            f"Custom prompt file {prompt_file} not found. Using default inputs..."
+        )
+        return []
+    with open(prompt_file, "r") as pf:
+        return pf.readlines()
+
+
+def _save_profile_trace_results(profiler, filename):
+    parent_dir = os.path.dirname(os.path.abspath(filename))
+    os.makedirs(parent_dir, exist_ok=True)
+    profiler.export_chrome_trace(filename)
+    print(
+        profiler.key_averages(group_by_input_shape=True).table(
+            sort_by="self_cpu_time_total"
+        )
+    )
+
+
+def correctness_test(
+    server_args,
+    port_args,
+    bench_args,
+    tp_rank,
+):
+    # Configure the logger
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+
+    # Load the model
+    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
+
+    # Prepare inputs
+    custom_prompts = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
+    input_ids, reqs = prepare_inputs_for_correctness_test(
+        bench_args, tokenizer, custom_prompts
+    )
+    rank_print(f"\n{input_ids=}\n")
+
+    if bench_args.cut_len > 0:
+        # Prefill
+        next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")
+
+    # Prepare extend inputs
+    reqs = prepare_extend_inputs_for_correctness_test(
+        bench_args, input_ids, reqs, model_runner
+    )
+
+    # Extend (prefill w/ KV cache)
+    next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")
+
+    # Decode
+    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
+    for _ in range(bench_args.output_len[0] - 1):
+        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        next_token_ids_list = next_token_ids.tolist()
+        for i in range(len(reqs)):
+            output_ids[i].append(next_token_ids_list[i])
+
+    # Print output texts
+    for i in range(len(reqs)):
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")
+
+
+def synchronize(device):
+    torch.get_device_module(device).synchronize()
+
+
+def latency_test_run_once(
+    run_name,
+    model_runner,
+    rank_print,
+    reqs,
+    batch_size,
+    input_len,
+    output_len,
+    device,
+    log_decode_step,
+    profile,
+    profile_record_shapes,
+    profile_filename_prefix,
+):
+    max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
+    if batch_size > max_batch_size:
+        rank_print(
+            f"skipping ({batch_size}, {input_len}, {output_len}) due to max batch size limit"
+        )
+        return
+
+    # Clear the pools.
+    model_runner.req_to_token_pool.clear()
+    model_runner.token_to_kv_pool_allocator.clear()
+
+    measurement_results = {
+        "run_name": run_name,
+        "batch_size": batch_size,
+        "input_len": input_len,
+        "output_len": output_len,
+    }
+
+    tot_latency = 0
+
+    profiler = None
+    if profile:
+        profiler = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            with_stack=True,
+            record_shapes=profile_record_shapes,
+        )
+        profiler.start()
+
+    # Prefill
+    synchronize(device)
+    tic = time.perf_counter()
+    next_token_ids, _, batch = extend(reqs, model_runner)
+    synchronize(device)
+    prefill_latency = time.perf_counter() - tic
+    tot_latency += prefill_latency
+    throughput = input_len * batch_size / prefill_latency
+    rank_print(
+        f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["prefill_latency"] = prefill_latency
+    measurement_results["prefill_throughput"] = throughput
+
+    if profile:
+        profiler.stop()
+        profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
+        _save_profile_trace_results(profiler, profile_filename)
+        rank_print(
+            f"torch profiler chrome trace for prefill saved to {profile_filename}"
+        )
+
+    # Decode
+    decode_latencies = []
+    for i in range(output_len - 1):
+        synchronize(device)
+        if profile and i == output_len / 2:
+            profiler = None
+            profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                with_stack=True,
+                record_shapes=profile_record_shapes,
+            )
+            profiler.start()
+
+        tic = time.perf_counter()
+        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        synchronize(device)
+        latency = time.perf_counter() - tic
+        tot_latency += latency
+        throughput = batch_size / latency
+        decode_latencies.append(latency)
+        if i < 5 or (log_decode_step > 0 and i % log_decode_step == 0):
+            rank_print(
+                f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+            )
+
+        if profile and i == output_len / 2:
+            profiler.stop()
+            profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
+            _save_profile_trace_results(profiler, profile_filename)
+            rank_print(
+                f"torch profiler chrome trace for decoding 1 token saved to {profile_filename}"
+            )
+
+    # Record decode timing from 2nd output
+    if output_len > 1:
+        med_decode_latency = np.median(decode_latencies)
+        med_decode_throughput = batch_size / med_decode_latency
+        rank_print(
+            f"Decode.  median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
+        )
+        measurement_results["median_decode_latency"] = med_decode_latency
+        measurement_results["median_decode_throughput"] = med_decode_throughput
+
+    throughput = (input_len + output_len) * batch_size / tot_latency
+    rank_print(
+        f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["total_latency"] = tot_latency
+    measurement_results["overall_throughput"] = throughput
+    return measurement_results
+
+
+def latency_test(
+    server_args,
+    port_args,
+    bench_args,
+    tp_rank,
+):
+    initialize_moe_config(server_args)
+
+    # Set CPU affinity
+    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
+        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
+
+    # Configure the logger
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+
+    # Load the model
+    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
+
+    # Prepare inputs for warm up
+    reqs = prepare_synthetic_inputs_for_latency_test(
+        bench_args.batch_size[0], bench_args.input_len[0]
+    )
+
+    # Warm up
+    rank_print("Warmup ...")
+    latency_test_run_once(
+        bench_args.run_name,
+        model_runner,
+        rank_print,
+        reqs,
+        bench_args.batch_size[0],
+        bench_args.input_len[0],
+        min(32, bench_args.output_len[0]),  # shorter decoding to speed up the warmup
+        server_args.device,
+        log_decode_step=0,
+        profile=False,
+        profile_record_shapes=False,
+        profile_filename_prefix="",  # not used
+    )
+
+    rank_print("Benchmark ...")
+
+    custom_inputs = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
+    custom_inputs = [tokenizer.encode(p.strip()) for p in custom_inputs]
+    custom_input_len = len(custom_inputs)
+
+    # Run the sweep
+    result_list = []
+    for bs, il, ol in itertools.product(
+        bench_args.batch_size, bench_args.input_len, bench_args.output_len
+    ):
+        bs_aligned_inputs = []
+        if custom_inputs:
+            if custom_input_len == bs:
+                bs_aligned_inputs = custom_inputs
+            elif custom_input_len > bs:
+                rank_print(
+                    f"Custom input size ({custom_input_len}) is larger than batch_size ({bs}). "
+                    f"Using the first {bs} prompts."
+                )
+                bs_aligned_inputs = copy.deepcopy(custom_inputs[:bs])
+            else:
+                rank_print(
+                    f"Custom input size ({custom_input_len}) is smaller than batch_size ({bs}). "
+                    f"Pad to the desired batch_size with the last prompt."
+                )
+                bs_aligned_inputs = copy.deepcopy(custom_inputs)
+                bs_aligned_inputs.extend(
+                    [bs_aligned_inputs[-1]] * (bs - custom_input_len)
+                )
+
+        reqs = prepare_synthetic_inputs_for_latency_test(bs, il, bs_aligned_inputs)
+        ret = latency_test_run_once(
+            bench_args.run_name,
+            model_runner,
+            rank_print,
+            reqs,
+            bs,
+            il,
+            ol,
+            server_args.device,
+            bench_args.log_decode_step,
+            bench_args.profile if tp_rank == 0 else None,
+            bench_args.profile_record_shapes if tp_rank == 0 else None,
+            bench_args.profile_filename_prefix,
+        )
+        if ret is not None:
+            result_list.append(ret)
+
+    # Write results in jsonlines format on rank 0.
+    if tp_rank == 0 and bench_args.result_filename:
+        with open(bench_args.result_filename, "a") as fout:
+            for result in result_list:
+                fout.write(json.dumps(result) + "\n")
+
+    if server_args.tp_size > 1:
+        destroy_distributed_environment()
+
+
+def main(server_args, bench_args):
+    server_args.cuda_graph_max_bs = max(bench_args.batch_size)
+
+    _set_envs_and_config(server_args)
+
+    if server_args.model_path:
+        if bench_args.correctness_test:
+            work_func = correctness_test
+        else:
+            work_func = latency_test
+    else:
+        raise ValueError(
+            "Provide --model-path for running the tests or "
+            "provide --result-filename for plotting the results"
+        )
+
+    port_args = PortArgs.init_new(server_args)
+
+    if server_args.tp_size == 1:
+        work_func(server_args, port_args, bench_args, 0)
+    else:
+        workers = []
+        for tp_rank in range(server_args.tp_size):
+            proc = multiprocessing.Process(
+                target=work_func,
+                args=(
+                    server_args,
+                    port_args,
+                    bench_args,
+                    tp_rank,
+                ),
+            )
+            proc.start()
+            workers.append(proc)
+
+        for proc in workers:
+            proc.join()
+
+        proc.terminate()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format="%(message)s",
+    )
+
+    try:
+        main(server_args, bench_args)
+    finally:
+        if server_args.tp_size != 1:
+            kill_process_tree(os.getpid(), include_parent=False)
diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py
new file mode 100644
index 000000000..8495c110e
--- /dev/null
+++ b/python/sglang/bench_one_batch_server.py
@@ -0,0 +1,439 @@
+"""
+Benchmark the latency of running a single batch with a server.
+
+This script launches a server and uses the HTTP interface.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
+
+Usage:
+python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
+"""
+
+import argparse
+import dataclasses
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from typing import List, Tuple
+
+import requests
+
+from sglang.bench_serving import get_tokenizer, sample_random_requests
+from sglang.profiler import run_profile
+from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import is_blackwell, kill_process_tree
+from sglang.test.test_utils import is_in_ci, write_github_step_summary
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    temperature: float = 0.0
+    return_logprob: bool = False
+    client_stream_interval: int = 1
+    input_len_step_percentage: float = 0.0
+    result_filename: str = "result.jsonl"
+    base_url: str = ""
+    skip_warmup: bool = False
+    show_report: bool = False
+    profile: bool = False
+    profile_steps: int = 3
+    profile_by_stage: bool = False
+    dataset_path: str = ""
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+        parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument(
+            "--client-stream-interval",
+            type=int,
+            default=BenchArgs.client_stream_interval,
+        )
+        parser.add_argument(
+            "--input-len-step-percentage",
+            type=float,
+            default=BenchArgs.input_len_step_percentage,
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+        parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
+        parser.add_argument("--skip-warmup", action="store_true")
+        parser.add_argument("--show-report", action="store_true")
+        parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
+        parser.add_argument("--profile-by-stage", action="store_true")
+        parser.add_argument(
+            "--dataset-path",
+            type=str,
+            default=BenchArgs.dataset_path,
+            help="Path to the dataset.",
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to cast the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_process_tree(os.getpid(), include_parent=False)
+
+
+def launch_server_process(server_args: ServerArgs):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = 600
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if response.status_code == 200:
+                return proc, base_url
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def run_one_case(
+    url: str,
+    batch_size: int,
+    input_len: int,
+    output_len: int,
+    temperature: float,
+    return_logprob: bool,
+    stream_interval: int,
+    input_len_step_percentage: float,
+    run_name: str,
+    result_filename: str,
+    tokenizer,
+    profile: bool = False,
+    profile_steps: int = 3,
+    profile_by_stage: bool = False,
+    dataset_path: str = "",
+):
+    requests.post(url + "/flush_cache")
+    input_requests = sample_random_requests(
+        input_len=input_len,
+        output_len=output_len,
+        num_prompts=batch_size,
+        range_ratio=1.0,
+        tokenizer=tokenizer,
+        dataset_path=dataset_path,
+        random_sample=True,
+        return_text=False,
+    )
+
+    use_structured_outputs = False
+    if use_structured_outputs:
+        texts = []
+        for _ in range(batch_size):
+            texts.append(
+                "Human: What is the capital city of france? can you give as many trivial information as possible about that city? answer in json.\n"
+                * 50
+                + "Assistant:"
+            )
+        json_schema = "$$ANY$$"
+    else:
+        json_schema = None
+
+    profile_link = None
+    if profile:
+        profile_link: str = run_profile(
+            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
+        )
+
+    tic = time.perf_counter()
+    response = requests.post(
+        url + "/generate",
+        json={
+            "input_ids": [req.prompt for req in input_requests],
+            "sampling_params": {
+                "temperature": temperature,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+                "json_schema": json_schema,
+                "stream_interval": stream_interval,
+            },
+            "return_logprob": return_logprob,
+            "stream": True,
+        },
+        stream=True,
+    )
+
+    # The TTFT of the last request in the batch
+    ttft = 0.0
+    for chunk in response.iter_lines(decode_unicode=False):
+        chunk = chunk.decode("utf-8")
+        if chunk and chunk.startswith("data:"):
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+            if "error" in data:
+                raise RuntimeError(f"Request has failed. {data}.")
+
+            assert (
+                data["meta_info"]["finish_reason"] is None
+                or data["meta_info"]["finish_reason"]["type"] == "length"
+            )
+            if data["meta_info"]["completion_tokens"] == 1:
+                ttft = time.perf_counter() - tic
+
+    latency = time.perf_counter() - tic
+    input_throughput = batch_size * input_len / ttft
+    output_throughput = batch_size * output_len / (latency - ttft)
+    overall_throughput = batch_size * (input_len + output_len) / latency
+
+    server_info = requests.get(url + "/get_server_info").json()
+    acc_length = server_info["internal_states"][0].get("avg_spec_accept_length", None)
+    last_gen_throughput = server_info["internal_states"][0]["last_gen_throughput"]
+
+    print(f"batch size: {batch_size}")
+    print(f"input_len: {input_len}")
+    print(f"output_len: {output_len}")
+    print(f"latency: {latency:.2f} s")
+    print(f"ttft: {ttft:.2f} s")
+    print(f"last generation throughput: {last_gen_throughput:.2f} tok/s")
+    print(f"input throughput: {input_throughput:.2f} tok/s")
+    if output_len != 1:
+        print(f"output throughput: {output_throughput:.2f} tok/s")
+
+    if result_filename:
+        with open(result_filename, "a") as fout:
+            res = {
+                "run_name": run_name,
+                "batch_size": batch_size,
+                "input_len": input_len,
+                "output_len": output_len,
+                "latency": round(latency, 4),
+                "output_throughput": round(output_throughput, 2),
+                "overall_throughput": round(overall_throughput, 2),
+                "last_gen_throughput": round(last_gen_throughput, 2),
+            }
+            fout.write(json.dumps(res) + "\n")
+
+    return (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        overall_throughput,
+        last_gen_throughput,
+        acc_length,
+        profile_link if profile else None,
+    )
+
+
+def get_report_summary(
+    result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+):
+    import tabulate
+
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+
+    headers = [
+        "batch size",
+        "latency (s)",
+        "input throughput (tok/s)",
+        "output throughput (tok/s)",
+        "acc length",
+        "ITL (ms)",
+        "input cost ($/1M)",
+        "output cost ($/1M)",
+    ]
+    if bench_args.profile:
+        headers.append("profile")
+    rows = []
+
+    for (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        _,
+        _,
+        acc_length,
+        trace_link,
+    ) in result:
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+        input_util = 0.7
+        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+        itl = 1 / (output_throughput / batch_size) * 1000
+        input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+        row = [
+            batch_size,
+            latency,
+            input_throughput,
+            output_throughput,
+            accept_length,
+            itl,
+            input_cost,
+            output_cost,
+        ]
+        if trace_link:
+            row.append(f"[Profile]({trace_link})")
+        rows.append(row)
+
+    summary += tabulate.tabulate(
+        rows, headers=headers, tablefmt="github", floatfmt=".2f"
+    )
+    return summary
+
+
+def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+    if bench_args.base_url:
+        proc, base_url = None, bench_args.base_url
+    else:
+        proc, base_url = launch_server_process(server_args)
+
+    server_info = requests.get(base_url + "/get_server_info").json()
+    if "tokenizer_path" in server_info:
+        tokenizer_path = server_info["tokenizer_path"]
+    elif "prefill" in server_info:
+        tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
+    tokenizer = get_tokenizer(tokenizer_path)
+
+    # warmup
+    if not bench_args.skip_warmup:
+        print("=" * 8 + " Warmup Begin " + "=" * 8)
+        run_one_case(
+            base_url,
+            batch_size=16,
+            input_len=1024,
+            output_len=16,
+            temperature=bench_args.temperature,
+            return_logprob=bench_args.return_logprob,
+            stream_interval=bench_args.client_stream_interval,
+            input_len_step_percentage=bench_args.input_len_step_percentage,
+            run_name="",
+            result_filename="",
+            tokenizer=tokenizer,
+            dataset_path=bench_args.dataset_path,
+        )
+        print("=" * 8 + " Warmup End   " + "=" * 8 + "\n")
+
+    # benchmark
+    result = []
+    bench_result = []
+    try:
+        for bs, il, ol in itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        ):
+            result.append(
+                run_one_case(
+                    base_url,
+                    bs,
+                    il,
+                    ol,
+                    temperature=bench_args.temperature,
+                    return_logprob=bench_args.return_logprob,
+                    stream_interval=bench_args.client_stream_interval,
+                    input_len_step_percentage=bench_args.input_len_step_percentage,
+                    run_name=bench_args.run_name,
+                    result_filename=bench_args.result_filename,
+                    tokenizer=tokenizer,
+                )
+            )
+
+        if bench_args.profile:
+            try:
+                for bs, il, ol in itertools.product(
+                    bench_args.batch_size, bench_args.input_len, bench_args.output_len
+                ):
+                    bench_result.append(
+                        (
+                            run_one_case(
+                                base_url,
+                                bs,
+                                il,
+                                ol,
+                                temperature=bench_args.temperature,
+                                return_logprob=bench_args.return_logprob,
+                                stream_interval=bench_args.client_stream_interval,
+                                input_len_step_percentage=bench_args.input_len_step_percentage,
+                                run_name=bench_args.run_name,
+                                result_filename=bench_args.result_filename,
+                                tokenizer=tokenizer,
+                                profile=bench_args.profile,
+                                profile_steps=bench_args.profile_steps,
+                                profile_by_stage=bench_args.profile_by_stage,
+                            )[-1],
+                        )
+                    )
+                result = [t1[:-1] + t2 for t1, t2 in zip(result, bench_result)]
+            except Exception as e:
+                print(f"Error profiling, there will be no profile trace dump: {e}")
+    finally:
+        if proc:
+            kill_process_tree(proc.pid)
+
+    print(f"\nResults are saved to {bench_args.result_filename}")
+
+    if not bench_args.show_report:
+        return
+
+    summary = get_report_summary(result, server_args, bench_args)
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(summary)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    run_benchmark(server_args, bench_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
new file mode 100644
index 000000000..27ff8a6da
--- /dev/null
+++ b/python/sglang/bench_serving.py
@@ -0,0 +1,2403 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
+# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
+"""
+Benchmark online serving with dynamic requests.
+
+Usage:
+python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+"""
+
+import argparse
+import asyncio
+import base64
+import io
+import json
+import os
+import pickle
+import random
+import resource
+import sys
+import time
+import traceback
+import warnings
+from argparse import ArgumentParser
+from dataclasses import dataclass, field
+from datetime import datetime
+from json import JSONDecodeError
+from pathlib import Path
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import aiohttp
+import numpy as np
+import requests
+from tqdm.asyncio import tqdm
+from transformers import (
+    AutoTokenizer,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PreTrainedTokenizerFast,
+)
+
+ASSISTANT_SUFFIX = "Assistant:"
+
+global args
+
+
+# don't want to import sglang package here
+def _get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
+
+
+def _create_bench_client_session():
+    # When the pressure is big, the read buffer could be full before aio thread read
+    # the content. We increase the read_bufsize from 64K to 10M.
+    # Define constants for timeout and buffer size for clarity and maintainability
+    BENCH_AIOHTTP_TIMEOUT_SECONDS = 6 * 60 * 60  # 6 hours
+    BENCH_AIOHTTP_READ_BUFSIZE_BYTES = 10 * 1024**2  # 10 MB
+
+    aiohttp_timeout = aiohttp.ClientTimeout(total=BENCH_AIOHTTP_TIMEOUT_SECONDS)
+    return aiohttp.ClientSession(
+        timeout=aiohttp_timeout, read_bufsize=BENCH_AIOHTTP_READ_BUFSIZE_BYTES
+    )
+
+
+@dataclass
+class RequestFuncInput:
+    prompt: str
+    api_url: str
+    prompt_len: int
+    output_len: int
+    model: str
+    lora_name: str
+    image_data: Optional[List[str]]
+    extra_request_body: Dict[str, Any]
+    timestamp: Optional[float] = None
+
+
+@dataclass
+class RequestFuncOutput:
+    generated_text: str = ""
+    success: bool = False
+    latency: float = 0.0
+    ttft: float = 0.0  # Time to first token
+    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    prompt_len: int = 0
+    error: str = ""
+    output_len: int = 0
+
+    @staticmethod
+    def init_new(request_func_input: RequestFuncInput):
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+        return output
+
+
+def remove_prefix(text: str, prefix: str) -> str:
+    return text[len(prefix) :] if text.startswith(prefix) else text
+
+
+def remove_suffix(text: str, suffix: str) -> str:
+    return text[: -len(suffix)] if text.endswith(suffix) else text
+
+
+def get_auth_headers() -> Dict[str, str]:
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        return {"Authorization": f"Bearer {api_key}"}
+    else:
+        return {}
+
+
+# trt llm does not support ignore_eos
+# https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with _create_bench_client_session() as session:
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.000001,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+            "min_length": request_func_input.output_len,
+            "end_id": 1048576,
+            **request_func_input.extra_request_body,
+        }
+        if args.disable_ignore_eos:
+            del payload["min_length"]
+            del payload["end_id"]
+        output = RequestFuncOutput.init_new(request_func_input)
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = timestamp - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    output.output_len = request_func_input.output_len
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+# set ignore_eos True by default
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."
+
+    prompt = request_func_input.prompt
+
+    async with _create_bench_client_session() as session:
+        payload = {
+            "model": request_func_input.model,
+            "prompt": prompt,
+            "temperature": 0.0,
+            "best_of": 1,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            "ignore_eos": not args.disable_ignore_eos,
+            **request_func_input.extra_request_body,
+        }
+        headers = get_auth_headers()
+
+        output = RequestFuncOutput.init_new(request_func_input)
+
+        generated_text = ""
+        output_len = request_func_input.output_len
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["choices"][0]["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += data["choices"][0]["text"]
+                                output_len = (data.get("usage") or {}).get(
+                                    "completion_tokens", output_len
+                                )
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Makes a request to the OpenAI Chat Completions API.
+
+    Handles both streaming and non-streaming responses, including support
+    for image data in messages. Calculates and returns various performance
+    metrics.
+
+    Args:
+        request_func_input: Input parameters for the request.
+        pbar: Optional tqdm progress bar to update.
+
+    Returns:
+        RequestFuncOutput: Output of the request, including generated text,
+                           latency, TTFT, ITL, and success status.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+
+    if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
+        messages = [
+            {
+                "role": "user",
+                "content": content_items,
+            },
+        ]
+    else:
+        messages = [{"role": "user", "content": request_func_input.prompt}]
+
+    async with _create_bench_client_session() as session:
+        payload = {
+            "model": request_func_input.model,
+            "messages": messages,
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            **request_func_input.extra_request_body,
+        }
+        headers = get_auth_headers()
+
+        output = RequestFuncOutput.init_new(request_func_input)
+
+        generated_text = ""
+        output_len = request_func_input.output_len
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    if args.disable_stream:
+                        # Non-streaming response
+                        response_json = await response.json()
+                        output.generated_text = response_json["choices"][0]["message"][
+                            "content"
+                        ]
+                        output.success = True
+                        output.latency = time.perf_counter() - st
+                        output.ttft = (
+                            output.latency
+                        )  # For non-streaming, TTFT = total latency
+                        output.output_len = response_json.get("usage", {}).get(
+                            "completion_tokens", output_len
+                        )
+                    else:
+                        # Streaming response
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                            latency = time.perf_counter() - st
+                            if chunk == "[DONE]":
+                                pass
+                            else:
+                                data = json.loads(chunk)
+
+                                # Check if this chunk contains content
+                                delta = data.get("choices", [{}])[0].get("delta", {})
+                                content = delta.get("content", "")
+
+                                if content:
+                                    timestamp = time.perf_counter()
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp
+                                        )
+
+                                    most_recent_timestamp = timestamp
+                                    generated_text += content
+
+                                # Check for usage info in final chunk
+                                output_len = (data.get("usage") or {}).get(
+                                    "completion_tokens", output_len
+                                )
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = latency
+                        output.output_len = output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_truss(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+
+    prompt = request_func_input.prompt
+
+    async with _create_bench_client_session() as session:
+        payload = {
+            "model": request_func_input.model,
+            "prompt": prompt,
+            "temperature": 0.0,
+            "best_of": 1,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            "ignore_eos": not args.disable_ignore_eos,
+            **request_func_input.extra_request_body,
+        }
+        headers = get_auth_headers()
+
+        output = RequestFuncOutput.init_new(request_func_input)
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["choices"][0]["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += data["choices"][0]["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = request_func_input.output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_sglang_generate(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    prompt = request_func_input.prompt
+
+    async with _create_bench_client_session() as session:
+        payload = {
+            ("text" if isinstance(prompt, str) else "input_ids"): prompt,
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": request_func_input.output_len,
+                "ignore_eos": not args.disable_ignore_eos,
+            },
+            "stream": not args.disable_stream,
+            "lora_path": request_func_input.lora_name,
+            "return_logprob": args.return_logprob,
+            "logprob_start_len": -1,
+            **request_func_input.extra_request_body,
+        }
+
+        # Add image data if available (list of image urls/base64)
+        if request_func_input.image_data:
+            payload["image_data"] = request_func_input.image_data
+
+        headers = get_auth_headers()
+
+        output = RequestFuncOutput.init_new(request_func_input)
+
+        generated_text = ""
+        output_len = request_func_input.output_len
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        last_output_len = 0
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if "text" in data and data["text"]:
+                                timestamp = time.perf_counter()
+                                generated_text = data["text"]
+                                output_len = data["meta_info"]["completion_tokens"]
+
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    num_new_tokens = output_len - last_output_len
+                                    if num_new_tokens == 0:
+                                        continue
+                                    adjust_itl = (
+                                        timestamp - most_recent_timestamp
+                                    ) / num_new_tokens
+                                    output.itl.extend([adjust_itl] * num_new_tokens)
+
+                                most_recent_timestamp = timestamp
+                                last_output_len = output_len
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+            print(f"{output.error=}")
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_gserver(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    raise NotImplementedError()
+
+
+async def async_request_profile(api_url: str) -> RequestFuncOutput:
+    async with _create_bench_client_session() as session:
+        output = RequestFuncOutput()
+        try:
+            async with session.post(url=api_url) as response:
+                if response.status == 200:
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    return output
+
+
+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
+        import huggingface_hub.constants
+        from modelscope import snapshot_download
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+        )
+
+        return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
+        return get_tokenizer(pretrained_model_name_or_path)
+
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    return AutoTokenizer.from_pretrained(
+        pretrained_model_name_or_path, trust_remote_code=True
+    )
+
+
+def get_dataset(args, tokenizer):
+    tokenize_prompt = getattr(args, "tokenize_prompt", False)
+    if args.dataset_name == "sharegpt":
+        assert not tokenize_prompt
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+            context_len=args.sharegpt_context_len,
+            prompt_suffix=args.prompt_suffix,
+            apply_chat_template=args.apply_chat_template,
+        )
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            dataset_path=args.dataset_path,
+            random_sample=args.dataset_name == "random",
+            return_text=not tokenize_prompt,
+        )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
+    elif args.dataset_name == "generated-shared-prefix":
+        assert not tokenize_prompt
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gsp_num_groups,
+            prompts_per_group=args.gsp_prompts_per_group,
+            system_prompt_len=args.gsp_system_prompt_len,
+            question_len=args.gsp_question_len,
+            output_len=args.gsp_output_len,
+            tokenizer=tokenizer,
+            args=args,
+        )
+    elif args.dataset_name == "mmmu":
+        assert not tokenize_prompt
+        input_requests = sample_mmmu_requests(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.random_output_len,
+            apply_chat_template=args.apply_chat_template,
+            random_sample=True,
+        )
+    elif args.dataset_name == "mooncake":
+        # For mooncake, we don't generate the prompts here.
+        # We just load the raw trace data. The async generator will handle the rest.
+        if not args.dataset_path:
+            local_path = os.path.join("/tmp", args.mooncake_workload + "_trace.jsonl")
+        else:
+            local_path = args.dataset_path
+
+        if not os.path.exists(local_path):
+            download_and_cache_file(
+                MOONCAKE_DATASET_URL[args.mooncake_workload], local_path
+            )
+
+        with open(local_path, "r") as f:
+            all_requests_data = [json.loads(line) for line in f if line.strip()]
+
+        # Limit the number of requests based on --num-prompts
+        input_requests = all_requests_data[: args.num_prompts]
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    return input_requests
+
+
+ASYNC_REQUEST_FUNCS = {
+    "sglang": async_request_sglang_generate,
+    "sglang-native": async_request_sglang_generate,
+    "sglang-oai": async_request_openai_completions,
+    "sglang-oai-chat": async_request_openai_chat_completions,
+    "vllm": async_request_openai_completions,
+    "vllm-chat": async_request_openai_chat_completions,
+    "lmdeploy": async_request_openai_completions,
+    "lmdeploy-chat": async_request_openai_chat_completions,
+    "trt": async_request_trt_llm,
+    "gserver": async_request_gserver,
+    "truss": async_request_truss,
+}
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    total_output_retokenized: int
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    output_throughput_retokenized: float
+    total_throughput: float
+    total_throughput_retokenized: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    p99_ttft_ms: float
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    p95_itl_ms: float
+    p99_itl_ms: float
+    max_itl_ms: float
+    mean_e2e_latency_ms: float
+    median_e2e_latency_ms: float
+    std_e2e_latency_ms: float
+    p99_e2e_latency_ms: float
+    concurrency: float
+
+
+SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+MOONCAKE_DATASET_URL = {
+    "mooncake": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl",
+    "conversation": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonl",
+    "synthetic": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonl",
+    "toolagent": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl",
+}
+
+
+def download_and_cache_file(url: str, filename: Optional[str] = None):
+    """Read and cache a file from a url."""
+    if filename is None:
+        filename = os.path.join("/tmp", url.split("/")[-1])
+
+    # Check if the cache file already exists
+    if is_file_valid_json(filename):
+        return filename
+
+    print(f"Downloading from {url} to {filename}")
+
+    # Stream the response to show the progress bar
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Check for request errors
+
+    # Total size of the file in bytes
+    total_size = int(response.headers.get("content-length", 0))
+    chunk_size = 1024  # Download in chunks of 1KB
+
+    # Use tqdm to display the progress bar
+    with open(filename, "wb") as f, tqdm(
+        desc=filename,
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as bar:
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            f.write(chunk)
+            bar.update(len(chunk))
+
+    return filename
+
+
+def is_file_valid_json(path):
+    if not os.path.isfile(path):
+        return False
+
+    # TODO can fuse into the real file open later
+    try:
+        with open(path) as f:
+            json.load(f)
+        return True
+    except JSONDecodeError as e:
+        print(
+            f"{path} exists but json loading fails ({e=}), thus treat as invalid file"
+        )
+        return False
+
+
+@dataclass
+class DatasetRow:
+    prompt: str
+    prompt_len: int
+    output_len: int
+    image_data: Optional[List[str]] = None
+    timestamp: Optional[float] = None
+
+
+async def get_mooncake_request_over_time(
+    input_requests: List[Dict],
+    tokenizer: PreTrainedTokenizerBase,
+    slowdown_factor: float,
+    num_rounds: int,
+) -> AsyncGenerator[DatasetRow, None]:
+    """
+    An async generator that yields requests based on the timestamps in the Mooncake trace file,
+    with support for multi-round sessions.
+    """
+    if not input_requests:
+        return
+
+    input_requests.sort(key=lambda r: r["timestamp"])
+
+    start_time = time.perf_counter()
+    trace_start_time_ms = input_requests[0]["timestamp"]
+
+    for record in input_requests:
+        # Calculate when this entire session should start
+        relative_arrival_time_s = (record["timestamp"] - trace_start_time_ms) / 1000.0
+        target_arrival_time_s = relative_arrival_time_s * slowdown_factor
+
+        current_elapsed_time_s = time.perf_counter() - start_time
+        sleep_duration_s = target_arrival_time_s - current_elapsed_time_s
+        if sleep_duration_s > 0:
+            await asyncio.sleep(sleep_duration_s)
+
+        # Once the session starts, generate all rounds for it as a burst
+        # This simulates a user engaging in a multi-turn conversation
+
+        # Base user query constructed from hash_ids
+        user_query_base = ""
+        hash_ids = record.get("hash_ids", [])
+        for hash_id in hash_ids:
+            user_query_base += f"{hash_id}" + " ".join(
+                ["hi"] * 128
+            )  # Shorter for multi-round
+        user_query_base += "Tell me a story based on this context."
+
+        output_len_per_round = record.get("output_length", 256)
+        chat_history = []
+
+        for i in range(num_rounds):
+            # Add user query for the current round
+            chat_history.append(
+                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            )
+
+            # Form the full prompt from history
+            try:
+                full_prompt_text = tokenizer.apply_chat_template(
+                    chat_history, tokenize=False, add_generation_prompt=True
+                )
+            except Exception:
+                full_prompt_text = "\n".join(
+                    [f"{msg['role']}: {msg['content']}" for msg in chat_history]
+                )
+
+            prompt_len = len(tokenizer.encode(full_prompt_text))
+
+            yield DatasetRow(
+                prompt=full_prompt_text,
+                prompt_len=prompt_len,
+                output_len=output_len_per_round,
+            )
+
+            # Add a placeholder assistant response for the next round's context
+            # We use a placeholder because we don't know the real response
+            placeholder_response = " ".join(["story"] * output_len_per_round)
+            chat_history.append({"role": "assistant", "content": placeholder_response})
+
+
+def sample_mmmu_requests(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+    apply_chat_template: bool = True,
+    random_sample: bool = True,
+) -> List[DatasetRow]:
+    """
+    Sample requests from the MMMU dataset using HuggingFace datasets.
+
+    Args:
+        num_requests: Number of requests to sample.
+        tokenizer: Tokenizer to use for token counting.
+        fixed_output_len: If provided, use this fixed output length for all requests.
+        apply_chat_template: Whether to apply the chat template to the prompt.
+        random_sample: Whether to randomly sample or take the first N.
+
+    Returns:
+        List of tuples (prompt, prompt_token_len, output_token_len).
+    """
+    try:
+        import io
+
+        import pybase64
+        from datasets import load_dataset
+    except ImportError:
+        raise ImportError("Please install datasets: pip install datasets")
+
+    print("Loading MMMU dataset from HuggingFace...")
+
+    try:
+        print("Attempting to load MMMU Math dataset...")
+        mmmu_dataset = load_dataset("MMMU/MMMU", "Math", split="test")
+        print(
+            f"Successfully loaded MMMU Math dataset from HuggingFace with {len(mmmu_dataset)} examples"
+        )
+    except Exception as e:
+        print(f"Failed to load MMMU Math dataset: {e}")
+        raise ValueError(f"Failed to load MMMU dataset: {e}")
+
+    # Sample from the dataset
+    if len(mmmu_dataset) > num_requests:
+        if random_sample:
+            # Random sample
+            indices = random.sample(range(len(mmmu_dataset)), num_requests)
+            sample_dataset = mmmu_dataset.select(indices)
+        else:
+            # Take first N
+            sample_dataset = mmmu_dataset.select(
+                range(min(num_requests, len(mmmu_dataset)))
+            )
+    else:
+        print(f"Dataset has less than {num_requests} examples, using all examples")
+        sample_dataset = mmmu_dataset
+
+    print(f"Selected {len(sample_dataset)} examples for benchmarking")
+
+    # Create prompts
+    filtered_dataset = []
+
+    for i, example in enumerate(sample_dataset):
+        try:
+            # Extract image_1
+            image = example.get("image_1")
+
+            if image is not None:
+                if hasattr(image, "save"):
+                    # Convert RGBA images to RGB before encoding
+                    if image.mode == "RGBA":
+                        image = image.convert("RGB")
+
+                    # Encode image to base64 (save as PNG to support palette/alpha modes)
+                    buffered = io.BytesIO()
+                    image.save(buffered, format="PNG")
+                    img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
+                    image_data = f"data:image/png;base64,{img_str}"
+                else:
+                    continue
+
+                # Extract the question
+                question = example.get("question")
+
+                # Construct the prompt
+                prompt = f"Question: {question}\n\nAnswer: "
+                if apply_chat_template:
+                    try:
+                        is_phi4_multimodal = (
+                            "phi-4-multimodal" in tokenizer.name_or_path.lower()
+                        )
+                        if is_phi4_multimodal:
+                            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+                            content = prompt.replace("image 1", "<|endoftext10|>")
+                        else:
+                            content = [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": image_data},
+                                },
+                                {"type": "text", "text": prompt},
+                            ]
+                        prompt = tokenizer.apply_chat_template(
+                            [
+                                {
+                                    "role": "user",
+                                    "content": content,
+                                }
+                            ],
+                            add_generation_prompt=True,
+                            tokenize=False,
+                        )
+                    except Exception as e:
+                        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+                        print(
+                            f"Error applying chat template: {e}, fallback to <image> tag"
+                        )
+                        prompt = f"<image>{prompt}"
+
+                # Calculate token lengths for text only (without image data)
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = len(prompt_token_ids)
+
+                output_len = fixed_output_len if fixed_output_len is not None else 256
+
+                filtered_dataset.append(
+                    DatasetRow(
+                        prompt=prompt,
+                        prompt_len=prompt_len,
+                        output_len=output_len,
+                        image_data=[image_data],
+                    )
+                )
+
+        except Exception as e:
+            print(f"Error processing example {i}: {e}")
+
+    print(f"\nCreated {len(filtered_dataset)} MMMU prompts")
+    return filtered_dataset
+
+
+def sample_sharegpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+    context_len: Optional[int] = None,
+    prompt_suffix: Optional[str] = "",
+    apply_chat_template=False,
+) -> List[DatasetRow]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Download sharegpt if necessary
+    if not is_file_valid_json(dataset_path) and dataset_path == "":
+        dataset_path = download_and_cache_file(SHAREGPT_URL)
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+
+    # Filter out the conversations with less than 2 turns.
+    dataset = [
+        data
+        for data in dataset
+        if len(data.get("conversations", data.get("conversation", []))) >= 2
+    ]
+    # Only keep the first two turns of each conversation.
+    dataset = [
+        (
+            data.get("conversations", data.get("conversation", []))[0]["value"],
+            data.get("conversations", data.get("conversation", []))[1]["value"],
+        )
+        for data in dataset
+    ]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[DatasetRow] = []
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        if prompt_suffix:
+            prompt = (
+                remove_suffix(prompt, ASSISTANT_SUFFIX)
+                + prompt_suffix
+                + ASSISTANT_SUFFIX
+            )
+
+        if apply_chat_template:
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            prompt = prompt.replace(tokenizer.bos_token, "")
+
+        prompt_token_ids = tokenizer.encode(prompt)
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer.encode(completion)
+        prompt_len = len(prompt_token_ids)
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
+
+        if prompt_len < 2 or output_len < 2:
+            # Prune too short sequences.
+            continue
+
+        if context_len and prompt_len + output_len > context_len:
+            # Prune too long sequences.
+            continue
+
+        filtered_dataset.append(
+            DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in filtered_dataset])}")
+    return filtered_dataset
+
+
+def sample_random_requests(
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    dataset_path: str,
+    random_sample: bool = True,
+    return_text: bool = True,
+) -> List[DatasetRow]:
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+
+    if random_sample:
+        # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
+
+        # Download sharegpt if necessary
+        if not is_file_valid_json(dataset_path):
+            dataset_path = download_and_cache_file(SHAREGPT_URL)
+
+        # Load the dataset.
+        with open(dataset_path) as f:
+            dataset = json.load(f)
+        # Filter out the conversations with less than 2 turns.
+        dataset = [
+            data
+            for data in dataset
+            if len(data.get("conversations", data.get("conversation", []))) >= 2
+        ]
+        # Only keep the first two turns of each conversation.
+        dataset = [
+            (
+                data.get("conversations", data.get("conversation", []))[0]["value"],
+                data.get("conversations", data.get("conversation", []))[1]["value"],
+            )
+            for data in dataset
+        ]
+        # Shuffle the dataset.
+        random.shuffle(dataset)
+
+        # Filter out sequences that are too long or too short
+        input_requests: List[DatasetRow] = []
+        for data in dataset:
+            i = len(input_requests)
+            if i == num_prompts:
+                break
+
+            # Tokenize the prompts and completions.
+            prompt = data[0]
+            prompt_token_ids = tokenizer.encode(prompt)
+            prompt_len = len(prompt_token_ids)
+
+            # Skip empty prompt
+            if prompt_len == 0:
+                continue
+
+            if prompt_len > input_lens[i]:
+                input_ids = prompt_token_ids[: input_lens[i]]
+            else:
+                ratio = (input_lens[i] + prompt_len - 1) // prompt_len
+                input_ids = (prompt_token_ids * ratio)[: input_lens[i]]
+            input_content = input_ids
+            if return_text:
+                input_content = tokenizer.decode(input_content)
+            input_requests.append(
+                DatasetRow(
+                    prompt=input_content,
+                    prompt_len=int(input_lens[i]),
+                    output_len=int(output_lens[i]),
+                )
+            )
+    else:
+        # Sample token ids from random integers. This can cause some NaN issues.
+        offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+        input_requests = []
+        for i in range(num_prompts):
+            input_content = [
+                (offsets[i] + i + j) % tokenizer.vocab_size
+                for j in range(input_lens[i])
+            ]
+            if return_text:
+                input_content = tokenizer.decode(input_content)
+            input_requests.append(
+                DatasetRow(
+                    prompt=input_content,
+                    prompt_len=int(input_lens[i]),
+                    output_len=int(output_lens[i]),
+                )
+            )
+
+    print(f"#Input tokens: {np.sum(input_lens)}")
+    print(f"#Output tokens: {np.sum(output_lens)}")
+    return input_requests
+
+
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
+def gen_prompt(tokenizer, token_num):
+    """Generate a random prompt of specified token length using tokenizer vocabulary."""
+    all_available_tokens = list(tokenizer.get_vocab().values())
+    selected_tokens = random.choices(all_available_tokens, k=token_num)
+    return tokenizer.decode(selected_tokens)
+
+
+def get_gen_prefix_cache_path(args, tokenizer):
+    """Create cache directory under ~/.cache/sglang/benchmark"""
+    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+    # Create a unique cache filename based on the generation parameters
+    cache_key = (
+        f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+        f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
+        f"{tokenizer.__class__.__name__}.pkl"
+    )
+    return cache_dir / cache_key
+
+
+def sample_generated_shared_prefix_requests(
+    num_groups: int,
+    prompts_per_group: int,
+    system_prompt_len: int,
+    question_len: int,
+    output_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+    args: argparse.Namespace,
+) -> List[DatasetRow]:
+    """Generate benchmark requests with shared system prompts using random tokens and caching."""
+    cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"\nLoading cached generated input data from {cache_path}")
+        with open(cache_path, "rb") as f:
+            return pickle.load(f)
+
+    print("\nGenerating new input data...")
+
+    # Generate system prompts for each group
+    system_prompts = []
+    for _ in range(num_groups):
+        system_prompt = gen_prompt(tokenizer, system_prompt_len)
+        system_prompts.append(system_prompt)
+
+    # Generate questions
+    questions = []
+    for _ in range(num_groups * prompts_per_group):
+        question = gen_prompt(tokenizer, question_len)
+        questions.append(question)
+
+    # Combine system prompts with questions
+    input_requests = []
+    total_input_tokens = 0
+    total_output_tokens = 0
+
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
+        system_prompt = system_prompts[group_idx]
+        for prompt_idx in tqdm(
+            range(prompts_per_group), desc="Generating questions", leave=False
+        ):
+            question = questions[group_idx * prompts_per_group + prompt_idx]
+            full_prompt = f"{system_prompt}\n\n{question}"
+            prompt_len = len(tokenizer.encode(full_prompt))
+
+            input_requests.append(
+                DatasetRow(
+                    prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
+                )
+            )
+            total_input_tokens += prompt_len
+            total_output_tokens += output_len
+
+    # Shuffle questions
+    random.shuffle(input_requests)
+
+    # Print statistics
+    print(f"\nGenerated shared prefix dataset statistics:")
+    print(f"Number of groups: {num_groups}")
+    print(f"Prompts per group: {prompts_per_group}")
+    print(f"Total prompts: {len(input_requests)}")
+    print(f"Total input tokens: {total_input_tokens}")
+    print(f"Total output tokens: {total_output_tokens}")
+    print(
+        f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
+    )
+    print(
+        f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
+    )
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Caching generated input data to {cache_path}")
+    with open(cache_path, "wb") as f:
+        pickle.dump(input_requests, f)
+
+    return input_requests
+
+
+async def get_request(
+    input_requests: List[DatasetRow],
+    request_rate: float,
+    use_trace_timestamps: bool = False,
+    slowdown_factor: float = 1.0,
+) -> AsyncGenerator[DatasetRow, None]:
+    if use_trace_timestamps:
+        print(
+            f"Using trace timestamps for request generation with slowdown factor {slowdown_factor}."
+        )
+        # Sort requests by timestamp for correct replay
+        input_requests.sort(key=lambda r: r.timestamp)
+
+        start_time = time.perf_counter()
+        trace_start_time_ms = input_requests[0].timestamp if input_requests else 0
+
+        for request in input_requests:
+            trace_time_s = (request.timestamp - trace_start_time_ms) / 1000.0
+            target_arrival_time = start_time + (trace_time_s * slowdown_factor)
+
+            sleep_duration = target_arrival_time - time.perf_counter()
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+            yield request
+    else:
+        input_requests_iter = iter(input_requests)
+        for request in input_requests_iter:
+            yield request
+
+            if request_rate == float("inf"):
+                # If the request rate is infinity, then we don't need to wait.
+                continue
+
+            # Sample the request interval from the exponential distribution.
+            interval = np.random.exponential(1.0 / request_rate)
+            # The next request will be sent after the interval.
+            await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    input_requests: List[DatasetRow],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    backend: str,
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    output_lens: List[int] = []
+    retokenized_output_lens: List[int] = []
+    total_input = 0
+    completed = 0
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
+    e2e_latencies: List[float] = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_len = outputs[i].output_len
+            output_lens.append(output_len)
+            retokenized_output_len = len(
+                tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
+            )
+            retokenized_output_lens.append(retokenized_output_len)
+            total_input += outputs[i].prompt_len
+            if output_len > 1:
+                tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
+            ttfts.append(outputs[i].ttft)
+
+            e2e_latencies.append(outputs[i].latency)
+
+            completed += 1
+        else:
+            output_lens.append(0)
+            retokenized_output_lens.append(0)
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2,
+        )
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(output_lens),
+        total_output_retokenized=sum(retokenized_output_lens),
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=sum(output_lens) / dur_s,
+        output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+        total_throughput=(total_input + sum(output_lens)) / dur_s,
+        total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+        / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0)
+        * 1000,  # ttfts is empty if streaming is not supported by backend
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        p95_itl_ms=np.percentile(itls or 0, 95) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        max_itl_ms=np.max(itls or 0) * 1000,
+        mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
+        median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
+        std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
+        p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
+        concurrency=np.sum(e2e_latencies) / dur_s,
+    )
+
+    return metrics, output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: List[DatasetRow],
+    request_rate: float,
+    max_concurrency: Optional[int],
+    disable_tqdm: bool,
+    lora_names: List[str],
+    extra_request_body: Dict[str, Any],
+    profile: bool,
+    pd_separated: bool = False,
+    flush_cache: bool = False,
+    warmup_requests: int = 1,
+    use_trace_timestamps: bool = False,
+    mooncake_slowdown_factor=1.0,
+    mooncake_num_rounds=1,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    # Limit concurrency
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
+    # Warmup
+    print(f"Starting warmup with {warmup_requests} sequences...")
+
+    # Handle the data structure difference for the warmup request
+    if args.dataset_name == "mooncake":
+        # For mooncake, input_requests is a list of dicts.
+        # We need to build a temporary DatasetRow for the warmup phase.
+        warmup_record = input_requests[0]
+
+        # Build prompt from hash_ids, just like in the async generator
+        hash_ids = warmup_record.get("hash_ids", [])
+        prompt_text = ""
+        for hash_id in hash_ids:
+            prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
+        prompt_text += "Can you tell me a detailed story in 1000 words?"
+
+        output_len = warmup_record.get("output_length", 32)
+        prompt_len = len(tokenizer.encode(prompt_text))
+
+        # Create a temporary DatasetRow object for warmup
+        test_request = DatasetRow(
+            prompt=prompt_text,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            image_data=None,  # Mooncake doesn't have image data
+        )
+    else:
+        # For all other datasets, input_requests is a list of DatasetRow objects
+        test_request = input_requests[0]
+
+    if lora_names is not None and len(lora_names) != 0:
+        lora_name = lora_names[0]
+    else:
+        lora_name = None
+
+    # Create the test input once
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_request.prompt,
+        api_url=api_url,
+        prompt_len=test_request.prompt_len,
+        output_len=min(test_request.output_len, 32),
+        lora_name=lora_name,
+        image_data=test_request.image_data,
+        extra_request_body=extra_request_body,
+    )
+
+    # Run warmup requests
+    warmup_tasks = []
+    for _ in range(warmup_requests):
+        warmup_tasks.append(
+            asyncio.create_task(request_func(request_func_input=test_input))
+        )
+
+    warmup_outputs = await asyncio.gather(*warmup_tasks)
+
+    # Check if at least one warmup request succeeded
+    if warmup_requests > 0 and not any(output.success for output in warmup_outputs):
+        raise ValueError(
+            "Warmup failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {warmup_outputs[0].error}"
+        )
+    else:
+        print(
+            f"Warmup completed with {args.warmup_requests} sequences. Starting main benchmark run..."
+        )
+
+    # Flush cache
+    if ("sglang" in backend and _get_bool_env_var("SGLANG_IS_IN_CI")) or flush_cache:
+        requests.post(base_url + "/flush_cache", headers=get_auth_headers())
+
+    time.sleep(1.0)
+
+    # Start profiler
+    if profile:
+        print("Starting profiler...")
+        profile_output = await async_request_profile(
+            api_url=base_url + "/start_profile"
+        )
+        if profile_output.success:
+            print("Profiler started")
+
+    # Run all requests
+    benchmark_start_time = time.perf_counter()
+    tasks: List[asyncio.Task] = []
+    pbar_total = len(input_requests)
+    if (
+        backend == "sglang" and args.dataset_name == "mooncake"
+    ):  # Assuming mooncake is mainly for sglang or similar backends
+        print("Using time-based Mooncake request scheduler, ignoring --request-rate.")
+        request_generator = get_mooncake_request_over_time(
+            input_requests, tokenizer, mooncake_slowdown_factor, mooncake_num_rounds
+        )
+        print(
+            f"Starting Mooncake trace replay. Sessions: {len(input_requests)}, Rounds per session: {mooncake_num_rounds}. Slowdown factor: {mooncake_slowdown_factor}"
+        )
+        pbar_total *= args.mooncake_num_rounds
+    else:
+        request_generator = get_request(input_requests, request_rate)
+
+    pbar = None if disable_tqdm else tqdm(total=pbar_total)
+    async for request in request_generator:
+        if lora_names is not None and len(lora_names) != 0:
+            idx = random.randint(0, len(lora_names) - 1)
+            lora_name = lora_names[idx]
+        else:
+            lora_name = None
+
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=request.prompt,
+            api_url=api_url,
+            prompt_len=request.prompt_len,
+            output_len=request.output_len,
+            lora_name=lora_name,
+            image_data=request.image_data,
+            extra_request_body=extra_request_body,
+            timestamp=request.timestamp,
+        )
+
+        tasks.append(
+            asyncio.create_task(
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
+            )
+        )
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    # Stop profiler
+    if profile:
+        print("Stopping profiler...")
+        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
+        if profile_output.success:
+            print("Profiler stopped")
+
+    if pbar is not None:
+        pbar.close()
+
+    if "sglang" in backend:
+        server_info = requests.get(base_url + "/get_server_info")
+        if server_info.status_code == 200:
+            server_info_json = server_info.json()
+            if "decode" in server_info_json:
+                server_info_json = server_info_json["decode"][0]
+            accept_length = server_info_json["internal_states"][0].get(
+                "avg_spec_accept_length", None
+            )
+        else:
+            accept_length = None
+    else:
+        accept_length = None
+
+    # Compute metrics and print results
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+    metrics, output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        backend=backend,
+    )
+
+    print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
+    print(
+        "{:<40} {:<10}".format(
+            "Traffic request rate:", "trace" if use_trace_timestamps else request_rate
+        )
+    )
+    print(
+        "{:<40} {:<10}".format(
+            "Max request concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10}".format(
+            "Total generated tokens (retokenized):", metrics.total_output_retokenized
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", metrics.input_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", metrics.total_throughput
+        )
+    )
+    print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
+    if accept_length:
+        print("{:<40} {:<10.2f}".format("Accept length:", accept_length))
+    print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
+    print(
+        "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
+        )
+    )
+    print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+    print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print("{s:{c}^{n}}".format(s="Inter-Token Latency", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms))
+    print("=" * 50)
+
+    if (
+        metrics.median_ttft_ms is not None
+        and metrics.mean_itl_ms is not None
+        and metrics.output_throughput is not None
+    ):
+        result = {
+            # Arguments
+            "backend": args.backend,
+            "dataset_name": args.dataset_name,
+            "request_rate": "trace" if use_trace_timestamps else request_rate,
+            "max_concurrency": max_concurrency,
+            "sharegpt_output_len": args.sharegpt_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            # Results
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "request_throughput": metrics.request_throughput,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
+            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+            "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
+            "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms,
+            "mean_ttft_ms": metrics.mean_ttft_ms,
+            "median_ttft_ms": metrics.median_ttft_ms,
+            "std_ttft_ms": metrics.std_ttft_ms,
+            "p99_ttft_ms": metrics.p99_ttft_ms,
+            "mean_tpot_ms": metrics.mean_tpot_ms,
+            "median_tpot_ms": metrics.median_tpot_ms,
+            "std_tpot_ms": metrics.std_tpot_ms,
+            "p99_tpot_ms": metrics.p99_tpot_ms,
+            "mean_itl_ms": metrics.mean_itl_ms,
+            "median_itl_ms": metrics.median_itl_ms,
+            "std_itl_ms": metrics.std_itl_ms,
+            "p95_itl_ms": metrics.p95_itl_ms,
+            "p99_itl_ms": metrics.p99_itl_ms,
+            "concurrency": metrics.concurrency,
+            "accept_length": accept_length,
+        }
+    else:
+        print(f"Error running benchmark for request rate: {request_rate}")
+        print("-" * 30)
+
+    # Determine output file name
+    if args.output_file:
+        output_file_name = args.output_file
+    else:
+        now = datetime.now().strftime("%m%d")
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
+            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
+        else:
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+            )
+
+    result_details = {
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+
+    # Append results to a JSONL file
+    with open(output_file_name, "a") as file:
+        if args.output_details:
+            result_for_dump = result | result_details
+        else:
+            result_for_dump = result
+        file.write(json.dumps(result_for_dump) + "\n")
+
+    return result | result_details
+
+
+def check_chat_template(model_path):
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        return "chat_template" in tokenizer.init_kwargs
+    except Exception as e:
+        print(f"Fail to load tokenizer config with error={e}")
+        return False
+
+
+def set_global_args(args_: argparse.Namespace):
+    """Set the global args."""
+    global args
+    args = args_
+
+
+def run_benchmark(args_: argparse.Namespace):
+    global args
+    args = args_
+
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
+    # Set default value for warmup_requests if not present
+    if not hasattr(args, "warmup_requests"):
+        args.warmup_requests = 1
+
+    if not hasattr(args, "output_details"):
+        args.output_details = False
+
+    if not hasattr(args, "tokenize_prompt"):
+        args.tokenize_prompt = False
+
+    if not hasattr(args, "use_trace_timestamps"):
+        args.use_trace_timestamps = False
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_num_rounds"):
+        args.mooncake_num_rounds = 1
+
+    print(f"benchmark_args={args}")
+
+    # Set global environments
+    set_ulimit()
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    extra_request_body = {}
+    if args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
+    if args.tokenize_prompt:
+        assert (
+            args.backend == "sglang"
+        ), "`--tokenize-prompt` only compatible with `--backend sglang` currently"
+
+    # Set url
+    if args.port is None:
+        args.port = {
+            "sglang": 30000,
+            "sglang-native": 30000,
+            "sglang-oai": 30000,
+            "lmdeploy": 23333,
+            "vllm": 8000,
+            "trt": 8000,
+            "gserver": 9988,
+            "truss": 8080,
+        }.get(args.backend, 30000)
+
+    model_url = (
+        f"{args.base_url}/v1/models"
+        if args.base_url
+        else f"http://{args.host}:{args.port}/v1/models"
+    )
+
+    if args.backend in ["sglang", "sglang-native"]:
+        api_url = (
+            f"{args.base_url}/generate"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/generate"
+        )
+    elif args.backend in ["sglang-oai", "vllm", "lmdeploy"]:
+        api_url = (
+            f"{args.base_url}/v1/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/completions"
+        )
+    elif args.backend in ["sglang-oai-chat", "vllm-chat", "lmdeploy-chat"]:
+        api_url = (
+            f"{args.base_url}/v1/chat/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/chat/completions"
+        )
+    elif args.backend == "trt":
+        api_url = (
+            f"{args.base_url}/v2/models/ensemble/generate_stream"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v2/models/ensemble/generate_stream"
+        )
+        if args.model is None:
+            print("Please provide a model using `--model` when using `trt` backend.")
+            sys.exit(1)
+    elif args.backend == "gserver":
+        api_url = args.base_url if args.base_url else f"{args.host}:{args.port}"
+        args.model = args.model or "default"
+    elif args.backend == "truss":
+        api_url = (
+            f"{args.base_url}/v1/models/model:predict"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/models/model:predict"
+        )
+    base_url = (
+        f"http://{args.host}:{args.port}" if args.base_url is None else args.base_url
+    )
+
+    # Get model name
+    if args.model is None:
+        if args.backend == "truss":
+            print(
+                "Please provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instruct"
+            )
+            sys.exit(1)
+        try:
+            response = requests.get(model_url, headers=get_auth_headers())
+            model_list = response.json().get("data", [])
+            args.model = model_list[0]["id"] if model_list else None
+        except Exception as e:
+            print(f"Failed to fetch model from {model_url}. Error: {e}")
+            print(
+                "Please specify the correct host and port using `--host` and `--port`."
+            )
+            sys.exit(1)
+
+    if args.model is None:
+        print("No model specified or found. Please provide a model using `--model`.")
+        sys.exit(1)
+
+    if not check_chat_template(args.model):
+        print(
+            "\nWARNING It is recommended to use the `Chat` or `Instruct` model for benchmarking.\n"
+            "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
+        )
+
+    print(f"{args}\n")
+
+    # Read dataset
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+    tokenizer = get_tokenizer(tokenizer_id)
+    input_requests = get_dataset(args, tokenizer)
+
+    # compatible with SimpleNamespace
+    if not hasattr(args, "flush_cache"):
+        args.flush_cache = False
+
+    return asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
+            disable_tqdm=args.disable_tqdm,
+            lora_names=args.lora_name,
+            extra_request_body=extra_request_body,
+            profile=args.profile,
+            pd_separated=args.pd_separated,
+            flush_cache=args.flush_cache,
+            warmup_requests=args.warmup_requests,
+            use_trace_timestamps=args.use_trace_timestamps,
+            mooncake_slowdown_factor=args.mooncake_slowdown_factor,
+            mooncake_num_rounds=args.mooncake_num_rounds,
+        )
+    )
+
+
+def set_ulimit(target_soft_limit=65535):
+    resource_type = resource.RLIMIT_NOFILE
+    current_soft, current_hard = resource.getrlimit(resource_type)
+
+    if current_soft < target_soft_limit:
+        try:
+            resource.setrlimit(resource_type, (target_soft_limit, current_hard))
+        except ValueError as e:
+            print(f"Fail to set RLIMIT_NOFILE: {e}")
+
+
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, [])
+        for lora_name in values:
+            getattr(namespace, self.dest).append(lora_name)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        default="sglang",
+        help="Must specify a backend, depending on the LLM Inference Engine.",
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--host", type=str, default="0.0.0.0", help="Default host is 0.0.0.0."
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+            "mooncake",
+        ],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default="", help="Path to the dataset."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer. If not set, using the model conf.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process. Default is 1000.",
+    )
+    parser.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+    )
+    parser.add_argument(
+        "--sharegpt-context-len",
+        type=int,
+        default=None,
+        help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+    )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        default=1024,
+        type=int,
+        help="Number of output tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=0.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random dataset.",
+    )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
+    )
+    parser.add_argument(
+        "--use-trace-timestamps",
+        action="store_true",
+        help="Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.",
+    )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
+    parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--output-details", action="store_true", help="Output details of benchmarking."
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--disable-stream",
+        action="store_true",
+        help="Disable streaming mode.",
+    )
+    parser.add_argument(
+        "--return-logprob",
+        action="store_true",
+        help="Return logprob.",
+    )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+    parser.add_argument(
+        "--disable-ignore-eos",
+        action="store_true",
+        help="Disable ignoring EOS.",
+    )
+    parser.add_argument(
+        "--extra-request-body",
+        metavar='{"key1": "value1", "key2": "value2"}',
+        type=str,
+        help="Append given JSON object to the request payload. You can use this to specify"
+        "additional generate params like sampling params.",
+    )
+    parser.add_argument(
+        "--apply-chat-template",
+        action="store_true",
+        help="Apply chat template",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        nargs="*",
+        default=None,
+        action=LoRAPathAction,
+        help="The names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...",
+    )
+    parser.add_argument(
+        "--prompt-suffix",
+        type=str,
+        default="",
+        help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+    )
+    parser.add_argument(
+        "--pd-separated",
+        action="store_true",
+        help="Benchmark PD disaggregation server",
+    )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        help="Flush the cache before running the benchmark",
+    )
+    parser.add_argument(
+        "--warmup-requests",
+        type=int,
+        default=1,
+        help="Number of warmup requests to run before the benchmark",
+    )
+    parser.add_argument(
+        "--tokenize-prompt",
+        action="store_true",
+        help="Use integer ids instead of string for inputs. Useful to control prompt lengths accurately",
+    )
+
+    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
+    group.add_argument(
+        "--gsp-num-groups",
+        type=int,
+        default=64,
+        help="Number of system prompt groups for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-prompts-per-group",
+        type=int,
+        default=16,
+        help="Number of prompts per system prompt group for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-system-prompt-len",
+        type=int,
+        default=2048,
+        help="Target length in tokens for system prompts in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-question-len",
+        type=int,
+        default=128,
+        help="Target length in tokens for questions in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-output-len",
+        type=int,
+        default=256,
+        help="Target length in tokens for outputs in generated-shared-prefix dataset",
+    )
+    mooncake_group = parser.add_argument_group("mooncake dataset arguments")
+    mooncake_group.add_argument(
+        "--mooncake-slowdown-factor",
+        type=float,
+        default=1.0,
+        help="Slowdown factor for replaying the mooncake trace. "
+        "A value of 2.0 means the replay is twice as slow. "
+        "NOTE: --request-rate is IGNORED in mooncake mode.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-num-rounds",
+        type=int,
+        default=1,
+        help="Number of conversation rounds for each session in the mooncake dataset. "
+        "A value > 1 will enable true multi-turn session benchmarking.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-workload",
+        type=str,
+        default="conversation",
+        choices=[
+            "mooncake",
+            "conversation",
+            "synthetic",
+            "toolagent",
+        ],
+        help="Underlying workload for the mooncake dataset.",
+    )
+    args = parser.parse_args()
+    run_benchmark(args)
diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py
new file mode 100644
index 000000000..1870e3207
--- /dev/null
+++ b/python/sglang/check_env.py
@@ -0,0 +1,305 @@
+"""Check environment configurations and dependency versions."""
+
+import importlib.metadata
+import os
+import resource
+import subprocess
+import sys
+from collections import OrderedDict, defaultdict
+
+import torch
+
+from sglang.srt.utils import is_hip
+
+
+def is_cuda_v2():
+    return torch.version.cuda is not None
+
+
+# List of packages to check versions
+PACKAGE_LIST = [
+    "sglang",
+    "sgl_kernel",
+    "flashinfer_python",
+    "triton",
+    "transformers",
+    "torchao",
+    "numpy",
+    "aiohttp",
+    "fastapi",
+    "hf_transfer",
+    "huggingface_hub",
+    "interegular",
+    "modelscope",
+    "orjson",
+    "outlines",
+    "packaging",
+    "psutil",
+    "pydantic",
+    "python-multipart",
+    "pyzmq",
+    "torchao",
+    "uvicorn",
+    "uvloop",
+    "vllm",
+    "xgrammar",
+    "openai",
+    "tiktoken",
+    "anthropic",
+    "litellm",
+    "decord",
+]
+
+
+def get_package_versions(packages):
+    """
+    Get versions of specified packages.
+    """
+    versions = {}
+    for package in packages:
+        package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
+        try:
+            version = importlib.metadata.version(package_name)
+            versions[package_name] = version
+        except ModuleNotFoundError:
+            versions[package_name] = "Module Not Found"
+    return versions
+
+
+def get_cuda_info():
+    """
+    Get CUDA-related information if available.
+    """
+    if is_cuda_v2():
+        cuda_info = {"CUDA available": torch.cuda.is_available()}
+
+        if cuda_info["CUDA available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())
+
+        return cuda_info
+    elif is_hip():
+        cuda_info = {"ROCM available": torch.cuda.is_available()}
+
+        if cuda_info["ROCM available"]:
+            cuda_info.update(_get_gpu_info())
+            cuda_info.update(_get_cuda_version_info())
+
+        return cuda_info
+
+
+def _get_gpu_info():
+    """
+    Get information about available GPUs.
+    """
+    devices = defaultdict(list)
+    capabilities = defaultdict(list)
+    for k in range(torch.cuda.device_count()):
+        devices[torch.cuda.get_device_name(k)].append(str(k))
+        capability = torch.cuda.get_device_capability(k)
+        capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
+
+    gpu_info = {}
+    for name, device_ids in devices.items():
+        gpu_info[f"GPU {','.join(device_ids)}"] = name
+
+    if len(capabilities) == 1:
+        # All GPUs have the same compute capability
+        cap, gpu_ids = list(capabilities.items())[0]
+        gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+    else:
+        # GPUs have different compute capabilities
+        for cap, gpu_ids in capabilities.items():
+            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+
+    return gpu_info
+
+
+def _get_cuda_version_info():
+    """
+    Get CUDA version information.
+    """
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME
+
+        cuda_info = {"CUDA_HOME": CUDA_HOME}
+
+        if CUDA_HOME and os.path.isdir(CUDA_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())
+
+        return cuda_info
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
+
+        cuda_info = {"ROCM_HOME": ROCM_HOME}
+
+        if ROCM_HOME and os.path.isdir(ROCM_HOME):
+            cuda_info.update(_get_nvcc_info())
+            cuda_info.update(_get_cuda_driver_version())
+
+        return cuda_info
+    else:
+        cuda_info = {"CUDA_HOME": ""}
+        return cuda_info
+
+
+def _get_nvcc_info():
+    """
+    Get NVCC version information.
+    """
+    if is_cuda_v2():
+        from torch.utils.cpp_extension import CUDA_HOME
+
+        try:
+            nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
+            nvcc_output = (
+                subprocess.check_output(f'"{nvcc}" -V', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "NVCC": nvcc_output[
+                    nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
+                        "Build"
+                    )
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"NVCC": "Not Available"}
+    elif is_hip():
+        from torch.utils.cpp_extension import ROCM_HOME
+
+        try:
+            hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
+            hipcc_output = (
+                subprocess.check_output(f'"{hipcc}" --version', shell=True)
+                .decode("utf-8")
+                .strip()
+            )
+            return {
+                "HIPCC": hipcc_output[
+                    hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
+                ].strip()
+            }
+        except subprocess.SubprocessError:
+            return {"HIPCC": "Not Available"}
+    else:
+        return {"NVCC": "Not Available"}
+
+
+def _get_cuda_driver_version():
+    """
+    Get CUDA driver version.
+    """
+    versions = set()
+    if is_cuda_v2():
+        try:
+            output = subprocess.check_output(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=driver_version",
+                    "--format=csv,noheader,nounits",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            if len(versions) == 1:
+                return {"CUDA Driver Version": versions.pop()}
+            else:
+                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
+        except subprocess.SubprocessError:
+            return {"CUDA Driver Version": "Not Available"}
+    elif is_hip():
+        try:
+            output = subprocess.check_output(
+                [
+                    "rocm-smi",
+                    "--showdriverversion",
+                    "--csv",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            versions.discard("name, value")
+            ver = versions.pop()
+            ver = ver.replace('"Driver version", ', "").replace('"', "")
+
+            return {"ROCM Driver Version": ver}
+        except subprocess.SubprocessError:
+            return {"ROCM Driver Version": "Not Available"}
+    else:
+        return {"CUDA Driver Version": "Not Available"}
+
+
+def get_gpu_topology():
+    """
+    Get GPU topology information.
+    """
+    if is_cuda_v2():
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "topo", "-m"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    elif is_hip():
+        try:
+            result = subprocess.run(
+                ["rocm-smi", "--showtopotype"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return "\n" + result.stdout if result.returncode == 0 else None
+        except subprocess.SubprocessError:
+            return None
+    else:
+        return None
+
+
+def get_hypervisor_vendor():
+    try:
+        output = subprocess.check_output(["lscpu"], text=True)
+        for line in output.split("\n"):
+            if "Hypervisor vendor:" in line:
+                return line.split(":")[1].strip()
+        return None
+    except:
+        return None
+
+
+def check_env():
+    """
+    Check and print environment information.
+    """
+    env_info = OrderedDict()
+    env_info["Python"] = sys.version.replace("\n", "")
+    env_info.update(get_cuda_info())
+    env_info["PyTorch"] = torch.__version__
+    env_info.update(get_package_versions(PACKAGE_LIST))
+
+    gpu_topo = get_gpu_topology()
+    if gpu_topo:
+        if is_cuda_v2():
+            env_info["NVIDIA Topology"] = gpu_topo
+        elif is_hip():
+            env_info["AMD Topology"] = gpu_topo
+
+    hypervisor_vendor = get_hypervisor_vendor()
+    if hypervisor_vendor:
+        env_info["Hypervisor vendor"] = hypervisor_vendor
+
+    ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
+    env_info["ulimit soft"] = ulimit_soft
+
+    for k, v in env_info.items():
+        print(f"{k}: {v}")
+
+
+if __name__ == "__main__":
+    check_env()
diff --git a/python/sglang/compile_deep_gemm.py b/python/sglang/compile_deep_gemm.py
new file mode 100644
index 000000000..e59036f7b
--- /dev/null
+++ b/python/sglang/compile_deep_gemm.py
@@ -0,0 +1,184 @@
+"""
+Compile DeepGEMM Kernels for a model with specify server arguments
+
+This script launches a server for capturing DeepGEMM calls and then compiles the kernels.
+It accepts server arguments (the same as launch_server.py).
+
+Usage:
+python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
+
+"""
+
+import argparse
+import dataclasses
+import multiprocessing
+import os
+import time
+
+import requests
+
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
+from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_process_tree
+from sglang.srt.warmup import warmup
+
+multiprocessing.set_start_method("spawn", force=True)
+
+# Reduce warning
+os.environ["SGL_IN_DEEPGEMM_PRECOMPILE_STAGE"] = "1"
+# Force enable deep gemm
+os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"
+# Force enable mha chunked kv for DeepSeek V3 to avoid missing kv_b_proj DeepGEMM case
+os.environ["SGL_CHUNKED_PREFIX_CACHE_THRESHOLD"] = "0"
+
+
+@dataclasses.dataclass
+class CompileArgs:
+    timeout: int = 3600
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--timeout", type=int, default=CompileArgs.timeout)
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to cast the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+@warmup("compile-deep-gemm")
+async def warm_up_compile(
+    disaggregation_mode: str, tokenizer_manager: TokenizerManager
+):
+    print("\nGenerate warm up request for compiling DeepGEMM...\n")
+    generate_req_input = GenerateReqInput(
+        input_ids=[0, 1, 2, 3],
+        sampling_params={
+            "temperature": 0.0,
+            "max_new_tokens": 8,
+            "ignore_eos": True,
+        },
+    )
+    if disaggregation_mode != "null":
+        generate_req_input.bootstrap_room = 0
+        generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
+    await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_process_tree(os.getpid(), include_parent=False)
+
+
+def launch_server_process_and_send_one_request(
+    server_args: ServerArgs, compile_args: CompileArgs
+):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = compile_args.timeout
+
+    start_time = time.perf_counter()
+    while time.perf_counter() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            if server_args.node_rank == 0:
+                response = requests.get(f"{base_url}/v1/models", headers=headers)
+            else:
+                # This http api is created by launch_dummy_health_check_server for none-rank0 node.
+                response = requests.get(f"{base_url}/health", headers=headers)
+            if response.status_code == 200:
+                # Rank-0 node send a request to sync with other node and then return.
+                if server_args.node_rank == 0:
+                    response = requests.post(
+                        f"{base_url}/generate",
+                        json={
+                            "input_ids": [0, 1, 2, 3],
+                            "sampling_params": {
+                                "max_new_tokens": 8,
+                                "temperature": 0,
+                            },
+                        },
+                        timeout=600,
+                    )
+                    if response.status_code != 200:
+                        error = response.json()
+                        raise RuntimeError(f"Sync request failed: {error}")
+                # Other nodes should wait for the exit signal from Rank-0 node.
+                else:
+                    start_time_waiting = time.perf_counter()
+                    while proc.is_alive():
+                        if time.perf_counter() - start_time_waiting < timeout:
+                            time.sleep(10)
+                        else:
+                            raise TimeoutError("Waiting for main node timeout!")
+                return proc
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError(
+        "DeepGEMM Kernels compilation timeout."
+        "\n\nFeel free and please restart the command."
+    )
+
+
+def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
+    # Disable cuda graph and torch compile to save time
+    server_args.disable_cuda_graph = True
+    server_args.enable_torch_compile = False
+    print(f"Disable CUDA Graph and Torch Compile to save time...")
+
+    # Set watchdog timeout to compile_args.timeout because compilation will take a long time
+    server_args.watchdog_timeout = compile_args.timeout
+    server_args.warmups = "compile-deep-gemm"
+
+
+def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
+    print(
+        "Begin DeepGEMM Kernels compilation...\n"
+        "It may take a long time and timeout maybe raised "
+        "while the compilation is still in progress.\n"
+        "Just feel free to restart the command "
+        "until the compilation is fully finished.\n"
+    )
+
+    proc = launch_server_process_and_send_one_request(server_args, compile_args)
+
+    print("\nDeepGEMM Kernels compilation finished successfully.")
+
+    # Sleep for safety
+    time.sleep(10)
+    if proc.is_alive():
+        # This is the rank0 node.
+        kill_process_tree(proc.pid)
+    else:
+        try:
+            kill_process_tree(proc.pid)
+        except Exception:
+            pass
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    CompileArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    compile_args = CompileArgs.from_cli_args(args)
+
+    refine_server_args(server_args, compile_args)
+
+    run_compile(server_args, compile_args)
diff --git a/python/sglang/eval/llama3_eval.py b/python/sglang/eval/llama3_eval.py
new file mode 100644
index 000000000..253cdf275
--- /dev/null
+++ b/python/sglang/eval/llama3_eval.py
@@ -0,0 +1,315 @@
+# Adapt from https://github.com/fw-ai/llm_eval_meta
+
+import argparse
+import asyncio
+import os
+import pickle
+import re
+import shutil
+from collections import defaultdict
+from dataclasses import dataclass
+
+import httpx
+import numpy as np
+import openai
+from datasets import load_dataset
+from openai import AsyncOpenAI
+from tqdm import tqdm
+
+# Mapping providers to their clients and models
+provider_to_models = {
+    "b10": {
+        "8b": "meta-llama/Llama-3.1-8B-Instruct",
+        "70b": "meta-llama/Llama-3.1-70B-Instruct",
+        "405b": "meta-llama/Llama-3.1-405B-Instruct",
+    },
+    "oai": {
+        "8b": "meta-llama/Llama-3.1-8B-Instruct",
+        "70b": "meta-llama/Llama-3.1-70B-Instruct",
+        "405b": "meta-llama/Llama-3.1-405B-Instruct",
+    },
+    "sgl": {
+        "8b": "meta-llama/Llama-3.1-8B-Instruct",
+        "70b": "meta-llama/Llama-3.1-70B-Instruct",
+        "405b": "meta-llama/Llama-3.1-405B-Instruct",
+    },
+}
+
+
+async def fetch_responses(
+    client, prompt, semaphore, index, provider, model_size, output_dir, max_tokens
+):
+    output_file = os.path.join(output_dir, f"response_{index}.pkl")
+    if os.path.exists(output_file):
+        print(f"File {output_file} already exists, skipping.")
+        return
+
+    async with semaphore:
+        response = await client.completions.create(
+            model=provider_to_models[provider][model_size],
+            prompt=prompt,
+            temperature=0.0,
+            max_tokens=max_tokens,
+        )
+        if isinstance(response, openai.BadRequestError):
+            with open(output_file, "wb") as f:
+                pickle.dump("bad_response", f)
+        assert isinstance(response, openai.types.completion.Completion)
+        # Save response to a file
+        with open(output_file, "wb") as f:
+            pickle.dump(response, f)
+
+
+TASK_TO_MAX_TOKENS = {
+    "evals__mmlu__details": 1,
+    "evals__mmlu__0_shot__cot__details": 1024,
+    # Official meta uses 1024, but a small % (.05) of questions are answered correctly after relaxing
+    "evals__mmlu_pro__details": 2048,
+    "evals__gsm8k__details": 1024,
+}
+
+TASK_TO_EVAL_SET = {
+    "mmlu": "evals__mmlu__details",
+    "mmlu_cot": "evals__mmlu__0_shot__cot__details",
+    "mmlu_pro": "evals__mmlu_pro__details",
+    "gsm8k": "evals__gsm8k__details",
+}
+
+
+class CustomAsyncHTTPXClient(httpx.AsyncClient):
+    async def send(self, request: httpx.Request, *args, **kwargs) -> httpx.Response:
+        request.url = httpx.URL(
+            f"https://model-{os.getenv('MODEL_ID')}.api.baseten.co/development/predict"
+        )
+        return await super().send(request, *args, **kwargs)
+
+
+def get_client(provider):
+    if provider not in "b10":
+        if os.getenv("OPENAI_API_KEY") == None:
+            os.environ["OPENAI_API_KEY"] = "EMPTY"
+    return {
+        "oai": AsyncOpenAI(base_url="http://127.0.0.1:8000/v1/"),
+        "b10": AsyncOpenAI(
+            api_key=f"Api-Key {os.getenv('OPENAI_API_KEY')}",
+            base_url=f"https://model-{os.getenv('MODEL_ID')}.api.baseten.co/development/predict",
+            http_client=CustomAsyncHTTPXClient(),
+        ),
+        "sgl": AsyncOpenAI(base_url="http://127.0.0.1:30000/v1/"),
+    }[provider]
+
+
+# Define the benchmark function
+async def benchmark(args):
+    ds = load_dataset(
+        "meta-llama/Llama-3.1-405B-Instruct-evals",
+        f"Llama-3.1-405B-Instruct-{TASK_TO_EVAL_SET[args.task]}",
+    )
+    semaphore = asyncio.Semaphore(args.concurrency)  # Limit to 16 concurrent tasks
+
+    if args.num_examples is None:
+        args.num_examples = len(ds["latest"]["input_final_prompts"])
+    prompts = ds["latest"]["input_final_prompts"][: args.num_examples]
+
+    # Create the output directory if it does not exist
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    tasks = []
+    # Create the tasks with tqdm progress bar
+    max_tokens = TASK_TO_MAX_TOKENS[TASK_TO_EVAL_SET[args.task]]
+    client = get_client(args.provider)
+    for idx, prompt in enumerate(tqdm(prompts, desc="Creating tasks")):
+        tasks.append(
+            asyncio.create_task(
+                fetch_responses(
+                    client,
+                    f"<|begin_of_text|>{prompt[0]}",
+                    semaphore,
+                    idx,
+                    args.provider,
+                    args.model_size,
+                    args.output_dir,
+                    max_tokens=max_tokens,
+                )
+            )
+        )
+
+    # Run the tasks with tqdm progress bar
+    for future in tqdm(
+        asyncio.as_completed(tasks), total=len(tasks), desc="Processing tasks"
+    ):
+        await future
+
+
+def get_mmlu_answer(response):
+    if response is not None:
+        return response.choices[0].text.lstrip().rstrip().upper().replace(".", "")
+    return None
+
+
+def get_mmlu_cot_answer(response):
+    pattern = r"The best answer is (.+)\.?"
+    match = re.search(pattern, response.choices[0].text)
+    if match:
+        return match.group(1).replace(".", "").replace("*", "")
+
+    pattern = r"the best answer is (.+)\.?"
+    match = re.search(pattern, response.choices[0].text)
+    if match:
+        return match.group(1).replace(".", "")
+
+    pattern = r"The correct answer is (.+)\.?"
+    match = re.search(pattern, response.choices[0].text)
+    if match:
+        return match.group(1).replace(".", "")
+
+    pattern = r"the correct answer is (.+)\.?"
+    match = re.search(pattern, response.choices[0].text)
+    if match:
+        return match.group(1).replace(".", "")
+
+
+def get_answer_gsm8k(response):
+    pattern = r"The final answer is (.+)\.?"
+    match = re.search(pattern, response.choices[0].text)
+    if match:
+        s = match.group(1)
+        for ok_symbol in ["%", "$"]:
+            s = s.replace(ok_symbol, "")
+        return s
+
+
+TASK_TO_ANSWER_EXTRACTOR = {
+    "evals__mmlu__details": get_mmlu_answer,
+    "evals__mmlu__0_shot__cot__details": get_mmlu_cot_answer,
+    "evals__gsm8k__details": get_answer_gsm8k,
+    "evals__mmlu_pro__details": get_mmlu_cot_answer,
+}
+
+
+def get_dataset_from_task(task, response_path, model_size):
+    ds_405b = load_dataset(
+        f"meta-llama/Llama-3.1-405B-Instruct-evals",
+        f"Llama-3.1-405B-Instruct-{task}",
+    )
+    ds_405b_hash_order = [x[0] for x in ds_405b["latest"]["input_final_prompts_hash"]]
+
+    if "70b" in model_size or "8b" in model_size:
+        if "70" in model_size:
+            ref_model_ds = load_dataset(
+                f"meta-llama/Llama-3.1-70B-Instruct-evals",
+                f"Llama-3.1-70B-Instruct-{task}",
+            )
+        else:
+            ref_model_ds = load_dataset(
+                f"meta-llama/Llama-3.1-8B-Instruct-evals",
+                f"Llama-3.1-8B-Instruct-{task}",
+            )
+
+        hash_to_row = {}
+        for row in ref_model_ds["latest"]:
+            hash_to_row[row["input_final_prompts_hash"][0]] = row
+        reordered_rows = []
+        for prompt_hash in ds_405b_hash_order:
+            reordered_rows.append(hash_to_row[prompt_hash])
+        ref_model_ds["latest"] = reordered_rows
+        return ref_model_ds
+
+    return ds_405b
+
+
+def analyze(task, response_path, model_size):
+    ds = get_dataset_from_task(task, response_path, model_size)
+
+    responses = []
+    total = len(ds["latest"])
+
+    for i in range(0, total):
+        response = pickle.load(
+            open(os.path.join(response_path, f"response_{i}.pkl"), "rb")
+        )
+        responses.append(response)
+
+    @dataclass
+    class Stats:
+        correct: int = 0
+        total: int = 0
+        meta_correct: int = 0
+
+        average: float = None
+
+    subtask_name_to_stats = defaultdict(lambda: Stats())
+
+    for response, ds_row in zip(responses, ds["latest"]):
+        model_answer = TASK_TO_ANSWER_EXTRACTOR[task](response)
+
+        subtask = ds_row["subtask_name"]
+
+        is_eval_correct = model_answer in ds_row["input_correct_responses"]
+        if is_eval_correct:
+            subtask_name_to_stats[subtask].correct += 1
+
+        if ds_row["is_correct"]:
+            subtask_name_to_stats[subtask].meta_correct += 1
+
+        subtask_name_to_stats[subtask].total += 1
+
+    micro_stats = Stats()
+    for subtask, stats in subtask_name_to_stats.items():
+        stats.average = stats.correct / stats.total
+        stats.meta_average = stats.meta_correct / stats.total
+
+        micro_stats.correct += stats.correct
+        micro_stats.total += stats.total
+        micro_stats.meta_correct += stats.meta_correct
+
+    micro_stats.average = micro_stats.correct / micro_stats.total
+    micro_stats.meta_average = micro_stats.meta_correct / micro_stats.total
+
+    print("Macro average", np.mean([x.average for x in subtask_name_to_stats.values()]))
+    print(
+        "Meta Macro average",
+        np.mean([x.meta_average for x in subtask_name_to_stats.values()]),
+    )
+    print("Micro average", micro_stats.average)
+    print("Meta Micro average", micro_stats.meta_average)
+
+
+# Entry point for the script
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Script to run model with specified parameters."
+    )
+    parser.add_argument(
+        "--model-size",
+        type=str,
+        default="8b",
+        help="Size of the model (e.g., 8b or 70b)",
+    )
+    parser.add_argument(
+        "--provider",
+        type=str,
+        default="sgl",
+        help="Provider name (e.g., sgl, oai, b10)",
+    )
+    parser.add_argument(
+        "--task",
+        type=str,
+        required=True,
+        help="Task (e.g., mmlu, mmlu_cot, mmlu_pro, gsm8k)",
+    )
+    parser.add_argument(
+        "--num-examples", type=int, default=None, help="Number of examples to process"
+    )
+    parser.add_argument("--concurrency", type=int, default=16)
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="tmp-output-dir",
+        help="Directory to save responses",
+    )
+
+    args = parser.parse_args()
+    asyncio.run(benchmark(args))
+    analyze(TASK_TO_EVAL_SET[args.task], args.output_dir, args.model_size)
+    shutil.rmtree("tmp-output-dir", ignore_errors=True)
diff --git a/python/sglang/eval/loogle_eval.py b/python/sglang/eval/loogle_eval.py
new file mode 100644
index 000000000..895362cd1
--- /dev/null
+++ b/python/sglang/eval/loogle_eval.py
@@ -0,0 +1,164 @@
+import argparse
+import asyncio
+import os
+import pickle
+from pathlib import Path
+from typing import List
+
+import openai
+import torch
+from bert_score import BERTScorer
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+def get_client(api_url: str) -> openai.AsyncOpenAI:
+    if os.getenv("OPENAI_API_KEY") is None:
+        os.environ["OPENAI_API_KEY"] = "EMPTY"
+    return openai.AsyncOpenAI(base_url=api_url)
+
+
+def get_dataset():
+    return load_dataset("bigai-nlco/LooGLE", "longdep_qa", split="test")
+
+
+async def fetch_response(
+    client: openai.AsyncOpenAI,
+    context: str,
+    question: str,
+    semaphore: asyncio.Semaphore,
+    index: int,
+    model: str,
+    output_dir: Path,
+):
+    output_file = output_dir / f"response_{index}.pkl"
+    if output_file.exists():
+        return
+
+    prompt = (
+        "Please answer the question based on the long texts below.\n"
+        f"{context}\n"
+        f"Question: {question}\n"
+        "Answer:"
+    )
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+
+    async with semaphore:
+        try:
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0.0,
+                max_tokens=512,
+            )
+        except openai.BadRequestError as e:
+            with open(output_file, "wb") as f:
+                pickle.dump({"error": str(e)}, f)
+            return
+
+    with open(output_file, "wb") as f:
+        pickle.dump(response, f)
+
+
+async def benchmark(args):
+    dataset = get_dataset()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    client = get_client(args.api_url)
+    semaphore = asyncio.Semaphore(args.max_concurrency)
+
+    tasks: List[asyncio.Task] = []
+    for idx, ex in enumerate(dataset):
+        if idx >= args.num_prompts:
+            break
+        tasks.append(
+            asyncio.create_task(
+                fetch_response(
+                    client,
+                    ex["context"],
+                    ex["question"],
+                    semaphore,
+                    idx,
+                    args.model,
+                    output_dir,
+                )
+            )
+        )
+
+    for _ in tqdm(
+        asyncio.as_completed(tasks), total=len(tasks), desc="Running benchmark"
+    ):
+        await _
+
+
+def analyse(args):
+    dataset = get_dataset()
+    output_dir = Path(args.output_dir)
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    scorer = BERTScorer(lang="en", device=device)
+
+    hyps: List[str] = []
+    refs: List[str] = []
+    for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
+        if idx >= args.num_prompts:
+            break
+        pkl_file = output_dir / f"response_{idx}.pkl"
+        if not pkl_file.exists():
+            raise FileNotFoundError(pkl_file)
+
+        response = pickle.load(open(pkl_file, "rb"))
+        if isinstance(response, dict) and "error" in response:
+            continue
+
+        hyps.append(response.choices[0].message.content.strip())
+        refs.append(ex["answer"])
+
+    if not hyps:
+        print("No valid responses to score!")
+        return
+
+    batch_size = 64
+    all_f1: List[float] = []
+    for i in tqdm(range(0, len(hyps), batch_size), desc="Scoring batches"):
+        h_batch = hyps[i : i + batch_size]
+        r_batch = refs[i : i + batch_size]
+        _, _, f1_scores = scorer.score(h_batch, r_batch, verbose=False)
+        all_f1.extend([float(x) for x in f1_scores])
+
+    avg = sum(all_f1) / len(all_f1)
+    print(f"Average BERTScore (F1): {avg:.2%}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Run benchmark and evaluation in one go."
+    )
+    parser.add_argument(
+        "--api-url",
+        default="http://127.0.0.1:30000/v1",
+        help="OpenAI‑compatible API base URL",
+    )
+    parser.add_argument(
+        "--model",
+        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+        help="Model name or ID, only used for model name",
+    )
+    parser.add_argument(
+        "--max-concurrency", type=int, default=144, help="Maximum concurrent requests"
+    )
+    parser.add_argument(
+        "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=10000, help="Number of prompts to run"
+    )
+    args = parser.parse_args()
+
+    asyncio.run(benchmark(args))
+
+    analyse(args)
diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py
new file mode 100644
index 000000000..f006bd94c
--- /dev/null
+++ b/python/sglang/global_config.py
@@ -0,0 +1,53 @@
+"""Global configurations"""
+
+import os
+
+
+class GlobalConfig:
+    """
+    Store some global constants.
+
+    See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
+    many global runtime arguments as well.
+    """
+
+    def __init__(self):
+        # Verbosity level
+        # 0: do not output anything
+        # 2: output final text after every run
+        self.verbosity = 0
+
+        # Default backend of the language
+        self.default_backend = None
+
+        # Runtime constants: New generation token ratio estimation
+        self.default_init_new_token_ratio = float(
+            os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7)
+        )
+        self.default_min_new_token_ratio_factor = float(
+            os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14)
+        )
+        self.default_new_token_ratio_decay_steps = float(
+            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
+        )
+        self.torch_empty_cache_interval = float(
+            os.environ.get(
+                "SGLANG_EMPTY_CACHE_INTERVAL", -1
+            )  # in seconds. Set if you observe high memory accumulation over a long serving period.
+        )
+        # Runtime constants: others
+        self.retract_decode_steps = 20
+        self.flashinfer_workspace_size = os.environ.get(
+            "FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024
+        )
+
+        # Output tokenization configs
+        self.skip_special_tokens_in_output = True
+        self.spaces_between_special_tokens_in_out = True
+
+        # Language frontend interpreter optimization configs
+        self.enable_precache_with_tracing = True
+        self.enable_parallel_encoding = True
+
+
+global_config = GlobalConfig()
diff --git a/python/sglang/lang/api.py b/python/sglang/lang/api.py
new file mode 100644
index 000000000..a8d2e43e6
--- /dev/null
+++ b/python/sglang/lang/api.py
@@ -0,0 +1,286 @@
+"""Public APIs of the language."""
+
+import re
+from typing import Callable, List, Optional, Union
+
+from sglang.global_config import global_config
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
+from sglang.lang.ir import (
+    SglExpr,
+    SglExprList,
+    SglFunction,
+    SglGen,
+    SglImage,
+    SglRoleBegin,
+    SglRoleEnd,
+    SglSelect,
+    SglSeparateReasoning,
+    SglVideo,
+)
+
+
+def function(
+    func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
+):
+    if func:
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
+
+    def decorator(func):
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
+
+    return decorator
+
+
+def Runtime(*args, **kwargs):
+    # Avoid importing unnecessary dependency
+    from sglang.lang.backend.runtime_endpoint import Runtime
+
+    return Runtime(*args, **kwargs)
+
+
+def Engine(*args, **kwargs):
+    # Avoid importing unnecessary dependency
+    from sglang.srt.entrypoints.engine import Engine
+
+    return Engine(*args, **kwargs)
+
+
+def set_default_backend(backend: BaseBackend):
+    global_config.default_backend = backend
+
+
+def flush_cache(backend: Optional[BaseBackend] = None):
+    backend = backend or global_config.default_backend
+    if backend is None:
+        return False
+
+    # If backend is Runtime
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
+    return backend.flush_cache()
+
+
+def get_server_info(backend: Optional[BaseBackend] = None):
+    backend = backend or global_config.default_backend
+    if backend is None:
+        return None
+
+    # If backend is Runtime
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
+    return backend.get_server_info()
+
+
+def gen(
+    name: Optional[str] = None,
+    max_tokens: Optional[int] = None,
+    min_tokens: Optional[int] = None,
+    n: Optional[int] = None,
+    stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+    top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
+    frequency_penalty: Optional[float] = None,
+    presence_penalty: Optional[float] = None,
+    ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
+    dtype: Optional[Union[type, str]] = None,
+    choices: Optional[List[str]] = None,
+    choices_method: Optional[ChoicesSamplingMethod] = None,
+    regex: Optional[str] = None,
+    json_schema: Optional[str] = None,
+):
+    """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
+
+    if choices:
+        return SglSelect(
+            name,
+            choices,
+            0.0 if temperature is None else temperature,
+            token_length_normalized if choices_method is None else choices_method,
+        )
+
+    # check regex is valid
+    if regex is not None:
+        try:
+            re.compile(regex)
+        except re.error as e:
+            raise e
+
+    return SglGen(
+        name,
+        max_tokens,
+        min_tokens,
+        n,
+        stop,
+        stop_token_ids,
+        temperature,
+        top_p,
+        top_k,
+        min_p,
+        frequency_penalty,
+        presence_penalty,
+        ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
+        dtype,
+        regex,
+        json_schema,
+    )
+
+
+def gen_int(
+    name: Optional[str] = None,
+    max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
+    stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+    top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
+    frequency_penalty: Optional[float] = None,
+    presence_penalty: Optional[float] = None,
+    ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
+):
+    return SglGen(
+        name,
+        max_tokens,
+        None,
+        n,
+        stop,
+        stop_token_ids,
+        temperature,
+        top_p,
+        top_k,
+        min_p,
+        frequency_penalty,
+        presence_penalty,
+        ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
+        int,
+        None,
+    )
+
+
+def gen_string(
+    name: Optional[str] = None,
+    max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
+    stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+    top_k: Optional[int] = None,
+    min_p: Optional[float] = None,
+    frequency_penalty: Optional[float] = None,
+    presence_penalty: Optional[float] = None,
+    ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
+):
+    return SglGen(
+        name,
+        max_tokens,
+        None,
+        n,
+        stop,
+        stop_token_ids,
+        temperature,
+        top_p,
+        top_k,
+        min_p,
+        frequency_penalty,
+        presence_penalty,
+        ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
+        str,
+        None,
+    )
+
+
+def image(expr: SglExpr):
+    return SglImage(expr)
+
+
+def video(path: str, num_frames: int):
+    return SglVideo(path, num_frames)
+
+
+def select(
+    name: Optional[str] = None,
+    choices: Optional[List[str]] = None,
+    temperature: float = 0.0,
+    choices_method: ChoicesSamplingMethod = token_length_normalized,
+):
+    assert choices is not None
+    return SglSelect(name, choices, temperature, choices_method)
+
+
+def _role_common(name: str, expr: Optional[SglExpr] = None):
+    if expr is None:
+        return SglExprList([SglRoleBegin(name), SglRoleEnd(name)])
+    else:
+        return SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
+
+
+def system(expr: Optional[SglExpr] = None):
+    return _role_common("system", expr)
+
+
+def user(expr: Optional[SglExpr] = None):
+    return _role_common("user", expr)
+
+
+def assistant(expr: Optional[SglExpr] = None):
+    return _role_common("assistant", expr)
+
+
+def system_begin():
+    return SglRoleBegin("system")
+
+
+def system_end():
+    return SglRoleEnd("system")
+
+
+def user_begin():
+    return SglRoleBegin("user")
+
+
+def user_end():
+    return SglRoleEnd("user")
+
+
+def assistant_begin():
+    return SglRoleBegin("assistant")
+
+
+def assistant_end():
+    return SglRoleEnd("assistant")
+
+
+def separate_reasoning(
+    expr: Optional[SglExpr] = None, model_type: Optional[str] = None
+):
+    return SglExprList([expr, SglSeparateReasoning(model_type, expr=expr)])
diff --git a/python/sglang/lang/backend/anthropic.py b/python/sglang/lang/backend/anthropic.py
new file mode 100644
index 000000000..4918a1703
--- /dev/null
+++ b/python/sglang/lang/backend/anthropic.py
@@ -0,0 +1,73 @@
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import anthropic
+except ImportError as e:
+    anthropic = e
+
+
+class Anthropic(BaseBackend):
+    def __init__(self, model_name, *args, **kwargs):
+        super().__init__()
+
+        if isinstance(anthropic, Exception):
+            raise anthropic
+
+        self.model_name = model_name
+        self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
+            model=self.model_name,
+            system=system,
+            messages=messages,
+            **sampling_params.to_anthropic_kwargs(),
+        )
+        comp = ret.content[0].text
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
+            model=self.model_name,
+            system=system,
+            messages=messages,
+            **sampling_params.to_anthropic_kwargs(),
+        ) as stream:
+            for text in stream.text_stream:
+                yield text, {}
diff --git a/python/sglang/lang/backend/base_backend.py b/python/sglang/lang/backend/base_backend.py
new file mode 100644
index 000000000..62dd50416
--- /dev/null
+++ b/python/sglang/lang/backend/base_backend.py
@@ -0,0 +1,82 @@
+from typing import List, Optional, Union
+
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+
+class BaseBackend:
+    def __init__(self) -> None:
+        self.support_concate_and_append = False
+        self.chat_template = get_chat_template("default")
+
+    def get_model_name(self):
+        raise NotImplementedError()
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def cache_prefix(self, prefix_str: str):
+        pass
+
+    def uncache_prefix(self, rid: str):
+        pass
+
+    def end_request(self, rid: Union[str, List[str]]):
+        pass
+
+    def begin_program(self, s: StreamExecutor):
+        pass
+
+    def end_program(self, s: Union[StreamExecutor, List[StreamExecutor]]):
+        pass
+
+    def commit_lazy_operations(self, s: StreamExecutor):
+        pass
+
+    def fork_program(
+        self,
+        src: StreamExecutor,
+        dst: List[StreamExecutor],
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        pass
+
+    def fill_image(self, s: StreamExecutor):
+        pass
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        raise NotImplementedError()
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        raise NotImplementedError()
+
+    def select(
+        self,
+        s: StreamExecutor,
+        choices: List[str],
+        temperature: float,
+        choices_method: Optional[ChoicesSamplingMethod] = None,
+    ) -> ChoicesDecision:
+        raise NotImplementedError()
+
+    def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
+        raise NotImplementedError()
+
+    def shutdown(self):
+        pass
+
+    def flush_cache(self):
+        pass
+
+    def get_server_info(self):
+        pass
diff --git a/python/sglang/lang/backend/litellm.py b/python/sglang/lang/backend/litellm.py
new file mode 100644
index 000000000..5803b5431
--- /dev/null
+++ b/python/sglang/lang/backend/litellm.py
@@ -0,0 +1,90 @@
+from typing import Mapping, Optional
+
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import litellm
+except ImportError as e:
+    litellm = e
+    litellm.num_retries = 1
+
+
+class LiteLLM(BaseBackend):
+    def __init__(
+        self,
+        model_name,
+        chat_template=None,
+        api_key=None,
+        organization: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: Optional[float] = 600,
+        max_retries: Optional[int] = litellm.num_retries,
+        default_headers: Optional[Mapping[str, str]] = None,
+    ):
+        super().__init__()
+
+        if isinstance(litellm, Exception):
+            raise litellm
+
+        self.model_name = model_name
+
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name
+        )
+
+        self.client_params = {
+            "api_key": api_key,
+            "organization": organization,
+            "base_url": base_url,
+            "timeout": timeout,
+            "max_retries": max_retries,
+            "default_headers": default_headers,
+        }
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            **self.client_params,
+            **sampling_params.to_litellm_kwargs(),
+        )
+        comp = ret.choices[0].message.content
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            stream=True,
+            **self.client_params,
+            **sampling_params.to_litellm_kwargs(),
+        )
+        for chunk in ret:
+            text = chunk.choices[0].delta.content
+            if text is not None:
+                yield text, {}
diff --git a/python/sglang/lang/backend/openai.py b/python/sglang/lang/backend/openai.py
new file mode 100644
index 000000000..a2d006bb7
--- /dev/null
+++ b/python/sglang/lang/backend/openai.py
@@ -0,0 +1,475 @@
+import dataclasses
+import logging
+import time
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
+from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import openai
+    import tiktoken
+except ImportError as e:
+    openai = tiktoken = e
+
+
+logger = logging.getLogger(__name__)
+
+
+def create_logit_bias_int(tokenizer):
+    """Get logit bias for integer numbers."""
+    int_token_ids = []
+
+    tokens = tokenizer._mergeable_ranks
+    for token, token_id in tokens.items():
+        s = tokenizer.decode([token_id])
+        if all([c.isdigit() for c in s]) or s in [" "]:
+            int_token_ids.append(token_id)
+            if len(int_token_ids) >= 300:  # OpenAI API limit
+                break
+    special_tokens = tokenizer._special_tokens
+    mask = {t: 100 for t in int_token_ids[:299]}
+    mask[special_tokens["<|endoftext|>"]] = 100
+    return mask
+
+
+INSTRUCT_MODEL_NAMES = [
+    "gpt-3.5-turbo-instruct",
+]
+
+
+@dataclasses.dataclass
+class TokenUsage:
+    prompt_tokens: int
+    completion_tokens: int
+
+    def reset(self):
+        self.prompt_tokens = self.completion_tokens = 0
+
+
+class OpenAI(BaseBackend):
+    def __init__(
+        self,
+        model_name: str,
+        is_chat_model: Optional[bool] = None,
+        chat_template: Optional[ChatTemplate] = None,
+        is_azure: bool = False,
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+
+        if isinstance(openai, Exception):
+            raise openai
+
+        if is_azure:
+            self.client = openai.AzureOpenAI(*args, **kwargs)
+        else:
+            self.client = openai.OpenAI(*args, **kwargs)
+
+        self.model_name = model_name
+        try:
+            self.tokenizer = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            self.tokenizer = tiktoken.get_encoding("cl100k_base")
+        self.logit_bias_int = create_logit_bias_int(self.tokenizer)
+
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name
+        )
+
+        if is_chat_model is not None:
+            self.is_chat_model = is_chat_model
+        else:
+            if model_name in INSTRUCT_MODEL_NAMES:
+                self.is_chat_model = False
+            else:
+                self.is_chat_model = True
+
+        self.chat_prefix = self.chat_template.role_prefix_and_suffix["assistant"][0]
+
+        # Usage
+        self.token_usage = TokenUsage(0, 0)
+
+        # API speculative execution
+        # TODO(ying): This does not support multi-threading (run_batch)
+        self.spec_kwargs = {}
+        self.spec_format = []
+        self.spec_max_num_tries = 3
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def _prepare_spec_execution(
+        self,
+        sampling_params: SglSamplingParams,
+        num_api_spec_tokens: int,
+        spec_var_name: str,
+    ):
+        if "max_tokens" not in self.spec_kwargs:
+            self.spec_kwargs["max_tokens"] = num_api_spec_tokens
+        else:
+            assert self.spec_kwargs["max_tokens"] == num_api_spec_tokens
+
+        params = sampling_params.to_openai_kwargs()
+        for key, value in params.items():
+            if key in ["stop"]:
+                continue
+            if key in ["max_tokens"]:
+                warnings.warn(
+                    "The parameter max_tokens will be overwritten by speculated number of tokens."
+                )
+                continue
+            if key not in self.spec_kwargs:
+                self.spec_kwargs[key] = value
+            else:
+                assert (
+                    value == self.spec_kwargs[key]
+                ), "sampling parameters should be consistent if turn on api speculative execution."
+        self.spec_format.append(
+            {"text": "", "stop": params["stop"], "name": spec_var_name}
+        )
+        return "", {}
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+        spec_var_name: str = None,
+    ):
+        if sampling_params.dtype is None:
+            if self.is_chat_model:
+                if s.num_api_spec_tokens is None:
+                    if not s.text_.endswith(self.chat_prefix):
+                        raise RuntimeError(
+                            "This use case is not supported if api speculative execution is off. "
+                            "For OpenAI chat models, sgl.gen must be right after sgl.assistant. "
+                            "Example of adding api speculative execution: @function(num_api_spec_tokens=128)."
+                        )
+                    prompt = s.messages_
+                else:
+                    return self._prepare_spec_execution(
+                        sampling_params, s.num_api_spec_tokens, spec_var_name
+                    )
+            else:
+                prompt = s.text_
+
+            kwargs = sampling_params.to_openai_kwargs()
+            if (
+                self.model_name.startswith("o1")
+                or self.model_name.startswith("o3")
+                or "o1" in self.model_name
+            ):
+                kwargs.pop("max_tokens", None)
+            else:
+                kwargs.pop("max_completion_tokens", None)
+
+            comp = openai_completion(
+                client=self.client,
+                token_usage=self.token_usage,
+                is_chat=self.is_chat_model,
+                model=self.model_name,
+                prompt=prompt,
+                **kwargs,
+            )
+            # Keep the returned list (or string) as is.
+        elif sampling_params.dtype in [str, "str", "string"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
+            kwargs = sampling_params.to_openai_kwargs()
+            kwargs.pop("stop")
+            comp = openai_completion(
+                client=self.client,
+                token_usage=self.token_usage,
+                is_chat=self.is_chat_model,
+                model=self.model_name,
+                prompt=s.text_ + '"',
+                stop='"',
+                **kwargs,
+            )
+            # Wrap each element in quotes if we have a list.
+            if isinstance(comp, list):
+                comp = ['"' + x + '"' for x in comp]
+            else:
+                comp = '"' + comp + '"'
+        elif sampling_params.dtype in [int, "int"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
+            kwargs = sampling_params.to_openai_kwargs()
+            kwargs.pop("stop")
+            comp = openai_completion(
+                client=self.client,
+                token_usage=self.token_usage,
+                is_chat=self.is_chat_model,
+                model=self.model_name,
+                prompt=s.text_,
+                logit_bias=self.logit_bias_int,
+                stop=[" "],
+                **kwargs,
+            )
+            # Leave as a list if that's what is returned.
+        else:
+            raise ValueError(f"Unknown dtype: {sampling_params.dtype}")
+
+        return comp, {}
+
+    def spec_fill(self, value: str):
+        assert self.is_chat_model
+        self.spec_format.append({"text": value, "stop": None, "name": None})
+
+    def spec_pattern_match(self, comp):
+        for i, term in enumerate(self.spec_format):
+            text = term["text"]
+            if text != "":
+                if comp.startswith(text):
+                    comp = comp[len(text) :]
+                else:
+                    return False
+            else:
+                pos = comp.find(term["stop"])
+                if pos != -1:
+                    term["text"] = comp[:pos]
+                    comp = comp[pos:]
+                else:
+                    if i == len(self.spec_format) - 1:
+                        term["text"] = comp
+                    else:
+                        return False
+        return True
+
+    def role_end_generate(
+        self,
+        s: StreamExecutor,
+    ):
+        if s.num_api_spec_tokens is None or not s.text_.endswith(self.chat_prefix):
+            return
+
+        comp = ""
+        if not all(x["name"] is None for x in self.spec_format):
+            # TODO(ying): throw errors or warnings
+            for i in range(self.spec_max_num_tries):
+                comp = openai_completion(
+                    client=self.client,
+                    token_usage=self.token_usage,
+                    is_chat=self.is_chat_model,
+                    model=self.model_name,
+                    prompt=s.messages_,
+                    **self.spec_kwargs,
+                )
+                # Use a string for pattern matching.
+                comp_for_match = comp[0] if isinstance(comp, list) else comp
+                if self.spec_pattern_match(comp_for_match):
+                    break
+
+        for term in self.spec_format:
+            s.text_ += term["text"]
+            name = term["name"]
+            if name is not None:
+                s.variables[name] = term["text"]
+                s.meta_info[name] = {}
+                s.variable_event[name].set()
+
+        self.spec_kwargs = {}
+        self.spec_format = []
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if sampling_params.dtype is None:
+            if self.is_chat_model:
+                if not s.text_.endswith(self.chat_prefix):
+                    raise RuntimeError(
+                        "This use case is not supported. "
+                        "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
+                    )
+                prompt = s.messages_
+            else:
+                prompt = s.text_
+
+            kwargs = sampling_params.to_openai_kwargs()
+            generator = openai_completion_stream(
+                client=self.client,
+                token_usage=self.token_usage,
+                is_chat=self.is_chat_model,
+                model=self.model_name,
+                prompt=prompt,
+                **kwargs,
+            )
+            return generator
+        else:
+            raise ValueError(f"Unknown dtype: {sampling_params.dtype}")
+
+    def select(
+        self,
+        s: StreamExecutor,
+        choices: List[str],
+        temperature: float,
+        choices_method: ChoicesSamplingMethod,
+    ) -> ChoicesDecision:
+        """Note: `choices_method` is not used by the OpenAI backend."""
+        if self.is_chat_model:
+            raise NotImplementedError(
+                "select/choices is not supported for chat models. "
+                "Please try to use a non-chat model such as gpt-3.5-turbo-instruct"
+            )
+
+        n_choices = len(choices)
+        token_ids = [self.tokenizer.encode(x) for x in choices]
+        scores = [0] * n_choices
+        valid = [len(x) > 0 for x in token_ids]
+        prompt_tokens = self.tokenizer.encode(s.text_)
+
+        max_len = max([len(x) for x in token_ids])
+        for step in range(max_len):
+            # Build logit bias
+            logit_bias = {}
+            for i in range(n_choices):
+                if valid[i]:
+                    logit_bias[token_ids[i][step]] = 100
+
+            # Call API
+            ret = self.client.completions.create(
+                model=self.model_name,
+                prompt=prompt_tokens,
+                logit_bias=logit_bias,
+                max_tokens=1,
+                temperature=temperature,
+            )
+            ret_str = ret.choices[0].text
+            ret_token = self.tokenizer.encode(ret_str)[0]
+            self.token_usage.prompt_tokens += ret.usage.prompt_tokens
+            self.token_usage.completion_tokens = ret.usage.completion_tokens
+
+            # TODO:
+            # 1. return logits as the scores
+            # 2. compute logits of the full choice
+            # 3. consider chunk-based decoding
+
+            # Update valid
+            hit = False
+            for i in range(n_choices):
+                if valid[i]:
+                    if step == len(token_ids[i]) - 1:
+                        valid[i] = False
+
+                    if ret_token == token_ids[i][step]:
+                        scores[i] += 1
+                        hit = True
+                    else:
+                        valid[i] = False
+            assert hit
+
+            if np.sum(valid) <= 1:
+                break
+
+            prompt_tokens.append(ret_token)
+
+        return ChoicesDecision(
+            decision=choices[np.argmax(scores)],
+            meta_info={"scores": scores},
+        )
+
+
+def openai_completion(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+) -> Union[str, List[str]]:
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
+    for attempt in range(retries):
+        try:
+            if is_chat:
+                if "stop" in kwargs and kwargs["stop"] is None:
+                    kwargs.pop("stop")
+                ret = client.chat.completions.create(messages=prompt, **kwargs)
+                if len(ret.choices) == 1:
+                    comp = ret.choices[0].message.content
+                else:
+                    comp = [c.message.content for c in ret.choices]
+            else:
+                ret = client.completions.create(prompt=prompt, **kwargs)
+                if isinstance(prompt, (list, tuple)):
+                    comp = [c.text for c in ret.choices]
+                else:
+                    comp = ret.choices[0].text
+                    if len(ret.choices) > 1:
+                        comp = [c.text for c in ret.choices]
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
+            break
+        except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
+            logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
+            time.sleep(5)
+            if attempt == retries - 1:
+                raise e
+        except Exception as e:
+            logger.error(f"RuntimeError {e}.")
+            raise e
+
+    return comp
+
+
+def openai_completion_stream(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
+    for attempt in range(retries):
+        try:
+            if is_chat:
+                if "stop" in kwargs and kwargs["stop"] is None:
+                    kwargs.pop("stop")
+                generator = client.chat.completions.create(
+                    messages=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
+                )
+                for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
+                    try:
+                        content = ret.choices[0].delta.content
+                    except IndexError:
+                        content = None
+                    yield content or "", {}
+            else:
+                generator = client.completions.create(
+                    prompt=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
+                )
+                for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
+                    content = ret.choices[0].text
+                    yield content or "", {}
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
+            break
+        except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
+            logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
+            time.sleep(5)
+            if attempt == retries - 1:
+                raise e
+        except Exception as e:
+            logger.error(f"RuntimeError {e}.")
+            raise e
diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py
new file mode 100644
index 000000000..349f9934a
--- /dev/null
+++ b/python/sglang/lang/backend/runtime_endpoint.py
@@ -0,0 +1,527 @@
+import atexit
+import json
+import multiprocessing
+import warnings
+from typing import Dict, List, Optional, Union
+
+import aiohttp
+import requests
+
+from sglang.global_config import global_config
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
+from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import (
+    REGEX_BOOL,
+    REGEX_FLOAT,
+    REGEX_INT,
+    REGEX_STR,
+    SglSamplingParams,
+)
+from sglang.utils import http_request
+
+
+class RuntimeEndpoint(BaseBackend):
+    def __init__(
+        self,
+        base_url: str,
+        api_key: Optional[str] = None,
+        verify: Optional[str] = None,
+        chat_template_name: Optional[str] = None,
+    ):
+        super().__init__()
+        self.support_concate_and_append = True
+
+        self.base_url = base_url
+        self.api_key = api_key
+        self.verify = verify
+
+        res = http_request(
+            self.base_url + "/get_model_info",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+        self.model_info = res.json()
+
+        if chat_template_name:
+            self.chat_template = get_chat_template(chat_template_name)
+        else:
+            self.chat_template = get_chat_template_by_model_path(
+                self.model_info["model_path"]
+            )
+
+    def get_model_name(self):
+        return self.model_info["model_path"]
+
+    def flush_cache(self):
+        res = http_request(
+            self.base_url + "/flush_cache",
+            api_key=self.api_key,
+            verify=self.verify,
+            method="POST",
+        )
+        self._assert_success(res)
+
+    def get_server_info(self):
+        res = http_request(
+            self.base_url + "/get_server_info",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+        return res.json()
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def cache_prefix(self, prefix_str: str):
+        res = http_request(
+            self.base_url + "/generate",
+            json={"text": prefix_str, "sampling_params": {"max_new_tokens": 0}},
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def start_profile(self):
+        res = http_request(
+            self.base_url + "/start_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def stop_profile(self):
+        res = http_request(
+            self.base_url + "/stop_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def commit_lazy_operations(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
+        res = http_request(
+            self.base_url + "/generate",
+            json=data,
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def fill_image(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
+        res = http_request(
+            self.base_url + "/generate",
+            json=data,
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def _handle_dtype_to_regex(self, sampling_params: SglSamplingParams):
+        if sampling_params.dtype is None:
+            return
+
+        if sampling_params.stop == ():
+            sampling_params.stop = []
+
+        dtype_regex = None
+        if sampling_params.dtype in ["int", int]:
+
+            dtype_regex = REGEX_INT
+            sampling_params.stop.extend([" ", "\n"])
+        elif sampling_params.dtype in ["float", float]:
+
+            dtype_regex = REGEX_FLOAT
+            sampling_params.stop.extend([" ", "\n"])
+        elif sampling_params.dtype in ["str", str]:
+
+            dtype_regex = REGEX_STR
+        elif sampling_params.dtype in ["bool", bool]:
+
+            dtype_regex = REGEX_BOOL
+        else:
+            raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
+
+        if dtype_regex is not None and sampling_params.regex is not None:
+            warnings.warn(
+                f"Both dtype and regex are set. Only dtype will be used. dtype: {sampling_params.dtype}, regex: {sampling_params.regex}"
+            )
+
+        sampling_params.regex = dtype_regex
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        self._handle_dtype_to_regex(sampling_params)
+        data = {
+            "text": s.text_,
+            "sampling_params": {
+                "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
+                **sampling_params.to_srt_kwargs(),
+            },
+        }
+
+        for item in [
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
+        ]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
+        self._add_images(s, data)
+
+        res = http_request(
+            self.base_url + "/generate",
+            json=data,
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+        obj = res.json()
+        comp = obj["text"]
+        return comp, obj["meta_info"]
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        self._handle_dtype_to_regex(sampling_params)
+
+        data = {
+            "text": s.text_,
+            "sampling_params": {
+                "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
+                **sampling_params.to_srt_kwargs(),
+            },
+        }
+
+        for item in [
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
+        ]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
+        data["stream"] = True
+        self._add_images(s, data)
+
+        res = http_request(
+            self.base_url + "/generate",
+            json=data,
+            stream=True,
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+        pos = 0
+
+        for chunk in res.iter_lines(decode_unicode=False):
+            chunk = chunk.decode("utf-8")
+            if chunk and chunk.startswith("data:"):
+                if chunk == "data: [DONE]":
+                    break
+                data = json.loads(chunk[5:].strip("\n"))
+                chunk_text = data["text"][pos:]
+                meta_info = data["meta_info"]
+                pos += len(chunk_text)
+                yield chunk_text, meta_info
+
+    def select(
+        self,
+        s: StreamExecutor,
+        choices: List[str],
+        temperature: float,
+        choices_method: ChoicesSamplingMethod,
+    ) -> ChoicesDecision:
+        assert temperature <= 1e-5
+
+        # Cache common prefix
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        obj = self._generate_http_request(s, data)
+        prompt_len = obj["meta_info"]["prompt_tokens"]
+        logprob_start_len = max(prompt_len - 2, 0)  # For token healing
+
+        # Compute logprob
+        data = {
+            "text": [s.text_ + c for c in choices],
+            "sampling_params": {
+                "max_new_tokens": 0,
+                "temperature": 0,
+            },
+            "return_logprob": True,
+            "return_text_in_logprobs": True,
+            "logprob_start_len": logprob_start_len,
+        }
+        obj = self._generate_http_request(s, data)
+
+        input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
+        output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]
+        normalized_prompt_logprobs = [
+            compute_normalized_prompt_logprobs(r["meta_info"]["input_token_logprobs"])
+            for r in obj
+        ]
+
+        # Remove extra token if no token healing occurred
+        for i in range(len(input_token_logprobs)):
+            healed_token_str = input_token_logprobs[i][0][-1]
+            if s.text_.endswith(healed_token_str):
+                healed_token_logprob = input_token_logprobs[i][0][0]
+                normalized_prompt_logprobs[i] = (
+                    normalized_prompt_logprobs[i] * len(input_token_logprobs[i])
+                    - healed_token_logprob
+                ) / (len(input_token_logprobs[i]) - 1)
+                input_token_logprobs[i] = input_token_logprobs[i][1:]
+
+        # Compute unconditional logprobs if required
+        if choices_method.requires_unconditional_logprobs:
+            input_ids = [[el[1] for el in subl] for subl in input_token_logprobs]
+            data = {
+                "input_ids": input_ids,
+                "sampling_params": {"max_new_tokens": 0},
+                "return_logprob": True,
+            }
+            obj = self._generate_http_request(s, data)
+            unconditional_token_logprobs = [
+                r["meta_info"]["input_token_logprobs"] for r in obj
+            ]
+        else:
+            unconditional_token_logprobs = None
+
+        return choices_method(
+            choices=choices,
+            normalized_prompt_logprobs=normalized_prompt_logprobs,
+            input_token_logprobs=input_token_logprobs,
+            output_token_logprobs=output_token_logprobs,
+            unconditional_token_logprobs=unconditional_token_logprobs,
+        )
+
+    def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
+        res = http_request(
+            self.base_url + "/concate_and_append_request",
+            json={"src_rids": src_rids, "dst_rid": dst_rid},
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def _generate_http_request(self, s: StreamExecutor, data):
+        self._add_images(s, data)
+        res = http_request(
+            self.base_url + "/generate",
+            json=data,
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+        return res.json()
+
+    def _add_images(self, s: StreamExecutor, data):
+        if s.images_:
+            assert len(s.images_) == 1, "Only support one image."
+            data["image_data"] = s.images_[0][1]
+
+    def _assert_success(self, res):
+        if res.status_code != 200:
+            try:
+                content = res.json()
+            except json.JSONDecodeError:
+                content = res.text
+            raise RuntimeError(content)
+
+
+def compute_normalized_prompt_logprobs(input_logprobs):
+    values = [x[0] for x in input_logprobs if x[0]]
+    return sum(values) / len(values)
+
+
+class Runtime:
+    """
+    A wrapper for the HTTP server.
+    This is used for launching the server in a python program without
+    using the command line interface.
+
+    It is mainly used for the frontend language.
+    You should use the Engine class if you want to do normal offline processing without the frontend language.
+    """
+
+    def __init__(
+        self,
+        log_level: str = "error",
+        *args,
+        **kwargs,
+    ):
+        """See the arguments in server_args.py::ServerArgs"""
+        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+        # client code without installing SRT server and its dependency if they want.
+        from sglang.srt.entrypoints.http_server import launch_server
+        from sglang.srt.server_args import ServerArgs
+        from sglang.srt.utils import is_port_available
+
+        self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
+
+        # Pre-allocate ports
+        for port in range(self.server_args.port, 40000):
+            if is_port_available(port):
+                break
+        self.server_args.port = port
+
+        self.url = self.server_args.url()
+        self.generate_url = self.url + "/generate"
+
+        # NOTE: We store pid instead of proc to fix some issues during __delete__
+        self.pid = None
+        pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
+
+        ctx = multiprocessing.get_context("spawn")
+        proc = ctx.Process(
+            target=launch_server,
+            args=(self.server_args, pipe_writer),
+        )
+        proc.start()
+        pipe_writer.close()
+        self.pid = proc.pid
+
+        # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+        atexit.register(self.shutdown)
+
+        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
+        try:
+            init_state = pipe_reader.recv()
+        except EOFError:
+            init_state = ""
+
+        if init_state != "ready":
+            self.shutdown()
+            raise RuntimeError(
+                "Initialization failed. Please see the error messages above."
+            )
+
+        self.endpoint = RuntimeEndpoint(self.url)
+
+    def shutdown(self):
+        from sglang.srt.utils import kill_process_tree
+
+        if self.pid is not None:
+            kill_process_tree(self.pid)
+            self.pid = None
+
+    def start_profile(self):
+        self.endpoint.start_profile()
+
+    def stop_profile(self):
+        self.endpoint.stop_profile()
+
+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
+    def get_tokenizer(self):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
+        return get_tokenizer(
+            self.server_args.tokenizer_path,
+            tokenizer_mode=self.server_args.tokenizer_mode,
+            trust_remote_code=self.server_args.trust_remote_code,
+            revision=self.server_args.revision,
+        )
+
+    async def async_generate(
+        self,
+        prompt: str,
+        sampling_params: Optional[Dict] = None,
+    ):
+        if self.server_args.skip_tokenizer_init:
+            json_data = {
+                "input_ids": prompt,
+                "sampling_params": sampling_params,
+                "stream": True,
+            }
+        else:
+            json_data = {
+                "text": prompt,
+                "sampling_params": sampling_params,
+                "stream": True,
+            }
+        pos = 0
+
+        timeout = aiohttp.ClientTimeout(total=3 * 3600)
+        async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
+            async with session.post(self.generate_url, json=json_data) as response:
+                async for chunk, _ in response.content.iter_chunks():
+                    chunk = chunk.decode("utf-8")
+                    if chunk and chunk.startswith("data:"):
+                        if chunk == "data: [DONE]\n\n":
+                            break
+                        data = json.loads(chunk[5:].strip("\n"))
+                        if "text" in data:
+                            cur = data["text"][pos:]
+                            if cur:
+                                yield cur
+                            pos += len(cur)
+                        else:
+                            yield data
+
+    add_request = async_generate
+
+    def generate(
+        self,
+        prompt: Union[str, List[str]],
+        sampling_params: Optional[Dict] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+    ):
+        json_data = {
+            "text": prompt,
+            "sampling_params": sampling_params,
+            "return_logprob": return_logprob,
+            "logprob_start_len": logprob_start_len,
+            "top_logprobs_num": top_logprobs_num,
+            "lora_path": lora_path,
+        }
+        assert not isinstance(lora_path, list) or len(lora_path) == len(prompt)
+        response = requests.post(
+            self.url + "/generate",
+            json=json_data,
+        )
+        return json.dumps(response.json())
+
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+    ):
+        json_data = {"text": prompt}
+        response = requests.post(self.url + "/encode", json=json_data)
+        return json.dumps(response.json())
+
+    async def get_server_info(self):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self.url}/get_server_info") as response:
+                if response.status == 200:
+                    return await response.json()
+                else:
+                    error_data = await response.json()
+                    raise RuntimeError(
+                        f"Failed to get server info. {error_data['error']['message']}"
+                    )
+
+    def __del__(self):
+        self.shutdown()
diff --git a/python/sglang/lang/backend/vertexai.py b/python/sglang/lang/backend/vertexai.py
new file mode 100644
index 000000000..3d51fb137
--- /dev/null
+++ b/python/sglang/lang/backend/vertexai.py
@@ -0,0 +1,148 @@
+import os
+import warnings
+
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import vertexai
+    from vertexai.preview.generative_models import (
+        GenerationConfig,
+        GenerativeModel,
+        Image,
+    )
+except ImportError as e:
+    GenerativeModel = e
+
+
+class VertexAI(BaseBackend):
+    def __init__(self, model_name, safety_settings=None):
+        super().__init__()
+
+        if isinstance(GenerativeModel, Exception):
+            raise GenerativeModel
+
+        project_id = os.environ["GCP_PROJECT_ID"]
+        location = os.environ.get("GCP_LOCATION")
+        vertexai.init(project=project_id, location=location)
+
+        self.model_name = model_name
+        self.chat_template = get_chat_template("default")
+        self.safety_settings = safety_settings
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        ret = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+            safety_settings=self.safety_settings,
+        )
+
+        comp = ret.text
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        generator = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            stream=True,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+            safety_settings=self.safety_settings,
+        )
+        for ret in generator:
+            yield ret.text, {}
+
+    def text_to_vertexai_input(self, text, images):
+        input = []
+        # split with image token
+        text_segs = text.split(self.chat_template.image_token)
+        for image_path, image_base64_data in images:
+            text_seg = text_segs.pop(0)
+            if text_seg != "":
+                input.append(text_seg)
+            input.append(Image.from_bytes(image_base64_data))
+        text_seg = text_segs.pop(0)
+        if text_seg != "":
+            input.append(text_seg)
+        return input
+
+    def messages_to_vertexai_input(self, messages):
+        vertexai_message = []
+        # from openai message format to vertexai message format
+        for msg in messages:
+            if isinstance(msg["content"], str):
+                text = msg["content"]
+            else:
+                text = msg["content"][0]["text"]
+
+            if msg["role"] == "system":
+                warnings.warn("Warning: system prompt is not supported in VertexAI.")
+                vertexai_message.append(
+                    {
+                        "role": "user",
+                        "parts": [{"text": "System prompt: " + text}],
+                    }
+                )
+                vertexai_message.append(
+                    {
+                        "role": "model",
+                        "parts": [{"text": "Understood."}],
+                    }
+                )
+                continue
+            if msg["role"] == "user":
+                vertexai_msg = {
+                    "role": "user",
+                    "parts": [{"text": text}],
+                }
+            elif msg["role"] == "assistant":
+                vertexai_msg = {
+                    "role": "model",
+                    "parts": [{"text": text}],
+                }
+
+            # images
+            if isinstance(msg["content"], list) and len(msg["content"]) > 1:
+                for image in msg["content"][1:]:
+                    assert image["type"] == "image_url"
+                    vertexai_msg["parts"].append(
+                        {
+                            "inline_data": {
+                                "data": image["image_url"]["url"].split(",")[1],
+                                "mime_type": "image/jpeg",
+                            }
+                        }
+                    )
+
+            vertexai_message.append(vertexai_msg)
+        return vertexai_message
diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py
new file mode 100644
index 000000000..80ea6d963
--- /dev/null
+++ b/python/sglang/lang/chat_template.py
@@ -0,0 +1,662 @@
+import re
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Callable, Dict, List, Tuple
+
+
+class ChatTemplateStyle(Enum):
+    PLAIN = auto()
+    LLAMA2 = auto()
+
+
+@dataclass
+class ChatTemplate:
+    name: str
+    default_system_prompt: str
+    role_prefix_and_suffix: Dict[str, Tuple[str, str]]
+    stop_str: List[str] = ()
+    image_token: str = "<image>"
+    audio_token: str = "<audio>"
+    style: ChatTemplateStyle = ChatTemplateStyle.PLAIN
+
+    def get_prefix_and_suffix(
+        self, role: str, hist_messages: List[Dict]
+    ) -> Tuple[str, str]:
+        prefix, suffix = self.role_prefix_and_suffix.get(role, ("", ""))
+
+        if self.style == ChatTemplateStyle.LLAMA2:
+            if role == "system" and not hist_messages:
+                user_prefix, _ = self.role_prefix_and_suffix.get("user", ("", ""))
+                system_prefix, system_suffix = self.role_prefix_and_suffix.get(
+                    "system", ("", "")
+                )
+                return (user_prefix + system_prefix, system_suffix)
+            elif (
+                role == "user"
+                and len(hist_messages) == 1
+                and hist_messages[0]["content"] is not None
+            ):
+                return ("", suffix)
+
+        return prefix, suffix
+
+    def get_prompt(self, messages: List[Dict]) -> str:
+        prompt = ""
+        for i, message in enumerate(messages):
+            role, content = message["role"], message["content"]
+            if role == "system" and content is None:
+                content = self.default_system_prompt
+                if content is None:
+                    continue
+
+            prefix, suffix = self.get_prefix_and_suffix(role, messages[:i])
+            prompt += f"{prefix}{content}{suffix}"
+        return prompt
+
+
+chat_template_registry: Dict[str, ChatTemplate] = {}
+matching_function_registry: List[Callable] = []
+
+
+def register_chat_template(template):
+    chat_template_registry[template.name] = template
+
+
+def register_chat_template_matching_function(func):
+    matching_function_registry.append(func)
+
+
+def get_chat_template(name):
+    return chat_template_registry[name]
+
+
+def get_chat_template_by_model_path(model_path):
+    for matching_func in matching_function_registry:
+        template_name = matching_func(model_path)
+        if template_name is not None:
+            return get_chat_template(template_name)
+    return get_chat_template("default")
+
+
+register_chat_template(
+    ChatTemplate(
+        name="default",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("SYSTEM:", "\n"),
+            "user": ("USER:", "\n"),
+            "assistant": ("ASSISTANT:", "\n"),
+        },
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="claude",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("\n\nHuman: ", ""),
+            "assistant": ("\n\nAssistant:", ""),
+        },
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="chatml",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="chatml-llava",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+        image_token="<image>\n",
+    )
+)
+
+# There is default system prompt for qwen
+# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
+# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+register_chat_template(
+    ChatTemplate(
+        name="qwen",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+    )
+)
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="qwen2-vl",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
+# Reference: https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
+register_chat_template(
+    ChatTemplate(
+        name="vicuna_v1.1",
+        default_system_prompt=(
+            "A chat between a curious user and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the user's questions."
+        ),
+        role_prefix_and_suffix={
+            "system": ("", " "),
+            "user": ("USER:", " "),
+            "assistant": ("ASSISTANT:", "</s>"),
+        },
+        image_token=" <image>\n",
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="llama-2-chat",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("<<SYS>>\n", "\n<</SYS>>\n\n"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        style=ChatTemplateStyle.LLAMA2,
+    )
+)
+
+# Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="mistral",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        stop_str=("</s>",),
+        image_token="[IMG]",
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+        image_token="<|image|>",
+    )
+)
+
+# https://huggingface.co/openbmb/MiniCPM-V-2_6
+register_chat_template(
+    ChatTemplate(
+        name="minicpmv",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", " "),
+            "user": ("user:", " "),
+            "assistant": ("assistant:", "</s>"),
+        },
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="janus-pro",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "User": (
+                "<｜User｜>",
+                "",
+            ),
+            "assistant": (
+                "<｜Assistant｜>",
+                "<｜end▁of▁sentence｜>",
+            ),
+        },
+        stop_str=("<｜end▁of▁sentence｜>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
+# https://huggingface.co/openbmb/MiniCPM-o-2_6
+register_chat_template(
+    ChatTemplate(
+        name="minicpmo",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", " "),
+            "user": ("user:", " "),
+            "assistant": ("assistant:", "</s>"),
+        },
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+        audio_token="(<audio>./</audio>)",
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="janus",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<｜User｜>",
+                "",
+            ),
+            "assistant": (
+                "<｜Assistant｜>",
+                "<｜end▁of▁sentence｜>",
+            ),
+        },
+        stop_str=("<｜end▁of▁sentence｜>",),
+        image_token="<image_placeholder>\n",
+    )
+)
+
+# The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct-llava",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+        image_token="<image>\n",
+    )
+)
+
+# Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="llama-4",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|header_start|>system<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+            "user": (
+                "<|header_start|>user<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+            "assistant": (
+                "<|header_start|>assistant<|header_end|>\n\n",
+                "<|eot|>",
+            ),
+        },
+        stop_str=("<|eot|>",),
+        image_token="<|image|>",
+    )
+)
+
+# Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
+register_chat_template(
+    ChatTemplate(
+        name="yi-1.5",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
+            "assistant": ("", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+    )
+)
+
+# Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
+register_chat_template(
+    ChatTemplate(
+        name="yi-vl",
+        default_system_prompt=(
+            "This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
+            "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
+        ),
+        role_prefix_and_suffix={
+            "system": ("", "\n\n"),
+            "user": ("### Human:", "\n"),
+            "assistant": ("### Assistant:", "\n"),
+        },
+        image_token=" <image_placeholder>\n",
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="gemma-it",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("<start_of_turn>user\n", "<end_of_turn>\n"),
+            "assistant": ("<start_of_turn>model\n", "<end_of_turn>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="dbrx-instruct",
+        default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>"),
+            "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+            "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+        },
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="c4ai-command-r",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+            "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+            "assistant": (
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+register_chat_template(
+    ChatTemplate(
+        name="internvl-2-5",
+        default_system_prompt="你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="interns1",
+        default_system_prompt="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室).  It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="granite-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_of_role|>system<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "user": (
+                "<|start_of_role|>user<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+            "assistant": (
+                "<|start_of_role|>assistant<|end_of_role|>",
+                "<|end_of_text|>",
+            ),
+        },
+        stop_str=("<|end_of_text|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="deepseek-v3",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<｜User｜>",
+                "",
+            ),
+            "assistant": (
+                "<｜Assistant｜>",
+                "<｜end▁of▁sentence｜>",
+            ),
+        },
+        stop_str=("<｜end▁of▁sentence｜>",),
+    )
+)
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/glm4_v#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="glm-4v",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("<|system|>\n", "\n"),
+            "user": ("<|user|>\n", "\n"),
+            "assistant": ("<|assistant|>\n", "\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=["<|user|>", "<|endoftext|>", "<|observation|>"],
+        image_token="<|image|>",
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_deepseek(model_path: str):
+    if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
+        r"base", model_path, re.IGNORECASE
+    ):
+        return "deepseek-v3"
+
+
+@register_chat_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
+
+
+@register_chat_template_matching_function
+def match_dbrx(model_path: str):
+    if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search(
+        r"instruct", model_path, re.IGNORECASE
+    ):
+        return "dbrx-instruct"
+
+
+@register_chat_template_matching_function
+def match_vicuna(model_path: str):
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
+
+
+@register_chat_template_matching_function
+def match_llama2_chat(model_path: str):
+    if re.search(
+        r"llama-2.*chat|codellama.*instruct",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "llama-2-chat"
+
+
+@register_chat_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
+
+
+@register_chat_template_matching_function
+def match_llama3_instruct(model_path: str):
+    if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
+        return "llama-3-instruct"
+
+
+@register_chat_template_matching_function
+def match_chat_ml(model_path: str):
+    if re.search(r"tinyllama", model_path, re.IGNORECASE):
+        return "chatml"
+    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
+        return "qwen2-vl"
+    if re.search(r"glm[-_]?4(\.\d+)?v", model_path, re.IGNORECASE):
+        return "glm-4v"
+    if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "qwen"
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "chatml-llava"
+
+
+@register_chat_template_matching_function
+def match_chat_yi(model_path: str):
+    if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "yi-vl"
+    elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE):
+        return "yi-1.5"
+
+
+@register_chat_template_matching_function
+def match_gemma_it(model_path: str):
+    if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+        return "gemma-it"
+
+
+@register_chat_template_matching_function
+def match_openbmb_minicpm(model_path: str):
+    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
+        return "minicpmv"
+    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
+        return "minicpmo"
+
+
+@register_chat_template_matching_function
+def match_c4ai_command_r(model_path: str):
+    if re.search(r"c4ai-command-r", model_path, re.IGNORECASE):
+        return "c4ai-command-r"
+
+
+@register_chat_template_matching_function
+def match_granite_instruct(model_path: str):
+    if re.search(r"granite.*instruct", model_path, re.IGNORECASE):
+        return "granite-3-instruct"
+
+
+@register_chat_template_matching_function
+def match_gemma3_instruct(model_path: str):
+    if re.search(r"gemma-3", model_path, re.IGNORECASE):
+        return "gemma-it"
+
+
+@register_chat_template_matching_function
+def match_internvl_chat(model_path: str):
+    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+        return "internvl-2-5"
+
+
+@register_chat_template_matching_function
+def match_interns1_chat(model_path: str):
+    if re.search(r"intern-s1", model_path, re.IGNORECASE):
+        return "interns1"
+    if re.search(r"interns1", model_path, re.IGNORECASE):
+        return "interns1"
+
+
+if __name__ == "__main__":
+    messages = [
+        {"role": "system", "content": None},  # None means default
+        # {"role": "system", "content": "You are a helpful, respectful and honest assistant."},
+        {"role": "user", "content": "Hello!"},
+        {"role": "assistant", "content": "Hi!"},
+        {"role": "user", "content": "What can you do?"},
+        {"role": "assistant", "content": "I can chat with you."},
+    ]
+
+    template = get_chat_template("llama-2-chat")
+    print(template.get_prompt(messages))
diff --git a/python/sglang/lang/choices.py b/python/sglang/lang/choices.py
new file mode 100644
index 000000000..e52c6b362
--- /dev/null
+++ b/python/sglang/lang/choices.py
@@ -0,0 +1,164 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+
+
+@dataclass
+class ChoicesDecision:
+    decision: str
+    meta_info: Optional[Dict[str, Any]] = None
+
+
+class ChoicesSamplingMethod(ABC):
+
+    @property
+    def requires_unconditional_logprobs(self) -> bool:
+        return False
+
+    @abstractmethod
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision: ...
+
+
+class TokenLengthNormalized(ChoicesSamplingMethod):
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option with the highest token length normalized prompt logprob."""
+        best_choice = choices[np.argmax(normalized_prompt_logprobs)]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+
+token_length_normalized = TokenLengthNormalized()
+
+
+class GreedyTokenSelection(ChoicesSamplingMethod):
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option based on greedy logprob selection. For overlapping options
+        where one option is a subset of a longer option, extend the shorter option using
+        its average logprob for comparison against the longer option."""
+
+        num_options = len(choices)
+        max_tokens = max(len(option) for option in input_token_logprobs)
+        logprob_matrix = self._build_logprob_matrix(
+            input_token_logprobs, max_tokens, num_options
+        )
+        remaining = self._greedy_selection(logprob_matrix, num_options, max_tokens)
+
+        best_choice = choices[remaining[0]]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+            "greedy_logprob_matrix": logprob_matrix.tolist(),
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+    def _build_logprob_matrix(self, input_token_logprobs, max_tokens, num_options):
+        logprob_matrix = np.zeros((num_options, max_tokens))
+        for i, option in enumerate(input_token_logprobs):
+            actual_logprobs = [token[0] for token in option]
+            avg_logprob = np.mean(actual_logprobs)
+            logprob_matrix[i, : len(option)] = actual_logprobs
+            if len(option) < max_tokens:
+                logprob_matrix[i, len(option) :] = avg_logprob
+        return logprob_matrix
+
+    def _greedy_selection(self, logprob_matrix, num_options, max_tokens):
+        remaining = np.arange(num_options)
+        for j in range(max_tokens):
+            max_logprob = np.max(logprob_matrix[remaining, j])
+            remaining = remaining[logprob_matrix[remaining, j] == max_logprob]
+            if len(remaining) == 1:
+                break
+        return remaining
+
+
+greedy_token_selection = GreedyTokenSelection()
+
+
+class UnconditionalLikelihoodNormalized(ChoicesSamplingMethod):
+
+    @property
+    def requires_unconditional_logprobs(self) -> bool:
+        return True
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option with the highest average token logprob once normalized by
+        the unconditional token logprobs.
+
+        The first unconditional token logprob is assumed to be None. If so, it is
+        replaced with 0 for the purposes of normalization."""
+
+        if unconditional_token_logprobs is None:
+            raise ValueError(
+                "Unconditional token logprobs are required for this method."
+            )
+
+        normalized_unconditional_prompt_logprobs = self._normalize_logprobs(
+            input_token_logprobs, unconditional_token_logprobs
+        )
+
+        best_choice = choices[np.argmax(normalized_unconditional_prompt_logprobs)]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+            "unconditional_token_logprobs": unconditional_token_logprobs,
+            "normalized_unconditional_prompt_logprobs": normalized_unconditional_prompt_logprobs,
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+    def _normalize_logprobs(self, input_token_logprobs, unconditional_token_logprobs):
+        normalized_unconditional_prompt_logprobs = []
+        for inputs, unconditionals in zip(
+            input_token_logprobs, unconditional_token_logprobs
+        ):
+            inputs_logprobs = np.array([token[0] for token in inputs])
+            unconditionals_logprobs = np.array([token[0] for token in unconditionals])
+            unconditionals_logprobs[0] = unconditionals_logprobs[0] or 0
+            normalized_unconditional_prompt_logprobs.append(
+                float(np.mean(inputs_logprobs - unconditionals_logprobs))
+            )
+        return normalized_unconditional_prompt_logprobs
+
+
+unconditional_likelihood_normalized = UnconditionalLikelihoodNormalized()
diff --git a/python/sglang/lang/compiler.py b/python/sglang/lang/compiler.py
new file mode 100644
index 000000000..1284232f7
--- /dev/null
+++ b/python/sglang/lang/compiler.py
@@ -0,0 +1,231 @@
+import multiprocessing
+from concurrent.futures import ThreadPoolExecutor
+from queue import Queue
+from typing import List, Union
+
+from sglang.global_config import global_config
+from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
+from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable
+
+
+def compile_func(function, backend):
+    tracer = function.trace(backend=backend)
+    compiler = CompiledFunction(tracer, function)
+    return compiler
+
+
+class CompiledFunction:
+    def __init__(self, tracer, function):
+        self.function = function
+
+        self.last_node = CompGraphNode(tracer.last_node)
+        self.expr_to_node = {}
+        self.build_graph(tracer)
+        self.topological_sort()
+
+    def build_graph(self, tracer):
+        self.nodes = [self.last_node]
+        self.expr_to_node[tracer.last_node] = self.nodes[-1]
+
+        rename_pid = {}
+
+        visited = set([tracer.last_node])
+        head = 0
+        while head < len(self.nodes):
+            cur_node = self.nodes[head]
+
+            # add prev node
+            prev_node = cur_node.expr.prev_node
+            if prev_node is not None:
+                if prev_node not in visited:
+                    visited.add(prev_node)
+                    self.nodes.append(CompGraphNode(prev_node))
+                    self.expr_to_node[prev_node] = self.nodes[-1]
+                cur_node.prev_node = self.expr_to_node[prev_node]
+                self.expr_to_node[prev_node].add_next_node(cur_node)
+
+            # add source node
+            if isinstance(cur_node.expr, SglVariable):
+                if cur_node.expr.name in tracer.variables:
+                    source = tracer.variables[cur_node.expr.name].source
+                else:
+                    source = cur_node.expr.source
+                if source not in visited:
+                    visited.add(source)
+                    self.nodes.append(CompGraphNode(source))
+                    self.expr_to_node[source] = self.nodes[-1]
+                cur_node.source_node = self.expr_to_node[source]
+                self.expr_to_node[source].add_next_node(cur_node)
+            head += 1
+
+            # rename pid
+            if cur_node.expr.pid not in rename_pid:
+                rename_pid[cur_node.expr.pid] = len(rename_pid)
+            cur_node.expr.pid = rename_pid[cur_node.expr.pid]
+
+    def topological_sort(self):
+        prevd = {}
+        cand = Queue()
+        for x in self.nodes:
+            prevd[x] = (x.prev_node is not None) + (x.source_node is not None)
+            if prevd[x] == 0:
+                cand.put(x)
+        new_list = []
+        while cand.qsize() > 0:
+            head = cand.get()
+            new_list.append(head)
+            for x in head.next_nodes:
+                prevd[x] -= 1
+                if prevd[x] == 0:
+                    cand.put(x)
+        self.nodes = new_list
+
+    def print_graph(
+        self,
+    ):
+        for node in self.nodes:
+            print(node)
+
+    def run_internal(
+        self,
+        backend,
+        kwargs,
+        default_sampling_para,
+    ):
+        stream_executor_ids = set([x.expr.pid for x in self.nodes])
+        stream_executors = {}
+        for x in stream_executor_ids:
+            arguments = kwargs if x == self.last_node.expr.pid else {}
+            stream_executors[x] = StreamExecutor(
+                backend, arguments, default_sampling_para, None, False
+            )
+        for node in self.nodes:
+            se_id = node.expr.pid
+            expr = node.expr
+            if isinstance(expr, SglVariable):
+                # Make a copy for SglVariable
+                expr = SglVariable(expr.name, expr.source)
+                expr.source_stream_executor = stream_executors[
+                    node.source_node.expr.pid
+                ]
+            elif isinstance(expr, SglArgument):
+                # Substitute SglArgument
+                expr = kwargs[expr.name]
+            stream_executors[se_id].submit(expr)
+        for stream_executor in stream_executors.values():
+            stream_executor.end()
+        return ProgramState(stream_executors[self.last_node.expr.pid])
+
+    def run(
+        self,
+        *,
+        max_new_tokens: int = 128,
+        stop: Union[str, List[str]] = (),
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        backend=None,
+        **kwargs,
+    ):
+        backend = backend or global_config.default_backend
+
+        kwargs.update(self.function.bind_arguments)
+
+        default_sampling_para = SglSamplingParams(
+            max_new_tokens=max_new_tokens,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+        )
+
+        return self.run_internal(backend, kwargs, default_sampling_para)
+
+    def run_batch(
+        self,
+        batch_kwargs,
+        *,
+        max_new_tokens: int = 128,
+        stop: Union[str, List[str]] = (),
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        backend=None,
+        num_threads: Union[str, int] = "auto",
+    ):
+        assert isinstance(batch_kwargs, (list, tuple))
+        if len(batch_kwargs) == 0:
+            return []
+        assert isinstance(batch_kwargs[0], dict)
+
+        backend = backend or global_config.default_backend
+
+        default_sampling_para = SglSamplingParams(
+            max_new_tokens=max_new_tokens,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+        )
+
+        # Extract prefix by tracing and cache it
+        if len(batch_kwargs) > 1:
+            cache_program(self.function, backend)
+
+        # Run all programs
+        if num_threads == "auto":
+            num_threads = multiprocessing.cpu_count()
+        num_threads = min(num_threads, len(batch_kwargs))
+
+        if num_threads == 1:
+            rets = []
+            for arguments in batch_kwargs:
+                rets.append(
+                    self.run_internal(backend, arguments, default_sampling_para)
+                )
+        else:
+            with ThreadPoolExecutor(num_threads) as executor:
+                futures = []
+                for arguments in batch_kwargs:
+                    futures.append(
+                        executor.submit(
+                            self.run_internal, backend, arguments, default_sampling_para
+                        )
+                    )
+                rets = [f.result() for f in futures]
+            rets[-1].sync()
+
+        return rets
+
+
+class CompGraphNode:
+    def __init__(
+        self, expr: SglExpr, prev_node=None, next_nodes=None, source_node=None
+    ):
+        self.expr = expr
+        self.next_nodes = next_nodes or []
+        self.prev_node = prev_node
+        self.source_node = source_node
+
+    def add_next_node(self, other):
+        self.next_nodes.append(other)
+
+    def __repr__(self):
+        re = f"stream {self.expr.pid:2d}: "
+        re += f"%{self.expr.node_id} = "
+        if self.prev_node is not None:
+            re += f"%{self.prev_node.expr.node_id} + "
+        re += repr(self.expr)
+        return re
diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py
new file mode 100644
index 000000000..8b8cdf9c5
--- /dev/null
+++ b/python/sglang/lang/interpreter.py
@@ -0,0 +1,1060 @@
+"""The interpreter that executes SGL programs"""
+
+import asyncio
+import contextvars
+import copy
+import multiprocessing
+import queue
+import threading
+import uuid
+import warnings
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, List, Optional
+
+import tqdm
+
+from sglang.global_config import global_config
+from sglang.lang.ir import (
+    SglCommitLazy,
+    SglConcateAndAppend,
+    SglConstantText,
+    SglExpr,
+    SglExprList,
+    SglGen,
+    SglImage,
+    SglRoleBegin,
+    SglRoleEnd,
+    SglSelect,
+    SglSeparateReasoning,
+    SglVariable,
+    SglVarScopeBegin,
+    SglVarScopeEnd,
+    SglVideo,
+)
+from sglang.utils import (
+    encode_image_base64,
+    encode_video_base64,
+    get_exception_traceback,
+)
+
+
+def run_internal(state, program, func_args, func_kwargs, sync):
+    try:
+        state.ret_value = program.func(state, *func_args, **func_kwargs)
+    except Exception as e:
+        raise e
+    finally:
+        state.stream_executor.end()
+
+    if sync:
+        state.stream_executor.sync()
+
+    if global_config.verbosity >= 2:
+        print(state.text())
+
+
+def run_program(
+    program,
+    backend,
+    func_args,
+    func_kwargs,
+    default_sampling_para,
+    stream,
+    sync=False,
+    use_thread=True,
+):
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
+    assert backend is not None, "Please specify a backend"
+    func_kwargs.update(program.bind_arguments)
+    stream_executor = StreamExecutor(
+        backend,
+        func_kwargs,
+        default_sampling_para,
+        chat_template=None,
+        stream=stream,
+        num_api_spec_tokens=program.num_api_spec_tokens,
+        use_thread=use_thread,
+    )
+    state = ProgramState(stream_executor)
+
+    if stream:
+        t = threading.Thread(
+            target=run_internal, args=(state, program, func_args, func_kwargs, sync)
+        )
+        t.start()
+        return state
+    else:
+        run_internal(state, program, func_args, func_kwargs, sync)
+        return state
+
+
+def run_program_batch(
+    program,
+    backend,
+    batch_arguments,
+    default_sampling_para,
+    num_threads,
+    progress_bar,
+    generator_style=False,
+):
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
+
+    # Pre-cache the common prefix for a batch. The prefix is extracted by tracing the program.
+    if global_config.enable_precache_with_tracing and len(batch_arguments) > 1:
+        cache_program(program, backend)
+
+    # Run all programs
+    if num_threads == "auto":
+        num_threads = max(96, multiprocessing.cpu_count() * 16)
+    num_threads = min(num_threads, len(batch_arguments))
+
+    if generator_style:
+        return _run_program_batch_generator(
+            program,
+            backend,
+            batch_arguments,
+            default_sampling_para,
+            num_threads,
+            progress_bar,
+        )
+
+    # Original code path when generator_style=False
+    if num_threads == 1:
+        rets = []
+        if progress_bar:
+            for arguments in tqdm.tqdm(batch_arguments):
+                rets.append(
+                    run_program(
+                        program,
+                        backend,
+                        (),
+                        arguments,
+                        default_sampling_para,
+                        False,
+                        True,
+                    )
+                )
+        else:
+            for arguments in batch_arguments:
+                rets.append(
+                    run_program(
+                        program,
+                        backend,
+                        (),
+                        arguments,
+                        default_sampling_para,
+                        False,
+                        True,
+                    )
+                )
+    else:
+        if progress_bar:
+            pbar = tqdm.tqdm(total=len(batch_arguments))
+
+        with ThreadPoolExecutor(num_threads) as executor:
+            futures = []
+            for arguments in batch_arguments:
+                futures.append(
+                    executor.submit(
+                        run_program,
+                        program,
+                        backend,
+                        (),
+                        arguments,
+                        default_sampling_para,
+                        False,
+                        True,
+                    )
+                )
+                if progress_bar:
+                    futures[-1].add_done_callback(lambda _: pbar.update())
+
+            rets = [f.result() for f in futures]
+        rets[-1].sync()
+
+        if progress_bar:
+            pbar.close()
+
+    return rets
+
+
+def _run_program_batch_generator(
+    program,
+    backend,
+    batch_arguments,
+    default_sampling_para,
+    num_threads,
+    progress_bar,
+):
+    """Helper function that yields results one by one using chunking to avoid overwhelming ThreadPoolExecutor."""
+    if num_threads == 1:
+        iterator = tqdm.tqdm(batch_arguments) if progress_bar else batch_arguments
+        for arguments in iterator:
+            yield run_program(
+                program,
+                backend,
+                (),
+                arguments,
+                default_sampling_para,
+                False,
+                True,
+            )
+    else:
+        pbar = tqdm.tqdm(total=len(batch_arguments)) if progress_bar else None
+
+        # Process in chunks to avoid overwhelming ThreadPoolExecutor
+        # Otherwise, ThreadPoolExecutor.submit will block after adding certain number of tasks
+        # so we will never reach "yield" until all tasks are done
+        chunk_size = 200
+
+        with ThreadPoolExecutor(num_threads) as executor:
+            for chunk_start in range(0, len(batch_arguments), chunk_size):
+                chunk_end = min(chunk_start + chunk_size, len(batch_arguments))
+                chunk_futures = []
+
+                # Submit chunk of tasks
+                for i in range(chunk_start, chunk_end):
+                    future = executor.submit(
+                        run_program,
+                        program,
+                        backend,
+                        (),
+                        batch_arguments[i],
+                        default_sampling_para,
+                        False,
+                        True,
+                    )
+                    if pbar:
+                        future.add_done_callback(lambda _: pbar.update())
+                    chunk_futures.append(future)
+
+                # Yield results from this chunk as they complete
+                for future in chunk_futures:
+                    yield future.result()
+
+        if pbar:
+            pbar.close()
+
+
+def cache_program(program, backend):
+    from sglang.lang.tracer import extract_prefix_by_tracing
+
+    prefix = extract_prefix_by_tracing(program, backend)
+    if prefix and len(prefix) > 64:
+        backend.cache_prefix(prefix)
+
+
+class StreamExecutor:
+    """A stream executor that executes SGL expressions in a background thread."""
+
+    def __init__(
+        self,
+        backend,
+        arguments,
+        default_sampling_para,
+        chat_template,
+        stream,
+        num_api_spec_tokens=None,
+        use_thread=True,
+    ):
+        from sglang.lang.backend.base_backend import BaseBackend
+
+        self.sid = uuid.uuid4().hex
+        self.backend: BaseBackend = backend
+        self.arguments: Dict[str, Any] = arguments
+        self.default_sampling_para = default_sampling_para
+        self.stream = stream
+
+        self.variables = {}  # Dict[name: str -> value: str]
+        self.variable_event = {}  # Dict[name: str -> event: threading.Event]
+        self.meta_info = {}  # Dict[name: str -> info: str]
+        self.is_finished = False
+        self.error_ = None
+
+        # For completion
+        self.text_ = ""  # The full text
+
+        # For chat
+        self.messages_ = []  # The messages in the OpenAI API format
+        self.chat_template = chat_template or self.backend.get_chat_template()
+        self.cur_role = None
+        self.cur_role_begin_pos = None
+
+        # For vision
+        self.images_ = []
+        self.cur_images = []
+
+        # For fork/join
+        self.fork_start_text_pos = None
+
+        # For speculative execution
+        self.num_api_spec_tokens = num_api_spec_tokens
+        self.speculated_text = ""
+
+        # Worker thread
+        self.use_thread = use_thread
+        if self.use_thread:
+            self.queue = queue.Queue()
+
+            def _run_worker_in_context():
+                self._thread_worker_func()
+
+            self.worker = threading.Thread(
+                target=contextvars.copy_context().run, args=(_run_worker_in_context,)
+            )
+            self.worker.start()
+
+        # For streaming
+        if stream:
+            self.stream_text_event = threading.Event()
+            self.stream_var_event = {}
+        else:
+            self.stream_text_event = None
+            self.stream_var_event = None
+
+    def submit(self, expr: SglExpr):
+        self._init_var_event(expr)
+
+        if self.use_thread:
+            self.queue.put(expr)
+        else:
+            self._execute(expr)
+
+    def sync(self):
+        if self.use_thread:
+            self.queue.join()
+
+    def get_var(self, name):
+        if name in self.variable_event:
+            self.variable_event[name].wait()
+        return self.variables[name]
+
+    def set_var(self, name, value):
+        self.variables[name] = value
+
+    def get_meta_info(self, name, timeout=None):
+        if name in self.variable_event:
+            got = self.variable_event[name].wait(timeout)
+            if not got:
+                raise TimeoutError(f"Timeout while waiting for event '{name}'")
+        ret = self.meta_info.get(name, None)
+        return ret
+
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        if size > 1 and str(self.text_):
+            self.submit(SglCommitLazy())
+
+        self.sync()
+        size = int(size)
+
+        exes = [
+            StreamExecutor(
+                self.backend,
+                self.arguments,
+                self.default_sampling_para,
+                self.chat_template,
+                self.stream,
+            )
+            for _ in range(size)
+        ]
+        for i in range(size):
+            exes[i].variables = dict(self.variables)
+            exes[i].text_ = str(self.text_)
+            exes[i].messages_ = list(self.messages_)
+            exes[i].cur_role = self.cur_role
+            exes[i].cur_role_begin_pos = self.cur_role_begin_pos
+            exes[i].fork_start_text_pos = len(self.text_)
+            exes[i].images_ = list(self.images_)
+
+            # TODO(ying): handle API speculative execution
+
+        return exes
+
+    def text(self):
+        self.sync()
+        return self.text_
+
+    def messages(self):
+        self.sync()
+        return self.messages_
+
+    def error(self):
+        self.sync()
+        return self.error_
+
+    def end(self):
+        if self.use_thread:
+            if self.worker.is_alive():
+                self.queue.put(None)
+        self.backend.end_program(self)
+
+    def _thread_worker_func(self):
+        error = None
+
+        while True:
+            expr = self.queue.get()
+            if expr is None:
+                self.queue.task_done()
+                break
+
+            try:
+                self._execute(expr)
+            except Exception as e:
+                warnings.warn(f"Error in stream_executor: {get_exception_traceback()}")
+                error = e
+                break
+            self.queue.task_done()
+            if self.stream_text_event:
+                self.stream_text_event.set()
+
+        # Clean the queue and events
+        if error is not None:
+            try:
+                while True:
+                    self.queue.task_done()
+                    self.queue.get_nowait()
+            except queue.Empty:
+                pass
+            for name in self.variable_event:
+                self.variable_event[name].set()
+            if self.stream_var_event:
+                for name in self.stream_var_event:
+                    self.stream_var_event[name].set()
+            self.error_ = error
+
+        if self.stream_text_event:
+            self.stream_text_event.set()
+
+        self.is_finished = True
+
+    def _execute(self, other):
+        if isinstance(other, str):
+            other = SglConstantText(other)
+
+        assert isinstance(other, SglExpr), f"{other}"
+
+        if isinstance(other, SglConstantText):
+            self._execute_fill(other.value)
+        elif isinstance(other, SglGen):
+            self._execute_gen(other)
+        elif isinstance(other, SglSelect):
+            self._execute_select(other)
+        elif isinstance(other, SglExprList):
+            for x in other.expr_list:
+                self._execute(x)
+        elif isinstance(other, SglRoleBegin):
+            self._execute_role_begin(other)
+        elif isinstance(other, SglRoleEnd):
+            self._execute_role_end(other)
+        elif isinstance(other, SglImage):
+            self._execute_image(other)
+        elif isinstance(other, SglVideo):
+            self._execute_video(other)
+        elif isinstance(other, SglVariable):
+            self._execute_variable(other)
+        elif isinstance(other, SglVarScopeBegin):
+            self._execute_var_scope_begin(other)
+        elif isinstance(other, SglVarScopeEnd):
+            self._execute_var_scope_end(other)
+        elif isinstance(other, SglCommitLazy):
+            self._execute_commit_lazy_operations(other)
+        elif isinstance(other, SglConcateAndAppend):
+            if (
+                global_config.enable_parallel_encoding
+                and self.backend.support_concate_and_append
+            ):
+                self._execute_concatenate_and_append_kv_cache(other)
+            else:
+                self._execute_concatenate_and_append_text(other)
+        elif isinstance(other, SglSeparateReasoning):
+            self._execute_separate_reasoning(other)
+        else:
+            raise ValueError(f"Unknown type: {type(other)}")
+
+    def _execute_fill(self, value: str, prefix=False):
+        value = str(value)
+
+        if (
+            self.cur_role == "assistant"
+            and self.num_api_spec_tokens is not None
+            and self.backend.is_chat_model
+            and not prefix
+        ):
+            self.backend.spec_fill(value)
+            return
+
+        if self.speculated_text.startswith(value):
+            self.speculated_text = self.speculated_text[len(value) :]
+        else:
+            self.speculated_text = ""
+
+        self.text_ += value
+
+    def _execute_image(self, expr: SglImage):
+        path = expr.path
+
+        base64_data = encode_image_base64(path)
+
+        self.images_.append((path, base64_data))
+        self.cur_images.append((path, base64_data))
+        self.text_ += self.chat_template.image_token
+
+    def _execute_video(self, expr: SglVideo):
+        path = expr.path
+        num_frames = expr.num_frames
+
+        base64_data = encode_video_base64(path, num_frames)
+
+        self.images_.append((path, base64_data))
+        self.cur_images.append((path, base64_data))
+        self.text_ += self.chat_template.image_token
+
+    def _spec_gen(self, sampling_params):
+        stop = sampling_params.stop
+        max_new_tokens = sampling_params.max_new_tokens
+        meta_info = {}
+
+        def regen():
+            nonlocal meta_info
+
+            sampling_params.max_new_tokens = max(
+                sampling_params.max_new_tokens, self.num_api_spec_tokens
+            )
+            sampling_params.stop = None
+            self.speculated_text, meta_info = self.backend.generate(
+                self, sampling_params=sampling_params
+            )
+
+        def find_stop():
+            if isinstance(stop, str):
+                return self.speculated_text.find(stop)
+            elif isinstance(stop, (tuple, list)):
+                pos = -1
+                for stop_str in stop:
+                    stop_pos = self.speculated_text.find(stop_str)
+                    if stop_pos != -1 and (pos == -1 or stop_pos < pos):
+                        pos = stop_pos
+                return pos
+            else:
+                raise Exception("Wrong type of stop in sampling parameters.")
+
+        if stop is None:
+            if len(self.speculated_text) < max_new_tokens:
+                regen()
+            comp = self.speculated_text[:max_new_tokens]
+            self.speculated_text = self.speculated_text[max_new_tokens:]
+        elif isinstance(stop, (str, list, tuple)):
+            if self.speculated_text == "":
+                regen()
+            stop_pos = find_stop()
+            if stop_pos == -1:
+                stop_pos = min(
+                    sampling_params.max_new_tokens,
+                    len(self.speculated_text),
+                )
+            comp = self.speculated_text[:stop_pos]
+            self.speculated_text = self.speculated_text[stop_pos:]
+        else:
+            raise ValueError("Wrong type of stop in sampling parameters.")
+
+        return comp, meta_info
+
+    def _execute_gen(self, expr: SglGen):
+        sampling_params = self._resolve_sampling_params(expr.sampling_params)
+        name = expr.name
+        if not self.stream:
+            if self.num_api_spec_tokens is None:
+                comp, meta_info = self.backend.generate(
+                    self,
+                    sampling_params=sampling_params,
+                )
+
+            else:
+                if self.backend.is_chat_model:
+                    # Speculative execution on models with only chat interface.
+                    # Store the calls into a temporary list.
+                    # They will be lazily executed later.
+                    comp, meta_info = self.backend.generate(
+                        self,
+                        sampling_params=sampling_params,
+                        spec_var_name=name,
+                    )
+                    return
+
+                else:  # Speculative execution on models with completion interface
+                    comp, meta_info = self._spec_gen(sampling_params)
+            if isinstance(comp, list):
+                self.text_ += comp[0]
+            else:
+                assert isinstance(comp, str)
+                self.text_ += comp
+
+            self.variables[name] = comp
+            self.meta_info[name] = meta_info
+            self.variable_event[name].set()
+        else:
+            assert (
+                self.num_api_spec_tokens is None
+            ), "stream is not supported with api speculative execution"
+            generator = self.backend.generate_stream(
+                self, sampling_params=sampling_params
+            )
+
+            self.variables[name] = ""
+            self.stream_var_event[name].set()
+
+            for comp, meta_info in generator:
+                self.text_ += comp
+                self.variables[name] += comp
+                self.meta_info[name] = meta_info
+                self.stream_var_event[name].set()
+                self.stream_text_event.set()
+
+            self.variable_event[name].set()
+            self.stream_var_event[name].set()
+
+    def _execute_select(self, expr: SglSelect):
+        choices_decision = self.backend.select(
+            self, expr.choices, expr.temperature, expr.choices_method
+        )
+        if expr.name is not None:
+            name = expr.name
+            self.variables[name] = choices_decision.decision
+            self.meta_info[name] = choices_decision.meta_info
+            self.variable_event[name].set()
+            if self.stream_var_event:
+                self.stream_var_event[name].set()
+        self.text_ += choices_decision.decision
+
+    def _execute_variable(self, expr: SglVariable):
+        src_executor = expr.source_stream_executor
+        value = src_executor.get_var(expr.name)
+        self._execute_fill(value)
+
+    def _execute_role_begin(self, expr: SglRoleBegin):
+        assert self.cur_role is None, "Nested roles are not allowed."
+
+        if len(self.messages_) == 0 and expr.role != "system":
+            # Insert the default system message
+            default_system = self.chat_template.default_system_prompt
+            if default_system:
+                self._execute_role_begin(SglRoleBegin("system"))
+                self._execute_fill(default_system)
+                self._execute_role_end(SglRoleEnd("system"))
+
+        self.cur_role = expr.role
+
+        prefix, _ = self.chat_template.get_prefix_and_suffix(expr.role, self.messages_)
+
+        self._execute_fill(prefix, prefix=True)
+        self.cur_role_begin_pos = len(self.text_)
+
+    def _execute_role_end(self, expr: SglRoleEnd):
+        if (
+            self.cur_role == "assistant"
+            and self.num_api_spec_tokens is not None
+            and self.backend.is_chat_model
+        ):
+            # Execute the stored lazy generation calls
+            self.backend.role_end_generate(self)
+        self.cur_role = None
+
+        new_text = self.text_[self.cur_role_begin_pos :].lstrip()
+
+        _, suffix = self.chat_template.get_prefix_and_suffix(expr.role, self.messages_)
+        self._execute_fill(suffix)
+
+        if self.cur_images:
+            # OpenAI vision API format
+            last_msg = {
+                "role": expr.role,
+                "content": [{"type": "text", "text": new_text}],
+            }
+            for image_path, image_base64_data in self.cur_images:
+                last_msg["content"].append(
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_base64_data}"
+                        },
+                    }
+                )
+            self.messages_.append(last_msg)
+            self.cur_images = []
+        else:
+            # OpenAI chat API format
+            self.messages_.append({"role": expr.role, "content": new_text})
+
+    def _execute_var_scope_begin(self, expr: SglVarScopeBegin):
+        self.variables[expr.name] = int(len(self.text_))
+
+    def _execute_var_scope_end(self, expr: SglVarScopeEnd):
+        self.variables[expr.name] = self.text_[self.variables[expr.name] :]
+        self.variable_event[expr.name].set()
+
+    def _execute_commit_lazy_operations(self, expr: SglCommitLazy):
+        self.backend.commit_lazy_operations(self)
+
+    def _execute_concatenate_and_append_text(self, expr: SglConcateAndAppend):
+        new_text = ""
+        for s in expr.states:
+            exe = s.stream_executor
+            exe.sync()
+            new_text += exe.text_[exe.fork_start_text_pos :]
+
+        self._execute_fill(new_text)
+
+    def _execute_concatenate_and_append_kv_cache(self, expr: SglConcateAndAppend):
+        self_len = len(self.text_)
+
+        for i, s in enumerate(expr.states):
+            exe = s.stream_executor
+            exe.submit(SglCommitLazy())
+
+        for i, s in enumerate(expr.states):
+            exe = s.stream_executor
+            exe.sync()
+            assert exe.fork_start_text_pos == self_len
+            self.text_ += exe.text_[exe.fork_start_text_pos :]
+
+        src_rids = [state.stream_executor.sid for state in expr.states]
+        self.backend.concatenate_and_append(src_rids, self.sid)
+
+    def _execute_separate_reasoning(self, expr: SglSeparateReasoning):
+        if self.stream:
+            # separate reasoning for stream is not supported
+            return
+
+        if (
+            self.cur_role == "assistant"
+            and self.num_api_spec_tokens is not None
+            and self.backend.is_chat_model
+        ):
+            # Execute the stored lazy generation calls
+            self.backend.role_end_generate(self)
+
+        from sglang.srt.parser.reasoning_parser import ReasoningParser
+
+        reasoning_parser = ReasoningParser(expr.model_type)
+        other = expr.expr
+        if not other:
+            return
+        elif isinstance(other, SglGen) or isinstance(other, SglSelect):
+            cur_text = self.get_var(other.name)
+            reasoning, normal_text = reasoning_parser.parse_non_stream(cur_text)
+            reasoning_name = expr.process_name_for_reasoning(other.name)
+            self.set_var(other.name, normal_text)
+            self.set_var(reasoning_name, reasoning)
+            # the variable is ready to be used
+            self.variable_event[reasoning_name].set()
+            self.text_ = self.text_[: self.cur_role_begin_pos] + normal_text
+        elif isinstance(other, SglExprList):
+            for x in other.expr_list:
+                self._execute_separate_reasoning(
+                    SglSeparateReasoning(expr.model_type, x)
+                )
+
+    def _init_var_event(self, expr):
+        if isinstance(
+            expr, (SglGen, SglSelect, SglVarScopeBegin, SglSeparateReasoning)
+        ):
+            self.variable_event[expr.name] = threading.Event()
+            if self.stream:
+                self.stream_var_event[expr.name] = threading.Event()
+        elif isinstance(expr, SglExprList):
+            for e in expr.expr_list:
+                self._init_var_event(e)
+
+    def _resolve_sampling_params(self, sampling_params):
+        """
+        Construct sampling param based on default + override values
+
+        The default values of sampling are populated in `default_sampling_para` via sgl.function.run(...sampling_args)
+        , and `sampling_params` contains the override values from sgl.gen().
+
+        Here we use default_sampling_para as the base and override the values if they exist in `sampling_params`.
+        It also extends the stop tokens based on the chat template.
+        """
+
+        # deepcopy is required because the dict has lists inside
+        clone = copy.deepcopy(self.default_sampling_para)
+
+        for item in [
+            "max_new_tokens",
+            "min_new_tokens",
+            "n",
+            "stop",
+            "stop_token_ids",
+            "temperature",
+            "top_p",
+            "top_k",
+            "min_p",
+            "frequency_penalty",
+            "presence_penalty",
+            "ignore_eos",
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
+            "dtype",
+            "regex",
+            "json_schema",
+        ]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                setattr(clone, item, value)
+
+        if self.chat_template.stop_str:
+            if clone.stop == ():
+                clone.stop = []
+            elif isinstance(clone.stop, str):
+                clone.stop = [clone.stop]
+            clone.stop += self.chat_template.stop_str
+
+        return clone
+
+    def __del__(self):
+        self.end()
+
+
+class ProgramState:
+    """The state of an SGL program."""
+
+    def __init__(self, stream_executor: StreamExecutor):
+        self.stream_executor = stream_executor
+
+    def _role_common(self, name: str, expr: Optional[SglExpr] = None):
+        if expr is not None:
+            role_expr = SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
+            self.stream_executor.submit(role_expr)
+            return role_expr
+        else:
+
+            @contextmanager
+            def role_scope():
+                self.stream_executor.submit(SglRoleBegin(name))
+                yield
+                self.stream_executor.submit(SglRoleEnd(name))
+
+            return role_scope()
+
+    def system(self, expr: Optional[SglExpr] = None):
+        return self._role_common("system", expr)
+
+    def user(self, expr: Optional[SglExpr] = None):
+        return self._role_common("user", expr)
+
+    def assistant(self, expr: Optional[SglExpr] = None):
+        return self._role_common("assistant", expr)
+
+    @contextmanager
+    def var_scope(self, name: str):
+        self.stream_executor.submit(SglVarScopeBegin(name))
+        yield
+        self.stream_executor.submit(SglVarScopeEnd(name))
+
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        stream_executors = self.stream_executor.fork(size, position_ids_offset)
+        states = [ProgramState(x) for x in stream_executors]
+        state_group = ProgramStateGroup(states, self)
+        return state_group
+
+    @contextmanager
+    def copy(self, position_ids_offset: Optional[List[int]] = None):
+        state_group = self.fork(1, position_ids_offset)
+        try:
+            yield state_group[0]
+        finally:
+            state_group.join()
+
+    def text(self):
+        return self.stream_executor.text()
+
+    def messages(self):
+        return self.stream_executor.messages()
+
+    def sync(self):
+        return self.stream_executor.sync()
+
+    def error(self):
+        return self.stream_executor.error()
+
+    def text_iter(self, var_name: Optional[str] = None):
+        if self.stream_executor.stream:
+            prev = 0
+            if var_name is None:
+                event = self.stream_executor.stream_text_event
+                while True:
+                    event.wait()
+                    event.clear()
+                    out = str(self.stream_executor.text_[prev:])
+                    prev += len(out)
+                    if out:
+                        yield out
+                    if self.stream_executor.is_finished:
+                        break
+            else:
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
+                while True:
+                    event.wait()
+                    event.clear()
+                    out = str(self.stream_executor.variables[var_name][prev:])
+                    prev += len(out)
+                    if out:
+                        yield out
+                    if self.stream_executor.variable_event[var_name].is_set():
+                        break
+        else:
+            if var_name is None:
+                yield self.text()
+            else:
+                yield self.get_var(var_name)
+
+    async def text_async_iter(
+        self, var_name: Optional[str] = None, return_meta_data: bool = False
+    ):
+        loop = asyncio.get_running_loop()
+
+        if self.stream_executor.stream:
+            prev = 0
+            if var_name is None:
+                event = self.stream_executor.stream_text_event
+                while True:
+                    await loop.run_in_executor(None, event.wait)
+                    event.clear()
+                    out = str(self.stream_executor.text_[prev:])
+                    prev += len(out)
+                    if out:
+                        yield out
+                    if self.stream_executor.is_finished:
+                        break
+            else:
+                event = None
+                while not event:
+                    if var_name in self.stream_executor.stream_var_event:
+                        event = self.stream_executor.stream_var_event[var_name]
+                    if self.stream_executor.is_finished:
+                        yield ""
+                        return
+
+                while True:
+                    await loop.run_in_executor(None, event.wait)
+                    event.clear()
+                    out = str(self.stream_executor.variables[var_name][prev:])
+                    prev += len(out)
+                    if out:
+                        if return_meta_data:
+                            yield out, self.stream_executor.meta_info[var_name]
+                        else:
+                            yield out
+                    if self.stream_executor.variable_event[var_name].is_set():
+                        break
+        else:
+            if var_name is None:
+                yield self.text()
+            else:
+                yield self.get_var(var_name)
+
+    def get_var(self, name):
+        return self.stream_executor.get_var(name)
+
+    def set_var(self, name, value):
+        return self.stream_executor.set_var(name, value)
+
+    def get_meta_info(self, name):
+        return self.stream_executor.get_meta_info(name)
+
+    def __iadd__(self, other):
+        if other is None:
+            raise ValueError("Tried to append None to state.")
+        self.stream_executor.submit(other)
+        return self
+
+    def __getitem__(self, name):
+        return self.get_var(name)
+
+    def __setitem__(self, name, value):
+        self.set_var(name, value)
+
+    def __contains__(self, name):
+        return name in self.stream_executor.variables
+
+    def __del__(self):
+        self.stream_executor.end()
+
+    def __repr__(self) -> str:
+        return f"ProgramState({self.text()})"
+
+
+class ProgramStateGroup:
+    def __init__(
+        self, states: List[ProgramState], src_state: Optional[ProgramState] = None
+    ):
+        self.states = states
+        self.src_state = src_state
+
+    def join(self, mode: str = "gather_variable"):
+        if mode == "gather_variable":
+            # Copy variables back
+            src_vars = self.src_state.stream_executor.variables
+            src_var_set = set(src_vars.keys())
+            for child_state in self.states:
+                child_state.stream_executor.sync()
+                child_vars = child_state.stream_executor.variables
+                new_vars = set(child_vars.keys()) - src_var_set
+
+                for k in new_vars:
+                    if k in src_vars:
+                        src_vars[k].append(child_vars[k])
+                    else:
+                        src_vars[k] = [child_vars[k]]
+        elif mode == "concate_and_append":
+            # Concatenate and append KV cache
+            self.src_state += SglConcateAndAppend(self.states)
+            # Need a sync here. Otherwise, `states` can be deleted.
+            self.src_state.stream_executor.sync()
+        else:
+            raise ValueError(f"Invalid join mode: {mode}")
+
+        for s in self.states:
+            s.stream_executor.end()
+
+    def __getitem__(self, i: int):
+        return self.states[i]
+
+    def __setitem__(self, i: int, value):
+        assert self.states[i] == value
+
+    def __iadd__(self, other):
+        if isinstance(other, Callable):
+            # lambda function
+            for i in range(len(self.states)):
+                self.states[i] += other(i)
+        elif isinstance(other, SglExpr):
+            for i in range(len(self.states)):
+                self.states[i] += other
+        elif isinstance(other, (list, tuple)):
+            for i in range(len(self.states)):
+                self.states[i] += other[i]
+        else:
+            raise ValueError(f"Invalid value: {other}")
+
+        return self
diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py
new file mode 100644
index 000000000..531705ebe
--- /dev/null
+++ b/python/sglang/lang/ir.py
@@ -0,0 +1,635 @@
+"""The intermediate representation."""
+
+import dataclasses
+import inspect
+import warnings
+from typing import List, Optional, Union
+
+from sglang.global_config import global_config
+from sglang.lang.choices import ChoicesSamplingMethod
+
+REGEX_INT = r"[-+]?[0-9]+[ \n]*"
+REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+[ \n]*"
+REGEX_BOOL = r"(True|False)"
+REGEX_STR = r"\"[\w\d\s]*\""  # bugs with regex r"\".*\"" in interegular pkg
+
+
+@dataclasses.dataclass
+class SglSamplingParams:
+    max_new_tokens: int = 128
+    min_new_tokens: int = 0
+    n: int = 1
+    stop: Union[str, List[str]] = ()
+    stop_token_ids: Optional[List[int]] = ()
+    temperature: float = 1.0
+    top_p: float = 1.0
+    top_k: int = -1  # -1 means disable
+    min_p: float = 0.0
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    ignore_eos: bool = False
+    return_logprob: Optional[bool] = None
+    logprob_start_len: Optional[int] = (None,)
+    top_logprobs_num: Optional[int] = (None,)
+    return_text_in_logprobs: Optional[bool] = (None,)
+    json_schema: Optional[str] = None
+
+    # for constrained generation, not included in to_xxx_kwargs
+    dtype: Optional[str] = None
+    regex: Optional[str] = None
+
+    def clone(self):
+        return SglSamplingParams(
+            self.max_new_tokens,
+            self.min_new_tokens,
+            self.n,
+            self.stop,
+            self.stop_token_ids,
+            self.temperature,
+            self.top_p,
+            self.top_k,
+            self.min_p,
+            self.frequency_penalty,
+            self.presence_penalty,
+            self.ignore_eos,
+            self.return_logprob,
+            self.logprob_start_len,
+            self.top_logprobs_num,
+            self.return_text_in_logprobs,
+            self.json_schema,
+        )
+
+    def to_openai_kwargs(self):
+        # OpenAI does not support top_k, so we drop it here
+        if self.regex is not None:
+            warnings.warn("Regular expression is not supported in the OpenAI backend.")
+        return {
+            "max_tokens": self.max_new_tokens,
+            "max_completion_tokens": self.max_new_tokens,
+            "n": self.n,
+            "stop": self.stop or None,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+        }
+
+    def to_vertexai_kwargs(self):
+        if self.regex is not None:
+            warnings.warn(
+                "Regular expression is not supported in the VertexAI backend."
+            )
+        return {
+            "candidate_count": 1,
+            "max_output_tokens": self.max_new_tokens,
+            "stop_sequences": self.stop,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k if self.top_k > 0 else None,
+        }
+
+    def to_anthropic_kwargs(self):
+        # Anthropic does not support frequency_penalty or presence_penalty, so we drop it here
+        if self.regex is not None:
+            warnings.warn(
+                "Regular expression is not supported in the Anthropic backend."
+            )
+        return {
+            "max_tokens": self.max_new_tokens,
+            "stop_sequences": (
+                self.stop if isinstance(self.stop, (list, tuple)) else [self.stop]
+            ),
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k,
+        }
+
+    def to_litellm_kwargs(self):
+        if self.regex is not None:
+            warnings.warn("Regular expression is not supported in the LiteLLM backend.")
+        return {
+            "max_tokens": self.max_new_tokens,
+            "stop": self.stop or None,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+        }
+
+    def to_srt_kwargs(self):
+        return {
+            "max_new_tokens": self.max_new_tokens,
+            "min_new_tokens": self.min_new_tokens,
+            "n": self.n,
+            "stop": self.stop,
+            "stop_token_ids": self.stop_token_ids,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "ignore_eos": self.ignore_eos,
+            "regex": self.regex,
+            "json_schema": self.json_schema,
+        }
+
+
+class SglFunction:
+    def __init__(self, func, num_api_spec_tokens=None, bind_arguments=None):
+        self.func = func
+        self.num_api_spec_tokens = num_api_spec_tokens
+        self.bind_arguments = bind_arguments or {}
+        self.pin_prefix_rid = None
+
+        # Parse arguments
+        argspec = inspect.getfullargspec(func)
+        assert argspec.args[0] == "s", 'The first argument must be "s"'
+        self.arg_names = argspec.args[1:]
+        self.arg_defaults = argspec.defaults if argspec.defaults is not None else []
+
+    def bind(self, **kwargs):
+        assert all(key in self.arg_names for key in kwargs)
+
+        new_bind_dict = {**self.bind_arguments, **kwargs}
+        return SglFunction(self.func, bind_arguments=new_bind_dict)
+
+    def run(
+        self,
+        *args,
+        max_new_tokens: int = 128,
+        n: int = 1,
+        stop: Optional[Union[str, List[str]]] = None,
+        stop_token_ids: Optional[List[int]] = None,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        ignore_eos: bool = False,
+        return_logprob: Optional[bool] = None,
+        logprob_start_len: Optional[int] = None,
+        top_logprobs_num: Optional[int] = None,
+        return_text_in_logprobs: Optional[bool] = None,
+        stream: bool = False,
+        backend=None,
+        use_thread: bool = True,
+        **kwargs,
+    ):
+        from sglang.lang.interpreter import run_program
+
+        # avoid using [] as the default arg: https://nikos7am.com/posts/mutable-default-arguments/
+        if stop is None:
+            stop = []
+        if stop_token_ids is None:
+            stop_token_ids = []
+
+        default_sampling_para = SglSamplingParams(
+            max_new_tokens=max_new_tokens,
+            n=n,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            ignore_eos=ignore_eos,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            return_text_in_logprobs=return_text_in_logprobs,
+        )
+        backend = backend or global_config.default_backend
+        return run_program(
+            self,
+            backend,
+            args,
+            kwargs,
+            default_sampling_para,
+            stream,
+            use_thread=use_thread,
+        )
+
+    def run_batch(
+        self,
+        batch_kwargs,
+        *,
+        max_new_tokens: int = 128,
+        n: int = 1,
+        stop: Optional[Union[str, List[str]]] = None,
+        stop_token_ids: Optional[List[int]] = None,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        ignore_eos: bool = False,
+        return_logprob: Optional[bool] = None,
+        logprob_start_len: Optional[int] = None,
+        top_logprobs_num: Optional[int] = None,
+        return_text_in_logprobs: Optional[bool] = None,
+        backend=None,
+        num_threads: Union[str, int] = "auto",
+        progress_bar: bool = False,
+        generator_style: bool = False,
+    ):
+        from sglang.lang.interpreter import run_program_batch
+
+        if stop is None:
+            stop = []
+        if stop_token_ids is None:
+            stop_token_ids = []
+
+        assert isinstance(batch_kwargs, (list, tuple))
+        if len(batch_kwargs) == 0:
+            return []
+        if not isinstance(batch_kwargs[0], dict):
+            num_programs = len(batch_kwargs)
+            # change the list of argument values to dict of arg_name -> arg_value
+            batch_kwargs = [
+                {self.arg_names[i]: v for i, v in enumerate(arg_values)}
+                for arg_values in batch_kwargs
+                if isinstance(arg_values, (list, tuple))
+                and len(self.arg_names) - len(self.arg_defaults)
+                <= len(arg_values)
+                <= len(self.arg_names)
+            ]
+            # Ensure to raise an exception if the number of arguments mismatch
+            if len(batch_kwargs) != num_programs:
+                raise Exception("Given arguments mismatch the SGL function signature")
+
+        default_sampling_para = SglSamplingParams(
+            max_new_tokens=max_new_tokens,
+            n=n,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            ignore_eos=ignore_eos,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            return_text_in_logprobs=return_text_in_logprobs,
+        )
+        backend = backend or global_config.default_backend
+        return run_program_batch(
+            self,
+            backend,
+            batch_kwargs,
+            default_sampling_para,
+            num_threads,
+            progress_bar,
+            generator_style=generator_style,
+        )
+
+    def trace(self, *, backend=None, **kwargs):
+        from sglang.lang.tracer import trace_program
+
+        backend = backend or global_config.default_backend
+        return trace_program(self, kwargs, backend)
+
+    def cache(self, backend=None):
+        from sglang.lang.interpreter import cache_program
+
+        backend = backend or global_config.default_backend
+        return cache_program(self, backend)
+
+    def compile(self, *, backend=None):
+        from sglang.lang.compiler import compile_func
+
+        return compile_func(self, backend)
+
+    def __call__(self, *args, **kwargs):
+        from sglang.lang.tracer import TracingScope
+
+        tracing_scope = TracingScope.get_current_scope()
+        if tracing_scope is None:
+            return self.run(*args, **kwargs)
+        else:
+            kwargs["backend"] = tracing_scope.tracer_state.backend
+            return self.trace(*args, **kwargs)
+
+
+class SglExpr:
+    node_ct = 0
+
+    def __init__(self):
+        self.node_id = SglExpr.node_ct
+        self.prev_node = None
+        self.pid = None
+        SglExpr.node_ct += 1
+
+    def __add__(self, other):
+        if isinstance(other, str):
+            other = SglConstantText(other)
+        assert isinstance(other, SglExpr)
+
+        return self.concatenate_ir(self, other)
+
+    def __radd__(self, other):
+        if isinstance(other, str):
+            other = SglConstantText(other)
+        assert isinstance(other, SglExpr), f"{other}"
+
+        return self.concatenate_ir(other, self)
+
+    def concatenate_ir(self, a, b):
+        if isinstance(a, SglExprList):
+            if isinstance(b, SglExprList):
+                return SglExprList(a.expr_list + b.expr_list)
+            else:
+                return SglExprList(a.expr_list + [b])
+        elif isinstance(b, SglExprList):
+            return SglExprList([a] + b.expr_list)
+
+        return SglExprList([a, b])
+
+    def print_graph_dfs(self):
+        ret = [""]
+        visited = set()
+
+        def dfs_print(x):
+            if x is None or x in visited:
+                return
+            visited.add(x)
+
+            # Print dependency
+            if x.prev_node is not None:
+                dfs_print(x.prev_node)
+
+            if isinstance(x, SglExprList):
+                for y in x.expr_list:
+                    dfs_print(y)
+            # elif isinstance(x, SglRole):
+            #    dfs_print(x.expr)
+            elif isinstance(x, SglVariable):
+                dfs_print(x.source)
+
+            # Print the node itself
+            if isinstance(x, (SglFork, SglGetForkItem)):
+                ret[0] += f"%{x.node_id} = {x}\n"
+            else:
+                if x.prev_node is not None:
+                    ret[0] += (
+                        f"%{x.node_id} = %{x.prev_node.node_id} + " + str(x) + "\n"
+                    )
+                else:
+                    ret[0] += f"%{x.node_id} = " + str(x) + "\n"
+
+        dfs_print(self)
+        return ret[0]
+
+
+class SglExprList(SglExpr):
+    def __init__(self, expr_list: List[SglExpr]):
+        super().__init__()
+        self.expr_list = expr_list
+
+    def __repr__(self):
+        return f"ExprList({self.expr_list})"
+
+
+class SglArgument(SglExpr):
+    def __init__(self, name: str, value: str):
+        super().__init__()
+        self.name = name
+        self.value = value
+
+    def __repr__(self):
+        return f"Argument(name={self.name}, value={repr(self.value)})"
+
+    def __len__(self):
+        return len(self.value)
+
+    def __getitem__(self, i):
+        return self.value[i]
+
+    def __int__(self):
+        return self.value
+
+    def __bool__(self):
+        return self.value
+
+    def __format__(self, *args):
+        raise TypeError(
+            "Cannot put argument inside a f-string. "
+            "This is not compatible with the tracer. "
+        )
+
+
+class SglImage(SglExpr):
+    def __init__(self, path: str):
+        self.path = path
+
+    def __repr__(self) -> str:
+        return f"SglImage({self.path})"
+
+
+class SglVideo(SglExpr):
+    def __init__(self, path: str, num_frames: int):
+        self.path = path
+        self.num_frames = num_frames
+
+    def __repr__(self) -> str:
+        return f"SglVideo({self.path}, {self.num_frames})"
+
+
+class SglGen(SglExpr):
+    def __init__(
+        self,
+        name: Optional[str] = None,
+        max_new_tokens: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stop_token_ids: Optional[List[int]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        top_k: Optional[int] = None,
+        min_p: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
+        presence_penalty: Optional[float] = None,
+        ignore_eos: Optional[bool] = None,
+        return_logprob: Optional[bool] = None,
+        logprob_start_len: Optional[int] = None,
+        top_logprobs_num: Optional[int] = None,
+        return_text_in_logprobs: Optional[bool] = None,
+        dtype: Optional[type] = None,
+        regex: Optional[str] = None,
+        json_schema: Optional[str] = None,
+    ):
+        """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
+        super().__init__()
+        self.name = name
+        self.sampling_params = SglSamplingParams(
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,
+            n=n,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            ignore_eos=ignore_eos,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            return_text_in_logprobs=return_text_in_logprobs,
+            dtype=dtype,
+            regex=regex,
+            json_schema=json_schema,
+        )
+
+    def __repr__(self):
+        return f"Gen('{self.name}')"
+
+
+class SglConstantText(SglExpr):
+    def __init__(self, value: str):
+        super().__init__()
+        self.value = value
+
+    def __repr__(self):
+        return f"Constant({repr(self.value)})"
+
+
+class SglRoleBegin(SglExpr):
+    def __init__(self, role: str):
+        super().__init__()
+        self.role = role
+
+    def __repr__(self):
+        return f"RoleBegin({self.role})"
+
+
+class SglRoleEnd(SglExpr):
+    def __init__(self, role: str):
+        super().__init__()
+        self.role = role
+
+    def __repr__(self):
+        return f"RoleEnd({self.role})"
+
+
+class SglSelect(SglExpr):
+
+    def __init__(
+        self,
+        name: str,
+        choices: List[str],
+        temperature: float,
+        choices_method: ChoicesSamplingMethod,
+    ):
+        super().__init__()
+        self.name = name
+        self.choices = choices
+        self.temperature = temperature
+        self.choices_method = choices_method
+
+    def __repr__(self):
+        return f"Select({self.name}, choices={self.choices}, choices_method={self.choices_method})"
+
+
+class SglFork(SglExpr):
+    def __init__(self, number: int, position_ids_offset=None):
+        super().__init__()
+        self.number = number
+        self.position_ids_offset = position_ids_offset
+
+    def __repr__(self):
+        return (
+            f"Fork(%{self.prev_node.node_id}, number={self.number}, "
+            f"position_ids_offset={self.position_ids_offset})"
+        )
+
+
+class SglGetForkItem(SglExpr):
+    def __init__(self, index: int):
+        super().__init__()
+        self.index = index
+
+    def __repr__(self):
+        return f"GetForkItem(%{self.prev_node.node_id}, index={self.index})"
+
+
+class SglVariable(SglExpr):
+    def __init__(self, name: str, source):
+        super().__init__()
+        self.name = name
+        self.source = source
+
+    def __repr__(self):
+        return f"Variable('{self.name}', source=%{self.source.node_id})"
+
+
+class SglVarScopeBegin(SglExpr):
+    def __init__(self, name: str):
+        super().__init__()
+        self.name = name
+
+    def __repr__(self):
+        return f"VarScopeBegin('{self.name}')"
+
+
+class SglVarScopeEnd(SglExpr):
+    def __init__(self, name: str):
+        super().__init__()
+        self.name = name
+
+    def __repr__(self):
+        return f"VarScopeEnd('{self.name}')"
+
+
+class SglConcateAndAppend(SglExpr):
+    def __init__(self, states):
+        super().__init__()
+        self.states = states
+
+    def __repr__(self):
+        return f"ConcatenateAndAppend('{self.states}')"
+
+
+class SglCommitLazy(SglExpr):
+    def __init__(self):
+        super().__init__()
+
+    def __repr__(self):
+        return "CommitLazy()"
+
+
+class SglSeparateReasoning(SglExpr):
+    def __init__(self, model_type: str, expr: SglExpr):
+        super().__init__()
+        self.model_type = model_type
+
+        self.expr = expr
+        self.name = None
+        self._process_expr(expr)
+
+    def process_name_for_reasoning(self, name):
+        if not name:
+            raise ValueError("name must be provided")
+        return f"{name}_reasoning_content"
+
+    def _process_expr(self, expr):
+        if isinstance(expr, SglGen):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglSelect):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglExprList):
+            for x in expr.expr_list:
+                self._process_expr(x)
+
+    def __repr__(self):
+        return f"SeparateReasoning(model_type={self.model_type}, name={self.name})"
diff --git a/python/sglang/lang/tracer.py b/python/sglang/lang/tracer.py
new file mode 100644
index 000000000..0a2a744f9
--- /dev/null
+++ b/python/sglang/lang/tracer.py
@@ -0,0 +1,279 @@
+"""Tracing a program."""
+
+import uuid
+from typing import Any, Dict, List, Optional
+
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.interpreter import ProgramState, ProgramStateGroup
+from sglang.lang.ir import (
+    SglArgument,
+    SglConstantText,
+    SglExpr,
+    SglExprList,
+    SglFork,
+    SglGen,
+    SglGetForkItem,
+    SglRoleBegin,
+    SglRoleEnd,
+    SglSelect,
+    SglVariable,
+    SglVarScopeBegin,
+    SglVarScopeEnd,
+)
+
+
+class StopTracing(Exception):
+    pass
+
+
+def extract_prefix_by_tracing(program, backend):
+    # Create dummy arguments
+    dummy_arguments = {name: SglArgument(name, None) for name in program.arg_names}
+    arguments = dummy_arguments
+    arguments.update(program.bind_arguments)
+
+    # Trace
+    tracer = TracerProgramState(backend, arguments, only_trace_prefix=True)
+    try:
+        with TracingScope(tracer):
+            tracer.ret_value = program.func(tracer, **arguments)
+    except (StopTracing, TypeError, AttributeError):
+        # Some exceptions may not be caught
+        pass
+
+    # Run and cache prefix
+    prefix = ""
+    for expr in tracer.flatten_nodes():
+        if isinstance(expr, SglConstantText):
+            prefix += expr.value
+        else:
+            break
+    return prefix
+
+
+def trace_program(program, arguments, backend):
+    # Create dummy backend
+    if backend is None:
+        backend = BaseBackend()
+
+    # Create dummy arguments
+    dummy_arguments = {
+        name: SglArgument(name, None)
+        for name in program.arg_names
+        if name not in arguments
+    }
+    arguments.update(dummy_arguments)
+    arguments.update(program.bind_arguments)
+
+    # Trace
+    tracer = TracerProgramState(backend, arguments, only_trace_prefix=False)
+    with TracingScope(tracer):
+        tracer.ret_value = program.func(tracer, **arguments)
+    return tracer
+
+
+class TracerProgramState(ProgramState):
+    def __init__(self, backend, arguments, only_trace_prefix):
+        self.pid = uuid.uuid4().hex
+        self.backend = backend
+        self.arguments: Dict[str, Any] = arguments
+        self.only_trace_prefix = only_trace_prefix
+
+        if hasattr(backend, "endpoint"):
+            self.backend = backend.endpoint
+
+        self.nodes = []
+        self.last_node = None
+        self.variables = {}
+        self.ret_value = None
+
+        # For completion
+
+        # For chat
+        self.messages_ = []
+        self.cur_role = None
+        self.chat_template = self.backend.get_chat_template()
+
+        # For multi states
+        self.child_states = []
+
+        cur_scope = TracingScope.get_current_scope()
+        if cur_scope is not None:
+            cur_scope.add_child_state(self)
+
+    ##################################
+    ########### Public API ###########
+    ##################################
+
+    def fork(self, size: int = 1, position_ids_offset: Optional[List[int]] = None):
+        assert size >= 1
+
+        if self.only_trace_prefix:
+            raise StopTracing()
+
+        fork_node = SglFork(size)
+        fork_node.prev_node = self.last_node
+
+        states = [
+            TracerProgramState(self.backend, self.arguments, self.only_trace_prefix)
+            for _ in range(size)
+        ]
+
+        for i in range(size):
+            node = SglGetForkItem(i)
+            node.prev_node = fork_node
+            states[i].last_node = node
+            states[i].variables = dict(self.variables)
+            states[i].messages_ = list(self.messages_)
+            states[i].cur_role = self.cur_role
+            states[i].chat_template = self.chat_template
+
+        state_group = ProgramStateGroup(states, self)
+
+        return state_group
+
+    ##################################
+    ########## Internal API ##########
+    ##################################
+
+    def _append_node(self, other: SglExpr):
+        self.nodes.append(other)
+        other.prev_node = self.last_node
+        self.last_node = other
+
+    def _execute(self, other: SglExpr):
+        if isinstance(other, str):
+            other = SglConstantText(other)
+
+        other.pid = self.pid
+
+        if isinstance(other, SglConstantText):
+            self._execute_fill(other)
+        elif isinstance(other, SglGen):
+            self._execute_gen(other)
+        elif isinstance(other, SglSelect):
+            self._execute_select(other)
+        elif isinstance(other, SglExprList):
+            for x in other.expr_list:
+                self._execute(x)
+        elif isinstance(other, SglRoleBegin):
+            self._execute_role_begin(other)
+        elif isinstance(other, SglRoleEnd):
+            self._execute_role_end(other)
+        elif isinstance(other, SglVarScopeBegin):
+            self._execute_var_scope_begin(other)
+        elif isinstance(other, SglVarScopeEnd):
+            self._execute_var_scope_end(other)
+        else:
+            if self.only_trace_prefix:
+                raise StopTracing()
+            else:
+                self._append_node(other)
+
+        return self
+
+    def __iadd__(self, other):
+        self._execute(other)
+        return self
+
+    def _execute_fill(self, expr: SglConstantText):
+        if isinstance(expr, str):
+            expr = SglConstantText(expr)
+        self._append_node(expr)
+
+    def _execute_gen(self, expr: SglGen):
+        name = expr.name if expr.name is not None else "gen_" + str(len(self.variables))
+        new_node = SglVariable(name, source=expr)
+        self.variables[name] = new_node
+        self._append_node(expr)
+
+    def _execute_select(self, expr: SglSelect):
+        name = (
+            expr.name if expr.name is not None else "select_" + str(len(self.variables))
+        )
+        new_node = SglVariable(name, source=expr)
+        self.variables[name] = new_node
+        self._append_node(expr)
+
+    def _execute_role_begin(self, expr: SglRoleBegin):
+        assert self.cur_role is None, "Nested roles are not allowed."
+
+        if len(self.messages_) == 0 and expr.role != "system":
+            # Insert default system message
+            default_system = self.chat_template.default_system_prompt
+            if default_system:
+                self._execute_role_begin(SglRoleBegin("system"))
+                self._execute_fill(default_system)
+                self._execute_role_end(SglRoleEnd("system"))
+
+        self.cur_role = expr.role
+
+        prefix, suffix = self.chat_template.get_prefix_and_suffix(
+            expr.role, self.messages_
+        )
+
+        self._execute_fill(prefix)
+
+    def _execute_role_end(self, expr: SglRoleEnd):
+        prefix, suffix = self.chat_template.get_prefix_and_suffix(
+            expr.role, self.messages_
+        )
+
+        self._execute_fill(suffix)
+
+        self.messages_.append({"role": expr.role, "content": ""})
+
+        self.cur_role = None
+
+    def _execute_var_scope_end(self, expr: SglVarScopeEnd):
+        new_node = SglVariable(expr.name, source=self.last_node)
+        self.variables[expr.name] = new_node
+
+    def get_var(self, name):
+        ret = self.arguments.get(name, None)
+        if ret is not None:
+            return ret
+
+        v = self.variables[name]
+        return SglVariable(v.name, v.source)
+
+    def flatten_nodes(self):
+        def traverse(cur):
+            if isinstance(cur, SglExprList):
+                for child in cur.expr_list:
+                    traverse(child)
+            else:
+                ret.append(cur)
+
+        ret = []
+        for x in self.nodes:
+            traverse(x)
+        return ret
+
+    def __del__(self):
+        pass
+
+
+class TracingScope:
+    cur_scope = None
+
+    def __init__(self, tracer_state: TracerProgramState):
+        self.tracer_state = tracer_state
+        self.last_scope = TracingScope.cur_scope
+
+    def __enter__(self):
+        TracingScope.cur_scope = self
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        TracingScope.cur_scope = self.last_scope
+
+    @staticmethod
+    def get_current_scope():
+        return TracingScope.cur_scope
+
+    def add_child_state(self, state: TracerProgramState):
+        cur_scope = self
+        while cur_scope is not None:
+            cur_scope.tracer_state.child_states.append(state)
+            cur_scope = cur_scope.last_scope
diff --git a/python/sglang/launch_server.py b/python/sglang/launch_server.py
new file mode 100644
index 000000000..caae7b0f6
--- /dev/null
+++ b/python/sglang/launch_server.py
@@ -0,0 +1,16 @@
+"""Launch the inference server."""
+
+import os
+import sys
+
+from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.server_args import prepare_server_args
+from sglang.srt.utils import kill_process_tree
+
+if __name__ == "__main__":
+    server_args = prepare_server_args(sys.argv[1:])
+
+    try:
+        launch_server(server_args)
+    finally:
+        kill_process_tree(os.getpid(), include_parent=False)
diff --git a/python/sglang/profiler.py b/python/sglang/profiler.py
new file mode 100644
index 000000000..d872ca320
--- /dev/null
+++ b/python/sglang/profiler.py
@@ -0,0 +1,166 @@
+"""
+Run live profiling.
+
+Usage:
+python3 -m sglang.profiler
+"""
+
+import argparse
+import json
+import os
+import time
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import List, Optional
+
+import requests
+
+PARENT_FOLDER = "/tmp/sglang-profile"
+
+
+def _run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+) -> str:
+    if output_dir is None:
+        output_dir = PARENT_FOLDER
+
+    output_dir = os.path.normpath(output_dir)
+    output_dir = os.path.abspath(output_dir)
+    output_dir = Path(output_dir)
+
+    # Add "profile_name/timestamp" to the path.
+    if profile_name:
+        output_dir = output_dir / profile_name
+    output_dir = output_dir / str(time.time())
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    print(f"Dump profiling traces to {output_dir}")
+    print(
+        f"Waiting for {num_steps} steps and the trace to be flushed.... ({profile_by_stage=})"
+    )
+
+    # Dump server args.
+    file_path = Path(output_dir) / "server_args.json"
+    if not file_path.exists():
+        response = requests.get(url + "/get_server_info")
+        response.raise_for_status()
+        server_args_data = response.json()
+        with open(file_path, "w") as file:
+            file.write(json.dumps(server_args_data))
+
+    # Start profiler. The API replies when all steps are processed
+    # and files are generated.
+    json_data = {
+        "output_dir": str(output_dir),
+        "num_steps": str(num_steps),
+        "activities": activities,
+        "profile_by_stage": profile_by_stage,
+    }
+
+    response = requests.post(url=url + "/start_profile", json=json_data)
+    response.raise_for_status()
+
+    trace_link = str(output_dir)
+    return trace_link
+
+
+def run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+):
+    # step based profile will self terminate on num_steps constraints
+    link = _run_profile(
+        url, num_steps, activities, output_dir, profile_name, profile_by_stage
+    )
+    return link
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:30000",
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Profile directory to dump profile traces.",
+    )
+    parser.add_argument(
+        "--profile-name",
+        type=str,
+        default=None,
+        help="The name of this profile run.",
+    )
+    parser.add_argument(
+        "--num-steps",
+        type=int,
+        default=5,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--profile-by-stage",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--cpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile CPU activity",
+    )
+    parser.add_argument(
+        "--gpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile GPU activity",
+    )
+    parser.add_argument(
+        "--mem",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to memory usage (https://pytorch.org/memory_viz)",
+    )
+    parser.add_argument(
+        "--rpd",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)",
+    )
+
+    args = parser.parse_args()
+    activities = []
+    if args.cpu:
+        activities.append("CPU")
+    if args.gpu:
+        activities.append("GPU")
+    if args.mem:
+        activities.append("MEM")
+    if args.rpd:
+        activities.append("RPD")
+    run_profile(
+        args.url,
+        args.num_steps,
+        activities,
+        args.output_dir,
+        args.profile_name,
+        args.profile_by_stage,
+    )
diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
new file mode 100644
index 000000000..5ed175312
--- /dev/null
+++ b/python/sglang/srt/_custom_ops.py
@@ -0,0 +1,177 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
+import logging
+from typing import List, Optional, Tuple
+
+import torch
+
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu, is_npu
+
+logger = logging.getLogger(__name__)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
+)
+
+if not is_hpu():
+    # ROCm does not use vllm custom allreduce
+    if use_vllm_custom_allreduce and not is_hip():
+        try:
+            import vllm._C
+        except ImportError as e:
+            logger.warning("Failed to import from vllm._C with %r", e)
+    else:
+        try:
+            import sgl_kernel
+        except ImportError as e:
+            logger.warning("Failed to import from custom_ar with %r", e)
+
+
+if not is_hip() and not is_npu():
+    if use_vllm_custom_allreduce:
+        custom_op = torch.ops._C_custom_ar
+    else:
+        custom_op = sgl_kernel.allreduce
+
+    # custom allreduce
+    def init_custom_ar(
+        ipc_tensors: List[torch.Tensor],
+        rank_data: torch.Tensor,
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return custom_op.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink)
+
+    def all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        reg_buffer: int,
+        reg_buffer_sz_bytes: int,
+    ) -> None:
+        custom_op.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+
+    def dispose(fa: int) -> None:
+        custom_op.dispose(fa)
+
+    def meta_size() -> int:
+        return custom_op.meta_size()
+
+    def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+        return custom_op.register_buffer(fa, ipc_tensors)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+        return custom_op.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[List[int]], offsets: List[List[int]]
+    ) -> None:
+        custom_op.register_graph_buffers(fa, handles, offsets)
+
+else:
+    # ROCM custom allreduce
+
+    def init_custom_ar(
+        meta: torch.Tensor,
+        rank_data: torch.Tensor,
+        handles: List[str],
+        offsets: List[int],
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_ar(
+            meta, rank_data, handles, offsets, rank, full_nvlink
+        )
+
+    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
+
+    def all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+
+    def dispose(fa: int) -> None:
+        sgl_kernel.allreduce.dispose(fa)
+
+    def meta_size() -> int:
+        return sgl_kernel.allreduce.meta_size()
+
+    def register_buffer(
+        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+    ) -> None:
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[str], offsets: List[List[int]]
+    ) -> None:
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
+
+    def allocate_meta_buffer(size: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
+
+    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+
+    # ROCM custom quick allreduce
+
+    def init_custom_qr(
+        rank: int, world_size: int, qr_max_size: Optional[int] = None
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_qr(world_size, rank, qr_max_size)
+
+    def qr_get_handle(fa: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.qr_get_handle(fa)
+
+    def qr_open_handles(fa: int, handles: list[torch.Tensor]) -> None:
+        sgl_kernel.allreduce.qr_open_handles(fa, handles)
+
+    def qr_all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        quant_level: int,
+        cast_bf2half: bool,
+    ) -> None:
+        sgl_kernel.allreduce.qr_all_reduce(fa, inp, out, quant_level, cast_bf2half)
+
+    def qr_destroy(fa: int) -> None:
+        sgl_kernel.allreduce.qr_destroy(fa)
+
+    def qr_max_size() -> int:
+        return sgl_kernel.allreduce.qr_max_size()
+
+
+def mscclpp_generate_unique_id() -> bytes:
+    return sgl_kernel.allreduce.mscclpp_generate_unique_id()
+
+
+def mscclpp_init_context(
+    unique_id: bytes,
+    rank: int,
+    world_size: int,
+    scratch: torch.Tensor,
+    put_buffer: torch.Tensor,
+    nranks_per_node: int,
+    rank_to_node: List[int],
+    rank_to_ib: List[int],
+    context_selection: int,
+) -> int:
+    return sgl_kernel.allreduce.mscclpp_init_context(
+        unique_id,
+        rank,
+        world_size,
+        scratch,
+        put_buffer,
+        nranks_per_node,
+        rank_to_node,
+        rank_to_ib,
+        context_selection,
+    )
+
+
+def mscclpp_allreduce(
+    context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+) -> None:
+    return sgl_kernel.allreduce.mscclpp_allreduce(context, inp, out, nthreads, nblocks)
diff --git a/python/sglang/srt/aio_rwlock.py b/python/sglang/srt/aio_rwlock.py
new file mode 100644
index 000000000..deda1fe79
--- /dev/null
+++ b/python/sglang/srt/aio_rwlock.py
@@ -0,0 +1,100 @@
+import asyncio
+
+
+class RWLock:
+    def __init__(self):
+        # Protects internal state
+        self._lock = asyncio.Lock()
+
+        # Condition variable used to wait for state changes
+        self._cond = asyncio.Condition(self._lock)
+
+        # Number of readers currently holding the lock
+        self._readers = 0
+
+        # Whether a writer is currently holding the lock
+        self._writer_active = False
+
+        # How many writers are queued waiting for a turn
+        self._waiting_writers = 0
+
+    @property
+    def reader_lock(self):
+        """
+        A context manager for acquiring a shared (reader) lock.
+
+        Example:
+            async with rwlock.reader_lock:
+                # read-only access
+        """
+        return _ReaderLock(self)
+
+    @property
+    def writer_lock(self):
+        """
+        A context manager for acquiring an exclusive (writer) lock.
+
+        Example:
+            async with rwlock.writer_lock:
+                # exclusive access
+        """
+        return _WriterLock(self)
+
+    async def acquire_reader(self):
+        async with self._lock:
+            # Wait until there is no active writer or waiting writer
+            # to ensure fairness.
+            while self._writer_active or self._waiting_writers > 0:
+                await self._cond.wait()
+            self._readers += 1
+
+    async def release_reader(self):
+        async with self._lock:
+            self._readers -= 1
+            # If this was the last reader, wake up anyone waiting
+            # (potentially a writer or new readers).
+            if self._readers == 0:
+                self._cond.notify_all()
+
+    async def acquire_writer(self):
+        async with self._lock:
+            # Increment the count of writers waiting
+            self._waiting_writers += 1
+            try:
+                # Wait while either a writer is active or readers are present
+                while self._writer_active or self._readers > 0:
+                    await self._cond.wait()
+                self._writer_active = True
+            finally:
+                # Decrement waiting writers only after we've acquired the writer lock
+                self._waiting_writers -= 1
+
+    async def release_writer(self):
+        async with self._lock:
+            self._writer_active = False
+            # Wake up anyone waiting (readers or writers)
+            self._cond.notify_all()
+
+
+class _ReaderLock:
+    def __init__(self, rwlock: RWLock):
+        self._rwlock = rwlock
+
+    async def __aenter__(self):
+        await self._rwlock.acquire_reader()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self._rwlock.release_reader()
+
+
+class _WriterLock:
+    def __init__(self, rwlock: RWLock):
+        self._rwlock = rwlock
+
+    async def __aenter__(self):
+        await self._rwlock.acquire_writer()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self._rwlock.release_writer()
diff --git a/python/sglang/srt/bench_utils.py b/python/sglang/srt/bench_utils.py
new file mode 100644
index 000000000..e9f7fcbb4
--- /dev/null
+++ b/python/sglang/srt/bench_utils.py
@@ -0,0 +1,137 @@
+import os
+import sys
+from contextlib import nullcontext
+
+import torch
+
+
+# NOTE copied and modified from DeepGEMM
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+# NOTE copied and modified from DeepGEMM
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: str = None,
+    flush_l2: bool = True,
+    with_multiple_kernels: bool = False,
+):
+    # Conflict with Nsight Systems
+    using_nsys = int(os.environ.get("SGLANG_NSYS_PROFILING", 0))
+
+    # By default, flush L2 with an excessive 8GB memset to give the GPU some (literal) chill time without full idle
+    flush_l2_size = int(8e9 // 4)
+
+    # For some auto-tuning kernels with prints
+    fn()
+
+    # Profile
+    suppress = (
+        suppress_stdout_stderr
+        if suppress_kineto_output and not using_nsys
+        else nullcontext
+    )
+    with suppress():
+        schedule = (
+            torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+            if not using_nsys
+            else None
+        )
+        profiler = (
+            torch.profiler.profile(
+                activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+            )
+            if not using_nsys
+            else nullcontext()
+        )
+        with profiler:
+            for i in range(2):
+                for _ in range(num_tests):
+                    if flush_l2:
+                        torch.empty(
+                            flush_l2_size, dtype=torch.int, device="cuda"
+                        ).zero_()
+                    fn()
+
+                if not using_nsys:
+                    profiler.step()
+
+    # Return 1 if using Nsight Systems
+    if using_nsys:
+        return 1
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tuple = isinstance(kernel_names, tuple)
+    prof_lines = (
+        profiler.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    if not with_multiple_kernels:
+        for name in kernel_names:
+            assert (
+                sum([name in line for line in prof_lines]) == 1
+            ), f"Errors of the kernel {name} in the profiling table (table: {prof_lines})"
+
+    # Save chrome traces
+    if trace_path is not None:
+        profiler.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        total_time = 0
+        total_num = 0
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                num_str = line.split()[-1]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        total_time += (
+                            float(time_str.replace(unit, "")) / scale * int(num_str)
+                        )
+                        total_num += int(num_str)
+                        break
+        kernel_times.append(total_time / total_num)
+
+    return tuple(kernel_times) if is_tuple else kernel_times[0]
diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py
new file mode 100644
index 000000000..ef880c911
--- /dev/null
+++ b/python/sglang/srt/configs/__init__.py
@@ -0,0 +1,29 @@
+from sglang.srt.configs.chatglm import ChatGLMConfig
+from sglang.srt.configs.dbrx import DbrxConfig
+from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
+from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.janus_pro import MultiModalityConfig
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.longcat_flash import LongcatFlashConfig
+from sglang.srt.configs.qwen3_next import Qwen3NextConfig
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)
+
+__all__ = [
+    "ExaoneConfig",
+    "ChatGLMConfig",
+    "DbrxConfig",
+    "DeepseekVL2Config",
+    "LongcatFlashConfig",
+    "MultiModalityConfig",
+    "KimiVLConfig",
+    "MoonViTConfig",
+    "Step3VLConfig",
+    "Step3TextConfig",
+    "Step3VisionEncoderConfig",
+    "Qwen3NextConfig",
+]
diff --git a/python/sglang/srt/configs/chatglm.py b/python/sglang/srt/configs/chatglm.py
new file mode 100644
index 000000000..9370c218a
--- /dev/null
+++ b/python/sglang/srt/configs/chatglm.py
@@ -0,0 +1,78 @@
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/chatglm.py
+
+# ChatGLM2 and ChatGLM3 share the same config.
+# ChatGLM4 is officially supported by Huggingface
+# transformers >= 4.46.0 is required
+# https://huggingface.co/docs/transformers/en/model_doc/glm
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+    attribute_map = {
+        "num_hidden_layers": "num_layers",
+        "n_head_kv": "multi_query_group_num",
+    }
+
+    def __init__(
+        self,
+        num_layers=28,
+        padded_vocab_size=65024,
+        hidden_size=4096,
+        ffn_hidden_size=13696,
+        kv_channels=128,
+        num_attention_heads=32,
+        seq_length=2048,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        layernorm_epsilon=1e-5,
+        rmsnorm=True,
+        apply_residual_connection_post_layernorm=False,
+        post_layer_norm=True,
+        add_bias_linear=False,
+        add_qkv_bias=False,
+        interleaved_qkv=False,
+        bias_dropout_fusion=True,
+        multi_query_attention=False,
+        multi_query_group_num=1,
+        apply_query_key_layer_scaling=True,
+        attention_softmax_in_fp32=True,
+        fp32_residual_connection=False,
+        quantization_bit=0,
+        pre_seq_len=None,
+        prefix_projection=False,
+        **kwargs
+    ):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        # It is to be compatible with long lora.
+        self.max_position_embeddings = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm
+        )
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        self.interleaved_qkv = interleaved_qkv
+        super().__init__(**kwargs)
diff --git a/python/sglang/srt/configs/dbrx.py b/python/sglang/srt/configs/dbrx.py
new file mode 100644
index 000000000..75ccbde94
--- /dev/null
+++ b/python/sglang/srt/configs/dbrx.py
@@ -0,0 +1,279 @@
+# Adapted from
+# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
+# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/dbrx.py
+"""Dbrx configuration."""
+
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}  # type: ignore
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+    """Configuration class for Dbrx Attention.
+
+    [`DbrxAttention`] class. It is used to instantiate attention layers
+    according to the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        clip_qkv (`float`, *optional*, defaults to None):
+            If not `None`, clip the queries, keys, and values in the attention layer to this value.
+        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+        rope_theta (float): The base frequency for rope.
+    """
+
+    def __init__(
+        self,
+        attn_pdrop: float = 0,
+        clip_qkv: Optional[float] = None,
+        kv_n_heads: int = 1,
+        rope_theta: float = 10000.0,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.attn_pdrop = attn_pdrop
+        self.clip_qkv = clip_qkv
+        self.kv_n_heads = kv_n_heads
+        self.rope_theta = rope_theta
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["attn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all configurations of "
+                "models and can yield errors.",
+                config_dict["model_type"],
+                cls.model_type,
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxFFNConfig(PretrainedConfig):
+    """Configuration class for Dbrx FFN.
+
+    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
+    the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
+            The dict should have a key 'name' with the value being the name of
+            the activation function along with any additional keyword arguments.
+        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
+        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
+        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
+        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
+        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
+        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
+        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
+            This should only be used for benchmarking purposes.
+    """
+
+    def __init__(
+        self,
+        ffn_act_fn: Optional[dict] = None,
+        ffn_hidden_size: int = 3584,
+        moe_num_experts: int = 4,
+        moe_top_k: int = 1,
+        moe_jitter_eps: Optional[float] = None,
+        moe_loss_weight: float = 0.01,
+        moe_normalize_expert_weights: Optional[float] = 1,
+        uniform_expert_assignment: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__()
+        if ffn_act_fn is None:
+            ffn_act_fn = {"name": "silu"}
+        self.ffn_act_fn = ffn_act_fn
+        self.ffn_hidden_size = ffn_hidden_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.moe_jitter_eps = moe_jitter_eps
+        self.moe_loss_weight = moe_loss_weight
+        self.moe_normalize_expert_weights = moe_normalize_expert_weights
+        self.uniform_expert_assignment = uniform_expert_assignment
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["ffn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all "
+                "configurations of models and can yield errors.",
+                config_dict["model_type"],
+                cls.model_type,
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxConfig(PretrainedConfig):
+    """Configuration class for Dbrx.
+
+    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        d_model (`int`, *optional*, defaults to 6144):
+            Dimensionality of the embeddings and hidden states.
+        n_heads (`int`, *optional*, defaults to 48):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer encoder.
+        max_seq_len (`int`, *optional*, defaults to 32768):
+            The maximum sequence length of the model.
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`DbrxModel`].
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability applied to the attention output before combining with residual.
+        emb_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the embedding layer.
+        attn_config (`dict`, *optional*):
+            A dictionary used to configure the model's attention module.
+        ffn_config (`dict`, *optional*):
+            A dictionary used to configure the model's FFN module.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss. See [here]() for more details
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+
+    Example:
+    ```python
+    >>> from transformers import DbrxConfig, DbrxModel
+
+    >>> # Initializing a Dbrx configuration
+    >>> configuration = DbrxConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = DbrxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "dbrx"
+    attribute_map = {
+        "num_attention_heads": "n_heads",
+        "hidden_size": "d_model",
+        "num_hidden_layers": "n_layers",
+        "max_position_embeddings": "max_seq_len",
+    }
+
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        max_seq_len: int = 2048,
+        vocab_size: int = 32000,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_config: Optional[DbrxAttentionConfig] = None,
+        ffn_config: Optional[DbrxFFNConfig] = None,
+        use_cache: bool = True,
+        initializer_range: float = 0.02,
+        output_router_logits: bool = False,
+        router_aux_loss_coef: float = 0.05,
+        **kwargs: Any,
+    ):
+        if attn_config is None:
+            self.attn_config = DbrxAttentionConfig()
+        elif isinstance(attn_config, dict):
+            self.attn_config = DbrxAttentionConfig(**attn_config)
+        else:
+            self.attn_config = attn_config
+
+        if ffn_config is None:
+            self.ffn_config = DbrxFFNConfig()
+        elif isinstance(ffn_config, dict):
+            self.ffn_config = DbrxFFNConfig(**ffn_config)
+        else:
+            self.ffn_config = ffn_config
+
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
+        if tie_word_embeddings:
+            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
diff --git a/python/sglang/srt/configs/deepseekvl2.py b/python/sglang/srt/configs/deepseekvl2.py
new file mode 100644
index 000000000..bcb0afe5a
--- /dev/null
+++ b/python/sglang/srt/configs/deepseekvl2.py
@@ -0,0 +1,688 @@
+import math
+import os
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from PIL import Image, ImageOps
+from transformers import (
+    AutoProcessor,
+    LlamaTokenizerFast,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+
+
+def select_best_resolution(image_size, candidate_resolutions):
+    # used for cropping
+    original_width, original_height = image_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+
+    for width, height in candidate_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(
+            original_height * scale
+        )
+        effective_resolution = min(
+            downscaled_width * downscaled_height, original_width * original_height
+        )
+        wasted_resolution = (width * height) - effective_resolution
+
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution
+            and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (width, height)
+
+    return best_fit
+
+
+class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def __getitem__(self, item):
+        return self.__dict__[item]
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+
+
+@dataclass
+class VLChatProcessorOutput(DictOutput):
+    input_ids: torch.LongTensor
+    target_ids: torch.LongTensor
+    pixel_values: (
+        torch.Tensor
+    )  # rename from "images" to "pixel_values" for compatibility
+    images_seq_mask: torch.BoolTensor
+    images_spatial_crop: torch.LongTensor
+
+    def __len__(self):
+        return len(self.input_ids)
+
+
+class ImageTransform(object):
+    def __init__(
+        self,
+        mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: Tuple[Tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<｜▁pad▁｜>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
+        self.tokenizer = tokenizer
+        # must set this，padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def format_messages_v2(self, messages, pil_images, max_req_input_len=-1):
+        """play the role of format_messages_v2 and get_images_info in the last version"""
+        tokenized_data = []
+        masked_tokenized_data = []  # labels
+        images_list = []
+        images_seq_mask = []
+        images_spatial_crop = []
+
+        image_index = 0
+        image_token_cnt = messages.count(self.image_token)
+        tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
+            messages,
+            pil_images[image_index : image_index + image_token_cnt],
+            bos=True,
+            eos=True,
+            cropping=len(pil_images) <= 2,
+            max_req_input_len=max_req_input_len,
+        )
+
+        image_index = image_token_cnt
+        tokenized_data += tokenized_str
+        if self.mask_prompt:
+            masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
+        else:
+            masked_tokenized_data += tokenized_str
+        images_list += images
+        images_seq_mask += seq_mask
+        images_spatial_crop += spatial_crop
+
+        assert len(tokenized_data) == len(
+            images_seq_mask
+        ), f"format_messages_v2: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return (
+            tokenized_data,
+            masked_tokenized_data,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+        )
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: List[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            apply_sft_format (bool): if prompt is not None, then apply the SFT format to prompt;
+                if conversations is not None, then it will always apply the SFT format to conversations;
+            inference_mode (bool): if True, then remove the last eos token;
+            system_prompt (str): the system prompt;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        assert (
+            prompt is None or conversations is None
+        ), "prompt and conversations cannot be used at the same time."
+
+        (
+            tokenized_str,
+            masked_tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+        ) = self.format_messages_v2(conversations, images, max_req_input_len)
+
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            images = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            images = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        images_spatial_crop = torch.stack(
+            [images_spatial_crop], dim=0
+        )  # stack the tensor to make it a batch of 1
+
+        prepare = VLChatProcessorOutput(
+            input_ids=input_ids,
+            target_ids=target_ids,
+            pixel_values=images,
+            images_seq_mask=images_seq_mask,
+            images_spatial_crop=images_spatial_crop,
+        )
+
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        **kwargs,
+    ):
+        prepare = self.process_one(
+            prompt=prompt,
+            conversations=conversations,
+            images=images,
+            apply_sft_format=apply_sft_format,
+            inference_mode=inference_mode,
+            system_prompt=system_prompt,
+            max_req_input_len=max_req_input_len,
+        )
+
+        return prepare
+
+    def find_all_indices(self, messages, target_value):
+        indices = []
+        for index, item in enumerate(messages):
+            if item == target_value:
+                indices.append(index)
+        return indices
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+        max_req_input_len: int = -1,
+    ):
+        """Tokenize text with <image> tags."""
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        text_splits = conversation.split(self.image_token)
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = select_best_resolution(
+                    image.size, self.candidate_resolutions
+                )
+            else:
+                best_width, best_height = self.image_size, self.image_size
+            # print(image.size, (best_width, best_height)) # check the select_best_resolutions func
+
+            """process the global view"""
+            global_view = ImageOps.pad(
+                image,
+                (self.image_size, self.image_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            images_list.append(self.image_transform(global_view))
+
+            """process the local views"""
+            local_view = ImageOps.pad(
+                image,
+                (best_width, best_height),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            for i in range(0, best_height, self.image_size):
+                for j in range(0, best_width, self.image_size):
+                    images_list.append(
+                        self.image_transform(
+                            local_view.crop(
+                                (j, i, j + self.image_size, i + self.image_size)
+                            )
+                        )
+                    )
+
+            """record height / width crop num"""
+            num_width_tiles, num_height_tiles = (
+                best_width // self.image_size,
+                best_height // self.image_size,
+            )
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            """add image tokens"""
+            h = w = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
+            # global views tokens h * (w + 1), 1 is for line separator
+            tokenized_image = [self.image_token_id] * h * (w + 1)
+            # add a separator between global and local views
+            tokenized_image += [self.image_token_id]
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += (
+                [self.image_token_id]
+                * (num_height_tiles * h)
+                * (num_width_tiles * w + 1)
+            )
+
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            # print(width_crop_num, height_crop_num, len(tokenized_image)) # test the correctness of the number of image-related tokens
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        # deal with video, limit with request len
+        if max_req_input_len > -1:
+            if max_req_input_len < len(tokenized_sep) + len(tokenized_str) - 1:
+                rest = max_req_input_len - len(tokenized_sep) - 1 - 1024
+                tokenized_str = tokenized_str[:rest]
+                images_seq_mask = images_seq_mask[:rest]
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask
+        ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return tokenized_str, images_list, images_seq_mask, images_spatial_crop
+
+
+class DeepseekVL2VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "siglip_large_patch16_384"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(
+        self,
+        model_name: str = "siglip_large_patch16_384",
+        image_size: int = 384,
+        patch_size: int = 16,
+        width: int = 1024,
+        layers: int = 24,
+        heads: int = 16,
+        mlp_ratio: int = 4,
+        global_pool: str = "map",
+        ignore_head: bool = True,
+        class_token: bool = False,
+        num_classes: int = 0,
+        use_checkpoint: bool = False,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class DeepseekVL2MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(
+        self,
+        projector_type: str = "downsample_mlp_gelu",
+        input_dim: int = 1152,
+        n_embed: int = 2048,
+        depth: int = 2,
+        mlp_ratio: int = 1,
+        downsample_ratio: int = 2,
+        **kwargs,
+    ):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekV2Config(PretrainedConfig):
+
+    model_type = "deepseek_v2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=11008,
+        moe_intermediate_size=1407,
+        num_hidden_layers=30,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        ep_size=1,
+        routed_scaling_factor=1.0,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method="gready",
+        n_group=None,
+        topk_group=None,
+        num_experts_per_tok=None,
+        moe_layer_freq=1,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        scoring_func="softmax",
+        aux_loss_alpha=0.001,
+        seq_aux=True,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=100000,
+        eos_token_id=100001,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_mla=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = float(rms_norm_eps)
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_mla = use_mla
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class DeepseekVL2Config(PretrainedConfig):
+    model_type = "deepseek_vl_v2"
+    vision_config: DeepseekVL2VisionEncoderConfig
+    projector_config: DeepseekVL2MlpProjectorConfig
+    language_config: DeepseekV2Config
+
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),)
+
+    def __init__(
+        self,
+        tile_tag: str = "tile_tag",
+        global_view_pos: str = "head",
+        candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = DeepseekVL2VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = DeepseekVL2MlpProjectorConfig(**projector_config)
+
+        language_config = kwargs.get("language_config", {})
+        if isinstance(language_config, DeepseekV2Config):
+            self.language_config = language_config
+        else:
+            self.language_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.architectures = ["DeepseekVL2ForCausalLM"]
+
+
+AutoProcessor.register(DeepseekVL2Config, DeepseekVLV2Processor)
diff --git a/python/sglang/srt/configs/device_config.py b/python/sglang/srt/configs/device_config.py
new file mode 100644
index 000000000..3b9d3a1ed
--- /dev/null
+++ b/python/sglang/srt/configs/device_config.py
@@ -0,0 +1,17 @@
+import logging
+from typing import Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class DeviceConfig:
+    device: Optional[torch.device]
+
+    def __init__(self, device: str = "cuda") -> None:
+        if device in ["cuda", "xpu", "hpu", "cpu", "npu"]:
+            self.device_type = device
+        else:
+            raise RuntimeError(f"Not supported device type: {device}")
+        self.device = torch.device(self.device_type)
diff --git a/python/sglang/srt/configs/exaone.py b/python/sglang/srt/configs/exaone.py
new file mode 100644
index 000000000..7b0a2d290
--- /dev/null
+++ b/python/sglang/srt/configs/exaone.py
@@ -0,0 +1,195 @@
+# coding=utf-8
+# Copyright 2024 The LG AI Research EXAONE Lab. All rights reserved.
+# Copyright 2024 The LG CNS AI Engineering Team.
+# Copyright 2023-2024 SGLang Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" EXAONE model configuration """
+from typing import Any, Dict
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, Any] = {}
+
+
+# ruff: noqa: E501
+class ExaoneConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.ExaoneModel`. It is used to
+    instantiate a EXAONE model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Exaone
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 102400):
+            Vocabulary size of the EXAONE model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.ExaoneModel`. Vocabulary size of the model.
+            Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
+            :class:`~transformers.EXAONEModel`.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers (:obj:`int`, `optional`, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (:obj:`int`, `optional`):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        intermediate_size (:obj:`int`, `optional`, defaults to `hidden_size * 4`):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        rope_theta (:obj:`float`, `optional`, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (:obj:`Dict`, `optional`):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (:obj:`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (:obj:`float`, `optional`):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (:obj:`int`, `optional`):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (:obj:`float`, `optional`):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (:obj:`float`, `optional`):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (:obj:`float`, `optional`):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (:obj:`List[float]`, `optional`):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (:obj:`List[float]`, `optional`):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (:obj:`float`, `optional`):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (:obj:`float`, `optional`):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``configs.is_decoder=True``.
+        bos_token_id (:obj:`int`, `optional`, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (:obj:`int`, `optional`, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to tie weight embeddings
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+
+        Example::
+
+            >>> from transformers import EXAONEModel, ExaoneConfig
+
+            >>> # Initializing a EXAONE configuration
+            >>> configuration = ExaoneConfig()
+
+            >>> # Initializing a model from configuration
+            >>> model = EXAONEModel(configuration)
+
+            >>> # Accessing the model configuration
+            >>> configuration = model.configs
+    """
+
+    model_type = "exaone"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_hidden_layers": "num_layers"}
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        max_position_embeddings=2048,
+        hidden_size=2048,
+        num_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        intermediate_size=None,
+        activation_function="silu",
+        rope_theta=10000.0,
+        rope_scaling=None,
+        embed_dropout=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        **kwargs
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_layers
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        if intermediate_size:
+            self.intermediate_size = intermediate_size
+        else:
+            self.intermediate_size = hidden_size * 4
+        self.activation_function = activation_function
+        self.embed_dropout = embed_dropout
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs
+        )
diff --git a/python/sglang/srt/configs/internvl.py b/python/sglang/srt/configs/internvl.py
new file mode 100644
index 000000000..3ba9c61c1
--- /dev/null
+++ b/python/sglang/srt/configs/internvl.py
@@ -0,0 +1,706 @@
+import copy
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import sentencepiece as spm
+from transformers import (
+    TOKENIZER_MAPPING,
+    GptOssConfig,
+    LlamaConfig,
+    PretrainedConfig,
+    PreTrainedTokenizer,
+    Qwen2Config,
+    Qwen3Config,
+    Qwen3MoeConfig,
+)
+
+from sglang.utils import logger
+
+# Copied from: https://github.com/OpenGVLab/InternVL/blob/34a81000402bf8f716bab8c9b57aff1f6b436bd0/internvl_chat/internvl/model/internvl_chat/configuration_internvl_chat.py#L21
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+# Modified from transformers.model.llama.configuration_llama.LlamaConfig
+class InternLM2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
+    an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`InternLM2Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        Example:
+
+    """
+
+    model_type = "internlm2"
+    _auto_class = "AutoConfig"
+
+    def __init__(  # pylint: disable=W0102
+        self,
+        vocab_size=103168,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation="eager",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if (
+            rope_scaling_factor is None
+            or not isinstance(rope_scaling_factor, (float, int))
+            or rope_scaling_factor < 1.0
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a float|int >= 1, got {rope_scaling_factor=}, {type(rope_scaling_factor)=}"
+            )
+        if isinstance(rope_scaling_factor, int):
+            rope_scaling_factor = float(rope_scaling_factor)
+
+
+class InternVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+    instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of color channels in the input images (e.g., 3 for RGB).
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        qkv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries and values in the self-attention layers.
+        hidden_size (`int`, *optional*, defaults to 3200):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_attention_heads (`int`, *optional*, defaults to 25):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 12800):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        qk_normalization (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the queries and keys in the self-attention layers.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        use_flash_attn (`bool`, *optional*, defaults to `True`):
+            Whether to use flash attention mechanism.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Dropout rate for stochastic depth.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 0.1):
+            A factor for layer scale.
+    """
+
+    model_type = "intern_vit_6b"
+
+    def __init__(
+        self,
+        num_channels=3,
+        patch_size=14,
+        image_size=224,
+        qkv_bias=False,
+        hidden_size=3200,
+        num_attention_heads=25,
+        intermediate_size=12800,
+        qk_normalization=True,
+        num_hidden_layers=48,
+        use_flash_attn=True,
+        hidden_act="gelu",
+        layer_norm_eps=1e-6,
+        dropout=0.0,
+        drop_path_rate=0.0,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=0.1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.drop_path_rate = drop_path_rate
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.use_flash_attn = use_flash_attn
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if "vision_config" in config_dict:
+            config_dict = config_dict["vision_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class InternVLChatConfig(PretrainedConfig):
+    model_type = "internvl_chat"
+    is_composition = True
+
+    def __init__(
+        self,
+        vision_config=None,
+        llm_config=None,
+        use_backbone_lora=0,
+        use_llm_lora=0,
+        pad2square=False,
+        select_layer=-1,
+        force_image_size=None,
+        downsample_ratio=0.5,
+        template=None,
+        dynamic_image_size=False,
+        use_thumbnail=False,
+        ps_version="v1",
+        min_dynamic_patch=1,
+        max_dynamic_patch=6,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if vision_config is None:
+            vision_config = {"architectures": ["InternVisionModel"]}
+            logger.info(
+                "vision_config is None. Initializing the InternVisionConfig with default values."
+            )
+
+        if llm_config is None:
+            llm_config = {"architectures": ["InternLM2ForCausalLM"]}
+            logger.info(
+                "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
+            )
+
+        self.vision_config = InternVisionConfig(**vision_config)
+        if llm_config.get("architectures")[0] == "LlamaForCausalLM":
+            self.llm_config = LlamaConfig(**llm_config)
+        elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
+            self.llm_config = InternLM2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
+            self.llm_config = Qwen2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen3MoeForCausalLM":
+            self.llm_config = Qwen3MoeConfig(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen3ForCausalLM":
+            self.llm_config = Qwen3Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "GptOssForCausalLM":
+            self.llm_config = GptOssConfig(**llm_config)
+        else:
+            raise ValueError(
+                "Unsupported architecture: {}".format(
+                    llm_config.get("architectures")[0]
+                )
+            )
+
+        self.use_backbone_lora = use_backbone_lora
+        self.use_llm_lora = use_llm_lora
+        self.pad2square = pad2square
+        self.select_layer = select_layer
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail = use_thumbnail
+        self.ps_version = ps_version  # pixel shuffle version
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
+
+        self.hidden_size = self.llm_config.hidden_size
+        # By default, we use tie_word_embeddings=False for models of all sizes.
+        self.tie_word_embeddings = False
+        self.llm_config.tie_word_embeddings = self.tie_word_embeddings
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["vision_config"] = self.vision_config.to_dict()
+        output["llm_config"] = self.llm_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        output["use_backbone_lora"] = self.use_backbone_lora
+        output["use_llm_lora"] = self.use_llm_lora
+        output["select_layer"] = self.select_layer
+        output["force_image_size"] = self.force_image_size
+        output["downsample_ratio"] = self.downsample_ratio
+        output["template"] = self.template
+        output["dynamic_image_size"] = self.dynamic_image_size
+        output["use_thumbnail"] = self.use_thumbnail
+        output["ps_version"] = self.ps_version
+        output["min_dynamic_patch"] = self.min_dynamic_patch
+        output["max_dynamic_patch"] = self.max_dynamic_patch
+
+        return output
+
+
+# # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+# class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+#     vocab_files_names = VOCAB_FILES_NAMES
+#     slow_tokenizer_class = InternLM2Tokenizer
+#     padding_side = 'left'
+#     model_input_names = ['input_ids', 'attention_mask']
+#     _auto_class = 'AutoTokenizer'
+#
+#     def __init__(
+#         self,
+#         vocab_file,
+#         unk_token='<unk>',
+#         bos_token='<s>',
+#         eos_token='</s>',
+#         pad_token='</s>',
+#         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+#         add_bos_token=True,
+#         add_eos_token=False,
+#         decode_with_prefix_space=False,
+#         clean_up_tokenization_spaces=False,
+#         **kwargs,
+#     ):
+#         super().__init__(
+#             vocab_file=vocab_file,
+#             unk_token=unk_token,
+#             bos_token=bos_token,
+#             eos_token=eos_token,
+#             pad_token=pad_token,
+#             sp_model_kwargs=sp_model_kwargs,
+#             add_bos_token=add_bos_token,
+#             add_eos_token=add_eos_token,
+#             decode_with_prefix_space=decode_with_prefix_space,
+#             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+#             **kwargs,
+#         )
+#         self._add_bos_token = add_bos_token
+#         self._add_eos_token = add_eos_token
+#         self.update_post_processor()
+#         self.vocab_file = vocab_file
+#
+#     @property
+#     def can_save_slow_tokenizer(self) -> bool:
+#         return os.path.isfile(self.vocab_file) if self.vocab_file else False
+#
+#     def update_post_processor(self):
+#         """
+#         Updates the underlying post processor with the current `bos_token` and `eos_token`.
+#         """
+#         bos = self.bos_token
+#         bos_token_id = self.bos_token_id
+#         if bos is None and self.add_bos_token:
+#             raise ValueError('add_bos_token = True but bos_token = None')
+#
+#         eos = self.eos_token
+#         eos_token_id = self.eos_token_id
+#         if eos is None and self.add_eos_token:
+#             raise ValueError('add_eos_token = True but eos_token = None')
+#
+#         single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
+#         pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
+#
+#         special_tokens = []
+#         if self.add_bos_token:
+#             special_tokens.append((bos, bos_token_id))
+#         if self.add_eos_token:
+#             special_tokens.append((eos, eos_token_id))
+#         self._tokenizer.post_processor = processors.TemplateProcessing(
+#             single=single, pair=pair, special_tokens=special_tokens
+#         )
+#
+#     @property
+#     def add_eos_token(self):
+#         return self._add_eos_token
+#
+#     @property
+#     def add_bos_token(self):
+#         return self._add_bos_token
+#
+#     @add_eos_token.setter
+#     def add_eos_token(self, value):
+#         self._add_eos_token = value
+#         self.update_post_processor()
+#
+#     @add_bos_token.setter
+#     def add_bos_token(self, value):
+#         self._add_bos_token = value
+#         self.update_post_processor()
+#
+#     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+#         if not self.can_save_slow_tokenizer:
+#             raise ValueError(
+#                 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+#                 'tokenizer.'
+#             )
+#
+#         if not os.path.isdir(save_directory):
+#             logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+#             return
+#         out_vocab_file = os.path.join(
+#             save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+#         )
+#
+#         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+#             copyfile(self.vocab_file, out_vocab_file)
+#
+#         return (out_vocab_file,)
+
+
+# Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
+class InternLM2Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        print("register succeed")
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    def no_prefix_space_tokens(self):
+        if self._no_prefix_space_tokens is None:
+            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+            self._no_prefix_space_tokens = {
+                i for i, tok in enumerate(vocab) if not tok.startswith("▁")
+            }
+        return self._no_prefix_space_tokens
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+
+    @property
+    def bos_token_id(self) -> Optional[int]:
+        return self.sp_model.bos_id()
+
+    @property
+    def eos_token_id(self) -> Optional[int]:
+        return self.sp_model.eos_id()
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def _maybe_add_prefix_space(self, tokens, decoded):
+        if tokens and tokens[0] not in self.no_prefix_space_tokens:
+            return " " + decoded
+        else:
+            return decoded
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        out_string = self.clean_up_tokenization(out_string)
+        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+        return out_string[1:]
+
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + VOCAB_FILES_NAMES["vocab_file"],
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+
+        output = bos_token_ids + token_ids_0
+
+        if token_ids_1 is not None:
+            output = output + token_ids_1
+
+        if self.add_eos_token:
+            output = output + [self.eos_token_id]
+
+        return output
+
+    def get_special_tokens_mask(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False,
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True,
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
+        use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        eos = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+
+TOKENIZER_MAPPING.register(
+    InternVLChatConfig, (InternLM2Tokenizer, None), exist_ok=True
+)
diff --git a/python/sglang/srt/configs/janus_pro.py b/python/sglang/srt/configs/janus_pro.py
new file mode 100644
index 000000000..d574953e9
--- /dev/null
+++ b/python/sglang/srt/configs/janus_pro.py
@@ -0,0 +1,634 @@
+# Adapted from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models
+
+from dataclasses import dataclass
+from typing import Dict, List, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from PIL.Image import Image
+from transformers import (
+    BaseImageProcessor,
+    BatchFeature,
+    LlamaConfig,
+    LlamaTokenizerFast,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+from transformers.image_utils import to_numpy_array
+
+from sglang.srt.configs.utils import register_image_processor, register_processor
+from sglang.srt.multimodal.mm_utils import expand2square
+
+
+class DictToObject(dict):
+    def __init__(self, dictionary):
+        super(self).__init__(dictionary)
+
+        for key, value in dictionary.items():
+            if isinstance(value, dict):
+                value = DictToObject(value)
+            setattr(self, key, value)
+
+
+class VisionConfig(PretrainedConfig):
+    model_type = "vision"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class GenAlignerConfig(PretrainedConfig):
+    model_type = "gen_aligner"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class GenHeadConfig(PretrainedConfig):
+    model_type = "gen_head"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class AlignerConfig(PretrainedConfig):
+    model_type = "aligner"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class GenVisionConfig(PretrainedConfig):
+    model_type = "gen_vision"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+@dataclass
+class SigLIPVisionCfg:
+    width: int = 1152
+    layers: Union[Tuple[int, int, int, int], int] = 27
+    heads: int = 16
+    patch_size: int = 14
+    image_size: Union[Tuple[int, int], int] = 336
+    global_pool: str = "map"
+    mlp_ratio: float = 3.7362
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+
+
+class MultiModalityConfig(PretrainedConfig):
+    model_type = "multi_modality"
+    vision_config: VisionConfig
+    aligner_config: AlignerConfig
+
+    gen_vision_config: GenVisionConfig
+    gen_aligner_config: GenAlignerConfig
+    gen_head_config: GenHeadConfig
+
+    language_config: LlamaConfig
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = VisionConfig(**vision_config)
+
+        aligner_config = kwargs.get("aligner_config", {})
+        self.aligner_config = AlignerConfig(**aligner_config)
+
+        gen_vision_config = kwargs.get("gen_vision_config", {})
+        self.gen_vision_config = GenVisionConfig(**gen_vision_config)
+
+        gen_aligner_config = kwargs.get("gen_aligner_config", {})
+        self.gen_aligner_config = GenAlignerConfig(**gen_aligner_config)
+
+        gen_head_config = kwargs.get("gen_head_config", {})
+        self.gen_head_config = GenHeadConfig(**gen_head_config)
+
+        language_config = kwargs.get("language_config", {})
+        if isinstance(language_config, LlamaConfig):
+            self.language_config = language_config
+        else:
+            self.language_config = LlamaConfig(**language_config)
+
+
+class VLMImageProcessor(BaseImageProcessor):
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        image_size: int,
+        min_size: int = 14,
+        image_mean: Union[Tuple[float, float, float], List[float]] = (
+            0.48145466,
+            0.4578275,
+            0.40821073,
+        ),
+        image_std: Union[Tuple[float, float, float], List[float]] = (
+            0.26862954,
+            0.26130258,
+            0.27577711,
+        ),
+        rescale_factor: float = 1.0 / 255.0,
+        do_normalize: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.rescale_factor = rescale_factor
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.min_size = min_size
+        self.do_normalize = do_normalize
+
+        if image_mean is None:
+            self.background_color = (127, 127, 127)
+        else:
+            self.background_color = tuple([int(x * 255) for x in image_mean])
+
+    def resize(self, pil_img: Image) -> np.ndarray:
+        """
+
+        Args:
+            pil_img (PIL.Image): [H, W, 3] in PIL.Image in RGB
+
+        Returns:
+            x (np.ndarray): [3, self.image_size, self.image_size]
+        """
+
+        width, height = pil_img.size
+        max_size = max(width, height)
+
+        size = [
+            max(int(height / max_size * self.image_size), self.min_size),
+            max(int(width / max_size * self.image_size), self.min_size),
+        ]
+
+        if width <= 0 or height <= 0 or size[0] <= 0 or size[1] <= 0:
+            # print(f"orig size = {pil_img.size}, new size = {size}")
+            raise ValueError("Invalid size!")
+
+        def resize(
+            pil_img, size, interpolation=PIL.Image.Resampling.BICUBIC, antialias=True
+        ):
+            if isinstance(size, int):
+                w, h = pil_img.size
+                if (w <= h and w == size) or (h <= w and h == size):
+                    return pil_img
+                if w < h:
+                    ow = size
+                    oh = int(size * h / w)
+                else:
+                    oh = size
+                    ow = int(size * w / h)
+                size = (ow, oh)
+            else:
+                size = (size[1], size[0])
+
+            return pil_img.resize(
+                size, resample=interpolation, reducing_gap=None if antialias else 3.0
+            )
+
+        pil_img = resize(
+            pil_img, size, interpolation=PIL.Image.Resampling.BICUBIC, antialias=True
+        )
+
+        pil_img = expand2square(pil_img, self.background_color)
+        x = to_numpy_array(pil_img)
+
+        # [H, W, 3] -> [3, H, W]
+        x = np.transpose(x, (2, 0, 1))
+
+        return x
+
+    def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature:
+        # resize and pad to [self.image_size, self.image_size]
+        # then convert from [H, W, 3] to [3, H, W]
+        if not isinstance(images, list):
+            images = [images]
+        images: List[np.ndarray] = [self.resize(image) for image in images]
+        images = [image[:3, ...] for image in images]
+
+        # rescale from [0, 255] -> [0, 1]
+        images = [
+            self.rescale(
+                image=image,
+                scale=self.rescale_factor,
+                input_data_format="channels_first",
+            )
+            for image in images
+        ]
+
+        # normalize
+        if self.do_normalize:
+            images = [
+                self.normalize(
+                    image=image,
+                    mean=self.image_mean,
+                    std=self.image_std,
+                    input_data_format="channels_first",
+                )
+                for image in images
+            ]
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    @property
+    def default_shape(self):
+        return [3, self.image_size, self.image_size]
+
+
+class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def __getitem__(self, item):
+        return self.__dict__[item]
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+
+
+@dataclass
+class VLChatProcessorOutput(DictOutput):
+    sft_format: str
+    input_ids: torch.Tensor
+    pixel_values: torch.Tensor
+    num_image_tokens: torch.IntTensor
+
+    def __len__(self):
+        return len(self.input_ids)
+
+
+@dataclass
+class BatchedVLChatProcessorOutput(DictOutput):
+    sft_format: List[str]
+    input_ids: torch.Tensor
+    pixel_values: torch.Tensor
+    attention_mask: torch.Tensor
+    images_seq_mask: torch.BoolTensor
+    images_emb_mask: torch.BoolTensor
+
+
+# FIXME: had to place Official Processor here, since image_processor module would not be imported in all threads,
+# hence AutoProcessor registration would not be affective in some cases
+class VLChatProcessor(ProcessorMixin):
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+
+    attributes = ["image_processor", "tokenizer"]
+
+    def __init__(
+        self,
+        image_processor: VLMImageProcessor,
+        tokenizer: LlamaTokenizerFast,
+        image_tag: str = "<image_placeholder>",
+        image_start_tag: str = "<begin_of_image>",
+        image_end_tag: str = "<end_of_image>",
+        pad_tag: str = "<｜▁pad▁｜>",
+        num_image_tokens: int = 576,
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+
+        image_id = self.tokenizer.vocab.get(image_tag)
+        if image_id is None:
+            special_tokens = [image_tag]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            # print(f"Add image tag = {image_tag} to the tokenizer")
+
+        self.image_tag = image_tag
+        self.image_start_tag = image_start_tag
+        self.image_end_tag = image_end_tag
+        self.pad_tag = pad_tag
+
+        self.num_image_tokens = num_image_tokens
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            image_processor,
+            tokenizer,
+            **kwargs,
+        )
+
+    @property
+    def image_token(self):
+        return self.image_tag
+
+    @property
+    def image_id(self) -> int:
+        image_id = self.tokenizer.vocab.get(self.image_tag)
+        return image_id
+
+    @property
+    def image_start_id(self):
+        image_start_id = self.tokenizer.vocab.get(self.image_start_tag)
+        return image_start_id
+
+    @property
+    def image_end_id(self):
+        image_end_id = self.tokenizer.vocab.get(self.image_end_tag)
+        return image_end_id
+
+    @property
+    def image_start_token(self):
+        return self.image_start_tag
+
+    @property
+    def image_end_token(self):
+        return self.image_end_tag
+
+    @property
+    def pad_id(self):
+        pad_id = self.tokenizer.vocab.get(self.pad_tag)
+        return pad_id
+
+    def add_image_token(
+        self,
+        image_indices: List[int],
+        input_ids: torch.LongTensor,
+    ):
+        """
+
+        Args:
+            image_indices (List[int]): [index_0, index_1, ..., index_j]
+            input_ids (torch.LongTensor): [N]
+
+        Returns:
+            input_ids (torch.LongTensor): [N + image tokens]
+            num_image_tokens (torch.IntTensor): [n_images]
+        """
+
+        input_slices = []
+
+        start = 0
+        for index in image_indices:
+            if self.add_special_token:
+                end = index + 1
+            else:
+                end = index
+
+            # original text tokens
+            input_slices.append(input_ids[start:end])
+
+            # add boi, image tokens, eoi and set the mask as False
+            input_slices.append(self.image_start_id * torch.ones((1), dtype=torch.long))
+            input_slices.append(
+                self.image_id * torch.ones((self.num_image_tokens,), dtype=torch.long)
+            )
+            input_slices.append(self.image_end_id * torch.ones((1), dtype=torch.long))
+            start = index + 1
+
+        # the left part
+        input_slices.append(input_ids[start:])
+
+        # concat all slices
+        input_ids = torch.cat(input_slices, dim=0)
+        num_image_tokens = torch.IntTensor([self.num_image_tokens] * len(image_indices))
+
+        return input_ids, num_image_tokens
+
+    def process_one(
+        self,
+        prompt: str = None,
+        images: List[Image] = None,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        sft_format = prompt
+        # tokenize
+        input_ids = self.tokenizer.encode(sft_format)
+        input_ids = torch.LongTensor(input_ids)
+
+        # add image tokens to the input_ids
+        image_token_mask: torch.Tensor = (input_ids == self.image_id).to(torch.bool)
+        image_indices = image_token_mask.nonzero()
+        input_ids, num_image_tokens = self.add_image_token(
+            image_indices=image_indices,
+            input_ids=input_ids,
+        )
+
+        # load images
+        images_outputs = self.image_processor(images, return_tensors="pt")
+
+        prepare = VLChatProcessorOutput(
+            sft_format=sft_format,
+            input_ids=input_ids,
+            pixel_values=images_outputs.pixel_values,
+            num_image_tokens=num_image_tokens,
+        )
+
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image] = None,
+        force_batchify: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            force_batchify (bool): force batchify the inputs;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        prepare = self.process_one(
+            prompt=prompt, conversations=conversations, images=images
+        )
+
+        if force_batchify:
+            prepare = self.batchify([prepare])
+
+        return prepare
+
+    def batchify(
+        self, prepare_list: List[VLChatProcessorOutput]
+    ) -> BatchedVLChatProcessorOutput:
+        """
+        Preprocesses the inputs for multimodal inference.
+
+        Args:
+            prepare_list (List[VLChatProcessorOutput]): A list of VLChatProcessorOutput.
+
+        Returns:
+            BatchedVLChatProcessorOutput: A dictionary of the inputs to use for multimodal inference.
+        """
+
+        batch_size = len(prepare_list)
+        sft_format = []
+        n_images = []
+        seq_lens = []
+        for prepare in prepare_list:
+            n_images.append(len(prepare.num_image_tokens))
+            seq_lens.append(len(prepare))
+
+        input_token_max_len = max(seq_lens)
+        max_n_images = max(1, max(n_images))
+
+        batched_input_ids = torch.full(
+            (batch_size, input_token_max_len), self.pad_id
+        ).long()  # FIXME
+        batched_attention_mask = torch.zeros((batch_size, input_token_max_len)).long()
+        batched_pixel_values = torch.zeros(
+            (batch_size, max_n_images, *self.image_processor.default_shape)
+        ).float()
+        batched_images_seq_mask = torch.zeros((batch_size, input_token_max_len)).bool()
+        batched_images_emb_mask = torch.zeros(
+            (batch_size, max_n_images, self.num_image_tokens)
+        ).bool()
+
+        for i, prepare in enumerate(prepare_list):
+            input_ids = prepare.input_ids
+            seq_len = len(prepare)
+            n_image = len(prepare.num_image_tokens)
+            # left-padding
+            batched_attention_mask[i, -seq_len:] = 1
+            batched_input_ids[i, -seq_len:] = torch.LongTensor(input_ids)
+            batched_images_seq_mask[i, -seq_len:] = input_ids == self.image_id
+
+            if n_image > 0:
+                batched_pixel_values[i, :n_image] = prepare.pixel_values
+                for j, n_image_tokens in enumerate(prepare.num_image_tokens):
+                    batched_images_emb_mask[i, j, :n_image_tokens] = True
+
+            sft_format.append(prepare.sft_format)
+
+        batched_prepares = BatchedVLChatProcessorOutput(
+            input_ids=batched_input_ids,
+            attention_mask=batched_attention_mask,
+            pixel_values=batched_pixel_values,
+            images_seq_mask=batched_images_seq_mask,
+            images_emb_mask=batched_images_emb_mask,
+            sft_format=sft_format,
+        )
+
+        return batched_prepares
+
+
+class VLMImageProcessorConfig(PretrainedConfig):
+    model_type = "deepseek_vlm"
+    image_size: int
+    min_size: int
+    image_mean: Union[Tuple[float, float, float], List[float]]
+    image_std: Union[Tuple[float, float, float], List[float]]
+    rescale_factor: float
+    do_normalize: bool
+
+    def __init__(
+        self,
+        image_size: int,
+        min_size: int = 14,
+        image_mean: Union[Tuple[float, float, float], List[float]] = (
+            0.48145466,
+            0.4578275,
+            0.40821073,
+        ),
+        image_std: Union[Tuple[float, float, float], List[float]] = (
+            0.26862954,
+            0.26130258,
+            0.27577711,
+        ),
+        rescale_factor: float = 1.0 / 255.0,
+        do_normalize: bool = True,
+        **kwargs,
+    ):
+        self.image_size = image_size
+        self.min_size = min_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+
+        super().__init__(**kwargs)
+
+
+register_processor(MultiModalityConfig, VLChatProcessor)
+register_image_processor(MultiModalityConfig, VLMImageProcessor)
diff --git a/python/sglang/srt/configs/kimi_vl.py b/python/sglang/srt/configs/kimi_vl.py
new file mode 100644
index 000000000..3c7d20f59
--- /dev/null
+++ b/python/sglang/srt/configs/kimi_vl.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+
+
+class KimiVLConfig(PretrainedConfig):
+    model_type = "kimi_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+        text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs
+    ):
+        if vision_config is None:
+            vision_config = MoonViTConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = DeepseekV2Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        self.text_config = text_config
+
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
diff --git a/python/sglang/srt/configs/kimi_vl_moonvit.py b/python/sglang/srt/configs/kimi_vl_moonvit.py
new file mode 100644
index 000000000..166809eb6
--- /dev/null
+++ b/python/sglang/srt/configs/kimi_vl_moonvit.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MoonViTConfig(PretrainedConfig):
+    model_type = "moonvit"
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size
diff --git a/python/sglang/srt/configs/load_config.py b/python/sglang/srt/configs/load_config.py
new file mode 100644
index 000000000..be9a40b4b
--- /dev/null
+++ b/python/sglang/srt/configs/load_config.py
@@ -0,0 +1,89 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+import enum
+import json
+import logging
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
+
+class LoadFormat(str, enum.Enum):
+    AUTO = "auto"
+    PT = "pt"
+    SAFETENSORS = "safetensors"
+    NPCACHE = "npcache"
+    DUMMY = "dummy"
+    SHARDED_STATE = "sharded_state"
+    GGUF = "gguf"
+    BITSANDBYTES = "bitsandbytes"
+    MISTRAL = "mistral"
+    LAYERED = "layered"
+    JAX = "jax"
+    REMOTE = "remote"
+
+
+@dataclass
+class LoadConfig:
+    """
+    download_dir: Directory to download and load the weights, default to the
+        default cache directory of huggingface.
+    load_format: The format of the model weights to load:
+        "auto" will try to load the weights in the safetensors format and
+            fall back to the pytorch bin format if safetensors format is
+            not available.
+        "pt" will load the weights in the pytorch bin format.
+        "safetensors" will load the weights in the safetensors format.
+        "npcache" will load the weights in pytorch format and store
+            a numpy cache to speed up the loading.
+        "dummy" will initialize the weights with random values, which is
+            mainly for profiling.
+        "bitsandbytes" will load nf4 type weights.
+    ignore_patterns: The list of patterns to ignore when loading the model.
+        Default to "original/**/*" to avoid repeated loading of llama's
+        checkpoints.
+    decryption_key_file: If set, decrypts the output files with a password read
+        from this file (after PBKDF2).
+    """
+
+    load_format: Union[str, LoadFormat] = LoadFormat.AUTO
+    download_dir: Optional[str] = None
+    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+    ignore_patterns: Optional[Union[List[str], str]] = None
+    decryption_key_file: Optional[str] = None
+
+    def __post_init__(self):
+        model_loader_extra_config = self.model_loader_extra_config or {}
+        if isinstance(model_loader_extra_config, str):
+            self.model_loader_extra_config = json.loads(model_loader_extra_config)
+        self._verify_load_format()
+
+        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                self.ignore_patterns,
+            )
+        else:
+            self.ignore_patterns = ["original/**/*"]
+
+    def _verify_load_format(self) -> None:
+        if not isinstance(self.load_format, str):
+            return
+
+        load_format = self.load_format.lower()
+        self.load_format = LoadFormat(load_format)
+
+        rocm_not_supported_load_format: List[str] = []
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f
+                for f in LoadFormat.__members__
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format '{load_format}' is not supported in ROCm. "
+                f"Supported load formats are "
+                f"{rocm_supported_load_format}"
+            )
diff --git a/python/sglang/srt/configs/longcat_flash.py b/python/sglang/srt/configs/longcat_flash.py
new file mode 100644
index 000000000..e6a2dfb02
--- /dev/null
+++ b/python/sglang/srt/configs/longcat_flash.py
@@ -0,0 +1,104 @@
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+FLASH_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class LongcatFlashConfig(PretrainedConfig):
+    model_type = "longcat_flash"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        hidden_size=6144,
+        intermediate_size=None,
+        ffn_hidden_size=12288,
+        expert_ffn_hidden_size=2048,
+        num_layers=28,
+        num_hidden_layers=None,
+        num_attention_heads=64,
+        ep_size=1,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=128,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_routed_experts=512,
+        moe_topk=12,
+        norm_topk_prob=False,
+        max_position_embeddings=131072,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mla_scale_q_lora=True,
+        mla_scale_kv_lora=True,
+        torch_dtype="bfloat16",
+        params_dtype="bfloat16",
+        rounter_params_dtype="float32",
+        router_bias=False,
+        topk_method=None,
+        routed_scaling_factor=6.0,
+        zero_expert_num=256,
+        zero_expert_type="identity",
+        nextn_use_scmoe=False,
+        num_nextn_predict_layers=1,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            torch_dtype=torch_dtype,
+            params_dtype=params_dtype,
+            rounter_params_dtype=rounter_params_dtype,
+            topk_method=topk_method,
+            router_bias=router_bias,
+            nextn_use_scmoe=nextn_use_scmoe,
+            num_nextn_predict_layers=num_nextn_predict_layers,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = (
+            num_hidden_layers if num_hidden_layers is not None else num_layers
+        )
+        self.intermediate_size = (
+            intermediate_size if intermediate_size is not None else ffn_hidden_size
+        )
+        self.moe_intermediate_size = expert_ffn_hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.ep_size = ep_size
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.n_routed_experts = n_routed_experts
+        self.moe_topk = moe_topk
+        self.norm_topk_prob = norm_topk_prob
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mla_scale_q_lora = mla_scale_q_lora
+        self.mla_scale_kv_lora = mla_scale_kv_lora
+        self.zero_expert_num = zero_expert_num
+        self.zero_expert_type = zero_expert_type
+        self.routed_scaling_factor = routed_scaling_factor
+        self.hidden_act = "silu"
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
new file mode 100644
index 000000000..0dbe37aa0
--- /dev/null
+++ b/python/sglang/srt/configs/model_config.py
@@ -0,0 +1,811 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import json
+import logging
+import math
+import os
+from enum import Enum, IntEnum, auto
+from typing import List, Optional, Set, Union
+
+import torch
+from transformers import PretrainedConfig
+
+from sglang.srt.hf_transformers_utils import (
+    get_config,
+    get_context_length,
+    get_generation_config,
+    get_hf_text_config,
+    get_sparse_attention_config,
+)
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import get_bool_env_var, is_hip
+from sglang.utils import is_in_ci
+
+logger = logging.getLogger(__name__)
+
+
+class AttentionArch(IntEnum):
+    MLA = auto()
+    MHA = auto()
+
+
+class ModelImpl(str, Enum):
+    AUTO = "auto"
+    SGLANG = "sglang"
+    TRANSFORMERS = "transformers"
+
+
+class ModelConfig:
+    def __init__(
+        self,
+        model_path: str,
+        trust_remote_code: bool = True,
+        revision: Optional[str] = None,
+        context_length: Optional[int] = None,
+        model_override_args: str = "{}",
+        is_embedding: Optional[bool] = None,
+        enable_multimodal: Optional[bool] = None,
+        dtype: str = "auto",
+        quantization: Optional[str] = None,
+        override_config_file: Optional[str] = None,
+        is_draft_model: bool = False,
+        hybrid_kvcache_ratio: Optional[float] = None,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
+    ) -> None:
+        # Parse args
+        self.model_path = model_path
+        self.revision = revision
+        self.quantization = quantization
+        self.model_impl = model_impl
+
+        self.maybe_pull_model_tokenizer_from_remote()
+        self.model_override_args = json.loads(model_override_args)
+        kwargs = {}
+        if override_config_file and override_config_file.strip():
+            kwargs["_configuration_file"] = override_config_file.strip()
+
+        self.hf_config = get_config(
+            self.model_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            model_override_args=self.model_override_args,
+            **kwargs,
+        )
+
+        self.hf_generation_config = get_generation_config(
+            self.model_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs,
+        )
+
+        self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.attention_chunk_size = getattr(
+            self.hf_text_config, "attention_chunk_size", None
+        )
+        self.is_hybrid = is_hybrid_model(
+            self.hf_config.architectures,
+            hybrid_kvcache_ratio=hybrid_kvcache_ratio,
+            context_length=context_length,
+            attention_chunk_size=self.attention_chunk_size,
+        )
+        if self.is_hybrid is not None:
+            self.swa_attention_layer_ids, self.full_attention_layer_ids = (
+                get_hybrid_layer_ids(
+                    self.hf_config.architectures, self.hf_text_config.num_hidden_layers
+                )
+            )
+
+        if enable_multimodal is None:
+            mm_disabled_models = [
+                "Gemma3ForConditionalGeneration",
+                "Llama4ForConditionalGeneration",
+                "Step3VLForConditionalGeneration",
+            ]
+            if self.hf_config.architectures[0] in mm_disabled_models:
+                enable_multimodal = False
+                logger.info(
+                    f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
+                )
+            else:
+                enable_multimodal = True
+
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "DeepseekV3ForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
+
+        if is_draft_model and self.hf_config.architectures[0] == "Glm4MoeForCausalLM":
+            self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"
+
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "LongcatFlashForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "LongcatFlashForCausalLMNextN"
+            self.hf_config.num_hidden_layers = self.hf_config.num_nextn_predict_layers
+
+        if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
+            self.hf_config.architectures[0] = "MiMoMTP"
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
+
+        if is_draft_model and self.hf_config.architectures[0] == "Qwen3NextForCausalLM":
+            self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
+
+        # Check model type
+        self.is_generation = is_generation_model(
+            self.hf_config.architectures, is_embedding
+        )
+        self.is_multimodal = enable_multimodal and is_multimodal_model(
+            self.hf_config.architectures
+        )
+        self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_image_gen = enable_multimodal and is_image_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_audio_model = enable_multimodal and is_audio_model(
+            self.hf_config.architectures
+        )
+        self.is_multimodal_chunked_prefill_supported = (
+            enable_multimodal
+            and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
+        )
+        self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
+        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
+
+        # Derive context length
+        derived_context_len = get_context_length(self.hf_text_config)
+        if context_length is not None:
+            if context_length > derived_context_len:
+                reason = "Target model's" if is_draft_model else "User-specified"
+                msg = (
+                    f"Warning: {reason} context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                    f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
+                )
+                if (
+                    get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
+                    or is_in_ci()  # FIXME: fix this special case
+                ):
+                    logger.warning(msg)
+                    self.context_len = context_length
+                else:
+                    raise ValueError(
+                        f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                    )
+            else:
+                self.context_len = context_length
+        else:
+            self.context_len = derived_context_len
+
+        # Unify the config keys for hf_text_config
+        self.head_dim = getattr(
+            self.hf_text_config,
+            "head_dim",
+            self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads,
+        )
+
+        # FIXME: temporary special judge for MLA architecture
+        if (
+            "DeepseekV2ForCausalLM" in self.hf_config.architectures
+            or "DeepseekV3ForCausalLM" in self.hf_config.architectures
+            or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
+            or "LongcatFlashForCausalLM" in self.hf_config.architectures
+            or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
+        ):
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+
+            # Handle rope scaling with yarn
+            self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
+            if self.hf_config.rope_scaling:
+                mscale_all_dim = self.hf_config.rope_scaling.get(
+                    "mscale_all_dim", False
+                )
+                scaling_factor = self.hf_config.rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                self.scaling = self.scaling * mscale * mscale
+
+        elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
+            self.head_dim = 128
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
+            self.hf_text_config, "use_mla", True
+        ):
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+        elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_text_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
+        else:
+            if (
+                "MistralModel" in self.hf_config.architectures
+                or "MixtralForCausalLM" in self.hf_config.architectures
+                or "MistralForCausalLM" in self.hf_config.architectures
+            ):
+                if getattr(self, "head_dim", None) is None:
+                    self.head_dim = (
+                        self.hf_config.hidden_size // self.hf_config.num_attention_heads
+                    )
+                    # In transformers==4.52.3, the head_dim is null in MistralConfig
+                    if (
+                        not hasattr(self.hf_text_config, "head_dim")
+                        or self.hf_text_config.head_dim is None
+                    ):
+                        setattr(self.hf_text_config, "head_dim", self.head_dim)
+
+            self.attention_arch = AttentionArch.MHA
+
+        self.num_attention_heads = self.hf_text_config.num_attention_heads
+        self.num_key_value_heads = getattr(
+            self.hf_text_config, "num_key_value_heads", None
+        )
+
+        # for Dbrx and MPT models
+        if self.hf_config.model_type in ["dbrx", "mpt"]:
+            self.num_key_value_heads = getattr(
+                self.hf_config.attn_config, "kv_n_heads", None
+            )
+
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+        self.hidden_size = self.hf_text_config.hidden_size
+        self.num_hidden_layers = self.hf_text_config.num_hidden_layers
+        self.num_attention_layers = self.num_hidden_layers
+        if "LongcatFlashForCausalLM" in self.hf_config.architectures:
+            self.num_attention_layers = self.num_hidden_layers * 2
+        self.num_nextn_predict_layers = getattr(
+            self.hf_text_config, "num_nextn_predict_layers", None
+        )
+        self.vocab_size = self.hf_text_config.vocab_size
+
+        # Verify quantization
+        self._verify_quantization()
+
+        # Verify dual-chunk attention config
+        self._verify_dual_chunk_attention_config()
+
+        # Cache attributes
+        self.hf_eos_token_id = self.get_hf_eos_token_id()
+
+        # multimodal
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)
+
+    @staticmethod
+    def from_server_args(
+        server_args: ServerArgs,
+        model_path: str = None,
+        model_revision: str = None,
+        **kwargs,
+    ):
+        return ModelConfig(
+            model_path=model_path or server_args.model_path,
+            trust_remote_code=server_args.trust_remote_code,
+            revision=model_revision or server_args.revision,
+            context_length=server_args.context_length,
+            model_override_args=server_args.json_model_override_args,
+            is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
+            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
+            model_impl=server_args.model_impl,
+            **kwargs,
+        )
+
+    def get_total_num_attention_heads(self) -> int:
+        return self.num_attention_heads
+
+    def get_num_attention_heads(self, tensor_parallel_size) -> int:
+        total_num_attention_heads = self.num_attention_heads
+        return max(1, total_num_attention_heads // tensor_parallel_size)
+
+    # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
+    def get_total_num_kv_heads(self) -> int:
+        """Returns the total number of KV heads."""
+        # For GPTBigCode & Falcon:
+        # NOTE: for falcon, when new_decoder_architecture is True, the
+        # multi_query flag is ignored and we use n_head_kv for the number of
+        # KV heads.
+        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
+        new_decoder_arch_falcon = (
+            self.hf_config.model_type in falcon_model_types
+            and getattr(self.hf_config, "new_decoder_architecture", False)
+        )
+        if not new_decoder_arch_falcon and getattr(
+            self.hf_text_config, "multi_query", False
+        ):
+            # Multi-query attention, only one KV head.
+            # Currently, tensor parallelism is not supported in this case.
+            return 1
+
+        # For DBRX and MPT
+        if self.hf_config.model_type in ["mpt"]:
+            if "kv_n_heads" in self.hf_config.attn_config:
+                return self.hf_config.attn_config["kv_n_heads"]
+            return self.hf_config.num_attention_heads
+        if self.hf_config.model_type in ["dbrx"]:
+            return getattr(
+                self.hf_config.attn_config,
+                "kv_n_heads",
+                self.hf_config.num_attention_heads,
+            )
+        if self.hf_config.model_type in ["nemotron-nas"]:
+            nkvh = {
+                self.hf_config.num_attention_heads // block.attention.n_heads_in_group
+                for block in self.hf_config.block_configs
+                if not block.attention.no_op
+            }
+            if len(nkvh) == 0:
+                raise RuntimeError("Couldn't determine number of kv heads")
+            if len(nkvh) > 1:
+                raise ValueError(
+                    "Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
+                )
+            return next(iter(nkvh))
+
+        attributes = [
+            # For Falcon:
+            "n_head_kv",
+            "num_kv_heads",
+            # For LLaMA-2:
+            "num_key_value_heads",
+            # For ChatGLM:
+            "multi_query_group_num",
+            # For Step3
+            "num_attention_groups",
+        ]
+        for attr in attributes:
+            num_kv_heads = getattr(self.hf_text_config, attr, None)
+            if num_kv_heads is not None:
+                return num_kv_heads
+
+        # For non-grouped-query attention models, the number of KV heads is
+        # equal to the number of attention heads.
+        return self.hf_text_config.num_attention_heads
+
+    def get_num_kv_heads(self, tensor_parallel_size) -> int:
+        """Returns the number of KV heads per GPU."""
+        total_num_kv_heads = self.get_total_num_kv_heads()
+        # If tensor parallelism is used, we divide the number of KV heads by
+        # the tensor parallel size. We will replicate the KV heads in the
+        # case where the number of KV heads is smaller than the tensor
+        # parallel size so each GPU has at least one KV head.
+        return max(1, total_num_kv_heads // tensor_parallel_size)
+
+    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+    def _parse_quant_hf_config(self):
+        quant_cfg = getattr(self.hf_config, "quantization_config", None)
+        if quant_cfg is None:
+            # compressed-tensors uses a "compression_config" key
+            quant_cfg = getattr(self.hf_config, "compression_config", None)
+        if quant_cfg is None:
+            # check if is modelopt or mixed-precision model -- Both of them don't have corresponding field
+            # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
+            # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
+            # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
+            is_local = os.path.exists(self.model_path)
+            modelopt_quant_config = {"quant_method": "modelopt"}
+            if not is_local:
+                import huggingface_hub
+
+                try:
+                    from huggingface_hub import HfApi
+
+                    hf_api = HfApi()
+                    if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
+                        quant_cfg = modelopt_quant_config
+                except huggingface_hub.errors.OfflineModeIsEnabled:
+                    logger.warning(
+                        "Offline mode is enabled, skipping hf_quant_config.json check"
+                    )
+                    pass
+
+            elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
+                quant_config_file = os.path.join(
+                    self.model_path, "hf_quant_config.json"
+                )
+                with open(quant_config_file) as f:
+                    quant_config_dict = json.load(f)
+                json_quant_configs = quant_config_dict["quantization"]
+                quant_algo = json_quant_configs.get("quant_algo", None)
+                if quant_algo == "MIXED_PRECISION":
+                    quant_cfg = {"quant_method": "w4afp8"}
+                else:
+                    quant_cfg = modelopt_quant_config
+        return quant_cfg
+
+    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+    def _verify_quantization(self) -> None:
+        supported_quantization = [*QUANTIZATION_METHODS]
+        rocm_supported_quantization = [
+            "awq",
+            "gptq",
+            "fp8",
+            "compressed_tensors",
+            "compressed-tensors",
+            "fbgemm_fp8",
+            "w8a8_fp8",
+            "petit_nvfp4",
+            "quark",
+            "mxfp4",
+        ]
+        optimized_quantization_methods = [
+            "fp8",
+            "marlin",
+            "modelopt",
+            "gptq_marlin_24",
+            "gptq_marlin",
+            "awq_marlin",
+            "fbgemm_fp8",
+            "compressed_tensors",
+            "compressed-tensors",
+            "experts_int8",
+            "w8a8_int8",
+            "w8a8_fp8",
+            "moe_wna16",
+            "qoq",
+            "w4afp8",
+            "petit_nvfp4",
+        ]
+        compatible_quantization_methods = {
+            "modelopt_fp4": ["modelopt"],
+            "petit_nvfp4": ["modelopt"],
+            "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
+            "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
+        }
+        if self.quantization is not None:
+            self.quantization = self.quantization.lower()
+
+        # Parse quantization method from the HF model config, if available.
+        quant_cfg = self._parse_quant_hf_config()
+
+        if quant_cfg is not None:
+            quant_method = quant_cfg.get(
+                "quant_method", "" if not self.quantization else self.quantization
+            ).lower()
+
+            # Detect which checkpoint is it
+            for _, method in QUANTIZATION_METHODS.items():
+                quantization_override = method.override_quantization_method(
+                    quant_cfg, self.quantization
+                )
+                if quantization_override:
+                    quant_method = quantization_override
+                    self.quantization = quantization_override
+                    break
+
+            # Verify quantization configurations.
+            if self.quantization is None:
+                self.quantization = quant_method
+            elif self.quantization != quant_method:
+                if (
+                    self.quantization not in compatible_quantization_methods
+                    or quant_method
+                    not in compatible_quantization_methods[self.quantization]
+                ):
+                    raise ValueError(
+                        "Quantization method specified in the model config "
+                        f"({quant_method}) does not match the quantization "
+                        f"method specified in the `quantization` argument "
+                        f"({self.quantization})."
+                    )
+
+        if self.quantization is not None:
+            if self.quantization not in supported_quantization:
+                raise ValueError(
+                    f"Unknown quantization method: {self.quantization}. Must "
+                    f"be one of {supported_quantization}."
+                )
+            if is_hip() and self.quantization not in rocm_supported_quantization:
+                raise ValueError(
+                    f"{self.quantization} quantization is currently not "
+                    f"supported in ROCm."
+                )
+            if self.quantization not in optimized_quantization_methods:
+                logger.warning(
+                    "%s quantization is not fully "
+                    "optimized yet. The speed can be slower than "
+                    "non-quantized models.",
+                    self.quantization,
+                )
+
+    def _verify_dual_chunk_attention_config(self) -> None:
+        if hasattr(self.hf_config, "dual_chunk_attention_config"):
+            # Try loading the sparse attention config
+            sparse_attn_config = get_sparse_attention_config(self.model_path)
+            if not sparse_attn_config:
+                return
+            self.hf_config.dual_chunk_attention_config["sparse_attention_config"] = (
+                sparse_attn_config
+            )
+            if (
+                "sparse_attention_enabled"
+                not in self.hf_config.dual_chunk_attention_config
+            ):
+                self.hf_config.dual_chunk_attention_config[
+                    "sparse_attention_enabled"
+                ] = True
+
+    def get_hf_eos_token_id(self) -> Optional[Set[int]]:
+        eos_ids = getattr(self.hf_config, "eos_token_id", None)
+        if eos_ids is not None:
+            # it can be either int or list of int
+            eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+        if eos_ids is None:
+            eos_ids = set()
+        if self.hf_generation_config:
+            generation_eos_ids = getattr(
+                self.hf_generation_config, "eos_token_id", None
+            )
+            if generation_eos_ids:
+                generation_eos_ids = (
+                    {generation_eos_ids}
+                    if isinstance(generation_eos_ids, int)
+                    else set(generation_eos_ids)
+                )
+                eos_ids = eos_ids | generation_eos_ids
+        return eos_ids
+
+    def maybe_pull_model_tokenizer_from_remote(self) -> None:
+        """
+        Pull the model config files to a temporary
+        directory in case of remote.
+
+        Args:
+            model: The model name or path.
+
+        """
+        from sglang.srt.connector import create_remote_connector
+        from sglang.srt.utils import is_remote_url
+
+        if is_remote_url(self.model_path):
+            logger.info("Pulling model configs from remote...")
+            # BaseConnector implements __del__() to clean up the local dir.
+            # Since config files need to exist all the time, so we DO NOT use
+            # with statement to avoid closing the client.
+            client = create_remote_connector(self.model_path)
+            if is_remote_url(self.model_path):
+                client.pull_files(allow_pattern=["*config.json"])
+                self.model_weights = self.model_path
+                self.model_path = client.get_local_dir()
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    "half": torch.float16,
+    "float16": torch.float16,
+    "float": torch.float32,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+def _get_and_verify_dtype(
+    config: PretrainedConfig,
+    dtype: Union[str, torch.dtype],
+) -> torch.dtype:
+    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
+    # because config.torch_dtype can be None.
+    config_dtype = getattr(config, "torch_dtype", None)
+    if isinstance(config_dtype, str):
+        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
+    if config_dtype is None:
+        config_dtype = torch.float32
+
+    if isinstance(dtype, str):
+        dtype = dtype.lower()
+        if dtype == "auto":
+            if config_dtype == torch.float32:
+                if config.model_type.startswith("gemma"):
+                    if config.model_type == "gemma":
+                        gemma_version = ""
+                    else:
+                        gemma_version = config.model_type[5]
+                    logger.info(
+                        f"For Gemma {gemma_version}, we downcast float32 to bfloat16 instead "
+                        "of float16 by default. Please specify `dtype` if you "
+                        "want to use float16."
+                    )
+                    torch_dtype = torch.bfloat16
+                else:
+                    # Following the common practice, we use float16 for float32
+                    # models.
+                    torch_dtype = torch.float16
+            else:
+                torch_dtype = config_dtype
+        else:
+            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
+                raise ValueError(f"Unknown dtype: {dtype}")
+            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+    elif isinstance(dtype, torch.dtype):
+        torch_dtype = dtype
+    else:
+        raise ValueError(f"Unknown dtype: {dtype}")
+
+    # Verify the dtype.
+    if torch_dtype != config_dtype:
+        if torch_dtype == torch.float32:
+            # Upcasting to float32 is allowed.
+            logger.info("Upcasting %s to %s.", config_dtype, torch_dtype)
+            pass
+        elif config_dtype == torch.float32:
+            # Downcasting from float32 to float16 or bfloat16 is allowed.
+            logger.info("Downcasting %s to %s.", config_dtype, torch_dtype)
+            pass
+        else:
+            # Casting between float16 and bfloat16 is allowed with a warning.
+            logger.warning("Casting %s to %s.", config_dtype, torch_dtype)
+
+    return torch_dtype
+
+
+def is_generation_model(model_architectures: List[str], is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architecture
+    # 2. check the `is_embedding` server args
+
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
+        or "InternLM2ForRewardModel" in model_architectures
+        or "Qwen2ForRewardModel" in model_architectures
+        or "Qwen2ForSequenceClassification" in model_architectures
+        or "Qwen3ForSequenceClassification" in model_architectures
+        or "CLIPModel" in model_architectures
+        or "BertModel" in model_architectures
+        or "Contriever" in model_architectures
+        or "BertForSequenceClassification" in model_architectures
+        or "XLMRobertaModel" in model_architectures
+        or "XLMRobertaForSequenceClassification" in model_architectures
+    ):
+        return False
+    else:
+        return not is_embedding
+
+
+multimodal_model_archs = [
+    "CLIPModel",
+    "DeepseekVL2ForCausalLM",
+    "Gemma3ForConditionalGeneration",
+    "Gemma3nForConditionalGeneration",
+    "Glm4vForConditionalGeneration",
+    "Glm4vMoeForConditionalGeneration",
+    "Grok1VForCausalLM",
+    "Grok1AForCausalLM",
+    "LlavaLlamaForCausalLM",
+    "Llama4ForConditionalGeneration",
+    "LlavaMistralForCausalLM",
+    "LlavaQwenForCausalLM",
+    "LlavaForConditionalGeneration",
+    "LlavaVidForCausalLM",
+    "MiniCPMO",
+    "MiniCPMV",
+    "Mistral3ForConditionalGeneration",
+    "MultiModalityCausalLM",
+    "MllamaForConditionalGeneration",
+    "Qwen2AudioForConditionalGeneration",
+    "Qwen2VLForConditionalGeneration",
+    "Qwen2_5_VLForConditionalGeneration",
+    "KimiVLForConditionalGeneration",
+    "InternVLChatModel",
+    "InternS1ForConditionalGeneration",
+    "Phi4MMForCausalLM",
+    "VILAForConditionalGeneration",
+    "Step3VLForConditionalGeneration",
+]
+
+
+def is_multimodal_model(model_architectures: List[str]):
+    if any(
+        multi_model_arch in model_architectures
+        for multi_model_arch in multimodal_model_archs
+    ):
+        return True
+    else:
+        return False
+
+
+def is_multimodal_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_image_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_audio_model(model_architectures: List[str]):
+    return False
+
+
+def is_encoder_decoder_model(model_architectures: List[str]):
+    return "MllamaForConditionalGeneration" in model_architectures
+
+
+def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
+    """Check if chunked prefill is supported for a MultiModal model."""
+    unsupported = [
+        "Grok1VForCausalLM",
+        "Grok1AForCausalLM",
+        "LlavaLlamaForCausalLM",
+        "MllamaForConditionalGeneration",
+        "CLIPModel",
+    ]
+    if any(multi_model_arch in unsupported for multi_model_arch in model_architectures):
+        return False
+    else:
+        return True
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def is_hybrid_model(
+    model_architectures: List[str],
+    hybrid_kvcache_ratio: Optional[float],
+    context_length: Optional[int],
+    attention_chunk_size: Optional[int],
+):
+    if hybrid_kvcache_ratio is None:
+        return None
+    elif (
+        hybrid_kvcache_ratio > 0
+        and model_architectures[0] == "Llama4ForConditionalGeneration"
+        and context_length > attention_chunk_size
+    ):
+        return hybrid_kvcache_ratio
+    else:
+        return None
+
+
+def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int):
+    if "Llama4ForConditionalGeneration" in model_architectures:
+        swa_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 != 0
+        ]
+        full_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
+        ]
+    else:
+        swa_attention_layer_ids = None
+        full_attention_layer_ids = None
+    return swa_attention_layer_ids, full_attention_layer_ids
diff --git a/python/sglang/srt/configs/qwen3_next.py b/python/sglang/srt/configs/qwen3_next.py
new file mode 100644
index 000000000..099d14d41
--- /dev/null
+++ b/python/sglang/srt/configs/qwen3_next.py
@@ -0,0 +1,326 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3Hybrid model configuration"""
+
+import enum
+import os
+
+import numpy as np
+import torch
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+from sglang.srt.distributed.utils import divide
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+logger = logging.get_logger(__name__)
+
+
+# NOTE: HybridLayerType
+class HybridLayerType(enum.Enum):
+    full_attention = "attention"
+    swa_attention = "swa_attention"
+    linear_attention = "linear_attention"
+    mamba2 = "mamba"
+
+
+class Qwen3NextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
+    Qwen3-Next model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `inputs_ids`.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
+            Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*, defaults to 256):
+            Projection weights dimension in multi-head attention.
+        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
+            Kernel size of the convolution used in linear attention layers.
+        linear_key_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each key head in linear attention.
+        linear_value_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each value head in linear attention.
+        linear_num_key_heads (`int`, *optional*, defaults to 16):
+            Number of key heads used in linear attention layers.
+        linear_num_value_heads (`int`, *optional*, defaults to 32):
+            Number of value heads used in linear attention layers.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the routed expert.
+        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the shared expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 10):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 512):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
+            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
+            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        layer_types (`list[str]`, *optional*, defaults to None):
+            Types of each layer (attention or linear).
+
+    ```python
+    >>> from transformers import Qwen3NextModel, Qwen3NextConfig
+
+    >>> # Initializing a Qwen3Next style configuration
+    >>> configuration =  Qwen3NextConfig()
+
+    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
+    >>> model = Qwen3NextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "qwen3_next"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=48,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=0.25,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=256,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=128,
+        linear_value_head_dim=128,
+        linear_num_key_heads=16,
+        linear_num_value_heads=32,
+        decoder_sparse_step=1,
+        moe_intermediate_size=512,
+        shared_expert_intermediate_size=512,
+        num_experts_per_tok=10,
+        num_experts=512,
+        norm_topk_prob=True,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=[],
+        layer_types=None,
+        **kwargs,
+    ):
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim
+        rope_config_validation(self)
+
+        # linear attention (gdn now part)
+        self.linear_conv_kernel_dim = linear_conv_kernel_dim
+        self.linear_key_head_dim = linear_key_head_dim
+        self.linear_value_head_dim = linear_value_head_dim
+        self.linear_num_key_heads = linear_num_key_heads
+        self.linear_num_value_heads = linear_num_value_heads
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.mlp_only_layers = mlp_only_layers
+
+    @property
+    def layers_block_type(self):
+        layer_type_list = []
+
+        for l in range(self.num_hidden_layers):
+            if (l + 1) % self.full_attention_interval == 0:
+                layer_type_list.append(HybridLayerType.full_attention.value)
+            else:
+                layer_type_list.append(HybridLayerType.linear_attention.value)
+
+        return layer_type_list
+
+    @property
+    def linear_layer_ids(self):
+        return [
+            i
+            for i, type_value in enumerate(self.layers_block_type)
+            if type_value == HybridLayerType.linear_attention.value
+        ]
+
+    @property
+    def full_attention_layer_ids(self):
+        return [
+            i
+            for i, type_value in enumerate(self.layers_block_type)
+            if type_value == HybridLayerType.full_attention.value
+        ]
+
+    @property
+    def hybrid_gdn_params(self):
+        world_size = get_attention_tp_size()
+        conv_dim = (
+            self.linear_key_head_dim * self.linear_num_key_heads * 2
+            + self.linear_value_head_dim * self.linear_num_value_heads
+        )
+        conv_state_shape = (
+            divide(conv_dim, world_size),
+            self.linear_conv_kernel_dim - 1,
+        )
+
+        temporal_state_shape = (
+            divide(self.linear_num_value_heads, world_size),
+            self.linear_key_head_dim,
+            self.linear_value_head_dim,
+        )
+        conv_dtype = torch.bfloat16
+        dtype_map = {
+            "float32": torch.float32,
+            "bfloat16": torch.bfloat16,
+        }
+        ssm_dtype = dtype_map[os.environ["SGLANG_MAMBA_SSM_DTYPE"]]
+        mamba_layers = self.linear_layer_ids
+        return (
+            conv_state_shape,
+            temporal_state_shape,
+            conv_dtype,
+            ssm_dtype,
+            mamba_layers,
+        )
+
+    @property
+    def mamba_cache_per_req(self):
+        conv_state_shape, temporal_state_shape, conv_dtype, ssm_dtype, mamba_layers = (
+            self.hybrid_gdn_params
+        )
+        mamba_layers_len = len(mamba_layers)
+
+        return (
+            int(np.prod(conv_state_shape)) * conv_dtype.itemsize
+            + int(np.prod(temporal_state_shape)) * ssm_dtype.itemsize
+        ) * mamba_layers_len
diff --git a/python/sglang/srt/configs/step3_vl.py b/python/sglang/srt/configs/step3_vl.py
new file mode 100644
index 000000000..5519605c6
--- /dev/null
+++ b/python/sglang/srt/configs/step3_vl.py
@@ -0,0 +1,172 @@
+from typing import Any, Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3VisionEncoderConfig(PretrainedConfig):
+    model_type = "step3_vision_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1792,
+        intermediate_size=3072,
+        output_hidden_size=4096,
+        num_hidden_layers=63,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=728,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.output_hidden_size = output_hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+
+
+class Step3TextConfig(PretrainedConfig):
+    model_type = "step3_text"
+    architectures = ["Step3TextForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        num_attention_heads: int = 64,
+        num_attention_groups: int = 1,
+        num_hidden_layers: int = 61,
+        max_seq_len: int = 65536,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 5120,
+        moe_num_experts: int = 48,
+        moe_top_k: int = 3,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embedding: int = 65536,
+        share_expert_dim: int = 5120,
+        share_q_dim: int = 2048,
+        head_dim: int = 256,
+        norm_expert_weight: bool = False,
+        moe_layers_enum: tuple[int] = (
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+        ),
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.max_position_embedding = max_position_embedding
+        self.share_expert_dim = share_expert_dim
+        self.share_q_dim = share_q_dim
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+
+        super().__init__(**kwargs)
+
+
+class Step3VLConfig(PretrainedConfig):
+    model_type = "step3_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
+        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        understand_projector_stride: int = 1,
+        projector_bias: bool = True,
+        image_token_id: int = 128001,
+        **kwargs,
+    ) -> None:
+        if vision_config is None:
+            vision_config = Step3VisionEncoderConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Step3VisionEncoderConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = Step3TextConfig()
+        elif isinstance(text_config, dict):
+            text_config = Step3TextConfig(**text_config)
+        self.text_config = text_config
+
+        self.understand_projector_stride = understand_projector_stride
+        self.projector_bias = projector_bias
+        self.hidden_size = text_config.hidden_size
+        self.image_token_id = image_token_id
+
+        super().__init__(**kwargs)
diff --git a/python/sglang/srt/configs/update_config.py b/python/sglang/srt/configs/update_config.py
new file mode 100644
index 000000000..abbd724fb
--- /dev/null
+++ b/python/sglang/srt/configs/update_config.py
@@ -0,0 +1,156 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+DEFAULT_MOE_PADDING_SIZE = 32
+
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import ModelConfig
+
+
+def may_get_weight_block_size(model_config, load_config):
+    from sglang.srt.model_loader.loader import _get_quantization_config
+    from sglang.srt.model_loader.utils import get_model_architecture
+
+    model_class, _ = get_model_architecture(model_config)
+    packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
+
+    quant_config = _get_quantization_config(
+        model_config, load_config, packed_modules_mapping
+    )
+
+    if quant_config is not None and hasattr(quant_config, "weight_block_size"):
+        return getattr(quant_config, "weight_block_size")
+    return None
+
+
+def get_moe_padding_size(weight_block_size):
+    if weight_block_size is not None:
+        # See NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+        assert (
+            len(weight_block_size) == 2
+        ), "Only len(weight_block_size) == 2 is supported"
+        assert (
+            weight_block_size[0] == weight_block_size[1]
+        ), "Only weight_block_size[0] == weight_block_size[1] is supported"
+
+        return weight_block_size[0]
+
+    return DEFAULT_MOE_PADDING_SIZE
+
+
+def get_num_heads_padding_size(tp_size, weight_block_size):
+    pad_size = (
+        tp_size * 2 if tp_size % 2 == 1 and weight_block_size is not None else tp_size
+    )
+    return pad_size
+
+
+def update_intermediate_size(model_config, attr_name, intermediate_padding_size):
+    attr_value = intermediate_padding_size
+    if hasattr(model_config, "hf_config") and hasattr(
+        model_config.hf_config, attr_name
+    ):
+        attr_value = getattr(model_config.hf_config, attr_name)
+    elif hasattr(model_config, attr_name):
+        attr_value = getattr(model_config, attr_name)
+
+    if attr_value % intermediate_padding_size != 0:
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+        attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+        if hasattr(model_config, "hf_config"):
+            setattr(model_config.hf_config, attr_name, attr_value)
+            if hasattr(model_config, "hf_text_config"):
+                setattr(model_config.hf_text_config, attr_name, attr_value)
+        else:
+            setattr(model_config, attr_name, attr_value)
+
+    return model_config
+
+
+def adjust_config_with_unaligned_cpu_tp(
+    model_config: ModelConfig, load_config: LoadConfig, tp_size: int
+) -> ModelConfig:
+    # Support the case where the num_attention_heads is not divisible by the TP size.
+    weight_block_size = may_get_weight_block_size(model_config, load_config)
+
+    model_config.hf_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+    model_config.hf_text_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+
+    model_config.hf_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+    model_config.hf_text_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+
+    if (
+        model_config.num_attention_heads % tp_size != 0
+        or model_config.get_total_num_kv_heads() % tp_size != 0
+    ):
+        # Compute the head_dim using the model_config.num_attention_heads before padding
+        if not hasattr(model_config.hf_config, "head_dim"):
+            model_config.hf_config.head_dim = (
+                model_config.hidden_size // model_config.num_attention_heads
+            )
+
+        query_heads_per_kv = (
+            model_config.num_attention_heads // model_config.get_total_num_kv_heads()
+        )
+        total_kv_heads = model_config.get_total_num_kv_heads()
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+        pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+        num_key_value_heads = pad_vocab_size(total_kv_heads, pad_size)
+
+        model_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_text_config.num_key_value_heads = num_key_value_heads
+
+        num_attention_heads = num_key_value_heads * query_heads_per_kv
+        model_config.num_attention_heads = num_attention_heads
+        model_config.hf_config.num_attention_heads = num_attention_heads
+        model_config.hf_text_config.num_attention_heads = num_attention_heads
+
+    intermediate_padding_size = tp_size * get_moe_padding_size(weight_block_size)
+    model_config = update_intermediate_size(
+        model_config, "moe_intermediate_size", intermediate_padding_size
+    )
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size", intermediate_padding_size
+    )
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size_mlp", intermediate_padding_size
+    )
+    if (
+        hasattr(model_config.hf_config, "vision_config")
+        and model_config.hf_config.vision_config.model_type == "siglip_vision_model"
+    ):
+        model_config.hf_config.vision_config.original_num_attention_heads = (
+            model_config.num_attention_heads
+        )
+        if model_config.hf_config.vision_config.num_attention_heads % tp_size != 0:
+            model_config.hf_config.vision_config.head_dim = (
+                model_config.hf_config.vision_config.hidden_size
+                // model_config.hf_config.vision_config.num_attention_heads
+            )
+            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+            pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+            model_config.hf_config.vision_config.num_attention_heads = pad_vocab_size(
+                model_config.hf_config.vision_config.num_attention_heads, pad_size
+            )
+        model_config.hf_config.vision_config = update_intermediate_size(
+            model_config.hf_config.vision_config,
+            "intermediate_size",
+            intermediate_padding_size,
+        )
+
+    return model_config
diff --git a/python/sglang/srt/configs/utils.py b/python/sglang/srt/configs/utils.py
new file mode 100644
index 000000000..8403d95da
--- /dev/null
+++ b/python/sglang/srt/configs/utils.py
@@ -0,0 +1,25 @@
+from typing import Type
+
+from transformers import (
+    AutoImageProcessor,
+    AutoProcessor,
+    BaseImageProcessor,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+
+
+def register_image_processor(
+    config: Type[PretrainedConfig], image_processor: Type[BaseImageProcessor]
+):
+    """
+    register customized hf image processor while removing hf impl
+    """
+    AutoImageProcessor.register(config, None, image_processor, None, exist_ok=True)
+
+
+def register_processor(config: Type[PretrainedConfig], processor: Type[ProcessorMixin]):
+    """
+    register customized hf processor while removing hf impl
+    """
+    AutoProcessor.register(config, processor, exist_ok=True)
diff --git a/python/sglang/srt/connector/__init__.py b/python/sglang/srt/connector/__init__.py
new file mode 100644
index 000000000..38e1d5eab
--- /dev/null
+++ b/python/sglang/srt/connector/__init__.py
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import enum
+import logging
+
+from sglang.srt.connector.base_connector import (
+    BaseConnector,
+    BaseFileConnector,
+    BaseKVConnector,
+)
+from sglang.srt.connector.redis import RedisConnector
+from sglang.srt.connector.s3 import S3Connector
+from sglang.srt.utils import parse_connector_type
+
+logger = logging.getLogger(__name__)
+
+
+class ConnectorType(str, enum.Enum):
+    FS = "filesystem"
+    KV = "KV"
+
+
+def create_remote_connector(url, **kwargs) -> BaseConnector:
+    connector_type = parse_connector_type(url)
+    if connector_type == "redis":
+        return RedisConnector(url)
+    elif connector_type == "s3":
+        return S3Connector(url)
+    else:
+        raise ValueError(f"Invalid connector type: {url}")
+
+
+def get_connector_type(client: BaseConnector) -> ConnectorType:
+    if isinstance(client, BaseKVConnector):
+        return ConnectorType.KV
+    if isinstance(client, BaseFileConnector):
+        return ConnectorType.FS
+
+    raise ValueError(f"Invalid connector type: {client}")
+
+
+__all__ = [
+    "BaseConnector",
+    "BaseFileConnector",
+    "BaseKVConnector",
+    "RedisConnector",
+    "S3Connector",
+    "ConnectorType",
+    "create_remote_connector",
+    "get_connector_type",
+]
diff --git a/python/sglang/srt/connector/base_connector.py b/python/sglang/srt/connector/base_connector.py
new file mode 100644
index 000000000..c9a1c36e2
--- /dev/null
+++ b/python/sglang/srt/connector/base_connector.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import shutil
+import signal
+import tempfile
+from abc import ABC, abstractmethod
+from typing import Generator, List, Optional, Tuple
+
+import torch
+
+
+class BaseConnector(ABC):
+    """
+    For fs connector such as s3:
+    <connector_type>://<path>/<filename>
+
+    For kv connector such as redis:
+    <connector_type>://<host>:<port>/<model_name>/keys/<key>
+    <connector_type://<host>:<port>/<model_name>/files/<filename>
+    """
+
+    def __init__(self, url: str):
+        self.url = url
+        self.closed = False
+        self.local_dir = tempfile.mkdtemp()
+        for sig in (signal.SIGINT, signal.SIGTERM):
+            existing_handler = signal.getsignal(sig)
+            signal.signal(sig, self._close_by_signal(existing_handler))
+
+    def get_local_dir(self):
+        return self.local_dir
+
+    @abstractmethod
+    def weight_iterator(
+        self, rank: int = 0
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def pull_files(
+        self,
+        allow_pattern: Optional[List[str]] = None,
+        ignore_pattern: Optional[List[str]] = None,
+    ) -> None:
+        raise NotImplementedError()
+
+    def close(self):
+        if self.closed:
+            return
+
+        self.closed = True
+        if os.path.exists(self.local_dir):
+            shutil.rmtree(self.local_dir)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def __del__(self):
+        self.close()
+
+    def _close_by_signal(self, existing_handler=None):
+
+        def new_handler(signum, frame):
+            self.close()
+            if existing_handler:
+                existing_handler(signum, frame)
+
+        return new_handler
+
+
+class BaseKVConnector(BaseConnector):
+
+    @abstractmethod
+    def get(self, key: str) -> Optional[torch.Tensor]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def getstr(self, key: str) -> Optional[str]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def set(self, key: str, obj: torch.Tensor) -> None:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def setstr(self, key: str, obj: str) -> None:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def list(self, prefix: str) -> List[str]:
+        raise NotImplementedError()
+
+
+class BaseFileConnector(BaseConnector):
+    """
+    List full file names from remote fs path and filter by allow pattern.
+
+    Args:
+        allow_pattern: A list of patterns of which files to pull.
+
+    Returns:
+        list[str]: List of full paths allowed by the pattern
+    """
+
+    @abstractmethod
+    def glob(self, allow_pattern: str) -> List[str]:
+        raise NotImplementedError()
diff --git a/python/sglang/srt/connector/redis.py b/python/sglang/srt/connector/redis.py
new file mode 100644
index 000000000..cb1db3f7c
--- /dev/null
+++ b/python/sglang/srt/connector/redis.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Generator, List, Optional, Tuple
+from urllib.parse import urlparse
+
+import torch
+
+from sglang.srt.connector import BaseKVConnector
+from sglang.srt.connector.serde import create_serde
+from sglang.srt.connector.utils import pull_files_from_db
+
+logger = logging.getLogger(__name__)
+
+
+class RedisConnector(BaseKVConnector):
+
+    def __init__(self, url: str):
+        import redis
+
+        super().__init__(url)
+        parsed_url = urlparse(url)
+        self.connection = redis.Redis(host=parsed_url.hostname, port=parsed_url.port)
+        self.model_name = parsed_url.path.lstrip("/")
+        # TODO: more serde options
+        self.s, self.d = create_serde("safe")
+
+    def get(self, key: str) -> Optional[torch.Tensor]:
+        val = self.connection.get(key)
+
+        if val is None:
+            logger.error("Key %s not found", key)
+            return None
+
+        return self.d.from_bytes(val)
+
+    def getstr(self, key: str) -> Optional[str]:
+        val = self.connection.get(key)
+        if val is None:
+            logger.error("Key %s not found", key)
+            return None
+
+        return val.decode("utf-8")
+
+    def set(self, key: str, tensor: torch.Tensor) -> None:
+        assert tensor is not None
+        self.connection.set(key, self.s.to_bytes(tensor))
+
+    def setstr(self, key: str, obj: str) -> None:
+        self.connection.set(key, obj)
+
+    def list(self, prefix: str) -> List[str]:
+        cursor = 0
+        all_keys: List[bytes] = []
+
+        while True:
+            ret: Tuple[int, List[bytes]] = self.connection.scan(
+                cursor=cursor, match=f"{prefix}*"
+            )  # type: ignore
+            cursor, keys = ret
+            all_keys.extend(keys)
+            if cursor == 0:
+                break
+
+        return [key.decode("utf-8") for key in all_keys]
+
+    def weight_iterator(
+        self, rank: int = 0
+    ) -> Generator[Tuple[str, bytes], None, None]:
+        keys = self.list(f"{self.model_name}/keys/rank_{rank}/")
+        for key in keys:
+            val = self.get(key)
+            key = key.removeprefix(f"{self.model_name}/keys/rank_{rank}/")
+            yield key, val
+
+    def pull_files(
+        self,
+        allow_pattern: Optional[List[str]] = None,
+        ignore_pattern: Optional[List[str]] = None,
+    ) -> None:
+        pull_files_from_db(self, self.model_name, allow_pattern, ignore_pattern)
+
+    def close(self):
+        self.connection.close()
+        super().close()
diff --git a/python/sglang/srt/connector/s3.py b/python/sglang/srt/connector/s3.py
new file mode 100644
index 000000000..7bef8f5d5
--- /dev/null
+++ b/python/sglang/srt/connector/s3.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import fnmatch
+import os
+from pathlib import Path
+from typing import Generator, Optional, Tuple
+
+import torch
+
+from sglang.srt.connector import BaseFileConnector
+
+
+def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
+    return [
+        path
+        for path in paths
+        if any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
+    ]
+
+
+def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
+    return [
+        path
+        for path in paths
+        if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
+    ]
+
+
+def list_files(
+    s3,
+    path: str,
+    allow_pattern: Optional[list[str]] = None,
+    ignore_pattern: Optional[list[str]] = None,
+) -> tuple[str, str, list[str]]:
+    """
+    List files from S3 path and filter by pattern.
+
+    Args:
+        s3: S3 client to use.
+        path: The S3 path to list from.
+        allow_pattern: A list of patterns of which files to pull.
+        ignore_pattern: A list of patterns of which files not to pull.
+
+    Returns:
+        tuple[str, str, list[str]]: A tuple where:
+            - The first element is the bucket name
+            - The second element is string represent the bucket
+              and the prefix as a dir like string
+            - The third element is a list of files allowed or
+              disallowed by pattern
+    """
+    parts = path.removeprefix("s3://").split("/")
+    prefix = "/".join(parts[1:])
+    bucket_name = parts[0]
+
+    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+    paths = [obj["Key"] for obj in objects.get("Contents", [])]
+
+    paths = _filter_ignore(paths, ["*/"])
+    if allow_pattern is not None:
+        paths = _filter_allow(paths, allow_pattern)
+
+    if ignore_pattern is not None:
+        paths = _filter_ignore(paths, ignore_pattern)
+
+    return bucket_name, prefix, paths
+
+
+class S3Connector(BaseFileConnector):
+
+    def __init__(self, url: str) -> None:
+        import boto3
+
+        super().__init__(url)
+        self.client = boto3.client("s3")
+
+    def glob(self, allow_pattern: Optional[list[str]] = None) -> list[str]:
+        bucket_name, _, paths = list_files(
+            self.client, path=self.url, allow_pattern=allow_pattern
+        )
+        return [f"s3://{bucket_name}/{path}" for path in paths]
+
+    def pull_files(
+        self,
+        allow_pattern: Optional[list[str]] = None,
+        ignore_pattern: Optional[list[str]] = None,
+    ) -> None:
+        """
+        Pull files from S3 storage into the temporary directory.
+
+        Args:
+            s3_model_path: The S3 path of the model.
+            allow_pattern: A list of patterns of which files to pull.
+            ignore_pattern: A list of patterns of which files not to pull.
+
+        """
+        bucket_name, base_dir, files = list_files(
+            self.client, self.url, allow_pattern, ignore_pattern
+        )
+        if len(files) == 0:
+            return
+
+        for file in files:
+            destination_file = os.path.join(self.local_dir, file.removeprefix(base_dir))
+            local_dir = Path(destination_file).parent
+            os.makedirs(local_dir, exist_ok=True)
+            self.client.download_file(bucket_name, file, destination_file)
+
+    def weight_iterator(
+        self, rank: int = 0
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        from sglang.srt.model_loader.weight_utils import (
+            runai_safetensors_weights_iterator,
+        )
+
+        # only support safetensor files now
+        hf_weights_files = self.glob(allow_pattern=["*.safetensors"])
+        return runai_safetensors_weights_iterator(hf_weights_files)
+
+    def close(self):
+        self.client.close()
+        super().close()
diff --git a/python/sglang/srt/connector/serde/__init__.py b/python/sglang/srt/connector/serde/__init__.py
new file mode 100644
index 000000000..c05b20afa
--- /dev/null
+++ b/python/sglang/srt/connector/serde/__init__.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# inspired by LMCache
+from typing import Optional, Tuple
+
+import torch
+
+from sglang.srt.connector.serde.safe_serde import SafeDeserializer, SafeSerializer
+from sglang.srt.connector.serde.serde import Deserializer, Serializer
+
+
+def create_serde(serde_type: str) -> Tuple[Serializer, Deserializer]:
+    s: Optional[Serializer] = None
+    d: Optional[Deserializer] = None
+
+    if serde_type == "safe":
+        s = SafeSerializer()
+        d = SafeDeserializer()
+    else:
+        raise ValueError(f"Unknown serde type: {serde_type}")
+
+    return s, d
+
+
+__all__ = [
+    "Serializer",
+    "Deserializer",
+    "SafeSerializer",
+    "SafeDeserializer",
+    "create_serde",
+]
diff --git a/python/sglang/srt/connector/serde/safe_serde.py b/python/sglang/srt/connector/serde/safe_serde.py
new file mode 100644
index 000000000..3e75f9bfc
--- /dev/null
+++ b/python/sglang/srt/connector/serde/safe_serde.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Union
+
+import torch
+from safetensors.torch import load, save
+
+from sglang.srt.connector.serde.serde import Deserializer, Serializer
+
+
+class SafeSerializer(Serializer):
+
+    def __init__(self):
+        super().__init__()
+
+    def to_bytes(self, t: torch.Tensor) -> bytes:
+        return save({"tensor_bytes": t.cpu().contiguous()})
+
+
+class SafeDeserializer(Deserializer):
+
+    def __init__(self):
+        # TODO: dtype options
+        super().__init__(torch.float32)
+
+    def from_bytes_normal(self, b: Union[bytearray, bytes]) -> torch.Tensor:
+        return load(bytes(b))["tensor_bytes"]
+
+    def from_bytes(self, b: Union[bytearray, bytes]) -> torch.Tensor:
+        return self.from_bytes_normal(b)
diff --git a/python/sglang/srt/connector/serde/serde.py b/python/sglang/srt/connector/serde/serde.py
new file mode 100644
index 000000000..3d6f804d7
--- /dev/null
+++ b/python/sglang/srt/connector/serde/serde.py
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import abc
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class Serializer(ABC):
+
+    @abstractmethod
+    def to_bytes(self, t: torch.Tensor) -> bytes:
+        """
+        Serialize a pytorch tensor to bytes. The serialized bytes should contain
+        both the data and the metadata (shape, dtype, etc.) of the tensor.
+
+        Input:
+            t: the input pytorch tensor, can be on any device, in any shape,
+               with any dtype
+
+        Returns:
+            bytes: the serialized bytes
+        """
+        raise NotImplementedError
+
+
+class Deserializer(metaclass=abc.ABCMeta):
+
+    def __init__(self, dtype):
+        self.dtype = dtype
+
+    @abstractmethod
+    def from_bytes(self, bs: bytes) -> torch.Tensor:
+        """
+        Deserialize a pytorch tensor from bytes.
+
+        Input:
+            bytes: a stream of bytes
+
+        Output:
+            torch.Tensor: the deserialized pytorch tensor
+        """
+        raise NotImplementedError
diff --git a/python/sglang/srt/connector/utils.py b/python/sglang/srt/connector/utils.py
new file mode 100644
index 000000000..6cd12da33
--- /dev/null
+++ b/python/sglang/srt/connector/utils.py
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+from sglang.srt.connector import BaseConnector
+
+
+def parse_model_name(url: str) -> str:
+    """
+    Parse the model name from the url.
+    Only used for db connector
+    """
+    parsed_url = urlparse(url)
+    return parsed_url.path.lstrip("/")
+
+
+def pull_files_from_db(
+    connector: BaseConnector,
+    model_name: str,
+    allow_pattern: Optional[list[str]] = None,
+    ignore_pattern: Optional[list[str]] = None,
+) -> None:
+    prefix = f"{model_name}/files/"
+    local_dir = connector.get_local_dir()
+    files = connector.list(prefix)
+
+    for file in files:
+        destination_file = os.path.join(local_dir, file.removeprefix(prefix))
+        local_dir = Path(destination_file).parent
+        os.makedirs(local_dir, exist_ok=True)
+        with open(destination_file, "wb") as f:
+            f.write(connector.getstr(file).encode("utf-8"))
diff --git a/python/sglang/srt/constants.py b/python/sglang/srt/constants.py
new file mode 100644
index 000000000..aa03a089b
--- /dev/null
+++ b/python/sglang/srt/constants.py
@@ -0,0 +1,3 @@
+# GPU Memory Types
+GPU_MEMORY_TYPE_KV_CACHE = "kv_cache"
+GPU_MEMORY_TYPE_WEIGHTS = "weights"
diff --git a/python/sglang/srt/constrained/base_grammar_backend.py b/python/sglang/srt/constrained/base_grammar_backend.py
new file mode 100644
index 000000000..4fe5d6c77
--- /dev/null
+++ b/python/sglang/srt/constrained/base_grammar_backend.py
@@ -0,0 +1,213 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The baseclass of a backend for grammar-guided constrained decoding."""
+
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from threading import Event
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+class BaseGrammarObject:
+
+    def __init__(self):
+        self._finished = False
+
+    def accept_token(self, token: int) -> None:
+        """
+        Accept a token in the grammar.
+        """
+        raise NotImplementedError()
+
+    def rollback(self, k: int):
+        raise NotImplementedError()
+
+    def is_terminated(self):
+        return False
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        raise NotImplementedError()
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        raise NotImplementedError()
+
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        raise NotImplementedError()
+
+    def copy(self) -> "BaseGrammarObject":
+        return self
+
+    @property
+    def finished(self):
+        return self._finished
+
+    @finished.setter
+    def finished(self, finished):
+        self._finished = finished
+
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
+        """
+        Try to jump forward in the grammar.
+
+        Returns:
+            A jump forward helper which may be used in `jump_forward_str_state`.
+            None if the jump forward is not possible.
+        """
+        raise NotImplementedError()
+
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        """
+        Jump forward for the grammar.
+
+        Returns:
+            A tuple of the jump forward string and the next state of the grammar
+            (which can be used in `jump_and_retokenize` if needed).
+        """
+        raise NotImplementedError()
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ) -> None:
+        """
+        Jump forward occurs, and update the grammar state if needed.
+        """
+        raise NotImplementedError()
+
+
+INVALID_GRAMMAR_OBJ = BaseGrammarObject()
+
+
+@dataclass
+class CacheEntry:
+    value: BaseGrammarObject
+    event: Event
+
+
+class BaseGrammarBackend:
+    def __init__(self):
+        self.executor = ThreadPoolExecutor()
+        self.cache: Dict[Tuple[str, str], CacheEntry] = {}
+
+    def _not_supported(self, key_type: str, key_string: str) -> None:
+        logger.warning(f"Skip unsupported {key_type=}, {key_string=}")
+
+    def dispatch_fallback(
+        self, key_type: str, key_string: str
+    ) -> Optional[BaseGrammarObject]:
+        """
+        This function should not be reached in any case.
+        """
+        raise ValueError(f"Invalid key_type: {key_type}={key_string}")
+
+    def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("json", key_string)
+
+    def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("regex", key_string)
+
+    def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("ebnf", key_string)
+
+    def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("structural_tag", key_string)
+
+    def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
+        key_type, key_string = key
+        if key_type == "json":
+            return self.dispatch_json(key_string)
+        elif key_type == "regex":
+            return self.dispatch_regex(key_string)
+        elif key_type == "ebnf":
+            return self.dispatch_ebnf(key_string)
+        elif key_type == "structural_tag":
+            return self.dispatch_structural_tag(key_string)
+        elif key_type == "structural_pattern":
+            return self.dispatch_structural_pattern(key_string)
+        else:
+            return self.dispatch_fallback(key_type, key_string)
+
+    def get_cached_or_future_value(
+        self, key: Tuple[str, str]
+    ) -> Optional[BaseGrammarObject]:
+        value = self.cache.get(key)
+        if value:
+            return value.copy(), True
+        value = self.executor.submit(self._init_value_dispatch, key)
+        return value, False
+
+    def set_cache(self, key: Tuple[str, str], value: BaseGrammarObject):
+        self.cache[key] = value
+
+    def reset(self):
+        self.cache.clear()
+
+
+def create_grammar_backend(
+    server_args: ServerArgs,
+    tokenizer,
+    vocab_size: int,
+    eos_token_ids: Optional[set] = None,
+) -> Optional[BaseGrammarBackend]:
+    if server_args.grammar_backend == "outlines":
+        from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
+
+        grammar_backend = OutlinesGrammarBackend(
+            tokenizer,
+            whitespace_pattern=server_args.constrained_json_whitespace_pattern,
+        )
+    elif server_args.grammar_backend == "xgrammar":
+        from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend
+
+        # Convert Set[int] to List[int] if needed
+        eos_list = list(eos_token_ids) if eos_token_ids else None
+
+        grammar_backend = XGrammarGrammarBackend(
+            tokenizer, vocab_size=vocab_size, model_eos_token_ids=eos_list
+        )
+    elif server_args.grammar_backend == "llguidance":
+        from sglang.srt.constrained.llguidance_backend import GuidanceBackend
+
+        grammar_backend = GuidanceBackend(
+            tokenizer=tokenizer,
+            whitespace_pattern=server_args.constrained_json_whitespace_pattern,
+        )
+    elif server_args.grammar_backend == "none":
+        return None
+    else:
+        raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
+
+    if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
+        from sglang.srt.constrained.reasoner_grammar_backend import (
+            ReasonerGrammarBackend,
+        )
+
+        grammar_backend = ReasonerGrammarBackend(
+            grammar_backend, tokenizer.think_end_id
+        )
+
+    return grammar_backend
diff --git a/python/sglang/srt/constrained/llguidance_backend.py b/python/sglang/srt/constrained/llguidance_backend.py
new file mode 100644
index 000000000..2acbf2c51
--- /dev/null
+++ b/python/sglang/srt/constrained/llguidance_backend.py
@@ -0,0 +1,174 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constrained decoding with llguidance backend."""
+
+import json
+import logging
+import os
+from typing import List, Optional, Tuple
+
+import torch
+from llguidance import LLMatcher, LLTokenizer, StructTag, grammar_from
+from llguidance.hf import from_tokenizer
+from llguidance.torch import (
+    allocate_token_bitmask,
+    apply_token_bitmask_inplace,
+    fill_next_token_bitmask,
+)
+
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    BaseGrammarBackend,
+    BaseGrammarObject,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class GuidanceGrammar(BaseGrammarObject):
+
+    def __init__(self, llguidance_tokenizer: LLTokenizer, serialized_grammar: str):
+        super().__init__()
+        self.llguidance_tokenizer = llguidance_tokenizer
+        self.serialized_grammar = serialized_grammar
+
+        self.ll_matcher = LLMatcher(
+            self.llguidance_tokenizer,
+            self.serialized_grammar,
+            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
+        )
+        self.finished = False
+        self.bitmask = None
+
+    def accept_token(self, token: int):
+        if not self.ll_matcher.consume_token(token):
+            logger.warning(f"matcher error: {self.ll_matcher.get_error()}")
+            self.finished = True
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        if self.ll_matcher.is_stopped():
+            self.finished = True
+
+        fill_next_token_bitmask(self.ll_matcher, vocab_mask, idx)
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        if self.bitmask is None or self.bitmask.shape[0] < batch_size:
+            # only create bitmask when batch gets larger
+            self.bitmask = allocate_token_bitmask(
+                batch_size, self.llguidance_tokenizer.vocab_size
+            )
+            bitmask = self.bitmask
+        else:
+            bitmask = self.bitmask[:batch_size]
+
+        return bitmask
+
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask.to(device, non_blocking=True)
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        apply_token_bitmask_inplace(logits, vocab_mask)
+
+    def copy(self):
+        return GuidanceGrammar(
+            llguidance_tokenizer=self.llguidance_tokenizer,
+            serialized_grammar=self.serialized_grammar,
+        )
+
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
+        ff_tokens = self.ll_matcher.compute_ff_tokens()
+        if ff_tokens:
+            return ff_tokens, ""
+        else:
+            return None
+
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        return "", -1
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        pass
+
+
+class GuidanceBackend(BaseGrammarBackend):
+
+    def __init__(
+        self,
+        tokenizer,
+        whitespace_pattern: Optional[str] = None,
+        n_vocab: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.tokenizer = tokenizer
+        self.whitespace_pattern = whitespace_pattern
+        self.llguidance_tokenizer = from_tokenizer(self.tokenizer, n_vocab)
+
+    def _from_serialized(self, serialized_grammar) -> Optional[GuidanceGrammar]:
+        try:
+            return GuidanceGrammar(
+                llguidance_tokenizer=self.llguidance_tokenizer,
+                serialized_grammar=serialized_grammar,
+            )
+        except Exception as e:
+            logger.error(f"Hit invalid grammar: {serialized_grammar=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+
+    def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
+        try:
+            serialized_grammar = LLMatcher.grammar_from_json_schema(
+                key_string,
+                defaults={
+                    "whitespace_pattern": self.whitespace_pattern,
+                },
+            )
+        except Exception as e:
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+        return self._from_serialized(serialized_grammar)
+
+    def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
+        serialized_grammar = grammar_from("regex", key_string)
+        return self._from_serialized(serialized_grammar)
+
+    def dispatch_ebnf(self, key_string: str) -> Optional[GuidanceGrammar]:
+        try:
+            serialized_grammar = grammar_from("ebnf", key_string)
+            return self._from_serialized(serialized_grammar)
+        except ValueError as e:
+            logger.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+
+    def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
+        try:
+            structural_tag = json.loads(key_string)
+            tags = [
+                StructTag(
+                    begin=structure["begin"],
+                    grammar=structure["schema"],
+                    end=structure["end"],
+                    trigger=structural_tag["triggers"][0],  # TODO?
+                )
+                for structure in structural_tag["structures"]
+            ]
+            g = StructTag.to_grammar(tags)
+            return self._from_serialized(g)
+        except Exception as e:
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
diff --git a/python/sglang/srt/constrained/outlines_backend.py b/python/sglang/srt/constrained/outlines_backend.py
new file mode 100644
index 000000000..5302fadaa
--- /dev/null
+++ b/python/sglang/srt/constrained/outlines_backend.py
@@ -0,0 +1,191 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constrained decoding with outlines backend."""
+
+import json
+import logging
+from typing import Dict, List, Optional, Tuple, Union
+
+import interegular
+import torch
+from outlines.fsm.guide import RegexGuide
+from outlines.models.transformers import TransformerTokenizer
+from pydantic import BaseModel
+
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    BaseGrammarBackend,
+    BaseGrammarObject,
+)
+from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
+
+try:
+    from outlines.fsm.json_schema import build_regex_from_schema
+except ImportError:
+    from outlines_core.fsm.json_schema import build_regex_from_schema
+
+
+logger = logging.getLogger(__name__)
+
+
+class OutlinesGrammar(BaseGrammarObject):
+    def __init__(
+        self,
+        guide: RegexGuide,
+        jump_forward_map: Union[OutlinesJumpForwardMap, None],
+    ) -> None:
+        super().__init__()
+        self.guide = guide
+        self.jump_forward_map = jump_forward_map
+        self.state = 0
+        self.finished = False
+
+    def accept_token(self, token: int):
+        self.state = self.guide.get_next_state(self.state, token)
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        return torch.zeros(batch_size, vocab_size, dtype=torch.bool, device=device)
+
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        tokens = torch.tensor(
+            self.guide.get_next_instruction(self.state).tokens, dtype=torch.int64
+        ).to(vocab_mask.device, non_blocking=True)
+        vocab_mask = vocab_mask[idx]
+        vocab_mask.fill_(1)
+        vocab_mask.scatter_(0, tokens, torch.zeros_like(tokens, dtype=torch.bool))
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor):
+        logits.masked_fill_(vocab_mask, float("-inf"))
+
+    def copy(self):
+        return OutlinesGrammar(self.guide, self.jump_forward_map)
+
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple]:
+        if not self.jump_forward_map:
+            return None
+
+        jump_forward_bytes = self.jump_forward_map.jump_forward_byte(self.state)
+        if jump_forward_bytes is None or len(jump_forward_bytes) <= 1:
+            return None
+
+        # preprocess the jump forward string
+        suffix_bytes = []
+        continuation_range = range(0x80, 0xC0)
+        cur_state = self.state
+        while (
+            len(jump_forward_bytes) and jump_forward_bytes[0][0] in continuation_range
+        ):
+            # continuation bytes
+            byte_edge = jump_forward_bytes.pop(0)
+            suffix_bytes.append(byte_edge[0])
+            cur_state = byte_edge[1]
+
+        suffix_tokens = [f"<0x{hex(b)[2:].upper()}>" for b in suffix_bytes]
+        suffix_ids = tokenizer.convert_tokens_to_ids(suffix_tokens)
+        return suffix_ids, cur_state
+
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        _, cur_state = helper
+        return self.jump_forward_map.jump_forward_symbol(cur_state)
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        self.state = next_state
+
+
+class OutlinesGrammarBackend(BaseGrammarBackend):
+    def __init__(
+        self,
+        tokenizer,
+        whitespace_pattern: bool,
+    ):
+        super().__init__()
+
+        try:
+            self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+        except AttributeError:
+            # FIXME: tmp fix for chatglm2 & chatglm3 (pad_token_id=0)
+            origin_pad_token_id = tokenizer.pad_token_id
+
+            def fset(self, value):
+                self._value = value
+
+            type(tokenizer).pad_token_id = property(
+                fget=type(tokenizer).pad_token_id.fget, fset=fset
+            )
+            self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+            self.outlines_tokenizer.tokenizer.pad_token_id = origin_pad_token_id
+            self.outlines_tokenizer.pad_token_id = origin_pad_token_id
+            self.outlines_tokenizer.pad_token = (
+                self.outlines_tokenizer.tokenizer.pad_token
+            )
+            self.outlines_tokenizer.vocabulary = (
+                self.outlines_tokenizer.tokenizer.get_vocab()
+            )
+        self.whitespace_pattern = whitespace_pattern
+
+    def _compile_regex(self, regex: str) -> Optional[OutlinesGrammar]:
+        try:
+            if hasattr(RegexGuide, "from_regex"):
+                # outlines >= 0.1.1
+                guide = RegexGuide.from_regex(regex, self.outlines_tokenizer)
+            else:
+                # outlines <= 0.0.46
+                guide = RegexGuide(regex, self.outlines_tokenizer)
+        except interegular.patterns.InvalidSyntax as e:
+            logger.error(f"Hit invalid regex schema: {regex=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+
+        jump_forward_map = None
+        return OutlinesGrammar(guide, jump_forward_map)
+
+    def dispatch_ebnf(self, key_string: str):
+        return super().dispatch_ebnf(key_string)
+
+    def dispatch_structural_tag(self, key_string: str):
+        return super().dispatch_structural_tag(key_string)
+
+    def dispatch_json(self, key_string: str):
+        try:
+            regex = build_regex_from_object(
+                key_string,
+                whitespace_pattern=self.whitespace_pattern,
+            )
+        except (NotImplementedError, json.decoder.JSONDecodeError, ValueError) as e:
+            logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+        return self._compile_regex(regex)
+
+    def dispatch_regex(self, key_string: str):
+        return self._compile_regex(key_string)
+
+
+def build_regex_from_object(
+    object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
+):
+    if isinstance(object, type(BaseModel)):
+        schema = json.dumps(object.model_json_schema())
+    elif isinstance(object, Dict):
+        schema = json.dumps(object)
+    else:
+        schema = object
+    return build_regex_from_schema(schema, whitespace_pattern)
diff --git a/python/sglang/srt/constrained/outlines_jump_forward.py b/python/sglang/srt/constrained/outlines_jump_forward.py
new file mode 100644
index 000000000..cfc65f75f
--- /dev/null
+++ b/python/sglang/srt/constrained/outlines_jump_forward.py
@@ -0,0 +1,200 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Faster constrained decoding with jump forward decoding / compressed finite state machine.
+Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
+"""
+
+import dataclasses
+import logging
+from collections import defaultdict
+from typing import Optional
+
+import interegular
+from interegular import InvalidSyntax
+from outlines.caching import cache
+
+from sglang.srt.utils import get_bool_env_var
+
+try:
+    # outlines >= 0.1.0
+    from outlines_core.fsm.outlines_core_rs import FSMInfo
+    from outlines_core.fsm.regex import make_byte_level_fsm, make_deterministic_fsm
+except ImportError:
+    # outlines <= 0.0.46
+    from outlines.fsm.regex import FSMInfo, make_byte_level_fsm, make_deterministic_fsm
+
+IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
+
+# Env var was set in sglang.srt.server_args.ServerArgs.__post__init__
+DISABLE_DISK_CACHE = get_bool_env_var("SGLANG_DISABLE_OUTLINES_DISK_CACHE", "true")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class JumpEdge:
+    symbol: str = None
+    symbol_next_state: int = None
+    byte: int = None
+    byte_next_state: int = None
+
+
+def disk_cache(expire: Optional[float] = None, typed=False, ignore=()):
+    if not DISABLE_DISK_CACHE:
+        return cache(expire, typed, ignore)
+    else:
+        return lambda fn: None
+
+
+@disk_cache()
+def init_state_to_jump_forward(regex_string):
+    try:
+        regex_pattern = interegular.parse_pattern(regex_string)
+    except InvalidSyntax as e:
+        logger.warning(f"skip invalid regex: {regex_string}, {e=}")
+        return
+
+    byte_fsm = make_byte_level_fsm(regex_pattern.to_fsm().reduce(), keep_utf8=True)
+    regex_fsm, _ = make_deterministic_fsm(byte_fsm)
+
+    fsm_info: FSMInfo = regex_fsm.fsm_info
+
+    symbol_to_id = fsm_info.alphabet_symbol_mapping
+    id_to_symbol = {}
+    for symbol, id_ in symbol_to_id.items():
+        id_to_symbol.setdefault(id_, []).append(symbol)
+
+    transitions = fsm_info.transitions
+
+    outgoings_ct = defaultdict(int)
+    # NOTE(lsyin): Final states can lead to terminate, so they have one outgoing edge naturally
+    for s in fsm_info.finals:
+        outgoings_ct[s] = 1
+
+    state_to_jump_forward = {}
+    for (state, id_), next_state in transitions.items():
+        if id_ == fsm_info.alphabet_anything_value:
+            # Arbitrarily symbol cannot be recognized as jump forward
+            continue
+
+        symbols = id_to_symbol[id_]
+        for c in symbols:
+            if len(c) > 1:
+                # Skip byte level transitions like c = "5E"
+                continue
+
+            outgoings_ct[state] += 1
+            if outgoings_ct[state] > 1:
+                if state in state_to_jump_forward:
+                    del state_to_jump_forward[state]
+                break
+
+            state_to_jump_forward[state] = JumpEdge(
+                symbol=c,
+                symbol_next_state=next_state,
+            )
+
+    # Process the byte level jump forward
+    outgoings_ct = defaultdict(int)
+    for s in fsm_info.finals:
+        outgoings_ct[s] = 1
+
+    for (state, id_), next_state in transitions.items():
+        if id_ == fsm_info.alphabet_anything_value:
+            continue
+        symbols = id_to_symbol[id_]
+        for c in symbols:
+            byte_ = None
+            if len(c) == 1 and ord(c) < 0x80:
+                # ASCII character
+                byte_ = ord(c)
+            elif len(c) > 1:
+                # FIXME: This logic is due to the leading \x00
+                # https://github.com/outlines-dev/outlines/pull/930
+                byte_ = int(symbols[0][1:], 16)
+
+            if byte_ is not None:
+                outgoings_ct[state] += 1
+                if outgoings_ct[state] > 1:
+                    if state in state_to_jump_forward:
+                        del state_to_jump_forward[state]
+                    break
+                e = state_to_jump_forward.get(state, JumpEdge())
+                e.byte = byte_
+                e.byte_next_state = next_state
+                state_to_jump_forward[state] = e
+
+    return state_to_jump_forward
+
+
+class OutlinesJumpForwardMap:
+    def __init__(self, regex_string):
+        self.state_to_jump_forward = init_state_to_jump_forward(regex_string)
+
+    def jump_forward_symbol(self, state):
+        jump_forward_str = ""
+        next_state = state
+        while state in self.state_to_jump_forward:
+            e = self.state_to_jump_forward[state]
+            if e.symbol is None:
+                break
+            jump_forward_str += e.symbol
+            next_state = e.symbol_next_state
+            state = next_state
+
+        return jump_forward_str, next_state
+
+    def jump_forward_byte(self, state):
+        if state not in self.state_to_jump_forward:
+            return None
+
+        jump_forward_bytes = []
+        next_state = None
+        while state in self.state_to_jump_forward:
+            e = self.state_to_jump_forward[state]
+            assert e.byte is not None and e.byte_next_state is not None
+            jump_forward_bytes.append((e.byte, e.byte_next_state))
+            next_state = e.byte_next_state
+            state = next_state
+
+        return jump_forward_bytes
+
+    def is_jump_forward_symbol_state(self, state):
+        return (
+            state in self.state_to_jump_forward
+            and self.state_to_jump_forward[state].symbol is not None
+        )
+
+
+def test_main(regex_string):
+    jump_forward_map = OutlinesJumpForwardMap(regex_string)
+    for state, e in jump_forward_map.state_to_jump_forward.items():
+        if e.symbol is not None:
+            jump_forward_str, next_state = jump_forward_map.jump_forward_symbol(state)
+            print(f"{state} -> {next_state}", jump_forward_str)
+        bytes_ = jump_forward_map.jump_forward_byte(state)
+        print(f"{state} -> {bytes_[-1][1]}", [hex(b) for b, _ in bytes_])
+
+
+if __name__ == "__main__":
+    import outlines
+
+    outlines.caching.clear_cache()
+    test_main(r"The google's DNS sever address is " + IP_REGEX)
+    test_main(r"霍格沃茨特快列车|霍比特人比尔博")
+    # 霍格: \xe9\x9c\x8d \xe6\xa0\xbc ...
+    # 霍比: \xe9\x9c\x8d \xe6\xaf\x94 ...
+
+    test_main(r"[-+]?[0-9]+[ ]*")
diff --git a/python/sglang/srt/constrained/reasoner_grammar_backend.py b/python/sglang/srt/constrained/reasoner_grammar_backend.py
new file mode 100644
index 000000000..ca5f118ef
--- /dev/null
+++ b/python/sglang/srt/constrained/reasoner_grammar_backend.py
@@ -0,0 +1,90 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The baseclass of a backend for reasoner grammar-guided constrained decoding."""
+
+from typing import List, Optional, Tuple
+
+import torch
+
+from .base_grammar_backend import BaseGrammarBackend, BaseGrammarObject
+
+
+class ReasonerGrammarObject(BaseGrammarObject):
+    def __init__(self, grammar: BaseGrammarObject, think_end_id):
+        super().__init__()
+        self.grammar = grammar
+        self.think_end_id = think_end_id
+        self.is_in_reasoning = True
+
+    def accept_token(self, token: int):
+        if token == self.think_end_id:
+            self.is_in_reasoning = False
+
+        if not self.is_in_reasoning and token != self.think_end_id:
+            self.grammar.accept_token(token)
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        return self.grammar.allocate_vocab_mask(vocab_size, batch_size, device)
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        if not self.is_in_reasoning:
+            self.grammar.fill_vocab_mask(vocab_mask, idx)
+
+    def move_vocab_mask(self, vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return self.grammar.move_vocab_mask(vocab_mask, device)
+
+    @property
+    def apply_vocab_mask(self):
+        return self.grammar.apply_vocab_mask
+
+    def copy(self) -> BaseGrammarObject:
+        return ReasonerGrammarObject(self.grammar.copy(), self.think_end_id)
+
+    @property
+    def finished(self):
+        return self.grammar.finished
+
+    @finished.setter
+    def finished(self, finished):
+        self.grammar.finished = finished
+
+    def try_jump_forward(self, tokenizer):
+        return self.grammar.try_jump_forward(tokenizer)
+
+    def jump_forward_str_state(self, helper):
+        return self.grammar.jump_forward_str_state(helper)
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        return self.grammar.jump_and_retokenize(
+            old_output_ids, new_output_ids, next_state
+        )
+
+
+class ReasonerGrammarBackend(BaseGrammarBackend):
+    def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
+        super().__init__()
+        self.grammar_backend = grammar_backend
+        self.think_end_id = think_end_id
+
+    def _init_value_dispatch(
+        self, key: Tuple[str, str]
+    ) -> Optional[ReasonerGrammarObject]:
+        ret = self.grammar_backend._init_value_dispatch(key)
+        if ret is None:
+            return None
+        return ReasonerGrammarObject(ret, self.think_end_id)
diff --git a/python/sglang/srt/constrained/triton_ops/bitmask_ops.py b/python/sglang/srt/constrained/triton_ops/bitmask_ops.py
new file mode 100644
index 000000000..9a195c006
--- /dev/null
+++ b/python/sglang/srt/constrained/triton_ops/bitmask_ops.py
@@ -0,0 +1,141 @@
+# Adapt from
+# https://github.com/mlc-ai/xgrammar/blob/v0.1.17/python/xgrammar/kernels/apply_token_bitmask_inplace_triton.py
+
+from typing import List, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import get_device_core_count
+
+
+@triton.jit
+def apply_token_bitmask_inplace_kernel(
+    logits_ptr,
+    bitmask_ptr,
+    indices_ptr,
+    num_rows,
+    vocab_size,
+    logits_strides,
+    bitmask_strides,
+    NUM_SMS: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Apply a bitmask to logits in-place using Triton. The bitmask is a 01 bitwise compressed tensor,
+    where 0 means the token is masked and 1 means the token is not masked. After applying the bitmask,
+    the masked logits will be set to -inf.
+
+    Parameters
+    ----------
+    logits_ptr : tl.tensor
+        Pointer to the logits tensor to apply the bitmask to.
+
+    bitmask_ptr : tl.tensor
+        Pointer to the bitmask tensor to apply.
+
+    indices_ptr : Optional[tl.tensor]
+        Optional pointer to indices tensor specifying which rows to apply the mask to.
+
+    num_rows : int
+        Number of rows to process. If indices_ptr is provided, this is the number of unique indices.
+
+    vocab_size : int
+        Size of the vocabulary dimension. If the logits does not have a vocab padding, this is the
+        same as the logits's second dimension. Otherwise, this is the actual size of the vocabulary.
+
+    logits_strides : int
+        Stride between rows in the logits tensor.
+
+    bitmask_strides : int
+        Stride between rows in the bitmask tensor.
+
+    NUM_SMS : int
+        Number of streaming multiprocessors to use.
+
+    BLOCK_SIZE : int
+        Size of processing blocks.
+    """
+
+    pid = tl.program_id(0)
+    num_blocks = tl.cdiv(vocab_size, BLOCK_SIZE)
+    for work_id in tl.range(pid, num_rows * num_blocks, NUM_SMS):
+        row_id = work_id // num_blocks
+        block_offset = (work_id % num_blocks) * BLOCK_SIZE
+        batch_id = row_id if indices_ptr is None else tl.load(indices_ptr + row_id)
+        offsets = block_offset + tl.arange(0, BLOCK_SIZE)
+        bitmask_offsets = block_offset // 32 + tl.arange(0, BLOCK_SIZE // 32)
+        vocab_mask = offsets < vocab_size
+        packed_bitmask_mask = bitmask_offsets < bitmask_strides
+        packed_bitmask = tl.load(
+            bitmask_ptr + batch_id * bitmask_strides + bitmask_offsets,
+            packed_bitmask_mask,
+        )
+        bitmask = ((packed_bitmask[:, None] >> (tl.arange(0, 32)[None, :])) & 1) == 0
+        bitmask = bitmask.reshape(BLOCK_SIZE)
+
+        tl.store(
+            logits_ptr + batch_id * logits_strides + offsets,
+            -float("inf"),
+            vocab_mask & bitmask,
+        )
+
+
+def apply_token_bitmask_inplace_triton(
+    logits: torch.Tensor,
+    bitmask: torch.Tensor,
+    indices: Optional[Union[List[int], torch.Tensor]] = None,
+):
+    NUM_SMS = get_device_core_count()
+    BLOCK_SIZE = 4096
+    BITS_PER_BLOCK = 32
+
+    # Check input dtype
+    assert bitmask.dtype == torch.int32, "bitmask must be of type int32"
+
+    # Check input tensor shapes.
+    logits_shape = logits.shape
+    bitmask_shape = bitmask.shape
+    if logits.ndim == 1:
+        logits_shape = (1, logits_shape[0])
+    if bitmask.ndim == 1:
+        bitmask_shape = (1, bitmask_shape[0])
+
+    required_bitmask_width = (logits_shape[1] + BITS_PER_BLOCK - 1) // BITS_PER_BLOCK
+    assert required_bitmask_width >= bitmask_shape[1], (
+        f"Bitmask width too large: allow at most {required_bitmask_width} int32s for "
+        f"logits' width {logits_shape[1]}, but got {bitmask_shape[1]}"
+    )
+
+    vocab_size = min(logits_shape[1], bitmask_shape[1] * BITS_PER_BLOCK)
+
+    num_rows = None
+    if isinstance(indices, list) or isinstance(indices, torch.Tensor):
+        indices = torch.tensor(indices, dtype=torch.int32, device=logits.device)
+        num_rows = indices.shape[0]
+    else:
+        assert (
+            logits_shape[0] == bitmask_shape[0]
+        ), f"batch size mismatch: logits {logits_shape[0]} vs bitmask {bitmask_shape[0]}"
+        num_rows = logits_shape[0]
+
+    if NUM_SMS > 0:
+        grid = (NUM_SMS,)
+    else:
+        num_blocks = triton.cdiv(vocab_size, BLOCK_SIZE)
+        grid = (num_rows * num_blocks,)
+        NUM_SMS = triton.next_power_of_2(grid[0])
+
+    apply_token_bitmask_inplace_kernel[grid](
+        logits,
+        bitmask,
+        indices,
+        num_rows,
+        vocab_size,
+        logits_shape[1],
+        bitmask_shape[1],
+        NUM_SMS,
+        BLOCK_SIZE,
+        num_warps=BLOCK_SIZE // 32 // (16 // logits.element_size()),
+        num_stages=3,
+    )
diff --git a/python/sglang/srt/constrained/xgrammar_backend.py b/python/sglang/srt/constrained/xgrammar_backend.py
new file mode 100644
index 000000000..7b101df4f
--- /dev/null
+++ b/python/sglang/srt/constrained/xgrammar_backend.py
@@ -0,0 +1,239 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constrained decoding with xgrammar backend."""
+
+import json
+import logging
+from typing import List, Optional, Tuple, Union
+
+import torch
+from xgrammar import (
+    CompiledGrammar,
+    GrammarCompiler,
+    GrammarMatcher,
+    StructuralTagItem,
+    TokenizerInfo,
+    allocate_token_bitmask,
+)
+
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    BaseGrammarBackend,
+    BaseGrammarObject,
+)
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+if _is_hip:
+    from sgl_kernel import apply_token_bitmask_inplace_cuda
+else:
+    from sglang.srt.constrained.triton_ops.bitmask_ops import (
+        apply_token_bitmask_inplace_triton,
+    )
+logger = logging.getLogger(__name__)
+
+
+MAX_ROLLBACK_TOKENS = 200
+
+
+class XGrammarGrammar(BaseGrammarObject):
+
+    def __init__(
+        self,
+        matcher: GrammarMatcher,
+        vocab_size: int,
+        ctx: CompiledGrammar,
+        override_stop_tokens: Optional[Union[List[int], int]],
+        key_string: Optional[str] = None,  # TODO (sk): for debugging, remove later
+    ) -> None:
+        self.matcher = matcher
+        self.vocab_size = vocab_size
+        self.ctx = ctx
+        self.override_stop_tokens = override_stop_tokens
+        self.finished = False
+        self.accepted_tokens = []
+        self.key_string = key_string
+
+    def accept_token(self, token: int):
+        if not self.is_terminated():
+            accepted = self.matcher.accept_token(token)
+            if not accepted:
+                # log for debugging
+                raise ValueError(
+                    f"Tokens not accepted: {token}\n"
+                    f"Accepted tokens: {self.accepted_tokens}\n"
+                    f"Key string: {self.key_string}"
+                )
+            else:
+                self.accepted_tokens.append(token)
+
+    def rollback(self, k: int):
+        self.matcher.rollback(k)
+        self.accepted_tokens = self.accepted_tokens[:-k]
+
+    def is_terminated(self):
+        return self.matcher.is_terminated()
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        return allocate_token_bitmask(batch_size, vocab_size)
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        self.matcher.fill_next_token_bitmask(vocab_mask, idx)
+
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        return vocab_mask.to(device, non_blocking=True)
+
+    def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        if logits.device.type == "cuda":
+            if _is_hip:
+                apply_token_bitmask_inplace_cuda(logits, vocab_mask)
+            else:
+                apply_token_bitmask_inplace_triton(logits, vocab_mask)
+        elif logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
+            self.apply_vocab_mask_cpu(logits, vocab_mask)
+        else:
+            raise RuntimeError(f"Unsupported device: {logits.device.type}")
+
+    def copy(self):
+        matcher = GrammarMatcher(
+            self.ctx,
+            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
+            override_stop_tokens=self.override_stop_tokens,
+        )
+        return XGrammarGrammar(
+            matcher,
+            self.vocab_size,
+            self.ctx,
+            self.override_stop_tokens,
+            self.key_string,
+        )
+
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
+        s = self.matcher.find_jump_forward_string()
+        if s:
+            return [], s
+        return None
+
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        _, data = helper
+        return data, -1
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        k = 0
+        for i, old_id in enumerate(old_output_ids):
+            if old_id == new_output_ids[i]:
+                k = i + 1
+            else:
+                break
+
+        # rollback to the last token that is the same
+        if k < len(old_output_ids):
+            self.matcher.rollback(len(old_output_ids) - k)
+
+        for i in range(k, len(new_output_ids)):
+            assert self.matcher.accept_token(new_output_ids[i])
+
+    def __repr__(self):
+        return f"XGrammarGrammar({self.key_string=}, {self.accepted_tokens=})"
+
+
+class XGrammarGrammarBackend(BaseGrammarBackend):
+    def __init__(
+        self,
+        tokenizer,
+        vocab_size: int,
+        model_eos_token_ids: Optional[List[int]] = None,
+    ):
+        super().__init__()
+
+        if hasattr(tokenizer, "init_xgrammar"):
+            # For special tokenizer
+            tokenizer_info, override_stop_tokens = tokenizer.init_xgrammar()
+        else:
+            # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
+            # This ensures consistency between what the model considers EOS and what XGrammar uses
+            tokenizer_info = TokenizerInfo.from_huggingface(
+                tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
+            )
+            override_stop_tokens = None
+
+        self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
+        self.vocab_size = vocab_size
+        self.override_stop_tokens = override_stop_tokens
+
+    def _from_context(self, ctx: CompiledGrammar, key_string: str) -> XGrammarGrammar:
+        matcher = GrammarMatcher(
+            ctx,
+            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
+            override_stop_tokens=self.override_stop_tokens,
+        )
+        return XGrammarGrammar(
+            matcher, self.vocab_size, ctx, self.override_stop_tokens, key_string
+        )
+
+    def dispatch_json(self, key_string: str) -> Optional[XGrammarGrammar]:
+        try:
+            if key_string == "$$ANY$$":
+                # Note: This builtin JSON grammar includes *all* valid JSON (including, for example, arrays at the root)
+                ctx = self.grammar_compiler.compile_builtin_json_grammar()
+            else:
+                ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
+
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+        return self._from_context(ctx, key_string)
+
+    def dispatch_ebnf(self, key_string: str) -> Optional[XGrammarGrammar]:
+        try:
+            ctx = self.grammar_compiler.compile_grammar(key_string)
+        except RuntimeError as e:
+            logging.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+        return self._from_context(ctx, key_string)
+
+    def dispatch_regex(self, key_string: str) -> Optional[XGrammarGrammar]:
+        try:
+            ctx = self.grammar_compiler.compile_regex(key_string)
+        except RuntimeError as e:
+            logging.error(f"Hit invalid regex: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+        return self._from_context(ctx, key_string)
+
+    def dispatch_structural_tag(self, key_string: str) -> Optional[XGrammarGrammar]:
+        try:
+            structural_tag = json.loads(key_string)
+            tags = [
+                StructuralTagItem(
+                    begin=structure["begin"],
+                    schema=json.dumps(structure["schema"]),
+                    end=structure["end"],
+                )
+                for structure in structural_tag["structures"]
+            ]
+            ctx = self.grammar_compiler.compile_structural_tag(
+                tags, structural_tag["triggers"]
+            )
+        except (RuntimeError, json.decoder.JSONDecodeError) as e:
+            logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+            return INVALID_GRAMMAR_OBJ
+        return self._from_context(ctx, key_string)
+
+    def reset(self):
+        self.grammar_compiler.clear_cache()
diff --git a/python/sglang/srt/custom_op.py b/python/sglang/srt/custom_op.py
new file mode 100644
index 000000000..ea3c06e6d
--- /dev/null
+++ b/python/sglang/srt/custom_op.py
@@ -0,0 +1,102 @@
+from torch import nn
+
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    is_xpu,
+)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_npu = is_npu()
+_is_xpu = is_xpu()
+
+
+class CustomOp(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._forward_method = self.dispatch_forward()
+
+        # States for torch.compile
+        self._original_forward_method = None
+        self.is_torch_compile = False
+
+    def enter_torch_compile(self, num_tokens: int):
+        # Skip if Op is already entered compile mode.
+        # NOTE(alcanderian): Some Ops(for example RotaryEmbedding) will be reused
+        # among layers and `enter_torch_compile` will be called many times.
+        # We should prevent `self._original_forward_method` from being overridden when
+        # it is not the first time `enter_torch_compile` called.
+        if self.is_torch_compile:
+            return
+
+        self._original_forward_method = self._forward_method
+        # NOTE: Temporarily workaround MoE
+        # The performance of torch.compile on this layer is not always good when bs > 1,
+        # so we decide to only use torch.compile when bs=1
+        if "FusedMoE" in self.__class__.__name__:
+            if num_tokens == 1:
+                from sglang.srt.layers.moe.fused_moe_native import (
+                    fused_moe_forward_native,
+                )
+
+                self._forward_method = fused_moe_forward_native
+        elif "TopK" in self.__class__.__name__:
+            if num_tokens == 1:
+                self._forward_method = self.forward_native
+        else:
+            self._forward_method = self.forward_native
+        self.is_torch_compile = True
+
+    def leave_torch_compile(self):
+        # Skip if Op is already exited compile mode.
+        if not self.is_torch_compile:
+            return
+
+        self._forward_method = self._original_forward_method
+        self._original_forward_method = None
+        self.is_torch_compile = False
+
+    # Please do not override this method, because `self._forward_method` can change when in torch compile mode
+    def forward(self, *args, **kwargs):
+        return self._forward_method(*args, **kwargs)
+
+    def forward_native(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_cuda(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_npu(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_hip(self, *args, **kwargs):
+        return self.forward_cuda(*args, **kwargs)
+
+    def forward_xpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward_hpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward_cpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def dispatch_forward(self):
+        if _is_cuda:
+            return self.forward_cuda
+        elif _is_hip:
+            return self.forward_hip
+        elif _is_cpu and _is_cpu_amx_available:
+            return self.forward_cpu
+        elif _is_npu:
+            return self.forward_npu
+        elif _is_xpu:
+            return self.forward_xpu
+        else:
+            return self.forward_native
diff --git a/python/sglang/srt/debug_utils/__init__.py b/python/sglang/srt/debug_utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/srt/debug_utils/dump_comparator.py b/python/sglang/srt/debug_utils/dump_comparator.py
new file mode 100644
index 000000000..aca9c3b7a
--- /dev/null
+++ b/python/sglang/srt/debug_utils/dump_comparator.py
@@ -0,0 +1,168 @@
+import argparse
+import functools
+from pathlib import Path
+
+import polars as pl
+import torch
+
+from sglang.srt.debug_utils.dump_loader import find_row, read_meta
+from sglang.srt.debug_utils.dumper import get_truncated_value
+
+
+def main(args):
+    df_target = read_meta(args.target_path)
+    df_target = df_target.sort("rank", "dump_index")
+    df_target = df_target.filter(
+        (pl.col("forward_pass_id") >= args.start_id)
+        & (pl.col("forward_pass_id") <= args.end_id)
+    )
+    assert all(
+        c in df_target.columns
+        for c in ["rank", "forward_pass_id", "dump_index", "name"]
+    )
+
+    df_baseline = read_meta(args.baseline_path)
+    print("df_target", df_target)
+    print("df_baseline", df_baseline)
+
+    for row in df_target.iter_rows(named=True):
+        path_target = Path(args.target_path) / row["filename"]
+
+        row_baseline = find_row(
+            df_baseline,
+            conditions=dict(
+                forward_pass_id=row["forward_pass_id"]
+                - args.start_id
+                + args.baseline_start_id,
+                **{
+                    k: v
+                    for k, v in row.items()
+                    if k not in ["forward_pass_id", "dump_index", "filename"]
+                },
+            ),
+        )
+
+        if row_baseline is None:
+            print(f"Skip: target={str(path_target)} since no baseline")
+            x_target = _load_object(path_target)
+            if x_target is not None:
+                print(f"x_target(sample)={get_truncated_value(x_target)}")
+            continue
+
+        path_baseline = Path(args.baseline_path) / row_baseline["filename"]
+        print(f"Check: target={str(path_target)} baseline={str(path_baseline)}")
+        check_tensor_pair(
+            path_baseline=path_baseline, path_target=path_target, name=row["name"]
+        )
+        print()
+
+
+def check_tensor_pair(path_baseline, path_target, name=""):
+    x_baseline = _load_object(path_baseline)
+    x_target = _load_object(path_target)
+
+    print(
+        f"Raw "
+        f"[shape] {x_baseline.shape} vs {x_target.shape}\t"
+        f"[dtype] {x_baseline.dtype} vs {x_target.dtype}"
+    )
+
+    x_baseline, x_target = _comparison_preprocessor(x_baseline, x_target, name=name)
+    x_baseline = _try_unify_shape(x_baseline, target_shape=x_target.shape)
+
+    print(
+        f"After preprocessor "
+        f"[shape] {x_baseline.shape} vs {x_target.shape}\t"
+        f"[dtype] {x_baseline.dtype} vs {x_target.dtype}"
+    )
+
+    x_target = x_target.float()
+    x_baseline = x_baseline.float()
+
+    for name, fn in (
+        ("mean", torch.mean),
+        ("std", torch.std),
+        ("min", torch.min),
+        ("max", torch.max),
+        ("p1", functools.partial(torch.quantile, q=0.01)),
+        ("p5", functools.partial(torch.quantile, q=0.05)),
+        ("p95", functools.partial(torch.quantile, q=0.95)),
+        ("p99", functools.partial(torch.quantile, q=0.99)),
+    ):
+        value_baseline = fn(x_baseline).item()
+        value_target = fn(x_target).item()
+        print(
+            f"[{name}] {value_baseline :.4f} vs {value_target:.4f} (diff: {value_target - value_baseline:.4f})"
+        )
+
+    if x_baseline.shape != x_target.shape:
+        print(f"⚠️ Shape mismatch")
+        return
+
+    raw_abs_diff = (x_target - x_baseline).abs()
+
+    max_abs_diff = raw_abs_diff.max().item()
+    mean_abs_diff = raw_abs_diff.mean().item()
+    rel_diff = _calc_rel_diff(x_target, x_baseline)
+
+    needs_print = max_abs_diff > 1e-3
+
+    print(
+        "\t".join(
+            f"{'❌' if value > 1e-3 else '✅'} {name}={value}"
+            for name, value in [
+                ("rel_diff", rel_diff),
+                ("max_abs_diff", max_abs_diff),
+                ("mean_abs_diff", mean_abs_diff),
+            ]
+        )
+    )
+
+    if needs_print:
+        print(f"x_baseline(sample)={get_truncated_value(x_baseline)}")
+        print(f"x_target(sample)={get_truncated_value(x_target)}")
+
+
+def _try_unify_shape(x: torch.Tensor, target_shape):
+    x_shape = x.shape
+    num_dim_to_remove = len(x_shape) - len(target_shape)
+    if (x_shape[num_dim_to_remove:] == target_shape) and all(
+        val == 1 for val in x_shape[:num_dim_to_remove]
+    ):
+        out = functools.reduce(lambda a, _: a.squeeze(0), range(num_dim_to_remove), x)
+        print(f"Unify shape: {x_shape} -> {out.shape} (to match {target_shape})")
+        return out
+
+    return x
+
+
+# Copied from DeepGEMM
+def _calc_rel_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
+
+
+def _comparison_preprocessor(x_baseline, x_target, name):
+    # can insert arbitrary adhoc postprocessing logic here
+    return x_baseline, x_target
+
+
+def _load_object(path):
+    x = torch.load(path, weights_only=False)
+    if not isinstance(x, torch.Tensor):
+        print(f"Skip load {path} since {type(x)=} is not a Tensor")
+        return None
+    return x.cuda()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline-path", type=str)
+    parser.add_argument("--target-path", type=str)
+    parser.add_argument("--start-id", type=int, default=0)
+    parser.add_argument("--end-id", type=int, default=1000000)
+    parser.add_argument("--baseline-start-id", type=int, default=0)
+    args = parser.parse_args()
+    main(args)
diff --git a/python/sglang/srt/debug_utils/dump_loader.py b/python/sglang/srt/debug_utils/dump_loader.py
new file mode 100644
index 000000000..8e6f2c79b
--- /dev/null
+++ b/python/sglang/srt/debug_utils/dump_loader.py
@@ -0,0 +1,97 @@
+import functools
+import os
+from pathlib import Path
+from typing import Any, Dict
+
+import polars as pl
+import torch
+
+
+class DumpLoader:
+    def __init__(self):
+        directory = os.environ.get("SGLANG_DUMP_LOADER_DIR")
+
+        self._enable = directory is not None
+        if self._enable:
+            self._directory = Path(directory)
+            self._df = read_meta(directory)
+
+    @property
+    def enable(self):
+        return self._enable
+
+    def load(self, name, **kwargs):
+        assert self._enable, "Please call DumpLoader.load only when it is enabled"
+
+        from sglang.srt.debug_utils.dumper import dumper
+
+        forward_pass_id = dumper._forward_pass_id
+        conditions = dict(name=name, forward_pass_id=forward_pass_id, **kwargs)
+        row = find_row(self._df, conditions=conditions)
+        assert (
+            row is not None
+        ), f"DumpLoader cannot find row given query {name=} {kwargs=} {self._directory=}"
+
+        path = self._directory / row["filename"]
+        output = torch.load(path, weights_only=False)
+
+        print(
+            f"[DumpLoader] load from {path=} (query: {name=} {kwargs=}, output: {type(output)})"
+        )
+        return output
+
+
+def read_meta(directory):
+    directory = Path(directory)
+    assert directory.is_dir(), f"{directory=} should be a directory"
+
+    rows = []
+    for p in directory.glob("*.pt"):
+        full_kwargs = {}
+        for kv in p.stem.split("___"):
+            k, v = kv.split("=")
+            full_kwargs[k] = v
+        rows.append(
+            {
+                "filename": str(p.name),
+                **full_kwargs,
+            }
+        )
+
+    df = pl.DataFrame(rows)
+    df = df.with_columns(
+        pl.col("forward_pass_id").cast(int),
+        pl.col("rank").cast(int),
+        pl.col("dump_index").cast(int),
+    )
+    return df
+
+
+def find_row(df, conditions: Dict[str, Any]):
+    df_sub = df.filter(
+        functools.reduce(
+            lambda a, b: a & b,
+            [
+                pl.col(col) == _cast_to_polars_dtype(conditions[col], df.schema[col])
+                for col in conditions.keys()
+            ],
+        )
+    )
+    assert len(df_sub) <= 1
+    return df_sub.to_dicts()[0] if len(df_sub) > 0 else None
+
+
+def _cast_to_polars_dtype(value, target_dtype):
+    if target_dtype in (pl.Int64, pl.Int32, pl.UInt64, pl.UInt32):
+        return int(value)
+    elif target_dtype in (pl.Float64, pl.Float32):
+        return float(value)
+    elif target_dtype == pl.Boolean:
+        return bool(value)
+    elif target_dtype == pl.String:
+        return str(value)
+    else:
+        return value
+
+
+dump_loader = DumpLoader()
diff --git a/python/sglang/srt/debug_utils/dumper.py b/python/sglang/srt/debug_utils/dumper.py
new file mode 100644
index 000000000..8a9808bb7
--- /dev/null
+++ b/python/sglang/srt/debug_utils/dumper.py
@@ -0,0 +1,116 @@
+import os
+import time
+from pathlib import Path
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+
+class _Dumper:
+    """Utility to dump tensors, which can be useful when comparison checking models.
+
+    Example usage:
+    dumper.on_forward_pass_start()
+    dumper.dump("layer_start__hidden_states", hidden_states, layer_id=self.layer_id)
+
+    Import from non-SGLang system:
+    ```
+    import sys
+    sys.path.append("/YOUR_PATH/sglang/python/sglang/srt/debug_utils")
+    from dumper import dumper
+    ```
+
+    Related: `sglang.srt.debug_utils.dump_comparator` for dump comparison
+    """
+
+    def __init__(self):
+        # Do not import `sglang` to make this file standalone
+        self._enable = bool(int(os.environ.get("SGLANG_DUMPER_ENABLE", "1")))
+        self._base_dir = Path(os.environ.get("SGLANG_DUMPER_DIR", "/tmp"))
+        self._enable_write_file = bool(
+            int(os.environ.get("SGLANG_DUMPER_WRITE_FILE", "1"))
+        )
+        self._partial_name: Optional[str] = None
+        self._dump_index = 0
+        self._forward_pass_id = 0
+
+    def on_forward_pass_start(self):
+        self._forward_pass_id += 1
+        print(
+            f"[Dumper] [{time.time()}] on_forward_pass_start id={self._forward_pass_id}"
+        )
+
+    def dump(self, name, value, **kwargs):
+        if not self._enable:
+            return
+
+        assert (
+            self._forward_pass_id >= 1
+        ), "Do you forget to call `dumper.on_forward_pass_start()`?"
+        self._dump_index += 1
+
+        if self._partial_name is None:
+            self._partial_name = _get_partial_name()
+
+        rank = _get_rank()
+        full_kwargs = dict(
+            forward_pass_id=self._forward_pass_id,
+            rank=rank,
+            name=name,
+            dump_index=self._dump_index,
+            **kwargs,
+        )
+        full_filename = "___".join(f"{k}={v}" for k, v in full_kwargs.items()) + ".pt"
+        path = self._base_dir / f"sglang_dump_{self._partial_name}" / full_filename
+
+        sample_value = get_truncated_value(value)
+
+        print(
+            f"[Dumper] [{rank}, {time.time()}] {path} "
+            f"type={type(value)} "
+            f"shape={value.shape if isinstance(value, torch.Tensor) else None} "
+            f"dtype={value.dtype if isinstance(value, torch.Tensor) else None} "
+            f"sample_value={sample_value}"
+        )
+
+        if self._enable_write_file:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            torch.save(value, str(path))
+
+
+def _get_partial_name():
+    rank = _get_rank()
+    object_list = [str(time.time()) if rank == 0 else None]
+    if dist.is_initialized():
+        dist.broadcast_object_list(object_list, device="cuda")
+    return object_list[0]
+
+
+def _get_rank():
+    if dist.is_initialized():
+        return dist.get_rank()
+    else:
+        return 0
+
+
+def get_truncated_value(value):
+    if value is None:
+        return None
+
+    if isinstance(value, tuple):
+        return [get_truncated_value(x) for x in value]
+
+    if not isinstance(value, torch.Tensor):
+        return None
+
+    if value.numel() < 200:
+        return value
+
+    slices = [
+        slice(0, 5) if dim_size > 200 else slice(None) for dim_size in value.shape
+    ]
+    return value[tuple(slices)]
+
+
+dumper = _Dumper()
diff --git a/python/sglang/srt/debug_utils/text_comparator.py b/python/sglang/srt/debug_utils/text_comparator.py
new file mode 100644
index 000000000..3a6df19b9
--- /dev/null
+++ b/python/sglang/srt/debug_utils/text_comparator.py
@@ -0,0 +1,234 @@
+import argparse
+import hashlib
+import json
+from pathlib import Path
+
+import polars as pl
+
+_DESCRIPTION = """Compare and find differences to benchmark outputs.
+
+Supported inputs:
+* The samples jsonl from `lm_eval --log_samples --output_path FOLDER_NAME`
+* The output from `gsm8k/bench_sglang.py --raw-result-file FILE_NAME` (or mmlu)
+"""
+
+
+def main(args):
+    if args.data_type == "simple_evals":
+        df_input = _compute_df_input_mode_simple_evals(args)
+    else:
+        df_input = _transform_df_input(_compute_df_raw(args))
+
+    assert all(
+        c in df_input.columns
+        for c in ["category", "trial_index", "prompt_id", "prompt", "output", "correct"]
+    )
+
+    df_meta = _compute_df_meta(df_input)
+
+    df_correctness_per_trial = df_input.group_by(
+        "category", "trial_index", maintain_order=True
+    ).agg(pl.col("correct").mean())
+    df_correctness_delta = (
+        df_meta.group_by("correctness_delta").len().sort("correctness_delta")
+    )
+    df_good_to_bad = df_meta.filter(pl.col("correctness_delta") < 0)
+    df_bad_to_good = df_meta.filter(pl.col("correctness_delta") > 0)
+
+    print(f"Dump output to {args.output_path}")
+    Path(args.output_path).write_text(
+        json.dumps(
+            dict(
+                df_meta=df_meta.to_dicts(),
+                df_good_to_bad=df_good_to_bad.to_dicts(),
+                df_bad_to_good=df_bad_to_good.to_dicts(),
+            ),
+            indent=4,
+        ),
+    )
+
+    if not args.disable_print_details:
+        with pl.Config(
+            fmt_str_lengths=10000,
+            tbl_cols=-1,
+            tbl_rows=-1,
+            tbl_width_chars=-1,
+            tbl_formatting="UTF8_FULL",
+        ):
+            print("====== Correctness per trial ======")
+            print(df_correctness_per_trial)
+
+            print(
+                "====== Correctness Delta (-1.0 means all-right becomes all-wrong) ======"
+            )
+            print(df_correctness_delta)
+
+            for name, df in [
+                ("Good->Bad", df_good_to_bad),
+                ("Bad->Good", df_bad_to_good),
+            ]:
+                print(f"====== Concrete Examples: {name} ======")
+                print(df)
+
+
+def _compute_df_input_mode_simple_evals(args):
+    return pl.concat(
+        [
+            _compute_df_input_one_mode_simple_evals(**info)
+            for info in _get_file_infos(args=args)
+        ]
+    )
+
+
+def _compute_df_input_one_mode_simple_evals(path, category, trial_index):
+    data = json.loads(Path(path).read_text())
+    rows = []
+
+    for single_eval_result in data["metadata"]["single_eval_results"]:
+        prompt = single_eval_result["example_level_metadata"][
+            "actual_queried_prompt_messages"
+        ]
+        score = single_eval_result["score"]
+        assert score in {0.0, 1.0}, f"{score=}"
+
+        row = dict(
+            category=category,
+            trial_index=trial_index,
+            prompt_id=_compute_id_from_object(prompt),
+            prompt=json.dumps(prompt),
+            output=single_eval_result["example_level_metadata"]["response_text"],
+            correct=score == 1.0,
+        )
+        rows.append(row)
+
+    return pl.DataFrame(rows)
+
+
+def _compute_id_from_object(obj):
+    if isinstance(obj, pl.Series):
+        obj = obj.to_list()
+    json_str = json.dumps(obj, sort_keys=True, ensure_ascii=False)
+    return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
+
+def _compute_df_raw(args):
+    return pl.concat(
+        [
+            _read_df_raw(
+                path=info["path"],
+                category=info["category"],
+                trial_index=info["trial_index"],
+            )
+            for info in _get_file_infos(args=args)
+        ]
+    )
+
+
+def _get_file_infos(args):
+    return [
+        dict(path=path, category=category, trial_index=trial_index)
+        for category, paths in [
+            ("baseline", args.baseline_path),
+            ("target", args.target_path),
+        ]
+        for trial_index, path in enumerate(paths)
+    ]
+
+
+def _read_df_raw(path: str, category: str, trial_index: int):
+    return pl.read_ndjson(path).with_columns(
+        category=pl.lit(category), trial_index=trial_index
+    )
+
+
+def _transform_df_input(df: pl.DataFrame):
+    if "doc_id" in df.columns:
+        print("Transform mode: lm_eval")
+
+        filter_names = df["filter"].unique(maintain_order=True).to_list()
+        if len(filter_names) > 1:
+            filter_name = filter_names[0]
+            print(f"Choose {filter_name=} among {filter_names}")
+            df = df.filter(pl.col("filter") == filter_name)
+
+        df = df.select(
+            pl.col("category"),
+            pl.col("trial_index"),
+            prompt_id=pl.col("doc_id"),
+            prompt=pl.col("arguments").struct.field("gen_args_0").struct.field("arg_0"),
+            output=pl.col("resps").list.get(0).list.get(0),
+            correct=pl.col("exact_match").cast(bool),
+        )
+
+        return df
+    elif "prompt_id" in df.columns:
+        print("Transform mode: SGLang bench")
+        return df
+    else:
+        raise Exception(
+            f"Unknown data: {df.columns}. You may need to set `--data-type` if using e.g. simple_evals."
+        )
+
+
+def _compute_df_meta(df_input: pl.DataFrame):
+    df_input = df_input.sort("prompt_id", "category", "trial_index")
+    df_meta = pl.DataFrame(
+        [
+            _handle_one_prompt(df_one_prompt)
+            for df_one_prompt in df_input.partition_by("prompt_id", maintain_order=True)
+        ]
+    )
+    df_meta = df_meta.with_columns(
+        correctness_delta=pl.col("correctness_target") - pl.col("correctness_baseline"),
+    )
+    df_meta = df_meta.sort("correctness_delta", "output_same_prefix_len")
+    return df_meta
+
+
+def _handle_one_prompt(df_one_prompt: pl.DataFrame):
+    assert (
+        len(set(_compute_id_from_object(obj) for obj in df_one_prompt["prompt"])) == 1
+    )
+
+    df_baseline = df_one_prompt.filter(pl.col("category") == "baseline")
+    df_target = df_one_prompt.filter(pl.col("category") == "target")
+
+    outputs_baseline = df_baseline["output"].to_list()
+    outputs_target = df_target["output"].to_list()
+
+    output_same_prefix_len = max(
+        _compute_str_prefix_len(output_baseline, output_target)
+        for output_baseline in outputs_baseline
+        for output_target in outputs_target
+    )
+
+    return dict(
+        prompt_id=df_one_prompt[0, "prompt_id"],
+        correctness_baseline=df_baseline["correct"].mean(),
+        correctness_target=df_target["correct"].mean(),
+        output_same_prefix_len=output_same_prefix_len,
+        prompt=df_one_prompt[0, "prompt"],
+        outputs_baseline=outputs_baseline,
+        outputs_target=outputs_target,
+    )
+
+
+def _compute_str_prefix_len(a: str, b: str) -> int:
+    min_len = min(len(a), len(b))
+    for i in range(min_len):
+        if a[i] != b[i]:
+            return i
+    return min_len
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=_DESCRIPTION)
+    parser.add_argument("--data-type", type=str, default="auto")
+    parser.add_argument("--baseline-path", type=str, nargs="+")
+    parser.add_argument("--target-path", type=str, nargs="+")
+    parser.add_argument(
+        "--output-path", type=str, default="/tmp/text_comparator_output.json"
+    )
+    parser.add_argument("--disable-print-details", action="store_true")
+    args = parser.parse_args()
+    main(args)
diff --git a/python/sglang/srt/disaggregation/ascend/__init__.py b/python/sglang/srt/disaggregation/ascend/__init__.py
new file mode 100644
index 000000000..2550f91a4
--- /dev/null
+++ b/python/sglang/srt/disaggregation/ascend/__init__.py
@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.ascend.conn import (
+    AscendKVBootstrapServer,
+    AscendKVManager,
+    AscendKVReceiver,
+    AscendKVSender,
+)
diff --git a/python/sglang/srt/disaggregation/ascend/conn.py b/python/sglang/srt/disaggregation/ascend/conn.py
new file mode 100644
index 000000000..b0009fc7c
--- /dev/null
+++ b/python/sglang/srt/disaggregation/ascend/conn.py
@@ -0,0 +1,117 @@
+import concurrent.futures
+import logging
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
+from sglang.srt.disaggregation.common.utils import group_concurrent_contiguous
+from sglang.srt.disaggregation.mooncake.conn import (
+    MooncakeKVBootstrapServer,
+    MooncakeKVManager,
+    MooncakeKVReceiver,
+    MooncakeKVSender,
+)
+from sglang.srt.utils import get_local_ip_by_remote
+
+logger = logging.getLogger(__name__)
+
+
+class AscendKVManager(MooncakeKVManager):
+    def init_engine(self):
+        # TransferEngine initialized on ascend.
+        local_ip = get_local_ip_by_remote()
+        self.engine = AscendTransferEngine(
+            hostname=local_ip,
+            npu_id=self.kv_args.gpu_id,
+            disaggregation_mode=self.disaggregation_mode,
+        )
+
+    def register_buffer_to_engine(self):
+        self.engine.batch_register(self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens)
+        # The Ascend backend optimize batch registration for small memory blocks.
+        self.engine.batch_register(
+            self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+        )
+
+    def send_kvcache(
+        self,
+        mooncake_session_id: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ):
+        # Group by indices
+        prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
+            prefill_kv_indices, dst_kv_indices
+        )
+
+        num_layers = len(self.kv_args.kv_data_ptrs)
+        layers_params = [
+            (
+                self.kv_args.kv_data_ptrs[layer_id],
+                dst_kv_ptrs[layer_id],
+                self.kv_args.kv_item_lens[layer_id],
+            )
+            for layer_id in range(num_layers)
+        ]
+
+        def set_transfer_blocks(
+            src_ptr: int, dst_ptr: int, item_len: int
+        ) -> List[Tuple[int, int, int]]:
+            transfer_blocks = []
+            for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
+                src_addr = src_ptr + int(prefill_index[0]) * item_len
+                dst_addr = dst_ptr + int(decode_index[0]) * item_len
+                length = item_len * len(prefill_index)
+                transfer_blocks.append((src_addr, dst_addr, length))
+            return transfer_blocks
+
+        # Worker function for processing a single layer
+        def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
+            transfer_blocks = set_transfer_blocks(src_ptr, dst_ptr, item_len)
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        # Worker function for processing all layers in a batch
+        def process_layers(layers_params: List[Tuple[int, int, int]]) -> int:
+            transfer_blocks = []
+            for src_ptr, dst_ptr, item_len in layers_params:
+                transfer_blocks.extend(set_transfer_blocks(src_ptr, dst_ptr, item_len))
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        if self.enable_custom_mem_pool:
+            futures = [
+                executor.submit(
+                    process_layer,
+                    src_ptr,
+                    dst_ptr,
+                    item_len,
+                )
+                for (src_ptr, dst_ptr, item_len) in layers_params
+            ]
+            for future in concurrent.futures.as_completed(futures):
+                status = future.result()
+                if status != 0:
+                    for f in futures:
+                        f.cancel()
+                    return status
+        else:
+            # Combining all layers' params in one batch transfer is more efficient
+            # compared to using multiple threads
+            return process_layers(layers_params)
+
+        return 0
+
+
+class AscendKVSender(MooncakeKVSender):
+    pass
+
+
+class AscendKVReceiver(MooncakeKVReceiver):
+    pass
+
+
+class AscendKVBootstrapServer(MooncakeKVBootstrapServer):
+    pass
diff --git a/python/sglang/srt/disaggregation/ascend/transfer_engine.py b/python/sglang/srt/disaggregation/ascend/transfer_engine.py
new file mode 100644
index 000000000..0ccffffd6
--- /dev/null
+++ b/python/sglang/srt/disaggregation/ascend/transfer_engine.py
@@ -0,0 +1,58 @@
+import logging
+import os
+from typing import List, Optional
+
+from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
+from sglang.srt.disaggregation.utils import DisaggregationMode
+
+logger = logging.getLogger(__name__)
+
+
+class AscendTransferEngine(MooncakeTransferEngine):
+
+    def __init__(
+        self, hostname: str, npu_id: int, disaggregation_mode: DisaggregationMode
+    ):
+        try:
+            from mf_adapter import TransferEngine
+        except ImportError as e:
+            raise ImportError(
+                "Please install mf_adapter, for details, see docs/backend/pd_disaggregation.md"
+            ) from e
+
+        self.engine = TransferEngine()
+        self.hostname = hostname
+        self.npu_id = npu_id
+
+        # Centralized storage address of the AscendTransferEngine
+        self.store_url = os.getenv("ASCEND_MF_STORE_URL")
+        if disaggregation_mode == DisaggregationMode.PREFILL:
+            self.role = "Prefill"
+        elif disaggregation_mode == DisaggregationMode.DECODE:
+            self.role = "Decode"
+        else:
+            logger.error(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+            raise ValueError(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+        self.session_id = f"{self.hostname}:{self.engine.get_rpc_port()}"
+        self.initialize()
+
+    def initialize(self) -> None:
+        """Initialize the ascend transfer instance."""
+        ret_value = self.engine.initialize(
+            self.store_url,
+            self.session_id,
+            self.role,
+            self.npu_id,
+        )
+        if ret_value != 0:
+            logger.error("Ascend Transfer Engine initialization failed.")
+            raise RuntimeError("Ascend Transfer Engine initialization failed.")
+
+    def batch_register(self, ptrs: List[int], lengths: List[int]):
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark register as failed
+            ret_value = -1
+        if ret_value != 0:
+            logger.debug(f"Ascend memory registration for ptr {ptrs} failed.")
diff --git a/python/sglang/srt/disaggregation/base/__init__.py b/python/sglang/srt/disaggregation/base/__init__.py
new file mode 100644
index 000000000..ef0f797fc
--- /dev/null
+++ b/python/sglang/srt/disaggregation/base/__init__.py
@@ -0,0 +1,8 @@
+from sglang.srt.disaggregation.base.conn import (
+    BaseKVBootstrapServer,
+    BaseKVManager,
+    BaseKVReceiver,
+    BaseKVSender,
+    KVArgs,
+    KVPoll,
+)
diff --git a/python/sglang/srt/disaggregation/base/conn.py b/python/sglang/srt/disaggregation/base/conn.py
new file mode 100644
index 000000000..3f5877ea3
--- /dev/null
+++ b/python/sglang/srt/disaggregation/base/conn.py
@@ -0,0 +1,134 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+import numpy.typing as npt
+
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.disaggregation.utils import DisaggregationMode
+
+
+class KVArgs:
+    engine_rank: int
+    kv_data_ptrs: List[int]
+    kv_data_lens: List[int]
+    kv_item_lens: List[int]
+    aux_data_ptrs: List[int]
+    aux_data_lens: List[int]
+    aux_item_lens: List[int]
+    ib_device: str
+    ib_traffic_class: str
+    gpu_id: int
+    # for different tp
+    decode_tp_size: int
+    kv_head_num: int
+    page_size: int
+    # for pp prefill
+    prefill_pp_size: int
+    pp_rank: int
+    prefill_start_layer: int
+    # for system dp
+    system_dp_rank: int
+
+
+class KVPoll:
+    Failed = 0
+    Bootstrapping = 1
+    WaitingForInput = 2
+    Transferring = 3
+    Success = 4
+
+
+class BaseKVManager(ABC):
+    """Base class for managing transfers states"""
+
+    @abstractmethod
+    def __init__(
+        self,
+        args: KVArgs,
+        disaggregation_mode: DisaggregationMode,
+        server_args: ServerArgs,
+        is_mla_backend: Optional[bool] = False,
+    ): ...
+
+
+class BaseKVSender(ABC):
+
+    @abstractmethod
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ): ...
+
+    @abstractmethod
+    def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
+        """
+        Notify the decoder server about the kv indices length and aux index
+        """
+        ...
+
+    @abstractmethod
+    def send(self, kv_indices: npt.NDArray[np.int32]):
+        """
+        Send the kv cache at the given kv indices to the decoder server
+        """
+        ...
+
+    @abstractmethod
+    def poll(self) -> KVPoll:
+        """
+        Check the status of the kv cache transfer
+        """
+        ...
+
+    @abstractmethod
+    def failure_exception(self):
+        """
+        Raise an exception if the kv cache transfer fails
+        """
+        ...
+
+
+class BaseKVReceiver(ABC):
+
+    @abstractmethod
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+    ): ...
+
+    @abstractmethod
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
+        """
+        Notify the prefill server about the kv indices and aux index
+        """
+        ...
+
+    @abstractmethod
+    def poll(self) -> KVPoll:
+        """
+        Check the status of the kv cache transfer
+        """
+        ...
+
+    @abstractmethod
+    def failure_exception(self):
+        """
+        Raise an exception if the kv cache transfer fails
+        """
+        ...
+
+
+class BaseKVBootstrapServer(ABC):
+    @abstractmethod
+    def __init__(self, host: str, port: int): ...
diff --git a/python/sglang/srt/disaggregation/common/__init__.py b/python/sglang/srt/disaggregation/common/__init__.py
new file mode 100644
index 000000000..8294c3892
--- /dev/null
+++ b/python/sglang/srt/disaggregation/common/__init__.py
@@ -0,0 +1,5 @@
+from sglang.srt.disaggregation.common.conn import (
+    CommonKVBootstrapServer,
+    CommonKVManager,
+    CommonKVReceiver,
+)
diff --git a/python/sglang/srt/disaggregation/common/conn.py b/python/sglang/srt/disaggregation/common/conn.py
new file mode 100644
index 000000000..10b6093b9
--- /dev/null
+++ b/python/sglang/srt/disaggregation/common/conn.py
@@ -0,0 +1,438 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import socket
+import threading
+from functools import cache
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import numpy.typing as npt
+import requests
+import zmq
+from aiohttp import web
+
+from sglang.srt.disaggregation.base.conn import (
+    BaseKVBootstrapServer,
+    BaseKVManager,
+    BaseKVReceiver,
+    BaseKVSender,
+    KVArgs,
+    KVPoll,
+)
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import (
+    format_tcp_address,
+    get_free_port,
+    get_ip,
+    get_local_ip_by_remote,
+    is_valid_ipv6_address,
+    maybe_wrap_ipv6_address,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class CommonKVManager(BaseKVManager):
+    def __init__(
+        self,
+        args: KVArgs,
+        disaggregation_mode: DisaggregationMode,
+        server_args: ServerArgs,
+        is_mla_backend: Optional[bool] = False,
+    ):
+        self.kv_args = args
+        self.is_mla_backend = is_mla_backend
+        self.disaggregation_mode = disaggregation_mode
+        # for p/d multi node infer
+        self.bootstrap_host = server_args.host
+        self.bootstrap_port = server_args.disaggregation_bootstrap_port
+        self.dist_init_addr = server_args.dist_init_addr
+        self.tp_size = server_args.tp_size
+        self.dp_size = server_args.dp_size
+        self.enable_dp_attention = server_args.enable_dp_attention
+        if not server_args.enable_dp_attention and server_args.dp_size != 1:
+            raise ValueError(
+                "If dp_attention is not enabled, dp size must be 1 in disaggregation mode."
+            )
+
+        self.rank_port = get_free_port()
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self._register_to_bootstrap()
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
+            self.prefill_tp_size_table: Dict[str, int] = {}
+            self.prefill_dp_size_table: Dict[str, int] = {}
+        else:
+            raise ValueError(
+                f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
+            )
+
+    def _register_to_bootstrap(self):
+        """Register KVSender to bootstrap server via HTTP POST."""
+        if self.dist_init_addr:
+            # multi node: bootstrap server's host is dist_init_addr
+            if self.dist_init_addr.startswith("["):  # [ipv6]:port or [ipv6]
+                if self.dist_init_addr.endswith("]"):
+                    host = self.dist_init_addr
+                else:
+                    host, _ = self.dist_init_addr.rsplit(":", 1)
+            else:
+                host = socket.gethostbyname(self.dist_init_addr.rsplit(":", 1)[0])
+        else:
+            # single node: bootstrap server's host is same as http server's host
+            host = self.bootstrap_host
+            host = maybe_wrap_ipv6_address(host)
+
+        bootstrap_server_url = f"{host}:{self.bootstrap_port}"
+        url = f"http://{bootstrap_server_url}/route"
+        payload = {
+            "role": "Prefill",
+            "tp_size": self.tp_size,
+            "dp_size": self.dp_size,
+            "rank_ip": get_local_ip_by_remote(),
+            "rank_port": self.rank_port,
+            "engine_rank": self.kv_args.engine_rank,
+        }
+
+        try:
+            response = requests.put(url, json=payload)
+            if response.status_code == 200:
+                logger.debug("Prefill successfully registered to bootstrap server.")
+            else:
+                logger.error(
+                    f"Prefill Failed to connect to bootstrap server: {response.status_code}, {response.text}"
+                )
+        except Exception as e:
+            logger.error(f"Prefill Failed to register to bootstrap server: {e}")
+
+    @cache
+    def _connect(self, endpoint: str, is_ipv6: bool = False):
+        socket = zmq.Context().socket(zmq.PUSH)
+        if is_ipv6:
+            socket.setsockopt(zmq.IPV6, 1)
+        socket.connect(endpoint)
+        return socket
+
+
+class CommonKVReceiver(BaseKVReceiver):
+    _ctx = zmq.Context()
+    _socket_cache = {}
+    _socket_locks = {}
+    _global_lock = threading.Lock()
+
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+        prefill_dp_rank: Optional[int] = None,
+    ):
+        self.bootstrap_room = bootstrap_room
+        self.bootstrap_addr = bootstrap_addr
+        self.kv_mgr = mgr
+
+        if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
+            self.prefill_tp_size, self.prefill_dp_size = (
+                self._get_prefill_dp_size_from_server()
+            )
+            if self.prefill_tp_size is None or self.prefill_dp_size is None:
+                logger.error(
+                    f"Could not fetch prefill parallel info for bootstrap_addr: {self.bootstrap_addr}"
+                )
+            else:
+                self.kv_mgr.prefill_tp_size_table[self.bootstrap_addr] = (
+                    self.prefill_tp_size
+                )
+                self.kv_mgr.prefill_dp_size_table[self.bootstrap_addr] = (
+                    self.prefill_dp_size
+                )
+        else:
+            self.prefill_tp_size = self.kv_mgr.prefill_tp_size_table[
+                self.bootstrap_addr
+            ]
+            self.prefill_dp_size = self.kv_mgr.prefill_dp_size_table[
+                self.bootstrap_addr
+            ]
+
+        # Currently, we don't allow prefill instance and decode instance to
+        # have different TP sizes per DP rank, except for models using MLA.
+        local_tp_size_per_dp_rank = self.kv_mgr.tp_size // self.kv_mgr.dp_size
+        prefill_tp_size_per_dp_rank = self.prefill_tp_size // self.prefill_dp_size
+        if local_tp_size_per_dp_rank == prefill_tp_size_per_dp_rank:
+            self.target_tp_rank = (
+                self.kv_mgr.kv_args.engine_rank % local_tp_size_per_dp_rank
+            )
+            self.required_dst_info_num = 1
+            self.target_tp_ranks = [self.target_tp_rank]
+        elif local_tp_size_per_dp_rank > prefill_tp_size_per_dp_rank:
+            self.target_tp_rank = (
+                self.kv_mgr.kv_args.engine_rank % local_tp_size_per_dp_rank
+            ) // (local_tp_size_per_dp_rank // prefill_tp_size_per_dp_rank)
+            self.required_dst_info_num = (
+                local_tp_size_per_dp_rank // prefill_tp_size_per_dp_rank
+            )
+            self.target_tp_ranks = [self.target_tp_rank]
+        else:
+            assert (
+                self.kv_mgr.is_mla_backend
+            ), "PD with different TP sizes per DP rank is not yet supported for non-MLA models"
+
+            # For non-MLA models, one decode rank needs to retrieve KVCache from multiple prefill ranks for non MLA models;
+            self.target_tp_ranks = [
+                rank
+                for rank in range(
+                    (self.kv_mgr.kv_args.engine_rank % local_tp_size_per_dp_rank)
+                    * (prefill_tp_size_per_dp_rank // local_tp_size_per_dp_rank),
+                    (self.kv_mgr.kv_args.engine_rank % local_tp_size_per_dp_rank + 1)
+                    * (prefill_tp_size_per_dp_rank // local_tp_size_per_dp_rank),
+                )
+            ]
+
+            # For MLA models, we can retrieve KVCache from only one prefill rank, but we still need to maintain
+            # multiple connections in the connection pool and have to send dummy requests to other prefill ranks,
+            # or the KVPoll will never be set correctly
+            self.target_tp_rank = self.target_tp_ranks[0]
+            self.required_dst_info_num = 1
+
+        if prefill_dp_rank is not None:
+            logger.debug(f"Targeting DP rank: {prefill_dp_rank}")
+            self.prefill_dp_rank = prefill_dp_rank
+        else:
+            self.prefill_dp_rank = bootstrap_room % self.prefill_dp_size
+
+        # FIXME: alias here: target_dp_group -> prefill_dp_rank
+        self.target_dp_group = self.prefill_dp_rank
+
+        # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
+        bootstrap_key = (
+            f"{self.bootstrap_addr}_{self.target_dp_group}_{self.target_tp_rank}"
+        )
+
+        if bootstrap_key not in self.kv_mgr.connection_pool:
+            bootstrap_infos = []
+            for target_tp_rank in self.target_tp_ranks:
+                bootstrap_info = self._get_bootstrap_info_from_server(
+                    target_tp_rank,
+                    self.target_dp_group,
+                )
+                if bootstrap_info is not None:
+                    # NOTE: only support MLA for now: select one prefill rank as real rank
+                    bootstrap_info["is_dummy"] = not bool(
+                        target_tp_rank == self.target_tp_rank
+                        or self.target_tp_rank is None
+                    )
+                    bootstrap_infos.append(bootstrap_info)
+                else:
+                    logger.error(
+                        f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group}"
+                    )
+            self.bootstrap_infos = bootstrap_infos
+
+            if len(self.bootstrap_infos) == 0:
+                logger.error(
+                    f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank}"
+                )
+            else:
+                self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
+                # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
+                self._register_kv_args()
+        else:
+            self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]
+
+        assert len(self.bootstrap_infos) > 0
+
+    def _get_bootstrap_info_from_server(self, engine_rank, target_dp_group):
+        """Fetch the bootstrap info from the bootstrap server."""
+        try:
+            url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}"
+            response = requests.get(url)
+            if response.status_code == 200:
+                bootstrap_info = response.json()
+                return bootstrap_info
+            else:
+                logger.error(
+                    f"Failed to get prefill server info: {response.status_code}, {response.text}"
+                )
+                return None
+        except Exception as e:
+            logger.error(f"Error fetching prefill info from bootstrap: {e}")
+            return None
+
+    def _get_prefill_dp_size_from_server(self) -> int:
+        """Fetch the prefill parallel info from the bootstrap server."""
+        try:
+            url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}"
+            response = requests.get(url)
+            if response.status_code == 200:
+                prefill_parallel_info = response.json()
+                return int(prefill_parallel_info["prefill_tp_size"]), int(
+                    prefill_parallel_info["prefill_dp_size"]
+                )
+            else:
+                logger.error(
+                    f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
+                )
+                return None
+        except Exception as e:
+            logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
+            return None
+
+    @classmethod
+    def _connect(cls, endpoint: str, is_ipv6: bool = False):
+        with cls._global_lock:
+            if endpoint not in cls._socket_cache:
+                sock = cls._ctx.socket(zmq.PUSH)
+                if is_ipv6:
+                    sock.setsockopt(zmq.IPV6, 1)
+                sock.connect(endpoint)
+                cls._socket_cache[endpoint] = sock
+                cls._socket_locks[endpoint] = threading.Lock()
+            return cls._socket_cache[endpoint], cls._socket_locks[endpoint]
+
+    @classmethod
+    def _connect_to_bootstrap_server(cls, bootstrap_info: dict):
+        ip_address = bootstrap_info["rank_ip"]
+        port = bootstrap_info["rank_port"]
+        is_ipv6_address = is_valid_ipv6_address(ip_address)
+        sock, lock = cls._connect(
+            format_tcp_address(ip_address, port), is_ipv6=is_ipv6_address
+        )
+        return sock, lock
+
+    def _register_kv_args(self):
+        pass
+
+    def failure_exception(self):
+        raise Exception("Fake KVReceiver Exception")
+
+
+class CommonKVBootstrapServer(BaseKVBootstrapServer):
+    def __init__(self, host: str, port: int):
+        self.host = host
+        self.port = port
+        self.app = web.Application()
+        self.store = dict()
+        self.lock = asyncio.Lock()
+        self._setup_routes()
+        self.tp_size = None
+        self.dp_size = None
+        self.tp_size_per_dp_rank = None
+        self.prefill_port_table: Dict[int, Dict[int, Dict[str, Union[str, int]]]] = {}
+
+        # Start bootstrap server
+        self.thread = threading.Thread(target=self._run_server, daemon=True)
+        self.run()
+
+    def run(self):
+        self.thread.start()
+
+    def _setup_routes(self):
+        self.app.router.add_route("*", "/route", self._handle_route)
+
+    async def _handle_route(self, request: web.Request):
+        method = request.method
+        if method == "PUT":
+            return await self._handle_route_put(request)
+        elif method == "GET":
+            return await self._handle_route_get(request)
+        else:
+            return web.Response(
+                text="Method not allowed", status=405, content_type="application/json"
+            )
+
+    async def _handle_route_put(self, request: web.Request):
+        data = await request.json()
+        role = data["role"]
+        tp_size = data["tp_size"]
+        dp_size = data["dp_size"]
+        rank_ip = data["rank_ip"]
+        rank_port = int(data["rank_port"])
+        engine_rank = int(data["engine_rank"])
+
+        if self.tp_size is None:
+            self.tp_size = tp_size
+
+        if self.dp_size is None:
+            self.dp_size = dp_size
+
+        tp_size_per_dp_rank = tp_size // dp_size
+        if self.tp_size_per_dp_rank == None:
+            self.tp_size_per_dp_rank = tp_size_per_dp_rank
+
+        # Add lock to make sure thread-safe
+        if role == "Prefill":
+            dp_group = engine_rank // tp_size_per_dp_rank
+            tp_rank_in_dp_group = engine_rank % tp_size_per_dp_rank
+
+            async with self.lock:
+                if dp_group not in self.prefill_port_table:
+                    self.prefill_port_table[dp_group] = {}
+
+            self.prefill_port_table[dp_group][tp_rank_in_dp_group] = {
+                "rank_ip": rank_ip,
+                "rank_port": rank_port,
+            }
+            logger.debug(
+                f"Register Prefill bootstrap: {engine_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
+            )
+
+        return web.Response(text="OK", status=200)
+
+    async def _handle_route_get(self, request: web.Request):
+        engine_rank = request.query.get("engine_rank")
+        target_dp_group = request.query.get("target_dp_group")
+        if not engine_rank or not target_dp_group:
+            return web.Response(text="Missing inputs for bootstrap server.", status=400)
+
+        # Currently we use engine_rank == -1 and target_dp_group == -1 to sync dp size
+        if int(engine_rank) == -1 and int(target_dp_group) == -1:
+            prefill_parallel_info = {
+                "prefill_tp_size": self.tp_size,
+                "prefill_dp_size": self.dp_size,
+            }
+            return web.json_response(prefill_parallel_info, status=200)
+
+        # Find corresponding prefill info
+        async with self.lock:
+            bootstrap_info = self.prefill_port_table[int(target_dp_group)][
+                int(engine_rank)
+            ]
+
+        if bootstrap_info is not None:
+            return web.json_response(bootstrap_info, status=200)
+        else:
+            return web.Response(text="Bootstrap info not Found", status=404)
+
+    def _run_server(self):
+        try:
+            # Event Loop
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+
+            self._runner = web.AppRunner(self.app)
+            self._loop.run_until_complete(self._runner.setup())
+
+            site = web.TCPSite(self._runner, host=self.host, port=self.port)
+            self._loop.run_until_complete(site.start())
+            self._loop.run_forever()
+        except Exception as e:
+            logger.error(f"Server error: {str(e)}")
+        finally:
+            # Cleanup
+            self._loop.run_until_complete(self._runner.cleanup())
+            self._loop.close()
+
+    def close(self):
+        """Shutdown"""
+        if self._loop is not None and self._loop.is_running():
+            self._loop.call_soon_threadsafe(self._loop.stop)
+            logger.info("Stopping server loop...")
+
+        if self.thread.is_alive():
+            self.thread.join(timeout=2)
+            logger.info("Server thread stopped")
+
+    def poll(self) -> KVPoll: ...
diff --git a/python/sglang/srt/disaggregation/common/utils.py b/python/sglang/srt/disaggregation/common/utils.py
new file mode 100644
index 000000000..6f3da2128
--- /dev/null
+++ b/python/sglang/srt/disaggregation/common/utils.py
@@ -0,0 +1,42 @@
+import threading
+from collections import deque
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+
+class FastQueue:
+    def __init__(self):
+        self._buf = deque()
+        self._cond = threading.Condition()
+
+    def put(self, item):
+        with self._cond:
+            self._buf.append(item)
+            # wake up a thread of wait()
+            self._cond.notify()
+
+    def get(self):
+        with self._cond:
+            # if queue is empty  ,block until is notified()
+            while not self._buf:
+                self._cond.wait()
+            return self._buf.popleft()
+
+
+def group_concurrent_contiguous(
+    src_indices: npt.NDArray[np.int32], dst_indices: npt.NDArray[np.int32]
+) -> Tuple[List[npt.NDArray[np.int32]], List[npt.NDArray[np.int32]]]:
+    """Vectorised NumPy implementation."""
+    if src_indices.size == 0:
+        return [], []
+
+    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+    src_groups = np.split(src_indices, brk)
+    dst_groups = np.split(dst_indices, brk)
+
+    src_groups = [g.tolist() for g in src_groups]
+    dst_groups = [g.tolist() for g in dst_groups]
+
+    return src_groups, dst_groups
diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
new file mode 100644
index 000000000..b79c8ca87
--- /dev/null
+++ b/python/sglang/srt/disaggregation/decode.py
@@ -0,0 +1,894 @@
+"""
+Life cycle of a request in the decode server
+
+1. PreallocQueue:
+    a. Initialize a receiver for each request
+    b. The request handshakes first, and pre-allocate kv once there is available kv.
+    c. Move the request to TransferQueue.
+
+2. TransferQueue:
+    a. Poll the receiver to check the transfer state
+    b. If the transfer has finished, move the request to waiting queue
+
+3. WaitingQueue:
+    a. Use the requests in the queue to construct a PrebuiltExtendBatch
+    b. Skip the prefill forward but only populate metadata
+
+4. RunningBatch:
+    a. Merge the resolved PrebuiltExtendBatch into running batch to run decoding
+"""
+
+from __future__ import annotations
+
+import logging
+from collections import deque
+from dataclasses import dataclass
+from http import HTTPStatus
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
+
+import torch
+from torch.distributed import ProcessGroup
+
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE
+from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVPoll
+from sglang.srt.disaggregation.utils import (
+    FAKE_BOOTSTRAP_HOST,
+    DisaggregationMode,
+    KVClassType,
+    MetadataBuffers,
+    ReqToMetadataIdxAllocator,
+    TransferBackend,
+    get_kv_class,
+    is_mla_backend,
+    kv_to_page_indices,
+    poll_and_all_reduce,
+    prepare_abort,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.managers.schedule_batch import FINISH_ABORT, ScheduleBatch
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+from sglang.srt.mem_cache.memory_pool import KVCache, ReqToTokenPool
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils import get_int_env_var, require_mlp_sync
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.managers.scheduler import Scheduler
+
+CLIP_MAX_NEW_TOKEN = get_int_env_var("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", 4096)
+
+
+class DecodeReqToTokenPool:
+    """
+    The difference of DecodeReqToTokenPool and ReqToTokenPool is that
+    DecodeReqToTokenPool subscribes memory for pre-allocated requests.
+
+    In ReqToTokenPool, if `--max-running-requests` is 8,
+    #pre-allocated + #transfer + #running <= 8, but there are in fact more memory can carry pre-allocated requests.
+
+    In DecodeReqToTokenPool, if `--max-running-requests` is 8,
+    #running <= 8, #pre-allocated + #transfer <= pre_alloc_size, so we can use the free memory to pre-allocate requests to unblock prefill.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        max_context_len: int,
+        device: str,
+        enable_memory_saver: bool,
+        pre_alloc_size: int,
+    ):
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=enable_memory_saver
+        )
+
+        self.size = size
+        self.max_context_len = max_context_len
+        self.device = device
+        self.pre_alloc_size = pre_alloc_size
+        with memory_saver_adapter.region(tag=GPU_MEMORY_TYPE_KV_CACHE):
+            self.req_to_token = torch.zeros(
+                (size + pre_alloc_size, max_context_len),
+                dtype=torch.int32,
+                device=device,
+            )
+
+        self.free_slots = list(range(size + pre_alloc_size))
+
+    def write(self, indices, values):
+        self.req_to_token[indices] = values
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def alloc(self, need_size: int) -> List[int]:
+        if need_size > len(self.free_slots):
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+        return select_index
+
+    def free(self, free_index: Union[int, List[int]]):
+        if isinstance(free_index, (int,)):
+            self.free_slots.append(free_index)
+        else:
+            self.free_slots.extend(free_index)
+
+    def clear(self):
+        self.free_slots = list(range(self.size + self.pre_alloc_size))
+
+
+@dataclass
+class DecodeRequest:
+    req: Req
+    kv_receiver: BaseKVReceiver
+    waiting_for_input: bool = False
+    metadata_buffer_index: int = -1
+
+
+class DecodePreallocQueue:
+    """
+    Store the requests that are preallocating.
+    """
+
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        draft_token_to_kv_pool: Optional[KVCache],
+        req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
+        metadata_buffers: MetadataBuffers,
+        scheduler: Scheduler,
+        transfer_queue: DecodeTransferQueue,
+        tree_cache: BasePrefixCache,
+        gloo_group: ProcessGroup,
+        tp_rank: int,
+        tp_size: int,
+        dp_size: int,
+        gpu_id: int,
+        bootstrap_port: int,
+        max_total_num_tokens: int,
+        prefill_pp_size: int,
+        num_reserved_decode_tokens: int,
+        transfer_backend: TransferBackend,
+    ):
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.token_to_kv_pool = token_to_kv_pool_allocator.get_kvcache()
+        self.draft_token_to_kv_pool = draft_token_to_kv_pool
+        self.is_mla_backend = is_mla_backend(self.token_to_kv_pool)
+        self.metadata_buffers = metadata_buffers
+        self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
+        self.scheduler = scheduler
+        self.transfer_queue = transfer_queue
+        self.tree_cache = tree_cache  # this is always a chunk cache
+        self.gloo_group = gloo_group
+        self.tp_rank = tp_rank
+        self.tp_size = tp_size
+        self.dp_size = dp_size
+        self.gpu_id = gpu_id
+        self.bootstrap_port = bootstrap_port
+        self.max_total_num_tokens = max_total_num_tokens
+        self.prefill_pp_size = prefill_pp_size
+        self.num_reserved_decode_tokens = num_reserved_decode_tokens
+        self.transfer_backend = transfer_backend
+        # Queue for requests pending pre-allocation
+        self.queue: List[DecodeRequest] = []
+        self.retracted_queue: List[Req] = []
+        self.prefill_pp_size = prefill_pp_size
+        self.kv_manager = self._init_kv_manager()
+
+    def _init_kv_manager(self) -> BaseKVManager:
+        kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
+        kv_args = kv_args_class()
+
+        attn_tp_size = get_attention_tp_size()
+        kv_args.engine_rank = self.tp_rank % (attn_tp_size)
+
+        kv_args.decode_tp_size = attn_tp_size
+        # Note(shangming): pp is not supported on the decode side yet, so its rank is fixed to 0
+        kv_args.pp_rank = 0
+        kv_args.system_dp_rank = self.scheduler.dp_rank
+        kv_args.prefill_pp_size = self.prefill_pp_size
+        kv_data_ptrs, kv_data_lens, kv_item_lens = (
+            self.token_to_kv_pool.get_contiguous_buf_infos()
+        )
+        if self.draft_token_to_kv_pool is not None:
+            # We should also transfer draft model kv cache. The indices are
+            # always shared with a target model.
+            draft_kv_data_ptrs, draft_kv_data_lens, draft_kv_item_lens = (
+                self.draft_token_to_kv_pool.get_contiguous_buf_infos()
+            )
+            kv_data_ptrs += draft_kv_data_ptrs
+            kv_data_lens += draft_kv_data_lens
+            kv_item_lens += draft_kv_item_lens
+
+        kv_args.kv_data_ptrs = kv_data_ptrs
+        kv_args.kv_data_lens = kv_data_lens
+        kv_args.kv_item_lens = kv_item_lens
+
+        kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
+            self.metadata_buffers.get_buf_infos()
+        )
+
+        kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
+        kv_args.gpu_id = self.scheduler.gpu_id
+        kv_manager_class: Type[BaseKVManager] = get_kv_class(
+            self.transfer_backend, KVClassType.MANAGER
+        )
+        kv_manager: BaseKVManager = kv_manager_class(
+            kv_args,
+            DisaggregationMode.DECODE,
+            self.scheduler.server_args,
+            self.is_mla_backend,
+        )
+        return kv_manager
+
+    def add(self, req: Req, is_retracted: bool = False) -> None:
+        """Add a request to the pending queue."""
+        if self._check_if_req_exceed_kv_capacity(req):
+            return
+
+        if is_retracted:
+            self.retracted_queue.append(req)
+        else:
+            if req.bootstrap_host == FAKE_BOOTSTRAP_HOST:
+                kv_receiver_class = get_kv_class(
+                    TransferBackend.FAKE, KVClassType.RECEIVER
+                )
+            else:
+                kv_receiver_class = get_kv_class(
+                    self.transfer_backend, KVClassType.RECEIVER
+                )
+
+            kv_receiver = kv_receiver_class(
+                mgr=self.kv_manager,
+                bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
+                bootstrap_room=req.bootstrap_room,
+                prefill_dp_rank=req.data_parallel_rank,
+            )
+
+            self.queue.append(
+                DecodeRequest(req=req, kv_receiver=kv_receiver, waiting_for_input=False)
+            )
+
+    def _check_if_req_exceed_kv_capacity(self, req: Req) -> bool:
+        if len(req.origin_input_ids) > self.max_total_num_tokens:
+            message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
+            logger.error(message)
+            prepare_abort(req, message, status_code=HTTPStatus.BAD_REQUEST)
+            self.scheduler.stream_output([req], req.return_logprob)
+            return True
+        return False
+
+    def extend(self, reqs: List[Req], is_retracted: bool = False) -> None:
+        """Add a request to the pending queue."""
+        for req in reqs:
+            self.add(req, is_retracted=is_retracted)
+
+    def resume_retracted_reqs(self) -> List[Req]:
+        # TODO refactor the scheduling part, reuse with the unified engine logic as much as possible
+
+        # allocate memory
+        resumed_reqs = []
+        indices_to_remove = set()
+        allocatable_tokens = self._allocatable_tokens(count_retracted=False)
+
+        for i, req in enumerate(self.retracted_queue):
+            if self.req_to_token_pool.available_size() <= 0:
+                break
+
+            required_tokens_for_request = (
+                len(req.origin_input_ids)
+                + len(req.output_ids)
+                + self.num_reserved_decode_tokens
+            )
+            if required_tokens_for_request > allocatable_tokens:
+                break
+
+            resumed_reqs.append(req)
+            indices_to_remove.add(i)
+            req.is_retracted = False
+            self._pre_alloc(req)
+            allocatable_tokens -= required_tokens_for_request
+
+            # load from cpu, release the cpu copy
+            req.load_kv_cache(self.req_to_token_pool, self.token_to_kv_pool_allocator)
+
+        self.retracted_queue = [
+            entry
+            for i, entry in enumerate(self.retracted_queue)
+            if i not in indices_to_remove
+        ]
+
+        return resumed_reqs
+
+    def _update_handshake_waiters(self) -> None:
+        if not self.queue:
+            return
+
+        if all(decode_req.waiting_for_input for decode_req in self.queue):
+            return
+
+        polls = poll_and_all_reduce(
+            [decode_req.kv_receiver for decode_req in self.queue], self.gloo_group
+        )
+
+        for i, (decode_req, poll) in enumerate(zip(self.queue, polls)):
+            if poll == KVPoll.Bootstrapping:
+                pass
+            elif poll == KVPoll.WaitingForInput:
+                decode_req.waiting_for_input = True
+            elif poll == KVPoll.Failed:
+                error_message = f"Decode handshake failed for request rank={self.tp_rank} {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
+                try:
+                    decode_req.kv_receiver.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.error(error_message)
+                prepare_abort(
+                    decode_req.req,
+                    error_message,
+                    status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                )
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
+            else:
+                raise ValueError(f"Unexpected poll case: {poll}")
+
+    def pop_preallocated(self) -> List[DecodeRequest]:
+        """Pop the preallocated requests from the pending queue (FIFO)."""
+        self._update_handshake_waiters()
+
+        preallocated_reqs = []
+        indices_to_remove = set()
+
+        # We need to make sure that the sum of inflight tokens and allocatable tokens is greater than maximum input+output length of each inflight request
+        # Otherwise it is possible for one request running decode out of memory, while all other requests are in the transfer queue that cannot be retracted.
+        retractable_tokens = sum(
+            len(r.origin_input_ids) + len(r.output_ids)
+            for r in self.scheduler.running_batch.reqs
+        )
+        allocatable_tokens = self._allocatable_tokens(
+            retractable_tokens=retractable_tokens, count_retracted=True
+        )
+        # First, remove all failed requests from the queue
+        for i, decode_req in enumerate(self.queue):
+            if isinstance(decode_req.req.finished_reason, FINISH_ABORT):
+                self.scheduler.stream_output(
+                    [decode_req.req], decode_req.req.return_logprob
+                )
+                indices_to_remove.add(i)
+
+        # Then, preallocate the remaining requests if possible
+        for i, decode_req in enumerate(self.queue):
+            if i in indices_to_remove:
+                continue
+
+            if not decode_req.waiting_for_input:
+                continue
+
+            if self.req_to_token_pool.available_size() <= 0:
+                break
+
+            if self.req_to_metadata_buffer_idx_allocator.available_size() <= 0:
+                break
+
+            # Memory estimation: don't add if the projected memory cannot be met
+            # TODO: add new_token ratio
+            origin_input_len = len(decode_req.req.origin_input_ids)
+            required_tokens_for_request = (
+                origin_input_len + self.num_reserved_decode_tokens
+            )
+
+            if (
+                max(
+                    required_tokens_for_request,
+                    origin_input_len
+                    + min(
+                        decode_req.req.sampling_params.max_new_tokens,
+                        CLIP_MAX_NEW_TOKEN,
+                    )
+                    - retractable_tokens,
+                )
+                > allocatable_tokens
+            ):
+                break
+            if required_tokens_for_request > allocatable_tokens:
+                break
+
+            allocatable_tokens -= required_tokens_for_request
+            self._pre_alloc(decode_req.req)
+
+            kv_indices = (
+                self.req_to_token_pool.req_to_token[decode_req.req.req_pool_idx][
+                    : len(decode_req.req.origin_input_ids)
+                ]
+                .cpu()
+                .numpy()
+            )
+
+            decode_req.metadata_buffer_index = (
+                self.req_to_metadata_buffer_idx_allocator.alloc()
+            )
+            assert decode_req.metadata_buffer_index is not None
+            page_indices = kv_to_page_indices(
+                kv_indices, self.token_to_kv_pool_allocator.page_size
+            )
+            decode_req.kv_receiver.init(page_indices, decode_req.metadata_buffer_index)
+            preallocated_reqs.append(decode_req)
+            indices_to_remove.add(i)
+
+        self.queue = [
+            entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
+        ]
+
+        return preallocated_reqs
+
+    @property
+    def num_tokens_pre_allocated(self):
+        return sum(
+            len(decode_req.req.fill_ids) for decode_req in self.transfer_queue.queue
+        )
+
+    def _allocatable_tokens(
+        self, retractable_tokens: Optional[int] = None, count_retracted: bool = True
+    ) -> int:
+        need_space_for_single_req = (
+            max(
+                [
+                    min(x.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKEN)
+                    + len(x.origin_input_ids)
+                    - retractable_tokens
+                    for x in self.scheduler.running_batch.reqs
+                ]
+            )
+            if retractable_tokens is not None
+            and len(self.scheduler.running_batch.reqs) > 0
+            else 0
+        )
+
+        if self.scheduler.model_config.is_hybrid:
+            available_size = min(
+                self.token_to_kv_pool_allocator.full_available_size(),
+                self.token_to_kv_pool_allocator.swa_available_size(),
+            )
+        else:
+            available_size = self.token_to_kv_pool_allocator.available_size()
+
+        allocatable_tokens = available_size - max(
+            # preserve some space for future decode
+            self.num_reserved_decode_tokens
+            * (
+                len(self.scheduler.running_batch.reqs)
+                + len(self.transfer_queue.queue)
+                + len(self.scheduler.waiting_queue)
+            ),
+            # make sure each request can finish if reach max_tokens with all other requests retracted
+            need_space_for_single_req,
+        )
+
+        # Note: if the last fake extend just finishes, and we enter `pop_preallocated` immediately in the next iteration
+        #       the extend batch is not in any queue, so we need to explicitly add the tokens slots here
+        if (
+            self.scheduler.last_batch
+            and self.scheduler.last_batch.forward_mode.is_extend()
+        ):
+            allocatable_tokens -= self.num_reserved_decode_tokens * len(
+                self.scheduler.last_batch.reqs
+            )
+
+        if count_retracted:
+            allocatable_tokens -= sum(
+                [
+                    len(req.origin_input_ids)
+                    + len(req.output_ids)
+                    + self.num_reserved_decode_tokens
+                    for req in self.retracted_queue
+                ]
+            )
+        return allocatable_tokens
+
+    def _pre_alloc(self, req: Req) -> torch.Tensor:
+        """Pre-allocate the memory for req_to_token and token_kv_pool"""
+        req_pool_indices = self.req_to_token_pool.alloc(1)
+
+        assert (
+            req_pool_indices is not None
+        ), "req_pool_indices is full! There is a bug in memory estimation."
+
+        req.req_pool_idx = req_pool_indices[0]
+
+        if self.token_to_kv_pool_allocator.page_size == 1:
+            kv_loc = self.token_to_kv_pool_allocator.alloc(
+                len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
+            )
+        else:
+            num_tokens = len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
+            kv_loc = self.token_to_kv_pool_allocator.alloc_extend(
+                prefix_lens=torch.tensor(
+                    [0],
+                    dtype=torch.int64,
+                    device=self.token_to_kv_pool_allocator.device,
+                ),
+                seq_lens=torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int64,
+                    device=self.token_to_kv_pool_allocator.device,
+                ),
+                last_loc=torch.tensor(
+                    [-1],
+                    dtype=torch.int64,
+                    device=self.token_to_kv_pool_allocator.device,
+                ),
+                extend_num_tokens=num_tokens,
+            )
+
+        assert (
+            kv_loc is not None
+        ), "KV cache is full! There is a bug in memory estimation."
+
+        self.req_to_token_pool.write((req.req_pool_idx, slice(0, len(kv_loc))), kv_loc)
+
+        # populate metadata
+        req.fill_ids = req.origin_input_ids + req.output_ids
+        req.extend_input_len = len(req.origin_input_ids)
+
+        return kv_loc
+
+
+class DecodeTransferQueue:
+    """
+    Store the requests that is polling kv
+    """
+
+    def __init__(
+        self,
+        gloo_group: ProcessGroup,
+        req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
+        tp_rank: int,
+        metadata_buffers: MetadataBuffers,
+        scheduler: Scheduler,
+        tree_cache: BasePrefixCache,
+    ):
+        self.queue: List[DecodeRequest] = []
+        self.gloo_group = gloo_group
+        self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
+        self.tp_rank = tp_rank
+        self.metadata_buffers = metadata_buffers
+        self.scheduler = scheduler
+        self.tree_cache = tree_cache
+        self.spec_algorithm = scheduler.spec_algorithm
+
+    def add(self, decode_req: DecodeRequest) -> None:
+        self.queue.append(decode_req)
+
+    def extend(self, decode_reqs: List[DecodeRequest]) -> None:
+        self.queue.extend(decode_reqs)
+
+    def pop_transferred(self) -> List[Req]:
+        if not self.queue:
+            return []
+        polls = poll_and_all_reduce(
+            [decode_req.kv_receiver for decode_req in self.queue], self.gloo_group
+        )
+
+        transferred_reqs = []
+        indices_to_remove = set()
+        for i, (decode_req, poll) in enumerate(zip(self.queue, polls)):
+            if poll == KVPoll.Failed:
+                error_message = f"Decode transfer failed for request rank={self.tp_rank} {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
+                try:
+                    decode_req.kv_receiver.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.error(error_message)
+                prepare_abort(
+                    decode_req.req,
+                    error_message,
+                    status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                )
+                self.scheduler.stream_output(
+                    [decode_req.req], decode_req.req.return_logprob
+                )
+                # unlock the kv cache or it will have memory leak
+                self.tree_cache.cache_finished_req(decode_req.req)
+                indices_to_remove.add(i)
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_transfer_failed_reqs()
+                continue
+            elif poll == KVPoll.Success:
+
+                idx = decode_req.metadata_buffer_index
+                (
+                    output_id,
+                    output_token_logprobs_val,
+                    output_token_logprobs_idx,
+                    output_top_logprobs_val,
+                    output_top_logprobs_idx,
+                    output_hidden_states,
+                ) = self.metadata_buffers.get_buf(idx)
+
+                decode_req.req.output_ids.append(output_id[0].item())
+                if not self.spec_algorithm.is_none():
+                    decode_req.req.hidden_states_tensor = output_hidden_states
+                if decode_req.req.return_logprob:
+                    decode_req.req.output_token_logprobs_val.append(
+                        output_token_logprobs_val[0].item()
+                    )
+                    decode_req.req.output_token_logprobs_idx.append(
+                        output_token_logprobs_idx[0].item()
+                    )
+                    decode_req.req.output_top_logprobs_val.append(
+                        output_top_logprobs_val[
+                            : decode_req.req.top_logprobs_num
+                        ].tolist()
+                    )
+                    decode_req.req.output_top_logprobs_idx.append(
+                        output_top_logprobs_idx[
+                            : decode_req.req.top_logprobs_num
+                        ].tolist()
+                    )
+
+                if hasattr(decode_req.kv_receiver, "clear"):
+                    decode_req.kv_receiver.clear()
+
+                # special handling for sampling_params.max_new_tokens == 1
+                if decode_req.req.sampling_params.max_new_tokens == 1:
+                    # finish immediately
+                    decode_req.req.check_finished()
+                    self.scheduler.stream_output(
+                        [decode_req.req], decode_req.req.return_logprob
+                    )
+                    self.tree_cache.cache_finished_req(decode_req.req)
+                else:
+                    transferred_reqs.append(decode_req.req)
+
+                indices_to_remove.add(i)
+            elif poll in [
+                KVPoll.Bootstrapping,
+                KVPoll.WaitingForInput,
+                KVPoll.Transferring,
+            ]:
+                pass
+            else:
+                raise ValueError(f"Unexpected poll case: {poll}")
+
+        for i in indices_to_remove:
+            idx = self.queue[i].metadata_buffer_index
+            assert idx != -1
+            self.req_to_metadata_buffer_idx_allocator.free(idx)
+
+        self.queue = [
+            entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
+        ]
+
+        return transferred_reqs
+
+
+class SchedulerDisaggregationDecodeMixin:
+
+    @torch.no_grad()
+    def event_loop_normal_disagg_decode(self: Scheduler):
+        """A normal scheduler loop for decode worker in disaggregation mode."""
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            # polling and allocating kv cache
+            self.process_decode_queue()
+            batch = self.get_next_disagg_decode_batch_to_run()
+            self.cur_batch = batch
+
+            prepare_mlp_sync_flag = require_mlp_sync(self.server_args)
+
+            if batch:
+                # Generate fake extend output.
+                if batch.forward_mode.is_extend():
+                    # Note: Logprobs should be handled on the prefill engine.
+                    self.stream_output(
+                        batch.reqs, any(req.return_logprob for req in batch.reqs)
+                    )
+                    if prepare_mlp_sync_flag:
+                        self._prepare_idle_batch_and_run(None)
+                else:
+                    if prepare_mlp_sync_flag:
+                        self.prepare_mlp_sync_batch(batch)
+                    result = self.run_batch(batch)
+                    self.process_batch_result(batch, result)
+            elif prepare_mlp_sync_flag:
+                batch, _ = self._prepare_idle_batch_and_run(None)
+
+            if batch is None and (
+                len(self.waiting_queue)
+                + len(self.disagg_decode_transfer_queue.queue)
+                + len(self.disagg_decode_prealloc_queue.queue)
+                == 0
+            ):
+                self.self_check_during_idle()
+
+            self.last_batch = batch
+
+    @torch.no_grad()
+    def event_loop_overlap_disagg_decode(self: Scheduler):
+        result_queue = deque()
+        self.last_batch: Optional[ScheduleBatch] = None
+        self.last_batch_in_queue = False  # last batch is modified in-place, so we need another variable to track if it's extend
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            # polling and allocating kv cache
+            self.process_decode_queue()
+            batch = self.get_next_disagg_decode_batch_to_run()
+            self.cur_batch = batch
+            last_batch_in_queue = False
+
+            prepare_mlp_sync_flag = require_mlp_sync(self.server_args)
+
+            if batch:
+                # Generate fake extend output.
+                if batch.forward_mode.is_extend():
+                    # Note: Logprobs should be handled on the prefill engine.
+                    self.stream_output(
+                        batch.reqs, any(req.return_logprob for req in batch.reqs)
+                    )
+                    if prepare_mlp_sync_flag:
+                        batch_, result = self._prepare_idle_batch_and_run(
+                            None, delay_process=True
+                        )
+                        if batch_:
+                            result_queue.append((batch_.copy(), result))
+                            last_batch_in_queue = True
+                else:
+                    if prepare_mlp_sync_flag:
+                        self.prepare_mlp_sync_batch(batch)
+                    result = self.run_batch(batch)
+                    result_queue.append((batch.copy(), result))
+
+                    if (self.last_batch is None) or (not self.last_batch_in_queue):
+                        # Create a dummy first batch to start the pipeline for overlap schedule.
+                        # It is now used for triggering the sampling_info_done event.
+                        tmp_batch = ScheduleBatch(
+                            reqs=None,
+                            forward_mode=ForwardMode.DUMMY_FIRST,
+                            next_batch_sampling_info=self.tp_worker.cur_sampling_info,
+                        )
+                        self.set_next_batch_sampling_info_done(tmp_batch)
+                    last_batch_in_queue = True
+
+            elif prepare_mlp_sync_flag:
+                batch, result = self._prepare_idle_batch_and_run(
+                    None, delay_process=True
+                )
+                if batch:
+                    result_queue.append((batch.copy(), result))
+                    last_batch_in_queue = True
+
+            # Process the results of the previous batch but skip if the last batch is extend
+            if self.last_batch and self.last_batch_in_queue:
+                tmp_batch, tmp_result = result_queue.popleft()
+                tmp_batch.next_batch_sampling_info = (
+                    self.tp_worker.cur_sampling_info if batch else None
+                )
+                self.process_batch_result(tmp_batch, tmp_result)
+
+            if batch is None and (
+                len(self.waiting_queue)
+                + len(self.disagg_decode_transfer_queue.queue)
+                + len(self.disagg_decode_prealloc_queue.queue)
+                == 0
+            ):
+                self.self_check_during_idle()
+
+            self.last_batch = batch
+            self.last_batch_in_queue = last_batch_in_queue
+
+    def _prepare_idle_batch_and_run(self: Scheduler, batch, delay_process=False):
+        batch = self.prepare_mlp_sync_batch(batch)
+        result = None
+        if batch:
+            result = self.run_batch(batch)
+            if not delay_process:
+                self.process_batch_result(batch, result)
+        return batch, result
+
+    def get_next_disagg_decode_batch_to_run(
+        self: Scheduler,
+    ) -> Optional[Tuple[ScheduleBatch, bool]]:
+        """Create fake completed prefill if possible and merge with running batch"""
+        # Merge the prefill batch into the running batch
+        last_batch = self.last_batch
+        if last_batch and last_batch.forward_mode.is_extend():
+            # chunked prefill doesn't happen in decode instance.
+            assert self.chunked_req is None
+            # Filter finished batches.
+            last_batch.filter_batch()
+            if not last_batch.is_empty():
+                if self.running_batch.is_empty():
+                    self.running_batch = last_batch
+                else:
+                    # merge running_batch with prefill batch
+                    self.running_batch.merge_batch(last_batch)
+
+        new_prebuilt_batch = self.get_new_prebuilt_batch()
+
+        ret: Optional[ScheduleBatch] = None
+        if new_prebuilt_batch:
+            ret = new_prebuilt_batch
+        else:
+            if self.running_batch.is_empty():
+                ret = None
+            else:
+                self.running_batch = self.update_running_batch(self.running_batch)
+                ret = self.running_batch if not self.running_batch.is_empty() else None
+
+        return ret
+
+    def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
+        """Create a schedulebatch for fake completed prefill"""
+        if self.grammar_queue:
+            self.move_ready_grammar_requests()
+
+        if len(self.waiting_queue) == 0:
+            return None
+
+        curr_batch_size = self.running_batch.batch_size()
+
+        batch_size = min(self.req_to_token_pool.size, self.max_running_requests)
+
+        num_not_used_batch = batch_size - curr_batch_size
+
+        # pop req from waiting queue
+        can_run_list: List[Req] = []
+        waiting_queue: List[Req] = []
+
+        for i in range(len(self.waiting_queue)):
+            req = self.waiting_queue[i]
+            # we can only add at least `num_not_used_batch` new batch to the running queue
+            if i < num_not_used_batch:
+                can_run_list.append(req)
+                req.init_next_round_input(self.tree_cache)
+            else:
+                waiting_queue.append(req)
+
+        self.waiting_queue = waiting_queue
+        if len(can_run_list) == 0:
+            return None
+
+        # construct a schedule batch with those requests and mark as decode
+        new_batch = ScheduleBatch.init_new(
+            can_run_list,
+            self.req_to_token_pool,
+            self.token_to_kv_pool_allocator,
+            self.tree_cache,
+            self.model_config,
+            self.enable_overlap,
+            self.spec_algorithm,
+        )
+
+        # construct fake completed prefill
+        new_batch.prepare_for_prebuilt_extend()
+        new_batch.process_prebuilt_extend(self.server_args, self.model_config)
+
+        return new_batch
+
+    def process_decode_queue(self: Scheduler):
+        # try to resume retracted requests if there are enough space for another `num_reserved_decode_tokens` decode steps
+        resumed_reqs = self.disagg_decode_prealloc_queue.resume_retracted_reqs()
+        self.waiting_queue.extend(resumed_reqs)
+        if len(self.disagg_decode_prealloc_queue.retracted_queue) > 0:
+            # if there are still retracted requests, we do not allocate new requests
+            return
+
+        req_conns = self.disagg_decode_prealloc_queue.pop_preallocated()
+        self.disagg_decode_transfer_queue.extend(req_conns)
+        alloc_reqs = (
+            self.disagg_decode_transfer_queue.pop_transferred()
+        )  # the requests which kv has arrived
+        self.waiting_queue.extend(alloc_reqs)
diff --git a/python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py b/python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py
new file mode 100644
index 000000000..c1cb17c04
--- /dev/null
+++ b/python/sglang/srt/disaggregation/decode_schedule_batch_mixin.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import logging
+from http import HTTPStatus
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.disaggregation.utils import prepare_abort
+from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+    from sglang.srt.server_args import ServerArgs
+
+
+class ScheduleBatchDisaggregationDecodeMixin:
+
+    def prepare_for_prebuilt_extend(self: ScheduleBatch):
+        """
+        Prepare a prebuilt extend by populate metadata
+        Adapted from .prepare_for_extend().
+        """
+
+        self.forward_mode = ForwardMode.EXTEND
+        reqs = self.reqs
+        input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
+        extend_num_tokens = sum(len(ids) for ids in input_ids)
+        seq_lens = []
+        pre_lens = []
+        req_pool_indices = []
+
+        # Pre-calculate total size
+        total_size = sum(req.extend_input_len for req in reqs)
+        out_cache_loc = torch.empty(total_size, dtype=torch.int64, device=self.device)
+
+        # Fill the tensor in one pass
+        offset = 0
+        for i, req in enumerate(reqs):
+            req_pool_indices.append(req.req_pool_idx)
+
+            chunk = self.req_to_token_pool.req_to_token[req.req_pool_idx][
+                : req.extend_input_len
+            ]
+            assert (
+                offset + req.extend_input_len <= total_size
+            ), f"Exceeds total size: offset={offset}, req.extend_input_len={req.extend_input_len}, total_size={total_size}"
+            out_cache_loc[offset : offset + req.extend_input_len] = chunk
+            offset += req.extend_input_len
+
+            pre_len = len(req.prefix_indices)
+            seq_len = len(req.origin_input_ids) + max(0, len(req.output_ids) - 1)
+            seq_lens.append(seq_len)
+            if len(req.output_ids) == 0:
+                assert (
+                    seq_len - pre_len == req.extend_input_len
+                ), f"seq_len={seq_len}, pre_len={pre_len}, req.extend_input_len={req.extend_input_len}"
+
+            req.cached_tokens += pre_len - req.already_computed
+            req.already_computed = seq_len
+            req.is_retracted = False
+            pre_lens.append(pre_len)
+            req.extend_logprob_start_len = 0
+
+        extend_input_logprob_token_ids = None
+
+        # Set fields
+        self.input_ids = torch.tensor(
+            sum(input_ids, []), dtype=torch.int32, device=self.device
+        )
+        self.req_pool_indices = torch.tensor(
+            req_pool_indices, dtype=torch.int64, device=self.device
+        )
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64, device=self.device)
+        self.orig_seq_lens = torch.tensor(
+            seq_lens, dtype=torch.int32, device=self.device
+        )
+        self.out_cache_loc = out_cache_loc
+        self.seq_lens_sum = sum(seq_lens)
+
+        if self.return_logprob:
+            self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+            self.token_ids_logprobs = [r.token_ids_logprob for r in reqs]
+
+        self.extend_num_tokens = extend_num_tokens
+        self.prefix_lens = [len(r.prefix_indices) for r in reqs]
+        self.extend_lens = [r.extend_input_len for r in reqs]
+        self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
+        self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
+        self.multimodal_inputs = [r.multimodal_inputs for r in reqs]
+
+        # Build sampling info
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self,
+            self.model_config.vocab_size,
+        )
+
+    def process_prebuilt_extend(
+        self: ScheduleBatch, server_args: ServerArgs, model_config: ModelConfig
+    ):
+        """Assign the buffered last input id to schedule batch"""
+        self.output_ids = []
+        for req in self.reqs:
+            self.output_ids.append(req.output_ids[-1])
+            self.tree_cache.cache_unfinished_req(req)
+            if req.grammar is not None:
+                # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                try:
+                    req.grammar.accept_token(req.output_ids[-1])
+                except ValueError as e:
+                    # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                    # This can happen if the grammar is not set correctly or the token is invalid.
+                    error_message = f"Grammar accept_token failed for req {req.rid} with token {req.output_ids[-1]}: {e}"
+                    self.tree_cache.cache_finished_req(req)
+                    prepare_abort(
+                        req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
+                    )
+                req.grammar.finished = req.finished()
+        self.output_ids = torch.tensor(self.output_ids, device=self.device)
+
+        # Simulate the eagle run. We add mock data to hidden states for the
+        # ease of implementation now meaning the first token will have acc rate
+        # of 0.
+        if not self.spec_algorithm.is_none():
+
+            b = len(self.reqs)
+            topk_p = torch.arange(
+                b * server_args.speculative_eagle_topk,
+                0,
+                -1,
+                device=self.device,
+                dtype=torch.float32,
+            )
+            topk_p = topk_p.reshape(b, server_args.speculative_eagle_topk)
+            topk_p /= b * server_args.speculative_eagle_topk
+            topk_index = torch.arange(
+                b * server_args.speculative_eagle_topk, device=self.device
+            )
+            topk_index = topk_index.reshape(b, server_args.speculative_eagle_topk)
+
+            hidden_states_list = [req.hidden_states_tensor for req in self.reqs]
+            hidden_states = torch.stack(hidden_states_list, dim=0).to(self.device)
+
+            # local import to avoid circular import
+            from sglang.srt.speculative.eagle_utils import EagleDraftInput
+
+            spec_info = EagleDraftInput(
+                topk_p=topk_p,
+                topk_index=topk_index,
+                hidden_states=hidden_states,
+                verified_id=self.output_ids,
+            )
+            spec_info.prepare_for_extend(self)
+            spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+            self.spec_info = spec_info
diff --git a/python/sglang/srt/disaggregation/fake/__init__.py b/python/sglang/srt/disaggregation/fake/__init__.py
new file mode 100644
index 000000000..d7cdb4b27
--- /dev/null
+++ b/python/sglang/srt/disaggregation/fake/__init__.py
@@ -0,0 +1 @@
+from sglang.srt.disaggregation.fake.conn import FakeKVReceiver, FakeKVSender
diff --git a/python/sglang/srt/disaggregation/fake/conn.py b/python/sglang/srt/disaggregation/fake/conn.py
new file mode 100644
index 000000000..120633824
--- /dev/null
+++ b/python/sglang/srt/disaggregation/fake/conn.py
@@ -0,0 +1,85 @@
+import logging
+from typing import List, Optional
+
+import numpy as np
+import numpy.typing as npt
+
+from sglang.srt.disaggregation.base.conn import (
+    BaseKVManager,
+    BaseKVReceiver,
+    BaseKVSender,
+    KVPoll,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# For warmup reqs, we don't kv transfer, we use the fake sender and receiver
+class FakeKVSender(BaseKVSender):
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ):
+        self.has_sent = False
+
+    def poll(self) -> KVPoll:
+        if self.has_sent is False:
+            # Assume handshake completed instantly
+            return KVPoll.WaitingForInput
+        else:
+            # Assume transfer completed instantly
+            logger.debug("FakeKVSender poll success")
+            return KVPoll.Success
+
+    def init(
+        self,
+        kv_indices: list[int],
+        aux_index: Optional[int] = None,
+    ):
+        logger.debug(
+            f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}"
+        )
+        pass
+
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+    ):
+        self.has_sent = True
+        logger.debug(f"FakeKVSender send with kv_indices: {kv_indices}")
+
+    def failure_exception(self):
+        raise Exception("Fake KVSender Exception")
+
+
+class FakeKVReceiver(BaseKVReceiver):
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+        prefill_dp_rank: Optional[int] = None,
+    ):
+        self.has_init = False
+
+    def poll(self) -> KVPoll:
+        if self.has_init is False:
+            # Assume handshake completed instantly
+            return KVPoll.WaitingForInput
+        else:
+            # Assume transfer completed instantly
+            logger.debug("FakeKVReceiver poll success")
+            return KVPoll.Success
+
+    def init(self, kv_indices: list[int], aux_index: Optional[int] = None):
+        self.has_init = True
+        logger.debug(
+            f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}"
+        )
+
+    def failure_exception(self):
+        raise Exception("Fake KVReceiver Exception")
diff --git a/python/sglang/srt/disaggregation/kv_events.py b/python/sglang/srt/disaggregation/kv_events.py
new file mode 100644
index 000000000..f0a3a4357
--- /dev/null
+++ b/python/sglang/srt/disaggregation/kv_events.py
@@ -0,0 +1,412 @@
+"""
+Copyright 2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+KV caching events
+"""
+
+import atexit
+import logging
+import queue
+import threading
+import time
+from abc import ABC, abstractmethod
+from collections import deque
+from itertools import count
+from queue import Queue
+from typing import Any, Callable, Optional, Union
+
+import msgspec
+import zmq
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+
+class EventBatch(
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False,  # type: ignore[call-arg]
+):
+    ts: float
+    events: list[Any]
+    attn_dp_rank: Optional[int] = None
+
+
+class KVCacheEvent(
+    msgspec.Struct,
+    array_like=True,  # type: ignore[call-arg]
+    omit_defaults=True,  # type: ignore[call-arg]
+    gc=False,  # type: ignore[call-arg]
+    tag=True,
+):
+    """Base class for all KV cache-related events"""
+
+
+class BlockStored(KVCacheEvent):
+    block_hashes: list[int]
+    parent_block_hash: Optional[int]
+    token_ids: list[int]
+    block_size: int
+    lora_id: Optional[int]
+
+
+class BlockRemoved(KVCacheEvent):
+    block_hashes: list[int]
+
+
+class AllBlocksCleared(KVCacheEvent):
+    pass
+
+
+class KVEventBatch(EventBatch):
+    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+
+
+class EventPublisher(ABC):
+    """
+    Lightweight publisher for EventBatch batches with
+    support for DP attention.
+
+    In DP attention - each rank has its own Scheduler and
+    KV cache instance in order to avoid duplicate events
+    and ensure proper event attribution. In our implementation
+
+    - Each DP rank has its own EventPublisher
+    - Publishers annotate events with the dp rank
+    - This allows consumers to distinguish events from different DP ranks
+    """
+
+    def __init__(self, attn_dp_rank: int = 0):
+        self._attn_dp_rank = attn_dp_rank
+
+    @abstractmethod
+    def publish(self, events: EventBatch) -> None:
+        """Emit events in order.
+
+        Implementations should guarantee at-least-once delivery and
+        monotonic ordering (e.g., via sequence numbers).
+        """
+
+    @abstractmethod
+    def shutdown(self) -> None:
+        """Shutdown the publisher."""
+
+
+class NullEventPublisher(EventPublisher):
+    """No-op implementation (default when disabled)."""
+
+    def publish(self, events) -> None:
+        return
+
+    def shutdown(self) -> None:
+        return
+
+
+class ZmqEventPublisher(EventPublisher):
+    """Reliable PUB/ROUTER publisher with an in-memory replay buffer.
+
+    Spawns a separate thread to handle publishing from a queue.
+
+    Parameters
+    ----------
+    endpoint:
+        PUB address. Use ``tcp://*:5557`` to bind or ``tcp://host:5557`` to
+        connect.
+    replay_endpoint:
+        Optional ROUTER address for replay requests. When given, subscribers can
+        request missed batches by sending the starting sequence number as an
+        8-byte big-endian integer.
+    buffer_steps:
+        Number of past batches to keep for replay.
+    hwm:
+        ZeroMQ high-water-mark for PUB socket.
+    max_queue_size:
+        Maximum number of events to buffer in memory.
+    topic:
+        Topic to publish events to.
+    """
+
+    SHUTDOWN_TIMEOUT: float = 1.0
+    END_SEQ = (-1).to_bytes(8, "big", signed=True)
+
+    def __init__(
+        self,
+        attn_dp_rank: int,
+        endpoint: str = "tcp://*:5557",
+        replay_endpoint: Optional[str] = None,
+        buffer_steps: int = 10_000,
+        hwm: int = 100_000,
+        max_queue_size: int = 100_000,
+        topic: str = "",
+    ) -> None:
+        # Storage
+        super().__init__(attn_dp_rank)
+        self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size)
+        self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps)
+
+        # ZMQ sockets
+        self._ctx = zmq.Context.instance()
+        self._pub: Optional[zmq.Socket] = None
+        self._replay: Optional[zmq.Socket] = None
+        self._dp_rank = attn_dp_rank
+        self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank)
+        self._replay_endpoint = self.offset_endpoint_port(
+            replay_endpoint, self._dp_rank
+        )
+        self._hwm = hwm
+        self._socket_setup()
+
+        # Payload
+        self._seq_gen = count()
+        self._topic_bytes = topic.encode("utf-8")
+
+        # Thread
+        self._running = True
+        logger.info("Starting ZMQ publisher thread")
+
+        self._thread = threading.Thread(
+            target=self._publisher_thread, daemon=True, name="zmq-publisher"
+        )
+        self._thread.start()
+
+        atexit.register(self.shutdown)
+
+    def publish(self, events: EventBatch) -> None:
+        if not self._running:
+            raise RuntimeError("Publisher is closed")
+        if events.attn_dp_rank is None:
+            events.attn_dp_rank = self._dp_rank
+        self._event_queue.put(events)
+
+    def shutdown(self) -> None:
+        """Stop the publisher thread and clean up resources."""
+        self._running = False
+        self._event_queue.put_nowait(None)
+
+        start = time.time()
+        pending_items = True
+        while pending_items and (time.time() - start < self.SHUTDOWN_TIMEOUT):
+            pending_items = not self._event_queue.empty()
+            if pending_items:
+                time.sleep(0.1)
+
+        if pending_items:
+            logger.warning(
+                "Warning: Queue still has %s items after %s seconds timeout",
+                self._event_queue.qsize(),
+                self.SHUTDOWN_TIMEOUT,
+            )
+
+        if self._thread.is_alive():
+            self._thread.join(timeout=self.SHUTDOWN_TIMEOUT)
+
+        # Clean up ZMQ resources
+        try:
+            if self._pub is not None:
+                self._pub.close(linger=0)
+            if self._replay is not None:
+                self._replay.close(linger=0)
+        finally:
+            pass  # Do not terminate context; other sockets may use it
+
+    def _socket_setup(self) -> None:
+        """Initialize sockets
+        https://pyzmq.readthedocs.io/en/v19.0.0/morethanbindings.html#thread-safety
+        """
+        if self._pub is None:
+            self._pub = self._ctx.socket(zmq.PUB)
+            self._pub.set_hwm(self._hwm)
+            # Heuristic: bind if wildcard / * present, else connect.
+            # bind stable, connect volatile convention
+            if (
+                "*" in self._endpoint
+                or "::" in self._endpoint
+                or self._endpoint.startswith("ipc://")
+                or self._endpoint.startswith("inproc://")
+            ):
+                self._pub.bind(self._endpoint)
+            else:
+                self._pub.connect(self._endpoint)
+
+        # Set up replay socket: use ROUTER
+        # 1) handles multiple REQ clients (identities)
+        # 2) lets us send back one request → many replies (streamed events)
+        # 3) works in our non‑blocking poll loop alongside PUB
+        if self._replay_endpoint is not None:
+            self._replay = self._ctx.socket(zmq.ROUTER)
+            self._replay.bind(self._replay_endpoint)
+
+    def _publisher_thread(self) -> None:
+        """Background thread that processes the event queue."""
+        self._pack = msgspec.msgpack.Encoder()
+
+        assert self._pub is not None  # narrows type for mypy
+
+        while self._running or self._event_queue.qsize() > 0:
+            # --- replay (non-critical) ---------------------------------
+            if self._replay is not None and self._replay.poll(0):
+                try:
+                    self._service_replay()
+                except Exception as e:
+                    logger.exception("Error in replay: %s", e)
+
+            # --- main queue (critical) ---------------------------------
+            try:
+                event = self._event_queue.get(timeout=0.1)
+                if event is None:
+                    break  # Sentinel received, exit thread
+            except queue.Empty:
+                continue
+
+            try:
+                seq = next(self._seq_gen)
+
+                payload = self._pack.encode(event)
+                seq_bytes = seq.to_bytes(8, "big")
+                self._pub.send_multipart((self._topic_bytes, seq_bytes, payload))
+
+                self._buffer.append((seq, payload))
+                self._event_queue.task_done()
+
+            except Exception as e:
+                # Publishing failed;  back-off a bit to avoid a tight error loop
+                logger.exception("Error in publisher thread: %s", e)
+                time.sleep(0.1)
+
+    def _service_replay(self) -> None:
+        """If a replay request is waiting, send buffered batches."""
+        assert self._replay is not None  # narrows type for mypy
+
+        frame = self._replay.recv_multipart()
+        if len(frame) != 3:
+            logger.warning("Invalid replay request: %s", frame)
+            return
+        client_id, _, start_seq_bytes = frame
+        start_seq = int.from_bytes(start_seq_bytes, "big")
+
+        for seq, buf in self._buffer:
+            if seq >= start_seq:
+                # [identity, empty_delim, seq_bytes, payload]
+                # (identity, empty_delim) are stripped off by the router
+                # receiving payload is (seq_bytes, payload)
+                self._replay.send_multipart(
+                    (client_id, b"", seq.to_bytes(8, "big"), buf)
+                )
+        # Send end of sequence marker
+        # receiving payload is (-1, b""")
+        self._replay.send_multipart((client_id, b"", self.END_SEQ, b""))
+
+    @staticmethod
+    def offset_endpoint_port(
+        endpoint: Optional[str], data_parallel_rank: int
+    ) -> Optional[str]:
+        """Helper function to offset the port in an endpoint by
+            the data parallel rank.
+
+        Args:
+            endpoint: The endpoint string
+                (e.g., "tcp://*:5557" or "inproc://cache")
+            data_parallel_rank: The data parallel rank to offset by
+
+        Returns:
+            The endpoint with the port offset by data_parallel_rank
+                or suffix appended
+        """
+        # Do nothing if input is None or data_parallel_rank is 0
+        if not endpoint or data_parallel_rank == 0:
+            return endpoint
+
+        if "inproc" in endpoint:
+            return f"{endpoint}_dp{data_parallel_rank}"
+        if "tcp" in endpoint:
+            if endpoint and ":" in endpoint:
+                # Get everything after the last colon (the port)
+                last_colon_idx = endpoint.rfind(":")
+                base_addr = endpoint[:last_colon_idx]
+                base_port = int(endpoint[last_colon_idx + 1 :])
+                new_port = base_port + data_parallel_rank
+                return f"{base_addr}:{new_port}"
+            return endpoint
+        raise ValueError("Invalid endpoint: must contain 'inproc' or 'tcp'")
+
+
+class KVEventsConfig(BaseModel):
+    """Configuration for KV event publishing."""
+
+    publisher: str = "null"
+    """The publisher to use for publishing kv events. Can be "null", "zmq".
+    """
+
+    endpoint: str = "tcp://*:5557"
+    """The zmq endpoint to use for publishing kv events.
+    """
+
+    replay_endpoint: Optional[str] = None
+    """The zmq endpoint to use for replaying kv events.
+    """
+
+    buffer_steps: int = 10_000
+    """The number of steps to cache for replay endpoint. Will only save
+    events from the last N steps for the replay endpoint.
+    """
+
+    hwm: int = 100_000
+    """The zmq high water mark for the event publisher. After queueing N events,
+    events will start dropping if the consumer is not keeping up.
+    """
+
+    max_queue_size: int = 100_000
+    """The maximum number of events to queue while waiting for publishing.
+    """
+
+    topic: str = ""
+    """The topic to use for the event publisher. Consumers can subscribe to
+    this topic to receive events.
+    """
+
+    @classmethod
+    def from_cli(cls, cli_value: str) -> "KVEventsConfig":
+        """Parse the CLI value for the event publisher config."""
+        return KVEventsConfig.model_validate_json(cli_value)
+
+
+class EventPublisherFactory:
+    _registry: dict[str, Callable[..., EventPublisher]] = {
+        "null": NullEventPublisher,
+        "zmq": ZmqEventPublisher,
+    }
+
+    @classmethod
+    def register_publisher(cls, name: str, ctor: Callable[..., EventPublisher]) -> None:
+        if name in cls._registry:
+            raise KeyError(f"publisher '{name}' already registered")
+        cls._registry[name] = ctor
+
+    @classmethod
+    def create(cls, config: Optional[str], attn_dp_rank: int = 0) -> EventPublisher:
+        """Create publisher from a config mapping."""
+        if not config:
+            return NullEventPublisher()
+        config = KVEventsConfig.from_cli(config)
+        config_dict = config.model_dump()
+
+        kind = config_dict.pop("publisher", "null")
+        try:
+            constructor = cls._registry[kind]
+        except KeyError as exc:
+            raise ValueError(f"Unknown event publisher '{kind}'") from exc
+        return constructor(attn_dp_rank=attn_dp_rank, **config_dict)
diff --git a/python/sglang/srt/disaggregation/mini_lb.py b/python/sglang/srt/disaggregation/mini_lb.py
new file mode 100644
index 000000000..5aaa2a70e
--- /dev/null
+++ b/python/sglang/srt/disaggregation/mini_lb.py
@@ -0,0 +1,6 @@
+raise RuntimeError(
+    """The 'mini_lb' module has been relocated to the 'sglang_router' package.
+    We recommend installing 'sglang-router' with Rust support for optimal performance.
+    If you encounter issues building the router with Rust, set the environment variable
+    'SGLANG_ROUTER_BUILD_NO_RUST=1' and add '--mini-lb' to the command line to use the Python version of 'mini_lb'."""
+)
diff --git a/python/sglang/srt/disaggregation/mooncake/__init__.py b/python/sglang/srt/disaggregation/mooncake/__init__.py
new file mode 100644
index 000000000..bea967e4e
--- /dev/null
+++ b/python/sglang/srt/disaggregation/mooncake/__init__.py
@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.mooncake.conn import (
+    MooncakeKVBootstrapServer,
+    MooncakeKVManager,
+    MooncakeKVReceiver,
+    MooncakeKVSender,
+)
diff --git a/python/sglang/srt/disaggregation/mooncake/conn.py b/python/sglang/srt/disaggregation/mooncake/conn.py
new file mode 100644
index 000000000..f69d29622
--- /dev/null
+++ b/python/sglang/srt/disaggregation/mooncake/conn.py
@@ -0,0 +1,1704 @@
+from __future__ import annotations
+
+import asyncio
+import concurrent.futures
+import ctypes
+import dataclasses
+import logging
+import os
+import queue
+import socket
+import struct
+import threading
+import time
+from collections import defaultdict
+from functools import cache
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import numpy.typing as npt
+import requests
+import zmq
+from aiohttp import web
+
+from sglang.srt.disaggregation.base.conn import (
+    BaseKVBootstrapServer,
+    BaseKVManager,
+    BaseKVReceiver,
+    BaseKVSender,
+    KVArgs,
+    KVPoll,
+)
+from sglang.srt.disaggregation.common.utils import (
+    FastQueue,
+    group_concurrent_contiguous,
+)
+from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_rank,
+    get_attention_dp_size,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+)
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import (
+    format_tcp_address,
+    get_bool_env_var,
+    get_free_port,
+    get_int_env_var,
+    get_ip,
+    get_local_ip_auto,
+    is_valid_ipv6_address,
+    maybe_wrap_ipv6_address,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class KVTransferError(Exception):
+    def __init__(self, bootstrap_room: int, failure_reason: str):
+        super().__init__(failure_reason)
+        self.bootstrap_room = bootstrap_room
+        self.failure_reason = failure_reason
+
+    def __str__(self):
+        return f"KVTransferError(bootstrap_room={self.bootstrap_room}): {self.failure_reason}"
+
+
+# prefill
+@dataclasses.dataclass
+class TransferKVChunk:
+    room: int
+    prefill_kv_indices: npt.NDArray[np.int32]
+    index_slice: slice
+    is_last: bool
+    prefill_aux_index: Optional[int]
+
+
+# decode
+@dataclasses.dataclass
+class TransferInfo:
+    room: int
+    endpoint: str
+    dst_port: int
+    mooncake_session_id: str
+    dst_kv_indices: npt.NDArray[np.int32]
+    dst_aux_index: int
+    required_dst_info_num: int
+    is_dummy: bool
+
+    @classmethod
+    def from_zmq(cls, msg: List[bytes]):
+        if msg[4] == b"" and msg[5] == b"":
+            is_dummy = True
+            dst_kv_indices = np.array([], dtype=np.int32)
+            dst_aux_index = None
+        else:
+            dst_kv_indices = np.frombuffer(msg[4], dtype=np.int32)
+            dst_aux_index = int(msg[5].decode("ascii"))
+            is_dummy = False
+        return cls(
+            room=int(msg[0].decode("ascii")),
+            endpoint=msg[1].decode("ascii"),
+            dst_port=int(msg[2].decode("ascii")),
+            mooncake_session_id=msg[3].decode("ascii"),
+            dst_kv_indices=dst_kv_indices,
+            dst_aux_index=dst_aux_index,
+            required_dst_info_num=int(msg[6].decode("ascii")),
+            is_dummy=is_dummy,
+        )
+
+
+# decode
+@dataclasses.dataclass
+class KVArgsRegisterInfo:
+    room: str
+    endpoint: str
+    dst_port: int
+    mooncake_session_id: str
+    dst_kv_ptrs: list[int]
+    dst_aux_ptrs: list[int]
+    dst_tp_rank: int
+    dst_attn_tp_size: int
+    dst_kv_item_len: int
+
+    @classmethod
+    def from_zmq(cls, msg: List[bytes]):
+        return cls(
+            room=str(msg[0].decode("ascii")),
+            endpoint=msg[1].decode("ascii"),
+            dst_port=int(msg[2].decode("ascii")),
+            mooncake_session_id=msg[3].decode("ascii"),
+            dst_kv_ptrs=list(struct.unpack(f"{len(msg[4])//8}Q", msg[4])),
+            dst_aux_ptrs=list(struct.unpack(f"{len(msg[5])//8}Q", msg[5])),
+            dst_tp_rank=int(msg[6].decode("ascii")),
+            dst_attn_tp_size=int(msg[7].decode("ascii")),
+            dst_kv_item_len=int(msg[8].decode("ascii")),
+        )
+
+
+class AuxDataCodec:
+    """Handles serialization and deserialization of auxiliary data buffers"""
+
+    @staticmethod
+    def serialize_data_from_buffer(src_addr, data_length):
+        """Serialize data from memory buffer to bytes"""
+        buffer = (ctypes.c_byte * data_length).from_address(src_addr)
+        return bytes(buffer)
+
+    @staticmethod
+    def deserialize_data_to_buffer(kv_args, buffer_index, aux_index, data):
+        """Deserialize bytes into target memory buffer"""
+        dst_aux_ptr = kv_args.aux_data_ptrs[buffer_index]
+        item_len = kv_args.aux_item_lens[buffer_index]
+        dst_addr = dst_aux_ptr + item_len * aux_index
+        buffer = (ctypes.c_byte * len(data)).from_address(dst_addr)
+        buffer[:] = data
+        return
+
+
+class MooncakeKVManager(BaseKVManager):
+    AUX_DATA_HEADER = b"AUX_DATA"
+
+    def __init__(
+        self,
+        args: KVArgs,
+        disaggregation_mode: DisaggregationMode,
+        server_args: ServerArgs,
+        is_mla_backend: Optional[bool] = False,
+    ):
+        self.kv_args = args
+        self.local_ip = get_local_ip_auto()
+        self.is_mla_backend = is_mla_backend
+        self.disaggregation_mode = disaggregation_mode
+        self.init_engine()
+        # for p/d multi node infer
+        self.bootstrap_host = server_args.host
+        self.bootstrap_port = server_args.disaggregation_bootstrap_port
+        self.dist_init_addr = server_args.dist_init_addr
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_dp_size = get_attention_dp_size()
+        self.attn_dp_rank = get_attention_dp_rank()
+        self.system_dp_size = (
+            1 if server_args.enable_dp_attention else server_args.dp_size
+        )
+        self.system_dp_rank = (
+            self.kv_args.system_dp_rank if self.kv_args.system_dp_rank else 0
+        )
+        self.pp_size = server_args.pp_size
+        self.pp_rank = self.kv_args.pp_rank
+        self.request_status: Dict[int, KVPoll] = {}
+        self.rank_port = None
+        self.server_socket = zmq.Context().socket(zmq.PULL)
+        if is_valid_ipv6_address(self.local_ip):
+            self.server_socket.setsockopt(zmq.IPV6, 1)
+
+        self.register_buffer_to_engine()
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
+            self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
+            self.start_prefill_thread()
+            self._register_to_bootstrap()
+            self.session_failures = defaultdict(int)
+            self.failed_sessions = set()
+            self.session_lock = threading.Lock()
+            self.pp_group = get_pp_group()
+            # Determine the number of threads to use for kv sender
+            cpu_count = os.cpu_count()
+            transfer_thread_pool_size = get_int_env_var(
+                "SGLANG_DISAGGREGATION_THREAD_POOL_SIZE",
+                min(max(4, int(0.75 * cpu_count) // 8), 12),
+            )
+            transfer_queue_size = get_int_env_var("SGLANG_DISAGGREGATION_QUEUE_SIZE", 4)
+            self.transfer_queues: List[FastQueue] = [
+                FastQueue() for _ in range(transfer_queue_size)
+            ]
+            assert transfer_thread_pool_size >= transfer_queue_size, (
+                f"The environment variable SGLANG_DISAGGREGATION_THREAD_POOL_SIZE={transfer_thread_pool_size} must be "
+                f"greater than or equal to SGLANG_DISAGGREGATION_QUEUE_SIZE={transfer_queue_size}."
+            )
+            self.executors = [
+                concurrent.futures.ThreadPoolExecutor(
+                    transfer_thread_pool_size // transfer_queue_size
+                )
+                for _ in range(transfer_queue_size)
+            ]
+            for queue, executor in zip(self.transfer_queues, self.executors):
+                threading.Thread(
+                    target=self.transfer_worker, args=(queue, executor), daemon=True
+                ).start()
+            # If a timeout happens on the prefill side, it means prefill instances
+            # fail to receive the KV indices from the decode instance of this request.
+            # These timeout requests should be aborted to release the tree cache.
+            self.bootstrap_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 300
+            )
+
+            self.enable_custom_mem_pool = get_bool_env_var(
+                "SGLANG_MOONCAKE_CUSTOM_MEM_POOL", "false"
+            )
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.heartbeat_failures = {}
+            self.session_pool = defaultdict(requests.Session)
+            self.session_pool_lock = threading.Lock()
+            self.addr_to_rooms_tracker = defaultdict(set)
+            self.connection_lock = threading.Lock()
+            self.required_prefill_response_num_table: Dict[int, int] = {}
+            self.prefill_response_tracker: Dict[int, Set[int]] = defaultdict(set)
+            # Heartbeat interval should be at least 2 seconds
+            self.heartbeat_interval = max(
+                float(os.getenv("SGLANG_DISAGGREGATION_HEARTBEAT_INTERVAL", 5.0)), 2.0
+            )
+            # Heartbeat failure should be at least 1
+            self.max_failures = max(
+                get_int_env_var("SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE", 2), 1
+            )
+            self.start_decode_thread()
+            self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
+            self.prefill_attn_tp_size_table: Dict[str, int] = {}
+            self.prefill_dp_size_table: Dict[str, int] = {}
+            self.prefill_pp_size_table: Dict[str, int] = {}
+            # If a timeout happens on the decode side, it means decode instances
+            # fail to receive the KV Cache transfer done signal after bootstrapping.
+            # These timeout requests should be aborted to release the tree cache.
+            self.waiting_timeout = get_int_env_var(
+                "SGLANG_DISAGGREGATION_WAITING_TIMEOUT", 300
+            )
+        else:
+            raise ValueError(
+                f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
+            )
+
+        self.failure_records: Dict[int, str] = {}
+        self.failure_lock = threading.Lock()
+
+    def init_engine(self):
+        self.engine = MooncakeTransferEngine(
+            hostname=self.local_ip,
+            gpu_id=self.kv_args.gpu_id,
+            ib_device=self.kv_args.ib_device,
+        )
+
+    def register_buffer_to_engine(self):
+        # Batch register KV data buffers
+        if self.kv_args.kv_data_ptrs and self.kv_args.kv_data_lens:
+            self.engine.batch_register(
+                self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
+            )
+
+        # Batch register auxiliary data buffers
+        if self.kv_args.aux_data_ptrs and self.kv_args.aux_data_lens:
+            self.engine.batch_register(
+                self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+            )
+
+    @cache
+    def _connect(self, endpoint: str, is_ipv6: bool = False):
+        socket = zmq.Context().socket(zmq.PUSH)
+        if is_ipv6:
+            socket.setsockopt(zmq.IPV6, 1)
+        socket.connect(endpoint)
+        return socket
+
+    def _transfer_data(self, mooncake_session_id, transfer_blocks):
+        if not transfer_blocks:
+            return 0
+
+        src_addrs, dst_addrs, lengths = zip(*transfer_blocks)
+        return self.engine.batch_transfer_sync(
+            mooncake_session_id, list(src_addrs), list(dst_addrs), list(lengths)
+        )
+
+    def send_kvcache(
+        self,
+        mooncake_session_id: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ):
+        # Group by indices
+        prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
+            prefill_kv_indices, dst_kv_indices
+        )
+
+        layers_params = None
+
+        # pp is not supported on the decode side yet
+        start_layer = self.kv_args.prefill_start_layer
+        end_layer = start_layer + len(self.kv_args.kv_data_ptrs)
+        if self.is_mla_backend:
+            src_kv_ptrs = self.kv_args.kv_data_ptrs
+            layers_per_pp_stage = len(src_kv_ptrs)
+            dst_kv_ptrs = dst_kv_ptrs[start_layer:end_layer]
+            kv_item_len = self.kv_args.kv_item_lens[0]
+            layers_params = [
+                (
+                    src_kv_ptrs[layer_id],
+                    dst_kv_ptrs[layer_id],
+                    kv_item_len,
+                )
+                for layer_id in range(layers_per_pp_stage)
+            ]
+        else:
+            num_kv_layers = len(self.kv_args.kv_data_ptrs) // 2
+            dst_num_total_layers = num_kv_layers * self.pp_size
+            src_k_ptrs = self.kv_args.kv_data_ptrs[:num_kv_layers]
+            src_v_ptrs = self.kv_args.kv_data_ptrs[num_kv_layers:]
+            layers_per_pp_stage = len(src_k_ptrs)
+            dst_k_ptrs = dst_kv_ptrs[start_layer:end_layer]
+            dst_v_ptrs = dst_kv_ptrs[
+                dst_num_total_layers + start_layer : dst_num_total_layers + end_layer
+            ]
+            kv_item_len = self.kv_args.kv_item_lens[0]
+            layers_params = [
+                (
+                    src_k_ptrs[layer_id],
+                    dst_k_ptrs[layer_id],
+                    kv_item_len,
+                )
+                for layer_id in range(layers_per_pp_stage)
+            ] + [
+                (
+                    src_v_ptrs[layer_id],
+                    dst_v_ptrs[layer_id],
+                    kv_item_len,
+                )
+                for layer_id in range(layers_per_pp_stage)
+            ]
+        assert layers_params is not None
+
+        def set_transfer_blocks(
+            src_ptr: int, dst_ptr: int, item_len: int
+        ) -> List[Tuple[int, int, int]]:
+            transfer_blocks = []
+            for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
+                src_addr = src_ptr + int(prefill_index[0]) * item_len
+                dst_addr = dst_ptr + int(decode_index[0]) * item_len
+                length = item_len * len(prefill_index)
+                transfer_blocks.append((src_addr, dst_addr, length))
+            return transfer_blocks
+
+        # Worker function for processing a single layer
+        def process_layer(src_ptr: int, dst_ptr: int, item_len: int) -> int:
+            transfer_blocks = set_transfer_blocks(src_ptr, dst_ptr, item_len)
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        # Worker function for processing all layers in a batch
+        def process_layers(layers_params: List[Tuple[int, int, int]]) -> int:
+            transfer_blocks = []
+            for src_ptr, dst_ptr, item_len in layers_params:
+                transfer_blocks.extend(set_transfer_blocks(src_ptr, dst_ptr, item_len))
+            return self._transfer_data(mooncake_session_id, transfer_blocks)
+
+        if self.enable_custom_mem_pool:
+            futures = [
+                executor.submit(
+                    process_layer,
+                    src_ptr,
+                    dst_ptr,
+                    item_len,
+                )
+                for (src_ptr, dst_ptr, item_len) in layers_params
+            ]
+            for future in concurrent.futures.as_completed(futures):
+                status = future.result()
+                if status != 0:
+                    for f in futures:
+                        f.cancel()
+                    return status
+        else:
+            # Combining all layers' params in one batch transfer is more efficient
+            # compared to using multiple threads
+            return process_layers(layers_params)
+
+        return 0
+
+    def send_kvcache_slice(
+        self,
+        mooncake_session_id: str,
+        prefill_kv_indices: npt.NDArray[np.int64],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int64],
+        dst_tp_rank: int,
+        dst_attn_tp_size: int,
+        dst_kv_item_len: int,
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ):
+        """
+        Sends KV cache slices from this Prefill rank to a target Decode rank,
+        supporting generic M-to-N TP size configurations.
+
+        NOTE: This implementation calls the transfer engine for each token slot within
+        each page to ensure correctness for any page_size and head-slicing configuration.
+        This may introduce performance overhead (increased TTFT) for long sequences.
+        """
+        # Extract configuration
+        local_tp_rank_in_group = self.kv_args.engine_rank % self.attn_tp_size
+        src_kv_item_len = self.kv_args.kv_item_lens[0]
+        dst_tp_rank_in_group = dst_tp_rank % dst_attn_tp_size
+        num_kv_heads = self.kv_args.kv_head_num
+        num_layers = len(self.kv_args.kv_data_ptrs)
+        page_size = self.kv_args.page_size
+
+        # Calculate head distribution
+        src_heads_per_rank = num_kv_heads
+        dst_heads_per_rank = num_kv_heads * self.attn_tp_size // dst_attn_tp_size
+        bytes_per_head_slice_to_send = (
+            dst_kv_item_len // page_size // dst_heads_per_rank
+        )
+
+        # Determine slicing parameters based on TP configuration
+        if self.attn_tp_size > dst_attn_tp_size:
+            # Send KVCache from multiple prefill instances to 1 decode instance
+            src_head_start_offset = 0
+            num_heads_to_send = src_heads_per_rank
+            dst_head_start_offset = local_tp_rank_in_group * src_heads_per_rank
+        else:
+            # Send KVCache from 1 prefill instance to multiple decode instances
+            src_head_start_offset = (
+                dst_tp_rank_in_group * dst_heads_per_rank
+            ) % src_heads_per_rank
+            num_heads_to_send = dst_heads_per_rank
+            dst_head_start_offset = 0
+
+        # pp is not supported on the decode side yet
+        num_kv_layers = len(self.kv_args.kv_data_ptrs) // 2
+        dst_num_total_layers = num_kv_layers * self.pp_size
+        src_k_ptrs = self.kv_args.kv_data_ptrs[:num_kv_layers]
+        src_v_ptrs = self.kv_args.kv_data_ptrs[num_kv_layers:]
+        layers_per_pp_stage = len(src_k_ptrs)
+        start_layer = self.pp_rank * layers_per_pp_stage
+        end_layer = start_layer + layers_per_pp_stage
+        dst_k_ptrs = dst_kv_ptrs[start_layer:end_layer]
+        dst_v_ptrs = dst_kv_ptrs[
+            dst_num_total_layers + start_layer : dst_num_total_layers + end_layer
+        ]
+
+        # Calculate precise byte offset and length for the sub-slice within the token
+        src_head_slice_offset = src_head_start_offset * bytes_per_head_slice_to_send
+        dst_head_slice_offset = dst_head_start_offset * bytes_per_head_slice_to_send
+        heads_bytes_per_token_to_send = num_heads_to_send * bytes_per_head_slice_to_send
+
+        # Sanity check: The data sub-slice to be sent should fit into the dst buffer.
+        # This means heads_bytes_per_token_to_send <= (dst_kv_item_len // page_size)
+        if heads_bytes_per_token_to_send > (dst_kv_item_len // page_size):
+            logger.error(
+                f"[{mooncake_session_id}] slice size ({heads_bytes_per_token_to_send}) exceeds "
+                f"target token slot size ({dst_kv_item_len // page_size})"
+            )
+            return -1
+
+        layers_params = [
+            (
+                src_k_ptrs[layer_id],
+                dst_k_ptrs[layer_id],
+                src_kv_item_len,
+                dst_kv_item_len,
+                src_head_slice_offset,
+                dst_head_slice_offset,
+                heads_bytes_per_token_to_send,
+            )
+            for layer_id in range(layers_per_pp_stage)
+        ] + [
+            (
+                src_v_ptrs[layer_id],
+                dst_v_ptrs[layer_id],
+                src_kv_item_len,
+                dst_kv_item_len,
+                src_head_slice_offset,
+                dst_head_slice_offset,
+                heads_bytes_per_token_to_send,
+            )
+            for layer_id in range(layers_per_pp_stage)
+        ]
+
+        def process_layer_tp_aware(layer_params):
+            (
+                src_ptr,
+                dst_ptr,
+                src_item_len,
+                dst_item_len,
+                src_head_slice_offset,
+                dst_head_slice_offset,
+                heads_bytes_per_token_to_send,
+            ) = layer_params
+            src_addr_list = []
+            dst_addr_list = []
+            length_list = []
+
+            # Calculate strides for a single token slot
+            bytes_per_token_on_prefill = src_item_len // page_size
+            bytes_per_token_on_decode = dst_item_len // page_size
+
+            for i in range(len(prefill_kv_indices)):
+                prefill_page_idx = int(prefill_kv_indices[i])
+                decode_page_idx = int(dst_kv_indices[i])
+
+                # Get the starting addresses for the current src and dst pages
+                src_page_start_addr = src_ptr + prefill_page_idx * src_item_len
+                dst_page_start_addr = dst_ptr + decode_page_idx * dst_item_len
+
+                # Iterate through each valid token slot within the current page
+                for token_slot_in_page in range(page_size):
+                    # Calculate the start address of the current token slot
+                    src_token_slot_start_addr = (
+                        src_page_start_addr
+                        + token_slot_in_page * bytes_per_token_on_prefill
+                    )
+                    dst_token_slot_start_addr = (
+                        dst_page_start_addr
+                        + token_slot_in_page * bytes_per_token_on_decode
+                    )
+
+                    # Calculate final src and dst addresses by applying head-slice offsets
+                    src_slice_addr = src_token_slot_start_addr + src_head_slice_offset
+                    dst_slice_addr = dst_token_slot_start_addr + dst_head_slice_offset
+
+                    src_addr_list.append(src_slice_addr)
+                    dst_addr_list.append(dst_slice_addr)
+                    length_list.append(heads_bytes_per_token_to_send)
+
+            return self.engine.batch_transfer_sync(
+                mooncake_session_id, src_addr_list, dst_addr_list, length_list
+            )
+
+        futures = [
+            executor.submit(
+                process_layer_tp_aware,
+                layer_params,
+            )
+            for layer_params in layers_params
+        ]
+
+        for future in concurrent.futures.as_completed(futures):
+            status = future.result()
+            if status != 0:
+                for f in futures:
+                    f.cancel()
+                return status
+
+        return 0
+
+    def send_aux(
+        self,
+        req: TransferInfo,
+        prefill_aux_index: int,
+        dst_aux_ptrs: list[int],
+    ):
+        # TODO(shangming): Fix me when nvlink_transport of Mooncake is bug-free
+        if self.enable_custom_mem_pool:
+            return self.send_aux_tcp(req, prefill_aux_index, dst_aux_ptrs)
+
+        transfer_blocks = []
+        prefill_aux_ptrs = self.kv_args.aux_data_ptrs
+        prefill_aux_item_lens = self.kv_args.aux_item_lens
+
+        for i, dst_aux_ptr in enumerate(dst_aux_ptrs):
+            length = prefill_aux_item_lens[i]
+            src_addr = prefill_aux_ptrs[i] + length * prefill_aux_index
+            dst_addr = dst_aux_ptrs[i] + length * req.dst_aux_index
+            transfer_blocks.append((src_addr, dst_addr, length))
+
+        return self._transfer_data(req.mooncake_session_id, transfer_blocks)
+
+    def send_aux_tcp(
+        self,
+        req: TransferInfo,
+        prefill_aux_index: int,
+        dst_aux_ptrs: list[int],
+    ):
+        prefill_aux_ptrs = self.kv_args.aux_data_ptrs
+        prefill_aux_item_lens = self.kv_args.aux_item_lens
+
+        for i in range(len(prefill_aux_ptrs)):
+            length = prefill_aux_item_lens[i]
+            src_addr = prefill_aux_ptrs[i] + length * prefill_aux_index
+            data = AuxDataCodec.serialize_data_from_buffer(src_addr, length)
+
+            self.send_aux_data_to_endpoint(
+                remote=req.endpoint,
+                dst_port=req.dst_port,
+                room=req.room,
+                buffer_index=i,
+                aux_index=req.dst_aux_index,
+                data=data,
+            )
+
+        return 0
+
+    def send_aux_data_to_endpoint(
+        self,
+        remote: str,
+        dst_port: int,
+        room: int,
+        buffer_index: int,
+        aux_index: int,
+        data: bytes,
+    ):
+        socket = self._connect(
+            format_tcp_address(remote, dst_port), is_ipv6=is_valid_ipv6_address(remote)
+        )
+
+        socket.send_multipart(
+            [
+                MooncakeKVManager.AUX_DATA_HEADER,
+                str(room).encode("ascii"),
+                str(buffer_index).encode("ascii"),
+                str(aux_index).encode("ascii"),
+                struct.pack(">I", len(data)),
+                data,
+            ]
+        )
+
+    def sync_status_to_decode_endpoint(
+        self, remote: str, dst_port: int, room: int, status: int, prefill_rank: int
+    ):
+        self._connect(
+            format_tcp_address(remote, dst_port), is_ipv6=is_valid_ipv6_address(remote)
+        ).send_multipart(
+            [
+                str(room).encode("ascii"),
+                str(status).encode("ascii"),
+                str(prefill_rank).encode("ascii"),
+            ]
+        )
+
+    def transfer_worker(
+        self, queue: FastQueue, executor: concurrent.futures.ThreadPoolExecutor
+    ):
+        while True:
+            try:
+                kv_chunk: TransferKVChunk = queue.get()
+                reqs_to_be_processed = (
+                    self.transfer_infos[kv_chunk.room].values()
+                    if kv_chunk.room in self.transfer_infos
+                    else []
+                )
+                polls = []
+                dst_ranks_infos = []
+                local_rank = self.attn_tp_rank * self.pp_size + self.pp_rank
+                for req in reqs_to_be_processed:
+                    if not req.is_dummy:
+                        # Early exit if the request has failed
+                        with self.session_lock:
+                            if req.mooncake_session_id in self.failed_sessions:
+                                self.record_failure(
+                                    kv_chunk.room,
+                                    f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
+                                )
+                                self.update_status(kv_chunk.room, KVPoll.Failed)
+                                self.sync_status_to_decode_endpoint(
+                                    req.endpoint,
+                                    req.dst_port,
+                                    req.room,
+                                    KVPoll.Failed,
+                                    local_rank,
+                                )
+                                break
+
+                        chunked_dst_kv_indice = req.dst_kv_indices[kv_chunk.index_slice]
+
+                        # NOTE: This is temporarily a workaround to deal with the case where the prefill_kv_indices
+                        # is mismatched with the dst_kv_indices when page size > 1, this should never happen.
+                        if len(chunked_dst_kv_indice) < len(
+                            kv_chunk.prefill_kv_indices
+                        ):
+                            logger.warning(
+                                f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
+                            )
+                            kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
+                                : len(chunked_dst_kv_indice)
+                            ]
+
+                        target_rank_registration_info: KVArgsRegisterInfo = (
+                            self.decode_kv_args_table[req.mooncake_session_id]
+                        )
+                        if self.is_mla_backend or (
+                            self.attn_tp_size
+                            == target_rank_registration_info.dst_attn_tp_size
+                        ):
+                            ret = self.send_kvcache(
+                                req.mooncake_session_id,
+                                kv_chunk.prefill_kv_indices,
+                                target_rank_registration_info.dst_kv_ptrs,
+                                chunked_dst_kv_indice,
+                                executor,
+                            )
+                        else:
+                            ret = self.send_kvcache_slice(
+                                req.mooncake_session_id,
+                                kv_chunk.prefill_kv_indices,
+                                target_rank_registration_info.dst_kv_ptrs,
+                                chunked_dst_kv_indice,
+                                target_rank_registration_info.dst_tp_rank,
+                                target_rank_registration_info.dst_attn_tp_size,
+                                target_rank_registration_info.dst_kv_item_len,
+                                executor,
+                            )
+                        if ret != 0:
+                            with self.session_lock:
+                                self.session_failures[req.mooncake_session_id] += 1
+                                # Failures should never happen if the session is not dead, if the session fails once, mark it as failed
+                                if self.session_failures[req.mooncake_session_id] >= 1:
+                                    self.failed_sessions.add(req.mooncake_session_id)
+                                    logger.error(
+                                        f"Session {req.mooncake_session_id} failed."
+                                    )
+                            self.record_failure(
+                                kv_chunk.room,
+                                f"Failed to send kv chunk of {kv_chunk.room} to {req.endpoint}:{req.dst_port}",
+                            )
+                            self.update_status(kv_chunk.room, KVPoll.Failed)
+                            self.sync_status_to_decode_endpoint(
+                                req.endpoint,
+                                req.dst_port,
+                                req.room,
+                                KVPoll.Failed,
+                                local_rank,
+                            )
+                            break
+
+                        if kv_chunk.is_last:
+                            if self.pp_group.is_last_rank:
+                                # Only the last chunk we need to send the aux data
+                                ret = self.send_aux(
+                                    req,
+                                    kv_chunk.prefill_aux_index,
+                                    target_rank_registration_info.dst_aux_ptrs,
+                                )
+                            polls.append(True if ret == 0 else False)
+                            dst_ranks_infos.append(
+                                (req.endpoint, req.dst_port, req.room)
+                            )
+
+                            # Only sync status when all the dst ranks have received the kvcache
+                            if len(polls) == req.required_dst_info_num:
+                                status = KVPoll.Success if all(polls) else KVPoll.Failed
+                                self.update_status(req.room, status)
+                                for endpoint, dst_port, room in dst_ranks_infos:
+                                    self.sync_status_to_decode_endpoint(
+                                        endpoint, dst_port, room, status, local_rank
+                                    )
+                    else:
+                        # Dummy request means the decode instance is not used, so its status can be marked as success directly
+                        # Dummy request does not need to sync status to decode endpoint
+                        if kv_chunk.is_last and req.room in self.request_status:
+                            self.update_status(req.room, KVPoll.Success)
+
+                if (
+                    kv_chunk.room not in self.request_status
+                    or self.check_status(kv_chunk.room) == KVPoll.Success
+                ):
+                    if kv_chunk.room in self.transfer_infos:
+                        self.transfer_infos.pop(kv_chunk.room)
+
+            except Exception as e:
+                # NOTE(shangming): Remove this when we make sure the transfer thread is bug-free
+                raise RuntimeError(
+                    f"Transfer thread failed because of {e}. Prefill instance with bootstrap_port={self.bootstrap_port} is dead."
+                )
+
+    def _bind_server_socket(self):
+        self.server_socket.bind(format_tcp_address(self.local_ip, self.rank_port))
+
+    def start_prefill_thread(self):
+        self.rank_port = get_free_port()
+        self._bind_server_socket()
+
+        def bootstrap_thread():
+            """This thread recvs pre-alloc notification from the decode engine"""
+            # KVPoll.Bootstrapping -> KVPoll.WaitingForInput
+            while True:
+                waiting_req_bytes = self.server_socket.recv_multipart()
+                room = waiting_req_bytes[0].decode("ascii")
+                mooncake_session_id = waiting_req_bytes[3].decode("ascii")
+                if room == "None":
+                    self.decode_kv_args_table[mooncake_session_id] = (
+                        KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
+                    )
+                    with self.session_lock:
+                        if mooncake_session_id in self.failed_sessions:
+                            self.failed_sessions.remove(mooncake_session_id)
+                        if mooncake_session_id in self.session_failures:
+                            del self.session_failures[mooncake_session_id]
+                    logger.debug(
+                        f"Register KVArgs from {mooncake_session_id} successfully"
+                    )
+                    continue
+                else:
+                    required_dst_info_num = int(waiting_req_bytes[6].decode("ascii"))
+                    room = int(room)
+                    if room not in self.transfer_infos:
+                        self.transfer_infos[room] = {}
+
+                    self.transfer_infos[room][mooncake_session_id] = (
+                        TransferInfo.from_zmq(waiting_req_bytes)
+                    )
+                    # NOTE: after bootstrapping we can mark the req as waiting for input
+                    if len(self.transfer_infos[room]) == required_dst_info_num:
+                        self.update_status(room, KVPoll.WaitingForInput)
+
+        threading.Thread(target=bootstrap_thread).start()
+
+    def _handle_aux_data(self, msg: List[bytes]):
+        """Handle AUX_DATA messages received by the decode thread."""
+        room = int(msg[1].decode("ascii"))
+        buffer_index = int(msg[2].decode("ascii"))
+        aux_index = int(msg[3].decode("ascii"))
+        data_length = struct.unpack(">I", msg[4])[0]
+        data = msg[5]
+
+        if len(data) != data_length:
+            logger.error(f"AUX_DATA length mismatch for bootstrap_room {room}")
+            return
+
+        AuxDataCodec.deserialize_data_to_buffer(
+            self.kv_args, buffer_index, aux_index, data
+        )
+
+        logger.debug(
+            f"Received AUX_DATA for bootstrap_room {room} with length:{len(data)}"
+        )
+
+    def start_decode_thread(self):
+        self.rank_port = get_free_port()
+        self._bind_server_socket()
+
+        def decode_thread():
+            while True:
+                msg = self.server_socket.recv_multipart()
+                if msg[0] == MooncakeKVManager.AUX_DATA_HEADER:
+                    self._handle_aux_data(msg)
+                    continue
+
+                (bootstrap_room, status, prefill_rank) = msg
+                status = int(status.decode("ascii"))
+                bootstrap_room = int(bootstrap_room.decode("ascii"))
+                prefill_rank = int(prefill_rank.decode("ascii"))
+
+                if status == KVPoll.Success:
+                    if bootstrap_room in self.request_status:
+                        self.prefill_response_tracker[bootstrap_room].add(prefill_rank)
+                        expected_response_num = (
+                            self.required_prefill_response_num_table[bootstrap_room]
+                        )
+                        arrived_response_num = len(
+                            self.prefill_response_tracker[bootstrap_room]
+                        )
+                        if arrived_response_num == expected_response_num:
+                            self.update_status(bootstrap_room, KVPoll.Success)
+                elif status == KVPoll.Failed:
+                    self.record_failure(
+                        bootstrap_room,
+                        f"Failed to get kvcache from prefill instance, it might be dead",
+                    )
+                    self.update_status(bootstrap_room, status)
+
+        def heartbeat_checker():
+            while True:
+                time.sleep(self.heartbeat_interval)
+                with self.connection_lock:
+                    addresses = list(self.prefill_dp_size_table.keys())
+
+                for bootstrap_addr in addresses:
+                    session = None
+                    try:
+                        with self.session_pool_lock:
+                            session = self.session_pool[bootstrap_addr]
+                        response = session.get(
+                            f"http://{bootstrap_addr}/health",
+                            timeout=(2, 3),
+                            headers={"Connection": "keep-alive"},
+                        )
+                        if response.status_code == 200:
+                            self.heartbeat_failures[bootstrap_addr] = 0
+
+                            current_rooms = self.addr_to_rooms_tracker[
+                                bootstrap_addr
+                            ].copy()
+
+                            for bootstrap_room in current_rooms:
+                                # Remove KVPoll.Success requests from the tracker
+                                if bootstrap_room not in self.request_status:
+                                    self.addr_to_rooms_tracker[bootstrap_addr].discard(
+                                        bootstrap_room
+                                    )
+                        else:
+                            logger.info(
+                                f"Attempting to reconnect to {bootstrap_addr}..."
+                            )
+                            self.heartbeat_failures[bootstrap_addr] = (
+                                self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                            )
+                            with self.session_pool_lock:
+                                if bootstrap_addr in self.session_pool:
+                                    del self.session_pool[bootstrap_addr]
+                    except Exception:
+                        logger.info(f"Attempting to reconnect to {bootstrap_addr}...")
+                        self.heartbeat_failures[bootstrap_addr] = (
+                            self.heartbeat_failures.get(bootstrap_addr, 0) + 1
+                        )
+
+                    if (
+                        self.heartbeat_failures.get(bootstrap_addr, 0)
+                        >= self.max_failures
+                    ):
+                        self._handle_node_failure(bootstrap_addr)
+                        with self.session_pool_lock:
+                            if bootstrap_addr in self.session_pool:
+                                del self.session_pool[bootstrap_addr]
+
+        threading.Thread(target=decode_thread).start()
+        threading.Thread(target=heartbeat_checker).start()
+
+    def add_transfer_request(
+        self,
+        bootstrap_room: int,
+        kv_indices: npt.NDArray[np.int32],
+        index_slice: slice,
+        is_last: bool,
+        aux_index: Optional[int] = None,
+    ):
+        assert self.disaggregation_mode == DisaggregationMode.PREFILL
+        assert not is_last or (is_last and aux_index is not None)
+
+        if (
+            bootstrap_room not in self.request_status
+            or self.check_status(bootstrap_room) == KVPoll.Failed
+        ):
+            logger.debug(
+                "Request with bootstrap_room=%s already failed", bootstrap_room
+            )
+            return
+
+        if bootstrap_room not in self.transfer_infos:
+            # This means that the current rank is a dummy rank for this request,
+            # and it has already been marked as success, so there is no need to
+            # add further chunks into the transfer queue.
+            return
+
+        # NOTE(shangming): sharding according to the dst_infos to make sure
+        # requests with the same dst_sessions will be added into the same
+        # queue, which enables early abort with failed sessions.
+        dst_infos = self.transfer_infos[bootstrap_room].keys()
+        session_port_sum = sum(int(session.rsplit(":", 1)[1]) for session in dst_infos)
+        shard_idx = session_port_sum % len(self.transfer_queues)
+
+        self.transfer_queues[shard_idx].put(
+            TransferKVChunk(
+                room=bootstrap_room,
+                prefill_kv_indices=kv_indices,
+                index_slice=index_slice,
+                is_last=is_last,
+                prefill_aux_index=aux_index,
+            )
+        )
+
+    def check_status(self, bootstrap_room: int):
+        return self.request_status[bootstrap_room]
+
+    def update_status(self, bootstrap_room: int, status: KVPoll):
+        if bootstrap_room not in self.request_status:
+            self.request_status[bootstrap_room] = status
+        else:
+            # NOTE: status is only allowed to be incremented unless it is KVPoll.Failed
+            if status == KVPoll.Failed:
+                self.request_status[bootstrap_room] = KVPoll.Failed
+            else:
+                self.request_status[bootstrap_room] = max(
+                    self.request_status[bootstrap_room], status
+                )
+
+    def record_failure(self, bootstrap_room: int, failure_reason: str):
+        with self.failure_lock:
+            self.failure_records[bootstrap_room] = failure_reason
+
+    def get_session_id(self):
+        return self.engine.get_session_id()
+
+    def _register_to_bootstrap(self):
+        """Register KVSender to bootstrap server via HTTP POST."""
+        if self.dist_init_addr:
+            # multi node case: bootstrap server's host is dist_init_addr
+            if self.dist_init_addr.startswith("["):  # [ipv6]:port or [ipv6]
+                if self.dist_init_addr.endswith("]"):
+                    host = self.dist_init_addr
+                else:
+                    host, _ = self.dist_init_addr.rsplit(":", 1)
+            else:
+                host = socket.gethostbyname(self.dist_init_addr.rsplit(":", 1)[0])
+        else:
+            # single node case: bootstrap server's host is same as http server's host
+            host = self.bootstrap_host
+            host = maybe_wrap_ipv6_address(host)
+
+        bootstrap_server_url = f"{host}:{self.bootstrap_port}"
+        url = f"http://{bootstrap_server_url}/route"
+        payload = {
+            "role": "Prefill",
+            "attn_tp_size": self.attn_tp_size,
+            "attn_tp_rank": self.attn_tp_rank,
+            "attn_dp_size": self.attn_dp_size,
+            "attn_dp_rank": self.attn_dp_rank,
+            "pp_size": self.pp_size,
+            "pp_rank": self.pp_rank,
+            "system_dp_size": self.system_dp_size,
+            "system_dp_rank": self.system_dp_rank,
+            "rank_ip": self.local_ip,
+            "rank_port": self.rank_port,
+        }
+
+        try:
+            response = requests.put(url, json=payload, timeout=5)
+            if response.status_code == 200:
+                logger.debug("Prefill successfully registered to bootstrap server.")
+            else:
+                logger.error(
+                    f"Prefill instance failed to connect to bootstrap server: {response.status_code}, {response.text}"
+                )
+        except Exception as e:
+            logger.error(
+                f"Prefill instance failed to register to bootstrap server: {e}"
+            )
+
+    def _handle_node_failure(self, failed_bootstrap_addr):
+        with self.connection_lock:
+            keys_to_remove = [
+                k for k in self.connection_pool if k.startswith(failed_bootstrap_addr)
+            ]
+            for k in keys_to_remove:
+                del self.connection_pool[k]
+            if failed_bootstrap_addr in self.prefill_attn_tp_size_table:
+                del self.prefill_attn_tp_size_table[failed_bootstrap_addr]
+            if failed_bootstrap_addr in self.prefill_dp_size_table:
+                del self.prefill_dp_size_table[failed_bootstrap_addr]
+            if failed_bootstrap_addr in self.prefill_pp_size_table:
+                del self.prefill_pp_size_table[failed_bootstrap_addr]
+
+            possible_affected_rooms = self.addr_to_rooms_tracker.get(
+                failed_bootstrap_addr, []
+            )
+            if failed_bootstrap_addr in self.addr_to_rooms_tracker:
+                del self.addr_to_rooms_tracker[failed_bootstrap_addr]
+
+        # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
+        affected_rooms = []
+        for room in possible_affected_rooms:
+            if (
+                room in self.request_status
+                and self.check_status(room) != KVPoll.Success
+            ):
+                self.record_failure(
+                    room,
+                    f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr})",
+                )
+                self.update_status(room, KVPoll.Failed)
+                affected_rooms.append(room)
+        logger.error(
+            f"Losing connection with prefill instance (bootstrap_addr: {failed_bootstrap_addr}), {len(affected_rooms)} requests affected"
+        )
+
+
+class MooncakeKVSender(BaseKVSender):
+
+    def __init__(
+        self,
+        mgr: MooncakeKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ):
+        self.kv_mgr = mgr
+        self.bootstrap_room = bootstrap_room
+        self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
+        self.aux_index = None
+        self.bootstrap_server_url = bootstrap_addr
+        self.conclude_state = None
+        self.init_time = time.time()
+        # inner state
+        self.curr_idx = 0
+
+    def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
+        self.num_kv_indices = num_kv_indices
+        self.aux_index = aux_index
+
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+    ):
+        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
+        self.curr_idx += len(kv_indices)
+        is_last = self.curr_idx == self.num_kv_indices
+
+        if not is_last:
+            self.kv_mgr.add_transfer_request(
+                self.bootstrap_room,
+                kv_indices,
+                index_slice,
+                False,
+            )
+        else:
+            self.kv_mgr.add_transfer_request(
+                self.bootstrap_room,
+                kv_indices,
+                index_slice,
+                True,
+                aux_index=self.aux_index,
+            )
+
+    def poll(self) -> KVPoll:
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+            elif status == KVPoll.Bootstrapping:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.bootstrap_timeout:
+                        logger.warning_once(
+                            "Some requests timed out when bootstrapping, "
+                            "which means prefill instances fail to receive the KV indices from the decode instance of this request. "
+                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                        )
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
+
+            return status
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)
+
+    def failure_exception(self):
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        self.clear()
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)
+
+    def abort(self):
+        self.kv_mgr.record_failure(
+            self.bootstrap_room,
+            "Aborted by AbortReq.",
+        )
+        # Explicitly set the status to failure since this request has been aborted
+        self.conclude_state = KVPoll.Failed
+
+
+class MooncakeKVReceiver(BaseKVReceiver):
+    _ctx = zmq.Context()
+    _socket_cache = {}
+    _socket_locks = {}
+    _global_lock = threading.Lock()
+
+    def __init__(
+        self,
+        mgr: MooncakeKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+        prefill_dp_rank: Optional[int] = None,
+    ):
+        self.bootstrap_room = bootstrap_room
+        self.bootstrap_addr = bootstrap_addr
+        self.kv_mgr = mgr
+        self.session_id = self.kv_mgr.get_session_id()
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        self.conclude_state = None
+        self.init_time = None
+
+        if self.bootstrap_addr not in self.kv_mgr.prefill_dp_size_table:
+            (
+                self.prefill_attn_tp_size,
+                self.prefill_dp_size,
+                self.prefill_pp_size,
+            ) = self._get_prefill_parallel_info_from_server()
+            if (
+                self.prefill_attn_tp_size is None
+                or self.prefill_dp_size is None
+                or self.prefill_pp_size is None
+            ):
+                self.kv_mgr.record_failure(
+                    self.bootstrap_room,
+                    f"Could not fetch prefill parallel info from bootstrap_addr: {self.bootstrap_addr}",
+                )
+                self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                return
+            else:
+                logger.debug(
+                    f"Fetch prefill parallel info from [{self.bootstrap_addr}]: DP size:{self.prefill_dp_size}, TP size:{self.prefill_attn_tp_size} PP size:{self.prefill_pp_size}"
+                )
+                self.kv_mgr.prefill_attn_tp_size_table[self.bootstrap_addr] = (
+                    self.prefill_attn_tp_size
+                )
+                self.kv_mgr.prefill_dp_size_table[self.bootstrap_addr] = (
+                    self.prefill_dp_size
+                )
+                self.kv_mgr.prefill_pp_size_table[self.bootstrap_addr] = (
+                    self.prefill_pp_size
+                )
+        else:
+            self.prefill_attn_tp_size = self.kv_mgr.prefill_attn_tp_size_table[
+                self.bootstrap_addr
+            ]
+            self.prefill_dp_size = self.kv_mgr.prefill_dp_size_table[
+                self.bootstrap_addr
+            ]
+            self.prefill_pp_size = self.kv_mgr.prefill_pp_size_table[
+                self.bootstrap_addr
+            ]
+
+        # Currently, we don't allow prefill instance and decode instance to
+        # have different TP sizes per DP rank, except for models using MLA.
+        if self.kv_mgr.attn_tp_size == self.prefill_attn_tp_size:
+            self.target_tp_rank = (
+                self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size
+            )
+            self.required_dst_info_num = 1
+            self.required_prefill_response_num = 1 * (
+                self.prefill_pp_size // self.kv_mgr.pp_size
+            )
+            self.target_tp_ranks = [self.target_tp_rank]
+        elif self.kv_mgr.attn_tp_size > self.prefill_attn_tp_size:
+            if not self.kv_mgr.is_mla_backend:
+                logger.warning_once(
+                    "Performance is NOT guaranteed when using different TP sizes for non-MLA models. "
+                )
+            self.target_tp_rank = (
+                self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size
+            ) // (self.kv_mgr.attn_tp_size // self.prefill_attn_tp_size)
+            self.required_dst_info_num = (
+                self.kv_mgr.attn_tp_size // self.prefill_attn_tp_size
+            )
+            self.required_prefill_response_num = 1 * (
+                self.prefill_pp_size // self.kv_mgr.pp_size
+            )
+            self.target_tp_ranks = [self.target_tp_rank]
+        else:
+            if not self.kv_mgr.is_mla_backend:
+                logger.warning_once(
+                    "Performance is NOT guaranteed when using different TP sizes for non-MLA models. "
+                )
+            # For non-MLA models, one decode rank needs to retrieve KVCache from multiple prefill ranks for non MLA models;
+            self.target_tp_ranks = [
+                rank
+                for rank in range(
+                    (self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size)
+                    * (self.prefill_attn_tp_size // self.kv_mgr.attn_tp_size),
+                    (self.kv_mgr.kv_args.engine_rank % self.kv_mgr.attn_tp_size + 1)
+                    * (self.prefill_attn_tp_size // self.kv_mgr.attn_tp_size),
+                )
+            ]
+
+            # For MLA models, we can retrieve KVCache from only one prefill rank, but we still need to maintain
+            # multiple connections in the connection pool and have to send dummy requests to other prefill ranks,
+            # or the KVPoll will never be set correctly
+            self.target_tp_rank = self.target_tp_ranks[0]
+            self.required_dst_info_num = 1
+            if self.kv_mgr.is_mla_backend:
+                self.required_prefill_response_num = (
+                    self.prefill_pp_size // self.kv_mgr.pp_size
+                )
+            else:
+                self.required_prefill_response_num = (
+                    self.prefill_attn_tp_size // self.kv_mgr.attn_tp_size
+                ) * (self.prefill_pp_size // self.kv_mgr.pp_size)
+
+        if prefill_dp_rank is not None:
+            logger.debug(f"Targeting DP rank: {prefill_dp_rank}")
+            self.prefill_dp_rank = prefill_dp_rank
+        else:
+            self.prefill_dp_rank = bootstrap_room % self.prefill_dp_size
+
+        # FIXME: alias here: target_dp_group -> prefill_dp_rank
+        self.target_dp_group = self.prefill_dp_rank
+
+        self.kv_mgr.required_prefill_response_num_table[self.bootstrap_room] = (
+            self.required_prefill_response_num
+        )
+        # NOTE: key distinguished by bootstrap_addr, target_dp_group, and target_tp_rank
+        bootstrap_key = (
+            f"{self.bootstrap_addr}_{self.target_dp_group}_{self.target_tp_rank}"
+        )
+
+        if bootstrap_key not in self.kv_mgr.connection_pool:
+            bootstrap_infos = []
+            for target_tp_rank in self.target_tp_ranks:
+                for target_pp_rank in range(self.prefill_pp_size):
+                    bootstrap_info = self._get_bootstrap_info_from_server(
+                        target_tp_rank, self.target_dp_group, target_pp_rank
+                    )
+                    if bootstrap_info is not None:
+                        if self.kv_mgr.is_mla_backend:
+                            # For MLA: target_tp_rank is the selected real rank, others are dummy ranks
+                            bootstrap_info["is_dummy"] = not bool(
+                                target_tp_rank == self.target_tp_rank
+                                or self.target_tp_rank is None
+                            )
+                        else:
+                            # For non-MLA: all target_tp_ranks are selected real ranks
+                            bootstrap_info["is_dummy"] = False
+                        logger.debug(
+                            f"Fetched bootstrap info: {bootstrap_info} for DP {self.target_dp_group} TP {target_tp_rank} PP {target_pp_rank}"
+                        )
+                        bootstrap_infos.append(bootstrap_info)
+                    else:
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Could not fetch bootstrap info for engine rank: {self.kv_mgr.kv_args.engine_rank} and target_dp_group: {self.target_dp_group} and target_pp_rank {target_pp_rank}",
+                        )
+                        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Failed)
+                        return
+
+            self.bootstrap_infos = bootstrap_infos
+            self.kv_mgr.connection_pool[bootstrap_key] = self.bootstrap_infos
+
+            # Register kv_args only once to prefill KVManager according to the info fetched from the bootstrap server
+            self._register_kv_args()
+        else:
+            self.bootstrap_infos = self.kv_mgr.connection_pool[bootstrap_key]
+
+        assert len(self.bootstrap_infos) > 0
+        self.kv_mgr.addr_to_rooms_tracker[self.bootstrap_addr].add(self.bootstrap_room)
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.WaitingForInput)
+
+    def _get_bootstrap_info_from_server(
+        self, engine_rank, target_dp_group, target_pp_rank
+    ):
+        """Fetch the bootstrap info from the bootstrap server."""
+        try:
+            url = f"http://{self.bootstrap_addr}/route?engine_rank={engine_rank}&target_dp_group={target_dp_group}&target_pp_rank={target_pp_rank}"
+            response = requests.get(url, timeout=5)
+            if response.status_code == 200:
+                bootstrap_info = response.json()
+                return bootstrap_info
+            else:
+                logger.error(
+                    f"Failed to get prefill server info: {response.status_code}, {response.text}"
+                )
+                return None
+        except Exception as e:
+            logger.error(f"Error fetching prefill info from bootstrap: {e}")
+            return None
+
+    def _get_prefill_parallel_info_from_server(
+        self,
+    ) -> Tuple[Optional[int], Optional[int], Optional[int]]:
+        """Fetch the prefill parallel info from the bootstrap server."""
+        try:
+            url = f"http://{self.bootstrap_addr}/route?engine_rank={-1}&target_dp_group={-1}&target_pp_rank={-1}"
+            response = requests.get(url)
+            if response.status_code == 200:
+                prefill_parallel_info = response.json()
+                return (
+                    int(prefill_parallel_info["prefill_attn_tp_size"]),
+                    int(prefill_parallel_info["prefill_dp_size"]),
+                    int(prefill_parallel_info["prefill_pp_size"]),
+                )
+            else:
+                logger.error(
+                    f"Failed to get prefill parallel info: {response.status_code}, {response.text}"
+                )
+                return None, None, None
+        except Exception as e:
+            logger.error(f"Error fetching prefill parallel info from bootstrap: {e}")
+            return None, None, None
+
+    def _register_kv_args(self):
+        for bootstrap_info in self.bootstrap_infos:
+            packed_kv_data_ptrs = b"".join(
+                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.kv_data_ptrs
+            )
+            packed_aux_data_ptrs = b"".join(
+                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.aux_data_ptrs
+            )
+            # Note(shangming): No need to add pp rank here since pp is not supported on the decode side yet
+            tp_rank = self.kv_mgr.kv_args.engine_rank
+            kv_item_len = self.kv_mgr.kv_args.kv_item_lens[0]
+            dst_tp_rank = str(tp_rank).encode("ascii")
+            dst_attn_tp_size = str(self.kv_mgr.attn_tp_size).encode("ascii")
+            dst_kv_item_len = str(kv_item_len).encode("ascii")
+
+            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
+            with lock:
+                sock.send_multipart(
+                    [
+                        "None".encode("ascii"),
+                        self.kv_mgr.local_ip.encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.session_id.encode("ascii"),
+                        packed_kv_data_ptrs,
+                        packed_aux_data_ptrs,
+                        dst_tp_rank,
+                        dst_attn_tp_size,
+                        dst_kv_item_len,
+                    ]
+                )
+
+    @classmethod
+    def _connect(cls, endpoint: str, is_ipv6: bool = False):
+        with cls._global_lock:
+            if endpoint not in cls._socket_cache:
+                sock = cls._ctx.socket(zmq.PUSH)
+                if is_ipv6:
+                    sock.setsockopt(zmq.IPV6, 1)
+                sock.connect(endpoint)
+                cls._socket_cache[endpoint] = sock
+                cls._socket_locks[endpoint] = threading.Lock()
+            return cls._socket_cache[endpoint], cls._socket_locks[endpoint]
+
+    @classmethod
+    def _connect_to_bootstrap_server(cls, bootstrap_info: dict):
+        ip_address = bootstrap_info["rank_ip"]
+        port = bootstrap_info["rank_port"]
+        is_ipv6_address = is_valid_ipv6_address(ip_address)
+        sock, lock = cls._connect(
+            format_tcp_address(ip_address, port), is_ipv6=is_ipv6_address
+        )
+        return sock, lock
+
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
+        for bootstrap_info in self.bootstrap_infos:
+            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
+            is_dummy = bootstrap_info["is_dummy"]
+
+            with lock:
+                sock.send_multipart(
+                    [
+                        str(self.bootstrap_room).encode("ascii"),
+                        self.kv_mgr.local_ip.encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.session_id.encode("ascii"),
+                        kv_indices.tobytes() if not is_dummy else b"",
+                        str(aux_index).encode("ascii") if not is_dummy else b"",
+                        str(self.required_dst_info_num).encode("ascii"),
+                    ]
+                )
+        self.init_time = time.time()
+
+    def poll(self) -> KVPoll:
+        if self.conclude_state is None:
+            status = self.kv_mgr.check_status(self.bootstrap_room)
+            if status in (KVPoll.Success, KVPoll.Failed):
+                self.conclude_state = status
+            elif status == KVPoll.WaitingForInput:
+                if self.init_time is not None:
+                    now = time.time()
+                    elapsed = now - self.init_time
+                    if elapsed >= self.kv_mgr.waiting_timeout:
+                        logger.warning_once(
+                            "Some requests fail to receive KV Cache transfer done signal after bootstrapping. "
+                            "If a greater mean TTFT is acceptable, you can 'export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=600' (10 minutes) to relax the timeout condition. "
+                        )
+                        self.kv_mgr.record_failure(
+                            self.bootstrap_room,
+                            f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.WaitingForInput",
+                        )
+                        self.conclude_state = KVPoll.Failed
+                        return KVPoll.Failed
+
+            return status
+
+        else:
+            return self.conclude_state
+
+    def clear(self) -> None:
+        if self.bootstrap_room in self.kv_mgr.request_status:
+            self.kv_mgr.request_status.pop(self.bootstrap_room)
+
+        if self.bootstrap_room in self.kv_mgr.required_prefill_response_num_table:
+            self.kv_mgr.required_prefill_response_num_table.pop(self.bootstrap_room)
+
+        if self.bootstrap_room in self.kv_mgr.prefill_response_tracker:
+            self.kv_mgr.prefill_response_tracker.pop(self.bootstrap_room)
+
+    def failure_exception(self):
+        # Explicitly set the status to failure since this request has failed in another rank
+        if self.conclude_state is None:
+            self.conclude_state = KVPoll.Failed
+
+        self.clear()
+
+        with self.kv_mgr.failure_lock:
+            failure_reason = self.kv_mgr.failure_records.pop(
+                self.bootstrap_room, "Failed due to an unknown reason from another rank"
+            )
+        raise KVTransferError(self.bootstrap_room, failure_reason)
+
+    def abort(self):
+        self.kv_mgr.record_failure(
+            self.bootstrap_room,
+            "Aborted by AbortReq.",
+        )
+        # Explicitly set the status to failure since this request has been aborted
+        self.conclude_state = KVPoll.Failed
+
+
+class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
+    def __init__(self, host: str, port: int):
+        self.host = host
+        self.port = port
+        self.app = web.Application()
+        self.store = dict()
+        self.lock = asyncio.Lock()
+        self._setup_routes()
+        self.pp_size = None
+        self.attn_tp_size = None
+        self.dp_size = None
+        self.prefill_port_table: Dict[
+            int, Dict[int, Dict[int, Dict[str, Union[str, int]]]]
+        ] = {}
+
+        # Start bootstrap server
+        self.thread = threading.Thread(target=self._run_server, daemon=True)
+        self.run()
+
+    def run(self):
+        self.thread.start()
+
+    def _setup_routes(self):
+        self.app.router.add_route("*", "/route", self._handle_route)
+        self.app.router.add_get("/health", self._handle_health_check)
+
+    async def _handle_health_check(self, request):
+        return web.Response(text="OK", status=200)
+
+    async def _handle_route(self, request: web.Request):
+        method = request.method
+        if method == "PUT":
+            return await self._handle_route_put(request)
+        elif method == "GET":
+            return await self._handle_route_get(request)
+        else:
+            return web.Response(
+                text="Method not allowed", status=405, content_type="application/json"
+            )
+
+    async def _handle_route_put(self, request: web.Request):
+        data = await request.json()
+        role = data["role"]
+        attn_tp_size = data["attn_tp_size"]
+        attn_tp_rank = data["attn_tp_rank"]
+        attn_dp_size = data["attn_dp_size"]
+        attn_dp_rank = data["attn_dp_rank"]
+        pp_size = data["pp_size"]
+        pp_rank = data["pp_rank"]
+        system_dp_size = data["system_dp_size"]
+        system_dp_rank = data["system_dp_rank"]
+        rank_ip = data["rank_ip"]
+        rank_port = int(data["rank_port"])
+
+        if self.attn_tp_size is None:
+            self.attn_tp_size = attn_tp_size
+
+        if self.dp_size is None:
+            self.dp_size = attn_dp_size if system_dp_size == 1 else system_dp_size
+
+        if self.pp_size is None:
+            self.pp_size = pp_size
+
+        if role == "Prefill":
+            if system_dp_size == 1:
+                dp_group = attn_dp_rank
+            else:
+                dp_group = system_dp_rank
+
+            # Add lock to make sure thread-safe
+            async with self.lock:
+                if dp_group not in self.prefill_port_table:
+                    self.prefill_port_table[dp_group] = {}
+                if attn_tp_rank not in self.prefill_port_table[dp_group]:
+                    self.prefill_port_table[dp_group][attn_tp_rank] = {}
+
+            self.prefill_port_table[dp_group][attn_tp_rank][pp_rank] = {
+                "rank_ip": rank_ip,
+                "rank_port": rank_port,
+            }
+            logger.debug(
+                f"Register prefill bootstrap: DP{dp_group} TP{attn_tp_rank} PP{pp_rank} with rank_ip: {rank_ip} and rank_port: {rank_port}"
+            )
+
+        return web.Response(text="OK", status=200)
+
+    async def _handle_route_get(self, request: web.Request):
+        engine_rank = request.query.get("engine_rank")
+        target_dp_group = request.query.get("target_dp_group")
+        target_pp_rank = request.query.get("target_pp_rank")
+        if not engine_rank or not target_dp_group or not target_pp_rank:
+            return web.Response(text="Missing inputs for bootstrap server.", status=400)
+
+        # Currently we use engine_rank == -1 and target_dp_group == -1 to sync dp size
+        if (
+            int(engine_rank) == -1
+            and int(target_dp_group) == -1
+            and int(target_pp_rank) == -1
+        ):
+            prefill_parallel_info = {
+                "prefill_attn_tp_size": self.attn_tp_size,
+                "prefill_dp_size": self.dp_size,
+                "prefill_pp_size": self.pp_size,
+            }
+            return web.json_response(prefill_parallel_info, status=200)
+
+        # Find corresponding prefill info
+        async with self.lock:
+            bootstrap_info = self.prefill_port_table[int(target_dp_group)][
+                int(engine_rank)
+            ][int(target_pp_rank)]
+
+        if bootstrap_info is not None:
+            return web.json_response(bootstrap_info, status=200)
+        else:
+            return web.Response(text="Bootstrap info not Found", status=404)
+
+    def _run_server(self):
+        try:
+            # Event Loop
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+
+            access_log = None
+            if logging.getLogger(__name__).getEffectiveLevel() <= logging.DEBUG:
+                access_log = self.app.logger
+
+            self._runner = web.AppRunner(self.app, access_log=access_log)
+            self._loop.run_until_complete(self._runner.setup())
+
+            site = web.TCPSite(self._runner, host=self.host, port=self.port)
+            self._loop.run_until_complete(site.start())
+            self._loop.run_forever()
+        except Exception as e:
+            logger.error(f"Server error: {str(e)}")
+        finally:
+            # Cleanup
+            self._loop.run_until_complete(self._runner.cleanup())
+            self._loop.close()
+
+    def close(self):
+        """Shutdown"""
+        if self._loop is not None and self._loop.is_running():
+            self._loop.call_soon_threadsafe(self._loop.stop)
+            logger.info("Stopping server loop...")
+
+        if self.thread.is_alive():
+            self.thread.join(timeout=2)
+            logger.info("Server thread stopped")
+
+    def poll(self) -> KVPoll: ...
diff --git a/python/sglang/srt/disaggregation/mooncake/transfer_engine.py b/python/sglang/srt/disaggregation/mooncake/transfer_engine.py
new file mode 100644
index 000000000..54657bb46
--- /dev/null
+++ b/python/sglang/srt/disaggregation/mooncake/transfer_engine.py
@@ -0,0 +1,164 @@
+import logging
+from typing import List, Optional
+
+from sglang.srt.utils import get_bool_env_var, get_free_port, maybe_wrap_ipv6_address
+
+logger = logging.getLogger(__name__)
+
+
+class MooncakeTransferEngine:
+
+    def __init__(self, hostname: str, gpu_id: int, ib_device: Optional[str] = None):
+        try:
+            from mooncake.engine import TransferEngine
+        except ImportError as e:
+            raise ImportError(
+                "Please install mooncake by following the instructions at "
+                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
+                "to run SGLang with MooncakeTransferEngine."
+            ) from e
+
+        self.engine = TransferEngine()
+        self.hostname = hostname
+        self.gpu_id = gpu_id
+        self.ib_device = ib_device
+
+        self.initialize(
+            hostname=self.hostname,
+            device_name=self.ib_device,
+        )
+        self.session_id = (
+            f"{maybe_wrap_ipv6_address(self.hostname)}:{self.engine.get_rpc_port()}"
+        )
+
+    def register(self, ptr, length):
+        try:
+            ret_value = self.engine.register_memory(ptr, length)
+        except Exception:
+            # Mark register as failed
+            ret_value = -1
+
+        if ret_value != 0:
+            logger.debug("Mooncake memory registration %s failed.", ptr)
+
+    def deregister(self, ptr):
+        try:
+            ret_value = self.engine.unregister_memory(ptr)
+        except Exception:
+            # Mark deregister as failed
+            ret_value = -1
+
+        if ret_value != 0:
+            logger.debug("Mooncake memory deregistration %s failed.", ptr)
+
+    def batch_register(self, ptrs: List[int], lengths: List[int]) -> int:
+        """Batch register multiple memory regions."""
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark batch register as failed
+            ret_value = -1
+            if not hasattr(self.engine, "batch_register_memory"):
+                raise RuntimeError(
+                    "Mooncake's batch register requires a newer version of mooncake-transfer-engine. "
+                    "Please upgrade Mooncake."
+                )
+
+        if ret_value != 0:
+            logger.debug("Mooncake batch memory registration failed.")
+        return ret_value
+
+    def batch_deregister(self, ptrs: List[int]) -> int:
+        """Batch deregister multiple memory regions."""
+        try:
+            ret_value = self.engine.batch_unregister_memory(ptrs)
+        except Exception:
+            # Mark batch deregister as failed
+            ret_value = -1
+
+        if ret_value != 0:
+            logger.debug("Mooncake batch memory deregistration failed.")
+        return ret_value
+
+    def initialize(
+        self,
+        hostname: str,
+        device_name: Optional[str],
+    ) -> None:
+        """Initialize the mooncake instance."""
+        if get_bool_env_var("ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE", "false"):
+            hostname += f":{get_free_port()}:npu_{self.gpu_id}"
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "ascend",
+                device_name if device_name is not None else "",
+            )
+        else:
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "rdma",
+                device_name if device_name is not None else "",
+            )
+        if ret_value != 0:
+            logger.error("Mooncake Transfer Engine initialization failed.")
+            raise RuntimeError("Mooncake Transfer Engine initialization failed.")
+
+    def transfer_sync(
+        self, session_id: str, buffer: int, peer_buffer_address: int, length: int
+    ) -> int:
+        """Synchronously transfer data to the specified address."""
+        try:
+            # the first time: based on session_id (which contains remote_ip) to construct a queue pair, and cache the queue pair
+            # later: based on the cached queue pair to send data
+            ret = self.engine.transfer_sync_write(
+                session_id, buffer, peer_buffer_address, length
+            )
+        except Exception:
+            # Mark transfer request as failed
+            ret = -1
+
+        if ret < 0:
+            # Do not raise an exception here, since some transfer requests fail should be accepted and the execution thread should not be stopped.
+            logger.debug(
+                "Failed to transfer data from %s to %s - %s.",
+                buffer,
+                session_id,
+                peer_buffer_address,
+            )
+
+        return ret
+
+    def batch_transfer_sync(
+        self,
+        session_id: str,
+        buffers: List[int],
+        peer_buffer_addresses: List[int],
+        lengths: List[int],
+    ) -> int:
+        """Synchronously transfer data to the specified addresses in batches."""
+        try:
+            ret = self.engine.batch_transfer_sync_write(
+                session_id, buffers, peer_buffer_addresses, lengths
+            )
+        except Exception:
+            ret = -1
+            # Inform user to upgrade mooncake-transfer-engine >= 0.3.4.post2
+            if not hasattr(self.engine, "batch_transfer_sync_write"):
+                raise RuntimeError(
+                    "Mooncake's batch transfer requires mooncake-transfer-engine >= 0.3.4.post2. "
+                    "Please upgrade Mooncake by 'pip install mooncake-transfer-engine --upgrade'"
+                )
+
+        if ret < 0:
+            logger.debug(
+                "Failed to batch transfer data. Buffers: %s, Session: %s, Peer addresses: %s",
+                buffers,
+                session_id,
+                peer_buffer_addresses,
+            )
+        return ret
+
+    def get_session_id(self):
+        return self.session_id
diff --git a/python/sglang/srt/disaggregation/nixl/__init__.py b/python/sglang/srt/disaggregation/nixl/__init__.py
new file mode 100644
index 000000000..4df7baba2
--- /dev/null
+++ b/python/sglang/srt/disaggregation/nixl/__init__.py
@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.nixl.conn import (
+    NixlKVBootstrapServer,
+    NixlKVManager,
+    NixlKVReceiver,
+    NixlKVSender,
+)
diff --git a/python/sglang/srt/disaggregation/nixl/conn.py b/python/sglang/srt/disaggregation/nixl/conn.py
new file mode 100644
index 000000000..c911319ea
--- /dev/null
+++ b/python/sglang/srt/disaggregation/nixl/conn.py
@@ -0,0 +1,696 @@
+from __future__ import annotations
+
+import asyncio
+import dataclasses
+import logging
+import queue
+import socket
+import struct
+import threading
+import uuid
+from collections import defaultdict
+from functools import cache
+from typing import Dict, List, Optional, Set, Tuple, TypeAlias, Union
+
+import numpy as np
+import numpy.typing as npt
+import requests
+import zmq
+from aiohttp import web
+
+from sglang.srt.disaggregation.base.conn import BaseKVSender, KVArgs, KVPoll
+from sglang.srt.disaggregation.common.conn import (
+    CommonKVBootstrapServer,
+    CommonKVManager,
+    CommonKVReceiver,
+)
+from sglang.srt.disaggregation.common.utils import group_concurrent_contiguous
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import (
+    format_tcp_address,
+    get_local_ip_auto,
+    is_valid_ipv6_address,
+)
+
+logger = logging.getLogger(__name__)
+
+GUARD = "NixlMsgGuard".encode("ascii")
+
+
+@dataclasses.dataclass
+class TransferInfo:
+    """Contains indices for a transfer, sent by KVReceiver. Received by prefill bootstrap thread."""
+
+    room: int
+    endpoint: str
+    dst_port: int
+    agent_name: str
+    dst_kv_indices: npt.NDArray[np.int32]
+    dst_aux_index: int
+    required_dst_info_num: int
+
+    def is_dummy(self):
+        return self.dst_kv_indices.size == 0
+
+    @classmethod
+    def from_zmq(cls, msg: List[bytes]):
+        return cls(
+            room=int(msg[0].decode("ascii")),
+            endpoint=msg[1].decode("ascii"),
+            dst_port=int(msg[2].decode("ascii")),
+            agent_name=msg[3].decode("ascii"),
+            dst_kv_indices=np.frombuffer(msg[4], dtype=np.int32),
+            dst_aux_index=int(msg[5].decode("ascii")),
+            required_dst_info_num=int(msg[6].decode("ascii")),
+        )
+
+
+@dataclasses.dataclass
+class KVArgsRegisterInfo:
+    """Contains base pointers and other info which only needs to be sent once by KVReceiver. Received by prefill bootstrap thread."""
+
+    room: str
+    endpoint: str
+    dst_port: int
+    agent_name: str
+    agent_metadata: bytes
+    dst_kv_ptrs: list[int]
+    dst_aux_ptrs: list[int]
+    gpu_id: int
+    decode_tp_size: int
+    decode_tp_rank: int
+    dst_kv_item_len: int
+
+    @classmethod
+    def from_zmq(cls, msg: List[bytes]):
+        return cls(
+            room=str(msg[0].decode("ascii")),
+            endpoint=msg[1].decode("ascii"),
+            dst_port=int(msg[2].decode("ascii")),
+            agent_name=msg[3].decode("ascii"),
+            agent_metadata=msg[4],
+            dst_kv_ptrs=list(struct.unpack(f"{len(msg[5])//8}Q", msg[5])),
+            dst_aux_ptrs=list(struct.unpack(f"{len(msg[6])//8}Q", msg[6])),
+            gpu_id=int(msg[7].decode("ascii")),
+            decode_tp_size=int(msg[8].decode("ascii")),
+            decode_tp_rank=int(msg[9].decode("ascii")),
+            dst_kv_item_len=int(msg[10].decode("ascii")),
+        )
+
+
+@dataclasses.dataclass
+class TransferStatus:
+    """Used by KV Receiver to know when a transfer is done."""
+
+    # KV chunk IDs that have been received.
+    received_kvs: Set[int] = dataclasses.field(default_factory=set)
+    # Number of kv chunks to expect, will know this after last chunk is received.
+    num_kvs_expected: Optional[int] = None
+    # Whether aux data has been received.
+    received_aux: bool = False
+
+    def is_done(self):
+        if self.num_kvs_expected is None:
+            return False
+        return self.num_kvs_expected == len(self.received_kvs) and self.received_aux
+
+
+class NixlKVManager(CommonKVManager):
+    def __init__(
+        self,
+        args: KVArgs,
+        disaggregation_mode: DisaggregationMode,
+        server_args: ServerArgs,
+        is_mla_backend: Optional[bool] = False,
+    ):
+        super().__init__(args, disaggregation_mode, server_args, is_mla_backend)
+        try:
+            from nixl._api import nixl_agent
+        except ImportError as e:
+            raise ImportError(
+                "Please install NIXL by following the instructions at "
+                "https://github.com/ai-dynamo/nixl/blob/main/README.md "
+                "to run SGLang with NixlTransferEngine."
+            ) from e
+        self.agent = nixl_agent(str(uuid.uuid4()))
+        self.local_ip = get_local_ip_auto()
+        self.server_socket = zmq.Context().socket(zmq.PULL)
+        if is_valid_ipv6_address(self.local_ip):
+            self.server_socket.setsockopt(zmq.IPV6, 1)
+        self.register_buffer_to_engine()
+
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.request_status: Dict[int, KVPoll] = {}
+            self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
+            self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
+            self._start_bootstrap_thread()
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.transfer_statuses: Dict[int, TransferStatus] = defaultdict(
+                TransferStatus
+            )
+        else:
+            raise ValueError(
+                f"Unsupported DisaggregationMode: {self.disaggregation_mode}"
+            )
+
+    def check_status(self, bootstrap_room: int):
+        return self.request_status[bootstrap_room]
+
+    def update_status(self, bootstrap_room: int, status: KVPoll):
+        if bootstrap_room not in self.request_status:
+            self.request_status[bootstrap_room] = status
+        else:
+            # NOTE: The prefill engine could recv bootstrapping first
+            self.request_status[bootstrap_room] = max(
+                self.request_status[bootstrap_room], status
+            )
+
+    def register_buffer_to_engine(self):
+        kv_addrs = []
+        for kv_data_ptr, kv_data_len in zip(
+            self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens
+        ):
+            kv_addrs.append((kv_data_ptr, kv_data_len, self.kv_args.gpu_id, ""))
+        self.kv_descs = self.agent.register_memory(kv_addrs, "VRAM")
+        logger.debug(f"Register kv tensors, len(kv_addr)= {len(kv_addrs)}")
+        if not self.kv_descs:
+            raise Exception("NIXL memory registration failed for kv tensors")
+        aux_addrs = []
+        for aux_data_ptr, aux_data_len in zip(
+            self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+        ):
+            aux_addrs.append((aux_data_ptr, aux_data_len, 0, ""))
+        self.aux_descs = self.agent.register_memory(aux_addrs, "DRAM")
+        logger.debug(f"Register aux tensors, len(aux_addrs)= {len(aux_addrs)}")
+        if not self.aux_descs:
+            raise Exception("NIXL memory registration failed for aux tensors")
+
+    def _add_remote_peer(self, decode_kv_args: KVArgsRegisterInfo):
+        agent_name = decode_kv_args.agent_name
+        if agent_name in self.decode_kv_args_table:
+            logger.info(f"Peer {agent_name} was already registered, ignoring.")
+            return
+        self.decode_kv_args_table[agent_name] = decode_kv_args
+        self.agent.add_remote_agent(decode_kv_args.agent_metadata)
+
+    def send_kvcache(
+        self,
+        peer_name: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        dst_gpu_id: int,
+        notif: str,
+    ):
+        # group by indices
+        prefill_kv_blocks, dst_kv_blocks = group_concurrent_contiguous(
+            prefill_kv_indices, dst_kv_indices
+        )
+
+        logger.debug(f"sending kvcache to {peer_name} with notif {notif}")
+        # Make descs
+        num_layers = len(self.kv_args.kv_data_ptrs)
+        src_addrs = []
+        dst_addrs = []
+        for layer_id in range(num_layers):
+            src_ptr = self.kv_args.kv_data_ptrs[layer_id]
+            dst_ptr = dst_kv_ptrs[layer_id]
+            item_len = self.kv_args.kv_item_lens[layer_id]
+
+            for prefill_index, decode_index in zip(prefill_kv_blocks, dst_kv_blocks):
+                src_addr = src_ptr + int(prefill_index[0]) * item_len
+                dst_addr = dst_ptr + int(decode_index[0]) * item_len
+                length = item_len * len(prefill_index)
+                src_addrs.append((src_addr, length, self.kv_args.gpu_id))
+                dst_addrs.append((dst_addr, length, dst_gpu_id))
+
+        logger.debug(
+            f"len(src_addrs): before group: {len(prefill_kv_indices)}, after group: {len(src_addrs)}"
+        )
+        src_descs = self.agent.get_xfer_descs(src_addrs, "VRAM")
+        dst_descs = self.agent.get_xfer_descs(dst_addrs, "VRAM")
+        # Transfer data
+        xfer_handle = self.agent.initialize_xfer(
+            "WRITE",
+            src_descs,
+            dst_descs,
+            peer_name,
+            notif.encode("ascii"),  # type: ignore
+        )
+        if not xfer_handle:
+            raise Exception("KVSender failed to create transfer")
+        state = self.agent.transfer(xfer_handle)
+        if state == "ERR":
+            raise Exception("KVSender failed to post transfer")
+        return xfer_handle
+
+    def send_kvcache_slice(
+        self,
+        peer_name: str,
+        prefill_kv_indices: npt.NDArray[np.int32],
+        dst_kv_ptrs: list[int],
+        dst_kv_indices: npt.NDArray[np.int32],
+        dst_gpu_id: int,
+        notif: str,
+        prefill_tp_size: int,
+        decode_tp_size: int,
+        decode_tp_rank: int,
+        dst_kv_item_len: int,
+    ):
+        # Get configuration from kv_args
+        local_tp_rank_in_group = self.kv_args.engine_rank % prefill_tp_size
+        dst_tp_rank_in_group = decode_tp_rank % decode_tp_size
+        num_kv_heads = self.kv_args.kv_head_num
+
+        # Calculate head distribution
+        src_heads_per_rank = num_kv_heads
+        dst_heads_per_rank = num_kv_heads * prefill_tp_size // decode_tp_size
+
+        src_kv_item_len = self.kv_args.kv_item_lens[0]
+        page_size = self.kv_args.page_size
+
+        bytes_per_head_slice_to_send = (
+            dst_kv_item_len // page_size // dst_heads_per_rank
+        )
+
+        # Determine which heads to send
+        if prefill_tp_size > decode_tp_size:
+            # Multiple prefill ranks to one decode rank
+            src_head_start_offset = 0
+            num_heads_to_send = src_heads_per_rank
+            dst_head_start_offset = local_tp_rank_in_group * src_heads_per_rank
+        else:
+            # Send KVCache from 1 prefill instance to multiple decode instances
+            src_head_start_offset = (
+                dst_tp_rank_in_group * dst_heads_per_rank
+            ) % src_heads_per_rank
+            num_heads_to_send = dst_heads_per_rank
+            dst_head_start_offset = 0
+
+        # Create transfer descriptors
+        src_addrs = []
+        dst_addrs = []
+
+        bytes_per_token_on_prefill = src_kv_item_len // page_size
+        bytes_per_token_on_decode = dst_kv_item_len // page_size
+
+        num_kv_layers = len(self.kv_args.kv_data_ptrs) // 2
+        src_k_ptrs = self.kv_args.kv_data_ptrs[:num_kv_layers]
+        src_v_ptrs = self.kv_args.kv_data_ptrs[num_kv_layers:]
+        dst_k_ptrs = dst_kv_ptrs[0 : len(src_k_ptrs)]
+        dst_v_ptrs = dst_kv_ptrs[num_kv_layers : num_kv_layers + len(src_v_ptrs)]
+
+        # Calculate precise byte offset and length for the sub-slice within the token
+        src_head_slice_offset = src_head_start_offset * bytes_per_head_slice_to_send
+        dst_head_slice_offset = dst_head_start_offset * bytes_per_head_slice_to_send
+        heads_bytes_per_token_to_send = num_heads_to_send * bytes_per_head_slice_to_send
+
+        src_dst_ptr_pairs = [
+            (
+                src_k_ptrs[layer_id],
+                dst_k_ptrs[layer_id],
+            )
+            for layer_id in range(len(src_k_ptrs))
+        ] + [
+            (
+                src_v_ptrs[layer_id],
+                dst_v_ptrs[layer_id],
+            )
+            for layer_id in range(len(src_v_ptrs))
+        ]
+
+        src_addrs = []
+        dst_addrs = []
+
+        # Calculate strides for a single token slot
+        bytes_per_token_on_prefill = src_kv_item_len // page_size
+        bytes_per_token_on_decode = dst_kv_item_len // page_size
+
+        for src_ptr, dst_ptr in src_dst_ptr_pairs:
+            for i in range(len(prefill_kv_indices)):
+                prefill_page_idx = int(prefill_kv_indices[i])
+                decode_page_idx = int(dst_kv_indices[i])
+
+                # Get the starting addresses for the current src and dst pages
+                src_page_start_addr = src_ptr + prefill_page_idx * src_kv_item_len
+                dst_page_start_addr = dst_ptr + decode_page_idx * dst_kv_item_len
+
+                # Iterate through each valid token slot within the current page
+                for token_slot_in_page in range(page_size):
+                    # Calculate the start address of the current token slot
+                    src_token_slot_start_addr = (
+                        src_page_start_addr
+                        + token_slot_in_page * bytes_per_token_on_prefill
+                    )
+                    dst_token_slot_start_addr = (
+                        dst_page_start_addr
+                        + token_slot_in_page * bytes_per_token_on_decode
+                    )
+
+                    # Calculate final src and dst addresses by applying head-slice offsets
+                    src_slice_addr = src_token_slot_start_addr + src_head_slice_offset
+                    dst_slice_addr = dst_token_slot_start_addr + dst_head_slice_offset
+
+                    src_addrs.append(
+                        (
+                            src_slice_addr,
+                            heads_bytes_per_token_to_send,
+                            self.kv_args.gpu_id,
+                        )
+                    )
+                    dst_addrs.append(
+                        (dst_slice_addr, heads_bytes_per_token_to_send, dst_gpu_id)
+                    )
+
+        # Use NIXL agent for transfer
+        src_descs = self.agent.get_xfer_descs(src_addrs, "VRAM")
+        dst_descs = self.agent.get_xfer_descs(dst_addrs, "VRAM")
+
+        xfer_handle = self.agent.initialize_xfer(
+            "WRITE", src_descs, dst_descs, peer_name, notif.encode("ascii")
+        )
+        if not xfer_handle:
+            raise Exception("Failed to create sliced KV transfer")
+
+        state = self.agent.transfer(xfer_handle)
+        if state == "ERR":
+            raise Exception("Failed to post sliced KV transfer")
+
+        return xfer_handle
+
+    def send_aux(
+        self,
+        peer_name: str,
+        prefill_aux_index: int,
+        dst_aux_ptrs: list[int],
+        dst_aux_index: int,
+        notif: str,
+    ):
+        # Make descs
+        aux_item_len = self.kv_args.aux_item_lens[0]
+        prefill_aux_addr = (
+            self.kv_args.aux_data_ptrs[0] + prefill_aux_index * aux_item_len
+        )
+        decode_aux_addr = dst_aux_ptrs[0] + dst_aux_index * aux_item_len
+        src_addrs = [(prefill_aux_addr, aux_item_len, 0)]
+        dst_addrs = [(decode_aux_addr, aux_item_len, 0)]
+        src_descs = self.agent.get_xfer_descs(src_addrs, "DRAM")
+        dst_descs = self.agent.get_xfer_descs(dst_addrs, "DRAM")
+        # Transfer data
+        xfer_handle = self.agent.initialize_xfer(
+            "WRITE",
+            src_descs,
+            dst_descs,
+            peer_name,
+            notif.encode("ascii"),  # type: ignore
+        )
+        if not xfer_handle:
+            raise Exception("KVSender failed to create transfer")
+        state = self.agent.transfer(xfer_handle)
+        if state == "ERR":
+            raise Exception("KVSender failed to post transfer")
+        return xfer_handle
+
+    def add_transfer_request(
+        self,
+        bootstrap_room: int,
+        kv_indices: npt.NDArray[np.int32],
+        index_slice: slice,
+        is_last: bool,
+        chunk_id: int,
+        aux_index: Optional[int] = None,
+    ):
+        assert self.disaggregation_mode == DisaggregationMode.PREFILL
+        assert not is_last or (is_last and aux_index is not None)
+
+        reqs_to_be_processed = self.transfer_infos[bootstrap_room].values()
+        handles = []
+        for req in reqs_to_be_processed:
+            assert bootstrap_room == req.room
+            if req.is_dummy():
+                continue
+
+            chunked_dst_kv_indice = req.dst_kv_indices[index_slice]
+            assert len(chunked_dst_kv_indice) == len(kv_indices)
+            assert req.agent_name in self.decode_kv_args_table
+
+            notif = "_".join([str(req.room), "kv", str(chunk_id), str(int(is_last))])
+            decode_tp_size = self.decode_kv_args_table[req.agent_name].decode_tp_size
+
+            if decode_tp_size == self.tp_size:
+                kv_xfer_handle = self.send_kvcache(
+                    req.agent_name,
+                    kv_indices,
+                    self.decode_kv_args_table[req.agent_name].dst_kv_ptrs,
+                    chunked_dst_kv_indice,
+                    self.decode_kv_args_table[req.agent_name].gpu_id,
+                    notif,
+                )
+            else:
+                kv_xfer_handle = self.send_kvcache_slice(
+                    req.agent_name,
+                    kv_indices,
+                    self.decode_kv_args_table[req.agent_name].dst_kv_ptrs,
+                    chunked_dst_kv_indice,
+                    self.decode_kv_args_table[req.agent_name].gpu_id,
+                    notif,
+                    prefill_tp_size=self.tp_size,
+                    decode_tp_size=decode_tp_size,
+                    decode_tp_rank=self.decode_kv_args_table[
+                        req.agent_name
+                    ].decode_tp_rank,
+                    dst_kv_item_len=self.decode_kv_args_table[
+                        req.agent_name
+                    ].dst_kv_item_len,
+                )
+
+            handles.append(kv_xfer_handle)
+            # Only the last chunk we need to send the aux data.
+            if is_last:
+                assert aux_index is not None
+                aux_xfer_handle = self.send_aux(
+                    req.agent_name,
+                    aux_index,
+                    self.decode_kv_args_table[req.agent_name].dst_aux_ptrs,
+                    req.dst_aux_index,
+                    str(req.room) + "_aux",
+                )
+                handles.append(aux_xfer_handle)
+        if is_last:
+            del self.transfer_infos[bootstrap_room]
+        return handles
+
+    def update_transfer_status(self):
+        # Process notifications from received transfers.
+        notif_map = self.agent.get_new_notifs()
+        for peer_name, messages in notif_map.items():
+            # We could also check that self.bootstrap_info['agent_name'] matches
+            # the message sender. But the bootstrap room alone should be
+            # sufficient to map the status.
+            for msg in messages:
+                components = msg.decode("ascii").split("_")
+                room = int(components[0])
+                if components[1] == "kv":
+                    chunk_id = int(components[2])
+                    is_last = bool(int(components[3]))
+                    self.transfer_statuses[room].received_kvs.add(chunk_id)
+                    if is_last:
+                        self.transfer_statuses[room].num_kvs_expected = chunk_id + 1
+                elif components[1] == "aux":
+                    self.transfer_statuses[room].received_aux = True
+
+    def check_transfer_done(self, room: int):
+        if room not in self.transfer_statuses:
+            return False
+        return self.transfer_statuses[room].is_done()
+
+    def _bind_server_socket(self):
+        self.server_socket.bind(format_tcp_address(self.local_ip, self.rank_port))
+
+    def _start_bootstrap_thread(self):
+        self._bind_server_socket()
+
+        def bootstrap_thread():
+            """This thread recvs transfer info from the decode engine"""
+            while True:
+                waiting_req_bytes = self.server_socket.recv_multipart()
+                logger.debug(
+                    f"Received multipart with total byte size {sum(len(x) for x in waiting_req_bytes)}"
+                )
+                assert (
+                    waiting_req_bytes[0] == GUARD
+                ), f"First message should be {GUARD}. Foreign traffic?"
+                waiting_req_bytes = waiting_req_bytes[1:]
+                room = waiting_req_bytes[0].decode("ascii")
+                agent_name = waiting_req_bytes[3].decode("ascii")
+                if room == "None":
+                    # Register new peer and save KV base pointers.
+                    self._add_remote_peer(
+                        KVArgsRegisterInfo.from_zmq(waiting_req_bytes)
+                    )
+                    logger.debug(f"Register KVArgs from {agent_name} successfully")
+                    continue
+                room = int(room)
+                if room not in self.transfer_infos:
+                    self.transfer_infos[room] = {}
+                self.transfer_infos[room][agent_name] = TransferInfo.from_zmq(
+                    waiting_req_bytes
+                )
+                required_dst_info_num = self.transfer_infos[room][
+                    agent_name
+                ].required_dst_info_num
+                logger.debug(f"got info {room=} {agent_name=} {required_dst_info_num=}")
+                if len(self.transfer_infos[room]) == required_dst_info_num:
+                    logger.debug(f"{room=} is bootstrapped")
+                    self.update_status(room, KVPoll.WaitingForInput)
+
+        threading.Thread(target=bootstrap_thread).start()
+
+
+class NixlKVSender(BaseKVSender):
+
+    def __init__(
+        self,
+        mgr: NixlKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
+    ):
+        self.kv_mgr = mgr
+        self.bootstrap_room = bootstrap_room
+        self.aux_index = None
+        self.bootstrap_server_url = bootstrap_addr
+        self.xfer_handles = []
+        self.has_sent = False
+        self.chunk_id = 0
+        self.kv_mgr.update_status(self.bootstrap_room, KVPoll.Bootstrapping)
+        # inner state
+        self.curr_idx = 0
+
+    def init(self, num_kv_indices: int, aux_index: Optional[int] = None):
+        self.num_kv_indices = num_kv_indices
+        self.aux_index = aux_index
+
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int32],
+    ):
+        index_slice = slice(self.curr_idx, self.curr_idx + len(kv_indices))
+        self.curr_idx += len(kv_indices)
+        is_last = self.curr_idx == self.num_kv_indices
+
+        new_xfer_handles = self.kv_mgr.add_transfer_request(
+            self.bootstrap_room,
+            kv_indices,
+            index_slice,
+            is_last,
+            self.chunk_id,
+            self.aux_index,
+        )
+        self.xfer_handles.extend(new_xfer_handles)
+        self.chunk_id += 1
+        if is_last:
+            self.has_sent = True
+            del self.kv_mgr.request_status[self.bootstrap_room]
+
+    def poll(self) -> KVPoll:
+        if not self.has_sent:
+            return self.kv_mgr.check_status(self.bootstrap_room)
+        states = [self.kv_mgr.agent.check_xfer_state(x) for x in self.xfer_handles]
+        if all([x == "DONE" for x in states]):
+            return KVPoll.Success  # type: ignore
+        if any([x == "ERR" for x in states]):
+            raise Exception("KVSender transfer encountered an error.")
+        return KVPoll.WaitingForInput  # type: ignore
+
+    def failure_exception(self):
+        raise Exception("Fake KVSender Exception")
+
+
+class NixlKVReceiver(CommonKVReceiver):
+    def __init__(
+        self,
+        mgr: NixlKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+        prefill_dp_rank: Optional[int] = None,
+    ):
+        self.started_transfer = False
+        self.conclude_state = None
+        super().__init__(mgr, bootstrap_addr, bootstrap_room, prefill_dp_rank)
+
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
+        for bootstrap_info in self.bootstrap_infos:
+            logger.debug(
+                f"Fetched bootstrap info: {bootstrap_info} for engine rank: {self.kv_mgr.kv_args.engine_rank}"
+            )
+            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
+            is_dummy = bootstrap_info["is_dummy"]
+            logger.debug(
+                f"Sending to prefill server with bootstrap room {self.bootstrap_room} {is_dummy=}"
+            )
+            with lock:
+                sock.send_multipart(
+                    [
+                        GUARD,
+                        str(self.bootstrap_room).encode("ascii"),
+                        self.kv_mgr.local_ip.encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.kv_mgr.agent.name.encode("ascii"),
+                        kv_indices.tobytes() if not is_dummy else b"",
+                        str(aux_index).encode("ascii"),
+                        str(self.required_dst_info_num).encode("ascii"),
+                    ]
+                )
+
+        self.started_transfer = True
+
+    def poll(self) -> KVPoll:
+        if self.conclude_state is not None:
+            return self.conclude_state
+        if not self.started_transfer:
+            return KVPoll.WaitingForInput  # type: ignore
+
+        self.kv_mgr.update_transfer_status()
+        if self.kv_mgr.check_transfer_done(self.bootstrap_room):  # type: ignore
+            self.conclude_state = KVPoll.Success
+            del self.kv_mgr.transfer_statuses[self.bootstrap_room]
+            return KVPoll.Success  # type: ignore
+        return KVPoll.WaitingForInput  # type: ignore
+
+    def _register_kv_args(self):
+        for bootstrap_info in self.bootstrap_infos:
+            sock, lock = self._connect_to_bootstrap_server(bootstrap_info)
+            packed_kv_data_ptrs = b"".join(
+                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.kv_data_ptrs
+            )
+            packed_aux_data_ptrs = b"".join(
+                struct.pack("Q", ptr) for ptr in self.kv_mgr.kv_args.aux_data_ptrs
+            )
+
+            with lock:
+                sock.send_multipart(
+                    [
+                        GUARD,
+                        "None".encode("ascii"),
+                        self.kv_mgr.local_ip.encode("ascii"),
+                        str(self.kv_mgr.rank_port).encode("ascii"),
+                        self.kv_mgr.agent.name.encode("ascii"),
+                        self.kv_mgr.agent.get_agent_metadata(),
+                        packed_kv_data_ptrs,
+                        packed_aux_data_ptrs,
+                        str(self.kv_mgr.kv_args.gpu_id).encode("ascii"),
+                        str(self.kv_mgr.kv_args.decode_tp_size).encode("ascii"),
+                        str(self.kv_mgr.kv_args.engine_rank).encode("ascii"),
+                        str(self.kv_mgr.kv_args.kv_item_lens[0]).encode("ascii"),
+                    ]
+                )
+
+    def failure_exception(self):
+        raise Exception("Fake KVReceiver Exception")
+
+
+class NixlKVBootstrapServer(CommonKVBootstrapServer):
+    pass
diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
new file mode 100644
index 000000000..b70748250
--- /dev/null
+++ b/python/sglang/srt/disaggregation/prefill.py
@@ -0,0 +1,867 @@
+"""
+Life cycle of a request in the prefill server
+
+1. Bootstrap Queue
+    a. Initialize a sender for each request
+    b. Use the queue to store requests whose bootstrap (handshake and preallocation) has not finished
+    c. Poll senders to check bootstrap state
+    d. Once bootstrap is complete, move request to Waiting Queue
+
+2. Waiting Queue
+    a. Use PrefillAdder to pop requests
+    b. Run forward
+    c. Add the request to Inflight Queue
+
+3. Inflight Queue
+    a. Poll (non-blocking) the sender of the request
+    b. Once the transfer has finished, return the request
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+from collections import deque
+from http import HTTPStatus
+from typing import TYPE_CHECKING, List, Optional, Type
+
+import torch
+
+from sglang.srt.disaggregation.base import BaseKVManager, KVPoll
+from sglang.srt.disaggregation.utils import (
+    FAKE_BOOTSTRAP_HOST,
+    DisaggregationMode,
+    KVClassType,
+    MetadataBuffers,
+    ReqToMetadataIdxAllocator,
+    TransferBackend,
+    get_kv_class,
+    is_mla_backend,
+    kv_to_page_indices,
+    kv_to_page_num,
+    poll_and_all_reduce,
+    prepare_abort,
+)
+from sglang.srt.managers.schedule_batch import FINISH_LENGTH, Req, ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
+from sglang.srt.utils import (
+    DynamicGradMode,
+    broadcast_pyobj,
+    point_to_point_pyobj,
+    require_mlp_sync,
+)
+
+if TYPE_CHECKING:
+    from torch.distributed import ProcessGroup
+
+    from sglang.srt.managers.scheduler import GenerationBatchResult, Scheduler
+    from sglang.srt.mem_cache.memory_pool import KVCache
+
+logger = logging.getLogger(__name__)
+
+
+class PrefillBootstrapQueue:
+    """
+    Store the requests in bootstrapping
+    """
+
+    def __init__(
+        self,
+        token_to_kv_pool: KVCache,
+        draft_token_to_kv_pool: Optional[KVCache],
+        req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
+        metadata_buffers: MetadataBuffers,
+        tp_rank: int,
+        tp_size: int,
+        gpu_id: int,
+        bootstrap_port: int,
+        gloo_group: ProcessGroup,
+        max_total_num_tokens: int,
+        decode_tp_size: int,
+        decode_dp_size: int,
+        scheduler: Scheduler,
+        pp_rank: int,
+        pp_size: int,
+        transfer_backend: TransferBackend,
+    ):
+        self.token_to_kv_pool = token_to_kv_pool
+        self.draft_token_to_kv_pool = draft_token_to_kv_pool
+        self.is_mla_backend = is_mla_backend(token_to_kv_pool)
+        self.metadata_buffers = metadata_buffers
+        self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
+        self.tp_rank = tp_rank
+        self.tp_size = tp_size
+        self.decode_tp_size = decode_tp_size
+        self.decode_dp_size = decode_dp_size
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
+        self.gpu_id = gpu_id
+        self.bootstrap_port = bootstrap_port
+        self.queue: List[Req] = []
+        self.gloo_group = gloo_group
+        self.max_total_num_tokens = max_total_num_tokens
+        self.scheduler = scheduler
+        self.transfer_backend = transfer_backend
+        self.kv_manager = self._init_kv_manager()
+
+    def _init_kv_manager(self) -> BaseKVManager:
+        kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
+        kv_args = kv_args_class()
+        kv_args.engine_rank = self.tp_rank
+        kv_args.pp_rank = self.pp_rank
+        kv_args.system_dp_rank = self.scheduler.dp_rank
+        kv_args.decode_tp_size = self.decode_tp_size // self.decode_dp_size
+        kv_args.prefill_pp_size = self.pp_size
+        kv_args.prefill_start_layer = self.token_to_kv_pool.start_layer
+        kv_data_ptrs, kv_data_lens, kv_item_lens = (
+            self.token_to_kv_pool.get_contiguous_buf_infos()
+        )
+
+        if self.draft_token_to_kv_pool is not None:
+            # We should also transfer draft model kv cache. The indices are
+            # always shared with a target model.
+            draft_kv_data_ptrs, draft_kv_data_lens, draft_kv_item_lens = (
+                self.draft_token_to_kv_pool.get_contiguous_buf_infos()
+            )
+            kv_data_ptrs += draft_kv_data_ptrs
+            kv_data_lens += draft_kv_data_lens
+            kv_item_lens += draft_kv_item_lens
+
+        kv_args.kv_data_ptrs = kv_data_ptrs
+        kv_args.kv_data_lens = kv_data_lens
+        kv_args.kv_item_lens = kv_item_lens
+        if not self.is_mla_backend:
+            kv_args.kv_head_num = self.token_to_kv_pool.head_num
+        kv_args.page_size = self.token_to_kv_pool.page_size
+
+        kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
+            self.metadata_buffers.get_buf_infos()
+        )
+        kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
+        kv_args.gpu_id = self.scheduler.gpu_id
+
+        kv_manager_class: Type[BaseKVManager] = get_kv_class(
+            self.transfer_backend, KVClassType.MANAGER
+        )
+        kv_manager: BaseKVManager = kv_manager_class(
+            kv_args,
+            DisaggregationMode.PREFILL,
+            self.scheduler.server_args,
+            self.is_mla_backend,
+        )
+        return kv_manager
+
+    def add(self, req: Req, num_kv_heads: int) -> None:
+        if self._check_if_req_exceed_kv_capacity(req):
+            return
+
+        if req.bootstrap_host == FAKE_BOOTSTRAP_HOST:
+            kv_sender_class = get_kv_class(TransferBackend.FAKE, KVClassType.SENDER)
+        else:
+            kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
+
+        dest_tp_ranks = [self.tp_rank]
+
+        req.disagg_kv_sender = kv_sender_class(
+            mgr=self.kv_manager,
+            bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
+            bootstrap_room=req.bootstrap_room,
+            dest_tp_ranks=dest_tp_ranks,
+            pp_rank=self.pp_rank,
+        )
+        self._process_req(req)
+        self.queue.append(req)
+
+    def extend(self, reqs: List[Req], num_kv_heads: int) -> None:
+        for req in reqs:
+            self.add(req, num_kv_heads)
+
+    def _check_if_req_exceed_kv_capacity(self, req: Req) -> bool:
+        if len(req.origin_input_ids) > self.max_total_num_tokens:
+            message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
+            logger.error(message)
+            prepare_abort(req, message, status_code=HTTPStatus.BAD_REQUEST)
+            self.scheduler.stream_output([req], req.return_logprob)
+            return True
+        return False
+
+    def _process_req(self, req: Req) -> None:
+        """
+        Set max_new_tokens = 1, so PrefillAdder memory estimation is accurate
+        """
+        req.sampling_params.max_new_tokens = 1
+
+    def pop_bootstrapped(
+        self,
+        return_failed_reqs: bool = False,
+        rids_to_check: Optional[List[str]] = None,
+    ) -> List[Req]:
+        """
+        pop the reqs which has finished bootstrapping
+
+        return_failed_reqs: For PP, on rank 0, also return the failed reqs to notify the next rank
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
+        """
+
+        bootstrapped_reqs = []
+        failed_reqs = []
+        indices_to_remove = set()
+
+        if len(self.queue) == 0:
+            if return_failed_reqs is False:
+                return []
+            else:
+                return [], []
+
+        polls = poll_and_all_reduce(
+            [req.disagg_kv_sender for req in self.queue], self.gloo_group
+        )
+
+        for i, (req, poll) in enumerate(zip(self.queue, polls)):
+            if rids_to_check is not None:
+                # if req not in reqs_info_to_check, skip
+                if req.rid not in rids_to_check:
+                    continue
+                # Either waiting for input or failed
+                assert poll == KVPoll.WaitingForInput or poll == KVPoll.Failed
+
+            if poll == KVPoll.Bootstrapping:
+                continue
+            elif poll == KVPoll.Failed:
+                error_message = f"Prefill bootstrap failed for request rank={self.tp_rank} {req.rid=} {req.bootstrap_room=}"
+                try:
+                    req.disagg_kv_sender.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.error(error_message)
+                prepare_abort(
+                    req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
+                )
+                self.scheduler.stream_output([req], req.return_logprob)
+                indices_to_remove.add(i)
+                failed_reqs.append(req)
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
+                continue
+
+            # KV.WaitingForInput - init here
+            num_kv_indices = len(req.origin_input_ids)
+            if self.req_to_metadata_buffer_idx_allocator.available_size() == 0:
+                break
+
+            req.metadata_buffer_index = (
+                self.req_to_metadata_buffer_idx_allocator.alloc()
+            )
+            assert req.metadata_buffer_index is not None
+
+            num_pages = kv_to_page_num(num_kv_indices, self.token_to_kv_pool.page_size)
+            req.disagg_kv_sender.init(num_pages, req.metadata_buffer_index)
+            bootstrapped_reqs.append(req)
+            indices_to_remove.add(i)
+
+        self.queue = [
+            entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
+        ]
+
+        if return_failed_reqs is False:
+            return bootstrapped_reqs
+        else:
+            return bootstrapped_reqs, failed_reqs
+
+
+class SchedulerDisaggregationPrefillMixin:
+    """
+    Mixin for Scheduler to handle disaggregation prefill
+    """
+
+    @torch.no_grad()
+    def event_loop_normal_disagg_prefill(self: Scheduler) -> None:
+        """A normal scheduler loop for prefill worker in disaggregation mode."""
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            self.waiting_queue.extend(
+                self.disagg_prefill_bootstrap_queue.pop_bootstrapped()
+            )
+            self.process_prefill_chunk()
+            batch = self.get_new_batch_prefill()
+
+            if require_mlp_sync(self.server_args):
+                batch = self.prepare_mlp_sync_batch(batch)
+            self.cur_batch = batch
+
+            if batch:
+                result = self.run_batch(batch)
+                self.process_batch_result_disagg_prefill(batch, result)
+
+            if len(self.disagg_prefill_inflight_queue) > 0:
+                self.process_disagg_prefill_inflight_queue()
+
+            if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
+                self.self_check_during_idle()
+
+            self.last_batch = batch
+            # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
+            # Otherwise, it hangs under high concurrency
+            self.running_batch.batch_is_full = False
+
+    @torch.no_grad()
+    def event_loop_overlap_disagg_prefill(self: Scheduler) -> None:
+        self.result_queue = deque()
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            self.waiting_queue.extend(
+                self.disagg_prefill_bootstrap_queue.pop_bootstrapped()
+            )
+            self.process_prefill_chunk()
+            batch = self.get_new_batch_prefill()
+
+            if require_mlp_sync(self.server_args):
+                batch = self.prepare_mlp_sync_batch(batch)
+            self.cur_batch = batch
+            if batch:
+                result = self.run_batch(batch)
+                self.result_queue.append((batch.copy(), result))
+
+                if self.last_batch is None:
+                    # Create a dummy first batch to start the pipeline for overlap schedule.
+                    # It is now used for triggering the sampling_info_done event.
+                    tmp_batch = ScheduleBatch(
+                        reqs=None,
+                        forward_mode=ForwardMode.DUMMY_FIRST,
+                        next_batch_sampling_info=self.tp_worker.cur_sampling_info,
+                    )
+                    self.set_next_batch_sampling_info_done(tmp_batch)
+
+            if self.last_batch:
+                tmp_batch, tmp_result = self.result_queue.popleft()
+                tmp_batch.next_batch_sampling_info = (
+                    self.tp_worker.cur_sampling_info if batch else None
+                )
+                self.process_batch_result_disagg_prefill(tmp_batch, tmp_result)
+
+            if len(self.disagg_prefill_inflight_queue) > 0:
+                self.process_disagg_prefill_inflight_queue()
+
+            if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
+                self.self_check_during_idle()
+
+            self.last_batch = batch
+            # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
+            # Otherwise, it hangs under high concurrency
+            self.running_batch.batch_is_full = False
+
+    def process_batch_result_disagg_prefill(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: GenerationBatchResult,
+        launch_done: Optional[threading.Event] = None,
+    ) -> None:
+        """
+        Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
+        Adapted from process_batch_result_prefill
+        """
+        (
+            logits_output,
+            next_token_ids,
+            extend_input_len_per_req,
+            extend_logprob_start_len_per_req,
+        ) = (
+            result.logits_output,
+            result.next_token_ids,
+            result.extend_input_len_per_req,
+            result.extend_logprob_start_len_per_req,
+        )
+
+        logprob_pt = 0
+        # Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
+        if self.enable_overlap:
+            # wait
+            logits_output, next_token_ids, _ = self.tp_worker.resolve_last_batch_result(
+                launch_done
+            )
+        else:
+            next_token_ids = result.next_token_ids.tolist()
+            if batch.return_logprob:
+                if logits_output.next_token_logprobs is not None:
+                    logits_output.next_token_logprobs = (
+                        logits_output.next_token_logprobs.tolist()
+                    )
+                if logits_output.input_token_logprobs is not None:
+                    logits_output.input_token_logprobs = tuple(
+                        logits_output.input_token_logprobs.tolist()
+                    )
+
+        hidden_state_offset = 0
+        for i, (req, next_token_id) in enumerate(
+            zip(batch.reqs, next_token_ids, strict=True)
+        ):
+            req: Req
+            if req.is_chunked <= 0:
+                # There is no output_ids for prefill
+                req.output_ids.append(next_token_id)
+                self.tree_cache.cache_unfinished_req(req)  # update the tree and lock
+                self.disagg_prefill_inflight_queue.append(req)
+                if (
+                    logits_output is not None
+                    and logits_output.hidden_states is not None
+                ):
+                    last_hidden_index = (
+                        hidden_state_offset + extend_input_len_per_req[i] - 1
+                    )
+                    req.hidden_states_tensor = (
+                        logits_output.hidden_states[last_hidden_index].cpu().clone()
+                    )
+                    hidden_state_offset += extend_input_len_per_req[i]
+                else:
+                    req.hidden_states_tensor = None
+                if req.return_logprob:
+                    assert extend_logprob_start_len_per_req is not None
+                    assert extend_input_len_per_req is not None
+                    extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                    extend_input_len = extend_input_len_per_req[i]
+                    num_input_logprobs = extend_input_len - extend_logprob_start_len
+                    self.add_logprob_return_values(
+                        i,
+                        req,
+                        logprob_pt,
+                        next_token_ids,
+                        num_input_logprobs,
+                        logits_output,
+                    )
+                    logprob_pt += num_input_logprobs
+                self.send_kv_chunk(req, last_chunk=True)
+
+                if req.grammar is not None:
+                    # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                    try:
+                        req.grammar.accept_token(next_token_id)
+                    except ValueError as e:
+                        # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                        # This can happen if the grammar is not set correctly or the token is invalid.
+                        error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
+                        self.tree_cache.cache_finished_req(req)
+                        prepare_abort(
+                            req,
+                            error_message,
+                            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                        )
+                    req.grammar.finished = req.finished()
+            else:
+                # being chunked reqs' prefill is not finished
+                req.is_chunked -= 1
+
+                if req.return_logprob:
+                    extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                    extend_input_len = extend_input_len_per_req[i]
+                    if extend_logprob_start_len < extend_input_len:
+                        # Update input logprobs.
+                        num_input_logprobs = extend_input_len - extend_logprob_start_len
+                        self.add_input_logprob_return_values(
+                            i,
+                            req,
+                            logits_output,
+                            logprob_pt,
+                            num_input_logprobs,
+                            last_prefill_chunk=False,
+                        )
+                        logprob_pt += num_input_logprobs
+
+                if self.enable_overlap:
+                    self.send_kv_chunk(req, last_chunk=False, end_idx=req.tmp_end_idx)
+
+        # We need to remove the sync in the following function for overlap schedule.
+        self.set_next_batch_sampling_info_done(batch)
+        self.maybe_send_health_check_signal()
+
+    def process_disagg_prefill_inflight_queue(
+        self: Scheduler, rids_to_check: Optional[List[str]] = None
+    ) -> List[Req]:
+        """
+        Poll the requests in the middle of transfer. If done, return the request.
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
+        """
+        if len(self.disagg_prefill_inflight_queue) == 0:
+            return []
+
+        done_reqs = []
+
+        polls = poll_and_all_reduce(
+            [req.disagg_kv_sender for req in self.disagg_prefill_inflight_queue],
+            self.attn_tp_cpu_group,
+        )
+
+        undone_reqs: List[Req] = []
+        # Check .poll() for the reqs in disagg_prefill_inflight_queue. If Success, respond to the client and remove it from the queue
+        for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+
+            if rids_to_check is not None:
+                if req.rid not in rids_to_check:
+                    undone_reqs.append(req)
+                    continue
+
+                assert poll == KVPoll.Success or poll == KVPoll.Failed
+
+            if poll in [KVPoll.WaitingForInput, KVPoll.Transferring]:
+                undone_reqs.append(req)
+            elif poll == KVPoll.Success:  # transfer done
+                self.tree_cache.cache_finished_req(req)  # unlock the tree
+                req.finished_reason = FINISH_LENGTH(length=0)
+                # FIXME: clean up req's data in transfer engine
+                if hasattr(req.disagg_kv_sender, "clear"):
+                    req.disagg_kv_sender.clear()
+                done_reqs.append(req)
+            elif poll == KVPoll.Failed:
+                error_message = f"Prefill transfer failed for request rank={self.tp_rank} {req.rid=} {req.bootstrap_room=}"
+                try:
+                    req.disagg_kv_sender.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.warning(error_message)
+                self.tree_cache.cache_finished_req(req)  # unlock the tree
+                prepare_abort(
+                    req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
+                )
+                done_reqs.append(req)
+                if self.enable_metrics:
+                    self.metrics_collector.increment_transfer_failed_reqs()
+            else:
+                assert False, f"Unexpected polling state {poll=}"
+
+        # Stream requests which have finished transfer
+        self.stream_output(
+            done_reqs,
+            any(req.return_logprob for req in done_reqs),
+            None,
+        )
+        for req in done_reqs:
+            req: Req
+            self.req_to_metadata_buffer_idx_allocator.free(req.metadata_buffer_index)
+            req.metadata_buffer_index = -1
+
+        self.disagg_prefill_inflight_queue = undone_reqs
+
+        return done_reqs
+
+    def get_transferred_rids(self: Scheduler) -> List[str]:
+        """
+        Used by PP, get the transferred rids but **do not pop**
+        """
+        polls = poll_and_all_reduce(
+            [req.disagg_kv_sender for req in self.disagg_prefill_inflight_queue],
+            self.tp_worker.get_tp_group().cpu_group,
+        )
+
+        transferred_rids: List[str] = []
+
+        for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+            if poll == KVPoll.Success or poll == KVPoll.Failed:
+                transferred_rids.append(req.rid)
+
+        return transferred_rids
+
+    def process_prefill_chunk(self: Scheduler) -> None:
+        if self.last_batch and self.last_batch.forward_mode.is_extend():
+            if self.chunked_req:
+                # Move the chunked request out of the batch so that we can merge
+                # only finished requests to running_batch.
+                self.last_batch.filter_batch(chunked_req_to_exclude=self.chunked_req)
+                self.tree_cache.cache_unfinished_req(self.chunked_req, chunked=True)
+                if self.enable_overlap:
+                    # Delay KV transfer to process_batch_result_disagg_prefill when overlap is enabled to ensure results are resolved
+                    self.chunked_req.tmp_end_idx = min(
+                        len(self.chunked_req.fill_ids),
+                        len(self.chunked_req.origin_input_ids),
+                    )
+                else:
+                    self.send_kv_chunk(self.chunked_req)
+                # chunked request keeps its rid but will get a new req_pool_idx
+                self.req_to_token_pool.free(self.chunked_req.req_pool_idx)
+                self.running_batch.batch_is_full = False
+
+    def send_kv_chunk(
+        self: Scheduler,
+        req: Req,
+        last_chunk: bool = False,
+        end_idx: Optional[int] = None,
+    ) -> None:
+        """
+        Send a prefilled chunk to the decode server
+        """
+        page_size = self.token_to_kv_pool_allocator.page_size
+        start_idx = req.start_send_idx
+        end_idx = (
+            end_idx
+            if end_idx is not None
+            else min(len(req.fill_ids), len(req.origin_input_ids))
+        )
+
+        if not last_chunk:
+            # if not the last chunk and the last page is partial, delay the last partial page to the next send
+            end_idx = end_idx - end_idx % page_size
+
+        kv_indices = (
+            self.req_to_token_pool.req_to_token[req.req_pool_idx, start_idx:end_idx]
+            .cpu()
+            .numpy()
+        )
+        req.start_send_idx = end_idx
+        if last_chunk:
+            self.disagg_metadata_buffers.set_buf(req)
+        page_indices = kv_to_page_indices(kv_indices, page_size)
+        if len(page_indices) == 0:
+            logger.info(
+                f"Skip sending kv chunk for request {req.rid=} {req.bootstrap_room=} because page_indices is empty"
+            )
+            return
+        req.disagg_kv_sender.send(page_indices)
+
+    # PP
+    @DynamicGradMode()
+    def event_loop_pp_disagg_prefill(self: Scheduler):
+        """
+        An event loop for the prefill server in pipeline parallelism.
+
+        Rules:
+        1. Each stage runs in the same order and is notified by the previous stage.
+        2. Each send/recv operation is blocking and matched by the neighboring stage.
+
+        Regular Schedule:
+        ====================================================================
+        Stage i                   | Stage i+1
+        send ith req              | recv ith req
+        send ith proxy            | recv ith proxy
+        send prev (i+1)th carry   | recv prev (i+1)th carry
+        ====================================================================
+
+        Prefill Server Schedule:
+        ====================================================================
+        Stage i                        | Stage i+1
+        send ith req                   | recv ith req
+        send ith bootstrap req         | recv ith bootstrap req
+        send ith transferred req       | recv ith transferred req
+        send ith proxy                 | recv ith proxy
+        send prev (i+1)th carry        | recv prev (i+1)th carry
+        send prev (i+1)th release req  | recv prev (i+1)th release req
+        ====================================================================
+
+        There are two additional elements compared to the regular schedule:
+
+        1. Bootstrap Requests:
+            a. Instead of polling the status on the current workers, we should wait for the previous stage to notify to avoid desynchronization.
+            b. The first stage polls the status and propagates the bootstrapped requests down to all other stages.
+            c. If the first stage polls successfully, by nature, other ranks are also successful because they performed a handshake together.
+
+        2. Transferred Requests + Release Requests:
+            a. The first stage polls the transfer finished requests, performs an intersection with the next stage's finished requests, and propagates down to the last stage.
+            b. The last stage receives the requests that have finished transfer on all stages (consensus), then sends them to the first stage to release the memory.
+            c. The first stage receives the release requests, releases the memory, and then propagates the release requests down to the last stage.
+        """
+        from sglang.srt.managers.scheduler import GenerationBatchResult
+
+        mbs = [None] * self.pp_size
+        last_mbs = [None] * self.pp_size
+        self.running_mbs = [
+            ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
+        ]
+        bids = [None] * self.pp_size
+        pp_outputs: Optional[PPProxyTensors] = None
+
+        # Either success or failed
+        bootstrapped_rids: List[str] = []
+        transferred_rids: List[str] = []
+        release_rids: Optional[List[str]] = None
+
+        # transferred microbatch
+        tmbs = [None] * self.pp_size
+
+        ENABLE_RELEASE = True  # For debug
+
+        while True:
+            server_is_idle = True
+
+            for mb_id in range(self.pp_size):
+                self.running_batch = self.running_mbs[mb_id]
+                self.last_batch = last_mbs[mb_id]
+
+                recv_reqs = self.recv_requests()
+
+                self.process_input_requests(recv_reqs)
+
+                if self.pp_group.is_first_rank:
+                    # First rank, pop the bootstrap reqs from the bootstrap queue
+                    bootstrapped_reqs, failed_reqs = (
+                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
+                            return_failed_reqs=True
+                        )
+                    )
+                    bootstrapped_rids = [req.rid for req in bootstrapped_reqs] + [
+                        req.rid for req in failed_reqs
+                    ]
+                    self.waiting_queue.extend(bootstrapped_reqs)
+                else:
+                    # Other ranks, receive the bootstrap reqs info from the previous rank and ensure the consensus
+                    bootstrapped_rids = self.recv_pyobj_from_prev_stage()
+                    bootstrapped_reqs = (
+                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
+                            rids_to_check=bootstrapped_rids
+                        )
+                    )
+                    self.waiting_queue.extend(bootstrapped_reqs)
+
+                if self.pp_group.is_first_rank:
+                    transferred_rids = self.get_transferred_rids()
+                # if other ranks,
+                else:
+                    # 1. recv previous stage's transferred reqs info
+                    prev_transferred_rids = self.recv_pyobj_from_prev_stage()
+                    # 2. get the current stage's transferred reqs info
+                    curr_transferred_rids = self.get_transferred_rids()
+                    # 3. new consensus rids = intersection(previous consensus rids, transfer finished rids)
+                    transferred_rids = list(
+                        set(prev_transferred_rids) & set(curr_transferred_rids)
+                    )
+
+                tmbs[mb_id] = transferred_rids
+
+                self.process_prefill_chunk()
+                mbs[mb_id] = self.get_new_batch_prefill()
+                self.running_mbs[mb_id] = self.running_batch
+
+                self.cur_batch = mbs[mb_id]
+                if self.cur_batch:
+                    server_is_idle = False
+                    result = self.run_batch(self.cur_batch)
+
+                # send the outputs to the next step
+                if self.pp_group.is_last_rank:
+                    if self.cur_batch:
+                        next_token_ids, bids[mb_id] = (
+                            result.next_token_ids,
+                            result.bid,
+                        )
+                        pp_outputs = PPProxyTensors(
+                            {
+                                "next_token_ids": next_token_ids,
+                            }
+                        )
+                        # send the output from the last round to let the next stage worker run post processing
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                if ENABLE_RELEASE:
+                    if self.pp_group.is_last_rank:
+                        # At the last stage, all stages has reached the consensus to release memory for transferred_rids
+                        release_rids = transferred_rids
+                        # send to the first rank
+                        self.send_pyobj_to_next_stage(release_rids)
+
+                # receive outputs and post-process (filter finished reqs) the coming microbatch
+                next_mb_id = (mb_id + 1) % self.pp_size
+                next_pp_outputs = None
+                next_release_rids = None
+
+                if mbs[next_mb_id] is not None:
+                    next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
+                        self.pp_group.recv_tensor_dict(
+                            all_gather_group=self.attn_tp_group
+                        )
+                    )
+                    mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
+                    output_result = GenerationBatchResult(
+                        logits_output=None,
+                        pp_hidden_states_proxy_tensors=None,
+                        next_token_ids=next_pp_outputs["next_token_ids"],
+                        extend_input_len_per_req=None,
+                        extend_logprob_start_len_per_req=None,
+                        bid=bids[next_mb_id],
+                        can_run_cuda_graph=result.can_run_cuda_graph,
+                    )
+                    self.process_batch_result_disagg_prefill(
+                        mbs[next_mb_id], output_result
+                    )
+
+                    last_mbs[next_mb_id] = mbs[next_mb_id]
+
+                if ENABLE_RELEASE:
+                    if tmbs[next_mb_id] is not None:
+                        # recv consensus rids from the previous rank
+                        next_release_rids = self.recv_pyobj_from_prev_stage()
+                        self.process_disagg_prefill_inflight_queue(next_release_rids)
+
+                # carry the outputs to the next stage
+                if not self.pp_group.is_last_rank:
+                    if self.cur_batch:
+                        bids[mb_id] = result.bid
+                    if pp_outputs:
+                        # send the outputs from the last round to let the next stage worker run post processing
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+                    if ENABLE_RELEASE:
+                        if release_rids is not None:
+                            self.send_pyobj_to_next_stage(release_rids)
+
+                if not self.pp_group.is_last_rank:
+                    # send out reqs to the next stage
+                    self.send_pyobj_to_next_stage(recv_reqs)
+                    self.send_pyobj_to_next_stage(bootstrapped_rids)
+                    self.send_pyobj_to_next_stage(transferred_rids)
+
+                    # send out proxy tensors to the next stage
+                    if self.cur_batch:
+                        self.pp_group.send_tensor_dict(
+                            result.pp_hidden_states_proxy_tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                pp_outputs = next_pp_outputs
+                release_rids = next_release_rids
+
+                self.running_batch.batch_is_full = False
+
+            if not ENABLE_RELEASE:
+                if len(self.disagg_prefill_inflight_queue) > 0:
+                    self.process_disagg_prefill_inflight_queue()
+
+            # When the server is idle, self-check and re-init some states
+            if server_is_idle and len(self.disagg_prefill_inflight_queue) == 0:
+                self.check_memory()
+                self.check_tree_cache()
+                self.new_token_ratio = self.init_new_token_ratio
+
+    def send_pyobj_to_next_stage(self, data):
+        if self.attn_tp_rank == 0:
+            dp_offset = self.attn_dp_rank * self.attn_tp_size
+            point_to_point_pyobj(
+                data,
+                self.pp_rank * self.tp_size + dp_offset,
+                self.world_group.device_group,
+                self.pp_rank * self.tp_size + dp_offset,
+                ((self.pp_rank + 1) % self.pp_size) * self.tp_size + dp_offset,
+            )
+
+    def recv_pyobj_from_prev_stage(self):
+        if self.attn_tp_rank == 0:
+            dp_offset = self.attn_dp_rank * self.attn_tp_size
+            data = point_to_point_pyobj(
+                [],
+                self.pp_rank * self.tp_size + dp_offset,
+                self.world_group.device_group,
+                ((self.pp_rank - 1) % self.pp_size) * self.tp_size + dp_offset,
+                self.pp_rank * self.tp_size + dp_offset,
+            )
+        else:
+            data = None
+
+        if self.tp_size != 1:
+            data = broadcast_pyobj(
+                data, self.tp_group.rank, self.tp_cpu_group, src=self.tp_group.ranks[0]
+            )
+        return data
diff --git a/python/sglang/srt/disaggregation/utils.py b/python/sglang/srt/disaggregation/utils.py
new file mode 100644
index 000000000..43770e3e2
--- /dev/null
+++ b/python/sglang/srt/disaggregation/utils.py
@@ -0,0 +1,329 @@
+from __future__ import annotations
+
+import os
+import random
+from collections import deque
+from contextlib import nullcontext
+from enum import Enum
+from typing import TYPE_CHECKING, List, Optional, Type, Union
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from sglang.srt.utils import is_npu
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+#########################
+# Constants & Enums
+#########################
+FAKE_BOOTSTRAP_HOST = "2.2.2.2"
+
+
+class DisaggregationMode(Enum):
+    NULL = "null"
+    PREFILL = "prefill"
+    DECODE = "decode"
+
+
+#########################
+# Synchronization
+#########################
+
+# env var for testing failure, convert to float explicitly
+FAILURE_PROB = float(os.getenv("DISAGGREGATION_TEST_FAILURE_PROB", 0))
+
+
+def poll_and_all_reduce(pollers, gloo_group):
+    # at a certain prob, the poll is failed to simulate failure
+    if FAILURE_PROB > 0:
+        from sglang.srt.disaggregation.base import KVPoll
+
+        polls = [
+            int(KVPoll.Failed) if random.random() < FAILURE_PROB else int(poller.poll())
+            for poller in pollers
+        ]
+    else:
+        polls = [int(poller.poll()) for poller in pollers]
+    tensor_to_reduce = torch.tensor(polls, dtype=torch.uint8, device="cpu")
+    dist.all_reduce(tensor_to_reduce, op=dist.ReduceOp.MIN, group=gloo_group)
+    return tensor_to_reduce.tolist()
+
+
+#########################
+# Metadata Buffers
+#########################
+
+
+class ReqToMetadataIdxAllocator:
+    """A memory pool that maps a request to its first output token location."""
+
+    def __init__(
+        self,
+        size: int,
+    ):
+        self.size = size
+        self.free_slots = deque(list(range(size)))
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def alloc(self) -> Optional[int]:
+        if len(self.free_slots) == 0:
+            return None
+
+        return self.free_slots.popleft()
+
+    def free(self, free_index: int):
+        self.free_slots.append(free_index)
+
+
+class MetadataBuffers:
+    def __init__(
+        self,
+        size: int,
+        hidden_size: int,
+        dtype: torch.dtype,
+        max_top_logprobs_num: int = 128,
+        custom_mem_pool: torch.cuda.MemPool = None,
+    ):
+        self.custom_mem_pool = custom_mem_pool
+        device = "cpu"
+        if is_npu():
+            # For ascend backend, output tokens are placed in the NPU and will be transferred by D2D channel.
+            device = "npu"
+        elif self.custom_mem_pool:
+            # TODO(shangming): Fix me (use 'cuda') when nvlink_transport of Mooncake is bug-free
+            device = "cpu"
+        with (
+            torch.cuda.use_mem_pool(self.custom_mem_pool)
+            if self.custom_mem_pool
+            else nullcontext()
+        ):
+            # TODO: abort top_logprobs_num > 128 in PD
+
+            # We transfer the metadata of first output token to decode
+            # The minimal size for RDMA is 64Bytes, so we pad it to > 64Bytes
+            self.output_ids = torch.zeros((size, 16), dtype=torch.int32, device=device)
+
+            self.output_token_logprobs_val = torch.zeros(
+                (size, 16), dtype=torch.float32, device=device
+            )
+            self.output_token_logprobs_idx = torch.zeros(
+                (size, 16), dtype=torch.int32, device=device
+            )
+            self.output_top_logprobs_val = torch.zeros(
+                (size, max_top_logprobs_num), dtype=torch.float32, device=device
+            )
+            self.output_top_logprobs_idx = torch.zeros(
+                (size, max_top_logprobs_num), dtype=torch.int32, device=device
+            )
+            self.output_hidden_states = torch.zeros(
+                (size, hidden_size), dtype=dtype, device=device
+            )
+
+    def get_buf_infos(self):
+        ptrs = [
+            self.output_ids.data_ptr(),
+            self.output_token_logprobs_val.data_ptr(),
+            self.output_token_logprobs_idx.data_ptr(),
+            self.output_top_logprobs_val.data_ptr(),
+            self.output_top_logprobs_idx.data_ptr(),
+            self.output_hidden_states.data_ptr(),
+        ]
+        data_lens = [
+            self.output_ids.nbytes,
+            self.output_token_logprobs_val.nbytes,
+            self.output_token_logprobs_idx.nbytes,
+            self.output_top_logprobs_val.nbytes,
+            self.output_top_logprobs_idx.nbytes,
+            self.output_hidden_states.nbytes,
+        ]
+        item_lens = [
+            self.output_ids[0].nbytes,
+            self.output_token_logprobs_val[0].nbytes,
+            self.output_token_logprobs_idx[0].nbytes,
+            self.output_top_logprobs_val[0].nbytes,
+            self.output_top_logprobs_idx[0].nbytes,
+            self.output_hidden_states[0].nbytes,
+        ]
+        return ptrs, data_lens, item_lens
+
+    def get_buf(self, idx: int):
+        return (
+            self.output_ids[idx],
+            self.output_token_logprobs_val[idx],
+            self.output_token_logprobs_idx[idx],
+            self.output_top_logprobs_val[idx],
+            self.output_top_logprobs_idx[idx],
+            self.output_hidden_states[idx],
+        )
+
+    def set_buf(self, req: Req):
+
+        self.output_ids[req.metadata_buffer_index][0] = req.output_ids[0]
+        if req.return_logprob:
+            if req.output_token_logprobs_val:  # not none or empty list
+                self.output_token_logprobs_val[req.metadata_buffer_index][0] = (
+                    req.output_token_logprobs_val[0]
+                )
+            if req.output_token_logprobs_idx:  # not none or empty list
+                self.output_token_logprobs_idx[req.metadata_buffer_index][0] = (
+                    req.output_token_logprobs_idx[0]
+                )
+
+            if req.output_top_logprobs_val:  # not none or empty list
+                self.output_top_logprobs_val[req.metadata_buffer_index][
+                    : len(req.output_top_logprobs_val[0])
+                ] = torch.tensor(
+                    req.output_top_logprobs_val[0], dtype=torch.float32, device="cpu"
+                )
+            if req.output_top_logprobs_idx:  # not none or empty list
+                self.output_top_logprobs_idx[req.metadata_buffer_index][
+                    : len(req.output_top_logprobs_idx[0])
+                ] = torch.tensor(
+                    req.output_top_logprobs_idx[0], dtype=torch.int32, device="cpu"
+                )
+        # for PD + spec decode
+        if req.hidden_states_tensor is not None:
+            self.output_hidden_states[req.metadata_buffer_index].copy_(
+                req.hidden_states_tensor
+            )
+
+
+#########################
+# Transfer Backend
+#########################
+
+
+class TransferBackend(Enum):
+    MOONCAKE = "mooncake"
+    NIXL = "nixl"
+    ASCEND = "ascend"
+    FAKE = "fake"
+
+
+class KVClassType(Enum):
+    KVARGS = "kvargs"
+    MANAGER = "manager"
+    SENDER = "sender"
+    RECEIVER = "receiver"
+    BOOTSTRAP_SERVER = "bootstrap_server"
+
+
+def get_kv_class(
+    transfer_backend: TransferBackend, class_type: KVClassType
+) -> Optional[Type]:
+    from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
+
+    if transfer_backend == TransferBackend.MOONCAKE:
+        from sglang.srt.disaggregation.base import KVArgs
+        from sglang.srt.disaggregation.mooncake import (
+            MooncakeKVBootstrapServer,
+            MooncakeKVManager,
+            MooncakeKVReceiver,
+            MooncakeKVSender,
+        )
+
+        class_mapping = {
+            KVClassType.KVARGS: KVArgs,
+            KVClassType.MANAGER: MooncakeKVManager,
+            KVClassType.SENDER: MooncakeKVSender,
+            KVClassType.RECEIVER: (MooncakeKVReceiver),
+            KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
+        }
+        return class_mapping.get(class_type)
+    elif transfer_backend == TransferBackend.ASCEND:
+        from sglang.srt.disaggregation.ascend import (
+            AscendKVBootstrapServer,
+            AscendKVManager,
+            AscendKVReceiver,
+            AscendKVSender,
+        )
+        from sglang.srt.disaggregation.base import KVArgs
+
+        class_mapping = {
+            KVClassType.KVARGS: KVArgs,
+            KVClassType.MANAGER: AscendKVManager,
+            KVClassType.SENDER: AscendKVSender,
+            KVClassType.RECEIVER: (AscendKVReceiver),
+            KVClassType.BOOTSTRAP_SERVER: AscendKVBootstrapServer,
+        }
+        return class_mapping.get(class_type)
+    elif transfer_backend == TransferBackend.NIXL:
+        from sglang.srt.disaggregation.base import KVArgs
+        from sglang.srt.disaggregation.nixl import (
+            NixlKVBootstrapServer,
+            NixlKVManager,
+            NixlKVReceiver,
+            NixlKVSender,
+        )
+
+        class_mapping = {
+            KVClassType.KVARGS: KVArgs,
+            KVClassType.MANAGER: NixlKVManager,
+            KVClassType.SENDER: NixlKVSender,
+            KVClassType.RECEIVER: (NixlKVReceiver),
+            KVClassType.BOOTSTRAP_SERVER: NixlKVBootstrapServer,
+        }
+        return class_mapping.get(class_type)
+    elif transfer_backend == TransferBackend.FAKE:
+        from sglang.srt.disaggregation.base import KVArgs
+        from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
+
+        class_mapping = {
+            KVClassType.KVARGS: KVArgs,
+            KVClassType.SENDER: FakeKVSender,
+            KVClassType.RECEIVER: (FakeKVReceiver),
+        }
+        return class_mapping.get(class_type)
+
+    raise ValueError(f"Unsupported transfer backend: {transfer_backend}")
+
+
+#########################
+# KV Pages
+#########################
+
+
+def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
+    # 1. The page is guaranteed to be full except the last page.
+    # 2. page index = kv_index // page_size
+    # The return vector is kv_indices[::page_size] // page_size
+    if page_size == 1:  # shortcut
+        return kv_indices
+
+    return kv_indices[::page_size] // page_size
+
+
+def kv_to_page_num(num_kv_indices: int, page_size: int):
+    # ceil(num_kv_indices / page_size)
+    return (num_kv_indices + page_size - 1) // page_size
+
+
+#########################
+# Misc
+#########################
+
+
+def is_mla_backend(target_kv_pool) -> bool:
+    from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
+
+    return isinstance(target_kv_pool, MLATokenToKVPool)
+
+
+def prepare_abort(req: Req, error_message: str, status_code=None):
+    from sglang.srt.managers.schedule_batch import FINISH_ABORT
+
+    # populate finish metadata and stream output
+    req.finished_reason = FINISH_ABORT(error_message, status_code)
+
+    if req.return_logprob:
+        req.input_token_logprobs_val = []
+        req.input_token_logprobs_idx = []
+        req.input_top_logprobs_val = []
+        req.input_top_logprobs_idx = []
+        req.input_token_ids_logprobs_val = []
+        req.input_token_ids_logprobs_idx = []
diff --git a/python/sglang/srt/distributed/__init__.py b/python/sglang/srt/distributed/__init__.py
new file mode 100644
index 000000000..12f802055
--- /dev/null
+++ b/python/sglang/srt/distributed/__init__.py
@@ -0,0 +1,3 @@
+from sglang.srt.distributed.communication_op import *
+from sglang.srt.distributed.parallel_state import *
+from sglang.srt.distributed.utils import *
diff --git a/python/sglang/srt/distributed/communication_op.py b/python/sglang/srt/distributed/communication_op.py
new file mode 100644
index 000000000..95600edfb
--- /dev/null
+++ b/python/sglang/srt/distributed/communication_op.py
@@ -0,0 +1,35 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/communication_op.py
+
+from typing import Any, Dict, Optional, Union
+
+import torch
+import torch.distributed
+
+from .parallel_state import get_tp_group
+
+
+def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
+    """All-reduce the input tensor across model parallel group."""
+    return get_tp_group().all_reduce(input_)
+
+
+def tensor_model_parallel_all_gather(
+    input_: torch.Tensor, dim: int = -1
+) -> torch.Tensor:
+    """All-gather the input tensor across model parallel group."""
+    return get_tp_group().all_gather(input_, dim)
+
+
+def tensor_model_parallel_gather(
+    input_: torch.Tensor, dst: int = 0, dim: int = -1
+) -> Optional[torch.Tensor]:
+    """Gather the input tensor across model parallel group."""
+    return get_tp_group().gather(input_, dst, dim)
+
+
+def broadcast_tensor_dict(
+    tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
+):
+    if not torch.distributed.is_initialized():
+        return tensor_dict
+    return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
diff --git a/python/sglang/srt/distributed/device_communicators/cuda_wrapper.py b/python/sglang/srt/distributed/device_communicators/cuda_wrapper.py
new file mode 100644
index 000000000..c902f3141
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/cuda_wrapper.py
@@ -0,0 +1,183 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/cuda_wrapper.py
+
+"""This file is a pure Python wrapper for the cudart library.
+It avoids the need to compile a separate shared library, and is
+convenient for use when we just need to call a few functions.
+"""
+
+import ctypes
+import logging
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+# this line makes it possible to directly load `libcudart.so` using `ctypes`
+import torch  # noqa
+
+logger = logging.getLogger(__name__)
+
+# === export types and functions from cudart to Python ===
+# for the original cudart definition, please check
+# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html
+
+cudaError_t = ctypes.c_int
+cudaMemcpyKind = ctypes.c_int
+
+
+class cudaIpcMemHandle_t(ctypes.Structure):
+    _fields_ = [("internal", ctypes.c_byte * 128)]
+
+
+@dataclass
+class Function:
+    name: str
+    restype: Any
+    argtypes: List[Any]
+
+
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of the
+    a loaded library.
+    """  # noqa
+    found = False
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found = True
+                break
+    if not found:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = line.index("/")
+    path = line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(
+        lib_name
+    ), f"Unexpected filename: {filename} for library {lib_name}"
+    return path
+
+
+class CudaRTLibrary:
+    exported_functions = [
+        # ​cudaError_t cudaSetDevice ( int  device )
+        Function("cudaSetDevice", cudaError_t, [ctypes.c_int]),
+        # cudaError_t 	cudaDeviceSynchronize ( void )
+        Function("cudaDeviceSynchronize", cudaError_t, []),
+        # ​cudaError_t cudaDeviceReset ( void )
+        Function("cudaDeviceReset", cudaError_t, []),
+        # const char* 	cudaGetErrorString ( cudaError_t error )
+        Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),
+        # ​cudaError_t 	cudaMalloc ( void** devPtr, size_t size )
+        Function(
+            "cudaMalloc",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t],
+        ),
+        # ​cudaError_t 	cudaFree ( void* devPtr )
+        Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
+        # ​cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
+        Function(
+            "cudaMemset", cudaError_t, [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
+        ),
+        # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
+        Function(
+            "cudaMemcpy",
+            cudaError_t,
+            [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind],
+        ),
+        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
+        Function(
+            "cudaIpcGetMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p],
+        ),
+        # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int  flags ) # noqa
+        Function(
+            "cudaIpcOpenMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint],
+        ),
+    ]
+
+    # class attribute to store the mapping from the path to the library
+    # to avoid loading the same library multiple times
+    path_to_library_cache: Dict[str, Any] = {}
+
+    # class attribute to store the mapping from library path
+    #  to the corresponding dictionary
+    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
+
+    def __init__(self, so_file: Optional[str] = None):
+        if so_file is None:
+            so_file = find_loaded_library("libcudart")
+            assert so_file is not None, "libcudart is not loaded in the current process"
+        if so_file not in CudaRTLibrary.path_to_library_cache:
+            lib = ctypes.CDLL(so_file)
+            CudaRTLibrary.path_to_library_cache[so_file] = lib
+        self.lib = CudaRTLibrary.path_to_library_cache[so_file]
+
+        if so_file not in CudaRTLibrary.path_to_dict_mapping:
+            _funcs = {}
+            for func in CudaRTLibrary.exported_functions:
+                f = getattr(self.lib, func.name)
+                f.restype = func.restype
+                f.argtypes = func.argtypes
+                _funcs[func.name] = f
+            CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs
+        self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file]
+
+    def CUDART_CHECK(self, result: cudaError_t) -> None:
+        if result != 0:
+            error_str = self.cudaGetErrorString(result)
+            raise RuntimeError(f"CUDART error: {error_str}")
+
+    def cudaGetErrorString(self, error: cudaError_t) -> str:
+        return self.funcs["cudaGetErrorString"](error).decode("utf-8")
+
+    def cudaSetDevice(self, device: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaSetDevice"](device))
+
+    def cudaDeviceSynchronize(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]())
+
+    def cudaDeviceReset(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceReset"]())
+
+    def cudaMalloc(self, size: int) -> ctypes.c_void_p:
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size))
+        return devPtr
+
+    def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
+        self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))
+
+    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, count: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))
+
+    def cudaMemcpy(
+        self, dst: ctypes.c_void_p, src: ctypes.c_void_p, count: int
+    ) -> None:
+        cudaMemcpyDefault = 4
+        kind = cudaMemcpyDefault
+        self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))
+
+    def cudaIpcGetMemHandle(self, devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
+        handle = cudaIpcMemHandle_t()
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcGetMemHandle"](ctypes.byref(handle), devPtr)
+        )
+        return handle
+
+    def cudaIpcOpenMemHandle(self, handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
+        cudaIpcMemLazyEnablePeerAccess = 1
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcOpenMemHandle"](
+                ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess
+            )
+        )
+        return devPtr
diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
new file mode 100644
index 000000000..0a506d35f
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
@@ -0,0 +1,421 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/custom_all_reduce.py
+
+import ctypes
+import logging
+import os
+from contextlib import contextmanager
+from typing import Any, List, Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt import _custom_ops as ops
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import (
+    gpu_p2p_access_check,
+    is_full_nvlink,
+    is_weak_contiguous,
+)
+from sglang.srt.distributed.parallel_state import in_the_same_node_as
+from sglang.srt.utils import is_cuda, is_hip
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+
+try:
+    if ops.use_vllm_custom_allreduce and not _is_hip:
+        # Use vLLM custom allreduce
+        ops.meta_size()
+    else:
+        # Use custom allreduce from sgl kernel (ROCM and TRT-LLM)
+        import sgl_kernel
+    custom_ar = True
+except Exception:
+    # For CPUs
+    custom_ar = False
+
+logger = logging.getLogger(__name__)
+
+
+def _can_p2p(rank: int, world_size: int) -> bool:
+    # SGLANG_SKIP_P2P_CHECK can be set to False in sglang
+    SGLANG_SKIP_P2P_CHECK = os.getenv("SGLANG_SKIP_P2P_CHECK", "0") == "1"
+    for i in range(world_size):
+        if i == rank:
+            continue
+        if SGLANG_SKIP_P2P_CHECK:
+            logger.info("Skipping P2P check and trusting the driver's P2P report.")
+            return torch.cuda.can_device_access_peer(rank, i)
+        if not gpu_p2p_access_check(rank, i):
+            return False
+    return True
+
+
+class CustomAllreduce:
+    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
+    _MAX_CAR_SIZE = 8192 * 1024
+    if _is_hip:
+        # crossover is at 16MB buffer size for ROCm
+        _MAX_CAR_SIZE = 2 * 8192 * 1024
+
+    # max_size: max supported allreduce size
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
+        max_size=_MAX_CAR_SIZE,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+
+        if not custom_ar:
+            # disable because of missing custom allreduce library
+            # e.g. in a non-cuda environment
+            return
+
+        self.group = group
+
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "CustomAllreduce should be attached to a non-NCCL group."
+
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom allreduce for multi-node case.
+            logger.warning(
+                "Custom allreduce is disabled because this process group"
+                " spans across nodes."
+            )
+            return
+
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize custom allreduce for single GPU case.
+            return
+
+        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom allreduce is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly.",
+                world_size,
+                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(torch.cuda.device_count()))
+
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        if _is_cuda or _is_hip:
+            full_nvlink = is_full_nvlink(physical_device_ids, world_size)
+
+        if world_size > 2 and not full_nvlink:
+            logger.warning(
+                "Custom allreduce is disabled because it's not supported on"
+                " more than two PCIe-only GPUs. To silence this warning, "
+                "specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+        # test P2P capability, this checks software/cudaruntime support
+        # this is expensive to compute at the first time
+        # then we cache the result
+        # On AMD GPU, p2p is always enabled between XGMI connected GPUs
+        if not _is_hip and not _can_p2p(rank, world_size):
+            logger.warning(
+                "Custom allreduce is disabled because your platform lacks "
+                "GPU P2P capability or P2P test failed. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+
+        self.max_size = max_size
+        self.rank = rank
+        self.world_size = world_size
+        self.full_nvlink = full_nvlink
+
+        if not _is_hip:
+            # Buffers memory are owned by this Python class and passed to C++.
+            # Meta data composes of two parts: meta data for synchronization and a
+            # temporary buffer for storing intermediate allreduce results.
+            self.meta_ptrs = self.create_shared_buffer(
+                ops.meta_size() + max_size, group=group
+            )
+            # This is a pre-registered IPC buffer. In eager mode, input tensors
+            # are first copied into this buffer before allreduce is performed
+            self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
+            # This is a buffer for storing the tuples of pointers pointing to
+            # IPC buffers from all ranks. Each registered tuple has size of
+            # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+            # is enough for 131072 such tuples. The largest model I've seen only
+            # needs less than 10000 of registered tuples.
+            self.rank_data = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta_ptrs, self.rank_data, rank, self.full_nvlink
+            )
+            ops.register_buffer(self._ptr, self.buffer_ptrs)
+        else:
+            # meta data buffers need to be "uncached" for signal on MI200
+            self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
+            self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
+            handle = ops.get_meta_buffer_ipc_handle(self.meta)
+            shard_data = (
+                bytes(handle),  # ipc handle to base ptr
+                0,  # offset of base ptr
+            )
+            handles, offsets = self._gather_ipc_meta(shard_data)
+            self.rank_data = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
+            )
+            self.register_buffer(self.buffer)
+
+        self.disabled = False
+
+    @staticmethod
+    def create_shared_buffer(
+        size_in_bytes: int, group: Optional[ProcessGroup] = None
+    ) -> List[int]:
+        """
+        Creates a shared buffer and returns a list of pointers
+        representing the buffer on all processes in the group.
+        """
+        lib = CudaRTLibrary()
+        pointer = lib.cudaMalloc(size_in_bytes)
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        world_size = dist.get_world_size(group=group)
+        rank = dist.get_rank(group=group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=group)
+
+        pointers: List[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer.value)  # type: ignore
+            else:
+                pointers.append(lib.cudaIpcOpenMemHandle(h).value)  # type: ignore
+
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(
+        pointers: List[int], group: Optional[ProcessGroup] = None
+    ) -> None:
+        rank = dist.get_rank(group=group)
+        lib = CudaRTLibrary()
+        lib.cudaFree(ctypes.c_void_p(pointers[rank]))
+
+    @contextmanager
+    def capture(self):
+        """
+        The main responsibility of this context manager is the
+        `register_graph_buffers` call at the end of the context.
+        It records all the buffer addresses used in the CUDA graph.
+        """
+        try:
+            self._IS_CAPTURING = True
+            yield
+        finally:
+            self._IS_CAPTURING = False
+            if not self.disabled:
+                self.register_graph_buffers()
+
+    def _get_ipc_meta(self, inp: torch.Tensor):
+        # _share_cuda_() doesn't accept meta buffer not allocated from
+        # PyTorch cache allocator, use direct HIP call to get IPC handle
+        handle = ops.get_meta_buffer_ipc_handle(inp)
+        shard_data = (
+            bytes(handle),  # ipc handle to base ptr
+            0,  # offset of base ptr
+        )
+        return self._gather_ipc_meta(shard_data)
+
+    def _gather_ipc_meta(self, shard_data):
+        # Note: don't use `[[None]] * self.world_size` here
+        # because it will create a list of the same reference
+        all_data: List[Optional[Any]] = [[None] for i in range(self.world_size)]
+        all_data[self.rank][0] = shard_data
+
+        ranks = dist.get_process_group_ranks(group=self.group)
+        ranks.sort()
+        for i, rank in enumerate(ranks):
+            dist.broadcast_object_list(
+                all_data[i], src=rank, group=self.group, device="cpu"
+            )
+
+        # we cannot directly use `dist.all_gather_object` here
+        # because it is incompatible with `gloo` backend under inference mode.
+        # see https://github.com/pytorch/pytorch/issues/126032 for details.
+
+        handles = []
+        offsets = []
+        for i in range(len(all_data)):
+            handles.append(all_data[i][0][0])  # type: ignore
+            offsets.append(all_data[i][0][1])  # type: ignore
+        return handles, offsets
+
+    def register_buffer(self, inp: torch.Tensor):
+        handles, offsets = self._get_ipc_meta(inp)
+        ops.register_buffer(self._ptr, inp, handles, offsets)
+
+    def register_graph_buffers(self):
+        if _is_hip:
+            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
+            handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
+            logger.info("Registering %d cuda graph addresses", len(offset))
+            ops.register_graph_buffers(self._ptr, handles, offsets)
+        else:
+            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
+            logger.info("Registering %d cuda graph addresses", len(offset))
+            # We cannot directly use `dist.all_gather_object` here
+            # because it is incompatible with `gloo` backend under inference mode.
+            # see https://github.com/pytorch/pytorch/issues/126032 for details.
+            all_data = [
+                [None, None] for _ in range(dist.get_world_size(group=self.group))
+            ]
+            all_data[self.rank] = [handle, offset]
+            ranks = sorted(dist.get_process_group_ranks(group=self.group))
+            for i, rank in enumerate(ranks):
+                dist.broadcast_object_list(
+                    all_data[i], src=rank, group=self.group, device="cpu"
+                )
+            # Unpack list of tuples to tuple of lists.
+            handles = [d[0] for d in all_data]  # type: ignore
+            offsets = [d[1] for d in all_data]  # type: ignore
+            ops.register_graph_buffers(self._ptr, handles, offsets)
+
+    def should_custom_ar(self, inp: torch.Tensor):
+        if self.disabled:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom allreduce requires input byte size to be multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
+        # little performance improvement over NCCL.
+        if not _is_hip:
+            if self.world_size == 2 or self.full_nvlink:
+                return inp_size < self.max_size
+            return False
+
+        if _is_hip:
+            if self.full_nvlink:
+                return inp_size < self.max_size
+            return False
+
+        return False
+
+    # all reduce, assuming inp tensor is IPC registered with register_buffer,
+    # or, in the context of cuda graphs, register_graph_buffers
+    def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
+        if out is None:
+            out = torch.empty_like(inp)
+        ops.all_reduce_reg(self._ptr, inp, out)
+        return out
+
+    # all reduce, assuming inp tensor is NOT IPC registered
+    def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None):
+        if out is None:
+            out = torch.empty_like(inp)
+        ops.all_reduce_unreg(self._ptr, inp, self.buffer, out)
+        return out
+
+    def all_reduce(
+        self,
+        inp: torch.Tensor,
+        *,
+        out: torch.Tensor = None,
+        registered: bool = False,
+    ):
+        """Performs an out-of-place all reduce.
+
+        If registered is True, this assumes inp's pointer is already
+        IPC-registered. Otherwise, inp is first copied into a pre-registered
+        buffer.
+        """
+        if out is None:
+            out = torch.empty_like(inp)
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
+        else:
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
+        return out
+
+    def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
+        """The main allreduce API that provides support for cuda graph."""
+        # When custom allreduce is disabled, this will be None.
+        if self.disabled or not self.should_custom_ar(input):
+            return None
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                if _is_hip:
+                    return self.all_reduce_reg(input)
+                else:
+                    return self.all_reduce(input, registered=True)
+            else:
+                # If warm up, mimic the allocation pattern since custom
+                # allreduce is out-of-place.
+                return torch.zeros_like(input)
+        else:
+            if _is_hip:
+                # note: outside of cuda graph context,
+                # custom allreduce incurs a cost of cudaMemcpy, which should
+                # be small(<=1% of overall latency) compared to the performance
+                # gains of using custom kernels
+                return self.all_reduce_unreg(input)
+            else:
+                return self.all_reduce(input, registered=False)
+
+    def close(self):
+        if not self.disabled and self._ptr:
+            ops.dispose(self._ptr)
+            if _is_cuda:
+                self.free_shared_buffer(self.meta_ptrs)
+                self.free_shared_buffer(self.buffer_ptrs)
+            self._ptr = 0
+
+    def __del__(self):
+        self.close()
diff --git a/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py b/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
new file mode 100644
index 000000000..c7baac845
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
@@ -0,0 +1,386 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+
+import ctypes
+import json
+import logging
+import os
+import pickle
+import subprocess
+import sys
+import tempfile
+from functools import wraps
+from itertools import product
+from typing import Callable, Dict, List, Optional, Sequence, TypeVar
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from typing_extensions import ParamSpec
+
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.utils import is_cuda, is_hip
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+if _is_cuda:
+    try:
+        import pynvml
+    except ImportError as e:
+        logger.warning("Failed to import pynvml with %r", e)
+
+if _is_hip:
+    try:
+        from amdsmi import (
+            AmdSmiException,
+            amdsmi_get_processor_handles,
+            amdsmi_init,
+            amdsmi_shut_down,
+            amdsmi_topo_get_link_type,
+        )
+    except ImportError as e:
+        logger.warning("Failed to import amdsmi with %r", e)
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+def update_environment_variables(envs: Dict[str, str]):
+    for k, v in envs.items():
+        if k in os.environ and os.environ[k] != v:
+            logger.warning(
+                "Overwriting environment variable %s " "from '%s' to '%s'",
+                k,
+                os.environ[k],
+                v,
+            )
+        os.environ[k] = v
+
+
+def producer(
+    batch_src: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for i in batch_src:
+        lib.cudaSetDevice(i)
+        pointer = lib.cudaMalloc(1024)
+        lib.cudaMemset(pointer, 1, 1024)
+        lib.cudaDeviceSynchronize()
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        producer_queue.put(handle)
+        open_success = consumer_queue.get()
+        if open_success:
+            # use two queues to simulate barrier
+            producer_queue.put(0)
+            consumer_queue.get()
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def consumer(
+    batch_tgt: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for j in batch_tgt:
+        lib.cudaSetDevice(j)
+        handle = producer_queue.get()
+        open_success = False
+        try:
+            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
+            open_success = True
+        except RuntimeError:
+            # cannot error out here, because the producer process
+            # is still waiting for the response.
+            pass
+        consumer_queue.put(open_success)
+        if open_success:
+            # modify the memory
+            lib.cudaMemset(pointer, 2, 1024)
+            lib.cudaDeviceSynchronize()
+            # use two queues to simulate barrier
+            producer_queue.get()
+            consumer_queue.put(0)
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def can_actually_p2p(
+    batch_src: Sequence[int],
+    batch_tgt: Sequence[int],
+) -> Sequence[bool]:
+    """
+    Usually, checking if P2P access is enabled can be done by
+    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
+    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
+    returns `True` even if P2P access is not actually possible.
+    See https://github.com/vllm-project/vllm/issues/2728 and
+    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
+    Therefore, we have to perform a real P2P access to check if it is actually
+    possible.
+
+    Note on p2p and cuda IPC:
+    Usually, one process uses one GPU:
+    GPU src --> cuda context src --> tensor src --> process src
+
+    We need to combine p2p and cuda IPC, so that:
+    GPU src --> cuda context src --> tensor src --> process src
+                                      |shared|
+    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
+    That is to say, process src creates a tensor in GPU src, passes IPC handle to
+    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
+    tensor in process tgt will be reflected in the tensor in process src, because
+    they are the same memory segment.
+    It is important to note that process tgt accesses the tensor in GPU tgt, not
+    GPU src. That's why we need p2p access.
+
+    The most time-consuming part is the process creation. To avoid creating
+    processes for every pair of GPUs, we use batched testing. We create two
+    processes for testing all pairs of GPUs in batch. The trick is to reset
+    the device after each test (which is not available in PyTorch).
+    """  # noqa
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    # pass the CUDA_VISIBLE_DEVICES to the child process
+    # to make sure they see the same set of GPUs
+
+    # make sure the processes are spawned
+    smp = mp.get_context("spawn")
+    producer_queue = smp.Queue()
+    consumer_queue = smp.Queue()
+    result_queue = smp.Queue()
+    p_src = smp.Process(
+        target=producer,
+        args=(
+            batch_src,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_tgt = smp.Process(
+        target=consumer,
+        args=(
+            batch_tgt,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_src.start()
+    p_tgt.start()
+    p_src.join()
+    p_tgt.join()
+    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
+    result: List[bool] = []
+    for src, tgt in zip(batch_src, batch_tgt):
+        a = result_queue.get()
+        b = result_queue.get()
+        if a != b:
+            logger.warning(
+                "Two processes do not agree on the P2P access"
+                " status on %d -> %d, treat as disabled.",
+                src,
+                tgt,
+            )
+            result.append(False)
+        else:
+            result.append(a)
+    return result
+
+
+# why do we need this cache?
+# we are testing peer-to-peer (p2p) access between GPUs,across processes.
+# if we test it every time, it will be very slow, because we need to create
+#  N * N * 2 processes, where N is the world size. This is very slow.
+# to reduce the time, we use a cache file to store the p2p access status.
+# the cache file is generated by the master process if it does not exist.
+# then all the processes can read the cache file to check the p2p access status.
+# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
+#  can have different cache files for different CUDA_VISIBLE_DEVICES settings,
+#  e.g. used by different vllm engines. The device id in the cache file is a
+#  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
+#  of visible devices in the vllm engine.
+_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
+
+
+def gpu_p2p_access_check(src: int, tgt: int) -> bool:
+    """Check if GPU src can access GPU tgt."""
+
+    # if the cache variable is already calculated,
+    # read from the cache instead of checking it again
+    global _gpu_p2p_access_cache
+    if _gpu_p2p_access_cache is not None:
+        return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+    is_distributed = dist.is_initialized()
+
+    num_dev = torch.cuda.device_count()
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    if cuda_visible_devices is None:
+        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+
+    # VLLM_CACHE_ROOT -> SGLANG_CACHE_ROOT
+    # "~/.cache/vllm" -> "~/.cache/sglang"
+    SGLANG_CACHE_ROOT = os.path.expanduser("~/.cache/sglang")
+    path = os.path.join(
+        SGLANG_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    from sglang.srt.distributed.parallel_state import get_world_group
+
+    if (not is_distributed or get_world_group().local_rank == 0) and (
+        not os.path.exists(path)
+    ):
+        # only the local master process (with local_rank == 0) can
+        #  enter this block to calculate the cache
+        logger.info("generating GPU P2P access cache in %s", path)
+        cache: Dict[str, bool] = {}
+        ids = list(range(num_dev))
+        # batch of all pairs of GPUs
+        batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+
+        # use a temporary file to store the result
+        # we don't use the output of the subprocess directly,
+        # because the subprocess might produce logging output
+        with tempfile.NamedTemporaryFile() as output_file:
+            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
+            returned = subprocess.run(
+                [sys.executable, __file__], input=input_bytes, capture_output=True
+            )
+            # check if the subprocess is successful
+            try:
+                returned.check_returncode()
+            except Exception as e:
+                # wrap raised exception to provide more information
+                raise RuntimeError(
+                    f"Error happened when batch testing "
+                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
+                    f"{returned.stderr.decode()}"
+                ) from e
+            with open(output_file.name, "rb") as f:
+                result = pickle.load(f)
+        for _i, _j, r in zip(batch_src, batch_tgt, result):
+            cache[f"{_i}->{_j}"] = r
+        with open(path, "w") as f:
+            json.dump(cache, f, indent=4)
+    if is_distributed:
+        get_world_group().barrier()
+    logger.info("reading GPU P2P access cache from %s", path)
+    with open(path) as f:
+        cache = json.load(f)
+    _gpu_p2p_access_cache = cache
+    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+    @wraps(fn)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        if _is_hip:
+            try:
+                amdsmi_init()
+                return fn(*args, **kwargs)
+            finally:
+                amdsmi_shut_down()
+        else:
+            pynvml.nvmlInit()
+            try:
+                return fn(*args, **kwargs)
+            finally:
+                pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+@with_nvml_context
+def is_full_nvlink(physical_device_ids: List[int], world_size: int) -> bool:
+    if _is_hip:
+        """
+        query if the set of gpus are fully connected by xgmi (1 hop)
+        """
+        handles = [amdsmi_get_processor_handles()[i] for i in physical_device_ids]
+        for i, handle in enumerate(handles):
+            for j, peer_handle in enumerate(handles):
+                if i < j:
+                    try:
+                        link_type = amdsmi_topo_get_link_type(handle, peer_handle)
+                        # type is 2 for XGMI
+                        if link_type["hops"] != 1 or link_type["type"] != 2:
+                            return False
+                    except AmdSmiException as error:
+                        logger.error("AMD 1 hop XGMI detection failed.", exc_info=error)
+                        return False
+        return True
+    else:
+        """
+        query if the set of gpus are fully connected by nvlink (1 hop)
+        """
+        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids]
+        for i, handle in enumerate(handles):
+            for j, peer_handle in enumerate(handles):
+                if i < j:
+                    try:
+                        p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                            handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK
+                        )
+                        if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                            return False
+                    except pynvml.NVMLError:
+                        logger.exception(
+                            "NVLink detection failed. This is normal if your"
+                            " machine has no NVLink equipped."
+                        )
+                        return False
+        return True
+
+
+def is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+__all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    with open(output_file, "wb") as f:
+        f.write(pickle.dumps(result))
diff --git a/python/sglang/srt/distributed/device_communicators/hpu_communicator.py b/python/sglang/srt/distributed/device_communicators/hpu_communicator.py
new file mode 100644
index 000000000..722e494cf
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/hpu_communicator.py
@@ -0,0 +1,49 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/hpu_communicator.py
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_hpu
+
+if is_hpu():
+    import habana_frameworks.torch as htorch  # noqa: F401
+
+
+class HpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_hpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
+        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
+        # (which is required for tensor parallel HPUGraph inference)
+        htorch.core.mark_step()
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=x.dtype, device=x.device
+        )
+        # All-gather.
+        htorch.core.mark_step()
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
diff --git a/python/sglang/srt/distributed/device_communicators/npu_communicator.py b/python/sglang/srt/distributed/device_communicators/npu_communicator.py
new file mode 100644
index 000000000..cb6eb88e3
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/npu_communicator.py
@@ -0,0 +1,39 @@
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_npu
+
+
+class NpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_npu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        output_size = (input_size[0] * world_size,) + input_size[1:]
+        # Allocate output tensor.
+        output_tensor = torch.empty(output_size, dtype=x.dtype, device=x.device)
+        # All-gather.
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.reshape((world_size,) + input_size)
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
diff --git a/python/sglang/srt/distributed/device_communicators/pymscclpp.py b/python/sglang/srt/distributed/device_communicators/pymscclpp.py
new file mode 100644
index 000000000..78269ed05
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/pymscclpp.py
@@ -0,0 +1,315 @@
+import bisect
+import logging
+import math
+import os
+from contextlib import contextmanager
+from enum import IntEnum
+from typing import Any, Callable, List, Optional, TypeVar, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup, ReduceOp
+
+from sglang.srt import _custom_ops as ops
+from sglang.srt.utils import is_cuda, is_hip
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+mscclpp_is_available = False
+if _is_hip:
+    # TODO(zyksir): mscclpp is untested on AMD and therefore disabled.
+    mscclpp_is_available = False
+if _is_cuda:
+    try:
+        import sgl_kernel
+
+        mscclpp_is_available = True
+    except:
+        mscclpp_is_available = False
+
+
+class MscclContextSelection(IntEnum):
+    MSCCL1SHOT1NODELL = 1
+    MSCCL1SHOT2NODELL = 2
+
+
+def mscclpp_is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+def mscclpp_convert_to_bytes(size_str):
+    """
+    Converts a human-readable size string (e.g., "1MB", "2.5kb", "3 GB")
+    into the equivalent number of bytes using binary units.
+
+    Args:
+        size_str (str): A string representing size with unit (KB, MB, GB).
+
+    Returns:
+        int: Number of bytes.
+    """
+    size_str = size_str.strip().lower()
+
+    if not size_str:
+        raise ValueError("Empty input string")
+
+    # Extract numeric part and unit
+    for i in range(len(size_str)):
+        if not size_str[i].isdigit() and size_str[i] != ".":
+            break
+    num_str = size_str[:i]
+    unit = size_str[i:].strip()
+
+    try:
+        num = float(num_str)
+    except ValueError:
+        raise ValueError(f"Invalid numeric value in '{size_str}'")
+
+    # Conversion factors
+    if unit == "b":
+        return int(num)
+    elif unit == "kb":
+        return int(num * 1024)
+    elif unit == "mb":
+        return int(num * 1024 * 1024)
+    elif unit == "gb":
+        return int(num * 1024 * 1024 * 1024)
+    else:
+        raise ValueError(f"Unsupported unit: {unit}, support B, KB, MB, GB only")
+
+
+def mscclpp_bench_time(func, test_niter: int = 10, warmup_niter: int = 2):
+    # warmup
+    for _ in range(warmup_niter):
+        func()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    dist.barrier()
+    start_event.record()
+    for _ in range(test_niter):
+        func()
+    end_event.record()
+    end_event.synchronize()
+    func_cost_us = start_event.elapsed_time(end_event) / test_niter * 1000
+    return func_cost_us
+
+
+class PyMscclppCommunicator:
+    _SUPPORTED_WORLD_SIZES = [8, 16]
+    _MAX_BYTES = mscclpp_convert_to_bytes(os.getenv("SGLANG_MSCCLPP_MAX_BYTES", "1MB"))
+    _SUPPORTED_DTYPE = [torch.float, torch.float16, torch.bfloat16]
+
+    # max_bytes: max supported mscclpp allreduce size
+    # in A100 mscclpp is faster than nccl only under condition of msg size smaller than1MB
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
+        max_bytes=_MAX_BYTES,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+
+        if not mscclpp_is_available:
+            # disable because of missing mscclpp library
+            # e.g. in a non-cuda environment
+            return
+
+        self.group = group
+
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "CustomAllreduce should be attached to a non-NCCL group."
+
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize mscclpp for single GPU case.
+            return
+
+        if world_size not in PyMscclppCommunicator._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "PyMscclpp is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_mscclpp=True explicitly.",
+                world_size,
+                str(PyMscclppCommunicator._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        self.ranks = torch.distributed.get_process_group_ranks(group)
+        self.nranks_per_node = torch.cuda.device_count()
+        # for now mscclpp with stride in the communicator is not tested
+        if not (abs(self.ranks[-1] - self.ranks[0]) == world_size - 1):
+            logger.warning(
+                "PyMscclpp is disabled due to an unsupported group %s."
+                "Please ensure all ranks in the group are consecutive."
+                "To silence this warning, specify disable_mscclpp=True explicitly.",
+                str(self.ranks),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        self.max_bytes = max_bytes
+        self.rank = rank
+        self.world_size = world_size
+
+        if dist.get_rank(group) == 0:
+            unique_id = [ops.mscclpp_generate_unique_id()]
+        else:
+            unique_id = [None]
+        dist.broadcast_object_list(unique_id, src=self.ranks[0], group=self.group)
+        self.unique_id = unique_id[0]
+        self.rank_to_node, self.rank_to_ib = list(range(world_size)), list(
+            range(world_size)
+        )
+        for r in range(world_size):
+            self.rank_to_node[r] = r // 8
+            self.rank_to_ib[r] = self.rank % 8
+
+        self._context = None
+        self.context_selection = None
+        self.msg_size_for_finetune = [
+            2**i for i in range(10, math.floor(math.log2(self.max_bytes)) + 1)
+        ]
+        self.msg_size2best_config = {}
+        if world_size == 8:
+            self.context_selection = MscclContextSelection.MSCCL1SHOT1NODELL
+        elif world_size == 16:
+            self.context_selection = MscclContextSelection.MSCCL1SHOT2NODELL
+        if not _is_hip:
+            self.scratch = torch.empty(
+                self.max_bytes * 8,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+            self.put_buffer = torch.empty(
+                self.max_bytes * 8 // self.nranks_per_node,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+            self._context = ops.mscclpp_init_context(
+                self.unique_id,
+                self.rank,
+                self.world_size,
+                self.scratch,
+                self.put_buffer,
+                self.nranks_per_node,
+                self.rank_to_node,
+                self.rank_to_ib,
+                int(self.context_selection),
+            )
+        else:
+            raise NotImplementedError("HIP Mscclpp is not supported yet.")
+
+        self.msg_size2best_config = {}
+        self.pre_tune_config()
+        if dist.get_rank(group) == 0:
+            msg_size2best_config = [self.msg_size2best_config]
+        else:
+            msg_size2best_config = [None]
+        dist.broadcast_object_list(
+            msg_size2best_config, src=self.ranks[0], group=self.group
+        )
+        self.msg_size2best_config = msg_size2best_config[0]
+
+        # PyMscclpp is enabled only in cuda graph
+        self.disabled = True
+
+    def pre_tune_config(self, dtype=torch.bfloat16) -> bool:
+        logger.debug(f"start to pre-tune configs for rank {self.rank}")
+        nthreads_to_try = [256, 512, 1024]
+        nblocks_to_try = [21, 42, 84]
+        inp_randn = torch.ones(
+            self.msg_size_for_finetune[-1] // dtype.itemsize, dtype=dtype, device="cuda"
+        )
+        oup_randn = torch.empty_like(inp_randn)
+        for msg_size in self.msg_size_for_finetune:
+            mock_inp, mock_outp = (
+                inp_randn[: msg_size // dtype.itemsize],
+                oup_randn[: msg_size // dtype.itemsize],
+            )
+            best_config, best_time = None, None
+            for nthreads in nthreads_to_try:
+                for nblocks in nblocks_to_try:
+                    cur_cost = mscclpp_bench_time(
+                        lambda: ops.mscclpp_allreduce(
+                            self._context, mock_inp, mock_outp, nthreads, nblocks
+                        )
+                    )
+                    if best_time is None or cur_cost < best_time:
+                        best_config = (nthreads, nblocks)
+                        best_time = cur_cost
+            self.msg_size2best_config[msg_size] = best_config
+            if self.rank == 0:
+                logger.debug(
+                    f"for msg_size {msg_size}, best_config: {best_config}, best_time: {best_time}us"
+                )
+
+    def should_mscclpp_allreduce(
+        self, inp: torch.Tensor, op: ReduceOp = ReduceOp.SUM
+    ) -> bool:
+        if self.disabled or self._context is None:
+            return False
+        if inp.dtype not in PyMscclppCommunicator._SUPPORTED_DTYPE:
+            return False
+        if not mscclpp_is_weak_contiguous(inp):
+            return False
+        # only support sum op
+        if op != ReduceOp.SUM:
+            return False
+        if inp.numel() * inp.element_size() > self.max_bytes:
+            return False
+        return True
+
+    def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM):
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                self.graph_input_set.add((tensor.dtype, tensor.numel()))
+        msg_size = tensor.numel() * tensor.itemsize
+        index = bisect.bisect_left(self.msg_size_for_finetune, msg_size)
+        msg_size_finetune = self.msg_size_for_finetune[index]
+        nthreads, nblocks = self.msg_size2best_config[msg_size_finetune]
+        result = torch.empty_like(tensor)
+        ops.mscclpp_allreduce(self._context, tensor, result, nthreads, nblocks)
+        return result
+
+    @contextmanager
+    def change_state(
+        self,
+        enable: Optional[bool] = None,
+    ):
+        if enable is None:
+            # guess a default value when not specified
+            enable = self.available
+
+        old_disable = self.disabled
+        self.disabled = not enable
+
+        yield
+
+        self.disabled = old_disable
diff --git a/python/sglang/srt/distributed/device_communicators/pynccl.py b/python/sglang/srt/distributed/device_communicators/pynccl.py
new file mode 100644
index 000000000..fbb59c477
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/pynccl.py
@@ -0,0 +1,341 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/pynccl.py
+
+import logging
+from contextlib import contextmanager
+from typing import Optional, Union
+
+# ===================== import region =====================
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup, ReduceOp
+
+from sglang.srt.distributed.device_communicators.pynccl_wrapper import (
+    NCCLLibrary,
+    buffer_type,
+    cudaStream_t,
+    ncclComm_t,
+    ncclDataTypeEnum,
+    ncclRedOpTypeEnum,
+    ncclUniqueId,
+)
+from sglang.srt.distributed.utils import StatelessProcessGroup
+
+logger = logging.getLogger(__name__)
+
+
+class PyNcclCommunicator:
+
+    def __init__(
+        self,
+        group: Union[ProcessGroup, StatelessProcessGroup],
+        device: Union[int, str, torch.device],
+        library_path: Optional[str] = None,
+    ):
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the PyNcclCommunicator to. If None,
+                it will be bind to f"cuda:{local_rank}".
+            library_path: the path to the NCCL library. If None, it will
+                use the default library path.
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device.
+        """
+        if not isinstance(group, StatelessProcessGroup):
+            assert dist.is_initialized()
+            assert (
+                dist.get_backend(group) != dist.Backend.NCCL
+            ), "PyNcclCommunicator should be attached to a non-NCCL group."
+            # note: this rank is the rank in the group
+            self.rank = dist.get_rank(group)
+            self.world_size = dist.get_world_size(group)
+        else:
+            self.rank = group.rank
+            self.world_size = group.world_size
+
+        self.group = group
+
+        # if world_size == 1, no need to create communicator
+        if self.world_size == 1:
+            self.available = False
+            self.disabled = True
+            self.stream = None
+            return
+        try:
+            self.nccl = NCCLLibrary(library_path)
+        except Exception:
+            # disable because of missing NCCL library
+            # e.g. in a non-GPU environment
+            self.available = False
+            self.disabled = True
+            self.stream = None
+            return
+
+        self.available = True
+        self.disabled = False
+
+        self.nccl_version = self.nccl.ncclGetRawVersion()
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+
+        if self.rank == 0:
+            # get the unique id from NCCL
+            self.unique_id = self.nccl.ncclGetUniqueId()
+        else:
+            # construct an empty unique id
+            self.unique_id = ncclUniqueId()
+
+        if not isinstance(group, StatelessProcessGroup):
+            tensor = torch.ByteTensor(list(self.unique_id.internal))
+            ranks = dist.get_process_group_ranks(group)
+            # arg `src` in `broadcast` is the global rank
+            dist.broadcast(tensor, src=ranks[0], group=group)
+            byte_list = tensor.tolist()
+            for i, byte in enumerate(byte_list):
+                self.unique_id.internal[i] = byte
+        else:
+            self.unique_id = group.broadcast_obj(self.unique_id, src=0)
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+        # nccl communicator and stream will use this device
+        # `torch.cuda.device` is a context manager that changes the
+        # current cuda device to the specified one
+        with torch.cuda.device(device):
+            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
+                self.world_size, self.unique_id, self.rank
+            )
+            self.stream = torch.cuda.Stream()
+
+            # A small all_reduce for warmup.
+            data = torch.zeros(1, device=device)
+            self.all_reduce(data)
+            self.stream.synchronize()
+            del data
+
+        # by default it is disabled, e.g. in profiling models and prefill phase.
+        # to use it, use under `with obj.change_state(enable=True)`, usually
+        # when we are using CUDA graph.
+        self.disabled = True
+
+    def all_reduce(
+        self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+        self.nccl.ncclAllReduce(
+            buffer_type(tensor.data_ptr()),
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def all_gather(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        stream=None,
+        sizes: Optional[list[int]] = None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+
+        if sizes is not None:
+            split_offset = 0
+
+            self.nccl.ncclGroupStart()
+            for root, split_size in enumerate(sizes):
+                dst_slice = output_tensor[split_offset : split_offset + split_size]
+                self.nccl.ncclBroadcast(
+                    buffer_type(input_tensor.data_ptr()),
+                    buffer_type(dst_slice.data_ptr()),
+                    dst_slice.numel(),
+                    ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                    root,
+                    self.comm,
+                    cudaStream_t(stream.cuda_stream),
+                )
+                split_offset += split_size
+            self.nccl.ncclGroupEnd()
+        else:
+            self.nccl.ncclAllGather(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(output_tensor.data_ptr()),
+                input_tensor.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                self.comm,
+                cudaStream_t(stream.cuda_stream),
+            )
+
+    def reduce_scatter(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+        sizes: Optional[list[int]] = None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+
+        if sizes is not None:
+            split_offset = 0
+            self.nccl.ncclGroupStart()
+            for root, split_size in enumerate(sizes):
+                chunk = input_tensor[split_offset : split_offset + split_size, ...]
+
+                self.nccl.ncclReduce(
+                    buffer_type(chunk.data_ptr()),
+                    buffer_type(output_tensor.data_ptr()),
+                    chunk.numel(),
+                    ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                    ncclRedOpTypeEnum.from_torch(op),
+                    root,
+                    self.comm,
+                    cudaStream_t(stream.cuda_stream),
+                )
+                split_offset += split_size
+            self.nccl.ncclGroupEnd()
+        else:
+            self.nccl.ncclReduceScatter(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(output_tensor.data_ptr()),
+                output_tensor.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                ncclRedOpTypeEnum.from_torch(op),
+                self.comm,
+                cudaStream_t(stream.cuda_stream),
+            )
+
+    def send(self, tensor: torch.Tensor, dst: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+        self.nccl.ncclSend(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            dst,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def recv(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+        self.nccl.ncclRecv(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        if stream is None:
+            stream = self.stream
+        if src == self.rank:
+            sendbuff = buffer_type(tensor.data_ptr())
+            # NCCL requires the sender also to have a receive buffer
+            recvbuff = buffer_type(tensor.data_ptr())
+        else:
+            sendbuff = buffer_type()
+            recvbuff = buffer_type(tensor.data_ptr())
+        self.nccl.ncclBroadcast(
+            sendbuff,
+            recvbuff,
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def register_comm_window_raw(self, ptr: int, size: int):
+        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1)
+
+    def deregister_comm_window(self, window):
+        return self.nccl.ncclCommWindowDeregister(self.comm, window)
+
+    def group_start(self):
+        self.nccl.ncclGroupStart()
+
+    def group_end(self):
+        self.nccl.ncclGroupEnd()
+
+    @contextmanager
+    def change_state(
+        self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
+    ):
+        """
+        A context manager to change the state of the communicator.
+        """
+        if enable is None:
+            # guess a default value when not specified
+            enable = self.available
+
+        if stream is None:
+            stream = self.stream
+
+        old_disable = self.disabled
+        old_stream = self.stream
+
+        self.stream = stream
+        self.disabled = not enable
+        yield
+
+        self.disabled = old_disable
+        self.stream = old_stream
diff --git a/python/sglang/srt/distributed/device_communicators/pynccl_allocator.py b/python/sglang/srt/distributed/device_communicators/pynccl_allocator.py
new file mode 100644
index 000000000..d7274cf2c
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/pynccl_allocator.py
@@ -0,0 +1,133 @@
+import tempfile
+
+import torch
+from packaging import version
+from torch.cuda.memory import CUDAPluggableAllocator
+
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+nccl_allocator_source = """
+#include <nccl.h>
+extern "C" {
+
+void* nccl_alloc_plug(size_t size, int device, void* stream) {
+  void* ptr;
+  ncclResult_t err = ncclMemAlloc(&ptr, size);
+  return ptr;
+
+}
+
+void nccl_free_plug(void* ptr, size_t size, int device, void* stream) {
+  ncclResult_t err = ncclMemFree(ptr);
+}
+
+}
+"""
+
+_allocator = None
+_mem_pool = None
+_registered_base_addrs = set()
+_graph_pool_id = None
+
+
+def is_symmetric_memory_enabled():
+    return global_server_args_dict["enable_symm_mem"]
+
+
+def set_graph_pool_id(graph_pool_id):
+    global _graph_pool_id
+    _graph_pool_id = graph_pool_id
+
+
+def get_nccl_mem_pool():
+    global _allocator, _mem_pool
+    if _mem_pool is None:
+        out_dir = tempfile.gettempdir()
+        nccl_allocator_libname = "nccl_allocator"
+        torch.utils.cpp_extension.load_inline(
+            name=nccl_allocator_libname,
+            cpp_sources=nccl_allocator_source,
+            with_cuda=True,
+            extra_ldflags=["-lnccl"],
+            verbose=True,
+            is_python_module=False,
+            build_directory=out_dir,
+        )
+        _allocator = CUDAPluggableAllocator(
+            f"{out_dir}/{nccl_allocator_libname}.so",
+            "nccl_alloc_plug",
+            "nccl_free_plug",
+        ).allocator()
+        _mem_pool = torch.cuda.MemPool(_allocator)
+    return _mem_pool
+
+
+class use_symmetric_memory:
+    def __init__(self, group_coordinator: GroupCoordinator):
+        if not is_symmetric_memory_enabled():
+            self.group_coordinator = None
+            self._mem_pool_ctx = None
+            self.is_graph_capture = None
+            self.device = None
+            self.pre_2_8_0 = None
+        else:
+            self.group_coordinator = group_coordinator
+            self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
+            self.is_graph_capture = torch.cuda.is_current_stream_capturing()
+            self.device = torch.cuda.current_device()
+            self.pre_2_8_0 = version.parse(torch.__version__) < version.parse("2.8.0")
+
+    def __enter__(self):
+        if not is_symmetric_memory_enabled():
+            return self
+        assert (
+            self.group_coordinator.pynccl_comm is not None
+        ), f"Symmetric memory requires pynccl to be enabled in group '{self.group_coordinator.group_name}'"
+        assert (
+            self.group_coordinator.pynccl_comm.nccl_version >= 22703
+        ), "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
+        if self.is_graph_capture:
+            assert (
+                _graph_pool_id is not None
+            ), "graph_pool_id is not set under graph capture"
+            # Pause graph memory pool to use symmetric memory with cuda graph
+            if self.pre_2_8_0:
+                torch._C._cuda_endAllocateCurrentStreamToPool(
+                    self.device, _graph_pool_id
+                )
+            else:
+                torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id)
+        self._mem_pool_ctx.__enter__()
+        return self
+
+    def tag(self, tensor: torch.Tensor):
+        if not is_symmetric_memory_enabled():
+            return
+        tensor.symmetric_memory = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not is_symmetric_memory_enabled():
+            return
+        global _registered_base_addrs
+        self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
+        for segment in get_nccl_mem_pool().snapshot():
+            if segment["address"] not in _registered_base_addrs:
+                if segment["stream"] == 0 and self.pre_2_8_0:
+                    # PyTorch version < 2.8.0 has a multi-thread MemPool bug
+                    # See https://github.com/pytorch/pytorch/issues/152861
+                    # Fixed at https://github.com/pytorch/pytorch/commit/f01e628e3b31852983ab30b25bf251f557ba9c0b
+                    # WAR is to skip allocations on the default stream since the forward_pass thread always runs on a custom stream
+                    continue
+                self.group_coordinator.pynccl_comm.register_comm_window_raw(
+                    segment["address"], segment["total_size"]
+                )
+                _registered_base_addrs.add(segment["address"])
+
+        if self.is_graph_capture:
+            if self.pre_2_8_0:
+                torch._C._cuda_beginAllocateToPool(self.device, _graph_pool_id)
+            else:
+                torch._C._cuda_beginAllocateCurrentThreadToPool(
+                    self.device, _graph_pool_id
+                )
diff --git a/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py b/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py
new file mode 100644
index 000000000..579811777
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py
@@ -0,0 +1,563 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/pynccl.py
+
+# This file is a pure Python wrapper for the NCCL library.
+# The main purpose is to use NCCL combined with CUDA graph.
+# Before writing this script, we tried the following approach:
+# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
+#  often gets stuck when initializing the NCCL communicator.
+# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
+#  contains many other potential cuda APIs, that are not allowed during
+#  capturing the CUDA graph. For further details, please check
+# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
+#
+# Another rejected idea is to write a C/C++ binding for NCCL. It is usually
+# doable, but we often encounter issues related with nccl versions, and need
+# to switch between different versions of NCCL. See
+# https://github.com/NVIDIA/nccl/issues/1234 for more details.
+# A C/C++ binding is not flexible enough to handle this. It requires
+# recompilation of the code every time we want to switch between different
+# versions. This current implementation, with a **pure** Python wrapper, is
+# more flexible. We can easily switch between different versions of NCCL by
+# changing the environment variable `SGLANG_NCCL_SO_PATH`, or the `so_file`
+# variable in the code.
+
+import ctypes
+import logging
+import os
+import platform
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.distributed import ReduceOp
+
+logger = logging.getLogger(__name__)
+
+
+def find_nccl_library() -> str:
+    """
+    We either use the library file specified by the `SGLANG_NCCL_SO_PATH`
+    environment variable, or we find the library file brought by PyTorch.
+    After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
+    found by `ctypes` automatically.
+    """
+
+    # so_file can be set to None in sglang
+    so_file = os.environ.get("SGLANG_NCCL_SO_PATH", None)
+
+    # manually load the nccl library
+    if so_file:
+        logger.info(
+            "Found nccl from environment variable SGLANG_NCCL_SO_PATH=%s", so_file
+        )
+    else:
+        if torch.version.cuda is not None:
+            so_file = "libnccl.so.2"
+        elif torch.version.hip is not None:
+            so_file = "librccl.so.1"
+        else:
+            raise ValueError("NCCL only supports CUDA and ROCm backends.")
+        logger.debug("Found nccl from library %s", so_file)
+    return so_file
+
+
+# === export types and functions from nccl to Python ===
+# for the original nccl definition, please check
+# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in
+
+ncclResult_t = ctypes.c_int
+ncclComm_t = ctypes.c_void_p
+ncclWindow_t = ctypes.c_void_p
+
+
+class ncclUniqueId(ctypes.Structure):
+    _fields_ = [("internal", ctypes.c_byte * 128)]
+
+
+cudaStream_t = ctypes.c_void_p
+buffer_type = ctypes.c_void_p
+
+ncclDataType_t = ctypes.c_int
+
+
+class ncclDataTypeEnum:
+    ncclInt8 = 0
+    ncclChar = 0
+    ncclUint8 = 1
+    ncclInt32 = 2
+    ncclInt = 2
+    ncclUint32 = 3
+    ncclInt64 = 4
+    ncclUint64 = 5
+    ncclFloat16 = 6
+    ncclHalf = 6
+    ncclFloat32 = 7
+    ncclFloat = 7
+    ncclFloat64 = 8
+    ncclDouble = 8
+    ncclBfloat16 = 9
+    ncclNumTypes = 10
+
+    @classmethod
+    def from_torch(cls, dtype: torch.dtype) -> int:
+        if dtype == torch.int8:
+            return cls.ncclInt8
+        if dtype == torch.uint8:
+            return cls.ncclUint8
+        if dtype == torch.int32:
+            return cls.ncclInt32
+        if dtype == torch.int64:
+            return cls.ncclInt64
+        if dtype == torch.float16:
+            return cls.ncclFloat16
+        if dtype == torch.float32:
+            return cls.ncclFloat32
+        if dtype == torch.float64:
+            return cls.ncclFloat64
+        if dtype == torch.bfloat16:
+            return cls.ncclBfloat16
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+ncclRedOp_t = ctypes.c_int
+
+
+class ncclRedOpTypeEnum:
+    ncclSum = 0
+    ncclProd = 1
+    ncclMax = 2
+    ncclMin = 3
+    ncclAvg = 4
+    ncclNumOps = 5
+
+    @classmethod
+    def from_torch(cls, op: ReduceOp) -> int:
+        if op == ReduceOp.SUM:
+            return cls.ncclSum
+        if op == ReduceOp.PRODUCT:
+            return cls.ncclProd
+        if op == ReduceOp.MAX:
+            return cls.ncclMax
+        if op == ReduceOp.MIN:
+            return cls.ncclMin
+        if op == ReduceOp.AVG:
+            return cls.ncclAvg
+        raise ValueError(f"Unsupported op: {op}")
+
+
+@dataclass
+class Function:
+    name: str
+    restype: Any
+    argtypes: List[Any]
+
+
+class NCCLLibrary:
+    exported_functions = [
+        # const char* ncclGetErrorString(ncclResult_t result)
+        Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
+        # ncclResult_t  ncclGetVersion(int *version);
+        Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]),
+        # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+        Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]),
+        # ncclResult_t  ncclCommInitRank(
+        #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+        # note that ncclComm_t is a pointer type, so the first argument
+        # is a pointer to a pointer
+        Function(
+            "ncclCommInitRank",
+            ncclResult_t,
+            [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int],
+        ),
+        # ncclResult_t  ncclAllReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclAllGather(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllGather",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, int root,
+        #   ncclComm_t comm,  cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclReduceScatter(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduceScatter",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclSend(
+        #   const void* sendbuff, size_t count, ncclDataType_t datatype,
+        #   int dest, ncclComm_t comm, cudaStream_t stream);
+        Function(
+            "ncclSend",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclRecv(
+        #   void* recvbuff, size_t count, ncclDataType_t datatype,
+        #   int src, ncclComm_t comm, cudaStream_t stream);
+        Function(
+            "ncclRecv",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t ncclBroadcast(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, int root, ncclComm_t comm,
+        #   cudaStream_t stream);
+        Function(
+            "ncclBroadcast",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # be cautious! this is a collective call, it will block until all
+        # processes in the communicator have called this function.
+        # because Python object destruction can happen in random order,
+        # it is better not to call it at all.
+        # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+        Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
+        # ncclResult_t ncclGroupStart();
+        Function("ncclGroupStart", ncclResult_t, []),
+        # ncclResult_t ncclGroupEnd();
+        Function("ncclGroupEnd", ncclResult_t, []),
+    ]
+
+    exported_functions_symm_mem = [
+        # ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+        Function(
+            "ncclCommWindowRegister",
+            ncclResult_t,
+            [
+                ncclComm_t,
+                buffer_type,
+                ctypes.c_size_t,
+                ctypes.POINTER(ncclWindow_t),
+                ctypes.c_int,
+            ],
+        ),
+        # ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
+    ]
+
+    # class attribute to store the mapping from the path to the library
+    # to avoid loading the same library multiple times
+    path_to_library_cache: Dict[str, Any] = {}
+
+    # class attribute to store the mapping from library path
+    #  to the corresponding dictionary
+    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
+
+    def __init__(self, so_file: Optional[str] = None):
+
+        so_file = so_file or find_nccl_library()
+
+        try:
+            if so_file not in NCCLLibrary.path_to_dict_mapping:
+                lib = ctypes.CDLL(so_file)
+                NCCLLibrary.path_to_library_cache[so_file] = lib
+            self.lib = NCCLLibrary.path_to_library_cache[so_file]
+        except Exception as e:
+            logger.error(
+                "Failed to load NCCL library from %s ."
+                "It is expected if you are not running on NVIDIA/AMD GPUs."
+                "Otherwise, the nccl library might not exist, be corrupted "
+                "or it does not support the current platform %s."
+                "If you already have the library, please set the "
+                "environment variable SGLANG_NCCL_SO_PATH"
+                " to point to the correct nccl library path.",
+                so_file,
+                platform.platform(),
+            )
+            raise e
+
+        if so_file not in NCCLLibrary.path_to_dict_mapping:
+            _funcs: Dict[str, Any] = {}
+            exported_functions = NCCLLibrary.exported_functions
+            if hasattr(self.lib, "ncclCommWindowRegister"):
+                exported_functions.extend(NCCLLibrary.exported_functions_symm_mem)
+            for func in exported_functions:
+                f = getattr(self.lib, func.name)
+                f.restype = func.restype
+                f.argtypes = func.argtypes
+                _funcs[func.name] = f
+            NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
+        self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]
+
+    def ncclGetErrorString(self, result: ncclResult_t) -> str:
+        return self._funcs["ncclGetErrorString"](result).decode("utf-8")
+
+    def NCCL_CHECK(self, result: ncclResult_t) -> None:
+        if result != 0:
+            error_str = self.ncclGetErrorString(result)
+            raise RuntimeError(f"NCCL error: {error_str}")
+
+    def ncclGetRawVersion(self) -> int:
+        version = ctypes.c_int()
+        self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
+        # something like 21903
+        return version.value
+
+    def ncclGetVersion(self) -> str:
+        version_str = str(self.ncclGetRawVersion())
+        # something like 21903 --> "2.19.3"
+        major = version_str[0].lstrip("0")
+        minor = version_str[1:3].lstrip("0")
+        patch = version_str[3:].lstrip("0")
+        return f"{major}.{minor}.{patch}"
+
+    def ncclGetUniqueId(self) -> ncclUniqueId:
+        unique_id = ncclUniqueId()
+        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
+        return unique_id
+
+    def ncclCommInitRank(
+        self, world_size: int, unique_id: ncclUniqueId, rank: int
+    ) -> ncclComm_t:
+        comm = ncclComm_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommInitRank"](
+                ctypes.byref(comm), world_size, unique_id, rank
+            )
+        )
+        return comm
+
+    def ncclAllReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllReduce"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduce"](
+                sendbuff, recvbuff, count, datatype, op, root, comm, stream
+            )
+        )
+
+    def ncclReduceScatter(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduceScatter"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclAllGather(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # which is an aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllGather"](
+                sendbuff, recvbuff, count, datatype, comm, stream
+            )
+        )
+
+    def ncclSend(
+        self,
+        sendbuff: buffer_type,
+        count: int,
+        datatype: int,
+        dest: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
+        )
+
+    def ncclRecv(
+        self,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        src: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
+        )
+
+    def ncclBroadcast(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclBroadcast"](
+                sendbuff, recvbuff, count, datatype, root, comm, stream
+            )
+        )
+
+    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
+
+    def ncclCommWindowRegister(
+        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
+    ) -> ncclWindow_t:
+        window = ncclWindow_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommWindowRegister"](
+                comm, buff, size, ctypes.byref(window), win_flags
+            )
+        )
+        return window
+
+    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))
+
+    def ncclGroupStart(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupStart"]())
+
+    def ncclGroupEnd(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())
+
+
+__all__ = [
+    "NCCLLibrary",
+    "ncclDataTypeEnum",
+    "ncclRedOpTypeEnum",
+    "ncclUniqueId",
+    "ncclComm_t",
+    "cudaStream_t",
+    "buffer_type",
+]
diff --git a/python/sglang/srt/distributed/device_communicators/quick_all_reduce.py b/python/sglang/srt/distributed/device_communicators/quick_all_reduce.py
new file mode 100644
index 000000000..0113c432d
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/quick_all_reduce.py
@@ -0,0 +1,273 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+from enum import Enum
+from typing import Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt import _custom_ops as ops
+from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import (
+    is_full_nvlink,
+    is_weak_contiguous,
+)
+from sglang.srt.distributed.parallel_state import in_the_same_node_as
+from sglang.srt.utils import is_cuda, is_hip
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+
+try:
+    ops.qr_max_size()
+    quick_ar = True
+except Exception:
+    # For CPUs and CUDA
+    quick_ar = False
+
+
+def qr_rocm_arch_available():
+    if not _is_hip:
+        return False
+    try:
+        props = torch.cuda.get_device_properties(0)
+        gcn_arch = getattr(props, "gcnArchName", "")
+        supported_archs = ["gfx94", "gfx95"]
+        return any(gfx in gcn_arch for gfx in supported_archs)
+    except Exception as e:
+        logger.warning("Failed to determine ROCm for quick allreduce: %s", e)
+        return False
+
+
+class QuickReduceRegime(Enum):
+    FP = 0
+    INT8 = 1
+    INT6 = 2
+    INT4 = 3
+    NONE = 4
+
+
+MB = 1024 * 1024
+
+
+class QuickAllReduce:
+
+    _SUPPORTED_WORLD_SIZES = [2, 4, 8]
+    _SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
+    # The following data is based on kernel tests.
+    # In this order [FP, INT8, INT6, INT4].
+    _QR_MIN_SIZE = {
+        (torch.float16, 2): [1 * MB, 2 * MB, 2 * MB, 1 * MB],
+        (torch.float16, 4): [1 * MB, 16 * MB, 4 * MB, 2 * MB],
+        (torch.float16, 8): [16 * MB, 4 * MB, 4 * MB, 2 * MB],
+        (torch.bfloat16, 2): [2 * MB, 8 * MB, 8 * MB, 8 * MB],
+        (torch.bfloat16, 4): [8 * MB, 64 * MB, 64 * MB, 16 * MB],
+        (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
+    }
+
+    def __init__(
+        self, group: ProcessGroup, device: Union[int, str, torch.device]
+    ) -> None:
+        """
+        Custom allreduce provides non-destructive acceleration and is
+        available for CUDA and ROCm MI300 series.
+        Custom quick allreduce leverages quantization for further
+        acceleration on ROCm. It currently supports Q8, Q6, and Q4
+        quantization formats and FP(float16, bfloat16).
+        Quick allreduce is designed as a complement to custom allreduce.
+        Its initialization requires even stricter conditions.
+        Only the ROCm MI300 series is supported for quick allreduce at
+        this time.
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self.disabled = True
+        if not qr_rocm_arch_available():
+            logger.debug(
+                "Custom quick allreduce is only supported on ROCm MI300 series."
+            )
+            return
+
+        if not quick_ar:
+            # disable because of missing quick reduce library
+            # e.g. in a cuda environment
+            logger.info(
+                "Custom quick allreduce is disabled because "
+                "of missing custom quick allreduce library"
+            )
+            return
+
+        self.group = group
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "Custom quick allreduce should be attached to a non-NCCL group."
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom quick allreduce for
+            # multi-node case.
+            logger.warning(
+                "Custom quick allreduce is disabled because this "
+                "process group spans across nodes."
+            )
+            return
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        self.rank = rank
+        self.world_size = world_size
+        if world_size == 1:
+            # No need to initialize QuickReduce for single GPU case.
+            return
+
+        if world_size not in QuickAllReduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom quick allreduce is disabled due to an "
+                "unsupported world size: %d. Supported world sizes: %s.",
+                world_size,
+                str(QuickAllReduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(torch.cuda.device_count()))
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu")
+            for _ in range(self.world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom quick allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        if _is_cuda or _is_hip:
+            self.fully_connected = is_full_nvlink(physical_device_ids, self.world_size)
+        if self.world_size > 2 and not self.fully_connected:
+            logger.debug(
+                "Custom quick allreduce is disabled because it's not supported "
+                "on more than two PCIe-only GPUs. "
+            )
+            return
+
+        self.init_quick_all_reduce()
+
+    def init_quick_all_reduce(self):
+        # On RocM, bfloat16 kernels are slower than fp16
+        # due to slower match operations
+        # If environment variable is set to 1, we convert input to fp16
+        self.use_fp16_kernels = int(
+            os.environ.get("ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", 1)
+        )
+        regime_str = os.environ.get("ROCM_QUICK_REDUCE_QUANTIZATION", "NONE")
+        if regime_str not in QuickReduceRegime.__members__:
+            logger.warning(
+                "Custom quick allreduce:",
+                f"Invalid quantization level: {regime_str}. "
+                "Supported levels: "
+                f"{list(QuickReduceRegime.__members__.keys())}",
+            )
+            return
+
+        if regime_str == "NONE":
+            logger.debug(
+                "Custom quick allreduce is disabled based "
+                "on env variable "
+                "ROCM_QUICK_REDUCE_QUANTIZATION='NONE'"
+            )
+            return
+        self.qr_quant_level = QuickReduceRegime[regime_str]
+
+        # TODO: If the dtype is not bfloat16 or then float16,
+        # quickallreduce should not be created.
+
+        # ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB is specified in MB
+        qr_max_size = int(os.environ.get("ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", 0))
+        if qr_max_size > 0:
+            if qr_max_size < 1:
+                logger.info(
+                    "You should not set a max_size smaller than 1MB, which can "
+                    "lead to error or degradation to custom allreduce or rccl."
+                )
+            qr_max_size = qr_max_size * MB
+        # If qr_max_size is None, then 2GB is used by default.
+        self._ptr = ops.init_custom_qr(self.rank, self.world_size, qr_max_size)
+        self.qr_max_size = qr_max_size if qr_max_size > 0 else ops.qr_max_size()
+        self.create_shared_buffer()
+        self.disabled = False
+
+    def create_shared_buffer(self):
+        """
+        Creates a shared buffer for quickreduce.
+        Has to be called after init_custom_qr
+        """
+        handle = ops.qr_get_handle(self._ptr)
+        world_size = dist.get_world_size(group=self.group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=self.group)
+        ops.qr_open_handles(self._ptr, handles)
+
+    def should_quick_allreduce(self, inp: torch.Tensor):
+        """
+        Check if quickreduce is available
+        """
+        if self.disabled:
+            return False
+        if inp.dtype not in self._SUPPORTED_DTYPES:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom quick allreduce requires input byte size to be
+        # multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        dtype = inp.dtype
+        if self.use_fp16_kernels:
+            dtype = torch.float16
+        return (
+            inp_size <= self.qr_max_size
+            and inp_size
+            >= self._QR_MIN_SIZE[(dtype, self.world_size)][self.qr_quant_level.value]
+        )
+
+    def quick_all_reduce(self, inp: torch.Tensor, *, out: torch.Tensor = None):
+        """Performs an out-of-place custom quick all reduce."""
+        # quick allreduce doesn't require a separate graph mode,
+        # as QR uses static IPC buffer.
+        if out is None:
+            out = torch.empty_like(inp)
+        ops.qr_all_reduce(
+            self._ptr, inp, out, self.qr_quant_level.value, self.use_fp16_kernels
+        )
+        return out
+
+    def close(self):
+        if not self.disabled and getattr(self, "_ptr", None):
+            if ops is not None:
+                ops.qr_destroy(self._ptr)
+            self._ptr = 0
+            self.disabled = True
+
+    def __del__(self):
+        self.close()
diff --git a/python/sglang/srt/distributed/device_communicators/shm_broadcast.py b/python/sglang/srt/distributed/device_communicators/shm_broadcast.py
new file mode 100644
index 000000000..e5b59e7cc
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/shm_broadcast.py
@@ -0,0 +1,509 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/shm_broadcast.py
+
+import logging
+import os
+import pickle
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from multiprocessing import shared_memory
+from typing import List, Optional
+from unittest.mock import patch
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+from zmq import IPV6  # type: ignore
+from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
+
+from sglang.srt.utils import (
+    format_tcp_address,
+    get_ip,
+    get_open_port,
+    is_valid_ipv6_address,
+)
+
+# SGLANG_RINGBUFFER_WARNING_INTERVAL can be set to 60
+SGLANG_RINGBUFFER_WARNING_INTERVAL = int(
+    os.environ.get("SGLANG_RINGBUFFER_WARNING_INTERVAL", "60")
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ShmRingBuffer:
+
+    def __init__(
+        self,
+        n_reader: int,
+        max_chunk_bytes: int,
+        max_chunks: int,
+        name: Optional[str] = None,
+    ):
+        """
+        A shared memory ring buffer implementation for broadcast communication.
+        Essentially, it is a queue where only one will `enqueue` and multiple
+        will `dequeue`. The max size of each item, together with the max number
+        of items that can be stored in the buffer are known in advance.
+        In this case, we don't need to synchronize the access to
+         the buffer.
+
+        Buffer memory layout:
+                  data                                 metadata
+                    |                                      |
+                    | (current_idx)                        | (current_idx)
+                    v                                      v
+        +-------------------------------+----------------------------------------+
+        | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata |
+        +-------------------------------+----------------------------------------+
+        | max_chunks x max_chunk_bytes  | max_chunks x (1 + n_reader) bytes      |
+
+        metadata memory layout: each byte is a flag, the first byte is the written
+        flag, and the rest are reader flags. The flags are set to 0 by default.
+        +--------------+--------------+--------------+-----+--------------+
+        | written_flag | reader0_flag | reader1_flag | ... | readerN_flag |
+        +--------------+--------------+--------------+-----+--------------+
+
+        The state of metadata is as follows:
+
+        (case 1) 0???...???: the block is not written yet, cannot read, can write
+        (case 2) 1000...000: the block is just written, can read, cannot write
+        (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write
+        (case 4) 1111...111: the block is written and read by all readers, cannot read, can write
+
+        State transition for readers:
+
+        When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read.
+        Only after the caller finishes reading the block, the reader can mark the block as read.
+        Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0).
+
+        State transition for writer:
+
+        When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case
+        to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer
+        can reset the reader flags to 0, and mark the block as written (from 0 to 1).
+        NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct.
+
+        During creation, `name` is None and the buffer is created. We can pass the
+        created object to other processes by pickling it. The other processes will
+        get the name of the shared memory and open it, so that they can access the
+        same shared memory buffer.
+        """  # noqa
+        self.n_reader = n_reader
+        self.metadata_size = 1 + n_reader
+        self.max_chunk_bytes = max_chunk_bytes
+        self.max_chunks = max_chunks
+        self.total_bytes_of_buffer = (
+            self.max_chunk_bytes + self.metadata_size
+        ) * self.max_chunks
+        self.data_offset = 0
+        self.metadata_offset = self.max_chunk_bytes * self.max_chunks
+
+        if name is None:
+            # we are creating a buffer
+            self.is_creator = True
+            self.shared_memory = shared_memory.SharedMemory(
+                create=True, size=self.total_bytes_of_buffer
+            )
+            # initialize the metadata section to 0
+            with memoryview(
+                self.shared_memory.buf[self.metadata_offset :]
+            ) as metadata_buffer:
+                torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0)
+        else:
+            # we are opening an existing buffer
+            self.is_creator = False
+            # fix to https://stackoverflow.com/q/62748654/9191338
+            # Python incorrectly tracks shared memory even if it is not
+            # created by the process. The following patch is a workaround.
+            with patch(
+                "multiprocessing.resource_tracker.register",
+                lambda *args, **kwargs: None,
+            ):
+                try:
+                    self.shared_memory = shared_memory.SharedMemory(name=name)
+                    assert self.shared_memory.size == self.total_bytes_of_buffer
+                except FileNotFoundError:
+                    # we might deserialize the object in a different node
+                    # in this case, this object is not used,
+                    # and we should suppress the error
+                    pass
+
+    def __reduce__(self):
+        return (
+            self.__class__,
+            (
+                self.n_reader,
+                self.max_chunk_bytes,
+                self.max_chunks,
+                self.shared_memory.name,
+            ),
+        )
+
+    def __del__(self):
+        if hasattr(self, "shared_memory"):
+            self.shared_memory.close()
+            if self.is_creator:
+                self.shared_memory.unlink()
+
+    @contextmanager
+    def get_data(self, current_idx: int):
+        start = self.data_offset + current_idx * self.max_chunk_bytes
+        end = start + self.max_chunk_bytes
+        with memoryview(self.shared_memory.buf[start:end]) as buf:
+            yield buf
+
+    @contextmanager
+    def get_metadata(self, current_idx: int):
+        start = self.metadata_offset + current_idx * self.metadata_size
+        end = start + self.metadata_size
+        with memoryview(self.shared_memory.buf[start:end]) as buf:
+            yield buf
+
+
+@dataclass
+class Handle:
+    connect_ip: str
+    local_reader_ranks: List[int] = field(default_factory=list)
+
+    buffer: Optional[ShmRingBuffer] = None
+    local_subscribe_port: Optional[int] = None
+    remote_subscribe_port: Optional[int] = None
+
+
+class MessageQueue:
+
+    def __init__(
+        self,
+        n_reader,  # number of all readers
+        n_local_reader,  # number of local readers through shared memory
+        local_reader_ranks: Optional[List[int]] = None,
+        max_chunk_bytes: int = 1024 * 1024 * 10,
+        max_chunks: int = 10,
+        connect_ip: Optional[str] = None,
+    ):
+        if local_reader_ranks is None:
+            local_reader_ranks = list(range(n_local_reader))
+        else:
+            assert len(local_reader_ranks) == n_local_reader
+        self.n_local_reader = n_local_reader
+        n_remote_reader = n_reader - n_local_reader
+        self.n_remote_reader = n_remote_reader
+
+        if connect_ip is None:
+            connect_ip = get_ip() if n_remote_reader > 0 else "127.0.0.1"
+
+        context = Context()
+
+        if n_local_reader > 0:
+            # for local readers, we will:
+            # 1. create a shared memory ring buffer to communicate small data
+            # 2. create a publish-subscribe socket to communicate large data
+            self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes, max_chunks)
+
+            # XPUB is very similar to PUB,
+            # except that it can receive subscription messages
+            # to confirm the number of subscribers
+            self.local_socket = context.socket(XPUB)
+            # set the verbose option so that we can receive every subscription
+            # message. otherwise, we will only receive the first subscription
+            # see http://api.zeromq.org/3-3:zmq-setsockopt for more details
+            self.local_socket.setsockopt(XPUB_VERBOSE, True)
+            local_subscribe_port = get_open_port()
+            socket_addr = f"tcp://127.0.0.1:{local_subscribe_port}"
+            logger.debug("Binding to %s", socket_addr)
+            self.local_socket.bind(socket_addr)
+
+            self.current_idx = 0
+
+        else:
+            self.buffer = None  # type: ignore
+            local_subscribe_port = None
+            self.local_socket = None
+            self.current_idx = -1
+
+        if n_remote_reader > 0:
+            # for remote readers, we will:
+            # create a publish-subscribe socket to communicate large data
+            self.remote_socket = context.socket(XPUB)
+            self.remote_socket.setsockopt(XPUB_VERBOSE, True)
+            remote_subscribe_port = get_open_port()
+            if is_valid_ipv6_address(connect_ip):
+                self.remote_socket.setsockopt(IPV6, 1)
+            self.remote_socket.bind(
+                format_tcp_address(connect_ip, remote_subscribe_port)
+            )
+
+        else:
+            remote_subscribe_port = None
+            self.remote_socket = None
+
+        self._is_writer = True
+        self._is_local_reader = False
+        self.local_reader_rank = -1
+        # rank does not matter for remote readers
+        self._is_remote_reader = False
+
+        self.handle = Handle(
+            connect_ip=connect_ip,
+            local_reader_ranks=local_reader_ranks,
+            buffer=self.buffer,
+            local_subscribe_port=local_subscribe_port,
+            remote_subscribe_port=remote_subscribe_port,
+        )
+
+        logger.debug("Message queue communication handle: %s", self.handle)
+
+    def export_handle(self) -> Handle:
+        return self.handle
+
+    @staticmethod
+    def create_from_handle(handle: Handle, rank) -> "MessageQueue":
+        self = MessageQueue.__new__(MessageQueue)
+        self.handle = handle
+        self._is_writer = False
+
+        context = Context()
+
+        if rank in handle.local_reader_ranks:
+            assert handle.buffer is not None
+            self.buffer = handle.buffer
+            self.current_idx = 0
+            self.local_reader_rank = handle.local_reader_ranks.index(rank)
+            self._is_local_reader = True
+            self._is_remote_reader = False
+
+            self.local_socket = context.socket(SUB)
+            self.local_socket.setsockopt_string(SUBSCRIBE, "")
+            socket_addr = f"tcp://127.0.0.1:{handle.local_subscribe_port}"
+            logger.debug("Connecting to %s", socket_addr)
+            self.local_socket.connect(socket_addr)
+
+            self.remote_socket = None
+        else:
+            self.buffer = None  # type: ignore
+            self.current_idx = -1
+            self.local_reader_rank = -1
+            self._is_local_reader = False
+            self._is_remote_reader = True
+
+            self.local_socket = None
+
+            self.remote_socket = context.socket(SUB)
+            self.remote_socket.setsockopt_string(SUBSCRIBE, "")
+            if is_valid_ipv6_address(handle.connect_ip):
+                self.remote_socket.setsockopt(IPV6, 1)
+            socket_addr = format_tcp_address(
+                handle.connect_ip, handle.remote_subscribe_port
+            )
+            logger.debug("Connecting to %s", socket_addr)
+            self.remote_socket.connect(socket_addr)
+
+        return self
+
+    def wait_until_ready(self):
+        """This is a collective operation. All processes (including the
+        readers and the writer) should call this function.
+        """
+        if self._is_writer:
+            # wait for all readers to connect
+
+            # local readers
+            for i in range(self.n_local_reader):
+                # wait for subscription messages from all local readers
+                self.local_socket.recv()
+            if self.n_local_reader > 0:
+                # send a message to all local readers
+                # to make sure the publish channel is working
+                self.local_socket.send(b"READY")
+
+            # remote readers
+            for i in range(self.n_remote_reader):
+                # wait for subscription messages from all remote readers
+                self.remote_socket.recv()
+            if self.n_remote_reader > 0:
+                # send a message to all remote readers
+                # to make sure the publish channel is working
+                self.remote_socket.send(b"READY")
+        elif self._is_local_reader:
+            # wait for the writer to send a message
+            recv = self.local_socket.recv()
+            assert recv == b"READY"
+        elif self._is_remote_reader:
+            # wait for the writer to send a message
+            recv = self.remote_socket.recv()
+            assert recv == b"READY"
+
+    @contextmanager
+    def acquire_write(self):
+        assert self._is_writer, "Only writers can acquire write"
+        start_time = time.monotonic()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                read_count = sum(metadata_buffer[1:])
+                written_flag = metadata_buffer[0]
+                if written_flag and read_count != self.buffer.n_reader:
+                    # this block is written and not read by all readers
+                    # for writers, `self.current_idx` is the next block to write
+                    # if this block is not ready to write,
+                    # we need to wait until it is read by all readers
+
+                    # Release the processor to other threads
+                    os.sched_yield()
+
+                    # if we wait for a long time, we should warn the user
+                    if (
+                        time.monotonic() - start_time
+                        > SGLANG_RINGBUFFER_WARNING_INTERVAL * n_warning
+                    ):
+                        logger.warning(
+                            "No available block found in %s second. ",
+                            SGLANG_RINGBUFFER_WARNING_INTERVAL,
+                        )
+                        n_warning += 1
+
+                    continue
+                # found a block that is either
+                # (1) not written
+                # (2) read by all readers
+
+                # mark the block as not written
+                metadata_buffer[0] = 0
+                # let caller write to the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has written to the buffer
+                # NOTE: order is important here
+                # first set the read flags to 0
+                # then set the written flag to 1
+                # otherwise, the readers may think they already read the block
+                for i in range(1, self.buffer.n_reader + 1):
+                    # set read flag to 0, meaning it is not read yet
+                    metadata_buffer[i] = 0
+                # mark the block as written
+                metadata_buffer[0] = 1
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
+                break
+
+    @contextmanager
+    def acquire_read(self):
+        assert self._is_local_reader, "Only readers can acquire read"
+        start_time = time.monotonic()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                read_flag = metadata_buffer[self.local_reader_rank + 1]
+                written_flag = metadata_buffer[0]
+                if not written_flag or read_flag:
+                    # this block is either
+                    # (1) not written
+                    # (2) already read by this reader
+
+                    # for readers, `self.current_idx` is the next block to read
+                    # if this block is not ready,
+                    # we need to wait until it is written
+
+                    # Release the processor to other threads
+                    os.sched_yield()
+
+                    # if we wait for a long time, we should warn the user
+                    if (
+                        time.monotonic() - start_time
+                        > SGLANG_RINGBUFFER_WARNING_INTERVAL * n_warning
+                    ):
+                        logger.warning(
+                            "No available block found in %s second. ",
+                            SGLANG_RINGBUFFER_WARNING_INTERVAL,
+                        )
+                        n_warning += 1
+
+                    continue
+                # found a block that is not read by this reader
+                # let caller read from the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has read from the buffer
+                # set the read flag
+                metadata_buffer[self.local_reader_rank + 1] = 1
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
+                break
+
+    def enqueue(self, obj):
+        assert self._is_writer, "Only writers can enqueue"
+        serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+        if self.n_local_reader > 0:
+            if len(serialized_obj) >= self.buffer.max_chunk_bytes:
+                with self.acquire_write() as buf:
+                    buf[0] = 1  # overflow
+                self.local_socket.send(serialized_obj)
+            else:
+                with self.acquire_write() as buf:
+                    buf[0] = 0  # not overflow
+                    buf[1 : len(serialized_obj) + 1] = serialized_obj
+        if self.n_remote_reader > 0:
+            self.remote_socket.send(serialized_obj)
+
+    def dequeue(self):
+        if self._is_local_reader:
+            with self.acquire_read() as buf:
+                overflow = buf[0] == 1
+                if not overflow:
+                    # no need to know the size of serialized object
+                    # pickle format contains the size information internally
+                    # see https://docs.python.org/3/library/pickle.html
+                    obj = pickle.loads(buf[1:])
+            if overflow:
+                recv = self.local_socket.recv()
+                obj = pickle.loads(recv)
+        elif self._is_remote_reader:
+            recv = self.remote_socket.recv()
+            obj = pickle.loads(recv)
+        else:
+            raise RuntimeError("Only readers can dequeue")
+        return obj
+
+    def broadcast_object(self, obj=None):
+        if self._is_writer:
+            self.enqueue(obj)
+            return obj
+        else:
+            return self.dequeue()
+
+    @staticmethod
+    def create_from_process_group(
+        pg: ProcessGroup, max_chunk_bytes, max_chunks, writer_rank=0
+    ) -> "MessageQueue":
+        group_rank = dist.get_rank(pg)
+        group_world_size = dist.get_world_size(pg)
+        global_ranks = dist.get_process_group_ranks(pg)
+
+        from sglang.srt.distributed.parallel_state import in_the_same_node_as
+
+        status = in_the_same_node_as(pg, source_rank=writer_rank)
+        same_node_ranks = [i for i, s in enumerate(status) if s]
+        n_reader = group_world_size - 1
+        n_local_reader = len(same_node_ranks) - 1
+        local_reader_ranks = [i for i in same_node_ranks if i != writer_rank]
+        buffer_io: MessageQueue
+        if group_rank == writer_rank:
+            buffer_io = MessageQueue(
+                n_reader=n_reader,
+                n_local_reader=n_local_reader,
+                local_reader_ranks=local_reader_ranks,
+                max_chunk_bytes=max_chunk_bytes,
+                max_chunks=max_chunks,
+            )
+            handle = buffer_io.export_handle()
+            dist.broadcast_object_list(
+                [handle], src=global_ranks[writer_rank], group=pg
+            )
+        else:
+            recv = [None]
+            dist.broadcast_object_list(recv, src=global_ranks[writer_rank], group=pg)
+            handle = recv[0]  # type: ignore
+            buffer_io = MessageQueue.create_from_handle(handle, group_rank)
+        buffer_io.wait_until_ready()
+        return buffer_io
diff --git a/python/sglang/srt/distributed/device_communicators/xpu_communicator.py b/python/sglang/srt/distributed/device_communicators/xpu_communicator.py
new file mode 100644
index 000000000..532279f70
--- /dev/null
+++ b/python/sglang/srt/distributed/device_communicators/xpu_communicator.py
@@ -0,0 +1,48 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/xpu_communicator.py
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_xpu
+
+
+class XpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_xpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def gather(
+        self, input_: torch.Tensor, rank_in_group: int, dst: int = 0, dim: int = -1
+    ):
+        # For xpu path, gather doesn't work properly together with ray
+        # cluster so we use all_gather instead for now.
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (self.world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
+        # All-gather.
+        torch.distributed.all_gather_into_tensor(
+            output_tensor, input_, group=self.group
+        )
+        if rank_in_group == dst:
+            # Reshape
+            output_tensor = output_tensor.movedim(0, dim)
+            output_tensor = output_tensor.reshape(
+                input_size[:dim]
+                + (self.world_size * input_size[dim],)
+                + input_size[dim + 1 :]
+            )
+        else:
+            output_tensor = None
+        return output_tensor
diff --git a/python/sglang/srt/distributed/naive_distributed.py b/python/sglang/srt/distributed/naive_distributed.py
new file mode 100644
index 000000000..61165d90c
--- /dev/null
+++ b/python/sglang/srt/distributed/naive_distributed.py
@@ -0,0 +1,112 @@
+import base64
+import os
+import pickle
+import time
+from pathlib import Path
+from typing import Any, List, Optional
+
+import torch
+
+from sglang.srt.utils import MultiprocessingSerializer
+
+
+class NaiveDistributed:
+    def __init__(self, rank: int, world_size: int, rendezvous: str):
+        self._rank = rank
+        self._world_size = world_size
+        self._operation_index = 0
+        self._directory = Path(rendezvous)
+        self._directory.mkdir(parents=True, exist_ok=True)
+        assert 0 <= rank < world_size
+
+        # both barrier to be safe, and as a sanity check
+        self.barrier()
+
+    def get_rank(self):
+        return self._rank
+
+    def get_world_size(self):
+        return self._world_size
+
+    def scatter(
+        self, tensor: torch.Tensor, scatter_list: List[torch.Tensor], src: int = 0
+    ):
+        if self._rank == src:
+            assert len(scatter_list) == self._world_size
+        else:
+            assert scatter_list is None
+
+        gathered_objects = self.all_gather_object(
+            dict(
+                serialized_scatter_list=[
+                    (
+                        None
+                        if item_rank == src
+                        else MultiprocessingSerializer.serialize(item)
+                    )
+                    for item_rank, item in enumerate(scatter_list)
+                ]
+            )
+            if self._rank == src
+            else dict()
+        )
+
+        remote_serialized_tensor = gathered_objects[src]["serialized_scatter_list"][
+            self._rank
+        ]
+        if self._rank == src:
+            assert remote_serialized_tensor is None
+            remote_tensor = scatter_list[self._rank]
+        else:
+            remote_tensor = MultiprocessingSerializer.deserialize(
+                remote_serialized_tensor
+            )
+        tensor.copy_(remote_tensor)
+
+        # avoid src tensor be deleted too early
+        self.barrier()
+
+    def all_gather_object(self, obj: Any) -> List[Any]:
+        self._operation_index += 1
+
+        text_postfix = "\n"
+
+        def _get_path(interesting_rank: int):
+            return (
+                self._directory
+                / f"rank{interesting_rank}_op{self._operation_index}.txt"
+            )
+
+        _get_path(self._rank).write_text(
+            base64.b64encode(pickle.dumps(obj)).decode("utf-8") + text_postfix
+        )
+
+        def _read_one(interesting_rank: int):
+            p = _get_path(interesting_rank)
+            while True:
+                if p.exists() and (text := p.read_text()).endswith(text_postfix):
+                    return pickle.loads(base64.b64decode(text[: -len(text_postfix)]))
+                time.sleep(0.001)
+
+        return [
+            _read_one(interesting_rank) for interesting_rank in range(self._world_size)
+        ]
+
+    def barrier(self):
+        actual_objs = self.all_gather_object(self._rank)
+        assert actual_objs == list(range(self._world_size)), f"{actual_objs=}"
+
+
+# Can have multi instances if needed
+_instance: Optional[NaiveDistributed] = None
+
+
+def get_naive_distributed():
+    assert _instance is not None
+    return _instance
+
+
+def set_naive_distributed(instance: NaiveDistributed):
+    global _instance
+    assert _instance is None
+    _instance = instance
diff --git a/python/sglang/srt/distributed/parallel_state.py b/python/sglang/srt/distributed/parallel_state.py
new file mode 100644
index 000000000..4f410570d
--- /dev/null
+++ b/python/sglang/srt/distributed/parallel_state.py
@@ -0,0 +1,1776 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/parallel_state.py
+
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+"""vLLM distributed state.
+It takes over the control of the distributed environment from PyTorch.
+The typical workflow is:
+
+- call `init_distributed_environment` to initialize the distributed environment.
+- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to
+ initialize the model parallel groups.
+
+- any code dealing with the distributed stuff
+
+- call `destroy_model_parallel` to destroy the model parallel groups.
+- call `destroy_distributed_environment` to destroy the distributed environment.
+
+If you only need to use the distributed environment without model/pipeline
+ parallelism, you can skip the model parallel initialization and destruction
+ steps.
+"""
+import contextlib
+import gc
+import logging
+import os
+import pickle
+import weakref
+from collections import namedtuple
+from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
+from datetime import timedelta
+from multiprocessing import shared_memory
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from unittest.mock import patch
+
+import torch
+import torch.distributed
+from torch.distributed import Backend, ProcessGroup
+
+from sglang.srt.utils import (
+    direct_register_custom_op,
+    get_bool_env_var,
+    get_int_env_var,
+    is_cpu,
+    is_cuda_alike,
+    is_hip,
+    is_npu,
+    is_shm_available,
+    supports_custom_op,
+)
+
+_is_npu = is_npu()
+_is_cpu = is_cpu()
+
+IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")
+
+
+@dataclass
+class GraphCaptureContext:
+    stream: torch.cuda.Stream if not _is_npu else torch.npu.Stream
+
+
+TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
+
+# use int value instead of ReduceOp.SUM to support torch compile
+REDUCE_OP_SUM = int(torch.distributed.ReduceOp.SUM)
+
+
+def _split_tensor_dict(
+    tensor_dict: Dict[str, Union[torch.Tensor, Any]]
+) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
+    """Split the tensor dictionary into two parts:
+    1. A list of (key, value) pairs. If the value is a tensor, it is replaced
+         by its metadata.
+    2. A list of tensors.
+    """
+    metadata_list: List[Tuple[str, Any]] = []
+    tensor_list: List[torch.Tensor] = []
+    for key, value in tensor_dict.items():
+        if isinstance(value, torch.Tensor):
+            # Note: we cannot use `value.device` here,
+            # because it contains not only the device type but also the device
+            # index (e.g. "cuda:0"). We only need the device type.
+            # receiving side will set the device index.
+            device = value.device.type
+            metadata_list.append(
+                (key, TensorMetadata(device, value.dtype, value.size()))
+            )
+            tensor_list.append(value)
+        else:
+            metadata_list.append((key, value))
+    return metadata_list, tensor_list
+
+
+_group_name_counter: Dict[str, int] = {}
+
+
+def _get_unique_name(name: str) -> str:
+    """Get a unique name for the group.
+    Example:
+    _get_unique_name("tp") -> "tp:0"
+    _get_unique_name("tp") -> "tp:1"
+    """
+    if name not in _group_name_counter:
+        _group_name_counter[name] = 0
+    newname = f"{name}:{_group_name_counter[name]}"
+    _group_name_counter[name] += 1
+    return newname
+
+
+_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {}
+
+
+def _register_group(group: "GroupCoordinator") -> None:
+    _groups[group.unique_name] = weakref.ref(group)
+
+
+if supports_custom_op():
+
+    def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
+        assert group_name in _groups, f"Group {group_name} is not found."
+        group = _groups[group_name]()
+        if group is None:
+            raise ValueError(f"Group {group_name} is destroyed.")
+        group._all_reduce_in_place(tensor)
+
+    def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None:
+        return
+
+    direct_register_custom_op(
+        op_name="inplace_all_reduce",
+        op_func=inplace_all_reduce,
+        mutates_args=["tensor"],
+        fake_impl=inplace_all_reduce_fake,
+    )
+
+    def outplace_all_reduce(
+        tensor: torch.Tensor, group_name: str, outplace_all_reduce_method: str
+    ) -> torch.Tensor:
+        assert group_name in _groups, f"Group {group_name} is not found."
+        group = _groups[group_name]()
+        if group is None:
+            raise ValueError(f"Group {group_name} is destroyed.")
+        return group._all_reduce_out_place(tensor, outplace_all_reduce_method)
+
+    def outplace_all_reduce_fake(
+        tensor: torch.Tensor, group_name: str, outplace_all_reduce_method: str
+    ) -> torch.Tensor:
+        return torch.empty_like(tensor)
+
+    direct_register_custom_op(
+        op_name="outplace_all_reduce",
+        op_func=outplace_all_reduce,
+        mutates_args=[],
+        fake_impl=outplace_all_reduce_fake,
+    )
+
+    def reg_all_gather_into_tensor(
+        output: torch.Tensor, input: torch.Tensor, group_name: str
+    ) -> None:
+        assert group_name in _groups, f"Group {group_name} is not found."
+        group = _groups[group_name]()
+        if group is None:
+            raise ValueError(f"Group {group_name} is destroyed.")
+        group._all_gather_into_tensor(output, input)
+
+    def reg_all_gather_into_tensor_fake(
+        output: torch.Tensor, input: torch.Tensor, group_name: str
+    ) -> None:
+        pass
+
+    direct_register_custom_op(
+        op_name="reg_all_gather_into_tensor",
+        op_func=reg_all_gather_into_tensor,
+        mutates_args=["output"],
+        fake_impl=reg_all_gather_into_tensor_fake,
+    )
+
+
+class GroupCoordinator:
+    """
+    PyTorch ProcessGroup wrapper for a group of processes.
+    PyTorch ProcessGroup is bound to one specific communication backend,
+        e.g. NCCL, Gloo, MPI, etc.
+    GroupCoordinator takes charge of all the communication operations among
+        the processes in the group. It can route the communication to
+        a specific implementation (e.g. switch allreduce implementation
+        based on the tensor size and cuda graph mode).
+    """
+
+    # available attributes:
+    rank: int  # global rank
+    ranks: List[int]  # global ranks in the group
+    world_size: int  # size of the group
+    # difference between `local_rank` and `rank_in_group`:
+    # if we have a group of size 4 across two nodes:
+    # Process | Node | Rank | Local Rank | Rank in Group
+    #   0     |   0  |  0   |     0      |       0
+    #   1     |   0  |  1   |     1      |       1
+    #   2     |   1  |  2   |     0      |       2
+    #   3     |   1  |  3   |     1      |       3
+    local_rank: int  # local rank used to assign devices
+    rank_in_group: int  # rank inside the group
+    cpu_group: ProcessGroup  # group for CPU communication
+    device_group: ProcessGroup  # group for device communication
+    use_pynccl: bool  # a hint of whether to use PyNccl
+    use_pymscclpp: bool  # a hint of whether to use PyMsccl
+    use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
+    use_message_queue_broadcaster: (
+        bool  # a hint of whether to use message queue broadcaster
+    )
+    # communicators are only created for world size > 1
+    pynccl_comm: Optional[Any]  # PyNccl communicator
+    ca_comm: Optional[Any]  # Custom allreduce communicator
+    mq_broadcaster: Optional[Any]  # shared memory broadcaster
+
+    def __init__(
+        self,
+        group_ranks: List[List[int]],
+        local_rank: int,
+        torch_distributed_backend: Union[str, Backend],
+        use_pynccl: bool,
+        use_pymscclpp: bool,
+        use_custom_allreduce: bool,
+        use_hpu_communicator: bool,
+        use_xpu_communicator: bool,
+        use_npu_communicator: bool,
+        use_message_queue_broadcaster: bool = False,
+        group_name: Optional[str] = None,
+    ):
+        # Set group info
+        group_name = group_name or "anonymous"
+        self.unique_name = _get_unique_name(group_name)
+        _register_group(self)
+
+        # Set rank info
+        self.rank = torch.distributed.get_rank()
+        self.local_rank = local_rank
+        self.device_group = None
+        self.cpu_group = None
+        self.local_size = get_int_env_var("LOCAL_SIZE", 0)
+
+        for ranks in group_ranks:
+            device_group = torch.distributed.new_group(
+                ranks, backend=torch_distributed_backend
+            )
+            # a group with `gloo` backend, to allow direct coordination between
+            # processes through the CPU.
+            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+            if self.rank in ranks:
+                self.ranks = ranks
+                self.world_size = len(ranks)
+                self.rank_in_group = ranks.index(self.rank)
+                self.device_group = device_group
+                self.cpu_group = cpu_group
+
+        assert self.cpu_group is not None
+        assert self.device_group is not None
+
+        device_id = 0 if IS_ONE_DEVICE_PER_PROCESS else local_rank
+        if is_cuda_alike():
+            self.device = torch.device(f"cuda:{device_id}")
+        elif _is_npu:
+            self.device = torch.device(f"npu:{device_id}")
+        else:
+            self.device = torch.device("cpu")
+        self.device_module = torch.get_device_module(self.device)
+
+        # Import communicators
+        self.use_pynccl = use_pynccl
+        self.use_pymscclpp = use_pymscclpp
+        self.use_custom_allreduce = use_custom_allreduce
+        self.use_hpu_communicator = use_hpu_communicator
+        self.use_xpu_communicator = use_xpu_communicator
+        self.use_npu_communicator = use_npu_communicator
+        self.use_message_queue_broadcaster = use_message_queue_broadcaster
+
+        # lazy import to avoid documentation build error
+        from sglang.srt.distributed.device_communicators.custom_all_reduce import (
+            CustomAllreduce,
+        )
+        from sglang.srt.distributed.device_communicators.pymscclpp import (
+            PyMscclppCommunicator,
+        )
+        from sglang.srt.distributed.device_communicators.pynccl import (
+            PyNcclCommunicator,
+        )
+
+        if is_hip():
+            from sglang.srt.distributed.device_communicators.quick_all_reduce import (
+                QuickAllReduce,
+                qr_rocm_arch_available,
+            )
+
+        self.pynccl_comm: Optional[PyNcclCommunicator] = None
+        if use_pynccl and self.world_size > 1:
+            self.pynccl_comm = PyNcclCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
+        self.pymscclpp_comm: Optional[PyMscclppCommunicator] = None
+        if use_pymscclpp and self.world_size > 1:
+            self.pymscclpp_comm = PyMscclppCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
+        self.ca_comm: Optional[CustomAllreduce] = None
+        self.qr_comm: Optional[QuickAllReduce] = None
+        if use_custom_allreduce and self.world_size > 1:
+            # Initialize a custom fast all-reduce implementation.
+            try:
+                self.ca_comm = CustomAllreduce(
+                    group=self.cpu_group,
+                    device=self.device,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Setup Custom allreduce failed with {e}. To silence this "
+                    "warning, specify --disable-custom-all-reduce explicitly."
+                )
+            if is_hip():
+                try:
+                    # Initialize a custom quick all-reduce implementation for AMD
+                    # when rocm >= gfx942. Quick reduce is designed as a
+                    # complement to custom allreduce.
+                    # Based on quickreduce (https://github.com/mk1-project/quickreduce).
+                    if qr_rocm_arch_available():
+                        self.qr_comm = QuickAllReduce(
+                            group=self.cpu_group, device=self.device
+                        )
+                except Exception as e:
+                    logger.warning(f"Failed to initialize QuickAllReduce: {e}")
+
+        # Create communicator for other hardware backends
+        from sglang.srt.distributed.device_communicators.hpu_communicator import (
+            HpuCommunicator,
+        )
+        from sglang.srt.distributed.device_communicators.npu_communicator import (
+            NpuCommunicator,
+        )
+        from sglang.srt.distributed.device_communicators.xpu_communicator import (
+            XpuCommunicator,
+        )
+
+        self.hpu_communicator: Optional[HpuCommunicator] = None
+        if use_hpu_communicator and self.world_size > 1:
+            self.hpu_communicator = HpuCommunicator(group=self.device_group)
+
+        self.xpu_communicator: Optional[XpuCommunicator] = None
+        if use_xpu_communicator and self.world_size > 1:
+            self.xpu_communicator = XpuCommunicator(group=self.device_group)
+
+        self.npu_communicator: Optional[NpuCommunicator] = None
+        if use_npu_communicator and self.world_size > 1:
+            self.npu_communicator = NpuCommunicator(group=self.device_group)
+
+        # Create message queue
+        from sglang.srt.distributed.device_communicators.shm_broadcast import (
+            MessageQueue,
+        )
+
+        self.mq_broadcaster: Optional[MessageQueue] = None
+        if use_message_queue_broadcaster and self.world_size > 1:
+            self.mq_broadcaster = MessageQueue.create_from_process_group(
+                self.cpu_group, 1 << 22, 6
+            )
+
+    def __repr__(self):
+        return (
+            f"ranks={self.ranks} rank={self.rank} local_rank={self.local_rank} use_pynccl={self.use_pynccl} "
+            f"device_group={self.device_group} cpu_group={self.cpu_group} unique_name={self.unique_name} "
+            f"world_size={self.world_size} rank_in_group={self.rank_in_group}"
+        )
+
+    @property
+    def first_rank(self):
+        """Return the global rank of the first process in the group"""
+        return self.ranks[0]
+
+    @property
+    def last_rank(self):
+        """Return the global rank of the last process in the group"""
+        return self.ranks[-1]
+
+    @property
+    def is_first_rank(self):
+        """Return whether the caller is the first process in the group"""
+        return self.rank == self.first_rank
+
+    @property
+    def is_last_rank(self):
+        """Return whether the caller is the last process in the group"""
+        return self.rank == self.last_rank
+
+    @property
+    def next_rank(self):
+        """Return the global rank of the process that follows the caller"""
+        rank_in_group = self.rank_in_group
+        world_size = self.world_size
+        return self.ranks[(rank_in_group + 1) % world_size]
+
+    @property
+    def prev_rank(self):
+        """Return the global rank of the process that precedes the caller"""
+        rank_in_group = self.rank_in_group
+        world_size = self.world_size
+        return self.ranks[(rank_in_group - 1) % world_size]
+
+    @contextmanager
+    def graph_capture(
+        self, graph_capture_context: Optional[GraphCaptureContext] = None
+    ):
+        if graph_capture_context is None:
+            stream = self.device_module.Stream()
+            graph_capture_context = GraphCaptureContext(stream)
+        else:
+            stream = graph_capture_context.stream
+        # We don't need the context of custom quick allreduce because the ipc access
+        # is already collected in init() and we can capture the quick allreduce directly.
+        ca_comm = self.ca_comm
+        maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture()
+
+        # ensure all initialization operations complete before attempting to
+        # capture the graph on another stream
+        curr_stream = self.device_module.current_stream()
+        if curr_stream != stream:
+            stream.wait_stream(curr_stream)
+
+        with self.device_module.stream(stream), maybe_ca_context:
+            # In graph mode, we have to be very careful about the collective
+            # operations. The current status is:
+            #     allreduce \ Mode   |  Eager  |  Graph  |
+            # --------------------------------------------
+            # quick allreduce        | enabled | enabled |
+            # custom allreduce       | enabled | enabled |
+            # PyNccl                 | disabled| enabled |
+            # PyMscclpp              | disabled| enabled |
+            # torch.distributed      | enabled | disabled|
+            #
+            # Note: When custom quick allreduce is enabled, a runtime check
+            #  will be performed. If the tensor size is too small, it will
+            #  automatically fall back to the next available option.
+            # Note that custom allreduce will have a runtime check, if the
+            #  tensor size is too large, it will fallback to the next
+            #  available option.
+            # Note that the PyMsccl needs to register the tensor in ahead,
+            #  which will introduce large overhead in the eager case,
+            #  therefore it is only supported in the graph case.
+            # In summary: We select the appropriate allreduce method for
+            #  each mode based on the algorithm order in the table and
+            #  their usage conditions.
+            pynccl_comm = self.pynccl_comm
+            maybe_pynccl_context: Any
+            if not pynccl_comm:
+                maybe_pynccl_context = nullcontext()
+            else:
+                maybe_pynccl_context = pynccl_comm.change_state(
+                    enable=True, stream=torch.cuda.current_stream()
+                )
+
+            pymscclpp_comm = self.pymscclpp_comm
+            maybe_pymscclpp_context: Any
+            if not pymscclpp_comm:
+                maybe_pymscclpp_context = nullcontext()
+            else:
+                maybe_pymscclpp_context = pymscclpp_comm.change_state(enable=True)
+            with maybe_pynccl_context, maybe_pymscclpp_context:
+                yield graph_capture_context
+
+    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
+        """
+        User-facing all-reduce function before we actually call the
+        all-reduce operation.
+
+        We need this because Dynamo does not support passing an arbitrary
+        object (`self` in this case) to a custom op. We need to pass the
+         group name as a string, and then look up the group coordinator from
+         the group name, dispatch the all-reduce operation to the group
+         coordinator.
+
+        In addition, PyTorch custom ops do not support mutation or returning
+        a new tensor in the same op. So we need to figure out if the op is
+        in-place or out-of-place ahead of time.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return input_
+
+        if input_.is_cpu:
+            if is_shm_available(input_.dtype, self.world_size, self.local_size):
+                torch.ops.sgl_kernel.shm_allreduce(input_, REDUCE_OP_SUM)
+            else:
+                torch.distributed.all_reduce(input_, group=self.device_group)
+            return input_
+
+        if not supports_custom_op():
+            self._all_reduce_in_place(input_)
+            return input_
+
+        if self.hpu_communicator is not None and not self.hpu_communicator.disabled:
+            return self.hpu_communicator.all_reduce(input_)
+
+        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
+            return self.xpu_communicator.all_reduce(input_)
+
+        if self.npu_communicator is not None and not self.npu_communicator.disabled:
+            return self.npu_communicator.all_reduce(input_)
+
+        if (
+            self.pynccl_comm is not None
+            and hasattr(input_, "symmetric_memory")
+            and input_.symmetric_memory
+        ):
+            with self.pynccl_comm.change_state(
+                enable=True, stream=torch.cuda.current_stream()
+            ):
+                self.pynccl_comm.all_reduce(input_)
+                return input_
+
+        outplace_all_reduce_method = None
+        if (
+            self.qr_comm is not None
+            and not self.qr_comm.disabled
+            and self.qr_comm.should_quick_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "qr"
+        elif (
+            self.ca_comm is not None
+            and not self.ca_comm.disabled
+            and self.ca_comm.should_custom_ar(input_)
+        ):
+            outplace_all_reduce_method = "ca"
+        elif (
+            self.pymscclpp_comm is not None
+            and not self.pymscclpp_comm.disabled
+            and self.pymscclpp_comm.should_mscclpp_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "pymscclpp"
+        if outplace_all_reduce_method is not None:
+            return torch.ops.sglang.outplace_all_reduce(
+                input_,
+                group_name=self.unique_name,
+                outplace_all_reduce_method=outplace_all_reduce_method,
+            )
+        else:
+            torch.ops.sglang.inplace_all_reduce(input_, group_name=self.unique_name)
+            return input_
+
+    def _all_reduce_out_place(
+        self, input_: torch.Tensor, outplace_all_reduce_method: str
+    ) -> torch.Tensor:
+        qr_comm = self.qr_comm
+        ca_comm = self.ca_comm
+        pymscclpp_comm = self.pymscclpp_comm
+        assert any([qr_comm, ca_comm, pymscclpp_comm])
+        if outplace_all_reduce_method == "qr":
+            assert not qr_comm.disabled
+            out = qr_comm.quick_all_reduce(input_)
+        elif outplace_all_reduce_method == "ca":
+            assert not ca_comm.disabled
+            out = ca_comm.custom_all_reduce(input_)
+        else:
+            assert not pymscclpp_comm.disabled
+            out = pymscclpp_comm.all_reduce(input_)
+        assert out is not None
+        return out
+
+    def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.all_reduce(input_)
+        else:
+            torch.distributed.all_reduce(input_, group=self.device_group)
+
+    def reduce_scatter_tensor(
+        self,
+        output: torch.Tensor,
+        input: torch.Tensor,
+    ) -> None:
+        # TODO(ch-wan): support other backends
+        torch.distributed.reduce_scatter_tensor(output, input, group=self.device_group)
+        return output
+
+    def reduce_scatter(
+        self,
+        output: torch.Tensor,
+        input_list: List[torch.Tensor],
+    ) -> None:
+        # TODO(ch-wan): support other backends
+        torch.distributed.reduce_scatter(output, input_list, group=self.device_group)
+        return output
+
+    def reduce_scatterv(
+        self,
+        input_: torch.Tensor,
+        output: Optional[torch.Tensor] = None,
+        sizes: Optional[List[int]] = None,
+    ) -> torch.Tensor:
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+
+        with pynccl_comm.change_state(enable=True, stream=torch.cuda.current_stream()):
+            assert (
+                pynccl_comm is not None and not pynccl_comm.disabled
+            ), "pynccl is required for reduce_scatterv"
+
+            if sizes is not None:
+                assert len(sizes) == world_size
+                assert input_.shape[0] == sum(sizes)
+                chunk_size = sizes[self.rank_in_group]
+            else:
+                assert input_.shape[0] % world_size == 0
+                chunk_size = input_.shape[0] // world_size
+            output_shape = (chunk_size,) + input_.shape[1:]
+
+            if output is None:
+                output = torch.empty(
+                    output_shape, dtype=input_.dtype, device=input_.device
+                )
+            else:
+                assert output.shape == output_shape
+
+            pynccl_comm.reduce_scatter(output, input_, sizes=sizes)
+            return output
+
+    def _all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.all_gather(output, input)
+        else:
+            torch.distributed.all_gather_into_tensor(
+                output, input, group=self.device_group
+            )
+
+    def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
+        if _is_npu or not supports_custom_op():
+            self._all_gather_into_tensor(output, input)
+        else:
+            torch.ops.sglang.reg_all_gather_into_tensor(
+                output, input, group_name=self.unique_name
+            )
+
+    def all_gather(
+        self,
+        input_: torch.Tensor,
+        dim: int = -1,
+        output_tensor_list: Optional[List[torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            if output_tensor_list is not None:
+                logger.warning(
+                    "Performing in-place all-gather with a group size of 1. "
+                    "This may be unnecessary; consider bypassing it for better efficiency."
+                )
+                output_tensor_list[0].copy_(input_)
+                return None
+            else:
+                return input_
+
+        if output_tensor_list is not None:
+            # TODO(ch-wan): support other backends
+            return torch.distributed.all_gather(
+                output_tensor_list, input_, group=self.device_group
+            )
+
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+
+        # For HPUs, use HPU communicator.
+        hpu_comm = self.hpu_communicator
+        if hpu_comm is not None and not hpu_comm.disabled:
+            return hpu_comm.all_gather(input_, dim)
+
+        # For NPUs, use NPU communicator.
+        npu_comm = self.npu_communicator
+        if npu_comm is not None and not npu_comm.disabled:
+            return npu_comm.all_gather(input_, dim)
+
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # NOTE: we have to use concat-style all-gather here,
+        # stack-style all-gather has compatibility issues with
+        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
+        output_size = (input_size[0] * world_size,) + input_size[1:]
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            output_size, dtype=input_.dtype, device=input_.device
+        )
+
+        # All-gather.
+        if input_.is_cpu and is_shm_available(
+            input_.dtype, self.world_size, self.local_size
+        ):
+            return torch.ops.sgl_kernel.shm_allgather(input_, dim)
+
+        if input_.is_cpu:
+            torch.distributed.all_gather_into_tensor(
+                output_tensor, input_, group=self.device_group
+            )
+        else:
+            self.all_gather_into_tensor(output_tensor, input_)
+
+        # Reshape
+        output_tensor = output_tensor.reshape((world_size,) + input_size)
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    def all_gatherv(
+        self,
+        input_: Union[torch.Tensor, List[torch.Tensor]],
+        sizes: Optional[List[int]] = None,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        """
+        Supports varying sizes per rank and input tensor list.
+        `sizes`: a list of len(world_size) with the number of items per rank to gather.
+        """
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+
+        with pynccl_comm.change_state(enable=True, stream=torch.cuda.current_stream()):
+            assert (
+                pynccl_comm is not None and not pynccl_comm.disabled
+            ), "pynccl is required for all_gatherv"
+
+            def _all_gather_single(
+                input_: torch.Tensor, sizes: Optional[List[int]] = None
+            ):
+                input_size = input_.size()
+                if sizes is not None:
+                    assert len(sizes) == world_size
+                    assert input_.shape[0] == sizes[self.rank_in_group]
+                    output_size = (sum(sizes),) + input_size[1:]
+                    # 'sizes' is not needed if all inputs in the same group have the same shape
+                    if all(s == sizes[0] for s in sizes):
+                        sizes = None
+                else:
+                    output_size = (input_size[0] * world_size,) + input_size[1:]
+                # Allocate output tensor.
+                output_tensor = torch.empty(
+                    output_size, dtype=input_.dtype, device=input_.device
+                )
+                pynccl_comm.all_gather(output_tensor, input_, sizes=sizes)
+                return output_tensor
+
+            if isinstance(input_, torch.Tensor):
+                return _all_gather_single(input_, sizes)
+
+            output_list = []
+            pynccl_comm.group_start()
+            for inp in input_:
+                output_list.append(_all_gather_single(inp, sizes=sizes))
+            pynccl_comm.group_end()
+
+            return output_list
+
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> Optional[torch.Tensor]:
+        """
+        NOTE: We assume that the input tensor is on the same device across
+        all the ranks.
+        NOTE: `dst` is the local rank of the destination rank.
+        """
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
+            return self.xpu_communicator.gather(input_, self.rank_in_group, dst, dim)
+        # Allocate output tensor.
+        if self.rank_in_group == dst:
+            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+        else:
+            gather_list = None
+        # Gather.
+        torch.distributed.gather(
+            input_, gather_list, dst=self.ranks[dst], group=self.device_group
+        )
+        if self.rank_in_group == dst:
+            output_tensor = torch.cat(gather_list, dim=dim)
+        else:
+            output_tensor = None
+        return output_tensor
+
+    def broadcast(self, input_: torch.Tensor, src: int = 0):
+        """Broadcast the input tensor.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return input_
+        # Broadcast.
+        torch.distributed.broadcast(
+            input_, src=self.ranks[src], group=self.device_group
+        )
+        return input_
+
+    def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
+        """Broadcast the input object.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return obj
+        if self.mq_broadcaster is not None:
+            assert src == 0, "Message queue broadcaster only supports src=0"
+            return self.mq_broadcaster.broadcast_object(obj)
+        if self.rank_in_group == src:
+            torch.distributed.broadcast_object_list(
+                [obj], src=self.ranks[src], group=self.cpu_group
+            )
+            return obj
+        else:
+            recv = [None]
+            torch.distributed.broadcast_object_list(
+                recv, src=self.ranks[src], group=self.cpu_group
+            )
+            return recv[0]
+
+    def broadcast_object_list(
+        self, obj_list: List[Any], src: int = 0, group: Optional[ProcessGroup] = None
+    ):
+        """Broadcast the input object list.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return obj_list
+        # Broadcast.
+        torch.distributed.broadcast_object_list(
+            obj_list, src=self.ranks[src], group=self.device_group
+        )
+        return obj_list
+
+    def all_gather_object(self, obj: Any) -> List[Any]:
+        objs = [None] * self.world_size
+        torch.distributed.all_gather_object(objs, obj, group=self.cpu_group)
+        return objs
+
+    def send_object(self, obj: Any, dst: int) -> None:
+        """Send the input object list to the destination rank."""
+        """NOTE: `dst` is the local rank of the destination rank."""
+
+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
+
+        assert dst != self.rank_in_group, (
+            "Invalid destination rank. Destination rank is the same "
+            "as the current rank."
+        )
+
+        # Serialize object to tensor and get the size as well
+        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8).cuda(
+            device=torch.cuda.current_device()
+        )
+
+        size_tensor = torch.tensor(
+            [object_tensor.numel()],
+            dtype=torch.long,
+            device="cpu",
+        )
+        # Send object size
+        torch.distributed.send(size_tensor, dst=self.ranks[dst], group=self.cpu_group)
+
+        # Send object
+        torch.distributed.send(
+            object_tensor,
+            dst=self.ranks[dst],
+            group=self.device_group,
+        )
+
+        return None
+
+    def recv_object(self, src: int) -> Any:
+        """Receive the input object list from the source rank."""
+        """NOTE: `src` is the local rank of the source rank."""
+
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        assert (
+            src != self.rank_in_group
+        ), "Invalid source rank. Source rank is the same as the current rank."
+
+        size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
+
+        # Receive object size
+        rank_size = torch.distributed.recv(
+            size_tensor, src=self.ranks[src], group=self.cpu_group
+        )
+
+        # Tensor to receive serialized objects into.
+        object_tensor = torch.empty(  # type: ignore[call-overload]
+            size_tensor.item(),  # type: ignore[arg-type]
+            dtype=torch.uint8,
+            device=torch.cuda.current_device(),
+        )
+
+        rank_object = torch.distributed.recv(
+            object_tensor, src=self.ranks[src], group=self.device_group
+        )
+
+        assert (
+            rank_object == rank_size
+        ), "Received object sender rank does not match the size sender rank."
+
+        obj = pickle.loads(object_tensor.cpu().numpy())
+
+        return obj
+
+    def broadcast_tensor_dict(
+        self,
+        tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None,
+        src: int = 0,
+        group: Optional[ProcessGroup] = None,
+        metadata_group: Optional[ProcessGroup] = None,
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
+        """Broadcast the input tensor dictionary.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if not torch.distributed.is_initialized() or self.world_size == 1:
+            return tensor_dict
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        rank_in_group = self.rank_in_group
+        if rank_in_group == src:
+            metadata_list: List[Tuple[Any, Any]] = []
+            assert isinstance(
+                tensor_dict, dict
+            ), f"Expecting a dictionary, got {type(tensor_dict)}"
+            metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+            # `metadata_list` lives in CPU memory.
+            # `broadcast_object_list` has serialization & deserialization,
+            # all happening on CPU. Therefore, we can use the CPU group.
+            self.broadcast_object(metadata_list, src=src)
+            async_handles = []
+            for tensor in tensor_list:
+                if tensor.numel() == 0:
+                    # Skip broadcasting empty tensors.
+                    continue
+                if tensor.is_cpu:
+                    # use metadata_group for CPU tensors
+                    handle = torch.distributed.broadcast(
+                        tensor, src=self.ranks[src], group=metadata_group, async_op=True
+                    )
+                else:
+                    # use group for GPU tensors
+                    handle = torch.distributed.broadcast(
+                        tensor, src=self.ranks[src], group=group, async_op=True
+                    )
+                async_handles.append(handle)
+            for async_handle in async_handles:
+                async_handle.wait()
+
+        else:
+            metadata_list = self.broadcast_object(None, src=src)
+            tensor_dict = {}
+            async_handles = []
+            for key, value in metadata_list:
+                if isinstance(value, TensorMetadata):
+                    tensor = torch.empty(
+                        value.size, dtype=value.dtype, device=value.device
+                    )
+                    if tensor.numel() == 0:
+                        # Skip broadcasting empty tensors.
+                        tensor_dict[key] = tensor
+                        continue
+                    if tensor.is_cpu:
+                        # use metadata_group for CPU tensors
+                        handle = torch.distributed.broadcast(
+                            tensor,
+                            src=self.ranks[src],
+                            group=metadata_group,
+                            async_op=True,
+                        )
+                    else:
+                        # use group for GPU tensors
+                        handle = torch.distributed.broadcast(
+                            tensor, src=self.ranks[src], group=group, async_op=True
+                        )
+                    async_handles.append(handle)
+                    tensor_dict[key] = tensor
+                else:
+                    tensor_dict[key] = value
+            for async_handle in async_handles:
+                async_handle.wait()
+        return tensor_dict
+
+    def send_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Union[torch.Tensor, Any]],
+        dst: Optional[int] = None,
+        all_gather_group: Optional["GroupCoordinator"] = None,
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
+        """Send the input tensor dictionary.
+        NOTE: `dst` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if not torch.distributed.is_initialized() or self.world_size == 1:
+            return tensor_dict
+
+        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
+        all_gather_rank = (
+            0 if all_gather_group is None else all_gather_group.rank_in_group
+        )
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+
+        if dst is None:
+            dst = (self.rank_in_group + 1) % self.world_size
+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
+
+        assert isinstance(
+            tensor_dict, dict
+        ), f"Expecting a dictionary, got {type(tensor_dict)}"
+        metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+        # Note: While switching to Device-to-Device (D2D) would introduce an extra
+        # Device-to-Host (D2H) memory copy overhead for serialization, our benchmarks
+        # show better overall transmission performance with D2D due to:
+        # 1. Superior D2D transfer bandwidth
+        # 2. Ability to overlap send and recv operations
+        # Thus the net performance gain justifies this approach.
+        self.send_object(metadata_list, dst=dst)
+        for tensor in tensor_list:
+            if tensor.numel() == 0:
+                # Skip sending empty tensors.
+                continue
+
+            # send-allgather: send only a slice, then do allgather.
+            if all_gather_group is not None and tensor.numel() % all_gather_size == 0:
+                tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
+
+            if tensor.is_cpu:
+                # use metadata_group for CPU tensors
+                torch.distributed.send(
+                    tensor, dst=self.ranks[dst], group=metadata_group
+                )
+            else:
+                # use group for GPU tensors
+                torch.distributed.send(tensor, dst=self.ranks[dst], group=group)
+        return None
+
+    def recv_tensor_dict(
+        self,
+        src: Optional[int] = None,
+        all_gather_group: Optional["GroupCoordinator"] = None,
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
+        """Recv the input tensor dictionary.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if not torch.distributed.is_initialized() or self.world_size == 1:
+            return None
+
+        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
+        all_gather_rank = (
+            0 if all_gather_group is None else all_gather_group.rank_in_group
+        )
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+
+        if src is None:
+            src = (self.rank_in_group - 1) % self.world_size
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        recv_metadata_list = self.recv_object(src=src)
+        tensor_dict: Dict[str, Any] = {}
+        for key, value in recv_metadata_list:
+            if isinstance(value, TensorMetadata):
+                tensor = torch.empty(value.size, dtype=value.dtype, device=value.device)
+                if tensor.numel() == 0:
+                    # Skip broadcasting empty tensors.
+                    tensor_dict[key] = tensor
+                    continue
+
+                # send-allgather: send only a slice, then do allgather.
+                use_all_gather = (
+                    all_gather_group is not None
+                    and tensor.numel() % all_gather_size == 0
+                )
+
+                if use_all_gather:
+                    orig_shape = tensor.shape
+                    tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
+
+                if tensor.is_cpu:
+                    # use metadata_group for CPU tensors
+                    torch.distributed.recv(
+                        tensor, src=self.ranks[src], group=metadata_group
+                    )
+                else:
+                    # use group for GPU tensors
+                    torch.distributed.recv(tensor, src=self.ranks[src], group=group)
+                if use_all_gather:
+                    # do the allgather
+                    tensor = all_gather_group.all_gather(tensor, dim=0)  # type: ignore
+                    tensor = tensor.reshape(orig_shape)
+
+                tensor_dict[key] = tensor
+            else:
+                tensor_dict[key] = value
+        return tensor_dict
+
+    def barrier(self):
+        """Barrier synchronization among the group.
+        NOTE: don't use `device_group` here! `barrier` in NCCL is
+        terrible because it is internally a broadcast operation with
+        secretly created GPU tensors. It is easy to mess up the current
+        device. Use the CPU group instead.
+        """
+        torch.distributed.barrier(group=self.cpu_group)
+
+    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+        """Sends a tensor to the destination rank in a non-blocking way"""
+        """NOTE: `dst` is the local rank of the destination rank."""
+        if dst is None:
+            dst = (self.rank_in_group + 1) % self.world_size
+
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.send(tensor, dst)
+        else:
+            torch.distributed.send(tensor, self.ranks[dst], self.device_group)
+
+    def recv(
+        self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
+    ) -> torch.Tensor:
+        """Receives a tensor from the source rank."""
+        """NOTE: `src` is the local rank of the source rank."""
+        if src is None:
+            src = (self.rank_in_group - 1) % self.world_size
+
+        tensor = torch.empty(size, dtype=dtype, device=self.device)
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.recv(tensor, src)
+        else:
+            torch.distributed.recv(tensor, self.ranks[src], self.device_group)
+        return tensor
+
+    def destroy(self):
+        if self.device_group is not None:
+            torch.distributed.destroy_process_group(self.device_group)
+            self.device_group = None
+        if self.cpu_group is not None:
+            torch.distributed.destroy_process_group(self.cpu_group)
+            self.cpu_group = None
+        if self.pynccl_comm is not None:
+            self.pynccl_comm = None
+        if self.ca_comm is not None:
+            self.ca_comm = None
+        if self.mq_broadcaster is not None:
+            self.mq_broadcaster = None
+
+
+_WORLD: Optional[GroupCoordinator] = None
+
+
+def get_world_group() -> GroupCoordinator:
+    assert _WORLD is not None, "world group is not initialized"
+    return _WORLD
+
+
+def init_world_group(
+    ranks: List[int], local_rank: int, backend: str
+) -> GroupCoordinator:
+    return GroupCoordinator(
+        group_ranks=[ranks],
+        local_rank=local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=False,
+        use_pymscclpp=False,
+        use_custom_allreduce=False,
+        use_hpu_communicator=False,
+        use_xpu_communicator=False,
+        use_npu_communicator=False,
+        group_name="world",
+    )
+
+
+def init_model_parallel_group(
+    group_ranks: List[List[int]],
+    local_rank: int,
+    backend: str,
+    use_custom_allreduce: Optional[bool] = None,
+    use_message_queue_broadcaster: bool = False,
+    group_name: Optional[str] = None,
+    use_mscclpp_allreduce: Optional[bool] = None,
+) -> GroupCoordinator:
+    if use_custom_allreduce is None:
+        use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
+    if use_mscclpp_allreduce is None:
+        use_mscclpp_allreduce = _ENABLE_MSCCLPP_ALL_REDUCE
+    return GroupCoordinator(
+        group_ranks=group_ranks,
+        local_rank=local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=not _is_npu,
+        use_pymscclpp=use_mscclpp_allreduce,
+        use_custom_allreduce=use_custom_allreduce,
+        use_hpu_communicator=True,
+        use_xpu_communicator=True,
+        use_npu_communicator=True,
+        use_message_queue_broadcaster=use_message_queue_broadcaster,
+        group_name=group_name,
+    )
+
+
+_TP: Optional[GroupCoordinator] = None
+
+# duplicate GroupCoordinator for prefill in PD-Multiplexing
+_PDMUX_PREFILL_TP_GROUP: Optional[GroupCoordinator] = None
+
+_ENABLE_PDMUX_P_TP: bool = False
+
+
+def set_pdmux_status(enable_prefill_multiplexing: bool):
+    global _ENABLE_PDMUX_P_TP
+    _ENABLE_PDMUX_P_TP = enable_prefill_multiplexing
+
+
+def get_tp_group() -> GroupCoordinator:
+    if _ENABLE_PDMUX_P_TP:
+        assert (
+            _PDMUX_PREFILL_TP_GROUP is not None
+        ), "tensor model parallel group for PD-Multiplexing Prefill is not initialized"
+        return _PDMUX_PREFILL_TP_GROUP
+    assert _TP is not None, "tensor model parallel group is not initialized"
+    return _TP
+
+
+_MOE_EP: Optional[GroupCoordinator] = None
+_MOE_TP: Optional[GroupCoordinator] = None
+
+
+def get_moe_ep_group() -> GroupCoordinator:
+    assert _MOE_EP is not None, "expert model parallel group is not initialized"
+    return _MOE_EP
+
+
+def get_moe_tp_group() -> GroupCoordinator:
+    assert _MOE_TP is not None, "expert model parallel group is not initialized"
+    return _MOE_TP
+
+
+# kept for backward compatibility
+get_tensor_model_parallel_group = get_tp_group
+
+_PP: Optional[GroupCoordinator] = None
+
+
+def get_pp_group() -> GroupCoordinator:
+    assert _PP is not None, "pipeline model parallel group is not initialized"
+    return _PP
+
+
+# kept for backward compatibility
+get_pipeline_model_parallel_group = get_pp_group
+
+
+@contextmanager
+def graph_capture():
+    """
+    `graph_capture` is a context manager which should surround the code that
+    is capturing the CUDA graph. Its main purpose is to ensure that the
+    some operations will be run after the graph is captured, before the graph
+    is replayed. It returns a `GraphCaptureContext` object which contains the
+    necessary data for the graph capture. Currently, it only contains the
+    stream that the graph capture is running on. This stream is set to the
+    current CUDA stream when the context manager is entered and reset to the
+    default stream when the context manager is exited. This is to ensure that
+    the graph capture is running on a separate stream from the default stream,
+    in order to explicitly distinguish the kernels to capture
+    from other kernels possibly launched on background in the default stream.
+    """
+    with get_tp_group().graph_capture() as context, get_pp_group().graph_capture(
+        context
+    ):
+        yield context
+
+
+logger = logging.getLogger(__name__)
+
+_ENABLE_CUSTOM_ALL_REDUCE = True
+_ENABLE_MSCCLPP_ALL_REDUCE = False
+
+
+def set_custom_all_reduce(enable: bool):
+    global _ENABLE_CUSTOM_ALL_REDUCE
+    _ENABLE_CUSTOM_ALL_REDUCE = enable
+
+
+def set_mscclpp_all_reduce(enable: bool):
+    global _ENABLE_MSCCLPP_ALL_REDUCE
+    _ENABLE_MSCCLPP_ALL_REDUCE = enable
+
+
+def init_distributed_environment(
+    world_size: int = -1,
+    rank: int = -1,
+    distributed_init_method: str = "env://",
+    local_rank: int = -1,
+    backend: str = "nccl",
+    timeout: Optional[int] = None,
+):
+    logger.debug(
+        "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s",
+        world_size,
+        rank,
+        local_rank,
+        distributed_init_method,
+        backend,
+    )
+    if not torch.distributed.is_initialized():
+        assert distributed_init_method is not None, (
+            "distributed_init_method must be provided when initializing "
+            "distributed environment"
+        )
+        if timeout is not None:
+            assert isinstance(timeout, (int)), "timeout must be a number"
+            assert timeout > 0, "timeout must be positive"
+            timeout = timedelta(seconds=timeout)
+
+        # this backend is used for WORLD
+        torch.distributed.init_process_group(
+            backend=backend,
+            init_method=distributed_init_method,
+            world_size=world_size,
+            rank=rank,
+            timeout=timeout,
+        )
+
+    # set the local rank
+    # local_rank is not available in torch ProcessGroup,
+    # see https://github.com/pytorch/pytorch/issues/122816
+    if local_rank == -1:
+        # local rank not set, this usually happens in single-node
+        # setting, where we can use rank as local rank
+        if distributed_init_method == "env://":
+            local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        else:
+            local_rank = rank
+    global _WORLD
+    if _WORLD is None:
+        ranks = list(range(torch.distributed.get_world_size()))
+        _WORLD = init_world_group(ranks, local_rank, backend)
+    else:
+        assert (
+            _WORLD.world_size == torch.distributed.get_world_size()
+        ), "world group already initialized with a different world size"
+
+
+def initialize_model_parallel(
+    tensor_model_parallel_size: int = 1,
+    expert_model_parallel_size: int = 1,
+    pipeline_model_parallel_size: int = 1,
+    backend: Optional[str] = None,
+    duplicate_tp_group: bool = False,
+) -> None:
+    """
+    Initialize model parallel groups.
+
+    Arguments:
+        tensor_model_parallel_size: number of GPUs used for tensor model
+            parallelism.
+        pipeline_model_parallel_size: number of GPUs used for pipeline model
+            parallelism.
+
+    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
+    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
+    the model pipeline. The present function will
+    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
+        4 tensor model-parallel groups:
+            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
+        2 pipeline model-parallel groups:
+            [g0, g2, g4, g6], [g1, g3, g5, g7]
+    Note that for efficiency, the caller should make sure adjacent ranks
+    are on the same DGX box. For example if we are using 2 DGX-1 boxes
+    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
+    ranks 8 to 15 belong to the second box.
+    """
+    # Get world size and rank. Ensure some consistencies.
+    assert torch.distributed.is_initialized()
+    world_size: int = torch.distributed.get_world_size()
+    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
+
+    if world_size != tensor_model_parallel_size * pipeline_model_parallel_size:
+        raise RuntimeError(
+            f"world_size ({world_size}) is not equal to "
+            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
+            f"pipeline_model_parallel_size ({pipeline_model_parallel_size})"
+        )
+
+    # Build the tensor model-parallel groups.
+    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
+    global _TP
+    assert _TP is None, "tensor model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_tensor_model_parallel_groups):
+        ranks = list(
+            range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
+        )
+        group_ranks.append(ranks)
+
+    # message queue broadcaster is only used in tensor model parallel group
+    _TP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_message_queue_broadcaster=get_bool_env_var(
+            "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
+        ),
+        group_name="tp",
+    )
+
+    if duplicate_tp_group:
+        global _PDMUX_PREFILL_TP_GROUP
+        assert (
+            _PDMUX_PREFILL_TP_GROUP is None
+        ), "tensor model parallel group for PD-Multiplexing Prefill is already initialized"
+        _PDMUX_PREFILL_TP_GROUP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            use_message_queue_broadcaster=get_bool_env_var(
+                "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
+            ),
+            group_name="pdmux_prefill_tp",
+        )
+        _TP.pynccl_comm.disabled = False
+        _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
+
+    moe_ep_size = expert_model_parallel_size
+    moe_tp_size = tensor_model_parallel_size // moe_ep_size
+
+    global _MOE_EP
+    assert _MOE_EP is None, "expert model parallel group is already initialized"
+
+    if moe_ep_size == tensor_model_parallel_size:
+        _MOE_EP = _TP
+    else:
+        # TODO(ch-wan): use split_group to save memory
+        group_ranks = []
+        for i in range(num_tensor_model_parallel_groups):
+            for j in range(moe_tp_size):
+                st = i * tensor_model_parallel_size + j
+                en = (i + 1) * tensor_model_parallel_size + j
+                ranks = list(range(st, en, moe_tp_size))
+                group_ranks.append(ranks)
+        _MOE_EP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            group_name="moe_ep",
+        )
+
+    global _MOE_TP
+    assert _MOE_TP is None, "expert model parallel group is already initialized"
+
+    if moe_tp_size == tensor_model_parallel_size:
+        _MOE_TP = _TP
+    else:
+        # TODO(ch-wan): use split_group to save memory
+        group_ranks = []
+        for i in range(num_tensor_model_parallel_groups):
+            for j in range(moe_ep_size):
+                st = i * tensor_model_parallel_size + j * moe_tp_size
+                en = i * tensor_model_parallel_size + (j + 1) * moe_tp_size
+                ranks = list(range(st, en))
+                group_ranks.append(ranks)
+        _MOE_TP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            group_name="moe_tp",
+        )
+
+    # Build the pipeline model-parallel groups.
+    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
+    global _PP
+    assert _PP is None, "pipeline model parallel group is already initialized"
+    group_ranks = []
+    for i in range(num_pipeline_model_parallel_groups):
+        ranks = list(range(i, world_size, num_pipeline_model_parallel_groups))
+        group_ranks.append(ranks)
+    # pipeline parallel does not need custom allreduce
+    _PP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="pp",
+    )
+
+
+def ensure_model_parallel_initialized(
+    tensor_model_parallel_size: int,
+    expert_model_parallel_size: int,
+    pipeline_model_parallel_size: int,
+    backend: Optional[str] = None,
+) -> None:
+    """Helper to initialize model parallel groups if they are not initialized,
+    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
+    values if the model parallel groups are initialized.
+    """
+    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
+    if not model_parallel_is_initialized():
+        initialize_model_parallel(
+            tensor_model_parallel_size,
+            expert_model_parallel_size,
+            pipeline_model_parallel_size,
+            backend,
+        )
+        return
+
+    assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
+        "tensor parallel group already initialized, but of unexpected size: "
+        f"{get_tensor_model_parallel_world_size()=} vs. "
+        f"{tensor_model_parallel_size=}"
+    )
+    pp_world_size = get_pp_group().world_size
+    assert pp_world_size == pipeline_model_parallel_size, (
+        "pipeline parallel group already initialized, but of unexpected size: "
+        f"{pp_world_size=} vs. "
+        f"{pipeline_model_parallel_size=}"
+    )
+
+
+def model_parallel_is_initialized():
+    """Check if tensor and pipeline parallel groups are initialized."""
+    return _TP is not None and _PP is not None
+
+
+_TP_STATE_PATCHED = False
+
+
+@contextmanager
+def patch_tensor_parallel_group(tp_group: GroupCoordinator):
+    """Patch the tp group temporarily until this function ends.
+
+    This method is for draft workers of speculative decoding to run draft model
+    with different tp degree from that of target model workers.
+
+    Args:
+        tp_group (GroupCoordinator): the tp group coordinator
+    """
+    global _TP_STATE_PATCHED
+    assert not _TP_STATE_PATCHED, "Should not call when it's already patched"
+
+    _TP_STATE_PATCHED = True
+    old_tp_group = get_tp_group()
+    global _TP
+    _TP = tp_group
+    try:
+        yield
+    finally:
+        # restore the original state
+        _TP_STATE_PATCHED = False
+        _TP = old_tp_group
+
+
+def get_world_size():
+    """Return world size for the world group."""
+    return get_world_group().world_size
+
+
+def get_world_rank():
+    """Return my rank for the world group."""
+    return get_world_group().rank_in_group
+
+
+def get_tensor_model_parallel_world_size():
+    """Return world size for the tensor model parallel group."""
+    return get_tp_group().world_size
+
+
+def get_tensor_model_parallel_rank():
+    """Return my rank for the tensor model parallel group."""
+    return get_tp_group().rank_in_group
+
+
+def get_pipeline_model_parallel_world_size():
+    """Return world size for the pipeline model parallel group."""
+    return get_pp_group().world_size
+
+
+def get_pipeline_model_parallel_rank():
+    """Return my rank for the pipeline model parallel group."""
+    return get_pp_group().rank_in_group
+
+
+def get_moe_expert_parallel_world_size():
+    """Return world size for the moe expert parallel group."""
+    return get_moe_ep_group().world_size
+
+
+def get_moe_expert_parallel_rank():
+    """Return my rank for the moe expert parallel group."""
+    return get_moe_ep_group().rank_in_group
+
+
+def get_moe_tensor_parallel_world_size():
+    """Return world size for the moe tensor parallel group."""
+    return get_moe_tp_group().world_size
+
+
+def get_moe_tensor_parallel_rank():
+    """Return my rank for the moe tensor parallel group."""
+    return get_moe_tp_group().rank_in_group
+
+
+def destroy_model_parallel():
+    """Set the groups to none and destroy them."""
+    global _TP
+    if _TP:
+        _TP.destroy()
+    _TP = None
+
+    global _PP
+    if _PP:
+        _PP.destroy()
+    _PP = None
+
+
+def destroy_distributed_environment():
+    global _WORLD
+    if _WORLD:
+        _WORLD.destroy()
+    _WORLD = None
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+
+
+def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    if shutdown_ray:
+        import ray  # Lazy import Ray
+
+        ray.shutdown()
+    gc.collect()
+    if not _is_cpu:
+        if hasattr(torch, "cuda") and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            if hasattr(torch._C, "_host_emptyCache"):
+                torch._C._host_emptyCache()
+            else:
+                logger.warning(
+                    "torch._C._host_emptyCache() only available in Pytorch >=2.5"
+                )
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            torch.npu.empty_cache()
+
+
+def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
+    """
+    This is a collective operation that returns if each rank is in the same node
+    as the source rank. It tests if processes are attached to the same
+    memory system (shared access to shared memory).
+    """
+    assert (
+        torch.distributed.get_backend(pg) != torch.distributed.Backend.NCCL
+    ), "in_the_same_node_as should be tested with a non-NCCL group."
+    # local rank inside the group
+    rank = torch.distributed.get_rank(group=pg)
+    world_size = torch.distributed.get_world_size(group=pg)
+
+    # local tensor in each process to store the result
+    is_in_the_same_node = torch.tensor([0] * world_size, dtype=torch.int32)
+
+    # global ranks of the processes in the group
+    ranks = torch.distributed.get_process_group_ranks(pg)
+
+    magic_message = b"magic_message"
+    shm = None
+
+    try:
+        with contextlib.suppress(OSError):
+            if rank == source_rank:
+                # create a shared memory segment
+                shm = shared_memory.SharedMemory(create=True, size=128)
+                shm.buf[: len(magic_message)] = magic_message
+                torch.distributed.broadcast_object_list(
+                    [shm.name], src=ranks[source_rank], group=pg
+                )
+                is_in_the_same_node[rank] = 1
+            else:
+                # try to open the shared memory segment
+                recv = [None]
+                torch.distributed.broadcast_object_list(
+                    recv, src=ranks[source_rank], group=pg
+                )
+                name = recv[0]
+                # fix to https://stackoverflow.com/q/62748654/9191338
+                # Python incorrectly tracks shared memory even if it is not
+                # created by the process. The following patch is a workaround.
+                with patch(
+                    "multiprocessing.resource_tracker.register",
+                    lambda *args, **kwargs: None,
+                ):
+                    shm = shared_memory.SharedMemory(name=name)
+                if shm.buf[: len(magic_message)] == magic_message:
+                    is_in_the_same_node[rank] = 1
+    except Exception as e:
+        logger.error("Error ignored in is_in_the_same_node: %s", e)
+    finally:
+        if shm:
+            shm.close()
+
+    torch.distributed.barrier(group=pg)
+
+    # clean up the shared memory segment
+    with contextlib.suppress(OSError):
+        if rank == source_rank and shm:
+            shm.unlink()
+    torch.distributed.all_reduce(is_in_the_same_node, group=pg)
+
+    return [x == 1 for x in is_in_the_same_node.tolist()]
+
+
+vllm_get_pp_group = None
+vllm_get_tp_group = None
+vllm_get_world_group = None
+
+
+def monkey_patch_vllm_parallel_state(reverse: bool = False):
+    try:
+        import vllm.distributed.parallel_state as vllm_parrlel_state
+    except ImportError:
+        return
+
+    global vllm_get_pp_group, vllm_get_tp_group, vllm_get_world_group
+    if vllm_get_pp_group is None:
+        vllm_get_pp_group = vllm_parrlel_state.get_pp_group
+        vllm_get_tp_group = vllm_parrlel_state.get_tp_group
+        vllm_get_world_group = vllm_parrlel_state.get_world_group
+    if reverse:
+        setattr(vllm_parrlel_state, "get_pp_group", vllm_get_pp_group)
+        setattr(vllm_parrlel_state, "get_tp_group", vllm_get_tp_group)
+        setattr(vllm_parrlel_state, "get_world_group", vllm_get_world_group)
+    else:
+        setattr(vllm_parrlel_state, "get_pp_group", get_pp_group)
+        setattr(vllm_parrlel_state, "get_tp_group", get_tp_group)
+        setattr(vllm_parrlel_state, "get_world_group", get_world_group)
diff --git a/python/sglang/srt/distributed/utils.py b/python/sglang/srt/distributed/utils.py
new file mode 100644
index 000000000..bfe54b9d4
--- /dev/null
+++ b/python/sglang/srt/distributed/utils.py
@@ -0,0 +1,224 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/utils.py
+
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+import dataclasses
+import logging
+import os
+import pickle
+import time
+from collections import deque
+from typing import Any, Deque, Dict, Optional, Sequence, Tuple
+
+import torch
+from torch.distributed import TCPStore
+
+logger = logging.getLogger(__name__)
+
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(
+        numerator, denominator
+    )
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> Sequence[torch.Tensor]:
+    """Split a tensor along its last dimension.
+
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+
+    Returns:
+        A list of Tensors
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # NOTE: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+def get_pp_indices(
+    num_hidden_layers: int, pp_rank: int, pp_size: int
+) -> Tuple[int, int]:
+    """Try to evenly distribute layers across partitions.
+    If the number of layers is not divisible by the number of partitions,
+    the last partition will have the remaining layers.
+    """
+    # partition_list_str can be set to None in sglang
+    partition_list_str = os.getenv("SGLANG_PP_LAYER_PARTITION", None)
+    if partition_list_str is not None:
+        try:
+            partitions = [int(layer) for layer in partition_list_str.split(",")]
+        except ValueError as err:
+            raise ValueError(
+                "Invalid partition string: {}".format(partition_list_str)
+            ) from err
+        if len(partitions) != pp_size:
+            raise ValueError(f"{len(partitions)=} does not match {pp_size=}.")
+        if sum(partitions) != num_hidden_layers:
+            raise ValueError(f"{sum(partitions)=} does not match {num_hidden_layers=}.")
+        start_layer = sum(partitions[:pp_rank])
+        end_layer = start_layer + partitions[pp_rank]
+    else:
+        layers_per_partition = num_hidden_layers // pp_size
+        start_layer = pp_rank * layers_per_partition
+        end_layer = start_layer + layers_per_partition
+
+        if pp_rank == pp_size - 1:
+            end_layer = num_hidden_layers
+
+    return (start_layer, end_layer)
+
+
+@dataclasses.dataclass
+class StatelessProcessGroup:
+    """A dataclass to hold a metadata store, and the rank, world_size of the
+    group. Only use it to communicate metadata between processes.
+    For data-plane communication, create NCCL-related objects.
+    """
+
+    rank: int
+    world_size: int
+    store: torch._C._distributed_c10d.Store
+    data_expiration_seconds: int = 3600  # 1 hour
+
+    # dst rank -> counter
+    send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
+    # src rank -> counter
+    recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
+    broadcast_send_counter: int = 0
+    broadcast_recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
+
+    # A deque to store the data entries, with key and timestamp.
+    entries: Deque[Tuple[str, float]] = dataclasses.field(default_factory=deque)
+
+    def __post_init__(self):
+        assert self.rank < self.world_size
+        self.send_dst_counter = {i: 0 for i in range(self.world_size)}
+        self.recv_src_counter = {i: 0 for i in range(self.world_size)}
+        self.broadcast_recv_src_counter = {i: 0 for i in range(self.world_size)}
+
+    def send_obj(self, obj: Any, dst: int):
+        """Send an object to a destination rank."""
+        self.expire_data()
+        key = f"send_to/{dst}/{self.send_dst_counter[dst]}"
+        self.store.set(key, pickle.dumps(obj))
+        self.send_dst_counter[dst] += 1
+        self.entries.append((key, time.perf_counter()))
+
+    def expire_data(self):
+        """Expire data that is older than `data_expiration_seconds` seconds."""
+        while self.entries:
+            # check the oldest entry
+            key, timestamp = self.entries[0]
+            if time.perf_counter() - timestamp > self.data_expiration_seconds:
+                self.store.delete_key(key)
+                self.entries.popleft()
+            else:
+                break
+
+    def recv_obj(self, src: int) -> Any:
+        """Receive an object from a source rank."""
+        obj = pickle.loads(
+            self.store.get(f"send_to/{self.rank}/{self.recv_src_counter[src]}")
+        )
+        self.recv_src_counter[src] += 1
+        return obj
+
+    def broadcast_obj(self, obj: Optional[Any], src: int) -> Any:
+        """Broadcast an object from a source rank to all other ranks.
+        It does not clean up after all ranks have received the object.
+        Use it for limited times, e.g., for initialization.
+        """
+        if self.rank == src:
+            self.expire_data()
+            key = f"broadcast_from/{src}/" f"{self.broadcast_send_counter}"
+            self.store.set(key, pickle.dumps(obj))
+            self.broadcast_send_counter += 1
+            self.entries.append((key, time.perf_counter()))
+            return obj
+        else:
+            key = f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}"
+            recv_obj = pickle.loads(self.store.get(key))
+            self.broadcast_recv_src_counter[src] += 1
+            return recv_obj
+
+    def all_gather_obj(self, obj: Any) -> list[Any]:
+        """All gather an object from all ranks."""
+        gathered_objs = []
+        for i in range(self.world_size):
+            if i == self.rank:
+                gathered_objs.append(obj)
+                self.broadcast_obj(obj, src=self.rank)
+            else:
+                recv_obj = self.broadcast_obj(None, src=i)
+                gathered_objs.append(recv_obj)
+        return gathered_objs
+
+    def barrier(self):
+        """A barrier to synchronize all ranks."""
+        for i in range(self.world_size):
+            if i == self.rank:
+                self.broadcast_obj(None, src=self.rank)
+            else:
+                self.broadcast_obj(None, src=i)
+
+    @staticmethod
+    def create(
+        host: str,
+        port: int,
+        rank: int,
+        world_size: int,
+        data_expiration_seconds: int = 3600,
+    ) -> "StatelessProcessGroup":
+        """A replacement for `torch.distributed.init_process_group` that does not
+        pollute the global state.
+
+        If we have process A and process B called `torch.distributed.init_process_group`
+        to form a group, and then we want to form another group with process A, B, C,
+        D, it is not possible in PyTorch, because process A and process B have already
+        formed a group, and process C and process D cannot join that group. This
+        function is a workaround for this issue.
+
+        `torch.distributed.init_process_group` is a global call, while this function
+        is a stateless call. It will return a `StatelessProcessGroup` object that can be
+        used for exchanging metadata. With this function, process A and process B
+        can call `StatelessProcessGroup.create` to form a group, and then process A, B,
+        C, and D can call `StatelessProcessGroup.create` to form another group.
+        """  # noqa
+        store = TCPStore(
+            host_name=host,
+            port=port,
+            world_size=world_size,
+            is_master=(rank == 0),
+        )
+
+        return StatelessProcessGroup(
+            rank=rank,
+            world_size=world_size,
+            store=store,
+            data_expiration_seconds=data_expiration_seconds,
+        )
diff --git a/python/sglang/srt/entrypoints/EngineBase.py b/python/sglang/srt/entrypoints/EngineBase.py
new file mode 100644
index 000000000..42ecb12aa
--- /dev/null
+++ b/python/sglang/srt/entrypoints/EngineBase.py
@@ -0,0 +1,72 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+import torch
+
+
+class EngineBase(ABC):
+    """
+    Abstract base class for engine interfaces that support generation, weight updating, and memory control.
+    This base class provides a unified API for both HTTP-based engines and engines.
+    """
+
+    @abstractmethod
+    def generate(
+        self,
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        image_data: Optional[Union[List[str], str]] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
+        lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
+        custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: Optional[bool] = None,
+        stream: Optional[bool] = None,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        data_parallel_rank: Optional[int] = None,
+    ) -> Union[Dict, Iterator[Dict]]:
+        """Generate outputs based on given inputs."""
+        pass
+
+    @abstractmethod
+    def flush_cache(self):
+        """Flush the cache of the engine."""
+        pass
+
+    @abstractmethod
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = True,
+    ):
+        """Update model weights with in-memory tensor data."""
+        pass
+
+    def load_lora_adapter(self, lora_name: str, lora_path: str):
+        """Load a new LoRA adapter without re-launching the engine."""
+        pass
+
+    def unload_lora_adapter(self, lora_name: str):
+        """Unload a LoRA adapter without re-launching the engine."""
+        pass
+
+    @abstractmethod
+    def release_memory_occupation(self):
+        """Release GPU memory occupation temporarily."""
+        pass
+
+    @abstractmethod
+    def resume_memory_occupation(self):
+        """Resume GPU memory occupation which is previously released."""
+        pass
+
+    @abstractmethod
+    def shutdown(self):
+        """Shutdown the engine and clean up resources."""
+        pass
diff --git a/python/sglang/srt/entrypoints/context.py b/python/sglang/srt/entrypoints/context.py
new file mode 100644
index 000000000..66f58200f
--- /dev/null
+++ b/python/sglang/srt/entrypoints/context.py
@@ -0,0 +1,246 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copied from vLLM
+import json
+import logging
+from abc import ABC, abstractmethod
+from typing import Union
+
+logger = logging.getLogger(__name__)
+
+try:
+    from mcp import ClientSession
+except ImportError as e:
+    mcp = e
+
+from openai_harmony import Author, Message, Role, StreamState, TextContent
+
+from sglang.srt.entrypoints.harmony_utils import (
+    get_encoding,
+    get_streamable_parser_for_assistant,
+    render_for_completion,
+)
+from sglang.srt.entrypoints.tool import Tool
+
+
+class ConversationContext(ABC):
+
+    @abstractmethod
+    def append_output(self, output) -> None:
+        pass
+
+    @abstractmethod
+    async def call_tool(self) -> list[Message]:
+        pass
+
+    @abstractmethod
+    def need_builtin_tool_call(self) -> bool:
+        pass
+
+    @abstractmethod
+    def render_for_completion(self) -> list[int]:
+        pass
+
+
+class SimpleContext(ConversationContext):
+
+    def __init__(self):
+        self.last_output = None
+
+    def append_output(self, output) -> None:
+        self.last_output = output
+
+    def need_builtin_tool_call(self) -> bool:
+        return False
+
+    async def call_tool(self) -> list[Message]:
+        raise NotImplementedError("Should not be called.")
+
+    def render_for_completion(self) -> list[int]:
+        raise NotImplementedError("Should not be called.")
+
+
+class HarmonyContext(ConversationContext):
+
+    def __init__(
+        self,
+        messages: list,
+        tool_sessions: dict[str, Union["ClientSession", Tool]],
+    ):
+        # TODO: Remove the hack of Union[ClientSession, Tool] by using MCP
+        # when demo.
+        self._messages = messages
+        self.tool_sessions = tool_sessions
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.num_init_messages = len(messages)
+        # TODO
+        self.num_prompt_tokens = 0
+        self.num_cached_tokens = 0
+        self.num_output_tokens = 0
+        self.num_reasoning_tokens = 0
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            output_token_ids = output["output_ids"]
+
+            # TODO: REMOVE here:
+            # Very hacky, find the first occurrence of token 200006 and cut from there
+            try:
+                start_index = output_token_ids.index(200006)
+                output_token_ids = output_token_ids[start_index:]
+            except ValueError:
+                pass
+
+            for token_id in output_token_ids:
+                self.parser.process(token_id)
+            output_msgs = self.parser.messages
+
+            meta_info = output["meta_info"]
+
+            if isinstance(meta_info, dict):
+                if "prompt_token_ids" in meta_info:
+                    self.num_prompt_tokens = meta_info["prompt_tokens"]
+                if "cached_tokens" in meta_info:
+                    self.num_cached_tokens = meta_info["cached_tokens"]
+                if "completion_tokens" in meta_info:
+                    self.num_output_tokens += meta_info["completion_tokens"]
+
+        else:
+            output_msgs = output
+
+        self._messages.extend(output_msgs)
+
+    @property
+    def messages(self) -> list:
+        return self._messages
+
+    def need_builtin_tool_call(self) -> bool:
+        if not self.messages:
+            return False
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        return recipient is not None and (
+            recipient.startswith("browser.") or recipient.startswith("python")
+        )
+
+    async def call_tool(self) -> list[Message]:
+        if not self.messages:
+            return []
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        if recipient is not None:
+            if recipient.startswith("browser."):
+                return await self.call_search_tool(
+                    self.tool_sessions["browser"], last_msg
+                )
+            elif recipient.startswith("python"):
+                return await self.call_python_tool(
+                    self.tool_sessions["python"], last_msg
+                )
+        raise ValueError("No tool call found")
+
+    def render_for_completion(self) -> list[int]:
+        return render_for_completion(self.messages)
+
+    async def call_search_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        tool_name = last_msg.recipient.split(".")[1]
+        args = json.loads(last_msg.content[0].text)
+        result = await tool_session.call_tool(tool_name, args)
+        result_str = result.content[0].text
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name=last_msg.recipient)
+        return [Message(author=author, content=[content], recipient=Role.ASSISTANT)]
+
+    async def call_python_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        param = {
+            "code": last_msg.content[0].text,
+        }
+        result = await tool_session.call_tool("python", param)
+        result_str = result.content[0].text
+
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name="python")
+
+        return [
+            Message(
+                author=author,
+                content=[content],
+                channel=last_msg.channel,
+                recipient=Role.ASSISTANT,
+            )
+        ]
+
+
+class StreamingHarmonyContext(HarmonyContext):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.last_output = None
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.encoding = get_encoding()
+        self.last_tok = None
+
+    @property
+    def messages(self) -> list:
+        return self.parser.messages
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            # RequestOutput from SGLang with outputs
+            output_token_ids = output["output_ids"]
+
+            # TODO: REMOVE here:
+            # Very hacky, find the first occurrence of token 200006 and cut from there
+            # Find the first occurrence of token 200006 and cut from there
+            try:
+                start_index = output_token_ids.index(200006)
+                output_token_ids = output_token_ids[start_index:]
+            except ValueError:
+                pass
+
+            for token_id in output_token_ids:
+                self.parser.process(token_id)
+
+        else:
+            # Handle the case of tool output in direct message format
+            assert len(output) == 1, "Tool output should be a single message"
+            msg = output[0]
+            # Sometimes the recipient is not set for tool messages,
+            # so we set it to "assistant"
+            if msg.author.role == Role.TOOL and msg.recipient is None:
+                msg.recipient = "assistant"
+            toks = self.encoding.render(msg)
+            for tok in toks:
+                self.parser.process(tok)
+            self.last_tok = toks[-1]
+
+    def is_expecting_start(self) -> bool:
+        return self.parser.state == StreamState.EXPECT_START
+
+    def is_assistant_action_turn(self) -> bool:
+        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()
+
+    def render_for_completion(self) -> list[int]:
+        # now this list of tokens as next turn's starting tokens
+        # `<|start|>assistant``,
+        # we need to process them in parser.
+        rendered_tokens = super().render_for_completion()
+
+        last_n = -1
+        to_process = []
+        while rendered_tokens[last_n] != self.last_tok:
+            to_process.append(rendered_tokens[last_n])
+            last_n -= 1
+        for tok in reversed(to_process):
+            self.parser.process(tok)
+
+        return rendered_tokens
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
new file mode 100644
index 000000000..fbd923d91
--- /dev/null
+++ b/python/sglang/srt/entrypoints/engine.py
@@ -0,0 +1,872 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+The entry point of inference server. (SRT = SGLang Runtime)
+
+This file implements python APIs for the inference engine.
+"""
+
+import asyncio
+import atexit
+import dataclasses
+import logging
+import multiprocessing as mp
+import os
+import random
+import signal
+import threading
+import time
+from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
+
+import zmq
+import zmq.asyncio
+from PIL.Image import Image
+
+# Fix a bug of Python threading
+setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+
+import torch
+import uvloop
+
+from sglang.srt.entrypoints.EngineBase import EngineBase
+from sglang.srt.managers.data_parallel_controller import (
+    run_data_parallel_controller_process,
+)
+from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
+from sglang.srt.managers.io_struct import (
+    EmbeddingReqInput,
+    GenerateReqInput,
+    GetWeightsByNameReqInput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
+    MultimodalDataInputFormat,
+    ReleaseMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqInput,
+    RpcReqInput,
+    RpcReqOutput,
+    UnloadLoRAAdapterReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
+)
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.managers.template_manager import TemplateManager
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils import (
+    MultiprocessingSerializer,
+    assert_pkg_version,
+    configure_logger,
+    get_bool_env_var,
+    get_zmq_socket,
+    is_cuda,
+    kill_process_tree,
+    launch_dummy_health_check_server,
+    prepare_model_and_tokenizer,
+    set_prometheus_multiproc_dir,
+    set_ulimit,
+)
+from sglang.version import __version__
+
+logger = logging.getLogger(__name__)
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+_is_cuda = is_cuda()
+
+
+class Engine(EngineBase):
+    """
+    The entry point to the inference engine.
+
+    - The engine consists of three components:
+        1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
+        2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
+        3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
+
+    Note:
+    1. The HTTP server, Engine, and TokenizerManager all run in the main process.
+    2. Inter-process communication (IPC) is handled via the ZMQ library, with each process using a different port.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`.
+        Please refer to `ServerArgs` for the documentation.
+        """
+        if "server_args" in kwargs:
+            # Directly load server_args
+            server_args = kwargs["server_args"]
+        else:
+            # Construct server_args from kwargs
+            if "log_level" not in kwargs:
+                # Do not print logs by default
+                kwargs["log_level"] = "error"
+            server_args = ServerArgs(**kwargs)
+
+        # Shutdown the subprocesses automatically when the program exits
+        atexit.register(self.shutdown)
+
+        # Allocate ports for inter-process communications
+        self.port_args = PortArgs.init_new(server_args)
+        logger.info(f"{server_args=}")
+
+        # Launch subprocesses
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+            port_args=self.port_args,
+        )
+        self.server_args = server_args
+        self.tokenizer_manager = tokenizer_manager
+        self.template_manager = template_manager
+        self.scheduler_info = scheduler_info
+
+        context = zmq.Context(2)
+        self.send_to_rpc = get_zmq_socket(
+            context, zmq.DEALER, self.port_args.rpc_ipc_name, True
+        )
+
+    def generate(
+        self,
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+        # Can be formatted as:
+        # - Single image for a single request
+        # - List of images (one per request in a batch)
+        # - List of lists of images (multiple images per request)
+        # See also python/sglang/srt/utils.py:load_image for more details.
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+        custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
+        stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        data_parallel_rank: Optional[int] = None,
+    ) -> Union[Dict, Iterator[Dict]]:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
+        Please refer to `GenerateReqInput` for the documentation.
+        """
+        if self.server_args.enable_dp_attention:
+            if data_parallel_rank is None:
+                logger.debug("data_parallel_rank not provided, using default dispatch")
+            elif data_parallel_rank < 0:
+                raise ValueError("data_parallel_rank must be non-negative")
+            elif data_parallel_rank >= self.server_args.dp_size:
+                raise ValueError(
+                    f"data_parallel_rank must be less than dp_size: {self.server_args.dp_size}"
+                )
+
+        obj = GenerateReqInput(
+            text=prompt,
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
+            lora_path=lora_path,
+            custom_logit_processor=custom_logit_processor,
+            return_hidden_states=return_hidden_states,
+            stream=stream,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
+            data_parallel_rank=data_parallel_rank,
+        )
+        loop = asyncio.get_event_loop()
+        generator = self.tokenizer_manager.generate_request(obj, None)
+
+        if stream:
+
+            def generator_wrapper():
+                while True:
+                    try:
+                        chunk = loop.run_until_complete(generator.__anext__())
+                        yield chunk
+                    except StopAsyncIteration:
+                        break
+
+            return generator_wrapper()
+        else:
+            ret = loop.run_until_complete(generator.__anext__())
+            return ret
+
+    async def async_generate(
+        self,
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+        # Can be formatted as:
+        # - Single image for a single request
+        # - List of images (one per request in a batch)
+        # - List of lists of images (multiple images per request)
+        # See also python/sglang/srt/utils.py:load_image for more details.
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+        custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
+        stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        data_parallel_rank: Optional[int] = None,
+    ) -> Union[Dict, AsyncIterator[Dict]]:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
+        Please refer to `GenerateReqInput` for the documentation.
+        """
+
+        if self.server_args.enable_dp_attention:
+            if data_parallel_rank is None:
+                logger.debug("data_parallel_rank not provided, using default dispatch")
+            elif data_parallel_rank < 0:
+                raise ValueError("data_parallel_rank must be non-negative")
+            elif data_parallel_rank >= self.server_args.dp_size:
+                raise ValueError(
+                    f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
+                )
+
+        logger.debug(f"data_parallel_rank: {data_parallel_rank}")
+        obj = GenerateReqInput(
+            text=prompt,
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
+            lora_path=lora_path,
+            return_hidden_states=return_hidden_states,
+            stream=stream,
+            custom_logit_processor=custom_logit_processor,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
+            data_parallel_rank=data_parallel_rank,
+        )
+        generator = self.tokenizer_manager.generate_request(obj, None)
+
+        if stream is True:
+            return generator
+        else:
+            return await generator.__anext__()
+
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+    ) -> Dict:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(
+            text=prompt,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+        )
+        loop = asyncio.get_event_loop()
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        ret = loop.run_until_complete(generator.__anext__())
+        return ret
+
+    async def async_encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+    ) -> Dict:
+        """
+        Asynchronous version of encode method.
+
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(
+            text=prompt,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+        )
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        return await generator.__anext__()
+
+    def rerank(
+        self,
+        prompt: Union[List[List[str]]],
+    ) -> Dict:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(text=prompt, is_cross_encoder_request=True)
+        loop = asyncio.get_event_loop()
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        ret = loop.run_until_complete(generator.__anext__())
+        return ret
+
+    def shutdown(self):
+        """Shutdown the engine"""
+        kill_process_tree(os.getpid(), include_parent=False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.shutdown()
+        return False
+
+    def flush_cache(self):
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(self.tokenizer_manager.flush_cache())
+
+    def start_profile(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(self.tokenizer_manager.start_profile())
+
+    def stop_profile(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(self.tokenizer_manager.stop_profile())
+
+    def start_expert_distribution_record(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(
+            self.tokenizer_manager.start_expert_distribution_record()
+        )
+
+    def stop_expert_distribution_record(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(
+            self.tokenizer_manager.stop_expert_distribution_record()
+        )
+
+    def dump_expert_distribution_record(self):
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(
+            self.tokenizer_manager.dump_expert_distribution_record()
+        )
+
+    def get_server_info(self):
+        loop = asyncio.get_event_loop()
+        internal_states = loop.run_until_complete(
+            self.tokenizer_manager.get_internal_state()
+        )
+        return {
+            **dataclasses.asdict(self.tokenizer_manager.server_args),
+            **self.scheduler_info,
+            "internal_states": internal_states,
+            "version": __version__,
+        }
+
+    def init_weights_update_group(
+        self,
+        master_address: str,
+        master_port: int,
+        rank_offset: int,
+        world_size: int,
+        group_name: str,
+        backend: str = "nccl",
+    ):
+        """Initialize parameter update group."""
+        obj = InitWeightsUpdateGroupReqInput(
+            master_address=master_address,
+            master_port=master_port,
+            rank_offset=rank_offset,
+            world_size=world_size,
+            group_name=group_name,
+            backend=backend,
+        )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.init_weights_update_group(obj, None)
+        )
+
+    def update_weights_from_distributed(
+        self,
+        names: list[str],
+        dtypes: list[str],
+        shapes: list[list[int]],
+        group_name: str = "weight_update_group",
+        flush_cache: bool = True,
+    ):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromDistributedReqInput(
+            names=names,
+            dtypes=dtypes,
+            shapes=shapes,
+            group_name=group_name,
+            flush_cache=flush_cache,
+        )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_distributed(obj, None)
+        )
+
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = True,
+    ):
+        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be false
+        to avoid duplicated cache cleaning operation."""
+        if load_format == "flattened_bucket":
+            serialized_named_tensors = named_tensors
+        else:
+            serialized_named_tensors = [
+                MultiprocessingSerializer.serialize(named_tensors)
+                for _ in range(self.server_args.tp_size)
+            ]
+        obj = UpdateWeightsFromTensorReqInput(
+            serialized_named_tensors=serialized_named_tensors,
+            load_format=load_format,
+            flush_cache=flush_cache,
+        )
+        loop = asyncio.get_event_loop()
+
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_tensor(obj, None)
+        )
+
+    def update_weights_from_disk(
+        self,
+        model_path: str,
+        load_format: Optional[str] = None,
+    ):
+        """Update the weights from disk inplace without re-launching the engine.
+
+        This method allows updating the model weights from disk without restarting
+        the engine. It can be used to load a different model or update weights with
+        new training.
+        """
+        obj = UpdateWeightFromDiskReqInput(
+            model_path=model_path,
+            load_format=load_format,
+        )
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_disk(obj, None)
+        )
+
+    def get_weights_by_name(self, name: str, truncate_size: int = 100):
+        """Get weights by parameter name."""
+        obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.get_weights_by_name(obj, None)
+        )
+
+    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
+        """Load a new LoRA adapter without re-launching the engine."""
+
+        obj = LoadLoRAAdapterReqInput(
+            lora_name=lora_name,
+            lora_path=lora_path,
+            pinned=pinned,
+        )
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.load_lora_adapter(obj, None)
+        )
+
+    def unload_lora_adapter(self, lora_name: str):
+        """Unload a LoRA adapter without re-launching the engine."""
+
+        obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.unload_lora_adapter(obj, None)
+        )
+
+    def release_memory_occupation(self, tags: Optional[List[str]] = None):
+        obj = ReleaseMemoryOccupationReqInput(tags=tags)
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.release_memory_occupation(obj, None)
+        )
+
+    def resume_memory_occupation(self, tags: Optional[List[str]] = None):
+        obj = ResumeMemoryOccupationReqInput(tags=tags)
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.resume_memory_occupation(obj, None)
+        )
+
+    def freeze_gc(self):
+        """
+        To maintain a high performance server with low latency, we want to reduce the
+        stalls caused by the garbage collector scanning through a large number of objects.
+
+        It is usually helpful to start the server and warm it up with real requests to
+        initialize many of the long-lived objects that do not need to be garbage collected.
+
+        After sufficient warmup, we can call this function to freeze the garbage collector
+        so that all objects created before this point are considered out of scope for garbage
+        collection.
+        """
+
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(self.tokenizer_manager.freeze_gc())
+
+    """
+    Execute an RPC call on all scheduler processes.
+    """
+
+    def collective_rpc(self, method: str, **kwargs):
+        obj = RpcReqInput(method=method, parameters=kwargs)
+        self.send_to_rpc.send_pyobj(obj)
+        recv_req = self.send_to_rpc.recv_pyobj(zmq.BLOCKY)
+        assert isinstance(recv_req, RpcReqOutput)
+        assert recv_req.success, recv_req.message
+
+    def save_remote_model(self, **kwargs):
+        self.collective_rpc("save_remote_model", **kwargs)
+
+    def save_sharded_model(self, **kwargs):
+        self.collective_rpc("save_sharded_model", **kwargs)
+
+    def score(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+    ) -> List[List[float]]:
+        """
+        Score the probability of specified token IDs appearing after the given (query + item) pair. For example:
+        query = "<|user|>Is the following city the capital of France? "
+        items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"]
+        label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No"
+        item_first = False
+
+        This would pass the following prompts to the model:
+        "<|user|>Is the following city the capital of France? Paris <|assistant|>"
+        "<|user|>Is the following city the capital of France? London <|assistant|>"
+        "<|user|>Is the following city the capital of France? Berlin <|assistant|>"
+        The api would then return the probabilities of the model producing "Yes" and "No" as the next token.
+        The output would look like:
+        [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]
+
+
+        Args:
+            query: The query text or pre-tokenized query token IDs. Must be provided.
+            items: The item text(s) or pre-tokenized item token IDs. Must be provided.
+            label_token_ids: List of token IDs to compute probabilities for. If None, no token probabilities will be computed.
+            apply_softmax: Whether to normalize probabilities using softmax.
+            item_first: If True, prepend items to query. Otherwise append items to query.
+
+        Returns:
+            List of dictionaries mapping token IDs to their probabilities for each item.
+            Each dictionary in the list corresponds to one item input.
+
+        Raises:
+            ValueError: If query is not provided, or if items is not provided,
+                      or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
+        """
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.score_request(
+                query=query,
+                items=items,
+                label_token_ids=label_token_ids,
+                apply_softmax=apply_softmax,
+                item_first=item_first,
+                request=None,
+            )
+        )
+
+    async def async_score(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+    ) -> List[List[float]]:
+        """
+        Asynchronous version of score method.
+
+        See score() for detailed documentation.
+        """
+        return await self.tokenizer_manager.score_request(
+            query=query,
+            items=items,
+            label_token_ids=label_token_ids,
+            apply_softmax=apply_softmax,
+            item_first=item_first,
+            request=None,
+        )
+
+
+def _set_envs_and_config(server_args: ServerArgs):
+    # Set global environments
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+    os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
+    if not server_args.enable_symm_mem:
+        os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
+    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
+    os.environ["CUDA_MODULE_LOADING"] = "AUTO"
+    # flashinfer uses this environment variable for various kernels from MoE to quant kernels
+    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
+    # Can also be passed as argument
+    os.environ["SGLANG_RUN_ID"] = (
+        f"sglang-run-{time.time()}-{random.randint(0, 100000000)}"
+    )
+
+    # Set prometheus env vars
+    if server_args.enable_metrics:
+        set_prometheus_multiproc_dir()
+
+    # Set ulimit
+    set_ulimit()
+
+    # Check flashinfer version
+    if server_args.attention_backend == "flashinfer":
+        assert_pkg_version(
+            "flashinfer_python",
+            "0.3.1",
+            "Please uninstall the old version and "
+            "reinstall the latest version by following the instructions "
+            "at https://docs.flashinfer.ai/installation.html.",
+        )
+    if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
+        assert_pkg_version(
+            "sgl-kernel",
+            "0.3.9.post2",
+            "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
+        )
+
+    if True:  # Keep this check for internal code compatibility
+        # Register the signal handler.
+        # The child processes will send SIGQUIT to this process when any error happens
+        # This process then clean up the whole process tree
+        # Note: This sigquit handler is used in the launch phase, and may be replaced by
+        # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
+        def launch_phase_sigquit_handler(signum, frame):
+            logger.error(
+                "Received sigquit from a child process. It usually means the child failed."
+            )
+            kill_process_tree(os.getpid())
+
+        signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)
+
+    # Set mp start method
+    mp.set_start_method("spawn", force=True)
+
+
+def _init_tokenizer_manager(
+    server_args: ServerArgs, port_args: PortArgs
+) -> TokenizerManager:
+    # Launch tokenizer process
+    tokenizer_manager = TokenizerManager(server_args, port_args)
+
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    return tokenizer_manager, template_manager
+
+
+def _launch_subprocesses(
+    server_args: ServerArgs, port_args: Optional[PortArgs] = None
+) -> Tuple[TokenizerManager, TemplateManager, Dict]:
+    """
+    Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
+    """
+    # Configure global environment
+    configure_logger(server_args)
+    server_args.check_server_args()
+    _set_envs_and_config(server_args)
+
+    # Allocate ports for inter-process communications
+    if port_args is None:
+        port_args = PortArgs.init_new(server_args)
+        logger.info(f"{server_args=}")
+
+    # If using model from www.modelscope.cn, first download the model.
+    server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
+        server_args.model_path, server_args.tokenizer_path
+    )
+
+    scheduler_procs = []
+    if server_args.dp_size == 1:
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+        scheduler_pipe_readers = []
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
+        tp_rank_range = range(
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+        )
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+        )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        port_args,
+                        gpu_id,
+                        tp_rank,
+                        moe_ep_rank,
+                        pp_rank,
+                        None,
+                        writer,
+                        None,
+                    ),
+                )
+
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
+                scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)
+    else:
+        # Launch the data parallel controller
+        reader, writer = mp.Pipe(duplex=False)
+        scheduler_pipe_readers = [reader]
+        proc = mp.Process(
+            target=run_data_parallel_controller_process,
+            args=(server_args, port_args, writer),
+        )
+        proc.start()
+        scheduler_procs.append(proc)
+
+    if server_args.node_rank >= 1:
+        # In multi-node cases, non-zero rank nodes do not need to run tokenizer or detokenizer,
+        # so they can just wait here.
+
+        for reader in scheduler_pipe_readers:
+            data = reader.recv()
+            assert data["status"] == "ready"
+
+        if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
+            # When using `Engine` as a Python API, we don't want to block here.
+            return None, None, None
+
+        launch_dummy_health_check_server(
+            server_args.host, server_args.port, server_args.enable_metrics
+        )
+
+        for proc in scheduler_procs:
+            proc.join()
+            logger.error(
+                f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
+            )
+        return None, None, None
+
+    # Launch detokenizer process
+    detoken_proc = mp.Process(
+        target=run_detokenizer_process,
+        args=(
+            server_args,
+            port_args,
+        ),
+    )
+    detoken_proc.start()
+
+    # Init tokenizer manager first, as the bootstrap server is initialized here
+    if server_args.tokenizer_worker_num > 1:
+        # Launch multi-tokenizer router
+        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
+        template_manager = None
+    else:
+        tokenizer_manager, template_manager = _init_tokenizer_manager(
+            server_args, port_args
+        )
+
+    # Wait for the model to finish loading
+    scheduler_infos = []
+    for i in range(len(scheduler_pipe_readers)):
+        try:
+            data = scheduler_pipe_readers[i].recv()
+        except EOFError:
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise
+
+        if data["status"] != "ready":
+            raise RuntimeError(
+                "Initialization failed. Please see the error messages above."
+            )
+        scheduler_infos.append(data)
+
+    # Assume all schedulers have the same scheduler_info
+    scheduler_info = scheduler_infos[0]
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
+    return tokenizer_manager, template_manager, scheduler_info
diff --git a/python/sglang/srt/entrypoints/harmony_utils.py b/python/sglang/srt/entrypoints/harmony_utils.py
new file mode 100644
index 000000000..5ebb653b3
--- /dev/null
+++ b/python/sglang/srt/entrypoints/harmony_utils.py
@@ -0,0 +1,372 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
+# Slight differences in processing chat messages
+import datetime
+import json
+from collections.abc import Iterable
+from typing import Literal, Optional, Union
+
+from openai.types.responses import (
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses.response_function_web_search import (
+    ActionFind,
+    ActionOpenPage,
+    ActionSearch,
+    ResponseFunctionWebSearch,
+)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from openai.types.responses.tool import Tool
+from openai_harmony import (
+    Author,
+    Conversation,
+    DeveloperContent,
+    HarmonyEncodingName,
+    Message,
+    ReasoningEffort,
+    Role,
+    StreamableParser,
+    SystemContent,
+    TextContent,
+    ToolDescription,
+    load_harmony_encoding,
+)
+
+from sglang.srt.entrypoints.openai.protocol import ResponseInputOutputItem
+from sglang.srt.utils import random_uuid
+
+REASONING_EFFORT = {
+    "high": ReasoningEffort.HIGH,
+    "medium": ReasoningEffort.MEDIUM,
+    "low": ReasoningEffort.LOW,
+}
+
+_harmony_encoding = None
+
+
+def get_encoding():
+    global _harmony_encoding
+    if _harmony_encoding is None:
+        _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+    return _harmony_encoding
+
+
+def get_system_message(
+    model_identity: Optional[str] = None,
+    reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
+    start_date: Optional[str] = None,
+    browser_description: Optional[str] = None,
+    python_description: Optional[str] = None,
+) -> Message:
+    sys_msg_content = SystemContent.new()
+    if model_identity is not None:
+        sys_msg_content = sys_msg_content.with_model_identity(model_identity)
+    if reasoning_effort is not None:
+        sys_msg_content = sys_msg_content.with_reasoning_effort(
+            REASONING_EFFORT[reasoning_effort]
+        )
+    if start_date is None:
+        start_date = datetime.datetime.now().strftime("%Y-%m-%d")
+    sys_msg_content = sys_msg_content.with_conversation_start_date(start_date)
+    if browser_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(browser_description)
+    if python_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(python_description)
+    sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
+    return sys_msg
+
+
+def get_developer_message(
+    instructions: Optional[str] = None, tools: Optional[list[Tool]] = None
+) -> Message:
+    dev_msg_content = DeveloperContent.new()
+    if instructions is not None:
+        dev_msg_content = dev_msg_content.with_instructions(instructions)
+    if tools is not None:
+        function_tools = []
+        for tool in tools:
+            if tool.type in ("web_search_preview", "code_interpreter"):
+                # These are built-in tools that are added to the system message.
+                pass
+            elif tool.type == "function":
+                function_tools.append(tool)
+            else:
+                raise ValueError(f"tool type {tool.type} not supported")
+        if function_tools:
+            function_tool_descriptions = [
+                ToolDescription.new(
+                    name=tool.name,
+                    description=tool.description,
+                    parameters=tool.parameters,
+                )
+                for tool in function_tools
+            ]
+            dev_msg_content = dev_msg_content.with_function_tools(
+                function_tool_descriptions
+            )
+    dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
+    return dev_msg
+
+
+def get_user_message(content: str) -> Message:
+    return Message.from_role_and_content(Role.USER, content)
+
+
+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]],
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if (
+                isinstance(prev_response, ResponseFunctionToolCall)
+                and prev_response.call_id == call_id
+            ):
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"],
+        )
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
+def parse_response_output(output: ResponseOutputItem) -> Message:
+    if isinstance(output, ResponseOutputMessage):
+        role = output.role
+        contents = [TextContent(text=c.text) for c in output.content]
+        msg = Message.from_role_and_contents(role, contents)
+        return msg
+    elif isinstance(output, ResponseFunctionToolCall):
+        msg = Message.from_role_and_content(Role.ASSISTANT, output.arguments)
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(output.name)
+        msg = msg.with_content_type("json")
+        return msg
+    else:
+        raise ValueError(f"Unknown output type: {type(output)}")
+
+
+def parse_chat_input(chat_msg) -> Message:
+    role = chat_msg.role
+    content = chat_msg.content
+    if isinstance(content, str):
+        contents = [TextContent(text=content)]
+    else:
+        # TODO: Support refusal.
+        contents = [TextContent(text=c.text) for c in content]
+    msg = Message.from_role_and_contents(role, contents)
+    return msg
+
+
+def render_for_completion(messages: list[Message]) -> list[int]:
+    conversation = Conversation.from_messages(messages)
+    token_ids = get_encoding().render_conversation_for_completion(
+        conversation, Role.ASSISTANT
+    )
+    return token_ids
+
+
+def get_stop_tokens_for_assistant_actions() -> list[int]:
+    return get_encoding().stop_tokens_for_assistant_actions()
+
+
+def get_streamable_parser_for_assistant() -> StreamableParser:
+    return StreamableParser(get_encoding(), role=Role.ASSISTANT)
+
+
+def parse_output_message(message: Message):
+    if message.author.role != "assistant":
+        # This is a message from a tool to the assistant (e.g., search result).
+        # Don't include it in the final output for now. This aligns with
+        # OpenAI's behavior on models like o4-mini.
+        return []
+
+    output_items = []
+    recipient = message.recipient
+    if recipient is not None and recipient.startswith("browser."):
+        if len(message.content) != 1:
+            raise ValueError("Invalid number of contents in browser message")
+        content = message.content[0]
+        browser_call = json.loads(content.text)
+        # TODO: translate to url properly!
+        if recipient == "browser.search":
+            action = ActionSearch(
+                query=f"cursor:{browser_call.get('query', '')}", type="search"
+            )
+        elif recipient == "browser.open":
+            action = ActionOpenPage(
+                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+            )
+        elif recipient == "browser.find":
+            action = ActionFind(
+                pattern=browser_call["pattern"],
+                url=f"cursor:{browser_call.get('url', '')}",
+                type="find",
+            )
+        else:
+            raise ValueError(f"Unknown browser action: {recipient}")
+        web_search_item = ResponseFunctionWebSearch(
+            id=f"ws_{random_uuid()}",
+            action=action,
+            status="completed",
+            type="web_search_call",
+        )
+        output_items.append(web_search_item)
+    elif message.channel == "analysis":
+        for content in message.content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                type="reasoning",
+                summary=[],
+                content=[
+                    ResponseReasoningTextContent(
+                        text=content.text, type="reasoning_text"
+                    )
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+    elif message.channel == "commentary":
+        if message.recipient.startswith("functions."):
+            function_name = message.recipient.split(".")[-1]
+            for content in message.content:
+                random_id = random_uuid()
+                response_item = ResponseFunctionToolCall(
+                    arguments=content.text,
+                    call_id=f"call_{random_id}",
+                    type="function_call",
+                    name=function_name,
+                    id=f"ft_{random_id}",
+                )
+                output_items.append(response_item)
+        elif message.recipient.startswith("python") or message.recipient.startswith(
+            "browser"
+        ):
+            for content in message.content:
+                reasoning_item = ResponseReasoningItem(
+                    id=f"rs_{random_uuid()}",
+                    type="reasoning",
+                    summary=[],
+                    content=[
+                        ResponseReasoningTextContent(
+                            text=content.text, type="reasoning_text"
+                        )
+                    ],
+                    status=None,
+                )
+                output_items.append(reasoning_item)
+        else:
+            raise ValueError(f"Unknown recipient: {message.recipient}")
+    elif message.channel == "final":
+        contents = []
+        for content in message.content:
+            output_text = ResponseOutputText(
+                text=content.text,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            contents.append(output_text)
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=contents,
+            role=message.author.role,
+            status="completed",
+            type="message",
+        )
+        output_items.append(text_item)
+    else:
+        raise ValueError(f"Unknown channel: {message.channel}")
+    return output_items
+
+
+def parse_remaining_state(parser: StreamableParser):
+    if not parser.current_content:
+        return []
+    if parser.current_role != Role.ASSISTANT:
+        return []
+    current_recipient = parser.current_recipient
+    if current_recipient is not None and current_recipient.startswith("browser."):
+        return []
+
+    if parser.current_channel == "analysis":
+        reasoning_item = ResponseReasoningItem(
+            id=f"rs_{random_uuid()}",
+            type="reasoning",
+            summary=[],
+            content=[
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
+            ],
+            status=None,
+        )
+        return [reasoning_item]
+    elif parser.current_channel == "final":
+        output_text = ResponseOutputText(
+            content=[
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
+            ],
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=[output_text],
+            role="assistant",
+            status="completed",
+            type="message",
+        )
+        return [text_item]
+    return []
+
+
+def parse_output_into_messages(token_ids: Iterable[int]):
+    parser = get_streamable_parser_for_assistant()
+    for token_id in token_ids:
+        parser.process(token_id)
+    return parser
diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py
new file mode 100644
index 000000000..9adac76ce
--- /dev/null
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -0,0 +1,1398 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+The entry point of inference server. (SRT = SGLang Runtime)
+
+This file implements HTTP APIs for the inference engine via fastapi.
+"""
+
+import asyncio
+import dataclasses
+import json
+import logging
+import multiprocessing as multiprocessing
+import os
+import tempfile
+import threading
+import time
+from http import HTTPStatus
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+
+import setproctitle
+
+# Fix a bug of Python threading
+setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+
+from contextlib import asynccontextmanager
+from typing import AsyncGenerator
+
+import numpy as np
+import orjson
+import requests
+import uvicorn
+import uvloop
+from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
+from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import ORJSONResponse, Response, StreamingResponse
+
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
+from sglang.srt.entrypoints.engine import _launch_subprocesses
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    CompletionRequest,
+    EmbeddingRequest,
+    ErrorResponse,
+    ModelCard,
+    ModelList,
+    ResponsesRequest,
+    ScoringRequest,
+    V1RerankReqInput,
+)
+from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
+from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
+from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
+from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    CloseSessionReqInput,
+    ConfigureLoggingReq,
+    EmbeddingReqInput,
+    GenerateReqInput,
+    GetWeightsByNameReqInput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
+    OpenSessionReqInput,
+    ParseFunctionCallReq,
+    ProfileReqInput,
+    ReleaseMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqInput,
+    SeparateReasoningReqInput,
+    SetInternalStateReq,
+    SlowDownReqInput,
+    UnloadLoRAAdapterReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightVersionReqInput,
+    VertexGenerateReqInput,
+)
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
+from sglang.srt.managers.template_manager import TemplateManager
+from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
+from sglang.srt.metrics.func_timer import enable_func_timer
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    add_api_key_middleware,
+    add_prometheus_middleware,
+    delete_directory,
+    get_bool_env_var,
+    kill_process_tree,
+    set_uvicorn_logging_configs,
+)
+from sglang.srt.warmup import execute_warmups
+from sglang.utils import get_exception_traceback
+from sglang.version import __version__
+
+logger = logging.getLogger(__name__)
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
+
+# Store global states
+@dataclasses.dataclass
+class _GlobalState:
+    tokenizer_manager: TokenizerManager
+    template_manager: TemplateManager
+    scheduler_info: Dict
+
+
+_global_state: Optional[_GlobalState] = None
+
+
+def set_global_state(global_state: _GlobalState):
+    global _global_state
+    _global_state = global_state
+
+
+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
+@asynccontextmanager
+async def lifespan(fast_api_app: FastAPI):
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None,  # pipe_finish_writer not needed in worker
+                None,  # launch_callback not needed in worker
+            ),
+        )
+
+    # Initialize OpenAI serving handlers
+    fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_chat = OpenAIServingChat(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_embedding = OpenAIServingEmbedding(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_score = OpenAIServingScore(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
+        _global_state.tokenizer_manager
+    )
+
+    server_args: ServerArgs = fast_api_app.server_args
+
+    tool_server = None
+    if server_args.tool_server == "demo":
+        from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
+
+        tool_server = DemoToolServer()
+    elif server_args.tool_server:
+        from sglang.srt.entrypoints.openai.tool_server import MCPToolServer
+
+        tool_server = MCPToolServer()
+        await tool_server.add_tool_server(server_args.tool_server)
+
+    try:
+        from sglang.srt.entrypoints.openai.serving_responses import (
+            OpenAIServingResponses,
+        )
+
+        fast_api_app.state.openai_serving_responses = OpenAIServingResponses(
+            _global_state.tokenizer_manager,
+            _global_state.template_manager,
+            enable_prompt_tokens_details=True,
+            enable_force_include_usage=True,
+            tool_server=tool_server,
+        )
+    except Exception as e:
+        import traceback
+
+        traceback.print_exc()
+        logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+
+    if server_args.warmups is not None:
+        await execute_warmups(
+            server_args.disaggregation_mode,
+            server_args.warmups.split(","),
+            _global_state.tokenizer_manager,
+        )
+        logger.info("Warmup ended")
+
+    warmup_thread = getattr(fast_api_app, "warmup_thread", None)
+    if warmup_thread is not None:
+        warmup_thread.start()
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")
+
+
+# Fast API
+app = FastAPI(
+    lifespan=lifespan,
+    openapi_url=None if get_bool_env_var("DISABLE_OPENAPI_DOC") else "/openapi.json",
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.exception_handler(HTTPException)
+async def validation_exception_handler(request: Request, exc: HTTPException):
+    """Enrich HTTP exception with status code and other details"""
+    error = ErrorResponse(
+        object="error",
+        message=exc.detail,
+        type=str(exc.status_code),
+        code=exc.status_code,
+    )
+    return ORJSONResponse(content=error.model_dump(), status_code=exc.status_code)
+
+
+# Custom exception handlers to change validation error status codes
+@app.exception_handler(RequestValidationError)
+async def validation_exception_handler(request: Request, exc: RequestValidationError):
+    """Override FastAPI's default 422 validation error with 400"""
+    exc_str = str(exc)
+    errors_str = str(exc.errors())
+
+    if errors_str and errors_str != exc_str:
+        message = f"{exc_str} {errors_str}"
+    else:
+        message = exc_str
+
+    err = ErrorResponse(
+        message=message,
+        type=HTTPStatus.BAD_REQUEST.phrase,
+        code=HTTPStatus.BAD_REQUEST.value,
+    )
+
+    return ORJSONResponse(
+        status_code=400,
+        content=err.model_dump(),
+    )
+
+
+async def validate_json_request(raw_request: Request):
+    """Validate that the request content-type is application/json."""
+    content_type = raw_request.headers.get("content-type", "").lower()
+    media_type = content_type.split(";", maxsplit=1)[0]
+    if media_type != "application/json":
+        raise RequestValidationError(
+            errors=[
+                {
+                    "loc": ["header", "content-type"],
+                    "msg": "Unsupported Media Type: Only 'application/json' is allowed",
+                    "type": "value_error",
+                }
+            ]
+        )
+
+
+##### Native API endpoints #####
+
+
+@app.get("/health")
+@app.get("/health_generate")
+async def health_generate(request: Request) -> Response:
+    """
+    Check the health of the inference server by sending a special request to generate one token.
+
+    If the server is running something, this request will be ignored, so it creates zero overhead.
+    If the server is not running anything, this request will be run, so we know whether the server is healthy.
+    """
+
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)
+
+    if _global_state.tokenizer_manager.server_status == ServerStatus.Starting:
+        return Response(status_code=503)
+
+    sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
+    rid = f"HEALTH_CHECK_{time.time()}"
+
+    if _global_state.tokenizer_manager.is_image_gen:
+        # Keep this branch for some internal use cases.
+        raise NotImplementedError("Image generation is not supported yet.")
+    elif _global_state.tokenizer_manager.is_generation:
+        gri = GenerateReqInput(
+            rid=rid,
+            input_ids=[0],
+            sampling_params=sampling_params,
+            log_metrics=False,
+        )
+        if (
+            _global_state.tokenizer_manager.server_args.disaggregation_mode
+            != DisaggregationMode.NULL
+        ):
+            gri.bootstrap_host = FAKE_BOOTSTRAP_HOST
+            gri.bootstrap_room = 0
+    else:
+        gri = EmbeddingReqInput(
+            rid=rid, input_ids=[0], sampling_params=sampling_params, log_metrics=False
+        )
+
+    async def gen():
+        async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
+            break
+
+    task = asyncio.create_task(gen())
+
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+        await asyncio.sleep(1)
+        if _global_state.tokenizer_manager.last_receive_tstamp > tic:
+            task.cancel()
+            _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
+            return Response(status_code=200)
+
+    task.cancel()
+    tic_time = time.strftime("%H:%M:%S", time.localtime(tic))
+    last_receive_time = time.strftime(
+        "%H:%M:%S", time.localtime(_global_state.tokenizer_manager.last_receive_tstamp)
+    )
+    logger.error(
+        f"Health check failed. Server couldn't get a response from detokenizer for last "
+        f"{HEALTH_CHECK_TIMEOUT} seconds. tic start time: {tic_time}. "
+        f"last_heartbeat time: {last_receive_time}"
+    )
+    _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+    _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
+    return Response(status_code=503)
+
+
+@app.get("/get_model_info")
+async def get_model_info():
+    """Get the model information."""
+    result = {
+        "model_path": _global_state.tokenizer_manager.model_path,
+        "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
+        "is_generation": _global_state.tokenizer_manager.is_generation,
+        "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version,
+    }
+    return result
+
+
+@app.get("/get_weight_version")
+async def get_weight_version():
+    """Get the current weight version."""
+    return {
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version
+    }
+
+
+@app.get("/get_server_info")
+async def get_server_info():
+    # Returns interna states per DP.
+    internal_states: List[Dict[Any, Any]] = (
+        await _global_state.tokenizer_manager.get_internal_state()
+    )
+    return {
+        **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
+        **_global_state.scheduler_info,
+        "internal_states": internal_states,
+        "version": __version__,
+    }
+
+
+@app.get("/get_load")
+async def get_load():
+    return await _global_state.tokenizer_manager.get_load()
+
+
+# example usage:
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
+@app.api_route("/set_internal_state", methods=["POST", "PUT"])
+async def set_internal_state(obj: SetInternalStateReq, request: Request):
+    res = await _global_state.tokenizer_manager.set_internal_state(obj)
+    return res
+
+
+# fastapi implicitly converts json in the request to obj (dataclass)
+@app.api_route("/generate", methods=["POST", "PUT"])
+async def generate_request(obj: GenerateReqInput, request: Request):
+    """Handle a generate request."""
+    if obj.stream:
+
+        async def stream_results() -> AsyncIterator[bytes]:
+            try:
+                async for out in _global_state.tokenizer_manager.generate_request(
+                    obj, request
+                ):
+                    yield b"data: " + orjson.dumps(
+                        out, option=orjson.OPT_NON_STR_KEYS
+                    ) + b"\n\n"
+            except ValueError as e:
+                out = {"error": {"message": str(e)}}
+                logger.error(f"[http_server] Error: {e}")
+                yield b"data: " + orjson.dumps(
+                    out, option=orjson.OPT_NON_STR_KEYS
+                ) + b"\n\n"
+            yield b"data: [DONE]\n\n"
+
+        return StreamingResponse(
+            stream_results(),
+            media_type="text/event-stream",
+            background=_global_state.tokenizer_manager.create_abort_task(obj),
+        )
+    else:
+        try:
+            ret = await _global_state.tokenizer_manager.generate_request(
+                obj, request
+            ).__anext__()
+            return ret
+        except ValueError as e:
+            logger.error(f"[http_server] Error: {e}")
+            return _create_error_response(e)
+
+
+@app.api_route("/generate_from_file", methods=["POST"])
+async def generate_from_file_request(file: UploadFile, request: Request):
+    """Handle a generate request, this is purely to work with input_embeds."""
+    content = await file.read()
+    input_embeds = json.loads(content.decode("utf-8"))
+
+    obj = GenerateReqInput(
+        input_embeds=input_embeds,
+        sampling_params={
+            "temperature": 0.0,
+            "max_new_tokens": 512,
+        },
+    )
+
+    try:
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
+        return ret
+    except ValueError as e:
+        logger.error(f"Error: {e}")
+        return _create_error_response(e)
+
+
+@app.api_route("/encode", methods=["POST", "PUT"])
+async def encode_request(obj: EmbeddingReqInput, request: Request):
+    """Handle an embedding request."""
+    try:
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
+        return ret
+    except ValueError as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/classify", methods=["POST", "PUT"])
+async def classify_request(obj: EmbeddingReqInput, request: Request):
+    """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
+    try:
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
+        return ret
+    except ValueError as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/flush_cache", methods=["GET", "POST"])
+async def flush_cache():
+    """Flush the radix cache."""
+    ret = await _global_state.tokenizer_manager.flush_cache()
+    return Response(
+        content="Cache flushed.\nPlease check backend logs for more details. "
+        "(When there are running or waiting requests, the operation will not be performed.)\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+@app.api_route("/start_profile", methods=["GET", "POST"])
+async def start_profile_async(obj: Optional[ProfileReqInput] = None):
+    """Start profiling."""
+    if obj is None:
+        obj = ProfileReqInput()
+
+    await _global_state.tokenizer_manager.start_profile(
+        output_dir=obj.output_dir,
+        start_step=obj.start_step,
+        num_steps=obj.num_steps,
+        activities=obj.activities,
+        with_stack=obj.with_stack,
+        record_shapes=obj.record_shapes,
+        profile_by_stage=obj.profile_by_stage,
+    )
+    return Response(
+        content="Start profiling.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/stop_profile", methods=["GET", "POST"])
+async def stop_profile_async():
+    """Stop profiling."""
+    await _global_state.tokenizer_manager.stop_profile()
+    return Response(
+        content="Stop profiling. This will take some time.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/freeze_gc", methods=["GET", "POST"])
+async def freeze_gc_async():
+    """
+    See engine.freeze_gc for more details.
+    """
+    await _global_state.tokenizer_manager.freeze_gc()
+    return Response(
+        content="Garbage collection frozen.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/start_expert_distribution_record", methods=["GET", "POST"])
+async def start_expert_distribution_record_async():
+    """Start recording the expert distribution. Clear the previous record if any."""
+    await _global_state.tokenizer_manager.start_expert_distribution_record()
+    return Response(
+        content="Start recording the expert distribution.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/stop_expert_distribution_record", methods=["GET", "POST"])
+async def stop_expert_distribution_record_async():
+    """Stop recording the expert distribution."""
+    await _global_state.tokenizer_manager.stop_expert_distribution_record()
+    return Response(
+        content="Stop recording the expert distribution.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/dump_expert_distribution_record", methods=["GET", "POST"])
+async def dump_expert_distribution_record_async():
+    """Dump expert distribution record."""
+    await _global_state.tokenizer_manager.dump_expert_distribution_record()
+    return Response(
+        content="Dump expert distribution record.\n",
+        status_code=200,
+    )
+
+
+@app.post("/update_weights_from_disk")
+async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
+    """Update the weights from disk inplace without re-launching the server."""
+    success, message, num_paused_requests = (
+        await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
+    )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
+    content = {
+        "success": success,
+        "message": message,
+        "num_paused_requests": num_paused_requests,
+    }
+    if success:
+        return ORJSONResponse(
+            content,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            content,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.post("/init_weights_update_group")
+async def init_weights_update_group(
+    obj: InitWeightsUpdateGroupReqInput, request: Request
+):
+    """Initialize the parameter update group."""
+    success, message = await _global_state.tokenizer_manager.init_weights_update_group(
+        obj, request
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/update_weights_from_tensor")
+async def update_weights_from_tensor(
+    obj: UpdateWeightsFromTensorReqInput, request: Request
+):
+    """Update the weights from tensor inplace without re-launching the server.
+    Notes:
+    1. Ensure that the model is on the correct device (e.g., GPU) before calling this endpoint. If the model is moved to the CPU unexpectedly, it may cause performance issues or runtime errors.
+    2. HTTP will transmit only the metadata of the tensor, while the tensor itself will be directly copied to the model.
+    3. Any binary data in the named tensors should be base64 encoded.
+    """
+
+    success, message = await _global_state.tokenizer_manager.update_weights_from_tensor(
+        obj, request
+    )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
+@app.post("/update_weights_from_distributed")
+async def update_weights_from_distributed(
+    obj: UpdateWeightsFromDistributedReqInput, request: Request
+):
+    """Update model parameter from distributed online."""
+    success, message = (
+        await _global_state.tokenizer_manager.update_weights_from_distributed(
+            obj, request
+        )
+    )
+
+    # Update weight version if provided and weights update was successful
+    if success and obj.weight_version is not None:
+        _update_weight_version_if_provided(obj.weight_version)
+        message += f" Weight version updated to {obj.weight_version}."
+
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/update_weight_version")
+async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
+    """Update the weight version. This operation requires no active requests."""
+    if obj.abort_all_requests:
+        _global_state.tokenizer_manager.abort_request(abort_all=True)
+
+    # Use a simple approach without the complex lock mechanism for now
+    # since weight_version update is a simple operation that doesn't affect model weights
+    try:
+        # Update the weight version in server args (the single source of truth)
+        _global_state.tokenizer_manager.server_args.weight_version = obj.new_version
+
+        return ORJSONResponse(
+            {
+                "success": True,
+                "message": f"Weight version updated to {obj.new_version}",
+                "new_version": obj.new_version,
+            },
+            status_code=HTTPStatus.OK,
+        )
+    except Exception as e:
+        return ORJSONResponse(
+            {
+                "success": False,
+                "message": f"Failed to update weight version: {str(e)}",
+            },
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/get_weights_by_name", methods=["GET", "POST"])
+async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
+    """Get model parameter by name."""
+    try:
+        ret = await _global_state.tokenizer_manager.get_weights_by_name(obj, request)
+        if ret is None:
+            return _create_error_response("Get parameter by name failed")
+        else:
+            return ORJSONResponse(ret, status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/release_memory_occupation", methods=["GET", "POST"])
+async def release_memory_occupation(
+    obj: ReleaseMemoryOccupationReqInput, request: Request
+):
+    """Release GPU memory occupation temporarily."""
+    try:
+        await _global_state.tokenizer_manager.release_memory_occupation(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/resume_memory_occupation", methods=["GET", "POST"])
+async def resume_memory_occupation(
+    obj: ResumeMemoryOccupationReqInput, request: Request
+):
+    """Resume GPU memory occupation."""
+    try:
+        await _global_state.tokenizer_manager.resume_memory_occupation(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/slow_down", methods=["GET", "POST"])
+async def slow_down(obj: SlowDownReqInput, request: Request):
+    """Slow down the system deliberately. Only for testing. Example scenario:
+    when we want to test performance of D in large-scale PD disaggregation and have no enough nodes for P,
+    we can use this to slow down D to let it have enough running sequences, and then disable slowdown
+    to let it run in full batch size.
+    """
+    try:
+        await _global_state.tokenizer_manager.slow_down(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/load_lora_adapter", methods=["POST"])
+async def load_lora_adapter(obj: LoadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.load_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/unload_lora_adapter", methods=["POST"])
+async def unload_lora_adapter(obj: UnloadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.unload_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/open_session", methods=["GET", "POST"])
+async def open_session(obj: OpenSessionReqInput, request: Request):
+    """Open a session, and return its unique session id."""
+    try:
+        session_id = await _global_state.tokenizer_manager.open_session(obj, request)
+        if session_id is None:
+            raise Exception(
+                "Failed to open the session. Check if a session with the same id is still open."
+            )
+        return session_id
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/close_session", methods=["GET", "POST"])
+async def close_session(obj: CloseSessionReqInput, request: Request):
+    """Close the session."""
+    try:
+        await _global_state.tokenizer_manager.close_session(obj, request)
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/configure_logging", methods=["GET", "POST"])
+async def configure_logging(obj: ConfigureLoggingReq, request: Request):
+    """Configure the request logging options."""
+    _global_state.tokenizer_manager.configure_logging(obj)
+    return Response(status_code=200)
+
+
+@app.post("/abort_request")
+async def abort_request(obj: AbortReq, request: Request):
+    """Abort a request."""
+    try:
+        _global_state.tokenizer_manager.abort_request(
+            rid=obj.rid, abort_all=obj.abort_all
+        )
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.post("/parse_function_call")
+async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
+    """
+    A native API endpoint to parse function calls from a text.
+    """
+    # 1) Initialize the parser based on the request body
+    parser = FunctionCallParser(tools=obj.tools, tool_call_parser=obj.tool_call_parser)
+
+    # 2) Call the non-stream parsing method (non-stream)
+    normal_text, calls = parser.parse_non_stream(obj.text)
+
+    # 3) Organize the response content
+    response_data = {
+        "normal_text": normal_text,
+        "calls": [
+            call.model_dump() for call in calls
+        ],  # Convert pydantic objects to dictionaries
+    }
+
+    return ORJSONResponse(content=response_data, status_code=200)
+
+
+@app.post("/separate_reasoning")
+async def separate_reasoning_request(obj: SeparateReasoningReqInput, request: Request):
+    """
+    A native API endpoint to separate reasoning from a text.
+    """
+    # 1) Initialize the parser based on the request body
+    parser = ReasoningParser(model_type=obj.reasoning_parser)
+
+    # 2) Call the non-stream parsing method (non-stream)
+    reasoning_text, normal_text = parser.parse_non_stream(obj.text)
+
+    # 3) Organize the response content
+    response_data = {
+        "reasoning_text": reasoning_text,
+        "text": normal_text,
+    }
+
+    return ORJSONResponse(content=response_data, status_code=200)
+
+
+@app.post("/pause_generation")
+async def pause_generation(request: Request):
+    """Pause generation."""
+    await _global_state.tokenizer_manager.pause_generation()
+    return ORJSONResponse(
+        content={"message": "Generation paused successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
+@app.post("/continue_generation")
+async def continue_generation(request: Request):
+    """Continue generation."""
+    await _global_state.tokenizer_manager.continue_generation()
+    return ORJSONResponse(
+        content={"message": "Generation continued successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
+##### OpenAI-compatible API endpoints #####
+
+
+@app.post("/v1/completions", dependencies=[Depends(validate_json_request)])
+async def openai_v1_completions(request: CompletionRequest, raw_request: Request):
+    """OpenAI-compatible text completion endpoint."""
+    return await raw_request.app.state.openai_serving_completion.handle_request(
+        request, raw_request
+    )
+
+
+@app.post("/v1/chat/completions", dependencies=[Depends(validate_json_request)])
+async def openai_v1_chat_completions(
+    request: ChatCompletionRequest, raw_request: Request
+):
+    """OpenAI-compatible chat completion endpoint."""
+    return await raw_request.app.state.openai_serving_chat.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/embeddings",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
+    """OpenAI-compatible embeddings endpoint."""
+    return await raw_request.app.state.openai_serving_embedding.handle_request(
+        request, raw_request
+    )
+
+
+@app.get("/v1/models", response_class=ORJSONResponse)
+async def available_models():
+    """Show available models. OpenAI-compatible endpoint."""
+    served_model_names = [_global_state.tokenizer_manager.served_model_name]
+    model_cards = []
+    for served_model_name in served_model_names:
+        model_cards.append(
+            ModelCard(
+                id=served_model_name,
+                root=served_model_name,
+                max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+            )
+        )
+    return ModelList(data=model_cards)
+
+
+@app.get("/v1/models/{model:path}", response_class=ORJSONResponse)
+async def retrieve_model(model: str):
+    """Retrieves a model instance, providing basic information about the model."""
+    served_model_names = [_global_state.tokenizer_manager.served_model_name]
+
+    if model not in served_model_names:
+        return ORJSONResponse(
+            status_code=404,
+            content={
+                "error": {
+                    "message": f"The model '{model}' does not exist",
+                    "type": "invalid_request_error",
+                    "param": "model",
+                    "code": "model_not_found",
+                }
+            },
+        )
+
+    return ModelCard(
+        id=model,
+        root=model,
+        max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+    )
+
+
+@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
+async def v1_score_request(request: ScoringRequest, raw_request: Request):
+    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
+    return await raw_request.app.state.openai_serving_score.handle_request(
+        request, raw_request
+    )
+
+
+@app.post("/v1/responses", dependencies=[Depends(validate_json_request)])
+async def v1_responses_request(request: dict, raw_request: Request):
+    """Endpoint for the responses API with reasoning support."""
+
+    request_obj = ResponsesRequest(**request)
+    result = await raw_request.app.state.openai_serving_responses.create_responses(
+        request_obj, raw_request
+    )
+
+    # Handle streaming responses
+    if isinstance(result, AsyncGenerator):
+        return StreamingResponse(
+            result,
+            media_type="text/event-stream",
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+        )
+
+    return result
+
+
+@app.get("/v1/responses/{response_id}")
+async def v1_retrieve_responses(response_id: str, raw_request: Request):
+    """Retrieve a response by ID."""
+    return await raw_request.app.state.openai_serving_responses.retrieve_responses(
+        response_id
+    )
+
+
+@app.post("/v1/responses/{response_id}/cancel")
+async def v1_cancel_responses(response_id: str, raw_request: Request):
+    """Cancel a background response."""
+    return await raw_request.app.state.openai_serving_responses.cancel_responses(
+        response_id
+    )
+
+
+@app.api_route(
+    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
+)
+async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
+    """Endpoint for reranking documents based on query relevance."""
+    return await raw_request.app.state.openai_serving_rerank.handle_request(
+        request, raw_request
+    )
+
+
+## SageMaker API
+@app.get("/ping")
+async def sagemaker_health() -> Response:
+    """Check the health of the http server."""
+    return Response(status_code=200)
+
+
+@app.post("/invocations")
+async def sagemaker_chat_completions(
+    request: ChatCompletionRequest, raw_request: Request
+):
+    """OpenAI-compatible chat completion endpoint."""
+    return await raw_request.app.state.openai_serving_chat.handle_request(
+        request, raw_request
+    )
+
+
+## Vertex AI API
+@app.post(os.environ.get("AIP_PREDICT_ROUTE", "/vertex_generate"))
+async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
+    if not vertex_req.instances:
+        return []
+    inputs = {}
+    for input_key in ("text", "input_ids", "input_embeds"):
+        if vertex_req.instances[0].get(input_key):
+            inputs[input_key] = [
+                instance.get(input_key) for instance in vertex_req.instances
+            ]
+            break
+    image_data = [
+        instance.get("image_data")
+        for instance in vertex_req.instances
+        if instance.get("image_data") is not None
+    ] or None
+    req = GenerateReqInput(
+        **inputs,
+        image_data=image_data,
+        **(vertex_req.parameters or {}),
+    )
+    ret = await generate_request(req, raw_request)
+    if isinstance(ret, Response):
+        return ret
+    return ORJSONResponse({"predictions": ret})
+
+
+def _update_weight_version_if_provided(weight_version: Optional[str]) -> None:
+    """Update weight version if provided."""
+    if weight_version is not None:
+        _global_state.tokenizer_manager.server_args.weight_version = weight_version
+
+
+def _create_error_response(e):
+    return ORJSONResponse(
+        {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+    )
+
+
+def launch_server(
+    server_args: ServerArgs,
+    pipe_finish_writer: Optional[multiprocessing.connection.Connection] = None,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
+    """
+    Launch SRT (SGLang Runtime) Server.
+
+    The SRT server consists of an HTTP server and an SRT engine.
+
+    - HTTP server: A FastAPI server that routes requests to the engine.
+    - The engine consists of three components:
+        1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
+        2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
+        3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
+
+    Note:
+    1. The HTTP server, Engine, and TokenizerManager both run in the main process.
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
+    """
+    if server_args.tokenizer_worker_num > 1:
+        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+
+    if server_args.tokenizer_worker_num > 1:
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
+
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread
+
+    try:
+        # Update logging configs
+        set_uvicorn_logging_configs()
+        app.server_args = server_args
+        # Listen for HTTP requests
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+
+            monkey_patch_uvicorn_multiprocessing()
+
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            app.is_single_tokenizer_mode = True
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
+        else:
+            warmup_thread.join()
+
+
+def _execute_server_warmup(
+    server_args: ServerArgs,
+    pipe_finish_writer: Optional[multiprocessing.connection.Connection],
+):
+    headers = {}
+    url = server_args.url()
+    if server_args.api_key:
+        headers["Authorization"] = f"Bearer {server_args.api_key}"
+
+    # Wait until the server is launched
+    success = False
+    for _ in range(120):
+        time.sleep(1)
+        try:
+            res = requests.get(url + "/get_model_info", timeout=5, headers=headers)
+            assert res.status_code == 200, f"{res=}, {res.text=}"
+            success = True
+            break
+        except (AssertionError, requests.exceptions.RequestException):
+            last_traceback = get_exception_traceback()
+            pass
+
+    if not success:
+        if pipe_finish_writer is not None:
+            pipe_finish_writer.send(last_traceback)
+        logger.error(f"Initialization failed. warmup error: {last_traceback}")
+        kill_process_tree(os.getpid())
+        return success
+
+    model_info = res.json()
+
+    # Send a warmup request
+    request_name = "/generate" if model_info["is_generation"] else "/encode"
+    max_new_tokens = 8 if model_info["is_generation"] else 1
+    json_data = {
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": max_new_tokens,
+        },
+    }
+    if server_args.skip_tokenizer_init:
+        json_data["input_ids"] = [[10, 11, 12] for _ in range(server_args.dp_size)]
+        # TODO Workaround the bug that embedding errors for list of size 1
+        if server_args.dp_size == 1:
+            json_data["input_ids"] = json_data["input_ids"][0]
+    else:
+        json_data["text"] = ["The capital city of France is"] * server_args.dp_size
+        # TODO Workaround the bug that embedding errors for list of size 1
+        if server_args.dp_size == 1:
+            json_data["text"] = json_data["text"][0]
+
+    # Debug dumping
+    if server_args.debug_tensor_dump_input_file:
+        json_data.pop("text", None)
+        json_data["input_ids"] = np.load(
+            server_args.debug_tensor_dump_input_file
+        ).tolist()
+        json_data["sampling_params"]["max_new_tokens"] = 0
+
+    try:
+        if server_args.disaggregation_mode == "null":
+            res = requests.post(
+                url + request_name,
+                json=json_data,
+                headers=headers,
+                timeout=600,
+            )
+            assert res.status_code == 200, f"{res}"
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
+
+        else:
+            logger.info(f"Start of pd disaggregation warmup ...")
+            json_data = {
+                "sampling_params": {
+                    "temperature": 0.0,
+                    "max_new_tokens": 8,
+                    "ignore_eos": True,
+                },
+                "bootstrap_host": [FAKE_BOOTSTRAP_HOST] * server_args.dp_size,
+                # This is a hack to ensure fake transfer is enabled during prefill warmup
+                # ensure each dp rank has a unique bootstrap_room during prefill warmup
+                "bootstrap_room": [
+                    i * (2**63 // server_args.dp_size) + (i % server_args.tp_size)
+                    for i in range(server_args.dp_size)
+                ],
+                "input_ids": [[0, 1, 2, 3]] * server_args.dp_size,
+            }
+            res = requests.post(
+                url + request_name,
+                json=json_data,
+                headers=headers,
+                timeout=1800,  # because of deep gemm precache is very long if not precache.
+            )
+            if res.status_code == 200:
+                logger.info(
+                    f"End of prefill disaggregation mode warmup with status {res.status_code}, resp: {res.json()}"
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.Up
+            else:
+                logger.info(
+                    "Prefill disaggregation mode warm Up Failed, status code: {}".format(
+                        res.status_code
+                    )
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
+
+    except Exception:
+        last_traceback = get_exception_traceback()
+        if pipe_finish_writer is not None:
+            pipe_finish_writer.send(last_traceback)
+        logger.error(f"Initialization failed. warmup error: {last_traceback}")
+        kill_process_tree(os.getpid())
+        return False
+
+    # Debug print
+    # logger.info(f"warmup request returns: {res.json()=}")
+    return success
+
+
+def _wait_and_warmup(
+    server_args: ServerArgs,
+    pipe_finish_writer: Optional[multiprocessing.connection.Connection],
+    launch_callback: Optional[Callable[[], None]] = None,
+):
+    if not server_args.skip_server_warmup:
+        if not _execute_server_warmup(
+            server_args,
+            pipe_finish_writer,
+        ):
+            return
+    else:
+        _global_state.tokenizer_manager.server_status = ServerStatus.Up
+
+    logger.info("The server is fired up and ready to roll!")
+
+    if pipe_finish_writer is not None:
+        pipe_finish_writer.send("ready")
+
+    if server_args.delete_ckpt_after_loading:
+        delete_directory(server_args.model_path)
+
+    if server_args.debug_tensor_dump_input_file:
+        kill_process_tree(os.getpid())
+
+    if launch_callback is not None:
+        launch_callback()
diff --git a/python/sglang/srt/entrypoints/http_server_engine.py b/python/sglang/srt/entrypoints/http_server_engine.py
new file mode 100644
index 000000000..d1db80d65
--- /dev/null
+++ b/python/sglang/srt/entrypoints/http_server_engine.py
@@ -0,0 +1,142 @@
+import copy
+import dataclasses
+import multiprocessing
+import pickle
+import threading
+import time
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pybase64
+import requests
+import torch
+import torch.distributed as dist
+
+from sglang.srt.entrypoints.EngineBase import EngineBase
+from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import MultiprocessingSerializer, kill_process_tree
+
+
+def launch_server_process(server_args: ServerArgs) -> multiprocessing.Process:
+
+    p = multiprocessing.Process(target=launch_server, args=(server_args,))
+    p.start()
+
+    base_url = server_args.url()
+    timeout = 300.0  # Increased timeout to 5 minutes for downloading large models
+    start_time = time.perf_counter()
+
+    with requests.Session() as session:
+        while time.perf_counter() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {server_args.api_key}",
+                }
+                response = session.get(f"{base_url}/health_generate", headers=headers)
+                if response.status_code == 200:
+                    return p
+            except requests.RequestException:
+                pass
+
+            if not p.is_alive():
+                raise Exception("Server process terminated unexpectedly.")
+
+            time.sleep(2)
+
+    p.terminate()
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+class HttpServerEngineAdapter(EngineBase):
+    """
+    You can use this class to launch a server from a VerlEngine instance.
+    We recommend using this class only you need to use http server.
+    Otherwise, you can use Engine directly.
+    """
+
+    def __init__(self, **kwargs):
+        self.server_args = ServerArgs(**kwargs)
+        print(
+            f"Launch HttpServerEngineAdapter at: {self.server_args.host}:{self.server_args.port}"
+        )
+        self.process = launch_server_process(self.server_args)
+
+    def _make_request(self, endpoint: str, payload: Optional[dict] = None):
+        """Make a POST request to the specified endpoint with the given payload.
+        Args:
+            endpoint: The API endpoint to call
+            payload: The JSON payload to send (default: empty dict)
+        Returns:
+            The JSON response from the server
+        """
+        url = f"http://{self.server_args.host}:{self.server_args.port}/{endpoint}"
+        response = requests.post(url, json=payload or {})
+        response.raise_for_status()
+        return response.json()
+
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = False,
+    ):
+        """
+        Update model weights from tensor data. The HTTP server will only post meta data, and the real weights will be copied directly from GPUs.
+        Note: The model should be on GPUs rather than CPU for this functionality to work properly.
+        If you encounter issues, ensure your model is loaded on GPU devices rather than CPU.
+        """
+
+        return self._make_request(
+            "update_weights_from_tensor",
+            {
+                "serialized_named_tensors": [
+                    MultiprocessingSerializer.serialize(named_tensors, output_str=True)
+                    for _ in range(self.server_args.tp_size)
+                ],
+                "load_format": load_format,
+                "flush_cache": flush_cache,
+            },
+        )
+
+    def shutdown(self):
+        kill_process_tree(self.process.pid)
+
+    def generate(
+        self,
+        prompt=None,
+        sampling_params=None,
+        input_ids=None,
+        image_data=None,
+        return_logprob=False,
+        logprob_start_len=None,
+        top_logprobs_num=None,
+        token_ids_logprob=None,
+        lora_path=None,
+        custom_logit_processor=None,
+    ):
+        payload = {
+            "text": prompt,
+            "sampling_params": sampling_params,
+            "input_ids": input_ids,
+            "image_data": image_data,
+            "return_logprob": return_logprob,
+            "logprob_start_len": logprob_start_len,
+            "top_logprobs_num": top_logprobs_num,
+            "token_ids_logprob": token_ids_logprob,
+            "lora_path": lora_path,
+            "custom_logit_processor": custom_logit_processor,
+        }
+        # Filter out None values
+        payload = {k: v for k, v in payload.items() if v is not None}
+
+        return self._make_request("generate", payload)
+
+    def release_memory_occupation(self):
+        return self._make_request("release_memory_occupation")
+
+    def resume_memory_occupation(self):
+        return self._make_request("resume_memory_occupation")
+
+    def flush_cache(self):
+        return self._make_request("flush_cache")
diff --git a/python/sglang/srt/entrypoints/openai/__init__.py b/python/sglang/srt/entrypoints/openai/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/srt/entrypoints/openai/protocol.py b/python/sglang/srt/entrypoints/openai/protocol.py
new file mode 100644
index 000000000..f73e67d0b
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/protocol.py
@@ -0,0 +1,929 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Pydantic models for OpenAI API protocol"""
+
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, TypeAlias, Union
+
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
+from pydantic import (
+    BaseModel,
+    Field,
+    field_validator,
+    model_serializer,
+    model_validator,
+)
+from typing_extensions import Literal
+
+DEFAULT_MODEL_NAME = "default"
+
+
+class ModelCard(BaseModel):
+    """Model cards."""
+
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "sglang"
+    root: Optional[str] = None
+    max_model_len: Optional[int] = None
+
+
+class ModelList(BaseModel):
+    """Model list consists of model cards."""
+
+    object: str = "list"
+    data: List[ModelCard] = Field(default_factory=list)
+
+
+class ErrorResponse(BaseModel):
+    object: str = "error"
+    message: str
+    type: str
+    param: Optional[str] = None
+    code: int
+
+
+class LogProbs(BaseModel):
+    text_offset: List[int] = Field(default_factory=list)
+    token_logprobs: List[Optional[float]] = Field(default_factory=list)
+    tokens: List[str] = Field(default_factory=list)
+    top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)
+
+
+class TopLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+
+
+class ChatCompletionTokenLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+    top_logprobs: List[TopLogprob]
+
+
+class ChoiceLogprobs(BaseModel):
+    # build for v1/chat/completions response
+    content: List[ChatCompletionTokenLogprob]
+
+
+class UsageInfo(BaseModel):
+    prompt_tokens: int = 0
+    total_tokens: int = 0
+    completion_tokens: Optional[int] = 0
+    # only used to return cached tokens when --enable-cache-report is set
+    prompt_tokens_details: Optional[Dict[str, int]] = None
+    reasoning_tokens: Optional[int] = 0
+
+
+class StreamOptions(BaseModel):
+    include_usage: Optional[bool] = False
+
+
+class JsonSchemaResponseFormat(BaseModel):
+    name: str
+    description: Optional[str] = None
+    # use alias to workaround pydantic conflict
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    strict: Optional[bool] = False
+
+
+class ResponseFormat(BaseModel):
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+class StructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
+class FileRequest(BaseModel):
+    # https://platform.openai.com/docs/api-reference/files/create
+    file: bytes  # The File object (not file name) to be uploaded
+    purpose: str = (
+        "batch"  # The intended purpose of the uploaded file, default is "batch"
+    )
+
+
+class FileResponse(BaseModel):
+    id: str
+    object: str = "file"
+    bytes: int
+    created_at: int
+    filename: str
+    purpose: str
+
+
+class FileDeleteResponse(BaseModel):
+    id: str
+    object: str = "file"
+    deleted: bool
+
+
+class BatchRequest(BaseModel):
+    input_file_id: (
+        str  # The ID of an uploaded file that contains requests for the new batch
+    )
+    endpoint: str  # The endpoint to be used for all requests in the batch
+    completion_window: str  # The time frame within which the batch should be processed
+    metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+class BatchResponse(BaseModel):
+    id: str
+    object: str = "batch"
+    endpoint: str
+    errors: Optional[dict] = None
+    input_file_id: str
+    completion_window: str
+    status: str = "validating"
+    output_file_id: Optional[str] = None
+    error_file_id: Optional[str] = None
+    created_at: int
+    in_progress_at: Optional[int] = None
+    expires_at: Optional[int] = None
+    finalizing_at: Optional[int] = None
+    completed_at: Optional[int] = None
+    failed_at: Optional[int] = None
+    expired_at: Optional[int] = None
+    cancelling_at: Optional[int] = None
+    cancelled_at: Optional[int] = None
+    request_counts: Optional[dict] = None
+    metadata: Optional[dict] = None
+
+
+class CompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
+    model: str = DEFAULT_MODEL_NAME
+    prompt: Union[List[int], List[List[int]], str, List[str]]
+    best_of: Optional[int] = None
+    echo: bool = False
+    frequency_penalty: float = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[int] = None
+    max_tokens: int = 16
+    n: int = 1
+    presence_penalty: float = 0.0
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
+    stream_options: Optional[StreamOptions] = None
+    suffix: Optional[str] = None
+    temperature: float = 1.0
+    top_p: float = 1.0
+    user: Optional[str] = None
+    return_hidden_states: bool = False
+
+    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
+    top_k: int = -1
+    min_p: float = 0.0
+    min_tokens: int = 0
+    json_schema: Optional[str] = None
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
+    repetition_penalty: float = 1.0
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
+
+    # For PD disaggregation
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+
+    @field_validator("max_tokens")
+    @classmethod
+    def validate_max_tokens_positive(cls, v):
+        if v is not None and v <= 0:
+            raise ValueError("max_tokens must be positive")
+        return v
+
+
+class CompletionResponseChoice(BaseModel):
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
+    matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class CompletionResponse(BaseModel):
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseChoice]
+    usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None
+
+
+class CompletionResponseStreamChoice(BaseModel):
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
+    matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class CompletionStreamResponse(BaseModel):
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
+
+
+class ChatCompletionMessageContentTextPart(BaseModel):
+    type: Literal["text"]
+    text: str
+
+
+class ChatCompletionMessageContentImageURL(BaseModel):
+    url: str
+    detail: Optional[Literal["auto", "low", "high"]] = "auto"
+
+
+class ChatCompletionMessageContentVideoURL(BaseModel):
+    url: str
+
+
+class ChatCompletionMessageContentAudioURL(BaseModel):
+    url: str
+
+
+class ChatCompletionMessageContentImagePart(BaseModel):
+    type: Literal["image_url"]
+    image_url: ChatCompletionMessageContentImageURL
+    modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
+
+
+class ChatCompletionMessageContentVideoPart(BaseModel):
+    type: Literal["video_url"]
+    video_url: ChatCompletionMessageContentVideoURL
+
+
+class ChatCompletionMessageContentAudioPart(BaseModel):
+    type: Literal["audio_url"]
+    audio_url: ChatCompletionMessageContentAudioURL
+
+
+ChatCompletionMessageContentPart = Union[
+    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentVideoPart,
+    ChatCompletionMessageContentAudioPart,
+]
+
+
+class FunctionResponse(BaseModel):
+    """Function response."""
+
+    name: Optional[str] = None
+    arguments: Optional[str] = None
+
+
+class ToolCall(BaseModel):
+    """Tool call response."""
+
+    id: Optional[str] = None
+    index: Optional[int] = None
+    type: Literal["function"] = "function"
+    function: FunctionResponse
+
+
+class ChatCompletionMessageGenericParam(BaseModel):
+    role: Literal["system", "assistant", "tool", "function"]
+    content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
+        default=None
+    )
+    tool_call_id: Optional[str] = None
+    name: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+
+    @field_validator("role", mode="before")
+    @classmethod
+    def _normalize_role(cls, v):
+        if isinstance(v, str):
+            v_lower = v.lower()
+            if v_lower not in {"system", "assistant", "tool", "function"}:
+                raise ValueError(
+                    "'role' must be one of 'system', 'assistant', 'tool', or 'function' (case-insensitive)."
+                )
+            return v_lower
+        raise ValueError("'role' must be a string")
+
+
+class ChatCompletionMessageUserParam(BaseModel):
+    role: Literal["user"]
+    content: Union[str, List[ChatCompletionMessageContentPart]]
+
+
+ChatCompletionMessageParam = Union[
+    ChatCompletionMessageGenericParam, ChatCompletionMessageUserParam
+]
+
+
+class Function(BaseModel):
+    """Function descriptions."""
+
+    description: Optional[str] = Field(default=None, examples=[None])
+    name: Optional[str] = None
+    parameters: Optional[object] = None
+    strict: bool = False
+
+
+class Tool(BaseModel):
+    """Function wrapper."""
+
+    type: str = Field(default="function", examples=["function"])
+    function: Function
+
+
+class ToolChoiceFuncName(BaseModel):
+    """The name of tool choice function."""
+
+    name: Optional[str] = None
+
+
+class ToolChoice(BaseModel):
+    """The tool choice definition."""
+
+    function: ToolChoiceFuncName
+    type: Literal["function"] = Field(default="function", examples=["function"])
+
+
+class ChatCompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/chat/create
+    messages: List[ChatCompletionMessageParam]
+    model: str = DEFAULT_MODEL_NAME
+    frequency_penalty: float = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    logprobs: bool = False
+    top_logprobs: Optional[int] = None
+    max_tokens: Optional[int] = Field(
+        default=None,
+        deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
+        description="The maximum number of tokens that can be generated in the chat completion. ",
+    )
+    max_completion_tokens: Optional[int] = Field(
+        default=None,
+        description="The maximum number of completion tokens for a chat completion request, "
+        "including visible output tokens and reasoning tokens. Input tokens are not included. ",
+    )
+    n: int = 1
+    presence_penalty: float = 0.0
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
+    stream_options: Optional[StreamOptions] = None
+    temperature: float = 0.7
+    top_p: float = 1.0
+    user: Optional[str] = None
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+    tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
+        default="auto", examples=["none"]
+    )  # noqa
+    return_hidden_states: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models.",
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_tool_choice_default(cls, values):
+        if values.get("tool_choice") is None:
+            if values.get("tools") is None:
+                values["tool_choice"] = "none"
+            else:
+                values["tool_choice"] = "auto"
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def normalize_reasoning_inputs(cls, values: Dict):
+        r = values.get("reasoning")
+        if r is None:
+            return values
+
+        if isinstance(r, dict):
+            effort = r.get("effort") or r.get("reasoning_effort")
+            if effort in {"low", "medium", "high"}:
+                values["reasoning_effort"] = effort
+
+            enabled = (
+                r.get("enabled")
+                if r.get("enabled") is not None
+                else r.get("enable", False)
+            )
+            if isinstance(enabled, str):
+                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
+            if enabled:
+                ctk = values.get("chat_template_kwargs")
+                if not isinstance(ctk, dict):
+                    ctk = {}
+                ctk.setdefault("thinking", True)
+                values["chat_template_kwargs"] = ctk
+
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_json_schema(cls, values):
+        response_format = values.get("response_format")
+        if not response_format:
+            return values
+
+        if response_format.get("type") != "json_schema":
+            return values
+
+        schema = response_format.pop("schema", None)
+        json_schema = response_format.get("json_schema")
+
+        if json_schema:
+            return values
+
+        if schema:
+            name_ = schema.get("title", "Schema")
+            strict_ = False
+            if "properties" in schema and "strict" in schema["properties"]:
+                item = schema["properties"].pop("strict", None)
+                if item and item.get("default", False):
+                    strict_ = True
+
+            response_format["json_schema"] = {
+                "name": name_,
+                "schema": schema,
+                "strict": strict_,
+            }
+
+        return values
+
+    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
+    top_k: int = -1
+    min_p: float = 0.0
+    min_tokens: int = 0
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
+    repetition_penalty: float = 1.0
+    stop_token_ids: Optional[List[int]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    continue_final_message: bool = False
+    skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None
+    separate_reasoning: bool = True
+    stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None
+
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+
+    # For PD disaggregation
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+
+
+class ChatMessage(BaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+
+
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: Optional[
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
+    ] = None
+    matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class ChatCompletionResponse(BaseModel):
+    id: str
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseChoice]
+    usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None
+
+
+class DeltaMessage(BaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: Optional[
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
+    ] = None
+    matched_stop: Union[None, int, str] = None
+
+
+class ChatCompletionStreamResponse(BaseModel):
+    id: str
+    object: str = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
+
+
+class MultimodalEmbeddingInput(BaseModel):
+    text: Optional[str] = None
+    image: Optional[str] = None
+
+
+EmbeddingInput = Union[
+    List[int], List[List[int]], str, List[str], List[MultimodalEmbeddingInput]
+]
+
+
+class EmbeddingRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/embeddings/create
+    input: EmbeddingInput
+    model: str = DEFAULT_MODEL_NAME
+    encoding_format: str = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+
+
+class EmbeddingObject(BaseModel):
+    embedding: List[float]
+    index: int
+    object: str = "embedding"
+
+
+class EmbeddingResponse(BaseModel):
+    data: List[EmbeddingObject]
+    model: str
+    object: str = "list"
+    usage: Optional[UsageInfo] = None
+
+
+class ScoringRequest(BaseModel):
+    query: Optional[Union[str, List[int]]] = (
+        None  # Query text or pre-tokenized token IDs
+    )
+    items: Optional[Union[str, List[str], List[List[int]]]] = (
+        None  # Item text(s) or pre-tokenized token IDs
+    )
+    label_token_ids: Optional[List[int]] = (
+        None  # Token IDs to compute probabilities for
+    )
+    apply_softmax: bool = False
+    item_first: bool = False
+    model: str = DEFAULT_MODEL_NAME
+
+
+class ScoringResponse(BaseModel):
+    scores: List[
+        List[float]
+    ]  # List of lists of probabilities, each in the order of label_token_ids
+    model: str
+    usage: Optional[UsageInfo] = None
+    object: str = "scoring"
+
+
+class V1RerankReqInput(BaseModel):
+    query: str
+    documents: List[str]
+
+
+class RerankResponse(BaseModel):
+    score: float
+    document: str
+    index: int
+    meta_info: Optional[dict] = None
+
+
+OpenAIServingRequest = Union[
+    ChatCompletionRequest,
+    CompletionRequest,
+    EmbeddingRequest,
+    ScoringRequest,
+    V1RerankReqInput,
+]
+
+
+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None  # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 2 token
+        max_tokens -= 2
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
+@dataclass
+class MessageProcessingResult:
+    """Result of processing chat messages and applying templates.
+
+    This dataclass encapsulates all the outputs from message processing including
+    prompt generation, multimodal data extraction, and constraint preparation.
+    Used internally by OpenAIServingChat to pass processed data between methods.
+
+    Args:
+        prompt: The final text prompt after applying chat template
+        prompt_ids: Either the text prompt (str) or tokenized IDs (List[int])
+        image_data: Extracted image data from messages, if any
+        audio_data: Extracted audio data from messages, if any
+        modalities: List of modality types present in the messages
+        stop: Combined stop strings from template and request
+        tool_call_constraint: Optional constraint for structured tool calls
+    """
+
+    prompt: str
+    prompt_ids: Union[str, List[int]]
+    image_data: Optional[Any]
+    audio_data: Optional[Any]
+    video_data: Optional[Any]
+    modalities: List[str]
+    stop: List[str]
+    tool_call_constraint: Optional[Any] = None
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
diff --git a/python/sglang/srt/entrypoints/openai/serving_base.py b/python/sglang/srt/entrypoints/openai/serving_base.py
new file mode 100644
index 000000000..28b317e6d
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/serving_base.py
@@ -0,0 +1,156 @@
+from __future__ import annotations
+
+import json
+import logging
+import uuid
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from fastapi import HTTPException, Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
+from sglang.srt.managers.io_struct import GenerateReqInput
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+# Base class for specific endpoint handlers
+class OpenAIServingBase(ABC):
+    """Abstract base class for OpenAI endpoint handlers"""
+
+    def __init__(self, tokenizer_manager: TokenizerManager):
+        self.tokenizer_manager = tokenizer_manager
+
+    async def handle_request(
+        self, request: OpenAIServingRequest, raw_request: Request
+    ) -> Union[Any, StreamingResponse, ErrorResponse]:
+        """Handle the specific request type with common pattern"""
+        try:
+            # Validate request
+            error_msg = self._validate_request(request)
+            if error_msg:
+                return self.create_error_response(error_msg)
+
+            # Convert to internal format
+            adapted_request, processed_request = self._convert_to_internal_request(
+                request
+            )
+
+            # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
+            if hasattr(request, "stream") and request.stream:
+                return await self._handle_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+            else:
+                return await self._handle_non_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+        except HTTPException as e:
+            return self.create_error_response(
+                message=e.detail, err_type=str(e.status_code), status_code=e.status_code
+            )
+        except Exception as e:
+            logger.exception(f"Error in request: {e}")
+            return self.create_error_response(
+                message=f"Internal server error: {str(e)}",
+                err_type="InternalServerError",
+                status_code=500,
+            )
+
+    @abstractmethod
+    def _request_id_prefix(self) -> str:
+        """Generate request ID based on request type"""
+        pass
+
+    def _generate_request_id_base(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Generate request ID based on request type"""
+        return None
+
+        # TODO(chang): the rid is used in io_strcut check and often violates `The rid should be a list` AssertionError
+        # Temporarily return None in this function until the rid logic is clear.
+        if rid := getattr(request, "rid", None):
+            return rid
+
+        return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+
+    @abstractmethod
+    def _convert_to_internal_request(
+        self,
+        request: OpenAIServingRequest,
+    ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
+        """Convert OpenAI request to internal format"""
+        pass
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[StreamingResponse, ErrorResponse, ORJSONResponse]:
+        """Handle streaming request
+
+        Override this method in child classes that support streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[Any, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming request
+
+        Override this method in child classes that support non-streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support non-streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    def _validate_request(self, _: OpenAIServingRequest) -> Optional[str]:
+        """Validate request"""
+        pass
+
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        """Create an error response"""
+        # TODO: remove fastapi dependency in openai and move response handling to the entrypoint
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=param,
+            code=status_code,
+        )
+        return ORJSONResponse(content=error.model_dump(), status_code=status_code)
+
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        """Create a streaming error response"""
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=None,
+            code=status_code,
+        )
+        return json.dumps({"error": error.model_dump()})
diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py
new file mode 100644
index 000000000..d67cbfde3
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/serving_chat.py
@@ -0,0 +1,1083 @@
+from __future__ import annotations
+
+import copy
+import json
+import logging
+import time
+import uuid
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
+    ChatMessage,
+    ChoiceLogprobs,
+    DeltaMessage,
+    ErrorResponse,
+    FunctionResponse,
+    LogProbs,
+    MessageProcessingResult,
+    ToolCall,
+    TopLogprob,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
+from sglang.srt.entrypoints.openai.utils import (
+    process_hidden_states_from_ret,
+    to_openai_style_logprobs,
+)
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.utils import convert_json_schema_to_str
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingChat(OpenAIServingBase):
+    """Handler for /v1/chat/completions requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+
+    def _request_id_prefix(self) -> str:
+        return "chatcmpl-"
+
+    def _validate_request(self, request: ChatCompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        if not request.messages:
+            return "Messages cannot be empty."
+
+        if (
+            isinstance(request.tool_choice, str)
+            and request.tool_choice.lower() == "required"
+            and not request.tools
+        ):
+            return "Tools cannot be empty if tool choice is set to required."
+
+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ):
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
+        if request.response_format and request.response_format.type == "json_schema":
+            schema = getattr(request.response_format.json_schema, "schema_", None)
+            if schema is None:
+                return "schema_ is required for json_schema response format request."
+
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: ChatCompletionRequest,
+    ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
+        reasoning_effort = (
+            request.chat_template_kwargs.pop("reasoning_effort", None)
+            if request.chat_template_kwargs
+            else None
+        )
+        if reasoning_effort is not None:
+            request.reasoning_effort = reasoning_effort
+
+        """Convert OpenAI chat completion request to internal format"""
+        is_multimodal = self.tokenizer_manager.model_config.is_multimodal
+
+        # Process messages and apply chat template
+        processed_messages = self._process_messages(request, is_multimodal)
+
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(
+            request,
+            processed_messages.stop,
+            processed_messages.tool_call_constraint,
+        )
+
+        # Handle single vs multiple requests
+        if is_multimodal:
+            prompt_kwargs = {"text": processed_messages.prompt}
+        else:
+            if isinstance(processed_messages.prompt_ids, str):
+                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            else:
+                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            image_data=processed_messages.image_data,
+            video_data=processed_messages.video_data,
+            audio_data=processed_messages.audio_data,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs,
+            logprob_start_len=-1,
+            top_logprobs_num=request.top_logprobs or 0,
+            stream=request.stream,
+            return_text_in_logprobs=True,
+            modalities=processed_messages.modalities,
+            lora_path=request.lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            return_hidden_states=request.return_hidden_states,
+            rid=request.rid,
+        )
+
+        return adapted_request, request
+
+    def _process_messages(
+        self, request: ChatCompletionRequest, is_multimodal: bool
+    ) -> MessageProcessingResult:
+        """Process chat messages and apply chat template"""
+        is_gpt_oss = (
+            hasattr(self.tokenizer_manager.model_config, "hf_config")
+            and hasattr(self.tokenizer_manager.model_config.hf_config, "model_type")
+            and self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        # GptOss model needs to keep special tokens for harmony parsing
+        if is_gpt_oss:
+            request.skip_special_tokens = False
+
+        tool_call_constraint = None
+
+        # Apply chat template and its stop strings
+        tools = None
+        if request.tools and request.tool_choice != "none":
+            request.skip_special_tokens = False
+            if not isinstance(request.tool_choice, str):
+                tools = [
+                    item.function.model_dump()
+                    for item in request.tools
+                    if item.function.name == request.tool_choice.function.name
+                ]
+            else:
+                tools = [item.function.model_dump() for item in request.tools]
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
+
+        # Use chat template
+        if self.template_manager.chat_template_name is None:
+            result = self._apply_jinja_template(request, tools, is_multimodal)
+        else:
+            result = self._apply_conversation_template(request, is_multimodal)
+
+        result.tool_call_constraint = tool_call_constraint
+        return result
+
+    def _apply_jinja_template(
+        self,
+        request: ChatCompletionRequest,
+        tools: Optional[List[Dict]],
+        is_multimodal: bool,
+    ) -> MessageProcessingResult:
+        """Apply Jinja chat template"""
+        prompt = ""
+        prompt_ids = []
+        openai_compatible_messages = []
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        template_content_format = self.template_manager.jinja_template_content_format
+
+        for message in request.messages:
+            if message.content is None:
+                message.content = ""
+            msg_dict = message.model_dump()
+
+            # Process content based on detected template format
+            processed_msg = process_content_for_template_format(
+                msg_dict,
+                template_content_format,
+                image_data,
+                video_data,
+                audio_data,
+                modalities,
+            )
+
+            # per the Transformers docs & maintainers, tool call arguments in
+            # assistant-role messages with tool_calls need to be dicts not JSON str -
+            # this is how tool-use chat templates will expect them moving forwards
+            # so, for messages that have tool_calls, parse the string (which we get
+            # from openAI format) to dict
+            if (
+                processed_msg["role"] == "assistant"
+                and "tool_calls" in processed_msg
+                and isinstance(processed_msg["tool_calls"], list)
+            ):
+                for item in processed_msg["tool_calls"]:
+                    if "arguments" in item["function"] and isinstance(
+                        item["function"]["arguments"], str
+                    ):
+                        item["function"]["arguments"] = json.loads(
+                            item["function"]["arguments"]
+                        )
+
+            openai_compatible_messages.append(processed_msg)
+
+        # Handle assistant prefix for continue_final_message
+        assistant_prefix = None
+        if (
+            openai_compatible_messages
+            and openai_compatible_messages[-1]["role"] == "assistant"
+        ):
+            if request.continue_final_message:
+                assistant_prefix = openai_compatible_messages[-1]["content"]
+                openai_compatible_messages = openai_compatible_messages[:-1]
+
+        try:
+            prompt_ids = self.tokenizer_manager.tokenizer.apply_chat_template(
+                openai_compatible_messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                tools=tools,
+                reasoning_effort=request.reasoning_effort,
+                **(
+                    request.chat_template_kwargs if request.chat_template_kwargs else {}
+                ),
+            )
+        except Exception:
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
+            tools = (
+                [t if "function" in t else {"function": t} for t in tools]
+                if tools
+                else None
+            )
+            prompt_ids = self.tokenizer_manager.tokenizer.apply_chat_template(
+                openai_compatible_messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                tools=tools,
+                reasoning_effort=request.reasoning_effort,
+                **(
+                    request.chat_template_kwargs if request.chat_template_kwargs else {}
+                ),
+            )
+
+        if assistant_prefix:
+            encoded = self.tokenizer_manager.tokenizer.encode(assistant_prefix)
+            if encoded and encoded[0] == self.tokenizer_manager.tokenizer.bos_token_id:
+                encoded = encoded[1:]
+            prompt_ids += encoded
+
+        if is_multimodal:
+            prompt = self.tokenizer_manager.tokenizer.decode(prompt_ids)
+
+        stop = request.stop
+        image_data = image_data if image_data else None
+        audio_data = audio_data if audio_data else None
+        video_data = video_data if video_data else None
+        modalities = modalities if modalities else []
+        return MessageProcessingResult(
+            prompt=prompt,
+            prompt_ids=prompt_ids,
+            image_data=image_data,
+            video_data=video_data,
+            audio_data=audio_data,
+            modalities=modalities,
+            stop=stop,
+        )
+
+    def _apply_conversation_template(
+        self,
+        request: ChatCompletionRequest,
+        is_multimodal: bool,
+    ) -> MessageProcessingResult:
+        """Apply conversation template"""
+        prompt = ""
+        prompt_ids = []
+        conv = generate_chat_conv(request, self.template_manager.chat_template_name)
+
+        # If we should continue the final assistant message, adjust the conversation.
+        if (
+            request.continue_final_message
+            and request.messages
+            and request.messages[-1].role == "assistant"
+        ):
+            # Remove the auto-added blank assistant turn, if present.
+            if conv.messages and conv.messages[-1][1] is None:
+                conv.messages.pop()
+            # Rebuild the prompt from the conversation.
+            prompt = conv.get_prompt()
+            # Strip trailing stop tokens or separators that indicate end-of-assistant.
+            if isinstance(conv.stop_str, list):
+                for stop_token in conv.stop_str:
+                    if prompt.endswith(stop_token):
+                        prompt = prompt[: -len(stop_token)]
+            elif isinstance(conv.stop_str, str) and prompt.endswith(conv.stop_str):
+                prompt = prompt[: -len(conv.stop_str)]
+            if conv.sep and prompt.endswith(conv.sep):
+                prompt = prompt[: -len(conv.sep)]
+            if getattr(conv, "sep2", None) and prompt.endswith(conv.sep2):
+                prompt = prompt[: -len(conv.sep2)]
+        else:
+            prompt = conv.get_prompt()
+            if self._get_enable_thinking_from_request(request):
+                prompt += "<think>"  # Note(Xinyuan): hard code thinking token
+
+        image_data = conv.image_data if conv.image_data else None
+        video_data = conv.video_data if conv.video_data else None
+        audio_data = conv.audio_data if conv.audio_data else None
+        modalities = conv.modalities if conv.modalities else []
+        stop = copy.copy(conv.stop_str or [] if not request.ignore_eos else [])
+
+        if request.stop:
+            if isinstance(request.stop, str):
+                stop.append(request.stop)
+            else:
+                stop.extend(request.stop)
+
+        if not is_multimodal:
+            prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
+
+        return MessageProcessingResult(
+            prompt=prompt,
+            prompt_ids=prompt_ids,
+            image_data=image_data,
+            video_data=video_data,
+            audio_data=audio_data,
+            modalities=modalities,
+            stop=stop,
+        )
+
+    def _build_sampling_params(
+        self,
+        request: ChatCompletionRequest,
+        stop: List[str],
+        tool_call_constraint: Optional[Any],
+    ) -> Dict[str, Any]:
+        """Build sampling parameters for the request"""
+
+        sampling_params = {
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens or request.max_completion_tokens,
+            "min_new_tokens": request.min_tokens,
+            "stop": stop,
+            "stop_token_ids": request.stop_token_ids,
+            "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "repetition_penalty": request.repetition_penalty,
+            "regex": request.regex,
+            "ebnf": request.ebnf,
+            "n": request.n,
+            "no_stop_trim": request.no_stop_trim,
+            "ignore_eos": request.ignore_eos,
+            "skip_special_tokens": request.skip_special_tokens,
+            "logit_bias": request.logit_bias,
+        }
+
+        if request.response_format and request.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                request.response_format.json_schema.schema_
+            )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif (
+            request.response_format and request.response_format.type == "structural_tag"
+        ):
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                request.response_format.model_dump(by_alias=True)
+            )
+
+        # Check if there are already existing output constraints
+        has_existing_constraints = (
+            sampling_params.get("regex")
+            or sampling_params.get("ebnf")
+            or sampling_params.get("structural_tag")
+            or sampling_params.get("json_schema")
+        )
+
+        if tool_call_constraint and has_existing_constraints:
+            logger.warning("Constrained decoding is not compatible with tool calls.")
+        elif tool_call_constraint:
+            constraint_type, constraint_value = tool_call_constraint
+            if constraint_type == "structural_tag":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value.model_dump(by_alias=True)
+                )
+            else:
+                sampling_params[constraint_type] = constraint_value
+        return sampling_params
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> StreamingResponse:
+        """Handle streaming chat completion request"""
+        return StreamingResponse(
+            self._generate_chat_stream(adapted_request, request, raw_request),
+            media_type="text/event-stream",
+            background=self.tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    async def _generate_chat_stream(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> AsyncGenerator[str, None]:
+        """Generate streaming chat completion response"""
+        # Parsers for tool calls and reasoning
+        parser_dict = {}
+        reasoning_parser_dict = {}
+
+        # State tracking for streaming
+        is_firsts = {}
+        stream_buffers = {}
+        n_prev_tokens = {}
+        has_tool_calls = {}
+        finish_reasons = {}
+
+        # Usage tracking
+        prompt_tokens = {}
+        completion_tokens = {}
+        cached_tokens = {}
+        hidden_states = {}
+
+        try:
+            async for content in self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ):
+                index = content.get("index", 0)
+
+                prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
+                completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
+                hidden_states[index] = content["meta_info"].get("hidden_states", None)
+
+                # Handle logprobs
+                choice_logprobs = None
+                if request.logprobs:
+                    choice_logprobs = self._process_streaming_logprobs(
+                        content, n_prev_tokens.get(index, 0)
+                    )
+                    n_prev_tokens[index] = len(
+                        content["meta_info"]["output_token_logprobs"]
+                    )
+
+                finish_reason = content["meta_info"]["finish_reason"]
+                finish_reason_type = finish_reason["type"] if finish_reason else None
+
+                # Track finish_reason for each index
+                if finish_reason_type:
+                    finish_reasons[index] = finish_reason
+
+                # First chunk with role
+                if is_firsts.get(index, True):
+                    is_firsts[index] = False
+                    delta = DeltaMessage(role="assistant", content="")
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=delta,
+                        finish_reason=None,
+                        logprobs=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
+                stream_buffer = stream_buffers.get(index, "")
+                delta = content["text"][len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta
+
+                # Handle reasoning content
+                if (
+                    self.tokenizer_manager.server_args.reasoning_parser
+                    and request.separate_reasoning
+                ):
+                    reasoning_text, delta = self._process_reasoning_stream(
+                        index, delta, reasoning_parser_dict, content, request
+                    )
+                    if reasoning_text:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=reasoning_text),
+                            finish_reason=None,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+                # Handle tool calls
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
+                    async for chunk in self._process_tool_call_stream(
+                        index,
+                        delta,
+                        parser_dict,
+                        content,
+                        request,
+                        has_tool_calls,
+                    ):
+                        if chunk:
+                            yield chunk
+
+                    # Send any remaining tool call arguments when generation finishes
+                    if finish_reason_type is not None and index in parser_dict:
+                        parser = parser_dict[index]
+                        remaining_chunk = self._check_for_unstreamed_tool_args(
+                            parser, content, request, index
+                        )
+                        if remaining_chunk:
+                            yield remaining_chunk
+
+                else:
+                    # Regular content
+                    if delta:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(content=delta),
+                            finish_reason=None,
+                            matched_stop=None,
+                            logprobs=choice_logprobs,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+            # Send finish_reason chunks for each index that completed
+            for idx, finish_reason_data in finish_reasons.items():
+                finish_reason_type = finish_reason_data["type"]
+
+                # Change finish_reason to "tool_calls" if we had tool calls and stopped naturally
+                final_finish_reason = finish_reason_type
+                if has_tool_calls.get(idx, False) and finish_reason_type == "stop":
+                    final_finish_reason = "tool_calls"
+
+                finish_reason_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"][
+                        "id"
+                    ],  # NOTE: openai uses the same chatcmpl-id for all indices
+                    created=int(time.time()),
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=idx,
+                            delta=DeltaMessage(),
+                            finish_reason=final_finish_reason,
+                            matched_stop=(
+                                finish_reason_data["matched"]
+                                if "matched" in finish_reason_data
+                                else None
+                            ),
+                        )
+                    ],
+                    model=request.model,
+                    usage=None,
+                )
+                yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
+
+            # Send hidden states if requested
+            if request.return_hidden_states and hidden_states:
+                for index, choice_hidden_states in hidden_states.items():
+                    if choice_hidden_states:
+                        last_token_hidden_states = (
+                            choice_hidden_states[-1]
+                            if len(choice_hidden_states) > 1
+                            else []
+                        )
+                        hidden_states_chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[
+                                ChatCompletionResponseStreamChoice(
+                                    index=index,
+                                    delta=DeltaMessage(
+                                        hidden_states=last_token_hidden_states
+                                    ),
+                                    finish_reason=None,  # Hidden states don't need finish_reason
+                                )
+                            ],
+                            model=request.model,
+                        )
+                        yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
+
+            # Additional usage chunk
+            if request.stream_options and request.stream_options.include_usage:
+                usage = UsageProcessor.calculate_streaming_usage(
+                    prompt_tokens,
+                    completion_tokens,
+                    cached_tokens,
+                    n_choices=request.n,
+                    enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+                )
+                usage_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=int(time.time()),
+                    choices=[],  # Empty choices array as per OpenAI spec
+                    model=request.model,
+                    usage=usage,
+                )
+                yield f"data: {usage_chunk.model_dump_json()}\n\n"
+
+        except ValueError as e:
+            error = self.create_streaming_error_response(str(e))
+            yield f"data: {error}\n\n"
+
+        yield "data: [DONE]\n\n"
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> Union[ChatCompletionResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming chat completion request"""
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_chat_response(
+            request,
+            ret,
+            int(time.time()),
+        )
+
+        return response
+
+    def _build_chat_response(
+        self,
+        request: ChatCompletionRequest,
+        ret: List[Dict[str, Any]],
+        created: int,
+    ) -> Union[ChatCompletionResponse, ORJSONResponse]:
+        """Build chat completion response from generation results"""
+        choices = []
+
+        for idx, ret_item in enumerate(ret):
+            # Process logprobs
+            choice_logprobs = None
+            if request.logprobs:
+                choice_logprobs = self._process_response_logprobs(ret_item)
+
+            # Handle hidden states
+            hidden_states = process_hidden_states_from_ret(ret_item, request)
+
+            finish_reason = ret_item["meta_info"]["finish_reason"]
+            text = ret_item["text"]
+
+            # Handle reasoning content
+            reasoning_text = None
+            reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+            if reasoning_parser and request.separate_reasoning:
+                is_force_reasoning = (
+                    self.template_manager.force_reasoning
+                    or self._get_enable_thinking_from_request(request)
+                )
+                try:
+                    parser = ReasoningParser(
+                        model_type=reasoning_parser,
+                        stream_reasoning=False,
+                        force_reasoning=is_force_reasoning,
+                    )
+                    reasoning_text, text = parser.parse_non_stream(text)
+                except Exception as e:
+                    logger.error(f"Reasoning parsing error: {e}")
+                    return self.create_error_response(
+                        "Failed to parse reasoning content",
+                        err_type="InternalServerError",
+                        status_code=500,
+                    )
+
+            # Handle tool calls
+            tool_calls = None
+            if (
+                request.tool_choice != "none"
+                and request.tools
+                and self.tool_call_parser
+            ):
+                tool_calls, text, finish_reason = self._process_tool_calls(
+                    text, request.tools, finish_reason
+                )
+
+            choice_data = ChatCompletionResponseChoice(
+                index=idx,
+                message=ChatMessage(
+                    role="assistant",
+                    content=text if text else None,
+                    tool_calls=tool_calls,
+                    reasoning_content=reasoning_text if reasoning_text else None,
+                ),
+                logprobs=choice_logprobs,
+                finish_reason=finish_reason["type"] if finish_reason else None,
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
+                ),
+                hidden_states=hidden_states,
+            )
+            choices.append(choice_data)
+
+        # Calculate usage
+        usage = UsageProcessor.calculate_response_usage(
+            ret,
+            n_choices=request.n,
+            enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+        )
+
+        return ChatCompletionResponse(
+            id=ret[0]["meta_info"]["id"],
+            created=created,
+            model=request.model,
+            choices=choices,
+            usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
+        )
+
+    def _process_logprobs_tokens(
+        self, logprobs: LogProbs, use_token_index: bool = False
+    ) -> List[ChatCompletionTokenLogprob]:
+        """Common helper to process logprobs tokens for both streaming and non-streaming
+
+        Args:
+            logprobs: LogProbs data from model
+            use_token_index: True for non-streaming (use token_idx), False for streaming (use index 0)
+        """
+        token_logprobs = []
+
+        for token_idx, (token, logprob) in enumerate(
+            zip(logprobs.tokens, logprobs.token_logprobs)
+        ):
+            token_bytes = list(token.encode("utf-8"))
+            top_logprobs = []
+            if logprobs.top_logprobs:
+                # - Non-streaming (use_token_index=True): uses token_idx for full data
+                # - Streaming (use_token_index=False): uses index 0 for pre-sliced data
+                top_logprobs_idx = token_idx if use_token_index else 0
+                for top_token, top_logprob in logprobs.top_logprobs[
+                    top_logprobs_idx
+                ].items():
+                    top_token_bytes = list(top_token.encode("utf-8"))
+                    top_logprobs.append(
+                        TopLogprob(
+                            token=top_token,
+                            bytes=top_token_bytes,
+                            logprob=top_logprob,
+                        )
+                    )
+            token_logprobs.append(
+                ChatCompletionTokenLogprob(
+                    token=token,
+                    bytes=token_bytes,
+                    logprob=logprob,
+                    top_logprobs=top_logprobs,
+                )
+            )
+
+        return token_logprobs
+
+    def _process_response_logprobs(self, ret_item: Dict[str, Any]) -> ChoiceLogprobs:
+        """Process logprobs for non-streaming response"""
+        logprobs = to_openai_style_logprobs(
+            output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
+            output_top_logprobs=ret_item["meta_info"].get("output_top_logprobs", None),
+        )
+
+        token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
+        return ChoiceLogprobs(content=token_logprobs)
+
+    def _process_tool_calls(
+        self,
+        text: str,
+        tools: List[Any],
+        finish_reason: Dict[str, Any],
+    ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
+        """Process tool calls in the response"""
+        parser = FunctionCallParser(tools, self.tool_call_parser)
+        if parser.has_tool_call(text):
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                text, call_info_list = parser.parse_non_stream(text)
+                tool_calls = []
+                for call_info in call_info_list:
+                    # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
+                    if (
+                        self.tool_call_parser == "kimi_k2"
+                        and call_info.name is not None
+                    ):
+                        tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
+                    else:
+                        tool_id = f"call_{uuid.uuid4().hex[:24]}"
+
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=getattr(call_info, "tool_index", None),
+                            function=FunctionResponse(
+                                name=call_info.name, arguments=call_info.parameters
+                            ),
+                        )
+                    )
+                return tool_calls, text, finish_reason
+            except Exception as e:
+                logger.error(f"Tool call parsing error: {e}")
+                # Return error but don't fail the whole request
+                return None, text, finish_reason
+
+        return None, text, finish_reason
+
+    def _process_streaming_logprobs(
+        self, content: Dict[str, Any], n_prev_token: int
+    ) -> ChoiceLogprobs:
+        """Process logprobs for streaming response"""
+        logprobs = to_openai_style_logprobs(
+            output_token_logprobs=content["meta_info"]["output_token_logprobs"][
+                n_prev_token:
+            ],
+            output_top_logprobs=content["meta_info"].get("output_top_logprobs", [])[
+                n_prev_token:
+            ],
+        )
+
+        token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=False)
+        return ChoiceLogprobs(content=token_logprobs)
+
+    def _process_reasoning_stream(
+        self,
+        index: int,
+        delta: str,
+        reasoning_parser_dict: Dict[int, ReasoningParser],
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+    ) -> tuple[Optional[str], str]:
+        """Process reasoning content in streaming response"""
+        if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_enable_thinking_from_request(request)
+            )
+            reasoning_parser_dict[index] = ReasoningParser(
+                self.tokenizer_manager.server_args.reasoning_parser,
+                request.stream_reasoning,
+                is_force_reasoning,
+            )
+        reasoning_parser = reasoning_parser_dict[index]
+        return reasoning_parser.parse_stream_chunk(delta)
+
+    def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
+        """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
+
+        NOTE: This parameter is only useful for models that support enable_thinking
+        flag, such as Qwen3.
+
+        Args:
+            request_obj: The request object (or an item from a list of requests).
+        Returns:
+            The boolean value of 'enable_thinking' if found, otherwise False.
+        """
+        if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
+            # For Qwen3 models, `enable_thinking` is supported.
+            if request.chat_template_kwargs.get("enable_thinking") is not None:
+                return request.chat_template_kwargs.get("enable_thinking")
+            # For DeepSeek-V3.1 models, `thinking` is supported.
+            elif request.chat_template_kwargs.get("thinking") is not None:
+                return request.chat_template_kwargs.get("thinking")
+            else:
+                return False
+        return False
+
+    async def _process_tool_call_stream(
+        self,
+        index: int,
+        delta: str,
+        parser_dict: Dict[int, FunctionCallParser],
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        has_tool_calls: Dict[int, bool],
+    ):
+        """Process tool calls in streaming response"""
+        if index not in parser_dict:
+            parser_dict[index] = FunctionCallParser(
+                tools=request.tools,
+                tool_call_parser=self.tool_call_parser,
+            )
+        parser = parser_dict[index]
+
+        normal_text, calls = parser.parse_stream_chunk(delta)
+
+        # Yield normal text
+        if normal_text:
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(content=normal_text),
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+        # Yield tool calls
+        for call_item in calls:
+            # Mark that this choice has tool calls
+            has_tool_calls[index] = True
+
+            # Tool call ID should be generated only once per tool call
+            if call_item.name:
+                # First chunk: include ID and function name
+                if self.tool_call_parser == "kimi_k2":
+                    # Align with Kimi-K2 format: functions.{name}:{index}
+                    tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
+                else:
+                    tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+                function_name = call_item.name
+            else:
+                # Subsequent chunks: null ID and name for argument deltas
+                tool_call_id = None
+                function_name = None
+
+            tool_call = ToolCall(
+                id=tool_call_id,
+                index=call_item.tool_index,
+                function=FunctionResponse(
+                    name=function_name,
+                    arguments=call_item.parameters,
+                ),
+            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+    def _check_for_unstreamed_tool_args(
+        self,
+        parser: FunctionCallParser,
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        index: int,
+    ) -> Optional[str]:
+        """
+        Check for any remaining tool call arguments that need to be streamed
+        when generation finishes. This ensures tool calls are properly completed
+        even if the model generates the final arguments in the last chunk.
+        """
+        # Only check if we have tool calls and the parser has tracked data
+        if (
+            not hasattr(parser.detector, "prev_tool_call_arr")
+            or not parser.detector.prev_tool_call_arr
+        ):
+            return None
+
+        if (
+            not hasattr(parser.detector, "streamed_args_for_tool")
+            or not parser.detector.streamed_args_for_tool
+        ):
+            return None
+
+        # Get the last tool call that was being processed
+        tool_index = len(parser.detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+            return None
+
+        # Get expected vs actual arguments
+        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
+            "arguments", {}
+        )
+        expected_call = json.dumps(expected_args, ensure_ascii=False)
+        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+
+        # Check if there are remaining arguments to send
+        remaining_call = (
+            expected_call.replace(actual_call, "", 1)
+            if actual_call in expected_call
+            else ""
+        )
+
+        if remaining_call:
+            # Create tool call chunk with remaining arguments
+            tool_call = ToolCall(
+                id=None,  # No ID for argument deltas
+                index=tool_index,
+                function=FunctionResponse(
+                    name=None,  # No name for argument deltas
+                    arguments=remaining_call,
+                ),
+            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,  # Don't send finish_reason with this chunk
+            )
+
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+
+            return f"data: {chunk.model_dump_json()}\n\n"
+
+        return None
diff --git a/python/sglang/srt/entrypoints/openai/serving_completions.py b/python/sglang/srt/entrypoints/openai/serving_completions.py
new file mode 100644
index 000000000..6fe02d325
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -0,0 +1,455 @@
+from __future__ import annotations
+
+import logging
+import time
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    CompletionResponseStreamChoice,
+    CompletionStreamResponse,
+    ErrorResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
+from sglang.srt.entrypoints.openai.utils import (
+    process_hidden_states_from_ret,
+    to_openai_style_logprobs,
+)
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.parser.code_completion_parser import (
+    generate_completion_prompt_from_request,
+)
+from sglang.utils import convert_json_schema_to_str
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingCompletion(OpenAIServingBase):
+    """Handler for /v1/completion requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+
+    def _request_id_prefix(self) -> str:
+        return "cmpl-"
+
+    def _validate_request(self, request: CompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        prompt = request.prompt
+        if not prompt or (isinstance(prompt, list) and all(not p for p in prompt)):
+            return "Prompt cannot be empty"
+
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: CompletionRequest,
+    ) -> tuple[GenerateReqInput, CompletionRequest]:
+        """Convert OpenAI completion request to internal format"""
+        # NOTE: with openai API, the prompt's logprobs are always not computed
+        if request.echo and request.logprobs:
+            logger.warning(
+                "Echo is not compatible with logprobs. "
+                "To compute logprobs of input prompt, please use the native /generate API."
+            )
+        # Process prompt
+        prompt = request.prompt
+        if self.template_manager.completion_template_name is not None:
+            prompt = generate_completion_prompt_from_request(request)
+
+        # Set logprob start length based on echo and logprobs
+        if request.echo and request.logprobs:
+            logprob_start_len = 0
+        else:
+            logprob_start_len = -1
+
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(request)
+
+        # Determine prompt format
+        if isinstance(prompt, str) or (
+            isinstance(prompt, list) and isinstance(prompt[0], str)
+        ):
+            prompt_kwargs = {"text": prompt}
+        else:
+            prompt_kwargs = {"input_ids": prompt}
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs is not None,
+            top_logprobs_num=request.logprobs if request.logprobs is not None else 0,
+            logprob_start_len=logprob_start_len,
+            return_text_in_logprobs=True,
+            stream=request.stream,
+            lora_path=request.lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            return_hidden_states=request.return_hidden_states,
+            rid=request.rid,
+        )
+
+        return adapted_request, request
+
+    def _build_sampling_params(self, request: CompletionRequest) -> Dict[str, Any]:
+        """Build sampling parameters for the request"""
+        # Start with common parameters
+        sampling_params = {
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "min_new_tokens": request.min_tokens,
+            "stop": request.stop,
+            "stop_token_ids": request.stop_token_ids,
+            "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "repetition_penalty": request.repetition_penalty,
+            "regex": request.regex,
+            "json_schema": request.json_schema,
+            "ebnf": request.ebnf,
+            "n": request.n,
+            "no_stop_trim": request.no_stop_trim,
+            "ignore_eos": request.ignore_eos,
+            "skip_special_tokens": request.skip_special_tokens,
+            "logit_bias": request.logit_bias,
+        }
+
+        # Handle response_format constraints
+        if request.response_format and request.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                request.response_format.json_schema.schema_
+            )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif (
+            request.response_format and request.response_format.type == "structural_tag"
+        ):
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                request.response_format.model_dump(by_alias=True)
+            )
+
+        return sampling_params
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: CompletionRequest,
+        raw_request: Request,
+    ) -> StreamingResponse:
+        """Handle streaming completion request"""
+        return StreamingResponse(
+            self._generate_completion_stream(adapted_request, request, raw_request),
+            media_type="text/event-stream",
+            background=self.tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    async def _generate_completion_stream(
+        self,
+        adapted_request: GenerateReqInput,
+        request: CompletionRequest,
+        raw_request: Request,
+    ) -> AsyncGenerator[str, None]:
+        """Generate streaming completion response"""
+        created = int(time.time())
+
+        # State tracking for streaming
+        stream_buffers = {}
+        n_prev_tokens = {}
+
+        # Usage tracking
+        prompt_tokens = {}
+        completion_tokens = {}
+        cached_tokens = {}
+        hidden_states = {}
+
+        try:
+            async for content in self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ):
+                index = content.get("index", 0)
+
+                text = content["text"]
+                prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
+                completion_tokens[index] = content["meta_info"]["completion_tokens"]
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
+                hidden_states[index] = content["meta_info"].get("hidden_states", None)
+
+                stream_buffer = stream_buffers.get(index, "")
+                # Handle echo for first chunk
+                if not stream_buffer:  # The first chunk
+                    if request.echo:
+                        echo_text = self._get_echo_text(request, index)
+                        text = echo_text + text
+
+                # Handle logprobs
+                logprobs = None
+                if request.logprobs is not None:
+                    # The first chunk and echo is enabled.
+                    if not stream_buffer and request.echo:
+                        input_token_logprobs = content["meta_info"][
+                            "input_token_logprobs"
+                        ]
+                        input_top_logprobs = content["meta_info"]["input_top_logprobs"]
+                    else:
+                        input_token_logprobs = None
+                        input_top_logprobs = None
+
+                    n_prev_token = n_prev_tokens.get(index, 0)
+                    logprobs = to_openai_style_logprobs(
+                        input_token_logprobs=input_token_logprobs,
+                        input_top_logprobs=input_top_logprobs,
+                        output_token_logprobs=content["meta_info"][
+                            "output_token_logprobs"
+                        ][n_prev_token:],
+                        output_top_logprobs=content["meta_info"]["output_top_logprobs"][
+                            n_prev_token:
+                        ],
+                    )
+                    n_prev_tokens[index] = len(
+                        content["meta_info"]["output_token_logprobs"]
+                    )
+
+                # Generate delta
+                delta = text[len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta
+                finish_reason = content["meta_info"]["finish_reason"]
+
+                choice_data = CompletionResponseStreamChoice(
+                    index=index,
+                    text=delta,
+                    logprobs=logprobs,
+                    finish_reason=finish_reason["type"] if finish_reason else None,
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
+                    ),
+                )
+                chunk = CompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    object="text_completion",
+                    choices=[choice_data],
+                    model=request.model,
+                )
+
+                yield f"data: {chunk.model_dump_json()}\n\n"
+
+            if request.return_hidden_states and hidden_states:
+                for index, choice_hidden_states in hidden_states.items():
+                    if choice_hidden_states:
+                        last_token_hidden_states = (
+                            choice_hidden_states[-1]
+                            if len(choice_hidden_states) > 1
+                            else []
+                        )
+                        hidden_states_chunk = CompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=created,
+                            object="text_completion",
+                            choices=[
+                                CompletionResponseStreamChoice(
+                                    index=index,
+                                    text="",
+                                    hidden_states=last_token_hidden_states,
+                                    finish_reason=None,
+                                )
+                            ],
+                            model=request.model,
+                        )
+                        yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
+
+            # Handle final usage chunk
+            if request.stream_options and request.stream_options.include_usage:
+                usage = UsageProcessor.calculate_streaming_usage(
+                    prompt_tokens,
+                    completion_tokens,
+                    cached_tokens,
+                    n_choices=request.n,
+                    enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+                )
+                final_usage_chunk = CompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    choices=[],
+                    model=request.model,
+                    usage=usage,
+                )
+                final_usage_data = final_usage_chunk.model_dump_json(exclude_none=True)
+                yield f"data: {final_usage_data}\n\n"
+
+        except Exception as e:
+            error = self.create_streaming_error_response(str(e))
+            yield f"data: {error}\n\n"
+
+        yield "data: [DONE]\n\n"
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: CompletionRequest,
+        raw_request: Request,
+    ) -> Union[CompletionResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming completion request"""
+        try:
+            generator = self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            )
+            ret = await generator.__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_completion_response(
+            request,
+            ret,
+            int(time.time()),
+        )
+
+        return response
+
+    def _build_completion_response(
+        self,
+        request: CompletionRequest,
+        ret: List[Dict[str, Any]],
+        created: int,
+    ) -> CompletionResponse:
+        """Build completion response from generation results"""
+        choices = []
+        echo = False
+
+        # Prepare echo prompts if needed
+        echo_prompts = []
+        if request.echo:
+            echo_prompts = self._prepare_echo_prompts(request)
+            echo = True
+
+        for idx, ret_item in enumerate(ret):
+            text = ret_item["text"]
+
+            # Handle echo
+            if echo:
+                prompt_index = idx // request.n
+                text = echo_prompts[prompt_index] + text
+
+            # Handle logprobs
+            logprobs = None
+            if request.logprobs is not None:
+                if echo:
+                    input_token_logprobs = ret_item["meta_info"]["input_token_logprobs"]
+                    input_top_logprobs = ret_item["meta_info"]["input_top_logprobs"]
+                else:
+                    input_token_logprobs = None
+                    input_top_logprobs = None
+
+                logprobs = to_openai_style_logprobs(
+                    input_token_logprobs=input_token_logprobs,
+                    input_top_logprobs=input_top_logprobs,
+                    output_token_logprobs=ret_item["meta_info"][
+                        "output_token_logprobs"
+                    ],
+                    output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
+                )
+
+            # Handle hidden states
+            hidden_states = process_hidden_states_from_ret(ret_item, request)
+
+            finish_reason = ret_item["meta_info"]["finish_reason"]
+
+            choice_data = CompletionResponseChoice(
+                index=idx,
+                text=text,
+                logprobs=logprobs,
+                finish_reason=finish_reason["type"] if finish_reason else None,
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
+                ),
+                hidden_states=hidden_states,
+            )
+            choices.append(choice_data)
+
+        # Calculate usage
+        cache_report = self.tokenizer_manager.server_args.enable_cache_report
+        usage = UsageProcessor.calculate_response_usage(
+            ret, n_choices=request.n, enable_cache_report=cache_report
+        )
+
+        return CompletionResponse(
+            id=ret[0]["meta_info"]["id"],
+            model=request.model,
+            created=created,
+            choices=choices,
+            usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
+        )
+
+    def _get_echo_text(self, request: CompletionRequest, index: int) -> str:
+        """Get echo text for streaming response"""
+        if isinstance(request.prompt, str):
+            # for the case of single str prompts
+            return request.prompt
+        elif isinstance(request.prompt, list):
+            if isinstance(request.prompt[0], str):
+                # for the case of multiple str prompts
+                return request.prompt[index // request.n]
+            elif isinstance(request.prompt[0], int):
+                # for the case of single token ids prompt
+                return self.tokenizer_manager.tokenizer.decode(
+                    request.prompt, skip_special_tokens=True
+                )
+            elif isinstance(request.prompt[0], list) and isinstance(
+                request.prompt[0][0], int
+            ):
+                # for the case of multiple token ids prompts
+                return self.tokenizer_manager.tokenizer.decode(
+                    request.prompt[index // request.n],
+                    skip_special_tokens=True,
+                )
+        return ""
+
+    def _prepare_echo_prompts(self, request: CompletionRequest) -> List[str]:
+        """Prepare echo prompts for non-streaming response"""
+        # TODO: handle the case prompt is token ids
+        if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
+            # for the case of multiple str prompts
+            return request.prompt
+        elif isinstance(request.prompt, list) and isinstance(request.prompt[0], list):
+            # for the case of multiple token ids prompts
+            return [
+                self.tokenizer_manager.tokenizer.decode(
+                    prompt, skip_special_tokens=True
+                )
+                for prompt in request.prompt
+            ]
+        elif isinstance(request.prompt, list) and isinstance(request.prompt[0], int):
+            # for the case of single token ids prompt
+            return [
+                self.tokenizer_manager.tokenizer.decode(
+                    request.prompt, skip_special_tokens=True
+                )
+            ]
+        else:
+            # for the case of single str prompt
+            return [request.prompt]
diff --git a/python/sglang/srt/entrypoints/openai/serving_embedding.py b/python/sglang/srt/entrypoints/openai/serving_embedding.py
new file mode 100644
index 000000000..63c4fc34a
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/serving_embedding.py
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    EmbeddingObject,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    ErrorResponse,
+    MultimodalEmbeddingInput,
+    UsageInfo,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import EmbeddingReqInput
+from sglang.srt.parser.conversation import generate_embedding_convs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+
+class OpenAIServingEmbedding(OpenAIServingBase):
+    """Handler for v1/embeddings requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+
+    def _request_id_prefix(self) -> str:
+        return "embd-"
+
+    def _validate_request(self, request: EmbeddingRequest) -> Optional[str]:
+        """Validate that the input is not empty or whitespace only."""
+        if not (input := request.input):
+            return "Input cannot be empty"
+
+        # Handle single string
+        if isinstance(input, str):
+            if not input.strip():
+                return "Input cannot be empty or whitespace only"
+            return None
+
+        # Handle list inputs
+        if isinstance(input, list):
+            if len(input) == 0:
+                return "Input cannot be empty"
+
+            # Check first element to determine type
+            first_item = input[0]
+
+            if isinstance(first_item, str):
+                # List of strings
+                for i, item in enumerate(input):
+                    if not isinstance(item, str):
+                        return f"All items in input list must be strings"
+                    if not item.strip():
+                        return f"Input at index {i} cannot be empty or whitespace only"
+            elif isinstance(first_item, int):
+                # List of integers (token IDs)
+                for i, item in enumerate(input):
+                    if not isinstance(item, int):
+                        return f"All items in input list must be integers"
+                    if item < 0:
+                        return f"Token ID at index {i} must be non-negative"
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: EmbeddingRequest,
+    ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
+        """Convert OpenAI embedding request to internal format"""
+        prompt = request.input
+
+        if isinstance(prompt, str):
+            # Single string input
+            prompt_kwargs = {"text": prompt}
+        elif isinstance(prompt, list):
+            if len(prompt) > 0 and isinstance(prompt[0], str):
+                prompt_kwargs = {"text": prompt}
+            elif len(prompt) > 0 and isinstance(prompt[0], MultimodalEmbeddingInput):
+                # Handle multimodal embedding inputs
+                texts = []
+                images = []
+                for item in prompt:
+                    # Use padding for text if None - this could be improved
+                    texts.append(item.text if item.text is not None else "padding")
+                    images.append(item.image if item.image is not None else None)
+
+                generate_prompts = []
+                # Check if we have a chat template for multimodal embeddings
+                if self.template_manager.chat_template_name is not None:
+                    convs = generate_embedding_convs(
+                        texts, images, self.template_manager.chat_template_name
+                    )
+                    for conv in convs:
+                        generate_prompts.append(conv.get_prompt())
+                else:
+                    generate_prompts = texts
+
+                if len(generate_prompts) == 1:
+                    prompt_kwargs = {
+                        "text": generate_prompts[0],
+                        "image_data": images[0],
+                    }
+                else:
+                    prompt_kwargs = {
+                        "text": generate_prompts,
+                        "image_data": images,
+                    }
+            else:
+                # List of integers (token IDs) or empty list
+                prompt_kwargs = {"input_ids": prompt}
+        else:
+            # Other types (should not happen but handle gracefully)
+            prompt_kwargs = {"input_ids": prompt}
+
+        adapted_request = EmbeddingReqInput(
+            **prompt_kwargs,
+            rid=request.rid,
+        )
+
+        return adapted_request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: EmbeddingReqInput,
+        request: EmbeddingRequest,
+        raw_request: Request,
+    ) -> Union[EmbeddingResponse, ErrorResponse, ORJSONResponse]:
+        """Handle the embedding request"""
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_embedding_response(ret)
+        return response
+
+    def _build_embedding_response(self, ret: List[Dict[str, Any]]) -> EmbeddingResponse:
+        """Build the embedding response"""
+        embedding_objects = []
+        prompt_tokens = 0
+
+        for idx, ret_item in enumerate(ret):
+            embedding_objects.append(
+                EmbeddingObject(
+                    embedding=ret_item["embedding"],
+                    index=idx,
+                )
+            )
+            # Handle missing prompt_tokens gracefully
+            meta_info = ret_item.get("meta_info", {})
+            prompt_tokens += meta_info.get("prompt_tokens", 0)
+
+        return EmbeddingResponse(
+            data=embedding_objects,
+            model=self.tokenizer_manager.model_path,
+            usage=UsageInfo(
+                prompt_tokens=prompt_tokens,
+                total_tokens=prompt_tokens,
+            ),
+        )
diff --git a/python/sglang/srt/entrypoints/openai/serving_rerank.py b/python/sglang/srt/entrypoints/openai/serving_rerank.py
new file mode 100644
index 000000000..b053c55b3
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/serving_rerank.py
@@ -0,0 +1,102 @@
+import logging
+from typing import Any, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ErrorResponse,
+    RerankResponse,
+    V1RerankReqInput,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import EmbeddingReqInput
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingRerank(OpenAIServingBase):
+    """Handler for /v1/rerank requests"""
+
+    # NOTE: /v1/rerank is not an official OpenAI endpoint. This module may be moved
+    # to another module in the future.
+
+    def _request_id_prefix(self) -> str:
+        return "rerank-"
+
+    def _validate_request(self, request: V1RerankReqInput) -> Optional[str]:
+        """Validate rerank request format and content"""
+        if not request.query:
+            return "Query cannot be empty"
+
+        if isinstance(request.query, str):
+            if not request.query.strip():
+                return "Query cannot be empty or whitespace only"
+
+        if not request.documents:
+            return "Documents cannot be empty"
+
+        for doc in request.documents:
+            if not doc:
+                return "Each document must be a non-empty string"
+            if isinstance(doc, str) and not doc.strip():
+                return "Each document cannot be empty or whitespace only"
+
+        return None
+
+    def _convert_to_internal_request(
+        self, request: V1RerankReqInput
+    ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
+        """Convert OpenAI rerank request to internal embedding format"""
+        # Create pairs of [query, document] for each document
+        pairs = []
+        for doc in request.documents:
+            pairs.append([request.query, doc])
+
+        adapted_request = EmbeddingReqInput(
+            text=pairs,
+            is_cross_encoder_request=True,
+        )
+
+        return adapted_request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: EmbeddingReqInput,
+        request: V1RerankReqInput,
+        raw_request: Request,
+    ) -> Union[List[RerankResponse], ErrorResponse, ORJSONResponse]:
+        """Handle the rerank request"""
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        responses = self._build_rerank_response(ret, request)
+        return responses
+
+    def _build_rerank_response(
+        self, ret: List[Dict[str, Any]], request: V1RerankReqInput
+    ) -> List[RerankResponse]:
+        """Build the rerank response from generation results"""
+        responses = []
+        for idx, ret_item in enumerate(ret):
+            responses.append(
+                RerankResponse(
+                    score=ret_item["embedding"],
+                    document=request.documents[idx],
+                    index=idx,
+                    meta_info=ret_item["meta_info"],
+                )
+            )
+
+        # Sort by score in descending order (highest relevance first)
+        responses.sort(key=lambda x: x.score, reverse=True)
+
+        return responses
diff --git a/python/sglang/srt/entrypoints/openai/serving_responses.py b/python/sglang/srt/entrypoints/openai/serving_responses.py
new file mode 100644
index 000000000..3f7619678
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/serving_responses.py
@@ -0,0 +1,1276 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from vLLM's OpenAIServingResponses
+"""Handler for /v1/responses requests"""
+from __future__ import annotations
+
+import asyncio
+import copy
+import json
+import logging
+import time
+from contextlib import AsyncExitStack
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
+
+import jinja2
+import openai.types.responses as openai_responses_types
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+from openai.types.responses import (
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from openai_harmony import Message as OpenAIMessage
+
+from sglang.srt.entrypoints.context import (
+    ConversationContext,
+    HarmonyContext,
+    SimpleContext,
+    StreamingHarmonyContext,
+)
+from sglang.srt.entrypoints.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_system_message,
+    get_user_message,
+    parse_output_message,
+    parse_remaining_state,
+    parse_response_input,
+    render_for_completion,
+)
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionMessageParam,
+    ChatCompletionRequest,
+    PromptTokenUsageInfo,
+    RequestResponseMetadata,
+    ResponsesRequest,
+    ResponsesResponse,
+    UsageInfo,
+)
+from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.utils import random_uuid
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingResponses(OpenAIServingChat):
+    """Handler for /v1/responses requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+        *,
+        enable_prompt_tokens_details: bool = False,
+        enable_force_include_usage: bool = False,
+        tool_server: Optional[ToolServer] = None,
+    ) -> None:
+        super().__init__(tokenizer_manager, template_manager)
+
+        # template_manager is already set by parent class
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+        self.enable_prompt_tokens_details = enable_prompt_tokens_details
+        self.enable_force_include_usage = enable_force_include_usage
+
+        # Get default sampling params from model config if available
+        self.default_sampling_params = {}
+
+        self.supports_browsing = (
+            tool_server.has_tool("browser") if tool_server else False
+        )
+        self.supports_code_interpreter = (
+            tool_server.has_tool("python") if tool_server else False
+        )
+        self.tool_server = tool_server
+        # Get from model config
+        self.use_harmony = (
+            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        if self.use_harmony:
+            # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
+            # We need to add them to the stop token ids.
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions()
+            )
+
+        # Response storage for background and retrieval operations
+        # Note: In production, this should use a proper storage backend (Redis, database)
+        # with TTL/expiration to prevent memory leaks
+        self.response_store: dict[str, ResponsesResponse] = {}
+        self.response_store_lock = asyncio.Lock()
+
+        # Message storage for conversation continuity
+        # Note: In production, this should use a proper storage backend (Redis, database)
+        # with TTL/expiration to prevent memory leaks
+        self.msg_store: dict[
+            str, Union[list[ChatCompletionMessageParam], list["OpenAIMessage"]]
+        ] = {}
+
+        self.background_tasks: dict[str, asyncio.Task] = {}
+
+    def _request_id_prefix(self) -> str:
+        return "resp_"
+
+    async def create_responses(
+        self,
+        request: ResponsesRequest,
+        raw_request: Optional[Request] = None,
+    ) -> Union[AsyncGenerator[str, None], ResponsesResponse, ORJSONResponse]:
+        # Validate model
+        if not self.tokenizer_manager:
+            return self.create_error_response("Model not loaded")
+
+        # FIXME: If the engine is dead, raise an error
+        # This is required for the streaming case
+
+        # Handle the previous response ID
+        prev_response_id = request.previous_response_id
+        if prev_response_id is not None:
+            if not prev_response_id.startswith("resp_"):
+                return self._make_invalid_id_error(prev_response_id)
+            async with self.response_store_lock:
+                prev_response = self.response_store.get(prev_response_id)
+            if prev_response is None:
+                return self._make_not_found_error(prev_response_id)
+        else:
+            prev_response = None
+
+        try:
+            model_name = request.model
+            tokenizer = self.tokenizer_manager.tokenizer
+
+            if self.use_harmony:
+                messages, request_prompts, engine_prompts = (
+                    self._make_request_with_harmony(request, prev_response)
+                )
+            else:
+                messages, request_prompts, engine_prompts = await self._make_request(
+                    request, prev_response, tokenizer
+                )
+
+        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(f"{e} {e.__cause__}")
+
+        request_metadata = RequestResponseMetadata(request_id=request.request_id)
+        if raw_request:
+            raw_request.state.request_metadata = request_metadata
+
+        if (
+            self.tool_server is not None
+            and isinstance(self.tool_server, MCPToolServer)
+            and (request.background or request.stream)
+            and request.tools
+            and any(
+                tool.type in ["web_search_preview", "code_interpreter"]
+                for tool in request.tools
+            )
+        ):
+            return self.create_error_response(
+                "MCP tool server is not supported in background mode and "
+                "streaming mode"
+            )
+
+        # Schedule the request and get the result generator
+        generators: list[AsyncGenerator[Any, None]] = []
+        tool_list = []
+        if self.use_harmony:
+            if self.supports_browsing:
+                tool_list.append("browser")
+            if self.supports_code_interpreter:
+                tool_list.append("python")
+        async with AsyncExitStack() as exit_stack:
+            try:
+                if self.tool_server is not None:
+                    tool_session_ctxs: dict[str, Any] = {
+                        tool_name: exit_stack.enter_async_context(
+                            self.tool_server.get_tool_session(tool_name)
+                        )
+                        for tool_name in tool_list
+                    }
+                    tool_sessions = {}
+                    for tool_name in tool_list:
+                        tool_sessions[tool_name] = await tool_session_ctxs[tool_name]
+                else:
+                    assert len(tool_list) == 0
+                    tool_sessions = {}
+                for i, engine_prompt in enumerate(engine_prompts):
+                    # Calculate default max tokens from context length minus prompt length
+                    if hasattr(engine_prompt, "__len__"):
+                        prompt_length = len(engine_prompt)
+                    elif isinstance(engine_prompt, list):
+                        prompt_length = len(engine_prompt)
+                    else:
+                        prompt_length = 0
+
+                    context_len = (
+                        self.tokenizer_manager.model_config.context_len
+                        if hasattr(self.tokenizer_manager.model_config, "context_len")
+                        else 4096
+                    )
+                    default_max_tokens = max(
+                        context_len - prompt_length, 512
+                    )  # Ensure minimum 512 tokens
+                    sampling_params = request.to_sampling_params(
+                        default_max_tokens, self.default_sampling_params
+                    )
+
+                    context: ConversationContext
+                    if self.use_harmony:
+                        if request.stream:
+                            context = StreamingHarmonyContext(messages, tool_sessions)
+                        else:
+                            context = HarmonyContext(messages, tool_sessions)
+                    else:
+                        context = SimpleContext()
+
+                    # Create GenerateReqInput for SGLang
+                    adapted_request = GenerateReqInput(
+                        input_ids=engine_prompt,
+                        sampling_params=sampling_params,
+                        stream=request.stream,
+                        rid=request.request_id,
+                        background=request.background,
+                    )
+
+                    generator = self._generate_with_builtin_tools(
+                        request.request_id,
+                        request_prompts[i],
+                        adapted_request,
+                        sampling_params,
+                        context,
+                        raw_request=raw_request,
+                        priority=request.priority,
+                    )
+                    generators.append(generator)
+            except ValueError as e:
+                return self.create_error_response(str(e))
+
+            assert len(generators) == 1
+            (result_generator,) = generators
+
+            # Store the input messages
+            if request.store:
+                self.msg_store[request.request_id] = messages
+
+            if request.background:
+                created_time = int(time.time())
+                response = ResponsesResponse.from_request(
+                    request,
+                    sampling_params,
+                    model_name=model_name,
+                    created_time=created_time,
+                    output=[],
+                    status="queued",
+                    usage=None,
+                )
+                async with self.response_store_lock:
+                    self.response_store[response.id] = response
+
+                # Run the request in the background
+                task = asyncio.create_task(
+                    self._run_background_request(
+                        request,
+                        sampling_params,
+                        result_generator,
+                        context,
+                        model_name,
+                        tokenizer,
+                        request_metadata,
+                        created_time,
+                    ),
+                    name=f"create_{response.id}",
+                )
+
+                # For cleanup
+                self.background_tasks[response.id] = task
+                task.add_done_callback(
+                    lambda _: self.background_tasks.pop(response.id, None)
+                )
+                return response
+
+            if request.stream:
+                return self.responses_stream_generator(
+                    request,
+                    sampling_params,
+                    result_generator,
+                    context,
+                    model_name,
+                    tokenizer,
+                    request_metadata,
+                )
+            try:
+                result: Union[ORJSONResponse, ResponsesResponse] = (
+                    await self.responses_full_generator(
+                        request,
+                        sampling_params,
+                        result_generator,
+                        context,
+                        model_name,
+                        tokenizer,
+                        request_metadata,
+                    )
+                )
+                return result
+            except Exception as e:
+                return self.create_error_response(str(e))
+        return self.create_error_response("Unknown error")
+
+    async def _make_request(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+        tokenizer: Any,
+    ):
+        # Construct the input messages
+        messages = self._construct_input_messages(request, prev_response)
+
+        # Follow SGLang's pattern: create a ChatCompletionRequest and process messages
+        try:
+            # Convert ResponsesRequest to ChatCompletionRequest for processing
+            chat_request = ChatCompletionRequest(
+                model=request.model,
+                messages=messages,
+                stream=request.stream,
+            )
+
+            # Follow SGLang's _process_messages pattern
+            is_multimodal = self.tokenizer_manager.model_config.is_multimodal
+            processed_messages = self._process_messages(chat_request, is_multimodal)
+
+            # Extract the results
+            if is_multimodal:
+                request_prompts = [processed_messages.prompt]
+                engine_prompts = [processed_messages.prompt]
+            else:
+                request_prompts = [processed_messages.prompt_ids]
+                engine_prompts = [processed_messages.prompt_ids]
+
+        except Exception as e:
+            logger.warning(f"Chat processing failed, using fallback: {e}")
+            # Fallback to simple encoding
+            prompt_text = ""
+            for msg in messages:
+                role = msg.get("role", "user")
+                content = msg.get("content", "")
+                prompt_text += f"{role}: {content}\n"
+            prompt_ids = tokenizer.encode(prompt_text)
+            request_prompts = [prompt_ids]
+            engine_prompts = [prompt_ids]
+
+        return messages, request_prompts, engine_prompts
+
+    def _make_request_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ):
+        if request.tool_choice != "auto":
+            raise NotImplementedError(
+                "Only 'auto' tool_choice is supported in " "response API"
+            )
+        messages = self._construct_input_messages_with_harmony(request, prev_response)
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = prompt_token_ids
+        return messages, [prompt_token_ids], [engine_prompt]
+
+    async def responses_full_generator(
+        self,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        result_generator: AsyncIterator[Any],
+        context: ConversationContext,
+        model_name: str,
+        tokenizer: Any,
+        request_metadata: RequestResponseMetadata,
+        created_time: Optional[int] = None,
+    ) -> Union[ResponsesResponse, ORJSONResponse]:
+        if created_time is None:
+            created_time = int(time.time())
+
+        try:
+            async for _ in result_generator:
+                pass
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if self.use_harmony:
+            assert isinstance(context, HarmonyContext)
+            output = self._make_response_output_items_with_harmony(context)
+            # TODO: these are all 0 for now!
+            num_prompt_tokens = context.num_prompt_tokens
+            num_generated_tokens = context.num_output_tokens
+            num_cached_tokens = context.num_cached_tokens
+            num_reasoning_tokens = context.num_reasoning_tokens
+        else:
+            assert isinstance(context, SimpleContext)
+            final_res = context.last_output
+            assert final_res is not None
+
+            output = self._make_response_output_items(
+                request, final_res["text"], tokenizer
+            )
+
+            # Calculate usage from actual output
+            if hasattr(final_res, "meta_info"):
+                num_prompt_tokens = final_res.meta_info.get("prompt_tokens", 0)
+                num_generated_tokens = final_res.meta_info.get("completion_tokens", 0)
+                num_cached_tokens = final_res.meta_info.get("cached_tokens", 0)
+            elif hasattr(final_res, "prompt_token_ids") and hasattr(
+                final_res, "outputs"
+            ):
+                # Fallback calculation if meta_info not available
+                num_prompt_tokens = (
+                    len(final_res.prompt_token_ids) if final_res.prompt_token_ids else 0
+                )
+                num_generated_tokens = (
+                    len(final_res.outputs[0].token_ids)
+                    if final_res.outputs and final_res.outputs[0].token_ids
+                    else 0
+                )
+                num_cached_tokens = getattr(final_res, "num_cached_tokens", 0)
+                num_reasoning_tokens = 0
+            else:
+                # Final fallback
+                num_prompt_tokens = 0
+                num_generated_tokens = 0
+                num_cached_tokens = 0
+                num_reasoning_tokens = 0
+
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+            reasoning_tokens=num_reasoning_tokens,
+        )
+        if self.enable_prompt_tokens_details and num_cached_tokens:
+            usage.prompt_tokens_details = PromptTokenUsageInfo(
+                cached_tokens=num_cached_tokens
+            )
+        request_metadata.final_usage_info = usage
+
+        response = ResponsesResponse.from_request(
+            request,
+            sampling_params,
+            model_name=model_name,
+            created_time=created_time,
+            output=output,
+            status="completed",
+            usage=usage,
+        )
+
+        if request.store:
+            async with self.response_store_lock:
+                stored_response = self.response_store.get(response.id)
+                # If the response is already cancelled, don't update it
+                if stored_response is None or stored_response.status != "cancelled":
+                    self.response_store[response.id] = response
+
+        return response
+
+    def _make_response_output_items(
+        self,
+        request: ResponsesRequest,
+        final_output: Any,
+        tokenizer: Any,
+    ):
+        # Handle reasoning parsing if enabled
+        if self.reasoning_parser:
+            # Use standard reasoning parser (openai maps to T4Detector internally)
+            reasoning_parser = ReasoningParser(
+                model_type=self.reasoning_parser, stream_reasoning=False
+            )
+            reasoning_content, content = reasoning_parser.parse_non_stream(final_output)
+        else:
+            reasoning_content = None
+            content = final_output
+
+        output_items = []
+        if reasoning_content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                type="reasoning",
+                summary=[],
+                content=[
+                    ResponseReasoningTextContent(
+                        type="reasoning_text", text=reasoning_content
+                    ),
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+        if content:
+            output_text = ResponseOutputText(
+                text=content,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            message = ResponseOutputMessage(
+                id=f"msg_{random_uuid()}",
+                content=[output_text],
+                role="assistant",
+                status="completed",
+                type="message",
+            )
+            output_items.append(message)
+        return output_items
+
+    def _make_response_output_items_with_harmony(
+        self,
+        context: HarmonyContext,
+    ):
+        output_items = []
+        num_init_messages = context.num_init_messages
+        for msg in context.messages[num_init_messages:]:
+            output_items.extend(parse_output_message(msg))
+        # Handle the generation stopped in the middle (if any).
+        last_items = parse_remaining_state(context.parser)
+        if last_items:
+            output_items.extend(last_items)
+        return output_items
+
+    def _construct_input_messages(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse] = None,
+    ) -> list[ChatCompletionMessageParam]:
+        messages: list[ChatCompletionMessageParam] = []
+        if request.instructions:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": request.instructions,
+                }
+            )
+
+        # Prepend the conversation history
+        if prev_response is not None:
+            # Add the previous messages
+            prev_msg = self.msg_store[prev_response.id]
+            messages.extend(prev_msg)
+
+            # Add the previous output
+            for output_item in prev_response.output:
+                # NOTE: We skip the reasoning output of the previous response
+                if isinstance(output_item, ResponseReasoningItem):
+                    continue
+                for content in output_item.content:
+                    messages.append(
+                        {
+                            "role": "system",
+                            "content": request.instructions,
+                        }
+                    )
+
+        # Append the new input
+        # Responses API supports simple text inputs without chat format
+        if isinstance(request.input, str):
+            messages.append({"role": "user", "content": request.input})
+        else:
+            messages.extend(request.input)  # type: ignore
+        return messages
+
+    def _construct_input_messages_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ) -> list["OpenAIMessage"]:
+        messages: list["OpenAIMessage"] = []
+        if prev_response is None:
+            # New conversation.
+            reasoning_effort = request.reasoning.effort if request.reasoning else None
+            tool_types = [tool.type for tool in request.tools]
+            enable_browser = (
+                "web_search_preview" in tool_types and self.tool_server is not None
+            )
+            enable_code_interpreter = (
+                "code_interpreter" in tool_types and self.tool_server is not None
+            )
+            sys_msg = get_system_message(
+                reasoning_effort=reasoning_effort,
+                browser_description=(
+                    self.tool_server.get_tool_description("browser")
+                    if self.tool_server and enable_browser
+                    else None
+                ),
+                python_description=(
+                    self.tool_server.get_tool_description("python")
+                    if self.tool_server and enable_code_interpreter
+                    else None
+                ),
+            )
+            messages.append(sys_msg)
+            dev_msg = get_developer_message(request.instructions, request.tools)
+            messages.append(dev_msg)
+        else:
+            # Continue the previous conversation.
+            # FIXME: Currently, request params like reasoning and
+            # instructions are ignored.
+            prev_msgs = self.msg_store[prev_response.id]
+            # Remove the previous chain-of-thoughts if there is a new "final"
+            # message.
+            if (
+                len(prev_msgs) > 0
+                and hasattr(prev_msgs[-1], "channel")
+                and prev_msgs[-1].channel == "final"
+            ):  # type: ignore[union-attr]
+                prev_final_msg_idx = -1
+                for i in range(len(prev_msgs) - 2, -1, -1):
+                    if (
+                        hasattr(prev_msgs[i], "channel")
+                        and prev_msgs[i].channel == "final"
+                    ):  # type: ignore[union-attr]
+                        prev_final_msg_idx = i
+                        break
+                recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :]
+                del prev_msgs[prev_final_msg_idx + 1 :]
+                for msg in recent_turn_msgs:
+                    if (
+                        hasattr(msg, "channel") and msg.channel != "analysis"
+                    ):  # type: ignore[union-attr]
+                        prev_msgs.append(msg)
+            messages.extend(prev_msgs)
+        # Append the new input.
+        # Responses API supports simple text inputs without chat format.
+        if isinstance(request.input, str):
+            messages.append(get_user_message(request.input))
+        else:
+            if prev_response is not None:
+                prev_outputs = copy(prev_response.output)
+            else:
+                prev_outputs = []
+            for response_msg in request.input:
+                messages.append(parse_response_input(response_msg, prev_outputs))
+                if isinstance(response_msg, ResponseFunctionToolCall):
+                    prev_outputs.append(response_msg)
+        return messages
+
+    async def _run_background_request(
+        self,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        result_generator: AsyncIterator[Any],
+        context: ConversationContext,
+        model_name: str,
+        tokenizer: Any,
+        request_metadata: RequestResponseMetadata,
+        created_time: Optional[int] = None,
+        *args,
+        **kwargs,
+    ):
+        try:
+            # Update the status to "in_progress"
+            async with self.response_store_lock:
+                stored_response = self.response_store.get(request.request_id)
+                assert stored_response is not None
+                stored_response.status = "in_progress"
+
+            response = await self.responses_full_generator(
+                request,
+                sampling_params,
+                result_generator,
+                context,
+                model_name,
+                tokenizer,
+                request_metadata,
+                created_time,
+                *args,
+                **kwargs,
+            )
+        except Exception as e:
+            logger.exception("Background request failed for %s", request.request_id)
+            response = self.create_error_response(str(e))
+
+        if isinstance(response, ORJSONResponse):
+            # If the request has failed, update the status to "failed"
+            response_id = request.request_id
+            async with self.response_store_lock:
+                stored_response = self.response_store.get(response_id)
+                assert stored_response is not None
+                if stored_response.status not in ("completed", "cancelled"):
+                    stored_response.status = "failed"
+
+    async def retrieve_responses(
+        self,
+        response_id: str,
+    ) -> Union[ResponsesResponse, ORJSONResponse]:
+        if not response_id.startswith("resp_"):
+            return self._make_invalid_id_error(response_id)
+
+        async with self.response_store_lock:
+            response = self.response_store.get(response_id)
+
+        if response is None:
+            return self._make_not_found_error(response_id)
+        return response
+
+    async def cancel_responses(
+        self,
+        response_id: str,
+    ) -> Union[ResponsesResponse, ORJSONResponse]:
+        if not response_id.startswith("resp_"):
+            return self._make_invalid_id_error(response_id)
+
+        async with self.response_store_lock:
+            response = self.response_store.get(response_id)
+            if response is None:
+                return self._make_not_found_error(response_id)
+
+            prev_status = response.status
+            if prev_status not in ("queued", "in_progress"):
+                return self.create_error_response(
+                    err_type="invalid_request_error",
+                    message="Cannot cancel a synchronous response.",
+                )
+
+            # Update the status to "cancelled"
+            response.status = "cancelled"
+
+        # Abort the request
+        if task := self.background_tasks.get(response_id):
+            task.cancel()
+            try:
+                await task
+            except asyncio.CancelledError:
+                logger.exception("Background task for %s was cancelled", response_id)
+        return response
+
+    def _make_invalid_id_error(self, response_id: str):
+        return self.create_error_response(
+            message=(
+                f"Invalid 'response_id': '{response_id}'. "
+                "Expected an ID that begins with 'resp'."
+            ),
+            err_type="invalid_request_error",
+            param="response_id",
+        )
+
+    def _make_not_found_error(self, response_id: str):
+        return self.create_error_response(
+            message=f"Response with id '{response_id}' not found.",
+            err_type="invalid_request_error",
+            status_code=HTTPStatus.NOT_FOUND,
+            param="response_id",
+        )
+
+    async def responses_stream_generator(
+        self,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        result_generator: AsyncIterator[StreamingHarmonyContext],
+        context: StreamingHarmonyContext,
+        model_name: str,
+        tokenizer: Any,
+        request_metadata: RequestResponseMetadata,
+        created_time: Optional[int] = None,
+    ) -> AsyncGenerator[str, None]:
+        # TODO:
+        # 1. Handle disconnect
+
+        created_time = created_time or int(time.time())
+
+        sequence_number = 0
+
+        def _send_event(event):
+            nonlocal sequence_number
+            # Set sequence_number if the event has this attribute
+            if hasattr(event, "sequence_number"):
+                event.sequence_number = sequence_number
+            sequence_number += 1
+            # Get event type from the event's type field if it exists
+            event_type = getattr(event, "type", "unknown")
+            return (
+                f"event: {event_type}\n"
+                f"data: {event.model_dump_json(indent=None)}\n\n"
+            )
+
+        current_content_index = 0
+        current_output_index = 0
+        current_item_id = f"item_{random_uuid()}"
+        sent_output_item_added = False
+
+        initial_response = ResponsesResponse.from_request(
+            request,
+            sampling_params,
+            model_name=model_name,
+            created_time=created_time,
+            output=[],
+            status="in_progress",
+            usage=None,
+        ).model_dump()
+        yield _send_event(
+            openai_responses_types.ResponseCreatedEvent(
+                type="response.created",
+                sequence_number=-1,
+                response=initial_response,
+            )
+        )
+        yield _send_event(
+            openai_responses_types.ResponseInProgressEvent(
+                type="response.in_progress",
+                sequence_number=-1,
+                response=initial_response,
+            )
+        )
+
+        async for ctx in result_generator:
+
+            if ctx.is_expecting_start():
+                current_output_index += 1
+                sent_output_item_added = False
+
+                if len(ctx.parser.messages) > 0:
+                    previous_item = ctx.parser.messages[-1]
+                    if previous_item.recipient is not None:
+                        # Deal with tool call here
+                        pass
+                    elif previous_item.channel == "analysis":
+                        reasoning_item = ResponseReasoningItem(
+                            id=f"rs_{random_uuid()}",
+                            type="reasoning",
+                            summary=[],
+                            content=[
+                                ResponseReasoningTextContent(
+                                    text=previous_item.content[0].text,
+                                    type="reasoning_text",
+                                ),
+                            ],
+                            status="completed",
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseReasoningTextDoneEvent(
+                                type="response.reasoning_text.done",
+                                item_id=current_item_id,
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                content_index=current_content_index,
+                                text=previous_item.content[0].text,
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemDoneEvent(
+                                type="response.output_item.done",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=reasoning_item,
+                            )
+                        )
+                    elif previous_item.channel == "final":
+                        text_content = openai_responses_types.ResponseOutputText(
+                            type="output_text",
+                            text=previous_item.content[0].text,
+                            annotations=[],
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseTextDoneEvent(
+                                type="response.output_text.done",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                content_index=current_content_index,
+                                text=previous_item.content[0].text,
+                                logprobs=[],
+                                item_id=current_item_id,
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseContentPartDoneEvent(
+                                type="response.content_part.done",
+                                sequence_number=-1,
+                                item_id=current_item_id,
+                                output_index=current_output_index,
+                                content_index=current_content_index,
+                                part=text_content,
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemDoneEvent(
+                                type="response.output_item.done",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.ResponseOutputMessage(
+                                    id=current_item_id,
+                                    type="message",
+                                    role="assistant",
+                                    content=[text_content],
+                                    status="completed",
+                                ),
+                            )
+                        )
+
+            if ctx.parser.last_content_delta:
+                if (
+                    ctx.parser.current_channel == "final"
+                    and ctx.parser.current_recipient is None
+                ):
+                    if not sent_output_item_added:
+                        sent_output_item_added = True
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemAddedEvent(
+                                type="response.output_item.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.ResponseOutputMessage(
+                                    id=current_item_id,
+                                    type="message",
+                                    role="assistant",
+                                    content=[],
+                                    status="in_progress",
+                                ),
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseContentPartAddedEvent(
+                                type="response.content_part.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item_id=current_item_id,
+                                content_index=current_content_index,
+                                part=openai_responses_types.ResponseOutputText(
+                                    type="output_text",
+                                    text="",
+                                    annotations=[],
+                                    logprobs=None,
+                                ),
+                            )
+                        )
+                    yield _send_event(
+                        openai_responses_types.ResponseTextDeltaEvent(
+                            type="response.output_text.delta",
+                            sequence_number=-1,
+                            content_index=current_content_index,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            delta=ctx.parser.last_content_delta,
+                            # TODO, use logprobs from ctx.last_request_output
+                            logprobs=[],
+                        )
+                    )
+                elif (
+                    ctx.parser.current_channel == "analysis"
+                    and ctx.parser.current_recipient is None
+                ):
+                    if not sent_output_item_added:
+                        sent_output_item_added = True
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemAddedEvent(
+                                type="response.output_item.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.ResponseReasoningItem(
+                                    type="reasoning",
+                                    id=current_item_id,
+                                    summary=[],
+                                    status="in_progress",
+                                ),
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseContentPartAddedEvent(
+                                type="response.content_part.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item_id=current_item_id,
+                                content_index=current_content_index,
+                                # TODO: migrate this to
+                                # ResponseReasoningTextContent for now
+                                part=openai_responses_types.ResponseOutputText(
+                                    type="output_text",
+                                    text="",
+                                    annotations=[],
+                                    logprobs=None,
+                                ),
+                            )
+                        )
+                    # TODO: migrate to OpenAI types once updated.
+                    yield _send_event(
+                        openai_responses_types.ResponseReasoningTextDeltaEvent(
+                            type="response.reasoning_text.delta",
+                            item_id=current_item_id,
+                            output_index=current_output_index,
+                            content_index=current_content_index,
+                            delta=ctx.parser.last_content_delta,
+                            sequence_number=-1,
+                        )
+                    )
+
+            if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0:
+                previous_item = ctx.parser.messages[-1]
+                if (
+                    self.supports_browsing
+                    and previous_item.recipient is not None
+                    and previous_item.recipient.startswith("browser.")
+                ):
+                    function_name = previous_item.recipient[len("browser.") :]
+                    action = None
+                    parsed_args = json.loads(previous_item.content[0].text)
+                    if function_name == "search":
+                        action = openai_responses_types.response_function_web_search.ActionSearch(
+                            type="search",
+                            query=parsed_args["query"],
+                        )
+                    elif function_name == "open":
+                        action = openai_responses_types.response_function_web_search.ActionOpenPage(
+                            type="open_page",
+                            # TODO: translate to url
+                            url=f"cursor:{parsed_args.get('cursor', '')}",
+                        )
+                    elif function_name == "find":
+                        action = openai_responses_types.response_function_web_search.ActionFind(
+                            type="find",
+                            pattern=parsed_args["pattern"],
+                            # TODO: translate to url
+                            url=f"cursor:{parsed_args.get('cursor', '')}",
+                        )
+                    else:
+                        raise ValueError(f"Unknown function name: {function_name}")
+
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemAddedEvent(
+                            type="response.output_item.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.response_function_web_search.ResponseFunctionWebSearch(
+                                # TODO: generate a unique id for web search call
+                                type="web_search_call",
+                                id=current_item_id,
+                                action=action,
+                                status="in_progress",
+                            ),
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseWebSearchCallInProgressEvent(
+                            type="response.web_search_call.in_progress",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseWebSearchCallSearchingEvent(
+                            type="response.web_search_call.searching",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+
+                    # enqueue
+                    yield _send_event(
+                        openai_responses_types.ResponseWebSearchCallCompletedEvent(
+                            type="response.web_search_call.completed",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemDoneEvent(
+                            type="response.output_item.done",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.ResponseFunctionWebSearch(
+                                type="web_search_call",
+                                id=current_item_id,
+                                action=action,
+                                status="completed",
+                            ),
+                        )
+                    )
+
+                if (
+                    self.supports_code_interpreter
+                    and previous_item.recipient is not None
+                    and previous_item.recipient.startswith("python")
+                ):
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemAddedEvent(
+                            type="response.output_item.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.ResponseCodeInterpreterToolCallParam(
+                                type="code_interpreter_call",
+                                id=current_item_id,
+                                code="",
+                                container_id="auto",
+                                outputs=[],
+                                status="in_progress",
+                            ),
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallInProgressEvent(
+                            type="response.code_interpreter_call.in_progress",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    # TODO: do we need to add delta event here?
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallCodeDoneEvent(
+                            type="response.code_interpreter_call_code.done",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            code=previous_item.content[0].text,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallInterpretingEvent(
+                            type="response.code_interpreter_call.interpreting",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallCompletedEvent(
+                            type="response.code_interpreter_call.completed",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemDoneEvent(
+                            type="response.output_item.done",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.ResponseCodeInterpreterToolCallParam(
+                                type="code_interpreter_call",
+                                id=current_item_id,
+                                code=previous_item.content[0].text,
+                                container_id="auto",
+                                # TODO: add outputs here
+                                outputs=[],
+                                status="completed",
+                            ),
+                        )
+                    )
+
+        async def empty_async_generator():
+            if False:
+                yield
+
+        final_response = await self.responses_full_generator(
+            request,
+            sampling_params,
+            empty_async_generator(),
+            context,
+            model_name,
+            tokenizer,
+            request_metadata,
+            created_time=created_time,
+        )
+        # Convert final_response to the format expected by ResponseCompletedEvent
+        response_dict = final_response.model_dump()
+
+        # Convert UsageInfo to ResponseUsage format
+        if response_dict.get("usage"):
+            usage_info = response_dict["usage"]
+            response_dict["usage"] = {
+                "input_tokens": usage_info.get("prompt_tokens", 0),
+                "input_tokens_details": {
+                    "cached_tokens": usage_info.get("cached_tokens", 0)
+                },
+                "output_tokens": usage_info.get("completion_tokens", 0),
+                "output_tokens_details": {
+                    "reasoning_tokens": usage_info.get("reasoning_tokens", 0)
+                },
+                "total_tokens": usage_info.get("total_tokens", 0),
+            }
+
+        yield _send_event(
+            openai_responses_types.ResponseCompletedEvent(
+                type="response.completed",
+                sequence_number=-1,
+                response=response_dict,
+            )
+        )
+
+    async def _generate_with_builtin_tools(
+        self,
+        request_id: str,
+        request_prompt: Any,
+        adapted_request: GenerateReqInput,
+        sampling_params: Any,
+        context: ConversationContext,
+        raw_request: Optional[Request] = None,
+        priority: Optional[int] = None,
+        **kwargs,
+    ) -> AsyncGenerator[Any, None]:
+        """Generate with builtin tool support for harmony-based models."""
+        orig_priority = priority or 0
+
+        while True:
+            # Generate using SGLang's tokenizer manager
+            generator = self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            )
+
+            async for res in generator:
+                context.append_output(res)
+                # NOTE(woosuk): The stop condition is handled by the engine.
+                yield context
+
+            if not context.need_builtin_tool_call():
+                # The model did not ask for a tool call, so we're done.
+                break
+
+            # Call the tool and update the context with the result.
+            tool_output = await context.call_tool()
+            context.append_output(tool_output)
+
+            # Prepare for the next generation turn
+            # Render the updated conversation for the next completion
+            prompt_token_ids = context.render_for_completion()
+
+            # Update the adapted request with new prompt
+            adapted_request = GenerateReqInput(
+                input_ids=prompt_token_ids,
+                sampling_params=sampling_params,
+                stream=adapted_request.stream,
+                rid=request_id,
+                return_logprob=adapted_request.return_logprob,
+                logprob_start_len=adapted_request.logprob_start_len,
+                top_logprobs_num=adapted_request.top_logprobs_num,
+                return_text_in_logprobs=adapted_request.return_text_in_logprobs,
+                return_hidden_states=adapted_request.return_hidden_states,
+                background=adapted_request.background,
+            )
+
+            # Update sampling params with reduced max_tokens
+            if hasattr(sampling_params, "max_new_tokens") or isinstance(
+                sampling_params, dict
+            ):
+                context_len = getattr(
+                    self.tokenizer_manager.model_config, "context_len", 4096
+                )
+                remaining_tokens = context_len - len(prompt_token_ids) - 1
+
+                if isinstance(sampling_params, dict):
+                    sampling_params["max_new_tokens"] = max(remaining_tokens, 1)
+                else:
+                    sampling_params.max_new_tokens = max(remaining_tokens, 1)
+
+            # Slightly reduce priority for subsequent tool calls
+            priority = orig_priority - 1
diff --git a/python/sglang/srt/entrypoints/openai/serving_score.py b/python/sglang/srt/entrypoints/openai/serving_score.py
new file mode 100644
index 000000000..fc8ce5dca
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/serving_score.py
@@ -0,0 +1,61 @@
+import logging
+from typing import Union
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ErrorResponse,
+    ScoringRequest,
+    ScoringResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingScore(OpenAIServingBase):
+    """Handler for /v1/score requests"""
+
+    # NOTE: /v1/rerank is not an official OpenAI endpoint. This module may be moved
+    # to another module in the future.
+
+    def _request_id_prefix(self) -> str:
+        return "score-"
+
+    def _convert_to_internal_request(
+        self,
+        request: ScoringRequest,
+    ) -> tuple[ScoringRequest, ScoringRequest]:
+        """Convert OpenAI scoring request to internal format"""
+        # For scoring, we pass the request directly as the tokenizer_manager
+        # has a specialized score_request method that doesn't use GenerateReqInput
+
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: ScoringRequest,
+        request: ScoringRequest,
+        raw_request: Request,
+    ) -> Union[ScoringResponse, ErrorResponse]:
+        """Handle the scoring request"""
+        try:
+            # Use tokenizer_manager's score_request method directly
+            scores = await self.tokenizer_manager.score_request(
+                query=request.query,
+                items=request.items,
+                label_token_ids=request.label_token_ids,
+                apply_softmax=request.apply_softmax,
+                item_first=request.item_first,
+                request=raw_request,
+            )
+
+            # Create response with just the scores, without usage info
+            response = ScoringResponse(
+                scores=scores,
+                model=request.model,
+            )
+            return response
+
+        except ValueError as e:
+            return self.create_error_response(str(e))
diff --git a/python/sglang/srt/entrypoints/openai/tool_server.py b/python/sglang/srt/entrypoints/openai/tool_server.py
new file mode 100644
index 000000000..269d9e99e
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/tool_server.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+from abc import ABC, abstractmethod
+from contextlib import AbstractAsyncContextManager, asynccontextmanager
+from typing import Any
+
+try:
+    from mcp import ClientSession
+    from mcp.client.sse import sse_client
+    from mcp.types import ListToolsResult
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e
+
+from openai_harmony import ToolDescription, ToolNamespaceConfig
+
+logger = logging.getLogger(__name__)
+
+
+async def list_server_and_tools(server_url: str):
+
+    async with sse_client(url=server_url) as streams, ClientSession(
+        *streams
+    ) as session:
+        initialize_response = await session.initialize()
+        list_tools_response = await session.list_tools()
+        return initialize_response, list_tools_response
+
+
+def trim_schema(schema: dict) -> dict:
+    # Turn JSON Schema from MCP generated into Harmony's variant.
+    if "title" in schema:
+        del schema["title"]
+    if "default" in schema and schema["default"] is None:
+        del schema["default"]
+    if "anyOf" in schema:
+        # Turn "anyOf": [{"type": "type-1"}, {"type": "type-2"}]
+        # into "type": ["type-1", "type-2"]
+        # if there's more than 1 types, also remove "null" type as Harmony will
+        # just ignore it
+        types = [
+            type_dict["type"]
+            for type_dict in schema["anyOf"]
+            if type_dict["type"] != "null"
+        ]
+        schema["type"] = types
+        del schema["anyOf"]
+    if "properties" in schema:
+        schema["properties"] = {
+            k: trim_schema(v) for k, v in schema["properties"].items()
+        }
+    return schema
+
+
+def post_process_tools_description(
+    list_tools_result: "ListToolsResult",
+) -> "ListToolsResult":
+    # Adapt the MCP tool result for Harmony
+    for tool in list_tools_result.tools:
+        tool.inputSchema = trim_schema(tool.inputSchema)
+
+    # Some tools schema don't need to be part of the prompt (e.g. simple text
+    # in text out for Python)
+    list_tools_result.tools = [
+        tool
+        for tool in list_tools_result.tools
+        if getattr(tool.annotations, "include_in_prompt", True)
+    ]
+
+    return list_tools_result
+
+
+class ToolServer(ABC):
+
+    @abstractmethod
+    def has_tool(self, tool_name: str):
+        pass
+
+    @abstractmethod
+    def get_tool_description(self, tool_name: str):
+        pass
+
+    @abstractmethod
+    def get_tool_session(self, tool_name: str) -> AbstractAsyncContextManager[Any]: ...
+
+
+class MCPToolServer(ToolServer):
+
+    def __init__(self):
+        self.harmony_tool_descriptions = {}
+
+    async def add_tool_server(self, server_url: str):
+        tool_urls = server_url.split(",")
+        self.harmony_tool_descriptions = {}
+        self.urls: dict[str, str] = {}
+        for url in tool_urls:
+            url = f"http://{url}/sse"
+            initialize_response, list_tools_response = await list_server_and_tools(url)
+
+            list_tools_response = post_process_tools_description(list_tools_response)
+
+            tool_from_mcp = ToolNamespaceConfig(
+                name=initialize_response.serverInfo.name,
+                description=initialize_response.instructions,
+                tools=[
+                    ToolDescription.new(
+                        name=tool.name,
+                        description=tool.description,
+                        parameters=tool.inputSchema,
+                    )
+                    for tool in list_tools_response.tools
+                ],
+            )
+            self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp
+            if tool_from_mcp.name not in self.urls:
+                self.urls[tool_from_mcp.name] = url
+            else:
+                logger.warning(
+                    "Tool %s already exists. Ignoring duplicate tool server %s",
+                    tool_from_mcp.name,
+                    url,
+                )
+
+    def has_tool(self, tool_name: str):
+        return tool_name in self.harmony_tool_descriptions
+
+    def get_tool_description(self, tool_name: str):
+        return self.harmony_tool_descriptions.get(tool_name)
+
+    @asynccontextmanager
+    async def get_tool_session(self, tool_name: str):
+        url = self.urls.get(tool_name)
+        if url:
+            async with sse_client(url=url) as streams, ClientSession(
+                *streams
+            ) as session:
+                await session.initialize()
+                yield session
+        else:
+            logger.warning("Tool %s not found", tool_name)
+
+
+class DemoToolServer(ToolServer):
+
+    def __init__(self):
+        from sglang.srt.entrypoints.tool import (
+            HarmonyBrowserTool,
+            HarmonyPythonTool,
+            Tool,
+        )
+
+        self.tools: dict[str, Tool] = {}
+        browser_tool = HarmonyBrowserTool()
+        if browser_tool.enabled:
+            self.tools["browser"] = browser_tool
+        python_tool = HarmonyPythonTool()
+        if python_tool.enabled:
+            self.tools["python"] = python_tool
+
+    def has_tool(self, tool_name: str):
+        return tool_name in self.tools
+
+    def get_tool_description(self, tool_name: str):
+        if tool_name not in self.tools:
+            return None
+        if tool_name == "browser":
+            return ToolNamespaceConfig.browser()
+        elif tool_name == "python":
+            return ToolNamespaceConfig.python()
+        else:
+            raise ValueError(f"Unknown tool {tool_name}")
+
+    @asynccontextmanager
+    async def get_tool_session(self, tool_name: str):
+        yield self.tools[tool_name]
diff --git a/python/sglang/srt/entrypoints/openai/usage_processor.py b/python/sglang/srt/entrypoints/openai/usage_processor.py
new file mode 100644
index 000000000..cdd14d74e
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Mapping, Optional, final
+
+from sglang.srt.entrypoints.openai.protocol import UsageInfo
+
+
+@final
+class UsageProcessor:
+    """Stateless helpers that turn raw token counts into a UsageInfo."""
+
+    @staticmethod
+    def _details_if_cached(count: int) -> Optional[Dict[str, int]]:
+        """Return {"cached_tokens": N} only when N > 0 (keeps JSON slim)."""
+        return {"cached_tokens": count} if count > 0 else None
+
+    @staticmethod
+    def calculate_response_usage(
+        responses: List[Dict[str, Any]],
+        n_choices: int = 1,
+        enable_cache_report: bool = False,
+    ) -> UsageInfo:
+        completion_tokens = sum(r["meta_info"]["completion_tokens"] for r in responses)
+
+        prompt_tokens = sum(
+            responses[i]["meta_info"]["prompt_tokens"]
+            for i in range(0, len(responses), n_choices)
+        )
+
+        cached_details = None
+        if enable_cache_report:
+            cached_total = sum(
+                r["meta_info"].get("cached_tokens", 0) for r in responses
+            )
+            cached_details = UsageProcessor._details_if_cached(cached_total)
+
+        return UsageProcessor.calculate_token_usage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            cached_tokens=cached_details,
+        )
+
+    @staticmethod
+    def calculate_streaming_usage(
+        prompt_tokens: Mapping[int, int],
+        completion_tokens: Mapping[int, int],
+        cached_tokens: Mapping[int, int],
+        n_choices: int,
+        enable_cache_report: bool = False,
+    ) -> UsageInfo:
+        # index % n_choices == 0 marks the first choice of a prompt
+        total_prompt_tokens = sum(
+            tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0
+        )
+        total_completion_tokens = sum(completion_tokens.values())
+
+        cached_details = (
+            UsageProcessor._details_if_cached(sum(cached_tokens.values()))
+            if enable_cache_report
+            else None
+        )
+
+        return UsageProcessor.calculate_token_usage(
+            prompt_tokens=total_prompt_tokens,
+            completion_tokens=total_completion_tokens,
+            cached_tokens=cached_details,
+        )
+
+    @staticmethod
+    def calculate_token_usage(
+        prompt_tokens: int,
+        completion_tokens: int,
+        cached_tokens: Optional[Dict[str, int]] = None,
+    ) -> UsageInfo:
+        """Calculate token usage information"""
+        return UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=cached_tokens,
+        )
diff --git a/python/sglang/srt/entrypoints/openai/utils.py b/python/sglang/srt/entrypoints/openai/utils.py
new file mode 100644
index 000000000..94ac5458d
--- /dev/null
+++ b/python/sglang/srt/entrypoints/openai/utils.py
@@ -0,0 +1,72 @@
+import logging
+from typing import Any, Dict, List, Optional, Union
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    CompletionRequest,
+    LogProbs,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def to_openai_style_logprobs(
+    input_token_logprobs=None,
+    output_token_logprobs=None,
+    input_top_logprobs=None,
+    output_top_logprobs=None,
+):
+    ret_logprobs = LogProbs()
+
+    def append_token_logprobs(token_logprobs):
+        for logprob, _, token_text in token_logprobs:
+            ret_logprobs.tokens.append(token_text)
+            ret_logprobs.token_logprobs.append(logprob)
+
+            # Not supported yet
+            ret_logprobs.text_offset.append(-1)
+
+    def append_top_logprobs(top_logprobs):
+        for tokens in top_logprobs:
+            if tokens is not None:
+                ret_logprobs.top_logprobs.append(
+                    {token[2]: token[0] for token in tokens}
+                )
+            else:
+                ret_logprobs.top_logprobs.append(None)
+
+    if input_token_logprobs is not None:
+        append_token_logprobs(input_token_logprobs)
+    if output_token_logprobs is not None:
+        append_token_logprobs(output_token_logprobs)
+    if input_top_logprobs is not None:
+        append_top_logprobs(input_top_logprobs)
+    if output_top_logprobs is not None:
+        append_top_logprobs(output_top_logprobs)
+
+    return ret_logprobs
+
+
+def process_hidden_states_from_ret(
+    ret_item: Dict[str, Any],
+    request: Union[
+        ChatCompletionRequest,
+        CompletionRequest,
+    ],
+) -> Optional[List]:
+    """Process hidden states from a ret item in non-streaming response.
+
+    Args:
+        ret_item: Response item containing meta_info
+        request: The original request object
+
+    Returns:
+        Processed hidden states for the last token, or None
+    """
+    if not request.return_hidden_states:
+        return None
+
+    hidden_states = ret_item["meta_info"].get("hidden_states", None)
+    if hidden_states is not None:
+        hidden_states = hidden_states[-1] if len(hidden_states) > 1 else []
+    return hidden_states
diff --git a/python/sglang/srt/entrypoints/tool.py b/python/sglang/srt/entrypoints/tool.py
new file mode 100644
index 000000000..45b87ac3a
--- /dev/null
+++ b/python/sglang/srt/entrypoints/tool.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+
+from sglang.srt.utils import print_info_once, print_warning_once
+
+if TYPE_CHECKING:
+    # Avoid circular import.
+    from sglang.srt.entrypoints.context import ConversationContext
+
+logger = logging.getLogger(__name__)
+
+
+class Tool(ABC):
+
+    @abstractmethod
+    async def get_result(self, context: "ConversationContext") -> Any:
+        pass
+
+
+class HarmonyBrowserTool(Tool):
+
+    def __init__(self):
+        self.enabled = True
+        exa_api_key = os.getenv("EXA_API_KEY")
+        if not exa_api_key:
+            self.enabled = False
+            print_warning_once("EXA_API_KEY is not set, browsing is disabled")
+            return
+
+        try:
+            from gpt_oss.tools.simple_browser import SimpleBrowserTool
+            from gpt_oss.tools.simple_browser.backend import ExaBackend
+        except ImportError:
+            self.enabled = False
+            print_warning_once("gpt_oss is not installed, browsing is disabled")
+            return
+
+        browser_backend = ExaBackend(source="web", api_key=exa_api_key)
+        self.browser_tool = SimpleBrowserTool(backend=browser_backend)
+        print_info_once("Browser tool initialized")
+
+    async def get_result(self, context: "ConversationContext") -> Any:
+        from sglang.srt.entrypoints.context import HarmonyContext
+
+        assert isinstance(context, HarmonyContext)
+        last_msg = context.messages[-1]
+        tool_output_msgs = []
+        async for msg in self.browser_tool.process(last_msg):
+            tool_output_msgs.append(msg)
+        return tool_output_msgs
+
+    @property
+    def tool_config(self) -> Any:
+        return self.browser_tool.tool_config
+
+
+class HarmonyPythonTool(Tool):
+
+    def __init__(self):
+        self.enabled = True
+
+        try:
+            from gpt_oss.tools.python_docker.docker_tool import PythonTool
+        except ImportError:
+            self.enabled = False
+            print_warning_once("gpt_oss is not installed, code interpreter is disabled")
+            return
+
+        self.python_tool = PythonTool()
+        print_info_once("Code interpreter tool initialized")
+
+    async def get_result(self, context: "ConversationContext") -> Any:
+        from sglang.srt.entrypoints.context import HarmonyContext
+
+        assert isinstance(context, HarmonyContext)
+        last_msg = context.messages[-1]
+        tool_output_msgs = []
+        async for msg in self.python_tool.process(last_msg):
+            tool_output_msgs.append(msg)
+        return tool_output_msgs
+
+    @property
+    def tool_config(self) -> Any:
+        return self.python_tool.tool_config
diff --git a/python/sglang/srt/eplb/__init__.py b/python/sglang/srt/eplb/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/srt/eplb/eplb_algorithms/__init__.py b/python/sglang/srt/eplb/eplb_algorithms/__init__.py
new file mode 100644
index 000000000..e2a267810
--- /dev/null
+++ b/python/sglang/srt/eplb/eplb_algorithms/__init__.py
@@ -0,0 +1,63 @@
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+from sglang.srt.eplb.eplb_algorithms import deepseek, deepseek_vec
+
+
+class EplbAlgorithm(Enum):
+    deepseek = auto()
+    deepseek_hierarchical = auto()
+    deepseek_vec = auto()
+    deepseek_vec_hierarchical = auto()
+    # TODO may have more algorithm later
+
+
+def rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: Optional[int],
+    num_nodes: int,
+    algorithm: EplbAlgorithm,
+):
+    if algorithm in [EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical]:
+        return deepseek.rebalance_experts(
+            weight=tokens_per_expert.sum(dim=0),
+            num_replicas=num_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            num_gpus=num_physical_experts // num_local_physical_experts,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
+        )
+
+    if algorithm in [
+        EplbAlgorithm.deepseek_vec,
+        EplbAlgorithm.deepseek_vec_hierarchical,
+    ]:
+        return deepseek_vec.rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
+        )
+
+    raise NotImplementedError
+
+
+def compute_algorithm(
+    raw_algorithm: str,
+    num_groups: Optional[int],
+    num_nodes: int,
+) -> EplbAlgorithm:
+    if raw_algorithm != "auto":
+        return EplbAlgorithm[raw_algorithm]
+
+    # TODO test on real scenarios and know which ones perform better
+    if (num_groups is not None) and (num_groups % num_nodes == 0):
+        return EplbAlgorithm.deepseek_hierarchical
+    else:
+        return EplbAlgorithm.deepseek
diff --git a/python/sglang/srt/eplb/eplb_algorithms/deepseek.py b/python/sglang/srt/eplb/eplb_algorithms/deepseek.py
new file mode 100644
index 000000000..180ccdee4
--- /dev/null
+++ b/python/sglang/srt/eplb/eplb_algorithms/deepseek.py
@@ -0,0 +1,223 @@
+# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
+from typing import Tuple
+
+import torch
+
+from sglang.srt.utils import get_bool_env_var
+
+
+def balanced_packing(
+    weight: torch.Tensor, num_packs: int
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
+    are as balanced as possible.
+
+    Parameters:
+        weight: [X, n], the weight of each item
+        num_packs: number of packs
+
+    Returns:
+        pack_index: [X, n], the pack index of each item
+        rank_in_pack: [X, n], the rank of the item in the pack
+    """
+    num_layers, num_groups = weight.shape
+    assert num_groups % num_packs == 0
+    groups_per_pack = num_groups // num_packs
+
+    if groups_per_pack == 1:
+        pack_index = torch.arange(
+            weight.size(-1), dtype=torch.int64, device=weight.device
+        ).expand(weight.shape)
+        rank_in_pack = torch.zeros_like(weight, dtype=torch.int64)
+        return pack_index, rank_in_pack
+
+    indices = weight.float().sort(-1, descending=True).indices.cpu()
+    pack_index = torch.full_like(weight, fill_value=-1, dtype=torch.int64, device="cpu")
+    rank_in_pack = torch.full_like(pack_index, fill_value=-1)
+    for i in range(num_layers):
+        pack_weights = [0] * num_packs
+        pack_items = [0] * num_packs
+        for group in indices[i]:
+            pack = min(
+                (i for i in range(num_packs) if pack_items[i] < groups_per_pack),
+                key=pack_weights.__getitem__,
+            )
+            assert pack_items[pack] < groups_per_pack
+            pack_index[i, group] = pack
+            rank_in_pack[i, group] = pack_items[pack]
+            pack_weights[pack] += weight[i, group]
+            pack_items[pack] += 1
+    return pack_index, rank_in_pack
+
+
+def replicate_experts(
+    weight: torch.Tensor, num_phy: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
+
+    Parameters:
+        weight: [X, num_log]
+        num_phy: total number of experts after replication
+
+    Returns:
+        phy2log: [X, num_phy], logical expert id of each physical expert
+        rank: [X, num_phy], the replica rank
+        logcnt: [X, num_log], number of replicas for each logical expert
+    """
+    n, num_log = weight.shape
+    num_redundant = num_phy - num_log
+    assert num_redundant >= 0
+    device = weight.device
+    phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
+    rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
+    logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
+    arangen = torch.arange(n, dtype=torch.int64, device=device)
+    for i in range(num_log, num_phy):
+        redundant_indices = (weight / logcnt).max(dim=-1).indices
+        phy2log[:, i] = redundant_indices
+        rank[:, i] = logcnt[arangen, redundant_indices]
+        logcnt[arangen, redundant_indices] += 1
+    return phy2log, rank, logcnt
+
+
+def rebalance_experts_hierarchical(
+    weight: torch.Tensor,
+    num_physical_experts: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+):
+    """
+    Parameters:
+        weight: [num_moe_layers, num_logical_experts]
+        num_physical_experts: number of physical experts after replication
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+    Returns:
+        physical_to_logical_map: [num_moe_layers, num_physical_experts]
+        logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
+        logical_count: [num_moe_layers, num_logical_experts]
+    """
+    num_layers, num_logical_experts = weight.shape
+    assert num_logical_experts % num_groups == 0
+    group_size = num_logical_experts // num_groups
+    assert num_groups % num_nodes == 0
+    groups_per_node = num_groups // num_nodes
+    assert num_gpus % num_nodes == 0
+    assert num_physical_experts % num_gpus == 0
+    phy_experts_per_gpu = num_physical_experts // num_gpus
+
+    def inverse(perm: torch.Tensor) -> torch.Tensor:
+        inv = torch.empty_like(perm)
+        inv.scatter_(
+            1,
+            perm,
+            torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
+                perm.shape
+            ),
+        )
+        return inv
+
+    # Step 1: pack groups to nodes
+    tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+    group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
+    log2mlog = (
+        (
+            (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
+        ).unsqueeze(-1)
+        + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
+    ).flatten(-2)
+    mlog2log = inverse(log2mlog)
+
+    # Step 2: construct redundant experts within nodes
+    # [num_layers * num_nodes, num_logical_experts // num_nodes]
+    tokens_per_mlog = weight.gather(-1, mlog2log).view(
+        -1, num_logical_experts // num_nodes
+    )
+    phy2mlog, phyrank, mlogcnt = replicate_experts(
+        tokens_per_mlog, num_physical_experts // num_nodes
+    )
+
+    # Step 3: pack physical_experts to GPUs
+    # [num_layers * num_nodes, num_physical_experts // num_nodes]
+    tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+    pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
+    phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
+    pphy2phy = inverse(phy2pphy)
+
+    pphy2mlog = phy2mlog.gather(
+        -1, pphy2phy
+    )  # [num_layers * num_nodes, num_log_per_nodes]
+    pphy2mlog = (
+        pphy2mlog.view(num_layers, num_nodes, -1)
+        + torch.arange(
+            0,
+            num_logical_experts,
+            num_logical_experts // num_nodes,
+            device=group_pack_index.device,
+        ).view(1, -1, 1)
+    ).flatten(-2)
+    pphy2log = mlog2log.gather(-1, pphy2mlog)
+    pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
+    logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
+    return pphy2log, pphyrank, logcnt
+
+
+def rebalance_experts(
+    weight: torch.Tensor,
+    num_replicas: int,
+    num_groups: int,
+    num_nodes: int,
+    num_gpus: int,
+    enable_hierarchical: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Entry point for expert-parallelism load balancer.
+
+    Parameters:
+        weight: [layers, num_logical_experts], the load statistics for all logical experts
+        num_replicas: number of physical experts, must be a multiple of `num_gpus`
+        num_groups: number of expert groups
+        num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+        num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+    Returns:
+        physical_to_logical_map: [layers, num_replicas], the expert index of each replica
+        logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
+        expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
+    """
+
+    num_layers, num_logical_experts = weight.shape
+    weight = weight.float().cpu()
+    if enable_hierarchical:
+        # use hierarchical load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, num_groups, num_nodes, num_gpus
+        )
+    else:
+        # use global load-balance policy
+        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+            weight, num_replicas, 1, 1, num_gpus
+        )
+    maxlogcnt = logcnt.max().item()
+    log2phy: torch.Tensor = torch.full(
+        (num_layers, num_logical_experts, maxlogcnt),
+        -1,
+        dtype=torch.int64,
+        device=logcnt.device,
+    )
+    log2phy.view(num_layers, -1).scatter_(
+        -1,
+        phy2log * maxlogcnt + phyrank,
+        torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
+            num_layers, -1
+        ),
+    )
+    return phy2log, log2phy, logcnt
+
+
+__all__ = ["rebalance_experts"]
diff --git a/python/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py b/python/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py
new file mode 100644
index 000000000..cb165448a
--- /dev/null
+++ b/python/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py
@@ -0,0 +1,276 @@
+# This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
+from typing import Optional, Tuple
+
+import torch
+
+
+def pack_groups(tokens_per_group: torch.Tensor, num_nodes: int) -> torch.Tensor:
+    num_layers, num_groups = tokens_per_group.shape
+    assert num_groups % num_nodes == 0
+    groups_per_rank = num_groups // num_nodes
+
+    indices = tokens_per_group.float().sort(-1, descending=True).indices.cpu()
+    ret = torch.full_like(
+        tokens_per_group, fill_value=-1, dtype=torch.int64, device="cpu"
+    )
+    for layer in range(num_layers):
+        node_tokens = [0] * num_nodes
+        node_groups = [0] * num_nodes
+        for group in indices[layer]:
+
+            def key_func(rank: int) -> int:
+                if node_groups[rank] >= groups_per_rank:
+                    return 1, 0
+                else:
+                    return 0, node_tokens[rank]
+
+            rank = min(range(num_nodes), key=key_func)
+            assert node_groups[rank] < groups_per_rank
+            ret[layer, group] = rank * groups_per_rank + node_groups[rank]
+            node_tokens[rank] += tokens_per_group[layer, group]
+            node_groups[rank] += 1
+    return ret
+
+
+def make_redundant_experts_chunkwise(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_physical_experts_per_chunk: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    num_steps, num_moe_layers, num_logical_experts = tokens_per_expert.shape
+    num_redundancy_experts = num_physical_experts - num_logical_experts
+
+    physical_to_logical_map = torch.empty(
+        num_moe_layers,
+        num_physical_experts,
+        dtype=torch.int,
+        device=tokens_per_expert.device,
+    )
+    logical_to_physical_map = torch.full(
+        (num_moe_layers, num_logical_experts, num_redundancy_experts + 1),
+        -1,
+        dtype=torch.int,
+        device=tokens_per_expert.device,
+    )
+    logical_count = torch.ones(
+        num_moe_layers,
+        num_logical_experts,
+        dtype=torch.int,
+        device=tokens_per_expert.device,
+    )
+
+    assert num_physical_experts % num_physical_experts_per_chunk == 0
+    num_chunks = num_physical_experts // num_physical_experts_per_chunk
+    assert num_logical_experts % num_chunks == 0
+    num_logical_experts_per_group = num_logical_experts // num_chunks
+    assert num_redundancy_experts % num_chunks == 0
+    num_redundancy_experts_per_group = num_redundancy_experts // num_chunks
+
+    arange_num_moe_layers_num_groups = torch.arange(
+        num_moe_layers * num_chunks, dtype=torch.int, device=tokens_per_expert.device
+    )
+    arange_num_logical_experts = torch.arange(
+        num_logical_experts, dtype=torch.int, device=tokens_per_expert.device
+    )
+    arange_num_logical_experts_per_group = torch.arange(
+        num_logical_experts_per_group, dtype=torch.int, device=tokens_per_expert.device
+    )
+    arange_num_groups = torch.arange(
+        num_chunks, dtype=torch.int, device=tokens_per_expert.device
+    )
+    physical_to_logical_map.view(
+        num_moe_layers, num_chunks, num_physical_experts_per_chunk
+    )[:, :, :num_logical_experts_per_group] = arange_num_logical_experts.view(
+        num_chunks, num_logical_experts_per_group
+    )
+    logical_to_physical_map[:, :, 0] = (
+        arange_num_logical_experts_per_group.expand(
+            num_chunks, num_logical_experts_per_group
+        )
+        + arange_num_groups[:, None] * num_physical_experts_per_chunk
+    ).view(num_logical_experts)
+
+    tokens_per_expert_all_diff = tokens_per_expert + arange_num_logical_experts * 1e-4
+    for i in range(num_redundancy_experts_per_group):
+        score = (
+            tokens_per_expert_all_diff / logical_count
+        )  # NOTE: Values in score must be different from each other
+        score1 = tokens_per_expert / (logical_count + 1)
+        score = score.view(
+            num_steps, num_moe_layers, num_chunks, num_logical_experts_per_group
+        )
+        score1 = score1.view_as(score)
+        values, indices = score.max(-1, keepdim=True)
+        values = values.expand_as(score).contiguous()
+        score.scatter_(-1, indices, score1.gather(-1, indices))
+        values.scatter_(-1, indices, score.max(-1, keepdim=True).values)
+        redundancy_indices = values.sum(0).argmin(-1)
+        physical_to_logical_map.view(
+            num_moe_layers, num_chunks, num_physical_experts_per_chunk
+        )[:, :, num_logical_experts_per_group + i] = (
+            redundancy_indices + arange_num_groups * num_logical_experts_per_group
+        )
+        redundancy_count = (
+            logical_count.view(
+                num_moe_layers * num_chunks, num_logical_experts_per_group
+            )
+            .gather(-1, redundancy_indices.view(num_moe_layers * num_chunks, 1))
+            .squeeze(1)
+        )
+        physical_redundancy_indices = (
+            (
+                arange_num_groups * num_physical_experts_per_chunk
+                + num_logical_experts_per_group
+                + i
+            )
+            .expand(num_moe_layers, num_chunks)
+            .flatten()
+        )
+        logical_to_physical_map.view(
+            num_moe_layers * num_chunks,
+            num_logical_experts_per_group,
+            num_redundancy_experts + 1,
+        )[
+            arange_num_moe_layers_num_groups,
+            redundancy_indices.view(num_moe_layers * num_chunks),
+            redundancy_count,
+        ] = physical_redundancy_indices
+        logical_count.view(num_moe_layers * num_chunks, num_logical_experts_per_group)[
+            arange_num_moe_layers_num_groups,
+            redundancy_indices.view(num_moe_layers * num_chunks),
+        ] += 1
+
+    if num_local_physical_experts > 1:
+        # Load-balancing between GPUs
+        physical_to_logical_map_int64 = physical_to_logical_map.to(torch.int64)
+        counts = logical_count.gather(-1, physical_to_logical_map_int64)
+        score = tokens_per_expert.sum(0).gather(-1, physical_to_logical_map_int64)
+        score = score / counts
+        score = score.view(num_moe_layers, num_chunks, num_physical_experts_per_chunk)
+        indices = score.argsort(-1, descending=True)
+        indices += torch.arange(
+            0,
+            num_physical_experts,
+            num_physical_experts_per_chunk,
+            dtype=indices.dtype,
+            device=indices.device,
+        )[None, :, None]
+
+        assert num_physical_experts_per_chunk % num_local_physical_experts == 0
+        num_local_groups = num_physical_experts_per_chunk // num_local_physical_experts
+        indices = indices.view(
+            num_moe_layers, num_chunks, num_local_physical_experts, num_local_groups
+        )
+        indices[:, :, 1::2, :] = indices[:, :, 1::2, :].flip(-1)
+        indices = indices.transpose(2, 3)
+        indices = indices.reshape(num_moe_layers, num_physical_experts)
+        physical_to_logical_map = physical_to_logical_map.gather(-1, indices)
+        mask = logical_to_physical_map == -1
+        logical_to_physical_map[mask] = 0
+        logical_to_physical_map = (
+            indices.argsort(-1)
+            .gather(
+                -1, logical_to_physical_map.view(num_moe_layers, -1).to(torch.int64)
+            )
+            .view_as(logical_to_physical_map)
+            .to(torch.int)
+        )
+        logical_to_physical_map[mask] = -1
+
+    return physical_to_logical_map, logical_to_physical_map, logical_count
+
+
+def decode_rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+):
+    return make_redundant_experts_chunkwise(
+        tokens_per_expert,
+        num_physical_experts,
+        num_local_physical_experts,
+        num_physical_experts,
+    )
+
+
+def prefill_rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: int,
+    num_nodes: int,
+):
+    tokens_per_expert = tokens_per_expert.float().cpu()
+
+    num_steps, _, num_logical_experts = tokens_per_expert.shape
+    assert num_logical_experts % num_groups == 0
+    group_size = num_logical_experts // num_groups
+    assert num_groups % num_nodes == 0, f"{num_groups=} {num_nodes=}"
+
+    tokens_per_group = tokens_per_expert.sum(0).unflatten(-1, (num_groups, -1)).sum(-1)
+    group_perm = pack_groups(
+        tokens_per_group, num_nodes
+    )  # [num_moe_layers, num_groups] => [num_moe_layers, num_nodes]
+
+    # log2mlog [layers, #logexp] -> [layers, #logexp]
+    log2mlog = (
+        (group_perm * group_size).unsqueeze(-1)
+        + torch.arange(group_size, dtype=torch.int64, device=group_perm.device)
+    ).flatten(-2)
+
+    # mlog2log [layers, #logexp] -> [layers, #logexp], inverse of log2mlog
+    mlog2log = torch.empty_like(log2mlog)
+    arange = torch.arange(
+        num_logical_experts, dtype=torch.int64, device=mlog2log.device
+    )
+    mlog2log.scatter_(1, log2mlog, arange.expand(log2mlog.size(0), -1))
+
+    # tokens_per_mlog[i][j][k] = tokens_per_expert[i][j][mlog2log[j][k]]
+    tokens_per_mlog = tokens_per_expert.gather(
+        2, mlog2log.unsqueeze(0).expand(num_steps, -1, -1)
+    )
+
+    phy2mlog, mlog2phy, mlog_count = make_redundant_experts_chunkwise(
+        tokens_per_mlog,
+        num_physical_experts,
+        num_local_physical_experts,
+        num_physical_experts // num_nodes,
+    )
+
+    # phy2log[i][j] = mlog2log[i][phy2mlog[i][j]]
+    phy2log = mlog2log.gather(1, phy2mlog.to(torch.int64))
+
+    # mlog2phy: [num_moe_layers, num_logical_experts, ...]
+    # log2phy[i][j][k] = mlog2phy[i][log2mlog[i][j]][k]
+    log2phy = mlog2phy.gather(
+        1, log2mlog.unsqueeze(-1).expand(-1, -1, mlog2phy.size(-1)).to(torch.int64)
+    )
+
+    # log_count[i][j] = mlog_count[i][log2mlog[i][j]]
+    log_count = mlog_count.gather(1, log2mlog)
+    return phy2log, log2phy, log_count
+
+
+def rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: Optional[int],
+    num_nodes: int,
+    enable_hierarchical: bool,
+):
+    if enable_hierarchical:
+        return prefill_rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+        )
+    else:
+        return decode_rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+        )
diff --git a/python/sglang/srt/eplb/eplb_manager.py b/python/sglang/srt/eplb/eplb_manager.py
new file mode 100644
index 000000000..e88a3d28e
--- /dev/null
+++ b/python/sglang/srt/eplb/eplb_manager.py
@@ -0,0 +1,118 @@
+import logging
+import time
+from typing import TYPE_CHECKING, List
+
+import torch.cuda
+
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ExpertLocationMetadata
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+logger = logging.getLogger(__name__)
+
+
+class EPLBManager:
+    def __init__(self, model_runner: "ModelRunner"):
+        super().__init__()
+        self._model_runner = model_runner
+        self._server_args = model_runner.server_args
+        self._rebalance_layers_per_chunk = (
+            self._server_args.eplb_rebalance_layers_per_chunk
+        )
+        self._rebalance_num_iterations = self._server_args.eplb_rebalance_num_iterations
+
+        # Otherwise, the circular buffer will contain stale data. If the case is needed, it can be implemented.
+        assert (
+            self._server_args.eplb_rebalance_num_iterations
+            >= self._server_args.expert_distribution_recorder_buffer_size
+        ), "eplb_rebalance_num_iterations must be greater than expert_distribution_recorder_buffer_size"
+
+        if not get_global_expert_distribution_recorder().recording:
+            get_global_expert_distribution_recorder().start_record()
+
+        logger.info(
+            f"[EPLBManager] system started, will rebalance per {self._rebalance_num_iterations} iterations."
+        )
+
+        self._main_generator = self._entrypoint()
+
+    def on_forward_pass_end(self):
+        next(self._main_generator)
+
+    # can be more complex if needed
+    def _entrypoint(self):
+        while True:
+            for _ in range(self._rebalance_num_iterations):
+                yield
+
+            yield from self.rebalance()
+
+    def rebalance(self):
+        logger.info("[EPLBManager] rebalance start")
+
+        enable_timing = self._rebalance_layers_per_chunk is None
+
+        if enable_timing:
+            torch.get_device_module().synchronize()
+            time_start = time.time()
+
+        dump_record_output = get_global_expert_distribution_recorder().dump_record(
+            output_mode="object"
+        )
+        logical_count = dump_record_output["logical_count"]
+        average_utilization_rate_over_window = dump_record_output[
+            "average_utilization_rate_over_window"
+        ]
+
+        # Check whether rebalancing is needed
+        if not self._check_rebalance_needed(average_utilization_rate_over_window):
+            return
+
+        expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
+            self._server_args, self._model_runner.model_config, logical_count
+        )
+
+        update_layer_ids_chunks = self._compute_update_layer_ids_chunks()
+        for chunk_index, update_layer_ids in enumerate(update_layer_ids_chunks):
+            if len(update_layer_ids_chunks) > 1:
+                yield
+            self._model_runner.update_expert_location(
+                expert_location_metadata,
+                update_layer_ids=update_layer_ids,
+            )
+
+        msg = f"[EPLBManager] rebalance end"
+        if enable_timing:
+            torch.get_device_module().synchronize()
+            time_end = time.time()
+            msg += f" time={time_end - time_start:.3f}s"
+        logger.info(msg)
+
+    def _check_rebalance_needed(self, average_utilization_rate_over_window):
+        if average_utilization_rate_over_window is None:
+            return True
+
+        if (
+            average_utilization_rate_over_window
+            > self._server_args.eplb_min_rebalancing_utilization_threshold
+        ):
+            logger.info(
+                f"[EPLBManager] Skipped ep rebalancing: current GPU utilization {average_utilization_rate_over_window:.2f} > minimum rebalance threshold {self._server_args.eplb_min_rebalancing_utilization_threshold:.2f}"
+            )
+            return False
+
+        return True
+
+    def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
+        all_layer_ids = sorted(
+            list(self._model_runner.model.routed_experts_weights_of_layer.keys())
+        )
+        chunk_size = self._rebalance_layers_per_chunk or 1000000
+        return list(_chunk_list(all_layer_ids, chunk_size=chunk_size))
+
+
+def _chunk_list(items: List, chunk_size):
+    for start_index in range(0, len(items), chunk_size):
+        yield items[start_index : start_index + chunk_size]
diff --git a/python/sglang/srt/eplb/eplb_simulator/__init__.py b/python/sglang/srt/eplb/eplb_simulator/__init__.py
new file mode 100644
index 000000000..a1fcbdf00
--- /dev/null
+++ b/python/sglang/srt/eplb/eplb_simulator/__init__.py
@@ -0,0 +1 @@
+from . import reader
diff --git a/python/sglang/srt/eplb/eplb_simulator/reader.py b/python/sglang/srt/eplb/eplb_simulator/reader.py
new file mode 100644
index 000000000..97405c319
--- /dev/null
+++ b/python/sglang/srt/eplb/eplb_simulator/reader.py
@@ -0,0 +1,51 @@
+from collections import defaultdict
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.eplb.expert_distribution import (
+    _convert_global_physical_count_to_logical_count,
+)
+
+convert_global_physical_count_to_logical_count = (
+    _convert_global_physical_count_to_logical_count
+)
+
+
+def read_mode_per_pass(dir_data: Path):
+    """Read data from ExpertDistributionRecorder when recorded with mode `per_pass`"""
+
+    # gpc := global_physical_count
+    gpc_of_forward_pass_and_rank = defaultdict(lambda: defaultdict())
+    for path in tqdm(list(dir_data.glob("*.pt"))):
+        data_pack = torch.load(path, weights_only=True)
+        last_physical_to_logical_map = data_pack["last_physical_to_logical_map"]
+        for record in data_pack["records"]:
+            forward_pass_id = record["forward_pass_id"]
+            rank = record["rank"]
+            assert (
+                gpc_of_forward_pass_and_rank[forward_pass_id].get(rank) is None
+            ), f"Duplicated {forward_pass_id=} {rank=}"
+            gpc_of_forward_pass_and_rank[forward_pass_id][rank] = record[
+                "global_physical_count"
+            ]
+
+    forward_pass_ids = sorted(gpc_of_forward_pass_and_rank.keys())
+    print(f"Make {forward_pass_ids=} into array")
+
+    items = []
+    for forward_pass_id, gpc_of_rank in sorted(gpc_of_forward_pass_and_rank.items()):
+        gpc_of_rank_tensor = torch.stack(
+            [gpc for rank, gpc in sorted(gpc_of_rank.items())]
+        ).sum(dim=0)
+        items.append(gpc_of_rank_tensor)
+
+    gpc_of_forward_pass = torch.stack(items)
+    print(f"{gpc_of_forward_pass.shape=}")
+
+    return dict(
+        global_physical_count_of_forward_pass=gpc_of_forward_pass,
+        last_physical_to_logical_map=last_physical_to_logical_map,
+        forward_pass_ids=forward_pass_ids,
+    )
diff --git a/python/sglang/srt/eplb/expert_distribution.py b/python/sglang/srt/eplb/expert_distribution.py
new file mode 100644
index 000000000..3faf981ef
--- /dev/null
+++ b/python/sglang/srt/eplb/expert_distribution.py
@@ -0,0 +1,966 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+import logging
+import math
+import os
+import time
+from abc import ABC
+from collections import deque
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
+
+import einops
+import torch
+import torch.distributed
+
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import Withable, get_bool_env_var, is_npu
+
+_is_npu = is_npu()
+
+if TYPE_CHECKING:
+    from sglang.srt.eplb.expert_location import ExpertLocationMetadata
+
+logger = logging.getLogger(__name__)
+
+# --------------------------------------- Entrypoint -----------------------------------------
+
+_OutputMode = Literal["file", "object"]
+
+
+class ExpertDistributionRecorder(ABC):
+    """Global expert distribution recording"""
+
+    @staticmethod
+    def init_new(
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        if server_args.expert_distribution_recorder_mode is not None:
+            assert (
+                expert_location_metadata is not None
+            ), "ExpertLocationMetadata is required for expert distribution recording. One possible"
+            "reason is that you are using a model that does not support expert distribution"
+            "recording. Try setting `get_model_config_for_expert_location` in your model."
+            return _ExpertDistributionRecorderReal(
+                server_args, expert_location_metadata, rank
+            )
+        else:
+            return _ExpertDistributionRecorderNoop()
+
+    @contextmanager
+    def with_current_layer(self, layer_idx):
+        yield
+
+    @contextmanager
+    def with_debug_name(self, debug_name):
+        yield
+
+    @contextmanager
+    def disable_this_region(self):
+        yield
+
+    @contextmanager
+    def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
+        yield
+
+    def on_select_experts(self, topk_ids: torch.Tensor):
+        pass
+
+    def on_deepep_dispatch_normal(
+        self,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        pass
+
+    def on_deepep_dispatch_low_latency(
+        self, local_physical_count_of_layer: torch.Tensor
+    ):
+        pass
+
+    def start_record(self):
+        self._on_not_implemented()
+
+    def stop_record(self):
+        self._on_not_implemented()
+
+    def dump_record(self, output_mode: _OutputMode = "file"):
+        self._on_not_implemented()
+
+    @property
+    def recording(self):
+        return False
+
+    def _on_not_implemented(self):
+        raise Exception(
+            "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
+        )
+
+
+class _ExpertDistributionRecorderNoop(ExpertDistributionRecorder):
+    pass
+
+
+class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        self._server_args = server_args
+        self._expert_location_metadata = expert_location_metadata
+
+        self._recording = False
+        self._disable_all = False
+        self._current_forward_pass_id = Withable()
+        self._current_layer_idx = Withable()
+        self._current_debug_name = Withable()
+        self._accumulator = _Accumulator.init_new(
+            server_args, expert_location_metadata, rank
+        )
+        self._single_pass_gatherers = {
+            k: _SinglePassGatherer.init_new(server_args, expert_location_metadata, rank)
+            for k in self._accumulator.get_single_pass_gatherer_keys()
+        }
+
+        if server_args.enable_expert_distribution_metrics:
+            logger.info(
+                "ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
+            )
+            self.start_record()
+
+    def with_current_layer(self, layer_idx):
+        return self._current_layer_idx.with_value(layer_idx)
+
+    def with_debug_name(self, debug_name):
+        return self._current_debug_name.with_value(debug_name)
+
+    @contextmanager
+    def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
+        with self._current_forward_pass_id.with_value(forward_pass_id):
+            self._on_forward_pass_start(forward_batch)
+            try:
+                yield
+            finally:
+                self._on_forward_pass_end(forward_pass_id)
+
+    @contextmanager
+    def disable_this_region(self):
+        """Context manager to temporarily disable recording."""
+        previous_disable_all = self._disable_all
+        self._disable_all = True
+        try:
+            yield
+        finally:
+            self._disable_all = previous_disable_all
+
+    def _on_forward_pass_start(self, forward_batch: ForwardBatch):
+        if not self._recording:
+            return
+        for gatherer_key, gatherer in self._single_pass_gatherers.items():
+            gatherer.reset()
+            gatherer.on_forward_pass_start(forward_batch)
+
+    def _on_forward_pass_end(self, forward_pass_id: int):
+        if not self._recording:
+            return
+        for gatherer_key, gatherer in self._single_pass_gatherers.items():
+            single_pass_data = gatherer.collect()
+            self._accumulator.append(forward_pass_id, gatherer_key, single_pass_data)
+
+    def on_select_experts(self, topk_ids: torch.Tensor):
+        self._on_hook("on_select_experts", topk_ids=topk_ids)
+
+    def on_deepep_dispatch_normal(
+        self,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        self._on_hook(
+            "on_deepep_dispatch_normal",
+            local_physical_count_of_layer=local_physical_count_of_layer,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+        )
+
+    def on_deepep_dispatch_low_latency(
+        self, local_physical_count_of_layer: torch.Tensor
+    ):
+        self._on_hook(
+            "on_deepep_dispatch_low_latency",
+            local_physical_count_of_layer=local_physical_count_of_layer,
+        )
+
+    def _on_hook(self, hook_name: str, **kwargs):
+        if self._disable_all:
+            return
+        if not (
+            self._recording or torch.get_device_module().is_current_stream_capturing()
+        ):
+            return
+        gatherer = self._single_pass_gatherers[
+            self._accumulator.get_single_pass_gatherer_key(
+                self._current_debug_name.value
+            )
+        ]
+        getattr(gatherer, hook_name)(layer_idx=self._current_layer_idx.value, **kwargs)
+
+    def _reset(self):
+        """Reset the expert distribution recorder."""
+        logger.info("Resetting ExpertDistributionRecorder...")
+        assert (
+            self._current_layer_idx.value is None
+        ), f"{self._current_layer_idx.value=}"
+        for gatherer in self._single_pass_gatherers.values():
+            gatherer.reset()
+        self._accumulator.reset()
+
+    def start_record(self):
+        """Start recording the expert distribution."""
+        if self._recording:
+            logger.warning(
+                "SGLang server is already recording expert ids. Did you forget to dump the expert ids recorded so far by sending requests to the `/stop_expert_distribution_record` and `/dump_expert_distribution_record` endpoints?"
+            )
+        self._reset()
+        self._recording = True
+
+    def stop_record(self):
+        """Stop recording the expert distribution."""
+        if not self._recording:
+            logger.warning(
+                "SGLang server has not been recording expert ids. Did you forget to start recording by sending request to the `/start_expert_distribution_record` endpoint?"
+            )
+        self._recording = False
+
+    def dump_record(self, output_mode: _OutputMode = "file"):
+        """Dump the expert distribution record and reset the recorder after dumping."""
+        output = self._accumulator.dump(output_mode=output_mode)
+        self._reset()
+        return output
+
+    @property
+    def recording(self):
+        return self._recording
+
+
+_global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
+    _ExpertDistributionRecorderNoop()
+)
+
+
+def get_global_expert_distribution_recorder():
+    return _global_expert_distribution_recorder
+
+
+def set_global_expert_distribution_recorder(value):
+    global _global_expert_distribution_recorder
+    _global_expert_distribution_recorder = value
+
+
+# --------------------------------------- SinglePassGatherer -----------------------------------------
+
+
+class _SinglePassGatherer(ABC):
+    @staticmethod
+    def init_new(
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ) -> "_SinglePassGatherer":
+        if server_args.expert_distribution_recorder_mode == "per_token":
+            return _DetailSinglePassGatherer(
+                server_args, expert_location_metadata, rank
+            )
+
+        if server_args.expert_distribution_recorder_mode == "stat_approx":
+            if server_args.moe_a2a_backend != "none" and (
+                server_args.deepep_mode == "normal"
+            ):
+                return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
+            else:
+                raise NotImplementedError
+
+        if server_args.moe_a2a_backend != "none":
+            if server_args.deepep_mode == "normal":
+                return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
+            elif server_args.deepep_mode == "low_latency":
+                return _DeepepLowLatencySinglePassGatherer(
+                    expert_location_metadata, rank
+                )
+            else:
+                raise NotImplementedError
+
+        return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
+
+    def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
+        self._expert_location_metadata = expert_location_metadata
+        self._rank = rank
+
+    def on_forward_pass_start(self, forward_batch: ForwardBatch):
+        pass
+
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        pass
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        pass
+
+    def on_deepep_dispatch_low_latency(
+        self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
+    ):
+        pass
+
+    def reset(self):
+        raise NotImplementedError
+
+    def collect(self) -> Dict:
+        raise NotImplementedError
+
+
+class _DetailSinglePassGatherer(_SinglePassGatherer):
+    # DeepSeek V3 has this value; should generalize later
+    _TOP_K_NUM = 8
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        super().__init__(expert_location_metadata, rank)
+        self._metadata: Optional[Dict[str, Any]] = None
+        self._topk_ids_of_layer = torch.zeros(
+            (
+                expert_location_metadata.num_layers,
+                # TODO determine the max number
+                server_args.chunked_prefill_size * 8,
+                self._TOP_K_NUM,
+            ),
+            dtype=torch.int32,
+            device=server_args.device,
+        )
+        self._misc_objects: List[Dict[str, Any]] = []
+        assert (
+            not server_args.enable_two_batch_overlap
+        ), "DetailSinglePassGatherer does not support TBO yet"
+        # TODO assert shared experts fusion is disabled, o/w data is wrong
+
+    def on_forward_pass_start(self, forward_batch: ForwardBatch):
+        assert self._metadata is None
+        self._metadata = dict(
+            # TODO pr-chain
+            # rids=forward_batch.rids,
+            input_ids=forward_batch.input_ids.cpu().tolist(),
+            positions=forward_batch.positions.cpu().tolist(),
+            extend_seq_lens=forward_batch.extend_seq_lens_cpu,
+            forward_mode=forward_batch.forward_mode.value,
+        )
+
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        self._topk_ids_of_layer[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = (
+            topk_ids
+        )
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        self._misc_objects.append(
+            dict(
+                layer_id=layer_idx,
+                num_tokens_per_rank=num_tokens_per_rank.cpu().tolist(),
+                num_tokens_per_rdma_rank=num_tokens_per_rdma_rank.cpu().tolist(),
+                num_tokens_per_expert=num_tokens_per_expert.cpu().tolist(),
+            )
+        )
+
+    def reset(self):
+        self._topk_ids_of_layer[...] = -1
+        self._misc_objects.clear()
+        self._metadata = None
+
+    def collect(self) -> Dict:
+        num_tokens = len(self._metadata["input_ids"])
+        return dict(
+            **self._metadata,
+            topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
+            misc_objects=self._misc_objects,
+        )
+
+
+class _LayerBasedCpuSinglePassGatherer(_SinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._objects_of_layer = {}
+
+    def _on_layer_data(self, layer_idx: int, objects: List[int]):
+        assert 0 <= layer_idx < self._expert_location_metadata.num_layers
+        if layer_idx in self._objects_of_layer:
+            self._objects_of_layer[layer_idx] = _list_sum(
+                self._objects_of_layer[layer_idx], objects
+            )
+        else:
+            self._objects_of_layer[layer_idx] = objects
+
+    def reset(self):
+        self._objects_of_layer.clear()
+
+    def _collect_objects(self, pad_len: int) -> torch.Tensor:
+        data = [
+            self._objects_of_layer.get(layer_index) or ([0] * pad_len)
+            for layer_index in range(self._expert_location_metadata.num_layers)
+        ]
+        return torch.tensor(data)
+
+
+def _list_sum(a: List, b: List) -> List:
+    return [x + y for x, y in zip(a, b, strict=True)]
+
+
+class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
+    def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
+        super().__init__(*args, **kwargs)
+        if not _is_npu:
+            device = "cuda"
+        else:
+            device = "npu"
+        self._enable_global_physical_experts = enable_global_physical_experts
+        self._data = torch.zeros(
+            (
+                self._expert_location_metadata.num_layers,
+                (
+                    self._expert_location_metadata.num_physical_experts
+                    if enable_global_physical_experts
+                    else self._expert_location_metadata.num_local_physical_experts
+                ),
+            ),
+            dtype=torch.int,
+            device=device,
+        )
+
+    def reset(self):
+        self._data[...] = 0
+
+    def collect(self) -> Dict:
+        if self._enable_global_physical_experts:
+            global_physical_count = self._data
+        else:
+            # Can optimize if bottleneck
+            global_physical_count = _convert_local_to_global_physical_count(
+                self._data,
+                rank=self._rank,
+                num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
+                num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            )
+
+        return dict(global_physical_count=global_physical_count)
+
+
+class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, enable_global_physical_experts=True)
+
+    # can optimize (e.g. fuse / compile)
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        topk_ids = topk_ids.flatten()
+        mask = topk_ids != -1
+        self._data[layer_idx, :].scatter_add_(
+            dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
+        )
+
+
+class _DeepepNormalSinglePassGatherer(_LayerBasedCpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if torch.distributed.get_rank() == 0:
+            logger.info(
+                "DeepepNormalSinglePassGatherer gathers approximate statistics. "
+                "If used with small batch size, consider using expert_distribution_recorder_mode=stat."
+            )
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        assert isinstance(local_physical_count_of_layer, list)
+        self._on_layer_data(layer_idx, local_physical_count_of_layer)
+
+    def collect(self) -> Dict:
+        local_physical_count = super()._collect_objects(
+            pad_len=self._expert_location_metadata.num_local_physical_experts
+        )
+        global_physical_count = _convert_local_to_global_physical_count(
+            local_physical_count,
+            rank=self._rank,
+            num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
+            num_physical_experts=self._expert_location_metadata.num_physical_experts,
+        )
+        return dict(global_physical_count=global_physical_count)
+
+
+class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, enable_global_physical_experts=False)
+
+    def on_deepep_dispatch_low_latency(
+        self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
+    ):
+        # Most naive implementation, can optimize later
+        self._data[layer_idx, :] += local_physical_count_of_layer
+
+
+def _convert_local_to_global_physical_count(
+    local_physical_count: torch.Tensor,
+    rank: int,
+    num_local_physical_experts: int,
+    num_physical_experts: int,
+) -> torch.Tensor:
+    dtype = local_physical_count.dtype
+    device = local_physical_count.device
+    num_layers, _ = local_physical_count.shape
+
+    ans = torch.zeros((num_layers, num_physical_experts), dtype=dtype, device=device)
+    ans[
+        :, num_local_physical_experts * rank : num_local_physical_experts * (rank + 1)
+    ] = local_physical_count
+    return ans
+
+
+# --------------------------------------- Accumulator -----------------------------------------
+
+_SINGLE_PASS_GATHERER_KEY_PRIMARY = "primary"
+
+
+class _Accumulator(ABC):
+    @staticmethod
+    def init_new(
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ) -> "_Accumulator":
+        return _Accumulator.get_class(server_args)(
+            server_args, expert_location_metadata, rank
+        )
+
+    @staticmethod
+    def get_class(server_args: ServerArgs) -> Type["_Accumulator"]:
+        return {
+            "stat": _StatAccumulator,
+            "stat_approx": _StatAccumulator,
+            "per_pass": _DetailAccumulator,
+            "per_token": _DetailAccumulator,
+        }[server_args.expert_distribution_recorder_mode]
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        self._server_args = server_args
+        self._expert_location_metadata = expert_location_metadata
+        self._rank = rank
+
+    def get_single_pass_gatherer_keys(self):
+        return [_SINGLE_PASS_GATHERER_KEY_PRIMARY]
+
+    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
+        return _SINGLE_PASS_GATHERER_KEY_PRIMARY
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+    ):
+        pass
+
+    def reset(self):
+        pass
+
+    def dump(self, output_mode: _OutputMode):
+        pass
+
+
+class _UtilizationRateAccumulatorMixin(_Accumulator):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._enable = self._server_args.enable_expert_distribution_metrics
+
+        if self._enable:
+            self.window_sizes = [10, 100, 1000]
+            self._history = _DequeCollection(maxlens=self.window_sizes)
+            self._rank = torch.distributed.get_rank()
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data)
+        if self._enable:
+            self._append_utilization_rate(
+                forward_pass_id, single_pass_data["global_physical_count"]
+            )
+
+    def reset(self):
+        super().reset()
+        if self._enable:
+            self._history.clear()
+
+    def _append_utilization_rate(
+        self, forward_pass_id: int, single_pass_global_physical_count: torch.Tensor
+    ):
+        gpu_physical_count = compute_gpu_physical_count(
+            single_pass_global_physical_count,
+            num_gpu=self._expert_location_metadata.ep_size,
+        )
+        gpu_physical_count = gpu_physical_count.to(self._server_args.device)
+        torch.distributed.reduce(
+            gpu_physical_count, dst=0, op=torch.distributed.ReduceOp.SUM
+        )
+
+        if self._rank == 0:
+            utilization_rate_tensor = compute_utilization_rate(gpu_physical_count)
+            utilization_rate = torch.mean(utilization_rate_tensor).item()
+            self._history.append(utilization_rate)
+
+            gpu_physical_count_sum = gpu_physical_count.sum().item()
+
+            logger.info(
+                f"[Expert Balancedness] "
+                f"forward_pass_id={forward_pass_id} "
+                f"current_pass_balancedness={utilization_rate:.03f} "
+                f"{''.join(f'last_{size}_average_balancedness={value:.03f} ' for size, value in self._history.mean().items())} "
+                f"gpu_physical_count_sum={gpu_physical_count_sum}"
+                # f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
+            )
+
+
+class _DequeCollection:
+    def __init__(self, maxlens: List[int]):
+        self._dequeues = [deque(maxlen=maxlen) for maxlen in maxlens]
+
+    def append(self, value):
+        for d in self._dequeues:
+            d.append(value)
+
+    def clear(self):
+        for d in self._dequeues:
+            d.clear()
+
+    def mean(self) -> Dict[int, float]:
+        return {d.maxlen: sum(d) / len(d) for d in self._dequeues}
+
+
+class _DetailAccumulator(_UtilizationRateAccumulatorMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._records = []
+
+    def get_single_pass_gatherer_keys(self):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return [_SINGLE_PASS_GATHERER_KEY_PRIMARY, "child_a", "child_b"]
+        return super().get_single_pass_gatherer_keys()
+
+    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return debug_name or _SINGLE_PASS_GATHERER_KEY_PRIMARY
+        return super().get_single_pass_gatherer_key(debug_name)
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data)
+
+        def _process_object(obj):
+            if isinstance(obj, torch.Tensor):
+                return obj.cpu().clone()
+            return obj
+
+        single_pass_data_processed = {
+            k: _process_object(v) for k, v in single_pass_data.items()
+        }
+
+        self._records.append(
+            dict(
+                forward_pass_id=forward_pass_id,
+                rank=self._rank,
+                gatherer_key=gatherer_key,
+                **single_pass_data_processed,
+            )
+        )
+
+    def reset(self):
+        super().reset()
+        self._records.clear()
+
+    def dump(self, output_mode: _OutputMode):
+        assert output_mode == "file"
+        output = dict(
+            records=self._records,
+            # NOTE: This may change during recording, so here we say it is the "last" one
+            last_physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
+        )
+        _dump_to_file(
+            f"expert_distribution_recorder_{time.time()}_{self._rank}.pt", output
+        )
+
+
+class _StatAccumulator(_UtilizationRateAccumulatorMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._global_physical_count_of_buffered_step = _Buffer.init_new(
+            item_shape=(
+                self._expert_location_metadata.num_layers,
+                # Cannot use local_physical_count to support select_experts
+                self._expert_location_metadata.num_physical_experts,
+            ),
+            buffer_size=self._server_args.expert_distribution_recorder_buffer_size,
+            dtype=torch.int32,
+            device=self._server_args.device,
+        )
+        self._first_dump = True
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data)
+        # Can optimize if overhead here is large
+        self._global_physical_count_of_buffered_step.append(
+            single_pass_data["global_physical_count"]
+        )
+
+    def reset(self):
+        super().reset()
+        self._global_physical_count_of_buffered_step.reset()
+
+    def dump(self, output_mode: _OutputMode):
+        logical_count_of_buffered_step = _convert_global_physical_count_to_logical_count(
+            self._global_physical_count_of_buffered_step.get_all(),
+            num_layers=self._expert_location_metadata.num_layers,
+            num_logical_experts=self._expert_location_metadata.num_logical_experts,
+            physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
+        )
+
+        if self._first_dump:
+            self._first_dump = False
+            torch.get_device_module().empty_cache()
+
+        torch.distributed.all_reduce(
+            logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
+        )
+
+        output = dict(
+            rank=self._rank,
+            logical_count=logical_count_of_buffered_step,
+            average_utilization_rate_over_window=self._get_global_average_utilization_rate(),
+        )
+
+        if output_mode == "file":
+            if self._rank == 0:
+                _dump_to_file(f"expert_distribution_recorder_{time.time()}.pt", output)
+        elif output_mode == "object":
+            return output
+        else:
+            raise NotImplementedError
+
+    def _get_global_average_utilization_rate(self):
+        if not self._enable or math.isclose(
+            self._server_args.eplb_min_rebalancing_utilization_threshold, 1.0
+        ):
+            return None
+
+        if self._rank == 0:
+            utilization_mean_rates = self._history.mean()
+            window_index = self.window_sizes[-1]
+            average_utilization_rate_over_window = (
+                utilization_mean_rates[window_index]
+                if window_index in utilization_mean_rates
+                else 0
+            )
+
+            avg_rate_tensor = torch.tensor(
+                [average_utilization_rate_over_window],
+                dtype=torch.float32,
+                device="cuda",
+            )
+        else:
+            avg_rate_tensor = torch.empty(1, dtype=torch.float32, device="cuda")
+        torch.distributed.broadcast(avg_rate_tensor, src=0)
+        return avg_rate_tensor.item()
+
+
+def _dump_to_file(name, data):
+    save_dir = Path(os.environ.get("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
+    path_output = save_dir / name
+    logger.info(f"Write expert distribution to {path_output}")
+    if not save_dir.exists():
+        save_dir.mkdir(parents=True, exist_ok=True)
+    torch.save(data, str(path_output))
+
+
+class _Buffer:
+    @staticmethod
+    def init_new(item_shape: Tuple, buffer_size: int, dtype, device):
+        if buffer_size < 0:
+            return _InfiniteBuffer(item_shape, dtype=dtype, device=device)
+        else:
+            return _CircularBuffer(item_shape, buffer_size, dtype=dtype, device=device)
+
+    def append(self, value: torch.Tensor):
+        raise NotImplementedError
+
+    def get_all(self) -> torch.Tensor:
+        raise NotImplementedError
+
+    def reset(self):
+        raise NotImplementedError
+
+
+class _CircularBuffer(_Buffer):
+    def __init__(self, item_shape: Tuple, buffer_size: int, dtype, device):
+        self._buffer = torch.zeros(
+            (buffer_size, *item_shape), dtype=dtype, device=device
+        )
+        self._curr_index = 0
+
+    def append(self, value: torch.Tensor):
+        self._buffer[self._curr_index] = value
+        self._curr_index = (self._curr_index + 1) % len(self._buffer)
+
+    def get_all(self) -> torch.Tensor:
+        return self._buffer
+
+    def reset(self):
+        self._buffer[...] = 0
+
+
+class _InfiniteBuffer(_Buffer):
+    def __init__(self, item_shape: Tuple, dtype, device):
+        self._item_shape = item_shape
+        self._buffer = torch.zeros((128, *item_shape), dtype=dtype, device=device)
+        self._size = 0
+
+    def append(self, value: torch.Tensor):
+        curr_buffer_size = len(self._buffer)
+        dtype = self._buffer.dtype
+        device = self._buffer.device
+
+        if self._size == curr_buffer_size:
+            new_buffer = torch.zeros(
+                (2 * curr_buffer_size, *self._item_shape), dtype=dtype, device=device
+            )
+            new_buffer[:curr_buffer_size] = self._buffer
+            self._buffer = new_buffer
+
+        self._buffer[self._size] = value
+        self._size += 1
+
+    def get_all(self) -> torch.Tensor:
+        return self._buffer[: self._size]
+
+    def reset(self):
+        self._buffer[...] = 0
+        self._size = 0
+
+
+def _convert_global_physical_count_to_logical_count(
+    # (whatever, num_layers, num_physical_experts)
+    global_physical_count: torch.Tensor,
+    num_layers: int,
+    num_logical_experts: int,
+    physical_to_logical_map: torch.Tensor,
+):
+    dim_extra, _, _ = global_physical_count.shape
+    dtype = global_physical_count.dtype
+    device = global_physical_count.device
+    logical_count = torch.zeros(
+        (dim_extra, num_layers, num_logical_experts), dtype=dtype, device=device
+    )
+    logical_count.scatter_add_(
+        dim=2,
+        index=physical_to_logical_map.unsqueeze(0)
+        .expand(dim_extra, -1, -1)
+        .to(torch.int64),
+        src=global_physical_count,
+    )
+    return logical_count
+
+
+def compute_gpu_physical_count(
+    physical_count_of_whatever: torch.Tensor,  # (..., num_layer, num_physical_expert)
+    num_gpu: int,
+):
+    """output: gpu_physical_count_of_batch (..., num_layer, num_gpu)"""
+    return einops.reduce(
+        physical_count_of_whatever,
+        "... num_layer (num_gpu num_expert_per_gpu) -> ... num_layer num_gpu",
+        "sum",
+        num_gpu=num_gpu,
+    )
+
+
+def compute_utilization_rate(
+    gpu_physical_count_of_batch: torch.Tensor,  # (..., num_layer, num_gpu)
+):
+    """output: utilization_rate (..., num_layer)"""
+    gpu_physical_count_of_batch = gpu_physical_count_of_batch.float()
+    max_gpu_physical_count = einops.reduce(
+        gpu_physical_count_of_batch,
+        "... num_layer num_gpu -> ... num_layer",
+        "max",
+    )
+    avg_gpu_physical_count = einops.reduce(
+        gpu_physical_count_of_batch,
+        "... num_layer num_gpu -> ... num_layer",
+        "mean",
+    )
+    return (avg_gpu_physical_count + 1e-5) / (max_gpu_physical_count + 1e-5)
diff --git a/python/sglang/srt/eplb/expert_location.py b/python/sglang/srt/eplb/expert_location.py
new file mode 100644
index 000000000..ee5f2c7ca
--- /dev/null
+++ b/python/sglang/srt/eplb/expert_location.py
@@ -0,0 +1,468 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+import json
+import logging
+import random
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import torch.distributed
+import torch.nn.functional as F
+
+from sglang.srt.eplb import eplb_algorithms
+from sglang.srt.model_loader import get_model_architecture
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ExpertLocationMetadata:
+    physical_to_logical_map: torch.Tensor  # (layers, num_physical_experts)
+    physical_to_logical_map_cpu: torch.Tensor
+    logical_to_all_physical_map: torch.Tensor  # (layers, num_logical_experts, X)
+    logical_to_all_physical_map_cpu: torch.Tensor  # CPU copy for performance
+    logical_to_all_physical_map_num_valid: torch.Tensor  # (layers, num_logical_experts)
+    # (layers, num_logical_experts)
+    logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
+
+    # -------------------------------- properties ------------------------------------
+
+    @property
+    def num_layers(self) -> int:
+        return self.physical_to_logical_map.shape[0]
+
+    @property
+    def num_physical_experts(self) -> int:
+        return self.physical_to_logical_map.shape[1]
+
+    @property
+    def num_local_physical_experts(self) -> int:
+        ans, remainder = divmod(self.num_physical_experts, self.ep_size)
+        assert remainder == 0
+        return ans
+
+    @property
+    def num_logical_experts(self) -> int:
+        return self.logical_to_all_physical_map.shape[1]
+
+    @property
+    def ep_size(self):
+        # TODO change when EP size != world size
+        return torch.distributed.get_world_size()
+
+    def __post_init__(self):
+        num_layers_0, num_physical_experts_0 = self.physical_to_logical_map.shape
+        num_layers_1, num_logical_experts_0, num_physical_experts_1 = (
+            self.logical_to_all_physical_map.shape
+        )
+        num_layers_2, num_logical_experts_1 = (
+            self.logical_to_all_physical_map_num_valid.shape
+        )
+        assert num_layers_0 == num_layers_1 == num_layers_2
+        assert num_logical_experts_0 == num_logical_experts_1
+        assert num_physical_experts_0 == num_physical_experts_1
+
+    # -------------------------------- construction ------------------------------------
+
+    @staticmethod
+    def init_trivial(server_args: ServerArgs, model_config: ModelConfig):
+        """Trivial location - logical expert i corresponds to physical expert i"""
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
+        num_physical_experts = common["num_physical_experts"]
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        num_layers = model_config_for_expert_location.num_layers
+        num_logical_experts = model_config_for_expert_location.num_logical_experts
+
+        physical_to_logical_map = (
+            torch.arange(0, num_physical_experts).repeat(num_layers, 1)
+            % num_logical_experts
+        )
+
+        return ExpertLocationMetadata.init_by_mapping(
+            server_args,
+            model_config,
+            physical_to_logical_map=physical_to_logical_map,
+        )
+
+    @staticmethod
+    def init_by_mapping(
+        server_args: ServerArgs,
+        model_config: ModelConfig,
+        physical_to_logical_map,
+    ):
+        if not isinstance(physical_to_logical_map, torch.Tensor):
+            physical_to_logical_map = torch.tensor(physical_to_logical_map)
+        physical_to_logical_map = physical_to_logical_map.to(server_args.device)
+
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        logical_to_all_physical_map = _compute_logical_to_all_physical_map(
+            physical_to_logical_map,
+            num_logical_experts=model_config_for_expert_location.num_logical_experts,
+        )
+
+        return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
+            ep_size=common["ep_size"],
+            physical_to_logical_map=physical_to_logical_map,
+            logical_to_all_physical_map=logical_to_all_physical_map,
+        )
+
+    @staticmethod
+    def init_by_eplb(
+        server_args: ServerArgs, model_config: ModelConfig, logical_count: torch.Tensor
+    ):
+        if not isinstance(logical_count, torch.Tensor):
+            logical_count = torch.tensor(logical_count)
+        if len(logical_count.shape) == 2:
+            logical_count = logical_count.unsqueeze(0)
+        logical_count = logical_count.to(server_args.device)
+
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        num_physical_experts = common["num_physical_experts"]
+        num_groups = model_config_for_expert_location.num_groups
+        num_nodes = server_args.nnodes
+
+        physical_to_logical_map, logical_to_all_physical_map, expert_count = (
+            eplb_algorithms.rebalance_experts(
+                tokens_per_expert=logical_count,
+                num_physical_experts=num_physical_experts,
+                num_local_physical_experts=num_physical_experts // common["ep_size"],
+                num_groups=num_groups,
+                num_nodes=num_nodes,
+                algorithm=eplb_algorithms.compute_algorithm(
+                    raw_algorithm=server_args.eplb_algorithm,
+                    num_groups=num_groups,
+                    num_nodes=num_nodes,
+                ),
+            )
+        )
+
+        return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
+            ep_size=common["ep_size"],
+            physical_to_logical_map=physical_to_logical_map.to(server_args.device),
+            logical_to_all_physical_map=logical_to_all_physical_map.to(
+                server_args.device
+            ),
+        )
+
+    @staticmethod
+    def _init_common(server_args: ServerArgs, model_config: ModelConfig):
+        model_config_for_expert_location = (
+            ModelConfigForExpertLocation.from_model_config(model_config)
+        )
+
+        if model_config_for_expert_location is None:
+            return None
+
+        num_physical_experts = (
+            model_config_for_expert_location.num_logical_experts
+            + server_args.ep_num_redundant_experts
+        )
+        ep_size = server_args.ep_size
+        assert num_physical_experts % ep_size == 0
+        num_local_physical_experts = num_physical_experts // ep_size
+
+        return dict(
+            model_config_for_expert_location=model_config_for_expert_location,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            ep_size=ep_size,
+        )
+
+    @staticmethod
+    def _init_raw(
+        server_args: ServerArgs,
+        ep_size: int,
+        physical_to_logical_map: torch.Tensor,
+        logical_to_all_physical_map: torch.Tensor,
+    ):
+        _, num_physical_experts = physical_to_logical_map.shape
+
+        logical_to_all_physical_map_padded = F.pad(
+            logical_to_all_physical_map,
+            (0, num_physical_experts - logical_to_all_physical_map.shape[-1]),
+            value=-1,
+        )
+
+        logical_to_all_physical_map_num_valid = torch.count_nonzero(
+            logical_to_all_physical_map != -1, dim=-1
+        )
+
+        return ExpertLocationMetadata(
+            physical_to_logical_map=physical_to_logical_map,
+            physical_to_logical_map_cpu=physical_to_logical_map.cpu(),
+            logical_to_all_physical_map=logical_to_all_physical_map_padded,
+            logical_to_all_physical_map_cpu=logical_to_all_physical_map_padded.cpu(),
+            logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
+            logical_to_rank_dispatch_physical_map=(
+                compute_logical_to_rank_dispatch_physical_map(
+                    logical_to_all_physical_map=logical_to_all_physical_map,
+                    num_gpus=ep_size,
+                    num_physical_experts=num_physical_experts,
+                    # TODO improve when we have real EP rank
+                    ep_rank=torch.distributed.get_rank() % ep_size,
+                )
+                if server_args.ep_dispatch_algorithm == "static"
+                else None
+            ),
+        )
+
+    # -------------------------------- mutation ------------------------------------
+
+    def update(
+        self,
+        other: "ExpertLocationMetadata",
+        update_layer_ids: List[int],
+    ):
+        for field in [
+            "ep_size",
+        ]:
+            assert getattr(self, field) == getattr(other, field)
+
+        for field in [
+            "physical_to_logical_map",
+            "physical_to_logical_map_cpu",
+            "logical_to_all_physical_map",
+            "logical_to_all_physical_map_cpu",
+            "logical_to_all_physical_map_num_valid",
+            "logical_to_rank_dispatch_physical_map",
+        ]:
+            other_field = getattr(other, field)
+            self_field = getattr(self, field)
+            assert (other_field is not None) == (self_field is not None)
+            if self_field is not None:
+                mask_update = torch.tensor(
+                    [i in update_layer_ids for i in range(self.num_layers)]
+                )
+                mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))
+                mask_update = mask_update.to(self_field.device, non_blocking=True)
+                self_field[...] = torch.where(mask_update, other_field, self_field)
+
+    # -------------------------------- usage ------------------------------------
+
+    def logical_to_all_physical(
+        self, layer_id: int, logical_expert_id: int
+    ) -> List[int]:
+        # Use CPU copy to avoid GPU→CPU sync on every call, which is expensive in update weights scenario
+        return [
+            physical_expert_id
+            for physical_expert_id in self.logical_to_all_physical_map_cpu[
+                layer_id, logical_expert_id
+            ].tolist()
+            if physical_expert_id != -1
+        ]
+
+
+_global_expert_location_metadata: Optional[ExpertLocationMetadata] = None
+
+
+def get_global_expert_location_metadata():
+    return _global_expert_location_metadata
+
+
+def set_global_expert_location_metadata(value):
+    global _global_expert_location_metadata
+    assert _global_expert_location_metadata is None
+    _global_expert_location_metadata = value
+
+
+def _compute_logical_to_all_physical_map(
+    physical_to_logical_map: torch.Tensor, num_logical_experts: int
+):
+    # This is rarely called, so we use for loops for maximum clarity
+
+    num_layers, num_physical_experts = physical_to_logical_map.shape
+
+    logical_to_all_physical_map = [
+        [[] for _ in range(num_logical_experts)] for _ in range(num_layers)
+    ]
+    for layer_id in range(num_layers):
+        for physical_expert_id in range(num_physical_experts):
+            logical_expert_id = physical_to_logical_map[
+                layer_id, physical_expert_id
+            ].item()
+            logical_to_all_physical_map[layer_id][logical_expert_id].append(
+                physical_expert_id
+            )
+
+    logical_to_all_physical_map = _pad_nested_array(
+        logical_to_all_physical_map, pad_value=-1
+    )
+
+    return torch.tensor(
+        logical_to_all_physical_map, device=physical_to_logical_map.device
+    )
+
+
+def _pad_nested_array(arr, pad_value):
+    max_len = max(len(inner) for outer in arr for inner in outer)
+    padded = [
+        [inner + [pad_value] * (max_len - len(inner)) for inner in outer]
+        for outer in arr
+    ]
+    return padded
+
+
+# TODO optimize performance (rewrite and/or run in separate process with overlap)
+def compute_logical_to_rank_dispatch_physical_map(
+    logical_to_all_physical_map: torch.Tensor,
+    num_gpus: int,
+    num_physical_experts: int,
+    ep_rank: int,
+    seed: int = 42,
+):
+    r = random.Random(seed)
+
+    num_local_physical_experts = num_physical_experts // num_gpus
+    num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
+    dtype = logical_to_all_physical_map.dtype
+
+    logical_to_rank_dispatch_physical_map = torch.full(
+        size=(num_gpus, num_layers, num_logical_experts),
+        fill_value=-1,
+        dtype=dtype,
+    )
+
+    for layer_id in range(num_layers):
+        for logical_expert_id in range(num_logical_experts):
+            candidate_physical_expert_ids = _logical_to_all_physical_raw(
+                logical_to_all_physical_map, layer_id, logical_expert_id
+            )
+            output_partial = logical_to_rank_dispatch_physical_map[
+                :, layer_id, logical_expert_id
+            ]
+
+            for gpu_id in range(num_gpus):
+                same_gpu_physical_expert_ids = [
+                    physical_expert_id
+                    for physical_expert_id in candidate_physical_expert_ids
+                    if _compute_gpu_id_of_physical_expert(
+                        physical_expert_id, num_local_physical_experts
+                    )
+                    == gpu_id
+                ]
+                if len(same_gpu_physical_expert_ids) > 0:
+                    output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
+
+            num_remain = torch.sum(output_partial == -1).item()
+            output_partial[output_partial == -1] = torch.tensor(
+                _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
+                dtype=dtype,
+            )
+
+    assert torch.all(logical_to_rank_dispatch_physical_map != -1)
+
+    device = logical_to_all_physical_map.device
+    return logical_to_rank_dispatch_physical_map[ep_rank, :, :].to(device)
+
+
+def _logical_to_all_physical_raw(
+    logical_to_all_physical_map, layer_id: int, logical_expert_id: int
+) -> List[int]:
+    return [
+        physical_expert_id
+        for physical_expert_id in logical_to_all_physical_map[
+            layer_id, logical_expert_id
+        ].tolist()
+        if physical_expert_id != -1
+    ]
+
+
+def _compute_gpu_id_of_physical_expert(
+    physical_expert_id: int, num_local_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_physical_experts
+
+
+def _fair_choices(arr: List, k: int, r: random.Random) -> List:
+    quotient, remainder = divmod(k, len(arr))
+    ans = arr * quotient + r.sample(arr, k=remainder)
+    r.shuffle(ans)
+    return ans
+
+
+@dataclass
+class ModelConfigForExpertLocation:
+    num_layers: int
+    num_logical_experts: int
+    num_groups: Optional[int] = None
+
+    @staticmethod
+    def from_model_config(model_config: ModelConfig):
+        model_class, _ = get_model_architecture(model_config)
+        if hasattr(model_class, "get_model_config_for_expert_location"):
+            return model_class.get_model_config_for_expert_location(
+                model_config.hf_config
+            )
+        else:
+            return None
+
+
+def compute_initial_expert_location_metadata(
+    server_args: ServerArgs, model_config: ModelConfig
+) -> Optional[ExpertLocationMetadata]:
+    data = server_args.init_expert_location
+    if data == "trivial":
+        return ExpertLocationMetadata.init_trivial(server_args, model_config)
+
+    # TODO unify with the utils function
+    if data.endswith(".pt"):
+        data_dict = torch.load(data, weights_only=True)
+    elif data.endswith(".json"):
+        data_dict = json.loads(Path(data).read_text())
+    else:
+        data_dict = json.loads(data)
+
+    if "physical_to_logical_map" in data_dict:
+        logger.info(
+            "init_expert_location from init_by_mapping using ServerArgs.init_expert_location"
+        )
+        return ExpertLocationMetadata.init_by_mapping(
+            server_args, model_config, **data_dict
+        )
+    elif "logical_count" in data_dict:
+        logger.info(
+            "init_expert_location from init_by_eplb using ServerArgs.init_expert_location"
+        )
+        return ExpertLocationMetadata.init_by_eplb(
+            server_args, model_config, logical_count=data_dict["logical_count"]
+        )
+    else:
+        raise NotImplementedError(
+            f"Unknown init_expert_location format ({list(data_dict.keys())=})"
+        )
diff --git a/python/sglang/srt/eplb/expert_location_dispatch.py b/python/sglang/srt/eplb/expert_location_dispatch.py
new file mode 100644
index 000000000..624dc3fd1
--- /dev/null
+++ b/python/sglang/srt/eplb/expert_location_dispatch.py
@@ -0,0 +1,109 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from dataclasses import dataclass
+from typing import Literal, Optional
+
+import torch
+
+from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+
+@dataclass
+class ExpertLocationDispatchInfo:
+    ep_dispatch_algorithm: Literal["static", "random"]
+    # (num_logical_experts,)
+    partial_logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
+    # (num_logical_experts, X)
+    partial_logical_to_all_physical_map: torch.Tensor
+    # (num_logical_experts,)
+    partial_logical_to_all_physical_map_num_valid: torch.Tensor
+    num_physical_experts: int
+
+    @classmethod
+    def init_new(cls, layer_id: int):
+        ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
+        expert_location_metadata = get_global_expert_location_metadata()
+        assert expert_location_metadata is not None
+
+        if ep_dispatch_algorithm is None:
+            return None
+
+        return cls(
+            ep_dispatch_algorithm=ep_dispatch_algorithm,
+            partial_logical_to_rank_dispatch_physical_map=(
+                expert_location_metadata.logical_to_rank_dispatch_physical_map[
+                    layer_id, :
+                ]
+                if expert_location_metadata.logical_to_rank_dispatch_physical_map
+                is not None
+                else None
+            ),
+            partial_logical_to_all_physical_map=expert_location_metadata.logical_to_all_physical_map[
+                layer_id, :
+            ],
+            partial_logical_to_all_physical_map_num_valid=expert_location_metadata.logical_to_all_physical_map_num_valid[
+                layer_id, :
+            ],
+            num_physical_experts=expert_location_metadata.num_physical_experts,
+        )
+
+
+def transform_select_experts_inputs(
+    router_logits: torch.Tensor,
+    correction_bias: Optional[torch.Tensor],
+    info: Optional[ExpertLocationDispatchInfo],
+):
+    if (info is not None) and (info.ep_dispatch_algorithm == "fake"):
+        router_logits.uniform_(5, 10)
+        if correction_bias is not None:
+            correction_bias = torch.zeros_like(correction_bias)
+    return router_logits, correction_bias
+
+
+def topk_ids_logical_to_physical(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    if info is None:
+        return topk_ids
+
+    if info.ep_dispatch_algorithm == "static":
+        return _topk_ids_logical_to_physical_static(topk_ids, info)
+    if info.ep_dispatch_algorithm in ["dynamic", "fake"]:
+        return _topk_ids_logical_to_physical_dynamic(topk_ids, info)
+    raise NotImplementedError(f"Unknown algorithm {info.ep_dispatch_algorithm}")
+
+
+def _topk_ids_logical_to_physical_static(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    return info.partial_logical_to_rank_dispatch_physical_map[topk_ids]
+
+
+def _topk_ids_logical_to_physical_dynamic(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    topk_ids_original_shape = topk_ids.shape
+    device = topk_ids.device
+    topk_ids = topk_ids.flatten()
+
+    chosen_dispatch_index = (
+        torch.randint(0, 65536, topk_ids.shape, dtype=torch.int32, device=device)
+        % info.partial_logical_to_all_physical_map_num_valid[topk_ids]
+    )
+    topk_ids = info.partial_logical_to_all_physical_map[topk_ids, chosen_dispatch_index]
+
+    topk_ids = topk_ids.view(topk_ids_original_shape)
+    return topk_ids
diff --git a/python/sglang/srt/eplb/expert_location_updater.py b/python/sglang/srt/eplb/expert_location_updater.py
new file mode 100644
index 000000000..772e65f18
--- /dev/null
+++ b/python/sglang/srt/eplb/expert_location_updater.py
@@ -0,0 +1,575 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+
+import einops
+import torch
+import torch.distributed
+from torch.distributed import P2POp
+
+from sglang.srt.eplb.expert_location import (
+    ExpertLocationMetadata,
+    get_global_expert_location_metadata,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.utils import get_bool_env_var
+
+logger = logging.getLogger(__name__)
+
+
+_LOG_INPUT = get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT")
+
+
+class ExpertLocationUpdater:
+    def __init__(self):
+        self._first_execution = True
+
+    def update(
+        self,
+        routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+        new_expert_location_metadata: ExpertLocationMetadata,
+        update_layer_ids: List[int],
+        nnodes: int,
+        rank: int,
+    ):
+        if self._first_execution:
+            self._first_execution = False
+            torch.get_device_module().empty_cache()
+
+        old_expert_location_metadata = get_global_expert_location_metadata()
+        assert old_expert_location_metadata is not None
+
+        _update_expert_weights(
+            routed_experts_weights_of_layer=routed_experts_weights_of_layer,
+            old_expert_location_metadata=old_expert_location_metadata,
+            new_expert_location_metadata=new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+            nnodes=nnodes,
+            rank=rank,
+        )
+        old_expert_location_metadata.update(
+            new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+        )
+
+
+def _update_expert_weights(**kwargs):
+    if get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_CANARY"):
+        return _update_expert_weights_with_canary(**kwargs)
+    else:
+        return _update_expert_weights_raw(**kwargs)
+
+
+# can add watchdog as well
+def _update_expert_weights_with_canary(
+    routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+    old_expert_location_metadata: ExpertLocationMetadata,
+    new_expert_location_metadata: ExpertLocationMetadata,
+    update_layer_ids: List[int],
+    nnodes: int,
+    rank: int,
+):
+    num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
+
+    def _get_canary_value(meta: ExpertLocationMetadata, layer_id: int):
+        return meta.physical_to_logical_map_cpu[
+            layer_id,
+            num_local_physical_experts * rank : num_local_physical_experts * (rank + 1),
+        ]
+
+    routed_experts_weights_of_layer = {
+        k: [x for x in v] for k, v in routed_experts_weights_of_layer.items()
+    }
+    for layer_id in update_layer_ids:
+        canary_tensor = (
+            _get_canary_value(old_expert_location_metadata, layer_id)
+            .clone()
+            .to(device=global_server_args_dict["device"], non_blocking=True)
+        )
+        routed_experts_weights_of_layer[layer_id].append(canary_tensor)
+
+    _update_expert_weights_raw(
+        routed_experts_weights_of_layer=routed_experts_weights_of_layer,
+        old_expert_location_metadata=old_expert_location_metadata,
+        new_expert_location_metadata=new_expert_location_metadata,
+        update_layer_ids=update_layer_ids,
+        nnodes=nnodes,
+        rank=rank,
+    )
+
+    for layer_id in update_layer_ids:
+        # can optimize speed if needed
+        expect_value = _get_canary_value(new_expert_location_metadata, layer_id)
+        actual_value = routed_experts_weights_of_layer[layer_id][-1].cpu()
+        assert torch.all(expect_value == actual_value), (
+            f"{expect_value=} {actual_value=} {layer_id=} "
+            f"{old_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
+            f"{new_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
+        )
+
+
+def _update_expert_weights_raw(
+    routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+    old_expert_location_metadata: ExpertLocationMetadata,
+    new_expert_location_metadata: ExpertLocationMetadata,
+    update_layer_ids: List[int],
+    nnodes: int,
+    rank: int,
+):
+    log_metrics = get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS")
+
+    temp_buffers = create_temp_buffers(
+        routed_experts_weights_of_layer[update_layer_ids[0]]
+    )
+
+    world_size = torch.distributed.get_world_size()
+    num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
+    num_gpu_per_node = world_size // nnodes
+
+    for layer_id in update_layer_ids:
+        update_expert_weights_single_layer(
+            routed_experts_weights=routed_experts_weights_of_layer[layer_id],
+            temp_buffers=temp_buffers,
+            old_physical_to_logical_map=old_expert_location_metadata.physical_to_logical_map_cpu[
+                layer_id
+            ].tolist(),
+            new_physical_to_logical_map=new_expert_location_metadata.physical_to_logical_map_cpu[
+                layer_id
+            ].tolist(),
+            num_local_physical_experts=num_local_physical_experts,
+            num_gpu_per_node=num_gpu_per_node,
+            rank=rank,
+            world_size=world_size,
+            log_metrics=log_metrics,
+        )
+
+
+def create_temp_buffers(sample_tensors):
+    return [torch.empty_like(tensor) for tensor in sample_tensors]
+
+
+def update_expert_weights_single_layer(
+    routed_experts_weights: List[torch.Tensor],
+    temp_buffers: List[torch.Tensor],
+    old_physical_to_logical_map: List[int],  # (num_physical_Experts,)
+    new_physical_to_logical_map: List[int],  # (num_physical_Experts,)
+    num_local_physical_experts: int,
+    num_gpu_per_node: int,
+    rank: int,
+    world_size: Optional[int] = None,
+    debug: bool = False,
+    log_metrics: bool = False,
+):
+    assert all(
+        tensor.shape[0] == num_local_physical_experts
+        for tensor in routed_experts_weights
+    ), f"{num_local_physical_experts=} {[x.shape for x in routed_experts_weights]=}"
+    assert isinstance(old_physical_to_logical_map, list)
+    assert isinstance(new_physical_to_logical_map, list)
+
+    if _LOG_INPUT:
+        logger.info(
+            "update_expert_weights_single_layer "
+            f"{[x.shape for x in routed_experts_weights]=} "
+            f"{[x.shape for x in temp_buffers]=} "
+            f"{old_physical_to_logical_map=} "
+            f"{new_physical_to_logical_map=} "
+            f"{num_local_physical_experts=} "
+            f"{num_gpu_per_node=} "
+            f"{rank=} "
+            f"{world_size=} "
+        )
+
+    output_logs = [] if debug else None
+
+    num_physical_experts = len(old_physical_to_logical_map)
+    num_tensors = len(routed_experts_weights)
+
+    self_node_id = rank // num_gpu_per_node
+
+    local_expert_location_range = (
+        rank * num_local_physical_experts,
+        (rank + 1) * num_local_physical_experts,
+    )
+
+    def _entrypoint():
+        # List[Tuple[logical_expert_id, List[P2POp]]]
+        p2p_op_infos: List[Tuple[int, List[P2POp]]] = []
+        # List[Tuple[temp_buffers_expert_location, routed_experts_weights_expert_location]]
+        buffer2weight_copy_infos: List[Tuple[int, int]] = []
+
+        _handle_recv(buffer2weight_copy_infos, p2p_op_infos)
+        _create_isend_ops(p2p_op_infos)
+        _execute_p2p_ops(p2p_op_infos)
+        _execute_buffer2weight_copies(buffer2weight_copy_infos)
+
+        if log_metrics:
+            _log_p2p_op_metrics(
+                p2p_op_infos,
+                world_size=world_size,
+                num_gpu_per_node=num_gpu_per_node,
+                self_node_id=self_node_id,
+            )
+
+        if debug:
+            output_logs.append(f"{p2p_op_infos=}")
+            output_logs.append(f"{buffer2weight_copy_infos=}")
+
+    def _handle_recv(buffer2weight_copy_infos, p2p_op_infos):
+        for dst_expert_location in range(*local_expert_location_range):
+            _handle_recv_of_dst_expert_location(
+                dst_expert_location, buffer2weight_copy_infos, p2p_op_infos
+            )
+
+    def _handle_recv_of_dst_expert_location(
+        dst_expert_location: int, buffer2weight_copy_infos, p2p_op_infos
+    ):
+        logical_expert_id = new_physical_to_logical_map[dst_expert_location]
+
+        # case 1: unchanged
+        if old_physical_to_logical_map[dst_expert_location] == logical_expert_id:
+            if debug:
+                output_logs.append(
+                    f"handle_recv_of_dst_expert_location {dst_expert_location=} case=unchanged"
+                )
+            return
+
+        # case 2: same-gpu
+        for src_expert_location in range(*local_expert_location_range):
+            if old_physical_to_logical_map[src_expert_location] == logical_expert_id:
+                for i in range(num_tensors):
+                    _get_tensor(temp_buffers, i, dst_expert_location).copy_(
+                        _get_tensor(routed_experts_weights, i, src_expert_location)
+                    )
+                buffer2weight_copy_infos.append(
+                    (dst_expert_location, dst_expert_location)
+                )
+                if debug:
+                    output_logs.append(
+                        f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-gpu {src_expert_location=}"
+                    )
+                return
+
+        # case 3: free-rider
+        for src_expert_location in range(
+            rank * num_local_physical_experts, dst_expert_location
+        ):
+            if new_physical_to_logical_map[src_expert_location] == logical_expert_id:
+                buffer2weight_copy_infos.append(
+                    (src_expert_location, dst_expert_location)
+                )
+                if debug:
+                    output_logs.append(
+                        f"handle_recv_of_dst_expert_location {dst_expert_location=} case=free-rider {src_expert_location=}"
+                    )
+                return
+
+        same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
+            _compute_comm_info(logical_expert_id=logical_expert_id)
+        )
+
+        # case 4: same-node
+        if rank in need_comm_self_node_dst_ranks:
+            chosen_src_rank = same_node_mapping.chunk_value_from_element_value(
+                element_value=rank
+            )
+            _create_p2p_recv_and_buffer2weight_copy(
+                buffer2weight_copy_infos,
+                p2p_op_infos,
+                src_rank=chosen_src_rank,
+                logical_expert_id=logical_expert_id,
+                dst_expert_location=dst_expert_location,
+            )
+            if debug:
+                output_logs.append(
+                    f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-node {chosen_src_rank=}"
+                )
+            return
+
+        # case 5: cross-node
+        # Future work: can optimize when there are multiple ranks in the same dst node that uses the same logical expert
+        chosen_src_rank = cross_node_mapping.chunk_value_from_element_value(
+            element_value=rank
+        )
+        _create_p2p_recv_and_buffer2weight_copy(
+            buffer2weight_copy_infos,
+            p2p_op_infos,
+            src_rank=chosen_src_rank,
+            logical_expert_id=logical_expert_id,
+            dst_expert_location=dst_expert_location,
+        )
+        if debug:
+            output_logs.append(
+                f"handle_recv_of_dst_expert_location {dst_expert_location=} case=cross-node {chosen_src_rank=}"
+            )
+        return
+
+    def _create_p2p_recv_and_buffer2weight_copy(
+        buffer2weight_copy_infos,
+        p2p_op_infos,
+        *,
+        logical_expert_id: int,
+        src_rank: int,
+        dst_expert_location: int,
+    ):
+        p2p_op_infos.append(
+            (
+                logical_expert_id,
+                [
+                    P2POp(
+                        op=torch.distributed.irecv,
+                        tensor=_get_tensor(temp_buffers, i, dst_expert_location),
+                        peer=src_rank,
+                    )
+                    for i in range(num_tensors)
+                ],
+            )
+        )
+        buffer2weight_copy_infos.append((dst_expert_location, dst_expert_location))
+
+    def _create_isend_ops(p2p_op_infos):
+        handled_logical_expert_ids = set()
+        for src_expert_location in range(*local_expert_location_range):
+            logical_expert_id = old_physical_to_logical_map[src_expert_location]
+
+            if logical_expert_id in handled_logical_expert_ids:
+                continue
+            handled_logical_expert_ids.add(logical_expert_id)
+
+            _create_isend_ops_of_logical_expert_id(
+                logical_expert_id, src_expert_location, p2p_op_infos
+            )
+
+    def _create_isend_ops_of_logical_expert_id(
+        logical_expert_id, src_expert_location, p2p_op_infos
+    ):
+        same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
+            _compute_comm_info(logical_expert_id=logical_expert_id)
+        )
+
+        same_node_dst_ranks = same_node_mapping.element_values_from_chunk_value(
+            chunk_value=rank
+        )
+        cross_node_dst_ranks = cross_node_mapping.element_values_from_chunk_value(
+            chunk_value=rank
+        )
+        all_dst_ranks = same_node_dst_ranks + cross_node_dst_ranks
+
+        if debug:
+            output_logs.append(
+                f"create_isend_ops_of_logical_expert_id {logical_expert_id=} {src_expert_location=} {same_node_dst_ranks=} {cross_node_dst_ranks=}"
+            )
+
+        p2p_op_infos.append(
+            (
+                logical_expert_id,
+                [
+                    P2POp(
+                        op=torch.distributed.isend,
+                        tensor=_get_tensor(
+                            routed_experts_weights, i, src_expert_location
+                        ),
+                        peer=dst_rank,
+                    )
+                    for dst_rank in all_dst_ranks
+                    for i in range(num_tensors)
+                ],
+            )
+        )
+
+    def _compute_comm_info(logical_expert_id: int):
+        all_src_ranks = _deduplicate_ordered(
+            [
+                x // num_local_physical_experts
+                for x in range(num_physical_experts)
+                if old_physical_to_logical_map[x] == logical_expert_id
+            ]
+        )
+        all_src_nodes = [x // num_gpu_per_node for x in all_src_ranks]
+        self_node_src_ranks = [
+            x for x in all_src_ranks if x // num_gpu_per_node == self_node_id
+        ]
+
+        need_comm_dst_ranks = _deduplicate_ordered(
+            [
+                x // num_local_physical_experts
+                for x in range(num_physical_experts)
+                if new_physical_to_logical_map[x] == logical_expert_id
+                and x // num_local_physical_experts not in all_src_ranks
+            ]
+        )
+        need_comm_self_node_dst_ranks = (
+            [x for x in need_comm_dst_ranks if x // num_gpu_per_node == self_node_id]
+            if len(self_node_src_ranks) > 0
+            else []
+        )
+        need_comm_cross_node_dst_ranks = [
+            x
+            for x in need_comm_dst_ranks
+            if (x // num_gpu_per_node) not in all_src_nodes
+        ]
+
+        same_node_mapping = _ChunkUtils(
+            chunk_values=self_node_src_ranks,
+            element_values=need_comm_self_node_dst_ranks,
+        )
+
+        cross_node_mapping = _ChunkUtils(
+            chunk_values=all_src_ranks,
+            element_values=need_comm_cross_node_dst_ranks,
+        )
+
+        return same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks
+
+    def _execute_p2p_ops(p2p_op_infos):
+        sorted_infos = sorted(p2p_op_infos, key=lambda info: info[0])
+        p2p_ops = [op for _, ops in sorted_infos for op in ops]
+        if len(p2p_ops) == 0:
+            return
+
+        reqs = torch.distributed.batch_isend_irecv(p2p_ops)
+        for req in reqs:
+            req.wait()
+
+    def _execute_buffer2weight_copies(buffer2weight_copy_infos):
+        for (
+            temp_buffers_expert_location,
+            routed_experts_weights_expert_location,
+        ) in buffer2weight_copy_infos:
+            for i in range(num_tensors):
+                _get_tensor(
+                    routed_experts_weights, i, routed_experts_weights_expert_location
+                ).copy_(_get_tensor(temp_buffers, i, temp_buffers_expert_location))
+
+    def _get_tensor(tensors, tensor_index: int, expert_location: int) -> torch.Tensor:
+        return tensors[tensor_index][_get_local_expert_location(expert_location)]
+
+    def _get_local_expert_location(expert_location: int) -> int:
+        assert (
+            local_expert_location_range[0]
+            <= expert_location
+            < local_expert_location_range[1]
+        )
+        return expert_location % num_local_physical_experts
+
+    _entrypoint()
+
+    return output_logs
+
+
+class _ChunkUtils:
+    def __init__(self, *, chunk_values: List, element_values: List):
+        self.chunk_values = chunk_values
+        self.element_values = element_values
+
+    def chunk_value_from_element_value(self, element_value):
+        chunk_index = self._chunk_index_from_element_index(
+            num_elements=len(self.element_values),
+            num_chunks=len(self.chunk_values),
+            element_index=self.element_values.index(element_value),
+        )
+        return self.chunk_values[chunk_index]
+
+    def element_values_from_chunk_value(self, chunk_value) -> List:
+        if len(self.element_values) == 0:
+            return []
+        element_slice = self._element_slice_from_chunk_index(
+            num_elements=len(self.element_values),
+            num_chunks=len(self.chunk_values),
+            chunk_index=self.chunk_values.index(chunk_value),
+        )
+        return self.element_values[element_slice]
+
+    @staticmethod
+    def _chunk_index_from_element_index(
+        num_elements: int, num_chunks: int, element_index: int
+    ) -> int:
+        short_chunk_size, num_long_chunks = divmod(num_elements, num_chunks)
+        num_elements_for_long_chunks = num_long_chunks * (short_chunk_size + 1)
+        if element_index < num_elements_for_long_chunks:
+            return element_index // (short_chunk_size + 1)
+        else:
+            return (
+                num_long_chunks
+                + (element_index - num_elements_for_long_chunks) // short_chunk_size
+            )
+
+    @staticmethod
+    def _element_slice_from_chunk_index(
+        num_elements: int, num_chunks: int, chunk_index: int
+    ) -> slice:
+        short_chunk_size, num_long_chunks = divmod(num_elements, num_chunks)
+        start = chunk_index * short_chunk_size + min(chunk_index, num_long_chunks)
+        end = start + short_chunk_size + int(chunk_index < num_long_chunks)
+        return slice(start, end)
+
+
+def _deduplicate_ordered(arr: List[int]):
+    output = []
+    for item in arr:
+        if len(output) == 0 or item != output[-1]:
+            output.append(item)
+    return output
+
+
+def _log_p2p_op_metrics(
+    p2p_op_infos: List[Tuple[int, List[P2POp]]],
+    num_gpu_per_node: int,
+    world_size: int,
+    self_node_id: int,
+):
+    text = ""
+    all_ops = [op for _, ops in p2p_op_infos for op in ops]
+
+    for direction, ops in _group_by(all_ops, _get_direction_from_op).items():
+        nbytes_of_gpu = [0] * world_size
+        for op in ops:
+            nbytes_of_gpu[op.peer] += op.tensor.nbytes
+        nbytes_of_gpu = torch.tensor(nbytes_of_gpu, dtype=torch.int64)
+
+        nbytes_of_node = einops.reduce(
+            nbytes_of_gpu,
+            "(num_nodes num_gpu_per_node) -> num_nodes",
+            num_gpu_per_node=num_gpu_per_node,
+            reduction="sum",
+        )
+
+        nbytes_curr_node = nbytes_of_node[self_node_id]
+        nbytes_cross_node = torch.sum(nbytes_of_node) - nbytes_curr_node
+
+        text += (
+            f"{direction}_nbytes_of_gpu={nbytes_of_gpu.tolist()} "
+            f"{direction}_nbytes_of_node={nbytes_of_node.tolist()} "
+            f"{direction}_nbytes_curr_node={nbytes_curr_node.item()} "
+            f"{direction}_nbytes_cross_node={nbytes_cross_node.item()} "
+        )
+
+    logger.info(f"[ExpertLocationUpdater] {text}")
+
+
+def _get_direction_from_op(op: P2POp):
+    if op.op == torch.distributed.isend:
+        return "isend"
+    if op.op == torch.distributed.irecv:
+        return "irecv"
+    raise NotImplementedError
+
+
+def _group_by(items, keyfunc):
+    ans = defaultdict(list)
+    for item in items:
+        ans[keyfunc(item)].append(item)
+    return dict(ans)
diff --git a/python/sglang/srt/function_call/base_format_detector.py b/python/sglang/srt/function_call/base_format_detector.py
new file mode 100644
index 000000000..39bb92f5f
--- /dev/null
+++ b/python/sglang/srt/function_call/base_format_detector.py
@@ -0,0 +1,367 @@
+import json
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+from partial_json_parser.core.exceptions import MalformedJSON
+from partial_json_parser.core.options import Allow
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import (
+    _find_common_prefix,
+    _is_complete_json,
+    _partial_json_loads,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class BaseFormatDetector(ABC):
+    """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+    def __init__(self):
+        # Streaming state management
+        # Buffer for accumulating incomplete patterns that arrive across multiple streaming chunks
+        self._buffer = ""
+        # Stores complete tool call info (name and arguments) for each tool being parsed.
+        # Used by serving layer for completion handling when streaming ends.
+        # Format: [{"name": str, "arguments": dict}, ...]
+        self.prev_tool_call_arr: List[Dict] = []
+        # Index of currently streaming tool call. Starts at -1 (no active tool),
+        # increments as each tool completes. Tracks which tool's arguments are streaming.
+        self.current_tool_id: int = -1
+        # Flag for whether current tool's name has been sent to client.
+        # Tool names sent first with empty parameters, then arguments stream incrementally.
+        self.current_tool_name_sent: bool = False
+        # Tracks raw JSON string content streamed to client for each tool's arguments.
+        # Critical for serving layer to calculate remaining content when streaming ends.
+        # Each index corresponds to a tool_id. Example: ['{"location": "San Francisco"', '{"temp": 72']
+        self.streamed_args_for_tool: List[str] = []
+
+        # Token configuration (override in subclasses)
+        self.bot_token = ""
+        self.eot_token = ""
+        self.tool_call_separator = ", "
+
+    def _get_tool_indices(self, tools: List[Tool]) -> Dict[str, int]:
+        """
+        Get a mapping of tool names to their indices in the tools list.
+
+        This utility method creates a dictionary mapping function names to their
+        indices in the tools list, which is commonly needed for tool validation
+        and ToolCallItem creation.
+
+        Args:
+            tools: List of available tools
+
+        Returns:
+            Dictionary mapping tool names to their indices
+        """
+        return {
+            tool.function.name: i for i, tool in enumerate(tools) if tool.function.name
+        }
+
+    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
+        tool_indices = self._get_tool_indices(tools)
+        if not isinstance(action, list):
+            action = [action]
+
+        results = []
+        for act in action:
+            name = act.get("name")
+            if name and name in tool_indices:
+                results.append(
+                    ToolCallItem(
+                        tool_index=-1,  # Caller should update this based on the actual tools array called
+                        name=name,
+                        parameters=json.dumps(
+                            act.get("parameters") or act.get("arguments", {}),
+                            ensure_ascii=False,
+                        ),
+                    )
+                )
+            else:
+                logger.warning(f"Model attempted to call undefined function: {name}")
+
+        return results
+
+    @abstractmethod
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        Parses the text in one go. Returns success=True if the format matches, otherwise False.
+        Note that leftover_text here represents "content that this parser will not consume further".
+        """
+        action = json.loads(text)
+        return StreamingParseResult(calls=self.parse_base_json(action, tools))
+
+    def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
+        """
+        Check if buffer ends with a partial bot_token.
+        Return the length of the partial bot_token.
+
+        For some format, the bot_token is not a token in model's vocabulary, such as
+        `[TOOL_CALLS] [` in Mistral.
+        """
+        for i in range(1, min(len(buffer) + 1, len(bot_token))):
+            if bot_token.startswith(buffer[-i:]):
+                return i
+        return 0
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing with tool validation.
+
+        This base implementation works best with formats where:
+        1. bot_token is followed immediately by JSON (e.g., bot_token + JSON_array)
+        2. JSON can be parsed incrementally using partial_json_loads
+        3. Multiple tool calls are separated by "; " or ", "
+
+        Examples of incompatible formats (need custom implementation, may reuse some logic from this class):
+        - Each tool call is wrapped in a separate block: See Qwen25Detector
+        - Multiple separate blocks: [TOOL_CALLS] [...] \n [TOOL_CALLS] [...]
+        - Tool call is Pythonic style
+
+        For incompatible formats, detectors should override this method with custom logic.
+        """
+        # Append new text to buffer
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # The current_text has tool_call if it is the start of a new tool call sequence
+        # or it is the start of a new tool call after a tool call separator, when there is a previous tool call
+        if not (
+            self.has_tool_call(current_text)
+            or (
+                self.current_tool_id > 0
+                and current_text.startswith(self.tool_call_separator)
+            )
+        ):
+            # Only clear buffer if we're sure no tool call is starting
+            if not self._ends_with_partial_token(self._buffer, self.bot_token):
+                normal_text = self._buffer
+                self._buffer = ""
+                if self.eot_token in normal_text:
+                    normal_text = normal_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=normal_text)
+            else:
+                # Might be partial bot_token, keep buffering
+                return StreamingParseResult()
+
+        # Build tool indices if not already built
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
+
+        try:
+            try:
+                if current_text.startswith(self.bot_token):
+                    start_idx = len(self.bot_token)
+                elif self.current_tool_id > 0 and current_text.startswith(
+                    self.tool_call_separator + self.bot_token
+                ):
+                    start_idx = len(self.tool_call_separator + self.bot_token)
+                elif self.current_tool_id > 0 and current_text.startswith(
+                    self.tool_call_separator
+                ):
+                    start_idx = len(self.tool_call_separator)
+                else:
+                    start_idx = 0
+
+                if start_idx >= len(current_text):
+                    return StreamingParseResult()
+
+                (obj, end_idx) = _partial_json_loads(current_text[start_idx:], flags)
+
+                is_current_complete = _is_complete_json(
+                    current_text[start_idx : start_idx + end_idx]
+                )
+
+                # Validate tool name if present
+                if "name" in obj and obj["name"] not in self._tool_indices:
+                    # Invalid tool name - reset state
+                    self._buffer = ""
+                    self.current_tool_id = -1
+                    self.current_tool_name_sent = False
+                    if self.streamed_args_for_tool:
+                        self.streamed_args_for_tool.pop()
+                    return StreamingParseResult()
+
+                # Handle parameters/arguments consistency
+                # NOTE: we assume here that the obj is always partial of a single tool call
+                if "parameters" in obj:
+                    assert (
+                        "arguments" not in obj
+                    ), "model generated both parameters and arguments"
+                    obj["arguments"] = obj["parameters"]
+
+                current_tool_call = obj
+
+            except MalformedJSON:
+                return StreamingParseResult()
+
+            if not current_tool_call:
+                return StreamingParseResult()
+
+            # Case 1: Handle tool name streaming
+            # This happens when we encounter a tool but haven't sent its name yet
+            if not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+
+                if function_name and function_name in self._tool_indices:
+                    # If this is a new tool (current_tool_id was -1), initialize it
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                        self.streamed_args_for_tool.append("")
+                    # If this is a subsequent tool, ensure streamed_args_for_tool is large enough
+                    elif self.current_tool_id >= len(self.streamed_args_for_tool):
+                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                            self.streamed_args_for_tool.append("")
+
+                    # Send the tool name with empty parameters
+                    res = StreamingParseResult(
+                        calls=[
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=function_name,
+                                parameters="",
+                            )
+                        ],
+                    )
+                    self.current_tool_name_sent = True
+                else:
+                    res = StreamingParseResult()
+
+            # Case 2: Handle streaming arguments
+            # This happens when we've already sent the tool name and now need to stream arguments incrementally
+            else:
+                cur_arguments = current_tool_call.get("arguments")
+                res = StreamingParseResult()
+
+                if cur_arguments:
+                    # Calculate how much of the arguments we've already streamed
+                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments)
+                    prev_arguments = None
+                    if self.current_tool_id < len(self.prev_tool_call_arr):
+                        prev_arguments = self.prev_tool_call_arr[
+                            self.current_tool_id
+                        ].get("arguments")
+
+                    argument_diff = None
+
+                    # If the current tool's JSON is complete, send all remaining arguments
+                    if is_current_complete:
+                        argument_diff = cur_args_json[sent:]
+                        completing_tool_id = (
+                            self.current_tool_id
+                        )  # Save the ID of the tool that's completing
+
+                        # Only remove the processed portion, keep unprocessed content
+                        self._buffer = current_text[start_idx + end_idx :]
+
+                        if self.current_tool_id < len(self.prev_tool_call_arr):
+                            self.prev_tool_call_arr[self.current_tool_id].clear()
+                        self.current_tool_name_sent = False
+                        self.streamed_args_for_tool[self.current_tool_id] = ""
+                        self.current_tool_id += 1
+
+                    # If the tool is still being parsed, send incremental changes
+                    elif prev_arguments:
+                        prev_args_json = json.dumps(prev_arguments)
+                        if cur_args_json != prev_args_json:
+                            prefix = _find_common_prefix(prev_args_json, cur_args_json)
+                            argument_diff = prefix[sent:]
+
+                    # Send the argument diff if there's something new
+                    if argument_diff is not None:
+                        # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
+                        tool_index_to_use = (
+                            completing_tool_id
+                            if is_current_complete
+                            else self.current_tool_id
+                        )
+                        res = StreamingParseResult(
+                            calls=[
+                                ToolCallItem(
+                                    tool_index=tool_index_to_use,
+                                    parameters=argument_diff,
+                                )
+                            ],
+                        )
+                        if not is_current_complete:
+                            self.streamed_args_for_tool[
+                                self.current_tool_id
+                            ] += argument_diff
+
+            # Update prev_tool_call_arr with current state
+            if self.current_tool_id >= 0:
+                # Ensure prev_tool_call_arr is large enough
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
+
+            return res
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult()
+
+    @abstractmethod
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains function call markers specific to this format.
+        """
+        raise NotImplementedError()
+
+    def supports_structural_tag(self) -> bool:
+        """Return True if this detector supports structural tag format."""
+        return True
+
+    @abstractmethod
+    def structure_info(self) -> _GetInfoFunc:
+        """
+        Return a function that creates StructureInfo for constrained generation.
+
+        The returned function takes a tool name and returns a StructureInfo object
+        containing the begin/end patterns and trigger tokens needed for constrained
+        generation of function calls in this format.
+
+        Returns:
+            A function that takes a tool name (str) and returns StructureInfo
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build an EBNF grammar for constrained generation of function calls.
+
+        This method generates an Extended Backus-Naur Form (EBNF) grammar that
+        constrains the model's output to valid function calls in this format.
+        The grammar should include all available tools and their parameter schemas.
+
+        Args:
+            tools: List of available tools/functions that can be called
+
+        Returns:
+            A string containing the EBNF grammar for this function call format
+
+        The EBNF grammar should:
+            - Define the overall structure of function calls in this format
+            - Include all tool names from the provided tools list
+            - Define valid JSON structures for function arguments
+            - Handle multiple function calls if the format supports them
+
+        Note:
+            Most implementations use EBNFComposer.build_ebnf() utility with
+            format-specific parameters rather than writing EBNF from scratch.
+        """
+        raise NotImplementedError()
diff --git a/python/sglang/srt/function_call/core_types.py b/python/sglang/srt/function_call/core_types.py
new file mode 100644
index 000000000..1ea87df79
--- /dev/null
+++ b/python/sglang/srt/function_call/core_types.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import Callable, List, Optional
+
+from pydantic import BaseModel
+
+
+class ToolCallItem(BaseModel):
+    """Simple encapsulation of the parsed ToolCall result for easier usage in streaming contexts."""
+
+    tool_index: int
+    name: Optional[str] = None
+    parameters: str  # JSON string
+
+
+class StreamingParseResult(BaseModel):
+    """Result of streaming incremental parsing."""
+
+    normal_text: str = ""
+    calls: List[ToolCallItem] = []
+
+
+@dataclass
+class StructureInfo:
+    begin: str
+    end: str
+    trigger: str
+
+
+"""
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
+which can be used to construct a structural_tag object
+"""
+_GetInfoFunc = Callable[[str], StructureInfo]
diff --git a/python/sglang/srt/function_call/deepseekv31_detector.py b/python/sglang/srt/function_call/deepseekv31_detector.py
new file mode 100644
index 000000000..2045d8daa
--- /dev/null
+++ b/python/sglang/srt/function_call/deepseekv31_detector.py
@@ -0,0 +1,222 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.function_call.utils import _is_complete_json
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekV31Detector(BaseFormatDetector):
+    """
+    Detector for DeepSeek V3 model function call format.
+
+    The DeepSeek V3 format uses special Unicode tokens to delimit function calls
+    with JSON code blocks for arguments.
+
+    Format Structure:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>{function_name}<｜tool▁sep｜>{json_arguments}<｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+    Examples:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_current_weather<｜tool▁sep｜>{"location": "Tokyo"}<｜tool▁call▁end｜><｜tool▁call▁begin｜>get_current_weather<｜tool▁sep｜>{"location": "Paris"}<｜tool▁call▁end｜><｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+
+    Key Components:
+    - Tool Calls Section: Wrapped between `<｜tool▁calls▁begin｜>` and `<｜tool▁calls▁end｜>`
+    - Individual Tool Call: Wrapped between `<｜tool▁call▁begin｜>` and `<｜tool▁call▁end｜>`
+    - Function Declaration: `<｜tool▁call▁begin｜>{function_name}<｜tool▁sep｜>`
+    - Arguments: JSON code block between `<｜tool▁sep｜>` and `<｜tool▁call▁end｜>`
+    - Supports multiple tool calls
+
+    Reference: https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3.1
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜tool▁calls▁begin｜>"
+        self.eot_token = "<｜tool▁calls▁end｜>"
+        self.func_call_regex = r"<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>"
+        self.func_detail_regex = (
+            r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)<｜tool▁call▁end｜>"
+        )
+        self._last_arguments = ""
+        self.current_tool_id = -1
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a deepseek format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(1)
+                func_args = func_detail.group(2)
+                func_args = json.loads(func_args)
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": func_args}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for DeepSeekV3 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call (either the start token or individual tool call)
+        has_tool_call = (
+            self.bot_token in current_text or "<｜tool▁call▁begin｜>" in current_text
+        )
+
+        if not has_tool_call:
+            self._buffer = ""
+            for e_token in [self.eot_token, "<｜tool▁call▁end｜>"]:
+                if e_token in new_text:
+                    new_text = new_text.replace(e_token, "")
+            return StreamingParseResult(normal_text=new_text)
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            partial_match = re.search(
+                pattern=r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)<｜tool▁call▁end｜>",
+                string=current_text,
+                flags=re.DOTALL,
+            )
+            if partial_match:
+                func_name = partial_match.group(1).strip()
+                func_args_raw = partial_match.group(2).strip()
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure we have enough entries in our tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    # Store the tool call info for serving layer completions endpoint
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+                else:
+                    argument_diff = (
+                        func_args_raw[len(self._last_arguments) :]
+                        if func_args_raw.startswith(self._last_arguments)
+                        else func_args_raw
+                    )
+
+                    if argument_diff:
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=argument_diff,
+                            )
+                        )
+                        self._last_arguments += argument_diff
+                        self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ] += argument_diff
+
+                    if _is_complete_json(func_args_raw):
+                        # Update the stored arguments
+                        try:
+                            parsed_args = json.loads(func_args_raw)
+                            self.prev_tool_call_arr[self.current_tool_id][
+                                "arguments"
+                            ] = parsed_args
+                        except json.JSONDecodeError:
+                            pass
+
+                        # Find the end of the current tool call and remove only that part from buffer
+                        tool_call_end_pattern = (
+                            r"<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>"
+                        )
+                        match = re.search(
+                            tool_call_end_pattern, current_text, re.DOTALL
+                        )
+                        if match:
+                            # Remove the completed tool call from buffer, keep any remaining content
+                            self._buffer = current_text[match.end() :]
+                        else:
+                            self._buffer = ""
+
+                        result = StreamingParseResult(normal_text="", calls=calls)
+                        self.current_tool_id += 1
+                        self._last_arguments = ""
+                        self.current_tool_name_sent = False
+                        return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin="<｜tool▁call▁begin｜>" + name + "<｜tool▁sep｜>",
+            end="<｜tool▁call▁end｜>",
+            trigger="<｜tool▁call▁begin｜>" + name + "<｜tool▁sep｜>",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
+            tool_call_separator="",
+            call_rule_fmt='"<｜tool▁call▁begin｜>{name}<｜tool▁sep｜>{arguments_rule}<｜tool▁call▁end｜>"',
+            function_format="json",
+        )
diff --git a/python/sglang/srt/function_call/deepseekv3_detector.py b/python/sglang/srt/function_call/deepseekv3_detector.py
new file mode 100644
index 000000000..33c4dfc44
--- /dev/null
+++ b/python/sglang/srt/function_call/deepseekv3_detector.py
@@ -0,0 +1,220 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.function_call.utils import _is_complete_json
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekV3Detector(BaseFormatDetector):
+    """
+    Detector for DeepSeek V3 model function call format.
+
+    The DeepSeek V3 format uses special Unicode tokens to delimit function calls
+    with JSON code blocks for arguments.
+
+    Format Structure:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>{function_name}\n```json\n{json_arguments}\n```<｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+    Examples:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n```json\n{"location": "Tokyo"}\n```<｜tool▁call▁end｜>\n<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n```json\n{"location": "Paris"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+
+    Key Components:
+    - Tool Calls Section: Wrapped between `<｜tool▁calls▁begin｜>` and `<｜tool▁calls▁end｜>`
+    - Individual Tool Call: Wrapped between `<｜tool▁call▁begin｜>` and `<｜tool▁call▁end｜>`
+    - Function Declaration: `function<｜tool▁sep｜>{function_name}`
+    - Arguments: JSON code block between ````json` and ````
+    - Supports multiple tool calls
+
+    Reference: https://huggingface.co/deepseek-ai/DeepSeek-V3-0324?chat_template=default
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜tool▁calls▁begin｜>"
+        self.eot_token = "<｜tool▁calls▁end｜>"
+        self.func_call_regex = r"<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>"
+        self.func_detail_regex = r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)\n```json\n(.*)\n```<｜tool▁call▁end｜>"
+        self._last_arguments = ""
+        self.current_tool_id = -1
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a deepseek format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(2)
+                func_args = func_detail.group(3)
+                func_args = json.loads(func_args)
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": func_args}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for DeepSeekV3 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call (either the start token or individual tool call)
+        has_tool_call = (
+            self.bot_token in current_text or "<｜tool▁call▁begin｜>" in current_text
+        )
+
+        if not has_tool_call:
+            self._buffer = ""
+            for e_token in [self.eot_token, "```", "<｜tool▁call▁end｜>"]:
+                if e_token in new_text:
+                    new_text = new_text.replace(e_token, "")
+            return StreamingParseResult(normal_text=new_text)
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            partial_match = re.search(
+                pattern=r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)\n```json\n(.*)\n```.*",
+                string=current_text,
+                flags=re.DOTALL,
+            )
+            if partial_match:
+                func_name = partial_match.group(2).strip()
+                func_args_raw = partial_match.group(3).strip()
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure we have enough entries in our tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    # Store the tool call info for serving layer completions endpoint
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+                else:
+                    argument_diff = (
+                        func_args_raw[len(self._last_arguments) :]
+                        if func_args_raw.startswith(self._last_arguments)
+                        else func_args_raw
+                    )
+
+                    if argument_diff:
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=argument_diff,
+                            )
+                        )
+                        self._last_arguments += argument_diff
+                        self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ] += argument_diff
+
+                    if _is_complete_json(func_args_raw):
+                        # Update the stored arguments
+                        try:
+                            parsed_args = json.loads(func_args_raw)
+                            self.prev_tool_call_arr[self.current_tool_id][
+                                "arguments"
+                            ] = parsed_args
+                        except json.JSONDecodeError:
+                            pass
+
+                        # Find the end of the current tool call and remove only that part from buffer
+                        tool_call_end_pattern = (
+                            r"<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>"
+                        )
+                        match = re.search(
+                            tool_call_end_pattern, current_text, re.DOTALL
+                        )
+                        if match:
+                            # Remove the completed tool call from buffer, keep any remaining content
+                            self._buffer = current_text[match.end() :]
+                        else:
+                            self._buffer = ""
+
+                        result = StreamingParseResult(normal_text="", calls=calls)
+                        self.current_tool_id += 1
+                        self._last_arguments = ""
+                        self.current_tool_name_sent = False
+                        return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin=">" + name + "\n```json\n",
+            end="\n```<",
+            trigger=">" + name + "\n```json\n",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
+            tool_call_separator="",
+            call_rule_fmt='"<｜tool▁call▁begin｜>function<｜tool▁sep｜>{name}\\n```json\\n"{arguments_rule}"\\n```<｜tool▁call▁end｜>"',
+            function_format="json",
+        )
diff --git a/python/sglang/srt/function_call/ebnf_composer.py b/python/sglang/srt/function_call/ebnf_composer.py
new file mode 100644
index 000000000..21b313982
--- /dev/null
+++ b/python/sglang/srt/function_call/ebnf_composer.py
@@ -0,0 +1,344 @@
+from typing import Any, Dict, Literal, Optional
+
+
+class EBNFComposer:
+    # Adapted from https://xgrammar.mlc.ai/docs/how_to/ebnf_guided_generation.html#try-out-via-hf-transformers
+    # Shared primitive grammar rules used across all formats
+    BASE_PRIMITIVE_GRAMMAR = r"""
+        basic_string ::= (([\"] basic_string_1 [\"]))
+        basic_string_1 ::= "" | [^"\\\x00-\x1F] basic_string_1 | "\\" escape basic_string_1
+        escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9]{4}
+        basic_integer ::= ("0" | "-"? [1-9] [0-9]*) ".0"?
+        basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?
+        basic_array ::= "[" ("" | ws basic_any (ws "," ws basic_any)*) ws "]"
+        basic_object ::= "{" ("" | ws basic_string ws ":" ws basic_any ( ws "," ws basic_string ws ":" ws basic_any)*) ws "}"
+        ws ::= [ \n\t]*
+    """
+
+    # Format-specific extensions
+    json_grammar_ebnf_str = (
+        r"""
+        json ::= basic_array | basic_object
+        basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
+        basic_boolean ::= "true" | "false"
+        basic_null ::= "null"
+    """
+        + BASE_PRIMITIVE_GRAMMAR
+    )
+
+    pythonic_grammar_ebnf_str = (
+        r"""
+        pythonic ::= basic_number | basic_string | basic_array | "True" | "False" | "None"
+        basic_any ::= basic_number | basic_string | basic_array | basic_object
+        basic_boolean ::= "True" | "False"
+        basic_null ::= "None"
+    """
+        + BASE_PRIMITIVE_GRAMMAR
+    )
+
+    xml_grammar_ebnf_str = (
+        r"""
+        xml ::= xml_element | xml_text
+        xml_element ::= basic_string | basic_number | basic_boolean | basic_null | basic_array | basic_object
+        xml_text ::= [^<>]*
+        basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object
+        basic_boolean ::= "true" | "false"
+        basic_null ::= "null"
+    """
+        + BASE_PRIMITIVE_GRAMMAR
+    )
+
+    CALL_RULE_MAP = {
+        "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
+        "json": 'call_{name} ::= "{{" ws "\\"name\\"" ws ":" ws "\\"{name}\\"" ws "," ws "\\"arguments\\"" ws ":" ws {arguments_rule} ws "}}"',
+        "xml": 'call_{name} ::= "<function={name}>\\n" {arguments_rule} "\\n</function>"',
+    }
+
+    ARGUMENTS_RULE_MAP = {
+        "pythonic": "{arg_rules}",
+        "json": '"{{" ws {arg_rules} ws "}}"',
+        "xml": "{arg_rules}",
+    }
+
+    KEY_VALUE_RULE_MAP = {
+        "pythonic": '"{key}" "=" {valrule}',
+        "json": '"\\"{key}\\"" ws ":" ws {valrule}',
+        "xml": '"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
+    }
+
+    # Base type mapping - most types are the same across formats
+    BASE_TYPE_MAPPING = {
+        "string": "basic_string",
+        "number": "basic_number",
+        "integer": "basic_number",
+        "boolean": "basic_boolean",
+        "null": "basic_null",
+        "array": "basic_array",
+        "object": "basic_object",
+    }
+
+    # Format-specific overrides for types that differ
+    FORMAT_TYPE_OVERRIDES = {
+        "pythonic": {
+            "boolean": '"True" | "False"',
+            "null": '"None"',
+        },
+        "xml": {
+            "string": "xml_text",
+        },
+    }
+
+    @staticmethod
+    def get_value_rule(
+        prop: dict, function_format: Literal["pythonic", "json", "xml"] = "json"
+    ) -> str:
+        if "enum" in prop:
+            return EBNFComposer._handle_enum(prop, function_format)
+
+        if "type" in prop:
+            return EBNFComposer._handle_type(prop, function_format)
+
+        return function_format
+
+    @staticmethod
+    def _handle_enum(prop: dict, function_format: str) -> str:
+        """Handle enum properties by formatting each value according to type and format."""
+        enum_values = prop["enum"]
+        prop_type = prop.get("type", "string")
+
+        def format_enum_val(v: Any) -> str:
+            if prop_type == "boolean":
+                if function_format == "json" or function_format == "xml":
+                    return "true" if v else "false"
+                elif function_format == "pythonic":
+                    return "True" if v else "False"
+                else:
+                    return str(v)  # fallback
+
+            if prop_type == "string":
+                if function_format == "xml":
+                    return f'"{v}"'
+                else:  # json or pythonic
+                    return f'"\\"{v}\\""'  # escape quote-wrapped string
+
+            # All other types (number, integer, etc.)
+            return str(v)
+
+        formatted_values = [format_enum_val(v) for v in enum_values]
+        enum_rule = " | ".join(formatted_values)
+        return f"({enum_rule})" if len(formatted_values) > 1 else enum_rule
+
+    @staticmethod
+    def get_type_mapping(function_format: str) -> Dict[str, str]:
+        """Get the complete type mapping for a given format."""
+        mapping = EBNFComposer.BASE_TYPE_MAPPING.copy()
+        overrides = EBNFComposer.FORMAT_TYPE_OVERRIDES.get(function_format, {})
+        mapping.update({k: v for k, v in overrides.items() if v is not None})
+        return mapping
+
+    @staticmethod
+    def _handle_type(prop: dict, function_format: str) -> str:
+        """Handle type properties using the appropriate type mapping."""
+        prop_type = prop["type"]
+        type_mapping = EBNFComposer.get_type_mapping(function_format)
+
+        if isinstance(prop_type, list):
+            type_rules = [
+                type_mapping.get(single_type, function_format)
+                for single_type in prop_type
+            ]
+            return " | ".join(type_rules) if type_rules else function_format
+
+        return type_mapping.get(prop_type, function_format)
+
+    @staticmethod
+    def build_ebnf(
+        tools,
+        function_format: Literal["pythonic", "json", "xml"] = "json",
+        # Parameters for wrapping the entire sequence of tool calls
+        sequence_start_token: Optional[str] = None,
+        sequence_end_token: Optional[str] = None,
+        # Parameters for wrapping individual tool calls
+        individual_call_start_token: Optional[str] = None,
+        individual_call_end_token: Optional[str] = None,
+        # Parameter for separating multiple tool calls
+        tool_call_separator: Optional[str] = None,
+        call_rule_fmt: Optional[str] = None,
+        key_value_rule_fmt: Optional[str] = None,
+        key_value_separator: str = 'ws "," ws',
+    ):
+        """
+        Generalized EBNF builder for all detectors.
+        Args:
+            tools: List of Tool objects to generate EBNF grammar for
+            function_format: The format of function calls, either "pythonic" or "json"
+            sequence_start_token: Token that wraps the entire sequence of tool calls (start)
+            sequence_end_token: Token that wraps the entire sequence of tool calls (end)
+            individual_call_start_token: Token that wraps each individual tool call (start)
+            individual_call_end_token: Token that wraps each individual tool call (end)
+            tool_call_separator: The separator between multiple tool calls
+            call_rule_fmt: Optional custom format string for call_{name} rule. It should define each function call's format, with
+                the placeholders {name} for the function name and {arguments_rule} for the arguments rule. If None, a default
+                format based on function_format will be used.
+            key_value_rule_fmt: Optional custom format string for key-value pairs. It should define how each parameter is formatted,
+                with placeholders {key} for the parameter name and {valrule} for the value rule. If None, a default format
+                based on function_format will be used.
+            key_value_separator: Raw EBNF fragment inserted between key-value pairs.
+                This string is used verbatim (not auto-quoted). Pass:
+                - Quoted terminals when you need a literal token (e.g. '","' or '"\\n"').
+                - Raw/non-terminals when you need grammar tokens (e.g. 'ws "," ws').
+        """
+        # =================================================================
+        # Step 1: Determine the root tool calls rule
+        # =================================================================
+        # Handle a single function call
+        if individual_call_start_token and individual_call_end_token:
+            function_call_unit = f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
+        else:
+            function_call_unit = "function_call"
+
+        # Handle multiple function calls with separators
+        if tool_call_separator is not None:
+            base_pattern = f'{function_call_unit} ( "{tool_call_separator}" {function_call_unit} )*'
+        else:
+            # Assume only support single function call
+            base_pattern = function_call_unit
+
+        # Apply sequence-level wrapping if needed
+        if sequence_start_token and sequence_end_token:
+            root_rule = (
+                f'"{sequence_start_token}" {base_pattern} "{sequence_end_token}"'
+            )
+        else:
+            root_rule = base_pattern
+
+        # =================================================================
+        # Step 2: Build the header rules
+        # =================================================================
+        ebnf_lines = [
+            f"root ::= {root_rule}",
+            "function_call ::= "
+            + " | ".join([f"call_{tool.function.name}" for tool in tools]),
+        ]
+
+        # =================================================================
+        # Step 3: Set up formatting templates
+        # =================================================================
+        call_template = (
+            f"call_{{name}} ::= {call_rule_fmt}"
+            if call_rule_fmt
+            else EBNFComposer.CALL_RULE_MAP[function_format]
+        )
+        args_template = EBNFComposer.ARGUMENTS_RULE_MAP[function_format]
+        key_value_template = (
+            key_value_rule_fmt
+            if key_value_rule_fmt
+            else EBNFComposer.KEY_VALUE_RULE_MAP[function_format]
+        )
+
+        # =================================================================
+        # Step 4: Build rules for each tool
+        # =================================================================
+        for tool in tools:
+            tool_name = tool.function.name
+            params = tool.function.parameters or {}
+            properties = params.get("properties", {})
+            required_props = set(params.get("required", []))
+
+            # The generated pattern ensures:
+            # 1. Required properties appear first, joined by commas
+            # 2. Optional properties are wrapped with comma included: ( "," ( "prop" : value )? )?
+            # 3. For multiple optional properties, we allow flexible ordering:
+            #    - Each optional can be skipped entirely
+            #    - They can appear in any combination
+            #
+            # Example patterns generated:
+            # - One required, one optional:
+            #   "{" "location" ":" string ( "," ( "unit" ":" enum ) )? "}"
+            #   Allows: {"location": "Paris"} or {"location": "Paris", "unit": "celsius"}
+            #
+            # - Multiple optional properties with flexible ordering:
+            #   "{" "req" ":" string ( "," ( "opt1" ":" value ( "," "opt2" ":" value )? | "opt2" ":" value ) )? "}"
+            #   Allows: {"req": "x"}, {"req": "x", "opt1": "y"}, {"req": "x", "opt2": "z"},
+            #           {"req": "x", "opt1": "y", "opt2": "z"}
+            #
+            # - All optional properties with flexible ordering:
+            #   "{" ( "opt1" ":" value ( "," "opt2" ":" value )? | "opt2" ":" value )? "}"
+            #   Allows: {}, {"opt1": "x"}, {"opt2": "y"}, {"opt1": "x", "opt2": "y"}
+
+            prop_kv_pairs = {}
+            ordered_props = list(properties.keys())
+
+            for prop_name, prop_schema in properties.items():
+                value_rule = EBNFComposer.get_value_rule(prop_schema, function_format)
+                # Create key=value pair
+                pair = key_value_template.format(key=prop_name, valrule=value_rule)
+                prop_kv_pairs[prop_name] = pair
+
+            # Separate into required and optional while preserving order
+            required = [p for p in ordered_props if p in required_props]
+            optional = [p for p in ordered_props if p not in required_props]
+
+            # Build the combined rule
+            rule_parts = []
+
+            # Add required properties joined by commas
+            if required:
+                rule_parts.append(
+                    f" {key_value_separator} ".join(prop_kv_pairs[k] for k in required)
+                )
+
+            # Add optional properties with flexible ordering
+            if optional:
+                # Build alternatives where any optional property can appear first
+                opt_alternatives = []
+                for i in range(len(optional)):
+                    # Build pattern for optional[i] appearing first
+                    opt_parts = []
+                    for j in range(i, len(optional)):
+                        if j == i:
+                            opt_parts.append(prop_kv_pairs[optional[j]])
+                        else:
+                            opt_parts.append(
+                                f" ( {key_value_separator} {prop_kv_pairs[optional[j]]} )?"
+                            )
+                    opt_alternatives.append("".join(opt_parts))
+
+                # Wrap with appropriate comma handling based on whether we have required properties
+                if required:
+                    # Required properties exist, so optional group needs outer comma
+                    rule_parts.append(f" ( {key_value_separator} ( ")
+                    rule_parts.append(" | ".join(opt_alternatives))
+                    rule_parts.append(" ) )?")
+                else:
+                    # All properties are optional
+                    rule_parts.append("( ")
+                    rule_parts.append(" | ".join(opt_alternatives))
+                    rule_parts.append(" )?")
+
+            combined_args = "".join(rule_parts)
+            arguments_rule = args_template.format(arg_rules=combined_args)
+            arguments_rule = arguments_rule or '""'
+
+            # Add the function call rule and its arguments rule
+            ebnf_lines.append(
+                call_template.format(
+                    name=tool_name, arguments_rule=f"arguments_{tool_name}"
+                )
+            )
+            ebnf_lines.append(f"arguments_{tool_name} ::= {arguments_rule}")
+
+        # =================================================================
+        # Step 5: Add base grammar rules
+        # =================================================================
+        grammar_dict = {
+            "pythonic": EBNFComposer.pythonic_grammar_ebnf_str,
+            "json": EBNFComposer.json_grammar_ebnf_str,
+            "xml": EBNFComposer.xml_grammar_ebnf_str,
+        }
+        base_grammar = grammar_dict.get(
+            function_format, EBNFComposer.json_grammar_ebnf_str
+        )
+        ebnf_lines.append(base_grammar)
+
+        return "\n".join(ebnf_lines)
diff --git a/python/sglang/srt/function_call/function_call_parser.py b/python/sglang/srt/function_call/function_call_parser.py
new file mode 100644
index 000000000..18fe488e4
--- /dev/null
+++ b/python/sglang/srt/function_call/function_call_parser.py
@@ -0,0 +1,214 @@
+import logging
+from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union
+
+from sglang.srt.entrypoints.openai.protocol import (
+    StructuralTagResponseFormat,
+    StructuresResponseFormat,
+    Tool,
+    ToolChoice,
+)
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import ToolCallItem
+from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
+from sglang.srt.function_call.deepseekv31_detector import DeepSeekV31Detector
+from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
+from sglang.srt.function_call.kimik2_detector import KimiK2Detector
+from sglang.srt.function_call.llama32_detector import Llama32Detector
+from sglang.srt.function_call.mistral_detector import MistralDetector
+from sglang.srt.function_call.pythonic_detector import PythonicDetector
+from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
+from sglang.srt.function_call.qwen25_detector import Qwen25Detector
+from sglang.srt.function_call.step3_detector import Step3Detector
+
+logger = logging.getLogger(__name__)
+
+
+class FunctionCallParser:
+    """
+    Parser for function/tool calls in model outputs.
+
+    This class handles both streaming and non-streaming parsing of function calls using a detector.
+    In streaming scenarios, each time new_text is received, it calls detector.parse_streaming_increment
+    and returns the resulting normal_text and calls to the upper layer (or SSE).
+    """
+
+    ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
+        "llama3": Llama32Detector,
+        "qwen25": Qwen25Detector,
+        "mistral": MistralDetector,
+        "deepseekv3": DeepSeekV3Detector,
+        "deepseekv31": DeepSeekV31Detector,
+        "pythonic": PythonicDetector,
+        "kimi_k2": KimiK2Detector,
+        "qwen3_coder": Qwen3CoderDetector,
+        "glm45": Glm4MoeDetector,
+        "step3": Step3Detector,
+        "gpt-oss": GptOssDetector,
+    }
+
+    def __init__(self, tools: List[Tool], tool_call_parser: str):
+        detector: Type[BaseFormatDetector] = None
+        detector_class = self.ToolCallParserEnum.get(tool_call_parser)
+        if detector_class:
+            detector = detector_class()
+        else:
+            raise ValueError(f"Unsupported tool_call_parser: {tool_call_parser}")
+
+        self.detector = detector
+        self.tools = tools
+
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a tool call in the format supported by this parser.
+        This delegates to the detector's implementation.
+
+        Args:
+            text: The text to check for tool calls
+
+        Returns:
+            True if the text contains a tool call, False otherwise
+        """
+        return self.detector.has_tool_call(text)
+
+    def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
+        """
+        One-time parsing of the full text to extract tool calls.
+
+        Args:
+            full_text: The complete text to parse
+
+        Returns:
+            A tuple containing:
+            - The remaining text after parsing that was not consumed by the detector (can be treated as normal text)
+            - A list of tool calls parsed from the text
+        """
+        parsed_result = self.detector.detect_and_parse(full_text, self.tools)
+        tool_call_list = parsed_result.calls
+        if tool_call_list:
+            return parsed_result.normal_text, tool_call_list
+        else:
+            return full_text, []
+
+    def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, list[ToolCallItem]]:
+        """
+        Streaming incremental parsing of chunks of text as they arrive.
+
+        Args:
+            chunk_text: The new chunk of text to parse
+
+        Returns:
+            A tuple containing:
+            - The normal text that should be displayed to the user
+            - A list of tool calls parsed from the chunk
+        """
+        final_normal_text = ""
+        final_calls = []
+
+        sp_result = self.detector.parse_streaming_increment(chunk_text, self.tools)
+        if sp_result.normal_text:
+            final_normal_text = sp_result.normal_text
+        if sp_result.calls:
+            final_calls.extend(sp_result.calls)
+            final_normal_text = sp_result.normal_text
+
+        return final_normal_text, final_calls
+
+    def get_structure_tag(self) -> StructuralTagResponseFormat:
+        """
+        Generate a structural tag response format for all available tools.
+
+        This creates the necessary structural tags that guide the model's output format.
+        """
+        tool_structures: List[StructuresResponseFormat] = list()
+        tool_trigger_set: Set[str] = set()
+
+        get_structure_info = self.detector.structure_info()
+        for tool in self.tools:
+            function = tool.function
+            name = function.name
+            assert name is not None
+            info = get_structure_info(name)
+
+            # accept all if not strict, otherwise only accept the schema
+            schema = function.parameters if function.strict else {}
+
+            tool_structures.append(
+                StructuresResponseFormat(
+                    begin=info.begin,
+                    schema=schema,  # type: ignore
+                    end=info.end,
+                )
+            )
+            tool_trigger_set.add(info.trigger)
+
+        return StructuralTagResponseFormat(
+            type="structural_tag",
+            structures=tool_structures,
+            triggers=list(tool_trigger_set),
+        )
+
+    def get_structure_constraint(
+        self, tool_choice: Union[ToolChoice, Literal["auto", "required"]]
+    ) -> Optional[Tuple[str, Any]]:
+        """
+        Returns the appropriate structure constraint for tool calls based on the tool_choice.
+        The constraint is used to guide the model's output format.
+
+        Args:
+            tool_choice: The tool choice setting from the request
+
+        Returns:
+            A tuple of (constraint_type, constraint_value) to be added to sampling parameters,
+            or None if no constraint applies.
+        """
+        # NOTE: structural_tag only supports JSON-compatible content between the begin and end.
+        # It cannot parse or validate function call Pythonic or XML-ish syntax.
+        if (
+            self.detector.supports_structural_tag()
+            and tool_choice == "auto"
+            and any(tool.function.strict for tool in self.tools)
+        ):
+            strict_tag = self.get_structure_tag()
+            return ("structural_tag", strict_tag)
+        elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
+            ebnf = self.get_ebnf(tool_choice)
+            return ("ebnf", ebnf) if ebnf is not None else None
+
+    def get_ebnf(
+        self, tool_choice: Union[ToolChoice, Literal["required"]]
+    ) -> Optional[str]:
+        """
+        Get the EBNF grammar for the specified tool choice.
+
+        Args:
+            tool_choice: The tool choice specification
+
+        Returns:
+            EBNF grammar string, or None if no valid tools found
+
+        Note:
+            If a specific function is requested but not found in available tools,
+            logs a warning and falls back to using all available tools for backward compatibility.
+        """
+        filtered_tools = []
+        if isinstance(tool_choice, ToolChoice):
+            fn_name = tool_choice.function.name
+            filtered_tools = [t for t in self.tools if t.function.name == fn_name]
+
+            # Check if the requested function exists in available tools
+            if not filtered_tools:
+                available_functions = [t.function.name for t in self.tools]
+                logger.warning(
+                    f"Function '{fn_name}' not found in available tools. "
+                    f"Available functions: {available_functions}. "
+                    f"Skipping tool choice."
+                )
+
+                # TODO: Return a 400 error instead of warning when adapter supports proper error handling
+                # For now, fall back to return None
+                return None
+        else:
+            filtered_tools = self.tools
+
+        return self.detector.build_ebnf(filtered_tools)
diff --git a/python/sglang/srt/function_call/glm4_moe_detector.py b/python/sglang/srt/function_call/glm4_moe_detector.py
new file mode 100644
index 000000000..6e89fe0a1
--- /dev/null
+++ b/python/sglang/srt/function_call/glm4_moe_detector.py
@@ -0,0 +1,164 @@
+import ast
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+def get_argument_type(func_name: str, arg_key: str, defined_tools: list):
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    if arg_key not in tool.function.parameters["properties"]:
+        return None
+    return tool.function.parameters["properties"][arg_key].get("type", None)
+
+
+def parse_arguments(json_value):
+    try:
+        try:
+            parsed_value = json.loads(json_value)
+        except:
+            parsed_value = ast.literal_eval(json_value)
+        return parsed_value, True
+    except:
+        return json_value, False
+
+
+class Glm4MoeDetector(BaseFormatDetector):
+    """
+    Detector for GLM-4.5 models.
+    Assumes function call format:
+      <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.func_call_regex = r"<tool_call>.*?</tool_call>"
+        self.func_detail_regex = r"<tool_call>([^\n]*)\n(.*)</tool_call>"
+        self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a glm-4.5 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(1)
+                func_args = func_detail.group(2)
+                pairs = re.findall(
+                    r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>",
+                    func_args,
+                    re.DOTALL,
+                )
+                arguments = {}
+                for arg_key, arg_value in pairs:
+                    arg_key = arg_key.strip()
+                    arg_value = arg_value.strip()
+                    arg_type = get_argument_type(func_name, arg_key, tools)
+                    if arg_type != "string":
+                        arg_value, is_good_json = parse_arguments(arg_value)
+                    arguments[arg_key] = arg_value
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": arguments}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for GLM-4.5 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        start = current_text.find(self.bot_token)
+        if start == -1:
+            self._buffer = ""
+            if self.current_tool_id > 0:
+                current_text = ""
+            return StreamingParseResult(normal_text=current_text)
+        # find ensures we find the first self.eot_token so there will be at most one tool_call in current_text[:end+len(self.eot_token)
+        end = current_text.find(self.eot_token)
+        if end != -1:
+            # Initialize state if this is the first tool call
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+            # Ensure we have enough entries in our tracking arrays
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+            result = self.detect_and_parse(
+                current_text[: end + len(self.eot_token)], tools=tools
+            )
+            if result.calls:
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": result.calls[0].name,
+                    "arguments": json.loads(result.calls[0].parameters),
+                }
+                self.streamed_args_for_tool[self.current_tool_id] = result.calls[
+                    0
+                ].parameters
+                result.calls[0].tool_index = self.current_tool_id
+                self.current_tool_id += 1
+            self._buffer = current_text[end + len(self.eot_token) :]
+            return result
+        normal_text = current_text[:start]
+        self._buffer = current_text[start:]
+        return StreamingParseResult(normal_text=normal_text)
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.bot_token,
+            individual_call_end_token=self.eot_token,
+            tool_call_separator="\\n",
+            function_format="xml",
+            call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
+            key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
+            key_value_separator='"\\n"',
+        )
diff --git a/python/sglang/srt/function_call/gpt_oss_detector.py b/python/sglang/srt/function_call/gpt_oss_detector.py
new file mode 100644
index 000000000..7fe0a7dc8
--- /dev/null
+++ b/python/sglang/srt/function_call/gpt_oss_detector.py
@@ -0,0 +1,219 @@
+import json
+import logging
+import re
+from typing import List, Optional
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.parser.harmony_parser import HarmonyParser
+
+logger = logging.getLogger(__name__)
+
+
+class GptOssDetector(BaseFormatDetector):
+    """
+    Detector for T4-style function calls using HarmonyParser.
+
+    Handles tool calls in the format:
+    <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.harmony_parser = HarmonyParser()
+        self.bot_token = "<|start|>assistant<|channel|>commentary"
+        self.eot_token = "<|call|>"
+
+        # Pattern to extract function name and JSON from tool_call event content
+        self.tool_extract_pattern = re.compile(
+            r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
+            re.DOTALL,
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if text contains TypeScript-style function call markers."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse TypeScript-style function calls from complete text."""
+        if not self.has_tool_call(text):
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        # Parse with HarmonyParser
+        events = self.harmony_parser.parse(text)
+        # Flush buffer for complete parsing
+        events += self.harmony_parser.parse("")
+
+        tool_indices = self._get_tool_indices(tools)
+        calls = []
+        normal_parts = []
+        tool_index = 0
+
+        for event in events:
+            if event.event_type == "tool_call":
+                # Extract tool call from event content
+                tool_call = self._extract_tool_call_from_event(
+                    event.raw_text if event.raw_text else event.content,
+                    tool_indices,
+                    tool_index,
+                )
+                if tool_call:
+                    calls.append(tool_call)
+                    tool_index += 1
+            elif event.event_type == "normal":
+                normal_parts.append(event.content)
+            # Ignore reasoning events in function call context
+
+        normal_text = " ".join(normal_parts).strip()
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """Parse incremental streaming text for TypeScript-style function calls."""
+        self._buffer += new_text
+
+        # Always use HarmonyParser for parsing to ensure proper filtering
+        events = self.harmony_parser.parse(new_text)
+
+        # Quick check if we might have tool calls
+        if (
+            "<|channel|>commentary to=" not in self._buffer
+            and not self.current_tool_name_sent
+        ):
+            # No tool calls detected, check for final content
+            if (
+                "<|channel|>final" in self._buffer
+                or "assistantfinal" in self._buffer.lower()
+            ):
+                # Extract normal text from events
+                normal_text = "".join(
+                    [e.content for e in events if e.event_type == "normal"]
+                )
+                if normal_text:
+                    self._buffer = ""
+                    return StreamingParseResult(normal_text=normal_text, calls=[])
+
+            # For other content, extract normal text from events (with filtering applied)
+            normal_text = "".join(
+                [e.content for e in events if e.event_type == "normal"]
+            )
+            if normal_text or events:
+                self._buffer = ""
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+            else:
+                # No events processed, continue buffering
+                return StreamingParseResult(normal_text="", calls=[])
+
+        if not events:
+            # No complete events yet
+            return StreamingParseResult(normal_text="", calls=[])
+
+        # Initialize state if needed
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls = []
+        normal_text = ""
+
+        for event in events:
+            if event.event_type == "tool_call":
+                # We got a complete tool call from HarmonyParser
+                tool_call_info = self._extract_tool_call_from_event(
+                    event.raw_text if event.raw_text else event.content,
+                    self._tool_indices,
+                    self.current_tool_id if self.current_tool_id >= 0 else 0,
+                )
+
+                if tool_call_info:
+                    # Initialize state if first tool
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                        self.prev_tool_call_arr = []
+                        self.streamed_args_for_tool = [""]
+
+                    # Ensure arrays are large enough
+                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                        self.prev_tool_call_arr.append({})
+                    while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                        self.streamed_args_for_tool.append("")
+
+                    # Store tool call info
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": tool_call_info.name,
+                        "arguments": json.loads(tool_call_info.parameters),
+                    }
+
+                    # Emit the complete tool call at once
+                    # (Could be modified to emit name first, then args, if needed)
+                    calls.append(tool_call_info)
+
+                    # Mark as streamed
+                    self.streamed_args_for_tool[self.current_tool_id] = (
+                        tool_call_info.parameters
+                    )
+
+                    # Move to next tool
+                    self.current_tool_id += 1
+                    self.current_tool_name_sent = False
+
+            elif event.event_type == "normal":
+                normal_text += event.content
+
+        # Clear buffer since HarmonyParser handles buffering
+        self._buffer = ""
+
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def _extract_tool_call_from_event(
+        self, content: str, tool_indices: dict, tool_index: int
+    ) -> Optional[ToolCallItem]:
+        """
+        Extract tool call information from HarmonyParser event content.
+
+        Content format: "commentary to=functions.get_weather<|constrain|>json<|message|>{...}"
+        """
+        match = self.tool_extract_pattern.search(content)
+
+        if not match:
+            logger.debug(f"Could not extract tool call from: {content[:100]}")
+            return None
+
+        full_function_name = match.group(1)
+        json_content = match.group(2)
+
+        # Extract function name (last part after .)
+        function_name = (
+            full_function_name.split(".")[-1]
+            if "." in full_function_name
+            else full_function_name
+        )
+
+        # Check if tool exists
+        if function_name not in tool_indices:
+            logger.debug(f"Function {function_name} not in available tools")
+            return None
+
+        # Parse JSON arguments
+        try:
+            arguments = json.loads(json_content) if json_content.strip() else {}
+        except json.JSONDecodeError as e:
+            logger.debug(f"Failed to parse JSON arguments: {e}")
+            return None
+
+        return ToolCallItem(
+            tool_index=tool_index,
+            name=function_name,
+            parameters=json.dumps(arguments, ensure_ascii=False),
+        )
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError("structure_info not used with HarmonyParser")
+
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        raise NotImplementedError("build_ebnf not used with HarmonyParser")
diff --git a/python/sglang/srt/function_call/kimik2_detector.py b/python/sglang/srt/function_call/kimik2_detector.py
new file mode 100644
index 000000000..4f39433a6
--- /dev/null
+++ b/python/sglang/srt/function_call/kimik2_detector.py
@@ -0,0 +1,245 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+from sglang.srt.function_call.utils import _is_complete_json
+
+logger = logging.getLogger(__name__)
+
+
+class KimiK2Detector(BaseFormatDetector):
+    """
+    Detector for Kimi K2 model function call format.
+
+    Format Structure:
+    ```
+    <|tool_calls_section_begin|>
+    <|tool_call_begin|>functions.{func_name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|>
+    <|tool_calls_section_end|>
+    ```
+
+    Reference: https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        self.bot_token: str = "<|tool_calls_section_begin|>"
+        self.eot_token: str = "<|tool_calls_section_end|>"
+
+        self.tool_call_start_token: str = "<|tool_call_begin|>"
+        self.tool_call_end_token: str = "<|tool_call_end|>"
+
+        self.tool_call_regex = re.compile(
+            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*?\})\s*<\|tool_call_end\|>"
+        )
+
+        self.stream_tool_call_portion_regex = re.compile(
+            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*)"
+        )
+
+        self._last_arguments = ""
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a KimiK2 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=text, calls=[])
+        try:
+            # there are two possible captures - between tags, or between a
+            # tag and end-of-string so the result of
+            # findall is an array of tuples where one is a function call and
+            # the other is None
+            function_call_tuples = self.tool_call_regex.findall(text)
+
+            logger.debug("function_call_tuples: %s", function_call_tuples)
+
+            tool_calls = []
+            for match in function_call_tuples:
+                function_id, function_args = match
+                function_name = function_id.split(".")[1].split(":")[0]
+                function_idx = int(function_id.split(".")[1].split(":")[1])
+
+                logger.info(f"function_name {function_name}")
+
+                tool_calls.append(
+                    ToolCallItem(
+                        tool_index=function_idx,  # Use the call index in the response, not tool position
+                        name=function_name,
+                        parameters=function_args,
+                    )
+                )
+
+            content = text[: text.find(self.bot_token)]
+            return StreamingParseResult(normal_text=content, calls=tool_calls)
+
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for KimiK2 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call (either the start token or individual tool call)
+        has_tool_call = (
+            self.bot_token in current_text or self.tool_call_start_token in current_text
+        )
+
+        if not has_tool_call:
+            self._buffer = ""
+            for e_token in [self.eot_token, self.tool_call_end_token]:
+                if e_token in new_text:
+                    new_text = new_text.replace(e_token, "")
+            return StreamingParseResult(normal_text=new_text)
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            match = self.stream_tool_call_portion_regex.search(current_text)
+            if match:
+                function_id = match.group("tool_call_id")
+                function_args = match.group("function_arguments")
+
+                function_name = function_id.split(".")[1].split(":")[0]
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure we have enough entries in our tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=function_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    # Store the tool call info for serving layer completions endpoint
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": function_name,
+                        "arguments": {},
+                    }
+                else:
+                    argument_diff = (
+                        function_args[len(self._last_arguments) :]
+                        if function_args.startswith(self._last_arguments)
+                        else function_args
+                    )
+
+                    parsed_args_diff = argument_diff.split("<|tool_call_end|>", 1)[0]
+
+                    if parsed_args_diff:
+
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=parsed_args_diff,
+                            )
+                        )
+                        self._last_arguments += argument_diff
+                        self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ] += parsed_args_diff
+
+                    parsed_args = function_args.split("<|tool_call_end|>", 1)[0]
+                    if _is_complete_json(parsed_args):
+                        try:
+                            parsed_args = json.loads(parsed_args)
+                            self.prev_tool_call_arr[self.current_tool_id][
+                                "arguments"
+                            ] = parsed_args
+                        except json.JSONDecodeError:
+                            pass
+
+                        # Find the end of the current tool call and remove only that part from buffer
+                        tool_call_end_pattern = (
+                            r"<\|tool_call_begin\|>.*?<\|tool_call_end\|>"
+                        )
+                        match = re.search(
+                            tool_call_end_pattern, current_text, re.DOTALL
+                        )
+                        if match:
+                            # Remove the completed tool call from buffer, keep any remaining content
+                            self._buffer = current_text[match.end() :]
+                        else:
+                            self._buffer = ""
+
+                        result = StreamingParseResult(normal_text="", calls=calls)
+                        self.current_tool_id += 1
+                        self._last_arguments = ""
+                        self.current_tool_name_sent = False
+                        return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        """Return function that creates StructureInfo for guided generation."""
+
+        def get_info(name: str) -> StructureInfo:
+            return StructureInfo(
+                begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0<|tool_call_argument_begin|>",
+                end="<|tool_call_end|><|tool_calls_section_end|>",
+                trigger="<|tool_calls_section_begin|>",
+            )
+
+        return get_info
+
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build EBNF grammar for KimiK2 tool call format.
+
+        NOTE: The call_rule_fmt uses [0-9]+ for the function index to allow the grammar
+        to accept any numeric index (0, 1, 2, etc.) for proper sequential indexing in
+        multiple function call scenarios, while still maintaining the correct KimiK2
+        format structure for constrained generation.
+        """
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
+            tool_call_separator="",
+            call_rule_fmt='"<|tool_call_begin|>functions.{name}:"[0-9]+"<|tool_call_argument_begin|>"{arguments_rule}"<|tool_call_end|>"',
+            function_format="json",
+        )
diff --git a/python/sglang/srt/function_call/llama32_detector.py b/python/sglang/srt/function_call/llama32_detector.py
new file mode 100644
index 000000000..453bcbc9a
--- /dev/null
+++ b/python/sglang/srt/function_call/llama32_detector.py
@@ -0,0 +1,96 @@
+import json
+import logging
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+class Llama32Detector(BaseFormatDetector):
+    """
+    Detector for Llama 3.2 models with json tool call format.
+
+    Format Structure:
+    ```
+    <python_tag>{"name":"xxx", "arguments":{...}}
+    ```
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<|python_tag|>"
+        # NOTE: technically Llama3.2 doesn't support well with parallel tool calls
+        # They need specific prompt engineering to support parallel tool calls
+        # Here we use ';' as the separator, which might have compatibility issues
+        # if users define to use a different separator in their prompt
+        self.tool_call_separator = ";"
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Llama 3.2 format tool call."""
+        # depending on the prompt format the Llama model may or may not
+        # prefix the output with the <|python_tag|> token
+        return "<|python_tag|>" in text or text.startswith("{")
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse function calls from text, handling multiple JSON objects."""
+        if "<|python_tag|>" not in text and not text.startswith("{"):
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        if "<|python_tag|>" in text:
+            normal_text, action_text = text.split("<|python_tag|>", maxsplit=1)
+        else:
+            normal_text, action_text = "", text
+
+        decoder = json.JSONDecoder()
+        idx = 0
+        safe_idx = idx  # the index of the last valid JSON object
+        all_actions = []
+        action_text_len = len(action_text)
+        while idx < action_text_len:
+            try:
+                obj, end = decoder.raw_decode(action_text[idx:])
+                all_actions.append(obj)
+                idx += end + len(self.tool_call_separator)
+                safe_idx = idx
+            except json.JSONDecodeError as e:
+                # Find where next `{"name"` appears and try again
+                logger.warning(
+                    f"Failed to parse JSON part: {action_text[idx:]}, JSON parse error: {str(e)}"
+                )
+                next_obj_start = action_text.find('{"name":', idx + 1)
+                if next_obj_start == -1:
+                    break
+                idx = next_obj_start
+                continue
+
+        # Only process if we found valid JSON objects
+        calls = self.parse_base_json(all_actions, tools) if all_actions else []
+        # Use safe_idx to avoid idx containing the last part of an invalid JSON object
+        trailing_text = (
+            action_text[safe_idx:].strip() if safe_idx < action_text_len else ""
+        )
+        return StreamingParseResult(
+            normal_text=normal_text + trailing_text, calls=calls
+        )
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<|python_tag|>{"name":"' + name + '", "arguments":',
+            end="}",
+            trigger="<|python_tag|>",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            function_format="json",
+            tool_call_separator=self.tool_call_separator,
+        )
diff --git a/python/sglang/srt/function_call/mistral_detector.py b/python/sglang/srt/function_call/mistral_detector.py
new file mode 100644
index 000000000..49767fd53
--- /dev/null
+++ b/python/sglang/srt/function_call/mistral_detector.py
@@ -0,0 +1,139 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+class MistralDetector(BaseFormatDetector):
+    """
+    Detector for Mistral model function call format.
+
+    The Mistral format uses a simple bracket-delimited structure with JSON arrays
+    containing function call objects.
+
+    Format Structure:
+    ```
+    [TOOL_CALLS] [{"name": "function_name", "arguments": {json_args}}, ...]
+    ```
+
+    Reference: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3?chat_template=default
+    """
+
+    def __init__(self):
+        """
+        Initializes the detector with necessary state variables.
+        """
+        super().__init__()
+        self.bot_token = "[TOOL_CALLS] ["
+        self.eot_token = "]"
+        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
+        self.tool_call_separator = ", "
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Mistral format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Extract the JSON array part from [TOOL_CALLS] [...]
+        # Use bracket counting to properly handle nested brackets in JSON content
+        json_array_str = self._extract_json_array(text)
+        if not json_array_str:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        calls = []
+        try:
+            function_call_arr = json.loads(json_array_str)
+            # Handle both single object and array of objects
+            if not isinstance(function_call_arr, list):
+                function_call_arr = [function_call_arr]
+            calls = self.parse_base_json(function_call_arr, tools)
+        except json.JSONDecodeError as e:
+            logger.warning(
+                f"Failed to parse JSON part: {json_array_str}, JSON parse error: {str(e)}"
+            )
+
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def _extract_json_array(self, text: str) -> str:
+        """
+        Extract the JSON array part using bracket counting to handle nested brackets.
+
+        :param text: The complete text containing [TOOL_CALLS] [...]
+        :return: The JSON array string or None if not found
+        """
+        start_idx = text.find(self.bot_token)
+        if start_idx == -1:
+            return None
+
+        # Start from the opening bracket after [TOOL_CALLS]
+        json_start = (
+            start_idx + len(self.bot_token) - 1
+        )  # -1 to include the opening bracket
+        bracket_count = 0
+        in_string = False
+        escape_next = False
+
+        for i in range(json_start, len(text)):
+            char = text[i]
+
+            if escape_next:
+                escape_next = False
+                continue
+
+            if char == "\\":
+                escape_next = True
+                continue
+
+            if char == '"' and not escape_next:
+                in_string = not in_string
+                continue
+
+            if not in_string:
+                if char == "[":
+                    bracket_count += 1
+                elif char == "]":
+                    bracket_count -= 1
+                    if bracket_count == 0:
+                        return text[json_start : i + 1]
+
+        return None
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='[TOOL_CALLS] [{"name":"' + name + '", "arguments":',
+            end="}]",
+            trigger="[TOOL_CALLS]",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
+            function_format="json",
+            tool_call_separator=self.tool_call_separator,
+        )
diff --git a/python/sglang/srt/function_call/pythonic_detector.py b/python/sglang/srt/function_call/pythonic_detector.py
new file mode 100644
index 000000000..be183c6bf
--- /dev/null
+++ b/python/sglang/srt/function_call/pythonic_detector.py
@@ -0,0 +1,231 @@
+import ast
+import json
+import logging
+import re
+from typing import List, Optional
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+class PythonicDetector(BaseFormatDetector):
+    """
+    Detector for Llama-4 models with Pythonic tool call format.
+
+    The Pythonic format uses Python function call syntax within square brackets,
+    with arguments as Python literals rather than JSON.
+
+    Format Structure:
+    ```
+    [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+    ```
+
+    Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct?chat_template=default
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_regex = re.compile(
+            r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+            re.DOTALL,
+        )
+
+    @staticmethod
+    def _text_strip(text: str) -> str:
+        # Llama 4 model sometime will output <|python_start|> and <|python_end|> tokens
+        # remove those tokens
+        text = text.replace("<|python_start|>", "")
+        text = text.replace("<|python_end|>", "")
+        return text
+
+    def has_tool_call(self, text: str) -> bool:
+        return bool(self.tool_call_regex.search(self._text_strip(text.strip())))
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        # Try parsing the text as a Python list of function calls
+        text = text.strip()
+
+        # Remove unexpected <|python_start|> and <|python_end|> for llama4
+        text = self._text_strip(text)
+
+        match = self.tool_call_regex.search(text)
+        if match is None:
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        # Extract the tool call part and any text before/after it
+        tool_call_start = match.start()
+        tool_call_end = match.end()
+
+        normal_text_before = text[:tool_call_start] if tool_call_start > 0 else ""
+        tool_call_text = text[tool_call_start:tool_call_end]
+        normal_text_after = text[tool_call_end:] if tool_call_end < len(text) else ""
+
+        # Combine normal text
+        normal_text = normal_text_before + normal_text_after
+
+        try:
+            module = ast.parse(tool_call_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not (
+                isinstance(parsed, ast.List)
+                and all(isinstance(e, ast.Call) for e in parsed.elts)
+            ):
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+
+            calls = []
+            tool_indices = self._get_tool_indices(tools)
+            for call_index, call in enumerate(parsed.elts):
+                if not isinstance(call.func, ast.Name):
+                    continue
+                function_name = call.func.id
+                # Validate that the function exists in the tools
+                if function_name not in tool_indices:
+                    logger.warning(
+                        f"Model attempted to call undefined function: {function_name}"
+                    )
+                    continue
+                arguments = {}
+                for keyword in call.keywords:
+                    arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+                calls.append(
+                    ToolCallItem(
+                        tool_index=call_index,  # Use the call index in the response, not tool position
+                        name=function_name,
+                        parameters=json.dumps(arguments, ensure_ascii=False),
+                    )
+                )
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception:
+            logger.exception("Error in pythonic tool call parsing.")
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+    def _find_matching_bracket(self, buffer: str, start: int) -> int:
+        """
+        Find the matching closing bracket for the opening bracket at start position.
+        Properly handles nested brackets.
+
+        Args:
+            buffer: The text buffer to search in
+            start: Position of the opening bracket '['
+
+        Returns:
+            Position of the matching closing bracket ']', or -1 if not found
+        """
+        bracket_count = 0
+        for i in range(start, len(buffer)):
+            if buffer[i] == "[":
+                bracket_count += 1
+            elif buffer[i] == "]":
+                bracket_count -= 1
+                if bracket_count == 0:
+                    return i
+        return -1  # No matching bracket found
+
+    def _strip_and_split_buffer(self, buffer: str) -> tuple[str, str]:
+        """
+        Strip special tokens from buffer and split into safe_text and held_back_text.
+
+        Returns:
+            tuple of (safe_text_to_output, text_to_hold_in_buffer)
+        """
+        # Check if original buffer ends with a partial token at the end
+        special_tokens = ["<|python_start|>", "<|python_end|>"]
+
+        for token in special_tokens:
+            partial_length = self._ends_with_partial_token(buffer, token)
+            if partial_length > 0:
+                # Split buffer: safe part + held back partial token
+                safe_text = buffer[:-partial_length]
+                held_back = buffer[-partial_length:]
+                # Strip complete special tokens from safe part only
+                safe_text = self._text_strip(safe_text)
+                return safe_text, held_back
+
+        # No partial tokens found, strip complete tokens from entire buffer
+        safe_text = self._text_strip(buffer)
+        return safe_text, ""
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for pythonic tool calls.
+        Buffers input until a complete pythonic tool call (from [ to ]) is found,
+        then parses and emits any detected calls.
+        """
+        self._buffer += new_text
+
+        # Strip special tokens from entire buffer and handle partial tokens
+        stripped_buffer, held_back = self._strip_and_split_buffer(self._buffer)
+
+        start = stripped_buffer.find("[")
+
+        if start == -1:
+            # No tool call bracket found
+            self._buffer = held_back
+            return StreamingParseResult(normal_text=stripped_buffer)
+
+        normal_text = stripped_buffer[:start] if start > 0 else ""
+
+        end = self._find_matching_bracket(stripped_buffer, start)
+        if end != -1:
+            # Found complete tool call
+            call_text = stripped_buffer[start : end + 1]
+            result = self.detect_and_parse(call_text, tools)
+
+            # Update buffer with remaining text after tool call plus any held back text
+            remaining_text = stripped_buffer[end + 1 :] + held_back
+            self._buffer = remaining_text
+
+            # If we had normal text before the tool call, add it to the result
+            if normal_text:
+                result.normal_text = normal_text + (result.normal_text or "")
+
+            return result
+
+        # We have an opening bracket but no closing bracket yet
+        # Put back everything from the bracket onwards plus held back text
+        self._buffer = stripped_buffer[start:] + held_back
+
+        if normal_text:
+            return StreamingParseResult(normal_text=normal_text)
+
+        # Otherwise, we're still accumulating a potential tool call
+        return StreamingParseResult(normal_text="")
+
+    def _get_parameter_value(self, val):
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                k.value: self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        else:
+            raise ValueError("Tool call arguments must be literals")
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError
+
+    def build_ebnf(self, tools: List[Tool]) -> Optional[str]:
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token="[",
+            sequence_end_token="]",
+            tool_call_separator=",",
+            function_format="pythonic",
+        )
diff --git a/python/sglang/srt/function_call/qwen25_detector.py b/python/sglang/srt/function_call/qwen25_detector.py
new file mode 100644
index 000000000..40a65e5df
--- /dev/null
+++ b/python/sglang/srt/function_call/qwen25_detector.py
@@ -0,0 +1,130 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen25Detector(BaseFormatDetector):
+    """
+    Detector for Qwen 2.5 and Qwen 3 model function call format.
+
+    Format Structure:
+    ```
+    <tool_call>\n{"name":"func1", "arguments":{...}}\n</tool_call>\n<tool_call>\n{"name":"func2", "arguments":{...}}\n</tool_call>
+    ```
+
+    Key Components:
+    - Tool Call Tags: `<tool_call>` and `</tool_call>` wrap each individual call
+    - Function Call Object: JSON object with "name" and "arguments" fields
+
+    Reference: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct?chat_template=default
+    """
+
+    def __init__(self):
+        """
+        Initializes the detector with necessary state variables.
+        """
+        super().__init__()
+        self.bot_token = "<tool_call>\n"
+        self.eot_token = "\n</tool_call>"
+        self.tool_call_separator = "\n"
+        self._normal_text_buffer = ""  # Buffer for handling partial end tokens
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Qwen 2.5 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Find all <tool_call>\n...\n</tool_call> blocks
+        pattern = rf"{re.escape(self.bot_token)}(.*?){re.escape(self.eot_token)}"
+        match_result_list = re.findall(pattern, text, re.DOTALL)
+        calls = []
+        for match_result in match_result_list:
+            try:
+                parsed_call = json.loads(match_result.strip())
+                calls.extend(self.parse_base_json(parsed_call, tools))
+            except json.JSONDecodeError as e:
+                logger.warning(
+                    f"Failed to parse JSON part: {match_result}, JSON parse error: {str(e)}"
+                )
+                continue
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Qwen 2.5 tool calls.
+        Uses base class implementation with buffering to handle partial end tokens.
+        """
+        result = super().parse_streaming_increment(new_text, tools)
+
+        # Handle partial end tokens that are streamed character by character
+        if result.normal_text:
+            self._normal_text_buffer += result.normal_text
+
+            # Check if buffer contains complete end token (without leading newline)
+            end_token_without_newline = self.eot_token[1:]  # "</tool_call>"
+            if end_token_without_newline in self._normal_text_buffer:
+                cleaned_text = self._normal_text_buffer.replace(
+                    end_token_without_newline, ""
+                )
+                self._normal_text_buffer = ""
+                result.normal_text = cleaned_text
+            else:
+                # Check if buffer might contain partial end token at the end
+                partial_match_len = self._ends_with_partial_token(
+                    self._normal_text_buffer, end_token_without_newline
+                )
+
+                if partial_match_len:
+                    # Keep potential partial match in buffer, return the rest
+                    result.normal_text = self._normal_text_buffer[:-partial_match_len]
+                    self._normal_text_buffer = self._normal_text_buffer[
+                        -partial_match_len:
+                    ]
+                else:
+                    # No partial match, return all buffered text
+                    result.normal_text = self._normal_text_buffer
+                    self._normal_text_buffer = ""
+
+        return result
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<tool_call>\n{"name":"' + name + '", "arguments":',
+            end="}\n</tool_call>",
+            trigger="<tool_call>",
+        )
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.bot_token.replace("\n", "\\n"),
+            individual_call_end_token=self.eot_token.replace("\n", "\\n"),
+            tool_call_separator="\\n",
+            function_format="json",
+        )
diff --git a/python/sglang/srt/function_call/qwen3_coder_detector.py b/python/sglang/srt/function_call/qwen3_coder_detector.py
new file mode 100644
index 000000000..9bd3c7c24
--- /dev/null
+++ b/python/sglang/srt/function_call/qwen3_coder_detector.py
@@ -0,0 +1,362 @@
+import ast
+import html
+import json
+import logging
+import re
+from typing import Any, Dict, List, Tuple
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+def _safe_val(raw: str) -> Any:
+    raw = html.unescape(raw.strip())
+    try:
+        return json.loads(raw)
+    except Exception:
+        try:
+            return ast.literal_eval(raw)
+        except Exception:
+            return raw
+
+
+class Qwen3CoderDetector(BaseFormatDetector):
+    """
+    Detector for Qwen 3 models.
+    Assumes function call format:
+        <tool_call>
+        <function=execute_bash>
+        <parameter=command>
+        pwd && ls
+        </parameter>
+        </function>
+        </tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+        self.tool_call_prefix: str = "<function="
+        self.tool_call_regex = re.compile(
+            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
+        )
+        self.tool_call_function_regex = re.compile(
+            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
+        )
+        self.tool_call_parameter_regex = re.compile(
+            r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", re.DOTALL
+        )
+        self._buf: str = ""
+
+        # Streaming state variables
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._streamed_parameters: Dict[str, str] = (
+            {}
+        )  # Track what parameter content we've streamed
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+
+    def has_tool_call(self, text: str) -> bool:
+        return self.tool_call_start_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        normal, calls = self._extract(text, tools)
+        return StreamingParseResult(normal_text=normal, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        self._buf += new_text
+        normal = ""
+        calls: List[ToolCallItem] = []
+
+        # Build tool indices for validation
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        while True:
+            # If we're not in a tool call and don't see a start token, return normal text
+            if not self._in_tool_call and self.tool_call_start_token not in self._buf:
+                normal += self._buf
+                self._buf = ""
+                break
+
+            # Look for tool call start
+            if not self._in_tool_call:
+                s = self._buf.find(self.tool_call_start_token)
+                if s == -1:
+                    normal += self._buf
+                    self._buf = ""
+                    break
+
+                normal += self._buf[:s]
+                self._buf = self._buf[s:]
+
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                self._streamed_parameters = {}
+
+                # Remove the start token
+                self._buf = self._buf[len(self.tool_call_start_token) :]
+                continue
+
+            # We're in a tool call, try to parse function name if not sent yet
+            if not self._function_name_sent:
+                # Look for function name pattern: <function=name>
+                function_match = re.search(r"<function=([^>]+)>", self._buf)
+                if function_match:
+                    function_name = function_match.group(1).strip()
+
+                    # Validate function name
+                    if function_name in self._tool_indices:
+                        self._current_function_name = function_name
+                        self._function_name_sent = True
+
+                        # Initialize tool call tracking
+                        if self.current_tool_id == -1:
+                            self.current_tool_id = 0
+
+                        # Ensure tracking arrays are large enough
+                        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                            self.prev_tool_call_arr.append({})
+                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                            self.streamed_args_for_tool.append("")
+
+                        # Store tool call info
+                        self.prev_tool_call_arr[self.current_tool_id] = {
+                            "name": function_name,
+                            "arguments": {},
+                        }
+
+                        # Send tool name with empty parameters
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=function_name,
+                                parameters="",
+                            )
+                        )
+
+                        # Remove the processed function declaration
+                        self._buf = self._buf[function_match.end() :]
+                        continue
+                    else:
+                        # Invalid function name, reset state
+                        logger.warning(f"Invalid function name: {function_name}")
+                        self._reset_streaming_state()
+                        normal += self._buf
+                        self._buf = ""
+                        break
+                else:
+                    # Function name not complete yet, wait for more text
+                    break
+
+            # Parse parameters incrementally
+            if self._function_name_sent:
+                # Process parameters and get any calls to emit
+                parameter_calls = self._parse_and_stream_parameters(self._buf)
+                calls.extend(parameter_calls)
+
+                # Check if tool call is complete
+                if self.tool_call_end_token in self._buf:
+                    end_pos = self._buf.find(self.tool_call_end_token)
+
+                    # Add closing brace to complete the JSON object
+                    current_streamed = self.streamed_args_for_tool[self.current_tool_id]
+                    if current_streamed:
+                        # Count opening and closing braces to check if JSON is complete
+                        open_braces = current_streamed.count("{")
+                        close_braces = current_streamed.count("}")
+                        if open_braces > close_braces:
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters="}",
+                                )
+                            )
+                            self.streamed_args_for_tool[self.current_tool_id] = (
+                                current_streamed + "}"
+                            )
+
+                    # Complete the tool call
+                    self._buf = self._buf[end_pos + len(self.tool_call_end_token) :]
+                    self._reset_streaming_state()
+                    self.current_tool_id += 1
+                    continue
+                else:
+                    # Tool call not complete yet, wait for more text
+                    break
+
+        return StreamingParseResult(normal_text=normal, calls=calls)
+
+    def _parse_and_stream_parameters(self, text_to_parse: str) -> List[ToolCallItem]:
+        """
+        Parse complete parameter blocks from text and return any tool call items to emit.
+
+        This method:
+        1. Finds all complete <parameter> blocks
+        2. Parses them into a dictionary
+        3. Compares with current parameters and generates diff if needed
+        4. Updates internal state
+
+        Args:
+            text_to_parse: The text to search for parameter blocks
+
+        Returns:
+            List of ToolCallItem objects to emit (may be empty)
+        """
+        calls: List[ToolCallItem] = []
+
+        # Find all complete parameter patterns
+        param_matches = list(
+            re.finditer(
+                r"<parameter=([^>]+)>(.*?)</parameter>", text_to_parse, re.DOTALL
+            )
+        )
+
+        # Build new parameters dictionary
+        new_params = {}
+        for match in param_matches:
+            param_name = match.group(1).strip()
+            param_value = match.group(2)
+            new_params[param_name] = _safe_val(param_value)
+
+        # Calculate parameter diff to stream with proper incremental JSON building
+        if new_params != self._current_parameters:
+            previous_args_json = self.streamed_args_for_tool[self.current_tool_id]
+
+            # Build incremental JSON properly
+            if not self._current_parameters:
+                # First parameter(s) - start JSON object but don't close it yet
+                items = []
+                for key, value in new_params.items():
+                    items.append(
+                        f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                    )
+                json_fragment = "{" + ", ".join(items)
+
+                calls.append(
+                    ToolCallItem(
+                        tool_index=self.current_tool_id,
+                        name=None,
+                        parameters=json_fragment,
+                    )
+                )
+                self.streamed_args_for_tool[self.current_tool_id] = json_fragment
+
+            else:
+                # Additional parameters - add them incrementally
+                new_keys = set(new_params.keys()) - set(self._current_parameters.keys())
+                if new_keys:
+                    # Build the continuation part (no closing brace yet)
+                    continuation_parts = []
+                    for key in new_keys:
+                        value = new_params[key]
+                        continuation_parts.append(
+                            f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                        )
+
+                    json_fragment = ", " + ", ".join(continuation_parts)
+
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=None,
+                            parameters=json_fragment,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] = (
+                        previous_args_json + json_fragment
+                    )
+
+            # Update current state
+            self._current_parameters = new_params
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+
+        return calls
+
+    def _reset_streaming_state(self):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = False
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+        self._streamed_parameters = {}
+        self.current_tool_name_sent = False
+
+    def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
+        normal_parts: List[str] = []
+        calls: List[ToolCallItem] = []
+        cursor = 0
+        while True:
+            s = text.find(self.tool_call_start_token, cursor)
+            if s == -1:
+                normal_parts.append(text[cursor:])
+                break
+            normal_parts.append(text[cursor:s])
+            e = text.find(self.tool_call_end_token, s)
+            if e == -1:
+                normal_parts.append(text[s:])
+                break
+            block = text[s : e + len(self.tool_call_end_token)]
+            cursor = e + len(self.tool_call_end_token)
+            calls.extend(self._parse_block(block, tools))
+        return "".join(normal_parts), calls
+
+    def _parse_block(self, block: str, tools: List[Tool]) -> List[ToolCallItem]:
+        res: List[ToolCallItem] = []
+        for m in self.tool_call_function_regex.findall(block):
+            txt = m[0] if m[0] else m[1]
+            if ">" not in txt:
+                continue
+            idx = txt.index(">")
+            fname = txt[:idx].strip()
+            body = txt[idx + 1 :]
+            params: Dict[str, Any] = {}
+            for pm in self.tool_call_parameter_regex.findall(body):
+                ptxt = pm[0] if pm[0] else pm[1]
+                if ">" not in ptxt:
+                    continue
+                pidx = ptxt.index(">")
+                pname = ptxt[:pidx].strip()
+                pval = ptxt[pidx + 1 :].lstrip("\n").rstrip("\n")
+                params[pname] = _safe_val(pval)
+            raw = {"name": fname, "arguments": params}
+            try:
+                # TODO: fix idx in function call, the index for a function
+                # call will always be -1 in parse_base_json
+                res.extend(self.parse_base_json(raw, tools))
+            except Exception:
+                logger.warning("invalid tool call for %s dropped", fname)
+        return res
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.tool_call_start_token.replace("\n", "\\n"),
+            individual_call_end_token=self.tool_call_end_token.replace("\n", "\\n"),
+            tool_call_separator="\\n",
+            function_format="xml",
+            call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
+            key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
+            key_value_separator='"\\n"',
+        )
diff --git a/python/sglang/srt/function_call/step3_detector.py b/python/sglang/srt/function_call/step3_detector.py
new file mode 100644
index 000000000..b46f4544f
--- /dev/null
+++ b/python/sglang/srt/function_call/step3_detector.py
@@ -0,0 +1,436 @@
+import ast
+import json
+import logging
+import re
+from typing import Any, Dict, List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+def get_argument_type(func_name: str, arg_key: str, defined_tools: List[Tool]) -> str:
+    """Get the expected type for a function argument from tool schema."""
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    parameters = tool.function.parameters or {}
+    properties = parameters.get("properties", {})
+    if arg_key not in properties:
+        return None
+    return properties[arg_key].get("type", None)
+
+
+def parse_arguments(value: str) -> tuple[Any, bool]:
+    """Parse a string value to appropriate type. Returns (parsed_value, success)."""
+    try:
+        try:
+            parsed_value = json.loads(value)
+        except:
+            parsed_value = ast.literal_eval(value)
+        return parsed_value, True
+    except:
+        return value, False
+
+
+class Step3Detector(BaseFormatDetector):
+    """
+    Detector for Step3 model function call format.
+
+    The Step3 format uses special Unicode tokens to delimit function calls
+    with steptml XML format for invocations.
+
+    Format Structure:
+    ```
+    <｜tool_calls_begin｜>
+    <｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="function_name">
+    <steptml:parameter name="param1">value1</steptml:parameter>
+    <steptml:parameter name="param2">value2</steptml:parameter>
+    </steptml:invoke><｜tool_call_end｜>
+    <｜tool_calls_end｜>
+    ```
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜tool_calls_begin｜>"
+        self.eot_token = "<｜tool_calls_end｜>"
+        self.tool_call_begin = "<｜tool_call_begin｜>"
+        self.tool_call_end = "<｜tool_call_end｜>"
+        self.tool_sep = "<｜tool_sep｜>"
+
+        # Regex for parsing steptml invocations
+        self.invoke_regex = re.compile(
+            r'<steptml:invoke name="([^"]+)">(.+?)</steptml:invoke>', re.DOTALL
+        )
+        self.param_regex = re.compile(
+            r'<steptml:parameter name="([^"]+)">([^<]*)</steptml:parameter>', re.DOTALL
+        )
+
+        # Streaming state variables
+        self._in_tool_block: bool = False
+        self._tool_block_finished: bool = False
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Step3 format tool call."""
+        return self.bot_token in text
+
+    def _parse_steptml_invoke(
+        self, text: str, tools: List[Tool] = None
+    ) -> tuple[str, dict]:
+        """Parse steptml invoke format to extract function name and parameters."""
+        invoke_match = self.invoke_regex.search(text)
+        if not invoke_match:
+            return None, {}
+
+        func_name = invoke_match.group(1)
+        params_text = invoke_match.group(2)
+
+        params = {}
+        for param_match in self.param_regex.finditer(params_text):
+            param_name = param_match.group(1)
+            param_value = param_match.group(2).strip()
+
+            # If tools provided, use schema-aware parsing
+            if tools:
+                arg_type = get_argument_type(func_name, param_name, tools)
+                if arg_type and arg_type != "string":
+                    parsed_value, _ = parse_arguments(param_value)
+                    params[param_name] = parsed_value
+                else:
+                    params[param_name] = param_value
+            else:
+                # Fallback to generic parsing if no tools provided
+                parsed_value, _ = parse_arguments(param_value)
+                params[param_name] = parsed_value
+
+        return func_name, params
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        """
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        try:
+            pre_text, rest = text.split(self.bot_token, 1)
+
+            # If no end token, return everything as normal text
+            if self.eot_token not in rest:
+                return StreamingParseResult(normal_text=text, calls=[])
+
+            tool_section, post_text = rest.split(self.eot_token, 1)
+
+            # Find all individual tool calls using regex
+            calls = []
+            tool_call_pattern = (
+                f"{re.escape(self.tool_call_begin)}(.*?){re.escape(self.tool_call_end)}"
+            )
+
+            for match in re.finditer(tool_call_pattern, tool_section, re.DOTALL):
+                call_content = match.group(1)
+
+                # Check if it's a function call
+                if self.tool_sep not in call_content:
+                    continue
+
+                type_part, invoke_part = call_content.split(self.tool_sep, 1)
+                if type_part.strip() != "function":
+                    continue
+
+                func_name, params = self._parse_steptml_invoke(invoke_part, tools)
+                if func_name:
+                    # Use parse_base_json to create the ToolCallItem
+                    action = {"name": func_name, "arguments": params}
+                    calls.extend(self.parse_base_json(action, tools))
+
+            # Combine pre and post text
+            normal_text = pre_text + post_text
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # Return the original text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Step3 format.
+        """
+        self._buffer += new_text
+
+        # Build tool indices for validation
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        # If we've finished the tool block, everything is normal text
+        if self._tool_block_finished:
+            normal_text = self._buffer
+            self._buffer = ""
+            return StreamingParseResult(normal_text=normal_text)
+
+        # Check if tool block hasn't started yet
+        if not self._in_tool_block:
+            if self.bot_token in self._buffer:
+                idx = self._buffer.find(self.bot_token)
+                normal_text = self._buffer[:idx]
+                self._buffer = self._buffer[idx + len(self.bot_token) :]
+                self._in_tool_block = True
+                return StreamingParseResult(normal_text=normal_text)
+            else:
+                # Check if we might have a partial bot_token
+                partial_len = self._ends_with_partial_token(
+                    self._buffer, self.bot_token
+                )
+                if partial_len:
+                    return StreamingParseResult()  # Wait for more text
+                else:
+                    normal_text = self._buffer
+                    self._buffer = ""
+                    return StreamingParseResult(normal_text=normal_text)
+
+        # We're inside the tool block
+        calls: List[ToolCallItem] = []
+
+        # Check if tool block is ending
+        if self.eot_token in self._buffer:
+            idx = self._buffer.find(self.eot_token)
+
+            # If we're in the middle of a tool call, we need to handle it
+            if self._in_tool_call:
+                # The buffer before eot_token might contain the end of the current tool call
+                before_eot = self._buffer[:idx]
+                if self.tool_call_end in before_eot:
+                    # Parse this final tool call
+                    result = self._parse_partial_tool_call(tools)
+                    calls.extend(result.calls)
+                else:
+                    # Incomplete tool call - log warning
+                    logger.warning("Tool block ended with incomplete tool call")
+
+            remaining = self._buffer[idx + len(self.eot_token) :]
+            self._buffer = ""
+            self._tool_block_finished = True
+
+            # Reset any partial tool call state
+            self._reset_streaming_state()
+
+            return StreamingParseResult(normal_text=remaining, calls=calls)
+
+        # Check if we're in a tool call or need to start one
+        if not self._in_tool_call:
+            if self.tool_call_begin in self._buffer:
+                idx = self._buffer.find(self.tool_call_begin)
+                # Remove any content before tool call begin (shouldn't happen but be safe)
+                self._buffer = self._buffer[idx + len(self.tool_call_begin) :]
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                # Fall through to parse the partial tool call
+            else:
+                # Wait for tool call to begin
+                return StreamingParseResult()
+
+        # Parse partial tool call
+        if self._in_tool_call:
+            return self._parse_partial_tool_call(tools)
+
+        return StreamingParseResult()
+
+    def _parse_partial_tool_call(self, tools: List[Tool]) -> StreamingParseResult:
+        """Parse partial tool call for streaming scenarios."""
+        calls = []
+
+        # Check if we have tool_sep (means we're past the type declaration)
+        if self.tool_sep not in self._buffer:
+            return StreamingParseResult(calls=calls)  # Wait for more text
+
+        type_part, invoke_part = self._buffer.split(self.tool_sep, 1)
+        if type_part.strip() != "function":
+            # Invalid tool type, skip this tool call
+            self._reset_streaming_state()
+            return StreamingParseResult(calls=calls)
+
+        # Try to extract function name if not sent yet
+        if not self._function_name_sent:
+            name_match = re.search(r'<steptml:invoke name="([^"]+)">', invoke_part)
+            if name_match:
+                func_name = name_match.group(1)
+
+                # Validate function name
+                if func_name in self._tool_indices:
+                    self._current_function_name = func_name
+                    self._function_name_sent = True
+
+                    # Initialize tool tracking
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+
+                    # Ensure tracking arrays are large enough
+                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                        self.prev_tool_call_arr.append({})
+                    while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                        self.streamed_args_for_tool.append("")
+
+                    # Store tool call info
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+
+                    # Send tool name with empty parameters
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                else:
+                    # Invalid function name
+                    logger.warning(f"Invalid function name: {func_name}")
+                    self._reset_streaming_state()
+                    return StreamingParseResult(calls=calls)
+            else:
+                # Function name not complete yet
+                return StreamingParseResult(calls=calls)
+
+        # Parse parameters incrementally
+        if self._function_name_sent:
+            # Extract all complete parameters
+            new_params = {}
+            for param_match in self.param_regex.finditer(invoke_part):
+                param_name = param_match.group(1)
+                param_value = param_match.group(2).strip()
+
+                # Use schema-aware parsing
+                arg_type = get_argument_type(
+                    self._current_function_name, param_name, tools
+                )
+                if arg_type and arg_type != "string":
+                    parsed_value, _ = parse_arguments(param_value)
+                    new_params[param_name] = parsed_value
+                else:
+                    new_params[param_name] = param_value
+
+            # Check if we have new parameters to stream
+            if new_params != self._current_parameters:
+                # Build the JSON content without the closing brace for streaming
+                if not self._current_parameters:
+                    # First parameters - send opening brace and content
+                    params_content = json.dumps(new_params, ensure_ascii=False)
+                    if len(params_content) > 2:  # More than just "{}"
+                        # Send everything except the closing brace
+                        diff = params_content[:-1]
+                    else:
+                        diff = "{"
+                else:
+                    # Subsequent parameters - calculate the incremental diff
+                    old_json = json.dumps(self._current_parameters, ensure_ascii=False)
+                    new_json = json.dumps(new_params, ensure_ascii=False)
+
+                    # Remove closing braces for comparison
+                    old_without_brace = old_json[:-1]
+                    new_without_brace = new_json[:-1]
+
+                    # The new content should extend the old content
+                    if new_without_brace.startswith(old_without_brace):
+                        diff = new_without_brace[len(old_without_brace) :]
+                    else:
+                        # Parameters changed in unexpected way - shouldn't happen in normal streaming
+                        diff = ""
+
+                if diff:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            parameters=diff,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += diff
+
+                # Update current state
+                self._current_parameters = new_params
+                self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+
+            # Check if tool call is complete
+            if self.tool_call_end in self._buffer:
+                # Send closing brace if we've sent any parameters
+                if self.streamed_args_for_tool[self.current_tool_id]:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            parameters="}",
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += "}"
+
+                # Find the end position
+                end_idx = self._buffer.find(self.tool_call_end)
+                # Remove the processed tool call from buffer
+                self._buffer = self._buffer[end_idx + len(self.tool_call_end) :]
+
+                # Reset state for next tool call
+                self._reset_streaming_state()
+                self.current_tool_id += 1
+
+        return StreamingParseResult(calls=calls)
+
+    def _reset_streaming_state(self):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = False
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+
+    def supports_structural_tag(self) -> bool:
+        """Return True if this detector supports structural tag format."""
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build EBNF grammar for Step3 tool call format.
+        """
+        # Custom call rule for steptml format
+        call_rule_fmt = (
+            '"function" "<｜tool_sep｜>" "<steptml:invoke name=\\"{name}\\">" '
+            '{arguments_rule} "</steptml:invoke>"'
+        )
+
+        # Custom key-value rule for steptml parameters
+        key_value_rule_fmt = (
+            '"<steptml:parameter name=\\"{key}\\">" {valrule} "</steptml:parameter>"'
+        )
+
+        return EBNFComposer.build_ebnf(
+            tools,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
+            individual_call_start_token=self.tool_call_begin,
+            individual_call_end_token=self.tool_call_end,
+            tool_call_separator="",
+            function_format="xml",
+            call_rule_fmt=call_rule_fmt,
+            key_value_rule_fmt=key_value_rule_fmt,
+            key_value_separator="",
+        )
diff --git a/python/sglang/srt/function_call/utils.py b/python/sglang/srt/function_call/utils.py
new file mode 100644
index 000000000..c4da456f3
--- /dev/null
+++ b/python/sglang/srt/function_call/utils.py
@@ -0,0 +1,52 @@
+import json
+from json import JSONDecodeError, JSONDecoder
+from typing import Any, Tuple
+
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+
+
+def _find_common_prefix(s1: str, s2: str) -> str:
+    prefix = ""
+    min_length = min(len(s1), len(s2))
+    for i in range(0, min_length):
+        if s1[i] == s2[i]:
+            prefix += s1[i]
+        else:
+            break
+    return prefix
+
+
+def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
+    """
+    Parse incomplete or partial JSON strings commonly encountered during streaming.
+
+    Args:
+        input_str (str): The potentially incomplete JSON string to parse.
+        flags (Allow): Bitwise flags controlling what types of partial data are allowed.
+            Common flags include:
+            - Allow.STR: Allow partial strings (e.g., '"hello wo' -> 'hello wo')
+            - Allow.OBJ: Allow partial objects (e.g., '{"key":' -> {'key': None})
+            - Allow.ARR: Allow partial arrays (e.g., '[1, 2,' -> [1, 2])
+            - Allow.ALL: Allow all types of partial data
+
+    Returns:
+        Tuple[Any, int]: A tuple containing:
+            - parsed_object: The Python object parsed from the JSON
+            - consumed_length: Number of characters consumed from input_str
+    """
+    try:
+        return (partial_json_parser.loads(input_str, flags), len(input_str))
+    except JSONDecodeError as e:
+        if "Extra data" in e.msg:
+            dec = JSONDecoder()
+            return dec.raw_decode(input_str)
+        raise
+
+
+def _is_complete_json(input_str: str) -> bool:
+    try:
+        json.loads(input_str)
+        return True
+    except JSONDecodeError:
+        return False
diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py
new file mode 100644
index 000000000..d7dcf8904
--- /dev/null
+++ b/python/sglang/srt/hf_transformers_utils.py
@@ -0,0 +1,442 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for Huggingface Transformers."""
+
+import contextlib
+import json
+import os
+import warnings
+from pathlib import Path
+from typing import Any, Dict, Optional, Type, Union
+
+import torch
+from huggingface_hub import snapshot_download
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    GenerationConfig,
+    PretrainedConfig,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PreTrainedTokenizerFast,
+)
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+
+from sglang.srt.configs import (
+    ChatGLMConfig,
+    DbrxConfig,
+    DeepseekVL2Config,
+    ExaoneConfig,
+    KimiVLConfig,
+    LongcatFlashConfig,
+    MultiModalityConfig,
+    Qwen3NextConfig,
+    Step3VLConfig,
+)
+from sglang.srt.configs.internvl import InternVLChatConfig
+from sglang.srt.connector import create_remote_connector
+from sglang.srt.utils import is_remote_url, logger, lru_cache_frozenset
+
+_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
+    ChatGLMConfig.model_type: ChatGLMConfig,
+    DbrxConfig.model_type: DbrxConfig,
+    ExaoneConfig.model_type: ExaoneConfig,
+    DeepseekVL2Config.model_type: DeepseekVL2Config,
+    MultiModalityConfig.model_type: MultiModalityConfig,
+    KimiVLConfig.model_type: KimiVLConfig,
+    InternVLChatConfig.model_type: InternVLChatConfig,
+    Step3VLConfig.model_type: Step3VLConfig,
+    LongcatFlashConfig.model_type: LongcatFlashConfig,
+    Qwen3NextConfig.model_type: Qwen3NextConfig,
+}
+
+for name, cls in _CONFIG_REGISTRY.items():
+    with contextlib.suppress(ValueError):
+        AutoConfig.register(name, cls)
+
+
+def download_from_hf(
+    model_path: str,
+    allow_patterns: Optional[Union[str, list]] = None,
+):
+    if os.path.exists(model_path):
+        return model_path
+
+    if not allow_patterns:
+        allow_patterns = ["*.json", "*.bin", "*.model"]
+
+    return snapshot_download(model_path, allow_patterns=allow_patterns)
+
+
+def get_hf_text_config(config: PretrainedConfig):
+    """Get the "sub" config relevant to llm for multi modal models.
+    No op for pure text models.
+    """
+    if config.architectures is not None:
+        class_name = config.architectures[0]
+        if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
+            # We support non-hf version of llava models, so we do not want to
+            # read the wrong values from the unused default text_config.
+            # NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
+            # `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
+            setattr(config, "torch_dtype", torch.float16)
+            return config
+
+    if hasattr(config, "text_config"):
+        # The code operates under the assumption that text_config should have
+        # `num_attention_heads` (among others). Assert here to fail early
+        # if transformers config doesn't align with this assumption.
+        assert hasattr(config.text_config, "num_attention_heads")
+        return config.text_config
+    if hasattr(config, "language_config"):
+        return config.language_config
+    if hasattr(config, "thinker_config"):
+        # qwen2.5 omni
+        thinker_config = config.thinker_config
+        if hasattr(thinker_config, "text_config"):
+            setattr(
+                thinker_config.text_config,
+                "torch_dtype",
+                getattr(thinker_config, "torch_dtype", None),
+            )
+            return thinker_config.text_config
+        return thinker_config
+    else:
+        return config
+
+
+@lru_cache_frozenset(maxsize=32)
+def get_config(
+    model: str,
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+    model_override_args: Optional[dict] = None,
+    **kwargs,
+):
+    is_gguf = check_gguf_file(model)
+    if is_gguf:
+        kwargs["gguf_file"] = model
+        model = Path(model).parent
+
+    if is_remote_url(model):
+        # BaseConnector implements __del__() to clean up the local dir.
+        # Since config files need to exist all the time, so we DO NOT use
+        # with statement to avoid closing the client.
+        client = create_remote_connector(model)
+        client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
+        model = client.get_local_dir()
+
+    config = AutoConfig.from_pretrained(
+        model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
+    )
+    if (
+        config.architectures is not None
+        and config.architectures[0] == "Phi4MMForCausalLM"
+    ):
+        # Phi4MMForCausalLM uses a hard-coded vision_config. See:
+        # https://github.com/vllm-project/vllm/blob/6071e989df1531b59ef35568f83f7351afb0b51e/vllm/model_executor/models/phi4mm.py#L71
+        # We set it here to support cases where num_attention_heads is not divisible by the TP size.
+        from transformers import SiglipVisionConfig
+
+        vision_config = {
+            "hidden_size": 1152,
+            "image_size": 448,
+            "intermediate_size": 4304,
+            "model_type": "siglip_vision_model",
+            "num_attention_heads": 16,
+            "num_hidden_layers": 26,  # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
+            "patch_size": 14,
+        }
+        config.vision_config = SiglipVisionConfig(**vision_config)
+    text_config = get_hf_text_config(config=config)
+
+    if isinstance(model, str) and text_config is not None:
+        for key, val in text_config.__dict__.items():
+            if not hasattr(config, key) and getattr(text_config, key, None) is not None:
+                setattr(config, key, val)
+
+    if config.model_type in _CONFIG_REGISTRY:
+        config_class = _CONFIG_REGISTRY[config.model_type]
+        config = config_class.from_pretrained(model, revision=revision)
+        # NOTE(HandH1998): Qwen2VL requires `_name_or_path` attribute in `config`.
+        setattr(config, "_name_or_path", model)
+
+    if isinstance(model, str) and config.model_type == "internvl_chat":
+        for key, val in config.llm_config.__dict__.items():
+            if not hasattr(config, key):
+                setattr(config, key, val)
+
+    if config.model_type == "multi_modality":
+        config.update({"architectures": ["MultiModalityCausalLM"]})
+
+    if model_override_args:
+        config.update(model_override_args)
+
+    # Special architecture mapping check for GGUF models
+    if is_gguf:
+        if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
+            raise RuntimeError(f"Can't get gguf config for {config.model_type}.")
+        model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
+        config.update({"architectures": [model_type]})
+
+    return config
+
+
+@lru_cache_frozenset(maxsize=32)
+def get_generation_config(
+    model: str,
+    trust_remote_code: bool,
+    revision: Optional[str] = None,
+    **kwargs,
+):
+    try:
+        return GenerationConfig.from_pretrained(
+            model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
+        )
+    except OSError as e:
+        return None
+
+
+# Qwen-1M related
+def get_sparse_attention_config(
+    model: str,
+    sparse_attention_config_filename: str = "sparse_attention_config.json",
+) -> Dict[str, Any]:
+    is_local = os.path.isdir(model)
+    if not is_local:
+        # Download the config files.
+        model = download_from_hf(model, allow_patterns=["*.json"])
+
+    config_file = os.path.join(model, sparse_attention_config_filename)
+    if not os.path.exists(config_file):
+        return {}
+
+    # Load the sparse attention config.
+    with open(config_file) as f:
+        config = json.load(f)
+    return config
+
+
+# Models don't use the same configuration key for determining the maximum
+# context length.  Store them here so we can sanely check them.
+# NOTE: The ordering here is important. Some models have two of these and we
+# have a preference for which value gets used.
+CONTEXT_LENGTH_KEYS = [
+    "max_sequence_length",
+    "seq_length",
+    "max_seq_len",
+    "model_max_length",
+    "max_position_embeddings",
+]
+
+
+def get_context_length(config):
+    """Get the context length of a model from a huggingface model configs."""
+    text_config = config
+    rope_scaling = getattr(text_config, "rope_scaling", None)
+    if rope_scaling:
+        rope_scaling_factor = rope_scaling.get("factor", 1)
+        if "original_max_position_embeddings" in rope_scaling:
+            rope_scaling_factor = 1
+        if rope_scaling.get("rope_type", None) == "llama3":
+            rope_scaling_factor = 1
+    else:
+        rope_scaling_factor = 1
+
+    for key in CONTEXT_LENGTH_KEYS:
+        val = getattr(text_config, key, None)
+        if val is not None:
+            return int(rope_scaling_factor * val)
+    return 2048
+
+
+# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
+_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"
+
+
+def get_tokenizer(
+    tokenizer_name: str,
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    tokenizer_revision: Optional[str] = None,
+    **kwargs,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    """Gets a tokenizer for the given model name via Huggingface."""
+    if tokenizer_name.endswith(".json"):
+        from sglang.srt.tokenizer.tiktoken_tokenizer import TiktokenTokenizer
+
+        return TiktokenTokenizer(tokenizer_name)
+
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+
+    # TODO(Xinyuan): Remove this once we have a proper tokenizer for Devstral
+    if tokenizer_name == "mistralai/Devstral-Small-2505":
+        tokenizer_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+
+    is_gguf = check_gguf_file(tokenizer_name)
+    if is_gguf:
+        kwargs["gguf_file"] = tokenizer_name
+        tokenizer_name = Path(tokenizer_name).parent
+
+    if is_remote_url(tokenizer_name):
+        # BaseConnector implements __del__() to clean up the local dir.
+        # Since config files need to exist all the time, so we DO NOT use
+        # with statement to avoid closing the client.
+        client = create_remote_connector(tokenizer_name)
+        client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
+        tokenizer_name = client.get_local_dir()
+
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            tokenizer_revision=tokenizer_revision,
+            clean_up_tokenization_spaces=False,
+            **kwargs,
+        )
+    except TypeError as e:
+        # The LLaMA tokenizer causes a protobuf error in some environments.
+        err_msg = (
+            "Failed to load the tokenizer. If you are using a LLaMA V1 model "
+            f"consider using '{_FAST_LLAMA_TOKENIZER}' instead of the "
+            "original tokenizer."
+        )
+        raise RuntimeError(err_msg) from e
+    except ValueError as e:
+        # If the error pertains to the tokenizer class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        if not trust_remote_code and (
+            "does not exist or is not currently imported." in str(e)
+            or "requires you to execute the tokenizer file" in str(e)
+        ):
+            err_msg = (
+                "Failed to load the tokenizer. If the tokenizer is a custom "
+                "tokenizer not yet available in the HuggingFace transformers "
+                "library, consider setting `trust_remote_code=True` in LLM "
+                "or using the `--trust-remote-code` flag in the CLI."
+            )
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        warnings.warn(
+            "Using a slow tokenizer. This might cause a significant "
+            "slowdown. Consider using a fast tokenizer instead."
+        )
+
+    attach_additional_stop_token_ids(tokenizer)
+    return tokenizer
+
+
+# Some models doesn't have an available processor, e.g.: InternVL
+def get_tokenizer_from_processor(processor):
+    if isinstance(processor, PreTrainedTokenizerBase):
+        return processor
+    return processor.tokenizer
+
+
+def get_processor(
+    tokenizer_name: str,
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    tokenizer_revision: Optional[str] = None,
+    use_fast: Optional[bool] = True,
+    **kwargs,
+):
+    # pop 'revision' from kwargs if present.
+    revision = kwargs.pop("revision", tokenizer_revision)
+
+    config = AutoConfig.from_pretrained(
+        tokenizer_name,
+        trust_remote_code=trust_remote_code,
+        revision=revision,
+        **kwargs,
+    )
+
+    # fix: for Qwen2-VL model, inject default 'size' if not provided.
+    if config.model_type in {"qwen2_vl"}:
+        if "size" not in kwargs:
+            kwargs["size"] = {"shortest_edge": 3136, "longest_edge": 1003520}
+
+    if config.model_type not in {"llava", "clip"}:
+        kwargs["use_fast"] = use_fast
+    try:
+        if "InternVL3_5" in tokenizer_name:
+            processor = AutoTokenizer.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs,
+            )
+        else:
+            processor = AutoProcessor.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs,
+            )
+
+    except ValueError as e:
+        error_message = str(e)
+        if "does not have a slow version" in error_message:
+            logger.info(
+                f"Processor {tokenizer_name} does not have a slow version. Automatically use fast version"
+            )
+            kwargs["use_fast"] = True
+            processor = AutoProcessor.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs,
+            )
+        else:
+            raise e
+    tokenizer = get_tokenizer_from_processor(processor)
+
+    attach_additional_stop_token_ids(tokenizer)
+    return processor
+
+
+def attach_additional_stop_token_ids(tokenizer):
+    # Special handling for stop token <|eom_id|> generated by llama 3 tool use.
+    if "<|eom_id|>" in tokenizer.get_added_vocab():
+        tokenizer.additional_stop_token_ids = set(
+            [tokenizer.get_added_vocab()["<|eom_id|>"]]
+        )
+    else:
+        tokenizer.additional_stop_token_ids = None
+
+
+def check_gguf_file(model: Union[str, os.PathLike]) -> bool:
+    """Check if the file is a GGUF model."""
+    model = Path(model)
+    if not model.is_file():
+        return False
+    elif model.suffix == ".gguf":
+        return True
+
+    with open(model, "rb") as f:
+        header = f.read(4)
+    return header == b"GGUF"
diff --git a/python/sglang/srt/host_shared_memory.py b/python/sglang/srt/host_shared_memory.py
new file mode 100644
index 000000000..c599527f9
--- /dev/null
+++ b/python/sglang/srt/host_shared_memory.py
@@ -0,0 +1,83 @@
+import logging
+import os
+from dataclasses import dataclass
+from multiprocessing import shared_memory
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.distributed.naive_distributed import get_naive_distributed
+from sglang.srt.utils import check_cuda_result
+
+logger = logging.getLogger(__name__)
+
+
+class HostSharedMemoryManager:
+    def __init__(self, base_name: str):
+        self._base_name = Path(base_name)
+        self._operation_index = 0
+        self._records: List[_Record] = []
+
+    def malloc(self, *, shape, dtype):
+        meta_tensor = torch.empty(size=shape, dtype=dtype, device="meta")
+        raw = self._malloc_raw(num_bytes=meta_tensor.nbytes)
+        return raw.view(dtype).view(*shape)
+
+    def _malloc_raw(self, *, num_bytes: int) -> torch.Tensor:
+        import cuda.bindings.runtime as cuda_rt
+
+        self._operation_index += 1
+        shm_name = f"{self._base_name}_op{self._operation_index}"
+
+        # TODO handle dispose
+        if get_naive_distributed().get_rank() == 0:
+            shm = shared_memory.SharedMemory(name=shm_name, create=True, size=num_bytes)
+
+        get_naive_distributed().barrier()
+
+        if get_naive_distributed().get_rank() != 0:
+            shm = shared_memory.SharedMemory(name=shm_name)
+
+        np_array = np.ndarray((num_bytes,), dtype=np.uint8, buffer=shm.buf)
+        tensor = torch.from_numpy(np_array)
+
+        check_cuda_result(
+            cuda_rt.cudaHostRegister(
+                tensor.data_ptr(), num_bytes, cuda_rt.cudaHostRegisterPortable
+            )
+        )
+
+        get_naive_distributed().barrier()
+
+        self._records.append(
+            _Record(
+                shm=shm,
+                np_array=np_array,
+                tensor=tensor,
+            )
+        )
+        return tensor
+
+
+@dataclass
+class _Record:
+    shm: shared_memory.SharedMemory
+    np_array: np.ndarray
+    tensor: torch.Tensor
+
+
+# Can have multi instances if needed
+_instance: Optional[HostSharedMemoryManager] = None
+
+
+def get_host_shared_memory_manager():
+    assert _instance is not None
+    return _instance
+
+
+def set_host_shared_memory_manager(instance: HostSharedMemoryManager):
+    global _instance
+    assert _instance is None
+    _instance = instance
diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py
new file mode 100644
index 000000000..3d973393e
--- /dev/null
+++ b/python/sglang/srt/layers/activation.py
@@ -0,0 +1,272 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fused operators for activation layers."""
+
+import logging
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from sglang.srt.custom_op import CustomOp
+from sglang.srt.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    is_xpu,
+    set_weight_attrs,
+)
+from sglang.utils import resolve_obj_by_qualname
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_hip = is_hip()
+_is_xpu = is_xpu()
+
+if _is_cuda or _is_xpu:
+    from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+elif _is_hip:
+    from sgl_kernel import gelu_and_mul, gelu_quick, gelu_tanh_and_mul, silu_and_mul
+
+if is_npu():
+    import torch_npu
+
+logger = logging.getLogger(__name__)
+
+
+class SiluAndMul(CustomOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.silu(x[..., :d]) * x[..., d:]
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
+
+    def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
+        if _is_cpu_amx_available:
+            out = torch.ops.sgl_kernel.silu_and_mul_cpu(x)
+            return out
+        else:
+            return self.forward_native(x)
+
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch_npu.npu_swiglu(x)
+        return out
+
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
+
+
+class GeluAndMul(CustomOp):
+    def __init__(self, approximate="tanh"):
+        super().__init__()
+        self.approximate = approximate
+
+    def _forward_impl(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        if self.approximate == "tanh":
+            gelu_tanh_and_mul(x, out)
+        elif self.approximate == "none":
+            gelu_and_mul(x, out)
+        else:
+            raise RuntimeError("GeluAndMul only support tanh or none")
+        return out
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
+
+    def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
+        if _is_cpu_amx_available and self.approximate == "tanh":
+            return torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu(x)
+        elif _is_cpu_amx_available and self.approximate == "none":
+            return torch.ops.sgl_kernel.gelu_and_mul_cpu(x)
+        else:
+            return self.forward_native(x)
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        return self._forward_impl(x)
+
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        return self._forward_impl(x)
+
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        y_npu, gelu_npu = torch_npu.npu_geglu(
+            x,
+            dim=-1,
+            approximate=1 if self.approximate == "tanh" else 0,
+            activate_left=True,
+        )
+        return y_npu
+
+
+class NewGELU(CustomOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        c = math.sqrt(2.0 / math.pi)
+        return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0))))
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        # TODO: Implement the CUDA kernel for NewGELU in sgl-kernel
+        return self.forward_native(x)
+
+
+class ReLU2(nn.Module):
+    """
+    Applies the squared Rectified Linear Unit function.
+    y = max(0, x)^2
+    """
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.relu(x)
+        return x * x
+
+
+class QuickGELU(CustomOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.sigmoid(1.702 * x)
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        return self.forward_native(x)
+
+    def forward_hip(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty(x.shape, dtype=x.dtype, device=x.device)
+        gelu_quick(x, out)
+        return out
+
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        return torch_npu.npu_fast_gelu(x)
+
+
+class ScaledActivation(nn.Module):
+    """An activation function with post-scale parameters.
+
+    This is used for some quantization methods like AWQ.
+    """
+
+    def __init__(
+        self,
+        act_module: nn.Module,
+        intermediate_size: int,
+        input_is_parallel: bool = True,
+        params_dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+        self.act = act_module
+        self.input_is_parallel = input_is_parallel
+        if input_is_parallel:
+            tp_size = get_tensor_model_parallel_world_size()
+            intermediate_size_per_partition = divide(intermediate_size, tp_size)
+        else:
+            intermediate_size_per_partition = intermediate_size
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.scales = nn.Parameter(
+            torch.empty(intermediate_size_per_partition, dtype=params_dtype)
+        )
+        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.act(x) / self.scales
+
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        if self.input_is_parallel:
+            tp_rank = get_tensor_model_parallel_rank()
+            shard_size = param_data.shape[0]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+_ACTIVATION_REGISTRY = {
+    "gelu": nn.GELU(),
+    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
+    "gelu_new": NewGELU(),
+    "relu2": ReLU2(),
+}
+
+
+def get_act_fn(
+    act_fn_name: str,
+    quant_config: Optional[QuantizationConfig] = None,
+    intermediate_size: Optional[int] = None,
+    input_is_parallel: bool = True,
+    params_dtype: Optional[torch.dtype] = None,
+) -> nn.Module:
+    """Get an activation function by name."""
+    act_fn_name = act_fn_name.lower()
+    if act_fn_name not in _ACTIVATION_REGISTRY:
+        raise ValueError(f"Activation function {act_fn_name!r} is not supported.")
+
+    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
+    if quant_config is not None and act_fn_name in quant_config.get_scaled_act_names():
+        if intermediate_size is None:
+            raise ValueError(
+                "intermediate_size must be specified for scaled "
+                "activation functions."
+            )
+        return ScaledActivation(
+            act_fn, intermediate_size, input_is_parallel, params_dtype
+        )
+    return act_fn
+
+
+def get_cross_encoder_activation_function(config: PretrainedConfig):
+    if (
+        hasattr(config, "sbert_ce_default_activation_function")
+        and config.sbert_ce_default_activation_function is not None
+    ):
+
+        function_name = config.sbert_ce_default_activation_function
+        assert function_name.startswith("torch.nn.modules."), (
+            "Loading of activation functions is restricted to "
+            "torch.nn.modules for security reasons"
+        )
+        return resolve_obj_by_qualname(function_name)()
+    else:
+        # adapt bge-reranker
+        return nn.Identity()
+
+
+if not (
+    _is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip or _is_xpu
+):
+    logger.info(
+        "sgl-kernel is not available on Non-NV, Non-AMD platforms or Non-AMX CPUs. Fallback to other kernel libraries."
+    )
+    from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul
diff --git a/python/sglang/srt/layers/amx_utils.py b/python/sglang/srt/layers/amx_utils.py
new file mode 100644
index 000000000..df2a05ba5
--- /dev/null
+++ b/python/sglang/srt/layers/amx_utils.py
@@ -0,0 +1,86 @@
+import logging
+
+import torch
+
+from sglang.srt.utils import cpu_has_amx_support
+
+logger = logging.getLogger(__name__)
+
+
+def amx_process_weight_after_loading(weight):
+    if weight.device != torch.device("cpu"):
+        return weight
+    if not cpu_has_amx_support():
+        return weight
+
+    return torch.ops.sgl_kernel.convert_weight_packed(weight)
+
+
+# TODO: currently gemm kernel has the below requirements:
+# OC % TILE_N == 0, where TILE_N = 16
+# IC % TILE_K == 0, where TILE_K = 32
+def dim_is_supported(weight):
+    TILE_N = 16
+    TILE_K = 32
+    ndim = weight.ndim
+    OC = weight.size(1) if ndim == 3 else weight.size(0)
+    IC = weight.size(2) if ndim == 3 else weight.size(1)
+    return OC % TILE_N == 0 and IC % TILE_K == 0
+
+
+def _amx_process_weight_after_loading(
+    module, weight_names, transpose_dims=None
+) -> None:
+    # Pack weight for get better performance on CPU
+    devices = {getattr(module, weight_name).device for weight_name in weight_names}
+    assert len(devices) == 1, f"Expects all weights to be on the same device"
+    device = devices.pop()
+
+    if transpose_dims:
+        assert len(weight_names) == len(
+            transpose_dims
+        ), "len(weight_names) should be equal to len(transpose_dims)"
+
+    for i, weight_name in enumerate(weight_names):
+        weight_tensor = getattr(module, weight_name)
+
+        if transpose_dims and transpose_dims[i]:
+            weight_tensor = weight_tensor.transpose(*transpose_dims[i])
+
+        # We don't pack weight or use intel amx backend if any weight of this module has unsupported dim.
+        if not dim_is_supported(weight_tensor):
+            logger.warning(
+                f"Unsupported dimension for prepacking for weight '{weight_name}' with shape {weight_tensor.shape} in {module}. "
+                f"The derived (OC, IC) dimensions must be divisible by (16, 32). "
+            )
+            module.use_intel_amx_backend = False
+            return
+
+        packed_weight = torch.nn.Parameter(
+            amx_process_weight_after_loading(weight_tensor),
+            requires_grad=False,
+        )
+        packed_weight.__dict__ = weight_tensor.__dict__
+        setattr(module, weight_name, packed_weight)
+
+    module.use_intel_amx_backend = (
+        device == torch.device("cpu") and cpu_has_amx_support()
+    )
+
+    if (
+        module.use_intel_amx_backend
+        and hasattr(module, "bias")
+        and module.bias is not None
+    ):
+        module.bias = torch.nn.Parameter(module.bias.data.float(), requires_grad=False)
+
+
+class PackWeightMethod:
+    def __init__(self, weight_names, transpose_dims=None):
+        self.weight_names = weight_names
+        self.transpose_dims = transpose_dims
+
+    def process_weights_after_loading(self, module) -> None:
+        _amx_process_weight_after_loading(
+            module, self.weight_names, self.transpose_dims
+        )
diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py
new file mode 100644
index 000000000..188d772c7
--- /dev/null
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -0,0 +1,1179 @@
+from __future__ import annotations
+
+"""
+end to end attention solution with aiter kernels
+"""
+
+import math
+import os
+from dataclasses import dataclass
+from enum import Enum, auto
+from functools import partial
+from typing import TYPE_CHECKING, List, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.global_config import global_config
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
+
+try:
+    from aiter import (
+        flash_attn_varlen_func,
+        mha_batch_prefill_func,
+        paged_attention_ragged,
+    )
+    from aiter.mla import mla_decode_fwd, mla_prefill_fwd
+except ImportError:
+    print(
+        "aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device."
+    )
+
+from sglang.srt.configs.model_config import AttentionArch
+
+
+class WrapperDispatch(Enum):
+    SLIDING_WINDOW = auto()
+    CROSS_ATTENTION = auto()
+
+
+@dataclass
+class ForwardMetadata:
+    kv_indptr: torch.Tensor
+    kv_indices: torch.Tensor
+    qo_indptr: torch.Tensor
+    kv_last_page_len: torch.Tensor
+    max_q_len: int
+    max_kv_len: Optional[int]
+
+
+global_workspace_buffer = None
+
+_AITER_PARTITION_SIZE_ROCM = 256
+
+
+class AiterAttnBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__()
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.triton_ops.extend_attention import (
+            extend_attention_fwd,
+        )
+
+        self.extend_attention_fwd = torch.compiler.disable(extend_attention_fwd)
+
+        self.device = model_runner.device
+        self.is_multimodal = model_runner.model_config.is_multimodal
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]
+        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+
+        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+
+        # Parse constants
+        self.max_context_len = model_runner.model_config.context_len
+        self.skip_prefill = skip_prefill
+
+        max_bs = model_runner.req_to_token_pool.size
+
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        self.kv_last_page_len = torch.ones(
+            (max_bs,), dtype=torch.int32, device=model_runner.device
+        )
+        self.qo_indptr = torch.zeros(
+            (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+        )
+
+        # Create prefill indices updater
+        if not skip_prefill:
+            self.indices_updater_prefill = AiterIndicesUpdaterPrefill(
+                model_runner, self
+            )
+            if self.use_mla:
+                self.mla_indices_updater_prefill = AiterMlaIndicesUpdaterPrefill(
+                    model_runner, self
+                )
+
+        # aiter kernel related initialization
+        self.max_num_partitions = (
+            self.max_context_len + _AITER_PARTITION_SIZE_ROCM - 1
+        ) // _AITER_PARTITION_SIZE_ROCM
+
+        nbyes_per_qo_elem = torch.finfo(torch.float32).bits // 8
+
+        if not self.use_mla:
+            self.workspace_buffer = torch.empty(
+                (max_bs * self.num_head * self.max_num_partitions * self.head_dim)
+                * nbyes_per_qo_elem
+                + 2 * (max_bs * self.num_head * self.max_num_partitions) * 4,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+        self.scale = float(1.0 / (self.head_dim**0.5))
+        self.k_scale = self.v_scale = torch.tensor([1.0], dtype=torch.float32).to(
+            self.device
+        )
+
+        self.logits_soft_cap = 0.0
+
+        self.forward_metadata: ForwardMetadata = None
+
+        if self.use_mla:
+            self.qo_indptr_ = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+            self.enable_dp_attention = is_dp_attention_enabled()
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for triton attention backend."""
+
+        bs = forward_batch.batch_size
+        kv_indptr = self.kv_indptr
+        spec_info = forward_batch.spec_info
+        qo_indptr = None
+        kv_last_page_len = None
+        max_q_len = None
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    forward_batch.seq_lens_sum, dtype=torch.int32, device=self.device
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+                bs = kv_indptr.shape[0] - 1
+
+            if self.use_mla:
+                qo_indptr = self.qo_indptr_[: bs + 1]
+                qo_indptr[1 : bs + 1] = torch.cumsum(self.kv_last_page_len[:bs], dim=0)
+                kv_last_page_len = self.kv_last_page_len[:bs]
+                max_q_len = 1
+
+            self.forward_metadata = ForwardMetadata(
+                kv_indptr,
+                kv_indices,
+                qo_indptr,
+                kv_last_page_len,
+                max_q_len,
+                None,
+            )
+
+        elif forward_batch.forward_mode.is_draft_extend():
+            if self.use_mla:
+                kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                    spec_info.generate_attn_arg_prefill(
+                        forward_batch.req_pool_indices,
+                        forward_batch.seq_lens,
+                        forward_batch.seq_lens_sum,
+                        self.req_to_token,
+                    )
+                )
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    # self.mla_indices_updater_prefill.kv_last_page_len,
+                    self.kv_last_page_len[:bs],
+                    max(forward_batch.extend_seq_lens_cpu),
+                    forward_batch.seq_lens_cpu.max().item(),
+                )
+            else:
+                self.indices_updater_prefill.update(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    forward_batch.seq_lens_sum,
+                    prefix_lens=None,
+                    encoder_lens=forward_batch.encoder_lens,
+                    spec_info=forward_batch.spec_info,
+                )
+                self.forward_metadata = ForwardMetadata(
+                    self.indices_updater_prefill.kv_indptr,
+                    self.indices_updater_prefill.kv_indices,
+                    None,
+                    None,
+                    self.indices_updater_prefill.max_q_len,
+                    self.indices_updater_prefill.max_kv_len,
+                )
+        elif forward_batch.forward_mode.is_target_verify():
+            if self.use_mla:
+                draft_num = spec_info.draft_token_num
+                kv_lens = forward_batch.seq_lens + draft_num
+                kv_lens_sum = forward_batch.seq_lens_sum + draft_num * bs
+                device = forward_batch.seq_lens.device
+
+                qo_indptr = torch.arange(
+                    0,
+                    (1 + bs) * draft_num,
+                    step=draft_num,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(kv_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    kv_lens_sum,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    kv_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    # self.mla_indices_updater_prefill.kv_last_page_len,
+                    self.kv_last_page_len[:bs],
+                    draft_num,
+                    None,
+                )
+            else:
+                self.indices_updater_prefill.update(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    forward_batch.seq_lens_sum,
+                    prefix_lens=None,
+                    encoder_lens=forward_batch.encoder_lens,
+                    spec_info=forward_batch.spec_info,
+                )
+                self.forward_metadata = ForwardMetadata(
+                    self.indices_updater_prefill.kv_indptr,
+                    self.indices_updater_prefill.kv_indices,
+                    None,
+                    None,
+                    self.indices_updater_prefill.max_q_len,
+                    self.indices_updater_prefill.max_kv_len,
+                )
+        else:
+            prefix_lens = forward_batch.extend_prefix_lens
+
+            if self.is_multimodal:
+                extend_no_prefix = False
+            else:
+                extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+            if self.use_mla:
+                self.mla_indices_updater_prefill.update(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    forward_batch.seq_lens_sum,
+                    forward_batch.extend_seq_lens,
+                    forward_batch.extend_seq_lens.max().item(),
+                    forward_batch.seq_lens.max().item(),
+                    spec_info=None,
+                )
+
+                kv_indices = self.mla_indices_updater_prefill.kv_indices
+
+                self.forward_metadata = ForwardMetadata(
+                    self.mla_indices_updater_prefill.kv_indptr,
+                    kv_indices,
+                    self.mla_indices_updater_prefill.qo_indptr,
+                    self.kv_last_page_len[:bs],
+                    self.mla_indices_updater_prefill.max_q_len,
+                    self.mla_indices_updater_prefill.max_kv_len,
+                )
+            else:
+                self.indices_updater_prefill.update(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    forward_batch.seq_lens_sum,
+                    prefix_lens,
+                    encoder_lens=forward_batch.encoder_lens,
+                    spec_info=None,
+                )
+                self.forward_metadata = ForwardMetadata(
+                    self.indices_updater_prefill.kv_indptr,
+                    self.indices_updater_prefill.kv_indices,
+                    None,
+                    None,
+                    self.indices_updater_prefill.max_q_len,
+                    self.indices_updater_prefill.max_kv_len,
+                )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        self.cuda_graph_kv_last_page_len = torch.ones(max_bs, dtype=torch.int)
+        if kv_indices_buf is None:
+            self.cuda_graph_kv_indices = torch.zeros(
+                (max_bs * self.max_context_len),
+                dtype=torch.int32,
+                device=self.device,
+            )
+        else:
+            self.cuda_graph_kv_indices = kv_indices_buf
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+    ):
+        if forward_mode.is_decode_or_idle():
+            qo_indptr = None
+            kv_last_page_len = None
+            max_q_len = None
+
+            if spec_info is None:
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+            if self.use_mla:
+                qo_indptr = self.qo_indptr_[: bs + 1]
+                qo_indptr[1 : bs + 1] = torch.cumsum(
+                    self.cuda_graph_kv_last_page_len[:bs], dim=0
+                )
+                kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+                max_q_len = 1
+
+            self.forward_metadata = ForwardMetadata(
+                kv_indptr,
+                kv_indices,
+                qo_indptr,
+                kv_last_page_len,
+                max_q_len,
+                None,
+            )
+
+        elif forward_mode.is_target_verify():
+            if self.use_mla:
+                qo_indptr = self.qo_indptr[: bs + 1]
+                qo_indptr[: bs + 1] = torch.arange(
+                    0,
+                    (1 + bs) * self.num_draft_tokens,
+                    step=self.num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+                kv_indptr = self.kv_indptr[: bs + 1]
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+                max_q_len = self.num_draft_tokens
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    kv_last_page_len,
+                    max_q_len,
+                    None,
+                )
+            else:
+                seq_lens_sum = seq_lens.sum().item()
+                self.indices_updater_prefill.update(
+                    req_pool_indices,
+                    seq_lens,
+                    seq_lens_sum,
+                    prefix_lens=None,
+                    encoder_lens=encoder_lens,
+                    spec_info=spec_info,
+                )
+                self.forward_metadata = ForwardMetadata(
+                    self.indices_updater_prefill.kv_indptr,
+                    self.indices_updater_prefill.kv_indices,
+                    None,
+                    None,
+                    self.indices_updater_prefill.max_q_len,
+                    self.indices_updater_prefill.max_kv_len,
+                )
+        elif forward_mode.is_draft_extend():
+            num_tokens_per_bs = self.speculative_num_steps + 1
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                step=num_tokens_per_bs,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+            max_q_len = num_tokens_per_bs
+            self.forward_metadata = ForwardMetadata(
+                kv_indptr,
+                kv_indices,
+                qo_indptr,
+                kv_last_page_len,
+                max_q_len,
+                None,
+            )
+        else:
+            raise ValueError(f"Invalid mode: {forward_mode=}")
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        if forward_mode.is_decode_or_idle():
+            kv_indptr = self.kv_indptr
+            kv_indices = self.cuda_graph_kv_indices
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens[:bs], dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices[:bs],
+                    seq_lens[:bs],
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr[: spec_info.kv_indptr.shape[0]] = spec_info.kv_indptr
+                kv_indices[: spec_info.kv_indices.shape[0]] = spec_info.kv_indices
+
+        elif forward_mode.is_target_verify():
+            bs = len(req_pool_indices)
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_lens = seq_lens + self.num_draft_tokens
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(kv_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                kv_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+        elif forward_mode.is_draft_extend():
+            seq_lens = seq_lens[:bs]
+            accept_lens = spec_info.accept_length[:bs]
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[1 : bs + 1] = torch.cumsum(accept_lens, dim=0)
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+        else:
+            raise ValueError("Invalid forward mode")
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        cache_loc = (
+            forward_batch.out_cache_loc
+            if not layer.is_cross_attention
+            else forward_batch.encoder_out_cache_loc
+        )
+
+        self.logits_soft_cap = layer.logit_cap
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+
+        if self.use_mla:
+            max_q_len = self.forward_metadata.max_q_len
+            max_kv_len = self.forward_metadata.max_kv_len
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+            qo_indptr = self.forward_metadata.qo_indptr
+            K_Buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            V_Buffer = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+            kv_lora_rank = V_Buffer.shape[-1]
+            qk_rope_head_dim = K_Buffer.shape[-1] - kv_lora_rank
+            qk_nope_head_dim = k.shape[-1] - qk_rope_head_dim
+            assert len(q.shape) == 3
+            assert len(k.shape) == 3
+            assert len(v.shape) == 3
+
+            if forward_batch.forward_mode.is_extend():
+                if kv_indices.shape[0] == 0:
+                    o = flash_attn_varlen_func(
+                        q,
+                        k,
+                        v,
+                        qo_indptr,
+                        qo_indptr,
+                        max_q_len,
+                        max_q_len,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                    )
+                    return o
+                elif layer.qk_head_dim != (kv_lora_rank + qk_rope_head_dim):
+                    K_Buffer = torch.index_select(K_Buffer, 0, kv_indices)
+                    kvc, k_pe = torch.split(
+                        K_Buffer, [kv_lora_rank, qk_rope_head_dim], dim=-1
+                    )
+                    kvprefix = layer.kv_b_proj(kvc.contiguous())[0]
+
+                    kvprefix = kvprefix.view(
+                        -1, layer.tp_k_head_num, qk_nope_head_dim + layer.v_head_dim
+                    )
+                    k_prefix, v_prefix = torch.split(
+                        kvprefix, [qk_nope_head_dim, layer.v_head_dim], dim=-1
+                    )
+                    k_prefix = torch.cat(
+                        [
+                            k_prefix,
+                            torch.broadcast_to(
+                                k_pe,
+                                (k_pe.shape[0], layer.tp_k_head_num, k_pe.shape[2]),
+                            ),
+                        ],
+                        dim=-1,
+                    )
+                    assert (
+                        forward_batch.extend_prefix_lens.shape
+                        == forward_batch.extend_seq_lens.shape
+                    )
+
+                    k = k_prefix
+                    v = v_prefix
+
+                    o = flash_attn_varlen_func(
+                        q,
+                        k,
+                        v,
+                        qo_indptr,
+                        kv_indptr,
+                        max_q_len,
+                        max_kv_len,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                    )
+                    return o
+
+                else:
+                    if layer.qk_head_dim != layer.v_head_dim:
+                        o = q.new_empty(
+                            (q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+                        )
+                    else:
+                        o = torch.empty_like(q)
+
+                    mla_prefill_fwd(
+                        q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                        K_Buffer.view(-1, 1, 1, layer.qk_head_dim),
+                        o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                        qo_indptr,
+                        kv_indptr,
+                        kv_indices,
+                        self.forward_metadata.kv_last_page_len,
+                        self.forward_metadata.max_q_len,
+                        layer.scaling,
+                        layer.logit_cap,
+                    )
+                    K_Buffer = K_Buffer.view(-1, layer.tp_k_head_num, layer.qk_head_dim)
+                    return o
+            elif forward_batch.forward_mode.is_target_verify():
+                o = q.new_empty((q.shape[0], layer.tp_q_head_num, layer.v_head_dim))
+                mla_decode_fwd(
+                    q,
+                    K_Buffer.view(-1, 1, 1, layer.qk_head_dim),
+                    o,
+                    self.forward_metadata.qo_indptr,
+                    self.forward_metadata.kv_indptr,
+                    self.forward_metadata.kv_indices,
+                    self.forward_metadata.kv_last_page_len,
+                    self.forward_metadata.max_q_len,
+                    layer.scaling,
+                    layer.logit_cap,
+                )
+                K_Buffer = K_Buffer.view(-1, 1, layer.qk_head_dim)
+                return o
+            elif forward_batch.forward_mode.is_draft_extend():
+                o = q.new_empty((q.shape[0], layer.tp_q_head_num, layer.v_head_dim))
+                causal = True
+                sliding_window_size = -1
+                kv_indptr = self.forward_metadata.kv_indptr
+                kv_indices = self.forward_metadata.kv_indices
+                mla_prefill_fwd(
+                    q,
+                    K_Buffer.view(-1, 1, 1, layer.qk_head_dim),
+                    o,
+                    self.forward_metadata.qo_indptr,
+                    self.forward_metadata.kv_indptr,
+                    self.forward_metadata.kv_indices,
+                    self.forward_metadata.kv_last_page_len,
+                    self.forward_metadata.max_q_len,
+                    layer.scaling,
+                    layer.logit_cap,
+                )
+                K_Buffer = K_Buffer.view(-1, 1, layer.qk_head_dim)
+                return o
+                # self.extend_attention_fwd(
+                #     q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                #     k.contiguous(),
+                #     v.contiguous(),
+                #     o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                #     forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+                #     forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+                #     self.forward_metadata.qo_indptr,
+                #     kv_indptr,
+                #     kv_indices,
+                #     None,
+                #     causal,
+                #     None,
+                #     self.forward_metadata.max_q_len,
+                #     layer.scaling,
+                #     layer.logit_cap,
+                #     sliding_window_size,
+                # )
+                # return o
+            else:
+                raise ValueError(
+                    f"Invalid forward mode for MLA prefill: {forward_batch.forward_mode=}"
+                )
+        else:
+            k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+
+            bs0 = forward_batch.batch_size + 1
+
+            o = mha_batch_prefill_func(
+                q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                k_cache,
+                v_cache,
+                self.qo_indptr[:bs0],
+                self.forward_metadata.kv_indptr[:bs0],
+                self.forward_metadata.kv_indices,
+                self.forward_metadata.max_q_len,
+                self.forward_metadata.max_kv_len,
+                causal=True,
+                logits_soft_cap=self.logits_soft_cap,
+                alibi_slopes=None,
+                return_lse=False,
+                return_attn_probs=False,
+            )
+
+            return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        if self.use_mla:
+            k_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            mla_decode_fwd(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                k_buffer.view(-1, 1, 1, layer.qk_head_dim),
+                o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                self.forward_metadata.qo_indptr,
+                self.forward_metadata.kv_indptr,
+                self.forward_metadata.kv_indices,
+                self.forward_metadata.kv_last_page_len,
+                self.forward_metadata.max_q_len,
+                layer.scaling,
+                layer.logit_cap,
+            )
+            k_buffer = k_buffer.view(-1, 1, layer.qk_head_dim)
+        else:
+            self.logits_soft_cap = layer.logit_cap
+            paged_attention_ragged(
+                o.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                self.workspace_buffer,
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).view(
+                    -1, 1, layer.tp_k_head_num, layer.qk_head_dim
+                ),
+                forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id).view(
+                    -1, 1, layer.tp_v_head_num, layer.v_head_dim
+                ),
+                self.scale,
+                self.forward_metadata.kv_indptr,
+                self.forward_metadata.kv_indices,
+                self.kv_last_page_len,
+                1,
+                self.max_num_partitions,
+                None,
+                "auto",
+                "NHD",
+                self.logits_soft_cap,
+                self.k_scale,
+                self.v_scale,
+                None,
+                _AITER_PARTITION_SIZE_ROCM,
+            )
+
+        return o
+
+
+class AiterIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.num_qo_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.kv_last_page_len = attn_backend.kv_last_page_len
+        self.qo_indptr = attn_backend.qo_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.update = self.update_single_wrapper
+
+        self.kv_indices = None
+        self.max_q_len = 0
+        self.max_kv_len = 0
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInfo],
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInfo],
+    ):
+
+        kv_start_idx = None
+        kv_indptr = self.kv_indptr
+        qo_indptr = self.qo_indptr
+        paged_kernel_lens = seq_lens
+        paged_kernel_lens_sum = seq_lens_sum
+
+        bs = len(req_pool_indices)
+        if spec_info is None:
+            # Normal extend
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+
+            # (TODO: Kk) WA - CI test_moe_eval_accuracy_large.py
+            # mha_batch_prefill reads 128 data to do computatoin
+            # if real data is not long enough then original padding value 0 is used
+            # but the 0 location will be made nan (noqa) in cuda graph capture mode
+            # this will cause the output tensor value becomes nan
+            # WA is to assure that last index of pool not changed
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum + 256,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                kv_start_idx,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+
+            token_num = kv_indptr[-1]
+            kv_indices[token_num:] = kv_indices[0]
+
+            self.max_kv_len = torch.max(paged_kernel_lens).item()
+
+            extend_lens = seq_lens - prefix_lens
+            self.max_q_len = torch.max(extend_lens).item()
+
+            qo_indptr[1 : bs + 1] = torch.cumsum(extend_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+        else:
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    paged_kernel_lens,
+                    paged_kernel_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        self.kv_indices = kv_indices
+
+
+class AiterMlaIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.update = self.update_single_wrapper
+
+        self.kv_indptr = None
+        self.kv_indices = None
+        self.qo_indptr = None
+        self.kv_last_page_len = None
+        self.max_q_len = 0
+        self.max_kv_len = 0
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        kv_lens: torch.Tensor,
+        kv_lens_sum: int,
+        extend_lens: torch.Tensor,
+        max_q_len: int,
+        max_kv_len: int,
+        spec_info: Optional[SpecInfo],
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        kv_lens: torch.Tensor,
+        kv_lens_sum: int,
+        extend_lens: torch.Tensor,
+        max_q_len: int,
+        max_kv_len: int,
+        spec_info: Optional[SpecInfo],
+    ):
+        bs = len(req_pool_indices)
+
+        kv_indptr = self.attn_backend.kv_indptr
+
+        if spec_info is None:
+            # Normal extend
+            kv_indptr[1 : bs + 1] = torch.cumsum(kv_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                kv_lens_sum,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                kv_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            qo_indptr = self.attn_backend.qo_indptr
+            qo_indptr[1 : bs + 1] = torch.cumsum(extend_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+        else:
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    kv_lens,
+                    kv_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        self.kv_indptr = kv_indptr
+        self.kv_indices = kv_indices
+        self.qo_indptr = qo_indptr
+        self.max_q_len = max_q_len
+        self.max_kv_len = max_kv_len
+
+
+class AiterMultiStepDraftBackend:
+    """
+    Wrap multiple triton attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices
+
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps):
+            self.attn_backends.append(
+                AiterAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                )
+            )
+        self.max_context_len = self.attn_backends[0].max_context_len
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.device = model_runner.device
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+        self.page_size = model_runner.server_args.page_size
+        assert self.page_size == 1, "Page size must be 1"
+
+    def common_template(
+        self, forward_batch: ForwardBatch, kv_indices_buffer: torch.Tensor, call_fn: int
+    ):
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        self.generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            triton.next_power_of_2(num_seqs),
+            triton.next_power_of_2(self.speculative_num_steps),
+            triton.next_power_of_2(bs),
+            self.page_size,
+        )
+
+        for i in range(self.speculative_num_steps):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.empty(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int32,
+            device=self.device,
+        )
+
+        def call_fn(i, forward_batch):
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_num_tokens * self.max_context_len),
+            dtype=torch.int32,
+            device=self.device,
+        )
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, kv_indices_buf=self.cuda_graph_kv_indices[i]
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=None,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
diff --git a/python/sglang/srt/layers/attention/ascend_backend.py b/python/sglang/srt/layers/attention/ascend_backend.py
new file mode 100644
index 000000000..6d5ed0a5c
--- /dev/null
+++ b/python/sglang/srt/layers/attention/ascend_backend.py
@@ -0,0 +1,577 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import torch_npu
+from torch.nn.functional import scaled_dot_product_attention
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import get_bool_env_var
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+import os
+
+import numpy as np
+
+
+@dataclass
+class ForwardMetadata:
+
+    # calculated map for kv positions [bs * maxseqlen]
+    block_tables: Optional[torch.Tensor] = None
+
+    # seq len inputs
+    extend_seq_lens_cpu_int: Optional[torch.Tensor] = None
+    seq_lens_cpu_int: Optional[torch.Tensor] = None
+    seq_lens_cpu_list: Optional[List[int]] = None
+    seq_lens_list_cumsum: Optional[List[int]] = None
+
+
+class AscendAttnBackend(AttentionBackend):
+
+    def gen_attention_mask(self, max_seq_len: int, dtype=torch.float16):
+        mask_flag = torch.tril(
+            torch.ones((max_seq_len, max_seq_len), dtype=torch.bool)
+        ).view(max_seq_len, max_seq_len)
+        mask_flag = ~mask_flag
+        if dtype == torch.float16:
+            mask_value = torch.finfo(torch.float32).min
+        else:
+            mask_value = 1
+        self.mask = (
+            torch.masked_fill(
+                torch.zeros(size=(max_seq_len, max_seq_len)), mask_flag, mask_value
+            )
+            .to(dtype)
+            .to(self.device)
+        )
+        self.mask_len = max_seq_len
+
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__()
+        self.forward_metadata = None
+        self.device = model_runner.device
+        self.page_size = model_runner.page_size
+        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+        if self.use_mla:
+            self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+            self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.native_attn = TorchNativeAttnBackend(model_runner)
+        self.graph_metadata = {}
+        self.max_context_len = model_runner.model_config.context_len
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.graph_mode = False
+        self.use_fia = get_bool_env_var("ASCEND_USE_FIA", "False")
+        if not self.use_fia:
+            self.gen_attention_mask(128, model_runner.dtype)
+        mask_length = 2048
+        self.fia_mask = ~torch.tril(
+            torch.ones(
+                (mask_length, mask_length),
+                dtype=torch.bool,
+                device=model_runner.device,
+            )
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+        tp_size = get_attention_tp_size()
+        self.forward_metadata = ForwardMetadata()
+
+        self.forward_metadata.block_tables = (
+            forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : forward_batch.seq_lens.max()
+            ][:, :: self.page_size]
+            // self.page_size
+        )
+        if forward_batch.extend_seq_lens is not None:
+            self.forward_metadata.extend_seq_lens_cpu_int = (
+                forward_batch.extend_seq_lens.cpu().int()
+            )
+        self.forward_metadata.seq_lens_cpu_int = forward_batch.seq_lens_cpu.int()
+
+        seq_lens_list_cumsum = np.cumsum(forward_batch.extend_seq_lens_cpu)
+        if forward_batch.is_extend_in_batch:
+            seq_lens_list_cumsum[-1] = (
+                (seq_lens_list_cumsum[-1] - 1) // tp_size + 1
+            ) * tp_size
+        self.forward_metadata.seq_lens_list_cumsum = seq_lens_list_cumsum
+
+        self.graph_mode = False
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.graph_metadata = {
+            "block_tables": torch.empty(
+                (max_bs, self.max_context_len // self.page_size),
+                dtype=torch.int32,
+                device=self.device,
+            ),
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        metadata = ForwardMetadata()
+
+        metadata.block_tables = self.graph_metadata["block_tables"][:bs, :]
+        metadata.seq_lens_cpu_list = seq_lens.cpu().int().tolist()
+
+        self.graph_metadata[bs] = metadata
+        self.forward_metadata = metadata
+
+        self.graph_mode = True
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        metadata = self.graph_metadata[bs]
+        max_len = seq_lens_cpu[:bs].max().item()
+        max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+
+        metadata.block_tables[:bs, :max_seq_pages].copy_(
+            self.req_to_token[req_pool_indices[:bs], :max_len][:, :: self.page_size]
+            // self.page_size
+        )
+        metadata.block_tables[:bs, max_seq_pages:].fill_(0)
+        metadata.block_tables[bs:, :].fill_(0)
+
+        self.forward_metadata = metadata
+
+        self.graph_mode = True
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 0
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        if not self.use_mla:
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, forward_batch.out_cache_loc, k, v
+                )
+
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+
+            if self.use_fia:
+                """FIA will support multi-bs in the later version of CANN"""
+                q = q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                attn_output = torch.empty(
+                    (q.size(0), layer.tp_q_head_num, layer.v_head_dim),
+                    device=q.device,
+                    dtype=q.dtype,
+                )
+                q_len_offset = 0
+                for q_len in forward_batch.extend_seq_lens_cpu:
+                    attn_output[q_len_offset : q_len_offset + q_len] = (
+                        torch.ops.npu.npu_fused_infer_attention_score(
+                            q[None, q_len_offset : q_len_offset + q_len],
+                            k[None, q_len_offset : q_len_offset + q_len],
+                            v[None, q_len_offset : q_len_offset + q_len],
+                            num_heads=layer.tp_q_head_num,
+                            num_key_value_heads=layer.tp_k_head_num,
+                            input_layout="BSND",  # todo, TND not supports q_heads!=k_heads
+                            atten_mask=self.fia_mask.unsqueeze(0),
+                            sparse_mode=3,
+                            scale=layer.scaling,
+                            next_tokens=0,
+                        )[0]
+                    )
+                    q_len_offset += q_len
+                attn_output = attn_output.view(
+                    -1, layer.tp_q_head_num * layer.v_head_dim
+                )
+
+            else:
+                if layer.qk_head_dim <= 128:
+                    query = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+                    attn_output = torch.empty(
+                        (query.shape[0], layer.tp_q_head_num * layer.v_head_dim),
+                        dtype=query.dtype,
+                        device=query.device,
+                    )
+
+                    torch_npu._npu_flash_attention_qlens(
+                        query=query,
+                        key_cache=k_cache,
+                        value_cache=v_cache,
+                        mask=self.mask,
+                        block_table=self.forward_metadata.block_tables,
+                        seq_len=self.forward_metadata.extend_seq_lens_cpu_int,
+                        context_lens=self.forward_metadata.seq_lens_cpu_int,
+                        scale_value=layer.scaling,
+                        num_heads=layer.tp_q_head_num,
+                        num_kv_heads=layer.tp_k_head_num,
+                        out=attn_output,
+                    )
+                else:
+                    if layer.qk_head_dim != layer.v_head_dim:
+                        attn_output = q.new_empty(
+                            (q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+                        )
+                    else:
+                        attn_output = torch.empty_like(q)
+
+                    use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+
+                    q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                    o_ = attn_output.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+                    causal = True
+                    if (
+                        layer.is_cross_attention
+                        or layer.attn_type == AttentionType.ENCODER_ONLY
+                    ):
+                        causal = False
+
+                    self.native_attn._run_sdpa_forward_extend(
+                        q_,
+                        o_,
+                        k_cache.view(-1, layer.tp_k_head_num, layer.qk_head_dim),
+                        v_cache.view(-1, layer.tp_v_head_num, layer.v_head_dim),
+                        forward_batch.req_to_token_pool.req_to_token,
+                        forward_batch.req_pool_indices,
+                        forward_batch.seq_lens,
+                        forward_batch.extend_prefix_lens,
+                        forward_batch.extend_seq_lens,
+                        scaling=layer.scaling,
+                        enable_gqa=use_gqa,
+                        causal=causal,
+                    )
+        else:
+            assert (
+                layer.qk_head_dim != layer.v_head_dim
+            ), "FIA only supports qk_head_dim != v_head_dim"
+            q_nope, q_rope = q.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
+            k_nope, k_rope = k.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
+
+            attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
+                q_nope,
+                k_nope,
+                v,
+                query_rope=q_rope,
+                key_rope=k_rope,
+                num_heads=layer.tp_q_head_num,
+                input_layout="TND",
+                atten_mask=self.fia_mask,
+                sparse_mode=3,
+                actual_seq_lengths=self.forward_metadata.seq_lens_list_cumsum,
+                actual_seq_lengths_kv=self.forward_metadata.seq_lens_list_cumsum,
+                scale=layer.scaling,
+                next_tokens=0,
+            )
+
+        return attn_output
+
+    def forward_decode_graph(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        if save_kv_cache:
+            if self.use_mla:
+                k = k.view(-1, layer.tp_k_head_num, self.kv_lora_rank)
+                k_rope = k_rope.view(-1, layer.tp_k_head_num, self.qk_rope_head_dim)
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, forward_batch.out_cache_loc, k, k_rope
+                )
+            else:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, forward_batch.out_cache_loc, k, v
+                )
+
+        if not self.use_mla:
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                layer.layer_id
+            ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
+            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                layer.layer_id
+            ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
+            query = q.reshape(-1, 1, layer.tp_q_head_num * layer.qk_head_dim)
+            if self.forward_metadata.seq_lens_cpu_int is None:
+                actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
+            else:
+                actual_seq_len_kv = (
+                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
+                )
+            num_tokens = query.shape[0]
+            workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                query,
+                k_cache,
+                v_cache,
+                block_table=self.forward_metadata.block_tables,
+                block_size=self.page_size,
+                num_heads=layer.tp_q_head_num,
+                num_key_value_heads=layer.tp_k_head_num,
+                input_layout="BSH",
+                scale=layer.scaling,
+                actual_seq_lengths_kv=actual_seq_len_kv,
+            )
+            output = torch.empty(
+                (num_tokens, 1, layer.tp_q_head_num * layer.v_head_dim),
+                dtype=q.dtype,
+                device=q.device,
+            )
+            softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+            torch_npu.npu_fused_infer_attention_score.out(
+                query,
+                k_cache,
+                v_cache,
+                block_table=self.forward_metadata.block_tables,
+                block_size=self.page_size,
+                num_heads=layer.tp_q_head_num,
+                num_key_value_heads=layer.tp_k_head_num,
+                input_layout="BSH",
+                scale=layer.scaling,
+                actual_seq_lengths_kv=actual_seq_len_kv,
+                workspace=workspace,
+                out=[output, softmax_lse],
+            )
+            return output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
+        else:
+            c_kv, k_rope = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+            k_rope_cache = k_rope.view(
+                -1, layer.tp_k_head_num, self.page_size, self.qk_rope_head_dim
+            )
+            c_kv_cache = c_kv.view(
+                -1, layer.tp_v_head_num, self.page_size, self.kv_lora_rank
+            )
+
+            q_nope = q.view(-1, layer.tp_q_head_num, 1, self.kv_lora_rank).contiguous()
+            q_rope = q_rope.view(-1, layer.tp_q_head_num, 1, self.qk_rope_head_dim)
+            if self.forward_metadata.seq_lens_cpu_int is None:
+                actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
+            else:
+                actual_seq_len_kv = (
+                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
+                )
+
+            workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                q_nope,
+                c_kv_cache,
+                c_kv_cache,
+                query_rope=q_rope,
+                key_rope=k_rope_cache,
+                num_heads=layer.tp_q_head_num,
+                num_key_value_heads=layer.tp_k_head_num,
+                block_table=self.forward_metadata.block_tables,
+                block_size=self.page_size,
+                input_layout="BNSD",
+                scale=layer.scaling,
+                actual_seq_lengths_kv=actual_seq_len_kv,
+                antiquant_mode=0,
+                antiquant_scale=None,
+                sparse_mode=0,
+            )
+            output = torch.zeros_like(q_nope, dtype=q.dtype, device=q.device)
+            softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+
+            torch_npu.npu_fused_infer_attention_score.out(
+                q_nope,
+                c_kv_cache,
+                c_kv_cache,
+                query_rope=q_rope,
+                key_rope=k_rope_cache,
+                num_heads=layer.tp_q_head_num,
+                num_key_value_heads=layer.tp_k_head_num,
+                block_table=self.forward_metadata.block_tables,
+                block_size=self.page_size,
+                input_layout="BNSD",
+                scale=layer.scaling,
+                actual_seq_lengths_kv=actual_seq_len_kv,
+                antiquant_mode=0,
+                antiquant_scale=None,
+                sparse_mode=0,
+                workspace=workspace,
+                out=[output, softmax_lse],
+            )
+            return output.view(-1, layer.tp_q_head_num * self.kv_lora_rank)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        if self.graph_mode:
+            return self.forward_decode_graph(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache,
+                q_rope=q_rope,
+                k_rope=k_rope,
+            )
+
+        if not self.use_mla:
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, forward_batch.out_cache_loc, k, v
+                )
+            num_tokens = q.shape[0]
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+            if self.use_fia:
+                attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
+                    q.view(
+                        forward_batch.batch_size,
+                        -1,
+                        layer.tp_q_head_num,
+                        layer.qk_head_dim,
+                    ),
+                    k_cache.view(
+                        -1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim
+                    ),
+                    v_cache.view(
+                        -1, self.page_size, layer.tp_v_head_num * layer.qk_head_dim
+                    ),
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSND",
+                    atten_mask=None,
+                    block_size=self.page_size,
+                    block_table=self.forward_metadata.block_tables,
+                    actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_int,
+                    scale=layer.scaling,
+                )
+            else:
+                query = q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim)
+                num_tokens = query.shape[0]
+                attn_output = torch.empty(
+                    (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
+                    dtype=query.dtype,
+                    device=query.device,
+                )
+
+                torch_npu._npu_paged_attention(
+                    query=query,
+                    key_cache=k_cache,
+                    value_cache=v_cache,
+                    num_heads=layer.tp_q_head_num,
+                    num_kv_heads=layer.tp_k_head_num,
+                    scale_value=layer.scaling,
+                    block_table=self.forward_metadata.block_tables,
+                    context_lens=self.forward_metadata.seq_lens_cpu_int,
+                    out=attn_output,
+                )
+            return attn_output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
+        else:
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, forward_batch.out_cache_loc, k, k_rope
+                )
+            num_tokens = q.shape[0]
+            kv_c = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            k_pe = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+
+            if self.use_fia and (layer.tp_q_head_num // layer.tp_k_head_num) >= 8:
+                """layer.tp_q_head_num // layer.tp_k_head_num < 8 will support in the later version of CANN"""
+                kv_c = kv_c.view(
+                    -1, self.page_size, layer.tp_k_head_num * self.kv_lora_rank
+                )
+                k_pe = k_pe.view(
+                    -1, self.page_size, layer.tp_k_head_num * self.qk_rope_head_dim
+                )
+                q = q.view(
+                    forward_batch.batch_size, -1, layer.tp_q_head_num, self.kv_lora_rank
+                )
+                q_rope = q_rope.view(
+                    forward_batch.batch_size,
+                    -1,
+                    layer.tp_q_head_num,
+                    self.qk_rope_head_dim,
+                )
+                attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
+                    q,
+                    kv_c,
+                    kv_c,
+                    query_rope=q_rope,
+                    key_rope=k_pe,
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSND",
+                    atten_mask=None,
+                    sparse_mode=0,
+                    scale=layer.scaling,
+                    antiquant_mode=0,
+                    antiquant_scale=None,
+                    block_table=self.forward_metadata.block_tables,
+                    block_size=self.page_size,
+                    actual_seq_lengths_kv=self.forward_metadata.seq_lens_cpu_int,
+                )
+            else:
+                assert (
+                    self.graph_mode == False
+                )  # _npu_paged_attention_mla not support graph mode
+                q = torch.cat([q, q_rope], dim=-1)
+                query = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+                kv_c_and_k_pe_cache = torch.cat([kv_c, k_pe], dim=-1)
+                kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view(
+                    -1,
+                    self.page_size,
+                    layer.tp_k_head_num,
+                    self.kv_lora_rank + self.qk_rope_head_dim,
+                )
+                attn_output = torch.empty(
+                    [num_tokens, layer.tp_q_head_num, self.kv_lora_rank],
+                    dtype=q.dtype,
+                    device=q.device,
+                )
+                torch_npu._npu_paged_attention_mla(
+                    query=query,
+                    key_cache=kv_c_and_k_pe_cache,
+                    num_kv_heads=layer.tp_k_head_num,
+                    num_heads=layer.tp_q_head_num,
+                    scale_value=layer.scaling,
+                    block_table=self.forward_metadata.block_tables,
+                    context_lens=self.forward_metadata.seq_lens_cpu_int,
+                    mla_vheadsize=self.kv_lora_rank,
+                    out=attn_output,
+                )
+            return attn_output.view(num_tokens, layer.tp_q_head_num * self.kv_lora_rank)
diff --git a/python/sglang/srt/layers/attention/base_attn_backend.py b/python/sglang/srt/layers/attention/base_attn_backend.py
new file mode 100644
index 000000000..3025d0b11
--- /dev/null
+++ b/python/sglang/srt/layers/attention/base_attn_backend.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+
+class AttentionBackend(ABC):
+    """The base class of attention backends"""
+
+    @abstractmethod
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+        raise NotImplementedError()
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Init the global shared states for cuda graph."""
+        raise NotImplementedError()
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        """Init the metadata for a forward pass for capturing a cuda graph."""
+        raise NotImplementedError()
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Init the metadata for a forward pass for replaying a cuda graph."""
+        raise NotImplementedError()
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for padded seq lens. Typically, it is 0 or 1."""
+        raise NotImplementedError()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        """Run forward on an attention layer."""
+        if forward_batch.forward_mode.is_idle():
+            return q.new_empty(q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+        elif forward_batch.forward_mode.is_decode():
+            return self.forward_decode(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache=save_kv_cache,
+                **kwargs,
+            )
+        else:
+            return self.forward_extend(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache=save_kv_cache,
+                **kwargs,
+            )
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        """Run a forward for decode."""
+        raise NotImplementedError()
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        """Run a forward for extend."""
+        raise NotImplementedError()
+
+    def support_triton(self):
+        """Check if the current backend supports triton."""
+        return True
diff --git a/python/sglang/srt/layers/attention/cutlass_mla_backend.py b/python/sglang/srt/layers/attention/cutlass_mla_backend.py
new file mode 100644
index 000000000..eb0cae262
--- /dev/null
+++ b/python/sglang/srt/layers/attention/cutlass_mla_backend.py
@@ -0,0 +1,285 @@
+from __future__ import annotations
+
+"""
+Support attention backend for Cutlass MLA.
+
+"""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import triton
+
+from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
+from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_cuda
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import cutlass_mla_decode, cutlass_mla_get_workspace_size
+
+
+# Cutlass MLA only supports pagesize=128
+PAGE_SIZE = 128
+
+
+@dataclass
+class CutlassMLADecodeMetadata:
+    workspace: Optional[torch.Tensor] = None
+    block_kv_indices: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        workspace: Optional[torch.Tensor] = None,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        self.workspace = workspace
+        self.block_kv_indices = block_kv_indices
+
+
+class CutlassMLABackend(FlashInferMLAAttnBackend):
+    """Cutlass attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__(
+            model_runner, skip_prefill, kv_indptr_buf, kv_last_page_len_buf
+        )
+
+        self.num_q_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.forward_metadata: Union[CutlassMLADecodeMetadata] = None
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.kv_cache_dim = self.kv_lora_rank + self.qk_rope_head_dim
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+
+        bs = forward_batch.batch_size
+        spec_info = forward_batch.spec_info
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                max_seqlen_pad = triton.cdiv(
+                    forward_batch.seq_lens_cpu.max().item(), PAGE_SIZE
+                )
+                block_kv_indices = torch.full(
+                    (bs, max_seqlen_pad),
+                    -1,
+                    dtype=torch.int32,
+                    device=forward_batch.seq_lens.device,
+                )
+                create_flashmla_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    None,
+                    block_kv_indices,
+                    self.req_to_token.stride(0),
+                    max_seqlen_pad,
+                    PAGED_SIZE=PAGE_SIZE,
+                )
+                workspace_size = cutlass_mla_get_workspace_size(
+                    max_seqlen_pad * PAGE_SIZE, bs, num_kv_splits=1
+                )
+                workspace = torch.empty(
+                    workspace_size, device="cuda", dtype=torch.uint8
+                )
+                self.forward_metadata = CutlassMLADecodeMetadata(
+                    workspace,
+                    block_kv_indices,
+                )
+            else:
+                super().init_forward_metadata(forward_batch)
+        else:
+            super().init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        if block_kv_indices is None:
+            cuda_graph_kv_indices = torch.full(
+                (max_bs, (self.max_context_len + PAGE_SIZE) // PAGE_SIZE),
+                1,
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = block_kv_indices
+
+        workspace_size = cutlass_mla_get_workspace_size(
+            cuda_graph_kv_indices.shape[1] * PAGE_SIZE, max_bs, num_kv_splits=1
+        )
+        self.cuda_graph_mla_workspace = torch.empty(
+            workspace_size, device="cuda", dtype=torch.uint8
+        )
+        self.cuda_graph_kv_indices = cuda_graph_kv_indices
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+    ):
+        if forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                max_seqlen_pad = self.cuda_graph_kv_indices.shape[1]
+
+                create_flashmla_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    None,
+                    self.cuda_graph_kv_indices,
+                    self.req_to_token.stride(0),
+                    self.cuda_graph_kv_indices.stride(0),
+                    PAGED_SIZE=PAGE_SIZE,
+                )
+                self.forward_metadata = CutlassMLADecodeMetadata(
+                    self.cuda_graph_mla_workspace,
+                    self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
+                )
+        else:
+            super().init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+
+        if forward_mode.is_decode_or_idle():
+            assert seq_lens_cpu is not None
+            seq_lens = seq_lens[:bs]
+
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices[:bs],
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+                PAGED_SIZE=PAGE_SIZE,
+            )
+        else:
+            super().init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        cache_loc = forward_batch.out_cache_loc
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )
+
+        # Reshape inputs
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            reshaped_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = reshaped_q[:, :, : layer.v_head_dim]
+            q_rope = reshaped_q[:, :, layer.v_head_dim :]
+
+        q_nope = q_nope.to(self.q_data_type)
+        q_rope = q_rope.to(self.q_data_type)
+
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+        o = cutlass_mla_decode(
+            q_nope=q_nope,
+            q_pe=q_rope,
+            kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
+            seq_lens=forward_batch.seq_lens.to(torch.int32),
+            page_table=self.forward_metadata.block_kv_indices,
+            workspace=self.forward_metadata.workspace,
+            sm_scale=layer.scaling,
+            num_kv_splits=1,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
diff --git a/python/sglang/srt/layers/attention/double_sparsity_backend.py b/python/sglang/srt/layers/attention/double_sparsity_backend.py
new file mode 100644
index 000000000..47b867f61
--- /dev/null
+++ b/python/sglang/srt/layers/attention/double_sparsity_backend.py
@@ -0,0 +1,257 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class DoubleSparseAttnBackend(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.triton_ops.double_sparsity_attention import (
+            extend_attention_fwd,
+            flash_decode_attention_fwd,
+            flash_decode_sparse_attention_fwd,
+        )
+
+        super().__init__()
+
+        self.decode_attention_fwd = flash_decode_attention_fwd
+        self.decode_sparse_attention_fwd = flash_decode_sparse_attention_fwd
+        self.extend_attention_fwd = extend_attention_fwd
+        self.num_head = model_runner.model_config.num_attention_heads
+        self.head_dim = model_runner.model_config.hidden_size // self.num_head
+        self.heavy_token_num = model_runner.server_args.ds_heavy_token_num
+
+        self.sorted_channels = model_runner.sorted_channels
+        self.sparse_decode_thresold = (
+            model_runner.server_args.ds_sparse_decode_threshold
+        )
+        self.att_out_approx: torch.Tensor = None
+        self.mid_out: torch.Tensor = None
+        self.mid_o_logexpsum: torch.Tensor = None
+
+        # TODO: Change the hard-coded block_seq_num
+        self.BLOCK_SEQ = 128
+
+        if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
+            self.reduce_dtype = torch.float32
+        else:
+            self.reduce_dtype = torch.float16
+
+        self.forward_metadata = None
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for triton attention backend."""
+
+        if forward_batch.forward_mode.is_decode():
+            start_loc = torch.zeros_like(forward_batch.seq_lens, dtype=torch.int32)
+            start_loc[1:] = torch.cumsum(forward_batch.seq_lens[:-1], dim=0)
+
+            total_num_tokens = torch.sum(forward_batch.seq_lens).item()
+            attn_logits = torch.empty(
+                (self.num_head, total_num_tokens),
+                dtype=self.reduce_dtype,
+                device="cuda",
+            )
+
+            max_seq_len = torch.max(forward_batch.seq_lens).item()
+            min_seq_len = torch.min(forward_batch.seq_lens).item()
+            max_extend_len = None
+            # NOTE: Align sequence order with req_to_token order
+            ds_req_to_token = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices
+            ]
+
+            bsz = forward_batch.seq_lens.shape[0]
+
+            att_out_approx = torch.empty(
+                [self.num_head, bsz, max_seq_len],
+                dtype=self.reduce_dtype,
+                device="cuda",
+            )
+
+            block_seq_num = (
+                self.heavy_token_num + self.BLOCK_SEQ - 1
+            ) // self.BLOCK_SEQ
+
+            mid_out = torch.empty(
+                [bsz, self.num_head, block_seq_num, self.head_dim],
+                dtype=torch.float32,
+                device="cuda",
+            )
+            mid_o_logexpsum = torch.empty(
+                [bsz, self.num_head, block_seq_num], dtype=torch.float32, device="cuda"
+            )
+            self.att_out_approx = att_out_approx
+            self.mid_out = mid_out
+            self.mid_o_logexpsum = mid_o_logexpsum
+
+        else:
+            start_loc = attn_logits = max_seq_len = min_seq_len = None
+            prefix_lens = forward_batch.extend_prefix_lens
+            max_extend_len = torch.max(forward_batch.seq_lens - prefix_lens).item()
+            ds_req_to_token = None
+
+        self.forward_metadata = (
+            start_loc,
+            attn_logits,
+            max_seq_len,
+            min_seq_len,
+            max_extend_len,
+            ds_req_to_token,
+        )
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        k_label = torch.gather(
+            k,
+            2,
+            self.sorted_channels[layer.layer_id]
+            .unsqueeze(0)
+            .expand(k.shape[0], -1, -1),
+        )
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
+
+        (
+            start_loc,
+            attn_logits,
+            max_seq_len,
+            min_seq_len,
+            max_extend_len,
+            ds_req_to_token,
+        ) = self.forward_metadata
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k.contiguous(),
+            v.contiguous(),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            forward_batch.extend_seq_lens,
+            forward_batch.extend_start_loc,
+            max_extend_len,
+            layer.scaling,
+            layer.logit_cap,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        # TODO: Add min seqlen
+        (
+            start_loc,
+            attn_logits,
+            max_seq_len,
+            min_seq_len,
+            max_extend_len,
+            ds_req_to_token,
+        ) = self.forward_metadata
+
+        k_label = torch.gather(
+            k,
+            2,
+            self.sorted_channels[layer.layer_id]
+            .unsqueeze(0)
+            .expand(k.shape[0], -1, -1),
+        )
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
+
+        # NOTE(Andy) shouldn't be used when max_len_in_batch < heavy_token_num
+        #            and set a minimum value for sparse_decode
+        if (
+            min_seq_len < self.heavy_token_num
+            or max_seq_len < self.sparse_decode_thresold
+        ):
+            self.decode_attention_fwd(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+                forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+                o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                forward_batch.req_to_token_pool.req_to_token,
+                forward_batch.req_pool_indices,
+                start_loc,
+                forward_batch.seq_lens,
+                attn_logits,
+                max_seq_len,
+                layer.scaling,
+                layer.logit_cap,
+            )
+        else:
+            # TODO(Andy): indexing with torch.gather or torch.index_select or customized kernel
+            q_label = torch.gather(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                2,
+                self.sorted_channels[layer.layer_id]
+                .unsqueeze(0)
+                .expand(q.shape[0], -1, -1),
+            )
+            self.decode_sparse_attention_fwd(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+                forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+                o.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                q_label,
+                forward_batch.token_to_kv_pool.get_label_buffer(layer.layer_id),
+                ds_req_to_token,
+                forward_batch.seq_lens,
+                max_seq_len,
+                layer.scaling,
+                layer.logit_cap,
+                self.heavy_token_num,
+                self.att_out_approx,
+                self.mid_out,
+                self.mid_o_logexpsum,
+                self.BLOCK_SEQ,
+            )
+
+        return o
diff --git a/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py b/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py
new file mode 100644
index 000000000..84876b438
--- /dev/null
+++ b/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py
@@ -0,0 +1,1700 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Attention layer with Dual chunk flash attention and sparse attention.
+"""
+import functools
+import logging
+import math
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+from sgl_kernel.sparse_flash_attn import (
+    convert_vertical_slash_indexes,
+    convert_vertical_slash_indexes_mergehead,
+    sparse_attn_func,
+)
+
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionMetadata
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DualChunkFlashAttentionMetadata:
+    """Metadata for FlashAttentionBackend.
+
+    NOTE: Any python object stored here is not updated when it is
+    cuda-graph replayed. If you have values that need to be changed
+    dynamically, it should be stored in tensor. The tensor has to be
+    updated from `CUDAGraphRunner.forward` API.
+    """
+
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens None if it is a decoding.
+    seq_lens: Optional[List[int]] = None
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor] = None
+    # Maximum sequence length among prefill batch. 0 if there are decoding
+    # requests only.
+    max_seq_len: int = None
+
+    # (batch_size,). The orig sequence length per sequence.
+    orig_seq_lens: Optional[List[int]] = None
+
+    # orig_seq_lens stored as a tensor.
+    orig_seq_lens_tensor: Optional[torch.Tensor] = None
+
+    # Block addresses per sequence. (Seq id -> list of physical block)
+    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
+    # in the kv cache. Each block can contain up to block_size tokens.
+    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
+    # captured.
+    block_tables: Optional[torch.Tensor] = None
+
+    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+    # the batch, used to index into subquery. E.g., if the subquery length
+    # is [4, 6], it is [0, 4, 10].
+    query_start_loc: Optional[torch.Tensor] = None
+    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+    # the batch, used to index into sequence. E.g., if the sequence length is
+    # [4, 6], it is [0, 4, 10].
+    seq_start_loc: Optional[torch.Tensor] = None
+
+    # Length scaling factor
+    scaling_factor: Optional[torch.Tensor] = None
+
+    # (batch_size,). Sequence lengths for intra attention.
+    seq_lens_intra: Optional[torch.Tensor] = None
+
+    # Max sequence length for intra attention.
+    max_seq_len_intra: Optional[int] = None
+
+    # (batch_size, num_blocks). Block table for intra attention.
+    block_tables_intra: Optional[torch.Tensor] = None
+
+    # (batch_size,). Sequence lengths for succ attention.
+    seq_lens_succ: Optional[torch.Tensor] = None
+
+    # Max sequence length for succ attention.
+    max_seq_len_succ: Optional[int] = None
+
+    # (batch_size, num_blocks). Block table for succ attention.
+    block_tables_succ: Optional[torch.Tensor] = None
+
+    # (batch_size,). Sequence lengths for inter attention.
+    seq_lens_inter: Optional[torch.Tensor] = None
+
+    # Max sequence length for inter attention.
+    max_seq_len_inter: Optional[int] = None
+
+
+class DualChunkFlashAttentionBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: "ModelRunner",
+    ) -> None:
+        self.forward_metadata: FlashAttentionMetadata = None
+        self.device = model_runner.device
+        self.max_context_len = model_runner.model_config.context_len
+        self.num_heads = model_runner.model_config.get_num_attention_heads(
+            model_runner.server_args.tp_size
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            model_runner.server_args.tp_size
+        )
+        self.head_size = model_runner.model_config.head_dim
+
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+        self.kv_cache_dtype_str = model_runner.server_args.kv_cache_dtype
+        self.page_size = model_runner.page_size
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        dual_chunk_attention_config = getattr(
+            model_runner.model_config.hf_config, "dual_chunk_attention_config", None
+        )
+        assert dual_chunk_attention_config is not None
+        self.chunk_size = dual_chunk_attention_config.get("chunk_size", 8192)
+        self.local_size = dual_chunk_attention_config.get("local_size", 1024)
+        self.original_max_position_embeddings = dual_chunk_attention_config.get(
+            "original_max_position_embeddings", 0
+        )
+        self.sparse_attention_config = dual_chunk_attention_config.get(
+            "sparse_attention_config", None
+        )
+        if not self.sparse_attention_config:
+            logger.warning_once(
+                "Sparse attention will not be enabled as "
+                "sparse attention config is not provided."
+            )
+        self.sparse_attention_enabled = dual_chunk_attention_config.get(
+            "sparse_attention_enabled", self.sparse_attention_config is not None
+        )
+        self.sparse_attention_threshold = dual_chunk_attention_config.get(
+            "sparse_attention_threshold", 32768
+        )
+        self.sparse_attention_last_q = dual_chunk_attention_config.get(
+            "sparse_attention_last_q", 64
+        )
+        self.dual_chunk_attention_config = dual_chunk_attention_config
+
+        if self.sparse_attention_enabled:
+            self.arange = torch.arange(self.sparse_attention_last_q, device="cuda")
+            self.last_q_mask = (
+                self.arange[None, None, :, None] >= self.arange[None, None, None, :]
+            )
+
+    @functools.lru_cache()
+    def get_sparse_attention_config(self, layer_idx) -> List[Dict[str, Any]]:
+        layer_sparse_attention_config = {
+            int(i): j for i, j in self.sparse_attention_config[layer_idx].items()
+        }
+        start_head = self.num_heads * get_tensor_model_parallel_rank()
+        end_head = start_head + self.num_heads
+        return [layer_sparse_attention_config[i] for i in range(start_head, end_head)]
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize forward metadata hence all layers in the forward pass can reuse it."""
+
+        forward_mode: ForwardMode = forward_batch.forward_mode
+        assert forward_mode.is_prefill() or forward_mode.is_decode()
+        batch_size = forward_batch.batch_size
+
+        metadata = DualChunkFlashAttentionMetadata()
+        metadata.seq_lens_tensor = forward_batch.seq_lens.to(torch.int32)
+        metadata.seq_lens = forward_batch.seq_lens.tolist()
+        metadata.max_seq_len = forward_batch.seq_lens.max().item()
+
+        metadata.orig_seq_lens_tensor = forward_batch.orig_seq_lens
+        metadata.orig_seq_lens = forward_batch.orig_seq_lens.tolist()
+
+        metadata.block_tables = forward_batch.req_to_token_pool.req_to_token[
+            forward_batch.req_pool_indices, : metadata.max_seq_len
+        ]
+        # Convert the block table to a strided format.
+        if self.page_size > 1:
+            strided_indices = torch.arange(
+                0, metadata.block_tables.shape[1], self.page_size, device=self.device
+            )
+            metadata.block_tables = (
+                metadata.block_tables[:, strided_indices] // self.page_size
+            )
+
+        metadata.query_start_loc = torch.zeros(
+            batch_size + 1, dtype=torch.int32, device=metadata.seq_lens_tensor.device
+        )
+        if forward_mode.is_prefill():
+            metadata.query_start_loc[1:] = torch.cumsum(
+                forward_batch.extend_seq_lens.to(torch.int32), dim=0, dtype=torch.int32
+            )
+        else:
+            metadata.query_start_loc[1:] = torch.cumsum(
+                torch.arange(
+                    batch_size,
+                    dtype=metadata.query_start_loc.dtype,
+                    device=metadata.query_start_loc.device,
+                ),
+                dim=0,
+                dtype=torch.int32,
+            )
+        metadata.seq_start_loc = torch.zeros(
+            batch_size + 1, dtype=torch.int32, device=metadata.seq_lens_tensor.device
+        )
+        metadata.seq_start_loc[1:] = torch.cumsum(
+            metadata.seq_lens_tensor, dim=0, dtype=torch.int32
+        )
+
+        if self.original_max_position_embeddings > 0:
+            if forward_mode.is_prefill():
+                metadata.scaling_factor = (
+                    0.1
+                    * torch.log(
+                        metadata.orig_seq_lens_tensor
+                        / self.original_max_position_embeddings
+                    )
+                    + 1.0
+                ).clip(min=1)
+            else:
+                metadata.scaling_factor = (
+                    0.1
+                    * torch.log(
+                        metadata.orig_seq_lens_tensor
+                        / self.original_max_position_embeddings
+                    )
+                    + 1.0
+                ).clip(min=1)
+
+        if forward_mode.is_decode():
+            cache_seq_lens = metadata.orig_seq_lens_tensor
+
+            chunk_len = self.chunk_size - self.local_size
+            chunk_num_curr = (cache_seq_lens - 1) // chunk_len
+
+            seq_lens_intra = cache_seq_lens - chunk_num_curr * chunk_len
+            max_seq_len_intra = seq_lens_intra.max().item()
+            metadata.seq_lens_intra = seq_lens_intra
+            metadata.max_seq_len_intra = max_seq_len_intra
+
+            block_tables_intra = torch.zeros(
+                batch_size,
+                (max_seq_len_intra - 1) // self.page_size + 1,
+                dtype=metadata.block_tables.dtype,
+                device=metadata.block_tables.device,
+            )
+            for i in range(batch_size):
+                st = chunk_num_curr[i] * chunk_len // self.page_size
+                ed = min(
+                    st + (max_seq_len_intra - 1) // self.page_size + 1,
+                    (cache_seq_lens[i] - 1) // self.page_size + 1,
+                )
+                block_tables_intra[i, : ed - st] = metadata.block_tables[i, st:ed]
+            metadata.block_tables_intra = block_tables_intra
+
+            metadata.seq_lens_succ = (
+                chunk_num_curr - (chunk_num_curr - 1).clip(min=0)
+            ) * chunk_len
+            metadata.max_seq_len_succ = metadata.seq_lens_succ.max().item()
+            if metadata.max_seq_len_succ:
+                block_tables_succ = torch.zeros(
+                    batch_size,
+                    (metadata.max_seq_len_succ - 1) // self.page_size + 1,
+                    dtype=metadata.block_tables.dtype,
+                    device=metadata.block_tables.device,
+                )
+                for i in range(batch_size):
+                    start = (
+                        (chunk_num_curr[i] - 1).clip(min=0)
+                        * chunk_len
+                        // self.page_size
+                    )
+                    end = min(
+                        start + (metadata.max_seq_len_succ - 1) // self.page_size + 1,
+                        (cache_seq_lens[i] - 1) // self.page_size + 1,
+                    )
+                    block_tables_succ[i, : end - start] = metadata.block_tables[
+                        i, start:end
+                    ]
+                metadata.block_tables_succ = block_tables_succ
+
+            metadata.seq_lens_inter = (chunk_num_curr - 1).clip(min=0) * chunk_len
+            metadata.max_seq_len_inter = metadata.seq_lens_inter.max().item()
+
+        self.forward_metadata = metadata
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: "RadixAttention",
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        (
+            query,
+            query_succ,
+            query_inter,
+            query_succ_critical,
+            query_inter_critical,
+        ) = torch.split(q, q.shape[-1] // 5, dim=-1)
+
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        query_succ = query_succ.view(-1, self.num_heads, self.head_size)
+        query_inter = query_inter.view(-1, self.num_heads, self.head_size)
+        query_succ_critical = query_succ_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        query_inter_critical = query_inter_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        key = k.view(-1, self.num_kv_heads, self.head_size)
+        value = v.view(-1, self.num_kv_heads, self.head_size)
+
+        # apply DCA scaling
+        if self.original_max_position_embeddings > 0:
+            assert metadata.scaling_factor is not None
+            assert metadata.query_start_loc is not None
+            assert metadata.orig_seq_lens is not None
+            current_start = 0
+            query_start_loc_cpu = metadata.query_start_loc.cpu()
+            for i in range(len(metadata.orig_seq_lens)):
+                current_end = (
+                    current_start
+                    + (query_start_loc_cpu[i + 1] - query_start_loc_cpu[i]).item()
+                )
+                key[current_start:current_end].mul_(metadata.scaling_factor[i])
+                current_start = current_end
+            assert current_end <= self.max_context_len
+
+        # Do multi-head attention
+        key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+            layer.layer_id
+        )
+        key_cache = key_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        )
+        value_cache = value_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        )
+
+        if key is not None and value is not None:
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    key,
+                    value,
+                    layer.k_scale,
+                    layer.v_scale,
+                )
+
+        if not save_kv_cache:
+            # profile run
+            o = flash_attn_varlen_func(
+                q=query,
+                k=key,
+                v=value,
+                cu_seqlens_q=metadata.seq_start_loc,
+                cu_seqlens_k=metadata.seq_start_loc,
+                max_seqlen_q=metadata.max_seq_len,
+                max_seqlen_k=metadata.max_seq_len,
+                softmax_scale=layer.scaling,
+                causal=True,
+            )
+        else:
+            # prefill/chunked-prefill
+            # get per layer sparse attention config
+            if self.sparse_attention_enabled:
+                self.layer_sparse_attention_config = self.get_sparse_attention_config(
+                    layer.layer_id
+                )
+            assert metadata.orig_seq_lens is not None
+            o = self._dual_chunk_flash_attn_prefill(
+                q=query,
+                q_succ=query_succ,
+                q_inter=query_inter,
+                q_succ_critical=query_succ_critical,
+                q_inter_critical=query_inter_critical,
+                k=key_cache,
+                v=value_cache,
+                cu_seqlens_q=metadata.query_start_loc,
+                cu_seqlens_k=metadata.seq_start_loc,
+                orig_seq_lens=metadata.orig_seq_lens,
+                scaling_factor=metadata.scaling_factor,
+                softmax_scale=layer.scaling,
+                causal=True,
+                window_size=(-1, -1),
+                block_table=metadata.block_tables,
+                chunk_size=self.chunk_size,
+                local_size=self.local_size,
+            )
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: "RadixAttention",
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ) -> torch.Tensor:
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        (
+            query,
+            query_succ,
+            query_inter,
+            query_succ_critical,
+            query_inter_critical,
+        ) = torch.split(q, q.shape[-1] // 5, dim=-1)
+
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        query_succ = query_succ.view(-1, self.num_heads, self.head_size)
+        query_inter = query_inter.view(-1, self.num_heads, self.head_size)
+        query_succ_critical = query_succ_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        query_inter_critical = query_inter_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        key = k.view(-1, self.num_kv_heads, self.head_size)
+        value = v.view(-1, self.num_kv_heads, self.head_size)
+
+        key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+            layer.layer_id
+        )
+        key_cache = key_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        )
+        value_cache = value_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        )
+
+        if key is not None and value is not None:
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    key,
+                    value,
+                    layer.k_scale,
+                    layer.v_scale,
+                )
+
+        # apply DCA scaling
+        if self.original_max_position_embeddings > 0:
+            assert metadata.scaling_factor is not None
+            scaling_factor = metadata.scaling_factor
+            key.mul_(scaling_factor.unsqueeze(-1).unsqueeze(-1))
+
+        o = self._dual_chunk_flash_attn_decoding(
+            query.unsqueeze(1),
+            query_succ.unsqueeze(1),
+            query_inter.unsqueeze(1),
+            key_cache,
+            value_cache,
+            block_table=metadata.block_tables,
+            cache_seqlens=metadata.seq_lens_tensor,
+            softmax_scale=layer.scaling,
+            causal=True,
+            chunk_size=self.chunk_size,
+            local_size=self.local_size,
+            original_max_position_embeddings=self.original_max_position_embeddings,
+            decode_meta=metadata,
+        ).squeeze(1)
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Initialize CUDA graph state for the attention backend.
+
+        Args:
+            max_bs (int): Maximum batch size to support in CUDA graphs
+
+        This creates fixed-size tensors that will be reused during CUDA graph replay
+        to avoid memory allocations.
+        """
+        self.decode_metadata = {
+            "seq_lens_tensor": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "orig_seq_lens_tensor": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "scaling_factor": torch.zeros(
+                max_bs, dtype=torch.float32, device=self.device
+            ),
+            "block_tables": torch.zeros(
+                max_bs,
+                (self.max_context_len - 1) // self.page_size + 1,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "block_tables_intra": torch.zeros(
+                max_bs,
+                (self.max_context_len - 1) // self.page_size + 1,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "seq_lens_intra": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "block_tables_succ": torch.zeros(
+                max_bs,
+                (self.max_context_len - 1) // self.page_size + 1,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "seq_lens_succ": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+            "seq_lens_inter": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[None],
+    ):
+        metadata = DualChunkFlashAttentionMetadata()
+
+        if forward_mode.is_decode_or_idle():
+            if self.original_max_position_embeddings > 0:
+                metadata.scaling_factor = self.decode_metadata["scaling_factor"][:bs]
+
+            metadata.seq_lens_tensor = self.decode_metadata["seq_lens_tensor"][:bs]
+            metadata.orig_seq_lens_tensor = self.decode_metadata[
+                "orig_seq_lens_tensor"
+            ][:bs]
+            metadata.max_seq_len = self.max_context_len
+            metadata.block_tables = self.decode_metadata["block_tables"][
+                req_pool_indices, :
+            ]
+
+            # intra
+            metadata.max_seq_len_intra = self.max_context_len
+            metadata.seq_lens_intra = self.decode_metadata["seq_lens_intra"][:bs]
+
+            metadata.block_tables_intra = self.decode_metadata["block_tables_intra"][
+                :bs, :
+            ]
+
+            # succ
+            metadata.seq_lens_succ = self.decode_metadata["seq_lens_succ"][:bs]
+            metadata.max_seq_len_succ = self.max_context_len
+
+            metadata.block_tables_succ = self.decode_metadata["block_tables_succ"][
+                :bs, :
+            ]
+
+            metadata.seq_lens_inter = self.decode_metadata["seq_lens_inter"][:bs]
+            metadata.max_seq_len_inter = self.max_context_len
+
+            self.decode_metadata[bs] = metadata
+
+        self.forward_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[None],
+        seq_lens_cpu: Optional[torch.Tensor],
+        out_cache_loc: torch.Tensor = None,
+    ):
+        """Initialize forward metadata for replaying CUDA graph."""
+        assert forward_mode.is_decode()
+        seq_lens = seq_lens[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+        metadata = self.decode_metadata[bs]
+
+        metadata.seq_lens_tensor.copy_(seq_lens.to(torch.int32))
+        metadata.seq_lens = seq_lens.tolist()
+        metadata.max_seq_len = seq_lens.max().item()
+
+        metadata.orig_seq_lens_tensor.copy_(seq_lens)
+        metadata.orig_seq_lens = seq_lens.tolist()
+
+        block_tables = self.req_to_token[req_pool_indices, : metadata.max_seq_len]
+        # Convert the block table to a strided format.
+        if self.page_size > 1:
+            strided_indices = torch.arange(
+                0, block_tables.shape[1], self.page_size, device=self.device
+            )
+            block_tables = block_tables[:, strided_indices] // self.page_size
+        metadata.block_tables.fill_(0)
+        metadata.block_tables[: block_tables.shape[0], : block_tables.shape[1]].copy_(
+            block_tables
+        )
+
+        if self.original_max_position_embeddings > 0:
+            scaling_factor = (
+                0.1
+                * torch.log(
+                    metadata.orig_seq_lens_tensor
+                    / self.original_max_position_embeddings
+                )
+                + 1.0
+            ).clip(min=1)
+            metadata.scaling_factor.copy_(scaling_factor)
+
+        cache_seq_lens = metadata.orig_seq_lens_tensor
+
+        chunk_len = self.chunk_size - self.local_size
+        chunk_num_curr = (cache_seq_lens - 1) // chunk_len
+
+        seq_lens_intra = cache_seq_lens - chunk_num_curr * chunk_len
+        max_seq_len_intra = seq_lens_intra.max().item()
+        metadata.seq_lens_intra.copy_(seq_lens_intra)
+        metadata.max_seq_len_intra = max_seq_len_intra
+
+        metadata.block_tables_intra.fill_(0)
+        for i in range(bs):
+            st = chunk_num_curr[i] * chunk_len // self.page_size
+            ed = min(
+                st + (max_seq_len_intra - 1) // self.page_size + 1,
+                (cache_seq_lens[i] - 1) // self.page_size + 1,
+            )
+            metadata.block_tables_intra[i, : ed - st] = metadata.block_tables[i, st:ed]
+
+        seq_lens_succ = (chunk_num_curr - (chunk_num_curr - 1).clip(min=0)) * chunk_len
+        metadata.seq_lens_succ.copy_(seq_lens_succ)
+        metadata.max_seq_len_succ = metadata.seq_lens_succ.max().item()
+        if metadata.max_seq_len_succ:
+            metadata.block_tables_succ.fill_(0)
+            for i in range(bs):
+                start = (
+                    (chunk_num_curr[i] - 1).clip(min=0) * chunk_len // self.page_size
+                )
+                end = min(
+                    start + (metadata.max_seq_len_succ - 1) // self.page_size + 1,
+                    (cache_seq_lens[i] - 1) // self.page_size + 1,
+                )
+                metadata.block_tables_succ[i, : end - start] = metadata.block_tables[
+                    i, start:end
+                ]
+
+        seq_lens_inter = (chunk_num_curr - 1).clip(min=0) * chunk_len
+        metadata.seq_lens_inter.copy_(seq_lens_inter)
+        metadata.max_seq_len_inter = metadata.seq_lens_inter.max().item()
+
+        self.forward_metadata = metadata
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for sequence length in CUDA graph."""
+        return 1
+
+    def _dual_chunk_flash_attn_prefill(
+        self,
+        q,
+        q_succ,
+        q_inter,
+        q_succ_critical,
+        q_inter_critical,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        orig_seq_lens: List[int],
+        scaling_factor: torch.Tensor,
+        softmax_scale: float,
+        causal: Optional[bool] = True,
+        window_size: Tuple[int, int] = (-1, -1),
+        block_table: Optional[torch.Tensor] = None,
+        chunk_size: int = 8192,
+        local_size: int = 1024,
+    ):
+        if not causal:
+            raise ValueError("Dual Chunk Attention does not support causal=False")
+        if window_size != (-1, -1):
+            raise ValueError("Dual Chunk Attention does not support window_size")
+
+        cu_seqlens_q_cpu = cu_seqlens_q.cpu().tolist()
+        cu_seqlens_k_cpu = cu_seqlens_k.cpu().tolist()
+        all_outputs = []
+
+        for i in range(0, len(cu_seqlens_q_cpu) - 1):
+            qs = cu_seqlens_q_cpu[i]
+            qe = cu_seqlens_q_cpu[i : i + 2][-1]
+            ks = cu_seqlens_k_cpu[i]
+            ke = cu_seqlens_k_cpu[i : i + 2][-1]
+
+            current_q = q[qs:qe]
+            current_q_succ = q_succ[qs:qe]
+            current_q_inter = q_inter[qs:qe]
+            current_q_succ_critical = q_succ_critical[qs:qe]
+            current_q_inter_critical = q_inter_critical[qs:qe]
+
+            if block_table is None:
+                current_k = k[ks:ke]
+                current_v = v[ks:ke]
+                current_block_table = None
+                current_orig_seq_len = orig_seq_lens[i]
+            else:
+                current_block_table = block_table[i]
+                current_orig_seq_len = orig_seq_lens[i]
+                current_k = k
+                current_v = v
+            sparse_attn_enabled = (
+                self.sparse_attention_enabled
+                and current_orig_seq_len > self.sparse_attention_threshold
+            )
+
+            if current_q.shape[0] == 0:
+                continue
+
+            if current_k.shape[0] == 0:
+                all_outputs.append(
+                    torch.zeros(
+                        (current_q.shape[0], current_q.shape[1], v.shape[2]),
+                        device=q.device,
+                        dtype=q.dtype,
+                    )
+                )
+                continue
+
+            current_output = torch.empty_like(current_q)
+            group_size = int(current_q.size(-2) / current_k.size(-2))
+
+            if sparse_attn_enabled:
+                num_device_q_heads = current_q.size(-2)
+                heads_vertical_size = torch.empty(
+                    size=(num_device_q_heads,), dtype=torch.int32
+                )
+                heads_slash_size = torch.empty(
+                    size=(num_device_q_heads,), dtype=torch.int32
+                )
+                for head_id in range(current_q.size(-2)):
+                    (
+                        ty,
+                        vertical_size,
+                        slash_size,
+                        _,
+                    ) = self.layer_sparse_attention_config[head_id]
+                    assert ty == "vertical_and_slash", "only support slash mode"
+
+                    if vertical_size == 30:
+                        vertical_size += 100
+                    heads_vertical_size[head_id] = vertical_size
+                    heads_slash_size[head_id] = slash_size
+
+                current_output = self._dual_chunk_flash_attn_prefill_func(
+                    current_q,  # allheads
+                    current_q_succ,
+                    current_q_inter,
+                    current_q_succ_critical,
+                    current_q_inter_critical,
+                    current_k,
+                    current_v,
+                    current_block_table,
+                    softmax_scale,
+                    chunk_size,
+                    local_size,
+                    scaling_factor[i].item(),
+                    ke - ks,
+                    sparse_attn_enabled=sparse_attn_enabled,
+                    heads_vertical_size=heads_vertical_size,
+                    heads_slash_size=heads_slash_size,
+                    group_size=group_size,
+                )
+            else:
+                for head_id in range(current_q.size(-2)):
+                    # (seq_len, num_heads, head_size)
+                    current_q_head = current_q[:, head_id, :].unsqueeze(1)
+                    current_q_succ_head = current_q_succ[:, head_id, :].unsqueeze(1)
+                    current_q_inter_head = current_q_inter[:, head_id, :].unsqueeze(1)
+                    current_q_succ_head_critical = current_q_succ_critical[
+                        :, head_id, :
+                    ].unsqueeze(1)
+                    current_q_inter_head_critical = current_q_inter_critical[
+                        :, head_id, :
+                    ].unsqueeze(1)
+                    if block_table is not None:
+                        current_k_head = current_k[
+                            ..., head_id // group_size, :
+                        ].unsqueeze(2)
+                        current_v_head = current_v[
+                            ..., head_id // group_size, :
+                        ].unsqueeze(2)
+
+                    else:
+                        current_k_head = current_k[:, head_id, :].unsqueeze(1)
+                        current_v_head = current_v[:, head_id, :].unsqueeze(1)
+
+                    current_out = self._dual_chunk_flash_attn_prefill_func(
+                        current_q_head,
+                        current_q_succ_head,
+                        current_q_inter_head,
+                        current_q_succ_head_critical,
+                        current_q_inter_head_critical,
+                        current_k_head,
+                        current_v_head,
+                        current_block_table,
+                        softmax_scale,
+                        chunk_size,
+                        local_size,
+                        scaling_factor[i].item(),
+                        ke - ks,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                    current_output[:, head_id : head_id + 1, :] = current_out
+            all_outputs.append(current_output)
+        return torch.cat(all_outputs, dim=0)
+
+    def _dual_chunk_flash_attn_prefill_func(
+        self,
+        q,
+        q_succ,
+        q_inter,
+        q_succ_critical,
+        q_inter_critical,
+        k,
+        v,
+        block_table,
+        softmax_scale: float,
+        chunk_size: int,
+        local_size: int,
+        scaling_factor: float,
+        k_length: int,
+        sparse_attn_enabled: Optional[bool] = True,
+        heads_vertical_size=None,
+        heads_slash_size=None,
+        group_size=None,
+    ):
+        flash_results = []
+        chunk_len = chunk_size - local_size
+
+        if block_table is not None:
+            block_size = v.shape[1]
+            if chunk_len % block_size != 0:
+                raise ValueError("chunk_len must be divisible by block_size.")
+        else:
+            block_size = 1
+
+        if self.original_max_position_embeddings > 0:
+            softmax_scale = softmax_scale * scaling_factor
+
+        begin = k_length - q.shape[0]
+        while begin < k_length:
+            flash_per_chunk = []
+
+            prev_chunk_end_pos = (begin // chunk_len) * chunk_len
+            next_chunk_end_pos = prev_chunk_end_pos + chunk_len
+            end = min(next_chunk_end_pos, k_length)
+            qbegin = begin - (k_length - q.shape[0])
+            qend = end - (k_length - q.shape[0])
+
+            qk_chunks = []
+            q_states_intra = q[qbegin:qend]
+            # choose critical token
+            if block_table is not None:
+                block_tables_intra = _get_block(
+                    block_table, block_size, prev_chunk_end_pos, end
+                )
+                k_states_intra = k[block_tables_intra].view(-1, *k.shape[-2:])[
+                    : (end - prev_chunk_end_pos)
+                ]
+                v_states_intra = v[block_tables_intra].view(-1, *v.shape[-2:])[
+                    : (end - prev_chunk_end_pos)
+                ]
+            else:
+                block_tables_intra = None
+                k_states_intra = k[prev_chunk_end_pos:end]
+                v_states_intra = v[prev_chunk_end_pos:end]
+
+            if sparse_attn_enabled:
+                last_q_size = min(qend - qbegin, self.sparse_attention_last_q)
+                _, num_device_k_heads, head_dim = k_states_intra.shape
+                k_states_intra = (
+                    k_states_intra.unsqueeze(2)
+                    .repeat(1, 1, group_size, 1)
+                    .reshape(-1, num_device_k_heads * group_size, head_dim)
+                )
+                v_states_intra = (
+                    v_states_intra.unsqueeze(2)
+                    .repeat(1, 1, group_size, 1)
+                    .reshape(-1, num_device_k_heads * group_size, head_dim)
+                )
+                qk_chunks.append(
+                    (q_states_intra.transpose(0, 1)[:, -last_q_size:] * softmax_scale)
+                    @ k_states_intra.permute(1, 2, 0)
+                )
+
+            if prev_chunk_end_pos - chunk_len >= 0:
+                q_states_succ = q_succ[qbegin:qend]
+                q_states_succ_critical = q_succ_critical[qbegin:qend]
+                if block_table is not None:
+                    block_tables_succ = _get_block(
+                        block_table,
+                        block_size,
+                        prev_chunk_end_pos - chunk_len,
+                        prev_chunk_end_pos,
+                    )
+                    k_states_succ = k[block_tables_succ].view(-1, *k.shape[-2:])[
+                        :chunk_len
+                    ]
+                    v_states_succ = v[block_tables_succ].view(-1, *v.shape[-2:])[
+                        :chunk_len
+                    ]
+                else:
+                    k_states_succ = k[
+                        prev_chunk_end_pos - chunk_len : prev_chunk_end_pos
+                    ]
+                    v_states_succ = v[
+                        prev_chunk_end_pos - chunk_len : prev_chunk_end_pos
+                    ]
+
+                if sparse_attn_enabled:
+                    k_states_succ = (
+                        k_states_succ.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    v_states_succ = (
+                        v_states_succ.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    qk_chunks.append(
+                        (
+                            q_states_succ_critical.transpose(0, 1)[:, -last_q_size:]
+                            * softmax_scale
+                        )
+                        @ k_states_succ.permute(1, 2, 0)
+                    )
+
+            if prev_chunk_end_pos - chunk_len * 2 >= 0:
+                q_states_inter = q_inter[qbegin:qend]
+                q_states_inter_critical = q_inter_critical[qbegin:qend]
+                if block_table is not None:
+                    block_tables_inter = _get_block(
+                        block_table, block_size, 0, prev_chunk_end_pos - chunk_len
+                    )
+                    k_states_inter = k[block_tables_inter].view(-1, *k.shape[-2:])[
+                        : (prev_chunk_end_pos - chunk_len)
+                    ]
+                    v_states_inter = v[block_tables_inter].view(-1, *v.shape[-2:])[
+                        : (prev_chunk_end_pos - chunk_len)
+                    ]
+                else:
+                    k_states_inter = k[: prev_chunk_end_pos - chunk_len]
+                    v_states_inter = v[: prev_chunk_end_pos - chunk_len]
+
+                if sparse_attn_enabled:
+                    k_states_inter = (
+                        k_states_inter.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    v_states_inter = (
+                        v_states_inter.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    qk_chunks.append(
+                        (
+                            q_states_inter_critical.transpose(0, 1)[:, -last_q_size:]
+                            * softmax_scale
+                        )
+                        @ k_states_inter.permute(1, 2, 0)
+                    )
+
+            if sparse_attn_enabled:
+                reversed_qk = qk_chunks[::-1]
+                qk = torch.cat(reversed_qk, dim=-1)
+
+                qk[:, :, -last_q_size:] = torch.where(
+                    self.last_q_mask[..., -last_q_size:, -last_q_size:].to(qk.device),
+                    qk[:, :, -last_q_size:],
+                    -torch.inf,
+                )
+                qk = F.softmax(qk, dim=-1, dtype=torch.float32)
+
+                vertical = qk.sum(-2, keepdim=True)
+                vertical[..., :30] = torch.inf
+
+                # Avoid sorting by using the min/max ints to fill the indexer
+                # buffers.
+                int32_max = torch.iinfo(torch.int32).max
+                int32_min = torch.iinfo(torch.int32).min
+                n_heads = qk.size()[0]
+                max_slash_topk = torch.max(heads_slash_size).item()
+                max_vertical_topk = torch.max(heads_vertical_size).item()
+                # store each head's slash topk, vertical topk
+                vertical = vertical.reshape((n_heads, -1))
+                # prevent out of range when prompt size < max_vertical_topk
+                max_vertical_topk = min(vertical.shape[-1], max_vertical_topk)
+                vertical_topk_buffer = torch.topk(
+                    vertical, max_vertical_topk, -1
+                ).indices
+                slash_topk_buffer = torch.empty(
+                    size=(n_heads, max_slash_topk), dtype=torch.int64, device=qk.device
+                )
+                for head_i in range(n_heads):
+                    #  (nqheads=1, lastq, k_len)
+                    head_score = qk[head_i : head_i + 1, :, :]
+                    slash_scores = _sum_all_diagonal_matrix(head_score)
+                    if head_score.size(1) != 1:
+                        # drop right up corner
+                        slash_scores = slash_scores[..., : -last_q_size + 1]
+                    slash_scores[..., -100:] = torch.inf
+
+                    head_slash_size = heads_slash_size[head_i]
+                    head_slash_size = min(head_slash_size, vertical.size(-1))
+                    slash_topk = torch.topk(slash_scores, head_slash_size, -1).indices
+                    # （nheads, max_topk）
+                    slash_topk_buffer[head_i, :head_slash_size] = slash_topk
+
+                    # reset heads topk
+                    heads_slash_size[head_i] = head_slash_size
+                    heads_vertical_size[head_i] = min(
+                        heads_vertical_size[head_i], max_vertical_topk
+                    )
+
+                # store
+                vertical_buffer = torch.full(
+                    (n_heads, max_vertical_topk),
+                    int32_max,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                slash_buffer = torch.full(
+                    (n_heads, max_slash_topk),
+                    int32_min,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                succ_vertical_buffer = torch.full(
+                    (n_heads, max_vertical_topk),
+                    int32_max,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                succ_slash_buffer = torch.full(
+                    (n_heads, max_slash_topk),
+                    int32_min,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                inter_vertical_buffer = torch.full(
+                    (n_heads, max_vertical_topk),
+                    int32_max,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                inter_slash_buffer = torch.full(
+                    (n_heads, max_slash_topk),
+                    int32_min,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+
+                vertical_size_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                slash_sizes_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                succ_vertical_size_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                succ_slash_sizes_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                inter_vertical_size_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                inter_slash_sizes_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+
+                for head_i in range(n_heads):
+                    vertical_topk = vertical_topk_buffer[
+                        head_i, : heads_vertical_size[head_i]
+                    ]
+                    # intra
+                    intra_vertical_indices = (
+                        vertical_topk[vertical_topk >= prev_chunk_end_pos]
+                        - prev_chunk_end_pos
+                    )
+                    if intra_vertical_indices.nelement() == 0:
+                        intra_vertical_indices = torch.cat(
+                            [
+                                intra_vertical_indices,
+                                torch.arange(
+                                    0,
+                                    k_states_intra.size(0),
+                                    max(1, k_states_intra.size(0) / 5),
+                                    dtype=torch.int32,
+                                    device=intra_vertical_indices.device,
+                                ),
+                            ]
+                        )
+                    slash_topk = slash_topk_buffer[head_i, : heads_slash_size[head_i]]
+                    intra_slash_indices = (qk.size(-1) - 1) - slash_topk[
+                        slash_topk >= prev_chunk_end_pos
+                    ]
+                    # fill buffer
+                    v_count = intra_vertical_indices.nelement()
+                    s_count = intra_slash_indices.nelement()
+                    vertical_size_buffer[head_i] = v_count
+                    slash_sizes_buffer[head_i] = s_count
+                    vertical_buffer[head_i, :v_count].copy_(intra_vertical_indices)
+                    slash_buffer[head_i, :s_count].copy_(intra_slash_indices)
+                    # succ
+                    if prev_chunk_end_pos - chunk_len >= 0:
+                        succ_vertical_indices = vertical_topk[
+                            (vertical_topk < prev_chunk_end_pos)
+                            & (vertical_topk >= prev_chunk_end_pos - chunk_len)
+                        ] - (prev_chunk_end_pos - chunk_len)
+                        # TODO: support no vertical
+                        if succ_vertical_indices.nelement() == 0:
+                            succ_vertical_indices = torch.cat(
+                                [
+                                    succ_vertical_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_succ.size(0),
+                                        max(1, k_states_succ.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        succ_slash_indices = (
+                            prev_chunk_end_pos + (qend - qbegin) - 1
+                        ) - slash_topk[
+                            (
+                                (slash_topk >= (prev_chunk_end_pos - chunk_len))
+                                & (slash_topk < (prev_chunk_end_pos + (qend - qbegin)))
+                            )
+                        ]
+                        if succ_slash_indices.nelement() == 0:
+                            succ_slash_indices = torch.cat(
+                                [
+                                    succ_slash_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_succ.size(0),
+                                        max(1, k_states_succ.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        # fill buffer
+                        v_count = succ_vertical_indices.nelement()
+                        s_count = succ_slash_indices.nelement()
+                        succ_vertical_size_buffer[head_i] = v_count
+                        succ_slash_sizes_buffer[head_i] = s_count
+                        succ_vertical_buffer[head_i, :v_count].copy_(
+                            succ_vertical_indices
+                        )
+                        succ_slash_buffer[head_i, :s_count].copy_(succ_slash_indices)
+
+                    if prev_chunk_end_pos - 2 * chunk_len >= 0:
+                        inter_vertical_indices = vertical_topk[
+                            vertical_topk < prev_chunk_end_pos - chunk_len
+                        ]
+
+                        if inter_vertical_indices.nelement() == 0:
+                            inter_vertical_indices = torch.cat(
+                                [
+                                    inter_vertical_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_inter.size(0),
+                                        max(1, k_states_inter.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        inter_slash_indices = (
+                            prev_chunk_end_pos - chunk_len + (qend - qbegin) - 1
+                        ) - slash_topk[
+                            slash_topk
+                            < (prev_chunk_end_pos - chunk_len + (qend - qbegin))
+                        ]
+                        if inter_slash_indices.nelement() == 0:
+                            inter_slash_indices = torch.cat(
+                                [
+                                    inter_slash_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_inter.size(0),
+                                        max(1, k_states_inter.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        # fill buffer
+                        v_count = inter_vertical_indices.nelement()
+                        s_count = inter_slash_indices.nelement()
+                        inter_vertical_size_buffer[head_i] = v_count
+                        inter_slash_sizes_buffer[head_i] = s_count
+                        inter_vertical_buffer[head_i, :v_count].copy_(
+                            inter_vertical_indices
+                        )
+                        inter_slash_buffer[head_i, :s_count].copy_(inter_slash_indices)
+            else:
+                intra_vertical_indices, intra_slash_indices = None, None
+                succ_vertical_indices, succ_slash_indices = None, None
+                inter_vertical_indices, inter_slash_indices = None, None
+
+            if sparse_attn_enabled:
+                flash_result = self._do_flash_attn(
+                    q_states_intra,
+                    k_states_intra,
+                    v_states_intra,
+                    softmax_scale=softmax_scale,
+                    causal=True,
+                    stage="intra",
+                    vertical_indices=vertical_buffer,
+                    slash_indices=slash_buffer,
+                    vertical_indices_count=vertical_size_buffer,
+                    slash_indices_count=slash_sizes_buffer,
+                    mergehead_softmax_scale=softmax_scale,
+                    sparse_attn_enabled=sparse_attn_enabled,
+                )
+            else:
+                flash_result = self._do_flash_attn(
+                    q_states_intra,
+                    k_states_intra,
+                    v_states_intra,
+                    softmax_scale=softmax_scale,
+                    causal=True,
+                    stage="intra",
+                    vertical_indices=intra_vertical_indices,
+                    slash_indices=intra_slash_indices,
+                    sparse_attn_enabled=sparse_attn_enabled,
+                )
+            flash_per_chunk.append(flash_result)
+
+            if prev_chunk_end_pos - chunk_len >= 0:
+                if sparse_attn_enabled:
+                    flash_result = self._do_flash_attn(
+                        q_states_succ,
+                        k_states_succ,
+                        v_states_succ,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="succ",
+                        vertical_indices=succ_vertical_buffer,
+                        slash_indices=succ_slash_buffer,
+                        vertical_indices_count=succ_vertical_size_buffer,
+                        slash_indices_count=succ_slash_sizes_buffer,
+                        mergehead_softmax_scale=softmax_scale,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                else:
+                    flash_result = self._do_flash_attn(
+                        q_states_succ,
+                        k_states_succ,
+                        v_states_succ,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="succ",
+                        vertical_indices=succ_vertical_indices,
+                        slash_indices=succ_slash_indices,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                flash_per_chunk.append(flash_result)
+
+            if prev_chunk_end_pos - chunk_len * 2 >= 0:
+                if sparse_attn_enabled:
+                    flash_result = self._do_flash_attn(
+                        q_states_inter,
+                        k_states_inter,
+                        v_states_inter,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="inter",
+                        vertical_indices=inter_vertical_buffer,
+                        slash_indices=inter_slash_buffer,
+                        vertical_indices_count=inter_vertical_size_buffer,
+                        slash_indices_count=inter_slash_sizes_buffer,
+                        mergehead_softmax_scale=softmax_scale,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                else:
+                    flash_result = self._do_flash_attn(
+                        q_states_inter,
+                        k_states_inter,
+                        v_states_inter,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="inter",
+                        vertical_indices=inter_vertical_indices,
+                        slash_indices=inter_slash_indices,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                flash_per_chunk.append(flash_result)
+
+            flash_results.append(flash_per_chunk)
+            begin = end
+
+        attn_output = self._merge_attn_outputs(flash_results)
+        del flash_results
+        return attn_output
+
+    def _do_flash_attn(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        softmax_scale: float,
+        causal: bool = True,
+        max_seqlen_k: Optional[int] = None,
+        stage: str = "intra",
+        vertical_indices: Optional[torch.Tensor] = None,
+        slash_indices: Optional[torch.Tensor] = None,
+        vertical_indices_count: Optional[torch.Tensor] = None,
+        slash_indices_count: Optional[torch.Tensor] = None,
+        mergehead_softmax_scale: Optional[float] = None,
+        sparse_attn_enabled: Optional[bool] = False,
+    ):
+        if max_seqlen_k is None:
+            max_seqlen_k = key_states.shape[0]
+
+        q_len = query_states.shape[0]
+        q_heads = query_states.shape[1]
+        h_dim = query_states.shape[-1]
+
+        if sparse_attn_enabled:
+            assert slash_indices is not None
+            if stage == "intra":
+                assert causal
+            else:
+                assert not causal
+
+            query_states = query_states.unsqueeze(0).transpose(1, 2)
+            key_states = key_states.unsqueeze(0).transpose(1, 2)
+            value_states = value_states.unsqueeze(0).transpose(1, 2)
+
+            q = query_states
+            k = key_states
+            v = value_states
+
+            if vertical_indices_count is not None and slash_indices_count is not None:
+                assert mergehead_softmax_scale is not None
+
+                res, s_lse = _vertical_slash_sparse_attention(
+                    q,
+                    k,
+                    v,
+                    vertical_indices,
+                    slash_indices,
+                    mergehead_softmax_scale,
+                    causal=causal,
+                    stage=stage,
+                    vertical_indices_count=vertical_indices_count,
+                    slash_indices_count=slash_indices_count,
+                )
+                res = res.view(q_heads, q_len, h_dim).transpose(
+                    0, 1
+                )  # (qlen,nhead,h_dim)
+                s_lse = (
+                    s_lse.view(q_heads, q_len, 1).squeeze(-1).unsqueeze(0).float()
+                )  # (1, nhead,qlen)
+            else:
+                res, s_lse = _vertical_slash_sparse_attention(
+                    q,
+                    k,
+                    v,
+                    vertical_indices,
+                    slash_indices,
+                    softmax_scale,
+                    causal=causal,
+                    stage=stage,
+                )
+                res = res.view(q_len, q_heads, h_dim)
+                s_lse = s_lse.view(q_len, q_heads, 1).transpose(0, 2).float()
+            return res, s_lse
+
+        output, softmax_lse, *rest = flash_attn_varlen_func(
+            q=query_states,
+            k=key_states,
+            v=value_states,
+            softmax_scale=softmax_scale,
+            cu_seqlens_q=torch.tensor(
+                [0, query_states.shape[0]],
+                dtype=torch.int32,
+                device=query_states.device,
+            ),
+            max_seqlen_q=query_states.shape[0],
+            cu_seqlens_k=torch.tensor(
+                [0, max_seqlen_k], dtype=torch.int32, device=query_states.device
+            ),
+            max_seqlen_k=max_seqlen_k,
+            causal=causal,
+            return_softmax_lse=True,
+        )
+        softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0, 2).float()
+        return output, softmax_lse
+
+    def _merge_attn_outputs(
+        self,
+        flash_results: List[List[Tuple[torch.Tensor, torch.Tensor]]],
+        return_lse: Optional[bool] = False,
+    ) -> torch.Tensor:
+        attn_outputs_all = []
+        logits_all = []
+
+        for flash_per_chunk in flash_results:
+            if len(flash_per_chunk) == 1:
+                attn_outputs_all.append(flash_per_chunk[0][0])
+                if return_lse:
+                    logits_all.append(flash_per_chunk[0][1])
+                continue
+
+            attn_outputs = torch.stack(
+                [flash_attn_output[0] for flash_attn_output in flash_per_chunk]
+            )
+            logits = torch.stack(
+                [flash_attn_output[1] for flash_attn_output in flash_per_chunk]
+            )
+            logits = logits.to(torch.float32)
+
+            if return_lse:
+                max_val = torch.max(logits, dim=0).values
+                diff = torch.abs(logits[0] - logits[1])
+                log_sum_exp = max_val + torch.log1p(torch.exp(-diff))
+                logits_all.append(log_sum_exp)
+
+            max_logits = torch.max(logits, dim=0).values
+            stable_logits = logits - max_logits.unsqueeze(0)
+            lse_s = torch.exp(stable_logits).detach()
+            lse_sum = torch.sum(lse_s, dim=0)
+            lse_s /= lse_sum
+            attn_outputs *= lse_s.unsqueeze(-1).transpose(2, 3).squeeze(1)
+            attn_outputs_all.append(attn_outputs.sum(dim=0))
+
+        if return_lse:
+            return (torch.cat(attn_outputs_all, dim=0), torch.cat(logits_all, dim=-1))
+        else:
+            return torch.cat(attn_outputs_all, dim=0)
+
+    def _dual_chunk_flash_attn_decoding(
+        self,
+        query: torch.Tensor,
+        query_succ: torch.Tensor,
+        query_inter: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_table: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        softmax_scale: float,
+        causal: bool,
+        chunk_size: int,
+        local_size: int,
+        original_max_position_embeddings: int,
+        decode_meta: DualChunkFlashAttentionMetadata,
+    ):
+        if not causal:
+            raise ValueError("Dual Chunk Attention does not support causal=False")
+
+        block_size = value_cache.shape[1]
+        chunk_len = chunk_size - local_size
+        if chunk_len % block_size != 0:
+            raise ValueError("chunk_len must be divisible by block_size.")
+        if original_max_position_embeddings > 0:
+            assert decode_meta.scaling_factor is not None
+            scaling_factor = decode_meta.scaling_factor
+            query = (query * scaling_factor.view(-1, 1, 1, 1)).to(
+                query.dtype
+            )  # possible for numerical issue, need to fused in the kernel
+            query_succ = (query_succ * scaling_factor.view(-1, 1, 1, 1)).to(query.dtype)
+            query_inter = (query_inter * scaling_factor.view(-1, 1, 1, 1)).to(
+                query.dtype
+            )
+        outputs_list = []
+        softmax_lses_list = []
+
+        # intra-attention
+        intra_output, intra_softmax_lse = (
+            self._dual_chunk_flash_attn_decoding_with_exp_sums(
+                query,
+                key_cache,
+                value_cache,
+                decode_meta.block_tables_intra,
+                decode_meta.seq_lens_intra,
+                softmax_scale,
+                causal=False,
+            )
+        )
+        outputs_list.append(intra_output)
+        softmax_lses_list.append(intra_softmax_lse)
+
+        # succ-attention
+        if decode_meta.max_seq_len_succ:
+            succ_output, succ_softmax_lse = (
+                self._dual_chunk_flash_attn_decoding_with_exp_sums(
+                    query_succ,
+                    key_cache,
+                    value_cache,
+                    decode_meta.block_tables_succ,
+                    decode_meta.seq_lens_succ,
+                    softmax_scale,
+                    causal=False,
+                )
+            )
+            outputs_list.append(succ_output)
+            softmax_lses_list.append(succ_softmax_lse)
+
+        # inter-attention
+        if decode_meta.max_seq_len_inter:
+            inter_output, inter_softmax_lse = (
+                self._dual_chunk_flash_attn_decoding_with_exp_sums(
+                    query_inter,
+                    key_cache,
+                    value_cache,
+                    block_table[:, : decode_meta.max_seq_len_inter],
+                    decode_meta.seq_lens_inter,
+                    softmax_scale,
+                    causal=False,
+                )
+            )
+            outputs_list.append(inter_output)
+            softmax_lses_list.append(inter_softmax_lse)
+        outputs = torch.stack(outputs_list, dim=0)
+        del outputs_list
+        softmax_lses = torch.stack(softmax_lses_list, dim=0).to(torch.float32)
+        del softmax_lses_list
+        max_logits = torch.max(softmax_lses, dim=0).values
+        stable_logits = softmax_lses - max_logits.unsqueeze(0)
+        lse_s = torch.exp(stable_logits).detach()
+        lse_sum = torch.sum(lse_s, dim=0)
+        lse_s /= lse_sum
+        outputs *= lse_s.unsqueeze(-1).transpose(2, 3)
+        return outputs.sum(0)
+
+    def _dual_chunk_flash_attn_decoding_with_exp_sums(
+        self,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_table: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        softmax_scale: float,
+        causal: bool,
+    ):
+        out, softmax_lse, *rest_expand = flash_attn_with_kvcache(
+            q=query,
+            k_cache=key_cache,
+            v_cache=value_cache,
+            page_table=block_table,
+            cache_seqlens=cache_seqlens,
+            softmax_scale=softmax_scale,
+            causal=causal,
+            return_softmax_lse=True,
+        )
+        mask = cache_seqlens == 0
+        out[mask] = 0
+        softmax_lse[mask] = -float("inf")
+        return out, softmax_lse
+
+
+def _vertical_slash_sparse_attention(
+    query: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
+    key: torch.Tensor,  # [BATCH, N_HEADS, N_KV_CTX, D_HEAD]
+    value: torch.Tensor,  # [BATCH, N_HEADS, N_KV_CTX, D_HEAD]
+    v_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_V]
+    s_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_S]
+    softmax_scale: float,
+    causal: bool = True,
+    stage: str = "intra",
+    block_size_M: int = 64,
+    block_size_N: int = 64,
+    vertical_indices_count: torch.Tensor = None,  # [N_HEADS,]
+    slash_indices_count: torch.Tensor = None,
+):
+    if stage == "intra":
+        assert causal
+    else:
+        assert not causal
+
+    batch_size, num_heads, context_size, head_dim = query.shape
+    _, _, kv_seq_len, _ = key.shape
+
+    if head_dim not in [16, 32, 64, 128, 256, 512]:
+        target_dim = 2 ** math.ceil(math.log2(head_dim)) - head_dim
+        query = F.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0])
+        key = F.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0])
+        value = F.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0])
+
+    v_idx = (
+        v_idx.to(torch.int32)
+        .reshape((batch_size, num_heads, -1))
+        .sort(dim=-1, descending=False)[0]
+    )
+    s_idx = (
+        s_idx.to(torch.int32)
+        .reshape((batch_size, num_heads, -1))
+        .sort(dim=-1, descending=True)[0]
+    )
+    q_seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
+    kv_seqlens = torch.tensor([kv_seq_len], dtype=torch.int32, device=query.device)
+
+    if vertical_indices_count is not None and slash_indices_count is not None:
+        (
+            block_count,
+            block_offset,
+            column_count,
+            column_index,
+        ) = convert_vertical_slash_indexes_mergehead(
+            q_seqlens,
+            kv_seqlens,
+            v_idx,
+            s_idx,
+            vertical_indices_count,
+            slash_indices_count,
+            context_size,
+            block_size_M,
+            block_size_N,
+            causal,
+        )
+    else:
+        (
+            block_count,
+            block_offset,
+            column_count,
+            column_index,
+        ) = convert_vertical_slash_indexes(
+            q_seqlens,
+            kv_seqlens,
+            v_idx,
+            s_idx,
+            context_size,
+            block_size_M,
+            block_size_N,
+            causal,
+        )
+
+    q = query.transpose(1, 2).contiguous()
+    k = key.transpose(1, 2).contiguous()
+    v = value.transpose(1, 2).contiguous()
+    out, lse = sparse_attn_func(
+        q,
+        k,
+        v,
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        causal=causal,
+        softmax_scale=softmax_scale,
+        return_softmax_lse=True,
+    )
+    out = out.transpose(1, 2).contiguous()
+    softmax_lse = lse.reshape(*lse.shape, 1)
+    return (out[..., :context_size, :head_dim], softmax_lse[..., :context_size, :])
+
+
+def _sum_all_diagonal_matrix(mat: torch.tensor):
+    h, n, m = mat.shape
+    # Zero matrix used for padding
+    zero_mat = torch.zeros((h, n, n), device=mat.device)
+    # pads the matrix on left and right
+    mat_padded = torch.cat((zero_mat, mat, zero_mat), -1)
+    # Change the strides
+    mat_strided = mat_padded.as_strided(
+        (1, n, n + m), (n * (2 * n + m), 2 * n + m + 1, 1)
+    )
+    # Sums the resulting matrix's columns
+    sum_diags = torch.sum(mat_strided, 1)
+    return sum_diags[:, 1:]  # drop left bottom corner
+
+
+def _get_block(block_table: torch.Tensor, block_size: int, begin: int, end: int):
+    begin_block = begin // block_size
+    end_block = (end - 1) // block_size + 1
+    return block_table[begin_block:end_block]
diff --git a/python/sglang/srt/layers/attention/fla/chunk.py b/python/sglang/srt/layers/attention/fla/chunk.py
new file mode 100644
index 000000000..a48a9e649
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/chunk.py
@@ -0,0 +1,242 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/gated_delta_rule/chunk.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import warnings
+from typing import Optional
+
+import torch
+from einops import rearrange
+
+from sglang.srt.layers.attention.fla.chunk_delta_h import chunk_gated_delta_rule_fwd_h
+from sglang.srt.layers.attention.fla.chunk_o import chunk_fwd_o
+from sglang.srt.layers.attention.fla.chunk_scaled_dot_kkt import (
+    chunk_scaled_dot_kkt_fwd,
+)
+from sglang.srt.layers.attention.fla.cumsum import chunk_local_cumsum
+from sglang.srt.layers.attention.fla.l2norm import l2norm_fwd
+from sglang.srt.layers.attention.fla.solve_tril import solve_tril
+from sglang.srt.layers.attention.fla.utils import (
+    SUPPRESS_LEVEL,
+    autocast_custom_fwd,
+    input_guard,
+)
+from sglang.srt.layers.attention.fla.wy_fast import recompute_w_u_fwd
+
+
+def chunk_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    output_final_state: bool,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+):
+    g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
+    # obtain WY representation. u is actually the new v.
+    A = chunk_scaled_dot_kkt_fwd(
+        k=k, beta=beta, g_cumsum=g, cu_seqlens=cu_seqlens, output_dtype=torch.float32
+    )
+    A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+    w, u = recompute_w_u_fwd(
+        k=k,
+        v=v,
+        beta=beta,
+        A=A,
+        g_cumsum=g,
+        cu_seqlens=cu_seqlens,
+    )
+    h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
+        k=k,
+        w=w,
+        u=u,
+        g=g,
+        initial_state=initial_state,
+        output_final_state=output_final_state,
+        cu_seqlens=cu_seqlens,
+    )
+    o = chunk_fwd_o(
+        q=q,
+        k=k,
+        v=v_new,
+        h=h,
+        g=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+    )
+    if SUPPRESS_LEVEL < 3:
+        return g, o, A, final_state, None, None, None
+    elif SUPPRESS_LEVEL >= 3:
+        return g, o, A, final_state, w, h, v_new
+
+
+class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    @autocast_custom_fwd
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        output_final_state: bool,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
+        q_orig = q
+        k_orig = k
+
+        if use_qk_l2norm_in_kernel:
+            q = l2norm_fwd(q)
+            k = l2norm_fwd(k)
+
+        g, o, A, final_state, w, h, v_new = chunk_gated_delta_rule_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state=initial_state,
+            output_final_state=output_final_state,
+            cu_seqlens=cu_seqlens,
+        )
+        return o.to(q.dtype), final_state
+
+
+@torch.compiler.disable
+def chunk_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    output_final_state: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    head_first: bool = False,
+    use_qk_l2norm_in_kernel: bool = False,
+):
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        g (torch.Tensor):
+            (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        scale (Optional[int]):
+            Scale factor for the RetNet attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, H, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        output_final_state (Optional[bool]):
+            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+        head_first (Optional[bool]):
+            Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
+            Default: `False`.
+
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
+
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, K, V = 4, 2048, 4, 512, 512
+        >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda')
+        >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid()
+        >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda'))
+        >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda')
+        >>> o, ht = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    assert q.dtype == k.dtype == v.dtype
+    assert (
+        q.dtype != torch.float32
+    ), "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
+    assert (
+        len(beta.shape) == 3
+    ), "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+
+    if head_first:
+        raise DeprecationWarning(
+            "head_first is deprecated and will be removed in a future version. "
+            "Please use head_first=False for now instead."
+        )
+        q, k, v, beta, g = map(
+            lambda x: rearrange(x, "b h t ... -> b t h ..."), (q, k, v, beta, g)
+        )
+    # if not head_first and q.shape[1] < q.shape[2]:
+    #     warnings.warn(
+    #         f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
+    #         "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
+    #         "when head_first=False was specified. "
+    #         "Please verify your input tensor format matches the expected shape [B, T, H, ...]."
+    #     )
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing."
+            )
+        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
+            raise ValueError(
+                f"The number of initial states is expected to be equal to the number of input sequences, "
+                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
+            )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    o, final_state = ChunkGatedDeltaRuleFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        output_final_state,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+    )
+    if head_first:
+        o = rearrange(o, "b t h ... -> b h t ...")
+    return o, final_state
diff --git a/python/sglang/srt/layers/attention/fla/chunk_delta_h.py b/python/sglang/srt/layers/attention/fla/chunk_delta_h.py
new file mode 100644
index 000000000..8f4109abd
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/chunk_delta_h.py
@@ -0,0 +1,890 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/common/chunk_delta_h.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+# from typing import Optional, Tuple
+
+# import torch
+# import triton
+# import triton.language as tl
+
+# from sglang.srt.layers.attention.fla.index import (
+#     prepare_chunk_indices,
+#     prepare_chunk_offsets,
+# )
+# from sglang.srt.layers.attention.fla.op import exp, safe_exp
+# from sglang.srt.layers.attention.fla.utils import is_nvidia_hopper
+
+# NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+
+
+# @triton.heuristics(
+#     {
+#         "USE_G": lambda args: args["g"] is not None,
+#         "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+#         "STORE_FINAL_STATE": lambda args: args["ht"] is not None,
+#         "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None,
+#         "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+#     }
+# )
+# # @triton.autotune(
+# #     configs=[
+# #         triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages)
+# #         for num_warps in [2, 4]
+# #         for num_stages in [2, 3, 4]
+# #         for BV in [32, 64]
+# #     ],
+# #     key=["H", "K", "V", "BT", "USE_G"],
+# #     use_cuda_graph=use_cuda_graph,
+# # )
+# @triton.jit(do_not_specialize=["T"])
+# def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
+#     k,
+#     v,
+#     w,
+#     v_new,
+#     g,
+#     h,
+#     h0,
+#     ht,
+#     cu_seqlens,
+#     chunk_offsets,
+#     T,
+#     H: tl.constexpr,
+#     Hg: tl.constexpr,
+#     K: tl.constexpr,
+#     V: tl.constexpr,
+#     BT: tl.constexpr,
+#     BV: tl.constexpr,
+#     USE_G: tl.constexpr,
+#     USE_INITIAL_STATE: tl.constexpr,
+#     STORE_FINAL_STATE: tl.constexpr,
+#     SAVE_NEW_VALUE: tl.constexpr,
+#     IS_VARLEN: tl.constexpr,
+# ):
+#     i_v, i_nh = tl.program_id(0), tl.program_id(1)
+#     i_n, i_h = i_nh // H, i_nh % H
+#     if IS_VARLEN:
+#         bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+#             cu_seqlens + i_n + 1
+#         ).to(tl.int32)
+#         T = eos - bos
+#         NT = tl.cdiv(T, BT)
+#         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+#     else:
+#         bos, eos = i_n * T, i_n * T + T
+#         NT = tl.cdiv(T, BT)
+#         boh = i_n * NT
+
+#     # [BK, BV]
+#     b_h1 = tl.zeros([64, BV], dtype=tl.float32)
+#     if K > 64:
+#         b_h2 = tl.zeros([64, BV], dtype=tl.float32)
+#     if K > 128:
+#         b_h3 = tl.zeros([64, BV], dtype=tl.float32)
+#     if K > 192:
+#         b_h4 = tl.zeros([64, BV], dtype=tl.float32)
+
+#     # calculate offset
+#     h += (boh * H + i_h) * K * V
+#     v += (bos * H + i_h) * V
+#     k += (bos * Hg + i_h // (H // Hg)) * K
+#     w += (bos * H + i_h) * K
+#     if SAVE_NEW_VALUE:
+#         v_new += (bos * H + i_h) * V
+#     stride_v = H * V
+#     stride_h = H * K * V
+#     stride_k = Hg * K
+#     stride_w = H * K
+#     if USE_INITIAL_STATE:
+#         h0 = h0 + i_nh * K * V
+#     if STORE_FINAL_STATE:
+#         ht = ht + i_nh * K * V
+
+#     # load initial state
+#     if USE_INITIAL_STATE:
+#         p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+#         b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
+#         if K > 64:
+#             p_h0_2 = tl.make_block_ptr(
+#                 h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+#             )
+#             b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
+#         if K > 128:
+#             p_h0_3 = tl.make_block_ptr(
+#                 h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+#             )
+#             b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
+#         if K > 192:
+#             p_h0_4 = tl.make_block_ptr(
+#                 h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+#             )
+#             b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
+
+#     # main recurrence
+#     for i_t in range(NT):
+#         p_h1 = tl.make_block_ptr(
+#             h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)
+#         )
+#         tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 64:
+#             p_h2 = tl.make_block_ptr(
+#                 h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+#             )
+#             tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 128:
+#             p_h3 = tl.make_block_ptr(
+#                 h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+#             )
+#             tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 192:
+#             p_h4 = tl.make_block_ptr(
+#                 h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+#             )
+#             tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1))
+
+#         p_v = tl.make_block_ptr(
+#             v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+#         )
+#         p_v_new = (
+#             tl.make_block_ptr(
+#                 v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+#             )
+#             if SAVE_NEW_VALUE
+#             else None
+#         )
+#         b_v_new = tl.zeros([BT, BV], dtype=tl.float32)
+#         p_w = tl.make_block_ptr(
+#             w, (T, K), (stride_w, 1), (i_t * BT, 0), (BT, 64), (1, 0)
+#         )
+#         b_w = tl.load(p_w, boundary_check=(0, 1))
+#         b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype))
+#         if K > 64:
+#             p_w = tl.make_block_ptr(
+#                 w, (T, K), (stride_w, 1), (i_t * BT, 64), (BT, 64), (1, 0)
+#             )
+#             b_w = tl.load(p_w, boundary_check=(0, 1))
+#             b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype))
+#         if K > 128:
+#             p_w = tl.make_block_ptr(
+#                 w, (T, K), (stride_w, 1), (i_t * BT, 128), (BT, 64), (1, 0)
+#             )
+#             b_w = tl.load(p_w, boundary_check=(0, 1))
+#             b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype))
+#         if K > 192:
+#             p_w = tl.make_block_ptr(
+#                 w, (T, K), (stride_w, 1), (i_t * BT, 192), (BT, 64), (1, 0)
+#             )
+#             b_w = tl.load(p_w, boundary_check=(0, 1))
+#             b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype))
+#         b_v_new = -b_v_new + tl.load(p_v, boundary_check=(0, 1))
+
+#         if SAVE_NEW_VALUE:
+#             p_v_new = tl.make_block_ptr(
+#                 v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+#             )
+#             tl.store(
+#                 p_v_new, b_v_new.to(p_v_new.dtype.element_ty), boundary_check=(0, 1)
+#             )
+
+#         if USE_G:
+#             last_idx = min((i_t + 1) * BT, T) - 1
+#             b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+#             p_g = tl.make_block_ptr(
+#                 g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+#             )
+#             b_g = tl.load(p_g, boundary_check=(0,))
+#             b_v_new = b_v_new * safe_exp(b_g_last - b_g)[:, None]
+#             b_g_last = exp(b_g_last)
+#             b_h1 = b_h1 * b_g_last
+#             if K > 64:
+#                 b_h2 = b_h2 * b_g_last
+#             if K > 128:
+#                 b_h3 = b_h3 * b_g_last
+#             if K > 192:
+#                 b_h4 = b_h4 * b_g_last
+#         b_v_new = b_v_new.to(k.dtype.element_ty)
+#         p_k = tl.make_block_ptr(
+#             k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)
+#         )
+#         b_k = tl.load(p_k, boundary_check=(0, 1))
+#         b_h1 += tl.dot(b_k, b_v_new)
+#         if K > 64:
+#             p_k = tl.make_block_ptr(
+#                 k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)
+#             )
+#             b_k = tl.load(p_k, boundary_check=(0, 1))
+#             b_h2 += tl.dot(b_k, b_v_new)
+#         if K > 128:
+#             p_k = tl.make_block_ptr(
+#                 k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)
+#             )
+#             b_k = tl.load(p_k, boundary_check=(0, 1))
+#             b_h3 += tl.dot(b_k, b_v_new)
+#         if K > 192:
+#             p_k = tl.make_block_ptr(
+#                 k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)
+#             )
+#             b_k = tl.load(p_k, boundary_check=(0, 1))
+#             b_h4 += tl.dot(b_k, b_v_new)
+
+#     # epilogue
+#     if STORE_FINAL_STATE:
+#         p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+#         tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 64:
+#             p_ht = tl.make_block_ptr(
+#                 ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+#             )
+#             tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 128:
+#             p_ht = tl.make_block_ptr(
+#                 ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+#             )
+#             tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 192:
+#             p_ht = tl.make_block_ptr(
+#                 ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+#             )
+#             tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+# def chunk_gated_delta_rule_fwd_h(
+#     k: torch.Tensor,
+#     w: torch.Tensor,
+#     u: torch.Tensor,
+#     g: Optional[torch.Tensor] = None,
+#     initial_state: Optional[torch.Tensor] = None,
+#     output_final_state: bool = False,
+#     chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
+#     save_new_value: bool = True,
+#     cu_seqlens: Optional[torch.LongTensor] = None,
+# ) -> Tuple[torch.Tensor, torch.Tensor]:
+#     B, T, Hg, K, V = *k.shape, u.shape[-1]
+#     H = u.shape[-2]
+#     BT = chunk_size
+
+#     chunk_indices = (
+#         prepare_chunk_indices(cu_seqlens, chunk_size)
+#         if cu_seqlens is not None
+#         else None
+#     )
+#     # N: the actual number of sequences in the batch with either equal or variable lengths
+#     if cu_seqlens is None:
+#         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+#     else:
+#         N, NT, chunk_offsets = (
+#             len(cu_seqlens) - 1,
+#             len(chunk_indices),
+#             prepare_chunk_offsets(cu_seqlens, BT),
+#         )
+#     assert K <= 256, "current kernel does not support head dimension larger than 256."
+
+#     h = k.new_empty(B, NT, H, K, V)
+#     final_state = (
+#         k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+#     )
+
+#     v_new = torch.empty_like(u) if save_new_value else None
+
+#     def grid(meta):
+#         return (triton.cdiv(V, meta["BV"]), N * H)
+#     # if hasattr(torch.version, 'hip') and torch.version.hip is not None:
+#     # # HIP环境回退实现
+#     #     return h, v_new, final_state
+#     chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
+#         k=k,
+#         v=u,
+#         w=w,
+#         v_new=v_new,
+#         g=g,
+#         h=h,
+#         h0=initial_state,
+#         ht=final_state,
+#         cu_seqlens=cu_seqlens,
+#         chunk_offsets=chunk_offsets,
+#         T=T,
+#         H=H,
+#         Hg=Hg,
+#         K=K,
+#         V=V,
+#         BT=BT,
+#         BV=32,
+#         num_warps=4,
+#         num_stages=2,
+#     )
+#     return h, v_new, final_state
+
+
+from typing import Optional, Tuple
+import math
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import (
+    prepare_chunk_indices,
+    prepare_chunk_offsets,
+)
+from sglang.srt.layers.attention.fla.op import exp, safe_exp
+from sglang.srt.layers.attention.fla.utils import is_nvidia_hopper
+
+# 保守一些在 HIP/ROCm 上更稳；NVIDIA 可调大
+NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+
+
+@triton.heuristics(
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+        "STORE_FINAL_STATE": lambda args: args["ht"] is not None,
+        "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
+    k,            # [B, T, Hg, K]
+    v,            # [B, T, H,  V]
+    w,            # [B, T, H,  K]
+    v_new,        # [B, T, H,  V] or None
+    g,            # [B, T, H] or None
+    h,            # [B, NT, H, K, V]
+    h0,           # [B, H, K, V] or None
+    ht,           # [B, H, K, V] or None
+    cu_seqlens,   # [N+1] or None
+    chunk_offsets,# [N] or None
+    T,            # 全局 T（固定长时就是实际长度）
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,    # chunk size（通常 64）
+    BV: tl.constexpr,    # V 方向 tile 宽度（通常 32）
+    NT: tl.constexpr,    # host 计算的上界（chunk 数）
+    USE_G: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,
+    STORE_FINAL_STATE: tl.constexpr,
+    SAVE_NEW_VALUE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    # 程序实例：X 维为 V/BV，Y 维为 N*H
+    i_v, i_nh = tl.program_id(0), tl.program_id(1)
+    i_n = i_nh // H
+    i_h = i_nh % H
+
+    # 计算该序列 runtime 真实长度 TR、时间起点 bos、h 的 chunk 偏移 boh
+    if IS_VARLEN:
+        bos = tl.load(cu_seqlens + i_n).to(tl.int32)
+        eos = tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        TR = (eos - bos).to(tl.int32)
+        TR = tl.maximum(TR, 0)
+        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+    else:
+        bos = (i_n * T).to(tl.int32)
+        eos = (i_n * T + T).to(tl.int32)
+        TR = T.to(tl.int32)
+        boh = (i_n * NT).to(tl.int32)
+
+    # 预计算步长（以扁平指针计）
+    stride_h_t = H * K * V      # h 时间块步长
+    stride_k_hg = Hg * K        # k 的 Hg*K 步长
+    stride_w_hk = H * K         # w 的 H*K 步长
+    stride_v_hv = H * V         # v/v_new 的 H*V 步长
+
+    # 移动到当前 sequence/head 的基址
+    h = h + (boh * H + i_h) * (K * V)
+    v = v + (bos * stride_v_hv + i_h * V)
+    if SAVE_NEW_VALUE:
+        v_new = v_new + (bos * stride_v_hv + i_h * V)
+    k = k + (bos * stride_k_hg + (i_h // (H // Hg)) * K)
+    w = w + (bos * stride_w_hk + i_h * K)
+
+    # hidden state tiles
+    b_h1 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 64:
+        b_h2 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 128:
+        b_h3 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 192:
+        b_h4 = tl.zeros([64, BV], dtype=tl.float32)
+
+    # 初始状态
+    if USE_INITIAL_STATE:
+        h0 = h0 + i_nh * (K * V)
+        p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
+        if K > 64:
+            p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+            b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
+        if K > 128:
+            p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+            b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
+        if K > 192:
+            p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+            b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
+
+    # 最终状态写回基址
+    if STORE_FINAL_STATE:
+        ht = ht + i_nh * (K * V)
+
+    # 主循环（上界 NT）
+    for i_t in range(NT):
+        # 先把当前 hidden state 写到 h（合法，因为 h 的时间维是 NT 上界）
+        p_h1 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_h2 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1))
+        if K > 128:
+            p_h3 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1))
+        if K > 192:
+            p_h4 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1))
+
+        # 这个 chunk 是否有效（t0 >= TR 则整个 tile 越界 -> 跳过所有 block_ptr 操作）
+        t0 = i_t * BT
+        active = t0 < TR
+        if active:
+            # v_new = v - w @ h
+            # --- w @ h ---
+            b_v_new = tl.zeros([BT, BV], dtype=tl.float32)
+
+            # w 的 0..63 列
+            p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 0), (BT, 64), (1, 0))
+            b_w = tl.load(p_w, boundary_check=(0, 1))        # 不能使用 mask
+            b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype))
+
+            if K > 64:
+                p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 64), (BT, 64), (1, 0))
+                b_w = tl.load(p_w, boundary_check=(0, 1))
+                b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype))
+            if K > 128:
+                p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 128), (BT, 64), (1, 0))
+                b_w = tl.load(p_w, boundary_check=(0, 1))
+                b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype))
+            if K > 192:
+                p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 192), (BT, 64), (1, 0))
+                b_w = tl.load(p_w, boundary_check=(0, 1))
+                b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype))
+
+            # v - (w @ h)
+            p_v = tl.make_block_ptr(v, (TR, V), (stride_v_hv, 1), (t0, i_v * BV), (BT, BV), (1, 0))
+            b_v_chunk = tl.load(p_v, boundary_check=(0, 1))   # 不能使用 mask
+            b_v_new = -b_v_new + b_v_chunk
+
+            # 保存 v_new
+            if SAVE_NEW_VALUE:
+                p_vn = tl.make_block_ptr(v_new, (TR, V), (stride_v_hv, 1), (t0, i_v * BV), (BT, BV), (1, 0))
+                tl.store(p_vn, b_v_new.to(p_vn.dtype.element_ty), boundary_check=(0, 1))
+
+            # gating：对 g 用普通指针 + mask 读取（不是 block_ptr）
+            if USE_G:
+                offs_t = t0 + tl.arange(0, BT)                 # [BT]
+                t_mask = offs_t < TR                            # [BT]
+                g_base = g + bos * H + i_h
+                b_g = tl.load(g_base + offs_t * H, mask=t_mask, other=0.0)  # ✔ 这里不是 block_ptr，可用 mask
+
+                t_end = tl.minimum(t0 + BT, TR)
+                last_idx = tl.maximum(t_end - 1, 0).to(tl.int32)
+                b_g_last = tl.load(g_base + last_idx * H, mask=(TR > 0), other=0.0)
+
+                gating_factor = safe_exp(b_g_last - b_g)[:, None]
+                b_v_new = b_v_new * gating_factor
+
+                g_last_exp = exp(b_g_last)
+                b_h1 = b_h1 * g_last_exp
+                if K > 64:
+                    b_h2 = b_h2 * g_last_exp
+                if K > 128:
+                    b_h3 = b_h3 * g_last_exp
+                if K > 192:
+                    b_h4 = b_h4 * g_last_exp
+
+            # h += k^T @ v_new
+            b_v_new_cast = b_v_new.to(k.dtype.element_ty)
+            p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (0, t0), (64, BT), (0, 1))
+            b_k = tl.load(p_k, boundary_check=(0, 1))         # 不能使用 mask
+            b_h1 += tl.dot(b_k, b_v_new_cast)
+
+            if K > 64:
+                p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (64, t0), (64, BT), (0, 1))
+                b_k = tl.load(p_k, boundary_check=(0, 1))
+                b_h2 += tl.dot(b_k, b_v_new_cast)
+            if K > 128:
+                p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (128, t0), (64, BT), (0, 1))
+                b_k = tl.load(p_k, boundary_check=(0, 1))
+                b_h3 += tl.dot(b_k, b_v_new_cast)
+            if K > 192:
+                p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (192, t0), (64, BT), (0, 1))
+                b_k = tl.load(p_k, boundary_check=(0, 1))
+                b_h4 += tl.dot(b_k, b_v_new_cast)
+
+    # 末态写回
+    if STORE_FINAL_STATE:
+        p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 128:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 192:
+            p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+            tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_gated_delta_rule_fwd_h(
+    k: torch.Tensor,                    # [B, T, Hg, K]
+    w: torch.Tensor,                    # [B, T, H,  K]
+    u: torch.Tensor,                    # [B, T, H,  V]
+    g: Optional[torch.Tensor] = None,   # [B, T, H]
+    initial_state: Optional[torch.Tensor] = None,  # [B, H, K, V]
+    output_final_state: bool = False,
+    chunk_size: int = 64,
+    save_new_value: bool = True,
+    cu_seqlens: Optional[torch.LongTensor] = None, # [N+1] 或 None
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    """
+    Host wrapper：
+    - 计算 NT（以 chunk 为单位的上界，constexpr）；
+    - kernel 内对块是否有效做 runtime 守卫（active）；
+    - block_ptr 的读写只用 boundary_check，严禁 mask/other；
+    - 对 g 使用标量指针 + mask 读取。
+    """
+    B, T, Hg, K = k.shape
+    H, V = u.shape[-2], u.shape[-1]
+    BT = chunk_size
+
+    if cu_seqlens is None:
+        N = B
+        NT = math.ceil(T / BT)                 # 固定长：全局上界
+        chunk_offsets = None
+    else:
+        N = len(cu_seqlens) - 1
+        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
+        NT = len(chunk_indices)                # 变长：所有序列的总 chunk 数
+        chunk_offsets = prepare_chunk_offsets(cu_seqlens, BT)
+
+    assert K <= 256, "current kernel does not support head dimension larger than 256."
+
+    h = k.new_empty(B, NT, H, K, V)
+    final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+    v_new = torch.empty_like(u) if save_new_value else None
+
+    def grid(meta):
+        return (triton.cdiv(V, meta["BV"]), N * H)
+
+    chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
+        k=k,
+        v=u,
+        w=w,
+        v_new=v_new,
+        g=g,
+        h=h,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        chunk_offsets=chunk_offsets,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BV=32,
+        NT=NT,
+        num_warps=2,
+        num_stages=1,
+    )
+    return h, v_new, final_state
+
+
+
+
+# from typing import Optional, Tuple
+# import math
+# import torch
+# import triton
+# import triton.language as tl
+
+# from sglang.srt.layers.attention.fla.index import (
+#     prepare_chunk_indices,
+#     prepare_chunk_offsets,
+# )
+# from sglang.srt.layers.attention.fla.op import exp, safe_exp
+# from sglang.srt.layers.attention.fla.utils import is_nvidia_hopper
+
+# # 保守一些在 HIP/ROCm 上更稳；NVIDIA 可调大
+# NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+
+
+# @triton.heuristics(
+#     {
+#         "USE_G": lambda args: args["g"] is not None,
+#         "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+#         "STORE_FINAL_STATE": lambda args: args["ht"] is not None,
+#         "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None,
+#         "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+#     }
+# )
+# @triton.jit(do_not_specialize=["T"])
+# def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
+#     k,            # [B, T, Hg, K]
+#     v,            # [B, T, H,  V]
+#     w,            # [B, T, H,  K]
+#     v_new,        # [B, T, H,  V] or None
+#     g,            # [B, T, H] or None
+#     h,            # [B, NT, H, K, V]
+#     h0,           # [B, H, K, V] or None
+#     ht,           # [B, H, K, V] or None
+#     cu_seqlens,   # [N+1] or None
+#     chunk_offsets,# [N] or None
+#     T,            # 全局 T（固定长时就是实际长度）
+#     H: tl.constexpr,
+#     Hg: tl.constexpr,
+#     K: tl.constexpr,
+#     V: tl.constexpr,
+#     BT: tl.constexpr,    # chunk size（通常 64）
+#     BV: tl.constexpr,    # V 方向 tile 宽度（通常 32）
+#     NT: tl.constexpr,    # host 计算的上界（chunk 数）
+#     USE_G: tl.constexpr,
+#     USE_INITIAL_STATE: tl.constexpr,
+#     STORE_FINAL_STATE: tl.constexpr,
+#     SAVE_NEW_VALUE: tl.constexpr,
+#     IS_VARLEN: tl.constexpr,
+# ):
+#     # 程序实例：X 维为 V/BV，Y 维为 N*H
+#     i_v, i_nh = tl.program_id(0), tl.program_id(1)
+#     i_n = i_nh // H
+#     i_h = i_nh % H
+
+#     # 计算该序列 runtime 真实长度 TR、时间起点 bos、h 的 chunk 偏移 boh
+#     if IS_VARLEN:
+#         bos = tl.load(cu_seqlens + i_n).to(tl.int32)
+#         eos = tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+#         TR = (eos - bos).to(tl.int32)
+#         TR = tl.maximum(TR, 0)
+#         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+#     else:
+#         bos = (i_n * T).to(tl.int32)
+#         eos = (i_n * T + T).to(tl.int32)
+#         TR = T.to(tl.int32)
+#         boh = (i_n * NT).to(tl.int32)
+
+#     # 预计算步长（以扁平指针计）
+#     stride_h_t = H * K * V      # h 时间块步长
+#     stride_k_hg = Hg * K        # k 的 Hg*K 步长
+#     stride_w_hk = H * K         # w 的 H*K 步长
+#     stride_v_hv = H * V         # v/v_new 的 H*V 步长
+
+#     # 移动到当前 sequence/head 的基址
+#     h = h + (boh * H + i_h) * (K * V)
+#     v = v + (bos * stride_v_hv + i_h * V)
+#     if SAVE_NEW_VALUE:
+#         v_new = v_new + (bos * stride_v_hv + i_h * V)
+#     k = k + (bos * stride_k_hg + (i_h // (H // Hg)) * K)
+#     w = w + (bos * stride_w_hk + i_h * K)
+
+#     # hidden state tiles
+#     b_h1 = tl.zeros([64, BV], dtype=tl.float32)
+#     if K > 64:
+#         b_h2 = tl.zeros([64, BV], dtype=tl.float32)
+#     if K > 128:
+#         b_h3 = tl.zeros([64, BV], dtype=tl.float32)
+#     if K > 192:
+#         b_h4 = tl.zeros([64, BV], dtype=tl.float32)
+
+#     # 初始状态
+#     if USE_INITIAL_STATE:
+#         h0 = h0 + i_nh * (K * V)
+#         p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+#         b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
+#         if K > 64:
+#             p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+#             b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
+#         if K > 128:
+#             p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+#             b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
+#         if K > 192:
+#             p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+#             b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
+
+#     # 最终状态写回基址
+#     if STORE_FINAL_STATE:
+#         ht = ht + i_nh * (K * V)
+
+#     # 主循环（上界 NT）
+#     for i_t in range(NT):
+#         # 先把当前 hidden state 写到 h（合法，因为 h 的时间维是 NT 上界）
+#         p_h1 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+#         tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 64:
+#             p_h2 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+#             tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 128:
+#             p_h3 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+#             tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 192:
+#             p_h4 = tl.make_block_ptr(h + i_t * stride_h_t, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+#             tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1))
+
+#         # 这个 chunk 是否有效（t0 >= TR 则整个 tile 越界 -> 跳过所有 block_ptr 操作）
+#         t0 = i_t * BT
+#         active = t0 < TR
+#         if active:
+#             # v_new = v - w @ h
+#             # --- w @ h ---
+#             b_v_new = tl.zeros([BT, BV], dtype=tl.float32)
+
+#             # w 的 0..63 列
+#             p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 0), (BT, 64), (1, 0))
+#             b_w = tl.load(p_w, boundary_check=(0, 1))        # 不能使用 mask
+#             b_v_new += tl.dot(b_w, b_h1.to(b_w.dtype))
+
+#             if K > 64:
+#                 p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 64), (BT, 64), (1, 0))
+#                 b_w = tl.load(p_w, boundary_check=(0, 1))
+#                 b_v_new += tl.dot(b_w, b_h2.to(b_w.dtype))
+#             if K > 128:
+#                 p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 128), (BT, 64), (1, 0))
+#                 b_w = tl.load(p_w, boundary_check=(0, 1))
+#                 b_v_new += tl.dot(b_w, b_h3.to(b_w.dtype))
+#             if K > 192:
+#                 p_w = tl.make_block_ptr(w, (TR, K), (stride_w_hk, 1), (t0, 192), (BT, 64), (1, 0))
+#                 b_w = tl.load(p_w, boundary_check=(0, 1))
+#                 b_v_new += tl.dot(b_w, b_h4.to(b_w.dtype))
+
+#             # v - (w @ h)
+#             p_v = tl.make_block_ptr(v, (TR, V), (stride_v_hv, 1), (t0, i_v * BV), (BT, BV), (1, 0))
+#             b_v_chunk = tl.load(p_v, boundary_check=(0, 1))   # 不能使用 mask
+#             b_v_new = -b_v_new + b_v_chunk
+
+#             # 保存 v_new
+#             if SAVE_NEW_VALUE:
+#                 p_vn = tl.make_block_ptr(v_new, (TR, V), (stride_v_hv, 1), (t0, i_v * BV), (BT, BV), (1, 0))
+#                 tl.store(p_vn, b_v_new.to(p_vn.dtype.element_ty), boundary_check=(0, 1))
+
+#             # gating：对 g 用普通指针 + mask 读取（不是 block_ptr）
+#             if USE_G:
+#                 offs_t = t0 + tl.arange(0, BT)                 # [BT]
+#                 t_mask = offs_t < TR                            # [BT]
+#                 g_base = g + bos * H + i_h
+#                 b_g = tl.load(g_base + offs_t * H, mask=t_mask, other=0.0)  # ✔ 这里不是 block_ptr，可用 mask
+
+#                 t_end = tl.minimum(t0 + BT, TR)
+#                 last_idx = tl.maximum(t_end - 1, 0).to(tl.int32)
+#                 b_g_last = tl.load(g_base + last_idx * H, mask=(TR > 0), other=0.0)
+
+#                 gating_factor = safe_exp(b_g_last - b_g)[:, None]
+#                 b_v_new = b_v_new * gating_factor
+
+#                 g_last_exp = exp(b_g_last)
+#                 b_h1 = b_h1 * g_last_exp
+#                 if K > 64:
+#                     b_h2 = b_h2 * g_last_exp
+#                 if K > 128:
+#                     b_h3 = b_h3 * g_last_exp
+#                 if K > 192:
+#                     b_h4 = b_h4 * g_last_exp
+
+#             # h += k^T @ v_new
+#             b_v_new_cast = b_v_new.to(k.dtype.element_ty)
+#             p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (0, t0), (64, BT), (0, 1))
+#             b_k = tl.load(p_k, boundary_check=(0, 1))         # 不能使用 mask
+#             b_h1 += tl.dot(b_k, b_v_new_cast)
+
+#             if K > 64:
+#                 p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (64, t0), (64, BT), (0, 1))
+#                 b_k = tl.load(p_k, boundary_check=(0, 1))
+#                 b_h2 += tl.dot(b_k, b_v_new_cast)
+#             if K > 128:
+#                 p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (128, t0), (64, BT), (0, 1))
+#                 b_k = tl.load(p_k, boundary_check=(0, 1))
+#                 b_h3 += tl.dot(b_k, b_v_new_cast)
+#             if K > 192:
+#                 p_k = tl.make_block_ptr(k, (K, TR), (1, stride_k_hg), (192, t0), (64, BT), (0, 1))
+#                 b_k = tl.load(p_k, boundary_check=(0, 1))
+#                 b_h4 += tl.dot(b_k, b_v_new_cast)
+
+#     # 末态写回
+#     if STORE_FINAL_STATE:
+#         p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+#         tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 64:
+#             p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0))
+#             tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 128:
+#             p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0))
+#             tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+#         if K > 192:
+#             p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0))
+#             tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+# def chunk_gated_delta_rule_fwd_h(
+#     k: torch.Tensor,                    # [B, T, Hg, K]
+#     w: torch.Tensor,                    # [B, T, H,  K]
+#     u: torch.Tensor,                    # [B, T, H,  V]
+#     g: Optional[torch.Tensor] = None,   # [B, T, H]
+#     initial_state: Optional[torch.Tensor] = None,  # [B, H, K, V]
+#     output_final_state: bool = False,
+#     chunk_size: int = 64,
+#     save_new_value: bool = True,
+#     cu_seqlens: Optional[torch.LongTensor] = None, # [N+1] 或 None
+# ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+#     """
+#     Host wrapper：
+#     - 计算 NT（以 chunk 为单位的上界，constexpr）；
+#     - kernel 内对块是否有效做 runtime 守卫（active）；
+#     - block_ptr 的读写只用 boundary_check，严禁 mask/other；
+#     - 对 g 使用标量指针 + mask 读取。
+#     """
+#     B, T, Hg, K = k.shape
+#     H, V = u.shape[-2], u.shape[-1]
+#     BT = chunk_size
+
+#     if cu_seqlens is None:
+#         N = B
+#         NT = math.ceil(T / BT)                 # 固定长：全局上界
+#         chunk_offsets = None
+#     else:
+#         N = len(cu_seqlens) - 1
+#         chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
+#         NT = len(chunk_indices)                # 变长：所有序列的总 chunk 数
+#         chunk_offsets = prepare_chunk_offsets(cu_seqlens, BT)
+
+#     assert K <= 256, "current kernel does not support head dimension larger than 256."
+
+#     h = k.new_empty(B, NT, H, K, V)
+#     final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+#     v_new = torch.empty_like(u) if save_new_value else None
+
+#     def grid(meta):
+#         return (triton.cdiv(V, meta["BV"]), N * H)
+
+#     chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
+#         k=k,
+#         v=u,
+#         w=w,
+#         v_new=v_new,
+#         g=g,
+#         h=h,
+#         h0=initial_state,
+#         ht=final_state,
+#         cu_seqlens=cu_seqlens,
+#         chunk_offsets=chunk_offsets,
+#         T=T,
+#         H=H,
+#         Hg=Hg,
+#         K=K,
+#         V=V,
+#         BT=BT,
+#         BV=32,
+#         NT=NT,
+#         num_warps=2,
+#         num_stages=1,
+#     )
+#     return h, v_new, final_state
diff --git a/python/sglang/srt/layers/attention/fla/chunk_o.py b/python/sglang/srt/layers/attention/fla/chunk_o.py
new file mode 100644
index 000000000..d672c646b
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/chunk_o.py
@@ -0,0 +1,178 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/common/chunk_o.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.op import exp, safe_exp
+from sglang.srt.layers.attention.fla.utils import check_shared_mem, is_nvidia_hopper
+
+BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
+NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
+
+
+@triton.heuristics(
+    {
+        "USE_G": lambda args: args["g"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages)
+#         for BK in BKV_LIST
+#         for BV in BKV_LIST
+#         for num_warps in NUM_WARPS
+#         for num_stages in [2, 3, 4]
+#     ],
+#     key=["H", "K", "V", "BT"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def chunk_fwd_kernel_o(
+    q,
+    k,
+    v,
+    h,
+    g,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    scale,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_G: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+
+    if IS_VARLEN:
+        i_tg = i_t
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+    else:
+        NT = tl.cdiv(T, BT)
+        i_tg = i_b * NT + i_t
+        bos, eos = i_b * T, i_b * T + T
+
+    # offset calculation
+    q += (bos * Hg + i_h // (H // Hg)) * K
+    k += (bos * Hg + i_h // (H // Hg)) * K
+    v += (bos * H + i_h) * V
+    o += (bos * H + i_h) * V
+    h += (i_tg * H + i_h).to(tl.int64) * K * V
+
+    b_o = tl.zeros([BT, BV], dtype=tl.float32)
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(
+            q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)
+        )
+        p_k = tl.make_block_ptr(
+            k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)
+        )
+        p_h = tl.make_block_ptr(
+            h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)
+        )
+        # [BT, BK]
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        # [BK, BT]
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        # [BK, BV]
+        b_h = tl.load(p_h, boundary_check=(0, 1))
+
+        # [BT, BK] @ [BK, BV] -> [BT, BV]
+        b_o += tl.dot(b_q, b_h)
+        # [BT, BK] @ [BK, BT] -> [BT, BT]
+        b_A += tl.dot(b_q, b_k)
+
+    if USE_G:
+        g += bos * H + i_h
+        p_g = tl.make_block_ptr(g, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        b_g = tl.load(p_g, boundary_check=(0,))
+        b_o = b_o * exp(b_g)[:, None]
+        b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :])
+
+    o_i = tl.arange(0, BT)
+    m_A = o_i[:, None] >= o_i[None, :]
+    b_A = tl.where(m_A, b_A, 0)
+
+    p_v = tl.make_block_ptr(
+        v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
+    p_o = tl.make_block_ptr(
+        o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
+    b_v = tl.load(p_v, boundary_check=(0, 1))
+
+    # to fix mma -> mma layout conversion
+    # already solved by triton v3.2 or higher
+    b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_fwd_o(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    h: torch.Tensor,
+    g: Optional[torch.Tensor] = None,  # cumsum of log decay
+    scale: Optional[float] = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    chunk_size: int = 64,
+) -> torch.Tensor:
+    B, T, Hg, K, V = *q.shape, v.shape[-1]
+    H = v.shape[-2]
+    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+
+    o = torch.empty_like(v)
+
+    def grid(meta):
+        return (triton.cdiv(V, meta["BV"]), NT, B * H)
+
+    chunk_fwd_kernel_o[grid](
+        q,
+        k,
+        v,
+        h,
+        g,
+        o,
+        cu_seqlens,
+        chunk_indices,
+        scale,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=128,
+        BV=64,
+        num_warps=4,
+        num_stages=2,
+    )
+    return o
diff --git a/python/sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py b/python/sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py
new file mode 100644
index 000000000..699350d31
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py
@@ -0,0 +1,151 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/common/chunk_scaled_dot_kkt.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.op import safe_exp
+
+
+@triton.heuristics(
+    {
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+        "USE_G": lambda args: args["g_cumsum"] is not None,
+    }
+)
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages)
+#         for BK in [32, 64, 128]
+#         for num_warps in [2, 4, 8]
+#         for num_stages in [2, 3, 4]
+#     ],
+#     key=["H", "K", "BT", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def chunk_scaled_dot_kkt_fwd_kernel(
+    k,
+    beta,
+    g_cumsum,
+    A,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    USE_G: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    o_t = tl.arange(0, BT)
+
+    p_beta = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+    )
+    b_beta = tl.load(p_beta, boundary_check=(0,))
+
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = b_k * b_beta[:, None]
+        b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k))
+
+    if USE_G:
+        p_g = tl.make_block_ptr(
+            g_cumsum + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+        )
+        b_g = tl.load(p_g, boundary_check=(0,))
+        b_g_diff = b_g[:, None] - b_g[None, :]
+        b_A = b_A * safe_exp(b_g_diff)
+
+    b_A = tl.where(o_t[:, None] > o_t[None, :], b_A, 0)
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_scaled_dot_kkt_fwd(
+    k: torch.Tensor,
+    beta: torch.Tensor,
+    g_cumsum: Optional[torch.Tensor] = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    chunk_size: int = 64,
+    output_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    r"""
+    Compute beta * K * K^T.
+
+    Args:
+        k (torch.Tensor):
+            The key tensor of shape `[B, T, H, K]`.
+        beta (torch.Tensor):
+            The beta tensor of shape `[B, T, H]`.
+        g_cumsum (torch.Tensor):
+            The cumulative sum of the gate tensor of shape `[B, T, H]`.
+            Default: None
+        cu_seqlens (torch.LongTensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None
+        chunk_size (int):
+            The chunk size. Default: 64.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float32`
+
+    Returns:
+        beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size.
+    """
+
+    B, T, Hg, K = k.shape
+
+    H = beta.shape[-1]
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype)
+    chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)](
+        k=k,
+        beta=beta,
+        g_cumsum=g_cumsum,
+        A=A,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        BT=BT,
+        BK=64,
+        num_warps=8,
+        num_stages=3,
+    )
+    return A
diff --git a/python/sglang/srt/layers/attention/fla/cumsum.py b/python/sglang/srt/layers/attention/fla/cumsum.py
new file mode 100644
index 000000000..b8e3cdde1
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/cumsum.py
@@ -0,0 +1,300 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/cumsum.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.utils import check_shared_mem, input_guard
+
+BS_LIST = [32, 64] if check_shared_mem() else [16, 32]
+
+
+@triton.heuristics(
+    {
+        "HAS_SCALE": lambda args: args["scale"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+# @triton.autotune(
+#     configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]],
+#     key=["B", "H", "BT", "IS_VARLEN", "REVERSE"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def chunk_local_cumsum_scalar_kernel(
+    s,
+    o,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    REVERSE: tl.constexpr,
+    HAS_SCALE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if HEAD_FIRST:
+        p_s = tl.make_block_ptr(
+            s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
+        )
+        p_o = tl.make_block_ptr(
+            o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
+        )
+    else:
+        p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    # [BT]
+    b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32)
+    b_o = tl.cumsum(b_s, axis=0)
+    if REVERSE:
+        b_z = tl.sum(b_s, axis=0)
+        b_o = -b_o + b_z[None] + b_s
+    if HAS_SCALE:
+        b_o *= scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,))
+
+
+@triton.heuristics(
+    {
+        "HAS_SCALE": lambda args: args["scale"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.autotune(
+    configs=[
+        triton.Config({"BS": BS}, num_warps=num_warps)
+        for BS in BS_LIST
+        for num_warps in [2, 4, 8]
+    ],
+    key=["B", "H", "S", "BT", "IS_VARLEN", "REVERSE"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_local_cumsum_vector_kernel(
+    s,
+    o,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    S: tl.constexpr,
+    BT: tl.constexpr,
+    BS: tl.constexpr,
+    REVERSE: tl.constexpr,
+    HAS_SCALE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    o_i = tl.arange(0, BT)
+    if REVERSE:
+        m_s = tl.where(o_i[:, None] <= o_i[None, :], 1.0, 0.0)
+    else:
+        m_s = tl.where(o_i[:, None] >= o_i[None, :], 1.0, 0.0)
+
+    if HEAD_FIRST:
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+    else:
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+    # [BT, BS]
+    b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32)
+    b_o = tl.dot(m_s, b_s, allow_tf32=False)
+    if HAS_SCALE:
+        b_o *= scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_local_cumsum_scalar(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    scale: float = None,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    head_first: bool = False,
+    output_dtype: Optional[torch.dtype] = torch.float,
+) -> torch.Tensor:
+    if head_first:
+        B, H, T = g.shape
+    else:
+        B, T, H = g.shape
+    assert chunk_size == 2 ** (
+        chunk_size.bit_length() - 1
+    ), "chunk_size must be a power of 2"
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
+    grid = (NT, B * H)
+    chunk_local_cumsum_scalar_kernel[grid](
+        s=g_org,
+        o=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        BT=BT,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+        num_warps=8,
+        num_stages=3,
+    )
+    return g
+
+
+def chunk_local_cumsum_vector(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    scale: float = None,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    head_first: bool = False,
+    output_dtype: Optional[torch.dtype] = torch.float,
+) -> torch.Tensor:
+    if head_first:
+        B, H, T, S = g.shape
+    else:
+        B, T, H, S = g.shape
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, chunk_size)
+        if cu_seqlens is not None
+        else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    assert chunk_size == 2 ** (
+        chunk_size.bit_length() - 1
+    ), "chunk_size must be a power of 2"
+
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
+
+    def grid(meta):
+        return (triton.cdiv(meta["S"], meta["BS"]), NT, B * H)
+
+    # keep cumulative normalizer in fp32
+    # this kernel is equivalent to
+    # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
+    chunk_local_cumsum_vector_kernel[grid](
+        s=g_org,
+        o=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        S=S,
+        BT=BT,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+    )
+    return g
+
+
+@input_guard
+def chunk_local_cumsum(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    scale: float = None,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    head_first: bool = False,
+    output_dtype: Optional[torch.dtype] = torch.float,
+    **kwargs,
+) -> torch.Tensor:
+    if cu_seqlens is not None:
+        assert (
+            g.shape[0] == 1
+        ), "Only batch size 1 is supported when cu_seqlens are provided"
+    if len(g.shape) == 3:
+        return chunk_local_cumsum_scalar(
+            g=g,
+            chunk_size=chunk_size,
+            reverse=reverse,
+            scale=scale,
+            cu_seqlens=cu_seqlens,
+            head_first=head_first,
+            output_dtype=output_dtype,
+        )
+    elif len(g.shape) == 4:
+        return chunk_local_cumsum_vector(
+            g=g,
+            chunk_size=chunk_size,
+            reverse=reverse,
+            scale=scale,
+            cu_seqlens=cu_seqlens,
+            head_first=head_first,
+            output_dtype=output_dtype,
+        )
+    else:
+        raise ValueError(
+            f"Unsupported input shape {g.shape}, "
+            f"which should be (B, T, H, D) if `head_first=False` "
+            f"or (B, H, T, D) otherwise"
+        )
diff --git a/python/sglang/srt/layers/attention/fla/fused_recurrent.py b/python/sglang/srt/layers/attention/fla/fused_recurrent.py
new file mode 100644
index 000000000..fa7262ce2
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/fused_recurrent.py
@@ -0,0 +1,640 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/gated_delta_rule/fused_recurrent.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.op import exp
+from sglang.srt.layers.attention.fla.utils import input_guard
+
+
+@triton.heuristics(
+    {
+        "USE_INITIAL_STATE": lambda args: args["h0"] is not None,
+        "STORE_FINAL_STATE": lambda args: args["ht"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def fused_recurrent_gated_delta_rule_fwd_kernel(
+    q,
+    k,
+    v,
+    g,
+    beta,
+    o,
+    h0,
+    ht,
+    cu_seqlens,
+    scale,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
+    STORE_FINAL_STATE: tl.constexpr,  # whether to store final state
+    IS_BETA_HEADWISE: tl.constexpr,  # whether beta is headwise vector or scalar,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int64), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int64)
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + (bos * H + i_h) * K + o_k
+    p_k = k + (bos * H + i_h) * K + o_k
+    p_v = v + (bos * HV + i_hv) * V + o_v
+    if IS_BETA_HEADWISE:
+        p_beta = beta + (bos * HV + i_hv) * V + o_v
+    else:
+        p_beta = beta + bos * HV + i_hv
+    p_g = g + bos * HV + i_hv
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        p_h0 = h0 + i_nh * K * V + o_k[:, None] * V + o_v[None, :]
+        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    for _ in range(0, T):
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+        b_g = tl.load(p_g).to(tl.float32)
+
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q)) + 1e-6)
+            b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k)) + 1e-6)
+        b_q = b_q * scale
+        # [BK, BV]
+        b_h *= exp(b_g)
+        # [BV]
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+        if IS_BETA_HEADWISE:
+            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
+        else:
+            b_beta = tl.load(p_beta).to(tl.float32)
+        b_v *= b_beta
+        # [BK, BV]
+        b_h += b_k[:, None] * b_v[None, :]
+        # [BV]
+        b_o = tl.sum(b_h * b_q[:, None], 0)
+        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        p_q += H * K
+        p_k += H * K
+        p_o += HV * V
+        p_v += HV * V
+        p_g += HV
+        p_beta += HV * (V if IS_BETA_HEADWISE else 1)
+
+    if STORE_FINAL_STATE:
+        p_ht = ht + i_nh * K * V + o_k[:, None] * V + o_v[None, :]
+        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
+
+
+def fused_recurrent_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    output_final_state: bool,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    o = q.new_empty(NK, *v.shape)
+    if output_final_state:
+        final_state = q.new_empty(N, HV, K, V, dtype=torch.float32)
+    else:
+        final_state = None
+
+    grid = (NK, NV, N * HV)
+    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        scale=scale,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o, final_state
+
+
+class FusedRecurrentFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        output_final_state: bool,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
+        o, final_state = fused_recurrent_gated_delta_rule_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state=initial_state,
+            output_final_state=output_final_state,
+            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+            cu_seqlens=cu_seqlens,
+        )
+
+        return o, final_state
+
+    @staticmethod
+    @input_guard
+    def backward(ctx, do, dht):
+        raise NotImplementedError(
+            "Backward pass is not implemented yet and we do not have plans to implement it "
+            "because we haven't figured out how to compute dg without materializing the full "
+            "hidden states for all time steps."
+        )
+
+
+def fused_recurrent_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    output_final_state: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, HV, V]`.
+            GVA is applied if `HV > H`.
+        g (torch.Tensor):
+            g (decays) of shape `[B, T, HV]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, HV]`.
+        scale (Optional[int]):
+            Scale factor for the RetNet attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, HV, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        output_final_state (Optional[bool]):
+            Whether to output the final state of shape `[N, HV, K, V]`. Default: `False`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, HV, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, HV, K, V]` if `output_final_state=True` else `None`.
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512
+        >>> q = torch.randn(B, T, H, K, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, HV, V, device='cuda')
+        >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda'))
+        >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid()
+        >>> h0 = torch.randn(B, HV, K, V, device='cuda')
+        >>> o, ht = fused_gated_recurrent_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = fused_gated_recurrent_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing."
+            )
+        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
+            raise ValueError(
+                f"The number of initial states is expected to be equal to the number of input sequences, "
+                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
+            )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    else:
+        assert scale > 0, "scale must be positive"
+    if beta is None:
+        beta = torch.ones_like(q[..., 0])
+    o, final_state = FusedRecurrentFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        output_final_state,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+    )
+    return o, final_state
+
+
+@triton.heuristics(
+    {
+        "USE_INITIAL_STATE": lambda args: args["h0_source"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+        "CACHE_INTERMEDIATE_STATES": lambda args: args["intermediate_states_buffer"]
+        is not None,
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def fused_recurrent_gated_delta_rule_update_fwd_kernel(
+    q,
+    k,
+    v,
+    g,
+    beta,
+    o,
+    h0_source,
+    h0_indices,
+    cu_seqlens,
+    scale,
+    intermediate_states_buffer,
+    cache_steps,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
+    IS_BETA_HEADWISE: tl.constexpr,  # whether beta is headwise vector or scalar,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DISABLE_STATE_UPDATE: tl.constexpr,  # whether to disable final state update
+    DISABLE_OUTPUT_CALCULATION: tl.constexpr,  # whether to disable output calculation
+    CACHE_INTERMEDIATE_STATES: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int64), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int64)
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + (bos * H + i_h) * K + o_k
+    p_k = k + (bos * H + i_h) * K + o_k
+    p_v = v + (bos * HV + i_hv) * V + o_v
+    if IS_BETA_HEADWISE:
+        p_beta = beta + (bos * HV + i_hv) * V + o_v
+    else:
+        p_beta = beta + bos * HV + i_hv
+    p_g = g + bos * HV + i_hv
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        idx = tl.load(h0_indices + i_n)
+        # Add bounds checking for idx
+        if idx >= 0:  # Assuming negative indices are invalid
+            p_h0 = (
+                h0_source
+                + idx * HV * K * V
+                + i_hv * K * V
+                + o_k[:, None] * V
+                + o_v[None, :]
+            )
+            b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    # Prepare intermediate state cache variables if enabled
+    cache_idx = -1
+    if CACHE_INTERMEDIATE_STATES:
+        cache_idx = tl.load(h0_indices + i_n)
+
+    step_idx = 0
+    for _ in range(0, T):
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+        b_g = tl.load(p_g).to(tl.float32)
+
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q)) + 1e-6)
+            b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k)) + 1e-6)
+        b_q = b_q * scale
+        # [BK, BV]
+        b_h *= exp(b_g)
+        # [BV]
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+        if IS_BETA_HEADWISE:
+            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
+        else:
+            b_beta = tl.load(p_beta).to(tl.float32)
+        b_v *= b_beta
+        # [BK, BV]
+        b_h += b_k[:, None] * b_v[None, :]
+        # [BV]
+        if not DISABLE_OUTPUT_CALCULATION:
+            b_o = tl.sum(b_h * b_q[:, None], 0)
+            # core attn output
+            tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        # store intermediate states if enabled
+        if CACHE_INTERMEDIATE_STATES:
+            if cache_idx >= 0:
+                # Compute cache pointer for this step
+                step_offset = step_idx * HV * K * V
+                cache_ptr = (
+                    intermediate_states_buffer
+                    + cache_idx * cache_steps * HV * K * V
+                    + step_offset
+                    + i_hv * K * V
+                    + o_k[:, None] * V
+                    + o_v[None, :]
+                )
+                tl.store(cache_ptr, b_h.to(cache_ptr.dtype.element_ty), mask=mask_h)
+
+        step_idx += 1
+
+        p_q += H * K
+        p_k += H * K
+        p_o += HV * V
+        p_v += HV * V
+        p_g += HV
+        p_beta += HV * (V if IS_BETA_HEADWISE else 1)
+
+    # Store final state back to h0_source with bounds checking
+    # ssm states
+    if not DISABLE_STATE_UPDATE:
+        idx = tl.load(h0_indices + i_n)
+        if idx >= 0:  # Add bounds checking
+            p_h0 = (
+                h0_source
+                + idx * HV * K * V
+                + i_hv * K * V
+                + o_k[:, None] * V
+                + o_v[None, :]
+            )
+            tl.store(p_h0, b_h.to(p_h0.dtype.element_ty), mask=mask_h)
+
+
+def fused_recurrent_gated_delta_rule_update_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state_source: torch.Tensor,
+    initial_state_indices: torch.Tensor,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    disable_state_update: bool = False,
+    disable_output_calculation: bool = False,
+    intermediate_states_buffer: Optional[torch.Tensor] = None,
+    cache_steps: Optional[int] = None,
+) -> torch.Tensor:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    if disable_output_calculation:
+        # When output calculation is disabled, allocate minimal tensor
+        o = q.new_empty(NK, 1, 1, 1, 1)  # minimal allocation
+    else:
+        o = q.new_empty(NK, *v.shape)
+
+    grid = (NK, NV, N * HV)
+
+    fused_recurrent_gated_delta_rule_update_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0_source=initial_state_source,
+        h0_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+        scale=scale,
+        intermediate_states_buffer=intermediate_states_buffer,
+        cache_steps=0 if cache_steps is None else cache_steps,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        DISABLE_STATE_UPDATE=disable_state_update,
+        DISABLE_OUTPUT_CALCULATION=disable_output_calculation,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o
+
+
+class FusedRecurrentUpdateFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state_source: torch.Tensor,
+        initial_state_indices: torch.Tensor,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+        disable_state_update: bool = False,
+        disable_output_calculation: bool = False,
+        intermediate_states_buffer: Optional[torch.Tensor] = None,
+        cache_steps: Optional[int] = None,
+    ):
+        o = fused_recurrent_gated_delta_rule_update_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state_source=initial_state_source,
+            initial_state_indices=initial_state_indices,
+            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+            cu_seqlens=cu_seqlens,
+            disable_state_update=disable_state_update,
+            disable_output_calculation=disable_output_calculation,
+            intermediate_states_buffer=intermediate_states_buffer,
+            cache_steps=cache_steps,
+        )
+
+        return o
+
+    @staticmethod
+    @input_guard
+    def backward(ctx, do, dht):
+        raise NotImplementedError(
+            "Backward pass is not implemented yet and we do not have plans to implement it "
+            "because we haven't figured out how to compute dg without materializing the full "
+            "hidden states for all time steps."
+        )
+
+
+def fused_recurrent_gated_delta_rule_update(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state_source: torch.Tensor = None,
+    initial_state_indices: torch.Tensor = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+    disable_state_update: bool = False,
+    disable_output_calculation: bool = False,
+    intermediate_states_buffer: Optional[torch.Tensor] = None,
+    cache_steps: Optional[int] = None,
+) -> torch.Tensor:
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing."
+            )
+        if (
+            initial_state_source is not None
+            and initial_state_indices.shape[0] != len(cu_seqlens) - 1
+        ):
+            raise ValueError(
+                f"The number of initial states is expected to be equal to the number of input sequences, "
+                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state_indices.shape[0]}."
+            )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    else:
+        assert scale > 0, "scale must be positive"
+    if beta is None:
+        beta = torch.ones_like(q[..., 0])
+    o = FusedRecurrentUpdateFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state_source,
+        initial_state_indices,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+        disable_state_update,
+        disable_output_calculation,
+        intermediate_states_buffer,
+        cache_steps,
+    )
+    return o
diff --git a/python/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py b/python/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py
new file mode 100644
index 000000000..41837b980
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py
@@ -0,0 +1,232 @@
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.utils import input_guard
+
+
+@triton.heuristics(
+    {
+        "USE_INITIAL_STATE": lambda args: args["h0_source"] is not None,
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def fused_sigmoid_gating_delta_rule_update_kernel(
+    A_log,
+    a,
+    dt_bias,
+    softplus_beta,
+    softplus_threshold,
+    q,
+    k,
+    v,
+    b,
+    o,
+    h0_source,
+    h0_indices,
+    cu_seqlens,
+    scale,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    """
+    Fused kernel that combines sigmoid gating computation with recurrent delta rule update.
+    """
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+
+    if IS_VARLEN:
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int64),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int64),
+        )
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + (bos * H + i_h) * K + o_k
+    p_k = k + (bos * H + i_h) * K + o_k
+    p_v = v + (bos * HV + i_hv) * V + o_v
+    p_b = b + bos * HV + i_hv
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
+
+    # Gating computation pointers
+    p_A_log = A_log + i_hv
+    p_a = a + bos * HV + i_hv
+    p_dt_bias = dt_bias + i_hv
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        idx = tl.load(h0_indices + i_n)
+        if idx >= 0:
+            p_h0 = (
+                h0_source
+                + idx * HV * K * V
+                + i_hv * K * V
+                + o_k[:, None] * V
+                + o_v[None, :]
+            )
+            b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    for _ in range(0, T):
+        # Load inputs
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+        b_b = tl.load(p_b).to(tl.float32)
+
+        # Compute sigmoid gating
+        # Load gating parameters
+        b_A_log = tl.load(p_A_log).to(tl.float32)
+        b_a = tl.load(p_a).to(tl.float32)
+        b_dt_bias = tl.load(p_dt_bias).to(tl.float32)
+
+        # Compute g = -exp(A_log) * softplus(a + dt_bias)
+        x = b_a + b_dt_bias
+        beta_x = softplus_beta * x
+        # Apply softplus with numerical stability
+        softplus_x = tl.where(
+            beta_x <= softplus_threshold,
+            (1.0 / softplus_beta) * tl.log(1.0 + tl.exp(beta_x)),
+            x,
+        )
+        b_g = -tl.exp(b_A_log) * softplus_x
+
+        # Compute beta = sigmoid(b)
+        b_beta = 1.0 / (1.0 + tl.exp(-b_b))
+
+        # Apply L2 normalization if enabled
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q)) + 1e-6)
+            b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k)) + 1e-6)
+
+        b_q = b_q * scale
+
+        # Apply gating to hidden state: h *= exp(g)
+        b_h *= tl.exp(b_g)
+
+        # Delta rule: v -= sum(h * k, dim=0)
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+
+        # Apply beta gating: v *= beta
+        b_v *= b_beta
+
+        # Update hidden state: h += k[:, None] * v[None, :]
+        b_h += b_k[:, None] * b_v[None, :]
+
+        # Compute output: o = sum(h * q, dim=0)
+        b_o = tl.sum(b_h * b_q[:, None], 0)
+        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        # Update pointers for next timestep
+        p_q += H * K
+        p_k += H * K
+        p_o += HV * V
+        p_v += HV * V
+        p_b += HV
+        p_a += HV
+
+    # Store final state back to h0_source with bounds checking
+    if USE_INITIAL_STATE:
+        idx = tl.load(h0_indices + i_n)
+        if idx >= 0:
+            p_h0 = (
+                h0_source
+                + idx * HV * K * V
+                + i_hv * K * V
+                + o_k[:, None] * V
+                + o_v[None, :]
+            )
+            tl.store(p_h0, b_h.to(p_h0.dtype.element_ty), mask=mask_h)
+
+
+@input_guard
+def fused_sigmoid_gating_delta_rule_update(
+    A_log: torch.Tensor,
+    a: torch.Tensor,
+    dt_bias: torch.Tensor,
+    softplus_beta: float,
+    softplus_threshold: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    b: torch.Tensor,
+    initial_state_source: torch.Tensor,
+    initial_state_indices: torch.Tensor,
+    scale: Optional[float] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: Optional[torch.Tensor] = None,
+):
+    """
+    Fused triton implementation of sigmoid gating delta rule update.
+    This function uses a single fused kernel that combines both sigmoid gating computation
+    and the recurrent delta rule update for better performance.
+    """
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    else:
+        assert scale > 0, "scale must be positive"
+
+    o = q.new_empty(NK, *v.shape)
+    grid = (NK, NV, N * HV)
+
+    fused_sigmoid_gating_delta_rule_update_kernel[grid](
+        A_log=A_log,
+        a=a,
+        dt_bias=dt_bias,
+        softplus_beta=softplus_beta,
+        softplus_threshold=softplus_threshold,
+        q=q,
+        k=k,
+        v=v,
+        b=b,
+        o=o,
+        h0_source=initial_state_source,
+        h0_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+        scale=scale,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o
diff --git a/python/sglang/srt/layers/attention/fla/index.py b/python/sglang/srt/layers/attention/fla/index.py
new file mode 100644
index 000000000..754b98714
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/index.py
@@ -0,0 +1,37 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/index.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.utils import tensor_cache
+
+
+@tensor_cache
+def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    return cu_seqlens[1:] - cu_seqlens[:-1]
+
+
+@tensor_cache
+def prepare_chunk_indices(
+    cu_seqlens: torch.LongTensor, chunk_size: int
+) -> torch.LongTensor:
+    indices = torch.cat(
+        [
+            torch.arange(n)
+            for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
+        ]
+    )
+    return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)
+
+
+@tensor_cache
+def prepare_chunk_offsets(
+    cu_seqlens: torch.LongTensor, chunk_size: int
+) -> torch.LongTensor:
+    return torch.cat(
+        [cu_seqlens.new_tensor([0]), triton.cdiv(prepare_lens(cu_seqlens), chunk_size)]
+    ).cumsum(-1)
diff --git a/python/sglang/srt/layers/attention/fla/l2norm.py b/python/sglang/srt/layers/attention/fla/l2norm.py
new file mode 100644
index 000000000..d6b6ae7f7
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/l2norm.py
@@ -0,0 +1,150 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/l2norm.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.utils import input_guard
+
+BT_LIST = [8, 16, 32, 64, 128]
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8, 16, 32]
+#     ],
+#     key=["D"],
+# )
+@triton.jit
+def l2norm_fwd_kernel1(
+    x,
+    y,
+    D,
+    BD: tl.constexpr,
+    eps,
+):
+    i_t = tl.program_id(0)
+    x += i_t * D
+    y += i_t * D
+    # Compute mean and variance
+    cols = tl.arange(0, BD)
+    mask = cols < D
+    b_x = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32)
+    b_var = tl.sum(b_x * b_x, axis=0)
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+    # tl.store(Rstd + i_t, rstd)
+    # Normalize and apply linear transformation
+    b_y = b_x * b_rstd
+    tl.store(y + cols, b_y, mask=mask)
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BT": BT}, num_warps=num_warps)
+#         for num_warps in [1, 2, 4, 8, 16]
+#         for BT in BT_LIST
+#     ],
+#     key=["D", "NB"],
+# )
+@triton.jit
+def l2norm_fwd_kernel(
+    x,
+    y,
+    eps,
+    NB: tl.constexpr,
+    T: tl.constexpr,
+    D: tl.constexpr,
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+):
+    i_t = tl.program_id(0)
+    p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32)
+    b_var = tl.sum(b_x * b_x, axis=1)
+    b_y = b_x / tl.sqrt(b_var + eps)[:, None]
+    p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1))
+
+
+def l2norm_fwd(
+    x: torch.Tensor, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None
+):
+    x_shape_og = x.shape
+    x = x.view(-1, x.shape[-1])
+    # allocate output
+    if output_dtype is None:
+        y = torch.empty_like(x)
+    else:
+        y = torch.empty_like(x, dtype=output_dtype)
+    assert y.stride(-1) == 1
+    T, D = x.shape[0], x.shape[-1]
+    # rstd = torch.empty((T,), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BD:
+        raise RuntimeError("This layer doesn't support feature dim >= 64KB.")
+
+    if D <= 512:
+        NB = triton.cdiv(T, 2048)
+
+        def grid(meta):
+            return (triton.cdiv(T, meta["BT"]),)
+
+        l2norm_fwd_kernel[grid](
+            x,
+            y,
+            eps,
+            NB=NB,
+            T=T,
+            D=D,
+            BD=BD,
+            BT=16,
+            num_warps=8,
+            num_stages=3,
+        )
+    else:
+        l2norm_fwd_kernel1[(T,)](
+            x,
+            y,
+            eps=eps,
+            D=D,
+            BD=BD,
+            num_warps=8,
+            num_stages=3,
+        )
+
+    return y.view(x_shape_og)
+
+
+class L2NormFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    def forward(ctx, x, eps=1e-6, output_dtype=None):
+        return l2norm_fwd(x, eps, output_dtype)
+
+
+def l2norm(
+    x: torch.Tensor, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None
+) -> torch.Tensor:
+    return L2NormFunction.apply(x, eps, output_dtype)
+
+
+l2_norm = l2norm
+
+
+class L2Norm(nn.Module):
+
+    def __init__(self, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None):
+        super().__init__()
+        self.eps = eps
+        self.output_dtype = output_dtype
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return l2norm(x, self.eps, self.output_dtype)
diff --git a/python/sglang/srt/layers/attention/fla/layernorm_gated.py b/python/sglang/srt/layers/attention/fla/layernorm_gated.py
new file mode 100644
index 000000000..bd53d0d64
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/layernorm_gated.py
@@ -0,0 +1,326 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/layernorm_gated.py
+# Copyright (c) 2024, Tri Dao.
+# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
+# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
+# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
+
+import math
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+from einops import rearrange
+
+
+def rms_norm_ref(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    upcast=True,
+):
+    dtype = x.dtype
+    N = x.shape[-1]
+    weight = weight.float()
+    bias = bias.float() if bias is not None else None
+    if upcast:
+        x = x.float()
+        z = z.float() if z is not None else z
+    if z is not None and not norm_before_gate:
+        x = x * F.silu(z)
+    if group_size is None:
+        rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
+        out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight)
+    else:
+        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
+        rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps)
+        out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight
+        if bias is not None:
+            out = out + bias
+    if z is not None and norm_before_gate:
+        out *= F.silu(z)
+    return out.to(dtype)
+
+
+@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
+@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
+@triton.jit
+def _layer_norm_fwd_1pass_kernel(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    Z,  # pointer to the other branch
+    Mean,  # pointer to the mean
+    Rstd,  # pointer to the 1/std
+    stride_x_row,  # how much to increase the pointer when moving by 1 row
+    stride_y_row,
+    stride_z_row,
+    M,  # number of rows in X
+    N,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    BLOCK_N: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_Z: tl.constexpr,
+    NORM_BEFORE_GATE: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+):
+    # Map the program id to the row of X and Y it should compute.
+    row = tl.program_id(0)
+    group = tl.program_id(1)
+    X += row * stride_x_row + group * N
+    Y += row * stride_y_row + group * N
+    if HAS_Z:
+        Z += row * stride_z_row + group * N
+    if not IS_RMS_NORM:
+        Mean += group * M
+    Rstd += group * M
+    W += group * N
+    if HAS_BIAS:
+        B += group * N
+    # Compute mean and variance
+    cols = tl.arange(0, BLOCK_N)
+    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+    if HAS_Z and not NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
+        x *= z * tl.sigmoid(z)
+    if not IS_RMS_NORM:
+        mean = tl.sum(x, axis=0) / N
+        tl.store(Mean + row, mean)
+        xbar = tl.where(cols < N, x - mean, 0.0)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    else:
+        xbar = tl.where(cols < N, x, 0.0)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + row, rstd)
+    # Normalize and apply linear transformation
+    mask = cols < N
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    if HAS_BIAS:
+        b = tl.load(B + cols, mask=mask).to(tl.float32)
+    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
+    y = x_hat * w + b if HAS_BIAS else x_hat * w
+    if HAS_Z and NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=mask).to(tl.float32)
+        y *= z * tl.sigmoid(z)
+    # Write output
+    tl.store(Y + cols, y, mask=mask)
+
+
+def _layer_norm_fwd(
+    x,
+    weight,
+    bias,
+    eps,
+    z=None,
+    out=None,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    M, N = x.shape
+    if group_size is None:
+        group_size = N
+    assert N % group_size == 0
+    ngroups = N // group_size
+    assert x.stride(-1) == 1
+    if z is not None:
+        assert z.stride(-1) == 1
+        assert z.shape == (M, N)
+    assert weight.shape == (N,)
+    assert weight.stride(-1) == 1
+    if bias is not None:
+        assert bias.stride(-1) == 1
+        assert bias.shape == (N,)
+    # allocate output
+    if out is not None:
+        assert out.shape == x.shape
+    else:
+        out = torch.empty_like(x)
+    assert out.stride(-1) == 1
+    mean = (
+        torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+    if group_size > BLOCK_N:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK_N // 256, 1), 8)
+    grid = (M, ngroups)
+    with torch.cuda.device(x.device.index):
+        _layer_norm_fwd_1pass_kernel[grid](
+            x,
+            out,
+            weight,
+            bias,
+            z,
+            mean,
+            rstd,
+            x.stride(0),
+            out.stride(0),
+            z.stride(0) if z is not None else 0,
+            M,
+            group_size,
+            eps,
+            BLOCK_N=BLOCK_N,
+            NORM_BEFORE_GATE=norm_before_gate,
+            IS_RMS_NORM=is_rms_norm,
+            num_warps=num_warps,
+        )
+    return out, mean, rstd
+
+
+class LayerNormFn(torch.autograd.Function):
+
+    @staticmethod
+    def forward(
+        ctx,
+        x,
+        weight,
+        bias,
+        z=None,
+        eps=1e-6,
+        group_size=None,
+        norm_before_gate=True,
+        is_rms_norm=False,
+    ):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+
+        x_shape_og = x.shape
+        # reshape input data into 2D tensor
+        x = x.reshape(-1, x.shape[-1])
+        if x.stride(-1) != 1:
+            x = x.contiguous()
+        if z is not None:
+            assert z.shape == x_shape_og
+            z = z.reshape(-1, z.shape[-1])
+            if z.stride(-1) != 1:
+                z = z.contiguous()
+        weight = weight.contiguous()
+        if bias is not None:
+            bias = bias.contiguous()
+        y, mean, rstd = _layer_norm_fwd(
+            x,
+            weight,
+            bias,
+            eps,
+            z=z,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            is_rms_norm=is_rms_norm,
+        )
+        return y.reshape(x_shape_og)
+
+
+def layernorm_fn(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    return LayerNormFn.apply(
+        x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm
+    )
+
+
+def rmsnorm_fn(
+    x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True
+):
+    return LayerNormFn.apply(
+        x, weight, bias, z, eps, group_size, norm_before_gate, True
+    )
+
+
+class LayerNorm(torch.nn.Module):
+
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        group_size=None,
+        norm_before_gate=True,
+        device=None,
+        dtype=None,
+    ):
+        """If group_size is not None, we do GroupNorm with each group having group_size elements.
+        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
+        """
+
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.bias = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
+        torch.nn.init.zeros_(self.bias)
+
+    def forward(self, x, z=None):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        return layernorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            group_size=self.group_size,
+            eps=self.eps,
+            norm_before_gate=self.norm_before_gate,
+        )
+
+
+class RMSNorm(torch.nn.Module):
+
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        group_size=None,
+        norm_before_gate=True,
+        device=None,
+        dtype=None,
+    ):
+        """If group_size is not None, we do GroupNorm with each group having group_size elements.
+        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.register_parameter("bias", None)
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
+
+    def forward(self, x, z=None):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        return rmsnorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            eps=self.eps,
+            group_size=self.group_size,
+            norm_before_gate=self.norm_before_gate,
+        )
diff --git a/python/sglang/srt/layers/attention/fla/op.py b/python/sglang/srt/layers/attention/fla/op.py
new file mode 100644
index 000000000..9b3191075
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/op.py
@@ -0,0 +1,66 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/op.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import os
+
+import triton
+import triton.language as tl
+import triton.language.extra.libdevice as tldevice
+
+from sglang.srt.layers.attention.fla.utils import is_gather_supported
+
+if os.environ.get("FLA_USE_FAST_OPS", "0") == "1":
+    exp = tldevice.fast_expf
+    exp2 = tldevice.exp2
+    log = tldevice.fast_logf
+    log2 = tldevice.fast_log2f
+else:
+    exp = tl.exp
+    exp2 = tl.math.exp2
+    log = tl.log
+    log2 = tl.log2
+
+
+@triton.jit
+def safe_exp(x):
+    return exp(tl.where(x <= 0, x, float("-inf")))
+
+
+if not is_gather_supported:
+
+    @triton.jit
+    def gather(src, index, axis, _builder=None):
+        """
+        Gather operation that works when tl.gather is not supported.
+        This is a fallback implementation that returns None.
+        Just to make triton compiler happy.
+        """
+        return None
+
+else:
+    gather = tl.gather
+
+
+if hasattr(triton.language, "_experimental_make_tensor_descriptor"):
+    # For Triton 3.3.x
+    make_tensor_descriptor = triton.language._experimental_make_tensor_descriptor
+elif hasattr(triton.language, "make_tensor_descriptor"):
+    # For Triton 3.4.x and later
+    make_tensor_descriptor = triton.language.make_tensor_descriptor
+else:
+    """
+    Fallback implementation when TMA is not supported.
+    Returns None to indicate TMA descriptors are unavailable.
+    Just make triton compiler happy.
+    """
+
+    @triton.jit
+    def make_tensor_descriptor(
+        base,
+        shape,
+        strides,
+        block_shape,
+        _builder=None,
+    ):
+        return None
diff --git a/python/sglang/srt/layers/attention/fla/solve_tril.py b/python/sglang/srt/layers/attention/fla/solve_tril.py
new file mode 100644
index 000000000..5c519507d
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/solve_tril.py
@@ -0,0 +1,465 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/solve_tril.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.utils import input_guard
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [1, 2, 4, 8]
+#         for num_stages in [2, 3, 4, 5]
+#     ],
+#     key=["BT"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def solve_tril_16x16_kernel(
+    A,
+    Ad,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A = A + (bos * H + i_h) * BT
+    Ad = Ad + (bos * H + i_h) * 16
+
+    offset = (i_t * 16) % BT
+    p_A = tl.make_block_ptr(
+        A, (T, BT), (H * BT, 1), (i_t * 16, offset), (16, 16), (1, 0)
+    )
+    p_Ai = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16), (1, 0))
+    b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32)
+    b_A = -tl.where(tl.arange(0, 16)[:, None] > tl.arange(0, 16)[None, :], b_A, 0)
+
+    o_i = tl.arange(0, 16)
+    for i in range(1, min(16, T - i_t * 16)):
+        b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset)
+        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0)
+        mask = o_i == i
+        b_A = tl.where(mask[:, None], b_a, b_A)
+    b_A += o_i[:, None] == o_i[None, :]
+    tl.store(
+        p_Ai,
+        b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [1, 2, 4, 8]
+#         for num_stages in [2, 3, 4, 5]
+#     ],
+#     key=["H", "BT", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_32x32_inverse_kernel(
+    A,
+    Ad,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A += (bos * H + i_h) * 32
+    Ad += (bos * H + i_h) * 16
+    Ai += (bos * H + i_h) * 32
+
+    p_A_21 = tl.make_block_ptr(
+        A, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ad_11 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 32, 0), (16, 16), (1, 0)
+    )
+    p_Ad_22 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 32 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ai_11 = tl.make_block_ptr(
+        Ai, (T, 32), (H * 32, 1), (i_t * 32, 0), (16, 16), (1, 0)
+    )
+    p_Ai_22 = tl.make_block_ptr(
+        Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 16), (16, 16), (1, 0)
+    )
+    p_Ai_21 = tl.make_block_ptr(
+        Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0), (16, 16), (1, 0)
+    )
+
+    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
+    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
+    Ai_21 = -tl.dot(
+        tl.dot(Ai_22, A_21, input_precision="ieee"), Ai_11, input_precision="ieee"
+    )
+    tl.store(
+        p_Ai_11,
+        Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_22,
+        Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_21,
+        Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [2, 4, 8]
+#         for num_stages in [2, 3, 4, 5]
+#     ],
+#     key=["H", "BT", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_64x64_inverse_kernel(
+    A,
+    Ad,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A += (bos * H + i_h) * 64
+    Ad += (bos * H + i_h) * 16
+    Ai += (bos * H + i_h) * 64
+
+    p_A_21 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0), (16, 16), (1, 0)
+    )
+    p_A_32 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16), (16, 16), (1, 0)
+    )
+    p_A_31 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0), (16, 16), (1, 0)
+    )
+    p_A_43 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32), (16, 16), (1, 0)
+    )
+    p_A_42 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16), (16, 16), (1, 0)
+    )
+    p_A_41 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0), (16, 16), (1, 0)
+    )
+    p_Ad_11 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64, 0), (16, 16), (1, 0)
+    )
+    p_Ad_22 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ad_33 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64 + 32, 0), (16, 16), (1, 0)
+    )
+    p_Ad_44 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64 + 48, 0), (16, 16), (1, 0)
+    )
+
+    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32)
+    A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32)
+    A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32)
+    A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32)
+    A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32)
+
+    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
+    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
+    Ai_33 = tl.load(p_Ad_33, boundary_check=(0, 1)).to(tl.float32)
+    Ai_44 = tl.load(p_Ad_44, boundary_check=(0, 1)).to(tl.float32)
+
+    Ai_21 = -tl.dot(
+        tl.dot(Ai_22, A_21, input_precision="ieee"), Ai_11, input_precision="ieee"
+    )
+    Ai_32 = -tl.dot(
+        tl.dot(Ai_33, A_32, input_precision="ieee"), Ai_22, input_precision="ieee"
+    )
+    Ai_43 = -tl.dot(
+        tl.dot(Ai_44, A_43, input_precision="ieee"), Ai_33, input_precision="ieee"
+    )
+
+    Ai_31 = -tl.dot(
+        Ai_33,
+        tl.dot(A_31, Ai_11, input_precision="ieee")
+        + tl.dot(A_32, Ai_21, input_precision="ieee"),
+        input_precision="ieee",
+    )
+    Ai_42 = -tl.dot(
+        Ai_44,
+        tl.dot(A_42, Ai_22, input_precision="ieee")
+        + tl.dot(A_43, Ai_32, input_precision="ieee"),
+        input_precision="ieee",
+    )
+    Ai_41 = -tl.dot(
+        Ai_44,
+        tl.dot(A_41, Ai_11, input_precision="ieee")
+        + tl.dot(A_42, Ai_21, input_precision="ieee")
+        + tl.dot(A_43, Ai_31, input_precision="ieee"),
+        input_precision="ieee",
+    )
+
+    p_Ai_11 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 0), (16, 16), (1, 0)
+    )
+    p_Ai_22 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 16), (16, 16), (1, 0)
+    )
+    p_Ai_33 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 32), (16, 16), (1, 0)
+    )
+    p_Ai_44 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 48), (16, 16), (1, 0)
+    )
+    p_Ai_21 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ai_31 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0), (16, 16), (1, 0)
+    )
+    p_Ai_32 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16), (16, 16), (1, 0)
+    )
+    p_Ai_41 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0), (16, 16), (1, 0)
+    )
+    p_Ai_42 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16), (16, 16), (1, 0)
+    )
+    p_Ai_43 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32), (16, 16), (1, 0)
+    )
+    tl.store(
+        p_Ai_11,
+        Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_22,
+        Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_33,
+        Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_44,
+        Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_21,
+        Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_31,
+        Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_32,
+        Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_41,
+        Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_42,
+        Ai_42.to(p_Ai_42.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_43,
+        Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+    fill_zeros = tl.zeros((16, 16), dtype=tl.float32)
+    p_Ai_12 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 16), (16, 16), (1, 0)
+    )
+    p_Ai_13 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 32), (16, 16), (1, 0)
+    )
+    p_Ai_14 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 48), (16, 16), (1, 0)
+    )
+    p_Ai_23 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 32), (16, 16), (1, 0)
+    )
+    p_Ai_24 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 48), (16, 16), (1, 0)
+    )
+    p_Ai_34 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 48), (16, 16), (1, 0)
+    )
+    tl.store(
+        p_Ai_12,
+        fill_zeros.to(p_Ai_12.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_13,
+        fill_zeros.to(p_Ai_13.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_14,
+        fill_zeros.to(p_Ai_14.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_23,
+        fill_zeros.to(p_Ai_23.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_24,
+        fill_zeros.to(p_Ai_24.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_34,
+        fill_zeros.to(p_Ai_34.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+
+@input_guard
+def solve_tril(
+    A: torch.Tensor,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    output_dtype: torch.dtype = torch.float,
+) -> torch.Tensor:
+    """
+    Compute the inverse of the lower triangular matrix
+    A should be strictly lower triangular, i.e., A.triu() == 0.
+
+    Args:
+        A (torch.Tensor):
+            [B, T, H, K]
+        cu_seqlens (torch.Tensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float`
+
+    Returns:
+        (I + A)^-1 with the same shape as A
+    """
+    assert A.shape[-1] in [16, 32, 64]
+
+    B, T, H, BT = A.shape
+    Ad = torch.empty(
+        B, T, H, 16, device=A.device, dtype=torch.float if BT != 16 else output_dtype
+    )
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, 16) if cu_seqlens is not None else None
+    )
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, 16)
+    solve_tril_16x16_kernel[NT, B * H](
+        A=A,
+        Ad=Ad,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        BT=BT,
+        num_warps=1,
+        num_stages=4,
+    )
+    if BT == 16:
+        return Ad
+
+    Ai = torch.empty(B, T, H, BT, device=A.device, dtype=output_dtype)
+    merge_fn = (
+        merge_16x16_to_32x32_inverse_kernel
+        if BT == 32
+        else merge_16x16_to_64x64_inverse_kernel
+    )
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT)
+    merge_fn[NT, B * H](
+        A=A,
+        Ad=Ad,
+        Ai=Ai,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        BT=BT,
+        num_warps=4,
+        num_stages=3,
+    )
+    return Ai
diff --git a/python/sglang/srt/layers/attention/fla/utils.py b/python/sglang/srt/layers/attention/fla/utils.py
new file mode 100644
index 000000000..3caf70de5
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/utils.py
@@ -0,0 +1,331 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/utils.py
+# -*- coding: utf-8 -*-
+
+import contextlib
+import functools
+import logging
+import os
+import sys
+from enum import Enum
+from functools import lru_cache
+from typing import Any, Callable, Dict, Literal, Optional, Tuple
+
+import torch
+import triton
+from packaging import version
+
+logger = logging.getLogger(__name__)
+
+COMPILER_MODE = os.getenv("FLA_COMPILER_MODE") == "1"
+FLA_CI_ENV = os.getenv("FLA_CI_ENV") == "1"
+
+
+@lru_cache(maxsize=1)
+def check_environments():
+    """
+    Checks the current operating system, Triton version, and Python version,
+    issuing warnings if they don't meet recommendations.
+    This function's body only runs once due to lru_cache.
+    """
+    # Check Operating System
+    if sys.platform == "win32":
+        logger.warning(
+            "Detected Windows operating system. Triton does not have an official Windows release, "
+            "thus FLA will not be adapted for Windows, and any potential errors will not be fixed. "
+            "Please consider using a Linux environment for compatibility."
+        )
+
+    triton_version = version.parse(triton.__version__)
+    required_triton_version = version.parse("3.2.0")
+
+    if triton_version < required_triton_version:
+        logger.warning(
+            f"Current Triton version {triton_version} is below the recommended 3.2.0 version. "
+            "Errors may occur and these issues will not be fixed. "
+            "Please consider upgrading Triton."
+        )
+
+    # Check Python version
+    py_version = version.parse(f"{sys.version_info.major}.{sys.version_info.minor}")
+    required_py_version = version.parse("3.11")
+
+    if py_version < required_py_version:
+        logger.warning(
+            f"Current Python version {py_version} is below the recommended 3.11 version. "
+            "It is recommended to upgrade to Python 3.11 or higher for the best experience."
+        )
+
+    return None
+
+
+check_environments()
+
+
+def get_abs_err(x, y):
+    return (x.detach() - y.detach()).flatten().abs().max().item()
+
+
+def get_err_ratio(x, y):
+    err = (x.detach() - y.detach()).flatten().square().mean().sqrt().item()
+    base = (x.detach()).flatten().square().mean().sqrt().item()
+    return err / (base + 1e-8)
+
+
+def assert_close(prefix, ref, tri, ratio, warning=False, err_atol=1e-6):
+    abs_atol = get_abs_err(ref, tri)
+    msg = f"{prefix} diff: {abs_atol:.6f} ratio: {get_err_ratio(ref, tri):.6f}"
+    logger.info(msg)
+    error_rate = get_err_ratio(ref, tri)
+    if abs_atol <= err_atol:
+        return
+    if warning or (FLA_CI_ENV and (error_rate < 0.01 or abs_atol <= 0.3)):
+        if error_rate > ratio:
+            import warnings
+
+            warnings.warn(msg)
+    else:
+        assert error_rate < ratio, msg
+
+
+SUPPRESS_LEVEL = int(os.getenv("GDN_RECOMPUTE_SUPPRESS_LEVEL", "0"))
+
+
+def tensor_cache(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator that caches the most recent results of a function with tensor inputs.
+    This decorator will store the output of the decorated function for the most recent set of input tensors.
+    The cache is limited to a fixed size (default is 4). When the cache is full, the oldest entry will be removed.
+    Args:
+        fn (Callable[..., torch.Tensor]):
+            The function to be decorated. It should take tensor inputs and return tensor outputs.
+    Returns:
+        Callable[..., torch.Tensor]:
+            A wrapped version of the input function with single-entry caching.
+    """
+
+    cache_entries: Tuple[Optional[Tuple], Optional[Dict], Any] = []
+    cache_size = 4
+
+    @functools.wraps(fn)
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        nonlocal cache_entries, cache_size
+        for i, entry in enumerate(cache_entries):
+            last_args, last_kwargs, last_result = entry
+            if len(args) == len(last_args) and len(kwargs) == len(last_kwargs):
+                if all(a is b for a, b in zip(args, last_args)) and all(
+                    k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()
+                ):
+                    cache_entries = (
+                        cache_entries[:i]
+                        + cache_entries[i + 1 :]
+                        + [(args, kwargs, last_result)]
+                    )
+                    return last_result
+
+        result = fn(*args, **kwargs)
+
+        if len(cache_entries) >= cache_size:
+            cache_entries = cache_entries[1:]
+        cache_entries.append((args, kwargs, result))
+        return result
+
+    return wrapper
+
+
+def input_guard(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator to make sure all input tensors are contiguous and set the device based on input tensors.
+    """
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        contiguous_args = (
+            i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args
+        )
+        contiguous_kwargs = {
+            k: (v if not isinstance(v, torch.Tensor) else v.contiguous())
+            for k, v in kwargs.items()
+        }
+
+        tensor = None
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                tensor = arg
+                break
+        if tensor is None:
+            for value in kwargs.values():
+                if isinstance(value, torch.Tensor):
+                    tensor = value
+                    break
+
+        if tensor is not None:
+            ctx = custom_device_ctx(tensor.device.index)
+        else:
+            ctx = contextlib.nullcontext()
+
+        with ctx:
+            return fn(*contiguous_args, **contiguous_kwargs)
+
+    return wrapper
+
+
+contiguous = input_guard
+
+
+def require_version(version, hint):
+    """
+    Perform a runtime check of the dependency versions, using the exact same syntax used by pip.
+    """
+
+    def decorator(fn):
+        @functools.wraps(fn)
+        def wrapper(ctx, *args, **kwargs):
+            from transformers.utils.versions import require_version
+
+            require_version(version, hint)
+            return fn(
+                ctx,
+                *(
+                    i if not isinstance(i, torch.Tensor) else i.contiguous()
+                    for i in args
+                ),
+                **{
+                    k: (v if not isinstance(v, torch.Tensor) else v.contiguous())
+                    for k, v in kwargs.items()
+                },
+            )
+
+        return wrapper
+
+    return decorator
+
+
+def checkpoint(fn):
+    def wrapper(*args, **kwargs):
+        return torch.utils.checkpoint.checkpoint(fn, *args, **kwargs)
+
+    return wrapper
+
+
+@lru_cache(maxsize=None)
+def check_pytorch_version(version_s: str = "2.4") -> bool:
+    return version.parse(torch.__version__) >= version.parse(version_s)
+
+
+def _cpu_device_warning():
+    import warnings
+
+    warnings.warn(
+        ("Triton is not supported on current platform, roll back to CPU."), stacklevel=1
+    )
+
+
+@lru_cache(maxsize=None)
+def get_multiprocessor_count(tensor_idx: int = 0) -> int:
+    try:
+        return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)[
+            "multiprocessor_count"
+        ]
+    except BaseException:
+        _cpu_device_warning()
+        return -1
+
+
+@lru_cache(maxsize=None)
+def get_available_device() -> str:
+    try:
+        return triton.runtime.driver.active.get_current_target().backend
+    except BaseException:
+        _cpu_device_warning()
+        return "cpu"
+
+
+@lru_cache(maxsize=None)
+def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]:
+    device = get_available_device()
+    if device == "cuda":
+        return "nvidia"
+    elif device == "hip":
+        return "amd"
+    elif device == "xpu":
+        return "intel"
+    else:
+        return device
+
+
+# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
+# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
+# Therefore, we need to check the triton backend to determine the actual GPU vendor.
+device = get_available_device() if get_available_device() != "hip" else "cuda"
+device_torch_lib = getattr(torch, device)
+device_platform = _check_platform()
+
+is_amd = device_platform == "amd"
+is_intel = device_platform == "intel"
+is_nvidia = device_platform == "nvidia"
+is_intel_alchemist = is_intel and "Intel(R) Arc(TM) A" in torch.xpu.get_device_name(0)
+is_nvidia_hopper = is_nvidia and (
+    "NVIDIA H" in torch.cuda.get_device_name(0)
+    or torch.cuda.get_device_capability()[0] >= 9
+)
+use_cuda_graph = is_nvidia and os.environ.get("FLA_USE_CUDA_GRAPH", "0") == "1"
+
+# Nvidia Ampere or newer, haven't check AMD and intel yet.
+is_tf32_supported = is_nvidia and torch.cuda.get_device_capability(0)[0] >= 8
+is_gather_supported = hasattr(triton.language, "gather")
+
+
+def get_all_max_shared_mem():
+    try:
+        return [
+            triton.runtime.driver.active.utils.get_device_properties(i)[
+                "max_shared_mem"
+            ]
+            for i in range(device_torch_lib.device_count())
+        ]
+    except BaseException:
+        _cpu_device_warning()
+        return [-1]
+
+
+class Backend(Enum):
+    ADA = 101376  # RTX 4090
+    AMPERE = 166912  # A100
+    HOPPER = 232448  # H100
+    DEFAULT = 102400  # Default
+
+    @classmethod
+    def get_shared_memory(cls, arch: str) -> int:
+        try:
+            return cls[arch.upper()].value
+        except KeyError:
+            return cls.DEFAULT.value
+
+
+@lru_cache(maxsize=None)
+def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool:
+    try:
+        device_shared_mem_list = get_all_max_shared_mem()
+        max_shared_memory = device_shared_mem_list[tensor_idx]
+        return max_shared_memory >= Backend.get_shared_memory(arch)
+    except Exception:
+        return False
+
+
+if check_pytorch_version("2.4"):
+    device = "cuda" if device == "cpu" else device
+    autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=device)
+    autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=device)
+
+    def custom_device_ctx(index: int):
+        return device_torch_lib.device(index)
+
+else:
+    assert (
+        device == "cuda"
+    ), "Only cuda device is supported for PyTorch version < 2.4.0."
+    autocast_custom_fwd = device_torch_lib.amp.custom_fwd
+    autocast_custom_bwd = device_torch_lib.amp.custom_bwd
+
+    def custom_device_ctx(index: int):
+        return torch.cuda.device(index)
diff --git a/python/sglang/srt/layers/attention/fla/wy_fast.py b/python/sglang/srt/layers/attention/fla/wy_fast.py
new file mode 100644
index 000000000..d51500eb4
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fla/wy_fast.py
@@ -0,0 +1,158 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/gated_delta_rule/wy_fast.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.op import safe_exp
+from sglang.srt.layers.attention.fla.utils import check_shared_mem
+
+
+@triton.heuristics({"IS_VARLEN": lambda args: args["cu_seqlens"] is not None})
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [2, 4, 8]
+#         for num_stages in [2, 3, 4]
+#     ],
+#     key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def recompute_w_u_fwd_kernel(
+    k,
+    v,
+    beta,
+    w,
+    u,
+    A,
+    g,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    p_beta = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+    )
+    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,))
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    b_beta = tl.load(p_beta, boundary_check=(0,))
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+    b_g = tl.exp(tl.load(p_g, boundary_check=(0,)))
+
+    for i_v in range(tl.cdiv(V, BV)):
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_u = tl.make_block_ptr(
+            u + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_vb = (b_v * b_beta[:, None]).to(b_v.dtype)
+        b_u = tl.dot(b_A, b_vb, allow_tf32=False)
+        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_w = tl.make_block_ptr(
+            w + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype)
+        b_w = tl.dot(b_A, b_kb)
+        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+
+def recompute_w_u_fwd(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    beta: torch.Tensor,
+    g_cumsum: torch.Tensor,
+    A: torch.Tensor,
+    cu_seqlens: Optional[torch.LongTensor],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    B, T, Hg, K, V = *k.shape, v.shape[-1]
+    H = v.shape[-2]
+    BT = A.shape[-1]
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    BK = 64
+    BV = 64
+    u = torch.empty_like(v)
+    w = k.new_empty(B, T, H, K)
+    recompute_w_u_fwd_kernel[(NT, B * H)](
+        k=k,
+        v=v,
+        beta=beta,
+        w=w,
+        u=u,
+        A=A,
+        g=g_cumsum,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+        num_warps=4,
+        num_stages=3,
+    )
+    return w, u
+
+
+fwd_recompute_w_u = recompute_w_u_fwd
diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py
new file mode 100644
index 000000000..f7ca5e203
--- /dev/null
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -0,0 +1,2365 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.mem_cache.memory_pool import SWAKVPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+from sgl_kernel import merge_state_v2
+from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+
+
+@dataclass
+class FlashAttentionMetadata:
+    """Metadata to be init once in the model forward pass,
+    each layer's forward pass can reuse the metadata.
+
+    For each init metadata function, we will try set up them in below order
+    """
+
+    # Sequence lengths for the forward batch
+    cache_seqlens_int32: torch.Tensor = None
+    # Maximum sequence length for query
+    max_seq_len_q: int = 1
+    # Maximum sequence length for key
+    max_seq_len_k: int = 0
+    # Cumulative sequence lengths for query
+    cu_seqlens_q: torch.Tensor = None
+    # Cumulative sequence lengths for key
+    cu_seqlens_k: torch.Tensor = None
+    # Window size (typically used by Gemma)
+    window_size: tuple = (-1, -1)
+    # Page table, the index of KV Cache Tables/Blocks
+    page_table: torch.Tensor = None
+
+    # Encoder metadata
+    # Cumulative sequence lengths for encoder key
+    encoder_cu_seqlens_k: torch.Tensor = None
+    # Maximum sequence length for encoder key
+    encoder_max_seq_len_k: int = 0
+    # Sequence lengths for the forward batch
+    encoder_lens_int32: torch.Tensor = None
+    # Page table for the encoder
+    encoder_page_table: torch.Tensor = None
+
+    @dataclass
+    class LocalAttentionMetadata:
+        local_query_start_loc: torch.Tensor = None  # cu_seqlens_q for local attention
+        local_seqused_k: torch.Tensor = None  # sequence lengths for local attention
+        local_block_table: torch.Tensor = None  # block table for local attention
+        local_max_query_len: int = 0  # max query length for local attention
+        local_max_seq_len: int = 0  # max sequence length for local attention
+
+    local_attn_metadata: Optional[LocalAttentionMetadata] = None
+
+    # For sliding window attention topk>1 spec decoding
+    swa_spec_metadata: Optional[FlashAttentionMetadata] = None
+
+
+# Copied from:
+# https://github.com/houseroad/vllm/blob/4e45bfcaf928bdb9bd952b4ac922a3c205589ae8/vllm/v1/attention/backends/flash_attn.py
+#
+# Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into
+# local attention blocks, where each block is passed to the attention kernel
+# as an independent local ("virtual") batch item.
+#
+# For example, if are performing a chunked prefill a batch of 3 sequences:
+#   q_seqlens  = [4, 10, 5]
+#   kv_seqlens = [6, 17, 9]
+# Then normally for regular attention we would compute with an attention mask
+#  for batch idx 0 (q_seqlens = 4, kv_seqlens = 6) like:
+#   batch idx: 0 (q_seqlens = 4, kv_seqlens = 6)
+#        k_toks >   0 1 2 3 4 5
+#        q_toks v  _____________
+#               0 | 1 1 1
+#               1 | 1 1 1 1
+#               2 | 1 1 1 1 1
+#               3 | 1 1 1 1 1 1
+#
+# for local attention (with attn_chunk_size = 4) we would compute with an
+#  attention mask like:
+#   batch idx: 0  (q_seqlens = 4, kv_seqlens = 6, attn_chunk_size = 4)
+#        k_toks >   0 1 2 3 4 5
+#        q_toks v  _____________
+#               0 | 1 1 1
+#               1 | 1 1 1 1
+#               2 |         1
+#               3 |         1 1
+#
+# We can simulate this mask using standard flash-attention by breaking the
+#  sequences into local ("virtual") batches, where each local batch item is a
+#  local attention block, so in this case batch idx 0 would be broken up into:
+#
+#   local-batch idx: 0 (q_seqlens = 2, kv_seqlens = 4)  (batch 0)
+#        k_toks >   0 1 2 3
+#        q_toks v  _____________
+#               0 | 1 1 1
+#               1 | 1 1 1 1
+#   local-batch idx: 1 (q_seqlens = 2, kv_seqlens = 2) (batch 0)
+#        k_toks >   4 5
+#        q_toks v  _____________
+#               2 | 1
+#               3 | 1 1
+#
+# e.g. if we have:
+#   attn_chunk_size = 4
+#   query_start_loc_np = [0, 4, 14, 19] (q_seqlens = [4, 10, 5])
+# Then this function would return:
+#                           __b0__  ______b1______  __b2__ < orig batch indices
+#   q_seqlens_local    = [   2,  2,  1,  4,  4,  1,  4,  1]
+#   cu_seqlens_q_local = [0, 4,  6, 10, 14, 18, 19, 23, 24]
+#   seqlens_k_local    = [   4,  2,  4,  4,  4,  1,  4,  1]
+#   block_table_local  : shape[local_virtual_batches, pages_per_local_batch]
+def make_local_attention_virtual_batches(
+    attn_chunk_size: int,
+    query_start_loc_np: np.ndarray,
+    seq_lens_np: np.ndarray,
+    block_table: torch.Tensor,
+    page_size: int = 0,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, torch.Tensor]:
+    """
+    Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into
+    local attention blocks, where each block is passed to the attention kernel
+    as an independent local ("virtual") batch item.
+
+    Args:
+        attn_chunk_size: Size of local attention chunks
+        query_start_loc_np: Cumulative sum of query lengths (numpy array)
+        seq_lens_np: Sequence lengths (numpy array)
+        block_table: Block table for KV cache
+        page_size: Size of each page in the KV cache
+
+    Returns:
+        seqlens_q_local: Query sequence lengths for local attention
+        cu_seqlens_q_local: Cumulative sum of query sequence lengths for local attention
+        seqlens_k_local: Key sequence lengths for local attention
+        block_table_local: Block table for local attention
+    """
+    # Adjust attention_chunk_size based on the actual sequence length
+    # to avoid index out of bounds errors
+    max_seq_len = seq_lens_np.max()
+    effective_chunk_size = min(attn_chunk_size, max_seq_len)
+    # Make sure effective_chunk_size is divisible by page_size
+    effective_chunk_size = (effective_chunk_size // page_size) * page_size
+    if effective_chunk_size < page_size:
+        effective_chunk_size = page_size
+    attn_chunk_size = effective_chunk_size
+
+    q_seqlens = query_start_loc_np[1:] - query_start_loc_np[:-1]
+    actual_batch_size = seq_lens_np.shape[0]
+
+    # Handle if we are starting in the middle of a local attention block,
+    #  we assume q_seqlens > 0 (for all elements), for each batch idx we compute
+    #  the number of tokens that are not in the first local attention block and
+    #  then we can simply use a cdiv for the rest.
+    # For example if we have:
+    #   attn_chunk_size = 4
+    #   q_seqlens = [4, 10, 5]
+    #   k_seqlens = [6, 17, 9]
+    # Then we would get:
+    #   new_tokens_in_first_block = [2, 1, 4]
+    #   local_blocks = [2, 4, 2]
+    q_tokens_in_first_block = np.minimum(
+        attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size), q_seqlens
+    ).astype(np.int32)
+    tokens_in_last_block = attn_chunk_size + (seq_lens_np % -attn_chunk_size)
+    local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block, attn_chunk_size)
+
+    # Once we know the number of local blocks we can compute the request spans
+    #  for each batch idx, we can figure out the number of "virtual" requests we
+    #  have to make,
+    # For the above example we would get:
+    #   seqlens_q_local = [2, 2, 1, 4, 4, 1, 4, 1]
+    #
+    # First Get batched arange. (E.g., [2, 4, 2] -> [0, 1, 0, 1, 2, 3, 0, 1])
+    #   (TODO: max a utility to share this code with _prepare_inputs)
+    # arange step 1. [2, 4, 2] -> [2, 6, 8]
+    cu_num_blocks = np.cumsum(local_blocks)
+    virtual_batches = cu_num_blocks[-1]
+    # arange step 2. [2, 6, 8] -> [0, 0, 2, 2, 2, 2, 6, 6]
+    block_offsets = np.repeat(cu_num_blocks - local_blocks, local_blocks)
+    # arange step 3. [0, 1, 0, 1, 2, 3, 0, 1]
+    arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
+    # also compute reverse arange (i.e. [1, 0, 3, 2, 1, 0, 1, 0])
+    rarange = np.repeat(local_blocks, local_blocks) - arange - 1
+    # Then we can compute the seqlens_q_local, handling the fact that the
+    #  first and last blocks could be partial
+    seqlens_q_local = np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks)
+    # set the first block since this may be a partial block
+    seqlens_q_local[arange == 0] = q_tokens_in_first_block
+    # set the remaining blocks
+    seqlens_q_local[arange > 0] = np.minimum(
+        seqlens_q_local - attn_chunk_size * (arange - 1), attn_chunk_size
+    )[arange > 0]
+
+    # convert from q_seqlens to cu_seqlens_q
+    cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0)).astype(np.int32)
+
+    # compute the seqlens_k_local,
+    #  basically a full local attention block for all but the last block in each
+    #  batch
+    # For our example this will be:
+    #   seqlens_k_local = [4, 2, 4, 4, 4, 1, 4, 1]
+    seqlens_k_local = np.full(cu_num_blocks[-1], attn_chunk_size, dtype=np.int32)
+    seqlens_k_local[cu_num_blocks - 1] = tokens_in_last_block
+
+    k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - (
+        rarange * attn_chunk_size + np.repeat(tokens_in_last_block, local_blocks)
+    )
+    # For the example the local attention blocks start at:
+    #                           _b0_  _____b1_____  _b2_
+    #   k_seqstarts_absolute = [0, 4, 4, 8, 12, 16, 4, 8]
+    block_starts = k_seqstarts_absolute // page_size
+
+    assert attn_chunk_size % page_size == 0, (
+        f"attn_chunk_size {attn_chunk_size} is not "
+        f"divisible by page_size {page_size}"
+    )
+    pages_per_local_batch = attn_chunk_size // page_size
+
+    # Create a block_table for the local attention blocks
+    # For out example if we have a block-table like (assuming page_size=2):
+    #   block_table = [
+    #     [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],  < batch 0
+    #     [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],  < batch 1
+    #     [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],  < batch 2
+    #   ]
+    # Then for the local batches we would want a block-table like
+    #   block_table_local = [
+    #     [  0,  1 ], < local-batch 0, (batch 0, starting from k[0])
+    #     [  2,  3 ], < local-batch 1, (batch 0, starting from k[4])
+    #     [ 12, 13 ], < local-batch 2, (batch 1, starting from k[4])
+    #     [ 14, 15 ], < local-batch 3, (batch 1, starting from k[8])
+    #     [ 16, 17 ], < local-batch 4, (batch 1, starting from k[12])
+    #     [ 18, 19 ], < local-batch 5, (batch 1, starting from k[16])
+    #     [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4])
+    #     [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8])
+    #   ]
+    block_indices = np.broadcast_to(
+        np.arange(pages_per_local_batch, dtype=np.int32),
+        (virtual_batches, pages_per_local_batch),
+    ) + np.expand_dims(block_starts, axis=1)
+    # Ensure block_indices doesn't exceed block_table dimensions
+    # This is a critical safety check that prevents index out of bounds errors
+    # when dealing with large sequences (>8192 tokens) or when the block_table
+    # dimensions are smaller than what would be needed for the full attention chunk size.
+    block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1)
+    batch_indices = np.repeat(
+        np.arange(actual_batch_size, dtype=np.int32),
+        local_blocks * pages_per_local_batch,
+    )
+    block_table_local = block_table[batch_indices, block_indices].view(
+        virtual_batches, -1
+    )
+
+    return seqlens_q_local, cu_seqlens_q_local, seqlens_k_local, block_table_local
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+# TODO(hebiao064): remove this once we have a better way to handle the merge_state_v2 torch.compile issue
+@torch._dynamo.disable()
+def merge_state_v2_wrapper(o, s_a, o_exp, s_b):
+    return merge_state_v2(o, s_a, o_exp, s_b)
+
+
+class FlashAttentionBackend(AttentionBackend):
+    """FlashAttention backend implementation.
+
+    Note about the init:
+    - If no spec decoding
+        - FlashAttentionBackend will be init once when the server starts.
+    - If spec decoding
+        - FlashAttentionBackend will be init once for the target worker
+        - FlashAttentionMultiStepBackend will be once for the draft worker
+            - It will spawn num_steps FlashAttentionBackend for the draft worker
+
+    Note about CUDA Graph:
+    - We only support CUDA Graph for Decode (Normal Decode and Draft Decode) and Target Verify.
+    - We don't support CUDA Graph for Extend and Draft Extend.
+    - When server init, init_cuda_graph_state will be called first and then init_cuda_graph_capture will be called.
+    - For each forward batch, init_replay_cuda_graph will be called first and then replay the graph.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        speculative_step_id=0,
+        topk=0,
+        speculative_num_steps=0,
+    ):
+        super().__init__()
+
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        self.forward_metadata: FlashAttentionMetadata = None
+        # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
+        self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.decode_cuda_graph_metadata = {}
+        self.target_verify_metadata = {}
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+        self.kv_cache_dtype_str = model_runner.server_args.kv_cache_dtype
+        self.page_size = model_runner.page_size
+        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+        self.skip_prefill = skip_prefill
+        self.is_hybrid = model_runner.is_hybrid
+        if self.is_hybrid:
+            self.full_to_swa_index_mapping = (
+                model_runner.token_to_kv_pool.full_to_swa_index_mapping
+            )
+        self.topk = model_runner.server_args.speculative_eagle_topk or 0
+        self.speculative_num_steps = speculative_num_steps
+        self.speculative_num_draft_tokens = (
+            model_runner.server_args.speculative_num_draft_tokens
+        )
+        self.speculative_step_id = speculative_step_id
+
+        # Local attention settings
+        self.attention_chunk_size = (
+            model_runner.attention_chunk_size
+            if hasattr(model_runner, "attention_chunk_size")
+            else None
+        )
+
+        # For each layer, the sliding_window_size can be different. This is only used for preparing SWA metadata.
+        # We use `layer.sliding_window_size` to decide whether to use SWA for each layer.
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.has_swa = (
+            self.sliding_window_size is not None and self.sliding_window_size > -1
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize forward metadata hence all layers in the forward pass can reuse it."""
+        metadata = FlashAttentionMetadata()
+        seqlens_in_batch = forward_batch.seq_lens
+        batch_size = forward_batch.batch_size
+        device = seqlens_in_batch.device
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            # Draft Decode
+            if forward_batch.spec_info is not None:
+                if self.topk <= 1:
+                    metadata.cache_seqlens_int32 = (
+                        seqlens_in_batch + (self.speculative_step_id + 1)
+                    ).to(torch.int32)
+                    metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item() + (
+                        self.speculative_step_id + 1
+                    )
+                    metadata.cu_seqlens_q = torch.arange(
+                        0, batch_size + 1, dtype=torch.int32, device=device
+                    )
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                    ]
+                else:
+                    metadata.cache_seqlens_int32 = (seqlens_in_batch).to(torch.int32)
+                    metadata.max_seq_len_q = self.topk
+                    metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                    metadata.cu_seqlens_q = torch.arange(
+                        0,
+                        batch_size * self.topk + 1,
+                        step=self.topk,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                    ]
+
+                    metadata_expand = FlashAttentionMetadata()
+                    decode_length = self.speculative_step_id + 1
+                    metadata_expand.cache_seqlens_int32 = torch.full(
+                        (seqlens_in_batch.numel() * self.topk,),
+                        decode_length,
+                        device=device,
+                        dtype=torch.int32,
+                    )
+                    metadata_expand.max_seq_len_q = 1
+                    metadata_expand.cu_seqlens_q = torch.arange(
+                        0,
+                        metadata_expand.cache_seqlens_int32.numel() + 1,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    metadata_expand.cu_seqlens_k = torch.arange(
+                        0,
+                        metadata_expand.cache_seqlens_int32.numel() * decode_length + 1,
+                        step=decode_length,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                    cache_loc = forward_batch.out_cache_loc.view(
+                        -1, self.speculative_num_steps
+                    )
+                    metadata_expand.page_table = (
+                        cache_loc[:, :decode_length].contiguous().to(torch.int32)
+                    )
+                    self.forward_metadata_spec_decode_expand = metadata_expand
+            else:
+                # Normal Decode
+                metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+            # TODO: we need to test this part for llama 4 eagle case
+            self._init_local_attn_metadata(forward_batch, metadata, device)
+        elif forward_batch.forward_mode.is_target_verify():
+            if self.topk <= 1:
+                metadata.cache_seqlens_int32 = (
+                    forward_batch.seq_lens + self.speculative_num_draft_tokens
+                ).to(torch.int32)
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = (
+                    forward_batch.seq_lens_cpu.max().item()
+                    + self.speculative_num_draft_tokens
+                )
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    batch_size * self.speculative_num_draft_tokens + 1,
+                    self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+
+                self._init_local_attn_metadata(forward_batch, metadata, device)
+            else:
+                metadata.cache_seqlens_int32 = forward_batch.seq_lens.to(torch.int32)
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    batch_size * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+
+                metadata_expand = FlashAttentionMetadata()
+
+                metadata_expand.max_seq_len_q = 1
+                metadata_expand.cu_seqlens_q = torch.arange(
+                    0,
+                    forward_batch.seq_lens.numel() * self.speculative_num_draft_tokens
+                    + 1,
+                    dtype=torch.int32,
+                    device=device,
+                )
+
+                # create expand page table
+                offsets = torch.arange(
+                    self.speculative_num_draft_tokens, device=device
+                ).unsqueeze(
+                    0
+                )  # shape: (1, self.speculative_num_draft_tokens)
+                cols = offsets.expand(
+                    forward_batch.seq_lens.numel(), -1
+                ) + forward_batch.seq_lens.unsqueeze(1)
+                cum_len = torch.nn.functional.pad(
+                    torch.cumsum(
+                        (
+                            forward_batch.seq_lens + self.speculative_num_draft_tokens
+                        ).repeat_interleave(self.speculative_num_draft_tokens),
+                        dim=0,
+                    ),
+                    (1, 0),
+                )[:-1]
+                mask_extraction_indices = (
+                    cols.repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                    + cum_len[:, None]
+                ).view(1, -1)
+                mask = forward_batch.spec_info.custom_mask[
+                    mask_extraction_indices
+                ].view(
+                    -1, self.speculative_num_draft_tokens
+                )  # (bsz * draft_num, draft_num)
+
+                # shift table indices to avoid padding
+                # non_masked_page_table [[8, 9, 10],   mask (display with int format) [[1, 0, 0],
+                #                        [8, 9, 10],                                   [1, 1, 0],
+                #                        [8, 9, 10]]                                   [1, 0, 1]]
+                # if masked with padding [[8, 0, 0],   our mask without padding       [[8, 9, 10],
+                #                        [8, 9, 0],                                    [8, 9, 10],
+                #                        [8, 0, 10]]                                   [8, 10, 9]]
+                # note here cache_seqlens_int32 is [1, 2, 2] so extra page indices will be ignored in each row
+                col_indices = offsets.expand(
+                    mask.shape[0], self.speculative_num_draft_tokens
+                )
+                # Build keys: if an entry is valid (mask==True), keep its original index;
+                # if not, add self.speculative_num_draft_tokens so that it sorts after all valid entries.
+                keys = torch.where(
+                    mask, col_indices, col_indices + self.speculative_num_draft_tokens
+                )
+                _, sort_order = torch.sort(keys, dim=1)
+                non_masked_page_table = (
+                    forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, :
+                    ]
+                    .gather(1, cols)
+                    .repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                )  # (bsz, draft_num)
+                metadata_expand.page_table = non_masked_page_table.gather(1, sort_order)
+                metadata_expand.cache_seqlens_int32 = mask.sum(dim=1).to(torch.int32)
+                metadata_expand.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata_expand.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                self.forward_metadata_spec_decode_expand = metadata_expand
+
+                if self.has_swa:
+                    self._init_sliding_window_attn_spec_metadata(
+                        metadata, metadata_expand
+                    )
+
+        elif forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed():
+            metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+            metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+            )
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.max_seq_len_k
+            ]
+
+            if (
+                any(forward_batch.extend_prefix_lens_cpu)
+                or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
+            ):
+                extend_seq_lens = forward_batch.extend_seq_lens
+                metadata.max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
+                metadata.cu_seqlens_q = torch.nn.functional.pad(
+                    torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+            else:
+                metadata.max_seq_len_q = metadata.max_seq_len_k
+                metadata.cu_seqlens_q = metadata.cu_seqlens_k
+
+            # Setup local attention if enabled
+            if forward_batch.forward_mode == ForwardMode.EXTEND:
+                self._init_local_attn_metadata(forward_batch, metadata, device)
+
+        # Encoder metadata for cross attention
+        if forward_batch.encoder_lens is not None:
+            assert (
+                forward_batch.encoder_lens.numel() == 1
+            ), "Only encoder size 1 is supported for now"
+
+            metadata.encoder_lens_int32 = forward_batch.encoder_lens.to(torch.int32)
+            metadata.encoder_cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(metadata.encoder_lens_int32, dim=0, dtype=torch.int32),
+                (1, 0),
+            )
+            metadata.encoder_max_seq_len_k = metadata.encoder_lens_int32.max().item()
+            metadata.encoder_page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.encoder_max_seq_len_k
+            ]
+
+            # Currently only support forward_batch.encoder_lens.numel() == 1
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices,
+                metadata.encoder_max_seq_len_k : (
+                    metadata.encoder_max_seq_len_k + metadata.max_seq_len_k
+                ),
+            ]
+
+        # Convert the page table to a strided format which is needed by FA3 API
+        if self.page_size > 1:
+            self.strided_indices = torch.arange(
+                0, metadata.page_table.shape[1], self.page_size, device=self.device
+            )
+            metadata.page_table = (
+                metadata.page_table[:, self.strided_indices] // self.page_size
+            )
+
+        self.forward_metadata = metadata
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        sinks: Optional[torch.Tensor] = None,
+    ):
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        # Calculate window size (can be moved to metadata if layer properties don't change)
+        # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
+        # here is two side inclusive
+        is_swa = (
+            layer.sliding_window_size is not None and layer.sliding_window_size > -1
+        )
+        window_size = (layer.sliding_window_size, 0) if is_swa else (-1, -1)
+        k_descale, v_descale = None, None
+        # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
+        # has corresponding quantization method so that layer.k_scale is not None,
+        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
+        if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
+            if layer.k_scale is not None:
+                descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
+                k_descale = layer.k_scale.expand(descale_shape)
+                v_descale = layer.v_scale.expand(descale_shape)
+            q = q.to(self.kv_cache_dtype)
+            q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
+        causal = not layer.is_cross_attention
+
+        # Check if we should use local attention
+        use_local_attn = (
+            self.attention_chunk_size is not None
+            and metadata.local_attn_metadata is not None
+            and (hasattr(layer, "use_irope") and layer.use_irope)
+        )
+
+        # We do cascade attention for Target Verify with topk > 1
+        # We don't use cascade attention for Sliding Window Attention:
+        # - Different window sizes should be passed in for each q in the first stage of cascade attention, but FA3 interface doesn't support pass in a list of window sizes.
+        # - The overhead of duplicated computation of the common prefix part is small for sliding window layers (seq_len <= window_size), so we can just expand it.
+        use_cascade_attn = (
+            forward_batch.forward_mode.is_target_verify()
+            and self.topk > 1
+            and not is_swa
+        )
+
+        # For fa3 interface version compatibility, we put new fields into conditional keyword args
+        kwargs = {}
+        if sinks is not None:
+            kwargs["sinks"] = sinks
+
+        # Get the appropriate page table based on whether we're using local attention
+        if use_local_attn:
+            local_metadata = metadata.local_attn_metadata
+            page_table = local_metadata.local_block_table
+            cu_seqlens_q = local_metadata.local_query_start_loc
+            cache_seqlens = local_metadata.local_seqused_k
+            max_seqlen_q = local_metadata.local_max_query_len
+        elif is_swa and metadata.swa_spec_metadata is not None:
+            swa_spec_metadata = metadata.swa_spec_metadata
+            page_table = swa_spec_metadata.page_table
+            cu_seqlens_q = swa_spec_metadata.cu_seqlens_q
+            cache_seqlens = swa_spec_metadata.cache_seqlens_int32
+            max_seqlen_q = swa_spec_metadata.max_seq_len_q
+            cu_seqlens_k = swa_spec_metadata.cu_seqlens_k
+        else:
+            page_table = metadata.page_table
+            cu_seqlens_q = metadata.cu_seqlens_q
+            cache_seqlens = metadata.cache_seqlens_int32
+            max_seqlen_q = metadata.max_seq_len_q
+            cu_seqlens_k = metadata.cu_seqlens_k
+
+        # Use Flash Attention for prefill
+        if not self.use_mla:
+            # Do multi-head attention
+            key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+            )
+            if layer.is_cross_attention:
+                page_table = metadata.encoder_page_table
+                cache_seqlens = metadata.encoder_lens_int32
+                cu_seqlens_k = metadata.encoder_cu_seqlens_k
+                window_size = (-1, -1)
+
+            result = flash_attn_with_kvcache(
+                q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                k_cache=key_cache,
+                v_cache=value_cache,
+                page_table=page_table,
+                cache_seqlens=cache_seqlens,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
+                max_seqlen_q=max_seqlen_q,
+                softmax_scale=layer.scaling,
+                causal=False if use_cascade_attn else causal,
+                window_size=window_size,
+                softcap=layer.logit_cap,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                return_softmax_lse=use_cascade_attn,
+                **kwargs,
+            )
+
+            if use_cascade_attn:
+                o, softmax_lse, *rest = result
+                o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=self.forward_metadata_spec_decode_expand.page_table,
+                    cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                    cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                    cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                    max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=True,
+                    **kwargs,
+                )
+                o, _ = merge_state_v2_wrapper(
+                    o,
+                    softmax_lse.T.contiguous(),
+                    o_expand,
+                    softmax_lse_expand.T.contiguous(),
+                )
+            else:
+                o = result
+        else:
+            if (
+                forward_batch.attn_attend_prefix_cache is not None
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+            ):
+                # Do multi-head attention with chunked prefix cache
+                if forward_batch.attn_attend_prefix_cache:
+                    assert not global_server_args_dict["disable_chunked_prefix_cache"]
+                    # MHA for chunked prefix kv cache when running model with MLA
+                    assert forward_batch.prefix_chunk_idx is not None
+                    assert forward_batch.prefix_chunk_cu_seq_lens is not None
+                    assert forward_batch.prefix_chunk_max_seq_lens is not None
+
+                    chunk_idx = forward_batch.prefix_chunk_idx
+                    assert chunk_idx >= 0
+
+                    assert forward_batch.mha_return_lse
+                    output = flash_attn_varlen_func(
+                        q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                        v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                        cu_seqlens_q=metadata.cu_seqlens_q,
+                        cu_seqlens_k=forward_batch.prefix_chunk_cu_seq_lens[chunk_idx],
+                        max_seqlen_q=metadata.max_seq_len_q,
+                        max_seqlen_k=forward_batch.prefix_chunk_max_seq_lens[chunk_idx],
+                        softmax_scale=layer.scaling,
+                        causal=False,
+                        return_softmax_lse=True,
+                    )
+                else:
+                    # MHA for extend part of sequence without attending prefix kv cache
+                    output = flash_attn_varlen_func(
+                        q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                        v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                        cu_seqlens_q=metadata.cu_seqlens_q,
+                        cu_seqlens_k=metadata.cu_seqlens_q,
+                        max_seqlen_q=metadata.max_seq_len_q,
+                        max_seqlen_k=metadata.max_seq_len_q,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                        return_softmax_lse=forward_batch.mha_return_lse,
+                    )
+                if forward_batch.mha_return_lse:
+                    output, lse, *rest = output
+                    lse = torch.transpose(lse, 0, 1).contiguous()
+                    return output, lse
+                return output
+            else:
+                # Do absorbed multi-latent attention
+                kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).to(q.dtype)
+                k_rope = kv_cache[:, :, layer.v_head_dim :]
+                c_kv = kv_cache[:, :, : layer.v_head_dim]
+                k_rope_cache = k_rope.view(
+                    -1,
+                    self.page_size,
+                    layer.tp_k_head_num,
+                    layer.head_dim - layer.v_head_dim,
+                )
+                c_kv_cache = c_kv.view(
+                    -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+                )
+                if q_rope is not None:
+                    q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+                    q_rope = q_rope.view(
+                        -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+                    )
+                else:
+                    q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+                    q_nope = q_all[:, :, : layer.v_head_dim]
+                    q_rope = q_all[:, :, layer.v_head_dim :]
+
+                result = flash_attn_with_kvcache(
+                    q=q_rope,
+                    k_cache=k_rope_cache,
+                    v_cache=c_kv_cache,
+                    qv=q_nope,
+                    page_table=page_table,
+                    cache_seqlens=cache_seqlens,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
+                    max_seqlen_q=max_seqlen_q,
+                    softmax_scale=layer.scaling,
+                    causal=False if use_cascade_attn else causal,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=use_cascade_attn,
+                )
+                if use_cascade_attn:
+                    o, softmax_lse, *rest = result
+                    o_expand, softmax_lse_expand, *rest_expand = (
+                        flash_attn_with_kvcache(
+                            q=q_rope,
+                            k_cache=k_rope_cache,
+                            v_cache=c_kv_cache,
+                            qv=q_nope,
+                            page_table=self.forward_metadata_spec_decode_expand.page_table,
+                            cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                            cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                            cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                            max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                            softmax_scale=layer.scaling,
+                            causal=False,
+                            window_size=window_size,
+                            softcap=layer.logit_cap,
+                            k_descale=k_descale,
+                            v_descale=v_descale,
+                            return_softmax_lse=True,
+                        )
+                    )
+                    o, _ = merge_state_v2_wrapper(
+                        o,
+                        softmax_lse.T.contiguous(),
+                        o_expand,
+                        softmax_lse_expand.T.contiguous(),
+                    )
+                else:
+                    o = result
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        sinks: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+        local_attn_metadata = getattr(metadata, "local_attn_metadata", None)
+        use_local_attn = (
+            self.attention_chunk_size is not None
+            and local_attn_metadata is not None
+            and (hasattr(layer, "use_irope") and layer.use_irope)
+        )
+
+        # When Spec Decode enabled, forward_decode would be called with two mode:
+        # 1. DRAFT_DECODE: we enable cascade attention when top_k > 1
+        # 2. IDLE: we don’t need cascade attention, spec_info will be none in this case
+        use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1
+
+        # Calculate window size (can be moved to metadata if layer properties don't change)
+        # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
+        # here is two side inclusive
+        window_size = (
+            (layer.sliding_window_size, 0)
+            if layer.sliding_window_size is not None and layer.sliding_window_size > -1
+            else (-1, -1)
+        )
+        causal = not layer.is_cross_attention
+
+        # For fa3 interface version compatibility, we put new fields into conditional keyword args
+        kwargs = {}
+        if sinks is not None:
+            kwargs["sinks"] = sinks
+
+        k_descale, v_descale = None, None
+        # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
+        # has corresponding quantization method so that layer.k_scale is not None,
+        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
+        if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
+            if layer.k_scale is not None:
+                descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
+                k_descale = layer.k_scale.expand(descale_shape)
+                v_descale = layer.v_scale.expand(descale_shape)
+            q = q.to(self.kv_cache_dtype)
+            q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
+        if not self.use_mla:
+            # Do multi-head attention
+
+            key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+            )
+
+            if layer.is_cross_attention:
+                # Always use non-chunked logic for cross-attention
+                o = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=metadata.encoder_page_table,
+                    cache_seqlens=metadata.encoder_lens_int32,
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    cu_seqlens_k_new=metadata.encoder_cu_seqlens_k,
+                    max_seqlen_q=1,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=(-1, -1),
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    **kwargs,
+                )
+            elif use_local_attn:
+                # Use chunked (local) attention batching for self-attention
+                o = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=local_attn_metadata.local_block_table,
+                    cache_seqlens=local_attn_metadata.local_seqused_k,
+                    cu_seqlens_q=local_attn_metadata.local_query_start_loc,
+                    cu_seqlens_k_new=None,
+                    max_seqlen_q=local_attn_metadata.local_max_query_len,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                    window_size=(-1, -1),
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    **kwargs,
+                )
+            else:
+                page_table = metadata.page_table
+                cache_seqlens = metadata.cache_seqlens_int32
+                cu_seqlens_k = metadata.cu_seqlens_k
+                max_seqlen_q = metadata.max_seq_len_q
+                q_reshaped = q.contiguous().view(
+                    -1, layer.tp_q_head_num, layer.head_dim
+                )
+
+                # Default: single-token self-attention
+                result = flash_attn_with_kvcache(
+                    q=q_reshaped,
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=page_table,
+                    cache_seqlens=cache_seqlens,
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    cu_seqlens_k_new=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_q,
+                    softmax_scale=layer.scaling,
+                    causal=False if use_cascade_attn else causal,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=use_cascade_attn,
+                    **kwargs,
+                )
+                if use_cascade_attn:
+                    o, softmax_lse, *rest = result
+                    o_expand, softmax_lse_expand, *rest_expand = (
+                        flash_attn_with_kvcache(
+                            q=q_reshaped,
+                            k_cache=key_cache,
+                            v_cache=value_cache,
+                            page_table=self.forward_metadata_spec_decode_expand.page_table,
+                            cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                            cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                            cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                            max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                            softmax_scale=layer.scaling,
+                            causal=False,
+                            window_size=window_size,
+                            softcap=layer.logit_cap,
+                            k_descale=k_descale,
+                            v_descale=v_descale,
+                            return_softmax_lse=True,
+                            **kwargs,
+                        )
+                    )
+                    o, _ = merge_state_v2(
+                        o,
+                        softmax_lse.T.contiguous(),
+                        o_expand,
+                        softmax_lse_expand.T.contiguous(),
+                    )
+                else:
+                    o = result
+        else:
+            # Do absorbed multi-latent attention
+            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+                q.dtype
+            )
+            k_rope = kv_cache[:, :, layer.v_head_dim :]
+            c_kv = kv_cache[:, :, : layer.v_head_dim]
+            k_rope_cache = k_rope.view(
+                -1,
+                self.page_size,
+                layer.tp_k_head_num,
+                layer.head_dim - layer.v_head_dim,
+            )
+            c_kv_cache = c_kv.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+            )
+
+            if q_rope is not None:
+                q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+                q_rope = q_rope.view(
+                    -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+                )
+            else:
+                q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+                q_nope = q_all[:, :, : layer.v_head_dim]
+                q_rope = q_all[:, :, layer.v_head_dim :]
+            max_seqlen_q = metadata.max_seq_len_q
+
+            result = flash_attn_with_kvcache(
+                q=q_rope,
+                k_cache=k_rope_cache,
+                v_cache=c_kv_cache,
+                qv=q_nope,
+                page_table=metadata.page_table,
+                cache_seqlens=metadata.cache_seqlens_int32,
+                cu_seqlens_q=metadata.cu_seqlens_q,
+                cu_seqlens_k_new=metadata.cu_seqlens_k,
+                max_seqlen_q=max_seqlen_q,
+                softmax_scale=layer.scaling,
+                causal=False if use_cascade_attn else causal,
+                softcap=layer.logit_cap,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                return_softmax_lse=use_cascade_attn,  # softmax_lse is needed for merge states
+            )
+            if use_cascade_attn:
+                o, softmax_lse, *rest = result
+                o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+                    q=q_rope,
+                    k_cache=k_rope_cache,
+                    v_cache=c_kv_cache,
+                    qv=q_nope,
+                    page_table=self.forward_metadata_spec_decode_expand.page_table,
+                    cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                    cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                    cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                    max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=True,
+                )
+                o, _ = merge_state_v2(
+                    o,
+                    softmax_lse.T.contiguous(),
+                    o_expand,
+                    softmax_lse_expand.T.contiguous(),
+                )
+            else:
+                o = result
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Initialize CUDA graph state for the attention backend.
+
+        Args:
+            max_bs (int): Maximum batch size to support in CUDA graphs
+
+        This creates fixed-size tensors that will be reused during CUDA graph replay
+        to avoid memory allocations.
+        """
+        max_num_pages = (self.max_context_len + self.page_size - 1) // self.page_size
+
+        # This is being used by normal decode and draft decode when topk == 1
+        self.decode_cuda_graph_metadata = {
+            "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+            "cu_seqlens_q": torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            "cu_seqlens_k": torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            "page_table": torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "strided_indices": torch.arange(
+                0, self.max_context_len, self.page_size, device=self.device
+            ),
+        }
+        # Only allocate local attention buffers if local attention is enabled
+        # This prevents OOM errors when local attention is not being used
+        if self.attention_chunk_size is not None:
+            # Estimate maximum sizes for local attention metadata
+            max_seq_len = self.max_context_len
+            page_size = self.page_size or 1
+            attn_chunk_size = self.attention_chunk_size
+            max_virtual_batches = max_bs * (
+                (max_seq_len + attn_chunk_size - 1) // attn_chunk_size
+            )
+            max_pages_per_block = (attn_chunk_size + page_size - 1) // page_size
+
+            self.decode_cuda_graph_local_attn_metadata = {
+                "local_query_start_loc": torch.zeros(
+                    max_virtual_batches + 1, dtype=torch.int32, device=self.device
+                ),
+                "local_seqused_k": torch.zeros(
+                    max_virtual_batches, dtype=torch.int32, device=self.device
+                ),
+                "local_block_table": torch.zeros(
+                    max_virtual_batches,
+                    max_pages_per_block,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+        # This is used by draft decode's first half of metadata when topk > 1
+        if self.topk > 1:
+            self.draft_decode_metadata_topk_normal = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.topk + 1,
+                    step=self.topk,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    self.max_context_len,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+            # This is used by draft decode's second half of metadata when topk > 1
+            decode_length = self.speculative_step_id + 1
+            self.draft_decode_metadata_topk_expand = {
+                "cache_seqlens": torch.full(
+                    (max_bs * self.topk,),
+                    decode_length,
+                    device=self.device,
+                    dtype=torch.int32,
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.topk + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.arange(
+                    0,
+                    max_bs * self.topk * decode_length + 1,
+                    step=decode_length,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "page_table": torch.zeros(
+                    max_bs * self.topk,
+                    decode_length,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+        if (
+            self.speculative_num_draft_tokens is not None
+            and self.speculative_num_draft_tokens > 0
+        ):
+            # "page_table_draft_decode" will be set only when spec decoding enabled to save memory
+            self.decode_cuda_graph_metadata["page_table_draft_decode"] = torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+            self.target_verify_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+            self.draft_extend_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.zeros(
+                    max_bs + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+        if self.topk > 1:
+            self.target_verify_metadata_topk_normal = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    self.max_context_len,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+            self.target_verify_metadata_topk_expand = {
+                "cache_seqlens": torch.zeros(
+                    max_bs * self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "page_table": torch.zeros(
+                    max_bs * self.speculative_num_draft_tokens,
+                    self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+            if self.has_swa:
+                self.target_verify_metadata_topk_swa = {
+                    "cache_seqlens": torch.zeros(
+                        max_bs * self.speculative_num_draft_tokens,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                    "cu_seqlens_k": torch.zeros(
+                        max_bs * self.speculative_num_draft_tokens + 1,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                    "cu_seqlens_q": torch.arange(
+                        0,
+                        max_bs * self.speculative_num_draft_tokens + 1,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                    "page_table": torch.zeros(
+                        max_bs * self.speculative_num_draft_tokens,
+                        self.max_context_len,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                }
+
+        self.encoder_metadata = {
+            "encoder_page_table": torch.zeros(
+                max_bs,
+                self.max_context_len,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "encoder_lens_int32": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "encoder_cu_seqlens_k": torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        """Initialize forward metadata for capturing CUDA graph."""
+        metadata = FlashAttentionMetadata()
+
+        # metadata_expand is needed for Spec Decoding when top k > 1
+        metadata_expand = FlashAttentionMetadata()
+
+        device = seq_lens.device
+        if forward_mode.is_decode_or_idle():
+            if spec_info is not None:
+                # Draft Decode
+                if self.topk <= 1:
+                    # When topk = 1, we use the normal decode metadata
+                    metadata.cache_seqlens_int32 = self.decode_cuda_graph_metadata[
+                        "cache_seqlens"
+                    ][:bs]
+                    metadata.max_seq_len_k = seq_lens.max().item() + (
+                        self.speculative_step_id + 1
+                    )
+                    metadata.cu_seqlens_q = self.decode_cuda_graph_metadata[
+                        "cu_seqlens_q"
+                    ][: bs + 1]
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = self.decode_cuda_graph_metadata[
+                        "page_table_draft_decode"
+                    ][:bs, :]
+                    self.decode_cuda_graph_metadata[bs] = metadata
+                else:
+                    # When top k > 1, we need two specific draft decode metadata, and then merge states
+                    # 1. The first half of metadata for prefix tokens
+                    metadata.cache_seqlens_int32 = (
+                        self.draft_decode_metadata_topk_normal["cache_seqlens"][:bs]
+                    )
+                    metadata.max_seq_len_q = self.topk
+                    metadata.max_seq_len_k = seq_lens.max().item()
+                    metadata.cu_seqlens_q = self.draft_decode_metadata_topk_normal[
+                        "cu_seqlens_q"
+                    ][: bs + 1]
+                    metadata.cu_seqlens_k = self.draft_decode_metadata_topk_normal[
+                        "cu_seqlens_k"
+                    ][: bs + 1]
+                    metadata.page_table = self.draft_decode_metadata_topk_normal[
+                        "page_table"
+                    ][:bs, :]
+
+                    # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                    metadata_expand.cache_seqlens_int32 = (
+                        self.draft_decode_metadata_topk_expand["cache_seqlens"][
+                            : bs * self.topk
+                        ]
+                    )
+                    metadata_expand.max_seq_len_q = 1
+                    metadata_expand.cu_seqlens_q = (
+                        self.draft_decode_metadata_topk_expand["cu_seqlens_q"][
+                            : bs * self.topk + 1
+                        ]
+                    )
+                    metadata_expand.cu_seqlens_k = (
+                        self.draft_decode_metadata_topk_expand["cu_seqlens_k"][
+                            : bs * self.topk + 1
+                        ]
+                    )
+                    metadata_expand.page_table = self.draft_decode_metadata_topk_expand[
+                        "page_table"
+                    ][: bs * self.topk]
+                    self.draft_decode_metadata_topk_normal[bs] = metadata
+                    self.draft_decode_metadata_topk_expand[bs] = metadata_expand
+            else:
+                # Normal Decode
+                # Get sequence information
+                metadata.cache_seqlens_int32 = seq_lens.to(torch.int32)
+                batch_size = len(seq_lens)
+                device = seq_lens.device
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+                # Precompute maximum sequence length
+                metadata.max_seq_len_k = seq_lens.max().item()
+                # Precompute page table
+                metadata.page_table = self.decode_cuda_graph_metadata["page_table"][
+                    :bs, :
+                ]
+                # Precompute cumulative sequence lengths
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                self.decode_cuda_graph_metadata[bs] = metadata
+
+                if self.attention_chunk_size is not None:
+                    self._update_local_attn_metadata_for_capture(metadata, batch_size)
+
+        elif forward_mode.is_target_verify():
+            if self.topk <= 1:
+                metadata.cache_seqlens_int32 = self.target_verify_metadata[
+                    "cache_seqlens"
+                ][:bs]
+                metadata.cache_seqlens_int32.copy_(
+                    (seq_lens + self.speculative_num_draft_tokens)
+                )
+
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = (
+                    seq_lens.max().item() + self.speculative_num_draft_tokens
+                )
+
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    bs * self.speculative_num_draft_tokens + 1,
+                    self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+
+                metadata.cu_seqlens_k = self.target_verify_metadata["cu_seqlens_k"][
+                    : (bs + 1)
+                ]
+
+                metadata.page_table = self.target_verify_metadata["page_table"][:bs, :]
+
+                self.target_verify_metadata[bs] = metadata
+            else:
+                # When topk > 1, we need two specific target verify metadata, and then merge states
+                # 1. The first half of metadata for prefix tokens
+                metadata.cache_seqlens_int32 = self.target_verify_metadata_topk_normal[
+                    "cache_seqlens"
+                ][:bs]
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                # metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item(), do this in replay
+                metadata.cu_seqlens_q = self.target_verify_metadata_topk_normal[
+                    "cu_seqlens_q"
+                ][: bs + 1]
+                metadata.cu_seqlens_k = self.target_verify_metadata_topk_normal[
+                    "cu_seqlens_k"
+                ][: bs + 1]
+                metadata.page_table = self.target_verify_metadata_topk_normal[
+                    "page_table"
+                ][:bs, :]
+
+                # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                metadata_expand.cache_seqlens_int32 = (
+                    self.target_verify_metadata_topk_expand["cache_seqlens"][
+                        : bs * self.speculative_num_draft_tokens
+                    ]
+                )
+                metadata_expand.max_seq_len_q = 1
+                metadata_expand.cu_seqlens_q = self.target_verify_metadata_topk_expand[
+                    "cu_seqlens_q"
+                ][: bs * self.speculative_num_draft_tokens + 1]
+                metadata_expand.cu_seqlens_k = self.target_verify_metadata_topk_expand[
+                    "cu_seqlens_k"
+                ][: bs * self.speculative_num_draft_tokens + 1]
+
+                metadata_expand.page_table = self.target_verify_metadata_topk_expand[
+                    "page_table"
+                ][: bs * self.speculative_num_draft_tokens]
+
+                self.target_verify_metadata_topk_normal[bs] = metadata
+                self.target_verify_metadata_topk_expand[bs] = metadata_expand
+
+                if self.has_swa:
+                    metadata_swa = FlashAttentionMetadata()
+                    metadata_swa.cache_seqlens_int32 = (
+                        self.target_verify_metadata_topk_swa["cache_seqlens"][
+                            : bs * self.speculative_num_draft_tokens
+                        ]
+                    )
+                    metadata_swa.max_seq_len_q = 1
+                    metadata_swa.cu_seqlens_q = self.target_verify_metadata_topk_swa[
+                        "cu_seqlens_q"
+                    ][: bs * self.speculative_num_draft_tokens + 1]
+                    metadata_swa.cu_seqlens_k = self.target_verify_metadata_topk_swa[
+                        "cu_seqlens_k"
+                    ][: bs * self.speculative_num_draft_tokens + 1]
+
+                    metadata_swa.page_table = self.target_verify_metadata_topk_swa[
+                        "page_table"
+                    ][: bs * self.speculative_num_draft_tokens]
+                    self.target_verify_metadata_topk_swa[bs] = metadata_swa
+                    metadata.swa_spec_metadata = metadata_swa
+
+        elif forward_mode.is_draft_extend():
+            metadata.cache_seqlens_int32 = self.draft_extend_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            num_tokens_per_bs = num_tokens // bs
+            metadata.max_seq_len_q = num_tokens_per_bs
+            metadata.max_seq_len_k = seq_lens.max().item()
+
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                num_tokens_per_bs,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.draft_extend_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+            metadata.page_table = self.draft_extend_metadata["page_table"][:bs, :]
+
+            self.draft_extend_metadata[bs] = metadata
+
+        if encoder_lens is not None:
+            encoder_bs = encoder_lens.numel()
+            metadata.encoder_lens_int32 = self.encoder_metadata["encoder_lens_int32"][
+                :encoder_bs
+            ]
+            metadata.encoder_cu_seqlens_k = self.encoder_metadata[
+                "encoder_cu_seqlens_k"
+            ][: (encoder_bs + 1)]
+
+            metadata.encoder_page_table = self.encoder_metadata["encoder_page_table"][
+                :bs, :
+            ]
+
+        self.forward_metadata = metadata
+        self.forward_metadata_spec_decode_expand = metadata_expand
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+        out_cache_loc: Optional[torch.Tensor] = None,
+    ):
+        """Initialize forward metadata for replaying CUDA graph."""
+        seq_lens = seq_lens[:bs]
+        seq_lens_cpu = seq_lens_cpu[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+        device = seq_lens.device
+        metadata = None
+        metadata_expand = None
+
+        if forward_mode.is_decode_or_idle():
+
+            if spec_info is not None:
+                # Draft Decode
+                if self.topk <= 1:
+                    # When topk = 1, we use the normal decode metadata
+                    metadata = self.decode_cuda_graph_metadata[bs]
+                    max_len = seq_lens_cpu.max().item()
+                    metadata.max_seq_len_k = max_len + self.speculative_step_id + 1
+                    max_seq_pages = (
+                        metadata.max_seq_len_k + self.page_size - 1
+                    ) // self.page_size
+
+                    normal_decode_set_metadata(
+                        metadata.cache_seqlens_int32,
+                        metadata.cu_seqlens_k,
+                        metadata.page_table,
+                        self.req_to_token,
+                        req_pool_indices,
+                        self.decode_cuda_graph_metadata["strided_indices"],
+                        max_seq_pages,
+                        seq_lens,
+                        self.speculative_step_id + 1,
+                        self.page_size,
+                    )
+
+                else:
+                    # When top k > 1, we need two specific draft decode metadata, and then merge states
+                    # 1. The first half of metadata for prefix tokens
+                    metadata = self.draft_decode_metadata_topk_normal[bs]
+                    metadata.cache_seqlens_int32.copy_(seq_lens)
+                    # metadata.max_seq_len_q = self.topk, already set in capture
+                    metadata.max_seq_len_k = seq_lens_cpu.max().item()
+                    # metadata.cu_seqlens_q already set in capture
+                    metadata.cu_seqlens_k[1:].copy_(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        )
+                    )
+
+                    page_table = self.req_to_token[
+                        req_pool_indices, : metadata.max_seq_len_k
+                    ]
+
+                    metadata.page_table[:, : metadata.max_seq_len_k].copy_(page_table)
+
+                    # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                    metadata_expand = self.draft_decode_metadata_topk_expand[bs]
+                    decode_length = self.speculative_step_id + 1
+                    # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                    cache_loc = out_cache_loc.view(-1, self.speculative_num_steps)
+                    metadata_expand.page_table[: cache_loc.shape[0]].copy_(
+                        cache_loc[:, :decode_length]
+                    )
+                # TODO: Handle local attention metadata for draft decode when llama4 eagle is supported
+            else:
+                # Normal Decode
+                metadata = self.decode_cuda_graph_metadata[bs]
+                max_len = seq_lens_cpu.max().item()
+                max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+                metadata.max_seq_len_k = max_len
+
+                normal_decode_set_metadata(
+                    metadata.cache_seqlens_int32,
+                    metadata.cu_seqlens_k,
+                    metadata.page_table,
+                    self.req_to_token,
+                    req_pool_indices,
+                    self.decode_cuda_graph_metadata["strided_indices"],
+                    max_seq_pages,
+                    seq_lens,
+                    0,
+                    self.page_size,
+                )
+
+                self._update_local_attn_metadata_for_replay(
+                    metadata,
+                    bs,
+                )
+        elif forward_mode.is_target_verify():
+            if self.topk <= 1:
+                metadata = self.target_verify_metadata[bs]
+                metadata.cache_seqlens_int32.copy_(
+                    (seq_lens + self.speculative_num_draft_tokens)
+                )
+
+                metadata.max_seq_len_k = (
+                    seq_lens_cpu.max().item() + self.speculative_num_draft_tokens
+                )
+                metadata.cu_seqlens_k[1:].copy_(
+                    torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+                )
+                max_seq_pages = (
+                    metadata.max_seq_len_k + self.page_size - 1
+                ) // self.page_size
+                page_indices = self.req_to_token[
+                    req_pool_indices[:, None],
+                    self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages],
+                ]
+                page_indices //= self.page_size
+                metadata.page_table[:, :max_seq_pages].copy_(page_indices)
+            else:
+                # When topk > 1, we need two specific target verify metadata, and then merge states
+                # 1. The first half of metadata for prefix tokens
+                metadata = self.target_verify_metadata_topk_normal[bs]
+                metadata.cache_seqlens_int32.copy_(seq_lens)
+                # metadata.max_seq_len_q = self.speculative_num_draft_tokens, already set in capture
+                metadata.max_seq_len_k = seq_lens_cpu.max().item()
+                # metadata.cu_seqlens_q already set in capture
+                metadata.cu_seqlens_k[1:].copy_(
+                    torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+                )
+                page_table = self.req_to_token[
+                    req_pool_indices, : metadata.max_seq_len_k
+                ]
+                metadata.page_table[:, : metadata.max_seq_len_k].copy_(page_table)
+
+                # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                metadata_expand = self.target_verify_metadata_topk_expand[bs]
+
+                # metadata_expand.max_seq_len_q = 1, already set in capture
+                # metadata_expand.cu_seqlens_q already set in capture
+                offsets = torch.arange(
+                    self.speculative_num_draft_tokens, device=device
+                ).unsqueeze(
+                    0
+                )  # shape: (1, self.speculative_num_draft_tokens)
+
+                cols = offsets.expand(seq_lens.numel(), -1) + seq_lens.unsqueeze(1)
+                cum_len = torch.nn.functional.pad(
+                    torch.cumsum(
+                        (
+                            seq_lens + self.speculative_num_draft_tokens
+                        ).repeat_interleave(self.speculative_num_draft_tokens),
+                        dim=0,
+                    ),
+                    (1, 0),
+                )[:-1]
+                mask_extraction_indices = (
+                    cols.repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                    + cum_len[:, None]
+                ).view(1, -1)
+                # avoid extracting padded seq indices which will be out of boundary
+                mask_extraction_indices[
+                    :,
+                    spec_info.positions.numel() * self.speculative_num_draft_tokens :,
+                ].fill_(0)
+                mask = spec_info.custom_mask[mask_extraction_indices].view(
+                    -1, self.speculative_num_draft_tokens
+                )  # (bsz * draft_num, draft_num)
+
+                col_indices = offsets.expand(
+                    mask.shape[0], self.speculative_num_draft_tokens
+                )
+                keys = torch.where(
+                    mask,
+                    col_indices,
+                    col_indices + self.speculative_num_draft_tokens,
+                )
+                _, sort_order = torch.sort(keys, dim=1)
+
+                non_masked_page_table = (
+                    self.req_to_token[req_pool_indices, :]
+                    .gather(1, cols)
+                    .repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                )  # (bsz, draft_num)
+
+                metadata_expand.page_table.copy_(
+                    non_masked_page_table.gather(1, sort_order)
+                )
+                metadata_expand.cache_seqlens_int32.copy_(mask.sum(dim=1))
+                metadata_expand.cu_seqlens_k[1:].copy_(
+                    torch.cumsum(
+                        metadata_expand.cache_seqlens_int32,
+                        dim=0,
+                        dtype=torch.int32,
+                    )
+                )
+
+                if self.has_swa:
+                    metadata_swa = self.target_verify_metadata_topk_swa[bs]
+                    self._init_sliding_window_attn_spec_metadata(
+                        metadata, metadata_expand, metadata_swa
+                    )
+
+        elif forward_mode.is_draft_extend():
+            metadata = self.draft_extend_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.max_seq_len_k = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            accept_length = spec_info.accept_length[:bs]
+            if spec_info.accept_length_cpu:
+                metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
+            else:
+                metadata.max_seq_len_q = 1
+
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(accept_length, dim=0, dtype=torch.int32)
+            )
+
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.draft_extend_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+
+        if encoder_lens is not None:
+            # Only support encoder size 1 for now
+            metadata.encoder_max_seq_len_k = encoder_lens[0]
+            metadata.encoder_lens_int32.copy_(encoder_lens[:1])
+            metadata.encoder_cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.encoder_lens_int32, dim=0, dtype=torch.int32)
+            )
+
+            metadata.encoder_page_table[:, : metadata.encoder_max_seq_len_k].copy_(
+                self.req_to_token[req_pool_indices, : metadata.encoder_max_seq_len_k]
+            )
+
+            # Update the regular page table
+            page_table = self.req_to_token[
+                req_pool_indices,
+                metadata.encoder_max_seq_len_k : (
+                    metadata.encoder_max_seq_len_k + metadata.max_seq_len_k
+                ),
+            ]
+            metadata.page_table[:, : metadata.max_seq_len_k].copy_(page_table)
+
+        self.forward_metadata = metadata
+        self.forward_metadata_spec_decode_expand = metadata_expand
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for sequence length in CUDA graph."""
+        return 1
+
+    def _init_local_attn_metadata(
+        self, forwardbatch: ForwardBatch, metadata: FlashAttentionMetadata, device
+    ):
+        """Centralized utility to initialize local_attn_metadata if chunked attention is enabled."""
+        if self.attention_chunk_size is None:
+            metadata.local_attn_metadata = None
+            return
+
+        cu_seqlens_q = metadata.cu_seqlens_q
+        cache_seqlens_int32 = metadata.cache_seqlens_int32
+        if self.is_hybrid:
+            page_table = self.full_to_swa_index_mapping[metadata.page_table].to(
+                torch.int32
+            )
+        else:
+            page_table = metadata.page_table
+        if cu_seqlens_q is None or cache_seqlens_int32 is None or page_table is None:
+            metadata.local_attn_metadata = None
+            return
+
+        cu_seqlens_q_np = cu_seqlens_q.cpu().numpy()
+        seq_lens_np = cache_seqlens_int32.cpu().numpy()
+        (
+            seqlens_q_local_np,
+            cu_seqlens_q_local_np,
+            seqlens_k_local_np,
+            block_table_local,
+        ) = make_local_attention_virtual_batches(
+            self.attention_chunk_size,
+            cu_seqlens_q_np,
+            seq_lens_np,
+            page_table,
+            self.page_size,
+        )
+
+        local_metadata = FlashAttentionMetadata.LocalAttentionMetadata(
+            local_query_start_loc=torch.from_numpy(cu_seqlens_q_local_np).to(device),
+            local_seqused_k=torch.from_numpy(seqlens_k_local_np).to(device),
+            local_block_table=block_table_local.to(device),
+            local_max_query_len=int(seqlens_q_local_np.max()),
+            local_max_seq_len=int(seqlens_k_local_np.max()),
+        )
+        metadata.local_attn_metadata = local_metadata
+
+    def _update_local_attn_metadata_for_capture(
+        self, metadata: FlashAttentionMetadata, bs: int
+    ):
+        """Update local attention metadata during CUDA graph capture phase.
+
+        This method calculates the exact buffer sizes needed for local attention metadata
+        during the CUDA graph capture phase, optimizing memory usage by creating views of
+        pre-allocated buffers with exactly the sizes needed.
+        """
+        seq_lens_capture = metadata.cache_seqlens_int32
+        max_seq_len = int(seq_lens_capture.max().item())
+        page_table_capture = metadata.page_table
+
+        cu_seqlens_q_np = metadata.cu_seqlens_q.cpu().numpy()
+        seqlens_np = seq_lens_capture.cpu().numpy()
+        (
+            seqlens_q_local_np,
+            cu_seqlens_q_local_np,
+            seqlens_k_local_np,
+            block_table_local_np,
+        ) = make_local_attention_virtual_batches(
+            self.attention_chunk_size,
+            cu_seqlens_q_np,
+            seqlens_np,
+            page_table_capture,
+            self.page_size,
+        )
+
+        # Get exact dimensions from the calculation
+        q_len = len(cu_seqlens_q_local_np)
+        k_len = len(seqlens_k_local_np)
+        b0 = block_table_local_np.shape[0] if block_table_local_np.shape[0] > 0 else bs
+        b1 = block_table_local_np.shape[1] if block_table_local_np.shape[1] > 0 else 1
+
+        # Create views of the pre-allocated buffers with exactly these sizes
+        # This is the key optimization - we only use the memory we actually need
+        local_query_start_loc = self.decode_cuda_graph_local_attn_metadata[
+            "local_query_start_loc"
+        ][:q_len]
+
+        local_seqused_k = self.decode_cuda_graph_local_attn_metadata["local_seqused_k"][
+            :k_len
+        ]
+
+        local_block_table = self.decode_cuda_graph_local_attn_metadata[
+            "local_block_table"
+        ][:b0, :b1]
+
+        metadata.local_attn_metadata = FlashAttentionMetadata.LocalAttentionMetadata(
+            local_query_start_loc=local_query_start_loc,
+            local_seqused_k=local_seqused_k,
+            local_block_table=local_block_table,
+            local_max_query_len=1,
+            local_max_seq_len=max_seq_len,
+        )
+
+    def _update_local_attn_metadata_for_replay(
+        self,
+        metadata: FlashAttentionMetadata,
+        bs: int,
+    ):
+        """Update preallocated local attention metadata in-place before CUDA graph replay."""
+        if self.attention_chunk_size is None:
+            return
+
+        # Access preallocated buffers
+        local_q_buf = self.decode_cuda_graph_local_attn_metadata[
+            "local_query_start_loc"
+        ]
+        local_k_buf = self.decode_cuda_graph_local_attn_metadata["local_seqused_k"]
+        local_block_buf = self.decode_cuda_graph_local_attn_metadata[
+            "local_block_table"
+        ]
+        cu_seqlens_q = self.decode_cuda_graph_metadata["cu_seqlens_q"]
+
+        # Create a modified version for local attention that only processes the last token
+        # This mimics the normal decode pattern
+        cu_seqlens_q = torch.arange(
+            bs + 1, device=cu_seqlens_q.device, dtype=cu_seqlens_q.dtype
+        )
+        seqlens = metadata.cache_seqlens_int32[:bs]
+        # Slice the page_table to match the batch size and actual sequence length
+        # This serves three important purposes:
+        # 1. Ensures we only process the actual batch size (bs) and not the maximum batch size
+        # 2. Limits the sequence length to prevent processing padding tokens or garbage values
+        # 3. Prevents zeros in the block table which can cause garbage output during replay
+        #
+        # Without this slicing, the pre-allocated page_table may contain zeros or invalid indices
+        # beyond the actual sequence length, leading to incorrect attention calculations
+        max_seq_len = int(seqlens.max().item())
+        if self.is_hybrid:
+            sliced_page_table = self.full_to_swa_index_mapping[
+                metadata.page_table[:bs, :max_seq_len]
+            ].to(torch.int32)
+        else:
+            sliced_page_table = metadata.page_table[:bs, :max_seq_len]
+
+        cu_seqlens_q_np = cu_seqlens_q.cpu().numpy()
+        seqlens_np = seqlens.cpu().numpy()
+        (
+            seqlens_q_local_np,
+            cu_seqlens_q_local_np,
+            seqlens_k_local_np,
+            block_table_local,
+        ) = make_local_attention_virtual_batches(
+            self.attention_chunk_size,
+            cu_seqlens_q_np,
+            seqlens_np,
+            sliced_page_table,
+            self.page_size,
+        )
+
+        # Convert back to tensors
+        device = local_q_buf.device
+        cu_seqlens_q_local = torch.from_numpy(cu_seqlens_q_local_np).to(device)
+        seqlens_k_local = torch.from_numpy(seqlens_k_local_np).to(device)
+        block_table_local = block_table_local.to(device)
+        # Get sizes
+        q_len = cu_seqlens_q_local.shape[0]
+        k_len = seqlens_k_local.shape[0]
+        b0, b1 = block_table_local.shape
+
+        # In-place updates into preallocated tensors and zero out the unused space
+        local_q_buf[:q_len].copy_(cu_seqlens_q_local)
+        local_q_buf[q_len:].fill_(0)
+        local_k_buf[:k_len].copy_(seqlens_k_local)
+        local_k_buf[k_len:].fill_(0)
+        local_block_buf[:b0, :b1].copy_(block_table_local)
+        local_block_buf[b0:, :].fill_(0)
+        local_block_buf[:b0, b1:].fill_(0)
+
+        if metadata.local_attn_metadata is not None:
+            lam = metadata.local_attn_metadata
+            lam.local_max_query_len = int(seqlens_q_local_np.max())
+            lam.local_max_seq_len = int(seqlens_k_local_np.max())
+
+    def _init_sliding_window_attn_spec_metadata(
+        self,
+        metadata: FlashAttentionMetadata,
+        metadata_expand: FlashAttentionMetadata,
+        metadata_swa: Optional[FlashAttentionMetadata] = None,
+    ):
+        # TODO: support page_size > 1 for swa spec
+        assert (
+            self.page_size == 1
+        ), "FlashAttention backend doesn't support topk > 1 speculative decoding with page size > 1 sliding window attention"
+
+        cache_seqlens_int32 = (
+            metadata.cache_seqlens_int32.repeat_interleave(
+                self.speculative_num_draft_tokens
+            )
+            + metadata_expand.cache_seqlens_int32
+        )
+        cu_seqlens_k = torch.nn.functional.pad(
+            torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32), (1, 0)
+        )
+        bs = cache_seqlens_int32.shape[0]
+        page_table = (
+            metadata.page_table.new_zeros(
+                (bs, metadata.max_seq_len_k + metadata_expand.page_table.shape[1])
+            )
+            if metadata_swa is None
+            else metadata_swa.page_table
+        )
+
+        prepare_swa_spec_page_table_triton(
+            page_table,
+            metadata.page_table,
+            metadata_expand.page_table,
+            metadata.cache_seqlens_int32,
+            metadata_expand.cache_seqlens_int32,
+            self.speculative_num_draft_tokens,
+        )
+
+        if metadata_swa is None:
+            metadata_swa = FlashAttentionMetadata()
+            metadata_swa.max_seq_len_q = 1
+            metadata_swa.cu_seqlens_q = metadata_expand.cu_seqlens_q
+            metadata_swa.cache_seqlens_int32 = cache_seqlens_int32
+            metadata_swa.cu_seqlens_k = cu_seqlens_k
+            metadata_swa.page_table = page_table
+        else:
+            metadata_swa.cache_seqlens_int32.copy_(cache_seqlens_int32)
+            metadata_swa.cu_seqlens_k.copy_(cu_seqlens_k)
+
+        metadata.swa_spec_metadata = metadata_swa
+
+
+@triton.jit
+def _prepare_swa_spec_page_table_kernel(
+    dst_ptr,
+    src_a_ptr,
+    src_b_ptr,
+    seq_len_a_ptr,
+    seq_len_b_ptr,
+    dst_stride_m,
+    dst_stride_n,
+    a_stride_m,
+    a_stride_n,
+    b_stride_m,
+    b_stride_n,
+    LEN_A: tl.constexpr,
+    LEN_B: tl.constexpr,
+    REPEAT_STEP: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    idx_a = pid_m // REPEAT_STEP
+    idx_b = pid_m
+    seq_len_a = tl.load(seq_len_a_ptr + idx_a)
+    seq_len_b = tl.load(seq_len_b_ptr + idx_b)
+
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    total_len = seq_len_a + seq_len_b
+
+    if pid_n * BLOCK_N >= total_len:
+        return
+
+    mask = offs_n < total_len
+    dst = dst_ptr + pid_m * dst_stride_m + offs_n * dst_stride_n
+
+    if (pid_n + 1) * BLOCK_N < seq_len_a:
+        a_ptr = src_a_ptr + idx_a * a_stride_m + offs_n * a_stride_n
+        a_mask = mask & (offs_n < LEN_A)
+        val = tl.load(a_ptr, mask=a_mask, other=0)
+        tl.store(dst, val, mask=mask)
+    elif pid_n * BLOCK_N >= seq_len_a:
+        offs_b = offs_n - seq_len_a
+        b_ptr = src_b_ptr + idx_b * b_stride_m + offs_b * b_stride_n
+        b_mask = mask & (offs_b < LEN_B)
+        val = tl.load(b_ptr, mask=b_mask, other=0)
+        tl.store(dst, val, mask=mask)
+    else:
+        # mixed part
+        a_offs = offs_n
+        a_mask = (a_offs < seq_len_a) & (a_offs < LEN_A)
+        a_ptr = src_a_ptr + idx_a * a_stride_m + a_offs * a_stride_n
+        a_val = tl.load(a_ptr, mask=a_mask, other=0)
+
+        b_offs = offs_n - seq_len_a
+        b_mask = (b_offs >= 0) & (b_offs < seq_len_b) & (b_offs < LEN_B)
+        b_ptr = src_b_ptr + idx_b * b_stride_m + b_offs * b_stride_n
+        b_val = tl.load(b_ptr, mask=b_mask, other=0)
+
+        result = tl.where(offs_n < seq_len_a, a_val, b_val)
+        tl.store(dst, result, mask=mask)
+
+
+def prepare_swa_spec_page_table_triton(
+    page_table_dst: torch.Tensor,
+    page_table_a: torch.Tensor,
+    page_table_b: torch.Tensor,  # expand page table
+    seq_len_a: torch.Tensor,
+    seq_len_b: torch.Tensor,  # expand seq lens
+    speculative_num_draft_tokens: int,
+):
+    # concat page_table and expand page_table by kv seq length
+    bs = seq_len_a.numel()
+    bs_expand = seq_len_b.numel()
+    assert bs_expand == bs * speculative_num_draft_tokens
+
+    LEN_A = page_table_a.shape[1]
+    LEN_B = page_table_b.shape[1]
+    LEN_OUT = LEN_A + LEN_B
+    REPEAT_STEP = speculative_num_draft_tokens
+    BLOCK_N = 256
+
+    grid = (bs_expand, triton.cdiv(LEN_OUT, BLOCK_N))
+    _prepare_swa_spec_page_table_kernel[grid](
+        page_table_dst,
+        page_table_a,
+        page_table_b,
+        seq_len_a,
+        seq_len_b,
+        page_table_dst.stride(0),
+        page_table_dst.stride(1),
+        page_table_a.stride(0),
+        page_table_a.stride(1),
+        page_table_b.stride(0),
+        page_table_b.stride(1),
+        LEN_A=LEN_A,
+        LEN_B=LEN_B,
+        REPEAT_STEP=REPEAT_STEP,
+        BLOCK_N=BLOCK_N,
+        num_warps=4,
+    )
+
+
+class FlashAttentionMultiStepBackend:
+
+    def __init__(
+        self, model_runner: ModelRunner, topk: int, speculative_num_steps: int
+    ):
+        self.model_runner = model_runner
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps):
+            self.attn_backends.append(
+                FlashAttentionBackend(
+                    model_runner,
+                    speculative_step_id=i,
+                    topk=self.topk,
+                    speculative_num_steps=self.speculative_num_steps,
+                )
+            )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i].init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        forward_batch: ForwardBatch,
+    ):
+        assert forward_batch.spec_info is not None
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        assert forward_batch.spec_info is not None
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+
+        for i in range(self.speculative_num_steps - 1):
+            # TODO: incrementally update the metadata for the later steps,
+            # so that they do not need to recompute everything from scratch.
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+                out_cache_loc=forward_batch.out_cache_loc,
+            )
+
+
+# @torch.compile(dynamic=True, backend=get_compiler_backend())
+# TODO: fuse these kernels
+# NOTE: torch.compile makes it slower in speculative decoding
+def normal_decode_set_metadata(
+    cache_seqlens_int32: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    page_table: torch.Tensor,
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    strided_indices: torch.Tensor,
+    max_seq_pages: torch.Tensor,
+    seq_lens: torch.Tensor,
+    seq_len_delta: int,
+    page_size: int,
+):
+    cache_seqlens_int32.copy_(seq_lens + seq_len_delta)
+    cu_seqlens_k[1:].copy_(torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32))
+    page_indices = req_to_token[
+        req_pool_indices[:, None],
+        strided_indices[:max_seq_pages][None, :],
+    ]
+    page_table[:, :max_seq_pages].copy_(page_indices // page_size)
diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
new file mode 100644
index 000000000..a5b207c77
--- /dev/null
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -0,0 +1,1441 @@
+from __future__ import annotations
+
+"""
+Support different attention backends.
+Now there are two backends: FlashInfer and Triton.
+FlashInfer is faster and Triton is easier to customize.
+Each backend supports two operators: extend (i.e. prefill with cached prefix) and decode.
+"""
+
+import os
+from dataclasses import dataclass
+from enum import Enum, auto
+from functools import partial
+from typing import TYPE_CHECKING, Callable, List, Optional, Union
+
+import torch
+
+if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
+    import logging
+
+    torch._logging.set_logs(dynamo=logging.ERROR)
+    torch._dynamo.config.suppress_errors = True
+
+from sglang.global_config import global_config
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+from sglang.srt.utils import (
+    is_flashinfer_available,
+    is_sm100_supported,
+    next_power_of_2,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+if is_flashinfer_available():
+    from flashinfer import (
+        BatchDecodeWithPagedKVCacheWrapper,
+        BatchPrefillWithPagedKVCacheWrapper,
+        BatchPrefillWithRaggedKVCacheWrapper,
+    )
+    from flashinfer.cascade import merge_state
+    from flashinfer.decode import _get_range_buf, get_seq_lens
+
+
+class WrapperDispatch(Enum):
+    SLIDING_WINDOW = auto()
+    CROSS_ATTENTION = auto()
+
+
+@dataclass
+class DecodeMetadata:
+    decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper]
+
+
+@dataclass
+class PrefillMetadata:
+    prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper]
+    use_ragged: bool
+    extend_no_prefix: bool
+
+
+# Reuse this workspace buffer across all flashinfer wrappers
+global_workspace_buffer = None
+
+# Use as a fast path to override the indptr in flashinfer's plan function
+# This is used to remove some host-to-device copy overhead.
+global_override_indptr_cpu = None
+
+
+class FlashInferAttnBackend(AttentionBackend):
+    """Flashinfer attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__()
+
+        # Parse constants
+        self.decode_use_tensor_cores = should_use_tensor_core(
+            kv_cache_dtype=model_runner.kv_cache_dtype,
+            num_attention_heads=model_runner.model_config.num_attention_heads
+            // get_attention_tp_size(),
+            num_kv_heads=model_runner.model_config.get_num_kv_heads(
+                get_attention_tp_size()
+            ),
+        )
+        self.max_context_len = model_runner.model_config.context_len
+        self.skip_prefill = skip_prefill
+        self.is_multimodal = model_runner.model_config.is_multimodal
+
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        if model_runner.sliding_window_size is not None:
+            self.num_wrappers = 2
+            self.dispatch_reason = WrapperDispatch.SLIDING_WINDOW
+        elif model_runner.model_config.is_encoder_decoder:
+            self.num_wrappers = 2
+            self.dispatch_reason = WrapperDispatch.CROSS_ATTENTION
+        else:
+            self.num_wrappers = 1
+            self.dispatch_reason = None
+
+        # Qwen2/Qwen3 models require higher flashinfer workspace size
+        if (
+            "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures
+            or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures
+            or "MiMoForCausalLM" in model_runner.model_config.hf_config.architectures
+        ):
+            global_config.flashinfer_workspace_size = 512 * 1024 * 1024
+
+        # Allocate buffers
+        global global_workspace_buffer
+        if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
+            global_workspace_buffer = torch.empty(
+                global_config.flashinfer_workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_workspace_buffer
+        max_bs = model_runner.req_to_token_pool.size
+        if kv_indptr_buf is None:
+            self.kv_indptr = [
+                torch.zeros(
+                    (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                )
+                for _ in range(self.num_wrappers)
+            ]
+        else:
+            assert self.num_wrappers == 1
+            self.kv_indptr = [kv_indptr_buf]
+
+        if kv_last_page_len_buf is None:
+            self.kv_last_page_len = torch.ones(
+                (max_bs,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            assert self.num_wrappers == 1
+            self.kv_last_page_len = kv_last_page_len_buf
+
+        if not self.skip_prefill:
+            self.qo_indptr = [
+                torch.zeros(
+                    (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                )
+                for _ in range(self.num_wrappers)
+            ]
+
+        fmha_backend = "auto"
+        if is_sm100_supported():
+            fmha_backend = "cutlass"
+        self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
+            self.workspace_buffer, "NHD", backend=fmha_backend
+        )
+
+        # Two wrappers: one for sliding window attention and one for full attention.
+        # Using two wrappers is unnecessary in the current PR, but are prepared for future PRs
+        self.prefill_wrappers_paged = []
+        self.prefill_wrappers_verify = []
+        self.decode_wrappers = []
+        for _ in range(self.num_wrappers):
+            if not skip_prefill:
+                self.prefill_wrappers_paged.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend="fa2",
+                    )
+                )
+                self.prefill_wrappers_verify.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                    )
+                )
+            self.decode_wrappers.append(
+                BatchDecodeWithPagedKVCacheWrapper(
+                    self.workspace_buffer,
+                    "NHD",
+                    use_tensor_cores=self.decode_use_tensor_cores,
+                )
+            )
+
+        # Create indices updater
+        if not skip_prefill:
+            self.indices_updater_prefill = FlashInferIndicesUpdaterPrefill(
+                model_runner, self
+            )  # for verify
+        self.indices_updater_decode = FlashInferIndicesUpdaterDecode(model_runner, self)
+
+        # Other metadata
+        self.forward_metadata: Union[PrefillMetadata, DecodeMetadata] = None
+        self.decode_cuda_graph_metadata = {}
+        self.prefill_cuda_graph_metadata = {}  # For verify
+        self.draft_extend_cuda_graph_metadata = {}  # For draft extend
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode_or_idle():
+            self.indices_updater_decode.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                decode_wrappers=self.decode_wrappers,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = DecodeMetadata(self.decode_wrappers)
+        elif forward_batch.forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_wrappers_paged,
+                use_ragged=False,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrappers_paged, False, False
+            )
+        elif forward_batch.forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_wrappers_verify,
+                use_ragged=False,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrappers_verify, False, False
+            )
+        else:
+            prefix_lens = forward_batch.extend_prefix_lens
+
+            if self.is_multimodal:
+                use_ragged = False
+                extend_no_prefix = False
+            else:
+                use_ragged = True
+                extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                prefix_lens,
+                prefill_wrappers=self.prefill_wrappers_paged,
+                use_ragged=use_ragged,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=None,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrappers_paged, use_ragged, extend_no_prefix
+            )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        if kv_indices_buf is None:
+            cuda_graph_kv_indices = torch.zeros(
+                (max_num_tokens * self.max_context_len,),
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = kv_indices_buf
+
+        self.cuda_graph_kv_indices = [cuda_graph_kv_indices] + [
+            cuda_graph_kv_indices.clone() for _ in range(self.num_wrappers - 1)
+        ]
+
+        # Ensure tensors are properly allocated
+        for i in range(self.num_wrappers):
+            # Force allocation by performing a small operation
+            if len(self.cuda_graph_kv_indices[i]) > 0:
+                self.cuda_graph_kv_indices[i][0] = 0
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.uint8,
+                device="cuda",
+            )
+            self.cuda_graph_qk_indptr = [x.clone() for x in self.kv_indptr]
+            self.cuda_graph_qo_indptr = [x.clone() for x in self.kv_indptr]
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        if forward_mode.is_decode_or_idle():
+            decode_wrappers = []
+            for i in range(self.num_wrappers):
+                decode_wrappers.append(
+                    BatchDecodeWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        use_cuda_graph=True,
+                        use_tensor_cores=self.decode_use_tensor_cores,
+                        paged_kv_indptr_buffer=self.kv_indptr[i][: num_tokens + 1],
+                        paged_kv_indices_buffer=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buffer=self.kv_last_page_len[
+                            :num_tokens
+                        ],
+                    )
+                )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_decode.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens.cpu(),  # may add a little overhead in capture stage
+                seq_lens_sum,
+                decode_wrappers=decode_wrappers,
+                encoder_lens=encoder_lens,
+                spec_info=spec_info,
+            )
+            self.decode_cuda_graph_metadata[bs] = decode_wrappers
+            self.forward_metadata = DecodeMetadata(decode_wrappers)
+            for i in range(self.num_wrappers):
+                decode_wrappers[i].begin_forward = partial(
+                    fast_decode_plan, decode_wrappers[i]
+                )
+        elif forward_mode.is_target_verify():
+            prefill_wrappers = []
+            for i in range(self.num_wrappers):
+                prefill_wrappers.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        use_cuda_graph=True,
+                        qo_indptr_buf=self.cuda_graph_qo_indptr[i][: bs + 1],
+                        paged_kv_indptr_buf=self.kv_indptr[i][: bs + 1],
+                        paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
+                        custom_mask_buf=self.cuda_graph_custom_mask,
+                        mask_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
+                    )
+                )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens.cpu(),  # may add a little overhead in capture stage
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=prefill_wrappers,
+                use_ragged=False,
+                encoder_lens=encoder_lens,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
+            self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
+        elif forward_mode.is_draft_extend():
+            prefill_wrappers = []
+            for i in range(self.num_wrappers):
+                prefill_wrappers.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend="fa2",
+                        use_cuda_graph=True,
+                        qo_indptr_buf=self.cuda_graph_qo_indptr[i][: bs + 1],
+                        paged_kv_indptr_buf=self.kv_indptr[i][: bs + 1],
+                        paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
+                    )
+                )
+
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens.cpu(),  # may add a little overhead in capture stage
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=prefill_wrappers,
+                use_ragged=False,
+                encoder_lens=encoder_lens,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
+            self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
+        else:
+            raise ValueError(f"Invalid mode: {forward_mode=}")
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        if forward_mode.is_decode_or_idle():
+            self.indices_updater_decode.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_cpu[:bs] if seq_lens_cpu is not None else None,
+                seq_lens_sum,
+                decode_wrappers=self.decode_cuda_graph_metadata[bs],
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=spec_info,
+            )
+        elif forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_cpu[:bs] if seq_lens_cpu is not None else None,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=spec_info,
+            )
+        elif forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_cpu[:bs] if seq_lens_cpu is not None else None,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=spec_info,
+            )
+        else:
+            raise ValueError("Invalid forward mode")
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        prefill_wrapper_paged = self.forward_metadata.prefill_wrappers[
+            self._get_wrapper_idx(layer)
+        ]
+        cache_loc = (
+            forward_batch.out_cache_loc
+            if not layer.is_cross_attention
+            else forward_batch.encoder_out_cache_loc
+        )
+
+        logits_soft_cap = layer.logit_cap
+
+        q = q.contiguous()
+        if not self.forward_metadata.use_ragged:
+            if k is not None:
+                assert v is not None
+                if save_kv_cache:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+
+            o = prefill_wrapper_paged.forward(
+                q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
+                causal=not layer.is_cross_attention,
+                sm_scale=layer.scaling,
+                window_left=layer.sliding_window_size,
+                logits_soft_cap=logits_soft_cap,
+                # Must use _float to avoid device-to-host copy that breaks cuda graph capture.
+                k_scale=layer.k_scale_float,
+                v_scale=layer.v_scale_float,
+            )
+        else:
+            causal = True
+            if layer.attn_type == AttentionType.ENCODER_ONLY:
+                save_kv_cache = False
+                causal = False
+
+            if self.forward_metadata.extend_no_prefix:
+                # NOTE: FlashInfer currently has limitations with head_dim = 32 or other dimensions
+                # The FlashInfer head_dim limitation itself is tracked here:
+                # https://github.com/flashinfer-ai/flashinfer/issues/1048
+                o = self.prefill_wrapper_ragged.forward(
+                    q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k.view(-1, layer.tp_k_head_num, layer.head_dim),
+                    v.view(-1, layer.tp_v_head_num, layer.head_dim),
+                    causal=causal,
+                    sm_scale=layer.scaling,
+                    logits_soft_cap=logits_soft_cap,
+                )
+
+            else:
+                o1, s1 = self.prefill_wrapper_ragged.forward_return_lse(
+                    q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k.view(-1, layer.tp_k_head_num, layer.head_dim),
+                    v.view(-1, layer.tp_v_head_num, layer.head_dim),
+                    causal=True,
+                    sm_scale=layer.scaling,
+                    logits_soft_cap=logits_soft_cap,
+                )
+                o2, s2 = prefill_wrapper_paged.forward_return_lse(
+                    q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
+                    causal=False,
+                    sm_scale=layer.scaling,
+                    logits_soft_cap=logits_soft_cap,
+                )
+
+                o, _ = merge_state(o1, s1, o2, s2)
+
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        decode_wrapper = self.forward_metadata.decode_wrappers[
+            self._get_wrapper_idx(layer)
+        ]
+        cache_loc = (
+            forward_batch.out_cache_loc
+            if not layer.is_cross_attention
+            else forward_batch.encoder_out_cache_loc
+        )
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                )
+
+        # Call the wrapped function
+        o = decode_wrapper.forward(
+            q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+            forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
+            sm_scale=layer.scaling,
+            logits_soft_cap=layer.logit_cap,
+            # Must use _float to avoid device-to-host copy that breaks cuda graph capture.
+            k_scale=layer.k_scale_float,
+            v_scale=layer.v_scale_float,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def _get_wrapper_idx(self, layer: RadixAttention):
+        if self.num_wrappers == 1:
+            return 0
+
+        if self.dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
+            return layer.sliding_window_size == -1
+        if self.dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
+            return layer.is_cross_attention
+
+        raise ValueError(f"Unknown dispatch reason: {self.dispatch_reason}")
+
+
+class FlashInferIndicesUpdaterDecode:
+    def __init__(self, model_runner: ModelRunner, attn_backend: FlashInferAttnBackend):
+        # Parse Constants
+        self.num_qo_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.kv_last_page_len = attn_backend.kv_last_page_len
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
+
+        # Dispatch the update function
+        if self.attn_backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
+            self.update = self.update_sliding_window
+        elif self.attn_backend.dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
+            self.update = self.update_cross_attention
+        else:
+            assert self.attn_backend.num_wrappers == 1
+            self.update = self.update_single_wrapper
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        decode_wrappers = decode_wrappers or self.decode_wrappers
+        self.call_begin_forward(
+            decode_wrappers[0],
+            req_pool_indices,
+            seq_lens,
+            seq_lens_sum,
+            self.kv_indptr[0],
+            None,
+            spec_info,
+            seq_lens_cpu,
+        )
+
+    def update_sliding_window(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        assert self.sliding_window_size is not None
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # Sliding window attention
+                paged_kernel_lens_tmp = torch.clamp(
+                    seq_lens, max=self.sliding_window_size + 1
+                )
+                if seq_lens_cpu is not None:
+                    seq_lens_cpu_tmp = torch.clamp(
+                        seq_lens_cpu, max=self.sliding_window_size + 1
+                    )
+                    paged_kernel_lens_sum_tmp = seq_lens_cpu_tmp.sum().item()
+                else:
+                    paged_kernel_lens_sum_tmp = paged_kernel_lens_tmp.sum().item()
+                kv_start_idx_tmp = seq_lens - paged_kernel_lens_tmp
+            else:
+                # Full attention
+                paged_kernel_lens_tmp = seq_lens
+                paged_kernel_lens_sum_tmp = seq_lens_sum
+                seq_lens_cpu_tmp = seq_lens_cpu
+                kv_start_idx_tmp = None
+
+            use_sliding_window_kv_pool = wrapper_id == 0 and isinstance(
+                self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+            )
+
+            self.call_begin_forward(
+                decode_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens_tmp,
+                paged_kernel_lens_sum_tmp,
+                self.kv_indptr[wrapper_id],
+                kv_start_idx_tmp,
+                spec_info,
+                seq_lens_cpu=seq_lens_cpu_tmp,
+                use_sliding_window_kv_pool=use_sliding_window_kv_pool,
+            )
+
+    def update_cross_attention(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # Normal attention
+                paged_kernel_lens = seq_lens
+                kv_start_idx = encoder_lens
+            else:
+                # Cross attention
+                paged_kernel_lens = encoder_lens
+                kv_start_idx = torch.zeros_like(encoder_lens)
+                seq_lens_sum = encoder_lens.sum().item()
+
+            self.call_begin_forward(
+                decode_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens,
+                seq_lens_sum,
+                self.kv_indptr[wrapper_id],
+                kv_start_idx,
+                spec_info,
+                seq_lens_cpu=seq_lens_cpu,
+            )
+
+    def call_begin_forward(
+        self,
+        wrapper: BatchDecodeWithPagedKVCacheWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        kv_indptr: torch.Tensor,
+        kv_start_idx: torch.Tensor,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+        use_sliding_window_kv_pool: bool = False,
+    ):
+        if spec_info is None:
+            bs = len(req_pool_indices)
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+
+            if wrapper.is_cuda_graph_enabled:
+                # Directly write to the cuda graph input buffer
+                kv_indices = wrapper._paged_kv_indices_buf
+            else:
+                kv_indices = torch.empty(
+                    paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
+                )
+
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                kv_start_idx,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+        else:
+            kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+            bs = kv_indptr.shape[0] - 1
+
+        if use_sliding_window_kv_pool:
+            kv_last_index = kv_indptr[-1]
+            kv_indices[:kv_last_index] = (
+                self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                    kv_indices[:kv_last_index]
+                )
+            )
+
+        global global_override_indptr_cpu
+        locally_override = False
+        if seq_lens_cpu is not None and global_override_indptr_cpu is None:
+            locally_override = True
+            global_override_indptr_cpu = torch.empty_like(kv_indptr, device="cpu")
+            global_override_indptr_cpu[0] = 0
+            global_override_indptr_cpu[1 : bs + 1] = torch.cumsum(seq_lens_cpu, dim=0)
+
+        wrapper.begin_forward(
+            kv_indptr,
+            kv_indices,
+            self.kv_last_page_len[:bs],
+            self.num_qo_heads,
+            self.num_kv_heads,
+            self.head_dim,
+            1,
+            data_type=self.data_type,
+            q_data_type=self.q_data_type,
+            non_blocking=True,
+        )
+
+        if locally_override:
+            global_override_indptr_cpu = None
+
+
+class FlashInferIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: FlashInferAttnBackend):
+        # Parse Constants
+        self.num_qo_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.kv_last_page_len = attn_backend.kv_last_page_len
+        self.qo_indptr = attn_backend.qo_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
+        self.prefill_wrapper_ragged = attn_backend.prefill_wrapper_ragged
+
+        # Dispatch the update function
+        if self.attn_backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
+            self.update = self.update_sliding_window
+        elif self.attn_backend.dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
+            self.update = self.update_cross_attention
+        else:
+            assert self.attn_backend.num_wrappers == 1
+            self.update = self.update_single_wrapper
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        if use_ragged:
+            # TODO: remove this device sync, we can use forward_batch.extend_prefix_lens_cpu
+            # and forward_batch.extend_seq_lens_cpu
+            paged_kernel_lens = prefix_lens
+            paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+        else:
+            paged_kernel_lens = seq_lens
+            paged_kernel_lens_sum = seq_lens_sum
+
+        self.call_begin_forward(
+            self.prefill_wrapper_ragged,
+            prefill_wrappers[0],
+            req_pool_indices,
+            paged_kernel_lens,
+            paged_kernel_lens_sum,
+            seq_lens,
+            prefix_lens,
+            None,
+            self.kv_indptr[0],
+            self.qo_indptr[0],
+            use_ragged,
+            spec_info,
+        )
+
+    def update_sliding_window(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # window attention use paged only
+                paged_kernel_lens = torch.minimum(
+                    seq_lens,
+                    torch.tensor(self.sliding_window_size) + seq_lens - prefix_lens,
+                )
+                paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+            else:
+                # full attention
+                paged_kernel_lens = seq_lens
+                paged_kernel_lens_sum = seq_lens_sum
+
+            kv_start_idx = seq_lens - paged_kernel_lens
+            use_sliding_window_kv_pool = wrapper_id == 0 and isinstance(
+                self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+            )
+
+            self.call_begin_forward(
+                self.prefill_wrapper_ragged,
+                prefill_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens,
+                paged_kernel_lens_sum,
+                seq_lens,
+                prefix_lens,
+                kv_start_idx,
+                self.kv_indptr[wrapper_id],
+                self.qo_indptr[wrapper_id],
+                use_ragged,
+                spec_info,
+                use_sliding_window_kv_pool=use_sliding_window_kv_pool,
+            )
+
+    def update_cross_attention(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # normal attention
+                paged_kernel_lens = seq_lens
+                kv_start_idx = encoder_lens
+                paged_kernel_lens_sum = seq_lens_sum
+            else:
+                # cross attention
+                paged_kernel_lens = encoder_lens
+                kv_start_idx = torch.zeros_like(encoder_lens)
+                paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+
+            self.call_begin_forward(
+                self.prefill_wrapper_ragged,
+                prefill_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens,
+                paged_kernel_lens_sum,
+                seq_lens,
+                prefix_lens,
+                kv_start_idx,
+                self.kv_indptr[wrapper_id],
+                self.qo_indptr[wrapper_id],
+                use_ragged,
+                spec_info,
+            )
+
+    def call_begin_forward(
+        self,
+        wrapper_ragged: BatchPrefillWithRaggedKVCacheWrapper,
+        wrapper_paged: BatchPrefillWithPagedKVCacheWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        seq_lens: torch.Tensor,
+        prefix_lens: torch.Tensor,
+        kv_start_idx: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        qo_indptr: torch.Tensor,
+        use_ragged: bool,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        use_sliding_window_kv_pool: bool = False,
+    ):
+        bs = len(seq_lens)
+        if spec_info is None:
+            assert len(seq_lens) == len(req_pool_indices)
+            # Normal extend
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum + 256,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                kv_start_idx,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+            qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+        else:
+            assert isinstance(spec_info, EagleDraftInput) or isinstance(
+                spec_info, EagleVerifyInput
+            )
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    paged_kernel_lens,
+                    paged_kernel_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        # extend part
+        if use_ragged:
+            wrapper_ragged.begin_forward(
+                qo_indptr,
+                qo_indptr,
+                self.num_qo_heads,
+                self.num_kv_heads,
+                self.head_dim,
+                q_data_type=self.q_data_type,
+            )
+
+        if use_sliding_window_kv_pool:
+            kv_last_index = kv_indptr[-1]
+            kv_indices[:kv_last_index] = (
+                self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                    kv_indices[:kv_last_index]
+                )
+            )
+
+        # cached part
+        wrapper_paged.begin_forward(
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            self.kv_last_page_len[:bs],
+            self.num_qo_heads,
+            self.num_kv_heads,
+            self.head_dim,
+            1,
+            q_data_type=self.q_data_type,
+            kv_data_type=self.data_type,
+            custom_mask=custom_mask,
+            non_blocking=True,
+        )
+
+
+class FlashInferMultiStepDraftBackend:
+    """
+    Wrap multiple flashinfer attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices
+
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+        self.page_size = model_runner.page_size
+
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.kv_last_page_len = torch.ones(
+            (max_bs,), dtype=torch.int32, device=model_runner.device
+        )
+        self.attn_backends: List[FlashInferAttnBackend] = []
+        for i in range(self.speculative_num_steps):
+            self.attn_backends.append(
+                FlashInferAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                    kv_last_page_len_buf=self.kv_last_page_len,
+                )
+            )
+
+        self.max_context_len = self.attn_backends[0].max_context_len
+
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+
+    def common_template(
+        self,
+        forward_batch: ForwardBatch,
+        kv_indices_buffer: torch.Tensor,
+        call_fn: Callable,
+    ):
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        self.generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        assert forward_batch.spec_info is not None
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+
+        # Copy the kv_indptr once to avoid multiple device-to-host copies in flashinfer's plan.
+        indptr_cpu_whole = self.kv_indptr[:, : bs + 1].cpu()
+        global global_override_indptr_cpu
+
+        for i in range(self.speculative_num_steps - 1):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            global_override_indptr_cpu = indptr_cpu_whole[i]
+            call_fn(i, forward_batch)
+
+        global_override_indptr_cpu = None
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.empty(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        def call_fn(i, forward_batch):
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_bs * self.max_context_len),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, kv_indices_buf=self.cuda_graph_kv_indices[i]
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+
+def should_use_tensor_core(
+    kv_cache_dtype: torch.dtype,
+    num_attention_heads: int,
+    num_kv_heads: int,
+) -> bool:
+    """
+    Determine whether to use tensor cores for attention computation.
+
+    Args:
+        kv_cache_dtype: Data type of the KV cache
+        num_attention_heads: Number of attention heads
+        num_kv_heads: Number of key/value heads
+
+    Returns:
+        bool: Whether to use tensor cores
+    """
+    # Try to use environment variable first
+    env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
+    if env_override is not None:
+        return env_override.lower() == "true"
+
+    # Try to use _grouped_size_compiled_for_decode_kernels if available
+    # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
+    try:
+        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+
+        if not _grouped_size_compiled_for_decode_kernels(
+            num_attention_heads,
+            num_kv_heads,
+        ):
+            return True
+        else:
+            return False
+    except (ImportError, AttributeError):
+        pass
+
+    # Calculate GQA group size
+    gqa_group_size = num_attention_heads // num_kv_heads
+
+    # For Flashinfer, a GQA group size of at least 4 is needed to efficiently
+    # use Tensor Cores, as it fuses the head group with the token dimension in MMA.
+    if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+        return True
+    elif kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16):
+        return gqa_group_size >= 4
+    else:
+        return False
+
+
+# Use as a fast path to override the indptr in flashinfer's plan function
+# This is used to remove some host-to-device copy overhead.
+global_override_indptr_cpu = None
+
+
+def fast_decode_plan(
+    self,
+    indptr: torch.Tensor,
+    indices: torch.Tensor,
+    last_page_len: torch.Tensor,
+    num_qo_heads: int,
+    num_kv_heads: int,
+    head_dim: int,
+    page_size: int,
+    pos_encoding_mode: str = "NONE",
+    window_left: int = -1,
+    logits_soft_cap: Optional[float] = None,
+    q_data_type: Optional[Union[str, torch.dtype]] = None,
+    kv_data_type: Optional[Union[str, torch.dtype]] = None,
+    data_type: Optional[Union[str, torch.dtype]] = None,
+    sm_scale: Optional[float] = None,
+    rope_scale: Optional[float] = None,
+    rope_theta: Optional[float] = None,
+    non_blocking: bool = True,
+) -> None:
+    """
+    A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for FlashInferMultiStepDraftBackend.
+    Modifications:
+    - Remove unnecessary device-to-device copy for the cuda graph buffers.
+    - Remove unnecessary host-to-device copy for the metadata buffers.
+    """
+    batch_size = len(last_page_len)
+    if logits_soft_cap is None:
+        logits_soft_cap = 0.0
+
+    # Handle data types consistently
+    if data_type is not None:
+        if q_data_type is None:
+            q_data_type = data_type
+        if kv_data_type is None:
+            kv_data_type = data_type
+    elif q_data_type is None:
+        q_data_type = "float16"
+
+    if kv_data_type is None:
+        kv_data_type = q_data_type
+
+    if self.use_tensor_cores:
+        qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
+
+    if self.is_cuda_graph_enabled:
+        if batch_size != self._fixed_batch_size:
+            raise ValueError(
+                "The batch size should be fixed in cudagraph mode, the runtime batch size {} "
+                " mismatches the batch size set during initialization {}".format(
+                    batch_size, self._fixed_batch_size
+                )
+            )
+        if len(indices) > len(self._paged_kv_indices_buf):
+            raise ValueError(
+                "The size of indices should be less than or equal to the allocated buffer"
+            )
+    else:
+        self._paged_kv_indptr_buf = indptr
+        self._paged_kv_indices_buf = indices
+        self._paged_kv_last_page_len_buf = last_page_len
+        if self.use_tensor_cores:
+            self._qo_indptr_buf = qo_indptr_host.to(
+                self.device, non_blocking=non_blocking
+            )
+
+    # Create empty tensors for dtype info if needed
+    empty_q_data = torch.empty(
+        0,
+        dtype=(
+            getattr(torch, q_data_type) if isinstance(q_data_type, str) else q_data_type
+        ),
+        device=self.device,
+    )
+
+    empty_kv_cache = torch.empty(
+        0,
+        dtype=(
+            getattr(torch, kv_data_type)
+            if isinstance(kv_data_type, str)
+            else kv_data_type
+        ),
+        device=self.device,
+    )
+
+    indptr_host = (
+        global_override_indptr_cpu
+        if global_override_indptr_cpu is not None
+        else indptr.cpu()
+    )
+
+    with torch.cuda.device(self.device):
+
+        if self.use_tensor_cores:
+            # ALSO convert last_page_len to CPU
+            if page_size == 1:
+                # When page size is 1, last_page_len is always 1.
+                # Directly construct the host tensor rather than executing a device-to-host copy.
+                last_page_len_host = torch.ones(
+                    (batch_size,), dtype=torch.int32, device="cpu"
+                )
+            else:
+                last_page_len_host = last_page_len.cpu()
+
+            kv_lens_arr_host = get_seq_lens(indptr_host, last_page_len_host, page_size)
+
+            try:
+                # Make sure we pass exactly 15 arguments for tensor core version
+                self._plan_info = self._cached_module.plan(
+                    self._float_workspace_buffer,
+                    self._int_workspace_buffer,
+                    self._pin_memory_int_workspace_buffer,
+                    qo_indptr_host,
+                    indptr_host,
+                    kv_lens_arr_host,
+                    batch_size,  # total_num_rows
+                    batch_size,
+                    num_qo_heads,
+                    num_kv_heads,
+                    page_size,
+                    self.is_cuda_graph_enabled,
+                    head_dim,
+                    head_dim,
+                    False,  # causal
+                )
+            except Exception as e:
+                raise RuntimeError(f"Error in standard plan: {e}")
+        else:
+            try:
+                # Make sure we pass exactly 15 arguments for standard version
+                self._plan_info = self._cached_module.plan(
+                    self._float_workspace_buffer,
+                    self._int_workspace_buffer,
+                    self._pin_memory_int_workspace_buffer,
+                    indptr_host,
+                    batch_size,
+                    num_qo_heads,
+                    num_kv_heads,
+                    page_size,
+                    self.is_cuda_graph_enabled,
+                    window_left,
+                    logits_soft_cap,
+                    head_dim,
+                    head_dim,
+                    empty_q_data,
+                    empty_kv_cache,
+                )
+            except Exception as e:
+                raise RuntimeError(f"Error in standard plan: {e}")
+
+    self._pos_encoding_mode = pos_encoding_mode
+    self._window_left = window_left
+    self._logits_soft_cap = logits_soft_cap
+    self._sm_scale = sm_scale
+    self._rope_scale = rope_scale
+    self._rope_theta = rope_theta
diff --git a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
new file mode 100644
index 000000000..05e9bef80
--- /dev/null
+++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
@@ -0,0 +1,1079 @@
+from __future__ import annotations
+
+"""
+Support attention backend for flashinfer MLA.
+The flashinfer_mla_disable_ragged flag controls whether to use ragged prefill wrapper and defaults to be false.
+When it's set to false, all wrappers are BatchMLAPaged wrapper.
+When it's set to true, the backend uses BatchRagged and BatchMLAPaged wrapper for prefilling,
+and uses BatchMLAPaged wrapper for decoding.
+More details can be found in https://docs.flashinfer.ai/api/mla.html
+"""
+
+import os
+from dataclasses import dataclass
+from functools import partial
+from typing import TYPE_CHECKING, Callable, Optional, Union
+
+import torch
+
+if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
+    import logging
+
+    torch._logging.set_logs(dynamo=logging.ERROR)
+    torch._dynamo.config.suppress_errors = True
+
+from sglang.global_config import global_config
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.flashinfer_backend import (
+    create_flashinfer_kv_indices_triton,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+from sglang.srt.utils import (
+    is_flashinfer_available,
+    is_sm100_supported,
+    next_power_of_2,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
+
+if is_flashinfer_available():
+    from flashinfer import (
+        BatchMLAPagedAttentionWrapper,
+        BatchPrefillWithRaggedKVCacheWrapper,
+    )
+
+
+@dataclass
+class DecodeMetadata:
+    decode_wrapper: BatchMLAPagedAttentionWrapper
+
+
+@dataclass
+class PrefillMetadata:
+    prefill_wrapper: BatchMLAPagedAttentionWrapper
+    use_ragged: bool
+
+
+# Reuse this workspace buffer across all flashinfer wrappers
+global_workspace_buffer = None
+
+
+class FlashInferMhaChunkKVRunner:
+    def __init__(
+        self, model_runner: ModelRunner, attn_backend: "FlashInferMlaAttnBackend"
+    ):
+        # Parse Constants
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.data_type = model_runner.dtype
+        self.q_data_type = model_runner.dtype
+
+        # Buffers and wrappers
+        self.qo_indptr = attn_backend.qo_indptr
+        self.workspace_buffer = attn_backend.workspace_buffer
+        self.fmha_backend = attn_backend.fmha_backend
+
+        self.chunk_ragged_wrappers = []
+        self.ragged_wrapper = attn_backend.prefill_wrapper_ragged
+
+    def update_prefix_chunks(self, num_prefix_chunks: int):
+        while num_prefix_chunks > len(self.chunk_ragged_wrappers):
+            ragged_wrapper = BatchPrefillWithRaggedKVCacheWrapper(
+                self.workspace_buffer, "NHD", backend=self.fmha_backend
+            )
+            self.chunk_ragged_wrappers.append(ragged_wrapper)
+
+    def update_wrapper(
+        self,
+        forward_batch: ForwardBatch,
+        disable_flashinfer_ragged: bool = False,
+    ):
+        assert forward_batch.num_prefix_chunks is not None
+        num_prefix_chunks = forward_batch.num_prefix_chunks
+        self.update_prefix_chunks(num_prefix_chunks)
+
+        prefix_lens = forward_batch.extend_prefix_lens
+        seq_lens = forward_batch.seq_lens
+
+        bs = len(seq_lens)
+        qo_indptr = self.qo_indptr
+        qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+        qo_indptr = qo_indptr[: bs + 1]
+
+        for chunk_idx in range(forward_batch.num_prefix_chunks):
+            # MHA for chunked prefix kv cache when running model with MLA
+            assert forward_batch.prefix_chunk_idx is not None
+            assert forward_batch.prefix_chunk_cu_seq_lens is not None
+            assert forward_batch.prefix_chunk_max_seq_lens is not None
+
+            kv_indptr = forward_batch.prefix_chunk_cu_seq_lens[chunk_idx]
+            wrapper = self.chunk_ragged_wrappers[chunk_idx]
+            wrapper.begin_forward(
+                qo_indptr=qo_indptr,
+                kv_indptr=kv_indptr,
+                num_qo_heads=self.num_local_heads,
+                num_kv_heads=self.num_local_heads,
+                head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
+                head_dim_vo=self.v_head_dim,
+                q_data_type=self.q_data_type,
+                causal=False,
+            )
+        # ragged prefill
+        if not disable_flashinfer_ragged:
+            self.ragged_wrapper.begin_forward(
+                qo_indptr=qo_indptr,
+                kv_indptr=qo_indptr,
+                num_qo_heads=self.num_local_heads,
+                num_kv_heads=self.num_local_heads,
+                head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
+                head_dim_vo=self.v_head_dim,
+                q_data_type=self.q_data_type,
+                causal=True,
+            )
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+    ):
+        logits_soft_cap = layer.logit_cap
+        if forward_batch.attn_attend_prefix_cache:
+            chunk_idx = forward_batch.prefix_chunk_idx
+            assert chunk_idx >= 0
+            wrapper = self.chunk_ragged_wrappers[chunk_idx]
+            o1, s1 = wrapper.forward_return_lse(
+                q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                v.view(-1, layer.tp_v_head_num, layer.v_head_dim).to(q.dtype),
+                causal=False,
+                sm_scale=layer.scaling,
+                logits_soft_cap=logits_soft_cap,
+            )
+        else:
+            o1, s1 = self.ragged_wrapper.forward_return_lse(
+                q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                v.view(-1, layer.tp_v_head_num, layer.v_head_dim).to(q.dtype),
+                causal=True,
+                sm_scale=layer.scaling,
+                logits_soft_cap=logits_soft_cap,
+            )
+
+        return o1, s1
+
+
+class FlashInferMLAAttnBackend(AttentionBackend):
+    """Flashinfer attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        q_indptr_decode_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__()
+
+        # Parse constants
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.skip_prefill = skip_prefill
+        self.enable_chunk_kv = (
+            not skip_prefill
+            and global_server_args_dict["disaggregation_mode"] != "decode"
+            and not global_server_args_dict["disable_chunked_prefix_cache"]
+            and not global_server_args_dict["flashinfer_mla_disable_ragged"]
+        )
+        self.page_size = model_runner.page_size
+
+        # Allocate buffers
+        global global_workspace_buffer
+        if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
+            global_workspace_buffer = torch.empty(
+                global_config.flashinfer_workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_workspace_buffer
+
+        max_bs = model_runner.req_to_token_pool.size
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        if not self.skip_prefill:
+            self.qo_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+        if q_indptr_decode_buf is None:
+            self.q_indptr_decode = torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.q_indptr_decode = q_indptr_decode_buf
+
+        self.fmha_backend = "auto"
+        if is_sm100_supported():
+            self.fmha_backend = "cutlass"
+        self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
+            self.workspace_buffer, "NHD", backend=self.fmha_backend
+        )
+
+        if not self.skip_prefill:
+            self.prefill_wrapper_paged = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                backend="auto",
+            )
+
+            # FlashinferMLA backend uses mla wrapper for target verify
+            self.prefill_wrapper_verify = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                backend="auto",
+            )
+
+        self.decode_wrapper = BatchMLAPagedAttentionWrapper(
+            self.workspace_buffer, backend="auto"
+        )
+
+        # Create indices updater
+        if not skip_prefill:
+            self.indices_updater_prefill = FlashInferMLAIndicesUpdaterPrefill(
+                model_runner, self
+            )
+            if self.enable_chunk_kv:
+                self.mha_chunk_kv_cache = FlashInferMhaChunkKVRunner(model_runner, self)
+
+        self.indices_updater_decode = FlashInferMLAIndicesUpdaterDecode(
+            model_runner, self
+        )
+
+        # Other metadata
+        self.forward_metadata: Union[PrefillMetadata, DecodeMetadata] = None
+        self.decode_cuda_graph_metadata = {}
+        self.prefill_cuda_graph_metadata = {}  # For verify
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode_or_idle():
+            self.indices_updater_decode.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                decode_wrapper=self.decode_wrapper,
+                init_metadata_replay=False,
+            )
+            self.forward_metadata = DecodeMetadata(self.decode_wrapper)
+        elif forward_batch.forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_wrapper_paged,
+                use_ragged=False,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(self.prefill_wrapper_paged, False)
+        elif forward_batch.forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_wrapper_verify,
+                use_ragged=False,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(self.prefill_wrapper_verify, False)
+        else:
+            prefix_lens = forward_batch.extend_prefix_lens
+            extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+            use_ragged = (
+                not global_server_args_dict["flashinfer_mla_disable_ragged"]
+                and extend_no_prefix
+            )
+
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                prefix_lens,
+                prefill_wrapper_paged=self.prefill_wrapper_paged,
+                use_ragged=use_ragged,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrapper_paged, use_ragged
+            )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        if kv_indices_buf is None:
+            cuda_graph_kv_indices = torch.zeros(
+                (max_bs * self.max_context_len,),
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = kv_indices_buf
+
+        self.cuda_graph_kv_indices = cuda_graph_kv_indices
+        self.cuda_graph_qo_indptr = self.q_indptr_decode.clone()
+        self.cuda_graph_kv_indptr = self.kv_indptr.clone()
+        self.cuda_graph_kv_lens = torch.ones(
+            (max_bs,), dtype=torch.int32, device=self.device
+        )
+
+        # For fast decode plan in graph replaying
+        self.cuda_graph_qo_indptr_cpu = self.cuda_graph_qo_indptr.to("cpu")
+        self.cuda_graph_kv_indptr_cpu = self.cuda_graph_kv_indptr.to("cpu")
+        self.fast_decode_kwargs = {
+            "qo_indptr_cpu": self.cuda_graph_qo_indptr_cpu,
+            "kv_indptr_cpu": self.cuda_graph_kv_indptr_cpu,
+            "kv_indices": self.cuda_graph_kv_indices,
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+    ):
+        if forward_mode.is_decode_or_idle():
+            decode_wrapper = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                use_cuda_graph=True,
+                qo_indptr=self.cuda_graph_qo_indptr[: num_tokens + 1],
+                kv_indptr=self.cuda_graph_kv_indptr[: num_tokens + 1],
+                kv_indices=self.cuda_graph_kv_indices,
+                kv_len_arr=self.cuda_graph_kv_lens[:num_tokens],
+                backend="auto",
+            )
+
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_decode.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                decode_wrapper=decode_wrapper,
+                init_metadata_replay=False,
+                spec_info=spec_info,
+            )
+            self.decode_cuda_graph_metadata[bs] = decode_wrapper
+            self.forward_metadata = DecodeMetadata(decode_wrapper)
+            decode_wrapper.plan = partial(fast_mla_decode_plan, decode_wrapper)
+        elif forward_mode.is_target_verify():
+            verify_wrapper = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                use_cuda_graph=True,
+                qo_indptr=self.cuda_graph_qo_indptr[: bs + 1],
+                kv_indptr=self.cuda_graph_kv_indptr[: bs + 1],
+                kv_indices=self.cuda_graph_kv_indices,
+                kv_len_arr=self.cuda_graph_kv_lens[:bs],
+                backend="auto",
+            )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=verify_wrapper,
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = verify_wrapper
+            self.forward_metadata = PrefillMetadata(verify_wrapper, False)
+        elif forward_mode.is_draft_extend():
+            draft_extend_wrapper = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                use_cuda_graph=True,
+                qo_indptr=self.cuda_graph_qo_indptr[: bs + 1],
+                kv_indptr=self.cuda_graph_kv_indptr[: bs + 1],
+                kv_indices=self.cuda_graph_kv_indices,
+                kv_len_arr=self.cuda_graph_kv_lens[:bs],
+                backend="auto",
+            )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=draft_extend_wrapper,
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = draft_extend_wrapper
+            self.forward_metadata = PrefillMetadata(draft_extend_wrapper, False)
+        else:
+            raise ValueError(f"Invalid mode: {forward_mode=}")
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        if forward_mode.is_decode_or_idle():
+            assert seq_lens_cpu is not None
+            kv_len_arr_cpu = seq_lens_cpu[:bs]
+            self.cuda_graph_kv_indptr_cpu[1 : bs + 1] = torch.cumsum(
+                kv_len_arr_cpu, dim=0
+            )
+            self.fast_decode_kwargs.update(
+                {
+                    "qo_indptr_cpu": self.cuda_graph_qo_indptr_cpu[: bs + 1],
+                    "kv_indptr_cpu": self.cuda_graph_kv_indptr_cpu[: bs + 1],
+                    "kv_len_arr_cpu": kv_len_arr_cpu,
+                }
+            )
+
+            self.indices_updater_decode.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_sum,
+                decode_wrapper=self.decode_cuda_graph_metadata[bs],
+                init_metadata_replay=True,
+                spec_info=spec_info,
+                **self.fast_decode_kwargs,
+            )
+        elif forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+        elif forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_mode=}")
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def init_mha_chunk_metadata(
+        self, forward_batch: ForwardBatch, disable_flashinfer_ragged: bool = False
+    ):
+        """Init the metadata for a forward pass."""
+        self.mha_chunk_kv_cache.update_wrapper(forward_batch, disable_flashinfer_ragged)
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        if (
+            forward_batch.attn_attend_prefix_cache is not None
+            and forward_batch.mha_return_lse
+        ):  # MHA Chunk
+            assert self.enable_chunk_kv
+            assert q_rope is None
+            assert k_rope is None
+            o1, s1 = self.mha_chunk_kv_cache.forward(q, k, v, layer, forward_batch)
+            return o1, s1
+
+        cache_loc = forward_batch.out_cache_loc
+        logits_soft_cap = layer.logit_cap
+        prefill_wrapper_paged = self.forward_metadata.prefill_wrapper
+
+        # Save kv cache
+        if save_kv_cache and k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer, cache_loc, k, k_rope
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+        if q_rope is not None:
+            q = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+
+        if self.forward_metadata.use_ragged:
+            # ragged prefill
+            if q_rope is not None:
+                q = torch.cat([q, q_rope], dim=-1)
+            qall = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            if k_rope is not None:
+                k = torch.cat([k, k_rope], dim=-1)
+            o = self.prefill_wrapper_ragged.forward(
+                qall,
+                k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                causal=True,
+                sm_scale=layer.scaling,
+                logits_soft_cap=logits_soft_cap,
+            )
+        else:
+            # mla paged prefill
+            k_buf = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+                q.dtype
+            )
+            if q_rope is None:
+                qall = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+                q, q_rope = (
+                    qall[:, :, : layer.v_head_dim],
+                    qall[:, :, layer.v_head_dim :],
+                )
+            o = q.new_empty(q.shape)
+            o = prefill_wrapper_paged.run(
+                q,
+                q_rope,
+                k_buf[:, :, : layer.v_head_dim],
+                k_buf[:, :, layer.v_head_dim :],
+                out=o,
+            )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        decode_wrapper = self.forward_metadata.decode_wrapper
+        cache_loc = forward_batch.out_cache_loc
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )
+
+        # Reshape inputs
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            reshaped_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = reshaped_q[:, :, : layer.v_head_dim]
+            q_rope = reshaped_q[:, :, layer.v_head_dim :]
+
+        k_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+            q.dtype
+        )
+
+        o = q_nope.new_empty(q_nope.shape)
+        # Direct call to run without the wrapper
+        o = decode_wrapper.run(
+            q_nope,
+            q_rope,
+            k_buffer[:, :, : layer.v_head_dim],
+            k_buffer[:, :, layer.v_head_dim :],
+            out=o,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+
+class FlashInferMLAIndicesUpdaterDecode:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.dtype
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.q_indptr = attn_backend.q_indptr_decode
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        decode_wrapper: BatchMLAPagedAttentionWrapper,
+        init_metadata_replay: bool = False,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
+        **fast_decode_kwargs,
+    ):
+        decode_wrapper = decode_wrapper or self.decode_wrapper
+        self.call_begin_forward(
+            decode_wrapper,
+            req_pool_indices,
+            seq_lens,
+            seq_lens_sum,
+            self.q_indptr,
+            self.kv_indptr,
+            init_metadata_replay,
+            spec_info,
+            **fast_decode_kwargs,
+        )
+
+    def call_begin_forward(
+        self,
+        wrapper: BatchMLAPagedAttentionWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        q_indptr: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        init_metadata_replay: bool = False,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
+        **fast_decode_kwargs,
+    ):
+        bs = len(req_pool_indices)
+        q_indptr = q_indptr[: bs + 1]
+        kv_lens = paged_kernel_lens.to(torch.int32)
+        sm_scale = self.scaling
+        if spec_info is None:
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = (
+                torch.empty(paged_kernel_lens_sum, dtype=torch.int32, device="cuda")
+                if not init_metadata_replay
+                else fast_decode_kwargs["kv_indices"]
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+        else:
+            kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+        if not init_metadata_replay:
+            wrapper.plan(
+                q_indptr,
+                kv_indptr,
+                kv_indices,
+                kv_lens,
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                False,
+                sm_scale,
+                self.data_type,
+                self.data_type,
+            )
+        else:
+            wrapper.plan(
+                fast_decode_kwargs["qo_indptr_cpu"],
+                fast_decode_kwargs["kv_indptr_cpu"],
+                kv_indices,
+                fast_decode_kwargs["kv_len_arr_cpu"],
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                False,
+                sm_scale,
+                self.data_type,
+                self.data_type,
+            )
+
+
+class FlashInferMLAIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.dtype
+        self.q_data_type = model_runner.dtype
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.qo_indptr = attn_backend.qo_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.prefill_wrapper_ragged = attn_backend.prefill_wrapper_ragged
+
+    def update(
+        self,
+        req_pool_indices: torch.Tnesor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrapper_paged: BatchMLAPagedAttentionWrapper,
+        use_ragged: bool,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
+    ):
+        if use_ragged:
+            paged_kernel_lens = prefix_lens
+            paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+        else:
+            paged_kernel_lens = seq_lens
+            paged_kernel_lens_sum = seq_lens_sum
+
+        self.call_begin_forward(
+            self.prefill_wrapper_ragged,
+            prefill_wrapper_paged,
+            req_pool_indices,
+            paged_kernel_lens,
+            paged_kernel_lens_sum,
+            seq_lens,
+            prefix_lens,
+            self.kv_indptr,
+            self.qo_indptr,
+            use_ragged,
+            spec_info,
+        )
+
+    def call_begin_forward(
+        self,
+        wrapper_ragged: BatchPrefillWithRaggedKVCacheWrapper,
+        wrapper_paged: BatchMLAPagedAttentionWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        seq_lens: torch.Tensor,
+        prefix_lens: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        qo_indptr: torch.Tensor,
+        use_ragged: bool,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
+    ):
+        bs = len(seq_lens)
+        sm_scale = self.scaling
+
+        if spec_info is None:
+            assert len(seq_lens) == len(req_pool_indices)
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+            qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+        else:
+            assert isinstance(spec_info, EagleDraftInput) or isinstance(
+                spec_info, EagleVerifyInput
+            )
+            # TODO: Support topk > 1 with custom mask
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    paged_kernel_lens,
+                    paged_kernel_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        if use_ragged:
+            # ragged prefill
+            wrapper_ragged.begin_forward(
+                qo_indptr=qo_indptr,
+                kv_indptr=qo_indptr,
+                num_qo_heads=self.num_local_heads,
+                num_kv_heads=self.num_local_heads,
+                head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
+                head_dim_vo=self.v_head_dim,
+                q_data_type=self.q_data_type,
+                causal=True,
+            )
+        else:
+            # mla paged prefill
+            kv_len_arr = kv_indptr[1:] - kv_indptr[:-1]
+            wrapper_paged.plan(
+                qo_indptr,
+                kv_indptr,
+                kv_indices,
+                kv_len_arr,
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                True,
+                sm_scale,
+                self.q_data_type,
+                self.data_type,
+            )
+
+
+class FlashInferMLAMultiStepDraftBackend:
+    """
+    Wrap multiple flashinfer mla attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices
+
+        if topk > 1:
+            raise ValueError(
+                "Currently Flashinfer MLA only supports topk=1 for speculative decoding"
+            )
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.q_indptr_decode = torch.arange(
+            0, max_bs + 1, dtype=torch.int32, device=model_runner.device
+        )
+
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps):
+            self.attn_backends.append(
+                FlashInferMLAAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                    q_indptr_decode_buf=self.q_indptr_decode,
+                )
+            )
+
+        self.max_context_len = self.attn_backends[0].max_context_len
+
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+        self.page_size = model_runner.server_args.page_size
+
+    def common_template(
+        self,
+        forward_batch: ForwardBatch,
+        kv_indices_buffer: torch.Tensor,
+        call_fn: Callable,
+    ):
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        self.generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        assert forward_batch.spec_info is not None
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+
+        for i in range(self.speculative_num_steps - 1):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.zeros(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        def call_fn(i, forward_batch):
+            assert forward_batch.spec_info is not None
+            assert isinstance(forward_batch.spec_info, EagleDraftInput)
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_bs * self.max_context_len),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, kv_indices_buf=self.cuda_graph_kv_indices[i]
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+
+def fast_mla_decode_plan(
+    self,
+    qo_indptr_cpu: torch.Tensor,
+    kv_indptr_cpu: torch.Tensor,
+    kv_indices: torch.Tensor,
+    kv_len_arr_cpu: torch.Tensor,
+    num_heads: int,
+    head_dim_ckv: int,
+    head_dim_kpe: int,
+    page_size: int,
+    causal: bool,
+    sm_scale: float,
+    q_data_type: torch.dtype,
+    kv_data_type: torch.dtype,
+) -> None:
+    """A faster version of BatchMLAPagedAttentionWrapper::plan,
+    for skipping the stream synchronization in original plan function during
+    cuda graph replaying.
+    """
+    self._causal = causal
+    self._page_size = page_size
+    self._sm_scale = sm_scale
+
+    try:
+        # Standard version with just the required arguments (no use_profiler)
+        self._cached_module.plan.default(
+            self._float_workspace_buffer,
+            self._int_workspace_buffer,
+            self._pin_memory_int_workspace_buffer,
+            qo_indptr_cpu,
+            kv_indptr_cpu,
+            kv_len_arr_cpu,
+            num_heads,
+            head_dim_ckv,
+            causal,
+        )
+    except Exception as e:
+        raise RuntimeError(f"Error in alternate MLA plan: {e}")
diff --git a/python/sglang/srt/layers/attention/flashmla_backend.py b/python/sglang/srt/layers/attention/flashmla_backend.py
new file mode 100644
index 000000000..d1acb1a58
--- /dev/null
+++ b/python/sglang/srt/layers/attention/flashmla_backend.py
@@ -0,0 +1,541 @@
+from __future__ import annotations
+
+"""
+Support attention backend for FlashMLA.
+"""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable, Optional, Tuple, Union
+
+import torch
+import triton
+from flash_mla import flash_mla_with_kvcache, get_mla_metadata
+
+from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
+from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
+
+
+# FlashMLA only supports pagesize=64
+PAGE_SIZE = 64
+
+# FlashMLA FP8 issue: https://github.com/deepseek-ai/FlashMLA/issues/56
+
+
+@dataclass
+class FlashMLADecodeMetadata:
+    flashmla_metadata: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+    num_splits: Optional[torch.Tensor] = None
+    block_kv_indices: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        flashmla_metadata: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        num_splits: Optional[torch.Tensor] = None,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        self.flashmla_metadata = flashmla_metadata
+        self.num_splits = num_splits
+        self.block_kv_indices = block_kv_indices
+
+
+class FlashMLABackend(FlashInferMLAAttnBackend):
+    """Flashmla attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__(
+            model_runner, skip_prefill, kv_indptr_buf, kv_last_page_len_buf
+        )
+
+        self.num_q_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.forward_metadata: Union[FlashMLADecodeMetadata] = None
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.kv_cache_dim = self.kv_lora_rank + self.qk_rope_head_dim
+
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+
+        bs = forward_batch.batch_size
+        if forward_batch.forward_mode.is_decode_or_idle():
+            max_seqlen_pad = triton.cdiv(
+                forward_batch.seq_lens_cpu.max().item(), PAGE_SIZE
+            )
+            block_kv_indices = torch.full(
+                (bs, max_seqlen_pad),
+                -1,
+                dtype=torch.int32,
+                device=forward_batch.seq_lens.device,
+            )
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                None,
+                block_kv_indices,
+                self.req_to_token.stride(0),
+                max_seqlen_pad,
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                forward_batch.seq_lens.to(torch.int32),
+                self.num_q_heads,
+                1,
+            )
+            self.forward_metadata = FlashMLADecodeMetadata(
+                mla_metadata,
+                num_splits,
+                block_kv_indices,
+            )
+        elif forward_batch.forward_mode.is_target_verify():
+            seq_lens_cpu = forward_batch.seq_lens_cpu + self.num_draft_tokens
+            seq_lens = forward_batch.seq_lens + self.num_draft_tokens
+
+            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+            block_kv_indices = torch.full(
+                (bs, max_seqlen_pad),
+                -1,
+                dtype=torch.int32,
+                device=seq_lens.device,
+            )
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                seq_lens,
+                None,
+                block_kv_indices,
+                self.req_to_token.stride(0),
+                max_seqlen_pad,
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+            )
+
+            # Use FlashMLADecodeMetadata which has the attributes forward_extend expects
+            self.forward_metadata = FlashMLADecodeMetadata(
+                mla_metadata,
+                num_splits,
+                block_kv_indices,
+            )
+        else:
+            super().init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        if block_kv_indices is None:
+            cuda_graph_kv_indices = torch.full(
+                (max_bs, (self.max_context_len + PAGE_SIZE) // PAGE_SIZE),
+                1,
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = block_kv_indices
+
+        if self.num_draft_tokens:
+            self.cuda_graph_mla_metadata, self.cuda_graph_num_splits = get_mla_metadata(
+                torch.ones(
+                    max_bs, dtype=torch.int32, device=cuda_graph_kv_indices.device
+                ),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+            )
+        else:
+            self.cuda_graph_mla_metadata, self.cuda_graph_num_splits = get_mla_metadata(
+                torch.ones(
+                    max_bs, dtype=torch.int32, device=cuda_graph_kv_indices.device
+                ),
+                self.num_q_heads,
+                1,
+            )
+        self.cuda_graph_kv_indices = cuda_graph_kv_indices
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+    ):
+        if forward_mode.is_decode_or_idle():
+            max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
+
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_q_heads,
+                1,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata = FlashMLADecodeMetadata(
+                self.cuda_graph_mla_metadata,
+                self.cuda_graph_num_splits[: bs + 1],
+                self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
+            )
+        elif forward_mode.is_target_verify():
+            seq_lens = seq_lens + self.num_draft_tokens
+            max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
+
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata = FlashMLADecodeMetadata(
+                self.cuda_graph_mla_metadata,
+                self.cuda_graph_num_splits[: bs + 1],
+                self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
+            )
+        else:
+            super().init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+
+        if forward_mode.is_decode_or_idle():
+            assert seq_lens_cpu is not None
+            seq_lens = seq_lens[:bs]
+            seq_lens_cpu = seq_lens_cpu[:bs]
+            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices[:bs],
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_q_heads,
+                1,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata.mla_metadata = self.cuda_graph_mla_metadata
+            self.forward_metadata.num_splits = self.cuda_graph_num_splits[: bs + 1]
+            self.forward_metadata.block_kv_indices = self.cuda_graph_kv_indices[
+                :bs, :max_seqlen_pad
+            ]
+        elif forward_mode.is_target_verify():
+            seq_lens = seq_lens[:bs] + self.num_draft_tokens
+            seq_lens_cpu = seq_lens_cpu[:bs] + self.num_draft_tokens
+            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices[:bs],
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata.mla_metadata = self.cuda_graph_mla_metadata
+            self.forward_metadata.num_splits = self.cuda_graph_num_splits[: bs + 1]
+            self.forward_metadata.block_kv_indices = self.cuda_graph_kv_indices[
+                :bs, :max_seqlen_pad
+            ]
+        else:
+            super().init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        cache_loc = forward_batch.out_cache_loc
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    cache_loc,
+                    k,
+                    v,
+                )
+        bs = forward_batch.batch_size
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+        reshape_q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim)
+        if self.data_type == torch.float8_e4m3fn:
+            reshape_q_fp8 = reshape_q.to(torch.float8_e4m3fn)
+            o, _ = flash_mla_with_kvcache(
+                q=reshape_q_fp8,
+                k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                block_table=self.forward_metadata.block_kv_indices[:bs],
+                cache_seqlens=forward_batch.seq_lens.to(torch.int32),
+                head_dim_v=self.kv_lora_rank,  # TODO Retrieve from config.
+                tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                num_splits=self.forward_metadata.num_splits,
+                softmax_scale=layer.scaling,
+                causal=True,
+                descale_q=torch.ones((1), dtype=torch.float32, device=reshape_q.device),
+                descale_k=torch.ones((1), dtype=torch.float32, device=reshape_q.device),
+            )
+
+            return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+        else:
+            # todo: need check all causal True or False?
+            o, _ = flash_mla_with_kvcache(
+                q=reshape_q,
+                k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                block_table=self.forward_metadata.block_kv_indices[:bs],
+                cache_seqlens=forward_batch.seq_lens.to(torch.int32),
+                head_dim_v=self.kv_lora_rank,  # TODO Retrieve from config.
+                tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                num_splits=self.forward_metadata.num_splits,
+                softmax_scale=layer.scaling,
+                causal=True,
+            )
+
+            return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        if (
+            forward_batch.forward_mode == ForwardMode.EXTEND
+            or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
+        ):
+            return super().forward_extend(q, k, v, layer, forward_batch, save_kv_cache)
+        else:
+            cache_loc = forward_batch.out_cache_loc
+
+            if k is not None:
+                assert v is not None
+                if save_kv_cache:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+
+            bs = forward_batch.batch_size
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+            reshape_q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim)
+            if self.data_type == torch.float8_e4m3fn:
+                reshape_q_fp8 = reshape_q.to(torch.float8_e4m3fn)
+                o, _ = flash_mla_with_kvcache(
+                    q=reshape_q_fp8,
+                    k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                    block_table=self.forward_metadata.block_kv_indices[:bs],
+                    cache_seqlens=forward_batch.seq_lens.to(torch.int32)
+                    + self.num_draft_tokens,
+                    head_dim_v=self.kv_lora_rank,
+                    tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                    num_splits=self.forward_metadata.num_splits,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                    descale_q=torch.ones(
+                        (1), dtype=torch.float32, device=reshape_q.device
+                    ),
+                    descale_k=torch.ones(
+                        (1), dtype=torch.float32, device=reshape_q.device
+                    ),
+                )
+            else:
+                o, _ = flash_mla_with_kvcache(
+                    q=reshape_q,
+                    k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                    block_table=self.forward_metadata.block_kv_indices[:bs],
+                    cache_seqlens=forward_batch.seq_lens.to(torch.int32)
+                    + self.num_draft_tokens,
+                    head_dim_v=self.kv_lora_rank,
+                    tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                    num_splits=self.forward_metadata.num_splits,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                )
+            return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+
+# TODO: multi step kv indices optimization
+class FlashMLAMultiStepDraftBackend:
+    """
+    Wrap multiple flashmla attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        if topk > 1:
+            raise ValueError(
+                "Currently FlashMLA only supports topk=1 for speculative decoding"
+            )
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps):
+            self.attn_backends.append(
+                FlashMLABackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                    kv_last_page_len_buf=None,
+                )
+            )
+
+    def common_template(
+        self,
+        forward_batch: ForwardBatch,
+        call_fn: Callable,
+    ):
+        assert forward_batch.spec_info is not None
+
+        for i in range(self.speculative_num_steps - 1):
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            assert forward_batch.spec_info is not None
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, block_kv_indices=None
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
+
+        self.common_template(forward_batch, call_fn)
diff --git a/python/sglang/srt/layers/attention/hybrid_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
new file mode 100644
index 000000000..580a977ec
--- /dev/null
+++ b/python/sglang/srt/layers/attention/hybrid_attn_backend.py
@@ -0,0 +1,139 @@
+from typing import Optional, Union
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+
+class HybridAttnBackend(AttentionBackend):
+    """Support different backends for prefill and decode."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        prefill_backend: AttentionBackend,
+        decode_backend: AttentionBackend,
+    ):
+        self.model_runner = model_runner
+        self.prefill_backend = prefill_backend
+        self.decode_backend = decode_backend
+
+    def _select_backend(self, forward_mode: ForwardMode) -> AttentionBackend:
+        """
+        Select the appropriate attention backend based on the forward mode.
+
+        Args:
+            forward_mode: The current forward mode indicating the operation type
+
+        Returns:
+            The selected attention backend (prefill or decode)
+
+        Note:
+            - decode_or_idle: Always uses decode backend
+            - target_verify or draft_extend: Uses decode backend if speculative_attention_mode is "decode", otherwise prefill backend
+            - prefill: Always uses prefill backend
+        """
+        if forward_mode.is_decode_or_idle():
+            return self.decode_backend
+        elif forward_mode.is_target_verify() or forward_mode.is_draft_extend():
+            return (
+                self.decode_backend
+                if self.model_runner.server_args.speculative_attention_mode == "decode"
+                else self.prefill_backend
+            )
+        else:
+            return self.prefill_backend
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        backend = self._select_backend(forward_batch.forward_mode)
+        backend.init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.decode_backend.init_cuda_graph_state(max_bs, max_num_tokens)
+        if (
+            self.model_runner.server_args.speculative_algorithm is not None
+            and self.model_runner.server_args.speculative_attention_mode == "prefill"
+        ):
+            # When speculative decoding is enabled, we need to initialize the backend
+            # that will be used for target_verify.
+            self.prefill_backend.init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        backend = self._select_backend(forward_mode)
+        backend.init_forward_metadata_capture_cuda_graph(
+            bs,
+            num_tokens,
+            req_pool_indices,
+            seq_lens,
+            encoder_lens,
+            forward_mode,
+            spec_info,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        backend = self._select_backend(forward_mode)
+        backend.init_forward_metadata_replay_cuda_graph(
+            bs,
+            req_pool_indices,
+            seq_lens,
+            seq_lens_sum,
+            encoder_lens,
+            forward_mode,
+            spec_info,
+            seq_lens_cpu,
+        )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return self.decode_backend.get_cuda_graph_seq_len_fill_value()
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        return self.decode_backend.forward_decode(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        backend = self._select_backend(forward_batch.forward_mode)
+        return backend.forward_extend(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
diff --git a/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
new file mode 100644
index 000000000..a676573f2
--- /dev/null
+++ b/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
@@ -0,0 +1,584 @@
+from dataclasses import astuple, dataclass
+from functools import lru_cache
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule
+from sglang.srt.layers.attention.fla.fused_recurrent import (
+    fused_recurrent_gated_delta_rule_update,
+)
+from sglang.srt.layers.attention.fla.fused_sigmoid_gating_recurrent import (
+    fused_sigmoid_gating_delta_rule_update,
+)
+from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.models.qwen3_next import Qwen3HybridLinearDecoderLayer, fused_gdn_gating
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+
+@dataclass
+class ForwardMetadata:
+    query_start_loc: Optional[torch.Tensor]
+    mamba_cache_indices: torch.Tensor
+
+
+class MambaAttnBackend(AttentionBackend):
+    """Attention backend using Mamba kernel."""
+
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__()
+        self.pad_slot_id = -1  # Default pad slot id
+        self.device = model_runner.device
+        self.req_to_token_pool: HybridReqToTokenPool = model_runner.req_to_token_pool
+        self.forward_metadata: ForwardMetadata = None
+        self.state_indices_list = []
+        self.query_start_loc_list = []
+
+    @classmethod
+    @lru_cache(maxsize=128)
+    def _get_cached_arange(cls, bs: int, device_str: str) -> torch.Tensor:
+        """Cache torch.arange tensors for common batch sizes to avoid repeated allocation."""
+        device = torch.device(device_str)
+        return torch.arange(0, bs + 1, dtype=torch.int32, device=device)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        bs = forward_batch.batch_size
+        if forward_batch.forward_mode.is_decode_or_idle():
+            query_start_loc = self._get_cached_arange(bs, str(self.device))
+        elif forward_batch.forward_mode.is_extend():
+            if forward_batch.forward_mode.is_target_verify():
+                query_start_loc = torch.arange(
+                    0,
+                    forward_batch.input_ids.shape[0] + 1,
+                    step=forward_batch.spec_info.draft_token_num,
+                    dtype=torch.int32,
+                    device=forward_batch.input_ids.device,
+                )
+            else:
+                query_start_loc = torch.empty(
+                    (bs + 1,), dtype=torch.int32, device=self.device
+                )
+                query_start_loc[:bs] = forward_batch.extend_start_loc
+                query_start_loc[bs] = (
+                    forward_batch.extend_start_loc[-1]
+                    + forward_batch.extend_seq_lens[-1]
+                )
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode=}")
+        mamba_cache_indices = self.req_to_token_pool.get_mamba_indices(
+            forward_batch.req_pool_indices
+        )
+        self.forward_metadata = ForwardMetadata(
+            query_start_loc=query_start_loc,
+            mamba_cache_indices=mamba_cache_indices,
+        )
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(max_bs):
+            self.state_indices_list.append(
+                torch.full((i + 1,), self.pad_slot_id, dtype=torch.int32, device="cuda")
+            )
+            self.query_start_loc_list.append(
+                torch.empty((i + 2,), dtype=torch.int32, device="cuda")
+            )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        if forward_mode.is_decode_or_idle():
+            self.query_start_loc_list[bs - 1].copy_(self._get_cached_arange(bs, "cuda"))
+        elif forward_mode.is_target_verify():
+            self.query_start_loc_list[bs - 1].copy_(
+                torch.arange(
+                    0,
+                    bs * spec_info.draft_token_num + 1,
+                    step=spec_info.draft_token_num,
+                    dtype=torch.int32,
+                    device="cuda",
+                )
+            )
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_mode=}")
+        mamba_indices = self.req_to_token_pool.get_mamba_indices(req_pool_indices)
+        self.state_indices_list[bs - 1][: len(mamba_indices)].copy_(mamba_indices)
+        self.forward_metadata = ForwardMetadata(
+            query_start_loc=self.query_start_loc_list[bs - 1],
+            mamba_cache_indices=self.state_indices_list[bs - 1],
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        num_padding = torch.count_nonzero(
+            seq_lens_cpu == self.get_cuda_graph_seq_len_fill_value()
+        )
+        # Make sure forward metadata is correctly handled for padding reqs
+        req_pool_indices[bs - num_padding :] = 0
+        mamba_indices = self.req_to_token_pool.get_mamba_indices(req_pool_indices)
+        mamba_indices[bs - num_padding :] = -1
+        self.state_indices_list[bs - 1][: len(mamba_indices)].copy_(mamba_indices)
+        if forward_mode.is_decode_or_idle():
+            self.query_start_loc_list[bs - 1].copy_(self._get_cached_arange(bs, "cuda"))
+            if num_padding > 0:
+                self.query_start_loc_list[bs - 1][bs - num_padding :] = bs - num_padding
+        elif forward_mode.is_target_verify():
+            self.query_start_loc_list[bs - 1].copy_(
+                torch.arange(
+                    0,
+                    bs * spec_info.draft_token_num + 1,
+                    step=spec_info.draft_token_num,
+                    dtype=torch.int32,
+                    device="cuda",
+                )
+            )
+            if num_padding > 0:
+                self.query_start_loc_list[bs - 1][bs - num_padding :] = (
+                    bs - num_padding
+                ) * spec_info.draft_token_num
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_mode=}")
+
+        self.forward_metadata = ForwardMetadata(
+            query_start_loc=self.query_start_loc_list[bs - 1],
+            mamba_cache_indices=self.state_indices_list[bs - 1],
+        )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1  # Mamba attn does not use seq lens to index kv cache
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        mixed_qkv = kwargs["mixed_qkv"]
+        conv_weights = kwargs["conv_weights"]
+        bias = kwargs["bias"]
+        activation = kwargs["activation"]
+        key_dim = kwargs["key_dim"]
+        value_dim = kwargs["value_dim"]
+        attn_tp_size = kwargs["attention_tp_size"]
+        head_k_dim = kwargs["head_k_dim"]
+        head_v_dim = kwargs["head_v_dim"]
+        a = kwargs["a"]
+        b = kwargs["b"]
+        A_log = kwargs["A_log"]
+        dt_bias = kwargs["dt_bias"]
+        layer_id = kwargs["layer_id"]
+
+        conv_states, ssm_states, *rest = self.req_to_token_pool.get_mamba_params(
+            layer_id
+        )
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+
+        mixed_qkv = causal_conv1d_update(
+            mixed_qkv,
+            conv_states,
+            conv_weights,
+            bias,
+            activation,
+            conv_state_indices=cache_indices,
+        )
+
+        query, key, value = torch.split(
+            mixed_qkv,
+            [
+                key_dim // attn_tp_size,
+                key_dim // attn_tp_size,
+                value_dim // attn_tp_size,
+            ],
+            dim=-1,
+        )
+        # Reshape from [l, h*d] to [1, l, h, d]
+        seq_len = query.shape[0]
+        num_heads = query.shape[1] // head_k_dim
+        query = query.view(1, seq_len, num_heads, head_k_dim)
+        key = key.view(1, seq_len, num_heads, head_k_dim)
+        value = value.view(1, seq_len, value.shape[1] // head_v_dim, head_v_dim)
+
+        core_attn_out = fused_sigmoid_gating_delta_rule_update(
+            A_log=A_log,
+            dt_bias=dt_bias,
+            q=query,
+            k=key,
+            v=value,
+            a=a,
+            b=b,
+            initial_state_source=ssm_states,
+            initial_state_indices=cache_indices,
+            cu_seqlens=query_start_loc,
+            use_qk_l2norm_in_kernel=True,
+            softplus_beta=1.0,
+            softplus_threshold=20.0,
+        )
+
+        return core_attn_out
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        mixed_qkv = kwargs["mixed_qkv"]
+        conv_weights = kwargs["conv_weights"]
+        bias = kwargs["bias"]
+        activation = kwargs["activation"]
+        key_dim = kwargs["key_dim"]
+        value_dim = kwargs["value_dim"]
+        attn_tp_size = kwargs["attention_tp_size"]
+        head_k_dim = kwargs["head_k_dim"]
+        head_v_dim = kwargs["head_v_dim"]
+        a = kwargs["a"]
+        b = kwargs["b"]
+        A_log = kwargs["A_log"]
+        dt_bias = kwargs["dt_bias"]
+        layer_id = kwargs["layer_id"]
+        seq_len = kwargs["seq_len"]
+
+        is_target_verify = forward_batch.forward_mode.is_target_verify()
+
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+
+        if is_target_verify:
+            (
+                conv_states,
+                ssm_states,
+                intermediate_state_cache,
+                intermediate_conv_window_cache,
+            ) = self.req_to_token_pool.get_mamba_params(layer_id)
+            has_initial_states = torch.ones(
+                seq_len // forward_batch.spec_info.draft_token_num,
+                dtype=torch.bool,
+                device=forward_batch.input_ids.device,
+            )
+            conv_states_to_use = conv_states.clone()
+        else:
+            conv_states, ssm_states, *rest = self.req_to_token_pool.get_mamba_params(
+                layer_id
+            )
+            has_initial_states = forward_batch.extend_prefix_lens > 0
+            conv_states_to_use = conv_states
+
+        if is_target_verify:
+            batch_size = seq_len // forward_batch.spec_info.draft_token_num
+            draft_token_num = forward_batch.spec_info.draft_token_num
+            mixed_qkv_reshaped = (
+                mixed_qkv.view(batch_size, draft_token_num, -1)
+                .transpose(1, 2)
+                .contiguous()
+            )
+            mixed_qkv_processed = causal_conv1d_update(
+                mixed_qkv_reshaped,
+                conv_states_to_use,
+                conv_weights,
+                bias,
+                activation,
+                conv_state_indices=cache_indices[:batch_size],
+                intermediate_conv_window=intermediate_conv_window_cache,
+            )
+            mixed_qkv = (
+                mixed_qkv_processed.transpose(1, 2).contiguous().view(seq_len, -1)
+            )
+        else:
+            mixed_qkv = causal_conv1d_fn(
+                mixed_qkv.transpose(0, 1),
+                conv_weights,
+                bias,
+                activation=activation,
+                conv_states=conv_states_to_use,
+                has_initial_state=has_initial_states,
+                cache_indices=cache_indices,
+                query_start_loc=query_start_loc,
+            ).transpose(0, 1)[:seq_len]
+
+        key_split_dim = key_dim // attn_tp_size
+        value_split_dim = value_dim // attn_tp_size
+
+        query, key, value = torch.split(
+            mixed_qkv,
+            [key_split_dim, key_split_dim, value_split_dim],
+            dim=-1,
+        )
+
+        actual_seq_len = query.shape[0]
+        num_heads = query.shape[1] // head_k_dim
+        num_value_heads = value.shape[1] // head_v_dim
+
+        query = query.view(1, actual_seq_len, num_heads, head_k_dim)
+        key = key.view(1, actual_seq_len, num_heads, head_k_dim)
+        value = value.view(1, actual_seq_len, num_value_heads, head_v_dim)
+
+        beta = b.sigmoid()
+        g = fused_gdn_gating(A_log, a, dt_bias)
+
+        g = g.unsqueeze(0)
+        beta = beta.unsqueeze(0)
+
+        if is_target_verify:
+            core_attn_out = fused_recurrent_gated_delta_rule_update(
+                q=query,
+                k=key,
+                v=value,
+                g=g,
+                beta=beta,
+                initial_state_source=ssm_states,
+                initial_state_indices=cache_indices,
+                cu_seqlens=query_start_loc,
+                use_qk_l2norm_in_kernel=True,
+                disable_state_update=True,
+                intermediate_states_buffer=intermediate_state_cache,
+                cache_steps=forward_batch.spec_info.draft_token_num,
+            )
+        else:
+            recurrent_state = ssm_states[cache_indices]
+            core_attn_out, last_recurrent_state = chunk_gated_delta_rule(
+                q=query,
+                k=key,
+                v=value,
+                g=g,
+                beta=beta,
+                initial_state=recurrent_state,
+                output_final_state=True,
+                cu_seqlens=query_start_loc,
+                head_first=False,
+                use_qk_l2norm_in_kernel=True,
+            )
+            last_recurrent_state = last_recurrent_state.to(ssm_states.dtype, copy=False)
+            ssm_states[cache_indices] = last_recurrent_state
+
+        return core_attn_out
+
+
+class HybridLinearAttnBackend(AttentionBackend):
+    """Support different backends for prefill and decode."""
+
+    def __init__(
+        self,
+        full_attn_backend: AttentionBackend,
+        linear_attn_backend: AttentionBackend,
+        full_attn_layers: list[int],
+    ):
+        self.full_attn_layers = full_attn_layers
+        self.attn_backend_list = [full_attn_backend, linear_attn_backend]
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return self.attn_backend_list[0].get_cuda_graph_seq_len_fill_value()
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        layer_id = layer.layer_id if layer else kwargs["layer_id"]
+        if layer_id in self.full_attn_layers:
+            return self.attn_backend_list[0].forward_decode(
+                q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+            )
+        return self.attn_backend_list[1].forward_decode(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        layer_id = layer.layer_id if layer else kwargs["layer_id"]
+        if layer_id in self.full_attn_layers:
+            return self.attn_backend_list[0].forward_extend(
+                q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+            )
+        return self.attn_backend_list[1].forward_extend(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        """Run forward on an attention layer."""
+        if forward_batch.forward_mode.is_idle():
+            if layer is None:
+                return torch.empty_like(kwargs["z"])
+            return q.new_empty(q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+        elif forward_batch.forward_mode.is_decode():
+            return self.forward_decode(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache=save_kv_cache,
+                **kwargs,
+            )
+        else:
+            return self.forward_extend(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache=save_kv_cache,
+                **kwargs,
+            )
+
+    def update_mamba_state_after_mtp_verify(self, accepted_length, model):
+        request_number = accepted_length.shape[0]
+
+        state_indices_tensor = self.attn_backend_list[
+            1
+        ].forward_metadata.mamba_cache_indices[:request_number]
+
+        mamba_caches = self.attn_backend_list[
+            1
+        ].req_to_token_pool.get_mamba_params_all_layers()
+
+        (
+            conv_states,
+            ssm_states,
+            intermediate_state_cache,
+            intermediate_conv_window_cache,
+        ) = mamba_caches
+
+        # SSM state updates (chunked to reduce peak memory)
+        valid_mask = accepted_length > 0
+
+        # Compute common indices once to avoid duplication
+        last_steps_all = (accepted_length - 1).to(torch.int64)
+        valid_state_indices = state_indices_tensor[valid_mask].to(torch.int64)
+        last_steps = last_steps_all[valid_mask].to(torch.int64)
+
+        if valid_state_indices.numel() > 0:
+            chunk = 256
+            num_valid = valid_state_indices.numel()
+
+            # SSM state updates
+            for i in range(0, num_valid, chunk):
+                idx = valid_state_indices[i : i + chunk]
+                steps = last_steps[i : i + chunk]
+                # per (cache line, step)
+                for j in range(idx.numel()):
+                    ci = idx[j].item()
+                    st = steps[j].item()
+                    ssm_states[:, ci, :].copy_(
+                        intermediate_state_cache[:, ci, st].to(
+                            ssm_states.dtype, copy=False
+                        )
+                    )
+
+            # Conv window updates
+            for i in range(0, num_valid, chunk):
+                idx = valid_state_indices[i : i + chunk]
+                steps = last_steps[i : i + chunk]
+                for j in range(idx.numel()):
+                    ci = idx[j].item()
+                    st = steps[j].item()
+                    conv_states[:, ci, :, :].copy_(
+                        intermediate_conv_window_cache[:, ci, st].to(
+                            conv_states.dtype, copy=False
+                        )
+                    )
diff --git a/python/sglang/srt/layers/attention/intel_amx_backend.py b/python/sglang/srt/layers/attention/intel_amx_backend.py
new file mode 100644
index 000000000..39e5c7428
--- /dev/null
+++ b/python/sglang/srt/layers/attention/intel_amx_backend.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class IntelAMXAttnBackend(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        import sgl_kernel
+
+        super().__init__()
+        self.forward_metadata = None
+        self.device = model_runner.device
+
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // model_runner.tp_size
+        )
+
+        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]
+
+        self.decode_attention_fwd = torch.ops.sgl_kernel.decode_attention_cpu
+        self.extend_attention_fwd = torch.ops.sgl_kernel.extend_attention_cpu
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+
+        bs = forward_batch.batch_size
+        attn_logits = torch.zeros(
+            (
+                bs,
+                self.num_head,
+                8,  # self.num_kv_splits,
+                self.v_head_dim + 1,
+            ),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        if forward_batch.forward_mode.is_decode_or_idle():
+            max_extend_len = None
+        else:
+            max_extend_len = torch.max(forward_batch.extend_seq_lens).item()
+        self.forward_metadata = (attn_logits, max_extend_len)
+
+    def get_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        _, max_extend_len = self.forward_metadata
+
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k,
+            v,
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            forward_batch.extend_seq_lens,
+            forward_batch.extend_start_loc,
+            max_extend_len,
+            layer.scaling,
+            layer.logit_cap,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        attn_logits, _ = self.forward_metadata
+
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        self.decode_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            k,
+            v,
+            forward_batch.out_cache_loc,
+            attn_logits,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            layer.scaling,
+            layer.logit_cap,
+        )
+
+        return o
+
+    def support_triton(self):
+        return False
diff --git a/python/sglang/srt/layers/attention/mamba/causal_conv1d.py b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
new file mode 100644
index 000000000..d004337ff
--- /dev/null
+++ b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
@@ -0,0 +1,128 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) 2024, Tri Dao.
+# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
+
+from typing import Optional
+
+import torch
+from sgl_kernel import causal_conv1d_fwd
+from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
+PAD_SLOT_ID = -1
+
+
+def causal_conv1d_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    query_start_loc: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    conv_states: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+):
+    """
+    x: (batch, dim, seqlen) or (dim,cu_seq_len) for varlen
+        sequences are concatenated from left to right for varlen
+    weight: (dim, width)
+    bias: (dim,)
+    query_start_loc: (batch + 1) int32
+        The cumulative sequence lengths of the sequences in
+        the batch, used to index into sequence. prepended by 0.
+        for example: query_start_loc = torch.Tensor([0,10,16,17]),
+        x.shape=(dim,17)
+    cache_indices: (batch)  int32
+        indicates the corresponding state index,
+        like so: conv_state = conv_states[cache_indices[batch_id]]
+    has_initial_state: (batch) bool
+        indicates whether should the kernel take the current state as initial
+        state for the calculations
+    conv_states: (...,dim,width - 1) itype
+        updated inplace if provided
+    activation: either None or "silu" or "swish"
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+
+
+    out: (batch, dim, seqlen)
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    bias = bias.contiguous() if bias is not None else None
+
+    causal_conv1d_fwd(
+        x,
+        weight,
+        bias,
+        conv_states,
+        query_start_loc,
+        cache_indices,
+        has_initial_state,
+        activation in ["silu", "swish"],
+        pad_slot_id,
+    )
+    return x
+
+
+def causal_conv1d_update(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    activation: Optional[str] = None,
+    cache_seqlens: Optional[torch.Tensor] = None,
+    conv_state_indices: Optional[torch.Tensor] = None,
+    pad_slot_id: int = PAD_SLOT_ID,
+):
+    """
+    x: (batch, dim) or (batch, dim, seqlen)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
+    weight: (dim, width)
+    bias: (dim,)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state
+        starting at the index
+        @cache_seqlens % state_len.
+    conv_state_indices: (batch,), dtype int32
+        If not None, the conv_state is a larger tensor along the batch dim,
+        and we are selecting the batch coords specified by conv_state_indices.
+        Useful for a continuous batching scenario.
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError(
+            f"activation must be None, silu, or swish, actual: {activation}"
+        )
+    activation_val = activation in ["silu", "swish"]
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        x = x.unsqueeze(-1)
+    causal_conv1d_update_kernel(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation_val,
+        cache_seqlens,
+        conv_state_indices,
+        pad_slot_id,
+    )
+    if unsqueeze:
+        x = x.squeeze(-1)
+    return x
diff --git a/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py b/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py
new file mode 100644
index 000000000..3c1bdec48
--- /dev/null
+++ b/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py
@@ -0,0 +1,1052 @@
+# Copyright (c) 2024, Tri Dao.
+# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
+# and https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+
+from typing import Optional, Union
+
+import numpy as np
+import torch
+
+PAD_SLOT_ID = -1
+import triton
+import triton.language as tl
+
+
+@triton.jit()
+def _causal_conv1d_fwd_kernel(  # continuous batching
+    # Pointers to matrices
+    x_ptr,  # (dim, cu_seqlen) holding `batch` of actual sequences + padded sequences
+    w_ptr,  # (dim, width)
+    bias_ptr,
+    initial_states_ptr,  # conv_states_ptr
+    cache_indices_ptr,  # conv_state_indices_ptr
+    has_initial_states_ptr,
+    query_start_loc_ptr,
+    batch_ptr,
+    token_chunk_offset_ptr,
+    o_ptr,  # (dim, seqlen) - actually pointing to x_ptr
+    # Matrix dimensions
+    batch: tl.int32,  # actually padded_batch
+    dim: tl.constexpr,
+    seqlen: tl.int32,  # cu_seqlen
+    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
+    # Strides
+    stride_x_seq: tl.constexpr,  # stride to get to next sequence,
+    stride_x_dim: tl.constexpr,  # stride to get to next feature-value,
+    stride_x_token: tl.constexpr,  # stride to get to next token (same feature-index, same sequence-index)
+    stride_w_dim: tl.constexpr,  # stride to get to next dim-axis value
+    stride_w_width: tl.constexpr,  # stride to get to next width-axis value
+    stride_istate_seq: tl.constexpr,
+    stride_istate_dim: tl.constexpr,
+    stride_istate_token: tl.constexpr,
+    stride_o_seq: tl.constexpr,
+    stride_o_dim: tl.constexpr,
+    stride_o_token: tl.constexpr,
+    # others
+    pad_slot_id: tl.constexpr,
+    # Meta-parameters
+    HAS_BIAS: tl.constexpr,
+    KERNEL_WIDTH: tl.constexpr,
+    SILU_ACTIVATION: tl.constexpr,
+    HAS_INITIAL_STATES: tl.constexpr,
+    HAS_CACHE: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    USE_PAD_SLOT: tl.constexpr,
+    NP2_STATELEN: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    conv_states_ptr = initial_states_ptr
+    conv_state_indices_ptr = cache_indices_ptr
+    stride_conv_state_seq = stride_istate_seq
+    stride_conv_state_dim = stride_istate_dim
+    stride_conv_state_tok = stride_istate_token
+    state_len = (
+        KERNEL_WIDTH - 1
+    )  # can be passed via argument if it's not the same as this value
+
+    # one program handles one chunk in a single sequence
+    # rather than mixing sequences - to make updating initial_states across sequences efficiently
+
+    # single-sequence id
+    idx_seq = tl.load(batch_ptr + tl.program_id(0))
+    chunk_offset = tl.load(token_chunk_offset_ptr + tl.program_id(0))
+
+    # BLOCK_N elements along the feature-dimension (channel)
+    idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    if idx_seq == pad_slot_id:
+        return
+
+    sequence_start_index = tl.load(query_start_loc_ptr + idx_seq)
+    sequence_end_index = tl.load(query_start_loc_ptr + idx_seq + 1)
+    # find the actual sequence length
+    seqlen = sequence_end_index - sequence_start_index
+
+    token_offset = BLOCK_M * chunk_offset
+    segment_len = min(BLOCK_M, seqlen - token_offset)
+
+    # base of the sequence
+    x_base = (
+        x_ptr + sequence_start_index * stride_x_token + idx_feats * stride_x_dim
+    )  # [BLOCK_N,]
+
+    if IS_CONTINUOUS_BATCHING:
+        # cache_idx
+        conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to(tl.int64)
+    else:
+        # cache_idx
+        conv_state_batch_coord = idx_seq
+    if USE_PAD_SLOT:  # noqa
+        if conv_state_batch_coord == pad_slot_id:
+            # not processing as this is not the actual sequence
+            return
+    conv_states_base = (
+        conv_states_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )  # [BLOCK_N,]
+
+    w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]
+
+    # Does 2 things:
+    # 1. READ prior-block init-state data - [done by every Triton programs]
+    # 2. update conv_state with new data [only by the Triton program handles chunk_offset=0]
+    if chunk_offset == 0:
+        # read from conv_states
+        load_init_state = False
+        if HAS_INITIAL_STATES:  # the new HAS_INITIAL_STATES
+            load_init_state = tl.load(has_initial_states_ptr + idx_seq).to(tl.int1)
+        if load_init_state:
+            # load from conv_states
+            prior_tokens = conv_states_base + (state_len - 1) * stride_conv_state_tok
+            mask_w = idx_feats < dim
+            if KERNEL_WIDTH == 2:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+            if KERNEL_WIDTH == 3:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+            if KERNEL_WIDTH == 4:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
+                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+            if KERNEL_WIDTH == 5:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col3 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
+                col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok  # [BLOCK_N]
+                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 3 * stride_conv_state_tok  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+        else:
+            # prior-tokens are zeros
+            if KERNEL_WIDTH >= 2:  # STRATEGY1
+                # first chunk and does not have prior-token, so just set to 0
+                col0 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+            if KERNEL_WIDTH >= 3:  # STRATEGY1
+                col1 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+            if KERNEL_WIDTH >= 4:  # STRATEGY1
+                col2 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+            if KERNEL_WIDTH >= 5:  # STRATEGY1
+                col3 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+
+        # STEP 2:
+        # here prepare data for updating conv_state
+        if (
+            state_len <= seqlen
+        ):  # SMALL_CACHE=True (only move part of 'x' into conv_state cache)
+            # just read from 'x'
+            # copy 'x' data to conv_state
+            # load only 'x' data (and set 0 before 'x' if seqlen < state_len)
+            idx_tokens_last = (seqlen - state_len) + tl.arange(
+                0, NP2_STATELEN
+            )  # [BLOCK_M]
+            x_ptrs = (
+                x_ptr
+                + ((sequence_start_index + idx_tokens_last) * stride_x_token)[:, None]
+                + (idx_feats * stride_x_dim)[None, :]
+            )  # [BLOCK_M,BLOCK_N,]
+            mask_x = (
+                (idx_tokens_last >= 0)[:, None]
+                & (idx_tokens_last < seqlen)[:, None]
+                & (idx_feats < dim)[None, :]
+            )  # token-index  # token-index  # feature-index
+            loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+            new_conv_state = tl.load(x_ptrs, mask_x, 0.0)
+            idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+            conv_states_ptrs_target = (
+                conv_states_base[None, :]
+                + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+            )
+
+            mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[None, :]
+            tl.debug_barrier()  #  NOTE: use this due to bug in Triton compiler
+            tl.store(conv_states_ptrs_target, new_conv_state, mask)
+
+        else:
+            if load_init_state:
+                # update conv_state by shifting left, i.e. take last few cols from conv_state + cols from 'x'
+                idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+                conv_states_ptrs_source = (
+                    conv_states_ptr
+                    + (conv_state_batch_coord * stride_conv_state_seq)
+                    + (idx_feats * stride_conv_state_dim)[None, :]
+                    + ((idx_tokens_conv + seqlen) * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (
+                    (conv_state_batch_coord < num_cache_lines)
+                    & ((idx_tokens_conv + seqlen) < state_len)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )
+                conv_state = tl.load(conv_states_ptrs_source, mask, other=0.0)
+
+                VAL = state_len - seqlen
+
+                x_ptrs = (
+                    x_base[None, :]
+                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+
+                mask_x = (
+                    (idx_tokens_conv - VAL >= 0)[:, None]
+                    & (idx_tokens_conv - VAL < seqlen)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )  # token-index  # token-index  # feature-index
+                loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+
+                tl.debug_barrier()  # need this due to the bug in tl.where not enforcing this when data is the result of another tl.load
+                new_conv_state = tl.where(
+                    mask, conv_state, loaded_x
+                )  # BUG in 'tl.where'  which requires a barrier before this
+                conv_states_ptrs_target = (
+                    conv_states_base
+                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
+                    None, :
+                ]
+                tl.store(conv_states_ptrs_target, new_conv_state, mask)
+            else:  # load_init_state == False
+                # update conv_state by shifting left, BUT
+                # set cols prior to 'x' as zeros + cols from 'x'
+                idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+                VAL = state_len - seqlen
+
+                x_ptrs = (
+                    x_base[None, :]
+                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+
+                mask_x = (
+                    (idx_tokens_conv - VAL >= 0)[:, None]
+                    & (idx_tokens_conv - VAL < seqlen)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )  # token-index  # token-index  # feature-index
+                new_conv_state = tl.load(x_ptrs, mask_x, 0.0)
+
+                conv_states_ptrs_target = (
+                    conv_states_base
+                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
+                    None, :
+                ]
+                tl.store(conv_states_ptrs_target, new_conv_state, mask)
+
+    else:  # chunk_offset > 0
+        # read prior-token data from `x`
+        load_init_state = True
+        prior_tokens = x_base + (token_offset - 1) * stride_x_token
+        mask_w = idx_feats < dim
+        if KERNEL_WIDTH == 2:
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+        if KERNEL_WIDTH == 3:
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+        if KERNEL_WIDTH == 4:
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+        if KERNEL_WIDTH == 5:
+            # ruff: noqa: F841
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col3 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
+            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 3 * stride_x_token  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+
+    if HAS_BIAS:
+        bias = bias_ptr + idx_feats
+        mask_bias = idx_feats < dim
+        acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
+            tl.float32
+        )  # [BLOCK_N]
+    else:
+        acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)
+
+    x_base_1d = x_base + token_offset * stride_x_token  # starting of chunk
+
+    # PRE-LOAD WEIGHTS
+    mask_w = idx_feats < dim
+    if KERNEL_WIDTH >= 2:
+        w_ptrs = w_base + (0 * stride_w_width)  # [BLOCK_N] tensor
+        w_col0 = tl.load(w_ptrs, mask_w, other=0.0)
+        w_ptrs = w_base + (1 * stride_w_width)  # [BLOCK_N] tensor
+        w_col1 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 3:
+        w_ptrs = w_base + (2 * stride_w_width)  # [BLOCK_N] tensor
+        w_col2 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 4:
+        w_ptrs = w_base + (3 * stride_w_width)  # [BLOCK_N] tensor
+        w_col3 = tl.load(w_ptrs, mask_w, other=0.0)
+    mask_x_1d = idx_feats < dim
+    for idx_token in range(segment_len):
+        acc = acc_preload
+
+        matrix_w = w_col0
+        matrix_x = col0
+        for j in tl.static_range(KERNEL_WIDTH):
+
+            if KERNEL_WIDTH == 2:
+                if j == 1:  # KERNEL_WIDTH-1:
+                    matrix_w = w_col1
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 3:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 4:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    matrix_x = col2
+                elif j == 3:
+                    matrix_w = w_col3
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+
+            acc += matrix_x * matrix_w  # [BLOCK_N]
+
+        if KERNEL_WIDTH == 2:
+            col0 = matrix_x
+        elif KERNEL_WIDTH == 3:
+            col0 = col1
+            col1 = matrix_x
+        elif KERNEL_WIDTH == 4:
+            col0 = col1
+            col1 = col2
+            col2 = matrix_x
+
+        if SILU_ACTIVATION:
+            acc = acc / (1 + tl.exp(-acc))
+        mask_1d = (idx_token < segment_len) & (
+            idx_feats < dim
+        )  # token-index  # feature-index
+        o_ptrs = (
+            o_ptr
+            + (sequence_start_index + token_offset + idx_token) * stride_o_token
+            + (idx_feats * stride_o_dim)
+        )
+
+        tl.store(o_ptrs, acc, mask=mask_1d)
+
+
+def causal_conv1d_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Union[torch.Tensor, None],
+    conv_states: torch.Tensor,
+    query_start_loc: torch.Tensor,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+    metadata=None,
+    validate_data=False,
+):
+    """support varlen + continuous batching when x is 2D tensor
+
+    x: (dim,cu_seq_len)
+        cu_seq_len = total tokens of all seqs in that batch
+        sequences are concatenated from left to right for varlen
+    weight: (dim, width)
+    conv_states: (...,dim,width - 1) itype
+        updated inplace if provided
+        [it use `cache_indices` to get the index to the cache of conv_state for that sequence
+
+        conv_state[cache_indices[i]] for seq-i - to be used as initial_state when has_initial_state[i] = True
+             and after that conv_state[cache_indices[i]] need to be shift-left and updated with values from 'x'
+        ]
+    query_start_loc: (batch + 1) int32
+        The cumulative sequence lengths of the sequences in
+        the batch, used to index into sequence. prepended by 0.
+        if
+        x = [5, 1, 1, 1] <- continuous batching (batch=4)
+        then
+        query_start_loc = [0, 5, 6, 7, 8] <- the starting index of the next sequence; while the last value is
+           the ending index of the last sequence
+        [length(query_start_loc)-1 == batch]
+        for example: query_start_loc = torch.Tensor([0,10,16,17]),
+        x.shape=(dim,17)
+    cache_indices: (batch)  int32
+        indicates the corresponding state index,
+        like so: conv_state = conv_states[cache_indices[batch_id]]
+    has_initial_state: (batch) bool
+        indicates whether should the kernel take the current state as initial
+        state for the calculations
+        [single boolean for each sequence in the batch: True or False]
+    bias: (dim,)
+    activation: either None or "silu" or "swish" or True
+    pad_slot_id: int
+        if cache_indices is passed, lets the kernel identify padded
+        entries that will not be processed,
+        for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+        in this case, the kernel will not process entries at
+        indices 0 and 3
+
+    out: same shape as `x`
+    """
+    if isinstance(activation, bool) and activation:
+        activation = "silu"
+
+    args = None
+    out = torch.empty_like(x)
+    if metadata is not None:
+        cu_seqlen = metadata.cu_seqlen
+        nums_dict = metadata.nums_dict
+        # x = metadata.x
+        args = nums_dict
+        batch_ptr = metadata.batch_ptr
+        token_chunk_offset_ptr = metadata.token_chunk_offset_ptr
+    else:
+        seqlens = np.diff(query_start_loc.to("cpu"))
+        args = seqlens
+        MAX_NUM_PROGRAMS = 1024
+
+        batch_ptr = torch.full(
+            (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=x.device
+        )  # tracking which seq-idx the Triton program is handling
+        token_chunk_offset_ptr = torch.full(
+            (MAX_NUM_PROGRAMS,), PAD_SLOT_ID, dtype=torch.int32, device=x.device
+        )  # tracking BLOCK_M-based index in the sequence the Triton program is handling
+
+    is_channel_last = (x.stride(0) == 1) & (x.stride(1) > 1)
+    dim, cu_seqlen = x.shape
+    _, width = weight.shape
+    state_len = width - 1
+    np2_statelen = triton.next_power_of_2(state_len)
+
+    padded_batch = query_start_loc.size(0) - 1
+    stride_x_seq = 0
+    stride_x_dim = x.stride(0)
+    stride_x_token = x.stride(1)
+    stride_w_dim = weight.stride(0)
+    stride_w_width = weight.stride(1)
+    stride_istate_seq = 0
+    stride_istate_dim = 0
+    stride_istate_token = 0
+    num_cache_lines = 0
+    if conv_states is not None:
+        # extensions to support vLLM:
+        # 1. conv_states is used to replaced initial_states
+        # 2. conv_states serve as a cache with num cache lines can be larger than batch size
+        # 3. mapping from sequence x[idx] to a cache line at index as specified via cache_indices[idx]
+        # 4. computation can be skipped if cache_indices[idx] == pad_slot_id
+        num_cache_lines = conv_states.size(0)
+        assert (
+            num_cache_lines == conv_states.shape[0]
+            and dim == conv_states.shape[1]
+            and width - 1 <= conv_states.shape[2]
+        )
+        stride_istate_seq = conv_states.stride(0)
+        stride_istate_dim = conv_states.stride(1)
+        stride_istate_token = conv_states.stride(2)
+        # assert stride_istate_dim == 1
+    if out.dim() == 2:
+        stride_o_seq = 0
+        stride_o_dim = out.stride(0)
+        stride_o_token = out.stride(1)
+    else:
+        stride_o_seq = out.stride(0)
+        stride_o_dim = out.stride(1)
+        stride_o_token = out.stride(2)
+
+    if validate_data:
+        assert x.dim() == 2
+        assert query_start_loc is not None
+        assert query_start_loc.dim() == 1
+        assert x.stride(0) == 1 or x.stride(1) == 1
+        if bias is not None:
+            assert bias.dim() == 1
+            assert dim == bias.size(0)
+        if cache_indices is not None:
+            assert cache_indices.dim() == 1
+            assert padded_batch == cache_indices.size(0)
+        if has_initial_state is not None:
+            assert has_initial_state.size() == (padded_batch,)
+            assert (
+                conv_states is not None
+            ), "ERROR: `has_initial_state` is used, which needs also `conv_states`"
+        assert weight.stride(1) == 1
+        assert (dim, width) == weight.shape
+        assert is_channel_last, "Need to run in channel-last layout"
+
+    if metadata is None:
+
+        def num_program(META, seqlens):
+            tot = 0
+
+            mlist = []
+            offsetlist = []  # type: ignore
+
+            nums = -(-seqlens // META["BLOCK_M"])
+
+            tot = nums.sum().item()
+            mlist = np.repeat(np.arange(len(nums)), nums)
+            for idx, num in enumerate(nums):
+                offsetlist.extend(
+                    range(num)
+                )  # chunk-idx if a sequence is split into multiple chunks
+
+            if META["batch_ptr"].nelement() < len(mlist):
+                newlen = len(mlist) + 1
+                META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
+                META["token_chunk_offset_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
+
+            if META["batch_ptr"].nelement() >= len(mlist):
+                META["batch_ptr"][0 : len(mlist)].copy_(
+                    torch.from_numpy(np.array(mlist))
+                )
+                META["token_chunk_offset_ptr"][0 : len(mlist)].copy_(
+                    torch.from_numpy(np.array(offsetlist))
+                )
+
+            META["batch_ptr"] = META["batch_ptr"].to(META["x_ptr"].device)
+            META["token_chunk_offset_ptr"] = META["token_chunk_offset_ptr"].to(
+                META["x_ptr"].device
+            )
+            return tot
+
+    else:
+
+        def num_program(META, nums_dict):
+            tot = nums_dict[META["BLOCK_M"]]["tot"]
+
+            mlist = nums_dict[META["BLOCK_M"]]["mlist"]
+            mlist_len = nums_dict[META["BLOCK_M"]]["mlist_len"]
+
+            offsetlist = nums_dict[META["BLOCK_M"]]["offsetlist"]
+
+            if nums_dict[META["BLOCK_M"]]["batch_ptr"] is not None:
+                META["batch_ptr"] = nums_dict[META["BLOCK_M"]]["batch_ptr"]
+                META["token_chunk_offset_ptr"] = nums_dict[META["BLOCK_M"]][
+                    "token_chunk_offset_ptr"
+                ]
+            else:
+                if META["batch_ptr"].nelement() < mlist_len:
+                    newlen = mlist_len + 1
+                    META["batch_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
+                    META["token_chunk_offset_ptr"].resize_(newlen).fill_(PAD_SLOT_ID)
+
+                if META["batch_ptr"].nelement() >= mlist_len:
+                    META["batch_ptr"][0:mlist_len].copy_(mlist)
+                    META["token_chunk_offset_ptr"][0:mlist_len].copy_(offsetlist)
+            return tot
+
+    def grid(META):
+        return (
+            num_program(META, args),
+            triton.cdiv(dim, META["BLOCK_N"]),
+        )
+
+    if batch_ptr.device != x.device:
+        batch_ptr = batch_ptr.to(x.device)
+        token_chunk_offset_ptr = token_chunk_offset_ptr.to(x.device)
+
+    _causal_conv1d_fwd_kernel[grid](
+        # Pointers to matrices
+        x,
+        weight,
+        bias,
+        conv_states,
+        cache_indices,
+        has_initial_state,
+        query_start_loc,
+        batch_ptr,
+        token_chunk_offset_ptr,
+        out,
+        # Matrix dimensions
+        padded_batch,
+        dim,
+        cu_seqlen,
+        num_cache_lines,
+        # stride
+        stride_x_seq,
+        stride_x_dim,
+        stride_x_token,
+        stride_w_dim,
+        stride_w_width,
+        stride_istate_seq,
+        stride_istate_dim,
+        stride_istate_token,
+        stride_o_seq,
+        stride_o_dim,
+        stride_o_token,
+        # others
+        pad_slot_id,
+        # META
+        HAS_BIAS=bias is not None,
+        KERNEL_WIDTH=width,
+        SILU_ACTIVATION=activation in ["silu", "swish"],
+        HAS_INITIAL_STATES=has_initial_state is not None,
+        HAS_CACHE=conv_states is not None,
+        IS_CONTINUOUS_BATCHING=cache_indices is not None,
+        USE_PAD_SLOT=pad_slot_id is not None,
+        NP2_STATELEN=np2_statelen,
+        # launch_cooperative_grid=True
+        BLOCK_M=8,
+        BLOCK_N=256,
+        num_stages=2,
+    )
+    return out
+
+
+@triton.jit()
+def _causal_conv1d_update_kernel(
+    # Pointers to matrices
+    x_ptr,  # (batch, dim, seqlen)
+    w_ptr,  # (dim, width)
+    bias_ptr,
+    conv_state_ptr,
+    cache_seqlens_ptr,  # circular buffer
+    conv_state_indices_ptr,
+    num_accepted_tokens_ptr,
+    intermediate_conv_window_ptr,
+    o_ptr,  # (batch, dim, seqlen)
+    # Matrix dimensions
+    batch: int,
+    dim: tl.constexpr,
+    seqlen: tl.constexpr,
+    state_len: tl.constexpr,
+    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
+    # Strides
+    stride_x_seq: tl.constexpr,
+    stride_x_dim: tl.constexpr,
+    stride_x_token: tl.constexpr,
+    stride_w_dim: tl.constexpr,
+    stride_w_width: tl.constexpr,
+    stride_conv_state_seq: tl.constexpr,
+    stride_conv_state_dim: tl.constexpr,
+    stride_conv_state_tok: tl.constexpr,
+    stride_state_indices: tl.constexpr,
+    stride_inter_seq: tl.constexpr,
+    stride_inter_step: tl.constexpr,
+    stride_inter_dim: tl.constexpr,
+    stride_inter_win: tl.constexpr,
+    stride_o_seq: tl.constexpr,
+    stride_o_dim: tl.constexpr,
+    stride_o_token: tl.constexpr,
+    # others
+    pad_slot_id: tl.constexpr,
+    # Meta-parameters
+    HAS_BIAS: tl.constexpr,
+    KERNEL_WIDTH: tl.constexpr,
+    SILU_ACTIVATION: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
+    NP2_STATELEN: tl.constexpr,
+    USE_PAD_SLOT: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    SAVE_INTERMEDIATE: tl.constexpr,
+):
+    # ruff: noqa: E501
+    idx_seq = tl.program_id(0)
+    if idx_seq >= batch:
+        return
+
+    # [BLOCK_N,] elements along the feature-dimension (channel)
+    idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    if IS_CONTINUOUS_BATCHING:
+        # mask = idx_seq < batch
+        conv_state_batch_coord = tl.load(
+            conv_state_indices_ptr + idx_seq * stride_state_indices
+        ).to(tl.int64)
+    else:
+        conv_state_batch_coord = idx_seq
+    if USE_PAD_SLOT:  # noqa
+        if conv_state_batch_coord == pad_slot_id:
+            # not processing as this is not the actual sequence
+            return
+
+    if IS_SPEC_DECODING:
+        # The rolling of conv state:
+        #
+        # Before forward, the conv_state is:
+        # [history1, history2, ..., historyM].
+        #
+        # After forward, the conv_state becomes:
+        # [history2, ..., historyM, draft1, draft2, ..., draftN].
+        #
+        # After acceptance, it becomes:
+        #
+        # - accept 1 tokens: [history2, ..., historyM, draft1]
+        # - accept 2 tokens: [history3, ..., historyM, draft1, draft2]
+        # - and so on.
+        conv_state_token_offset = tl.load(num_accepted_tokens_ptr + idx_seq) - 1
+    else:
+        conv_state_token_offset = 0
+
+    # STEP 1: READ init_state data
+    conv_states_base = (
+        conv_state_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )
+    mask_w = idx_feats < dim
+
+    prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok
+    if KERNEL_WIDTH >= 2:
+        conv_states_ptrs = prior_tokens  # [BLOCK_N]
+        col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH >= 3:
+        conv_states_ptrs = prior_tokens + 1 * stride_conv_state_tok  # [BLOCK_N]
+        col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH >= 4:
+        conv_states_ptrs = prior_tokens + 2 * stride_conv_state_tok  # [BLOCK_N]
+        col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH == 5:
+        conv_states_ptrs = prior_tokens + 3 * stride_conv_state_tok  # [BLOCK_N]
+        col3 = tl.load(conv_states_ptrs, mask_w, 0.0)
+
+    # STEP 2: assume state_len > seqlen
+    idx_tokens = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+    # The conv_state updates works in a sliding window manner,
+    # at each forward pass, the tokens are shift by 1, so we
+    # load since idx_tokens + 1.
+    conv_state_ptrs_source = (
+        conv_state_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + conv_state_token_offset * stride_conv_state_tok
+        + (idx_feats * stride_conv_state_dim)[None, :]
+        + ((idx_tokens + 1) * stride_conv_state_tok)[:, None]
+    )  # [BLOCK_M, BLOCK_N]
+    mask = (
+        (conv_state_batch_coord < num_cache_lines)
+        & ((idx_tokens + seqlen) < state_len)[:, None]
+        & (idx_feats < dim)[None, :]
+    )
+    conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0)
+
+    VAL = state_len - seqlen
+    x_base = x_ptr + (idx_seq * stride_x_seq) + (idx_feats * stride_x_dim)  # [BLOCK_N]
+
+    x_ptrs = (
+        x_base[None, :] + ((idx_tokens - VAL) * stride_x_token)[:, None]
+    )  # [BLOCK_M, BLOCK_N]
+
+    mask_x = (
+        (idx_tokens - VAL >= 0)[:, None]
+        & (idx_tokens - VAL < seqlen)[:, None]
+        & (idx_feats < dim)[None, :]
+    )  # token-index  # token-index  # feature-index
+    loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+    tl.debug_barrier()
+
+    new_conv_state = tl.where(mask, conv_state, loaded_x)
+
+    conv_state_base = (
+        conv_state_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )  # [BLOCK_N,]
+    conv_state_ptrs_target = (
+        conv_state_base + (idx_tokens * stride_conv_state_tok)[:, None]
+    )  # [BLOCK_M, BLOCK_N]
+    mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :]
+    tl.store(conv_state_ptrs_target, new_conv_state, mask)
+
+    # STEP 3: init accumulator
+    if HAS_BIAS:
+        bias = bias_ptr + idx_feats
+        mask_bias = idx_feats < dim
+        acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
+            tl.float32
+        )  # [BLOCK_N]
+    else:
+        acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)
+
+    # STEP 4:
+    # PRE-LOAD WEIGHTS
+    # first kernel column, configured for weights to handle BLOCK_N features in range
+    w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]
+    mask_w = idx_feats < dim
+    if KERNEL_WIDTH >= 2:
+        w_ptrs = w_base + (0 * stride_w_width)  # [BLOCK_N] tensor
+        w_col0 = tl.load(w_ptrs, mask_w, other=0.0)
+        w_ptrs = w_base + (1 * stride_w_width)  # [BLOCK_N] tensor
+        w_col1 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 3:
+        w_ptrs = w_base + (2 * stride_w_width)  # [BLOCK_N] tensor
+        w_col2 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 4:
+        w_ptrs = w_base + (3 * stride_w_width)  # [BLOCK_N] tensor
+        w_col3 = tl.load(w_ptrs, mask_w, other=0.0)
+
+    x_base_1d = x_base  # starting of chunk [BLOCK_N]
+    mask_x_1d = idx_feats < dim
+
+    # STEP 5: compute each token
+    for idx_token in tl.static_range(seqlen):
+        acc = acc_preload
+
+        matrix_w = w_col0
+        matrix_x = col0
+        for j in tl.static_range(KERNEL_WIDTH):
+            if KERNEL_WIDTH == 2:
+                if j == 1:  # KERNEL_WIDTH-1:
+                    matrix_w = w_col1
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 3:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 4:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    matrix_x = col2
+                elif j == 3:
+                    matrix_w = w_col3
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+
+            acc += matrix_x * matrix_w  # [BLOCK_N]
+
+        if KERNEL_WIDTH == 2:
+            col0 = matrix_x
+        elif KERNEL_WIDTH == 3:
+            col0 = col1
+            col1 = matrix_x
+        elif KERNEL_WIDTH == 4:
+            col0 = col1
+            col1 = col2
+            col2 = matrix_x
+
+        if SILU_ACTIVATION:
+            acc = acc / (1 + tl.exp(-acc))
+        mask_1d = (idx_token < seqlen) & (
+            idx_feats < dim
+        )  # token-index  # feature-index
+        o_ptrs = (
+            o_ptr
+            + (idx_seq) * stride_o_seq
+            + idx_token * stride_o_token
+            + (idx_feats * stride_o_dim)
+        )
+
+        tl.store(o_ptrs, acc, mask=mask_1d)
+
+        if SAVE_INTERMEDIATE:
+            # Save the window state after consuming this token
+            # Layout: [seq(cache line), step, dim, win(K-1)]
+            base_ptr = (
+                intermediate_conv_window_ptr
+                + conv_state_batch_coord * stride_inter_seq
+                + idx_token * stride_inter_step
+                + idx_feats * stride_inter_dim
+            )
+            if KERNEL_WIDTH >= 2:
+                tl.store(base_ptr + 0 * stride_inter_win, col0, mask=mask_w)
+            if KERNEL_WIDTH >= 3:
+                tl.store(base_ptr + 1 * stride_inter_win, col1, mask=mask_w)
+            if KERNEL_WIDTH >= 4:
+                tl.store(base_ptr + 2 * stride_inter_win, col2, mask=mask_w)
+
+
+def causal_conv1d_update(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    activation: Union[bool, str, None] = None,
+    cache_seqlens: Optional[torch.Tensor] = None,
+    conv_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
+    intermediate_conv_window: Optional[torch.Tensor] = None,
+    pad_slot_id: int = PAD_SLOT_ID,
+    metadata=None,
+    validate_data=False,
+):
+    """
+    x: (batch, dim) or (batch, dim, seqlen)
+        [shape=2: single token prediction]
+        [shape=3: single or multiple tokens prediction]
+    conv_state: (..., dim, state_len), where state_len >= width - 1
+    weight: (dim, width)
+    bias: (dim,)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state
+        starting at the index
+        @cache_seqlens % state_len.
+    conv_state_indices: (batch,), dtype int32
+        If not None, the conv_state is a larger tensor along the batch dim,
+        and we are selecting the batch coords specified by conv_state_indices.
+        Useful for a continuous batching scenario.
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    if validate_data:
+        assert cache_seqlens is None  # not implemented yet - ok for vLLM
+        assert pad_slot_id is not None
+        assert x.stride(1) == 1
+    if isinstance(activation, bool):
+        activation = "silu" if activation is True else None
+    elif activation is not None:
+        assert activation in ["silu", "swish"]
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        # make it (batch, dim, seqlen) with seqlen == 1
+        x = x.unsqueeze(-1)
+    batch, dim, seqlen = x.shape
+    _, width = weight.shape
+    # conv_state: (..., dim, state_len), where state_len >= width - 1
+    num_cache_lines, _, state_len = conv_state.size()
+
+    if validate_data:
+        assert dim == weight.size(0)
+        assert (
+            conv_state.stride(-2) == 1
+        ), f"ERROR: expect contiguous along feat-dim of conv_state (currently stride={conv_state.stride()})"
+        assert state_len >= width - 1
+        # when above happens, we don't shift-left to keep any records in conv_state
+        assert dim == conv_state.size(1)
+        if conv_state_indices is None:
+            assert conv_state.size(0) >= batch
+        else:
+            assert (batch,) == conv_state_indices.shape
+
+        assert num_cache_lines >= batch
+        assert weight.stride(1) == 1  # Need this
+        assert cache_seqlens is None  # not needed for vLLM - circular buffer
+
+    # adopt the strategy in vLLM that overwrite on 'x' directly, rather than creating a new tensor 'o'
+    out = x
+    stride_w_dim, stride_w_width = weight.stride()
+
+    stride_x_seq, stride_x_dim, stride_x_token = x.stride()  # X (batch, dim, seqlen)
+
+    stride_o_seq, stride_o_dim, stride_o_token = out.stride()
+    stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride()
+    stride_state_indices = (
+        conv_state_indices.stride(0) if conv_state_indices is not None else 0
+    )
+    state_len = width - 1 + (seqlen - 1)  # effective state_len needed
+    np2_statelen = triton.next_power_of_2(state_len)
+
+    def grid(META):
+        return (
+            batch,
+            triton.cdiv(dim, META["BLOCK_N"]),
+        )
+
+    # prepare intermediate buffer strides if provided
+    if intermediate_conv_window is not None:
+        stride_inter_seq, stride_inter_step, stride_inter_dim, stride_inter_win = (
+            intermediate_conv_window.stride(0),
+            intermediate_conv_window.stride(1),
+            intermediate_conv_window.stride(2),
+            intermediate_conv_window.stride(3),
+        )
+    else:
+        stride_inter_seq = stride_inter_step = stride_inter_dim = stride_inter_win = 0
+
+    _causal_conv1d_update_kernel[grid](
+        # Pointers to matrices
+        x,
+        weight,
+        bias,
+        conv_state,
+        cache_seqlens,
+        conv_state_indices,
+        num_accepted_tokens,
+        intermediate_conv_window if intermediate_conv_window is not None else x,
+        out,
+        # Matrix dimensions
+        batch,
+        dim,
+        seqlen,
+        state_len,
+        num_cache_lines,
+        # stride
+        stride_x_seq,
+        stride_x_dim,
+        stride_x_token,
+        stride_w_dim,
+        stride_w_width,
+        stride_istate_seq,
+        stride_istate_dim,
+        stride_istate_token,
+        stride_state_indices,
+        stride_inter_seq,
+        stride_inter_step,
+        stride_inter_dim,
+        stride_inter_win,
+        stride_o_seq,
+        stride_o_dim,
+        stride_o_token,
+        # others
+        pad_slot_id,
+        # META
+        HAS_BIAS=bias is not None,
+        KERNEL_WIDTH=width,
+        SILU_ACTIVATION=activation in ["silu", "swish"],
+        IS_CONTINUOUS_BATCHING=conv_state_indices is not None,
+        IS_SPEC_DECODING=num_accepted_tokens is not None,
+        NP2_STATELEN=np2_statelen,
+        USE_PAD_SLOT=pad_slot_id is not None,
+        BLOCK_N=256,
+        SAVE_INTERMEDIATE=intermediate_conv_window is not None,
+    )
+    if unsqueeze:
+        out = out.squeeze(-1)
+    return out
diff --git a/python/sglang/srt/layers/attention/mamba/mamba.py b/python/sglang/srt/layers/attention/mamba/mamba.py
new file mode 100644
index 000000000..045a04048
--- /dev/null
+++ b/python/sglang/srt/layers/attention/mamba/mamba.py
@@ -0,0 +1,64 @@
+from typing import Callable, List, Tuple
+
+import torch
+
+LoaderFunction = Callable[[torch.Tensor, torch.Tensor], None]
+
+
+def mamba_v2_sharded_weight_loader(
+    shard_spec: List[Tuple[int, int, float]],
+    tp_size: int,
+    tp_rank: int,
+) -> LoaderFunction:
+    """Create a weight loader for mamba v2. This ensures that the projections
+    are correctly sharded so that they can be split into x, B, C. It also
+    ensures the the all the groups corresponding to a head shard is placed
+    together with it.
+    """
+
+    def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+
+        # - track boundary of (sharded) param, and loaded_weight, respectively
+        boundary, loaded_boundary = 0, 0
+
+        # - iterate over the shard specs
+        for full_dim, extra, duplicate_groups in shard_spec:
+            # - full dim is the model dim (before TP).
+            # - extra > 0, means there is expected overall increase
+            #   of dimensions. This is so because of replication.
+            # - ratio is used map the tp_rank to the actual shard
+            #   rank. This is useful when there is replication of
+            #   groups to accompany head shards.
+
+            # - size of the loaded shard
+            shard_size = full_dim // tp_size
+
+            # - compute the rank into the loaded shard.
+            # - if there is replication, different TP shards will
+            #   take from the same rank.
+            # NOTE: currently we only support duplication
+            # in the case where num_groups == 1
+            rank = 0 if duplicate_groups else tp_rank
+
+            # - leftmost boundary index into loaded weight.
+            loaded_skip = rank * shard_size
+            loaded_start_idx = loaded_boundary + loaded_skip
+
+            # - take these many dims from the loaded weight.
+            take = min(shard_size, full_dim - extra - loaded_skip)
+
+            # - always shard on dim 0
+            # - the ignore is for a mundane mypy error as it does not
+            #   seem to handle slices well.
+            # https://github.com/python/mypy/issues/2410
+            param.data[
+                boundary : (boundary + take), ...  # type: ignore[misc]
+            ] = loaded_weight[
+                loaded_start_idx : (loaded_start_idx + take)  # type: ignore[misc]
+            ]  # type: ignore[misc]
+
+            # move indexing boundaries
+            boundary += shard_size
+            loaded_boundary += full_dim - extra
+
+    return loader
diff --git a/python/sglang/srt/layers/attention/merge_state.py b/python/sglang/srt/layers/attention/merge_state.py
new file mode 100644
index 000000000..245418a9d
--- /dev/null
+++ b/python/sglang/srt/layers/attention/merge_state.py
@@ -0,0 +1,46 @@
+from typing import Optional, Tuple
+
+import torch
+from sgl_kernel import merge_state_v2
+
+from sglang.srt.layers.attention.triton_ops.merge_state import merge_state_triton
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+
+
+# Automatically fallback to the Triton kernel in some cases
+# (e.g., for AMD GPUs, when the head dimension is not a multiple
+# of 4 or 8, and in FP8 precision)
+def _supported_dtypes(o: torch.Tensor) -> bool:
+    return o.dtype in [torch.float32, torch.half, torch.bfloat16]
+
+
+def _supported_headdim(o: torch.Tensor) -> bool:
+    headdim = o.shape[2]  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    if o.dtype == torch.float32:
+        return headdim % 4 == 0
+    return headdim % 8 == 0
+
+
+def merge_state(
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output: Optional[torch.Tensor] = None,
+    output_lse: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    if (
+        _is_cuda
+        and _supported_dtypes(prefix_output)
+        and _supported_headdim(prefix_output)
+    ):
+        return merge_state_v2(
+            prefix_output, prefix_lse, suffix_output, suffix_lse, output, output_lse
+        )
+    else:
+        # Fallback to Triton kernel
+        return merge_state_triton(
+            prefix_output, prefix_lse, suffix_output, suffix_lse, output, output_lse
+        )
diff --git a/python/sglang/srt/layers/attention/tbo_backend.py b/python/sglang/srt/layers/attention/tbo_backend.py
new file mode 100644
index 000000000..06cfbd4ef
--- /dev/null
+++ b/python/sglang/srt/layers/attention/tbo_backend.py
@@ -0,0 +1,260 @@
+from typing import TYPE_CHECKING, Callable, List, Optional, Union
+
+import torch
+
+from sglang.srt import two_batch_overlap
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+
+class TboAttnBackend(AttentionBackend):
+    def __init__(self, primary: AttentionBackend, children: List[AttentionBackend]):
+        super().__init__()
+        self.primary = primary
+        self.children = children
+
+    @classmethod
+    def init_new(cls, creator: Callable[[], AttentionBackend]):
+        return cls(
+            primary=creator(),
+            children=[creator() for _ in range(2)],
+        )
+
+    def init_forward_metadata(self, forward_batch: "ForwardBatch"):
+        self.primary.init_forward_metadata(forward_batch=forward_batch)
+        if forward_batch.tbo_children is not None:
+            for child, forward_batch_child in zip(
+                self.children, forward_batch.tbo_children, strict=True
+            ):
+                if forward_batch_child.batch_size > 0:
+                    child.init_forward_metadata(forward_batch=forward_batch_child)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.primary.init_cuda_graph_state(max_bs=max_bs, max_num_tokens=max_num_tokens)
+        for item in self.children:
+            # TODO for children, maybe can provide *smaller* max_bs to optimize
+            item.init_cuda_graph_state(max_bs=max_bs, max_num_tokens=max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        self.primary.init_forward_metadata_capture_cuda_graph(
+            bs=bs,
+            num_tokens=num_tokens,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+        )
+
+        self._init_forward_metadata_cuda_graph_children(
+            fn_name="init_forward_metadata_capture_cuda_graph",
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            capture_num_tokens=num_tokens,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        self.primary.init_forward_metadata_replay_cuda_graph(
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            seq_lens_sum=seq_lens_sum,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            seq_lens_cpu=seq_lens_cpu,
+        )
+
+        self._init_forward_metadata_cuda_graph_children(
+            fn_name="init_forward_metadata_replay_cuda_graph",
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            replay_seq_lens_sum=seq_lens_sum,
+            replay_seq_lens_cpu=seq_lens_cpu,
+        )
+
+    def _init_forward_metadata_cuda_graph_children(
+        self,
+        fn_name: str,
+        # common args
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        # capture args
+        capture_num_tokens: int = None,
+        # replay args
+        replay_seq_lens_sum: int = None,
+        replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+    ):
+        token_num_per_seq = two_batch_overlap.get_token_num_per_seq(
+            forward_mode=forward_mode, spec_info=spec_info
+        )
+        if fn_name == "init_forward_metadata_capture_cuda_graph":
+            assert (
+                capture_num_tokens == bs * token_num_per_seq
+            ), "For target-verify or decode mode, num_tokens should be equal to token_num_per_seq * bs"
+        num_tokens = bs * token_num_per_seq
+
+        tbo_split_seq_index, tbo_split_token_index = (
+            two_batch_overlap.compute_split_indices_for_cuda_graph_replay(
+                forward_mode=forward_mode,
+                cuda_graph_num_tokens=num_tokens,
+                spec_info=spec_info,
+            )
+        )
+
+        num_tokens_child_left = tbo_split_token_index
+        num_tokens_child_right = num_tokens - tbo_split_token_index
+        bs_child_left = tbo_split_seq_index
+        bs_child_right = bs - bs_child_left
+
+        assert (
+            num_tokens_child_left > 0 and num_tokens_child_right > 0
+        ), f"{num_tokens_child_left=} {num_tokens_child_right=} {forward_mode=} {num_tokens=}"
+
+        common_pre_split_args = dict(
+            fn_name=fn_name,
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            capture_num_tokens=capture_num_tokens,
+            replay_seq_lens_sum=replay_seq_lens_sum,
+            replay_seq_lens_cpu=replay_seq_lens_cpu,
+        )
+
+        args_left = _init_forward_metadata_cuda_graph_split(
+            output_bs=bs_child_left,
+            seq_slice=slice(None, tbo_split_seq_index),
+            **common_pre_split_args,
+        )
+        args_right = _init_forward_metadata_cuda_graph_split(
+            output_bs=bs_child_right,
+            seq_slice=slice(tbo_split_seq_index, None),
+            **common_pre_split_args,
+        )
+
+        child_left, child_right = self.children
+        getattr(child_left, fn_name)(**args_left)
+        getattr(child_right, fn_name)(**args_right)
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        ans = self.primary.get_cuda_graph_seq_len_fill_value()
+        for child in self.children:
+            assert ans == child.get_cuda_graph_seq_len_fill_value()
+        return ans
+
+    def forward_extend(self, *args, **kwargs):
+        return self.primary.forward_extend(*args, **kwargs)
+
+    def forward_decode(self, *args, **kwargs):
+        return self.primary.forward_decode(*args, **kwargs)
+
+
+def _init_forward_metadata_cuda_graph_split(
+    fn_name: str,
+    seq_slice: slice,
+    output_bs: int,
+    # common args
+    bs: int,
+    req_pool_indices: torch.Tensor,
+    seq_lens: torch.Tensor,
+    encoder_lens: Optional[torch.Tensor],
+    forward_mode: "ForwardMode",
+    spec_info: Optional[EagleVerifyInput],
+    # capture args
+    capture_num_tokens: int = None,
+    # replay args
+    replay_seq_lens_sum: int = None,
+    replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+):
+    token_num_per_seq = two_batch_overlap.get_token_num_per_seq(
+        forward_mode=forward_mode, spec_info=spec_info
+    )
+    assert encoder_lens is None, "encoder_lens is not supported yet"
+    if spec_info is not None:
+        output_spec_info = two_batch_overlap.split_spec_info(
+            spec_info=spec_info,
+            start_seq_index=seq_slice.start if seq_slice.start is not None else 0,
+            end_seq_index=seq_slice.stop if seq_slice.stop is not None else bs,
+            start_token_index=(
+                seq_slice.start * token_num_per_seq
+                if seq_slice.start is not None
+                else 0
+            ),
+            end_token_index=(
+                seq_slice.stop * token_num_per_seq
+                if seq_slice.stop is not None
+                else bs * token_num_per_seq
+            ),
+        )
+
+    else:
+        output_spec_info = None
+    ans = dict(
+        bs=output_bs,
+        req_pool_indices=req_pool_indices[seq_slice],
+        seq_lens=seq_lens[seq_slice],
+        # directly forward
+        forward_mode=forward_mode,
+        # ignore
+        encoder_lens=None,
+        spec_info=output_spec_info,
+    )
+
+    if fn_name == "init_forward_metadata_capture_cuda_graph":
+        assert (
+            capture_num_tokens == bs * token_num_per_seq
+        ), "Only support num_tokens==bs * token_num_per_seq for target-verify or decode mode"
+        ans.update(
+            dict(
+                num_tokens=output_bs * token_num_per_seq,
+            )
+        )
+    elif fn_name == "init_forward_metadata_replay_cuda_graph":
+        output_seq_lens_cpu = replay_seq_lens_cpu[seq_slice]
+        ans.update(
+            dict(
+                seq_lens_sum=output_seq_lens_cpu.sum().item(),
+                seq_lens_cpu=output_seq_lens_cpu,
+            )
+        )
+    else:
+        raise NotImplementedError
+
+    return ans
diff --git a/python/sglang/srt/layers/attention/torch_native_backend.py b/python/sglang/srt/layers/attention/torch_native_backend.py
new file mode 100644
index 000000000..6a67ea947
--- /dev/null
+++ b/python/sglang/srt/layers/attention/torch_native_backend.py
@@ -0,0 +1,276 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+from torch.nn.functional import scaled_dot_product_attention
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class TorchNativeAttnBackend(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__()
+        self.forward_metadata = None
+        self.device = model_runner.device
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+        pass
+
+    def _run_sdpa_forward_extend(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        extend_prefix_lens: torch.Tensor,
+        extend_seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        """Run the extend forward by using torch native sdpa op.
+
+        Args:
+            query: [num_tokens, num_heads, head_size]
+            output: [num_tokens, num_heads, head_size]
+            k_cache: [max_total_num_tokens, num_heads, head_size]
+            v_cache: [max_total_num_tokens, num_heads, head_size]
+            req_to_token: [max_num_reqs, max_context_len]
+            req_pool_indices: [num_seqs]
+            seq_lens: [num_seqs]
+            extend_prefix_lens: [num_seqs]
+            extend_seq_lens: [num_seqs]
+            scaling: float or None
+            enable_gqa: bool
+            causal: bool
+
+        Returns:
+            output: [num_tokens, num_heads, head_size]
+        """
+
+        assert seq_lens.shape[0] == extend_prefix_lens.shape[0]
+        assert seq_lens.shape[0] == extend_seq_lens.shape[0]
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+            # TODO: this loop process a sequence per iter, this is inefficient.
+            # Need optimize the performance later.
+
+            extend_seq_len_q = extend_seq_lens[seq_idx]
+            prefill_seq_len_q = extend_prefix_lens[seq_idx]
+
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + extend_seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+            per_req_query_redudant = torch.empty(
+                (per_req_query.shape[0], seq_len_kv, per_req_query.shape[2]),
+                dtype=per_req_query.dtype,
+                device=per_req_query.device,
+            )
+
+            per_req_query_redudant[:, prefill_seq_len_q:, :] = per_req_query
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            per_req_out_redudant = (
+                scaled_dot_product_attention(
+                    per_req_query_redudant.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    enable_gqa=enable_gqa,
+                    scale=scaling,
+                    is_causal=causal,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out_redudant[prefill_seq_len_q:, :, :]
+            start_q, start_kv = end_q, end_kv
+        return output
+
+    def _run_sdpa_forward_decode(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        """Run the decode forward by using torch native sdpa op.
+
+        Args:
+            query: [num_tokens, num_heads, head_size]
+            output: [num_tokens, num_heads, head_size]
+            k_cache: [max_total_num_tokens, num_heads, head_size]
+            v_cache: [max_total_num_tokens, num_heads, head_size]
+            req_to_token: [max_num_reqs, max_context_len]
+            req_pool_indices: [num_seqs]
+            seq_lens: [num_seqs]
+            scaling: float or None
+            enable_gqa: bool
+            causal: bool
+
+        Returns:
+            output: [num_tokens, num_heads, head_size]
+        """
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+            # TODO: this loop process a sequence per iter, this is inefficient.
+            # Need optimize the performance later.
+
+            seq_len_q = 1
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            per_req_out = (
+                scaled_dot_product_attention(
+                    per_req_query.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    enable_gqa=enable_gqa,
+                    scale=scaling,
+                    is_causal=causal,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out
+            start_q, start_kv = end_q, end_kv
+
+        return output
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if layer.is_cross_attention:
+            cache_loc = forward_batch.encoder_out_cache_loc
+        else:
+            cache_loc = forward_batch.out_cache_loc
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+
+        use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+
+        q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+        o_ = o.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+        causal = True
+        if layer.is_cross_attention or layer.attn_type == AttentionType.ENCODER_ONLY:
+            causal = False
+
+        self._run_sdpa_forward_extend(
+            q_,
+            o_,
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            forward_batch.extend_prefix_lens,
+            forward_batch.extend_seq_lens,
+            scaling=layer.scaling,
+            enable_gqa=use_gqa,
+            causal=causal,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if layer.is_cross_attention:
+            cache_loc = forward_batch.encoder_out_cache_loc
+        else:
+            cache_loc = forward_batch.out_cache_loc
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+
+        use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+
+        q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+        o_ = o.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+        self._run_sdpa_forward_decode(
+            q_,
+            o_,
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            scaling=layer.scaling,
+            enable_gqa=use_gqa,
+            causal=False,
+        )
+
+        return o
+
+    def support_triton(self):
+        return False
diff --git a/python/sglang/srt/layers/attention/triton_backend.py b/python/sglang/srt/layers/attention/triton_backend.py
new file mode 100644
index 000000000..26241d849
--- /dev/null
+++ b/python/sglang/srt/layers/attention/triton_backend.py
@@ -0,0 +1,1090 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import get_bool_env_var, get_device_core_count, next_power_of_2
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+
+def logit_capping_mod(logit_capping_method, logit_cap):
+    # positive logit_cap -> tanh cap
+    if logit_capping_method == "tanh":
+        return logit_cap
+    else:
+        raise ValueError()
+
+
+@dataclass
+class ForwardMetadata:
+    attn_logits: torch.Tensor
+    attn_lse: torch.Tensor
+    max_extend_len: int
+    num_kv_splits: torch.Tensor
+    kv_indptr: torch.Tensor
+    kv_indices: torch.Tensor
+    qo_indptr: torch.Tensor
+    custom_mask: torch.Tensor
+    mask_indptr: torch.Tensor
+    # Sliding window
+    window_kv_indptr: torch.Tensor
+    window_kv_indices: torch.Tensor
+    window_num_kv_splits: torch.Tensor
+    window_kv_offsets: torch.Tensor
+
+
+class TritonAttnBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+    ):
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.triton_ops.decode_attention import (
+            decode_attention_fwd,
+        )
+        from sglang.srt.layers.attention.triton_ops.extend_attention import (
+            extend_attention_fwd,
+        )
+
+        super().__init__()
+
+        self.decode_attention_fwd = torch.compiler.disable(decode_attention_fwd)
+        self.extend_attention_fwd = torch.compiler.disable(extend_attention_fwd)
+
+        # Parse args
+        self.skip_prefill = skip_prefill
+        max_bs = model_runner.req_to_token_pool.size
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.device_core_count = get_device_core_count(model_runner.gpu_id)
+        self.static_kv_splits = get_bool_env_var(
+            "SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS", "false"
+        )
+        self.max_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
+
+        # Check arguments
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        # Initialize buffers
+        # TODO(Jianan Ji): Make sure it behaves as expected when kv_indptr_buf is provided and sliding window is enabled
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        # If sliding window is enabled, we might need two sets of buffers
+        # because of interleaved attention types (e.g. for Gemma3)
+        self.window_kv_indptr = None
+        if self.sliding_window_size is not None and self.sliding_window_size > 0:
+            if kv_indptr_buf is None:
+                self.window_kv_indptr = torch.zeros(
+                    (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                )
+            else:
+                # When provided a buffer, create a clone for the second buffer
+                self.window_kv_indptr = torch.zeros_like(kv_indptr_buf)
+
+        if not self.skip_prefill:
+            self.qo_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+            self.mask_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int64, device=model_runner.device
+            )
+
+        # Initialize forward metadata
+        self.forward_metadata: ForwardMetadata = None
+
+    def get_num_kv_splits(
+        self,
+        num_kv_splits: torch.Tensor,
+        seq_lens: torch.Tensor,
+    ):
+        num_token, num_seq = num_kv_splits.shape[0], seq_lens.shape[0]
+        # NOTE(alcanderian): Considering speculative_decodeing,
+        # num_kv_splits.shape[0] will be topk * real_num_token.
+        # And the real_num_token is num_seq in decoding phase.
+        num_group = num_token // num_seq
+
+        assert (
+            num_group * num_seq == num_token
+        ), f"num_seq({num_seq}), num_token({num_token}), something goes wrong!"
+
+        if self.static_kv_splits or self.device_core_count <= 0:
+            num_kv_splits.fill_(self.max_kv_splits)
+            return
+
+        if num_seq < 256:
+            SCHEDULE_SEQ = 256
+        else:
+            SCHEDULE_SEQ = triton.next_power_of_2(num_seq)
+
+        get_num_kv_splits_triton[(1,)](
+            num_kv_splits,
+            seq_lens,
+            num_seq,
+            num_group,
+            self.num_head,
+            self.num_kv_head,
+            self.max_kv_splits,
+            self.device_core_count,
+            MAX_NUM_SEQ=SCHEDULE_SEQ,
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for triton attention backend."""
+
+        bs = forward_batch.batch_size
+        kv_indptr = self.kv_indptr
+        window_kv_indptr = self.window_kv_indptr
+        window_kv_indices = None
+        window_num_kv_splits = None
+        window_kv_offsets = None
+        spec_info = forward_batch.spec_info
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    forward_batch.seq_lens_sum, dtype=torch.int64, device=self.device
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                # Sliding window
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_kv_indptr, window_kv_indices, window_kv_lens, _ = (
+                        update_sliding_window_buffer(
+                            self.window_kv_indptr,
+                            self.req_to_token,
+                            self.sliding_window_size,
+                            forward_batch.seq_lens,
+                            forward_batch.req_pool_indices,
+                            bs,
+                            self.device,
+                            self.token_to_kv_pool_allocator,
+                        )
+                    )
+                    window_num_kv_splits = torch.empty(
+                        (bs,), dtype=torch.int32, device=self.device
+                    )
+                    self.get_num_kv_splits(window_num_kv_splits, window_kv_lens)
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+                bs = kv_indptr.shape[0] - 1
+
+            attn_logits = torch.empty(
+                (bs, self.num_head, self.max_kv_splits, self.v_head_dim),
+                dtype=torch.float32,
+                device=self.device,
+            )
+            attn_lse = torch.empty(
+                (bs, self.num_head, self.max_kv_splits),
+                dtype=torch.float32,
+                device=self.device,
+            )
+            num_kv_splits = torch.empty((bs,), dtype=torch.int32, device=self.device)
+            self.get_num_kv_splits(num_kv_splits, forward_batch.seq_lens)
+
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+            max_extend_len = None
+        elif forward_batch.forward_mode.is_target_verify():
+            bs = len(forward_batch.req_pool_indices)
+            qo_indptr = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            # Different with flashinfer kv_indptr and kv_indices construction
+            kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                kv_indptr[-1], dtype=torch.int64, device=self.device
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                # window_kv_offsets is used to calculate the start position in custom mask
+                (
+                    window_kv_indptr,
+                    window_kv_indices,
+                    window_kv_lens,
+                    window_kv_offsets,
+                ) = update_sliding_window_buffer(
+                    self.window_kv_indptr,
+                    self.req_to_token,
+                    self.sliding_window_size,
+                    forward_batch.seq_lens,
+                    forward_batch.req_pool_indices,
+                    bs,
+                    self.device,
+                    self.token_to_kv_pool_allocator,
+                )
+
+            custom_mask = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (
+                forward_batch.seq_lens + self.num_draft_tokens
+            )
+            mask_indptr = self.mask_indptr
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len[:bs], dim=0)
+            mask_indptr = mask_indptr[: bs + 1]
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+
+        elif forward_batch.forward_mode.is_draft_extend():
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    None,
+                    self.req_to_token,
+                )
+            )
+            kv_indices = kv_indices.to(torch.int64)
+            mask_indptr = None
+            # TODO(FIXME): This will trigger an invalid Eagle tree when using
+            # `max(spec_info.accept_length_cpu)`.
+            # It might have been forgotten to update somewhere.
+            max_extend_len = torch.max(spec_info.accept_length).item()
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            kv_indptr[1 : bs + 1] = torch.cumsum(
+                forward_batch.extend_prefix_lens, dim=0
+            )
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                forward_batch.extend_prefix_lens.sum().item(),
+                dtype=torch.int64,
+                device=self.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.extend_prefix_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            # Sliding window
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                window_kv_indptr, window_kv_indices, _, _ = (
+                    update_sliding_window_buffer(
+                        self.window_kv_indptr,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        forward_batch.extend_prefix_lens,
+                        forward_batch.req_pool_indices,
+                        bs,
+                        self.device,
+                        self.token_to_kv_pool_allocator,
+                    )
+                )
+
+            qo_indptr = self.qo_indptr
+            qo_indptr[1 : bs + 1] = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+            mask_indptr = None
+            attn_logits = None
+            attn_lse = None
+            max_extend_len = max(forward_batch.extend_seq_lens_cpu)
+            num_kv_splits = None
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+            window_kv_indptr,
+            window_kv_indices,
+            window_num_kv_splits,
+            window_kv_offsets,
+        )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        self.cuda_graph_attn_logits = torch.zeros(
+            (max_num_tokens, self.num_head, self.max_kv_splits, self.v_head_dim),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        self.cuda_graph_attn_lse = torch.zeros(
+            (max_num_tokens, self.num_head, self.max_kv_splits),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        self.cuda_graph_num_kv_splits = torch.full(
+            (max_num_tokens,), self.max_kv_splits, dtype=torch.int32, device=self.device
+        )
+        if kv_indices_buf is None:
+            self.cuda_graph_kv_indices = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.int64,
+                device=self.device,
+            )
+        else:
+            self.cuda_graph_kv_indices = kv_indices_buf
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+        if self.sliding_window_size is not None and self.sliding_window_size > 0:
+            if kv_indices_buf is None:
+                self.cuda_graph_window_kv_indices = torch.zeros(
+                    (max_num_tokens * self.sliding_window_size),
+                    dtype=torch.int64,
+                    device=self.device,
+                )
+            else:
+                self.cuda_graph_window_kv_indices = torch.zeros_like(kv_indices_buf)
+
+            self.cuda_graph_window_num_kv_splits = torch.full(
+                (max_num_tokens,),
+                self.max_kv_splits,
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+            self.cuda_graph_window_kv_offsets = torch.zeros(
+                (max_bs,),
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        assert encoder_lens is None, "Not supported"
+        window_kv_indptr = self.window_kv_indptr
+        window_kv_indices = None
+        window_num_kv_splits = None
+        window_kv_offsets = None
+
+        if forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_kv_indices = self.cuda_graph_window_kv_indices
+                    window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                    window_kv_indptr, window_kv_indices, _, _ = (
+                        update_sliding_window_buffer_cuda_graph(
+                            self.window_kv_indptr,
+                            window_kv_indices,
+                            self.req_to_token,
+                            self.sliding_window_size,
+                            seq_lens[:bs],
+                            req_pool_indices,
+                            bs,
+                            self.token_to_kv_pool_allocator,
+                        )
+                    )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+            attn_logits = self.cuda_graph_attn_logits
+            attn_lse = self.cuda_graph_attn_lse
+            max_extend_len = None
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+        elif forward_mode.is_target_verify():
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                window_kv_indices = self.cuda_graph_window_kv_indices
+                window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                window_kv_offsets = self.cuda_graph_window_kv_offsets
+                window_kv_indptr, window_kv_indices, _, window_kv_offsets[:bs] = (
+                    update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices,
+                        bs,
+                        self.token_to_kv_pool_allocator,
+                    )
+                )
+
+            custom_mask = self.cuda_graph_custom_mask
+            custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        elif forward_mode.is_draft_extend():
+            num_tokens_per_bs = self.speculative_num_steps + 1
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                step=num_tokens_per_bs,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            custom_mask = None
+            mask_indptr = None
+            max_extend_len = num_tokens_per_bs
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph capture."
+            )
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+            window_kv_indptr,
+            window_kv_indices,
+            window_num_kv_splits,
+            window_kv_offsets,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        # NOTE: encoder_lens expected to be zeros or None
+        if forward_mode.is_decode_or_idle():
+            # Update kv_indptr, kv_indices
+            kv_indptr = self.kv_indptr
+            kv_indices = self.cuda_graph_kv_indices
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens[:bs], dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices[:bs],
+                    seq_lens[:bs],
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                num_token = bs
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                    window_kv_indices = self.cuda_graph_window_kv_indices
+                    _, _, window_kv_lens, _ = update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices[:bs],
+                        bs,
+                        self.token_to_kv_pool_allocator,
+                    )
+                    self.get_num_kv_splits(
+                        window_num_kv_splits[:num_token], window_kv_lens[:bs]
+                    )
+
+            else:
+                kv_indptr[: spec_info.kv_indptr.shape[0]] = spec_info.kv_indptr
+                kv_indices[: spec_info.kv_indices.shape[0]] = spec_info.kv_indices
+                num_token = spec_info.kv_indptr.shape[0] - 1
+            self.get_num_kv_splits(num_kv_splits[:num_token], seq_lens[:bs])
+
+        elif forward_mode.is_target_verify():
+            # Update qo_indptr, kv_indptr, kv_indices, custom_mask, mask_indptr
+            bs = len(req_pool_indices)
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                window_kv_indices = self.cuda_graph_window_kv_indices
+                window_kv_offsets = self.cuda_graph_window_kv_offsets
+                _, _, window_kv_lens, window_kv_offsets[:bs] = (
+                    update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices,
+                        bs,
+                        self.token_to_kv_pool_allocator,
+                    )
+                )
+            custom_mask = self.cuda_graph_custom_mask
+            custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+        elif forward_mode.is_draft_extend():
+            seq_lens = seq_lens[:bs]
+            accept_lens = spec_info.accept_length[:bs]
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[1 : bs + 1] = torch.cumsum(accept_lens, dim=0)
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph replay."
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        sinks=None,
+    ):
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        logits_soft_cap = logit_capping_mod(layer.logit_capping_method, layer.logit_cap)
+
+        causal = True
+        if layer.attn_type == AttentionType.ENCODER_ONLY:
+            causal = False
+
+        if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+            sliding_window_size = (
+                layer.sliding_window_size
+            )  # Needed for sliding window mask
+            kv_indptr = self.forward_metadata.window_kv_indptr
+            kv_indices = self.forward_metadata.window_kv_indices
+            window_kv_offsets = self.forward_metadata.window_kv_offsets
+        else:
+            sliding_window_size = -1
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+            window_kv_offsets = None
+
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k.contiguous(),
+            v.contiguous(),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            self.forward_metadata.qo_indptr,
+            kv_indptr,
+            kv_indices,
+            self.forward_metadata.custom_mask,
+            causal,
+            self.forward_metadata.mask_indptr,
+            self.forward_metadata.max_extend_len,
+            layer.scaling,
+            logit_cap=logits_soft_cap,
+            sliding_window_size=sliding_window_size,
+            sinks=sinks,
+            window_kv_offsets=window_kv_offsets,
+            xai_temperature_len=layer.xai_temperature_len,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        sinks=None,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        logits_soft_cap = logit_capping_mod(layer.logit_capping_method, layer.logit_cap)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+            kv_indptr = self.forward_metadata.window_kv_indptr
+            kv_indices = self.forward_metadata.window_kv_indices
+        else:
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+
+        self.decode_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            kv_indptr,
+            kv_indices,
+            self.forward_metadata.attn_logits,
+            self.forward_metadata.attn_lse,
+            self.forward_metadata.num_kv_splits,
+            self.max_kv_splits,
+            layer.scaling,
+            logit_cap=logits_soft_cap,
+            sinks=sinks,
+            xai_temperature_len=layer.xai_temperature_len,
+        )
+        return o
+
+
+class TritonMultiStepDraftBackend:
+    """
+    Wrap multiple triton attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices
+
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps):
+            self.attn_backends.append(
+                TritonAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                )
+            )
+        self.max_context_len = self.attn_backends[0].max_context_len
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.device = model_runner.device
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+        self.page_size = model_runner.server_args.page_size
+
+    def common_template(
+        self, forward_batch: ForwardBatch, kv_indices_buffer: torch.Tensor, call_fn: int
+    ):
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        self.generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        for i in range(self.speculative_num_steps):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.empty(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int64,
+            device=self.device,
+        )
+
+        def call_fn(i, forward_batch):
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_num_tokens * self.max_context_len),
+            dtype=torch.int64,
+            device=self.device,
+        )
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, kv_indices_buf=self.cuda_graph_kv_indices[i]
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=None,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+
+@triton.jit
+def get_num_kv_splits_triton(
+    num_kv_splits_ptr,
+    seq_lens_ptr,
+    num_seq,
+    num_group,
+    num_head,
+    num_kv_head,
+    max_kv_splits,
+    device_core_count,
+    MAX_NUM_SEQ: tl.constexpr,
+):
+    # TODO: this method is tunable, we need more online serving data to tune it
+    offs_seq = tl.arange(0, MAX_NUM_SEQ)
+    mask_seq = offs_seq < num_seq
+
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
+    max_seq_len = tl.max(seq_lens)
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
+    min_seq_len = tl.min(seq_lens)
+    if max_seq_len * 8 < min_seq_len * 10:
+        min_seq_len = max_seq_len
+    max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
+    kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
+
+    # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
+    ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
+    ext_device_core_count = tl.cast(
+        device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
+    )
+    block_h, num_kv_group = 16, num_head // num_kv_head
+    if num_kv_group == 1:
+        token_grid = num_seq * num_group * num_head
+    else:
+        # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
+        block_h = tl.minimum(block_h, num_kv_group)
+        token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
+    max_kv_splits_2 = tl.minimum(
+        tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
+    )
+    kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
+
+    num_kv_splits = tl.maximum(
+        tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
+    )
+
+    offs_token = offs_seq * num_group
+    mask_token = offs_token < num_seq * num_group
+    for i in range(0, num_group):
+        tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
+
+
+def update_sliding_window_buffer(
+    window_kv_indptr,
+    req_to_token,
+    sliding_window_size,
+    seq_lens,
+    req_pool_indices,
+    bs,
+    device,
+    token_to_kv_pool_allocator=None,
+):
+    window_kv_lens = torch.minimum(
+        seq_lens,
+        torch.tensor(sliding_window_size),
+    )
+    window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+    window_kv_indptr = window_kv_indptr[: bs + 1]
+    window_kv_indices = torch.empty(
+        window_kv_indptr[-1], dtype=torch.int64, device=device
+    )
+    window_kv_start_idx = seq_lens - window_kv_lens
+    create_flashinfer_kv_indices_triton[(bs,)](
+        req_to_token,
+        req_pool_indices,
+        window_kv_lens,
+        window_kv_indptr,
+        window_kv_start_idx,
+        window_kv_indices,
+        req_to_token.stride(0),
+    )
+    # full to swa index mapping
+    if hasattr(token_to_kv_pool_allocator, "translate_loc_from_full_to_swa"):
+        kv_last_index = window_kv_indptr[-1]
+        window_kv_indices[:kv_last_index] = (
+            token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                window_kv_indices[:kv_last_index]
+            )
+        )
+    return window_kv_indptr, window_kv_indices, window_kv_lens, window_kv_start_idx
+
+
+def update_sliding_window_buffer_cuda_graph(
+    window_kv_indptr,
+    window_kv_indices,
+    req_to_token,
+    sliding_window_size,
+    seq_lens,
+    req_pool_indices,
+    bs,
+    token_to_kv_pool_allocator=None,
+):
+    window_kv_lens = torch.minimum(
+        seq_lens,
+        torch.tensor(sliding_window_size),
+    )
+    window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+    window_kv_indptr = window_kv_indptr[: bs + 1]
+    window_kv_start_idx = seq_lens - window_kv_lens
+    create_flashinfer_kv_indices_triton[(bs,)](
+        req_to_token,
+        req_pool_indices,
+        window_kv_lens,
+        window_kv_indptr,
+        window_kv_start_idx,
+        window_kv_indices,
+        req_to_token.stride(0),
+    )
+    # full to swa index mapping
+    if hasattr(token_to_kv_pool_allocator, "translate_loc_from_full_to_swa"):
+        kv_last_index = window_kv_indptr[-1]
+        window_kv_indices[:kv_last_index] = (
+            token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                window_kv_indices[:kv_last_index]
+            )
+        )
+    return window_kv_indptr, window_kv_indices, window_kv_lens, window_kv_start_idx
diff --git a/python/sglang/srt/layers/attention/triton_ops/decode_attention.py b/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
new file mode 100644
index 000000000..1ba5d463d
--- /dev/null
+++ b/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
@@ -0,0 +1,776 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for decoding.
+It supports page size = 1.
+"""
+
+# Adapted from
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py
+
+import logging
+
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+logger = logging.getLogger(__name__)
+
+
+_MIN_BLOCK_KV = 32
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_kernel_stage1(
+    Q,
+    K_Buffer,
+    V_Buffer,
+    sm_scale,
+    kv_indptr,
+    kv_indices,
+    Att_Out,
+    Att_Lse,
+    num_kv_splits,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    kv_group_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    MIN_BLOCK_KV: tl.constexpr,
+    logit_cap: tl.constexpr,
+    Lk: tl.constexpr,
+    Lv: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    split_kv_id = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lk
+    mask_dv = offs_dv < Lv
+
+    cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
+    kv_splits = tl.load(num_kv_splits + cur_batch)
+
+    if xai_temperature_len > 0:
+        offs_qidx = cur_batch_seq_len - 1
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        _qtemp = tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale
+        xai_temperature_reg = tl.where(offs_qidx > xai_temperature_len, _qtemp, 1.0)
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    kv_len_per_split = (
+        tl.cdiv(tl.cdiv(cur_batch_seq_len, kv_splits), MIN_BLOCK_KV) * MIN_BLOCK_KV
+    )
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+    e_max = -float("inf")
+    e_sum = 0.0
+    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        q = tl.load(Q + off_q, mask=mask_d, other=0.0)
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + offs_n,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+            offs_buf_k = (
+                kv_loc[:, None] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[None, :]
+            )
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_d[None, :]),
+                other=0.0,
+            )
+            qk = tl.sum(q[None, :] * k, 1)
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg
+
+            qk = tl.where(offs_n < split_kv_end, qk, float("-inf"))
+
+            offs_buf_v = (
+                kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 0), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max)
+            acc *= re_scale
+            acc += tl.sum(p[:, None] * v, 0)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 0)
+            e_max = n_e_max
+
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_dv
+        )
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum,
+            mask=(mask_dv),
+        )
+
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+        ) // Lv
+
+        tl.store(
+            Att_Lse + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+        )
+
+
+def _decode_att_m_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    att_lse,
+    kv_indptr,
+    kv_indices,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap,
+    xai_temperature_len=-1,
+):
+    BLOCK = 64
+    # [TODO] work around SGPR limit on MI3xx
+    if _is_hip:
+        BLOCK = 8
+    MAX_KV_SPLITS = max_kv_splits
+    Lk = k_buffer.shape[-1]
+    Lv = v_buffer.shape[-1]
+
+    batch, head_num = q.shape[0], q.shape[1]
+
+    grid = (batch, head_num, MAX_KV_SPLITS)
+    kv_group_num = q.shape[1] // k_buffer.shape[1]
+
+    if kv_group_num == 1:
+        num_warps = 4
+    else:
+        num_warps = 2
+        if _is_hip:
+            num_warps = 1
+
+    BLOCK_DMODEL = triton.next_power_of_2(Lk)
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    _fwd_kernel_stage1[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        sm_scale,
+        kv_indptr,
+        kv_indices,
+        att_out,
+        att_lse,
+        num_kv_splits,
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        kv_group_num=kv_group_num,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_N=BLOCK,
+        MIN_BLOCK_KV=_MIN_BLOCK_KV,
+        logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
+        num_warps=num_warps,
+        num_stages=2,
+        Lk=Lk,
+        Lv=Lv,
+    )
+
+
+@triton.jit
+def _fwd_grouped_kernel_stage1(
+    Q,
+    K_Buffer,
+    V_Buffer,
+    sm_scale,
+    kv_indptr,
+    kv_indices,
+    Att_Out,
+    Att_Lse,
+    num_kv_splits,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    kv_group_num: tl.constexpr,
+    q_head_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    MIN_BLOCK_KV: tl.constexpr,
+    logit_cap: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
+    Lk: tl.constexpr,
+    Lv: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head_id = tl.program_id(1)
+    cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
+    split_kv_id = tl.program_id(2)
+
+    if BLOCK_H < kv_group_num:
+        VALID_BLOCK_H: tl.constexpr = BLOCK_H
+    else:
+        VALID_BLOCK_H: tl.constexpr = kv_group_num
+    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
+    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
+    mask_h = mask_h & (cur_head < q_head_num)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lk
+    mask_dv = offs_dv < Lv
+
+    cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
+    kv_splits = tl.load(num_kv_splits + cur_batch)
+
+    if xai_temperature_len > 0:
+        offs_qidx = cur_batch_seq_len - 1
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        _qtemp = tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale
+        xai_temperature_reg = tl.where(offs_qidx > xai_temperature_len, _qtemp, 1.0)
+
+    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        mask_dpe = offs_dpe < Lk
+        off_qpe = (
+            cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :]
+        )
+
+    kv_len_per_split = (
+        tl.cdiv(tl.cdiv(cur_batch_seq_len, kv_splits), MIN_BLOCK_KV) * MIN_BLOCK_KV
+    )
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_d[None, :]), other=0.0)
+        if BLOCK_DPE > 0:
+            qpe = tl.load(
+                Q + off_qpe, mask=(mask_h[:, None]) & (mask_dpe[None, :]), other=0.0
+            )
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + offs_n,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+            offs_buf_k = (
+                kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[:, None]
+            )
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
+                other=0.0,
+            )
+            qk = tl.dot(q, k.to(q.dtype))
+            if BLOCK_DPE > 0:
+                offs_buf_kpe = (
+                    kv_loc[None, :] * stride_buf_kbs
+                    + cur_kv_head * stride_buf_kh
+                    + offs_dpe[:, None]
+                )
+                kpe = tl.load(
+                    K_Buffer + offs_buf_kpe,
+                    mask=(offs_n[None, :] < split_kv_end) & (mask_dpe[:, None]),
+                    other=0.0,
+                )
+                qk += tl.dot(qpe, kpe.to(qpe.dtype))
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
+
+            qk = tl.where(
+                mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf")
+            )
+
+            offs_buf_v = (
+                kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            acc *= re_scale[:, None]
+            acc += tl.dot(p.to(v.dtype), v)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 1)
+            e_max = n_e_max
+
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head[:, None] * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_dv[None, :]
+        )
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum[:, None],
+            mask=(mask_h[:, None]) & (mask_dv[None, :]),
+        )
+
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+        ) // Lv
+
+        tl.store(
+            Att_Lse + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+            mask=mask_h,
+        )
+
+
+def _decode_grouped_att_m_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    att_lse,
+    kv_indptr,
+    kv_indices,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap,
+    xai_temperature_len=-1,
+):
+    BLOCK = 32
+    Lk = k_buffer.shape[-1]
+    Lv = v_buffer.shape[-1]
+
+    # [TODO] work around shmem limit on MI3xx
+    if _is_hip and Lk >= 576:
+        BLOCK = 16
+
+    if Lk == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    elif Lk == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
+    else:
+        BLOCK_DMODEL = triton.next_power_of_2(Lk)
+        BLOCK_DPE = 0
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    batch, head_num = q.shape[0], q.shape[1]
+    kv_group_num = q.shape[1] // k_buffer.shape[1]
+
+    BLOCK_H = 16
+    MAX_KV_SPLITS = max_kv_splits
+    grid = (
+        batch,
+        triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
+        MAX_KV_SPLITS,
+    )
+
+    extra_kargs = {}
+    num_stages = 2
+    if _is_hip:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+        num_stages = 1
+
+    _fwd_grouped_kernel_stage1[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        sm_scale,
+        kv_indptr,
+        kv_indices,
+        att_out,
+        att_lse,
+        num_kv_splits,
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        kv_group_num=kv_group_num,
+        q_head_num=head_num,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_N=BLOCK,
+        BLOCK_H=BLOCK_H,
+        MIN_BLOCK_KV=_MIN_BLOCK_KV,
+        logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
+        num_warps=4,
+        num_stages=num_stages,
+        Lk=Lk,
+        Lv=Lv,
+        **extra_kargs,
+    )
+
+
+@triton.jit
+def _fwd_kernel_stage2(
+    Mid_O,
+    Mid_O_1,
+    O,
+    kv_indptr,
+    num_kv_splits,
+    sink_ptr,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_obs,
+    stride_oh,
+    MAX_KV_SPLITS: tl.constexpr,
+    MIN_BLOCK_KV: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    Lv: tl.constexpr,
+    HAS_SINK: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - tl.load(
+        kv_indptr + cur_batch
+    )
+    kv_splits = tl.load(num_kv_splits + cur_batch)
+
+    offs_d = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lv
+
+    e_sum = 0.0
+    e_max = -float("inf")
+    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh) // Lv
+    kv_len_per_split = (
+        tl.cdiv(tl.cdiv(cur_batch_seq_len, kv_splits), MIN_BLOCK_KV) * MIN_BLOCK_KV
+    )
+
+    for split_kv_id in range(0, MAX_KV_SPLITS):
+        split_kv_start = kv_len_per_split * split_kv_id
+        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+        if split_kv_end > split_kv_start:
+            tv = tl.load(
+                Mid_O + offs_v + split_kv_id * stride_mid_os, mask=mask_d, other=0.0
+            )
+            tlogic = tl.load(Mid_O_1 + offs_logic + split_kv_id * stride_mid_os // Lv)
+            n_e_max = tl.maximum(tlogic, e_max)
+
+            old_scale = tl.exp(e_max - n_e_max)
+            acc *= old_scale
+            exp_logic = tl.exp(tlogic - n_e_max)
+            acc += exp_logic * tv
+
+            e_sum = e_sum * old_scale + exp_logic
+            e_max = n_e_max
+
+    if HAS_SINK:
+        cur_sink = tl.load(sink_ptr + cur_head)
+        e_sum += tl.exp(cur_sink - e_max)
+
+    tl.store(
+        O + cur_batch * stride_obs + cur_head * stride_oh + offs_d,
+        acc / e_sum,
+        mask=mask_d,
+    )
+
+
+def _decode_softmax_reducev_fwd(
+    logits,
+    lse,
+    q,
+    o,
+    v_buffer,
+    kv_indptr,
+    num_kv_splits,
+    max_kv_splits,
+    sinks=None,
+):
+    batch, head_num = q.shape[0], q.shape[1]
+    Lv = v_buffer.shape[-1]
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    MAX_KV_SPLITS = max_kv_splits
+    HAS_SINK = sinks is not None
+
+    extra_kargs = {}
+    if _is_hip:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+
+    grid = (batch, head_num)
+    _fwd_kernel_stage2[grid](
+        logits,
+        lse,
+        o,
+        kv_indptr,
+        num_kv_splits,
+        sinks,
+        logits.stride(0),
+        logits.stride(1),
+        logits.stride(2),
+        o.stride(0),
+        o.stride(1),
+        MAX_KV_SPLITS=MAX_KV_SPLITS,
+        MIN_BLOCK_KV=_MIN_BLOCK_KV,
+        BLOCK_DV=BLOCK_DV,
+        Lv=Lv,
+        HAS_SINK=HAS_SINK,
+        num_warps=4,
+        num_stages=2,
+        **extra_kargs,
+    )
+
+
+def decode_attention_fwd_normal(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    attn_logits,
+    attn_lse,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap=0.0,
+    sinks=None,
+    xai_temperature_len=-1,
+):
+    _decode_att_m_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        attn_lse,
+        kv_indptr,
+        kv_indices,
+        num_kv_splits,
+        max_kv_splits,
+        sm_scale,
+        logit_cap,
+        xai_temperature_len,
+    )
+    _decode_softmax_reducev_fwd(
+        attn_logits,
+        attn_lse,
+        q,
+        o,
+        v_buffer,
+        kv_indptr,
+        num_kv_splits,
+        max_kv_splits,
+        sinks,
+    )
+
+
+def decode_attention_fwd_grouped(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    attn_logits,
+    attn_lse,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap=0.0,
+    sinks=None,
+    xai_temperature_len=-1,
+):
+    _decode_grouped_att_m_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        attn_lse,
+        kv_indptr,
+        kv_indices,
+        num_kv_splits,
+        max_kv_splits,
+        sm_scale,
+        logit_cap,
+        xai_temperature_len,
+    )
+    _decode_softmax_reducev_fwd(
+        attn_logits,
+        attn_lse,
+        q,
+        o,
+        v_buffer,
+        kv_indptr,
+        num_kv_splits,
+        max_kv_splits,
+        sinks,
+    )
+
+
+def decode_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    attn_logits,
+    attn_lse,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap=0.0,
+    sinks=None,
+    xai_temperature_len=-1,
+):
+    assert max_kv_splits == attn_logits.shape[2]
+    assert q.shape[0] <= kv_indptr.shape[0] - 1
+    assert q.shape[0] <= attn_logits.shape[0]
+
+    kv_group_num = q.shape[1] // v_buffer.shape[1]
+
+    if kv_group_num == 1:
+        # MHA
+        decode_attention_fwd_normal(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits,
+            attn_lse,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap=logit_cap,
+            sinks=sinks,
+            xai_temperature_len=xai_temperature_len,
+        )
+    else:
+        # GQA/MQA/MLA
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits,
+            attn_lse,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap=logit_cap,
+            sinks=sinks,
+            xai_temperature_len=xai_temperature_len,
+        )
diff --git a/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py b/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
new file mode 100644
index 000000000..72e0bfe78
--- /dev/null
+++ b/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
@@ -0,0 +1,1106 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+_is_hip = is_hip()
+
+if global_server_args_dict.get("attention_reduce_in_fp32", False):
+    REDUCE_TRITON_TYPE = tl.float32
+    REDUCE_TORCH_TYPE = torch.float32
+else:
+    REDUCE_TRITON_TYPE = tl.float16
+    REDUCE_TORCH_TYPE = torch.float16
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_kernel_flash_decode_stage1(
+    Q,
+    K,
+    V,
+    sm_scale,
+    Req_to_tokens,
+    B_req_idx,
+    B_Seqlen,
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    stride_req_to_tokens_b,
+    stride_req_to_tokens_s,
+    stride_qbs,
+    stride_qh,
+    stride_qd,
+    stride_kbs,
+    stride_kh,
+    stride_kd,
+    stride_vbs,
+    stride_vh,
+    stride_vd,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_od,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    stride_mid_o_es,
+    gqa_group_size,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    seq_start_block = tl.program_id(2)
+    cur_kv_head = cur_head // gqa_group_size
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
+    cur_batch_start_index = seq_start_block * BLOCK_SEQ
+    cur_batch_end_index = tl.minimum(
+        cur_batch_seq_len, cur_batch_start_index + BLOCK_SEQ
+    )
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    block_n_size = (
+        tl.where(
+            cur_batch_end_index - cur_batch_start_index <= 0,
+            0,
+            cur_batch_end_index - cur_batch_start_index + BLOCK_N - 1,
+        )
+        // BLOCK_N
+    )
+
+    offs_n = cur_batch_start_index + tl.arange(0, BLOCK_N)
+
+    q = tl.load(Q + off_q)
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    for start_n in range(0, block_n_size, 1):
+        offs_n_new = start_n * BLOCK_N + offs_n
+        k_loc = tl.load(
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n_new,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        off_k = k_loc[:, None] * stride_kbs + cur_kv_head * stride_kh + offs_d[None, :]
+        k = tl.load(
+            K + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+        att_value = tl.sum(q[None, :] * k, 1)
+        att_value *= sm_scale
+        att_value = tl.where(offs_n_new < cur_batch_end_index, att_value, float("-inf"))
+        v = tl.load(
+            V + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+
+        cur_max_logic = tl.max(att_value, axis=0)
+        new_max_logic = tl.maximum(cur_max_logic, max_logic)
+
+        exp_logic = tl.exp(att_value - new_max_logic)
+        logic_scale = tl.exp(max_logic - new_max_logic)
+        acc *= logic_scale
+        acc += tl.sum(exp_logic[:, None] * v, axis=0)
+
+        sum_exp = sum_exp * logic_scale + tl.sum(exp_logic, axis=0)
+        max_logic = new_max_logic
+
+    need_store = tl.where(block_n_size == 0, 0, 1)
+    for _ in range(0, need_store, 1):
+        off_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + seq_start_block * stride_mid_os
+            + offs_d
+        )
+        off_mid_o_logexpsum = (
+            cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh + seq_start_block
+        )
+        tl.store(Mid_O + off_mid_o, acc / sum_exp)
+        tl.store(Mid_O_LogExpSum + off_mid_o_logexpsum, max_logic + tl.log(sum_exp))
+    return
+
+
+@triton.jit
+def _fwd_kernel_flash_decode_stage2(
+    B_Seqlen,
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    O,  # [batch, head, head_dim]
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_od,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    stride_mid_o_es,
+    stride_obs,
+    stride_oh,
+    stride_od,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+
+    block_n_size = (
+        tl.where(cur_batch_seq_len <= 0, 0, cur_batch_seq_len + BLOCK_SEQ - 1)
+        // BLOCK_SEQ
+    )
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh
+    for block_seq_n in range(0, block_n_size, 1):
+        tv = tl.load(Mid_O + offs_v + block_seq_n * stride_mid_os)
+        tlogic = tl.load(Mid_O_LogExpSum + offs_logic + block_seq_n)
+        new_max_logic = tl.maximum(tlogic, max_logic)
+
+        old_scale = tl.exp(max_logic - new_max_logic)
+        acc *= old_scale
+        exp_logic = tl.exp(tlogic - new_max_logic)
+        acc += exp_logic * tv
+        sum_exp = sum_exp * old_scale + exp_logic
+        max_logic = new_max_logic
+
+    tl.store(O + cur_batch * stride_obs + cur_head * stride_oh + offs_d, acc / sum_exp)
+    return
+
+
+@torch.no_grad()
+def flash_decode_stage1(
+    q,
+    k,
+    v,
+    Req_to_tokens,
+    B_req_idx,
+    B_Seqlen,
+    max_len_in_batch,
+    mid_out,
+    mid_out_logsumexp,
+    block_seq,
+):
+    BLOCK_SEQ = block_seq
+    BLOCK_N = 16
+    assert BLOCK_SEQ % BLOCK_N == 0
+    # shape constraints
+    Lq, Lk = q.shape[-1], k.shape[-1]
+    assert Lq == Lk
+    assert Lk in {16, 32, 64, 128}
+    sm_scale = 1.0 / (Lk**0.5)
+    batch, head_num = B_req_idx.shape[0], q.shape[1]
+    grid = (batch, head_num, triton.cdiv(max_len_in_batch, BLOCK_SEQ))
+    gqa_group_size = q.shape[1] // k.shape[1]
+
+    _fwd_kernel_flash_decode_stage1[grid](
+        q,
+        k,
+        v,
+        sm_scale,
+        Req_to_tokens,
+        B_req_idx,
+        B_Seqlen,
+        mid_out,
+        mid_out_logsumexp,
+        Req_to_tokens.stride(0),
+        Req_to_tokens.stride(1),
+        q.stride(0),
+        q.stride(1),
+        q.stride(2),
+        k.stride(0),
+        k.stride(1),
+        k.stride(2),
+        v.stride(0),
+        v.stride(1),
+        v.stride(2),
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out.stride(3),
+        mid_out_logsumexp.stride(0),
+        mid_out_logsumexp.stride(1),
+        mid_out_logsumexp.stride(2),
+        gqa_group_size,
+        BLOCK_SEQ=BLOCK_SEQ,
+        BLOCK_DMODEL=Lk,
+        BLOCK_N=BLOCK_N,
+        num_warps=1,
+        num_stages=2,
+    )
+    return
+
+
+@torch.no_grad()
+def flash_decode_stage2(mid_out, mid_out_logexpsum, B_Seqlen, O, block_seq):
+    Lk = mid_out.shape[-1]
+    assert Lk in {16, 32, 64, 128}
+    batch, head_num = mid_out.shape[0], mid_out.shape[1]
+    grid = (batch, head_num)
+
+    _fwd_kernel_flash_decode_stage2[grid](
+        B_Seqlen,
+        mid_out,
+        mid_out_logexpsum,
+        O,
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out.stride(3),
+        mid_out_logexpsum.stride(0),
+        mid_out_logexpsum.stride(1),
+        mid_out_logexpsum.stride(2),
+        O.stride(0),
+        O.stride(1),
+        O.stride(2),
+        BLOCK_SEQ=block_seq,
+        BLOCK_DMODEL=Lk,
+        num_warps=4,
+        num_stages=2,
+    )
+    return
+
+
+def flash_decode_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    req_to_token,
+    b_req_idx,
+    b_start_loc,
+    b_seq_len,
+    attn_logits,
+    max_len_in_batch,
+    sm_scale,
+    logit_cap=0.0,
+):
+    BLOCK_SEQ = 256
+    kv_group_num = q.shape[1] // v_buffer.shape[1]
+    # batch_size = q.shape[0]
+
+    block_seq_num = (max_len_in_batch + BLOCK_SEQ - 1) // BLOCK_SEQ
+
+    mid_o = torch.empty(
+        [q.shape[0], q.shape[1], block_seq_num, q.shape[-1]],
+        dtype=torch.float32,
+        device="cuda",
+    )
+    mid_o_logexpsum = torch.empty(
+        [q.shape[0], q.shape[1], block_seq_num], dtype=torch.float32, device="cuda"
+    )
+
+    flash_decode_stage1(
+        q,
+        k_buffer,
+        v_buffer,
+        req_to_token,
+        b_req_idx,
+        b_seq_len,
+        max_len_in_batch,
+        mid_o,
+        mid_o_logexpsum,
+        BLOCK_SEQ,
+    )
+    flash_decode_stage2(mid_o, mid_o_logexpsum, b_seq_len, o, BLOCK_SEQ)
+
+
+@triton.jit
+def _sparse_fwd_kernel_flash_decode_stage1(  # Double Sparsity's approximate attention
+    Q_Label,
+    K_Label_Buffer,
+    sm_scale,
+    Req_to_tokens,  # shape: [B, S]
+    B_Seqlen,
+    Att_Out,  # shape: [H, B, S] easier for topk
+    stride_req_to_tokens_b,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    att_stride_h,
+    att_stride_b,
+    kv_group_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    logit_cap: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_n = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+
+    cur_batch_start_index = 0
+    cur_batch_end_index = cur_batch_seq_len
+
+    min_val = -float("inf")
+    att_value = tl.full([BLOCK_N], min_val, dtype=tl.float32)
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    block_index = start_n * BLOCK_N
+    block_mask = tl.where(block_index < cur_batch_seq_len, 1, 0)
+
+    for start_mark in range(0, block_mask, 1):
+        q = tl.load(Q_Label + off_q + start_mark).to(REDUCE_TRITON_TYPE)
+        offs_n_new = cur_batch_start_index + offs_n
+        k_loc = tl.load(
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch + offs_n_new,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        offs_buf_k = (
+            k_loc[:, None] * stride_buf_kbs
+            + cur_kv_head * stride_buf_kh
+            + offs_d[None, :]
+        )
+        k = tl.load(
+            K_Label_Buffer + offs_buf_k,
+            mask=offs_n_new[:, None] < cur_batch_end_index,
+            other=0.0,
+        ).to(REDUCE_TRITON_TYPE)
+
+        att_value = tl.sum(q[None, :] * k, 1)
+        att_value *= sm_scale
+
+        if logit_cap > 0:
+            att_value = logit_cap * tanh(att_value / logit_cap)
+
+    att_value = tl.where(offs_n < cur_batch_end_index, att_value, min_val)
+    off_o = cur_head * att_stride_h + (cur_batch * att_stride_b + offs_n)
+    tl.store(Att_Out + off_o, att_value)
+
+
+@triton.jit
+def _sparse_fwd_kernel_flash_decode_stage2(
+    Q,
+    K,
+    V,
+    sm_scale,
+    Req_to_tokens,  # shape: [B, S]
+    Topk_token_indices,  # shape: [H, B, k]
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    Heavy_token_num,  # NOTE: This can be used as constexpr but we may support dynamic heavy token number in the future
+    stride_req_to_tokens_b,
+    stride_topk_token_indices_h,
+    stride_topk_token_indices_b,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    gqa_group_size,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    seq_start_block = tl.program_id(2)
+    cur_kv_head = cur_head // gqa_group_size
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_start_index = seq_start_block * BLOCK_SEQ
+    cur_batch_end_index = tl.minimum(Heavy_token_num, cur_batch_start_index + BLOCK_SEQ)
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    block_n_size = (
+        tl.where(
+            cur_batch_end_index - cur_batch_start_index <= 0,
+            0,
+            cur_batch_end_index - cur_batch_start_index + BLOCK_N - 1,
+        )
+        // BLOCK_N
+    )
+
+    # offs_n = cur_batch_start_index + tl.arange(0, BLOCK_N)
+    offs_n = tl.arange(0, BLOCK_N)
+
+    q = tl.load(Q + off_q)
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    for start_n in range(cur_batch_start_index, cur_batch_end_index, BLOCK_N):
+        # for start_n in range(0, block_n_size, 1):
+        # offs_n_new = start_n * BLOCK_N + offs_n
+        offs_n_new = start_n + offs_n
+        # offs_n_new = cur_batch_start_index + start_n * BLOCK_N + offs_n
+        topk_token_indices = tl.load(
+            Topk_token_indices
+            + stride_topk_token_indices_h * cur_head
+            + stride_topk_token_indices_b * cur_batch
+            + offs_n_new,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        k_loc = tl.load(
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch + topk_token_indices,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        off_k = k_loc[:, None] * stride_kbs + cur_kv_head * stride_kh + offs_d[None, :]
+        k = tl.load(
+            K + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+        att_value = tl.sum(q[None, :] * k, 1)
+        att_value *= sm_scale
+        att_value = tl.where(offs_n_new < cur_batch_end_index, att_value, float("-inf"))
+        v = tl.load(
+            V + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+
+        cur_max_logic = tl.max(att_value, axis=0)
+        new_max_logic = tl.maximum(cur_max_logic, max_logic)
+
+        exp_logic = tl.exp(att_value - new_max_logic)
+        logic_scale = tl.exp(max_logic - new_max_logic)
+        acc *= logic_scale
+        acc += tl.sum(exp_logic[:, None] * v, axis=0)
+
+        sum_exp = sum_exp * logic_scale + tl.sum(exp_logic, axis=0)
+        max_logic = new_max_logic
+
+    # need_store = tl.where(block_n_size == 0, 0, 1)
+    need_store = 1
+    for _ in range(0, need_store, 1):
+        off_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + seq_start_block * stride_mid_os
+            + offs_d
+        )
+        off_mid_o_logexpsum = (
+            cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh + seq_start_block
+        )
+        tl.store(Mid_O + off_mid_o, acc / sum_exp)
+        tl.store(Mid_O_LogExpSum + off_mid_o_logexpsum, max_logic + tl.log(sum_exp))
+    return
+
+
+@triton.jit
+def _sparse_fwd_kernel_flash_decode_stage3(
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    O,  # [batch, head, head_dim]
+    seq_len,  # NOTE: This can be used as constexpr but we may support dynamic heavy token number in the future
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    stride_obs,
+    stride_oh,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+
+    block_n_size = tl.where(seq_len <= 0, 0, seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh
+    for block_seq_n in range(0, block_n_size, 1):
+        tv = tl.load(Mid_O + offs_v + block_seq_n * stride_mid_os)
+        tlogic = tl.load(Mid_O_LogExpSum + offs_logic + block_seq_n)
+        new_max_logic = tl.maximum(tlogic, max_logic)
+
+        old_scale = tl.exp(max_logic - new_max_logic)
+        acc *= old_scale
+        exp_logic = tl.exp(tlogic - new_max_logic)
+        acc += exp_logic * tv
+        sum_exp = sum_exp * old_scale + exp_logic
+        max_logic = new_max_logic
+
+    tl.store(O + cur_batch * stride_obs + cur_head * stride_oh + offs_d, acc / sum_exp)
+    return
+
+
+def sparse_flash_decode_stage1(
+    q_label,
+    k_label_buffer,
+    att_out,
+    Req_to_tokens,
+    B_Seqlen,
+    max_len_in_batch,
+    sm_scale,
+    logit_cap,
+):
+    BLOCK = 32
+    # shape constraints
+    Lq, Lk = q_label.shape[-1], k_label_buffer.shape[-1]
+    assert Lq == Lk
+    assert Lk in {16, 32, 64, 128, 256, 576}
+
+    BLOCK_DMODEL = Lk
+
+    batch, head_num = q_label.shape[0], q_label.shape[1]
+
+    grid = (batch, head_num, triton.cdiv(max_len_in_batch, BLOCK))
+    kv_group_num = q_label.shape[1] // k_label_buffer.shape[1]
+
+    if kv_group_num == 1:
+        num_warps = 4
+    else:
+        num_warps = 2
+
+    _sparse_fwd_kernel_flash_decode_stage1[grid](
+        q_label,
+        k_label_buffer,
+        sm_scale,
+        Req_to_tokens,
+        B_Seqlen,
+        att_out,
+        Req_to_tokens.stride(0),
+        q_label.stride(0),
+        q_label.stride(1),
+        k_label_buffer.stride(0),
+        k_label_buffer.stride(1),
+        att_out.stride(0),
+        att_out.stride(1),
+        kv_group_num,
+        BLOCK_DMODEL,
+        BLOCK,
+        logit_cap,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+
+
+@torch.no_grad()
+def sparse_flash_decode_stage2(
+    q,
+    k,
+    v,
+    Req_to_tokens,
+    Topk_token_indices,
+    heavy_token_num,
+    mid_out,
+    mid_out_logsumexp,
+    block_seq,
+    sm_scale,
+):
+    BLOCK_SEQ = block_seq
+    BLOCK_N = 16
+    assert BLOCK_SEQ % BLOCK_N == 0
+    # shape constraints
+    Lq, Lk = q.shape[-1], k.shape[-1]
+    assert Lq == Lk
+    assert Lk in {16, 32, 64, 128}
+    assert heavy_token_num == Topk_token_indices.shape[-1]
+    # sm_scale = 1.0 / (Lk ** 0.5)
+    batch, head_num = q.shape[0], q.shape[1]
+    grid = (batch, head_num, triton.cdiv(heavy_token_num, BLOCK_SEQ))
+
+    gqa_group_size = q.shape[1] // k.shape[1]
+
+    _sparse_fwd_kernel_flash_decode_stage2[grid](
+        q,
+        k,
+        v,
+        sm_scale,
+        Req_to_tokens,
+        Topk_token_indices,
+        mid_out,
+        mid_out_logsumexp,
+        heavy_token_num,
+        Req_to_tokens.stride(0),
+        Topk_token_indices.stride(0),
+        Topk_token_indices.stride(1),
+        q.stride(0),
+        q.stride(1),
+        k.stride(0),
+        k.stride(1),
+        v.stride(0),
+        v.stride(1),
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out_logsumexp.stride(0),
+        mid_out_logsumexp.stride(1),
+        gqa_group_size,
+        BLOCK_SEQ=BLOCK_SEQ,
+        BLOCK_DMODEL=Lk,
+        BLOCK_N=BLOCK_N,
+        num_warps=1,
+        num_stages=2,
+    )
+    return
+
+
+@torch.no_grad()
+def sparse_flash_decode_stage3(Seqlen, mid_out, mid_out_logexpsum, O, block_seq):
+    Lk = mid_out.shape[-1]
+    assert Lk in {16, 32, 64, 128}
+    batch, head_num = mid_out.shape[0], mid_out.shape[1]
+    grid = (batch, head_num)
+
+    _sparse_fwd_kernel_flash_decode_stage3[grid](
+        mid_out,
+        mid_out_logexpsum,
+        O,
+        Seqlen,
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out_logexpsum.stride(0),
+        mid_out_logexpsum.stride(1),
+        O.stride(0),
+        O.stride(1),
+        BLOCK_SEQ=block_seq,
+        BLOCK_DMODEL=Lk,
+        num_warps=4,
+        num_stages=2,
+    )
+    return
+
+
+def flash_decode_sparse_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    q_label,
+    k_label_buffer,
+    req_to_token,
+    b_seq_len,
+    max_len_in_batch,
+    sm_scale,
+    logit_cap,
+    heavy_token_num=32,
+    att_out_approx=None,
+    mid_out=None,
+    mid_o_logexpsum=None,
+    BLOCK_SEQ=256,
+):
+    # TODO(Andy): Tune BLOCK_SEQ & BLOCK_D
+    kv_group_num = q.shape[1] // v_buffer.shape[1]
+    # batch_size = q.shape[0]
+
+    # Step 1: BGEMV approximate attention (page implementation)
+
+    if att_out_approx is None:
+        att_out_approx = torch.empty(
+            [q.shape[1], q.shape[0], max_len_in_batch],
+            dtype=REDUCE_TORCH_TYPE,
+            device=q.device,
+        )
+
+    if mid_out is None:
+        block_seq_num = (heavy_token_num + BLOCK_SEQ - 1) // BLOCK_SEQ
+
+        mid_out = torch.empty(
+            [q.shape[0], q.shape[1], block_seq_num, q.shape[-1]],
+            dtype=torch.float32,
+            device=q.device,
+        )
+        mid_o_logexpsum = torch.empty(
+            [q.shape[0], q.shape[1], block_seq_num],
+            dtype=torch.float32,
+            device=q.device,
+        )
+
+    sparse_flash_decode_stage1(
+        q_label,
+        k_label_buffer,
+        att_out_approx,
+        req_to_token,
+        b_seq_len,
+        max_len_in_batch,
+        sm_scale,
+        logit_cap,
+    )
+
+    # Step 2: TopK token selection
+    # NOTE(Andy): Apply sparse decoding when min > heavy_token_num and max > sparse decoding threshold
+    # TODO(Andy): Change a faster topk implementation
+    topk_token_indices = torch.topk(att_out_approx, heavy_token_num, dim=-1).indices
+    # topk_token_indices: [H, B, k], Req_to_tokens: [B, S]
+    # topk_token_indices = torch.arange(0, heavy_token_num, device=q.device).unsqueeze(0).unsqueeze(0).expand(q.shape[1], q.shape[0], -1)
+
+    sparse_flash_decode_stage2(
+        q,
+        k_buffer,
+        v_buffer,
+        req_to_token,
+        topk_token_indices,
+        heavy_token_num,
+        mid_out,
+        mid_o_logexpsum,
+        BLOCK_SEQ,
+        sm_scale,
+    )
+
+    sparse_flash_decode_stage3(heavy_token_num, mid_out, mid_o_logexpsum, o, BLOCK_SEQ)
+
+
+# Extend attention kernel for Double Sparsity
+# Moved from https://github.com/sgl-project/sglang/blob/v0.4.2.post1/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+@triton.jit
+def _fwd_kernel(
+    Q_Extend,
+    K_Extend,
+    V_Extend,
+    O_Extend,
+    K_Buffer,
+    V_Buffer,
+    Req_to_tokens,
+    B_req_idx,
+    B_Seq_Len,
+    B_Start_Loc_Extend,
+    B_Seq_Len_Extend,
+    sm_scale,
+    kv_group_num,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_obs,
+    stride_oh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_req_to_tokens_b,
+    logit_cap: tl.constexpr,
+    Lq: tl.constexpr,
+    Lv: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_seq = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    cur_block_m = tl.program_id(2)
+    cur_kv_head = cur_head // kv_group_num
+
+    cur_seq_len = tl.load(B_Seq_Len + cur_seq)
+    cur_seq_len_extend = tl.load(B_Seq_Len_Extend + cur_seq)
+    cur_seq_len_prefix = cur_seq_len - cur_seq_len_extend
+
+    cur_seq_prefix_start_in_loc = 0
+    cur_seq_extend_start_contiguous = tl.load(B_Start_Loc_Extend + cur_seq)
+    cur_batch_req_idx = tl.load(B_req_idx + cur_seq)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    offs_m = tl.arange(0, BLOCK_M)
+    mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
+
+    mask_d = offs_d < Lq
+    mask_dv = offs_dv < Lv
+
+    offs_q = (
+        (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    q = tl.load(
+        Q_Extend + offs_q, mask=(mask_m[:, None]) & (mask_d[None, :]), other=0.0
+    )
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+            * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
+
+    # stage 1: compute scores with prefix
+    offs_n = tl.arange(0, BLOCK_N)
+
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
+    deno = tl.zeros([BLOCK_M], dtype=tl.float32)
+    e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+
+    for start_n in range(0, cur_seq_len_prefix, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_seq_len_prefix
+        offs_b_loc_prefix = cur_batch_req_idx * stride_req_to_tokens_b + (
+            cur_seq_prefix_start_in_loc + start_n + offs_n
+        )
+        offs_kv_loc = tl.load(Req_to_tokens + offs_b_loc_prefix, mask=mask_n, other=0)
+
+        # load k in transposed way
+        offs_buf_k = (
+            offs_kv_loc[None, :] * stride_buf_kbs
+            + cur_kv_head * stride_buf_kh
+            + offs_d[:, None]
+        )
+        k = tl.load(
+            K_Buffer + offs_buf_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+        )
+
+        qk = tl.dot(q.to(k.dtype), k)
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Buffer + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe.to(kpe.dtype), kpe)
+        qk *= sm_scale
+
+        if logit_cap > 0:
+            qk = logit_cap * tanh(qk / logit_cap)
+
+        qk = tl.where(mask_m[:, None] & mask_n[None, :], qk, float("-inf"))
+
+        n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+        re_scale = tl.exp(e_max - n_e_max)
+        p = tl.exp(qk - n_e_max[:, None])
+        deno = deno * re_scale + tl.sum(p, 1)
+
+        offs_buf_v = (
+            offs_kv_loc[:, None] * stride_buf_vbs
+            + cur_kv_head * stride_buf_vh
+            + offs_dv[None, :]
+        )
+        v = tl.load(
+            V_Buffer + offs_buf_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+        )
+        p = p.to(v.dtype)
+        acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+        e_max = n_e_max
+
+    # stage 2: compute the triangle part
+
+    cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
+    for start_n in range(0, cur_block_m_end, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_block_m_end
+
+        # load k in transposed way
+        offs_k = (
+            (cur_seq_extend_start_contiguous + start_n + offs_n[None, :]) * stride_kbs
+            + cur_kv_head * stride_kh
+            + offs_d[:, None]
+        )
+        k = tl.load(
+            K_Extend + offs_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+        )
+
+        qk = tl.dot(q, k, out_dtype=tl.float32)
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                (cur_seq_extend_start_contiguous + start_n + offs_n[None, :])
+                * stride_kbs
+                + cur_kv_head * stride_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Extend + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
+
+        qk *= sm_scale
+
+        if logit_cap > 0:
+            qk = logit_cap * tanh(qk / logit_cap)
+
+        mask_causual = (cur_block_m * BLOCK_M + offs_m[:, None]) >= (
+            start_n + offs_n[None, :]
+        )
+        mask_causual &= mask_m[:, None] & mask_n[None, :]
+        qk = tl.where(mask_causual, qk, float("-inf"))
+
+        n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+        re_scale = tl.exp(e_max - n_e_max)
+        p = tl.exp(qk - n_e_max[:, None])
+        deno = deno * re_scale + tl.sum(p, 1)
+
+        offs_v = (
+            (cur_seq_extend_start_contiguous + start_n + offs_n[:, None]) * stride_vbs
+            + cur_kv_head * stride_vh
+            + offs_dv[None, :]
+        )
+        v = tl.load(
+            V_Extend + offs_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+        )
+        p = p.to(v.dtype)
+        acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+        e_max = n_e_max
+
+    offs_o = (
+        (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_obs
+        + cur_head * stride_oh
+        + offs_dv[None, :]
+    )
+    tl.store(
+        O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None] & mask_dv[None, :]
+    )
+
+
+def extend_attention_fwd(
+    q_extend,
+    k_extend,
+    v_extend,
+    o_extend,
+    k_buffer,
+    v_buffer,
+    req_to_tokens,
+    b_req_idx,
+    b_seq_len,
+    b_seq_len_extend,
+    b_start_loc_extend,
+    max_len_extend,
+    sm_scale=None,
+    logit_cap=0.0,
+):
+    """
+    q_extend, k_extend, v_extend, o_extend: contiguous tensors
+
+    k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
+    """
+    Lq, Lk, Lv = (
+        q_extend.shape[-1],
+        k_extend.shape[-1],
+        v_extend.shape[-1],
+    )
+
+    if Lq == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
+    elif Lq == 192:
+        BLOCK_DMODEL = 128
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = triton.next_power_of_2(Lq)
+        BLOCK_DPE = 0
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    if _is_hip:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+
+    else:
+        if _is_cuda and CUDA_CAPABILITY[0] >= 9:
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (128, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lk <= 64 else 8
+
+    sm_scale = sm_scale or 1.0 / (Lq**0.5)
+    batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
+    kv_group_num = q_extend.shape[1] // k_extend.shape[1]
+
+    grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
+    num_stages = 1
+
+    extra_kargs = {}
+    if _is_hip:
+        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+
+    _fwd_kernel[grid](
+        q_extend,
+        k_extend,
+        v_extend,
+        o_extend,
+        k_buffer,
+        v_buffer,
+        req_to_tokens,
+        b_req_idx,
+        b_seq_len,
+        b_start_loc_extend,
+        b_seq_len_extend,
+        sm_scale,
+        kv_group_num,
+        q_extend.stride(0),
+        q_extend.stride(1),
+        k_extend.stride(0),
+        k_extend.stride(1),
+        v_extend.stride(0),
+        v_extend.stride(1),
+        o_extend.stride(0),
+        o_extend.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        req_to_tokens.stride(0),
+        logit_cap=logit_cap,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+        Lq=Lq,
+        Lv=Lv,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        **extra_kargs,
+    )
diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
new file mode 100644
index 000000000..e91467743
--- /dev/null
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -0,0 +1,550 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for prefill.
+It supports page size = 1 and prefill with KV cache (i.e. extend).
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+    context_attention_fwd,
+)
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+_is_hip = is_hip()
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_kernel(
+    Q_Extend,
+    K_Extend,
+    V_Extend,
+    O_Extend,
+    K_Buffer,
+    V_Buffer,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    mask_ptr,
+    mask_indptr,
+    sink_ptr,
+    window_kv_offset_ptr,
+    sm_scale,
+    kv_group_num,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_obs,
+    stride_oh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    SLIDING_WINDOW_SIZE: tl.constexpr,
+    logit_cap: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
+    Lq: tl.constexpr,
+    Lv: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    USE_CUSTOM_MASK: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    SKIP_PREFIX_CUSTOM_MASK: tl.constexpr,
+    STORE_TRANSPOSE: tl.constexpr,
+    HAS_SINK: tl.constexpr,
+):
+    cur_seq = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    cur_block_m = tl.program_id(2)
+    cur_kv_head = cur_head // kv_group_num
+
+    cur_seq_extend_start_idx = tl.load(qo_indptr + cur_seq)
+    cur_seq_len_extend = tl.load(qo_indptr + cur_seq + 1) - cur_seq_extend_start_idx
+    cur_seq_kv_start_idx = tl.load(kv_indptr + cur_seq)
+    cur_seq_len_prefix = tl.load(kv_indptr + cur_seq + 1) - cur_seq_kv_start_idx
+    cur_seq_len = cur_seq_len_prefix + cur_seq_len_extend
+
+    if USE_CUSTOM_MASK:
+        cur_seq_mask_start_idx = tl.load(mask_indptr + cur_seq)
+
+    # For SWA, we should only load the mask in the sliding window
+    window_kv_offset = 0
+    if USE_CUSTOM_MASK and SLIDING_WINDOW_SIZE > 0:
+        window_kv_offset = tl.load(window_kv_offset_ptr + cur_seq)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    offs_m = tl.arange(0, BLOCK_M)
+    mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
+
+    mask_d = offs_d < Lq
+    mask_dv = offs_dv < Lv
+
+    if xai_temperature_len > 0:
+        offs_qidx = cur_seq_len_prefix + cur_block_m * BLOCK_M + offs_m
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        xai_temperature_reg = tl.where(
+            offs_qidx > xai_temperature_len,
+            tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale,
+            1.0,
+        )
+
+    offs_q = (
+        (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    q = tl.load(
+        Q_Extend + offs_q, mask=(mask_m[:, None]) & (mask_d[None, :]), other=0.0
+    )
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None])
+            * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
+
+    # stage 1: compute scores with prefix
+    offs_n = tl.arange(0, BLOCK_N)
+
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
+    deno = tl.zeros([BLOCK_M], dtype=tl.float32)
+    e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+
+    for start_n in range(0, cur_seq_len_prefix, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_seq_len_prefix
+
+        final_mask = mask_m[:, None] & mask_n[None, :]
+        if USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK:
+            custom_mask = tl.load(
+                mask_ptr
+                + cur_seq_mask_start_idx
+                + (cur_block_m * BLOCK_M + offs_m[:, None])
+                * (cur_seq_len + window_kv_offset)
+                + window_kv_offset
+                + start_n
+                + offs_n[None, :],
+                mask=(mask_m[:, None] & mask_n[None, :]),
+                other=0,
+            )
+            final_mask &= custom_mask
+        if SLIDING_WINDOW_SIZE > 0:
+            # Add mask where q_id <= kv_id + sliding_window_size
+            # q_id = prefix_len + cur_m, kv_id = cur_n
+            window_mask = (
+                cur_seq_len_prefix + cur_block_m * BLOCK_M + offs_m[:, None]
+            ) <= (start_n + offs_n[None, :] + SLIDING_WINDOW_SIZE)
+            final_mask &= window_mask
+
+        SKIP_TILE = False
+        if (USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK) or SLIDING_WINDOW_SIZE > 0:
+            SKIP_TILE = tl.max(tl.max(final_mask.to(tl.int32), axis=1), axis=0) == 0
+
+        if not SKIP_TILE:
+            offs_kv_loc = tl.load(
+                kv_indices + cur_seq_kv_start_idx + start_n + offs_n,
+                mask=mask_n,
+                other=0,
+            )
+
+            # load k in transposed way
+            offs_buf_k = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[:, None]
+            )
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(mask_n[None, :]) & (mask_d[:, None]),
+                other=0.0,
+            )
+
+            qk = tl.dot(q.to(k.dtype), k)
+            if BLOCK_DPE > 0:
+                offs_kpe = (
+                    offs_kv_loc[None, :] * stride_buf_kbs
+                    + cur_kv_head * stride_buf_kh
+                    + offs_dpe[:, None]
+                )
+                kpe = tl.load(
+                    K_Buffer + offs_kpe,
+                    mask=mask_n[None, :],
+                    other=0.0,
+                )
+                qk += tl.dot(qpe.to(kpe.dtype), kpe)
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
+
+            qk = tl.where(final_mask, qk, float("-inf"))
+
+            row_max = tl.max(qk, 1)
+            row_max_fixed = tl.where(row_max == float("-inf"), -1e20, row_max)
+            n_e_max = tl.maximum(row_max_fixed, e_max)
+
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            deno = deno * re_scale + tl.sum(p, 1)
+
+            offs_buf_v = (
+                offs_kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=mask_n[:, None] & mask_dv[None, :],
+                other=0.0,
+            )
+            p = p.to(v.dtype)
+            acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+            e_max = n_e_max
+
+    # stage 2: compute the triangle part
+
+    cur_block_m_end = (
+        cur_seq_len_extend
+        if not IS_CAUSAL
+        else tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
+    )
+    for start_n in range(0, cur_block_m_end, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_block_m_end
+
+        final_mask = mask_m[:, None] & mask_n[None, :]
+        if USE_CUSTOM_MASK:
+            custom_mask = tl.load(
+                mask_ptr
+                + cur_seq_mask_start_idx
+                + (cur_block_m * BLOCK_M + offs_m[:, None])
+                * (cur_seq_len + window_kv_offset)
+                + window_kv_offset
+                + cur_seq_len_prefix
+                + start_n
+                + offs_n[None, :],
+                mask=(mask_m[:, None] & mask_n[None, :]),
+                other=0,
+            )
+            custom_mask &= mask_m[:, None] & mask_n[None, :]
+            final_mask &= custom_mask
+        elif IS_CAUSAL:
+            mask_causual = (cur_block_m * BLOCK_M + offs_m[:, None]) >= (
+                start_n + offs_n[None, :]
+            )
+            mask_causual &= mask_m[:, None] & mask_n[None, :]
+            final_mask &= mask_causual
+        else:
+            mask_non_causal = mask_m[:, None] & mask_n[None, :]
+            final_mask &= mask_non_causal
+
+        if SLIDING_WINDOW_SIZE > 0:
+            # Add mask where q_id <= kv_id + sliding_window_size
+            window_mask = (cur_block_m * BLOCK_M + offs_m[:, None]) <= (
+                start_n + offs_n[None, :] + SLIDING_WINDOW_SIZE
+            )
+            final_mask &= window_mask
+
+        SKIP_TILE = False
+        if USE_CUSTOM_MASK or SLIDING_WINDOW_SIZE > 0:
+            SKIP_TILE = tl.max(tl.max(final_mask.to(tl.int32), axis=1), axis=0) == 0
+
+        if not SKIP_TILE:
+            # load k in transposed way
+            offs_k = (
+                (cur_seq_extend_start_idx + start_n + offs_n[None, :]) * stride_kbs
+                + cur_kv_head * stride_kh
+                + offs_d[:, None]
+            )
+            k = tl.load(
+                K_Extend + offs_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+            )
+
+            qk = tl.dot(q, k, out_dtype=tl.float32)
+            if BLOCK_DPE > 0:
+                offs_kpe = (
+                    (cur_seq_extend_start_idx + start_n + offs_n[None, :]) * stride_kbs
+                    + cur_kv_head * stride_kh
+                    + offs_dpe[:, None]
+                )
+                kpe = tl.load(
+                    K_Extend + offs_kpe,
+                    mask=mask_n[None, :],
+                    other=0.0,
+                )
+                qk += tl.dot(qpe, kpe)
+
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
+
+            qk = tl.where(final_mask, qk, float("-inf"))
+
+            row_max = tl.max(qk, 1)
+            row_max_fixed = tl.where(row_max == float("-inf"), -1e20, row_max)
+            n_e_max = tl.maximum(row_max_fixed, e_max)
+
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            deno = deno * re_scale + tl.sum(p, 1)
+
+            offs_v = (
+                (cur_seq_extend_start_idx + start_n + offs_n[:, None]) * stride_vbs
+                + cur_kv_head * stride_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Extend + offs_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+            )
+            p = p.to(v.dtype)
+            acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+            e_max = n_e_max
+
+    if HAS_SINK:
+        cur_sink = tl.load(sink_ptr + cur_head)
+        deno += tl.exp(cur_sink - e_max)
+
+    offs_o = (
+        (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_obs
+        + cur_head * stride_oh
+        + offs_dv[None, :]
+    )
+    if STORE_TRANSPOSE:
+        tl.store(
+            O_Extend + offs_o.T,
+            (acc / deno[:, None]).T,
+            mask=(mask_m[:, None] & mask_dv[None, :]).T,
+        )
+    else:
+        tl.store(
+            O_Extend + offs_o,
+            acc / deno[:, None],
+            mask=mask_m[:, None] & mask_dv[None, :],
+        )
+
+
+def extend_attention_fwd(
+    q_extend,
+    k_extend,
+    v_extend,
+    o_extend,
+    k_buffer,
+    v_buffer,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    custom_mask,
+    is_causal,
+    mask_indptr,
+    max_len_extend,
+    sm_scale=None,
+    logit_cap=0.0,
+    skip_prefix_custom_mask=True,
+    sliding_window_size=-1,
+    sinks=None,
+    window_kv_offsets=None,
+    xai_temperature_len=-1,
+):
+    """
+    q_extend, k_extend, v_extend, o_extend: contiguous tensors
+
+    k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
+    """
+    Lq, Lk, Lv = (
+        q_extend.shape[-1],
+        k_extend.shape[-1],
+        v_extend.shape[-1],
+    )
+
+    if Lq == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
+    elif Lq == 192:
+        BLOCK_DMODEL = 128
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = triton.next_power_of_2(Lq)
+        BLOCK_DPE = 0
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    if _is_hip:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+
+    else:
+        if _is_cuda and CUDA_CAPABILITY[0] >= 9:
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
+            # sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)
+            if CUDA_CAPABILITY[1] == 9 or CUDA_CAPABILITY[1] == 6:
+                if Lq <= 128:
+                    BLOCK_M, BLOCK_N = (64, 128)
+                elif Lq <= 256:
+                    BLOCK_M, BLOCK_N = (64, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 32)
+            else:
+                if Lq <= 128:
+                    BLOCK_M, BLOCK_N = (128, 128)
+                elif Lq <= 256:
+                    BLOCK_M, BLOCK_N = (64, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lk <= 64 else 8
+
+    sm_scale = sm_scale or 1.0 / (Lq**0.5)
+    batch_size, head_num = qo_indptr.shape[0] - 1, q_extend.shape[1]
+    kv_group_num = q_extend.shape[1] // k_extend.shape[1]
+
+    USE_CUSTOM_MASK = custom_mask is not None
+    # Skip custom mask for prefix part
+    SKIP_PREFIX_CUSTOM_MASK = skip_prefix_custom_mask
+
+    HAS_SINK = sinks is not None
+
+    grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
+    num_stages = 1
+
+    extra_kargs = {}
+    if _is_hip:
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+
+    _fwd_kernel[grid](
+        q_extend,
+        k_extend,
+        v_extend,
+        o_extend,
+        k_buffer,
+        v_buffer,
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        custom_mask,
+        mask_indptr,
+        sinks,
+        window_kv_offsets,
+        sm_scale,
+        kv_group_num,
+        q_extend.stride(0),
+        q_extend.stride(1),
+        k_extend.stride(0),
+        k_extend.stride(1),
+        v_extend.stride(0),
+        v_extend.stride(1),
+        o_extend.stride(0),
+        o_extend.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        SLIDING_WINDOW_SIZE=sliding_window_size,
+        logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+        Lq=Lq,
+        Lv=Lv,
+        USE_CUSTOM_MASK=USE_CUSTOM_MASK,
+        IS_CAUSAL=is_causal,
+        SKIP_PREFIX_CUSTOM_MASK=SKIP_PREFIX_CUSTOM_MASK,
+        HAS_SINK=HAS_SINK,
+        STORE_TRANSPOSE=_is_hip,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        **extra_kargs,
+    )
+
+
+def redundant_attention(
+    q_extend,
+    o_extend,
+    k_buffer,
+    v_buffer,
+    b_req_idx,
+    b_start_loc,
+    b_seq_len,
+    b_seq_len_prefix,
+    max_len_in_batch,
+):
+    total_token_num = k_buffer.shape[0]
+    B, H_Q, D = b_req_idx.shape[0], q_extend.shape[-2], q_extend.shape[-1]
+    q_buffer = torch.empty(
+        (total_token_num, H_Q, D), dtype=q_extend.dtype, device=q_extend.device
+    )
+
+    pt = 0
+    for i in range(B):
+        cur_seq_len_extend = b_seq_len[i] - b_seq_len_prefix[i]
+        pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i]
+        q_buffer[pl:pr] = q_extend[pt : pt + cur_seq_len_extend]
+        pt += cur_seq_len_extend
+
+    o_buffer = torch.empty_like(q_buffer)
+    context_attention_fwd(
+        q_buffer, k_buffer, v_buffer, o_buffer, b_start_loc, b_seq_len, max_len_in_batch
+    )
+
+    pt = 0
+    for i in range(B):
+        cur_seq_len_extend = b_seq_len[i] - b_seq_len_prefix[i]
+        pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i]
+        o_extend[pt : pt + cur_seq_len_extend] = o_buffer[pl:pr]
+        pt += cur_seq_len_extend
diff --git a/python/sglang/srt/layers/attention/triton_ops/merge_state.py b/python/sglang/srt/layers/attention/triton_ops/merge_state.py
new file mode 100644
index 000000000..955097fc9
--- /dev/null
+++ b/python/sglang/srt/layers/attention/triton_ops/merge_state.py
@@ -0,0 +1,96 @@
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def merge_state_kernel(
+    output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_merged
+    output_lse,  # [NUM_TOKENS, NUM_HEADS] s_merged
+    prefix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_a
+    prefix_lse,  # [NUM_TOKENS, NUM_HEADS] s_a
+    suffix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_b
+    suffix_lse,  # [NUM_TOKENS, NUM_HEADS] s_b
+    HEAD_SIZE: tl.constexpr,
+    PADDED_HEAD_SIZE: tl.constexpr,
+    OUTPUT_LSE: tl.constexpr,
+):
+    token_idx = tl.program_id(0)
+    num_tokens = tl.num_programs(0)
+    head_idx = tl.program_id(1)
+    num_heads = tl.num_programs(1)
+
+    p_lse = tl.load(prefix_lse + token_idx * num_heads + head_idx)
+    s_lse = tl.load(suffix_lse + token_idx * num_heads + head_idx)
+    p_lse = float("-inf") if p_lse == float("inf") else p_lse
+    s_lse = float("-inf") if s_lse == float("inf") else s_lse
+
+    max_lse = tl.maximum(p_lse, s_lse)
+    p_lse = p_lse - max_lse
+    s_lse = s_lse - max_lse
+    out_se = tl.exp(p_lse) + tl.exp(s_lse)
+
+    if OUTPUT_LSE:
+        out_lse = tl.log(out_se) + max_lse
+        tl.store(output_lse + token_idx * num_heads + head_idx, out_lse)
+
+    head_arange = tl.arange(0, PADDED_HEAD_SIZE)
+    head_mask = head_arange < HEAD_SIZE
+    p_out = tl.load(
+        prefix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+    s_out = tl.load(
+        suffix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+
+    p_scale = tl.exp(p_lse) / out_se
+    s_scale = tl.exp(s_lse) / out_se
+    out = p_out * p_scale + s_out * s_scale
+    tl.store(
+        output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange,
+        out,
+        mask=head_mask,
+    )
+
+
+def merge_state_triton(
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output: Optional[torch.Tensor] = None,
+    output_lse: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    # Avoid creating new tensors if they are already provided
+    if output is None:
+        output = torch.empty_like(prefix_output)
+    if output_lse is None:
+        output_lse = torch.empty_like(prefix_lse)
+
+    num_tokens = output.shape[0]
+    num_query_heads = output.shape[1]
+    head_size = output.shape[2]
+    padded_head_size = triton.next_power_of_2(head_size)
+
+    merge_state_kernel[(num_tokens, num_query_heads)](
+        output,
+        output_lse,
+        prefix_output,
+        prefix_lse,
+        suffix_output,
+        suffix_lse,
+        head_size,
+        padded_head_size,
+        output_lse is not None,
+    )
+    return output, output_lse
diff --git a/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py b/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py
new file mode 100644
index 000000000..ac0fc72af
--- /dev/null
+++ b/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py
@@ -0,0 +1,217 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for prefill.
+It supporst page size = 1.
+"""
+
+# Adapted from
+# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+if _is_cuda or _is_hip:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+
+@triton.jit
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    sm_scale,
+    B_Start_Loc,
+    B_Seqlen,
+    Out,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_obs,
+    stride_oh,
+    kv_group_num: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    Lk: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_m = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+    block_start_loc = BLOCK_M * start_m
+
+    # initialize offsets
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None]
+    off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :]
+
+    mask_d = offs_d < Lk
+
+    q = tl.load(
+        Q + off_q,
+        mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]),
+        other=0.0,
+    )
+
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+
+    # initialize pointer to m and l
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+    block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)
+
+    end_n = (
+        cur_batch_seq_len
+        if not IS_CAUSAL
+        else tl.minimum((start_m + 1) * BLOCK_M, cur_batch_seq_len)
+    )
+    for start_n in range(0, block_mask * end_n, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk ----
+        k = tl.load(
+            k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
+            mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]),
+            other=0.0,
+        )
+        # mask = tl.load(mask_ptrs + start_n, mask=start_n + offs_n < cur_batch_end_loc, other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        qk *= sm_scale
+
+        if IS_CAUSAL:
+            qk += tl.where(
+                (start_n + offs_n[None, :] < cur_batch_seq_len)
+                & (offs_m[:, None] >= (start_n + offs_n[None, :])),
+                0,
+                float("-inf"),
+            )
+        else:
+            qk += tl.where(
+                (start_n + offs_n[None, :]) < cur_batch_seq_len, 0, float("-inf")
+            )
+
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        p = tl.exp(qk - m_ij[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+        m_i_new = tl.maximum(m_i, m_ij)
+        alpha = tl.exp(m_i - m_i_new)
+        beta = tl.exp(m_ij - m_i_new)
+        l_i_new = alpha * l_i + beta * l_ij
+        # -- update output accumulator --
+        # scale p
+        p_scale = beta / l_i_new
+        p = p * p_scale[:, None]
+        # scale acc
+        acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # update acc
+        v = tl.load(
+            v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
+            mask=((start_n + offs_n[:, None]) < cur_batch_seq_len) & (mask_d[None, :]),
+            other=0.0,
+        )
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+    # initialize pointers to output
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
+        + cur_head * stride_oh
+        + offs_d[None, :]
+    )
+    out_ptrs = Out + off_o
+    tl.store(
+        out_ptrs, acc, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :])
+    )
+
+
+def context_attention_fwd(
+    q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
+):
+    """
+    q, k, v: [b * s, head, head_dim]
+    b_start_loc: [b]
+    b_seq_len: [b]
+    out: [b * s, head, head_dim]
+    """
+    if (_is_cuda or _is_hip) and CUDA_CAPABILITY[0] > 8:
+        BLOCK = 128
+    else:
+        BLOCK = 64
+
+    Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+
+    sm_scale = 1.0 / (Lq**0.5)
+    batch, head = b_seq_len.shape[0], q.shape[1]
+    kv_group_num = q.shape[1] // k.shape[1]
+
+    grid = (batch, head, triton.cdiv(max_input_len, BLOCK))
+    num_warps = 4 if Lk <= 64 else 8
+
+    _fwd_kernel[grid](
+        q,
+        k,
+        v,
+        sm_scale,
+        b_start_loc,
+        b_seq_len,
+        o,
+        q.stride(0),
+        q.stride(1),
+        k.stride(0),
+        k.stride(1),
+        v.stride(0),
+        v.stride(1),
+        o.stride(0),
+        o.stride(1),
+        kv_group_num=kv_group_num,
+        BLOCK_M=BLOCK,
+        BLOCK_DMODEL=triton.next_power_of_2(Lk),
+        BLOCK_N=BLOCK,
+        IS_CAUSAL=is_causal,
+        num_warps=num_warps,
+        num_stages=1,
+        Lk=Lk,
+    )
diff --git a/python/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py b/python/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py
new file mode 100644
index 000000000..89b17ce2d
--- /dev/null
+++ b/python/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py
@@ -0,0 +1,439 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for decoding.
+It supports page size = 1.
+"""
+
+# Adapted from
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py
+
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import (
+    _decode_softmax_reducev_fwd,
+)
+
+
+def is_hip():
+    return triton.runtime.driver.active.get_current_target().backend == "hip"
+
+
+_is_hip = is_hip()
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_grouped_kernel_stage1_rope(
+    Q,  # Holds [Q_NOPE; Q_PE], b x h x (d+r)
+    K_Buffer,  # Holds [KV; K_PE], b*s x (c+r)
+    V_buffer,  # Holds [KV], b*s x (c)
+    cos_sin_cache,  # max_seq_len x (rotary_dim * 2)
+    positions,  # sequence positions
+    sm_scale,
+    kv_indptr,
+    kv_indices,
+    Att_Out,  # b x h x NUM_KV_SPLITS x (kv_lora_rank + 1)
+    k_pe_t_out,
+    stride_qb,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_vbs,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_kpe_tokens_out_b,
+    stride_cos_sin_cache_s,
+    stride_positions_b,
+    rotary_dim: tl.constexpr,
+    kv_lora_rank: tl.constexpr,
+    qk_rope_head_dim: tl.constexpr,
+    kv_group_num: tl.constexpr,
+    q_head_num: tl.constexpr,
+    BLOCK_C: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    NUM_KV_SPLITS: tl.constexpr,
+    logit_cap: tl.constexpr,
+    USE_ROPE: tl.constexpr,
+    IS_NEOX_STYLE: tl.constexpr,
+):
+
+    cur_batch = tl.program_id(0)
+    cur_head_id = tl.program_id(1)
+    split_kv_id = tl.program_id(2)
+
+    if BLOCK_H < kv_group_num:
+        VALID_BLOCK_H: tl.constexpr = BLOCK_H
+    else:
+        VALID_BLOCK_H: tl.constexpr = kv_group_num
+    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
+    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
+    mask_h = mask_h & (cur_head < q_head_num)
+
+    offs_c = tl.arange(0, BLOCK_C)
+    offs_qk_r = tl.arange(kv_lora_rank, kv_lora_rank + BLOCK_R)  # to get the k_pe
+
+    off_q_pe = (
+        cur_batch * stride_qb + cur_head[:, None] * stride_qh + offs_qk_r[None, :]
+    )
+    offs_q = cur_batch * stride_qb + cur_head[:, None] * stride_qh + offs_c[None, :]
+
+    mask_c = offs_c < kv_lora_rank
+    mask_qk_r = offs_qk_r < (kv_lora_rank + qk_rope_head_dim)
+
+    cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
+
+    q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_c[None, :]), other=0.0)
+    q_pe = tl.load(
+        Q + off_q_pe, mask=(mask_h[:, None]) & (mask_qk_r[None, :]), other=0.0
+    )
+
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+    # apply rotary embedding for q_pe, and k_pe (last token per batch of K_PE)
+    LAST_SPLIT = split_kv_end == cur_batch_seq_len
+    k_pe_last_token = tl.zeros([BLOCK_R], dtype=q.dtype)
+
+    if USE_ROPE:
+        if IS_NEOX_STYLE:
+            # [BLOCK_ROTARY // 2, BLOCK_ROTARY // 2 + 1, BLOCK_ROTARY // 2 + 2, ..., 0, 1, 2, ..., BLOCK_ROTARY // 2 - 1, pass:]
+            offs_qk_rot_r = kv_lora_rank + (
+                (tl.arange(0, BLOCK_R) + (rotary_dim // 2)) % rotary_dim
+            )
+            # Which elements to flip
+            mask_rotate = tl.arange(0, BLOCK_R) < (rotary_dim // 2)
+            # [0 , 1, 2, ..., rotary_dim // 2 - 1, 0 , 1, 2, ..., rotary_dim // 2 - 1]
+            offs_rotary = tl.arange(0, BLOCK_R) % (rotary_dim // 2)
+        else:
+            # [1, 0, 3, 2, 5, 4, ..., BLOCK_R, BLOCK_R - 1]
+            offs_qk_rot_r = (
+                kv_lora_rank
+                + (((tl.arange(0, BLOCK_R) + 1) % 2) * 2)
+                - 1
+                + tl.arange(0, BLOCK_R)
+            )
+            mask_rotate = tl.arange(0, BLOCK_R) % 2 < 1
+            # [0, 0, 1, 1, ..., rotary_dim // 2 - 1, rotary_dim // 2 - 1]
+            offs_rotary = tl.arange(0, BLOCK_R) // 2
+
+        if qk_rope_head_dim > rotary_dim:
+            offs_qk_rot_r = tl.where(
+                tl.arange(0, BLOCK_R) < rotary_dim, offs_qk_rot_r, tl.arange(0, BLOCK_R)
+            )
+            offs_rotary = tl.where(
+                tl.arange(0, BLOCK_R) < rotary_dim, offs_rotary, tl.arange(0, BLOCK_R)
+            )
+
+        mask_rotary = tl.arange(0, BLOCK_R) < rotary_dim
+
+        pos = tl.load(positions + cur_batch * stride_positions_b)
+        cos = tl.load(
+            cos_sin_cache + pos * stride_cos_sin_cache_s + offs_rotary,
+            mask=mask_rotary,
+            other=1.0,
+        )
+        sin = tl.load(
+            cos_sin_cache
+            + pos * stride_cos_sin_cache_s
+            + offs_rotary
+            + rotary_dim // 2,
+            mask_rotary,
+            other=0.0,
+        )
+
+        off_q_pe_rot = (
+            cur_batch * stride_qb
+            + cur_head[:, None] * stride_qh
+            + offs_qk_rot_r[None, :]
+        )
+        mask_qk_rot_r = offs_qk_rot_r < (kv_lora_rank + qk_rope_head_dim)
+
+        # 0, 2, 4,.... 1, 3, 5...
+        q_pe_rot = tl.load(
+            Q + off_q_pe_rot,
+            mask=(mask_h[:, None]) & (mask_qk_rot_r[None, :]),
+            other=0.0,
+        )
+        q_pe_rot = tl.where(mask_rotate[None, :], -q_pe_rot, q_pe_rot)
+
+        q_pe = q_pe * cos + q_pe_rot * sin
+
+        # we only apply to the last token in the K_PE
+        if LAST_SPLIT:
+            # debug assert
+            if (cur_batch == 0 and cur_head == 0) and split_kv_id < NUM_KV_SPLITS - 1:
+                tl.device_assert(False, "Only last split should compute k_pe")
+
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + cur_batch_seq_len - 1
+            )
+            offs_buf_k_pe_last_token = kv_loc * stride_buf_kbs + offs_qk_r
+            offs_buf_k_pe_rot_last_token = kv_loc * stride_buf_kbs + offs_qk_rot_r
+            k_pe_last_token = tl.load(K_Buffer + offs_buf_k_pe_last_token)
+
+            k_pe_rot_last_token = tl.load(K_Buffer + offs_buf_k_pe_rot_last_token)
+            k_pe_rot_last_token = tl.where(
+                mask_rotate, -k_pe_rot_last_token, k_pe_rot_last_token
+            )
+
+            k_pe_last_token = k_pe_last_token * cos + k_pe_rot_last_token * sin
+
+    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_H, BLOCK_C], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + offs_n,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+
+            offs_buf_kv = kv_loc[None, :] * stride_buf_kbs + offs_c[:, None]
+            offs_buf_k_pe = kv_loc[None, :] * stride_buf_kbs + offs_qk_r[:, None]
+
+            k_pe = tl.load(
+                K_Buffer + offs_buf_k_pe,
+                mask=(offs_n[None, :] < split_kv_end) & (mask_qk_r[:, None]),
+                other=0.0,
+            )  # positional embedding part of keys
+
+            if (USE_ROPE and LAST_SPLIT) and start_n >= cur_batch_seq_len - BLOCK_N:
+                k_pe = tl.where(
+                    offs_n[None, :] != (split_kv_end - 1),
+                    k_pe,
+                    k_pe_last_token[:, None],
+                )
+
+            # (16, 64) x (64, 32)
+            # dot product of rope parts
+            qk = tl.dot(q_pe, k_pe.to(q_pe.dtype))
+
+            kv = tl.load(
+                K_Buffer + offs_buf_kv,
+                mask=(offs_n[None, :] < split_kv_end) & (mask_c[:, None]),
+                other=0.0,
+            )  # the shared latent tensor for keys and values
+
+            # (16, 512) x (512, 32)
+            # dot product of nope parts
+            qk += tl.dot(q, kv)
+
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            qk = tl.where(
+                mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf")
+            )
+
+            offs_buf_v = kv_loc[:, None] * stride_buf_vbs + offs_c[None, :]
+            v = tl.load(
+                V_buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_c[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            acc *= re_scale[:, None]
+            # (16, 32) x (32, 512)
+            acc += tl.dot(p.to(v.dtype), v)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 1)
+            e_max = n_e_max
+
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head[:, None] * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_c[None, :]
+        )
+
+        if USE_ROPE:
+            if LAST_SPLIT:
+                k_pe_last_token_ptrs = (
+                    k_pe_t_out
+                    + cur_batch * stride_kpe_tokens_out_b
+                    + tl.arange(0, BLOCK_R)
+                )
+                tl.store(k_pe_last_token_ptrs, k_pe_last_token, mask=mask_qk_r)
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum[:, None],
+            mask=(mask_h[:, None]) & (mask_c[None, :]),
+        )
+
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + kv_lora_rank
+        )
+
+        tl.store(
+            Att_Out + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+            mask=mask_h,
+        )
+
+
+# TODO rope offset
+def _decode_grouped_att_m_fwd_rope(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    k_pe_tokens_out,
+    kv_lora_rank,  # c
+    cos_sin_cache,
+    positions,
+    rotary_dim,
+    kv_indptr,
+    kv_indices,
+    num_kv_splits,
+    sm_scale,
+    logit_cap,
+    use_rope,
+    is_neox_style=True,
+):
+    if use_rope:
+        assert (
+            k_pe_tokens_out is not None
+        ), "We must output the k_pe tokens with rope applied if rope fusion enabled."
+
+    BLOCK = 32
+
+    # # [TODO] work around shmem limit on MI3xx
+    # if _is_hip and kv_lora_rank >= 576:
+    #     BLOCK = 16
+
+    qk_rope_head_dim = k_buffer.shape[-1] - kv_lora_rank
+    batch, head_num = kv_indptr.shape[0] - 1, q.shape[1]
+    kv_group_num = q.shape[1] // k_buffer.shape[1]
+
+    BLOCK_C = triton.next_power_of_2(kv_lora_rank)
+    BLOCK_R = triton.next_power_of_2(qk_rope_head_dim)
+
+    BLOCK_H = 16
+    NUM_KV_SPLITS = num_kv_splits
+    grid = (
+        batch,
+        triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
+        NUM_KV_SPLITS,
+    )
+
+    extra_kargs = {}
+    num_stages = 2
+    if _is_hip:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+        num_stages = 1
+
+    _fwd_grouped_kernel_stage1_rope[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        cos_sin_cache,
+        positions,
+        sm_scale,
+        kv_indptr,
+        kv_indices,
+        att_out,
+        k_pe_tokens_out,
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(0),
+        v_buffer.stride(0),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        k_pe_tokens_out.stride(0) if use_rope else 0,
+        cos_sin_cache.stride(0) if use_rope else 0,
+        positions.stride(0) if use_rope else 0,
+        rotary_dim,
+        kv_lora_rank,
+        qk_rope_head_dim,
+        kv_group_num=kv_group_num,
+        q_head_num=head_num,
+        BLOCK_C=BLOCK_C,
+        BLOCK_R=BLOCK_R,
+        BLOCK_N=BLOCK,
+        BLOCK_H=BLOCK_H,
+        NUM_KV_SPLITS=NUM_KV_SPLITS,
+        logit_cap=logit_cap,
+        USE_ROPE=use_rope,
+        IS_NEOX_STYLE=is_neox_style,
+        num_warps=4,
+        num_stages=num_stages,
+        **extra_kargs
+    )
+
+
+def decode_attention_fwd_grouped_rope(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    k_pe_tokens,
+    kv_lora_rank,
+    rotary_dim,
+    cos_sin_cache,
+    positions,
+    attn_logits,
+    num_kv_splits,
+    sm_scale,
+    logit_cap=0.0,
+    use_rope=False,
+    is_neox_style=False,
+):
+    _decode_grouped_att_m_fwd_rope(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        k_pe_tokens,
+        kv_lora_rank,
+        cos_sin_cache,
+        positions,
+        rotary_dim,
+        kv_indptr,
+        kv_indices,
+        num_kv_splits,
+        sm_scale,
+        logit_cap,
+        use_rope,
+        is_neox_style,
+    )
+    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, kv_indptr, num_kv_splits)
diff --git a/python/sglang/srt/layers/attention/trtllm_mha_backend.py b/python/sglang/srt/layers/attention/trtllm_mha_backend.py
new file mode 100644
index 000000000..a48cc9794
--- /dev/null
+++ b/python/sglang/srt/layers/attention/trtllm_mha_backend.py
@@ -0,0 +1,694 @@
+from __future__ import annotations
+
+"""
+Support attention backend for TRTLLM MHA kernels from flashinfer.
+The kernel supports sm100 only, with sliding window and attention sink features.
+"""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.attention.flashinfer_backend import (
+    FlashInferAttnBackend,
+    FlashInferMultiStepDraftBackend,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_flashinfer_available
+
+if is_flashinfer_available():
+    import flashinfer
+
+from sglang.srt.speculative.eagle_utils import EagleDraftInput
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
+
+# Constants
+DEFAULT_WORKSPACE_SIZE_MB = (
+    512  # Memory workspace size in MB, todo(Yingyi): read from config
+)
+
+# Reuse this workspace buffer across all TRTLLM MHA wrappers
+global_zero_init_workspace_buffer = None
+
+
+@dataclass
+class TRTLLMMHAMetadata:
+    # Sequence lengths for the forward batch
+    cache_seqlens_int32: torch.Tensor = None
+    # Maximum sequence length for query
+    max_seq_len_q: int = 1
+    # Maximum sequence length for key
+    max_seq_len_k: int = 0
+    # Cumulative sequence lengths for `query
+    cu_seqlens_q: torch.Tensor = None
+    # Cumulative sequence lengths for key
+    cu_seqlens_k: torch.Tensor = None
+    # Page table, the index of KV Cache Tables/Blocks
+    page_table: torch.Tensor = None
+
+
+class TRTLLMHAAttnBackend(FlashInferAttnBackend):
+    """TRTLLM MHA attention kernel from flashinfer."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+        speculative_step_id: int = 0,
+    ):
+        super().__init__(
+            model_runner, skip_prefill, kv_indptr_buf, kv_last_page_len_buf
+        )
+
+        config = model_runner.model_config
+
+        # MHA-specific dimensions
+        self.max_context_len = model_runner.model_config.context_len
+        self.hidden_size = config.hidden_size
+
+        # Runtime parameters
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.page_size = model_runner.page_size
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.device = model_runner.device
+
+        # Workspace allocation
+        self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
+        # Allocate buffers
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
+                self.workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_zero_init_workspace_buffer
+
+        # CUDA graph state
+        self.decode_cuda_graph_metadata = {}
+
+        # Speculative decoding
+        # Only support topk <= 1 for now.
+        self.topk = model_runner.server_args.speculative_eagle_topk or 0
+        self.speculative_step_id = speculative_step_id
+        self.target_verify_metadata = {}
+
+        self.speculative_num_draft_tokens = (
+            model_runner.server_args.speculative_num_draft_tokens
+        )
+
+        # Forward metadata
+        self.forward_metadata: Optional[TRTLLMMHAMetadata] = None
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        """Initialize CUDA graph state for TRTLLM MHA."""
+        max_num_pages = (self.max_context_len + self.page_size - 1) // self.page_size
+        self.decode_cuda_graph_metadata = {
+            "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+            "page_table": torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "strided_indices": torch.arange(
+                0, self.max_context_len, self.page_size, device=self.device
+            ),
+        }
+
+        if (
+            self.speculative_num_draft_tokens is not None
+            and self.speculative_num_draft_tokens > 0
+        ):
+            self.decode_cuda_graph_metadata["cu_seqlens_q"] = torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=self.device
+            )
+            self.decode_cuda_graph_metadata["cu_seqlens_k"] = torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            )
+            self.decode_cuda_graph_metadata["page_table_draft_decode"] = torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            self.target_verify_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+            self.draft_extend_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.zeros(
+                    max_bs + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+    ):
+        """Initialize metadata for CUDA graph capture."""
+        metadata = TRTLLMMHAMetadata()
+        device = seq_lens.device
+
+        if forward_mode.is_decode_or_idle():
+            if spec_info is not None:
+                # Draft Decode
+                # Here we only support topk = 1 for now.
+                metadata.cache_seqlens_int32 = self.decode_cuda_graph_metadata[
+                    "cache_seqlens"
+                ][:bs]
+                metadata.max_seq_len_k = seq_lens.max().item() + (
+                    self.speculative_step_id + 1
+                )
+                metadata.cu_seqlens_q = self.decode_cuda_graph_metadata["cu_seqlens_q"][
+                    : bs + 1
+                ]
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = self.decode_cuda_graph_metadata[
+                    "page_table_draft_decode"
+                ][:bs, :]
+                self.decode_cuda_graph_metadata[bs] = metadata
+            else:
+                # Normal Decode
+                # Get sequence information
+                metadata.cache_seqlens_int32 = seq_lens[:bs].to(torch.int32)
+                batch_size = len(seq_lens)
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+
+                # Precompute maximum sequence length
+                metadata.max_seq_len_k = seq_lens.max().item()
+                # Precompute cumulative sequence lengths
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                # Precompute page table
+                metadata.page_table = self.decode_cuda_graph_metadata["page_table"][
+                    :bs, :
+                ]
+                self.decode_cuda_graph_metadata[bs] = metadata
+        elif forward_mode.is_target_verify():
+            # Target Verify
+            # Here we only support topk = 1 for now.
+            metadata.cache_seqlens_int32 = self.target_verify_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(
+                (seq_lens + self.speculative_num_draft_tokens)
+            )
+
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * self.speculative_num_draft_tokens + 1,
+                self.speculative_num_draft_tokens,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.target_verify_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+
+            metadata.max_seq_len_q = self.speculative_num_draft_tokens
+            metadata.max_seq_len_k = (
+                seq_lens.max().item() + self.speculative_num_draft_tokens
+            )
+
+            metadata.page_table = self.target_verify_metadata["page_table"][:bs, :]
+
+            self.target_verify_metadata[bs] = metadata
+        elif forward_mode.is_draft_extend():
+            metadata.cache_seqlens_int32 = self.draft_extend_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+            num_tokens_per_bs = num_tokens // bs
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                num_tokens_per_bs,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.draft_extend_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+            num_tokens_per_bs = num_tokens // bs
+            metadata.max_seq_len_q = num_tokens_per_bs
+            metadata.max_seq_len_k = seq_lens.max().item()
+
+            metadata.page_table = self.draft_extend_metadata["page_table"][:bs, :]
+
+            self.draft_extend_metadata[bs] = metadata
+        self.forward_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Replay CUDA graph with new inputs."""
+        seq_lens = seq_lens[:bs]
+        seq_lens_cpu = seq_lens_cpu[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+        metadata = None
+        if forward_mode.is_decode_or_idle():
+            if spec_info is not None:
+                # Draft Decode
+                # Here we only support topk = 1 for now.
+                metadata = self.decode_cuda_graph_metadata[bs]
+                max_len = seq_lens_cpu.max().item()
+                metadata.max_seq_len_k = max_len + self.speculative_step_id + 1
+
+                max_seq_pages = (
+                    metadata.max_seq_len_k + self.page_size - 1
+                ) // self.page_size
+
+                metadata.cache_seqlens_int32.copy_(
+                    seq_lens + self.speculative_step_id + 1
+                )
+            else:
+                # Normal Decode
+                metadata = self.decode_cuda_graph_metadata[bs]
+                max_len = seq_lens_cpu.max().item()
+                max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+                metadata.max_seq_len_k = max_len
+
+                metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages][
+                    None, :
+                ],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+        elif forward_mode.is_target_verify():
+            # Here we only support topk = 1 for now.
+            metadata = self.target_verify_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(
+                (seq_lens + self.speculative_num_draft_tokens)
+            )
+
+            metadata.max_seq_len_k = (
+                seq_lens_cpu.max().item() + self.speculative_num_draft_tokens
+            )
+            max_len = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages],
+            ]
+            page_indices //= self.page_size
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices)
+        elif forward_mode.is_draft_extend():
+            metadata = self.draft_extend_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.max_seq_len_k = seq_lens_cpu.max().item()
+            max_len = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            accept_length = spec_info.accept_length[:bs]
+            if spec_info.accept_length_cpu:
+                metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
+            else:
+                metadata.max_seq_len_q = 1
+
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(accept_length, dim=0, dtype=torch.int32)
+            )
+
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.draft_extend_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+        self.forward_metadata = metadata
+
+    def get_cuda_graph_seq_len_fill_value(self) -> int:
+        """Get the fill value for sequence lengths in CUDA graph."""
+        return 1
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize the metadata for a forward pass."""
+
+        metadata = TRTLLMMHAMetadata()
+        seqlens_in_batch = forward_batch.seq_lens
+        batch_size = forward_batch.batch_size
+        device = seqlens_in_batch.device
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if forward_batch.spec_info is not None:
+                # Draft Decode
+                # Here we only support topk = 1 for now.
+                metadata.cache_seqlens_int32 = (
+                    seqlens_in_batch + (self.speculative_step_id + 1)
+                ).to(torch.int32)
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item() + (
+                    self.speculative_step_id + 1
+                )
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+            else:
+                # Normal Decode
+                metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+        elif forward_batch.forward_mode.is_target_verify():
+            # Only support topk = 1 for now.
+            metadata.cache_seqlens_int32 = (
+                forward_batch.seq_lens + self.speculative_num_draft_tokens
+            ).to(torch.int32)
+            metadata.max_seq_len_q = self.speculative_num_draft_tokens
+            metadata.max_seq_len_k = (
+                forward_batch.seq_lens_cpu.max().item()
+                + self.speculative_num_draft_tokens
+            )
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                batch_size * self.speculative_num_draft_tokens + 1,
+                self.speculative_num_draft_tokens,
+                dtype=torch.int32,
+                device=device,
+            )
+            metadata.cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32),
+                (1, 0),
+            )
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.max_seq_len_k
+            ]
+
+        else:
+            metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+            metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+            )
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.max_seq_len_k
+            ]
+
+            if (
+                any(forward_batch.extend_prefix_lens_cpu)
+                or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
+            ):
+                extend_seq_lens = forward_batch.extend_seq_lens
+                metadata.max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
+                metadata.cu_seqlens_q = torch.nn.functional.pad(
+                    torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+            else:
+                metadata.max_seq_len_q = metadata.max_seq_len_k
+                metadata.cu_seqlens_q = metadata.cu_seqlens_k
+
+        # Convert the page table to a strided format
+        if self.page_size > 1:
+            self.strided_indices = torch.arange(
+                0, metadata.page_table.shape[1], self.page_size, device=self.device
+            )
+            metadata.page_table = (
+                metadata.page_table[:, self.strided_indices] // self.page_size
+            )
+
+        self.forward_metadata = metadata
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Run forward for decode using TRTLLM MHA kernel."""
+        cache_loc = forward_batch.out_cache_loc
+        if save_kv_cache and k is not None:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+            )
+
+        q = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+        k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+        # shape conversion:
+        # [num_pages, page_size, num_kv_heads, head_dim] -> [num_pages, num_kv_heads, page_size, head_dim]
+        k_cache = k_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+        v_cache = v_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+        kv_cache = (k_cache, v_cache)
+
+        # TODO: add support for quantization
+        q_scale = 1.0
+        k_scale = (
+            layer.k_scale_float
+            if getattr(layer, "k_scale_float", None) is not None
+            else 1.0
+        )
+        bmm1_scale = q_scale * k_scale * layer.scaling
+        bmm2_scale = 1.0
+        # sink: additional value per head in the denominator of the softmax.
+        attention_sink = kwargs.get("sinks", None)
+
+        # Call TRT-LLM kernel
+        # raw_out: like q, [bs, acc_q_len, num_q_heads, head_dim] but with output dtype
+        o = flashinfer.decode.trtllm_batch_decode_with_kv_cache(
+            query=q,
+            kv_cache=kv_cache,
+            workspace_buffer=self.workspace_buffer,
+            block_tables=self.forward_metadata.page_table,
+            seq_lens=self.forward_metadata.cache_seqlens_int32,
+            max_seq_len=self.max_context_len,
+            bmm1_scale=bmm1_scale,
+            bmm2_scale=bmm2_scale,
+            window_left=layer.sliding_window_size,
+            # TODO: add attention_sink operation or nvfp4 scale factor if needed
+            sinks=attention_sink,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        **kwargs,
+    ):
+        cache_loc = forward_batch.out_cache_loc
+        if save_kv_cache and k is not None:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+            )
+        q = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+        # [num_pages, page_size, num_kv_heads, head_dim] -> [num_pages, num_kv_heads, page_size, head_dim]
+        k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+        k_cache = k_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+        v_cache = v_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+        kv_cache = (k_cache, v_cache)
+
+        # sink: additional value per head in the denominator of the softmax.
+        attention_sink = kwargs.get("sinks", None)
+        # TODO: add support for quantization
+        q_scale = 1.0
+        k_scale = (
+            layer.k_scale_float
+            if getattr(layer, "k_scale_float", None) is not None
+            else 1.0
+        )
+        bmm1_scale = q_scale * k_scale * layer.scaling
+        bmm2_scale = 1.0
+
+        o = flashinfer.prefill.trtllm_batch_context_with_kv_cache(
+            query=q,
+            kv_cache=kv_cache,
+            workspace_buffer=self.workspace_buffer,
+            block_tables=self.forward_metadata.page_table,
+            seq_lens=self.forward_metadata.cache_seqlens_int32,
+            max_q_len=self.forward_metadata.max_seq_len_q,
+            max_kv_len=self.max_context_len,
+            bmm1_scale=bmm1_scale,
+            bmm2_scale=bmm2_scale,
+            batch_size=forward_batch.batch_size,
+            cum_seq_lens_q=self.forward_metadata.cu_seqlens_q,
+            cum_seq_lens_kv=self.forward_metadata.cu_seqlens_k,
+            window_left=layer.sliding_window_size,
+            # TODO: add attention_sink operation or nvfp4 scale factor if needed
+            sinks=attention_sink,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+
+class TRTLLMHAAttnMultiStepDraftBackend(FlashInferMultiStepDraftBackend):
+    """Multi-step TRTLLM MHA attention kernel used by EAGLE."""
+
+    def __init__(
+        self, model_runner: ModelRunner, topk: int, speculative_num_steps: int
+    ):
+        super().__init__(model_runner, topk, speculative_num_steps)
+        for i in range(speculative_num_steps):
+            self.attn_backends[i] = TRTLLMHAAttnBackend(
+                model_runner,
+                skip_prefill=True,
+                kv_indptr_buf=self.kv_indptr[i],
+                kv_last_page_len_buf=self.kv_last_page_len,
+                speculative_step_id=i,
+            )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i].init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        forward_batch: ForwardBatch,
+    ):
+        assert forward_batch.spec_info is not None
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        assert forward_batch.spec_info is not None
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+
+        for i in range(self.speculative_num_steps - 1):
+
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py
new file mode 100755
index 000000000..b8d62c3fa
--- /dev/null
+++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py
@@ -0,0 +1,589 @@
+from __future__ import annotations
+
+"""
+Support attention backend for TRTLLM MLA kernels from flashinfer.
+"""
+
+import math
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import triton
+
+from sglang.srt.layers.attention.flashinfer_mla_backend import (
+    FlashInferMLAAttnBackend,
+    FlashInferMLAMultiStepDraftBackend,
+)
+from sglang.srt.layers.attention.utils import (
+    TRITON_PAD_NUM_PAGE_PER_BLOCK,
+    create_flashmla_kv_indices_triton,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_flashinfer_available
+
+if is_flashinfer_available():
+    import flashinfer
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
+
+# Constants
+DEFAULT_WORKSPACE_SIZE_MB = 128  # Memory workspace size in MB
+
+# Block constraint from flashinfer requirements
+# From flashinfer.decode._check_trtllm_gen_mla_shape:
+#   block_num % (128 / block_size) == 0
+# This imposes that the total number of blocks must be divisible by
+# (128 / block_size). We capture the 128 constant here so we can
+# compute the LCM with other padding constraints.
+TRTLLM_BLOCK_CONSTRAINT = 128
+
+global_zero_init_workspace_buffer = None
+
+
+@dataclass
+class TRTLLMMLAPrefillMetadata:
+    """Metadata for TRTLLM MLA prefill operations."""
+
+    max_seq_len: int
+    cum_seq_lens: torch.Tensor
+    seq_lens: torch.Tensor
+
+
+@dataclass
+class TRTLLMMLADecodeMetadata:
+    """Metadata for TRTLLM MLA decode operations."""
+
+    block_kv_indices: Optional[torch.Tensor] = None
+    max_seq_len: Optional[int] = None
+
+
+class TRTLLMMLABackend(FlashInferMLAAttnBackend):
+    """TRTLLM MLA attention kernel from flashinfer."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        q_indptr_decode_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__(model_runner, skip_prefill, kv_indptr_buf, q_indptr_decode_buf)
+
+        config = model_runner.model_config
+
+        # Model parameters
+        self.num_q_heads = config.num_attention_heads // get_attention_tp_size()
+        self.num_kv_heads = config.get_num_kv_heads(get_attention_tp_size())
+        self.num_local_heads = config.num_attention_heads // get_attention_tp_size()
+
+        # MLA-specific dimensions
+        self.kv_lora_rank = config.kv_lora_rank
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.v_head_dim = config.v_head_dim
+        self.kv_cache_dim = self.kv_lora_rank + self.qk_rope_head_dim
+
+        # Runtime parameters
+        self.scaling = config.scaling
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.page_size = model_runner.page_size
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+
+        # Workspace allocation
+        self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
+                self.workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_zero_init_workspace_buffer
+
+        # CUDA graph state
+        self.decode_cuda_graph_metadata = {}
+        self.decode_cuda_graph_kv_indices = None
+        self.forward_prefill_metadata: Optional[TRTLLMMLAPrefillMetadata] = None
+        self.forward_decode_metadata: Union[TRTLLMMLADecodeMetadata, None] = None
+
+    def _calc_padded_blocks(self, max_seq_len: int) -> int:
+        """
+        Calculate padded block count that satisfies both TRT-LLM and Triton constraints.
+
+        Args:
+            max_seq_len: Maximum sequence length in tokens
+
+        Returns:
+            Number of blocks padded to satisfy all constraints
+        """
+        blocks = triton.cdiv(max_seq_len, self.page_size)
+
+        # Apply dual constraints (take LCM to satisfy both):
+        # 1. TRT-LLM: block_num % (128 / page_size) == 0
+        # 2. Triton: page table builder uses 64-index bursts, needs multiple of 64
+        trtllm_constraint = TRTLLM_BLOCK_CONSTRAINT // self.page_size
+        constraint_lcm = math.lcm(trtllm_constraint, TRITON_PAD_NUM_PAGE_PER_BLOCK)
+
+        if blocks % constraint_lcm != 0:
+            blocks = triton.cdiv(blocks, constraint_lcm) * constraint_lcm
+        return blocks
+
+    def _create_block_kv_indices(
+        self,
+        batch_size: int,
+        max_blocks: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        device: torch.device,
+    ) -> torch.Tensor:
+        """
+        Create block KV indices tensor using Triton kernel.
+
+        Args:
+            batch_size: Batch size
+            max_blocks: Maximum number of blocks per sequence
+            req_pool_indices: Request pool indices
+            seq_lens: Sequence lengths
+            device: Target device
+
+        Returns:
+            Block KV indices tensor
+        """
+        block_kv_indices = torch.full(
+            (batch_size, max_blocks), -1, dtype=torch.int32, device=device
+        )
+
+        create_flashmla_kv_indices_triton[(batch_size,)](
+            self.req_to_token,
+            req_pool_indices,
+            seq_lens,
+            None,
+            block_kv_indices,
+            self.req_to_token.stride(0),
+            max_blocks,
+            NUM_PAGE_PER_BLOCK=TRITON_PAD_NUM_PAGE_PER_BLOCK,
+            PAGED_SIZE=self.page_size,
+        )
+
+        return block_kv_indices
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        """Initialize CUDA graph state for TRTLLM MLA."""
+
+        max_blocks_per_seq = self._calc_padded_blocks(self.max_context_len)
+
+        self.decode_cuda_graph_kv_indices = torch.full(
+            (max_bs, max_blocks_per_seq), -1, dtype=torch.int32, device=self.device
+        )
+
+        super().init_cuda_graph_state(max_bs, max_num_tokens, kv_indices_buf)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+    ):
+        """Initialize metadata for CUDA graph capture."""
+
+        # Delegate to parent for non-decode modes.
+        if not forward_mode.is_decode_or_idle():
+            return super().init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+        # Custom fast-path for decode/idle.
+        # Capture with full width so future longer sequences are safe during replay
+        max_blocks_per_seq = self._calc_padded_blocks(self.max_context_len)
+        block_kv_indices = self.decode_cuda_graph_kv_indices[:bs, :max_blocks_per_seq]
+
+        create_flashmla_kv_indices_triton[(bs,)](
+            self.req_to_token,
+            req_pool_indices,
+            seq_lens,
+            None,
+            block_kv_indices,
+            self.req_to_token.stride(0),
+            max_blocks_per_seq,
+            NUM_PAGE_PER_BLOCK=TRITON_PAD_NUM_PAGE_PER_BLOCK,
+            PAGED_SIZE=self.page_size,
+        )
+
+        # Record the true maximum sequence length for this capture batch so that
+        # the kernel launch path (which requires an int not a tensor) can reuse
+        # it safely during both capture and replay.
+        max_seq_len_val = int(seq_lens.max().item())
+
+        metadata = TRTLLMMLADecodeMetadata(
+            block_kv_indices,
+            max_seq_len_val,
+        )
+        self.decode_cuda_graph_metadata[bs] = metadata
+        self.forward_decode_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Replay CUDA graph with new inputs."""
+        # Delegate to parent for non-decode modes.
+        if not forward_mode.is_decode_or_idle():
+            return super().init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+        metadata = self.decode_cuda_graph_metadata[bs]
+
+        # Update block indices for new sequences.
+        create_flashmla_kv_indices_triton[(bs,)](
+            self.req_to_token,
+            req_pool_indices[:bs],
+            seq_lens[:bs],
+            None,
+            metadata.block_kv_indices,
+            self.req_to_token.stride(0),
+            metadata.block_kv_indices.shape[1],
+            NUM_PAGE_PER_BLOCK=TRITON_PAD_NUM_PAGE_PER_BLOCK,
+            PAGED_SIZE=self.page_size,
+        )
+
+        # Update stored max_seq_len so subsequent kernel calls use the correct value
+        # Prefer CPU tensor to avoid GPU synchronization when available.
+        if seq_lens_cpu is not None:
+            metadata.max_seq_len = int(seq_lens_cpu.max().item())
+        else:
+            metadata.max_seq_len = int(seq_lens.max().item())
+
+    def get_cuda_graph_seq_len_fill_value(self) -> int:
+        """Get the fill value for sequence lengths in CUDA graph."""
+        return 1
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize the metadata for a forward pass."""
+        # Delegate to parent for non-decode modes.
+        if (
+            forward_batch.forward_mode.is_extend()
+            and not forward_batch.forward_mode.is_target_verify()
+            and not forward_batch.forward_mode.is_draft_extend()
+        ):
+            seq_lens = forward_batch.seq_lens - forward_batch.extend_prefix_lens
+            cum_seq_lens_q = torch.cat(
+                (
+                    torch.tensor([0], device=forward_batch.seq_lens.device),
+                    torch.cumsum(seq_lens, dim=0),
+                )
+            ).int()
+            max_seq_len = max(forward_batch.extend_seq_lens_cpu)
+            self.forward_prefill_metadata = TRTLLMMLAPrefillMetadata(
+                max_seq_len,
+                cum_seq_lens_q,
+                seq_lens,
+            )
+        elif forward_batch.forward_mode.is_decode_or_idle():
+            bs = forward_batch.batch_size
+
+            # Get maximum sequence length.
+            if getattr(forward_batch, "seq_lens_cpu", None) is not None:
+                max_seq = forward_batch.seq_lens_cpu.max().item()
+            else:
+                max_seq = forward_batch.seq_lens.max().item()
+
+            max_seqlen_pad = self._calc_padded_blocks(max_seq)
+            block_kv_indices = self._create_block_kv_indices(
+                bs,
+                max_seqlen_pad,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens.device,
+            )
+
+            max_seq_len_val = int(max_seq)
+            self.forward_decode_metadata = TRTLLMMLADecodeMetadata(
+                block_kv_indices, max_seq_len_val
+            )
+            forward_batch.decode_trtllm_mla_metadata = self.forward_decode_metadata
+        else:
+            return super().init_forward_metadata(forward_batch)
+
+    def init_mha_chunk_metadata(self, forward_batch: ForwardBatch):
+        super().init_mha_chunk_metadata(forward_batch, disable_flashinfer_ragged=True)
+
+    def quantize_and_rope_for_fp8(
+        self,
+        q_nope: torch.Tensor,
+        q_rope: torch.Tensor,
+        k_nope: torch.Tensor,
+        k_rope: torch.Tensor,
+        forward_batch: ForwardBatch,
+        cos_sin_cache: torch.Tensor,
+        is_neox: bool,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Quantize and apply RoPE for FP8 attention path.
+
+        This function handles the FP8 quantization and RoPE application for MLA attention.
+        It takes separate query/key nope and rope components, applies RoPE to the rope parts,
+        quantizes all components to FP8, and merges the query components into a single tensor.
+
+        Args:
+            q_nope: Query no-position-encoding component [seq_len, num_heads, kv_lora_rank]
+                - expected dtype: torch.bfloat16
+            q_rope: Query RoPE component [seq_len, num_heads, qk_rope_head_dim]
+                - expected dtype: torch.bfloat16
+            k_nope: Key no-position-encoding component [seq_len, num_heads, kv_lora_rank]
+                - expected dtype: torch.bfloat16
+            k_rope: Key RoPE component [seq_len, num_heads, qk_rope_head_dim]
+                - expected dtype: torch.bfloat16
+            forward_batch: Forward batch containing position information
+            cos_sin_cache: Precomputed cosine/sine cache for RoPE
+                - expected dtype: matches q_/k_ input dtype (torch.bfloat16)
+            is_neox: Whether to use NeoX-style RoPE (interleaved) or GPT-style (half rotation)
+
+        Returns:
+            tuple: (merged_q_out, k_nope_out, k_rope_out) quantized to FP8
+                - merged_q_out: [seq_len, num_heads, kv_lora_rank + qk_rope_head_dim], dtype=torch.float8_e4m3fn
+                - k_nope_out:   [seq_len, num_heads, kv_lora_rank], dtype=torch.float8_e4m3fn
+                - k_rope_out:   [seq_len, num_heads, qk_rope_head_dim], dtype=torch.float8_e4m3fn
+        """
+        attn_dtype = torch.float8_e4m3fn
+        q_len, num_heads = q_rope.shape[0], q_rope.shape[1]
+
+        # Allocate output tensors with FP8 dtype
+        # Query output will contain merged nope + rope components
+        q_out = q_rope.new_empty(
+            q_len,
+            num_heads,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            dtype=attn_dtype,
+        )
+
+        # Key outputs maintain original shapes but with FP8 dtype
+        k_rope_out = k_rope.new_empty(k_rope.shape, dtype=attn_dtype)
+        k_nope_out = k_nope.new_empty(k_nope.shape, dtype=attn_dtype)
+
+        # Apply RoPE and quantize all components in a single fused kernel call
+        # This kernel handles:
+        # 1. RoPE application to q_rope and k_rope using cos_sin_cache and positions
+        # 2. Quantization of all components to FP8 format
+        # 3. Output placement into pre-allocated tensors
+        flashinfer.rope.mla_rope_quantize_fp8(
+            q_rope=q_rope,
+            k_rope=k_rope,
+            q_nope=q_nope,
+            k_nope=k_nope,
+            cos_sin_cache=cos_sin_cache,
+            pos_ids=forward_batch.positions,
+            is_neox=is_neox,
+            quantize_dtype=attn_dtype,
+            # Output tensor slicing: q_out contains [nope_part, rope_part]
+            q_rope_out=q_out[..., self.kv_lora_rank :],  # RoPE part goes to end
+            k_rope_out=k_rope_out,
+            q_nope_out=q_out[..., : self.kv_lora_rank],  # Nope part goes to beginning
+            k_nope_out=k_nope_out,
+            # Quantization scales (set to 1.0 for no additional scaling)
+            quant_scale_q=1.0,
+            quant_scale_kv=1.0,
+        )
+
+        return q_out, k_nope_out, k_rope_out
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,  # q_nope
+        k: torch.Tensor,  # k_nope
+        v: torch.Tensor,  # not used in this backend
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        cos_sin_cache: Optional[torch.Tensor] = None,
+        is_neox: Optional[bool] = False,
+    ) -> torch.Tensor:
+        """Run forward for decode using TRTLLM MLA kernel."""
+        merge_query = q_rope is not None
+        if self.data_type == torch.float8_e4m3fn:
+            # For FP8 path, we quantize the query and rope parts and merge them into a single tensor
+            # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend
+            assert all(
+                x is not None for x in [q_rope, k_rope, cos_sin_cache]
+            ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None."
+            q, k, k_rope = self.quantize_and_rope_for_fp8(
+                q,
+                q_rope,
+                k.squeeze(1),
+                k_rope.squeeze(1),
+                forward_batch,
+                cos_sin_cache,
+                is_neox,
+            )
+            merge_query = False
+
+        # Save KV cache if requested
+        if save_kv_cache:
+            assert (
+                k is not None and k_rope is not None
+            ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None."
+            forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, k_rope
+            )
+
+        # Prepare query tensor inline
+        if merge_query:
+            # For FP16 path, we merge the query and rope parts into a single tensor
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope_reshaped = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+            query = torch.cat([q_nope, q_rope_reshaped], dim=-1)
+        else:
+            # For FP8 path, we already have the query and rope parts merged because of the quantize_and_rope_for_fp8 function
+            query = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+
+        # Ensure query has shape [bs, acc_q_len, num_q_heads, head_dim] when seq_len 1
+        if query.dim() == 3:
+            query = query.unsqueeze(1)
+
+        # Prepare KV cache inline
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+        kv_cache = k_cache.view(-1, self.page_size, self.kv_cache_dim).unsqueeze(1)
+
+        # Get metadata
+        metadata = (
+            getattr(forward_batch, "decode_trtllm_mla_metadata", None)
+            or self.forward_decode_metadata
+        )
+
+        # Scale computation for TRTLLM MLA kernel BMM1 operation:
+        # The final BMM1 scale is computed as: q_scale * k_scale * softmax_scale
+        # Scale components:
+        # - q_scale: Query scaling factor (set to 1.0 for both FP16/FP8 paths)
+        # - k_scale: Key scaling factor from model checkpoint (defaults to 1.0 if not available)
+        # - softmax_scale: Attention softmax scaling = 1/sqrt(head_dim), pre-computed as layer.scaling
+        # This unified approach works for both FP16 and FP8 quantized attention paths.
+        q_scale = 1.0
+        k_scale = (
+            layer.k_scale_float
+            if getattr(layer, "k_scale_float", None) is not None
+            else 1.0
+        )
+
+        bmm1_scale = q_scale * k_scale * layer.scaling
+
+        # Call TRT-LLM kernel
+        raw_out = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
+            query=query,
+            kv_cache=kv_cache,
+            workspace_buffer=self.workspace_buffer,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            block_tables=metadata.block_kv_indices,
+            seq_lens=forward_batch.seq_lens.to(torch.int32),
+            max_seq_len=metadata.max_seq_len,
+            bmm1_scale=bmm1_scale,
+        )
+
+        # Reshape output directly without slicing
+        output = raw_out.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+        return output
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if (
+            forward_batch.forward_mode.is_target_verify()
+            or forward_batch.forward_mode.is_draft_extend()
+        ):
+            return super().forward_extend(
+                q, k, v, layer, forward_batch, save_kv_cache, q_rope, k_rope
+            )
+
+        if not forward_batch.attn_attend_prefix_cache:
+            q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            k = k.view(-1, layer.tp_k_head_num, layer.head_dim)
+            v = v.view(-1, layer.tp_k_head_num, layer.v_head_dim)
+            output = flashinfer.prefill.trtllm_ragged_attention_deepseek(
+                query=q,
+                key=k,
+                value=v,
+                workspace_buffer=self.workspace_buffer,
+                seq_lens=self.forward_prefill_metadata.seq_lens,
+                max_q_len=self.forward_prefill_metadata.max_seq_len,
+                max_kv_len=self.forward_prefill_metadata.max_seq_len,
+                bmm1_scale=layer.scaling,
+                bmm2_scale=1.0,
+                o_sf_scale=1.0,
+                batch_size=forward_batch.batch_size,
+                window_left=-1,
+                cum_seq_lens_q=self.forward_prefill_metadata.cum_seq_lens,
+                cum_seq_lens_kv=self.forward_prefill_metadata.cum_seq_lens,
+                enable_pdl=False,
+                is_causal=True,
+                return_lse=forward_batch.mha_return_lse,
+            )
+        else:
+            # replace with trtllm ragged attention once accuracy is resolved.
+            output = super().forward_extend(
+                q, k, v, layer, forward_batch, save_kv_cache, q_rope, k_rope
+            )
+        return output
+
+
+class TRTLLMMLAMultiStepDraftBackend(FlashInferMLAMultiStepDraftBackend):
+    """Multi-step draft backend for TRT-LLM MLA used by EAGLE."""
+
+    def __init__(
+        self, model_runner: "ModelRunner", topk: int, speculative_num_steps: int
+    ):
+        super().__init__(model_runner, topk, speculative_num_steps)
+
+        for i in range(self.speculative_num_steps):
+            self.attn_backends[i] = TRTLLMMLABackend(
+                model_runner,
+                skip_prefill=True,
+                kv_indptr_buf=self.kv_indptr[i],
+                q_indptr_decode_buf=self.q_indptr_decode,
+            )
diff --git a/python/sglang/srt/layers/attention/utils.py b/python/sglang/srt/layers/attention/utils.py
new file mode 100644
index 000000000..e8cd2e158
--- /dev/null
+++ b/python/sglang/srt/layers/attention/utils.py
@@ -0,0 +1,99 @@
+import triton
+import triton.language as tl
+
+# Keep this in sync with the Triton kernel inside `create_flashmla_kv_indices_triton`.
+# Number of pages that the kernel writes per iteration.
+# Exposed here so other Python modules can import it instead of hard-coding 64.
+TRITON_PAD_NUM_PAGE_PER_BLOCK = 64
+
+
+@triton.jit
+def create_flashinfer_kv_indices_triton(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices_ptr,
+    page_kernel_lens_ptr,
+    kv_indptr,
+    kv_start_idx,
+    kv_indices_ptr,
+    req_to_token_ptr_stride: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(axis=0)
+
+    # find the req pool idx, this is for batch to token
+    req_pool_index = tl.load(req_pool_indices_ptr + pid)
+    kv_indices_offset = tl.load(kv_indptr + pid)
+
+    kv_start = 0
+    kv_end = 0
+    if kv_start_idx:
+        kv_start = tl.load(kv_start_idx + pid).to(tl.int32)
+        kv_end = kv_start
+    kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
+
+    num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
+    for i in range(num_loop):
+        # index into req_to_token_ptr needs to be int64
+        offset = tl.arange(0, BLOCK_SIZE).to(tl.int64) + i * BLOCK_SIZE
+        mask = offset < kv_end - kv_start
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + kv_start
+            + offset,
+            mask=mask,
+        )
+        tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
+
+
+@triton.jit
+def create_flashmla_kv_indices_triton(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices_ptr,
+    page_kernel_lens_ptr,
+    kv_start_idx,
+    kv_indices_ptr,
+    req_to_token_ptr_stride: tl.constexpr,
+    kv_indices_ptr_stride: tl.constexpr,
+    NUM_PAGE_PER_BLOCK: tl.constexpr = TRITON_PAD_NUM_PAGE_PER_BLOCK,
+    PAGED_SIZE: tl.constexpr = 64,
+):
+    BLOCK_SIZE: tl.constexpr = 4096
+    pid = tl.program_id(axis=0)
+
+    # find the req pool idx, this is for batch to token
+    req_pool_index = tl.load(req_pool_indices_ptr + pid)
+
+    kv_start = 0
+    kv_end = 0
+    if kv_start_idx:
+        kv_start = tl.load(kv_start_idx + pid).to(tl.int32)
+        kv_end = kv_start
+
+    kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
+
+    num_paged = tl.cdiv(kv_end - kv_start, PAGED_SIZE)
+    num_pages_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
+
+    for i in range(num_pages_loop):
+        # index into req_to_token_ptr needs to be int64
+        paged_offset = (
+            tl.arange(0, NUM_PAGE_PER_BLOCK).to(tl.int64) + i * NUM_PAGE_PER_BLOCK
+        ) * PAGED_SIZE
+        paged_offset_out = tl.arange(0, NUM_PAGE_PER_BLOCK) + i * NUM_PAGE_PER_BLOCK
+
+        mask = paged_offset < num_paged * PAGED_SIZE
+        mask_out = paged_offset_out < num_paged
+
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + kv_start
+            + paged_offset,
+            mask=mask,
+        )
+        tl.store(
+            kv_indices_ptr + pid * kv_indices_ptr_stride + paged_offset_out,
+            data // PAGED_SIZE,
+            mask=mask_out,
+        )
diff --git a/python/sglang/srt/layers/attention/vision.py b/python/sglang/srt/layers/attention/vision.py
new file mode 100644
index 000000000..2be3e450b
--- /dev/null
+++ b/python/sglang/srt/layers/attention/vision.py
@@ -0,0 +1,630 @@
+from __future__ import annotations
+
+import dataclasses
+import functools
+import math
+from functools import lru_cache, partial
+from typing import Any, Callable, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.utils import (
+    get_device_capability,
+    is_blackwell,
+    is_cuda,
+    print_info_once,
+)
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel.flash_attn import flash_attn_varlen_func
+
+from sglang.srt.distributed import (
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.distributed import utils as dist_utils
+from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+    context_attention_fwd,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.utils import add_prefix
+
+ROTARY_EMBED_CLASSES = {
+    "normal": apply_rotary_pos_emb,
+}
+
+
+@dataclasses.dataclass
+class SingletonCache:
+    data: Any = None
+
+    def set_data(self, value: Any) -> None:
+        self.data = value
+
+    def get_data(self) -> Optional[Any]:
+        return self.data
+
+    def empty(self) -> bool:
+        return self.get_data() is None
+
+
+# TODO: requires real seqlens from images
+@functools.lru_cache(maxsize=128)
+def _get_cu_seqlens_for_shape(batch_size: int, seqlen: int, device) -> torch.Tensor:
+    """
+    Generates cumulative sequence lengths (cu_seqlens) for a given batch_size, seqlen, and device.
+    Caches the result based on these parameters.
+    """
+    cu_seqlens = torch.arange(
+        0,
+        (batch_size + 1) * seqlen,
+        step=seqlen,
+        dtype=torch.int32,
+        device=device,
+    )
+    return cu_seqlens
+
+
+class VisionSdpaAttention(nn.Module):
+    r"""
+    Scaled Dot Product Attention inner product
+
+    """
+
+    def __init__(
+        self,
+        head_dim: int,
+        num_heads: int,
+        num_kv_heads: int,
+        dropout: float = 0.0,
+        flatten_batch: bool = False,
+        softmax_in_single_precision: bool = False,
+        **kwargs,
+    ):
+        super().__init__()
+        self.head_size = head_dim
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.flatten_batch = flatten_batch
+        self.softmax_in_single_precision = softmax_in_single_precision
+        self.dropout = dropout
+        self.scale = 1.0 / math.sqrt(self.head_size)
+
+    @staticmethod
+    @lru_cache(maxsize=128)
+    def _generate_mask_cache(
+        s: int, flatten_batch: bool, cu_seqlens: tuple
+    ) -> torch.BoolTensor:
+        """
+        Generate a boolean attention mask with caching mechanism.
+        Args:
+            s: sequence length
+            flatten_batch: whether to flatten batch dimension
+            cu_seqlens: tuple of cumulative sequence lengths
+        Returns:
+            attention mask tensor of shape [b, 1, s, s] or [1, s, s]
+        """
+        if flatten_batch:
+            mask = torch.zeros([1, s, s], dtype=torch.bool)
+            for i in range(1, len(cu_seqlens)):
+                start = cu_seqlens[i - 1]
+                end = cu_seqlens[i]
+                mask[..., start:end, start:end] = True
+        else:
+            # [1, 1, 1, s]
+            row_indices = torch.arange(s).view(1, 1, 1, s)
+            # [1, 1, s, 1]
+            col_indices = torch.arange(s).view(1, 1, s, 1)
+            # [b, 1, 1, 1]
+            seq_lens = torch.tensor(
+                [end - start for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:])],
+            ).view(-1, 1, 1, 1)
+
+            mask = (row_indices < seq_lens) & (col_indices < seq_lens)
+
+        return mask
+
+    def generate_patch_attention_mask(
+        self,
+        s: int,
+        cu_seqlens: Optional[torch.Tensor],
+        flatten_batch: bool = False,
+    ) -> Optional[torch.Tensor]:
+        r"""
+        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+        Args:
+            s: sequence length
+            cu_seqlens: cumulative sequence lengths tensor. If not, returns an empty mask
+            flatten_batch: whether to flatten batch dimension
+        Returns:
+            attention mask tensor or None
+        """
+        if cu_seqlens is None:
+            return None
+
+        cu_seqlens_tuple = tuple(cu_seqlens.cpu().tolist())
+
+        return self._generate_mask_cache(s, flatten_batch, cu_seqlens_tuple)
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        bsz: int,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if self.flatten_batch:
+            assert bsz == 1, "flatten_batch is True, bsz must be 1"
+
+        assert q.dim() == 3, q.shape
+
+        s = q.shape[0] // bsz
+
+        # [b, 1, s, s]
+        if attention_mask is None:
+            attention_mask = self.generate_patch_attention_mask(
+                s, cu_seqlens, flatten_batch=self.flatten_batch
+            )
+
+        if attention_mask is None:
+            if self.softmax_in_single_precision:
+                raise RuntimeError("Empty attention mask")
+        else:
+            attention_mask = attention_mask.to(device=q.device)
+
+        q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
+
+        if self.softmax_in_single_precision:
+            k = rearrange(k, "b h s d -> b h d s")
+            attn_weights = torch.matmul(q, k) * self.scale
+            del k
+            # masking
+            attention_mask = (~attention_mask) * torch.finfo(q.dtype).min
+            attn_weights = attn_weights + attention_mask
+            del attention_mask
+            # full-precision
+            attn_weights = nn.functional.softmax(
+                attn_weights, dim=-1, dtype=torch.float32
+            ).to(q.dtype)
+            attn_weights = nn.functional.dropout(
+                attn_weights, p=self.dropout, training=False
+            )
+            output = torch.matmul(attn_weights, v)
+            del attn_weights, v
+        else:
+            # SDPA
+            # [b, h, s, head_size]
+            output = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=attention_mask,
+                dropout_p=self.dropout,
+                is_causal=False,
+            )
+
+        # [b, h, s, head_size] --> [b * s, h, head_size]
+        output = rearrange(output, "b h s d -> (b s) h d")
+
+        return output
+
+
+class VisionTritonAttention(nn.Module):
+    """
+    Triton-implemented attention without a causal mask
+    """
+
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        super().__init__()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: Optional[torch.Tensor],
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if cu_seqlens is None:
+            cu_seqlens = _get_cu_seqlens_for_shape(bsz, seq_len, device=q.device)
+
+        # [b * s, head, head_size]
+        output = torch.empty_like(q)
+        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = seq_lens.max().item()
+        context_attention_fwd(
+            q,
+            k,
+            v,
+            output,
+            cu_seqlens.cuda(),
+            seq_lens.cuda(),
+            max_seqlen,
+            is_causal=False,
+        )
+
+        return output
+
+
+class VisionFlash3Attention(nn.Module):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        if not _is_cuda:
+            raise Exception("VisionFlash3Attention is only available for cuda")
+        super().__init__()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: Optional[Union[SingletonCache, torch.Tensor]],
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if cu_seqlens is None:
+            cu_seqlens = _get_cu_seqlens_for_shape(bsz, seq_len, device=q.device)
+        elif isinstance(cu_seqlens, SingletonCache):
+            if cu_seqlens.empty():
+                cu_seqlens.set_data(
+                    _get_cu_seqlens_for_shape(bsz, seq_len, device=q.device)
+                )
+            cu_seqlens = cu_seqlens.get_data()
+
+        cu_seqlens = cu_seqlens.to(dtype=torch.int32).to(q.device)
+        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = seq_lens.max().item()
+
+        output = flash_attn_varlen_func(
+            q,
+            k,
+            v,
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_k=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_k=max_seqlen,
+        )
+
+        return output
+
+
+QKV_BACKEND_IMPL = {
+    "triton_attn": VisionTritonAttention,
+    "sdpa": VisionSdpaAttention,
+    "fa3": VisionFlash3Attention,
+}
+
+
+class VisionAttention(nn.Module):
+    r"""
+        Multi-headed attention without any cache, mostly used for multimodal transformers.
+
+
+    Args:
+        use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
+        softmax_in_single_precision (bool, default to False):
+            if ``True``, the softmax will be performed in single-precision
+            Otherwise, it will be performed in half-precision
+
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        use_qkv_parallel: bool,
+        qkv_backend: Optional[str] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        dropout: float = 0.0,
+        softmax_in_single_precision: bool = False,
+        flatten_batch: bool = False,
+        prefix: str = "",
+        proj_bias: bool = True,
+        num_dummy_heads: int = 0,
+        qkv_bias: bool = True,
+        qk_normalization: bool = False,
+        layer_norm_eps: float = 1e-06,
+        customized_position_embedding_applier: Callable[
+            [torch.Tensor, torch.Tensor, Any, Any], Tuple[torch.Tensor, torch.Tensor]
+        ] = None,
+        **kwargs,
+    ):
+        super().__init__()
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        self.tp_size = attn_tp_size
+        self.tp_rank = attn_tp_rank
+        self.dropout = dropout
+        self.head_size = embed_dim // num_heads
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_dummy_heads + num_heads, self.tp_size
+        )
+        self.num_attention_kv_heads_per_partition = dist_utils.divide(
+            num_dummy_heads + num_heads, self.tp_size
+        )
+
+        self.q_size = self.num_attention_heads_per_partition * self.head_size
+        self.kv_size = self.num_attention_kv_heads_per_partition * self.head_size
+
+        self.qk_normalization = qk_normalization
+
+        # Additional dummy heads are used to enable TP for common GPU counts.
+        self.dummy_dim = (num_dummy_heads + num_heads) * self.head_size
+
+        if self.qk_normalization:
+            self.q_norm = RMSNorm(
+                self.dummy_dim, eps=layer_norm_eps, var_hidden_size=embed_dim
+            )
+            self.k_norm = RMSNorm(
+                self.dummy_dim, eps=layer_norm_eps, var_hidden_size=embed_dim
+            )
+
+        # Select attention backend via a unified method
+        _passed_backend = qkv_backend
+        qkv_backend = self._determine_attention_backend(_passed_backend)
+        if (
+            global_server_args_dict["mm_attention_backend"] is None
+            and _passed_backend is None
+        ):
+            print_info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
+        print_info_once(f"Using {qkv_backend} as multimodal attention backend.")
+
+        self.customized_position_embedding_applier = (
+            customized_position_embedding_applier
+        )
+        self.qkv_backend = QKV_BACKEND_IMPL[qkv_backend](
+            head_dim=self.head_size,
+            num_heads=self.num_attention_heads_per_partition,
+            num_kv_heads=self.num_attention_kv_heads_per_partition,
+            dropout=dropout,
+            flatten_batch=flatten_batch,
+            softmax_in_single_precision=softmax_in_single_precision,
+        )
+
+        self.use_qkv_parallel = use_qkv_parallel
+        if use_qkv_parallel:
+            self.qkv_proj = QKVParallelLinear(
+                hidden_size=embed_dim,
+                head_size=self.head_size,
+                total_num_heads=num_dummy_heads + num_heads,
+                total_num_kv_heads=num_dummy_heads + num_heads,
+                bias=qkv_bias,
+                quant_config=quant_config,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                prefix=add_prefix("qkv_proj", prefix),
+            )
+        else:
+            self.qkv_proj = ColumnParallelLinear(
+                input_size=embed_dim,
+                output_size=3 * self.dummy_dim,
+                bias=qkv_bias,
+                quant_config=quant_config,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                prefix=add_prefix("qkv_proj", prefix),
+            )
+        self.proj = RowParallelLinear(
+            input_size=self.dummy_dim,
+            output_size=embed_dim,
+            bias=proj_bias,
+            quant_config=quant_config,
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            prefix=add_prefix("proj", prefix),
+        )
+
+    def _determine_attention_backend(self, passed_backend: Optional[str]) -> str:
+        """Decide the multimodal attention backend string.
+
+        Priority: server args override > constructor arg > platform default.
+
+        Platform defaults:
+        - CUDA: "triton_attn"
+        - Non-CUDA: "sdpa"
+        """
+        override_backend = global_server_args_dict["mm_attention_backend"]
+        if override_backend is not None:
+            backend = override_backend
+        elif passed_backend is not None:
+            backend = passed_backend
+        elif is_cuda():
+            major, minor = get_device_capability()
+            if major == 9:
+                backend = "fa3"
+            else:
+                backend = "triton_attn"
+        else:
+            backend = "sdpa"
+        if backend == "fa3" and is_blackwell():
+            raise ValueError("The 'fa3' backend is not supported on Blackwell GPUs")
+
+        return backend
+
+    def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
+        """apply qk norm for internvl vit attn"""
+        q = q.flatten(1, 2)
+        k = k.flatten(1, 2)
+
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        if self.tp_size > 1:
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        q = q.unflatten(-1, (-1, self.head_size))
+        k = k.unflatten(-1, (-1, self.head_size))
+        return q, k
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            x: [b, s, embed_dim]
+            cu_seqlens: [b]
+        Returns:
+             [s, b, head * head_size]
+        """
+        if x.dim() == 2:
+            x = x.unsqueeze(0)
+        assert x.dim() == 3, x.shape
+        x_shape = x.shape
+        bsz, s, _ = x_shape
+        head = self.num_attention_heads_per_partition
+        kv_head = self.num_attention_kv_heads_per_partition
+        if self.use_qkv_parallel:
+            # [b, s, embed_dim] --> [b, s, embed_dim]
+            qkv, _ = self.qkv_proj(x)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+            # [b, s, embed_dim] --> [b * s, head, head_size]
+            q = q.reshape(bsz * s, head, -1).contiguous()
+            k = k.reshape(bsz * s, kv_head, -1).contiguous()
+            v = v.reshape(bsz * s, kv_head, -1).contiguous()
+        else:
+            # [b, s, embed_dim] --> [s, b, embed_dim]
+            x = rearrange(x, "b s ... -> s b ...")
+            # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
+            qkv, _ = self.qkv_proj(x)
+
+            # [s, b, head, head_dim_sum]
+            new_x_shape = qkv.size()[:-1] + (
+                head,
+                self.q_size + 2 * self.kv_size,
+            )
+            qkv = qkv.view(*new_x_shape)
+
+            # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+            # [s, b, head, head_size] --> [b, s, head, head_size]
+            q, k, v = [
+                rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+            ]
+
+        if position_embeddings is not None:
+            original_shape = q.shape
+
+            if self.customized_position_embedding_applier is not None:
+                q, k = self.customized_position_embedding_applier(
+                    q, k, position_embeddings, x_shape
+                )
+                q = q.view(original_shape)
+                k = k.view(original_shape)
+            else:
+                cos, sin = position_embeddings
+
+                # [total_tokens, head, head_size]
+                q = q.view(-1, head, self.head_size)
+                k = k.view(-1, head, self.head_size)
+
+                q, k = apply_rotary_pos_emb(q, k, cos, sin)
+
+                q = q.view(original_shape)
+                k = k.view(original_shape)
+
+        if q.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            q = rearrange(q, "b s ... -> (b s) ...")
+        if k.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            k = rearrange(k, "b s ... -> (b s) ...")
+        if v.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            v = rearrange(v, "b s ... -> (b s) ...")
+
+        assert q.dim() == 3, q.dim()
+        assert k.dim() == 3, k.dim()
+        assert v.dim() == 3, v.dim()
+
+        # internvl
+        if self.qk_normalization:
+            q, k = self._apply_qk_norm(q, k)
+
+        output = self.qkv_backend.forward(
+            q=q,
+            k=k,
+            v=v,
+            bsz=bsz,
+            seq_len=s,
+            cu_seqlens=cu_seqlens,
+            attention_mask=attention_mask,
+        )
+
+        assert output.dim() == 3, output.shape
+
+        if self.use_qkv_parallel:
+            # [b * s, h, head_size] --> [b, s, h * head_size]
+            output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
+
+            # [b, s, h * head_size] --> [b, s, h * head_size]
+            output, _ = self.proj(output)
+        else:
+            # [b * s, h, head_size] --> [s, b, h * head_size]
+            context_layer = rearrange(
+                output, "(b s) h d -> s b (h d)", b=bsz, s=s
+            ).contiguous()
+
+            # [s, b, h * head_size] --> [s, b, h * head_size]
+            output, _ = self.proj(context_layer)
+
+            # [s, b, h * head_size] --> [b, s, h * head_size]
+            output = output.view(bsz, s, -1)
+
+        return output
diff --git a/python/sglang/srt/layers/attention/vision_utils.py b/python/sglang/srt/layers/attention/vision_utils.py
new file mode 100644
index 000000000..ecccb1f85
--- /dev/null
+++ b/python/sglang/srt/layers/attention/vision_utils.py
@@ -0,0 +1,65 @@
+"""Utility functions for vision attention layers."""
+
+import torch
+
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+
+def update_vit_attn_dummy_heads_config(config):
+    """Update HF config to ensure vision attention num_attention_heads is divisible by tp_size"""
+    tp_size = get_attention_tp_size()
+    num_heads = getattr(
+        config.vision_config,
+        "num_heads",
+        getattr(config.vision_config, "num_attention_heads", None),
+    )
+    head_dim = config.vision_config.hidden_size // num_heads
+    num_dummy_heads = 0
+
+    if num_heads % tp_size != 0:
+        num_dummy_heads = ((num_heads + tp_size - 1) // tp_size) * tp_size - num_heads
+
+    setattr(config.vision_config, "head_dim", head_dim)
+    setattr(config.vision_config, "num_dummy_heads", num_dummy_heads)
+
+
+def pad_vit_attn_dummy_heads(config, name: str, loaded_weight: torch.Tensor):
+    """Pad attention qkv weights for dummy heads"""
+    num_dummy_heads = config.vision_config.num_dummy_heads
+    if num_dummy_heads == 0:
+        return loaded_weight
+    head_dim = config.vision_config.head_dim
+
+    if "attn.qkv_proj" in name:
+        wq, wk, wv = loaded_weight.chunk(3, dim=0)
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        pad_func = lambda x: torch.cat(
+            [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+        ).flatten(0, 1)
+        wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+        loaded_weight = torch.cat([wq, wk, wv], dim=0)
+    elif any([_ in name for _ in ["attn.q_proj", "attn.k_proj", "attn.v_proj"]]):
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, loaded_weight.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        padded_weight = loaded_weight.new_zeros(dummy_shape)
+        loaded_weight = torch.cat(
+            [loaded_weight.unflatten(0, (-1, head_dim)), padded_weight], dim=0
+        ).flatten(0, 1)
+    elif "attn.proj.weight" in name:
+        padded_weight = loaded_weight.new_zeros(
+            loaded_weight.shape[0], head_dim * num_dummy_heads
+        )
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+    elif "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+        padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+    return loaded_weight
diff --git a/python/sglang/srt/layers/attention/wave_backend.py b/python/sglang/srt/layers/attention/wave_backend.py
new file mode 100644
index 000000000..eb6e061ac
--- /dev/null
+++ b/python/sglang/srt/layers/attention/wave_backend.py
@@ -0,0 +1,627 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import get_bool_env_var, get_device_core_count
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def get_num_kv_splits_triton(
+    num_kv_splits_ptr,
+    seq_lens_ptr,
+    num_seq,
+    num_group,
+    num_head,
+    num_kv_head,
+    max_kv_splits,
+    device_core_count,
+    MAX_NUM_SEQ: tl.constexpr,
+):
+    # TODO: this method is tunable, we need more online serving data to tune it
+    offs_seq = tl.arange(0, MAX_NUM_SEQ)
+    mask_seq = offs_seq < num_seq
+
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
+    max_seq_len = tl.max(seq_lens)
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
+    min_seq_len = tl.min(seq_lens)
+    if max_seq_len * 8 < min_seq_len * 10:
+        min_seq_len = max_seq_len
+    max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
+    kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
+
+    # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
+    ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
+    ext_device_core_count = tl.cast(
+        device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
+    )
+    block_h, num_kv_group = 16, num_head // num_kv_head
+    if num_kv_group == 1:
+        token_grid = num_seq * num_group * num_head
+    else:
+        # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
+        block_h = tl.minimum(block_h, num_kv_group)
+        token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
+    max_kv_splits_2 = tl.minimum(
+        tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
+    )
+    kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
+
+    num_kv_splits = tl.maximum(
+        tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
+    )
+
+    offs_token = offs_seq * num_group
+    mask_token = offs_token < num_seq * num_group
+    for i in range(0, num_group):
+        tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
+
+
+@dataclass
+class ForwardMetadata:
+    attn_logits: torch.Tensor
+    attn_lse: torch.Tensor
+    max_extend_len: int
+    num_kv_splits: torch.Tensor
+    kv_indptr: torch.Tensor
+    kv_indices: torch.Tensor
+    qo_indptr: torch.Tensor
+    custom_mask: torch.Tensor
+    mask_indptr: torch.Tensor
+
+
+class WaveAttnBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+    ):
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.wave_ops.decode_attention import (
+            decode_attention_fwd,
+        )
+        from sglang.srt.layers.attention.wave_ops.extend_attention import (
+            extend_attention_wave,
+        )
+
+        super().__init__()
+
+        # Set unique cache dir for each process to avoid cache write races
+        import wave_lang.kernel.wave.cache as cache
+
+        base_cache_dir = cache.CACHE_BASE_DIR
+        new_dir = base_cache_dir / f"worker_{model_runner.tp_rank}"
+        logger.info(f"Setting Wave cache dir: {new_dir}")
+        cache.CACHE_BASE_DIR = new_dir
+
+        self.decode_attention_fwd = decode_attention_fwd
+        self.extend_attention_fwd = extend_attention_wave
+
+        self.skip_prefill = skip_prefill
+
+        max_bs = model_runner.req_to_token_pool.size
+
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+
+        if not self.skip_prefill:
+            self.qo_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+            self.mask_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int64, device=model_runner.device
+            )
+
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+
+        self.static_kv_splits = get_bool_env_var(
+            "SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS", "false"
+        )
+        self.max_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
+        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]
+
+        self.forward_metadata: ForwardMetadata = None
+
+        self.max_context_len = model_runner.model_config.context_len
+
+        self.device = model_runner.device
+        self.device_core_count = get_device_core_count(model_runner.gpu_id)
+
+    def get_num_kv_splits(
+        self,
+        num_kv_splits: torch.Tensor,
+        seq_lens: torch.Tensor,
+    ):
+        num_token, num_seq = num_kv_splits.shape[0], seq_lens.shape[0]
+        num_group = num_token // num_seq
+
+        assert (
+            num_group * num_seq == num_token
+        ), f"num_seq({num_seq}), num_token({num_token}), something goes wrong!"
+
+        if self.static_kv_splits or self.device_core_count <= 0:
+            num_kv_splits.fill_(self.max_kv_splits)
+            return
+
+        if num_seq < 256:
+            SCHEDULE_SEQ = 256
+        else:
+            SCHEDULE_SEQ = triton.next_power_of_2(num_seq)
+
+        get_num_kv_splits_triton[(1,)](
+            num_kv_splits,
+            seq_lens,
+            num_seq,
+            num_group,
+            self.num_head,
+            self.num_kv_head,
+            self.max_kv_splits,
+            self.device_core_count,
+            MAX_NUM_SEQ=SCHEDULE_SEQ,
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for wave attention backend."""
+
+        bs = forward_batch.batch_size
+        kv_indptr = self.kv_indptr
+        spec_info = forward_batch.spec_info
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    forward_batch.seq_lens_sum, dtype=torch.int32, device=self.device
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+                bs = kv_indptr.shape[0] - 1
+
+            from sglang.srt.layers.attention.wave_ops.decode_attention import (
+                decode_attention_intermediate_arrays_shapes,
+            )
+
+            attn_logits_shape, attn_logits_max_shape = (
+                decode_attention_intermediate_arrays_shapes(
+                    bs, self.v_head_dim, self.num_head, self.max_kv_splits
+                )
+            )
+            attn_logits = torch.empty(
+                attn_logits_shape,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            attn_lse = torch.empty(
+                attn_logits_max_shape,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            num_kv_splits = torch.empty((bs,), dtype=torch.int32, device=self.device)
+
+            self.get_num_kv_splits(num_kv_splits, forward_batch.seq_lens)
+
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+            max_extend_len = None
+        elif forward_batch.forward_mode.is_target_verify():
+            bs = len(forward_batch.req_pool_indices)
+            qo_indptr = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            # Different with flashinfer kv_indptr and kv_indices construction
+            kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                kv_indptr[-1], dtype=torch.int32, device=self.device
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            custom_mask = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (
+                forward_batch.seq_lens + self.num_draft_tokens
+            )
+            mask_indptr = self.mask_indptr
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len[:bs], dim=0)
+            mask_indptr = mask_indptr[: bs + 1]
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        elif forward_batch.forward_mode.is_draft_extend():
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    None,
+                    self.req_to_token,
+                )
+            )
+            mask_indptr = None
+            # TODO(FIXME): This will trigger an invalid Eagle tree when using
+            # `max(spec_info.accept_length_cpu)`.
+            # It might have been forgotten to update somewhere.
+            max_extend_len = torch.max(spec_info.accept_length).item()
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            kv_indptr[1 : bs + 1] = torch.cumsum(
+                forward_batch.extend_prefix_lens, dim=0
+            )
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                forward_batch.extend_prefix_lens.sum().item(),
+                dtype=torch.int32,
+                device=self.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.extend_prefix_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            qo_indptr = self.qo_indptr
+            qo_indptr[1 : bs + 1] = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+            mask_indptr = None
+            attn_logits = None
+            attn_lse = None
+            max_extend_len = torch.max(forward_batch.extend_seq_lens).item()
+            num_kv_splits = None
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+        )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        from sglang.srt.layers.attention.wave_ops.decode_attention import (
+            decode_attention_intermediate_arrays_shapes,
+        )
+
+        attn_logits_shape, attn_logits_max_shape = (
+            decode_attention_intermediate_arrays_shapes(
+                max_bs, self.v_head_dim, self.num_head, self.max_kv_splits
+            )
+        )
+        self.cuda_graph_attn_logits = torch.zeros(
+            attn_logits_shape,
+            dtype=torch.float32,
+            device=self.device,
+        )
+        self.cuda_graph_attn_lse = torch.zeros(
+            attn_logits_max_shape,
+            dtype=torch.float32,
+            device=self.device,
+        )
+        self.cuda_graph_num_kv_splits = torch.full(
+            (max_bs,), self.max_kv_splits, dtype=torch.int32, device=self.device
+        )
+        if kv_indices_buf is None:
+            self.cuda_graph_kv_indices = torch.zeros(
+                (max_bs * self.max_context_len),
+                dtype=torch.int32,
+                device=self.device,
+            )
+        else:
+            self.cuda_graph_kv_indices = kv_indices_buf
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_bs * self.max_context_len),
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        assert encoder_lens is None, "Not supported"
+
+        if forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+            attn_logits = self.cuda_graph_attn_logits
+            attn_lse = self.cuda_graph_attn_lse
+            max_extend_len = None
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+        elif forward_mode.is_target_verify():
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            custom_mask = self.cuda_graph_custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph capture."
+            )
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        # NOTE: encoder_lens expected to be zeros or None
+        if forward_mode.is_decode_or_idle():
+            # Update kv_indptr, kv_indices
+            kv_indptr = self.kv_indptr
+            kv_indices = self.cuda_graph_kv_indices
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens[:bs], dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices[:bs],
+                    seq_lens[:bs],
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                num_token = bs
+            else:
+                kv_indptr[: spec_info.kv_indptr.shape[0]] = spec_info.kv_indptr
+                kv_indices[: spec_info.kv_indices.shape[0]] = spec_info.kv_indices
+                num_token = spec_info.kv_indptr.shape[0] - 1
+            self.get_num_kv_splits(num_kv_splits[:num_token], seq_lens[:bs])
+        elif forward_mode.is_target_verify():
+            # Update qo_indptr, kv_indptr, kv_indices, custom_mask, mask_indptr
+            bs = len(req_pool_indices)
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            custom_mask = self.cuda_graph_custom_mask
+            custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph replay."
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        max_extend_len = self.forward_metadata.max_extend_len
+        computed_max_ext_seq_len = torch.max(forward_batch.extend_seq_lens)
+        if computed_max_ext_seq_len != max_extend_len:
+            assert len(forward_batch.extend_seq_lens) == 1
+            forward_batch.extend_seq_lens[0] = max_extend_len
+            forward_batch.seq_lens = max_extend_len
+
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k.contiguous(),
+            v.contiguous(),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            self.forward_metadata.qo_indptr,
+            self.forward_metadata.kv_indptr,
+            self.forward_metadata.kv_indices,
+            self.forward_metadata.custom_mask,
+            self.forward_metadata.mask_indptr,
+            self.forward_metadata.max_extend_len,
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            is_causal=True,
+            layer_scaling=layer.scaling,
+            logit_cap=layer.logit_cap,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        self.decode_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            self.forward_metadata.kv_indptr,
+            self.forward_metadata.kv_indices,
+            self.forward_metadata.attn_logits,
+            self.forward_metadata.attn_lse,
+            self.forward_metadata.num_kv_splits,
+            self.max_kv_splits,
+            layer.scaling,
+            layer.logit_cap,
+        )
+        return o
diff --git a/python/sglang/srt/layers/attention/wave_ops/decode_attention.py b/python/sglang/srt/layers/attention/wave_ops/decode_attention.py
new file mode 100644
index 000000000..c76bee9af
--- /dev/null
+++ b/python/sglang/srt/layers/attention/wave_ops/decode_attention.py
@@ -0,0 +1,184 @@
+"""
+Memory-efficient attention for decoding.
+It supports page size = 1.
+"""
+
+import functools
+import logging
+
+from wave_lang.kernel.lang.global_symbols import *
+from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
+from wave_lang.kernel.wave.constraints import GenericDot, MMAOperand, MMAType
+from wave_lang.kernel.wave.templates.paged_decode_attention import (
+    get_paged_decode_attention_kernels,
+    get_paged_decode_intermediate_arrays_shapes,
+    paged_decode_attention_shape,
+)
+from wave_lang.kernel.wave.utils.general_utils import get_default_scheduling_params
+from wave_lang.kernel.wave.utils.run_utils import set_default_run_config
+
+logger = logging.getLogger(__name__)
+import os
+
+dump_generated_mlir = int(os.environ.get("WAVE_DUMP_MLIR", 0))
+
+
+@functools.lru_cache(maxsize=4096)
+def get_wave_kernel(
+    shape: paged_decode_attention_shape,
+    max_kv_splits,
+    input_dtype,
+    output_dtype,
+    logit_cap,
+):
+    mha = (shape.num_query_heads // shape.num_kv_heads) == 1
+
+    # Get the kernels (either compile or load from cache).
+    if mha:
+        mfma_variant = (
+            GenericDot(along_dim=MMAOperand.M, k_vec_size=4, k_mult=1),
+            GenericDot(along_dim=MMAOperand.M, k_vec_size=1, k_mult=64),
+        )
+    else:
+        mfma_variant = (MMAType.F32_16x16x16_F16, MMAType.F32_16x16x16_F16)
+
+    (
+        phase_0,
+        phase_1,
+        hyperparams_0,
+        hyperparams_1,
+        dynamic_symbols_0,
+        dynamic_symbols_1,
+    ) = get_paged_decode_attention_kernels(
+        shape,
+        mfma_variant,
+        max_kv_splits,
+        input_dtype=input_dtype,
+        output_dtype=output_dtype,
+        logit_cap=logit_cap,
+    )
+    hyperparams_0.update(get_default_scheduling_params())
+    hyperparams_1.update(get_default_scheduling_params())
+
+    options = WaveCompileOptions(
+        subs=hyperparams_0,
+        canonicalize=True,
+        run_bench=False,
+        use_buffer_ops=True,
+        waves_per_eu=2,
+        dynamic_symbols=dynamic_symbols_0,
+        wave_runtime=True,
+    )
+    options = set_default_run_config(options)
+    phase_0 = wave_compile(options, phase_0)
+
+    options = WaveCompileOptions(
+        subs=hyperparams_1,
+        canonicalize=True,
+        run_bench=False,
+        use_buffer_ops=False,
+        waves_per_eu=4,
+        dynamic_symbols=dynamic_symbols_1,
+        wave_runtime=True,
+    )
+    options = set_default_run_config(options)
+    phase_1 = wave_compile(options, phase_1)
+
+    return phase_0, phase_1
+
+
+def decode_attention_intermediate_arrays_shapes(
+    num_seqs, head_size_kv, num_query_heads, max_kv_splits
+):
+    # Not all fields are used, but we need to pass them to the function
+    shape = paged_decode_attention_shape(
+        num_query_heads=num_query_heads,
+        num_kv_heads=0,
+        head_size=0,
+        head_size_kv=head_size_kv,
+        block_size=0,
+        num_seqs=num_seqs,
+    )
+    return get_paged_decode_intermediate_arrays_shapes(shape, max_kv_splits)
+
+
+def decode_attention_wave(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    b_req_idx,
+    req_to_token,
+    attn_logits,
+    attn_logits_max,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap,
+):
+    num_seqs, num_query_heads, head_size = q.shape
+    _, num_kv_heads, _ = k_buffer.shape
+    _, _, head_size_kv = v_buffer.shape
+    block_size = 32
+    shape = paged_decode_attention_shape(
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        head_size_kv,
+        block_size,
+        num_seqs,
+    )
+
+    phase_0, phase_1 = get_wave_kernel(
+        shape, max_kv_splits, q.dtype, o.dtype, logit_cap
+    )
+
+    mb_qk = phase_0(
+        q,
+        k_buffer,
+        v_buffer,
+        b_req_idx,
+        req_to_token,
+        attn_logits,
+        attn_logits_max,
+    )
+    if dump_generated_mlir:
+        filename = f"wave_decode_attention_phase0_{'x'.join(map(str, shape))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb_qk.module_op.get_asm())
+
+    mb_sv = phase_1(attn_logits, attn_logits_max, b_req_idx, o)
+    if dump_generated_mlir:
+        filename = f"wave_decode_attention_phase1_{'x'.join(map(str, shape))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb_sv.module_op.get_asm())
+
+
+def decode_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    b_req_idx,
+    req_to_token,
+    attn_logits,
+    attn_logits_max,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap=0.0,
+):
+    decode_attention_wave(
+        q,
+        k_buffer,
+        v_buffer,
+        o,
+        b_req_idx,
+        req_to_token,
+        attn_logits,
+        attn_logits_max,
+        num_kv_splits,
+        max_kv_splits,
+        sm_scale,
+        logit_cap,
+    )
diff --git a/python/sglang/srt/layers/attention/wave_ops/extend_attention.py b/python/sglang/srt/layers/attention/wave_ops/extend_attention.py
new file mode 100644
index 000000000..27e674db2
--- /dev/null
+++ b/python/sglang/srt/layers/attention/wave_ops/extend_attention.py
@@ -0,0 +1,147 @@
+"""
+Memory-efficient attention for prefill.
+It support page size = 1.
+"""
+
+import functools
+import os
+
+import torch
+from wave_lang.kernel.lang.global_symbols import *
+from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
+from wave_lang.kernel.wave.constraints import MMAType
+from wave_lang.kernel.wave.scheduling.schedule import SchedulingType
+from wave_lang.kernel.wave.templates.attention_common import AttentionShape
+from wave_lang.kernel.wave.templates.extend_attention import get_extend_attention_kernel
+from wave_lang.kernel.wave.utils.general_utils import get_default_scheduling_params
+from wave_lang.kernel.wave.utils.run_utils import set_default_run_config
+
+dump_generated_mlir = int(os.environ.get("WAVE_DUMP_MLIR", 0))
+
+
+@functools.lru_cache
+def get_wave_kernel(
+    shape: AttentionShape,
+    q_shape: tuple[int],
+    k_shape: tuple[int],
+    v_shape: tuple[int],
+    k_cache_shape: tuple[int],
+    v_cache_shape: tuple[int],
+    o_shape: tuple[int],
+    input_dtype: torch.dtype,
+    output_dtype: torch.dtype,
+    size_dtype: torch.dtype,
+    is_causal: bool,
+    logit_cap: float,
+    layer_scaling: float,
+):
+    assert shape.num_query_heads % shape.num_kv_heads == 0
+
+    mfma_variant = (MMAType.F32_16x16x32_K8_F16, MMAType.F32_16x16x16_F16)
+    (
+        extend_attention,
+        hyperparams,
+        dynamic_symbols,
+    ) = get_extend_attention_kernel(
+        shape,
+        mfma_variant,
+        q_shape,
+        k_shape,
+        v_shape,
+        k_cache_shape,
+        v_cache_shape,
+        o_shape,
+        input_dtype=input_dtype,
+        output_dtype=output_dtype,
+        size_dtype=size_dtype,
+        is_causal=is_causal,
+        layer_scaling=layer_scaling,
+        logit_cap=logit_cap,
+    )
+
+    hyperparams.update(get_default_scheduling_params())
+    options = WaveCompileOptions(
+        subs=hyperparams,
+        canonicalize=True,
+        run_bench=False,
+        schedule=SchedulingType.NONE,
+        use_scheduling_barriers=False,
+        dynamic_symbols=dynamic_symbols,
+        use_buffer_ops=True,
+        waves_per_eu=2,
+        denorm_fp_math_f32="preserve-sign",
+        wave_runtime=True,
+    )
+    options = set_default_run_config(options)
+    extend_attention = wave_compile(options, extend_attention)
+
+    return extend_attention
+
+
+def extend_attention_wave(
+    q_extend,
+    k_extend,
+    v_extend,
+    k_buffer,
+    v_buffer,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    custom_mask,
+    mask_indptr,
+    max_seq_len,
+    output,
+    is_causal=True,
+    layer_scaling=None,
+    logit_cap=0,
+):
+    shape = AttentionShape(
+        num_query_heads=q_extend.shape[1],
+        num_kv_heads=k_extend.shape[1],
+        head_size=q_extend.shape[2],
+        head_size_kv=k_extend.shape[2],
+        num_seqs=kv_indptr.shape[0] - 1,
+        max_seq_len=max_seq_len,
+    )
+
+    # Run the wave kernel.
+    extend_attention = get_wave_kernel(
+        shape,
+        q_extend.shape,
+        k_extend.shape,
+        v_extend.shape,
+        k_buffer.shape,
+        v_buffer.shape,
+        output.shape,
+        input_dtype=q_extend.dtype,
+        output_dtype=output.dtype,
+        size_dtype=qo_indptr.dtype,
+        is_causal=is_causal,
+        layer_scaling=layer_scaling,
+        logit_cap=logit_cap,
+    )
+
+    mb = extend_attention(
+        q_extend,
+        k_extend,
+        v_extend,
+        k_buffer,
+        v_buffer,
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        max_seq_len,
+        output,
+    )
+
+    if dump_generated_mlir:
+        shape_list = [
+            q_extend.shape[0],
+            q_extend.shape[1],
+            k_extend.shape[1],
+            q_extend.shape[2],
+            k_extend.shape[2],
+        ]
+        filename = f"wave_prefill_attention_{'x'.join(map(str, shape_list))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb.module_op.get_asm())
diff --git a/python/sglang/srt/layers/attention/wave_ops/prefill_attention.py b/python/sglang/srt/layers/attention/wave_ops/prefill_attention.py
new file mode 100644
index 000000000..2d8aa4678
--- /dev/null
+++ b/python/sglang/srt/layers/attention/wave_ops/prefill_attention.py
@@ -0,0 +1,79 @@
+"""
+Memory-efficient attention for prefill.
+It support page size = 1.
+"""
+
+import math
+import os
+
+from wave_lang.kernel.lang.global_symbols import *
+from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
+from wave_lang.kernel.wave.constraints import MMAType
+from wave_lang.kernel.wave.templates.attention_common import AttentionShape
+from wave_lang.kernel.wave.templates.prefill_attention import (
+    get_prefill_attention_kernel,
+)
+from wave_lang.kernel.wave.utils.general_utils import get_default_scheduling_params
+from wave_lang.kernel.wave.utils.run_utils import set_default_run_config
+
+dump_generated_mlir = int(os.environ.get("WAVE_DUMP_MLIR", 0))
+
+
+def prefill_attention_wave(
+    q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=True
+):
+
+    shape = AttentionShape(
+        num_query_heads=q.shape[1],
+        num_kv_heads=k.shape[1],
+        head_size=q.shape[2],
+        head_size_kv=k.shape[2],
+        num_seqs=b_seq_len.shape[0],
+        max_seq_len=max_seq_len,
+        total_seq_len=q.shape[0],
+    )
+
+    assert shape.num_query_heads % shape.num_kv_heads == 0
+
+    output_shape = (shape.total_seq_len, shape.num_query_heads, shape.head_size_kv)
+    # Run the wave kernel.
+    mfma_variant = (MMAType.F32_16x16x16_F16, MMAType.F32_16x16x16_F16)
+    (prefill, hyperparams) = get_prefill_attention_kernel(
+        shape,
+        mfma_variant,
+        q.shape,
+        k.shape,
+        v.shape,
+        output_shape,
+        input_dtype=q.dtype,
+        output_dtype=o.dtype,
+        size_dtype=b_seq_len.dtype,
+    )
+
+    hyperparams.update(get_default_scheduling_params())
+
+    log2e = 1.44269504089
+    dk_sqrt = math.sqrt(1.0 / shape.head_size)
+
+    options = WaveCompileOptions(
+        subs=hyperparams,
+        canonicalize=True,
+        run_bench=False,
+        use_scheduling_barriers=False,
+    )
+    options = set_default_run_config(options)
+    prefill = wave_compile(options, prefill)
+
+    mb = prefill(
+        q * dk_sqrt * log2e,
+        k,
+        v,
+        b_start_loc,
+        b_seq_len,
+        o,
+    )
+    if dump_generated_mlir:
+        shape_list = [q.shape[0], q.shape[1], k.shape[1], q.shape[2], k.shape[2]]
+        filename = f"wave_prefill_attention_{'x'.join(map(str, shape_list))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb.module_op.get_asm())
diff --git a/python/sglang/srt/layers/communicator.py b/python/sglang/srt/layers/communicator.py
new file mode 100644
index 000000000..fba8d8f18
--- /dev/null
+++ b/python/sglang/srt/layers/communicator.py
@@ -0,0 +1,676 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from dataclasses import dataclass
+from enum import Enum, auto
+from functools import partial
+from typing import Dict, Optional
+
+import torch
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.dp_attention import (
+    attn_tp_all_gather_into_tensor,
+    attn_tp_reduce_scatter_tensor,
+    dp_gather_partial,
+    dp_reduce_scatter_tensor,
+    dp_scatter,
+    get_attention_dp_size,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    get_global_dp_buffer,
+    get_local_dp_buffer,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_cuda,
+    is_flashinfer_available,
+    is_gfx95_supported,
+    is_hip,
+    is_sm90_supported,
+    is_sm100_supported,
+)
+
+_is_flashinfer_available = is_flashinfer_available()
+_is_sm90_supported = is_cuda() and is_sm90_supported()
+_is_sm100_supported = is_cuda() and is_sm100_supported()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip()
+_is_gfx95_supported = is_gfx95_supported()
+
+if _use_aiter and _is_gfx95_supported:
+    from sglang.srt.layers.quantization.rocm_mxfp4_utils import fused_rms_mxfp4_quant
+
+FUSE_ALLREDUCE_MAX_BATCH_SIZE = 2048
+
+
+class ScatterMode(Enum):
+    """
+    Suppose we have TP=4, DP=2, enable-dp-attention, and the system handles seq a,b,c,d
+    Model input/output: [ab, ab, cd, cd] for four ranks respectively
+    SCATTERED: [a, b, c, d]
+    TP_ATTN_FULL: [ab, ab, cd, cd], i.e. all ranks inside a TP attn group have full data of the group
+    FULL: [abcd, abcd, abcd, abcd]
+    """
+
+    SCATTERED = auto()
+    TP_ATTN_FULL = auto()
+    FULL = auto()
+
+    @staticmethod
+    def model_input_output():
+        """The scatter mode for model forward pass input and output data"""
+        return ScatterMode.TP_ATTN_FULL
+
+
+@dataclass
+class _LayerModeComputationContext:
+    num_layers: int
+    layer_id: int
+    is_layer_sparse: bool
+    is_previous_layer_sparse: Optional[bool]
+
+    def previous_layer(self):
+        assert self.is_previous_layer_sparse is not None
+        return _LayerModeComputationContext(
+            layer_id=self.layer_id - 1,
+            is_layer_sparse=self.is_previous_layer_sparse,
+            is_previous_layer_sparse=None,
+            num_layers=self.num_layers,
+        )
+
+
+@dataclass
+class LayerScatterModes:
+    layer_input_mode: ScatterMode
+    attn_mode: ScatterMode
+    # Can be further split into e.g. mlp_input_mode and mlp_output_mode if needed
+    mlp_mode: ScatterMode
+    middle_residual_mode: ScatterMode
+    layer_output_mode: ScatterMode
+
+    @classmethod
+    def init_new(cls, **kwargs):
+        context = _LayerModeComputationContext(**kwargs)
+        return cls(
+            layer_input_mode=cls._compute_layer_input_mode(context),
+            attn_mode=ScatterMode.TP_ATTN_FULL,
+            mlp_mode=cls._compute_mlp_mode(context),
+            middle_residual_mode=cls._compute_middle_residual_mode(context),
+            layer_output_mode=cls._compute_layer_output_mode(context),
+        )
+
+    @classmethod
+    def _compute_layer_input_mode(cls, context: _LayerModeComputationContext):
+        if context.layer_id == 0:
+            return ScatterMode.model_input_output()
+        return cls._compute_layer_output_mode(context.previous_layer())
+
+    @classmethod
+    def _compute_mlp_mode(cls, context: _LayerModeComputationContext):
+        if context.is_layer_sparse:
+            return (
+                ScatterMode.SCATTERED
+                if (
+                    # Token dispatch/combine will be handled outside of LayerCommunicator for these modes.
+                    not get_moe_a2a_backend().is_none()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                )
+                else ScatterMode.FULL
+            )
+        else:
+            return (
+                ScatterMode.SCATTERED
+                if enable_moe_dense_fully_dp()
+                else ScatterMode.FULL
+            )
+
+    @classmethod
+    def _compute_middle_residual_mode(cls, context: _LayerModeComputationContext):
+        mlp_mode = cls._compute_mlp_mode(context)
+        if mlp_mode == ScatterMode.SCATTERED:
+            return ScatterMode.SCATTERED
+        if mlp_mode == ScatterMode.FULL:
+            return ScatterMode.TP_ATTN_FULL
+        raise NotImplementedError
+
+    @classmethod
+    def _compute_layer_output_mode(cls, context: _LayerModeComputationContext):
+        mlp_mode = cls._compute_mlp_mode(context)
+        if context.layer_id == context.num_layers - 1:
+            return ScatterMode.model_input_output()
+        if mlp_mode == ScatterMode.SCATTERED:
+            return ScatterMode.SCATTERED
+        if mlp_mode == ScatterMode.FULL:
+            return ScatterMode.TP_ATTN_FULL
+        raise NotImplementedError
+
+
+def enable_moe_dense_fully_dp():
+    return global_server_args_dict["moe_dense_tp_size"] == 1
+
+
+class LayerCommunicator:
+    def __init__(
+        self,
+        layer_scatter_modes: LayerScatterModes,
+        input_layernorm: torch.nn.Module,
+        post_attention_layernorm: torch.nn.Module,
+        # Reduce scatter requires skipping all-reduce in model code after MoE/MLP, so only enable for models which have that implemented. Remove flag once done for all models that use LayerCommunicator.
+        allow_reduce_scatter: bool = False,
+        is_last_layer: bool = False,
+    ):
+        self.layer_scatter_modes = layer_scatter_modes
+        self.input_layernorm = input_layernorm
+        self.post_attention_layernorm = post_attention_layernorm
+        self.allow_reduce_scatter = allow_reduce_scatter
+        self.is_last_layer = is_last_layer
+
+        self._context = CommunicateContext.init_new()
+        self._communicate_simple_fn = CommunicateSimpleFn.get_fn(
+            input_mode=self.layer_scatter_modes.layer_input_mode,
+            output_mode=self.layer_scatter_modes.attn_mode,
+            context=self._context,
+        )
+        self._communicate_with_all_reduce_and_layer_norm_fn = (
+            CommunicateWithAllReduceAndLayerNormFn.get_fn(
+                hidden_states_input_mode=self.layer_scatter_modes.attn_mode,
+                residual_input_mode=self.layer_scatter_modes.layer_input_mode,
+                hidden_states_output_mode=self.layer_scatter_modes.mlp_mode,
+                residual_output_mode=self.layer_scatter_modes.middle_residual_mode,
+                context=self._context,
+            )
+        )
+        self._communicate_summable_tensor_pair_fn = (
+            CommunicateSummableTensorPairFn.get_fn(
+                hidden_states_input_mode=self.layer_scatter_modes.mlp_mode,
+                residual_input_mode=self.layer_scatter_modes.middle_residual_mode,
+                output_mode=self.layer_scatter_modes.layer_output_mode,
+                context=self._context,
+            )
+        )
+
+    def prepare_attn(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        qaunt_format: str = "",
+    ):
+        if hidden_states.shape[0] == 0:
+            residual = hidden_states
+        else:
+            if (
+                residual is not None
+                and hasattr(hidden_states, "_sglang_needs_allreduce_fusion")
+                and hidden_states._sglang_needs_allreduce_fusion
+            ):
+                hidden_states, residual = (
+                    self.input_layernorm.forward_with_allreduce_fusion(
+                        hidden_states, residual
+                    )
+                )
+            else:
+                if residual is None:
+                    residual = hidden_states
+
+                    if _use_aiter and _is_gfx95_supported and ("mxfp4" in qaunt_format):
+                        hidden_states = fused_rms_mxfp4_quant(
+                            hidden_states,
+                            self.input_layernorm.weight,
+                            self.input_layernorm.variance_epsilon,
+                            None,
+                            None,
+                            None,
+                            None,
+                        )
+                    else:
+                        hidden_states = self.input_layernorm(hidden_states)
+                else:
+                    if _use_aiter and _is_gfx95_supported and ("mxfp4" in qaunt_format):
+                        hidden_states, residual = fused_rms_mxfp4_quant(
+                            hidden_states,
+                            self.input_layernorm.weight,
+                            self.input_layernorm.variance_epsilon,
+                            None,
+                            None,
+                            None,
+                            residual,
+                        )
+                    else:
+                        hidden_states, residual = self.input_layernorm(
+                            hidden_states, residual
+                        )
+
+        hidden_states = self._communicate_simple_fn(
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            context=self._context,
+        )
+
+        return hidden_states, residual
+
+    def prepare_mlp(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        return self._communicate_with_all_reduce_and_layer_norm_fn(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            layernorm=self.post_attention_layernorm,
+            context=self._context,
+        )
+
+    def postprocess_layer(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        return self._communicate_summable_tensor_pair_fn(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            context=self._context,
+            allow_reduce_scatter=self.allow_reduce_scatter,
+        )
+
+    def should_use_reduce_scatter(self, forward_batch: ForwardBatch):
+        return (
+            self.allow_reduce_scatter
+            and self._communicate_summable_tensor_pair_fn
+            is CommunicateSummableTensorPairFn._scatter_hidden_states
+            and forward_batch.dp_padding_mode.is_max_len()
+        )
+
+    def should_fuse_mlp_allreduce_with_next_layer(
+        self, forward_batch: ForwardBatch
+    ) -> bool:
+        speculative_algo = global_server_args_dict.get("speculative_algorithm", None)
+        if (
+            is_dp_attention_enabled()
+            and speculative_algo is not None
+            and speculative_algo.is_eagle()
+        ):
+            return False
+
+        batch_size = (
+            forward_batch.input_ids.shape[0]
+            if hasattr(forward_batch, "input_ids")
+            else 0
+        )
+        if batch_size > FUSE_ALLREDUCE_MAX_BATCH_SIZE:
+            return False
+
+        static_conditions_met = (
+            (not self.is_last_layer)
+            and (self._context.tp_size > 1)
+            and global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False)
+            and _is_flashinfer_available
+        )
+
+        if not static_conditions_met:
+            return False
+
+        return (
+            batch_size > 0
+            and batch_size <= FUSE_ALLREDUCE_MAX_BATCH_SIZE
+            and (not self.is_last_layer)
+        )
+
+
+@dataclass
+class CommunicateContext:
+    process_group_sizes: Dict[ScatterMode, int]
+    attn_tp_rank: int
+    attn_tp_size: int
+    attn_dp_size: int
+    tp_size: int
+
+    def is_same_group_size(self, a: ScatterMode, b: ScatterMode):
+        return self.process_group_sizes[a] == self.process_group_sizes[b]
+
+    @classmethod
+    def init_new(cls):
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        attn_dp_size = get_attention_dp_size()
+        tp_size = get_tensor_model_parallel_world_size()
+        process_group_sizes = {
+            ScatterMode.SCATTERED: 1,
+            ScatterMode.TP_ATTN_FULL: attn_tp_size,
+            # TODO: support --moe-dense-tp-size > 1
+            ScatterMode.FULL: tp_size,
+        }
+        return cls(
+            process_group_sizes=process_group_sizes,
+            attn_tp_rank=attn_tp_rank,
+            attn_tp_size=attn_tp_size,
+            attn_dp_size=attn_dp_size,
+            tp_size=tp_size,
+        )
+
+
+class CommunicateSimpleFn:
+    @staticmethod
+    def get_fn(
+        input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+        if context.is_same_group_size(input_mode, output_mode):
+            return CommunicateSimpleFn._trivial
+
+        if (input_mode == ScatterMode.SCATTERED) and (
+            output_mode == ScatterMode.TP_ATTN_FULL
+        ):
+            return CommunicateSimpleFn._scattered_to_tp_attn_full
+
+        raise NotImplementedError(f"{input_mode=} {output_mode=}")
+
+    @staticmethod
+    def _trivial(
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ) -> torch.Tensor:
+        return hidden_states
+
+    @staticmethod
+    def _scattered_to_tp_attn_full(
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ) -> torch.Tensor:
+        hidden_states, local_hidden_states = (
+            get_local_dp_buffer(),
+            hidden_states,
+        )
+        attn_tp_all_gather_into_tensor(
+            hidden_states,
+            local_hidden_states,
+        )
+        return hidden_states
+
+
+class CommunicateWithAllReduceAndLayerNormFn:
+    """Besides communication, needs to
+    1. All reduce in tp_attn_group on hidden_states
+    2. Apply layer norm
+    """
+
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        hidden_states_output_mode: ScatterMode,
+        residual_output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+
+        if (
+            context.is_same_group_size(
+                hidden_states_input_mode, hidden_states_output_mode
+            )
+            and context.is_same_group_size(residual_input_mode, residual_output_mode)
+            and context.attn_tp_size == 1
+        ):
+            return CommunicateWithAllReduceAndLayerNormFn._simple
+
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (
+                residual_input_mode in [ScatterMode.SCATTERED, ScatterMode.TP_ATTN_FULL]
+            )
+            and (hidden_states_output_mode == ScatterMode.FULL)
+            and (residual_output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            return partial(
+                CommunicateWithAllReduceAndLayerNormFn._gather_hidden_states_and_residual,
+                residual_input_mode=residual_input_mode,
+            )
+
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (
+                residual_input_mode in [ScatterMode.SCATTERED, ScatterMode.TP_ATTN_FULL]
+            )
+            and (hidden_states_output_mode == ScatterMode.SCATTERED)
+            and (residual_output_mode == ScatterMode.SCATTERED)
+        ):
+            return partial(
+                CommunicateWithAllReduceAndLayerNormFn._scatter_hidden_states_and_residual,
+                residual_input_mode=residual_input_mode,
+            )
+
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {hidden_states_output_mode=} {residual_output_mode=}"
+        )
+
+    @staticmethod
+    def _simple(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+    ):
+        # TODO move these `if shape != 0` into LayerNorm itself
+        if hidden_states.shape[0] != 0:
+            hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+
+    @staticmethod
+    def _gather_hidden_states_and_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+        *,
+        residual_input_mode,
+    ):
+        if residual_input_mode == ScatterMode.SCATTERED and context.attn_tp_size > 1:
+            residual, local_residual = (
+                get_local_dp_buffer(),
+                residual,
+            )
+            attn_tp_all_gather_into_tensor(residual, local_residual)
+        if context.attn_dp_size != 1:
+            if context.attn_tp_rank == 0:
+                hidden_states += residual
+
+            # Perform layernorm on smaller data before comm. Only valid when attn_tp_size is 1 (tp_size == dp_size)
+            use_layer_norm_before_gather = context.attn_tp_size == 1
+            if use_layer_norm_before_gather and hidden_states.shape[0] != 0:
+                residual = hidden_states
+                hidden_states = layernorm(hidden_states)
+            hidden_states, local_hidden_states = (
+                get_global_dp_buffer(),
+                hidden_states,
+            )
+            dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
+
+            if not use_layer_norm_before_gather:
+                dp_scatter(residual, hidden_states, forward_batch)
+                if hidden_states.shape[0] != 0:
+                    hidden_states = layernorm(hidden_states)
+        else:
+            # According to the discussion in https://github.com/flashinfer-ai/flashinfer/issues/1223#issuecomment-3047256465
+            # We set the max token num to 128 for allreduce fusion with min-latency case(use_oneshot=True).
+            if (
+                (_is_sm100_supported or _is_sm90_supported)
+                and _is_flashinfer_available
+                and hasattr(layernorm, "forward_with_allreduce_fusion")
+                and global_server_args_dict["enable_flashinfer_allreduce_fusion"]
+                and hidden_states.shape[0] <= 4096
+            ):
+                hidden_states, residual = layernorm.forward_with_allreduce_fusion(
+                    hidden_states, residual
+                )
+            else:
+                hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+                hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+
+    @staticmethod
+    def _scatter_hidden_states_and_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+        *,
+        residual_input_mode,
+    ):
+        input_hidden_states = hidden_states
+        hidden_states = hidden_states.tensor_split(context.attn_tp_size)[
+            context.attn_tp_rank
+        ]
+        attn_tp_reduce_scatter_tensor(hidden_states, input_hidden_states)
+        if residual_input_mode == ScatterMode.TP_ATTN_FULL:
+            residual = residual.tensor_split(context.attn_tp_size)[context.attn_tp_rank]
+        if hidden_states.shape[0] != 0:
+            hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+
+
+class CommunicateSummableTensorPairFn:
+    """It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed."""
+
+    @classmethod
+    def execute(
+        cls,
+        hidden_states_input_mode,
+        residual_input_mode,
+        output_mode,
+        context,
+        **kwargs,
+    ):
+        return cls.get_fn(
+            hidden_states_input_mode=hidden_states_input_mode,
+            residual_input_mode=residual_input_mode,
+            output_mode=output_mode,
+            context=context,
+        )(context=context, **kwargs)
+
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+        if context.is_same_group_size(
+            hidden_states_input_mode, output_mode
+        ) and context.is_same_group_size(residual_input_mode, output_mode):
+            return CommunicateSummableTensorPairFn._trivial
+
+        if (
+            (hidden_states_input_mode == ScatterMode.FULL)
+            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            return CommunicateSummableTensorPairFn._scatter_hidden_states
+
+        if (
+            (hidden_states_input_mode == ScatterMode.SCATTERED)
+            and (residual_input_mode == ScatterMode.SCATTERED)
+            and (output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            return CommunicateSummableTensorPairFn._gather
+
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (output_mode == ScatterMode.SCATTERED)
+        ):
+            return CommunicateSummableTensorPairFn._scatter
+
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {output_mode=}"
+        )
+
+    @staticmethod
+    def _trivial(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+        **kwargs,
+    ):
+        return hidden_states, residual
+
+    @staticmethod
+    def _scatter_hidden_states(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+        allow_reduce_scatter: bool = False,
+    ):
+        hidden_states, global_hidden_states = (
+            get_local_dp_buffer(),
+            hidden_states,
+        )
+        if allow_reduce_scatter and forward_batch.dp_padding_mode.is_max_len():
+            # When using padding, all_reduce is skipped after MLP and MOE and reduce scatter is used here instead.
+            dp_reduce_scatter_tensor(hidden_states, global_hidden_states)
+        else:
+            dp_scatter(hidden_states, global_hidden_states, forward_batch)
+        return hidden_states, residual
+
+    @staticmethod
+    def _gather(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+        **kwargs,
+    ):
+        hidden_states += residual
+        residual = None
+        hidden_states, local_hidden_states = (
+            get_local_dp_buffer(),
+            hidden_states,
+        )
+        attn_tp_all_gather_into_tensor(
+            hidden_states,
+            local_hidden_states,
+        )
+        return hidden_states, residual
+
+    @staticmethod
+    def _scatter(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ):
+        assert residual is None, "not yet handled residual!=None"
+        tensor_list = list(hidden_states.tensor_split(context.attn_tp_size))
+        hidden_states = tensor_list[context.attn_tp_rank]
+        return hidden_states, residual
diff --git a/python/sglang/srt/layers/dp_attention.py b/python/sglang/srt/layers/dp_attention.py
new file mode 100644
index 000000000..1250636eb
--- /dev/null
+++ b/python/sglang/srt/layers/dp_attention.py
@@ -0,0 +1,489 @@
+from __future__ import annotations
+
+import functools
+import logging
+from contextlib import contextmanager
+from enum import IntEnum, auto
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.distributed import (
+    GroupCoordinator,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+_ATTN_TP_GROUP: Optional[GroupCoordinator] = None
+_ATTN_TP_RANK: Optional[int] = None
+_ATTN_TP_SIZE: Optional[int] = None
+_ATTN_DP_RANK: Optional[int] = None
+_ATTN_DP_SIZE: Optional[int] = None
+_LOCAL_ATTN_DP_SIZE: Optional[int] = None
+_LOCAL_ATTN_DP_RANK: Optional[int] = None
+_ENABLE_DP_ATTENTION_FLAG: bool = False
+
+
+class DpPaddingMode(IntEnum):
+
+    # Padding tokens to max length and then gather tokens using `all_gather_into_tensor`
+    MAX_LEN = auto()
+    # Padding tokens to sum length and then gather tokens using `all_reduce`
+    SUM_LEN = auto()
+
+    def is_max_len(self):
+        return self == DpPaddingMode.MAX_LEN
+
+    def is_sum_len(self):
+        return self == DpPaddingMode.SUM_LEN
+
+    @classmethod
+    def get_dp_padding_mode(cls, global_num_tokens: List[int]) -> DpPaddingMode:
+        # we choose the mode that minimizes the communication cost
+        max_len = max(global_num_tokens)
+        sum_len = sum(global_num_tokens)
+        if sum_len * 2 > max_len * get_attention_dp_size():
+            return cls.MAX_LEN
+        else:
+            return cls.SUM_LEN
+
+    @classmethod
+    def get_default_mode_in_cuda_graph(cls) -> DpPaddingMode:
+        return cls.MAX_LEN
+
+
+class _DpGatheredBufferWrapper:
+
+    _hidden_size: int
+    _dtype: torch.dtype
+    _device: torch.device
+    _global_dp_buffer_len: int
+    _local_dp_buffer_len: int
+    _global_num_tokens: Optional[List[int]]
+
+    @classmethod
+    def set_metadata(cls, hidden_size: int, dtype: torch.dtype, device: torch.device):
+        cls._hidden_size = hidden_size
+        cls._dtype = dtype
+        cls._device = device
+
+    @classmethod
+    def set_dp_buffer_len(
+        cls,
+        global_dp_buffer_len: int,
+        local_dp_buffer_len: int,
+        global_num_tokens: Optional[List[int]] = None,
+    ):
+        cls._global_dp_buffer_len = global_dp_buffer_len
+        cls._local_dp_buffer_len = local_dp_buffer_len
+        cls._global_num_tokens = global_num_tokens
+
+    @classmethod
+    def get_global_dp_buffer(cls) -> torch.Tensor:
+        return torch.empty(
+            (cls._global_dp_buffer_len, cls._hidden_size),
+            dtype=cls._dtype,
+            device=cls._device,
+        )
+
+    @classmethod
+    def get_local_dp_buffer(cls) -> torch.Tensor:
+        return torch.empty(
+            (cls._local_dp_buffer_len, cls._hidden_size),
+            dtype=cls._dtype,
+            device=cls._device,
+        )
+
+    @classmethod
+    def get_global_dp_buffer_len(cls) -> int:
+        return cls._global_dp_buffer_len
+
+    @classmethod
+    def get_local_dp_buffer_len(cls) -> int:
+        return cls._local_dp_buffer_len
+
+    @classmethod
+    def get_dp_global_num_tokens(cls) -> List[int]:
+        return cls._global_num_tokens
+
+
+def set_dp_buffer_len(
+    global_dp_buffer_len: int,
+    local_dp_buffer_len: int,
+    global_num_tokens: Optional[List[int]] = None,
+):
+    _DpGatheredBufferWrapper.set_dp_buffer_len(
+        global_dp_buffer_len, local_dp_buffer_len, global_num_tokens
+    )
+
+
+def get_global_dp_buffer() -> torch.Tensor:
+    return _DpGatheredBufferWrapper.get_global_dp_buffer()
+
+
+def get_local_dp_buffer() -> torch.Tensor:
+    return _DpGatheredBufferWrapper.get_local_dp_buffer()
+
+
+def get_global_dp_buffer_len() -> int:
+    return _DpGatheredBufferWrapper.get_global_dp_buffer_len()
+
+
+def get_local_dp_buffer_len() -> int:
+    return _DpGatheredBufferWrapper.get_local_dp_buffer_len()
+
+
+def get_dp_global_num_tokens() -> List[int]:
+    return _DpGatheredBufferWrapper.get_dp_global_num_tokens()
+
+
+def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
+    if not enable_dp_attention:
+        return tp_rank, tp_size, 0
+
+    attn_tp_size = tp_size // dp_size
+    attn_dp_rank = tp_rank // attn_tp_size
+    attn_tp_rank = tp_rank % attn_tp_size
+
+    return attn_tp_rank, attn_tp_size, attn_dp_rank
+
+
+def compute_dp_attention_local_info(
+    enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+):
+    if not enable_dp_attention:
+        return tp_rank, tp_size, 0
+
+    local_tp_size = moe_dense_tp_size if moe_dense_tp_size else tp_size
+    local_tp_rank = tp_rank % local_tp_size
+    local_dp_size = max(1, dp_size // (tp_size // local_tp_size))
+
+    local_attn_tp_size = local_tp_size // local_dp_size
+    local_attn_dp_rank = local_tp_rank // local_attn_tp_size
+    local_attn_tp_rank = local_tp_rank % local_attn_tp_size
+
+    return local_attn_tp_rank, local_attn_tp_size, local_attn_dp_rank
+
+
+def initialize_dp_attention(
+    server_args: ServerArgs,
+    model_config: ModelConfig,
+):
+    global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK, _ATTN_DP_SIZE
+    global _LOCAL_ATTN_DP_SIZE, _LOCAL_ATTN_DP_RANK, _ENABLE_DP_ATTENTION_FLAG
+
+    from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP
+
+    enable_dp_attention = server_args.enable_dp_attention
+    tp_size = server_args.tp_size
+    dp_size = server_args.dp_size
+    moe_dense_tp_size = server_args.moe_dense_tp_size
+    pp_size = server_args.pp_size
+
+    tp_rank = get_tensor_model_parallel_rank()
+
+    _ENABLE_DP_ATTENTION_FLAG = enable_dp_attention
+
+    _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK = compute_dp_attention_world_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size
+    )
+    _, _, _LOCAL_ATTN_DP_RANK = compute_dp_attention_local_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+    )
+
+    if enable_dp_attention:
+        _ATTN_DP_SIZE = dp_size
+        if moe_dense_tp_size is None:
+            _LOCAL_ATTN_DP_SIZE = _ATTN_DP_SIZE
+        else:
+            _LOCAL_ATTN_DP_SIZE = max(1, dp_size // (tp_size // moe_dense_tp_size))
+    else:
+        _ATTN_DP_SIZE = 1
+        _LOCAL_ATTN_DP_SIZE = 1
+
+    tp_group = get_tp_group()
+    _ATTN_TP_GROUP = GroupCoordinator(
+        [
+            list(range(head, head + _ATTN_TP_SIZE))
+            for head in range(0, pp_size * tp_size, _ATTN_TP_SIZE)
+        ],
+        tp_group.local_rank,
+        torch.distributed.get_backend(tp_group.device_group),
+        use_pynccl=SYNC_TOKEN_IDS_ACROSS_TP,
+        use_pymscclpp=False,
+        use_custom_allreduce=False,
+        use_hpu_communicator=False,
+        use_xpu_communicator=False,
+        use_npu_communicator=False,
+        group_name="attention_tp",
+    )
+
+    _DpGatheredBufferWrapper.set_metadata(
+        hidden_size=model_config.hidden_size,
+        dtype=model_config.dtype,
+        device=torch.device(server_args.device),
+    )
+
+
+def is_dp_attention_enabled() -> bool:
+    return _ENABLE_DP_ATTENTION_FLAG
+
+
+def get_attention_tp_group() -> GroupCoordinator:
+    assert _ATTN_TP_GROUP is not None, "dp attention not initialized!"
+    return _ATTN_TP_GROUP
+
+
+def get_attention_tp_rank() -> int:
+    assert _ATTN_TP_RANK is not None, "dp attention not initialized!"
+    return _ATTN_TP_RANK
+
+
+def get_attention_tp_size() -> int:
+    assert _ATTN_TP_SIZE is not None, "dp attention not initialized!"
+    return _ATTN_TP_SIZE
+
+
+def get_attention_dp_rank() -> int:
+    assert _ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _ATTN_DP_RANK
+
+
+def get_attention_dp_size() -> int:
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _ATTN_DP_SIZE
+
+
+def get_local_attention_dp_rank() -> int:
+    assert _LOCAL_ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_RANK
+
+
+def get_local_attention_dp_size() -> int:
+    assert _LOCAL_ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_SIZE
+
+
+@contextmanager
+def disable_dp_size():
+    """Patch the tp group temporarily until this function ends.
+
+    This method is for draft workers of speculative decoding to run draft model
+    with different tp degree from that of target model workers.
+
+    Args:
+        tp_group (GroupCoordinator): the tp group coordinator
+    """
+    global _ATTN_DP_SIZE
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
+
+    old_dp_size = _ATTN_DP_SIZE
+    _ATTN_DP_SIZE = 1
+    try:
+        yield
+    finally:
+        _ATTN_DP_SIZE = old_dp_size
+
+
+def get_dp_local_info(forward_batch: ForwardBatch) -> Tuple[torch.Tensor, torch.Tensor]:
+    # `get_dp_local_info` is only called in global DP gather and scatter. We use global DP rank here.
+    dp_rank = get_attention_dp_rank()
+
+    if forward_batch.dp_local_start_pos is None:
+        cumtokens = torch.cumsum(forward_batch.global_num_tokens_gpu, dim=0)
+        if dp_rank == 0:
+            local_start_pos = torch.zeros_like(cumtokens[0])
+        else:
+            local_start_pos = cumtokens[dp_rank - 1]
+        local_num_tokens = forward_batch.global_num_tokens_gpu[dp_rank]
+
+        forward_batch.dp_local_start_pos = local_start_pos
+        forward_batch.dp_local_num_tokens = local_num_tokens
+
+    return forward_batch.dp_local_start_pos, forward_batch.dp_local_num_tokens
+
+
+@triton.jit
+def memcpy_triton_kernel(
+    dst_ptr,
+    src_ptr,
+    offset_ptr,
+    sz_ptr,
+    offset_src: tl.constexpr,
+    chunk_size,  # multiplied for offset and sz
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0).to(tl.int64)
+    offset = tl.load(offset_ptr).to(tl.int64) * chunk_size
+    sz = tl.load(sz_ptr).to(tl.int64) * chunk_size
+
+    start_index = pid * BLOCK_SIZE
+    offs = tl.arange(0, BLOCK_SIZE)
+    mask = start_index + offs < sz
+
+    if offset_src:
+        data = tl.load(src_ptr + offset + start_index + offs, mask=mask)
+        tl.store(dst_ptr + start_index + offs, data, mask=mask)
+    else:
+        data = tl.load(src_ptr + start_index + offs, mask=mask)
+        tl.store(dst_ptr + offset + start_index + offs, data, mask=mask)
+
+
+def prod(x):
+    return functools.reduce(lambda a, b: a * b, x, 1)
+
+
+def memcpy_triton(dst, src, dim, offset, sz, offset_src):
+    max_size = min(src.numel(), dst.numel())
+    assert dim == 0, "dim != 0 unsupported"
+    assert src.shape[1:] == dst.shape[1:], "src and dst must have same shape"
+    chunk_size = prod(src.shape[1:])
+    BLOCK_SIZE = 8192
+    grid = (triton.cdiv(max_size, BLOCK_SIZE),)
+
+    memcpy_triton_kernel[grid](dst, src, offset, sz, offset_src, chunk_size, BLOCK_SIZE)
+
+
+def _dp_gather_via_all_reduce(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+    is_partial: bool,
+):
+    local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
+
+    global_tokens.fill_(0)
+    assert local_tokens.is_contiguous()
+    assert global_tokens.is_contiguous()
+
+    if local_tokens.shape[0] > 0 and (is_partial or get_attention_tp_rank() == 0):
+        assert (
+            local_tokens.untyped_storage() is not global_tokens.untyped_storage()
+        ), "aliasing between global_tokens and local_tokens not allowed"
+
+        memcpy_triton(
+            global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
+        )
+
+    # Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
+    NUM_GPUS_PER_NODE = 8
+    if (
+        not local_tokens.dtype.is_floating_point
+        and get_tensor_model_parallel_world_size() <= NUM_GPUS_PER_NODE
+    ):
+        torch.ops.sglang.inplace_all_reduce(
+            global_tokens, group_name=get_tp_group().unique_name
+        )
+
+    else:
+        global_tokens[:] = tensor_model_parallel_all_reduce(global_tokens)
+
+
+def _dp_gather_via_all_gather(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+    is_partial: bool,
+):
+    if get_attention_tp_size() == 1:
+        get_tp_group().all_gather_into_tensor(global_tokens, local_tokens)
+        return
+
+    if not is_partial:
+        if get_attention_tp_rank() != 0:
+            local_tokens.fill_(0)
+    scattered_local_tokens = local_tokens.tensor_split(get_attention_tp_size())[
+        get_attention_tp_rank()
+    ]
+    get_attention_tp_group().reduce_scatter_tensor(scattered_local_tokens, local_tokens)
+    get_tp_group().all_gather_into_tensor(global_tokens, scattered_local_tokens)
+
+
+def _dp_gather(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+    is_partial: bool,
+):
+    if forward_batch.dp_padding_mode.is_max_len():
+        _dp_gather_via_all_gather(
+            global_tokens, local_tokens, forward_batch, is_partial
+        )
+    else:
+        _dp_gather_via_all_reduce(
+            global_tokens, local_tokens, forward_batch, is_partial
+        )
+
+
+def dp_gather_partial(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+):
+    _dp_gather(global_tokens, local_tokens, forward_batch, is_partial=True)
+
+
+def dp_gather_replicate(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+):
+    _dp_gather(global_tokens, local_tokens, forward_batch, is_partial=False)
+
+
+def dp_scatter(
+    local_tokens: torch.Tensor,  # output
+    global_tokens: torch.Tensor,  # input
+    forward_batch: ForwardBatch,
+):
+    # local_num_tokens is not necessarily the same as local_tokens.shape[0],
+    # since local_tokens may be padded for cuda graph
+    local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
+
+    local_tokens.fill_(0)
+    assert local_tokens.is_contiguous()
+    assert global_tokens.is_contiguous()
+    if local_tokens.shape[0] > 0:
+        assert (
+            local_tokens.untyped_storage() is not global_tokens.untyped_storage()
+        ), "aliasing between local_tokens and global_tokens not allowed"
+
+        memcpy_triton(
+            local_tokens, global_tokens, 0, local_start_pos, local_num_tokens, True
+        )
+
+
+def dp_reduce_scatter_tensor(output: torch.Tensor, input: torch.Tensor):
+    if get_tensor_model_parallel_world_size() == get_attention_dp_size():
+        get_tp_group().reduce_scatter_tensor(output, input)
+    else:
+        scattered_local_tokens = input.tensor_split(
+            get_tensor_model_parallel_world_size()
+        )[get_tensor_model_parallel_rank()]
+        get_tp_group().reduce_scatter_tensor(scattered_local_tokens, input)
+        get_attention_tp_group().all_gather_into_tensor(output, scattered_local_tokens)
+
+
+def attn_tp_reduce_scatter_tensor(output: torch.Tensor, input: torch.Tensor):
+    return get_attention_tp_group().reduce_scatter_tensor(output, input)
+
+
+def attn_tp_all_gather_into_tensor(output: torch.Tensor, input: torch.Tensor):
+    return get_attention_tp_group().all_gather_into_tensor(output, input)
+
+
+def attn_tp_all_gather(output_list: List[torch.Tensor], input: torch.Tensor):
+    return get_attention_tp_group().all_gather(input, output_tensor_list=output_list)
diff --git a/python/sglang/srt/layers/elementwise.py b/python/sglang/srt/layers/elementwise.py
new file mode 100644
index 000000000..e05d88b32
--- /dev/null
+++ b/python/sglang/srt/layers/elementwise.py
@@ -0,0 +1,582 @@
+from typing import Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+
+fused_softcap_autotune = triton.autotune(
+    configs=[
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 256}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 256}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 32768}, num_warps=32),
+    ],
+    key=["n_ele"],
+)
+
+
+@triton.jit
+def fused_softcap_kernel(
+    output_ptr,
+    input_ptr,
+    n_ele,
+    softcap_const: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_ele
+    x = tl.load(input_ptr + offsets, mask=mask)
+    fx = x.to(tl.float32)
+    fxs = fx / softcap_const
+    exped = tl.exp(2 * fxs)
+    top = exped - 1
+    bottom = exped + 1
+    output = top / bottom * softcap_const
+    tl.store(output_ptr + offsets, output, mask=mask)
+
+
+fused_softcap_kernel_autotuned = fused_softcap_autotune(fused_softcap_kernel)
+
+
+def fused_softcap(x, softcap_const, autotune=False):
+    output = torch.empty_like(x, dtype=torch.float32)
+    n_elements = output.numel()
+    if autotune:
+        grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+        fused_softcap_kernel_autotuned[grid](output, x, n_elements, softcap_const)
+    else:
+        fused_softcap_kernel[(triton.cdiv(n_elements, 128),)](
+            output, x, n_elements, softcap_const, BLOCK_SIZE=128, num_warps=8
+        )
+    return output
+
+
+# cast to float + softcap
+class Softcap:
+    def __init__(self, softcap_const: float):
+        self.softcap_const = softcap_const
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if x.is_cuda:
+            return self.forward_cuda(x)
+        else:
+            return self.forward_native(x)
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.tanh(x.float() / self.softcap_const) * self.softcap_const
+
+    def forward_cuda(self, x: torch.Tensor, autotune=False) -> torch.Tensor:
+        return fused_softcap(x, self.softcap_const, autotune=autotune)
+
+
+rmsnorm_autotune = triton.autotune(
+    configs=[
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32, num_stages=4),
+    ],
+    key=["hidden_dim"],
+)
+
+
+@triton.jit
+def fused_dual_residual_rmsnorm_kernel(
+    output_ptr,
+    mid_ptr,
+    activ_ptr,
+    residual_ptr,
+    weight1_ptr,
+    weight2_ptr,
+    eps: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    input_start = pid * hidden_dim
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+
+    a_ = tl.load(activ_ptr + input_start + offsets, mask=mask, other=0.0)
+    a = a_.to(tl.float32)
+    rms = tl.sqrt(tl.sum(a * a, axis=0) / hidden_dim + eps)
+
+    r = tl.load(residual_ptr + input_start + offsets, mask=mask, other=0.0)
+    w1_ = tl.load(weight1_ptr + offsets, mask=mask, other=0.0)
+    w1 = w1_.to(tl.float32)
+
+    a2r = r + (a / rms * w1).to(r.dtype)
+    tl.store(
+        mid_ptr + input_start + offsets,
+        a2r,
+        mask=mask,
+    )
+
+    a2r = a2r.to(tl.float32)
+    rms2 = tl.sqrt(tl.sum(a2r * a2r, axis=0) / hidden_dim + eps)
+
+    w2_ = tl.load(weight2_ptr + offsets, mask=mask, other=0.0)
+    w2 = w2_.to(tl.float32)
+
+    tl.store(
+        output_ptr + input_start + offsets,
+        a2r / rms2 * w2,  # implicitly casts to output dtype here
+        mask=mask,
+    )
+
+
+fused_dual_residual_rmsnorm_kernel_autotune = rmsnorm_autotune(
+    fused_dual_residual_rmsnorm_kernel
+)
+
+
+def fused_dual_residual_rmsnorm(x, residual, weight1, weight2, eps, autotune=False):
+    assert len(x.shape) == 2
+    assert x.shape == residual.shape and x.dtype == residual.dtype
+    output, mid = torch.empty_like(x), torch.empty_like(x)
+    bs, hidden_dim = x.shape
+    if autotune:
+        fused_dual_residual_rmsnorm_kernel_autotune[(bs,)](
+            output, mid, x, residual, weight1, weight2, eps=eps, hidden_dim=hidden_dim
+        )
+    else:
+        max_warps = 16 if _is_hip else 32
+        config = {
+            "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+            "num_warps": max(
+                min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
+            ),
+        }
+
+        fused_dual_residual_rmsnorm_kernel[(bs,)](
+            output,
+            mid,
+            x,
+            residual,
+            weight1,
+            weight2,
+            eps=eps,
+            hidden_dim=hidden_dim,
+            **config,
+        )
+
+    return output, mid
+
+
+@triton.jit
+def fused_rmsnorm_kernel(
+    output_ptr,
+    activ_ptr,
+    weight_ptr,
+    eps: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    input_start = pid * hidden_dim
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+
+    a_ = tl.load(activ_ptr + input_start + offsets, mask=mask, other=0.0)
+    a = a_.to(tl.float32)
+    rms = tl.sqrt(tl.sum(a * a, axis=0) / hidden_dim + eps)
+
+    w1_ = tl.load(weight_ptr + offsets, mask=mask, other=0.0)
+    w1 = w1_.to(tl.float32)
+
+    a_rms = a / rms * w1
+
+    tl.store(
+        output_ptr + input_start + offsets,
+        a_rms,  # implicitly casts to output dtype here
+        mask=mask,
+    )
+
+
+def fused_rmsnorm(x, weight, eps, autotune=False, inplace=False):
+    assert len(x.shape) == 2
+    if inplace:
+        output = x
+    else:
+        output = torch.empty_like(x)
+    bs, hidden_dim = x.shape
+    max_warps = 16 if _is_hip else 32
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
+        ),
+    }
+
+    fused_rmsnorm_kernel[(bs,)](
+        output, x, weight, eps=eps, hidden_dim=hidden_dim, **config
+    )
+    return output
+
+
+class FusedDualResidualRMSNorm:
+    """
+    Fused implementation of
+    y = RMSNorm2(RMSNorm1(x) + residual))
+    """
+
+    def __init__(self, rmsnorm1, rmsnorm2) -> None:  # the one after rmsnorm1
+        self.rmsnorm1 = rmsnorm1
+        self.rmsnorm2 = rmsnorm2
+        self.variance_epsilon = self.rmsnorm1.variance_epsilon
+        assert self.rmsnorm1.variance_epsilon == self.rmsnorm2.variance_epsilon
+        assert self.rmsnorm1.weight.shape == self.rmsnorm2.weight.shape
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def forward(
+        self, x: torch.Tensor, residual: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if x.is_cuda:
+            return self.forward_cuda(x, residual)
+        else:
+            return self.forward_flashinfer(x, residual)
+
+    def forward_cuda(
+        self, x: torch.Tensor, residual: torch.Tensor, autotune=False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return fused_dual_residual_rmsnorm(
+            x,
+            residual,
+            self.rmsnorm1.weight,
+            self.rmsnorm2.weight,
+            self.variance_epsilon,
+            autotune=autotune,
+        )
+
+    def forward_flashinfer(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        normed1 = self.rmsnorm1(x)
+        residual = normed1 + residual
+        return self.rmsnorm2(residual), residual
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        normed1 = self.rmsnorm1.forward_native(x)
+        residual = normed1 + residual
+        return self.rmsnorm2.forward_native(residual), residual
+
+
+@triton.jit
+def experts_combine_kernel(
+    out_hidden_states,
+    moe_hidden_states,
+    mlp_hidden_states,
+    combine_k: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_index_mlp = pid * hidden_dim
+    start_index_rmoe = pid * hidden_dim * combine_k
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+    combine_k_offsets = tl.arange(0, combine_k)
+
+    moe_x = tl.load(
+        moe_hidden_states
+        + start_index_rmoe
+        + combine_k_offsets[:, None] * hidden_dim
+        + offsets[None, :],
+        mask=mask[None, :],
+        other=0.0,
+    )
+    moe_x = tl.sum(moe_x, axis=0)
+    mlp_x = tl.load(mlp_hidden_states + start_index_mlp + offsets, mask=mask, other=0.0)
+    combined_x = (moe_x + mlp_x) / 1.4142135623730951
+
+    tl.store(out_hidden_states + start_index_mlp + offsets, combined_x, mask=mask)
+
+
+def experts_combine_triton(moe_hidden_states, mlp_hidden_states, output_buffer=None):
+    assert moe_hidden_states.is_contiguous()
+    assert mlp_hidden_states.is_contiguous()
+
+    if len(moe_hidden_states.shape) == 2:
+        combine_k = 1  # pre-combined
+    else:
+        combine_k = moe_hidden_states.shape[1]
+
+    if output_buffer is None:
+        out_hidden_states = torch.empty_like(mlp_hidden_states)
+    else:
+        flat_output_buffer = output_buffer.view(mlp_hidden_states.dtype).reshape(-1)
+        assert flat_output_buffer.numel() >= mlp_hidden_states.numel()
+        out_hidden_states = flat_output_buffer[: mlp_hidden_states.numel()].reshape(
+            mlp_hidden_states.shape
+        )
+
+    bs, hidden_dim = mlp_hidden_states.shape
+
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 1024)), 8), 4
+        ),
+    }
+
+    experts_combine_kernel[(bs,)](
+        out_hidden_states,
+        moe_hidden_states,
+        mlp_hidden_states,
+        combine_k,
+        hidden_dim,
+        **config,
+    )
+    return out_hidden_states
+
+
+# gelu on first half of vector
+@triton.jit
+def gelu_and_mul_kernel(
+    out_hidden_states_ptr,  # (bs, hidden_dim)
+    out_scales_ptr,  # (bs,)
+    hidden_states_ptr,  # (bs, hidden_dim * 2)
+    quant_max: tl.constexpr,
+    static_scale: tl.constexpr,
+    hidden_dim: tl.constexpr,  # the output hidden_dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    input_start = pid * hidden_dim * 2
+    output_start = pid * hidden_dim
+
+    input1_offs = tl.arange(0, BLOCK_SIZE)
+    mask = tl.arange(0, BLOCK_SIZE) < hidden_dim  # shared for input1, input3, output
+    input3_offs = hidden_dim + tl.arange(0, BLOCK_SIZE)
+    output_offs = tl.arange(0, BLOCK_SIZE)
+
+    x1 = tl.load(
+        hidden_states_ptr + input_start + input1_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+    x3 = tl.load(
+        hidden_states_ptr + input_start + input3_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+
+    # gelu
+    # cast down before mul to better match training?
+    gelu_x1 = 0.5 * (1.0 + tl.erf(x1 * 0.7071067811865475)) * x1
+    out = x3 * gelu_x1.to(hidden_states_ptr.dtype.element_ty)
+
+    if quant_max is not None:
+        raise NotImplementedError()
+
+    tl.store(out_hidden_states_ptr + output_start + output_offs, out, mask=mask)
+
+
+def gelu_and_mul_triton(
+    hidden_states,
+    scales=None,
+    quantize=None,  # dtype to quantize to
+    out=None,
+):
+    bs, in_hidden_dim = hidden_states.shape
+    hidden_dim = in_hidden_dim // 2
+
+    if out is None:
+        out_hidden_states = torch.empty(
+            (bs, hidden_dim),
+            dtype=quantize or hidden_states.dtype,
+            device=hidden_states.device,
+        )
+    else:
+        assert out.shape == (bs, hidden_dim)
+        assert out.dtype == (quantize or hidden_states.dtype)
+        out_hidden_states = out
+    out_scales = None
+    static_scale = False
+    if quantize is not None:
+        if scales is None:
+            out_scales = torch.empty(
+                (bs,), dtype=torch.float32, device=hidden_states.device
+            )
+        else:
+            out_scales = scales
+            static_scale = True
+
+    max_warps = 16 if _is_hip else 32
+    config = {
+        # 8 ele per thread (not tuned)
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), max_warps), 4
+        ),
+    }
+
+    gelu_and_mul_kernel[(bs,)](
+        out_hidden_states,
+        out_scales,
+        hidden_states,
+        quant_max=torch.finfo(quantize).max if quantize is not None else None,
+        static_scale=static_scale,
+        hidden_dim=hidden_dim,
+        BLOCK_SIZE=triton.next_power_of_2(hidden_dim),
+        **config,
+    )
+
+    if quantize is not None:
+        return out_hidden_states, out_scales
+    else:
+        return out_hidden_states, None
+
+
+# silu on first half of vector
+@triton.jit
+def silu_and_mul_kernel(
+    out_hidden_states_ptr,  # (bs, hidden_dim)
+    out_scales_ptr,  # (bs,)
+    hidden_states_ptr,  # (bs, hidden_dim * 2)
+    quant_max: tl.constexpr,
+    static_scale: tl.constexpr,
+    hidden_dim: tl.constexpr,  # the output hidden_dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    input_start = pid * hidden_dim * 2
+    output_start = pid * hidden_dim
+
+    input1_offs = tl.arange(0, BLOCK_SIZE)
+    mask = tl.arange(0, BLOCK_SIZE) < hidden_dim  # shared for input1, input3, output
+    input3_offs = hidden_dim + tl.arange(0, BLOCK_SIZE)
+    output_offs = tl.arange(0, BLOCK_SIZE)
+
+    x1 = tl.load(
+        hidden_states_ptr + input_start + input1_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+    x3 = tl.load(
+        hidden_states_ptr + input_start + input3_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+
+    # silu
+    # cast down before mul to better match training?
+    silu_x1 = x1 * tl.sigmoid(x1)
+    out = x3 * silu_x1.to(hidden_states_ptr.dtype.element_ty)
+
+    if quant_max is not None:
+        raise NotImplementedError()
+
+    tl.store(out_hidden_states_ptr + output_start + output_offs, out, mask=mask)
+
+
+def silu_and_mul_triton(
+    hidden_states,
+    scales=None,
+    quantize=None,  # dtype to quantize to
+    out=None,
+):
+    bs, in_hidden_dim = hidden_states.shape
+    hidden_dim = in_hidden_dim // 2
+
+    if out is None:
+        out_hidden_states = torch.empty(
+            (bs, hidden_dim),
+            dtype=quantize or hidden_states.dtype,
+            device=hidden_states.device,
+        )
+    else:
+        assert out.shape == (bs, hidden_dim)
+        assert out.dtype == (quantize or hidden_states.dtype)
+        out_hidden_states = out
+    out_scales = None
+    static_scale = False
+    if quantize is not None:
+        if scales is None:
+            out_scales = torch.empty(
+                (bs,), dtype=torch.float32, device=hidden_states.device
+            )
+        else:
+            out_scales = scales
+            static_scale = True
+
+    max_warps = 16 if _is_hip else 32
+    config = {
+        # 8 ele per thread (not tuned)
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), max_warps), 4
+        ),
+    }
+
+    silu_and_mul_kernel[(bs,)](
+        out_hidden_states,
+        out_scales,
+        hidden_states,
+        quant_max=torch.finfo(quantize).max if quantize is not None else None,
+        static_scale=static_scale,
+        hidden_dim=hidden_dim,
+        BLOCK_SIZE=triton.next_power_of_2(hidden_dim),
+        **config,
+    )
+
+    if quantize is not None:
+        return out_hidden_states, out_scales
+    else:
+        return out_hidden_states, None
diff --git a/python/sglang/srt/layers/flashinfer_comm_fusion.py b/python/sglang/srt/layers/flashinfer_comm_fusion.py
new file mode 100644
index 000000000..81280db0a
--- /dev/null
+++ b/python/sglang/srt/layers/flashinfer_comm_fusion.py
@@ -0,0 +1,230 @@
+import logging
+from typing import Optional, Tuple
+
+import torch
+import torch.distributed as dist
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.utils import (
+    direct_register_custom_op,
+    is_flashinfer_available,
+    supports_custom_op,
+)
+
+logger = logging.getLogger(__name__)
+
+_flashinfer_comm = None
+_workspace_manager = None
+
+if is_flashinfer_available():
+    try:
+        import flashinfer.comm as comm
+
+        _flashinfer_comm = comm
+    except ImportError:
+        logger.warning(
+            "flashinfer.comm is not available, falling back to standard "
+            "implementation"
+        )
+
+
+class FlashInferWorkspaceManager:
+    def __init__(self):
+        self.workspace_tensor = None
+        self.ipc_handles = None
+        self.world_size = None
+        self.rank = None
+        self.initialized = False
+
+    def initialize(
+        self,
+        world_size: int,
+        rank: int,
+        max_token_num: int,
+        hidden_dim: int,
+        group=None,
+        use_fp32_lamport: bool = False,
+    ):
+        """Initialize workspace"""
+        if self.initialized and self.world_size == world_size:
+            return
+
+        if _flashinfer_comm is None:
+            logger.warning(
+                "FlashInfer comm not available, skipping workspace " "initialization"
+            )
+            return
+
+        self.cleanup()
+
+        self.ipc_handles, self.workspace_tensor = (
+            comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+                rank,
+                world_size,
+                max_token_num,
+                hidden_dim,
+                group=group,
+                use_fp32_lamport=use_fp32_lamport,
+            )
+        )
+
+        self.world_size = world_size
+        self.rank = rank
+        self.initialized = True
+
+        logger.info(
+            f"FlashInfer workspace initialized for rank {rank}, "
+            f"world_size {world_size}"
+        )
+
+    def cleanup(self):
+        """Clean up workspace"""
+        if self.initialized and self.ipc_handles is not None:
+            try:
+                _flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(
+                    self.ipc_handles, group=dist.group.WORLD
+                )
+            except Exception as e:
+                logger.warning(f"Failed to cleanup FlashInfer workspace: {e}")
+            finally:
+                self.workspace_tensor = None
+                self.ipc_handles = None
+                self.initialized = False
+
+
+_workspace_manager = FlashInferWorkspaceManager()
+
+
+def ensure_workspace_initialized(
+    max_token_num: int = 2048, hidden_dim: int = 4096, use_fp32_lamport: bool = False
+):
+    """Ensure workspace is initialized"""
+    if not is_flashinfer_available() or _flashinfer_comm is None:
+        return False
+
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size <= 1:
+        return False
+
+    rank = dist.get_rank()
+
+    if (
+        not _workspace_manager.initialized
+        or _workspace_manager.world_size != world_size
+    ):
+        _workspace_manager.initialize(
+            world_size=world_size,
+            rank=rank,
+            max_token_num=max_token_num,
+            hidden_dim=hidden_dim,
+            use_fp32_lamport=use_fp32_lamport,
+        )
+
+    return _workspace_manager.initialized
+
+
+def flashinfer_allreduce_residual_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    max_token_num: int = 2048,
+    use_oneshot: Optional[bool] = None,
+    trigger_completion_at_end: bool = False,
+    fp32_acc: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Use FlashInfer's fused allreduce + residual + RMS norm operation
+
+    Args:
+        input_tensor: Input tensor that needs allreduce
+        residual: Residual tensor
+        weight: RMS norm weight
+        eps: RMS norm epsilon
+        max_token_num: Maximum token number
+        use_oneshot: Whether to use oneshot mode
+        trigger_completion_at_end: Whether to trigger completion at end
+        fp32_acc: Whether to use fp32 precision
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: (norm_output, residual_output)
+    """
+    if not is_flashinfer_available() or _flashinfer_comm is None:
+        logger.debug(
+            "FlashInfer not available, falling back to standard " "implementation"
+        )
+        return None, None
+
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size <= 1:
+        logger.debug("Single GPU, no need for allreduce fusion")
+        return None, None
+
+    if not ensure_workspace_initialized(
+        max_token_num=max_token_num,
+        hidden_dim=input_tensor.shape[-1],
+        use_fp32_lamport=(input_tensor.dtype == torch.float32),
+    ):
+        logger.debug("FlashInfer workspace not available")
+        return None, None
+
+    token_num, hidden_dim = input_tensor.shape
+
+    residual_out = torch.empty_like(residual)
+    norm_out = torch.empty_like(input_tensor)
+
+    _flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=input_tensor,
+        world_size=world_size,
+        world_rank=dist.get_rank(),
+        token_num=token_num,
+        hidden_dim=hidden_dim,
+        workspace_ptrs=_workspace_manager.workspace_tensor,
+        launch_with_pdl=True,
+        use_oneshot=use_oneshot,
+        trigger_completion_at_end=trigger_completion_at_end,
+        fp32_acc=fp32_acc,
+        pattern_code=(_flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm),
+        allreduce_out=None,
+        residual_in=residual,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        quant_out=None,
+        scale_out=None,
+        rms_gamma=weight,
+        rms_eps=eps,
+        scale_factor=None,
+        layout_code=None,
+    )
+
+    return norm_out, residual_out
+
+
+def fake_flashinfer_allreduce_residual_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    max_token_num: int = 2048,
+    use_oneshot: Optional[bool] = None,
+    trigger_completion_at_end: bool = False,
+    fp32_acc: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    residual_out = torch.empty_like(residual)
+    norm_out = torch.empty_like(input_tensor)
+    return norm_out, residual_out
+
+
+if supports_custom_op():
+    direct_register_custom_op(
+        "flashinfer_allreduce_residual_rmsnorm",
+        flashinfer_allreduce_residual_rmsnorm,
+        mutates_args=["input_tensor", "residual", "weight"],
+        fake_impl=fake_flashinfer_allreduce_residual_rmsnorm,
+    )
+
+
+def cleanup_flashinfer_workspace():
+    global _workspace_manager
+    if _workspace_manager is not None:
+        _workspace_manager.cleanup()
diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
new file mode 100644
index 000000000..12cd49adf
--- /dev/null
+++ b/python/sglang/srt/layers/layernorm.py
@@ -0,0 +1,343 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fused operators for normalization layers."""
+
+import logging
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from packaging.version import Version
+
+from sglang.srt.custom_op import CustomOp
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+    is_npu,
+    is_xpu,
+    supports_custom_op,
+)
+
+_is_cuda = is_cuda()
+_is_flashinfer_available = is_flashinfer_available()
+_is_hip = is_hip()
+_is_npu = is_npu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_xpu = is_xpu()
+
+if _is_cuda:
+    if _is_flashinfer_available:
+        from flashinfer.norm import fused_add_rmsnorm
+    else:
+        from sgl_kernel import fused_add_rmsnorm
+    from sgl_kernel import gemma_fused_add_rmsnorm, gemma_rmsnorm, rmsnorm
+
+if _use_aiter:
+    from aiter import rmsnorm2d_fwd as rms_norm
+    from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
+elif _is_hip:
+    import vllm
+    from vllm._custom_ops import fused_add_rms_norm, rms_norm
+
+    _vllm_version = Version(vllm.__version__)
+
+logger = logging.getLogger(__name__)
+
+if _is_npu:
+    import torch_npu
+
+
+class RMSNorm(CustomOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        var_hidden_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+        self.hidden_size = hidden_size
+        self.variance_size_override = (
+            None if var_hidden_size == hidden_size else var_hidden_size
+        )
+        if _use_aiter:
+            self._forward_method = self.forward_aiter
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if self.variance_size_override is not None:
+            return self.forward_native(x, residual)
+        if residual is not None:
+            fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon)
+            return x, residual
+        out = rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
+
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            out, _, residual_out = torch_npu.npu_add_rms_norm(
+                residual, x, self.weight.data, self.variance_epsilon
+            )
+            return out, residual_out
+        return torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+
+    def forward_aiter(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            residual_out = torch.empty_like(x)
+            output = torch.empty_like(x)
+            fused_add_rms_norm(
+                output,
+                x,
+                residual,
+                residual_out,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return output, residual_out
+        return rms_norm(x, self.weight.data, self.variance_epsilon)
+
+    def forward_hip(
+    self,
+    x: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    ):
+        if not x.is_contiguous():
+            x = x.contiguous()
+
+        if residual is not None:
+            try:
+                output = torch.empty_like(x)
+                residual_out = torch.empty_like(x)
+                fused_add_rms_norm(
+                    output,
+                    x,
+                    residual_out,
+                    residual,
+                    self.weight.data,
+                    self.variance_epsilon,
+                )
+                return output, residual_out
+            except TypeError:
+                fused_add_rms_norm(
+                    x,
+                    residual,
+                    self.weight.data,
+                    self.variance_epsilon,
+                )
+                return x, residual
+
+        out = torch.empty_like(x)
+        rms_norm(out, x, self.weight.data, self.variance_epsilon)
+        return out
+
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if not x.is_contiguous():
+            x = x.contiguous()
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+
+        hidden_size = x.shape[-1]
+        if hidden_size != self.hidden_size:
+            raise ValueError(
+                "Expected hidden_size to be "
+                f"{self.hidden_size}, but found: {hidden_size}"
+            )
+
+        if self.variance_size_override is None:
+            x_var = x
+        else:
+            if hidden_size < self.variance_size_override:
+                raise ValueError(
+                    "Expected hidden_size to be at least "
+                    f"{self.variance_size_override}, but found: {hidden_size}"
+                )
+
+            x_var = x[..., : self.variance_size_override]
+
+        variance = x_var.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = (x * self.weight).to(orig_dtype)
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+    def forward_cpu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if _is_cpu_amx_available:
+            if residual is not None:
+                torch.ops.sgl_kernel.fused_add_rmsnorm_cpu(
+                    x, residual, self.weight.data, self.variance_epsilon
+                )
+                return x, residual
+            return torch.ops.sgl_kernel.rmsnorm_cpu(
+                x, self.weight.data, self.variance_epsilon
+            )
+        else:
+            return self.forward_native(x, residual)
+
+    def forward_with_allreduce_fusion(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Forward method with allreduce fusion, prioritizing flashinfer fused operations
+        """
+        if residual is not None:
+            from sglang.srt.distributed import get_tensor_model_parallel_world_size
+            from sglang.srt.layers.flashinfer_comm_fusion import (
+                flashinfer_allreduce_residual_rmsnorm,
+            )
+
+            fused_op = (
+                torch.ops.sglang.flashinfer_allreduce_residual_rmsnorm
+                if supports_custom_op()
+                else flashinfer_allreduce_residual_rmsnorm
+            )
+
+            if get_tensor_model_parallel_world_size() > 1:
+                fused_result = fused_op(
+                    input_tensor=x,
+                    residual=residual,
+                    weight=self.weight,
+                    eps=self.variance_epsilon,
+                )
+                if fused_result[0] is not None:
+                    return fused_result
+
+        return self.forward(x, residual)
+
+
+class GemmaRMSNorm(CustomOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        if residual is not None:
+            x = x + residual
+            residual = x
+
+        x = x.float()
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x * (1.0 + self.weight.float())
+        x = x.to(orig_dtype)
+        return x if residual is None else (x, residual)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            gemma_fused_add_rmsnorm(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
+            return x, residual
+        out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
+
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            x = x + residual
+            residual = x
+
+        x, _ = torch_npu.npu_gemma_rms_norm(x, self.weight, self.variance_epsilon)
+        return x if residual is None else (x, residual)
+
+
+class Gemma3RMSNorm(CustomOp):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+        # Re-dispatch
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward_native(self, x):
+        output = self._norm(x.float())
+        # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        output = output * (1.0 + self.weight.float())
+        return output.type_as(x)
+
+    def forward_cuda(self, x):
+        return self.forward_native(x)
+
+    def forward_npu(self, x):
+        output, _ = torch_npu.npu_gemma_rms_norm(x, self.weight, self.eps)
+        return output
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+
+if not (
+    _is_cuda or _is_hip or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_xpu
+):
+    logger.info(
+        "sgl-kernel layernorm implementation is not available on current platform. Fallback to other kernel libraries."
+    )
+    from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
diff --git a/python/sglang/srt/layers/linear.py b/python/sglang/srt/layers/linear.py
new file mode 100644
index 000000000..df2b77e08
--- /dev/null
+++ b/python/sglang/srt/layers/linear.py
@@ -0,0 +1,1337 @@
+"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/linear.py"""
+
+from __future__ import annotations
+
+import itertools
+import logging
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+import torch
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+from sglang.srt.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    parallel_state,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.layers.parameter import (
+    BasevLLMParameter,
+    BlockQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    PerTensorScaleParameter,
+    RowvLLMParameter,
+    _ColumnvLLMParameter,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import is_cpu, is_npu, set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import (
+        QuantizationConfig,
+        QuantizeMethodBase,
+    )
+
+logger = logging.getLogger(__name__)
+
+WEIGHT_LOADER_V2_SUPPORTED = [
+    "CompressedTensorsLinearMethod",
+    "AWQMarlinLinearMethod",
+    "AWQLinearMethod",
+    "GPTQMarlinLinearMethod",
+    "Fp8LinearMethod",
+    "BlockInt8LinearMethod",
+    "MarlinLinearMethod",
+    "QQQLinearMethod",
+    "GPTQMarlin24LinearMethod",
+    "TPUInt8LinearMethod",
+    "GPTQLinearMethod",
+    "FBGEMMFp8LinearMethod",
+    "ModelOptFp8LinearMethod",
+    "ModelOptFp4LinearMethod",
+    "IPEXAWQLinearMethod",
+    "PetitNvFp4LinearMethod",
+]
+
+_is_cpu = is_cpu()
+_is_npu = is_npu()
+
+
+def adjust_marlin_shard(param, shard_size, shard_offset):
+    marlin_tile_size = getattr(param, "marlin_tile_size", None)
+    if marlin_tile_size is None:
+        return shard_size, shard_offset
+
+    return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
+
+
+def adjust_bitsandbytes_4bit_shard(
+    param: Parameter, shard_offsets: Dict[str, Tuple[int, int]], loaded_shard_id: str
+) -> Tuple[int, int]:
+    """Adjust the quantization offsets and sizes for BitsAndBytes sharding."""
+
+    total, _ = shard_offsets["total"]
+    orig_offset, orig_size = shard_offsets[loaded_shard_id]
+
+    quantized_total = param.data.shape[0]
+    quantized_offset = orig_offset * quantized_total // total
+    quantized_size = orig_size * quantized_total // total
+
+    return quantized_size, quantized_offset
+
+
+def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
+    """For fused modules (QKV and MLP) we have an array of length
+    N that holds 1 scale for each "logical" matrix. So the param
+    is an array of length N. The loaded_weight corresponds to
+    one of the shards on disk. Here, we slice the param based on
+    the shard_id for loading.
+    """
+    qkv_idxs = {"q": 0, "k": 1, "v": 2}
+
+    if isinstance(shard_id, str):
+        shard_id = qkv_idxs[shard_id]
+    elif not isinstance(shard_id, int):
+        raise ValueError(f"Unknown Shard Id {shard_id}")
+
+    # AutoFP8 scales do not have a shape
+    # compressed-tensors scales do have a shape
+    if len(loaded_weight.shape) != 0:
+        assert loaded_weight.shape[0] == 1
+        loaded_weight = loaded_weight[0]
+
+    return param[shard_id], loaded_weight
+
+
+def adjust_shard_offsets(shard_offsets, loaded_weight, dim):
+    actual_weight_size = loaded_weight.size(dim)
+    target_weight_size = shard_offsets[-1][-1] + shard_offsets[-1][-2]
+    if actual_weight_size != target_weight_size:
+        new_shard_offsets = []
+        new_offset = 0
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            actual_shard_size = actual_weight_size * shard_size // target_weight_size
+            new_shard_offsets.append((shard_id, new_offset, actual_shard_size))
+            new_offset += actual_shard_size
+        return new_shard_offsets
+    return shard_offsets
+
+
+class LinearBase(torch.nn.Module):
+    """Base linear layer.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_size: output dimension of the linear layer.
+        bias: If true, add bias.
+        skip_bias_add: If true, skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.skip_bias_add = skip_bias_add
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        if quant_config is None:
+            self.quant_method: Optional[QuantizeMethodBase] = UnquantizedLinearMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+
+class ReplicatedLinear(LinearBase):
+    """Replicated linear layer.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_size: output dimension of the linear layer.
+        bias: If true, add bias.
+        skip_bias_add: If true, skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix=prefix,
+        )
+
+        # All the linear layer supports quant method.
+        assert self.quant_method is not None
+        self.quant_method.create_weights(
+            self,
+            self.input_size,
+            [self.output_size],
+            self.input_size,
+            self.output_size,
+            self.params_dtype,
+            weight_loader=self.weight_loader,
+        )
+
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size, dtype=self.params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        # If the weight on disk does not have a shape, give it one
+        # (such scales for AutoFp8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        # The per-tensor quant-scale must be 1 dimension
+        if _is_npu:
+            if param.size() != loaded_weight.size() and param.size(0) == 1:
+                if torch.allclose(loaded_weight, loaded_weight[0]):
+                    loaded_weight = loaded_weight[:1]
+                else:
+                    raise ValueError(f"{loaded_weight} are not all equal")
+
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        bias = self.bias if not self.skip_bias_add else None
+        assert self.quant_method is not None
+        output = self.quant_method.apply(self, x, bias)
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+    def extra_repr(self) -> str:
+        s = f"in_features={self.input_size}"
+        s += f", output_features={self.output_size}"
+        s += f", bias={self.bias is not None}"
+        return s
+
+
+class ColumnParallelLinear(LinearBase):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+
+    Args:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias.
+        gather_output: If true, call all-gather on output and make Y available
+                       to all GPUs, otherwise, every GPU will have its output
+                       which is Y_i = XA_i
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        output_sizes: list of output sizes packed into one output, like for QKV
+                       the list would be size 3.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        gather_output: bool = False,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        output_sizes: Optional[List[int]] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
+    ):
+        super().__init__(
+            input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
+        )
+
+        self.gather_output = gather_output
+        self.use_presharded_weights = use_presharded_weights
+
+        # Divide the weight matrix along the last dimension.
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        assert self.quant_method is not None
+        self.output_size_per_partition = divide(self.output_size, tp_size)
+        self.output_partition_sizes = [self.output_size_per_partition]
+        # If QKV or MergedColumn, use output size of each partition.
+        if hasattr(self, "output_sizes"):
+            self.output_partition_sizes = [
+                divide(output_size, tp_size) for output_size in self.output_sizes
+            ]
+
+        if output_sizes is None:
+            output_sizes = [output_size]
+
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size,
+            output_partition_sizes=self.output_partition_sizes,
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            weight_loader=(
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size_per_partition, dtype=params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        output_dim = getattr(param, "output_dim", None)
+
+        # Special case for GGUF
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.weight_type = loaded_weight.item()
+
+        # Materialize GGUF UninitializedParameter
+        if is_gguf_weight and isinstance(param, UninitializedParameter):
+            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
+
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
+        param_data = param.data
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if output_dim is not None and not use_bitsandbytes_4bit:
+            shard_size = param_data.shape[output_dim]
+            start_idx = self.tp_rank * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not self.use_presharded_weights,
+                )
+            else:
+                if not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor):
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            assert loaded_weight.numel() == 1
+            loaded_weight = loaded_weight.reshape(1)
+
+        if isinstance(param, _ColumnvLLMParameter):
+            param.load_column_parallel_weight(
+                loaded_weight,
+                tp_rank=self.tp_rank,
+                use_presharded_weights=self.use_presharded_weights,
+            )
+        else:
+            # FIXME: This branch is needed to load deepseek v3 awq.
+            # However, we should fix this and avoid the branching here.
+            param.load_column_parallel_weight(loaded_weight)
+
+    def forward(self, input_):
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        output_parallel = self.quant_method.apply(self, input_, bias)
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+    def extra_repr(self) -> str:
+        s = f"in_features={self.input_size}"
+        s += f", output_features={self.output_size_per_partition}"
+        s += f", bias={self.bias is not None}"
+        s += f", tp_size={self.tp_size}"
+        s += f", gather_output={self.gather_output}"
+        return s
+
+
+class MergedColumnParallelLinear(ColumnParallelLinear):
+    """Packed linear layers with column parallelism.
+
+    Similar to ColumnParallelLinear, but the weight matrix is concatenated
+    along the output dimension. When the weight matrix is loaded, the
+    different partitions are sharded separately.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_sizes: list of output dimensions of the linear layer.
+        bias: If true, add bias.
+        gather_output: If true, call all-gather on output and make the output
+                       available to all GPUs, otherwise, every GPU will have
+                       its own output.
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_sizes: List[int],
+        bias: bool = True,
+        gather_output: bool = False,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
+    ):
+        self.output_sizes = output_sizes
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        assert all(output_size % tp_size == 0 for output_size in output_sizes)
+        self.use_presharded_weights = use_presharded_weights
+        super().__init__(
+            input_size=input_size,
+            output_size=sum(output_sizes),
+            bias=bias,
+            gather_output=gather_output,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.prefix = prefix
+
+    def weight_loader(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[int] = None,
+    ):
+
+        # Special case for GGUF
+        # initialize GGUF param after we know the quantize type
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.data[loaded_shard_id].copy_(loaded_weight)
+            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            return
+
+        if is_gguf_weight:
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // self.tp_size
+            start_idx = self.tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            return
+
+        param_data = param.data
+        output_dim = getattr(param, "output_dim", None)
+        # Special case for AQLM codebooks.
+        is_metadata = getattr(param, "is_metadata", False)
+        # Special case for per-tensor scale to load scalar into fused array.
+        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
+
+        if loaded_shard_id is None:
+            # Loaded weight is already fused on disk (qkv/mlp).
+            if output_dim is None:
+                if needs_scalar_to_array:
+                    param_data, loaded_weight = adjust_scalar_to_fused_array(
+                        param_data, loaded_weight, 0
+                    )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            current_shard_offset = 0
+            shard_offsets: List[Tuple[int, int, int]] = []
+            for i, output_size in enumerate(self.output_sizes):
+                shard_offsets.append((i, current_shard_offset, output_size))
+                current_shard_offset += output_size
+            packed_dim = getattr(param, "packed_dim", None)
+
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if _is_cpu:
+                shard_offsets = adjust_shard_offsets(
+                    shard_offsets, loaded_weight, output_dim
+                )
+
+            for shard_id, shard_offset, shard_size in shard_offsets:
+                # Special case for Quantization.
+                # If quantized, we need to adjust the offset and size to account
+                # for the packing.
+                if packed_dim == output_dim:
+                    shard_size = shard_size // param.pack_factor
+                    shard_offset = shard_offset // param.pack_factor
+                    # Special case for Marlin.
+                    shard_size, shard_offset = adjust_marlin_shard(
+                        param, shard_size, shard_offset
+                    )
+
+                if use_bitsandbytes_4bit:
+                    index = list(itertools.accumulate([0] + self.output_sizes))
+                    orig_offsets = {
+                        str(i): (index[i], size)
+                        for i, size in enumerate(self.output_sizes)
+                    }
+                    orig_offsets["total"] = (self.output_size, 0)
+                    shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                        param, orig_offsets, str(shard_id)
+                    )
+
+                loaded_weight_shard = loaded_weight.narrow(
+                    output_dim, shard_offset, shard_size
+                )
+                self.weight_loader(param, loaded_weight_shard, shard_id)
+            return
+
+        assert loaded_shard_id < len(self.output_sizes)
+        if output_dim is not None:
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+            # Special case for quantization.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            packed_dim = getattr(param, "packed_dim", None)
+            if packed_dim == output_dim:
+                shard_size = shard_size // param.pack_factor
+                shard_offset = shard_offset // param.pack_factor
+                # Special case for Marlin.
+                shard_size, shard_offset = adjust_marlin_shard(
+                    param, shard_size, shard_offset
+                )
+
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
+                shard_size = loaded_weight.shape[output_dim]
+                shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id
+
+            param_data = param_data.narrow(output_dim, shard_offset, shard_size)
+            start_idx = self.tp_rank * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not use_bitsandbytes_4bit and not self.use_presharded_weights,
+                )
+            else:
+                # bitsandbytes loads the weights of the specific portion
+                # no need to narrow here
+                if not use_bitsandbytes_4bit and not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
+
+        # Special case for AQLM codebooks.
+        elif is_metadata:
+            # metadata indicates fixed size concatenated along dim 0
+            shard_size = loaded_weight.shape[0]
+            shard_offset = loaded_shard_id * shard_size
+            param_data = param_data.narrow(0, shard_offset, shard_size)
+
+        # Special case for per-tensor scales in fused case.
+        elif needs_scalar_to_array:
+            param_data, loaded_weight = adjust_scalar_to_fused_array(
+                param_data, loaded_weight, loaded_shard_id
+            )
+
+        else:
+            ignore_warning = getattr(param, "ignore_warning", False)
+            if not ignore_warning:
+                logger.warning(
+                    "Loading a weight without `output_dim` attribute in "
+                    "MergedColumnParallelLinear, assume the weight is "
+                    "the same for all partitions."
+                )
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def _load_fused_module_from_checkpoint(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
+        """
+        Handle special case for models where MLP layers are already
+        fused on disk. In this case, we have no shard id. This function
+        determmines the shard id by splitting these layers and then calls
+        the weight loader using the shard id.
+
+        An example of a model with these fused layers:
+        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
+        """
+
+        current_shard_offset = 0
+        shard_offsets: List[Tuple[int, int, int]] = []
+        for i, output_size in enumerate(self.output_sizes):
+            shard_offsets.append((i, current_shard_offset, output_size))
+            current_shard_offset += output_size
+
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            # Special case for Quantization.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            if (
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
+                and param.packed_dim == param.output_dim
+            ):
+                shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
+                    shard_size=shard_size, shard_offset=shard_offset
+                )
+
+            loaded_weight_shard = loaded_weight.narrow(
+                param.output_dim, shard_offset, shard_size
+            )
+            self.weight_loader_v2(param, loaded_weight_shard, shard_id)
+
+    def weight_loader_v2(
+        self,
+        param: BasevLLMParameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[int] = None,
+    ):
+        if loaded_shard_id is None:
+            if isinstance(param, PerTensorScaleParameter):
+                param.load_merged_column_weight(
+                    loaded_weight=loaded_weight,
+                    shard_id=0,
+                    tp_rank=self.tp_rank,
+                    tp_size=self.tp_size,
+                )
+                return
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                param.load_merged_column_weight(
+                    loaded_weight=loaded_weight,
+                    tp_rank=self.tp_rank,
+                    tp_size=self.tp_size,
+                )
+                return
+            # TODO: @dsikka - move to parameter.py
+            self._load_fused_module_from_checkpoint(param, loaded_weight)
+            return
+
+        assert loaded_shard_id < len(self.output_sizes)
+
+        if isinstance(param, BlockQuantScaleParameter):
+            weight_block_size = self.quant_method.quant_config.weight_block_size
+            block_n, _ = weight_block_size[0], weight_block_size[1]
+            shard_offset = (
+                (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
+            ) // self.tp_size
+            shard_size = (
+                (self.output_sizes[loaded_shard_id] + block_n - 1)
+                // block_n
+                // self.tp_size
+            )
+        else:
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+
+        param.load_merged_column_weight(
+            loaded_weight=loaded_weight,
+            shard_id=loaded_shard_id,
+            shard_offset=shard_offset,
+            shard_size=shard_size,
+            use_presharded_weights=self.use_presharded_weights,
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+        )
+
+
+class QKVParallelLinear(ColumnParallelLinear):
+    """Linear layers for the attention's QKV transformation.
+
+    Linear layers for the linear transformation of the query, key, and value
+    vectors in the attention layer. The weight matrix is concatenated along
+    the output dimension. The layer is parallelized along the head dimension.
+    When the number of key/value heads is smaller than the number of query
+    heads (e.g., multi-query/grouped-query attention), the key/value head may
+    be replicated while the query heads are partitioned.
+
+    Args:
+        hidden_size: input hidden state size of the transformer.
+        head_size: size of each attention head.
+        total_num_heads: total number of attention query heads.
+        total_num_kv_heads: total number of attention key/value heads. If
+                            None, assume total_num_kv_heads = total_num_heads.
+        bias: If true, add bias.
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        head_size: int,
+        total_num_heads: int,
+        total_num_kv_heads: Optional[int] = None,
+        bias: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        load_presharded_attn: bool = False,
+    ):
+        self.hidden_size = hidden_size
+        self.head_size = head_size
+        self.total_num_heads = total_num_heads
+        if total_num_kv_heads is None:
+            total_num_kv_heads = total_num_heads
+        self.total_num_kv_heads = total_num_kv_heads
+        # Divide the weight matrix along the last dimension.
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        self.num_heads = divide(self.total_num_heads, tp_size)
+        if tp_size >= self.total_num_kv_heads:
+            self.num_kv_heads = 1
+            self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads)
+        else:
+            self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
+            self.num_kv_head_replicas = 1
+        self.q_proj_shard_size = self.num_heads * self.head_size
+        self.kv_proj_shard_size = self.num_kv_heads * self.head_size
+        input_size = self.hidden_size
+        output_size = (
+            (self.num_heads + 2 * self.num_kv_heads) * tp_size * self.head_size
+        )
+        self.output_sizes = [
+            self.num_heads * self.head_size * tp_size,  # q_proj
+            self.num_kv_heads * self.head_size * tp_size,  # k_proj
+            self.num_kv_heads * self.head_size * tp_size,  # v_proj
+        ]
+        self.use_presharded_weights = load_presharded_attn
+
+        super().__init__(
+            input_size=input_size,
+            output_size=output_size,
+            bias=bias,
+            gather_output=False,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+            use_presharded_weights=self.use_presharded_weights,
+        )
+
+    def _get_shard_offset_mapping(self, loaded_shard_id: str):
+        shard_offset_mapping = {
+            "q": 0,
+            "k": self.num_heads * self.head_size,
+            "v": (self.num_heads + self.num_kv_heads) * self.head_size,
+            "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
+        }
+        return shard_offset_mapping.get(loaded_shard_id)
+
+    def _get_shard_size_mapping(self, loaded_shard_id: str):
+        shard_size_mapping = {
+            "q": self.num_heads * self.head_size,
+            "k": self.num_kv_heads * self.head_size,
+            "v": self.num_kv_heads * self.head_size,
+        }
+        return shard_size_mapping.get(loaded_shard_id)
+
+    def _load_fused_module_from_checkpoint(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
+        """
+        Handle special case for models where QKV layers are already
+        fused on disk. In this case, we have no shard id. This function
+        determmines the shard id by splitting these layers and then calls
+        the weight loader using the shard id.
+
+        An example of a model with these fused layers:
+        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
+        """
+        shard_offsets = [
+            # (shard_id, shard_offset, shard_size)
+            ("q", 0, self.total_num_heads * self.head_size),
+            (
+                "k",
+                self.total_num_heads * self.head_size,
+                self.total_num_kv_heads * self.head_size,
+            ),
+            (
+                "v",
+                (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
+                self.total_num_kv_heads * self.head_size,
+            ),
+        ]
+
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            # Special case for Quantization.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            if (
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
+                and param.packed_dim == param.output_dim
+            ):
+                shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
+                    shard_size=shard_size, shard_offset=shard_offset
+                )
+
+            if not self.use_presharded_weights:
+                loaded_weight_shard = loaded_weight.narrow(
+                    param.output_dim, shard_offset, shard_size
+                )
+            self.weight_loader_v2(param, loaded_weight_shard, shard_id)
+
+    def weight_loader_v2(
+        self,
+        param: BasevLLMParameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+        if loaded_shard_id is None:  # special case for certain models
+            if isinstance(param, PerTensorScaleParameter):
+                param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
+                return
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                param.load_qkv_weight(loaded_weight=loaded_weight)
+                return
+            # TODO: @dsikka - move to parameter.py
+            self._load_fused_module_from_checkpoint(param, loaded_weight)
+            return
+
+        assert loaded_shard_id in ["q", "k", "v"]
+
+        shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
+        shard_size = self._get_shard_size_mapping(loaded_shard_id)
+
+        if isinstance(param, BlockQuantScaleParameter):
+            weight_block_size = self.quant_method.quant_config.weight_block_size
+            block_n, _ = weight_block_size[0], weight_block_size[1]
+            shard_offset = (shard_offset + block_n - 1) // block_n
+            shard_size = (shard_size + block_n - 1) // block_n
+
+        param.load_qkv_weight(
+            loaded_weight=loaded_weight,
+            num_heads=self.num_kv_head_replicas,
+            shard_id=loaded_shard_id,
+            shard_offset=shard_offset,
+            shard_size=shard_size,
+            tp_rank=self.tp_rank,
+            use_presharded_weights=self.use_presharded_weights,
+        )
+
+    def weight_loader(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+
+        # Special case for GGUF
+        # initialize GGUF param after we know the quantize type
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type and loaded_shard_id is not None:
+            idx_map = {"q": 0, "k": 1, "v": 2}
+            param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
+            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            return
+
+        if is_gguf_weight:
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // self.tp_size
+            start_idx = self.tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            return
+
+        param_data = param.data
+        output_dim = getattr(param, "output_dim", None)
+        # Special case for AQLM codebooks.
+        is_metadata = getattr(param, "is_metadata", False)
+
+        # Special case for per-tensor scales in fused case.
+        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
+
+        if loaded_shard_id is None:
+            # Loaded weight is already fused on disk (qkv/mlp).
+            if output_dim is None:
+                if needs_scalar_to_array:
+                    param_data, loaded_weight = adjust_scalar_to_fused_array(
+                        param_data, loaded_weight, 0
+                    )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            shard_offsets = [
+                # (shard_id, shard_offset, shard_size)
+                ("q", 0, self.total_num_heads * self.head_size),
+                (
+                    "k",
+                    self.total_num_heads * self.head_size,
+                    self.total_num_kv_heads * self.head_size,
+                ),
+                (
+                    "v",
+                    (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
+                    self.total_num_kv_heads * self.head_size,
+                ),
+            ]
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
+            packed_dim = getattr(param, "packed_dim", None)
+            if _is_cpu:
+                shard_offsets = adjust_shard_offsets(
+                    shard_offsets, loaded_weight, output_dim
+                )
+
+            for shard_id, shard_offset, shard_size in shard_offsets:
+                # Special case for Quantized Weights.
+                # If quantized, we need to adjust the offset and size to account
+                # for the packing.
+                if packed_dim == output_dim:
+                    shard_size = shard_size // param.pack_factor
+                    shard_offset = shard_offset // param.pack_factor
+
+                    # Special case for Marlin.
+                    shard_size, shard_offset = adjust_marlin_shard(
+                        param, shard_size, shard_offset
+                    )
+
+                if use_bitsandbytes_4bit:
+                    orig_qkv_offsets = {
+                        "q": (0, self.total_num_heads * self.head_size),
+                        "k": (
+                            self.total_num_heads * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "v": (
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "total": (
+                            (self.total_num_heads + 2 * self.total_num_kv_heads)
+                            * self.head_size,
+                            0,
+                        ),
+                    }
+
+                    shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                        param, orig_qkv_offsets, shard_id
+                    )
+
+                if not self.use_presharded_weights:
+                    loaded_weight_shard = loaded_weight.narrow(
+                        output_dim, shard_offset, shard_size
+                    )
+                self.weight_loader(param, loaded_weight_shard, shard_id)
+            return
+
+        assert loaded_shard_id in ["q", "k", "v"]
+
+        # If output dim is defined, use the default loading process.
+        if output_dim is not None:
+            if loaded_shard_id == "q":
+                shard_offset = 0
+                shard_size = self.num_heads * self.head_size
+            elif loaded_shard_id == "k":
+                shard_offset = self.num_heads * self.head_size
+                shard_size = self.num_kv_heads * self.head_size
+            elif loaded_shard_id == "v":
+                shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size
+                shard_size = self.num_kv_heads * self.head_size
+            # Special case for Quantized Weights.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            packed_dim = getattr(param, "packed_dim", None)
+            if packed_dim == output_dim:
+                shard_size = shard_size // param.pack_factor
+                shard_offset = shard_offset // param.pack_factor
+
+                # Special case for Marlin.
+                shard_size, shard_offset = adjust_marlin_shard(
+                    param, shard_size, shard_offset
+                )
+
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
+                orig_qkv_offsets = {
+                    "q": (0, self.num_heads * self.head_size),
+                    "k": (
+                        self.num_heads * self.head_size,
+                        self.num_kv_heads * self.head_size,
+                    ),
+                    "v": (
+                        (self.num_heads + self.num_kv_heads) * self.head_size,
+                        self.num_kv_heads * self.head_size,
+                    ),
+                    "total": (
+                        (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
+                        0,
+                    ),
+                }
+                shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                    param, orig_qkv_offsets, loaded_shard_id
+                )
+
+            param_data = param_data.narrow(output_dim, shard_offset, shard_size)
+            if loaded_shard_id == "q":
+                shard_id = self.tp_rank
+            else:
+                shard_id = self.tp_rank // self.num_kv_head_replicas
+            start_idx = shard_id * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not use_bitsandbytes_4bit and not self.use_presharded_weights,
+                )
+            else:
+                # bitsandbytes loads the weights of the specific portion
+                # no need to narrow here
+                if not use_bitsandbytes_4bit and not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
+
+        # Special case for for AQLM codebooks.
+        elif is_metadata:
+            # metadata indicates fixed size concatenated along dim 0
+            shard_size = loaded_weight.shape[0]
+            shard_index = ["q", "k", "v"].index(loaded_shard_id)
+            param_data = param_data.narrow(0, shard_index * shard_size, shard_size)
+        # Special case for per-tensor scales in fused case.
+        elif needs_scalar_to_array:
+            param_data, loaded_weight = adjust_scalar_to_fused_array(
+                param_data, loaded_weight, loaded_shard_id
+            )
+        else:
+            ignore_warning = getattr(param, "ignore_warning", False)
+            if not ignore_warning:
+                logger.warning(
+                    "Loading a weight without `output_dim` attribute in "
+                    "QKVParallelLinear, assume the weight is the same "
+                    "for all partitions."
+                )
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+class RowParallelLinear(LinearBase):
+    """Linear layer with row parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its first dimension and X along its second dimension as:
+               -   -
+              | A_1 |
+              | .   |
+          A = | .   |        X = [X_1, ..., X_p]
+              | .   |
+              | A_p |
+               -   -
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias. Note that bias is not parallelized.
+        input_is_parallel: If true, we assume that the input is already
+                           split across the GPUs and we do not split
+                           again.
+        skip_bias_add: This was added to enable performance optimization where
+                       bias can be fused with other element-wise operations.
+                       We skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        input_is_parallel: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
+    ):
+        super().__init__(
+            input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
+        )
+
+        self.input_is_parallel = input_is_parallel
+        self.reduce_results = reduce_results
+
+        # Divide the weight matrix along the last dimension.
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        self.input_size_per_partition = divide(input_size, self.tp_size)
+        assert self.quant_method is not None
+        self.use_presharded_weights = use_presharded_weights
+
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size_per_partition,
+            output_partition_sizes=[self.output_size],
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            weight_loader=(
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
+
+        if bias:
+            self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype))
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        input_dim = getattr(param, "input_dim", None)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
+        # Special case for GGUF
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.weight_type = loaded_weight.item()
+
+        # Materialize GGUF UninitializedParameter
+        if is_gguf_weight and isinstance(param, UninitializedParameter):
+            weight_shape = list(loaded_weight.shape)
+            if input_dim:
+                weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size
+            param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
+
+        param_data = param.data
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if (
+            input_dim is not None
+            and not use_bitsandbytes_4bit
+            and not self.use_presharded_weights
+        ):
+            shard_size = param_data.shape[input_dim]
+            start_idx = self.tp_rank * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    input_dim,
+                    shard_size,
+                )
+            else:
+                loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor):
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            assert loaded_weight.numel() == 1
+            loaded_weight = loaded_weight.reshape(1)
+
+        if isinstance(param, RowvLLMParameter):
+            # This `BasevLLMParameter` is defined in sglang/srt/layers/parameter.py,
+            # It supports additional parameters like tp_rank and use_presharded_weights.
+            param.load_row_parallel_weight(
+                loaded_weight,
+                tp_rank=self.tp_rank,
+                use_presharded_weights=self.use_presharded_weights,
+            )
+        else:
+            # `params` is defined in `vllm/model_executor/parameter.py`,
+            # It does not support additional parameters.
+            param.load_row_parallel_weight(loaded_weight)
+
+    def forward(self, input_, skip_all_reduce=False):
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size
+            )
+            input_parallel = splitted_input[self.tp_rank].contiguous()
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in TP>1 case)
+        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+        with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
+            output_parallel = self.quant_method.apply(self, input_parallel, bias=bias_)
+            sm.tag(output_parallel)
+
+        if self.reduce_results and self.tp_size > 1 and not skip_all_reduce:
+            output = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output = output_parallel
+
+        output_bias = self.bias if self.skip_bias_add else None
+
+        return output, output_bias
+
+    def extra_repr(self) -> str:
+        s = f"input_features={self.input_size_per_partition}"
+        s += f", output_features={self.output_size}"
+        s += f", bias={self.bias is not None}"
+        s += f", tp_size={self.tp_size}"
+        s += f", reduce_results={self.reduce_results}"
+        return s
diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py
new file mode 100644
index 000000000..f6603907a
--- /dev/null
+++ b/python/sglang/srt/layers/logits_processor.py
@@ -0,0 +1,652 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logits processing."""
+
+import dataclasses
+import logging
+from typing import List, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    attn_tp_all_gather,
+    attn_tp_all_gather_into_tensor,
+    dp_gather_replicate,
+    dp_scatter,
+    get_attention_dp_rank,
+    get_attention_dp_size,
+    get_attention_tp_size,
+    get_global_dp_buffer,
+    get_local_attention_dp_size,
+    set_dp_buffer_len,
+)
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.utils import dump_to_file, is_npu, use_intel_amx_backend
+
+logger = logging.getLogger(__name__)
+
+_is_npu = is_npu()
+
+
+@dataclasses.dataclass
+class LogitsProcessorOutput:
+    ## Part 1: This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
+    # The logits of the next tokens.       shape: [#seq, vocab_size]
+    next_token_logits: torch.Tensor
+    # Used by speculative decoding (EAGLE)
+    # The last hidden layers
+    hidden_states: Optional[torch.Tensor] = None
+
+    ## Part 2: This part will be assigned in python/sglang/srt/layers/sampler.py::Sampler
+    # he log probs of output tokens, if RETURN_ORIGINAL_LOGPROB = True, will get the log probs before applying temperature. If False, will get the log probs before applying temperature.
+    next_token_logprobs: Optional[torch.Tensor] = None
+    # The logprobs and ids of the top-k tokens in output positions. shape: [#seq, k]
+    next_token_top_logprobs_val: Optional[List] = None
+    next_token_top_logprobs_idx: Optional[List] = None
+    # The logprobs and ids of the requested token ids in output positions. shape: [#seq, n] (n is the number of requested token ids)
+    next_token_token_ids_logprobs_val: Optional[List] = None
+    next_token_token_ids_logprobs_idx: Optional[List] = None
+
+    ## Part 3: Prefill-only. This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
+    # The logprobs of input tokens.        shape: [#token]
+    input_token_logprobs: Optional[torch.Tensor] = None
+    # The logprobs and ids of the top-k tokens in input positions.  shape: [#seq, #token, k]
+    input_top_logprobs_val: List = None
+    input_top_logprobs_idx: List = None
+    # The logprobs and ids of the requested token ids in input positions. shape: [#seq, n] (n is the number of requested token ids)
+    input_token_ids_logprobs_val: Optional[List] = None
+    input_token_ids_logprobs_idx: Optional[List] = None
+
+
+@dataclasses.dataclass
+class LogitsMetadata:
+    forward_mode: ForwardMode
+    capture_hidden_mode: CaptureHiddenMode = CaptureHiddenMode.NULL
+    next_token_logits_buffer: Optional[torch.Tensor] = None
+
+    extend_return_logprob: bool = False
+    extend_return_top_logprob: bool = False
+    extend_token_ids_logprob: bool = False
+    extend_seq_lens: Optional[torch.Tensor] = None
+    extend_seq_lens_cpu: Optional[List[int]] = None
+    extend_logprob_start_lens_cpu: Optional[List[int]] = None
+    extend_logprob_pruned_lens_cpu: Optional[List[int]] = None
+    top_logprobs_nums: Optional[List[int]] = None
+    extend_input_logprob_token_ids_gpu: Optional[torch.Tensor] = None
+    token_ids_logprobs: Optional[List[List[int]]] = None
+
+    # logits and logprobs post processing
+    temp_scaled_logprobs: bool = False
+    temperature: torch.Tensor = None
+    top_p_normalized_logprobs: bool = False
+    top_p: torch.Tensor = None
+
+    # DP attention metadata. Not needed when DP attention is not used.
+    # Number of tokens in the request.
+    global_num_tokens_gpu: Optional[torch.Tensor] = None
+    # The start position of local hidden states.
+    dp_local_start_pos: Optional[torch.Tensor] = None
+    dp_local_num_tokens: Optional[torch.Tensor] = None
+    global_dp_buffer_len: Optional[int] = None
+    # Number of tokens to sample per DP rank
+    global_num_tokens_for_logprob_cpu: Optional[torch.Tensor] = None
+    global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
+    # The gather mode for DP attention
+    dp_padding_mode: Optional[DpPaddingMode] = None
+    # for padding
+    padded_static_len: int = -1
+
+    @classmethod
+    def from_forward_batch(cls, forward_batch: ForwardBatch):
+        if (
+            forward_batch.forward_mode.is_extend()
+            and forward_batch.return_logprob
+            and not forward_batch.forward_mode.is_target_verify()
+        ):
+            extend_return_top_logprob = any(
+                x > 0 for x in forward_batch.top_logprobs_nums
+            )
+            extend_token_ids_logprob = any(
+                x is not None for x in forward_batch.token_ids_logprobs
+            )
+            extend_return_logprob = False
+            extend_logprob_pruned_lens_cpu = []
+            for extend_len, start_len in zip(
+                forward_batch.extend_seq_lens_cpu,
+                forward_batch.extend_logprob_start_lens_cpu,
+            ):
+                if extend_len - start_len > 0:
+                    extend_return_logprob = True
+                extend_logprob_pruned_lens_cpu.append(extend_len - start_len)
+        else:
+            extend_return_logprob = extend_return_top_logprob = (
+                extend_token_ids_logprob
+            ) = extend_logprob_pruned_lens_cpu = False
+
+        return cls(
+            forward_mode=forward_batch.forward_mode,
+            capture_hidden_mode=forward_batch.capture_hidden_mode,
+            next_token_logits_buffer=forward_batch.next_token_logits_buffer,
+            extend_return_logprob=extend_return_logprob,
+            extend_return_top_logprob=extend_return_top_logprob,
+            extend_token_ids_logprob=extend_token_ids_logprob,
+            extend_seq_lens=forward_batch.extend_seq_lens,
+            extend_seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+            extend_logprob_start_lens_cpu=forward_batch.extend_logprob_start_lens_cpu,
+            extend_logprob_pruned_lens_cpu=extend_logprob_pruned_lens_cpu,
+            top_logprobs_nums=forward_batch.top_logprobs_nums,
+            token_ids_logprobs=forward_batch.token_ids_logprobs,
+            extend_input_logprob_token_ids_gpu=forward_batch.extend_input_logprob_token_ids_gpu,
+            padded_static_len=forward_batch.padded_static_len,
+            global_num_tokens_gpu=forward_batch.global_num_tokens_gpu,
+            dp_local_start_pos=forward_batch.dp_local_start_pos,
+            dp_local_num_tokens=forward_batch.dp_local_num_tokens,
+            global_dp_buffer_len=forward_batch.global_dp_buffer_len,
+            global_num_tokens_for_logprob_cpu=forward_batch.global_num_tokens_for_logprob_cpu,
+            global_num_tokens_for_logprob_gpu=forward_batch.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=DpPaddingMode.SUM_LEN,
+        )
+
+    def compute_dp_attention_metadata(self):
+
+        cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
+        dp_rank = get_attention_dp_rank()
+        if dp_rank == 0:
+            dp_local_start_pos = torch.zeros_like(
+                self.global_num_tokens_for_logprob_gpu[0]
+            )
+        else:
+            dp_local_start_pos = cumtokens[dp_rank - 1]
+        dp_local_num_tokens = self.global_num_tokens_for_logprob_gpu[dp_rank]
+
+        self.dp_local_start_pos = dp_local_start_pos
+        self.dp_local_num_tokens = dp_local_num_tokens
+
+        if self.global_num_tokens_for_logprob_cpu is not None:
+            # create a smaller buffer to reduce peak memory usage
+            self.global_dp_buffer_len = sum(self.global_num_tokens_for_logprob_cpu)
+        else:
+            self.global_dp_buffer_len = self.global_dp_buffer_len
+
+        set_dp_buffer_len(
+            self.global_dp_buffer_len,
+            self.dp_local_num_tokens,
+            self.global_num_tokens_for_logprob_cpu,
+        )
+
+
+class LogitsProcessor(nn.Module):
+    def __init__(
+        self, config, skip_all_gather: bool = False, logit_scale: Optional[float] = None
+    ):
+        super().__init__()
+        self.config = config
+        self.logit_scale = logit_scale
+        self.use_attn_tp_group = global_server_args_dict["enable_dp_lm_head"]
+        if self.use_attn_tp_group:
+            self.attn_tp_size = get_attention_tp_size()
+            self.do_tensor_parallel_all_gather = (
+                not skip_all_gather and self.attn_tp_size > 1
+            )
+            self.do_tensor_parallel_all_gather_dp_attn = False
+        else:
+            self.do_tensor_parallel_all_gather = (
+                not skip_all_gather and get_tensor_model_parallel_world_size() > 1
+            )
+            self.do_tensor_parallel_all_gather_dp_attn = (
+                self.do_tensor_parallel_all_gather and get_attention_dp_size() != 1
+            )
+        self.final_logit_softcapping = getattr(
+            self.config, "final_logit_softcapping", None
+        )
+        if (
+            self.final_logit_softcapping is not None
+            and self.final_logit_softcapping < 0
+        ):
+            self.final_logit_softcapping = None
+
+        self.debug_tensor_dump_output_folder = global_server_args_dict.get(
+            "debug_tensor_dump_output_folder", None
+        )
+
+    def forward(
+        self,
+        input_ids,
+        hidden_states,
+        lm_head: VocabParallelEmbedding,
+        logits_metadata: Union[LogitsMetadata, ForwardBatch],
+        aux_hidden_states: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorOutput:
+        if isinstance(logits_metadata, ForwardBatch):
+            logits_metadata = LogitsMetadata.from_forward_batch(logits_metadata)
+        # Get the last hidden states and last logits for the next token prediction
+        if (
+            logits_metadata.forward_mode.is_decode_or_idle()
+            or logits_metadata.forward_mode.is_target_verify()
+        ):
+            pruned_states = hidden_states
+            if aux_hidden_states is not None:
+                aux_pruned_states = [hidden for hidden in aux_hidden_states]
+            sample_indices = None
+            input_logprob_indices = None
+        elif (
+            logits_metadata.forward_mode.is_extend()
+            and not logits_metadata.extend_return_logprob
+        ):
+            # Prefill without input logprobs.
+            if logits_metadata.padded_static_len < 0:
+                last_index = torch.cumsum(logits_metadata.extend_seq_lens, dim=0) - 1
+            else:
+                # If padding_static length is 5 and extended_seq_lens is [2, 3],
+                # then our batch looks like [t00, t01, p, p, p, t10, t11, t12, p, p]
+                # and this retrieves t01 and t12, which are the valid last tokens
+                idx = torch.arange(
+                    len(logits_metadata.extend_seq_lens),
+                    device=logits_metadata.extend_seq_lens.device,
+                )
+                last_index = (
+                    idx * logits_metadata.padded_static_len
+                    + logits_metadata.extend_seq_lens
+                    - 1
+                )
+            pruned_states = hidden_states[last_index]
+            if aux_hidden_states is not None:
+                aux_pruned_states = [hidden[last_index] for hidden in aux_hidden_states]
+            sample_indices = None
+            input_logprob_indices = None
+        else:
+            # Input logprobs are required.
+            # Find 3 different indices.
+            # 1. pruned_states: hidden states that we want logprobs from.
+            # 2. sample_indices: Indices that have sampled tokens.
+            # 3. input_logprob_indices: Indices that have input logprob tokens.
+            sample_index_pt = -1
+            sample_indices = []
+            input_logprob_indices_pt = 0
+            input_logprob_indices = []
+            pt, pruned_states = 0, []
+            for extend_logprob_start_len, extend_len in zip(
+                logits_metadata.extend_logprob_start_lens_cpu,
+                logits_metadata.extend_seq_lens_cpu,
+            ):
+                # It can happen in chunked prefill. We still need to sample 1 token,
+                # But we don't want to include it in input logprob.
+                if extend_len == extend_logprob_start_len:
+                    start_len = extend_logprob_start_len - 1
+                else:
+                    start_len = extend_logprob_start_len
+
+                # We always need at least 1 token to sample because that's required
+                # by a caller.
+                assert extend_len > start_len
+                pruned_states.append(hidden_states[pt + start_len : pt + extend_len])
+                pt += extend_len
+                sample_index_pt += extend_len - start_len
+                sample_indices.append(sample_index_pt)
+                input_logprob_indices.extend(
+                    [
+                        input_logprob_indices_pt + i
+                        for i in range(extend_len - extend_logprob_start_len)
+                    ]
+                )
+                input_logprob_indices_pt += extend_len - start_len
+
+            pruned_states = torch.cat(pruned_states)
+            sample_indices = torch.tensor(
+                sample_indices, device=pruned_states.device, dtype=torch.int64
+            )
+            input_logprob_indices = torch.tensor(
+                input_logprob_indices, device=pruned_states.device, dtype=torch.int64
+            )
+
+        # Compute logits for both input and sampled tokens.
+        logits = self._get_logits(pruned_states, lm_head, logits_metadata)
+        sampled_logits = (
+            logits[sample_indices] if sample_indices is not None else logits
+        )
+
+        if self.debug_tensor_dump_output_folder:
+            assert (
+                not self.do_tensor_parallel_all_gather
+                or get_local_attention_dp_size() == 1
+            ), "dp attention + sharded lm_head doesn't support full logits"
+            full_logits = self._get_logits(hidden_states, lm_head, logits_metadata)
+            dump_to_file(self.debug_tensor_dump_output_folder, "logits", full_logits)
+
+        hidden_states_to_store: Optional[torch.Tensor] = None
+        if logits_metadata.capture_hidden_mode.need_capture():
+            if logits_metadata.capture_hidden_mode.is_full():
+                if aux_hidden_states is not None:
+                    aux_hidden_states = torch.cat(aux_hidden_states, dim=-1)
+                    hidden_states_to_store = aux_hidden_states
+                else:
+                    hidden_states_to_store = hidden_states
+            elif logits_metadata.capture_hidden_mode.is_last():
+                # Get the last token hidden states. If sample_indices is None,
+                # pruned states only contain the last tokens already.
+                if aux_hidden_states is not None:
+                    aux_pruned_states = torch.cat(aux_pruned_states, dim=-1)
+                    hidden_states_to_store = (
+                        aux_pruned_states[sample_indices]
+                        if sample_indices is not None
+                        else aux_pruned_states
+                    )
+                else:
+                    hidden_states_to_store = (
+                        pruned_states[sample_indices]
+                        if sample_indices is not None
+                        else pruned_states
+                    )
+            else:
+                assert False, "Should never reach"
+
+        if not logits_metadata.extend_return_logprob:
+            # Decode mode or extend mode without return_logprob.
+            return LogitsProcessorOutput(
+                next_token_logits=sampled_logits,
+                hidden_states=hidden_states_to_store,
+            )
+        else:
+            input_logprobs = logits[input_logprob_indices]
+            del hidden_states, logits
+
+            # Normalize the logprob w/o temperature, top-p
+            pruned_lens = torch.tensor(
+                logits_metadata.extend_logprob_pruned_lens_cpu,
+                device=input_logprobs.device,
+            )
+            if logits_metadata.temp_scaled_logprobs:
+                logits_metadata.temperature = torch.repeat_interleave(
+                    logits_metadata.temperature.view(-1),
+                    pruned_lens,
+                ).view(-1, 1)
+            if logits_metadata.top_p_normalized_logprobs:
+                logits_metadata.top_p = torch.repeat_interleave(
+                    logits_metadata.top_p,
+                    pruned_lens,
+                )
+            input_logprobs = self.compute_temp_top_p_normalized_logprobs(
+                input_logprobs, logits_metadata
+            )
+
+            # Get the logprob of top-k tokens
+            if logits_metadata.extend_return_top_logprob:
+                (
+                    input_top_logprobs_val,
+                    input_top_logprobs_idx,
+                ) = self.get_top_logprobs(input_logprobs, logits_metadata)
+            else:
+                input_top_logprobs_val = input_top_logprobs_idx = None
+
+            # Get the logprob of given token id
+            if logits_metadata.extend_token_ids_logprob:
+                (
+                    input_token_ids_logprobs_val,
+                    input_token_ids_logprobs_idx,
+                ) = self.get_token_ids_logprobs(input_logprobs, logits_metadata)
+            else:
+                input_token_ids_logprobs_val = input_token_ids_logprobs_idx = None
+
+            input_token_logprobs = input_logprobs[
+                torch.arange(input_logprobs.shape[0], device=input_logprobs.device),
+                logits_metadata.extend_input_logprob_token_ids_gpu,
+            ]
+
+            return LogitsProcessorOutput(
+                next_token_logits=sampled_logits,
+                input_token_logprobs=input_token_logprobs,
+                input_top_logprobs_val=input_top_logprobs_val,
+                input_top_logprobs_idx=input_top_logprobs_idx,
+                hidden_states=hidden_states_to_store,
+                input_token_ids_logprobs_val=input_token_ids_logprobs_val,
+                input_token_ids_logprobs_idx=input_token_ids_logprobs_idx,
+            )
+
+    def _get_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        logits_metadata: LogitsMetadata,
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Get logits from hidden_states.
+
+        If sampled_logits_only is True, it means hidden_states only contain the
+        last position (e.g., extend without input logprobs). The caller should
+        guarantee the given hidden_states follow this constraint.
+        """
+        if self.do_tensor_parallel_all_gather_dp_attn:
+            logits_metadata.compute_dp_attention_metadata()
+            hidden_states, local_hidden_states = (
+                get_global_dp_buffer(),
+                hidden_states,
+            )
+            dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
+
+        if hasattr(lm_head, "weight"):
+            if use_intel_amx_backend(lm_head):
+                logits = torch.ops.sgl_kernel.weight_packed_linear(
+                    hidden_states.to(lm_head.weight.dtype),
+                    lm_head.weight,
+                    None,  # bias
+                    True,  # is_vnni
+                )
+            else:
+                logits = torch.matmul(
+                    hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
+                )
+        else:
+            # GGUF models
+            # TODO: use weight_packed_linear for GGUF models
+            logits = lm_head.quant_method.apply(lm_head, hidden_states, embedding_bias)
+
+        if self.logit_scale is not None:
+            logits.mul_(self.logit_scale)
+
+        if self.do_tensor_parallel_all_gather:
+            if self.use_attn_tp_group:
+                if self.config.vocab_size % self.attn_tp_size == 0:
+                    global_logits = torch.empty(
+                        (
+                            self.attn_tp_size,
+                            logits.shape[0],
+                            self.config.vocab_size // self.attn_tp_size,
+                        ),
+                        device=logits.device,
+                        dtype=logits.dtype,
+                    )
+                    attn_tp_all_gather_into_tensor(global_logits, logits)
+                    global_logits = global_logits.permute(1, 0, 2).reshape(
+                        logits.shape[0], self.config.vocab_size
+                    )
+                else:
+                    global_logits = torch.empty(
+                        (self.config.vocab_size, logits.shape[0]),
+                        device=logits.device,
+                        dtype=logits.dtype,
+                    )
+                    global_logits = global_logits.T
+                    attn_tp_all_gather(
+                        list(global_logits.tensor_split(self.attn_tp_size, dim=-1)),
+                        logits,
+                    )
+                logits = global_logits
+            else:
+                logits = tensor_model_parallel_all_gather(logits)
+
+        if self.do_tensor_parallel_all_gather_dp_attn:
+            logits, global_logits = (
+                torch.empty(
+                    (local_hidden_states.shape[0], logits.shape[1]),
+                    device=logits.device,
+                    dtype=logits.dtype,
+                ),
+                logits,
+            )
+            dp_scatter(logits, global_logits, logits_metadata)
+
+        if logits_metadata.next_token_logits_buffer is not None:
+            logits_buffer = logits_metadata.next_token_logits_buffer
+            assert logits_buffer.dtype == torch.float
+            logits_buffer.copy_(logits[:, : self.config.vocab_size])
+            logits = logits_buffer
+        else:
+            logits = logits[:, : self.config.vocab_size].float()
+
+        if self.final_logit_softcapping:
+            if not _is_npu:
+                fused_softcap(logits, self.final_logit_softcapping)
+            else:
+                logits = self.final_logit_softcapping * torch.tanh(
+                    logits / self.final_logit_softcapping
+                )
+
+        return logits
+
+    @staticmethod
+    def get_top_logprobs(all_logprobs: torch.Tensor, logits_metadata: LogitsMetadata):
+        max_k = max(logits_metadata.top_logprobs_nums)
+        ret = all_logprobs.topk(max_k, dim=1)
+        values = ret.values.tolist()
+        indices = ret.indices.tolist()
+
+        input_top_logprobs_val, input_top_logprobs_idx = [], []
+
+        pt = 0
+        for k, pruned_len in zip(
+            logits_metadata.top_logprobs_nums,
+            logits_metadata.extend_logprob_pruned_lens_cpu,
+        ):
+            if pruned_len <= 0:
+                input_top_logprobs_val.append([])
+                input_top_logprobs_idx.append([])
+                continue
+
+            input_top_logprobs_val.append(
+                [values[pt + j][:k] for j in range(pruned_len)]
+            )
+            input_top_logprobs_idx.append(
+                [indices[pt + j][:k] for j in range(pruned_len)]
+            )
+            pt += pruned_len
+
+        return input_top_logprobs_val, input_top_logprobs_idx
+
+    @staticmethod
+    def get_token_ids_logprobs(
+        all_logprobs: torch.Tensor, logits_metadata: LogitsMetadata
+    ):
+        input_token_ids_logprobs_val, input_token_ids_logprobs_idx = [], []
+        pt = 0
+        for token_ids, pruned_len in zip(
+            logits_metadata.token_ids_logprobs,
+            logits_metadata.extend_logprob_pruned_lens_cpu,
+        ):
+            if pruned_len <= 0:
+                input_token_ids_logprobs_val.append([])
+                input_token_ids_logprobs_idx.append([])
+                continue
+
+            input_token_ids_logprobs_val.append(
+                [all_logprobs[pt + j, token_ids].tolist() for j in range(pruned_len)]
+            )
+            input_token_ids_logprobs_idx.append([token_ids for _ in range(pruned_len)])
+            pt += pruned_len
+
+        return input_token_ids_logprobs_val, input_token_ids_logprobs_idx
+
+    @staticmethod
+    def compute_temp_top_p_normalized_logprobs(
+        last_logits: torch.Tensor, logits_metadata: LogitsMetadata
+    ) -> torch.Tensor:
+        """
+        compute logprobs for the output token from the given logits.
+
+        Returns:
+            torch.Tensor: logprobs from logits
+        """
+        # Scale logits if temperature scaling is enabled
+        if logits_metadata.temp_scaled_logprobs:
+            last_logits = last_logits / logits_metadata.temperature
+
+        # Normalize logprobs if top_p normalization is enabled
+        # NOTE: only normalize logprobs when top_p is set and not equal to 1.0
+        if (
+            logits_metadata.top_p_normalized_logprobs
+            and (logits_metadata.top_p != 1.0).any()
+        ):
+            from sglang.srt.layers.sampler import top_p_normalize_probs_torch
+
+            probs = torch.softmax(last_logits, dim=-1)
+            del last_logits
+            probs = top_p_normalize_probs_torch(probs, logits_metadata.top_p)
+            return torch.log(probs)
+        else:
+            return torch.nn.functional.log_softmax(last_logits, dim=-1)
+
+
+@triton.jit
+def fused_softcap_kernel(
+    full_logits_ptr,
+    softcapping_value,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0).to(tl.int64)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    # Load values
+    x = tl.load(full_logits_ptr + offsets, mask=mask)
+
+    # Perform operations in-place
+    x = x / softcapping_value
+
+    # Manual tanh implementation using exp
+    exp2x = tl.exp(2 * x)
+    x = (exp2x - 1) / (exp2x + 1)
+
+    x = x * softcapping_value
+
+    # Store result
+    tl.store(full_logits_ptr + offsets, x, mask=mask)
+
+
+def fused_softcap(full_logits, final_logit_softcapping):
+    n_elements = full_logits.numel()
+    BLOCK_SIZE = 1024
+    grid = ((n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE, 1, 1)
+
+    fused_softcap_kernel[grid](
+        full_logits_ptr=full_logits,
+        softcapping_value=final_logit_softcapping,
+        n_elements=n_elements,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return full_logits
diff --git a/python/sglang/srt/layers/model_parallel.py b/python/sglang/srt/layers/model_parallel.py
new file mode 100644
index 000000000..53c8b622e
--- /dev/null
+++ b/python/sglang/srt/layers/model_parallel.py
@@ -0,0 +1,155 @@
+"""
+Common utilities for torch model parallelism.
+"""
+
+from typing import Optional, Sequence
+
+import torch
+import torch.nn as nn
+from torch.distributed.device_mesh import DeviceMesh
+
+try:
+    import torch.distributed.tensor as dt
+except ImportError:
+    # torch 2.4 or older
+    import torch.distributed._tensor as dt
+
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    RowwiseParallel,
+    parallelize_module,
+)
+
+
+def _shard_tensor(
+    full_tensor: torch.Tensor,
+    device_mesh: DeviceMesh,
+    placements: Sequence[dt.Shard],
+) -> "dt.DTensor":
+    """
+    Locally shards a full tensor based on indicated sharding arrangement, and
+    returns a DTensor containing the local shard.
+
+    .. warning:: This is a private API that is subject to change. It skips the
+        communication otherwise required by `distribute_tensor`. It is only
+        applicable to cases where all ranks have the same `full_tensor`. For
+        example, in distributed inference all ranks load from the same
+        checkpoint. This API will not check for data equality between ranks, it
+        is thus user's responsibility to ensure the `full_tensor` is the same
+        across ranks.
+
+    Args:
+        full_tensor (torch.Tensor): the full tensor to be sharded.
+        device_mesh (:class:`DeviceMesh`): DeviceMesh to place the
+            DTensor.  Must have same dimension as the number of placements.
+        placements (Sequence[:class:`Shard`]): the placements that
+            describes how to place the local tensor on DeviceMesh.
+
+    Returns:
+        A :class:`DTensor` object with the shard as its local tensor.
+
+    Examples:
+        >>> # xdoctest: +SKIP("need world_size and rank")
+        >>> device_mesh = dist.init_device_mesh("cuda", (world_size,))
+        >>> full_tensor = torch.arange(world_size, device=f"cuda:{rank}")
+        >>> dtensor = _shard_tensor(full_tensor, device_mesh, [Shard(1)])
+    """
+    shape, offset = dt._utils.compute_local_shape_and_global_offset(
+        full_tensor.shape, device_mesh, placements
+    )
+    slices = [
+        slice(cur_offset, cur_offset + cur_shape)
+        for cur_shape, cur_offset in zip(shape, offset)
+    ]
+    local_tensor = full_tensor[slices]
+    return dt.DTensor.from_local(local_tensor, device_mesh, placements)
+
+
+class ColwiseParallelSharded(ColwiseParallel):
+    """
+    A version of ColwiseParallel where the local weight has been already
+    sharded.  This is used for the fused wqkv case, where during loading, we
+    already sharded wq, wk, wv before fusing them.
+    """
+
+    # Override the _partition_linear_fn in ColwiseParallel
+    def _partition_linear_fn(self, name, module, device_mesh):
+        # colwise shard weight/bias to Shard(0), weight be Shard(0)
+        # means Colwise as Linear is input * weight^T + bias, where
+        # weight would become Shard(1)
+        for name, param in module.named_parameters():
+            dtensor = dt.DTensor.from_local(param, device_mesh, [dt.Shard(0)])
+            dist_param = torch.nn.Parameter(dtensor, requires_grad=False)
+            module.register_parameter(name, dist_param)
+
+
+class RowwiseParallelMaybeWait(RowwiseParallel):
+    """
+    A version of RowwiseParallel that waits for the output (establish dependency
+    between comm stream and compute stream in CUDA sense) before going into the
+    next op. This is needed to workaround the current interaction between
+    AsyncCollectiveTensor and custom ops, such as `class RMSNorm(CustomOp)`.
+    """
+
+    def _partition_linear_fn(self, name, module, device_mesh):
+        # Rowwise shard weight to Shard(1), bias to Replicate(), weight be Shard(1)
+        # means Rowwise as nn.Linear is input * weight^T + bias, where
+        # weight would become Shard(0)
+        module.register_parameter(
+            "weight",
+            nn.Parameter(_shard_tensor(module.weight, device_mesh, [dt.Shard(1)])),
+        )
+        if getattr(module, "bias", None) is not None:
+            # The Linear module has bias
+            module.register_parameter(
+                "bias",
+                nn.Parameter(
+                    dt.distribute_tensor(module.bias, device_mesh, [dt.Replicate()])
+                ),
+            )
+
+    @staticmethod
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        outputs = super(
+            RowwiseParallelMaybeWait, RowwiseParallelMaybeWait
+        )._prepare_output_fn(
+            output_layouts, use_local_output, mod, outputs, device_mesh
+        )
+        return torch.distributed._functional_collectives.wait_tensor(outputs)
+
+
+def tensor_parallel(
+    module: torch.nn.Module,
+    device_mesh: Optional[DeviceMesh] = None,
+):
+    """
+    Tensor parallelize the model across the given device mesh.
+    Args:
+        module (`torch.nn.Module`):
+            The module to tensor parallelize.
+        device_mesh (`torch.distributed.DeviceMesh`):
+            The device mesh to use for tensor parallelism.
+    """
+
+    # Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
+    # No op if `_tp_plan` attribute does not exist under the module.
+    # This is a helper function to be used with `model.apply` to recursively
+    # parallelize a model.
+    def tplize(mod: torch.nn.Module) -> None:
+        tp_plan = getattr(mod, "_tp_plan", None)
+        if tp_plan is None:
+            return
+        for child_name, tp_style in tp_plan.items():
+            submod = mod.get_submodule(child_name)
+            if tp_style == "Colwise":
+                parallelize_module(submod, device_mesh, ColwiseParallel())
+            elif tp_style == "Rowwise":
+                parallelize_module(submod, device_mesh, RowwiseParallelMaybeWait())
+            elif tp_style == "Colwise_Sharded":
+                parallelize_module(submod, device_mesh, ColwiseParallelSharded())
+            else:
+                raise ValueError(f"Unknown TP style {tp_style}")
+
+    # `apply` is a native method of `nn.Module` that recursively applies a
+    # function to every submodule.
+    module.apply(tplize)
diff --git a/python/sglang/srt/layers/moe/__init__.py b/python/sglang/srt/layers/moe/__init__.py
new file mode 100644
index 000000000..5c75a3682
--- /dev/null
+++ b/python/sglang/srt/layers/moe/__init__.py
@@ -0,0 +1,32 @@
+from sglang.srt.layers.moe.moe_runner import MoeRunner, MoeRunnerConfig
+from sglang.srt.layers.moe.utils import (
+    DeepEPMode,
+    MoeA2ABackend,
+    MoeRunnerBackend,
+    get_deepep_config,
+    get_deepep_mode,
+    get_moe_a2a_backend,
+    get_moe_runner_backend,
+    get_tbo_token_distribution_threshold,
+    initialize_moe_config,
+    is_tbo_enabled,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+    should_use_flashinfer_trtllm_moe,
+)
+
+__all__ = [
+    "DeepEPMode",
+    "MoeA2ABackend",
+    "MoeRunner",
+    "MoeRunnerConfig",
+    "MoeRunnerBackend",
+    "initialize_moe_config",
+    "get_moe_a2a_backend",
+    "get_moe_runner_backend",
+    "get_deepep_mode",
+    "should_use_flashinfer_trtllm_moe",
+    "should_use_flashinfer_cutlass_moe_fp4_allgather",
+    "is_tbo_enabled",
+    "get_tbo_token_distribution_threshold",
+    "get_deepep_config",
+]
diff --git a/python/sglang/srt/layers/moe/cutlass_moe.py b/python/sglang/srt/layers/moe/cutlass_moe.py
new file mode 100755
index 000000000..d0fb4e3ef
--- /dev/null
+++ b/python/sglang/srt/layers/moe/cutlass_moe.py
@@ -0,0 +1,365 @@
+"""CUTLASS based Fused MoE kernels."""
+
+import torch
+
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import (
+        apply_shuffle_mul_sum,
+        cutlass_fp4_group_mm,
+        fp8_blockwise_scaled_grouped_mm,
+        prepare_moe_input,
+        scaled_fp4_experts_quant,
+        shuffle_rows,
+        silu_and_mul,
+    )
+
+
+def cutlass_fused_experts_fp8(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a1_strides: torch.Tensor,
+    c1_strides: torch.Tensor,
+    a2_strides: torch.Tensor,
+    c2_strides: torch.Tensor,
+    workspace: torch.Tensor,
+    a_ptrs: torch.Tensor,
+    b_ptrs: torch.Tensor,
+    out_ptrs: torch.Tensor,
+    a_scales_ptrs: torch.Tensor,
+    b_scales_ptrs: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    use_fp8_blockscale: bool = True,
+) -> torch.Tensor:
+    """Performs Fused MoE computation using CUTLASS-like kernels with FP8 weights and activations.
+
+    This function implements a Mixture of Experts (MoE) layer with a SwiGLU/SiLU
+    activation, leveraging custom kernels likely derived from CUTLASS principles
+    for grouped matrix multiplication (`fp8_blockwise_scaled_grouped_mm`) and
+    data preparation (`prepare_moe_input`, `silu_and_mul`).
+
+    It handles per-token routing, quantizes input activations to FP8 with
+    per-token scales, performs the expert computations using FP8 GEMMs with
+    pre-quantized FP8 weights (per-block scales), applies the SiLU activation,
+    and combines the results weighted by the router scores.
+
+    Args:
+        a (torch.Tensor): Input activations. Shape: `(m, k)`, where `m` is the total
+            number of tokens and `k` is the hidden size. Expected dtype: `torch.half`
+            or `torch.bfloat16`.
+        w1_q (torch.Tensor): Pre-quantized FP8 weight tensor for the first GEMM
+            (up-projection part of SwiGLU). Expected shape: `(E, k, n*2)`, where
+            `E` is the number of experts, `k` is the hidden size, and `n*2` is the
+            intermediate size (`I`). Expected dtype: `torch.float8_e4m3fn`.
+            Note: This shape implies weights are stored as (num_experts, hidden_size, intermediate_size).
+        w2_q (torch.Tensor): Pre-quantized FP8 weight tensor for the second GEMM
+            (down-projection). Expected shape: `(E, n, k)`, where `n` is half the
+            intermediate size (`I // 2`). Expected dtype: `torch.float8_e4m3fn`.
+            Note: This shape implies weights are stored as (num_experts, intermediate_size // 2, hidden_size).
+        w1_scale (torch.Tensor): Scales corresponding to `w1_q` (per-block scales).
+            Shape: `(E, num_blocks_n, num_blocks_k)`. Dtype: `torch.float32`.
+        w2_scale (torch.Tensor): Scales corresponding to `w2_q` (per-block scales).
+             Shape: `(E, num_blocks_k, num_blocks_n)`. Dtype: `torch.float32`.
+        topk_weights (torch.Tensor): Router weights for the selected top-k experts
+            for each token. Shape: `(m, topk)`. Dtype should ideally match `a`.
+        topk_ids (torch.Tensor): Indices of the selected top-k experts for each token.
+            Shape: `(m, topk)`. Dtype: `torch.int32`.
+        a1_strides (torch.Tensor): Stride information for the first GEMM's 'a' input.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+            Note: Its exact usage within `fp8_blockwise_scaled_grouped_mm` needs clarification
+            as it's passed as both a_stride and b_stride in the first call.
+        c1_strides (torch.Tensor): Stride information for the first GEMM's 'c' output.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+        a2_strides (torch.Tensor): Stride information for the second GEMM's 'a' input.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+            Note: Its exact usage within `fp8_blockwise_scaled_grouped_mm` needs clarification
+            as it's passed as both a_stride and b_stride in the second call.
+        c2_strides (torch.Tensor): Stride information for the second GEMM's 'c' output.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+        workspace (torch.Tensor): Reusable workspace for the underlying kernel.
+        a_ptrs (torch.Tensor): Pointers container for calculating offsets of the input activations for each expert.
+        b_ptrs (torch.Tensor): Pointers container for calculating offsets of the input weights for each expert.
+        out_ptrs (torch.Tensor): Pointers container for calculating offsets of the output activations for each expert.
+        a_scales_ptrs (torch.Tensor): Pointers container for calculating offsets of the input scales for each expert.
+        b_scales_ptrs (torch.Tensor): Pointers container for calculating offsets of the input scales for each expert.
+        use_fp8_blockscale (bool, optional): Flag indicating usage of FP8 with
+            block scaling. Currently, only `True` is supported. Defaults to `True`.
+
+    Returns:
+        torch.Tensor: The computed MoE layer output. Shape: `(m, k)`, dtype matches `a`.
+
+    Raises:
+        AssertionError: If input shapes, dtypes, or flags are inconsistent or unsupported.
+        NotImplementedError: If CUDA is not available or `sgl_kernel` is not properly installed.
+    """
+    assert use_fp8_blockscale, "Only support fp8 blockscale for now"
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert w1_q.dtype == torch.float8_e4m3fn
+    assert w2_q.dtype == torch.float8_e4m3fn
+    assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1"
+    assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2"
+    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
+    assert w1_q.shape[0] == w2_q.shape[0], "Weights expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+    assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
+
+    if is_cuda:
+        from sglang.srt.layers.quantization.fp8_kernel import (
+            per_group_transpose,
+            per_token_group_quant_fp8_hopper_moe_mn_major,
+            sglang_per_token_group_quant_fp8,
+        )
+
+    out_dtype = a.dtype
+    num_experts = w1_q.size(0)
+    m = a.size(0)
+    k = w1_q.size(1)
+    n = w2_q.size(1)
+
+    topk = topk_ids.size(1)
+    device = a.device
+
+    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+
+    prepare_moe_input(
+        topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a_map,
+        c_map,
+        num_experts,
+        n,
+        k,
+    )
+
+    a_q, a1_scale = sglang_per_token_group_quant_fp8(a, 128)
+    rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
+    rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))
+
+    c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype)
+    c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype)
+
+    a_sf_layout = torch.empty((num_experts, 5), device=device, dtype=torch.int)
+    w_sf_layout = torch.empty((num_experts, 5), device=device, dtype=torch.int)
+
+    fp8_blockwise_scaled_grouped_mm(
+        c1,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        rep_a_q,
+        w1_q,
+        rep_a1_scales,
+        w1_scale,
+        a1_strides,
+        a1_strides,
+        c1_strides,
+        a_sf_layout,
+        w_sf_layout,
+        problem_sizes1,
+        expert_offsets[:-1],
+        workspace,
+    )
+
+    intermediate = torch.empty((m * topk, n), device=device, dtype=out_dtype)
+    silu_and_mul(c1, intermediate)
+
+    intemediate_q, a2_scale = sglang_per_token_group_quant_fp8(intermediate, 128)
+
+    fp8_blockwise_scaled_grouped_mm(
+        c2,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        intemediate_q,
+        w2_q,
+        a2_scale,
+        w2_scale,
+        a2_strides,
+        a2_strides,
+        c2_strides,
+        a_sf_layout,
+        w_sf_layout,
+        problem_sizes2,
+        expert_offsets[:-1],
+        workspace,
+    )
+
+    result = torch.empty((m, k), device=device, dtype=out_dtype)
+    apply_shuffle_mul_sum(c2, result, c_map, topk_weights.to(out_dtype))
+    return result
+
+
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = 448.0
+
+
+def cutlass_moe_fp4(
+    a: torch.Tensor,
+    a1_gscale: torch.Tensor,
+    w1_fp4: torch.Tensor,
+    w1_blockscale: torch.Tensor,
+    w1_alphas: torch.Tensor,
+    a2_gscale: torch.Tensor,
+    w2_fp4: torch.Tensor,
+    w2_blockscale: torch.Tensor,
+    w2_alphas: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    params: CutlassMoEParams,
+    apply_router_weight_on_input: bool = False,
+):
+    """
+    MoE implementation for FP4 Inputs
+
+    # Gemm 1
+    a: Input tensor: [m, k] (half/bfloat16)
+    a1_gscale: Activation scale per expert: [e]  (float32)
+    w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]
+    w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)
+    (Note: `n` is the up projection output dim, `k` is the input dim in
+     full precision)
+    w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)
+                   (Block size = 16 for NVFP4)
+
+    # Gemm 2
+    a2_gscale: Activation scale per expert: [e]
+    w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]
+    w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)
+    w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3
+
+    Strides for activations, weights and output in logical number of elements.
+    The activations & output stride is the number of elements to the next row.
+    The weights stride is the number of elements to the next row per expert.
+    For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    shape [e] with each element being k. Similarly for activations, if the
+    shape is [m, k], then the a_stride has shape [e] with each value k.
+    Similarly for output, if the output is [m, n], then the c_stride is a
+    tensor of shape [e] with each element being k.
+
+    Note: cutlass_fp4_group_mm is designed to accept the strides of
+    activations and weights to be the same, so it is passed in as a single
+    tensor.
+    ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    c_strides_2: [e] dtype: int64 [Gemm 1: Output Strides]
+
+    topk_weights: [m, topk] dtype: float8
+    topk_ids: [m, topk] dtype: float8
+
+    m, n, k: Unquantized weight shapes, dtype: int
+    e: number of experts for the current rank, dtype: int
+    assumes that topk < k < n to satisfy - up/down projection expectations.
+    """
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8"
+    assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8"
+    assert (
+        w1_fp4.ndim == 3
+        and w2_fp4.ndim == 3
+        and w1_blockscale.ndim == 3
+        and w2_blockscale.ndim == 3
+    ), "All Weights must be of rank 3 for cutlass_moe_fp4"
+    m_a, k_a = a.shape
+    e_w1, nx2_w1, half_k_w1 = w1_fp4.shape
+    e_w2, k_w2, half_n_w2 = w2_fp4.shape
+
+    assert e_w1 == e_w2 and e_w1 == params.num_experts, (
+        "Number of experts must match",
+        " between weights.",
+    )
+    assert (
+        k_a // 2 == half_k_w1 and params.hidden_size == k_w2
+    ), "Hidden size mismatch between a, w1 and w2"
+    assert (
+        nx2_w1 == params.intermediate_size_per_partition * 2
+        and half_n_w2 == params.intermediate_size_per_partition // 2
+    ), ("mismatch in " "expected `n`")
+    assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1"
+    assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
+
+    out_dtype = a.dtype
+    num_topk = topk_ids.shape[1]
+    device = a.device
+    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    prepare_moe_input(
+        topk_ids,
+        params.expert_offsets,
+        params.problem_sizes1,
+        params.problem_sizes2,
+        a_map,
+        c_map,
+        params.num_experts,
+        params.intermediate_size_per_partition,
+        params.hidden_size,
+        params.blockscale_offsets,
+    )
+
+    rep_a_fp4, rep_a_blockscale = scaled_fp4_experts_quant(
+        a,
+        a1_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+        expert_map=a_map,
+    )
+    c1 = cutlass_fp4_group_mm(
+        rep_a_fp4,
+        w1_fp4,
+        rep_a_blockscale,
+        w1_blockscale,
+        w1_alphas,
+        out_dtype,
+        device,
+        params.to_gemm1_args(),
+    )
+    del rep_a_fp4, rep_a_blockscale
+
+    # hidden size dimension is split to one halfpytho sized tensor.
+    intermediate = torch.empty(
+        (m_a * num_topk, w1_fp4.shape[1] // 2), device=device, dtype=out_dtype
+    )
+    silu_and_mul(c1, intermediate)
+
+    int_fp4, int_blockscale = scaled_fp4_experts_quant(
+        intermediate,
+        a2_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+    )
+    c2 = cutlass_fp4_group_mm(
+        int_fp4,
+        w2_fp4,
+        int_blockscale,
+        w2_blockscale,
+        w2_alphas,
+        out_dtype,
+        device,
+        params.to_gemm2_args(),
+    )
+    del int_fp4, int_blockscale
+    c2 = shuffle_rows(c2, c_map, (m_a * num_topk, params.hidden_size))
+    c2 = c2.view(m_a, num_topk, params.hidden_size)
+    if not apply_router_weight_on_input:
+        c2 = c2 * topk_weights.view(m_a, num_topk, 1).to(out_dtype)
+    return c2.sum(dim=1).to(out_dtype)
diff --git a/python/sglang/srt/layers/moe/cutlass_moe_params.py b/python/sglang/srt/layers/moe/cutlass_moe_params.py
new file mode 100644
index 000000000..f3de60e04
--- /dev/null
+++ b/python/sglang/srt/layers/moe/cutlass_moe_params.py
@@ -0,0 +1,169 @@
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+
+class CutlassMoEType(Enum):
+    """
+    Enum for the different types of cutlass moe operations
+    that are currently supported in SGLang.
+    """
+
+    BlockscaledFP8 = auto()
+    BlockscaledFP4 = auto()
+
+
+@dataclass
+class CutlassMoEParams:
+    """
+    Parameters for the cutlass moe operation.
+    """
+
+    #  Type as defined above
+    cutlass_moe_type: CutlassMoEType
+
+    # Strides for activations, weights and output in logical number of elements.
+    # The activations & output stride is the number of elements to the next row.
+    # The weights stride is the number of elements to the next row per expert.
+    # For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    # shape [e] with each element being k. Similarly for activations, if the
+    # shape is [m, k], then the a_stride has shape [e] with each value k.
+    # Similarly for output, if the output is [m, n], then the c_stride is a
+    # tensor of shape [e] with each element being k.
+
+    # Note: cutlass_fp4_group_mm is designed to accept the strides of
+    # activations and weights to be the same, so it is passed in as a single
+    # tensor.
+    # ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    # ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    # c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    # c_strides_2: [e] dtype: int64 [Gemm 2: Output Strides]
+    ab_strides_13: torch.Tensor
+    ab_strides_2: torch.Tensor
+    c_strides_13: torch.Tensor
+    c_strides_2: torch.Tensor
+
+    # m: Total number of tokens
+    # n: intermediate size per partition
+    # k: hidden size per expert
+    # e: Number of experts
+    # device: Device to run computation on and store tensors
+    m: int
+    intermediate_size_per_partition: int
+    hidden_size: int
+    num_experts: int
+    device: torch.device
+
+    # Pointers container for calculating offsets of the input activations for each expert
+    # a_ptrs: [e] dtype: int64
+    a_ptrs: torch.Tensor
+
+    # Pointers container for calculating offsets of the input weights for each expert
+    # b_ptrs: [e] dtype: int64
+    b_ptrs: torch.Tensor
+
+    # Pointers container for calculating offsets of the output activations for each expert
+    # out_ptrs: [e] dtype: int64
+    out_ptrs: torch.Tensor
+    # Pointers container for calculating offsets of the input scales for each expert
+    # a_scales_ptrs: [e] dtype: int64
+    # b_scales_ptrs: [e] dtype: int64
+    a_scales_ptrs: torch.Tensor
+    b_scales_ptrs: torch.Tensor
+
+    # Offsets that mark at which token index each expert begins its computation
+    # The number of tokens computed with expert E is expert_offsets[E + 1] - expert_offsets[E]
+    # expert_offsets: [e+1] dtype: int32
+    expert_offsets: torch.Tensor
+
+    # Problem size: (num_experts, (m,2n,k)) for first GEMM
+    # problem_sizes1: [e, 3] dtype: int32
+    # Problem size: (num_experts, (m,n,k)) for second GEMM
+    # problem_sizes2: [e, 3] dtype: int32
+    problem_sizes1: torch.Tensor
+    problem_sizes2: torch.Tensor
+    # Similar to expert_offsets, but for blockscales for FP4 blockscaled Group GEMM
+    blockscale_offsets: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        cutlass_moe_type: CutlassMoEType,
+        device: torch.device,
+        num_experts: int,
+        intermediate_size_per_partition: int,
+        hidden_size: int,
+    ):
+        self.cutlass_moe_type = cutlass_moe_type
+        self.device = device
+        self.num_experts = num_experts
+        self.intermediate_size_per_partition = intermediate_size_per_partition
+        self.hidden_size = hidden_size
+        self.n = self.intermediate_size_per_partition
+        self.k = self.hidden_size
+        self.e = self.num_experts
+        self.ab_strides_13 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.ab_strides_2 = torch.full(
+            (self.e,), self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_13 = torch.full(
+            (self.e,), 2 * self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_2 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.expert_offsets = torch.empty(
+            (self.e + 1,), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes1 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes2 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        if self.cutlass_moe_type == CutlassMoEType.BlockscaledFP4:
+            self.blockscale_offsets = torch.empty(
+                (self.e + 1,), dtype=torch.int32, device=self.device
+            )
+        else:
+            self.blockscale_offsets = None
+        self.a_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.b_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.out_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.a_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+        self.b_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+
+    def to_gemm1_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_13,
+            "c_strides": self.c_strides_13,
+            "problem_sizes": self.problem_sizes1,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            #    "a_ptrs": self.a_ptrs,
+            #    "b_ptrs": self.b_ptrs,
+            #    "out_ptrs": self.out_ptrs,
+            #    "a_scales_ptrs": self.a_scales_ptrs,
+            #    "b_scales_ptrs": self.b_scales_ptrs,
+        }
+
+    def to_gemm2_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_2,
+            "c_strides": self.c_strides_2,
+            "problem_sizes": self.problem_sizes2,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            #    "a_ptrs": self.a_ptrs,
+            #    "b_ptrs": self.b_ptrs,
+            #    "out_ptrs": self.out_ptrs,
+            #    "a_scales_ptrs": self.a_scales_ptrs,
+            #    "b_scales_ptrs": self.b_scales_ptrs,
+        }
diff --git a/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py b/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py
new file mode 100644
index 000000000..216424eea
--- /dev/null
+++ b/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py
@@ -0,0 +1,206 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Cutlass W4A8 MoE kernel."""
+from typing import Optional
+
+import torch
+from sgl_kernel import (
+    cutlass_w4a8_moe_mm,
+    get_cutlass_w4a8_moe_mm_data,
+    sgl_per_tensor_quant_fp8,
+    silu_and_mul,
+)
+
+from sglang.srt.layers.moe.ep_moe.kernels import (
+    post_reorder_triton_kernel_for_cutlass_moe,
+    pre_reorder_triton_kernel_for_cutlass_moe,
+    run_cutlass_moe_ep_preproess,
+)
+
+
+def cutlass_w4a8_moe(
+    start_expert_id: int,
+    end_expert_id: int,
+    total_num_experts: int,
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    local_topk_ids: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+) -> torch.Tensor:
+    """
+    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
+    using two sets of quantized weights, w1_q and w2_q, and top-k gating
+    mechanism. The matrix multiplications are implemented with CUTLASS
+    grouped gemm.
+
+    Parameters:
+    - a (torch.Tensor): The input tensor to the MoE layer.
+        Shape: [M, K]
+    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
+        Shape: [num_experts, N * 2,  K // 2]
+        (the weights are passed transposed and int4-packed)
+    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
+        Shape: [num_experts, K, N // 2]
+        (the weights are passed transposed and int4-packed)
+    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
+        Shape: [num_experts, K // 512, N * 8]
+    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
+        Shape: [num_experts, N // 512, K * 4]
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
+    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
+    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
+    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
+    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
+    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
+    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
+    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
+    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
+        Shape: scalar or [1, K]
+    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
+        quantize the intermediate result between the gemms.
+        Shape: scalar or [1, N]
+    - apply_router_weight_on_input (bool): When true, the topk weights are
+        applied directly on the inputs. This is only applicable when topk is 1.
+
+    Returns:
+    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
+    """
+    assert topk_weights.shape == topk_ids_.shape, "topk shape mismatch"
+    assert w1_q.dtype == torch.int8
+    assert w2_q.dtype == torch.int8
+    assert a.shape[1] // 2 == w1_q.shape[2], "Hidden size mismatch w1"
+    assert w1_q.shape[2] * 2 == w2_q.shape[1], "Hidden size mismatch w2"
+    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+
+    assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
+    assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
+    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
+    assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
+    num_experts = w1_q.size(0)
+    m = a.size(0)
+    k = w1_q.size(2) * 2  # w1_q is transposed and packed
+    n = w2_q.size(2) * 2  # w2_q is transposed and packed
+    topk = topk_ids_.size(1)
+
+    if apply_router_weight_on_input:
+        assert topk == 1, "apply_router_weight_on_input is only implemented for topk=1"
+
+    device = a.device
+
+    _, src2dst, _ = run_cutlass_moe_ep_preproess(
+        local_topk_ids,
+        num_experts,
+    )
+
+    gateup_input = torch.empty(
+        (m * topk, k),
+        device=device,
+        dtype=torch.float8_e4m3fn,
+    )
+
+    pre_reorder_triton_kernel_for_cutlass_moe[(m,)](
+        a,
+        gateup_input,
+        src2dst,
+        local_topk_ids,
+        a1_scale,
+        total_num_experts,
+        topk,
+        k,
+        BLOCK_SIZE=512,
+    )
+
+    # NOTE: a_map and c_map are not used in the get_cutlass_w4a8_moe_mm_data kernel,
+    # they are kept to allow for a quick switch of the permutation logic
+    # from the current triton kernel implementation to the cutlass-based one if needed.
+    a_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
+    get_cutlass_w4a8_moe_mm_data(
+        local_topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a_map,
+        c_map,
+        num_experts,
+        n,
+        k,
+    )
+
+    c1 = torch.empty((m * topk, n * 2), device=device, dtype=torch.bfloat16)
+    c2 = torch.zeros((m * topk, k), device=device, dtype=torch.bfloat16)
+
+    cutlass_w4a8_moe_mm(
+        c1,
+        gateup_input,
+        w1_q,
+        a1_scale.float(),
+        w1_scale,
+        expert_offsets[:-1],
+        problem_sizes1,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        s_strides13,
+        128,
+        topk,
+    )
+
+    intermediate = torch.empty((m * topk, n), device=device, dtype=torch.bfloat16)
+    silu_and_mul(c1, intermediate)
+
+    intermediate_q = torch.empty(
+        intermediate.shape, dtype=torch.float8_e4m3fn, device=device
+    )
+    sgl_per_tensor_quant_fp8(intermediate, intermediate_q, a2_scale.float(), True)
+
+    cutlass_w4a8_moe_mm(
+        c2,
+        intermediate_q,
+        w2_q,
+        a2_scale.float(),
+        w2_scale,
+        expert_offsets[:-1],
+        problem_sizes2,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides2,
+        128,
+        topk,
+    )
+
+    output = torch.empty_like(a)
+    post_reorder_triton_kernel_for_cutlass_moe[(m,)](
+        c2,
+        output,
+        src2dst,
+        local_topk_ids,
+        topk_weights,
+        num_experts,
+        topk,
+        k,
+        0,
+        BLOCK_SIZE=512,
+    )
+    return output
diff --git a/python/sglang/srt/layers/moe/ep_moe/__init__.py b/python/sglang/srt/layers/moe/ep_moe/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/srt/layers/moe/ep_moe/kernels.py b/python/sglang/srt/layers/moe/ep_moe/kernels.py
new file mode 100644
index 000000000..08660812d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/ep_moe/kernels.py
@@ -0,0 +1,1438 @@
+import logging
+from typing import List, Optional
+
+import torch
+import triton
+
+from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
+from sglang.srt.utils import ceil_div, dispose_tensor, is_cuda
+from sglang.utils import is_in_ci
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sglang.srt.layers.quantization.fp8_kernel import (
+        sglang_per_token_group_quant_fp8 as per_token_group_quant_fp8,
+    )
+
+import triton.language as tl
+
+
+@triton.jit
+def deepep_permute_triton_kernel(
+    input_ptr,
+    gateup_input_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    a1_scales_ptr,
+    topk,
+    hidden_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+    OutDtype = gateup_input_ptr.dtype.element_ty
+
+    src_idx = tl.program_id(0)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+
+    src_ptr = input_ptr + src_idx * hidden_size
+
+    for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+        offset = start_offset + tl.arange(0, BLOCK_SIZE)
+        mask = offset < hidden_size
+        in_data = tl.load(src_ptr + offset, mask=mask).to(OutDtype)
+
+        for idx in range(topk):
+            dst_idx = tl.load(src2dst_ptr + idx)
+            if dst_idx >= 0:
+                dst_ptr = gateup_input_ptr + dst_idx * hidden_size
+                tl.store(dst_ptr + offset, in_data, mask=mask)
+
+
+@triton.jit
+def deepep_post_reorder_triton_kernel(
+    down_output_ptr,
+    output_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    topk_weights_ptr,
+    topk,
+    hidden_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+    InDtype = down_output_ptr.dtype.element_ty
+
+    src_idx = tl.program_id(0)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+    topk_weights_ptr = topk_weights_ptr + src_idx * topk
+
+    store_ptr = output_ptr + src_idx * hidden_size
+    for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+        offset = start_offset + tl.arange(0, BLOCK_SIZE)
+        mask = offset < hidden_size
+        sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
+        for idx in range(topk):
+            dst_idx = tl.load(src2dst_ptr + idx)
+            if dst_idx >= 0:
+                weigh_scale = tl.load(topk_weights_ptr + idx).to(InDtype)
+                load_ptr = down_output_ptr + dst_idx * hidden_size
+                in_data = tl.load(load_ptr + offset, mask=mask)
+                sum_vec += in_data * weigh_scale
+        tl.store(store_ptr + offset, sum_vec, mask=mask)
+
+
+@triton.jit
+def compute_src2dst_triton_kernel(
+    reorder_ids, src2dst, num_toks, BLOCK_SIZE: tl.constexpr
+):
+    pid = tl.program_id(axis=0)
+    dst_id = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = dst_id < num_toks
+    src_id = tl.load(reorder_ids + dst_id, mask=mask)
+    tl.store(src2dst + src_id, dst_id, mask=mask)
+
+
+@triton.jit
+def deepep_compute_src2dst_triton_kernel(
+    reorder_ids, src2dst, num_toks, num_minus_one, BLOCK_SIZE: tl.constexpr
+):
+    pid = tl.program_id(axis=0)
+    dst_id = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = dst_id < num_toks
+    src_id = tl.load(reorder_ids + dst_id, mask=mask)
+    num_invalid = tl.load(num_minus_one)
+    tl.store(src2dst + src_id, dst_id - num_invalid, mask=mask)
+
+
+def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
+    reorder_topk_ids, reorder_ids = torch.sort(topk_ids.view(-1), stable=True)
+    seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
+    src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64)
+
+    # Find offset
+    expert_ids = torch.arange(
+        num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype
+    )
+    torch.searchsorted(reorder_topk_ids, expert_ids, out=seg_indptr)
+    num_minus_one = seg_indptr[0]
+    seg_indptr = seg_indptr - num_minus_one
+
+    BLOCK_SIZE = 512
+    grid = (triton.cdiv(topk_ids.numel(), BLOCK_SIZE),)
+    deepep_compute_src2dst_triton_kernel[grid](
+        reorder_ids, src2dst, topk_ids.numel(), num_minus_one, BLOCK_SIZE
+    )
+    reorder_topk_ids = reorder_topk_ids[num_minus_one:]
+    return reorder_topk_ids, src2dst, seg_indptr
+
+
+@triton.jit
+def compute_seg_indptr_triton_kernel(reorder_topk_ids, seg_indptr, num_toks):
+    expert = tl.program_id(0)
+    low = 0
+    high = num_toks - 1
+    target_location = -1
+    while low <= high:
+        mid = (low + high) // 2
+
+        if tl.load(reorder_topk_ids + mid) > expert:
+            high = mid - 1
+        else:
+            low = mid + 1
+            target_location = mid
+    tl.store(seg_indptr + expert + 1, target_location + 1)
+
+
+def run_moe_ep_preproess(topk_ids: torch.Tensor, num_experts: int):
+    reorder_topk_ids, reorder_ids = torch.sort(topk_ids.view(-1), stable=True)
+
+    seg_indptr = torch.zeros(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
+    src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int32)
+
+    compute_seg_indptr_triton_kernel[(num_experts,)](
+        reorder_topk_ids, seg_indptr, topk_ids.numel()
+    )
+
+    BLOCK_SIZE = 512
+    grid = (triton.cdiv(topk_ids.numel(), BLOCK_SIZE),)
+    compute_src2dst_triton_kernel[grid](
+        reorder_ids, src2dst, topk_ids.numel(), BLOCK_SIZE
+    )
+
+    return reorder_topk_ids, src2dst, seg_indptr
+
+
+def run_cutlass_moe_ep_preproess(local_topk_ids: torch.Tensor, local_num_experts: int):
+    reorder_topk_ids, reorder_ids = torch.sort(local_topk_ids.view(-1), stable=True)
+
+    seg_indptr = torch.zeros(
+        local_num_experts + 1, device=local_topk_ids.device, dtype=torch.int64
+    )
+    src2dst = torch.empty(
+        local_topk_ids.numel(), device=local_topk_ids.device, dtype=torch.int32
+    )
+
+    BLOCK_SIZE = 512
+    grid = (triton.cdiv(local_topk_ids.numel(), BLOCK_SIZE),)
+    compute_src2dst_triton_kernel[grid](
+        reorder_ids, src2dst, local_topk_ids.numel(), BLOCK_SIZE
+    )
+
+    return reorder_topk_ids, src2dst, seg_indptr
+
+
+@triton.jit
+def pre_reorder_triton_kernel_for_cutlass_moe(
+    input_ptr,
+    gateup_input_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    a1_scales_ptr,
+    num_experts,
+    topk,
+    hidden_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+    OutDtype = gateup_input_ptr.dtype.element_ty
+
+    src_idx = tl.program_id(0)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+
+    src_ptr = input_ptr + src_idx * hidden_size
+    for idx in range(topk):
+        expert_id = tl.load(topk_ids_ptr + idx)
+        if expert_id != num_experts:
+            if a1_scales_ptr is not None:
+                scale = 1.0 / tl.load(a1_scales_ptr)
+            else:
+                scale = 1.0
+
+            dst_idx = tl.load(src2dst_ptr + idx)
+            dst_ptr = gateup_input_ptr + dst_idx * hidden_size
+            for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+                offset = start_offset + tl.arange(0, BLOCK_SIZE)
+                mask = offset < hidden_size
+                in_data = tl.load(src_ptr + offset, mask=mask).to(tl.float32)
+                out_data = (in_data * scale).to(OutDtype)
+                tl.store(dst_ptr + offset, out_data, mask=mask)
+
+
+@triton.jit
+def pre_reorder_triton_kernel(
+    input_ptr,
+    gateup_input_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    a1_scales_ptr,
+    start_expert_id,
+    end_expert_id,
+    topk,
+    hidden_size,
+    BLOCK_SIZE: tl.constexpr,
+    use_per_token_if_dynamic: tl.constexpr,
+):
+    OutDtype = gateup_input_ptr.dtype.element_ty
+
+    src_idx_int32 = tl.program_id(0)
+    src_idx = src_idx_int32.to(tl.int64)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+    src_ptr = input_ptr + src_idx * hidden_size
+
+    vec = tl.arange(0, BLOCK_SIZE)
+
+    if a1_scales_ptr is not None and use_per_token_if_dynamic:
+        scale = 1.0 / tl.load(a1_scales_ptr + src_idx)
+
+    for idx in range(topk):
+        expert_id = tl.load(topk_ids_ptr + idx)
+        if expert_id >= start_expert_id and expert_id <= end_expert_id:
+            if a1_scales_ptr is not None:
+                if not use_per_token_if_dynamic:
+                    scale = 1.0 / tl.load(a1_scales_ptr + expert_id - start_expert_id)
+            else:
+                scale = 1.0
+
+            dst_idx_int32 = tl.load(src2dst_ptr + idx)
+            dst_idx = dst_idx_int32.to(tl.int64)
+            dst_ptr = gateup_input_ptr + dst_idx * hidden_size
+            for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+                offset = start_offset + vec
+                mask = offset < hidden_size
+                in_data = tl.load(src_ptr + offset, mask=mask).to(tl.float32)
+                out_data = (in_data * scale).to(OutDtype)
+                tl.store(dst_ptr + offset, out_data, mask=mask)
+
+
+@triton.jit
+def silu_and_mul_triton_kernel(
+    gateup_output,
+    down_input,
+    hidden_size,
+    reorder_topk_ids,
+    scales,
+    start_expert_id,
+    end_expert_id,
+    BLOCK_SIZE: tl.constexpr,
+):
+    InDtype = gateup_output.dtype.element_ty
+    OutDtype = down_input.dtype.element_ty
+
+    half_hidden_size = hidden_size // 2
+
+    pid = tl.program_id(0)
+    expert_id = tl.load(reorder_topk_ids + pid)
+    if expert_id >= start_expert_id and expert_id <= end_expert_id:
+        gateup_output_ptr = gateup_output + pid * hidden_size
+        gate_output_ptr = gateup_output_ptr
+        up_output_ptr = gateup_output_ptr + half_hidden_size
+        down_input_ptr = down_input + pid * half_hidden_size
+
+        if scales is not None:
+            scale = tl.load(scales + expert_id - start_expert_id)
+            scale = (1 / scale).to(InDtype)
+        else:
+            scale = 1
+
+        for start_offset in tl.range(0, half_hidden_size, BLOCK_SIZE):
+            offset = start_offset + tl.arange(0, BLOCK_SIZE)
+            mask = offset < half_hidden_size
+
+            gate_output = tl.load(gate_output_ptr + offset, mask=mask).to(tl.float32)
+            up_output = tl.load(up_output_ptr + offset, mask=mask)
+
+            # silu & mul & quantize
+            gate_output = gate_output * tl.sigmoid(gate_output)
+            gate_output = gate_output.to(InDtype)
+
+            silu_mul_output = gate_output * up_output * scale
+            silu_mul_output = silu_mul_output.to(OutDtype)
+            tl.store(down_input_ptr + offset, silu_mul_output, mask=mask)
+
+
+# copy from https://github.com/ModelTC/lightllm/blob/a000ab69098654df4731f5b12587dd4e7f0a4f41/lightllm/common/fused_moe/moe_silu_and_mul_mix_quant_ep.py
+@triton.jit
+def _silu_and_mul_post_quant_kernel(
+    input_ptr,
+    stride_input_0,
+    stride_input_1,
+    stride_input_2,
+    output_ptr,
+    stride_output_0,
+    stride_output_1,
+    stride_output_2,
+    output_scale_ptr,
+    stride_output_scale_0,
+    stride_output_scale_1,
+    stride_output_scale_2,
+    masked_m_ptr,
+    size_n,
+    fp8_max,
+    fp8_min,
+    BLOCK_N: tl.constexpr,
+    NUM_STAGE: tl.constexpr,
+    SCALE_UE8M0: tl.constexpr,
+):
+    expert_id = tl.program_id(2)
+    token_id = tl.program_id(1)
+    hidden_dim_block_index = tl.program_id(0)
+
+    block_num_per_expert = tl.num_programs(1)
+
+    token_num_cur_expert = tl.load(masked_m_ptr + expert_id)
+
+    stride_input_0 = tl.cast(stride_input_0, dtype=tl.int64)
+    stride_output_0 = tl.cast(stride_output_0, dtype=tl.int64)
+    stride_input_1 = tl.cast(stride_input_1, dtype=tl.int64)
+    stride_output_1 = tl.cast(stride_output_1, dtype=tl.int64)
+
+    offs_in_d = hidden_dim_block_index * BLOCK_N + tl.arange(0, BLOCK_N)
+    input_ptr_offs = input_ptr + expert_id * stride_input_0 + offs_in_d
+    output_ptr_offs = output_ptr + expert_id * stride_output_0 + offs_in_d
+    output_scale_offs = (
+        output_scale_ptr
+        + expert_id * stride_output_scale_0
+        + hidden_dim_block_index * stride_output_scale_2
+    )
+
+    for token_index in tl.range(
+        token_id, token_num_cur_expert, block_num_per_expert, num_stages=NUM_STAGE
+    ):
+        gate = tl.load(
+            input_ptr_offs + token_index * stride_input_1,
+            mask=offs_in_d < size_n,
+            other=0.0,
+        ).to(tl.float32)
+        up = tl.load(
+            input_ptr_offs + token_index * stride_input_1 + size_n,
+            mask=offs_in_d < size_n,
+            other=0.0,
+        )
+        gate = gate / (1 + tl.exp(-gate))
+        gate = gate.to(input_ptr.dtype.element_ty)
+        gate_up = up * gate
+        _absmax = tl.maximum(tl.max(tl.abs(gate_up)), 1e-10)
+        output_s = _absmax / fp8_max
+        if SCALE_UE8M0:
+            output_s = tl.exp2(tl.ceil(tl.log2(tl.abs(output_s))))
+        output_q = tl.clamp(gate_up / output_s, fp8_min, fp8_max).to(
+            output_ptr.dtype.element_ty
+        )
+        tl.store(
+            output_ptr_offs + token_index * stride_output_1,
+            output_q,
+            mask=offs_in_d < size_n,
+        )
+        tl.store(
+            output_scale_offs + token_index * stride_output_scale_1,
+            output_s,
+        )
+
+
+def silu_and_mul_masked_post_quant_fwd(
+    input: torch.Tensor,
+    output: torch.Tensor,
+    output_scale: torch.Tensor,
+    quant_group_size: int,
+    masked_m: torch.Tensor,
+    scale_ue8m0: bool = False,
+):
+    """
+    input shape [expert_num, token_num_padded, hidden_dim]
+    output shape [expert_num, token_num_padded, hidden_dim // 2], dtype fp8
+    output_scale [expert_num token_num_paddded, hidden_dim // 2 // 128] dtype float32
+    quant_group_size  int,
+    masked_m shape [expert_num],
+    """
+
+    assert input.is_contiguous()
+    assert output.dtype == torch.float8_e4m3fn
+    assert output.is_contiguous()
+    assert len(input.shape) == 3
+    assert input.shape[0] == masked_m.shape[0]
+    assert input.shape[-1] % 2 == 0
+
+    size_n = input.shape[-1] // 2
+    assert size_n % quant_group_size == 0
+
+    expert_num = len(masked_m)
+
+    if expert_num < 4:
+        BLOCK_NUM_PER_EXPERT = 64
+    else:
+        BLOCK_NUM_PER_EXPERT = 32
+
+    BLOCK_N = quant_group_size
+    num_warps = 1
+    NUM_STAGES = 6
+    hidden_dim_split_block_num = triton.cdiv(size_n, BLOCK_N)
+    assert BLOCK_N % quant_group_size == 0
+
+    grid = (
+        hidden_dim_split_block_num,
+        BLOCK_NUM_PER_EXPERT,
+        expert_num,
+    )
+
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    fp8_max = finfo.max
+    fp8_min = -fp8_max
+
+    _silu_and_mul_post_quant_kernel[grid](
+        input,
+        *input.stride(),
+        output,
+        *output.stride(),
+        output_scale,
+        *output_scale.stride(),
+        masked_m,
+        size_n,
+        fp8_max,
+        fp8_min,
+        BLOCK_N=BLOCK_N,
+        NUM_STAGE=NUM_STAGES,
+        num_warps=num_warps,
+        SCALE_UE8M0=scale_ue8m0,
+    )
+    return
+
+
+@triton.jit
+def tanh(x):
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def gelu_and_mul_triton_kernel(
+    gateup_output,
+    down_input,
+    hidden_size,
+    reorder_topk_ids,
+    scales,
+    start_expert_id,
+    end_expert_id,
+    BLOCK_SIZE: tl.constexpr,
+):
+    InDtype = gateup_output.dtype.element_ty
+    OutDtype = down_input.dtype.element_ty
+
+    half_hidden_size = hidden_size // 2
+
+    pid = tl.program_id(0)
+    expert_id = tl.load(reorder_topk_ids + pid)
+    if expert_id >= start_expert_id and expert_id <= end_expert_id:
+        gateup_output_ptr = gateup_output + pid * hidden_size
+        gate_output_ptr = gateup_output_ptr
+        up_output_ptr = gateup_output_ptr + half_hidden_size
+        down_input_ptr = down_input + pid * half_hidden_size
+
+        if scales is not None:
+            scale = tl.load(scales + expert_id - start_expert_id)
+            scale = (1 / scale).to(InDtype)
+        else:
+            scale = 1
+
+        for start_offset in tl.range(0, half_hidden_size, BLOCK_SIZE):
+            offset = start_offset + tl.arange(0, BLOCK_SIZE)
+            mask = offset < half_hidden_size
+
+            gate_output = tl.load(gate_output_ptr + offset, mask=mask).to(tl.float32)
+            up_output = tl.load(up_output_ptr + offset, mask=mask)
+
+            # gelu & mul & quantize
+            # https://pytorch.org/docs/stable/generated/torch.nn.GELU.html
+            # sqrt(2/pi)
+            kAlpha = 0.7978845608028654
+            gate_output = (
+                0.5
+                * gate_output
+                * (
+                    1
+                    + tanh(
+                        kAlpha
+                        * (
+                            gate_output
+                            + 0.044715 * gate_output * gate_output * gate_output
+                        )
+                    )
+                )
+            )
+            gate_output = gate_output.to(InDtype)
+
+            gelu_mul_output = gate_output * up_output * scale
+            gelu_mul_output = gelu_mul_output.to(OutDtype)
+            tl.store(down_input_ptr + offset, gelu_mul_output, mask=mask)
+
+
+@triton.jit
+def post_reorder_triton_kernel(
+    down_output_ptr,
+    output_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    topk_weights_ptr,
+    start_expert_id,
+    end_expert_id,
+    topk,
+    hidden_size,
+    dst_start,
+    BLOCK_SIZE: tl.constexpr,
+):
+    InDtype = down_output_ptr.dtype.element_ty
+
+    src_idx_int32 = tl.program_id(0)
+    src_idx = src_idx_int32.to(tl.int64)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+    topk_weights_ptr = topk_weights_ptr + src_idx * topk
+
+    computed = False
+    store_ptr = output_ptr + src_idx * hidden_size
+
+    vec = tl.arange(0, BLOCK_SIZE)
+
+    for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+        offset = start_offset + vec
+        mask = offset < hidden_size
+
+        sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
+        for idx in range(topk):
+            expert_id = tl.load(topk_ids_ptr + idx)
+            if expert_id >= start_expert_id and expert_id <= end_expert_id:
+                computed = True
+                dst_idx_int32 = tl.load(src2dst_ptr + idx)
+                dst_idx = dst_idx_int32.to(tl.int64)
+                dst_idx = dst_idx - dst_start
+                weigh_scale = tl.load(topk_weights_ptr + idx).to(InDtype)
+                load_ptr = down_output_ptr + dst_idx * hidden_size
+                in_data = tl.load(load_ptr + offset, mask=mask)
+                sum_vec += in_data * weigh_scale
+        tl.store(store_ptr + offset, sum_vec, mask=mask)
+
+    if computed == False:
+        for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+            offset = start_offset + vec
+            mask = offset < hidden_size
+            tl.store(
+                store_ptr + offset, tl.zeros([BLOCK_SIZE], dtype=InDtype), mask=mask
+            )
+
+
+@triton.jit
+def post_reorder_triton_kernel_for_cutlass_moe(
+    down_output_ptr,
+    output_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    topk_weights_ptr,
+    num_experts,
+    topk,
+    hidden_size,
+    dst_start,
+    BLOCK_SIZE: tl.constexpr,
+):
+    InDtype = down_output_ptr.dtype.element_ty
+
+    src_idx_int32 = tl.program_id(0)
+    src_idx = src_idx_int32.to(tl.int64)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+    topk_weights_ptr = topk_weights_ptr + src_idx * topk
+
+    store_ptr = output_ptr + src_idx * hidden_size
+
+    vec = tl.arange(0, BLOCK_SIZE)
+
+    for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+        offset = start_offset + vec
+        mask = offset < hidden_size
+
+        sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
+        for idx in range(topk):
+            expert_id = tl.load(topk_ids_ptr + idx)
+            if expert_id != num_experts:
+                dst_idx_int32 = tl.load(src2dst_ptr + idx)
+                dst_idx = dst_idx_int32.to(tl.int64)
+                dst_idx = dst_idx - dst_start
+                weigh_scale = tl.load(topk_weights_ptr + idx).to(InDtype)
+                load_ptr = down_output_ptr + dst_idx * hidden_size
+                in_data = tl.load(load_ptr + offset, mask=mask)
+                sum_vec += in_data * weigh_scale
+        tl.store(store_ptr + offset, sum_vec, mask=mask)
+
+
+@triton.jit
+def compute_m_range(
+    pid,
+    batch_size,
+    seg_indptr,
+    weight_indices,
+    m_num_tiles_indptr,
+    BLOCK_SIZE_M: tl.constexpr,
+):
+    idx = 0
+    for bs in range(batch_size):
+        tiles = tl.load(m_num_tiles_indptr + bs)
+        if pid >= tiles:
+            idx = bs
+
+    idx_start = tl.load(m_num_tiles_indptr + idx)
+
+    m_range_start = tl.load(seg_indptr + idx) + (pid - idx_start) * BLOCK_SIZE_M
+    m_range_end = min(tl.load(seg_indptr + idx + 1), m_range_start + BLOCK_SIZE_M)
+    expert_id = tl.load(weight_indices + idx)
+    return m_range_start, m_range_end, expert_id
+
+
+@triton.jit
+def grouped_gemm_triton_kernel(
+    a,
+    b,
+    c,
+    batch_size,
+    N,
+    K,
+    seg_indptr,
+    weight_indices,
+    m_num_tiles_indptr,
+    scale_a,
+    scale_b,
+    use_fp8_w8a8: tl.constexpr,
+    group_n: tl.constexpr,
+    group_k: tl.constexpr,
+    a_stride_0: tl.constexpr,
+    b_stride_0: tl.constexpr,
+    b_stride_1: tl.constexpr,
+    as_stride_0: tl.constexpr,
+    as_stride_1: tl.constexpr,
+    bs_stride_0: tl.constexpr,
+    bs_stride_2: tl.constexpr,
+    bs_stride_1: tl.constexpr,
+    use_per_token_if_dynamic: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    c_dtype = c.dtype.element_ty
+
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+    total_m_block = tl.load(m_num_tiles_indptr + batch_size)
+    if pid_m >= total_m_block:
+        return
+
+    m_range_start, m_range_end, expert_id = compute_m_range(
+        pid_m, batch_size, seg_indptr, weight_indices, m_num_tiles_indptr, BLOCK_SIZE_M
+    )
+    if m_range_end - m_range_start == 0:
+        return
+
+    n_range_start = pid_n * BLOCK_SIZE_N
+    n_range_end = min(n_range_start + BLOCK_SIZE_N, N)
+
+    offs_am = tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = tl.arange(0, BLOCK_SIZE_N)
+
+    offs_am = tl.where(offs_am < m_range_end - m_range_start, offs_am, 0)
+    offs_bn = tl.where(offs_bn < n_range_end - n_range_start, offs_bn, 0)
+    offs_am = tl.max_contiguous(tl.multiple_of(offs_am, BLOCK_SIZE_M), BLOCK_SIZE_M)
+    offs_bn = tl.max_contiguous(tl.multiple_of(offs_bn, BLOCK_SIZE_N), BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    a_ptr = a + (m_range_start + offs_am[:, None]) * a_stride_0 + offs_k[None, :]
+    b_ptr = b + (
+        (expert_id * b_stride_0)
+        + (n_range_start + offs_bn[:, None]) * b_stride_1
+        + offs_k[None, :]
+    )
+
+    if group_k > 0 and group_n > 0:
+        a_scale_ptrs = scale_a + (m_range_start + offs_am[:, None]) * as_stride_0
+        offs_bsn = (n_range_start + offs_bn) // group_n
+        b_scale_ptrs = scale_b + (expert_id * bs_stride_0) + offs_bsn * bs_stride_1
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a_tile = tl.load(
+            a_ptr, mask=offs_k[None, :] < (K - k * BLOCK_SIZE_K), other=0.0
+        )
+        b_tile = tl.load(
+            b_ptr, mask=offs_k[None, :] < (K - k * BLOCK_SIZE_K), other=0.0
+        )
+
+        if group_k > 0 and group_n > 0:
+            k_start = k * BLOCK_SIZE_K
+            offs_ks = k_start // group_k
+            a_scale = tl.load(a_scale_ptrs + offs_ks * as_stride_1)
+            b_scale = tl.load(b_scale_ptrs + offs_ks * bs_stride_2)
+            accumulator += tl.dot(a_tile, b_tile.T) * a_scale * b_scale[None, :]
+        else:
+            accumulator = tl.dot(a_tile, b_tile.T, accumulator)
+        a_ptr += BLOCK_SIZE_K
+        b_ptr += BLOCK_SIZE_K
+
+    if use_fp8_w8a8 and not (group_k > 0 and group_n > 0):
+        if use_per_token_if_dynamic:
+            scale_a_value = tl.load(scale_a + (m_range_start + offs_am[:, None]))
+        else:
+            scale_a_value = tl.load(scale_a + expert_id)
+        scale_b_value = tl.load(scale_b + expert_id)
+        accumulator *= scale_a_value * scale_b_value
+
+    c_tile = accumulator.to(c_dtype)
+
+    offs_cm = m_range_start + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = n_range_start + tl.arange(0, BLOCK_SIZE_N)
+    c_ptr = c + offs_cm[:, None] * N + offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < m_range_end) & (offs_cn[None, :] < n_range_end)
+    tl.store(c_ptr, c_tile, mask=c_mask)
+
+
+@triton.jit
+def compute_m_num_tiles_indptr(
+    m_num_tiles_indptr, seg_indptr, batch_size: tl.constexpr, BLOCK_SIZE_M: tl.constexpr
+):
+    for bs in range(batch_size):
+        m = tl.load(seg_indptr + bs + 1) - tl.load(seg_indptr + bs)
+        cur_num_tiles = tl.cdiv(m, BLOCK_SIZE_M)
+        pre_num_tiles = tl.load(m_num_tiles_indptr + bs)
+        tl.store(m_num_tiles_indptr + bs + 1, pre_num_tiles + cur_num_tiles)
+
+
+def grouped_gemm_triton(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    c: torch.Tensor,
+    batch_size: int,
+    weight_column_major: bool,
+    seg_indptr: Optional[torch.Tensor] = None,
+    weight_indices: Optional[torch.Tensor] = None,
+    use_fp8_w8a8: bool = False,
+    scale_a: torch.Tensor = None,
+    scale_b: torch.Tensor = None,
+    block_shape: Optional[List[int]] = None,
+    c_dtype=None,
+    use_per_token_if_dynamic: bool = True,
+):
+    assert weight_column_major == True  # TODO: more
+    if use_fp8_w8a8 and block_shape is None:
+        assert scale_a is not None and scale_b is not None
+
+    if block_shape is not None:
+        a_original = a
+
+        assert len(block_shape) == 2
+        block_n, block_k = block_shape[0], block_shape[1]
+        a, scale_a = per_token_group_quant_fp8(a, block_k)
+
+        assert triton.cdiv(a.shape[-1], block_k) == scale_a.shape[-1]
+        assert triton.cdiv(b.shape[-2], block_n) == scale_b.shape[-2]
+        assert triton.cdiv(b.shape[-1], block_k) == scale_b.shape[-1]
+
+        dispose_tensor(a_original)
+
+    # TODO: adjust config or tune kernel
+    # Reduce block size to prevent L40 shared memory overflow.
+    config = {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+    }
+
+    m_num_tiles_indptr = torch.zeros(batch_size + 1, device=a.device, dtype=torch.int64)
+    compute_m_num_tiles_indptr[(1,)](
+        m_num_tiles_indptr, seg_indptr, batch_size, config["BLOCK_SIZE_M"]
+    )
+
+    if c is None:
+        assert c_dtype is not None
+        c = torch.empty(a.shape[0], b.shape[1], device=a.device, dtype=c_dtype)
+
+    grid = lambda META: (
+        triton.cdiv(a.size(0), META["BLOCK_SIZE_M"]) + batch_size,
+        triton.cdiv(b.size(1), META["BLOCK_SIZE_N"]),
+    )
+
+    if use_fp8_w8a8 and block_shape is None and use_per_token_if_dynamic:
+        assert (
+            scale_a.shape[0] == a.shape[0]
+        ), f"scale_a.shape: {scale_a.shape}, a.shape: {a.shape}"
+
+    grouped_gemm_triton_kernel[grid](
+        a,
+        b,
+        c,
+        batch_size,
+        b.size(1),
+        b.size(2),
+        seg_indptr,
+        weight_indices,
+        m_num_tiles_indptr,
+        scale_a,
+        scale_b,
+        use_fp8_w8a8,
+        0 if block_shape is None else block_shape[0],
+        0 if block_shape is None else block_shape[1],
+        a.stride(0),
+        b.stride(0),
+        b.stride(1),
+        scale_a.stride(0) if scale_a is not None and scale_a.ndim == 2 else 0,
+        scale_a.stride(1) if scale_a is not None and scale_a.ndim == 2 else 0,
+        scale_b.stride(0) if scale_b is not None and scale_b.ndim >= 2 else 0,
+        scale_b.stride(2) if scale_b is not None and scale_b.ndim == 3 else 0,
+        scale_b.stride(1) if scale_b is not None and scale_b.ndim >= 2 else 0,
+        use_per_token_if_dynamic,
+        **config,
+    )
+    return c
+
+
+@triton.jit
+def _fwd_kernel_ep_scatter_1(
+    num_recv_tokens_per_expert,
+    expert_start_loc,
+    m_indices,
+    num_experts: tl.constexpr,
+    BLOCK_E: tl.constexpr,
+    BLOCK_EXPERT_NUM: tl.constexpr,
+):
+    cur_expert = tl.program_id(0)
+
+    offset_cumsum = tl.arange(0, BLOCK_EXPERT_NUM)
+    tokens_per_expert = tl.load(
+        num_recv_tokens_per_expert + offset_cumsum,
+        mask=offset_cumsum < num_experts,
+        other=0,
+    )
+    cumsum = tl.cumsum(tokens_per_expert) - tokens_per_expert
+    tl.store(expert_start_loc + offset_cumsum, cumsum, mask=offset_cumsum < num_experts)
+
+    cur_expert_start = tl.load(expert_start_loc + cur_expert)
+    cur_expert_token_num = tl.load(num_recv_tokens_per_expert + cur_expert)
+
+    m_indices_start_ptr = m_indices + cur_expert_start
+    off_expert = tl.arange(0, BLOCK_E)
+
+    for start_m in tl.range(0, cur_expert_token_num, BLOCK_E, num_stages=4):
+        tl.store(
+            m_indices_start_ptr + start_m + off_expert,
+            cur_expert,
+        )
+
+
+@triton.jit
+def _fwd_kernel_ep_scatter_2(
+    total_token_num,
+    expert_start_loc,
+    recv_x,
+    recv_x_stride0,
+    recv_x_stride1,
+    recv_x_scale,
+    recv_x_scale_stride0,
+    recv_x_scale_stride1,
+    recv_topk,
+    recv_topk_stride0,
+    recv_topk_stride1,
+    output_tensor,
+    output_tensor_stride0,
+    output_tensor_stride1,
+    output_tensor_scale,
+    output_tensor_scale_stride0,
+    output_tensor_scale_stride1,
+    output_index,
+    output_index_stride0,
+    output_index_stride1,
+    topk_num: tl.constexpr,
+    HIDDEN_SIZE: tl.constexpr,
+    HIDDEN_SIZE_PAD: tl.constexpr,
+    SCALE_HIDDEN_SIZE: tl.constexpr,
+    SCALE_HIDDEN_SIZE_PAD: tl.constexpr,
+):
+    start_token_id = tl.program_id(0)
+    grid_num = tl.num_programs(0)
+
+    offset_in = tl.arange(0, HIDDEN_SIZE_PAD)
+    mask = offset_in < HIDDEN_SIZE
+
+    index_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
+    mask_s = index_in_s < SCALE_HIDDEN_SIZE
+
+    for token_id_int32 in range(start_token_id, total_token_num, grid_num):
+        token_id = token_id_int32.to(tl.int64)
+        to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
+        to_copy_s = tl.load(
+            recv_x_scale
+            + token_id * recv_x_scale_stride0
+            + index_in_s * recv_x_scale_stride1,
+            mask=mask_s,
+        )
+
+        for topk_idx_int32 in tl.range(0, topk_num, 1, num_stages=4):
+            topk_index = topk_idx_int32.to(tl.int64)
+            expert_id = tl.load(recv_topk + token_id * recv_topk_stride0 + topk_index)
+            if expert_id >= 0:
+                dest_token_index_int32 = tl.atomic_add(expert_start_loc + expert_id, 1)
+                dest_token_index = dest_token_index_int32.to(tl.int64)
+
+                tl.store(
+                    output_index + token_id * output_index_stride0 + topk_index,
+                    dest_token_index_int32,
+                )
+                output_tensor_ptr = (
+                    output_tensor + dest_token_index * output_tensor_stride0
+                )
+                output_tensor_scale_ptr = (
+                    output_tensor_scale + dest_token_index * output_tensor_scale_stride0
+                )
+                tl.store(output_tensor_ptr + offset_in, to_copy, mask=mask)
+                tl.store(
+                    output_tensor_scale_ptr + index_in_s * output_tensor_scale_stride1,
+                    to_copy_s,
+                    mask=mask_s,
+                )
+
+
+# copy from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/fused_moe/deepep_scatter_gather.py
+@torch.no_grad()
+def ep_scatter(
+    recv_x: torch.Tensor,
+    recv_x_scale: torch.Tensor,
+    recv_topk: torch.Tensor,
+    num_recv_tokens_per_expert: torch.Tensor,
+    expert_start_loc: torch.Tensor,
+    output_tensor: torch.Tensor,
+    output_tensor_scale: torch.Tensor,
+    m_indices: torch.Tensor,
+    output_index: torch.Tensor,
+    scale_ue8m0: bool = False,
+):
+    BLOCK_E = 128  # token num of per expert is aligned to 128
+    BLOCK_D = 128  # block size of quantization
+    num_warps = 8
+    num_experts = num_recv_tokens_per_expert.shape[0]
+    hidden_size = recv_x.shape[1]
+    # grid = (triton.cdiv(hidden_size, BLOCK_D), num_experts)
+    grid = num_experts
+
+    scale_hidden_size = hidden_size // BLOCK_D
+    if scale_ue8m0:
+        # ue8m0 scales are packed here (4 scales per int32),
+        # hence the effective size of this dimension is divided by 4.
+        scale_hidden_size = ceil_div(scale_hidden_size, 4)
+
+    assert m_indices.shape[0] % BLOCK_E == 0
+    assert recv_x_scale.dtype == output_tensor_scale.dtype
+    assert recv_x_scale.shape[1] == output_tensor_scale.shape[1] == scale_hidden_size
+
+    _fwd_kernel_ep_scatter_1[(grid,)](
+        num_recv_tokens_per_expert,
+        expert_start_loc,
+        m_indices,
+        num_experts=num_experts,
+        num_warps=num_warps,
+        BLOCK_E=BLOCK_E,
+        BLOCK_EXPERT_NUM=triton.next_power_of_2(num_experts),
+    )
+
+    grid = min(recv_topk.shape[0], 1024 * 8)
+
+    _fwd_kernel_ep_scatter_2[(grid,)](
+        recv_topk.shape[0],
+        expert_start_loc,
+        recv_x,
+        recv_x.stride(0),
+        recv_x.stride(1),
+        recv_x_scale,
+        recv_x_scale.stride(0),
+        recv_x_scale.stride(1),
+        recv_topk,
+        recv_topk.stride(0),
+        recv_topk.stride(1),
+        output_tensor,
+        output_tensor.stride(0),
+        output_tensor.stride(1),
+        output_tensor_scale,
+        output_tensor_scale.stride(0),
+        output_tensor_scale.stride(1),
+        output_index,
+        output_index.stride(0),
+        output_index.stride(1),
+        topk_num=recv_topk.shape[1],
+        num_warps=num_warps,
+        HIDDEN_SIZE=hidden_size,
+        HIDDEN_SIZE_PAD=triton.next_power_of_2(hidden_size),
+        SCALE_HIDDEN_SIZE=scale_hidden_size,
+        SCALE_HIDDEN_SIZE_PAD=triton.next_power_of_2(scale_hidden_size),
+    )
+    return
+
+
+@triton.jit
+def _fwd_kernel_ep_gather(
+    total_token_num,
+    input_tensor,
+    input_tensor_stride0,
+    input_tensor_stride1,
+    recv_topk_ids,
+    recv_topk_ids_stride0,
+    recv_topk_ids_stride1,
+    recv_topk_weight,
+    recv_topk_weight_stride0,
+    recv_topk_weight_stride1,
+    input_index,
+    input_index_stride0,
+    input_index_stride1,
+    output_tensor,
+    output_tensor_stride0,
+    output_tensor_stride1,
+    topk_num: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    cur_block_int32 = tl.program_id(0)
+    cur_block = cur_block_int32.to(tl.int64)
+
+    start_cur_token_int32 = tl.program_id(1)
+
+    grid_num = tl.num_programs(1)
+
+    for cur_token_int32 in range(start_cur_token_int32, total_token_num, grid_num):
+        cur_token = cur_token_int32.to(tl.int64)
+
+        off_d = tl.arange(0, BLOCK_D)
+        accumulator = tl.zeros([BLOCK_D], dtype=tl.float32)
+
+        for topk_index_int32 in range(0, topk_num):
+            topk_index = topk_index_int32.to(tl.int64)
+
+            expert_id = tl.load(
+                recv_topk_ids + cur_token * recv_topk_ids_stride0 + topk_index
+            )
+            if expert_id >= 0:
+                source_token_index_int32 = tl.load(
+                    input_index + cur_token * input_index_stride0 + topk_index
+                )
+                source_token_index = source_token_index_int32.to(tl.int64)
+
+                acc_weight = tl.load(
+                    recv_topk_weight + cur_token * recv_topk_weight_stride0 + topk_index
+                )
+                tmp = tl.load(
+                    input_tensor
+                    + source_token_index * input_tensor_stride0
+                    + cur_block * BLOCK_D
+                    + off_d
+                )
+                accumulator += tmp.to(tl.float32) * acc_weight
+
+        tl.store(
+            output_tensor
+            + cur_token * output_tensor_stride0
+            + cur_block * BLOCK_D
+            + off_d,
+            accumulator.to(output_tensor.dtype.element_ty),
+        )
+
+
+@torch.no_grad()
+def ep_gather(
+    input_tensor: torch.Tensor,
+    recv_topk_ids: torch.Tensor,
+    recv_topk_weight: torch.Tensor,
+    input_index: torch.Tensor,
+    output_tensor: torch.Tensor,
+):
+    BLOCK_D = 1024 if not is_in_ci() else 128  # block size of quantization
+    num_warps = 2
+    num_tokens = output_tensor.shape[0]
+    hidden_size = input_tensor.shape[1]
+    assert hidden_size % BLOCK_D == 0
+    grid = (triton.cdiv(hidden_size, BLOCK_D), min(num_tokens, 1024))
+    _fwd_kernel_ep_gather[grid](
+        num_tokens,
+        input_tensor,
+        input_tensor.stride(0),
+        input_tensor.stride(1),
+        recv_topk_ids,
+        recv_topk_ids.stride(0),
+        recv_topk_ids.stride(1),
+        recv_topk_weight,
+        recv_topk_weight.stride(0),
+        recv_topk_weight.stride(1),
+        input_index,
+        input_index.stride(0),
+        input_index.stride(1),
+        output_tensor,
+        output_tensor.stride(0),
+        output_tensor.stride(1),
+        topk_num=recv_topk_ids.shape[1],
+        num_warps=num_warps,
+        BLOCK_D=BLOCK_D,
+    )
+    return
+
+
+# copy from
+# https://github.com/deepseek-ai/DeepGEMM/blob/bd2a77552886b98c205af12f8d7d2d61247c4b27/deep_gemm/jit_kernels/utils.py#L58
+def get_tma_aligned_size(x: int, element_size: int) -> int:
+    """
+    Global memory address of TMA must be 16-byte aligned.
+    Since we use column-major layout for the LHS scaling tensor,
+        the M-axis of the LHS scaling tensor needs to be padded to a multiple of 16 bytes.
+
+    Arguments:
+        x: original M-axis shape of the LHS scaling tensor.
+        element_size: element size of the LHS scaling tensor.
+
+    Returns:
+        M-axis shape of the LHS scaling tensor after padding.
+    """
+    tma_alignment_bytes = 16
+    assert tma_alignment_bytes % element_size == 0
+    alignment = tma_alignment_bytes // element_size
+    return ceil_div(x, alignment) * alignment
+
+
+@triton.jit
+def _tma_align_input_scale_kernel(
+    input_scale_ptr,
+    output_ptr,
+    m,
+    k_div_block_size,
+    input_scale_stride_m,
+    input_scale_stride_k,
+    output_stride_m,
+    output_stride_k,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    pid_m = tl.program_id(axis=0)
+    grid_m = tl.num_programs(0)
+    k_offsets = tl.arange(0, BLOCK_SIZE_K)
+
+    for m_base in range(pid_m, m, grid_m):
+        input_offset = (
+            input_scale_ptr
+            + m_base * input_scale_stride_m
+            + k_offsets * input_scale_stride_k
+        )
+        input_data = tl.load(input_offset, mask=k_offsets < k_div_block_size)
+
+        output_offset = (
+            output_ptr + k_offsets * output_stride_k + m_base * output_stride_m
+        )
+        tl.store(output_offset, input_data, mask=k_offsets < k_div_block_size)
+
+
+# copy from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/quantization/triton_quant/fp8/fp8act_quant_kernel.py
+def tma_align_input_scale(input_scale: torch.Tensor):
+    assert input_scale.dim() == 2
+    m, k_div_block_size = input_scale.shape
+    padd_m = get_tma_aligned_size(m, input_scale.element_size())
+    output = torch.empty(
+        (k_div_block_size, padd_m), dtype=input_scale.dtype, device=input_scale.device
+    )
+
+    grid_m = min(m, 8192)
+    BLOCK_SIZE_K = triton.next_power_of_2(k_div_block_size)
+
+    _tma_align_input_scale_kernel[(grid_m,)](
+        input_scale_ptr=input_scale,
+        output_ptr=output,
+        m=m,
+        k_div_block_size=k_div_block_size,
+        input_scale_stride_m=input_scale.stride(0),
+        input_scale_stride_k=input_scale.stride(1),
+        output_stride_m=output.stride(1),  # Note: these are swapped
+        output_stride_k=output.stride(0),  # for column-major
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+    )
+    return output.t()[:m]
+
+
+@triton.jit
+def compute_masked_m_triton_kernel(seg_indptr, masked_m):
+    expert_id = tl.program_id(0)
+    start = tl.load(seg_indptr + expert_id)
+    end = tl.load(seg_indptr + expert_id + 1)
+    tl.store(masked_m + expert_id, (end - start))
+
+
+@triton.jit
+def deepgemm_compute_src2dst_triton_kernel(
+    topk_ids,
+    reorder_ids,
+    seg_indptr,
+    src2dst,
+    m_max,
+    num_toks,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    dst_id = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = dst_id < num_toks
+    src_id = tl.load(reorder_ids + dst_id, mask=mask)
+    expert_id = tl.load(topk_ids + src_id, mask=(src_id < num_toks))
+    expert_dst_start = tl.load(seg_indptr + expert_id)
+    expert_dst_offset = dst_id - expert_dst_start
+    dst_id = expert_id * m_max + expert_dst_offset
+    tl.store(src2dst + src_id, dst_id, mask=mask)
+
+
+@triton.jit
+def fill_gateup_input_triton_kernel(
+    input_ptr,
+    scale_ptr,
+    gateup_input_ptr,
+    gateup_input_scale_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    start_expert_id,
+    end_expert_id,
+    topk,
+    m_max,
+    hidden_size,
+    scale_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+
+    src_idx_int32 = tl.program_id(0)
+    src_idx = src_idx_int32.to(tl.int64)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+    src_ptr = input_ptr + src_idx * hidden_size
+    scale_src_ptr = scale_ptr + src_idx * scale_size
+
+    vec = tl.arange(0, BLOCK_SIZE)
+    for idx in range(topk):
+        expert_id = tl.load(topk_ids_ptr + idx)
+        if expert_id >= start_expert_id and expert_id <= end_expert_id:
+            dst_idx_int32 = tl.load(src2dst_ptr + idx)
+            dst_idx = dst_idx_int32.to(tl.int64)
+            dst_idx = dst_idx - start_expert_id * m_max
+            dst_ptr = gateup_input_ptr + dst_idx * hidden_size
+            for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+                offset = start_offset + vec
+                mask = offset < hidden_size
+                in_data = tl.load(src_ptr + offset, mask=mask)
+                tl.store(dst_ptr + offset, in_data, mask=mask)
+            scale_dst_ptr = gateup_input_scale_ptr + dst_idx * scale_size
+            for start_offset in tl.range(0, scale_size, BLOCK_SIZE):
+                offset = start_offset + vec
+                mask = offset < scale_size
+                in_scale = tl.load(scale_src_ptr + offset, mask=mask)
+                tl.store(scale_dst_ptr + offset, in_scale, mask=mask)
+
+
+def moe_ep_deepgemm_preprocess(
+    topk_ids: torch.Tensor,
+    num_experts: int,
+    hidden_states: torch.Tensor,
+    top_k: int,
+    start_expert_id,
+    end_expert_id,
+    block_shape,
+    output_dtype: torch.dtype = torch.float8_e4m3fn,
+):
+    reorder_topk_ids, reorder_ids = torch.sort(topk_ids.view(-1), stable=True)
+    seg_indptr = torch.zeros(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
+    src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int32)
+    masked_m = torch.zeros(num_experts, device=topk_ids.device, dtype=torch.int32)
+
+    compute_seg_indptr_triton_kernel[(num_experts,)](
+        reorder_topk_ids, seg_indptr, topk_ids.numel()
+    )
+
+    grid = lambda meta: (triton.cdiv(topk_ids.numel(), meta["BLOCK_SIZE"]),)
+    compute_masked_m_triton_kernel[(num_experts,)](seg_indptr, masked_m)
+
+    # For masked grouped GEMM, shape M should be multiple of the block M (current block M: {block_m}) https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/jit_kernels/m_grouped_gemm.py#L165
+    m_max = (hidden_states.size(0) + 255) // 256 * 256
+    expected_m = (topk_ids.numel() + num_experts - 1) // num_experts
+    gateup_input = torch.empty(
+        (int(end_expert_id - start_expert_id + 1), m_max, hidden_states.size(1)),
+        device=hidden_states.device,
+        dtype=output_dtype,
+    )
+
+    deepgemm_compute_src2dst_triton_kernel[grid](
+        topk_ids,
+        reorder_ids,
+        seg_indptr,
+        src2dst,
+        m_max,
+        topk_ids.numel(),
+        BLOCK_SIZE=256,
+    )
+
+    if block_shape is None:
+        block_shape = [128, 128]
+    assert len(block_shape) == 2
+    block_n, block_k = block_shape[0], block_shape[1]
+    hidden_states, scale = per_token_group_quant_fp8(hidden_states, block_k)
+
+    gateup_input_scale = torch.empty(
+        (gateup_input.size(0), gateup_input.size(1), scale.size(1)),
+        device=hidden_states.device,
+        dtype=scale.dtype,
+    )
+
+    fill_gateup_input_triton_kernel[(hidden_states.shape[0],)](
+        hidden_states,
+        scale,
+        gateup_input,
+        gateup_input_scale,
+        src2dst,
+        topk_ids,
+        start_expert_id,
+        end_expert_id,
+        top_k,
+        m_max,
+        hidden_states.size(1),
+        scale.size(1),
+        BLOCK_SIZE=1024,
+    )
+
+    return (
+        m_max,
+        masked_m[start_expert_id : (end_expert_id + 1)],
+        expected_m,
+        src2dst,
+        gateup_input,
+        gateup_input_scale,
+    )
+
+
+@triton.jit
+def compute_identity_kernel(
+    top_k,
+    hidden_states_ptr,
+    expert_scales_ptr,
+    num_tokens,
+    output_ptr,
+    hidden_dim,
+    scales_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    batch_id = pid // (hidden_dim // BLOCK_SIZE)
+    dim_offset = pid % (hidden_dim // BLOCK_SIZE) * BLOCK_SIZE
+
+    if batch_id >= num_tokens or dim_offset >= hidden_dim:
+        return
+
+    h = tl.load(
+        hidden_states_ptr
+        + batch_id * hidden_dim
+        + dim_offset
+        + tl.arange(0, BLOCK_SIZE),
+        mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim,
+    )
+
+    result = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    for i in range(top_k):
+        scale = tl.load(expert_scales_ptr + batch_id * scales_stride + i)
+        result += h * scale
+
+    tl.store(
+        output_ptr + batch_id * hidden_dim + dim_offset + tl.arange(0, BLOCK_SIZE),
+        result,
+        mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim,
+    )
+
+
+def zero_experts_compute_triton(
+    expert_indices, expert_scales, num_experts, zero_expert_type, hidden_states
+):
+    N = expert_indices.numel()
+    top_k = expert_indices.size(-1)
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_SIZE"]),)
+
+    if zero_expert_type == "identity":
+        zero_expert_mask = expert_indices < num_experts
+        zero_expert_scales = expert_scales.clone()
+        zero_expert_scales[zero_expert_mask] = 0.0
+
+    normal_expert_mask = expert_indices >= num_experts
+    expert_indices[normal_expert_mask] = -1
+    expert_scales[normal_expert_mask] = 0.0
+
+    output = torch.zeros_like(hidden_states).to(hidden_states.device)
+    hidden_dim = hidden_states.size(-1)
+    num_tokens = hidden_states.size(0)
+
+    grid = lambda meta: (num_tokens * (hidden_dim // meta["BLOCK_SIZE"]),)
+    compute_identity_kernel[grid](
+        top_k,
+        hidden_states,
+        zero_expert_scales,
+        num_tokens,
+        output,
+        hidden_dim,
+        zero_expert_scales.stride(0),
+        BLOCK_SIZE=256,
+    )
+
+    return output
diff --git a/python/sglang/srt/layers/moe/ep_moe/layer.py b/python/sglang/srt/layers/moe/ep_moe/layer.py
new file mode 100644
index 000000000..ef33665c3
--- /dev/null
+++ b/python/sglang/srt/layers/moe/ep_moe/layer.py
@@ -0,0 +1,869 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.moe import (
+    get_deepep_mode,
+    get_moe_a2a_backend,
+    get_moe_runner_backend,
+    should_use_flashinfer_trtllm_moe,
+)
+from sglang.srt.layers.moe.ep_moe.kernels import (
+    ep_gather,
+    ep_scatter,
+    moe_ep_deepgemm_preprocess,
+    post_reorder_triton_kernel,
+    silu_and_mul_masked_post_quant_fwd,
+    tma_align_input_scale,
+)
+from sglang.srt.layers.moe.fused_moe_triton.layer import FlashInferFusedMoE, FusedMoE
+from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8 import Fp8Config
+from sglang.srt.layers.quantization.fp8_kernel import (
+    is_fp8_fnuz,
+    sglang_per_token_group_quant_fp8,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import ceil_div, dispose_tensor, get_bool_env_var, is_hip, is_npu
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        DeepEPLLOutput,
+        DeepEPNormalOutput,
+        DispatchOutput,
+    )
+
+_is_hip = is_hip()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if not (_is_npu or _is_hip):
+    from sgl_kernel import silu_and_mul
+
+if _use_aiter:
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+
+logger = logging.getLogger(__name__)
+
+
+# TODO(kaixih@nvidia): ideally we should merge this logic into
+# `fill_gateup_input_triton_kernel` to directly generate e8m0 scale.
+@torch.compile
+def _cast_to_e8m0_with_rounding_up(x: torch.Tensor) -> torch.Tensor:
+    temp = x.to(torch.float32).view(torch.int32)
+    exp = torch.bitwise_right_shift(temp, 23)
+    mant = torch.bitwise_and(temp, 0x7FFFFF)
+    is_ru = torch.logical_and(
+        torch.logical_and((mant > 0), (exp != 0xFE)),
+        ~torch.logical_and((exp == 0), (mant <= 0x400000)),
+    )
+    exp = torch.where(is_ru, exp + 1, exp)
+    new_x = exp.to(torch.uint8).view(torch.int)
+    return new_x.transpose(1, 2).contiguous().transpose(1, 2)
+
+
+class EPMoE(FusedMoE):
+    """
+    MoE Expert Parallel Impl
+
+
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        num_fused_shared_experts: int = 0,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        activation: str = "silu",
+        routed_scaling_factor: Optional[float] = None,
+        gemm1_alpha: Optional[float] = None,
+        gemm1_clamp_limit: Optional[float] = None,
+        with_bias: bool = False,
+    ):
+        super().__init__(
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            num_fused_shared_experts=num_fused_shared_experts,
+            layer_id=layer_id,
+            top_k=top_k,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            activation=activation,
+            # apply_router_weight_on_input=apply_router_weight_on_input,
+            routed_scaling_factor=routed_scaling_factor,
+            gemm1_alpha=gemm1_alpha,
+            gemm1_clamp_limit=gemm1_clamp_limit,
+            with_bias=with_bias,
+        )
+
+        self.intermediate_size = intermediate_size
+
+        if isinstance(quant_config, Fp8Config):
+            self.use_block_quant = getattr(self.quant_method, "block_quant", False)
+            self.block_shape = (
+                self.quant_method.quant_config.weight_block_size
+                if self.use_block_quant
+                else None
+            )
+            self.use_fp8_w8a8 = True
+            self.fp8_dtype = torch.float8_e4m3fn
+            self.activation_scheme = quant_config.activation_scheme
+        else:
+            self.use_fp8_w8a8 = False
+            self.use_block_quant = False
+            self.block_shape = None
+            self.activation_scheme = None
+
+    def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8:
+            return self.forward_deepgemm(hidden_states, topk_output)
+        else:
+            return super().forward(hidden_states, topk_output)
+
+    def forward_deepgemm(
+        self,
+        hidden_states: torch.Tensor,
+        topk_output: TopKOutput,
+    ):
+
+        self.w13_weight_fp8 = (
+            self.w13_weight,
+            (
+                self.w13_weight_scale_inv
+                if self.use_block_quant
+                else self.w13_weight_scale
+            ),
+        )
+        self.w2_weight_fp8 = (
+            self.w2_weight,
+            self.w2_weight_scale_inv if self.use_block_quant else self.w2_weight_scale,
+        )
+
+        assert self.quant_method is not None
+        assert self.moe_runner_config.activation == "silu"
+
+        hidden_states_shape = hidden_states.shape
+        hidden_states_dtype = hidden_states.dtype
+        hidden_states_device = hidden_states.device
+
+        topk_weights, topk_ids, _ = topk_output
+
+        if not self.use_block_quant:
+            # Convert per-tensor quant to per-block quant by repeating scales for forward_deepgemm
+            scale_block_size = 128
+            w13_weight_scale_n = 2 * (
+                (self.intermediate_size + scale_block_size - 1) // scale_block_size
+            )
+            w13_weight_scale_k = (
+                hidden_states_shape[-1] + scale_block_size - 1
+            ) // scale_block_size
+            w13_weight_scale = (
+                self.w13_weight_scale.unsqueeze(1)
+                .repeat_interleave(w13_weight_scale_n, dim=1)
+                .unsqueeze(2)
+                .repeat_interleave(w13_weight_scale_k, dim=2)
+            )
+            self.w13_weight_fp8 = (
+                self.w13_weight,
+                w13_weight_scale,
+            )
+            w2_weight_scale_n = (
+                hidden_states_shape[-1] + scale_block_size - 1
+            ) // scale_block_size
+            w2_weight_scale_k = (
+                self.intermediate_size + scale_block_size - 1
+            ) // scale_block_size
+            w2_weight_scale = (
+                self.w2_weight_scale.unsqueeze(1)
+                .repeat_interleave(w2_weight_scale_n, dim=1)
+                .unsqueeze(2)
+                .repeat_interleave(w2_weight_scale_k, dim=2)
+            )
+            self.w2_weight_fp8 = (
+                self.w2_weight,
+                w2_weight_scale,
+            )
+
+        # PreReorder
+        m_max, masked_m, expected_m, src2dst, gateup_input, gateup_input_scale = (
+            moe_ep_deepgemm_preprocess(
+                topk_ids,
+                self.num_experts,
+                hidden_states,
+                self.top_k,
+                self.start_expert_id,
+                self.end_expert_id,
+                self.block_shape,
+            )
+        )
+
+        dispose_tensor(hidden_states)
+
+        if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            b, s_mn, s_k = gateup_input_scale.shape
+            assert (
+                s_mn % 4 == 0 and s_k % 4 == 0
+            ), f"scales must be aligned to 4, but got ({b}, {s_mn}, {s_k})"
+
+        # GroupGemm-0
+        gateup_input_fp8 = (
+            gateup_input,
+            (
+                _cast_to_e8m0_with_rounding_up(gateup_input_scale)
+                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
+                    gateup_input_scale
+                )
+            ),
+        )
+        num_groups, m, k = gateup_input_fp8[0].size()
+        n = self.w13_weight.size(1)
+        gateup_output = torch.empty(
+            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
+        )
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            gateup_input_fp8,
+            self.w13_weight_fp8,
+            gateup_output,
+            masked_m,
+            expected_m,
+        )
+        del gateup_input
+        del gateup_input_fp8
+
+        # Act
+        down_input = torch.empty(
+            (
+                gateup_output.shape[0],
+                gateup_output.shape[1],
+                gateup_output.shape[2] // 2,
+            ),
+            device=hidden_states_device,
+            dtype=self.fp8_dtype,
+        )
+        scale_block_size = 128
+        down_input_scale = torch.empty(
+            (
+                gateup_output.shape[0],
+                gateup_output.shape[1],
+                gateup_output.shape[2] // 2 // scale_block_size,
+            ),
+            device=hidden_states_device,
+            dtype=torch.float32,
+        )
+        silu_and_mul_masked_post_quant_fwd(
+            gateup_output,
+            down_input,
+            down_input_scale,
+            scale_block_size,
+            masked_m,
+            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+        )
+        del gateup_output
+
+        # GroupGemm-1
+        n = self.w2_weight.size(1)
+        down_input_fp8 = (
+            down_input,
+            (
+                down_input_scale
+                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
+            ),
+        )
+        down_output = torch.empty(
+            (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
+        )
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            down_input_fp8,
+            self.w2_weight_fp8,
+            down_output,
+            masked_m,
+            expected_m,
+        )
+        del down_input
+        del down_input_fp8
+
+        # PostReorder
+        output = torch.empty(
+            hidden_states_shape, dtype=hidden_states_dtype, device=hidden_states_device
+        )
+        post_reorder_triton_kernel[(hidden_states_shape[0],)](
+            down_output,
+            output,
+            src2dst,
+            topk_ids,
+            topk_weights,
+            self.start_expert_id,
+            self.end_expert_id,
+            self.top_k,
+            hidden_states_shape[1],
+            m_max * self.start_expert_id,
+            BLOCK_SIZE=512,
+        )
+        if self.moe_runner_config.routed_scaling_factor is not None:
+            output *= self.moe_runner_config.routed_scaling_factor
+        return output
+
+
+class DeepEPMoE(EPMoE):
+    """
+    MoE Expert Parallel Impl based on DeepEP (https://github.com/deepseek-ai/DeepEP/tree/main)
+    """
+
+    _has_printed = False
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        num_fused_shared_experts: int = 0,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        activation: str = "silu",
+        routed_scaling_factor: Optional[float] = None,
+    ):
+        super().__init__(
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            layer_id=layer_id,
+            num_fused_shared_experts=num_fused_shared_experts,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            activation=activation,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+        self.deepep_mode = get_deepep_mode()
+
+        # TODO: move to the beginning of the file
+        from sglang.srt.distributed.parallel_state import get_tp_group
+        from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
+
+        self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
+            group=get_tp_group().device_group,
+            router_topk=self.top_k,
+            permute_fusion=True,
+            num_experts=self.num_experts,
+            num_local_experts=self.num_local_experts,
+            hidden_size=hidden_size,
+            params_dtype=params_dtype,
+            deepep_mode=self.deepep_mode,
+            async_finish=True,  # TODO
+            return_recv_hook=True,
+        )
+
+        if self.deepep_mode.enable_low_latency() and not _is_npu:
+            # NPU supports low_latency deepep without deepgemm
+            assert (
+                deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+            ), f"DeepEP {self.deepep_mode} mode requires deep_gemm"
+        if _use_aiter:
+            # expert_mask is of size (self.num_local_experts + 1),
+            # the extra 1 is for invalid rank_id (in original deepep, the invalid rank_id is -1, but aiter does not allow -1, we use a mask to make those ids invalid)
+            # for instance, if we have 4 experts on this rank, we would have a expert_mask like:
+            #     self.expert_mask = [1, 1, 1, 1, 0]
+            # idx from 0-3 is valid and will be processed, while idx == 4 will be masked out
+            self.expert_mask = torch.zeros(
+                (self.num_local_experts + 1),
+                device=torch.cuda.current_device(),
+                dtype=torch.int,
+            )
+            # the last one is invalid rank_id
+            self.expert_mask[:-1] = 1
+        elif not _is_npu:
+            self.w13_weight_fp8 = (
+                self.w13_weight,
+                (
+                    self.w13_weight_scale_inv
+                    if self.use_block_quant
+                    else self.w13_weight_scale
+                ),
+            )
+            self.w2_weight_fp8 = (
+                self.w2_weight,
+                (
+                    self.w2_weight_scale_inv
+                    if self.use_block_quant
+                    else self.w2_weight_scale
+                ),
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        dispatch_output = self.dispatch(
+            hidden_states, topk_idx, topk_weights, forward_batch
+        )
+        hidden_states = self.moe_impl(dispatch_output)
+        hidden_states = self.combine(
+            hidden_states,
+            dispatch_output.topk_idx,
+            dispatch_output.topk_weights,
+            forward_batch,
+        )
+        return hidden_states
+
+    def dispatch(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        return self.deepep_dispatcher.dispatch(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            forward_batch=forward_batch,
+        )
+
+    def moe_impl(self, dispatch_output: DispatchOutput):
+        from sglang.srt.layers.moe.token_dispatcher import DispatchOutputChecker
+
+        if _use_aiter:
+            assert DispatchOutputChecker.format_is_deepep(dispatch_output)
+            # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel
+            return self.forward_aiter(dispatch_output)
+        if _is_npu:
+            assert DispatchOutputChecker.format_is_deepep(dispatch_output)
+            return self.forward_npu(dispatch_output)
+        if DispatchOutputChecker.format_is_deepep_normal(dispatch_output):
+            assert deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8
+            return self.forward_deepgemm_contiguous(dispatch_output)
+        elif DispatchOutputChecker.format_is_deepep_ll(dispatch_output):
+            assert deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8
+            return self.forward_deepgemm_masked(dispatch_output)
+        else:
+            raise ValueError(
+                f"Dispatch output format {dispatch_output.format} is not supported"
+            )
+
+    def combine(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        return self.deepep_dispatcher.combine(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            forward_batch=forward_batch,
+        )
+
+    def forward_aiter(
+        self,
+        dispatch_output: Union[DeepEPNormalOutput, DeepEPLLOutput],
+    ):
+        hidden_states, topk_idx, topk_weights = (
+            dispatch_output.hidden_states,
+            dispatch_output.topk_idx,
+            dispatch_output.topk_weights,
+        )
+        if hidden_states.shape[0] == 0:
+            return hidden_states
+        # in original deepep, idx == -1 meaning invalid and will not be processed.
+        # aiter does not accept -1, we use a expert mask to make these idx invalid
+        # (idx == num_local_experts) meaning not used in aiter fused_moe
+        topk_idx_copy = topk_idx.to(torch.int32)
+        topk_idx_copy[topk_idx_copy == -1] = self.num_local_experts
+
+        return fused_moe(
+            hidden_states,
+            self.w13_weight,
+            self.w2_weight,
+            topk_weights,
+            topk_idx_copy,
+            w1_scale=self.w13_weight_scale_inv,
+            w2_scale=self.w2_weight_scale_inv,
+            quant_type=QuantType.per_128x128,
+            activation=(
+                ActivationType.Silu
+                if self.moe_runner_config.activation == "silu"
+                else ActivationType.Gelu
+            ),
+            expert_mask=self.expert_mask,
+        )
+
+    def forward_deepgemm_contiguous(
+        self,
+        dispatch_output: DeepEPNormalOutput,
+    ):
+        hidden_states_fp8, topk_idx, topk_weights, num_recv_tokens_per_expert = (
+            dispatch_output
+        )
+        hidden_states_fp8, hidden_states_scale = hidden_states_fp8
+        assert self.quant_method is not None
+        assert self.moe_runner_config.activation == "silu"
+        if num_recv_tokens_per_expert is None:
+            return hidden_states_fp8.bfloat16()
+        all_tokens = sum(num_recv_tokens_per_expert)
+        if all_tokens <= 0:
+            return hidden_states_fp8.bfloat16()
+        M, K = hidden_states_fp8.size()
+        N = self.w13_weight.size(1)
+        scale_block_size = 128
+
+        hidden_states_fp8_shape = hidden_states_fp8.shape
+        hidden_states_fp8_device = hidden_states_fp8.device
+        hidden_states_fp8_dtype = hidden_states_fp8.dtype
+
+        input_tensor = [
+            torch.empty(
+                (all_tokens, K),
+                device=hidden_states_fp8.device,
+                dtype=hidden_states_fp8.dtype,
+            ),
+            (
+                # TODO check whether need `zeros`
+                torch.zeros(
+                    (ceil_div(K // 128, 4), all_tokens),
+                    device=hidden_states_fp8.device,
+                    dtype=torch.int,
+                ).transpose(0, 1)
+                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+                else torch.empty(
+                    (all_tokens, K // 128),
+                    device=hidden_states_fp8.device,
+                    dtype=torch.float32,
+                )
+            ),
+        ]
+        m_indices = torch.empty(
+            all_tokens, device=hidden_states_fp8.device, dtype=torch.int32
+        )
+        output_index = torch.empty_like(topk_idx)
+
+        num_recv_tokens_per_expert_gpu = torch.tensor(
+            num_recv_tokens_per_expert,
+            dtype=torch.int32,
+            pin_memory=True,
+            device="cpu",
+        ).cuda(non_blocking=True)
+        expert_start_loc = torch.empty_like(num_recv_tokens_per_expert_gpu)
+
+        ep_scatter(
+            hidden_states_fp8,
+            hidden_states_scale,
+            topk_idx,
+            num_recv_tokens_per_expert_gpu,
+            expert_start_loc,
+            input_tensor[0],
+            input_tensor[1],
+            m_indices,
+            output_index,
+            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+        )
+        dispose_tensor(hidden_states_fp8)
+
+        gateup_output = torch.empty(
+            (all_tokens, N),
+            device=hidden_states_fp8_device,
+            dtype=torch.bfloat16,
+        )
+        if not deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            input_tensor[1] = tma_align_input_scale(input_tensor[1])
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_contig(
+            input_tensor, self.w13_weight_fp8, gateup_output, m_indices
+        )
+        del input_tensor
+        down_input = torch.empty(
+            (
+                all_tokens,
+                N // 2,
+            ),
+            device=gateup_output.device,
+            dtype=torch.bfloat16,
+        )
+        silu_and_mul(gateup_output.view(-1, N), down_input)
+        del gateup_output
+        down_output = torch.empty(
+            (all_tokens, K),
+            device=hidden_states_fp8_device,
+            dtype=torch.bfloat16,
+        )
+        down_input_fp8, down_input_scale = sglang_per_token_group_quant_fp8(
+            down_input,
+            scale_block_size,
+            column_major_scales=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            scale_tma_aligned=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+        )
+        del down_input
+        if not deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            down_input_scale = tma_align_input_scale(down_input_scale)
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_contig(
+            (down_input_fp8, down_input_scale),
+            self.w2_weight_fp8,
+            down_output,
+            m_indices,
+        )
+        del down_input_fp8, down_input_scale
+
+        gather_out = torch.empty(
+            hidden_states_fp8_shape,
+            device=hidden_states_fp8_device,
+            dtype=torch.bfloat16,
+        )
+        ep_gather(down_output, topk_idx, topk_weights, output_index, gather_out)
+
+        return gather_out
+
+    def forward_deepgemm_masked(
+        self,
+        dispatch_output: DeepEPLLOutput,
+    ):
+        hidden_states_fp8, _, _, masked_m, expected_m = dispatch_output
+        assert self.quant_method is not None
+        assert self.moe_runner_config.activation == "silu"
+
+        # GroupGemm-0
+        num_groups, m, k = hidden_states_fp8[0].size()
+        n = self.w13_weight.size(1)
+        expected_m = min(expected_m, m)
+        gateup_output = torch.empty(
+            (num_groups, m, n), device=hidden_states_fp8[0].device, dtype=torch.bfloat16
+        )
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            hidden_states_fp8,
+            self.w13_weight_fp8,
+            gateup_output,
+            masked_m,
+            expected_m,
+        )
+        dispose_tensor(hidden_states_fp8[0])
+
+        # Act
+        down_input = torch.empty(
+            (
+                gateup_output.shape[0],
+                gateup_output.shape[1],
+                gateup_output.shape[2] // 2,
+            ),
+            device=gateup_output.device,
+            dtype=self.fp8_dtype,
+        )
+        scale_block_size = 128
+        down_input_scale = torch.empty(
+            (
+                gateup_output.shape[0],
+                gateup_output.shape[1],
+                gateup_output.shape[2] // 2 // scale_block_size,
+            ),
+            device=gateup_output.device,
+            dtype=torch.float32,
+        )
+        silu_and_mul_masked_post_quant_fwd(
+            gateup_output,
+            down_input,
+            down_input_scale,
+            scale_block_size,
+            masked_m,
+            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+        )
+        del gateup_output
+
+        # GroupGemm-1
+        n = self.w2_weight.size(1)
+        down_input_fp8 = (
+            down_input,
+            (
+                down_input_scale
+                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
+            ),
+        )
+        down_output = torch.empty(
+            (num_groups, m, n), device=down_input.device, dtype=torch.bfloat16
+        )
+        deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+            down_input_fp8,
+            self.w2_weight_fp8,
+            down_output,
+            masked_m,
+            expected_m,
+        )
+
+        return down_output
+
+    def forward_npu(
+        self,
+        dispatch_output: Union[DeepEPNormalOutput, DeepEPLLOutput],
+    ):
+        assert self.quant_method is not None
+        assert self.moe_runner_config.activation == "silu"
+
+        import torch_npu
+
+        from sglang.srt.layers.moe.token_dispatcher import DispatchOutputChecker
+
+        # NOTE: Ascend's Dispatch & Combine does not support FP16
+        output_dtype = torch.bfloat16
+        group_list_type = 1
+
+        def _forward_normal(dispatch_output: DeepEPNormalOutput):
+            if TYPE_CHECKING:
+                assert isinstance(dispatch_output, DeepEPNormalOutput)
+            hidden_states, _, _, num_recv_tokens_per_expert = dispatch_output
+
+            if isinstance(hidden_states, tuple):
+                per_token_scale = hidden_states[1]
+                hidden_states = hidden_states[0]
+            else:
+                # dynamic quant
+                hidden_states, per_token_scale = torch_npu.npu_dynamic_quant(
+                    hidden_states
+                )
+
+            group_list = torch.tensor(num_recv_tokens_per_expert, dtype=torch.int64).to(
+                hidden_states.device
+            )
+
+            # gmm1: gate_up_proj
+            hidden_states = torch_npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[self.w13_weight],
+                scale=[self.w13_weight_scale.to(output_dtype)],
+                per_token_scale=[per_token_scale],
+                split_item=2,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=output_dtype,
+            )[0]
+
+            # act_fn: swiglu
+            hidden_states = torch_npu.npu_swiglu(hidden_states)
+            hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(hidden_states)
+
+            # gmm2: down_proj
+            hidden_states = torch_npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[self.w2_weight],
+                scale=[self.w2_weight_scale.to(output_dtype)],
+                per_token_scale=[swiglu_out_scale],
+                split_item=2,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=output_dtype,
+            )[0]
+
+            return hidden_states
+
+        def _forward_ll(dispatch_output: DeepEPLLOutput):
+            if TYPE_CHECKING:
+                assert isinstance(dispatch_output, DeepEPLLOutput)
+            hidden_states, topk_idx, topk_weights, group_list, _ = dispatch_output
+
+            per_token_scale = hidden_states[1]
+            hidden_states = hidden_states[0]
+
+            group_list = group_list.to(torch.int64)
+
+            # gmm1: gate_up_proj
+            hidden_states = torch_npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[self.w13_weight],
+                split_item=2,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=torch.int32,
+            )[0]
+
+            # act_fn: swiglu
+            hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
+                x=hidden_states,
+                weight_scale=self.w13_weight_scale.to(torch.float32),
+                activation_scale=per_token_scale,
+                bias=None,
+                quant_scale=None,
+                quant_offset=None,
+                group_index=group_list,
+                activate_left=True,
+                quant_mode=1,
+            )
+
+            # gmm2: down_proj
+            hidden_states = torch_npu.npu_grouped_matmul(
+                x=[hidden_states],
+                weight=[self.w2_weight],
+                scale=[self.w2_weight_scale.to(output_dtype)],
+                per_token_scale=[swiglu_out_scale],
+                split_item=2,
+                group_list_type=group_list_type,
+                group_type=0,
+                group_list=group_list,
+                output_dtype=output_dtype,
+            )[0]
+
+            return hidden_states
+
+        if DispatchOutputChecker.format_is_deepep_normal(dispatch_output):
+            return _forward_normal(dispatch_output)
+        elif DispatchOutputChecker.format_is_deepep_ll(dispatch_output):
+            return _forward_ll(dispatch_output)
+        else:
+            raise ValueError(f"Not Supported DeepEP format {dispatch_output.format}")
+
+
+def get_moe_impl_class(quant_config: Optional[QuantizationConfig] = None):
+    if get_moe_a2a_backend().is_deepep():
+        return DeepEPMoE
+
+    # NEW: Direct FP4 detection (bypasses EP requirements)
+    # Check for FP4 quantization with TRTLLM flag, regardless of EP
+    if get_moe_runner_backend().is_flashinfer_trtllm():
+        # FlashInferFP4MoE must be paired with ModelOptNvFp4FusedMoEMethod.
+        # If UnquantizedFusedMoEMethod is detected, fall back to FusedMoE instead.
+        if quant_config is None:
+            return FusedMoE
+        try:
+            # Check the quantization argument directly
+            quantization = global_server_args_dict.get("quantization")
+            if quantization == "modelopt_fp4":
+                from sglang.srt.layers.moe.fused_moe_triton.layer import (
+                    FlashInferFP4MoE,
+                )
+
+                return FlashInferFP4MoE
+        except:
+            pass
+
+    if should_use_flashinfer_trtllm_moe():
+        return FlashInferFusedMoE
+    if get_moe_runner_backend().is_flashinfer_cutlass():
+        return FusedMoE
+    if get_moe_expert_parallel_world_size() > 1:
+        return EPMoE
+    return FusedMoE
diff --git a/python/sglang/srt/layers/moe/fused_moe_native.py b/python/sglang/srt/layers/moe/fused_moe_native.py
new file mode 100644
index 000000000..a3d3a09bf
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_native.py
@@ -0,0 +1,101 @@
+"""
+Torch-native implementation for FusedMoE. This is used for torch.compile.
+It is based on https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/mixtral-moe/model.py#L204
+"""
+
+import torch
+from torch.nn import functional as F
+
+from sglang.srt.layers.activation import GeluAndMul, SiluAndMul
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.token_dispatcher import StandardDispatchOutput
+from sglang.srt.layers.moe.topk import StandardTopKOutput
+
+
+def fused_moe_forward_native(
+    layer: torch.nn.Module,
+    dispatch_output: StandardDispatchOutput,
+) -> torch.Tensor:
+
+    x, topk_output = dispatch_output
+    moe_runner_config = layer.moe_runner_config
+
+    if moe_runner_config.apply_router_weight_on_input:
+        raise NotImplementedError()
+
+    topk_weights, topk_ids, _ = topk_output
+
+    w13_weights = layer.w13_weight[topk_ids]
+    w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+    w2_weights = layer.w2_weight[topk_ids]
+    x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
+    if moe_runner_config.activation == "silu":
+        x1 = F.silu(x1)
+    elif moe_runner_config.activation == "gelu":
+        x1 = F.gelu(x1)
+    else:
+        raise ValueError(f"Unsupported activation: {moe_runner_config.activation=}")
+    x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+    expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
+
+
+def moe_forward_native(
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    topk_output: StandardTopKOutput,
+    moe_runner_config: MoeRunnerConfig,
+) -> torch.Tensor:
+
+    if moe_runner_config.apply_router_weight_on_input:
+        raise NotImplementedError()
+
+    topk_weights, topk_ids, _ = topk_output
+
+    # Ref code from https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/e0828e3cc0a03408724b80c3cc92c8e072db8d01/modeling_deepseek.py#L589
+    len_experts = layer.num_experts
+
+    cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts))
+    cnts.scatter_(1, topk_ids.to(torch.int64), 1)
+    tokens_per_expert = cnts.sum(dim=0)
+    idxs = topk_ids.view(-1).argsort()
+
+    sorted_tokens = x[idxs // topk_ids.shape[1]]
+    tokens_per_expert = tokens_per_expert.cpu().numpy()
+
+    if moe_runner_config.activation == "silu":
+        act = SiluAndMul()
+    elif moe_runner_config.activation == "gelu":
+        act = GeluAndMul()
+    else:
+        raise ValueError(f"Unsupported activation: {moe_runner_config.activation=}")
+
+    outputs = []
+    start_idx = 0
+    for i, num_tokens in enumerate(tokens_per_expert):
+        end_idx = start_idx + num_tokens
+        if num_tokens == 0:
+            continue
+        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
+
+        layer_w13_weight = layer.w13_weight[i]
+        layer_w2_weight = layer.w2_weight[i]
+
+        gate_up = F.linear(tokens_for_this_expert, layer_w13_weight)
+        gate_up = act(gate_up)
+        expert_out = F.linear(gate_up, layer_w2_weight)
+        outputs.append(expert_out)
+        start_idx = end_idx
+
+    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+    new_x = torch.empty_like(outs)
+
+    new_x[idxs] = outs
+    final_out = (
+        new_x.view(*topk_ids.shape, -1)
+        .type(topk_weights.dtype)
+        .mul_(topk_weights.unsqueeze(dim=-1))
+        .sum(dim=1)
+        .type(new_x.dtype)
+    )
+    return final_out
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/__init__.py b/python/sglang/srt/layers/moe/fused_moe_triton/__init__.py
new file mode 100644
index 000000000..be3ed3af4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/__init__.py
@@ -0,0 +1,42 @@
+from contextlib import contextmanager
+from typing import Any, Dict, Optional
+
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_config import (
+    get_config_file_name,
+    try_get_optimal_moe_config,
+)
+from sglang.srt.layers.moe.fused_moe_triton.layer import (
+    FusedMoE,
+    FusedMoeWeightScaleSupported,
+)
+from sglang.srt.layers.moe.fused_moe_triton.moe_align_block_size import (
+    moe_align_block_size,
+)
+
+_config: Optional[Dict[str, Any]] = None
+
+
+@contextmanager
+def override_config(config):
+    global _config
+    old_config = _config
+    _config = config
+    yield
+    _config = old_config
+
+
+def get_config() -> Optional[Dict[str, Any]]:
+    return _config
+
+
+__all__ = [
+    "FusedMoE",
+    "FusedMoeWeightScaleSupported",
+    "override_config",
+    "get_config",
+    "fused_experts",
+    "get_config_file_name",
+    "moe_align_block_size",
+    "try_get_optimal_moe_config",
+]
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/README.md b/python/sglang/srt/layers/moe/fused_moe_triton/configs/README.md
new file mode 100644
index 000000000..3679e698a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/README.md
@@ -0,0 +1,15 @@
+This directory contains tuned configurations for different settings of the fused_moe kernel.
+For different settings of
+- E (number of experts)
+- N (intermediate size)
+- device_name (torch.cuda.get_device_name())
+- dtype: The data type used by the fused MoE kernel for computation. Supported types include fp8_w8a8, int8_w8a8, int8_w8a16, int4_w4a16, etc. This determines the precision and quantization scheme for both weights and activations.
+- block_shape: The block quantization shape introduced starting from DeepSeek V3/R1 models. This parameter defines the granularity for block-wise quantization, typically specified as `[block_n, block_k]` where `block_n` and `block_k` represent the block dimensions. For example, DeepSeek V3 commonly uses `[128, 128]` block shapes for efficient block-wise FP8 quantization.
+
+the JSON file contains a mapping from M (batch size) to the chosen configuration.
+
+The example configurations provided are for the Mixtral model for TP2 on H100
+and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
+N = 7168 and for TP4 we have N = 3584.
+
+See `benchmark/kernels/fused_moe_triton/README.md` on how to generate these config files.
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..56c1a4e3a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "48": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..21924c7e7
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "256": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..598993c61
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..a7c626e9b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "24": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "256": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..14bf13935
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 000000000..29972c5f3
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "24": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "256": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..082fba99b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "24": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "32": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "256": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..dbed86769
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..b89336a26
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "24": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "32": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "48": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..39bc5b9e5
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..35d67f837
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "64": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "96": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "128": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..587fb2f2e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..089894816
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..9814a3819
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
new file mode 100644
index 000000000..9262a74a4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..d251f9b5a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..0ecf814a2
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..6443c5cd0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..1fe9aad96
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..9b48fe26d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "128": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..5ff4a1600
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..7e09be1ff
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..039a10ed1
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..3793fcafe
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..f1f626667
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 000000000..6c809a096
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..3a9e2ef0f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,130 @@
+{
+    "3328": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "768": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1792": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2560": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2816": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3584": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3840": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1280": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2304": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..1bfd02a41
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..0bb90b7e4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,218 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "5120": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "9216": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "13312": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "17408": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "25600": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "33792": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "41984": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "50176": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "58368": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..033ba506d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,130 @@
+{
+    "3840": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1792": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3584": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2816": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1280": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "768": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3328": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2560": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2304": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
new file mode 100644
index 000000000..14723c576
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 4
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..9e9d27b1c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 5
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 2
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
new file mode 100644
index 000000000..6ac5ba746
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
@@ -0,0 +1,146 @@
+{
+  "1": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "2": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "4": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "8": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "16": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "24": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 32,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "32": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "48": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 16,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "96": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 256,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 4,
+    "num_stages": 4
+  },
+  "128": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "256": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 4,
+    "num_stages": 3
+  },
+  "512": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "1024": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 5
+  },
+  "1536": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "2048": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 64,
+    "num_warps": 8,
+    "num_stages": 3
+  },
+  "3072": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 4
+  },
+  "4096": {
+    "BLOCK_SIZE_M": 256,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 3
+  }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..c87ca0de8
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,130 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1792": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3328": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2560": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "768": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2816": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2304": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1280": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3840": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3584": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..09d812937
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..d471f9821
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..fe8dab46e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1cd253d11
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 000000000..9acb73b02
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..ddd55abe0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 000000000..4ce62139b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..2e692a158
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json
new file mode 100644
index 000000000..4e7a2a1a0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..857d11e48
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..90facc4a8
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e25d0492d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e25d0492d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5f32e830a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..2840e9f47
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c6c98c28e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6fcf40875
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..4dcfefa5b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json
new file mode 100644
index 000000000..ac9349fef
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json
new file mode 100644
index 000000000..b99349fb8
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e9c5b9251
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..8cc6c643f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..283ffd8ff
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..e2f8164cc
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..ef6a0479c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..67bf2b720
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..da71451a7
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..14c6f8c1a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..2b974a78d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..054873f38
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..869f75127
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..40d85e43e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..4ebd7a781
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..90416662e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3483f05a2
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..b2799ed3a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..8a18afe7d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..a1ec0b3ce
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..96437b5ba
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..6e6050694
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..824009a59
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..bf2aa0d5f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 000000000..4d4b752fa
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 000000000..4d4b752fa
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 000000000..4d4b752fa
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..3f3ccdafa
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,138 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..62fd21136
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..da40db132
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 000000000..a218fc406
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 000000000..a218fc406
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 000000000..a218fc406
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
new file mode 100644
index 000000000..f4c0f8417
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..5c8185cfd
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..97c9f4445
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..9d0db1cdc
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..1c906494a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..0bb423b28
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..555718733
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..26bcbf269
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..430f50090
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..c51565f67
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 000000000..3682cc548
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 000000000..3682cc548
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 000000000..3682cc548
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
new file mode 100644
index 000000000..b41f9d443
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..edf2a38d1
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..7532826a6
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..673bae2ba
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..b2100cebb
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..449a13384
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..d317994cb
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json
new file mode 100644
index 000000000..6499c8586
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json
@@ -0,0 +1,173 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 2,
+        "num_warps": 4,
+        "num_ctas": 1,
+        "num_stages": 7
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 128,
+        "num_warps": 2,
+        "num_ctas": 1,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_ctas": 1,
+        "num_stages": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_ctas": 1,
+        "num_stages": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 2,
+        "num_warps": 4,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 2,
+        "num_warps": 4,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 2,
+        "num_warps": 4,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "192": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 16,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 128,
+        "num_warps": 2,
+        "num_ctas": 1,
+        "num_stages": 8
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_ctas": 1,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 16,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 16,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "6144": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_ctas": 1,
+        "num_stages": 2
+    },
+    "8192": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 16,
+        "num_ctas": 1,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..d7f14d665
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,178 @@
+{
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8192": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16384": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32768": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "65536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "131072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..d7f14d665
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
@@ -0,0 +1,178 @@
+{
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8192": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16384": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32768": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "65536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "131072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..d7f14d665
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
@@ -0,0 +1,178 @@
+{
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8192": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16384": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32768": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "65536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "131072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..dbc624731
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..cc614e635
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..32c0c9da4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..72c3f560b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..dd07b3f6e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
new file mode 100644
index 000000000..21742854c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json
new file mode 100644
index 000000000..21742854c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json
new file mode 100644
index 000000000..21742854c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 32,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..f578c8d01
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..918f68396
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..e341a6791
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..a841518ca
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..13cc2cee1
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..d9d2f5eac
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,175 @@
+{
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8192": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16384": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32768": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "65536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "131072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..d9d2f5eac
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
@@ -0,0 +1,175 @@
+{
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8192": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16384": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32768": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "65536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "131072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..d9d2f5eac
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
@@ -0,0 +1,175 @@
+{
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 4,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8192": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "16384": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32768": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "65536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 1,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "131072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 2,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..34b916e57
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..b50cfc13d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..952b750aa
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..5de5605d4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json
new file mode 100644
index 000000000..2221e99cd
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..74374c573
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..663f32539
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b34b6e4e8
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json
new file mode 100644
index 000000000..ab169a018
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b11acb1ee
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..ab6e15552
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..249359fb9
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..4d3fcf75c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..9259aa780
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b4efc9b7e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json
new file mode 100644
index 000000000..03dfc73b6
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e7f9143f5
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json
new file mode 100644
index 000000000..beaac7f64
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json
new file mode 100644
index 000000000..ebff99e26
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 000000000..ff3fc2d01
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 000000000..550d314f0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 000000000..763a1657f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..63ba315b0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..42d00fe60
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..739979fad
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..0762b9084
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..29b5aab37
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 000000000..eb5b9b741
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..741060e32
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..f3f8f7198
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..52b459d2e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 000000000..da6cf2a04
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e078fd083
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5ed7ad20f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..b00455054
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 000000000..f8f336e26
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..e341a6791
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json
new file mode 100644
index 000000000..1fa444bca
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..f8fd97b5e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c4751c00f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6d6af0808
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..379708af4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json
new file mode 100644
index 000000000..1f3648ca8
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..adeaacb0e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 000000000..a3022a054
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..c3c6e0ac0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1eba1e4b8
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..8e9668889
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3bac45f89
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c9c8bb66e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..68f6fb5ac
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..8ddf8beae
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..532c16e89
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5ee3d4d35
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..358873315
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..02a119508
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..faf1aa4d4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b5c45dd72
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..6f586d1fa
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c848ea577
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..41d97b17b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..f8fd97b5e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..b962d1950
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
new file mode 100644
index 000000000..f8fd97b5e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..4e36c1544
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a6c635be4
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
new file mode 100644
index 000000000..dc8d6d68b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..486b94720
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..b8f35b62e
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 000000000..039d5ade7
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json
new file mode 100644
index 000000000..991b315f7
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 000000000..64861b390
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
new file mode 100644
index 000000000..3a64c1457
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -0,0 +1,701 @@
+# NOTE: this file will be separated into sglang/srt/layers/moe/moe_runner/triton_utils.py
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/fused_moe.py
+
+"""Fused MoE kernel."""
+
+from __future__ import annotations
+
+import functools
+import os
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import triton.language as tl
+
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    direct_register_custom_op,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+)
+
+from .fused_moe_triton_config import get_config_dtype_str, try_get_optimal_moe_config
+from .fused_moe_triton_kernels import invoke_fused_moe_kernel, moe_sum_reduce_triton
+from .moe_align_block_size import moe_align_block_size
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.topk import StandardTopKOutput
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _is_cuda:
+    from sgl_kernel import gelu_and_mul, silu_and_mul
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from sgl_kernel import gelu_and_mul, silu_and_mul
+
+    if _use_aiter:
+        try:
+            from aiter import moe_sum
+        except ImportError:
+            raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+    else:
+        from vllm import _custom_ops as vllm_ops
+
+padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
+def inplace_fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    activation: int = 0,#0 silu 1 gelu
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+    routed_scaling_factor: Optional[float] = None,
+    gemm1_alpha: Optional[float] = None,
+    gemm1_limit: Optional[float] = None,
+) -> None:
+    if isinstance(activation, int):
+        activation = "silu" if activation == 0 else "gelu"
+    fused_experts_impl(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        b1,
+        b2,
+        True,
+        activation,
+        apply_router_weight_on_input,
+        use_fp8_w8a8,
+        use_int8_w8a8,
+        use_int8_w8a16,
+        use_int4_w4a16,
+        per_channel_quant,
+        w1_scale,
+        w2_scale,
+        w1_zp,
+        w2_zp,
+        a1_scale,
+        a2_scale,
+        block_shape,
+        False,
+        routed_scaling_factor,
+        gemm1_alpha,
+        gemm1_limit,
+    )
+
+
+def inplace_fused_experts_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    activation: int = 0,#0 silu 1 gelu
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+    routed_scaling_factor: Optional[float] = None,
+    gemm1_alpha: Optional[float] = None,
+    gemm1_limit: Optional[float] = None,
+) -> None:
+    pass
+
+
+direct_register_custom_op(
+    op_name="inplace_fused_experts",
+    op_func=inplace_fused_experts,
+    mutates_args=["hidden_states"],
+    fake_impl=inplace_fused_experts_fake,
+)
+
+
+def outplace_fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    activation: int = 0,#0 silu 1 gelu
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+    no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
+    gemm1_alpha: Optional[float] = None,
+    gemm1_limit: Optional[float] = None,
+) -> torch.Tensor:
+    if isinstance(activation, int):
+        activation = "silu" if activation == 0 else "gelu"
+    return fused_experts_impl(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        b1,
+        b2,
+        False,
+        activation,
+        apply_router_weight_on_input,
+        use_fp8_w8a8,
+        use_int8_w8a8,
+        use_int8_w8a16,
+        use_int4_w4a16,
+        per_channel_quant,
+        w1_scale,
+        w2_scale,
+        w1_zp,
+        w2_zp,
+        a1_scale,
+        a2_scale,
+        block_shape,
+        no_combine=no_combine,
+        routed_scaling_factor=routed_scaling_factor,
+        gemm1_alpha=gemm1_alpha,
+        gemm1_limit=gemm1_limit,
+    )
+
+
+def outplace_fused_experts_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    activation: int = 0,#0 silu 1 gelu
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+    no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
+    gemm1_alpha: Optional[float] = None,
+    gemm1_limit: Optional[float] = None,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    op_name="outplace_fused_experts",
+    op_func=outplace_fused_experts,
+    mutates_args=[],
+    fake_impl=outplace_fused_experts_fake,
+)
+
+
+def fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_output: StandardTopKOutput,
+    moe_runner_config: MoeRunnerConfig,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+):
+    topk_weights, topk_ids, _ = topk_output
+    # 统一把 activation -> int（0=silu, 1=gelu）
+    act_id = (
+        0 if (
+            moe_runner_config.activation == 0
+            or (isinstance(moe_runner_config.activation, str)
+                and moe_runner_config.activation.lower() == "silu")
+        ) else 1
+    )
+    if moe_runner_config.inplace:
+        assert not moe_runner_config.no_combine, "no combine + inplace makes no sense"
+        torch.ops.sglang.inplace_fused_experts(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            b1,
+            b2,
+            # moe_runner_config.activation,
+            act_id, 
+            moe_runner_config.apply_router_weight_on_input,
+            use_fp8_w8a8,
+            use_int8_w8a8,
+            use_int8_w8a16,
+            use_int4_w4a16,
+            per_channel_quant,
+            w1_scale,
+            w2_scale,
+            w1_zp,
+            w2_zp,
+            a1_scale,
+            a2_scale,
+            block_shape,
+            moe_runner_config.routed_scaling_factor,
+            moe_runner_config.gemm1_alpha,
+            moe_runner_config.gemm1_clamp_limit,
+        )
+        return hidden_states
+    else:
+        return torch.ops.sglang.outplace_fused_experts(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            b1,
+            b2,
+            # moe_runner_config.activation,
+            act_id,
+            moe_runner_config.apply_router_weight_on_input,
+            use_fp8_w8a8,
+            use_int8_w8a8,
+            use_int8_w8a16,
+            use_int4_w4a16,
+            per_channel_quant,
+            w1_scale,
+            w2_scale,
+            w1_zp,
+            w2_zp,
+            a1_scale,
+            a2_scale,
+            block_shape,
+            no_combine=moe_runner_config.no_combine,
+            routed_scaling_factor=moe_runner_config.routed_scaling_factor,
+            gemm1_alpha=moe_runner_config.gemm1_alpha,
+            gemm1_limit=moe_runner_config.gemm1_clamp_limit,
+        )
+
+
+@torch.compile
+def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
+    torch.sum(x, dim=1, out=out)
+    out.mul_(routed_scaling_factor)
+
+
+@torch.compile
+def swiglu_with_alpha_and_limit(x, gemm1_alpha, gemm1_limit):
+    gate, up = x[..., ::2], x[..., 1::2]
+    gate = gate.clamp(min=None, max=gemm1_limit)
+    up = up.clamp(min=-gemm1_limit, max=gemm1_limit)
+    return gate * torch.sigmoid(gate * gemm1_alpha) * (up + 1)
+
+
+def fused_experts_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    inplace: bool = False,
+    activation: int = 0,#0 silu 1 gelu
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+    no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
+    gemm1_alpha: Optional[float] = None,
+    gemm1_limit: Optional[float] = None,
+):
+    padded_size = padding_size
+    if not (use_fp8_w8a8 or use_int8_w8a8) or block_shape is not None or _use_aiter:
+        padded_size = 0
+
+    # Check constraints.
+    if use_int4_w4a16:
+        assert hidden_states.shape[1] // 2 == w1.shape[2], "Hidden size mismatch"
+    else:
+        assert (
+            hidden_states.shape[1] == w1.shape[2] - padded_size
+        ), f"Hidden size mismatch"
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype in [torch.float32, torch.float16, torch.bfloat16]
+
+    num_tokens, _ = hidden_states.shape
+    E, N, _ = w1.shape
+    # We execute the fused_moe kernel in chunks to circumvent this issue:
+    # https://github.com/vllm-project/vllm/issues/5938
+    CHUNK_SIZE = 64 * 1024
+    M = min(num_tokens, CHUNK_SIZE)
+    config_dtype = get_config_dtype_str(
+        use_fp8_w8a8=use_fp8_w8a8,
+        use_int8_w8a8=use_int8_w8a8,
+        use_int8_w8a16=use_int8_w8a16,
+        use_int4_w4a16=use_int4_w4a16,
+        dtype=hidden_states.dtype,
+    )
+
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        w1.shape,
+        (w2.shape[0], w2.shape[1], w2.shape[2] - padded_size),
+        topk_ids.shape[1],
+        config_dtype,
+        block_shape=block_shape,
+    )
+
+    config = get_config_func(M)
+
+    cache = torch.empty(
+        M * topk_ids.shape[1] * max(N, w2.shape[1]),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache1 = cache[: M * topk_ids.shape[1] * N].view(
+        (M, topk_ids.shape[1], N),
+    )
+    intermediate_cache2 = torch.empty(
+        (M * topk_ids.shape[1], N // 2),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache3 = cache[: M * topk_ids.shape[1] * w2.shape[1]].view(
+        (M, topk_ids.shape[1], w2.shape[1]),
+    )
+
+    compute_type = tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16
+
+    if no_combine:
+        assert not inplace
+        out_hidden_states = torch.empty(
+            (num_tokens, topk_ids.shape[1], w2.shape[1]),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+    elif inplace:
+        out_hidden_states = hidden_states
+    else:
+        out_hidden_states = torch.empty_like(hidden_states)
+
+    for chunk in range((num_tokens // CHUNK_SIZE) + 1):
+        begin_chunk_idx, end_chunk_idx = (
+            chunk * CHUNK_SIZE,
+            min((chunk + 1) * CHUNK_SIZE, num_tokens),
+        )
+        curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx]
+        tokens_in_chunk, _ = curr_hidden_states.shape
+
+        if tokens_in_chunk == 0:
+            break
+
+        if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+            # Adjust the intermediate cache size and config for the last
+            # chunk. Note that in most cases we only have one chunk
+            # so the cache size and config are already set correctly and
+            # do not need to be adjusted.
+            intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
+            intermediate_cache2 = intermediate_cache2[
+                : tokens_in_chunk * topk_ids.shape[1]
+            ]
+            intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
+            config = get_config_func(tokens_in_chunk)
+
+        curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
+        curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx]
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            curr_topk_ids, config["BLOCK_SIZE_M"], E
+        )
+
+        invoke_fused_moe_kernel(
+            curr_hidden_states,
+            w1,
+            b1,
+            intermediate_cache1,
+            a1_scale,
+            w1_scale,
+            w1_zp,
+            curr_topk_weights,
+            curr_topk_ids,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            apply_router_weight_on_input,
+            topk_ids.shape[1],
+            config,
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
+            block_shape=block_shape,
+        )
+        if activation == "silu":
+            if gemm1_alpha is not None:
+                assert gemm1_limit is not None
+                intermediate_cache2 = swiglu_with_alpha_and_limit(
+                    intermediate_cache1.view(-1, N),
+                    gemm1_alpha,
+                    gemm1_limit,
+                )
+            elif _is_cuda or _is_hip:
+                silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+            else:
+                vllm_ops.silu_and_mul(
+                    intermediate_cache2, intermediate_cache1.view(-1, N)
+                )
+        elif activation == "gelu":
+            assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu"
+            assert gemm1_limit is None, "gemm1_limit is not supported for gelu"
+            if _is_cuda or _is_hip:
+                gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+            else:
+                vllm_ops.gelu_and_mul(
+                    intermediate_cache2, intermediate_cache1.view(-1, N)
+                )
+        else:
+            raise ValueError(f"Unsupported activation: {activation=}")
+
+        invoke_fused_moe_kernel(
+            intermediate_cache2,
+            w2,
+            b2,
+            (
+                intermediate_cache3
+                if not no_combine and topk_ids.shape[1] != 1
+                else out_hidden_states[begin_chunk_idx:end_chunk_idx].unsqueeze(0)
+            ),
+            a2_scale,
+            w2_scale,
+            w2_zp,
+            curr_topk_weights,
+            curr_topk_ids,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            not apply_router_weight_on_input,
+            1,
+            config,
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
+            block_shape=block_shape,
+        )
+
+        if routed_scaling_factor is None:
+            routed_scaling_factor = 1.0
+
+        if no_combine:
+            pass
+        elif _is_cuda:
+            if topk_ids.shape[1] == 1 and routed_scaling_factor == 1.0:
+                pass  # we write directly into out_hidden_states
+            elif topk_ids.shape[1] == 2 and routed_scaling_factor == 1.0:
+                torch.add(
+                    intermediate_cache3[:, 0],
+                    intermediate_cache3[:, 1],
+                    out=out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                ).squeeze(dim=1)
+            else:
+                # According to micro benchmark results, torch.compile can get better performance for small token.
+                if tokens_in_chunk <= 32:
+                    moe_sum_reduce_torch_compile(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
+                else:
+                    moe_sum_reduce_triton(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
+        elif _is_hip:
+            if _use_aiter:
+                moe_sum(
+                    intermediate_cache3.view(*intermediate_cache3.shape),
+                    out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                )
+            else:
+                # According to micro benchmark results, torch.compile can get better performance for small token.
+                if tokens_in_chunk <= 32:
+                    moe_sum_reduce_torch_compile(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
+                else:
+                    moe_sum_reduce_triton(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
+        else:
+            vllm_ops.moe_sum(
+                intermediate_cache3.view(*intermediate_cache3.shape),
+                out_hidden_states[begin_chunk_idx:end_chunk_idx],
+            )
+
+    return out_hidden_states
+
+
+def fused_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_output: StandardTopKOutput,
+    moe_runner_config: MoeRunnerConfig = MoeRunnerConfig(),
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    use_fp8_w8a8: bool = False,
+    use_int8_w8a8: bool = False,
+    use_int8_w8a16: bool = False,
+    use_int4_w4a16: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+) -> torch.Tensor:
+    """
+    This function computes a Mixture of Experts (MoE) layer using two sets of
+    weights, w1 and w2, and top-k gating mechanism.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w1 (torch.Tensor): The first set of expert weights.
+    - w2 (torch.Tensor): The second set of expert weights.
+    - topk_output (StandardTopKOutput): The top-k output of the experts.
+    - moe_runner_config (MoeRunnerConfig): The configuration for the MoE runner.
+    - b1 (Optional[torch.Tensor]): Optional bias for w1.
+    - b2 (Optional[torch.Tensor]): Optional bias for w2.
+    - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
+        products for w1 and w2. Defaults to False.
+    - use_int8_w8a8 (bool): If True, use int8 arithmetic to compute the inner
+        products for w1 and w2. Defaults to False.
+    - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner
+        products for w1 and w2. Defaults to False.
+    - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16
+        activation to compute the inner products for w1 and w2.
+        Defaults to False.
+    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
+        w1.
+    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
+        w2.
+    - a1_scale (Optional[torch.Tensor]): Optional scale to be used for
+        a1.
+    - a2_scale (Optional[torch.Tensor]): Optional scale to be used for
+        a2.
+    - block_shape: (Optional[List[int]]): Optional block size for block-wise
+        quantization.
+    - gemm1_alpha (Optional[float]): Optional gemm1_alpha for the activation
+        function.
+    - gemm1_limit (Optional[float]): Optional gemm1_limit for the swiglu activation
+        function.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+
+    return fused_experts(
+        hidden_states,
+        w1,
+        w2,
+        topk_output,
+        moe_runner_config=moe_runner_config,
+        b1=b1,
+        b2=b2,
+        use_fp8_w8a8=use_fp8_w8a8,
+        use_int8_w8a8=use_int8_w8a8,
+        use_int8_w8a16=use_int8_w8a16,
+        use_int4_w4a16=use_int4_w4a16,
+        per_channel_quant=per_channel_quant,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        w1_zp=w1_zp,
+        w2_zp=w2_zp,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+    )
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py
new file mode 100644
index 000000000..51114aade
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py
@@ -0,0 +1,212 @@
+from __future__ import annotations
+
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import triton
+
+from sglang.srt.utils import get_device_name, is_hip
+
+logger = logging.getLogger(__name__)
+_is_hip = is_hip()
+
+
+def get_config_file_name(
+    E: int, N: int, dtype: Optional[str], block_shape: Optional[int] = None
+) -> str:
+    device_name = get_device_name().replace(" ", "_")
+    dtype_selector = "" if not dtype else f",dtype={dtype}"
+    block_shape_selector = (
+        "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}"
+    )
+    return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"
+
+
+@functools.lru_cache
+def get_moe_configs(
+    E: int,
+    N: int,
+    dtype: Optional[str],
+    block_n: Optional[int] = 0,
+    block_k: Optional[int] = 0,
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the fused MoE kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the fused_moe kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+    # Supported Triton versions, should be sorted from the newest to the oldest
+    supported_triton_versions = ["3.3.1", "3.2.0", "3.1.0"]
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    json_file_name = get_config_file_name(E, N, dtype, [block_n, block_k])
+
+    # We found that using the fused_moe_kernel config from Triton 3.1.0 with Triton 3.2.0 results in negative performance gains,
+    # so we also include the Triton version as a key for finding the fused_moe_kernel config to achieve the best performance.
+    triton_version = triton.__version__
+    version_dir = f"triton_{triton_version.replace('.', '_')}"
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        "configs",
+        version_dir,
+        json_file_name,
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            # Please note that although we find the config files, performance might still be suboptimal.
+            # This is because the tuning environment might differ from your current environment.
+            # For example, updating the Triton version might cause all old configs to become suboptimal.
+            # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
+            # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
+            logger.info(f"Using MoE kernel config from {config_file_path}.")
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # Searching for other triton versions that supports the same config
+    for try_triton_version in supported_triton_versions:
+        if try_triton_version == triton_version:
+            continue
+        try_config_file_path = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            "configs",
+            f"triton_{try_triton_version.replace('.', '_')}",
+            json_file_name,
+        )
+        if os.path.exists(try_config_file_path):
+            with open(try_config_file_path) as f:
+                logger.warning(
+                    f"Config file not found at {config_file_path}. Fallback to triton version {try_triton_version} and use MoE kernel config from {try_config_file_path}. Performance might be sub-optimal!",
+                )
+                # If a configuration has been found, return it
+                return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default MoE kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton"
+        ),
+        config_file_path,
+    )
+    return None
+
+
+def get_default_config(
+    M: int,
+    E: int,
+    N: int,
+    K: int,
+    topk: int,
+    dtype: Optional[str],
+    is_marlin: bool,
+    block_shape: Optional[List[int]] = None,
+) -> Dict[str, int]:
+    if dtype == "fp8_w8a8":
+        if block_shape is None:
+            config = {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 32,
+                "num_warps": 8,
+                "num_stages": 2 if _is_hip else 4,
+            }
+            if M <= E:
+                config = {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 128,
+                    "GROUP_SIZE_M": 1,
+                    "num_warps": 4,
+                    "num_stages": 2 if _is_hip else 4,
+                }
+        else:
+            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
+            config = {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": block_shape[0],
+                "BLOCK_SIZE_K": block_shape[1],
+                "GROUP_SIZE_M": 32,
+                "num_warps": 4,
+                "num_stages": 2 if _is_hip else 3,
+            }
+    else:
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+        }
+        # A heuristic: fused marlin works faster with this config for small M
+        if M <= E or (is_marlin and M <= 32):
+            config = {
+                "BLOCK_SIZE_M": 16,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 1,
+            }
+    return config
+
+
+def try_get_optimal_moe_config(
+    w1_shape: Tuple[int, ...],
+    w2_shape: Tuple[int, ...],
+    top_k: int,
+    dtype: Optional[str],
+    M: int,
+    is_marlin: bool = False,
+    block_shape: Optional[List[int]] = None,
+):
+    from sglang.srt.layers.moe.fused_moe_triton import get_config
+
+    override_config = get_config()
+    if override_config:
+        config = override_config
+    else:
+        # First try to load optimal config from the file
+        E, _, N = w2_shape
+        block_n = block_shape[0] if block_shape else 0
+        block_k = block_shape[1] if block_shape else 0
+        configs = get_moe_configs(E, N, dtype, block_n, block_k)
+
+        if configs:
+            # If an optimal configuration map has been found, look up the
+            # optimal config
+            config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+        else:
+            # Else use the default config
+            config = get_default_config(
+                M, E, N, w1_shape[2], top_k, dtype, is_marlin, block_shape
+            )
+    return config
+
+
+def get_config_dtype_str(
+    dtype: torch.dtype,
+    use_int8_w8a16: Optional[bool] = False,
+    use_int4_w4a16: Optional[bool] = False,
+    use_fp8_w8a8: Optional[bool] = False,
+    use_int8_w8a8: Optional[bool] = False,
+):
+    if use_fp8_w8a8:
+        return "fp8_w8a8"
+    elif use_int8_w8a8:
+        return "int8_w8a8"
+    elif use_int4_w4a16:
+        return "int4_w4a16"
+    elif use_int8_w8a16:
+        return "int8_w8a16"
+    elif dtype == torch.float:
+        # avoiding cases where kernel fails when float32 MoE
+        # use fp16/bfloat16 configs
+        return "float32"
+    return None
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py
new file mode 100644
index 000000000..6a7229a9b
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py
@@ -0,0 +1,799 @@
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, List, Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_fp8,
+    scaled_fp8_quant,
+    sglang_per_token_group_quant_fp8,
+)
+from sglang.srt.layers.quantization.int8_kernel import (
+    per_token_group_quant_int8,
+    per_token_quant_int8,
+    sglang_per_token_group_quant_int8,
+)
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _is_cuda:
+    pass
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    pass
+
+padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
+@triton.jit
+def write_zeros_to_output(
+    c_ptr,
+    stride_cm,
+    stride_cn,
+    pid_n,
+    N,
+    offs_token,
+    token_mask,
+    BLOCK_SIZE_M,
+    BLOCK_SIZE_N,
+    compute_type,
+):
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+@triton.jit
+def fused_moe_kernel_gptq_awq(
+    # Pointers to matrices
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    b_scale_ptr,
+    b_zp_ptr,
+    topk_weights_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    num_tokens_post_padded_ptr,
+    # Matrix dimensions
+    N: tl.constexpr,
+    K: tl.constexpr,
+    EM,
+    num_valid_tokens,
+    # The stride variables represent how much to increase the ptr by when
+    # moving by 1 element in a particular dimension. E.g. `stride_am` is
+    # how much to increase `a_ptr` by to get the element one row down
+    # (A has M rows).
+    stride_am,
+    stride_ak,
+    stride_be,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_bse,
+    stride_bsk,
+    stride_bsn,
+    stride_bze,
+    stride_bzk,
+    stride_bzn,
+    group_size: tl.constexpr,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    MUL_ROUTED_WEIGHT: tl.constexpr,
+    top_k: tl.constexpr,
+    compute_type: tl.constexpr,
+    has_zp: tl.constexpr,
+    use_int4_w4a16: tl.constexpr,
+    use_int8_w8a16: tl.constexpr,
+    even_Ks: tl.constexpr,
+):
+    """
+    Implements the fused computation for a Mixture of Experts (MOE) using
+    token and expert matrices.
+    Key Parameters:
+    - A: The input tensor representing tokens with shape (*, K), where '*' can
+        be any shape representing batches and K is the feature dimension of
+        each token.
+    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
+        the number of experts, K is the input feature dimension, and N is
+        the output feature dimension.
+    - C: The output cache tensor with shape (M, topk, N), where M is the
+        total number of tokens post padding, topk is the number of times
+        each token is repeated, and N is the output feature dimension.
+    - sorted_token_ids: A tensor containing the sorted indices of tokens,
+        repeated topk times and arranged by the expert index they are
+        assigned to.
+    - expert_ids: A tensor containing the indices of the expert for each
+        block. It determines which expert matrix from B should be used for
+        each block in A.
+    This kernel performs the multiplication of a token by its corresponding
+    expert matrix as determined by `expert_ids`. The sorting of
+    `sorted_token_ids` by expert index and padding ensures divisibility by
+    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
+    multiplication across different blocks processed by the same expert.
+    """
+    # -----------------------------------------------------------
+    # Map program ids `pid` to the block of C it should compute.
+    # This is done in a grouped ordering to promote L2 data reuse.
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # ----------------------------------------------------------
+    # Create pointers for the first blocks of A and B.
+    # We will advance this pointer as we move in the K direction
+    # and accumulate
+    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
+    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
+    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
+    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
+        return
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
+    token_mask = offs_token < num_valid_tokens
+
+    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
+    if off_experts == -1:
+        # -----------------------------------------------------------
+        # Write back zeros to the output when the expert is not
+        # in the current expert parallel rank.
+        write_zeros_to_output(
+            c_ptr,
+            stride_cm,
+            stride_cn,
+            pid_n,
+            N,
+            offs_token,
+            token_mask,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+            compute_type,
+        )
+        return
+
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (
+        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+    )
+
+    if use_int4_w4a16:
+        b_ptrs = (
+            b_ptr
+            + off_experts * stride_be
+            + (offs_k[:, None] // 2) * stride_bk
+            + offs_bn[None, :] * stride_bn
+        )
+        b_shifter = (offs_k[:, None] % 2) * 4
+    elif use_int8_w8a16:
+        b_ptrs = (
+            b_ptr
+            + off_experts * stride_be
+            + offs_k[:, None] * stride_bk
+            + offs_bn[None, :] * stride_bn
+        )
+
+    if not has_zp and use_int4_w4a16:
+        b_zp_num = 8
+    if not has_zp and use_int8_w8a16:
+        b_zp_num = 128
+    elif has_zp and use_int4_w4a16:
+        b_zp_shifter = (offs_bn[None, :] % 2) * 4
+
+    # -----------------------------------------------------------
+    # Iterate to compute a block of the C matrix.
+    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
+    # of fp32 values for higher accuracy.
+    # `accumulator` will be converted back to fp16 after the loop.
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        # Load the next block of A and B, generate a mask by checking the
+        # K dimension.
+
+        if not even_Ks:
+            k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K
+            k_other = 0.0
+        else:
+            k_mask = None
+            k_other = None
+
+        a = tl.load(
+            a_ptrs,
+            mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+            other=0.0,
+        )
+        b = tl.load(b_ptrs)
+        if use_int4_w4a16:
+            b = (b >> b_shifter) & 0xF
+
+        b_scale_ptrs = (
+            b_scale_ptr
+            + off_experts * stride_bse
+            + offs_bn[None, :] * stride_bsn
+            + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * stride_bsk
+        )
+        b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other)
+        b_scale = b_scale.to(tl.float32)
+
+        if has_zp and use_int4_w4a16:
+            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
+            b_zp_ptrs = (
+                b_zp_ptr
+                + off_experts * stride_bze
+                + (offs_bn[None, :] // 2) * stride_bzn
+                + offs_k_true * stride_bzk
+            )
+            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
+            b_zp = (b_zp >> b_zp_shifter) & 0xF
+            b_zp = b_zp.to(tl.float32)
+        elif has_zp and use_int8_w8a16:
+            offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size
+            b_zp_ptrs = (
+                b_zp_ptr
+                + off_experts * stride_bze
+                + offs_bn[None, :] * stride_bzn
+                + offs_k_true * stride_bzk
+            )
+            b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other)
+            b_zp = b_zp.to(tl.float32)
+
+        # We accumulate along the K dimension.
+        if has_zp:
+            b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type)
+        else:
+            b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type)
+        accumulator = tl.dot(a, b, acc=accumulator)
+
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        if use_int4_w4a16:
+            b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
+        else:
+            b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if MUL_ROUTED_WEIGHT:
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
+        accumulator = accumulator * moe_weight[:, None]
+
+    accumulator = accumulator.to(compute_type)
+    # -----------------------------------------------------------
+    # Write back the block of the output
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+@triton.jit
+def fused_moe_kernel(
+    # Pointers to matrices
+    a_ptr,
+    b_ptr,
+    bias_ptr,
+    c_ptr,
+    a_scale_ptr,
+    b_scale_ptr,
+    topk_weights_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    num_tokens_post_padded_ptr,
+    # Matrix dimensions
+    N,
+    K,
+    EM,
+    num_valid_tokens,
+    # The stride variables represent how much to increase the ptr by when
+    # moving by 1 element in a particular dimension. E.g. `stride_am` is
+    # how much to increase `a_ptr` by to get the element one row down
+    # (A has M rows).
+    stride_am,
+    stride_ak,
+    stride_be,
+    stride_bk,
+    stride_bn,
+    stride_bias_e,
+    stride_bias_n,
+    stride_cm,
+    stride_cn,
+    stride_asm,
+    stride_ask,
+    stride_bse,
+    stride_bsk,
+    stride_bsn,
+    # Block size for block-wise quantization
+    group_n: tl.constexpr,
+    group_k: tl.constexpr,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    MUL_ROUTED_WEIGHT: tl.constexpr,
+    top_k: tl.constexpr,
+    compute_type: tl.constexpr,
+    use_fp8_w8a8: tl.constexpr,
+    use_int8_w8a8: tl.constexpr,
+    use_int8_w8a16: tl.constexpr,
+    per_channel_quant: tl.constexpr,
+    even_Ks: tl.constexpr,
+):
+    """
+    Implements the fused computation for a Mixture of Experts (MOE) using
+    token and expert matrices.
+
+    Key Parameters:
+    - A: The input tensor representing tokens with shape (*, K), where '*' can
+        be any shape representing batches and K is the feature dimension of
+        each token.
+    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
+        the number of experts, K is the input feature dimension, and N is
+        the output feature dimension.
+    - C: The output cache tensor with shape (M, topk, N), where M is the
+        total number of tokens post padding, topk is the number of times
+        each token is repeated, and N is the output feature dimension.
+    - sorted_token_ids: A tensor containing the sorted indices of tokens,
+        repeated topk times and arranged by the expert index they are
+        assigned to.
+    - expert_ids: A tensor containing the indices of the expert for each
+        block. It determines which expert matrix from B should be used for
+        each block in A.
+
+    This kernel performs the multiplication of a token by its corresponding
+    expert matrix as determined by `expert_ids`. The sorting of
+    `sorted_token_ids` by expert index and padding ensures divisibility by
+    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
+    multiplication across different blocks processed by the same expert.
+    """
+    # -----------------------------------------------------------
+    # Map program ids `pid` to the block of C it should compute.
+    # This is done in a grouped ordering to promote L2 data reuse.
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # ----------------------------------------------------------
+    # Create pointers for the first blocks of A and B.
+    # We will advance this pointer as we move in the K direction
+    # and accumulate
+    # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
+    # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
+    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
+    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
+        return
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
+    offs_token = offs_token.to(tl.int64)
+    token_mask = offs_token < num_valid_tokens
+
+    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
+
+    if off_experts == -1:
+        # -----------------------------------------------------------
+        # Write back zeros to the output when the expert is not
+        # in the current expert parallel rank.
+        write_zeros_to_output(
+            c_ptr,
+            stride_cm,
+            stride_cn,
+            pid_n,
+            N,
+            offs_token,
+            token_mask,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+            compute_type,
+        )
+        return
+
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (
+        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+    )
+
+    b_ptrs = (
+        b_ptr
+        + off_experts * stride_be
+        + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+    )
+    if bias_ptr is not None:
+        bias = tl.load(
+            bias_ptr + off_experts * stride_bias_e + offs_bn[None, :] * stride_bias_n
+        )
+    if use_int8_w8a16:
+        b_scale_ptrs = (
+            b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
+        )
+        b_scale = tl.load(b_scale_ptrs)
+
+    if use_fp8_w8a8 or use_int8_w8a8:
+        # block-wise
+        if group_k > 0 and group_n > 0:
+            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
+            offs_bsn = offs_bn // group_n
+            b_scale_ptrs = (
+                b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
+            )
+        # channel-wise
+        elif per_channel_quant:
+            b_scale_ptrs = (
+                b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
+            )
+            b_scale = tl.load(b_scale_ptrs)
+            # Load per-token scale for activations
+            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
+            a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, None]
+        # tensor-wise
+        else:
+            a_scale = tl.load(a_scale_ptr)
+            b_scale = tl.load(b_scale_ptr + off_experts)
+
+    # -----------------------------------------------------------
+    # Iterate to compute a block of the C matrix.
+    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
+    # of fp32 values for higher accuracy.
+    # `accumulator` will be converted back to fp16 after the loop.
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        # Load the next block of A and B, generate a mask by checking the
+        # K dimension.
+        if even_Ks:
+            a = tl.load(
+                a_ptrs,
+                mask=token_mask[:, None],
+                other=0.0,
+            )
+            b = tl.load(b_ptrs)
+        else:
+            a = tl.load(
+                a_ptrs,
+                mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+                other=0.0,
+            )
+            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+
+        # We accumulate along the K dimension.
+        if use_int8_w8a16:
+            accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
+        elif use_fp8_w8a8 or use_int8_w8a8:
+            if group_k > 0 and group_n > 0:
+                k_start = k * BLOCK_SIZE_K
+                offs_ks = k_start // group_k
+                a_scale = tl.load(
+                    a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0
+                )
+                b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
+
+                accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
+            else:
+                if use_fp8_w8a8:
+                    accumulator = tl.dot(a, b, acc=accumulator)
+                else:
+                    accumulator += tl.dot(a, b)
+        else:
+            accumulator += tl.dot(a, b)
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if use_int8_w8a16:
+        accumulator *= b_scale
+    elif use_fp8_w8a8 or use_int8_w8a8:
+        if group_k == 0 or group_n == 0:
+            accumulator *= a_scale * b_scale
+
+    if bias_ptr is not None:
+        accumulator += bias
+
+    if MUL_ROUTED_WEIGHT:
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
+        accumulator *= moe_weight[:, None]
+
+    accumulator = accumulator.to(compute_type)
+    # -----------------------------------------------------------
+    # Write back the block of the output
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+def invoke_fused_moe_kernel(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    C: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    B_scale: Optional[torch.Tensor],
+    B_zp: Optional[torch.Tensor],
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_padded: torch.Tensor,
+    mul_routed_weight: bool,
+    top_k: int,
+    config: Dict[str, Any],
+    compute_type: tl.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a8: bool,
+    use_int8_w8a16: bool,
+    use_int4_w4a16: bool,
+    per_channel_quant: bool,
+    block_shape: Optional[List[int]] = None,
+    no_combine: bool = False,
+) -> None:
+    assert topk_weights.stride(1) == 1
+    assert sorted_token_ids.stride(0) == 1
+
+    padded_size = 0
+    if use_fp8_w8a8:
+        assert B_scale is not None
+        if block_shape is None:
+            # activation tensor-wise fp8 quantization, dynamic or static
+            padded_size = padding_size
+            # activations apply per-token quantization when weights apply per-channel quantization by default
+            A, A_scale = scaled_fp8_quant(
+                A, A_scale, use_per_token_if_dynamic=per_channel_quant
+            )
+        else:
+            # activation block-wise fp8 quantization
+            assert len(block_shape) == 2
+            block_n, block_k = block_shape[0], block_shape[1]
+            if _is_cuda:
+                A, A_scale = sglang_per_token_group_quant_fp8(A, block_k)
+            else:
+                A, A_scale = per_token_group_quant_fp8(A, block_k)
+            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
+            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
+    elif use_int8_w8a8:
+        assert B_scale is not None
+        if block_shape is None:
+            # activation channel-wise int8 quantization
+            assert (
+                per_channel_quant
+            ), "int8 quantization only supports channel-wise quantization except for block-wise quantization"
+            A, A_scale = per_token_quant_int8(A)
+        else:
+            # activation block-wise int8 quantization
+            assert len(block_shape) == 2
+            block_n, block_k = block_shape[0], block_shape[1]
+            if _is_cuda:
+                A, A_scale = sglang_per_token_group_quant_int8(A, block_k)
+            else:
+                A, A_scale = per_token_group_quant_int8(A, block_k)
+            assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+            assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2]
+            assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1]
+    elif use_int8_w8a16 or use_int4_w4a16:
+        assert B_scale is not None
+        assert block_shape is None or block_shape[0] == 0
+    else:
+        assert A_scale is None
+        assert B_scale is None
+
+    grid = lambda META: (
+        triton.cdiv(sorted_token_ids.shape[0], META["BLOCK_SIZE_M"])
+        * triton.cdiv(B.shape[1], META["BLOCK_SIZE_N"]),
+    )
+
+    K = B.shape[2] - padded_size
+    if K % config["BLOCK_SIZE_K"] == 0:
+        even_Ks = True
+    else:
+        even_Ks = False
+
+    if (
+        (use_int8_w8a16 or use_int4_w4a16)
+        and block_shape is not None
+        and block_shape[1] > 0
+    ):
+        assert B_scale is not None and B_scale.ndim == 3
+        assert B_zp is None or B_zp.ndim == 3
+        assert bias is None
+        fused_moe_kernel_gptq_awq[grid](
+            A,
+            B,
+            C,
+            B_scale,
+            B_zp,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            B.shape[1],
+            A.shape[1],
+            sorted_token_ids.shape[0],
+            topk_ids.numel(),
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(2),
+            B.stride(1),
+            C.stride(1),
+            C.stride(2),
+            B_scale.stride(0),
+            B_scale.stride(2),
+            B_scale.stride(1),
+            B_zp.stride(0) if B_zp is not None else 0,
+            B_zp.stride(2) if B_zp is not None else 0,
+            B_zp.stride(1) if B_zp is not None else 0,
+            group_size=block_shape[1],
+            MUL_ROUTED_WEIGHT=mul_routed_weight,
+            top_k=top_k,
+            compute_type=compute_type,
+            has_zp=B_zp is not None,
+            use_int4_w4a16=use_int4_w4a16,
+            use_int8_w8a16=use_int8_w8a16,
+            even_Ks=even_Ks,
+            **config,
+        )
+
+    else:
+
+        fused_moe_kernel[grid](
+            A,
+            B,
+            bias,
+            C,
+            A_scale,
+            B_scale,
+            topk_weights,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            B.shape[1],
+            B.shape[2] - padded_size,
+            sorted_token_ids.shape[0],
+            topk_ids.numel(),
+            A.stride(0),
+            A.stride(1),
+            B.stride(0),
+            B.stride(2),
+            B.stride(1),
+            bias.stride(0) if bias is not None else 0,
+            bias.stride(1) if bias is not None else 0,
+            C.stride(1),
+            C.stride(2),
+            A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
+            A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
+            B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
+            B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0,
+            B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0,
+            0 if block_shape is None else block_shape[0],
+            0 if block_shape is None else block_shape[1],
+            MUL_ROUTED_WEIGHT=mul_routed_weight,
+            top_k=top_k,
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            per_channel_quant=per_channel_quant,
+            even_Ks=even_Ks,
+            **config,
+        )
+
+
+# _moe_sum_reduce_kernel kernel modified from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/fused_moe/moe_sum_reduce.py
+@triton.jit
+def _moe_sum_reduce_kernel(
+    input_ptr,
+    input_stride_0,
+    input_stride_1,
+    input_stride_2,
+    output_ptr,
+    output_stride_0,
+    output_stride_1,
+    token_num: int,
+    topk_num: int,
+    hidden_dim: int,
+    routed_scaling_factor: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DIM: tl.constexpr,
+    NUM_STAGE: tl.constexpr,
+):
+    input_stride_0 = tl.cast(input_stride_0, dtype=tl.int64)
+    input_stride_1 = tl.cast(input_stride_1, dtype=tl.int64)
+    output_stride_0 = tl.cast(output_stride_0, dtype=tl.int64)
+
+    token_block_id = tl.program_id(0)
+    dim_block_id = tl.program_id(1)
+
+    offs_token = token_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_dim = dim_block_id * BLOCK_DIM + tl.arange(0, BLOCK_DIM)
+
+    mask_token = offs_token < token_num
+    mask_dim = offs_dim < hidden_dim
+
+    base_ptrs = input_ptr + offs_token[:, None] * input_stride_0 + offs_dim[None, :]
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_DIM), dtype=tl.float32)
+
+    for i in tl.range(0, topk_num, num_stages=NUM_STAGE):
+        tile = tl.load(
+            base_ptrs + i * input_stride_1,
+            mask=mask_token[:, None] & mask_dim[None, :],
+            other=0.0,
+        )
+        accumulator += tile.to(tl.float32)
+    accumulator *= routed_scaling_factor
+
+    # -------- Write back --------
+    store_ptrs = output_ptr + offs_token[:, None] * output_stride_0 + offs_dim[None, :]
+    tl.store(
+        store_ptrs,
+        accumulator.to(input_ptr.dtype.element_ty),
+        mask=mask_token[:, None] & mask_dim[None, :],
+    )
+
+
+def moe_sum_reduce_triton(
+    input: torch.Tensor, output: torch.Tensor, routed_scaling_factor: float
+):
+    assert input.is_contiguous()
+    assert output.is_contiguous()
+
+    token_num, topk_num, hidden_dim = input.shape
+    assert output.shape[0] == token_num and output.shape[1] == hidden_dim
+
+    BLOCK_M = 1
+    BLOCK_DIM = 2048
+    NUM_STAGE = 1
+    num_warps = 16
+
+    grid = (
+        triton.cdiv(token_num, BLOCK_M),
+        triton.cdiv(hidden_dim, BLOCK_DIM),
+    )
+
+    _moe_sum_reduce_kernel[grid](
+        input,
+        *input.stride(),
+        output,
+        *output.stride(),
+        token_num=token_num,
+        topk_num=topk_num,
+        hidden_dim=hidden_dim,
+        routed_scaling_factor=routed_scaling_factor,
+        BLOCK_M=BLOCK_M,
+        BLOCK_DIM=BLOCK_DIM,
+        NUM_STAGE=NUM_STAGE,
+        num_warps=num_warps,
+    )
+    return
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
new file mode 100644
index 000000000..0ea1fa1eb
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -0,0 +1,1078 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py
+
+import logging
+from enum import Enum
+from typing import List, Optional, Tuple
+
+import torch
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_rank,
+    get_moe_expert_parallel_world_size,
+    get_moe_tensor_parallel_rank,
+    get_moe_tensor_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.layers.moe import (
+    MoeRunnerConfig,
+    get_moe_runner_backend,
+    should_use_flashinfer_trtllm_moe,
+)
+from sglang.srt.layers.moe.token_dispatcher.standard import (
+    CombineInput,
+    StandardDispatcher,
+)
+from sglang.srt.layers.moe.topk import TopKOutput, TopKOutputChecker
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8MoEMethod
+from sglang.srt.layers.quantization.modelopt_quant import ModelOptNvFp4FusedMoEMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_loader.weight_utils import narrow_padded_param_and_loaded_weight
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_flashinfer_available,
+    is_hip,
+    next_power_of_2,
+    round_up,
+)
+
+if is_flashinfer_available():
+    from flashinfer import (
+        RoutingMethodType,
+        fp4_quantize,
+        reorder_rows_for_gated_act_gemm,
+        shuffle_matrix_a,
+        shuffle_matrix_sf_a,
+    )
+
+_is_hip = is_hip()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+
+
+# Try to import FP4 TRTLLM function if flashinfer is available
+trtllm_fp4_block_scale_moe = None
+if should_use_flashinfer_trtllm_moe():
+    try:
+        from flashinfer.fused_moe import trtllm_fp4_block_scale_moe
+    except ImportError:
+        trtllm_fp4_block_scale_moe = None
+
+logger = logging.getLogger(__name__)
+
+
+def _is_fp4_quantization_enabled():
+    """Check if ModelOpt FP4 quantization is enabled."""
+    try:
+        # Use the same simple check that works for class selection
+        quantization = global_server_args_dict.get("quantization")
+        return quantization == "modelopt_fp4"
+    except:
+        return False
+
+
+def _get_tile_tokens_dim(num_tokens, top_k, num_experts):
+    # Guess tokens per expert assuming perfect expert distribution first.
+    num_tokens_per_expert = (num_tokens * top_k) // num_experts
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
+    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    return tile_tokens_dim
+
+
+class FusedMoeWeightScaleSupported(Enum):
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    GROUP = "group"
+    BLOCK = "block"
+
+
+class FusedMoE(torch.nn.Module):
+    """FusedMoE layer for MoE models.
+
+    This layer contains both MergedColumnParallel weights (gate_up_proj /
+    w13) and RowParallelLinear weights (down_proj/ w2).
+
+    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
+    copy that naming convention here and handle any remapping in the
+    load_weights function in each model implementation.
+
+    Args:
+        num_experts: Number of experts in the model
+        top_k: Number of experts selected for each token
+        hidden_size: Input hidden state size of the transformer
+        intermediate_size: Intermediate size of the experts
+        params_dtype: Data type for the parameters.
+        reduce_results: Whether to apply all_reduce on the output of the layer
+        quant_config: Quantization configuration.
+        inplace: suggestion to compute inplace (modify input activation).
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        top_k: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        use_presharded_weights: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+        gemm1_alpha: Optional[float] = None,
+        gemm1_clamp_limit: Optional[float] = None,
+        use_weight_loader_fused: bool = False,
+        with_bias=False,
+    ):
+        super().__init__()
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+
+        self.layer_id = layer_id
+        self.top_k = top_k
+        self.hidden_size = hidden_size
+        self.num_experts = num_experts
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.expert_map_cpu = None
+        self.expert_map_gpu = None
+
+        enable_flashinfer_cutlass_moe = get_moe_runner_backend().is_flashinfer_cutlass()
+
+        if enable_flashinfer_cutlass_moe and quant_config is None:
+            logger.warning("Disable flashinfer MoE when quantization config is None.")
+            enable_flashinfer_cutlass_moe = False
+
+        self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
+        self.moe_ep_size = get_moe_expert_parallel_world_size()
+        self.moe_ep_rank = get_moe_expert_parallel_rank()
+        self.moe_tp_size = get_moe_tensor_parallel_world_size()
+        self.moe_tp_rank = get_moe_tensor_parallel_rank()
+        assert num_experts % self.moe_ep_size == 0
+        self.num_local_experts = num_experts // self.moe_ep_size
+        self.start_expert_id = self.moe_ep_rank * self.num_local_experts
+        self.end_expert_id = self.start_expert_id + self.num_local_experts - 1
+        if self.moe_ep_size > 1:
+            # TODO(ch-wan): support shared experts fusion
+            # Create a tensor of size num_experts filled with -1
+            self.expert_map_cpu = torch.full(
+                (self.num_experts,), -1, dtype=torch.int32, device="cpu"
+            )
+            # Create a expert map for the local experts
+            self.expert_map_cpu[
+                self.moe_ep_rank
+                * self.num_local_experts : (self.moe_ep_rank + 1)
+                * self.num_local_experts
+            ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
+
+        assert intermediate_size % self.moe_tp_size == 0
+        self.intermediate_size_per_partition = intermediate_size // self.moe_tp_size
+        self.reduce_results = reduce_results
+        self.use_presharded_weights = use_presharded_weights
+
+        self.use_triton_kernels = get_moe_runner_backend().is_triton_kernel()
+
+        self.quant_config = quant_config
+        self.use_flashinfer_mxfp4_moe = get_moe_runner_backend().is_flashinfer_mxfp4()
+        # TODO maybe we should remove this `if`, since `Mxfp4MoEMethod` does another round-up logic
+        if (
+            self.quant_config is not None
+            and self.quant_config.get_name() == "mxfp4"
+            and self.use_flashinfer_mxfp4_moe
+        ):
+            hidden_size = round_up(hidden_size, 256)
+        self.hidden_size = hidden_size
+
+        self.moe_runner_config = MoeRunnerConfig(
+            num_experts=num_experts,
+            num_local_experts=self.num_local_experts,
+            hidden_size=hidden_size,
+            intermediate_size_per_partition=self.intermediate_size_per_partition,
+            layer_id=layer_id,
+            top_k=top_k,
+            num_fused_shared_experts=num_fused_shared_experts,
+            params_dtype=params_dtype,
+            activation=activation,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            inplace=inplace,
+            no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
+            gemm1_alpha=gemm1_alpha,
+            gemm1_clamp_limit=gemm1_clamp_limit,
+        )
+
+        if quant_config is None:
+            self.quant_method: FusedMoEMethodBase = UnquantizedFusedMoEMethod(
+                self.use_triton_kernels
+            )
+        else:
+            self.quant_method: FusedMoEMethodBase = quant_config.get_quant_method(
+                self, prefix
+            )
+        assert self.quant_method is not None
+
+        self.quant_method.create_weights(
+            layer=self,
+            num_experts=self.num_local_experts,
+            hidden_size=hidden_size,
+            intermediate_size_per_partition=self.intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            weight_loader=(
+                self.weight_loader
+                if not use_weight_loader_fused
+                else self.weight_loader_fused
+            ),
+            with_bias=with_bias,
+        )
+
+        self.quant_method.create_moe_runner(self, self.moe_runner_config)
+        self.dispatcher = StandardDispatcher()
+
+    def _load_per_tensor_weight_scale(
+        self,
+        shard_id: str,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        expert_id: int,
+    ):
+        param_data = param.data
+        # for per tensor weight quantization
+        if shard_id in ("w1", "w3"):
+            # We have to keep the weight scales of w1 and w3 because
+            # we need to re-quantize w1/w3 weights after weight loading.
+            idx = 0 if shard_id == "w1" else 1
+            param_data[expert_id][idx] = loaded_weight
+        # If we are in the row parallel case (down_proj)
+        elif shard_id == "w2":
+            param_data[expert_id] = loaded_weight
+
+    def _load_model_weight_or_group_weight_scale(
+        self,
+        shard_dim: int,
+        expert_data: torch.Tensor,
+        shard_id: str,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        is_bias: bool = False,
+    ):
+        # Load grouped weight scales for group quantization
+        # or model weights
+        if shard_id == "w2":
+            self._load_w2(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+                is_bias=is_bias,
+            )
+        elif shard_id in ("w1", "w3", "w13"):
+            self._load_w13(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+                is_bias=is_bias,
+            )
+
+    def _load_per_channel_weight_scale(
+        self,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        shard_id: str,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+    ):
+        # for per channel weight quantization
+        if shard_id == "w2":
+            expert_data.copy_(loaded_weight)
+        elif shard_id in ("w1", "w3"):
+            self._load_w13(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+            )
+
+    def _load_w13(
+        self,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        shard_id: str,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        is_bias: bool = False,
+    ):
+
+        # Index the loaded weight for tp sharding.
+        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
+        assert shard_id in {"w1", "w3", "w13"}
+
+        if is_bias:
+            # if this weight is a bias, the last dimension must be the sharded dimension
+            shard_dim = -1
+
+        if shard_id in {"w1", "w3"}:
+            # non-fused version
+            shard_size = expert_data.shape[shard_dim] // 2
+        elif shard_id in {"w13"}:
+            # fused version
+            shard_size = expert_data.shape[shard_dim]
+        else:
+            raise NotImplementedError
+
+        # Narrow parameter and load.
+        # w1, gate_proj: Load into first logical weight of w13.
+        # w3, up_proj: Load into second logical weight of w13.
+        # trtllm cutlass kernel assumes differently
+        switch_w13 = getattr(self.quant_method, "load_up_proj_weight_first", False)
+        if (switch_w13 and shard_id == "w1") or (not switch_w13 and shard_id == "w3"):
+            start = shard_size
+        else:
+            start = 0
+
+        if _is_cpu:
+            expert_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                expert_data,
+                loaded_weight,
+                start,
+                shard_size * tp_rank,
+                shard_dim,
+                shard_size,
+                not self.use_presharded_weights,
+            )
+        else:
+            if not self.use_presharded_weights:
+                if not is_bias and self.use_triton_kernels:
+                    # do not transpose for bias
+                    loaded_weight = loaded_weight.transpose(-2, -1)
+                loaded_weight = loaded_weight.narrow(
+                    shard_dim, shard_size * tp_rank, shard_size
+                )
+
+            expert_data = expert_data.narrow(shard_dim, start, shard_size)
+        expert_data.copy_(loaded_weight)
+
+    def _load_w2(
+        self,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        shard_id: str,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        is_bias: bool = False,
+    ):
+        """Load w2 weights for down projection.
+
+        Args:
+            expert_data: The expert data tensor to load into
+            shard_dim: The dimension to shard along
+            shard_id: The shard ID (must be "w2")
+            loaded_weight: The weight tensor to load from
+            tp_rank: The tensor parallel rank
+        """
+        if not isinstance(expert_data, torch.Tensor) or not isinstance(
+            loaded_weight, torch.Tensor
+        ):
+            raise ValueError("expert_data and loaded_weight must be torch.Tensor")
+
+        if (
+            self.quant_config is not None
+            and "modelopt" in self.quant_config.get_name()
+            and (expert_data.dim() != 2 or loaded_weight.dim() != 2)
+        ):
+            raise ValueError(
+                f"Expected 2D tensors, got expert_data shape {expert_data.shape} and loaded_weight shape {loaded_weight.shape}"
+            )
+
+        if shard_id != "w2":
+            raise ValueError(f"shard_id must be 'w2', got {shard_id}")
+
+        # Index the loaded weight for tp sharding.
+        # down_proj: "RowParallel" so tp sharding on input_dim
+        # Narrow parameter and load.
+        if is_bias:
+            # this expert_data is a bias, not weight,
+            # for w2_weight_bias in TP, it does not need to be sharded
+            shard_size = expert_data.shape[-1]
+        else:
+            # this parameter is a weight matrix
+            # for w2 in TP, it shards the input_features, i.e., shard_dim=2
+            shard_size = expert_data.shape[shard_dim]
+
+        if _is_cpu:
+            expert_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                expert_data,
+                loaded_weight,
+                0,  # param_data_start
+                shard_size * tp_rank,
+                shard_dim,
+                shard_size,
+                not self.use_presharded_weights,
+            )
+        else:
+            if not is_bias and not self.use_presharded_weights:
+                if self.use_triton_kernels:
+                    loaded_weight = loaded_weight.transpose(-2, -1)
+                loaded_weight = loaded_weight.narrow(
+                    shard_dim, shard_size * tp_rank, shard_size
+                )
+
+        # w2, down_proj: Load into only logical weight of w2.
+        expert_data.copy_(loaded_weight)
+
+    def _load_single_value(
+        self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
+    ):
+        param_data = param.data
+
+        # Input scales can be loaded directly and should be equal.
+        param_data[expert_id] = loaded_weight
+
+    def _load_g_idx(
+        self,
+        shard_id: str,
+        expert_data: torch.Tensor,
+        shard_dim: int,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+    ):
+
+        if shard_id == "w2":
+            self._load_w2(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+            )
+        else:
+            assert shard_id in ("w1", "w3")
+            expert_data.copy_(loaded_weight)
+
+    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
+        if self.expert_map_cpu is None:
+            return expert_id
+        return self.expert_map_cpu[expert_id].item()
+
+    def weight_loader(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: Optional[int],
+    ) -> None:
+
+        # if expert_id is None, then
+        # all the experts are loaded at the same time
+        if (
+            not expert_id
+            and self.quant_config is not None
+            and self.quant_config.get_name() == "mxfp4"
+            and self.quant_config.is_static_cfg()
+        ):
+            if "bias" in weight_name:
+                dim1 = loaded_weight.shape[1]
+                param.data[:, :dim1].copy_(loaded_weight)
+            else:
+                dim1 = loaded_weight.shape[1]
+                dim2 = loaded_weight.shape[2]
+                param.data[:, :dim1, :dim2].copy_(loaded_weight)
+            return
+
+        global_expert_location_metadata = get_global_expert_location_metadata()
+        if global_expert_location_metadata is None:
+            self._weight_loader_impl(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=expert_id,
+            )
+            return
+
+        if expert_id >= self.num_experts - self.num_fused_shared_experts:
+            # This is a shared expert.
+            physical_expert_ids = [expert_id]
+        else:
+            physical_expert_ids = (
+                global_expert_location_metadata.logical_to_all_physical(
+                    self.layer_id, expert_id
+                )
+            )
+
+        for physical_expert_id in physical_expert_ids:
+            self._weight_loader_physical(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=physical_expert_id,
+            )
+
+    def _weight_loader_physical(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+    ) -> None:
+
+        expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
+        if expert_id == -1:
+            return
+        self._weight_loader_impl(
+            param=param,
+            loaded_weight=loaded_weight,
+            weight_name=weight_name,
+            shard_id=shard_id,
+            expert_id=expert_id,
+        )
+
+    def _weight_loader_impl(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+    ) -> None:
+
+        tp_rank = self.moe_tp_rank
+
+        # compressed-tensors checkpoints with packed weights are stored flipped
+        # TODO (mgoin): check self.quant_method.quant_config.quant_format
+        # against known CompressionFormat enum values that have this quality
+        loaded_weight = (
+            loaded_weight.t().contiguous()
+            if (
+                self.quant_method.__class__.__name__
+                == "CompressedTensorsWNA16MoEMethod"
+            )
+            else loaded_weight
+        )
+
+        if shard_id not in ("w1", "w2", "w3"):
+            raise ValueError(
+                f"shard_id must be ['w1','w2','w3'] but " f"got {shard_id}."
+            )
+
+        # Flashinfer assumes w31 format for w13_weight. Same for the scales.
+        if should_use_flashinfer_trtllm_moe():
+            shard_id = {"w1": "w3", "w3": "w1", "w2": "w2"}[shard_id]
+
+        WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]
+        # Fetch the dim to shard the parameter/loaded weight
+        # based on the shard id. This will be whatever
+        # dimension intermediate_size is used.
+        SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}
+
+        expert_data = param.data[expert_id]
+
+        # is_transposed: if the dim to shard the weight
+        # should be flipped. Required by GPTQ, compressed-tensors
+        # should be whatever dimension intermediate_size is
+        is_transposed = getattr(param, "is_transposed", False)
+        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
+        if self.use_triton_kernels:
+            is_transposed = True
+        if is_transposed:
+            shard_dim = int(not shard_dim)
+
+        # Case input scale: input_scale loading is only supported for fp8
+        if "input_scale" in weight_name:
+            # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust input_scale for e4m3fnuz (AMD)
+            if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+                loaded_weight = loaded_weight * 2.0
+
+            # this is needed for compressed-tensors only
+            loaded_weight = loaded_weight.to(param.data.device)
+
+            if (
+                (
+                    "compressed" in self.quant_method.__class__.__name__.lower()
+                    or "w4afp8" in self.quant_config.get_name()
+                )
+                and (param.data[expert_id] != 1).any()
+                and ((param.data[expert_id] - loaded_weight).abs() > 1e-5).any()
+            ):
+                raise ValueError(
+                    "input_scales of w1 and w3 of a layer "
+                    f"must be equal. But got {param.data[expert_id]} "
+                    f"vs. {loaded_weight}"
+                )
+
+            self._load_single_value(
+                param=param, loaded_weight=loaded_weight, expert_id=expert_id
+            )
+            return
+
+        # Case g_idx
+        if "g_idx" in weight_name:
+            self._load_g_idx(
+                shard_dim=0,
+                shard_id=shard_id,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+            )
+            return
+
+        if "ModelOpt" in self.quant_method.__class__.__name__:
+            # Determine per-tensor weight scale patterns based on variant
+            is_fp4_variant = isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)
+
+            # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor
+            per_tensor_conditions = (
+                "weight_scale_2" in weight_name
+                if is_fp4_variant
+                else "weight_scale" in weight_name
+            ) or "input_scale" in weight_name
+
+            if per_tensor_conditions:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            return
+
+        # Case weight scales and zero_points
+        if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name:
+            # load the weight scales and zp based on the quantization scheme
+            # supported weight scales/zp can be found in
+            # FusedMoeWeightScaleSupported
+            # TODO @dsikka: once hardened, refactor to use vLLM Parameters
+            # specific to each case
+            quant_method = getattr(param, "quant_method", None)
+            if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
+                # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust INT4 column-wise scaling number to e4m3fnuz (AMD)
+                if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+                    loaded_weight = loaded_weight * 0.5
+
+                self._load_per_channel_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            elif quant_method in [
+                FusedMoeWeightScaleSupported.GROUP.value,
+                FusedMoeWeightScaleSupported.BLOCK.value,
+            ]:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
+                # INT4-FP8 (INT4 MoE Weight, FP8 Compute): Adjust FP8 per-tensor scaling number for e4m3fnuz (AMD)
+                if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
+                    loaded_weight = loaded_weight * 2.0
+
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            else:
+                raise ValueError(
+                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}"
+                )
+            return
+
+        # Case weight_shape
+        if "weight_shape" in weight_name:
+            # only required by compressed-tensors
+            self._load_single_value(
+                param=param, loaded_weight=loaded_weight, expert_id=expert_id
+            )
+            return
+
+        # Case model weights
+        if "weight" in weight_name:
+            self._load_model_weight_or_group_weight_scale(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+            )
+            return
+
+    def weight_loader_fused(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+    ) -> None:
+        tp_rank = self.moe_tp_rank
+
+        if (
+            self.quant_config is not None
+            and self.quant_config.get_name() == "mxfp4"
+            and self.quant_config.is_static_cfg()
+        ):
+            if "bias" in weight_name:
+                dim1 = loaded_weight.shape[1]
+                param.data[:, :dim1].copy_(loaded_weight)
+            elif "scale" in weight_name:
+                param.data.copy_(loaded_weight)
+            else:
+                dim1 = loaded_weight.shape[1]
+                dim2 = loaded_weight.shape[2]
+                param.data[:, :dim1, :dim2].copy_(loaded_weight)
+            return
+
+        # compressed-tensors checkpoints with packed weights are stored flipped
+        # TODO: check self.quant_method.quant_config.quant_format
+        # against known CompressionFormat enum values that have this quality
+        loaded_weight = (
+            loaded_weight.t().contiguous()
+            if (
+                self.quant_method.__class__.__name__
+                == "CompressedTensorsWNA16MoEMethod"
+            )
+            else loaded_weight
+        )
+
+        if shard_id not in ("w13", "w2"):
+            raise ValueError(f"shard_id must be ['w13','w2'] but " f"got {shard_id}.")
+
+        # Fetch the dim to shard the parameter/loaded weight
+        # based on the shard id. This will be whatever
+        # dimension intermediate_size is used.
+        SHARD_ID_TO_SHARDED_DIM = {"w13": 1, "w2": 2}
+        SHARD_ID_TO_SHARDED_DIM_TRANSPOSE = {"w13": 2, "w2": 1}
+
+        expert_data = param.data
+        is_bias = expert_data.dim() == 2
+
+        # is_transposed: if the dim to shard the weight
+        # should be flipped. Required by GPTQ, compressed-tensors
+        # should be whatever dimension intermediate_size is
+        is_transposed = getattr(param, "is_transposed", False)
+
+        if self.use_triton_kernels:
+            is_transposed = True
+        shard_dim = (
+            SHARD_ID_TO_SHARDED_DIM[shard_id]
+            if not is_transposed
+            else SHARD_ID_TO_SHARDED_DIM_TRANSPOSE[shard_id]
+        )
+
+        # Case model weights
+        if "weight" in weight_name:
+            self._load_model_weight_or_group_weight_scale(
+                shard_id=shard_id,
+                shard_dim=shard_dim,
+                loaded_weight=loaded_weight,
+                expert_data=expert_data,
+                tp_rank=tp_rank,
+                is_bias=is_bias,
+            )
+            return
+        else:
+            logging.warning(
+                f"Unsupported weight_name {weight_name} for FusedMoE weight_loader_fused. Nothing is loaded."
+            )
+
+    def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+        origin_hidden_states_dim = hidden_states.shape[-1]
+        assert self.quant_method is not None
+
+        if self.moe_ep_size > 1 and not self.enable_flashinfer_cutlass_moe:
+            if self.expert_map_cpu is not None and self.expert_map_gpu is None:
+                # If we are in EP mode, we need to move the expert map to GPU.
+                self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
+
+        if self.expert_map_gpu is not None:
+            if TopKOutputChecker.format_is_standard(topk_output):
+                topk_output = topk_output._replace(
+                    topk_ids=self.expert_map_gpu[topk_output.topk_ids]
+                )
+            elif TopKOutputChecker.format_is_triton_kernel(topk_output):
+                raise NotImplementedError()
+
+        dispatch_output = self.dispatcher.dispatch(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+
+        # TODO: consider using symmetric memory
+        combine_input = self.quant_method.apply(
+            layer=self,
+            dispatch_output=dispatch_output,
+        )
+
+        final_hidden_states = self.dispatcher.combine(combine_input)
+
+        final_hidden_states = final_hidden_states[
+            ..., :origin_hidden_states_dim
+        ].contiguous()
+
+        if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states
+
+    @classmethod
+    def make_expert_params_mapping(
+        cls,
+        ckpt_gate_proj_name: str,
+        ckpt_down_proj_name: str,
+        ckpt_up_proj_name: str,
+        num_experts: int,
+    ) -> List[Tuple[str, str, int, str]]:
+
+        return [
+            # (param_name, weight_name, expert_id, shard_id)
+            (
+                (
+                    "experts.w13_"
+                    if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
+                    else "experts.w2_"
+                ),
+                f"experts.{expert_id}.{weight_name}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id, weight_name in [
+                ("w1", ckpt_gate_proj_name),
+                ("w2", ckpt_down_proj_name),
+                ("w3", ckpt_up_proj_name),
+            ]
+        ]
+
+    @classmethod
+    def make_expert_params_mapping_fused(
+        cls,
+        ckpt_gate_up_proj_name: str,
+        ckpt_down_proj_name: str,
+        ckpt_gate_up_proj_bias_name: str,
+        ckpt_down_proj_bias_name: str,
+    ):
+        return [
+            ("experts.w13_weight", f"experts.{ckpt_gate_up_proj_name}", "w13"),
+            (
+                "experts.w13_weight_bias",
+                f"experts.{ckpt_gate_up_proj_bias_name}",
+                "w13",
+            ),
+            ("experts.w2_weight", f"experts.{ckpt_down_proj_name}", "w2"),
+            ("experts.w2_weight_bias", f"experts.{ckpt_down_proj_bias_name}", "w2"),
+        ]
+
+    @classmethod
+    def make_expert_params_mapping_fused_mxfp4(
+        cls,
+        ckpt_gate_up_proj_name: str,
+        ckpt_down_proj_name: str,
+        ckpt_gate_up_proj_bias_name: str,
+        ckpt_down_proj_bias_name: str,
+        ckpt_gate_up_proj_scale_name: str,
+        ckpt_down_proj_scale_name: str,
+    ):
+        return [
+            ("experts.w13_weight", f"experts.{ckpt_gate_up_proj_name}", "w13"),
+            (
+                "experts.w13_weight_bias",
+                f"experts.{ckpt_gate_up_proj_bias_name}",
+                "w13",
+            ),
+            ("experts.w2_weight", f"experts.{ckpt_down_proj_name}", "w2"),
+            ("experts.w2_weight_bias", f"experts.{ckpt_down_proj_bias_name}", "w2"),
+            (
+                "experts.w13_weight_scale",
+                f"experts.{ckpt_gate_up_proj_scale_name}",
+                "w13",
+            ),
+            ("experts.w2_weight_scale", f"experts.{ckpt_down_proj_scale_name}", "w2"),
+        ]
+
+    @classmethod
+    def make_expert_input_scale_params_mapping(
+        cls,
+        num_experts: int,
+    ) -> List[Tuple[str, str, int, str]]:
+        # (param_name, weight_name, expert_id, shard_id)
+        return [
+            (
+                "experts.w13_" if shard_id in ["w1", "w3"] else "experts.w2_",
+                f"experts.{expert_id}.{shard_id}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id in ["w1", "w2", "w3"]
+        ]
+
+    def should_fuse_routed_scaling_factor_in_topk(self):
+        return isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod) or (
+            isinstance(self.quant_method, Fp8MoEMethod)
+            and self.quant_method.use_cutlass_fused_experts_fp8
+        )
+
+
+class FlashInferFusedMoE(FusedMoE):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.use_flashinfer_trtllm_moe = should_use_flashinfer_trtllm_moe()
+
+    def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+        assert self.use_flashinfer_trtllm_moe
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only silu is supported for flashinfer blockscale fp8 moe"
+        assert self.quant_method is not None
+        assert (
+            topk_output.topk_config.renormalize
+        ), "Renormalize is required for flashinfer blockscale fp8 moe"
+        assert (
+            self.num_fused_shared_experts == 0
+        ), "Fused shared experts are not supported for flashinfer blockscale fp8 moe"
+
+        assert TopKOutputChecker.format_is_bypassed(topk_output)
+
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply_with_router_logits(
+            layer=self,
+            x=hidden_states,
+            topk_output=topk_output,
+        )
+
+        if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states
+
+
+class FlashInferFP4MoE(FusedMoE):
+    """FP4 TRTLLM MoE implementation using FlashInfer."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    # ---------------------------------------------------------------------
+    # Helper: quantize hidden states to FP4 each forward pass
+    # ---------------------------------------------------------------------
+    def _quantize_hidden_states_fp4(self, hidden_states: torch.Tensor):
+        """
+        Quantize hidden states using global scale factor from quantization method.
+
+        Global scale factor is set by ModelOptNvFp4FusedMoEMethod during weight loading.
+        Only block scales are computed at runtime for efficiency.
+
+        Returns (packed_fp4_uint8, scale_float8_e4m3fn_runtime, global_scale_float32)
+        """
+
+        # flashinfer.fp4_quantize returns (packed_uint8, scale_fp8)
+        # Only the block scales are computed at runtime
+        hs_fp4_bytes, hs_sf_bytes = fp4_quantize(
+            hidden_states,
+            self.w13_input_scale_quant,
+            16,  # sf_vec_size
+            False,  # use_ue8m0
+            False,  # is_sf_swizzled_layout
+        )
+
+        hs_fp4 = hs_fp4_bytes.reshape(
+            hidden_states.shape[0], hidden_states.shape[1] // 2
+        )
+        hs_sf = hs_sf_bytes.view(torch.float8_e4m3fn).reshape(-1)
+
+        return hs_fp4, hs_sf
+
+    def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+        """Forward pass using FP4 TRTLLM kernel.
+
+        Args:
+            hidden_states: Input tensor
+            topk_output: TopKOutput object with Bypassed format
+        """
+        assert isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)
+
+        assert TopKOutputChecker.format_is_bypassed(topk_output)
+
+        router_logits = topk_output.router_logits
+        topk_config = topk_output.topk_config
+
+        hs_fp4, hs_scale_linear = self._quantize_hidden_states_fp4(hidden_states)
+
+        router_logits = router_logits.to(torch.float32)
+
+        result = trtllm_fp4_block_scale_moe(
+            routing_logits=router_logits,
+            routing_bias=topk_config.correction_bias.to(hidden_states.dtype),
+            hidden_states=hs_fp4,
+            hidden_states_scale=hs_scale_linear.view(torch.float8_e4m3fn).flatten(),
+            gemm1_weights=self.gemm1_weights_fp4_shuffled.data,
+            gemm1_weights_scale=self.gemm1_scales_fp4_shuffled.data.view(
+                torch.float8_e4m3fn
+            ),
+            gemm1_bias=None,
+            gemm1_alpha=None,
+            gemm1_beta=None,
+            gemm1_clamp_limit=None,
+            gemm2_weights=self.gemm2_weights_fp4_shuffled.data,
+            gemm2_weights_scale=self.gemm2_scales_fp4_shuffled.data.view(
+                torch.float8_e4m3fn
+            ),
+            gemm2_bias=None,
+            output1_scale_scalar=self.g1_scale_c.data,
+            output1_scale_gate_scalar=self.g1_alphas.data,
+            output2_scale_scalar=self.g2_alphas.data,
+            num_experts=self.num_experts,
+            top_k=topk_config.top_k,
+            n_group=topk_config.num_expert_group,
+            topk_group=topk_config.topk_group,
+            intermediate_size=self.intermediate_size_per_partition,
+            local_expert_offset=self.moe_ep_rank * self.num_local_experts,
+            local_num_experts=self.num_local_experts,
+            routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
+            tile_tokens_dim=_get_tile_tokens_dim(
+                hidden_states.shape[0], topk_config.top_k, self.num_local_experts
+            ),
+            routing_method_type=RoutingMethodType.DeepSeekV3,
+            do_finalize=True,
+        )[0]
+
+        return result
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
new file mode 100644
index 000000000..64d0126d6
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from typing import Tuple
+
+import torch
+import triton
+
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+if _is_cuda or _is_hip:
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+
+def moe_align_block_size(
+    topk_ids: torch.Tensor, block_size: int, num_experts: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Aligns the token distribution across experts to be compatible with block
+    size for matrix multiplication.
+
+    Parameters:
+    - topk_ids: A tensor of shape [total_tokens, top_k] representing the
+        top-k expert indices for each token.
+    - block_size: The block size used in block matrix multiplication.
+    - num_experts: The total number of experts.
+
+    Returns:
+    - sorted_token_ids: A tensor containing the sorted token indices according
+        to their allocated expert.
+    - expert_ids: A tensor indicating the assigned expert index for each block.
+    - num_tokens_post_padded: The total number of tokens after padding,
+        ensuring divisibility by block_size.
+
+    This function pads the number of tokens that each expert needs to process
+    so that it is divisible by block_size.
+    Padding ensures that during block matrix multiplication, the dimensions
+    align correctly.
+
+    Example:
+    Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]],
+    block_size = 4, and num_experts = 4:
+    - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts,
+        with each expert needing to process 3 tokens.
+    - As block_size is 4, we pad 1 token for each expert.
+    - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3].
+    - Then append padding tokens [12, 12, 12, 12] for each block.
+    - After sorting by expert index, we obtain token_ids
+        [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12].
+        Tokens 12 are non-existent (padding) and are ignored in
+        the subsequent matrix multiplication.
+    - The padding ensures that the total number of tokens is now divisible
+        by block_size for proper block matrix operations.
+    """
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
+    max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+    expert_ids = torch.empty(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+    # In EP, expert_ids for filtered experts are -1. We have num_experts + 1 ids in total.
+    cumsum_buffer = torch.empty(
+        (num_experts + 2,), dtype=torch.int32, device=topk_ids.device
+    )
+
+    # Threshold based on benchmark results
+    fuse_sorted_ids_padding = sorted_ids.shape[0] <= 4096
+    if not fuse_sorted_ids_padding:
+        sorted_ids.fill_(topk_ids.numel())
+
+    sgl_moe_align_block_size(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        cumsum_buffer,
+        fuse_sorted_ids_padding,
+    )
+    return sorted_ids, expert_ids, num_tokens_post_pad
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py
new file mode 100644
index 000000000..5d39b8bbc
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py
@@ -0,0 +1,335 @@
+# Adapted from https://github.com/vllm-project/vllm/pull/18595/files#diff-f426a6de78c82ffec568eff6811bfbf0043dab5f87f1a8c0cffdbdcb8a81e035
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+import torch
+from sgl_kernel import gelu_and_mul, silu_and_mul
+from triton_kernels.matmul_ogs import (
+    FlexCtx,
+    FnSpecs,
+    FusedActivation,
+    PrecisionConfig,
+    matmul_ogs,
+)
+from triton_kernels.numerics import InFlexData
+from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx
+from triton_kernels.swiglu import swiglu_fn
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+    from sglang.srt.layers.moe.topk import TopKOutput
+
+
+def quantize(w, dtype, dev, **opt):
+    if dtype == "bf16":
+        return w.to(torch.bfloat16), InFlexData()
+    elif dtype == "fp8":
+        wq = w.to(torch.float8_e4m3fn).transpose(-1, -2).contiguous().transpose(-1, -2)
+        return (
+            wq,
+            InFlexData(dtype=wq.dtype, scale=w.abs().max().unsqueeze(0)),
+            MicroscalingCtx(),
+        )
+    else:
+        assert dtype == "mx4", f"{dtype=}"
+        swizzle_mx_scale = opt["swizzle_mx_scale"]
+        swizzle_axis = 2 if swizzle_mx_scale else None
+        w = w.to(torch.bfloat16)
+        w, mx_scales, weight_scale_shape = downcast_to_mxfp(
+            w, torch.uint8, axis=1, swizzle_axis=swizzle_axis
+        )
+        return (
+            w,
+            InFlexData(),
+            MicroscalingCtx(
+                weight_scale=mx_scales,
+                swizzle_mx=swizzle_mx_scale,
+                actual_weight_scale_shape=weight_scale_shape,
+            ),
+        )
+
+
+def triton_kernel_moe_forward(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_output: TopKOutput,
+    moe_runner_config: MoeRunnerConfig,
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+    from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+    assert TopKOutputChecker.format_is_triton_kernel(topk_output)
+
+    routing_data, gather_idx, scatter_idx = topk_output
+
+    return triton_kernel_fused_experts(
+        hidden_states,
+        w1,
+        w2,
+        routing_data,
+        gather_idx,
+        scatter_idx,
+        inplace=False,  # triton kernel doesn't support inplace
+        activation=moe_runner_config.activation,
+        apply_router_weight_on_input=apply_router_weight_on_input,
+        use_fp8_w8a8=use_fp8_w8a8,
+        per_channel_quant=per_channel_quant,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+    )
+
+
+# This is a triton implementation of the fused_experts function
+def triton_kernel_fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    routing_data: RoutingData,
+    gather_indx: GatherIndx,
+    scatter_indx: ScatterIndx,
+    inplace: bool = False,
+    activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+    assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
+    assert per_channel_quant == False, "per_channel_quant is not supported"
+    assert expert_map == None, "expert_map is not supported"
+    assert w1_scale == None, "w1_scale is not supported"
+    assert w2_scale == None, "w2_scale is not supported"
+    assert a1_scale == None, "a1_scale is not supported"
+    assert a2_scale == None, "a2_scale is not supported"
+    assert block_shape == None, "block_shape is not supported"
+
+    # type check
+    assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
+    assert w1.dtype == torch.bfloat16, "w1 must be bfloat16"
+    assert w2.dtype == torch.bfloat16, "w2 must be bfloat16"
+
+    # Shape check
+    assert hidden_states.ndim == 2, "hidden_states must be 2D"
+    assert (
+        hidden_states.shape[-1] == w1.shape[-2]
+    ), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
+    assert (
+        w2.shape[-1] == w1.shape[1]
+    ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
+
+    # feature check
+    assert inplace == False, "Inplace is not supported in new triton MoE kernel"
+
+    M, K = hidden_states.shape
+    E, _, N = w1.shape
+    n_expts_act = routing_data.n_expts_act
+    dtype = hidden_states.dtype
+
+    if global_num_experts == -1:
+        global_num_experts = E
+
+    # consistent with default implementation
+    intermediate_cache2 = torch.empty(
+        (M * n_expts_act, N // 2), device="cuda", dtype=dtype
+    )
+
+    intermediate_cache1 = matmul_ogs(
+        hidden_states,
+        w1,
+        None,
+        routing_data,
+        gather_indx=gather_indx,
+        gammas=routing_data.gate_scal if apply_router_weight_on_input else None,
+    )
+
+    if activation == "silu":
+        silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+    elif activation == "gelu":
+        gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+    else:
+        raise ValueError(f"Unsupported FusedMoe activation: {activation}")
+
+    intermediate_cache3 = matmul_ogs(
+        intermediate_cache2,
+        w2,
+        None,
+        routing_data,
+        scatter_indx=scatter_indx,
+        gammas=None if apply_router_weight_on_input else routing_data.gate_scal,
+    )
+
+    return intermediate_cache3
+
+
+def triton_kernel_moe_with_bias_forward(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w1_pcg,
+    b1: torch.Tensor,
+    w2: torch.Tensor,
+    w2_pcg,
+    b2: torch.Tensor,
+    topk_output: TopKOutput,
+    moe_runner_config: MoeRunnerConfig,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+    from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+    assert TopKOutputChecker.format_is_triton_kernel(topk_output)
+
+    routing_data, gather_idx, scatter_idx = topk_output
+
+    return triton_kernel_fused_experts_with_bias(
+        hidden_states,
+        w1=w1,
+        w1_pcg=w1_pcg,
+        b1=b1,
+        w2=w2,
+        w2_pcg=w2_pcg,
+        b2=b2,
+        routing_data=routing_data,
+        gather_indx=gather_idx,
+        scatter_indx=scatter_idx,
+        inplace=False,  # triton kernel doesn't support inplace
+        activation=moe_runner_config.activation,
+        use_fp8_w8a8=use_fp8_w8a8,
+        per_channel_quant=per_channel_quant,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+        gemm1_alpha=moe_runner_config.gemm1_alpha,
+        gemm1_clamp_limit=moe_runner_config.gemm1_clamp_limit,
+    )
+
+
+def triton_kernel_fused_experts_with_bias(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w1_pcg,
+    b1: torch.Tensor,
+    w2: torch.Tensor,
+    w2_pcg,
+    b2: torch.Tensor,
+    routing_data: RoutingData,
+    gather_indx: GatherIndx,
+    scatter_indx: ScatterIndx,
+    inplace: bool = False,
+    activation: str = "silu",
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+    gemm1_alpha: Optional[float] = None,
+    gemm1_clamp_limit: Optional[float] = None,
+) -> torch.Tensor:
+    assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
+    assert per_channel_quant == False, "per_channel_quant is not supported"
+    assert expert_map == None, "expert_map is not supported"
+    assert w1_scale == None, "w1_scale is not supported"
+    assert w2_scale == None, "w2_scale is not supported"
+    assert a1_scale == None, "a1_scale is not supported"
+    assert a2_scale == None, "a2_scale is not supported"
+    assert block_shape == None, "block_shape is not supported"
+
+    # type check
+    assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
+    for w in (w1, w2):
+        # TODO assert bf16 or mxfp4
+        # assert (w.dtype == torch.bfloat16) or check-is-mxfp4, f"w must be bfloat16 or mxfp4 {w1.dtype=}"
+        pass
+
+    # Shape check
+    assert hidden_states.ndim == 2, "hidden_states must be 2D"
+    assert (
+        hidden_states.shape[-1] == w1.shape[-2]
+    ), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
+    assert (
+        w2.shape[-1] == w1.shape[1]
+    ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
+
+    # feature check
+    assert inplace == False, "Inplace is not supported in new triton MoE kernel"
+
+    E, _, _ = w1.shape
+
+    if global_num_experts == -1:
+        global_num_experts = E
+
+    # TODO maybe completely remove this branch
+    if w1.dtype == torch.bfloat16:
+        device = "cuda"
+        optg = dict()
+        w1, w1_flex = quantize(w1, "bf16", device, **optg)
+        w1_pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex))
+
+        w2, w2_flex = quantize(w2, "bf16", device, **optg)
+        w2_pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex))
+
+    act = FusedActivation(
+        FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")),
+        (gemm1_alpha, gemm1_clamp_limit),
+        2,
+    )
+
+    intermediate_cache = matmul_ogs(
+        hidden_states,
+        w1,
+        b1,
+        routing_data,
+        gather_indx=gather_indx,
+        precision_config=w1_pcg,
+        gammas=None,
+        fused_activation=act,
+    )
+
+    return matmul_ogs(
+        intermediate_cache,
+        w2,
+        b2,
+        routing_data,
+        scatter_indx=scatter_indx,
+        precision_config=w2_pcg,
+        gammas=routing_data.gate_scal,
+    )
diff --git a/python/sglang/srt/layers/moe/moe_runner/__init__.py b/python/sglang/srt/layers/moe/moe_runner/__init__.py
new file mode 100644
index 000000000..3320a7875
--- /dev/null
+++ b/python/sglang/srt/layers/moe/moe_runner/__init__.py
@@ -0,0 +1,4 @@
+from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.runner import MoeRunner
+
+__all__ = ["MoeRunnerConfig", "MoeRunner"]
diff --git a/python/sglang/srt/layers/moe/moe_runner/base.py b/python/sglang/srt/layers/moe/moe_runner/base.py
new file mode 100644
index 000000000..4d95540e6
--- /dev/null
+++ b/python/sglang/srt/layers/moe/moe_runner/base.py
@@ -0,0 +1,286 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable, Optional, Tuple, TypeGuard
+
+import torch
+
+from sglang.srt.layers.moe.utils import MoeA2ABackend, MoeRunnerBackend
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner.triton import (
+        TritonRunnerCore,
+        TritonRunnerInput,
+        TritonRunnerOutput,
+    )
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        CombineInputFormat,
+        DispatchOutput,
+        DispatchOutputFormat,
+    )
+
+
+@dataclass
+class MoeRunnerConfig:
+
+    # MoE parameters
+    num_experts: Optional[int] = None
+    num_local_experts: Optional[int] = None
+    hidden_size: Optional[int] = None
+    intermediate_size_per_partition: Optional[int] = None
+    layer_id: Optional[int] = None
+    top_k: Optional[int] = None
+    num_fused_shared_experts: Optional[int] = None
+    params_dtype: Optional[torch.dtype] = None
+
+    # Runner configuration
+    activation: str = "silu"
+    apply_router_weight_on_input: bool = False
+    inplace: bool = True
+    no_combine: bool = False
+    routed_scaling_factor: Optional[float] = None
+    gemm1_alpha: Optional[float] = None
+    gemm1_clamp_limit: Optional[float] = None
+
+
+@dataclass
+class RunnerInput(ABC):
+
+    @property
+    @abstractmethod
+    def runner_backend(self) -> MoeRunnerBackend: ...
+
+    def runner_backend_is_triton(self) -> TypeGuard[TritonRunnerInput]:
+        return self.runner_backend == MoeRunnerBackend.TRITON
+
+
+class RunnerOutput(ABC):
+
+    @property
+    @abstractmethod
+    def runner_backend(self) -> MoeRunnerBackend: ...
+
+    def runner_backend_is_triton(self) -> TypeGuard[TritonRunnerOutput]:
+        return self.runner_backend == MoeRunnerBackend.TRITON
+
+
+@dataclass
+class MoeQuantInfo(ABC):
+    """Moe quantization data."""
+
+    pass
+
+
+class MoeRunnerCore(ABC):
+
+    def __init__(self, config: MoeRunnerConfig):
+        self.config = config
+
+    @abstractmethod
+    def run(
+        self, runner_input: RunnerInput, quant_info: MoeQuantInfo, running_state: dict
+    ) -> RunnerOutput:
+        pass
+
+    @property
+    @abstractmethod
+    def runner_backend(self) -> MoeRunnerBackend: ...
+
+    def runner_backend_is_triton(self) -> TypeGuard[TritonRunnerCore]:
+        return self.runner_backend == MoeRunnerBackend.TRITON
+
+
+class FusedOpPool:
+
+    _fused_funcs: dict[str, Callable] = {}
+
+    @classmethod
+    def register_fused_func(
+        cls, a2a_backend_name: str, runner_backend_name: str, fused_func: Callable
+    ):
+        key = (a2a_backend_name, runner_backend_name)
+        if key in cls._fused_funcs:
+            raise ValueError(
+                f"Fused function for {a2a_backend_name} to {runner_backend_name} is already registered."
+            )
+        assert MoeA2ABackend(
+            a2a_backend_name
+        ), f"Invalid dispatch name: {a2a_backend_name}"
+        assert MoeRunnerBackend(
+            runner_backend_name
+        ), f"Invalid runner name: {runner_backend_name}"
+        cls._fused_funcs[key] = fused_func
+
+    @classmethod
+    def get_fused_func(cls, dispatch_name: str, runner_name: str) -> Optional[Callable]:
+        key = (dispatch_name, runner_name)
+        fused_func = cls._fused_funcs.get(key)
+        return fused_func
+
+
+class PermuteMethodPool:
+
+    _pre_permute_methods: dict[
+        Tuple[DispatchOutputFormat, MoeRunnerBackend], Callable
+    ] = {}
+    _post_permute_methods: dict[
+        Tuple[MoeRunnerBackend, CombineInputFormat], Callable
+    ] = {}
+
+    @classmethod
+    def register_pre_permute(
+        cls,
+        dispatch_output_name: str,
+        runner_backend_name: str,
+        permute_func: Callable,
+    ):
+        """
+        Register a customized pre-permute function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+        :param dispatch_output_name: The DispatchOutputFormat name.
+        :param runner_backend_name: The MoeRunnerBackend name.
+        :param permute_func: The permute function to register.
+        """
+        # TODO: check if registration is valid
+        key = (dispatch_output_name, runner_backend_name)
+        if key in cls._pre_permute_methods:
+            raise ValueError(
+                f"Pre-permute method for {dispatch_output_name} to {runner_backend_name} is already registered."
+            )
+        cls._pre_permute_methods[key] = permute_func
+
+    @classmethod
+    def register_post_permute(
+        cls,
+        runner_backend_name: str,
+        combine_input_name: str,
+        permute_func: Callable,
+    ):
+        """
+        Register a customized post-permute function for the given MoeRunnerBackend and CombineInputFormat.
+
+        :param runner_backend_name: The MoeRunnerBackend name.
+        :param combine_input_name: The CombineInputFormat name.
+        :param permute_func: The permute function to register.
+        """
+        # TODO: check if registration is valid
+        key = (runner_backend_name, combine_input_name)
+        if key in cls._post_permute_methods:
+            raise ValueError(
+                f"Post-permute method for {runner_backend_name} to {combine_input_name} is already registered."
+            )
+        cls._post_permute_methods[key] = permute_func
+
+    @classmethod
+    def get_pre_permute(
+        cls,
+        dispatch_output_format: DispatchOutputFormat,
+        runner_input_format: MoeRunnerBackend,
+    ) -> Callable:
+        """
+        Retrieve the pre-permute function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+        :param dispatch_output_format: The DispatchOutputFormat type.
+        :param runner_input_format: The MoeRunnerBackend type.
+        :return: The registered permute function or None if not found.
+        """
+        key = (dispatch_output_format, runner_input_format)
+        pre_permute_func = cls._pre_permute_methods.get(key)
+        assert (
+            pre_permute_func is not None
+        ), f"Pre-permute function for {dispatch_output_format} to {runner_input_format} is not registered"
+        return pre_permute_func
+
+    @classmethod
+    def get_post_permute(
+        cls,
+        runner_output_format: MoeRunnerBackend,
+        combine_input_format: CombineInputFormat,
+    ) -> Callable:
+        """
+        Retrieve the post-permute function for the given MoeRunnerBackend and CombineInputFormat.
+
+        :param runner_output_format: The MoeRunnerBackend type.
+        :param combine_input_format: The CombineInputFormat type.
+        :return: The registered permute function or None if not found.
+        """
+        key = (runner_output_format, combine_input_format)
+        post_permute_func = cls._post_permute_methods.get(key)
+        assert (
+            post_permute_func is not None
+        ), f"Post-permute function for {runner_output_format} to {combine_input_format} is not registered"
+        return post_permute_func
+
+
+def register_fused_func(
+    a2a_backend_name: str,
+    runner_backend_name: str,
+) -> Callable:
+    """
+    Decorator to register a fused function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+    :param a2a_backend_name: The A2A backend name.
+    :param runner_backend_name: The MoeRunnerBackend name.
+    :return: The decorator function.
+    """
+
+    def decorator(fused_func: Callable):
+        FusedOpPool.register_fused_func(
+            a2a_backend_name, runner_backend_name, fused_func
+        )
+        return fused_func
+
+    return decorator
+
+
+def register_pre_permute(
+    dispatch_output_name: str,
+    runner_backend_name: str,
+) -> Callable:
+    """
+    Decorator to register a pre-permute function for the given DispatchOutputFormat and MoeRunnerBackend.
+
+    :param dispatch_output_name: The DispatchOutputFormat name.
+    :param runner_backend_name: The MoeRunnerBackend name.
+    :return: The decorator function.
+    """
+
+    def decorator(
+        permute_func: Callable[
+            [DispatchOutput, MoeQuantInfo, MoeRunnerConfig, dict], RunnerInput
+        ]
+    ) -> Callable:
+
+        PermuteMethodPool.register_pre_permute(
+            dispatch_output_name, runner_backend_name, permute_func
+        )
+        return permute_func
+
+    return decorator
+
+
+def register_post_permute(
+    runner_backend_name: str,
+    combine_input_name: str,
+) -> Callable:
+    """
+    Decorator to register a post-permute function for the given MoeRunnerBackend and CombineInputFormat.
+
+    :param runner_backend_name: The MoeRunnerBackend name.
+    :param combine_input_name: The CombineInputFormat name.
+    :return: The decorator function.
+    """
+
+    def decorator(
+        permute_func: Callable[
+            [RunnerOutput, MoeQuantInfo, MoeRunnerConfig, dict], CombineInput
+        ]
+    ) -> Callable:
+        PermuteMethodPool.register_post_permute(
+            runner_backend_name, combine_input_name, permute_func
+        )
+        return permute_func
+
+    return decorator
diff --git a/python/sglang/srt/layers/moe/moe_runner/runner.py b/python/sglang/srt/layers/moe/moe_runner/runner.py
new file mode 100644
index 000000000..3b6fcd980
--- /dev/null
+++ b/python/sglang/srt/layers/moe/moe_runner/runner.py
@@ -0,0 +1,80 @@
+from __future__ import annotations
+
+import logging
+import os
+from typing import TYPE_CHECKING
+
+from sglang.srt.layers.moe.moe_runner.base import (
+    FusedOpPool,
+    MoeRunnerConfig,
+    PermuteMethodPool,
+)
+from sglang.srt.layers.moe.moe_runner.triton import TritonRunnerCore
+from sglang.srt.layers.moe.utils import get_moe_a2a_backend
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner.base import MoeQuantInfo
+    from sglang.srt.layers.moe.token_dispatcher.base import CombineInput, DispatchOutput
+    from sglang.srt.layers.moe.utils import MoeRunnerBackend
+
+logger = logging.getLogger(__name__)
+
+
+class MoeRunner:
+
+    def __init__(self, runner_backend: MoeRunnerBackend, config: MoeRunnerConfig):
+        self.runner_backend = runner_backend
+        self.config = config
+
+        self.fused_func = None
+
+        if runner_backend.is_triton():
+            self.runner_core = TritonRunnerCore(config)
+        else:
+            raise NotImplementedError(f"Unsupported runner backend: {runner_backend}")
+
+        a2a_backend_name = get_moe_a2a_backend().value
+        runner_backend_name = runner_backend.value
+
+        self.fused_func = FusedOpPool.get_fused_func(
+            a2a_backend_name, runner_backend_name
+        )
+
+        SGLANG_CI_DISABLE_MOE_FUSED_FUNC = os.environ.get(
+            "SGLANG_CI_DISABLE_MOE_FUSED_FUNC", "0"
+        )
+        if SGLANG_CI_DISABLE_MOE_FUSED_FUNC == "1":
+            logger.info(
+                "SGLANG_CI_DISABLE_MOE_FUSED_FUNC is set to 1, disabling fused func"
+            )
+            self.fused_func = None
+
+    def run(
+        self, dispatch_output: DispatchOutput, quant_info: MoeQuantInfo
+    ) -> CombineInput:
+
+        if self.fused_func is not None:
+            return self.fused_func(dispatch_output, quant_info, self.config)
+
+        dispatch_format = dispatch_output.format.value
+        runner_format = self.runner_core.runner_backend.value
+        self.pre_permute_func = PermuteMethodPool.get_pre_permute(
+            dispatch_format, runner_format
+        )
+
+        running_state = {}
+        runner_input = self.pre_permute_func(
+            dispatch_output, quant_info, self.config, running_state
+        )
+        runner_output = self.runner_core.run(runner_input, quant_info, running_state)
+
+        runner_format = self.runner_core.runner_backend.value
+        combine_format = dispatch_output.format.value
+        self.post_permute_func = PermuteMethodPool.get_post_permute(
+            runner_format, combine_format
+        )
+        combine_input = self.post_permute_func(
+            runner_output, quant_info, self.config, running_state
+        )
+
+        return combine_input
diff --git a/python/sglang/srt/layers/moe/moe_runner/triton.py b/python/sglang/srt/layers/moe/moe_runner/triton.py
new file mode 100644
index 000000000..116fdcaa0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/moe_runner/triton.py
@@ -0,0 +1,448 @@
+from __future__ import annotations
+
+import functools
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import triton.language as tl
+
+from sglang.srt.layers.moe.moe_runner.base import (
+    MoeQuantInfo,
+    MoeRunnerConfig,
+    MoeRunnerCore,
+    RunnerInput,
+    RunnerOutput,
+    register_fused_func,
+    register_post_permute,
+    register_pre_permute,
+)
+from sglang.srt.layers.moe.utils import MoeRunnerBackend
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher.standard import (
+        StandardCombineInput,
+        StandardDispatchOutput,
+    )
+
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_use_aiter = bool(int(os.getenv("SGLANG_MOE_USE_AITER", "0")))
+_MOE_PADDING_SIZE = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
+if _is_cuda:
+    from sgl_kernel import gelu_and_mul, silu_and_mul
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from vllm import _custom_ops as vllm_ops  # gelu_and_mul, silu_and_mul
+
+    if _use_aiter:
+        try:
+            from aiter import moe_sum
+        except ImportError:
+            raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+
+
+if _is_cuda or _is_hip:
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+
+@dataclass
+class TritonRunnerInput(RunnerInput):
+
+    hidden_states: torch.Tensor
+    topk_weights: torch.Tensor
+    topk_ids: torch.Tensor
+    sorted_token_ids: torch.Tensor
+    expert_ids: torch.Tensor
+    num_tokens_post_padded: torch.Tensor
+
+    @property
+    def runner_backend(self) -> MoeRunnerBackend:
+        return MoeRunnerBackend.TRITON
+
+
+@dataclass
+class TritonRunnerOutput(RunnerOutput):
+
+    hidden_states: torch.Tensor
+
+    @property
+    def runner_backend(self) -> MoeRunnerBackend:
+        return MoeRunnerBackend.TRITON
+
+
+@dataclass
+class TritonMoeQuantInfo(MoeQuantInfo):
+    w13_weight: torch.Tensor
+    w2_weight: torch.Tensor
+    b13: Optional[torch.Tensor] = None
+    b2: Optional[torch.Tensor] = None
+    use_fp8_w8a8: bool = False
+    use_int8_w8a8: bool = False
+    use_int8_w8a16: bool = False
+    use_int4_w4a16: bool = False
+    per_channel_quant: bool = False
+    w13_scale: Optional[torch.Tensor] = None
+    w2_scale: Optional[torch.Tensor] = None
+    w13_zp: Optional[torch.Tensor] = None
+    w2_zp: Optional[torch.Tensor] = None
+    a13_scale: Optional[torch.Tensor] = None
+    a2_scale: Optional[torch.Tensor] = None
+    block_shape: Optional[List[int]] = None
+
+
+class TritonRunnerCore(MoeRunnerCore):
+
+    def __init__(self, config: MoeRunnerConfig):
+        super().__init__(config)
+
+    def run(
+        self,
+        runner_input: TritonRunnerInput,
+        quant_info: TritonMoeQuantInfo,
+        running_state: dict,
+    ) -> TritonRunnerOutput:
+
+        # TODO: move these functions to the triton runner
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+            invoke_fused_moe_kernel,
+            moe_sum_reduce_torch_compile,
+            moe_sum_reduce_triton,
+            swiglu_with_alpha_and_limit,
+        )
+
+        hidden_states = runner_input.hidden_states
+        topk_weights = runner_input.topk_weights
+        topk_ids = runner_input.topk_ids
+        sorted_token_ids = runner_input.sorted_token_ids
+        expert_ids = runner_input.expert_ids
+        num_tokens_post_padded = runner_input.num_tokens_post_padded
+
+        w13 = quant_info.w13_weight
+        w2 = quant_info.w2_weight
+        b13 = quant_info.b13
+        b2 = quant_info.b2
+        a13_scale = quant_info.a13_scale
+        a2_scale = quant_info.a2_scale
+        w13_scale = quant_info.w13_scale
+        w2_scale = quant_info.w2_scale
+        w13_zp = quant_info.w13_zp
+        w2_zp = quant_info.w2_zp
+        block_shape = quant_info.block_shape
+        per_channel_quant = quant_info.per_channel_quant
+        use_fp8_w8a8 = quant_info.use_fp8_w8a8
+        use_int8_w8a8 = quant_info.use_int8_w8a8
+        use_int8_w8a16 = quant_info.use_int8_w8a16
+        use_int4_w4a16 = quant_info.use_int4_w4a16
+
+        activation = self.config.activation
+        no_combine = self.config.no_combine
+        inplace = self.config.inplace
+        gemm1_alpha = self.config.gemm1_alpha
+        gemm1_limit = self.config.gemm1_clamp_limit
+        routed_scaling_factor = self.config.routed_scaling_factor
+        apply_router_weight_on_input = self.config.apply_router_weight_on_input
+
+        M = hidden_states.shape[0]
+        E, N, _ = w13.shape
+        compute_type = (
+            tl.bfloat16 if hidden_states.dtype == torch.bfloat16 else tl.float16
+        )
+
+        intermediate_cache1 = torch.empty(
+            (M, topk_ids.shape[1], N),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+
+        invoke_fused_moe_kernel(
+            hidden_states,
+            w13,
+            b13,
+            intermediate_cache1,
+            a13_scale,
+            w13_scale,
+            w13_zp,
+            topk_weights,
+            topk_ids,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            apply_router_weight_on_input,
+            topk_ids.shape[1],
+            running_state["config"],
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
+            block_shape=block_shape,
+        )
+
+        intermediate_cache2 = torch.empty(
+            (M * topk_ids.shape[1], N // 2),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+
+        if activation == "silu":
+            if gemm1_alpha is not None:
+                assert gemm1_limit is not None
+                intermediate_cache2 = swiglu_with_alpha_and_limit(
+                    intermediate_cache1.view(-1, N),
+                    gemm1_alpha,
+                    gemm1_limit,
+                )
+            elif _is_cuda:
+                silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+            else:
+                vllm_ops.silu_and_mul(
+                    intermediate_cache2, intermediate_cache1.view(-1, N)
+                )
+        elif activation == "gelu":
+            assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu"
+            assert gemm1_limit is None, "gemm1_limit is not supported for gelu"
+            if _is_cuda:
+                gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+            else:
+                vllm_ops.gelu_and_mul(
+                    intermediate_cache2, intermediate_cache1.view(-1, N)
+                )
+        else:
+            raise ValueError(f"Unsupported activation: {activation=}")
+
+        intermediate_cache3 = torch.empty(
+            (M, topk_ids.shape[1], w2.shape[1]),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+
+        if no_combine:
+            assert not inplace
+            out_hidden_states = torch.empty(
+                (M, topk_ids.shape[1], w2.shape[1]),
+                device=hidden_states.device,
+                dtype=hidden_states.dtype,
+            )
+        elif inplace:
+            out_hidden_states = hidden_states
+        else:
+            out_hidden_states = torch.empty_like(hidden_states)
+
+        invoke_fused_moe_kernel(
+            intermediate_cache2,
+            w2,
+            b2,
+            (
+                intermediate_cache3
+                if not no_combine and topk_ids.shape[1] != 1
+                else out_hidden_states.unsqueeze(0)
+            ),
+            a2_scale,
+            w2_scale,
+            w2_zp,
+            topk_weights,
+            topk_ids,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            not apply_router_weight_on_input,
+            1,
+            running_state["config"],
+            compute_type=compute_type,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_channel_quant=per_channel_quant,
+            block_shape=block_shape,
+        )
+
+        if routed_scaling_factor is None:
+            routed_scaling_factor = 1.0
+
+        if no_combine:
+            pass
+        elif _is_cuda:
+            if topk_ids.shape[1] == 1 and routed_scaling_factor == 1.0:
+                pass  # we write directly into out_hidden_states
+            elif topk_ids.shape[1] == 2 and routed_scaling_factor == 1.0:
+                torch.add(
+                    intermediate_cache3[:, 0],
+                    intermediate_cache3[:, 1],
+                    out=out_hidden_states,
+                ).squeeze(dim=1)
+            else:
+                # According to micro benchmark results, torch.compile can get better performance for small token.
+                if M <= 32:
+                    moe_sum_reduce_torch_compile(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states,
+                        routed_scaling_factor,
+                    )
+                else:
+                    moe_sum_reduce_triton(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states,
+                        routed_scaling_factor,
+                    )
+        elif _is_hip:
+            if _use_aiter:
+                moe_sum(
+                    intermediate_cache3.view(*intermediate_cache3.shape),
+                    out_hidden_states,
+                )
+            else:
+                vllm_ops.moe_sum(
+                    intermediate_cache3.view(*intermediate_cache3.shape),
+                    out_hidden_states,
+                )
+        else:
+            vllm_ops.moe_sum(
+                intermediate_cache3.view(*intermediate_cache3.shape),
+                out_hidden_states,
+            )
+
+        return TritonRunnerOutput(
+            hidden_states=out_hidden_states,
+        )
+
+    @property
+    def runner_backend(self) -> MoeRunnerBackend:
+        return MoeRunnerBackend.TRITON
+
+
+@register_fused_func("none", "triton")
+def fused_experts_none_to_triton(
+    dispatch_output: StandardDispatchOutput,
+    quant_info: TritonMoeQuantInfo,
+    runner_config: MoeRunnerConfig,
+) -> StandardCombineInput:
+    from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+    from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
+
+    output = fused_experts(
+        hidden_states=dispatch_output.hidden_states,
+        w1=quant_info.w13_weight,
+        w2=quant_info.w2_weight,
+        topk_output=dispatch_output.topk_output,
+        moe_runner_config=runner_config,
+        b1=quant_info.b13,
+        b2=quant_info.b2,
+        use_fp8_w8a8=quant_info.use_fp8_w8a8,
+        use_int8_w8a8=quant_info.use_int8_w8a8,
+        use_int8_w8a16=quant_info.use_int8_w8a16,
+        use_int4_w4a16=quant_info.use_int4_w4a16,
+        per_channel_quant=quant_info.per_channel_quant,
+        w1_scale=quant_info.w13_scale,
+        w2_scale=quant_info.w2_scale,
+        w1_zp=quant_info.w13_zp,
+        w2_zp=quant_info.w2_zp,
+        a1_scale=quant_info.a13_scale,
+        a2_scale=quant_info.a2_scale,
+        block_shape=quant_info.block_shape,
+    )
+
+    return StandardCombineInput(
+        hidden_states=output,
+    )
+
+
+@register_pre_permute("standard", "triton")
+def pre_permute_standard_to_triton(
+    dispatch_output: StandardDispatchOutput,
+    quant_info: TritonMoeQuantInfo,
+    runner_config: MoeRunnerConfig,
+    running_state: dict,
+) -> TritonRunnerInput:
+
+    # NOTE: this is dead code as a fused func for standard format is registered.
+    # This is left here for testing and examples.
+
+    from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+        get_config_dtype_str,
+        moe_align_block_size,
+        try_get_optimal_moe_config,
+    )
+    from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+    hidden_states, topk_output = dispatch_output
+
+    assert TopKOutputChecker.format_is_standard(topk_output)
+
+    num_tokens = hidden_states.shape[0]
+    num_local_experts = runner_config.num_local_experts
+
+    if (
+        not (quant_info.use_fp8_w8a8 or quant_info.use_int8_w8a8)
+        or quant_info.block_shape is not None
+        or _use_aiter
+    ):
+        padding_size = 0
+    else:
+        padding_size = _MOE_PADDING_SIZE
+
+    config_dtype = get_config_dtype_str(
+        use_fp8_w8a8=quant_info.use_fp8_w8a8,
+        use_int8_w8a8=quant_info.use_int8_w8a8,
+        use_int8_w8a16=quant_info.use_int8_w8a16,
+        use_int4_w4a16=quant_info.use_int4_w4a16,
+        dtype=hidden_states.dtype,
+    )
+
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        quant_info.w13_weight.shape,
+        (
+            num_local_experts,
+            quant_info.w2_weight.shape[1],
+            quant_info.w2_weight.shape[2] - padding_size,
+        ),
+        topk_output.topk_ids.shape[1],
+        config_dtype,
+        block_shape=quant_info.block_shape,
+    )
+
+    config = get_config_func(num_tokens)
+
+    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+        topk_output.topk_ids, config["BLOCK_SIZE_M"], num_local_experts
+    )
+
+    running_state["config"] = config
+
+    return TritonRunnerInput(
+        hidden_states=hidden_states,
+        topk_weights=topk_output.topk_weights,
+        topk_ids=topk_output.topk_ids,
+        sorted_token_ids=sorted_token_ids,
+        expert_ids=expert_ids,
+        num_tokens_post_padded=num_tokens_post_padded,
+    )
+
+
+@register_post_permute("triton", "standard")
+def post_permute_triton_to_standard(
+    runner_output: TritonRunnerOutput,
+    quant_info: TritonMoeQuantInfo,
+    runner_config: MoeRunnerConfig,
+    running_state: dict,
+) -> StandardCombineInput:
+
+    # NOTE: this is dead code as a fused func for standard format is registered.
+    # This is left here for testing and examples.
+
+    from sglang.srt.layers.moe.token_dispatcher.standard import StandardCombineInput
+
+    return StandardCombineInput(
+        hidden_states=runner_output.hidden_states,
+    )
diff --git a/python/sglang/srt/layers/moe/rocm_moe_utils.py b/python/sglang/srt/layers/moe/rocm_moe_utils.py
new file mode 100644
index 000000000..5fe2de1e5
--- /dev/null
+++ b/python/sglang/srt/layers/moe/rocm_moe_utils.py
@@ -0,0 +1,141 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.9.1rc2/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from enum import IntEnum
+from functools import cache
+from typing import Optional
+
+import torch
+
+from sglang.srt.utils import direct_register_custom_op, get_bool_env_var, is_hip
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+
+class ActivationMethod(IntEnum):
+    # This allows interfacing with AITER ActivationType enum
+    # without importing the ActivationType enum from AITER globally.
+    SILU = 0
+    GELU = 1
+
+
+def rocm_aiter_asm_moe_tkw1_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    fc1_scale: Optional[torch.Tensor] = None,
+    fc2_scale: Optional[torch.Tensor] = None,
+    fc1_smooth_scale: Optional[torch.Tensor] = None,
+    fc2_smooth_scale: Optional[torch.Tensor] = None,
+    a16: bool = False,
+    per_tensor_quant_scale: Optional[torch.Tensor] = None,
+    expert_mask: Optional[torch.Tensor] = None,
+    activation_method: int = ActivationMethod.SILU.value,
+) -> torch.Tensor:
+
+    from aiter import ActivationType
+    from aiter.fused_moe_bf16_asm import asm_moe_tkw1
+
+    activation = ActivationType(activation_method)
+
+    return asm_moe_tkw1(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        fc1_scale=fc1_scale,
+        fc2_scale=fc2_scale,
+        fc1_smooth_scale=fc1_smooth_scale,
+        fc2_smooth_scale=fc2_smooth_scale,
+        a16=a16,
+        per_tensor_quant_scale=per_tensor_quant_scale,
+        expert_mask=expert_mask,
+        activation=activation,
+    )
+
+
+def rocm_aiter_asm_moe_tkw1_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    fc1_scale: Optional[torch.Tensor] = None,
+    fc2_scale: Optional[torch.Tensor] = None,
+    fc1_smooth_scale: Optional[torch.Tensor] = None,
+    fc2_smooth_scale: Optional[torch.Tensor] = None,
+    a16: bool = False,
+    per_tensor_quant_scale: Optional[torch.Tensor] = None,
+    expert_mask: Optional[torch.Tensor] = None,
+    activation_method: int = ActivationMethod.SILU.value,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+if _use_aiter:
+
+    direct_register_custom_op(
+        op_name="rocm_aiter_asm_moe_tkw1",
+        op_func=rocm_aiter_asm_moe_tkw1_impl,
+        mutates_args=[],
+        fake_impl=rocm_aiter_asm_moe_tkw1_fake,
+    )
+
+
+def rocm_fused_experts_tkw1(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+    activation_method = (
+        ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU
+    )
+    # All AITER Fused MoE kernels are expecting the following datatypes
+    topk_weights = topk_weights.to(torch.float32)
+    topk_ids = topk_ids.to(torch.int32)
+
+    # w8a8 per-channel quantization
+    if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8:
+        # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
+        # This applies topk_weights on the GEMM output of the first FC layer
+        #  rather than the second FC.
+        assert (
+            topk_weights.dim() == 2
+        ), "`topk_weights` should be in shape (num_tokens, topk)"
+        assert topk_weights.shape[-1] == 1, (
+            "Only support topk=1 when" " `apply_router_weight_on_input` is True"
+        )
+
+        return torch.ops.sglang.rocm_aiter_asm_moe_tkw1(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            fc1_scale=w1_scale,
+            fc2_scale=w2_scale,
+            fc1_smooth_scale=None,
+            fc2_smooth_scale=None,
+            a16=False,
+            per_tensor_quant_scale=None,
+            expert_mask=None,
+            activation_method=activation_method,
+        )
+    else:
+        assert False, "This should not be called."
diff --git a/python/sglang/srt/layers/moe/router.py b/python/sglang/srt/layers/moe/router.py
new file mode 100644
index 000000000..0138dcdad
--- /dev/null
+++ b/python/sglang/srt/layers/moe/router.py
@@ -0,0 +1,392 @@
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.moe.topk import fused_topk
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+
+@triton.jit
+def fused_moe_router_kernel(
+    input_ptr,  # input (bs, hidden_dim)
+    moe_router_weight_ptr,  # input (num_experts, hidden_dim)
+    topk_weights_ptr,  # output (bs, topk)
+    topk_ids_ptr,  # output (bs, topk)
+    correction_bias_ptr,
+    is_correction_bias: tl.constexpr,
+    num_experts: tl.constexpr,
+    topk: tl.constexpr,
+    moe_softcapping: tl.constexpr,
+    moe_renormalize: tl.constexpr,  # not supported
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+
+    # moe_router_weight is k major
+    expert_offsets = tl.arange(0, num_experts)[:, None]
+    router_mask = mask[None, :]
+    w_router = tl.load(
+        moe_router_weight_ptr + expert_offsets * hidden_dim + offsets[None, :],
+        mask=router_mask,
+        other=0.0,
+    )
+
+    x = tl.load(input_ptr + pid * hidden_dim + offsets, mask=mask, other=0.0)
+
+    # todo: tl.dot?
+    logits = tl.sum((w_router.to(tl.float32) * x[None, :].to(tl.float32)), axis=-1)
+
+    # logit softcap
+    if moe_softcapping == 0:
+        logits_softcapped = logits
+    else:
+        logits_scaled = logits / moe_softcapping
+        exped = tl.exp(2 * logits_scaled)
+        top = exped - 1
+        bottom = exped + 1
+        logits_softcapped = top / bottom * moe_softcapping
+
+    # Add bias after softcapping
+    if is_correction_bias:
+        bias = tl.load(correction_bias_ptr + tl.arange(0, num_experts))
+        logits_softcapped = logits_softcapped + bias
+
+    # topk
+    # assert 1 <= topk <= num_experts
+
+    # 5.38 us
+
+    top1 = tl.argmax(logits_softcapped, axis=0)
+    tl.store(topk_ids_ptr + pid * topk + 0, top1)  # 5.63 us
+
+    top1_v = tl.max(logits_softcapped, axis=0)
+    invsumexp = 1.0 / tl.sum(tl.exp(logits_softcapped - top1_v), axis=0)
+
+    tl.store(
+        topk_weights_ptr + pid * topk + 0,
+        invsumexp,
+    )  # 5.73 us
+
+    if topk >= 2:
+        top2 = tl.argmax(
+            tl.where(
+                tl.arange(0, num_experts) != top1, logits_softcapped, float("-inf")
+            ),
+            axis=0,
+        )
+        tl.store(topk_ids_ptr + pid * topk + 1, top2)
+        top2_v = tl.sum(logits_softcapped * (tl.arange(0, num_experts) == top2), axis=0)
+        tl.store(
+            topk_weights_ptr + pid * topk + 1,
+            tl.exp(top2_v - top1_v) * invsumexp,
+        )  # 5.95us
+
+    # probably slow
+    if topk > 2:
+        topk_mask = tl.full(logits_softcapped.shape, 1.0, dtype=logits_softcapped.dtype)
+        topk_mask = tl.where(
+            tl.arange(0, num_experts) != top1, topk_mask, float("-inf")
+        )
+        topk_mask = tl.where(
+            tl.arange(0, num_experts) != top2, topk_mask, float("-inf")
+        )
+        for i in range(2, topk):
+            topi = tl.argmax(logits_softcapped + topk_mask, axis=0)
+            topk_mask = tl.where(
+                tl.arange(0, num_experts) != topi, topk_mask, float("-inf")
+            )
+            tl.store(topk_ids_ptr + pid * topk + i, topi)
+            topi_v = tl.sum(
+                logits_softcapped * (tl.arange(0, num_experts) == topi), axis=0
+            )
+            tl.store(
+                topk_weights_ptr + pid * topk + i,
+                tl.exp(topi_v - top1_v) * invsumexp,
+            )
+    # assert not moe_renormalize, "moe weight renormalization not implemented"
+
+
+def fused_moe_router_impl(
+    x: torch.Tensor,
+    router_weight: torch.Tensor,
+    topk: int,
+    moe_softcapping: float,
+    correction_bias: Optional[torch.Tensor] = None,
+):
+    assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
+    bs, hidden_dim = x.shape
+    num_experts = router_weight.shape[0]
+
+    # router_logits = torch.empty((bs, num_experts), dtype=torch.float32, device=x.device)
+    topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
+    topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+    is_correction_bias = correction_bias is not None
+
+    max_warps = 16 if _is_hip else 32
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
+        ),
+    }
+
+    fused_moe_router_kernel[(bs,)](
+        x,
+        router_weight,
+        topk_weights,
+        topk_ids,
+        correction_bias,
+        is_correction_bias=is_correction_bias,
+        num_experts=num_experts,
+        topk=topk,
+        moe_softcapping=moe_softcapping,
+        moe_renormalize=False,
+        hidden_dim=hidden_dim,
+        **config,
+    )
+
+    return topk_weights, topk_ids
+
+
+@triton.jit
+def fused_moe_router_large_bs_kernel(
+    a_ptr,  # input (bs, hidden_dim)
+    b_ptr,  # input (num_experts, hidden_dim)
+    topk_weights_ptr,  # output (bs, topk)
+    topk_ids_ptr,  # output (bs, topk)
+    bs,
+    num_experts: tl.constexpr,
+    topk: tl.constexpr,  # only support topk <= 2
+    moe_softcapping: tl.constexpr,
+    moe_renormalize: tl.constexpr,  # not supported
+    K: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    stride_am: tl.constexpr,
+    stride_bn: tl.constexpr,
+):
+
+    # 1. get block id
+    pid = tl.program_id(axis=0)
+
+    # 2. create pointers for the first block of A and B
+    # 2.1. setup a_ptrs with offsets in m and k
+    offs_m = pid * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)[:, None]
+    bs_mask = offs_m < bs
+    offs_k = tl.arange(0, BLOCK_SIZE_K)[None, :]
+    a_ptrs = a_ptr + (offs_m * stride_am + offs_k)
+
+    # 2.2. setup b_ptrs with offsets in k and n.
+    #      Note: b matrix is k-major.
+    offs_k = tl.arange(0, BLOCK_SIZE_K)[None, :]
+    offs_n = tl.arange(0, BLOCK_SIZE_N)[:, None]
+    expert_mask = offs_n < num_experts
+    b_ptrs = b_ptr + (offs_n * stride_bn + offs_k)
+
+    # 3. Create an accumulator of float32 of size [BLOCK_SIZE_M, BLOCK_SIZE_N]
+    #    3.1. iterate in K dimension
+    #    3.2. transpose tile B
+    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, K // BLOCK_SIZE_K):  # hidden_dim % BLOCK_SIZE_K == 0
+        a = tl.load(
+            a_ptrs,
+            mask=bs_mask,
+            other=0.0,
+        ).to(tl.float32)
+        b = tl.load(b_ptrs, mask=expert_mask, other=0.0).to(tl.float32).T
+        acc += tl.dot(a, b)
+
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += BLOCK_SIZE_K
+
+    # 4. logit softcap
+    if moe_softcapping == 0:
+        logits_softcapped = acc
+    else:
+        logits_scaled = acc / moe_softcapping
+        exped = tl.exp(2 * logits_scaled)
+        logits_softcapped = (exped - 1) / (exped + 1) * moe_softcapping
+
+    # 5. top1
+    arange_block_size_n = tl.arange(0, BLOCK_SIZE_N)[None, :]
+    cond_top1 = arange_block_size_n < num_experts
+    top1 = tl.argmax(tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1)
+    top1_v = tl.max(
+        tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1, keep_dims=True
+    )
+    top1_invsumexp = 1.0 / tl.sum(
+        tl.where(cond_top1, tl.exp(logits_softcapped - top1_v), 0.0), axis=1
+    )
+
+    # 6. store top1 to output
+    offs_top1 = pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)
+    top1_mask = offs_top1 < bs * topk
+    tl.store(topk_ids_ptr + offs_top1, top1, mask=top1_mask)
+    tl.store(
+        topk_weights_ptr + offs_top1,
+        top1_invsumexp,
+        mask=top1_mask,
+    )
+
+    # 7. handle topk == 2
+    if topk == 2:
+        cond_top2 = (arange_block_size_n < num_experts) & (
+            arange_block_size_n != top1[:, None]
+        )
+        top2 = tl.argmax(
+            tl.where(cond_top2, logits_softcapped, float("-inf")),
+            axis=1,
+            keep_dims=True,
+        )
+        top2_v = tl.sum(
+            logits_softcapped * (arange_block_size_n == top2), axis=1, keep_dims=True
+        )
+        top2_invsumexp = tl.exp(top2_v - top1_v) * top1_invsumexp[:, None]
+
+        # store top2
+        offs_top2 = (
+            pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)[:, None] + 1
+        )
+        top2_mask = offs_top2 < bs * topk
+        tl.store(topk_ids_ptr + offs_top2, top2, mask=top2_mask)
+        tl.store(
+            topk_weights_ptr + offs_top2,
+            top2_invsumexp,
+            mask=top2_mask,
+        )
+
+
+def fused_moe_router_large_bs_impl(
+    x: torch.Tensor,
+    router_weight: torch.Tensor,
+    topk: int,
+    moe_softcapping: float,
+    BLOCK_SIZE_M: int,
+    BLOCK_SIZE_N: int,
+    BLOCK_SIZE_K: int,
+):
+    assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
+    bs, hidden_dim = x.shape
+    num_experts = router_weight.shape[0]
+
+    assert num_experts <= BLOCK_SIZE_N
+    assert hidden_dim % BLOCK_SIZE_K == 0
+    assert topk <= 2
+
+    topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
+    topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+
+    grid = (triton.cdiv(bs, BLOCK_SIZE_M) * triton.cdiv(num_experts, BLOCK_SIZE_N),)
+
+    fused_moe_router_large_bs_kernel[grid](
+        a_ptr=x,
+        b_ptr=router_weight,
+        topk_weights_ptr=topk_weights,
+        topk_ids_ptr=topk_ids,
+        bs=bs,
+        num_experts=num_experts,
+        topk=topk,
+        moe_softcapping=moe_softcapping,
+        moe_renormalize=False,
+        K=hidden_dim,
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+        stride_am=hidden_dim,
+        stride_bn=hidden_dim,
+    )
+
+    return topk_weights, topk_ids
+
+
+def fused_moe_router_shim(
+    moe_softcapping,
+    hidden_states,
+    gating_output,
+    topk,
+    renormalize,
+    correction_bias: Optional[torch.Tensor] = None,
+):
+    assert not renormalize
+    assert (
+        len(hidden_states.shape) == 2
+        and hidden_states.shape[1] == gating_output.shape[1]
+    )
+    bs, hidden_dim = hidden_states.shape
+    num_experts = gating_output.shape[0]
+    BLOCK_SIZE_M = 32
+    BLOCK_SIZE_N = 16
+    BLOCK_SIZE_K = 256
+    if (
+        bs >= 512
+        and topk <= 2
+        and num_experts <= BLOCK_SIZE_N
+        and hidden_dim % BLOCK_SIZE_K == 0
+    ):
+        return fused_moe_router_large_bs_impl(
+            x=hidden_states,
+            router_weight=gating_output,
+            topk=topk,
+            moe_softcapping=moe_softcapping,
+            BLOCK_SIZE_M=BLOCK_SIZE_M,
+            BLOCK_SIZE_N=BLOCK_SIZE_N,
+            BLOCK_SIZE_K=BLOCK_SIZE_K,
+        )
+    else:
+        return fused_moe_router_impl(
+            x=hidden_states,
+            router_weight=gating_output,
+            topk=topk,
+            moe_softcapping=moe_softcapping,
+            correction_bias=correction_bias,
+        )
+
+
+class FusedMoeRouter:
+    def __init__(self, router_linear, topk, moe_softcapping) -> None:
+        self.router_linear = router_linear
+        self.topk = topk
+        self.moe_softcapping = moe_softcapping
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def forward(
+        self, x: torch.Tensor, residual: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if x.is_cuda:
+            return self.forward_cuda(x, residual)
+        else:
+            return self.forward_vllm(x, residual)
+
+    def forward_cuda(
+        self, x: torch.Tensor, autotune=False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return fused_moe_router_shim(
+            moe_softcapping=self.moe_softcapping,
+            hidden_states=x,
+            gating_output=self.router_linear.weight,
+            topk=self.topk,
+            renormalize=False,
+        )
+
+    def forward_vllm(
+        self,
+        x: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # g, _ = self.router_linear.forward(x)
+        g = x.float() @ self.router_linear.weight.T.float()
+
+        g = torch.tanh(g.float() / self.moe_softcapping) * self.moe_softcapping
+
+        return fused_topk(x, g, self.topk, False)
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/__init__.py b/python/sglang/srt/layers/moe/token_dispatcher/__init__.py
new file mode 100644
index 000000000..e1dbcdd44
--- /dev/null
+++ b/python/sglang/srt/layers/moe/token_dispatcher/__init__.py
@@ -0,0 +1,41 @@
+from sglang.srt.layers.moe.token_dispatcher.base import (
+    BaseDispatcher,
+    BaseDispatcherConfig,
+    CombineInput,
+    CombineInputChecker,
+    CombineInputFormat,
+    DispatchOutput,
+    DispatchOutputChecker,
+    DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.token_dispatcher.deepep import (
+    DeepEPConfig,
+    DeepEPDispatcher,
+    DeepEPLLCombineInput,
+    DeepEPLLOutput,
+    DeepEPNormalCombineInput,
+    DeepEPNormalOutput,
+)
+from sglang.srt.layers.moe.token_dispatcher.standard import (
+    StandardCombineInput,
+    StandardDispatchOutput,
+)
+
+__all__ = [
+    "BaseDispatcher",
+    "BaseDispatcherConfig",
+    "CombineInput",
+    "CombineInputChecker",
+    "CombineInputFormat",
+    "DispatchOutput",
+    "DispatchOutputFormat",
+    "DispatchOutputChecker",
+    "StandardDispatchOutput",
+    "StandardCombineInput",
+    "DeepEPConfig",
+    "DeepEPDispatcher",
+    "DeepEPNormalOutput",
+    "DeepEPLLOutput",
+    "DeepEPLLCombineInput",
+    "DeepEPNormalCombineInput",
+]
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/base.py b/python/sglang/srt/layers/moe/token_dispatcher/base.py
new file mode 100644
index 000000000..155860886
--- /dev/null
+++ b/python/sglang/srt/layers/moe/token_dispatcher/base.py
@@ -0,0 +1,150 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import TYPE_CHECKING, Protocol, TypeGuard, Union, runtime_checkable
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        DeepEPLLCombineInput,
+        DeepEPLLOutput,
+        DeepEPNormalCombineInput,
+        DeepEPNormalOutput,
+        StandardCombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.layers.moe.topk import TopKOutput
+
+# ------------------------------ Dispatch Output -------------------------------------
+
+
+class DispatchOutputChecker:
+
+    @staticmethod
+    def format_is_standard(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[StandardDispatchOutput]:
+        return dispatch_output.format.is_standard()
+
+    @staticmethod
+    def format_is_deepep_normal(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[DeepEPNormalOutput]:
+        return dispatch_output.format.is_deepep_normal()
+
+    @staticmethod
+    def format_is_deepep_ll(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[DeepEPLLOutput]:
+        return dispatch_output.format.is_deepep_ll()
+
+    @staticmethod
+    def format_is_deepep(
+        dispatch_output: DispatchOutput,
+    ) -> TypeGuard[Union[DeepEPNormalOutput, DeepEPLLOutput]]:
+        return dispatch_output.format.is_deepep()
+
+
+class DispatchOutputFormat(Enum):
+
+    STANDARD = "standard"
+    DEEPEP_NORMAL = "deepep_normal"
+    DEEPEP_LL = "deepep_ll"
+
+    def is_standard(self) -> bool:
+        return self == DispatchOutputFormat.STANDARD
+
+    def is_deepep_normal(self) -> bool:
+        return self == DispatchOutputFormat.DEEPEP_NORMAL
+
+    def is_deepep_ll(self) -> bool:
+        return self == DispatchOutputFormat.DEEPEP_LL
+
+    def is_deepep(self) -> bool:
+        return self in [
+            DispatchOutputFormat.DEEPEP_NORMAL,
+            DispatchOutputFormat.DEEPEP_LL,
+        ]
+
+
+@runtime_checkable
+class DispatchOutput(Protocol):
+    """Protocol for dispatch outputs in different formats."""
+
+    # TODO: add hidden_states to the protocol
+
+    @property
+    def format(self) -> DispatchOutputFormat: ...
+
+
+# ------------------------------ Combine Input -------------------------------------
+
+
+class CombineInputChecker:
+    @staticmethod
+    def format_is_standard(
+        combine_input: CombineInput,
+    ) -> TypeGuard[StandardCombineInput]:
+        return combine_input.format == CombineInputFormat.STANDARD
+
+    @staticmethod
+    def format_is_deepep_normal(
+        combine_input: CombineInput,
+    ) -> TypeGuard[DeepEPNormalCombineInput]:
+        return combine_input.format == CombineInputFormat.DEEPEP_NORMAL
+
+    @staticmethod
+    def format_is_deepep_ll(
+        combine_input: CombineInput,
+    ) -> TypeGuard[DeepEPLLCombineInput]:
+        return combine_input.format == CombineInputFormat.DEEPEP_LL
+
+    @staticmethod
+    def format_is_deepep(
+        combine_input: CombineInput,
+    ) -> TypeGuard[Union[DeepEPNormalCombineInput, DeepEPLLCombineInput]]:
+        return combine_input.format in [
+            CombineInputFormat.DEEPEP_NORMAL,
+            CombineInputFormat.DEEPEP_LL,
+        ]
+
+
+class CombineInputFormat(Enum):
+    STANDARD = "standard"
+    DEEPEP_NORMAL = "deepep_normal"
+    DEEPEP_LL = "deepep_ll"
+
+
+@runtime_checkable
+class CombineInput(Protocol):
+    """Protocol for combine inputs in different formats."""
+
+    # TODO: add hidden_states to the protocol
+
+    @property
+    def format(self) -> CombineInputFormat: ...
+
+
+# ------------------------------ Base Dispatcher -------------------------------------
+
+
+class BaseDispatcherConfig(ABC):
+    """Base class for dispatcher configs."""
+
+    pass
+
+
+class BaseDispatcher(ABC):
+    """Base class for dispatchers."""
+
+    @abstractmethod
+    def dispatch(
+        self, hidden_states: torch.Tensor, topk_output: TopKOutput, **kwargs
+    ) -> DispatchOutput:
+        pass
+
+    @abstractmethod
+    def combine(self, combine_input: CombineInput, **kwargs) -> torch.Tensor:
+        pass
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/deepep.py b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
new file mode 100644
index 000000000..450cff0cb
--- /dev/null
+++ b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
@@ -0,0 +1,732 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union
+
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.moe.token_dispatcher.base import (
+    BaseDispatcher,
+    BaseDispatcherConfig,
+    CombineInput,
+    CombineInputFormat,
+    DispatchOutput,
+    DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.utils import DeepEPMode, get_deepep_config, is_tbo_enabled
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_int_env_var,
+    is_hip,
+    is_npu,
+    load_json_config,
+)
+
+_is_npu = is_npu()
+
+try:
+    from deep_ep import Buffer, Config
+
+    if not _is_npu:
+        from sglang.srt.layers.quantization.fp8_kernel import (
+            sglang_per_token_group_quant_fp8,
+        )
+
+    use_deepep = True
+except ImportError:
+    use_deepep = False
+
+from enum import Enum, IntEnum, auto
+
+import torch
+import torch.distributed as dist
+
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip()
+
+logger = logging.getLogger(__name__)
+
+
+class DeepEPNormalOutput(NamedTuple):
+    """DeepEP normal dispatch output."""
+
+    hidden_states: torch.Tensor | Tuple[torch.Tensor, torch.Tensor]
+    # hidden_states_scale
+    topk_idx: torch.Tensor
+    topk_weights: torch.Tensor
+    num_recv_tokens_per_expert: List[int]
+
+    @property
+    def format(self) -> DispatchOutputFormat:
+        return DispatchOutputFormat.DEEPEP_NORMAL
+
+
+class DeepEPLLOutput(NamedTuple):
+    """DeepEP low latency dispatch output."""
+
+    hidden_states_fp8: Tuple[torch.Tensor, torch.Tensor]
+    topk_idx: torch.Tensor
+    topk_weights: torch.Tensor
+    masked_m: torch.Tensor
+    expected_m: int
+
+    @property
+    def format(self) -> DispatchOutputFormat:
+        return DispatchOutputFormat.DEEPEP_LL
+
+
+assert isinstance(DeepEPNormalOutput, DispatchOutput)
+assert isinstance(DeepEPLLOutput, DispatchOutput)
+
+
+class DeepEPNormalCombineInput(NamedTuple):
+    """DeepEP normal combine input."""
+
+    pass
+
+    @property
+    def format(self) -> CombineInputFormat:
+        return CombineInputFormat.DEEPEP_NORMAL
+
+
+class DeepEPLLCombineInput(NamedTuple):
+    """DeepEP low latency combine input."""
+
+    pass
+
+    @property
+    def format(self) -> CombineInputFormat:
+        return CombineInputFormat.DEEPEP_LL
+
+
+assert isinstance(DeepEPNormalCombineInput, CombineInput)
+assert isinstance(DeepEPLLCombineInput, CombineInput)
+
+
+class DeepEPDispatchMode(IntEnum):
+    NORMAL = auto()
+    LOW_LATENCY = auto()
+
+
+class DeepEPBuffer:
+    _buffer = None
+    _dispatch_mode: Optional[DeepEPDispatchMode] = None
+    _hidden_size: Optional[int] = None
+    _num_max_dispatch_tokens_per_rank: Optional[int] = None
+    _num_experts: Optional[int] = None
+
+    @classmethod
+    def get_deepep_buffer(
+        cls,
+        group: dist.ProcessGroup,
+        hidden_size: int,
+        param_bytes: int,
+        deepep_mode: DeepEPMode,
+        num_max_dispatch_tokens_per_rank: int = -1,
+        num_experts: int = -1,
+    ):
+        if cls._buffer is not None:
+            return cls._buffer
+
+        cls._hidden_size = hidden_size
+        cls._num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank
+        cls._num_experts = num_experts
+
+        num_nvl_bytes, num_rdma_bytes = 0, 0
+        if deepep_mode.enable_normal():
+            hidden_bytes = hidden_size * param_bytes
+            for config in (
+                DeepEPConfig.get_instance().normal_dispatch_config
+                or Buffer.get_dispatch_config(group.size()),
+                DeepEPConfig.get_instance().normal_combine_config
+                or Buffer.get_combine_config(group.size()),
+            ):
+                num_nvl_bytes = max(
+                    config.get_nvl_buffer_size_hint(hidden_bytes, group.size()),
+                    num_nvl_bytes,
+                )
+                num_rdma_bytes = max(
+                    config.get_rdma_buffer_size_hint(hidden_bytes, group.size()),
+                    num_rdma_bytes,
+                )
+        if deepep_mode.enable_low_latency():
+            assert num_max_dispatch_tokens_per_rank != -1
+            assert num_experts != -1 and num_experts % group.size() == 0
+            num_rdma_bytes = max(
+                Buffer.get_low_latency_rdma_size_hint(
+                    num_max_dispatch_tokens_per_rank,
+                    hidden_size,
+                    group.size(),
+                    num_experts,
+                ),
+                num_rdma_bytes,
+            )
+
+        if deepep_mode == DeepEPMode.NORMAL:
+            num_qps_per_rank = DeepEPConfig.get_instance().num_sms // 2
+        elif deepep_mode in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]:
+            num_qps_per_rank = num_experts // group.size()
+        else:
+            raise NotImplementedError
+
+        if not _is_npu:
+            total_num_sms = torch.cuda.get_device_properties(
+                device="cuda"
+            ).multi_processor_count
+            if (
+                (deepep_mode != DeepEPMode.LOW_LATENCY)
+                and not is_tbo_enabled()
+                and (DeepEPConfig.get_instance().num_sms < total_num_sms // 2)
+            ):
+                logger.warning(
+                    f"Only use {DeepEPConfig.get_instance().num_sms} SMs for DeepEP communication. "
+                    f"This may result in highly suboptimal performance. "
+                    f"Consider using --deepep-config to change the behavior."
+                )
+
+        cls._buffer = Buffer(
+            group,
+            num_nvl_bytes,
+            num_rdma_bytes,
+            low_latency_mode=deepep_mode.enable_low_latency(),
+            num_qps_per_rank=num_qps_per_rank,
+            # TODO can be false when unneeded
+            allow_mnnvl=True,
+        )
+        return cls._buffer
+
+    @classmethod
+    def clean_buffer(cls):
+        if not cls._buffer.low_latency_mode:
+            return
+        cls._buffer.clean_low_latency_buffer(
+            cls._num_max_dispatch_tokens_per_rank,
+            cls._hidden_size,
+            cls._num_experts,
+        )
+
+    @classmethod
+    def set_dispatch_mode_as_normal(cls):
+        cls._dispatch_mode = DeepEPDispatchMode.NORMAL
+
+    @classmethod
+    def set_dispatch_mode_as_low_latency(cls):
+        if cls._dispatch_mode == DeepEPDispatchMode.NORMAL:
+            cls.clean_buffer()
+        cls._dispatch_mode = DeepEPDispatchMode.LOW_LATENCY
+
+
+class DeepEPConfig(BaseDispatcherConfig):
+    _instance = None
+
+    def __init__(self):
+        config_str = get_deepep_config()
+        if config_str:
+            config_parsed = load_json_config(config_str)
+            if torch.distributed.get_rank() == 0:
+                logger.info(f"Use DeepEP Config: {config_parsed}")
+            config_dispatch = config_parsed["normal_dispatch"]
+            config_combine = config_parsed["normal_combine"]
+
+            self.normal_dispatch_config = Config(**config_dispatch)
+            self.normal_combine_config = Config(**config_combine)
+
+            assert config_dispatch["num_sms"] == config_combine["num_sms"]
+            self.num_sms = config_dispatch["num_sms"]
+        else:
+            self.normal_dispatch_config = None
+            self.normal_combine_config = None
+            self.num_sms = Buffer.num_sms
+
+    @classmethod
+    def get_instance(cls):
+        if cls._instance is None:
+            cls._instance = DeepEPConfig()
+        return cls._instance
+
+
+class _DeepEPDispatcherImplBase:
+    def __init__(
+        self,
+        group: torch.distributed.ProcessGroup,
+        router_topk: int,
+        permute_fusion: bool,
+        num_experts: int,
+        num_local_experts: int,
+        hidden_size: int,
+        params_dtype: torch.dtype,
+        deepep_mode: DeepEPMode,
+    ):
+        if not use_deepep:
+            raise ImportError(
+                "DeepEP is not installed. Please install DeepEP package from "
+                "https://github.com/deepseek-ai/deepep."
+            )
+
+        self.group = group
+        self.router_topk = router_topk
+        self.permute_fusion = permute_fusion
+        self.num_experts = num_experts
+        self.num_local_experts = num_local_experts
+        self.hidden_size = hidden_size
+        self.params_dtype = params_dtype
+        self.deepep_mode = deepep_mode
+
+        self.params_bytes = 2
+        self.num_max_dispatch_tokens_per_rank = get_int_env_var(
+            "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK", 128
+        )
+        # DeepEP internode_ll dispatch uses FINISHED_SUM_TAG=1024
+        # and the logic requires num-tokens-sent-from-one-rank-to-another-rank less than it
+        assert self.num_max_dispatch_tokens_per_rank <= 1024
+
+        self.handle = None
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        raise NotImplementedError
+
+    def dispatch_b(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def combine_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        raise NotImplementedError
+
+    def combine_b(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def _get_buffer(self):
+        raise NotImplementedError
+
+
+class _DeepEPDispatcherImplNormal(_DeepEPDispatcherImplBase):
+    def __init__(self, async_finish: bool, **kwargs):
+        super().__init__(**kwargs)
+
+        self.async_finish = async_finish
+        self.src2dst = None
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        topk_idx = topk_idx.to(torch.int64)
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+            # TODO hard code 128 block quant,use fp8 communication
+            hidden_states = sglang_per_token_group_quant_fp8(
+                hidden_states,
+                128,
+                column_major_scales=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+                scale_tma_aligned=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+                scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            )
+        previous_event = Buffer.capture() if self.async_finish else None
+        return hidden_states, topk_idx, topk_weights, previous_event
+
+    def dispatch_b(self, hidden_states, topk_idx, topk_weights, previous_event):
+        (
+            hidden_states,
+            topk_idx,
+            topk_weights,
+            num_recv_tokens_per_expert,
+            event,
+        ) = self._dispatch_core(hidden_states, topk_idx, topk_weights, previous_event)
+        event.current_stream_wait() if self.async_finish else ()
+        return DeepEPNormalOutput(
+            hidden_states, topk_idx, topk_weights, num_recv_tokens_per_expert
+        )
+
+    def _dispatch_core(
+        self,
+        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        previous_event,
+    ):
+        buffer = self._get_buffer()
+        (
+            num_tokens_per_rank,
+            num_tokens_per_rdma_rank,
+            num_tokens_per_expert,
+            is_token_in_rank,
+            previous_event,
+        ) = buffer.get_dispatch_layout(
+            topk_idx,
+            self.num_experts,
+            previous_event=previous_event,
+            async_finish=self.async_finish,
+            allocate_on_comm_stream=previous_event is not None,
+        )
+
+        # FIXME: `handle` should be transmitted with tokens from dispatch to combine.
+        # However, doing this would incur an unknown synchronization error, but keeping
+        # `handle` as a member variable works.
+
+        (
+            recv_x,
+            recv_topk_idx,
+            recv_topk_weights,
+            num_recv_tokens_per_expert,
+            self.handle,
+            event,
+        ) = buffer.dispatch(
+            x,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+            previous_event=previous_event,
+            async_finish=self.async_finish,
+            allocate_on_comm_stream=(previous_event is not None) and self.async_finish,
+            expert_alignment=128 if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM else 1,
+            config=DeepEPConfig.get_instance().normal_dispatch_config,
+        )
+
+        get_global_expert_distribution_recorder().on_deepep_dispatch_normal(
+            num_recv_tokens_per_expert,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+        )
+
+        return (
+            recv_x,
+            recv_topk_idx,
+            recv_topk_weights,
+            num_recv_tokens_per_expert,
+            event,
+        )
+
+    def combine_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        from sglang.srt.layers.moe.ep_moe.kernels import (
+            deepep_post_reorder_triton_kernel,
+        )
+
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM or _use_aiter or _is_npu:
+            output = hidden_states
+        else:
+            if hidden_states.shape[0] > 0:
+                num_tokens = self.src2dst.shape[0] // self.router_topk
+                output = torch.empty(
+                    (num_tokens, hidden_states.shape[1]),
+                    device=hidden_states.device,
+                    dtype=hidden_states.dtype,
+                )
+                deepep_post_reorder_triton_kernel[(num_tokens,)](
+                    hidden_states,
+                    output,
+                    self.src2dst,
+                    topk_idx,
+                    topk_weights,
+                    self.router_topk,
+                    hidden_states.shape[1],
+                    BLOCK_SIZE=512,
+                )
+            else:
+                output = torch.zeros(
+                    (0, hidden_states.shape[1]),
+                    device=hidden_states.device,
+                    dtype=hidden_states.dtype,
+                )
+        previous_event = Buffer.capture() if self.async_finish else None
+        return output, previous_event
+
+    def combine_b(self, output, previous_event):
+        hidden_states, event = self._combine_core(output, previous_event)
+        event.current_stream_wait() if self.async_finish else ()
+        self.handle = None
+        self.src2dst = None
+        return hidden_states
+
+    def _combine_core(self, x: torch.Tensor, previous_event):
+        buffer = self._get_buffer()
+        combined_x, _, event = buffer.combine(
+            x,
+            self.handle,
+            async_finish=self.async_finish,
+            previous_event=previous_event,
+            allocate_on_comm_stream=previous_event is not None,
+            config=DeepEPConfig.get_instance().normal_combine_config,
+        )
+        return combined_x, event
+
+    def _get_buffer(self):
+        DeepEPBuffer.set_dispatch_mode_as_normal()
+
+        return DeepEPBuffer.get_deepep_buffer(
+            self.group,
+            self.hidden_size,
+            self.params_bytes,
+            self.deepep_mode,
+            self.num_max_dispatch_tokens_per_rank,
+            self.num_experts,
+        )
+
+
+class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
+    def __init__(self, return_recv_hook: bool, **kwargs):
+        super().__init__(**kwargs)
+
+        """
+        num_max_dispatch_tokens_per_rank: the actual batch size in the decoding engine should be less than 256
+        https://github.com/deepseek-ai/DeepEP?tab=readme-ov-file#example-use-in-inference-decoding
+        """
+        self.return_recv_hook = return_recv_hook
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        buffer = self._get_buffer()
+        topk_idx = topk_idx.to(torch.int64)
+        expected_m = (
+            hidden_states.shape[0] * buffer.group_size * topk_idx.shape[1]
+            + self.num_experts
+        ) // self.num_experts
+        hidden_states, masked_m, event, hook = self._dispatch_core(
+            hidden_states,
+            topk_idx,
+            use_fp8=True,
+        )
+        return (
+            hidden_states,
+            topk_idx,
+            topk_weights,
+            masked_m,
+            expected_m,
+            event,
+            hook,
+        )
+
+    def dispatch_b(
+        self,
+        hidden_states,
+        topk_idx,
+        topk_weights,
+        masked_m,
+        expected_m,
+        event,
+        hook,
+    ):
+        hook() if self.return_recv_hook else event.current_stream_wait()
+
+        get_global_expert_distribution_recorder().on_deepep_dispatch_low_latency(
+            masked_m
+        )
+
+        deepep_output = DeepEPLLOutput(
+            hidden_states,
+            topk_idx,
+            topk_weights,
+            masked_m,
+            expected_m,
+        )
+        return deepep_output
+
+    def _dispatch_core(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        use_fp8: bool = False,
+    ):
+        buffer = self._get_buffer()
+        packed_recv_hidden, packed_recv_count, self.handle, event, hook = (
+            buffer.low_latency_dispatch(
+                hidden_states,
+                topk_idx,
+                self.num_max_dispatch_tokens_per_rank,
+                self.num_experts,
+                use_fp8=use_fp8,
+                async_finish=not self.return_recv_hook,
+                return_recv_hook=self.return_recv_hook,
+                round_scale=deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                and deep_gemm_wrapper.DEEPGEMM_BLACKWELL,
+                use_ue8m0=deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                and deep_gemm_wrapper.DEEPGEMM_BLACKWELL,
+            )
+        )
+        return packed_recv_hidden, packed_recv_count, event, hook
+
+    def combine_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        hidden_states, event, hook = self._combine_core(
+            hidden_states,
+            topk_idx,
+            topk_weights,
+        )
+        return hidden_states, event, hook
+
+    def combine_b(self, hidden_states, event, hook):
+        hook() if self.return_recv_hook else event.current_stream_wait()
+        return hidden_states
+
+    def _combine_core(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        buffer = self._get_buffer()
+        combined_hidden_states, event, hook = buffer.low_latency_combine(
+            hidden_states,
+            topk_idx,
+            topk_weights,
+            self.handle,
+            async_finish=not self.return_recv_hook,
+            return_recv_hook=self.return_recv_hook,
+        )
+        self.handle = None
+        return combined_hidden_states, event, hook
+
+    def _get_buffer(self):
+        DeepEPBuffer.set_dispatch_mode_as_low_latency()
+        return DeepEPBuffer.get_deepep_buffer(
+            self.group,
+            self.hidden_size,
+            self.params_bytes,
+            self.deepep_mode,
+            self.num_max_dispatch_tokens_per_rank,
+            self.num_experts,
+        )
+
+
+@dataclass
+class _Stage(Enum):
+    INITIAL = auto()
+    AFTER_DISPATCH_A = auto()
+    AFTER_DISPATCH_B = auto()
+    AFTER_COMBINE_A = auto()
+
+
+class DeepEPDispatcher(BaseDispatcher):
+    def __init__(
+        self,
+        group: torch.distributed.ProcessGroup,
+        router_topk: int,
+        permute_fusion: bool = False,
+        num_experts: int = None,
+        num_local_experts: int = None,
+        hidden_size: int = None,
+        params_dtype: torch.dtype = None,
+        deepep_mode: DeepEPMode = DeepEPMode.AUTO,
+        async_finish: bool = False,
+        return_recv_hook: bool = False,
+    ):
+        self.deepep_mode = deepep_mode
+
+        common_kwargs = dict(
+            group=group,
+            router_topk=router_topk,
+            permute_fusion=permute_fusion,
+            num_experts=num_experts,
+            num_local_experts=num_local_experts,
+            hidden_size=hidden_size,
+            params_dtype=params_dtype,
+            deepep_mode=deepep_mode,
+        )
+
+        if self.deepep_mode.enable_low_latency():
+            self._low_latency_dispatcher = _DeepEPDispatcherImplLowLatency(
+                return_recv_hook=return_recv_hook,
+                **common_kwargs,
+            )
+        if self.deepep_mode.enable_normal():
+            self._normal_dispatcher = _DeepEPDispatcherImplNormal(
+                async_finish=async_finish,
+                **common_kwargs,
+            )
+
+        self._stage = _Stage.INITIAL
+
+    def dispatch(self, *args, **kwargs) -> DispatchOutput:
+        self.dispatch_a(*args, **kwargs)
+        ret = self.dispatch_b()
+        return ret
+
+    def dispatch_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        self._update_stage(_Stage.INITIAL, _Stage.AFTER_DISPATCH_A)
+        inner_state = self._get_impl(forward_batch).dispatch_a(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+        )
+        self._dispatch_intermediate_state = forward_batch, inner_state
+
+    def dispatch_b(self):
+        self._update_stage(_Stage.AFTER_DISPATCH_A, _Stage.AFTER_DISPATCH_B)
+        forward_batch, inner_state = self._dispatch_intermediate_state
+        del self._dispatch_intermediate_state
+        return self._get_impl(forward_batch).dispatch_b(*inner_state)
+
+    def combine(self, *args, **kwargs) -> Tuple:
+        self.combine_a(*args, **kwargs)
+        ret = self.combine_b()
+        return ret
+
+    def combine_a(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A)
+        inner_state = self._get_impl(forward_batch).combine_a(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+        )
+        self._combine_intermediate_state = forward_batch, inner_state
+
+    def combine_b(self):
+        self._update_stage(_Stage.AFTER_COMBINE_A, _Stage.INITIAL)
+        forward_batch, inner_state = self._combine_intermediate_state
+        del self._combine_intermediate_state
+        return self._get_impl(forward_batch).combine_b(*inner_state)
+
+    def _get_impl(self, forward_batch: ForwardBatch) -> _DeepEPDispatcherImplBase:
+        resolved_deepep_mode = self.deepep_mode.resolve(
+            forward_batch.is_extend_in_batch
+        )
+        if resolved_deepep_mode == DeepEPMode.NORMAL:
+            return self._normal_dispatcher
+        elif resolved_deepep_mode == DeepEPMode.LOW_LATENCY:
+            return self._low_latency_dispatcher
+        else:
+            raise ValueError(f"Invalid deepep_mode: {self.deepep_mode}")
+
+    def _update_stage(self, old_stage, new_stage):
+        assert self._stage == old_stage
+        self._stage = new_stage
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/standard.py b/python/sglang/srt/layers/moe/token_dispatcher/standard.py
new file mode 100644
index 000000000..f984104f6
--- /dev/null
+++ b/python/sglang/srt/layers/moe/token_dispatcher/standard.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, NamedTuple
+
+import torch
+
+from sglang.srt.layers.moe.token_dispatcher.base import (
+    BaseDispatcher,
+    CombineInput,
+    CombineInputFormat,
+    DispatchOutput,
+    DispatchOutputFormat,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.topk import TopKOutput
+
+
+class StandardDispatchOutput(NamedTuple):
+    """Standard dispatch output."""
+
+    hidden_states: torch.Tensor
+    topk_output: TopKOutput
+
+    @property
+    def format(self) -> DispatchOutputFormat:
+        return DispatchOutputFormat.STANDARD
+
+
+assert isinstance(StandardDispatchOutput, DispatchOutput)
+
+
+class StandardCombineInput(NamedTuple):
+    """Standard combine input."""
+
+    hidden_states: torch.Tensor
+
+    @property
+    def format(self) -> CombineInputFormat:
+        return CombineInputFormat.STANDARD
+
+
+assert isinstance(StandardCombineInput, CombineInput)
+
+
+class StandardDispatcher(BaseDispatcher):
+
+    def dispatch(
+        self, hidden_states: torch.Tensor, topk_output: TopKOutput
+    ) -> DispatchOutput:
+        return StandardDispatchOutput(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+
+    def combine(self, combine_input: CombineInput) -> torch.Tensor:
+        if isinstance(combine_input, StandardCombineInput):
+            return combine_input.hidden_states
+        else:
+            # TODO: this branch should be removed in the future
+            assert isinstance(combine_input, torch.Tensor)
+            return combine_input
diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py
new file mode 100644
index 000000000..b8f73473c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/topk.py
@@ -0,0 +1,876 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+import logging
+import math
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import (
+    Callable,
+    NamedTuple,
+    Optional,
+    Protocol,
+    TypeGuard,
+    runtime_checkable,
+)
+
+import torch
+import torch.nn.functional as F
+
+from sglang.srt.custom_op import CustomOp
+from sglang.srt.eplb import expert_location_dispatch
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location_dispatch import (
+    ExpertLocationDispatchInfo,
+    topk_ids_logical_to_physical,
+)
+from sglang.srt.layers.moe import (
+    get_moe_runner_backend,
+    should_use_flashinfer_trtllm_moe,
+)
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_compiler_backend,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+)
+
+try:
+    from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx, routing
+except ImportError:
+    pass
+logger = logging.getLogger(__name__)
+
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_npu = is_npu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _is_cuda:
+    from sgl_kernel import moe_fused_gate
+
+if _is_cuda or _is_hip:
+    from sgl_kernel import topk_softmax
+if _use_aiter:
+    try:
+        from aiter import biased_grouped_topk as aiter_biased_grouped_topk
+    except ImportError:
+        raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+if _is_npu:
+    import torch_npu
+
+# -------------------------------- TopKConfig ---------------------------------------
+
+
+@dataclass
+class TopKConfig:
+    top_k: int
+    use_grouped_topk: bool = False
+    topk_group: Optional[int] = None
+    num_expert_group: Optional[int] = None
+    renormalize: bool = True
+    num_fused_shared_experts: int = 0
+    custom_routing_function: Optional[Callable] = None
+    correction_bias: Optional[torch.Tensor] = None
+    torch_native: bool = False
+    routed_scaling_factor: Optional[float] = None
+    apply_routed_scaling_factor_on_output: bool = False
+
+
+# -------------------------------- TopKOutput ---------------------------------------
+
+
+class TopKOutputChecker:
+
+    @staticmethod
+    def format_is_standard(topk_output: TopKOutput) -> TypeGuard[StandardTopKOutput]:
+        return topk_output.format.is_standard()
+
+    @staticmethod
+    def format_is_triton_kernel(
+        topk_output: TopKOutput,
+    ) -> TypeGuard[TritonKernelTopKOutput]:
+        return topk_output.format.is_triton_kernel()
+
+    @staticmethod
+    def format_is_bypassed(topk_output: TopKOutput) -> TypeGuard[BypassedTopKOutput]:
+        return topk_output.format.is_bypassed()
+
+
+class TopKOutputFormat(Enum):
+    STANDARD = auto()
+    TRITON_KERNEL = auto()
+    BYPASSED = auto()
+
+    def is_standard(self) -> bool:
+        return self == TopKOutputFormat.STANDARD
+
+    def is_triton_kernel(self) -> bool:
+        return self == TopKOutputFormat.TRITON_KERNEL
+
+    def is_bypassed(self) -> bool:
+        return self == TopKOutputFormat.BYPASSED
+
+
+@runtime_checkable
+class TopKOutput(Protocol):
+    """Protocol for top-k outputs in different formats."""
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        """The format of the output."""
+        ...
+
+
+class StandardTopKOutput(NamedTuple):
+    """Standard top-k output format."""
+
+    topk_weights: torch.Tensor
+    topk_ids: torch.Tensor
+    router_logits: torch.Tensor
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        return TopKOutputFormat.STANDARD
+
+
+class TritonKernelTopKOutput(NamedTuple):
+    """Triton kernel top-k output format."""
+
+    routing_data: RoutingData
+    gather_indx: GatherIndx
+    scatter_indx: ScatterIndx
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        return TopKOutputFormat.TRITON_KERNEL
+
+
+class BypassedTopKOutput(NamedTuple):
+    """Bypassed top-k output format."""
+
+    hidden_states: torch.Tensor
+    router_logits: torch.Tensor
+    topk_config: TopKConfig
+    num_token_non_padded: Optional[torch.Tensor] = None
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        return TopKOutputFormat.BYPASSED
+
+
+# -------------------------------- TopK ---------------------------------------
+
+
+class TopK(CustomOp):
+
+    def __init__(
+        self,
+        top_k: int,
+        *,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        renormalize: bool = True,
+        num_fused_shared_experts: int = 0,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        correction_bias: Optional[torch.Tensor] = None,
+        routed_scaling_factor: Optional[float] = None,
+        apply_routed_scaling_factor_on_output: Optional[bool] = False,
+        force_topk: bool = False,
+    ):
+        # NOTE: scoring_func is not used for now, but we keep it for future use
+        # see https://github.com/sgl-project/sglang/pull/4505 for more details
+        super().__init__()
+
+        if use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+
+        self.topk_config = TopKConfig(
+            top_k=top_k,
+            use_grouped_topk=use_grouped_topk,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+        )
+
+        self.use_triton_kernels = get_moe_runner_backend().is_triton_kernel()
+        self.force_topk = force_topk
+
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+        self.topk_config.torch_native = True
+        return select_experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            topk_config=self.topk_config,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
+        )
+
+    def forward_cuda(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+        if self.use_triton_kernels:
+            # renormalize=True is equivalent to sm_first=False
+            routing_data, gather_idx, scatter_idx = routing(
+                router_logits,
+                self.topk_config.top_k,
+                sm_first=not self.topk_config.renormalize,
+            )
+            return TritonKernelTopKOutput(routing_data, gather_idx, scatter_idx)
+        elif not self.force_topk and (
+            should_use_flashinfer_trtllm_moe()
+            or get_moe_runner_backend().is_flashinfer_mxfp4()
+        ):
+            return BypassedTopKOutput(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                topk_config=self.topk_config,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
+            )
+        else:
+            self.topk_config.torch_native = False
+            return select_experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                topk_config=self.topk_config,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
+            )
+
+    def forward_cpu(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+        return select_experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            topk_config=self.topk_config,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
+        )
+
+    def forward_npu(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+        global_num_experts = router_logits.shape[-1]
+
+        # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
+        if global_num_experts == 256:
+
+            routed_scaling_factor = self.topk_config.routed_scaling_factor or 1
+            router_logits = router_logits.to(torch.float32)
+
+            topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
+                router_logits,
+                k=self.topk_config.top_k,
+                bias=self.topk_config.correction_bias.to(torch.float32),
+                k_group=self.topk_config.topk_group,
+                group_count=self.topk_config.num_expert_group,
+                group_select_mode=1,
+                renorm=0,
+                norm_type=1,
+                routed_scaling_factor=routed_scaling_factor,
+                eps=float(1e-20),
+            )
+
+            if self.topk_config.renormalize:
+                topk_weights_sum = (
+                    topk_weights.sum(dim=-1, keepdim=True)
+                    if self.topk_config.num_fused_shared_experts == 0
+                    else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+                )
+                topk_weights = topk_weights / topk_weights_sum
+
+            if expert_location_dispatch_info is not None:
+                topk_ids = topk_ids_logical_to_physical(
+                    topk_ids, expert_location_dispatch_info
+                )
+            get_global_expert_distribution_recorder().on_select_experts(
+                topk_ids=topk_ids
+            )
+
+            return StandardTopKOutput(topk_weights, topk_ids, _)
+        else:
+            self.topk_config.torch_native = True
+            return select_experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                topk_config=self.topk_config,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
+            )
+
+    def empty_topk_output(self, device: torch.device) -> TopKOutput:
+        topk = self.topk_config.top_k - self.topk_config.num_fused_shared_experts
+        topk_weights = torch.empty((0, topk), dtype=torch.float32, device=device)
+        topk_idx = torch.full((0, topk), -1, dtype=torch.int32, device=device)
+        router_logits = torch.empty((0, topk), dtype=torch.float32, device=device)
+        return StandardTopKOutput(topk_weights, topk_idx, router_logits)
+
+
+# ------------------------------- TopK implementation -------------------------------------
+
+
+def fused_topk_torch_native(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    correction_bias: torch.Tensor = None,
+):
+    if correction_bias is not None:
+        n_routed_experts = gating_output.shape[-1]
+        scores = gating_output.softmax(dim=-1)
+        scores_for_choice = scores.view(
+            -1, n_routed_experts
+        ) + correction_bias.unsqueeze(0)
+        topk_ids = torch.topk(scores_for_choice, k=topk, dim=-1, sorted=False)[1]
+        topk_weights = scores.gather(1, topk_ids)
+    else:
+        assert (
+            hidden_states.shape[0] == gating_output.shape[0]
+        ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}"
+        M, _ = hidden_states.shape
+        topk_weights = torch.empty(
+            M, topk, dtype=torch.float32, device=hidden_states.device
+        )
+        topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+        topk_weights = F.softmax(gating_output.float(), dim=-1)
+        topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
+def fused_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    correction_bias: torch.Tensor = None,
+):
+    topk_weights, topk_ids = torch.ops.sgl_kernel.topk_softmax_cpu(
+        hidden_states=hidden_states,
+        gating_output=gating_output,
+        topk=topk,
+        renormalize=renormalize,
+    )
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
+
+
+def apply_topk_weights_cpu(need_apply, topk_weights, inputs):
+    if not need_apply:
+        return inputs, topk_weights
+
+    # TODO: fuse below processing in fused_experts_cpu kernel
+    inputs = inputs * topk_weights.to(inputs.dtype)
+    topk_weights = torch.ones_like(
+        topk_weights, dtype=torch.float32
+    )  # clear topk_weights as already applied
+
+    return inputs, topk_weights
+
+
+def fused_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    M, _ = hidden_states.shape
+
+    topk_weights = torch.empty(
+        M, topk, dtype=torch.float32, device=hidden_states.device
+    )
+    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+
+    topk_softmax(
+        topk_weights,
+        topk_ids,
+        gating_output,
+        renormalize,
+    )
+
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
+
+
+# This is used by the Deepseek V2/V3/R1 series models
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def grouped_topk_gpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = torch.softmax(gating_output, dim=-1)
+    # NPU compiler limitation
+    if _is_npu and scores.dtype == torch.bfloat16:
+        scores = scores.to(torch.float16)
+    num_token = scores.shape[0]
+    num_experts = scores.shape[1]
+    group_scores = (
+        scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    # TODO: NPU can't support directly evaluating a comparison for now
+    topk_weights, topk_ids = torch.topk(
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
+    )
+    if num_fused_shared_experts:
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + num_fused_shared_experts,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
+        topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
+
+    if renormalize:
+        topk_weights_sum = (
+            topk_weights.sum(dim=-1, keepdim=True)
+            if num_fused_shared_experts == 0
+            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+        )
+        topk_weights = topk_weights / topk_weights_sum
+        if apply_routed_scaling_factor_on_output:
+            topk_weights *= routed_scaling_factor
+
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
+
+
+def grouped_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert not apply_routed_scaling_factor_on_output
+    assert expert_location_dispatch_info is None
+    return torch.ops.sgl_kernel.grouped_topk_cpu(
+        hidden_states,
+        gating_output,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def biased_grouped_topk_impl(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = gating_output.sigmoid()
+    num_token = scores.shape[0]
+    num_experts = scores.shape[1]
+    scores_for_choice = scores.view(num_token, -1) + correction_bias.unsqueeze(0)
+    group_scores = (
+        scores_for_choice.view(num_token, num_expert_group, -1)
+        .topk(2, dim=-1)[0]
+        .sum(dim=-1)
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores_for_choice.masked_fill(
+        ~score_mask.bool(), float("-inf")
+    )  # [n, e]
+    # TODO: NPU can't support directly evaluating a comparison for now
+    _, topk_ids = torch.topk(
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
+    )
+    topk_weights = scores.gather(1, topk_ids)
+
+    if num_fused_shared_experts:
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + num_fused_shared_experts,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
+        topk_weights[:, -1] = topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
+
+    if renormalize:
+        topk_weights_sum = (
+            topk_weights.sum(dim=-1, keepdim=True)
+            if num_fused_shared_experts == 0
+            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+        )
+        topk_weights = topk_weights / topk_weights_sum
+        if apply_routed_scaling_factor_on_output:
+            topk_weights *= routed_scaling_factor
+
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
+
+
+def is_power_of_two(n):
+    return n > 0 and math.log2(n).is_integer()
+
+
+def _mask_topk_ids_padded_region(
+    topk_ids: torch.Tensor,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+):
+    if num_token_non_padded is None:
+        return
+    indices = torch.arange(0, topk_ids.shape[0], device=topk_ids.device)
+    topk_ids[indices >= num_token_non_padded, :] = -1
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _biased_grouped_topk_postprocess(
+    topk_ids, expert_location_dispatch_info, num_token_non_padded
+):
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_ids
+
+
+def biased_grouped_topk_gpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert (
+        routed_scaling_factor is not None
+    ), "routed_scaling_factor is required for biased_grouped_topk"
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
+    if (
+        _is_cuda
+        and gating_output.shape[1] // num_expert_group
+        <= 32  # moe_fused_gate kernel ensure that num_experts/num_expert_group does not exceed MAX_VPT=32 now. And when kernel can handle MAX_VPT > 32, we can remove this assertion.
+        and is_power_of_two(correction_bias.shape[0])
+    ):
+        topk_weights, topk_ids = moe_fused_gate(
+            gating_output.to(dtype=torch.float32),
+            correction_bias,
+            num_expert_group,
+            topk_group,
+            topk,
+            num_fused_shared_experts,
+            routed_scaling_factor,
+            apply_routed_scaling_factor_on_output,
+        )
+        # TODO merge into kernel
+        if (expert_location_dispatch_info is not None) or (
+            num_token_non_padded is not None
+        ):
+            topk_ids = _biased_grouped_topk_postprocess(
+                topk_ids, expert_location_dispatch_info, num_token_non_padded
+            )
+        return topk_weights, topk_ids
+    elif _use_aiter:
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        token = gating_output.shape[0]
+        device = gating_output.device
+        assert (
+            hidden_states.shape[0] == gating_output.shape[0]
+        ), f"Number of tokens mismatch: hidden_states.shape[0] = {hidden_states.shape[0]}, gating_output.shape[0] = {gating_output.shape[0]}"
+        topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
+        topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
+        aiter_biased_grouped_topk(
+            gating_output.to(dtype=torch.float32),
+            correction_bias,
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            routed_scaling_factor,
+        )
+        return topk_weights, topk_ids
+    else:
+        return biased_grouped_topk_impl(
+            hidden_states,
+            gating_output,
+            correction_bias,
+            topk,
+            renormalize,
+            num_expert_group,
+            topk_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            routed_scaling_factor=routed_scaling_factor,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
+            apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+        )
+
+
+def biased_grouped_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    compiled: bool = True,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert expert_location_dispatch_info is None
+    assert not apply_routed_scaling_factor_on_output, "Not implemented"
+    return torch.ops.sgl_kernel.biased_grouped_topk_cpu(
+        hidden_states,
+        gating_output,
+        correction_bias,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    )
+
+
+if _is_cpu and _is_cpu_amx_available:
+    biased_grouped_topk = biased_grouped_topk_cpu
+    grouped_topk = grouped_topk_cpu
+    fused_topk_native = fused_topk_cpu
+    fused_topk = fused_topk_cpu
+else:
+    biased_grouped_topk = biased_grouped_topk_gpu
+    grouped_topk = grouped_topk_gpu
+    fused_topk_native = fused_topk_torch_native
+
+
+def select_experts(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    topk_config: TopKConfig,
+    *,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+) -> StandardTopKOutput:
+
+    top_k = topk_config.top_k
+    use_grouped_topk = topk_config.use_grouped_topk
+    topk_group = topk_config.topk_group
+    num_expert_group = topk_config.num_expert_group
+    renormalize = topk_config.renormalize
+    num_fused_shared_experts = topk_config.num_fused_shared_experts
+    custom_routing_function = topk_config.custom_routing_function
+    correction_bias = topk_config.correction_bias
+    torch_native = topk_config.torch_native
+    routed_scaling_factor = topk_config.routed_scaling_factor
+    apply_routed_scaling_factor_on_output = (
+        topk_config.apply_routed_scaling_factor_on_output
+    )
+
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
+    # DeepSeek V2/V3/R1 series models use grouped_top_k
+    if use_grouped_topk:
+        assert topk_group is not None
+        assert num_expert_group is not None
+        if correction_bias is None:
+            topk_weights, topk_ids = grouped_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                num_fused_shared_experts=num_fused_shared_experts,
+                routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
+                apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            )
+        else:
+            topk_weights, topk_ids = biased_grouped_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                correction_bias=correction_bias,
+                topk=top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                num_fused_shared_experts=num_fused_shared_experts,
+                routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
+                apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            )
+    elif torch_native and custom_routing_function is None:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in fused_topk_native"
+        assert expert_location_dispatch_info is None
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        topk_weights, topk_ids = fused_topk_native(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+            correction_bias=correction_bias,
+        )
+    elif custom_routing_function is None:
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        # Qwen3MOE uses fused_topk
+        topk_weights, topk_ids = fused_topk(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
+        )
+    else:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in custom_routing_function"
+        assert expert_location_dispatch_info is None
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        topk_weights, topk_ids = custom_routing_function(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+        )
+
+    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)
+
+    return StandardTopKOutput(topk_weights, topk_ids, router_logits)
diff --git a/python/sglang/srt/layers/moe/utils.py b/python/sglang/srt/layers/moe/utils.py
new file mode 100644
index 000000000..b4e4ec424
--- /dev/null
+++ b/python/sglang/srt/layers/moe/utils.py
@@ -0,0 +1,201 @@
+from __future__ import annotations
+
+import importlib.util
+import logging
+from enum import Enum
+from functools import lru_cache
+from typing import TYPE_CHECKING, Optional
+
+from packaging import version as pkg_version
+
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_size,
+    is_dp_attention_enabled,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+class MoeA2ABackend(Enum):
+
+    NONE = "none"
+    DEEPEP = "deepep"
+
+    @classmethod
+    def _missing_(cls, value):
+        if value is None:
+            return cls.NONE
+        for member in cls:
+            if value == member.value:
+                return member
+        raise ValueError(f"No {cls.__name__} member for value {value}")
+
+    def is_none(self):
+        return self == MoeA2ABackend.NONE
+
+    def is_deepep(self):
+        return self == MoeA2ABackend.DEEPEP
+
+
+class MoeRunnerBackend(Enum):
+
+    AUTO = "auto"
+    TRITON = "triton"
+    TRITON_KERNEL = "triton_kernel"
+    FLASHINFER = "flashinfer_trtllm"
+    FLASHINFER_CUTLASS = "flashinfer_cutlass"
+    FLASHINFER_MXFP4 = "flashinfer_mxfp4"
+
+    def is_auto(self):
+        return self == MoeRunnerBackend.AUTO
+
+    def is_triton(self):
+        return self == MoeRunnerBackend.TRITON
+
+    def is_triton_kernel(self):
+        return self == MoeRunnerBackend.TRITON_KERNEL
+
+    def is_flashinfer_trtllm(self):
+        return self == MoeRunnerBackend.FLASHINFER
+
+    def is_flashinfer_cutlass(self):
+        return self == MoeRunnerBackend.FLASHINFER_CUTLASS
+
+    def is_flashinfer_mxfp4(self):
+        return self == MoeRunnerBackend.FLASHINFER_MXFP4
+
+
+class DeepEPMode(Enum):
+
+    NORMAL = "normal"
+    LOW_LATENCY = "low_latency"
+    AUTO = "auto"
+
+    def enable_normal(self) -> bool:
+        return self in [DeepEPMode.NORMAL, DeepEPMode.AUTO]
+
+    def enable_low_latency(self) -> bool:
+        return self in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]
+
+    def resolve(self, is_extend_in_batch: bool) -> DeepEPMode:
+        if self != DeepEPMode.AUTO:
+            return self
+
+        if is_extend_in_batch:
+            return DeepEPMode.NORMAL
+        else:
+            return DeepEPMode.LOW_LATENCY
+
+    def is_normal(self) -> bool:
+        return self == DeepEPMode.NORMAL
+
+    def is_low_latency(self) -> bool:
+        return self == DeepEPMode.LOW_LATENCY
+
+    def is_auto(self) -> bool:
+        return self == DeepEPMode.AUTO
+
+
+MOE_A2A_BACKEND: Optional[MoeA2ABackend] = None
+MOE_RUNNER_BACKEND: Optional[MoeRunnerBackend] = None
+DEEPEP_MODE: Optional[DeepEPMode] = None
+IS_TBO_ENABLED: Optional[bool] = None
+TBO_TOKEN_DISTRIBUTION_THRESHOLD: Optional[float] = None
+DEEPEP_CONFIG: Optional[str] = None
+DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER: Optional[bool] = None
+
+
+def initialize_moe_config(server_args: ServerArgs):
+    global MOE_A2A_BACKEND
+    global MOE_RUNNER_BACKEND
+    global DEEPEP_MODE
+    global DEEPEP_CONFIG
+    global IS_TBO_ENABLED
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    global DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+
+    MOE_A2A_BACKEND = MoeA2ABackend(server_args.moe_a2a_backend)
+    MOE_RUNNER_BACKEND = MoeRunnerBackend(server_args.moe_runner_backend)
+    DEEPEP_MODE = DeepEPMode(server_args.deepep_mode)
+    DEEPEP_CONFIG = server_args.deepep_config or ""
+    IS_TBO_ENABLED = server_args.enable_two_batch_overlap
+    TBO_TOKEN_DISTRIBUTION_THRESHOLD = server_args.tbo_token_distribution_threshold
+    DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER = (
+        server_args.disable_flashinfer_cutlass_moe_fp4_allgather
+    )
+
+
+def get_moe_a2a_backend() -> MoeA2ABackend:
+    global MOE_A2A_BACKEND
+    if MOE_A2A_BACKEND is None:
+        logger.warning("MOE_A2A_BACKEND is not initialized, using default backend")
+        MOE_A2A_BACKEND = MoeA2ABackend.NONE
+    return MOE_A2A_BACKEND
+
+
+def get_moe_runner_backend() -> MoeRunnerBackend:
+    global MOE_RUNNER_BACKEND
+    if MOE_RUNNER_BACKEND is None:
+        logger.warning("MOE_RUNNER_BACKEND is not initialized, using triton backend")
+        MOE_RUNNER_BACKEND = MoeRunnerBackend.AUTO
+    return MOE_RUNNER_BACKEND
+
+
+def get_deepep_mode() -> DeepEPMode:
+    global DEEPEP_MODE
+    if DEEPEP_MODE is None:
+        logger.warning("DEEPEP_MODE is not initialized, using auto mode")
+        DEEPEP_MODE = DeepEPMode.AUTO
+    return DEEPEP_MODE
+
+
+def get_deepep_config() -> str:
+    global DEEPEP_CONFIG
+    if DEEPEP_CONFIG is None:
+        logger.warning("DEEPEP_CONFIG is not initialized, using default config")
+        DEEPEP_CONFIG = ""
+    return DEEPEP_CONFIG
+
+
+def is_tbo_enabled() -> bool:
+    global IS_TBO_ENABLED
+    if IS_TBO_ENABLED is None:
+        IS_TBO_ENABLED = False
+    return IS_TBO_ENABLED
+
+
+def get_tbo_token_distribution_threshold() -> float:
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    if TBO_TOKEN_DISTRIBUTION_THRESHOLD is None:
+        logger.warning(
+            "TBO_TOKEN_DISTRIBUTION_THRESHOLD is not initialized, using 0.48"
+        )
+        TBO_TOKEN_DISTRIBUTION_THRESHOLD = 0.48
+    return TBO_TOKEN_DISTRIBUTION_THRESHOLD
+
+
+@lru_cache(maxsize=1)
+def should_use_flashinfer_trtllm_moe():
+    result = get_moe_runner_backend().is_flashinfer_trtllm() and (
+        not importlib.util.find_spec("flashinfer")
+        or pkg_version.parse(__import__("flashinfer").__version__)
+        >= pkg_version.parse("0.2.9rc1")
+    )
+    return result
+
+
+@lru_cache(maxsize=1)
+def should_use_flashinfer_cutlass_moe_fp4_allgather():
+    """
+    Perform FP4 quantize before all-gather for flashinfer cutlass moe to reduce communication cost for high-throughput serving.
+    """
+    return (
+        not DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+        and get_moe_runner_backend().is_flashinfer_cutlass()
+        and is_dp_attention_enabled()
+        and get_moe_expert_parallel_world_size() == get_attention_dp_size()
+    )
diff --git a/python/sglang/srt/layers/multimodal.py b/python/sglang/srt/layers/multimodal.py
new file mode 100644
index 000000000..738c65830
--- /dev/null
+++ b/python/sglang/srt/layers/multimodal.py
@@ -0,0 +1,189 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logits processing."""
+
+import torch
+import triton
+import triton.language as tl
+
+FMIX32_C1 = 0x85EBCA6B
+FMIX32_C2 = 0xC2B2AE35
+POS_C1 = 0x27D4EB2D
+POS_C2 = 0x165667B1
+
+
+@triton.jit
+def _rotl32(x, r: tl.constexpr):
+    return (x << r) | (x >> (32 - r))
+
+
+@triton.jit
+def _fmix32(x, C1: tl.constexpr, C2: tl.constexpr):
+    c1 = tl.full((), C1, tl.uint32)
+    c2 = tl.full((), C2, tl.uint32)
+    x ^= x >> 16
+    x = x * c1
+    x ^= x >> 13
+    x = x * c2
+    x ^= x >> 16
+    return x
+
+
+@triton.jit
+def hash_tiles32_kernel_blocked(
+    in_ptr,
+    out_ptr,
+    n_u32,
+    seed1,
+    seed2,
+    FM_C1: tl.constexpr,
+    FM_C2: tl.constexpr,
+    POS_A: tl.constexpr,
+    POS_B: tl.constexpr,
+    TILE: tl.constexpr,
+    BLOCK: tl.constexpr,
+    USE_CG: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    base = pid * TILE
+
+    s1 = tl.full((), seed1, tl.uint32)
+    s2 = tl.full((), seed2, tl.uint32)
+    posA = tl.full((), POS_A, tl.uint32)
+    posB = tl.full((), POS_B, tl.uint32)
+
+    h1 = tl.zeros((), dtype=tl.uint32)
+    h2 = tl.zeros((), dtype=tl.uint32)
+
+    for off in tl.static_range(0, TILE, BLOCK):
+        idx = base + off + tl.arange(0, BLOCK)
+        m = idx < n_u32
+
+        if USE_CG:
+            v = tl.load(in_ptr + idx, mask=m, other=0, cache_modifier=".cg")
+        else:
+            v = tl.load(in_ptr + idx, mask=m, other=0)
+        v = v.to(tl.uint32)
+
+        iu = idx.to(tl.uint32)
+        p1 = (iu * posA + s1) ^ _rotl32(iu, 15)
+        p2 = (iu * posB + s2) ^ _rotl32(iu, 13)
+
+        k1 = _fmix32(v ^ p1, C1=FM_C1, C2=FM_C2)
+        k2 = _fmix32(v ^ p2, C1=FM_C1, C2=FM_C2)
+
+        zero32 = tl.zeros_like(k1)
+        k1 = tl.where(m, k1, zero32)
+        k2 = tl.where(m, k2, zero32)
+
+        h1 += tl.sum(k1, axis=0).to(tl.uint32)
+        h2 += tl.sum(k2, axis=0).to(tl.uint32)
+
+    nbytes = tl.full((), n_u32 * 4, tl.uint32)
+    h1 ^= nbytes
+    h2 ^= nbytes
+    h1 = _fmix32(h1, C1=FM_C1, C2=FM_C2)
+    h2 = (
+        _fmix32(h2, C1=FMIX32_C1, C2=FMIX32_C2)
+        if False
+        else _fmix32(h2, C1=FM_C1, C2=FM_C2)
+    )
+
+    out = (h1.to(tl.uint64) << 32) | h2.to(tl.uint64)
+    tl.store(out_ptr + pid, out)
+
+
+@triton.jit
+def add_tree_reduce_u64_kernel(in_ptr, out_ptr, n_elems, CHUNK: tl.constexpr):
+    pid = tl.program_id(axis=0)
+    start = pid * CHUNK
+    h = tl.zeros((), dtype=tl.uint64)
+    for i in tl.static_range(0, CHUNK):
+        idx = start + i
+        m = idx < n_elems
+        v = tl.load(in_ptr + idx, mask=m, other=0).to(tl.uint64)
+        h += v
+    tl.store(out_ptr + pid, h)
+
+
+def _as_uint32_words(t: torch.Tensor) -> torch.Tensor:
+    assert t.is_cuda, "Use .cuda() first"
+    tb = t.contiguous().view(torch.uint8)
+    nbytes = tb.numel()
+    pad = (4 - (nbytes & 3)) & 3
+    if pad:
+        tb_p = torch.empty(nbytes + pad, dtype=torch.uint8, device=tb.device)
+        tb_p[:nbytes].copy_(tb)
+        tb_p[nbytes:].zero_()
+        tb = tb_p
+    return tb.view(torch.uint32)
+
+
+def _final_splitmix64(x: int) -> int:
+    mask = (1 << 64) - 1
+    x &= mask
+    x ^= x >> 30
+    x = (x * 0xBF58476D1CE4E5B9) & mask
+    x ^= x >> 27
+    x = (x * 0x94D049BB133111EB) & mask
+    x ^= x >> 31
+    return x
+
+
+@torch.inference_mode()
+def gpu_tensor_hash(
+    tensor: torch.Tensor,
+    *,
+    seed: int = 0x243F6A88,
+    tile_words: int = 8192,
+    block_words: int = 256,
+    reduce_chunk: int = 1024,
+    num_warps: int = 4,
+    num_stages: int = 4,
+    use_cg: bool = True,
+) -> int:
+    assert tensor.is_cuda, "Use .cuda() first"
+    u32 = _as_uint32_words(tensor)
+    n = u32.numel()
+    if n == 0:
+        return 0
+
+    grid1 = (triton.cdiv(n, tile_words),)
+    partials = torch.empty(grid1[0], dtype=torch.uint64, device=u32.device)
+    hash_tiles32_kernel_blocked[grid1](
+        u32,
+        partials,
+        n,
+        seed1=seed & 0xFFFFFFFF,
+        seed2=((seed * 0x9E3779B1) ^ 0xDEADBEEF) & 0xFFFFFFFF,
+        FM_C1=FMIX32_C1,
+        FM_C2=FMIX32_C2,
+        POS_A=POS_C1,
+        POS_B=POS_C2,
+        TILE=tile_words,
+        BLOCK=block_words,
+        USE_CG=use_cg,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    cur = partials
+    while cur.numel() > 1:
+        n_elems = cur.numel()
+        grid2 = (triton.cdiv(n_elems, reduce_chunk),)
+        nxt = torch.empty(grid2[0], dtype=torch.uint64, device=cur.device)
+        add_tree_reduce_u64_kernel[grid2](cur, nxt, n_elems, CHUNK=reduce_chunk)
+        cur = nxt
+
+    return _final_splitmix64(int(cur.item()))
diff --git a/python/sglang/srt/layers/parameter.py b/python/sglang/srt/layers/parameter.py
new file mode 100644
index 000000000..1ea75d70c
--- /dev/null
+++ b/python/sglang/srt/layers/parameter.py
@@ -0,0 +1,533 @@
+"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/parameter.py"""
+
+import logging
+from fractions import Fraction
+from typing import Callable, Optional, Union
+
+import torch
+from torch.nn import Parameter
+
+from sglang.srt.utils import is_cpu
+
+__all__ = [
+    "BasevLLMParameter",
+    "PackedvLLMParameter",
+    "PerTensorScaleParameter",
+    "ModelWeightParameter",
+    "ChannelQuantScaleParameter",
+    "GroupQuantScaleParameter",
+    "BlockQuantScaleParameter",
+    "PackedColumnParameter",
+    "RowvLLMParameter",
+]
+
+logger = logging.getLogger(__name__)
+
+_is_cpu = is_cpu()
+
+
+class BasevLLMParameter(Parameter):
+    """
+    Base parameter for vLLM linear layers. Extends the torch.nn.parameter
+    by taking in a linear weight loader. Will copy the loaded weight
+    into the parameter when the provided weight loader is called.
+    """
+
+    def __new__(cls, data: torch.Tensor, **kwargs):
+
+        return super().__new__(cls, data=data, requires_grad=False)
+
+    def __init__(self, data: torch.Tensor, weight_loader: Callable):
+        """
+        Initialize the BasevLLMParameter
+
+        :param data: torch tensor with the parameter data
+        :param weight_loader: weight loader callable
+
+        :returns: a torch.nn.parameter
+        """
+
+        self._weight_loader = weight_loader
+
+    @property
+    def weight_loader(self):
+        return self._weight_loader
+
+    def _assert_and_load(self, loaded_weight: torch.Tensor):
+        assert self.data.shape == loaded_weight.shape
+        self.data.copy_(loaded_weight)
+
+    def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
+        self._assert_and_load(loaded_weight)
+
+    def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
+        self._assert_and_load(loaded_weight)
+
+    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        self._assert_and_load(loaded_weight)
+
+    def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        self._assert_and_load(loaded_weight)
+
+
+class _ColumnvLLMParameter(BasevLLMParameter):
+    """
+    Private class defining weight loading functionality
+    (load_merged_column_weight, load_qkv_weight)
+    for parameters being loaded into linear layers with column
+    parallelism. This includes QKV and MLP layers which are
+    not already fused on disk. Requires an output dimension
+    to be defined. Called within the weight loader of
+    each of the column parallel linear layers.
+    """
+
+    def __init__(self, output_dim: int, **kwargs):
+        self._output_dim = output_dim
+        super().__init__(**kwargs)
+
+    @property
+    def output_dim(self):
+        return self._output_dim
+
+    def load_column_parallel_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+    ):
+        if not use_presharded_weights:
+            shard_size = self.data.shape[self.output_dim]
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
+            )
+
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.output_dim,
+                    shard_size,
+                )
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            else:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, tp_rank * shard_size, shard_size
+                )
+
+        assert self.data.shape == loaded_weight.shape
+        self.data.copy_(loaded_weight)
+
+    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+
+        shard_offset = kwargs.get("shard_offset")
+        shard_size = kwargs.get("shard_size")
+        tp_rank = kwargs.get("tp_rank")
+        use_presharded_weights = kwargs.get("use_presharded_weights")
+        if (
+            isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+            and self.packed_dim == self.output_dim
+        ):
+            shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
+                shard_offset=shard_offset, shard_size=shard_size
+            )
+
+        param_data = self.data
+
+        param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+
+        from sglang.srt.model_loader.weight_utils import (
+            narrow_padded_param_and_loaded_weight,
+        )
+
+        if _is_cpu:
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                tp_rank * shard_size,
+                self.output_dim,
+                shard_size,
+                not use_presharded_weights,
+            )
+        else:
+            if not use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, tp_rank * shard_size, shard_size
+                )
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def load_qkv_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+        **kwargs,
+    ):
+
+        shard_offset = kwargs.get("shard_offset")
+        shard_size = kwargs.get("shard_size")
+        shard_id = kwargs.get("shard_id")
+        num_heads = kwargs.get("num_heads")
+
+        if (
+            isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+            and self.output_dim == self.packed_dim
+        ):
+            shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
+                shard_offset=shard_offset, shard_size=shard_size
+            )
+
+        param_data = self.data
+        shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
+        param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+
+        if _is_cpu:
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
+            )
+
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                shard_id * shard_size,
+                self.output_dim,
+                shard_size,
+                not use_presharded_weights,
+            )
+        else:
+            if not use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, shard_id * shard_size, shard_size
+                )
+
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=}, {loaded_weight.shape=}"
+        param_data.copy_(loaded_weight)
+
+
+class RowvLLMParameter(BasevLLMParameter):
+    """
+    Parameter class defining weight_loading functionality
+    (load_row_parallel_weight) for parameters being loaded
+    into linear layers with row parallel functionality.
+    Requires an input_dim to be defined.
+    """
+
+    def __init__(self, input_dim: int, **kwargs):
+        self._input_dim = input_dim
+        super().__init__(**kwargs)
+
+    @property
+    def input_dim(self):
+        return self._input_dim
+
+    def load_row_parallel_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+    ):
+        if not use_presharded_weights:
+            shard_size = self.data.shape[self.input_dim]
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
+            )
+
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.input_dim,
+                    shard_size,
+                )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+
+                return
+            else:
+                loaded_weight = loaded_weight.narrow(
+                    self.input_dim, tp_rank * shard_size, shard_size
+                )
+
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert self.data.shape == loaded_weight.shape
+        self.data.copy_(loaded_weight)
+
+
+class ModelWeightParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for linear layer weights. Uses both column and
+    row parallelism.
+    """
+
+    pass
+
+
+class GroupQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    grouped quantization. Uses both column and row parallelism.
+    """
+
+    pass
+
+
+class ChannelQuantScaleParameter(_ColumnvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    channel-wise quantization. Equivalent to _ColumnvLLMParameter.
+    """
+
+    pass
+
+
+class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    block-wise quantization. Uses both column and row parallelism.
+    """
+
+    pass
+
+
+class PerTensorScaleParameter(BasevLLMParameter):
+    """
+    Parameter class for scales where the number of scales is
+    equivalent to the number of logical matrices in fused linear
+    layers (e.g. for QKV, there are 3 scales loaded from disk).
+    This is relevant to weights with per-tensor quantization.
+    Adds functionality to map the scalers to a shard during
+    weight loading.
+
+    Note: additional parameter manipulation may be handled
+    for each quantization config specifically, within
+    process_weights_after_loading
+    """
+
+    def __init__(self, **kwargs):
+        self.qkv_idxs = {"q": 0, "k": 1, "v": 2}
+        super().__init__(**kwargs)
+
+    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+        if isinstance(shard_id, int):
+            return shard_id
+
+        # if not int, assume shard_id for qkv
+        # map to int and return
+        assert isinstance(shard_id, str)
+        assert shard_id in self.qkv_idxs
+        return self.qkv_idxs[shard_id]
+
+    # For row parallel layers, no sharding needed
+    # load weight into parameter as is
+    def load_row_parallel_weight(self, *args, **kwargs):
+        kwargs.pop("tp_rank", None)
+        kwargs.pop("use_presharded_weights", None)
+        super().load_row_parallel_weight(*args, **kwargs)
+
+    def load_merged_column_weight(self, *args, **kwargs):
+        self._load_into_shard_id(*args, **kwargs)
+
+    def load_qkv_weight(self, *args, **kwargs):
+        self._load_into_shard_id(*args, **kwargs)
+
+    def load_column_parallel_weight(self, *args, **kwargs):
+        kwargs.pop("tp_rank", None)
+        kwargs.pop("use_presharded_weights", None)
+        super().load_row_parallel_weight(*args, **kwargs)
+
+    def _load_into_shard_id(
+        self, loaded_weight: torch.Tensor, shard_id: Union[str, int], **kwargs
+    ):
+        """
+        Slice the parameter data based on the shard id for
+        loading.
+        """
+
+        param_data = self.data
+        shard_id = self._shard_id_as_int(shard_id)
+
+        # AutoFP8 scales do not have a shape
+        # compressed-tensors scales do have a shape
+        if len(loaded_weight.shape) != 0:
+            assert loaded_weight.shape[0] == 1
+            loaded_weight = loaded_weight[0]
+
+        param_data = param_data[shard_id]
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+class PackedColumnParameter(_ColumnvLLMParameter):
+    """
+    Parameter for model parameters which are packed on disk
+    and support column parallelism only. See PackedvLLMParameter
+    for more details on the packed properties.
+    """
+
+    def __init__(
+        self,
+        packed_factor: Union[int, Fraction],
+        packed_dim: int,
+        marlin_tile_size: Optional[int] = None,
+        **kwargs,
+    ):
+        self._packed_factor = packed_factor
+        self._packed_dim = packed_dim
+        self._marlin_tile_size = marlin_tile_size
+        super().__init__(**kwargs)
+
+    @property
+    def packed_dim(self):
+        return self._packed_dim
+
+    @property
+    def packed_factor(self):
+        return self._packed_factor
+
+    @property
+    def marlin_tile_size(self):
+        return self._marlin_tile_size
+
+    def adjust_shard_indexes_for_packing(self, shard_size, shard_offset):
+        return _adjust_shard_indexes_for_packing(
+            shard_size=shard_size,
+            shard_offset=shard_offset,
+            packed_factor=self.packed_factor,
+            marlin_tile_size=self.marlin_tile_size,
+        )
+
+
+class PackedvLLMParameter(ModelWeightParameter):
+    """
+    Parameter for model weights which are packed on disk.
+    Example: GPTQ Marlin weights are int4 or int8, packed into int32.
+    Extends the ModelWeightParameter to take in the
+    packed factor, the packed dimension, and optionally, marlin
+    tile size for marlin kernels. Adjusts the shard_size and
+    shard_offset for fused linear layers model weight loading
+    by accounting for packing and optionally, marlin tile size.
+    """
+
+    def __init__(
+        self,
+        packed_factor: Union[int, Fraction],
+        packed_dim: int,
+        marlin_tile_size: Optional[int] = None,
+        **kwargs,
+    ):
+        self._packed_factor = packed_factor
+        self._packed_dim = packed_dim
+        self._marlin_tile_size = marlin_tile_size
+        super().__init__(**kwargs)
+
+    @property
+    def packed_dim(self):
+        return self._packed_dim
+
+    @property
+    def packed_factor(self):
+        return self._packed_factor
+
+    @property
+    def marlin_tile_size(self):
+        return self._marlin_tile_size
+
+    def adjust_shard_indexes_for_packing(self, shard_size, shard_offset):
+        return _adjust_shard_indexes_for_packing(
+            shard_size=shard_size,
+            shard_offset=shard_offset,
+            packed_factor=self.packed_factor,
+            marlin_tile_size=self.marlin_tile_size,
+        )
+
+
+def permute_param_layout_(
+    param: BasevLLMParameter, input_dim: int, output_dim: int, **kwargs
+) -> BasevLLMParameter:
+    """
+    Permute a parameter's layout to the specified input and output dimensions,
+    useful for forcing the parameter into a known layout, for example, if I need
+    a packed (quantized) weight matrix to be in the layout
+        {input_dim = 0, output_dim = 1, packed_dim = 0}
+    then I can call:
+        permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+    to ensure x is in the correct layout (permuting it to the correct layout if
+    required, asserting if it cannot get it to the correct layout)
+    """
+
+    curr_input_dim = getattr(param, "input_dim", None)
+    curr_output_dim = getattr(param, "output_dim", None)
+
+    if curr_input_dim is None or curr_output_dim is None:
+        assert param.data.dim() == 2, (
+            "permute_param_layout_ only supports 2D parameters when either "
+            "input_dim or output_dim is not set"
+        )
+
+    # if one of the dimensions is not set, set it to the opposite of the other
+    #  we can only do this since we asserted the parameter is 2D above
+    if curr_input_dim is None:
+        assert curr_output_dim is not None, "either input or output dim must be set"
+        curr_input_dim = (curr_output_dim + 1) % 2
+    if curr_output_dim is None:
+        assert curr_input_dim is not None, "either input or output dim must be set"
+        curr_output_dim = (curr_input_dim + 1) % 2
+
+    # create permutation from the current layout to the layout with
+    # self.input_dim at input_dim and self.output_dim at output_dim preserving
+    # other dimensions
+    perm = [
+        i for i in range(param.data.dim()) if i not in [curr_input_dim, curr_output_dim]
+    ]
+    perm.insert(input_dim, curr_input_dim)
+    perm.insert(output_dim, curr_output_dim)
+
+    if "packed_dim" in kwargs:
+        assert (
+            hasattr(param, "packed_dim")
+            and param.packed_dim == perm[kwargs["packed_dim"]]
+        ), "permute_param_layout_ currently doesn't support repacking"
+
+    param.data = param.data.permute(*perm)
+    if hasattr(param, "_input_dim"):
+        param._input_dim = input_dim
+    if hasattr(param, "_output_dim"):
+        param._output_dim = output_dim
+    if "packed_dim" in kwargs and hasattr(param, "_packed_dim"):
+        param._packed_dim = kwargs["packed_dim"]
+
+    return param
+
+
+def _adjust_shard_indexes_for_marlin(shard_size, shard_offset, marlin_tile_size):
+    return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
+
+
+def _adjust_shard_indexes_for_packing(
+    shard_size, shard_offset, packed_factor, marlin_tile_size
+):
+    shard_size = shard_size // packed_factor
+    shard_offset = shard_offset // packed_factor
+    if marlin_tile_size is not None:
+        return _adjust_shard_indexes_for_marlin(
+            shard_size=shard_size,
+            shard_offset=shard_offset,
+            marlin_tile_size=marlin_tile_size,
+        )
+    return shard_size, shard_offset
diff --git a/python/sglang/srt/layers/pooler.py b/python/sglang/srt/layers/pooler.py
new file mode 100644
index 000000000..26bc5899e
--- /dev/null
+++ b/python/sglang/srt/layers/pooler.py
@@ -0,0 +1,112 @@
+# adapted from
+# https://github.com/vllm-project/vllm/blob/82a1b1a82b1fbb454c82a9ef95730b929c9b270c/vllm/model_executor/layers/pooler.py
+
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.activation import get_cross_encoder_activation_function
+from sglang.srt.model_executor.model_runner import ForwardBatch
+
+
+class PoolingType(IntEnum):
+    LAST = 0
+    CLS = 1
+
+
+@dataclass
+class EmbeddingPoolerOutput:
+    embeddings: torch.Tensor
+
+
+class Pooler(nn.Module):
+    """A layer that pools specific information from hidden states.
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Normalizes output if specified.
+    3. Returns structured results as `PoolerOutput`.
+    Attributes:
+        pooling_type: The type of pooling to use (LAST, AVERAGE, MAX).
+        normalize: Whether to normalize the pooled data.
+    """
+
+    def __init__(self, pooling_type: PoolingType, normalize: bool):
+        super().__init__()
+        self.pooling_type = pooling_type
+        self.normalize = normalize
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> EmbeddingPoolerOutput:
+        if self.pooling_type == PoolingType.LAST:
+            last_token_indices = torch.cumsum(forward_batch.extend_seq_lens, dim=0) - 1
+            pooled_data = hidden_states[last_token_indices]
+        elif self.pooling_type == PoolingType.CLS:
+            prompt_lens = forward_batch.extend_seq_lens
+            first_token_flat_indices = torch.zeros_like(prompt_lens)
+            first_token_flat_indices[1:] += torch.cumsum(prompt_lens, dim=0)[:-1]
+            pooled_data = hidden_states[first_token_flat_indices]
+        else:
+            raise ValueError(f"Invalid pooling type: {self.pooling_type}")
+
+        if self.normalize:
+            pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
+
+        return EmbeddingPoolerOutput(embeddings=pooled_data)
+
+
+class CrossEncodingPooler(nn.Module):
+    """A layer that pools specific information from hidden states.
+
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Normalizes output if specified.
+    3. Returns structured results as `EmbeddingPoolerOutput`.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        classifier: nn.Module,
+        pooler: Optional[nn.Module] = None,
+    ):
+        super().__init__()
+        self.classifier = classifier
+        self.pooler = pooler
+        self.default_activation_function = get_cross_encoder_activation_function(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> EmbeddingPoolerOutput:
+        """Pools sentence pair scores from the hidden_states."""
+
+        prompt_lens = forward_batch.extend_seq_lens
+
+        offset = 0
+        pooled_data_lst = []
+        for prompt_len in prompt_lens:
+            pooled_data_i = hidden_states[offset : offset + prompt_len]
+
+            if self.pooler is not None:
+                final_shape_tensor = self.pooler(pooled_data_i, forward_batch)
+            else:
+                final_shape_tensor = self.classifier(pooled_data_i)
+
+            pooled_data_lst.append(final_shape_tensor)
+            offset += prompt_len
+
+        pooled_output = torch.stack(pooled_data_lst)
+
+        if self.pooler is not None:
+            # apply classifier once on the full batch if possible
+            pooled_output = self.classifier(pooled_output)
+
+        scores = self.default_activation_function(pooled_output).squeeze(-1)
+
+        return EmbeddingPoolerOutput(embeddings=scores)
diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py
new file mode 100644
index 000000000..ff3c2b148
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/__init__.py
@@ -0,0 +1,224 @@
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
+from __future__ import annotations
+
+import builtins
+import inspect
+from typing import TYPE_CHECKING, Dict, Optional, Type
+
+import torch
+
+try:
+    from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
+    from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
+    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+        CompressedTensorsW8A8Fp8MoEMethod,
+        CompressedTensorsWNA16MoEMethod,
+    )
+    from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
+    from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
+    from vllm.model_executor.layers.quantization.gguf import GGUFConfig
+    from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+        GPTQMarlin24Config,
+    )
+    from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+    from vllm.model_executor.layers.quantization.qqq import QQQConfig
+    from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
+
+    VLLM_AVAILABLE = True
+except ImportError as e:
+    VLLM_AVAILABLE = False
+    VLLM_IMPORT_ERROR = e
+
+    # Define empty classes as placeholders when vllm is not available
+    class DummyConfig:
+        def override_quantization_method(self, *args, **kwargs):
+            return None
+
+    AQLMConfig = BitsAndBytesConfig = CompressedTensorsConfig = DeepSpeedFPConfig = (
+        ExpertsInt8Config
+    ) = GGUFConfig = GPTQMarlin24Config = MarlinConfig = QQQConfig = Int8TpuConfig = (
+        DummyConfig
+    )
+
+
+from sglang.srt.layers.quantization.awq import AWQConfig, AWQMarlinConfig
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
+from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
+    CompressedTensorsConfig,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8Config
+from sglang.srt.layers.quantization.fpgemm_fp8 import FBGEMMFp8Config
+from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
+from sglang.srt.layers.quantization.modelopt_quant import (
+    ModelOptFp4Config,
+    ModelOptFp8Config,
+)
+from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config
+from sglang.srt.layers.quantization.petit import PetitNvFp4Config
+from sglang.srt.layers.quantization.qoq import QoQConfig
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
+from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
+from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
+from sglang.srt.utils import is_cuda, is_hip, mxfp_supported
+
+_is_mxfp_supported = mxfp_supported()
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.topk import TopKOutput
+
+# Base quantization methods that don't depend on vllm
+BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
+    "fp8": Fp8Config,
+    "blockwise_int8": BlockInt8Config,
+    "modelopt": ModelOptFp8Config,
+    "modelopt_fp4": ModelOptFp4Config,
+    "w8a8_int8": W8A8Int8Config,
+    "w8a8_fp8": W8A8Fp8Config,
+    "awq": AWQConfig,
+    "awq_marlin": AWQMarlinConfig,
+    "gptq": GPTQConfig,
+    "gptq_marlin": GPTQMarlinConfig,
+    "moe_wna16": MoeWNA16Config,
+    "compressed-tensors": CompressedTensorsConfig,
+    "qoq": QoQConfig,
+    "w4afp8": W4AFp8Config,
+    "petit_nvfp4": PetitNvFp4Config,
+    "fbgemm_fp8": FBGEMMFp8Config,
+}
+
+
+if is_cuda():
+    BASE_QUANTIZATION_METHODS.update(
+        {
+            "quark": Mxfp4Config,
+            "mxfp4": Mxfp4Config,
+        }
+    )
+elif _is_mxfp_supported and is_hip():
+    from sglang.srt.layers.quantization.quark.quark import QuarkConfig
+
+    BASE_QUANTIZATION_METHODS.update(
+        {
+            "quark": QuarkConfig,
+            "mxfp4": Mxfp4Config,
+        }
+    )
+# VLLM-dependent quantization methods
+VLLM_QUANTIZATION_METHODS = {
+    "aqlm": AQLMConfig,
+    "deepspeedfp": DeepSpeedFPConfig,
+    "tpu_int8": Int8TpuConfig,
+    "marlin": MarlinConfig,
+    "gguf": GGUFConfig,
+    "gptq_marlin_24": GPTQMarlin24Config,
+    "bitsandbytes": BitsAndBytesConfig,
+    "qqq": QQQConfig,
+    "experts_int8": ExpertsInt8Config,
+}
+
+QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS, **VLLM_QUANTIZATION_METHODS}
+
+
+def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
+    if quantization not in QUANTIZATION_METHODS:
+        raise ValueError(
+            f"Invalid quantization method: {quantization}. "
+            f"Available methods: {list(QUANTIZATION_METHODS.keys())}"
+        )
+    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
+        raise ValueError(
+            f"{quantization} quantization requires some operators from vllm. "
+            f"Please install vllm by `pip install vllm==0.9.0.1`\n"
+            f"Import error: {VLLM_IMPORT_ERROR}"
+        )
+
+    return QUANTIZATION_METHODS[quantization]
+
+
+original_isinstance = builtins.isinstance
+
+
+def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
+    """
+    Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig
+    can recognize sglang layers
+    """
+    if not VLLM_AVAILABLE:
+        return
+
+    if reverse:
+        builtins.isinstance = original_isinstance
+        return
+
+    from vllm.model_executor.layers.fused_moe import FusedMoE
+    from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding,
+    )
+
+    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE
+    from sglang.srt.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding as PatchedVocabParallelEmbedding,
+    )
+
+    def patched_isinstance(obj, classinfo):
+        if classinfo is LinearBase:
+            return original_isinstance(obj, PatchedLinearBase)
+        if classinfo is FusedMoE:
+            return original_isinstance(obj, PatchedFusedMoE)
+        if classinfo is VocabParallelEmbedding:
+            return original_isinstance(obj, PatchedVocabParallelEmbedding)
+        return original_isinstance(obj, classinfo)
+
+    builtins.isinstance = patched_isinstance
+
+
+def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
+    """
+    Monkey patch the apply function of vllm's FusedMoEMethodBase.
+    Convert sglang arguments to vllm arguments.
+    """
+    original_apply = class_obj.apply
+    sig = inspect.signature(original_apply)
+    param_names = list(sig.parameters.keys())
+    has_correction_bias = "e_score_correction_bias" in param_names
+
+    def new_apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        topk_output: TopKOutput,
+        *,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+    ):
+        assert activation == "silu"
+        assert inplace and not no_combine
+
+        kwargs = {
+            "self": self,
+            "layer": layer,
+            "x": x,
+            "topk_output": topk_output,
+        }
+        return original_apply(**kwargs)
+
+    setattr(class_obj, "apply", new_apply)
+
+
+def monkey_patch_quant_configs():
+    """Apply all monkey patches in one place."""
+
+    monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)
+
+
+# Only apply monkey patches if vllm is available
+if VLLM_AVAILABLE:
+    monkey_patch_quant_configs()
diff --git a/python/sglang/srt/layers/quantization/awq.py b/python/sglang/srt/layers/quantization/awq.py
new file mode 100644
index 000000000..9cba60c2b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/awq.py
@@ -0,0 +1,782 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import logging
+import warnings
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+import torch
+
+from sglang.srt.layers.linear import LinearBase, set_weight_attrs
+from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.marlin_utils import (
+    apply_awq_marlin_linear,
+    awq_to_marlin_zero_points,
+    check_marlin_supported,
+    check_marlin_supports_layer,
+    check_moe_marlin_supports_layer,
+    marlin_make_empty_g_idx,
+    marlin_make_workspace,
+    marlin_moe_permute_scales,
+    marlin_permute_scales,
+    moe_awq_to_marlin_zero_points,
+    verify_marlin_supported,
+    verify_marlin_supports_shape,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import get_scalar_types, replace_parameter
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        StandardDispatchOutput,
+        CombineInput,
+    )
+
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+if _is_cuda:
+    from sgl_kernel import (
+        awq_dequantize,
+        awq_marlin_moe_repack,
+        awq_marlin_repack,
+        fused_marlin_moe,
+    )
+
+
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+
+    warnings.warn(f"HIP does not support fused_marlin_moe currently.")
+else:
+    warnings.warn(f"Only CUDA and HIP support AWQ currently.")
+
+logger = logging.getLogger(__name__)
+
+
+ScalarType, scalar_types = get_scalar_types()
+
+
+def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]):
+    return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+class AWQConfig(QuantizationConfig):
+    """Config class for AWQ.
+
+    Reference: https://arxiv.org/abs/2306.00978
+    """
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        zero_point: bool,
+        modules_to_not_convert: Optional[List[str]] = None,
+    ) -> None:
+        super().__init__()
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.zero_point = zero_point
+        self.modules_to_not_convert = modules_to_not_convert or []
+
+        if self.weight_bits != 4:
+            raise ValueError(
+                "Currently, only 4-bit weight quantization is supported for "
+                f"AWQ, but got {self.weight_bits} bits."
+            )
+        self.pack_factor = 32 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return (
+            f"AWQConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"zero_point={self.zero_point}, "
+            f"modules_to_not_convert={self.modules_to_not_convert})"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    def get_name(self) -> str:
+        return "awq"
+
+    def get_supported_act_dtypes(self) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # The AWQ kernel only supports Turing or newer GPUs.
+        return 75
+
+    @staticmethod
+    def get_config_filenames() -> List[str]:
+        return [
+            "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
+            # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
+            "quantize_config.json",
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> AWQConfig:
+        weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
+        group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
+        zero_point = cls.get_from_keys(config, ["zero_point"])
+        modules_to_not_convert = cls.get_from_keys_or(
+            config, ["modules_to_not_convert"], None
+        )
+        return cls(weight_bits, group_size, zero_point, modules_to_not_convert)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[LinearMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                return UnquantizedLinearMethod()
+            return AWQLinearMethod(self)
+        return None
+
+
+class AWQMarlinConfig(QuantizationConfig):
+    """Config class for AWQ Marlin"""
+
+    # num_bits -> type
+    TYPE_MAP = {
+        4: scalar_types.uint4,
+        8: scalar_types.uint8,
+    }
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        zero_point: bool,
+        lm_head_quantized: bool,
+        modules_to_not_convert: Optional[list[str]],
+        full_config: dict[str, Any],
+    ) -> None:
+        super().__init__()
+        self.pack_factor = 32 // weight_bits  # packed into int32
+        self.group_size = group_size
+        self.zero_point = zero_point
+        self.lm_head_quantized = lm_head_quantized
+        self.weight_bits = weight_bits
+        self.modules_to_not_convert = modules_to_not_convert or []
+        self.full_config = full_config
+
+        if self.weight_bits not in self.TYPE_MAP:
+            raise ValueError(
+                f"Unsupported num_bits = {self.weight_bits}. "
+                f"Supported num_bits = {self.TYPE_MAP.keys()}"
+            )
+
+        self.quant_type = self.TYPE_MAP[self.weight_bits]
+
+        verify_marlin_supported(
+            self.quant_type, group_size=self.group_size, has_zp=self.zero_point
+        )
+
+    def __repr__(self) -> str:
+        return (
+            f"AWQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size}, "
+            f"zero_point={self.zero_point}, "
+            f"lm_head_quantized={self.lm_head_quantized}, "
+            f"modules_to_not_convert={self.modules_to_not_convert})"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "awq_marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> AWQMarlinConfig:
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        zero_point = cls.get_from_keys(config, ["zero_point"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        modules_to_not_convert = cls.get_from_keys_or(
+            config, ["modules_to_not_convert"], None
+        )
+        return cls(
+            weight_bits,
+            group_size,
+            zero_point,
+            lm_head_quantized,
+            modules_to_not_convert,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "marlin" or user_quant == "awq_marlin"
+        )
+
+        if can_convert and is_valid_user_quant:
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        if can_convert and user_quant == "awq":
+            logger.info(
+                "Detected that the model can run with awq_marlin"
+                ", however you specified quantization=awq explicitly,"
+                " so forcing awq. Use quantization=awq_marlin for"
+                " faster inference"
+            )
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
+            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                return UnquantizedLinearMethod()
+            # Check if the layer is supported by AWQMarlin.
+            if not check_marlin_supports_layer(layer, self.group_size):
+                logger.warning_once(
+                    "Layer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.",  # noqa: E501
+                    prefix,
+                )
+                return AWQConfig.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            return AWQMarlinLinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+
+            if not check_moe_marlin_supports_layer(layer, self.group_size):
+                logger.warning_once(
+                    f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
+                    "Falling back to Moe WNA16 kernels."
+                )
+                return MoeWNA16Config.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            return AWQMoEMethod(self)
+        return None
+
+    @classmethod
+    def is_awq_marlin_compatible(cls, quant_config: dict[str, Any]):
+        # Extract data from quant config.
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        group_size = quant_config.get("group_size")
+        zero_point = quant_config.get("zero_point")
+
+        if not _is_cuda:
+            return False
+
+        if quant_method != "awq":
+            return False
+
+        # If we cannot find the info needed in the config, cannot convert.
+        if num_bits is None or group_size is None or zero_point is None:
+            return False
+
+        if num_bits not in cls.TYPE_MAP:
+            return False
+
+        return check_marlin_supported(
+            quant_type=cls.TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point
+        )
+
+
+class AWQLinearMethod(LinearMethodBase):
+    """Linear method for AWQ.
+
+    Args:
+        quant_config: The AWQ quantization config.
+    """
+
+    def __init__(self, quant_config: AWQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if input_size_per_partition % self.quant_config.group_size != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                "The output size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        qzeros = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.group_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        scales = GroupQuantScaleParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.group_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=0,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("qzeros", qzeros)
+        layer.register_parameter("scales", scales)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        qweight = layer.qweight
+        scales = layer.scales
+        qzeros = layer.qzeros
+        pack_factor = self.quant_config.pack_factor
+        out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
+        reshaped_x = x.reshape(-1, x.shape[-1])
+        out = awq_dequantize(qweight, scales, qzeros)
+        out = torch.matmul(reshaped_x, out)
+
+        if bias is not None:
+            out.add_(bias)
+        return out.reshape(out_shape)
+
+
+class AWQMarlinLinearMethod(LinearMethodBase):
+    """Linear method for AWQ Marlin.
+
+    Args:
+        quant_config: The AWQ Marlin quantization config.
+    """
+
+    def __init__(self, quant_config: AWQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        del output_size
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        # Normalize group_size
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        verify_marlin_supports_shape(
+            output_size_per_partition=output_size_per_partition,
+            input_size_per_partition=input_size_per_partition,
+            input_size=input_size,
+            group_size=group_size,
+        )
+
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        num_groups = input_size_per_partition // group_size
+
+        qzeros = PackedvLLMParameter(
+            data=torch.empty(
+                num_groups,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        scales = GroupQuantScaleParameter(
+            data=torch.empty(
+                num_groups,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=0,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("qzeros", qzeros)
+        layer.register_parameter("scales", scales)
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.num_groups = num_groups
+
+    # TODO: Update this docs
+    # Checkpoints are serialized in AutoAWQ format, which is different from the
+    # marlin format. This function is called after the weights are loaded.
+    # Here, we handle the repacking
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        device = layer.qweight.device
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+
+        # Allocate marlin workspace
+        layer.workspace = marlin_make_workspace(device)
+
+        # Repack weights from AWQ format to marlin format.
+        marlin_qweight = awq_marlin_repack(
+            layer.qweight,
+            size_k=layer.input_size_per_partition,
+            size_n=layer.output_size_per_partition,
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
+        replace_parameter(layer, "qweight", marlin_qweight)
+
+        # Permute scales from AWQ format to marlin format.
+        marlin_scales = marlin_permute_scales(
+            layer.scales,
+            size_k=layer.input_size_per_partition,
+            size_n=layer.output_size_per_partition,
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "scales", marlin_scales)
+
+        # Permute zero-points from AWQ format to marlin format.
+        marlin_zp = awq_to_marlin_zero_points(
+            layer.qzeros,
+            size_k=layer.num_groups,
+            size_n=layer.output_size_per_partition,
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
+        replace_parameter(layer, "qzeros", marlin_zp)
+
+        # Not-used
+        layer.g_idx = marlin_make_empty_g_idx(device)
+        layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_awq_marlin_linear(
+            input=x,
+            weight=layer.qweight,
+            weight_scale=layer.scales,
+            weight_zp=layer.qzeros,
+            g_idx=layer.g_idx,
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=layer.workspace,
+            quant_type=self.quant_config.quant_type,
+            output_size_per_partition=layer.output_size_per_partition,
+            input_size_per_partition=layer.input_size_per_partition,
+            bias=bias,
+        )
+
+
+class AWQMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quant_config: AWQMarlinConfig):
+        self.quant_config = quant_config
+        if self.quant_config.weight_bits != 4:
+            raise ValueError("AWQMoEMethod only supports 4bit now.")
+        self.quant_type = scalar_types.uint4
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {
+                "is_transposed": True,
+                "quant_method": FusedMoeWeightScaleSupported.GROUP.value,
+            }
+        )
+
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+        num_groups_w13 = hidden_size // self.quant_config.group_size
+        num_groups_w2 = intermediate_size_per_partition // self.quant_config.group_size
+
+        # WEIGHT_SCALES
+        # Allocate 2 scales for w1 and w3 respectively.
+        w13_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                intermediate_size_per_partition * 2,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+
+        # WEIGHT_ZERO_POINT
+        # Allocate 2 zero points for w1 and w3 respectively.
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+
+        device = layer.w13_qweight.device
+        layer.workspace = marlin_make_workspace(device, 4)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        num_experts = layer.w13_qweight.shape[0]
+        device = layer.w13_qweight.device
+
+        layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+            requires_grad=False,
+        )
+        layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+            requires_grad=False,
+        )
+
+        marlin_w13_qweight = awq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            size_k=layer.w13_qweight.shape[1],
+            size_n=layer.w13_qweight.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
+
+        marlin_w2_qweight = awq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            size_k=layer.w2_qweight.shape[1],
+            size_n=layer.w2_qweight.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
+
+        # hidden_size->intermediate_size
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+
+        replace_parameter(layer, "w13_scales", marlin_w13_scales)
+
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w2_scales", marlin_w2_scales)
+
+        marlin_w13_zp = moe_awq_to_marlin_zero_points(
+            layer.w13_qzeros,
+            size_k=layer.w13_qzeros.shape[1],
+            size_n=layer.w13_qzeros.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w13_qzeros", marlin_w13_zp)
+
+        marlin_w2_zp = moe_awq_to_marlin_zero_points(
+            layer.w2_qzeros,
+            size_k=layer.w2_qzeros.shape[1],
+            size_n=layer.w2_qzeros.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w2_qzeros", marlin_w2_zp)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        # The input must currently be float16
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        orig_dtype = x.dtype
+        x = x.half()
+
+        topk_weights, topk_ids, router_logits = topk_output
+
+        output = fused_marlin_moe(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            layer.w13_scales,
+            layer.w2_scales,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            sort_indices1=layer.w13_g_idx_sort_indices,
+            sort_indices2=layer.w2_g_idx_sort_indices,
+            w1_zeros=layer.w13_qzeros,
+            w2_zeros=layer.w2_qzeros,
+            num_bits=self.quant_config.weight_bits,
+        ).to(orig_dtype)
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/quantization/awq_triton.py b/python/sglang/srt/layers/quantization/awq_triton.py
new file mode 100644
index 000000000..13352efdb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/awq_triton.py
@@ -0,0 +1,339 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/awq_triton.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+import triton
+import triton.language as tl
+
+AWQ_TRITON_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+
+
+@triton.jit
+def awq_dequantize_kernel(
+    qweight_ptr,  # quantized matrix
+    scales_ptr,  # scales, per group
+    zeros_ptr,  # zeros, per group
+    group_size,  # Should always be one of the supported group sizes
+    result_ptr,  # Output matrix
+    num_cols,  # input num cols in qweight
+    num_rows,  # input num rows in qweight
+    BLOCK_SIZE_X: tl.constexpr,
+    BLOCK_SIZE_Y: tl.constexpr,
+):
+    # Setup the pids.
+    pid_x = tl.program_id(axis=0)
+    pid_y = tl.program_id(axis=1)
+
+    # Compute offsets and masks for qweight_ptr.
+    offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
+    offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
+    offsets = num_cols * offsets_y[:, None] + offsets_x[None, :]
+
+    masks_y = offsets_y < num_rows
+    masks_x = offsets_x < num_cols
+
+    masks = masks_y[:, None] & masks_x[None, :]
+
+    # Compute offsets and masks for result output ptr.
+    result_offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
+    result_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
+    result_offsets = (
+        8 * num_cols * result_offsets_y[:, None] + result_offsets_x[None, :]
+    )
+
+    result_masks_y = result_offsets_y < num_rows
+    result_masks_x = result_offsets_x < num_cols * 8
+    result_masks = result_masks_y[:, None] & result_masks_x[None, :]
+
+    # Load the weights.
+    iweights = tl.load(qweight_ptr + offsets, masks, 0.0)
+    iweights = tl.interleave(iweights, iweights)
+    iweights = tl.interleave(iweights, iweights)
+    iweights = tl.interleave(iweights, iweights)
+
+    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
+    # that will map given indices to the correct order.
+    reverse_awq_order_tensor = (
+        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
+    ).reshape(8)
+
+    # Use this to compute a set of shifts that can be used to unpack and
+    # reorder the values in iweights and zeros.
+    shifts = reverse_awq_order_tensor * 4
+    shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_Y * BLOCK_SIZE_X, 8))
+    shifts = tl.reshape(shifts, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+    # Unpack and reorder: shift out the correct 4-bit value and mask.
+    iweights = (iweights >> shifts) & 0xF
+
+    # Compute zero offsets and masks.
+    zero_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
+    zero_offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
+    zero_offsets = num_cols * zero_offsets_y[:, None] + zero_offsets_x[None, :]
+
+    zero_masks_y = zero_offsets_y < num_rows // group_size
+    zero_masks_x = zero_offsets_x < num_cols
+    zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :]
+
+    # Load the zeros.
+    zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0)
+    zeros = tl.interleave(zeros, zeros)
+    zeros = tl.interleave(zeros, zeros)
+    zeros = tl.interleave(zeros, zeros)
+    zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+    # Unpack and reorder: shift out the correct 4-bit value and mask.
+    zeros = (zeros >> shifts) & 0xF
+
+    # Compute scale offsets and masks.
+    scale_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
+    scale_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
+    scale_offsets = num_cols * 8 * scale_offsets_y[:, None] + scale_offsets_x[None, :]
+    scale_masks_y = scale_offsets_y < num_rows // group_size
+    scale_masks_x = scale_offsets_x < num_cols * 8
+    scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]
+
+    # Load the scales.
+    scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0)
+    scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+    # Dequantize.
+    iweights = (iweights - zeros) * scales
+    iweights = iweights.to(result_ptr.type.element_ty)
+
+    # Finally, store.
+    tl.store(result_ptr + result_offsets, iweights, result_masks)
+
+
+@triton.jit
+def awq_gemm_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    zeros_ptr,
+    scales_ptr,
+    M,
+    N,
+    K,
+    group_size,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    SPLIT_K: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    pid_z = tl.program_id(1)
+
+    # NOTE: This doesn't work in TRITON_INTERPRET=1 mode.  Use below instead.
+    # num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    accumulator_dtype = c_ptr.type.element_ty
+
+    # NOTE: This doesn't work in TRITON_INTERPRET=1 mode.  Use below instead.
+    # accumulator = tl.arange(0, BLOCK_SIZE_N)
+    # accumulator = tl.broadcast_to(accumulator[None, :],
+    # (BLOCK_SIZE_M, BLOCK_SIZE_N))
+    # accumulator = accumulator & 0x0
+    # accumulator = accumulator.to(accumulator_dtype)
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype)
+
+    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
+    # that will map given indices to the correct order.
+    reverse_awq_order_tensor = (
+        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
+    ).reshape(8)
+
+    # Create the necessary shifts to use to unpack.
+    shifts = reverse_awq_order_tensor * 4
+    shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
+    shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+    # Offsets and masks.
+    offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    masks_am = offsets_am < M
+
+    offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
+    masks_bn = offsets_bn < N // 8
+
+    offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
+    masks_zn = offsets_zn < N // 8
+
+    offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    masks_sn = offsets_sn < N
+
+    offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+    offsets_a = K * offsets_am[:, None] + offsets_k[None, :]
+    offsets_b = (N // 8) * offsets_k[:, None] + offsets_bn[None, :]
+
+    a_ptrs = a_ptr + offsets_a
+    b_ptrs = b_ptr + offsets_b
+
+    # NOTE: Use this in TRITON_INTERPRET=1 mode instead of tl.cdiv
+    # block_offset = BLOCK_SIZE_K * SPLIT_K
+    # for k in range(0, (K + block_offset - 1) // (block_offset)):
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
+        masks_k = offsets_k < K
+        masks_a = masks_am[:, None] & masks_k[None, :]
+        a = tl.load(a_ptrs, mask=masks_a, other=0.0)
+
+        masks_b = masks_k[:, None] & masks_bn[None, :]
+        b = tl.load(b_ptrs, mask=masks_b, other=0.0)
+        b = tl.interleave(b, b)
+        b = tl.interleave(b, b)
+        b = tl.interleave(b, b)
+
+        # Dequantize b.
+        offsets_szk = (
+            BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K
+        ) // group_size + tl.arange(0, 1)
+        offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :]
+        masks_zk = offsets_szk < K // group_size
+        masks_z = masks_zk[:, None] & masks_zn[None, :]
+        zeros_ptrs = zeros_ptr + offsets_z
+        zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0)
+        zeros = tl.interleave(zeros, zeros)
+        zeros = tl.interleave(zeros, zeros)
+        zeros = tl.interleave(zeros, zeros)
+        zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+        offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :]
+        masks_sk = offsets_szk < K // group_size
+        masks_s = masks_sk[:, None] & masks_sn[None, :]
+        scales_ptrs = scales_ptr + offsets_s
+        scales = tl.load(scales_ptrs, mask=masks_s, other=0.0)
+        scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+        b = (b >> shifts) & 0xF
+        zeros = (zeros >> shifts) & 0xF
+        b = (b - zeros) * scales
+        b = b.to(c_ptr.type.element_ty)
+
+        # Accumulate results.
+        accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
+
+        offsets_k += BLOCK_SIZE_K * SPLIT_K
+        a_ptrs += BLOCK_SIZE_K * SPLIT_K
+        b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N // 8)
+
+    c = accumulator.to(c_ptr.type.element_ty)
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + pid_z * N * M + N * offs_cm[:, None] + offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+# qweights - [K     , M // 8], int32
+# scales   - [K // G, M     ], float16
+# zeros    - [K // G, M // 8], int32
+def awq_dequantize_triton(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    zeros: torch.Tensor,
+    block_size_x: int = 32,
+    block_size_y: int = 32,
+) -> torch.Tensor:
+    K = qweight.shape[0]
+    M = scales.shape[1]
+    group_size = qweight.shape[0] // scales.shape[0]
+
+    assert K > 0 and M > 0
+    assert scales.shape[0] == K // group_size and scales.shape[1] == M
+    assert zeros.shape[0] == K // group_size and zeros.shape[1] == M // 8
+    assert group_size <= K
+    assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
+
+    # Result tensor:
+    # number of rows = same as input tensor
+    # number of cols = 8 x input tensor num cols
+    result = torch.empty(
+        qweight.shape[0],
+        qweight.shape[1] * 8,
+        device=qweight.device,
+        dtype=scales.dtype,
+    )
+
+    Y = qweight.shape[0]  # num rows
+    X = qweight.shape[1]  # num cols
+
+    grid = lambda META: (
+        triton.cdiv(X, META["BLOCK_SIZE_X"]),
+        triton.cdiv(Y, META["BLOCK_SIZE_Y"]),
+    )
+    awq_dequantize_kernel[grid](
+        qweight,
+        scales,
+        zeros,
+        group_size,
+        result,
+        X,
+        Y,
+        BLOCK_SIZE_X=block_size_x,
+        BLOCK_SIZE_Y=block_size_y,
+    )
+
+    return result
+
+
+# input   - [M, K]
+# qweight - [K, N // 8]
+# qzeros  - [K // G, N // 8]
+# scales  - [K // G, N]
+# split_k_iters - parallelism along K-dimension, int, power of 2.
+def awq_gemm_triton(
+    input: torch.Tensor,
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    qzeros: torch.Tensor,
+    split_k_iters: int,
+    block_size_m: int = 32,
+    block_size_n: int = 32,
+    block_size_k: int = 32,
+) -> torch.Tensor:
+    M, K = input.shape
+    N = qweight.shape[1] * 8
+    group_size = qweight.shape[0] // qzeros.shape[0]
+
+    assert N > 0 and K > 0 and M > 0
+    assert qweight.shape[0] == K and qweight.shape[1] == N // 8
+    assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 8
+    assert scales.shape[0] == K // group_size and scales.shape[1] == N
+    assert split_k_iters & (split_k_iters - 1) == 0 and split_k_iters != 0
+    assert split_k_iters <= 32
+    assert group_size <= K
+    assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
+
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        split_k_iters,
+    )
+
+    result = torch.zeros((split_k_iters, M, N), dtype=scales.dtype, device=input.device)
+
+    # A = input, B = qweight, C = result
+    # A = M x K, B = K x N, C = M x N
+    awq_gemm_kernel[grid](
+        input,
+        qweight,
+        result,
+        qzeros,
+        scales,
+        M,
+        N,
+        K,
+        group_size,
+        BLOCK_SIZE_M=block_size_m,
+        BLOCK_SIZE_N=block_size_n,
+        BLOCK_SIZE_K=block_size_k,
+        SPLIT_K=split_k_iters,
+    )
+
+    result = result.sum(0)
+
+    return result
diff --git a/python/sglang/srt/layers/quantization/base_config.py b/python/sglang/srt/layers/quantization/base_config.py
new file mode 100644
index 000000000..4a5b7905e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/base_config.py
@@ -0,0 +1,216 @@
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/base_config.py
+from __future__ import annotations
+
+import inspect
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
+
+import torch
+from torch import nn
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import CombineInput, DispatchOutput
+
+
+class QuantizeMethodBase(ABC):
+    """Base class for different quantized methods."""
+
+    @abstractmethod
+    def create_weights(
+        self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
+    ):
+        """Create weights for a layer.
+
+        The weights will be set as attributes of the layer."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
+        """Apply the weights in layer to the input tensor.
+
+        Expects create_weights to have been called before on the layer."""
+        raise NotImplementedError()
+
+    def process_weights_after_loading(self, layer: nn.Module) -> None:
+        """Process the weight after loading.
+
+        This can be used for example, to transpose weights for computation.
+        """
+        return
+
+
+class LinearMethodBase(QuantizeMethodBase):
+    """Base class for different (maybe quantized) linear methods."""
+
+    @abstractmethod
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Create weights for a linear layer.
+           The weights will be set as attributes of the layer.
+
+        Args:
+            layer: The layer that is using the LinearMethodBase factory.
+            input_size_per_partition: Size of the weight input dim on rank X.
+            output_partition_sizes: Sizes of the output dim of each logical
+                weight on rank X. E.g., output_partition_sizes for QKVLinear
+                is a list contains the width of Wq, Wk, Wv on rank X.
+            input_size: Size of the input dim of the weight across all ranks.
+            output_size: Size of the output dim of the weight across all ranks.
+            params_dtype: Datatype of the parameters.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Apply the weights in layer to the input tensor.
+        Expects create_weights to have been called before on the layer."""
+        raise NotImplementedError()
+
+
+class FusedMoEMethodBase(QuantizeMethodBase):
+
+    @abstractmethod
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: DispatchOutput,
+    ) -> CombineInput:
+        raise NotImplementedError
+
+
+class QuantizationConfig(ABC):
+    """Base class for quantization configs."""
+
+    def __init__(self):
+        super().__init__()
+        # mapping is updated by models as they initialize
+        self.packed_modules_mapping: Dict[str, List[str]] = dict()
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Name of the quantization method."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_supported_act_dtypes(self) -> List[torch.dtype]:
+        """List of supported activation dtypes."""
+        raise NotImplementedError()
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """Minimum GPU capability to support the quantization method.
+
+        E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
+        This requirement is due to the custom CUDA kernels used by the
+        quantization method.
+        """
+        raise NotImplementedError()
+
+    @staticmethod
+    @abstractmethod
+    def get_config_filenames() -> List[str]:
+        """List of filenames to search for in the model directory."""
+        raise NotImplementedError()
+
+    @classmethod
+    @abstractmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
+        """Create a config class from the model's quantization config."""
+        raise NotImplementedError()
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        """
+        Detects if this quantization method can support a given checkpoint
+        format by overriding the user specified quantization method --
+        this method should only be overwritten by subclasses in exceptional
+        circumstances
+        """
+        return None
+
+    @staticmethod
+    def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
+        """Get a value from the model's quantization config."""
+        for key in keys:
+            if key in config:
+                return config[key]
+        raise ValueError(
+            f"Cannot find any of {keys} in the model's " "quantization config."
+        )
+
+    @staticmethod
+    def get_from_keys_or(config: Dict[str, Any], keys: List[str], default: Any) -> Any:
+        """Get a optional value from the model's quantization config."""
+        try:
+            return QuantizationConfig.get_from_keys(config, keys)
+        except ValueError:
+            return default
+
+    @abstractmethod
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        """Get the quantize method to use for the quantized layer.
+
+        Args:
+            layer: The layer for the quant method.
+            prefix: The full name of the layer in the state dict
+        Returns:
+            The quantize method. None if the given layer doesn't support quant
+            method.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError()
+
+
+def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool:
+    """
+    Not all quant methods have embedding implemented, so we need to check that
+    it exists for our given method. We check this by making sure the function
+    has been changed from the base implementation.
+    """
+    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
+    class_embedding = inspect.getattr_static(method_class, "embedding", None)
+
+    return class_embedding is not None and class_embedding is not base_embedding
diff --git a/python/sglang/srt/layers/quantization/blockwise_int8.py b/python/sglang/srt/layers/quantization/blockwise_int8.py
new file mode 100644
index 000000000..60d4e3929
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/blockwise_int8.py
@@ -0,0 +1,380 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+from torch.nn import Module
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import BlockQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_utils import apply_w8a8_block_int8_linear
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = logging.getLogger(__name__)
+
+
+class BlockInt8Config(QuantizationConfig):
+    """Config class for INT8."""
+
+    def __init__(
+        self,
+        is_checkpoint_int8_serialized: bool = False,
+        activation_scheme: str = "dynamic",
+        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: List[int] = None,
+    ) -> None:
+        self.is_checkpoint_int8_serialized = is_checkpoint_int8_serialized
+        if is_checkpoint_int8_serialized:
+            logger.warning(
+                "Detected int8 checkpoint. Please note that the "
+                "format is experimental and subject to change."
+            )
+        if activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
+        self.activation_scheme = activation_scheme
+        self.ignored_layers = ignored_layers or []
+        if weight_block_size is not None:
+            if not is_checkpoint_int8_serialized:
+                raise ValueError(
+                    f"The block-wise quantization only supports int8-serialized checkpoint for now."
+                )
+            if len(weight_block_size) != 2:
+                raise ValueError(
+                    f"The quantization block size of weight must have 2 dimensions, but got {len(weight_block_size)} dimensions."
+                )
+            if activation_scheme != "dynamic":
+                raise ValueError(
+                    f"The block-wise quantization only supports dynamic activation scheme for now, but got {activation_scheme} activation scheme."
+                )
+        self.weight_block_size = weight_block_size
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "blockwise_int8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> BlockInt8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_int8_serialized = "int8" in quant_method
+        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
+        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None)
+        return cls(
+            is_checkpoint_int8_serialized=is_checkpoint_int8_serialized,
+            activation_scheme=activation_scheme,
+            ignored_layers=ignored_layers,
+            weight_block_size=weight_block_size,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.ignored_layers):
+                return UnquantizedLinearMethod()
+            return BlockInt8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return BlockInt8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class BlockInt8LinearMethod(LinearMethodBase):
+    """Linear method for INT8.
+    Supports loading INT8 checkpoints with static weight scale and
+    dynamic activation scale.
+
+    Limitations:
+    Only support block-wise int8 quantization and int8 checkpoint
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: BlockInt8Config):
+        self.quant_config = quant_config
+        assert self.quant_config.weight_block_size is not None
+        assert self.quant_config.is_checkpoint_int8_serialized
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        tp_size = get_tensor_model_parallel_world_size()
+
+        block_n, block_k = (
+            self.quant_config.weight_block_size[0],
+            self.quant_config.weight_block_size[1],
+        )
+        # Required by row parallel
+        if tp_size > 1 and input_size // input_size_per_partition == tp_size:
+            if input_size_per_partition % block_k != 0:
+                raise ValueError(
+                    f"Weight input_size_per_partition = "
+                    f"{input_size_per_partition} is not divisible by "
+                    f"weight quantization block_k = {block_k}."
+                )
+        # Required by column parallel or enabling merged weights
+        if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
+            output_partition_sizes
+        ) > 1:
+            for output_partition_size in output_partition_sizes:
+                if output_partition_size % block_n != 0:
+                    raise ValueError(
+                        f"Weight output_partition_size = "
+                        f"{output_partition_size} is not divisible by "
+                        f"weight quantization block_n = {block_n}."
+                    )
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
+        # WEIGHT
+        weight_dtype = (
+            torch.int8
+            if self.quant_config.is_checkpoint_int8_serialized
+            else params_dtype
+        )
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=weight_dtype
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+
+        scale = BlockQuantScaleParameter(
+            data=torch.empty(
+                (output_size_per_partition + block_n - 1) // block_n,
+                (input_size_per_partition + block_k - 1) // block_k,
+                dtype=torch.float32,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale_inv", scale)
+
+        # INPUT ACTIVATION SCALE
+        assert self.quant_config.activation_scheme == "dynamic"
+        layer.register_parameter("input_scale", None)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # Block quant doesn't need to process weights after loading
+        # Use torch Parameter to avoid cuda graph capturing issue
+        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
+        layer.weight_scale_inv = torch.nn.Parameter(
+            layer.weight_scale_inv.data, requires_grad=False
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_w8a8_block_int8_linear(
+            input=x,
+            weight=layer.weight,
+            block_size=self.quant_config.weight_block_size,
+            weight_scale=layer.weight_scale_inv,
+            input_scale=None,
+            bias=bias,
+        )
+
+
+class BlockInt8MoEMethod(FusedMoEMethodBase):
+    """MoE method for INT8.
+    Supports loading INT8 checkpoints with static weight scale and
+    dynamic activation scale.
+
+    Limitations:
+    Only support block-wise int8 quantization and int8 checkpoint
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: BlockInt8Config):
+        self.quant_config = quant_config
+        assert self.quant_config.weight_block_size is not None
+        assert self.quant_config.is_checkpoint_int8_serialized
+
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        if self.quant_config.is_checkpoint_int8_serialized:
+            params_dtype = torch.int8
+        tp_size = get_tensor_model_parallel_world_size()
+
+        block_n, block_k = (
+            self.quant_config.weight_block_size[0],
+            self.quant_config.weight_block_size[1],
+        )
+        # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+        # Required by column parallel or enabling merged weights
+        if intermediate_size_per_partition % block_n != 0:
+            raise ValueError(
+                f"The output_size of gate's and up's weight = "
+                f"{intermediate_size_per_partition} is not divisible by "
+                f"weight quantization block_n = {block_n}."
+            )
+        if tp_size > 1:
+            # Required by row parallel
+            if intermediate_size_per_partition % block_k != 0:
+                raise ValueError(
+                    f"The input_size of down's weight = "
+                    f"{intermediate_size_per_partition} is not divisible by "
+                    f"weight quantization block_k = {block_k}."
+                )
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
+                (hidden_size + block_k - 1) // block_k,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                (hidden_size + block_n - 1) // block_n,
+                (intermediate_size_per_partition + block_k - 1) // block_k,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # INPUT_SCALES
+        assert self.quant_config.activation_scheme == "dynamic"
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # Block quant doesn't need to process weights after loading
+        return
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_int8_w8a8=True,
+            w13_scale=layer.w13_weight_scale_inv,
+            w2_scale=layer.w2_weight_scale_inv,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            block_shape=self.quant_config.weight_block_size,
+        )
+
+        return self.runner.run(dispatch_output, quant_info)
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/README.md b/python/sglang/srt/layers/quantization/compressed_tensors/README.md
new file mode 100644
index 000000000..e8caf538b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/README.md
@@ -0,0 +1,6 @@
+# quantization compressed_tensors module
+
+To support compressed_tensors format quantization models, we adapted https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors into SGLang.
+
+
+For practical purposes, we have only applied the compressed_tensors format of `w8a8_fp8`. If you have requirements for other formats, you can submit an issue through this [link](https://github.com/sgl-project/sglang/issues).
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/__init__.py b/python/sglang/srt/layers/quantization/compressed_tensors/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
new file mode 100644
index 000000000..8afc15a73
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -0,0 +1,666 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/v0.8.2/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import logging
+from contextlib import suppress
+from typing import Any, Dict, List, Literal, NamedTuple, Optional, Tuple, cast
+
+import torch
+from compressed_tensors.config import (
+    CompressionFormat,
+    SparsityCompressionConfig,
+    SparsityStructure,
+)
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationStrategy,
+    QuantizationType,
+)
+from pydantic import BaseModel
+
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
+    CompressedTensorsMoEMethod,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme,
+    CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A16Fp8,
+)
+from sglang.srt.layers.quantization.compressed_tensors.utils import (
+    find_matched_target,
+    is_activation_quantization_format,
+    should_ignore_layer,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+
+try:
+    from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (
+        WNA16_SUPPORTED_BITS,
+        CompressedTensorsWNA16,
+    )
+
+    VLLM_AVAILABLE = True
+except ImportError:
+    VLLM_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["CompressedTensorsLinearMethod"]
+
+SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
+QUANTIZATION_SCHEME_MAP_TYPE = Dict[str, Optional[Dict[str, QuantizationArgs]]]
+
+
+class DeviceCapability(NamedTuple):
+    major: int
+    minor: int
+
+    def as_version_str(self) -> str:
+        return f"{self.major}.{self.minor}"
+
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+
+        It is assumed that the minor version is always a single digit.
+        """
+        assert 0 <= self.minor < 10
+        return self.major * 10 + self.minor
+
+
+class CompressedTensorsConfig(QuantizationConfig):
+
+    def __init__(
+        self,
+        target_scheme_map: Dict[str, Any],
+        ignore: List[str],
+        quant_format: str,
+        sparsity_scheme_map: Dict[str, SparsityCompressionConfig],
+        sparsity_ignore_list: List[str],
+        kv_cache_scheme: Optional[Dict[str, Any]] = None,
+        config: Optional[Dict[str, Any]] = None,
+        packed_modules_mapping: Dict[str, List[str]] = {},
+    ):
+        super().__init__()
+        self.ignore = ignore
+        self.quant_format = quant_format
+        # Map from [target -> scheme]
+        self.target_scheme_map = target_scheme_map
+        self.kv_cache_scheme = kv_cache_scheme
+        self.sparsity_scheme_map = sparsity_scheme_map
+        self.sparsity_ignore_list = sparsity_ignore_list
+        self.config = config
+        self.packed_modules_mapping = packed_modules_mapping
+
+    def get_linear_method(self) -> CompressedTensorsLinearMethod:
+        return CompressedTensorsLinearMethod(self)
+
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def get_name(self) -> str:
+        return "compressed_tensors"
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+
+        # Check if the layer is skipped for quantization.
+        # TODO (@robertgshaw2): support module names
+        if should_ignore_layer(
+            prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
+        ):
+            return UnquantizedLinearMethod()
+        if isinstance(layer, LinearBase):
+            scheme = self.get_scheme(layer=layer, layer_name=prefix)
+            if scheme is None:
+                return UnquantizedLinearMethod()
+            layer.scheme = scheme
+            return CompressedTensorsLinearMethod(self)
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            return CompressedTensorsMoEMethod.get_moe_method(self)
+        return None
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> CompressedTensorsConfig:
+        ignore: List[str] = cast(List[str], config.get("ignore", []))
+        quant_format = cast(str, config.get("format"))
+        target_scheme_map = cls._quantization_scheme_map_from_config(config=config)
+        sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
+            config=config
+        )
+        packed_modules_mapping = config.get("packed_modules_mapping", {})
+
+        return cls(
+            target_scheme_map=target_scheme_map,
+            ignore=ignore,
+            quant_format=quant_format,
+            sparsity_scheme_map=sparsity_scheme_map,
+            sparsity_ignore_list=sparsity_ignore_list,
+            config=config,
+            packed_modules_mapping=packed_modules_mapping,
+        )
+
+    @classmethod
+    def _parse_sparsity_config(
+        cls, config: Dict[str, Any]
+    ) -> Tuple[Dict[str, SparsityCompressionConfig], List[str]]:
+        """
+        :param config: The `quantization_config` dictionary from config.json
+        :return: A tuple with two elements
+            1. A dictionary mapping target layer names to their corresponding
+                sparsity_config
+            2. A list of layer names to ignore for sparsity
+        """
+        if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)):
+            return dict(), []
+
+        sparsity_config = SparsityCompressionConfig.model_validate(sparsity_config)
+        sparse_scheme_map: Dict[str, SparsityCompressionConfig] = {
+            target: sparsity_config for target in sparsity_config.targets or list()
+        }
+        sparsity_ignore_list = sparsity_config.ignore or list()
+        return sparse_scheme_map, sparsity_ignore_list
+
+    @classmethod
+    def _quantization_scheme_map_from_config(
+        cls, config: Dict[str, Any]
+    ) -> QUANTIZATION_SCHEME_MAP_TYPE:
+        """
+        :param config: The `quantization_config` dictionary from config.json
+        :return: A dictionary mapping target layer names to their corresponding
+            quantization_args for weights and input activations
+        """
+        target_scheme_map: Dict[str, Any] = dict()
+        quant_format = cast(str, config.get("format"))
+
+        # The quant_config has multiple config_groups, each containing
+        # an input_activations key with details about how the activations are
+        # quantized, a weights key indicating how the weights are quantized,
+        # and a list of targets under the `targets` key, dictating which
+        # layers are impacted by the quantization details. The quantization
+        # details follow the structure defined by the QuantizationArgs
+        # pydantic model, which is used to verify the structure of the
+        # quant_config and also store the details for later use.
+
+        config_groups = config.get("config_groups", dict())
+        for _, quant_config in config_groups.items():
+            targets = quant_config.get("targets")
+            for target in targets:
+                target_scheme_map[target] = {}
+                target_scheme_map[target]["weights"] = QuantizationArgs.model_validate(
+                    quant_config.get("weights")
+                )
+
+                target_scheme_map[target]["input_activations"] = None
+                if is_activation_quantization_format(quant_format):
+                    input_activations = quant_config.get("input_activations")
+                    # The only case where we have activation quant supported
+                    # but no input_activations provided in the config
+                    # should be w8a16fp8 w8a16fp8 can also run for cases where
+                    # there is an input_quant but it is ignored
+                    if not input_activations:
+                        assert (
+                            target_scheme_map[target]["weights"].type
+                            == QuantizationType.FLOAT
+                        )
+                    else:
+                        target_scheme_map[target]["input_activations"] = (
+                            QuantizationArgs.model_validate(  # noqa: E501
+                                quant_config.get("input_activations")
+                            )
+                        )
+        return target_scheme_map
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bool:
+        capability_tuple = DeviceCapability(*torch.cuda.get_device_capability())
+
+        if capability_tuple is not None:
+            capability = capability_tuple.to_int()
+            supported = capability >= min_capability
+            if error and not supported:
+                raise RuntimeError(
+                    "Quantization scheme is not supported for ",
+                    f"the current GPU. Min capability: {min_capability}. ",
+                    f"Current capability: {capability}.",
+                )
+            return supported
+        else:
+            return False
+
+    def _is_static_tensor_w8a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_tensor = (
+            weight_strategy
+            and input_quant.strategy == QuantizationStrategy.TENSOR.value
+        )
+        is_static = not weight_quant.dynamic and not input_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_tensor and weight_quant.symmetric and is_static
+
+    def _is_dynamic_token_w8a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
+
+    def _is_fp8_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool:
+        # Confirm weights and activations quantized.
+        if weight_quant is None or input_quant is None:
+            return False
+
+        # Confirm weight scheme is supported.
+        is_floating_point = (
+            weight_quant.type == QuantizationType.FLOAT
+            and input_quant.type == QuantizationType.FLOAT
+        )
+        is_symmetric_weight = weight_quant.symmetric
+        is_static_weight = not weight_quant.dynamic
+        is_per_tensor_or_channel_weight = weight_quant.strategy in [
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.CHANNEL,
+        ]
+        if not (
+            is_floating_point
+            and is_symmetric_weight
+            and is_static_weight
+            and is_per_tensor_or_channel_weight
+        ):
+            return False
+
+        # Dynamic quantization is always supported if weights supported.
+        if input_quant.dynamic:
+            return True
+
+        # Confirm activation scheme is supported.
+        is_symmetric_activation = input_quant.symmetric
+        is_per_tensor_activation = input_quant.strategy == QuantizationStrategy.TENSOR
+        return is_symmetric_activation and is_per_tensor_activation
+
+    def _is_fp8_w8a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool:
+        # Confirm weights quantized.
+        if weight_quant is None:
+            return False
+
+        # Confirm we have floating points.
+        if weight_quant.type != QuantizationType.FLOAT:
+            return False
+
+        # Confirm weight scheme is supported.
+        is_symmetric_weight = weight_quant.symmetric
+        is_static_weight = not weight_quant.dynamic
+        is_per_tensor_or_channel_weight = weight_quant.strategy in [
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.CHANNEL,
+        ]
+        if not (
+            is_symmetric_weight
+            and is_static_weight  # noqa: SIM103
+            and is_per_tensor_or_channel_weight
+        ):
+            return False
+
+        # All conditions satisfied.
+        return True
+
+    def _is_wNa16_group_channel(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        input_quant_none = input_quant is None
+        is_symmetric = weight_quant.symmetric
+        is_channel_group = (
+            weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value
+        )
+        is_static = not weight_quant.dynamic
+
+        return is_channel_group and input_quant_none and is_symmetric and is_static
+
+    def _get_scheme_from_parts(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> CompressedTensorsScheme:
+
+        # Detect If Mixed Precision
+        if self._is_wNa16_group_channel(weight_quant, input_quant):
+            if not VLLM_AVAILABLE:
+                raise ImportError(
+                    "vllm is not installed, to use CompressedTensorsW4A16Sparse24 and CompressedTensorsWNA16, please install vllm"
+                )
+            if (
+                self.quant_format == CompressionFormat.marlin_24.value
+                and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS
+            ):
+                return CompressedTensorsW4A16Sparse24(
+                    strategy=weight_quant.strategy,
+                    num_bits=weight_quant.num_bits,
+                    group_size=weight_quant.group_size,
+                )
+            if (
+                self.quant_format == CompressionFormat.pack_quantized.value
+                and weight_quant.num_bits in WNA16_SUPPORTED_BITS
+            ):
+                return CompressedTensorsWNA16(
+                    num_bits=weight_quant.num_bits,
+                    strategy=weight_quant.strategy,
+                    group_size=weight_quant.group_size,
+                    actorder=weight_quant.actorder,
+                )
+
+        if is_activation_quantization_format(self.quant_format):
+            if self._is_fp8_w8a8(weight_quant, input_quant):
+                is_fp8_w8a8_supported = self._check_scheme_supported(
+                    CompressedTensorsW8A8Fp8.get_min_capability(), error=False
+                )
+                if is_fp8_w8a8_supported:
+                    return CompressedTensorsW8A8Fp8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=(
+                            input_quant and not input_quant.dynamic
+                        ),
+                    )
+                else:
+                    # note: input_quant will be present for converted models;
+                    # will be ignored during inference post loading
+                    return CompressedTensorsW8A16Fp8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=not input_quant.dynamic,
+                    )
+
+            # note: input_quant can be None
+            if self._is_fp8_w8a16(weight_quant, input_quant):
+                if not VLLM_AVAILABLE:
+                    raise ImportError(
+                        "vllm is not installed, to use CompressedTensorsW8A16Fp8, please install vllm"
+                    )
+                is_static_input_scheme = input_quant and not input_quant.dynamic
+                return CompressedTensorsW8A16Fp8(
+                    strategy=weight_quant.strategy,
+                    is_static_input_scheme=is_static_input_scheme,
+                )
+
+            if self._is_static_tensor_w8a8(weight_quant, input_quant):
+                return CompressedTensorsW8A8Int8(
+                    strategy=weight_quant.strategy,
+                    is_static_input_scheme=True,
+                    input_symmetric=input_quant.symmetric,
+                )
+
+            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
+                return CompressedTensorsW8A8Int8(
+                    strategy=weight_quant.strategy,
+                    is_static_input_scheme=False,
+                    input_symmetric=input_quant.symmetric,
+                )
+
+        raise NotImplementedError("No compressed-tensors compatible scheme was found.")
+
+    def get_scheme(
+        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+    ) -> Optional[CompressedTensorsScheme]:
+        """
+        compressed-tensors supports non uniform in the following way:
+
+        targets of config_groups: There can be N config_groups which each
+            have a quantization scheme. Each config_group has a list of targets
+            which can be a full layer_name, a regex for a layer_name, or
+            an nn.Module name.
+
+        Detect whether a layer_name is found in any target and
+        use the quantization scheme corresponding to the matched target
+        to select the CompressedTensorsScheme used for infernece.
+        """
+
+        # Find the "target" in the compressed-tensors config
+        # that our layer conforms to.
+        # TODO (@robertgshaw): add compressed-tensors as dep
+        # so we do not have to re-write these functions
+        # need to make accelerate optional in ct to do this
+
+        # Will be empty for models with only sparsity
+        weight_quant = input_quant = None
+        if self.target_scheme_map:
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=self.target_scheme_map.keys(),
+                fused_mapping=self.packed_modules_mapping,
+            )
+
+            scheme_dict = self.target_scheme_map[matched_target]
+            weight_quant = scheme_dict.get("weights")
+            input_quant = scheme_dict.get("input_activations")
+
+        # Find the sparsity scheme of the layer
+        # assume that fused layers inerhit first component's sparsity scheme
+        sparsity_targets = self.sparsity_scheme_map.keys() - set(
+            self.sparsity_ignore_list
+        )
+        sparsity_scheme: Optional[SparsityCompressionConfig] = None
+        with suppress(ValueError):
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=sparsity_targets,
+                fused_mapping=self.packed_modules_mapping,
+            )
+            sparsity_scheme = self.sparsity_scheme_map[matched_target]
+
+        if self.supports_cutlass_24(
+            weight_quant=weight_quant,
+            input_quant=input_quant,
+            sparsity_scheme=sparsity_scheme,
+        ):
+            if not VLLM_AVAILABLE:
+                raise ImportError(
+                    "vllm is not installed, to use CompressedTensors24, please install vllm"
+                )
+            # Have a valid sparsity scheme
+            # Validate layer is supported by Cutlass 2:4 Kernel
+            model_compression_config = (
+                None
+                if sparsity_scheme is None or sparsity_scheme.format == "dense"
+                else self.config
+            )
+
+            scheme = CompressedTensors24(
+                quantized=weight_quant is not None or input_quant is not None,
+                weight_quant=weight_quant,
+                input_quant=input_quant,
+                model_compression_config=model_compression_config,
+            )
+        elif weight_quant is None:
+            logger.warning_once(
+                "Acceleration for non-quantized schemes is "
+                "not supported by Compressed Tensors. "
+                "Falling back to UnquantizedLinearMethod"
+            )
+            return None
+
+        else:
+            # Find the quant_scheme
+            scheme = self._get_scheme_from_parts(  # type: ignore
+                weight_quant=weight_quant,
+                input_quant=input_quant,
+            )
+
+        # Raise error if device does not support the scheme
+        # (e.g. fp8 needs ada lovelace)
+        self._check_scheme_supported(scheme.get_min_capability())
+        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
+        return scheme
+
+    def get_cache_scale(self, name: str) -> Optional[str]:
+        """
+        Check whether the param name matches the format for k/v cache scales
+        in compressed-tensors. If this is the case, return its equivalent
+        param name expected by vLLM
+
+        :param name: param name
+        :return: matching param name for KV cache scale in vLLM
+        """
+        if name.endswith(".output_scale") and ".k_proj" in name:
+            return name.replace(".k_proj.output_scale", ".attn.k_scale")
+        if name.endswith(".output_scale") and ".v_proj" in name:
+            return name.replace(".v_proj.output_scale", ".attn.v_scale")
+        # If no matches, return None
+        return None
+
+    @staticmethod
+    def supports_cutlass_24(
+        weight_quant: Optional[QuantizationArgs],
+        input_quant: Optional[QuantizationArgs],
+        sparsity_scheme: Optional[SparsityCompressionConfig] = None,
+    ) -> bool:
+        """
+        Check if the layer is supported by the Cutlass 2:4 Kernel
+        Conditions:
+            - Overarching condition: Sparsity Structure is 2:4
+            - Unquantized cases are supported
+            - Weight only quantization is not-supported
+            - Supported weight quantization strategies are TENSOR and CHANNEL
+            - Supported input quantization strategies are TENSOR and TOKEN
+            - Only 8 bit quantization is supported
+
+        :return: True if the layer is supported by the Cutlass 2:4 Kernel
+            False otherwise
+        """
+        if sparsity_scheme is None:
+            return False
+
+        is_valid_sparsity_structure: bool = (
+            sparsity_scheme.sparsity_structure == SparsityStructure.TWO_FOUR.value
+        )
+
+        valid_compressors = {
+            CompressionFormat.dense.value,
+            CompressionFormat.sparse_24_bitmask.value,
+        }
+
+        is_valid_sparsity = (
+            is_valid_sparsity_structure and sparsity_scheme.format in valid_compressors
+        )
+
+        if not is_valid_sparsity:
+            return False
+
+        # Unquantized cases are supported
+        if weight_quant is None and input_quant is None:
+            return True
+
+        # Weight only quantization is not-supported
+        if weight_quant is not None and input_quant is None:
+            return False
+
+        supported_weight_quant_strategies = [
+            QuantizationStrategy.TENSOR.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]
+
+        assert weight_quant is not None
+        assert input_quant is not None
+        if weight_quant.strategy not in supported_weight_quant_strategies:
+            return False
+
+        supported_input_quant_strategies = [
+            QuantizationStrategy.TENSOR.value,
+            QuantizationStrategy.TOKEN.value,
+        ]
+
+        if input_quant.strategy not in supported_input_quant_strategies:
+            return False
+
+        return weight_quant.num_bits == input_quant.num_bits == 8
+
+
+class CompressedTensorsLinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: CompressedTensorsConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the CompressedTensorsScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use the output of create_weights and the CompressedTensorsScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
new file mode 100644
index 000000000..e2ff25e68
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -0,0 +1,692 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/v0.8.2/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import enum
+import logging
+from enum import Enum
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+from compressed_tensors import CompressionFormat
+from compressed_tensors.quantization import QuantizationStrategy
+
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
+from sglang.srt.layers.quantization.utils import (
+    all_close_1d,
+    per_tensor_dequantize,
+    replace_parameter,
+)
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    set_weight_attrs,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
+        CompressedTensorsConfig,
+    )
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+    from aiter.ops.shuffle import shuffle_weight
+
+    from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1
+
+try:
+    import vllm
+
+    VLLM_AVAILABLE = True
+except ImportError:
+    VLLM_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+
+class GPTQMarlinState(Enum):
+    REPACK = enum.auto()
+    READY = enum.auto()
+
+
+__all__ = [
+    "CompressedTensorsMoEMethod",
+    "CompressedTensorsW8A8Fp8MoEMethod",
+    "CompressedTensorsWNA16MoEMethod",
+]
+
+
+class CompressedTensorsMoEMethod(FusedMoEMethodBase):
+    def __new__(cls, *args, **kwargs):
+        if cls is CompressedTensorsMoEMethod:
+            return super().__new__(cls)
+        return super().__new__(cls)
+
+    @staticmethod
+    def get_moe_method(
+        quant_config: CompressedTensorsConfig,
+    ) -> "CompressedTensorsMoEMethod":
+        # TODO: @dsikka: refactor this to use schemes as other kernels
+        # are supported + check if the layer is being ignored.
+        weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
+        input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
+
+        if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
+            if not VLLM_AVAILABLE:
+                raise ImportError(
+                    "vllm is not installed, to use CompressedTensorsWNA16MoEMethod, please install vllm."
+                )
+            return CompressedTensorsWNA16MoEMethod(quant_config)
+        elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
+            return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
+        else:
+            raise RuntimeError(
+                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
+            )
+
+
+class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
+
+    def __init__(self, quant_config: CompressedTensorsConfig):
+        self.quant_config = quant_config
+        self.weight_quant = self.quant_config.target_scheme_map["Linear"].get("weights")
+        self.input_quant = self.quant_config.target_scheme_map["Linear"].get(
+            "input_activations"
+        )
+
+        self.static_input_scales = not self.input_quant.dynamic
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        params_dtype = torch.float8_e4m3fn
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        # per-tensor quantization
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.TENSOR.value
+        elif self.weight_quant.strategy == QuantizationStrategy.CHANNEL:
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    1,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+                requires_grad=False,
+            )
+            weight_quant_method = FusedMoeWeightScaleSupported.CHANNEL.value
+        else:
+            raise ValueError(
+                f"Unsupported weight quantization strategy: {self.weight_quant.strategy}"
+            )
+
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update({"quant_method": weight_quant_method})
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # INPUT_SCALES
+        if self.static_input_scales:
+            assert (
+                self.input_quant.strategy == QuantizationStrategy.TENSOR
+            ), "Only per-tensor quantization is supported for static input scales"
+            w13_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+            w2_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+            set_weight_attrs(w2_input_scale, extra_weight_attrs)
+        else:
+            layer.w13_input_scale = None
+            layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: FusedMoE) -> None:
+        # Fp8 moe kernels require a single activation scale.
+        # We take the max of all the scales in case they differ.
+        if self.static_input_scales:
+            if layer.w13_input_scale is None or layer.w2_input_scale is None:
+                raise ValueError(
+                    "QuantConfig has static quantization, but found "
+                    "activation scales are None."
+                )
+            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+                layer.w2_input_scale
+            ):
+                logger.warning(
+                    "Found input_scales that are not equal for "
+                    "fp8 MoE layer. Using the maximum across experts "
+                    "for each layer."
+                )
+            layer.w13_input_scale = torch.nn.Parameter(
+                layer.w13_input_scale.max(), requires_grad=False
+            )
+            layer.w2_input_scale = torch.nn.Parameter(
+                layer.w2_input_scale.max(), requires_grad=False
+            )
+
+        if is_fp8_fnuz():
+            # Normalize the weights and scales
+            w13_weight, w13_weight_scale, w13_input_scale = (
+                normalize_e4m3fn_to_e4m3fnuz(
+                    layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+                )
+            )
+            w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+            )
+            # Reset the parameter
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w13_weight_scale = torch.nn.Parameter(
+                w13_weight_scale, requires_grad=False
+            )
+            if w13_input_scale is not None:
+                layer.w13_input_scale = torch.nn.Parameter(
+                    w13_input_scale, requires_grad=False
+                )
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            layer.w2_weight_scale = torch.nn.Parameter(
+                w2_weight_scale, requires_grad=False
+            )
+            if w2_input_scale is not None:
+                layer.w2_input_scale = torch.nn.Parameter(
+                    w2_input_scale, requires_grad=False
+                )
+        if self.weight_quant.strategy == QuantizationStrategy.TENSOR:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max then dequant and requant each expert.
+            assert layer.w13_weight_scale is not None
+            shard_size = layer.intermediate_size_per_partition
+            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+            for expert_id in range(layer.num_local_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
+                    )
+                    (
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        _,
+                    ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+
+                    start += shard_size
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )
+
+        if _use_aiter:
+            with torch.no_grad():
+                # Pre-shuffle weights
+                layer.w13_weight = torch.nn.Parameter(
+                    shuffle_weight(layer.w13_weight.data, (16, 16)),
+                    requires_grad=False,
+                )
+                torch.cuda.empty_cache()
+                layer.w2_weight = torch.nn.Parameter(
+                    shuffle_weight(layer.w2_weight.data, (16, 16)),
+                    requires_grad=False,
+                )
+                torch.cuda.empty_cache()
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+
+        if (
+            _use_aiter
+            and self.weight_quant.strategy == QuantizationStrategy.CHANNEL
+            and moe_runner_config.apply_router_weight_on_input
+        ):
+            topk_weights, topk_ids, _ = topk_output
+            output = rocm_fused_experts_tkw1(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                activation=moe_runner_config.activation,
+                apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
+                use_fp8_w8a8=True,
+                per_channel_quant=self.weight_quant.strategy
+                == QuantizationStrategy.CHANNEL,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+            )
+            return StandardCombineInput(hidden_states=output)
+        else:
+            quant_info = TritonMoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                use_fp8_w8a8=True,
+                per_channel_quant=self.weight_quant.strategy
+                == QuantizationStrategy.CHANNEL,
+                w13_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                a13_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+            )
+            return self.runner.run(dispatch_output, quant_info)
+
+
+class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
+
+    def __init__(self, quant_config: CompressedTensorsConfig):
+        self.quant_config = quant_config
+        # TODO: @dsikka: refactor this to use schemes as other kernels
+        # are supported + check if the layer is being ignored.
+        config = self.quant_config.target_scheme_map["Linear"].get("weights")
+        self.num_bits = config.num_bits
+        self.packed_factor = 32 // config.num_bits
+        self.strategy = config.strategy
+        self.group_size = config.group_size
+        self.actorder = config.actorder
+        assert config.symmetric, "Only symmetric quantization is supported for MoE"
+
+        if not (
+            self.quant_config.quant_format == CompressionFormat.pack_quantized.value
+            and self.num_bits in WNA16_SUPPORTED_BITS
+        ):
+            raise ValueError(
+                "For Fused MoE layers, only ",
+                f"{CompressionFormat.pack_quantized.value} ",
+                "is supported for the following bits: ",
+                f"{WNA16_SUPPORTED_BITS}",
+            )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        assert (
+            params_dtype == torch.float16
+        ), "float16 is required for MoE compressed models. Set dtype=torch.float16"  # noqa: E501
+
+        # Will transpose the loaded weight along the
+        # intermediate and hidden dim sizes. Will
+        # shard for TP along the transposed dims
+        extra_weight_attrs.update(
+            {"is_transposed": True, "quant_method": self.strategy}
+        )
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.packed_factor,
+                2 * intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_packed", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition // self.packed_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_packed", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # In the case where we have actorder/g_idx,
+        # we do not partition the w2 scales
+        load_full_w2 = self.actorder and self.group_size != -1
+
+        if load_full_w2:
+            w2_scales_size = intermediate_size_per_partition * layer.moe_tp_size
+        else:
+            w2_scales_size = intermediate_size_per_partition
+
+        self.is_k_full = (not self.actorder) or layer.moe_tp_size == 1
+
+        if self.strategy == "channel":
+            num_groups_w2 = num_groups_w13 = 1
+            self.group_size = -1
+        else:
+            num_groups_w2 = w2_scales_size // self.group_size
+            num_groups_w13 = hidden_size // self.group_size
+
+        w13_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_scale)
+        set_weight_attrs(w13_scale, extra_weight_attrs)
+
+        w2_scale = torch.nn.Parameter(
+            torch.ones(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_scale)
+        set_weight_attrs(w2_scale, extra_weight_attrs)
+        set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2})
+
+        w2_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
+        layer.register_parameter("w2_weight_shape", w2_weight_shape)
+        set_weight_attrs(w2_weight_shape, extra_weight_attrs)
+        w13_weight_shape = torch.nn.Parameter(
+            torch.empty(num_experts, 2), requires_grad=False
+        )
+
+        layer.register_parameter("w13_weight_shape", w13_weight_shape)
+        set_weight_attrs(w13_weight_shape, extra_weight_attrs)
+
+        w13_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_g_idx", w13_g_idx)
+        set_weight_attrs(w13_g_idx, extra_weight_attrs)
+
+        w2_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_g_idx", w2_g_idx)
+        set_weight_attrs(w2_g_idx, extra_weight_attrs)
+
+        w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+
+        w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
+        layer.a13_scale = None
+        layer.a2_scale = None
+        layer.marlin_state = GPTQMarlinState.REPACK
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        def replace_tensor(name, new_t):
+            # It is important to use resize_() here since it ensures
+            # the same buffer is reused
+            getattr(layer, name).resize_(new_t.shape)
+            getattr(layer, name).copy_(new_t)
+            del new_t
+
+        def get_scale_perms(num_bits: int):
+            scale_perm: List[int] = []
+            for i in range(8):
+                scale_perm.extend([i + 8 * j for j in range(8)])
+            scale_perm_single: List[int] = []
+            for i in range(4):
+                scale_perm_single.extend(
+                    [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]
+                )
+            return scale_perm, scale_perm_single
+
+        def marlin_permute_scales(
+            s: torch.Tensor, size_k: int, size_n: int, group_size: int, num_bits: int
+        ):
+            scale_perm, scale_perm_single = get_scale_perms(num_bits)
+            if group_size < size_k and group_size != -1:
+                s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
+            else:
+                s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+            s = s.reshape((-1, size_n)).contiguous()
+            return s
+
+        def marlin_moe_permute_scales(
+            s: torch.Tensor, size_k: int, size_n: int, group_size: int, num_bits: int
+        ):
+            num_experts = s.shape[0]
+            output = torch.empty(
+                (num_experts, s.shape[1], s.shape[2]), device=s.device, dtype=s.dtype
+            )
+            for e in range(num_experts):
+                output[e] = marlin_permute_scales(
+                    s[e], size_k, size_n, group_size, num_bits
+                )
+            return output
+
+        size_k2 = layer.w2_weight_packed.shape[2]
+        size_k13 = layer.w13_weight_packed.shape[2]
+
+        num_experts = layer.w13_weight_g_idx.shape[0]
+        device = layer.w13_weight_g_idx.device
+
+        # when running models with grouped act order,
+        # resort to g_idx values provided in checkpoint
+        if self.actorder == "group":
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx)
+
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_weight_g_idx[e]).to(
+                    torch.int32
+                )
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_weight_g_idx[e]).to(
+                    torch.int32
+                )
+                w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][
+                    w13_g_idx_sort_indices[e]
+                ]
+                w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][w2_g_idx_sort_indices[e]]
+
+            replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx)
+            replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx)
+            replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+            replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+
+        else:
+            layer.w13_weight_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_weight_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+
+        from vllm import _custom_ops as vllm_ops
+
+        marlin_w13_qweight = vllm_ops.gptq_marlin_moe_repack(
+            layer.w13_weight_packed,
+            layer.w13_g_idx_sort_indices,
+            layer.w13_weight_packed.shape[1] * self.packed_factor,
+            layer.w13_weight_packed.shape[2],
+            self.num_bits,
+        )
+        replace_tensor("w13_weight_packed", marlin_w13_qweight)
+        marlin_w2_qweight = vllm_ops.gptq_marlin_moe_repack(
+            layer.w2_weight_packed,
+            layer.w2_g_idx_sort_indices,
+            layer.w2_weight_packed.shape[1] * self.packed_factor,
+            layer.w2_weight_packed.shape[2],
+            self.num_bits,
+        )
+        replace_tensor("w2_weight_packed", marlin_w2_qweight)
+        # Repack scales
+        marlin_w13_scales = marlin_moe_permute_scales(
+            layer.w13_weight_scale,
+            size_k13,
+            layer.w13_weight_scale.shape[2],
+            self.group_size,
+            self.num_bits,
+        )
+        replace_tensor("w13_weight_scale", marlin_w13_scales)
+        marlin_w2_scales = marlin_moe_permute_scales(
+            layer.w2_weight_scale,
+            layer.w2_weight_scale.shape[1]
+            * (self.group_size if self.group_size != -1 else self.packed_factor),
+            size_k2,
+            self.group_size,
+            self.num_bits,
+        )
+        replace_tensor("w2_weight_scale", marlin_w2_scales)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, router_logits = topk_output
+
+        output = torch.ops.vllm.fused_marlin_moe(
+            x,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            layer.w13_weight_scale,
+            layer.w2_weight_scale,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            g_idx1=layer.w13_weight_g_idx,
+            g_idx2=layer.w2_weight_g_idx,
+            sort_indices1=layer.w13_g_idx_sort_indices,
+            sort_indices2=layer.w2_g_idx_sort_indices,
+            num_bits=self.num_bits,
+            is_k_full=self.is_k_full,
+        )
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
new file mode 100644
index 000000000..c94575316
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from .compressed_tensors_scheme import CompressedTensorsScheme
+from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
+from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
+
+__all__ = [
+    "CompressedTensorsScheme",
+    "CompressedTensorsW8A8Fp8",
+    "CompressedTensorsW8A16Fp8",
+]
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
new file mode 100644
index 000000000..3795d0a54
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -0,0 +1,56 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import torch
+
+__all__ = ["CompressedTensorsScheme"]
+
+
+class CompressedTensorsScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by CompressedTensors.
+    """
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
new file mode 100644
index 000000000..af4f1a0e0
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -0,0 +1,153 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, List, Optional
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme,
+)
+from sglang.srt.layers.quantization.utils import convert_to_channelwise
+
+try:
+    from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+        apply_fp8_marlin_linear,
+        prepare_fp8_layer_for_marlin,
+    )
+
+    MARLIN_FP8_AVAILABLE = True
+except ImportError:
+    MARLIN_FP8_AVAILABLE = False
+
+    def apply_fp8_marlin_linear(*args, **kwargs):
+        raise ImportError("vllm is not installed")
+
+    def prepare_fp8_layer_for_marlin(*args, **kwargs):
+        raise ImportError("vllm is not installed")
+
+
+__all__ = ["CompressedTensorsW8A16Fp8"]
+
+SUPPORTED_STRATEGIES = [QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR]
+
+
+class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
+    def __init__(self, strategy: str, is_static_input_scheme: bool):
+        self.strategy = strategy
+        self.is_static_input_scheme = is_static_input_scheme
+
+        if not MARLIN_FP8_AVAILABLE:
+            raise ImportError(
+                "vllm is not installed. To use CompressedTensorsW8A16Fp8, please install vllm"
+            )
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # ampere and up
+        return 80
+
+    # W8A8-Fp8 kernels support only per-tensor and per-channel cases.
+    # So if we have a fused module (QKV, MLP) with per tensor scales,
+    # we expand each scale to its shard's channels.
+    def process_weights_after_loading(self, layer) -> None:
+        if self.strategy == QuantizationStrategy.TENSOR:
+            ws_channelwise = convert_to_channelwise(
+                layer.weight_scale, layer.logical_widths
+            )
+            layer.weight_scale = torch.nn.Parameter(ws_channelwise, requires_grad=False)
+        else:
+            # required by torch.compile to be torch.nn.Parameter
+            layer.weight_scale = torch.nn.Parameter(
+                layer.weight_scale.data, requires_grad=False
+            )
+
+        # Weights must be transposed for marlin
+        layer.weight = torch.nn.Parameter(layer.weight.t(), requires_grad=False)
+
+        if self.is_static_input_scheme:
+            # required by torch.compile to be torch.nn.Parameter
+            layer.input_scale = torch.nn.Parameter(
+                layer.input_scale.data, requires_grad=False
+            )
+        prepare_fp8_layer_for_marlin(layer, size_k_first=True)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size: int,
+        output_partition_sizes: List[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+        elif self.strategy == QuantizationStrategy.TENSOR:
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported weight strategy={self.strategy}, "
+                f"supported strategies are {SUPPORTED_STRATEGIES}"
+            )
+
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE (to deal with converted checkpoints)
+        if self.is_static_input_scheme:
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("input_scale", input_scale)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_fp8_marlin_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            workspace=layer.workspace,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            bias=bias,
+        )
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
new file mode 100644
index 000000000..a157ebc3e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -0,0 +1,172 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, List, Optional
+
+import torch
+from compressed_tensors.quantization import QuantizationStrategy
+from torch.nn import Parameter
+
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.layers.quantization.utils import requantize_with_max_scale
+from sglang.srt.utils import get_bool_env_var, is_hip
+
+__all__ = ["CompressedTensorsW8A8Fp8"]
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+if _use_aiter:
+    from aiter.ops.shuffle import shuffle_weight
+
+
+class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
+
+    def __init__(self, strategy: str, is_static_input_scheme: bool):
+        self.strategy = strategy
+        self.is_static_input_scheme = is_static_input_scheme
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # lovelace and up
+        return 89
+
+    def process_weights_after_loading(self, layer) -> None:
+        # If per tensor, when we have a fused module (e.g. QKV) with per
+        # tensor scales (thus N scales being passed to the kernel),
+        # requantize so we can always run per tensor
+        if self.strategy == QuantizationStrategy.TENSOR:
+            max_w_scale, weight = requantize_with_max_scale(
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                logical_widths=layer.logical_widths,
+            )
+
+            if is_fp8_fnuz():
+                input_scale = getattr(layer, "input_scale", None)
+
+                weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight, weight_scale=max_w_scale, input_scale=input_scale
+                )
+                if input_scale is not None:
+                    layer.input_scale = Parameter(input_scale, requires_grad=False)
+
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+        # If channelwise, scales are already lined up, so just transpose.
+        elif self.strategy == QuantizationStrategy.CHANNEL:
+            weight = layer.weight
+
+            if is_fp8_fnuz():
+                input_scale = getattr(layer, "input_scale", None)
+
+                weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight,
+                    weight_scale=layer.weight_scale,
+                    input_scale=input_scale,
+                )
+                if input_scale is not None:
+                    layer.input_scale = Parameter(input_scale, requires_grad=False)
+            else:
+                weight_scale = layer.weight_scale.data
+
+            if _use_aiter:
+                layer.weight = Parameter(
+                    shuffle_weight(weight, (16, 16)), requires_grad=False
+                )
+            else:
+                layer.weight = Parameter(weight.t(), requires_grad=False)
+
+            # required by torch.compile to be torch.nn.Parameter
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+        else:
+            raise ValueError(f"Unknown quantization strategy {self.strategy}")
+
+        # INPUT SCALE
+        if self.is_static_input_scheme and hasattr(layer, "input_scale"):
+            layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
+        else:
+            layer.input_scale = None
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: List[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        # TODO: update create_xxx_parameter functions to return
+        # the newly added parameters
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+        else:
+            assert self.strategy == QuantizationStrategy.TENSOR
+            weight_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+
+        # min requirement for fp8 kernels
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            input_scale[:] = torch.finfo(torch.float32).min
+            layer.register_parameter("input_scale", input_scale)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            use_per_token_if_dynamic=True,
+            compressed_tensor_quant=True,
+        )
diff --git a/python/sglang/srt/layers/quantization/compressed_tensors/utils.py b/python/sglang/srt/layers/quantization/compressed_tensors/utils.py
new file mode 100644
index 000000000..ddefa5ea3
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/utils.py
@@ -0,0 +1,218 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+from types import MappingProxyType
+from typing import Iterable, List, Mapping, Optional
+
+from compressed_tensors import CompressionFormat
+from torch.nn import Module
+
+
+def is_activation_quantization_format(format: str) -> bool:
+    _ACTIVATION_QUANTIZATION_FORMATS = [
+        CompressionFormat.naive_quantized.value,
+        CompressionFormat.int_quantized.value,
+        CompressionFormat.float_quantized.value,
+    ]
+    return format in _ACTIVATION_QUANTIZATION_FORMATS
+
+
+def should_ignore_layer(
+    layer_name: Optional[str],
+    ignore: Iterable[str] = tuple(),
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({}),
+) -> bool:
+    if layer_name is None:
+        return False
+
+    # layer_name = model.layers.0.self_attn.qkv_proj
+    # proj_name = qkv_proj
+    proj_name = layer_name.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping and layer_name not in ignore:
+        shard_proj_names = fused_mapping[proj_name]
+
+        # Convert fused_name --> [shard_names]
+        shard_names = [
+            layer_name.replace(proj_name, shard_proj_name)
+            for shard_proj_name in shard_proj_names
+        ]
+
+        # Layer should be ignored if shards are ignored.
+        should_ignore_layer = None
+        for shard_name in shard_names:
+            should_ignore_shard = check_equal_or_regex_match(
+                layer_name=shard_name, targets=ignore
+            )
+
+            # If shard_idx=0, set layer ignore to match shard.
+            if should_ignore_layer is None:
+                should_ignore_layer = should_ignore_shard
+
+            # If shard_idx=1+ confirm scheme matches prior shards.
+            elif should_ignore_shard != should_ignore_layer:
+                raise ValueError(
+                    f"Found a different quantization schemes for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
+
+    # Unfused layers like down_proj and o_proj will match
+    # the safetensors checkpoint already.
+    else:
+        should_ignore_layer = check_equal_or_regex_match(
+            layer_name=layer_name, targets=ignore
+        )
+
+    assert should_ignore_layer is not None
+    return should_ignore_layer
+
+
+def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool:
+    """
+    Checks whether a layer_name is exactly equal or a regex match for
+    if target starts with 're:' to any target in list.
+    """
+    for target in targets:
+        if _is_equal_or_regex_match(layer_name, target):
+            return True
+    return False
+
+
+def find_matched_target(
+    layer_name: Optional[str],
+    module: Module,
+    targets: Iterable[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({}),
+) -> str:
+    """
+    Helper function to look up which "target" in the compressed-tensors
+    config that a layer corresponds to.
+
+    Recall that a compressed-tensors configs has a concept of
+    config_groups, where each layer can be quantized with with a different
+    scheme.
+
+    targets in each config_group will be a list of either layer names
+    (or regexes corresponding to layer names) or names of torch Modules.
+
+    First, we try to match the layer_name with a target
+    Second, we try to match the module's name with a target
+    Third, we try to map the layer_name to a list of fused module names.
+        *All* component module names must match in order for a match to be
+        successful. A successful match returns the first component target
+
+    :param layer_name: layer name
+    :param module: torch.nn.Module
+    :param targets: list of targets to match the layer against
+    :param fused_mapping: map from fused layer names to its components
+    :param fused_strategy: either "all" or "any". If using "all", fused
+        layers match if "all" of its components match
+    """
+
+    if layer_name is None:
+        layer_name = ""
+
+    matched_target = (
+        _find_first_match(layer_name, targets)
+        or _find_first_match(module.__class__.__name__, targets, True)
+        or _match_fused_layer(layer_name, targets, fused_mapping)
+    )
+
+    if matched_target is None:
+        raise ValueError(
+            f"Unable to find matching target for {layer_name} in the "
+            "compressed-tensors config."
+        )
+
+    return matched_target
+
+
+def _find_first_match(
+    value: str, targets: Iterable[str], check_contains: bool = False
+) -> Optional[str]:
+    """
+    Returns first element of target that matches value either
+    exactly or as a regex after 're:'. If check_contains is set to True,
+    additionally checks if the target string is contained within the value.
+
+    :param value: string to compare the list of targets against
+    :param targets: list of targets to match the layer against
+    :param check_contains: whether or not to do a substring match
+    """
+
+    for target in targets:
+        if _is_equal_or_regex_match(value, target, check_contains=check_contains):
+            return target
+    return None
+
+
+def _is_equal_or_regex_match(
+    value: str, target: str, check_contains: bool = False
+) -> bool:
+    """
+    Checks whether a value is exactly equal or a regex match for target
+    if target starts with 're:'. If check_contains is set to True,
+    additionally checks if the target string is contained within the value.
+    """
+
+    if target.startswith("re:"):
+        pattern = target[3:]
+        if re.match(pattern, value):
+            return True
+    elif check_contains:
+        if target.lower() in value.lower():
+            return True
+    elif target == value:
+        return True
+    return False
+
+
+def _match_fused_layer(
+    layer_name: str,
+    target_layers: Iterable[str],
+    fused_mapping: Mapping[str, List[str]],
+) -> Optional[str]:
+    """
+    Match a fused layer name to its corresponding individual layer in
+    target_layers. Returns first value in fused_mapping which matches targets
+
+    Implements an "all" matching strategy where a fused layer matches iff
+    "all" of its components match
+
+    :param layer_name: layer name
+    :param target_layers: list of targets to match the layer against
+    :param fused_mapping: map from fused layer names to its components
+
+    Examples:
+        layer_name = "model.layers.0.self_attn.qkv_proj"
+        target_layers = ["model.layers.0.self_attn.q_proj",
+                        "model.layers.0.self_attn.k_proj",
+                        "model.layers.0.self_attn.v_proj"]
+    """
+    # find layer_name in mapping
+    fused = next((key for key in fused_mapping if layer_name.endswith(key)), None)
+    if fused is None:
+        return None
+
+    # expand path of unfused components
+    unfused_paths = [
+        layer_name.replace(fused, unfused) for unfused in fused_mapping[fused]
+    ]
+
+    # for each unfused component, find a match in targets
+    unfused_matches: List[Optional[str]] = []
+    for unfused in unfused_paths:
+        for target in target_layers:
+            if _is_equal_or_regex_match(unfused, target):
+                unfused_matches.append(target)
+                break
+        else:
+            unfused_matches.append(None)
+
+    return unfused_matches[0] if all(unfused_matches) else None
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6ae89c757
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..ae4c397ef
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6496a38fb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..effb7bda7
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..eb1848894
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c098ef2db
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c098ef2db
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3633e363f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..653bb997e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..31d29e10c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3618053b6
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..cb573ccdf
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..46a982f5e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a5679be13
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..7d8044772
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..eb11c1696
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..035ec027f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..62ccf8e6f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..8b49f2781
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dd66019cc
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5011fd572
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1957a7f32
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..851bc9f9f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6c2eebe0f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..d1227c215
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..448daf360
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6f5adbb93
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6f5adbb93
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6f5adbb93
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..7b10d6d78
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e76d41dfd
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..77ba0d747
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1c61451fb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..66ca4ee16
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..48b64a87d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..63e661c80
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..d521e2bd0
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..d72288781
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c65a16f3d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..cf3540379
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a8ab31bfe
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1477e063d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6f5adbb93
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6f5adbb93
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..80d23e969
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..0a5d7bfdb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..970345270
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..eccb86a76
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..cb91a279d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..88af48431
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a794fc821
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dd069726d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..4225c78eb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..4225c78eb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..4225c78eb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..933170b7b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..797ee3d45
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..7febe3d27
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..56b939e52
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1c36fe951
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..30932a1cc
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..63d9a0bf5
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..65989f012
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..7fa398c15
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..f15d8f64c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..4225c78eb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..4225c78eb
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a5518d979
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9d7658bfc
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..cd3e07804
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..932321cca
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9d5a329d7
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5e6789d00
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5e6789d00
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..44c67c008
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..03dba5ad1
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..96e1594a3
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..017dca003
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5ffd367df
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..49ac14d2a
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..49ac14d2a
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..09502f05c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9a5ff48b8
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b57588f9b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..eabc42394
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dcbb0efc5
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dcbb0efc5
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..7fd942e3e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..13e3c20b7
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..ecbca9d8d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..386928de1
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..51e237b91
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a8ab31bfe
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..22ca71e2c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6280219c9
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..78db73078
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5e40bbe64
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..044868d2d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..40c01c0b9
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b96981dec
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c6fd36597
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..f4e026497
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..246062a22
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..0b24efa37
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..160f12ed3
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a3a434e12
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e5c4a1d2c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..2f3962f68
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..bfdff8adc
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..d694716c3
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..2bf5eb27e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..871a0cf2e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..f211a5d38
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dfe5c1e43
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dfe5c1e43
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dfe5c1e43
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b2cab0831
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9de2ba9c8
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..9c908e804
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..0a1e14cff
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c4b1bd435
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..c87cb4f95
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..15b1c93f6
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5bf208ce4
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..6ae00329e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..59b79432f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..f78e7060e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..8ff12e64c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..e997126f9
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b474e6265
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..4532f9368
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a3f5371f1
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dfe5c1e43
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..dfe5c1e43
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..72ab3dff2
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..1d3ce5c94
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..ca7f32b95
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..12e305853
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..5acea242c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a87f5de1b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a87f5de1b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..37208577d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3ab5796ee
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..58cdd93e9
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..a5ce5e49b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..b72e0371d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..468f9e78d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..468f9e78d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..d8cc0f896
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..3cb7eaa07
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..129f59e0c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000..293adce38
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py
new file mode 100644
index 000000000..32f3ebe04
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py
@@ -0,0 +1 @@
+from .entrypoint import *
diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py
new file mode 100644
index 000000000..e374759c4
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py
@@ -0,0 +1,241 @@
+import logging
+import os
+from contextlib import contextmanager
+from enum import IntEnum, auto
+from typing import Dict, List, Tuple
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    ENABLE_JIT_DEEPGEMM,
+)
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import ceil_div, get_bool_env_var, get_int_env_var
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+
+
+_BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
+_ENABLE_JIT_DEEPGEMM_PRECOMPILE = get_bool_env_var(
+    "SGL_JIT_DEEPGEMM_PRECOMPILE", "true"
+)
+_DO_COMPILE_ALL = True
+_IS_FIRST_RANK_ON_NODE = get_bool_env_var("SGL_IS_FIRST_RANK_ON_NODE", "true")
+_COMPILE_WORKERS = get_int_env_var("SGL_JIT_DEEPGEMM_COMPILE_WORKERS", 4)
+_IN_PRECOMPILE_STAGE = get_bool_env_var("SGL_IN_DEEPGEMM_PRECOMPILE_STAGE", "false")
+
+# Force redirect deep_gemm cache_dir
+os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
+    "SGL_DG_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "deep_gemm")
+)
+
+# Refer to https://github.com/deepseek-ai/DeepGEMM/commit/d75b218b7b8f4a5dd5406ac87905039ead3ae42f
+# NVRTC may have performance loss with some cases.
+# And NVCC JIT speed is also 9x faster in the ref commit
+os.environ["DG_JIT_USE_NVRTC"] = os.getenv("SGL_DG_USE_NVRTC", "0")
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    global _BUILTIN_M_LIST
+    global _DO_COMPILE_ALL
+    global _IS_FIRST_RANK_ON_NODE
+
+    # Generate m_max
+    m_max = 1024 * 16
+    if server_args.chunked_prefill_size < 1:
+        m_max = 1024 * 64
+    elif server_args.chunked_prefill_size > 8192:
+        m_max = server_args.chunked_prefill_size * 2
+    m_max = min(1024 * 128, m_max)
+    _BUILTIN_M_LIST = list(range(1, m_max + 1))
+
+    _IS_FIRST_RANK_ON_NODE = ServerArgs.base_gpu_id == gpu_id
+
+    # Check if is the first rank on node.
+    # Default each rank will try compile all Ms to
+    # load all symbols at the launch stages.
+    # Avoid loading symbols at the serving stages.
+    _DO_COMPILE_ALL = _IS_FIRST_RANK_ON_NODE
+
+
+class DeepGemmKernelType(IntEnum):
+    GROUPED_GEMM_NT_F8F8BF16_MASKED = auto()
+    GROUPED_GEMM_NT_F8F8BF16_CONTIG = auto()
+    GEMM_NT_F8F8BF16 = auto()
+
+
+_INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
+
+
+# TODO improve code
+def _maybe_compile_deep_gemm_one_type_all(
+    kernel_type: DeepGemmKernelType,
+    n: int,
+    k: int,
+    num_groups: int,
+) -> None:
+    global _INITIALIZATION_DICT
+    global _BUILTIN_M_LIST
+
+    query_key = (kernel_type, n, k, num_groups)
+    if (
+        _ENABLE_JIT_DEEPGEMM_PRECOMPILE
+        and _DO_COMPILE_ALL
+        and _INITIALIZATION_DICT.get(query_key) is None
+    ):
+        _INITIALIZATION_DICT[query_key] = True
+
+        # TODO maybe improve logs
+        if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
+            logger.warning(
+                "Entering DeepGEMM JIT Pre-Compile session. "
+                "It may take a long time (typically 10-20 mins) "
+                "if you have not run `sglang.compile_deep_gemm`. "
+                "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
+                " for pre-compilation to reduce the overhead if you have not run it before. "
+                "For example: "
+                "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
+            )
+
+        logger.info(
+            f"Try DeepGEMM JIT Compiling for "
+            f"<{kernel_type.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
+            f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
+        )
+
+        _compile_deep_gemm_one_type_all(
+            kernel_type=kernel_type,
+            n=n,
+            k=k,
+            num_groups=num_groups,
+            m_list=_BUILTIN_M_LIST,
+        )
+
+
+# NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
+def _compile_deep_gemm_one_type_all(
+    kernel_type: DeepGemmKernelType,
+    n: int,
+    k: int,
+    num_groups: int,
+    m_list: List[int],
+) -> None:
+    if kernel_type == DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG:
+        m_alignment = deep_gemm.get_mk_alignment_for_contiguous_layout()
+        m_list = sorted(list(set(m for m in m_list if m % m_alignment == 0)))
+
+    executor = _BaseWarmupExecutor.create(
+        kernel_type, max_m=max(m_list), n=n, k=k, num_groups=num_groups
+    )
+
+    old_compile_mode = deep_gemm.get_compile_mode()
+    deep_gemm.set_compile_mode(1)
+    # TODO can use multi thread
+    for m in tqdm(m_list, desc=f"DeepGEMM warmup"):
+        executor.execute(m=m)
+    deep_gemm.set_compile_mode(old_compile_mode)
+
+    # clean up input buffers
+    torch.cuda.current_stream().synchronize()
+    del executor
+    torch.cuda.empty_cache()
+
+
+class _BaseWarmupExecutor:
+    @staticmethod
+    def create(kernel_type: DeepGemmKernelType, **kwargs):
+        return {
+            DeepGemmKernelType.GEMM_NT_F8F8BF16: _NormalWarmupExecutor,
+            DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG: _GroupedContWarmupExecutor,
+            DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: _GroupedMaskedWarmupExecutor,
+        }[kernel_type](**kwargs)
+
+    def execute(self, m):
+        raise NotImplementedError
+
+
+def _empty_token_fp8(size):
+    *dims, k = size
+    return (
+        torch.empty(size, device="cuda", dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (*dims, ceil_div(k, _BLOCK_SIZE)), device="cuda", dtype=torch.float32
+        ),
+    )
+
+
+def _empty_block_fp8(size):
+    *dims, n, k = size
+    return (
+        torch.empty(size, device="cuda", dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (*dims, ceil_div(n, _BLOCK_SIZE), ceil_div(k, _BLOCK_SIZE)),
+            device="cuda",
+            dtype=torch.float32,
+        ),
+    )
+
+
+_BLOCK_SIZE = 128
+
+
+class _NormalWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((n, k))
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.bfloat16)
+
+    def execute(self, m):
+        deep_gemm.fp8_gemm_nt(
+            (self.lhs_q[:m], self.lhs_s[:m]),
+            (self.rhs_q, self.rhs_s),
+            self.out[:m],
+        )
+
+
+class _GroupedContWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((num_groups, n, k))
+        self.m_indices = torch.zeros((max_m,), device="cuda", dtype=torch.int32)
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.bfloat16)
+
+    def execute(self, m):
+        deep_gemm.m_grouped_fp8_gemm_nt_contiguous(
+            (self.lhs_q[:m], self.lhs_s[:m]),
+            (self.rhs_q, self.rhs_s),
+            self.out[:m],
+            m_indices=self.m_indices[:m],
+        )
+
+
+class _GroupedMaskedWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((num_groups, max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((num_groups, n, k))
+        self.masked_m = torch.zeros((num_groups,), device="cuda", dtype=torch.int32)
+        self.out = torch.empty(
+            (num_groups, max_m, n), device="cuda", dtype=torch.bfloat16
+        )
+
+    def execute(self, m):
+        deep_gemm.fp8_m_grouped_gemm_nt_masked(
+            (self.lhs_q, self.lhs_s),
+            (self.rhs_q, self.rhs_s),
+            self.out,
+            masked_m=self.masked_m,
+            # DeepGEMM uses `expect_m` instead of input shape for `get_best_config`
+            expected_m=m,
+        )
+
+
+@contextmanager
+def deep_gemm_execution_hook(
+    m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
+):
+    _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+    yield
diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
new file mode 100644
index 000000000..ecf7d1647
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
@@ -0,0 +1,32 @@
+import logging
+
+import torch
+
+from sglang.srt.utils import get_bool_env_var, get_device_sm
+
+logger = logging.getLogger(__name__)
+
+
+def _compute_enable_deep_gemm():
+    sm_version = get_device_sm()
+    if sm_version < 90:
+        return False
+
+    try:
+        import deep_gemm
+    except ImportError:
+        logger.warning("Failed to import deep_gemm, disable ENABLE_JIT_DEEPGEMM.")
+        return False
+
+    return get_bool_env_var("SGL_ENABLE_JIT_DEEPGEMM", default="true")
+
+
+def _is_blackwell_arch() -> bool:
+    major, minor = torch.cuda.get_device_capability(torch.cuda.current_device())
+    return major == 10
+
+
+ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+
+DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and _is_blackwell_arch()
+DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
diff --git a/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
new file mode 100644
index 000000000..02945f449
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py
@@ -0,0 +1,119 @@
+import logging
+from contextlib import contextmanager
+from typing import Tuple
+
+import torch
+
+from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+    DEEPGEMM_BLACKWELL,
+    DEEPGEMM_SCALE_UE8M0,
+    ENABLE_JIT_DEEPGEMM,
+)
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import get_bool_env_var
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+    from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor
+
+_SANITY_CHECK = get_bool_env_var("SGLANG_DEEPGEMM_SANITY_CHECK")
+
+
+# TODO maybe rename these functions
+def grouped_gemm_nt_f8f8bf16_masked(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m: int,
+):
+    num_groups, _, k = lhs[0].shape
+    _, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
+
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
+    with compile_utils.deep_gemm_execution_hook(
+        expected_m, n, k, num_groups, kernel_type
+    ):
+        deep_gemm.fp8_m_grouped_gemm_nt_masked(
+            lhs,
+            rhs,
+            out,
+            masked_m,
+            expected_m,
+        )
+
+
+def grouped_gemm_nt_f8f8bf16_contig(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    m_indices: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    num_groups, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
+
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        deep_gemm.m_grouped_fp8_gemm_nt_contiguous(lhs, rhs, out, m_indices)
+
+
+def gemm_nt_f8f8bf16(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    n, _ = rhs[0].shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_F8F8BF16
+
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        deep_gemm.fp8_gemm_nt(
+            lhs,
+            rhs,
+            out,
+        )
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    compile_utils.update_deep_gemm_config(gpu_id, server_args)
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)
+
+
+def _sanity_check_input(x_fp8: Tuple[torch.Tensor, torch.Tensor]):
+    if not _SANITY_CHECK:
+        return
+
+    x, x_scale = x_fp8
+
+    if x_scale.dtype == torch.int:
+        return
+
+    from sglang.srt.layers.quantization.fp8_utils import ceil_to_ue8m0
+
+    x_scale_ceil = ceil_to_ue8m0(x_scale)
+    assert torch.all(x_scale == x_scale_ceil), f"{x_scale=} {x_scale_ceil=}"
diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py
new file mode 100644
index 000000000..b020e4188
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -0,0 +1,1240 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+try:
+    from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+        apply_fp8_marlin_linear,
+        prepare_fp8_layer_for_marlin,
+    )
+
+    MARLIN_FP8_AVAILABLE = True
+except ImportError:
+    MARLIN_FP8_AVAILABLE = False
+
+    def dummy_func(*args, **kwargs):
+        raise ImportError(
+            "marlin FP8 requires some operators from vllm. Please install vllm."
+        )
+
+    apply_fp8_marlin_linear = prepare_fp8_layer_for_marlin = dummy_func
+
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.moe.token_dispatcher.base import DispatchOutputChecker
+from sglang.srt.layers.parameter import (
+    BlockQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    is_fp8_fnuz,
+    per_token_group_quant_fp8,
+    scaled_fp8_quant,
+)
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    can_auto_enable_marlin_fp8,
+    cutlass_fp8_supported,
+    dispatch_w8a8_block_fp8_linear,
+    input_to_float8,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import (
+    all_close_1d,
+    convert_to_channelwise,
+    is_layer_skipped,
+    per_tensor_dequantize,
+    requantize_with_max_scale,
+)
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    is_sm90_supported,
+    is_sm100_supported,
+    log_info_on_rank0,
+    next_power_of_2,
+    print_warning_once,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        DispatchOutput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.layers.moe.topk import TopKOutput
+    from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+
+_is_fp8_fnuz = is_fp8_fnuz()
+
+_use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _is_hip and (_use_aiter or _use_hip_int4):
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+    from aiter.ops.shuffle import shuffle_weight
+
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = logging.getLogger(__name__)
+
+
+class Fp8Config(QuantizationConfig):
+    """Config class for FP8."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = False,
+        activation_scheme: str = "dynamic",
+        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: List[int] = None,
+    ) -> None:
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        if is_checkpoint_fp8_serialized:
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
+        if activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
+        self.activation_scheme = activation_scheme
+        self.ignored_layers = ignored_layers or []
+        if weight_block_size is not None:
+            if not is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    f"The block-wise quantization only supports fp8-serialized checkpoint for now."
+                )
+            if len(weight_block_size) != 2:
+                raise ValueError(
+                    f"The quantization block size of weight must have 2 dimensions, but got {len(weight_block_size)} dimensions."
+                )
+            if activation_scheme != "dynamic":
+                raise ValueError(
+                    f"The block-wise quantization only supports dynamic activation scheme for now, but got {activation_scheme} activation scheme."
+                )
+        self.weight_block_size = weight_block_size
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "fp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> Fp8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = "fp8" in quant_method
+        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
+        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None)
+        return cls(
+            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+            activation_scheme=activation_scheme,
+            ignored_layers=ignored_layers,
+            weight_block_size=weight_block_size,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.ignored_layers):
+                return UnquantizedLinearMethod()
+            return Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return Fp8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class Fp8LinearMethod(LinearMethodBase):
+    """Linear method for FP8.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+
+    Limitations:
+    1. Only support per-tensor quantization due to torch._scaled_mm support.
+    2. Only support float8_e4m3fn data type due to the limitation of
+       torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: Union[Fp8Config, W4AFp8Config]):
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
+        # kernel for fast weight-only FP8 quantization
+        self.use_marlin = False
+        if _is_cuda and MARLIN_FP8_AVAILABLE:
+            force_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN")
+            auto_enable = can_auto_enable_marlin_fp8()
+            self.use_marlin = force_marlin or auto_enable
+
+        self.block_quant = self.quant_config.weight_block_size is not None
+
+        self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        tp_size = get_tensor_model_parallel_world_size()
+        if self.block_quant:
+            block_n, block_k = (
+                self.quant_config.weight_block_size[0],
+                self.quant_config.weight_block_size[1],
+            )
+            # Required by row parallel
+            if tp_size > 1 and input_size // input_size_per_partition == tp_size:
+                if input_size_per_partition % block_k != 0:
+                    raise ValueError(
+                        f"Weight input_size_per_partition = "
+                        f"{input_size_per_partition} is not divisible by "
+                        f"weight quantization block_k = {block_k}."
+                    )
+            # Required by column parallel or enabling merged weights
+            if (
+                tp_size > 1 and output_size // output_size_per_partition == tp_size
+            ) or len(output_partition_sizes) > 1:
+                for output_partition_size in output_partition_sizes:
+                    if output_partition_size % block_n != 0:
+                        raise ValueError(
+                            f"Weight output_partition_size = "
+                            f"{output_partition_size} is not divisible by "
+                            f"weight quantization block_n = {block_n}."
+                        )
+
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
+        # WEIGHT
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=weight_dtype
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # If checkpoint is serialized fp8, load them.
+        # Otherwise, wait until process_weights_after_loading.
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALE
+            if self.block_quant:
+                if hasattr(self.quant_config, "activation_scheme"):
+                    assert self.quant_config.activation_scheme == "dynamic"
+                elif hasattr(self.quant_config, "linear_activation_scheme"):
+                    assert self.quant_config.linear_activation_scheme == "dynamic"
+                scale = BlockQuantScaleParameter(
+                    data=torch.empty(
+                        (output_size_per_partition + block_n - 1) // block_n,
+                        (input_size_per_partition + block_k - 1) // block_k,
+                        dtype=torch.float32,
+                    ),
+                    input_dim=1,
+                    output_dim=0,
+                    weight_loader=weight_loader,
+                )
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("weight_scale_inv", scale)
+            else:
+                scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("weight_scale", scale)
+
+            # INPUT ACTIVATION SCALE
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
+                scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("input_scale", scale)
+            else:
+                layer.register_parameter("input_scale", None)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        if self.block_quant:
+            # If ROCm, normalize the weights and scales to e4m3fnuz
+            if _is_fp8_fnuz:
+                # activation_scheme: dynamic
+                weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=layer.weight,
+                    weight_scale=layer.weight_scale_inv,
+                    input_scale=None,
+                )
+                layer.input_scale = None
+            elif _is_cpu:
+                assert (
+                    _is_cpu_amx_available
+                ), "Fp8LinearMethod on CPU requires that CPU has AMX support"
+                _amx_process_weight_after_loading(layer, ["weight"])
+                layer.weight_scale_inv = torch.nn.Parameter(
+                    layer.weight_scale_inv.data, requires_grad=False
+                )
+                return
+            else:
+                weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
+            layer.weight = Parameter(weight, requires_grad=False)
+            layer.weight_scale_inv = Parameter(weight_scale, requires_grad=False)
+        else:
+            layer.weight = Parameter(layer.weight.data, requires_grad=False)
+
+            # If checkpoint not serialized fp8, quantize the weights.
+            if not self.quant_config.is_checkpoint_fp8_serialized:
+                if self.cutlass_fp8_supported or self.use_marlin:
+                    # apply per-channel quantization default as
+                    # cutlass sgl-kernel and marlin only support per-channel scale
+                    qweight, weight_scale = per_token_group_quant_fp8(
+                        layer.weight, layer.weight.shape[-1]
+                    )
+                    weight_scale = weight_scale.t().contiguous()
+                else:
+                    # per-tensor quantization
+                    qweight, weight_scale = input_to_float8(layer.weight)
+
+                # Update the layer with the new values.
+                layer.weight = Parameter(qweight.t(), requires_grad=False)
+                layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+                layer.input_scale = None
+
+            # If checkpoint is fp8, handle that there are N scales for N
+            # shards in a fused module
+            else:
+                layer.weight_scale = Parameter(
+                    layer.weight_scale.data, requires_grad=False
+                )
+                if (
+                    hasattr(self.quant_config, "activation_scheme")
+                    and self.quant_config.activation_scheme == "static"
+                ) or (
+                    hasattr(self.quant_config, "linear_activation_scheme")
+                    and self.quant_config.linear_activation_scheme == "static"
+                ):
+                    layer.input_scale = Parameter(
+                        layer.input_scale.data, requires_grad=False
+                    )
+
+                # cutlass sgl-kernel and marlin only support per-channel scale
+                if self.cutlass_fp8_supported or self.use_marlin:
+                    weight = layer.weight
+                    weight_scale = convert_to_channelwise(
+                        layer.weight_scale, layer.logical_widths
+                    )
+                else:
+                    # Dequant -> Quant with max scale so we can run per tensor.
+                    weight = layer.weight
+                    weight_scale = layer.weight_scale
+                    # If ROCm, normalize the weights and scales to e4m3fnuz
+                    if _is_fp8_fnuz:
+                        weight, weight_scale, input_scale = (
+                            normalize_e4m3fn_to_e4m3fnuz(
+                                weight=weight,
+                                weight_scale=weight_scale,
+                                input_scale=layer.input_scale,
+                            )
+                        )
+                        if input_scale is not None:
+                            layer.input_scale = Parameter(
+                                input_scale, requires_grad=False
+                            )
+
+                    weight_scale, weight = requantize_with_max_scale(
+                        weight=weight,
+                        weight_scale=weight_scale,
+                        logical_widths=layer.logical_widths,
+                    )
+
+                # Update layer with new values.
+                layer.weight = Parameter(weight.t(), requires_grad=False)
+                layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+                if (
+                    hasattr(self.quant_config, "activation_scheme")
+                    and self.quant_config.activation_scheme == "static"
+                ) or (
+                    hasattr(self.quant_config, "linear_activation_scheme")
+                    and self.quant_config.linear_activation_scheme == "static"
+                ):
+                    layer.input_scale = Parameter(
+                        layer.input_scale.max(), requires_grad=False
+                    )
+
+        if self.use_marlin:
+            if self.block_quant:
+                layer.weight_block_size = self.quant_config.weight_block_size
+            prepare_fp8_layer_for_marlin(layer, not self.block_quant)
+            # Activations not quantized for marlin.
+            del layer.input_scale
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if self.use_marlin:
+            return apply_fp8_marlin_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                workspace=layer.workspace,
+                size_n=layer.output_size_per_partition,
+                size_k=layer.input_size_per_partition,
+                bias=bias,
+            )
+
+        if self.block_quant:
+            if use_intel_amx_backend(layer):
+                return torch.ops.sgl_kernel.fp8_scaled_mm_cpu(
+                    x,
+                    layer.weight,
+                    layer.weight_scale_inv,
+                    self.quant_config.weight_block_size,
+                    bias,
+                    x.dtype,
+                    True,  # is_vnni
+                )
+
+            return self.w8a8_block_fp8_linear(
+                input=x,
+                weight=layer.weight,
+                block_size=self.quant_config.weight_block_size,
+                weight_scale=layer.weight_scale_inv,
+                input_scale=None,
+                bias=bias,
+            )
+
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=False,
+        )
+
+
+def get_tile_tokens_dim(num_tokens, top_k, num_experts):
+    # Guess tokens per expert assuming perfect expert distribution first.
+    num_tokens_per_expert = (num_tokens * top_k) // num_experts
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
+    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    return tile_tokens_dim
+
+
+class Fp8MoEMethod(FusedMoEMethodBase):
+    """MoE method for FP8.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: Fp8Config):
+        self.quant_config = quant_config
+        self.block_quant = self.quant_config.weight_block_size is not None
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+        self.use_cutlass_fused_experts_fp8 = (
+            get_bool_env_var("SGLANG_CUTLASS_MOE")
+            and self.cutlass_fp8_supported
+            and self.block_quant
+            and (is_sm100_supported() or is_sm90_supported())
+        )
+
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            params_dtype = torch.uint32 if _use_hip_int4 else torch.float8_e4m3fn
+        tp_size = get_tensor_model_parallel_world_size()
+        if self.block_quant:
+            block_n, block_k = (
+                self.quant_config.weight_block_size[0],
+                self.quant_config.weight_block_size[1],
+            )
+            # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+            # Required by column parallel or enabling merged weights
+            if intermediate_size_per_partition % block_n != 0:
+                raise ValueError(
+                    f"The output_size of gate's and up's weight = "
+                    f"{intermediate_size_per_partition} is not divisible by "
+                    f"weight quantization block_n = {block_n}."
+                )
+            if tp_size > 1:
+                # Required by row parallel
+                if intermediate_size_per_partition % block_k != 0:
+                    raise ValueError(
+                        f"The input_size of down's weight = "
+                        f"{intermediate_size_per_partition} is not divisible by "
+                        f"weight quantization block_k = {block_k}."
+                    )
+
+        # WEIGHTS
+        if _is_hip and _use_hip_int4:
+            # INT4 MoE weight - INT32 packed
+            w13_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // 8,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+            w2_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // 8,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+        else:
+            w13_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+            w2_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        if self.block_quant:
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
+                    (hidden_size + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    (hidden_size + block_n - 1) // block_n,
+                    (intermediate_size_per_partition + block_k - 1) // block_k,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+            assert self.quant_config.activation_scheme == "dynamic"
+            if self.use_cutlass_fused_experts_fp8:
+                self.ab_strides1 = torch.full(
+                    (num_experts,),
+                    hidden_size,
+                    device=w13_weight.device,
+                    dtype=torch.int64,
+                )
+                self.c_strides1 = torch.full(
+                    (num_experts,),
+                    2 * intermediate_size_per_partition,
+                    device=w13_weight.device,
+                    dtype=torch.int64,
+                )
+                self.ab_strides2 = torch.full(
+                    (num_experts,),
+                    intermediate_size_per_partition,
+                    device=w2_weight.device,
+                    dtype=torch.int64,
+                )
+                self.c_strides2 = torch.full(
+                    (num_experts,),
+                    hidden_size,
+                    device=w2_weight.device,
+                    dtype=torch.int64,
+                )
+                self.workspace = torch.empty(
+                    90000, device=w13_weight.device, dtype=torch.uint8
+                )
+                self.a_ptr = torch.empty(
+                    num_experts, device=w13_weight.device, dtype=torch.int64
+                )
+                self.b_ptr = torch.empty(
+                    num_experts, device=w13_weight.device, dtype=torch.int64
+                )
+                self.out_ptr = torch.empty(
+                    num_experts, device=w13_weight.device, dtype=torch.int64
+                )
+                self.a_scales_ptr = torch.empty(
+                    num_experts, device=w13_weight.device, dtype=torch.int64
+                )
+                self.b_scales_ptr = torch.empty(
+                    num_experts, device=w13_weight.device, dtype=torch.int64
+                )
+                self.expert_offsets = torch.empty(
+                    num_experts + 1, device=w13_weight.device, dtype=torch.int32
+                )
+                self.problem_sizes1 = torch.empty(
+                    num_experts, 3, device=w13_weight.device, dtype=torch.int32
+                )
+                self.problem_sizes2 = torch.empty(
+                    num_experts, 3, device=w13_weight.device, dtype=torch.int32
+                )
+
+        else:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+            if _is_hip:  # _use_aiter: TODO: add check back after triton kernel
+                # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
+                w13_weight_scale1 = torch.nn.Parameter(
+                    torch.ones(
+                        num_experts,
+                        2 * intermediate_size_per_partition,
+                        dtype=torch.float32,
+                    ),
+                    requires_grad=False,
+                )
+                w2_weight_scale1 = torch.nn.Parameter(
+                    torch.ones(num_experts, hidden_size, dtype=torch.float32),
+                    requires_grad=False,
+                )
+                layer.register_parameter("w13_weight_scale1", w13_weight_scale1)
+                layer.register_parameter("w2_weight_scale1", w2_weight_scale1)
+
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+            if self.block_quant
+            else {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+        # If loading fp8 checkpoint, pass the weight loaders.
+        # If loading an fp16 checkpoint, do not (we will quantize in
+        #   process_weights_after_loading()
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+            set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+            if _is_hip and _use_hip_int4:
+                extra_weight_attrs.update(
+                    {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+                )
+                set_weight_attrs(w13_weight_scale1, extra_weight_attrs)
+                set_weight_attrs(w2_weight_scale1, extra_weight_attrs)
+
+        # INPUT_SCALES
+        if self.quant_config.activation_scheme == "static":
+            if not self.quant_config.is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    "Found static activation scheme for checkpoint that "
+                    "was not serialized fp8."
+                )
+
+            w13_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+            w2_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+            set_weight_attrs(w2_input_scale, extra_weight_attrs)
+
+        else:
+            layer.w13_input_scale = None
+            layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        if _is_hip and _use_hip_int4:
+            self.process_weights_hip_int4(layer)
+            return
+
+        # Block quant doesn't need to process weights after loading
+        if self.block_quant:
+            # If ROCm, normalize the weights and scales to e4m3fnuz
+            if _is_fp8_fnuz:
+                # activation_scheme: dynamic
+                w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=layer.w13_weight,
+                    weight_scale=layer.w13_weight_scale_inv,
+                    input_scale=None,
+                )
+                w2_weight, w2_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=layer.w2_weight,
+                    weight_scale=layer.w2_weight_scale_inv,
+                    input_scale=None,
+                )
+                # Reset the parameter
+                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+                layer.w13_weight_scale_inv = torch.nn.Parameter(
+                    w13_weight_scale, requires_grad=False
+                )
+                layer.w13_input_scale = None
+                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+                layer.w2_weight_scale_inv = torch.nn.Parameter(
+                    w2_weight_scale, requires_grad=False
+                )
+                layer.w2_input_scale = None
+
+            if _use_aiter:
+                # Pre-shuffle weights
+                layer.w13_weight.data = shuffle_weight(
+                    layer.w13_weight.contiguous(), (16, 16)
+                )
+                layer.w2_weight.data = shuffle_weight(
+                    layer.w2_weight.contiguous(), (16, 16)
+                )
+
+            if _is_cpu:
+                assert (
+                    _is_cpu_amx_available
+                ), "Fp8MoEMethod on CPU requires that CPU has AMX support"
+                _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+
+            return
+
+        # If checkpoint is fp16 or bfloat16, quantize in place.
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            # If ROCm, fp8_dtype will be float8_e4m3fnuz (MI300x HW)
+            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
+            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
+
+            # Re-initialize w13_scale because we directly quantize
+            # merged w13 weights and generate a single scaling factor.
+            layer.w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    layer.num_local_experts,
+                    dtype=torch.float32,
+                    device=w13_weight.device,
+                ),
+                requires_grad=False,
+            )
+            for expert in range(layer.num_local_experts):
+                w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
+                    scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
+                )
+                w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
+                    scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
+                )
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+
+            if _is_hip:
+                self.process_weights_hip_scale_padding(layer)
+            return
+
+        # If checkpoint is fp8, we need to handle that the
+        # MoE kernels require single activation scale and single weight
+        # scale for w13 per expert.
+        else:
+            # Fp8 moe kernels require a single activation scale.
+            # We take the max of all the scales in case they differ.
+            if self.quant_config.activation_scheme == "static":
+                if layer.w13_input_scale is None or layer.w2_input_scale is None:
+                    raise ValueError(
+                        "QuantConfig has static quantization, but found "
+                        "activation scales are None."
+                    )
+                if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+                    layer.w2_input_scale
+                ):
+                    print_warning_once(
+                        "Found input_scales that are not equal for "
+                        "fp8 MoE layer. Using the maximum across experts "
+                        "for each layer. "
+                    )
+                layer.w13_input_scale = torch.nn.Parameter(
+                    layer.w13_input_scale.max(), requires_grad=False
+                )
+                layer.w2_input_scale = torch.nn.Parameter(
+                    layer.w2_input_scale.max(), requires_grad=False
+                )
+
+            # If ROCm, normalize the weights and scales to e4m3fnuz
+            if _is_fp8_fnuz:
+                # Normalize the weights and scales
+                w13_weight, w13_weight_scale, w13_input_scale = (
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+                    )
+                )
+                w2_weight, w2_weight_scale, w2_input_scale = (
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+                    )
+                )
+                # Reset the parameter
+                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+                layer.w13_weight_scale = torch.nn.Parameter(
+                    w13_weight_scale, requires_grad=False
+                )
+                if w13_input_scale is not None:
+                    layer.w13_input_scale = torch.nn.Parameter(
+                        w13_input_scale, requires_grad=False
+                    )
+                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+                layer.w2_weight_scale = torch.nn.Parameter(
+                    w2_weight_scale, requires_grad=False
+                )
+                if w2_input_scale is not None:
+                    layer.w2_input_scale = torch.nn.Parameter(
+                        w2_input_scale, requires_grad=False
+                    )
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max then dequant and requant each expert.
+            assert layer.w13_weight_scale is not None
+            shard_size = layer.intermediate_size_per_partition
+            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+            for expert_id in range(layer.num_local_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
+                    )
+                    (
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        _,
+                    ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    start += shard_size
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )
+
+            if _is_hip:
+                self.process_weights_hip_scale_padding(layer)
+            return
+
+    def process_weights_hip_int4(self, layer: Module):
+        # TODO: _use_aiter: add after triton kernel added
+        # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
+        # Weight Permutation
+        layer.w13_weight = torch.nn.Parameter(
+            shuffle_weight(layer.w13_weight.data, (16, 16)),
+            requires_grad=False,
+        )
+        torch.cuda.empty_cache()
+        layer.w2_weight = torch.nn.Parameter(
+            shuffle_weight(layer.w2_weight.data, (16, 16)),
+            requires_grad=False,
+        )
+        torch.cuda.empty_cache()
+
+        # INT4-FP8 : offset INT4 w13_weight_scale1 to single w13_weight_scale
+        # Fp8 moe kernel needs single fp8 w13_weight_scale for w13 per expert.
+        # We won't do requant each expert's fp8 weight (not direct available),
+        # instead we adjust half of INT4 w13_weight_scale1 numbers
+        assert layer.w13_weight_scale is not None
+        shard_size = layer.intermediate_size_per_partition
+        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+        for expert_id in range(layer.num_local_experts):
+            start = 0
+            max_w13_scale_fp8 = max_w13_scales[expert_id]
+            for shard_id in range(2):
+                if layer.w13_weight_scale[expert_id][shard_id] != max_w13_scale_fp8:
+                    int4_rescale = (
+                        layer.w13_weight_scale[expert_id][shard_id] / max_w13_scale_fp8
+                    )
+                    layer.w13_weight_scale1[expert_id][
+                        start : start + shard_size
+                    ] *= int4_rescale
+                start += shard_size
+
+        layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False)
+
+        # special hack to asm_moe, which takes (weight_scale1 * weight_scale) as post GEMM scaling
+        # optimal design - shall apply per-column weight_scale1 before GEMM, and weight_scale post
+        for expert_id in range(layer.num_local_experts):
+            layer.w13_weight_scale1[expert_id] *= max_w13_scales[expert_id]
+            layer.w2_weight_scale1[expert_id] *= layer.w2_weight_scale[expert_id]
+
+    def process_weights_hip_scale_padding(self, layer: Module):
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+            padding_size,  # Avoid circular import
+        )
+
+        if _use_aiter:
+            layer.w13_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+            layer.w2_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+
+            # ROCm (_use_aiter): using column-wise scaling
+            layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
+            layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
+        elif get_bool_env_var("SGLANG_MOE_PADDING"):
+            # If ROCm, apply weight padding (min. Mem channel contention) only if set
+            layer.w13_weight = torch.nn.Parameter(
+                F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+            layer.w2_weight = torch.nn.Parameter(
+                F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: DispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        moe_runner_config = self.moe_runner_config
+
+        if use_intel_amx_backend(layer):
+            from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
+
+            topk_weights, topk_ids, _ = topk_output
+            x, topk_weights = apply_topk_weights_cpu(
+                moe_runner_config.apply_router_weight_on_input, topk_weights, x
+            )
+
+            output = torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace See [Note] inplace should be False in fused_experts.
+                False,  # use_int8_w8a8
+                True,  # use_fp8_w8a16
+                layer.w13_weight_scale_inv,  # w1_scale
+                layer.w2_weight_scale_inv,  # w2_scale
+                self.quant_config.weight_block_size,  # block_size
+                None,  # a1_scale
+                None,  # a2_scale
+                True,  # is_vnni
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        if _is_hip:
+            ret = self.maybe_apply_hip_fused_experts(
+                layer,
+                x,
+                topk_output,
+                moe_runner_config.activation,
+                moe_runner_config.no_combine,
+            )
+            if ret is not None:
+                return StandardCombineInput(hidden_states=ret)
+
+        if self.use_cutlass_fused_experts_fp8:
+            from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
+
+            topk_weights, topk_ids, _ = topk_output
+            output = cutlass_fused_experts_fp8(
+                x,
+                layer.w13_weight.transpose(1, 2),
+                layer.w2_weight.transpose(1, 2),
+                layer.w13_weight_scale_inv.transpose(1, 2),
+                layer.w2_weight_scale_inv.transpose(1, 2),
+                topk_weights,
+                topk_ids,
+                self.ab_strides1,
+                self.c_strides1,
+                self.ab_strides2,
+                self.c_strides2,
+                self.workspace,
+                self.a_ptr,
+                self.b_ptr,
+                self.out_ptr,
+                self.a_scales_ptr,
+                self.b_scales_ptr,
+                self.expert_offsets,
+                self.problem_sizes1,
+                self.problem_sizes2,
+                use_fp8_blockscale=True,
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_fp8_w8a8=True,
+            w13_scale=(
+                layer.w13_weight_scale_inv
+                if self.block_quant
+                else layer.w13_weight_scale
+            ),
+            w2_scale=(
+                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
+            ),
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            block_shape=self.quant_config.weight_block_size,
+        )
+        return self.runner.run(dispatch_output, quant_info)
+
+    def apply_with_router_logits(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> torch.Tensor:
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        activation = self.moe_runner_config.activation
+        routed_scaling_factor = self.moe_runner_config.routed_scaling_factor
+
+        from flashinfer.fused_moe import trtllm_fp8_block_scale_moe
+
+        from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+        assert TopKOutputChecker.format_is_bypassed(topk_output)
+        router_logits = topk_output.router_logits
+        topk_config = topk_output.topk_config
+        assert (
+            activation == "silu"
+        ), "Only silu is supported for flashinfer blockscale fp8 moe"
+        a_q, a_sf = per_token_group_quant_fp8(x, self.quant_config.weight_block_size[1])
+        # NOTE: scales of hidden states have to be transposed!
+        a_sf_t = a_sf.t().contiguous()
+
+        assert (
+            topk_config.num_expert_group is not None
+            and topk_config.topk_group is not None
+        ), "Current trtllm_fp8_block_scale_moe kernel does not support these two arguments as None"
+
+        correction_bias = (
+            None
+            if topk_config.correction_bias is None
+            else topk_config.correction_bias.to(x.dtype)
+        )
+
+        return trtllm_fp8_block_scale_moe(
+            routing_logits=router_logits.to(torch.float32),
+            routing_bias=correction_bias,
+            hidden_states=a_q,
+            hidden_states_scale=a_sf_t,
+            gemm1_weights=layer.w13_weight,
+            gemm1_weights_scale=layer.w13_weight_scale_inv,
+            gemm2_weights=layer.w2_weight,
+            gemm2_weights_scale=layer.w2_weight_scale_inv,
+            num_experts=layer.num_experts,
+            top_k=topk_config.top_k,
+            n_group=topk_config.num_expert_group,
+            topk_group=topk_config.topk_group,
+            intermediate_size=layer.w2_weight.shape[2],
+            local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+            local_num_experts=layer.num_local_experts,
+            routed_scaling_factor=(
+                routed_scaling_factor if routed_scaling_factor is not None else 1.0
+            ),
+            tile_tokens_dim=get_tile_tokens_dim(
+                x.shape[0], topk_config.top_k, layer.num_experts
+            ),
+            routing_method_type=2,  # DeepSeek-styled routing method
+            use_shuffled_weight=False,
+        )
+
+    def maybe_apply_hip_fused_experts(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        topk_output: TopKOutput,
+        activation: str = "silu",
+        no_combine: bool = False,
+    ) -> Optional[torch.Tensor]:
+        topk_weights, topk_ids, _ = topk_output
+        if _use_hip_int4:
+            # TODO: add triton kernel and add check _use_aiter
+            assert not no_combine, f"{no_combine=} is not supported."
+            return fused_moe(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                quant_type=QuantType.per_Token,
+                w1_scale=layer.w13_weight_scale1,
+                w2_scale=layer.w2_weight_scale1,
+                activation=(
+                    ActivationType.Silu if activation == "silu" else ActivationType.Gelu
+                ),
+            )
+
+        if _use_aiter:
+            assert not no_combine, f"{no_combine=} is not supported."
+            if self.block_quant:
+                return fused_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    w1_scale=layer.w13_weight_scale_inv,
+                    w2_scale=layer.w2_weight_scale_inv,
+                    quant_type=QuantType.per_128x128,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
+                    expert_mask=None,
+                )
+            else:
+                return fused_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    quant_type=QuantType.per_Token,
+                    w1_scale=layer.w13_weight_scale1,
+                    w2_scale=layer.w2_weight_scale1,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
+                )
+        return None
+
+
+class Fp8KVCacheMethod(BaseKVCacheMethod):
+    """
+    Supports loading kv-cache scaling factors from FP8 checkpoints.
+    """
+
+    def __init__(self, quant_config: Fp8Config):
+        super().__init__(quant_config)
diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py
new file mode 100644
index 000000000..f0512365b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -0,0 +1,1806 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import functools
+import json
+import logging
+import os
+from functools import lru_cache
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.utils import (
+    align,
+    direct_register_custom_op,
+    get_bool_env_var,
+    get_device_core_count,
+    get_device_name,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    log_info_on_rank0,
+    supports_custom_op,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_cpu = is_cpu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _is_cuda:
+    from sgl_kernel import (
+        sgl_per_tensor_quant_fp8,
+        sgl_per_token_group_quant_fp8,
+        sgl_per_token_quant_fp8,
+    )
+
+if _is_hip:
+    if _use_aiter:
+        try:
+            from aiter import (  # v0.1.3
+                dynamic_per_tensor_quant,
+                dynamic_per_token_scaled_quant,
+                static_per_tensor_quant,
+            )
+        except ImportError:
+            raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+    else:
+        try:
+            import vllm._C
+        except ImportError:
+            raise ImportError("vllm is required when SGLANG_USE_AITER is set to False")
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache()
+def is_fp8_fnuz() -> bool:
+    if _is_hip:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
+    return False
+
+
+if is_fp8_fnuz():
+    fp8_dtype = torch.float8_e4m3fnuz
+    fp8_max = 224.0
+else:
+    fp8_dtype = torch.float8_e4m3fn
+    fp8_max = torch.finfo(fp8_dtype).max
+fp8_min = -fp8_max
+
+if supports_custom_op():
+
+    def deep_gemm_fp8_fp8_bf16_nt(
+        A: torch.Tensor,
+        As: torch.Tensor,
+        B: torch.Tensor,
+        Bs: torch.Tensor,
+        C: torch.Tensor,
+    ) -> None:
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+
+    def deep_gemm_fp8_fp8_bf16_nt_fake(
+        A: torch.Tensor,
+        As: torch.Tensor,
+        B: torch.Tensor,
+        Bs: torch.Tensor,
+        C: torch.Tensor,
+    ) -> None:
+        return
+
+    direct_register_custom_op(
+        op_name="deep_gemm_fp8_fp8_bf16_nt",
+        op_func=deep_gemm_fp8_fp8_bf16_nt,
+        mutates_args=["C"],
+        fake_impl=deep_gemm_fp8_fp8_bf16_nt_fake,
+    )
+
+
+@triton.jit
+def _per_token_group_quant_8bit(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    # Stride of input
+    y_stride,
+    # Columns of input
+    N,
+    # Avoid to divide zero
+    eps,
+    # Information for float8
+    bit8_min,
+    bit8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group quantization on a
+    tensor.
+
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    y_s_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / bit8_max
+    y_s_inv = 1.0 / y_s
+    y_q = tl.clamp(y * y_s_inv, bit8_min, bit8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+@triton.jit
+def _per_token_group_quant_8bit_colmajor(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    group_size,
+    # Num columns of y
+    y_num_columns,
+    # Stride from one column to the next of y_s
+    y_s_col_stride,
+    # Avoid to divide zero
+    eps,
+    # Information for float8
+    bit8_min,
+    bit8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+    SCALE_UE8M0: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group
+    quantization on a tensor.
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id.to(tl.int64) * group_size
+    y_q_ptr += g_id.to(tl.int64) * group_size
+
+    # Convert g_id the flattened block coordinate to 2D so we can index
+    # into the output y_scales matrix
+    blocks_per_row = y_num_columns // group_size
+    scale_col = g_id % blocks_per_row
+    scale_row = g_id // blocks_per_row
+    y_s_ptr += scale_col * y_s_col_stride + scale_row
+
+    cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
+    mask = cols < group_size
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / bit8_max
+    if SCALE_UE8M0:
+        y_s = tl.exp2(tl.ceil(tl.log2(tl.abs(y_s))))
+    y_q = tl.clamp(y / y_s, bit8_min, bit8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+def _per_token_group_quant_8bit_raw(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = fp8_dtype,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform per-token-group quantization on an input tensor `x`.
+
+    It converts the tensor values into signed float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+
+    Args:
+        x: The input tenosr with ndim >= 2.
+        group_size: The group size used for quantization.
+        eps: The minimum to avoid dividing zero.
+        dtype: The dype of output tensor.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    if _is_hip:
+        if dtype == torch.int8:
+            bit8_max = 127.0
+        else:
+            bit8_max = 224.0
+        bit8_min = -bit8_max  # TODO incorrect for int8
+    else:
+        if dtype == torch.int8:
+            info = torch.iinfo(dtype)
+        else:
+            info = torch.finfo(dtype)
+        bit8_max = info.max
+        bit8_min = info.min
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = create_per_token_group_quant_fp8_output_scale(
+        x_shape=x.shape,
+        device=x.device,
+        group_size=group_size,
+        column_major_scales=column_major_scales,
+        scale_tma_aligned=scale_tma_aligned,
+        scale_ue8m0=False,
+    )
+
+    M = x.numel() // group_size
+    N = group_size
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    if column_major_scales:
+        _per_token_group_quant_8bit_colmajor[(M,)](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            x.shape[1],
+            x_s.stride(1),
+            eps,
+            bit8_min=bit8_min,
+            bit8_max=bit8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+            SCALE_UE8M0=scale_ue8m0,
+        )
+    else:
+        assert not scale_ue8m0
+        _per_token_group_quant_8bit[(M,)](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            N,
+            eps,
+            bit8_min=bit8_min,
+            bit8_max=bit8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+
+    if scale_ue8m0:
+        from deep_gemm import transform_sf_into_required_layout
+
+        assert group_size == 128
+        x_s = transform_sf_into_required_layout(
+            x_s,
+            num_groups=None,
+            mn=x_q.shape[0],
+            k=x_q.shape[1],
+            recipe=(1, group_size, group_size),
+            is_sfa=True,
+        )
+
+    return x_q, x_s
+
+
+# backward compatibility
+per_token_group_quant_fp8 = _per_token_group_quant_8bit_raw
+
+
+def _per_token_group_quant_8bit_fuse_silu_and_mul(
+    x: torch.Tensor,
+    group_size: int,
+    dst_dtype: torch.dtype,
+    column_major_scales: bool,
+    scale_tma_aligned: bool,
+    scale_ue8m0: bool,
+    masked_m: Optional[torch.Tensor],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Another way to implement (can be used in e.g. comparison tests)
+    # from sgl_kernel import silu_and_mul
+    # x_after_silu_and_mul = silu_and_mul(x)
+    # return per_token_group_quant_fp8(
+    #     x_after_silu_and_mul,
+    #     group_size=group_size,
+    #     eps=eps,
+    #     column_major_scales=column_major_scales,
+    #     scale_tma_aligned=scale_tma_aligned,
+    #     scale_ue8m0=scale_ue8m0,
+    # )
+
+    from deep_gemm import transform_sf_into_required_layout
+
+    from sglang.srt.layers.moe.ep_moe.kernels import silu_and_mul_masked_post_quant_fwd
+
+    assert column_major_scales
+    assert scale_tma_aligned
+    assert scale_ue8m0
+
+    needs_unsqueeze = x.dim() == 2
+    if needs_unsqueeze:
+        num_tokens, _ = x.shape
+        x = x.unsqueeze(0)
+        assert masked_m is None
+        masked_m = torch.tensor([num_tokens], device=x.device, dtype=torch.int32)
+
+    # Use `zeros` for easier testing
+    output = torch.zeros(
+        (*x.shape[:-1], x.shape[-1] // 2),
+        device=x.device,
+        dtype=dst_dtype,
+    )
+    # Use `zeros` for easier testing
+    output_scale_for_kernel = torch.zeros(
+        (*x.shape[:-1], x.shape[-1] // 2 // group_size),
+        device=x.device,
+        dtype=torch.float32,
+    )
+    silu_and_mul_masked_post_quant_fwd(
+        input=x,
+        output=output,
+        output_scale=output_scale_for_kernel,
+        quant_group_size=group_size,
+        masked_m=masked_m,
+        scale_ue8m0=scale_ue8m0,
+    )
+
+    assert group_size == 128
+    output_scale = transform_sf_into_required_layout(
+        output_scale_for_kernel,
+        num_groups=output.shape[0],
+        mn=output.shape[-2],
+        k=output.shape[-1],
+        recipe=(1, group_size, group_size),
+        is_sfa=True,
+    )
+
+    if needs_unsqueeze:
+        output = output.squeeze(0)
+        output_scale = output_scale.squeeze(0)
+
+    return output, output_scale
+
+
+def per_token_group_quant_8bit(
+    x: torch.Tensor,
+    group_size: int,
+    dst_dtype: torch.dtype,
+    eps: float = 1e-10,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+    fuse_silu_and_mul: bool = False,
+    masked_m: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if fuse_silu_and_mul:
+        return _per_token_group_quant_8bit_fuse_silu_and_mul(
+            x=x,
+            group_size=group_size,
+            dst_dtype=dst_dtype,
+            column_major_scales=column_major_scales,
+            scale_tma_aligned=scale_tma_aligned,
+            scale_ue8m0=scale_ue8m0,
+            masked_m=masked_m,
+        )
+    else:
+        return _per_token_group_quant_8bit_raw(
+            x=x,
+            group_size=group_size,
+            eps=eps,
+            column_major_scales=column_major_scales,
+            scale_tma_aligned=scale_tma_aligned,
+            scale_ue8m0=scale_ue8m0,
+            dtype=dst_dtype,
+        )
+
+
+def create_per_token_group_quant_fp8_output_scale(
+    x_shape,
+    device,
+    group_size,
+    column_major_scales: bool,
+    scale_tma_aligned: bool,
+    scale_ue8m0: bool,
+):
+    if scale_ue8m0:
+        assert column_major_scales and scale_tma_aligned
+        *x_batch, x_q_mn, x_q_k = x_shape
+        x_s_mn, x_s_k = x_q_mn, x_q_k // 128
+        aligned_mn = align(x_s_mn, 4)
+        aligned_k = align(x_s_k, 4)
+        # TODO(FIXME): Fix cuda kernel and recover here to empty.
+        return torch.empty(
+            (*x_batch, aligned_k // 4, aligned_mn),
+            device=device,
+            dtype=torch.int,
+        ).transpose(-1, -2)[..., :x_s_mn, :]
+    elif column_major_scales:
+        if scale_tma_aligned:
+            # TODO extract "align" function
+            # aligned to 4 * sizeof(float)
+            aligned_size = (x_shape[-2] + 3) // 4 * 4
+            return torch.empty(
+                x_shape[:-2] + (x_shape[-1] // group_size, aligned_size),
+                device=device,
+                dtype=torch.float32,
+            ).permute(-1, -2)[: x_shape[-2], :]
+        else:
+            return torch.empty(
+                (x_shape[-1] // group_size,) + x_shape[:-1],
+                device=device,
+                dtype=torch.float32,
+            ).permute(-1, -2)
+    else:
+        return torch.empty(
+            x_shape[:-1] + (x_shape[-1] // group_size,),
+            device=device,
+            dtype=torch.float32,
+        )
+
+
+def sglang_per_token_group_quant_fp8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+    fuse_silu_and_mul: bool = False,
+    masked_m: Optional[torch.Tensor] = None,
+):
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    out_shape = (*x.shape[:-1], x.shape[-1] // (2 if fuse_silu_and_mul else 1))
+
+    x_q = torch.empty(out_shape, device=x.device, dtype=fp8_dtype)
+    x_s = create_per_token_group_quant_fp8_output_scale(
+        x_shape=out_shape,
+        device=x.device,
+        group_size=group_size,
+        column_major_scales=column_major_scales,
+        scale_tma_aligned=scale_tma_aligned,
+        scale_ue8m0=scale_ue8m0,
+    )
+
+    if x.shape[0] > 0:
+        sgl_per_token_group_quant_fp8(
+            x, x_q, x_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+        )
+
+    return x_q, x_s
+
+
+# TODO maybe unify int8 and fp8 code later
+def sglang_per_token_group_quant_8bit(
+    x: torch.Tensor,
+    group_size: int,
+    dst_dtype: torch.dtype,
+    eps: float = 1e-10,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+    fuse_silu_and_mul: bool = False,
+    masked_m: Optional[torch.Tensor] = None,
+):
+    from sglang.srt.layers.quantization.int8_kernel import (
+        sglang_per_token_group_quant_int8,
+    )
+
+    if dst_dtype == torch.int8:
+        assert not column_major_scales
+        assert not scale_tma_aligned
+        assert not fuse_silu_and_mul
+        assert masked_m is None
+        return sglang_per_token_group_quant_int8(
+            x=x,
+            group_size=group_size,
+            eps=eps,
+            dtype=dst_dtype,
+        )
+
+    return sglang_per_token_group_quant_fp8(
+        x=x,
+        group_size=group_size,
+        eps=eps,
+        column_major_scales=column_major_scales,
+        scale_tma_aligned=scale_tma_aligned,
+        scale_ue8m0=scale_ue8m0,
+        fuse_silu_and_mul=fuse_silu_and_mul,
+        masked_m=masked_m,
+    )
+
+
+def sglang_per_token_quant_fp8(
+    x: torch.Tensor,
+    dtype: torch.dtype = fp8_dtype,
+):
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        x.shape[0],
+        1,
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    sgl_per_token_quant_fp8(x, x_q, x_s)
+
+    return x_q, x_s
+
+
+@triton.jit
+def _static_quant_fp8(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    y_s_repeat_ptr,
+    # Stride of input
+    y_stride,
+    # Columns of input
+    N,
+    # Information for float8
+    fp8_min,
+    fp8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+    REPEAT_SCALE: tl.constexpr,
+):
+    """A Triton-accelerated function to perform quantization using the given scale on a
+    tensor
+
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    if REPEAT_SCALE:
+        y_s_repeat_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    y_s = tl.load(y_s_ptr).to(tl.float32)
+    y_s_inv = 1.0 / y_s
+    y_q = tl.clamp(y * y_s_inv, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    if REPEAT_SCALE:
+        tl.store(y_s_repeat_ptr, y_s)
+
+
+def static_quant_fp8(
+    x: torch.Tensor,
+    x_s: torch.Tensor,
+    repeat_scale: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform static quantization using the given scale on an input tensor `x`.
+
+    It converts the tensor values into signed float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+
+    Args:
+        x: The input tenosr with ndim >= 2.
+        x_s: The quantization scale.
+        repeat_scale: Whether to broadcast per-tensor scale to per-channel scale.
+        dtype: The dype of output tensor.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
+    """
+    assert x.is_contiguous(), "`x` is not contiguous"
+    assert x_s.numel() == 1, "only supports per-tensor scale"
+
+    x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
+    M = x.numel() // x.shape[-1]
+    N = x.shape[-1]
+    if repeat_scale:
+        x_s_repeat = torch.empty(
+            (M, 1),
+            device=x.device,
+            dtype=torch.float32,
+        )
+    else:
+        x_s_repeat = None
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    _static_quant_fp8[(M,)](
+        x,
+        x_q,
+        x_s,
+        x_s_repeat,
+        N,
+        N,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        BLOCK=BLOCK,
+        REPEAT_SCALE=repeat_scale,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    x_s = x_s_repeat if repeat_scale else x_s
+    return x_q, x_s
+
+
+@triton.jit
+def _w8a8_block_fp8_matmul(
+    # Pointers to inputs and output
+    A,
+    B,
+    C,
+    As,
+    Bs,
+    # Shape for matmul
+    M,
+    N,
+    K,
+    # Block size for block-wise quantization
+    group_n,
+    group_k,
+    # Stride for inputs and output
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_As_m,
+    stride_As_k,
+    stride_Bs_k,
+    stride_Bs_n,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Triton-accelerated function used to perform linear operations (dot
+    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
+    tensor `C`.
+    """
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    As_ptrs = As + offs_am * stride_As_m
+    offs_bsn = offs_bn // group_n
+    Bs_ptrs = Bs + offs_bsn * stride_Bs_n
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+
+        k_start = k * BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if C.dtype.element_ty == tl.bfloat16:
+        c = accumulator.to(tl.bfloat16)
+    elif C.dtype.element_ty == tl.float16:
+        c = accumulator.to(tl.float16)
+    else:
+        c = accumulator.to(tl.float32)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+@triton.jit
+def _w8a8_block_fp8_matmul_unrolledx4(
+    # Pointers to inputs and output
+    A,
+    B,
+    C,
+    As,
+    Bs,
+    # Shape for matmul
+    M,
+    N,
+    K,
+    # Block size for block-wise quantization
+    group_n,
+    group_k,
+    # Stride for inputs and output
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_As_m,
+    stride_As_k,
+    stride_Bs_k,
+    stride_Bs_n,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Triton-accelerated function used to perform linear operations (dot
+    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
+    tensor `C`.
+    """
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    As_ptrs = As + offs_am * stride_As_m
+    offs_bsn = offs_bn // group_n
+    Bs_ptrs = Bs + offs_bsn * stride_Bs_n
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    # manually unroll to 4 iterations
+    UNROLL_FACTOR = 4
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * UNROLL_FACTOR)):
+        # 1st iteration
+        a = tl.load(
+            a_ptrs,
+            mask=offs_k[None, :] < K - (k * UNROLL_FACTOR) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=offs_k[:, None] < K - (k * UNROLL_FACTOR) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+
+        k_start = (k * UNROLL_FACTOR) * BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+        # 2nd iteration
+        a = tl.load(
+            a_ptrs,
+            mask=offs_k[None, :] < K - (k * UNROLL_FACTOR + 1) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=offs_k[:, None] < K - (k * UNROLL_FACTOR + 1) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+
+        k_start = k_start + BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+        # 3rd iteration
+        a = tl.load(
+            a_ptrs,
+            mask=offs_k[None, :] < K - (k * UNROLL_FACTOR + 2) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=offs_k[:, None] < K - (k * UNROLL_FACTOR + 2) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+
+        k_start = k_start + BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+        # 4th iteration
+        a = tl.load(
+            a_ptrs,
+            mask=offs_k[None, :] < K - (k * UNROLL_FACTOR + 3) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=offs_k[:, None] < K - (k * UNROLL_FACTOR + 3) * BLOCK_SIZE_K,
+            other=0.0,
+        )
+
+        k_start = k_start + BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if C.dtype.element_ty == tl.bfloat16:
+        c = accumulator.to(tl.bfloat16)
+    elif C.dtype.element_ty == tl.float16:
+        c = accumulator.to(tl.float16)
+    else:
+        c = accumulator.to(tl.float32)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+@functools.lru_cache
+def get_w8a8_block_fp8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the w8a8 block fp8 kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    device_name = get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json"
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
+            )
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s"
+        ),
+        config_file_path,
+    )
+    return None
+
+
+def select_w8a8_block_fp8_matmul_kernel(M, N, META):
+    return _w8a8_block_fp8_matmul
+
+
+if _is_hip:
+
+    def use_w8a8_block_fp8_matmul_unrolledx4(M, N, META):
+        # Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
+        # Empirical testing shows the sweet spot lies when it's less than the # of
+        # compute units available on the device.
+        num_workgroups = triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(
+            N, META["BLOCK_SIZE_N"]
+        )
+        num_workgroups <= get_device_core_count()
+
+    def select_w8a8_block_fp8_matmul_kernel(M, N, META):
+        if use_w8a8_block_fp8_matmul_unrolledx4(M, N, META):
+            return _w8a8_block_fp8_matmul_unrolledx4
+        else:
+            return _w8a8_block_fp8_matmul
+
+
+def prepare_block_fp8_matmul_inputs(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> Tuple[int, int, int]:
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+    assert A.is_contiguous()
+
+    if As.dtype == torch.float:
+        assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    elif As.dtype == torch.int:
+        assert (
+            triton.cdiv(triton.cdiv(A.shape[-1], block_k), 4) == As.shape[-1]
+        ), f"{A.shape=} {As.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2
+    assert B.is_contiguous()
+    assert Bs.ndim == 2
+    N, K = B.shape
+
+    if Bs.dtype == torch.float:
+        assert triton.cdiv(N, block_n) == Bs.shape[0]
+        assert triton.cdiv(K, block_k) == Bs.shape[1]
+    elif Bs.dtype == torch.int:
+        assert N == Bs.shape[0], f"{B.shape=} {Bs.shape=} {block_size=}"
+        assert (
+            triton.cdiv(triton.cdiv(K, block_k), 4) == Bs.shape[1]
+        ), f"{B.shape=} {Bs.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    return M, N, K, C
+
+
+def w8a8_block_fp8_matmul_deepgemm(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+    # Deepgemm only supports output tensor type as bfloat16
+    assert C.dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+
+    if supports_custom_op():
+        torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
+    else:
+        deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+
+    return C
+
+
+def w8a8_block_fp8_matmul_triton(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with block-wise quantization.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128].
+        output_dytpe: The dtype of the returned tensor.
+
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+    block_n, block_k = block_size
+
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    else:
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
+
+    kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
+
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
+
+    return C
+
+
+# universal entry point, for testing purposes
+def w8a8_block_fp8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    if output_dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return w8a8_block_fp8_matmul_deepgemm(
+            A, B, As, Bs, block_size, output_dtype=output_dtype
+        )
+
+    return w8a8_block_fp8_matmul_triton(
+        A, B, As, Bs, block_size, output_dtype=output_dtype
+    )
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage1(
+    x_ptr,
+    x_s_ptr,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    eps,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    _absmax = tl.maximum(tl.max(tl.abs(x)), eps)
+
+    tl.atomic_max(x_s_ptr, _absmax / fp8_max)
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage2(
+    x_ptr,
+    x_s_ptr,
+    x_q_ptr,
+    num_seq,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    fp8_min,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_s = tl.load(x_s_ptr)
+    x_s_inv = 1.0 / x_s
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x_q_ptr += head_id * num_seq * head_size + seq_id * head_size
+
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    x_q = tl.clamp(x * x_s_inv, fp8_min, fp8_max).to(x_q_ptr.dtype.element_ty)
+    tl.store(x_q_ptr + offset, x_q, mask=mask)
+
+
+def per_tensor_quant_mla_fp8(
+    x: torch.Tensor, x_s_out: torch.Tensor, eps: float = 1e-12
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This function quantizes input values to float8 values with tensor-wise quantization
+    and specialized for mla absorbed case.
+    """
+    assert x.dim() == 3, "`x` is not a 3d-tensor"
+    assert (
+        x_s_out.shape == (1,)
+        and x_s_out.dtype == torch.float32
+        and x_s_out.device == x.device
+    )
+
+    x_q = x.new_empty(x.size(), dtype=fp8_dtype)
+
+    num_head, num_seq, head_size = x.shape
+    BLOCK_SIZE = triton.next_power_of_2(head_size)
+    grid = (num_seq, num_head)
+
+    _per_tensor_quant_mla_fp8_stage1[grid](
+        x,
+        x_s_out,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        eps,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+    _per_tensor_quant_mla_fp8_stage2[grid](
+        x,
+        x_s_out,
+        x_q,
+        num_seq,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        fp8_min,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+
+    return x_q, x_s_out
+
+
+@triton.jit
+def _per_token_group_quant_mla_deep_gemm_masked_fp8(
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    masked_m_ptr,
+    group_size,
+    y_stride_b,
+    y_stride_t,
+    y_q_stride_b,
+    y_q_stride_t,
+    y_s_stride_b,
+    y_s_stride_g,
+    eps,
+    fp8_min,
+    fp8_max,
+    NUM_GROUP: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group
+    quantization on a tensor for deep_gemm grouped_gemm_masked.
+    This function converts the tensor values into float8 values.
+    y and y_q: (b, t, k)
+    y_s: (b, k//group_size, t)
+    """
+    t_id = tl.program_id(0)
+    b_id = tl.program_id(1)
+
+    y_ptr += b_id * y_stride_b + t_id * y_stride_t
+    y_q_ptr += b_id * y_q_stride_b + t_id * y_q_stride_t
+    y_s_ptr += b_id * y_s_stride_b + t_id
+
+    if t_id == 0:
+        tl.store(masked_m_ptr + b_id, tl.num_programs(0))
+
+    cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
+    mask = cols < group_size
+
+    for gid in range(NUM_GROUP):
+        y = tl.load(y_ptr + gid * group_size + cols, mask=mask, other=0.0).to(
+            tl.float32
+        )
+        _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+        y_s = _absmax / fp8_max
+        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+        tl.store(y_q_ptr + gid * group_size + cols, y_q, mask=mask)
+        tl.store(y_s_ptr + gid * y_s_stride_g, y_s)
+
+
+def per_token_group_quant_mla_deep_gemm_masked_fp8(
+    x: torch.Tensor,
+    group_size: int = 128,
+    eps: float = 1e-12,
+    dtype: torch.dtype = fp8_dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This function quantizes input values to float8 values with per-token-group-quantization
+    for deep_gemm grouped_gemm_masked and specialized for mla absorbed case.
+    """
+    assert x.dim() == 3, "`x` is not a 3d-tensor"
+
+    b, m, k = x.shape
+    aligned_m = (m + 255) // 256 * 256  # 256 is the max block_m of the gemm kernel
+    num_tiles_k = k // group_size
+    assert num_tiles_k * group_size == k, f"k % {group_size} must be zero"
+
+    x_q = x.new_empty((b, aligned_m, k), dtype=dtype)
+    x_s = x.new_empty((b, num_tiles_k, aligned_m), dtype=torch.float32)
+    masked_m = x.new_empty((b,), dtype=torch.int32)
+
+    BLOCK_SIZE = triton.next_power_of_2(group_size)
+    grid = (m, b)
+
+    _per_token_group_quant_mla_deep_gemm_masked_fp8[grid](
+        x,
+        x_q,
+        x_s,
+        masked_m,
+        group_size,
+        x.stride(0),
+        x.stride(1),
+        x_q.stride(0),
+        x_q.stride(1),
+        x_s.stride(0),
+        x_s.stride(1),
+        eps,
+        -fp8_max,
+        fp8_max,
+        num_tiles_k,
+        BLOCK_SIZE,
+    )
+
+    return x_q, x_s.transpose(1, 2), masked_m, m, aligned_m
+
+
+"""
+Quantize input tensor to FP8 (8-bit floating point) format.
+
+Args:
+    input (torch.Tensor): Input tensor to be quantized
+    scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
+        If None, scales will be computed dynamically.
+    num_token_padding (Optional[int]): If specified, pad the first dimension
+        of the output to at least this value.
+    use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
+        determines the quantization granularity:
+        - True: compute scale per token
+        - False: compute single scale per tensor
+
+Returns:
+    Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+        - quantized_tensor: The FP8 quantized version of input
+        - scale_tensor: The scaling factors used for quantization
+
+Raises:
+    AssertionError: If input is not 2D or if static scale's numel != 1
+"""
+if _is_hip:
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
+        output = torch.empty(shape, device=input.device, dtype=fp8_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                if _use_aiter:
+                    dynamic_per_token_scaled_quant(output, input, scale)
+                else:
+                    torch.ops._C.dynamic_per_token_scaled_fp8_quant(
+                        output, input.contiguous(), scale, None
+                    )
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                if _use_aiter:
+                    dynamic_per_tensor_quant(output, input, scale)
+                else:
+                    torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            if _use_aiter:
+                static_per_tensor_quant(output, input, scale)
+            else:
+                torch.ops._C.static_scaled_fp8_quant(output, input, scale)
+
+        return output, scale
+
+else:
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
+        output = torch.empty(shape, device=input.device, dtype=fp8_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                sgl_per_token_quant_fp8(input, output, scale)
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                sgl_per_tensor_quant_fp8(
+                    input, output, scale, is_static=False
+                )  # False for dynamic
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            sgl_per_tensor_quant_fp8(
+                input, output, scale, is_static=True
+            )  # True for static
+
+        return output, scale
+
+
+fp8_autotune = triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_M": block_m}, num_warps=num_warps)
+        for block_m in [16, 32, 64, 128]
+        for num_warps in [2, 4, 8]
+    ],
+    key=["K", "BLOCK_K", "M_ALIGNMENT"],
+)
+
+
+@triton.jit
+def _per_token_group_quant_fp8_hopper_moe_mn_major(
+    a,  # (M, K):(K, 1)
+    expert_offsets,  # (num_experts,)
+    problem_sizes,  # (num_experts, 3)
+    a_fp8,  # (M, K):(K, 1)
+    sfa,  # (M, k)
+    K: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    M_ALIGNMENT: tl.constexpr,
+    BLOCK_M: tl.constexpr,  # tune
+):
+    k_offset = tl.program_id(0)
+    expert_id = tl.program_id(1)
+
+    m = tl.load(problem_sizes + expert_id * 3)
+    current_expert_offset = tl.load(expert_offsets + expert_id).to(tl.int64)
+    tl.multiple_of(m, M_ALIGNMENT)
+    tl.multiple_of(current_expert_offset, M_ALIGNMENT)
+
+    coord_k = k_offset * BLOCK_K + tl.arange(0, BLOCK_K)
+    for i in tl.range(tl.cdiv(m, BLOCK_M)):
+        coord_m = i * BLOCK_M + tl.arange(0, BLOCK_M)
+        a_ptrs = a + current_expert_offset * K + coord_m[:, None] * K + coord_k[None, :]
+        a_mask = (coord_m < m)[:, None] & (coord_k < K)[None, :]
+
+        inp = tl.load(a_ptrs, mask=a_mask).to(tl.float32)  # [BLOCK_M, BLOCK_K]
+        inp_amax = tl.max(tl.abs(inp), axis=1)  # [BLOCK_M,]
+        inp_amax = tl.clamp(inp_amax, min=1e-4, max=float("inf"))
+        inp_fp8 = (inp * (448.0 / inp_amax[:, None])).to(tl.float8e4nv)
+
+        # Store fp8
+        a_fp8_ptrs = (
+            a_fp8 + current_expert_offset * K + coord_m[:, None] * K + coord_k[None, :]
+        )
+        tl.store(a_fp8_ptrs, inp_fp8, mask=a_mask)
+
+        # Store sfa
+        k = tl.cdiv(K, BLOCK_K)
+        sfa_ptrs = (
+            sfa + current_expert_offset * k + k_offset * m + coord_m
+        )  # MN-Major with sfa
+        tl.store(sfa_ptrs, inp_amax / 448.0, mask=coord_m < m)
+
+
+if not _is_cpu:
+    _per_token_group_quant_fp8_hopper_moe_mn_major = fp8_autotune(
+        _per_token_group_quant_fp8_hopper_moe_mn_major
+    )
+
+
+def per_token_group_quant_fp8_hopper_moe_mn_major(
+    A: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes: torch.Tensor,
+    group_size: int,
+    expert_tokens_alignment: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert A.dim() == 2
+    assert A.is_contiguous(), "`A` is not contiguous"
+    assert (
+        A.shape[-1] % group_size == 0
+    ), "the last dimension of `A` cannot be divisible by `group_size`"
+
+    a_q = torch.empty_like(A, device=A.device, dtype=fp8_dtype)
+    M, K = A.shape[0], A.shape[1]
+    k = K // group_size
+    sfa = torch.empty((M, k), device=A.device, dtype=torch.float32)
+    num_experts = problem_sizes.shape[0]
+    grid = (k, num_experts)
+    _per_token_group_quant_fp8_hopper_moe_mn_major[grid](
+        A,
+        expert_offsets,
+        problem_sizes,
+        a_q,
+        sfa,
+        K,
+        group_size,
+        expert_tokens_alignment,
+    )
+    return a_q, sfa
+
+
+@triton.jit
+def _per_group_transpose(
+    data_ptr: torch.Tensor,
+    trans_data_ptr: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    k: int,
+    M_ALIGNMENT: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    expert_id = tl.program_id(0)
+    m_id = tl.program_id(1)
+    k_id = tl.program_id(2)
+
+    curr_expert_offset = tl.load(expert_offsets + expert_id)
+    next_expert_offset = tl.load(expert_offsets + expert_id + 1)
+    num_tokens_of_expert = next_expert_offset - curr_expert_offset
+    tl.multiple_of(curr_expert_offset, M_ALIGNMENT)
+    tl.multiple_of(next_expert_offset, M_ALIGNMENT)
+
+    data_start_ptr = data_ptr + curr_expert_offset * k
+    trans_data_start_ptr = trans_data_ptr + curr_expert_offset * k
+
+    k_coord = k_id * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+    k_mask = k_coord < k
+    for start_m in tl.range(0, num_tokens_of_expert, BLOCK_SIZE_M * tl.num_programs(1)):
+        m_coord = start_m + m_id * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+        m_mask = m_coord < num_tokens_of_expert
+        off = m_coord[:, None] * k + k_coord[None, :]
+        trans_off = m_coord[:, None] + k_coord[None, :] * num_tokens_of_expert
+        mask = m_mask[:, None] & k_mask[None, :]
+
+        data = tl.load(data_start_ptr + off, mask=mask)
+        tl.store(trans_data_start_ptr + trans_off, data, mask=mask)
+
+
+def per_group_transpose(
+    a: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    M_ALIGNMENT: int = 1,
+) -> torch.Tensor:
+    assert a.dim() == 2
+    assert a.is_contiguous(), "`a` is not contiguous"
+
+    m, k = a.size()
+    trans_a = torch.empty_like(a)
+    num_experts = expert_offsets.size(0) - 1
+
+    grid = lambda META: (
+        num_experts,
+        triton.cdiv((m + num_experts - 1) // num_experts, META["BLOCK_SIZE_M"]),
+        triton.cdiv(k, META["BLOCK_SIZE_K"]),
+    )
+    _per_group_transpose[grid](
+        a, trans_a, expert_offsets, k, M_ALIGNMENT, BLOCK_SIZE_M=16, BLOCK_SIZE_K=8
+    )
+    return trans_a
+
+
+def is_weak_contiguous(x: torch.Tensor):
+    strides = x.stride()
+    sizes = x.shape
+    is_not_transpose = strides[0] == 1 and (strides[1] >= max(1, sizes[0]))
+    is_transpose = strides[1] == 1 and (strides[0] >= max(1, sizes[1]))
+    return is_transpose or is_not_transpose
+
+
+@triton.jit
+def scaled_mm_kernel(
+    a_ptr,
+    b_ptr,
+    scale_a_ptr,
+    scale_b_ptr,
+    c_ptr,
+    bias_ptr,
+    M,
+    N,
+    K,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    ACCUMULATOR_DTYPE: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    BLOCK_SIZE_SCALE_A: tl.constexpr,
+    BLOCK_SIZE_SCALE_B: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    accumulator_dtype = ACCUMULATOR_DTYPE
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype)
+
+    # NOTE: Some tensor inputs are so large, they will cause int32 overflow
+    # so it is necessary to use tl.int64 for all the offsets, else SEGV will
+    # eventually occur.
+
+    # Offsets and masks.
+    offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    masks_am = offsets_am < M
+
+    offsets_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
+    masks_bn = offsets_bn < N
+
+    offsets_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int64)
+    offsets_a = stride_am * offsets_am[:, None] + stride_ak * offsets_k[None, :]
+    offsets_b = stride_bk * offsets_k[:, None] + stride_bn * offsets_bn[None, :]
+
+    # NOTE: BLOCK_SIZE_SCALE_A could be 1 or BLOCK_SIZE_M, so need to create
+    # appropriate offsets and masks for each case. Same goes for
+    # BLOCK_SIZE_SCALE_B.
+    offsets_scale_am = (
+        tl.arange(0, BLOCK_SIZE_SCALE_A)
+        + (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M
+    )
+    masks_scale_am = offsets_scale_am < M
+
+    offsets_scale_bn = (
+        tl.arange(0, BLOCK_SIZE_SCALE_B)
+        + (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N
+    )
+    masks_scale_bn = offsets_scale_bn < N
+
+    a_ptrs = a_ptr + offsets_a
+    b_ptrs = b_ptr + offsets_b
+
+    scale_a_ptrs = scale_a_ptr + offsets_scale_am
+    scale_b_ptrs = scale_b_ptr + offsets_scale_bn
+
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        masks_k = offsets_k < K
+        masks_a = masks_am[:, None] & masks_k[None, :]
+        a = tl.load(a_ptrs, mask=masks_a)
+
+        masks_b = masks_k[:, None] & masks_bn[None, :]
+        b = tl.load(b_ptrs, mask=masks_b)
+
+        # Accumulate results.
+        accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
+
+        offsets_k += BLOCK_SIZE_K
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    # Apply scale at end.
+    masks_scale_a = masks_scale_am[:, None] & (tl.arange(0, 1) < 1)[:, None]
+    scale_a = tl.load(scale_a_ptrs[:, None], masks_scale_a)
+    # Need to broadcast to the appropriate size, if scale_a is already
+    # (BLOCK_SIZE_M, 1) then it will broadcast to its own shape. Same goes
+    # for scale_b below.
+    scale_a = scale_a.broadcast_to((BLOCK_SIZE_M, 1))
+    accumulator = scale_a * accumulator.to(tl.float32)
+
+    masks_scale_b = masks_scale_bn[:, None] & (tl.arange(0, 1) < 1)[None, :]
+    scale_b = tl.load(scale_b_ptrs[:, None], masks_scale_b)
+    scale_b = scale_b.broadcast_to((BLOCK_SIZE_N, 1))
+    accumulator = scale_b.T * accumulator.to(tl.float32)
+
+    # Convert to output format.
+    c = accumulator.to(c_ptr.type.element_ty)
+
+    # Add bias, it's already in output format, so add it after conversion.
+    if bias_ptr:
+        offsets_bias = offsets_bn
+        bias_ptrs = bias_ptr + offsets_bias
+        bias_mask = offsets_bias < N
+        bias = tl.load(bias_ptrs, bias_mask)
+        c += bias
+
+    # Save output
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
+    offs_cm = offs_cm.to(tl.int64)
+    offs_cn = offs_cn.to(tl.int64)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+# input  - [M, K]
+# weight - [K, N]
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+def triton_scaled_mm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: type[torch.dtype],
+    bias: Optional[torch.Tensor] = None,
+    block_size_m: int = 32,
+    block_size_n: int = 32,
+    block_size_k: int = 32,
+    use_heuristic=True,
+) -> torch.Tensor:
+    M, K = input.shape
+    N = weight.shape[1]
+
+    assert N > 0 and K > 0 and M > 0
+    assert weight.shape[0] == K
+    assert input.dtype == weight.dtype
+
+    scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a
+    scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b
+
+    assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point()
+    assert scale_a.shape[1] == 1 and (scale_a.shape[0] == 1 or scale_a.shape[0] == M)
+    assert scale_b.shape[1] == 1 and (scale_b.shape[0] == 1 or scale_b.shape[0] == N)
+    assert out_dtype.is_floating_point
+    assert bias is None or bias.is_floating_point()
+    assert is_weak_contiguous(input)
+    assert is_weak_contiguous(weight)
+
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+    )
+
+    result = torch.empty((M, N), dtype=out_dtype, device=input.device)
+
+    has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1
+
+    if use_heuristic:
+        is_small_N = N < 8192
+        next_power_of_2_M = max(32, triton.next_power_of_2(M))
+        if next_power_of_2_M <= 32:
+            tile_shape = (64, 64, 256) if is_small_N else (64, 128, 256)
+        elif next_power_of_2_M <= 64:
+            tile_shape = (64, 64, 256)
+        elif next_power_of_2_M <= 128:
+            tile_shape = (64, 128, 128)
+        else:
+            tile_shape = (128, 128, 128)
+
+    block_size_m, block_size_n, block_size_k = tile_shape
+
+    block_size_sa = 1 if has_scalar(scale_a) else block_size_m
+    block_size_sb = 1 if has_scalar(scale_b) else block_size_n
+
+    accumulator_dtype = tl.float32 if input.is_floating_point() else tl.int32
+
+    # A = input, B = weight, C = result
+    # A = M x K, B = K x N, C = M x N
+    scaled_mm_kernel[grid](
+        input,
+        weight,
+        scale_a,
+        scale_b,
+        result,
+        bias,
+        M,
+        N,
+        K,
+        input.stride(0),
+        input.stride(1),
+        weight.stride(0),
+        weight.stride(1),
+        result.stride(0),
+        result.stride(1),
+        accumulator_dtype,
+        BLOCK_SIZE_M=block_size_m,
+        BLOCK_SIZE_N=block_size_n,
+        BLOCK_SIZE_K=block_size_k,
+        BLOCK_SIZE_SCALE_A=block_size_sa,
+        BLOCK_SIZE_SCALE_B=block_size_sb,
+    )
+
+    return result.to(out_dtype)
diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
new file mode 100644
index 000000000..998423b86
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -0,0 +1,829 @@
+from typing import Callable, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
+from sglang.srt.layers.quantization.mxfp4_tensor import MXFP4QuantizeUtil
+from sglang.srt.utils import is_sm100_supported
+
+try:
+    from vllm import _custom_ops as ops
+
+    VLLM_AVAILABLE = True
+except ImportError:
+    VLLM_AVAILABLE = False
+
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    fp8_max,
+    is_fp8_fnuz,
+    per_token_group_quant_fp8,
+    scaled_fp8_quant,
+    sglang_per_token_quant_fp8,
+    static_quant_fp8,
+    triton_scaled_mm,
+    w8a8_block_fp8_matmul_deepgemm,
+    w8a8_block_fp8_matmul_triton,
+)
+from sglang.srt.utils import (
+    align,
+    ceil_div,
+    get_bool_env_var,
+    get_cuda_version,
+    get_device_capability,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
+
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+    import aiter
+    from aiter import gemm_a8w8_blockscale, gemm_a8w8_bpreshuffle, get_hip_quant
+
+    aiter_per1x128_quant = get_hip_quant(aiter.QuantType.per_1x128)
+
+if _is_cuda:
+    from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
+
+use_vllm_cutlass_w8a8_fp8_kernel = get_bool_env_var("USE_VLLM_CUTLASS_W8A8_FP8_KERNEL")
+use_triton_w8a8_fp8_kernel = get_bool_env_var("USE_TRITON_W8A8_FP8_KERNEL")
+
+# Input scaling factors are no longer optional in _scaled_mm starting
+# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
+TORCH_DEVICE_IDENTITY = None
+
+
+def use_rowwise_torch_scaled_mm():
+    _TORCH_VERSION = torch.__version__.split("+")[0]
+    try:
+        _TORCH_VERSION_TUPLE = tuple(map(int, _TORCH_VERSION.split(".")[:3]))
+    except ValueError:
+        _TORCH_VERSION_TUPLE = (0, 0, 0)
+    if _is_hip:
+        # The condition to determine if it is on a platform that supports
+        # torch._scaled_mm rowwise feature.
+        # The condition is determined once as the operations
+        # are time consuming.
+        return get_device_capability() >= (9, 4) and _TORCH_VERSION_TUPLE >= (2, 7, 0)
+    return False
+
+
+USE_ROWWISE_TORCH_SCALED_MM = use_rowwise_torch_scaled_mm()
+
+
+def cutlass_fp8_supported():
+    if not _is_cuda:
+        return False
+    major, minor = get_device_capability()
+    cuda_version = get_cuda_version()
+    if major >= 9:
+        return cuda_version >= (12, 0)
+    elif major == 8 and minor == 9:
+        return cuda_version >= (12, 4)
+    return False
+
+
+def normalize_e4m3fn_to_e4m3fnuz(
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    assert weight.dtype == torch.float8_e4m3fn
+    # The bits pattern 10000000(-128) represents zero in e4m3fn
+    # but NaN in e4m3fnuz. So here we set it to 0.
+    # https://onnx.ai/onnx/technical/float8.html
+    weight_as_int8 = weight.view(torch.int8)
+    ROCM_FP8_NAN_AS_INT = -128
+    weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0
+    weight = weight_as_int8.view(torch.float8_e4m3fnuz)
+
+    # For the same bits representation, e4m3fnuz value is half of
+    # the e4m3fn value, so we should double the scaling factor to
+    # get the same dequantized value.
+    # https://onnx.ai/onnx/technical/float8.html
+    weight_scale = weight_scale * 2.0
+    if input_scale is not None:
+        input_scale = input_scale * 2.0
+    return weight, weight_scale, input_scale
+
+
+# TODO(ch-wan): define these backends in --moe-runner-backend
+def cutlass_block_fp8_supported() -> bool:
+    if not get_bool_env_var("SGLANG_SUPPORT_CUTLASS_BLOCK_FP8"):
+        return False
+    if _is_cuda:
+        major, minor = torch.cuda.get_device_capability()
+        sm_version = major * 10 + minor
+        cuda_version = tuple(map(int, torch.version.cuda.split(".")))
+        if cuda_version >= (12, 0) and sm_version >= 90:
+            return True
+    return False
+
+
+CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
+ENABLE_FLASHINFER_GEMM = (
+    get_bool_env_var("SGLANG_ENABLE_FLASHINFER_GEMM")
+    and is_sm100_supported()
+    and is_flashinfer_available()
+)
+if ENABLE_FLASHINFER_GEMM:
+    from flashinfer.gemm import gemm_fp8_nt_groupwise
+
+
+def dispatch_w8a8_block_fp8_linear() -> Callable:
+    if ENABLE_FLASHINFER_GEMM:
+        return flashinfer_gemm_w8a8_block_fp8_linear
+    elif CUTLASS_BLOCK_FP8_SUPPORTED:
+        return cutlass_w8a8_block_fp8_linear_with_fallback
+    elif _use_aiter:
+        return aiter_w8a8_block_fp8_linear
+    elif deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return deepgemm_w8a8_block_fp8_linear_with_fallback
+    else:
+        return triton_w8a8_block_fp8_linear
+
+
+def flashinfer_gemm_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=True
+    )
+    # TRTLLM requires column-major scaling factors
+    output = gemm_fp8_nt_groupwise(
+        q_input,
+        weight,
+        x_scale,
+        weight_scale,
+        out_dtype=input_2d.dtype,
+        backend="trtllm",
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def cutlass_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    # TODO: add more robust shape check here
+    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    if not shape_supported:
+        # fallback to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
+        )
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=True
+    )
+    output = fp8_blockwise_scaled_mm(
+        q_input, weight.T, x_scale, weight_scale.T, out_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def deepgemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    output_dtype = input.dtype
+    dtype_supported = output_dtype == torch.bfloat16
+
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
+
+    if not (shape_supported and dtype_supported):
+        # fall back to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
+        )
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d,
+        block_size[1],
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+    )
+
+    output = w8a8_block_fp8_matmul_deepgemm(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=output_dtype).view(*output_shape)
+
+
+def aiter_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = aiter_per1x128_quant(input_2d, quant_dtype=aiter.dtypes.fp8)
+    output = gemm_a8w8_blockscale(
+        q_input, weight, x_scale, weight_scale, dtype=input.dtype
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def triton_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = w8a8_block_fp8_matmul_triton(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def dequant_mxfp4(
+    w_block: torch.Tensor,
+    w_scale: torch.Tensor,
+    out_dtype,
+) -> torch.Tensor:
+    """
+    :param w_block: (batch, n, k, 16), uint8, pack two mxfp4 into one byte
+    :param w_scale: (batch, n, k), uint8
+    :return: (batch, n, k * 32), float32
+    """
+
+    assert w_block.dtype == torch.uint8
+    assert w_scale.dtype == torch.uint8
+
+    batch, n, k, pack_dim = w_block.shape
+    batch_, n_, k_ = w_scale.shape
+    assert pack_dim == 16
+    assert batch == batch_
+    assert n == n_
+    assert k == k_
+
+    out_raw = MXFP4QuantizeUtil.dequantize(
+        quantized_data=w_block, scale=w_scale, dtype=out_dtype, block_sizes=[32]
+    )
+    return out_raw.reshape(batch, n, k * 32)
+
+
+def input_to_float8(
+    x: torch.Tensor, dtype: torch.dtype = fp8_dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function quantizes input values to float8 values with tensor-wise quantization."""
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).float().clamp(min=1e-12)
+
+    if _is_fp8_fnuz:
+        dtype = fp8_dtype
+        fp_max = fp8_max
+    else:
+        finfo = torch.finfo(dtype)
+        fp_max = finfo.max
+
+    scale = fp_max / amax
+    x_scl_sat = (x.float() * scale).clamp(min=-fp_max, max=fp_max)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+
+
+def block_quant_to_tensor_quant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function converts block-wise quantization to tensor-wise quantization.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The outputs are tensor-wise quantization tensor and tensor-wise quantization scale.
+    Note only float8 is supported for now.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = x_q_block.to(torch.float32)
+
+    x_dq_block_tiles = [
+        [
+            x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]
+
+    x_q_tensor, scale = (
+        scaled_fp8_quant(x_dq_block)
+        if _is_cuda
+        else input_to_float8(x_dq_block, dtype=x_q_block.dtype)
+    )
+    return x_q_tensor, scale
+
+
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    *_, n, k = x_q_block.shape
+
+    # ... n_scale k_scale -> ... (n_scale block_n) (k_scale block_k)
+    x_scale_repeat = x_s.repeat_interleave(block_n, dim=-2).repeat_interleave(
+        block_k, dim=-1
+    )
+    x_scale_repeat = x_scale_repeat[..., :n, :k]
+
+    return (x_q_block.to(torch.float32) * x_scale_repeat).to(dtype)
+
+
+def requant_weight_ue8m0_inplace(weight, weight_scale_inv, weight_block_size):
+    assert isinstance(weight, torch.nn.Parameter)
+    assert isinstance(weight_scale_inv, torch.nn.Parameter)
+    weight.data, weight_scale_inv.data = _requant_weight_ue8m0(
+        weight, weight_scale_inv, weight_block_size
+    )
+
+
+def _requant_weight_ue8m0(
+    weight: torch.Tensor,
+    weight_scale_inv: torch.Tensor,
+    weight_block_size: List[int],
+):
+    assert weight_block_size == [128, 128]
+
+    *_, n, k = weight.shape
+
+    weight_dequant = block_quant_dequant(
+        weight,
+        weight_scale_inv,
+        weight_block_size,
+        torch.bfloat16,
+    )
+
+    weight_dequant_flat = weight_dequant.view((-1, k))
+    out_w_flat, out_s_flat = per_block_cast_to_fp8(weight_dequant_flat)
+
+    out_w = out_w_flat.view(weight.shape)
+    out_s = out_s_flat.view(weight_scale_inv.shape)
+
+    # NOTE copy and modified from DeepGEMM
+    def _transform_scale(sf, mn: int):
+        import deep_gemm.utils.layout
+
+        sf = sf.index_select(-2, torch.arange(mn, device=sf.device) // 128)
+        sf = deep_gemm.utils.layout.get_mn_major_tma_aligned_packed_ue8m0_tensor(sf)
+        return sf
+
+    out_s = _transform_scale(out_s, mn=out_w.shape[-2])
+
+    return out_w, out_s
+
+
+# COPIED FROM DeepGEMM
+def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+# COPIED FROM DeepGEMM
+def ceil_to_ue8m0(x: torch.Tensor):
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def channel_quant_to_tensor_quant(
+    x_q_channel: torch.Tensor,
+    x_s: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    x_dq_channel = x_q_channel.to(torch.float32) * x_s
+    x_q_tensor, scale = (
+        scaled_fp8_quant(x_dq_channel)
+        if _is_cuda
+        else input_to_float8(x_dq_channel, dtype=x_q_channel.dtype)
+    )
+    return x_q_tensor, scale
+
+
+def _process_scaled_mm_output(output, input_2d_shape, output_shape):
+    if type(output) is tuple and len(output) == 2:
+        output = output[0]
+    return torch.narrow(output, 0, 0, input_2d_shape[0]).view(*output_shape)
+
+
+def _apply_fallback_scaled_mm(
+    qinput,
+    weight,
+    x_scale,
+    weight_scale,
+    input_2d_shape,
+    output_shape,
+    bias,
+    input_dtype,
+):
+    global TORCH_DEVICE_IDENTITY
+    if TORCH_DEVICE_IDENTITY is None:
+        TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32, device=weight.device)
+
+    output = torch._scaled_mm(
+        qinput,
+        weight,
+        scale_a=TORCH_DEVICE_IDENTITY,
+        scale_b=TORCH_DEVICE_IDENTITY,
+        out_dtype=torch.float32,
+    )
+
+    output = _process_scaled_mm_output(output, input_2d_shape, output_shape)
+    x_scale = torch.narrow(x_scale, 0, 0, input_2d_shape[0])
+
+    output = output * x_scale * weight_scale.t()
+    if bias is not None:
+        output = output + bias
+    return output.to(dtype=input_dtype)
+
+
+def apply_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    input_scale_ub: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    cutlass_fp8_supported: bool = cutlass_fp8_supported(),
+    use_per_token_if_dynamic: bool = False,
+    pad_output: Optional[bool] = None,
+    compressed_tensor_quant: bool = False,
+) -> torch.Tensor:
+    # Note: we pad the input because torch._scaled_mm is more performant
+    # for matrices with batch dimension > 16.
+    # This could change in the future.
+    # We also don't pad when using torch.compile,
+    # as it breaks with dynamic shapes.
+    if pad_output is None:
+        pad_output = (
+            not get_bool_env_var("SGLANG_ENABLE_TORCH_COMPILE")
+            and not cutlass_fp8_supported
+        )
+    output_padding = 17 if pad_output else None
+
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[1]]
+
+    if compressed_tensor_quant:
+        # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A
+        # for sgl-kernel fp8_scaled_mm, it support per channel W now
+        if cutlass_fp8_supported and weight_scale.numel() == weight.shape[1]:
+            qinput, x_scale = scaled_fp8_quant(
+                input_2d,
+                input_scale,
+                use_per_token_if_dynamic=use_per_token_if_dynamic,
+            )
+
+            # Fused GEMM_DQ
+            if VLLM_AVAILABLE and use_vllm_cutlass_w8a8_fp8_kernel:
+                # Fall back to vllm cutlass w8a8 fp8 kernel
+                output = ops.cutlass_scaled_mm(
+                    qinput,
+                    weight,
+                    out_dtype=input.dtype,
+                    scale_a=x_scale,
+                    scale_b=weight_scale,
+                    bias=bias,
+                )
+            else:
+                assert (
+                    weight_scale.numel() == weight.shape[1]
+                ), "cutlass w8a8 fp8 sgl-kernel only supports per-channel scale"
+
+                cutlass_compatible_b = (
+                    weight.shape[0] % 16 == 0 and weight.shape[1] % 16 == 0
+                )
+                if not cutlass_compatible_b or use_triton_w8a8_fp8_kernel:
+                    # Massage the input to be 2D
+                    qinput = qinput.view(-1, qinput.shape[-1])
+                    output = triton_scaled_mm(
+                        qinput, weight, x_scale, weight_scale, input.dtype, bias
+                    )
+                else:
+                    output = fp8_scaled_mm(
+                        qinput,
+                        weight,
+                        x_scale,
+                        weight_scale,
+                        out_dtype=input.dtype,
+                        bias=bias,
+                    )
+            return output.view(*output_shape)
+
+        # torch.scaled_mm supports per tensor weights + activations only
+        # so fallback to naive if per channel or per token
+        else:
+            # Maybe apply padding to output, see comment in __init__
+            qinput, x_scale = (
+                scaled_fp8_quant(
+                    input_2d,
+                    input_scale,
+                    num_token_padding=output_padding,
+                    use_per_token_if_dynamic=use_per_token_if_dynamic,
+                )
+                if _is_cuda
+                else ops.scaled_fp8_quant(
+                    input_2d,
+                    input_scale,
+                    num_token_padding=output_padding,
+                    use_per_token_if_dynamic=use_per_token_if_dynamic,
+                )
+            )
+
+            per_tensor_weights = weight_scale.numel() == 1
+            per_tensor_activations = x_scale.numel() == 1
+
+            if per_tensor_weights and per_tensor_activations:
+                # Fused GEMM_DQ
+                output = torch._scaled_mm(
+                    qinput,
+                    weight,
+                    out_dtype=input.dtype,
+                    scale_a=x_scale,
+                    scale_b=weight_scale,
+                    bias=bias,
+                )
+                return _process_scaled_mm_output(output, input_2d.shape, output_shape)
+
+            elif (
+                use_per_token_if_dynamic
+                and not per_tensor_weights
+                and not per_tensor_activations
+                and (USE_ROWWISE_TORCH_SCALED_MM or _use_aiter)
+            ):
+                # into this sector means use dynamic per-token-per-channel quant
+                # per-token scale quant for input matrix, every row(one token) have one scale factor
+                # per-channel scale quant for weight matrix, every col(one channel) have one scale factor
+                if _use_aiter:
+                    # gemm_a8w8_bpreshuffle(XQ, WQ, x_scale, w_scale, dtype)
+                    # XQ -> input tensor, shape = (m, k)
+                    # WQ -> weight tensor, shape = (n, k), with preshuffe get better perf
+                    # x_scale -> input scale tensor, shape = (m, 1)
+                    # w_scale -> weight scale tensor, shape = (n ,1)
+                    # dtype -> output dtype
+                    output = gemm_a8w8_bpreshuffle(
+                        XQ=qinput,
+                        WQ=weight,
+                        x_scale=x_scale,
+                        w_scale=weight_scale,
+                        dtype=input.dtype,
+                    )
+                    if bias is not None:
+                        output += bias
+                    return _process_scaled_mm_output(
+                        output, input_2d.shape, [*input.shape[:-1], weight.shape[0]]
+                    )
+                else:
+                    # For now validated on ROCm platform
+                    # fp8 rowwise scaling in torch._scaled_mm is introduced in
+                    # https://github.com/pytorch/pytorch/pull/144432 using hipBLASLt
+                    # and ROCm 6.3, which only exists in torch 2.7 and above.
+                    # For CUDA platform please validate if the
+                    # torch._scaled_mm support rowwise scaled GEMM
+                    # Fused GEMM_DQ Rowwise GEMM
+                    output = torch._scaled_mm(
+                        qinput,
+                        weight,
+                        out_dtype=input.dtype,
+                        scale_a=x_scale,
+                        scale_b=weight_scale.t(),
+                        bias=bias,
+                    )
+                    return _process_scaled_mm_output(
+                        output, input_2d.shape, output_shape
+                    )
+            else:
+                # Fallback for channelwise case, where we use unfused DQ
+                # due to limitations with scaled_mm
+
+                # Symmetric quantized GEMM by definition computes the following:
+                #   C = (s_x * X) (s_w * W) + bias
+                # This is equivalent to dequantizing the weights and activations
+                # before applying a GEMM.
+                #
+                # In order to compute quantized operands, a quantized kernel
+                # will rewrite the above like so:
+                #   C = s_w * s_x * (X * W) + bias
+                #
+                # For the scaled_mm fallback case, we break this down, since it
+                # does not support s_w being a vector.
+                return _apply_fallback_scaled_mm(
+                    qinput,
+                    weight,
+                    x_scale,
+                    weight_scale,
+                    input_2d.shape,
+                    output_shape,
+                    bias,
+                    input.dtype,
+                )
+    else:
+        # cutlass w8a8 fp8 sgl-kernel only supports per-token scale
+        if input_scale is not None:
+            assert input_scale.numel() == 1
+            # broadcast per-tensor scale to per-token scale when supporting cutlass
+            qinput, x_scale = static_quant_fp8(
+                input_2d, input_scale, repeat_scale=cutlass_fp8_supported
+            )
+        else:
+            # default use per-token quantization if dynamic
+            if _is_cuda:
+                qinput, x_scale = sglang_per_token_quant_fp8(input_2d)
+            else:
+                # TODO(kkhuang): temporarily enforce per-tensor activation scaling if weight is per-tensor scaling
+                # final solution should be: 1. add support to per-tensor activation scaling.
+                # 2. solve the torch.compile error from weight_scale.numel() == 1 and x_scale.numel() > 1 (below line#308)
+                if _is_hip and weight_scale.numel() == 1:
+                    qinput, x_scale = ops.scaled_fp8_quant(
+                        input_2d,
+                        input_scale,
+                        use_per_token_if_dynamic=use_per_token_if_dynamic,
+                    )
+                else:
+                    qinput, x_scale = per_token_group_quant_fp8(
+                        input_2d, group_size=input_2d.shape[1]
+                    )
+
+        if cutlass_fp8_supported:
+            try:
+                if VLLM_AVAILABLE and use_vllm_cutlass_w8a8_fp8_kernel:
+                    # Fall back to vllm cutlass w8a8 fp8 kernel
+                    output = ops.cutlass_scaled_mm(
+                        qinput,
+                        weight,
+                        out_dtype=input.dtype,
+                        scale_a=x_scale,
+                        scale_b=weight_scale,
+                        bias=bias,
+                    )
+                else:
+                    assert (
+                        weight_scale.numel() == weight.shape[1]
+                    ), "cutlass w8a8 fp8 sgl-kernel only supports per-channel scale"
+
+                    cutlass_compatible_b = (
+                        weight.shape[0] % 16 == 0 and weight.shape[1] % 16 == 0
+                    )
+                    if not cutlass_compatible_b or use_triton_w8a8_fp8_kernel:
+                        # Massage the input to be 2D
+                        qinput = qinput.view(-1, qinput.shape[-1])
+                        output = triton_scaled_mm(
+                            qinput, weight, x_scale, weight_scale, input.dtype, bias
+                        )
+                    else:
+                        output = fp8_scaled_mm(
+                            qinput,
+                            weight,
+                            x_scale,
+                            weight_scale,
+                            out_dtype=input.dtype,
+                            bias=bias,
+                        )
+                return output.view(*output_shape)
+            except (ImportError, NameError, AttributeError):
+                pass
+
+        # torch.scaled_mm supports per tensor weights + activations only
+        # so fallback to naive if per channel or per token
+        per_tensor_weights = weight_scale.numel() == 1
+        per_tensor_activations = x_scale.numel() == 1
+
+        if per_tensor_weights and per_tensor_activations:
+            # Fused GEMM_DQ
+            output = torch._scaled_mm(
+                qinput,
+                weight,
+                out_dtype=input.dtype,
+                scale_a=x_scale,
+                scale_b=weight_scale,
+                bias=bias,
+            )
+            return _process_scaled_mm_output(output, input_2d.shape, output_shape)
+
+        else:
+            # Fallback for channelwise case, where we use unfused DQ
+            # due to limitations with scaled_mm
+
+            # Symmetric quantized GEMM by definition computes the following:
+            #   C = (s_x * X) (s_w * W) + bias
+            # This is equivalent to dequantizing the weights and activations
+            # before applying a GEMM.
+            #
+            # In order to compute quantized operands, a quantized kernel
+            # will rewrite the above like so:
+            #   C = s_w * s_x * (X * W) + bias
+            #
+            # For the scaled_mm fallback case, we break this down, since it
+            # does not support s_w being a vector.
+            return _apply_fallback_scaled_mm(
+                qinput,
+                weight,
+                x_scale,
+                weight_scale,
+                input_2d.shape,
+                output_shape,
+                bias,
+                input.dtype,
+            )
+
+
+def can_auto_enable_marlin_fp8() -> bool:
+    try:
+        major, minor = get_device_capability()
+        sm = major * 10 + minor
+        return 80 <= sm < 89
+    except Exception:
+        return False
diff --git a/python/sglang/srt/layers/quantization/fpgemm_fp8.py b/python/sglang/srt/layers/quantization/fpgemm_fp8.py
new file mode 100644
index 000000000..5a78626ff
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/fpgemm_fp8.py
@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import logging
+from typing import Any, Optional
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.linear import LinearBase
+from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    can_auto_enable_marlin_fp8,
+    cutlass_fp8_supported,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.layers.quantization.marlin_utils_fp8 import (
+    apply_fp8_marlin_linear,
+    prepare_fp8_layer_for_marlin,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped, replace_parameter
+from sglang.srt.utils import get_bool_env_var, is_cuda
+
+_is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
+
+logger = logging.getLogger(__name__)
+
+
+class FBGEMMFp8Config(QuantizationConfig):
+    """Config class for FBGEMM Fp8."""
+
+    def __init__(self, ignore_list: list[str], input_scale_ub: float):
+        super().__init__()
+        self.ignore_list = ignore_list if ignore_list else []
+        self.input_scale_ub = input_scale_ub
+
+        # For GPUs that lack FP8 hardware suspport, we can leverage the Marlin
+        # kernel for fast weight-only FP8 quantization
+        # self.use_marlin = not marlin_fp8_supported()
+        self.use_marlin = False
+        if _is_cuda:
+            force_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN")
+            auto_enable = can_auto_enable_marlin_fp8()
+            self.use_marlin = force_marlin or auto_enable
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "fbgemm_fp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.bfloat16, torch.float16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> FBGEMMFp8Config:
+        ignore_list = cls.get_from_keys(config, ["modules_to_not_convert"])
+        input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"])
+        return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignore_list,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedLinearMethod()
+            return FBGEMMFp8LinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class FBGEMMFp8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quant_config: FBGEMMFp8Config):
+        self.quant_config = quant_config
+        # self.fp8_linear = Fp8LinearOp(
+        #     act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN)
+        self.out_dtype = torch.get_default_dtype()
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # maybe_create_device_identity()
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        del input_size, output_size
+        output_size_per_partition = sum(output_partition_sizes)
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE UPPER BOUND
+        input_scale_ub = torch.nn.Parameter(
+            torch.tensor((self.quant_config.input_scale_ub), dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.input_scale_ub = input_scale_ub
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # required by torch.compile
+        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
+        layer.weight = Parameter(layer.weight.data, requires_grad=False)
+
+        weight = layer.weight
+
+        if _is_fp8_fnuz:
+            weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                weight=weight, weight_scale=layer.weight_scale, input_scale=None
+            )
+            if input_scale is not None:
+                layer.input_scale = Parameter(input_scale, requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+        layer.weight = Parameter(weight.t(), requires_grad=False)
+        if self.quant_config.use_marlin:
+            prepare_fp8_layer_for_marlin(layer)
+            # Activations not quantized for marlin.
+            del layer.input_scale_ub
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        if self.quant_config.use_marlin:
+            return apply_fp8_marlin_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                workspace=layer.workspace,
+                size_n=layer.output_size_per_partition,
+                size_k=layer.input_size_per_partition,
+                bias=bias,
+            )
+
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            input_scale_ub=layer.input_scale_ub,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=False,
+        )
diff --git a/python/sglang/srt/layers/quantization/gptq.py b/python/sglang/srt/layers/quantization/gptq.py
new file mode 100644
index 000000000..ccd3d46f7
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/gptq.py
@@ -0,0 +1,1097 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from fractions import Fraction
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+
+import torch
+
+from sglang.srt.layers.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    RowvLLMParameter,
+    permute_param_layout_,
+)
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.marlin_utils import (
+    apply_gptq_marlin_linear,
+    check_marlin_supported,
+    check_marlin_supports_shape,
+    marlin_is_k_full,
+    marlin_make_empty_g_idx,
+    marlin_make_workspace,
+    marlin_moe_permute_scales,
+    marlin_permute_scales,
+    marlin_repeat_scales_on_all_ranks,
+    marlin_sort_g_idx,
+    marlin_zero_points,
+    verify_marlin_supported,
+)
+from sglang.srt.layers.quantization.utils import (
+    get_linear_quant_method,
+    get_scalar_types,
+    replace_parameter,
+    unpack_cols,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        StandardDispatchOutput,
+        CombineInput,
+    )
+
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel import fused_marlin_moe, gptq_gemm, gptq_marlin_repack, gptq_shuffle
+
+
+logger = logging.getLogger(__name__)
+ScalarType, scalar_types = get_scalar_types()
+
+
+def check_marlin_format(hf_quant_cfg: Dict[str, Any]) -> bool:
+    # compat: gptqmodel and autogptq (eol) main use checkpoint_format: str
+    # compat: autogptq <=0.7.1 is_marlin_format: bool
+    return hf_quant_cfg.get("checkpoint_format") == "marlin" or hf_quant_cfg.get(
+        "is_marlin_format", False
+    )
+
+
+def gptq_marlin_moe_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
+    num_experts = b_q_weight.shape[0]
+    assert size_k % 16 == 0
+    output = torch.empty(
+        (num_experts, size_k // 16, size_n * (num_bits // 2)),
+        device=b_q_weight.device,
+        dtype=b_q_weight.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = gptq_marlin_repack(b_q_weight[e], perm[e], size_k, size_n, num_bits)
+    return output
+
+
+@dataclass
+class MarlinLinearLayerConfig:
+    full_weight_shape: tuple[int, int]  # [in, out]
+    partition_weight_shape: tuple[int, int]
+    weight_type: ScalarType
+    act_type: torch.dtype
+    group_size: int
+    zero_points: bool
+    has_g_idx: bool
+
+
+class GPTQConfig(QuantizationConfig):
+    """Config class for GPTQ.
+
+    Reference: https://arxiv.org/abs/2210.17323
+    """
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
+    ) -> None:
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #  #`.*\.` matches the layers_node prefix
+        #  # positive match layer 10-15
+        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #  # positive match layer 16-21
+        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
+        # }
+        super().__init__()
+        self.dynamic = dynamic
+
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
+        self.pack_factor = Fraction(32, self.weight_bits)
+        if self.weight_bits not in [2, 3, 4, 8]:
+            raise ValueError(
+                "Currently, only 2/3/4/8-bit weight quantization is "
+                f"supported for GPTQ, but got {self.weight_bits} bits."
+            )
+
+    def __repr__(self) -> str:
+        return (
+            f"GPTQConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act}),"
+            f"lm_head_quantized={self.lm_head_quantized}), "
+            f"dynamic={self.dynamic}"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        return 60
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> GPTQConfig:
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(weight_bits, group_size, desc_act, lm_head_quantized, dynamic)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[LinearMethodBase]:
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            raise TypeError("GPTQ Method does not support MoE, please use gptq_marlin")
+        else:
+            return get_linear_quant_method(
+                self, layer, prefix=prefix, linear_method_cls=GPTQLinearMethod
+            )
+
+
+class GPTQMarlinConfig(QuantizationConfig):
+    """Config class for GPTQ Marlin"""
+
+    # (num_bits, is_sym) -> quant_type
+    TYPE_MAP = {
+        (4, True): scalar_types.uint4b8,
+        (8, True): scalar_types.uint8b128,
+    }
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        is_sym: bool,
+        lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
+        full_config: Dict[str, Any],
+    ) -> None:
+        super().__init__()
+        if desc_act and group_size == -1:
+            # In this case, act_order == True is the same as act_order == False
+            # (since we have only one group per output channel)
+            desc_act = False
+
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #  #`.*\.` matches the layers_node prefix
+        #  # positive match layer 10-15
+        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #  # positive match layer 16-21
+        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
+        # }
+        self.dynamic = dynamic
+
+        self.weight_bits = weight_bits
+        self.is_sym = is_sym
+
+        self.pack_factor = 32 // weight_bits  # packed into int32
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
+        self.full_config = full_config
+
+        if (weight_bits, is_sym) not in self.TYPE_MAP:
+            raise ValueError(
+                "Unsupported quantization config: " f"bits={weight_bits}, sym={is_sym}"
+            )
+
+        # (num_bits, is_sym) -> quant_type
+        self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
+
+    def __repr__(self) -> str:
+        return (
+            f"GPTQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act}, "
+            f"lm_head_quantized={self.lm_head_quantized}), "
+            f"dynamic={self.dynamic}"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq_marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> GPTQMarlinConfig:
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        is_sym = cls.get_from_keys(config, ["sym"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(
+            weight_bits,
+            group_size,
+            desc_act,
+            is_sym,
+            lm_head_quantized,
+            dynamic,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        is_marlin_format = check_marlin_format(hf_quant_cfg)
+
+        can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg)
+
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin"
+        )
+
+        if not is_marlin_format and can_convert and is_valid_user_quant:
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        if not is_marlin_format and can_convert and user_quant == "gptq":
+            logger.info(
+                "Detected that the model can run with gptq_marlin"
+                ", however you specified quantization=gptq explicitly,"
+                " so forcing gptq. Use quantization=gptq_marlin for"
+                " faster inference"
+            )
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            return GPTQMarlinMoEMethod(self)
+        return get_linear_quant_method(self, layer, prefix, GPTQMarlinLinearMethod)
+
+    @classmethod
+    def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]):
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        group_size = quant_config.get("group_size")
+        sym = quant_config.get("sym")
+        desc_act = quant_config.get("desc_act")
+
+        if not _is_cuda:
+            return False
+
+        if quant_method != "gptq":
+            return False
+
+        # Marlin conversion is only valid if required properties are found
+        if num_bits is None or group_size is None or sym is None or desc_act is None:
+            return False
+
+        if (num_bits, sym) not in cls.TYPE_MAP:
+            return False
+
+        return check_marlin_supported(
+            quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size
+        )
+
+
+class GPTQLinearMethod(LinearMethodBase):
+    """Linear method for GPTQ.
+
+    Args:
+        quant_config: The GPTQ quantization config.
+    """
+
+    def __init__(self, quant_config: GPTQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del output_size  # Unused.
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        if input_size_per_partition % self.quant_config.group_size != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.pack_factor.numerator != 0:
+            raise ValueError(
+                "The output size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        self.use_shuffle = True
+        scale_and_zero_size = input_size // group_size
+        scale_and_zero_input_dim = None
+        if (
+            input_size != input_size_per_partition
+            and self.quant_config.group_size != -1
+        ):
+            if self.quant_config.desc_act:
+                self.use_shuffle = False
+            else:
+                # we need to partition qzeros and scales for exllama kernel
+                scale_and_zero_size = input_size_per_partition // group_size
+                scale_and_zero_input_dim = 0
+
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.pack_factor,
+                output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=0,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        g_idx = RowvLLMParameter(
+            data=torch.tensor(
+                [
+                    i // self.quant_config.group_size
+                    for i in range(input_size_per_partition)
+                ],
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            weight_loader=weight_loader,
+        )
+        qzeros_args = {
+            "data": torch.empty(
+                scale_and_zero_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            "weight_loader": weight_loader,
+        }
+        weight_scale_args = {
+            "data": torch.empty(
+                scale_and_zero_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            "weight_loader": weight_loader,
+        }
+        if scale_and_zero_input_dim is None:
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+            qzeros = PackedColumnParameter(
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        else:
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
+            qzeros = PackedvLLMParameter(
+                input_dim=0,
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("g_idx", g_idx)
+        layer.register_parameter("qzeros", qzeros)
+        layer.register_parameter("scales", scales)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # for torch.compile
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.g_idx = torch.nn.Parameter(layer.g_idx.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+
+        # exllama needs to shuffle the weight after the weight is loaded
+        # here we do the shuffle on first forward pass
+        if self.use_shuffle:
+            if self.quant_config.desc_act:
+                layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int)
+            else:
+                layer.g_idx.data = torch.empty(
+                    (0,), dtype=torch.int, device=layer.g_idx.device
+                )
+            gptq_shuffle(layer.qweight, layer.g_idx, self.quant_config.weight_bits)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1],)
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
+        output = gptq_gemm(
+            reshaped_x,
+            layer.qweight,
+            layer.qzeros,
+            layer.scales,
+            layer.g_idx,
+            self.use_shuffle,
+            self.quant_config.weight_bits,
+        )
+        if bias is not None:
+            output.add_(bias)
+        return output.reshape(out_shape)
+
+
+class GPTQMarlinLinearMethod(LinearMethodBase):
+    """Linear method for GPTQ Marlin.
+
+    Args:
+        quant_config: The GPTQ Marlin quantization config.
+    """
+
+    _kernel_backends_being_used: set[str] = set()
+
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+        # Verify supported on platform.
+        verify_marlin_supported(
+            quant_type=self.quant_config.quant_type,
+            group_size=self.quant_config.group_size,
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        output_size_per_partition = sum(output_partition_sizes)
+        is_row_parallel = input_size != input_size_per_partition
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        self.kernel_config = MarlinLinearLayerConfig(
+            full_weight_shape=(input_size, output_size),
+            partition_weight_shape=(
+                input_size_per_partition,
+                output_size_per_partition,
+            ),
+            weight_type=self.quant_config.quant_type,
+            act_type=params_dtype,
+            group_size=self.quant_config.group_size,
+            zero_points=False,
+            has_g_idx=self.quant_config.desc_act,
+        )
+        # Normalize group_size
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        # Determine sharding
+        if marlin_repeat_scales_on_all_ranks(
+            self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel
+        ):
+            # By setting scale_dim == None, weight_loader will
+            # repeat the scales on each GPU in TP>1 case.
+            scales_and_zp_input_dim = None
+            scales_and_zp_size = input_size // group_size
+        else:
+            # By setting scale_dim == 0, weight_loader will
+            # shard the scales in TP>1 case.
+            scales_and_zp_input_dim = 0
+            scales_and_zp_size = input_size_per_partition // group_size
+
+        # Quantized weights
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.pack_factor,
+                output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=0,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        # Activation order
+        g_idx = RowvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        qzeros_args = {
+            "data": torch.empty(
+                scales_and_zp_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            "weight_loader": weight_loader,
+        }
+        weight_scale_args = {
+            "data": torch.empty(
+                scales_and_zp_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            "weight_loader": weight_loader,
+        }
+
+        if scales_and_zp_input_dim is None:
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+            qzeros = PackedColumnParameter(
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        else:
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
+            qzeros = PackedvLLMParameter(
+                input_dim=0,
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("g_idx", g_idx)
+        layer.register_parameter("scales", scales)
+        layer.register_parameter("qzeros", qzeros)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        device = getattr(layer, "qweight").device
+        c = self.kernel_config
+
+        check_marlin_supports_shape(
+            c.partition_weight_shape[1],  # out_features
+            c.partition_weight_shape[0],  # in_features
+            c.full_weight_shape[0],  # in_features
+            c.group_size,
+        )
+
+        row_parallel = c.partition_weight_shape[0] != c.full_weight_shape[0]
+        self.is_k_full = marlin_is_k_full(c.has_g_idx, row_parallel)
+
+        # Allocate marlin workspace.
+        self.workspace = marlin_make_workspace(device)
+
+        # Default names since marlin requires empty parameters for these,
+        # TODO: remove this requirement from marlin (allow optional tensors)
+        self.w_q_name = "qweight"
+        self.w_s_name = "scales"
+        self.w_zp_name = "qzeros"
+        self.w_gidx_name = "g_idx"
+
+        def _transform_param(
+            layer: torch.nn.Module, name: Optional[str], fn: Callable
+        ) -> None:
+            if name is not None and getattr(layer, name, None) is not None:
+
+                old_param = getattr(layer, name)
+                new_param = fn(old_param)
+                # replace the parameter with torch.nn.Parameter for TorchDynamo
+                # compatibility
+                replace_parameter(
+                    layer, name, torch.nn.Parameter(new_param.data, requires_grad=False)
+                )
+
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            x.data = gptq_marlin_repack(
+                x.data.contiguous(),
+                perm=layer.g_idx_sort_indices,
+                size_k=c.partition_weight_shape[0],
+                size_n=c.partition_weight_shape[1],
+                num_bits=c.weight_type.size_bits,
+            )
+            return x
+
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = marlin_permute_scales(
+                x.data.contiguous(),
+                size_k=c.partition_weight_shape[0],
+                size_n=c.partition_weight_shape[1],
+                group_size=c.group_size,
+            )
+            return x
+
+        if c.has_g_idx:
+            g_idx, g_idx_sort_indices = marlin_sort_g_idx(
+                getattr(layer, self.w_gidx_name)
+            )
+            _transform_param(layer, self.w_gidx_name, lambda _: g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+        else:
+            setattr(layer, self.w_gidx_name, marlin_make_empty_g_idx(device))
+            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+        if c.zero_points:
+            grouped_k = (
+                c.partition_weight_shape[0] // c.group_size if c.group_size != -1 else 1
+            )
+            _transform_param(
+                layer,
+                self.w_zp_name,
+                lambda x: marlin_zero_points(
+                    unpack_cols(
+                        x.t(),
+                        c.weight_type.size_bits,
+                        grouped_k,
+                        c.partition_weight_shape[1],
+                    ),
+                    size_k=grouped_k,
+                    size_n=c.partition_weight_shape[1],
+                    num_bits=c.weight_type.size_bits,
+                ),
+            )
+        else:
+            setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
+        _transform_param(layer, self.w_q_name, transform_w_q)
+        _transform_param(layer, self.w_s_name, transform_w_s)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        c = self.kernel_config
+
+        def _get_weight_params(
+            layer: torch.nn.Module,
+        ) -> tuple[
+            torch.Tensor,  # w_q
+            torch.Tensor,  # w_s
+            Optional[torch.Tensor],  # w_zp,
+            Optional[torch.Tensor],  # w_gidx
+        ]:
+            return (
+                getattr(layer, self.w_q_name),
+                getattr(layer, self.w_s_name),
+                getattr(layer, self.w_zp_name or "", None),
+                getattr(layer, self.w_gidx_name or "", None),
+            )
+
+        w_q, w_s, w_zp, w_gidx = _get_weight_params(layer)
+
+        # `process_weights_after_loading` will ensure w_zp and w_gidx are not
+        #  None for marlin
+        return apply_gptq_marlin_linear(
+            input=x,
+            weight=w_q,
+            weight_scale=w_s,
+            weight_zp=w_zp,  # type: ignore
+            g_idx=w_gidx,  # type: ignore
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=self.workspace,
+            wtype=c.weight_type,
+            input_size_per_partition=c.partition_weight_shape[0],
+            output_size_per_partition=c.partition_weight_shape[1],
+            is_k_full=self.is_k_full,
+            bias=bias,
+        )
+
+
+class GPTQMarlinMoEMethod(FusedMoEMethodBase):
+    """MoE Marlin method with quantization."""
+
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.linear import set_weight_attrs
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.is_k_full = (not self.quant_config.desc_act) or layer.moe_tp_size == 1
+
+        if self.quant_config.group_size != -1:
+            scales_size13 = hidden_size // self.quant_config.group_size
+            if self.quant_config.desc_act:
+                w2_scales_size = intermediate_size_per_partition
+            else:
+                w2_scales_size = intermediate_size_per_partition * layer.moe_tp_size
+            scales_size2 = w2_scales_size // self.quant_config.group_size
+            strategy = FusedMoeWeightScaleSupported.GROUP.value
+        else:
+            scales_size13 = 1
+            scales_size2 = 1
+            strategy = FusedMoeWeightScaleSupported.CHANNEL.value
+
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": True})
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.quant_config.pack_factor,
+                2 * intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition // self.quant_config.pack_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+        # up_proj scales
+        w13_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition,
+                dtype=torch.half,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+        # down_proj scales
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts, scales_size2, hidden_size, dtype=torch.half),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+        # dont shard the w2 scales when running act order
+        set_weight_attrs(w2_scales, {"load_full_w2": self.quant_config.desc_act})
+        # up_proj scales
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+        # down_proj scales
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+        # dont shard the w2 scales when running act order
+        set_weight_attrs(w2_qzeros, {"load_full_w2": self.quant_config.desc_act})
+        w13_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx", w13_g_idx)
+        set_weight_attrs(w13_g_idx, extra_weight_attrs)
+        w2_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx", w2_g_idx)
+        set_weight_attrs(w2_g_idx, extra_weight_attrs)
+        w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+        w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # Process act_order
+        if self.quant_config.desc_act:
+            # Get sorting based on g_idx
+            num_experts = layer.w13_g_idx.shape[0]
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx)
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_g_idx[e]).to(
+                    torch.int32
+                )
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to(
+                    torch.int32
+                )
+                w13_sorted_g_idx[e] = layer.w13_g_idx[e][w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_g_idx[e][w2_g_idx_sort_indices[e]]
+            replace_parameter(layer, "w13_g_idx", w13_sorted_g_idx)
+            replace_parameter(layer, "w2_g_idx", w2_sorted_g_idx)
+            replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+            replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+        else:
+            # Reset g_idx related tensors
+            num_experts = layer.w13_g_idx.shape[0]
+            device = layer.w13_g_idx.device
+            layer.w13_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+        # Repack weights
+        marlin_w13_qweight = gptq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            layer.w13_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w13_qweight.shape[2],
+            self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
+        marlin_w2_qweight = gptq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            layer.w2_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w2_qweight.shape[2],
+            self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
+        # Repack scales
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w13_scales", marlin_w13_scales)
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.w2_scales.shape[1]
+            * (
+                self.quant_config.group_size
+                if self.quant_config.group_size != -1
+                else self.quant_config.pack_factor
+            ),
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w2_scales", marlin_w2_scales)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        # Delay the import to avoid circular dependency
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        # The input must currently be float16
+        orig_dtype = x.dtype
+        x = x.half()
+
+        topk_weights, topk_ids, router_logits = topk_output
+
+        output = fused_marlin_moe(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            layer.w13_scales,
+            layer.w2_scales,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            g_idx1=layer.w13_g_idx,
+            g_idx2=layer.w2_g_idx,
+            sort_indices1=layer.w13_g_idx_sort_indices,
+            sort_indices2=layer.w2_g_idx_sort_indices,
+            num_bits=self.quant_config.weight_bits,
+            is_k_full=self.is_k_full,
+        ).to(orig_dtype)
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/quantization/int8_kernel.py b/python/sglang/srt/layers/quantization/int8_kernel.py
new file mode 100644
index 000000000..7c6c3dbd4
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/int8_kernel.py
@@ -0,0 +1,425 @@
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import get_device_name, is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import sgl_per_token_group_quant_int8
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def _per_token_quant_int8(
+    x_ptr,
+    xq_ptr,
+    scale_ptr,
+    x_sum_ptr,
+    stride_x,
+    stride_xq,
+    N,
+    CAL_SUM: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
+    row_id = tl.program_id(0)
+
+    cols = tl.arange(0, BLOCK)
+    mask = cols < N
+
+    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
+    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
+    scale_x = absmax / 127
+    x_q = x * (127 / absmax)
+    x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8)
+    if CAL_SUM:
+        x_sum = tl.sum(x, axis=0)
+        tl.store(x_sum_ptr + row_id, x_sum.to(x_sum_ptr.dtype.element_ty))
+
+    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
+    tl.store(scale_ptr + row_id, scale_x.to(scale_ptr.dtype.element_ty))
+
+
+def per_token_quant_int8(x, scale_dtype=torch.float32, cal_sum=False):
+    M = x.numel() // x.shape[-1]
+    N = x.shape[-1]
+    x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
+    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=scale_dtype)
+    if cal_sum:
+        x_sum = torch.empty(x.shape[:-1], device=x.device, dtype=x.dtype)
+    else:
+        x_sum = None
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+
+    assert x.is_contiguous()
+    _per_token_quant_int8[(M,)](
+        x,
+        x_q,
+        scales,
+        x_sum,
+        stride_x=x.stride(-2),
+        stride_xq=x_q.stride(-2),
+        N=N,
+        CAL_SUM=cal_sum,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    if cal_sum:
+        return x_q, scales, x_sum
+    else:
+        return x_q, scales
+
+
+@triton.jit
+def _per_token_group_quant_int8(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    # Stride of input
+    y_stride,
+    # Columns of input
+    N,
+    # Avoid to divide zero
+    eps,
+    # Information for int8
+    int8_min,
+    int8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group quantization on a
+    tensor.
+
+    This function converts the tensor values into int8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    y_s_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / int8_max
+    y_q = tl.clamp(y / y_s, int8_min, int8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+def per_token_group_quant_int8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = torch.int8,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform per-token-group quantization on an input tensor `x`.
+
+    It converts the tensor values into signed int8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+
+    Args:
+        x: The input tenosr with ndim >= 2.
+        group_size: The group size used for quantization.
+        eps: The minimum to avoid dividing zero.
+        dtype: The dype of output tensor. Note that only `torch.int8` is supported for now.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    iinfo = torch.iinfo(dtype)
+    int8_max = iinfo.max
+    int8_min = iinfo.min
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    M = x.numel() // group_size
+    N = group_size
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size,),
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    _per_token_group_quant_int8[(M,)](
+        x,
+        x_q,
+        x_s,
+        group_size,
+        N,
+        eps,
+        int8_min=int8_min,
+        int8_max=int8_max,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    return x_q, x_s
+
+
+def sglang_per_token_group_quant_int8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = torch.int8,
+):
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    iinfo = torch.iinfo(dtype)
+    int8_max = iinfo.max
+    int8_min = iinfo.min
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size,),
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    sgl_per_token_group_quant_int8(x, x_q, x_s, group_size, eps, int8_min, int8_max)
+
+    return x_q, x_s
+
+
+@triton.jit
+def _w8a8_block_int8_matmul(
+    # Pointers to inputs and output
+    A,
+    B,
+    C,
+    As,
+    Bs,
+    # Shape for matmul
+    M,
+    N,
+    K,
+    # Block size for block-wise quantization
+    group_n,
+    group_k,
+    # Stride for inputs and output
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_As_m,
+    stride_As_k,
+    stride_Bs_k,
+    stride_Bs_n,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Triton-accelerated function used to perform linear operations (dot
+    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
+    tensor `C`.
+    """
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    As_ptrs = As + offs_am * stride_As_m
+    offs_bsn = offs_bn // group_n
+    Bs_ptrs = Bs + offs_bsn * stride_Bs_n
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+
+        k_start = k * BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b).to(tl.float32) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if C.dtype.element_ty == tl.bfloat16:
+        c = accumulator.to(tl.bfloat16)
+    elif C.dtype.element_ty == tl.float16:
+        c = accumulator.to(tl.float16)
+    else:
+        c = accumulator.to(tl.float32)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+@functools.lru_cache
+def get_w8a8_block_int8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the w8a8 block fp8 kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    device_name = get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json"
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            logger.info(
+                "Using configuration from %s for W8A8 Block INT8 kernel.",
+                config_file_path,
+            )
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default W8A8 Block INT8 kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s"
+        ),
+        config_file_path,
+    )
+    return None
+
+
+def w8a8_block_int8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with block-wise quantization.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128].
+        output_dytpe: The dtype of the returned tensor.
+
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
+    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    N, K = B.shape
+    assert triton.cdiv(N, block_n) == Bs.shape[0]
+    assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    else:
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
+
+    _w8a8_block_int8_matmul[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
+
+    return C
diff --git a/python/sglang/srt/layers/quantization/int8_utils.py b/python/sglang/srt/layers/quantization/int8_utils.py
new file mode 100644
index 000000000..9a7aa6d82
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/int8_utils.py
@@ -0,0 +1,73 @@
+from typing import List, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.quantization.int8_kernel import (
+    per_token_group_quant_int8,
+    w8a8_block_int8_matmul,
+)
+
+
+def apply_w8a8_block_int8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_int8(input_2d, block_size[1])
+    output = w8a8_block_int8_matmul(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+    )
+
+    if bias is not None:
+        output = output + bias
+    return output.to(dtype=input.dtype).view(*output_shape)
+
+
+def input_to_int8(
+    x: torch.Tensor, dtype: torch.dtype = torch.int8
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function quantizes input values to int8 values with tensor-wise quantization."""
+    iinfo = torch.iinfo(dtype)
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    int8_min, int8_max = iinfo.min, iinfo.max
+    scale = int8_max / amax
+    x_scl_sat = (x * scale).clamp(min=int8_min, max=int8_max)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+
+
+def block_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+) -> torch.Tensor:
+    """This function conducts block-wise dequantization.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The outputs are dequantized tensor.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = x_q_block.to(torch.float32)
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ] *= x_s[j][i]
+
+    return x_dq_block
diff --git a/python/sglang/srt/layers/quantization/kv_cache.py b/python/sglang/srt/layers/quantization/kv_cache.py
new file mode 100644
index 000000000..7da2dac17
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/kv_cache.py
@@ -0,0 +1,82 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/kv_cache.py
+
+import logging
+
+import torch
+
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.radix_attention import RadixAttention
+
+logger = logging.getLogger(__name__)
+
+
+class BaseKVCacheMethod(QuantizeMethodBase):
+    """
+    Quant method that adds `k_scale` and `v_scale` attributes to the
+    Attention layer to support loading those scaling factors from checkpoints.
+    The k/v_scale will be used to:
+        - quantize k/v_cache entries before saving them to the cache
+        - dequantize k/v_cache entries before fetching them from the cache
+
+    :param quant_config: the appropriate QuantizationConfig
+    """
+
+    def __init__(self, quant_config: QuantizationConfig):
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: torch.nn.Module):
+        """
+        Create "weight" (aka k_scale and v_scale) for an attention layer.
+        """
+        # Initialize the KV cache scales to -1.0, which is an invalid value.
+        # If the k/v_scale appears in the checkpoint, it will be
+        # overwritten when loading weights.
+        layer.k_scale = torch.nn.Parameter(
+            torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
+        )
+        layer.v_scale = torch.nn.Parameter(
+            torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
+        )
+
+    def apply(self, layer: torch.nn.Module) -> torch.Tensor:
+        raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
+
+    def process_weights_after_loading(self, layer: RadixAttention) -> None:
+        if layer.k_scale > 0.0 and layer.v_scale > 0.0:
+            # We prefer to use separate k_scale and v_scale if present
+            k_scale = layer.k_scale.to("cpu").tolist()
+            v_scale = layer.v_scale.to("cpu").tolist()
+            if is_fp8_fnuz():
+                k_scale *= 2
+                v_scale *= 2
+        elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
+            # If no scales were loaded (both scales are invalid negative
+            # values), use the default value of 1.0
+            k_scale = 1.0
+            v_scale = 1.0
+        else:
+            # If we find a single kv_scale in the checkpoint, we remap
+            # kv_scale to k_scale during weight loading, and duplicate
+            # k_scale to v_scale here
+            assert layer.k_scale > 0.0
+            scale_to_duplicate = max(layer.k_scale, layer.v_scale)
+            k_scale = scale_to_duplicate.to("cpu").tolist()
+            v_scale = scale_to_duplicate.to("cpu").tolist()
+            if is_fp8_fnuz():
+                k_scale *= 2
+                v_scale *= 2
+
+        if not isinstance(k_scale, float) or not isinstance(v_scale, float):
+            raise ValueError(
+                "Only support per-tensor scaling factor " "for fp8 KV cache"
+            )
+
+        # These are used in the final Attention.forward()
+        layer.k_scale.copy_(k_scale)
+        layer.v_scale.copy_(v_scale)
+        layer.k_scale_float = k_scale
+        layer.v_scale_float = v_scale
diff --git a/python/sglang/srt/layers/quantization/marlin_utils.py b/python/sglang/srt/layers/quantization/marlin_utils.py
new file mode 100644
index 000000000..e0b398c25
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/marlin_utils.py
@@ -0,0 +1,808 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy
+import torch
+
+from sglang.srt.layers.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+)
+from sglang.srt.layers.quantization.utils import (
+    get_scalar_types,
+    pack_cols,
+    unpack_cols,
+)
+from sglang.srt.utils import get_device_capability, is_cuda
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.linear import LinearBase
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+try:
+    from vllm import _custom_ops as ops
+except ImportError:
+    ops = None
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel import gptq_marlin_gemm
+
+logger = logging.getLogger(__name__)
+
+ScalarType, scalar_types = get_scalar_types()
+
+GPTQ_MARLIN_TILE = 16
+GPTQ_MARLIN_MIN_THREAD_N = 64
+GPTQ_MARLIN_MIN_THREAD_K = 128
+GPTQ_MARLIN_MAX_PARALLEL = 16
+
+MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+
+# In case there is a performance issue with Marlin, the variable below can be
+# changed to False, which allows Marlin to perform global reductions in fp16
+# precision (instead of fp32), and therefore, save on some memory movements.
+USE_FP32_REDUCE_DEFAULT = True
+
+
+# For binary size and compile time, we don't support the same types for with and
+#  without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
+#  TODO: we may want to move this into the C++ so its closer to the actual impl
+def query_marlin_supported_quant_types(
+    has_zp: Optional[bool] = None,
+    include_fp_type: bool = True,
+    device_capability: Optional[int] = None,
+):
+    if device_capability is None:
+        major, minor = get_device_capability()
+        capability = major * 10 + minor
+        device_capability = -1 if capability is None else capability
+
+    if device_capability < 80:
+        return []
+
+    # - has_zp is True: return quant_types that has zero points
+    # - has_zp is False: return quant_types that has not zero points
+    # - has_zp is None: both
+    if has_zp is None:
+        types0 = query_marlin_supported_quant_types(
+            False, include_fp_type, device_capability
+        )
+        types1 = query_marlin_supported_quant_types(
+            True, include_fp_type, device_capability
+        )
+        return types0 + types1
+
+    if has_zp:
+        # AWQ style, unsigned + runtime zero-point
+        return [scalar_types.uint4]
+    else:
+        # GPTQ style, unsigned + symmetric bias
+        res = [scalar_types.uint4b8, scalar_types.uint8b128]
+        if include_fp_type:
+            res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
+        return res
+
+
+def _check_marlin_supported(
+    quant_type: ScalarType,
+    group_size: Optional[int],
+    has_zp: bool,
+    device_capability: Optional[int] = None,
+) -> tuple[bool, Optional[str]]:
+
+    if device_capability is None:
+        major, minor = get_device_capability()
+        capability = major * 10 + minor
+        device_capability = -1 if capability is None else capability
+
+    supported_types = query_marlin_supported_quant_types(
+        has_zp, True, device_capability
+    )
+
+    if quant_type not in supported_types:
+        return (
+            False,
+            f"Marlin does not support weight_bits = {quant_type}. "
+            f"Only types = {supported_types} "
+            f"are supported (for group_size = {group_size}, "
+            f"device_capability = {device_capability}, zp = {has_zp}).",
+        )
+    if group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
+        return (
+            False,
+            f"Marlin does not support group_size = {group_size}. "
+            f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
+            "are supported.",
+        )
+
+    return True, None
+
+
+def check_marlin_supported(
+    quant_type: ScalarType,
+    group_size: int,
+    has_zp: bool = False,
+    device_capability: Optional[int] = None,
+) -> bool:
+    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
+    return cond
+
+
+def verify_marlin_supported(
+    quant_type: ScalarType, group_size: int, has_zp: bool = False
+) -> None:
+    cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
+    if not cond:
+        assert err_msg is not None
+        raise ValueError(err_msg)
+
+
+def verify_marlin_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> None:
+
+    # Validate output_size_per_partition
+    if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
+        raise ValueError(
+            f"Weight output_size_per_partition = "
+            f"{output_size_per_partition} is not divisible by "
+            f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+    # Validate input_size_per_partition
+    if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
+        raise ValueError(
+            f"Weight input_size_per_partition = "
+            f"{input_size_per_partition} is not divisible "
+            f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+    if group_size < input_size and input_size_per_partition % group_size != 0:
+        raise ValueError(
+            f"Weight input_size_per_partition = {input_size_per_partition}"
+            f" is not divisible by group_size = {group_size}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+
+def check_marlin_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> tuple[bool, Optional[str]]:
+    try:
+        verify_marlin_supports_shape(
+            output_size_per_partition, input_size_per_partition, input_size, group_size
+        )
+    except ValueError as e:
+        return False, e.__str__()
+    return True, None
+
+
+def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
+    output_size_per_partition = (
+        getattr(layer, "output_size_per_partition", None) or layer.output_size
+    )
+    input_size_per_partition = (
+        getattr(layer, "input_size_per_partition", None) or layer.input_size
+    )
+
+    return check_marlin_supports_shape(
+        output_size_per_partition=output_size_per_partition,
+        input_size_per_partition=input_size_per_partition,
+        input_size=layer.input_size,
+        group_size=group_size,
+    )[0]
+
+
+def check_moe_marlin_supports_layer(layer: FusedMoE, group_size: int) -> bool:
+    hidden_size = layer.hidden_size
+    intermediate_size_per_partition = layer.intermediate_size_per_partition
+    # apply_router_weight_on_input is not supported for moe marlin
+    supports_router_weight = not layer.moe_runner_config.apply_router_weight_on_input
+    # moe marlin requires the activation to be silu
+    supports_activation = layer.moe_runner_config.activation == "silu"
+
+    # gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
+    # down: (n, k) = (hidden_size, intermediate_size_per_partition)
+    # moe marlin requires n % 128 == 0 and k % 64 == 0
+    supports_shape = (
+        hidden_size % 128 == 0
+        and intermediate_size_per_partition % max(64, group_size) == 0
+    )
+    supports_group_size = group_size in [-1, 32, 64, 128]
+    return (
+        supports_shape
+        and supports_group_size
+        and supports_router_weight
+        and supports_activation
+    )
+
+
+def marlin_make_workspace(
+    device: torch.device, max_blocks_per_sm: int = 1
+) -> torch.Tensor:
+    # In the new marlin kernel, we use the num of threadblocks as workspace
+    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    sms = torch.cuda.get_device_properties(device).multi_processor_count
+    return torch.zeros(
+        sms * max_blocks_per_sm, dtype=torch.int, device=device, requires_grad=False
+    )
+
+
+def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
+    return (not act_order) or (act_order and not is_row_parallel)
+
+
+def marlin_repeat_scales_on_all_ranks(
+    act_order: bool, group_size: int, is_row_parallel: bool
+) -> bool:
+    # Need to repeat scales on every rank if act_ordering or
+    # channelwise and RowParallelLinear
+    is_channelwise = group_size == -1
+    return act_order or (is_channelwise and is_row_parallel)
+
+
+def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
+
+
+def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
+
+
+def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
+    return g_idx[g_idx_sort_indices], g_idx_sort_indices
+
+
+def get_scale_perms():
+    scale_perm: list[int] = []
+    for i in range(8):
+        scale_perm.extend([i + 8 * j for j in range(8)])
+    scale_perm_single: list[int] = []
+    for i in range(4):
+        scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+    return scale_perm, scale_perm_single
+
+
+def marlin_permute_scales(
+    s: torch.Tensor, size_k: int, size_n: int, group_size: int
+) -> torch.Tensor:
+
+    scale_perm, scale_perm_single = get_scale_perms()
+    if group_size < size_k and group_size != -1:
+        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
+    else:
+        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+    s = s.reshape((-1, size_n)).contiguous()
+
+    return s
+
+
+def marlin_permute_bias(s: torch.Tensor) -> torch.Tensor:
+    origin_shape = s.shape
+    _, scale_perm_single = get_scale_perms()
+    s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+    return s.reshape(*origin_shape).contiguous()
+
+
+def marlin_moe_permute_scales(
+    s: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    group_size: int,
+):
+    num_experts = s.shape[0]
+    output = torch.empty(
+        (num_experts, s.shape[1], s.shape[2]),
+        device=s.device,
+        dtype=s.dtype,
+    )
+
+    for e in range(num_experts):
+        output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
+    return output
+
+
+def marlin_zero_points(
+    zp: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
+    # Permute zero-points in a similar way to scales, but do not use the
+    # "single" permutation, since zero-points are applied on every MMA
+    scale_perm, _ = get_scale_perms()
+    zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm]
+
+    # Interleave column dim (for the dequantize code) and pack it to int32
+    if num_bits == 4:
+        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = numpy.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel()
+    zp = zp.reshape((-1, size_n)).contiguous()
+    zp = pack_cols(zp, num_bits, size_k, size_n)
+
+    return zp
+
+
+def awq_to_marlin_zero_points(
+    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
+    # AWQ zero-points are quantized and packed on the column dim.
+    # In addition, the values are permuted based on dequantizer.
+    # Here we undo both of these, and then apply marlin permutation
+    # and pack it back.
+    q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n)
+
+    # Undo interleaving (use argsort(..) to get inverse perm)
+    if num_bits == 4:
+        undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7]))
+    elif num_bits == 8:
+        undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3]))
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel()
+    q_zp = q_zp.reshape((-1, size_n)).contiguous()
+
+    marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits)
+    return marlin_zp
+
+
+def moe_awq_to_marlin_zero_points(
+    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
+):
+    num_experts = q_zp_packed.shape[0]
+    output = torch.empty(
+        (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
+        device=q_zp_packed.device,
+        dtype=q_zp_packed.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n, num_bits)
+    return output
+
+
+def maybe_warn_marlin_atomic_add(device, dtype):
+    if torch.compiler.is_dynamo_compiling():
+        return
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        logger.info_once(
+            "You are running Marlin kernel with bf16 on GPUs before SM90. "
+            "You can consider change to fp16 to achieve better performance "
+            "if possible."
+        )
+
+
+def maybe_warn_marlin_atomic_add_env():
+    if torch.compiler.is_dynamo_compiling():
+        return
+    # TODO(yiyun): Need to add sglang's MARLIN_USE_ATOMIC_ADD: bool = False
+    if True:
+        return
+    # if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+    #     return
+    logger.info_once(
+        "Marlin kernel can achieve better performance for small size_n "
+        "with experimental use_atomic_add feature. "
+        "You can consider set environment variable "
+        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible."
+    )
+
+
+def should_use_atomic_add_reduce(
+    m: int, n: int, k: int, device: torch.device, dtype: torch.dtype
+) -> bool:
+
+    # the performance of atomicAdd is better than global reduce
+    # only when m*n is small and k is large
+    if n >= 2048 or k < 2048 or device.type != "cuda":
+        return False
+
+    # disable atomicAdd reduce by default,
+    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
+    # TODO: Need to add sglang's MARLIN_USE_ATOMIC_ADD: bool = False
+    if not True:
+        maybe_warn_marlin_atomic_add_env()
+        return False
+
+    # sm8x doesn't support atomicAdd + bfloat16 natively
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        maybe_warn_marlin_atomic_add(device, dtype)
+        return False
+
+    return True
+
+
+def apply_gptq_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    wtype: ScalarType,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    is_k_full: bool,
+    bias: Optional[torch.Tensor] = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0),
+        n=output_size_per_partition,
+        k=reshaped_x.size(1),
+        device=input.device,
+        dtype=input.dtype,
+    )
+
+    output = gptq_marlin_gemm(
+        reshaped_x,
+        None,
+        weight,
+        weight_scale,
+        None,
+        weight_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace,
+        wtype,
+        size_m=reshaped_x.shape[0],
+        size_n=output_size_per_partition,
+        size_k=input_size_per_partition,
+        is_k_full=is_k_full,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
+    )
+
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
+
+
+def apply_awq_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    quant_type: ScalarType,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    bias: Optional[torch.Tensor] = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0),
+        n=output_size_per_partition,
+        k=reshaped_x.size(1),
+        device=input.device,
+        dtype=input.dtype,
+    )
+
+    output = gptq_marlin_gemm(
+        reshaped_x,
+        None,
+        weight,
+        weight_scale,
+        None,
+        weight_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace,
+        quant_type,
+        size_m=reshaped_x.shape[0],
+        size_n=output_size_per_partition,
+        size_k=input_size_per_partition,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
+    )
+
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
+
+
+class MarlinConfig(QuantizationConfig):
+    """Config class for Marlin.
+
+    Reference: https://github.com/IST-DASLab/marlin/tree/master
+    """
+
+    def __init__(
+        self,
+        group_size: int,
+        lm_head_quantized: bool,
+    ) -> None:
+        super().__init__()
+
+        # Group size for the quantization.
+        self.group_size = group_size
+        self.lm_head_quantized = lm_head_quantized
+        if self.group_size != 128 and self.group_size != -1:
+            raise ValueError(
+                "Currently, only group size 128 and -1 (channelwise) "
+                "is supported for Marlin, but got group_size of "
+                f"{self.group_size}"
+            )
+
+        # 4 Bits packed into 32 bit datatype.
+        self.pack_factor = 32 // 4
+
+        # Tile size used by marlin kernels.
+        self.tile_size = 16
+
+        # Min out_features dim
+        self.min_n_threads = 64
+
+        # Min in_features dim
+        self.min_k_threads = 128
+
+        # Max parallel problems to solve at once (improves large
+        # batch performance)
+        self.max_parallel = 16
+
+        # Permutation length used by the marlin kernels.
+        self.perm_len = 1024
+
+    def __repr__(self) -> str:
+        return (
+            f"MarlinConfig(group_size={self.group_size}, "
+            f"lm_head_quantized={self.lm_head_quantized})"
+        )
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "MarlinConfig":
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(group_size, lm_head_quantized)
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        # compat: autogptq >=0.8.0 use checkpoint_format: str
+        # compat: autogptq <=0.7.1 is_marlin_format: bool
+        is_marlin_format = hf_quant_cfg.get(
+            "checkpoint_format"
+        ) == "marlin" or hf_quant_cfg.get("is_marlin_format", False)
+
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "gptq" or user_quant == "marlin"
+        )
+
+        if is_marlin_format and is_valid_user_quant:
+            msg = "The model is serialized in {} format. Using {} kernel.".format(
+                cls.get_name(), cls.get_name()
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[MarlinLinearMethod]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
+            return MarlinLinearMethod(self)
+        return None
+
+
+class MarlinLinearMethod(LinearMethodBase):
+    """Linear method for Marlin.
+
+    Args:
+        quant_config: The Marlin quantization config.
+    """
+
+    def __init__(self, quant_config: MarlinConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del output_size  # Unused.
+        weight_loader = extra_weight_attrs["weight_loader"]
+
+        if params_dtype != torch.float16:
+            raise ValueError(
+                f"The params dtype must be float16, but got {params_dtype}"
+            )
+
+        # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.min_n_threads != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by "
+                f"min_n_threads = {self.quant_config.min_n_threads}."
+            )
+        if output_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by "
+                f"pack_factor = {self.quant_config.pack_factor}."
+            )
+
+        # Validate input_size_per_partition
+        if input_size_per_partition % self.quant_config.min_k_threads != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"min_k_threads = {self.quant_config.min_k_threads}."
+            )
+        if (
+            self.quant_config.group_size != -1
+            and input_size_per_partition % self.quant_config.group_size != 0
+        ):
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"group_size = {self.quant_config.group_size}."
+            )
+
+        # Check that we have at least 4 tiles horizontally in the shard
+        num_tiles_per_perm = self.quant_config.perm_len // (
+            self.quant_config.tile_size**2
+        )
+        if output_size_per_partition % num_tiles_per_perm != 0:
+            raise ValueError("Each permutation group must reside on the same gpu")
+
+        # Quantized 4Bit weights packed into Int32.
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.tile_size,
+                output_size_per_partition
+                * self.quant_config.tile_size
+                // self.quant_config.pack_factor,
+                device="cuda",
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            marlin_tile_size=self.quant_config.tile_size,
+            weight_loader=weight_loader,
+        )
+
+        # Determine if channelwise or not
+        input_groups = (
+            1
+            if self.quant_config.group_size == -1
+            else input_size_per_partition // self.quant_config.group_size
+        )
+
+        weight_scale_args = {
+            "data": torch.empty(
+                input_groups,
+                output_size_per_partition,
+                device="cuda",
+                dtype=params_dtype,
+            ),
+            "weight_loader": weight_loader,
+        }
+        if input_groups == 1:
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+        else:
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
+
+        # Allocate workspace (Used for internal locking mechanism)
+        max_workspace_size = (
+            output_size_per_partition // self.quant_config.min_n_threads
+        ) * self.quant_config.max_parallel
+
+        workspace = BasevLLMParameter(
+            data=torch.zeros(max_workspace_size, device="cuda", dtype=torch.int),
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("B", qweight)
+        layer.register_parameter("s", scales)
+        layer.register_parameter("workspace", workspace)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # required by torch.compile
+        layer.B = torch.nn.Parameter(layer.B.data, requires_grad=False)
+        layer.s = torch.nn.Parameter(layer.s.data, requires_grad=False)
+        layer.workspace = torch.nn.Parameter(layer.workspace.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        qweight = layer.B
+        scales = layer.s
+        workspace = layer.workspace
+
+        x_2d = x.view(-1, x.shape[-1])
+
+        size_m = x_2d.shape[0]
+        size_k = x_2d.shape[1]
+        size_n = scales.shape[1]
+
+        output_2d = ops.marlin_gemm(
+            x_2d, qweight, scales, workspace, size_m, size_n, size_k
+        )
+
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],))
+
+        if bias is not None:
+            output.add_(bias)  # In-place add
+
+        return output
diff --git a/python/sglang/srt/layers/quantization/marlin_utils_fp8.py b/python/sglang/srt/layers/quantization/marlin_utils_fp8.py
new file mode 100644
index 000000000..94326d71e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/marlin_utils_fp8.py
@@ -0,0 +1,352 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Optional
+
+import torch
+
+from sglang.srt.layers.quantization.marlin_utils import (
+    USE_FP32_REDUCE_DEFAULT,
+    marlin_make_workspace,
+    marlin_permute_bias,
+    marlin_permute_scales,
+    should_use_atomic_add_reduce,
+)
+from sglang.srt.layers.quantization.utils import get_scalar_types
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import gptq_marlin_gemm, gptq_marlin_repack
+
+ScalarType, scalar_types = get_scalar_types()
+
+logger = logging.getLogger(__name__)
+
+
+def fp8_fused_exponent_bias_into_scales(scales):
+    fp8_exponent = 4
+    if scales.dtype == torch.half:
+        target_exponent = 5
+    elif scales.dtype == torch.bfloat16:
+        target_exponent = 8
+    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
+    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
+    exponent_bias = 2 ** (target_exponent - 1) - 2 ** (fp8_exponent - 1)
+    s = torch.ones_like(scales) * 2
+    s = s**exponent_bias
+    return scales * s
+
+
+def apply_fp8_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    workspace: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: Optional[torch.Tensor],
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
+    # For GPUs that lack FP8 hardware support, we can leverage the
+    # Marlin kernel for fast weight-only FP8 quantization
+
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (size_n,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0), n=size_n, k=size_k, device=input.device, dtype=input.dtype
+    )
+
+    output = gptq_marlin_gemm(
+        a=reshaped_x,
+        c=None,
+        b_q_weight=weight,
+        b_bias=bias,
+        b_scales=weight_scale,
+        global_scale=None,
+        b_zeros=None,
+        g_idx=None,
+        perm=None,
+        workspace=workspace,
+        b_q_type=scalar_types.float8_e4m3fn,
+        size_m=reshaped_x.size(0),
+        size_n=size_n,
+        size_k=size_k,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+    )
+
+    return output.reshape(out_shape)
+
+
+def prepare_fp8_layer_for_marlin(
+    layer: torch.nn.Module, size_k_first: bool = True
+) -> None:
+    logger.warning_once(
+        "Your GPU does not have native support for FP8 computation but "
+        "FP8 quantization is being used. Weight-only FP8 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads."
+    )
+
+    part_size_n = layer.output_size_per_partition
+    part_size_k = layer.input_size_per_partition
+    weight_block_size = getattr(layer, "weight_block_size", None)
+
+    if size_k_first:
+        assert layer.weight.shape == (part_size_k, part_size_n)
+    else:
+        assert layer.weight.shape == (part_size_n, part_size_k)
+
+    device = layer.weight.device
+
+    # WORKSPACE
+    layer.workspace = marlin_make_workspace(device)
+
+    # WEIGHT
+    # Repack weights to marlin format
+    perm = torch.empty(0, dtype=torch.int, device=device)
+    qweight = pack_fp8_to_int32(layer.weight, size_k_first)
+    if not size_k_first:
+        qweight = qweight.T.contiguous()
+
+    marlin_qweight = gptq_marlin_repack(
+        b_q_weight=qweight,
+        perm=perm,
+        size_k=part_size_k,
+        size_n=part_size_n,
+        num_bits=8,
+    )
+    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
+
+    # WEIGHT SCALES
+    # Permute scales
+    if "weight_scale" in dir(layer):
+        scales = layer.weight_scale.to(layer.orig_dtype)
+    elif "weight_scale_inv" in dir(layer):
+        scales = layer.weight_scale_inv.to(layer.orig_dtype)
+        del layer.weight_scale_inv
+
+    group_size = -1 if weight_block_size is None else weight_block_size[1]
+
+    # marlin kernel only support channel-wise and group-wise quantization
+    # we need to convert the scales
+    if weight_block_size is None:
+        if scales.nelement() == 1:
+            # tensor-wise quantization -> channel-wise quantization
+            # (1, 1) =>(repeat)=> (1, size_n)
+            scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
+        elif scales.nelement() > 1 and scales.nelement() != part_size_n:
+            assert part_size_n % scales.nelement() == 0
+            s_size = scales.nelement()
+            # tensor-wise quantization (for gate-up proj)
+            #     -> channel-wise quantization
+            # (1, s_size) =>(repeat)=> (1, size_n)
+            scales = scales.view(1, s_size)
+            scales = scales.repeat_interleave(part_size_n // s_size, 1)
+        else:
+            # channel-wise quantization
+            # (1, size_n)
+            scales = scales.view(1, part_size_n)
+    else:
+        # block-wise quantization -> group-wise quantization
+        # (size_k // block_size[1], ceil(size_n / block_size[0]))
+        #  =>(repeat)=> (size_k // block_size[1], size_n)
+        if not size_k_first:
+            scales = scales.T.contiguous()
+        block_n = weight_block_size[0]
+        scales = scales.repeat_interleave(block_n, 1)
+        # size_n may not divisible by block_size[0]
+        scales = scales[:, :part_size_n]
+
+    marlin_scales = marlin_permute_scales(
+        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=group_size
+    )
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
+
+    if hasattr(layer, "bias") and layer.bias is not None:
+        assert layer.bias.shape == (part_size_n,)
+        bias = marlin_permute_bias(layer.bias)
+        layer.bias = torch.nn.Parameter(bias, requires_grad=False)
+
+
+def prepare_moe_fp8_layer_for_marlin(
+    layer: torch.nn.Module, size_k_first: bool = True
+) -> None:
+    logger.warning_once(
+        "Your GPU does not have native support for FP8 computation but "
+        "FP8 quantization is being used. Weight-only FP8 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads."
+    )
+
+    e = layer.num_experts
+    k = layer.hidden_size
+    n = layer.intermediate_size_per_partition
+    weight_block_size = getattr(layer, "weight_block_size", None)
+
+    # WORKSPACE
+    device = layer.w13_weight.device
+    layer.workspace = marlin_make_workspace(device, 4)
+    perm = torch.empty(0, dtype=torch.int, device=device)
+
+    # WEIGHT
+    # Repack weights to marlin format
+    for name in ["w13_weight", "w2_weight"]:
+        weight = getattr(layer, name)
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        if size_k_first:
+            assert weight.shape == (e, size_k, size_n)
+        else:
+            assert weight.shape == (e, size_n, size_k)
+
+        for i in range(e):
+            qweight = pack_fp8_to_int32(weight[i], size_k_first)
+            if not size_k_first:
+                qweight = qweight.T.contiguous()
+
+            marlin_qweight = gptq_marlin_repack(
+                b_q_weight=qweight, perm=perm, size_k=size_k, size_n=size_n, num_bits=8
+            )
+            tensor_list.append(marlin_qweight)
+
+        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        weight = torch.nn.Parameter(weight, requires_grad=False)
+
+        setattr(layer, name, weight)
+
+    # WEIGHT SCALES
+    # Permute scales
+    group_size = -1 if weight_block_size is None else weight_block_size[1]
+
+    for name in ["w13", "w2"]:
+        if name + "_weight_scale" in dir(layer):
+            new_name = name + "_weight_scale"
+            scales = getattr(layer, new_name).to(layer.orig_dtype)
+            delattr(layer, new_name)
+        elif name + "_weight_scale_inv" in dir(layer):
+            new_name = name + "_weight_scale_inv"
+            scales = getattr(layer, new_name).to(layer.orig_dtype)
+            delattr(layer, new_name)
+
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        # marlin kernel only support channel-wise and group-wise quantization
+        # we need to convert the scales
+        if weight_block_size is None:
+            if scales.nelement() == e:
+                # tensor-wise quantization -> channel-wise quantization
+                # (e, 1, 1) =>(repeat)=> (e, 1, size_n)
+                scales = scales.view(e, 1, 1).repeat_interleave(size_n, 2)
+            elif scales.nelement() > e and scales.nelement() != e * size_n:
+                assert (e * size_n) % scales.nelement() == 0
+                s_size = scales.nelement() // e
+                # tensor-wise quantization (for gate-up proj)
+                #     -> channel-wise quantization
+                # (e, 1, s_size) =>(repeat)=> (e, 1, size_n)
+                scales = scales.view(e, 1, s_size)
+                scales = scales.repeat_interleave(size_n // s_size, 2)
+            else:
+                # channel-wise quantization
+                # (e, 1, size_n)
+                scales = scales.view(e, 1, size_n)
+        else:
+            # block-wise quantization -> group-wise quantization
+            # (e, size_k // block_size[1], ceil(size_n / block_size[0]))
+            #  =>(repeat)=> (e, size_k // block_size[1], size_n)
+            if not size_k_first:
+                scales = scales.permute(0, 2, 1)
+            block_n = weight_block_size[0]
+            scales = scales.repeat_interleave(block_n, 2)
+            # size_n may not divisible by block_size[0]
+            scales = scales[..., :size_n].contiguous()
+
+        for i in range(e):
+            marlin_scales = marlin_permute_scales(
+                s=scales[i], size_k=size_k, size_n=size_n, group_size=group_size
+            )
+            tensor_list.append(marlin_scales)
+
+        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        scales = fp8_fused_exponent_bias_into_scales(scales)
+        scales = torch.nn.Parameter(scales, requires_grad=False)
+
+        setattr(layer, name + "_weight_scale", scales)
+
+    # BIAS
+    # Permute bias
+    for name in ["w13_bias", "w2_bias"]:
+        if not hasattr(layer, name):
+            continue
+        bias = getattr(layer, name).to(layer.orig_dtype)
+
+        tensor_list = []
+        for i in range(e):
+            expert_bias = bias[i]
+
+            tensor_list.append(marlin_permute_bias(expert_bias))
+
+        bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        bias = torch.nn.Parameter(bias, requires_grad=False)
+        setattr(layer, name, bias)
+
+
+def pack_fp8_to_int32(
+    fp8_tensor: torch.Tensor, size_k_first: bool = True
+) -> torch.Tensor:
+    """
+    Repack FP8 weights to gptq format (packed int32 elements)
+    """
+    assert fp8_tensor.dtype == torch.float8_e4m3fn
+    assert fp8_tensor.ndim == 2
+
+    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
+    fp8_tensor = fp8_tensor.contiguous()
+    # fp8_tensor is contiguous and have shape (N, K) now
+    # with `.view(torch.int32)`, it become (N, K // 4)
+    int32_tensor = fp8_tensor.view(torch.int32)
+    return int32_tensor.T.contiguous() if size_k_first else int32_tensor
+
+
+def marlin_quant_fp8_torch(weight, group_size):
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    if group_size != -1:
+        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(group_size, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+    else:
+        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(size_k, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+
+    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
+    marlin_qweight = gptq_marlin_repack(
+        b_q_weight=packed_weight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )
+
+    marlin_scales = marlin_permute_scales(
+        s=scales.T, size_k=size_k, size_n=size_n, group_size=group_size
+    )
+
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales
diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
new file mode 100755
index 000000000..89ecd44f5
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -0,0 +1,1402 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.layers.dp_attention import get_dp_global_num_tokens, get_local_dp_buffer
+from sglang.srt.layers.moe import (
+    MoeRunner,
+    MoeRunnerBackend,
+    MoeRunnerConfig,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+    should_use_flashinfer_trtllm_moe,
+)
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    cutlass_fp8_supported,
+    is_sm100_supported,
+)
+from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import (
+    convert_to_channelwise,
+    is_layer_skipped,
+    per_tensor_dequantize,
+    requantize_with_max_scale,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.utils import is_cuda, next_power_of_2
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+if is_cuda():
+    from sgl_kernel import scaled_fp4_quant
+
+try:
+    from flashinfer import mm_fp4 as fp4_gemm
+    from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a
+
+    enable_flashinfer_fp4_gemm = True
+except ImportError:
+    if is_cuda():
+        from sgl_kernel import cutlass_scaled_fp4_mm as fp4_gemm
+    else:
+        fp4_gemm = None
+    enable_flashinfer_fp4_gemm = False
+    reorder_rows_for_gated_act_gemm = None
+    shuffle_matrix_a = None
+    shuffle_matrix_sf_a = None
+
+try:
+    from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
+except ImportError:
+    flashinfer_cutlass_fused_moe = None
+
+# Initialize logger for the module
+logger = logging.getLogger(__name__)
+
+# Supported activation schemes for the current configuration
+ACTIVATION_SCHEMES = ["static"]
+
+
+class ModelOptFp8Config(QuantizationConfig):
+    """Configuration for ModelOpt FP8 quantization, including serialization and compatibility checks."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = False,
+        kv_cache_quant_method: Optional[str] = None,
+        exclude_modules: Optional[List[str]] = None,
+    ) -> None:
+        """
+        Args:
+            is_checkpoint_fp8_serialized (bool): Indicates if the checkpoint uses serialized FP8 format.
+        """
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        self.kv_cache_quant_method = kv_cache_quant_method
+        self.exclude_modules = exclude_modules
+        if is_checkpoint_fp8_serialized:
+            logger.warning(
+                "Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change."
+            )
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "modelopt"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89  # Minimum hardware capability (e.g., Hopper GPUs).
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["hf_quant_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> ModelOptFp8Config:
+        # Handle two different config formats:
+        # 1. hf_quant_config.json format: {"quantization": {"quant_algo": "FP8", ...}}
+        # 2. config.json quantization_config format: {"quant_algo": "FP8", ...}
+        # In future modelopt will deprecate hf_quant_config.json, and only keep config.json.
+        # For legacy reasons, we keep hf_quant_config.json for now.
+
+        # Initialize variables
+        kv_cache_quant_method = None
+        exclude_modules = None
+
+        # Try flat format first (config.json quantization_config - preferred format)
+        quant_method = config.get("quant_algo")
+        if quant_method is not None:
+            # Flat format (config.json quantization_config)
+            # For kv_cache, check if kv_cache_scheme exists and extract algo
+            kv_cache_scheme = config.get("kv_cache_scheme")
+            if (
+                kv_cache_scheme
+                and kv_cache_scheme.get("type") == "float"
+                and kv_cache_scheme.get("num_bits") == 8
+            ):
+                kv_cache_quant_method = "FP8"
+
+            # Map 'ignore' field to 'exclude_modules'
+            exclude_modules = config.get("ignore")
+        else:
+            # Fall back to nested format (hf_quant_config.json - legacy format)
+            try:
+                quantization_section = cls.get_from_keys(config, ["quantization"])
+                quant_method = quantization_section.get("quant_algo")
+                kv_cache_quant_method = quantization_section.get("kv_cache_quant_algo")
+                exclude_modules = quantization_section.get("exclude_modules")
+            except ValueError:
+                raise ValueError(
+                    "Cannot find 'quant_algo' in the model's quantization config. "
+                    "Expected either flat format (config.json) or nested format (hf_quant_config.json)."
+                )
+        if quant_method is None:
+            raise ValueError(
+                "Cannot find 'quant_algo' in the model's quantization config. "
+            )
+        if "FP8" not in quant_method:
+            raise ValueError(
+                "ModelOptFp8Config only supports static FP8 quantization in SGLang. "
+                "For FP4 quantization, use ModelOptFp4Config. "
+                "Check the quantization config for your model's configuration."
+            )
+
+        return cls(
+            is_checkpoint_fp8_serialized=True,
+            kv_cache_quant_method=kv_cache_quant_method,
+            exclude_modules=exclude_modules,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if self.exclude_modules and any(
+            module in prefix
+            or (
+                prefix.startswith("language_model.")
+                and module in prefix.removeprefix("language_model.")
+            )
+            for module in self.exclude_modules
+        ):
+            return None
+
+        if isinstance(layer, LinearBase):
+            return ModelOptFp8LinearMethod(self)
+        if self.kv_cache_quant_method and isinstance(layer, RadixAttention):
+            return ModelOptFp8KVCacheMethod(self)
+
+        if isinstance(layer, FusedMoE):
+            return ModelOptFp8MoEMethod(self)
+
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class ModelOptFp8LinearMethod(LinearMethodBase):
+    """Linear method for ModelOpt static FP8 quantization.
+
+    Supports loading FP8 checkpoints with static weight and activation scales.
+    Future support may include dynamic scales.
+
+    **Limitations**:
+    1. Only supports per-tensor quantization due to `torch._scaled_mm` limitations.
+    2. Only supports the `float8_e4m3fn` data type.
+
+    Args:
+        quant_config (ModelOptFp8Config): The ModelOpt quantization configuration.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        super().__init__()
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        """Creates and registers weights, weight scales, and input scales for FP8 quantization."""
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+
+        # Set layer attributes
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        # Register weight
+        layer.register_parameter(
+            "weight",
+            ModelWeightParameter(
+                data=torch.empty(
+                    output_size_per_partition,
+                    input_size_per_partition,
+                    dtype=weight_dtype,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            ),
+        )
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # Register weight and input scales
+            for scale_name in ["weight_scale", "input_scale"]:
+                layer.register_parameter(
+                    scale_name,
+                    PerTensorScaleParameter(
+                        data=torch.full(
+                            (len(output_partition_sizes),),
+                            torch.finfo(torch.float32).min,
+                            dtype=torch.float32,
+                        ),
+                        weight_loader=weight_loader,
+                    ),
+                )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Requantizes weights after loading using the maximum scale."""
+        max_w_scale, quantized_weight = requantize_with_max_scale(
+            layer.weight, layer.weight_scale, layer.logical_widths
+        )
+        layer.weight = Parameter(quantized_weight.t(), requires_grad=False)
+        # cutlass sgl-kernel only supports per-channel scale
+        if self.cutlass_fp8_supported:
+            max_w_scale = convert_to_channelwise(max_w_scale, layer.logical_widths)
+        layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+        layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Applies FP8 linear transformation."""
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+        )
+
+
+class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
+    """
+    Handles loading FP8 kv-cache scaling factors from modelopt quantized checkpoints.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        super().__init__(quant_config)
+
+
+class ModelOptFp8MoEMethod(FusedMoEMethodBase):
+    """MoE method for ModelOpt FP8.
+    Supports loading FP8 checkpoints with static weight scale and activation scale.
+
+    Args:
+        quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=weight_dtype,
+            ),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=weight_dtype,
+            ),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALES - Per-tensor scaling for ModelOpts
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = PerTensorScaleParameter(
+                data=torch.full(
+                    (num_experts, 2),
+                    torch.finfo(torch.float32).min,
+                    dtype=torch.float32,
+                ),
+                weight_loader=weight_loader,
+            )
+            w2_weight_scale = PerTensorScaleParameter(
+                data=torch.full(
+                    (num_experts,), torch.finfo(torch.float32).min, dtype=torch.float32
+                ),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+            # Set weight loader attributes for scales
+            extra_weight_attrs.update(
+                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+            )
+
+            # INPUT SCALES - Per-tensor scaling for ModelOpt
+            w13_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            w2_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Process FP8 MoE weights after loading from serialized checkpoint.
+
+        Only supports pre-quantized checkpoints with FP8 weights and scales.
+        """
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        # Handle scale parameters
+        if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max of the w1 and w3 scales then dequant and requant each expert.
+            if layer.w13_weight_scale.dim() == 2:  # Shape: (num_experts, 2)
+                from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+
+                # Get the maximum scale across w1 and w3 for each expert
+                max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+
+                # Requantize each expert's weights using the combined scale
+                # w13_weight has shape (num_experts, 2 * intermediate_size_per_partition, hidden_size)
+                # where the first intermediate_size_per_partition rows are w1, the next are w3
+                intermediate_size_per_partition = layer.w13_weight.shape[1] // 2
+                for expert_id in range(layer.w13_weight.shape[0]):
+                    start = 0
+                    for shard_id in range(2):  # w1 and w3
+                        # Dequantize using the original scale for this shard
+                        dq_weight = per_tensor_dequantize(
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size_per_partition, :
+                            ],
+                            layer.w13_weight_scale[expert_id][shard_id],
+                        )
+                        # Requantize using the combined max scale
+                        (
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size_per_partition, :
+                            ],
+                            _,
+                        ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+
+                        start += intermediate_size_per_partition
+
+                # Update the scale parameter to be per-expert instead of per-shard
+                layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False)
+            else:
+                layer.w13_weight_scale = Parameter(
+                    layer.w13_weight_scale.data, requires_grad=False
+                )
+
+        if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None:
+            layer.w2_weight_scale = Parameter(
+                layer.w2_weight_scale.data, requires_grad=False
+            )
+        if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None:
+            layer.w13_input_scale = Parameter(
+                layer.w13_input_scale.max(), requires_grad=False
+            )
+        if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None:
+            layer.w2_input_scale = Parameter(
+                layer.w2_input_scale.max(), requires_grad=False
+            )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_fp8_w8a8=True,
+            per_channel_quant=False,
+            w13_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+        )
+
+        return self.runner.run(dispatch_output, quant_info)
+
+
+class ModelOptFp4Config(QuantizationConfig):
+    """Config class for FP4."""
+
+    def __init__(
+        self,
+        is_checkpoint_nvfp4_serialized: bool = False,
+        kv_cache_quant_algo: str = None,
+        group_size: int = None,
+        exclude_modules: List[str] = None,
+    ) -> None:
+        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
+        if is_checkpoint_nvfp4_serialized:
+            logger.warning(
+                "Detected nvfp4 checkpoint. Please note that the "
+                "format is experimental and subject to change."
+            )
+        self.group_size = group_size
+        self.kv_cache_quant_algo = kv_cache_quant_algo
+        self.exclude_modules = exclude_modules
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "modelopt_fp4"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half, torch.float8_e4m3fn]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 100
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["hf_quant_config.json"]
+
+    @staticmethod
+    def common_group_size(cfg: dict) -> int:
+        """Return the unique group_size across the config; raise if missing/mismatched."""
+        sizes = set()
+
+        # Top-level and 'quantization' block
+        v = cfg.get("group_size")
+        if isinstance(v, int):
+            sizes.add(v)
+        q = cfg.get("quantization")
+        if isinstance(q, dict):
+            v = q.get("group_size")
+            if isinstance(v, int):
+                sizes.add(v)
+
+        # config_groups: accept group-level or nested dicts (e.g., weights/input_activations)
+        for g in (cfg.get("config_groups") or {}).values():
+            if isinstance(g, dict):
+                v = g.get("group_size")
+                if isinstance(v, int):
+                    sizes.add(v)
+                for sub in g.values():
+                    if isinstance(sub, dict):
+                        v = sub.get("group_size")
+                        if isinstance(v, int):
+                            sizes.add(v)
+
+        if not sizes:
+            raise ValueError("No group_size found in config.")
+        if len(sizes) > 1:
+            raise ValueError(f"Inconsistent group_size values: {sorted(sizes)}")
+        return next(iter(sizes))
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> ModelOptFp4Config:
+        # Handle two different config formats:
+        # 1. hf_quant_config.json format: {"quantization": {"quant_algo": "NVFP4", ...}}
+        # 2. config.json quantization_config format: {"quant_algo": "NVFP4", ...}
+        # In future modelopt will deprecate hf_quant_config.json, and only keep config.json.
+        # For legacy reasons, we keep hf_quant_config.json for now.
+
+        # Initialize variables
+        kv_cache_quant_algo = None
+        group_size = None
+        exclude_modules = []
+
+        # Try flat format first (config.json quantization_config - preferred format)
+        quant_method = config.get("quant_algo")
+        if quant_method is not None:
+            # Flat format (config.json quantization_config)
+            # Note: FP4 models in config.json format may not have all the detailed fields
+            # that are present in hf_quant_config.json, so we need to handle defaults
+            kv_cache_quant_algo = config.get("kv_cache_quant_algo")
+            if not kv_cache_quant_algo:
+                # For config.json format, derive from kv_cache_scheme if available
+                kv_cache_scheme = config.get("kv_cache_scheme")
+                if (
+                    kv_cache_scheme
+                    and kv_cache_scheme.get("type") == "float"
+                    and kv_cache_scheme.get("num_bits") == 8
+                ):
+                    kv_cache_quant_algo = "FP8"
+                else:
+                    kv_cache_quant_algo = "auto"
+
+            group_size = ModelOptFp4Config.common_group_size(config)
+            exclude_modules = config.get("ignore", [])
+        else:
+            # Fall back to nested format (hf_quant_config.json - legacy format)
+            try:
+                quant_config = cls.get_from_keys(config, ["quantization"])
+                quant_method = quant_config["quant_algo"]
+                kv_cache_quant_algo = quant_config.get("kv_cache_quant_algo")
+                if not kv_cache_quant_algo:
+                    kv_cache_quant_algo = "auto"
+                group_size = ModelOptFp4Config.common_group_size(config)
+                exclude_modules = quant_config.get("exclude_modules", [])
+            except (ValueError, KeyError):
+                raise ValueError(
+                    "Cannot find 'quant_algo' in the model's quantization config. "
+                    "Expected either flat format (config.json) or nested format (hf_quant_config.json)."
+                )
+
+        if not quant_method in ["FP8", "NVFP4"]:
+            raise ValueError(
+                f"ModelOpt currently only supports: FP8, NVFP4"
+                " quantizations in sglang. Please check the "
+                "quantization config for your model's configuration."
+            )
+        is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
+
+        if not (group_size and kv_cache_quant_algo) or exclude_modules is None:
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
+            raise ValueError(
+                "NVFP4 quantization requires group size and "
+                "kv_cache_quant_algo specified in the quantization config"
+            )
+        return cls(
+            is_checkpoint_nvfp4_serialized,
+            kv_cache_quant_algo,
+            group_size,
+            exclude_modules,
+        )
+
+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        import regex as re
+
+        fused_patterns = ["q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj"]
+        prefix_split = prefix.split(".")
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            pattern_split = pattern.split(".")
+            if re.fullmatch(regex_str, prefix):
+                return True
+            elif (
+                pattern_split[-1] in fused_patterns
+                and pattern_split[-1] in prefix_split[-1]
+            ):
+                # Check if the last part of the excluded pattern is contained in the last part of the prefix
+                # This handles fused modules like fused_qkv_a_proj_with_mqa that contain q_a_proj and kv_a_proj_with_mqa
+                # e.g., model.layers.{i}.self_attn.{fused_weight_name}
+                assert len(prefix_split) == 5 and len(pattern_split) == 5
+                return True
+        return False
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.moe.fused_moe_triton.layer import FlashInferFP4MoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
+            return ModelOptFp4LinearMethod(self)
+        if self.kv_cache_quant_algo and isinstance(layer, RadixAttention):
+            return ModelOptFp8KVCacheMethod(self)
+        elif isinstance(layer, FlashInferFP4MoE):
+            # FlashInferFP4MoE needs the same quantization method but with compatible attribute handling
+            return ModelOptNvFp4FusedMoEMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return ModelOptNvFp4FusedMoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class ModelOptFp4LinearMethod(LinearMethodBase):
+    """Linear method for NVFP4.
+    Supports loading NVFP4 checkpoints with the following structure:
+
+    |Tensor Name           | datatype      |  shape      |
+    |----------------------------------------------------|
+    |input_scale           | torch.float32 | scalar      |
+    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
+    |weight_scale          | FP8-E4M3      | [X, Y]      |
+    |weight_scale_2        | torch.float32 | scalar      |
+
+    The weights are quantized per block of 16 elements.
+    Args: quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        if input_size_per_partition % 16 != 0:
+            raise ValueError(
+                "Unsupported model when in features size is " "not multiple of 16"
+            )
+
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_nvfp4_serialized
+            else params_dtype
+        )
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                # 2 fp4 data is packed in one uint8 in the input dimension
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        input_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("input_scale", input_scale)
+
+        weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale_2", weight_scale_2)
+
+        weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.group_size,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        input_scale_2 = layer.input_scale.max().to(torch.float32)
+        weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
+        layer.input_scale = Parameter(input_scale_2, requires_grad=False)
+        layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False)
+        layer.alpha = Parameter(
+            layer.input_scale * layer.weight_scale_2, requires_grad=False
+        )
+        layer.input_scale_inv = Parameter(
+            (1 / input_scale_2).to(torch.float32), requires_grad=False
+        )
+
+        # Pad and blockwise interleave weight_scale
+        scales = layer.weight_scale
+        scale_ndim = scales.ndim
+        if scale_ndim == 2:
+            scales = scales.unsqueeze(0)
+        assert scales.ndim == 3
+        B, M, K = scales.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scales = torch.zeros((B, M_padded, K_padded), dtype=scales.dtype)
+        padded_scales[:B, :M, :K] = scales
+        batches, rows, cols = padded_scales.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scales = padded_scales.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        padded_scales = padded_scales.permute((0, 1, 4, 3, 2, 5))
+        padded_scales = padded_scales.contiguous().cuda()
+        padded_scales = (
+            padded_scales.reshape(M_padded, K_padded)
+            if scale_ndim == 2
+            else padded_scales.reshape(B, M_padded, K_padded)
+        )
+        layer.weight_scale_interleaved = Parameter(padded_scales, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        output_dtype = x.dtype
+        x_m, _ = x.shape
+        w_n, _ = layer.weight.shape
+        output_shape = [x_m, w_n]
+
+        # Quantize BF16 or FP16 to (FP4 and interleaved block scale)
+        x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv)
+
+        assert x_fp4.dtype == torch.uint8
+        assert x_scale_interleaved.dtype == torch.float8_e4m3fn
+        assert layer.weight.dtype == torch.uint8
+        assert layer.weight_scale_interleaved.dtype == torch.float8_e4m3fn
+        assert layer.alpha.dtype == torch.float32
+
+        w = layer.weight
+        w_scale_interleaved = layer.weight_scale_interleaved
+        if enable_flashinfer_fp4_gemm:
+            w = layer.weight.T
+            w_scale_interleaved = layer.weight_scale_interleaved.T
+        out = fp4_gemm(
+            x_fp4,
+            w,
+            x_scale_interleaved,
+            w_scale_interleaved,
+            layer.alpha,
+            output_dtype,
+        )
+        if bias is not None:
+            out = out + bias
+        return out.view(*output_shape)
+
+
+class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
+    """
+       MoE Method for FP4 Quantization with Blockscales and PerTensorScales
+    Args:
+        quant_config: NVFP4 Quant Config
+    """
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+        if not is_sm100_supported():
+            raise ValueError(
+                "Current platform does not support NVFP4"
+                " quantization. Please use Blackwell and"
+                " above."
+            )
+        self.enable_flashinfer_trtllm_moe = should_use_flashinfer_trtllm_moe()
+        self._cache_permute_indices = {}
+
+    @property
+    def enable_flashinfer_cutlass_moe(self) -> bool:
+        from sglang.srt.layers.moe import get_moe_runner_backend
+
+        """Access the global enable_flashinfer_cutlass_moe setting."""
+        return get_moe_runner_backend().is_flashinfer_cutlass()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        # TODO(ch-wan): check if this is needed
+        layer.intermediate_size_per_partition = intermediate_size_per_partition
+        layer.params_dtype = params_dtype
+        layer.quant_config = self.quant_config
+
+        weight_dtype = torch.uint8
+        weight_scale_dtype = torch.float8_e4m3fn
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        # GEMM 1
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        # GEMM 2
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        w13_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        # Only use `swizzle_blockscale` for shapes, not for real content
+        layer.w13_blockscale_swizzled = Parameter(
+            self.swizzle_blockscale(layer.w13_weight_scale), requires_grad=False
+        )
+
+        w2_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        layer.w2_blockscale_swizzled = Parameter(
+            self.swizzle_blockscale(layer.w2_weight_scale), requires_grad=False
+        )
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        w13_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(layer.num_local_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+
+        w2_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(layer.num_local_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(layer.num_local_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(layer.num_local_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def swizzle_blockscale(self, scale: torch.Tensor):
+        assert scale.dtype == torch.float8_e4m3fn
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (
+            swizzled_scale.reshape(M_padded, K_padded)
+            if scale_ndim == 2
+            else swizzled_scale.reshape(B, M_padded, K_padded)
+        )
+
+    def prepare_static_weights_for_kernel(
+        self,
+        # args_dequant,
+        # args,
+        gemm1_weights,
+        gemm2_weights,
+        gemm1_scales_linear_fp4_bytes,
+        gemm2_scales_linear_fp4_bytes,
+        hidden_size,
+        intermediate_size,
+        num_experts,
+    ):
+        from flashinfer import (
+            RoutingMethodType,
+            e2m1_and_ufp8sf_scale_to_float,
+            fp4_quantize,
+            next_positive_power_of_2,
+            nvfp4_block_scale_interleave,
+            reorder_rows_for_gated_act_gemm,
+            shuffle_matrix_a,
+            shuffle_matrix_sf_a,
+        )
+        from flashinfer.fused_moe.core import (
+            _maybe_get_cached_w2_permute_indices,
+            _maybe_get_cached_w3_w1_permute_indices,
+        )
+
+        """Prepare quantized weights for kernel (done offline with weights)."""
+        epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
+
+        # Convert quantized weights to proper formats
+        gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape(
+            num_experts, 2 * intermediate_size, hidden_size // 2
+        )  # packed fp4
+        gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view(
+            torch.float8_e4m3fn
+        ).reshape(
+            num_experts, 2 * intermediate_size, hidden_size // 16
+        )  # fp8 scaling factors
+
+        gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape(
+            num_experts, hidden_size, intermediate_size // 2
+        )  # packed fp4
+        gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view(
+            torch.float8_e4m3fn
+        ).reshape(
+            num_experts, hidden_size, intermediate_size // 16
+        )  # fp8 scaling factors
+
+        gemm1_weights_fp4_shuffled = []
+        gemm1_scales_fp4_shuffled = []
+        gemm2_weights_fp4_shuffled = []
+        gemm2_scales_fp4_shuffled = []
+        for i in range(num_experts):
+            # Calculate the permute indices for the following:
+            # 1. Reorder rows of W1 and scales for fused gated activation
+            # 2. Shuffle weights and scaling factors for transposed mma output
+            # for both w3_w1 and w2 weights and scale factors
+            permute_indices = _maybe_get_cached_w3_w1_permute_indices(
+                self._cache_permute_indices,
+                gemm1_weights_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            gemm1_weights_fp4_shuffled.append(
+                gemm1_weights_fp4[i]
+                .view(torch.uint8)[permute_indices.to(gemm1_weights_fp4.device)]
+                .contiguous()
+            )
+
+            permute_sf_indices = _maybe_get_cached_w3_w1_permute_indices(
+                self._cache_permute_indices,
+                gemm1_scales_linear_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            gemm1_scales_fp4_shuffled.append(
+                nvfp4_block_scale_interleave(
+                    gemm1_scales_linear_fp4[i]
+                    .view(torch.uint8)[
+                        permute_sf_indices.to(gemm1_scales_linear_fp4.device)
+                    ]
+                    .contiguous()
+                )
+            )
+
+            permute_indices = _maybe_get_cached_w2_permute_indices(
+                self._cache_permute_indices,
+                gemm2_weights_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            gemm2_weights_fp4_shuffled.append(
+                gemm2_weights_fp4[i]
+                .view(torch.uint8)[permute_indices.to(gemm2_weights_fp4.device)]
+                .contiguous()
+            )
+
+            permute_sf_indices = _maybe_get_cached_w2_permute_indices(
+                self._cache_permute_indices,
+                gemm2_scales_linear_fp4[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            gemm2_scales_fp4_shuffled.append(
+                nvfp4_block_scale_interleave(
+                    gemm2_scales_linear_fp4[i]
+                    .view(torch.uint8)[
+                        permute_sf_indices.to(gemm2_scales_linear_fp4.device)
+                    ]
+                    .contiguous()
+                )
+            )
+
+        # Stack weights for all experts
+        gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled)
+        gemm1_scales_fp4_shuffled = (
+            torch.stack(gemm1_scales_fp4_shuffled)
+            .view(torch.float8_e4m3fn)
+            .reshape(num_experts, 2 * intermediate_size, hidden_size // 16)
+        )
+
+        gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled)
+        gemm2_scales_fp4_shuffled = (
+            torch.stack(gemm2_scales_fp4_shuffled)
+            .view(torch.float8_e4m3fn)
+            .reshape(num_experts, hidden_size, intermediate_size // 16)
+        )
+        return (
+            gemm1_weights_fp4_shuffled,
+            gemm1_scales_fp4_shuffled,
+            gemm2_weights_fp4_shuffled,
+            gemm2_scales_fp4_shuffled,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Process FP4 MoE weights after loading from serialized checkpoint.
+
+        Only supports pre-quantized checkpoints with FP8 weights and scales.
+        """
+
+        # GEMM 1 scale processing
+        if not torch.allclose(
+            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
+        ):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected."
+            )
+
+        w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
+        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
+
+        # Calculate input scales based on strategy
+        if self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe:
+            w13_input_scale = layer.w13_input_scale.max().to(torch.float32)
+            w2_input_scale = layer.w2_input_scale.max().to(torch.float32)
+        else:
+            w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
+            w2_input_scale = layer.w2_input_scale
+
+        # Create shared parameters
+        layer.g1_alphas = Parameter(
+            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+        layer.g2_alphas = Parameter(
+            (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+        layer.w13_input_scale_quant = Parameter(
+            (1 / w13_input_scale).to(torch.float32), requires_grad=False
+        )
+        layer.w2_input_scale_quant = Parameter(
+            (1 / w2_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        # Validate weight scales
+        for name, weight_scale in [
+            ("w13", layer.w13_weight_scale),
+            ("w2", layer.w2_weight_scale),
+        ]:
+            assert (
+                weight_scale.shape[2] % 16 == 0
+            ), f"Expected {name}_weight_scale.dim(2) to be divisible by 16"
+            assert (
+                weight_scale.dtype == torch.float8_e4m3fn
+            ), f"{name} Weight Blockscale must be represented as FP8-E4M3"
+
+        # Weight processing based on strategy
+        if (
+            self.enable_flashinfer_trtllm_moe
+            and reorder_rows_for_gated_act_gemm is not None
+            and shuffle_matrix_sf_a is not None
+        ):
+            # FlashInfer TRTLLM processing - handles both w13 and w2
+            (
+                gemm1_weights_fp4_shuffled,
+                gemm1_scales_fp4_shuffled,
+                gemm2_weights_fp4_shuffled,
+                gemm2_scales_fp4_shuffled,
+            ) = self.prepare_static_weights_for_kernel(
+                layer.w13_weight,
+                layer.w2_weight,
+                layer.w13_weight_scale,
+                layer.w2_weight_scale,
+                layer.w2_weight.size(-2),  # hidden_size
+                layer.w13_weight.size(-2) // 2,  # intermediate_size
+                layer.w13_weight.size(0),  # num_experts
+            )
+
+            # Set flashinfer parameters
+            layer.gemm1_weights_fp4_shuffled = Parameter(
+                gemm1_weights_fp4_shuffled, requires_grad=False
+            )
+            layer.gemm2_weights_fp4_shuffled = Parameter(
+                gemm2_weights_fp4_shuffled, requires_grad=False
+            )
+            layer.gemm1_scales_fp4_shuffled = Parameter(
+                gemm1_scales_fp4_shuffled, requires_grad=False
+            )
+            layer.gemm2_scales_fp4_shuffled = Parameter(
+                gemm2_scales_fp4_shuffled, requires_grad=False
+            )
+
+            # Additional parameter needed for TRT-LLM
+            layer.g1_scale_c = Parameter(
+                (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32),
+                requires_grad=False,
+            )
+
+            # Clean up weights that won't be used by TRT-LLM
+            del (
+                layer.w2_weight,
+                layer.w2_weight_scale,
+                layer.w13_weight,
+                layer.w13_weight_scale,
+            )
+
+        else:
+            # CUTLASS processing - handle w13 and w2 separately
+
+            # Process w13 weights
+            w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+            del layer.w13_weight_scale
+            layer.w13_blockscale_swizzled.data.copy_(w13_blockscale_swizzled)
+            layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+
+            # Process w2 weights
+            w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+            del layer.w2_weight_scale
+            layer.w2_blockscale_swizzled.data.copy_(w2_blockscale_swizzled)
+            layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+            # Both flashinfer cutlass and regular cutlass use same processing for w2
+
+            # Set up CUTLASS MoE parameters
+            device = layer.w13_weight.device
+            layer.cutlass_moe_params = CutlassMoEParams(
+                CutlassMoEType.BlockscaledFP4,
+                device,
+                num_experts=layer.num_experts,  # global num experts
+                intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
+                hidden_size=layer.w13_weight.shape[2] * 2,
+            )  # k
+
+    @property
+    def load_up_proj_weight_first(self) -> bool:
+        # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13
+        return self.enable_flashinfer_cutlass_moe
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: FusedMoE,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        moe_runner_config = self.moe_runner_config
+
+        # Check if this is a FlashInferFP4MoE layer that should handle its own forward
+        if hasattr(layer, "gemm1_weights_fp4_shuffled"):
+            # This layer was processed with flashinfer TRTLLM - delegate to its own forward
+            return StandardCombineInput(hidden_states=layer.forward(x, topk_output))
+
+        if self.enable_flashinfer_cutlass_moe:
+            assert (
+                not moe_runner_config.apply_router_weight_on_input
+            ), "apply_router_weight_on_input is not supported for Flashinfer"
+            # TRTLLM Cutlass moe takes in activations in BF16/Half/nvfp4 precision
+            # and fp4 quantized weights loaded from the checkpoint
+            topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+
+            output_dtype = x.dtype
+            x_sf = None
+            if should_use_flashinfer_cutlass_moe_fp4_allgather():
+                from flashinfer import fp4_quantize, nvfp4_block_scale_interleave
+
+                # Quantize before comm, swizzle after.
+                if x.shape[0] > 0:
+                    x, x_sf = fp4_quantize(
+                        x, layer.w13_input_scale_quant, is_sf_swizzled_layout=False
+                    )
+                else:
+                    x_col = x.shape[1]
+                    x = torch.zeros(0, x_col // 2, dtype=torch.uint8, device=x.device)
+                    x_sf = torch.zeros(
+                        0, x_col // 16, dtype=torch.uint8, device=x.device
+                    )
+                topk_weights, topk_ids, x, x_sf = get_tp_group().all_gatherv(
+                    [topk_weights, topk_ids, x, x_sf], sizes=get_dp_global_num_tokens()
+                )
+                x_sf = nvfp4_block_scale_interleave(x_sf)
+
+            output = flashinfer_cutlass_fused_moe(
+                input=x,
+                token_selected_experts=topk_ids.to(torch.int),
+                token_final_scales=topk_weights,
+                fc1_expert_weights=layer.w13_weight.view(torch.long),
+                fc2_expert_weights=layer.w2_weight.view(torch.long),
+                output_dtype=output_dtype,
+                input_sf=x_sf,
+                quant_scales=[
+                    layer.w13_input_scale_quant,
+                    layer.w13_blockscale_swizzled.view(torch.int32),
+                    layer.g1_alphas,
+                    layer.w2_input_scale_quant,
+                    layer.w2_blockscale_swizzled.view(torch.int32),
+                    layer.g2_alphas,
+                ],
+                ep_size=layer.moe_ep_size,
+                ep_rank=layer.moe_ep_rank,
+                tp_size=layer.moe_tp_size,
+                tp_rank=layer.moe_tp_rank,
+                tune_max_num_tokens=next_power_of_2(x.shape[0]),
+            )[0]
+            if should_use_flashinfer_cutlass_moe_fp4_allgather():
+                output, global_output = get_local_dp_buffer(), output
+                get_tp_group().reduce_scatterv(
+                    global_output, output=output, sizes=get_dp_global_num_tokens()
+                )
+            return StandardCombineInput(hidden_states=output)
+
+        from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+        topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+        output = cutlass_moe_fp4(
+            a=x,
+            a1_gscale=layer.w13_input_scale_quant,
+            w1_fp4=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alphas=layer.g1_alphas,
+            a2_gscale=layer.w2_input_scale_quant,
+            w2_fp4=layer.w2_weight,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alphas=layer.g2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=layer.cutlass_moe_params,
+            apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
+        ).to(x.dtype)
+        # Scale by routed_scaling_factor is fused into select_experts.
+
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/quantization/moe_wna16.py b/python/sglang/srt/layers/quantization/moe_wna16.py
new file mode 100644
index 000000000..531e4271f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/moe_wna16.py
@@ -0,0 +1,501 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/moe_wna16.py
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.parallel_state import get_tp_group
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.awq import AWQConfig
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import get_device_capability, set_weight_attrs
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+
+def get_weight_perm(num_bits: int):
+    perm_list: List[int] = []
+    for i in range(32):
+        perm1: List[int] = []
+        col = i // 4
+        for block in [0, 1]:
+            for row in [
+                2 * (i % 4),
+                2 * (i % 4) + 1,
+                2 * (i % 4 + 4),
+                2 * (i % 4 + 4) + 1,
+            ]:
+                perm1.append(16 * row + col + 8 * block)
+        for j in range(4):
+            perm_list.extend([p + 256 * j for p in perm1])
+
+    perm = np.array(perm_list)
+
+    if num_bits == 4:
+        interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = np.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
+    perm = torch.from_numpy(perm)
+    return perm
+
+
+class MoeWNA16Config(QuantizationConfig):
+    """Config class for MOE WNA16 (W8A16/W4A16) quantization."""
+
+    def __init__(
+        self,
+        linear_quant_method: str,
+        weight_bits: int,
+        group_size: int,
+        has_zp: bool,
+        lm_head_quantized: bool,
+        modules_to_not_convert: Optional[List[str]],
+        full_config: Dict[str, Any],
+    ) -> None:
+        super().__init__()
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.has_zp = has_zp
+        self.bit8_pack_factor = 8 // self.weight_bits
+        self.lm_head_quantized = lm_head_quantized
+        self.linear_quant_method = linear_quant_method
+        self.full_config = full_config
+        self.use_marlin = False
+        # Avoid circular import
+
+        if self.linear_quant_method == "gptq":
+            self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
+        elif self.linear_quant_method == "awq":
+            capability_tuple = get_device_capability()
+            device_capability = (
+                -1
+                if capability_tuple is None
+                else capability_tuple[0] * 10 + capability_tuple[1]
+            )
+            awq_min_capability = AWQConfig.get_min_capability()
+            if device_capability < awq_min_capability:
+                raise ValueError(
+                    "The quantization method moe_wna16 + awq is not supported "
+                    "for the current GPU. "
+                    f"Minimum capability: {awq_min_capability}. "
+                    f"Current capability: {device_capability}."
+                )
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        if modules_to_not_convert is None:
+            self.modules_to_not_convert = []
+        else:
+            self.modules_to_not_convert = modules_to_not_convert
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "moe_wna16"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    def get_scaled_act_names(self) -> List[str]:
+        raise NotImplementedError
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> MoeWNA16Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        if quant_method == "gptq":
+            has_zp = not cls.get_from_keys(config, ["sym"])
+            modules_to_not_convert = []
+        elif quant_method == "awq":
+            has_zp = cls.get_from_keys(config, ["zero_point"])
+            modules_to_not_convert = cls.get_from_keys_or(
+                config, ["modules_to_not_convert"], None
+            )
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        return cls(
+            quant_method,
+            weight_bits,
+            group_size,
+            has_zp,
+            lm_head_quantized,
+            modules_to_not_convert,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        if user_quant == "moe_wna16" and cls.is_moe_wna16_compatible(hf_quant_cfg):
+            return cls.get_name()
+        return None
+
+    @classmethod
+    def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]):
+        # Extract data from quant config.
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        desc_act = quant_config.get("desc_act")
+
+        capability_tuple = get_device_capability()
+        device_capability = (
+            -1
+            if all(capability is None for capability in capability_tuple)
+            else capability_tuple[0] * 10 + capability_tuple[1]
+        )
+        # Avoid circular import
+        awq_min_capability = AWQConfig.get_min_capability()
+
+        gptq_compatible = quant_method == "gptq" and not desc_act and num_bits in [4, 8]
+        awq_compatible = (
+            quant_method == "awq"
+            and num_bits == 4
+            and device_capability >= awq_min_capability
+        )
+
+        return gptq_compatible or awq_compatible
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        # avoid circular import
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+        if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
+            return UnquantizedLinearMethod()
+        elif isinstance(layer, LinearBase):
+
+            if self.linear_quant_method == "gptq":
+                if self.use_marlin:
+                    return GPTQMarlinConfig.from_config(
+                        self.full_config
+                    ).get_quant_method(layer, prefix)
+                else:
+                    return GPTQConfig.from_config(self.full_config).get_quant_method(
+                        layer, prefix
+                    )
+            elif self.linear_quant_method == "awq":
+                return AWQConfig.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            else:
+                raise ValueError("moe_wna16 only support gptq and awq.")
+        elif isinstance(layer, FusedMoE):
+            return MoeWNA16Method(self)
+        return None
+
+
+def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
+    return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+class MoeWNA16Method(FusedMoEMethodBase):
+    """Linear method for MOE WNA16 (W8A16/W4A16) quantization.
+
+    Args:
+        quant_config: The MOE WNA16 (W8A16/W4A16) quantization config.
+    """
+
+    def __init__(self, quant_config: MoeWNA16Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        layer.quant_config = self.quant_config
+        bit8_pack_factor = self.quant_config.bit8_pack_factor
+        group_size = self.quant_config.group_size
+        group_size_div_factor = 1
+
+        # make intermediate_size and hidden_size diviable by group_size
+        # we reduce the group size to ensure that
+        # and we would repeat the loaded_weight later
+        while intermediate_size_per_partition % group_size or hidden_size % group_size:
+            group_size = group_size // 2
+            group_size_div_factor *= 2
+            assert group_size >= 32
+        layer.group_size = group_size
+        layer.group_size_div_factor = group_size_div_factor
+
+        strategy = FusedMoeWeightScaleSupported.GROUP.value
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": False})
+
+        assert "weight_loader" in extra_weight_attrs
+        weight_loader = extra_weight_attrs["weight_loader"]
+        wrapped_weight_loader = MoeWNA16Method.get_weight_loader(layer, weight_loader)
+        extra_weight_attrs["weight_loader"] = wrapped_weight_loader
+
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+        w13_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+
+        w2_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+
+        if self.quant_config.has_zp:
+            w13_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    2 * intermediate_size_per_partition // bit8_pack_factor,
+                    hidden_size // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_qzeros", w13_qzeros)
+            set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+            w2_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    hidden_size // bit8_pack_factor,
+                    intermediate_size_per_partition // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_qzeros", w2_qzeros)
+            set_weight_attrs(w2_qzeros, extra_weight_attrs)
+
+        if self.quant_config.linear_quant_method == "gptq":
+            # some param are unused, but we need to init them in order to
+            # load weights
+            invalid_param_keys = ["w13_g_idx", "w2_g_idx"]
+            if not self.quant_config.has_zp:
+                invalid_param_keys += ["w13_qzeros", "w2_qzeros"]
+            for key in invalid_param_keys:
+                param = torch.nn.Parameter(
+                    torch.empty((0,), dtype=torch.int32), requires_grad=False
+                )
+                layer.register_parameter(key, param)
+                set_weight_attrs(param, extra_weight_attrs)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        weight_bits = self.quant_config.weight_bits
+        has_zp = self.quant_config.has_zp
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_qweight,
+            w2_weight=layer.w2_qweight,
+            use_int4_w4a16=weight_bits == 4,
+            use_int8_w8a16=weight_bits == 8,
+            w13_scale=layer.w13_scales,
+            w2_scale=layer.w2_scales,
+            w13_zp=layer.w13_qzeros if has_zp else None,
+            w2_zp=layer.w2_qzeros if has_zp else None,
+            block_shape=[0, layer.group_size],
+        )
+        return self.runner.run(dispatch_output, quant_info)
+
+    @staticmethod
+    def get_weight_loader(layer, weight_loader):
+
+        def convert_awq_tensor(tensor, tensor_type):
+            # convert awq qweight/qzeros to a standard format (assume int4)
+            # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8)
+            # qzeros: (k // group_size, n // pack_factor_bit32) ->
+            #         (n // pack_factor_bit8, k // group_size)
+            # pack_factor_bit32 = 32 // weight_bits
+            # pack_factor_bit8 = 8 // weight_bits
+
+            # 0. suppose origin shape (a, b), dtype int32
+            # 1. convert to uint8, shape (a, b) -> (a, 4 * b)
+            size0 = tensor.size(0)
+            tensor = tensor.view(torch.uint8)
+
+            # 2. unpack to uint4 (only when weight_bits == 4)
+            #    shape (a, 4 * b) -> (a, 4 * b, 2)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+
+            # 3. change order, see
+            # https://github.com/casper-hansen/AutoAWQ/blob/v0.2.8/awq/utils/quant_utils.py
+            # shape -> (a, 4 * b * pack_factor_bit8)
+            reverse_awq_pack_order = [0, 4, 1, 5, 2, 6, 3, 7]
+            tensor = tensor.view(-1, 8)[:, reverse_awq_pack_order]
+            tensor = tensor.view(size0, -1)
+
+            # 4. transpose, shape -> (4 * b * pack_factor_bit8, a)
+            tensor = tensor.T.contiguous()
+
+            # 5. repack (only when weight_bits == 4)
+            # qweight shape -> (4 * b * pack_factor_bit8, a // pack_factor_bit8)
+            # qzeros shape -> (4 * b, a)
+
+            if tensor_type == "qweight":
+                tensor = tensor[:, 1::2] * 16 + tensor[:, ::2]
+            elif tensor_type == "qzeros":
+                tensor = tensor[1::2, :] * 16 + tensor[::2, :]
+            return tensor
+
+        def convert_gptq_int4_qzeros(tensor):
+            tensor = tensor.view(torch.uint8)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+            tensor = tensor + 1
+            tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16
+            return tensor
+
+        def moe_wna16_weight_loader(
+            param: torch.nn.Parameter,
+            loaded_weight: torch.Tensor,
+            weight_name: str,
+            shard_id: str,
+            expert_id: int,
+        ):
+            if "g_idx" in weight_name:
+                return
+            if not layer.quant_config.has_zp and "qzeros" in weight_name:
+                return
+
+            device = get_tp_group().device
+            tp_rank = get_tensor_model_parallel_rank()
+            loaded_weight = loaded_weight.to(device)
+            shard_size = layer.intermediate_size_per_partition
+
+            # convert gptq and awq weight to a standard format
+            if layer.quant_config.linear_quant_method == "awq":
+                assert layer.quant_config.weight_bits == 4
+                if "weight" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
+                elif "zeros" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qzeros")
+                else:
+                    loaded_weight = loaded_weight.T
+            elif layer.quant_config.linear_quant_method == "gptq":
+                assert layer.quant_config.weight_bits in [4, 8]
+                if "weight" in weight_name:
+                    loaded_weight = loaded_weight.T.contiguous().view(torch.uint8)
+                elif "zeros" in weight_name:
+                    # add 1 to gptq qzeros to align with awq
+                    loaded_weight = loaded_weight.view(torch.uint8)
+                    if layer.quant_config.weight_bits == 4:
+                        loaded_weight = convert_gptq_int4_qzeros(loaded_weight).T
+                    else:
+                        loaded_weight = loaded_weight.T + 1
+                else:
+                    loaded_weight = loaded_weight.T
+
+            # repeat the qzeros/scales to fit new group size
+            if (
+                layer.group_size_div_factor > 1
+                and "qzeros" in weight_name
+                or "scales" in weight_name
+            ):
+                loaded_weight = loaded_weight.repeat_interleave(
+                    layer.group_size_div_factor, 1
+                )
+
+            if "w13_qzeros" in weight_name:
+                tensor = loaded_weight.view(
+                    layer.moe_tp_size, -1, loaded_weight.size(1)
+                )[tp_rank]
+                if shard_id == "w1":
+                    param.data[expert_id, : shard_size // 2] = tensor
+                else:
+                    param.data[expert_id, shard_size // 2 :] = tensor
+            elif "w2_qzeros" in weight_name:
+                param.data[expert_id] = loaded_weight.view(
+                    loaded_weight.size(0), layer.moe_tp_size, -1
+                )[:, tp_rank]
+            else:
+                weight_loader(param, loaded_weight, weight_name, shard_id, expert_id)
+
+        return moe_wna16_weight_loader
diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py
new file mode 100644
index 000000000..0d98d00d6
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/mxfp4.py
@@ -0,0 +1,862 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/mxfp4.py
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.moe.utils import get_moe_runner_backend
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.utils import (
+    direct_register_custom_op,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+    is_sm100_supported,
+    is_triton_kernels_available,
+    log_info_on_rank0,
+    mxfp_supported,
+    next_power_of_2,
+    round_up,
+    set_weight_attrs,
+)
+
+_is_sm100_supported = is_cuda() and is_sm100_supported()
+has_triton_kernels = is_triton_kernels_available()
+
+
+if is_flashinfer_available():
+    from flashinfer import (
+        mxfp8_quantize,
+        shuffle_matrix_a,
+        shuffle_matrix_sf_a,
+        trtllm_fp4_block_scale_moe,
+    )
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_hip = is_hip()
+
+if _is_hip:
+    # import aiter
+    try:
+        from aiter import ActivationType, QuantType, dtypes
+        from aiter.fused_moe import fused_moe
+        from aiter.ops.triton.quant import dynamic_mxfp4_quant
+        from aiter.utility.fp4_utils import e8m0_shuffle
+    except ImportError as err:
+        ActivationType = QuantType = dtypes = fused_moe = dynamic_mxfp4_quant = (
+            e8m0_shuffle
+        ) = err
+
+
+def _swizzle_mxfp4(quant_tensor, scale, num_warps):
+    """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel"""
+    import triton_kernels.matmul_ogs_details.opt_flags as opt_flags
+    from triton_kernels.numerics import InFlexData
+    from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
+    from triton_kernels.tensor_details import layout
+
+    value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(
+        mx_axis=1
+    )
+    scale_layout, scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
+        mx_axis=1, num_warps=num_warps
+    )
+    if _is_sm100_supported:
+        constraints = {
+            "is_persistent": True,
+            "epilogue_subtile": 1,
+        }
+        opt_flags.update_opt_flags_constraints(constraints)
+    # transpose the tensor so that the quantization axis is on dim1
+    quant_tensor = quant_tensor.transpose(-2, -1)
+    scale = scale.transpose(-2, -1)
+    quant_tensor = convert_layout(
+        wrap_torch_tensor(quant_tensor, dtype=FP4), value_layout, **value_layout_opts
+    )
+    scale = convert_layout(wrap_torch_tensor(scale), scale_layout, **scale_layout_opts)
+    return quant_tensor, InFlexData(), scale
+
+
+def _dequant_mxfp4(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
+) -> torch.Tensor:
+    try:
+        from quark.torch.kernel import mx
+    except ImportError as err:
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP4 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from err
+
+    return mx.dq_mxfp4(x, scale, float_dtype)
+
+
+def _dequant_mxfp4_fake(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
+) -> torch.Tensor:
+    return torch.empty(
+        (*x.shape[:-1], x.shape[-1] * 2), dtype=float_dtype, device=x.device
+    )
+
+
+def _quant_dequant_mxfp4(
+    x: torch.Tensor, scale_calculation_mode: str = "even"
+) -> torch.Tensor:
+    try:
+        from quark.torch.kernel import mx
+    except ImportError as err:
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP4 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from err
+
+    return mx.qdq_mxfp4(x, scale_calculation_mode)
+
+
+def _quant_dequant_mxfp4_fake(
+    x: torch.Tensor, scale_calculation_mode: str = "even"
+) -> torch.Tensor:
+    return torch.empty_like(x)
+
+
+direct_register_custom_op(
+    op_name="dequant_mxfp4",
+    op_func=_dequant_mxfp4,
+    mutates_args=[],
+    fake_impl=_dequant_mxfp4_fake,
+)
+dequant_mxfp4 = torch.ops.sglang.dequant_mxfp4
+
+direct_register_custom_op(
+    op_name="quant_dequant_mxfp4",
+    op_func=_quant_dequant_mxfp4,
+    mutates_args=[],
+    fake_impl=_quant_dequant_mxfp4_fake,
+)
+quant_dequant_mxfp4 = torch.ops.sglang.quant_dequant_mxfp4
+
+
+class Mxfp4Config(QuantizationConfig):
+
+    def __init__(
+        self,
+        ignored_layers: Optional[list[str]] = None,
+        is_checkpoint_mxfp4_serialized: bool = False,
+    ):
+        super().__init__()
+        self.is_checkpoint_mxfp4_serialized = is_checkpoint_mxfp4_serialized
+        self.ignored_layers = ignored_layers
+
+    @classmethod
+    def from_config(cls, config):
+
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_mxfp4_serialized = "mxfp4" in quant_method
+
+        if _is_hip:
+            if mxfp_supported():
+                return cls(
+                    is_checkpoint_mxfp4_serialized=is_checkpoint_mxfp4_serialized
+                )
+            else:
+
+                platform = torch.cuda.get_device_properties(0).gcnArchName
+                raise ValueError(
+                    f"Current platform {platform} not support mxfp4 computation"
+                )
+
+        return cls(is_checkpoint_mxfp4_serialized=is_checkpoint_mxfp4_serialized)
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "mxfp4"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.bfloat16, torch.float16]
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []
+
+    def is_static_cfg(self):
+        return self.is_checkpoint_mxfp4_serialized
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+
+        if isinstance(layer, LinearBase):
+            if self.ignored_layers and is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedLinearMethod()
+            elif _is_hip:
+                return UnquantizedLinearMethod()
+        elif isinstance(layer, FusedMoE):
+            if self.is_checkpoint_mxfp4_serialized:
+                return Mxfp4MoEMethod(prefix=prefix)
+            else:
+                return Mxfp4DynamicQuantMoEMethod()
+        else:
+            if self.is_checkpoint_mxfp4_serialized:
+                raise NotImplementedError("Mxfp4 attention layer is not implemented")
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class Mxfp4MoEMethod(FusedMoEMethodBase):
+
+    def __init__(
+        self,
+        prefix: str,
+    ):
+        super().__init__()
+
+        self.prefix = prefix
+        self.topk_indices_dtype = None
+        self.use_triton_kernels = get_moe_runner_backend().is_triton_kernel()
+        self.with_bias = False
+        self.use_flashinfer = get_moe_runner_backend().is_flashinfer_mxfp4()
+        self.flashinfer_mxfp4_moe_precision = global_server_args_dict[
+            "flashinfer_mxfp4_moe_precision"
+        ]
+
+        self.triton_kernel_moe_forward = None
+        self.triton_kernel_moe_with_bias_forward = None
+        if torch.cuda.is_available() and has_triton_kernels:
+            from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+                triton_kernel_moe_forward as _tk_forward,
+            )
+            from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+                triton_kernel_moe_with_bias_forward as _tk_with_bias_forward,
+            )
+
+            self.triton_kernel_moe_forward = _tk_forward
+            self.triton_kernel_moe_with_bias_forward = _tk_with_bias_forward
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        with_bias: bool = False,
+        **extra_weight_attrs,
+    ):
+        self.num_experts = num_experts
+        weight_dtype = torch.uint8
+        scale_dtype = torch.uint8
+        self.with_bias = with_bias
+        mxfp4_block = 32
+
+        # pad the intermediate size to be a multiple of 2 * mxfp4_block
+        # for to hold non-uniform sharded tensor as well as swizzling
+        intermediate_size_per_partition_after_pad = intermediate_size_per_partition
+        if _is_sm100_supported:
+            if self.use_flashinfer:
+                intermediate_size_per_partition_after_pad = round_up(
+                    intermediate_size_per_partition, 256
+                )
+                hidden_size = round_up(hidden_size, 256)
+            else:
+                intermediate_size_per_partition_after_pad = round_up(
+                    intermediate_size_per_partition, 64
+                )
+        elif has_triton_kernels:
+            # TODO: this is a hack to make
+            # intermediate_size_per_partition_after_pad the same as the
+            # per_rank_intermediate_size during weight loading
+            intermediate_size_per_partition_after_pad = round_up(
+                intermediate_size_per_partition, mxfp4_block
+            )
+
+        self.intermediate_size_per_partition = intermediate_size_per_partition_after_pad
+
+        self.hidden_size = hidden_size
+        # Fused gate_up_proj (column parallel)
+        w13_weight = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition_after_pad,
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w13_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition_after_pad,
+                hidden_size // mxfp4_block,
+                dtype=scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w13_weight_bias = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition_after_pad,
+                dtype=torch.bfloat16,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_bias", w13_weight_bias)
+        set_weight_attrs(w13_weight_bias, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                hidden_size,
+                intermediate_size_per_partition_after_pad // 2,
+                dtype=weight_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                hidden_size,
+                intermediate_size_per_partition_after_pad // mxfp4_block,
+                dtype=scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        w2_weight_bias = torch.nn.Parameter(
+            torch.zeros(layer.num_local_experts, hidden_size, dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_bias", w2_weight_bias)
+        set_weight_attrs(w2_weight_bias, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer):
+        if self.use_flashinfer:
+            log_info_on_rank0(
+                logger,
+                f"Shuffling MoE weights for FlashInfer MXFP4 moe kernel (layer: {self.prefix}), it might take a while...",
+            )
+            # TODO: these values are hardcoded for now, we need to get them from the model
+            layer.gemm1_alpha = Parameter(
+                torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_beta = Parameter(
+                torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_clamp_limit = Parameter(
+                torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            sf_block_size = 32  # mxfp4 block size
+
+            assert (
+                layer.w13_weight.dim() == 3
+                and layer.w13_weight.shape[0] == self.num_experts
+                and layer.w13_weight.shape[1]
+                == self.intermediate_size_per_partition * 2
+                and layer.w13_weight.shape[2] == self.hidden_size // 2
+            )
+            assert (
+                layer.w13_weight_scale.dim() == 3
+                and layer.w13_weight_scale.shape[0] == self.num_experts
+                and layer.w13_weight_scale.shape[1]
+                == self.intermediate_size_per_partition * 2
+                and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size
+            )
+            assert (
+                layer.w2_weight.dim() == 3
+                and layer.w2_weight.shape[0] == self.num_experts
+                and layer.w2_weight.shape[1] == self.hidden_size
+                and layer.w2_weight.shape[2]
+                == self.intermediate_size_per_partition // 2
+            )
+            assert (
+                layer.w2_weight_scale.dim() == 3
+                and layer.w2_weight_scale.shape[1] == self.hidden_size
+                and layer.w2_weight_scale.shape[2]
+                == self.intermediate_size_per_partition // sf_block_size
+            )
+            assert (
+                layer.w13_weight_bias.dim() == 2
+                and layer.w13_weight_bias.shape[0] == self.num_experts
+                and layer.w13_weight_bias.shape[1]
+                == self.intermediate_size_per_partition * 2
+            )
+            assert (
+                layer.w2_weight_bias.dim() == 2
+                and layer.w2_weight_bias.shape[0] == self.num_experts
+                and layer.w2_weight_bias.shape[1] == self.hidden_size
+            )
+
+            w13_weight_scale = layer.w13_weight_scale.data
+            w2_weight_scale = layer.w2_weight_scale.data
+            w13_weight = layer.w13_weight.data
+            w2_weight = layer.w2_weight.data
+            w13_bias = layer.w13_weight_bias.data.to(torch.float32)
+            w2_bias = layer.w2_weight_bias.data.to(torch.float32)
+
+            # Swap w1 and w3 as the definition of
+            # swiglu is different in the trtllm-gen
+            def swap_every_two_rows(x, axis=-1):
+                shape = x.shape
+                if axis < 0:
+                    axis = len(shape) + axis
+
+                # Create a new shape with pairs swapped along specified axis
+                new_shape = list(shape)
+                new_shape[axis] = shape[axis] // 2
+                new_shape.insert(axis + 1, 2)
+
+                # Reshape to expose pairs, swap them, and reshape back
+                x = x.reshape(*new_shape)
+                x = x.flip(axis + 1)
+                new_shape = list(shape)
+                return x.reshape(*new_shape)
+
+            w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2)
+            w13_weight = swap_every_two_rows(w13_weight, -2)
+            w13_bias = swap_every_two_rows(w13_bias, -1)
+
+            # Shuffle weights and scaling factors for transposed mma output
+            gemm1_weights_mxfp4_shuffled = []
+            gemm1_scales_mxfp4_shuffled = []
+            gemm2_weights_mxfp4_shuffled = []
+            gemm2_scales_mxfp4_shuffled = []
+            gemm1_bias_shuffled = []
+            gemm2_bias_shuffled = []
+            epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
+            for i in range(self.num_experts):
+                gemm1_weights_mxfp4_shuffled.append(
+                    shuffle_matrix_a(w13_weight[i].view(torch.uint8), epilogue_tile_m)
+                )
+                gemm1_scales_mxfp4_shuffled.append(
+                    shuffle_matrix_sf_a(
+                        w13_weight_scale[i].view(torch.uint8), epilogue_tile_m
+                    )
+                )
+                gemm1_bias_shuffled.append(
+                    shuffle_matrix_a(
+                        w13_bias[i].clone().reshape(-1, 1), epilogue_tile_m
+                    )
+                )
+
+                gemm2_weights_mxfp4_shuffled.append(
+                    shuffle_matrix_a(w2_weight[i].view(torch.uint8), epilogue_tile_m)
+                )
+                gemm2_scales_mxfp4_shuffled.append(
+                    shuffle_matrix_sf_a(
+                        w2_weight_scale[i].view(torch.uint8), epilogue_tile_m
+                    )
+                )
+                gemm2_bias_shuffled.append(
+                    shuffle_matrix_a(w2_bias[i].clone().reshape(-1, 1), epilogue_tile_m)
+                )
+
+            w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled)
+            w13_weight_scale = (
+                torch.stack(gemm1_scales_mxfp4_shuffled)
+                .reshape(
+                    self.num_experts,
+                    2 * self.intermediate_size_per_partition,
+                    self.hidden_size // sf_block_size,
+                )
+                .view(torch.float8_e4m3fn)
+            )
+
+            w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled)
+            w2_weight_scale = (
+                torch.stack(gemm2_scales_mxfp4_shuffled)
+                .reshape(
+                    self.num_experts,
+                    self.hidden_size,
+                    self.intermediate_size_per_partition // sf_block_size,
+                )
+                .view(torch.float8_e4m3fn)
+            )
+
+            layer.w13_weight = Parameter(w13_weight, requires_grad=False)
+            layer.w13_weight_scale = Parameter(w13_weight_scale, requires_grad=False)
+            layer.w2_weight = Parameter(w2_weight, requires_grad=False)
+            layer.w2_weight_scale = Parameter(w2_weight_scale, requires_grad=False)
+            layer.w13_weight_bias = Parameter(
+                torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1),
+                requires_grad=False,
+            )
+            layer.w2_weight_bias = Parameter(
+                torch.stack(gemm2_bias_shuffled).reshape(self.num_experts, -1),
+                requires_grad=False,
+            )
+            return
+
+        if self.use_triton_kernels:
+
+            from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
+
+            w13_weight_bias = layer.w13_weight_bias.to(torch.float32)
+            w2_weight_bias = layer.w2_weight_bias.to(torch.float32)
+
+            layer.w13_weight_bias = Parameter(w13_weight_bias, requires_grad=False)
+            layer.w2_weight_bias = Parameter(w2_weight_bias, requires_grad=False)
+
+            num_warps = 8
+
+            w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
+                layer.w13_weight, layer.w13_weight_scale, num_warps
+            )
+            w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
+                layer.w2_weight, layer.w2_weight_scale, num_warps
+            )
+
+            self.w13_precision_config = PrecisionConfig(
+                weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)
+            )
+            self.w2_precision_config = PrecisionConfig(
+                weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
+            )
+
+            self.w13_weight_triton_tensor = w13_weight
+            self.w2_weight_triton_tensor = w2_weight
+            del layer.w13_weight
+            del layer.w2_weight
+        else:
+            from triton_kernels.numerics_details.mxfp import upcast_from_mxfp
+
+            w13_weight = upcast_from_mxfp(
+                layer.w13_weight, layer.w13_weight_scale, dtype=torch.bfloat16, axis=-1
+            )
+            w2_weight = upcast_from_mxfp(
+                layer.w2_weight, layer.w2_weight_scale, dtype=torch.bfloat16, axis=-1
+            )
+            del layer.w13_weight
+            del layer.w2_weight
+            del layer.w13_weight_scale
+            del layer.w2_weight_scale
+            layer.w13_weight = Parameter(w13_weight.data, requires_grad=False)
+            layer.w2_weight = Parameter(w2_weight.data, requires_grad=False)
+        torch.cuda.empty_cache()
+
+    def _get_tile_tokens_dim(self, x: torch.Tensor, top_k: int):
+        # Number of tokens in the input tensor.
+        num_tokens = x.shape[0]
+        # Factor to account for the imbalance of the experts.
+        # factor equals to the
+        # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert
+        # - 1.0 means perfect expert distribution.
+        # - > 1.0 means some experts have more
+        #     tokens than the perfect distribution.
+        # - < 1.0 does not make sense.
+        imbalance_factor = 1.3
+        # Calculate the number of tokens per expert
+        # assuming perfect distribution.
+        num_tokens_per_expert = (num_tokens * top_k) // self.num_experts
+        # Apply the imbalance factor.
+        num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
+        # And pad the number to the next power of 2.
+        tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
+        # Cap to 8-64 tokens per CTA tile
+        # as it's the range supported by the kernel.
+        tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+
+        return tile_tokens_dim
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+        from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+
+        if self.use_flashinfer:
+            # When bf16 mode is enabled, we don't need to quantize the input,
+            # TRT-LLM automatically handles quantization in the kernel implementation and pipelines it with GEMM operations,
+            # which can theoretically improve performance
+            if self.flashinfer_mxfp4_moe_precision == "bf16":
+                assert x.dtype == torch.bfloat16
+                x_quant = x
+                x_scale = None
+
+                # May be fused later if this code branch is frequently needed
+                origin_hidden_states_dim = x_quant.shape[-1]
+                if self.hidden_size != origin_hidden_states_dim:
+                    x_quant = torch.nn.functional.pad(
+                        x_quant,
+                        (0, self.hidden_size - origin_hidden_states_dim),
+                        mode="constant",
+                        value=0.0,
+                    )
+            elif self.flashinfer_mxfp4_moe_precision == "default":
+                x_quant, x_scale = mxfp8_quantize(x, False, alignment=self.hidden_size)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+            else:
+                raise NotImplementedError
+
+            assert x_quant.shape[-1] == self.hidden_size
+            assert TopKOutputChecker.format_is_bypassed(topk_output)
+
+            top_k = topk_output.topk_config.top_k
+            router_logits = topk_output.router_logits
+
+            trtllm_gen_output = trtllm_fp4_block_scale_moe(
+                router_logits.to(torch.bfloat16),
+                None,  # routing_bias
+                x_quant,
+                x_scale,
+                layer.w13_weight,  # uint8 (e2m1 x 2)
+                layer.w13_weight_scale,  # uint8 (e4m3 x 2)
+                layer.w13_weight_bias,  # fp32 per expert per channel
+                layer.gemm1_alpha,  # fp32 per expert
+                layer.gemm1_beta,  # fp32 per expert
+                layer.gemm1_clamp_limit,  # fp32 per expert
+                layer.w2_weight,  # uint8 (e2m1 x 2)
+                layer.w2_weight_scale,  # ue8m0
+                layer.w2_weight_bias,  # fp32 per expert per channel
+                None,  # output1_scale_scalar
+                None,  # output1_scale_gate_scalar
+                None,  # output2_scale_scalar
+                layer.num_experts,
+                top_k,
+                None,  # n_group      # TODO: support n_group
+                None,  # topk_group   # TODO: support topk_group
+                self.intermediate_size_per_partition,  # padded to multiple of 256
+                layer.moe_ep_rank * layer.num_local_experts,  # local_expert_offset
+                layer.num_local_experts,  # local num experts
+                None,
+                self._get_tile_tokens_dim(x, top_k),
+                1,  # routing_method_type, renormalize
+                True,  # do finalize
+            )[0]
+            return StandardCombineInput(hidden_states=trtllm_gen_output)
+
+        if self.use_triton_kernels:
+            assert (
+                layer.moe_ep_size == 1
+            ), "Expert parallel is not supported when using triton kernels"
+            if self.with_bias:
+                output = self.triton_kernel_moe_with_bias_forward(
+                    hidden_states=x,
+                    w1=self.w13_weight_triton_tensor,
+                    w1_pcg=self.w13_precision_config,
+                    w2=self.w2_weight_triton_tensor,
+                    w2_pcg=self.w2_precision_config,
+                    b1=layer.w13_weight_bias,
+                    b2=layer.w2_weight_bias,
+                    topk_output=topk_output,
+                    moe_runner_config=moe_runner_config,
+                )
+            else:
+                output = self.triton_kernel_moe_forward(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w2=layer.w2_weight,
+                    topk_output=topk_output,
+                    moe_runner_config=moe_runner_config,
+                )
+            return StandardCombineInput(hidden_states=output)
+        else:
+            quant_info = TritonMoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                w13_weight_bias=layer.w13_weight_bias,
+                w2_weight_bias=layer.w2_weight_bias,
+            )
+            return self.runner.run(dispatch_output, quant_info)
+
+
+class Mxfp4DynamicQuantMoEMethod(FusedMoEMethodBase):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # Allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def mxfp4_quantize(self, w):
+        w_shape = w.shape
+        w_need_reshape = True if w.dim() != 2 else False
+
+        if w_need_reshape:
+            w_last_dim_size = w_shape[-1]
+            w = w.view(-1, w_last_dim_size)
+
+        w, mx_scales = dynamic_mxfp4_quant(w)
+
+        if w_need_reshape:
+            w_new_shape = w_shape[:-1] + (w.shape[-1],)
+            w = w.view(w_new_shape)
+
+        mx_scales = e8m0_shuffle(mx_scales)
+
+        return w, mx_scales
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        w13, w13_mx_scales = self.mxfp4_quantize(layer.w13_weight.data)
+        w2, w2_mx_scales = self.mxfp4_quantize(layer.w2_weight.data)
+
+        layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
+        layer.w13_weight_scale = torch.nn.Parameter(w13_mx_scales, requires_grad=False)
+
+        layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
+        layer.w2_weight_scale = torch.nn.Parameter(w2_mx_scales, requires_grad=False)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        if _is_hip:
+            topk_weights = topk_weights.to(
+                torch.float32
+            )  # aiter's moe_sorting requires topk_weights to be FP32
+        output = fused_moe(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights,
+            topk_ids,
+            quant_type=QuantType.per_1x32,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            activation=(
+                ActivationType.Silu
+                if self.moe_runner_config.activation == "silu"
+                else ActivationType.Gelu
+            ),
+            doweight_stage1=False,
+        )
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/quantization/mxfp4_tensor.py b/python/sglang/srt/layers/quantization/mxfp4_tensor.py
new file mode 100644
index 000000000..76cb92c54
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/mxfp4_tensor.py
@@ -0,0 +1,135 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+
+
+# https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/modelopt/torch/quantization/qtensor/mxfp4_tensor.py
+class MXFP4QuantizeUtil:
+    E2M1_max = 6.0
+
+    E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6]
+    E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5])
+
+    @classmethod
+    def quantize(cls, input: torch.Tensor, block_size: Optional[int]) -> tuple:
+        """Converting a tensor to a quantized format based on MXFP4 quantization. Only E4M3 is supported.
+        Args:
+            input (torch.Tensor): The input tensor to be quantized.
+            block_sizes (dict | None): The block sizes for quantization.
+        """
+
+        def cast_fp4(x):
+            sign = torch.sign(x)
+            sign_bit = (2 - sign) // 2
+            ord_ = torch.sum(
+                (x.abs().unsqueeze(-1) - cls.E2M1_bounds.to(x.device)) > 0, dim=-1
+            )
+            fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8)
+            return fp4_val
+
+        def fuse_uint4_to_uint8(x):
+            # If the last dimension is odd, pad with zeros
+            # If this behavior is not desired, please modify the code accordingly
+            left_side = x[..., 0::2]  # Even indices (0, 2, 4...)
+            right_side = x[..., 1::2]  # Odd indices (1, 3, 5...)
+            new_data = (
+                right_side.clone() << 4
+            )  # Put odd indices (higher addresses) in high bits
+            new_data[
+                ..., : left_side.shape[-1]
+            ] += left_side  # Put even indices in low bits
+            return new_data
+
+        if block_size is None:
+            block_size = 32
+
+        original_shape = input.shape
+        original_dtype = input.dtype
+        input = input.view(-1, block_size)
+        # get scales
+        input_amax = input.abs().max(dim=-1, keepdim=True).values
+        descale = input_amax / cls.E2M1_max
+        min_value = torch.tensor(-127.0, device=descale.device)
+        e8m0_scale = torch.ceil(torch.maximum(torch.log2(descale), min_value))
+
+        input = (input / torch.exp2(e8m0_scale)).view(original_shape)
+        input_q = cast_fp4(input)
+        input_q = fuse_uint4_to_uint8(input_q)
+        e8m0_scale = (e8m0_scale + 127).to(torch.uint8)
+        return cls(original_shape, original_dtype, input_q), e8m0_scale
+
+    @classmethod
+    def dequantize(cls, quantized_data, dtype: torch.dtype, scale, block_sizes):
+        """Dequantze MXFP4 packed tensor to a target dtype."""
+
+        def unfuse_uint8_to_uint4(x):
+            """Unfuse uint8 values back to uint4 values.
+            This is the inverse operation of fuse_uint4_to_uint8.
+            """
+            # Extract the lower 4 bits (even indices)
+            left_side = x & 0x0F
+
+            # Extract the upper 4 bits (odd indices)
+            right_side = (x >> 4) & 0x0F
+
+            # Create a new tensor with alternating values
+            shape = list(x.shape)
+            shape[-1] = shape[-1] * 2
+            result = torch.zeros(shape, dtype=torch.uint8, device=x.device)
+
+            # Fill in the values - even indices get low bits, odd indices get high bits
+            result[..., 0::2] = left_side  # Even indices from low bits
+            result[..., 1::2] = right_side  # Odd indices from high bits
+
+            return result
+
+        e8m0_scale = scale
+        block_size = block_sizes[-1]
+
+        # Unfuse the uint8 values back to uint4
+        x_unfused = unfuse_uint8_to_uint4(quantized_data)
+        # Extract sign and magnitude
+        sign = 1 - 2 * ((x_unfused & 0b1000) >> 3).to(
+            torch.float32
+        )  # Extract sign bit and convert to +1/-1
+        magnitude = x_unfused & 0b0111  # Extract magnitude bits
+        magnitude = magnitude.to(torch.long)
+
+        # Create a tensor with the E2M1 values
+        values = torch.tensor(cls.E2M1_values, device=quantized_data.device)
+
+        # Use gather to index the values tensor properly
+        # We need to reshape magnitude to match the dimensions we want to gather along
+        original_shape = magnitude.shape
+        x_float = values[magnitude.reshape(-1)].reshape(original_shape)
+
+        # Apply sign and scale
+        x_float = sign.float() * x_float
+
+        # Reshape to apply block-wise scaling
+        x_float = x_float.reshape(-1, block_size)
+
+        # Apply the E8M0 scale
+        scale_factor = torch.exp2(e8m0_scale.float() - 127)
+        scale_factor = scale_factor.reshape(-1, 1)  # Reshape for proper broadcasting
+
+        # Apply scaling and reshape back to original shape
+        x_float = x_float * scale_factor
+
+        # Reshape back to the original shape
+        return x_float.reshape(original_shape).to(dtype)
diff --git a/python/sglang/srt/layers/quantization/petit.py b/python/sglang/srt/layers/quantization/petit.py
new file mode 100644
index 000000000..2c608507c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/petit.py
@@ -0,0 +1,252 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
+
+
+import logging
+from typing import Any, Callable, Dict, List, Optional
+
+import regex as re
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.petit_utils import (
+    apply_petit_nvfp4_linear,
+    prepare_nvfp4_layer_for_petit,
+    verify_petit_nvfp4_supported,
+)
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+# Initialize logger for the module
+logger = logging.getLogger(__name__)
+
+
+# Configuration class to support the NVFP4 quantized model generated by the ModelOpt quantization tool
+class PetitNvFp4Config(QuantizationConfig):
+    """Config class for Petit FP4."""
+
+    def __init__(
+        self,
+        is_checkpoint_nvfp4_serialized: bool = False,
+        kv_cache_quant_algo: str = None,
+        group_size: int = None,
+        exclude_modules: List[str] = None,
+    ) -> None:
+        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
+        if is_checkpoint_nvfp4_serialized:
+            logger.warning(
+                "Detected nvfp4 checkpoint. Please note that the "
+                "format is experimental and subject to change."
+            )
+        self.group_size = group_size
+        self.kv_cache_quant_algo = kv_cache_quant_algo
+        self.exclude_modules = exclude_modules
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "petit_nvfp4"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # Petit supports the gfx90a and gfx942 GPUs
+        return 90
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["hf_quant_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "PetitNvFp4Config":
+        quant_config = cls.get_from_keys(config, ["quantization"])
+        quant_method = quant_config["quant_algo"]
+        group_size = quant_config.get("group_size", None)
+        verify_petit_nvfp4_supported(quant_method, group_size)
+
+        is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
+        kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
+        if not kv_cache_quant_algo:
+            kv_cache_quant_algo = "auto"
+        exclude_modules = quant_config.get("exclude_modules", None)
+        if not (group_size and kv_cache_quant_algo and (exclude_modules is not None)):
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
+            raise ValueError(
+                "NVFP4 quantization requires group size and "
+                "kv_cache_quant_algo specified in "
+                "hf_quant_config.json"
+            )
+        return cls(
+            is_checkpoint_nvfp4_serialized,
+            kv_cache_quant_algo,
+            group_size,
+            exclude_modules,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        can_convert = cls.is_petit_nvfp4_compatible(hf_quant_cfg)
+        if can_convert:
+            return cls.get_name()
+        return None
+
+    @classmethod
+    def is_petit_nvfp4_compatible(cls, quant_config: Dict[str, Any]) -> bool:
+        quant_method = quant_config.get("quant_method", "").lower()
+        return _is_hip and quant_method == "modelopt"
+
+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            if re.fullmatch(regex_str, prefix):
+                return True
+        return False
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
+            return PetitNvFp4LinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class PetitNvFp4LinearMethod(LinearMethodBase):
+    """Linear method for NVFP4.
+    Supports loading NVFP4 checkpoints with the following structure:
+
+    |Tensor Name           | datatype      |  shape      |
+    |----------------------------------------------------|
+    |input_scale           | torch.float32 | scalar      |
+    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
+    |weight_scale          | FP8-E4M3      | [X, Y]      |
+    |weight_scale_2        | torch.float32 | scalar      |
+
+    The weights are quantized per block of 16 elements.
+    Args: quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: PetitNvFp4Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        if input_size_per_partition % 16 != 0:
+            raise ValueError(
+                "Unsupported model when in features size is " "not multiple of 16"
+            )
+
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_nvfp4_serialized
+            else params_dtype
+        )
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                # 2 fp4 data is packed in one uint8 in the input dimension
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        input_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("input_scale", input_scale)
+
+        weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale_2", weight_scale_2)
+
+        weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.group_size,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        input_scale_2 = layer.input_scale.max().to(torch.float32)
+        weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
+        layer.input_scale = Parameter(input_scale_2, requires_grad=False)
+        layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False)
+        layer.alpha = Parameter(
+            layer.input_scale * layer.weight_scale_2, requires_grad=False
+        )
+
+        prepare_nvfp4_layer_for_petit(layer)
+        del layer.input_scale
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_petit_nvfp4_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            weight_scale_2=layer.weight_scale_2,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            bias=bias,
+        )
diff --git a/python/sglang/srt/layers/quantization/petit_utils.py b/python/sglang/srt/layers/quantization/petit_utils.py
new file mode 100644
index 000000000..529869f24
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/petit_utils.py
@@ -0,0 +1,104 @@
+from typing import Optional
+
+import torch
+
+try:
+    from petit_kernel import mul_nvfp4_a16, process_nvfp4_scales, repack_nvfp4
+except ImportError:
+
+    def _check_petit_nvfp4_supported(
+        quant_method: str, group_size: Optional[int]
+    ) -> tuple[bool, Optional[str]]:
+        return (
+            False,
+            "Petit is not installed. Please install it with `pip install petit-kernel`.",
+        )
+
+    def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
+        raise ValueError(
+            "Petit is not installed. Please install it with `pip install petit-kernel`."
+        )
+
+    def apply_petit_nvfp4_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_scale_2: torch.Tensor,
+        size_n: int,
+        size_k: int,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise ValueError(
+            "Petit is not installed. Please install it with `pip install petit-kernel`."
+        )
+
+
+def _check_petit_nvfp4_supported(
+    quant_method: str, group_size: Optional[int]
+) -> tuple[bool, Optional[str]]:
+    if quant_method != "NVFP4":
+        return (
+            False,
+            "Petit currently only supports: NVFP4"
+            " quantizations in sglang. Please check the "
+            "`hf_quant_config.json` file for your model's "
+            "quant configuration.",
+        )
+    if group_size is not None and group_size != 16:
+        return (
+            False,
+            "Petit currently only supports: group_size=16" " quantizations.",
+        )
+    return (True, None)
+
+
+def verify_petit_nvfp4_supported(quant_method: str, group_size: Optional[int]) -> None:
+    supported, error_msg = _check_petit_nvfp4_supported(quant_method, group_size)
+    if not supported:
+        raise ValueError(error_msg)
+
+
+def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
+    # Repack weights to petit format
+    part_size_n = layer.output_size_per_partition
+    part_size_k = layer.input_size_per_partition
+    qweight = layer.weight.view(torch.int32).contiguous()
+    petit_qweight = repack_nvfp4(qweight, size_n=part_size_n, size_k=part_size_k)
+    layer.weight = torch.nn.Parameter(petit_qweight, requires_grad=False)
+
+    # Permute scales
+    weight_scale = process_nvfp4_scales(
+        scales=layer.weight_scale, size_k=part_size_k, size_n=part_size_n
+    )
+    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+
+    return
+
+
+def apply_petit_nvfp4_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_scale_2: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (size_n,)
+
+    # TODO: Use auto-tuning to find the performant solution_id
+    output = mul_nvfp4_a16(
+        a=reshaped_x,
+        b=weight,
+        s=weight_scale,
+        global_scale=weight_scale_2,
+        size_m=reshaped_x.size(0),
+        size_n=size_n,
+        size_k=size_k,
+        solution_id=-1,
+    )
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
diff --git a/python/sglang/srt/layers/quantization/qoq.py b/python/sglang/srt/layers/quantization/qoq.py
new file mode 100644
index 000000000..ec0fda482
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/qoq.py
@@ -0,0 +1,245 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import qserve_w4a8_per_chn_gemm, qserve_w4a8_per_group_gemm
+
+
+QoQ_SUPPORTED_WEIGHT_BITS = [4]
+QoQ_SUPPORTED_GROUP_SIZES = [-1, 128]
+
+
+class QoQConfig(QuantizationConfig):
+    """Config class for QoQ Quantization.
+
+    - Weight: static, per-channel/group, asymmetric
+    - Activation: dynamic, per-token, symmetric
+
+    Reference: https://arxiv.org/abs/2405.04532
+    https://github.com/mit-han-lab/omniserve
+    """
+
+    def __init__(self, weight_bits: int, group_size: int) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+
+        # Verify
+        if self.weight_bits not in QoQ_SUPPORTED_WEIGHT_BITS:
+            raise ValueError(
+                f"QoQ does not support weight_bits = {self.weight_bits}. "
+                f"Only weight_bits = {QoQ_SUPPORTED_WEIGHT_BITS} "
+                "are supported."
+            )
+        if self.group_size not in QoQ_SUPPORTED_GROUP_SIZES:
+            raise ValueError(
+                f"QoQ does not support group_size = {self.group_size}. "
+                f"Only group_sizes = {QoQ_SUPPORTED_GROUP_SIZES} "
+                "are supported."
+            )
+
+        # 4 bits packed into 8 bit datatype.
+        self.pack_factor = 8 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return "QoQConfig(weight_bits={}, group_size={})".format(
+            self.weight_bits, self.group_size
+        )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "qoq"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        """List of filenames to search for in the model directory."""
+        return [
+            "quant_config.json",
+            "quantize_config.json",
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> QoQConfig:
+        weight_bits = cls.get_from_keys(config, ["wbits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(weight_bits, group_size)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            return QoQLinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QoQLinearMethod(LinearMethodBase):
+    """Linear method for QoQ.
+
+    Args:
+        quant_config: The QoQ quantization config.
+    """
+
+    def __init__(self, quant_config: QoQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % 32 != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by 32."
+            )
+
+        # Validate input_size_per_partition
+        if input_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"pack_factor = {self.quant_config.pack_factor}."
+            )
+        if (
+            self.quant_config.group_size != -1
+            and input_size_per_partition % self.quant_config.group_size != 0
+        ):
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"group_size = {self.quant_config.group_size}."
+            )
+
+        qweight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("qweight", qweight)
+
+        s1_scales = ChannelQuantScaleParameter(
+            data=torch.empty(output_size_per_partition, dtype=torch.float16),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("s1_scales", s1_scales)
+
+        if self.quant_config.group_size == -1:
+            s1_szeros = ChannelQuantScaleParameter(
+                data=torch.empty(output_size_per_partition, dtype=torch.float16),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s1_szeros", s1_szeros)
+        else:
+            s2_scales = GroupQuantScaleParameter(
+                data=torch.empty(
+                    (
+                        input_size_per_partition // self.quant_config.group_size,
+                        output_size_per_partition,
+                    ),
+                    dtype=torch.int8,
+                ),
+                input_dim=0,
+                output_dim=1,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s2_scales", s2_scales)
+
+            s2_zeros = GroupQuantScaleParameter(
+                data=torch.empty(
+                    (
+                        input_size_per_partition // self.quant_config.group_size,
+                        output_size_per_partition,
+                    ),
+                    dtype=torch.int8,
+                ),
+                input_dim=0,
+                output_dim=1,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s2_zeros", s2_zeros)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
+        layer.s1_scales = Parameter(layer.s1_scales.data, requires_grad=False)
+        if self.quant_config.group_size == -1:
+            layer.s1_szeros = Parameter(layer.s1_szeros.data, requires_grad=False)
+        else:
+            layer.s2_scales = Parameter(layer.s2_scales.data, requires_grad=False)
+            layer.s2_zeros = Parameter(layer.s2_zeros.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        assert x.dtype == torch.float16, "QoQ only supports float16 input now"
+        if self.quant_config.group_size == -1:
+            x_q, x_scale, x_sum = per_token_quant_int8(
+                x, scale_dtype=x.dtype, cal_sum=True
+            )
+            out = qserve_w4a8_per_chn_gemm(
+                x_q, layer.qweight, layer.s1_scales, x_scale, layer.s1_szeros, x_sum
+            )
+        else:
+            x_q, x_scale = per_token_quant_int8(x, scale_dtype=x.dtype)
+            out = qserve_w4a8_per_group_gemm(
+                x_q,
+                layer.qweight,
+                layer.s2_zeros,
+                layer.s2_scales,
+                layer.s1_scales,
+                x_scale,
+            )
+        if bias is not None:
+            out = out + bias
+        return out
diff --git a/python/sglang/srt/layers/quantization/quark/__init__.py b/python/sglang/srt/layers/quantization/quark/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/srt/layers/quantization/quark/quark.py b/python/sglang/srt/layers/quantization/quark/quark.py
new file mode 100644
index 000000000..6d5a66544
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/quark/quark.py
@@ -0,0 +1,390 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import fnmatch
+import logging
+from typing import Any, List, Optional, cast
+
+import torch
+
+from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.quantization.base_config import (  # noqa: E501
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
+from sglang.srt.layers.quantization.quark.quark_moe import QuarkMoEMethod
+from sglang.srt.layers.quantization.quark.schemes import QuarkScheme, QuarkW4A4MXFP4
+from sglang.srt.layers.quantization.quark.utils import deep_compare, should_ignore_layer
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.utils import get_device_capability
+
+__all__ = ["QuarkLinearMethod"]
+
+logger = logging.getLogger(__name__)
+
+
+class QuarkConfig(QuantizationConfig):
+
+    def __init__(
+        self,
+        quant_config: dict[str, Any],
+        kv_cache_group: Optional[list[str]] = None,
+        kv_cache_config: Optional[dict[str, Any]] = None,
+        pack_method: str = "reorder",
+    ):
+        super().__init__()
+        if kv_cache_group is None:
+            kv_cache_group = []
+        self.quant_config = quant_config
+        self.kv_cache_group = kv_cache_group
+        self.kv_cache_config = kv_cache_config
+        self.pack_method = pack_method
+
+        self.packed_modules_mapping = self.quant_config["packed_modules_mapping"]
+
+    def get_linear_method(self) -> "QuarkLinearMethod":
+        return QuarkLinearMethod(self)
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def get_name(self) -> str:
+        return "quark"
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        # Check if the layer is skipped for quantization.
+        exclude_layers = cast(list[str], self.quant_config.get("exclude"))
+        if should_ignore_layer(
+            prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping
+        ):
+            return UnquantizedLinearMethod()
+
+        if isinstance(layer, LinearBase):
+            scheme = self.get_scheme(layer=layer, layer_name=prefix)
+            layer.scheme = scheme
+            return QuarkLinearMethod(self)
+
+        if isinstance(layer, RadixAttention):
+            return QuarkKVCacheMethod(self)
+
+        from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix)
+
+        return None
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "QuarkConfig":
+        export_config = config.get("export")
+        if export_config is None:
+            raise ValueError(
+                "The export key should be included in "
+                "the configurations of Quark quantized model"
+            )
+
+        kv_cache_group = cast(list[str], export_config.get("kv_cache_group"))
+        pack_method = cast(str, export_config.get("pack_method"))
+
+        # In the export model of quark, the quantization configuration
+        # of kv_cache is stored in layer_quant_config. First, it is
+        # judged whether kv_cache_group exists, and then it is judged
+        # whether layer_quant_config has a quantization configuration
+        # that matches kv_cache.
+        if len(kv_cache_group) == 0:
+            kv_cache_config = None
+        else:
+            kv_cache_set = set(kv_cache_group)
+            layer_quant_config = cast(dict[str, Any], config.get("layer_quant_config"))
+            layer_quant_names = list(layer_quant_config.keys())
+            layer_quant_set = set(layer_quant_names)
+
+            if not kv_cache_set.issubset(layer_quant_set):
+                raise ValueError(
+                    "The Quark quantized model has the "
+                    "kv_cache_group parameter setting, "
+                    "but no kv_cache quantization settings "
+                    "were found in the quantization "
+                    "configuration."
+                )
+
+            q_configs = [
+                cast(dict[str, Any], layer_quant_config.get(name))
+                for name in kv_cache_group
+            ]
+            if not all(deep_compare(q_config, q_configs[0]) for q_config in q_configs):
+                raise ValueError(
+                    "The quantization method used for kv_cache should "
+                    "be the same, but the quantization method for the "
+                    "kv_cache layer in the config is different."
+                )
+            kv_cache_config = q_configs[0].get("output_tensors")
+            if kv_cache_config is None:
+                raise ValueError("The kv_cache quantization configuration is empty.")
+
+            # Since we have already set kv_cache quantization configurations,
+            # we will remove the quantization configuration for the
+            # output_tensors corresponding to the kv_cache layer.
+            for q_config in q_configs:
+                q_config["output_tensors"] = None
+
+            # In case q_proj output is also quantized, remove the configuration
+            # to keep qkv consistency.
+            q_proj_q_config = cast(dict[str, Any], layer_quant_config.get("*q_proj"))
+            if q_proj_q_config is not None:
+                q_proj_q_config["output_tensors"] = None
+
+        return cls(
+            quant_config=config,
+            kv_cache_group=kv_cache_group,
+            kv_cache_config=kv_cache_config,
+            pack_method=pack_method,
+        )
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []
+
+    def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bool:
+        capability_tuple = get_device_capability()
+
+        if capability_tuple is not None:
+            assert 0 <= capability_tuple[1] < 10
+            capability = capability_tuple[0] * 10 + capability_tuple[1]
+
+            supported = capability >= min_capability
+            if error and not supported:
+                raise RuntimeError(
+                    "Quantization scheme is not supported for ",
+                    f"the current GPU. Min capability: {min_capability}. ",
+                    f"Current capability: {capability}.",
+                )
+            return supported
+        else:
+            return False
+
+    def _is_mx_fp4(
+        self,
+        weight_quant: Optional[dict[str, Any]],
+        input_quant: Optional[dict[str, Any]],
+    ) -> bool:
+        # Confirm weights and input quantized.
+        if weight_quant is None or input_quant is None:
+            logger.debug(
+                "Quark model is not in MX-FP4 format: "
+                "weight_quant or input_quant not set"
+            )
+            return False
+
+        # Input and weight dtype needs to be fp4.
+        if weight_quant.get("dtype") != "fp4" or input_quant.get("dtype") != "fp4":
+            logger.debug("Quark model is not in MX-FP4 format: dtype not fp4")
+            return False
+
+        # Input and weight qscheme needs to be per group.
+        if (
+            weight_quant.get("qscheme") != "per_group"
+            or input_quant.get("qscheme") != "per_group"
+        ):
+            logger.debug("Quark model is not in MX-FP4 format: not per_group")
+            return False
+
+        # Input and weight group size needs to be 32.
+        if weight_quant.get("group_size") != 32 or input_quant.get("group_size") != 32:
+            logger.debug("Quark model is not in MX-FP4 format: not group_size=32")
+            return False
+
+        # Weights need to use static quantization.
+        if weight_quant.get("is_dynamic") is True:
+            logger.debug("Quark model is not in MX-FP4 format: not weight static")
+            return False
+
+        # Activations need to use dynamic quantization.
+        if input_quant.get("is_dynamic") is False:
+            logger.debug("Quark model is not in MX-FP4 format: not activation dynamic")
+            return False
+
+        # Activations and weight scales need to be in e8m0 format.
+        if (
+            weight_quant.get("scale_format") != "e8m0"
+            or input_quant.get("scale_format") != "e8m0"
+        ):
+            logger.debug("Quark model is not in MX-FP4 format: not scale_format e8m0")
+            return False
+
+        return True
+
+    def _find_matched_config(
+        self, layer_name: str, module: torch.nn.Module
+    ) -> dict[str, Any]:
+
+        proj_name = layer_name.split(".")[-1]
+        if proj_name in self.packed_modules_mapping:
+            shard_proj_names = self.packed_modules_mapping[proj_name]
+
+            # Convert fused_name --> [shard_names]
+            shard_names = [
+                layer_name.replace(proj_name, shard_proj_name)
+                for shard_proj_name in shard_proj_names
+            ]
+            shard_configs = [
+                self._find_matched_config(shard_name, module)
+                for shard_name in shard_names
+            ]
+            if not all(
+                deep_compare(q_config, shard_configs[0]) for q_config in shard_configs
+            ):
+                raise ValueError(
+                    f"Found a different quantization configuration for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
+            return shard_configs[0]
+        else:
+            layer_quant_config = cast(
+                dict[str, Any], self.quant_config.get("layer_quant_config")
+            )
+            for name_pattern in layer_quant_config:
+                if fnmatch.fnmatch(layer_name, name_pattern):
+                    return layer_quant_config[name_pattern]
+
+            layer_type = type(module).__name__
+            layer_type_quant_config = cast(
+                dict[str, Any], self.quant_config.get("layer_type_quant_config")
+            )
+            if layer_type in layer_type_quant_config:
+                return layer_type_quant_config[layer_type]
+
+            global_quant_config = cast(
+                dict[str, Any], self.quant_config.get("global_quant_config")
+            )
+            return global_quant_config
+
+    def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkScheme":
+        if config.get("output_tensors") or config.get("bias"):
+            raise NotImplementedError(
+                "Currently, Quark models with output_tensors "
+                "and bias quantized are not supported"
+            )
+        weight_config = cast(dict[str, Any], config.get("weight"))
+        input_config = cast(dict[str, Any], config.get("input_tensors"))
+
+        if self._is_mx_fp4(weight_config, input_config):
+            return QuarkW4A4MXFP4(weight_config, input_config)
+
+        raise NotImplementedError(
+            "No quark compatible scheme was found. "
+            f"Weight config: {weight_config}, "
+            f"Input config: {input_config}"
+        )
+
+    def get_scheme(self, layer: torch.nn.Module, layer_name: str) -> "QuarkScheme":
+
+        layer_quant_config = self._find_matched_config(layer_name, layer)
+
+        # Find the quant_scheme
+        scheme = self._get_scheme_from_config(layer_quant_config)
+
+        # Raise error if device does not support the scheme
+        # (e.g. fp8 needs ada lovelace)
+        self._check_scheme_supported(scheme.get_min_capability())
+
+        return scheme
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QuarkLinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: QuarkConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the CompressedTensorsScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use the output of create_weights and the CompressedTensorsScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
+
+
+class QuarkKVCacheMethod(BaseKVCacheMethod):
+    """
+    Supports loading kv-cache scaling factors from quark checkpoints.
+    """
+
+    def __init__(self, quant_config: QuarkConfig):
+        self.validate_kv_cache_config(quant_config.kv_cache_config)
+        super().__init__(quant_config)
+
+    @staticmethod
+    def validate_kv_cache_config(kv_cache_config: Optional[dict[str, Any]]):
+        """
+        Validator for the kv cache configuration. Useful for controlling the
+        kv cache quantization schemes, that are being supported in vLLM
+        :param kv_cache_config: the quark kv cache scheme
+        """
+        if kv_cache_config is None:
+            return
+
+        dtype = kv_cache_config.get("dtype")
+        if dtype != "fp8_e4m3":
+            raise NotImplementedError(
+                "Currently supported kv cache quantization is "
+                f"dtype=fp8_e4m3, however received {dtype}"
+            )
+
+        qscheme = kv_cache_config.get("qscheme")
+        if qscheme != "per_tensor":
+            raise NotImplementedError(
+                "Only support per-tensor scaling factor "
+                "for quark KV cache. "
+                f"Expected qscheme: per_tensor, found qscheme: {qscheme}"
+            )
diff --git a/python/sglang/srt/layers/quantization/quark/quark_moe.py b/python/sglang/srt/layers/quantization/quark/quark_moe.py
new file mode 100644
index 000000000..f6e750a2c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/quark/quark_moe.py
@@ -0,0 +1,202 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Callable, Optional
+
+import torch
+from aiter import ActivationType, QuantType, biased_grouped_topk
+from aiter.fused_moe import fused_moe
+from aiter.utility.fp4_utils import e8m0_shuffle
+
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+from sglang.srt.utils import get_bool_env_var, mxfp_supported, set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.layers.quantization.quark.quark import QuarkConfig
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["QuarkMoEMethod", "QuarkW4A4MXFp4MoEMethod"]
+
+OCP_MX_BLOCK_SIZE = 32
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization import QuarkConfig
+
+
+class QuarkMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quant_config: QuarkConfig):
+        self.quant_config = quant_config
+
+    @staticmethod
+    def get_moe_method(
+        quant_config: QuarkConfig,  # type: ignore # noqa E501 # noqa F821
+        module: torch.nn.Module,
+        layer_name: str,
+    ) -> "QuarkMoEMethod":
+        layer_quant_config = quant_config._find_matched_config(layer_name, module)
+
+        if layer_quant_config.get("output_tensors") or layer_quant_config.get("bias"):
+            raise NotImplementedError(
+                "Currently, Quark models with "
+                "output_tensors and bias "
+                "quantized are not supported"
+            )
+        weight_config = layer_quant_config.get("weight")
+        input_config = layer_quant_config.get("input_tensors")
+
+        if quant_config._is_mx_fp4(weight_config, input_config):
+            return QuarkW4A4MXFp4MoEMethod(weight_config, input_config)
+        else:
+            raise RuntimeError("Unsupported FusedMoe scheme")
+
+
+class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod):
+
+    def __init__(self, weight_config: dict[str, Any], input_config: dict[str, Any]):
+        self.weight_quant = weight_config
+        self.input_quant = input_config
+
+        weight_qscheme = self.weight_quant.get("qscheme")
+        input_qscheme = self.input_quant.get("qscheme")
+        if not (weight_qscheme == "per_group" and input_qscheme == "per_group"):
+            raise ValueError(
+                "For MX(FP4) Fused MoE layers, only per-group scales "
+                "for weights and activations are supported. Found "
+                f"{weight_qscheme}, {input_qscheme}"
+            )  # noqa E501
+
+        self.static_input_scales = not self.input_quant.get("is_dynamic")
+        self.with_bias = False
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        params_dtype = torch.uint8
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // 2,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // 2,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // OCP_MX_BLOCK_SIZE,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // OCP_MX_BLOCK_SIZE,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        float_dtype = torch.get_default_dtype()
+
+        # Pre-shuffle weight scales
+        s0, s1, _ = layer.w13_weight_scale.shape
+        w13_weight_scale = layer.w13_weight_scale.view(s0 * s1, -1)
+        w13_weight_scale = e8m0_shuffle(w13_weight_scale)
+        # layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale, requires_grad=False)
+        layer.w13_weight_scale.data = w13_weight_scale.view(s0, s1, -1)
+
+        s0, s1, _ = layer.w2_weight_scale.shape
+        w2_weight_scale = layer.w2_weight_scale.view(s0 * s1, -1)
+        w2_weight_scale = e8m0_shuffle(w2_weight_scale)
+        # layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale, requires_grad=False)
+        layer.w2_weight_scale.data = w2_weight_scale.view(s0, s1, -1)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        moe_runner_config = self.moe_runner_config
+        topk_weights, topk_ids, _ = topk_output
+
+        output = fused_moe(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights,
+            topk_ids,
+            quant_type=QuantType.per_1x32,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            activation=(
+                ActivationType.Silu
+                if moe_runner_config.activation == "silu"
+                else ActivationType.Gelu
+            ),
+            doweight_stage1=False,
+        )
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/quantization/quark/schemes/__init__.py b/python/sglang/srt/layers/quantization/quark/schemes/__init__.py
new file mode 100644
index 000000000..91b08c512
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/quark/schemes/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from .quark_scheme import QuarkScheme
+from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4
+
+__all__ = ["QuarkScheme", "QuarkW4A4MXFP4"]
diff --git a/python/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py b/python/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py
new file mode 100644
index 000000000..aab5c9c1e
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import torch
+
+__all__ = ["QuarkScheme"]
+
+
+class QuarkScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by Quark.
+    """
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
diff --git a/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
new file mode 100644
index 000000000..a0787baaf
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Optional
+
+import aiter
+import torch
+import torch.nn.functional as F
+from aiter.ops.gemm_op_a4w4 import gemm_a4w4
+from aiter.ops.shuffle import shuffle_weight
+from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
+from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant
+from aiter.ops.triton.quant import dynamic_mxfp4_quant
+from aiter.utility import dtypes
+from aiter.utility.fp4_utils import e8m0_shuffle
+
+from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter
+from sglang.srt.layers.quantization.quark.schemes import QuarkScheme
+from sglang.srt.utils import get_bool_env_var
+
+__all__ = ["QuarkW4A4MXFP4"]
+
+OCP_MX_BLOCK_SIZE = 32
+
+
+class QuarkW4A4MXFP4(QuarkScheme):
+
+    def __init__(
+        self, weight_quant_spec: dict[str, Any], input_quant_spec: dict[str, Any]
+    ):
+        self.out_dtype = torch.get_default_dtype()
+        self.qscheme = "per_group"
+        self.weight_quant_spec = weight_quant_spec
+        self.input_quant_spec = input_quant_spec
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        return
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = PackedvLLMParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            packed_dim=1,
+            packed_factor=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = GroupQuantScaleParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // OCP_MX_BLOCK_SIZE,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # This path does not have support for bias currently
+        assert bias is None, "bias is not supported"
+
+        three_d = False
+        x_s = None
+        y = None
+        if isinstance(x, tuple):
+            assert len(x) in [
+                2,
+                3,
+            ], "For tuple input, only (x, x_s) or (x, x_s, y) formats are accepted"
+            if len(x) == 2:
+                x, x_s = x
+            elif len(x) == 3:
+                x, x_s, y = x
+
+        use_fused_quant_gemm = (
+            x_s is None and y is not None and layer.weight.shape[0] == y.shape[1]
+        )
+
+        if x.dim() == 3:
+            three_d = True
+            x = x.view(-1, x.shape[-1])
+            output_shape = [*x.shape[:-1], layer.weight.shape[0]]
+
+        # use_fused_quant_gemm = true, x_q is a bf16/fp16 num
+        # x_s is not None = true, x_q is uint8 num
+        if use_fused_quant_gemm or x_s is not None:
+            x_q = x
+        else:
+            x_q, x_s = dynamic_mxfp4_quant(x)
+
+        if y is None:
+            y = torch.empty(
+                x_q.shape[0],
+                layer.weight.shape[0],
+                device=x_q.device,
+                dtype=self.out_dtype,
+            )
+
+        if use_fused_quant_gemm:
+            gemm_afp4wfp4_pre_quant(x_q, layer.weight, layer.weight_scale, y.dtype, y)
+            y = y.to(x.dtype)
+        else:
+            gemm_afp4wfp4(x_q, layer.weight, x_s, layer.weight_scale, self.out_dtype, y)
+
+        if three_d:
+            return y.view(*output_shape)
+
+        return y
diff --git a/python/sglang/srt/layers/quantization/quark/utils.py b/python/sglang/srt/layers/quantization/quark/utils.py
new file mode 100644
index 000000000..eacbf3ba9
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/quark/utils.py
@@ -0,0 +1,204 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+from collections.abc import Iterable, Mapping
+from types import MappingProxyType
+from typing import Any, Optional
+
+import torch
+from aiter.ops.triton.quant import dynamic_mxfp4_quant
+from torch import nn
+
+
+def deep_compare(dict1: Any, dict2: Any) -> bool:
+    if type(dict1) is not type(dict2):
+        return False
+    if isinstance(dict1, dict):
+        if dict1.keys() != dict2.keys():
+            return False
+        return all(deep_compare(dict1[k], dict2[k]) for k in dict1)
+    elif isinstance(dict1, list):
+        return set(dict1) == set(dict2)
+    else:
+        return dict1 == dict2
+
+
+def should_ignore_layer(
+    layer_name: Optional[str],
+    ignore: Iterable[str],
+    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
+) -> bool:
+    if layer_name is None:
+        return False
+
+    # layer_name = model.layers.0.self_attn.qkv_proj
+    # proj_name = qkv_proj
+    proj_name = layer_name.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping:
+        shard_proj_names = fused_mapping[proj_name]
+
+        # Convert fused_name --> [shard_names]
+        shard_names = [
+            layer_name.replace(proj_name, shard_proj_name)
+            for shard_proj_name in shard_proj_names
+        ]
+
+        # Layer should be ignored if shards are ignored.
+        should_ignore_layer = None
+        for shard_name in shard_names:
+            should_ignore_shard = check_equal_or_regex_match(
+                layer_name=shard_name, targets=ignore
+            )
+
+            # If shard_idx=0, set layer ignore to match shard.
+            if should_ignore_layer is None:
+                should_ignore_layer = should_ignore_shard
+
+            # If shard_idx=1+ confirm scheme matches prior shards.
+            elif should_ignore_shard != should_ignore_layer:
+                raise ValueError(
+                    f"Found a different quantization schemes for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
+
+    # Unfused layers like down_proj and o_proj will match
+    # the safetensors checkpoint already.
+    else:
+        should_ignore_layer = check_equal_or_regex_match(
+            layer_name=layer_name, targets=ignore
+        )
+
+    assert should_ignore_layer is not None
+
+    return should_ignore_layer
+
+
+def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool:
+    """
+    Checks whether a layer_name is exactly equal or a regex match for
+    if target starts with 're:' to any target in list.
+    """
+    for target in targets:
+        if _is_equal_or_regex_match(layer_name, target):
+            return True
+    return False
+
+
+def _is_equal_or_regex_match(
+    value: str, target: str, check_contains: bool = False
+) -> bool:
+    """
+    Checks whether a value is exactly equal or a regex match for target
+    if target starts with 're:'. If check_contains is set to True,
+    additionally checks if the target string is contained within the value.
+    """
+
+    if target.startswith("re:"):
+        pattern = target[3:]
+        if re.match(pattern, value):
+            return True
+    elif check_contains:
+        if target.lower() in value.lower():
+            return True
+    elif target == value:
+        return True
+    return False
+
+
+# utility for tensor dims > 2 cases
+def b_dynamic_mxfp4_quant(x):
+    h, b, d = x.shape
+    x, x_scales = dynamic_mxfp4_quant(x.reshape(-1, d))
+    return x.view(h, b, d // 2), x_scales.view(h, b, d // 32)
+
+
+def mxfp4_to_f32(x, is_threed):
+    # 2 because we pack fp4 in uint8.
+    x = x.repeat_interleave(2, dim=-1)
+    if is_threed:
+        x[..., ::2] = x[..., ::2] & 0xF
+        x[..., 1::2] = x[..., 1::2] >> 4
+    else:
+        x[:, ::2] = x[:, ::2] & 0xF
+        x[:, 1::2] = x[:, 1::2] >> 4
+
+    mxfp4_list = [
+        0.0,
+        0.5,
+        1.0,
+        1.5,
+        2.0,
+        3.0,
+        4.0,
+        6.0,
+        -0.0,
+        -0.5,
+        -1.0,
+        -1.5,
+        -2.0,
+        -3.0,
+        -4.0,
+        -6.0,
+    ]
+    mxfp4_in_f32 = torch.tensor(mxfp4_list, dtype=torch.float32, device="cuda")
+    return mxfp4_in_f32[x.long()]
+
+
+def e8m0_to_f32(x):
+    # Convert the input tensor `x` (assumed to be in e8m0 format) to float32.
+    # e8m0 is a custom 8-bit floating point format with 8 bits for exponent, 0 for mantissa.
+    # This means the value is essentially 2^(exponent - 127), similar to how IEEE-754 stores floats.
+
+    # Convert x to float32 for computation, and compute the power of 2 by subtracting the bias (127).
+    x_f32 = 2 ** ((x.to(torch.float32)) - 127)
+
+    # If the exponent value was 255 (i.e., 2^(128)), this is a special case usually used to represent NaN or Inf.
+    # Since this custom format has no mantissa, treat 2^128 as NaN.
+    x_f32[x_f32 == 128] = float("nan")
+    return x_f32
+
+
+def quark_post_load_weights(self_attn: nn.Module, w: torch.Tensor, quant_format: str):
+    if "mxfp4" in quant_format:
+        # when dtype is bf16, the processing flow is to dynamic quantize bf16 tensor to uint8 tensor
+        # do w_kc (bf16) first to get the w_kc(uint8) w_s_kc(uint8)
+        # and w_vc repeating the same procedure of w_kc to get  w_vc(uint8) w_s_vc(uint8)
+        if w.dtype == torch.bfloat16:
+            w_kc, w_vc = w.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            w_kc, w_s_kc = b_dynamic_mxfp4_quant(w_kc.transpose(-2, -1))
+            w_kc = w_kc.transpose(-2, -1)
+            w_s_kc = w_s_kc.transpose(-2, -1)
+            w_vc, w_s_vc = b_dynamic_mxfp4_quant(w_vc)
+            w_s_kc = w_s_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            w_s_vc = w_s_vc.contiguous().transpose(1, 2)
+        elif w.dtype == torch.uint8:  # static quant for mxfp4
+            # when dtype is uint8, it means the w has been quantized to mxfp4 format
+            # but we must separate it to w_kc and w_vc.
+            # The quantized tensor size is only half of original tensor size
+            # and the scaling factor is 1/32, the transpose behavior will be not correct
+            # need to upcast it to fp32 to separate w to w_kc and w_vc
+            # to ensure the following transpose behavior is correct
+            # and then do mxfp4 quant again
+            w = mxfp4_to_f32(w, True).to(torch.bfloat16)
+            w_scales = self_attn.kv_b_proj.weight_scale.repeat_interleave(32, dim=-1)
+            w_scales = e8m0_to_f32(w_scales).to(torch.bfloat16)
+            w = w * w_scales
+            w_kc, w_vc = w.unflatten(
+                0, (-1, (self_attn.qk_nope_head_dim + self_attn.v_head_dim))
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            w_kc, w_s_kc = b_dynamic_mxfp4_quant(w_kc.transpose(-2, -1))
+            w_kc = w_kc.transpose(-2, -1)
+            w_s_kc = w_s_kc.transpose(-2, -1)
+            w_vc, w_s_vc = b_dynamic_mxfp4_quant(w_vc)
+            w_s_kc = w_s_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            w_s_vc = w_s_vc.contiguous().transpose(1, 2)
+
+        return w_kc, w_s_kc, w_vc, w_s_vc
diff --git a/python/sglang/srt/layers/quantization/rocm_mxfp4_utils.py b/python/sglang/srt/layers/quantization/rocm_mxfp4_utils.py
new file mode 100644
index 000000000..4659f76bd
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/rocm_mxfp4_utils.py
@@ -0,0 +1,13 @@
+from aiter.ops.triton.batched_gemm_afp4wfp4_pre_quant import (
+    batched_gemm_afp4wfp4_pre_quant,
+)
+from aiter.ops.triton.fused_mxfp4_quant import (
+    fused_flatten_mxfp4_quant,
+    fused_rms_mxfp4_quant,
+)
+
+__all__ = [
+    "fused_rms_mxfp4_quant",
+    "fused_flatten_mxfp4_quant",
+    "batched_gemm_afp4wfp4_pre_quant",
+]
diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py
new file mode 100644
index 000000000..495beb009
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/unquant.py
@@ -0,0 +1,468 @@
+from __future__ import annotations
+
+import importlib.util
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+
+from sglang.srt.custom_op import CustomOp
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizeMethodBase,
+)
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_hip,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+has_triton_kernels = importlib.util.find_spec("triton_kernels") is not None
+
+
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_hip = is_hip()
+_is_cpu = is_cpu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+    from aiter import ActivationType
+    from aiter.fused_moe import fused_moe
+    from aiter.ops.shuffle import shuffle_weight
+
+
+class UnquantizedEmbeddingMethod(QuantizeMethodBase):
+    """Unquantized method for embeddings."""
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Create weights for embedding layer."""
+        weight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, extra_weight_attrs)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return F.linear(x, layer.weight, bias)
+
+    def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
+        return F.embedding(input_, layer.weight)
+
+
+class UnquantizedLinearMethod(LinearMethodBase):
+    """Linear method without quantization."""
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu and _is_cpu_amx_available:
+            _amx_process_weight_after_loading(layer, ["weight"])
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        if use_intel_amx_backend(layer):
+            x_shapes = x.shape
+            if len(x_shapes) == 3:
+                x = x.view(-1, x.shape[-1])
+            output = torch.ops.sgl_kernel.weight_packed_linear(
+                x, layer.weight, bias, True  # is_vnni
+            )
+            if len(x_shapes) == 3:
+                output = output.view(x_shapes[0], x_shapes[1], -1)
+            return output
+
+        return F.linear(x, layer.weight, bias)
+
+
+class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
+    """MoE method without quantization."""
+
+    def __init__(self, use_triton_kernels: bool = False):
+        super().__init__()
+        self.use_triton_kernels = use_triton_kernels
+        self.with_bias = False
+
+        self.triton_kernel_moe_forward = None
+        self.triton_kernel_moe_with_bias_forward = None
+        if torch.cuda.is_available() and has_triton_kernels:
+            from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+                triton_kernel_moe_forward as _tk_forward,
+            )
+            from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+                triton_kernel_moe_with_bias_forward as _tk_with_bias_forward,
+            )
+
+            self.triton_kernel_moe_forward = _tk_forward
+            self.triton_kernel_moe_with_bias_forward = _tk_with_bias_forward
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        with_bias: bool = False,
+        **extra_weight_attrs,
+    ):
+        self.with_bias = with_bias
+
+        # Fused gate_up_proj (column parallel)
+        w13_weight_n, w13_weight_k = 2 * intermediate_size_per_partition, hidden_size
+        if self.use_triton_kernels:
+            w13_weight_n, w13_weight_k = w13_weight_k, w13_weight_n
+        w13_weight = torch.nn.Parameter(
+            torch.empty(num_experts, w13_weight_n, w13_weight_k, dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        if self.with_bias:
+            w13_weight_bias = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_bias", w13_weight_bias)
+            set_weight_attrs(w13_weight_bias, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight_n, w2_weight_k = (
+            hidden_size,
+            intermediate_size_per_partition,
+        )
+        if self.use_triton_kernels:
+            w2_weight_n, w2_weight_k = w2_weight_k, w2_weight_n
+        w2_weight = torch.nn.Parameter(
+            torch.empty(num_experts, w2_weight_n, w2_weight_k, dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        if self.with_bias:
+            w2_weight_bias = torch.nn.Parameter(
+                torch.empty(num_experts, hidden_size, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_bias", w2_weight_bias)
+            set_weight_attrs(w2_weight_bias, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _use_aiter:
+            layer.w13_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+            layer.w2_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+
+        # Pack weight for get better performance on CPU
+        if _is_cpu and _is_cpu_amx_available:
+            _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+
+        return
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        return self.forward(
+            layer=layer,
+            dispatch_output=dispatch_output,
+        )
+
+    def forward_cuda(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+
+        if self.use_triton_kernels:
+            if self.with_bias:
+                assert self.triton_kernel_moe_with_bias_forward is not None
+                output = self.triton_kernel_moe_with_bias_forward(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w2=layer.w2_weight,
+                    b1=layer.w13_weight_bias,
+                    b2=layer.w2_weight_bias,
+                    topk_output=topk_output,
+                    moe_runner_config=moe_runner_config,
+                    w1_pcg=None,
+                    w2_pcg=None,
+                )
+            else:
+                assert self.triton_kernel_moe_forward is not None
+                output = self.triton_kernel_moe_forward(
+                    hidden_states=x,
+                    w1=layer.w13_weight,
+                    w2=layer.w2_weight,
+                    topk_output=topk_output,
+                    moe_runner_config=moe_runner_config,
+                )
+            return StandardCombineInput(hidden_states=output)
+        else:
+            if _use_aiter:
+                assert not moe_runner_config.no_combine, "unsupported"
+                topk_weights, topk_ids, _ = topk_output
+                if moe_runner_config.apply_router_weight_on_input:
+                    assert (
+                        topk_weights.dim() == 2
+                    ), "`topk_weights` should be in shape (num_tokens, topk)"
+                    _, topk = topk_weights.shape
+                    assert (
+                        topk == 1
+                    ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+                    x = x * topk_weights.to(x.dtype)
+                    topk_weights = torch.ones_like(
+                        topk_weights, dtype=torch.float32
+                    )  # topk_weights must be FP32 (float32)
+                output = fused_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    activation=(
+                        ActivationType.Silu
+                        if moe_runner_config.activation == "silu"
+                        else ActivationType.Gelu
+                    ),
+                )
+                return StandardCombineInput(hidden_states=output)
+            else:
+
+                quant_info = TritonMoeQuantInfo(
+                    w13_weight=layer.w13_weight,
+                    w2_weight=layer.w2_weight,
+                    b13=getattr(layer, "w13_weight_bias", None),
+                    b2=getattr(layer, "w2_weight_bias", None),
+                )
+                return self.runner.run(dispatch_output, quant_info)
+
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+
+        assert (
+            moe_runner_config.activation == "silu"
+        ), f"activation = {moe_runner_config.activation} is not supported."
+
+        if (
+            use_intel_amx_backend(layer)
+            and not moe_runner_config.apply_router_weight_on_input
+        ):
+            from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
+
+            topk_weights, topk_ids, _ = topk_output
+            x, topk_weights = apply_topk_weights_cpu(
+                moe_runner_config.apply_router_weight_on_input, topk_weights, x
+            )
+            output = torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace # See [Note] inplace should be False in fused_experts.
+                False,  # use_int8_w8a8
+                False,  # use_fp8_w8a16
+                None,  # w1_scale
+                None,  # w2_scale
+                None,  # block_size
+                None,  # a1_scale
+                None,  # a2_scale
+                True,  # is_vnni
+            )
+            return StandardCombineInput(hidden_states=output)
+        else:
+            from sglang.srt.layers.moe.fused_moe_native import moe_forward_native
+
+            output = moe_forward_native(
+                layer,
+                x,
+                topk_output,
+                moe_runner_config,
+            )
+            return StandardCombineInput(hidden_states=output)
+
+    def forward_npu(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        import torch_npu
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_weights, topk_ids, _ = dispatch_output.topk_output
+
+        original_dtype = x.dtype
+        num_tokens = x.shape[0]
+        topk_weights = topk_weights.to(x.dtype)
+        topk_ids = topk_ids.to(torch.int32)
+        num_experts = layer.num_experts
+        top_k = layer.top_k
+        row_idx_len = num_tokens * top_k
+        row_idx = (
+            torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device)
+            .view(top_k, -1)
+            .permute(1, 0)
+            .contiguous()
+        )
+
+        hidden_states, expanded_row_idx, expanded_expert_idx = (
+            torch_npu.npu_moe_init_routing(
+                x, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens
+            )
+        )
+
+        expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
+            expanded_expert_idx, num_experts
+        )
+
+        expert_tokens = expert_tokens.to(torch.int64)
+        if layer.w13_weight.shape[-1] == layer.hidden_size:
+            w13 = layer.w13_weight.transpose(1, 2)
+            w2 = layer.w2_weight.transpose(1, 2)
+
+        # gmm1: gate_up_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[w13],
+            split_item=2,
+            group_list_type=0,
+            group_type=0,
+            group_list=expert_tokens,
+            output_dtype=original_dtype,
+        )[0]
+
+        # act_fn:
+        if self.moe_runner_config.activation == "silu":
+            hidden_states = torch_npu.npu_swiglu(hidden_states)
+        else:
+            from sglang.srt.layers.activation import GeluAndMul
+
+            hidden_states = GeluAndMul()(hidden_states)
+
+        # gmm2: down_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[w2],
+            split_item=2,
+            group_list_type=0,
+            group_type=0,
+            group_list=expert_tokens,
+            output_dtype=original_dtype,
+        )[0]
+
+        final_hidden_states = torch_npu.npu_moe_finalize_routing(
+            hidden_states,
+            skip1=None,
+            skip2=None,
+            bias=None,
+            scales=topk_weights,
+            expanded_src_to_dst_row=expanded_row_idx,
+            export_for_source_row=topk_ids,
+        )
+
+        return StandardCombineInput(hidden_states=final_hidden_states)
+
+    def forward_tpu(self, *args, **kwargs) -> CombineInput:
+        raise NotImplementedError("The TPU backend currently does not support MoE.")
+
+    forward_native = forward_cpu
diff --git a/python/sglang/srt/layers/quantization/utils.py b/python/sglang/srt/layers/quantization/utils.py
new file mode 100644
index 000000000..63b8b6eb7
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/utils.py
@@ -0,0 +1,564 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
+
+from __future__ import annotations
+
+import re
+from copy import deepcopy
+from types import MappingProxyType
+from typing import TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Union
+
+import numpy
+import torch
+
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from sglang.srt.utils import is_cuda
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+
+def get_scalar_types():
+    """
+    Returns:
+        tuple: (ScalarType, scalar_types)
+    """
+    try:
+        from sgl_kernel.scalar_type import ScalarType, scalar_types
+
+        return ScalarType, scalar_types
+    except ImportError:
+
+        class MockScalarType:
+            pass
+
+        class MockScalarTypes:
+            uint4b8 = "uint4b8"
+            uint8b128 = "uint8b128"
+
+            def __getattr__(self, name):
+                return f"mock_{name}"
+
+        return MockScalarType, MockScalarTypes()
+
+
+ScalarType, scalar_types = get_scalar_types()
+
+
+def is_layer_skipped(
+    prefix: str,
+    ignored_layers: List[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({}),
+) -> bool:
+    # prefix: model.layers.0.self_attn.q_proj
+    # proj_name: q_proj
+    proj_name = prefix.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping:
+        shard_prefixes = [
+            prefix.replace(proj_name, shard_proj_name)
+            for shard_proj_name in fused_mapping[proj_name]
+        ]
+
+        is_skipped = None
+        for shard_prefix in shard_prefixes:
+            is_shard_skipped = shard_prefix in ignored_layers
+
+            if is_skipped is None:
+                is_skipped = is_shard_skipped
+            elif is_shard_skipped != is_skipped:
+                raise ValueError(
+                    f"Detected some but not all shards of {prefix} "
+                    "are quantized. All shards of fused layers "
+                    "to have the same precision."
+                )
+    else:
+        is_skipped = prefix in ignored_layers
+        if "gate_up_proj" in prefix:
+            prefix_gate = prefix.replace("gate_up_proj", "gate_proj")
+            prefix_up = prefix.replace("gate_up_proj", "up_proj")
+            if prefix_gate in ignored_layers and prefix_up in ignored_layers:
+                is_skipped = True
+        elif "experts" in prefix:
+            is_skipped = any(
+                [
+                    prefix in layer_name
+                    for layer_name in ignored_layers
+                    if "experts" in layer_name
+                ]
+            )
+
+    assert is_skipped is not None
+    return is_skipped
+
+
+def per_tensor_dequantize(
+    tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]
+) -> torch.Tensor:
+    fake_qweight = tensor.to(torch.float16)
+    dq_weight = fake_qweight * inv_scale
+    return dq_weight
+
+
+def all_close_1d(x: torch.Tensor) -> bool:
+    assert len(x.shape) == 1
+    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
+
+
+def convert_to_channelwise(
+    weight_scale: torch.Tensor, logical_widths: List[int]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Create channelwise buffer
+    weight_scale_channel = torch.empty(
+        (sum(logical_widths), 1), dtype=torch.float32, device=weight_scale.device
+    )
+
+    # Handle scalar tensor case: broadcast same scale to all channels
+    if weight_scale.dim() == 0:
+        weight_scale_channel.fill_(weight_scale.item())
+        return weight_scale_channel
+
+    # Expand each scale to match the size of each logical matrix.
+    start = 0
+    for idx, logical_width in enumerate(logical_widths):
+        end = start + logical_width
+        weight_scale_channel[start:end, :] = weight_scale[idx]
+        start = end
+
+    return weight_scale_channel
+
+
+def requantize_with_max_scale(
+    weight: torch.Tensor, weight_scale: torch.Tensor, logical_widths: List[int]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Max scale to be used for requanitzation.
+    max_w_scale = weight_scale.max()
+
+    # QKV / MLP is fused in the on disk checkpoint if any of the
+    # weight scales are still set to the default since we initialize
+    # N weight scales for N shards but we only load 1 weight scale
+    # from disk in this case. Skip requantization in this case (since)
+    # we already are quantized with the single scale.
+    # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
+    unfused_module_in_checkpoint = (
+        weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min
+    )
+
+    # If unfused checkpoint, need requanize with the single scale.
+    if unfused_module_in_checkpoint:
+        start = 0
+        for idx, logical_width in enumerate(logical_widths):
+            end = start + logical_width
+            weight_dq = per_tensor_dequantize(weight[start:end, :], weight_scale[idx])
+            weight[start:end, :], _ = scaled_fp8_quant(weight_dq, max_w_scale)
+            start = end
+
+    return max_w_scale, weight
+
+
+def update_tensor_inplace(old: torch.Tensor, new: torch.Tensor) -> None:
+    old.copy_(new)
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/layer_utils.py
+# Newly generated tensors need to replace existing tensors that are
+# already registered as parameters by vLLM (and won't be freed)
+def replace_parameter(
+    mod: torch.nn.Module, name: str, new: Union[torch.Tensor, torch.nn.Parameter]
+) -> None:
+
+    old = getattr(mod, name)
+    if (
+        type(old) is type(new)
+        and old.dtype == new.dtype
+        and old.untyped_storage().nbytes() == new.untyped_storage().nbytes()
+    ):
+        # If we can just update in-place to avoid re-registering
+        #   can be faster if the underlying storage is the same
+        update_tensor_inplace(old, new)
+    else:
+        # Fallback re-register parameter, convert to Parameter if necessary
+        # this not only ensures we don't register a tensor as a parameter, but
+        # also ensures that all parameter subclasses get re-registered as
+        # parameters for `torch.compile` compatibility
+        if not isinstance(new, torch.nn.Parameter):
+            new = torch.nn.Parameter(new, requires_grad=False)
+        mod.register_parameter(name, torch.nn.Parameter(new, requires_grad=False))
+
+
+def assert_fp8_all_close(a: torch.Tensor, b: torch.Tensor):
+    assert a.shape == b.shape
+    assert a.dtype == b.dtype == torch.float8_e4m3fn
+
+    a_u8 = a.view(torch.uint8)
+    b_u8 = b.view(torch.uint8)
+    diff_u8 = (a_u8.to(torch.int16) - b_u8.to(torch.int16)).abs()
+
+    numel = a.numel()
+
+    count_diff_sign = ((a_u8 >= 0) & (b_u8 < 0)).sum().item()
+    count_tiny_diff = (diff_u8 >= 1).sum().item()
+    count_large_diff = (diff_u8 >= 2).sum().item()
+
+    assert (
+        (count_diff_sign == 0)
+        and (count_tiny_diff / numel < 0.005)
+        and (count_large_diff == 0)
+    ), f"{count_diff_sign=} {count_tiny_diff=} {count_large_diff=} {numel=}"
+
+
+# Match dynamic rules with module name (prefix) and override quantize
+# config if module (prefix) matches a rule
+def override_config(config: QuantizationConfig, prefix: str):
+    weight_bits = get_dynamic_override(config, prefix, "bits", config.weight_bits)
+    if isinstance(weight_bits, int):
+        config.weight_bits = weight_bits
+    group_size = get_dynamic_override(config, prefix, "group_size", config.group_size)
+    if isinstance(group_size, int):
+        config.group_size = group_size
+    desc_act = get_dynamic_override(config, prefix, "desc_act", config.desc_act)
+    if isinstance(desc_act, bool):
+        config.desc_act = desc_act
+
+    config.pack_factor = 32 // config.weight_bits  # packed into int32
+    if config.get_name() == "gptq_marlin":
+        is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym)
+        if isinstance(is_sym, bool):
+            config.is_sym = is_sym
+
+        if (config.weight_bits, config.is_sym) not in config.TYPE_MAP:
+            raise ValueError(
+                "Unsupported quantization config: "
+                f"bits={config.weight_bits}, sym={config.is_sym}"
+            )
+
+        config.quant_type = config.TYPE_MAP[(config.weight_bits, config.is_sym)]
+    elif config.get_name() == "gptq":
+        if config.weight_bits not in [2, 3, 4, 8]:
+            raise ValueError(
+                "Currently, only 2/3/4/8-bit weight quantization is "
+                f"supported for GPTQ, but got {config.weight_bits} bits."
+            )
+
+
+def get_dynamic_override(
+    config: QuantizationConfig,
+    layer_name: str,
+    key: Optional[str] = None,
+    default_value: Union[int, bool, None] = None,
+) -> Union[Dict, int, bool, None]:
+    for pattern, pattern_dict in config.dynamic.items():
+        # Negative match: matched modules are excluded from quantized init
+        if pattern.startswith("-:"):
+            if re.match(pattern.removeprefix("-:"), layer_name):
+                return False
+        # Positive match: matched modules have quant properties overrides
+        # base quant config
+        elif re.match(pattern.removeprefix("+:"), layer_name):
+            if key is None:
+                return pattern_dict
+            else:
+                return pattern_dict.get(key, default_value)
+    return default_value
+
+
+def get_linear_quant_method(
+    config: QuantizationConfig,
+    layer: torch.nn.Module,
+    prefix: str,
+    linear_method_cls: type,
+):
+    from sglang.srt.layers.linear import LinearBase
+    from sglang.srt.layers.quantization.unquant import (
+        UnquantizedEmbeddingMethod,
+        UnquantizedLinearMethod,
+    )
+    from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+    cloned_config = deepcopy(config)
+    parallel_lm_head_quantized = (
+        isinstance(layer, ParallelLMHead) and cloned_config.lm_head_quantized
+    )
+
+    if isinstance(layer, LinearBase) or parallel_lm_head_quantized:
+        # False = skip module, None = no override, else = Positive match
+        if get_dynamic_override(cloned_config, layer_name=prefix) is False:
+            if parallel_lm_head_quantized:
+                return UnquantizedEmbeddingMethod()
+            return UnquantizedLinearMethod()
+
+        if prefix:
+            # Dynamic per module/layer rules may override base config
+            override_config(cloned_config, prefix=prefix)
+
+        return linear_method_cls(cloned_config)
+    return None
+
+
+def get_pack_factor(num_bits):
+    assert 32 % num_bits == 0, f"Unsupported num_bits = {num_bits}"
+    return 32 // num_bits
+
+
+def permute_rows(
+    q_w: torch.Tensor,
+    w_ref: torch.Tensor,
+    group_size: int,
+    test_perm: Optional[torch.Tensor] = None,
+):
+    assert q_w.shape == w_ref.shape
+
+    orig_device = q_w.device
+    k_size, _ = q_w.shape
+
+    g_idx = torch.zeros((k_size,), dtype=torch.int32)
+    for i in range(k_size):
+        g_idx[i] = i // group_size
+
+    # Simulate act_order by doing a random permutation on K
+    rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)
+
+    g_idx = g_idx[rand_perm].contiguous()
+    q_w = q_w[rand_perm, :].contiguous()
+    w_ref = w_ref[rand_perm, :].contiguous()
+
+    return (
+        w_ref.to(device=orig_device),
+        q_w.to(device=orig_device),
+        g_idx.to(device=orig_device),
+        rand_perm.to(device=orig_device),
+    )
+
+
+def pack_cols(
+    q_w: torch.Tensor,
+    num_bits: int,
+    size_k: int,
+    size_n: int,
+):
+    assert q_w.shape == (size_k, size_n)
+
+    pack_factor = get_pack_factor(num_bits)
+    assert size_n % pack_factor == 0
+
+    orig_device = q_w.device
+
+    q_w = q_w.cpu().numpy().astype(numpy.uint32)
+
+    q_res = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32)
+
+    for i in range(pack_factor):
+        q_res |= q_w[:, i::pack_factor] << num_bits * i
+
+    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
+    q_res = q_res.contiguous()
+
+    return q_res
+
+
+def pack_rows(
+    q_w: torch.Tensor,
+    num_bits: int,
+    size_k: int,
+    size_n: int,
+):
+    assert q_w.shape == (size_k, size_n)
+
+    pack_factor = get_pack_factor(num_bits)
+    assert size_k % pack_factor == 0
+
+    orig_device = q_w.device
+
+    q_w = q_w.cpu().numpy().astype(numpy.uint32)
+
+    q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)
+
+    for i in range(pack_factor):
+        q_res |= q_w[i::pack_factor, :] << num_bits * i
+
+    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
+    return q_res
+
+
+def unpack_cols(
+    packed_q_w: torch.Tensor,
+    num_bits: int,
+    size_k: int,
+    size_n: int,
+):
+    pack_factor = get_pack_factor(num_bits)
+    assert size_n % pack_factor == 0
+    assert packed_q_w.shape == (
+        size_k,
+        size_n // pack_factor,
+    ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
+        packed_q_w.shape, size_k, size_n, pack_factor
+    )
+
+    orig_device = packed_q_w.device
+
+    packed_q_w_cpu = packed_q_w.cpu().numpy().astype(numpy.uint32)
+    q_res = numpy.zeros((size_k, size_n), dtype=numpy.uint32)
+
+    mask = (1 << num_bits) - 1
+    for i in range(pack_factor):
+        vals = packed_q_w_cpu & mask
+        packed_q_w_cpu >>= num_bits
+        q_res[:, i::pack_factor] = vals
+
+    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
+    q_res = q_res.contiguous()
+
+    return q_res
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
+def quantize_weights(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: Optional[int],
+    zero_points: bool = False,
+    ref_zero_points_after_scales: bool = False,
+):
+    assert (
+        quant_type.is_integer()
+    ), "Floating point quantization may work but has not been tested"
+    assert not zero_points or group_size is not None, (
+        "to have group zero points, group_size must be provided "
+        "(-1 group_size is channelwise)"
+    )
+
+    orig_device = w.device
+    orig_type = w.dtype
+    size_k, size_n = w.shape
+
+    assert w.is_floating_point(), "w must be float"
+
+    if group_size == -1:
+        group_size = size_k
+
+    # Reshape to [groupsize, -1]
+    if group_size is not None and group_size < size_k:
+        w = w.reshape((-1, group_size, size_n))
+        w = w.permute(1, 0, 2)
+        w = w.reshape((group_size, -1))
+
+    # Compute scale for each group
+    max_val = torch.max(w, 0, keepdim=True).values
+    min_val = torch.min(w, 0, keepdim=True).values
+
+    max_q_val = quant_type.max()
+    min_q_val = quant_type.min()
+
+    w_s = torch.Tensor([1.0]).to(w.device)  # unscaled case
+    maybe_w_zp = None
+    if group_size is not None:
+        if zero_points:
+            assert not quant_type.is_signed() and quant_type.max() > 0
+            w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max()
+            maybe_w_zp = (
+                torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int()
+            )
+        else:
+            # If the bias is such that there are no possible negative/positive
+            #  values, set the max value to inf to avoid divide by 0
+            w_s = torch.max(
+                abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)),
+                abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)),
+            )
+
+    # Quantize
+    w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0)
+    w_q = torch.clamp(w_q, min_q_val, max_q_val)
+
+    # Compute ref (dequantized)
+    # For some kernels (namely Machete) the zero-points are applied after the
+    # scales are applied, for this case computing the reference in similar way
+    # allows us to use tighter error tolerances in our unit tests.
+    if ref_zero_points_after_scales and maybe_w_zp is not None:
+        w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s
+    else:
+        w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s
+
+    if quant_type.has_bias():
+        w_q += quant_type.bias
+
+    # Restore original shapes
+    if group_size is not None and group_size < size_k:
+
+        def reshape_w(w):
+            w = w.reshape((group_size, -1, size_n))
+            w = w.permute(1, 0, 2)
+            w = w.reshape((size_k, size_n)).contiguous()
+            return w
+
+        w_q = reshape_w(w_q)
+        w_ref = reshape_w(w_ref)
+        w_s = w_s.reshape((-1, size_n)).contiguous()
+
+    if maybe_w_zp is not None:
+        maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous()
+        maybe_w_zp = maybe_w_zp.to(device=orig_device)
+
+    return (
+        w_ref.to(device=orig_device),
+        w_q.to(device=orig_device),
+        w_s if group_size is not None else None,
+        maybe_w_zp,
+    )
+
+
+SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
+SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+
+
+def gptq_quantize_weights(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: int,
+    act_order: bool,
+    test_perm: Optional[torch.Tensor] = None,
+):
+    size_k, _ = w.shape
+
+    assert w.is_floating_point(), "w must be float"
+    assert (
+        quant_type in SUPPORTED_GPTQ_QUANT_TYPES
+    ), f"Unsupported gptq type = {quant_type}"
+    assert group_size in SUPPORTED_GROUP_SIZES + [
+        size_k
+    ], f"Unsupported groupsize = {group_size}"
+
+    w_ref, w_q, w_s, _ = quantize_weights(w, quant_type, group_size)
+
+    # Apply act_order
+    g_idx = torch.empty(0, dtype=torch.int, device=w.device)
+    rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
+    if act_order:
+        assert (
+            group_size < size_k
+        ), "For act_order, groupsize = {} must be less than size_k = {}".format(
+            group_size, size_k
+        )
+
+        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size, test_perm)
+
+    return w_ref, w_q, w_s, g_idx, rand_perm
+
+
+def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
+    orig_device = q_w.device
+
+    sort_indices = torch.argsort(g_idx).to(dtype=torch.int32)  # Sort based on g_idx
+
+    g_idx = g_idx[sort_indices].contiguous()
+    q_w = q_w[sort_indices, :].contiguous()
+
+    return (
+        q_w.to(device=orig_device),
+        g_idx.to(device=orig_device),
+        sort_indices.to(device=orig_device),
+    )
diff --git a/python/sglang/srt/layers/quantization/w4afp8.py b/python/sglang/srt/layers/quantization/w4afp8.py
new file mode 100644
index 000000000..e95247041
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/w4afp8.py
@@ -0,0 +1,346 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import is_npu, set_weight_attrs
+
+_is_npu = is_npu()
+if not _is_npu:
+    from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunnerConfig
+    from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = logging.getLogger(__name__)
+
+
+class W4AFp8Config(QuantizationConfig):
+    """Config class for MIXED_PRECISION W4AFp8."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = True,
+        is_checkpoint_w4afp8_serialized: bool = True,
+        linear_activation_scheme: str = "dynamic",
+        moe_activation_scheme: str = "static",
+        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: Optional[List[int]] = None,
+        group_size: int = 128,
+    ) -> None:
+        super().__init__()
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        self.is_checkpoint_w4afp8_serialized = is_checkpoint_w4afp8_serialized
+        if is_checkpoint_w4afp8_serialized:
+            logger.warning("Detected w4afp8 checkpoint. Please note that")
+        if moe_activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {moe_activation_scheme}")
+        self.linear_activation_scheme = linear_activation_scheme
+        self.moe_activation_scheme = moe_activation_scheme
+        self.ignored_layers = ignored_layers or []
+        self.weight_block_size = [128, 128]
+        self.group_size = group_size
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "w4afp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.float8_e4m3fn]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 90
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> W4AFp8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = "fp8" in quant_method
+        is_checkpoint_w4afp8_serialized = "w4afp8" in quant_method
+        linear_activation_scheme = "dynamic"
+        moe_activation_scheme = "static"
+        weight_block_size = [128, 128]
+        return cls(
+            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+            is_checkpoint_w4afp8_serialized=is_checkpoint_w4afp8_serialized,
+            linear_activation_scheme=linear_activation_scheme,
+            moe_activation_scheme=moe_activation_scheme,
+            weight_block_size=weight_block_size,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.ignored_layers):
+                return UnquantizedLinearMethod()
+            return Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W4AFp8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+def interleave_scales(scales: torch.Tensor) -> torch.Tensor:
+    """Interleave scales in groups of 4 similar to TRT-LLM implementation."""
+    s_shape = scales.shape
+    # Reshape to separate groups of 4
+    alignment = 4 if s_shape[2] % 4 == 0 else 1
+    scales_interleaved = scales.reshape(
+        s_shape[0], s_shape[1], (s_shape[2] // alignment), alignment
+    )
+    # Permute dimensions to interleave
+    scales_interleaved = scales_interleaved.permute(0, 2, 1, 3)
+    # Reshape back to original dimensions but with interleaved values
+    scales_interleaved = scales_interleaved.reshape(
+        s_shape[0], s_shape[2] // alignment, s_shape[1] * alignment
+    )
+    return scales_interleaved.contiguous()
+
+
+class W4AFp8MoEMethod(FusedMoEMethodBase):
+    def __init__(self, quant_config: W4AFp8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: EPMoE,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        assert "weight_loader" in extra_weight_attrs
+
+        # Fused gate_up_proj (column parallel)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition * 2,
+                hidden_size // 2,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // 2,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
+        )
+        w13_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.quant_config.group_size,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # Input scales
+        w13_input_scale = torch.nn.Parameter(
+            torch.ones((num_experts, 2), dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+        set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+        w2_input_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+        set_weight_attrs(w2_input_scale, extra_weight_attrs)
+
+        # Pre-populate the strides
+        device = layer.w13_weight.device
+
+        self.a_strides1 = torch.full(
+            (num_experts, 3),
+            hidden_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides1 = torch.full(
+            (num_experts, 3),
+            2 * intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.a_strides2 = torch.full(
+            (num_experts, 3),
+            intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides2 = torch.full(
+            (num_experts, 3),
+            hidden_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.b_strides1 = self.a_strides1
+        self.s_strides13 = self.c_strides1
+        self.b_strides2 = self.a_strides2
+        self.s_strides2 = self.c_strides2
+
+        self.expert_offsets = torch.empty(
+            (num_experts + 1), dtype=torch.int32, device=device
+        )
+        self.problem_sizes1 = torch.empty(
+            (num_experts, 3), dtype=torch.int32, device=device
+        )
+        self.problem_sizes2 = torch.empty(
+            (num_experts, 3), dtype=torch.int32, device=device
+        )
+
+        return
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        dtype = torch.bfloat16
+        device = layer.w2_weight.device
+
+        # Interleave w13_weight_scale (gate_up_proj)
+        w13_weight_scale = layer.w13_weight_scale_inv.to(dtype)
+        w13_weight_scale = interleave_scales(w13_weight_scale)
+        layer.w13_weight_scale_inv = Parameter(w13_weight_scale, requires_grad=False)
+
+        # Interleave w2_weight_scale (down_proj)
+        w2_weight_scale = layer.w2_weight_scale_inv.to(dtype)
+        w2_weight_scale = interleave_scales(w2_weight_scale)
+        layer.w2_weight_scale_inv = Parameter(w2_weight_scale, requires_grad=False)
+
+        # Process input scales
+        w13_input_scale_max = layer.w13_input_scale.max().to(dtype).item()
+        new_w13_input_scale = torch.tensor(
+            [w13_input_scale_max],
+            dtype=dtype,
+            device=device,
+        )
+        layer.w13_input_scale = Parameter(new_w13_input_scale, requires_grad=False)
+
+        w2_input_scale_max = layer.w2_input_scale.max().to(dtype).item()
+        new_w2_input_scale = torch.tensor(
+            [w2_input_scale_max], dtype=dtype, device=device
+        )
+        layer.w2_input_scale = Parameter(new_w2_input_scale, requires_grad=False)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: EPMoE,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        local_topk_ids = topk_ids
+        if get_moe_expert_parallel_world_size() > 1:
+            local_topk_ids = torch.where(
+                topk_ids == -1,
+                layer.num_experts,
+                topk_ids,
+            )
+
+        output = cutlass_w4a8_moe(
+            layer.start_expert_id,
+            layer.end_expert_id,
+            layer.num_experts,
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            layer.w13_weight_scale_inv,
+            layer.w2_weight_scale_inv,
+            topk_weights,
+            topk_ids,
+            local_topk_ids,
+            self.a_strides1,
+            self.b_strides1,
+            self.c_strides1,
+            self.a_strides2,
+            self.b_strides2,
+            self.c_strides2,
+            self.s_strides13,
+            self.s_strides2,
+            self.expert_offsets,
+            self.problem_sizes1,
+            self.problem_sizes2,
+            layer.w13_input_scale,
+            layer.w2_input_scale,
+        )
+        if self.moe_runner_config.routed_scaling_factor is not None:
+            output *= self.moe_runner_config.routed_scaling_factor
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/quantization/w8a8_fp8.py b/python/sglang/srt/layers/quantization/w8a8_fp8.py
new file mode 100644
index 000000000..c68591fc6
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/w8a8_fp8.py
@@ -0,0 +1,305 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    is_fp8_fnuz,
+    per_token_group_quant_fp8,
+)
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    cutlass_fp8_supported,
+    input_to_float8,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_fp8_fnuz = is_fp8_fnuz()
+
+
+class W8A8Fp8Config(QuantizationConfig):
+    """Config class for W8A8 FP8 Quantization.
+
+    Weight Quantization:
+    - Method: Static quantization
+    - Granularity: Per-channel
+    - Type: Symmetric
+
+    Activation Quantization:
+    - Method: Dynamic quantization
+    - Granularity: Per-token
+    - Type: Symmetric
+
+    Note:
+    - For models without offline quantization, weights will be quantized during model loading
+    - If CUTLASS is supported: Per-channel weight quantization is used
+    - If CUTLASS is not supported: Falls back to per-tensor weight quantization
+    """
+
+    def __init__(self, is_checkpoint_fp8_serialized: bool = False):
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89
+
+    @classmethod
+    def get_name(self) -> str:
+        return "w8a8_fp8"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> W8A8Fp8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = (
+            "compressed-tensors" in quant_method or "w8a8_fp8" in quant_method
+        )
+        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            return W8A8Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W8A8FP8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class W8A8Fp8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: W8A8Fp8Config):
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight = layer.weight
+
+        if self.quantization_config.is_checkpoint_fp8_serialized:
+            weight_scale = layer.weight_scale.detach()
+            # If checkpoint offline quantized with w8a8_fp8, load the weight and weight_scale directly.
+            if _is_fp8_fnuz:
+                weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight, weight_scale=weight_scale
+                )
+
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+        else:
+            # If checkpoint not offline quantized, quantize the weights with per-channel quantization.
+            if self.cutlass_fp8_supported:
+                # if cutlass supported, we use cutlass_scaled_mm
+                # which requires per-channel quantization on weight
+                qweight, weight_scale = per_token_group_quant_fp8(
+                    layer.weight, layer.weight.shape[-1]
+                )
+                weight_scale = weight_scale.t().contiguous()
+            else:
+                # if cutlass not supported, we fall back to use torch._scaled_mm
+                # which requires per tensor quantization on weight
+                qweight, weight_scale = input_to_float8(layer.weight, dtype=fp8_dtype)
+
+            # Update the layer with the new values.
+            layer.weight = Parameter(qweight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            layer.input_scale = None
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quantization_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        self.logical_widths = output_partition_sizes
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        if self.quantization_config.is_checkpoint_fp8_serialized:
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("weight_scale", weight_scale)
+        else:
+            layer.weight_scale = None
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        return apply_fp8_linear(
+            x,
+            layer.weight,
+            layer.weight_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+        )
+
+
+class W8A8FP8MoEMethod(FusedMoEMethodBase):
+    """MoE method for FP8.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: W8A8Fp8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=fp8_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=fp8_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        w13_input_scale = None
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = None
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.w13_weight = Parameter(layer.w13_weight, requires_grad=False)
+        layer.w2_weight = Parameter(layer.w2_weight, requires_grad=False)
+        layer.w13_weight_scale = Parameter(
+            layer.w13_weight_scale.data, requires_grad=False
+        )
+        layer.w2_weight_scale = Parameter(
+            layer.w2_weight_scale.data, requires_grad=False
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_fp8_w8a8=True,
+            per_channel_quant=True,
+            w13_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+        )
+        return self.runner.run(dispatch_output, quant_info)
diff --git a/python/sglang/srt/layers/quantization/w8a8_int8.py b/python/sglang/srt/layers/quantization/w8a8_int8.py
new file mode 100644
index 000000000..5ccb0259d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/w8a8_int8.py
@@ -0,0 +1,1034 @@
+from __future__ import annotations
+
+import importlib
+import sys
+from types import MappingProxyType
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import (
+    apply_module_patch,
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_npu,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+if _is_cuda:
+    from sgl_kernel import int8_scaled_mm
+_is_npu = is_npu()
+
+if _is_npu:
+    import torch_npu
+
+    try:
+        from mindie_turbo import _ops as ops
+        from mindie_turbo.quantize.quant_utils import quant_per_tensor
+    except ImportError:
+        useMindIETurbo = False
+    else:
+        useMindIETurbo = True
+
+
+# func refers to RMSNorm.__init__
+def npu_wrapper_rmsnorm_init(func):
+    def init(self, hidden_size: int, **extra_args) -> None:
+        func(self, hidden_size, **extra_args)
+        self.ignore_anti = True
+        # The Ascend w8a8_int8 quantization requires adding a bias in rmsnorm
+        self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False)
+
+    return init
+
+
+# func refers to RMSNorm.forward_oot
+def npu_wrapper_rmsnorm_forward(func):
+    def _rmsnorm_forward_oot(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if not x.is_contiguous():
+            x = x.contiguous()
+        if residual is not None:
+            out, _, residual_out = torch_npu.npu_add_rms_norm(
+                residual, x, self.weight.data, self.variance_epsilon
+            )
+            out = out + self.bias
+            return out.to(x.dtype), residual_out
+
+        out = torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+        out = out + self.bias
+        return out.to(x.dtype)
+
+    return _rmsnorm_forward_oot
+
+
+def npu_fused_experts(
+    hidden_states: torch.Tensor,
+    w13: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    top_k: int,
+):
+    original_shape = hidden_states.shape
+    original_dtype = hidden_states.dtype
+    scale_dtype = original_dtype if original_dtype == torch.bfloat16 else torch.float32
+    if len(original_shape) == 3:
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+    num_tokens = hidden_states.shape[0]
+    num_experts = w13.shape[0]
+    row_idx_len = num_tokens * top_k
+    row_idx = (
+        torch.arange(0, row_idx_len, dtype=torch.int32, device=topk_weights.device)
+        .view(top_k, -1)
+        .permute(1, 0)
+        .contiguous()
+    )
+    hidden_states, expanded_row_idx, expanded_expert_idx = (
+        torch_npu.npu_moe_init_routing(
+            hidden_states, row_idx=row_idx, expert_idx=topk_ids, active_num=num_tokens
+        )
+    )
+    expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
+        expanded_expert_idx, num_experts
+    )
+    expert_tokens = expert_tokens.to(torch.int64)
+    # gmm1: gate_up_proj
+    hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states)
+    hidden_states = torch_npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w13],
+        scale=[w13_scale.to(scale_dtype)],
+        per_token_scale=[pertoken_scale],
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+    # act_fn: swiglu
+    hidden_states = torch_npu.npu_swiglu(hidden_states)
+    hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states)
+    # gmm2: down_proj
+    hidden_states = torch_npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w2],
+        scale=[w2_scale.to(scale_dtype)],
+        per_token_scale=[pertoken_scale],
+        split_item=2,
+        group_list_type=0,
+        group_type=0,
+        group_list=expert_tokens,
+        output_dtype=original_dtype,
+    )[0]
+
+    final_hidden_states = torch_npu.npu_moe_finalize_routing(
+        hidden_states,
+        skip1=None,
+        skip2=None,
+        bias=None,
+        scales=topk_weights,
+        expanded_src_to_dst_row=expanded_row_idx,
+        export_for_source_row=topk_ids,
+    )
+    if len(original_shape) == 3:
+        final_hidden_states = final_hidden_states.view(original_shape)
+    return final_hidden_states
+
+
+class W8A8Int8Config(QuantizationConfig):
+    """Config class for W8A8 Int8 Quantization.
+
+    - Weight: static, per-channel, symmetric
+    - Activation: dynamic, per-token, symmetric
+    """
+
+    def __init__(self, quant_config: Dict[str, Any] = {}):
+        super().__init__()
+        self.quant_description = quant_config
+        self.is_dynamic = quant_config.get("is_dynamic", False)
+        ignore = cast(List[str], quant_config.get("ignore", []))
+        self.ignore = ignore if ignore is not None else []
+        packed_modules_mapping = quant_config.get("packed_modules_mapping", {})
+        self.packed_modules_mapping = (
+            packed_modules_mapping if packed_modules_mapping is not None else {}
+        )
+
+        if _is_npu:
+            # Ascend w8a8_int8 quantization with bias, use wrappers to isolate the effects between models
+            for name in self.quant_description.keys():
+                if "norm.bias" in name:
+                    apply_module_patch(
+                        "sglang.srt.layers.layernorm.RMSNorm",
+                        "__init__",
+                        [npu_wrapper_rmsnorm_init],
+                    )
+                    apply_module_patch(
+                        "sglang.srt.layers.layernorm.RMSNorm",
+                        "forward_npu",
+                        [npu_wrapper_rmsnorm_forward],
+                    )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return (
+            [torch.float16, torch.bfloat16]
+            if not _is_npu
+            else [torch.int8, torch.float16, torch.bfloat16]
+        )
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        if _is_npu:
+            raise NotImplementedError(
+                'NPU hardware does not support "get_min_capability" feature.'
+            )
+        else:
+            return 75
+
+    @classmethod
+    def get_name(self) -> str:
+        return "w8a8_int8"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        filenames = []
+        if _is_npu:
+            filenames.append("quant_model_description.json")
+        return filenames
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> W8A8Int8Config:
+        return cls(config)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if _is_npu:
+            if isinstance(layer, LinearBase):
+                key = "model"
+                if "vision_model" in prefix:
+                    key = "vision_model"
+                elif "visual" in prefix:
+                    key = "visual"
+                packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
+                prefix_in_quant_config = prefix
+                proj_name = prefix.split(".")[-1]
+                if proj_name in packed_modules_mapping_subset:
+                    prefix_in_quant_config = prefix.replace(
+                        proj_name, packed_modules_mapping_subset[proj_name][0]
+                    )
+                self.is_dynamic = (
+                    self.quant_description[prefix_in_quant_config + ".weight"]
+                    == "W8A8_DYNAMIC"
+                )
+                if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
+                    return UnquantizedLinearMethod()
+                return (
+                    NPU_W8A8DynamicLinearMethod(self)
+                    if self.is_dynamic
+                    else NPU_W8A8LinearMethod(self)
+                )
+            elif isinstance(layer, FusedMoE):
+                return NPU_W8A8MoEMethod(self)
+            return None
+
+        if should_ignore_layer(
+            prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
+        ):
+            return UnquantizedLinearMethod()
+        if isinstance(layer, LinearBase):
+            return W8A8Int8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W8A8Int8MoEMethod(self)
+        return None
+
+    def is_layer_skipped(
+        self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+    ):
+        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
+        proj_name = prefix.split(".")[-1]
+        if proj_name in fused_mapping:
+            shard_prefixes = [
+                prefix.replace(proj_name, shard_proj_name)
+                for shard_proj_name in fused_mapping[proj_name]
+            ]
+
+            is_skipped = None
+            for shard_prefix in shard_prefixes:
+                is_shard_skipped = (
+                    self.quant_description[shard_prefix + ".weight"] == "FLOAT"
+                )
+
+                if is_skipped is None:
+                    is_skipped = is_shard_skipped
+                elif is_shard_skipped != is_skipped:
+                    raise ValueError(
+                        f"Detected some but not all shards of {prefix} "
+                        "are quantized. All shards of fused layers "
+                        "to have the same precision."
+                    )
+        else:
+            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
+
+        assert is_skipped is not None
+        return is_skipped
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class W8A8Int8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: W8A8Int8Config):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "W8A8Int8LinearMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["weight"])
+        else:
+            layer.weight = Parameter(layer.weight.t(), requires_grad=False)
+        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        self.logical_widths = output_partition_sizes
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        if use_intel_amx_backend(layer):
+            return torch.ops.sgl_kernel.int8_scaled_mm_with_quant(
+                x,
+                layer.weight,
+                layer.weight_scale,
+                bias,
+                x.dtype,
+                True,  # is_vnni
+            )
+
+        x_q, x_scale = per_token_quant_int8(x)
+
+        return int8_scaled_mm(
+            x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
+        )
+
+
+class W8A8Int8MoEMethod(FusedMoEMethodBase):
+    """MoE method for INT8.
+    Supports loading INT8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: W8A8Int8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        tp_size = get_tensor_model_parallel_world_size()
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        w13_input_scale = None
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = None
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "W8A8Int8MoEMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+        else:
+            layer.w13_weight = Parameter(layer.w13_weight, requires_grad=False)
+            layer.w2_weight = Parameter(layer.w2_weight, requires_grad=False)
+        layer.w13_weight_scale = Parameter(
+            layer.w13_weight_scale.data, requires_grad=False
+        )
+        layer.w2_weight_scale = Parameter(
+            layer.w2_weight_scale.data, requires_grad=False
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        if use_intel_amx_backend(layer):
+            from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
+
+            topk_weights, topk_ids, _ = topk_output
+            x, topk_weights = apply_topk_weights_cpu(
+                self.moe_runner_config.apply_router_weight_on_input, topk_weights, x
+            )
+            output = torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace See [Note] inplace should be False in fused_experts.
+                True,  # use_int8_w8a8
+                False,  # use_fp8_w8a16
+                layer.w13_weight_scale,  # w1_scale
+                layer.w2_weight_scale,  # w2_scale
+                None,  # block_size
+                layer.w13_input_scale,  # a1_scale
+                layer.w2_input_scale,  # a2_scale
+                True,  # is_vnni
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_int8_w8a8=True,
+            per_channel_quant=True,
+            w13_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+        )
+        return self.runner.run(dispatch_output, quant_info)
+
+
+class NPU_W8A8LinearMethodImpl:
+    """Linear method for NPU W8A8."""
+
+    def __init__(self) -> None:
+        # aclnn quant matmul requires to transpose matrix B, set to true by default.
+        self.transpose_weight = True
+
+    @staticmethod
+    def get_weight(
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype = torch.bfloat16,
+    ) -> Dict[str, Any]:
+        params_dict = {"weight": torch.empty(output_size, input_size, dtype=torch.int8)}
+        return params_dict
+
+    @staticmethod
+    def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
+        params_dict = {}
+        params_dict["input_scale"] = torch.empty(1, dtype=params_dtype)
+        params_dict["input_offset"] = torch.empty(1, dtype=params_dtype)
+        return params_dict
+
+    @staticmethod
+    def get_perchannel_param(
+        output_size: int,
+        params_dtype: torch.dtype,
+    ) -> Dict[str, Any]:
+        params_dict = {}
+        params_dict["quant_bias"] = torch.empty(output_size, dtype=torch.int32)
+        if params_dtype == torch.bfloat16:
+            params_dict["deq_scale"] = torch.empty(output_size, dtype=torch.float32)
+        elif params_dtype == torch.float16:
+            params_dict["deq_scale"] = torch.empty(output_size, dtype=torch.int64)
+        params_dict["weight_scale"] = torch.empty(output_size, 1, dtype=params_dtype)
+        params_dict["weight_offset"] = torch.empty(output_size, 1, dtype=params_dtype)
+        return params_dict
+
+    @staticmethod
+    def apply(
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # To prevent import loops
+        from sglang.srt.layers.linear import RowParallelLinear
+
+        original_dtype = x.dtype
+        if original_dtype != torch.int8:
+            x = torch_npu.npu_quantize(
+                x,
+                layer.aclnn_input_scale_reciprocal,
+                layer.aclnn_input_offset,
+                torch.qint8,
+                -1,
+                False,
+            )
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
+        return torch_npu.npu_quant_matmul(
+            x,
+            layer.weight,
+            layer.deq_scale,
+            bias=quant_bias,
+            output_dtype=original_dtype,
+        )
+
+    def process_weights_after_loading(self, layer):
+        expanding_factor = layer.weight.data.shape[1]
+        layer.aclnn_input_scale = torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
+            layer.input_scale.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        layer.aclnn_input_offset = torch.nn.Parameter(
+            layer.input_offset.data.repeat(expanding_factor).to(device="npu"),
+            requires_grad=False,
+        )
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
+        layer.weight_offset.data = torch.flatten(layer.weight_offset.data)
+
+
+class NPU_W8A8LinearMethodMTImpl:
+    """Linear method for NPU W8A8."""
+
+    def __init__(self) -> None:
+        self.transpose_weight = True
+
+    @staticmethod
+    def get_weight(
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype = torch.bfloat16,
+    ) -> Dict[str, Any]:
+        params_dict = {"weight": torch.empty(output_size, input_size, dtype=torch.int8)}
+        return params_dict
+
+    @staticmethod
+    def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
+        params_dict = {}
+        params_dict["input_scale"] = torch.empty(1, dtype=params_dtype)
+        params_dict["input_offset"] = torch.empty(1, dtype=torch.int8)
+        return params_dict
+
+    @staticmethod
+    def get_perchannel_param(
+        output_size: int,
+        params_dtype: torch.dtype,
+    ) -> Dict[str, Any]:
+        params_dict = {}
+        params_dict["quant_bias"] = torch.empty(output_size, dtype=torch.int32)
+        if params_dtype == torch.bfloat16:
+            params_dict["deq_scale"] = torch.empty(output_size, dtype=torch.float32)
+        elif params_dtype == torch.float16:
+            params_dict["deq_scale"] = torch.empty(output_size, dtype=torch.int64)
+        params_dict["weight_scale"] = torch.empty(output_size, 1, dtype=params_dtype)
+        params_dict["weight_offset"] = torch.empty(output_size, 1, dtype=params_dtype)
+        return params_dict
+
+    @staticmethod
+    def apply(
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # To prevent import loops
+        from sglang.srt.layers.linear import RowParallelLinear
+
+        original_dtype = x.dtype
+        if original_dtype != torch.int8:
+            x = quant_per_tensor(x, layer.input_scale, layer.input_offset)
+
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
+
+        return ops.quant_matmul(
+            x=x, weight=layer.weight, deq_scale=layer.deq_scale, deq_bias=quant_bias
+        )
+
+    def process_weights_after_loading(self, layer):
+        layer.aclnn_deq_scale = torch.nn.Parameter(
+            torch_npu.npu_trans_quant_param(layer.deq_scale.npu()).to(device="npu"),
+            requires_grad=False,
+        )
+
+
+class NPU_W8A8LinearMethod(LinearMethodBase):
+    """Linear method for NPU quantization.
+
+    This class search for specific quantization
+    implementation supported on NPU hardware for linear methods.
+
+    Args:
+        quant_config: The NPU quantization config.
+    """
+
+    def __init__(self, quantization_config: W8A8Int8Config) -> None:
+        self.quantization_config = quantization_config
+        self.quant_method = (
+            NPU_W8A8LinearMethodMTImpl()
+            if useMindIETurbo
+            else NPU_W8A8LinearMethodImpl()
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        weight_dict = self.quant_method.get_weight(
+            input_size_per_partition, output_size_per_partition, params_dtype
+        )
+        for weight_name, weight_param in weight_dict.items():
+            param = torch.nn.Parameter(weight_param, requires_grad=False)
+            set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
+            layer.register_parameter(weight_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
+        for pertensor_name, pertensor_param in pertensor_dict.items():
+            param = PerTensorScaleParameter(
+                data=pertensor_param, weight_loader=weight_loader
+            )
+            # disable warning
+            param.ignore_warning = True
+            layer.register_parameter(pertensor_name, param)
+
+        perchannel_dict = self.quant_method.get_perchannel_param(
+            output_size_per_partition, params_dtype
+        )
+        for perchannel_name, perchannel_param in perchannel_dict.items():
+            param = torch.nn.Parameter(perchannel_param, requires_grad=False)
+            set_weight_attrs(param, {"output_dim": 0})
+            layer.register_parameter(perchannel_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if hasattr(self.quant_method, "process_weights_after_loading"):
+            self.quant_method.process_weights_after_loading(layer)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.quant_method.apply(layer, x, bias)
+
+
+class NPU_W8A8DynamicLinearMethodImpl:
+    """Linear method for NPU W8A8_DYNAMIC."""
+
+    def __init__(self):
+        self.transpose_weight = True
+
+    @staticmethod
+    def get_weight(
+        input_size: int, output_size: int, params_dtype: torch.dtype
+    ) -> Dict[str, Any]:
+        params_dict = {"weight": torch.empty(output_size, input_size, dtype=torch.int8)}
+        return params_dict
+
+    @staticmethod
+    def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
+        return {}
+
+    @staticmethod
+    def get_perchannel_param(
+        output_size: int,
+        params_dtype: torch.dtype,
+    ) -> Dict[str, Any]:
+        params_dict = {}
+        params_dict["weight_scale"] = torch.empty(output_size, 1, dtype=params_dtype)
+        params_dict["weight_offset"] = torch.empty(output_size, 1, dtype=params_dtype)
+        return params_dict
+
+    @staticmethod
+    def apply(
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+        tp_rank: Optional[int] = 0,
+    ) -> torch.Tensor:
+        original_dtype = x.dtype
+        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
+        return torch_npu.npu_quant_matmul(
+            quant_out,
+            layer.weight,
+            layer.weight_scale,
+            pertoken_scale=dynamic_scale,
+            bias=bias,
+            output_dtype=original_dtype,
+        )
+
+    def process_weights_after_loading(self, layer):
+        if self.transpose_weight:
+            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
+        layer.weight_scale.data = layer.weight_scale.data.flatten()
+        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
+        layer.weight_offset.data = layer.weight_offset.data.flatten()
+
+
+class NPU_W8A8DynamicLinearMethod(LinearMethodBase):
+    """Linear method for NPU quantization.
+
+    This class search for specific quantization
+    implementations supported on NPU hardware for linear methods.
+
+    Args:
+        quant_config: The NPU quantization config.
+    """
+
+    def __init__(self, quantization_config: W8A8Int8Config) -> None:
+        self.quantization_config = quantization_config
+        self.quant_method = NPU_W8A8DynamicLinearMethodImpl()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        weight_dict = self.quant_method.get_weight(
+            input_size_per_partition, output_size_per_partition, params_dtype
+        )
+        for weight_name, weight_param in weight_dict.items():
+            param = torch.nn.Parameter(weight_param, requires_grad=False)
+            set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
+            layer.register_parameter(weight_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
+        for pertensor_name, pertensor_param in pertensor_dict.items():
+            param = PerTensorScaleParameter(
+                data=pertensor_param, weight_loader=weight_loader
+            )
+            # disable warning
+            param.ignore_warning = True
+            layer.register_parameter(pertensor_name, param)
+
+        perchannel_dict = self.quant_method.get_perchannel_param(
+            output_size_per_partition, params_dtype
+        )
+        for perchannel_name, perchannel_param in perchannel_dict.items():
+            param = torch.nn.Parameter(perchannel_param, requires_grad=False)
+            set_weight_attrs(param, {"output_dim": 0})
+            layer.register_parameter(perchannel_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if hasattr(self.quant_method, "process_weights_after_loading"):
+            self.quant_method.process_weights_after_loading(layer)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.quant_method.apply(layer, x, bias)
+
+
+class NPU_W8A8MoEMethod(FusedMoEMethodBase):
+    """MoE method for NPU quantization.
+
+    This class search for specific quantization
+    implementations supported on NPU hardware for moe methods.
+
+    Args:
+        quant_config: The NPU quantization config.
+    """
+
+    def __init__(self, quantization_config: W8A8Int8Config) -> None:
+        self.quantization_config = quantization_config
+        self.quant_method = self
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.num_experts = num_experts
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        # weight
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+        # scale
+        w13_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        w2_weight_scale = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+        # offset
+        w13_weight_offset = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_offset", w13_weight_offset)
+        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+        w2_weight_offset = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_offset", w2_weight_offset)
+        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.w13_weight = Parameter(
+            layer.w13_weight.data.transpose(1, 2).contiguous(), requires_grad=False
+        )
+        layer.w2_weight = Parameter(
+            layer.w2_weight.data.transpose(1, 2).contiguous(), requires_grad=False
+        )
+        layer.w13_weight_scale = Parameter(
+            layer.w13_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w2_weight_scale = Parameter(
+            layer.w2_weight_scale.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w13_weight_offset = Parameter(
+            layer.w13_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+        layer.w2_weight_offset = Parameter(
+            layer.w2_weight_offset.data.squeeze(-1).contiguous(), requires_grad=False
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        topk_ids = topk_ids.to(torch.int32)
+        topk_weights = topk_weights.to(x.dtype)
+        output = npu_fused_experts(
+            hidden_states=x,
+            w13=layer.w13_weight,
+            w13_scale=layer.w13_weight_scale,
+            w2=layer.w2_weight,
+            w2_scale=layer.w2_weight_scale,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            top_k=topk_ids.shape[1],
+        )
+        return StandardCombineInput(hidden_states=output)
diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py
new file mode 100644
index 000000000..0719cdd29
--- /dev/null
+++ b/python/sglang/srt/layers/radix_attention.py
@@ -0,0 +1,116 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Radix attention."""
+from __future__ import annotations
+
+from enum import Enum
+from typing import TYPE_CHECKING, Optional
+
+from torch import nn
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+class AttentionType(Enum):
+    """
+    Attention type.
+    Use string to be compatible with `torch.compile`.
+    """
+
+    # Decoder attention between previous layer Q/K/V
+    DECODER = "decoder"
+    # Encoder attention between previous layer Q/K/V
+    ENCODER_ONLY = "encoder_only"
+
+
+class RadixAttention(nn.Module):
+    """
+    The attention layer implementation.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_dim: int,
+        scaling: float,
+        num_kv_heads: int,
+        layer_id: int,
+        logit_cap: float = 0.0,
+        v_head_dim: int = -1,
+        sliding_window_size: int = -1,
+        is_cross_attention: bool = False,
+        pos_encoding_mode: str = "NONE",
+        logit_capping_method: str = "tanh",
+        quant_config: Optional[QuantizationConfig] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        use_irope: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_q_head_num = num_heads
+        self.tp_k_head_num = num_kv_heads
+        self.tp_v_head_num = num_kv_heads
+        self.head_dim = head_dim
+        self.qk_head_dim = head_dim
+        self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim
+        self.scaling = scaling
+        self.layer_id = layer_id
+        self.logit_cap = logit_cap
+        self.sliding_window_size = sliding_window_size or -1
+        self.is_cross_attention = is_cross_attention
+        self.use_irope = use_irope
+        self.k_scale = None
+        self.v_scale = None
+        self.k_scale_float = None
+        self.v_scale_float = None
+        self.quant_method = None
+        if quant_config is not None:
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
+        if self.quant_method is not None:
+            self.quant_method.create_weights(self)
+        self.attn_type = attn_type
+
+        self.pos_encoding_mode = pos_encoding_mode
+        self.logit_capping_method = logit_capping_method
+        self.xai_temperature_len = -1
+
+    def forward(
+        self,
+        q,
+        k,
+        v,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        if k is not None:
+            # For cross-layer sharing, kv can be None
+            assert v is not None
+            if "k_rope" not in kwargs:
+                k = k.view(-1, self.tp_k_head_num, self.qk_head_dim)
+                v = v.view(-1, self.tp_v_head_num, self.v_head_dim)
+            else:
+                k = k.view(-1, self.tp_k_head_num, self.v_head_dim)
+
+        return forward_batch.attn_backend.forward(
+            q,
+            k,
+            v,
+            self,
+            forward_batch,
+            save_kv_cache,
+            **kwargs,
+        )
diff --git a/python/sglang/srt/layers/rocm_linear_utils.py b/python/sglang/srt/layers/rocm_linear_utils.py
new file mode 100644
index 000000000..ee7dd1f59
--- /dev/null
+++ b/python/sglang/srt/layers/rocm_linear_utils.py
@@ -0,0 +1,44 @@
+import torch
+from aiter.ops.triton.fused_qk_concat import fused_qk_rope_cat
+from aiter.ops.triton.gemm_a16w16 import gemm_a16w16
+from aiter.ops.triton.gemm_a16w16_atomic import gemm_a16w16_atomic
+
+from sglang.srt.utils import BumpAllocator
+
+__all__ = ["fused_qk_rope_cat"]
+
+
+def aiter_dsv3_router_gemm(
+    hidden_states: torch.Tensor,
+    weight: torch.Tensor,
+    gemm_output_zero_allocator: BumpAllocator = None,
+):
+    M = hidden_states.shape[0]
+    N = weight.shape[0]
+    y = None
+
+    if M <= 256:
+        # TODO (cagri): convert to bfloat16 as part of another kernel to save time
+        # for now it is also coupled with zero allocator.
+        if gemm_output_zero_allocator != None:
+            y = gemm_output_zero_allocator.allocate(M * N).view(M, N)
+        else:
+            y = torch.zeros((M, N), dtype=torch.float32, device=hidden_states.device)
+
+    if y is not None:
+        logits = gemm_a16w16_atomic(hidden_states, weight, y=y).to(hidden_states.dtype)
+    else:
+        logits = gemm_a16w16(hidden_states, weight)
+
+    return logits
+
+
+def get_dsv3_gemm_output_zero_allocator_size(
+    n_routed_experts: int, num_moe_layers: int, allocate_size: int, embedding_dim: int
+):
+    if embedding_dim != 7168 or n_routed_experts != 256:
+        return 0
+
+    per_layer_size = 256 * (allocate_size + n_routed_experts)
+
+    return num_moe_layers * per_layer_size
diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
new file mode 100644
index 000000000..05f068557
--- /dev/null
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -0,0 +1,2017 @@
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.6.6.post1/vllm/model_executor/layers/rotary_embedding.py
+
+"""Rotary Positional Embeddings."""
+import itertools
+import math
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.custom_op import CustomOp
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+
+if _is_cuda:
+    from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
+if _use_aiter:
+    from aiter.rotary_embedding import get_rope as aiter_get_rope
+
+if is_npu():
+    import torch_npu
+
+
+def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., ::2]
+    x2 = x[..., 1::2]
+    x = torch.stack((-x2, x1), dim=-1)
+    return x.flatten(-2)
+
+
+def _apply_rotary_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    is_neox_style: bool,
+) -> torch.Tensor:
+    """
+    Args:
+        x: [num_tokens, num_heads, head_size]
+        cos: [num_tokens, head_size // 2]
+        sin: [num_tokens, head_size // 2]
+        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
+            positional embeddings.
+    """
+    cos = cos.unsqueeze(-2).to(x.dtype)
+    sin = sin.unsqueeze(-2).to(x.dtype)
+    if is_neox_style:
+        x1, x2 = torch.chunk(x, 2, dim=-1)
+    else:
+        x1 = x[..., ::2]
+        x2 = x[..., 1::2]
+    o1 = x1 * cos - x2 * sin
+    o2 = x2 * cos + x1 * sin
+    if is_neox_style:
+        return torch.cat((o1, o2), dim=-1)
+    else:
+        return torch.stack((o1, o2), dim=-1).flatten(-2)
+
+
+class RotaryEmbedding(CustomOp):
+    """Original rotary positional embedding."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.dtype = dtype
+
+        cache = self._compute_cos_sin_cache()
+        # NOTE(ByronHsu): cache needs to be in FP32 for numerical stability
+        if not _is_cuda:
+            cache = cache.to(dtype)
+
+        if (
+            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
+        ) and not (_is_cpu and _is_cpu_amx_available):
+            from vllm._custom_ops import rotary_embedding
+
+            self.vllm_rotary_embedding = rotary_embedding
+
+        self.cos_sin_cache: torch.Tensor
+        self.register_buffer("cos_sin_cache", cache, persistent=False)
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        """Compute the inverse frequency."""
+        # NOTE(woosuk): To exactly match the HF implementation, we need to
+        # use CPU to compute the cache and then move it to GPU. However, we
+        # create the cache on GPU for faster initialization. This may cause
+        # a slight numerical difference between the HF implementation and ours.
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        """Compute the cos and sin cache."""
+        inv_freq = self._compute_inv_freq(self.base)
+        t = torch.arange(self.max_position_embeddings, dtype=torch.float)
+
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """A PyTorch-native implementation of forward()."""
+        if offsets is not None:
+            positions = positions + offsets
+        positions = positions.flatten()
+        num_tokens = positions.shape[0]
+        cos_sin = self.cos_sin_cache.index_select(0, positions)
+        cos, sin = cos_sin.chunk(2, dim=-1)
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """A PyTorch-npu implementation of forward()."""
+        import os
+
+        if get_bool_env_var("SGLANG_ENABLE_TORCH_COMPILE"):
+            return self.forward_native(positions, query, key, offsets)
+        else:
+            rotary_mode = "half"
+            if self.is_neox_style:
+                rotary_mode = "half"
+            else:
+                rotary_mode = "interleave"
+            mrope_section = [0, 0, 0]
+            query_out, key_out = torch_npu.npu_mrope(
+                positions,
+                query,
+                key,
+                self.cos_sin_cache,
+                self.head_size,
+                mrope_section=mrope_section,
+                rotary_mode=rotary_mode,
+            )
+            return query_out, key_out
+
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg=None,  # Optional[FusedSetKVBufferArg]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if _is_cuda and (self.head_size in [64, 128, 256, 512]):
+            apply_rope_with_cos_sin_cache_inplace(
+                positions=positions,
+                query=query,
+                key=key,
+                head_size=self.head_size,
+                cos_sin_cache=self.cos_sin_cache,
+                is_neox=self.is_neox_style,
+                # Compatible with old sgl-kernel
+                **(
+                    dict(fused_set_kv_buffer_arg=fused_set_kv_buffer_arg)
+                    if fused_set_kv_buffer_arg is not None
+                    else {}
+                ),
+            )
+        else:
+            assert (
+                fused_set_kv_buffer_arg is None
+            ), "save kv cache is not supported for vllm_rotary_embedding."
+            self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
+            self.vllm_rotary_embedding(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+        return query, key
+
+    def extra_repr(self) -> str:
+        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
+        s += f", max_position_embeddings={self.max_position_embeddings}"
+        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
+        return s
+
+
+class LinearScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with linear scaling.
+
+    It supports multiple scaling factors. Since multiple LoRA adapters may have
+    different scaling factors, we need multiple cos/sin caches. In this way,
+    instead of running rotary embedding kernel per lora, we can run multiple
+    lora in a batched way.
+
+    In addition to that, we also keep the cos/sin cache for the scaling factor
+    of 1 (default) at all times.
+
+    Exemplary for two scaling factors x=1, y and z with embeddings
+    [[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and
+    [[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and
+    [[z11, z12, ... z1p], ..., [zn1, zn2, ..., znp]],
+
+    we construct the cos/sin cache as follows:
+    [[x11, x12, ... x1m, y11, y12, ... y1o, z11, z12, ... z1p],
+        ...
+     [xn1, xn2, ... xnm, yn1, yn2, ... yno, zn1, zn2, ... znp]]
+
+    We then use offsets to index into the cos/sin cache for
+    the respective scaling factors.
+
+    The offset to cache can be accessed via `scaling_factor_to_offset` API.
+
+    Credits to the Reddit user /u/kaiokendev
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factors: Union[List[float], float],
+        dtype: torch.dtype,
+    ) -> None:
+        if isinstance(scaling_factors, float):
+            scaling_factors = [scaling_factors]
+        self.scaling_factors: List[float] = scaling_factors  # noqa
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+        # Lazy initialized.
+        self._scaling_factor_to_offset: Dict[float, int]
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.base)
+        cache_list: List[torch.Tensor] = []
+        # offsets to the next cache in a tensor.
+        # Each offset corresponds to the same index in scaling_factors.
+        offsets: List[int] = []
+        for scaling_factor in self.scaling_factors:
+            # NOTE(woosuk): self.max_position_embeddings is the original
+            # maximum length before applying the rope scaling.
+            # Thus, the maximum length after applying the rope scaling is
+            # self.max_position_embeddings * self.scaling_factor.
+            max_len = self.max_position_embeddings * scaling_factor
+            t = torch.arange(max_len, dtype=torch.float)
+            t = t / scaling_factor
+
+            freqs = torch.einsum("i,j -> ij", t, inv_freq)
+            cos = freqs.cos()
+            sin = freqs.sin()
+            cache = torch.cat((cos, sin), dim=-1)
+            if not cache_list:
+                offset = 0
+            else:
+                last_offset = offsets[-1]
+                next_max_len = cache_list[-1].shape[0]
+                offset = last_offset + next_max_len
+            offsets.append(offset)
+            cache_list.append(cache)
+        self._scaling_factor_to_offset = {
+            float(scaling_factor): offsets[i]
+            for i, scaling_factor in enumerate(self.scaling_factors)
+        }
+        assert len(self.scaling_factors) == len(offsets)
+        return torch.cat(cache_list, dim=0)
+
+    @property
+    def scaling_factor_to_offset(self) -> Dict[float, int]:
+        return self._scaling_factor_to_offset
+
+
+class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with Dynamic NTK scaling.
+
+    Credits to the Reddit users /u/bloc97 and /u/emozilla
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        # NOTE(woosuk): self.max_position_embeddings is the original
+        # maximum length before applying the rope scaling.
+        # Thus, the maximum length after applying the rope scaling is
+        # self.max_position_embeddings * self.scaling_factor.
+        max_len = self.max_position_embeddings * self.scaling_factor
+        base = self.base * (
+            (self.scaling_factor * max_len / self.max_position_embeddings)
+            - (self.scaling_factor - 1)
+        ) ** (self.rotary_dim / (self.rotary_dim - 2))
+        inv_freq = self._compute_inv_freq(base)
+        t = torch.arange(max_len, dtype=torch.float)
+
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
+# Inverse dim formula to find dim based on number of rotations
+def _yarn_find_correction_dim(
+    num_rotations: int,
+    dim: int,
+    base: float = 10000,
+    max_position_embeddings: int = 2048,
+) -> float:
+    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
+        2 * math.log(base)
+    )
+
+
+# Find dim range bounds based on rotations
+def _yarn_find_correction_range(
+    low_rot: int,
+    high_rot: int,
+    dim: int,
+    base: float = 10000,
+    max_position_embeddings: int = 2048,
+) -> Tuple[int, int]:
+    low = math.floor(
+        _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
+    )
+    high = math.ceil(
+        _yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
+    )
+    return max(low, 0), min(high, dim - 1)  # Clamp values just in case
+
+
+def _yarn_linear_ramp_mask(
+    low: float, high: float, dim: int, dtype: torch.dtype, device: torch.device = None
+) -> torch.Tensor:
+    if low == high:
+        high += 0.001  # Prevent singularity
+
+    linear_func = (torch.arange(dim, dtype=dtype, device=device) - low) / (high - low)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+
+
+def _yarn_get_mscale(scale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * math.log(scale) + 1.0
+
+
+class YaRNScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with YaRN method.
+
+    Credits to Peng et al. github.com/jquesnelle/yarn
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        # Get n-d magnitude scaling corrected for interpolation
+        self.mscale = float(_yarn_get_mscale(self.scaling_factor) * attn_factor)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+        )
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+
+        low, high = _yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+        )
+        # Get n-d rotational scaling corrected for extrapolation
+        inv_freq_mask = (
+            1
+            - _yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+        ) * self.extrapolation_factor
+        inv_freq = (
+            inv_freq_interpolation * (1 - inv_freq_mask)
+            + inv_freq_extrapolation * inv_freq_mask
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
+        )
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * self.mscale
+        sin = freqs.sin() * self.mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
+class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
+    """Phi3 family of models scaled rotary embedding.
+
+    Based on the original RotaryEmbedding implementation.
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        original_max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        short_factor: List[float],
+        long_factor: List[float],
+        short_mscale: Optional[float] = None,
+        long_mscale: Optional[float] = None,
+    ):
+        super().__init__()
+
+        if is_neox_style is False:
+            raise ValueError(
+                "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
+            )
+
+        self.rotary_dim = rotary_dim
+        self.head_size = head_size
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.base = base
+        self.short_factor = short_factor
+        self.long_factor = long_factor
+
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        if scale <= 1.0:
+            scaling_factor = 1.0
+        else:
+            scaling_factor = math.sqrt(
+                1 + math.log(scale) / math.log(self.original_max_position_embeddings)
+            )
+        if short_mscale is None:
+            short_mscale = scaling_factor
+        if long_mscale is None:
+            long_mscale = scaling_factor
+
+        self.short_mscale = short_mscale
+        self.long_mscale = long_mscale
+
+        short_cache = self._compute_cos_sin_cache(
+            original_max_position_embeddings, short_factor, short_mscale
+        )
+        short_cache = short_cache.to(dtype)
+        self.register_buffer("short_cos_sin_cache", short_cache, persistent=False)
+
+        long_cache = self._compute_cos_sin_cache(
+            max_position_embeddings, long_factor, long_mscale
+        )
+        long_cache = long_cache.to(dtype)
+        self.register_buffer("long_cos_sin_cache", long_cache, persistent=False)
+
+        long_short_cache = torch.cat(
+            [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0
+        )
+        self.register_buffer(
+            "long_short_cos_sin_cache", long_short_cache, persistent=False
+        )
+
+    def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor:
+        rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
+        inv_freq = 1.0 / (
+            rescale_factors
+            * (
+                self.base
+                ** (
+                    torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
+                    / self.rotary_dim
+                )
+            )
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(
+        self,
+        max_position_embeddings: int,
+        rescale_factors: List[float],
+        mscale: float,
+    ) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(rescale_factors)
+        t = torch.arange(max_position_embeddings, dtype=torch.float)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * mscale
+        sin = freqs.sin() * mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        query = query.view(*query.shape[:-1], -1, self.head_size)
+        key = key.view(*key.shape[:-1], -1, self.head_size)
+
+        k = self.original_max_position_embeddings
+        long_prompt_offset = (
+            torch.any(positions > k).float() * torch.full_like(positions, k)
+        ).long()
+        idx = (
+            torch.add(positions, long_prompt_offset)
+            if long_prompt_offset is not None
+            else positions
+        )
+        self.long_short_cos_sin_cache: torch.Tensor = self.long_short_cos_sin_cache.to(
+            idx.device
+        )
+        idx = torch.add(idx, offsets) if offsets is not None else idx
+        cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
+
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        cos = cos.repeat(1, 2).unsqueeze(-2)
+        sin = sin.repeat(1, 2).unsqueeze(-2)
+
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = query_rot * cos + _rotate_neox(query_rot) * sin
+        query = torch.cat((query_rot, query_pass), dim=-1)
+
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = key_rot * cos + _rotate_neox(key_rot) * sin
+        key = torch.cat((key_rot, key_pass), dim=-1)
+
+        return query.flatten(-2), key.flatten(-2)
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with YaRN method.
+
+    Credits to Peng et al. github.com/jquesnelle/yarn
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+        mscale: float = 1,
+        mscale_all_dim: float = 0,
+        device: Optional[str] = "cuda" if not _is_npu else "npu",
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        # Get n-d magnitude scaling corrected for interpolation.
+        self.mscale = float(
+            yarn_get_mscale(self.scaling_factor, float(mscale))
+            / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim))
+            * attn_factor
+        )
+        self.device = device
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float, device=self.device)
+            / self.rotary_dim
+        )
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+
+        low, high = _yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+        )
+        # Get n-d rotational scaling corrected for extrapolation
+        inv_freq_mask = (
+            1
+            - _yarn_linear_ramp_mask(
+                low, high, self.rotary_dim // 2, dtype=torch.float, device=self.device
+            )
+        ) * self.extrapolation_factor
+        inv_freq = (
+            inv_freq_interpolation * (1 - inv_freq_mask)
+            + inv_freq_extrapolation * inv_freq_mask
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor,
+            device=self.device,
+            dtype=torch.float32,
+        )
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * self.mscale
+        sin = freqs.sin() * self.mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward()."""
+        dtype = query.dtype
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(positions.device)
+        cos_sin = self.cos_sin_cache[
+            torch.add(positions, offsets) if offsets is not None else positions
+        ]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.is_neox_style:
+            # NOTE(woosuk): Here we assume that the positions tensor has the
+            # shape [batch_size, seq_len].
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+
+        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
+        query_rot = query_rot * cos + rotate_fn(query_rot) * sin
+        key_rot = key_rot * cos + rotate_fn(key_rot) * sin
+
+        if self.rotary_dim < self.head_size:
+            query = torch.cat((query_rot, query_pass), dim=-1)
+            key = torch.cat((key_rot, key_pass), dim=-1)
+        else:
+            query = query_rot
+            key = key_rot
+        return query.to(dtype), key.to(dtype)
+
+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # NOTE: now npu_mrope can only support `numQHeads*headSize <= 4096` pattern,
+        # and generalization to more scenarios will be supported in the future.
+        if query.shape[1] * query.shape[2] > 4096:
+            return self.forward_native(positions, query, key, offsets)
+        num_tokens = query.shape[0]
+        rotary_mode = "half" if self.is_neox_style else "interleave"
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(positions.device)
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+
+        query_rot, key_rot = torch_npu.npu_mrope(
+            torch.add(positions, offsets) if offsets is not None else positions,
+            query_rot.reshape(num_tokens, -1),
+            key_rot.reshape(num_tokens, -1),
+            self.cos_sin_cache,
+            self.rotary_dim,
+            mrope_section=[0, 0, 0],
+            rotary_mode=rotary_mode,
+        )
+        query_rot = query_rot.reshape(num_tokens, -1, self.rotary_dim)
+        key_rot = key_rot.reshape(num_tokens, -1, self.rotary_dim)
+
+        if self.rotary_dim < self.head_size:
+            query = torch.cat((query_rot, query_pass), dim=-1)
+            key = torch.cat((key_rot, key_pass), dim=-1)
+        else:
+            query = query_rot
+            key = key_rot
+        return query, key
+
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions, query, key, self.head_size, self.cos_sin_cache, False
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
+
+class Llama3RotaryEmbedding(RotaryEmbedding):
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        scaling_factor: float,
+        low_freq_factor: float,
+        high_freq_factor: float,
+        orig_max_position: int,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.low_freq_factor = low_freq_factor
+        self.high_freq_factor = high_freq_factor
+        self.orig_max_position = orig_max_position
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freqs = super()._compute_inv_freq(base)
+        low_freq_wavelen = self.orig_max_position / self.low_freq_factor
+        high_freq_wavelen = self.orig_max_position / self.high_freq_factor
+
+        wave_len = 2 * math.pi / inv_freqs
+        if self.low_freq_factor != self.high_freq_factor:
+            smooth = (self.orig_max_position / wave_len - self.low_freq_factor) / (
+                self.high_freq_factor - self.low_freq_factor
+            )
+        else:
+            smooth = 0
+        new_freqs = torch.where(
+            wave_len < high_freq_wavelen,
+            inv_freqs,
+            torch.where(
+                wave_len > low_freq_wavelen,
+                inv_freqs / self.scaling_factor,
+                (1 - smooth) * inv_freqs / self.scaling_factor + smooth * inv_freqs,
+            ),
+        )
+        return new_freqs
+
+
+class Llama4VisionRotaryEmbedding(RotaryEmbedding):
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ):
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freqs = super()._compute_inv_freq(base)
+        inv_freqs = inv_freqs[: (self.rotary_dim // 2)]
+        return inv_freqs
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.base)
+
+        # self.max_position_embeddings here is number of image patches
+        # i.e. (image_size // patch_size) ** 2
+        num_patches = self.max_position_embeddings
+        img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1)
+        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
+        img_idx[-1, -1] = -2  # set to ID_CLS_TOKEN
+        num_patches_single_dim = int(math.sqrt(num_patches))
+        frequencies_x = img_idx % num_patches_single_dim
+        frequencies_y = img_idx // num_patches_single_dim
+        freqs_x = (
+            (frequencies_x + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs_y = (
+            (frequencies_y + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
+        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
+        cache = torch.view_as_complex(
+            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
+        )
+        return cache
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
+        query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
+        key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
+        broadcast_shape = [
+            d if i == 1 or i == (query_.ndim - 1) else 1
+            for i, d in enumerate(query_.shape)
+        ]
+        freqs_ci = self.cos_sin_cache.view(*broadcast_shape)
+        query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
+        key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
+        return query_out.type_as(query), key_out.type_as(key)
+
+
+class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with Dynamic NTK scaling.
+
+    Credits to the Reddit users /u/bloc97 and /u/emozilla
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_alpha: float,
+        dtype: torch.dtype,
+    ) -> None:
+        self.scaling_alpha = scaling_alpha
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        max_len = self.max_position_embeddings
+        base = self.base * self.scaling_alpha ** (
+            self.rotary_dim / (self.rotary_dim - 2)
+        )
+
+        inv_freq = self._compute_inv_freq(base)
+        t = torch.arange(max_len, dtype=torch.float)
+
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
+class MRotaryEmbedding(RotaryEmbedding):
+    """Rotary Embedding with Multimodal Sections."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        mrope_section: Optional[List[int]] = None,
+    ) -> None:
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+        self.mrope_section = mrope_section
+        if self.mrope_section:
+            expected_sum = rotary_dim // 2
+            actual_sum = sum(self.mrope_section)
+            if actual_sum != expected_sum:
+                print(
+                    f"MRoPE section sum mismatch: expected {expected_sum}, got {actual_sum}. "
+                    f"Adjusting mrope_section to match rotary_dim // 2 = {expected_sum}"
+                )
+                # Auto-correct by scaling the mrope_section proportionally
+                if actual_sum > 0:
+                    scale_factor = expected_sum / actual_sum
+                    self.mrope_section = [
+                        max(1, int(section * scale_factor))
+                        for section in self.mrope_section
+                    ]
+                    # Ensure the sum exactly matches by adjusting the last element
+                    current_sum = sum(self.mrope_section)
+                    if current_sum != expected_sum:
+                        self.mrope_section[-1] += expected_sum - current_sum
+                else:
+                    # If all sections are 0, create a default distribution
+                    self.mrope_section = [
+                        expected_sum // len(self.mrope_section)
+                    ] * len(self.mrope_section)
+                    # Handle remainder
+                    remainder = expected_sum % len(self.mrope_section)
+                    for i in range(remainder):
+                        self.mrope_section[i] += 1
+
+                print(
+                    f"Corrected mrope_section: {self.mrope_section} (sum={sum(self.mrope_section)})"
+                )
+
+    @torch.compile(dynamic=True)
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward().
+
+        Args:
+            positions:
+                [num_tokens,] (text only) or
+                [3, num_tokens] (T/H/W positions with multimodal inputs)
+            query: [num_tokens, num_heads * head_size]
+            key: [num_tokens, num_kv_heads * head_size]
+        """
+        assert positions.ndim == 1 or positions.ndim == 2
+
+        num_tokens = positions.shape[-1]
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if positions.ndim == 2:
+            assert self.mrope_section
+
+            cos = torch.cat(
+                [m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
+                dim=-1,
+            )
+            sin = torch.cat(
+                [m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
+                dim=-1,
+            )
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    # Copied from https://github.com/huggingface/transformers/blob/c8e0e603de9b3d49161a15fe6e8ea84badfb5d02/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1439
+    @staticmethod
+    def get_rope_index(
+        spatial_merge_size: int,
+        image_token_id: int,
+        video_token_id: int,
+        vision_start_token_id: int,
+        model_type: str,
+        tokens_per_second: Optional[int] = None,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        second_per_grid_ts: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        mrope_position_deltas = []
+        if input_ids is not None and (
+            image_grid_thw is not None or video_grid_thw is not None
+        ):
+            total_input_ids = input_ids
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            image_index, video_index = 0, 0
+            for i, input_ids in enumerate(total_input_ids):
+                image_nums, video_nums = 0, 0
+                vision_start_indices = torch.argwhere(
+                    input_ids == vision_start_token_id
+                ).squeeze(1)
+                vision_tokens = input_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (vision_tokens == video_token_id).sum()
+                input_tokens = input_ids.tolist()
+                llm_pos_ids_list: list = []
+                st = 0
+                remain_images, remain_videos = image_nums, video_nums
+                for _ in range(image_nums + video_nums):
+                    if image_token_id in input_tokens and remain_images > 0:
+                        ed_image = input_tokens.index(image_token_id, st)
+                    else:
+                        ed_image = len(input_tokens) + 1
+                    if video_token_id in input_tokens and remain_videos > 0:
+                        ed_video = input_tokens.index(video_token_id, st)
+                    else:
+                        ed_video = len(input_tokens) + 1
+                    if ed_image < ed_video:
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        second_per_grid_t = 0
+                        image_index += 1
+                        remain_images -= 1
+                        ed = ed_image
+                    else:
+                        t, h, w = (
+                            video_grid_thw[video_index][0],
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+                        if second_per_grid_ts is not None:
+                            second_per_grid_t = second_per_grid_ts[video_index]
+                        else:
+                            second_per_grid_t = 1.0
+                        video_index += 1
+                        remain_videos -= 1
+                        ed = ed_video
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_merge_size,
+                        w.item() // spatial_merge_size,
+                    )
+                    text_len = ed - st
+
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+
+                    if model_type == "qwen2_5_vl":
+                        range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+                        expanded_range = range_tensor.expand(
+                            -1, llm_grid_h * llm_grid_w
+                        )
+
+                        time_tensor = (
+                            expanded_range * second_per_grid_t * tokens_per_second
+                        )
+
+                        time_tensor_long = time_tensor.long()
+                        t_index = time_tensor_long.flatten()
+                    elif model_type == "qwen2_vl":
+                        t_index = (
+                            torch.arange(llm_grid_t)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                    else:
+                        raise RuntimeError("Unimplemented")
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+                    )
+                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+                if st < len(input_tokens):
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+                    text_len = len(input_tokens) - st
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, :] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(
+                    llm_positions.max() + 1 - len(total_input_ids[i])
+                )
+            mrope_position_deltas = torch.tensor(
+                mrope_position_deltas, device=input_ids.device
+            ).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            s = input_ids.shape[1]
+            position_ids = torch.arange(s)
+            position_ids = (
+                position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
+            )
+            max_position_ids = position_ids.max(0, keepdim=False)[0].max(
+                -1, keepdim=True
+            )[0]
+            mrope_position_deltas = max_position_ids + 1 - s
+            return position_ids, mrope_position_deltas
+
+    # Adapted from https://github.com/vllm-project/vllm/blob/3779eb8c81449b924a23457fc77e45a0e6171178/vllm/model_executor/layers/rotary_embedding.py#L1120
+    @staticmethod
+    def get_rope_index_glm4v(
+        input_ids: torch.Tensor,
+        hf_config: Any,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        attention_mask: torch.Tensor,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Get mrope input positions and delta value for GLM4V."""
+        image_token_id = hf_config.image_token_id
+        video_start_token_id = hf_config.video_start_token_id
+        video_end_token_id = hf_config.video_end_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        mrope_position_deltas = []
+        if input_ids is not None and (
+            image_grid_thw is not None or video_grid_thw is not None
+        ):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            image_index, video_index = 0, 0
+            video_group_index = 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            for i, input_ids in enumerate(total_input_ids):
+                input_ids = input_ids[attention_mask[i] == 1]
+                input_tokens = input_ids.tolist()
+
+                input_token_type = []
+                video_check_flg = False
+                for token in input_tokens:
+                    if token == video_start_token_id:
+                        video_check_flg = True
+                    elif token == video_end_token_id:
+                        video_check_flg = False
+
+                    if token == image_token_id and not video_check_flg:
+                        input_token_type.append("image")
+                    elif token == image_token_id and video_check_flg:
+                        input_token_type.append("video")
+                    else:
+                        input_token_type.append("text")
+
+                input_type_group = []
+                for key, group in itertools.groupby(
+                    enumerate(input_token_type), lambda x: x[1]
+                ):
+                    group = list(group)
+                    start_index = group[0][0]
+                    end_index = group[-1][0] + 1
+                    input_type_group.append((key, start_index, end_index))
+
+                llm_pos_ids_list = []
+                video_frame_num = 1
+                for modality_type, start_idx, end_idx in input_type_group:
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+
+                    if modality_type == "image":
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t.item(),
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        t_index = (
+                            torch.arange(llm_grid_t)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(llm_grid_t, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(llm_grid_t, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                        image_index += 1
+                        video_frame_num = 1
+
+                    elif modality_type == "video":
+                        t, h, w = (
+                            video_frame_num,
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t,
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        for t_idx in range(llm_grid_t):
+                            t_index = (
+                                torch.tensor(t_idx)
+                                .view(-1, 1)
+                                .expand(-1, llm_grid_h * llm_grid_w)
+                                .flatten()
+                            )
+
+                            h_index = (
+                                torch.arange(llm_grid_h)
+                                .view(1, -1, 1)
+                                .expand(1, -1, llm_grid_w)
+                                .flatten()
+                            )
+                            w_index = (
+                                torch.arange(llm_grid_w)
+                                .view(1, 1, -1)
+                                .expand(1, llm_grid_h, -1)
+                                .flatten()
+                            )
+                            llm_pos_ids_list.append(
+                                torch.stack([t_index, h_index, w_index]) + st_idx
+                            )
+
+                        video_group_index += 1
+
+                        if video_group_index >= video_grid_thw[video_index][0]:
+                            video_index += 1
+                            video_group_index = 0
+
+                        video_frame_num += 1
+
+                    else:
+                        text_len = end_idx - start_idx
+                        llm_pos_ids_list.append(
+                            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                        )
+
+                        video_frame_num = 1
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(
+                    position_ids.device
+                )
+                mrope_position_deltas.append(
+                    llm_positions.max() + 1 - len(total_input_ids[i])
+                )
+            mrope_position_deltas = torch.tensor(
+                mrope_position_deltas, device=input_ids.device
+            ).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = (
+                    position_ids.unsqueeze(0)
+                    .expand(3, -1, -1)
+                    .to(attention_mask.device)
+                )
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(
+                    -1, keepdim=True
+                )[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
+
+
+class DualChunkRotaryEmbedding(CustomOp):
+    """Rotary positional embedding for Dual Chunk Attention."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        chunk_size: int,
+        local_size: int,
+    ) -> None:
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.chunk_size = chunk_size
+        self.local_size = local_size
+        self.dtype = dtype
+        self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
+        (q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = (
+            self._compute_cos_sin_cache()
+        )
+
+        self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
+        self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
+        self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
+        self.register_buffer(
+            "cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
+        )
+        self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        """Compute the inverse frequency."""
+        # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
+        # However, we use `torch.arange(..., dtype=torch.float)` instead to
+        # avoid numerical issues with large base values (e.g., 10000000).
+        # This may cause a slight numerical difference between the HF
+        # implementation and ours.
+        # NOTE(woosuk): To exactly match the HF implementation, we need to
+        # use CPU to compute the cache and then move it to GPU. However, we
+        # create the cache on GPU for faster initialization. This may cause
+        # a slight numerical difference between the HF implementation and ours.
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        """Compute the cos and sin cache."""
+        inv_freq = self._compute_inv_freq(self.base)
+        chunk_len = self.chunk_size - self.local_size
+        q_t = torch.arange(chunk_len, dtype=torch.float)
+        qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp(
+            max=self.chunk_size
+        )
+        k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len
+
+        # count from chunk_len, no clamp(self.chunk_size) restriction
+        qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len
+        # count from self.chunk_size for q_inter's rope
+        q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size
+
+        q_freqs = torch.outer(q_t, inv_freq)
+        qc_freqs = torch.outer(qc_t, inv_freq)
+        k_freqs = torch.outer(k_t, inv_freq)
+        qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq)
+        q_inter_freqs = torch.outer(q_inter_t, inv_freq)
+
+        q_cos = q_freqs.cos()
+        q_sin = q_freqs.sin()
+        qc_cos = qc_freqs.cos()
+        qc_sin = qc_freqs.sin()
+        k_cos = k_freqs.cos()
+        k_sin = k_freqs.sin()
+
+        qc_no_clamp_cos = qc_no_clamp_freqs.cos()
+        qc_no_clamp_sin = qc_no_clamp_freqs.sin()
+        q_inter_cos = q_inter_freqs.cos()
+        q_inter_sin = q_inter_freqs.sin()
+
+        q_cache = torch.cat((q_cos, q_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        k_cache = torch.cat((k_cos, k_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        query = query.view(*query.shape[:-1], -1, self.head_size)
+        key = key.view(*key.shape[:-1], -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+        else:
+            query_pass = None
+            key_pass = None
+
+        positions_with_offsets = (
+            torch.add(positions, offsets) if offsets is not None else positions
+        )
+        key = self._apply_rotary_embedding(
+            self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
+        )
+        chunk_len = self.chunk_size - self.local_size
+        query = self._apply_rotary_embedding(
+            self.cos_sin_q_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_succ = self._apply_rotary_embedding(
+            self.cos_sin_qc_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_inter = self._apply_rotary_embedding(
+            self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
+            query_rot,
+            query_pass,
+        )
+        query_succ_critical = self._apply_rotary_embedding(
+            self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_inter_critical = self._apply_rotary_embedding(
+            self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+
+        # merge query into one tensor to simplify the interfaces
+        query = torch.cat(
+            (
+                query,
+                query_succ,
+                query_inter,
+                query_succ_critical,
+                query_inter_critical,
+            ),
+            dim=-1,
+        )
+        return query, key
+
+    def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.is_neox_style:
+            # NOTE(woosuk): Here we assume that the positions tensor has the
+            # shape [batch_size, seq_len].
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
+        hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin
+
+        if self.rotary_dim < self.head_size:
+            hidden = torch.cat((hidden_rot, hidden_pass), dim=-1)
+        else:
+            hidden = hidden_rot
+        return hidden.flatten(-2).squeeze(0)
+
+    def extra_repr(self) -> str:
+        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
+        s += f", max_position_embeddings={self.max_position_embeddings}"
+        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
+        s += f", chunk_size={self.chunk_size}, local_size={self.local_size}"
+        return s
+
+
+_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
+
+
+def get_rope(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: int,
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
+    dtype: Optional[torch.dtype] = None,
+    partial_rotary_factor: float = 1.0,
+    dual_chunk_attention_config: Optional[Dict[str, Any]] = None,
+) -> RotaryEmbedding:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+    if rope_scaling is not None:
+        # Transforms every value that is a list into a tuple for caching calls
+        rope_scaling_tuple = {
+            k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
+        }
+        rope_scaling_args = tuple(rope_scaling_tuple.items())
+    else:
+        rope_scaling_args = None
+
+    if dual_chunk_attention_config is not None:
+        dual_chunk_attention_tuple = {
+            k: tuple(v) if isinstance(v, list) else v
+            for k, v in dual_chunk_attention_config.items()
+            if k != "sparse_attention_config"
+        }
+        dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items())
+    else:
+        dual_chunk_attention_args = None
+
+    if partial_rotary_factor < 1.0:
+        rotary_dim = int(rotary_dim * partial_rotary_factor)
+    key = (
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        rope_scaling_args,
+        dual_chunk_attention_args,
+        dtype,
+    )
+    if key in _ROPE_DICT:
+        return _ROPE_DICT[key]
+
+    if dual_chunk_attention_config is not None:
+        extra_kwargs = {
+            k: v
+            for k, v in dual_chunk_attention_config.items()
+            if k in ("chunk_size", "local_size")
+        }
+        rotary_emb = DualChunkRotaryEmbedding(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_style,
+            dtype,
+            **extra_kwargs,
+        )
+    elif rope_scaling is None:
+        rotary_emb = RotaryEmbedding(
+            head_size, rotary_dim, max_position, base, is_neox_style, dtype
+        )
+    else:
+        if "rope_type" in rope_scaling:
+            scaling_type = rope_scaling["rope_type"]
+        elif "type" in rope_scaling:
+            scaling_type = rope_scaling["type"]
+        else:
+            raise ValueError("Unknown RoPE scaling type")
+
+        if scaling_type == "llama3":
+            scaling_factor = rope_scaling["factor"]
+            low_freq_factor = rope_scaling["low_freq_factor"]
+            high_freq_factor = rope_scaling["high_freq_factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            rotary_emb = Llama3RotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                dtype,
+                scaling_factor,
+                low_freq_factor,
+                high_freq_factor,
+                original_max_position,
+            )
+        elif scaling_type == "default":
+            if "mrope_section" in rope_scaling:
+                rotary_emb = MRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    dtype,
+                    mrope_section=rope_scaling["mrope_section"],
+                )
+            else:
+                rotary_emb = RotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    dtype,
+                )
+        elif scaling_type == "linear":
+            scaling_factor = rope_scaling["factor"]
+            rotary_emb = LinearScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+            )
+        elif scaling_type == "dynamic":
+            scaling_factor = rope_scaling["factor"]
+            if "alpha" in rope_scaling:
+                rotary_emb = DynamicNTKAlphaRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    rope_scaling["alpha"],
+                    dtype,
+                )
+            else:
+                rotary_emb = DynamicNTKScalingRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    scaling_factor,
+                    dtype,
+                )
+        elif scaling_type == "yarn":
+            scaling_factor = rope_scaling["factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k
+                in ("extrapolation_factor", "attn_factor", "beta_fast", "beta_slow")
+            }
+            rotary_emb = YaRNScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                original_max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+                **extra_kwargs,
+            )
+        elif scaling_type == "deepseek_yarn":
+            scaling_factor = rope_scaling["factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            # assert max_position == original_max_position * scaling_factor
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k
+                in (
+                    "extrapolation_factor",
+                    "attn_factor",
+                    "beta_fast",
+                    "beta_slow",
+                    "mscale",
+                    "mscale_all_dim",
+                )
+            }
+            rotary_emb = DeepseekScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                original_max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+                **extra_kwargs,
+            )
+        elif scaling_type == "longrope":
+            short_factor = rope_scaling["short_factor"]
+            long_factor = rope_scaling["long_factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k in ("short_mscale", "long_mscale")
+            }
+            rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                original_max_position,
+                base,
+                is_neox_style,
+                dtype,
+                short_factor,
+                long_factor,
+                **extra_kwargs,
+            )
+        else:
+            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    _ROPE_DICT[key] = rotary_emb
+    return rotary_emb
+
+
+# Copied from transformers
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb_native(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim=1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    orig_q_dtype = q.dtype
+    orig_k_dtype = k.dtype
+    q, k = q.float(), k.float()
+
+    # embedding is performed in float
+    cos = cos.unsqueeze(unsqueeze_dim).float()
+    sin = sin.unsqueeze(unsqueeze_dim).float()
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+
+    q_embed = q_embed.to(orig_q_dtype)
+    k_embed = k_embed.to(orig_k_dtype)
+
+    return q_embed, k_embed
+
+
+def apply_rotary_pos_emb_npu(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim=1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if q.shape[1] != 128:
+        return apply_rotary_pos_emb_native(q, k, cos, sin, unsqueeze_dim)
+    cos = cos.unsqueeze(unsqueeze_dim)
+    cos = torch.transpose(cos, 1, 2)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    sin = torch.transpose(sin, 1, 2)
+    q = torch.transpose(q, 1, 2)
+    k = torch.transpose(k, 1, 2)
+    q_embed, k_embed = torch_npu.npu_apply_rotary_pos_emb(q, k, cos, sin)
+    q_embed = torch.transpose(q_embed, 1, 2)
+    k_embed = torch.transpose(k_embed, 1, 2)
+    return q_embed, k_embed
+
+
+if _is_npu:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_npu
+else:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_native
+
+
+def get_rope_cpu(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: int,
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
+    dtype: Optional[torch.dtype] = None,
+    partial_rotary_factor: float = 1.0,
+    device: Optional[str] = None,
+) -> RotaryEmbedding:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+    if rope_scaling is not None:
+        # Transforms every value that is a list into a tuple for caching calls
+        rope_scaling_tuple = {
+            k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
+        }
+        rope_scaling_args = tuple(rope_scaling_tuple.items())
+    else:
+        rope_scaling_args = None
+    if partial_rotary_factor < 1.0:
+        rotary_dim = int(rotary_dim * partial_rotary_factor)
+    key = (
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        rope_scaling_args,
+        dtype,
+    )
+    if key in _ROPE_DICT:
+        return _ROPE_DICT[key]
+
+    assert rope_scaling is not None
+    scaling_type = rope_scaling["rope_type"]
+    assert (
+        scaling_type == "deepseek_yarn"
+    ), "Only deepseek_yarn is supported for CPU for now"
+
+    scaling_factor = rope_scaling["factor"]
+    original_max_position = rope_scaling["original_max_position_embeddings"]
+    extra_kwargs = {
+        k: v
+        for k, v in rope_scaling.items()
+        if k
+        in (
+            "extrapolation_factor",
+            "attn_factor",
+            "beta_fast",
+            "beta_slow",
+            "mscale",
+            "mscale_all_dim",
+        )
+    }
+    extra_kwargs["device"] = device
+    rotary_emb = DeepseekScalingRotaryEmbedding(
+        head_size,
+        rotary_dim,
+        original_max_position,
+        base,
+        is_neox_style,
+        scaling_factor,
+        dtype,
+        **extra_kwargs,
+    )
+
+    _ROPE_DICT[key] = rotary_emb
+    return rotary_emb
+
+
+def get_rope_wrapper(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: int,
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
+    dtype: Optional[torch.dtype] = None,
+    partial_rotary_factor: float = 1.0,
+    device: Optional[str] = None,
+):
+    if device != "cpu":
+        wrapper = aiter_get_rope if _use_aiter else get_rope
+        return wrapper(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_style,
+            rope_scaling,
+            dtype,
+            partial_rotary_factor,
+        )
+
+    return get_rope_cpu(
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        rope_scaling,
+        dtype,
+        partial_rotary_factor,
+        device,
+    )
diff --git a/python/sglang/srt/layers/sampler.py b/python/sglang/srt/layers/sampler.py
new file mode 100644
index 000000000..56a831f2d
--- /dev/null
+++ b/python/sglang/srt/layers/sampler.py
@@ -0,0 +1,294 @@
+import logging
+from typing import List
+
+import torch
+import torch.distributed as dist
+from torch import nn
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_group,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.utils import crash_on_warnings, get_bool_env_var, is_cuda
+
+if is_cuda():
+    from sgl_kernel import (
+        min_p_sampling_from_probs,
+        top_k_renorm_prob,
+        top_k_top_p_sampling_from_probs,
+        top_p_renorm_prob,
+    )
+
+
+logger = logging.getLogger(__name__)
+
+SYNC_TOKEN_IDS_ACROSS_TP = get_bool_env_var("SYNC_TOKEN_IDS_ACROSS_TP")
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
+
+
+class Sampler(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
+        self.tp_sync_group = get_tp_group().device_group
+
+        if is_dp_attention_enabled():
+            self.tp_sync_group = get_attention_tp_group().device_group
+
+    def forward(
+        self,
+        logits_output: LogitsProcessorOutput,
+        sampling_info: SamplingBatchInfo,
+        return_logprob: bool,
+        top_logprobs_nums: List[int],
+        token_ids_logprobs: List[List[int]],
+    ):
+        """Run a sampler & compute logprobs and update logits_output accordingly.
+
+        Args:
+            logits_output: The logits from the model forward
+            sampling_info: Metadata for sampling
+            return_logprob: If set, store the output logprob information to
+                logits_output
+            top_logprobs_nums: Number of top lobprobs per sequence in a batch
+            batch_next_token_ids: next token IDs. If set, skip sampling and only
+                compute output logprobs It is used for speculative decoding which
+                performs sampling in draft workers.
+        """
+        logits = logits_output.next_token_logits
+
+        # Apply the custom logit processors if registered in the sampling info.
+        if sampling_info.has_custom_logit_processor:
+            apply_custom_logit_processor(logits, sampling_info)
+
+        if self.use_nan_detection and torch.any(torch.isnan(logits)):
+            logger.warning("Detected errors during sampling! NaN in the logits.")
+            logits = torch.where(
+                torch.isnan(logits), torch.full_like(logits, -1e5), logits
+            )
+            if crash_on_warnings():
+                raise ValueError("Detected errors during sampling! NaN in the logits.")
+
+        if sampling_info.is_all_greedy:
+            # Use torch.argmax if all requests use greedy sampling
+            batch_next_token_ids = torch.argmax(logits, -1)
+            if return_logprob:
+                logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+
+        else:
+            # Post process original logits. if temperatures are all 1.0, no need to rescale
+            if return_logprob and RETURN_ORIGINAL_LOGPROB:
+                logprobs = torch.softmax(logits, dim=-1)
+
+            # Post process logits
+            logits.div_(sampling_info.temperatures)
+            logits[:] = torch.softmax(logits, dim=-1)
+            probs = logits
+            del logits
+
+            if True:  # Keep this redundant check to simplify some internal code sync
+                if global_server_args_dict["sampling_backend"] == "flashinfer":
+                    if sampling_info.need_min_p_sampling:
+                        probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                        probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                        batch_next_token_ids = min_p_sampling_from_probs(
+                            probs, sampling_info.min_ps
+                        )
+                    else:
+                        batch_next_token_ids = top_k_top_p_sampling_from_probs(
+                            probs.contiguous(),
+                            sampling_info.top_ks,
+                            sampling_info.top_ps,
+                            filter_apply_order="joint",
+                            check_nan=self.use_nan_detection,
+                        )
+                elif global_server_args_dict["sampling_backend"] == "pytorch":
+                    # A slower fallback implementation with torch native operations.
+                    batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
+                        probs,
+                        sampling_info.top_ks,
+                        sampling_info.top_ps,
+                        sampling_info.min_ps,
+                        sampling_info.need_min_p_sampling,
+                    )
+                else:
+                    raise ValueError(
+                        f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
+                    )
+
+            if return_logprob:
+                # clamp to avoid -inf
+                if RETURN_ORIGINAL_LOGPROB:
+                    logprobs = torch.log(logprobs).clamp(
+                        min=torch.finfo(logprobs.dtype).min
+                    )
+                else:
+                    logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
+
+        # Attach logprobs to logits_output (in-place modification)
+        if return_logprob:
+            if any(x > 0 for x in top_logprobs_nums):
+                (
+                    logits_output.next_token_top_logprobs_val,
+                    logits_output.next_token_top_logprobs_idx,
+                ) = get_top_logprobs(logprobs, top_logprobs_nums)
+
+            if any(x is not None for x in token_ids_logprobs):
+                (
+                    logits_output.next_token_token_ids_logprobs_val,
+                    logits_output.next_token_token_ids_logprobs_idx,
+                ) = get_token_ids_logprobs(logprobs, token_ids_logprobs)
+
+            logits_output.next_token_logprobs = logprobs[
+                torch.arange(len(batch_next_token_ids), device=sampling_info.device),
+                batch_next_token_ids,
+            ]
+
+        if SYNC_TOKEN_IDS_ACROSS_TP or sampling_info.grammars:
+            # For performance reasons, SGLang does not sync the final token IDs across TP ranks by default.
+            # This saves one all-reduce, but the correctness of this approach depends on the determinism of several operators:
+            # the last all-reduce, the last lm_head matmul, and all sampling kernels.
+            # These kernels are deterministic in most cases, but there are some rare instances where they are not deterministic.
+            # In such cases, enable this env variable to prevent hanging due to TP ranks becoming desynchronized.
+            # When using xgrammar, this becomes more likely so we also do the sync when grammar is used.
+
+            torch.distributed.all_reduce(
+                batch_next_token_ids,
+                op=dist.ReduceOp.MIN,
+                group=self.tp_sync_group,
+            )
+
+        return batch_next_token_ids
+
+
+def top_k_top_p_min_p_sampling_from_probs_torch(
+    probs: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+    min_ps: torch.Tensor,
+    need_min_p_sampling: bool,
+):
+    """A top-k, top-p and min-p sampling implementation with native pytorch operations."""
+    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort[
+        torch.arange(0, probs.shape[-1], device=probs.device).view(1, -1)
+        >= top_ks.view(-1, 1)
+    ] = 0.0
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+
+    if need_min_p_sampling:
+        min_p_thresholds = probs_sort[:, 0] * min_ps
+        probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
+
+    sampled_index = torch.multinomial(probs_sort, num_samples=1)
+    # int32 range is enough to represent the token ids
+    probs_idx = probs_idx.to(torch.int32)
+    batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
+    return batch_next_token_ids
+
+
+def sampling_from_probs_torch(probs: torch.Tensor):
+    """A sampling implementation with native pytorch operations, without
+    top-k, top-p, or min-p filtering."""
+    sampled_index = torch.multinomial(probs, num_samples=1)
+    batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+    return batch_next_token_ids
+
+
+def top_p_normalize_probs_torch(
+    probs: torch.Tensor,
+    top_ps: torch.Tensor,
+):
+    # See also top_k_top_p_min_p_sampling_from_probs_torch
+    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
+
+
+def get_top_logprobs(
+    logprobs: torch.Tensor,
+    top_logprobs_nums: List[int],
+):
+    max_k = max(top_logprobs_nums)
+    ret = logprobs.topk(max_k, dim=1)
+    values = ret.values.tolist()
+    indices = ret.indices.tolist()
+
+    output_top_logprobs_val = []
+    output_top_logprobs_idx = []
+    for i, k in enumerate(top_logprobs_nums):
+        output_top_logprobs_val.append(values[i][:k])
+        output_top_logprobs_idx.append(indices[i][:k])
+
+    return (
+        output_top_logprobs_val,
+        output_top_logprobs_idx,
+    )
+
+
+def get_token_ids_logprobs(
+    logprobs: torch.Tensor,
+    token_ids_logprobs: List[List[int]],
+):
+    output_token_ids_logprobs_val = []
+    output_token_ids_logprobs_idx = []
+    for i, token_ids in enumerate(token_ids_logprobs):
+        if token_ids is not None:
+            output_token_ids_logprobs_val.append(logprobs[i, token_ids].tolist())
+            output_token_ids_logprobs_idx.append(token_ids)
+        else:
+            output_token_ids_logprobs_val.append([])
+            output_token_ids_logprobs_idx.append([])
+
+    return (
+        output_token_ids_logprobs_val,
+        output_token_ids_logprobs_idx,
+    )
+
+
+def apply_custom_logit_processor(
+    logits: torch.Tensor,
+    sampling_batch_info: SamplingBatchInfo,
+    num_tokens_in_batch: int = 1,
+):
+    """Apply custom logit processors to the logits.
+    This function will modify the logits in-place.
+    num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+    tokens. By default, we assume each batch contains only 1 token.
+    """
+
+    assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+        f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+        f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+        f"({num_tokens_in_batch})"
+    )
+
+    for _, (
+        processor,
+        batch_mask,
+    ) in sampling_batch_info.custom_logit_processor.items():
+        # Get the batch indices that need to be processed
+        batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+        assert batch_mask.shape[0] == len(sampling_batch_info), (
+            f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+            f"sampling_batch_info ({len(sampling_batch_info)})"
+        )
+        batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+        # Apply the processor to the logits
+        logits[batch_mask] = processor(
+            logits[batch_mask],
+            [sampling_batch_info.custom_params[i] for i in batch_indices],
+        )
+
+        logger.debug(
+            f"Custom logit processor {processor.__class__.__name__} is applied."
+        )
diff --git a/python/sglang/srt/layers/torchao_utils.py b/python/sglang/srt/layers/torchao_utils.py
new file mode 100644
index 000000000..e08abd5ae
--- /dev/null
+++ b/python/sglang/srt/layers/torchao_utils.py
@@ -0,0 +1,124 @@
+"""
+Common utilities for torchao.
+"""
+
+import logging
+import os
+import pwd
+from typing import Callable, Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def get_gemlite_cache_path() -> str:
+    return f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
+
+
+def save_gemlite_cache(print_error: bool = False) -> bool:
+    try:
+        from gemlite.core import GemLiteLinearTriton
+
+        GemLiteLinearTriton.cache_config(get_gemlite_cache_path())
+    except Exception:
+        if print_error:
+            logger.error("Failed to save the GemLite cache.")
+        return False
+    return True
+
+
+def proj_filter(
+    module: torch.nn.Module,
+    fqn: str,
+):
+    """Filter function for quantizing projection layers."""
+    return "proj" in fqn
+
+
+def apply_torchao_config_to_model(
+    model: torch.nn.Module,
+    torchao_config: str,
+    filter_fn: Optional[Callable] = proj_filter,
+):
+    """Quantize a modelwith torchao quantization specified by torchao_config
+
+    Args:
+       `model`: a model to be quantized based on torchao_config
+       `torchao_config` (str): type of quantization and their arguments we want to use to
+        quantize the model, e.g. int4wo-128 means int4 weight only quantization with group_size
+        128
+    """
+    # Lazy import to suppress some warnings
+    from torchao.quantization import (
+        float8_dynamic_activation_float8_weight,
+        float8_weight_only,
+        int4_weight_only,
+        int8_dynamic_activation_int8_weight,
+        int8_weight_only,
+        quantize_,
+    )
+    from torchao.quantization.observer import PerRow, PerTensor
+
+    if torchao_config == "" or torchao_config is None:
+        return model
+    elif "int8wo" in torchao_config:
+        quantize_(model, int8_weight_only(), filter_fn=filter_fn)
+    elif "int8dq" in torchao_config:
+        quantize_(model, int8_dynamic_activation_int8_weight(), filter_fn=filter_fn)
+    elif "int4wo" in torchao_config:
+        group_size = int(torchao_config.split("-")[-1])
+        assert group_size in [
+            32,
+            64,
+            128,
+            256,
+        ], f"int4wo groupsize needs to be one of [32, 64, 128, 256] but got {group_size}"
+        quantize_(model, int4_weight_only(group_size=group_size), filter_fn=filter_fn)
+    elif "gemlite" in torchao_config:
+        # gemlite-<packing_bitwidth>-<bit_width>-<group_size> or
+        # gemlite-<bit_width>-<group_size> (packing_bitwidth defaults to 32)
+        from gemlite.core import GemLiteLinearTriton
+        from torchao.quantization import gemlite_uintx_weight_only
+
+        _quant_args = torchao_config.split("-")
+        bit_width = int(_quant_args[-2])
+        group_size = None if _quant_args[-1] == "None" else int(_quant_args[-1])
+
+        try:
+            packing_bitwidth = int(_quant_args[-3])
+        except (ValueError, IndexError):
+            # if only 2 inputs found or conversion fails, use default value
+            packing_bitwidth = 32
+
+        quantize_(
+            model, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth)
+        )
+
+        # try to load gemlite kernel config
+        GemLiteLinearTriton.load_config(get_gemlite_cache_path())
+
+    elif "fp8wo" in torchao_config:
+        # this requires newer hardware
+        # [rank0]: AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
+        quantize_(model, float8_weight_only(), filter_fn=filter_fn)
+    elif "fp8dq" in torchao_config:
+        granularity = torchao_config.split("-")[-1]
+        GRANULARITY_MAP = {
+            "per_row": PerRow(),
+            "per_tensor": PerTensor(),
+        }
+        assert (
+            granularity in GRANULARITY_MAP
+        ), f"Supported granularity are: {GRANULARITY_MAP.keys()}, got {granularity}"
+        quantize_(
+            model,
+            float8_dynamic_activation_float8_weight(
+                granularity=GRANULARITY_MAP[granularity]
+            ),
+            filter_fn=filter_fn,
+        )
+    else:
+        raise ValueError(f"Unexpected config: {torchao_config}")
+
+    return model
diff --git a/python/sglang/srt/layers/utils.py b/python/sglang/srt/layers/utils.py
new file mode 100644
index 000000000..d79ccc663
--- /dev/null
+++ b/python/sglang/srt/layers/utils.py
@@ -0,0 +1,36 @@
+import logging
+import re
+from functools import lru_cache
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def get_layer_id(weight_name):
+    # example weight name: model.layers.10.self_attn.qkv_proj.weight
+    match = re.search(r"layers\.(\d+)\.", weight_name)
+    if match:
+        return int(match.group(1))
+    return None
+
+
+class PPMissingLayer(torch.nn.Identity):
+    # Adapted from
+    # https://github.com/vllm-project/vllm/blob/18ed3132d2bfe1df9a74729457b69243955221e8/vllm/model_executor/models/utils.py#L468C1-L486C1
+    """
+    A placeholder layer for missing layers in a pipeline parallel model.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.return_tuple = kwargs.get("return_tuple", False)
+
+    def forward(self, *args, **kwargs):
+        """
+        Return the first arg from args or the first value from kwargs.
+
+        Wraps the input in a tuple if `self.return_tuple` is True.
+        """
+        input = args[0] if args else next(iter(kwargs.values()))
+        return (input,) if self.return_tuple else input
diff --git a/python/sglang/srt/layers/vocab_parallel_embedding.py b/python/sglang/srt/layers/vocab_parallel_embedding.py
new file mode 100644
index 000000000..66abb7541
--- /dev/null
+++ b/python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -0,0 +1,571 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/layers/vocab_parallel_embedding.py
+
+import logging
+from dataclasses import dataclass
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+from sglang.srt.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    parallel_state,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.layers.amx_utils import PackWeightMethod
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.parameter import BasevLLMParameter
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+    method_has_implemented_embedding,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_compiler_backend,
+    is_cpu,
+    set_weight_attrs,
+)
+
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+
+logger = logging.getLogger(__name__)
+
+
+def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+    """Pad the vocab size to the given value."""
+    return ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+
+def vocab_range_from_per_partition_vocab_size(
+    per_partition_vocab_size: int, rank: int, offset: int = 0
+) -> Sequence[int]:
+    index_f = rank * per_partition_vocab_size
+    index_l = index_f + per_partition_vocab_size
+    return index_f + offset, index_l + offset
+
+
+def vocab_range_from_global_vocab_size(
+    global_vocab_size: int, rank: int, world_size: int, offset: int = 0
+) -> Sequence[int]:
+    per_partition_vocab_size = divide(global_vocab_size, world_size)
+    return vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size, rank, offset=offset
+    )
+
+
+@dataclass
+class VocabParallelEmbeddingShardIndices:
+    """Indices for a shard of a vocab parallel embedding."""
+
+    padded_org_vocab_start_index: int
+    padded_org_vocab_end_index: int
+    padded_added_vocab_start_index: int
+    padded_added_vocab_end_index: int
+
+    org_vocab_start_index: int
+    org_vocab_end_index: int
+    added_vocab_start_index: int
+    added_vocab_end_index: int
+
+    @property
+    def num_org_elements(self) -> int:
+        return self.org_vocab_end_index - self.org_vocab_start_index
+
+    @property
+    def num_added_elements(self) -> int:
+        return self.added_vocab_end_index - self.added_vocab_start_index
+
+    @property
+    def num_org_elements_padded(self) -> int:
+        return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index
+
+    @property
+    def num_added_elements_padded(self) -> int:
+        return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index
+
+    @property
+    def num_org_vocab_padding(self) -> int:
+        return self.num_org_elements_padded - self.num_org_elements
+
+    @property
+    def num_added_vocab_padding(self) -> int:
+        return self.num_added_elements_padded - self.num_added_elements
+
+    @property
+    def num_elements_padded(self) -> int:
+        return self.num_org_elements_padded + self.num_added_elements_padded
+
+    def __post_init__(self):
+        # sanity checks
+        assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index
+        assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index
+
+        assert self.org_vocab_start_index <= self.org_vocab_end_index
+        assert self.added_vocab_start_index <= self.added_vocab_end_index
+
+        assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
+        assert self.added_vocab_start_index <= self.padded_added_vocab_start_index
+        assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
+        assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
+
+        assert self.num_org_elements <= self.num_org_elements_padded
+        assert self.num_added_elements <= self.num_added_elements_padded
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def get_masked_input_and_mask(
+    input_: torch.Tensor,
+    org_vocab_start_index: int,
+    org_vocab_end_index: int,
+    num_org_vocab_padding: int,
+    added_vocab_start_index: int,
+    added_vocab_end_index: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # torch.compile will fuse all of the pointwise ops below
+    # into a single kernel, making it very fast
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
+    added_vocab_mask = (input_ >= added_vocab_start_index) & (
+        input_ < added_vocab_end_index
+    )
+    added_offset = (
+        added_vocab_start_index
+        - (org_vocab_end_index - org_vocab_start_index)
+        - num_org_vocab_padding
+    )
+    valid_offset = (org_vocab_start_index * org_vocab_mask) + (
+        added_offset * added_vocab_mask
+    )
+    vocab_mask = org_vocab_mask | added_vocab_mask
+    input_ = vocab_mask * (input_ - valid_offset)
+    return input_, ~vocab_mask
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
+    make sure it is divisible by the number of model parallel GPUs.
+
+    In order to support various loading methods, we ensure that LoRA-added
+    embeddings are always at the end of TP-sharded tensors. In other words,
+    we shard base embeddings and LoRA embeddings separately (both padded),
+    and place them in the same tensor.
+    In this example, we will have the original vocab size = 1010,
+    added vocab size = 16 and padding to 64. Therefore, the total
+    vocab size with padding will be 1088 (because we first pad 1010 to
+    1024, add 16, and then pad to 1088).
+    Therefore, the tensor format looks like the following:
+    TP1, rank 0 (no sharding):
+                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
+    corresponding token_id: |  0  |  1  | ... | 1009 |  -1  | ... |  -1  | 1010 | ... | 1015 |  -1  | ... |  -1  |
+                     index: |  0  |  1  | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |
+
+    TP2, rank 0:
+                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
+    corresponding token_id: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 1000 | ... | 1015 |  -1  | ... |  -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 527  |  520 | ... | 543 |
+    TP2, rank 1:
+                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
+    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1  | ...  | -1  |  -1  | ... |  -1  | -1  | ... |   -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 519  | 520 | ... |  543 |
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+        quant_config: quant config for the layer
+        prefix: full name of the layer in the state dict
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        *,
+        params_dtype: Optional[torch.dtype] = None,
+        org_num_embeddings: Optional[int] = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        enable_tp: bool = True,
+        use_attn_tp_group: bool = False,
+        use_presharded_weights: bool = False,
+    ):
+        super().__init__()
+        self.quant_config = quant_config
+
+        self.enable_tp = enable_tp
+        if self.enable_tp:
+            if use_attn_tp_group:
+                tp_rank = get_attention_tp_rank()
+                self.tp_size = get_attention_tp_size()
+            else:
+                tp_rank = get_tensor_model_parallel_rank()
+                self.tp_size = get_tensor_model_parallel_world_size()
+        else:
+            assert use_attn_tp_group is False
+            tp_rank = 0
+            self.tp_size = 1
+
+        self.num_embeddings = num_embeddings
+        self.org_vocab_size = org_num_embeddings or num_embeddings
+
+        # Support the case where the vocab size is not divisible by the TP size.
+        if (
+            _is_cpu
+            and pad_vocab_size(self.org_vocab_size, padding_size) % self.tp_size != 0
+        ):
+            padding_size *= self.tp_size
+        self.padding_size = padding_size
+
+        num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.use_presharded_weights = use_presharded_weights
+        if use_presharded_weights:
+            assert (
+                num_added_embeddings == 0
+            ), "Lora is not supported with presharded weights."
+
+        self.org_vocab_size_padded = pad_vocab_size(
+            self.org_vocab_size, self.padding_size
+        )
+        self.num_embeddings_padded = pad_vocab_size(
+            self.org_vocab_size_padded + num_added_embeddings, self.padding_size
+        )
+        assert self.org_vocab_size_padded <= self.num_embeddings_padded
+
+        self.shard_indices = self._get_indices(
+            self.num_embeddings_padded,
+            self.org_vocab_size_padded,
+            self.num_embeddings,
+            self.org_vocab_size,
+            tp_rank,
+            self.tp_size,
+        )
+        self.embedding_dim = embedding_dim
+
+        quant_method = None
+        if quant_config is not None:
+            quant_method = quant_config.get_quant_method(self, prefix=prefix)
+        if quant_method is None:
+            quant_method = UnquantizedEmbeddingMethod()
+
+        # If we are making an embedding layer, then our quantization linear
+        # method must implement the embedding operation. If we are another
+        # layer type like ParallelLMHead, this is not important.
+        is_embedding_layer = type(self.__class__) is VocabParallelEmbedding
+        quant_method_implements_embedding = method_has_implemented_embedding(
+            type(quant_method)
+        )
+        if is_embedding_layer and not quant_method_implements_embedding:
+            raise NotImplementedError(
+                f"The class {type(quant_method).__name__} must implement "
+                "the 'embedding' method, see UnquantizedEmbeddingMethod."
+            )
+
+        self.quant_method: QuantizeMethodBase = quant_method
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        # Divide the weight matrix along the vocaburaly dimension.
+        self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
+        self.num_embeddings_per_partition = divide(
+            self.num_embeddings_padded, self.tp_size
+        )
+        assert (
+            self.shard_indices.num_elements_padded == self.num_embeddings_per_partition
+        )
+        self.num_org_embeddings_per_partition = (
+            self.shard_indices.org_vocab_end_index
+            - self.shard_indices.org_vocab_start_index
+        )
+        self.num_added_embeddings_per_partition = (
+            self.shard_indices.added_vocab_end_index
+            - self.shard_indices.added_vocab_start_index
+        )
+
+        self.quant_method.create_weights(
+            self,
+            self.embedding_dim,
+            [self.num_embeddings_per_partition],
+            self.embedding_dim,
+            self.num_embeddings_padded,
+            params_dtype=params_dtype,
+            weight_loader=self.weight_loader,
+        )
+
+    @classmethod
+    def _get_indices(
+        cls,
+        vocab_size_padded: int,
+        org_vocab_size_padded: int,
+        vocab_size: int,
+        org_vocab_size: int,
+        tp_rank: int,
+        tp_size: int,
+    ) -> VocabParallelEmbeddingShardIndices:
+        """Get start and end indices for vocab parallel embedding, following the
+        layout outlined in the class docstring, based on the given tp_rank and
+        tp_size."""
+        num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded
+        padded_org_vocab_start_index, padded_org_vocab_end_index = (
+            vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size)
+        )
+        padded_added_vocab_start_index, padded_added_vocab_end_index = (
+            vocab_range_from_global_vocab_size(
+                num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size
+            )
+        )
+        # remove padding
+        org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size)
+        org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size)
+        added_vocab_start_index = min(padded_added_vocab_start_index, vocab_size)
+        added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size)
+        return VocabParallelEmbeddingShardIndices(
+            padded_org_vocab_start_index,
+            padded_org_vocab_end_index,
+            padded_added_vocab_start_index,
+            padded_added_vocab_end_index,
+            org_vocab_start_index,
+            org_vocab_end_index,
+            added_vocab_start_index,
+            added_vocab_end_index,
+        )
+
+    def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
+        """Get a mapping that can be used to reindex the gathered
+        logits for sampling.
+
+        During sampling, we gather logits from all ranks. The relationship
+        of index->token_id will follow the same format as outlined in the class
+        docstring. However, after the gather, we want to reindex the final
+        logits tensor to map index->token_id one-to-one (the index is always
+        equal the token_id it corresponds to). The indices returned by this
+        method allow us to do that.
+        """
+        if self.tp_size < 2:
+            return None
+
+        base_embeddings: List[int] = []
+        added_embeddings: List[int] = []
+        padding: List[int] = []
+        for tp_rank in range(self.tp_size):
+            shard_indices = self._get_indices(
+                self.num_embeddings_padded,
+                self.org_vocab_size_padded,
+                self.num_embeddings,
+                self.org_vocab_size,
+                tp_rank,
+                self.tp_size,
+            )
+            range_start = self.num_embeddings_per_partition * tp_rank
+            range_end = self.num_embeddings_per_partition * (tp_rank + 1)
+            base_embeddings.extend(
+                range(range_start, range_start + shard_indices.num_org_elements)
+            )
+            padding.extend(
+                range(
+                    range_start + shard_indices.num_org_elements,
+                    range_start + shard_indices.num_org_elements_padded,
+                )
+            )
+            added_embeddings.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded,
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements,
+                )
+            )
+            padding.extend(
+                range(
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements,
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements_padded,
+                )
+            )
+            assert (
+                range_start
+                + shard_indices.num_org_elements_padded
+                + shard_indices.num_added_elements_padded
+                == range_end
+            )
+        ret = base_embeddings + added_embeddings + padding
+        assert len(ret) == self.num_embeddings_padded
+        return ret
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        output_dim = getattr(param, "output_dim", None)
+        packed_dim = getattr(param, "packed_dim", None)
+
+        # If the parameter is a gguf weight, then load it directly.
+        if getattr(param, "is_gguf_weight_type", None):
+            param.data.copy_(loaded_weight)
+            param.weight_type = loaded_weight.item()
+            return
+        elif isinstance(param, UninitializedParameter):
+            shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                shape[output_dim] = shape[output_dim] // self.tp_size
+            param.materialize(tuple(shape), dtype=loaded_weight.dtype)
+
+        # If parameter does not have output dim, then it should
+        # be copied onto all gpus (e.g. g_idx for act_order gptq).
+        if output_dim is None:
+            assert param.data.shape == loaded_weight.shape
+            param.data.copy_(loaded_weight)
+            return
+
+        # Shard indexes for loading the weight
+        start_idx = self.shard_indices.org_vocab_start_index
+        shard_size = self.shard_indices.org_vocab_end_index - start_idx
+
+        # If param packed on the same dim we are sharding on, then
+        # need to adjust offsets of loaded weight by pack_factor.
+        if packed_dim is not None and packed_dim == output_dim:
+            packed_factor = (
+                param.packed_factor
+                if isinstance(param, BasevLLMParameter)
+                else param.packed_factor
+            )
+            assert loaded_weight.shape[output_dim] == (
+                self.org_vocab_size // param.packed_factor
+            )
+            start_idx = start_idx // packed_factor
+            shard_size = shard_size // packed_factor
+        else:
+            assert loaded_weight.shape[output_dim] == (
+                self.org_vocab_size
+                // (self.tp_size if self.use_presharded_weights else 1)
+            ), f"{self.org_vocab_size=} {self.use_presharded_weights=} {loaded_weight.shape[output_dim]=}"
+
+        # Copy the data.
+        if not self.use_presharded_weights:
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+        param[: loaded_weight.shape[0]].data.copy_(loaded_weight)
+        param[loaded_weight.shape[0] :].data.fill_(0)
+
+    def forward(self, input_):
+        if self.tp_size > 1:
+            # Build the mask.
+            masked_input, input_mask = get_masked_input_and_mask(
+                input_,
+                self.shard_indices.org_vocab_start_index,
+                self.shard_indices.org_vocab_end_index,
+                self.shard_indices.num_org_vocab_padding,
+                self.shard_indices.added_vocab_start_index,
+                self.shard_indices.added_vocab_end_index,
+            )
+        else:
+            masked_input = input_
+        # Get the embeddings.
+        with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
+            output_parallel = self.quant_method.embedding(self, masked_input.long())
+            sm.tag(output_parallel)
+        # Mask the output embedding.
+        if self.tp_size > 1:
+            output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
+            # Reduce across all the model parallel GPUs.
+            output = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output = output_parallel
+        return output
+
+    def extra_repr(self) -> str:
+        s = f"num_embeddings={self.num_embeddings_per_partition}"
+        s += f", embedding_dim={self.embedding_dim}"
+        s += f", org_vocab_size={self.org_vocab_size}"
+        s += f", num_embeddings_padded={self.num_embeddings_padded}"
+        if self.enable_tp:
+            s += f", tp_size={self.tp_size}"
+        return s
+
+
+class ParallelLMHead(VocabParallelEmbedding):
+    """Parallelized LM head.
+
+    Output logits weight matrices used in the Sampler. The weight and bias
+    tensors are padded to make sure they are divisible by the number of
+    model parallel GPUs.
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        bias: whether to use bias.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        *,
+        bias: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        org_num_embeddings: Optional[int] = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_attn_tp_group: bool = False,
+        use_presharded_weights: bool = False,
+    ):
+        super().__init__(
+            num_embeddings,
+            embedding_dim,
+            params_dtype=params_dtype,
+            org_num_embeddings=org_num_embeddings,
+            padding_size=padding_size,
+            quant_config=quant_config,
+            prefix=prefix,
+            use_attn_tp_group=use_attn_tp_group,
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.quant_config = quant_config
+
+        # We only support pack LMHead if it's not quantized.
+        if _is_cpu and _is_cpu_amx_available:
+            if hasattr(self, "weight") and self.weight.dtype == torch.bfloat16:
+                self.quant_method = PackWeightMethod(weight_names=["weight"])
+
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def tie_weights(self, embed_tokens: VocabParallelEmbedding):
+        """Tie the weights with word embeddings."""
+        # GGUF quantized embed_tokens.
+        if self.quant_config and self.quant_config.get_name() == "gguf":
+            return embed_tokens
+        else:
+            self.weight = embed_tokens.weight
+            return self
+
+    def forward(self, input_):
+        del input_
+        raise RuntimeError("LMHead's weights should be used in the sampler.")
diff --git a/python/sglang/srt/lora/backend/base_backend.py b/python/sglang/srt/lora/backend/base_backend.py
new file mode 100644
index 000000000..7c2c232d5
--- /dev/null
+++ b/python/sglang/srt/lora/backend/base_backend.py
@@ -0,0 +1,155 @@
+from typing import Optional, Tuple, Union
+
+import torch
+
+from sglang.srt.lora.utils import LoRABatchInfo
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+class BaseLoRABackend:
+    """Base class for different Lora backends.
+       Each backend has its own implementation of Lora kernels.
+
+    Args:
+        max_loras_per_batch: maximum number of different lora weights
+                             that can be applied in a single forward batch.
+        device: the device where the backend runs.
+    """
+
+    def __init__(self, max_loras_per_batch: int, device: torch.device):
+        self.max_loras_per_batch = max_loras_per_batch
+        self.device = device
+
+    def run_lora_a_sgemm(
+        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
+    ) -> torch.Tensor:
+        """Run segment Gemm of lora a modules with current backend.
+        The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html.
+
+        Args:
+             x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
+             weights: a set of lora weights with shape (num_lora, c * r, input_dim),
+                      here r is lora rank, c is a multiplier for stacked modules (e.g., c=3 for qkv_proj, c=2 for gate_up_proj)
+                      usually input_dim is much larger than r
+        Returns:
+             result with shape (s, c * r)
+        """
+        pass
+
+    def run_lora_b_sgemm(
+        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
+    ) -> torch.Tensor:
+        """Run segment Gemm of lora b modules with current backend.
+        The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html.
+
+        Args:
+             x: input matrix with shape (s, r), here s is the sum of all sequence lengths, r is lora rank
+             weights: a set of lora weights with shape (num_lora, output_dim, r)
+                      usually output_dim is much larger than r
+        Returns:
+             result with shape (s, output_dim)
+        """
+        pass
+
+    def run_qkv_lora(
+        self,
+        x: torch.Tensor,
+        qkv_lora_a: torch.Tensor,
+        qkv_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Run the lora pass for QKV Layer.
+
+        Args:
+            x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
+            qkv_lora_a: lora_a module for qkv, with shape (num_lora, 3 * r, input_dim)
+            qkv_lora_b: lora_b module for qkv.
+                        If passed in as a tensor, its shape should be (num_lora,output_dim_q + 2 * output_dim_kv, r)
+                        If passed in as a tuple of two tensors, it should contain:
+                           a lora_b module for q, with shape (1, num_lora, output_dim_q, r)
+                           and a combined lora_b module for kv, with shape (2, num_lora, output_dim_kv, r)
+        Returns:
+            result with shape (s, output_dim_q + 2 * output_dim_kv)
+        """
+        pass
+
+    def run_gate_up_lora(
+        self,
+        x: torch.Tensor,
+        gate_up_lora_a: torch.Tensor,
+        gate_up_lora_b: Union[torch.Tensor, Tuple[torch.Tensor]],
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Run the lora pass for gate_up_proj, usually attached to MergedColumnParallelLayer.
+
+        Args:
+            x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
+            gate_up_lora_a: lora_a module for gate_up_proj, with shape (num_lora, 2 * r, input_dim)
+            gate_up_lora_b: lora_b module for qkv.
+                        If passed in as a tensor, its shape should be (num_lora, 2 * output_dim, r)
+                        If passed in as a tuple, it should contain two tensors with shape (num_lora, output_dim, r)
+        Returns:
+            result with shape (s, 2 * output_dim)
+        """
+        pass
+
+    def init_cuda_graph_batch_info(
+        self,
+        cuda_graph_batch_info: LoRABatchInfo,
+        max_bs_in_cuda_graph: int,
+    ):
+        """Initialize the batch info for CUDA Graph mode.
+
+        This method provides a hook for each backend to conduct its own initialization
+        logic for CUDA Graph mode.
+
+        Args:
+            cuda_graph_batch_info: the LoRABatchInfo object created in LoraManager
+            max_bs_in_cuda_graph: maximum batch size for CUDA Graph mode
+        """
+        pass
+
+    def prepare_lora_batch(
+        self,
+        forward_batch: ForwardBatch,
+        weight_indices: list[int],
+        lora_ranks: list[int],
+        scalings: list[float],
+        batch_info: Optional[LoRABatchInfo] = None,
+    ):
+        """Prepare the lora weights and batch info for current forward batch.
+
+        This method provides a hook for each backend to conduct its own preparation
+        logic for each forward batch.
+
+        Args:
+            forward_batch: the ForwardBatch object for current forward pass
+            weight_indices: list of indices of lora weights to be applied for current batch
+            lora_ranks: list of lora ranks corresponding to weight_indices
+            scalings: list of scaling factors corresponding to weight_indices
+            batch_info: optional LoRABatchInfo object, if not provided, the backend should use its own
+                        internal batch info (e.g., self.cuda_graph_batch_info for CUDA Graph mode)
+        """
+        pass
+
+
+def get_backend_from_name(name: str) -> BaseLoRABackend:
+    """
+    Get corresponding backend class from backend's name
+    """
+    if name == "triton":
+        from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
+
+        return TritonLoRABackend
+    # elif name == "csgmv":
+    #     from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
+
+    #     return ChunkedSgmvLoRABackend
+    elif name == "flashinfer":
+        raise ValueError(
+            "FlashInfer LoRA backend has been deprecated, please use `triton` instead."
+        )
+    else:
+        raise ValueError(f"Invalid backend: {name}")
diff --git a/python/sglang/srt/lora/backend/triton_backend.py b/python/sglang/srt/lora/backend/triton_backend.py
new file mode 100644
index 000000000..7abeef770
--- /dev/null
+++ b/python/sglang/srt/lora/backend/triton_backend.py
@@ -0,0 +1,176 @@
+from typing import Optional
+
+import torch
+
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
+from sglang.srt.lora.triton_ops import (
+    gate_up_lora_b_fwd,
+    qkv_lora_b_fwd,
+    sgemm_lora_a_fwd,
+    sgemm_lora_b_fwd,
+)
+from sglang.srt.lora.utils import LoRABatchInfo
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+class TritonLoRABackend(BaseLoRABackend):
+    name = "triton"
+
+    def __init__(self, max_loras_per_batch: int, device: torch.device):
+        super().__init__(max_loras_per_batch, device)
+
+    def run_lora_a_sgemm(
+        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
+    ) -> torch.Tensor:
+        return sgemm_lora_a_fwd(x, weights, self.batch_info)
+
+    def run_lora_b_sgemm(
+        self,
+        x: torch.Tensor,
+        weights: torch.Tensor,
+        base_output: torch.Tensor = None,
+        *args,
+        **kwargs
+    ) -> torch.Tensor:
+        return sgemm_lora_b_fwd(x, weights, self.batch_info, base_output)
+
+    def run_qkv_lora(
+        self,
+        x: torch.Tensor,
+        qkv_lora_a: torch.Tensor,
+        qkv_lora_b: torch.Tensor,
+        output_offset: torch.Tensor,
+        max_qkv_out_dim: int,
+        base_output: torch.Tensor = None,
+        *args,
+        **kwargs
+    ) -> torch.Tensor:
+
+        # x: (s, input_dim)
+        # qkv_lora_a: (num_lora, 3 * r, input_dim)
+        # qkv_lora_b: (num_lora, output_dim_q + 2 * output_dim_kv, r)
+        assert isinstance(qkv_lora_b, torch.Tensor)
+
+        lora_a_output = sgemm_lora_a_fwd(x, qkv_lora_a, self.batch_info, stack_num=3)
+        lora_output = qkv_lora_b_fwd(
+            lora_a_output,
+            qkv_lora_b,
+            self.batch_info,
+            output_offset,
+            max_qkv_out_dim,
+            base_output,
+        )
+        return lora_output
+
+    def run_gate_up_lora(
+        self,
+        x: torch.Tensor,
+        gate_up_lora_a: torch.Tensor,
+        gate_up_lora_b: torch.Tensor,
+        base_output: torch.Tensor = None,
+        *args,
+        **kwargs
+    ) -> torch.Tensor:
+
+        # x: (s, input_dim)
+        # gate_up_lora_a: (num_lora, 2 * r, input_dim)
+        # gate_up_lora_b: (num_lora, 2 * output_dim, r)
+        assert isinstance(gate_up_lora_b, torch.Tensor)
+        output_dim = gate_up_lora_b.shape[-2] // 2
+
+        # lora_a_output: (s, 2 * r)
+        lora_a_output = sgemm_lora_a_fwd(
+            x, gate_up_lora_a, self.batch_info, stack_num=2
+        )
+        lora_output = gate_up_lora_b_fwd(
+            lora_a_output,
+            gate_up_lora_b,
+            self.batch_info,
+            output_dim,
+            base_output,
+        )
+        return lora_output
+
+    def init_cuda_graph_batch_info(
+        self, cuda_graph_batch_info: LoRABatchInfo, max_bs_in_cuda_graph: int
+    ):
+        # Initialize seg_lens and seg_indptr for CUDA graph as they remain constant
+        # across batches.
+        cuda_graph_batch_info.seg_lens[:max_bs_in_cuda_graph].fill_(1)
+        torch.cumsum(
+            cuda_graph_batch_info.seg_lens[:max_bs_in_cuda_graph],
+            dim=0,
+            out=cuda_graph_batch_info.seg_indptr[1 : max_bs_in_cuda_graph + 1],
+        )
+
+    def prepare_lora_batch(
+        self,
+        forward_batch: ForwardBatch,
+        weight_indices: list[int],
+        lora_ranks: list[int],
+        scalings: list[float],
+        batch_info: Optional[LoRABatchInfo] = None,
+    ):
+        # Use pinned memory to avoid synchronizations during host-to-device transfer
+        weight_indices_tensor = torch.tensor(
+            weight_indices, dtype=torch.int32, pin_memory=True, device="cpu"
+        )
+        lora_ranks_tensor = torch.tensor(
+            lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
+        )
+        scalings_tensor = torch.tensor(
+            scalings, dtype=torch.float, pin_memory=True, device="cpu"
+        )
+
+        bs = forward_batch.batch_size
+
+        if batch_info is not None:
+            assert (
+                batch_info.use_cuda_graph
+            ), "batch_info.use_cuda_graph must be True when batch_info is provided"
+            batch_info.bs = forward_batch.batch_size
+            batch_info.num_segments = forward_batch.batch_size
+        else:
+            max_len = (
+                # Calculate max_len from the CPU copy to avoid D2H transfer.
+                max(forward_batch.extend_seq_lens_cpu)
+                if forward_batch.forward_mode.is_extend()
+                else 1
+            )
+            seg_lens = (
+                forward_batch.extend_seq_lens
+                if forward_batch.forward_mode.is_extend()
+                else torch.ones(bs, device=self.device)
+            )
+            seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
+            seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
+
+            batch_info = LoRABatchInfo(
+                bs=forward_batch.batch_size,
+                num_segments=forward_batch.batch_size,
+                max_len=max_len,
+                use_cuda_graph=False,
+                seg_lens=seg_lens,
+                seg_indptr=seg_indptr,
+                weight_indices=torch.empty(
+                    (bs,), dtype=torch.int32, device=self.device
+                ),
+                lora_ranks=torch.empty(
+                    (self.max_loras_per_batch,), dtype=torch.int64, device=self.device
+                ),
+                scalings=torch.empty(
+                    (self.max_loras_per_batch,), dtype=torch.float, device=self.device
+                ),
+                permutation=None,
+            )
+
+        # Copy to device asynchronously
+        batch_info.lora_ranks[: self.max_loras_per_batch].copy_(
+            lora_ranks_tensor, non_blocking=True
+        )
+        batch_info.scalings[: self.max_loras_per_batch].copy_(
+            scalings_tensor, non_blocking=True
+        )
+        batch_info.weight_indices[:bs].copy_(weight_indices_tensor, non_blocking=True)
+
+        self.batch_info = batch_info
diff --git a/python/sglang/srt/lora/layers.py b/python/sglang/srt/lora/layers.py
new file mode 100644
index 000000000..4426faccb
--- /dev/null
+++ b/python/sglang/srt/lora/layers.py
@@ -0,0 +1,352 @@
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
+
+
+class BaseLayerWithLoRA(nn.Module):
+    def __init__(
+        self,
+        base_layer: nn.Module,
+        lora_backend: BaseLoRABackend,
+    ):
+        super().__init__()
+        self.base_layer: nn.Module = base_layer
+        self.set_lora: bool = False
+        self.lora_backend: BaseLoRABackend = lora_backend
+
+    def forward(self, x: torch.Tensor):
+        return self.base_layer.forward(x)
+
+    def set_lora_info(self, *args):
+        pass
+
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        pass
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        pass
+
+
+class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
+    """
+    Vocab parallel embedding layer with support for LoRA (Low-Rank Adaptation).
+
+    Note: The current version does not yet implement the LoRA functionality.
+    This class behaves exactly the same as the base VocabParallelEmbedding.
+    Future versions will integrate LoRA functionality to support efficient parameter fine-tuning.
+    """
+
+    def __init__(
+        self,
+        base_layer: VocabParallelEmbedding,
+        lora_backend: BaseLoRABackend,
+    ) -> None:
+        super().__init__(base_layer, lora_backend)
+        self.weight = base_layer.weight
+
+
+class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA):
+    def __init__(
+        self,
+        base_layer: ColumnParallelLinear,
+        lora_backend: BaseLoRABackend,
+    ) -> None:
+        super().__init__(base_layer, lora_backend)
+        shard_size = self.base_layer.output_partition_sizes[0]
+        self.output_offset = torch.tensor(
+            [
+                0,
+                shard_size,
+            ],
+            dtype=torch.int32,
+            device=next(self.base_layer.parameters()).device,
+        )
+
+    def set_lora_info(
+        self,
+        A_buffer: torch.Tensor,
+        B_buffer: torch.Tensor,
+    ):
+        self.set_lora = True
+        self.A_buffer = A_buffer
+        self.B_buffer = B_buffer
+
+    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+        lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
+        lora_output = self.lora_backend.run_lora_b_sgemm(
+            x=lora_a_output,
+            weights=self.B_buffer,
+            output_offset=self.output_offset,
+            base_output=base_output,
+        )
+        return lora_output
+
+    def forward(self, input_: torch.Tensor):
+        # duplicate the logic in ColumnParallelLinear
+        bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None
+        output_parallel = self.base_layer.quant_method.apply(
+            self.base_layer, input_, bias
+        )
+
+        if self.set_lora:
+            output_parallel = self.apply_lora(output_parallel, input_)
+
+        if self.base_layer.gather_output:
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
+        return output, output_bias
+
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        return A
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        shard_size = self.base_layer.output_partition_sizes[0]
+        start_idx = tp_rank * shard_size
+        end_idx = (tp_rank + 1) * shard_size
+        B = B[start_idx:end_idx, :]
+        return B
+
+
+class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
+    def __init__(
+        self,
+        base_layer: MergedColumnParallelLinear,
+        lora_backend: BaseLoRABackend,
+    ) -> None:
+        super().__init__(base_layer, lora_backend)
+
+    def set_lora_info(
+        self,
+        A_buffer: torch.Tensor,
+        B_buffer: torch.Tensor,
+    ):
+        self.set_lora = True
+        self.A_buffer_gate_up = A_buffer
+        self.B_buffer_gate_up = B_buffer
+
+        shard_size = self.base_layer.output_partition_sizes[0]
+        self.output_offset = torch.tensor(
+            [
+                0,
+                shard_size,
+                2 * shard_size,
+            ],
+            dtype=torch.int32,
+            device=next(self.base_layer.parameters()).device,
+        )
+
+    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+        lora_output = self.lora_backend.run_gate_up_lora(
+            x=x,
+            gate_up_lora_a=self.A_buffer_gate_up,
+            gate_up_lora_b=self.B_buffer_gate_up,
+            output_offset=self.output_offset,
+            base_output=base_output,
+        )
+        return lora_output
+
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        return A
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        # Since the outputs for both gate and up are identical, we use a random one.
+        shard_size = self.base_layer.output_partition_sizes[0]
+        gate_size = self.base_layer.output_sizes[0]
+        start_idx = tp_rank * shard_size
+        end_idx = (tp_rank + 1) * shard_size
+        return torch.concat(
+            (
+                B[start_idx:end_idx, :],
+                B[gate_size + start_idx : gate_size + end_idx],
+            ),
+            dim=0,
+        )
+
+
+class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
+    def __init__(
+        self,
+        base_layer: QKVParallelLinear,
+        lora_backend: BaseLoRABackend,
+    ) -> None:
+        super().__init__(base_layer, lora_backend)
+        q_proj_shard_size = self.base_layer.q_proj_shard_size
+        kv_proj_shard_size = self.base_layer.kv_proj_shard_size
+        self.output_offset = torch.tensor(
+            [
+                0,
+                q_proj_shard_size,
+                q_proj_shard_size + kv_proj_shard_size,
+                q_proj_shard_size + 2 * kv_proj_shard_size,
+            ],
+            dtype=torch.int32,
+            device=next(self.base_layer.parameters()).device,
+        )
+
+        # For computing number of launched blocks
+        self.max_qkv_out_dim = max(q_proj_shard_size, kv_proj_shard_size)
+
+    def set_lora_info(
+        self,
+        A_buffer_qkv: torch.Tensor,
+        B_buffer_qkv: torch.Tensor,
+    ):
+        self.set_lora = True
+        self.A_buffer_qkv = A_buffer_qkv
+        self.B_buffer_qkv = B_buffer_qkv
+
+    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+        lora_output = self.lora_backend.run_qkv_lora(
+            x=x,
+            qkv_lora_a=self.A_buffer_qkv,
+            qkv_lora_b=self.B_buffer_qkv,
+            base_output=base_output,
+            output_offset=self.output_offset,
+            max_qkv_out_dim=self.max_qkv_out_dim,
+        )
+        return lora_output
+
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        return A
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int) -> torch.Tensor:
+        base_layer = self.base_layer
+        q_proj_shard_size = base_layer.q_proj_shard_size
+        kv_proj_shard_size = base_layer.kv_proj_shard_size
+        num_kv_head_replicas = base_layer.num_kv_head_replicas
+
+        q_start_idx = q_proj_shard_size * tp_rank
+        q_end_idx = q_start_idx + q_proj_shard_size
+
+        kv_shard_id = tp_rank // num_kv_head_replicas
+        kv_start_idx = kv_proj_shard_size * kv_shard_id
+        kv_end_idx = kv_start_idx + kv_proj_shard_size
+
+        q_size, k_size, _ = base_layer.output_sizes
+        B_q_shard = B[q_start_idx:q_end_idx, :]
+        B_k_shard = B[q_size + kv_start_idx : q_size + kv_end_idx, :]
+        B_v_shard = B[q_size + k_size + kv_start_idx : q_size + k_size + kv_end_idx, :]
+
+        return torch.concat(
+            (
+                B_q_shard,
+                B_k_shard,
+                B_v_shard,
+            ),
+            dim=0,
+        )
+
+
+class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
+    def __init__(
+        self,
+        base_layer: RowParallelLinear,
+        lora_backend: BaseLoRABackend,
+    ) -> None:
+        super().__init__(base_layer, lora_backend)
+
+    def set_lora_info(self, A_buffer: torch.Tensor, B_buffer: torch.Tensor):
+        self.set_lora = True
+        self.A_buffer = A_buffer
+        self.B_buffer = B_buffer
+        output_size = self.base_layer.output_size
+        self.output_offset = torch.tensor(
+            [
+                0,
+                output_size,
+            ],
+            dtype=torch.int32,
+            device=next(self.base_layer.parameters()).device,
+        )
+
+    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+        lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
+        lora_output = self.lora_backend.run_lora_b_sgemm(
+            x=lora_a_output,
+            weights=self.B_buffer,
+            output_offset=self.output_offset,
+            base_output=base_output,
+        )
+        return lora_output
+
+    def forward(self, input_: torch.Tensor, skip_all_reduce=False):
+        # duplicate the logic in RowParallelLinear
+        if self.base_layer.input_is_parallel:
+            input_parallel = input_
+        else:
+            tp_rank = get_tensor_model_parallel_rank()
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.base_layer.tp_size
+            )
+            input_parallel = splitted_input[tp_rank].contiguous()
+        output_parallel = self.base_layer.quant_method.apply(
+            self.base_layer, input_parallel
+        )
+
+        if self.set_lora:
+            output_parallel = self.apply_lora(output_parallel, input_parallel)
+
+        if (
+            self.base_layer.reduce_results
+            and self.base_layer.tp_size > 1
+            and not skip_all_reduce
+        ):
+            output_ = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output_ = output_parallel
+
+        if not self.base_layer.skip_bias_add:
+            output = (
+                output_ + self.base_layer.bias
+                if self.base_layer.bias is not None
+                else output_
+            )
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.base_layer.bias
+        return output, output_bias
+
+    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
+        shard_size = self.base_layer.input_size_per_partition
+        start_idx = tp_rank * shard_size
+        end_idx = (tp_rank + 1) * shard_size
+        A = A[:, start_idx:end_idx].contiguous()
+        return A
+
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
+        return B
+
+
+def get_lora_layer(
+    layer: nn.Module, lora_backend: BaseLoRABackend
+) -> BaseLayerWithLoRA:
+    supported_layer_types = {
+        # the order matters
+        VocabParallelEmbedding: VocabParallelEmbeddingWithLoRA,
+        QKVParallelLinear: QKVParallelLinearWithLoRA,
+        MergedColumnParallelLinear: MergedColumnParallelLinearWithLoRA,
+        ColumnParallelLinear: ColumnParallelLinearWithLoRA,
+        RowParallelLinear: RowParallelLinearWithLoRA,
+    }
+    for src_layer_type, lora_layer_type in supported_layer_types.items():
+        if isinstance(layer, src_layer_type):  # pylint: disable=unidiomatic-typecheck
+            ret = lora_layer_type(layer, lora_backend)
+            return ret
+    raise Exception(f"No corresponding LoRA layer supported for {type(layer)}.")
diff --git a/python/sglang/srt/lora/lora.py b/python/sglang/srt/lora/lora.py
new file mode 100644
index 000000000..e7569624c
--- /dev/null
+++ b/python/sglang/srt/lora/lora.py
@@ -0,0 +1,178 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Integrates "S-LoRA: Serving Thousands of Concurrent LoRA Adapters"
+# and "Punica: Multi-Tenant LoRA Serving"
+
+# LoRA layers class inheritance adapted from:
+# https://github.com/vllm-project/vllm/blob/4abf6336ec65c270343eb895e7b18786e9274176/vllm/lora/layers.py
+
+import logging
+import re
+from typing import Dict, List
+
+import torch
+from torch import nn
+
+from sglang.srt.configs.load_config import LoadConfig
+from sglang.srt.hf_transformers_utils import AutoConfig
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend
+
+# from sglang.srt.lora.backend.chunked_backend import ChunkedSgmvLoRABackend
+from sglang.srt.lora.backend.triton_backend import TritonLoRABackend
+from sglang.srt.lora.lora_config import LoRAConfig
+from sglang.srt.model_loader.loader import DefaultModelLoader
+
+logger = logging.getLogger(__name__)
+
+
+class LoRALayer(nn.Module):
+    def __init__(self, config: LoRAConfig, base_hf_config: AutoConfig):
+        super().__init__()
+        self.config: LoRAConfig = config
+        self.base_hf_config: AutoConfig = base_hf_config
+
+        # lora weights in cpu. The weights are loaded from checkpoint.
+        self.weights: Dict[str, torch.Tensor] = {}
+
+
+class LoRAAdapter(nn.Module):
+    def __init__(
+        self,
+        uid: str,
+        config: LoRAConfig,
+        base_hf_config: AutoConfig,
+        load_config: LoadConfig,
+        lora_backend: BaseLoRABackend,
+    ):
+        super().__init__()
+        self.uid: str = uid
+        self.config: LoRAConfig = config
+        assert self.config.hf_config["peft_type"].lower() == "lora"
+        self.base_hf_config: AutoConfig = base_hf_config
+        self.load_config: LoadConfig = load_config
+        self.lora_backend: BaseLoRABackend = lora_backend
+        self.scaling: float = self.config.lora_alpha / self.config.r
+
+        self.layers: List[LoRALayer] = nn.ModuleList(
+            [
+                LoRALayer(config, base_hf_config)
+                for _ in range(base_hf_config.num_hidden_layers)
+            ]
+        )
+
+        self.weights: Dict[str, torch.Tensor] = {}
+
+    # initialize the LoRA weights to cpu
+    def initialize_weights(self):
+        model_path = self.config.path
+        loader = DefaultModelLoader(self.load_config)
+        revision = getattr(self.config.hf_config, "revision", None)
+        for name, loaded_weight in loader._get_weights_iterator(
+            DefaultModelLoader.Source(
+                model_path, revision=revision, fall_back_to_pt=True
+            )
+        ):
+            match = re.search(r"layers\.(\d+)\.", name)
+            if match is not None:
+                layer_id = int(match.group(1))
+                self.layers[layer_id].weights[name] = loaded_weight.cpu()
+            else:
+                self.weights[name] = loaded_weight.cpu()
+
+        # normalize kv_proj and gate_up_proj
+        for layer in self.layers:
+            weight_names = list(layer.weights.keys())
+            self.normalize_qkv_proj(weight_names, layer.weights)
+            self.normalize_gate_up_proj(weight_names, layer.weights)
+
+    def normalize_qkv_proj(
+        self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+    ):
+        # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
+        target_module = set()
+        for weight_name in weight_names:
+            if "k_proj" in weight_name:
+                target_module.add("k_proj")
+            if "q_proj" in weight_name:
+                target_module.add("q_proj")
+            if "v_proj" in weight_name:
+                target_module.add("v_proj")
+            if "qkv_proj" in weight_name:
+                target_module.add("qkv_proj")
+        if len(target_module) == 0:
+            return
+
+        for weight_name in weight_names:
+            # We assume every lora adaptor should contain lora modules for q_proj
+            if "q_proj" in weight_name:
+                q_name = weight_name
+                k_name = weight_name.replace("q_proj", "k_proj")
+                v_name = weight_name.replace("q_proj", "v_proj")
+                qkv_name = weight_name.replace("q_proj", "qkv_proj")
+
+                # If k_proj doesn't have lora, initialize it to zero
+                k_proj_weight = (
+                    weights[k_name]
+                    if "k_proj" in target_module
+                    else torch.zeros_like(weights[v_name])
+                )
+                weights[qkv_name] = torch.cat(
+                    (
+                        weights[q_name],
+                        k_proj_weight,
+                        weights[v_name],
+                    ),
+                    0,
+                )
+                weights.pop(q_name)
+                if "k_proj" in target_module:
+                    weights.pop(k_name)
+                weights.pop(v_name)
+            elif "qkv_proj" in weight_name:
+                # If qkv_proj is already stacked, we normalize it following the SGL convention.
+                qkv_name = weight_name
+                q_name = weight_name.replace("qkv_proj", "q_proj")
+                k_name = weight_name.replace("qkv_proj", "k_proj")
+                v_name = weight_name.replace("qkv_proj", "v_proj")
+                if "lora_A" in weight_name:
+                    weights[qkv_name] = weights[qkv_name].repeat(3, 1)
+                # else: no-op as LoRA B weight is already stacked.
+
+    def normalize_gate_up_proj(
+        self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+    ):
+        for weight_name in weight_names:
+            if "gate_proj" in weight_name:
+                up_name = weight_name.replace("gate_proj", "up_proj")
+                gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
+                if up_name not in weights:
+                    weights[up_name] = torch.zeros_like(weights[weight_name])
+                    assert isinstance(self.lora_backend, TritonLoRABackend), (
+                        f"LoRA weight initialization currently only supported for 'triton' backend. "
+                        f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
+                        f"or consider implementing custom initialization logic for other backends."
+                    )
+                weights[gate_up_name] = torch.cat(
+                    (weights[weight_name], weights[up_name]), 0
+                )
+                weights.pop(weight_name)
+                if up_name in weights:
+                    weights.pop(up_name)
+            elif "gate_up_proj" in weight_name:
+                # If gate_up_proj is already stacked, we normalize it following the SGL convention
+                gate_up_name = weight_name
+                if "lora_A" in weight_name:
+                    weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
+                # else: no-op as LoRA B weight is already stacked.
diff --git a/python/sglang/srt/lora/lora_config.py b/python/sglang/srt/lora/lora_config.py
new file mode 100644
index 000000000..185b7b824
--- /dev/null
+++ b/python/sglang/srt/lora/lora_config.py
@@ -0,0 +1,47 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import json
+import os
+
+from huggingface_hub import snapshot_download
+
+
+class LoRAConfig:
+    def __init__(
+        self,
+        path: str,
+    ) -> None:
+        self.path = path
+        self.hf_config = self.get_lora_config()
+        self.target_modules = self.hf_config["target_modules"]
+
+        # TODO: Support more modules
+        if any(module in self.target_modules for module in ["embed_tokens", "lm_head"]):
+            raise ValueError("Not supported yet")
+
+        self.r = self.hf_config["r"]
+        self.lora_alpha = self.hf_config["lora_alpha"]
+
+    def get_lora_config(self, dummy=False):
+        if dummy:
+            raise NotImplementedError()
+        else:
+            if not os.path.isdir(self.path):
+                weights_dir = snapshot_download(self.path, allow_patterns=["*.json"])
+            else:
+                weights_dir = self.path
+            config_name = "adapter_config.json"
+            with open(os.path.join(weights_dir, config_name), "r") as f:
+                return json.load(f)
diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py
new file mode 100644
index 000000000..baf120ca2
--- /dev/null
+++ b/python/sglang/srt/lora/lora_manager.py
@@ -0,0 +1,440 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Integrates "S-LoRA: Serving Thousands of Concurrent LoRA Adapters"
+# and "Punica: Multi-Tenant LoRA Serving"
+
+import logging
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+
+import torch
+
+from sglang.srt.configs.load_config import LoadConfig
+from sglang.srt.hf_transformers_utils import AutoConfig
+from sglang.srt.lora.backend.base_backend import BaseLoRABackend, get_backend_from_name
+from sglang.srt.lora.layers import BaseLayerWithLoRA, get_lora_layer
+from sglang.srt.lora.lora import LoRAAdapter
+from sglang.srt.lora.lora_config import LoRAConfig
+from sglang.srt.lora.lora_registry import LoRARef
+from sglang.srt.lora.mem_pool import LoRAMemoryPool
+from sglang.srt.lora.utils import (
+    LoRABatchInfo,
+    LoRAType,
+    get_layer_id,
+    get_normalized_target_modules,
+    get_target_module_name,
+)
+from sglang.srt.managers.io_struct import LoRAUpdateResult
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import replace_submodule
+
+logger = logging.getLogger(__name__)
+
+
+class LoRAManager:
+    def __init__(
+        self,
+        base_model: torch.nn.Module,
+        base_hf_config: AutoConfig,
+        max_loras_per_batch: int,
+        load_config: LoadConfig,
+        dtype: torch.dtype,
+        lora_backend: str = "triton",
+        tp_size: int = 1,
+        tp_rank: int = 0,
+        max_lora_rank: Optional[int] = None,
+        target_modules: Optional[Iterable[str]] = None,
+        lora_paths: Optional[List[LoRARef]] = None,
+    ):
+        self.base_model: torch.nn.Module = base_model
+        self.base_hf_config: AutoConfig = base_hf_config
+        self.max_loras_per_batch: int = max_loras_per_batch
+        self.load_config: LoadConfig = load_config
+        self.dtype: torch.dtype = dtype
+        self.device: torch.device = next(self.base_model.parameters()).device
+        self.tp_size: int = tp_size
+        self.tp_rank: int = tp_rank
+
+        # LoRA backend for running sgemm kernels
+        logger.info(f"Using {lora_backend} as backend of LoRA kernels.")
+        backend_type = get_backend_from_name(lora_backend)
+        self.lora_backend: BaseLoRABackend = backend_type(
+            max_loras_per_batch=max_loras_per_batch,
+            device=self.device,
+        )
+
+        # Initialize mutable internal state of the LoRAManager.
+        self.init_state(
+            max_lora_rank=max_lora_rank,
+            target_modules=target_modules,
+            lora_paths=lora_paths,
+        )
+
+    def init_cuda_graph_batch_info(self, max_bs_in_cuda_graph: int):
+        self.max_bs_in_cuda_graph = max_bs_in_cuda_graph
+        with torch.device("cuda"):
+            self.cuda_graph_batch_info = LoRABatchInfo(
+                bs=max_bs_in_cuda_graph,
+                use_cuda_graph=True,
+                num_segments=None,
+                seg_lens=torch.zeros(max_bs_in_cuda_graph, dtype=torch.int32),
+                seg_indptr=torch.zeros(max_bs_in_cuda_graph + 1, dtype=torch.int32),
+                max_len=1,
+                weight_indices=torch.zeros(max_bs_in_cuda_graph, dtype=torch.int32),
+                permutation=torch.zeros(max_bs_in_cuda_graph, dtype=torch.int32),
+                lora_ranks=torch.zeros(self.max_loras_per_batch, dtype=torch.int32),
+                scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
+            )
+
+        self.lora_backend.init_cuda_graph_batch_info(
+            cuda_graph_batch_info=self.cuda_graph_batch_info,
+            max_bs_in_cuda_graph=max_bs_in_cuda_graph,
+        )
+
+    def create_lora_update_result(
+        self, success: bool, error_message: str = ""
+    ) -> LoRAUpdateResult:
+        return LoRAUpdateResult(
+            success=success,
+            error_message=error_message,
+            loaded_adapters={
+                lora_ref.lora_name: lora_ref.lora_path
+                for lora_ref in self.lora_refs.values()
+            },
+        )
+
+    def load_lora_adapter(self, lora_ref: LoRARef) -> LoRAUpdateResult:
+        """
+        Load a single LoRA adapter from the specified path.
+
+        Args:
+            lora_ref (LoRARef): The LoRARef object containing the LoRA name, path, and ID.
+        """
+        assert (
+            lora_ref.lora_name is not None and lora_ref.lora_path is not None
+        ), "LoRARef must have both lora_name and lora_path set for loading."
+        assert (
+            lora_ref.lora_id not in self.loras
+        ), f"LoRA adapter with ID {lora_ref.lora_id} is already loaded. This should have been verified before request is sent to the backend."
+
+        try:
+            # load configs
+            new_adapter = LoRAConfig(lora_ref.lora_path)
+            self.validate_new_adapter(new_adapter, lora_ref)
+            self.configs[lora_ref.lora_id] = new_adapter
+
+            # load weights
+            self.load_lora_weights(lora_ref)
+
+            # keep metadata for displayed messages
+            self.lora_refs[lora_ref.lora_id] = lora_ref
+            self.num_pinned_loras += int(lora_ref.pinned)
+        except Exception as e:
+            return self.create_lora_update_result(
+                success=False,
+                error_message=str(e),
+            )
+
+        return self.create_lora_update_result(success=True)
+
+    def validate_new_adapter(self, lora_config: LoRAConfig, lora_ref: LoRARef):
+        """
+        Validate if an adapter can be loaded into the current LoRA memory pool and generate error if it is incompatible.
+        """
+
+        # Check if the LoRA adapter shape is compatible with the current LoRA memory pool configuration.
+        memory_pool = getattr(self, "memory_pool", None)
+        incompatible = memory_pool and not memory_pool.can_support(lora_config)
+        if incompatible:
+            raise ValueError(
+                f"LoRA adapter {lora_ref.lora_name} with rank {lora_config.r} is incompatible with the current "
+                "LoRA memory pool configuration. Please ensure that the LoRA adapter's rank is within the configured "
+                "`--max-lora-rank` and that the target modules are included in `--lora-target-modules`."
+            )
+
+        # Ensure pinned LoRA adapters does not exceed maximal limit or cause starvation.
+        if lora_ref.pinned and self.num_pinned_loras >= self.max_loras_per_batch - 1:
+            raise ValueError(
+                f"Failed to load LoRA adapter {lora_ref.lora_name} as a pinned adapter. It is not allowed to pin all slots "
+                "in the LoRA memory pool to avoid starvation for unpinned adapters and base models. Please increase your "
+                "`--max-loras-per-batch` or load it as unpinned LoRA adapters."
+            )
+
+    def unload_lora_adapter(self, lora_ref: LoRARef) -> LoRAUpdateResult:
+        """
+        Unload LoRA adapters by their names. This will remove the adapters from the memory pool and
+        delete the corresponding LoRA modules.
+        """
+
+        adapter = self.configs.get(lora_ref.lora_id)
+        lora_ref = self.lora_refs.get(lora_ref.lora_id)
+        assert (
+            adapter is not None and lora_ref is not None
+        ), f"LoRA adapter with ID {lora_ref.lora_id} is not loaded. This should have been verified before request is sent to the backend."
+
+        try:
+            del self.configs[lora_ref.lora_id]
+            del self.loras[lora_ref.lora_id]
+            del self.lora_refs[lora_ref.lora_id]
+            self.num_pinned_loras -= int(lora_ref.pinned)
+        except Exception as e:
+            return self.create_lora_update_result(
+                success=False,
+                error_message=str(e),
+            )
+
+        return self.create_lora_update_result(success=True)
+
+    def validate_lora_batch(self, lora_ids: set[str]) -> bool:
+        """
+        Validate if the LoRA IDs in the batch can be loaded into the current LoRA memory pool.
+        """
+        if len(lora_ids) > self.max_loras_per_batch:
+            return False
+
+        # skip pinned LoRA check if no pinned LoRA adapters are loaded.
+        if self.num_pinned_loras == 0:
+            return True
+
+        # counting the number of pinned LoRA adapters in the batch.
+        pinned_loras_in_batch = 0
+        for lora_id in lora_ids:
+            if lora_id is not None:
+                lora_ref = self.lora_refs.get(lora_id)
+                assert (
+                    lora_ref is not None
+                ), f"LoRA ID {lora_id} not found in lora_refs."
+                pinned_loras_in_batch += int(lora_ref.pinned)
+
+        assert pinned_loras_in_batch <= self.num_pinned_loras, (
+            f"Number of pinned LoRA adapters in the batch ({pinned_loras_in_batch}) exceeds the total number of pinned adapters "
+            f"({self.num_pinned_loras}). This indicates a bug in the LoRA loading logic."
+        )
+
+        required_slots = len(lora_ids) - pinned_loras_in_batch
+        mem_pool_vacancy = self.memory_pool.max_loras_per_batch - self.num_pinned_loras
+
+        return required_slots <= mem_pool_vacancy
+
+    def prepare_lora_batch(self, forward_batch: ForwardBatch):
+        # Load active loras into lora memory pool
+        cur_uids = set(forward_batch.lora_ids)
+
+        assert len(cur_uids) <= self.max_loras_per_batch
+        self.memory_pool.prepare_lora_batch(
+            cur_uids=cur_uids,
+            lora_adapters=self.loras,
+            lora_modules=self.lora_modules,
+            lora_refs=self.lora_refs.copy(),  # copy snapshot of current lora_refs to avoid mutation during the batch preparation.
+        )
+
+        # set up batch info shared by all lora modules
+        bs = forward_batch.batch_size
+
+        use_cuda_graph = (
+            hasattr(self, "max_bs_in_cuda_graph")
+            and bs <= self.max_bs_in_cuda_graph
+            and forward_batch.forward_mode.is_cuda_graph()
+        )
+
+        weight_indices = [0] * len(forward_batch.lora_ids)
+        lora_ranks = [0] * self.max_loras_per_batch
+        scalings = [0] * self.max_loras_per_batch
+        for i, uid in enumerate(forward_batch.lora_ids):
+            weight_indices[i] = self.memory_pool.get_buffer_id(uid)
+            if uid is not None:
+                lora = self.loras[uid]
+                lora_ranks[weight_indices[i]] = lora.config.r
+                scalings[weight_indices[i]] = lora.scaling
+        # Do in-place updates when CUDA graph is enabled and the batch forward mode
+        # could use CUDA graph.
+        self.lora_backend.prepare_lora_batch(
+            forward_batch=forward_batch,
+            weight_indices=weight_indices,
+            lora_ranks=lora_ranks,
+            scalings=scalings,
+            batch_info=self.cuda_graph_batch_info if use_cuda_graph else None,
+        )
+
+    def update_lora_info(self):
+        """
+        Update all LoRA modules to associate them with the latest memory buffer.
+        """
+        for layer_id, layer_modules in enumerate(self.lora_modules):
+            for module_name, module in layer_modules.items():
+                target_module = get_target_module_name(
+                    module_name, self.memory_pool.target_modules
+                )
+                module.set_lora_info(
+                    self.memory_pool.get_tensor(
+                        target_module=target_module,
+                        layer_id=layer_id,
+                        lora_type=LoRAType.LORA_A,
+                    ),
+                    self.memory_pool.get_tensor(
+                        target_module=target_module,
+                        layer_id=layer_id,
+                        lora_type=LoRAType.LORA_B,
+                    ),
+                )
+
+    def init_state(
+        self,
+        max_lora_rank: Optional[int] = None,
+        target_modules: Optional[Iterable[str]] = None,
+        lora_paths: Optional[List[LoRARef]] = None,
+    ):
+        """
+        Initialize the internal (mutable) state of the LoRAManager.
+
+        When `lora_paths` is provided and not empty, it might be used for inferring LoRA shape info such as
+        the target modules and max_lora_rank.
+        """
+
+        assert lora_paths or (
+            max_lora_rank is not None and target_modules is not None
+        ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
+
+        self.init_lora_adapters(lora_paths)
+        self.init_lora_shapes(
+            max_lora_rank=max_lora_rank,
+            target_modules=target_modules,
+        )
+        self.init_lora_modules()
+        self.init_memory_pool()
+        self.update_lora_info()
+
+    def init_lora_adapters(self, lora_paths: Optional[List[LoRARef]] = None):
+        # Configs of all active LoRA adapters, indexed by LoRA ID.
+        self.configs: Dict[str, LoRAConfig] = {}
+
+        # LoRA adapter weights cached in CPU memory, indexed by LoRA ID.
+        self.loras: Dict[str, LoRAAdapter] = {}
+
+        # Mapping from LoRA ID to LoRARef object.
+        self.lora_refs: Dict[str, LoRARef] = {}
+
+        # Count of pinned LoRA adapters.
+        self.num_pinned_loras: int = 0
+
+        if lora_paths:
+            for lora_ref in lora_paths:
+                result = self.load_lora_adapter(lora_ref)
+                if not result.success:
+                    raise RuntimeError(
+                        f"Failed to load LoRA adapter {lora_ref.lora_name}: {result.error_message}"
+                    )
+
+    def init_lora_shapes(
+        self,
+        max_lora_rank: Optional[int] = None,
+        target_modules: Optional[Iterable[str]] = None,
+    ):
+        """Infer LoRA target modules and max_lora_rank from loaded adapters if not provided."""
+
+        self.target_modules = (
+            get_normalized_target_modules(target_modules) if target_modules else set()
+        )
+
+        for lora_id, config in self.configs.items():
+            if not isinstance(config.target_modules, list):
+                raise ValueError(
+                    f"SGLang currently only supports inferring LoRA target modules when a list of "
+                    "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
+                    "specify `--lora-target-modules` during server startup. You can specify `all` to "
+                    "enable all support modules types. "
+                )
+
+            adapter_target_modules = get_normalized_target_modules(
+                config.target_modules
+            )
+
+            if target_modules is not None:
+                # When `--lora-target-modules` is provided, validate adapter target modules is a subset of the specified target modules.
+                if not adapter_target_modules.issubset(self.target_modules):
+                    unsupported_modules = adapter_target_modules - self.target_modules
+                    lora_name = self.lora_refs[lora_id].lora_name
+                    raise ValueError(
+                        f"LoRA adapter '{lora_name}' contains target modules {sorted(unsupported_modules)} "
+                        f"that are not included in the specified --lora-target-modules {sorted(self.target_modules)}. "
+                        f"Please update --lora-target-modules to include all required modules: "
+                        f"{sorted(self.target_modules | adapter_target_modules)}, or use 'all' to enable all supported modules."
+                    )
+            else:
+                # Otherwise, infer target_modules from adapter configs.
+                self.target_modules.update(adapter_target_modules)
+
+        if max_lora_rank is not None:
+            self.max_lora_rank = max_lora_rank
+        else:
+            self.max_lora_rank = max(
+                [x.r for x in self.configs.values()],
+                default=0,
+            )
+
+    def load_lora_weights(self, lora_ref: LoRARef):
+        """
+        Load the weights of a LoRA adapter to CPU memory and conducts post-loading validation.
+        """
+        lora_adapter = LoRAAdapter(
+            lora_ref.lora_id,
+            self.configs[lora_ref.lora_id],
+            self.base_hf_config,
+            self.load_config,
+            self.lora_backend,
+        )
+        lora_adapter.initialize_weights()
+        self.loras[lora_ref.lora_id] = lora_adapter
+
+    def init_memory_pool(self):
+        """(Re)initialize the LoRA memory pool based on the current configurations."""
+        self.memory_pool = LoRAMemoryPool(
+            base_hf_config=self.base_hf_config,
+            max_loras_per_batch=self.max_loras_per_batch,
+            dtype=self.dtype,
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+            max_lora_rank=self.max_lora_rank,
+            target_modules=self.target_modules,
+            base_model=self.base_model,
+        )
+
+    def set_lora_module(self, module_name, module):
+        lora_module = get_lora_layer(module, self.lora_backend)
+        replace_submodule(self.base_model, module_name, lora_module)
+        return lora_module
+
+    def init_lora_modules(self):
+        # Look-up table that essentially maps (layer_index, module_name) to the corresponding LoRA module.
+        self.lora_modules: List[Dict[str, BaseLayerWithLoRA]] = [
+            {} for _ in range(self.base_hf_config.num_hidden_layers)
+        ]
+
+        for module_name, module in self.base_model.named_modules():
+            # TODO (lifuhuang): in the future, we should consider generalizing the
+            # should_apply_lora function to support mapping by full module name instead
+            # of just the last part (e.g., "qkv_proj") to support scenarios with multiple
+            # attention stacks (e.g., multimodal models).
+            # See: https://github.com/sgl-project/sglang/issues/6608
+            if getattr(
+                self.base_model, "should_apply_lora", None
+            ) and not self.base_model.should_apply_lora(module_name):
+                continue
+
+            # The module should be converted if it is included in target_names
+            if module_name.split(".")[-1] in self.target_modules:
+                layer_id = get_layer_id(module_name)
+                self.lora_modules[layer_id][module_name] = self.set_lora_module(
+                    module_name, module
+                )
diff --git a/python/sglang/srt/lora/lora_registry.py b/python/sglang/srt/lora/lora_registry.py
new file mode 100644
index 000000000..51d2b0e66
--- /dev/null
+++ b/python/sglang/srt/lora/lora_registry.py
@@ -0,0 +1,207 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+import asyncio
+from dataclasses import dataclass, field, fields
+from typing import Dict, List, Optional, Union
+from uuid import uuid4
+
+from sglang.srt.aio_rwlock import RWLock
+from sglang.srt.utils import ConcurrentCounter
+
+
+@dataclass(frozen=True)
+class LoRARef:
+    """
+    Reference record for a LoRA model.
+
+    This object guarantees a unique ``lora_id`` and may include ``lora_name``, ``lora_path``, and ``pinned``.
+    The ID eliminates conflicts from reused LoRA names or paths and can be used to generate deterministic cache
+    keys (e.g., radix cache).
+    """
+
+    lora_id: str = field(default_factory=lambda: uuid4().hex)
+    lora_name: Optional[str] = None
+    lora_path: Optional[str] = None
+    pinned: Optional[bool] = None
+
+    def __post_init__(self):
+        if self.lora_id is None:
+            raise ValueError("lora_id cannot be None")
+
+    def __str__(self) -> str:
+        parts = [
+            f"{f.name}={value}"
+            for f in fields(self)
+            if (value := getattr(self, f.name)) is not None
+        ]
+        return f"{self.__class__.__name__}({', '.join(parts)})"
+
+
+class LoRARegistry:
+    """
+    The central registry to keep track of available LoRA adapters and ongoing LoRA requests.
+
+    The `LoRARegistry` resides in the tokenizer manager process and acts as the single source of truth for all
+    available LoRA adapters. It supports concurrent inference and dynamic adapter updates through a two-phase
+    update / eventual consistency model between the tokenizer manager process and the scheduler processes.
+    """
+
+    def __init__(self, lora_paths: Optional[List[LoRARef]] = None):
+        assert lora_paths is None or all(
+            isinstance(lora, LoRARef) for lora in lora_paths
+        ), (
+            "server_args.lora_paths should have been normalized to LoRARef objects during server initialization. "
+            "Please file an issue if you see this error."
+        )
+
+        # A read-write lock to ensure adapters loading / unloading operations are exclusive.
+        # Please note that the counter increment/decrement operations are not synchronized through this
+        # lock, as they are designed to be non-blocking and can be performed concurrently.
+        self._registry_lock = RWLock()
+        # A dictionary to hold LoRARef objects, mapping from LoRA name to LoRARef.
+        self._registry: Dict[str, LoRARef] = {}
+        # Counters for ongoing requests, mapping from LoRA ID to ConcurrentCounter.
+        self._counters: Dict[str, ConcurrentCounter] = {}
+
+        # Initialize the registry with provided LoRA paths, if present.
+        if lora_paths:
+            for lora_ref in lora_paths:
+                self._register_adapter(lora_ref)
+
+    async def register(self, lora_ref: LoRARef):
+        """
+        Register a new LoRARef object in the registry.
+
+        Args:
+            lora_ref (LoRARef): The LoRARef object to register.
+        """
+        async with self._registry_lock.writer_lock:
+            self._register_adapter(lora_ref)
+
+    async def unregister(self, lora_name: str) -> str:
+        """
+        Unregister a LoRARef object from the registry and returns the removed LoRA ID.
+
+        Args:
+            lora_name (str): The name of the LoRA model to unregister.
+        """
+        async with self._registry_lock.writer_lock:
+            lora_ref = self._registry.get(lora_name, None)
+            if lora_ref is None:
+                raise ValueError(
+                    f"LoRA with name {lora_name} does not exist. Loaded LoRAs: {self._registry.keys()}"
+                )
+            del self._registry[lora_name]
+
+        return lora_ref.lora_id
+
+    async def acquire(self, lora_name: Union[str, List[str]]) -> Union[str, List[str]]:
+        """
+        Queries registry for LoRA IDs based on LoRA names and start tracking the usage of the corresponding LoRA adapters
+        by incrementing its counter.
+        """
+
+        def _lookup(name: str) -> str:
+            if name is None:
+                return None
+
+            lora_ref = self._registry.get(name, None)
+            if lora_ref is None:
+                raise ValueError(
+                    f"The following requested LoRA adapters are not loaded: {name}\n"
+                    f"Loaded adapters: {self._registry.keys()}."
+                )
+            return lora_ref.lora_id
+
+        async with self._registry_lock.reader_lock:
+            if isinstance(lora_name, str):
+                lora_id = _lookup(lora_name)
+                await self._counters[lora_id].increment(notify_all=False)
+                return lora_id
+            elif isinstance(lora_name, list):
+                lora_ids = [_lookup(name) for name in lora_name]
+
+                # Increment the counters only after all IDs are looked up.
+                await asyncio.gather(
+                    *[
+                        self._counters[id].increment(notify_all=False)
+                        for id in lora_ids
+                        if id is not None
+                    ]
+                )
+                return lora_ids
+            else:
+                raise TypeError(
+                    "lora_name must be either a string or a list of strings."
+                )
+
+    async def release(self, lora_id: Union[str, List[str]]):
+        """
+        Decrements the usage counter for a LoRA adapter, indicating that it is no longer in use.
+        """
+
+        async with self._registry_lock.reader_lock:
+            if isinstance(lora_id, str):
+                await self._counters[lora_id].decrement()
+            elif isinstance(lora_id, list):
+                await asyncio.gather(
+                    *[
+                        self._counters[id].decrement()
+                        for id in lora_id
+                        if id is not None
+                    ]
+                )
+            else:
+                raise TypeError("lora_id must be either a string or a list of strings.")
+
+    async def wait_for_unload(self, lora_id: str):
+        """
+        Waits until the usage counter for a LoRA adapter reaches zero, indicating that it is no longer in use.
+        This is useful for ensuring that a LoRA adapter can be safely unloaded.
+
+        This method itself is not synchronized, which is safe because it should only be called during LoRA unloading,
+        which itself is guaranteed to be sequential.
+        """
+        assert (
+            lora_id not in self._registry
+        ), "wait_for_unload should only be called after the LoRA adapter has been unregistered. "
+        assert (
+            lora_id in self._counters
+        ), "The LoRA ID should still have a counter if it has been registered before."
+
+        # Wait until no requests are using this LoRA adapter.
+        await self._counters[lora_id].wait_for_zero()
+        del self._counters[lora_id]
+
+    def _register_adapter(self, lora_ref: LoRARef):
+        """
+        Internal helper method to register a LoRA adapter.
+        """
+
+        if lora_ref.lora_name in self._registry:
+            raise ValueError(
+                f"LoRA with name {lora_ref.lora_name} already exists. Loaded LoRAs: {self._registry.keys()}"
+            )
+        self._registry[lora_ref.lora_name] = lora_ref
+        self._counters[lora_ref.lora_id] = ConcurrentCounter()
+        return lora_ref
+
+    @property
+    def num_registered_loras(self) -> int:
+        """
+        Returns the total number of LoRA adapters currently registered.
+        """
+        return len(self._registry)
diff --git a/python/sglang/srt/lora/mem_pool.py b/python/sglang/srt/lora/mem_pool.py
new file mode 100644
index 000000000..cdf1707e8
--- /dev/null
+++ b/python/sglang/srt/lora/mem_pool.py
@@ -0,0 +1,309 @@
+import logging
+from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+
+import torch
+
+from sglang.srt.distributed import divide
+from sglang.srt.hf_transformers_utils import AutoConfig
+from sglang.srt.lora.layers import BaseLayerWithLoRA
+from sglang.srt.lora.lora import LoRAAdapter
+from sglang.srt.lora.lora_config import LoRAConfig
+from sglang.srt.lora.lora_registry import LoRARef
+from sglang.srt.lora.utils import (
+    ROW_PARALLELISM_LINEAR_LORA_NAMES,
+    LoRAType,
+    get_hidden_dim,
+    get_normalized_target_modules,
+    get_stacked_multiply,
+    get_target_module_name,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class EmptySlot:
+    """
+    Singleton class to represent an empty slot in the memory pool.
+    This is used to improve readability by not using special str as a placeholder.
+    """
+
+    __slots__ = ()
+
+    def __repr__(self):
+        return "|EMPTY|"
+
+    def __new__(cls):
+        if not hasattr(cls, "_instance"):
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+
+EMPTY_SLOT = EmptySlot()
+
+
+class LoRAMemoryPool:
+    """Class for memory pool management of lora modules"""
+
+    def __init__(
+        self,
+        base_hf_config: AutoConfig,
+        max_loras_per_batch: int,
+        dtype: torch.dtype,
+        tp_size: int,
+        tp_rank: int,
+        max_lora_rank: int,
+        target_modules: Set[str],
+        base_model: torch.nn.Module,
+    ):
+        self.base_hf_config: AutoConfig = base_hf_config
+        self.num_layer: int = base_hf_config.num_hidden_layers
+        self.max_loras_per_batch: int = max_loras_per_batch
+        self.dtype: torch.dtype = dtype
+        self.tp_size: int = tp_size
+        self.tp_rank: int = tp_rank
+        self.max_lora_rank: int = max_lora_rank
+        self.target_modules: Set[str] = target_modules
+
+        # Both A_buffer and B_buffer maps lora weight names to its buffer space.
+        # A_buffer contains num_layer number of row-major tensors with shape
+        #   (max_loras_per_batch, stacked_num * max_lora_dim, input_dim)
+        # B_buffer contains num_layer number of column-major tensors with shape
+        #   (stacked_num, max_loras_per_batch, output_dim, max_lora_dim)
+        self.A_buffer: Dict[str, List[torch.Tensor]] = {}
+        self.B_buffer: Dict[str, List[torch.Tensor]] = {}
+
+        # Lora uid -> buffer idx in memory pool
+        self.uid_to_buffer_id: Dict[Optional[str], int] = {}
+
+        # Buffer idx -> lora uid in memory pool
+        # All uids are initialized as `EmptySlot` for empty buffer slots
+        # Here we don't initialize to None since None is a valid uid
+        self.buffer_id_to_uid: List[Union[str, None, EmptySlot]] = [
+            EMPTY_SLOT
+        ] * self.max_loras_per_batch
+
+        self.init_buffers(base_model)
+
+    def can_support(self, config: Union[LoRAConfig, Iterable[LoRAConfig]]) -> bool:
+        """
+        Check if the memory pool can support the given LoRA adapters.
+        """
+
+        def _can_support(config: LoRAConfig) -> bool:
+            """
+            Check if the memory pool can support a single LoRA adapter.
+            """
+            if config.r > self.max_lora_rank:
+                return False
+            target_module_names = get_normalized_target_modules(config.target_modules)
+            return target_module_names.issubset(self.target_modules)
+
+        if isinstance(config, LoRAConfig):
+            return _can_support(config)
+        else:
+            return all(_can_support(x) for x in config)
+
+    def get_lora_A_shape(
+        self,
+        module_name: str,
+        base_model: torch.nn.Module,
+        max_lora_dim: int,
+        layer_idx: int,
+    ) -> Tuple[int]:
+        """
+        Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
+        """
+        input_dim, _ = get_hidden_dim(
+            module_name, self.base_hf_config, base_model, layer_idx
+        )
+        c = get_stacked_multiply(module_name)
+        if self.tp_size > 1 and module_name in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+            input_dim = divide(input_dim, self.tp_size)
+        return (
+            self.max_loras_per_batch,
+            max_lora_dim * c,
+            input_dim,
+        )
+
+    def get_lora_B_shape(
+        self,
+        module_name: str,
+        base_model: torch.nn.Module,
+        max_lora_dim: int,
+        layer_idx: int,
+    ) -> Tuple[int]:
+        """
+        Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
+        """
+        _, output_dim = get_hidden_dim(
+            module_name, self.base_hf_config, base_model, layer_idx
+        )
+        if self.tp_size > 1 and module_name not in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+            output_dim = divide(output_dim, self.tp_size)
+        return (
+            self.max_loras_per_batch,
+            output_dim,
+            max_lora_dim,
+        )
+
+    def init_buffers(self, base_model: torch.nn.Module):
+        device = next(base_model.parameters()).device
+
+        def init_buffer(
+            buffer: Dict[str, List[torch.Tensor]],
+            target_modules: Set[str],
+            get_lora_shape_fn: Callable[[str, torch.nn.Module, int, int], Tuple[int]],
+        ):
+            for module_name in target_modules:
+                buffer[module_name] = [
+                    torch.empty(
+                        get_lora_shape_fn(
+                            module_name,
+                            base_model,
+                            self.max_lora_rank,
+                            idx,
+                        ),
+                        dtype=self.dtype,
+                        device=device,
+                    )
+                    for idx in range(self.num_layer)
+                ]
+
+        init_buffer(
+            self.A_buffer,
+            self.target_modules,
+            self.get_lora_A_shape,
+        )
+
+        init_buffer(
+            self.B_buffer,
+            self.target_modules,
+            self.get_lora_B_shape,
+        )
+
+    def prepare_lora_batch(
+        self,
+        cur_uids: Set[Optional[str]],
+        lora_adapters: Dict[str, LoRAAdapter],
+        lora_modules: List[Dict[str, BaseLayerWithLoRA]],
+        lora_refs: Dict[str, LoRARef],
+    ):
+        def get_available_buffer_slot():
+            for buffer_id in range(self.max_loras_per_batch):
+                # Prioritize empty slots
+                if self.buffer_id_to_uid[buffer_id] == EMPTY_SLOT:
+                    return buffer_id
+
+            for buffer_id in range(self.max_loras_per_batch):
+                uid = self.buffer_id_to_uid[buffer_id]
+
+                # Evict unneeded lora
+                if uid not in cur_uids:
+                    # Skip pinned LoRAs
+                    # TODO (lifuhuang): we might consider supporting pinning base model (uid == None) in the future.
+                    if uid is not None:
+                        lora_ref = lora_refs.get(uid)
+                        if lora_ref is not None and lora_ref.pinned:
+                            continue
+
+                    self.uid_to_buffer_id.pop(uid)
+                    logger.debug(f"Evicting LoRA {uid} from buffer slot {buffer_id}.")
+                    self.buffer_id_to_uid[buffer_id] = EMPTY_SLOT
+                    return buffer_id
+
+            raise ValueError(
+                "No available buffer slots found. Please ensure the number of active loras is less than max_loras_per_batch."
+            )
+
+        for uid in cur_uids:
+            if uid not in self.uid_to_buffer_id:
+                buffer_id = get_available_buffer_slot()
+                lora_adapter = lora_adapters.get(uid, None)
+                self.load_lora_weight_to_buffer(
+                    uid, buffer_id, lora_adapter, lora_modules
+                )
+                self.uid_to_buffer_id[uid] = buffer_id
+                self.buffer_id_to_uid[buffer_id] = uid
+
+    def load_lora_weight_to_buffer(
+        self,
+        uid: str,
+        buffer_id: int,
+        lora_adapter: LoRAAdapter,
+        lora_modules: List[Dict[str, BaseLayerWithLoRA]],
+    ):
+        def load_lora_weight_tensor(
+            buffer_view: torch.Tensor, weight: Optional[torch.Tensor]
+        ):
+            if weight is None:
+                # If the particular weight is not present in the adapter, we initialize the buffer to zero
+                # to avoid contamination from the residual weight of the evicted adapters.
+                buffer_view.zero_()
+            else:
+                assert (
+                    buffer_view.shape == weight.shape
+                ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."
+                buffer_view.copy_(weight)
+
+        if uid is None:
+            for i in range(self.num_layer):
+                for k in self.A_buffer.keys():
+                    self.A_buffer[k][i][buffer_id] = 0
+            return
+
+        assert lora_adapter is not None
+        lora_rank = lora_adapter.config.r
+        for layer_id in range(self.num_layer):
+            layer_weights = lora_adapter.layers[layer_id].weights
+            temp_A_buffer: Dict[str, Optional[torch.Tensor]] = {
+                target_module: None for target_module in self.A_buffer
+            }
+            temp_B_buffer: Dict[str, Optional[torch.Tensor]] = {
+                target_module: None for target_module in self.B_buffer
+            }
+            for name, weights in layer_weights.items():
+                target_module = get_target_module_name(name, self.target_modules)
+                if "lora_A" in name:
+                    temp_A_buffer[target_module] = weights
+                else:
+                    temp_B_buffer[target_module] = weights
+
+            if self.tp_size > 1:
+                cur_layer_modules = lora_modules[layer_id]
+                for module_name, module in cur_layer_modules.items():
+                    target_module = get_target_module_name(
+                        module_name, self.target_modules
+                    )
+
+                    if temp_A_buffer[target_module] is None:
+                        # Skip weight slicing if the weight is not present in the adapter
+                        continue
+
+                    temp_A_buffer[target_module] = module.slice_lora_a_weights(
+                        temp_A_buffer[target_module], self.tp_rank
+                    )
+                    temp_B_buffer[target_module] = module.slice_lora_b_weights(
+                        temp_B_buffer[target_module], self.tp_rank
+                    )
+
+            for name, weights in temp_A_buffer.items():
+                c = get_stacked_multiply(name)
+                target_buffer = self.A_buffer[name][layer_id]
+                buffer_view = target_buffer[buffer_id, : lora_rank * c, :]
+                load_lora_weight_tensor(buffer_view, weights)
+
+            for name, weights in temp_B_buffer.items():
+                target_buffer = self.B_buffer[name][layer_id]
+                buffer_view = target_buffer[buffer_id, :, :lora_rank]
+                load_lora_weight_tensor(buffer_view, weights)
+
+    def get_tensor(
+        self, target_module: str, layer_id: int, lora_type: LoRAType
+    ) -> torch.Tensor:
+        if lora_type == LoRAType.LORA_A:
+            return self.A_buffer[target_module][layer_id]
+
+        return self.B_buffer[target_module][layer_id]
+
+    def get_buffer_id(self, lora_uid: str):
+        return self.uid_to_buffer_id[lora_uid]
diff --git a/python/sglang/srt/lora/triton_ops/__init__.py b/python/sglang/srt/lora/triton_ops/__init__.py
new file mode 100644
index 000000000..da55e8fd5
--- /dev/null
+++ b/python/sglang/srt/lora/triton_ops/__init__.py
@@ -0,0 +1,11 @@
+from .gate_up_lora_b import gate_up_lora_b_fwd
+from .qkv_lora_b import qkv_lora_b_fwd
+from .sgemm_lora_a import sgemm_lora_a_fwd
+from .sgemm_lora_b import sgemm_lora_b_fwd
+
+__all__ = [
+    "gate_up_lora_b_fwd",
+    "qkv_lora_b_fwd",
+    "sgemm_lora_a_fwd",
+    "sgemm_lora_b_fwd",
+]
diff --git a/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py b/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py
new file mode 100644
index 000000000..fc4574dd3
--- /dev/null
+++ b/python/sglang/srt/lora/triton_ops/gate_up_lora_b.py
@@ -0,0 +1,187 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.lora.utils import LoRABatchInfo
+
+
+@triton.jit
+def _gate_up_lora_b_kernel(
+    # Pointers to matrices
+    x,
+    weights,
+    output,
+    # Parameters of size
+    K,  # K = R
+    output_dim,
+    # Strides
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    # Information on sequence lengths,ranks and weight id
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    # Meta parameters
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    # For fused output scaling
+    scalings,
+):
+    """
+    This kernel packs 2 sgemms (gate/up) into a single kernel. The multiplication
+    results are accumulated into the output tensor.
+
+    When a sequence's rank is 0, the kernel is essentially a no-op, following
+    the convention in pytorch where the product of two matrices of shape (m, 0)
+    and (0, n) is an all-zero matrix of shape (m, n).
+
+    Args:
+        x (Tensor): The input tensor, which is the result of the LoRA A projection.
+            Shape: (s, 2 * K), where s is the sum of all sequence lengths in the
+            batch and K is the maximum LoRA rank.
+        weights (Tensor): The LoRA B weights for all adapters.
+            Shape: (num_lora, 2 * output_dim, K).
+        output (Tensor): The output tensor where the result is stored.
+            Shape: (s, 2 * output_dim).
+    """
+    # output_dim >> K
+
+    # Current block computes sequence with batch_id,
+    # which starts from row seg_start of x with length seg_len.
+    # gate_up_id decides which of gate or up (0: gate, 1: up)
+    batch_id = tl.program_id(axis=2)
+    w_index = tl.load(weight_indices + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+
+    # If rank is 0, this kernel is a no-op.
+    if rank == 0:
+        return
+
+    gate_up_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    seg_start = tl.load(seg_indptr + batch_id)
+    n_start = gate_up_id * output_dim  # offset on output dim
+    scaling = tl.load(scalings + w_index)
+
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
+
+    # The tile in output matrix will have (pid_s, pid_n) as id
+    num_pid_n = tl.cdiv(output_dim, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    # Create pointers for the first block of x and weights
+    # The pointers will be advanced as we move in the K direction
+    # and accumulate
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    x_ptrs = (x + seg_start * x_stride_0 + (gate_up_id * K) * x_stride_1) + (
+        s_offset[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0 + n_start * w_stride_1) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Iterate to compute the block in output matrix
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K)
+            & (n_offset[None, :] < output_dim),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    # Store result to output matrix
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (output + seg_start * output_stride_0 + n_start * output_stride_1) + (
+        s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < output_dim)
+    partial_sum += tl.load(output_ptr, mask=output_mask)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def gate_up_lora_b_fwd(
+    x: torch.Tensor,
+    gate_up_lora_b: torch.Tensor,
+    batch_info: LoRABatchInfo,
+    output_dim: int,
+    base_output: torch.Tensor = None,
+) -> torch.Tensor:
+
+    # x: (s, 2 * r)
+    # gate_up_lora_b: (num_lora, 2 * output_dim, r)
+    # output: (s, 2 * output_dim)
+
+    # Compute lora_output with shape (s, output_dim) as follows:
+    # lora_output[:, :output_dim] = sgemm(x[:, :r], gate_up_lora_b[:, :output_dim, :])
+    # lora_output[:, output_dim:]
+    #      = sgemm(x[:, r:], gate_up_lora_b[:, output_dim:, :])
+
+    # Get dims
+    s = x.shape[0]
+    input_dim = x.shape[1]
+    r = gate_up_lora_b.shape[-1]
+    assert input_dim == 2 * r
+
+    BLOCK_S = 16
+    BLOCK_R = 16
+    BLOCK_OUT = 64
+
+    grid_b = (
+        triton.cdiv(batch_info.max_len, BLOCK_S) * triton.cdiv(output_dim, BLOCK_OUT),
+        2,  # this dimension decides current block computes on gate or up proj
+        batch_info.bs,
+    )
+
+    if base_output is None:
+        output = torch.zeros((s, 2 * output_dim), device=x.device, dtype=x.dtype)
+    else:
+        output = base_output
+
+    _gate_up_lora_b_kernel[grid_b](
+        x,
+        gate_up_lora_b,
+        output,
+        r,
+        output_dim,
+        x.stride(0),
+        x.stride(1),
+        gate_up_lora_b.stride(0),
+        gate_up_lora_b.stride(1),
+        gate_up_lora_b.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        BLOCK_S,
+        BLOCK_OUT,
+        BLOCK_R,
+        batch_info.scalings,
+    )
+
+    return output
diff --git a/python/sglang/srt/lora/triton_ops/qkv_lora_b.py b/python/sglang/srt/lora/triton_ops/qkv_lora_b.py
new file mode 100644
index 000000000..1d6663dbe
--- /dev/null
+++ b/python/sglang/srt/lora/triton_ops/qkv_lora_b.py
@@ -0,0 +1,198 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.lora.utils import LoRABatchInfo
+
+
+@triton.jit
+def _qkv_lora_b_kernel(
+    # Pointers to matrices
+    x,
+    weights,
+    output,
+    # Parameters of size
+    K,  # K = R
+    max_qkv_out_dim,  # max(output_q_dim, output_kv_dim)
+    # Strides
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    # Information on sequence lengths and weight id
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    # Offsets of q/k/v slice on output dimension
+    n_offs,
+    # Meta parameters
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    # For fused output scaling
+    scalings,
+):
+    """
+    This kernel packs 3 sgemms (q/k/v) into a single kernel. The multiplication
+    results are accumulated into the output tensor.
+
+    When a sequence's rank is 0, the kernel is essentially a no-op, following
+    the convention in pytorch where the product of two matrices of shape (m, 0)
+    and (0, n) is an all-zero matrix of shape (m, n).
+
+    Args:
+        x (Tensor): The input tensor, which is the result of the LoRA A projection.
+            Shape: (s, 3 * K), where s is the sum of all sequence lengths in the
+            batch and K is the maximum LoRA rank. The second dimension is partitioned
+            for Q, K, and V.
+        weights (Tensor): The LoRA B weights for all adapters.
+            Shape: (num_lora, N_Q + 2 * N_KV, K).
+        output (Tensor): The output tensor where the result is stored.
+            Shape: (s, N_Q + 2 * N_KV).
+    """
+
+    # Current block computes sequence with batch_id,
+    # which starts from row seg_start of x with length seg_len.
+    # qkv_id decides which of q,k,v to compute (0: q, 1: k, 2: v)
+    batch_id = tl.program_id(axis=2)
+    w_index = tl.load(weight_indices + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+
+    # If rank is 0, this kernel is a no-op.
+    if rank == 0:
+        return
+
+    qkv_id = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    seg_start = tl.load(seg_indptr + batch_id)
+    n_start = tl.load(n_offs + qkv_id)
+    n_size = tl.load(n_offs + qkv_id + 1) - n_start
+    scaling = tl.load(scalings + w_index)
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
+
+    # The tile in output matrix will have (pid_s, pid_n) as id
+    num_pid_n = tl.cdiv(max_qkv_out_dim, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    # Create pointers for the first block of x and weights[batch_id][n_start: n_end][:]
+    # The pointers will be advanced as we move in the K direction
+    # and accumulate
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+
+    x_ptrs = (x + seg_start * x_stride_0 + (qkv_id * K) * x_stride_1) + (
+        s_offset[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0 + n_start * w_stride_1) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Iterate to compute the block in output matrix
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < n_size),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    # Store result to output matrix
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (output + seg_start * output_stride_0 + n_start * output_stride_1) + (
+        s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < n_size)
+    partial_sum += tl.load(output_ptr, mask=output_mask)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def qkv_lora_b_fwd(
+    x: torch.Tensor,
+    qkv_lora_b: torch.Tensor,
+    batch_info: LoRABatchInfo,
+    output_offset: torch.Tensor,
+    max_qkv_out_dim: int,
+    base_output: torch.Tensor = None,
+) -> torch.Tensor:
+
+    # x: (s, 3 * r)
+    # qkv_lora_b: (num_lora, output_dim_q + 2 * output_dim_kv, r)
+    # output_offset = [0, output_dim_q, output_dim_q + output_dim_kv,
+    #                     output_dim_q + 2 * output_dim_kv]
+    # max_qkv_out_dim = max(output_dim_q, output_dim_kv)
+    # output: (s, output_dim_q + 2 * output_dim_kv)
+
+    # Compute lora_output with shape (s, output_dim) as follows:
+    # lora_output[:, :output_dim_q] = sgemm(x[:, :r], qkv_lora_b[:, :outptu_dim_q, :])
+    # lora_output[:, output_dim_q: output_dim_q + output_dim_kv]
+    #      = sgemm(x[:, r: 2 * r], qkv_lora_b[:, outptu_dim_q: output_dim_q + output_dim_kv, :])
+    # lora_output[:, output_dim_q + output_dim_kv: ]
+    #      = sgemm(x[:, 2 * r: , qkv_lora_b[:, output_dim_q + output_dim_kv: , :])
+
+    # Get dims
+    s = x.shape[0]
+    input_dim = x.shape[1]
+    r = qkv_lora_b.shape[-1]
+    output_dim = qkv_lora_b.shape[-2]
+    assert input_dim == 3 * r
+    assert output_offset.shape[0] == 4
+
+    BLOCK_S = 16
+    BLOCK_R = 16
+    BLOCK_OUT = 64
+
+    grid_b = (
+        triton.cdiv(batch_info.max_len, BLOCK_S)
+        * triton.cdiv(max_qkv_out_dim, BLOCK_OUT),
+        3,  # this dimension decides current block computes on q, k or v
+        batch_info.bs,
+    )
+
+    if base_output is None:
+        output = torch.zeros((s, output_dim), device=x.device, dtype=x.dtype)
+    else:
+        output = base_output
+
+    _qkv_lora_b_kernel[grid_b](
+        x,
+        qkv_lora_b,
+        output,
+        r,
+        max_qkv_out_dim,
+        x.stride(0),
+        x.stride(1),
+        qkv_lora_b.stride(0),
+        qkv_lora_b.stride(1),
+        qkv_lora_b.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        output_offset,
+        BLOCK_S,
+        BLOCK_OUT,
+        BLOCK_R,
+        batch_info.scalings,
+    )
+
+    return output
diff --git a/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py b/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py
new file mode 100644
index 000000000..dded64bcf
--- /dev/null
+++ b/python/sglang/srt/lora/triton_ops/sgemm_lora_a.py
@@ -0,0 +1,170 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.lora.utils import LoRABatchInfo
+
+
+@triton.jit
+def _sgemm_lora_a_kernel(
+    # Pointers to matrices
+    x,
+    weights,
+    output,
+    # Matrix dimensions
+    N,  # stack_num * r
+    K,  # input_dim
+    stack_num,
+    # Strides
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    # Information on sequence lengths,ranks and weight id
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    # Meta parameters
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    """
+    Computes a segmented batched matrix multiplication for the LoRA A matrix.
+
+    The kernel ensures that output[seg_start:seg_start + seg_len, :rank * stack_num]
+    stores the product of the input `x` and the LoRA weights for the corresponding
+    sequence. This implies that when rank is 0, the kernel is essentially a no-op,
+    as output[seg_start:seg_start + seg_len, :0] is trivially correct (empty).
+
+    Args:
+        x (torch.Tensor): The input activations tensor of shape `(s, K)`, where `s`
+            is the sum of all sequence lengths in the batch.
+        weights (torch.Tensor): The LoRA 'A' weights for all available adapters,
+            with shape `(num_lora, N, K)`.
+        output (torch.Tensor): The output tensor of shape `(s, N)`.
+    """
+
+    # Current block computes sequence with batch_id,
+    # which starts from row seg_start of x with length seg_len
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+
+    # If rank is 0, this kernel becomes a no-op as the output is always trivially correct.
+    if rank == 0:
+        return
+
+    pid = tl.program_id(axis=0)
+    seg_start = tl.load(seg_indptr + batch_id)
+    seg_len = tl.load(seg_lens + batch_id)
+
+    # Adjust N (stack_num * max_rank) according to the specific LoRA adapter
+    N = tl.minimum(N, rank * stack_num)
+
+    # The tile in output matrix will have (pid_s, pid_n) as id
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    # Create pointers for the first block of x and weights[batch_id]
+    # The pointers will be advanced as we move in the K direction
+    # and accumulate
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+    x_ptrs = (x + seg_start * x_stride_0) + (
+        s_offset[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Iterate to compute the block in output matrix
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < N),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    # Store result to output matrix
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (output + seg_start * output_stride_0) + (
+        s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < N)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def sgemm_lora_a_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info: LoRABatchInfo,
+    stack_num: int = 1,
+) -> torch.Tensor:
+    # x: (s, input_dim)
+    # weights: (num_lora, stack_num * r, input_dim)
+    # output: (s, stack_num * r)
+    # stack_num: run_qkv_lora: 3, run_gate_up_lora: 2
+    # when called by run_qkv_lora, the weights.shape[-2] will be 3 * r
+    # input_dim is much larger than r
+
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert len(x.shape) == 2
+    assert len(weights.shape) == 3
+
+    S = x.shape[0]
+    R = weights.shape[-2]
+    K = weights.shape[-1]
+    assert x.shape[-1] == K
+
+    # Block shapes
+    BLOCK_S = 16
+    BLOCK_K = 256
+    BLOCK_R = 16
+
+    grid = (
+        triton.cdiv(batch_info.max_len, BLOCK_S) * triton.cdiv(R, BLOCK_R),
+        batch_info.bs,
+    )
+
+    output = torch.empty((S, R), device=x.device, dtype=x.dtype)
+    _sgemm_lora_a_kernel[grid](
+        x,
+        weights,
+        output,
+        R,
+        K,
+        stack_num,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        BLOCK_S,
+        BLOCK_R,
+        BLOCK_K,
+    )
+    return output
diff --git a/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py b/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py
new file mode 100644
index 000000000..357d32805
--- /dev/null
+++ b/python/sglang/srt/lora/triton_ops/sgemm_lora_b.py
@@ -0,0 +1,176 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.lora.utils import LoRABatchInfo
+
+
+@triton.jit
+def _sgemm_lora_b_kernel(
+    # Pointers to matrices
+    x,
+    weights,
+    output,
+    # Matrix dimensions
+    N,  # output_dim
+    K,  # r
+    # Strides
+    x_stride_0,
+    x_stride_1,
+    w_stride_0,
+    w_stride_1,
+    w_stride_2,
+    output_stride_0,
+    output_stride_1,
+    # Information on sequence lengths and weight id
+    seg_lens,
+    seg_indptr,
+    weight_indices,
+    lora_ranks,
+    # Meta parameters
+    BLOCK_S: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    # For fused output scaling
+    scalings,
+):
+    """
+    Computes a segmented batched matrix multiplication for the LoRA B matrix
+    and adds the result to the output in-place.
+
+    When a sequence's rank is 0, the kernel is essentially a no-op, following
+    the convention in pytorch where the product of two matrices of shape (m, 0)
+    and (0, n) is an all-zero matrix of shape (m, n).
+
+    Args:
+        x (torch.Tensor): The intermediate tensor from the LoRA 'A' multiplication,
+            of shape `(s, K)`, where `s` is the total number of tokens.
+        weights (torch.Tensor): The LoRA 'B' weights for all available adapters,
+            with shape `(num_lora, N, K)`.
+        output (torch.Tensor): The output tensor of shape `(s, N)`. This can be
+            the base model's output for a fused add operation.
+    """
+
+    # Current block computes sequence with batch_id,
+    # which starts from row seg_start of x with length seg_len
+    batch_id = tl.program_id(axis=1)
+    w_index = tl.load(weight_indices + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+
+    # If rank is 0, this kernel is a no-op.
+    if rank == 0:
+        return
+
+    pid = tl.program_id(axis=0)
+    seg_len = tl.load(seg_lens + batch_id)
+    seg_start = tl.load(seg_indptr + batch_id)
+    scaling = tl.load(scalings + w_index)
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
+
+    # The tile in output matrix will have (pid_s, pid_n) as id
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    pid_s = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    # Create pointers for the first block of x and weights[batch_id]
+    # The pointers will be advanced as we move in the K direction
+    # and accumulate
+    s_offset = tl.arange(0, BLOCK_S) + pid_s * BLOCK_S
+    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
+    k_offset = tl.arange(0, BLOCK_K)
+    x_ptrs = (x + seg_start * x_stride_0) + (
+        s_offset[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
+    )
+    w_ptrs = (weights + w_index * w_stride_0) + (
+        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
+    )
+
+    # Iterate to compute the block in output matrix
+    partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        x_tile = tl.load(
+            x_ptrs,
+            mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        w_tile = tl.load(
+            w_ptrs,
+            mask=(k_offset[:, None] < K - k * BLOCK_K),
+            other=0.0,
+        )
+        partial_sum += tl.dot(x_tile, w_tile)
+
+        x_ptrs += BLOCK_K * x_stride_1
+        w_ptrs += BLOCK_K * w_stride_2
+
+    # Store result to output matrix
+    partial_sum *= scaling
+    partial_sum = partial_sum.to(x.dtype.element_ty)
+    output_ptr = (output + seg_start * output_stride_0) + (
+        s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
+    )
+    output_mask = s_offset[:, None] < seg_len
+    partial_sum += tl.load(output_ptr, mask=output_mask)
+    tl.store(output_ptr, partial_sum, mask=output_mask)
+
+
+def sgemm_lora_b_fwd(
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info: LoRABatchInfo,
+    base_output: torch.Tensor = None,
+) -> torch.Tensor:
+    # x: (s, max_r)
+    # weights: (num_lora, output_dim, max_r)
+    # output: (s, output_dim)
+    # output_dim is much larger than max_r
+
+    assert x.is_contiguous()
+    assert weights.is_contiguous()
+    assert len(x.shape) == 2
+    assert len(weights.shape) == 3
+
+    S = x.shape[0]
+    N = weights.shape[-2]
+    R = weights.shape[-1]
+    assert x.shape[-1] == R
+
+    # Block shapes
+    BLOCK_S = 16
+    BLOCK_R = 16
+    BLOCK_N = 256
+
+    grid = (
+        triton.cdiv(batch_info.max_len, BLOCK_S) * triton.cdiv(N, BLOCK_N),
+        batch_info.bs,
+    )
+
+    if base_output is None:
+        output = torch.zeros((S, N), device=x.device, dtype=x.dtype)
+    else:
+        output = base_output
+
+    _sgemm_lora_b_kernel[grid](
+        x,
+        weights,
+        output,
+        N,
+        R,
+        x.stride(0),
+        x.stride(1),
+        weights.stride(0),
+        weights.stride(1),
+        weights.stride(2),
+        output.stride(0),
+        output.stride(1),
+        batch_info.seg_lens,
+        batch_info.seg_indptr,
+        batch_info.weight_indices,
+        batch_info.lora_ranks,
+        BLOCK_S,
+        BLOCK_N,
+        BLOCK_R,
+        batch_info.scalings,
+    )
+    return output
diff --git a/python/sglang/srt/lora/utils.py b/python/sglang/srt/lora/utils.py
new file mode 100644
index 000000000..459c943b7
--- /dev/null
+++ b/python/sglang/srt/lora/utils.py
@@ -0,0 +1,143 @@
+import re
+from dataclasses import dataclass
+from enum import Enum
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+
+from sglang.srt.hf_transformers_utils import AutoConfig
+
+
+@dataclass
+class LoRABatchInfo:
+    # The forward mode is using CUDA Graph.
+    use_cuda_graph: bool
+
+    # Batch size
+    bs: int
+
+    # Number of segments. For triton backend, it is equal to batch size.
+    num_segments: int
+
+    # Indice pointers of each segment in shape (num_segments + 1, )
+    seg_indptr: torch.Tensor
+
+    # The index of lora adapter used by each segment, in shape (num_segments,)
+    weight_indices: torch.Tensor
+
+    # ranks of each lora adapter, in shape (lora_num,)
+    lora_ranks: torch.Tensor
+
+    # scaling of each lora adapter, in shape (lora_num,)
+    scalings: torch.Tensor
+
+    # Lengths of each segments in shape (num_segments,)
+    seg_lens: Optional[torch.Tensor]
+
+    # Maximum segment length of current batch
+    max_len: Optional[int]
+
+    # The logical (re)ordering of input rows (tokens), in shape (num_tokens,)
+    permutation: Optional[torch.Tensor]
+
+
+class LoRAType(Enum):
+    LORA_A = 0
+    LORA_B = 1
+
+
+def get_layer_id(name: str) -> int:
+    """
+    Extract integer id of layer from its name in string.
+    """
+    match = re.search(r"layers\.(\d+)\.", name)
+    if match is None:
+        return None
+    return int(match.group(1))
+
+
+def get_hidden_dim(
+    module_name: str, config: AutoConfig, base_model: torch.nn.Module, layer_idx: int
+) -> Tuple[int]:
+    """
+    Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
+    """
+
+    if hasattr(base_model, "get_hidden_dim"):
+        return base_model.get_hidden_dim(module_name, layer_idx)
+    else:
+        """
+        WARNING: get_hidden_dim() is not defined,
+        which is used to get the hidden dim for different lora modules
+        Use the default one, but please check if it is correct for your model.
+        Please implement the function in the model class if it is not.
+        You can reference this function in llama.py.
+        """
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        if module_name == "qkv_proj":
+            return config.hidden_size, head_dim * (
+                config.num_attention_heads + config.num_key_value_heads * 2
+            )
+        elif module_name == "o_proj":
+            return (
+                head_dim * config.num_attention_heads,
+                config.hidden_size,
+            )
+        elif module_name == "gate_up_proj":
+            return config.hidden_size, config.intermediate_size * 2
+        elif module_name == "down_proj":
+            return config.intermediate_size, config.hidden_size
+        else:
+            raise NotImplementedError()
+
+
+def get_normalized_target_modules(
+    target_modules: Iterable[str],
+) -> set[str]:
+    """
+    Mapping a list of target module name to names of the normalized LoRA weights.
+    """
+    params_mapping = {
+        "q_proj": "qkv_proj",
+        "k_proj": "qkv_proj",
+        "v_proj": "qkv_proj",
+        "gate_proj": "gate_up_proj",
+        "up_proj": "gate_up_proj",
+    }
+
+    result = set()
+    for name in target_modules:
+        normalized_name = params_mapping.get(name, name)
+        result.add(normalized_name)
+    return result
+
+
+def get_stacked_multiply(module_name: str) -> int:
+    """
+    Mapping a lora module name to its magnification at output dimension
+    """
+    stacked_rank = {
+        "qkv_proj": 3,
+        "gate_up_proj": 2,
+    }
+    return stacked_rank[module_name] if module_name in stacked_rank else 1
+
+
+def get_target_module_name(full_module_name: str, target_modules: Set[str]) -> str:
+    """
+    Get the target module name in target_modules that can match full_module_name.
+
+    If there is a target module name in target_modules that can match full_module_name, return this name
+    Else raise ValueError.
+    """
+    for target_module in target_modules:
+        if target_module in full_module_name:
+            return target_module
+    raise ValueError(
+        f"Cannot find target module name for {full_module_name} in {target_modules}"
+    )
+
+
+ROW_PARALLELISM_LINEAR_LORA_NAMES = ["o_proj", "down_proj"]
diff --git a/python/sglang/srt/managers/cache_controller.py b/python/sglang/srt/managers/cache_controller.py
new file mode 100644
index 000000000..f9d45b2f7
--- /dev/null
+++ b/python/sglang/srt/managers/cache_controller.py
@@ -0,0 +1,886 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import math
+import threading
+import time
+from queue import Empty, Full, PriorityQueue, Queue
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Set, Tuple
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+    from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_rank,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool
+
+logger = logging.getLogger(__name__)
+
+
+class LayerLoadingEvent:
+    def __init__(self, num_layers: int):
+        self._num_layers = num_layers
+        self.load_events = [torch.cuda.Event() for _ in range(num_layers)]
+        self.start_event = torch.cuda.Event()  # start event on controller stream
+
+    def complete(self, layer_index: int):
+        assert 0 <= layer_index < self._num_layers
+        self.load_events[layer_index].record()
+
+    def wait(self, layer_index: int):
+        torch.cuda.current_stream().wait_event(self.load_events[layer_index])
+
+    @property
+    def finish_event(self):
+        return self.load_events[-1]
+
+
+class LayerDoneCounter:
+    def __init__(self, num_layers: int):
+        self.num_layers = num_layers
+        # extra producer and consumer counters for overlap mode
+        self.num_counters = 3
+        self.events = [LayerLoadingEvent(num_layers) for _ in range(self.num_counters)]
+        self.producer_index = -1
+        self.consumer_index = -1
+
+    def update_producer(self):
+        self.producer_index = (self.producer_index + 1) % self.num_counters
+        assert self.events[
+            self.producer_index
+        ].finish_event.query(), (
+            "Producer finish event should be ready before being reused."
+        )
+        return self.producer_index
+
+    def set_consumer(self, index: int):
+        self.consumer_index = index
+
+    def wait_until(self, threshold: int):
+        if self.consumer_index < 0:
+            return
+        self.events[self.consumer_index].wait(threshold)
+
+    def reset(self):
+        self.producer_index = -1
+        self.consumer_index = -1
+
+
+class CacheOperation:
+
+    counter = 0
+
+    def __init__(
+        self,
+        host_indices: torch.Tensor,
+        device_indices: torch.Tensor,
+        node_id: int,
+        priority: Optional[int] = None,
+    ):
+        self.host_indices = host_indices
+        self.device_indices = device_indices
+        self.node_ids = [node_id]
+        self.data = None
+
+        self.id = CacheOperation.counter
+        CacheOperation.counter += 1
+        # default priority is the order of creation
+        self.priority = priority if priority is not None else self.id
+
+    @staticmethod
+    def merge_ops(ops: List[CacheOperation]) -> CacheOperation:
+        assert len(ops) > 0
+        if len(ops) == 1:
+            return ops[0]
+
+        host_indices = torch.cat([op.host_indices for op in ops])
+        device_indices = torch.cat([op.device_indices for op in ops])
+        node_ids = []
+        priority = min(op.priority for op in ops)
+        for op in ops:
+            node_ids.extend(op.node_ids)
+        merged_op = CacheOperation(host_indices, device_indices, -1, priority)
+        merged_op.node_ids = node_ids
+        return merged_op
+
+    def __lt__(self, other: CacheOperation):
+        return self.priority < other.priority
+
+
+class HiCacheAck(NamedTuple):
+    start_event: torch.cuda.Event
+    finish_event: torch.cuda.Event
+    node_ids: List[int]
+
+
+class TransferBuffer:
+    """
+    Overlapping buffer preparation and transfer operations to improve throughput.
+    """
+
+    def __init__(
+        self, stop_event, buffer_count: int = 3, max_buffer_size: int = 1024
+    ) -> None:
+        self.stop_event = stop_event
+        self.buffers = Queue(maxsize=buffer_count)
+        # todo: adjust the buffer size based on throughput profile of the system
+        self.max_buffer_size = max_buffer_size
+
+    def full(self) -> bool:
+        return self.buffers.full()
+
+    def empty(self) -> bool:
+        return self.buffers.empty()
+
+    def put(self, item, block=True, timeout=1) -> None:
+        while not self.stop_event.is_set():
+            try:
+                self.buffers.put(item, block=block, timeout=timeout)
+                break
+            except Full:
+                if not block:
+                    break
+                continue
+            except Exception as e:
+                logger.error(e)
+
+    def get(self, block=True, timeout=1) -> Optional[CacheOperation]:
+        try:
+            return self.buffers.get(block=block, timeout=timeout)
+        except Empty:
+            return None
+        except Exception as e:
+            logger.error(e)
+
+    def clear(self):
+        self.buffers.queue.clear()
+
+
+class StorageOperation:
+    counter = 0
+
+    def __init__(
+        self,
+        host_indices: torch.Tensor,
+        token_ids: List[int],
+        last_hash: Optional[str] = None,
+        hash_value: Optional[List[str]] = None,
+    ):
+        self.host_indices = host_indices
+        self.token_ids = token_ids
+        self.last_hash = last_hash
+        self.completed_tokens = 0
+        self.hash_value = hash_value if hash_value is not None else []
+
+        self.id = StorageOperation.counter
+        StorageOperation.counter += 1
+
+    def __lt__(self, other: "StorageOperation"):
+        return self.id < other.id
+
+
+class PrefetchOperation(StorageOperation):
+    def __init__(
+        self,
+        request_id: str,
+        host_indices: torch.Tensor,
+        token_ids: List[int],
+        last_hash: Optional[str] = None,
+    ):
+        self.request_id = request_id
+
+        self._lock = threading.Lock()
+        self._terminated_flag = False
+        self.start_time = time.monotonic()
+
+        super().__init__(host_indices, token_ids, last_hash)
+
+    def increment(self, num_tokens: int):
+        with self._lock:
+            if self._terminated_flag:
+                return False
+            self.completed_tokens += num_tokens
+            return True
+
+    def mark_terminate(self):
+        with self._lock:
+            self._terminated_flag = True
+
+    def is_terminated(self) -> bool:
+        return self._terminated_flag
+
+
+class HiCacheController:
+
+    def __init__(
+        self,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        mem_pool_host: HostKVCache,
+        page_size: int,
+        tp_group: torch.distributed.ProcessGroup,
+        load_cache_event: threading.Event,
+        write_policy: str = "write_through_selective",
+        io_backend: str = "",
+        storage_backend: Optional[str] = None,
+        prefetch_threshold: int = 256,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[str] = None,
+    ):
+        self.mem_pool_device_allocator = token_to_kv_pool_allocator
+        self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
+        self.mem_pool_host = mem_pool_host
+        self.write_policy = write_policy
+        self.page_size = page_size
+        self.io_backend = io_backend
+        self.enable_storage = False
+
+        if storage_backend is not None:
+            self.storage_backend_type = storage_backend
+            from sglang.srt.mem_cache.hicache_storage import get_hash_str
+
+            self.get_hash_str = get_hash_str
+            self.storage_config = self._generate_storage_config(
+                model_name, storage_backend_extra_config
+            )
+            # for MLA models, only one rank needs to backup the KV cache
+            self.backup_skip = (
+                self.storage_config.is_mla_model
+                # todo: load balancing
+                and self.storage_config.tp_rank != 0
+            )
+
+            if storage_backend == "file":
+                from sglang.srt.mem_cache.hicache_storage import HiCacheFile
+
+                self.storage_backend = HiCacheFile(self.storage_config)
+            elif storage_backend == "nixl":
+                from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
+
+                self.storage_backend = HiCacheNixl()
+            elif storage_backend == "mooncake":
+                from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
+                    MooncakeStore,
+                )
+
+                self.storage_backend = MooncakeStore(self.storage_config)
+                self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
+                assert self.mem_pool_host.layout == "page_first"
+            elif storage_backend == "hf3fs":
+                from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import (
+                    HiCacheHF3FS,
+                )
+
+                if self.mem_pool_host.layout == "page_first":
+                    bytes_per_page = (
+                        mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
+                    )
+                elif self.mem_pool_host.layout == "layer_first":
+                    bytes_per_page = (
+                        mem_pool_host.get_size_per_token() * mem_pool_host.page_size
+                    )
+                dtype = mem_pool_host.dtype
+                self.storage_backend = HiCacheHF3FS.from_env_config(
+                    bytes_per_page, dtype, self.storage_config
+                )
+            else:
+                raise NotImplementedError(
+                    f"Unsupported storage backend: {storage_backend}"
+                )
+
+            self.enable_storage = True
+            # todo: threshold policy for prefetching
+            self.prefetch_threshold = max(prefetch_threshold, self.page_size)
+            self.prefetch_capacity_limit = int(
+                0.8 * (self.mem_pool_host.size - self.mem_pool_device.size)
+            )
+            # granularity of batch storage IO operations, in number of pages
+            self.storage_batch_size = 128
+            # tracking the number of tokens locked in prefetching, updated by the main scheduler thread
+            self.prefetch_tokens_occupied = 0
+
+            # create a new communication group for synchronizing storage operations across TP workers
+            self.tp_world_size = torch.distributed.get_world_size(group=tp_group)
+            if self.tp_world_size > 1:
+                group_ranks = torch.distributed.get_process_group_ranks(tp_group)
+                self.prefetch_tp_group = torch.distributed.new_group(
+                    group_ranks, backend="gloo"
+                )
+
+            # Select the get and set functions
+            self.page_get_func = self._generic_page_get
+            self.page_set_func = self._generic_page_set
+            self.batch_exists_func = self.storage_backend.batch_exists
+            self.is_3fs_zerocopy = (
+                self.storage_backend_type == "hf3fs"
+                and self.mem_pool_host.layout == "page_first"
+            )
+            if self.storage_backend_type == "mooncake":
+                self.page_get_func = self._mooncake_page_get
+                self.page_set_func = self._mooncake_page_set
+            elif self.is_3fs_zerocopy:
+                self.page_get_func = self._3fs_zero_copy_page_get
+                self.page_set_func = self._3fs_zero_copy_page_set
+                self.batch_exists_func = self._3fs_zero_copy_batch_exists
+
+        self.device = self.mem_pool_device.device
+        self.layer_num = self.mem_pool_device.layer_num
+        self.layer_done_counter = LayerDoneCounter(self.layer_num)
+        self.mem_pool_device.register_layer_transfer_counter(self.layer_done_counter)
+
+        if write_policy not in [
+            "write_through",
+            "write_through_selective",
+            "write_back",
+        ]:
+            raise ValueError(f"Invalid write policy: {write_policy}")
+
+        # self.write_queue = PriorityQueue[CacheOperation]()
+        self.load_queue: List[CacheOperation] = []
+        self.write_queue: List[CacheOperation] = []
+        self.ack_load_queue: List[HiCacheAck] = []
+        self.ack_write_queue: List[HiCacheAck] = []
+
+        self.stop_event = threading.Event()
+        self.write_buffer = TransferBuffer(self.stop_event)
+        self.load_buffer = TransferBuffer(
+            self.stop_event, buffer_count=10, max_buffer_size=100
+        )
+
+        self.write_stream = torch.cuda.Stream()
+        self.load_stream = torch.cuda.Stream()
+
+        if self.enable_storage:
+            self.prefetch_thread = threading.Thread(
+                target=self.prefetch_thread_func, daemon=True
+            )
+            self.backup_thread = threading.Thread(
+                target=self.backup_thread_func, daemon=True
+            )
+            self.prefetch_queue = Queue()
+            self.backup_queue = Queue()
+
+            self.prefetch_revoke_queue = Queue()
+            self.ack_backup_queue = Queue()
+            self.host_mem_release_queue = Queue()
+
+            self.prefetch_thread.start()
+            self.backup_thread.start()
+
+    def _generate_storage_config(
+        self,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[str] = None,
+    ):
+
+        if is_dp_attention_enabled():
+            self.tp_rank = get_attention_tp_rank()
+            self.tp_size = get_attention_tp_size()
+            self.dp_rank = get_attention_dp_rank()
+        else:
+            self.tp_rank = get_tensor_model_parallel_rank()
+            self.tp_size = get_tensor_model_parallel_world_size()
+            self.dp_rank = 0
+
+        # Currently, AscendMLAPagedTokenToKVPool is the subclass of MLATokenToKVPool.
+        is_mla_backend = isinstance(self.mem_pool_device, MLATokenToKVPool)
+
+        # Parse extra config JSON if provided
+        extra_config = None
+        if storage_backend_extra_config:
+            try:
+                import json
+
+                extra_config = json.loads(storage_backend_extra_config)
+            except Exception as e:
+                logger.error(f"Invalid backend extra config JSON: {e}")
+
+        return HiCacheStorageConfig(
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            is_mla_model=is_mla_backend,
+            is_page_first_layout=self.mem_pool_host.layout == "page_first",
+            model_name=model_name,
+            extra_config=extra_config,
+        )
+
+    def reset(self):
+        self.stop_event.set()
+
+        self.write_queue.clear()
+        self.load_queue.clear()
+        self.write_buffer.clear()
+        self.load_buffer.clear()
+        self.ack_write_queue.clear()
+        self.ack_load_queue.clear()
+        if self.enable_storage:
+            self.prefetch_thread.join()
+            self.backup_thread.join()
+            self.prefetch_queue.queue.clear()
+            self.backup_queue.queue.clear()
+            self.prefetch_revoke_queue.queue.clear()
+            self.ack_backup_queue.queue.clear()
+
+        self.stop_event.clear()
+
+        if self.enable_storage:
+            self.prefetch_thread = threading.Thread(
+                target=self.prefetch_thread_func, daemon=True
+            )
+            self.backup_thread = threading.Thread(
+                target=self.backup_thread_func, daemon=True
+            )
+            self.prefetch_thread.start()
+            self.backup_thread.start()
+
+    def write(
+        self,
+        device_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = -1,
+    ) -> Optional[torch.Tensor]:
+        """
+        Back up KV caches from device memory to host memory.
+        """
+        host_indices = self.mem_pool_host.alloc(len(device_indices))
+        if host_indices is None:
+            return None
+        self.mem_pool_host.protect_write(host_indices)
+        self.write_queue.append(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        self.start_writing()
+        return host_indices
+
+    def start_writing(self) -> None:
+        if len(self.write_queue) == 0:
+            return
+
+        op = CacheOperation.merge_ops(self.write_queue)
+        host_indices, device_indices = self.move_indices(op)
+        self.write_queue.clear()
+
+        start_event = torch.cuda.Event()
+        finish_event = torch.cuda.Event()
+
+        start_event.record()
+        with torch.cuda.stream(self.write_stream):
+            start_event.wait(self.write_stream)
+            self.mem_pool_host.backup_from_device_all_layer(
+                self.mem_pool_device, host_indices, device_indices, self.io_backend
+            )
+            self.mem_pool_host.complete_io(op.host_indices)
+            finish_event.record()
+            # NOTE: We must save the host indices and device indices here,
+            # this is because we need to guarantee that these tensors are
+            # still alive when the write stream is executing.
+            if host_indices.is_cuda:
+                host_indices.record_stream(self.write_stream)
+            if device_indices.is_cuda:
+                device_indices.record_stream(self.write_stream)
+
+        self.ack_write_queue.append(HiCacheAck(start_event, finish_event, op.node_ids))
+
+    def load(
+        self,
+        host_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = -1,
+    ) -> Optional[torch.Tensor]:
+        """
+        Load KV caches from host memory to device memory.
+        """
+        device_indices = self.mem_pool_device_allocator.alloc(len(host_indices))
+        if device_indices is None:
+            return None
+        self.mem_pool_host.protect_load(host_indices)
+        self.load_queue.append(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        return device_indices
+
+    def move_indices(self, op: CacheOperation):
+        host_indices, device_indices = op.host_indices, op.device_indices
+        # move indices to GPU if using kernels, to host if using direct indexing
+        if self.io_backend == "kernel":
+            if not host_indices.is_cuda:
+                host_indices = host_indices.to(self.device, non_blocking=True)
+            return host_indices, device_indices
+        elif self.io_backend == "direct":
+            device_indices = device_indices.cpu()
+            host_indices, idx = host_indices.sort()
+            return host_indices, device_indices.index_select(0, idx)
+        else:
+            raise ValueError(f"Unsupported io backend")
+
+    def start_loading(self) -> int:
+        if len(self.load_queue) == 0:
+            return -1
+
+        producer_id = self.layer_done_counter.update_producer()
+        op = CacheOperation.merge_ops(self.load_queue)
+        host_indices, device_indices = self.move_indices(op)
+        self.load_queue.clear()
+        producer_event = self.layer_done_counter.events[producer_id]
+        producer_event.start_event.record()
+
+        with torch.cuda.stream(self.load_stream):
+            producer_event.start_event.wait(self.load_stream)
+            for i in range(self.layer_num):
+                self.mem_pool_host.load_to_device_per_layer(
+                    self.mem_pool_device,
+                    host_indices,
+                    device_indices,
+                    i,
+                    self.io_backend,
+                )
+                producer_event.complete(i)
+            self.mem_pool_host.complete_io(op.host_indices)
+            # NOTE: We must save the host indices and device indices here,
+            # this is because we need to guarantee that these tensors are
+            # still alive when the load stream is executing.
+            if host_indices.is_cuda:
+                host_indices.record_stream(self.load_stream)
+            if device_indices.is_cuda:
+                device_indices.record_stream(self.load_stream)
+
+        self.ack_load_queue.append(
+            HiCacheAck(
+                start_event=producer_event.start_event,
+                finish_event=producer_event.finish_event,
+                node_ids=op.node_ids,
+            )
+        )
+        return producer_id
+
+    def evict_device(
+        self, device_indices: torch.Tensor, host_indices: torch.Tensor
+    ) -> int:
+        if self.mem_pool_host.is_synced(host_indices):
+            self.mem_pool_device_allocator.free(device_indices)
+            self.mem_pool_host.update_backup(host_indices)
+            return len(device_indices)
+        else:
+            raise ValueError(
+                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
+            )
+
+    def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int:
+        if not backup_only:
+            raise ValueError("Other eviction policies are not supported yet.")
+
+        if self.mem_pool_host.is_backup(host_indices):
+            self.mem_pool_host.free(host_indices)
+            return len(host_indices)
+        else:
+            raise ValueError(
+                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
+            )
+
+    def prefetch(
+        self,
+        request_id: str,
+        host_indices: torch.Tensor,
+        new_input_tokens: List[int],
+        last_hash: Optional[str] = None,
+    ) -> PrefetchOperation:
+        """
+        Prefetch KV caches from storage backend to host memory.
+        """
+        operation = PrefetchOperation(
+            request_id, host_indices, new_input_tokens, last_hash
+        )
+        self.prefetch_queue.put(operation)
+        return operation
+
+    def terminate_prefetch(self, operation):
+        operation.mark_terminate()
+        return operation.completed_tokens, operation.hash_value
+
+    def append_host_mem_release(self, host_indices: torch.Tensor):
+        chunks = host_indices.split(self.mem_pool_host.page_size)
+        for chunk in chunks:
+            self.host_mem_release_queue.put(chunk)
+
+    def _3fs_zero_copy_batch_exists(self, batch_hashes):
+        _batch_hashes, _, factor = self.mem_pool_host.get_buffer_with_hash(batch_hashes)
+        hit_page_num = self.storage_backend.batch_exists(_batch_hashes) // factor
+        return hit_page_num
+
+    def _3fs_zero_copy_page_get(self, operation, hash_values, host_indices):
+        hashes, dsts, factor = self.mem_pool_host.get_buffer_with_hash(
+            hash_values, host_indices
+        )
+        page_data = self.storage_backend.batch_get(hashes, dsts)
+        if page_data:
+            inc = self.page_size * len(hashes) // factor
+            operation.increment(inc)
+        else:
+            logger.warning(
+                f"Prefetch operation {operation.request_id} failed to retrieve page {hashes}."
+            )
+
+    def _mooncake_page_get(self, operation, hash_values, host_indices):
+        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
+            hash_values,
+            host_indices,
+            self.storage_config.tp_rank,
+        )
+        get_result = self.storage_backend.batch_get(
+            key_strs,
+            target_locations=buffer_ptrs,
+            target_sizes=buffer_sizes,
+        )
+        if get_result != len(hash_values):
+            logger.warning(
+                f"Prefetch operation {operation.request_id} failed or partially failed."
+            )
+        if get_result != 0:
+            operation.increment(get_result * self.page_size)
+
+    def _generic_page_get(self, operation, hash_values, host_indices):
+        dummy_page_dst = [
+            self.mem_pool_host.get_dummy_flat_data_page() for _ in hash_values
+        ]
+        page_data = self.storage_backend.batch_get(hash_values, dummy_page_dst)
+        if page_data is None:
+            return
+        for i in range(len(hash_values)):
+            if page_data[i] is None:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {hash_values[i]}."
+                )
+                break
+            # Must set the data before increasing the completed tokens.
+            # Otherwise this page may be read before being set.
+            self.mem_pool_host.set_from_flat_data_page(
+                host_indices[i * self.page_size],
+                page_data[i],
+            )
+            if not operation.increment(self.page_size):
+                break  # Operation terminated by controller
+
+    def _page_transfer(self, operation):
+        # Transfer batch by batch
+        for i in range(0, len(operation.hash_value), self.storage_batch_size):
+            batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
+            ]
+            prev_completed_tokens = operation.completed_tokens
+            # Get one batch token, and update the completed_tokens if succeed
+            self.page_get_func(operation, batch_hashes, batch_host_indices)
+            # Check termination
+            if (
+                operation.completed_tokens
+                != prev_completed_tokens + len(batch_hashes) * self.page_size
+            ):
+                operation.mark_terminate()
+                break  # Some operations fail or operation terminated by controller
+        # release pre-allocated memory
+        self.append_host_mem_release(
+            operation.host_indices[operation.completed_tokens :]
+        )
+
+    def prefetch_io_aux_func(self):
+        """
+        Auxiliary function conducting IO operations for prefetching.
+        """
+        while not self.stop_event.is_set():
+            try:
+                operation = self.prefetch_buffer.get(block=True, timeout=1)
+                self._page_transfer(operation)
+                # operation terminated by controller, release pre-allocated memory
+                self.append_host_mem_release(
+                    operation.host_indices[operation.completed_tokens :]
+                )
+            except Empty:
+                continue
+
+    def prefetch_rate_limited(self) -> bool:
+        """
+        Rate limit the prefetching operations to avoid overwhelming the storage backend.
+        """
+        # cancel prefetch if too much memory is occupied
+        if self.prefetch_tokens_occupied >= self.prefetch_capacity_limit:
+            return True
+        # todo: more sophisticated rate limiting based on storage backend performance
+        return False
+
+    def _storage_hit_query(self, operation) -> tuple[list[str], int]:
+        last_hash = operation.last_hash
+        tokens_to_fetch = operation.token_ids
+
+        storage_query_count = 0
+        hash_value = []
+
+        for start in range(
+            0, len(tokens_to_fetch), self.page_size * self.storage_batch_size
+        ):
+            end = min(
+                start + self.page_size * self.storage_batch_size, len(tokens_to_fetch)
+            )
+            batch_tokens = tokens_to_fetch[start:end]
+            batch_hashes = []
+            for i in range(0, len(batch_tokens), self.page_size):
+                last_hash = self.get_hash_str(
+                    batch_tokens[i : i + self.page_size], last_hash
+                )
+                batch_hashes.append(last_hash)
+            hit_page_num = self.batch_exists_func(batch_hashes)
+            hash_value.extend(batch_hashes[:hit_page_num])
+            storage_query_count += hit_page_num * self.page_size
+            if hit_page_num < len(batch_hashes):
+                break
+        return hash_value, storage_query_count
+
+    def prefetch_thread_func(self):
+        """
+        Manage prefetching operations from storage backend to host memory.
+        """
+        self.prefetch_buffer = Queue()
+        aux_thread = threading.Thread(target=self.prefetch_io_aux_func, daemon=True)
+        aux_thread.start()
+        while (not self.stop_event.is_set()) or not self.prefetch_queue.empty():
+            try:
+                operation = self.prefetch_queue.get(block=True, timeout=1)
+                if operation is None:
+                    continue
+
+                hash_value, storage_hit_count = self._storage_hit_query(operation)
+                if self.tp_world_size > 1:
+                    storage_hit_count_tensor = torch.tensor(
+                        storage_hit_count, dtype=torch.int
+                    )
+                    torch.distributed.all_reduce(
+                        storage_hit_count_tensor,
+                        op=torch.distributed.ReduceOp.MIN,
+                        group=self.prefetch_tp_group,
+                    )
+                    storage_hit_count = storage_hit_count_tensor.item()
+
+                if storage_hit_count < self.prefetch_threshold:
+                    # not to prefetch if not enough benefits
+                    self.prefetch_revoke_queue.put(operation.request_id)
+                    self.append_host_mem_release(operation.host_indices)
+                    logger.debug(
+                        f"Revoking prefetch for request {operation.request_id} due to insufficient hits ({storage_hit_count})."
+                    )
+                else:
+                    operation.hash_value = hash_value[
+                        : (storage_hit_count // self.page_size)
+                    ]
+                    # free the pre-allocated memory for pages that are not hit
+                    self.append_host_mem_release(
+                        operation.host_indices[storage_hit_count:]
+                    )
+                    operation.host_indices = operation.host_indices[:storage_hit_count]
+                    logger.debug(
+                        f"Prefetching {len(operation.hash_value)} pages for request {operation.request_id}."
+                    )
+                    self.prefetch_buffer.put(operation)
+
+            except Empty:
+                continue
+
+    def write_storage(
+        self,
+        host_indices: torch.Tensor,
+        token_ids: List[int],
+        hash_value: Optional[List[str]] = None,
+    ) -> int:
+        """
+        Write KV caches from host memory to storage backend.
+        """
+        operation = StorageOperation(host_indices, token_ids, hash_value=hash_value)
+        self.backup_queue.put(operation)
+        return operation.id
+
+    # non-zero copy
+    def _generic_page_set(self, hash_values, host_indices) -> bool:
+        data = [
+            self.mem_pool_host.get_flat_data_page(host_indices[i * self.page_size])
+            for i in range(len(hash_values))
+        ]
+        return self.storage_backend.batch_set(hash_values, data)
+
+    # zero copy
+    def _mooncake_page_set(self, hash_values, host_indices) -> bool:
+        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
+            hash_values,
+            host_indices,
+            self.storage_config.tp_rank,
+        )
+        success = self.storage_backend.batch_set(
+            key_strs,
+            target_locations=buffer_ptrs,
+            target_sizes=buffer_sizes,
+        )
+        return success
+
+    # zero copy
+    def _3fs_zero_copy_page_set(self, hash_values, host_indices) -> bool:
+        hashes, dsts, _ = self.mem_pool_host.get_buffer_with_hash(
+            hash_values, host_indices
+        )
+        return self.storage_backend.batch_set(hashes, dsts)
+
+    # Backup batch by batch
+    def _page_backup(self, operation):
+        # Backup batch by batch
+        for i in range(0, len(operation.hash_value), self.storage_batch_size):
+            batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
+            ]
+            # Set one batch token, and record if success.
+            # todo: allow partial success
+            success = self.page_set_func(batch_hashes, batch_host_indices)
+            if not success:
+                logger.warning(
+                    f"Write page to storage: {len(batch_hashes)} pages failed."
+                )
+                break
+            operation.completed_tokens += self.page_size * len(batch_hashes)
+
+    def backup_thread_func(self):
+        """
+        Manage backup operations from host memory to storage backend.
+        """
+        while not self.stop_event.is_set():
+            try:
+                operation = self.backup_queue.get(block=True, timeout=1)
+                if operation is None:
+                    continue
+
+                if not self.backup_skip:
+                    self._page_backup(operation)
+                self.ack_backup_queue.put(operation)
+
+            except Empty:
+                continue
diff --git a/python/sglang/srt/managers/configure_logging.py b/python/sglang/srt/managers/configure_logging.py
new file mode 100644
index 000000000..0dc78edfa
--- /dev/null
+++ b/python/sglang/srt/managers/configure_logging.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Configure the logging settings of a server.
+
+Usage:
+python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    parser.add_argument("--log-requests", action="store_true")
+    parser.add_argument("--log-requests-level", type=int, default=3)
+    parser.add_argument(
+        "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump"
+    )
+    parser.add_argument("--dump-requests-threshold", type=int, default=1000)
+    args = parser.parse_args()
+
+    response = requests.post(
+        args.url + "/configure_logging",
+        json={
+            "log_requests": args.log_requests,
+            "log_requests_level": args.log_requests_level,  # Log full requests
+            "dump_requests_folder": args.dump_requests_folder,
+            "dump_requests_threshold": args.dump_requests_threshold,
+        },
+    )
+    assert response.status_code == 200
diff --git a/python/sglang/srt/managers/data_parallel_controller.py b/python/sglang/srt/managers/data_parallel_controller.py
new file mode 100644
index 000000000..a7bb6d13a
--- /dev/null
+++ b/python/sglang/srt/managers/data_parallel_controller.py
@@ -0,0 +1,390 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A controller that dispatches requests to multiple data parallel workers."""
+
+import faulthandler
+import logging
+import multiprocessing as mp
+import signal
+import struct
+import sys
+import threading
+import time
+from enum import Enum, auto
+from multiprocessing import shared_memory
+from typing import Dict, List
+
+import psutil
+import setproctitle
+import zmq
+
+from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
+from sglang.srt.managers.io_struct import (
+    BlockReqInput,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+)
+from sglang.srt.managers.schedule_batch import Req
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.managers.utils import DPBalanceMeta
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils import (
+    bind_port,
+    configure_logger,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+)
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class LoadBalanceMethod(Enum):
+    """Load balance method."""
+
+    ROUND_ROBIN = auto()
+    SHORTEST_QUEUE = auto()
+    MINIMUM_TOKENS = auto()
+
+    @classmethod
+    def from_str(cls, method: str):
+        method = method.upper()
+        try:
+            return cls[method]
+        except KeyError as exc:
+            raise ValueError(f"Invalid load balance method: {method}") from exc
+
+
+class DataParallelController:
+    """A controller that dispatches requests to multiple data parallel workers."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        dp_balance_meta: DPBalanceMeta,
+    ) -> None:
+        # for dp balance
+        self.global_balance_id = 0
+        self.balance_meta = dp_balance_meta
+
+        # Parse args
+        self.max_total_num_tokens = None
+        self.server_args = server_args
+        self.port_args = port_args
+        self.load_balance_method = LoadBalanceMethod.from_str(
+            server_args.load_balance_method
+        )
+
+        # Init inter-process communication
+        self.context = zmq.Context(1 + server_args.dp_size)
+        if server_args.node_rank == 0:
+            self.recv_from_tokenizer = get_zmq_socket(
+                self.context, zmq.PULL, port_args.scheduler_input_ipc_name, False
+            )
+
+        # Dispatch method
+        self.round_robin_counter = 0
+        dispatch_lookup = {
+            LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
+            LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
+            LoadBalanceMethod.MINIMUM_TOKENS: self.minimum_tokens_scheduler,
+        }
+        self.dispatching = dispatch_lookup[self.load_balance_method]
+
+        # Launch data parallel workers
+        self.scheduler_procs = []
+        self.workers: List[zmq.Socket] = [None] * server_args.dp_size
+
+        if server_args.enable_dp_attention:
+            dp_port_args = self.launch_dp_attention_schedulers(server_args, port_args)
+            self.control_message_step = server_args.tp_size
+        else:
+            dp_port_args = self.launch_dp_schedulers(server_args, port_args)
+            self.control_message_step = 1
+
+        # Only node rank 0 runs the real data parallel controller that dispatches the requests.
+        if server_args.node_rank == 0:
+            for dp_rank in range(server_args.dp_size):
+                self.workers[dp_rank] = get_zmq_socket(
+                    self.context,
+                    zmq.PUSH,
+                    dp_port_args[dp_rank].scheduler_input_ipc_name,
+                    True,
+                )
+
+        self.max_req_input_len = None
+
+    def launch_dp_schedulers(self, server_args, port_args):
+        base_gpu_id = 0
+
+        threads = []
+        sockets = []
+        dp_port_args = []
+        ready_events = []
+        for dp_rank in range(server_args.dp_size):
+            tmp_port_args = PortArgs.init_new(server_args)
+            tmp_port_args.tokenizer_ipc_name = port_args.tokenizer_ipc_name
+            tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
+            dp_port_args.append(tmp_port_args)
+
+            # This port is checked free in PortArgs.init_new.
+            # We hold it first so that the next dp worker gets a different port
+            sockets.append(bind_port(tmp_port_args.nccl_port))
+
+            ready_event = threading.Event()
+            ready_events.append(ready_event)
+
+            # Create a thread for each worker
+            thread = threading.Thread(
+                target=self.launch_tensor_parallel_group_thread,
+                args=(server_args, tmp_port_args, base_gpu_id, dp_rank, ready_event),
+            )
+            threads.append(thread)
+            base_gpu_id += server_args.tp_size * server_args.gpu_id_step
+
+        # Free all sockets before starting the threads to launch TP workers
+        for sock in sockets:
+            sock.close()
+
+        # Start all threads
+        for thread in threads:
+            thread.start()
+        for event in ready_events:
+            event.wait()
+
+        return dp_port_args
+
+    def launch_tensor_parallel_group_thread(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+        ready_event: threading.Event,
+    ):
+        self.launch_tensor_parallel_group(server_args, port_args, base_gpu_id, dp_rank)
+        ready_event.set()
+
+        # This thread cannot be closed because otherwise the `kill_itself_when_parent_died`
+        # function in scheduler.py will kill the scheduler.
+        while True:
+            time.sleep(30 * 24 * 3600)
+
+    def launch_dp_attention_schedulers(self, server_args, port_args):
+        self.launch_tensor_parallel_group(server_args, port_args, 0, None)
+        dp_port_args = []
+        for dp_rank in range(server_args.dp_size):
+            dp_port_args.append(PortArgs.init_new(server_args, dp_rank))
+        return dp_port_args
+
+    def launch_tensor_parallel_group(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+    ):
+        if not server_args.enable_dp_attention:
+            logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")
+
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+
+        scheduler_pipe_readers = []
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
+        tp_rank_range = range(
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+        )
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+        )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                rank_port_args = port_args
+
+                if server_args.enable_dp_attention:
+                    # dp attention has different sharding logic
+                    _, _, dp_rank = compute_dp_attention_world_info(
+                        server_args.enable_dp_attention,
+                        tp_rank,
+                        server_args.tp_size,
+                        server_args.dp_size,
+                    )
+                    # compute zmq ports for this dp rank
+                    rank_port_args = PortArgs.init_new(server_args, dp_rank)
+                    # Data parallelism reuses the tensor parallelism group,
+                    # so all dp ranks should use the same nccl port.
+                    rank_port_args.nccl_port = port_args.nccl_port
+
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        rank_port_args,
+                        gpu_id,
+                        tp_rank,
+                        moe_ep_rank,
+                        pp_rank,
+                        dp_rank,
+                        writer,
+                        self.balance_meta,
+                    ),
+                )
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
+                self.scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)
+
+        # Wait for model to finish loading
+        scheduler_info = []
+        for i in range(len(scheduler_pipe_readers)):
+            scheduler_info.append(scheduler_pipe_readers[i].recv())
+
+        self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
+        self.max_req_input_len = scheduler_info[0]["max_req_input_len"]
+
+    def maybe_external_dp_rank_routing(self, req: Req):
+        if req.data_parallel_rank is not None:
+            logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+            self.workers[req.data_parallel_rank].send_pyobj(req)
+            return True
+        return False
+
+    def round_robin_scheduler(self, req: Req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+
+        if self.server_args.disaggregation_mode == "null":
+            self.workers[self.round_robin_counter].send_pyobj(req)
+            self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                self.workers
+            )
+        else:
+            self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
+
+    def shortest_queue_scheduler(self, input_requests):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+        raise NotImplementedError()
+
+    def minimum_tokens_scheduler(self, req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+
+        # This variable corresponds to the balance_id in TokenizedGenerateReqInput.
+        # We use it to to control the number of onfly tokens (requests dispatched to workers but not yet received).
+        def get_next_global_balance_id() -> int:
+            INT32_MAX = 2147483647
+            current_id = self.global_balance_id
+            self.global_balance_id = (self.global_balance_id + 1) % INT32_MAX
+            return current_id
+
+        req.dp_balance_id = get_next_global_balance_id()
+        with self.balance_meta.mutex:
+            # 1. local_tokens represents the tokens currently inferring on the worker,
+            #  while onfly refers to the requests dispatched by the dispatcher but not yet received by the scheduler.
+            onfly_info = self.balance_meta.get_shared_onfly()
+            local_tokens = self.balance_meta.get_shared_local_tokens()
+            total_tokens = [
+                local_token + sum(onfly_dict.values())
+                for local_token, onfly_dict in zip(local_tokens, onfly_info)
+            ]
+            target_worker = total_tokens.index(min(total_tokens))
+            onfly_info[target_worker][req.dp_balance_id] = len(req.input_ids)
+            # 2. write the new onfly info to the shm
+            self.balance_meta.set_shared_onfly_info(onfly_info)
+
+        # logger.info(f"dp workers {local_tokens=}, {onfly_info=}, {target_worker=}")
+        self.workers[target_worker].send_pyobj(req)
+
+    def event_loop(self):
+        while True:
+            while True:
+                try:
+                    recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                except zmq.ZMQError:
+                    break
+
+                if isinstance(
+                    recv_req,
+                    (
+                        TokenizedGenerateReqInput,
+                        TokenizedEmbeddingReqInput,
+                    ),
+                ):
+                    self.dispatching(recv_req)
+                elif isinstance(recv_req, BlockReqInput):
+                    for worker in self.workers:
+                        worker.send_pyobj(recv_req)
+                else:
+                    # Send other control messages to first worker of tp group
+                    for worker in self.workers[:: self.control_message_step]:
+                        worker.send_pyobj(recv_req)
+
+
+def run_data_parallel_controller_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    pipe_writer,
+):
+    kill_itself_when_parent_died()
+    setproctitle.setproctitle("sglang::data_parallel_controller")
+    faulthandler.enable()
+    configure_logger(server_args)
+    parent_process = psutil.Process().parent()
+    balance_meta = DPBalanceMeta(server_args.dp_size)
+
+    try:
+        controller = DataParallelController(
+            server_args, port_args, dp_balance_meta=balance_meta
+        )
+        pipe_writer.send(
+            {
+                "status": "ready",
+                "max_total_num_tokens": controller.max_total_num_tokens,
+                "max_req_input_len": controller.max_req_input_len,
+            }
+        )
+        if server_args.node_rank == 0:
+            controller.event_loop()
+        for proc in controller.scheduler_procs:
+            proc.join()
+            logger.error(
+                f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
+            )
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"DataParallelController hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
+    finally:
+        # we need to destruct mp.Manager() in balance_meta
+        balance_meta.destructor()
diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py
new file mode 100644
index 000000000..bc58f4ee5
--- /dev/null
+++ b/python/sglang/srt/managers/detokenizer_manager.py
@@ -0,0 +1,303 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""DetokenizerManager is a process that detokenizes the token ids."""
+
+import dataclasses
+import logging
+import os
+import signal
+from collections import OrderedDict
+from typing import Dict, List, Union
+
+import psutil
+import setproctitle
+import zmq
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.managers.io_struct import (
+    BatchEmbeddingOut,
+    BatchMultimodalDecodeReq,
+    BatchMultimodalOut,
+    BatchStrOut,
+    BatchTokenIDOut,
+    FreezeGCReq,
+    MultiTokenizerRegisterReq,
+)
+from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    configure_logger,
+    freeze_gc,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+)
+from sglang.utils import (
+    TypeBasedDispatcher,
+    find_printable_text,
+    get_exception_traceback,
+)
+
+logger = logging.getLogger(__name__)
+
+# Maximum number of request states that detokenizer can hold. When exceeded,
+# oldest request states will be evicted. Default: 65536 (1<<16).
+# For more details, see: https://github.com/sgl-project/sglang/issues/2812
+# Use power of 2 values for better memory allocation.
+DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 << 16))
+
+
+@dataclasses.dataclass
+class DecodeStatus:
+    """Store the status of incremental decoding."""
+
+    decoded_text: str
+    decode_ids: List[int]
+    surr_offset: int
+    read_offset: int
+    # Offset that's sent to tokenizer for incremental update.
+    sent_offset: int = 0
+
+
+class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
+    """DetokenizerManager is a process that detokenizes the token ids."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        # Init inter-process communication
+        context = zmq.Context(2)
+        self.recv_from_scheduler = get_zmq_socket(
+            context, zmq.PULL, port_args.detokenizer_ipc_name, True
+        )
+        self.send_to_tokenizer = get_zmq_socket(
+            context, zmq.PUSH, port_args.tokenizer_ipc_name, False
+        )
+
+        if server_args.skip_tokenizer_init:
+            self.tokenizer = None
+        else:
+            self.tokenizer = get_tokenizer(
+                server_args.tokenizer_path,
+                tokenizer_mode=server_args.tokenizer_mode,
+                trust_remote_code=server_args.trust_remote_code,
+                revision=server_args.revision,
+            )
+
+        self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
+        self.is_dummy = server_args.load_format == "dummy"
+
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (BatchEmbeddingOut, self.handle_batch_embedding_out),
+                (BatchTokenIDOut, self.handle_batch_token_id_out),
+                (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+                (MultiTokenizerRegisterReq, lambda x: x),
+                (FreezeGCReq, self.handle_freeze_gc_req),
+            ]
+        )
+
+        self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
+
+    def event_loop(self):
+        """The event loop that handles requests"""
+        while True:
+            recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            if output is not None:
+                self.send_to_tokenizer.send_pyobj(output)
+
+    def trim_matched_stop(
+        self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
+    ):
+        if no_stop_trim or not finished_reason:
+            return output
+
+        matched = finished_reason.get("matched", None)
+        if not matched:
+            return output
+
+        # TODO(lmzheng): handle the case where multiple stop strs are hit
+
+        # Trim stop str.
+        if isinstance(matched, str) and isinstance(output, str):
+            pos = output.find(matched)
+            return output[:pos] if pos != -1 else output
+
+        # Trim stop token.
+        if isinstance(matched, int) and isinstance(output, list):
+            # 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
+            if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
+                return output
+            assert len(output) > 0
+            return output[:-1]
+        return output
+
+    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOut):
+        # If it is embedding model, no detokenization is needed.
+        return recv_obj
+
+    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOut):
+        bs = len(recv_obj.rids)
+
+        # Initialize decode status
+        read_ids, surr_ids = [], []
+        for i in range(bs):
+            rid = recv_obj.rids[i]
+            if rid not in self.decode_status:
+                s = DecodeStatus(
+                    decoded_text=recv_obj.decoded_texts[i],
+                    decode_ids=recv_obj.decode_ids[i],
+                    surr_offset=0,
+                    read_offset=recv_obj.read_offsets[i],
+                )
+                self.decode_status[rid] = s
+            else:
+                s = self.decode_status[rid]
+                s.decode_ids.extend(recv_obj.decode_ids[i])
+
+            read_ids.append(
+                self.trim_matched_stop(
+                    s.decode_ids[s.surr_offset :],
+                    recv_obj.finished_reasons[i],
+                    recv_obj.no_stop_trim[i],
+                )
+            )
+            surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
+
+        # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
+        surr_texts = self.tokenizer.batch_decode(
+            surr_ids,
+            skip_special_tokens=recv_obj.skip_special_tokens[0],
+            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+        )
+        read_texts = self.tokenizer.batch_decode(
+            read_ids,
+            skip_special_tokens=recv_obj.skip_special_tokens[0],
+            spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[0],
+        )
+
+        # Incremental decoding
+        output_strs = []
+        for i in range(bs):
+            try:
+                s = self.decode_status[recv_obj.rids[i]]
+            except KeyError:
+                raise RuntimeError(
+                    f"Decode status not found for request {recv_obj.rids[i]}. "
+                    "It may be due to the request being evicted from the decode status due to memory pressure. "
+                    "Please increase the maximum number of requests by setting "
+                    "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
+                    f"The current value is {DETOKENIZER_MAX_STATES}. "
+                    "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
+                )
+            new_text = read_texts[i][len(surr_texts[i]) :]
+            if recv_obj.finished_reasons[i] is None:
+                # Streaming chunk: update the decode status
+                if len(new_text) > 0 and not new_text.endswith("�"):
+                    s.decoded_text = s.decoded_text + new_text
+                    s.surr_offset = s.read_offset
+                    s.read_offset = len(s.decode_ids)
+                    new_text = ""
+                else:
+                    new_text = find_printable_text(new_text)
+
+            output_str = self.trim_matched_stop(
+                s.decoded_text + new_text,
+                recv_obj.finished_reasons[i],
+                recv_obj.no_stop_trim[i],
+            )
+            # Incrementally send text.
+            incremental_output = output_str[s.sent_offset :]
+            s.sent_offset = len(output_str)
+            output_strs.append(incremental_output)
+
+        return BatchStrOut(
+            rids=recv_obj.rids,
+            finished_reasons=recv_obj.finished_reasons,
+            output_strs=output_strs,
+            output_ids=recv_obj.decode_ids,
+            prompt_tokens=recv_obj.prompt_tokens,
+            completion_tokens=recv_obj.completion_tokens,
+            cached_tokens=recv_obj.cached_tokens,
+            spec_verify_ct=recv_obj.spec_verify_ct,
+            input_token_logprobs_val=recv_obj.input_token_logprobs_val,
+            input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
+            output_token_logprobs_val=recv_obj.output_token_logprobs_val,
+            output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
+            input_top_logprobs_val=recv_obj.input_top_logprobs_val,
+            input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
+            output_top_logprobs_val=recv_obj.output_top_logprobs_val,
+            output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
+            input_token_ids_logprobs_val=recv_obj.input_token_ids_logprobs_val,
+            input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
+            output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
+            output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
+            output_hidden_states=recv_obj.output_hidden_states,
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+
+    def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
+        outputs = self.tokenizer.detokenize(recv_obj)
+        return BatchMultimodalOut(
+            rids=recv_obj.rids,
+            finished_reasons=recv_obj.finished_reasons,
+            outputs=outputs,
+            prompt_tokens=recv_obj.prompt_tokens,
+            completion_tokens=recv_obj.completion_tokens,
+            cached_tokens=recv_obj.cached_tokens,
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+
+    def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
+        freeze_gc("Detokenizer Manager")
+        return None
+
+
+class LimitedCapacityDict(OrderedDict):
+    def __init__(self, capacity: int, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.capacity = capacity
+
+    def __setitem__(self, key, value):
+        if len(self) >= self.capacity:
+            # Remove the oldest element (first item in the dict)
+            self.popitem(last=False)
+        # Set the new item
+        super().__setitem__(key, value)
+
+
+def run_detokenizer_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+):
+    kill_itself_when_parent_died()
+    setproctitle.setproctitle("sglang::detokenizer")
+    configure_logger(server_args)
+    parent_process = psutil.Process().parent()
+
+    try:
+        manager = DetokenizerManager(server_args, port_args)
+        if server_args.tokenizer_worker_num > 1:
+            manager.multi_http_worker_event_loop()
+        else:
+            manager.event_loop()
+    except Exception:
+        manager.socket_mapping.clear_all_sockets()
+        traceback = get_exception_traceback()
+        logger.error(f"DetokenizerManager hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/python/sglang/srt/managers/disagg_service.py b/python/sglang/srt/managers/disagg_service.py
new file mode 100644
index 000000000..df0eac48b
--- /dev/null
+++ b/python/sglang/srt/managers/disagg_service.py
@@ -0,0 +1,46 @@
+"""Start bootstrap/kv-store-related server"""
+
+import os
+from typing import Type
+
+from sglang.srt.disaggregation.base import BaseKVBootstrapServer
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    KVClassType,
+    TransferBackend,
+    get_kv_class,
+)
+from sglang.srt.server_args import ServerArgs
+
+
+def start_disagg_service(
+    server_args: ServerArgs,
+):
+    # Start kv boostrap server on prefill
+    disagg_mode = DisaggregationMode(server_args.disaggregation_mode)
+    transfer_backend = TransferBackend(server_args.disaggregation_transfer_backend)
+
+    if disagg_mode == DisaggregationMode.PREFILL:
+        # only start bootstrap server on prefill tm
+        kv_bootstrap_server_class: Type[BaseKVBootstrapServer] = get_kv_class(
+            transfer_backend, KVClassType.BOOTSTRAP_SERVER
+        )
+        bootstrap_server: BaseKVBootstrapServer = kv_bootstrap_server_class(
+            host=server_args.host,
+            port=server_args.disaggregation_bootstrap_port,
+        )
+        is_create_store = (
+            server_args.node_rank == 0 and transfer_backend == TransferBackend.ASCEND
+        )
+        if is_create_store:
+            try:
+                from mf_adapter import create_config_store
+
+                ascend_url = os.getenv("ASCEND_MF_STORE_URL")
+                create_config_store(ascend_url)
+            except Exception as e:
+                error_message = f"Failed create mf store, invalid ascend_url."
+                error_message += f" With exception {e}"
+                raise error_message
+
+        return bootstrap_server
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
new file mode 100644
index 000000000..6237cd383
--- /dev/null
+++ b/python/sglang/srt/managers/io_struct.py
@@ -0,0 +1,1332 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+The definition of objects transferred between different
+processes (TokenizerManager, DetokenizerManager, Scheduler).
+"""
+
+import copy
+import uuid
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from sglang.srt.lora.lora_registry import LoRARef
+from sglang.srt.managers.schedule_batch import BaseFinishReason
+from sglang.srt.multimodal.mm_utils import has_valid_data
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.utils import ImageData
+
+# Handle serialization of Image for pydantic
+if TYPE_CHECKING:
+    from PIL.Image import Image
+else:
+    Image = Any
+
+
+@dataclass
+class SessionParams:
+    id: Optional[str] = None
+    rid: Optional[str] = None
+    offset: Optional[int] = None
+    replace: Optional[bool] = None
+    drop_previous_output: Optional[bool] = None
+
+
+# Type definitions for multimodal input data
+# Individual data item types for each modality
+ImageDataInputItem = Union[Image, str, ImageData, Dict]
+AudioDataInputItem = Union[str, Dict]
+VideoDataInputItem = Union[str, Dict]
+# Union type for any multimodal data item
+MultimodalDataInputItem = Union[
+    ImageDataInputItem, VideoDataInputItem, AudioDataInputItem
+]
+# Format types supporting single items, lists, or nested lists for batch processing
+MultimodalDataInputFormat = Union[
+    List[List[MultimodalDataInputItem]],
+    List[MultimodalDataInputItem],
+    MultimodalDataInputItem,
+]
+
+
+@dataclass
+class GenerateReqInput:
+    # The input prompt. It can be a single prompt or a batch of prompts.
+    text: Optional[Union[List[str], str]] = None
+    # The token ids for text; one can specify either text or input_ids
+    input_ids: Optional[Union[List[List[int]], List[int]]] = None
+    # The embeddings for input_ids; one can specify either text or input_ids or input_embeds.
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+    # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+    # Can be formatted as:
+    # - Single image for a single request
+    # - List of images (one per request in a batch)
+    # - List of lists of images (multiple images per request)
+    # See also python/sglang/srt/utils.py:load_image for more details.
+    image_data: Optional[MultimodalDataInputFormat] = None
+    # The video input. Like image data, it can be a file name, a url, or base64 encoded string.
+    video_data: Optional[MultimodalDataInputFormat] = None
+    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
+    audio_data: Optional[MultimodalDataInputFormat] = None
+    # The sampling_params. See descriptions below.
+    sampling_params: Optional[Union[List[Dict], Dict]] = None
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Whether to return logprobs.
+    return_logprob: Optional[Union[List[bool], bool]] = None
+    # If return logprobs, the start location in the prompt for returning logprobs.
+    # By default, this value is "-1", which means it will only return logprobs for output tokens.
+    logprob_start_len: Optional[Union[List[int], int]] = None
+    # If return logprobs, the number of top logprobs to return at each position.
+    top_logprobs_num: Optional[Union[List[int], int]] = None
+    # If return logprobs, the token ids to return logprob for.
+    token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None
+    # Whether to detokenize tokens in text in the returned logprobs.
+    return_text_in_logprobs: bool = False
+    # Whether to stream output.
+    stream: bool = False
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True
+    # Whether to return hidden states
+    return_hidden_states: Union[List[bool], bool] = False
+
+    # The modalities of the image data [image, multi-images, video]
+    modalities: Optional[List[str]] = None
+    # Session info for continual prompting
+    session_params: Optional[Union[List[Dict], Dict]] = None
+
+    # The path to the LoRA adaptors
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    # The uid of LoRA adaptors, should be initialized by tokenizer manager
+    lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None
+
+    # Custom logit processor for advanced sampling control. Must be a serialized instance
+    # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
+    # Use the processor's `to_str()` method to generate the serialized string.
+    custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
+
+    # For disaggregated inference
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+    bootstrap_pair_key: Optional[Union[List[str], str]] = None
+
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+
+    # For background responses (OpenAI responses API)
+    background: bool = False
+
+    # Conversation id used for tracking requests
+    conversation_id: Optional[str] = None
+
+    # Label for the request
+    label: Optional[str] = None
+
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # Image gen grpc migration
+    return_bytes: bool = False
+
+    def contains_mm_input(self) -> bool:
+        return (
+            has_valid_data(self.image_data)
+            or has_valid_data(self.video_data)
+            or has_valid_data(self.audio_data)
+        )
+
+    def normalize_batch_and_arguments(self):
+        """
+        Normalize the batch size and arguments for the request.
+
+        This method resolves various input formats and ensures all parameters
+        are properly formatted as either single values or batches depending on the input.
+        It also handles parallel sampling expansion and sets default values for
+        unspecified parameters.
+
+        Raises:
+            ValueError: If inputs are not properly specified (e.g., none or all of
+                       text, input_ids, input_embeds are provided)
+        """
+        self._validate_inputs()
+        self._determine_batch_size()
+        self._handle_parallel_sampling()
+
+        if self.is_single:
+            self._normalize_single_inputs()
+        else:
+            self._normalize_batch_inputs()
+
+    def _validate_inputs(self):
+        """Validate that the input configuration is valid."""
+        if (
+            self.text is None and self.input_ids is None and self.input_embeds is None
+        ) or (
+            self.text is not None
+            and self.input_ids is not None
+            and self.input_embeds is not None
+        ):
+            raise ValueError(
+                "Either text, input_ids or input_embeds should be provided."
+            )
+
+    def _determine_batch_size(self):
+        """Determine if this is a single example or a batch and the batch size."""
+        if self.text is not None:
+            if isinstance(self.text, str):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.text)
+            self.input_embeds = None
+        elif self.input_ids is not None:
+            if len(self.input_ids) == 0:
+                raise ValueError("input_ids cannot be empty.")
+            if isinstance(self.input_ids[0], int):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.input_ids)
+            self.input_embeds = None
+        else:
+            if isinstance(self.input_embeds[0][0], float):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.input_embeds)
+
+    def _handle_parallel_sampling(self):
+        """Handle parallel sampling parameters and adjust batch size if needed."""
+        # Determine parallel sample count
+        if self.sampling_params is None:
+            self.parallel_sample_num = 1
+            return
+        elif isinstance(self.sampling_params, dict):
+            self.parallel_sample_num = self.sampling_params.get("n", 1)
+        else:  # isinstance(self.sampling_params, list):
+            self.parallel_sample_num = self.sampling_params[0].get("n", 1)
+            for sampling_params in self.sampling_params:
+                if self.parallel_sample_num != sampling_params.get("n", 1):
+                    raise ValueError(
+                        "The parallel_sample_num should be the same for all samples in sample params."
+                    )
+
+        # If using parallel sampling with a single example, convert to batch
+        if self.parallel_sample_num > 1 and self.is_single:
+            self.is_single = False
+            if self.text is not None:
+                self.text = [self.text]
+            if self.input_ids is not None:
+                self.input_ids = [self.input_ids]
+            if self.input_embeds is not None:
+                self.input_embeds = [self.input_embeds]
+
+    def _normalize_single_inputs(self):
+        """Normalize inputs for a single example."""
+        if self.sampling_params is None:
+            self.sampling_params = {}
+        if self.rid is None:
+            self.rid = uuid.uuid4().hex
+        if self.return_logprob is None:
+            self.return_logprob = False
+        if self.logprob_start_len is None:
+            self.logprob_start_len = -1
+        if self.top_logprobs_num is None:
+            self.top_logprobs_num = 0
+        if not self.token_ids_logprob:  # covers both None and []
+            self.token_ids_logprob = None
+
+    def _normalize_batch_inputs(self):
+        """Normalize inputs for a batch of examples, including parallel sampling expansion."""
+        # Calculate expanded batch size
+        if self.parallel_sample_num == 1:
+            num = self.batch_size
+        else:
+            # Expand parallel_sample_num
+            num = self.batch_size * self.parallel_sample_num
+
+        # Expand input based on type
+        self._expand_inputs(num)
+        self._normalize_rid(num)
+        self._normalize_lora_paths(num)
+        self._normalize_image_data(num)
+        self._normalize_video_data(num)
+        self._normalize_audio_data(num)
+        self._normalize_sampling_params(num)
+        self._normalize_logprob_params(num)
+        self._normalize_custom_logit_processor(num)
+        self._normalize_bootstrap_params(num)
+
+    def _expand_inputs(self, num):
+        """Expand the main inputs (text, input_ids, input_embeds) for parallel sampling."""
+        if self.text is not None:
+            if not isinstance(self.text, list):
+                raise ValueError("Text should be a list for batch processing.")
+            self.text = self.text * self.parallel_sample_num
+        elif self.input_ids is not None:
+            if not isinstance(self.input_ids, list) or not isinstance(
+                self.input_ids[0], list
+            ):
+                raise ValueError(
+                    "input_ids should be a list of lists for batch processing."
+                )
+            self.input_ids = self.input_ids * self.parallel_sample_num
+        elif self.input_embeds is not None:
+            if not isinstance(self.input_embeds, list):
+                raise ValueError("input_embeds should be a list for batch processing.")
+            self.input_embeds = self.input_embeds * self.parallel_sample_num
+
+    def _normalize_lora_paths(self, num):
+        """Normalize LoRA paths for batch processing."""
+        if self.lora_path is not None:
+            if isinstance(self.lora_path, str):
+                self.lora_path = [self.lora_path] * num
+            elif isinstance(self.lora_path, list):
+                self.lora_path = self.lora_path * self.parallel_sample_num
+            else:
+                raise ValueError("lora_path should be a list or a string.")
+
+    def _normalize_image_data(self, num):
+        """Normalize image data for batch processing."""
+        if self.image_data is None:
+            self.image_data = [None] * num
+        elif not isinstance(self.image_data, list):
+            # Single image, convert to list of single-image lists
+            self.image_data = [[self.image_data]] * num
+            self.modalities = ["image"] * num
+        elif isinstance(self.image_data, list):
+            # Handle empty list case - treat as no images
+            if len(self.image_data) == 0:
+                self.image_data = [None] * num
+                return
+
+            if len(self.image_data) != self.batch_size:
+                raise ValueError(
+                    "The length of image_data should be equal to the batch size."
+                )
+
+            self.modalities = []
+            if len(self.image_data) > 0 and isinstance(self.image_data[0], list):
+                # Already a list of lists, keep as is
+                for i in range(len(self.image_data)):
+                    if self.image_data[i] is None or self.image_data[i] == [None]:
+                        self.modalities.append(None)
+                    elif len(self.image_data[i]) == 1:
+                        self.modalities.append("image")
+                    elif len(self.image_data[i]) > 1:
+                        self.modalities.append("multi-images")
+                    else:
+                        # Ensure len(self.modalities) == len(self.image_data)
+                        self.modalities.append(None)
+                # Expand parallel_sample_num
+                self.image_data = self.image_data * self.parallel_sample_num
+                self.modalities = self.modalities * self.parallel_sample_num
+            else:
+                # List of images for a batch, wrap each in a list
+                wrapped_images = [[img] for img in self.image_data]
+                # Expand for parallel sampling
+                self.image_data = wrapped_images * self.parallel_sample_num
+                self.modalities = ["image"] * num
+
+    def _normalize_video_data(self, num):
+        """Normalize video data for batch processing."""
+        if self.video_data is None:
+            self.video_data = [None] * num
+        elif not isinstance(self.video_data, list):
+            self.video_data = [self.video_data] * num
+        elif isinstance(self.video_data, list):
+            self.video_data = self.video_data * self.parallel_sample_num
+
+    def _normalize_audio_data(self, num):
+        """Normalize audio data for batch processing."""
+        if self.audio_data is None:
+            self.audio_data = [None] * num
+        elif not isinstance(self.audio_data, list):
+            self.audio_data = [self.audio_data] * num
+        elif isinstance(self.audio_data, list):
+            self.audio_data = self.audio_data * self.parallel_sample_num
+
+    def _normalize_sampling_params(self, num):
+        """Normalize sampling parameters for batch processing."""
+        if self.sampling_params is None:
+            self.sampling_params = [{}] * num
+        elif isinstance(self.sampling_params, dict):
+            self.sampling_params = [self.sampling_params] * num
+        else:  # Already a list
+            self.sampling_params = self.sampling_params * self.parallel_sample_num
+
+    def _normalize_rid(self, num):
+        """Normalize request IDs for batch processing."""
+        if self.rid is None:
+            self.rid = [uuid.uuid4().hex for _ in range(num)]
+        elif isinstance(self.rid, str):
+            new_rids = [f"{self.rid}_{i}" for i in range(num)]
+            self.rid = new_rids
+        elif isinstance(self.rid, list):
+            # Note: the length of rid shall be the same as the batch_size,
+            # as the rid would be expanded for parallel sampling in tokenizer_manager
+            if len(self.rid) != self.batch_size:
+                raise ValueError(
+                    "The specified rids length mismatch with the batch_size for batch processing."
+                )
+        else:
+            raise ValueError("The rid should be a string or a list of strings.")
+
+    def _normalize_logprob_params(self, num):
+        """Normalize logprob-related parameters for batch processing."""
+
+        # Helper function to normalize a parameter
+        def normalize_param(param, default_value, param_name):
+            if param is None:
+                return [default_value] * num
+            elif not isinstance(param, list):
+                return [param] * num
+            else:
+                if self.parallel_sample_num > 1:
+                    raise ValueError(
+                        f"Cannot use list {param_name} with parallel_sample_num > 1"
+                    )
+                return param
+
+        # Normalize each logprob parameter
+        self.return_logprob = normalize_param(
+            self.return_logprob, False, "return_logprob"
+        )
+        self.logprob_start_len = normalize_param(
+            self.logprob_start_len, -1, "logprob_start_len"
+        )
+        self.top_logprobs_num = normalize_param(
+            self.top_logprobs_num, 0, "top_logprobs_num"
+        )
+
+        # Handle token_ids_logprob specially due to its nested structure
+        if not self.token_ids_logprob:  # covers both None and []
+            self.token_ids_logprob = [None] * num
+        elif not isinstance(self.token_ids_logprob, list):
+            self.token_ids_logprob = [[self.token_ids_logprob] for _ in range(num)]
+        elif not isinstance(self.token_ids_logprob[0], list):
+            self.token_ids_logprob = [
+                copy.deepcopy(self.token_ids_logprob) for _ in range(num)
+            ]
+        elif self.parallel_sample_num > 1:
+            raise ValueError(
+                "Cannot use list token_ids_logprob with parallel_sample_num > 1"
+            )
+
+    def _normalize_custom_logit_processor(self, num):
+        """Normalize custom logit processor for batch processing."""
+        if self.custom_logit_processor is None:
+            self.custom_logit_processor = [None] * num
+        elif not isinstance(self.custom_logit_processor, list):
+            self.custom_logit_processor = [self.custom_logit_processor] * num
+        elif self.parallel_sample_num > 1:
+            raise ValueError(
+                "Cannot use list custom_logit_processor with parallel_sample_num > 1"
+            )
+
+    def _normalize_bootstrap_params(self, num):
+        """Normalize bootstrap parameters for batch processing."""
+        # Normalize bootstrap_host
+        if self.bootstrap_host is None:
+            self.bootstrap_host = [None] * num
+        elif not isinstance(self.bootstrap_host, list):
+            self.bootstrap_host = [self.bootstrap_host] * num
+        elif isinstance(self.bootstrap_host, list):
+            self.bootstrap_host = self.bootstrap_host * self.parallel_sample_num
+
+        # Normalize bootstrap_port
+        if self.bootstrap_port is None:
+            self.bootstrap_port = [None] * num
+        elif not isinstance(self.bootstrap_port, list):
+            self.bootstrap_port = [self.bootstrap_port] * num
+        elif isinstance(self.bootstrap_port, list):
+            self.bootstrap_port = self.bootstrap_port * self.parallel_sample_num
+
+        # Normalize bootstrap_room
+        if self.bootstrap_room is None:
+            self.bootstrap_room = [None] * num
+        elif not isinstance(self.bootstrap_room, list):
+            self.bootstrap_room = [self.bootstrap_room + i for i in range(num)]
+        elif isinstance(self.bootstrap_room, list):
+            self.bootstrap_room = self.bootstrap_room * self.parallel_sample_num
+
+        # Normalize bootstrap_pair_key
+        if self.bootstrap_pair_key is None:
+            self.bootstrap_pair_key = [None] * num
+        elif not isinstance(self.bootstrap_pair_key, list):
+            self.bootstrap_pair_key = [self.bootstrap_pair_key] * num
+        elif isinstance(self.bootstrap_pair_key, list):
+            self.bootstrap_pair_key = self.bootstrap_pair_key * self.parallel_sample_num
+
+    def _validate_session_params(self):
+        """Validate that session parameters are properly formatted."""
+        if self.session_params is not None:
+            if not isinstance(self.session_params, dict) and not isinstance(
+                self.session_params[0], dict
+            ):
+                raise ValueError("Session params must be a dict or a list of dicts.")
+
+    def regenerate_rid(self):
+        """Generate a new request ID and return it."""
+        self.rid = uuid.uuid4().hex
+        return self.rid
+
+    def __getitem__(self, i):
+        return GenerateReqInput(
+            text=self.text[i] if self.text is not None else None,
+            input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            input_embeds=(
+                self.input_embeds[i] if self.input_embeds is not None else None
+            ),
+            image_data=self.image_data[i],
+            video_data=self.video_data[i],
+            audio_data=self.audio_data[i],
+            sampling_params=self.sampling_params[i],
+            rid=self.rid[i],
+            return_logprob=self.return_logprob[i],
+            logprob_start_len=self.logprob_start_len[i],
+            top_logprobs_num=self.top_logprobs_num[i],
+            token_ids_logprob=self.token_ids_logprob[i],
+            return_text_in_logprobs=self.return_text_in_logprobs,
+            stream=self.stream,
+            log_metrics=self.log_metrics,
+            return_hidden_states=(
+                self.return_hidden_states[i]
+                if isinstance(self.return_hidden_states, list)
+                else self.return_hidden_states
+            ),
+            modalities=self.modalities[i] if self.modalities else None,
+            session_params=self.session_params,
+            lora_path=self.lora_path[i] if self.lora_path is not None else None,
+            lora_id=self.lora_id[i] if self.lora_id is not None else None,
+            custom_logit_processor=(
+                self.custom_logit_processor[i]
+                if self.custom_logit_processor is not None
+                else None
+            ),
+            # if `__getitem__` is called, the bootstrap_host, bootstrap_port, bootstrap_room must be a list
+            bootstrap_host=(
+                self.bootstrap_host[i] if self.bootstrap_host is not None else None
+            ),
+            bootstrap_port=(
+                self.bootstrap_port[i] if self.bootstrap_port is not None else None
+            ),
+            bootstrap_room=(
+                self.bootstrap_room[i] if self.bootstrap_room is not None else None
+            ),
+            bootstrap_pair_key=(
+                self.bootstrap_pair_key[i]
+                if self.bootstrap_pair_key is not None
+                else None
+            ),
+            data_parallel_rank=(
+                self.data_parallel_rank if self.data_parallel_rank is not None else None
+            ),
+            conversation_id=self.conversation_id,
+            label=self.label,
+            priority=self.priority,
+            return_bytes=self.return_bytes,
+        )
+
+
+@dataclass
+class TokenizedGenerateReqInput:
+    # The request id
+    rid: str
+    # The input text
+    input_text: str
+    # The input token ids
+    input_ids: List[int]
+    # The multimodal inputs
+    mm_inputs: dict
+    # The sampling parameters
+    sampling_params: SamplingParams
+    # Whether to return the logprobs
+    return_logprob: bool
+    # If return logprobs, the start location in the prompt for returning logprobs.
+    logprob_start_len: int
+    # If return logprobs, the number of top logprobs to return at each position.
+    top_logprobs_num: int
+    # If return logprobs, the token id to return logprob for
+    token_ids_logprob: List[int]
+    # Whether to stream output
+    stream: bool
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
+    # The input embeds
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+
+    # Session info for continual prompting
+    session_params: Optional[SessionParams] = None
+
+    # LoRA related
+    lora_id: Optional[str] = None  # None means just use the base model
+
+    # Custom logit processor for advanced sampling control. Must be a serialized instance
+    # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
+    # Use the processor's `to_str()` method to generate the serialized string.
+    custom_logit_processor: Optional[str] = None
+
+    # For disaggregated inference
+    bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
+    bootstrap_room: Optional[int] = None
+    bootstrap_pair_key: Optional[str] = None
+
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+
+    # For dp balance
+    dp_balance_id: int = -1
+
+    # Label for the request
+    label: Optional[str] = None
+
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # Image gen grpc migration
+    return_bytes: bool = False
+
+
+@dataclass
+class BatchTokenizedGenerateReqInput:
+    # The batch of tokenized requests
+    batch: List[TokenizedGenerateReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
+@dataclass
+class EmbeddingReqInput:
+    # The input prompt. It can be a single prompt or a batch of prompts.
+    text: Optional[Union[List[List[str]], List[str], str]] = None
+    # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+    # Can be formatted as:
+    # - Single image for a single request
+    # - List of images (one per request in a batch)
+    # - List of lists of images (multiple images per request)
+    # See also python/sglang/srt/utils.py:load_image for more details.
+    image_data: Optional[MultimodalDataInputFormat] = None
+    # The video input. Like image data, it can be a file name, a url, or base64 encoded string.
+    video_data: Optional[MultimodalDataInputFormat] = None
+    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
+    audio_data: Optional[MultimodalDataInputFormat] = None
+    # The token ids for text; one can either specify text or input_ids.
+    input_ids: Optional[Union[List[List[int]], List[int]]] = None
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Dummy sampling params for compatibility
+    sampling_params: Optional[Union[List[Dict], Dict]] = None
+    # Dummy input embeds for compatibility
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True
+    # The modalities of the image data [image, multi-images, video]
+    modalities: Optional[List[str]] = None
+    # For cross-encoder requests
+    is_cross_encoder_request: bool = False
+
+    # For background responses (OpenAI responses API)
+    background: bool = False
+
+    def normalize_batch_and_arguments(self):
+        # at least one of text, input_ids, or image should be provided
+        if self.text is None and self.input_ids is None and self.image_data is None:
+            raise ValueError(
+                "At least one of text, input_ids, or image should be provided"
+            )
+
+        # text and input_ids cannot be provided at the same time
+        if self.text is not None and self.input_ids is not None:
+            raise ValueError("text and input_ids cannot be provided at the same time")
+
+        # Derive the batch size
+        self.batch_size = 0
+        self.is_single = True
+
+        # check the batch size of text
+        if self.text is not None:
+            if isinstance(self.text, list):
+                self.batch_size += len(self.text)
+                self.is_single = False
+            else:
+                self.batch_size += 1
+
+        # check the batch size of input_ids
+        if self.input_ids is not None:
+            if isinstance(self.input_ids[0], list):
+                self.batch_size += len(self.input_ids)
+                self.is_single = False
+            else:
+                self.batch_size += 1
+
+        # Fill in default arguments
+        if self.is_single:
+            if self.rid is None:
+                self.rid = uuid.uuid4().hex
+            if self.sampling_params is None:
+                self.sampling_params = {}
+            self.sampling_params["max_new_tokens"] = 0
+        else:
+            if self.rid is None:
+                self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
+            else:
+                assert isinstance(self.rid, list), "The rid should be a list."
+
+            if self.sampling_params is None:
+                self.sampling_params = [{}] * self.batch_size
+            elif isinstance(self.sampling_params, dict):
+                self.sampling_params = [self.sampling_params] * self.batch_size
+            for i in range(self.batch_size):
+                self.sampling_params[i]["max_new_tokens"] = 0
+
+    def regenerate_rid(self):
+        self.rid = uuid.uuid4().hex
+        return self.rid
+
+    def contains_mm_input(self) -> bool:
+        return (
+            has_valid_data(self.image_data)
+            or has_valid_data(self.video_data)
+            or has_valid_data(self.audio_data)
+        )
+
+    def __getitem__(self, i):
+        if self.is_cross_encoder_request:
+            return EmbeddingReqInput(
+                text=[self.text[i]] if self.text is not None else None,
+                sampling_params=self.sampling_params[i],
+                rid=self.rid[i],
+                is_cross_encoder_request=True,
+            )
+
+        return EmbeddingReqInput(
+            text=self.text[i] if self.text is not None else None,
+            input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            image_data=self.image_data[i] if self.image_data is not None else None,
+            audio_data=self.audio_data[i] if self.audio_data is not None else None,
+            video_data=self.video_data[i] if self.video_data is not None else None,
+            sampling_params=self.sampling_params[i],
+            rid=self.rid[i],
+        )
+
+
+@dataclass
+class TokenizedEmbeddingReqInput:
+    # The request id
+    rid: str
+    # The input text
+    input_text: str
+    # The input token ids
+    input_ids: List[int]
+    # The image inputs
+    image_inputs: dict
+    # The token type ids
+    token_type_ids: List[int]
+    # Dummy sampling params for compatibility
+    sampling_params: SamplingParams
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+    # For dp balance
+    dp_balance_id: int = -1
+
+
+@dataclass
+class BatchTokenizedEmbeddingReqInput:
+    # The batch of tokenized embedding requests
+    batch: List[TokenizedEmbeddingReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
+@dataclass
+class BatchTokenIDOut:
+    # The request id
+    rids: List[str]
+    # The finish reason
+    finished_reasons: List[BaseFinishReason]
+    # For incremental decoding
+    decoded_texts: List[str]
+    decode_ids: List[int]
+    read_offsets: List[int]
+    # Only used when `--skip-tokenizer-init` is on
+    output_ids: Optional[List[int]]
+    # Detokenization configs
+    skip_special_tokens: List[bool]
+    spaces_between_special_tokens: List[bool]
+    no_stop_trim: List[bool]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+    spec_verify_ct: List[int]
+
+    # Logprobs
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    input_top_logprobs_val: List[List]
+    input_top_logprobs_idx: List[List]
+    output_top_logprobs_val: List[List]
+    output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
+
+    # Hidden states
+    output_hidden_states: List[List[float]]
+
+    # The information of placeholder tokens (e.g., image token)
+    # idx is the index of the token in the prompt after expansion.
+    # val is the length of padded tokens after expansion.
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+
+@dataclass
+class BatchMultimodalDecodeReq:
+    decoded_ids: List[int]
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    read_offsets: List[int]
+    skip_special_tokens: List[bool]
+    spaces_between_special_tokens: List[bool]
+    image_resolutions: List[List[int]]
+    resize_image_resolutions: List[List[int]]
+
+    # The request id
+    rids: List[str]
+    finished_reasons: List[BaseFinishReason]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+    # Placeholder token info
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+    return_bytes: bool = False
+
+
+@dataclass
+class BatchStrOut:
+    # The request id
+    rids: List[str]
+    # The finish reason
+    finished_reasons: List[dict]
+    # The output decoded strings
+    output_strs: List[str]
+    # The token ids
+    output_ids: Optional[List[int]]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+    spec_verify_ct: List[int]
+
+    # Logprobs
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    input_top_logprobs_val: List[List]
+    input_top_logprobs_idx: List[List]
+    output_top_logprobs_val: List[List]
+    output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
+
+    # Hidden states
+    output_hidden_states: List[List[float]]
+
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+
+@dataclass
+class BatchMultimodalOut:
+    # The request id
+    rids: List[str]
+    # The finish reason
+    finished_reasons: List[dict]
+    decoded_ids: List[List[int]]
+    # The outputs
+    outputs: Union[List[str | bytes], List[List[Dict]]]
+
+    # probability values for input tokens and output tokens
+    input_token_logprobs_val: List[List[float]]
+    input_token_logprobs_idx: List[List[int]]
+    output_token_logprobs_val: List[List[float]]
+    output_token_logprobs_idx: List[List[int]]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+    return_bytes: List[bool]
+
+
+@dataclass
+class BatchEmbeddingOut:
+    # The request id
+    rids: List[str]
+    # The finish reason
+    finished_reasons: List[BaseFinishReason]
+    # The output embedding
+    embeddings: List[List[float]]
+    # Token counts
+    prompt_tokens: List[int]
+    cached_tokens: List[int]
+    # Placeholder token info
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+
+@dataclass
+class ClearHiCacheReqInput:
+    pass
+
+
+@dataclass
+class ClearHiCacheReqOutput:
+    success: bool
+
+
+@dataclass
+class FlushCacheReqInput:
+    pass
+
+
+@dataclass
+class FlushCacheReqOutput:
+    success: bool
+
+
+@dataclass
+class UpdateWeightFromDiskReqInput:
+    # The model path with the new weights
+    model_path: str
+    # The format to load the weights
+    load_format: Optional[str] = None
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+    # Whether to update weights asynchronously
+    is_async: bool = False
+    # Whether to empty torch cache
+    torch_empty_cache: bool = False
+    # Whether to keep the scheduler paused after weight update
+    keep_pause: bool = False
+
+
+@dataclass
+class UpdateWeightFromDiskReqOutput:
+    success: bool
+    message: str
+    # Number of paused requests during weight sync.
+    num_paused_requests: Optional[int] = 0
+
+
+@dataclass
+class UpdateWeightsFromDistributedReqInput:
+    names: List[str]
+    dtypes: List[str]
+    shapes: List[List[int]]
+    # The group name
+    group_name: str = "weight_update_group"
+    # Whether to flush the cache after updating weights
+    flush_cache: bool = True
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+
+
+@dataclass
+class UpdateWeightsFromDistributedReqOutput:
+    success: bool
+    message: str
+
+
+@dataclass
+class UpdateWeightsFromTensorReqInput:
+    """Update model weights from tensor input.
+
+    - Tensors are serialized for transmission
+    - Data is structured in JSON for easy transmission over HTTP
+    """
+
+    serialized_named_tensors: List[Union[str, bytes]]
+    # Optional format specification for loading
+    load_format: Optional[str] = None
+    # Whether to flush the cache after updating weights
+    flush_cache: bool = True
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+
+
+@dataclass
+class UpdateWeightsFromTensorReqOutput:
+    success: bool
+    message: str
+
+
+@dataclass
+class InitWeightsUpdateGroupReqInput:
+    # The master address
+    master_address: str
+    # The master port
+    master_port: int
+    # The rank offset
+    rank_offset: int
+    # The world size
+    world_size: int
+    # The group name
+    group_name: str = "weight_update_group"
+    # The backend
+    backend: str = "nccl"
+
+
+@dataclass
+class InitWeightsUpdateGroupReqOutput:
+    success: bool
+    message: str
+
+
+@dataclass
+class UpdateWeightVersionReqInput:
+    # The new weight version
+    new_version: str
+    # Whether to abort all running requests before updating
+    abort_all_requests: bool = True
+
+
+@dataclass
+class GetWeightsByNameReqInput:
+    name: str
+    truncate_size: int = 100
+
+
+@dataclass
+class GetWeightsByNameReqOutput:
+    parameter: list
+
+
+@dataclass
+class ReleaseMemoryOccupationReqInput:
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None
+
+
+@dataclass
+class ReleaseMemoryOccupationReqOutput:
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqInput:
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None
+
+
+@dataclass
+class ResumeMemoryOccupationReqOutput:
+    pass
+
+
+@dataclass
+class SlowDownReqInput:
+    forward_sleep_time: Optional[float]
+
+
+@dataclass
+class SlowDownReqOutput:
+    pass
+
+
+@dataclass
+class AbortReq:
+    # The request id
+    rid: str = ""
+    # Whether to abort all requests
+    abort_all: bool = False
+    # The finished reason data
+    finished_reason: Optional[Dict[str, Any]] = None
+    abort_reason: Optional[str] = None
+    # used in MultiTokenzierManager mode
+    rids: Optional[Union[List[str], str]] = None
+
+    def __post_init__(self):
+        self.rids = self.rid
+
+
+@dataclass
+class GetInternalStateReq:
+    pass
+
+
+@dataclass
+class GetInternalStateReqOutput:
+    internal_state: Dict[Any, Any]
+
+
+@dataclass
+class SetInternalStateReq:
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class SetInternalStateReqOutput:
+    updated: bool
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class ProfileReqInput:
+    # The output directory
+    output_dir: Optional[str] = None
+    # If set, it profile as many as this number of steps.
+    # If it is set, profiling is automatically stopped after this step, and
+    # the caller doesn't need to run stop_profile.
+    start_step: Optional[int] = None
+    num_steps: Optional[int] = None
+    activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
+    with_stack: Optional[bool] = None
+    record_shapes: Optional[bool] = None
+
+
+class ProfileReqType(Enum):
+    START_PROFILE = 1
+    STOP_PROFILE = 2
+
+
+@dataclass
+class ProfileReq:
+    type: ProfileReqType
+    output_dir: Optional[str] = None
+    start_step: Optional[int] = None
+    num_steps: Optional[int] = None
+    activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
+    with_stack: Optional[bool] = None
+    record_shapes: Optional[bool] = None
+    profile_id: Optional[str] = None
+
+
+@dataclass
+class ProfileReqOutput:
+    success: bool
+    message: str
+
+
+@dataclass
+class FreezeGCReq:
+    pass
+
+
+@dataclass
+class ConfigureLoggingReq:
+    log_requests: Optional[bool] = None
+    log_requests_level: Optional[int] = None
+    dump_requests_folder: Optional[str] = None
+    dump_requests_threshold: Optional[int] = None
+    crash_dump_folder: Optional[str] = None
+
+
+@dataclass
+class OpenSessionReqInput:
+    capacity_of_str_len: int
+    session_id: Optional[str] = None
+
+
+@dataclass
+class CloseSessionReqInput:
+    session_id: str
+
+
+@dataclass
+class OpenSessionReqOutput:
+    session_id: Optional[str]
+    success: bool
+
+
+@dataclass
+class HealthCheckOutput:
+    pass
+
+
+class ExpertDistributionReq(Enum):
+    START_RECORD = 1
+    STOP_RECORD = 2
+    DUMP_RECORD = 3
+
+
+@dataclass
+class ExpertDistributionReqOutput:
+    pass
+
+
+@dataclass
+class Function:
+    description: Optional[str] = None
+    name: Optional[str] = None
+    parameters: Optional[object] = None
+
+
+@dataclass
+class Tool:
+    function: Function
+    type: Optional[str] = "function"
+
+
+@dataclass
+class ParseFunctionCallReq:
+    text: str  # The text to parse.
+    tools: List[Tool] = field(
+        default_factory=list
+    )  # A list of available function tools (name, parameters, etc.).
+    tool_call_parser: Optional[str] = (
+        None  # Specify the parser type, e.g. 'llama3', 'qwen25', or 'mistral'. If not specified, tries all.
+    )
+
+
+@dataclass
+class SeparateReasoningReqInput:
+    text: str  # The text to parse.
+    reasoning_parser: str  # Specify the parser type, e.g., "deepseek-r1".
+
+
+@dataclass
+class VertexGenerateReqInput:
+    instances: List[dict]
+    parameters: Optional[dict] = None
+
+
+@dataclass
+class RpcReqInput:
+    method: str
+    parameters: Optional[Dict] = None
+
+
+@dataclass
+class RpcReqOutput:
+    success: bool
+    message: str
+
+
+@dataclass
+class LoadLoRAAdapterReqInput:
+    # The name of the lora module to newly loaded.
+    lora_name: str
+    # The path of loading.
+    lora_path: str
+    # Whether to pin the LoRA adapter in memory.
+    pinned: bool = False
+    # The unique identifier for the LoRA adapter, which automatically generated in the `TokenizerManager`.
+    lora_id: Optional[str] = None
+
+    def to_ref(self) -> LoRARef:
+        return LoRARef(
+            lora_id=self.lora_id,
+            lora_name=self.lora_name,
+            lora_path=self.lora_path,
+            pinned=self.pinned,
+        )
+
+
+@dataclass
+class UnloadLoRAAdapterReqInput:
+    # The name of lora module to unload.
+    lora_name: str
+    # The unique identifier for the LoRA adapter, which automatically generated in the `TokenizerManager`.
+    lora_id: Optional[str] = None
+
+    def to_ref(self) -> LoRARef:
+        return LoRARef(
+            lora_id=self.lora_id,
+            lora_name=self.lora_name,
+        )
+
+
+@dataclass
+class LoRAUpdateResult:
+    success: bool
+    error_message: Optional[str] = None
+    loaded_adapters: Optional[Dict[str, LoRARef]] = None
+
+
+LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult
+
+
+@dataclass
+class MultiTokenizerRegisterReq:
+    rids: Optional[Union[List[str], str]] = None
+    ipc_name: Optional[str] = None
+
+
+@dataclass
+class MultiTokenizerWrapper:
+    worker_id: int
+    obj: Optional[Any] = None
+
+
+class BlockReqType(Enum):
+    BLOCK = 1
+    UNBLOCK = 2
+
+
+@dataclass
+class BlockReqInput:
+    type: BlockReqType
diff --git a/python/sglang/srt/managers/mm_utils.py b/python/sglang/srt/managers/mm_utils.py
new file mode 100644
index 000000000..f495904d5
--- /dev/null
+++ b/python/sglang/srt/managers/mm_utils.py
@@ -0,0 +1,775 @@
+"""
+Multi-modality utils
+"""
+
+import hashlib
+import pickle
+from abc import abstractmethod
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+
+from sglang.srt.layers.multimodal import gpu_tensor_hash
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+    global_server_args_dict,
+)
+from sglang.srt.mem_cache.multimodal_cache import MultiModalCache
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import flatten_nested_list, is_npu, print_warning_once
+from sglang.utils import logger
+
+_is_npu = is_npu()
+
+# NOTE: Using the shared logger from sglang.utils instead of creating a module-specific logger
+# to ensure consistent logging behavior across the codebase. This prevents issues with log
+# propagation that can cause some log messages (like 'server is fired up') to not appear
+# in the console when multimodal support is enabled.
+
+# TODO(mick): nccl
+# cuda_ipc: for intranode tensor sharing
+TensorTransportMode = Literal["cuda_ipc", "auto", "default"]
+
+
+class TransportProxyTensor(torch.Tensor):
+    """
+    A convenient torch.Tensor subclass that carries extra metadata and supports
+    efficient inter-process communications
+    """
+
+    @staticmethod
+    def __new__(
+        cls,
+        data: torch.Tensor,
+        name: Optional[str] = None,
+        fields: Optional[Dict[str, Any]] = None,
+        transport_mode: TensorTransportMode = "default",
+        *args,
+        **kwargs,
+    ):
+
+        if not isinstance(data, torch.Tensor):
+            raise TypeError(
+                f"Input 'data' must be a torch.Tensor, but got {type(data)}"
+            )
+
+        instance = data.as_subclass(cls)
+
+        instance._metadata = {
+            "name": name,
+            "fields": fields if fields is not None else {},
+            "transport_mode": transport_mode,
+        }
+
+        return instance
+
+    def __getstate__(self):
+        """
+        Called during pickling. Implements the serialization logic.
+        """
+        # acquire all serialize metadata from _metadata
+        state = {
+            "metadata": self._metadata,
+            "tensor_data": None,
+            "ipc_extra": None,
+        }
+
+        transport_mode = self._metadata.get("transport_mode", "default")
+
+        if transport_mode == "cuda_ipc" and self.is_cuda:
+            try:
+                storage = self.untyped_storage()
+                handle = storage._share_cuda_()
+
+                state["ipc_extra"] = {
+                    "handle": handle,
+                    "shape": self.shape,
+                    "dtype": self.dtype,
+                    "stride": self.stride(),
+                    "device_index": self.device.index,
+                }
+                state["tensor_data"] = None
+            except Exception as e:
+                # Failed to get CUDA IPC handle (possibly tp). Falling back to default transport.
+                state["metadata"]["transport_mode"] = "default"
+                state["tensor_data"] = self.as_subclass(torch.Tensor)
+        else:
+            state["metadata"]["transport_mode"] = "default"
+            state["tensor_data"] = self.as_subclass(torch.Tensor)
+
+        return state
+
+    def __setstate__(self, state: Dict[str, Any]):
+        """
+        Called during unpickling. Implements the deserialization logic.
+        """
+        self._metadata = state["metadata"]
+
+        transport_mode = self._metadata.get("transport_mode", "default")
+
+        if transport_mode == "cuda_ipc" and state["ipc_extra"] is not None:
+            ipc_extra = state["ipc_extra"]
+            handle, shape, dtype, stride, source_device_index = (
+                ipc_extra["handle"],
+                ipc_extra["shape"],
+                ipc_extra["dtype"],
+                ipc_extra["stride"],
+                ipc_extra["device_index"],
+            )
+
+            try:
+                target_device = torch.device(f"cuda:{source_device_index}")
+                with torch.cuda.device(target_device):
+                    storage = torch.UntypedStorage._new_shared_cuda(*handle)
+                    reconstructed_tensor = torch.empty(
+                        0, dtype=dtype, device=target_device
+                    ).set_(storage, storage_offset=0, size=shape, stride=stride)
+                    self.set_(reconstructed_tensor)
+            except Exception as e:
+                print(f"Error: Failed to deserialize from CUDA IPC handle ({e}).")
+                raise e
+
+        elif state["tensor_data"] is not None:
+            self.set_(state["tensor_data"])
+        else:
+            raise pickle.UnpicklingError(
+                "Invalid state for TransportProxyTensor: no tensor data found."
+            )
+
+    @property
+    def name(self) -> Optional[str]:
+        return self._metadata.get("name")
+
+    @property
+    def fields(self) -> Dict[str, Any]:
+        return self._metadata.get("fields", {})
+
+    @property
+    def transport_mode(self) -> TensorTransportMode:
+        return self._metadata.get("transport_mode", "default")
+
+
+class MultiModalityDataPaddingPattern:
+    """
+    Data tokens (like image tokens) often need special handling during padding
+    to maintain model compatibility. This class provides the interface for
+    implementing different padding strategies for data tokens
+    """
+
+    @abstractmethod
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
+        """
+        Pad the input ids sequence containing data tokens, and replace them with pad_values
+        """
+        pass
+
+
+class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern):
+    """In this pattern, data tokens should be enclosed by special token pairs (e.g. <image>...</image>, data_token_pairs)
+
+    The padded value in a region enclosed by a token pair with be the same one, as the MultimodalDataItem's pad value
+
+    This strategy should be applied when data content is marked by start/end token pairs in the input sequence.
+    """
+
+    def __init__(
+        self,
+        data_token_pairs: Optional[List[Tuple[int, int]]],
+        data_start_token_ids: Optional[List[int]] = None,
+    ) -> None:
+        """
+
+        Args:
+            data_start_token_ids marks the start of a single multimodal data
+            See Minicpmo's slice_start_id for example
+        """
+        self.data_token_id_pairs = data_token_pairs
+        self.data_start_token_ids = data_start_token_ids or [
+            s for s, _e in data_token_pairs
+        ]
+
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
+        """
+        This function will replace the data-tokens in between with pad_values accordingly
+        """
+        pad_values = [item.pad_value for item in mm_inputs.mm_items]
+        data_token_pairs = self.data_token_id_pairs
+        mm_inputs.data_offsets = []
+        if data_token_pairs is None:
+            data_token_pairs = [mm_inputs.im_start_id, mm_inputs.im_end_id]
+        if data_token_pairs is None:
+            print_warning_once(
+                "No data_token_pairs provided, RadixAttention might be influenced."
+            )
+            return input_ids
+        start_token_ids = {s for s, _e in data_token_pairs}
+        end_tokens_ids = {e for _s, e in data_token_pairs}
+
+        padded_ids = []
+        last_idx = 0
+        data_idx = -1
+
+        start_indices = [i for i, x in enumerate(input_ids) if x in start_token_ids]
+        end_indices = [i for i, x in enumerate(input_ids) if x in end_tokens_ids]
+
+        if len(start_indices) != len(end_indices):
+            return input_ids
+
+        for start_idx, end_idx in zip(start_indices, end_indices):
+            padded_ids.extend(input_ids[last_idx : start_idx + 1])
+
+            if input_ids[start_idx] in self.data_start_token_ids:
+                data_idx += 1
+                mm_inputs.data_offsets += [start_idx]
+
+            if data_idx >= len(pad_values):
+                data_idx = len(pad_values) - 1
+
+            num_tokens = end_idx - start_idx - 1
+            pad_value = pad_values[data_idx]
+            padded_ids.extend([pad_value] * num_tokens)
+
+            last_idx = end_idx
+
+        padded_ids.extend(input_ids[last_idx:])
+
+        assert len(input_ids) == len(padded_ids), "Length validation fails"
+        return padded_ids
+
+
+class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPattern):
+    """In this pattern, data tokens should be represented as repetitions of a single token
+    e.g. <image><image>....<image>, or <audio><audio>...<audio>
+    """
+
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
+        """
+        Replaces multimodal tokens in input_ids with corresponding pad_values from mm_items.
+        Each modality (image, audio, video) is handled separately based on its token_id.
+        """
+        if not input_ids or not mm_inputs.mm_items:
+            return input_ids
+
+        input_ids_tensor = torch.as_tensor(input_ids)
+
+        # Create mapping of token_ids to pad_values for each modality
+        token_to_pad_mapping = {}
+
+        for item in mm_inputs.mm_items:
+            if item.is_image() and mm_inputs.im_token_id is not None:
+                token_to_pad_mapping[mm_inputs.im_token_id] = item.pad_value
+            elif item.is_audio() and mm_inputs.audio_token_id is not None:
+                token_to_pad_mapping[mm_inputs.audio_token_id] = item.pad_value
+            elif item.is_video() and mm_inputs.video_token_id is not None:
+                token_to_pad_mapping[mm_inputs.video_token_id] = item.pad_value
+            else:
+                raise ValueError(f"No multimodal token id provided for {item.modality}")
+
+        # Apply replacements for all tokens at once
+        for token_id, pad_value in token_to_pad_mapping.items():
+            input_ids_tensor[input_ids_tensor == token_id] = pad_value
+
+        ret_input_ids = input_ids_tensor.tolist()
+
+        return ret_input_ids
+
+
+embedding_cache: Optional[MultiModalCache] = None
+
+
+def init_embedding_cache(max_size: int = 0):
+    global embedding_cache
+    embedding_cache = MultiModalCache(max_size)
+
+
+def get_embedding_hash(embedding_items: List[MultimodalDataItem]) -> int:
+    hash_list = [item.hash for item in embedding_items]
+    return hash(tuple(hash_list))
+
+
+def get_embedding_chunk(
+    embedding: torch.Tensor,
+    extend_prefix_len: int,
+    extend_seq_len: int,
+    items_offset: List[Tuple[int, int]],
+) -> Tuple[torch.Tensor, int, int]:
+    """
+    Extract a chunk of embeddings based on the specified prefix length, sequence length, and offset ranges.
+
+    Args:
+        embedding: The full embedding tensor to extract a chunk from
+        extend_prefix_len: The starting position (prefix length) for extraction
+        extend_seq_len: The number of tokens to extract
+        items_offset: List of [start, end] offset ranges for multimodal items in the input sequence
+
+    Returns:
+        A tuple containing:
+        - The extracted embedding chunk as a tensor
+        - The start index used for extraction
+        - The end index used for extraction
+
+    Note:
+        If there's no overlap between the requested range and the offset ranges,
+        an empty tensor is returned with zeros for start and end indices.
+    """
+    start_index, end_index = 0, 0
+    extend_start_index = extend_prefix_len
+    extend_end_index = extend_prefix_len + extend_seq_len - 1
+
+    for start, end in items_offset:
+        if extend_start_index >= start and extend_start_index <= end:
+            start_index += extend_start_index - start
+        elif extend_start_index > end:
+            start_index += end - start + 1
+
+        if extend_end_index >= start and extend_end_index <= end:
+            end_index += extend_end_index - start + 1
+        elif extend_end_index > end:
+            end_index += end - start + 1
+    # some models' embedding is 3-dim, reshape it to 2-dim
+    embedding = embedding.reshape(-1, embedding.shape[-1])
+    embedding_chunk = embedding[start_index:end_index]
+    return embedding_chunk, start_index, end_index
+
+
+def _get_precomputed_embedding(
+    items: List[MultimodalDataItem],
+) -> Optional[torch.Tensor]:
+    """
+    If all items have precomputed_embeddings, return their concatenation.
+    If some but not all have precomputed_embeddings, raise NotImplementedError.
+    If none have precomputed_embeddings, return None.
+    """
+    precomputed_embeddings = [item.precomputed_embeddings for item in items]
+    if any(feature is not None for feature in precomputed_embeddings):
+        if not all(feature is not None for feature in precomputed_embeddings):
+            raise NotImplementedError(
+                "MM inputs where only some items are precomputed."
+            )
+        result = torch.concat(precomputed_embeddings)
+        # some models embedding is 3-dim, reshape it to 2-dim (similar to get_embedding_chunk)
+        result = result.reshape(-1, result.shape[-1])
+        return result
+    return None
+
+
+def _get_chunked_prefill_embedding(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Optional[torch.Tensor]:
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+    embedding_list = []
+    # FIXME(Xinyuan): temporary workaround for eagle3, which may have len(items_size) > len(prefix_length)
+    max_iterations = min(len(items_size) - 1, len(prefix_length))
+    for i in range(max_iterations):
+        if items_size[i] == items_size[i + 1]:
+            continue
+        embedding_items_per_req = embedding_items[items_size[i] : items_size[i + 1]]
+        items_offset = items_offset_list[i]
+        assert items_offset is not None, items_offset
+        embedding_items_hash = get_embedding_hash(embedding_items_per_req)
+        # if all items has been prefixed, we do not need to calculate embedding
+        if all([offset_end < prefix_length[i] for _, offset_end in items_offset]):
+            continue
+        embedding_per_req = embedding_cache.get(embedding_items_hash)
+        if embedding_per_req is None:
+            embedding_per_req = data_embedding_func(embedding_items_per_req)
+            if not embedding_cache.put(embedding_items_hash, embedding_per_req):
+                print_warning_once(
+                    "Multimodal embedding cache is full. This typically occurs when a single "
+                    "embedding exceeds the cache size limit. Consider increasing the "
+                    "`SGLANG_VLM_CACHE_SIZE_MB` environment variable or reducing the input "
+                    "embedding size."
+                )
+
+        embedding_per_req_chunk, _, _ = get_embedding_chunk(
+            embedding=embedding_per_req,
+            extend_prefix_len=prefix_length[i],
+            extend_seq_len=extend_length[i] if i < len(extend_length) else 0,
+            items_offset=items_offset,
+        )
+        embedding_list.append(embedding_per_req_chunk)
+    if len(embedding_list) == 0:
+        return None
+    return torch.concat(embedding_list, dim=0)
+
+
+def _get_multimodal_mask(
+    input_ids: torch.Tensor, placeholder_tensor: torch.Tensor
+) -> torch.Tensor:
+    return torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)
+
+
+def _adjust_embedding_length(
+    embedding: torch.Tensor,
+    mask: torch.Tensor,
+    logger,
+) -> torch.Tensor:
+    num_mm_tokens_in_embedding = embedding.shape[0]
+    num_mm_tokens_in_input_ids = mask.sum().item()
+    if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
+        logger.warning(
+            f"Number of tokens in multimodal embedding does not match those in the input text. "
+            f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
+            f"tokens from multimodal embeddings."
+        )
+        if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
+            chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
+            if chunked_prefill_size != -1:
+                logger.warning(
+                    "You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked prefill"
+                )
+            # extract from the end: this is a compromise
+            if embedding.dim() == 2:
+                embedding = embedding[-num_mm_tokens_in_input_ids:, :]
+            else:
+                num_multimodal = num_mm_tokens_in_input_ids // embedding.shape[0]
+                embedding = embedding[-num_multimodal:, :]
+        else:
+            raise RuntimeError(
+                f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
+            )
+    return embedding
+
+
+def get_embedding_and_mask(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    placeholder_tensor: torch.Tensor,
+    input_ids: torch.Tensor,
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
+
+    Args:
+        data_embedding_func: Function that generates embeddings for multimodal items
+        embedding_items: List of multimodal items to embed
+        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
+        input_ids: The input token IDs tensor
+        items_size: Cumulative sizes of multimodal items per request
+        prefix_length: Prefix lengths for each request
+        extend_length: Sequence lengths for each request
+        items_offset_list: List of offset ranges for multimodal items in each request
+
+    Returns:
+        A tuple containing:
+        - The generated embeddings tensor
+        - A boolean mask tensor indicating where these embeddings should be placed
+    """
+    # 1. Get embedding
+    embedding = _get_precomputed_embedding(embedding_items)
+    if embedding is None:
+        embedding = _get_chunked_prefill_embedding(
+            data_embedding_func,
+            embedding_items,
+            items_size,
+            prefix_length,
+            extend_length,
+            items_offset_list,
+        )
+        if embedding is None:
+            return None, None
+    # 2. Get mask
+    if _is_npu:
+        torch.npu.current_stream().synchronize()
+    special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
+    # 3. Adjust embedding length if needed
+    embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
+    return embedding, special_multimodal_mask
+
+
+def embed_mm_inputs(
+    mm_inputs_list: List[MultimodalInputs],
+    extend_prefix_lens: List[int],
+    extend_seq_lens: List[int],
+    input_ids: torch.Tensor,
+    input_embedding: nn.Embedding,
+    multimodal_model: nn.Module = None,
+    data_embedding_func_mapping: Dict[
+        Modality, Callable[[List[MultimodalDataItem]], torch.Tensor]
+    ] = None,
+    placeholder_tokens: dict[Modality, List[int]] = None,
+) -> Optional[torch.Tensor]:
+    """
+    Embed multimodal inputs and integrate them with text token embeddings.
+
+    Args:
+        mm_inputs_list: List of multimodal inputs to process
+        extend_prefix_lens: Prefix lengths for each request
+        extend_seq_lens: Sequence lengths for each request
+        input_ids: Input token IDs tensor
+        input_embedding: Embedding layer for text tokens
+        placeholder_tokens: Token IDs for multimodal placeholders (uses pad_values if None)
+
+    Returns:
+        Combined embedding tensor with multimodal content integrated
+    """
+
+    if mm_inputs_list is None:
+        return None
+
+    # 1. Calculate the multimodal data which exists in input_ids, with the help of pad_values
+    # we assume that multimodal data are represented with its pad_values in input_ids
+    item_flatten_list = []
+    for mm_inputs in mm_inputs_list:
+        item_flatten_list += [item for item in mm_inputs.mm_items if item is not None]
+
+    embeddings, masks = [], []
+    # 2. Get multimodal embedding separately
+    # Try get mm embedding if any
+    for modality in Modality.all():
+        items = [
+            item for item in item_flatten_list if item.is_modality(modality=modality)
+        ]
+        embedder = (
+            None
+            if data_embedding_func_mapping is None
+            else data_embedding_func_mapping.get(modality, None)
+        )
+        if embedder is None:
+            # "image", "video", etc
+            modality_id = modality.name.lower()
+            embedder = getattr(multimodal_model, f"get_{modality_id}_feature", None)
+        if len(items) != 0 and embedder is not None:
+            placeholder_tensor = torch.as_tensor(
+                [item.pad_value for item in items],
+                device=input_ids.device,
+            )
+            # calculate per request items length offset
+            items_size = torch.zeros(len(mm_inputs_list) + 1, dtype=int)
+            items_offsets = []
+            for i, mm_inputs in enumerate(mm_inputs_list):
+                mm_items = [
+                    item
+                    for item in mm_inputs.mm_items
+                    if item.is_modality(modality=modality)
+                ]
+                items_size[i + 1] = len(mm_items)
+                items_offsets.append(
+                    flatten_nested_list([item.offsets for item in mm_items])
+                )
+            items_size = torch.cumsum(items_size, dim=0).tolist()
+
+            embedding, mask = get_embedding_and_mask(
+                data_embedding_func=embedder,
+                embedding_items=items,
+                placeholder_tensor=placeholder_tensor,
+                input_ids=input_ids,
+                items_size=items_size,
+                prefix_length=extend_prefix_lens,
+                extend_length=extend_seq_lens,
+                items_offset_list=items_offsets,
+            )
+            embeddings += [embedding]
+            masks += [mask]
+
+    # 3. Get input embeddings
+    vocab_size = input_embedding.num_embeddings
+    # Important: clamp after getting original multimodal regions
+    # Clamp input ids. This is because the input_ids for the multimodal tokens are
+    # filled with the hash values of the multimodal for the prefix matching in the radix attention.
+    # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+    input_ids.clamp_(min=0, max=vocab_size - 1)
+    inputs_embeds = input_embedding(input_ids)
+
+    # 4. scatter embeddings into input embedding
+    for embedding, mask in zip(embeddings, masks):
+        if embedding is None or mask is None:
+            continue
+        # in-place update
+        indices = torch.where(mask.squeeze(dim=-1))[0]
+        inputs_embeds[indices] = embedding.to(inputs_embeds.device, inputs_embeds.dtype)
+    return inputs_embeds
+
+
+def general_mm_embed_routine(
+    input_ids: torch.Tensor,
+    forward_batch: ForwardBatch,
+    language_model: nn.Module,
+    multimodal_model: Optional[nn.Module] = None,
+    data_embedding_funcs: Dict[
+        Modality, Callable[[List[MultimodalDataItem]], torch.Tensor]
+    ] = None,
+    placeholder_tokens: Optional[dict[Modality, List[int]]] = None,
+    **kwargs,
+) -> torch.Tensor:
+    """
+    Process multimodal inputs and forward through language model.
+
+    Args:
+        input_ids: Input token IDs tensor
+        forward_batch: Batch information for model forward pass
+        language_model: Base language model to use
+        data_embedding_funcs: A dictionary mapping from modality type to the corresponding embedding function.
+        placeholder_tokens: Token IDs for multimodal placeholders
+        **kwargs: Additional arguments passed to language model
+
+    Returns:
+        Hidden states from language model forward pass
+    """
+    assert hasattr(language_model, "get_input_embeddings")
+    embed_tokens = language_model.get_input_embeddings()
+    if (
+        not forward_batch.forward_mode.is_decode()
+        and not forward_batch.forward_mode.is_target_verify()
+        and forward_batch.contains_mm_inputs()
+    ):
+        mm_inputs_list = [
+            mm_input for mm_input in forward_batch.mm_inputs if mm_input is not None
+        ]
+        extend_prefix_lens = [
+            prefix_len
+            for i, prefix_len in enumerate(forward_batch.extend_prefix_lens_cpu)
+            if forward_batch.mm_inputs[i] is not None
+        ]
+        extend_seq_lens = [
+            seq_len
+            for i, seq_len in enumerate(forward_batch.extend_seq_lens_cpu)
+            if forward_batch.mm_inputs[i] is not None
+        ]
+        inputs_embeds = embed_mm_inputs(
+            mm_inputs_list=mm_inputs_list,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_seq_lens=extend_seq_lens,
+            input_ids=input_ids,
+            input_embedding=embed_tokens,
+            multimodal_model=multimodal_model,
+            data_embedding_func_mapping=data_embedding_funcs,
+            placeholder_tokens=placeholder_tokens,
+        )
+        # once used, mm_inputs is useless, considering chunked-prefill is disabled for multimodal models
+        # just being defensive here
+        forward_batch.mm_inputs = None
+    else:
+        inputs_embeds = embed_tokens(input_ids)
+
+    hidden_states = language_model(
+        input_ids=None,
+        forward_batch=forward_batch,
+        input_embeds=inputs_embeds,
+        **kwargs,
+    )
+    return hidden_states
+
+
+def get_multimodal_data_bounds(
+    input_ids: torch.Tensor, pad_values: List[int], token_pairs: List[Tuple[int, int]]
+) -> torch.Tensor:
+    """
+    Returns a tensor indicating the bounds of multimodal data (images, video, audio, etc.)
+
+    Returns:
+        [bounds_count, 2]
+    """
+    # All the multimodal data in the batch should share the same special bound token ids.
+    start_tokens = {s for s, _e in token_pairs}
+    end_tokens = {e for _s, e in token_pairs}
+
+    assert all(isinstance(t, int) for t in start_tokens)
+    assert all(isinstance(t, int) for t in end_tokens)
+
+    start_cond = torch.isin(
+        input_ids, torch.as_tensor(start_tokens, device=input_ids.device)
+    )
+    end_cond = torch.isin(
+        input_ids, torch.as_tensor(end_tokens, device=input_ids.device)
+    )
+
+    (data_start_tokens,) = torch.where(start_cond)
+    (data_end_tokens,) = torch.where(end_cond)
+
+    data_start_tokens_cpu = data_start_tokens.cpu().tolist()
+    data_end_tokens_cpu = data_end_tokens.cpu().tolist()
+
+    # the im_start_id sometimes can be cached as prefix, but it is needed for the embedding of the multimodal data
+    if len(data_start_tokens_cpu) != len(data_end_tokens_cpu):
+        if (
+            len(data_start_tokens_cpu) + 1 == len(data_end_tokens_cpu)
+            and input_ids[0].item() in pad_values
+            and data_end_tokens_cpu
+            and data_start_tokens_cpu
+            and data_end_tokens_cpu[0] < data_start_tokens_cpu[0]
+        ):
+            data_start_tokens_cpu.insert(0, 0)
+    valid_mm_data_nums = min(len(data_start_tokens_cpu), len(data_end_tokens_cpu))
+
+    if valid_mm_data_nums == 0:
+        return torch.zeros((0, 2), device=input_ids.device)
+
+    # Filter out pairs where start_token >= end_token
+    valid_pairs = []
+    for i in range(valid_mm_data_nums):
+        start_token = data_start_tokens_cpu[i]
+        end_token = data_end_tokens_cpu[i]
+        if start_token < end_token:
+            valid_pairs.append((start_token + 1, end_token - 1))
+
+    if not valid_pairs:
+        return torch.zeros((0, 2), device=input_ids.device)
+
+    # Convert valid pairs to tensor
+    valid_pairs_tensor = torch.as_tensor(valid_pairs, device=input_ids.device)
+    return valid_pairs_tensor
+
+
+def data_hash(data) -> int:
+    hash_bytes = hashlib.sha256(data).digest()[:8]
+    return int.from_bytes(hash_bytes, byteorder="big", signed=False)
+
+
+def tensor_hash(tensor_list) -> int:
+    """
+    hash a tensor or a tensor list
+    """
+    tensor = tensor_list
+    if isinstance(tensor_list, list):
+        tensor_list = flatten_nested_list(tensor_list)
+        tensor_list = [
+            x.flatten() if isinstance(x, torch.Tensor) else x for x in tensor_list
+        ]
+        tensor = torch.concat(tensor_list)
+    if tensor.is_cuda:
+        return gpu_tensor_hash(tensor.cuda())
+    tensor = tensor.detach().contiguous()
+
+    if tensor.dtype == torch.bfloat16:
+        # memoryview() doesn't support PyTorch's BFloat16 dtype
+        tensor = tensor.float()
+
+    assert isinstance(tensor, torch.Tensor)
+    tensor_cpu = tensor.cpu()
+
+    mv = memoryview(tensor_cpu.numpy())
+    return data_hash(mv.tobytes())
+
+
+def hash_feature(f):
+    if isinstance(f, list):
+        if isinstance(f[0], torch.Tensor):
+            return tensor_hash(f)
+        return data_hash(tuple(flatten_nested_list(f)))
+    elif isinstance(f, np.ndarray):
+        arr = np.ascontiguousarray(f)
+        arr_bytes = arr.tobytes()
+        return data_hash(arr_bytes)
+    elif isinstance(f, torch.Tensor):
+        return tensor_hash([f])
+    return data_hash(f)
diff --git a/python/sglang/srt/managers/multi_tokenizer_mixin.py b/python/sglang/srt/managers/multi_tokenizer_mixin.py
new file mode 100644
index 000000000..0aadfba2c
--- /dev/null
+++ b/python/sglang/srt/managers/multi_tokenizer_mixin.py
@@ -0,0 +1,579 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MultiTokenizerMixin is a class that provides nesscary methods for MultiTokenizerManager and DetokenizerManager."""
+import asyncio
+import logging
+import multiprocessing as multiprocessing
+import os
+import pickle
+import sys
+import threading
+from functools import partialmethod
+from multiprocessing import shared_memory
+from typing import Any, Dict
+
+import setproctitle
+import zmq
+import zmq.asyncio
+
+from sglang.srt.disaggregation.utils import DisaggregationMode, TransferBackend
+from sglang.srt.managers.disagg_service import start_disagg_service
+from sglang.srt.managers.io_struct import (
+    BatchEmbeddingOut,
+    BatchMultimodalOut,
+    BatchStrOut,
+    BatchTokenIDOut,
+    MultiTokenizerRegisterReq,
+    MultiTokenizerWrapper,
+)
+from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import get_zmq_socket, kill_process_tree
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class SocketMapping:
+    def __init__(self):
+        self._zmq_context = zmq.Context()
+        self._mapping: Dict[str, zmq.Socket] = {}
+
+    def clear_all_sockets(self):
+        for socket in self._mapping.values():
+            socket.close()
+        self._mapping.clear()
+
+    def register_ipc_mapping(
+        self, recv_obj: MultiTokenizerRegisterReq, worker_id: str, is_tokenizer: bool
+    ):
+        type_str = "tokenizer" if is_tokenizer else "detokenizer"
+        if worker_id in self._mapping:
+            logger.warning(
+                f"{type_str} already registered with worker {worker_id}, skipping..."
+            )
+            return
+        logger.info(
+            f"{type_str} not registered with worker {worker_id}, registering..."
+        )
+        socket = get_zmq_socket(self._zmq_context, zmq.PUSH, recv_obj.ipc_name, False)
+        self._mapping[worker_id] = socket
+        self._mapping[worker_id].send_pyobj(recv_obj)
+
+    def send_output(self, worker_id: str, output: Any):
+        if worker_id not in self._mapping:
+            logger.error(
+                f"worker ID {worker_id} not registered. Check if the server Process is alive"
+            )
+            return
+        self._mapping[worker_id].send_pyobj(output)
+
+
+def _handle_output_by_index(output, i):
+    """NOTE: A maintainable method is better here."""
+    if isinstance(output, BatchTokenIDOut):
+        new_output = BatchTokenIDOut(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            decoded_texts=(
+                [output.decoded_texts[i]] if len(output.decoded_texts) > i else None
+            ),
+            decode_ids=([output.decode_ids[i]] if len(output.decode_ids) > i else None),
+            read_offsets=(
+                [output.read_offsets[i]] if len(output.read_offsets) > i else None
+            ),
+            output_ids=(
+                [output.output_ids[i]]
+                if output.output_ids and len(output.output_ids) > i
+                else None
+            ),
+            skip_special_tokens=(
+                [output.skip_special_tokens[i]]
+                if len(output.skip_special_tokens) > i
+                else None
+            ),
+            spaces_between_special_tokens=(
+                [output.spaces_between_special_tokens[i]]
+                if len(output.spaces_between_special_tokens) > i
+                else None
+            ),
+            no_stop_trim=(
+                [output.no_stop_trim[i]] if len(output.no_stop_trim) > i else None
+            ),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            completion_tokens=(
+                [output.completion_tokens[i]]
+                if len(output.completion_tokens) > i
+                else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            spec_verify_ct=(
+                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
+            ),
+            input_token_logprobs_val=(
+                [output.input_token_logprobs_val[i]]
+                if output.input_token_logprobs_val
+                else None
+            ),
+            input_token_logprobs_idx=(
+                [output.input_token_logprobs_idx[i]]
+                if output.input_token_logprobs_idx
+                else None
+            ),
+            output_token_logprobs_val=(
+                [output.output_token_logprobs_val[i]]
+                if output.output_token_logprobs_val
+                else None
+            ),
+            output_token_logprobs_idx=(
+                [output.output_token_logprobs_idx[i]]
+                if output.output_token_logprobs_idx
+                else None
+            ),
+            input_top_logprobs_val=(
+                [output.input_top_logprobs_val[i]]
+                if output.input_top_logprobs_val
+                else None
+            ),
+            input_top_logprobs_idx=(
+                [output.input_top_logprobs_idx[i]]
+                if output.input_top_logprobs_idx
+                else None
+            ),
+            output_top_logprobs_val=(
+                [output.output_top_logprobs_val[i]]
+                if output.output_top_logprobs_val
+                else None
+            ),
+            output_top_logprobs_idx=(
+                [output.output_top_logprobs_idx[i]]
+                if output.output_top_logprobs_idx
+                else None
+            ),
+            input_token_ids_logprobs_val=(
+                [output.input_token_ids_logprobs_val[i]]
+                if output.input_token_ids_logprobs_val
+                else None
+            ),
+            input_token_ids_logprobs_idx=(
+                [output.input_token_ids_logprobs_idx[i]]
+                if output.input_token_ids_logprobs_idx
+                else None
+            ),
+            output_token_ids_logprobs_val=(
+                [output.output_token_ids_logprobs_val[i]]
+                if output.output_token_ids_logprobs_val
+                else None
+            ),
+            output_token_ids_logprobs_idx=(
+                [output.output_token_ids_logprobs_idx[i]]
+                if output.output_token_ids_logprobs_idx
+                else None
+            ),
+            output_hidden_states=(
+                [output.output_hidden_states[i]]
+                if output.output_hidden_states
+                else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    elif isinstance(output, BatchEmbeddingOut):
+        new_output = BatchEmbeddingOut(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            embeddings=([output.embeddings[i]] if len(output.embeddings) > i else None),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    elif isinstance(output, BatchStrOut):
+        new_output = BatchStrOut(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            output_strs=(
+                [output.output_strs[i]] if len(output.output_strs) > i else None
+            ),
+            output_ids=(
+                [output.output_ids[i]]
+                if output.output_ids and len(output.output_ids) > i
+                else None
+            ),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            completion_tokens=(
+                [output.completion_tokens[i]]
+                if len(output.completion_tokens) > i
+                else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            spec_verify_ct=(
+                [output.spec_verify_ct[i]] if len(output.spec_verify_ct) > i else None
+            ),
+            input_token_logprobs_val=(
+                [output.input_token_logprobs_val[i]]
+                if output.input_token_logprobs_val
+                else None
+            ),
+            input_token_logprobs_idx=(
+                [output.input_token_logprobs_idx[i]]
+                if output.input_token_logprobs_idx
+                else None
+            ),
+            output_token_logprobs_val=(
+                [output.output_token_logprobs_val[i]]
+                if output.output_token_logprobs_val
+                else None
+            ),
+            output_token_logprobs_idx=(
+                [output.output_token_logprobs_idx[i]]
+                if output.output_token_logprobs_idx
+                else None
+            ),
+            input_top_logprobs_val=(
+                [output.input_top_logprobs_val[i]]
+                if output.input_top_logprobs_val
+                else None
+            ),
+            input_top_logprobs_idx=(
+                [output.input_top_logprobs_idx[i]]
+                if output.input_top_logprobs_idx
+                else None
+            ),
+            output_top_logprobs_val=(
+                [output.output_top_logprobs_val[i]]
+                if output.output_top_logprobs_val
+                else None
+            ),
+            output_top_logprobs_idx=(
+                [output.output_top_logprobs_idx[i]]
+                if output.output_top_logprobs_idx
+                else None
+            ),
+            input_token_ids_logprobs_val=(
+                [output.input_token_ids_logprobs_val[i]]
+                if output.input_token_ids_logprobs_val
+                else None
+            ),
+            input_token_ids_logprobs_idx=(
+                [output.input_token_ids_logprobs_idx[i]]
+                if output.input_token_ids_logprobs_idx
+                else None
+            ),
+            output_token_ids_logprobs_val=(
+                [output.output_token_ids_logprobs_val[i]]
+                if output.output_token_ids_logprobs_val
+                else None
+            ),
+            output_token_ids_logprobs_idx=(
+                [output.output_token_ids_logprobs_idx[i]]
+                if output.output_token_ids_logprobs_idx
+                else None
+            ),
+            output_hidden_states=(
+                [output.output_hidden_states[i]]
+                if output.output_hidden_states
+                else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    elif isinstance(output, BatchMultimodalOut):
+        new_output = BatchMultimodalOut(
+            rids=[output.rids[i]],
+            finished_reasons=(
+                [output.finished_reasons[i]]
+                if len(output.finished_reasons) > i
+                else None
+            ),
+            outputs=([output.outputs[i]] if len(output.outputs) > i else None),
+            prompt_tokens=(
+                [output.prompt_tokens[i]] if len(output.prompt_tokens) > i else None
+            ),
+            completion_tokens=(
+                [output.completion_tokens[i]]
+                if len(output.completion_tokens) > i
+                else None
+            ),
+            cached_tokens=(
+                [output.cached_tokens[i]] if len(output.cached_tokens) > i else None
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    else:
+        new_output = output
+    return new_output
+
+
+class MultiHttpWorkerDetokenizerMixin:
+    """Mixin class for MultiTokenizerManager and DetokenizerManager"""
+
+    def get_worker_ids_from_req_rids(self, rids):
+        if isinstance(rids, list):
+            worker_ids = [int(rid.split("_")[0]) for rid in rids]
+        elif isinstance(rids, str):
+            worker_ids = [int(rids.split("_")[0])]
+        else:
+            worker_ids = []
+        return worker_ids
+
+    def multi_http_worker_event_loop(self):
+        """The event loop that handles requests, for multi multi-http-worker mode"""
+        self.socket_mapping = SocketMapping()
+        while True:
+            recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            if output is None:
+                continue
+            # Extract worker_id from rid
+            if isinstance(recv_obj.rids, list):
+                worker_ids = self.get_worker_ids_from_req_rids(recv_obj.rids)
+            else:
+                raise RuntimeError(
+                    f"for tokenizer_worker_num > 1, recv_obj.rids must be a list"
+                )
+
+            # Send data using the corresponding socket
+            for i, worker_id in enumerate(worker_ids):
+                if isinstance(recv_obj, MultiTokenizerRegisterReq):
+                    self.socket_mapping.register_ipc_mapping(
+                        recv_obj, worker_id, is_tokenizer=False
+                    )
+                else:
+                    new_output = _handle_output_by_index(output, i)
+                    self.socket_mapping.send_output(worker_id, new_output)
+
+
+class MultiTokenizerRouter:
+    """A router to receive requests from MultiTokenizerManager"""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        self.server_args = server_args
+        context = zmq.asyncio.Context(3)
+        self.recv_from_detokenizer = get_zmq_socket(
+            context, zmq.PULL, port_args.tokenizer_ipc_name, True
+        )
+        self.send_to_scheduler = get_zmq_socket(
+            context, zmq.PUSH, port_args.scheduler_input_ipc_name, True
+        )
+        self.receive_from_worker = get_zmq_socket(
+            context, zmq.PULL, port_args.tokenizer_worker_ipc_name, True
+        )
+        self._loop = asyncio.new_event_loop()
+        self._thread = threading.Thread(target=self._run_loop, daemon=True)
+        self._thread.start()
+        self._task = asyncio.run_coroutine_threadsafe(
+            self.router_worker_obj(), self._loop
+        )
+        # Start handle_loop simultaneously
+        self._handle_task = asyncio.run_coroutine_threadsafe(
+            print_exception_wrapper(self.handle_loop), self._loop
+        )
+        self.disaggregation_bootstrap_server = start_disagg_service(self.server_args)
+
+    def _run_loop(self):
+        self._loop.run_forever()
+
+    async def router_worker_obj(self):
+        while True:
+            recv_obj = await self.receive_from_worker.recv_pyobj()
+            await self.send_to_scheduler.send_pyobj(recv_obj)
+
+    async def handle_loop(self):
+        # special reqs will recv from scheduler, need to route to right worker
+        self.socket_mapping = SocketMapping()
+        while True:
+            recv_obj = await self.recv_from_detokenizer.recv_pyobj()
+            await self._distribute_result_to_workers(recv_obj)
+
+    async def _distribute_result_to_workers(self, recv_obj):
+        """Distribute result to corresponding workers based on rid"""
+        if isinstance(recv_obj, MultiTokenizerWrapper):
+            worker_ids = [recv_obj.worker_id]
+            recv_obj = recv_obj.obj
+        else:
+            worker_ids = self.get_worker_ids_from_req_rids(recv_obj.rids)
+
+        if len(worker_ids) == 0:
+            logger.error(f"Cannot find worker_id from rids {recv_obj.rids}")
+            return
+
+        # Distribute result to each worker
+        for i, worker_id in enumerate(worker_ids):
+            if isinstance(recv_obj, MultiTokenizerRegisterReq):
+                self.socket_mapping.register_ipc_mapping(
+                    recv_obj, worker_id, is_tokenizer=True
+                )
+            else:
+                new_recv_obj = _handle_output_by_index(recv_obj, i)
+                self.socket_mapping.send_output(worker_id, new_recv_obj)
+
+
+class MultiTokenizerManager(TokenizerManager):
+    """Multi Process Tokenizer Manager that tokenizes the text."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        setproctitle.setproctitle(f"sglang::tokenizer_worker:{os.getpid()}")
+        # prevent init prefill bootstrapserver again
+        disaggregation_mode = server_args.disaggregation_mode
+        server_args.disaggregation_mode = "null"
+        super().__init__(server_args, port_args)
+
+        self.worker_id = os.getpid()
+        self.tokenizer_ipc_name = port_args.tokenizer_ipc_name
+
+        # For PD disaggregtion
+        self.server_args.disaggregation_mode = disaggregation_mode
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
+        self.disaggregation_transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
+        # Communicator
+        self.register_multi_tokenizer_communicator = _Communicator(
+            self.send_to_scheduler, 2
+        )
+        self._result_dispatcher._mapping.append(
+            (
+                MultiTokenizerRegisterReq,
+                self.register_multi_tokenizer_communicator.handle_recv,
+            )
+        )
+
+    async def register_to_main_tokenizer_manager(self):
+        """Register this worker to the main TokenizerManager"""
+        # create a handle loop to receive messages from the main TokenizerManager
+        self.auto_create_handle_loop()
+        req = MultiTokenizerRegisterReq(rids=[f"{self.worker_id}_register"])
+        req.ipc_name = self.tokenizer_ipc_name
+        _Communicator.enable_multi_tokenizer = True
+        await self.register_multi_tokenizer_communicator(req)
+
+
+async def print_exception_wrapper(func):
+    """
+    Sometimes an asyncio function does not print exception.
+    We do another wrapper to handle the exception.
+    """
+    try:
+        await func()
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"MultiTokenizerRouter hit an exception: {traceback}")
+        if hasattr(func, "__self__") and isinstance(
+            func.__self__, MultiTokenizerRouter
+        ):
+            func.__self__.dump_requests_before_crash()
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(1)
+
+
+def get_main_process_id() -> int:
+    """Get the main process ID"""
+    return multiprocessing.current_process()._parent_pid
+
+
+def write_to_shared_memory(obj, name: str) -> shared_memory.SharedMemory:
+    """Write data to shared memory"""
+    serialized = pickle.dumps(obj)
+    size = len(serialized)
+    try:
+        # Try to open existing shared memory
+        shm = shared_memory.SharedMemory(name=name)
+        # If size is insufficient, close and recreate
+        if shm.size < size:
+            shm.close()
+            shm.unlink()
+            shm = shared_memory.SharedMemory(create=True, size=size, name=name)
+    except FileNotFoundError:
+        # If not present, create new shared memory
+        shm = shared_memory.SharedMemory(create=True, size=size, name=name)
+
+    shm.buf[:size] = serialized
+    return shm
+
+
+def read_from_shared_memory(name: str) -> Any:
+    """Read data from shared memory"""
+    try:
+        shm = shared_memory.SharedMemory(name=name)
+        data = pickle.loads(bytes(shm.buf))
+        shm.close()
+        return data
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Shared memory {name} not found")
+
+
+def write_data_for_multi_tokenizer(
+    port_args: PortArgs, server_args: ServerArgs, scheduler_info: Dict
+):
+    """Write args information to share memory for multi-tokenizer"""
+    # get main process ID
+    main_pid = get_main_process_id()
+    current_pid = os.getpid()
+    logger.info(f"main process ID: {main_pid}, current process ID: {current_pid}")
+    args = (port_args, server_args, scheduler_info)
+    args_shm = write_to_shared_memory(args, f"multi_tokenizer_args_{current_pid}")
+    args_shm.close()
+
+    return args_shm
+
+
+def monkey_patch_uvicorn_multiprocessing(timeout: float = 10):
+    """Monkey patch uvicorn multiprocessing is_alive timeout"""
+    # from default 5s -> 10s
+    try:
+        from uvicorn.supervisors.multiprocess import Process
+
+        Process.is_alive = partialmethod(Process.is_alive, timeout=timeout)
+
+    except ImportError:
+        logger.warning(
+            "uvicorn.supervisors.multiprocess not found, skipping monkey patch"
+        )
diff --git a/python/sglang/srt/managers/multimodal_processor.py b/python/sglang/srt/managers/multimodal_processor.py
new file mode 100644
index 000000000..bc060a5b3
--- /dev/null
+++ b/python/sglang/srt/managers/multimodal_processor.py
@@ -0,0 +1,49 @@
+# TODO: also move pad_input_ids into this module
+import importlib
+import inspect
+import logging
+import pkgutil
+
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+PROCESSOR_MAPPING = {}
+
+
+def import_processors():
+    package_name = "sglang.srt.multimodal.processors"
+    package = importlib.import_module(package_name)
+    for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
+        if not ispkg:
+            try:
+                module = importlib.import_module(name)
+            except Exception as e:
+                logger.warning(f"Ignore import error when loading {name}: {e}")
+                continue
+            all_members = inspect.getmembers(module, inspect.isclass)
+            classes = [
+                member
+                for name, member in all_members
+                if member.__module__ == module.__name__
+            ]
+            for cls in (
+                cls for cls in classes if issubclass(cls, BaseMultimodalProcessor)
+            ):
+                assert hasattr(cls, "models")
+                for arch in getattr(cls, "models"):
+                    PROCESSOR_MAPPING[arch] = cls
+
+
+def get_mm_processor(
+    hf_config, server_args: ServerArgs, processor, transport_mode
+) -> BaseMultimodalProcessor:
+    for model_cls, processor_cls in PROCESSOR_MAPPING.items():
+        if model_cls.__name__ in hf_config.architectures:
+            return processor_cls(hf_config, server_args, processor, transport_mode)
+
+    raise ValueError(
+        f"No processor registered for architecture: {hf_config.architectures}.\n"
+        f"Registered architectures: {[model_cls.__name__ for model_cls in PROCESSOR_MAPPING.keys()]}"
+    )
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
new file mode 100644
index 000000000..c0c0917ac
--- /dev/null
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -0,0 +1,2017 @@
+from __future__ import annotations
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Store information about requests and batches.
+
+The following is the flow of data structures for a batch:
+
+ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
+
+- ScheduleBatch is managed by `scheduler.py::Scheduler`.
+  It contains high-level scheduling data. Most of the data is on the CPU.
+- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
+  It will be transformed from CPU scheduler to GPU model runner.
+- ForwardBatch is managed by `model_runner.py::ModelRunner`.
+  It contains low-level tensor data. Most of the data consists of GPU tensors.
+
+TODO(lmzheng): ModelWorkerBatch seems a bit redundant and we consider removing it in the future.
+"""
+
+import copy
+import dataclasses
+import logging
+import threading
+from enum import Enum, auto
+from http import HTTPStatus
+from itertools import chain
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
+
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+
+from sglang.global_config import global_config
+from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
+from sglang.srt.disaggregation.base import BaseKVSender
+from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
+    ScheduleBatchDisaggregationDecodeMixin,
+)
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.mem_cache.allocator import (
+    BaseTokenToKVPoolAllocator,
+    SWATokenToKVPoolAllocator,
+)
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
+from sglang.srt.mem_cache.lora_radix_cache import LoRAKey, LoRARadixCache
+from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool, ReqToTokenPool
+from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
+from sglang.srt.metrics.collector import TimeStats
+from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import flatten_nested_list, support_triton
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+
+INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
+
+GLOBAL_SERVER_ARGS_KEYS = [
+    "attention_backend",
+    "mm_attention_backend",
+    "debug_tensor_dump_inject",
+    "debug_tensor_dump_output_folder",
+    "chunked_prefill_size",
+    "device",
+    "disable_chunked_prefix_cache",
+    "disable_flashinfer_cutlass_moe_fp4_allgather",
+    "disable_radix_cache",
+    "enable_dp_lm_head",
+    "flashinfer_mxfp4_moe_precision",
+    "enable_flashinfer_allreduce_fusion",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
+    "ep_num_redundant_experts",
+    "enable_nan_detection",
+    "flashinfer_mla_disable_ragged",
+    "max_micro_batch_size",
+    "disable_shared_experts_fusion",
+    "sampling_backend",
+    "speculative_accept_threshold_single",
+    "speculative_accept_threshold_acc",
+    "speculative_attention_mode",
+    "torchao_config",
+    "triton_attention_reduce_in_fp32",
+    "num_reserved_decode_tokens",
+    "weight_loader_disable_mmap",
+    "enable_multimodal",
+    "enable_symm_mem",
+    "quantization",
+    "enable_custom_logit_processor",
+    "disaggregation_mode",
+]
+
+# Put some global args for easy access
+global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}
+
+logger = logging.getLogger(__name__)
+
+
+class BaseFinishReason:
+    def __init__(self, is_error: bool = False):
+        self.is_error = is_error
+
+    def to_json(self):
+        raise NotImplementedError()
+
+
+class FINISH_MATCHED_TOKEN(BaseFinishReason):
+    def __init__(self, matched: Union[int, List[int]]):
+        super().__init__()
+        self.matched = matched
+
+    def to_json(self):
+        return {
+            "type": "stop",  # to match OpenAI API's return value
+            "matched": self.matched,
+        }
+
+
+class FINISH_MATCHED_STR(BaseFinishReason):
+    def __init__(self, matched: str):
+        super().__init__()
+        self.matched = matched
+
+    def to_json(self):
+        return {
+            "type": "stop",  # to match OpenAI API's return value
+            "matched": self.matched,
+        }
+
+
+class FINISH_LENGTH(BaseFinishReason):
+    def __init__(self, length: int):
+        super().__init__()
+        self.length = length
+
+    def to_json(self):
+        return {
+            "type": "length",  # to match OpenAI API's return value
+            "length": self.length,
+        }
+
+
+class FINISH_ABORT(BaseFinishReason):
+    def __init__(self, message=None, status_code=None, err_type=None):
+        super().__init__(is_error=True)
+        self.message = message or "Aborted"
+        self.status_code = status_code
+        self.err_type = err_type
+
+    def to_json(self):
+        return {
+            "type": "abort",
+            "message": self.message,
+            "status_code": self.status_code,
+            "err_type": self.err_type,
+        }
+
+
+class Modality(Enum):
+    IMAGE = auto()
+    MULTI_IMAGES = auto()
+    VIDEO = auto()
+    AUDIO = auto()
+
+    @staticmethod
+    def from_str(modality_str: str):
+        try:
+            return Modality[modality_str.upper()]
+        except KeyError:
+            raise ValueError(
+                f"Invalid modality string: {modality_str}. Valid modalities are: {[m.name for m in Modality]}"
+            )
+
+    @staticmethod
+    def all():
+        return [Modality.IMAGE, Modality.VIDEO, Modality.AUDIO]
+
+
+@dataclasses.dataclass
+class MultimodalDataItem:
+    """
+    One MultimodalDataItem contains all inputs for one modality.
+    For example, if there are 3 images and 1 audio inputs, there will be 2 MultimodalDataItem.
+    One for images and one for audio.
+
+    We put the common fields first and the model-specific fields in model_specific_data.
+    """
+
+    modality: Modality
+    hash: int = None
+    pad_value: int = None
+    offsets: Optional[list] = None
+
+    # the raw features returned by processor, e.g. pixel_values or audio_features
+    feature: Union[torch.Tensor, np.ndarray] = None
+    # the precomputed embeddings, passed as final encoder embeddings
+    # One and only one of the feature and precomputed_embeddings will be empty
+    precomputed_embeddings: Optional[Union[torch.Tensor, np.ndarray]] = None
+
+    # Model-specific data stored in a dictionary
+    model_specific_data: dict[str, Any] = dataclasses.field(default_factory=dict)
+
+    def __getattr__(self, name: str):
+        if (
+            "model_specific_data" in self.__dict__
+            and name in self.__dict__["model_specific_data"]
+        ):
+            return self.__dict__["model_specific_data"][name]
+        else:
+            raise AttributeError(
+                f"'{self.__class__.__name__}' object has no attribute '{name}'"
+            )
+
+    def __setitem__(self, key: str, value: Any):
+        if key in self.__dict__:
+            self.__dict__[key] = value
+        else:
+            self.model_specific_data[key] = value
+
+    def set(self, key: str, value: Any):
+        self.__setitem__(key, value)
+
+    @staticmethod
+    def is_empty_list(l):
+        if l is None:
+            return True
+        return len([item for item in flatten_nested_list(l) if item is not None]) == 0
+
+    def set_pad_value(self):
+        """
+        Set the pad value after first hashing the data
+        """
+        from sglang.srt.managers.mm_utils import hash_feature
+
+        if self.hash is None:
+            if self.feature is not None:
+                hashed_feature = self.feature
+            else:
+                hashed_feature = self.precomputed_embeddings
+            self.hash = hash_feature(hashed_feature)
+        assert self.hash is not None
+        self.pad_value = self.hash % (1 << 30)
+
+    def is_modality(self, modality: Modality) -> bool:
+        return self.modality == modality
+
+    def is_audio(self):
+        return self.modality == Modality.AUDIO
+
+    def is_image(self):
+        return self.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]
+
+    def is_video(self):
+        return self.modality == Modality.VIDEO
+
+    def is_valid(self) -> bool:
+        return self.is_image() or self.is_video() or self.is_audio()
+
+    def validate(self):
+        ...
+        # TODO
+
+    @staticmethod
+    def from_dict(obj: dict):
+        kwargs = dict(obj)
+        modality = kwargs.pop("modality")
+        if isinstance(modality, str):
+            modality = Modality[modality]
+        ret = MultimodalDataItem(modality=modality, **kwargs)
+        ret.validate()
+        return ret
+
+    def merge(self, other):
+        self.feature += other.feature
+        self.offsets += other.offsets
+        self.hash = hash((self.hash, other.hash))
+        self.set_pad_value()
+
+
+@dataclasses.dataclass
+class MultimodalInputs:
+    """The multimodal data related inputs."""
+
+    # items of data
+    mm_items: List[MultimodalDataItem]
+    image_pad_len: Optional[list] = None
+    num_image_tokens: Optional[int] = None
+
+    # image
+    im_token_id: Optional[int] = None
+    im_start_id: Optional[int] = None
+    im_end_id: Optional[int] = None
+    slice_start_id: Optional[int] = None
+    slice_end_id: Optional[int] = None
+
+    # video
+    video_token_id: Optional[int] = None
+
+    # audio
+    audio_token_id: Optional[int] = None
+    audio_start_id: Optional[int] = None
+    audio_end_id: Optional[int] = None
+
+    # QWen2-VL related
+    mrope_positions: Optional[torch.Tensor] = None
+    mrope_position_delta: Optional[torch.Tensor] = None
+
+    @staticmethod
+    def from_dict(obj: dict):
+        ret = MultimodalInputs(
+            mm_items=obj["mm_items"],
+        )
+
+        assert isinstance(ret.mm_items, list)
+        ret.mm_items = [item for item in ret.mm_items if item.is_valid()]
+        for item in ret.mm_items:
+            item.set_pad_value()
+
+        optional_args = [
+            "mrope_positions",
+            "mrope_position_delta",
+            "im_token_id",
+            "im_start_id",
+            "im_end_id",
+            "video_token_id",
+            "slice_start_id",
+            "slice_end_id",
+            "audio_start_id",
+            "audio_end_id",
+            "audio_token_id",
+        ]
+        for arg in optional_args:
+            if arg in obj:
+                setattr(ret, arg, obj[arg])
+
+        return ret
+
+    def contains_image_inputs(self) -> bool:
+        return any(item.is_image() for item in self.mm_items)
+
+    def contains_video_inputs(self) -> bool:
+        return any(item.is_video() for item in self.mm_items)
+
+    def contains_audio_inputs(self) -> bool:
+        return any(item.is_audio() for item in self.mm_items)
+
+    def contains_mm_input(self) -> bool:
+        return any(True for item in self.mm_items if item.is_valid())
+
+    def merge(self, other: MultimodalInputs):
+        """
+        merge image inputs when requests are being merged
+        """
+
+        # args needed to be merged
+        optional_args = [
+            "mm_items",
+            "image_pad_len",
+        ]
+        for arg in optional_args:
+            self_arg = getattr(self, arg, None)
+            if self_arg is not None:
+                setattr(self, arg, self_arg + getattr(other, arg))
+
+        mrope_positions = self.mrope_positions
+        if mrope_positions is not None:
+            if other.mrope_positions is None:
+                self.mrope_positions = mrope_positions
+            else:
+                self.mrope_positions = torch.cat(
+                    [self.mrope_positions, other.mrope_positions], dim=1
+                )
+
+        mrope_position_delta = self.mrope_position_delta
+        if mrope_position_delta is not None:
+            if other.mrope_position_delta is None:
+                self.mrope_position_delta = mrope_position_delta
+            else:
+                self.mrope_position_delta = torch.cat(
+                    [self.mrope_position_delta, other.mrope_position_delta], dim=0
+                )
+
+        for key, val in other.__dict__.items():
+            if "_id" in key:
+                # set token_ids
+                if getattr(self, key, None) is None:
+                    setattr(self, key, getattr(other, key, None))
+        # other args would be kept intact
+
+
+class Req:
+    """The input and output status of a request."""
+
+    def __init__(
+        self,
+        rid: str,
+        origin_input_text: str,
+        origin_input_ids: List[int],
+        sampling_params: SamplingParams,
+        return_logprob: bool = False,
+        top_logprobs_num: int = 0,
+        token_ids_logprob: List[int] = None,
+        stream: bool = False,
+        origin_input_ids_unpadded: Optional[Tuple[int]] = None,
+        lora_id: Optional[str] = None,
+        input_embeds: Optional[List[List[float]]] = None,
+        token_type_ids: List[int] = None,
+        session_id: Optional[str] = None,
+        custom_logit_processor: Optional[str] = None,
+        return_hidden_states: bool = False,
+        eos_token_ids: Optional[Set[int]] = None,
+        bootstrap_host: Optional[str] = None,
+        bootstrap_port: Optional[int] = None,
+        bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
+        vocab_size: Optional[int] = None,
+    ):
+        # Input and output info
+        self.rid = rid
+        self.origin_input_text = origin_input_text
+        self.origin_input_ids_unpadded = (
+            origin_input_ids_unpadded
+            if origin_input_ids_unpadded
+            else origin_input_ids  # Before image padding
+        )
+        self.origin_input_ids = origin_input_ids
+        # Each decode stage's output ids
+        self.output_ids = []
+        # fill_ids = origin_input_ids + output_ids. Updated if chunked.
+        self.fill_ids = []
+        self.session_id = session_id
+        self.input_embeds = input_embeds
+
+        # for corss-endoder model
+        self.token_type_ids = token_type_ids
+
+        # The length of KV that have been removed in local attention chunked prefill
+        self.evicted_seqlen_local = 0
+
+        # Sampling info
+        if isinstance(sampling_params.custom_params, dict):
+            sampling_params = copy.copy(sampling_params)
+            sampling_params.custom_params = sampling_params.custom_params | {
+                "__req__": self
+            }
+        self.sampling_params = sampling_params
+        self.custom_logit_processor = custom_logit_processor
+        self.return_hidden_states = return_hidden_states
+        self.lora_id = lora_id
+
+        # Memory pool info
+        self.req_pool_idx: Optional[int] = None
+
+        # Check finish
+        self.tokenizer = None
+        self.finished_reason = None
+        # Whether this request has finished output
+        self.finished_output = None
+        # If we want to abort the request in the middle of the event loop, set this to true
+        # Note: We should never set finished_reason in the middle, the req will get filtered and never respond
+        self.to_abort = False
+        # This carries the error message for `.to_abort` and will be attached to the finished_reason at the end of the event loop
+        self.to_abort_message: str = None
+        self.stream = stream
+        self.eos_token_ids = eos_token_ids
+        self.vocab_size = vocab_size
+
+        # For incremental decoding
+        # ----- | --------- read_ids -------|
+        # ----- |   surr_ids  |
+        # xxxxx | xxxxxxxxxxx | xxxxxxxxxxx |
+        # ----- ^ ----------- ^ ----------- ^
+        # ----- 1 ----------- 2 ----------- 3
+        # 1: surr_offset
+        # 2: read_offset
+        # 3: last token
+        self.surr_offset = None  # Surrounding offset to defeat the cleanup algorithm
+        self.read_offset = None
+        self.decoded_text = ""
+
+        # For multimodal inputs
+        self.multimodal_inputs: Optional[MultimodalInputs] = None
+
+        # Prefix info
+        # The indices to kv cache for the shared prefix.
+        self.prefix_indices: torch.Tensor = []
+        # Number of tokens to run prefill.
+        self.extend_input_len = 0
+        # The relative logprob_start_len in an extend batch
+        self.extend_logprob_start_len = 0
+        self.last_node: Any = None
+        self.last_host_node: Any = None
+        self.host_hit_length = 0
+        # The node to lock until for swa radix tree lock ref
+        self.swa_uuid_for_lock: Optional[int] = None
+
+        # Whether or not if it is chunked. It increments whenever
+        # it is chunked, and decrement whenever chunked request is
+        # processed.
+        self.is_chunked = 0
+
+        # For retraction
+        self.is_retracted = False
+
+        # Incremental streamining
+        self.send_token_offset: int = 0
+        self.send_decode_id_offset: int = 0
+        # TODO (Byron): send_output_token_logprobs_offset and send_decode_id_offset can be different in disaggregation mode
+        # because the decode server does not have the first output token logprobs
+        self.send_output_token_logprobs_offset: int = 0
+
+        # Logprobs (arguments)
+        self.return_logprob = return_logprob
+        # Start index to compute logprob from.
+        self.logprob_start_len = 0
+        self.top_logprobs_num = top_logprobs_num
+        self.token_ids_logprob = token_ids_logprob
+        self.temp_scaled_logprobs = False
+        self.top_p_normalized_logprobs = False
+
+        # Logprobs (return values)
+        # True means the input logprob has been already sent to detokenizer.
+        self.input_logprob_sent: bool = False
+        self.input_token_logprobs_val: Optional[List[float]] = None
+        self.input_token_logprobs_idx: Optional[List[int]] = None
+        self.input_top_logprobs_val: Optional[List[float]] = None
+        self.input_top_logprobs_idx: Optional[List[int]] = None
+        self.input_token_ids_logprobs_val: Optional[List[float]] = None
+        self.input_token_ids_logprobs_idx: Optional[List[int]] = None
+        # Temporary holder to store input_token_logprobs.
+        self.input_token_logprobs: Optional[List[Tuple[int]]] = None
+        self.temp_input_top_logprobs_val: Optional[List[torch.Tensor]] = None
+        self.temp_input_top_logprobs_idx: Optional[List[int]] = None
+        self.temp_input_token_ids_logprobs_val: Optional[List[float]] = None
+        self.temp_input_token_ids_logprobs_idx: Optional[List[int]] = None
+
+        if return_logprob:
+            # shape: (bs, 1)
+            self.output_token_logprobs_val = []
+            self.output_token_logprobs_idx = []
+            # shape: (bs, k)
+            self.output_top_logprobs_val = []
+            self.output_top_logprobs_idx = []
+            self.output_token_ids_logprobs_val = []
+            self.output_token_ids_logprobs_idx = []
+        else:
+            self.output_token_logprobs_val = self.output_token_logprobs_idx = (
+                self.output_top_logprobs_val
+            ) = self.output_top_logprobs_idx = self.output_token_ids_logprobs_val = (
+                self.output_token_ids_logprobs_idx
+            ) = None
+        self.hidden_states: List[List[float]] = []
+        self.hidden_states_tensor = None  # Note: use tensor instead of list to transfer hidden_states when PD + MTP
+
+        # Embedding (return values)
+        self.embedding = None
+
+        # Constrained decoding
+        self.grammar: Optional[BaseGrammarObject] = None
+        self.grammar_wait_ct = 0
+
+        # The number of cached tokens that were already cached in the KV cache
+        self.cached_tokens = 0
+        self.already_computed = 0
+
+        # The number of verification forward passes in the speculative decoding.
+        # This is used to compute the average acceptance length per request.
+        self.spec_verify_ct = 0
+
+        # For metrics
+        self.time_stats: TimeStats = TimeStats()
+        self.has_log_time_stats: bool = False
+        self.queue_time_start = None
+        self.queue_time_end = None
+
+        # For disaggregation
+        self.bootstrap_host: str = bootstrap_host
+        self.bootstrap_port: Optional[int] = bootstrap_port
+        self.bootstrap_room: Optional[int] = bootstrap_room
+        self.disagg_kv_sender: Optional[BaseKVSender] = None
+
+        # For data parallel rank routing
+        self.data_parallel_rank: Optional[int] = data_parallel_rank
+
+        # the start index of the sent kv cache
+        # We want to send it chunk by chunk for chunked prefill.
+        # After every chunk forward, we do the following:
+        # kv_send(req.input_ids[req.start_send_idx:len(req.fill_ids)])
+        # start_send_idx = len(req.fill_ids)
+        self.start_send_idx: int = 0
+
+        # For overlap schedule, we delay the kv transfer until `process_batch_result_disagg_prefill` rather than `process_prefill_chunk` in non-overlap
+        # This is because kv is not ready in `process_prefill_chunk`.
+        # We use `tmp_end_idx` to store the end index of the kv cache to send.
+        self.tmp_end_idx: int = -1
+        self.metadata_buffer_index: int = -1
+
+    @property
+    def seqlen(self):
+        return len(self.origin_input_ids) + len(self.output_ids)
+
+    def extend_image_inputs(self, image_inputs):
+        if self.multimodal_inputs is None:
+            self.multimodal_inputs = image_inputs
+        else:
+            self.multimodal_inputs.merge(image_inputs)
+
+    def finished(self) -> bool:
+        # Whether request reached finished condition
+        return self.finished_reason is not None
+
+    def init_next_round_input(
+        self,
+        tree_cache: Optional[BasePrefixCache] = None,
+    ):
+        self.fill_ids = self.origin_input_ids + self.output_ids
+        if tree_cache is not None:
+            if isinstance(tree_cache, LoRARadixCache):
+                (
+                    self.prefix_indices,
+                    self.last_node,
+                    self.last_host_node,
+                    self.host_hit_length,
+                ) = tree_cache.match_prefix_with_lora_id(
+                    key=LoRAKey(
+                        lora_id=self.lora_id, token_ids=self.adjust_max_prefix_ids()
+                    ),
+                )
+            else:
+                (
+                    self.prefix_indices,
+                    self.last_node,
+                    self.last_host_node,
+                    self.host_hit_length,
+                ) = tree_cache.match_prefix(
+                    key=self.adjust_max_prefix_ids(),
+                )
+        self.extend_input_len = len(self.fill_ids) - len(self.prefix_indices)
+
+    def adjust_max_prefix_ids(self):
+        self.fill_ids = self.origin_input_ids + self.output_ids
+        input_len = len(self.fill_ids)
+
+        # FIXME: To work around some bugs in logprob computation, we need to ensure each
+        # request has at least one token. Later, we can relax this requirement and use `input_len`.
+        max_prefix_len = input_len - 1
+
+        if self.sampling_params.max_new_tokens > 0:
+            # Need at least one token to compute logits
+            max_prefix_len = min(max_prefix_len, input_len - 1)
+
+        if self.return_logprob:
+            max_prefix_len = min(max_prefix_len, self.logprob_start_len)
+
+        max_prefix_len = max(max_prefix_len, 0)
+        return self.fill_ids[:max_prefix_len]
+
+    # Based on https://github.com/vllm-project/vllm/blob/7a64d24aad69e4d2548aa0bf528d9fe63428ab01/vllm/transformers_utils/detokenizer.py#L194-L313
+    def init_incremental_detokenize(self):
+        first_iter = self.surr_offset is None or self.read_offset is None
+
+        if first_iter:
+            self.read_offset = len(self.origin_input_ids_unpadded)
+            self.surr_offset = max(
+                self.read_offset - INIT_INCREMENTAL_DETOKENIZATION_OFFSET, 0
+            )
+
+        all_ids = self.origin_input_ids_unpadded + self.output_ids
+        return all_ids[self.surr_offset :], self.read_offset - self.surr_offset
+
+    def check_finished(self):
+        if self.finished():
+            return
+
+        if self.to_abort:
+            self.finished_reason = FINISH_ABORT(
+                message=self.to_abort_message,
+            )
+            return
+
+        if len(self.output_ids) >= self.sampling_params.max_new_tokens:
+            self.finished_reason = FINISH_LENGTH(
+                length=self.sampling_params.max_new_tokens
+            )
+            return
+
+        if self.grammar is not None:
+            if self.grammar.is_terminated():
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=self.output_ids[-1])
+                return
+
+        last_token_id = self.output_ids[-1]
+
+        if not self.sampling_params.ignore_eos:
+            matched_eos = False
+
+            # Check stop token ids
+            if self.sampling_params.stop_token_ids:
+                matched_eos = last_token_id in self.sampling_params.stop_token_ids
+            if self.eos_token_ids:
+                matched_eos |= last_token_id in self.eos_token_ids
+            if self.tokenizer is not None:
+                matched_eos |= last_token_id == self.tokenizer.eos_token_id
+                if self.tokenizer.additional_stop_token_ids:
+                    matched_eos |= (
+                        last_token_id in self.tokenizer.additional_stop_token_ids
+                    )
+            if matched_eos:
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
+                return
+
+        if last_token_id > self.vocab_size or last_token_id < 0:
+            if self.sampling_params.stop_token_ids:
+                self.output_ids[-1] = next(iter(self.sampling_params.stop_token_ids))
+            if self.eos_token_ids:
+                self.output_ids[-1] = next(iter(self.eos_token_ids))
+            self.finished_reason = FINISH_MATCHED_STR(matched="NaN happened")
+            return
+
+        # Check stop strings
+        if len(self.sampling_params.stop_strs) > 0:
+            tail_str = self.tokenizer.decode(
+                self.output_ids[-(self.sampling_params.stop_str_max_len + 1) :]
+            )
+
+            for stop_str in self.sampling_params.stop_strs:
+                if stop_str in tail_str or stop_str in self.decoded_text:
+                    self.finished_reason = FINISH_MATCHED_STR(matched=stop_str)
+                    return
+
+    def reset_for_retract(self):
+        self.prefix_indices = []
+        self.last_node = None
+        self.swa_uuid_for_lock = None
+        self.extend_input_len = 0
+        self.is_retracted = True
+        self.input_token_logprobs = None
+        self.temp_input_top_logprobs_val = None
+        self.temp_input_top_logprobs_idx = None
+        self.extend_logprob_start_len = 0
+        self.is_chunked = 0
+        self.req_pool_idx = None
+        self.already_computed = 0
+
+    def offload_kv_cache(self, req_to_token_pool, token_to_kv_pool_allocator):
+        token_indices = req_to_token_pool.req_to_token[
+            self.req_pool_idx, : self.seqlen - 1
+        ]
+        self.kv_cache_cpu = token_to_kv_pool_allocator.get_cpu_copy(token_indices)
+
+    def load_kv_cache(self, req_to_token_pool, token_to_kv_pool_allocator):
+        token_indices = req_to_token_pool.req_to_token[
+            self.req_pool_idx, : self.seqlen - 1
+        ]
+        token_to_kv_pool_allocator.load_cpu_copy(self.kv_cache_cpu, token_indices)
+        del self.kv_cache_cpu
+
+    def log_time_stats(self):
+        # If overlap schedule, we schedule one decode batch ahead so this gets called twice.
+        if self.has_log_time_stats is True:
+            return
+
+        if self.bootstrap_room is not None:
+            prefix = f"Req Time Stats(rid={self.rid}, bootstrap_room={self.bootstrap_room}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.get_type().value})"
+        else:
+            prefix = f"Req Time Stats(rid={self.rid}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.get_type().value})"
+        logger.info(f"{prefix}: {self.time_stats}")
+        self.has_log_time_stats = True
+
+    def set_finish_with_abort(self, error_msg: str):
+        if get_tensor_model_parallel_rank() == 0:
+            logger.error(f"{error_msg}, {self.rid=}")
+        self.multimodal_inputs = None
+        self.grammar = None
+        self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.return_logprob = False
+        self.finished_reason = FINISH_ABORT(
+            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+        )
+
+    def __repr__(self):
+        return (
+            f"Req(rid={self.rid}, "
+            f"input_ids={self.origin_input_ids}, output_ids={self.output_ids}, "
+            f"{self.grammar=}, "
+            f"{self.sampling_params=})"
+        )
+
+
+# Batch id
+bid = 0
+
+
+@dataclasses.dataclass
+class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
+    """Store all information of a batch on the scheduler."""
+
+    # Request, memory pool, and cache
+    reqs: List[Req]
+    req_to_token_pool: ReqToTokenPool = None
+    token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator = None
+    tree_cache: BasePrefixCache = None
+    is_hybrid: bool = False
+
+    # Batch configs
+    model_config: ModelConfig = None
+    forward_mode: ForwardMode = None
+    enable_overlap: bool = False
+    # Tell whether the current running batch is full so that we can skip
+    # the check of whether to prefill new requests.
+    # This is an optimization to reduce the overhead of the prefill check.
+    batch_is_full: bool = False
+
+    # Events
+    launch_done: Optional[threading.Event] = None
+
+    # For chunked prefill in PP
+    chunked_req: Optional[Req] = None
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo = None
+    next_batch_sampling_info: SamplingBatchInfo = None
+
+    # Batched arguments to model runner
+    input_ids: torch.Tensor = None  # shape: [b], int64
+    input_embeds: torch.Tensor = None  # shape: [b, hidden_size], float32
+    token_type_ids: torch.Tensor = None  # shape: [b], int64
+    req_pool_indices: torch.Tensor = None  # shape: [b], int64
+    seq_lens: torch.Tensor = None  # shape: [b], int64
+    # The output locations of the KV cache
+    out_cache_loc: torch.Tensor = None  # shape: [b], int64
+    output_ids: torch.Tensor = None  # shape: [b], int64
+
+    # For multimodal inputs
+    multimodal_inputs: Optional[List] = None
+
+    # The sum of all sequence lengths
+    seq_lens_sum: int = None
+    # The original sequence lengths, Qwen-1M related
+    orig_seq_lens: torch.Tensor = None  # shape: [b], int32
+
+    # For DP attention
+    global_num_tokens: Optional[List[int]] = None
+    global_num_tokens_for_logprob: Optional[List[int]] = None
+    is_extend_in_batch: bool = False
+    can_run_dp_cuda_graph: bool = False
+    tbo_split_seq_index: Optional[int] = None
+    global_forward_mode: Optional[ForwardMode] = None
+
+    # For processing logprobs
+    return_logprob: bool = False
+    top_logprobs_nums: Optional[List[int]] = None
+    token_ids_logprobs: Optional[List[List[int]]] = None
+
+    # For logits and logprob post processing
+    temp_scaled_logprobs: bool = False
+    top_p_normalized_logprobs: bool = False
+
+    # For extend and mixed chunekd prefill
+    prefix_lens: List[int] = None
+    extend_lens: List[int] = None
+    extend_num_tokens: Optional[int] = None
+    decoding_reqs: List[Req] = None
+    extend_logprob_start_lens: List[int] = None
+    # It comes empty list if logprob is not required.
+    extend_input_logprob_token_ids: Optional[torch.Tensor] = None
+
+    # For encoder-decoder architectures
+    encoder_cached: Optional[List[bool]] = None
+    encoder_lens: Optional[torch.Tensor] = None
+    encoder_lens_cpu: Optional[List[int]] = None
+    encoder_out_cache_loc: Optional[torch.Tensor] = None
+
+    # Stream
+    has_stream: bool = False
+
+    # Has grammar
+    has_grammar: bool = False
+
+    # Device
+    device: str = "cuda"
+
+    # Speculative decoding
+    spec_algorithm: SpeculativeAlgorithm = None
+    spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None
+
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
+    # Whether this batch is prefill-only (no token generation needed)
+    is_prefill_only: bool = False
+
+    # hicache pointer for synchronizing data loading from CPU to GPU
+    hicache_consumer_index: int = -1
+
+    @classmethod
+    def init_new(
+        cls,
+        reqs: List[Req],
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        tree_cache: BasePrefixCache,
+        model_config: ModelConfig,
+        enable_overlap: bool,
+        spec_algorithm: SpeculativeAlgorithm,
+        chunked_req: Optional[Req] = None,
+    ):
+        return_logprob = any(req.return_logprob for req in reqs)
+
+        is_hybrid = False
+        if isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator):
+            assert (
+                tree_cache is None
+                or isinstance(tree_cache, SWARadixCache)
+                or isinstance(tree_cache, SWAChunkCache)
+            ), "SWARadixCache or SWAChunkCache is required for SWATokenToKVPoolAllocator"
+            is_hybrid = True
+
+        return cls(
+            reqs=reqs,
+            req_to_token_pool=req_to_token_pool,
+            token_to_kv_pool_allocator=token_to_kv_pool_allocator,
+            tree_cache=tree_cache,
+            is_hybrid=is_hybrid,
+            model_config=model_config,
+            enable_overlap=enable_overlap,
+            return_logprob=return_logprob,
+            has_stream=any(req.stream for req in reqs),
+            has_grammar=any(req.grammar for req in reqs),
+            device=req_to_token_pool.device,
+            spec_algorithm=spec_algorithm,
+            return_hidden_states=any(req.return_hidden_states for req in reqs),
+            is_prefill_only=all(
+                req.sampling_params.max_new_tokens == 0 for req in reqs
+            ),
+            chunked_req=chunked_req,
+        )
+
+    def batch_size(self):
+        return len(self.reqs)
+
+    def is_empty(self):
+        return len(self.reqs) == 0
+
+    def alloc_req_slots(self, num_reqs: int, reqs: Optional[List[Req]] = None):
+        if isinstance(self.req_to_token_pool, HybridReqToTokenPool):
+            req_pool_indices = self.req_to_token_pool.alloc(num_reqs, reqs)
+        else:
+            req_pool_indices = self.req_to_token_pool.alloc(num_reqs)
+        if req_pool_indices is None:
+            raise RuntimeError(
+                "alloc_req_slots runs out of memory. "
+                "Please set a smaller number for `--max-running-requests`. "
+                f"{self.req_to_token_pool.available_size()=}, "
+                f"{num_reqs=}, "
+            )
+        return req_pool_indices
+
+    def alloc_token_slots(self, num_tokens: int, backup_state: bool = False):
+        self._evict_tree_cache_if_needed(num_tokens)
+
+        if backup_state:
+            state = self.token_to_kv_pool_allocator.backup_state()
+
+        out_cache_loc = self.token_to_kv_pool_allocator.alloc(num_tokens)
+        if out_cache_loc is None:
+            phase_str = "Prefill" if self.forward_mode.is_extend() else "Decode"
+            error_msg = (
+                f"{phase_str} out of memory. Try to lower your batch size.\n"
+                f"Try to allocate {num_tokens} tokens.\n"
+                f"{self._available_and_evictable_str()}"
+            )
+            logger.error(error_msg)
+            if self.tree_cache is not None:
+                self.tree_cache.pretty_print()
+            raise RuntimeError(error_msg)
+
+        if backup_state:
+            return out_cache_loc, state
+        else:
+            return out_cache_loc
+
+    def alloc_paged_token_slots_extend(
+        self,
+        prefix_lens: torch.Tensor,
+        seq_lens: torch.Tensor,
+        last_loc: torch.Tensor,
+        extend_num_tokens: int,
+        backup_state: bool = False,
+    ):
+        # Over estimate the number of tokens: assume each request needs a new page.
+        num_tokens = (
+            extend_num_tokens
+            + len(seq_lens) * self.token_to_kv_pool_allocator.page_size
+        )
+        self._evict_tree_cache_if_needed(num_tokens)
+
+        if backup_state:
+            state = self.token_to_kv_pool_allocator.backup_state()
+
+        out_cache_loc = self.token_to_kv_pool_allocator.alloc_extend(
+            prefix_lens, seq_lens, last_loc, extend_num_tokens
+        )
+        if out_cache_loc is None:
+            error_msg = (
+                f"Prefill out of memory. Try to lower your batch size.\n"
+                f"Try to allocate {extend_num_tokens} tokens.\n"
+                f"{self._available_and_evictable_str()}"
+            )
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
+        if backup_state:
+            return out_cache_loc, state
+        else:
+            return out_cache_loc
+
+    def alloc_paged_token_slots_decode(
+        self,
+        seq_lens: torch.Tensor,
+        last_loc: torch.Tensor,
+        backup_state: bool = False,
+    ):
+        # Over estimate the number of tokens: assume each request needs a new page.
+        num_tokens = len(seq_lens) * self.token_to_kv_pool_allocator.page_size
+        self._evict_tree_cache_if_needed(num_tokens)
+
+        if backup_state:
+            state = self.token_to_kv_pool_allocator.backup_state()
+
+        out_cache_loc = self.token_to_kv_pool_allocator.alloc_decode(seq_lens, last_loc)
+        if out_cache_loc is None:
+            error_msg = (
+                f"Decode out of memory. Try to lower your batch size.\n"
+                f"Try to allocate {len(seq_lens)} tokens.\n"
+                f"{self._available_and_evictable_str()}"
+            )
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
+        if backup_state:
+            return out_cache_loc, state
+        else:
+            return out_cache_loc
+
+    def prepare_encoder_info_extend(self, input_ids: List[int], seq_lens: List[int]):
+        self.encoder_lens_cpu = []
+        self.encoder_cached = []
+
+        for req in self.reqs:
+            im = req.multimodal_inputs
+            if im is None or im.num_image_tokens is None:
+                # No image input
+                self.encoder_lens_cpu.append(0)
+                self.encoder_cached.append(True)
+            else:
+                self.encoder_lens_cpu.append(im.num_image_tokens)
+                self.encoder_cached.append(
+                    self.forward_mode.is_decode()
+                    or len(req.prefix_indices) >= im.num_image_tokens
+                )
+
+        self.encoder_lens = torch.tensor(self.encoder_lens_cpu, dtype=torch.int64).to(
+            self.device, non_blocking=True
+        )
+
+        # Strip encoder infos
+        pt = 0
+        decoder_out_cache_loc = []
+        encoder_out_cache_loc = []
+        for i, req in enumerate(self.reqs):
+            encoder_len = self.encoder_lens_cpu[i]
+            seq_lens[i] -= encoder_len
+
+            if len(req.prefix_indices) < encoder_len:
+                # NOTE: the encoder part should be considered as a whole
+                assert len(req.prefix_indices) == 0
+                input_ids[i] = input_ids[i][encoder_len:]
+                encoder_out_cache_loc.append(self.out_cache_loc[pt : pt + encoder_len])
+                decoder_out_cache_loc.append(
+                    self.out_cache_loc[pt + encoder_len : pt + req.extend_input_len]
+                )
+                self.extend_lens[i] -= encoder_len
+                self.extend_num_tokens -= encoder_len
+            else:
+                decoder_out_cache_loc.append(
+                    self.out_cache_loc[pt : pt + req.extend_input_len]
+                )
+                self.prefix_lens[i] -= encoder_len
+
+            pt += req.extend_input_len
+
+        # Reassign
+        self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int64).to(
+            self.device, non_blocking=True
+        )
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64).to(
+            self.device, non_blocking=True
+        )
+
+        if not decoder_out_cache_loc:
+            self.out_cache_loc = torch.zeros(0, dtype=torch.int64).to(
+                self.device, non_blocking=True
+            )
+        else:
+            self.out_cache_loc = torch.cat(decoder_out_cache_loc)
+
+        if not encoder_out_cache_loc:
+            self.encoder_out_cache_loc = torch.zeros(0, dtype=torch.int64).to(
+                self.device, non_blocking=True
+            )
+        else:
+            self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
+
+        assert (
+            len(self.out_cache_loc) == self.extend_num_tokens
+        ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}"
+
+    def prepare_for_extend(self):
+        self.forward_mode = ForwardMode.EXTEND
+
+        # Allocate req slots
+        bs = len(self.reqs)
+        req_pool_indices = self.alloc_req_slots(bs, self.reqs)
+
+        # Init tensors
+        reqs = self.reqs
+        input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
+        extend_num_tokens = sum(len(ids) for ids in input_ids)
+        seq_lens = [len(r.fill_ids) for r in reqs]
+        orig_seq_lens = [max(len(r.fill_ids), len(r.origin_input_ids)) for r in reqs]
+        prefix_lens = [len(r.prefix_indices) for r in reqs]
+        extend_lens = [r.extend_input_len for r in reqs]
+
+        token_type_ids = [
+            r.token_type_ids for r in reqs if r.token_type_ids is not None
+        ]
+
+        req_pool_indices_tensor = torch.tensor(req_pool_indices, dtype=torch.int64).to(
+            self.device, non_blocking=True
+        )
+        input_ids_tensor = torch.tensor(
+            list(chain.from_iterable(input_ids)), dtype=torch.int64
+        ).to(self.device, non_blocking=True)
+        seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int64).to(
+            self.device, non_blocking=True
+        )
+        orig_seq_lens_tensor = torch.tensor(orig_seq_lens, dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
+        prefix_lens_tensor = torch.tensor(
+            prefix_lens, dtype=torch.int64, device=self.device
+        )
+
+        token_type_ids_tensor = None
+        if len(token_type_ids) > 0:
+            token_type_ids_tensor = torch.tensor(
+                sum(token_type_ids, []), dtype=torch.int64
+            ).to(self.device, non_blocking=True)
+
+        extend_lens_tensor = seq_lens_tensor - prefix_lens_tensor
+
+        # Copy prefix and do some basic check
+        input_embeds = []
+        extend_input_logprob_token_ids = []
+        multimodal_inputs = []
+
+        for i, (req, seq_len, pre_len) in enumerate(zip(reqs, seq_lens, prefix_lens)):
+            req.req_pool_idx = req_pool_indices[i]
+            assert seq_len - pre_len == req.extend_input_len
+
+            if pre_len > 0:
+                self.req_to_token_pool.write(
+                    (req.req_pool_idx, slice(0, pre_len)), req.prefix_indices
+                )
+                if isinstance(self.tree_cache, SWAChunkCache):
+                    self.tree_cache.evict_swa(
+                        req, pre_len, self.model_config.attention_chunk_size
+                    )
+
+            # If input_embeds are available, store them
+            if req.input_embeds is not None:
+                # If req.input_embeds is already a list, append its content directly
+                input_embeds.extend(req.input_embeds)  # Use extend to avoid nesting
+
+            multimodal_inputs.append(req.multimodal_inputs)
+
+            req.cached_tokens += pre_len - req.already_computed
+            req.already_computed = seq_len
+            req.is_retracted = False
+
+            # Compute the relative logprob_start_len in an extend batch
+            if req.logprob_start_len >= pre_len:
+                req.extend_logprob_start_len = min(
+                    req.logprob_start_len - pre_len,
+                    req.extend_input_len,
+                    req.seqlen - 1,
+                )
+            else:
+                req.extend_logprob_start_len = 0
+
+            if self.return_logprob:
+                # Find input logprob token ids.
+                # First, find a global index within origin_input_ids and slide it by 1
+                # to compute input logprobs. It is because you need the next token
+                # to compute input logprobs. E.g., (chunk size 2)
+                #
+                # input_logprobs = [1, 2, 3, 4]
+                # fill_ids = [1, 2]
+                # extend_input_logprob_token_id = [2, 3]
+                #
+                # Note that it can also overflow. In this case, we pad it with 0.
+                # input_logprobs = [1, 2, 3, 4]
+                # fill_ids = [3, 4]
+                # extend_input_logprob_token_id = [4, 0]
+                global_start_idx, global_end_idx = (
+                    len(req.prefix_indices),
+                    len(req.fill_ids),
+                )
+                # Apply logprob_start_len
+                if global_start_idx < req.logprob_start_len:
+                    global_start_idx = req.logprob_start_len
+
+                logprob_token_ids = req.origin_input_ids[
+                    global_start_idx + 1 : global_end_idx + 1
+                ]
+                extend_input_logprob_token_ids.extend(logprob_token_ids)
+
+                # We will need req.extend_input_len - req.extend_logprob_start_len number of
+                # tokens, and logprob_token_ids is for input logprob, so pad the rest of them by 0.
+                extend_input_logprob_token_ids.extend(
+                    [0]
+                    * (
+                        req.extend_input_len
+                        - req.extend_logprob_start_len
+                        - len(logprob_token_ids)
+                    )
+                )
+
+        if self.return_logprob:
+            extend_input_logprob_token_ids = torch.tensor(
+                extend_input_logprob_token_ids
+            )
+        else:
+            extend_input_logprob_token_ids = None
+
+        # Allocate memory
+        if self.token_to_kv_pool_allocator.page_size == 1:
+            out_cache_loc = self.alloc_token_slots(extend_num_tokens)
+        else:
+            last_loc = get_last_loc(
+                self.req_to_token_pool.req_to_token,
+                req_pool_indices_tensor,
+                prefix_lens_tensor,
+            )
+            out_cache_loc = self.alloc_paged_token_slots_extend(
+                prefix_lens_tensor, seq_lens_tensor, last_loc, extend_num_tokens
+            )
+
+        # Set fields
+        self.input_ids = input_ids_tensor
+        self.req_pool_indices = req_pool_indices_tensor
+        self.seq_lens = seq_lens_tensor
+        self.orig_seq_lens = orig_seq_lens_tensor
+        self.out_cache_loc = out_cache_loc
+        self.input_embeds = (
+            torch.tensor(input_embeds).to(self.device, non_blocking=True)
+            if input_embeds
+            else None
+        )
+        for mm_input in multimodal_inputs:
+            if mm_input is None:
+                continue
+            for mm_item in mm_input.mm_items:
+                pixel_values = getattr(mm_item, "feature", None)
+                if isinstance(pixel_values, torch.Tensor):
+                    mm_item.feature = pixel_values.to(self.device, non_blocking=True)
+        self.multimodal_inputs = multimodal_inputs
+        self.token_type_ids = token_type_ids_tensor
+        self.seq_lens_sum = sum(seq_lens)
+
+        if self.return_logprob:
+            self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+            self.token_ids_logprobs = [r.token_ids_logprob for r in reqs]
+
+        self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
+        self.extend_num_tokens = extend_num_tokens
+        self.prefix_lens = prefix_lens
+        self.extend_lens = extend_lens
+        self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
+
+        # Write to req_to_token_pool
+        if support_triton(global_server_args_dict.get("attention_backend")):
+            # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)
+
+            write_req_to_token_pool_triton[(bs,)](
+                self.req_to_token_pool.req_to_token,
+                req_pool_indices_tensor,
+                prefix_lens_tensor,
+                seq_lens_tensor,
+                extend_lens_tensor,
+                out_cache_loc,
+                self.req_to_token_pool.req_to_token.shape[1],
+            )
+        else:
+            pt = 0
+            for i in range(bs):
+                self.req_to_token_pool.write(
+                    (req_pool_indices[i], slice(prefix_lens[i], seq_lens[i])),
+                    out_cache_loc[pt : pt + extend_lens[i]],
+                )
+                pt += extend_lens[i]
+
+        if self.model_config.is_encoder_decoder:
+            self.prepare_encoder_info_extend(input_ids, seq_lens)
+
+        # Build sampling info
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self,
+            self.model_config.vocab_size,
+        )
+
+    def prepare_for_split_prefill(self):
+        self.prepare_for_extend()
+        # For split prefill, we need to set the forward mode to SPLIT_PREFILL
+        self.forward_mode = ForwardMode.SPLIT_PREFILL
+
+    def mix_with_running(self, running_batch: "ScheduleBatch"):
+        self.forward_mode = ForwardMode.MIXED
+        running_bs = running_batch.batch_size()
+
+        for req in running_batch.reqs:
+            req.fill_ids = req.origin_input_ids + req.output_ids
+            req.extend_input_len = 1
+
+        input_ids = torch.cat([self.input_ids, running_batch.input_ids])
+        out_cache_loc = torch.cat([self.out_cache_loc, running_batch.out_cache_loc])
+
+        self.merge_batch(running_batch)
+        self.input_ids = input_ids
+        self.out_cache_loc = out_cache_loc
+
+        # For overlap scheduler, the output_ids has one step delay
+        delta = 0 if self.enable_overlap else -1
+
+        # NOTE: prefix_indices is what has been cached, but we don't cache each decode step
+        self.prefix_lens.extend(
+            [
+                len(r.origin_input_ids) + len(r.output_ids) + delta
+                for r in running_batch.reqs
+            ]
+        )
+        self.extend_lens.extend([1] * running_bs)
+        self.extend_num_tokens += running_bs
+        # TODO (lianmin): Revisit this. It should be seq_len - 1
+        self.extend_logprob_start_lens.extend([0] * running_bs)
+
+    def new_page_count_next_decode(self, selected_indices: Optional[List[int]] = None):
+        page_size = self.token_to_kv_pool_allocator.page_size
+        requests = (
+            self.reqs
+            if selected_indices is None
+            else [self.reqs[i] for i in selected_indices]
+        )
+        if page_size == 1:
+            return len(requests)
+        # In the decoding phase, the length of a request's KV cache should be
+        # the total length of the request minus 1
+        return (
+            sum(1 for req in requests if req.seqlen % page_size == 0)
+            if self.enable_overlap
+            else sum(1 for req in requests if (req.seqlen - 1) % page_size == 0)
+        )
+
+    def check_decode_mem(
+        self, buf_multiplier=1, selected_indices: Optional[List[int]] = None
+    ):
+        num_tokens = (
+            self.new_page_count_next_decode(selected_indices)
+            * buf_multiplier
+            * self.token_to_kv_pool_allocator.page_size
+        )
+
+        self._evict_tree_cache_if_needed(num_tokens)
+        return self._is_available_size_sufficient(num_tokens)
+
+    def retract_decode(self, server_args: ServerArgs):
+        """Retract the decoding requests when there is not enough memory."""
+        sorted_indices = list(range(len(self.reqs)))
+
+        # TODO(lsyin): improve retraction policy for radix cache
+        # For spec decoding, filter_batch API can only filter
+        # requests from the back, so we can only retract from the back.
+        # TODO(sang): Clean up finish path and support better retract
+        # policy.
+        if not server_args.speculative_algorithm:
+            sorted_indices.sort(
+                key=lambda i: (
+                    len(self.reqs[i].output_ids),
+                    -len(self.reqs[i].origin_input_ids),
+                ),
+                reverse=True,
+            )
+
+        retracted_reqs = []
+        seq_lens_cpu = self.seq_lens.cpu().numpy()
+        first_iter = True
+        while first_iter or (
+            not self.check_decode_mem(selected_indices=sorted_indices)
+        ):
+            if len(sorted_indices) == 1:
+                # Corner case: only one request left
+                if self.is_hybrid:
+                    full_available_size = (
+                        self.token_to_kv_pool_allocator.full_available_size()
+                    )
+                    swa_available_size = (
+                        self.token_to_kv_pool_allocator.swa_available_size()
+                    )
+                    assert (
+                        full_available_size > 0 and swa_available_size > 0
+                    ), f"No space left for only one request in SWA mode {full_available_size=}, {swa_available_size=}"
+                else:
+                    assert (
+                        self.token_to_kv_pool_allocator.available_size() > 0
+                    ), f"No space left for only one request, {self.token_to_kv_pool_allocator.available_size()=}"
+                break
+
+            first_iter = False
+            idx = sorted_indices.pop()
+            req = self.reqs[idx]
+            retracted_reqs.append(req)
+
+            if server_args.disaggregation_mode == "decode":
+                req.offload_kv_cache(
+                    self.req_to_token_pool, self.token_to_kv_pool_allocator
+                )
+
+            if isinstance(self.tree_cache, ChunkCache):
+                # ChunkCache does not have eviction
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req.req_pool_idx, : seq_lens_cpu[idx]
+                ]
+                self.token_to_kv_pool_allocator.free(token_indices)
+                self.req_to_token_pool.free(req.req_pool_idx)
+            else:
+                # TODO: apply more fine-grained retraction
+                last_uncached_pos = (
+                    len(req.prefix_indices) // server_args.page_size
+                ) * server_args.page_size
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req.req_pool_idx, last_uncached_pos : seq_lens_cpu[idx]
+                ]
+                self.token_to_kv_pool_allocator.free(token_indices)
+                self.req_to_token_pool.free(req.req_pool_idx)
+
+                # release the last node
+                if self.is_hybrid:
+                    self.tree_cache.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
+                else:
+                    self.tree_cache.dec_lock_ref(req.last_node)
+
+            req.reset_for_retract()
+
+            if len(retracted_reqs) == 0:
+                # Corner case: only one request left
+                raise ValueError(
+                    "Failed to retract any request. No space left for only one request."
+                )
+
+        self.filter_batch(keep_indices=sorted_indices)
+
+        # Reqs in batch are filtered
+        total_decoded_tokens = sum(len(r.output_ids) for r in self.reqs)
+        total_max_new_tokens = sum(r.sampling_params.max_new_tokens for r in self.reqs)
+
+        new_estimate_ratio = (
+            total_decoded_tokens + global_config.retract_decode_steps * len(self.reqs)
+        ) / total_max_new_tokens
+        new_estimate_ratio = min(1.0, new_estimate_ratio)
+
+        return retracted_reqs, new_estimate_ratio
+
+    def prepare_encoder_info_decode(self):
+        # Reset the encoder cached status
+        self.encoder_cached = [True] * len(self.reqs)
+
+    def prepare_for_idle(self):
+        self.forward_mode = ForwardMode.IDLE
+        self.input_ids = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.seq_lens = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.orig_seq_lens = torch.empty(0, dtype=torch.int32, device=self.device)
+        self.out_cache_loc = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.req_pool_indices = torch.empty(0, dtype=torch.int32, device=self.device)
+        self.seq_lens_sum = 0
+        self.extend_num_tokens = 0
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self,
+            self.model_config.vocab_size,
+        )
+
+    def prepare_for_decode(self):
+        self.forward_mode = ForwardMode.DECODE
+        bs = len(self.reqs)
+
+        if self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone():
+            # if spec decoding is used, the decode batch is prepared inside
+            # `forward_batch_speculative_generation` after running draft models.
+            return
+
+        if self.sampling_info.penalizer_orchestrator.is_required:
+            if self.enable_overlap:
+                # TODO: this can be slow, optimize this.
+                delayed_output_ids = torch.tensor(
+                    [
+                        (
+                            req.output_ids[-1]
+                            if len(req.output_ids)
+                            else req.origin_input_ids[-1]
+                        )
+                        for req in self.reqs
+                    ],
+                    dtype=torch.int64,
+                    device=self.device,
+                )
+                self.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+                    delayed_output_ids
+                )
+            else:
+                self.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+                    self.output_ids.to(torch.int64)
+                )
+
+        # Update fields
+        self.input_ids = self.output_ids
+        self.output_ids = None
+
+        if self.model_config.is_encoder_decoder:
+            locs = self.encoder_lens + self.seq_lens
+            self.prepare_encoder_info_decode()
+        else:
+            locs = self.seq_lens.clone()
+
+        if self.enable_overlap:
+            # Do not use in-place operations in the overlap mode
+            self.seq_lens = self.seq_lens + 1
+            self.orig_seq_lens = self.orig_seq_lens + 1
+        else:
+            # A faster in-place version
+            self.seq_lens.add_(1)
+            self.orig_seq_lens.add_(1)
+        self.seq_lens_sum += bs
+
+        # free memory
+        if isinstance(self.tree_cache, SWAChunkCache):
+            for req in self.reqs:
+                self.tree_cache.evict_swa(
+                    req, req.seqlen - 1, self.model_config.attention_chunk_size
+                )
+
+        # Allocate memory
+        if self.token_to_kv_pool_allocator.page_size == 1:
+            self.out_cache_loc = self.alloc_token_slots(bs)
+        else:
+            last_loc = self.req_to_token_pool.req_to_token[
+                self.req_pool_indices, self.seq_lens - 2
+            ]
+            self.out_cache_loc = self.alloc_paged_token_slots_decode(
+                self.seq_lens, last_loc
+            )
+
+        self.req_to_token_pool.write(
+            (self.req_pool_indices, locs), self.out_cache_loc.to(torch.int32)
+        )
+
+    def filter_batch(
+        self,
+        chunked_req_to_exclude: Optional[Union[Req, List[Req]]] = None,
+        keep_indices: Optional[List[int]] = None,
+    ):
+        if keep_indices is None:
+            if isinstance(chunked_req_to_exclude, Req):
+                chunked_req_to_exclude = [chunked_req_to_exclude]
+            elif chunked_req_to_exclude is None:
+                chunked_req_to_exclude = []
+            keep_indices = [
+                i
+                for i in range(len(self.reqs))
+                if not self.reqs[i].finished()
+                and self.reqs[i] not in chunked_req_to_exclude
+            ]
+
+        if keep_indices is None or len(keep_indices) == 0:
+            # Filter out all requests
+            self.reqs = []
+            return
+
+        if len(keep_indices) == len(self.reqs):
+            # No need to filter
+            return
+
+        keep_indices_device = torch.tensor(keep_indices, dtype=torch.int64).to(
+            self.device, non_blocking=True
+        )
+
+        if self.model_config.is_encoder_decoder:
+            self.encoder_lens = self.encoder_lens[keep_indices_device]
+            self.encoder_lens_cpu = [self.encoder_lens_cpu[i] for i in keep_indices]
+
+        self.reqs = [self.reqs[i] for i in keep_indices]
+        if self.multimodal_inputs is not None:
+            self.multimodal_inputs = [self.multimodal_inputs[i] for i in keep_indices]
+        self.req_pool_indices = self.req_pool_indices[keep_indices_device]
+        self.seq_lens = self.seq_lens[keep_indices_device]
+        self.orig_seq_lens = self.orig_seq_lens[keep_indices_device]
+        self.out_cache_loc = None
+        self.seq_lens_sum = self.seq_lens.sum().item()
+        self.output_ids = self.output_ids[keep_indices_device]
+        self.return_logprob = any(req.return_logprob for req in self.reqs)
+        if self.return_logprob:
+            self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in keep_indices]
+            self.token_ids_logprobs = [self.token_ids_logprobs[i] for i in keep_indices]
+        else:
+            self.top_logprobs_nums = None
+            self.token_ids_logprobs = None
+
+        self.has_stream = any(req.stream for req in self.reqs)
+        self.has_grammar = any(req.grammar for req in self.reqs)
+
+        self.sampling_info.filter_batch(keep_indices, keep_indices_device)
+        if self.spec_info:
+            self.spec_info.filter_batch(keep_indices_device)
+
+    def merge_batch(self, other: "ScheduleBatch"):
+        # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
+        # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it
+        # needs to be called with pre-merged Batch.reqs.
+        self.sampling_info.merge_batch(other.sampling_info)
+
+        # Encoder-decoder infos
+        if self.model_config.is_encoder_decoder:
+            self.encoder_lens = torch.cat([self.encoder_lens, other.encoder_lens])
+            self.encoder_lens_cpu.extend(other.encoder_lens_cpu)
+        self.req_pool_indices = torch.cat(
+            [self.req_pool_indices, other.req_pool_indices]
+        )
+        self.seq_lens = torch.cat([self.seq_lens, other.seq_lens])
+        self.orig_seq_lens = torch.cat([self.orig_seq_lens, other.orig_seq_lens])
+        self.out_cache_loc = None
+        self.seq_lens_sum += other.seq_lens_sum
+        if self.output_ids is not None:
+            self.output_ids = torch.cat([self.output_ids, other.output_ids])
+        if self.return_logprob and other.return_logprob:
+            self.top_logprobs_nums.extend(other.top_logprobs_nums)
+            self.token_ids_logprobs.extend(other.token_ids_logprobs)
+        elif self.return_logprob:
+            self.top_logprobs_nums.extend([0] * len(other.reqs))
+            self.token_ids_logprobs.extend([None] * len(other.reqs))
+        elif other.return_logprob:
+            self.top_logprobs_nums = [0] * len(self.reqs) + other.top_logprobs_nums
+            self.token_ids_logprobs = [None] * len(self.reqs) + other.token_ids_logprobs
+        self.reqs.extend(other.reqs)
+        if self.multimodal_inputs is not None:
+            self.multimodal_inputs.extend(other.multimodal_inputs)
+
+        self.return_logprob |= other.return_logprob
+        self.has_stream |= other.has_stream
+        self.has_grammar |= other.has_grammar
+        self.return_hidden_states |= other.return_hidden_states
+
+        if self.spec_info:
+            self.spec_info.merge_batch(other.spec_info)
+
+    def get_model_worker_batch(
+        self, seq_lens_cpu_cache: Optional[torch.Tensor] = None
+    ) -> ModelWorkerBatch:
+        if self.forward_mode.is_decode_or_idle():
+            extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
+        else:
+            extend_seq_lens = self.extend_lens
+            extend_prefix_lens = self.prefix_lens
+            extend_logprob_start_lens = self.extend_logprob_start_lens
+
+        if self.sampling_info:
+            if self.has_grammar:
+                self.sampling_info.grammars = [req.grammar for req in self.reqs]
+            else:
+                self.sampling_info.grammars = None
+
+        seq_lens_cpu = (
+            seq_lens_cpu_cache
+            if seq_lens_cpu_cache is not None
+            else self.seq_lens.cpu()
+        )
+
+        global bid
+        bid += 1
+        return ModelWorkerBatch(
+            bid=bid,
+            forward_mode=self.forward_mode,
+            input_ids=self.input_ids,
+            req_pool_indices=self.req_pool_indices,
+            seq_lens=self.seq_lens,
+            orig_seq_lens=self.orig_seq_lens,
+            out_cache_loc=self.out_cache_loc,
+            seq_lens_cpu=seq_lens_cpu,
+            seq_lens_sum=self.seq_lens_sum,
+            return_logprob=self.return_logprob,
+            top_logprobs_nums=self.top_logprobs_nums,
+            token_ids_logprobs=self.token_ids_logprobs,
+            global_num_tokens=self.global_num_tokens,
+            global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
+            is_extend_in_batch=self.is_extend_in_batch,
+            can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
+            tbo_split_seq_index=self.tbo_split_seq_index,
+            global_forward_mode=self.global_forward_mode,
+            extend_num_tokens=self.extend_num_tokens,
+            extend_seq_lens=extend_seq_lens,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_logprob_start_lens=extend_logprob_start_lens,
+            multimodal_inputs=self.multimodal_inputs,
+            encoder_cached=self.encoder_cached,
+            encoder_lens=self.encoder_lens,
+            encoder_lens_cpu=self.encoder_lens_cpu,
+            encoder_out_cache_loc=self.encoder_out_cache_loc,
+            lora_ids=[req.lora_id for req in self.reqs],
+            sampling_info=self.sampling_info,
+            input_embeds=self.input_embeds,
+            token_type_ids=self.token_type_ids,
+            spec_algorithm=self.spec_algorithm,
+            spec_info=self.spec_info,
+            hicache_consumer_index=self.hicache_consumer_index,
+            capture_hidden_mode=(
+                CaptureHiddenMode.FULL
+                if self.return_hidden_states
+                else (
+                    getattr(
+                        self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL
+                    )
+                    if self.spec_info
+                    else CaptureHiddenMode.NULL
+                )
+            ),
+            extend_input_logprob_token_ids=self.extend_input_logprob_token_ids,
+            launch_done=self.launch_done,
+        )
+
+    def copy(self):
+        # Only contain fields that will be used by process_batch_result
+        return ScheduleBatch(
+            reqs=self.reqs,
+            model_config=self.model_config,
+            forward_mode=self.forward_mode,
+            out_cache_loc=self.out_cache_loc,
+            return_logprob=self.return_logprob,
+            decoding_reqs=self.decoding_reqs,
+            spec_algorithm=self.spec_algorithm,
+            global_num_tokens=self.global_num_tokens,
+            global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
+            can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
+            is_extend_in_batch=self.is_extend_in_batch,
+            is_prefill_only=self.is_prefill_only,
+        )
+
+    def _evict_tree_cache_if_needed(self, num_tokens: int):
+        if isinstance(self.tree_cache, (SWAChunkCache, ChunkCache)):
+            return
+
+        if self.is_hybrid:
+            full_available_size = self.token_to_kv_pool_allocator.full_available_size()
+            swa_available_size = self.token_to_kv_pool_allocator.swa_available_size()
+
+            if full_available_size < num_tokens or swa_available_size < num_tokens:
+                if self.tree_cache is not None:
+                    full_num_tokens = max(0, num_tokens - full_available_size)
+                    swa_num_tokens = max(0, num_tokens - swa_available_size)
+                    self.tree_cache.evict(full_num_tokens, swa_num_tokens)
+        else:
+            if self.token_to_kv_pool_allocator.available_size() < num_tokens:
+                if self.tree_cache is not None:
+                    self.tree_cache.evict(num_tokens)
+
+    def _is_available_size_sufficient(self, num_tokens: int) -> bool:
+        if self.is_hybrid:
+            return (
+                self.token_to_kv_pool_allocator.full_available_size() >= num_tokens
+                and self.token_to_kv_pool_allocator.swa_available_size() >= num_tokens
+            )
+        else:
+            return self.token_to_kv_pool_allocator.available_size() >= num_tokens
+
+    def _available_and_evictable_str(self) -> str:
+        if self.is_hybrid:
+            full_available_size = self.token_to_kv_pool_allocator.full_available_size()
+            swa_available_size = self.token_to_kv_pool_allocator.swa_available_size()
+            full_evictable_size = self.tree_cache.full_evictable_size()
+            swa_evictable_size = self.tree_cache.swa_evictable_size()
+            return (
+                f"Available full tokens: {full_available_size + full_evictable_size} ({full_available_size=} + {full_evictable_size=})\n"
+                f"Available swa tokens: {swa_available_size + swa_evictable_size} ({swa_available_size=} + {swa_evictable_size=})\n"
+                f"Full LRU list evictable size: {self.tree_cache.full_lru_list_evictable_size()}\n"
+                f"SWA LRU list evictable size: {self.tree_cache.swa_lru_list_evictable_size()}\n"
+            )
+        else:
+            available_size = self.token_to_kv_pool_allocator.available_size()
+            evictable_size = self.tree_cache.evictable_size()
+            return f"Available tokens: {available_size + evictable_size} ({available_size=} + {evictable_size=})\n"
+
+    def __str__(self):
+        return (
+            f"ScheduleBatch(forward_mode={self.forward_mode.name if self.forward_mode else 'None'}, "
+            f"#req={(len(self.reqs))})"
+        )
+
+
+@dataclasses.dataclass
+class ModelWorkerBatch:
+    # The batch id
+    bid: int
+    # The forward mode
+    forward_mode: ForwardMode
+    # The input ids
+    input_ids: torch.Tensor
+    # The indices of requests in the req_to_token_pool
+    req_pool_indices: torch.Tensor
+    # The sequence length
+    seq_lens: torch.Tensor
+    # The indices of output tokens in the token_to_kv_pool_allocator
+    out_cache_loc: torch.Tensor
+    # The sequence length tensor on CPU
+    seq_lens_cpu: Optional[torch.Tensor]
+    seq_lens_sum: int
+
+    # For logprob
+    return_logprob: bool
+    top_logprobs_nums: Optional[List[int]]
+    token_ids_logprobs: Optional[List[List[int]]]
+
+    # For DP attention
+    global_num_tokens: Optional[List[int]]
+    global_num_tokens_for_logprob: Optional[List[int]]
+    is_extend_in_batch: bool
+    can_run_dp_cuda_graph: bool
+    tbo_split_seq_index: Optional[int]
+    global_forward_mode: Optional[ForwardMode]
+
+    # For extend
+    extend_num_tokens: Optional[int]
+    extend_seq_lens: Optional[List[int]]
+    extend_prefix_lens: Optional[List[int]]
+    extend_logprob_start_lens: Optional[List[int]]
+    extend_input_logprob_token_ids: Optional[torch.Tensor]
+
+    # For multimodal
+    multimodal_inputs: Optional[List[MultimodalInputs]]
+
+    # For encoder-decoder
+    encoder_cached: Optional[List[bool]]
+    encoder_lens: Optional[torch.Tensor]
+    encoder_lens_cpu: Optional[List[int]]
+    encoder_out_cache_loc: Optional[torch.Tensor]
+
+    # For LoRA
+    lora_ids: Optional[List[str]]
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo
+
+    # The original sequence lengths, Qwen-1M related
+    orig_seq_lens: Optional[torch.Tensor] = None
+
+    # The input Embeds
+    input_embeds: Optional[torch.Tensor] = None
+
+    # For corss-encoder model
+    token_type_ids: Optional[torch.Tensor] = None
+
+    # Speculative decoding
+    spec_algorithm: SpeculativeAlgorithm = None
+    spec_info: Optional[Union[EagleVerifyInput, EagleDraftInput]] = None
+    # If set, the output of the batch contains the hidden states of the run.
+    capture_hidden_mode: CaptureHiddenMode = None
+    hicache_consumer_index: int = -1
+
+    # Overlap event
+    launch_done: Optional[threading.Event] = None
+
+
+@triton.jit
+def write_req_to_token_pool_triton(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices,
+    pre_lens,
+    seq_lens,
+    extend_lens,
+    out_cache_loc,
+    req_to_token_ptr_stride: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(0)
+
+    req_pool_index = tl.load(req_pool_indices + pid)
+    pre_len = tl.load(pre_lens + pid)
+    seq_len = tl.load(seq_lens + pid)
+
+    # NOTE: This can be slow for large bs
+    cumsum_start = tl.cast(0, tl.int64)
+    for i in range(pid):
+        cumsum_start += tl.load(extend_lens + i)
+
+    num_loop = tl.cdiv(seq_len - pre_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = offset < (seq_len - pre_len)
+        value = tl.load(out_cache_loc + cumsum_start + offset, mask=mask)
+        tl.store(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + offset
+            + pre_len,
+            value,
+            mask=mask,
+        )
+
+
+def get_last_loc(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    if (
+        global_server_args_dict["attention_backend"] != "ascend"
+        and global_server_args_dict["attention_backend"] != "torch_native"
+    ):
+        impl = get_last_loc_triton
+    else:
+        impl = get_last_loc_torch
+
+    return impl(req_to_token, req_pool_indices_tensor, prefix_lens_tensor)
+
+
+def get_last_loc_torch(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    return torch.where(
+        prefix_lens_tensor > 0,
+        req_to_token[req_pool_indices_tensor, prefix_lens_tensor - 1],
+        torch.full_like(prefix_lens_tensor, -1),
+    )
+
+
+@triton.jit
+def get_last_loc_kernel(
+    req_to_token,
+    req_pool_indices_tensor,
+    prefix_lens_tensor,
+    result,
+    num_tokens,
+    req_to_token_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offset = tl.arange(0, BLOCK_SIZE) + pid * BLOCK_SIZE
+    mask = offset < num_tokens
+
+    prefix_lens = tl.load(prefix_lens_tensor + offset, mask=mask, other=0)
+    req_pool_indices = tl.load(req_pool_indices_tensor + offset, mask=mask, other=0)
+
+    token_mask = prefix_lens > 0
+    token_index = req_pool_indices * req_to_token_stride + (prefix_lens - 1)
+    tokens = tl.load(req_to_token + token_index, mask=token_mask, other=-1)
+
+    tl.store(result + offset, tokens, mask=mask)
+
+
+def get_last_loc_triton(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    BLOCK_SIZE = 256
+    num_tokens = prefix_lens_tensor.shape[0]
+    result = torch.empty_like(prefix_lens_tensor)
+    grid = (triton.cdiv(num_tokens, BLOCK_SIZE),)
+
+    get_last_loc_kernel[grid](
+        req_to_token,
+        req_pool_indices_tensor,
+        prefix_lens_tensor,
+        result,
+        num_tokens,
+        req_to_token.stride(0),
+        BLOCK_SIZE,
+    )
+    return result
diff --git a/python/sglang/srt/managers/schedule_policy.py b/python/sglang/srt/managers/schedule_policy.py
new file mode 100644
index 000000000..0a3723e0b
--- /dev/null
+++ b/python/sglang/srt/managers/schedule_policy.py
@@ -0,0 +1,570 @@
+from __future__ import annotations
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Request scheduler policy"""
+
+import os
+import random
+from collections import defaultdict
+from contextlib import contextmanager
+from enum import Enum, auto
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
+
+import torch
+
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+
+# Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
+# This can prevent the server from being too conservative.
+# Note that this only clips the estimation in the scheduler but does not change the stop
+# condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
+CLIP_MAX_NEW_TOKENS = int(
+    os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096")
+)
+
+# Threshold for in-batch prefix cache.
+# If a request has a matched prefix length (against existing cache) less than this value,
+# the scheduler runs the in-batch prefix caching check for this request.
+# If we set it to -1, it means we disable in-batch prefix caching.
+IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD = int(
+    os.environ.get("IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD", "32")
+)
+
+# Threshold for in-batch prefix cache.
+# If a request has a matched prefix length (within the waiting queue) larger than this value,
+# the scheduler deprioritizes this request
+IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD = int(
+    os.environ.get("IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD", "32")
+)
+
+
+IGNORE_EOS_RESERVE_TOKENS = 1
+
+
+class CacheAwarePolicy(Enum):
+    """Scheduling policies that are aware of the tree cache."""
+
+    LPM = "lpm"  # longest prefix match
+    DFS_WEIGHT = "dfs-weight"  # depth-first search weighting
+
+
+class CacheAgnosticPolicy(Enum):
+    """Scheduling policies that are not aware of the tree cache."""
+
+    FCFS = "fcfs"  # first come first serve
+    LOF = "lof"  # longest output first
+    RANDOM = "random"
+
+
+class SchedulePolicy:
+    Policy = Union[CacheAwarePolicy, CacheAgnosticPolicy]
+
+    def __init__(
+        self,
+        policy: str,
+        tree_cache: BasePrefixCache,
+        enable_hierarchical_cache: bool,
+    ):
+        self.policy = self._validate_and_adjust_policy(policy, tree_cache)
+        self.tree_cache = tree_cache
+        self.enable_hierarchical_cache = enable_hierarchical_cache
+
+        # It is used to find the matching prefix for in-batch prefix caching.
+        self.waiting_queue_radix_tree = RadixCache(
+            req_to_token_pool=None,
+            token_to_kv_pool_allocator=None,
+            page_size=1,
+            disable=False,
+        )
+
+    def calc_priority(self, waiting_queue: List[Req]) -> bool:
+        if self.policy == CacheAgnosticPolicy.FCFS:
+            # A shortcut for FCFS
+            return False
+
+        policy = self._determine_active_policy(waiting_queue)
+
+        prefix_computed = False
+        if isinstance(policy, CacheAwarePolicy):
+            prefix_computed = True
+            temporary_deprioritized = self._compute_prefix_matches(
+                waiting_queue, policy
+            )
+            if policy == CacheAwarePolicy.LPM:
+                SchedulePolicy._sort_by_longest_prefix(
+                    waiting_queue, temporary_deprioritized
+                )
+            elif policy == CacheAwarePolicy.DFS_WEIGHT:
+                SchedulePolicy._sort_by_dfs_weight(waiting_queue, self.tree_cache)
+            else:
+                raise ValueError(f"Unknown CacheAware Policy: {policy=}")
+        else:
+            if policy == CacheAgnosticPolicy.FCFS:
+                pass
+            elif policy == CacheAgnosticPolicy.LOF:
+                SchedulePolicy._sort_by_longest_output(waiting_queue)
+            elif policy == CacheAgnosticPolicy.RANDOM:
+                SchedulePolicy._sort_randomly(waiting_queue)
+            else:
+                raise ValueError(f"Unknown CacheAgnostic Policy: {policy=}")
+
+        return prefix_computed
+
+    def _determine_active_policy(self, waiting_queue: List[Req]) -> Policy:
+        if self.policy == CacheAwarePolicy.LPM and len(waiting_queue) > 128:
+            # Turn off the expensive prefix matching and sorting when the #queue is large.
+            return CacheAgnosticPolicy.FCFS
+        return self.policy
+
+    def _validate_and_adjust_policy(
+        self, policy: str, tree_cache: BasePrefixCache
+    ) -> Policy:
+        """
+        Validates the policy and adjusts it if necessary based on tree cache settings.
+        """
+        try:
+            policy_enum = CacheAwarePolicy(policy)
+            if getattr(tree_cache, "disable", True):
+                # If tree_cache is disabled, using CacheAgnosticPolicy policy
+                return CacheAgnosticPolicy.FCFS
+            return policy_enum
+        except ValueError:
+            try:
+                return CacheAgnosticPolicy(policy)
+            except ValueError:
+                raise ValueError(f"Unknown schedule_policy: {policy=}")
+
+    def _compute_prefix_matches(
+        self, waiting_queue: List[Req], policy: CacheAwarePolicy
+    ) -> Set[int]:
+        """
+        Computes and caches the matching prefixes for requests in the waiting queue,
+            and handles in-batch prefix caching logic.
+        """
+        temporary_deprioritized: Set[int] = set()
+        self.waiting_queue_radix_tree.reset()
+
+        for r in waiting_queue:
+            prefix_ids = r.adjust_max_prefix_ids()
+
+            # NOTE: the prefix_indices must always be aligned with last_node
+            r.prefix_indices, r.last_node, r.last_host_node, r.host_hit_length = (
+                self.tree_cache.match_prefix(rid=r.rid, key=prefix_ids)
+            )
+
+            # NOTE(sang): This logic is for in-batch prefix caching;
+            # If there are more than 1 request that have small matching prefix from
+            # existing cache, but all those requests share the same prefix, we prefer
+            # to schedule only one of them so that we can increase the cache hit rate.
+            # We prefer to set IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD > 0 because too small
+            # threshold means we cannot use in-batch prefix caching for short prefixes.
+            # It is kind of common when the engine is long running (e.g., imagine the prefix "the").
+            if len(r.prefix_indices) <= IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD:
+                in_batch_matching_prefixes, _, _, _ = (
+                    self.waiting_queue_radix_tree.match_prefix(
+                        rid=r.rid, key=prefix_ids
+                    )
+                )
+                if (
+                    len(in_batch_matching_prefixes)
+                    >= IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD
+                ):
+                    temporary_deprioritized.add(r.rid)
+                else:
+                    # Insert with a dummy key
+                    self.waiting_queue_radix_tree.insert(
+                        prefix_ids, torch.empty(len(prefix_ids), dtype=torch.bool)
+                    )
+        return temporary_deprioritized
+
+    @staticmethod
+    def _sort_by_longest_prefix(
+        waiting_queue: List[Req], temporary_deprioritized: Set[int]
+    ) -> None:
+        """Sorts the waiting queue based on the longest prefix match."""
+        waiting_queue.sort(
+            key=lambda r: (
+                -len(r.prefix_indices)
+                if r.rid not in temporary_deprioritized
+                else float("inf")
+            )
+        )
+
+    @staticmethod
+    def _sort_by_dfs_weight(
+        waiting_queue: List[Req], tree_cache: BasePrefixCache
+    ) -> None:
+        """Sorts the waiting queue based on a depth-first search weighting."""
+        last_node_to_reqs = defaultdict(list)
+        for req in waiting_queue:
+            last_node_to_reqs[req.last_node].append(req)
+
+        node_to_weight = defaultdict(int)
+        for node in last_node_to_reqs:
+            node_to_weight[node] = len(last_node_to_reqs[node])
+        SchedulePolicy._calc_weight(tree_cache.root_node, node_to_weight)
+
+        waiting_queue.clear()
+        SchedulePolicy._get_dfs_priority(
+            tree_cache.root_node,
+            node_to_weight,
+            last_node_to_reqs,
+            waiting_queue,
+        )
+
+    @staticmethod
+    def _sort_by_longest_output(waiting_queue: List[Req]) -> None:
+        """Sorts the waiting queue based on the longest output (max_new_tokens)."""
+        waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
+
+    @staticmethod
+    def _sort_randomly(waiting_queue: List[Req]) -> None:
+        """Shuffles the waiting queue randomly."""
+        random.shuffle(waiting_queue)
+
+    @staticmethod
+    def _calc_weight(cur_node: TreeNode, node_to_weight: Dict[TreeNode, int]) -> None:
+        for child in cur_node.children.values():
+            SchedulePolicy._calc_weight(child, node_to_weight)
+            node_to_weight[cur_node] += node_to_weight[child]
+
+    @staticmethod
+    def _get_dfs_priority(
+        cur_node: TreeNode,
+        node_to_priority: Dict[TreeNode, int],
+        last_node_to_reqs: Dict[TreeNode, List[Req]],
+        q: List,
+    ) -> None:
+        childs = [child for child in cur_node.children.values()]
+        childs.sort(key=lambda x: -node_to_priority[x])
+        for child in childs:
+            SchedulePolicy._get_dfs_priority(
+                child, node_to_priority, last_node_to_reqs, q
+            )
+        q.extend(last_node_to_reqs[cur_node])
+
+
+class AddReqResult(Enum):
+    CONTINUE = auto()  # Continue to add requests
+    NO_TOKEN = auto()  # No token left
+    OTHER = auto()  # Other reasons to stop adding requests
+
+
+class PrefillAdder:
+    def __init__(
+        self,
+        page_size: int,
+        tree_cache: BasePrefixCache,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        running_batch: ScheduleBatch,
+        new_token_ratio: float,
+        rem_input_tokens: int,
+        rem_chunk_tokens: Optional[int],
+        mixed_with_decode_tokens: int = 0,
+    ):
+        self.page_size = page_size
+        self.tree_cache = tree_cache
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.running_batch = running_batch
+        self.new_token_ratio = new_token_ratio
+        self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
+        self.rem_chunk_tokens = rem_chunk_tokens
+        if self.rem_chunk_tokens is not None:
+            self.rem_chunk_tokens -= mixed_with_decode_tokens
+
+        self.rem_total_token_offset = mixed_with_decode_tokens
+        self.cur_rem_token_offset = mixed_with_decode_tokens
+
+        self.req_states = None
+        self.can_run_list = []
+        self.new_chunked_req = None
+        self.log_hit_tokens = 0
+        # TODO(lsyin): report the real input tokens excluding page alignment
+        self.log_input_tokens = 0
+
+        if running_batch is not None:
+            self.rem_total_token_offset += sum(
+                [
+                    min(
+                        (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                        CLIP_MAX_NEW_TOKENS,
+                    )
+                    * self.new_token_ratio
+                    for r in running_batch.reqs
+                ]
+            )
+
+        self.is_hybrid = isinstance(
+            self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+        )
+
+    @property
+    def rem_total_tokens(self):
+        if self.is_hybrid:
+            available_and_evictable = min(
+                self.token_to_kv_pool_allocator.full_available_size()
+                + self.tree_cache.full_evictable_size(),
+                self.token_to_kv_pool_allocator.swa_available_size()
+                + self.tree_cache.swa_evictable_size(),
+            )
+        else:
+            available_and_evictable = (
+                self.token_to_kv_pool_allocator.available_size()
+                + self.tree_cache.evictable_size()
+            )
+
+        return available_and_evictable - self.rem_total_token_offset
+
+    @property
+    def cur_rem_tokens(self):
+        if self.is_hybrid:
+            available_and_evictable = min(
+                self.token_to_kv_pool_allocator.full_available_size()
+                + self.tree_cache.full_evictable_size(),
+                self.token_to_kv_pool_allocator.swa_available_size()
+                + self.tree_cache.swa_evictable_size(),
+            )
+        else:
+            available_and_evictable = (
+                self.token_to_kv_pool_allocator.available_size()
+                + self.tree_cache.evictable_size()
+            )
+
+        return available_and_evictable - self.cur_rem_token_offset
+
+    def ceil_paged_tokens(self, tokens: int) -> int:
+        return -(-tokens // self.page_size) * self.page_size
+
+    def budget_state(self):
+        if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
+            return AddReqResult.NO_TOKEN
+
+        if self.rem_input_tokens <= 0 or (
+            self.rem_chunk_tokens is not None and self.rem_chunk_tokens <= 0
+        ):
+            return AddReqResult.OTHER
+
+        return AddReqResult.CONTINUE
+
+    def _update_prefill_budget(
+        self, prefix_len: int, extend_input_len: int, max_new_tokens: int
+    ):
+        # TODO(lsyin): check this workaround logic, which only ensures the prefill will not out of memory, and may be too conservative
+        extend_input_len = self.ceil_paged_tokens(extend_input_len)
+
+        self.rem_total_token_offset += extend_input_len + max_new_tokens
+        self.cur_rem_token_offset += extend_input_len
+        self.rem_input_tokens -= extend_input_len
+        if self.rem_chunk_tokens is not None:
+            self.rem_chunk_tokens -= extend_input_len
+
+        self.log_hit_tokens += prefix_len
+        self.log_input_tokens += extend_input_len
+
+    def add_chunked_req(self, req: Req):
+        _rem_tokens = min(self.rem_chunk_tokens, int(self.rem_total_tokens))
+        truncated = req.extend_input_len > _rem_tokens
+        req.extend_input_len = min(req.extend_input_len, _rem_tokens)
+        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
+        self.can_run_list.append(req)
+        self._update_prefill_budget(
+            0,
+            req.extend_input_len,
+            (
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
+                if not truncated
+                else 0
+            ),
+        )
+
+        # Return if chunked prefill not finished
+        return req if truncated else None
+
+    @contextmanager
+    def _lock_node(self, last_node: TreeNode):
+        if self.is_hybrid:
+            try:
+                swa_uuid_for_lock = self.tree_cache.inc_lock_ref(last_node)
+                yield None
+            finally:
+                self.tree_cache.dec_lock_ref(last_node, swa_uuid_for_lock)
+        else:
+            try:
+                self.tree_cache.inc_lock_ref(last_node)
+                yield None
+            finally:
+                self.tree_cache.dec_lock_ref(last_node)
+
+    def add_one_req_ignore_eos(self, req: Req, has_chunked_req: bool):
+        # Early exit if no enough tokens for the input tokens
+        if self.ceil_paged_tokens(req.extend_input_len) > min(
+            self.cur_rem_tokens, self.rem_total_tokens
+        ):
+            return AddReqResult.NO_TOKEN
+
+        def add_req_state(r, insert_sort=False):
+            new_token_ratio = (
+                1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
+            )
+            tokens_left = r.sampling_params.max_new_tokens * new_token_ratio - len(
+                r.output_ids
+            )
+            tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)
+
+            if tokens_left <= 0:
+                return
+
+            if not insert_sort:
+                self.req_states.append((tokens_left, tokens_occupied))
+            else:
+                i = 0
+                for i in range(len(self.req_states)):
+                    if tokens_left <= self.req_states[i][0]:
+                        break
+                self.req_states.insert(i, (tokens_left, tokens_occupied))
+
+        if self.req_states is None:
+            self.req_states = []
+            add_req_state(req)
+            if self.running_batch is not None:
+                for r in self.running_batch.reqs:
+                    add_req_state(r)
+            for r in self.can_run_list:
+                add_req_state(r)
+            self.req_states.sort(key=lambda x: x[0])
+        else:
+            add_req_state(req, insert_sort=True)
+
+        if not self.is_hybrid:
+            # Skip this logic for swa. The SWA has different memory management, and
+            # this mechanism is underestimating the memory usage.
+            cur_rem_tokens = self.cur_rem_tokens - self.ceil_paged_tokens(
+                req.extend_input_len
+            )
+            tokens_freed = 0
+            for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
+                # tokens_left gives a reservative calculation as the last token is not stored
+                bs = len(self.req_states) - i
+                min_free_tokens = cur_rem_tokens + tokens_freed - tokens_left * bs
+                # reserve tokens for corner cases
+                if min_free_tokens <= IGNORE_EOS_RESERVE_TOKENS * bs:
+                    return AddReqResult.NO_TOKEN
+                tokens_freed += tokens_occupied
+
+        if (
+            self.rem_chunk_tokens is None  # chunked prefill is disabled
+            or req.extend_input_len <= self.rem_chunk_tokens  # it is the last chunk
+        ):
+            # Non-chunked prefill
+            self.can_run_list.append(req)
+            self._update_prefill_budget(
+                0,
+                req.extend_input_len,
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
+            )
+        else:
+            if self.rem_chunk_tokens == 0:
+                return AddReqResult.OTHER
+
+            # Chunked prefill
+            trunc_len = self.rem_chunk_tokens
+
+            req.extend_input_len = trunc_len
+            req.fill_ids = req.fill_ids[:trunc_len]
+            self.can_run_list.append(req)
+            self.new_chunked_req = req
+            self._update_prefill_budget(0, trunc_len, 0)
+
+        return self.budget_state()
+
+    def add_one_req(self, req: Req, has_chunked_req: bool):
+        if req.sampling_params.ignore_eos and getattr(self.tree_cache, "disable", True):
+            return self.add_one_req_ignore_eos(req, has_chunked_req)
+
+        total_tokens = req.extend_input_len + min(
+            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS
+        )
+
+        # adjusting the input_tokens based on host_hit_length and page_size
+        real_input_tokens = req.extend_input_len - req.host_hit_length
+        real_input_tokens = self.ceil_paged_tokens(real_input_tokens)
+        prefix_len = len(req.prefix_indices)
+
+        if total_tokens >= self.rem_total_tokens:
+            return AddReqResult.NO_TOKEN
+
+        if real_input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
+            return AddReqResult.OTHER
+
+        with self._lock_node(req.last_node):
+            # self.rem_total_tokens may decrease after the lock acquisition
+            if total_tokens >= self.rem_total_tokens:
+                return AddReqResult.NO_TOKEN
+
+            if req.host_hit_length > 0:
+                new_indices, req.last_node = self.tree_cache.init_load_back(
+                    req.last_host_node, req.host_hit_length
+                )
+                req.prefix_indices = torch.cat([req.prefix_indices, new_indices])
+                req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+                prefix_len = len(req.prefix_indices)
+
+            input_tokens = self.ceil_paged_tokens(req.extend_input_len)
+
+            if input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
+                return AddReqResult.OTHER
+
+            if self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens:
+                # Non-chunked prefill
+                self.can_run_list.append(req)
+                if self.is_hybrid:
+                    swa_uuid_for_lock = self.tree_cache.inc_lock_ref(req.last_node)
+                    req.swa_uuid_for_lock = swa_uuid_for_lock
+                else:
+                    self.tree_cache.inc_lock_ref(req.last_node)
+                self._update_prefill_budget(
+                    prefix_len,
+                    input_tokens,
+                    min(
+                        req.sampling_params.max_new_tokens,
+                        CLIP_MAX_NEW_TOKENS,
+                    ),
+                )
+            else:
+                # Make sure at least one page is available
+                trunc_len = self.rem_chunk_tokens // self.page_size * self.page_size
+                if trunc_len <= 0:
+                    return AddReqResult.OTHER
+
+                # Chunked prefill
+                req.extend_input_len = trunc_len
+                req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
+
+                self.can_run_list.append(req)
+                self.new_chunked_req = req
+                if self.is_hybrid:
+                    swa_uuid_for_lock = self.tree_cache.inc_lock_ref(req.last_node)
+                    req.swa_uuid_for_lock = swa_uuid_for_lock
+                else:
+                    self.tree_cache.inc_lock_ref(req.last_node)
+                self._update_prefill_budget(prefix_len, trunc_len, 0)
+
+        return self.budget_state()
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
new file mode 100644
index 000000000..5b80afcc1
--- /dev/null
+++ b/python/sglang/srt/managers/scheduler.py
@@ -0,0 +1,2631 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A scheduler that manages a tensor parallel GPU worker."""
+
+import faulthandler
+import logging
+import os
+import signal
+import sys
+import threading
+import time
+from collections import deque
+from concurrent import futures
+from dataclasses import dataclass
+from http import HTTPStatus
+from types import SimpleNamespace
+from typing import Dict, List, Optional, Tuple, Union
+
+import psutil
+import setproctitle
+import torch
+import zmq
+from torch.distributed import barrier
+
+from sglang.global_config import global_config
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    create_grammar_backend,
+)
+from sglang.srt.disaggregation.decode import (
+    DecodePreallocQueue,
+    DecodeTransferQueue,
+    SchedulerDisaggregationDecodeMixin,
+)
+from sglang.srt.disaggregation.prefill import (
+    PrefillBootstrapQueue,
+    SchedulerDisaggregationPrefillMixin,
+)
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    MetadataBuffers,
+    ReqToMetadataIdxAllocator,
+    TransferBackend,
+    prepare_abort,
+)
+from sglang.srt.distributed import get_pp_group, get_world_group
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.hf_transformers_utils import (
+    get_processor,
+    get_tokenizer,
+    get_tokenizer_from_processor,
+)
+from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe import initialize_moe_config
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    BatchTokenizedEmbeddingReqInput,
+    BatchTokenizedGenerateReqInput,
+    ClearHiCacheReqInput,
+    ClearHiCacheReqOutput,
+    CloseSessionReqInput,
+    ExpertDistributionReq,
+    ExpertDistributionReqOutput,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
+    FreezeGCReq,
+    GetInternalStateReq,
+    GetInternalStateReqOutput,
+    GetWeightsByNameReqInput,
+    HealthCheckOutput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
+    LoadLoRAAdapterReqOutput,
+    MultiTokenizerRegisterReq,
+    MultiTokenizerWrapper,
+    OpenSessionReqInput,
+    OpenSessionReqOutput,
+    ProfileReq,
+    ReleaseMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqInput,
+    RpcReqInput,
+    RpcReqOutput,
+    SetInternalStateReq,
+    SetInternalStateReqOutput,
+    SlowDownReqInput,
+    SlowDownReqOutput,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    UnloadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
+)
+from sglang.srt.managers.mm_utils import init_embedding_cache
+from sglang.srt.managers.schedule_batch import (
+    FINISH_ABORT,
+    MultimodalInputs,
+    Req,
+    ScheduleBatch,
+    global_server_args_dict,
+)
+from sglang.srt.managers.schedule_policy import (
+    AddReqResult,
+    PrefillAdder,
+    SchedulePolicy,
+)
+from sglang.srt.managers.scheduler_input_blocker import SchedulerInputBlocker
+from sglang.srt.managers.scheduler_metrics_mixin import (
+    RECORD_STEP_TIME,
+    SchedulerMetricsMixin,
+)
+from sglang.srt.managers.scheduler_output_processor_mixin import (
+    SchedulerOutputProcessorMixin,
+)
+from sglang.srt.managers.scheduler_profiler_mixin import SchedulerProfilerMixin
+from sglang.srt.managers.scheduler_recv_skipper import SchedulerRecvSkipper
+from sglang.srt.managers.scheduler_update_weights_mixin import (
+    SchedulerUpdateWeightsMixin,
+)
+from sglang.srt.managers.session_controller import Session
+from sglang.srt.managers.tp_worker import TpModelWorker
+from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient
+from sglang.srt.managers.utils import DPBalanceMeta, validate_input_length
+from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
+from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
+from sglang.srt.mem_cache.lora_radix_cache import LoRARadixCache
+from sglang.srt.mem_cache.radix_cache import RadixCache
+from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
+from sglang.srt.utils import (
+    DynamicGradMode,
+    broadcast_pyobj,
+    configure_gc_logger,
+    configure_logger,
+    disable_request_logging,
+    freeze_gc,
+    get_available_gpu_memory,
+    get_bool_env_var,
+    get_zmq_socket,
+    is_cpu,
+    kill_itself_when_parent_died,
+    numa_bind_to_node,
+    point_to_point_pyobj,
+    pyspy_dump_schedulers,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+    set_gpu_proc_affinity,
+    set_random_seed,
+    suppress_other_loggers,
+)
+from sglang.utils import TypeBasedDispatcher, get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+# Test retract decode for debugging purposes
+TEST_RETRACT = get_bool_env_var("SGLANG_TEST_RETRACT")
+GRAMMAR_TIMEOUT = float(os.environ.get("SGLANG_GRAMMAR_TIMEOUT", 300))
+
+_is_cpu = is_cpu()
+
+
+@dataclass
+class GenerationBatchResult:
+    logits_output: Optional[LogitsProcessorOutput]
+    pp_hidden_states_proxy_tensors: Optional[torch.Tensor]
+    next_token_ids: Optional[List[int]]
+    extend_input_len_per_req: List[int]
+    extend_logprob_start_len_per_req: List[int]
+    bid: int
+    can_run_cuda_graph: bool
+
+
+@dataclass
+class EmbeddingBatchResult:
+    embeddings: torch.Tensor
+    bid: int
+
+
+class Scheduler(
+    SchedulerOutputProcessorMixin,
+    SchedulerUpdateWeightsMixin,
+    SchedulerProfilerMixin,
+    SchedulerMetricsMixin,
+    SchedulerDisaggregationDecodeMixin,
+    SchedulerDisaggregationPrefillMixin,
+):
+    """A scheduler that manages a tensor parallel GPU worker."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        gpu_id: int,
+        tp_rank: int,
+        moe_ep_rank: int,
+        pp_rank: int,
+        dp_rank: Optional[int],
+        dp_balance_meta: Optional[DPBalanceMeta] = None,
+    ):
+        # Parse args
+        self.server_args = server_args
+        self.tp_rank = tp_rank
+        self.moe_ep_rank = moe_ep_rank
+        self.pp_rank = pp_rank
+        self.dp_rank = dp_rank
+        self.tp_size = server_args.tp_size
+        self.moe_ep_size = server_args.ep_size
+        self.pp_size = server_args.pp_size
+        self.dp_size = server_args.dp_size
+        self.schedule_policy = server_args.schedule_policy
+        self.enable_lora = server_args.enable_lora
+        self.max_loras_per_batch = server_args.max_loras_per_batch
+        self.enable_overlap = not server_args.disable_overlap_schedule
+        self.skip_tokenizer_init = server_args.skip_tokenizer_init
+        self.enable_metrics = server_args.enable_metrics
+        self.enable_metrics_for_all_schedulers = (
+            server_args.enable_metrics_for_all_schedulers
+        )
+        self.enable_kv_cache_events = server_args.kv_events_config is not None
+        self.stream_interval = server_args.stream_interval
+        self.spec_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
+        self.gpu_id = gpu_id
+        self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
+        self.enable_hicache_storage = server_args.hicache_storage_backend is not None
+        self.page_size = server_args.page_size
+
+        self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
+            compute_dp_attention_world_info(
+                server_args.enable_dp_attention,
+                self.tp_rank,
+                self.tp_size,
+                self.dp_size,
+            )
+        )
+
+        # Init model config
+        self.model_config = ModelConfig.from_server_args(server_args)
+
+        # Init inter-process communication
+        context = zmq.Context(2)
+        self.idle_sleeper = None
+        if self.pp_rank == 0 and self.attn_tp_rank == 0:
+            self.recv_from_tokenizer = get_zmq_socket(
+                context, zmq.PULL, port_args.scheduler_input_ipc_name, False
+            )
+            self.recv_from_rpc = get_zmq_socket(
+                context, zmq.DEALER, port_args.rpc_ipc_name, False
+            )
+
+            self.send_to_tokenizer = get_zmq_socket(
+                context, zmq.PUSH, port_args.tokenizer_ipc_name, False
+            )
+            if server_args.skip_tokenizer_init:
+                # Directly send to the TokenizerManager
+                self.send_to_detokenizer = get_zmq_socket(
+                    context, zmq.PUSH, port_args.tokenizer_ipc_name, False
+                )
+            else:
+                # Send to the DetokenizerManager
+                self.send_to_detokenizer = get_zmq_socket(
+                    context, zmq.PUSH, port_args.detokenizer_ipc_name, False
+                )
+
+            if self.server_args.sleep_on_idle:
+                self.idle_sleeper = IdleSleeper(
+                    [
+                        self.recv_from_tokenizer,
+                        self.recv_from_rpc,
+                    ]
+                )
+        else:
+            self.recv_from_tokenizer = None
+            self.recv_from_rpc = None
+            self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda x: None)
+            self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda x: None)
+
+        if self.current_scheduler_metrics_enabled():
+            self.send_metrics_from_scheduler = get_zmq_socket(
+                context, zmq.PUSH, port_args.metrics_ipc_name, False
+            )
+
+        # Init tokenizer
+        self.init_tokenizer()
+
+        # Init moe config
+        self.init_moe_config()
+
+        # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
+        if self.server_args.reasoning_parser and self.tokenizer:
+            reasoning_parser = ReasoningParser(
+                model_type=self.server_args.reasoning_parser, stream_reasoning=False
+            )
+            self.tokenizer.think_end_id = self.tokenizer.encode(
+                reasoning_parser.detector.think_end_token, add_special_tokens=False
+            )[0]
+
+        # Check whether overlap can be enabled
+        if not self.is_generation:
+            self.enable_overlap = False
+            logger.info("Overlap scheduler is disabled for embedding models.")
+
+        # Launch a tensor parallel worker
+        if self.enable_overlap:
+            TpWorkerClass = TpModelWorkerClient
+        else:
+            TpWorkerClass = TpModelWorker
+
+        self.tp_worker = TpWorkerClass(
+            server_args=server_args,
+            gpu_id=gpu_id,
+            tp_rank=tp_rank,
+            moe_ep_rank=moe_ep_rank,
+            pp_rank=pp_rank,
+            dp_rank=dp_rank,
+            nccl_port=port_args.nccl_port,
+        )
+
+        # Launch a draft worker for speculative decoding
+        if self.spec_algorithm.is_eagle():
+            from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+            self.draft_worker = EAGLEWorker(
+                gpu_id=gpu_id,
+                tp_rank=tp_rank,
+                moe_ep_rank=moe_ep_rank,
+                server_args=server_args,
+                nccl_port=port_args.nccl_port,
+                target_worker=self.tp_worker,
+                dp_rank=dp_rank,
+            )
+        elif self.spec_algorithm.is_standalone():
+            from sglang.srt.speculative.standalone_worker import StandaloneWorker
+
+            self.draft_worker = StandaloneWorker(
+                gpu_id=gpu_id,
+                tp_rank=tp_rank,
+                moe_ep_rank=moe_ep_rank,
+                server_args=server_args,
+                nccl_port=port_args.nccl_port,
+                target_worker=self.tp_worker,
+                dp_rank=dp_rank,
+            )
+        else:
+            self.draft_worker = None
+
+        # Get token and memory info from the model worker
+        (
+            self.max_total_num_tokens,
+            self.max_prefill_tokens,
+            self.max_running_requests,
+            self.max_queued_requests,
+            self.max_req_len,
+            self.max_req_input_len,
+            self.random_seed,
+            self.device,
+            worker_global_server_args_dict,
+            _,
+            _,
+            _,
+        ) = self.tp_worker.get_worker_info()
+        if global_server_args_dict["max_micro_batch_size"] is None:
+            global_server_args_dict["max_micro_batch_size"] = max(
+                self.max_running_requests // server_args.pp_size, 1
+            )
+
+        self.tp_group = self.tp_worker.get_tp_group()
+        self.tp_cpu_group = self.tp_group.cpu_group
+        self.attn_tp_group = self.tp_worker.get_attention_tp_group()
+        self.attn_tp_cpu_group = self.tp_worker.get_attention_tp_cpu_group()
+        self.pp_group = get_pp_group()
+        self.world_group = get_world_group()
+
+        self.pad_input_ids_func = self.tp_worker.get_pad_input_ids_func()
+        global_server_args_dict.update(worker_global_server_args_dict)
+        set_random_seed(self.random_seed)
+
+        # Hybrid memory pool
+        self.is_hybrid = self.tp_worker.is_hybrid
+        if self.is_hybrid:
+            self.sliding_window_size = self.tp_worker.sliding_window_size
+            self.full_tokens_per_layer, self.swa_tokens_per_layer = (
+                self.tp_worker.get_tokens_per_layer_info()
+            )
+
+        # Print debug info
+        if tp_rank == 0:
+            avail_mem = get_available_gpu_memory(
+                self.device, self.gpu_id, empty_cache=False
+            )
+            logger.info(
+                f"max_total_num_tokens={self.max_total_num_tokens}, "
+                f"chunked_prefill_size={server_args.chunked_prefill_size}, "
+                f"max_prefill_tokens={self.max_prefill_tokens}, "
+                f"max_running_requests={self.max_running_requests}, "
+                f"context_len={self.model_config.context_len}, "
+                f"{'available_cpu_mem' if self.device == 'cpu' else 'available_gpu_mem'}={avail_mem:.2f} GB"
+            )
+
+        # Init memory pool and cache
+        self.init_memory_pool_and_cache()
+
+        # Init running status
+        self.waiting_queue: List[Req] = []
+        # The running decoding batch for continuous batching
+        self.running_batch: ScheduleBatch = ScheduleBatch(reqs=[], batch_is_full=False)
+        # The current forward batch
+        self.cur_batch: Optional[ScheduleBatch] = None
+        # The last forward batch
+        self.last_batch: Optional[ScheduleBatch] = None
+        self.forward_ct = 0
+        self.forward_ct_decode = 0
+        self.num_generated_tokens = 0
+        self.last_prefill_tokens = 0
+        self.last_decode_stats_tic = time.perf_counter()
+        self.last_prefill_stats_tic = time.perf_counter()
+        self.return_health_check_ct = 0
+        self.num_retracted_reqs: int = 0
+        self.num_paused_reqs: int = 0
+        self.kv_transfer_speed_gb_s: float = 0.0
+        self.kv_transfer_latency_ms: float = 0.0
+        self.sessions: Dict[str, Session] = {}
+        self.current_stream = torch.get_device_module(self.device).current_stream()
+        if self.device == "cpu":
+            self.current_stream.synchronize = lambda: None  # No-op for CPU
+        self.forward_sleep_time = None
+
+        # Init chunked prefill
+        self.chunked_prefill_size = server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
+        self.chunked_req = None
+        self.is_mixed_chunk = (
+            self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
+        )
+
+        # Init the grammar backend for constrained generation
+        self.grammar_queue: List[Req] = []
+        if not server_args.skip_tokenizer_init:
+            self.grammar_backend = create_grammar_backend(
+                server_args,
+                self.tokenizer,
+                self.model_config.vocab_size,
+                self.model_config.hf_eos_token_id,
+            )
+        else:
+            self.grammar_backend = None
+
+        # Init schedule policy and new token estimation
+        self.policy = SchedulePolicy(
+            self.schedule_policy,
+            self.tree_cache,
+            self.enable_hierarchical_cache,
+        )
+        assert (
+            server_args.schedule_conservativeness >= 0
+        ), "Invalid schedule_conservativeness"
+        self.init_new_token_ratio = min(
+            global_config.default_init_new_token_ratio
+            * server_args.schedule_conservativeness,
+            1.0,
+        )
+        self.min_new_token_ratio = min(
+            self.init_new_token_ratio
+            * global_config.default_min_new_token_ratio_factor,
+            1.0,
+        )
+        self.new_token_ratio_decay = (
+            self.init_new_token_ratio - self.min_new_token_ratio
+        ) / global_config.default_new_token_ratio_decay_steps
+        self.new_token_ratio = self.init_new_token_ratio
+
+        # Init watchdog thread
+        self.watchdog_timeout = server_args.watchdog_timeout
+        t = threading.Thread(target=self.watchdog_thread, daemon=True)
+        t.start()
+        self.parent_process = psutil.Process().parent()
+
+        # Init memory saver, profiler and metric stats
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+        self.offload_tags = set()
+        self.init_profiler()
+
+        self.recv_skipper = SchedulerRecvSkipper.maybe_create(server_args)
+        self.input_blocker = (
+            SchedulerInputBlocker(noop=self.attn_tp_rank != 0)
+            if get_bool_env_var("SGLANG_ENABLE_COLOCATED_BATCH_GEN")
+            else None
+        )
+
+        # Init metrics stats
+        self.init_metrics(tp_rank, pp_rank, dp_rank)
+        self.init_kv_events(server_args.kv_events_config)
+        self.init_dp_balance(dp_balance_meta)
+
+        # Init disaggregation
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
+        self.init_disaggregation()
+
+        if get_bool_env_var("SGLANG_GC_LOG"):
+            configure_gc_logger()
+
+        # Init request dispatcher
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (TokenizedGenerateReqInput, self.handle_generate_request),
+                (TokenizedEmbeddingReqInput, self.handle_embedding_request),
+                (BatchTokenizedGenerateReqInput, self.handle_batch_generate_request),
+                (BatchTokenizedEmbeddingReqInput, self.handle_batch_embedding_request),
+                (FlushCacheReqInput, self.flush_cache_wrapped),
+                (ClearHiCacheReqInput, self.clear_hicache_storage_wrapped),
+                (AbortReq, self.abort_request),
+                (OpenSessionReqInput, self.open_session),
+                (CloseSessionReqInput, self.close_session),
+                (UpdateWeightFromDiskReqInput, self.update_weights_from_disk),
+                (InitWeightsUpdateGroupReqInput, self.init_weights_update_group),
+                (
+                    UpdateWeightsFromDistributedReqInput,
+                    self.update_weights_from_distributed,
+                ),
+                (UpdateWeightsFromTensorReqInput, self.update_weights_from_tensor),
+                (GetWeightsByNameReqInput, self.get_weights_by_name),
+                (ReleaseMemoryOccupationReqInput, self.release_memory_occupation),
+                (ResumeMemoryOccupationReqInput, self.resume_memory_occupation),
+                (SlowDownReqInput, self.slow_down),
+                (ProfileReq, self.profile),
+                (FreezeGCReq, self.handle_freeze_gc),
+                (GetInternalStateReq, self.get_internal_state),
+                (SetInternalStateReq, self.set_internal_state),
+                (RpcReqInput, self.handle_rpc_request),
+                (ExpertDistributionReq, self.expert_distribution_handle),
+                (LoadLoRAAdapterReqInput, self.load_lora_adapter),
+                (UnloadLoRAAdapterReqInput, self.unload_lora_adapter),
+                (MultiTokenizerRegisterReq, self.register_multi_tokenizer),
+            ]
+        )
+
+    def init_tokenizer(self):
+        server_args = self.server_args
+        self.is_generation = self.model_config.is_generation
+
+        if server_args.skip_tokenizer_init:
+            self.tokenizer = self.processor = None
+        else:
+            if self.model_config.is_multimodal:
+                self.processor = get_processor(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                    use_fast=not server_args.disable_fast_image_processor,
+                )
+                self.tokenizer = get_tokenizer_from_processor(self.processor)
+            else:
+                self.tokenizer = get_tokenizer(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+
+    def init_memory_pool_and_cache(self):
+        server_args = self.server_args
+
+        self.req_to_token_pool, self.token_to_kv_pool_allocator = (
+            self.tp_worker.get_memory_pool()
+        )
+
+        if (
+            server_args.chunked_prefill_size is not None
+            and server_args.disable_radix_cache
+        ):
+            if self.is_hybrid:
+                ChunkCacheClass = SWAChunkCache
+            else:
+                ChunkCacheClass = ChunkCache
+            self.tree_cache = ChunkCacheClass(
+                req_to_token_pool=self.req_to_token_pool,
+                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                page_size=self.page_size,
+            )
+        else:
+            if os.environ.get("SGLANG_EXPERIMENTAL_CPP_RADIX_TREE") == "1":
+                # lazy import to avoid JIT overhead
+                from sglang.srt.mem_cache.radix_cache_cpp import RadixCacheCpp
+
+                self.tree_cache = RadixCacheCpp(
+                    disable=False,
+                    use_hicache=self.enable_hierarchical_cache,
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool=self.token_to_kv_pool_allocator,
+                    tp_cache_group=self.tp_cpu_group,
+                    page_size=self.page_size,
+                    hicache_ratio=server_args.hicache_ratio,
+                    hicache_size=server_args.hicache_size,
+                    hicache_write_policy=server_args.hicache_write_policy,
+                    enable_kv_cache_events=self.enable_kv_cache_events,
+                )
+            elif self.enable_hierarchical_cache:
+                self.tree_cache = HiRadixCache(
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                    tp_cache_group=(
+                        self.attn_tp_cpu_group
+                        if self.server_args.enable_dp_attention
+                        else self.tp_cpu_group
+                    ),
+                    page_size=self.page_size,
+                    hicache_ratio=server_args.hicache_ratio,
+                    hicache_size=server_args.hicache_size,
+                    hicache_write_policy=server_args.hicache_write_policy,
+                    hicache_io_backend=server_args.hicache_io_backend,
+                    hicache_mem_layout=server_args.hicache_mem_layout,
+                    enable_metrics=self.enable_metrics,
+                    hicache_storage_backend=server_args.hicache_storage_backend,
+                    hicache_storage_prefetch_policy=server_args.hicache_storage_prefetch_policy,
+                    model_name=server_args.served_model_name,
+                    storage_backend_extra_config=server_args.hicache_storage_backend_extra_config,
+                )
+                self.tp_worker.register_hicache_layer_transfer_counter(
+                    self.tree_cache.cache_controller.layer_done_counter
+                )
+            elif self.is_hybrid:
+                assert (
+                    self.server_args.disaggregation_mode == "null"
+                ), "Hybrid mode does not support disaggregation yet"
+                self.tree_cache = SWARadixCache(
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                    sliding_window_size=self.sliding_window_size,
+                    page_size=self.page_size,
+                    disable=server_args.disable_radix_cache,
+                )
+            elif self.enable_lora:
+                assert (
+                    not self.enable_hierarchical_cache
+                ), "LoRA radix cache doesn't support hierarchical cache"
+                assert (
+                    self.schedule_policy == "fcfs"
+                ), "LoRA radix cache only supports FCFS policy"
+                self.tree_cache = LoRARadixCache(
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                    page_size=self.page_size,
+                    disable=server_args.disable_radix_cache,
+                )
+            elif server_args.enable_lmcache:
+                from sglang.srt.mem_cache.storage.lmcache.lmc_radix_cache import (
+                    LMCRadixCache,
+                )
+
+                self.tree_cache = LMCRadixCache(
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                    page_size=self.page_size,
+                    disable=server_args.disable_radix_cache,
+                    model_config=self.model_config,
+                    tp_size=self.tp_size,
+                    rank=self.tp_rank,
+                    tp_group=self.tp_group,
+                )
+            else:
+                self.tree_cache = RadixCache(
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                    page_size=self.page_size,
+                    disable=server_args.disable_radix_cache,
+                    enable_kv_cache_events=self.enable_kv_cache_events,
+                )
+
+        self.decode_mem_cache_buf_multiplier = (
+            1
+            if self.spec_algorithm.is_none()
+            else (
+                server_args.speculative_num_draft_tokens
+                + (
+                    server_args.speculative_eagle_topk
+                    * server_args.speculative_num_steps
+                )
+            )
+        )
+
+        embedding_cache_size = int(os.environ.get("SGLANG_VLM_CACHE_SIZE_MB", "100"))
+        init_embedding_cache(embedding_cache_size * 1024 * 1024)
+
+    def init_disaggregation(self):
+        self.transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
+
+        if (
+            self.disaggregation_mode == DisaggregationMode.DECODE
+        ):  # *2 for the headroom.
+            buffer_size = (self.req_to_token_pool.size) * 2
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+                buffer_size
+            )
+            self.disagg_metadata_buffers = MetadataBuffers(
+                buffer_size,
+                hidden_size=self.model_config.hf_text_config.hidden_size,
+                dtype=self.model_config.dtype,
+                custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(),
+            )
+
+            # The decode requests polling kv cache
+            self.disagg_decode_transfer_queue = DecodeTransferQueue(
+                gloo_group=self.attn_tp_cpu_group,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                tp_rank=self.tp_rank,
+                metadata_buffers=self.disagg_metadata_buffers,
+                scheduler=self,
+                tree_cache=self.tree_cache,
+            )
+
+            # The decode requests pending for pre-allocation
+            self.disagg_decode_prealloc_queue = DecodePreallocQueue(
+                req_to_token_pool=self.req_to_token_pool,
+                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                draft_token_to_kv_pool=(
+                    None
+                    if self.draft_worker is None
+                    else self.draft_worker.model_runner.token_to_kv_pool
+                ),
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                metadata_buffers=self.disagg_metadata_buffers,
+                scheduler=self,
+                transfer_queue=self.disagg_decode_transfer_queue,
+                tree_cache=self.tree_cache,
+                gloo_group=self.attn_tp_cpu_group,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                dp_size=self.server_args.dp_size,
+                gpu_id=self.gpu_id,
+                bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                max_total_num_tokens=self.max_total_num_tokens,
+                prefill_pp_size=self.server_args.disaggregation_prefill_pp,
+                num_reserved_decode_tokens=self.server_args.num_reserved_decode_tokens,
+                transfer_backend=self.transfer_backend,
+            )
+
+        elif self.disaggregation_mode == DisaggregationMode.PREFILL:
+            # *2 for the headroom.
+            buffer_size = self.max_running_requests * 2
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+                buffer_size
+            )
+            self.disagg_metadata_buffers = MetadataBuffers(
+                buffer_size,
+                hidden_size=self.model_config.hf_text_config.hidden_size,
+                dtype=self.model_config.dtype,
+                custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(),
+            )
+
+            self.disagg_prefill_bootstrap_queue = PrefillBootstrapQueue(
+                token_to_kv_pool=self.token_to_kv_pool_allocator.get_kvcache(),
+                draft_token_to_kv_pool=(
+                    None
+                    if self.draft_worker is None
+                    else self.draft_worker.model_runner.token_to_kv_pool
+                ),
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                metadata_buffers=self.disagg_metadata_buffers,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                gpu_id=self.gpu_id,
+                bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                gloo_group=self.attn_tp_cpu_group,
+                max_total_num_tokens=self.max_total_num_tokens,
+                decode_tp_size=self.server_args.disaggregation_decode_tp,
+                decode_dp_size=self.server_args.disaggregation_decode_dp,
+                scheduler=self,
+                pp_rank=self.pp_rank,
+                pp_size=self.pp_size,
+                transfer_backend=self.transfer_backend,
+            )
+            # The prefill requests that are in the middle of kv sending
+            self.disagg_prefill_inflight_queue: List[Req] = []
+
+    def init_moe_config(self):
+        if hasattr(self.model_config.hf_config, "num_experts_per_tok"):
+            initialize_moe_config(self.server_args)
+
+    @DynamicGradMode()
+    def event_loop_normal(self):
+        """A normal scheduler loop."""
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+
+            batch = self.get_next_batch_to_run()
+            self.cur_batch = batch
+
+            if batch:
+                result = self.run_batch(batch)
+                self.process_batch_result(batch, result)
+            else:
+                # When the server is idle, do self-check and re-init some states
+                self.self_check_during_idle()
+
+            self.last_batch = batch
+
+    @DynamicGradMode()
+    def event_loop_overlap(self):
+        """A scheduler loop that overlaps the CPU processing and GPU computation."""
+        self.result_queue = deque()
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+
+            batch = self.get_next_batch_to_run()
+            self.cur_batch = batch
+
+            if batch:
+                batch.launch_done = threading.Event()
+                result = self.run_batch(batch)
+                self.result_queue.append((batch.copy(), result))
+
+                if self.last_batch is None:
+                    # Create a dummy first batch to start the pipeline for overlap schedule.
+                    # It is now used for triggering the sampling_info_done event.
+                    tmp_batch = ScheduleBatch(
+                        reqs=None,
+                        forward_mode=ForwardMode.DUMMY_FIRST,
+                        next_batch_sampling_info=self.tp_worker.cur_sampling_info,
+                    )
+                    self.process_batch_result(tmp_batch, None, batch.launch_done)
+
+            if self.last_batch:
+                # Process the results of the last batch
+                tmp_batch, tmp_result = self.result_queue.popleft()
+                tmp_batch.next_batch_sampling_info = (
+                    self.tp_worker.cur_sampling_info if batch else None
+                )
+                # NOTE: we should use current launched batch's launch_done event Instead of the last batch's
+                self.process_batch_result(
+                    tmp_batch, tmp_result, batch.launch_done if batch else None
+                )
+            elif batch is None:
+                # When the server is idle, do self-check and re-init some states
+                self.self_check_during_idle()
+
+            self.last_batch = batch
+
+    @DynamicGradMode()
+    def event_loop_pp(self):
+        """A non-overlap scheduler loop for pipeline parallelism."""
+        mbs = [None] * self.pp_size
+        last_mbs = [None] * self.pp_size
+        self.running_mbs = [
+            ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
+        ]
+        bids = [None] * self.pp_size
+        pp_outputs: Optional[PPProxyTensors] = None
+        while True:
+            server_is_idle = True
+            for mb_id in range(self.pp_size):
+                self.running_batch = self.running_mbs[mb_id]
+                self.last_batch = last_mbs[mb_id]
+
+                recv_reqs = self.recv_requests()
+                self.process_input_requests(recv_reqs)
+                mbs[mb_id] = self.get_next_batch_to_run()
+                self.running_mbs[mb_id] = self.running_batch
+
+                self.cur_batch = mbs[mb_id]
+                if self.cur_batch:
+                    server_is_idle = False
+                    result = self.run_batch(self.cur_batch)
+
+                # (last rank) send the outputs to the next step
+                if self.pp_group.is_last_rank:
+                    if self.cur_batch:
+                        next_token_ids, bids[mb_id] = (
+                            result.next_token_ids,
+                            result.bid,
+                        )
+                        if self.cur_batch.return_logprob:
+                            pp_outputs = PPProxyTensors(
+                                {
+                                    "next_token_ids": next_token_ids,
+                                    "extend_input_len_per_req": result.extend_input_len_per_req,
+                                    "extend_logprob_start_len_per_req": result.extend_logprob_start_len_per_req,
+                                }
+                                | (
+                                    {
+                                        f"logits_output.{k}": v
+                                        for k, v in result.logits_output.__dict__.items()
+                                    }
+                                    if result.logits_output is not None
+                                    else {}
+                                )
+                            )
+                        else:
+                            pp_outputs = PPProxyTensors(
+                                {
+                                    "next_token_ids": next_token_ids,
+                                }
+                            )
+                        # send the output from the last round to let the next stage worker run post processing
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                # receive outputs and post-process (filter finished reqs) the coming microbatch
+                next_mb_id = (mb_id + 1) % self.pp_size
+                next_pp_outputs = None
+                if mbs[next_mb_id] is not None:
+                    next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
+                        self.pp_group.recv_tensor_dict(
+                            all_gather_group=self.attn_tp_group
+                        )
+                    )
+                    mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
+                    logits_output_args = {
+                        k[len("logits_output.") :]: v
+                        for k, v in next_pp_outputs.tensors.items()
+                        if k.startswith("logits_output.")
+                    }
+                    if len(logits_output_args) > 0:
+                        logits_output = LogitsProcessorOutput(**logits_output_args)
+                    else:
+                        logits_output = None
+                    output_result = GenerationBatchResult(
+                        logits_output=logits_output,
+                        pp_hidden_states_proxy_tensors=None,
+                        next_token_ids=next_pp_outputs["next_token_ids"],
+                        extend_input_len_per_req=next_pp_outputs.tensors.get(
+                            "extend_input_len_per_req", None
+                        ),
+                        extend_logprob_start_len_per_req=next_pp_outputs.tensors.get(
+                            "extend_logprob_start_len_per_req", None
+                        ),
+                        bid=bids[next_mb_id],
+                        can_run_cuda_graph=result.can_run_cuda_graph,
+                    )
+                    self.process_batch_result(mbs[next_mb_id], output_result)
+                    last_mbs[next_mb_id] = mbs[next_mb_id]
+
+                # (not last rank)
+                if not self.pp_group.is_last_rank:
+                    if self.cur_batch:
+                        bids[mb_id] = result.bid
+                    # carry the outputs to the next stage
+                    # send the outputs from the last round to let the next stage worker run post processing
+                    if pp_outputs:
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                    # send out reqs to the next stage
+                    dp_offset = self.attn_dp_rank * self.attn_tp_size
+                    if self.attn_tp_rank == 0:
+                        point_to_point_pyobj(
+                            recv_reqs,
+                            self.pp_rank * self.tp_size + dp_offset,
+                            self.world_group.device_group,
+                            self.pp_rank * self.tp_size + dp_offset,
+                            (self.pp_rank + 1) * self.tp_size + dp_offset,
+                        )
+
+                    # send out proxy tensors to the next stage
+                    if self.cur_batch:
+                        self.pp_group.send_tensor_dict(
+                            result.pp_hidden_states_proxy_tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                pp_outputs = next_pp_outputs
+
+            # When the server is idle, self-check and re-init some states
+            if server_is_idle:
+                # When the server is idle, do self-check and re-init some states
+                self.self_check_during_idle()
+
+    def recv_requests(self) -> List[Req]:
+        """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
+
+        if self.recv_skipper is not None:
+            last_forward_mode = (
+                self.last_batch.forward_mode if self.last_batch is not None else None
+            )
+            if not self.recv_skipper.handle(last_forward_mode):
+                return []
+
+        if self.pp_rank == 0:
+            if self.attn_tp_rank == 0:
+                recv_reqs = []
+
+                while True:
+                    try:
+                        recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                    except zmq.ZMQError:
+                        break
+                    recv_reqs.append(recv_req)
+
+                while True:
+                    try:
+                        recv_rpc = self.recv_from_rpc.recv_pyobj(zmq.NOBLOCK)
+                    except zmq.ZMQError:
+                        break
+                    recv_reqs.append(recv_rpc)
+            else:
+                recv_reqs = None
+        else:
+            if self.attn_tp_rank == 0:
+                dp_offset = self.attn_dp_rank * self.attn_tp_size
+                recv_reqs = point_to_point_pyobj(
+                    [],
+                    self.pp_rank * self.tp_size + dp_offset,
+                    self.world_group.device_group,
+                    (self.pp_rank - 1) * self.tp_size + dp_offset,
+                    self.pp_rank * self.tp_size + dp_offset,
+                )
+            else:
+                recv_reqs = None
+
+        if self.input_blocker is not None:
+            recv_reqs = self.input_blocker.handle(recv_reqs)
+
+        if self.server_args.enable_dp_attention:
+            if self.attn_tp_rank == 0:
+                work_reqs = [
+                    req
+                    for req in recv_reqs
+                    if isinstance(
+                        req,
+                        (
+                            TokenizedGenerateReqInput,
+                            TokenizedEmbeddingReqInput,
+                            BatchTokenizedGenerateReqInput,
+                            BatchTokenizedEmbeddingReqInput,
+                        ),
+                    )
+                ]
+                control_reqs = [
+                    req
+                    for req in recv_reqs
+                    if not isinstance(
+                        req,
+                        (
+                            TokenizedGenerateReqInput,
+                            TokenizedEmbeddingReqInput,
+                            BatchTokenizedGenerateReqInput,
+                            BatchTokenizedEmbeddingReqInput,
+                        ),
+                    )
+                ]
+            else:
+                work_reqs = None
+                control_reqs = None
+
+            if self.attn_tp_size != 1:
+                work_reqs = broadcast_pyobj(
+                    work_reqs,
+                    self.attn_tp_group.rank,
+                    self.attn_tp_cpu_group,
+                    src=self.attn_tp_group.ranks[0],
+                )
+            if self.tp_size != 1:
+                control_reqs = broadcast_pyobj(
+                    control_reqs,
+                    self.tp_group.rank,
+                    self.tp_cpu_group,
+                    src=self.tp_group.ranks[0],
+                )
+            recv_reqs = work_reqs + control_reqs
+        elif self.tp_size != 1:
+            recv_reqs = broadcast_pyobj(
+                recv_reqs,
+                self.tp_group.rank,
+                self.tp_cpu_group,
+                src=self.tp_group.ranks[0],
+            )
+        return recv_reqs
+
+    def process_input_requests(self, recv_reqs: List):
+        for recv_req in recv_reqs:
+            # If it is a health check generation request and there are running requests, ignore it.
+            if is_health_check_generate_req(recv_req) and (
+                self.chunked_req is not None
+                or not self.running_batch.is_empty()
+                or len(self.offload_tags) > 0
+            ):
+                self.return_health_check_ct += 1
+                continue
+
+            # If it is a work request, accept or reject the request based on the request queue size.
+            if is_work_request(recv_req):
+                if len(self.waiting_queue) + 1 > self.max_queued_requests:
+                    abort_req = AbortReq(
+                        recv_req.rid,
+                        finished_reason={
+                            "type": "abort",
+                            "status_code": HTTPStatus.SERVICE_UNAVAILABLE,
+                            "message": "The request queue is full.",
+                        },
+                    )
+                    self.send_to_tokenizer.send_pyobj(abort_req)
+                    continue
+
+            # If it is a MultiTokenizerWrapper, unwrap it and handle the inner request.
+            if isinstance(recv_req, MultiTokenizerWrapper):
+                worker_id = recv_req.worker_id
+                recv_req = recv_req.obj
+                output = self._request_dispatcher(recv_req)
+                if output is not None:
+                    output = MultiTokenizerWrapper(worker_id, output)
+                    self.send_to_tokenizer.send_pyobj(output)
+                continue
+
+            output = self._request_dispatcher(recv_req)
+            if output is not None:
+                if isinstance(output, RpcReqOutput):
+                    if self.recv_from_rpc is not None:
+                        self.recv_from_rpc.send_pyobj(output)
+                else:
+                    self.send_to_tokenizer.send_pyobj(output)
+
+    def handle_generate_request(
+        self,
+        recv_req: TokenizedGenerateReqInput,
+    ):
+        self.maybe_update_dp_balance_data(recv_req)
+
+        # Create a new request
+        if (
+            recv_req.session_params is None
+            or recv_req.session_params.id is None
+            or recv_req.session_params.id not in self.sessions
+        ):
+            if recv_req.input_embeds is not None:
+                # Generate fake input_ids based on the length of input_embeds
+                seq_length = len(recv_req.input_embeds)
+                fake_input_ids = [1] * seq_length
+                recv_req.input_ids = fake_input_ids
+
+            if recv_req.bootstrap_port is None:
+                # Use default bootstrap port
+                recv_req.bootstrap_port = self.server_args.disaggregation_bootstrap_port
+
+            req = Req(
+                recv_req.rid,
+                recv_req.input_text,
+                recv_req.input_ids,
+                recv_req.sampling_params,
+                return_logprob=recv_req.return_logprob,
+                top_logprobs_num=recv_req.top_logprobs_num,
+                token_ids_logprob=recv_req.token_ids_logprob,
+                stream=recv_req.stream,
+                lora_id=recv_req.lora_id,
+                input_embeds=recv_req.input_embeds,
+                custom_logit_processor=recv_req.custom_logit_processor,
+                return_hidden_states=recv_req.return_hidden_states,
+                eos_token_ids=self.model_config.hf_eos_token_id,
+                bootstrap_host=recv_req.bootstrap_host,
+                bootstrap_port=recv_req.bootstrap_port,
+                bootstrap_room=recv_req.bootstrap_room,
+                data_parallel_rank=recv_req.data_parallel_rank,
+                vocab_size=self.model_config.vocab_size,
+            )
+            req.tokenizer = self.tokenizer
+
+            if self.disaggregation_mode != DisaggregationMode.NULL:
+                # Invalid request for disaggregated mode
+                if recv_req.bootstrap_room is None:
+                    error_msg = (
+                        f"Invalid request: Disaggregated request received without "
+                        f"boostrap room id. {req.rid=}"
+                    )
+                    logger.error(error_msg)
+                    prepare_abort(req, error_msg, status_code=HTTPStatus.BAD_REQUEST)
+                    self.stream_output([req], req.return_logprob)
+                    return
+
+            if (
+                recv_req.session_params is not None
+                and recv_req.session_params.id is not None
+            ):
+                req.set_finish_with_abort(
+                    f"Invalid request: session id {recv_req.session_params.id} does not exist"
+                )
+                self._add_request_to_queue(req)
+                return
+        else:
+            # Create a new request from a previous session
+            session = self.sessions[recv_req.session_params.id]
+            req = session.create_req(recv_req, self.tokenizer)
+            if isinstance(req.finished_reason, FINISH_ABORT):
+                self._add_request_to_queue(req)
+                return
+
+        # Handle multimodal inputs
+        if recv_req.mm_inputs is not None:
+            image_inputs = MultimodalInputs.from_dict(recv_req.mm_inputs)
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
+            req.origin_input_ids = self.pad_input_ids_func(
+                req.origin_input_ids, image_inputs
+            )
+            req.extend_image_inputs(image_inputs)
+
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
+                )
+                self._add_request_to_queue(req)
+                return
+
+        # Validate prompt length
+        error_msg = validate_input_length(
+            req,
+            self.max_req_input_len,
+            self.server_args.allow_auto_truncate,
+        )
+        if error_msg:
+            req.set_finish_with_abort(error_msg)
+            self._add_request_to_queue(req)
+            return
+
+        # Copy more attributes
+        if recv_req.logprob_start_len == -1 or not recv_req.return_logprob:
+            # By default, only return the logprobs for output tokens
+            req.logprob_start_len = len(req.origin_input_ids) - 1
+        else:
+            req.logprob_start_len = recv_req.logprob_start_len
+
+        if req.logprob_start_len >= len(req.origin_input_ids):
+            error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
+            req.logprob_start_len = len(req.origin_input_ids) - 1
+            req.set_finish_with_abort(error_msg)
+            self._add_request_to_queue(req)
+            return
+
+        req.sampling_params.max_new_tokens = min(
+            (
+                req.sampling_params.max_new_tokens
+                if req.sampling_params.max_new_tokens is not None
+                else 1 << 30
+            ),
+            self.max_req_len - len(req.origin_input_ids) - 1,
+        )
+
+        # Init grammar cache for this request
+        add_to_grammar_queue = False
+        if (
+            req.sampling_params.json_schema is not None
+            or req.sampling_params.regex is not None
+            or req.sampling_params.ebnf is not None
+            or req.sampling_params.structural_tag is not None
+        ):
+            assert self.grammar_backend is not None
+            if req.sampling_params.json_schema is not None:
+                key = ("json", req.sampling_params.json_schema)
+            elif req.sampling_params.regex is not None:
+                key = ("regex", req.sampling_params.regex)
+            elif req.sampling_params.ebnf is not None:
+                key = ("ebnf", req.sampling_params.ebnf)
+            elif req.sampling_params.structural_tag:
+                key = ("structural_tag", req.sampling_params.structural_tag)
+
+            value, cache_hit = self.grammar_backend.get_cached_or_future_value(key)
+            req.grammar = value
+
+            if not cache_hit:
+                req.grammar_key = key
+                add_to_grammar_queue = True
+            else:
+                if value is INVALID_GRAMMAR_OBJ:  # We hit a cached invalid grammar.
+                    error_msg = f"Invalid grammar request with cache hit: {key=}"
+                    req.set_finish_with_abort(error_msg)
+
+        if add_to_grammar_queue:
+            req.queue_time_start = time.perf_counter()
+            self.grammar_queue.append(req)
+        else:
+            self._add_request_to_queue(req)
+
+    def handle_batch_generate_request(
+        self,
+        recv_req: BatchTokenizedGenerateReqInput,
+    ):
+        """Handle optimized batch generate request."""
+        logger.debug(f"Processing batch generate request with {len(recv_req)} requests")
+
+        # Process each request in the batch
+        for tokenized_req in recv_req:
+            self.handle_generate_request(tokenized_req)
+
+    def _add_request_to_queue(self, req: Req):
+        req.queue_time_start = time.perf_counter()
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self._prefetch_kvcache(req)
+            self.disagg_prefill_bootstrap_queue.add(
+                req, self.model_config.num_key_value_heads
+            )
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.disagg_decode_prealloc_queue.add(req)
+        else:
+            self._prefetch_kvcache(req)
+            self.waiting_queue.append(req)
+
+    def _prefetch_kvcache(self, req: Req):
+        if self.enable_hicache_storage:
+            req.init_next_round_input(self.tree_cache)
+            if req.last_node.backuped:
+                # only to initiate the prefetch if the last node is backuped
+                # otherwise, the allocated GPU memory must be locked for integrity
+                last_hash = req.last_host_node.get_last_hash_value()
+                matched_len = len(req.prefix_indices) + req.host_hit_length
+                new_input_tokens = req.fill_ids[matched_len:]
+                self.tree_cache.prefetch_from_storage(
+                    req.rid, req.last_host_node, new_input_tokens, last_hash
+                )
+
+    def _extend_requests_to_queue(self, reqs: List[Req], is_retracted: bool = False):
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.disagg_prefill_bootstrap_queue.extend(
+                reqs, self.model_config.num_key_value_heads
+            )
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            # If this is a decode server, we put the request to the decode pending prealloc queue
+            self.disagg_decode_prealloc_queue.extend(reqs, is_retracted)
+        else:
+            self.waiting_queue.extend(reqs)
+
+    def handle_embedding_request(
+        self,
+        recv_req: TokenizedEmbeddingReqInput,
+    ):
+        req = Req(
+            recv_req.rid,
+            recv_req.input_text,
+            recv_req.input_ids,
+            recv_req.sampling_params,
+            token_type_ids=recv_req.token_type_ids,
+        )
+        req.tokenizer = self.tokenizer
+
+        # Handle multimodal inputs
+        if recv_req.image_inputs is not None:
+            image_inputs = MultimodalInputs.from_dict(recv_req.image_inputs)
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
+            req.origin_input_ids = self.pad_input_ids_func(
+                req.origin_input_ids, image_inputs
+            )
+            req.extend_image_inputs(image_inputs)
+
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
+                )
+                self._add_request_to_queue(req)
+                return
+
+        # Validate prompts length
+        error_msg = validate_input_length(
+            req,
+            self.max_req_input_len,
+            self.server_args.allow_auto_truncate,
+        )
+        if error_msg:
+            self._add_request_to_queue(req)
+            return
+
+        # Copy more attributes
+        req.logprob_start_len = len(req.origin_input_ids) - 1
+        self._add_request_to_queue(req)
+
+    def handle_batch_embedding_request(
+        self,
+        recv_req: BatchTokenizedEmbeddingReqInput,
+    ):
+        """Handle optimized batch embedding request."""
+        logger.debug(
+            f"Processing batch embedding request with {len(recv_req)} requests"
+        )
+
+        # Process each request in the batch
+        for tokenized_req in recv_req:
+            self.handle_embedding_request(tokenized_req)
+
+    def self_check_during_idle(self):
+        self.check_memory()
+        self.check_tree_cache()
+        self.new_token_ratio = self.init_new_token_ratio
+        self.maybe_sleep_on_idle()
+
+    def check_memory(self):
+        if self.is_hybrid:
+            (
+                full_num_used,
+                swa_num_used,
+                _,
+                _,
+                full_available_size,
+                full_evictable_size,
+                swa_available_size,
+                swa_evictable_size,
+            ) = self._get_swa_token_info()
+            memory_leak = full_num_used != 0 or swa_num_used != 0
+            token_msg = (
+                f"{self.full_tokens_per_layer=}, {full_available_size=}, {full_evictable_size=}, {self.tree_cache.full_protected_size()=}\n"
+                f"{self.swa_tokens_per_layer=}, {swa_available_size=}, {swa_evictable_size=}, {self.tree_cache.swa_protected_size()=}\n"
+            )
+        else:
+            _, _, available_size, evictable_size = self._get_token_info()
+            protected_size = self.tree_cache.protected_size()
+            memory_leak = (available_size + evictable_size) != (
+                # self.max_total_num_tokens
+                # if not self.enable_hierarchical_cache
+                # else self.max_total_num_tokens - protected_size
+                self.max_total_num_tokens
+                - protected_size
+            )
+            token_msg = f"{self.max_total_num_tokens=}, {available_size=}, {evictable_size=}, {protected_size=}\n"
+
+        if memory_leak:
+            msg = "token_to_kv_pool_allocator memory leak detected! " f"{token_msg}"
+            raise ValueError(msg)
+
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            req_total_size = (
+                self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
+            )
+        else:
+            req_total_size = self.req_to_token_pool.size
+
+        if len(self.req_to_token_pool.free_slots) != req_total_size:
+            msg = (
+                "req_to_token_pool memory leak detected!"
+                f"available_size={len(self.req_to_token_pool.free_slots)}, "
+                f"total_size={self.req_to_token_pool.size}\n"
+            )
+            raise ValueError(msg)
+
+        if (
+            self.enable_metrics
+            and self.current_scheduler_metrics_enabled()
+            and time.perf_counter() > self.metrics_collector.last_log_time + 30
+        ):
+            # During idle time, also collect metrics every 30 seconds.
+            if self.is_hybrid:
+                (
+                    full_num_used,
+                    swa_num_used,
+                    full_token_usage,
+                    swa_token_usage,
+                    _,
+                    _,
+                    _,
+                    _,
+                ) = self._get_swa_token_info()
+                num_used = max(full_num_used, swa_num_used)
+                token_usage = max(full_token_usage, swa_token_usage)
+            else:
+                num_used, token_usage, _, _ = self._get_token_info()
+            num_running_reqs = len(self.running_batch.reqs)
+            self.stats.num_running_reqs = num_running_reqs
+            self.stats.num_used_tokens = num_used
+            self.stats.token_usage = round(token_usage, 2)
+            self.stats.gen_throughput = 0
+            self.stats.num_queue_reqs = len(self.waiting_queue)
+            self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
+            self.metrics_collector.log_stats(self.stats)
+        self._publish_kv_events()
+
+    def check_tree_cache(self):
+        if self.is_hybrid and isinstance(self.tree_cache, SWARadixCache):
+            self.tree_cache.sanity_check()
+
+    def _get_token_info(self):
+        available_size = self.token_to_kv_pool_allocator.available_size()
+        evictable_size = self.tree_cache.evictable_size()
+        num_used = self.max_total_num_tokens - (available_size + evictable_size)
+        token_usage = num_used / self.max_total_num_tokens
+        return num_used, token_usage, available_size, evictable_size
+
+    def _get_swa_token_info(self):
+        full_available_size = self.token_to_kv_pool_allocator.full_available_size()
+        full_evictable_size = self.tree_cache.full_evictable_size()
+        swa_available_size = self.token_to_kv_pool_allocator.swa_available_size()
+        swa_evictable_size = self.tree_cache.swa_evictable_size()
+        full_num_used = self.full_tokens_per_layer - (
+            full_available_size + full_evictable_size
+        )
+        swa_num_used = self.swa_tokens_per_layer - (
+            swa_available_size + swa_evictable_size
+        )
+        full_token_usage = full_num_used / self.full_tokens_per_layer
+        swa_token_usage = swa_num_used / self.swa_tokens_per_layer
+        return (
+            full_num_used,
+            swa_num_used,
+            full_token_usage,
+            swa_token_usage,
+            full_available_size,
+            full_evictable_size,
+            swa_available_size,
+            swa_evictable_size,
+        )
+
+    def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
+        # Merge the prefill batch into the running batch
+        chunked_req_to_exclude = set()
+        if self.chunked_req:
+            # Move the chunked request out of the batch so that we can merge
+            # only finished requests to running_batch.
+            chunked_req_to_exclude.add(self.chunked_req)
+            self.tree_cache.cache_unfinished_req(self.chunked_req, chunked=True)
+            # chunked request keeps its rid but will get a new req_pool_idx
+            if self.tp_worker.worker.model_runner.is_hybrid_gdn:
+                self.req_to_token_pool.free(
+                    self.chunked_req.req_pool_idx, free_mamba_cache=False
+                )
+            else:
+                self.req_to_token_pool.free(self.chunked_req.req_pool_idx)
+        if self.last_batch and self.last_batch.forward_mode.is_extend():
+            if self.last_batch.chunked_req is not None:
+                # In the context pipeline parallelism, after the last chunk, the current microbatch still track outdated chunked_req.
+                # We need to discard it.
+                chunked_req_to_exclude.add(self.last_batch.chunked_req)
+
+            # Filter batch
+            last_bs = self.last_batch.batch_size()
+            self.last_batch.filter_batch(
+                chunked_req_to_exclude=list(chunked_req_to_exclude)
+            )
+            if self.last_batch.batch_size() < last_bs:
+                self.running_batch.batch_is_full = False
+
+            # Merge the new batch into the running batch.
+            # For prefill-only batch, we can avoid going through decoding step.
+            if not self.last_batch.is_empty() and not self.last_batch.is_prefill_only:
+                if self.running_batch.is_empty():
+                    self.running_batch = self.last_batch
+                else:
+                    # Merge running_batch with prefill batch
+                    self.running_batch.merge_batch(self.last_batch)
+
+        new_batch = self.get_new_batch_prefill()
+
+        need_dp_attn_preparation = require_mlp_sync(self.server_args)
+
+        if need_dp_attn_preparation and not self.spec_algorithm.is_none():
+            # In speculative decoding, prefill batches and decode batches cannot be processed in the same DP attention group.
+            # We prepare idle batches in advance to skip preparing decode batches when there are prefill batches in the group.
+            new_batch = self.prepare_mlp_sync_batch(new_batch)
+            need_dp_attn_preparation = new_batch is None
+
+        if new_batch is not None:
+            # Run prefill first if possible
+            ret = new_batch
+        else:
+            # Run decode
+            if not self.running_batch.is_empty():
+                self.running_batch = self.update_running_batch(self.running_batch)
+                ret = self.running_batch if not self.running_batch.is_empty() else None
+            else:
+                ret = None
+
+        # Handle DP attention
+        if need_dp_attn_preparation:
+            self.maybe_handle_dp_balance_data()
+            ret = self.prepare_mlp_sync_batch(ret)
+
+        return ret
+
+    def get_num_allocatable_reqs(self, running_bs):
+        res = global_server_args_dict["max_micro_batch_size"] - running_bs
+        if self.pp_size > 1:
+            res = min(res, self.req_to_token_pool.available_size())
+        return res
+
+    def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
+        # Check if the grammar is ready in the grammar queue
+        if self.grammar_queue:
+            self.move_ready_grammar_requests()
+
+        # Handle the cases where prefill is not allowed
+        if (
+            self.running_batch.batch_is_full or len(self.waiting_queue) == 0
+        ) and self.chunked_req is None:
+            return None
+
+        running_bs = len(self.running_batch.reqs)
+        # Ignore the check if self.chunked_req is not None.
+        # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
+        # as the space for the chunked request has just been released.
+        # In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
+        # Instead, we should always allow chunked request to be added, otherwise, there will be a memory leak.
+        if self.get_num_allocatable_reqs(running_bs) <= 0 and not self.chunked_req:
+            self.running_batch.batch_is_full = True
+            return None
+
+        if self.enable_hierarchical_cache:
+            self.tree_cache.check_hicache_events()
+
+        # Get priority queue
+        self.policy.calc_priority(self.waiting_queue)
+
+        # Prefill policy
+        adder = PrefillAdder(
+            self.page_size,
+            self.tree_cache,
+            self.token_to_kv_pool_allocator,
+            self.running_batch,
+            self.new_token_ratio,
+            self.max_prefill_tokens,
+            self.chunked_prefill_size,
+            running_bs if self.is_mixed_chunk else 0,
+        )
+
+        if self.chunked_req is not None:
+            self.chunked_req.init_next_round_input()
+            self.chunked_req = adder.add_chunked_req(self.chunked_req)
+
+        if self.enable_lora:
+            lora_set = set([req.lora_id for req in self.running_batch.reqs])
+
+        # Get requests from the waiting queue to a new prefill batch
+        for req in self.waiting_queue:
+
+            if self.enable_lora and not self.tp_worker.can_run_lora_batch(
+                lora_set
+                | set([req.lora_id for req in adder.can_run_list])
+                | set([req.lora_id])
+            ):
+                self.running_batch.batch_is_full = True
+                break
+
+            if len(adder.can_run_list) >= self.get_num_allocatable_reqs(running_bs):
+                self.running_batch.batch_is_full = True
+                break
+
+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                # In prefill mode, prealloc queue and transfer queue can also take memory,
+                # so we need to check if the available size for the actual available size.
+                if len(adder.can_run_list) >= self.req_to_token_pool.available_size():
+                    self.running_batch.batch_is_full = True
+                    break
+
+            if self.enable_hicache_storage:
+                prefetch_done = self.tree_cache.check_prefetch_progress(req.rid)
+                if not prefetch_done:
+                    # skip staging requests that are ongoing prefetch
+                    continue
+
+            req.init_next_round_input(self.tree_cache)
+            res = adder.add_one_req(req, has_chunked_req=(self.chunked_req is not None))
+
+            if res != AddReqResult.CONTINUE:
+                if res == AddReqResult.NO_TOKEN:
+                    if self.enable_hierarchical_cache:
+                        # Set batch_is_full after making sure there are requests that can be served
+                        self.running_batch.batch_is_full = len(
+                            adder.can_run_list
+                        ) > 0 or (not self.running_batch.is_empty())
+                    else:
+                        self.running_batch.batch_is_full = True
+                break
+
+        # Update waiting queue
+        can_run_list: List[Req] = adder.can_run_list
+        if len(can_run_list) == 0:
+            return None
+
+        if self.enable_metrics:
+            # only record queue time when enable_metrics is True to avoid overhead
+            for req in can_run_list:
+                req.queue_time_end = time.perf_counter()
+
+        self.waiting_queue = [
+            x for x in self.waiting_queue if x not in set(can_run_list)
+        ]
+
+        if adder.new_chunked_req is not None:
+            assert self.chunked_req is None
+            self.chunked_req = adder.new_chunked_req
+
+        if self.chunked_req:
+            self.chunked_req.is_chunked += 1
+
+        # Print stats
+        if self.current_scheduler_metrics_enabled():
+            self.log_prefill_stats(adder, can_run_list, running_bs)
+
+        # Create a new batch
+        new_batch = ScheduleBatch.init_new(
+            can_run_list,
+            self.req_to_token_pool,
+            self.token_to_kv_pool_allocator,
+            self.tree_cache,
+            self.model_config,
+            self.enable_overlap,
+            self.spec_algorithm,
+            chunked_req=self.chunked_req,
+        )
+        if self.enable_hierarchical_cache:
+            # todo (zhiqiang): disable cuda graph execution if hicache loading triggered
+            new_batch.hicache_consumer_index = (
+                self.tree_cache.ready_to_load_host_cache()
+            )
+
+        new_batch.prepare_for_extend()
+
+        # Mixed-style chunked prefill
+        if (
+            self.is_mixed_chunk
+            and not self.running_batch.is_empty()
+            and not (new_batch.return_logprob or self.running_batch.return_logprob)
+        ):
+            # TODO (lianmin): support return_logprob + mixed chunked prefill
+            self.running_batch.filter_batch()
+            if not self.running_batch.is_empty():
+                self.running_batch.prepare_for_decode()
+                new_batch.mix_with_running(self.running_batch)
+                new_batch.decoding_reqs = self.running_batch.reqs
+            self.running_batch = ScheduleBatch(
+                reqs=[], batch_is_full=self.running_batch.batch_is_full
+            )
+        else:
+            new_batch.decoding_reqs = None
+
+        return new_batch
+
+    def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
+        """Update the current running decoding batch."""
+        initial_bs = batch.batch_size()
+
+        batch.filter_batch()
+        if batch.is_empty():
+            batch.batch_is_full = False
+            return batch
+
+        # Check if decode out of memory
+        if not batch.check_decode_mem(self.decode_mem_cache_buf_multiplier) or (
+            TEST_RETRACT and batch.batch_size() > 10
+        ):
+            old_ratio = self.new_token_ratio
+
+            retracted_reqs, new_token_ratio = batch.retract_decode(self.server_args)
+            num_retracted_reqs = len(retracted_reqs)
+            self.new_token_ratio = new_token_ratio
+
+            logger.info(
+                "KV cache pool is full. Retract requests. "
+                f"#retracted_reqs: {num_retracted_reqs}, "
+                f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
+            )
+
+            self._extend_requests_to_queue(retracted_reqs, is_retracted=True)
+            self.total_retracted_reqs += num_retracted_reqs
+        else:
+            self.new_token_ratio = max(
+                self.new_token_ratio - self.new_token_ratio_decay,
+                self.min_new_token_ratio,
+            )
+
+        if batch.batch_size() < initial_bs:
+            batch.batch_is_full = False
+
+        # Update batch tensors
+        batch.prepare_for_decode()
+        return batch
+
+    def run_batch(
+        self, batch: ScheduleBatch
+    ) -> Union[GenerationBatchResult, EmbeddingBatchResult]:
+        """Run a batch."""
+        self.forward_ct += 1
+
+        # Whether to run the profiler
+        self._profile_batch_predicate(batch)
+        if self.forward_sleep_time is not None:
+            logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
+            time.sleep(self.forward_sleep_time)
+
+        # Run forward
+        if self.is_generation:
+            if self.spec_algorithm.is_none():
+                model_worker_batch = batch.get_model_worker_batch()
+
+                if self.pp_group.is_last_rank:
+                    logits_output, next_token_ids, can_run_cuda_graph = (
+                        self.tp_worker.forward_batch_generation(model_worker_batch)
+                    )
+                else:
+                    pp_hidden_states_proxy_tensors, _, can_run_cuda_graph = (
+                        self.tp_worker.forward_batch_generation(model_worker_batch)
+                    )
+                bid = model_worker_batch.bid
+            else:
+                (
+                    logits_output,
+                    next_token_ids,
+                    bid,
+                    num_accepted_tokens,
+                    can_run_cuda_graph,
+                ) = self.draft_worker.forward_batch_speculative_generation(batch)
+                bs = batch.batch_size()
+                self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
+                self.spec_num_total_forward_ct += bs
+                self.num_generated_tokens += num_accepted_tokens
+
+            if self.pp_group.is_last_rank:
+                batch.output_ids = next_token_ids
+
+            # These 2 values are needed for processing the output, but the values can be
+            # modified by overlap schedule. So we have to copy them here so that
+            # we can use the correct values in output processing.
+            if batch.return_logprob or self.spec_algorithm.is_eagle():
+                extend_input_len_per_req = [req.extend_input_len for req in batch.reqs]
+            else:
+                extend_input_len_per_req = None
+            if batch.return_logprob:
+                extend_logprob_start_len_per_req = [
+                    req.extend_logprob_start_len for req in batch.reqs
+                ]
+            else:
+                extend_logprob_start_len_per_req = None
+
+            ret = GenerationBatchResult(
+                logits_output=logits_output if self.pp_group.is_last_rank else None,
+                pp_hidden_states_proxy_tensors=(
+                    pp_hidden_states_proxy_tensors
+                    if not self.pp_group.is_last_rank
+                    else None
+                ),
+                next_token_ids=next_token_ids if self.pp_group.is_last_rank else None,
+                extend_input_len_per_req=extend_input_len_per_req,
+                extend_logprob_start_len_per_req=extend_logprob_start_len_per_req,
+                bid=bid,
+                can_run_cuda_graph=can_run_cuda_graph,
+            )
+        else:  # embedding or reward model
+            model_worker_batch = batch.get_model_worker_batch()
+            embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
+            ret = EmbeddingBatchResult(
+                embeddings=embeddings, bid=model_worker_batch.bid
+            )
+        return ret
+
+    def process_batch_result(
+        self,
+        batch: ScheduleBatch,
+        result: Union[GenerationBatchResult, EmbeddingBatchResult],
+        launch_done: Optional[threading.Event] = None,
+    ):
+        if batch.forward_mode.is_decode():
+            self.process_batch_result_decode(batch, result, launch_done)
+        elif batch.forward_mode.is_extend():
+            self.process_batch_result_prefill(batch, result, launch_done)
+        elif batch.forward_mode.is_idle():
+            if self.enable_overlap:
+                self.tp_worker.resolve_last_batch_result(launch_done)
+                self.set_next_batch_sampling_info_done(batch)
+        elif batch.forward_mode.is_dummy_first():
+            self.set_next_batch_sampling_info_done(batch)
+
+        self.maybe_send_health_check_signal()
+
+    def maybe_send_health_check_signal(self):
+        if self.return_health_check_ct:
+            # Return some signal for the health check.
+            # This is used to prevent the health check signal being blocked by long context prefill.
+            # However, one minor issue is that this code path does not check the status of detokenizer manager.
+            self.return_health_check_ct -= 1
+            self.send_to_tokenizer.send_pyobj(HealthCheckOutput())
+
+    def prepare_mlp_sync_batch(self, local_batch: ScheduleBatch):
+        return self.prepare_mlp_sync_batch_raw(
+            local_batch,
+            dp_size=self.server_args.dp_size,
+            attn_tp_size=self.attn_tp_size,
+            tp_group=self.tp_group,
+            get_idle_batch=self.get_idle_batch,
+            disable_cuda_graph=self.server_args.disable_cuda_graph,
+            spec_algorithm=self.spec_algorithm,
+            speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+            require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
+            disable_overlap_schedule=self.server_args.disable_overlap_schedule,
+        )
+
+    @staticmethod
+    def prepare_mlp_sync_batch_raw(
+        local_batch: ScheduleBatch,
+        dp_size,
+        attn_tp_size: int,
+        tp_group,
+        get_idle_batch,
+        disable_cuda_graph: bool,
+        spec_algorithm,
+        speculative_num_draft_tokens,
+        require_mlp_tp_gather: bool,
+        disable_overlap_schedule: bool,
+    ):
+        # Check if other DP workers have running batches
+        if local_batch is None:
+            num_tokens = 0
+            num_tokens_for_logprob = 0
+        elif local_batch.forward_mode.is_decode():
+            num_tokens = local_batch.batch_size()
+            num_tokens_for_logprob = num_tokens
+        else:
+            num_tokens = local_batch.extend_num_tokens
+            num_tokens_for_logprob = sum(
+                [
+                    # We should have at least 1 token for sample in every case.
+                    max(extend_len - logprob_start_len, 1)
+                    for logprob_start_len, extend_len in zip(
+                        local_batch.extend_logprob_start_lens, local_batch.extend_lens
+                    )
+                ]
+            )
+
+        if local_batch is None or local_batch.forward_mode.is_decode_or_idle():
+            can_cuda_graph = 1
+        else:
+            can_cuda_graph = 0
+
+        is_extend_in_batch = (
+            local_batch.forward_mode.is_extend() if local_batch else False
+        )
+
+        tbo_preparer = TboDPAttentionPreparer()
+        if disable_overlap_schedule:
+            group = tp_group.device_group
+            device = tp_group.device
+        else:
+            group = tp_group.cpu_group
+            device = "cpu"
+
+        local_info = torch.tensor(
+            [
+                num_tokens,
+                can_cuda_graph,
+                num_tokens_for_logprob,
+                is_extend_in_batch,
+                *tbo_preparer.prepare_all_gather(
+                    local_batch,
+                ),
+            ],
+            dtype=torch.int64,
+            device=device,
+        )
+        global_info = torch.empty(
+            (dp_size, attn_tp_size, 6),
+            dtype=torch.int64,
+            device=device,
+        )
+        torch.distributed.all_gather_into_tensor(
+            global_info.flatten(),
+            local_info,
+            group=group,
+        )
+        global_num_tokens = global_info[:, 0, 0].tolist()
+        can_cuda_graph = min(global_info[:, 0, 1].tolist())
+        global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
+        is_extend_in_batch = global_info[:, 0, 3].tolist()
+
+        tbo_split_seq_index, global_forward_mode = tbo_preparer.compute_output(
+            global_info[:, :, 4:6]
+        )
+
+        if local_batch is None and max(global_num_tokens) > 0:
+            local_batch = get_idle_batch()
+
+        if local_batch is not None:
+            # TODO: handle the case when moe_dense_tp_size != 1
+            if not require_mlp_tp_gather:
+                local_batch.global_num_tokens = [num_tokens]
+                local_batch.global_num_tokens_for_logprob = [num_tokens_for_logprob]
+            else:
+                local_batch.global_num_tokens = global_num_tokens
+                local_batch.global_num_tokens_for_logprob = (
+                    global_num_tokens_for_logprob
+                )
+            local_batch.is_extend_in_batch = any(is_extend_in_batch)
+            local_batch.tbo_split_seq_index = tbo_split_seq_index
+            local_batch.global_forward_mode = global_forward_mode
+
+            # Check forward mode for cuda graph
+            if not disable_cuda_graph:
+                local_batch.can_run_dp_cuda_graph = can_cuda_graph
+
+        return local_batch
+
+    def get_idle_batch(self):
+        idle_batch = ScheduleBatch.init_new(
+            [],
+            self.req_to_token_pool,
+            self.token_to_kv_pool_allocator,
+            self.tree_cache,
+            self.model_config,
+            self.enable_overlap,
+            self.spec_algorithm,
+        )
+        idle_batch.prepare_for_idle()
+        return idle_batch
+
+    def move_ready_grammar_requests(self):
+        """Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""
+
+        num_ready_reqs = 0
+        num_timeout_reqs = 0
+        for req in self.grammar_queue:
+            try:
+                if req.finished():  # It is aborted by AbortReq
+                    num_ready_reqs += 1
+                    continue
+                req.grammar = req.grammar.result(timeout=0.03)
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
+                num_ready_reqs += 1
+            except futures._base.TimeoutError:
+                req.grammar_wait_ct += 1
+                # NOTE(lianmin): this timeout is the waiting time of the above line. It is
+                # not the waiting time from it enters the grammar queue.
+                if req.grammar_wait_ct > GRAMMAR_TIMEOUT / 0.03:
+                    num_timeout_reqs = 1
+                break
+
+        if self.server_args.enable_dp_attention:
+            tp_size = self.attn_tp_size
+            tp_group = self.attn_tp_cpu_group
+        else:
+            tp_size = self.tp_size
+            tp_group = self.tp_cpu_group
+
+        if tp_size > 1:
+            # Sync across TP ranks to make sure they have the same number of ready requests
+            tensor = torch.tensor([num_ready_reqs, num_timeout_reqs], dtype=torch.int32)
+            torch.distributed.all_reduce(
+                tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
+            )
+            num_ready_reqs_max, num_timeout_reqs_max = tensor.tolist()
+
+            for i in range(num_ready_reqs, num_ready_reqs_max):
+                req = self.grammar_queue[i]
+                if req.finished():  # It is aborted by AbortReq
+                    continue
+                req.grammar = req.grammar.result()
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
+        else:
+            num_ready_reqs_max = num_ready_reqs
+            num_timeout_reqs_max = num_timeout_reqs
+
+        for i in range(num_ready_reqs, num_ready_reqs + num_timeout_reqs_max):
+            req = self.grammar_queue[i]
+            req.grammar.cancel()
+            error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
+            req.set_finish_with_abort(error_msg)
+            self.grammar_backend.set_cache(req.grammar_key, INVALID_GRAMMAR_OBJ)
+        num_ready_reqs = num_ready_reqs_max + num_timeout_reqs_max
+
+        self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
+        self.grammar_queue = self.grammar_queue[num_ready_reqs:]
+
+    def set_next_batch_sampling_info_done(self, batch: ScheduleBatch):
+        if batch.next_batch_sampling_info:
+            if batch.next_batch_sampling_info.grammars is not None:
+                batch.next_batch_sampling_info.update_regex_vocab_mask()
+                self.current_stream.synchronize()
+            batch.next_batch_sampling_info.sampling_info_done.set()
+
+    def watchdog_thread(self):
+        """A watch dog thread that will try to kill the server itself if one forward batch takes too long."""
+        self.watchdog_last_forward_ct = 0
+        self.watchdog_last_time = time.perf_counter()
+
+        while True:
+            current = time.perf_counter()
+            if self.cur_batch is not None:
+                if self.watchdog_last_forward_ct == self.forward_ct:
+                    if current > self.watchdog_last_time + self.watchdog_timeout:
+                        break
+                else:
+                    self.watchdog_last_forward_ct = self.forward_ct
+                    self.watchdog_last_time = current
+            time.sleep(self.watchdog_timeout // 2)
+
+        if not disable_request_logging():
+            # Print batch size and memory pool info to check whether there are de-sync issues.
+            if self.is_hybrid:
+                (
+                    _,
+                    _,
+                    _,
+                    _,
+                    full_available_size,
+                    full_evictable_size,
+                    swa_available_size,
+                    swa_evictable_size,
+                ) = self._get_swa_token_info()
+                info_msg = (
+                    f"{full_available_size=}, "
+                    f"{full_evictable_size=}, "
+                    f"{swa_available_size=}, "
+                    f"{swa_evictable_size=}, "
+                )
+            else:
+                _, _, available_size, evictable_size = self._get_token_info()
+                info_msg = f"{available_size=}, " f"{evictable_size=}, "
+            logger.error(
+                f"{self.cur_batch.batch_size()=}, "
+                f"{self.cur_batch.reqs=}, "
+                f"{info_msg}"
+            )
+
+        pyspy_dump_schedulers()
+        logger.error(f"Watchdog timeout ({self.watchdog_timeout=})")
+        print(file=sys.stderr, flush=True)
+        print(file=sys.stdout, flush=True)
+
+        # Wait for some time so that the parent process can print the error.
+        time.sleep(5)
+        self.parent_process.send_signal(signal.SIGQUIT)
+
+    def flush_cache_wrapped(self, recv_req: FlushCacheReqInput):
+        success = self.flush_cache()
+        return FlushCacheReqOutput(success=success)
+
+    def clear_hicache_storage_wrapped(self, recv_req: ClearHiCacheReqInput):
+        if self.enable_hierarchical_cache:
+            self.tree_cache.clear_storage_backend()
+            logger.info("Hierarchical cache cleared successfully!")
+            if_success = True
+        else:
+            logging.warning("Hierarchical cache is not enabled.")
+            if_success = False
+        return ClearHiCacheReqOutput(success=if_success)
+
+    def flush_cache(self):
+        """Flush the memory pool and cache."""
+        if (
+            len(self.waiting_queue) == 0
+            and self.running_batch.is_empty()
+            and (self.pp_size == 1 or all(x.is_empty() for x in self.running_mbs))
+        ):
+            self.cur_batch = None
+            self.last_batch = None
+            self.tree_cache.reset()
+            if self.grammar_backend:
+                self.grammar_backend.reset()
+            self.req_to_token_pool.clear()
+            self.token_to_kv_pool_allocator.clear()
+
+            if not self.spec_algorithm.is_none():
+                self.draft_worker.model_runner.req_to_token_pool.clear()
+                self.draft_worker.model_runner.token_to_kv_pool_allocator.clear()
+
+            self.num_generated_tokens = 0
+            self.forward_ct_decode = 0
+            self.spec_num_total_accepted_tokens = 0
+            self.spec_num_total_forward_ct = 0
+            self.cum_spec_accept_length = 0
+            self.cum_spec_accept_count = 0
+            torch.cuda.empty_cache()
+            logger.info("Cache flushed successfully!")
+            if_success = True
+        else:
+            logging.warning(
+                f"Cache not flushed because there are pending requests. "
+                f"#queue-req: {len(self.waiting_queue)}, "
+                f"#running-req: {len(self.running_batch.reqs)}"
+            )
+            if_success = False
+        return if_success
+
+    def get_load(self):
+        # TODO(lsyin): use dynamically maintained num_waiting_tokens
+        if self.is_hybrid:
+            load_full = (
+                self.full_tokens_per_layer
+                - self.token_to_kv_pool_allocator.full_available_size()
+                - self.tree_cache.full_evictable_size()
+            )
+            load_swa = (
+                self.swa_tokens_per_layer
+                - self.token_to_kv_pool_allocator.swa_available_size()
+                - self.tree_cache.swa_evictable_size()
+            )
+            load = max(load_full, load_swa)
+        else:
+            load = (
+                self.max_total_num_tokens
+                - self.token_to_kv_pool_allocator.available_size()
+                - self.tree_cache.evictable_size()
+            )
+        load += sum(len(req.origin_input_ids) for req in self.waiting_queue)
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            load += sum(
+                len(req.origin_input_ids)
+                for req in self.disagg_prefill_bootstrap_queue.queue
+            )
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            load += sum(
+                len(req.req.origin_input_ids)
+                for req in self.disagg_decode_prealloc_queue.queue
+            )
+
+        return load
+
+    def get_internal_state(self, recv_req: GetInternalStateReq):
+        ret = dict(global_server_args_dict)
+        ret["last_gen_throughput"] = self.last_gen_throughput
+        ret["memory_usage"] = {
+            "weight": round(
+                self.tp_worker.worker.model_runner.weight_load_mem_usage, 2
+            ),
+            "kvcache": round(
+                self.token_to_kv_pool_allocator.get_kvcache().mem_usage, 2
+            ),
+            "token_capacity": int(self.max_total_num_tokens),
+        }
+
+        ret["memory_usage"]["graph"] = round(
+            self.tp_worker.worker.model_runner.graph_mem_usage, 2
+        )
+
+        if not self.spec_algorithm.is_none() and self.cum_spec_accept_count > 0:
+            ret["avg_spec_accept_length"] = (
+                self.cum_spec_accept_length / self.cum_spec_accept_count
+            )
+        if RECORD_STEP_TIME:
+            ret["step_time_dict"] = self.step_time_dict
+
+        ret["load"] = self.get_load()
+
+        return GetInternalStateReqOutput(internal_state=ret)
+
+    def set_internal_state(self, recv_req: SetInternalStateReq):
+        server_args_dict = recv_req.server_args
+        args_allow_update = set(
+            [
+                "max_micro_batch_size",
+                "speculative_accept_threshold_single",
+                "speculative_accept_threshold_acc",
+            ]
+        )
+        if_success = True
+        for k, v in server_args_dict.items():
+            if k not in args_allow_update:
+                logging.warning(f"Updating {k} is not supported.")
+                if_success = False
+                break
+            elif k == "max_micro_batch_size" and (
+                v > self.max_running_requests // self.pp_size or v < 1
+            ):
+                logging.warning(
+                    f"Updating {k} to {v} is rejected because it is out of the valid range [1, {self.max_running_requests // self.pp_size}]."
+                )
+                if_success = False
+                break
+        if if_success:
+            if not self.spec_algorithm.is_none() and self.cum_spec_accept_count > 0:
+                avg_spec_accept_length = (
+                    self.cum_spec_accept_length / self.cum_spec_accept_count
+                )
+                logger.info(f"{avg_spec_accept_length=}")
+            self.cum_spec_accept_length = self.cum_spec_accept_count = 0
+            for k, v in server_args_dict.items():
+                global_server_args_dict[k] = v
+            logger.info(f"Global server args updated! {global_server_args_dict=}")
+        return SetInternalStateReqOutput(
+            updated=True,
+            server_args=global_server_args_dict,
+        )
+
+    def handle_rpc_request(self, recv_req: RpcReqInput):
+        # Handle RPC requests
+        logger.info(
+            f"handle_rpc_request: {recv_req.method}, param: {recv_req.parameters}"
+        )
+
+        success = True
+        exec = None
+        try:
+            func = getattr(self, recv_req.method)
+            func(recv_req.parameters)
+        except Exception as e:
+            success = False
+            exec = e
+            logger.error(f"Failed to call rpc {recv_req.method}: {str(e)}")
+
+        barrier()
+        return RpcReqOutput(success, "" if not exec else str(exec))
+
+    def abort_request(self, recv_req: AbortReq):
+        # Delete requests in the waiting queue
+        to_del = []
+        for i, req in enumerate(self.waiting_queue):
+            if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+                to_del.append(i)
+
+        # Sort in reverse order to avoid index issues when deleting
+        for i in reversed(to_del):
+            # Abort method 1: directly pop from the queue
+            # This only works for requests that have not started anything.
+            # We still need to send something back to TokenizerManager to clean up the state.
+            req = self.waiting_queue.pop(i)
+            if self.enable_hicache_storage:
+                # to release prefetch events associated with the request
+                self.tree_cache.release_aborted_request(req.rid)
+            self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
+            # For disaggregation decode mode, the request in the waiting queue has KV cache allocated.
+            if self.disaggregation_mode == DisaggregationMode.DECODE:
+                self.tree_cache.cache_finished_req(req)
+
+            logger.debug(f"Abort queued request. {req.rid=}")
+
+        # Delete the requests in the grammar queue
+        for req in self.grammar_queue:
+            # Abort method 2: call `set_finish_with_abort`
+            # The request will still run one prefill forward pass.
+            # In this case, we change the input_ids to be only one token to make this prefill cheap.
+            if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+                logger.debug(f"Abort grammar queue request. {req.rid=}")
+                if req.grammar:
+                    req.grammar.cancel()
+                req.set_finish_with_abort("Aborted by AbortReq.")
+
+        # Delete requests not in the waiting queue when PD disaggregation is enabled
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            # Abort requests that have not yet been bootstrapped
+            for i, req in enumerate(self.disagg_prefill_bootstrap_queue.queue):
+                logger.debug(f"Abort bootstrap queue request. {req.rid=}")
+                if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+                    if hasattr(req.disagg_kv_sender, "abort"):
+                        req.disagg_kv_sender.abort()
+
+            # Abort in-flight requests
+            for i, req in enumerate(self.disagg_prefill_inflight_queue):
+                logger.debug(f"Abort inflight queue request. {req.rid=}")
+                if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+                    if hasattr(req.disagg_kv_sender, "abort"):
+                        req.disagg_kv_sender.abort()
+
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            # Abort requests that have not yet finished preallocation
+            for i, decode_req in enumerate(self.disagg_decode_prealloc_queue.queue):
+                logger.debug(f"Abort prealloc queue request. {decode_req.req.rid=}")
+                if recv_req.abort_all or decode_req.req.rid.startswith(recv_req.rid):
+                    if hasattr(decode_req.kv_receiver, "abort"):
+                        decode_req.kv_receiver.abort()
+
+            # Abort requests waiting for kvcache to release tree cache
+            for i, decode_req in enumerate(self.disagg_decode_transfer_queue.queue):
+                logger.debug(f"Abort transfer queue request. {decode_req.req.rid=}")
+                if recv_req.abort_all or decode_req.req.rid.startswith(recv_req.rid):
+                    if hasattr(decode_req.kv_receiver, "abort"):
+                        decode_req.kv_receiver.abort()
+
+        # Delete requests in the running batch
+        if self.cur_batch is self.running_batch or self.cur_batch is None:
+            reqs = self.running_batch.reqs
+        else:
+            reqs = self.running_batch.reqs + self.cur_batch.reqs
+
+        for req in reqs:
+            if not req.finished() and (
+                recv_req.abort_all or req.rid.startswith(recv_req.rid)
+            ):
+                # Abort method 3: set `to_abort=True`
+                # The request will still run one decode forward pass.
+                # Then we reuse all existing code to clean up the KV cache allocation.
+                logger.debug(f"Abort running request. {req.rid=}")
+                req.to_abort = True
+
+    def _pause_engine(self) -> Tuple[List[Req], int]:
+        raise NotImplementedError()
+
+    def load_lora_adapter(
+        self, recv_req: LoadLoRAAdapterReqInput
+    ) -> LoadLoRAAdapterReqOutput:
+        """In-place loading a new lora adapter from disk or huggingface."""
+
+        result = self.tp_worker.load_lora_adapter(recv_req)
+        return result
+
+    def unload_lora_adapter(
+        self, recv_req: UnloadLoRAAdapterReqInput
+    ) -> UnloadLoRAAdapterReqOutput:
+        """Unload the lora adapter."""
+
+        result = self.tp_worker.unload_lora_adapter(recv_req)
+        return result
+
+    def register_multi_tokenizer(self, recv_req: MultiTokenizerRegisterReq):
+        self.send_to_detokenizer.send_pyobj(recv_req)
+        return recv_req
+
+    def slow_down(self, recv_req: SlowDownReqInput):
+        t = recv_req.forward_sleep_time
+        if t is not None and t <= 0:
+            t = None
+        self.forward_sleep_time = t
+        return SlowDownReqOutput()
+
+    def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
+        if recv_req == ExpertDistributionReq.START_RECORD:
+            get_global_expert_distribution_recorder().start_record()
+        elif recv_req == ExpertDistributionReq.STOP_RECORD:
+            get_global_expert_distribution_recorder().stop_record()
+        elif recv_req == ExpertDistributionReq.DUMP_RECORD:
+            get_global_expert_distribution_recorder().dump_record()
+        else:
+            raise ValueError(f"Unrecognized ExpertDistributionReq value: {recv_req=}")
+        return ExpertDistributionReqOutput()
+
+    def open_session(self, recv_req: OpenSessionReqInput):
+        # handle error
+        session_id = recv_req.session_id
+        if session_id in self.sessions:
+            logger.warning(f"session id {session_id} already exist, cannot open.")
+            return OpenSessionReqOutput(session_id, False)
+        elif session_id is None:
+            logger.warning("session id is None, cannot open.")
+            return OpenSessionReqOutput(session_id, False)
+        else:
+            self.sessions[session_id] = Session(
+                recv_req.capacity_of_str_len, session_id
+            )
+            return OpenSessionReqOutput(session_id, True)
+
+    def close_session(self, recv_req: CloseSessionReqInput):
+        # handle error
+        session_id = recv_req.session_id
+        if session_id not in self.sessions:
+            logger.warning(f"session id {session_id} does not exist, cannot delete.")
+        else:
+            del self.sessions[session_id]
+
+    def get_print_prefix(self):
+        prefix = ""
+        if self.attn_dp_rank is not None:
+            prefix += f" DP{self.attn_dp_rank}"
+        if self.server_args.tp_size > 1:
+            prefix += f" TP{self.tp_rank}"
+        if self.pp_size > 1:
+            prefix += f" PP{self.pp_rank}"
+        return prefix
+
+    def current_scheduler_metrics_enabled(self):
+        return self.attn_tp_rank == 0 or self.enable_metrics_for_all_schedulers
+
+    def maybe_sleep_on_idle(self):
+        if self.idle_sleeper is not None:
+            self.idle_sleeper.maybe_sleep()
+
+    def handle_freeze_gc(self, recv_req: FreezeGCReq):
+        """Handle freeze_gc request: freeze scheduler's GC and forward to detokenizer."""
+        freeze_gc("Scheduler")
+        self.send_to_detokenizer.send_pyobj(recv_req)
+        return None
+
+
+class IdleSleeper:
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when sglang does nothing. This would lead not only
+    to power savings, but also to more CPU thermal headroom when a request
+    eventually comes. This is important in cases when multiple GPUs are connected
+    as each GPU would otherwise pin one thread at 100% CPU usage.
+
+    The simplest solution is to use zmq.Poller on all sockets that may receive
+    data that needs handling immediately.
+    """
+
+    def __init__(self, sockets):
+        self.poller = zmq.Poller()
+        self.last_empty_time = time.time()
+        for s in sockets:
+            self.poller.register(s, zmq.POLLIN)
+
+    def maybe_sleep(self):
+        self.poller.poll(1000)
+        if (
+            global_config.torch_empty_cache_interval > 0
+            and time.time() - self.last_empty_time
+            > global_config.torch_empty_cache_interval
+        ):
+            self.last_empty_time = time.time()
+            torch.cuda.empty_cache()
+
+
+def is_health_check_generate_req(recv_req):
+    return getattr(recv_req, "rid", "").startswith("HEALTH_CHECK")
+
+
+def is_work_request(recv_req):
+    return isinstance(
+        recv_req,
+        (
+            TokenizedGenerateReqInput,
+            TokenizedEmbeddingReqInput,
+            BatchTokenizedGenerateReqInput,
+            BatchTokenizedEmbeddingReqInput,
+        ),
+    )
+
+
+def run_scheduler_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    gpu_id: int,
+    tp_rank: int,
+    moe_ep_rank: int,
+    pp_rank: int,
+    dp_rank: Optional[int],
+    pipe_writer,
+    balance_meta: Optional[DPBalanceMeta] = None,
+):
+    if (numa_node := server_args.numa_node) is not None:
+        numa_bind_to_node(numa_node[gpu_id])
+
+    # Generate the prefix
+    prefix = ""
+    if dp_rank is not None:
+        prefix += f" DP{dp_rank}"
+    if server_args.tp_size > 1:
+        prefix += f" TP{tp_rank}"
+    if server_args.ep_size > 1:
+        prefix += f" EP{moe_ep_rank}"
+    if server_args.pp_size > 1:
+        prefix += f" PP{pp_rank}"
+
+    # Config the process
+    setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}")
+    faulthandler.enable()
+    kill_itself_when_parent_died()
+    parent_process = psutil.Process().parent()
+
+    # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
+    if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
+        dp_rank = int(os.environ["SGLANG_DP_RANK"])
+
+    # Configure the logger
+    configure_logger(server_args, prefix=prefix)
+    suppress_other_loggers()
+
+    # Set cpu affinity to this gpu process
+    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
+        set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
+
+    # Create a scheduler and run the event loop
+    try:
+        scheduler = Scheduler(
+            server_args,
+            port_args,
+            gpu_id,
+            tp_rank,
+            moe_ep_rank,
+            pp_rank,
+            dp_rank,
+            dp_balance_meta=balance_meta,
+        )
+        pipe_writer.send(
+            {
+                "status": "ready",
+                "max_total_num_tokens": scheduler.max_total_num_tokens,
+                "max_req_input_len": scheduler.max_req_input_len,
+            }
+        )
+
+        disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode
+        if disaggregation_mode == DisaggregationMode.NULL:
+            if server_args.pp_size > 1:
+                scheduler.event_loop_pp()
+            elif scheduler.enable_overlap:
+                scheduler.event_loop_overlap()
+            else:
+                scheduler.event_loop_normal()
+        elif disaggregation_mode == DisaggregationMode.PREFILL:
+            if scheduler.enable_overlap:
+                scheduler.event_loop_overlap_disagg_prefill()
+            else:
+                if server_args.pp_size > 1:
+                    scheduler.event_loop_pp_disagg_prefill()
+                else:
+                    scheduler.event_loop_normal_disagg_prefill()
+
+        elif disaggregation_mode == DisaggregationMode.DECODE:
+            if scheduler.enable_overlap:
+                scheduler.event_loop_overlap_disagg_decode()
+            else:
+                scheduler.event_loop_normal_disagg_decode()
+
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"Scheduler hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/python/sglang/srt/managers/scheduler_input_blocker.py b/python/sglang/srt/managers/scheduler_input_blocker.py
new file mode 100644
index 000000000..60ae8d5d6
--- /dev/null
+++ b/python/sglang/srt/managers/scheduler_input_blocker.py
@@ -0,0 +1,106 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+from contextlib import contextmanager
+from enum import Enum, auto
+from typing import Any, List, Optional
+
+from sglang.srt.managers.io_struct import BlockReqInput, BlockReqType
+from sglang.srt.poll_based_barrier import PollBasedBarrier
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerInputBlocker:
+    def __init__(self, noop: bool):
+        self._state = _State.UNBLOCKED
+        self._pending_reqs = []
+        self._noop = noop
+        self._global_unblock_barrier = PollBasedBarrier(noop=noop)
+
+    def handle(self, recv_reqs: Optional[List[Any]]):
+        assert (recv_reqs is None) == self._noop
+
+        if not self._noop:
+            output_reqs = []
+            for recv_req in recv_reqs:
+                output_reqs += self._handle_recv_req(recv_req)
+
+        global_arrived_unblock_barrier = (
+            self._global_unblock_barrier.poll_global_arrived()
+        )
+        if (
+            self._state == _State.GLOBAL_UNBLOCK_BARRIER
+            and global_arrived_unblock_barrier
+        ):
+            output_reqs += self._handle_arrive_unblock_barrier()
+
+        if not self._noop:
+            return output_reqs
+
+    def _handle_recv_req(self, recv_req):
+        if isinstance(recv_req, BlockReqInput):
+            if recv_req.type == BlockReqType.BLOCK:
+                self._execute_block_req()
+                return []
+            elif recv_req.type == BlockReqType.UNBLOCK:
+                self._execute_unblock_req()
+                return []
+            else:
+                raise NotImplementedError(f"{recv_req=}")
+        else:
+            if self._state == _State.UNBLOCKED:
+                return [recv_req]
+            else:
+                self._pending_reqs.append(recv_req)
+                return []
+
+    def _execute_block_req(self):
+        logger.info("Handle block req")
+        self._change_state(original=_State.UNBLOCKED, target=_State.BLOCKED)
+
+    def _execute_unblock_req(self):
+        logger.info("Handle unblock req")
+        self._change_state(
+            original=_State.BLOCKED, target=_State.GLOBAL_UNBLOCK_BARRIER
+        )
+        self._global_unblock_barrier.local_arrive()
+
+    def _handle_arrive_unblock_barrier(self):
+        logger.info(f"Arrived at unblock barrier ({len(self._pending_reqs)=})")
+        self._change_state(
+            original=_State.GLOBAL_UNBLOCK_BARRIER, target=_State.UNBLOCKED
+        )
+        output_reqs = [*self._pending_reqs]
+        self._pending_reqs.clear()
+        return output_reqs
+
+    def _change_state(self, original: "_State", target: "_State"):
+        assert self._state == original, f"{self._state=} {original=} {target=}"
+        self._state = target
+
+
+class _State(Enum):
+    UNBLOCKED = auto()
+    BLOCKED = auto()
+    GLOBAL_UNBLOCK_BARRIER = auto()
+
+
+@contextmanager
+def input_blocker_guard_region(send_to_scheduler):
+    send_to_scheduler.send_pyobj(BlockReqInput(BlockReqType.BLOCK))
+    try:
+        yield
+    finally:
+        send_to_scheduler.send_pyobj(BlockReqInput(BlockReqType.UNBLOCK))
diff --git a/python/sglang/srt/managers/scheduler_metrics_mixin.py b/python/sglang/srt/managers/scheduler_metrics_mixin.py
new file mode 100644
index 000000000..3d8572e34
--- /dev/null
+++ b/python/sglang/srt/managers/scheduler_metrics_mixin.py
@@ -0,0 +1,350 @@
+from __future__ import annotations
+
+import logging
+import time
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
+
+import torch
+
+from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
+from sglang.srt.managers.schedule_policy import PrefillAdder
+from sglang.srt.managers.scheduler import Req, ScheduleBatch
+from sglang.srt.managers.utils import DPBalanceMeta
+from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
+from sglang.srt.utils import get_bool_env_var
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import Scheduler
+
+logger = logging.getLogger(__name__)
+
+RECORD_STEP_TIME = get_bool_env_var("SGLANG_RECORD_STEP_TIME")
+
+
+class KvMetrics:
+    def __init__(self):
+        self.request_active_slots = None
+        self.request_total_slots = None
+        self.kv_active_blocks = None
+        self.kv_total_blocks = None
+        self.num_requests_waiting = None
+        self.gpu_cache_usage_perc = None
+        self.gpu_prefix_cache_hit_rate = None
+        self.data_parallel_rank = None
+
+
+class SchedulerMetricsMixin:
+    def init_metrics(
+        self: Scheduler, tp_rank: int, pp_rank: int, dp_rank: Optional[int]
+    ):
+        self.last_gen_throughput: float = 0.0
+        self.last_input_throughput: float = 0.0
+        self.step_time_dict = defaultdict(list)  # Dict[batch size -> step time]
+        self.spec_num_total_accepted_tokens = 0
+        self.spec_num_total_forward_ct = 0
+        self.cum_spec_accept_length = 0
+        self.cum_spec_accept_count = 0
+        self.total_retracted_reqs = 0
+        self.stats = SchedulerStats()
+        if self.enable_metrics:
+            engine_type = "unified"
+            labels = {
+                "model_name": self.server_args.served_model_name,
+                "engine_type": engine_type,
+                "tp_rank": tp_rank,
+                "pp_rank": pp_rank,
+            }
+            if dp_rank is not None:
+                labels["dp_rank"] = dp_rank
+            self.metrics_collector = SchedulerMetricsCollector(labels=labels)
+
+    def init_dp_balance(self: Scheduler, dp_balance_meta: Optional[DPBalanceMeta]):
+        self.balance_meta = dp_balance_meta
+        if (
+            self.server_args.enable_dp_attention
+            and self.server_args.load_balance_method == "minimum_tokens"
+        ):
+            assert dp_balance_meta is not None
+
+        self.recv_dp_balance_id_this_term = []
+
+    def init_kv_events(self: Scheduler, kv_events_config: Optional[str]):
+        if self.enable_kv_cache_events:
+            self.kv_event_publisher = EventPublisherFactory.create(
+                kv_events_config, self.attn_dp_rank
+            )
+
+    def log_prefill_stats(
+        self: Scheduler,
+        adder: PrefillAdder,
+        can_run_list: List[Req],
+        running_bs: int,
+    ):
+        gap_latency = time.perf_counter() - self.last_prefill_stats_tic
+        self.last_prefill_stats_tic = time.perf_counter()
+        self.last_input_throughput = self.last_prefill_tokens / gap_latency
+        self.last_prefill_tokens = adder.log_input_tokens
+
+        if self.is_hybrid:
+            (
+                full_num_used,
+                swa_num_used,
+                full_token_usage,
+                swa_token_usage,
+                _,
+                _,
+                _,
+                _,
+            ) = self._get_swa_token_info()
+            num_used = max(full_num_used, swa_num_used)
+            token_usage = max(full_token_usage, swa_token_usage)
+            token_msg = (
+                f"full token usage: {full_token_usage:.2f}, "
+                f"swa token usage: {swa_token_usage:.2f}, "
+            )
+        else:
+            num_used, token_usage, _, _ = self._get_token_info()
+            token_msg = f"token usage: {token_usage:.2f}, "
+
+        num_new_seq = len(can_run_list)
+        f = (
+            f"Prefill batch. "
+            f"#new-seq: {num_new_seq}, "
+            f"#new-token: {adder.log_input_tokens}, "
+            f"#cached-token: {adder.log_hit_tokens}, "
+            f"{token_msg}"
+        )
+
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
+            f += f"#queue-req: {len(self.waiting_queue)}, "
+            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
+            f += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
+        else:
+            f += f"#running-req: {running_bs}, "
+            f += f"#queue-req: {len(self.waiting_queue)}, "
+
+        logger.info(f)
+
+        if self.enable_metrics:
+            total_tokens = adder.log_input_tokens + adder.log_hit_tokens
+
+            cache_hit_rate = (
+                adder.log_hit_tokens / total_tokens if total_tokens > 0 else 0.0
+            )
+            self.stats.num_running_reqs = running_bs
+            self.stats.num_used_tokens = num_used
+            self.stats.token_usage = round(token_usage, 2)
+            self.stats.num_queue_reqs = len(self.waiting_queue)
+            self.stats.cache_hit_rate = cache_hit_rate
+
+            total_queue_latency = 0
+            for req in can_run_list:
+                total_queue_latency += req.queue_time_end - req.queue_time_start
+            self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
+
+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                self.stats.num_prefill_prealloc_queue_reqs = len(
+                    self.disagg_prefill_bootstrap_queue.queue
+                )
+                self.stats.num_prefill_inflight_queue_reqs = len(
+                    self.disagg_prefill_inflight_queue
+                )
+
+            self.metrics_collector.log_stats(self.stats)
+            self._emit_kv_metrics()
+        self._publish_kv_events()
+
+    def log_decode_stats(
+        self: Scheduler, can_run_cuda_graph: bool, running_batch: ScheduleBatch = None
+    ):
+        batch = running_batch or self.running_batch
+
+        gap_latency = time.perf_counter() - self.last_decode_stats_tic
+        self.last_decode_stats_tic = time.perf_counter()
+        self.last_gen_throughput = self.num_generated_tokens / gap_latency
+        self.num_generated_tokens = 0
+        num_running_reqs = len(batch.reqs)
+        if self.is_hybrid:
+            (
+                full_num_used,
+                swa_num_used,
+                full_token_usage,
+                swa_token_usage,
+                _,
+                _,
+                _,
+                _,
+            ) = self._get_swa_token_info()
+            num_used = max(full_num_used, swa_num_used)
+            token_usage = max(full_token_usage, swa_token_usage)
+            token_msg = (
+                f"#full token: {full_num_used}, "
+                f"full token usage: {full_token_usage:.2f}, "
+                f"#swa token: {swa_num_used}, "
+                f"swa token usage: {swa_token_usage:.2f}, "
+            )
+        else:
+            num_used, token_usage, _, _ = self._get_token_info()
+            token_msg = f"#token: {num_used}, " f"token usage: {token_usage:.2f}, "
+
+        if RECORD_STEP_TIME:
+            self.step_time_dict[num_running_reqs].append(
+                gap_latency / self.server_args.decode_log_interval
+            )
+
+        msg = f"Decode batch. #running-req: {num_running_reqs}, {token_msg}"
+
+        if self.spec_algorithm.is_none():
+            spec_accept_length = 0
+        else:
+            spec_accept_length = (
+                self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
+            )
+            self.cum_spec_accept_length += self.spec_num_total_accepted_tokens
+            self.cum_spec_accept_count += self.spec_num_total_forward_ct
+            self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
+            msg += f"accept len: {spec_accept_length:.2f}, "
+
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            msg += f"pre-allocated usage: {self.disagg_decode_prealloc_queue.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
+            msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "
+
+        msg += (
+            f"{'cpu graph' if self.device == 'cpu' else 'cuda graph'}: {can_run_cuda_graph}, "
+            f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
+            f"#queue-req: {len(self.waiting_queue)}, "
+        )
+
+        logger.info(msg)
+        if self.enable_metrics:
+            self.stats.num_running_reqs = num_running_reqs
+            self.stats.num_used_tokens = num_used
+            self.stats.token_usage = round(token_usage, 2)
+            self.stats.cache_hit_rate = 0.0
+            self.stats.gen_throughput = self.last_gen_throughput
+            self.stats.num_queue_reqs = len(self.waiting_queue)
+            self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
+            self.stats.spec_accept_length = spec_accept_length
+            self.stats.total_retracted_reqs = self.total_retracted_reqs
+            self.metrics_collector.log_stats(self.stats)
+            if self.disaggregation_mode == DisaggregationMode.DECODE:
+                self.stats.num_decode_prealloc_queue_reqs = len(
+                    self.disagg_decode_prealloc_queue.queue
+                )
+                self.stats.num_decode_transfer_queue_reqs = len(
+                    self.disagg_decode_transfer_queue.queue
+                )
+            self._emit_kv_metrics()
+        self._publish_kv_events()
+
+    def _emit_kv_metrics(self: Scheduler):
+        kv_metrics = KvMetrics()
+        kv_metrics.request_active_slots = self.stats.num_running_reqs
+        kv_metrics.request_total_slots = self.max_running_requests
+        kv_metrics.kv_active_blocks = int(
+            self.stats.token_usage * self.max_total_num_tokens
+        )
+        kv_metrics.kv_total_blocks = self.max_total_num_tokens
+        kv_metrics.num_requests_waiting = self.stats.num_queue_reqs
+        kv_metrics.gpu_cache_usage_perc = self.stats.token_usage
+        kv_metrics.gpu_prefix_cache_hit_rate = self.stats.cache_hit_rate
+        kv_metrics.data_parallel_rank = self.dp_rank if self.dp_rank is not None else 0
+
+        if not self.send_metrics_from_scheduler.closed:
+            self.send_metrics_from_scheduler.send_pyobj(kv_metrics)
+
+    def _publish_kv_events(self: Scheduler):
+        if self.enable_kv_cache_events:
+            events = self.tree_cache.take_events()
+            if events:
+                batch = KVEventBatch(ts=time.time(), events=events)
+                self.kv_event_publisher.publish(batch)
+
+    def maybe_update_dp_balance_data(
+        self: Scheduler, recv_req: TokenizedGenerateReqInput
+    ):
+        if (
+            self.server_args.enable_dp_attention
+            and self.server_args.load_balance_method == "minimum_tokens"
+        ):
+            self.recv_dp_balance_id_this_term.append(recv_req.dp_balance_id)
+
+    def maybe_handle_dp_balance_data(self: Scheduler):
+        if (
+            self.server_args.load_balance_method == "minimum_tokens"
+            and self.forward_ct % 40 == 0
+        ):
+            holding_tokens = self.get_load()
+
+            new_recv_dp_balance_id_list, holding_token_list = (
+                self.gather_dp_balance_info(holding_tokens)
+            )
+
+            self.recv_dp_balance_id_this_term.clear()
+            if self.tp_rank == 0:  # only first worker write info
+                self.write_shared_dp_balance_info(
+                    new_recv_dp_balance_id_list, holding_token_list
+                )
+
+    def gather_dp_balance_info(
+        self: Scheduler, holding_tokens_list
+    ) -> Union[None, List[List[int]]]:
+        """gather recv_dp_balance_id_this_term and holding tokens per worker for dp balance"""
+        recv_list = self.recv_dp_balance_id_this_term
+        assert len(recv_list) <= 511, (
+            "The number of requests received this round is too large. "
+            "Please increase gather_tensor_size and onfly_info_size."
+        )
+        # The maximum size of the tensor used for gathering data from all workers.
+        gather_tensor_size = 512
+
+        # recv_tensor: | holding_tokens | len(recv_dp_balance_id) | recv_dp_balance_ids
+        recv_tensor = torch.zeros(gather_tensor_size, dtype=torch.int32)
+        recv_tensor[0] = holding_tokens_list
+        recv_tensor[1] = len(recv_list)  # The first element is the length of the list.
+        recv_tensor[2 : len(recv_list) + 2] = torch.tensor(recv_list, dtype=torch.int32)
+
+        if self.tp_rank == 0:
+            gathered_list = [
+                torch.zeros(gather_tensor_size, dtype=torch.int32)
+                for _ in range(self.balance_meta.num_workers)
+            ]
+        else:
+            gathered_list = None
+
+        torch.distributed.gather(recv_tensor, gathered_list, group=self.tp_cpu_group)
+
+        gathered_id_list_per_worker = None
+        if self.tp_rank == 0:
+            gathered_id_list_per_worker = []
+            holding_tokens_list = []
+            for tensor in gathered_list:
+                holding_tokens_list.append(tensor[0].item())
+                list_length = tensor[1].item()
+                gathered_id_list_per_worker.append(tensor[2 : list_length + 2].tolist())
+
+        return gathered_id_list_per_worker, holding_tokens_list
+
+    def write_shared_dp_balance_info(self: Scheduler, new_recv_rid_lists, local_tokens):
+        meta = self.balance_meta
+
+        with meta.mutex:
+            onfly_list: List[Dict[int, int]] = meta.get_shared_onfly()
+            assert len(new_recv_rid_lists) == len(onfly_list), "num_worker not equal"
+            # 1.Check if the rid received by each worker this round is present in onfly.
+            #   If it is, remove the corresponding onfly item.
+            worker_id = 0
+            for new_recv_rids, on_fly_reqs in zip(new_recv_rid_lists, onfly_list):
+                for new_recv_rid in new_recv_rids:
+                    assert (
+                        new_recv_rid in on_fly_reqs
+                    ), f"{new_recv_rid=} not in {worker_id=} {on_fly_reqs=}, data consistency is wrong"
+                    del on_fly_reqs[new_recv_rid]
+                worker_id += 1
+            # 2. Atomically write local_tokens and onfly into shm under the mutex
+            meta.set_shared_onfly_info(onfly_list)
+            meta.set_shared_local_tokens(local_tokens)
diff --git a/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
new file mode 100644
index 000000000..d931759bb
--- /dev/null
+++ b/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -0,0 +1,732 @@
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.io_struct import AbortReq, BatchEmbeddingOut, BatchTokenIDOut
+from sglang.srt.managers.schedule_batch import BaseFinishReason, Req, ScheduleBatch
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import (
+        EmbeddingBatchResult,
+        GenerationBatchResult,
+        ScheduleBatch,
+        Scheduler,
+    )
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_FORCE_STREAM_INTERVAL = 50
+
+
+class SchedulerOutputProcessorMixin:
+    """
+    This class implements the output processing logic for Scheduler.
+    We put them into a separate file to make the `scheduler.py` shorter.
+    """
+
+    def process_batch_result_prefill(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: Union[GenerationBatchResult, EmbeddingBatchResult],
+        launch_done: Optional[threading.Event] = None,
+    ):
+        skip_stream_req = None
+
+        if self.is_generation:
+            (
+                logits_output,
+                next_token_ids,
+                extend_input_len_per_req,
+                extend_logprob_start_len_per_req,
+            ) = (
+                result.logits_output,
+                result.next_token_ids,
+                result.extend_input_len_per_req,
+                result.extend_logprob_start_len_per_req,
+            )
+
+            if self.enable_overlap:
+                logits_output, next_token_ids, _ = (
+                    self.tp_worker.resolve_last_batch_result(launch_done)
+                )
+            else:
+                # Move next_token_ids and logprobs to cpu
+                next_token_ids = next_token_ids.tolist()
+                if batch.return_logprob:
+                    if logits_output.next_token_logprobs is not None:
+                        logits_output.next_token_logprobs = (
+                            logits_output.next_token_logprobs.tolist()
+                        )
+                    if logits_output.input_token_logprobs is not None:
+                        logits_output.input_token_logprobs = tuple(
+                            logits_output.input_token_logprobs.tolist()
+                        )
+
+            hidden_state_offset = 0
+
+            # Check finish conditions
+            logprob_pt = 0
+            for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
+                if req.is_retracted:
+                    continue
+
+                if self.is_mixed_chunk and self.enable_overlap and req.finished():
+                    # Free the one delayed token for the mixed decode batch
+                    j = len(batch.out_cache_loc) - len(batch.reqs) + i
+                    self.token_to_kv_pool_allocator.free(batch.out_cache_loc[j : j + 1])
+                    continue
+
+                if req.is_chunked <= 0:
+                    # req output_ids are set here
+                    req.output_ids.append(next_token_id)
+                    req.check_finished()
+
+                    if req.finished():
+                        self.tree_cache.cache_finished_req(req)
+                        req.time_stats.completion_time = time.time()
+                    elif not batch.decoding_reqs or req not in batch.decoding_reqs:
+                        # This updates radix so others can match
+                        self.tree_cache.cache_unfinished_req(req)
+
+                    if batch.return_logprob:
+                        assert extend_logprob_start_len_per_req is not None
+                        assert extend_input_len_per_req is not None
+                        extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                        extend_input_len = extend_input_len_per_req[i]
+                        num_input_logprobs = extend_input_len - extend_logprob_start_len
+                        if req.return_logprob:
+                            self.add_logprob_return_values(
+                                i,
+                                req,
+                                logprob_pt,
+                                next_token_ids,
+                                num_input_logprobs,
+                                logits_output,
+                            )
+                        logprob_pt += num_input_logprobs
+
+                    if (
+                        req.return_hidden_states
+                        and logits_output.hidden_states is not None
+                    ):
+                        req.hidden_states.append(
+                            logits_output.hidden_states[
+                                hidden_state_offset : (
+                                    hidden_state_offset := hidden_state_offset
+                                    + len(req.origin_input_ids)
+                                )
+                            ]
+                            .cpu()
+                            .clone()
+                            .tolist()
+                        )
+
+                    if req.grammar is not None:
+                        # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                        try:
+                            req.grammar.accept_token(next_token_id)
+                        except ValueError as e:
+                            # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                            # This can happen if the grammar is not set correctly or the token is invalid.
+                            logger.error(
+                                f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
+                            )
+                            self.abort_request(AbortReq(req.rid))
+                        req.grammar.finished = req.finished()
+                else:
+                    # being chunked reqs' prefill is not finished
+                    req.is_chunked -= 1
+                    # There is only at most one request being currently chunked.
+                    # Because this request does not finish prefill,
+                    # we don't want to stream the request currently being chunked.
+                    skip_stream_req = req
+
+                    # Incrementally update input logprobs.
+                    if batch.return_logprob:
+                        extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                        extend_input_len = extend_input_len_per_req[i]
+                        if extend_logprob_start_len < extend_input_len:
+                            # Update input logprobs.
+                            num_input_logprobs = (
+                                extend_input_len - extend_logprob_start_len
+                            )
+                            if req.return_logprob:
+                                self.add_input_logprob_return_values(
+                                    i,
+                                    req,
+                                    logits_output,
+                                    logprob_pt,
+                                    num_input_logprobs,
+                                    last_prefill_chunk=False,
+                                )
+                            logprob_pt += num_input_logprobs
+
+            self.set_next_batch_sampling_info_done(batch)
+
+        else:  # embedding or reward model
+            embeddings, bid = result.embeddings, result.bid
+            embeddings = embeddings.tolist()
+
+            # Check finish conditions
+            for i, req in enumerate(batch.reqs):
+                if req.is_retracted:
+                    continue
+
+                req.embedding = embeddings[i]
+                if req.is_chunked <= 0:
+                    # Dummy output token for embedding models
+                    req.output_ids.append(0)
+                    req.check_finished()
+
+                    if req.finished():
+                        self.tree_cache.cache_finished_req(req)
+                    else:
+                        self.tree_cache.cache_unfinished_req(req)
+                else:
+                    # being chunked reqs' prefill is not finished
+                    req.is_chunked -= 1
+
+        self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)
+
+    def process_batch_result_decode(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: GenerationBatchResult,
+        launch_done: Optional[threading.Event] = None,
+    ):
+        logits_output, next_token_ids, can_run_cuda_graph = (
+            result.logits_output,
+            result.next_token_ids,
+            result.can_run_cuda_graph,
+        )
+        self.num_generated_tokens += len(batch.reqs)
+
+        if self.enable_overlap:
+            logits_output, next_token_ids, can_run_cuda_graph = (
+                self.tp_worker.resolve_last_batch_result(launch_done)
+            )
+            next_token_logprobs = logits_output.next_token_logprobs
+        elif batch.spec_algorithm.is_none():
+            # spec decoding handles output logprobs inside verify process.
+            next_token_ids = next_token_ids.tolist()
+            if batch.return_logprob:
+                next_token_logprobs = logits_output.next_token_logprobs.tolist()
+
+        self.token_to_kv_pool_allocator.free_group_begin()
+
+        # Check finish condition
+        # NOTE: the length of reqs and next_token_ids don't match if it is spec decoding.
+        # We should ignore using next_token_ids for spec decoding cases.
+        for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
+            if req.is_retracted:
+                continue
+
+            if self.enable_overlap and req.finished():
+                # Free the one extra delayed token
+                if self.page_size == 1:
+                    self.token_to_kv_pool_allocator.free(batch.out_cache_loc[i : i + 1])
+                else:
+                    # Only free when the extra token is in a new page
+                    if (
+                        len(req.origin_input_ids) + len(req.output_ids) - 1
+                    ) % self.page_size == 0:
+                        self.token_to_kv_pool_allocator.free(
+                            batch.out_cache_loc[i : i + 1]
+                        )
+                continue
+
+            if batch.spec_algorithm.is_none():
+                # speculative worker will solve the output_ids in speculative decoding
+                req.output_ids.append(next_token_id)
+
+            req.check_finished()
+            if req.finished():
+                self.tree_cache.cache_finished_req(req)
+                req.time_stats.completion_time = time.time()
+
+            if req.return_logprob and batch.spec_algorithm.is_none():
+                # speculative worker handles logprob in speculative decoding
+                req.output_token_logprobs_val.append(next_token_logprobs[i])
+                req.output_token_logprobs_idx.append(next_token_id)
+                if req.top_logprobs_num > 0:
+                    req.output_top_logprobs_val.append(
+                        logits_output.next_token_top_logprobs_val[i]
+                    )
+                    req.output_top_logprobs_idx.append(
+                        logits_output.next_token_top_logprobs_idx[i]
+                    )
+                if req.token_ids_logprob is not None:
+                    req.output_token_ids_logprobs_val.append(
+                        logits_output.next_token_token_ids_logprobs_val[i]
+                    )
+                    req.output_token_ids_logprobs_idx.append(
+                        logits_output.next_token_token_ids_logprobs_idx[i]
+                    )
+
+            if req.return_hidden_states and logits_output.hidden_states is not None:
+                req.hidden_states.append(
+                    logits_output.hidden_states[i].cpu().clone().tolist()
+                )
+
+            if req.grammar is not None and batch.spec_algorithm.is_none():
+                # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                try:
+                    req.grammar.accept_token(next_token_id)
+                except ValueError as e:
+                    # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                    # This can happen if the grammar is not set correctly or the token is invalid.
+                    logger.error(
+                        f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
+                    )
+                    self.abort_request(AbortReq(req.rid))
+                req.grammar.finished = req.finished()
+
+        self.set_next_batch_sampling_info_done(batch)
+        self.stream_output(batch.reqs, batch.return_logprob)
+        self.token_to_kv_pool_allocator.free_group_end()
+
+        self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
+        if (
+            self.current_scheduler_metrics_enabled()
+            and self.forward_ct_decode % self.server_args.decode_log_interval == 0
+        ):
+            self.log_decode_stats(can_run_cuda_graph, running_batch=batch)
+
+    def add_input_logprob_return_values(
+        self: Scheduler,
+        i: int,
+        req: Req,
+        output: LogitsProcessorOutput,
+        logprob_pt: int,
+        num_input_logprobs: int,
+        last_prefill_chunk: bool,  # If True, it means prefill is finished.
+    ):
+        """Incrementally add input logprobs to `req`.
+
+        Args:
+            i: The request index in a batch.
+            req: The request. Input logprobs inside req are modified as a
+                consequence of the API
+            fill_ids: The prefill ids processed.
+            output: Logit processor output that's used to compute input logprobs
+            last_prefill_chunk: True if it is the last prefill (when chunked).
+                Some of input logprob operation should only happen at the last
+                prefill (e.g., computing input token logprobs).
+        """
+        assert output.input_token_logprobs is not None
+        if req.input_token_logprobs is None:
+            req.input_token_logprobs = []
+        if req.temp_input_top_logprobs_val is None:
+            req.temp_input_top_logprobs_val = []
+        if req.temp_input_top_logprobs_idx is None:
+            req.temp_input_top_logprobs_idx = []
+        if req.temp_input_token_ids_logprobs_val is None:
+            req.temp_input_token_ids_logprobs_val = []
+        if req.temp_input_token_ids_logprobs_idx is None:
+            req.temp_input_token_ids_logprobs_idx = []
+
+        if req.input_token_logprobs_val is not None:
+            # The input logprob has been already computed. It only happens
+            # upon retract.
+            if req.top_logprobs_num > 0:
+                assert req.input_token_logprobs_val is not None
+            return
+
+        # Important for the performance.
+        assert isinstance(output.input_token_logprobs, tuple)
+        input_token_logprobs: Tuple[int] = output.input_token_logprobs
+        input_token_logprobs = input_token_logprobs[
+            logprob_pt : logprob_pt + num_input_logprobs
+        ]
+        req.input_token_logprobs.extend(input_token_logprobs)
+
+        if req.top_logprobs_num > 0:
+            req.temp_input_top_logprobs_val.append(output.input_top_logprobs_val[i])
+            req.temp_input_top_logprobs_idx.append(output.input_top_logprobs_idx[i])
+
+        if req.token_ids_logprob is not None:
+            req.temp_input_token_ids_logprobs_val.append(
+                output.input_token_ids_logprobs_val[i]
+            )
+            req.temp_input_token_ids_logprobs_idx.append(
+                output.input_token_ids_logprobs_idx[i]
+            )
+
+        if last_prefill_chunk:
+            input_token_logprobs = req.input_token_logprobs
+            req.input_token_logprobs = None
+            assert req.input_token_logprobs_val is None
+            assert req.input_token_logprobs_idx is None
+            assert req.input_top_logprobs_val is None
+            assert req.input_top_logprobs_idx is None
+
+            # Compute input_token_logprobs_val
+            # Always pad the first one with None.
+            req.input_token_logprobs_val = [None]
+            req.input_token_logprobs_val.extend(input_token_logprobs)
+            # The last input logprob is for sampling, so just pop it out.
+            req.input_token_logprobs_val.pop()
+
+            # Compute input_token_logprobs_idx
+            input_token_logprobs_idx = req.origin_input_ids[req.logprob_start_len :]
+            # Clip the padded hash values from image tokens.
+            # Otherwise, it will lead to detokenization errors.
+            input_token_logprobs_idx = [
+                x if x < self.model_config.vocab_size - 1 else 0
+                for x in input_token_logprobs_idx
+            ]
+            req.input_token_logprobs_idx = input_token_logprobs_idx
+
+            if req.top_logprobs_num > 0:
+                req.input_top_logprobs_val = [None]
+                req.input_top_logprobs_idx = [None]
+                assert len(req.temp_input_token_ids_logprobs_val) == len(
+                    req.temp_input_token_ids_logprobs_idx
+                )
+                for val, idx in zip(
+                    req.temp_input_top_logprobs_val,
+                    req.temp_input_top_logprobs_idx,
+                    strict=True,
+                ):
+                    req.input_top_logprobs_val.extend(val)
+                    req.input_top_logprobs_idx.extend(idx)
+
+                # Last token is a sample token.
+                req.input_top_logprobs_val.pop()
+                req.input_top_logprobs_idx.pop()
+                req.temp_input_top_logprobs_idx = None
+                req.temp_input_top_logprobs_val = None
+
+            if req.token_ids_logprob is not None:
+                req.input_token_ids_logprobs_val = [None]
+                req.input_token_ids_logprobs_idx = [None]
+
+                for val, idx in zip(
+                    req.temp_input_token_ids_logprobs_val,
+                    req.temp_input_token_ids_logprobs_idx,
+                    strict=True,
+                ):
+                    req.input_token_ids_logprobs_val.extend(val)
+                    req.input_token_ids_logprobs_idx.extend(idx)
+
+                # Last token is a sample token.
+                req.input_token_ids_logprobs_val.pop()
+                req.input_token_ids_logprobs_idx.pop()
+                req.temp_input_token_ids_logprobs_idx = None
+                req.temp_input_token_ids_logprobs_val = None
+
+            if req.return_logprob:
+                relevant_tokens_len = len(req.origin_input_ids) - req.logprob_start_len
+                assert len(req.input_token_logprobs_val) == relevant_tokens_len
+                assert len(req.input_token_logprobs_idx) == relevant_tokens_len
+                if req.top_logprobs_num > 0:
+                    assert len(req.input_top_logprobs_val) == relevant_tokens_len
+                    assert len(req.input_top_logprobs_idx) == relevant_tokens_len
+                if req.token_ids_logprob is not None:
+                    assert len(req.input_token_ids_logprobs_val) == relevant_tokens_len
+                    assert len(req.input_token_ids_logprobs_idx) == relevant_tokens_len
+
+    def add_logprob_return_values(
+        self: Scheduler,
+        i: int,
+        req: Req,
+        pt: int,
+        next_token_ids: List[int],
+        num_input_logprobs: int,
+        output: LogitsProcessorOutput,
+    ):
+        """Attach logprobs to the return values."""
+        req.output_token_logprobs_val.append(output.next_token_logprobs[i])
+        req.output_token_logprobs_idx.append(next_token_ids[i])
+
+        self.add_input_logprob_return_values(
+            i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
+        )
+
+        if req.top_logprobs_num > 0:
+            req.output_top_logprobs_val.append(output.next_token_top_logprobs_val[i])
+            req.output_top_logprobs_idx.append(output.next_token_top_logprobs_idx[i])
+
+        if req.token_ids_logprob is not None:
+            req.output_token_ids_logprobs_val.append(
+                output.next_token_token_ids_logprobs_val[i]
+            )
+            req.output_token_ids_logprobs_idx.append(
+                output.next_token_token_ids_logprobs_idx[i]
+            )
+
+        return num_input_logprobs
+
+    def stream_output(
+        self: Scheduler,
+        reqs: List[Req],
+        return_logprob: bool,
+        skip_req: Optional[Req] = None,
+    ):
+        """Stream the output to detokenizer."""
+        if self.is_generation:
+            self.stream_output_generation(reqs, return_logprob, skip_req)
+        else:  # embedding or reward model
+            self.stream_output_embedding(reqs)
+
+    def stream_output_generation(
+        self: Scheduler,
+        reqs: List[Req],
+        return_logprob: bool,
+        skip_req: Optional[Req] = None,
+    ):
+        rids = []
+        finished_reasons: List[BaseFinishReason] = []
+
+        decoded_texts = []
+        decode_ids_list = []
+        read_offsets = []
+        output_ids = []
+
+        skip_special_tokens = []
+        spaces_between_special_tokens = []
+        no_stop_trim = []
+        prompt_tokens = []
+        completion_tokens = []
+        cached_tokens = []
+        spec_verify_ct = []
+        output_hidden_states = None
+
+        if return_logprob:
+            input_token_logprobs_val = []
+            input_token_logprobs_idx = []
+            output_token_logprobs_val = []
+            output_token_logprobs_idx = []
+            input_top_logprobs_val = []
+            input_top_logprobs_idx = []
+            output_top_logprobs_val = []
+            output_top_logprobs_idx = []
+            input_token_ids_logprobs_val = []
+            input_token_ids_logprobs_idx = []
+            output_token_ids_logprobs_val = []
+            output_token_ids_logprobs_idx = []
+        else:
+            input_token_logprobs_val = input_token_logprobs_idx = (
+                output_token_logprobs_val
+            ) = output_token_logprobs_idx = input_top_logprobs_val = (
+                input_top_logprobs_idx
+            ) = output_top_logprobs_val = output_top_logprobs_idx = (
+                input_token_ids_logprobs_val
+            ) = input_token_ids_logprobs_idx = output_token_ids_logprobs_val = (
+                output_token_ids_logprobs_idx
+            ) = None
+
+        for req in reqs:
+            if req is skip_req:
+                continue
+
+            # Multimodal partial stream chunks break the detokenizer, so drop aborted requests here.
+            if self.model_config.is_multimodal_gen and req.to_abort:
+                continue
+
+            if req.finished():
+                if req.finished_output:
+                    # With the overlap schedule, a request will try to output twice and hit this line twice
+                    # because of the one additional delayed token. This "continue" prevented the dummy output.
+                    continue
+                req.finished_output = True
+                should_output = True
+            else:
+                if req.stream:
+                    stream_interval = (
+                        req.sampling_params.stream_interval or self.stream_interval
+                    )
+                    should_output = (
+                        len(req.output_ids) % stream_interval == 1
+                        if not self.model_config.is_multimodal_gen
+                        and stream_interval > 1
+                        else len(req.output_ids) % stream_interval == 0
+                    )
+                else:
+                    should_output = (
+                        len(req.output_ids) % DEFAULT_FORCE_STREAM_INTERVAL == 0
+                        if not self.model_config.is_multimodal_gen
+                        else False
+                    )
+
+            if should_output:
+                send_token_offset = req.send_token_offset
+                send_output_token_logprobs_offset = (
+                    req.send_output_token_logprobs_offset
+                )
+                rids.append(req.rid)
+                finished_reasons.append(
+                    req.finished_reason.to_json() if req.finished_reason else None
+                )
+                decoded_texts.append(req.decoded_text)
+                decode_ids, read_offset = req.init_incremental_detokenize()
+
+                if self.model_config.is_multimodal_gen:
+                    decode_ids_list.append(decode_ids)
+                else:
+                    decode_ids_list.append(decode_ids[req.send_decode_id_offset :])
+
+                req.send_decode_id_offset = len(decode_ids)
+                read_offsets.append(read_offset)
+                output_ids.append(req.output_ids[send_token_offset:])
+                req.send_token_offset = len(req.output_ids)
+                skip_special_tokens.append(req.sampling_params.skip_special_tokens)
+                spaces_between_special_tokens.append(
+                    req.sampling_params.spaces_between_special_tokens
+                )
+                no_stop_trim.append(req.sampling_params.no_stop_trim)
+                prompt_tokens.append(len(req.origin_input_ids))
+                completion_tokens.append(len(req.output_ids))
+                cached_tokens.append(req.cached_tokens)
+
+                if not self.spec_algorithm.is_none():
+                    spec_verify_ct.append(req.spec_verify_ct)
+
+                if return_logprob:
+                    if (
+                        req.return_logprob
+                        and not req.input_logprob_sent
+                        # Decode server does not send input logprobs
+                        and self.disaggregation_mode != DisaggregationMode.DECODE
+                    ):
+                        input_token_logprobs_val.append(req.input_token_logprobs_val)
+                        input_token_logprobs_idx.append(req.input_token_logprobs_idx)
+                        input_top_logprobs_val.append(req.input_top_logprobs_val)
+                        input_top_logprobs_idx.append(req.input_top_logprobs_idx)
+                        input_token_ids_logprobs_val.append(
+                            req.input_token_ids_logprobs_val
+                        )
+                        input_token_ids_logprobs_idx.append(
+                            req.input_token_ids_logprobs_idx
+                        )
+                        req.input_logprob_sent = True
+                    else:
+                        input_token_logprobs_val.append([])
+                        input_token_logprobs_idx.append([])
+                        input_top_logprobs_val.append([])
+                        input_top_logprobs_idx.append([])
+                        input_token_ids_logprobs_val.append([])
+                        input_token_ids_logprobs_idx.append([])
+
+                    if req.return_logprob:
+                        output_token_logprobs_val.append(
+                            req.output_token_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_logprobs_idx.append(
+                            req.output_token_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_top_logprobs_val.append(
+                            req.output_top_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_top_logprobs_idx.append(
+                            req.output_top_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_ids_logprobs_val.append(
+                            req.output_token_ids_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_ids_logprobs_idx.append(
+                            req.output_token_ids_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        req.send_output_token_logprobs_offset = len(
+                            req.output_token_logprobs_val
+                        )
+                    else:
+                        output_token_logprobs_val.append([])
+                        output_token_logprobs_idx.append([])
+                        output_top_logprobs_val.append([])
+                        output_top_logprobs_idx.append([])
+                        output_token_ids_logprobs_val.append([])
+                        output_token_ids_logprobs_idx.append([])
+
+                if req.return_hidden_states:
+                    if output_hidden_states is None:
+                        output_hidden_states = []
+                    output_hidden_states.append(req.hidden_states)
+
+            if (
+                req.finished()
+                and self.tp_rank == 0
+                and self.server_args.enable_request_time_stats_logging
+            ):
+                req.log_time_stats()
+
+        # Send to detokenizer
+        if rids:
+            if self.model_config.is_multimodal_gen:
+                return
+
+            self.send_to_detokenizer.send_pyobj(
+                BatchTokenIDOut(
+                    rids,
+                    finished_reasons,
+                    decoded_texts,
+                    decode_ids_list,
+                    read_offsets,
+                    output_ids,
+                    skip_special_tokens,
+                    spaces_between_special_tokens,
+                    no_stop_trim,
+                    prompt_tokens,
+                    completion_tokens,
+                    cached_tokens,
+                    spec_verify_ct,
+                    input_token_logprobs_val,
+                    input_token_logprobs_idx,
+                    output_token_logprobs_val,
+                    output_token_logprobs_idx,
+                    input_top_logprobs_val,
+                    input_top_logprobs_idx,
+                    output_top_logprobs_val,
+                    output_top_logprobs_idx,
+                    input_token_ids_logprobs_val,
+                    input_token_ids_logprobs_idx,
+                    output_token_ids_logprobs_val,
+                    output_token_ids_logprobs_idx,
+                    output_hidden_states,
+                    placeholder_tokens_idx=None,
+                    placeholder_tokens_val=None,
+                )
+            )
+
+    def stream_output_embedding(self: Scheduler, reqs: List[Req]):
+        rids = []
+        finished_reasons: List[BaseFinishReason] = []
+
+        embeddings = []
+        prompt_tokens = []
+        cached_tokens = []
+        for req in reqs:
+            if req.finished():
+                rids.append(req.rid)
+                finished_reasons.append(req.finished_reason.to_json())
+                embeddings.append(req.embedding)
+                prompt_tokens.append(len(req.origin_input_ids))
+                cached_tokens.append(req.cached_tokens)
+        self.send_to_detokenizer.send_pyobj(
+            BatchEmbeddingOut(
+                rids,
+                finished_reasons,
+                embeddings,
+                prompt_tokens,
+                cached_tokens,
+                placeholder_tokens_idx=None,
+                placeholder_tokens_val=None,
+            )
+        )
diff --git a/python/sglang/srt/managers/scheduler_profiler_mixin.py b/python/sglang/srt/managers/scheduler_profiler_mixin.py
new file mode 100644
index 000000000..116a8d842
--- /dev/null
+++ b/python/sglang/srt/managers/scheduler_profiler_mixin.py
@@ -0,0 +1,299 @@
+import logging
+import os
+import time
+from pathlib import Path
+from typing import List, Optional
+
+import torch
+
+from sglang.srt.managers.io_struct import ProfileReq, ProfileReqOutput, ProfileReqType
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.utils import is_npu
+
+_is_npu = is_npu()
+if _is_npu:
+    import torch_npu
+
+    patches = [
+        ["profiler.profile", torch_npu.profiler.profile],
+        ["profiler.ProfilerActivity.CUDA", torch_npu.profiler.ProfilerActivity.NPU],
+        ["profiler.ProfilerActivity.CPU", torch_npu.profiler.ProfilerActivity.CPU],
+    ]
+    torch_npu._apply_patches(patches)
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerProfilerMixin:
+
+    def init_profiler(self):
+        self.torch_profiler = None
+        self.torch_profiler_output_dir: Optional[str] = None
+        self.profiler_activities: Optional[List[str]] = None
+        self.profile_id: Optional[str] = None
+        self.profiler_start_forward_ct: Optional[int] = None
+        self.profiler_target_forward_ct: Optional[int] = None
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profile_by_stage: bool = False
+        self.profile_steps: Optional[int] = None
+        self.profile_in_progress: bool = False
+        self.rpd_profiler = None
+
+    def init_profile(
+        self,
+        output_dir: Optional[str],
+        start_step: Optional[int],
+        num_steps: Optional[int],
+        activities: Optional[List[str]],
+        with_stack: Optional[bool],
+        record_shapes: Optional[bool],
+        profile_by_stage: bool,
+        profile_id: str,
+    ) -> ProfileReqOutput:
+        if self.profile_in_progress:
+            return ProfileReqOutput(
+                success=False,
+                message="Profiling is already in progress. Call /stop_profile first.",
+            )
+
+        self.profile_by_stage = profile_by_stage
+
+        if output_dir is None:
+            output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
+        if activities is None:
+            activities = ["CPU", "GPU"]
+
+        self.torch_profiler_output_dir = output_dir
+        self.torch_profiler_with_stack = with_stack
+        self.torch_profiler_record_shapes = record_shapes
+        self.profiler_activities = activities
+        self.profile_id = profile_id
+
+        if start_step:
+            self.profiler_start_forward_ct = max(start_step, self.forward_ct + 1)
+
+        if num_steps:
+            self.profile_steps = num_steps
+            if self.profile_by_stage:
+                self.profiler_target_prefill_ct = num_steps
+                self.profiler_target_decode_ct = num_steps
+                self.profiler_prefill_ct = 0
+                self.profiler_decode_ct = 0
+            elif start_step:
+                self.profiler_target_forward_ct = (
+                    self.profiler_start_forward_ct + num_steps
+                )
+            else:
+                self.profiler_target_forward_ct = self.forward_ct + num_steps
+            # The caller will be notified when reaching profiler_target_forward_ct
+        else:
+            self.profiler_target_forward_ct = None
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def start_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        stage_str = f" for {stage.__str__()}" if stage else ""
+        logger.info(
+            f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
+        )
+
+        activities = self.profiler_activities
+        with_stack = self.torch_profiler_with_stack
+        record_shapes = self.torch_profiler_record_shapes
+
+        activity_map = {
+            "CPU": torch.profiler.ProfilerActivity.CPU,
+            "GPU": torch.profiler.ProfilerActivity.CUDA,
+        }
+        torchprof_activities = [
+            activity_map[a] for a in activities if a in activity_map
+        ]
+
+        if "RPD" in activities:
+            from rpdTracerControl import rpdTracerControl
+
+            rpdTracerControl.skipCreate()
+
+            self.rpd_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+            )
+
+            if self.tp_rank == 0:
+                import sqlite3
+
+                from rocpd.schema import RocpdSchema
+
+                if os.path.exists("trace.rpd"):
+                    os.unlink("trace.rpd")
+                schema = RocpdSchema()
+                connection = sqlite3.connect("trace.rpd")
+                schema.writeSchema(connection)
+                connection.commit()
+                del connection
+            torch.distributed.barrier(self.tp_cpu_group)
+
+            self.rpd_profiler = rpdTracerControl()
+            self.rpd_profiler.setPythonTrace(True)
+            self.rpd_profiler.start()
+            self.rpd_profiler.rangePush("", "rpd profile range", "")
+            self.profile_in_progress = True
+        elif torchprof_activities:
+            self.torch_profiler = torch.profiler.profile(
+                activities=torchprof_activities,
+                with_stack=with_stack if with_stack is not None else True,
+                record_shapes=record_shapes if record_shapes is not None else False,
+                on_trace_ready=(
+                    None
+                    if not _is_npu
+                    else torch_npu.profiler.tensorboard_trace_handler(
+                        self.torch_profiler_output_dir
+                    )
+                ),
+            )
+            self.torch_profiler.start()
+            self.profile_in_progress = True
+
+        if "MEM" in activities:
+            torch.cuda.memory._record_memory_history(max_entries=100000)
+            self.profile_in_progress = True
+
+        if "CUDA_PROFILER" in activities:
+            torch.cuda.cudart().cudaProfilerStart()
+            self.profile_in_progress = True
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def stop_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        if not self.profile_in_progress:
+            return ProfileReqOutput(
+                success=False,
+                message="Profiling is not in progress. Call /start_profile first.",
+            )
+
+        if not Path(self.torch_profiler_output_dir).exists():
+            Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
+
+        stage_suffix = f"-{stage.__str__()}" if stage else ""
+        logger.info("Stop profiling" + stage_suffix + "...")
+        if self.torch_profiler is not None:
+            self.torch_profiler.stop()
+            if not _is_npu:
+                self.torch_profiler.export_chrome_trace(
+                    os.path.join(
+                        self.torch_profiler_output_dir,
+                        self.profile_id
+                        + f"-TP-{self.tp_rank}"
+                        + stage_suffix
+                        + ".trace.json.gz",
+                    )
+                )
+            torch.distributed.barrier(self.tp_cpu_group)
+
+        if self.rpd_profiler is not None:
+            self.rpd_profiler.rangePop()
+            self.rpd_profiler.stop()
+            self.rpd_profiler.flush()
+
+            torch.distributed.barrier(self.tp_cpu_group)
+            if self.tp_rank == 0:
+                from sglang.srt.utils import rpd_to_chrome_trace
+
+                rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
+            self.rpd_profiler = None
+            self.rpd_profiler_path = None
+
+        if self.profiler_activities is not None and "MEM" in self.profiler_activities:
+            memory_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                str(time.time())
+                + f"-TP-{self.tp_rank}-memory"
+                + stage_suffix
+                + ".pickle",
+            )
+            torch.cuda.memory._dump_snapshot(memory_profile_path)
+            torch.cuda.memory._record_memory_history(enabled=None)
+
+        if "CUDA_PROFILER" in self.profiler_activities:
+            torch.cuda.cudart().cudaProfilerStop()
+
+        logger.info(
+            "Profiling done. Traces are saved to: %s",
+            self.torch_profiler_output_dir,
+        )
+        self.torch_profiler = None
+        self.profile_in_progress = False
+        self.profiler_start_forward_ct = None
+
+        return ProfileReqOutput(success=True, message="Succeeded.")
+
+    def _profile_batch_predicate(self, batch):
+        if self.profile_by_stage:
+            if batch.forward_mode.is_prefill():
+                if self.profiler_prefill_ct == 0:
+                    self.start_profile(batch.forward_mode)
+                self.profiler_prefill_ct += 1
+                if self.profiler_prefill_ct > self.profiler_target_prefill_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.EXTEND)
+            elif batch.forward_mode.is_decode():
+                if self.profiler_decode_ct == 0:
+                    if self.profile_in_progress:
+                        # force trace flush
+                        self.stop_profile(ForwardMode.EXTEND)
+                    self.start_profile(batch.forward_mode)
+                self.profiler_decode_ct += 1
+                if self.profiler_decode_ct > self.profiler_target_decode_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.DECODE)
+            elif batch.forward_mode.is_idle():
+                pass
+            else:
+                raise RuntimeError(f"unsupported profile stage: {batch.forward_mode}")
+        else:
+            # Check profiler
+            if (
+                self.profiler_target_forward_ct
+                and self.profiler_target_forward_ct <= self.forward_ct
+            ):
+                self.stop_profile()
+            if (
+                self.profiler_start_forward_ct
+                and self.profiler_start_forward_ct == self.forward_ct
+            ):
+                self.start_profile()
+
+    def profile(self, recv_req: ProfileReq):
+        if recv_req.type == ProfileReqType.START_PROFILE:
+            if recv_req.profile_by_stage or recv_req.start_step:
+                return self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.start_step,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+            else:
+                self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.start_step,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+                return self.start_profile(True)
+        else:
+            return self.stop_profile()
diff --git a/python/sglang/srt/managers/scheduler_recv_skipper.py b/python/sglang/srt/managers/scheduler_recv_skipper.py
new file mode 100644
index 000000000..f0550c935
--- /dev/null
+++ b/python/sglang/srt/managers/scheduler_recv_skipper.py
@@ -0,0 +1,37 @@
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.server_args import ServerArgs
+
+
+class SchedulerRecvSkipper:
+    @staticmethod
+    def maybe_create(server_args: ServerArgs):
+        if server_args.scheduler_recv_interval <= 1:
+            return None
+        return SchedulerRecvSkipper(server_args)
+
+    def __init__(self, server_args: ServerArgs):
+        # Can be supported if needed, but may need e.g. `global_forward_mode`
+        assert not server_args.enable_dp_attention
+        self._counter = 0
+        self._threshold = server_args.scheduler_recv_interval
+
+    def handle(self, last_forward_mode: ForwardMode):
+        should_recv = False
+
+        last_weight = _WEIGHT_OF_FORWARD_MODE.get(last_forward_mode, _DEFAULT_WEIGHT)
+        self._counter += last_weight
+
+        if self._counter >= self._threshold:
+            self._counter = 0
+            should_recv = True
+
+        return should_recv
+
+
+# All can be tuned if needed
+_DEFAULT_WEIGHT = 1000
+_WEIGHT_OF_FORWARD_MODE = {
+    ForwardMode.DECODE: 1,
+    ForwardMode.TARGET_VERIFY: 1,
+    None: 1,
+}
diff --git a/python/sglang/srt/managers/scheduler_update_weights_mixin.py b/python/sglang/srt/managers/scheduler_update_weights_mixin.py
new file mode 100644
index 000000000..fdae2142c
--- /dev/null
+++ b/python/sglang/srt/managers/scheduler_update_weights_mixin.py
@@ -0,0 +1,155 @@
+import logging
+from typing import Tuple
+
+import torch
+
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGHTS
+from sglang.srt.managers.io_struct import (
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerUpdateWeightsMixin:
+
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        """In-place update of the weights from disk."""
+        success, message = self.tp_worker.update_weights_from_disk(recv_req)
+        if success:
+            flush_cache_success = self.flush_cache()
+            assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightFromDiskReqOutput(success, message, 0)
+
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        """Initialize the online model parameter update group."""
+        success, message = self.tp_worker.init_weights_update_group(recv_req)
+        return InitWeightsUpdateGroupReqOutput(success, message)
+
+    def update_weights_from_distributed(
+        self,
+        recv_req: UpdateWeightsFromDistributedReqInput,
+    ) -> Tuple[bool, str]:
+        """Update the online model parameter."""
+        success, message = self.tp_worker.update_weights_from_distributed(recv_req)
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightsFromDistributedReqOutput(success, message)
+
+    def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
+        """Update the online model parameter from tensors."""
+        success, message = self.tp_worker.update_weights_from_tensor(recv_req)
+        # TODO extract common code b/t update_weights_from_distributed and update_weights_from_tensor later
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        torch.distributed.barrier(group=self.tp_cpu_group)
+        return UpdateWeightsFromTensorReqOutput(success, message)
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.tp_worker.get_weights_by_name(recv_req)
+        return GetWeightsByNameReqOutput(parameter)
+
+    def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput):
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+
+        for tag in tags:
+            self.offload_tags.add(tag)
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE)
+            self.flush_cache()
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.stashed_model_static_state = _export_static_state(
+                self.tp_worker.worker.model_runner.model
+            )
+            torch.distributed.barrier(self.tp_cpu_group)
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
+
+        return ReleaseMemoryOccupationReqOutput()
+
+    def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput):
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+
+        for tag in tags:
+            self.offload_tags.remove(tag)
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
+            torch.distributed.barrier(self.tp_cpu_group)
+            _import_static_state(
+                self.tp_worker.worker.model_runner.model,
+                self.stashed_model_static_state,
+            )
+            del self.stashed_model_static_state
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE)
+
+        return ResumeMemoryOccupationReqOutput()
+
+    def save_remote_model(self, params):
+        url = params["url"]
+
+        worker = self.tp_worker.worker
+        worker.model_runner.save_remote_model(url)
+
+        if self.draft_worker is not None:
+            draft_url = params.get("draft_url", None)
+            assert (
+                draft_url is not None
+            ), "draft_url must be provided when draft model is enabled"
+            draft_worker = self.draft_worker.worker
+            draft_worker.model_runner.save_remote_model(draft_url)
+
+    def save_sharded_model(self, params):
+        worker = self.tp_worker.worker
+
+        worker.model_runner.save_sharded_model(
+            path=params["path"],
+            pattern=params["pattern"],
+            max_size=params["max_size"],
+        )
+
+
+def _export_static_state(model):
+    return dict(
+        buffers=[
+            (name, buffer.detach().clone()) for name, buffer in model.named_buffers()
+        ]
+    )
+
+
+def _import_static_state(model, static_params):
+    self_named_buffers = dict(model.named_buffers())
+    for name, tensor in static_params["buffers"]:
+        self_named_buffers[name][...] = tensor
diff --git a/python/sglang/srt/managers/session_controller.py b/python/sglang/srt/managers/session_controller.py
new file mode 100644
index 000000000..5f041beb0
--- /dev/null
+++ b/python/sglang/srt/managers/session_controller.py
@@ -0,0 +1,158 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import logging
+import uuid
+from typing import Dict, Optional
+
+from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
+from sglang.srt.managers.schedule_batch import Req
+
+
+class SessionReqNode:
+    def __init__(self, req, parent=None, childs=None):
+        self.req = req
+        self.parent = parent
+        if parent is not None:
+            parent.childs.append(self)
+        self.childs = [] if not childs else childs
+
+    def clear_childs(self, req_dict):
+        for req_node in self.childs:
+            req_node.clear(req_dict)
+        self.childs = []
+
+    def clear(self, req_dict):
+        for req_node in self.childs:
+            req_node.clear(req_dict)
+
+        if self.req.finished_reason is None:
+            self.req.to_abort = True
+        del req_dict[self.req.rid]
+
+    def abort(self):
+        if self.req.finished_reason is None:
+            self.req.to_abort = True
+
+    def __str__(self):
+        return self._str_helper(self.req.rid)
+
+    def _str_helper(self, prefix=""):
+        if len(self.childs) == 0:
+            return prefix + "\n"
+        else:
+            origin_prefix = prefix
+            prefix += " -- " + self.childs[0].req.rid
+            ret = self.childs[0]._str_helper(prefix)
+            for child in self.childs[1:]:
+                prefix = " " * len(origin_prefix) + " \\- " + child.req.rid
+                ret += child._str_helper(prefix)
+            return ret
+
+
+class Session:
+    def __init__(self, capacity_of_str_len: int, session_id: Optional[str] = None):
+        self.session_id = session_id if session_id is not None else uuid.uuid4().hex
+        self.capacity_of_str_len = capacity_of_str_len
+        self.req_nodes: Dict[str, SessionReqNode] = {}
+
+    def create_req(self, req: TokenizedGenerateReqInput, tokenizer):
+        assert req.session_params is not None
+        session_params = req.session_params
+
+        last_req_node = None
+        last_req = None
+        abort = False
+        if session_params.replace:
+            if session_params.rid is None:
+                for _, req_node in self.req_nodes.items():
+                    req_node.clear(self.req_nodes)
+            else:
+                if session_params.rid not in self.req_nodes:
+                    abort = True
+                else:
+                    last_req_node = self.req_nodes[session_params.rid]
+                    last_req_node.abort()
+                    last_req = last_req_node.req
+                    last_req_node.clear_childs(self.req_nodes)
+        else:
+            if session_params.rid is not None:
+                if session_params.rid not in self.req_nodes:
+                    abort = True
+                else:
+                    last_req_node = self.req_nodes[session_params.rid]
+                    last_req = last_req_node.req
+                    if not last_req.finished():
+                        logging.warning(
+                            "The request in a session is appending to a request that hasn't finished."
+                        )
+                        abort = True
+
+        if last_req is not None:
+            # trim bos token if it is an append
+            if tokenizer is not None and req.input_ids[0] == tokenizer.bos_token_id:
+                req.input_ids = req.input_ids[1:]
+
+            input_ids = (
+                last_req.origin_input_ids
+                + last_req.output_ids[: last_req.sampling_params.max_new_tokens]
+            )
+
+            if session_params.drop_previous_output:
+                input_ids = last_req.origin_input_ids[:]
+
+            if session_params.offset and session_params.offset != 0:
+                input_ids = input_ids[: session_params.offset] + req.input_ids
+            else:
+                input_ids += req.input_ids
+
+            input_ids_unpadded = (
+                last_req.origin_input_ids_unpadded
+                + last_req.output_ids[: last_req.sampling_params.max_new_tokens]
+            )
+            if session_params.drop_previous_output:
+                input_ids_unpadded = last_req.origin_input_ids_unpadded[:]
+
+            if session_params.offset and session_params.offset != 0:
+                input_ids_unpadded = (
+                    input_ids_unpadded[: session_params.offset] + req.input_ids
+                )
+            else:
+                input_ids_unpadded += req.input_ids
+        else:
+            input_ids = req.input_ids
+            input_ids_unpadded = req.input_ids
+        new_req = Req(
+            rid=req.rid,
+            origin_input_text=None,
+            origin_input_ids=input_ids,
+            origin_input_ids_unpadded=input_ids_unpadded,
+            sampling_params=req.sampling_params,
+            lora_path=req.lora_path,
+            session_id=self.session_id,
+            custom_logit_processor=req.custom_logit_processor,
+            stream=req.stream,
+            return_logprob=req.return_logprob,
+            top_logprobs_num=req.top_logprobs_num,
+            token_ids_logprob=req.token_ids_logprob,
+        )
+        if last_req is not None:
+            new_req.multimodal_inputs = last_req.multimodal_inputs
+        new_req.tokenizer = tokenizer
+
+        if abort:
+            new_req.set_finish_with_abort("Invalid request session id")
+        else:
+            new_req_node = SessionReqNode(new_req, last_req_node)
+            self.req_nodes[req.rid] = new_req_node
+
+        return new_req
diff --git a/python/sglang/srt/managers/template_manager.py b/python/sglang/srt/managers/template_manager.py
new file mode 100644
index 000000000..1d9bbea81
--- /dev/null
+++ b/python/sglang/srt/managers/template_manager.py
@@ -0,0 +1,308 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Centralized template management for chat templates and completion templates.
+
+This module provides a unified interface for managing both chat conversation templates
+and code completion templates, eliminating global state and improving modularity.
+"""
+
+import json
+import logging
+import os
+import re
+from typing import Optional
+
+from sglang.srt.parser.code_completion_parser import (
+    CompletionTemplate,
+    FimPosition,
+    completion_template_exists,
+    register_completion_template,
+)
+from sglang.srt.parser.conversation import (
+    Conversation,
+    SeparatorStyle,
+    chat_template_exists,
+    get_conv_template_by_model_path,
+    register_conv_template,
+)
+from sglang.srt.parser.jinja_template_utils import detect_jinja_template_content_format
+
+logger = logging.getLogger(__name__)
+
+
+class TemplateManager:
+    """
+    Centralized manager for chat and completion templates.
+
+    This class encapsulates all template-related state and operations,
+    eliminating the need for global variables and providing a clean
+    interface for template management.
+    """
+
+    def __init__(self):
+        self._chat_template_name: Optional[str] = None
+        self._completion_template_name: Optional[str] = None
+        self._jinja_template_content_format: Optional[str] = "openai"
+        self._force_reasoning: bool = False
+
+    @property
+    def chat_template_name(self) -> Optional[str]:
+        """Get the current chat template name."""
+        return self._chat_template_name
+
+    @property
+    def completion_template_name(self) -> Optional[str]:
+        """Get the current completion template name."""
+        return self._completion_template_name
+
+    @property
+    def jinja_template_content_format(self) -> Optional[str]:
+        """Get the detected template content format ('string' or 'openai' or None)."""
+        return self._jinja_template_content_format
+
+    @property
+    def force_reasoning(self) -> bool:
+        """
+        Check if the current chat template enforces reasoning/thinking.
+
+        Returns:
+            True if the template contains reasoning patterns like <think> tags
+        """
+        return self._force_reasoning
+
+    def _detect_reasoning_pattern(self, template: str) -> bool:
+        """
+        Detect if the chat template contains reasoning/thinking patterns.
+        """
+        if template is None:
+            return False
+
+        # TODO: remove this hard code the reasoning pattern
+        force_reasoning_pattern = r"<\|im_start\|>assistant\\n<think>\\n"
+        has_reasoning = re.search(force_reasoning_pattern, template) is not None
+
+        if has_reasoning:
+            logger.info("Detected the force reasoning pattern in chat template.")
+
+        return has_reasoning
+
+    def load_chat_template(
+        self, tokenizer_manager, chat_template_arg: Optional[str], model_path: str
+    ) -> None:
+        """
+        Load a chat template from various sources.
+
+        Args:
+            tokenizer_manager: The tokenizer manager instance
+            chat_template_arg: Template name, file path, or None to auto-detect
+            model_path: Path to the model
+        """
+        if chat_template_arg:
+            self._load_explicit_chat_template(tokenizer_manager, chat_template_arg)
+        else:
+            # Guess chat template from model path
+            self.guess_chat_template_from_model_path(model_path)
+
+            # If no pre-defined template was found, fallback to HuggingFace template
+            if self._chat_template_name is None:
+                # Try HuggingFace template first
+                hf_template = self._resolve_hf_chat_template(tokenizer_manager)
+                if hf_template:
+                    # override the chat template
+                    if tokenizer_manager.tokenizer:
+                        tokenizer_manager.tokenizer.chat_template = hf_template
+                    self._jinja_template_content_format = (
+                        detect_jinja_template_content_format(hf_template)
+                    )
+                    logger.info(
+                        f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
+                    )
+                else:
+                    # Default to string content format if no template was found
+                    self._jinja_template_content_format = "string"
+                    logger.info(
+                        "No chat template found, defaulting to 'string' content format"
+                    )
+
+        # Detect reasoning pattern from chat template
+        if tokenizer_manager.tokenizer:
+            self._force_reasoning = self._detect_reasoning_pattern(
+                tokenizer_manager.tokenizer.chat_template
+            )
+
+    def _load_explicit_chat_template(
+        self, tokenizer_manager, chat_template_arg: str
+    ) -> None:
+        """Load explicitly specified chat template."""
+        logger.info(f"Loading chat template from argument: {chat_template_arg}")
+
+        if chat_template_exists(chat_template_arg):
+            self._chat_template_name = chat_template_arg
+            return
+
+        if not os.path.exists(chat_template_arg):
+            raise RuntimeError(
+                f"Chat template {chat_template_arg} is not a built-in template name "
+                "or a valid chat template file path."
+            )
+
+        if chat_template_arg.endswith(".jinja"):
+            self._load_jinja_template(tokenizer_manager, chat_template_arg)
+        else:
+            self._load_json_chat_template(chat_template_arg)
+
+    def guess_chat_template_from_model_path(self, model_path: str) -> None:
+        """
+        Infer chat template name from model path.
+
+        Args:
+            model_path: Path to the model
+        """
+        template_name = get_conv_template_by_model_path(model_path)
+        if template_name is not None:
+            logger.info(f"Inferred chat template from model path: {template_name}")
+            self._chat_template_name = template_name
+
+    def load_completion_template(self, completion_template_arg: str) -> None:
+        """
+        Load completion template for code completion.
+
+        Args:
+            completion_template_arg: Template name or file path
+        """
+        logger.info(f"Loading completion template: {completion_template_arg}")
+
+        if not completion_template_exists(completion_template_arg):
+            if not os.path.exists(completion_template_arg):
+                raise RuntimeError(
+                    f"Completion template {completion_template_arg} is not a built-in template name "
+                    "or a valid completion template file path."
+                )
+
+            self._load_json_completion_template(completion_template_arg)
+        else:
+            self._completion_template_name = completion_template_arg
+
+    def initialize_templates(
+        self,
+        tokenizer_manager,
+        model_path: str,
+        chat_template: Optional[str] = None,
+        completion_template: Optional[str] = None,
+    ) -> None:
+        """
+        Initialize all templates based on provided configuration.
+
+        Args:
+            tokenizer_manager: The tokenizer manager instance
+            model_path: Path to the model
+            chat_template: Optional chat template name/path
+            completion_template: Optional completion template name/path
+        """
+        # Load chat template
+        self.load_chat_template(tokenizer_manager, chat_template, model_path)
+
+        # Load completion template
+        if completion_template:
+            self.load_completion_template(completion_template)
+
+    def _load_jinja_template(self, tokenizer_manager, template_path: str) -> None:
+        """Load a Jinja template file."""
+        with open(template_path, "r") as f:
+            chat_template = "".join(f.readlines()).strip("\n")
+        tokenizer_manager.tokenizer.chat_template = chat_template.replace("\\n", "\n")
+        self._chat_template_name = None
+        # Detect content format from the loaded template
+        self._jinja_template_content_format = detect_jinja_template_content_format(
+            chat_template
+        )
+        logger.info(
+            f"Detected user specified Jinja chat template with content format: {self._jinja_template_content_format}"
+        )
+
+    def _load_json_chat_template(self, template_path: str) -> None:
+        """Load a JSON chat template file."""
+        assert template_path.endswith(
+            ".json"
+        ), "unrecognized format of chat template file"
+
+        with open(template_path, "r") as filep:
+            template = json.load(filep)
+            try:
+                sep_style = SeparatorStyle[template["sep_style"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown separator style: {template['sep_style']}"
+                ) from None
+
+            register_conv_template(
+                Conversation(
+                    name=template["name"],
+                    system_template=template["system"] + "\n{system_message}",
+                    system_message=template.get("system_message", ""),
+                    roles=(template["user"], template["assistant"]),
+                    sep_style=sep_style,
+                    sep=template.get("sep", "\n"),
+                    stop_str=template["stop_str"],
+                ),
+                override=True,
+            )
+        self._chat_template_name = template["name"]
+
+    def _load_json_completion_template(self, template_path: str) -> None:
+        """Load a JSON completion template file."""
+        assert template_path.endswith(
+            ".json"
+        ), "unrecognized format of completion template file"
+
+        with open(template_path, "r") as filep:
+            template = json.load(filep)
+            try:
+                fim_position = FimPosition[template["fim_position"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown fim position: {template['fim_position']}"
+                ) from None
+
+            register_completion_template(
+                CompletionTemplate(
+                    name=template["name"],
+                    fim_begin_token=template["fim_begin_token"],
+                    fim_middle_token=template["fim_middle_token"],
+                    fim_end_token=template["fim_end_token"],
+                    fim_position=fim_position,
+                ),
+                override=True,
+            )
+        self._completion_template_name = template["name"]
+
+    def _resolve_hf_chat_template(self, tokenizer_manager) -> Optional[str]:
+        """
+        Resolve HuggingFace chat template.
+
+        Returns the chat template string if found, None otherwise.
+        """
+        try:
+            if processor := tokenizer_manager.processor:
+                if hasattr(processor, "chat_template") and processor.chat_template:
+                    return processor.chat_template
+            if tokenizer := tokenizer_manager.tokenizer:
+                if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
+                    return tokenizer.chat_template
+        except Exception as e:
+            logger.debug(f"Error getting chat template: {e}")
+
+        logger.debug("No HuggingFace chat template found")
+        return None
diff --git a/python/sglang/srt/managers/tokenizer_communicator_mixin.py b/python/sglang/srt/managers/tokenizer_communicator_mixin.py
new file mode 100644
index 000000000..e59d3f296
--- /dev/null
+++ b/python/sglang/srt/managers/tokenizer_communicator_mixin.py
@@ -0,0 +1,491 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import time
+from collections import deque
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Deque,
+    Dict,
+    Generic,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+)
+
+import fastapi
+
+from sglang.srt.managers.io_struct import (
+    ClearHiCacheReqInput,
+    ClearHiCacheReqOutput,
+    ExpertDistributionReq,
+    ExpertDistributionReqOutput,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
+    GetInternalStateReq,
+    GetInternalStateReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    LoadLoRAAdapterReqInput,
+    LoadLoRAAdapterReqOutput,
+    LoRAUpdateResult,
+    MultiTokenizerWrapper,
+    ProfileReq,
+    ProfileReqOutput,
+    ProfileReqType,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    SetInternalStateReq,
+    SetInternalStateReqOutput,
+    SlowDownReqInput,
+    SlowDownReqOutput,
+    UnloadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+from sglang.srt.server_args import LoRARef, ServerArgs
+from sglang.srt.utils import get_bool_env_var
+from sglang.utils import TypeBasedDispatcher
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+T = TypeVar("T")
+
+logger = logging.getLogger(__name__)
+
+
+class _Communicator(Generic[T]):
+    """Note: The communicator now only run up to 1 in-flight request at any time."""
+
+    enable_multi_tokenizer = False
+
+    def __init__(self, sender, fan_out: int):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._result_event: Optional[asyncio.Event] = None
+        self._result_values: Optional[List[T]] = None
+        self._ready_queue: Deque[asyncio.Future] = deque()
+
+    async def __call__(self, obj):
+        ready_event = asyncio.Event()
+        if self._result_event is not None or len(self._ready_queue) > 0:
+            self._ready_queue.append(ready_event)
+            await ready_event.wait()
+            assert self._result_event is None
+            assert self._result_values is None
+
+        if obj:
+            if _Communicator.enable_multi_tokenizer:
+                obj = MultiTokenizerWrapper(worker_id=os.getpid(), obj=obj)
+            self._sender.send_pyobj(obj)
+
+        self._result_event = asyncio.Event()
+        self._result_values = []
+        await self._result_event.wait()
+        result_values = self._result_values
+        self._result_event = self._result_values = None
+
+        if len(self._ready_queue) > 0:
+            self._ready_queue.popleft().set()
+
+        return result_values
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_event.set()
+
+
+class TokenizerCommunicatorMixin:
+    """Mixin class for TokenizerManager to handle communication with the scheduler."""
+
+    def init_communicators(self: TokenizerManager, server_args: ServerArgs):
+        # Communicators
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_tensor_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.release_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.resume_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.slow_down_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.flush_cache_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.clear_hicache_storage_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.profile_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.set_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.expert_distribution_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_lora_adapter_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+
+        self._result_dispatcher += self._get_communicator_dispatcher()
+
+    def _get_communicator_dispatcher(self: TokenizerManager):
+        return TypeBasedDispatcher(
+            [
+                (
+                    InitWeightsUpdateGroupReqOutput,
+                    self.init_weights_update_group_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromDistributedReqOutput,
+                    self.update_weights_from_distributed_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromTensorReqOutput,
+                    self.update_weights_from_tensor_communicator.handle_recv,
+                ),
+                (
+                    GetWeightsByNameReqOutput,
+                    self.get_weights_by_name_communicator.handle_recv,
+                ),
+                (
+                    ReleaseMemoryOccupationReqOutput,
+                    self.release_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    ResumeMemoryOccupationReqOutput,
+                    self.resume_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    SlowDownReqOutput,
+                    self.slow_down_communicator.handle_recv,
+                ),
+                (
+                    ClearHiCacheReqOutput,
+                    self.clear_hicache_storage_communicator.handle_recv,
+                ),
+                (
+                    FlushCacheReqOutput,
+                    self.flush_cache_communicator.handle_recv,
+                ),
+                (
+                    ProfileReqOutput,
+                    self.profile_communicator.handle_recv,
+                ),
+                (
+                    GetInternalStateReqOutput,
+                    self.get_internal_state_communicator.handle_recv,
+                ),
+                (
+                    SetInternalStateReqOutput,
+                    self.set_internal_state_communicator.handle_recv,
+                ),
+                (
+                    ExpertDistributionReqOutput,
+                    self.expert_distribution_communicator.handle_recv,
+                ),
+                (
+                    LoRAUpdateResult,
+                    self.update_lora_adapter_communicator.handle_recv,
+                ),
+            ]
+        )
+
+    async def flush_cache(self: TokenizerManager) -> FlushCacheReqOutput:
+        return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
+
+    async def clear_hicache_storage(self: TokenizerManager) -> ClearHiCacheReqOutput:
+        """Clear the hierarchical cache storage."""
+        # Delegate to the scheduler to handle HiCacheStorage clearing
+        return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[
+            0
+        ]
+
+    async def start_profile(
+        self: TokenizerManager,
+        output_dir: Optional[str] = None,
+        start_step: Optional[int] = None,
+        num_steps: Optional[int] = None,
+        activities: Optional[List[str]] = None,
+        with_stack: Optional[bool] = None,
+        record_shapes: Optional[bool] = None,
+        profile_by_stage: bool = False,
+    ):
+        self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
+        req = ProfileReq(
+            type=ProfileReqType.START_PROFILE,
+            output_dir=output_dir,
+            start_step=start_step,
+            num_steps=num_steps,
+            activities=activities,
+            with_stack=with_stack,
+            record_shapes=record_shapes,
+            profile_by_stage=profile_by_stage,
+            profile_id=str(time.time()),
+        )
+        return await self._execute_profile(req)
+
+    async def stop_profile(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
+        return await self._execute_profile(req)
+
+    async def _execute_profile(self: TokenizerManager, req: ProfileReq):
+        result = (await self.profile_communicator(req))[0]
+        if not result.success:
+            raise RuntimeError(result.message)
+        return result
+
+    async def start_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        await self.expert_distribution_communicator(ExpertDistributionReq.START_RECORD)
+
+    async def stop_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        await self.expert_distribution_communicator(ExpertDistributionReq.STOP_RECORD)
+
+    async def dump_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        await self.expert_distribution_communicator(ExpertDistributionReq.DUMP_RECORD)
+
+    async def init_weights_update_group(
+        self: TokenizerManager,
+        obj: InitWeightsUpdateGroupReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for init parameter update group"
+        result = (await self.init_weights_update_group_communicator(obj))[0]
+        return result.success, result.message
+
+    async def update_weights_from_distributed(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromDistributedReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # This means that weight sync
+        # cannot run while requests are in progress.
+        async with self.model_update_lock.writer_lock:
+            result = (await self.update_weights_from_distributed_communicator(obj))[0]
+            return result.success, result.message
+
+    async def update_weights_from_tensor(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromTensorReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # This means that weight sync
+        # cannot run while requests are in progress.
+        async with self.model_update_lock.writer_lock:
+            result = (await self.update_weights_from_tensor_communicator(obj))[0]
+            return result.success, result.message
+
+    async def load_lora_adapter(
+        self: TokenizerManager,
+        obj: LoadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> LoadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start load Lora adapter. Lora name=%s, path=%s",
+                obj.lora_name,
+                obj.lora_path,
+            )
+
+            async with self.lora_update_lock:
+                if (
+                    self.server_args.max_loaded_loras is not None
+                    and self.lora_registry.num_registered_loras
+                    >= self.server_args.max_loaded_loras
+                ):
+                    raise ValueError(
+                        f"Cannot load LoRA adapter {obj.lora_name} at path {obj.lora_path}. "
+                        f"Maximum number of loaded LoRA adapters is {self.server_args.max_loaded_loras}. "
+                        "Please unload some LoRA adapters before loading new ones."
+                    )
+
+                # Generate new uniquely identifiable LoRARef object.
+                new_adapter = LoRARef(
+                    lora_name=obj.lora_name,
+                    lora_path=obj.lora_path,
+                    pinned=obj.pinned,
+                )
+
+                # Trigger the actual loading operation at the backend processes.
+                obj.lora_id = new_adapter.lora_id
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                # Register the LoRA adapter only after loading is successful.
+                if result.success:
+                    await self.lora_registry.register(new_adapter)
+
+                return result
+        except ValueError as e:
+            return LoadLoRAAdapterReqOutput(
+                success=False,
+                error_message=str(e),
+            )
+
+    async def unload_lora_adapter(
+        self: TokenizerManager,
+        obj: UnloadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> UnloadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            assert (
+                obj.lora_name is not None
+            ), "lora_name must be provided to unload LoRA adapter"
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start unload Lora adapter. Lora name=%s",
+                obj.lora_name,
+            )
+
+            async with self.lora_update_lock:
+                # Unregister the LoRA adapter from the registry to stop new requests for this adapter
+                # from being started.
+                lora_id = await self.lora_registry.unregister(obj.lora_name)
+                obj.lora_id = lora_id
+
+                # Initiate the actual unloading operation at the backend processes only after all
+                # ongoing requests using this LoRA adapter are finished.
+                await self.lora_registry.wait_for_unload(lora_id)
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                return result
+        except ValueError as e:
+            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))
+
+    async def get_weights_by_name(
+        self: TokenizerManager,
+        obj: GetWeightsByNameReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
+        if self.server_args.dp_size == 1:
+            return all_parameters[0]
+        else:
+            return all_parameters
+
+    async def release_memory_occupation(
+        self: TokenizerManager,
+        obj: ReleaseMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.release_memory_occupation_communicator(obj)
+
+    async def resume_memory_occupation(
+        self: TokenizerManager,
+        obj: ResumeMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.resume_memory_occupation_communicator(obj)
+
+    async def slow_down(
+        self: TokenizerManager,
+        obj: SlowDownReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.slow_down_communicator(obj)
+
+    async def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]:
+        req = GetInternalStateReq()
+        responses: List[GetInternalStateReqOutput] = (
+            await self.get_internal_state_communicator(req)
+        )
+        # Many DP ranks
+        return [res.internal_state for res in responses]
+
+    async def set_internal_state(
+        self: TokenizerManager, obj: SetInternalStateReq
+    ) -> List[bool]:
+        responses: List[SetInternalStateReqOutput] = (
+            await self.set_internal_state_communicator(obj)
+        )
+        return [res.updated for res in responses]
+
+    async def get_load(self: TokenizerManager) -> dict:
+        # TODO(lsyin): fake load report server
+        if not self.current_load_lock.locked():
+            async with self.current_load_lock:
+                internal_state = await self.get_internal_state()
+                self.current_load = internal_state[0]["load"]
+        return {"load": self.current_load}
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
new file mode 100644
index 000000000..4812ca180
--- /dev/null
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -0,0 +1,1715 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TokenizerManager is a process that tokenizes the text."""
+
+import asyncio
+import copy
+import dataclasses
+import json
+import logging
+import math
+import os
+import pickle
+import signal
+import sys
+import threading
+import time
+import uuid
+from collections import deque
+from contextlib import nullcontext
+from datetime import datetime
+from enum import Enum
+from http import HTTPStatus
+from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+
+import fastapi
+import torch
+import uvloop
+import zmq
+import zmq.asyncio
+from fastapi import BackgroundTasks
+
+from sglang.srt.aio_rwlock import RWLock
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.hf_transformers_utils import (
+    get_processor,
+    get_tokenizer,
+    get_tokenizer_from_processor,
+)
+from sglang.srt.lora.lora_registry import LoRARef, LoRARegistry
+from sglang.srt.managers.disagg_service import start_disagg_service
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    BatchEmbeddingOut,
+    BatchMultimodalOut,
+    BatchStrOut,
+    BatchTokenIDOut,
+    BatchTokenizedEmbeddingReqInput,
+    BatchTokenizedGenerateReqInput,
+    CloseSessionReqInput,
+    ConfigureLoggingReq,
+    EmbeddingReqInput,
+    FreezeGCReq,
+    GenerateReqInput,
+    HealthCheckOutput,
+    MultiTokenizerWrapper,
+    OpenSessionReqInput,
+    OpenSessionReqOutput,
+    SessionParams,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+)
+from sglang.srt.managers.mm_utils import TensorTransportMode
+from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
+from sglang.srt.managers.scheduler import is_health_check_generate_req
+from sglang.srt.managers.scheduler_input_blocker import input_blocker_guard_region
+from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicatorMixin
+from sglang.srt.metrics.collector import TokenizerMetricsCollector
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    configure_gc_warning,
+    dataclass_to_string_truncated,
+    freeze_gc,
+    get_bool_env_var,
+    get_origin_rid,
+    get_zmq_socket,
+    kill_process_tree,
+)
+from sglang.utils import TypeBasedDispatcher, get_exception_traceback
+
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class ReqState:
+    """Store the state a request."""
+
+    out_list: List[Dict[Any, Any]]
+    finished: bool
+    event: asyncio.Event
+    obj: Union[GenerateReqInput, EmbeddingReqInput]
+
+    # For metrics
+    created_time: float
+    finished_time: float = 0.0
+    first_token_time: float = 0.0
+    last_time: float = 0.0
+    last_completion_tokens: int = 1
+
+    # For streaming output
+    last_output_offset: int = 0
+
+    # For incremental state update.
+    # TODO(lianmin): do not initialize some lists if not needed.
+    text: str = ""
+    output_ids: List[int] = dataclasses.field(default_factory=list)
+    input_token_logprobs_val: List[float] = dataclasses.field(default_factory=list)
+    input_token_logprobs_idx: List[int] = dataclasses.field(default_factory=list)
+    output_token_logprobs_val: List[float] = dataclasses.field(default_factory=list)
+    output_token_logprobs_idx: List[int] = dataclasses.field(default_factory=list)
+    input_top_logprobs_val: List[List[float]] = dataclasses.field(default_factory=list)
+    input_top_logprobs_idx: List[List[int]] = dataclasses.field(default_factory=list)
+    output_top_logprobs_val: List[List[float]] = dataclasses.field(default_factory=list)
+    output_top_logprobs_idx: List[List[int]] = dataclasses.field(default_factory=list)
+    input_token_ids_logprobs_val: List = dataclasses.field(default_factory=list)
+    input_token_ids_logprobs_idx: List = dataclasses.field(default_factory=list)
+    output_token_ids_logprobs_val: List = dataclasses.field(default_factory=list)
+    output_token_ids_logprobs_idx: List = dataclasses.field(default_factory=list)
+
+
+class TokenizerManager(TokenizerCommunicatorMixin):
+    """TokenizerManager is a process that tokenizes the text."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        # Parse args
+        self.server_args = server_args
+        self.enable_metrics = server_args.enable_metrics
+        self.log_requests = server_args.log_requests
+        self.log_requests_level = server_args.log_requests_level
+        self.preferred_sampling_params = (
+            json.loads(server_args.preferred_sampling_params)
+            if server_args.preferred_sampling_params
+            else None
+        )
+        self.crash_dump_folder = server_args.crash_dump_folder
+
+        # Read model args
+        self.model_path = server_args.model_path
+        self.served_model_name = server_args.served_model_name
+        self.model_config = ModelConfig.from_server_args(server_args)
+        self.is_generation = self.model_config.is_generation
+        self.is_image_gen = self.model_config.is_image_gen
+        self.context_len = self.model_config.context_len
+        self.image_token_id = self.model_config.image_token_id
+        self.max_req_input_len = None  # Will be set later in engine.py
+
+        if self.model_config.is_multimodal:
+            import_processors()
+            try:
+                _processor = get_processor(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                    use_fast=not server_args.disable_fast_image_processor,
+                )
+            except ValueError as e:
+                error_message = str(e)
+                if "does not have a slow version" in error_message:
+                    logger.info(
+                        f"Processor {server_args.tokenizer_path} does not have a slow version. Automatically use fast version"
+                    )
+                    _processor = get_processor(
+                        server_args.tokenizer_path,
+                        tokenizer_mode=server_args.tokenizer_mode,
+                        trust_remote_code=server_args.trust_remote_code,
+                        revision=server_args.revision,
+                        use_fast=True,
+                    )
+                else:
+                    raise e
+            transport_mode = _determine_tensor_transport_mode(self.server_args)
+
+            # We want to parallelize the image pre-processing so we create an executor for it
+            # We create mm_processor for any skip_tokenizer_init to make sure we still encode
+            # images even with skip_tokenizer_init=False.
+            self.mm_processor = get_mm_processor(
+                self.model_config.hf_config, server_args, _processor, transport_mode
+            )
+
+            if server_args.skip_tokenizer_init:
+                self.tokenizer = self.processor = None
+            else:
+                self.processor = _processor
+                self.tokenizer = get_tokenizer_from_processor(self.processor)
+                os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        else:
+            self.mm_processor = self.processor = None
+
+            if server_args.skip_tokenizer_init:
+                self.tokenizer = None
+            else:
+                self.tokenizer = get_tokenizer(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+
+        # Init inter-process communication
+        context = zmq.asyncio.Context(2)
+        self.recv_from_detokenizer = get_zmq_socket(
+            context, zmq.PULL, port_args.tokenizer_ipc_name, True
+        )
+        if self.server_args.tokenizer_worker_num > 1:
+            # Use tokenizer_worker_ipc_name in multi-tokenizer mode
+            self.send_to_scheduler = get_zmq_socket(
+                context, zmq.PUSH, port_args.tokenizer_worker_ipc_name, False
+            )
+        else:
+            self.send_to_scheduler = get_zmq_socket(
+                context, zmq.PUSH, port_args.scheduler_input_ipc_name, True
+            )
+
+        # Request states
+        self.no_create_loop = False
+        self.rid_to_state: Dict[str, ReqState] = {}
+        self.asyncio_tasks = set()
+
+        # Health check
+        self.server_status = ServerStatus.Starting
+        self.gracefully_exit = False
+        self.last_receive_tstamp = 0
+
+        # Dumping
+        self.dump_requests_folder = ""  # By default do not dump
+        self.dump_requests_threshold = 1000
+        self.dump_request_list: List[Tuple] = []
+        self.log_request_metadata = self.get_log_request_metadata()
+        self.crash_dump_request_list: deque[Tuple] = deque()
+        self.crash_dump_performed = False  # Flag to ensure dump is only called once
+
+        # Session
+        self.session_futures = {}  # session_id -> asyncio event
+
+        # Weight updates
+        # The event to notify the weight sync is finished.
+        self.model_update_lock = RWLock()
+        self.model_update_result: Optional[Awaitable[UpdateWeightFromDiskReqOutput]] = (
+            None
+        )
+        self.is_pause = False
+        self.is_pause_cond = asyncio.Condition()
+
+        # LoRA
+        # Initialize the `LoRARegistry` with initial LoRA adapter paths provided in `server_args`.
+        # The registry dynamically updates as adapters are loaded / unloaded during runtime. It
+        # serves as the source of truth for available adapters and maps user-friendly LoRA names
+        # to internally used unique LoRA IDs.
+        self.lora_registry = LoRARegistry(self.server_args.lora_paths)
+        # Lock to serialize LoRA update operations.
+        # Please note that, unlike `model_update_lock`, this does not block inference, allowing
+        # LoRA updates and inference to overlap.
+        self.lora_update_lock = asyncio.Lock()
+
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
+        self.bootstrap_server = start_disagg_service(self.server_args)
+
+        # For load balancing
+        self.current_load = 0
+        self.current_load_lock = asyncio.Lock()
+
+        # Metrics
+        if self.enable_metrics:
+            self.metrics_collector = TokenizerMetricsCollector(
+                server_args=server_args,
+                labels={
+                    "model_name": self.server_args.served_model_name,
+                    # TODO: Add lora name/path in the future,
+                },
+                bucket_time_to_first_token=self.server_args.bucket_time_to_first_token,
+                bucket_e2e_request_latency=self.server_args.bucket_e2e_request_latency,
+                bucket_inter_token_latency=self.server_args.bucket_inter_token_latency,
+                collect_tokens_histogram=self.server_args.collect_tokens_histogram,
+            )
+
+        # Configure GC warning
+        if self.server_args.gc_warning_threshold_secs > 0.0:
+            configure_gc_warning(self.server_args.gc_warning_threshold_secs)
+
+        self._result_dispatcher = TypeBasedDispatcher(
+            [
+                (
+                    (
+                        BatchStrOut,
+                        BatchEmbeddingOut,
+                        BatchTokenIDOut,
+                        BatchMultimodalOut,
+                    ),
+                    self._handle_batch_output,
+                ),
+                (AbortReq, self._handle_abort_req),
+                (OpenSessionReqOutput, self._handle_open_session_req_output),
+                (
+                    UpdateWeightFromDiskReqOutput,
+                    self._handle_update_weights_from_disk_req_output,
+                ),
+                (
+                    FreezeGCReq,
+                    lambda x: None,
+                ),  # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it.
+                (HealthCheckOutput, lambda x: None),
+            ]
+        )
+
+        self.init_communicators(server_args)
+
+    async def generate_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        request: Optional[fastapi.Request] = None,
+    ):
+        created_time = time.time()
+        self.auto_create_handle_loop()
+        obj.normalize_batch_and_arguments()
+
+        if self.server_args.tokenizer_worker_num > 1:
+            # Modify rid, add worker_id
+            if isinstance(obj.rid, list):
+                # If it's an array, add worker_id prefix to each element
+                obj.rid = [f"{self.worker_id}_{rid}" for rid in obj.rid]
+            else:
+                # If it's a single value, add worker_id prefix
+                obj.rid = f"{self.worker_id}_{obj.rid}"
+
+        if self.log_requests:
+            max_length, skip_names, _ = self.log_request_metadata
+            logger.info(
+                f"Receive: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}"
+            )
+
+        async with self.is_pause_cond:
+            await self.is_pause_cond.wait_for(lambda: not self.is_pause)
+
+        async with self.model_update_lock.reader_lock:
+            if self.server_args.enable_lora and obj.lora_path:
+                # Look up the LoRA ID from the registry and start tracking ongoing LoRA requests.
+                obj.lora_id = await self.lora_registry.acquire(obj.lora_path)
+
+            if obj.is_single:
+                tokenized_obj = await self._tokenize_one_request(obj)
+                state = self._send_one_request(obj, tokenized_obj, created_time)
+                async for response in self._wait_one_response(obj, state, request):
+                    yield response
+            else:
+                async for response in self._handle_batch_request(
+                    obj, request, created_time
+                ):
+                    yield response
+
+    async def _tokenize_one_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+    ):
+        """Tokenize one request."""
+        # Tokenize
+        input_embeds = None
+        input_text = obj.text
+        token_type_ids = None
+        is_cross_encoder_request = (
+            isinstance(obj, EmbeddingReqInput) and obj.is_cross_encoder_request
+        )
+        if obj.input_embeds is not None:
+            if not self.server_args.disable_radix_cache:
+                raise ValueError(
+                    "input_embeds is provided while disable_radix_cache is False. "
+                    "Please add `--disable-radix-cache` when you launch the server "
+                    "if you want to use input_embeds as inputs."
+                )
+            input_embeds = obj.input_embeds
+            input_ids = obj.input_ids
+        elif obj.input_ids is not None:
+            input_ids = obj.input_ids
+        else:
+            if self.tokenizer is None:
+                raise ValueError(
+                    "The engine initialized with skip_tokenizer_init=True cannot "
+                    "accept text prompts. Please provide input_ids or re-initialize "
+                    "the engine with skip_tokenizer_init=False."
+                )
+            encoded = self.tokenizer(
+                input_text, return_token_type_ids=is_cross_encoder_request
+            )
+
+            input_ids = encoded["input_ids"]
+            if is_cross_encoder_request:
+                input_ids = encoded["input_ids"][0]
+                token_type_ids = encoded.get("token_type_ids", [None])[0]
+
+        if self.mm_processor and obj.contains_mm_input():
+            if not isinstance(obj.image_data, list):
+                obj.image_data = [obj.image_data]
+            if not isinstance(obj.audio_data, list):
+                obj.audio_data = [obj.audio_data]
+            mm_inputs: Dict = await self.mm_processor.process_mm_data_async(
+                image_data=obj.image_data,
+                audio_data=obj.audio_data,
+                input_text=input_text or input_ids,
+                request_obj=obj,
+                max_req_input_len=self.max_req_input_len,
+            )
+            if mm_inputs and "input_ids" in mm_inputs:
+                input_ids = mm_inputs["input_ids"]
+        else:
+            mm_inputs = None
+
+        self._validate_one_request(obj, input_ids)
+        return self._create_tokenized_object(
+            obj, input_text, input_ids, input_embeds, mm_inputs, token_type_ids
+        )
+
+    def _validate_one_request(
+        self, obj: Union[GenerateReqInput, EmbeddingReqInput], input_ids: List[int]
+    ) -> None:
+        """Validates that the input token count and the requested token count doesn't exceed the model's context length."""
+        # FIXME: unify the length validation logic with the one in the scheduler.
+        _max_req_len = self.context_len
+
+        input_token_num = len(input_ids) if input_ids is not None else 0
+        if input_token_num >= self.context_len:
+            if self.server_args.allow_auto_truncate:
+                logger.warning(
+                    f"The input ({input_token_num} tokens) is longer than the "
+                    f"model's context length ({self.context_len} tokens). "
+                    "Truncating the input."
+                )
+                del input_ids[_max_req_len:]
+                input_token_num = len(input_ids)
+            else:
+                raise ValueError(
+                    f"The input ({input_token_num} tokens) is longer than the "
+                    f"model's context length ({self.context_len} tokens)."
+                )
+
+        if isinstance(obj, EmbeddingReqInput) and self.is_generation:
+            raise ValueError(
+                "This model does not appear to be an embedding model by default. "
+                "Please add `--is-embedding` when launching the server or try another model."
+            )
+
+        # Check total tokens (input + max_new_tokens)
+        max_new_tokens = obj.sampling_params.get("max_new_tokens")
+        if (
+            max_new_tokens is not None
+            and (max_new_tokens + input_token_num) >= _max_req_len
+        ):
+            if self.server_args.allow_auto_truncate:
+                logger.warning(
+                    f"Requested token count ({input_token_num} input + {max_new_tokens} new) "
+                    f"exceeds the model's context length ({self.context_len} tokens). "
+                    "Truncating max_new_tokens."
+                )
+                obj.sampling_params["max_new_tokens"] = max(
+                    0, _max_req_len - input_token_num
+                )
+            else:
+                total_tokens = max_new_tokens + input_token_num
+                error_msg = (
+                    f"Requested token count exceeds the model's maximum context length "
+                    f"of {self.context_len} tokens. You requested a total of {total_tokens} "
+                    f"tokens: {input_token_num} tokens from the input messages and "
+                    f"{max_new_tokens} tokens for the completion. Please reduce the number "
+                    f"of tokens in the input messages or the completion to fit within the limit."
+                )
+                raise ValueError(error_msg)
+
+        if isinstance(obj, GenerateReqInput):
+            if (
+                obj.return_hidden_states
+                and not self.server_args.enable_return_hidden_states
+            ):
+                raise ValueError(
+                    "The server is not configured to return the hidden states. "
+                    "Please set `--enable-return-hidden-states` to enable this feature."
+                )
+            if (
+                obj.custom_logit_processor
+                and not self.server_args.enable_custom_logit_processor
+            ):
+                raise ValueError(
+                    "The server is not configured to enable custom logit processor. "
+                    "Please set `--enable-custom-logits-processor` to enable this feature."
+                )
+
+    def _validate_input_ids_in_vocab(
+        self, input_ids: List[int], vocab_size: int
+    ) -> None:
+        if any(id >= vocab_size for id in input_ids):
+            raise ValueError(
+                f"The input_ids {input_ids} contains values greater than the vocab size ({vocab_size})."
+            )
+
+    def _create_tokenized_object(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        input_text: str,
+        input_ids: List[int],
+        input_embeds: Optional[Union[List[float], None]] = None,
+        mm_inputs: Optional[Dict] = None,
+        token_type_ids: Optional[List[int]] = None,
+    ) -> Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]:
+        """Create a tokenized request object from common parameters."""
+        # Parse sampling parameters
+        # Note: if there are preferred sampling params, we use them if they are not
+        # explicitly passed in sampling_params
+        if self.preferred_sampling_params:
+            sampling_kwargs = {**self.preferred_sampling_params, **obj.sampling_params}
+        else:
+            sampling_kwargs = obj.sampling_params
+        sampling_params = SamplingParams(**sampling_kwargs)
+        sampling_params.normalize(self.tokenizer)
+        sampling_params.verify(self.model_config.vocab_size)
+
+        # Build return object
+        if isinstance(obj, GenerateReqInput):
+            session_params = (
+                SessionParams(**obj.session_params) if obj.session_params else None
+            )
+
+            tokenized_obj = TokenizedGenerateReqInput(
+                obj.rid,
+                input_text,
+                input_ids,
+                mm_inputs,
+                sampling_params,
+                obj.return_logprob,
+                obj.logprob_start_len,
+                obj.top_logprobs_num,
+                obj.token_ids_logprob,
+                obj.stream,
+                bootstrap_host=obj.bootstrap_host,
+                bootstrap_port=obj.bootstrap_port,
+                bootstrap_room=obj.bootstrap_room,
+                lora_id=obj.lora_id,
+                input_embeds=input_embeds,
+                session_params=session_params,
+                custom_logit_processor=obj.custom_logit_processor,
+                return_hidden_states=obj.return_hidden_states,
+                data_parallel_rank=obj.data_parallel_rank,
+            )
+        elif isinstance(obj, EmbeddingReqInput):
+            tokenized_obj = TokenizedEmbeddingReqInput(
+                obj.rid,
+                input_text,
+                input_ids,
+                mm_inputs,
+                token_type_ids,
+                sampling_params,
+            )
+
+        return tokenized_obj
+
+    async def _batch_tokenize_and_process(
+        self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> List[Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]]:
+        """Handle batch tokenization for text inputs only."""
+        logger.debug(f"Starting batch tokenization for {batch_size} text requests")
+
+        # Collect requests and texts
+        requests = [obj[i] for i in range(batch_size)]
+        texts = [req.text for req in requests]
+
+        # Batch tokenize all texts
+        encoded = self.tokenizer(texts)
+        input_ids_list = encoded["input_ids"]
+
+        # Process all requests
+        tokenized_objs = []
+        for i, req in enumerate(requests):
+            self._validate_one_request(obj[i], input_ids_list[i])
+            tokenized_objs.append(
+                self._create_tokenized_object(
+                    req, req.text, input_ids_list[i], None, None
+                )
+            )
+        logger.debug(f"Completed batch processing for {batch_size} requests")
+        return tokenized_objs
+
+    def _validate_batch_tokenization_constraints(
+        self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> None:
+        """Validate constraints for batch tokenization processing."""
+        for i in range(batch_size):
+            if self.is_generation and obj[i].contains_mm_input():
+                raise ValueError(
+                    "For multimodal input processing do not set `enable_tokenizer_batch_encode`."
+                )
+            if obj[i].input_ids is not None:
+                raise ValueError(
+                    "Batch tokenization is not needed for pre-tokenized input_ids. Do not set `enable_tokenizer_batch_encode`."
+                )
+            if obj[i].input_embeds is not None:
+                raise ValueError(
+                    "Batch tokenization is not needed for input_embeds. Do not set `enable_tokenizer_batch_encode`."
+                )
+
+    def _send_one_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        tokenized_obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput],
+        created_time: Optional[float] = None,
+    ):
+        self.send_to_scheduler.send_pyobj(tokenized_obj)
+        state = ReqState([], False, asyncio.Event(), obj, created_time=created_time)
+        self.rid_to_state[obj.rid] = state
+        return state
+
+    def _send_batch_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        tokenized_objs: List[
+            Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]
+        ],
+        created_time: Optional[float] = None,
+    ):
+        """Send a batch of tokenized requests as a single batched request to the scheduler."""
+        if isinstance(tokenized_objs[0], TokenizedGenerateReqInput):
+            batch_req = BatchTokenizedGenerateReqInput(batch=tokenized_objs)
+        else:
+            batch_req = BatchTokenizedEmbeddingReqInput(batch=tokenized_objs)
+
+        self.send_to_scheduler.send_pyobj(batch_req)
+
+        # Create states for each individual request in the batch
+        for i, tokenized_obj in enumerate(tokenized_objs):
+            tmp_obj = obj[i]
+            state = ReqState(
+                [], False, asyncio.Event(), tmp_obj, created_time=created_time
+            )
+            self.rid_to_state[tmp_obj.rid] = state
+
+    async def _wait_one_response(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        state: ReqState,
+        request: Optional[fastapi.Request] = None,
+    ):
+        """Wait for the response of one request."""
+        while True:
+            try:
+                await asyncio.wait_for(state.event.wait(), timeout=4)
+            except asyncio.TimeoutError:
+                if (
+                    request is not None
+                    and not obj.background
+                    and await request.is_disconnected()
+                ):
+                    # Abort the request for disconnected requests (non-streaming, waiting queue)
+                    self.abort_request(obj.rid)
+                    # Use exception to kill the whole call stack and asyncio task
+                    raise ValueError(
+                        f"Request is disconnected from the client side (type 1). Abort request {obj.rid=}"
+                    )
+                continue
+
+            out = state.out_list[-1]
+
+            state.out_list = []
+            if state.finished:
+                if self.log_requests:
+                    max_length, skip_names, out_skip_names = self.log_request_metadata
+                    if self.model_config.is_multimodal_gen:
+                        msg = f"Finish: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}"
+                    else:
+                        msg = f"Finish: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}, out={dataclass_to_string_truncated(out, max_length, skip_names=out_skip_names)}"
+                    logger.info(msg)
+
+                # Check if this was an abort/error created by scheduler
+                if isinstance(out["meta_info"].get("finish_reason"), dict):
+                    finish_reason = out["meta_info"]["finish_reason"]
+                    if (
+                        finish_reason.get("type") == "abort"
+                        and finish_reason.get("status_code") == HTTPStatus.BAD_REQUEST
+                    ):
+                        raise ValueError(finish_reason["message"])
+
+                    if finish_reason.get("type") == "abort" and finish_reason.get(
+                        "status_code"
+                    ) in (
+                        HTTPStatus.SERVICE_UNAVAILABLE,
+                        HTTPStatus.INTERNAL_SERVER_ERROR,
+                    ):
+                        # This is an abort request initiated by scheduler.
+                        # Delete the key to prevent resending abort request to the scheduler and
+                        # to ensure aborted request state is cleaned up.
+                        if state.obj.rid in self.rid_to_state:
+                            del self.rid_to_state[state.obj.rid]
+
+                        # Mark ongoing LoRA request as finished.
+                        if self.server_args.enable_lora and state.obj.lora_path:
+                            await self.lora_registry.release(state.obj.lora_id)
+
+                        raise fastapi.HTTPException(
+                            status_code=finish_reason["status_code"],
+                            detail=finish_reason["message"],
+                        )
+                yield out
+                break
+
+            state.event.clear()
+
+            if obj.stream:
+                yield out
+            else:
+                if (
+                    request is not None
+                    and not obj.background
+                    and await request.is_disconnected()
+                ):
+                    # Abort the request for disconnected requests (non-streaming, running)
+                    self.abort_request(obj.rid)
+                    # Use exception to kill the whole call stack and asyncio task
+                    raise ValueError(
+                        f"Request is disconnected from the client side (type 3). Abort request {obj.rid=}"
+                    )
+
+    async def _handle_batch_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        request: Optional[fastapi.Request] = None,
+        created_time: Optional[float] = None,
+    ):
+        batch_size = obj.batch_size
+
+        generators = []
+        rids = []
+        if getattr(obj, "parallel_sample_num", 1) == 1:
+            if self.server_args.enable_tokenizer_batch_encode:
+                # Validate batch tokenization constraints
+                self._validate_batch_tokenization_constraints(batch_size, obj)
+
+                tokenized_objs = await self._batch_tokenize_and_process(batch_size, obj)
+
+                # Send as a single batched request
+                self._send_batch_request(obj, tokenized_objs, created_time)
+
+                # Set up generators for each request in the batch
+                for i in range(batch_size):
+                    tmp_obj = obj[i]
+                    generators.append(
+                        self._wait_one_response(
+                            tmp_obj, self.rid_to_state[tmp_obj.rid], request
+                        )
+                    )
+                    rids.append(tmp_obj.rid)
+            else:
+                # Sequential tokenization and processing
+                with (
+                    input_blocker_guard_region(send_to_scheduler=self.send_to_scheduler)
+                    if get_bool_env_var("SGLANG_ENABLE_COLOCATED_BATCH_GEN")
+                    else nullcontext()
+                ):
+                    for i in range(batch_size):
+                        tmp_obj = obj[i]
+                        tokenized_obj = await self._tokenize_one_request(tmp_obj)
+                        state = self._send_one_request(
+                            tmp_obj, tokenized_obj, created_time
+                        )
+                        generators.append(
+                            self._wait_one_response(tmp_obj, state, request)
+                        )
+                        rids.append(tmp_obj.rid)
+        else:
+            # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
+            if batch_size > 128:
+                logger.warning(
+                    "Sending a single large batch with parallel sampling (n > 1) has not been well optimized. "
+                    "The performance might be better if you just duplicate the requests n times or use "
+                    "many threads to send them one by one with parallel sampling (n > 1)."
+                )
+
+            # Tokenize all requests
+            objs = [obj[i] for i in range(batch_size)]
+            tokenized_objs = await asyncio.gather(
+                *(self._tokenize_one_request(obj) for obj in objs)
+            )
+
+            # Cache the common prefix for parallel sampling
+            for i in range(batch_size):
+                tmp_obj = copy.copy(objs[i])
+                tokenized_obj = copy.copy(tokenized_objs[i])
+                tokenized_obj.rid = tmp_obj.regenerate_rid()
+                tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
+                tokenized_obj.sampling_params.max_new_tokens = 0
+                tokenized_obj.stream = False
+                state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                await self._wait_one_response(tmp_obj, state, request).__anext__()
+
+            # Expand requests, assign new rids for them, and send them
+            for i in range(batch_size):
+                for _ in range(obj.parallel_sample_num):
+                    tmp_obj = copy.copy(objs[i])
+                    tokenized_obj = copy.copy(tokenized_objs[i])
+                    tokenized_obj.rid = tmp_obj.regenerate_rid()
+                    state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, state, request))
+                    rids.append(tmp_obj.rid)
+
+        # Wait for all requests
+        is_stream = hasattr(obj, "stream") and obj.stream
+        if not is_stream:
+            outputs = await asyncio.gather(*(gen.__anext__() for gen in generators))
+            yield outputs
+        else:
+            rid_to_index = {rid: i for i, rid in enumerate(rids)}
+            task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
+            while task_map:
+                done, _ = await asyncio.wait(
+                    task_map.keys(), return_when=asyncio.FIRST_COMPLETED
+                )
+
+                for task in done:
+                    gen = task_map.pop(task)
+                    try:
+                        result = task.result()
+                        result["index"] = rid_to_index[result["meta_info"]["id"]]
+                        yield result
+                        new_task = asyncio.create_task(gen.__anext__())
+                        task_map[new_task] = gen
+                    except StopAsyncIteration:
+                        pass
+
+    def abort_request(self, rid: str = "", abort_all: bool = False):
+        if not abort_all and rid not in self.rid_to_state:
+            return
+        req = AbortReq(rid, abort_all)
+        self.send_to_scheduler.send_pyobj(req)
+
+        if self.enable_metrics:
+            self.metrics_collector.observe_one_aborted_request()
+
+    async def pause_generation(self):
+        async with self.is_pause_cond:
+            self.is_pause = True
+            self.abort_request(abort_all=True)
+
+    async def continue_generation(self):
+        async with self.is_pause_cond:
+            self.is_pause = False
+            self.is_pause_cond.notify_all()
+
+    async def update_weights_from_disk(
+        self,
+        obj: UpdateWeightFromDiskReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+
+        # default the load format to the server_args
+        if obj.load_format is None:
+            obj.load_format = self.server_args.load_format
+        logger.info("Start update_weights. Load format=%s", obj.load_format)
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        if True:  # Keep this redundant check to simplify some internal code sync
+            # Hold the lock if it is not async. This means that weight sync
+            # cannot run while requests are in progress.
+            async with self.model_update_lock.writer_lock:
+                return await self._wait_for_model_update_from_disk(obj)
+
+    async def _wait_for_model_update_from_disk(
+        self, obj: UpdateWeightFromDiskReqInput
+    ) -> Tuple[bool, str]:
+        if self.server_args.tokenizer_worker_num > 1:
+            obj = MultiTokenizerWrapper(self.worker_id, obj)
+        self.send_to_scheduler.send_pyobj(obj)
+        self.model_update_result = asyncio.Future()
+        if self.server_args.dp_size == 1:
+            result = await self.model_update_result
+            if result.success:
+                self.served_model_name = obj.model_path
+                self.server_args.model_path = obj.model_path
+                self.server_args.load_format = obj.load_format
+                self.model_path = obj.model_path
+            return result.success, result.message, result.num_paused_requests
+        else:  # self.server_args.dp_size > 1
+            self.model_update_tmp = []
+            result = await self.model_update_result
+
+            all_success = all([r.success for r in result])
+            if all_success is True:
+                self.server_args.model_path = obj.model_path
+                self.server_args.load_format = obj.load_format
+                self.model_path = obj.model_path
+            all_message = [r.message for r in result]
+            all_message = " | ".join(all_message)
+            all_paused_requests = [r.num_paused_requests for r in result]
+            return all_success, all_message, all_paused_requests
+
+    async def open_session(
+        self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
+    ):
+        self.auto_create_handle_loop()
+
+        if obj.session_id is None:
+            obj.session_id = uuid.uuid4().hex
+        elif obj.session_id in self.session_futures:
+            return None
+
+        if self.server_args.tokenizer_worker_num > 1:
+            obj = MultiTokenizerWrapper(self.worker_id, obj)
+        self.send_to_scheduler.send_pyobj(obj)
+
+        self.session_futures[obj.session_id] = asyncio.Future()
+        session_id = await self.session_futures[obj.session_id]
+        del self.session_futures[obj.session_id]
+        return session_id
+
+    async def close_session(
+        self, obj: CloseSessionReqInput, request: Optional[fastapi.Request] = None
+    ):
+        await self.send_to_scheduler.send_pyobj(obj)
+
+    def get_log_request_metadata(self):
+        max_length = None
+        skip_names = None
+        out_skip_names = None
+        if self.log_requests:
+            if self.log_requests_level == 0:
+                max_length = 1 << 30
+                skip_names = set(
+                    [
+                        "text",
+                        "input_ids",
+                        "input_embeds",
+                        "image_data",
+                        "audio_data",
+                        "lora_path",
+                        "sampling_params",
+                    ]
+                )
+                out_skip_names = set(
+                    [
+                        "text",
+                        "output_ids",
+                        "embedding",
+                    ]
+                )
+            elif self.log_requests_level == 1:
+                max_length = 1 << 30
+                skip_names = set(
+                    [
+                        "text",
+                        "input_ids",
+                        "input_embeds",
+                        "image_data",
+                        "audio_data",
+                        "lora_path",
+                    ]
+                )
+                out_skip_names = set(
+                    [
+                        "text",
+                        "output_ids",
+                        "embedding",
+                    ]
+                )
+            elif self.log_requests_level == 2:
+                max_length = 2048
+            elif self.log_requests_level == 3:
+                max_length = 1 << 30
+            else:
+                raise ValueError(
+                    f"Invalid --log-requests-level: {self.log_requests_level=}"
+                )
+        return max_length, skip_names, out_skip_names
+
+    def configure_logging(self, obj: ConfigureLoggingReq):
+        if obj.log_requests is not None:
+            self.log_requests = obj.log_requests
+        if obj.log_requests_level is not None:
+            self.log_requests_level = obj.log_requests_level
+        if obj.dump_requests_folder is not None:
+            self.dump_requests_folder = obj.dump_requests_folder
+        if obj.dump_requests_threshold is not None:
+            self.dump_requests_threshold = obj.dump_requests_threshold
+        if obj.crash_dump_folder is not None:
+            self.crash_dump_folder = obj.crash_dump_folder
+        logging.info(f"Config logging: {obj=}")
+        self.log_request_metadata = self.get_log_request_metadata()
+
+    async def freeze_gc(self):
+        """Send a freeze_gc message to the scheduler first, then freeze locally."""
+        self.send_to_scheduler.send_pyobj(FreezeGCReq())
+        freeze_gc("Tokenizer Manager")
+        return None
+
+    def create_abort_task(self, obj: GenerateReqInput):
+        # Abort the request if the client is disconnected.
+        async def abort_request():
+            await asyncio.sleep(2)
+            if obj.is_single:
+                self.abort_request(obj.rid)
+            else:
+                for rid in obj.rid:
+                    self.abort_request(rid)
+
+        background_tasks = BackgroundTasks()
+        background_tasks.add_task(abort_request)
+        return background_tasks
+
+    def auto_create_handle_loop(self):
+        if self.no_create_loop:
+            return
+
+        self.no_create_loop = True
+        loop = asyncio.get_event_loop()
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.handle_loop))
+        )
+
+        self.event_loop = loop
+
+        # We cannot add signal handler when the tokenizer manager is not in
+        # the main thread due to the CPython limitation.
+        if threading.current_thread() is threading.main_thread():
+            signal_handler = SignalHandler(self)
+            loop.add_signal_handler(signal.SIGTERM, signal_handler.sigterm_handler)
+            # Update the signal handler for the process. It overrides the sigquit handler in the launch phase.
+            loop.add_signal_handler(
+                signal.SIGQUIT, signal_handler.running_phase_sigquit_handler
+            )
+        else:
+            logger.warning(
+                "Signal handler is not added because the tokenizer manager is "
+                "not in the main thread. This disables graceful shutdown of the "
+                "tokenizer manager when SIGTERM is received."
+            )
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.sigterm_watchdog))
+        )
+
+    def dump_requests_before_crash(self):
+        if self.crash_dump_performed:
+            logger.info(
+                "SIGTERM/SIGQUIT/Exception triggered, but crash dump already performed, skipping."
+            )
+            return
+
+        if not self.crash_dump_folder:
+            return
+
+        logger.error(f"Dumping requests before crash. {self.crash_dump_folder=}")
+        self.crash_dump_performed = True
+
+        # Check if NFS directory is available
+        # expected_nfs_dir = "/" + self.crash_dump_folder.lstrip("/").split("/")[0]
+        # use_nfs_dir = os.path.isdir(expected_nfs_dir) and os.access(
+        #     expected_nfs_dir, os.W_OK
+        # )
+        use_nfs_dir = False
+        if not use_nfs_dir:
+            logger.error(
+                f"Expected NFS directory is not available or writable. Uploading to GCS."
+            )
+
+        data_to_dump = []
+        if self.crash_dump_request_list:
+            data_to_dump.extend(self.crash_dump_request_list)
+
+        # Add unfinished requests from rid_to_state
+        unfinished_requests = []
+        for rid, state in self.rid_to_state.items():
+            if not state.finished:
+                unfinished_requests.append(
+                    (
+                        state.obj,
+                        state.out_list[-1] if state.out_list else {},
+                        state.created_time,
+                        time.time(),
+                    )
+                )
+        if unfinished_requests:
+            data_to_dump.extend(unfinished_requests)
+
+        if not data_to_dump:
+            return
+
+        object_name = f'crash_dump_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.pkl'
+        filename = os.path.join(
+            self.crash_dump_folder,
+            os.getenv("HOSTNAME", None),
+            object_name,
+        )
+
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+        # Include server_args in the dump
+        data_to_dump_with_server_args = {
+            "server_args": self.server_args,
+            "requests": data_to_dump,
+        }
+        with open(filename, "wb") as f:
+            pickle.dump(data_to_dump_with_server_args, f)
+        logger.error(
+            f"Dumped {len(self.crash_dump_request_list)} finished and {len(unfinished_requests)} unfinished requests before crash to {filename}"
+        )
+
+        def _upload_file_to_gcs(bucket_name, source_file_path, object_name):
+            from google.cloud import storage
+
+            client = storage.Client()
+            bucket = client.bucket(bucket_name)
+            blob = bucket.blob(object_name)
+            blob.upload_from_filename(source_file_path, if_generation_match=0)
+            logger.error(
+                f"Successfully uploaded {source_file_path} to gs://{bucket_name}/{object_name}"
+            )
+
+        if not use_nfs_dir:
+            _upload_file_to_gcs(
+                "sglang_crash_dump",
+                filename,
+                os.getenv("HOSTNAME", None) + "/" + object_name,
+            )
+
+    async def sigterm_watchdog(self):
+        while not self.gracefully_exit:
+            await asyncio.sleep(5)
+
+        # Drain requests
+        while True:
+            remain_num_req = len(self.rid_to_state)
+
+            if self.server_status == ServerStatus.UnHealthy:
+                # if health check failed, we should exit immediately
+                logger.error(
+                    "Signal SIGTERM received while health check failed. Exiting... remaining number of requests: %d",
+                    remain_num_req,
+                )
+                self.dump_requests_before_crash()
+                break
+
+            elif get_bool_env_var("SGL_FORCE_SHUTDOWN"):
+                # if force shutdown flag set, exit immediately
+                logger.error(
+                    "Signal SIGTERM received while force shutdown flag set. Force exiting... remaining number of requests: %d",
+                    remain_num_req,
+                )
+                break
+
+            logger.info(
+                f"Gracefully exiting... remaining number of requests {remain_num_req}"
+            )
+            if remain_num_req > 0:
+                await asyncio.sleep(5)
+            else:
+                self.dump_requests_before_crash()
+                break
+
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(0)
+
+    async def handle_loop(self):
+        """The event loop that handles requests"""
+        while True:
+            recv_obj = await self.recv_from_detokenizer.recv_pyobj()
+            self._result_dispatcher(recv_obj)
+            self.last_receive_tstamp = time.time()
+
+    def _handle_batch_output(
+        self,
+        recv_obj: Union[
+            BatchStrOut, BatchEmbeddingOut, BatchMultimodalOut, BatchTokenIDOut
+        ],
+    ):
+        for i, rid in enumerate(recv_obj.rids):
+            state = self.rid_to_state.get(rid, None)
+            if state is None:
+                logger.error(
+                    f"Received output for {rid=} but the state was deleted in TokenizerManager."
+                )
+                continue
+
+            origin_rid = rid
+            if self.server_args.tokenizer_worker_num > 1:
+                origin_rid = get_origin_rid(rid)
+            # Build meta_info and return value
+            meta_info = {
+                "id": origin_rid,
+                "finish_reason": recv_obj.finished_reasons[i],
+                "prompt_tokens": recv_obj.prompt_tokens[i],
+                "weight_version": self.server_args.weight_version,
+            }
+
+            if getattr(state.obj, "return_logprob", False):
+                self.convert_logprob_style(
+                    meta_info,
+                    state,
+                    state.obj.top_logprobs_num,
+                    state.obj.token_ids_logprob,
+                    state.obj.return_text_in_logprobs
+                    and not self.server_args.skip_tokenizer_init,
+                    recv_obj,
+                    i,
+                )
+
+            if not isinstance(recv_obj, BatchEmbeddingOut):
+                meta_info.update(
+                    {
+                        "completion_tokens": recv_obj.completion_tokens[i],
+                        "cached_tokens": recv_obj.cached_tokens[i],
+                    }
+                )
+
+            if getattr(recv_obj, "output_hidden_states", None):
+                meta_info["hidden_states"] = recv_obj.output_hidden_states[i]
+
+            if isinstance(recv_obj, BatchStrOut):
+                state.text += recv_obj.output_strs[i]
+                if state.obj.stream:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids[state.last_output_offset :]
+                    state.last_output_offset = len(state.output_ids)
+                else:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids.copy()
+
+                out_dict = {
+                    "text": state.text,
+                    "output_ids": output_token_ids,
+                    "meta_info": meta_info,
+                }
+            elif isinstance(recv_obj, BatchTokenIDOut):
+                if self.server_args.stream_output and state.obj.stream:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids[state.last_output_offset :]
+                    state.last_output_offset = len(state.output_ids)
+                else:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids.copy()
+
+                out_dict = {
+                    "output_ids": output_token_ids,
+                    "meta_info": meta_info,
+                }
+            elif isinstance(recv_obj, BatchMultimodalOut):
+                raise NotImplementedError("BatchMultimodalOut not implemented")
+            else:
+                assert isinstance(recv_obj, BatchEmbeddingOut)
+                out_dict = {
+                    "embedding": recv_obj.embeddings[i],
+                    "meta_info": meta_info,
+                }
+
+            state.finished = recv_obj.finished_reasons[i] is not None
+            if state.finished:
+                if self.server_args.speculative_algorithm:
+                    meta_info["spec_verify_ct"] = recv_obj.spec_verify_ct[i]
+                state.finished_time = time.time()
+                meta_info["e2e_latency"] = state.finished_time - state.created_time
+                del self.rid_to_state[rid]
+
+                # Mark ongoing LoRA request as finished.
+                if self.server_args.enable_lora and state.obj.lora_path:
+                    asyncio.create_task(self.lora_registry.release(state.obj.lora_id))
+
+            state.out_list.append(out_dict)
+            state.event.set()
+
+            # Log metrics and dump
+            if self.enable_metrics and state.obj.log_metrics:
+                self.collect_metrics(state, recv_obj, i)
+            if self.dump_requests_folder and state.finished and state.obj.log_metrics:
+                self.dump_requests(state, out_dict)
+            if self.crash_dump_folder and state.finished and state.obj.log_metrics:
+                self.record_request_for_crash_dump(state, out_dict)
+
+    def convert_logprob_style(
+        self,
+        meta_info: dict,
+        state: ReqState,
+        top_logprobs_num: int,
+        token_ids_logprob: List[int],
+        return_text_in_logprobs: bool,
+        recv_obj: BatchStrOut,
+        recv_obj_index: int,
+    ):
+        if recv_obj.input_token_logprobs_val is None:
+            return
+
+        if len(recv_obj.input_token_logprobs_val) > 0:
+            state.input_token_logprobs_val.extend(
+                recv_obj.input_token_logprobs_val[recv_obj_index]
+            )
+            state.input_token_logprobs_idx.extend(
+                recv_obj.input_token_logprobs_idx[recv_obj_index]
+            )
+        state.output_token_logprobs_val.extend(
+            recv_obj.output_token_logprobs_val[recv_obj_index]
+        )
+        state.output_token_logprobs_idx.extend(
+            recv_obj.output_token_logprobs_idx[recv_obj_index]
+        )
+        meta_info["input_token_logprobs"] = self.detokenize_logprob_tokens(
+            state.input_token_logprobs_val,
+            state.input_token_logprobs_idx,
+            return_text_in_logprobs,
+        )
+        meta_info["output_token_logprobs"] = self.detokenize_logprob_tokens(
+            state.output_token_logprobs_val,
+            state.output_token_logprobs_idx,
+            return_text_in_logprobs,
+        )
+
+        if top_logprobs_num > 0:
+            if len(recv_obj.input_top_logprobs_val) > 0:
+                state.input_top_logprobs_val.extend(
+                    recv_obj.input_top_logprobs_val[recv_obj_index]
+                )
+                state.input_top_logprobs_idx.extend(
+                    recv_obj.input_top_logprobs_idx[recv_obj_index]
+                )
+            state.output_top_logprobs_val.extend(
+                recv_obj.output_top_logprobs_val[recv_obj_index]
+            )
+            state.output_top_logprobs_idx.extend(
+                recv_obj.output_top_logprobs_idx[recv_obj_index]
+            )
+            meta_info["input_top_logprobs"] = self.detokenize_top_logprobs_tokens(
+                state.input_top_logprobs_val,
+                state.input_top_logprobs_idx,
+                return_text_in_logprobs,
+            )
+            meta_info["output_top_logprobs"] = self.detokenize_top_logprobs_tokens(
+                state.output_top_logprobs_val,
+                state.output_top_logprobs_idx,
+                return_text_in_logprobs,
+            )
+
+        if token_ids_logprob is not None:
+            if len(recv_obj.input_token_ids_logprobs_val) > 0:
+                state.input_token_ids_logprobs_val.extend(
+                    recv_obj.input_token_ids_logprobs_val[recv_obj_index]
+                )
+                state.input_token_ids_logprobs_idx.extend(
+                    recv_obj.input_token_ids_logprobs_idx[recv_obj_index]
+                )
+            state.output_token_ids_logprobs_val.extend(
+                recv_obj.output_token_ids_logprobs_val[recv_obj_index]
+            )
+            state.output_token_ids_logprobs_idx.extend(
+                recv_obj.output_token_ids_logprobs_idx[recv_obj_index]
+            )
+            meta_info["input_token_ids_logprobs"] = self.detokenize_top_logprobs_tokens(
+                state.input_token_ids_logprobs_val,
+                state.input_token_ids_logprobs_idx,
+                return_text_in_logprobs,
+            )
+            meta_info["output_token_ids_logprobs"] = (
+                self.detokenize_top_logprobs_tokens(
+                    state.output_token_ids_logprobs_val,
+                    state.output_token_ids_logprobs_idx,
+                    return_text_in_logprobs,
+                )
+            )
+
+    def detokenize_logprob_tokens(
+        self,
+        token_logprobs_val: List[float],
+        token_logprobs_idx: List[int],
+        decode_to_text: bool,
+    ):
+        if not decode_to_text:
+            return [
+                (logprob, token_id, None)
+                for logprob, token_id in zip(token_logprobs_val, token_logprobs_idx)
+            ]
+        else:
+            assert self.tokenizer is not None
+            token_texts = self.tokenizer.batch_decode(token_logprobs_idx)
+            return list(zip(token_logprobs_val, token_logprobs_idx, token_texts))
+
+    def detokenize_top_logprobs_tokens(
+        self,
+        token_logprobs_val: List[float],
+        token_logprobs_idx: List[int],
+        decode_to_text: bool,
+    ):
+        # TODO: The current implementation only batches the detokenization for top-k tokens per single position.
+        # We should batch all top-k tokens in all positions.
+        ret = []
+        for i in range(len(token_logprobs_val)):
+            if token_logprobs_val[i]:
+                ret.append(
+                    self.detokenize_logprob_tokens(
+                        token_logprobs_val[i], token_logprobs_idx[i], decode_to_text
+                    )
+                )
+            else:
+                ret.append(None)
+        return ret
+
+    def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int):
+        completion_tokens = (
+            recv_obj.completion_tokens[i]
+            if getattr(recv_obj, "completion_tokens", None)
+            else 0
+        )
+
+        if (
+            state.first_token_time == 0.0
+            and self.disaggregation_mode != DisaggregationMode.PREFILL
+        ):
+            state.first_token_time = state.last_time = time.time()
+            state.last_completion_tokens = completion_tokens
+            self.metrics_collector.observe_time_to_first_token(
+                state.first_token_time - state.created_time
+            )
+        else:
+            num_new_tokens = completion_tokens - state.last_completion_tokens
+            if num_new_tokens:
+                new_time = time.time()
+                interval = new_time - state.last_time
+                self.metrics_collector.observe_inter_token_latency(
+                    interval,
+                    num_new_tokens,
+                )
+                state.last_time = new_time
+                state.last_completion_tokens = completion_tokens
+
+        if state.finished:
+            has_grammar = (
+                state.obj.sampling_params.get("json_schema", None)
+                or state.obj.sampling_params.get("regex", None)
+                or state.obj.sampling_params.get("ebnf", None)
+                or state.obj.sampling_params.get("structural_tag", None)
+            )
+            self.metrics_collector.observe_one_finished_request(
+                recv_obj.prompt_tokens[i],
+                completion_tokens,
+                recv_obj.cached_tokens[i],
+                state.finished_time - state.created_time,
+                has_grammar,
+            )
+
+    def dump_requests(self, state: ReqState, out_dict: dict):
+        self.dump_request_list.append(
+            (state.obj, out_dict, state.created_time, time.time())
+        )
+
+        if len(self.dump_request_list) >= self.dump_requests_threshold:
+            filename = os.path.join(
+                self.dump_requests_folder,
+                datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".pkl",
+            )
+            self._dump_data_to_file(
+                data_list=self.dump_request_list,
+                filename=filename,
+                log_message=f"Dump {len(self.dump_request_list)} requests to {filename}",
+            )
+            self.dump_request_list = []
+
+    def record_request_for_crash_dump(self, state: ReqState, out_dict: dict):
+        current_time = time.time()
+        self.crash_dump_request_list.append(
+            (state.obj, out_dict, state.created_time, current_time)
+        )
+        # Remove requests older than 5 minutes based on finish time
+        while (
+            self.crash_dump_request_list
+            and current_time - self.crash_dump_request_list[0][3] >= 300
+        ):
+            self.crash_dump_request_list.popleft()
+
+    def _dump_data_to_file(
+        self, data_list: List[Tuple], filename: str, log_message: str
+    ):
+        logger.info(log_message)
+        to_dump_with_server_args = {
+            "server_args": self.server_args,
+            "requests": data_list.copy(),
+        }
+
+        def background_task():
+            os.makedirs(os.path.dirname(filename), exist_ok=True)
+            with open(filename, "wb") as f:
+                pickle.dump(to_dump_with_server_args, f)
+
+        asyncio.create_task(asyncio.to_thread(background_task))
+
+    def _handle_abort_req(self, recv_obj):
+        if is_health_check_generate_req(recv_obj):
+            return
+        state = self.rid_to_state[recv_obj.rid]
+        origin_rid = recv_obj.rid
+        if self.server_args.tokenizer_worker_num > 1:
+            origin_rid = get_origin_rid(origin_rid)
+        state.finished = True
+        if recv_obj.finished_reason:
+            out = {
+                "meta_info": {
+                    "id": recv_obj.rid,
+                    "finish_reason": recv_obj.finished_reason,
+                },
+            }
+        else:
+            out = {
+                "text": "",
+                "meta_info": {
+                    "id": origin_rid,
+                    "finish_reason": {
+                        "type": "abort",
+                        "message": "Abort before prefill",
+                    },
+                    "prompt_tokens": 0,
+                    "completion_tokens": 0,
+                },
+            }
+        state.out_list.append(out)
+        state.event.set()
+
+    def _handle_open_session_req_output(self, recv_obj):
+        self.session_futures[recv_obj.session_id].set_result(
+            recv_obj.session_id if recv_obj.success else None
+        )
+
+    def _handle_update_weights_from_disk_req_output(self, recv_obj):
+        if self.server_args.dp_size == 1:
+            self.model_update_result.set_result(recv_obj)
+        else:  # self.server_args.dp_size > 1
+            self.model_update_tmp.append(recv_obj)
+            # set future if the all results are received
+            if len(self.model_update_tmp) == self.server_args.dp_size:
+                self.model_update_result.set_result(self.model_update_tmp)
+
+    async def score_request(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+        request: Optional[Any] = None,
+    ) -> List[List[float]]:
+        """
+        See Engine.score() for more details.
+        """
+        if label_token_ids is None:
+            raise ValueError("label_token_ids must be provided")
+
+        if self.tokenizer is not None:
+            vocab_size = self.tokenizer.vocab_size
+            for token_id in label_token_ids:
+                if token_id >= vocab_size:
+                    raise ValueError(
+                        f"Token ID {token_id} is out of vocabulary (vocab size: {vocab_size})"
+                    )
+
+        batch_request = GenerateReqInput(
+            token_ids_logprob=label_token_ids,
+            return_logprob=True,
+            stream=False,
+            sampling_params={"max_new_tokens": 0},
+        )
+
+        # Handle string or tokenized query/items
+        if isinstance(query, str) and (
+            isinstance(items, str)
+            or (isinstance(items, list) and (not items or isinstance(items[0], str)))
+        ):
+            # Both query and items are text
+            items_list = [items] if isinstance(items, str) else items
+            if item_first:
+                prompts = [f"{item}{query}" for item in items_list]
+            else:
+                prompts = [f"{query}{item}" for item in items_list]
+
+            batch_request.text = prompts
+
+        elif (
+            isinstance(query, list)
+            and isinstance(items, list)
+            and items
+            and isinstance(items[0], list)
+        ):
+            # Both query and items are token IDs
+            if item_first:
+                input_ids_list = [item + query for item in items]
+            else:
+                input_ids_list = [query + item for item in items]
+
+            batch_request.input_ids = input_ids_list
+        else:
+            raise ValueError(
+                "Invalid combination of query/items types for score_request."
+            )
+
+        results = await self.generate_request(batch_request, request).__anext__()
+        scores = []
+
+        for result in results:
+            # Get logprobs for each token
+            logprobs = {}
+
+            # For scoring requests, we read from output_token_ids_logprobs since we want
+            # the logprobs for specific tokens mentioned in the label_token_ids at
+            # the next position after the last token in the prompt
+            output_logprobs = result["meta_info"].get("output_token_ids_logprobs", [])
+
+            # Throw an error here if output_logprobs is None
+            if output_logprobs is None:
+                raise RuntimeError(
+                    f"output_logprobs is None for request {result['meta_info'].get('id', '<unknown>')}. "
+                    "This usually indicates a problem with the scoring request or the backend output."
+                )
+
+            for logprob, token_id, _ in output_logprobs[0]:
+                if token_id in label_token_ids:
+                    logprobs[token_id] = logprob
+
+            # Get scores in order of label_token_ids
+            score_list = [
+                logprobs.get(token_id, float("-inf")) for token_id in label_token_ids
+            ]
+
+            # Apply softmax to logprobs if needed
+            if apply_softmax:
+                score_list = torch.softmax(torch.tensor(score_list), dim=0).tolist()
+            else:
+                # Convert logprobs to probabilities if not using softmax
+                score_list = [
+                    math.exp(x) if x != float("-inf") else 0.0 for x in score_list
+                ]
+
+            scores.append(score_list)
+
+        return scores
+
+
+class ServerStatus(Enum):
+    Up = "Up"
+    Starting = "Starting"
+    UnHealthy = "UnHealthy"
+
+
+def _determine_tensor_transport_mode(server_args: ServerArgs) -> TensorTransportMode:
+    is_cross_node = server_args.dist_init_addr
+
+    if is_cross_node:
+        # Fallback to default CPU transport for multi-node
+        return "default"
+    else:
+        return "cuda_ipc"
+
+
+async def print_exception_wrapper(func):
+    """
+    Sometimes an asyncio function does not print exception.
+    We do another wrapper to handle the exception.
+    """
+    try:
+        await func()
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"TokenizerManager hit an exception: {traceback}")
+        if hasattr(func, "__self__") and isinstance(func.__self__, TokenizerManager):
+            func.__self__.dump_requests_before_crash()
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(1)
+
+
+class SignalHandler:
+    def __init__(self, tokenizer_manager: TokenizerManager):
+        self.tokenizer_manager = tokenizer_manager
+
+    def sigterm_handler(self, signum=None, frame=None):
+        logger.warning(
+            f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
+        )
+        self.tokenizer_manager.gracefully_exit = True
+
+    def running_phase_sigquit_handler(self, signum=None, frame=None):
+        logger.error(
+            "Received sigquit from a child process. It usually means the child failed."
+        )
+        self.tokenizer_manager.dump_requests_before_crash()
+        kill_process_tree(os.getpid())
+
+
+# Note: request abort handling logic
+# We should handle all of the following cases correctly.
+#
+# | entrypoint | is_streaming | status          | abort engine    | cancel asyncio task   | rid_to_state                |
+# | ---------- | ------------ | --------------- | --------------- | --------------------- | --------------------------- |
+# | http       | yes          | validation      | background task | fast api              | del in _handle_abort_req    |
+# | http       | yes          | waiting queue   | background task | fast api              | del in _handle_abort_req    |
+# | http       | yes          | running         | background task | fast api              | del in _handle_batch_output |
+# | http       | no           | validation      | http exception  | http exception        | del in _handle_abort_req    |
+# | http       | no           | waiting queue   | type 1          | type 1 exception      | del in _handle_abort_req    |
+# | http       | no           | running         | type 3          | type 3 exception      | del in _handle_batch_output |
+#
diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py
new file mode 100644
index 000000000..1cdc48c25
--- /dev/null
+++ b/python/sglang/srt/managers/tp_worker.py
@@ -0,0 +1,329 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tensor parallel worker."""
+from __future__ import annotations
+
+import logging
+import threading
+from typing import TYPE_CHECKING, Optional, Tuple, Union
+
+import torch
+
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed import get_pp_group, get_world_group
+from sglang.srt.hf_transformers_utils import (
+    get_processor,
+    get_tokenizer,
+    get_tokenizer_from_processor,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.io_struct import (
+    GetWeightsByNameReqInput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
+)
+from sglang.srt.managers.schedule_batch import ModelWorkerBatch, global_server_args_dict
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj, set_random_seed
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.cache_controller import LayerDoneCounter
+
+logger = logging.getLogger(__name__)
+
+
+class TpModelWorker:
+    """A tensor parallel model worker."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        moe_ep_rank: int,
+        pp_rank: int,
+        dp_rank: Optional[int],
+        nccl_port: int,
+        is_draft_worker: bool = False,
+        req_to_token_pool: Optional[ReqToTokenPool] = None,
+        token_to_kv_pool_allocator: Optional[BaseTokenToKVPoolAllocator] = None,
+    ):
+        # Parse args
+        self.tp_size = server_args.tp_size
+        self.tp_rank = tp_rank
+        self.moe_ep_rank = moe_ep_rank
+        self.pp_rank = pp_rank
+
+        # Init model and tokenizer
+        self.model_config = ModelConfig.from_server_args(
+            server_args,
+            model_path=(
+                server_args.model_path
+                if not is_draft_worker
+                else server_args.speculative_draft_model_path
+            ),
+            model_revision=(
+                server_args.revision
+                if not is_draft_worker
+                else server_args.speculative_draft_model_revision
+            ),
+            is_draft_model=is_draft_worker,
+        )
+
+        self.model_runner = ModelRunner(
+            model_config=self.model_config,
+            mem_fraction_static=server_args.mem_fraction_static,
+            gpu_id=gpu_id,
+            tp_rank=tp_rank,
+            tp_size=server_args.tp_size,
+            moe_ep_rank=moe_ep_rank,
+            moe_ep_size=server_args.ep_size,
+            pp_rank=pp_rank,
+            pp_size=server_args.pp_size,
+            nccl_port=nccl_port,
+            dp_rank=dp_rank,
+            server_args=server_args,
+            is_draft_worker=is_draft_worker,
+            req_to_token_pool=req_to_token_pool,
+            token_to_kv_pool_allocator=token_to_kv_pool_allocator,
+        )
+        if server_args.skip_tokenizer_init:
+            self.tokenizer = self.processor = None
+        else:
+            if self.model_config.is_multimodal:
+                self.processor = get_processor(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+                self.tokenizer = get_tokenizer_from_processor(self.processor)
+            else:
+                self.tokenizer = get_tokenizer(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+        self.device = self.model_runner.device
+
+        # Init nccl groups
+        self.pp_group = get_pp_group()
+        self.world_group = get_world_group()
+
+        # Profile number of tokens
+        self.max_total_num_tokens = self.model_runner.max_total_num_tokens
+        self.max_prefill_tokens = server_args.max_prefill_tokens
+        self.max_running_requests = min(
+            (
+                self.max_total_num_tokens // 2
+                if server_args.max_running_requests is None
+                else server_args.max_running_requests
+                // (server_args.dp_size if server_args.enable_dp_attention else 1)
+            ),
+            self.model_runner.req_to_token_pool.size,
+        )
+        assert self.max_running_requests > 0, "max_running_request is zero"
+        self.max_queued_requests = server_args.max_queued_requests
+        assert (
+            self.max_queued_requests > 0
+        ), "max_queued_requests is zero. We need to be at least 1 to schedule a request."
+        self.max_req_len = min(
+            self.model_config.context_len - 1,
+            self.max_total_num_tokens - 1,
+        )
+        self.max_req_input_len = self.max_req_len - 5
+        assert (
+            self.max_req_len > 0 and self.max_req_input_len > 0
+        ), "Memory pool size is too small"
+
+        # Sync random seed across TP workers
+        self.random_seed = broadcast_pyobj(
+            [server_args.random_seed],
+            self.tp_size * self.pp_rank + tp_rank,
+            self.world_group.cpu_group,
+            src=self.world_group.ranks[0],
+        )[0]
+        set_random_seed(self.random_seed)
+
+        # A reference make this class has the same member as TpModelWorkerClient
+        self.worker = self
+
+        self.hicache_layer_transfer_counter = None
+
+    def register_hicache_layer_transfer_counter(self, counter: LayerDoneCounter):
+        self.hicache_layer_transfer_counter = counter
+
+    def set_hicache_consumer(self, consumer_index: int):
+        if self.hicache_layer_transfer_counter is not None:
+            self.hicache_layer_transfer_counter.set_consumer(consumer_index)
+
+    def get_worker_info(self):
+        return (
+            self.max_total_num_tokens,
+            self.max_prefill_tokens,
+            self.max_running_requests,
+            self.max_queued_requests,
+            self.max_req_len,
+            self.max_req_input_len,
+            self.random_seed,
+            self.device,
+            global_server_args_dict,
+            self.model_runner.req_to_token_pool.size,
+            self.model_runner.req_to_token_pool.max_context_len,
+            self.model_runner.token_to_kv_pool.size,
+        )
+
+    @property
+    def sliding_window_size(self) -> Optional[int]:
+        return self.model_runner.sliding_window_size
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.model_runner.is_hybrid is not None
+
+    def get_tokens_per_layer_info(self):
+        return (
+            self.model_runner.full_max_total_num_tokens,
+            self.model_runner.swa_max_total_num_tokens,
+        )
+
+    def get_pad_input_ids_func(self):
+        return getattr(self.model_runner.model, "pad_input_ids", None)
+
+    def get_tp_group(self):
+        return self.model_runner.tp_group
+
+    def get_attention_tp_group(self):
+        return self.model_runner.attention_tp_group
+
+    def get_attention_tp_cpu_group(self):
+        return getattr(self.model_runner.attention_tp_group, "cpu_group", None)
+
+    def get_memory_pool(self):
+        return (
+            self.model_runner.req_to_token_pool,
+            self.model_runner.token_to_kv_pool_allocator,
+        )
+
+    def forward_batch_generation(
+        self,
+        model_worker_batch: ModelWorkerBatch,
+        launch_done: Optional[threading.Event] = None,
+        skip_sample: bool = False,
+    ) -> Tuple[
+        Union[LogitsProcessorOutput, torch.Tensor], Optional[torch.Tensor], bool
+    ]:
+        # update the consumer index of hicache to the running batch
+        self.set_hicache_consumer(model_worker_batch.hicache_consumer_index)
+
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+
+        pp_proxy_tensors = None
+        if not self.pp_group.is_first_rank:
+            pp_proxy_tensors = PPProxyTensors(
+                self.pp_group.recv_tensor_dict(
+                    all_gather_group=self.get_attention_tp_group()
+                )
+            )
+
+        if self.pp_group.is_last_rank:
+            logits_output, can_run_cuda_graph = self.model_runner.forward(
+                forward_batch, pp_proxy_tensors=pp_proxy_tensors
+            )
+            if launch_done is not None:
+                launch_done.set()
+
+            if skip_sample:
+                next_token_ids = None
+            else:
+                next_token_ids = self.model_runner.sample(
+                    logits_output, model_worker_batch
+                )
+
+            return logits_output, next_token_ids, can_run_cuda_graph
+        else:
+            pp_proxy_tensors, can_run_cuda_graph = self.model_runner.forward(
+                forward_batch,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+            return pp_proxy_tensors.tensors, None, can_run_cuda_graph
+
+    def forward_batch_embedding(self, model_worker_batch: ModelWorkerBatch):
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        logits_output, _ = self.model_runner.forward(forward_batch)
+        embeddings = logits_output.embeddings
+        return embeddings
+
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        success, message = self.model_runner.update_weights_from_disk(
+            recv_req.model_path, recv_req.load_format
+        )
+        return success, message
+
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        success, message = self.model_runner.init_weights_update_group(
+            recv_req.master_address,
+            recv_req.master_port,
+            recv_req.rank_offset,
+            recv_req.world_size,
+            recv_req.group_name,
+            recv_req.backend,
+        )
+        return success, message
+
+    def update_weights_from_distributed(
+        self, recv_req: UpdateWeightsFromDistributedReqInput
+    ):
+        success, message = self.model_runner.update_weights_from_distributed(
+            recv_req.names, recv_req.dtypes, recv_req.shapes, recv_req.group_name
+        )
+        return success, message
+
+    def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
+
+        monkey_patch_torch_reductions()
+        success, message = self.model_runner.update_weights_from_tensor(
+            named_tensors=MultiprocessingSerializer.deserialize(
+                recv_req.serialized_named_tensors[self.tp_rank]
+            ),
+            load_format=recv_req.load_format,
+        )
+        return success, message
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.model_runner.get_weights_by_name(
+            recv_req.name, recv_req.truncate_size
+        )
+        return parameter
+
+    def load_lora_adapter(self, recv_req: LoadLoRAAdapterReqInput):
+        result = self.model_runner.load_lora_adapter(recv_req.to_ref())
+        return result
+
+    def unload_lora_adapter(self, recv_req: UnloadLoRAAdapterReqInput):
+        result = self.model_runner.unload_lora_adapter(recv_req.to_ref())
+        return result
+
+    def can_run_lora_batch(self, lora_ids: list[str]) -> bool:
+        return self.model_runner.lora_manager.validate_lora_batch(lora_ids)
diff --git a/python/sglang/srt/managers/tp_worker_overlap_thread.py b/python/sglang/srt/managers/tp_worker_overlap_thread.py
new file mode 100644
index 000000000..e72d4fb6e
--- /dev/null
+++ b/python/sglang/srt/managers/tp_worker_overlap_thread.py
@@ -0,0 +1,294 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tensor parallel worker."""
+from __future__ import annotations
+
+import dataclasses
+import logging
+import signal
+import threading
+from queue import Queue
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import psutil
+import torch
+
+from sglang.srt.managers.io_struct import (
+    GetWeightsByNameReqInput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
+)
+from sglang.srt.managers.schedule_batch import ModelWorkerBatch
+from sglang.srt.managers.tp_worker import TpModelWorker
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import DynamicGradMode, get_compiler_backend
+from sglang.utils import get_exception_traceback
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.cache_controller import LayerDoneCounter
+
+logger = logging.getLogger(__name__)
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def resolve_future_token_ids(input_ids, future_token_ids_map):
+    input_ids[:] = torch.where(
+        input_ids < 0,
+        future_token_ids_map[torch.clamp(-input_ids, min=0)],
+        input_ids,
+    )
+
+
+class TpModelWorkerClient:
+    """A tensor parallel model worker."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        moe_ep_rank: int,
+        pp_rank: int,
+        dp_rank: Optional[int],
+        nccl_port: int,
+    ):
+        # Load the model
+        self.worker = TpModelWorker(
+            server_args, gpu_id, tp_rank, moe_ep_rank, pp_rank, dp_rank, nccl_port
+        )
+        self.max_running_requests = self.worker.max_running_requests
+        self.device = self.worker.device
+        self.gpu_id = gpu_id
+
+        # Init future mappings
+        self.future_token_ids_ct = 0
+        self.future_token_ids_limit = self.max_running_requests * 3
+        self.future_token_ids_map = torch.empty(
+            (self.max_running_requests * 5,), dtype=torch.int64, device=self.device
+        )
+
+        # Launch threads
+        self.input_queue = Queue[Tuple[ModelWorkerBatch, int, torch.Event]]()
+        self.output_queue = Queue()
+        self.forward_stream = torch.get_device_module(self.device).Stream()
+        self.forward_thread = threading.Thread(
+            target=self.forward_thread_func,
+        )
+        self.forward_thread.start()
+        self.parent_process = psutil.Process().parent()
+        self.scheduler_stream = torch.get_device_module(self.device).current_stream()
+        if self.device == "cpu":
+            self.scheduler_stream.synchronize = lambda: None  # No-op for CPU
+
+        self.hicache_layer_transfer_counter = None
+
+    def register_hicache_layer_transfer_counter(self, counter: LayerDoneCounter):
+        self.hicache_layer_transfer_counter = counter
+
+    def get_worker_info(self):
+        return self.worker.get_worker_info()
+
+    def get_tokens_per_layer_info(self):
+        return self.worker.get_tokens_per_layer_info()
+
+    @property
+    def sliding_window_size(self) -> Optional[int]:
+        return self.worker.sliding_window_size
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.worker.is_hybrid
+
+    def get_pad_input_ids_func(self):
+        return self.worker.get_pad_input_ids_func()
+
+    def get_tp_group(self):
+        return self.worker.get_tp_group()
+
+    def get_attention_tp_group(self):
+        return self.worker.get_attention_tp_group()
+
+    def get_attention_tp_cpu_group(self):
+        return self.worker.get_attention_tp_cpu_group()
+
+    def get_memory_pool(self):
+        return (
+            self.worker.model_runner.req_to_token_pool,
+            self.worker.model_runner.token_to_kv_pool_allocator,
+        )
+
+    def get_kv_cache(self):
+        return self.worker.model_runner.token_to_kv_pool
+
+    def forward_thread_func(self):
+        try:
+            with torch.get_device_module(self.device).stream(self.forward_stream):
+                self.forward_thread_func_()
+        except Exception:
+            traceback = get_exception_traceback()
+            logger.error(f"TpModelWorkerClient hit an exception: {traceback}")
+            self.parent_process.send_signal(signal.SIGQUIT)
+
+    @DynamicGradMode()
+    def forward_thread_func_(self):
+        batch_pt = 0
+        batch_lists: List = [None] * 2
+
+        while True:
+            model_worker_batch, future_token_ids_ct, sync_event = self.input_queue.get()
+            if not model_worker_batch:
+                break
+
+            sync_event.wait()
+
+            # Keep a reference of model_worker_batch by storing it into a list.
+            # Otherwise, the tensor members of model_worker_batch will be released
+            # by pytorch and cause CUDA illegal memory access errors.
+            batch_lists[batch_pt % 2] = model_worker_batch
+            batch_pt += 1
+
+            # Create event
+            copy_done = torch.get_device_module(self.device).Event()
+
+            # Resolve future tokens in the input
+            input_ids = model_worker_batch.input_ids
+            resolve_future_token_ids(input_ids, self.future_token_ids_map)
+
+            # Run forward
+            logits_output, next_token_ids, can_run_cuda_graph = (
+                self.worker.forward_batch_generation(
+                    model_worker_batch, model_worker_batch.launch_done
+                )
+            )
+
+            # Update the future token ids map
+            bs = len(model_worker_batch.seq_lens)
+            self.future_token_ids_map[
+                future_token_ids_ct + 1 : future_token_ids_ct + bs + 1
+            ] = next_token_ids
+
+            # Copy results to the CPU
+            if model_worker_batch.return_logprob:
+                logits_output.next_token_logprobs = (
+                    logits_output.next_token_logprobs.to("cpu", non_blocking=True)
+                )
+                if logits_output.input_token_logprobs is not None:
+                    logits_output.input_token_logprobs = (
+                        logits_output.input_token_logprobs.to("cpu", non_blocking=True)
+                    )
+            if logits_output.hidden_states is not None:
+                logits_output.hidden_states = logits_output.hidden_states.to(
+                    "cpu", non_blocking=True
+                )
+            next_token_ids = next_token_ids.to("cpu", non_blocking=True)
+            copy_done.record()
+
+            self.output_queue.put(
+                (copy_done, logits_output, next_token_ids, can_run_cuda_graph)
+            )
+
+    def resolve_last_batch_result(self, launch_done: Optional[threading.Event] = None):
+        """
+        This function is called to resolve the last batch result and
+        wait for the current batch to be launched. Used in overlap mode.
+        """
+        copy_done, logits_output, next_token_ids, can_run_cuda_graph = (
+            self.output_queue.get()
+        )
+
+        if launch_done is not None:
+            launch_done.wait()
+        copy_done.synchronize()
+
+        if logits_output.next_token_logprobs is not None:
+            logits_output.next_token_logprobs = (
+                logits_output.next_token_logprobs.tolist()
+            )
+            if logits_output.input_token_logprobs is not None:
+                logits_output.input_token_logprobs = tuple(
+                    logits_output.input_token_logprobs.tolist()
+                )
+        next_token_ids = next_token_ids.tolist()
+        return logits_output, next_token_ids, can_run_cuda_graph
+
+    def forward_batch_generation(
+        self, model_worker_batch: ModelWorkerBatch
+    ) -> Tuple[None, torch.Tensor, bool]:
+        # Create a new copy of sampling_info because it will be updated in-place by the scheduler for the next batch.
+        sampling_info = model_worker_batch.sampling_info
+        sampling_info.update_penalties()
+        model_worker_batch.sampling_info = self.cur_sampling_info = dataclasses.replace(
+            sampling_info,
+            sampling_info_done=threading.Event(),
+            penalizer_orchestrator=None,
+        )
+
+        # A cuda stream sync here to avoid the cuda illegal memory access error.
+        sync_event = torch.get_device_module(self.device).Event()
+        sync_event.record(self.scheduler_stream)
+
+        # Push a new batch to the queue
+        self.input_queue.put((model_worker_batch, self.future_token_ids_ct, sync_event))
+
+        # Allocate output future objects
+        bs = len(model_worker_batch.seq_lens)
+        future_next_token_ids = torch.arange(
+            -(self.future_token_ids_ct + 1),
+            -(self.future_token_ids_ct + 1 + bs),
+            -1,
+            dtype=torch.int64,
+            device=self.device,
+        )
+        self.future_token_ids_ct = (
+            self.future_token_ids_ct + bs
+        ) % self.future_token_ids_limit
+        return None, future_next_token_ids, False
+
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        success, message = self.worker.update_weights_from_disk(recv_req)
+        return success, message
+
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        success, message = self.worker.init_weights_update_group(recv_req)
+        return success, message
+
+    def update_weights_from_distributed(
+        self, recv_req: UpdateWeightsFromDistributedReqInput
+    ):
+        success, message = self.worker.update_weights_from_distributed(recv_req)
+        return success, message
+
+    def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
+        success, message = self.worker.update_weights_from_tensor(recv_req)
+        return success, message
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        return self.worker.get_weights_by_name(recv_req)
+
+    def load_lora_adapter(self, recv_req: LoadLoRAAdapterReqInput):
+        return self.worker.load_lora_adapter(recv_req)
+
+    def unload_lora_adapter(self, recv_req: UnloadLoRAAdapterReqInput):
+        return self.worker.unload_lora_adapter(recv_req)
+
+    def can_run_lora_batch(self, lora_ids: list[str]) -> bool:
+        return self.worker.can_run_lora_batch(lora_ids)
+
+    def __delete__(self):
+        self.input_queue.put((None, None))
+        self.copy_queue.put((None, None, None))
diff --git a/python/sglang/srt/managers/utils.py b/python/sglang/srt/managers/utils.py
new file mode 100644
index 000000000..de83c4590
--- /dev/null
+++ b/python/sglang/srt/managers/utils.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+import logging
+import multiprocessing as mp
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.schedule_batch import FINISH_ABORT, Req
+from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import GenerationBatchResult
+
+logger = logging.getLogger(__name__)
+
+
+def validate_input_length(
+    req: Req, max_req_input_len: int, allow_auto_truncate: bool
+) -> Optional[str]:
+    """Validate and potentially truncate input length.
+
+    Args:
+        req: The request containing input_ids to validate
+        max_req_input_len: Maximum allowed input length
+        allow_auto_truncate: Whether to truncate long inputs
+
+    Returns:
+        Error message if validation fails, None if successful
+    """
+    if len(req.origin_input_ids) >= max_req_input_len:
+        if allow_auto_truncate:
+            logger.warning(
+                "Request length is longer than the KV cache pool size or "
+                "the max context length. Truncated. "
+                f"{len(req.origin_input_ids)=}, {max_req_input_len=}."
+            )
+            req.origin_input_ids = req.origin_input_ids[:max_req_input_len]
+            return None
+        else:
+            error_msg = (
+                f"Input length ({len(req.origin_input_ids)} tokens) exceeds "
+                f"the maximum allowed length ({max_req_input_len} tokens). "
+                f"Use a shorter input or enable --allow-auto-truncate."
+            )
+            return error_msg
+
+    return None
+
+
+def get_logprob_dict_from_result(result: GenerationBatchResult) -> dict:
+
+    logits_output = result.logits_output
+    assert logits_output is not None
+
+    return {
+        "extend_input_len_per_req": result.extend_input_len_per_req,
+        "extend_logprob_start_len_per_req": result.extend_logprob_start_len_per_req,
+        "next_token_logprobs": result.logits_output.next_token_logprobs,
+        "next_token_top_logprobs_val": result.logits_output.next_token_top_logprobs_val,
+        "next_token_top_logprobs_idx": result.logits_output.next_token_top_logprobs_idx,
+        "next_token_token_ids_logprobs_val": result.logits_output.next_token_token_ids_logprobs_val,
+        "next_token_token_ids_logprobs_idx": result.logits_output.next_token_token_ids_logprobs_idx,
+        "input_token_logprobs": result.logits_output.input_token_logprobs,
+        "input_top_logprobs_val": result.logits_output.input_top_logprobs_val,
+        "input_top_logprobs_idx": result.logits_output.input_top_logprobs_idx,
+        "input_token_ids_logprobs_val": result.logits_output.input_token_ids_logprobs_val,
+        "input_token_ids_logprobs_idx": result.logits_output.input_token_ids_logprobs_idx,
+    }
+
+
+def get_logprob_from_pp_outputs(
+    next_pp_outputs: PPProxyTensors,
+) -> tuple[LogitsProcessorOutput, list[int], list[int]]:
+    logits_output = LogitsProcessorOutput(
+        # Do not send logits and hidden states because they are large
+        next_token_logits=None,
+        hidden_states=None,
+        next_token_logprobs=next_pp_outputs["next_token_logprobs"],
+        next_token_top_logprobs_val=next_pp_outputs["next_token_top_logprobs_val"],
+        next_token_top_logprobs_idx=next_pp_outputs["next_token_top_logprobs_idx"],
+        next_token_token_ids_logprobs_val=next_pp_outputs[
+            "next_token_token_ids_logprobs_val"
+        ],
+        next_token_token_ids_logprobs_idx=next_pp_outputs[
+            "next_token_token_ids_logprobs_idx"
+        ],
+        input_token_logprobs=next_pp_outputs["input_token_logprobs"],
+        input_top_logprobs_val=next_pp_outputs["input_top_logprobs_val"],
+        input_top_logprobs_idx=next_pp_outputs["input_top_logprobs_idx"],
+        input_token_ids_logprobs_val=next_pp_outputs["input_token_ids_logprobs_val"],
+        input_token_ids_logprobs_idx=next_pp_outputs["input_token_ids_logprobs_idx"],
+    )
+    extend_input_len_per_req = next_pp_outputs["extend_input_len_per_req"]
+    extend_logprob_start_len_per_req = next_pp_outputs[
+        "extend_logprob_start_len_per_req"
+    ]
+
+    return logits_output, extend_input_len_per_req, extend_logprob_start_len_per_req
+
+
+class DPBalanceMeta:
+    """
+    This class will be use in scheduler and dp controller
+    """
+
+    def __init__(self, num_workers: int):
+        self.num_workers = num_workers
+        self._manager = mp.Manager()
+        self.mutex = self._manager.Lock()
+
+        init_local_tokens = [0] * self.num_workers
+        init_onfly_info = [self._manager.dict() for _ in range(self.num_workers)]
+
+        self.shared_state = self._manager.Namespace()
+        self.shared_state.local_tokens = self._manager.list(init_local_tokens)
+        self.shared_state.onfly_info = self._manager.list(init_onfly_info)
+
+    def destructor(self):
+        # we must destructor this class manually
+        self._manager.shutdown()
+
+    def get_shared_onfly(self) -> List[Dict[int, int]]:
+        return [dict(d) for d in self.shared_state.onfly_info]
+
+    def set_shared_onfly_info(self, data: List[Dict[int, int]]):
+        self.shared_state.onfly_info = data
+
+    def get_shared_local_tokens(self) -> List[int]:
+        return list(self.shared_state.local_tokens)
+
+    def set_shared_local_tokens(self, data: List[int]):
+        self.shared_state.local_tokens = data
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state["_manager"]
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self._manager = None
diff --git a/python/sglang/srt/mem_cache/allocator.py b/python/sglang/srt/mem_cache/allocator.py
new file mode 100644
index 000000000..497331673
--- /dev/null
+++ b/python/sglang/srt/mem_cache/allocator.py
@@ -0,0 +1,581 @@
+from __future__ import annotations
+
+"""
+Copyright 2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Page-aligned memory pool.
+"""
+
+import abc
+from typing import TYPE_CHECKING
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.mem_cache.memory_pool import SWAKVPool
+from sglang.srt.utils import get_bool_env_var, next_power_of_2
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.memory_pool import KVCache
+
+
+class BaseTokenToKVPoolAllocator(abc.ABC):
+    @abc.abstractmethod
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+        need_sort: bool,
+    ):
+        self.size = size
+        self.page_size = page_size
+        self.dtype = dtype
+        self.device = device
+        self._kvcache = kvcache
+        self.need_sort = need_sort
+
+        self.free_pages = None
+        self.release_pages = None
+        self.is_not_in_free_group = True
+        self.free_group = []
+
+    def debug_print(self) -> str:
+        return ""
+
+    def available_size(self):
+        return (len(self.free_pages) + len(self.release_pages)) * self.page_size
+
+    def get_kvcache(self):
+        return self._kvcache
+
+    def restore_state(self, state):
+        self.free_pages, self.release_pages = state
+
+    def backup_state(self):
+        return (self.free_pages, self.release_pages)
+
+    def free_group_begin(self):
+        self.is_not_in_free_group = False
+        self.free_group = []
+
+    def free_group_end(self):
+        self.is_not_in_free_group = True
+        if self.free_group:
+            self.free(torch.cat(self.free_group))
+
+    def merge_and_sort_free(self):
+        if len(self.release_pages) > 0:
+            self.free_pages = torch.cat((self.free_pages, self.release_pages))
+            self.free_pages, _ = torch.sort(self.free_pages)
+            self.release_pages = torch.empty(
+                (0,), dtype=self.release_pages.dtype, device=self.device
+            )
+
+    def get_cpu_copy(self, *args, **kwargs):
+        # FIXME: reuse the get_cpu_copy after paged allocator is implemented
+        raise NotImplementedError()
+
+    def load_cpu_copy(self, *args, **kwargs):
+        # FIXME: reuse the load_cpu_copy after paged allocator is implemented
+        raise NotImplementedError()
+
+    def alloc_extend(self, *args, **kwargs):
+        raise NotImplementedError("alloc_extend is only for paged allocator")
+
+    def alloc_decode(self, *args, **kwargs):
+        raise NotImplementedError("alloc_decode is only for paged allocator")
+
+    @abc.abstractmethod
+    def clear(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def alloc(self, need_size: int):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def free(self, free_index: torch.Tensor):
+        raise NotImplementedError()
+
+
+class TokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
+    """An allocator managing the indices to kv cache data."""
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+        need_sort: bool,
+    ):
+        super().__init__(size, 1, dtype, device, kvcache, need_sort)
+        self.clear()
+
+    def clear(self):
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
+        self.free_pages = torch.arange(
+            1, self.size + 1, dtype=torch.int64, device=self.device
+        )
+        self.is_not_in_free_group = True
+        self.free_group = []
+        self.release_pages = torch.empty((0,), dtype=torch.int64, device=self.device)
+
+    def available_size(self):
+        # To avoid minor "len(free_pages) * 1" overhead
+        return len(self.free_pages) + len(self.release_pages)
+
+    def alloc(self, need_size: int):
+        if self.need_sort and need_size > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        if need_size > len(self.free_pages):
+            return None
+
+        select_index = self.free_pages[:need_size]
+        self.free_pages = self.free_pages[need_size:]
+        return select_index
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
+        if self.is_not_in_free_group:
+            if self.need_sort:
+                self.release_pages = torch.cat((self.release_pages, free_index))
+            else:
+                self.free_pages = torch.cat((self.free_pages, free_index))
+        else:
+            self.free_group.append(free_index)
+
+    def get_cpu_copy(self, indices):
+        return self._kvcache.get_cpu_copy(indices)
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        return self._kvcache.load_cpu_copy(kv_cache_cpu, indices)
+
+
+class SWATokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
+    """Allocator for SWA hybrid KV cache."""
+
+    def __init__(
+        self,
+        size: int,
+        size_swa: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: SWAKVPool,
+        need_sort: bool,
+    ):
+        super().__init__(size, 1, dtype, device, kvcache, need_sort)
+        assert isinstance(kvcache, SWAKVPool)
+        self._size_full = size
+        self._size_swa = size_swa
+        self.full_attn_allocator = TokenToKVPoolAllocator(
+            size,
+            dtype,
+            device,
+            kvcache.full_kv_pool,
+            need_sort,
+        )
+        self.swa_attn_allocator = TokenToKVPoolAllocator(
+            size_swa,
+            dtype,
+            device,
+            kvcache.swa_kv_pool,
+            need_sort,
+        )
+        self.full_to_swa_index_mapping = torch.empty(
+            size + size_swa + 1,
+            dtype=torch.int64,
+            device=device,
+        )
+        self.clear()
+
+        self._kvcache.full_to_swa_index_mapping = self.full_to_swa_index_mapping
+
+    def available_size(self):
+        raise NotImplementedError()
+
+    def full_available_size(self):
+        return self.full_attn_allocator.available_size()
+
+    def swa_available_size(self):
+        return self.swa_attn_allocator.available_size()
+
+    @property
+    def size_full(self):
+        return self._size_full
+
+    @property
+    def size_swa(self):
+        return self._size_swa
+
+    def debug_print(self) -> str:
+        msg = ""
+        msg += f"#swa-available-size: {self.swa_attn_allocator.available_size()}, "
+        msg += (
+            f"#full-attn-available-size: {self.full_attn_allocator.available_size()}, "
+        )
+        return msg
+
+    def get_kvcache(self):
+        return self._kvcache
+
+    def translate_loc_from_full_to_swa(self, kv_indices: torch.Tensor):
+        assert self.full_to_swa_index_mapping is not None
+        return self.full_to_swa_index_mapping[kv_indices].to(torch.int32)
+
+    def alloc(self, need_size: int):
+        if need_size > self.full_attn_allocator.available_size():
+            return None
+        if need_size > self.swa_attn_allocator.available_size():
+            return None
+
+        alloc_full_indices = self.full_attn_allocator.alloc(need_size)
+        alloc_swa_indices = self.swa_attn_allocator.alloc(need_size)
+        self.full_to_swa_index_mapping[alloc_full_indices] = alloc_swa_indices
+        return alloc_full_indices
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+        if self.is_not_in_free_group:
+            self.full_attn_allocator.free(free_index)
+            self.free_swa(free_index)
+        else:
+            self.free_group.append(free_index)
+        assert (
+            self.full_attn_allocator.available_size() <= self.full_attn_allocator.size
+        )
+        assert self.swa_attn_allocator.available_size() <= self.swa_attn_allocator.size
+
+    def free_swa(self, free_index: torch.Tensor):
+        swa_indices = self.full_to_swa_index_mapping[free_index]
+        swa_indices = swa_indices[swa_indices > 0]
+        self.swa_attn_allocator.free(swa_indices)
+        self.full_to_swa_index_mapping[free_index] = 0
+
+    def backup_state(self):
+        raise NotImplementedError
+
+    def restore_state(self, state):
+        raise NotImplementedError
+
+    def clear(self):
+        self.swa_attn_allocator.clear()
+        self.full_attn_allocator.clear()
+        self.full_to_swa_index_mapping.fill_(0)
+        self.is_not_in_free_group = True
+        self.free_group = []
+
+
+@triton.jit
+def alloc_extend_kernel(
+    pre_lens_ptr,
+    seq_lens_ptr,
+    last_loc_ptr,
+    free_page_ptr,
+    out_indices,
+    ret_values,
+    bs_upper: tl.constexpr,
+    page_size: tl.constexpr,
+    max_num_extend_tokens: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    load_offset = tl.arange(0, bs_upper)
+    seq_lens = tl.load(seq_lens_ptr + load_offset, mask=load_offset <= pid)
+    pre_lens = tl.load(pre_lens_ptr + load_offset, mask=load_offset <= pid)
+    extend_lens = seq_lens - pre_lens
+
+    seq_len = tl.load(seq_lens_ptr + pid)
+    pre_len = tl.load(pre_lens_ptr + pid)
+    extend_len = seq_len - pre_len
+
+    sum_extend_lens = tl.sum(extend_lens)
+    output_start_loc = sum_extend_lens - extend_len
+
+    num_pages_after = (seq_lens + page_size - 1) // page_size
+    num_pages_before = (pre_lens + page_size - 1) // page_size
+    num_new_pages = num_pages_after - num_pages_before
+
+    num_page_start_loc_self = (seq_len + page_size - 1) // page_size - (
+        pre_len + page_size - 1
+    ) // page_size
+    sum_num_new_pages = tl.sum(num_new_pages)
+    new_page_start_loc = sum_num_new_pages - num_page_start_loc_self
+
+    # Return value
+    if pid == tl.num_programs(0) - 1:
+        merged_value = (sum_num_new_pages.to(tl.int64)) << 32 | sum_extend_lens.to(
+            tl.int64
+        )
+        tl.store(ret_values, merged_value)
+
+    # Part 1: fill the old partial page
+    last_loc = tl.load(last_loc_ptr + pid)
+    num_part1 = (
+        min(seq_len, (pre_len + page_size - 1) // page_size * page_size) - pre_len
+    )
+    offset_one_page = tl.arange(0, page_size)
+    tl.store(
+        out_indices + output_start_loc + offset_one_page,
+        last_loc + 1 + offset_one_page,
+        mask=offset_one_page < num_part1,
+    )
+    if pre_len + num_part1 == seq_len:
+        return
+
+    # Part 2: fill the new full pages
+    num_part2 = (
+        seq_len // page_size * page_size
+        - (pre_len + page_size - 1) // page_size * page_size
+    )
+
+    offset_many_page = tl.arange(0, max_num_extend_tokens)
+    page_start = tl.load(
+        free_page_ptr + new_page_start_loc + offset_many_page // page_size,
+        mask=offset_many_page < num_part2,
+    )
+    tl.store(
+        out_indices + output_start_loc + num_part1 + offset_many_page,
+        page_start * page_size + offset_many_page % page_size,
+        mask=offset_many_page < num_part2,
+    )
+    if pre_len + num_part1 + num_part2 == seq_len:
+        return
+
+    # Part 3: fill the new partial page
+    num_part3 = seq_len - seq_len // page_size * page_size
+    start_loc = tl.load(
+        free_page_ptr + new_page_start_loc + num_page_start_loc_self - 1
+    )
+    tl.store(
+        out_indices + output_start_loc + num_part1 + num_part2 + offset_one_page,
+        start_loc * page_size + offset_one_page,
+        mask=offset_one_page < num_part3,
+    )
+
+
+@triton.jit
+def alloc_decode_kernel(
+    seq_lens_ptr,
+    last_loc_ptr,
+    free_page_ptr,
+    out_indices,
+    ret_values,
+    bs_upper: tl.constexpr,
+    page_size: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    load_offset = tl.arange(0, bs_upper)
+    seq_lens = tl.load(seq_lens_ptr + load_offset, mask=load_offset <= pid)
+    pre_lens = tl.where(load_offset <= pid, seq_lens - 1, seq_lens)
+
+    seq_len = tl.load(seq_lens_ptr + pid)
+    pre_len = seq_len - 1
+
+    num_pages_after = (seq_lens + page_size - 1) // page_size
+    num_pages_before = (pre_lens + page_size - 1) // page_size
+    num_new_pages = num_pages_after - num_pages_before
+
+    num_page_start_loc_self = (seq_len + page_size - 1) // page_size - (
+        pre_len + page_size - 1
+    ) // page_size
+    sum_num_new_pages = tl.sum(num_new_pages)
+    new_page_start_loc = sum_num_new_pages - num_page_start_loc_self
+
+    # Return value
+    if pid == tl.num_programs(0) - 1:
+        tl.store(ret_values, sum_num_new_pages)
+
+    if num_page_start_loc_self == 0:
+        last_loc = tl.load(last_loc_ptr + pid)
+        tl.store(out_indices + pid, last_loc + 1)
+    else:
+        page = tl.load(free_page_ptr + new_page_start_loc)
+        tl.store(out_indices + pid, page * page_size)
+
+
+class PagedTokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
+    """
+    An allocator managing the indices to kv cache data.
+
+    This class has the same interface as `TokenToKVPoolAllocator` but the output
+    of one request is always page-aligned.
+
+    TODO: fuse last_loc into the kernel.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+        need_sort: bool,
+    ):
+        super().__init__(size, page_size, dtype, device, kvcache, need_sort)
+        self.num_pages = size // page_size
+        self.debug_mode = get_bool_env_var("SGLANG_DEBUG_MEMORY_POOL")
+        self.ret_values = torch.empty((), dtype=torch.int64, device=self.device)
+        self.seen_max_num_extend_tokens_next_power_of_2 = 1
+        self.clear()
+
+    def alloc(self, need_size: int):
+        # page-aligned allocation, returning contiguous indices of pages
+        if self.debug_mode:
+            assert (
+                need_size % self.page_size == 0
+            ), "The allocation size should be page-aligned"
+
+        num_pages = need_size // self.page_size
+        if self.need_sort and num_pages > len(self.free_pages):
+            self.merge_and_sort_free()
+        if num_pages > len(self.free_pages):
+            return None
+
+        out_pages = self.free_pages[:num_pages]
+        self.free_pages = self.free_pages[num_pages:]
+
+        out_indices = (
+            out_pages[:, None] * self.page_size
+            + torch.arange(self.page_size, device=self.device)
+        ).reshape(-1)
+
+        return out_indices
+
+    def alloc_extend(
+        self,
+        prefix_lens: torch.Tensor,
+        seq_lens: torch.Tensor,
+        last_loc: torch.Tensor,
+        extend_num_tokens: int,
+    ):
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 1) % self.page_size == prefix_lens % self.page_size
+            )
+
+        self.seen_max_num_extend_tokens_next_power_of_2 = max(
+            self.seen_max_num_extend_tokens_next_power_of_2,
+            next_power_of_2(extend_num_tokens),
+        )
+
+        bs = len(prefix_lens)
+        if self.need_sort and extend_num_tokens // self.page_size + bs + 1 > len(
+            self.free_pages
+        ):
+            self.merge_and_sort_free()
+
+        out_indices = torch.empty(
+            (extend_num_tokens,), dtype=torch.int64, device=self.device
+        )
+        alloc_extend_kernel[(bs,)](
+            prefix_lens,
+            seq_lens,
+            last_loc,
+            self.free_pages,
+            out_indices,
+            self.ret_values,
+            next_power_of_2(bs),
+            self.page_size,
+            self.seen_max_num_extend_tokens_next_power_of_2,
+        )
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        merged_value = self.ret_values.item()
+        num_new_pages = merged_value >> 32
+        if num_new_pages > len(self.free_pages):
+            return None
+
+        self.free_pages = self.free_pages[num_new_pages:]
+        return out_indices
+
+    def alloc_decode(
+        self,
+        seq_lens: torch.Tensor,
+        last_loc: torch.Tensor,
+    ):
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 2) % self.page_size == seq_lens % self.page_size
+            )
+
+        bs = len(seq_lens)
+        if self.need_sort and bs > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        out_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
+        alloc_decode_kernel[(bs,)](
+            seq_lens,
+            last_loc,
+            self.free_pages,
+            out_indices,
+            self.ret_values,
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        num_new_pages = self.ret_values.item()
+        if num_new_pages > len(self.free_pages):
+            return None
+
+        self.free_pages = self.free_pages[num_new_pages:]
+        return out_indices
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
+        if self.is_not_in_free_group:
+            free_page_indices = torch.unique(free_index // self.page_size)
+            if self.need_sort:
+                self.release_pages = torch.cat((free_page_indices, self.release_pages))
+            else:
+                self.free_pages = torch.cat((free_page_indices, self.free_pages))
+        else:
+            self.free_group.append(free_index)
+
+        if self.debug_mode:
+            assert len(torch.unique(self.free_pages)) == len(self.free_pages)
+
+    def clear(self):
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
+        self.free_pages = torch.arange(
+            1, self.num_pages + 1, dtype=torch.int64, device=self.device
+        )
+        self.is_not_in_free_group = True
+        self.free_group = []
+        self.release_pages = torch.empty((0,), dtype=torch.int64, device=self.device)
+
+    def get_cpu_copy(self, indices):
+        return self._kvcache.get_cpu_copy(indices)
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        return self._kvcache.load_cpu_copy(kv_cache_cpu, indices)
diff --git a/python/sglang/srt/mem_cache/allocator_ascend.py b/python/sglang/srt/mem_cache/allocator_ascend.py
new file mode 100644
index 000000000..2af138a6c
--- /dev/null
+++ b/python/sglang/srt/mem_cache/allocator_ascend.py
@@ -0,0 +1,147 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.mem_cache.allocator import PagedTokenToKVPoolAllocator
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.memory_pool import KVCache
+
+
+def alloc_extend_kernel_ascend(
+    prefix_lens,
+    seq_lens,
+    last_loc,
+    free_pages,
+    out_indices,
+    page_size,
+    device,
+):
+    extend_lens = seq_lens - prefix_lens
+    end_pos = torch.cumsum(extend_lens, 0)
+    start_pos = end_pos - extend_lens
+    num_new_pages = (seq_lens + page_size - 1) // page_size - (
+        prefix_lens + page_size - 1
+    ) // page_size
+    num_full_new_pages = (seq_lens) // page_size - (
+        prefix_lens + page_size - 1
+    ) // page_size
+    need_page = num_new_pages - num_full_new_pages
+    end_new_pages = torch.cumsum(num_new_pages, 0)
+    start_new_pages = end_new_pages - num_new_pages
+    pos_in_page = torch.arange(page_size, device=device, dtype=torch.int32)
+    for i in range(len(prefix_lens)):
+        num1 = (
+            min(
+                seq_lens[i],
+                (prefix_lens[i] + page_size - 1) // page_size * page_size,
+            )
+            - prefix_lens[i]
+        )
+        if num1:
+            out_indices[start_pos[i] : start_pos[i] + num1] = (
+                last_loc[i] + 1 + pos_in_page[:num1].view(-1)
+            )
+
+        num2 = (
+            seq_lens[i] // page_size - (prefix_lens[i] + page_size - 1) // page_size
+        ) * page_size
+        if num2:
+            pages = (
+                free_pages[start_new_pages[i] : end_new_pages[i] - need_page[i]]
+                * page_size
+            )
+            out_indices[start_pos[i] + num1 : start_pos[i] + num1 + num2] = (
+                pages.view(-1, 1) + pos_in_page.view(1, -1)
+            ).view(-1)
+
+        num3 = seq_lens[i] - seq_lens[i] // page_size * page_size
+        if num3:
+            out_indices[end_pos[i] - num3 : end_pos[i]] = (
+                free_pages[end_new_pages[i] - 1] * page_size + pos_in_page[:num3]
+            ).view(-1)
+
+
+class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
+
+    def alloc_extend(
+        self,
+        prefix_lens: torch.Tensor,
+        seq_lens: torch.Tensor,
+        last_loc: torch.Tensor,
+        extend_num_tokens: int,
+    ):
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 1) % self.page_size == prefix_lens % self.page_size
+            )
+
+        num_new_pages = (
+            (
+                (seq_lens + self.page_size - 1) // self.page_size
+                - (prefix_lens + self.page_size - 1) // self.page_size
+            )
+            .sum()
+            .item()
+        )
+        if self.need_sort and num_new_pages > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        if num_new_pages > len(self.free_pages):
+            return None
+
+        out_indices = torch.empty(
+            (extend_num_tokens,), dtype=torch.int32, device=self.device
+        )
+
+        alloc_extend_kernel_ascend(
+            prefix_lens,
+            seq_lens,
+            last_loc,
+            self.free_pages,
+            out_indices,
+            self.page_size,
+            self.device,
+        )
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        self.free_pages = self.free_pages[num_new_pages:]
+        return out_indices
+
+    def alloc_decode(
+        self,
+        seq_lens: torch.Tensor,
+        last_loc: torch.Tensor,
+    ):
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 2) % self.page_size == seq_lens % self.page_size
+            )
+
+        need_new_pages = (seq_lens % self.page_size == 1).int()
+        num_new_pages = need_new_pages.sum().item()
+
+        if num_new_pages > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        if num_new_pages > len(self.free_pages):
+            return None
+
+        end_new_pages = torch.cumsum(need_new_pages, 0)
+        start_new_pages = end_new_pages - need_new_pages
+        if num_new_pages == 0:
+            out_indices = last_loc + 1
+        else:
+            out_indices = (last_loc + 1) * (1 - need_new_pages) + self.free_pages[
+                start_new_pages
+            ] * self.page_size * need_new_pages
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        self.free_pages = self.free_pages[num_new_pages:]
+        return out_indices.int()
diff --git a/python/sglang/srt/mem_cache/base_prefix_cache.py b/python/sglang/srt/mem_cache/base_prefix_cache.py
new file mode 100644
index 000000000..4fdd04b72
--- /dev/null
+++ b/python/sglang/srt/mem_cache/base_prefix_cache.py
@@ -0,0 +1,109 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, List, NamedTuple, Optional, Tuple
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+else:
+    Req = Any  # Placeholder for Req type when not type checking
+
+
+class MatchResult(NamedTuple):
+    """Result of a prefix match operation.
+
+    Attributes:
+        device_indices  :   Indices of the KV cache on the device matched by common prefix.
+        last_device_node:   The last TreeNode on the device that was matched.
+        last_host_node  :   The last TreeNode on the host that was matched.
+                            Note that if HiCache is not enabled,
+                            this **must** be the same as `last_device_node`.
+        host_hit_length :   Length of the KV cache hit on the host, if applicable.
+                            0 if HiCache is not enabled.
+    """
+
+    device_indices: torch.Tensor
+    last_device_node: Any
+    last_host_node: Any
+    host_hit_length: int = 0
+
+
+class BasePrefixCache(ABC):
+    """Cache can be indexed by either rid or key."""
+
+    @abstractmethod
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
+        pass
+
+    @abstractmethod
+    def cache_finished_req(self, req: Req, **kwargs):
+        pass
+
+    @abstractmethod
+    def cache_unfinished_req(self, req: Req, **kwargs):
+        pass
+
+    @abstractmethod
+    def evict(self, num_tokens: int):
+        pass
+
+    @abstractmethod
+    def inc_lock_ref(self, node: Any):
+        pass
+
+    @abstractmethod
+    def dec_lock_ref(self, node: Any, swa_uuid_for_lock: Optional[str] = None):
+        pass
+
+    def evictable_size(self):
+        return 0
+
+    def full_evictable_size(self):
+        return 0
+
+    def swa_evictable_size(self):
+        return 0
+
+    def protected_size(self):
+        return 0
+
+    def full_protected_size(self):
+        return 0
+
+    def swa_protected_size(self):
+        return 0
+
+    def total_size(self):
+        raise NotImplementedError()
+
+    def pretty_print(self):
+        raise NotImplementedError()
+
+    def init_load_back(
+        self,
+        last_host_node: Any,
+        host_hit_length: int,
+    ) -> Tuple[torch.Tensor, Any]:
+        """
+        Preparing KV cache loading from host to device.
+        """
+        raise NotImplementedError()
+
+    def ready_to_load_host_cache(self) -> Any:
+        """
+        Notify the cache controller to start the KV cache loading
+        """
+        raise NotImplementedError()
+
+    def check_hicache_events(self) -> Any:
+        """
+        Check HiCache related activities to update radix tree and synchronize across TP workers if needed
+        """
+        raise NotImplementedError()
+
+    def take_events(self):
+        return []
diff --git a/python/sglang/srt/mem_cache/chunk_cache.py b/python/sglang/srt/mem_cache/chunk_cache.py
new file mode 100644
index 000000000..1a576bfa2
--- /dev/null
+++ b/python/sglang/srt/mem_cache/chunk_cache.py
@@ -0,0 +1,100 @@
+from __future__ import annotations
+
+"""Cache for chunked prefill, used when RadixCache is disabled."""
+
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+
+from sglang.srt.mem_cache.allocator import (
+    BaseTokenToKVPoolAllocator,
+    SWATokenToKVPoolAllocator,
+)
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+
+class ChunkCache(BasePrefixCache):
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        page_size: int,
+    ):
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.page_size = page_size
+
+    def reset(self):
+        pass
+
+    def match_prefix(self, **unused_kwargs) -> MatchResult:
+        return MatchResult(
+            device_indices=torch.empty((0,), dtype=torch.int64),
+            last_device_node=None,
+            last_host_node=None,
+        )
+
+    def cache_finished_req(self, req: Req):
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx,
+            # For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
+            : len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0),
+        ]
+        self.req_to_token_pool.free(req.req_pool_idx)
+        self.token_to_kv_pool_allocator.free(kv_indices)
+
+    def cache_unfinished_req(self, req: Req, chunked=False):
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(req.fill_ids)
+        ]
+
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        req.prefix_indices = kv_indices
+
+    def evict(self, num_tokens: int):
+        pass
+
+    def inc_lock_ref(self, node: Any):
+        return 0
+
+    def dec_lock_ref(self, node: Any, swa_uuid_for_lock: Optional[str] = None):
+        return 0
+
+    def pretty_print(self):
+        return ""
+
+
+class SWAChunkCache(ChunkCache):
+    """ChunkCache with support for hybrid KV cache operations."""
+
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: SWATokenToKVPoolAllocator,
+        page_size: int,
+    ):
+        super().__init__(req_to_token_pool, token_to_kv_pool_allocator, page_size)
+        assert isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator)
+
+    def evict_swa(
+        self,
+        req: Req,
+        prelen: int,
+        attention_chunk_size: int,
+    ):
+        if prelen >= req.evicted_seqlen_local + attention_chunk_size:
+            new_evicted_seqlen_local = attention_chunk_size * (
+                prelen // attention_chunk_size
+            )
+            free_slots = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, req.evicted_seqlen_local : new_evicted_seqlen_local
+            ]
+            self.token_to_kv_pool_allocator.free_swa(free_slots)
+            req.evicted_seqlen_local = new_evicted_seqlen_local
+
+    def evict(self, num_tokens: int):
+        pass
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format b/python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format
new file mode 120000
index 000000000..5a7a8cea7
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format
@@ -0,0 +1 @@
+../../../../../sgl-kernel/.clang-format
\ No newline at end of file
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/common.h b/python/sglang/srt/mem_cache/cpp_radix_tree/common.h
new file mode 100644
index 000000000..72c2c78be
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/common.h
@@ -0,0 +1,29 @@
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <source_location>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace radix_tree_v2 {
+
+using token_t = std::int32_t;
+using token_vec_t = std::vector<token_t>;
+using token_slice = std::span<const token_t>;
+using NodeHandle = std::size_t;
+using IOTicket = std::uint32_t;
+
+inline void _assert(
+    bool condition,
+    const char* message = "Assertion failed",
+    std::source_location loc = std::source_location::current()) {
+  if (!condition) [[unlikely]] {
+    std::string msg = message;
+    msg = msg + " at " + loc.file_name() + ":" + std::to_string(loc.line()) + " in " + loc.function_name();
+    throw std::runtime_error(msg);
+  }
+}
+
+}  // namespace radix_tree_v2
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py b/python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py
new file mode 100644
index 000000000..592727aac
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+from torch.utils.cpp_extension import load
+
+_abs_path = os.path.dirname(os.path.abspath(__file__))
+radix_tree_cpp = load(
+    name="radix_tree_cpp",
+    sources=[
+        f"{_abs_path}/tree_v2_binding.cpp",
+        f"{_abs_path}/tree_v2_debug.cpp",
+        f"{_abs_path}/tree_v2.cpp",
+    ],
+    extra_cflags=["-O3", "-std=c++20"],
+)
+
+if TYPE_CHECKING:
+
+    class TreeNodeCpp:
+        """
+        A placeholder for the TreeNode class. Cannot be constructed elsewhere.
+        """
+
+    class IOHandle:
+        """
+        A placeholder for the IOHandle class. Cannot be constructed elsewhere.
+        """
+
+    class RadixTreeCpp:
+        def __init__(
+            self,
+            disabled: bool,
+            host_size: Optional[int],
+            page_size: int,
+            write_through_threshold: int,
+        ):
+            """
+            Initializes the RadixTreeCpp instance.
+            Args:
+                disabled (bool): If True, the radix tree is disabled.
+                host_size (Optional[int]): Size of the radix tree on the CPU. None means no CPU tree.
+                page_size (int): Size of the page for the radix tree.
+                write_through_threshold (int): Threshold for writing through from GPU to CPU.
+            """
+            self.tree = radix_tree_cpp.RadixTree(  # type: ignore
+                disabled, host_size, page_size, write_through_threshold
+            )
+
+        def match_prefix(
+            self, prefix: List[int]
+        ) -> Tuple[List[torch.Tensor], int, TreeNodeCpp, TreeNodeCpp]:
+            """
+            Matches a prefix in the radix tree.
+            Args:
+                prefix (List[int]): The prefix to match.
+            Returns:
+                Tuple[List[torch.Tensor], TreeNodeCpp, TreeNodeCpp]:
+                    0. A list of indices that is matched by the prefix on the GPU.
+                    1. Sum length of the indices matched on the CPU.
+                    2. The last node of the prefix matched on the GPU.
+                    3. The last node of the prefix matched on the CPU.
+            """
+            return self.tree.match_prefix(prefix)
+
+        def evict(self, num_tokens: int) -> List[torch.Tensor]:
+            """
+            Evicts a number of tokens from the radix tree.
+            Args:
+                num_tokens (int): The number of tokens to evict.
+            Returns:
+                List[torch.Tensor]: A list of indices that were evicted.
+            """
+            return self.tree.evict(num_tokens)
+
+        def lock_ref(self, handle: TreeNodeCpp, lock: bool) -> None:
+            """
+            Locks or unlocks a reference to a tree node.
+            After locking, the node will not be evicted from the radix tree.
+            Args:
+                handle (TreeNodeCpp): The tree node to lock or unlock.
+                lock (bool): If True, locks the node; if False, unlocks it.
+            """
+            return self.tree.lock_ref(handle, lock)
+
+        def writing_through(
+            self, key: List[int], indices: torch.Tensor
+        ) -> Tuple[List[Tuple[IOHandle, torch.Tensor, torch.Tensor]], int]:
+            """
+            Inserts a key-value pair into the radix tree and perform write-through check.
+            Args:
+                key (List[int]): The key to insert.
+                indices (torch.Tensor): The value associated with the key.
+            Returns:
+                Tuple[List[Tuple[IOHandle, torch.Tensor, torch.Tensor]], int]:
+                    0. A list of (IOHandle, device indices, host indices) tuples.
+                       These IOhandles require write-through to the CPU in python side.
+                    1. The number of indices that are matched on device.
+            """
+            return self.tree.writing_through(key, indices)
+
+        def loading_onboard(
+            self,
+            host_node: TreeNodeCpp,
+            new_device_indices: torch.Tensor,
+        ) -> Tuple[IOHandle, List[torch.Tensor]]:
+            """
+            Updates the device indices of tree nodes within a range on the tree.
+            Args:
+                host_node (TreeNodeCpp): The tree node on the host, must be descendant of device_node.
+                new_device_indices (torch.Tensor): The new device indices to set.
+                    The length of this tensor must be exactly host indices length.
+            Returns:
+                Tuple[IOHandle, List[torch.Tensor]]:
+                    0. An IOHandle that requires loading to the CPU in python side.
+                    1. A list of host indices corresponding to the new device indices.
+            """
+            return self.tree.loading_onboard(host_node, new_device_indices)
+
+        def commit_writing_through(self, handle: IOHandle, success: bool) -> None:
+            """
+            Commits the write-through process for a tree node.
+            Args:
+                handle (IOHandle): The IOHandle to commit.
+                success (bool): If True, commits the write-through; if False, just indicates failure.
+            """
+            return self.tree.commit_writing_through(handle, success)
+
+        def commit_loading_onboard(self, handle: IOHandle, success: bool) -> None:
+            """
+            Commits the load onboard process for tree nodes within a range on the tree.
+            Args:
+                handle (IOHandle): The IOHandle to commit.
+                success (bool): If True, commits the load-onboard; if False, just indicates failure.
+            """
+            return self.tree.commit_loading_onboard(handle, success)
+
+        def evictable_size(self) -> int:
+            """
+            Returns the size of the evictable part of the radix tree.
+            This is the size of the part that can be evicted from the GPU (ref_count = 0).
+            Returns:
+                int: The size of the evictable part.
+            """
+            return self.tree.evictable_size()
+
+        def protected_size(self) -> int:
+            """
+            Returns the size of the protected part of the radix tree.
+            This is the size of the part that cannot be evicted from the GPU (ref_count > 0).
+            Returns:
+                int: The size of the protected part.
+            """
+            return self.tree.protected_size()
+
+        def total_size(self) -> int:
+            """
+            Returns the total size of the radix tree (including CPU nodes).
+            Returns:
+                int: The total size of the radix tree.
+            """
+            return self.tree.total_size()
+
+        def reset(self) -> None:
+            """
+            Resets the radix tree, clearing all nodes and indices.
+            """
+            return self.tree.reset()
+
+        def debug_print(self) -> None:
+            """
+            Prints the internal state of the radix tree for debugging purposes.
+            """
+            return self.tree.debug_print()
+
+else:
+    # Real implementation of the classes for runtime
+    RadixTreeCpp = radix_tree_cpp.RadixTree
+    TreeNodeCpp = object
+    IOHandle = object
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp
new file mode 100644
index 000000000..2a5433221
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp
@@ -0,0 +1,143 @@
+#include "tree_v2.h"
+
+#include <ATen/core/TensorBody.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/tensor.h>
+#include <ATen/ops/zeros.h>
+#include <c10/util/irange.h>
+
+#include <cstddef>
+#include <memory>
+#include <queue>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include "common.h"
+#include "tree_v2_impl.h"
+#include "tree_v2_node.h"
+
+namespace radix_tree_v2 {
+
+static NodeHandle node2id(TreeNode* node) {
+  return node->node_id;
+}
+
+// compare function for the TreeNode pointers based on their time
+// we use LRU, so we want to evict the least recently used nodes
+// since std::priority_queue is a max-heap, we need to reverse the comparison
+static constexpr auto cmp = [](TreeNode* lhs, TreeNode* rhs) { return lhs->time() > rhs->time(); };
+
+RadixTree::RadixTree(bool disabled, std::optional<std::size_t> host_size, std::size_t page_size, std::size_t threshold)
+    : m_impl(std::make_unique<Impl>(disabled, host_size.has_value(), page_size, host_size.value_or(0), threshold)) {}
+
+RadixTree::~RadixTree() = default;
+
+std::tuple<std::vector<at::Tensor>, std::size_t, NodeHandle, NodeHandle>
+RadixTree::match_prefix(const token_vec_t& _key) {
+  if (m_impl->disabled) return {};
+
+  const auto key = token_slice{_key.data(), m_impl->align(_key.size())};
+  const auto [host_node, _] = m_impl->tree_walk(key);
+
+  // walk up to the first non-evicted node
+  std::size_t host_hit_length = 0;
+  const auto device_node = host_node;
+
+  // collect all the device indices
+  std::vector<at::Tensor> indices{};
+  walk_to_root(device_node, [&](TreeNode* n) { indices.push_back(n->device_indices()); });
+  std::reverse(indices.begin(), indices.end());
+
+  return {std::move(indices), host_hit_length, node2id(device_node), node2id(host_node)};
+}
+
+std::vector<at::Tensor> RadixTree::evict(std::size_t num_tokens) {
+  if (m_impl->disabled || num_tokens == 0) return {};
+  auto heap = std::priority_queue{cmp, m_impl->collect_leaves_device()};
+  std::vector<at::Tensor> evicted_values;
+  // evict nodes until we reach the desired number of tokens
+  std::size_t num_evict = 0;
+  while (num_evict < num_tokens && !heap.empty()) {
+    const auto node = heap.top();
+    heap.pop();
+    // when ref_count == 0, can't be writing through
+    _assert(node->on_gpu() && node->ref_count == 0);
+    if (!node->is_io_free()) continue;  // skip nodes that are undergoing IO (i.e. indices protected)
+    evicted_values.push_back(node->device_indices());
+    num_evict += node->length();
+    const auto parent = node->parent();
+    m_impl->remove_device_node(node);
+    if (parent->is_leaf_device() && parent->ref_count == 0)
+      heap.push(parent);  // push parent to the heap if it is now a free leaf
+  }
+
+  return evicted_values;
+}
+
+std::tuple<std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>>, std::size_t>
+RadixTree::writing_through(const token_vec_t& _key, at::Tensor value) {
+  if (m_impl->disabled) return {};
+  _assert(_key.size() == std::size_t(value.size(0)), "Key and value must have the same size");
+
+  // just align the key to the page size, clip the unaligned tail
+  const auto key = token_slice{_key.data(), m_impl->align(_key.size())};
+
+  // walk the tree to find the right place to insert
+  const auto [host_node, host_prefix_length] = m_impl->tree_walk(key);
+
+  // insert and create a new node if the remaining part of the key is not empty
+  if (host_prefix_length != key.size()) {
+    m_impl->create_device_node(
+        host_node,
+        {key.begin() + host_prefix_length, key.end()},
+        value.slice(/*dim=*/0, host_prefix_length, key.size()));
+  }
+
+  // add the hit count for the device node
+  walk_to_root(host_node, [&](TreeNode* n) { n->hit_count++; });
+
+  std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>> result;
+
+  // don't write through if hicache is disabled (no host memory), fast path
+  if (!m_impl->use_hicache) return {std::move(result), host_prefix_length};
+  throw std::runtime_error("Not implemented yet");
+}
+
+std::tuple<IOTicket, std::vector<at::Tensor>> RadixTree::loading_onboard(NodeHandle, at::Tensor) {
+  if (m_impl->disabled) return {};
+  throw std::runtime_error("Not implemented yet");
+}
+
+void RadixTree::commit_writing_through(IOTicket, bool) {
+  if (m_impl->disabled) return;
+  throw std::runtime_error("Not implemented yet");
+}
+
+void RadixTree::commit_loading_onboard(IOTicket, bool) {
+  if (m_impl->disabled) return;
+  throw std::runtime_error("Not implemented yet");
+}
+
+void RadixTree::reset() {
+  m_impl->reset();
+}
+
+void RadixTree::lock_ref(NodeHandle node_id, bool increment) {
+  if (m_impl->disabled) return;
+  m_impl->lock_ref(node_id, increment);
+}
+
+std::size_t RadixTree::evictable_size() const {
+  return m_impl->evictable_size();
+}
+
+std::size_t RadixTree::protected_size() const {
+  return m_impl->protected_size();
+}
+
+std::size_t RadixTree::total_size() const {
+  return m_impl->total_size();
+}
+
+}  // namespace radix_tree_v2
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.h b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.h
new file mode 100644
index 000000000..68da9b9e1
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.h
@@ -0,0 +1,59 @@
+#pragma once
+#include <ATen/core/TensorBody.h>
+#include <c10/core/Device.h>
+
+#include <cstddef>
+#include <memory>
+#include <optional>
+#include <tuple>
+#include <vector>
+
+#include "common.h"
+
+namespace radix_tree_v2 {
+
+struct RadixTree {
+ public:
+  RadixTree(bool disabled, std::optional<std::size_t> host_size, std::size_t page_size, std::size_t threshold);
+  ~RadixTree();
+
+  // Trees should not be copied or moved, as they manage their own memory and state.
+  RadixTree(const RadixTree&) = delete;
+  RadixTree(RadixTree&&) = delete;
+  RadixTree& operator=(const RadixTree&) = delete;
+  RadixTree& operator=(RadixTree&&) = delete;
+
+  /// @return (device indices that are matched, host indices length, device node, host node)
+  std::tuple<std::vector<at::Tensor>, std::size_t, NodeHandle, NodeHandle> match_prefix(const token_vec_t& key);
+  /// @return Device indices that need to be evicted (on python side).
+  std::vector<at::Tensor> evict(std::size_t num_tokens);
+  /// @brief (Un-)Lock a node.
+  void lock_ref(NodeHandle node_id, bool increment /* increment or decrement */);
+  /// @brief Update new key-value pair and try to perform write-through.
+  std::tuple<std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>>, std::size_t>
+  writing_through(const token_vec_t& key, at::Tensor value);
+  /// @brief Load to device from host within a range of nodes.
+  std::tuple<IOTicket, std::vector<at::Tensor>> loading_onboard(NodeHandle host_id, at::Tensor indices);
+  /// @brief Commit a transaction of write-through.
+  void commit_writing_through(IOTicket ticket, bool success);
+  /// @brief Commit a transaction of load onboard.
+  void commit_loading_onboard(IOTicket ticket, bool success);
+  /// @brief Clear and reset the tree.
+  void reset();
+
+  /// @return How many size are still evictable (on device + not locked).
+  std::size_t evictable_size() const;
+  /// @return How many size are protected (locked).
+  std::size_t protected_size() const;
+  /// @return How many size are used on device.
+  std::size_t total_size() const;
+
+  /// @brief Print debug information of the tree.
+  void debug_print() const;
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> m_impl;
+};
+
+}  // namespace radix_tree_v2
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_binding.cpp b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_binding.cpp
new file mode 100644
index 000000000..81069e4fe
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_binding.cpp
@@ -0,0 +1,32 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <torch/extension.h>
+
+#include <cstddef>
+#include <optional>
+
+#include "tree_v2.h"
+
+PYBIND11_MODULE(radix_tree_cpp, m) {
+  using namespace radix_tree_v2;
+  namespace py = pybind11;
+  py::class_<RadixTree>(m, "RadixTree")
+      .def(
+          py::init<bool, std::optional<std::size_t>, std::size_t, std::size_t>(),
+          py::arg("disabled"),
+          py::arg("host_size"),
+          py::arg("page_size"),
+          py::arg("write_through_threshold"))
+      .def("match_prefix", &RadixTree::match_prefix)
+      .def("evict", &RadixTree::evict)
+      .def("lock_ref", &RadixTree::lock_ref)
+      .def("evictable_size", &RadixTree::evictable_size)
+      .def("protected_size", &RadixTree::protected_size)
+      .def("total_size", &RadixTree::total_size)
+      .def("writing_through", &RadixTree::writing_through)
+      .def("loading_onboard", &RadixTree::loading_onboard)
+      .def("commit_writing_through", &RadixTree::commit_writing_through)
+      .def("commit_loading_onboard", &RadixTree::commit_loading_onboard)
+      .def("reset", &RadixTree::reset)
+      .def("debug_print", &RadixTree::debug_print);
+}
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_debug.cpp b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_debug.cpp
new file mode 100644
index 000000000..89b6290b1
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_debug.cpp
@@ -0,0 +1,194 @@
+#include <c10/core/DeviceType.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/ScalarType.h>
+
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "tree_v2.h"
+#include "tree_v2_impl.h"
+
+namespace radix_tree_v2 {
+
+void RadixTree::debug_print() const {
+  m_impl->debug_print(std::clog);
+}
+
+static constexpr auto npos = std::size_t(-1);
+
+void RadixTree::Impl::debug_print(std::ostream& os) const {
+  static constexpr auto _check = [](bool condition, auto msg, std::size_t id = npos) {
+    if (!condition) {
+      std::string suffix = id == npos ? "" : " [id = " + std::to_string(id) + "]";
+      throw std::runtime_error(std::string("RadixTree::debug_print failed: ") + msg + suffix);
+    }
+  };
+
+  static constexpr auto _print_node = [](TreeNode* node, std::size_t depth, std::ostream& os) {
+    const auto length = node->length();
+    os << node->node_id << " [depth = " << depth << "] [len = " << length << "]";
+
+    // placement status
+    if (node->on_both()) {
+      os << " [cpu + gpu]";
+    } else if (node->on_gpu()) {
+      os << " [gpu]";
+    } else if (node->on_cpu()) {
+      os << " [cpu]";
+    } else {
+      _check(false, "Node is not on GPU or CPU", node->node_id);
+    }
+
+    // IO status
+    if (node->is_io_free()) {
+      os << " [io = free]";
+    } else if (node->is_io_device_to_host()) {
+      os << " [io = gpu -> cpu]";
+    } else if (node->is_io_host_to_device()) {
+      os << " [io = cpu -> gpu]";
+    } else {
+      _check(false, "Node is in unknown IO state", node->node_id);
+    }
+
+    os << " [rc = " << node->ref_count << "]";
+    os << " [hit = " << node->hit_count << "]";
+  };
+
+  static constexpr auto _print_indices = [](at::Tensor indices, std::ostream& os) {
+    if (!indices.defined()) {
+      os << "[[N/A]]";
+      return indices;
+    }
+    indices = indices.to(c10::kCPU, c10::kLong, false, false, c10::MemoryFormat::Contiguous);
+    const auto length = indices.numel();
+    os << "[";
+    auto* data_ptr = indices.data_ptr<int64_t>();
+    for (const auto i : c10::irange(indices.size(0))) {
+      os << data_ptr[i];
+      if (i != length - 1) os << ", ";
+    }
+    os << "]";
+    return indices;
+  };
+
+  os << "Evictable size: " << evictable_size() << std::endl;
+  os << "Protected size: " << protected_size() << std::endl;
+  os << "Total size: " << const_cast<Impl*>(this)->total_size() << std::endl;
+  std::vector<std::tuple<TreeNode*, TreeNode*, token_slice>> stack;
+  auto root = const_cast<TreeNode*>(&m_root);
+  os << root->node_id << " [root]" << std::endl;
+  for (const auto& [key, child] : *root) {
+    stack.push_back({child.get(), root, key});
+  }
+
+  std::unordered_map<TreeNode*, std::size_t> depth_map;
+  std::string indent_buffer;
+  depth_map[root] = 0;
+  std::vector<NodeHandle> visited_id;
+  std::size_t evictable_size_real = 0;
+  while (!stack.empty()) {
+    const auto [node, parent, key] = stack.back();
+    stack.pop_back();
+    visited_id.push_back(node->node_id);
+    const auto nid = node->node_id;
+    _check(node != nullptr, "Node is null", nid);
+    _check(node->on_gpu() || node->on_cpu(), "Node is not on GPU or CPU", nid);
+    _check(node->parent() == parent, "Parent is not correct", nid);
+    _check(key.size() == page_size && node->diff_key(key, 0) == page_size, "Key is not correct", nid);
+    _check(depth_map.count(node) == 0, "Node is visited twice", nid);
+    _check(m_node_map.count(nid) == 1, "Node is not in the map", nid);
+    _check(m_node_map.at(nid) == node, "Node in the map is not the same as the one in the stack", nid);
+    _check(!node->on_gpu() || parent->is_root() || parent->on_gpu(), "Node on GPU must have a GPU/root parent", nid);
+    if (!node->is_io_free()) {
+      _check(node->ref_count > 0, "Node is in IO state but not protected", nid);
+      _check(node->on_both(), "Node in IO state must be on both CPU and GPU", nid);
+    }
+
+    if (node->on_gpu() && node->ref_count == 0) {
+      evictable_size_real += node->length();
+    }
+
+    const auto depth = (depth_map[node] = depth_map[parent] + 1);
+    indent_buffer.resize(depth * 2, ' ');
+    os << indent_buffer;
+    _print_node(node, depth, os);
+    os << std::endl;
+    for (const auto& [key, child] : *node) {
+      stack.push_back({child.get(), node, key});
+    }
+  }
+
+  _check(evictable_size_real == evictable_size(), "Evictable size is wrong");
+  _check(m_node_map.count(root->node_id) == 1, "Root node is not in the map");
+  _check(m_node_map.at(root->node_id) == root, "Root node in the map is not correct");
+
+  std::sort(visited_id.begin(), visited_id.end());
+  if (visited_id.size() != m_node_map.size() - 1) {
+    // Some error in the tree, not all nodes are visited
+    std::string id_list;
+    id_list += "(visited: ";
+    id_list += std::to_string(root->node_id) + " ";
+    for (const auto& id : visited_id) {
+      id_list += std::to_string(id) + " ";
+    }
+    id_list += "), (in map: ";
+    for (const auto& [id, _] : m_node_map) {
+      id_list += std::to_string(id) + " ";
+    }
+    id_list += ")";
+    _check(false, "Not all nodes are visited " + id_list);
+  }
+
+  static const auto kSGLANG_RADIX_CPP_DEBUG_LIMIT = [] {
+    const char* env = std::getenv("SGLANG_RADIX_CPP_DEBUG_LIMIT");
+    const std::size_t default_limit = 16;
+    if (env != nullptr) {
+      try {
+        return static_cast<std::size_t>(std::stoull(env));
+      } catch (const std::exception& e) {
+        std::cerr << "Invalid SGLANG_RADIX_CPP_DEBUG_LIMIT value: " << env  //
+                  << ". Using default value =" << default_limit << std::endl;
+      }
+    }
+    return default_limit;
+  }();
+
+  for (const auto nid : visited_id) {
+    const auto node = m_node_map.at(nid);
+    // print key and indices
+    const auto& key = node->_unsafe_tokens();
+    if (key.size() > kSGLANG_RADIX_CPP_DEBUG_LIMIT) {
+      os << "Node " << nid << ": key is too long (" << key.size() << " tokens), skipping..." << std::endl;
+      continue;
+    }
+    os << "Node " << nid << ": key = [";
+    for (const auto& i : c10::irange(key.size())) {
+      os << key[i];
+      if (i != key.size() - 1) os << ", ";
+    }
+
+    _check(key.size() % page_size == 0, "Misaligned key", nid);
+
+    os << "] device_indices = ";
+    const auto device_indices = _print_indices(node->device_indices(), os);
+    if (device_indices.defined()) {
+      std::size_t length = device_indices.numel();
+      _check(device_indices.dim() == 1, "Device indices must be 1D tensor", nid);
+      _check(length == node->length(), "Wrong device indices size", nid);
+    }
+
+    os << " host_indices = ";
+    const auto host_indices = _print_indices(node->host_indices(), os);
+    if (host_indices.defined()) {
+      std::size_t length = host_indices.numel();
+      _check(host_indices.dim() == 1, "Host indices must be 1D tensor", nid);
+      _check(length == node->length(), "Wrong host indices size", nid);
+    }
+    os << std::endl;
+  }
+}
+
+}  // namespace radix_tree_v2
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_impl.h b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_impl.h
new file mode 100644
index 000000000..cb9f9dde5
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_impl.h
@@ -0,0 +1,276 @@
+#pragma once
+#include <c10/util/irange.h>
+
+#include <chrono>
+#include <cstddef>
+#include <iosfwd>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "common.h"
+#include "tree_v2.h"
+#include "tree_v2_node.h"
+
+namespace radix_tree_v2 {
+
+using node_iterator_t = typename TreeNode::iterator_t;
+
+struct RadixTree::Impl {
+ public:
+  Impl(bool disabled, bool use_hicache, std::size_t page_size, std::size_t host_size, std::size_t threshold)
+      : m_root(/*node_id_=*/0),
+        m_evictable_size(0),
+        m_protected_size(0),
+        m_cached_vec(),
+        m_node_map(),
+        m_node_counter(1),  // start from 1 to avoid confusion with root node
+        disabled(disabled),
+        use_hicache(use_hicache),
+        page_size(page_size),
+        threshold(threshold) {
+    _assert(page_size > 0, "Page size must be greater than zero");
+    _assert(use_hicache == (host_size > 0), "Hierarchical cache is enabled iff host size > 0");
+    m_root.ref_count = 1;                  // root node is always protected
+    m_cached_vec.reserve(page_size);       // to avoid repeated allocations
+    m_node_map[m_root.node_id] = &m_root;  // add root to the map
+  }
+
+  TreeNode* split_node(node_iterator_t iterator, std::size_t prefix_length) {
+    // from `parent -> old_node` to `parent-> new_node -> old_node`
+    // the prefix part of the old node is moved to the new node
+    auto old_node_ptr = std::move(iterator->second);
+    auto new_node_ptr = std::make_unique<TreeNode>(m_node_counter++);
+    auto* old_node = old_node_ptr.get();
+    auto* new_node = new_node_ptr.get();
+    auto* parent = old_node->parent();
+    // set up data structures
+    split_prefix(new_node, old_node, prefix_length);
+    // set up parent-child relationship
+    add_child(new_node, std::move(old_node_ptr));
+    add_child(parent, std::move(new_node_ptr), iterator);
+    m_node_map[new_node->node_id] = new_node;  // add to the map
+    return new_node;
+  }
+
+  // node: x -> [GPU]
+  TreeNode* create_device_node(TreeNode* parent, token_vec_t vec, at::Tensor indices) {
+    auto new_node_ptr = std::make_unique<TreeNode>(m_node_counter++);
+    auto new_node = new_node_ptr.get();
+    new_node_ptr->_unsafe_tokens() = std::move(vec);
+    new_node_ptr->_unsafe_device_indices() = std::move(indices);
+    m_evictable_size += new_node_ptr->length();
+    add_child(parent, std::move(new_node_ptr));
+    m_node_map[new_node->node_id] = new_node;  // add to the map
+    return new_node;
+  }
+
+  // node: [GPU] -> x
+  void remove_device_node(TreeNode* node) {
+    _assert(node->on_gpu_only() && node->ref_count == 0);
+    m_evictable_size -= node->length();
+    node->parent()->erase_child(get_key(node));
+    m_node_map.erase(node->node_id);  // remove from the map
+  }
+
+  /**
+   * @brief Walk the tree to find the node that matches the key.
+   * If the key partially matches a node, it will split that node.
+   * @return A pair containing the last node that matches the key and
+   * the total prefix length matched (on gpu and cpu) so far.
+   */
+  std::pair<TreeNode*, std::size_t> tree_walk(token_slice key) {
+    _assert(key.size() % page_size == 0, "Key should be page-aligned");
+
+    std::size_t total_prefix_length = 0;
+    TreeNode* node = &m_root;
+
+    const auto now = std::chrono::steady_clock::now();
+    while (key.size() > 0) {
+      const auto iterator = node->find_child(get_key(key));
+      if (iterator == node->end()) break;
+
+      // walk to the child node
+      node = iterator->second.get();
+
+      // at least `page_size` tokens are matched, and there may be more tokens to match
+      // the return value prefix_length is no less than `page_size`
+      const auto prefix_length = align(node->diff_key(key, page_size) + page_size);
+      total_prefix_length += prefix_length;
+
+      // split the node if the prefix is not the whole token vector
+      if (prefix_length < node->length()) {
+        return {split_node(iterator, prefix_length), total_prefix_length};
+      }
+
+      // we have matched the whole key, continue to the next node
+      node->access(now);
+      key = key.subspan(prefix_length);
+    }
+
+    return {node, total_prefix_length};
+  }
+
+  std::vector<TreeNode*> collect_leaves() const {
+    std::vector<TreeNode*> leaves;
+    std::vector<TreeNode*> stack = {};
+    for (const auto& [_, child] : m_root) {
+      stack.push_back(child.get());
+    }
+    while (!stack.empty()) {
+      const auto node = stack.back();
+      stack.pop_back();
+      if (node->is_leaf()) {
+        if (node->ref_count == 0) {
+          leaves.push_back(node);
+        }
+      } else {
+        for (const auto& [_, child] : *node) {
+          stack.push_back(child.get());
+        }
+      }
+    }
+    return leaves;
+  }
+
+  std::vector<TreeNode*> collect_leaves_device() const {
+    // for non-hicache, every leaf device node is a leaf node (since no backup on host)
+    if (!use_hicache) return collect_leaves();
+    std::vector<TreeNode*> leaves;
+    std::vector<TreeNode*> stack = {};
+    for (const auto& [_, child] : m_root) {
+      stack.push_back(child.get());
+    }
+    while (!stack.empty()) {
+      const auto node = stack.back();
+      stack.pop_back();
+      if (!node->on_gpu()) continue;  // skip nodes that are not on GPU
+      if (node->is_leaf_device()) {
+        if (node->ref_count == 0) {
+          leaves.push_back(node);
+        }
+      } else {
+        for (const auto& [_, child] : *node) {
+          stack.push_back(child.get());
+        }
+      }
+    }
+    return leaves;
+  }
+
+  void lock_ref(TreeNode* node, bool increment) {
+    if (node->is_root()) return;  // skip root node
+    _assert(node->on_gpu(), "Cannot lock reference on an evicted node");
+    if (increment)
+      walk_to_root(node, [this](TreeNode* n) {
+        if (n->ref_count == 0) {
+          m_evictable_size -= n->length();
+          m_protected_size += n->length();
+        }
+        n->ref_count++;
+      });
+    else
+      walk_to_root(node, [this](TreeNode* n) {
+        _assert(n->ref_count != 0, "Cannot decrement reference count = zero");
+        n->ref_count--;
+        if (n->ref_count == 0) {
+          m_protected_size -= n->length();
+          m_evictable_size += n->length();
+        }
+      });
+  }
+
+  void lock_ref(NodeHandle node_ptr, bool increment) {
+    return lock_ref(id2node(node_ptr), increment);
+  }
+
+  void lock(TreeNode* node) {
+    return lock_ref(node, /*increment=*/true);
+  }
+
+  void unlock(TreeNode* node) {
+    return lock_ref(node, /*increment=*/false);
+  }
+
+  std::size_t total_size() const {
+    std::size_t size = 0;
+    std::vector<const TreeNode*> stack = {&m_root};
+    while (!stack.empty()) {
+      auto* node = stack.back();
+      stack.pop_back();
+      size += node->length();
+      for (const auto& [_, child] : *node)
+        stack.push_back(child.get());
+    }
+    return size;
+  }
+
+  std::size_t evictable_size() const {
+    return m_evictable_size;
+  }
+
+  std::size_t protected_size() const {
+    return m_protected_size;
+  }
+
+  std::size_t align(std::size_t size) const {
+    return (size / page_size) * page_size;  // align to page size
+  }
+
+  TreeNode* id2node(NodeHandle node_id) const {
+    const auto iterator = m_node_map.find(node_id);
+    _assert(iterator != m_node_map.end(), "Node not found in the map");
+    return iterator->second;
+  }
+
+  void reset() {
+    _assert(m_root.ref_count == 1, "Root node must be protected during reset");
+    m_node_counter = 1;  // reset node counter
+    m_root.root_reset();
+    m_evictable_size = 0;
+    m_protected_size = 0;
+    m_node_map.clear();
+    m_node_map[m_root.node_id] = &m_root;  // re-add root to the map
+  }
+
+  void debug_print(std::ostream& os) const;
+
+ private:
+  // some auxiliary functions
+  token_vec_t& get_key(token_slice tokens) {
+    _assert(tokens.size() >= page_size, "Key should be at least page-sized");
+    tokens = tokens.subspan(0, page_size);
+    m_cached_vec.assign(tokens.begin(), tokens.end());
+    return m_cached_vec;
+  }
+
+  // justify for _unsafe call: we need to read the key part of the tokens
+  token_vec_t& get_key(TreeNode* node) {
+    return get_key(node->_unsafe_tokens());
+  }
+
+  void add_child(TreeNode* parent, std::unique_ptr<TreeNode>&& child) {
+    parent->add_child(get_key(child.get()), std::move(child));
+  }
+
+  void add_child(TreeNode* parent, std::unique_ptr<TreeNode>&& child, node_iterator_t it) {
+    parent->add_child(it, std::move(child));
+  }
+
+  TreeNode m_root;                                        // root node of the tree
+  std::size_t m_evictable_size;                           // number of evictable tokens on GPU (lock ref = 0)
+  std::size_t m_protected_size;                           // number of protected tokens on GPU (lock ref > 0)
+  token_vec_t m_cached_vec;                               // cached vector of tokens for the current operation
+  std::unordered_map<std::size_t, TreeNode*> m_node_map;  // map of node keys to nodes
+  std::size_t m_node_counter;                             // counter for node IDs
+
+ public:
+  // some public constant configurations (without m_ prefix)
+  const bool disabled;          // whether the cache is enabled, or just a temporary cache
+  const bool use_hicache;       // whether to use the HiCache for this tree
+  const std::size_t page_size;  // size of each page in the cache
+  const std::size_t threshold;  // threshold for write_through
+};
+
+}  // namespace radix_tree_v2
diff --git a/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_node.h b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_node.h
new file mode 100644
index 000000000..4eac86ea4
--- /dev/null
+++ b/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_node.h
@@ -0,0 +1,257 @@
+#pragma once
+#include <ATen/core/TensorBody.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <ranges>
+#include <unordered_map>
+
+#include "common.h"
+
+namespace radix_tree_v2 {
+
+struct std_vector_hash {
+  // see https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector
+  std::size_t operator()(const token_vec_t& vec) const {
+    std::size_t hash = 0;
+    for (const auto& token : vec) {
+      hash ^= token + 0x9e3779b9 + (hash << 6) + (hash >> 2);
+    }
+    return hash;
+  }
+};
+
+struct TreeNode {
+ public:
+  using childern_map_t = std::unordered_map<token_vec_t, std::unique_ptr<TreeNode>, std_vector_hash>;
+  using iterator_t = typename childern_map_t::iterator;
+  using const_iterator_t = typename childern_map_t::const_iterator;
+  using timestamp_t = std::chrono::steady_clock::time_point;
+
+  TreeNode(std::size_t node_id_)
+      : ref_count(0),
+        hit_count(0),
+        m_io_locked(std::nullopt),
+        m_io_status(IOStatus::None),
+        m_io_ticket(),
+        m_tokens(),
+        m_device_indices(),
+        m_host_indices(),
+        m_parent(),
+        m_children(),
+        m_last_access_time(std::chrono::steady_clock::now()),
+        node_id(node_id_) {}
+
+  void access(timestamp_t time = std::chrono::steady_clock::now()) {
+    m_last_access_time = time;
+  }
+
+  bool is_root() const {
+    return m_parent == nullptr;
+  }
+
+  timestamp_t time() const {
+    return m_last_access_time;
+  }
+
+  bool on_gpu() const {
+    return m_device_indices.defined();
+  }
+
+  bool on_cpu() const {
+    return m_host_indices.defined();
+  }
+
+  bool on_gpu_only() const {
+    return on_gpu() && !on_cpu();
+  }
+
+  bool on_cpu_only() const {
+    return !on_gpu() && on_cpu();
+  }
+
+  bool on_both() const {
+    return on_gpu() && on_cpu();
+  }
+
+  std::size_t length() const {
+    return m_tokens.size();
+  }
+
+  bool is_leaf() const {
+    return m_children.empty();
+  }
+
+  bool is_leaf_device() const {
+    for (const auto& [_, child] : m_children)
+      if (child->on_gpu()) return false;  // at least one child is on the device
+    return true;
+  }
+
+  void add_child(const token_vec_t& v, std::unique_ptr<TreeNode>&& child) {
+    child->m_parent = this;
+    m_children[v] = std::move(child);
+  }
+
+  void add_child(iterator_t it, std::unique_ptr<TreeNode>&& child) {
+    child->m_parent = this;
+    it->second = std::move(child);
+  }
+
+  void erase_child(const token_vec_t& v) {
+    _assert(m_children.erase(v) > 0, "Child node not found");
+  }
+
+  iterator_t find_child(const token_vec_t& v) {
+    return m_children.find(v);
+  }
+
+  iterator_t begin() {
+    return m_children.begin();
+  }
+
+  iterator_t end() {
+    return m_children.end();
+  }
+
+  const_iterator_t begin() const {
+    return m_children.begin();
+  }
+
+  const_iterator_t end() const {
+    return m_children.end();
+  }
+
+  TreeNode* parent() {
+    return m_parent;
+  }
+
+  // set up all data structures except for parent-child relationship
+  friend void split_prefix(TreeNode* new_node, TreeNode* old_node, std::size_t prefix_length) {
+    auto tokens = std::move(old_node->m_tokens);
+    _assert(0 < prefix_length && prefix_length < tokens.size(), "Invalid prefix size for split");
+
+    // set up tokens
+    old_node->m_tokens = token_vec_t(tokens.begin() + prefix_length, tokens.end());
+    new_node->m_tokens = std::move(tokens);
+    new_node->m_tokens.resize(prefix_length);
+
+    // set up values
+    const int64_t new_size = new_node->length();
+    const int64_t old_size = old_node->length();
+    if (old_node->m_device_indices.defined()) {
+      auto new_indices = old_node->m_device_indices.split_with_sizes({new_size, old_size});
+      new_node->m_device_indices = std::move(new_indices[0]);
+      old_node->m_device_indices = std::move(new_indices[1]);
+    }
+    if (old_node->m_host_indices.defined()) {
+      auto new_indices = old_node->m_host_indices.split_with_sizes({new_size, old_size});
+      new_node->m_host_indices = std::move(new_indices[0]);
+      old_node->m_host_indices = std::move(new_indices[1]);
+    }
+
+    // set up ref counts and hit counts
+    new_node->ref_count = old_node->ref_count;
+    new_node->hit_count = old_node->hit_count;
+
+    // If the old node (child) was locked for IO, the new node (parent) does not need
+    // to be locked, since it is naturally protected by the child node's lock.
+    if (old_node->m_io_locked.has_value()) {
+      new_node->m_io_locked = false;
+      new_node->m_io_status = old_node->m_io_status;
+      new_node->m_io_ticket = old_node->m_io_ticket;
+    }
+  }
+
+  /// @return The first index in `m_tokens` that differs from `key`.
+  std::size_t diff_key(token_slice key, std::size_t offset) const {
+    const auto a = token_slice{key}.subspan(offset);
+    const auto b = token_slice{m_tokens}.subspan(offset);
+    const auto [it_a, it_b] = std::ranges::mismatch(a, b);
+    return it_a - a.begin();  // return the index of the first differing token
+  }
+
+  at::Tensor device_indices() const {
+    return m_device_indices;
+  }
+  at::Tensor host_indices() const {
+    return m_host_indices;
+  }
+
+  // visiting tokens are always unsafe (use `diff_key` instead)
+  token_vec_t& _unsafe_tokens() {
+    return m_tokens;
+  }
+  at::Tensor& _unsafe_device_indices() {
+    return m_device_indices;
+  }
+  at::Tensor& _unsafe_host_indices() {
+    return m_host_indices;
+  }
+
+  bool is_io_free() const {
+    return m_io_status == IOStatus::None;
+  }
+
+  bool is_io_device_to_host() const {
+    return m_io_status == IOStatus::DeviceToHost;
+  }
+
+  bool is_io_host_to_device() const {
+    return m_io_status == IOStatus::HostToDevice;
+  }
+
+  void root_reset() {
+    _assert(is_root(), "Only root node can call root_reset");
+    _assert(
+        m_io_status == IOStatus::None && m_io_locked == std::nullopt,
+        "IO operation in progress, cannot reset root node");
+    _assert(this->m_tokens.empty(), "Root node tokens should be empty on reset");
+    _assert(
+        !this->m_device_indices.defined() && !this->m_host_indices.defined(),
+        "Root node indices should be always be empty and never assigned");
+    m_children.clear();
+    this->access();
+  }
+
+ public:
+  std::size_t ref_count;
+  std::size_t hit_count;
+
+ private:
+  enum class IOStatus : std::uint8_t {
+    None,
+    HostToDevice,
+    DeviceToHost,
+  };
+
+  std::optional<bool> m_io_locked;  // whether the node is locked in IO operation
+  IOStatus m_io_status;
+  IOTicket m_io_ticket;
+
+  token_vec_t m_tokens;
+  at::Tensor m_device_indices;  // indices of device value
+  at::Tensor m_host_indices;    // indices of host value
+  TreeNode* m_parent;
+  childern_map_t m_children;
+  timestamp_t m_last_access_time;
+
+ public:
+  const std::size_t node_id;  // unique ID for the node
+};
+
+template <typename F>
+inline TreeNode* walk_to_root(TreeNode* t, const F& f) {
+  while (!t->is_root()) {
+    f(t);
+    t = t->parent();
+  }
+  return t;  // return the root node
+}
+
+}  // namespace radix_tree_v2
diff --git a/python/sglang/srt/mem_cache/flush_cache.py b/python/sglang/srt/mem_cache/flush_cache.py
new file mode 100644
index 000000000..73f80a2db
--- /dev/null
+++ b/python/sglang/srt/mem_cache/flush_cache.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Flush the KV cache.
+
+Usage:
+python3 -m sglang.srt.mem_cache.flush_cache --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    args = parser.parse_args()
+
+    response = requests.post(args.url + "/flush_cache")
+    assert response.status_code == 200
diff --git a/python/sglang/srt/mem_cache/hicache_storage.py b/python/sglang/srt/mem_cache/hicache_storage.py
new file mode 100644
index 000000000..6ec077db5
--- /dev/null
+++ b/python/sglang/srt/mem_cache/hicache_storage.py
@@ -0,0 +1,226 @@
+import hashlib
+import logging
+import os
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def get_hash_str(token_ids: List[int], prior_hash: str = None) -> str:
+    hasher = hashlib.sha256()
+
+    if prior_hash:
+        hasher.update(bytes.fromhex(prior_hash))
+
+    for t in token_ids:
+        hasher.update(t.to_bytes(4, byteorder="little", signed=False))
+
+    return hasher.hexdigest()
+
+
+@dataclass
+class HiCacheStorageConfig:
+    tp_rank: int
+    tp_size: int
+    is_mla_model: bool
+    is_page_first_layout: bool
+    model_name: Optional[str]
+    extra_config: Optional[dict] = None
+
+
+class HiCacheStorage(ABC):
+    """
+    HiCacheStorage is a class that provides a generic key-value interface for storing and retrieving KV cache.
+    It abstracts the underlying storage mechanism, allowing different implementations to be used.
+    """
+
+    # todo, potentially pass model and TP configs into storage backend
+    # todo, the page size of storage backend does not have to be the same as the same as host memory pool
+
+    @abstractmethod
+    def get(
+        self,
+        key: str,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        """
+        Retrieve the value associated with the given key.
+        Returns None if the key does not exist.
+        """
+        pass
+
+    @abstractmethod
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None] | int:
+        """
+        Retrieve values for multiple keys.
+        Returns a list of tensors or None for each key.
+        """
+        pass
+
+    @abstractmethod
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        """
+        Store the value associated with the given key.
+        Returns True if the operation was successful, False otherwise.
+        """
+        pass
+
+    @abstractmethod
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        """
+        Store multiple key-value pairs.
+        Returns True if all operations were successful, False otherwise.
+        """
+        pass
+
+    @abstractmethod
+    def exists(self, key: str) -> bool:
+        """
+        Check if the key exists in the storage.
+        Returns True if the key exists, False otherwise.
+        """
+        pass
+
+    def batch_exists(self, keys: List[str]) -> int:
+        """
+        Check if the keys exist in the storage.
+        return the number of consecutive existing keys from the start.
+        Can be overridden by subclasses for more efficient implementation.
+        """
+        for i in range(len(keys)):
+            if not self.exists(keys[i]):
+                return i
+        return len(keys)
+
+    def get_stats(self):
+        return None
+
+
+class HiCacheFile(HiCacheStorage):
+
+    def __init__(
+        self, storage_config: HiCacheStorageConfig, file_path: str = "/tmp/hicache"
+    ):
+        self.file_path = os.getenv("SGLANG_HICACHE_FILE_BACKEND_STORAGE_DIR", file_path)
+
+        tp_rank, tp_size, model_name, is_mla_model = (
+            storage_config.tp_rank,
+            storage_config.tp_size,
+            storage_config.model_name,
+            storage_config.is_mla_model,
+        )
+        model_name = "-".join(model_name.split("/")) if model_name else ""
+        if is_mla_model:
+            self.config_suffix = f"_{model_name}"
+        else:
+            self.config_suffix = f"_{model_name}_{tp_rank}_{tp_size}"
+
+        if not os.path.exists(self.file_path) and tp_rank == 0:
+            os.makedirs(self.file_path)
+            logger.info(f"Created HiCacheFile storage directory at {self.file_path}")
+
+    def _get_suffixed_key(self, key: str) -> str:
+        return key + self.config_suffix
+
+    def get(
+        self,
+        key: str,
+        target_location: torch.Tensor,
+        target_sizes: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        key = self._get_suffixed_key(key)
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        try:
+            expected = target_location.numel() * target_location.element_size()
+            with open(tensor_path, "rb", buffering=0) as f:
+                buf = memoryview(target_location.view(torch.uint8).contiguous().numpy())
+                if f.readinto(buf) != expected:
+                    raise IOError(f"Short read for {key}")
+            return target_location
+        except FileNotFoundError:
+            logger.warning(f"Failed to fetch {key} from HiCacheFile storage.")
+            return None
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: List[torch.Tensor],
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None]:
+        return [
+            self.get(key, target_location)
+            for key, target_location in zip(
+                keys, target_locations or [None] * len(keys)
+            )
+        ]
+
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        if self.exists(key):
+            logger.debug(f"Key {key} already exists. Skipped.")
+            return True
+
+        key = self._get_suffixed_key(key)
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        try:
+            value.contiguous().view(dtype=torch.uint8).numpy().tofile(tensor_path)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to save tensor {key}: {e}")
+            return False
+
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        for key, value in zip(keys, values):
+            if not self.set(key, value):
+                return False
+        return True
+
+    def exists(self, key: str) -> bool:
+        key = self._get_suffixed_key(key)
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        return os.path.exists(tensor_path)
+
+    def clear(self) -> bool:
+        try:
+            for filename in os.listdir(self.file_path):
+                file_path = os.path.join(self.file_path, filename)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+            logger.info("Cleared all entries in HiCacheFile storage.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to clear HiCacheFile storage: {e}")
+            return False
diff --git a/python/sglang/srt/mem_cache/hiradix_cache.py b/python/sglang/srt/mem_cache/hiradix_cache.py
new file mode 100644
index 000000000..3b00e4619
--- /dev/null
+++ b/python/sglang/srt/mem_cache/hiradix_cache.py
@@ -0,0 +1,849 @@
+import heapq
+import logging
+import threading
+import time
+from queue import Queue
+from typing import List, Optional
+
+import torch
+
+from sglang.srt.managers.cache_controller import HiCacheController, PrefetchOperation
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import MatchResult
+from sglang.srt.mem_cache.memory_pool import (
+    MHATokenToKVPool,
+    MLATokenToKVPool,
+    ReqToTokenPool,
+)
+from sglang.srt.mem_cache.memory_pool_host import (
+    MHATokenToKVPoolHost,
+    MLATokenToKVPoolHost,
+)
+from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
+from sglang.srt.metrics.collector import StorageMetricsCollector
+
+logger = logging.getLogger(__name__)
+
+
+class HiRadixCache(RadixCache):
+
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        tp_cache_group: torch.distributed.ProcessGroup,
+        page_size: int,
+        hicache_ratio: float,
+        hicache_size: int,
+        hicache_write_policy: str,
+        hicache_io_backend: str,
+        hicache_mem_layout: str,
+        enable_metrics: bool,
+        hicache_storage_backend: Optional[str] = None,
+        hicache_storage_prefetch_policy: Optional[str] = "best_effort",
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[str] = None,
+    ):
+
+        if hicache_io_backend == "direct":
+            if hicache_mem_layout == "page_first":
+                hicache_mem_layout = "layer_first"
+                logger.warning(
+                    "Page first layout is not supported with direct IO backend, switching to layer first layout"
+                )
+
+        self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
+        if isinstance(self.kv_cache, MHATokenToKVPool):
+            self.token_to_kv_pool_host = MHATokenToKVPoolHost(
+                self.kv_cache,
+                hicache_ratio,
+                hicache_size,
+                page_size,
+                hicache_mem_layout,
+            )
+        elif isinstance(self.kv_cache, MLATokenToKVPool):
+            self.token_to_kv_pool_host = MLATokenToKVPoolHost(
+                self.kv_cache,
+                hicache_ratio,
+                hicache_size,
+                page_size,
+                hicache_mem_layout,
+            )
+        else:
+            raise ValueError(f"HiRadixCache only supports MHA and MLA yet")
+
+        self.tp_group = tp_cache_group
+        self.tp_world_size = torch.distributed.get_world_size(group=self.tp_group)
+        self.enable_storage = hicache_storage_backend is not None
+        self.enable_storage_metrics = self.enable_storage and enable_metrics
+
+        # todo: customizable storage prefetch threshold and timeout
+        self.prefetch_threshold = 256
+        self.prefetch_timeout = 3  # seconds
+        self.prefetch_stop_policy = hicache_storage_prefetch_policy
+
+        self.load_cache_event = threading.Event()
+        self.cache_controller = HiCacheController(
+            token_to_kv_pool_allocator,
+            self.token_to_kv_pool_host,
+            page_size,
+            self.tp_group,
+            load_cache_event=self.load_cache_event,
+            write_policy=hicache_write_policy,
+            io_backend=hicache_io_backend,
+            storage_backend=hicache_storage_backend,
+            prefetch_threshold=self.prefetch_threshold,
+            model_name=model_name,
+            storage_backend_extra_config=storage_backend_extra_config,
+        )
+        if self.enable_storage_metrics:
+            # TODO: support pp
+            labels = {
+                "storage_backend": hicache_storage_backend,
+                "tp_rank": self.cache_controller.tp_rank,
+                "dp_rank": self.cache_controller.dp_rank,
+            }
+            self.metrics_collector = StorageMetricsCollector(labels=labels)
+
+        # record the nodes with ongoing write through
+        self.ongoing_write_through = {}
+        # record the node segments with ongoing load back
+        self.ongoing_load_back = {}
+        # record the ongoing prefetch requests
+        self.ongoing_prefetch = {}
+        self.ongoing_backup = {}
+        # todo: dynamically adjust the threshold
+        self.write_through_threshold = (
+            1 if hicache_write_policy == "write_through" else 2
+        )
+        self.load_back_threshold = 10
+        super().__init__(
+            req_to_token_pool, token_to_kv_pool_allocator, page_size, disable=False
+        )
+
+    def reset(self):
+        TreeNode.counter = 0
+        self.cache_controller.reset()
+        self.token_to_kv_pool_host.clear()
+        super().reset()
+
+    def get_height(self, node: TreeNode):
+        height = 0
+        while node != self.root_node:
+            node = node.parent
+            height += 1
+        return height
+
+    def clear_storage_backend(self) -> bool:
+        if self.enable_storage:
+            try:
+                # Check if the storage backend has a clear method (for nixl backends)
+                if hasattr(self.cache_controller.storage_backend, "clear"):
+                    self.cache_controller.storage_backend.clear()
+                    logger.info(
+                        "Hierarchical cache storage backend cleared successfully!"
+                    )
+                    return True
+                else:
+                    logger.warning(
+                        f"Storage backend {type(self.cache_controller.storage_backend).__name__} does not support clear operation."
+                    )
+                    return False
+            except Exception as e:
+                logger.error(f"Failed to clear hierarchical cache storage backend: {e}")
+                return False
+        else:
+            logger.warning("Hierarchical cache storage backend is not enabled.")
+            return False
+
+    def write_backup(self, node: TreeNode, write_back=False):
+        host_indices = self.cache_controller.write(
+            device_indices=node.value,
+            node_id=node.id,
+        )
+        if host_indices is None:
+            self.evict_host(len(node.value))
+            host_indices = self.cache_controller.write(
+                device_indices=node.value,
+                node_id=node.id,
+            )
+        if host_indices is not None:
+            node.host_value = host_indices
+            assert len(node.host_value) > 0
+            self.ongoing_write_through[node.id] = node
+            if not write_back:
+                # no need to lock nodes if write back
+                self.inc_lock_ref(node)
+        else:
+            return 0
+
+        return len(host_indices)
+
+    def write_backup_storage(self, node: TreeNode):
+        operation_id = self.cache_controller.write_storage(
+            node.host_value, node.key, node.hash_value
+        )
+        self.ongoing_backup[operation_id] = node
+        node.protect_host()
+
+    def _inc_hit_count(self, node: TreeNode, chunked=False):
+        # skip the hit count update for chunked requests
+        if self.cache_controller.write_policy == "write_back" or chunked:
+            return
+        node.hit_count += 1
+
+        if not node.backuped:
+            if node.hit_count >= self.write_through_threshold:
+                # write to host if the node is not backuped
+                self.write_backup(node)
+
+    def writing_check(self, write_back=False):
+        if write_back:
+            # blocking till all write back complete
+            while len(self.ongoing_write_through) > 0:
+                for _, finish_event, ack_list in self.cache_controller.ack_write_queue:
+                    finish_event.synchronize()
+                    for ack_id in ack_list:
+                        del self.ongoing_write_through[ack_id]
+                self.cache_controller.ack_write_queue.clear()
+                assert len(self.ongoing_write_through) == 0
+            return
+
+        # NOTE: all ranks has the same ongoing_write_through, can skip sync if empty
+        if len(self.ongoing_write_through) == 0:
+            return
+
+        finish_count = 0
+        for _, finish_event, ack_list in self.cache_controller.ack_write_queue:
+            if not finish_event.query():
+                break
+            finish_count += 1
+        queue_size = torch.tensor(finish_count, dtype=torch.int, device="cpu")
+        if self.tp_world_size > 1:
+            # synchronize TP workers to make the same update to radix cache
+            torch.distributed.all_reduce(
+                queue_size,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+
+        finish_count = int(queue_size.item())
+        while finish_count > 0:
+            _, finish_event, ack_list = self.cache_controller.ack_write_queue.pop(0)
+            finish_event.synchronize()
+            for ack_id in ack_list:
+                backuped_node = self.ongoing_write_through.pop(ack_id)
+                self.dec_lock_ref(backuped_node)
+                if self.enable_storage:
+                    self.write_backup_storage(backuped_node)
+            finish_count -= 1
+
+    def loading_check(self):
+        finish_count = 0
+        for _, finish_event, ack_list in self.cache_controller.ack_load_queue:
+            if not finish_event.query():
+                # the KV cache loading is still ongoing
+                break
+            finish_count += 1
+            # no need to sync across TP workers as batch forwarding is synced
+            for ack_id in ack_list:
+                end_node = self.ongoing_load_back.pop(ack_id)
+                self.dec_lock_ref(end_node)
+
+        # ACK until all events are processed
+        del self.cache_controller.ack_load_queue[:finish_count]
+
+    def evictable_size(self):
+        return self.evictable_size_
+
+    def evict(self, num_tokens: int):
+        leaves = self._collect_leaves_device()
+        heapq.heapify(leaves)
+
+        num_evicted = 0
+        write_back_nodes = []
+        while num_evicted < num_tokens and len(leaves):
+            x = heapq.heappop(leaves)
+
+            if x.lock_ref > 0:
+                continue
+
+            if not x.backuped:
+                if self.cache_controller.write_policy == "write_back":
+                    # write to host if the node is not backuped
+                    num_evicted += self.write_backup(x, write_back=True)
+                    write_back_nodes.append(x)
+                else:
+                    num_evicted += self._evict_regular(x)
+            else:
+                num_evicted += self._evict_backuped(x)
+
+            for child in x.parent.children.values():
+                if child in write_back_nodes:
+                    continue
+                if not child.evicted:
+                    break
+            else:
+                # all children are evicted or no children
+                heapq.heappush(leaves, x.parent)
+
+        if self.cache_controller.write_policy == "write_back":
+            self.writing_check(write_back=True)
+            for node in write_back_nodes:
+                assert node.backuped
+                self._evict_backuped(node)
+
+    def _evict_backuped(self, node: TreeNode):
+        # evict a node already written to host
+        num_evicted = self.cache_controller.evict_device(node.value, node.host_value)
+        assert num_evicted > 0
+        self.evictable_size_ -= num_evicted
+        node.value = None
+        return num_evicted
+
+    def _evict_regular(self, node: TreeNode):
+        # evict a node not initiated write to host
+        self.cache_controller.mem_pool_device_allocator.free(node.value)
+        num_evicted = len(node.value)
+        self._delete_leaf(node)
+        return num_evicted
+
+    def evict_host(self, num_tokens: int):
+        leaves = self._collect_leaves()
+        heapq.heapify(leaves)
+
+        num_evicted = 0
+        while num_evicted < num_tokens and len(leaves):
+            x = heapq.heappop(leaves)
+            if x == self.root_node:
+                break
+            # only evict the host value of evicted nodes
+            if not x.evicted:
+                continue
+
+            # node is protected from eviction as it has ongoing prefetch or backup to storage
+            if x.host_ref_counter > 0:
+                continue
+
+            num_evicted += self.cache_controller.evict_host(x.host_value)
+
+            for k, v in x.parent.children.items():
+                if v == x:
+                    break
+            del x.parent.children[k]
+
+            if len(x.parent.children) == 0 and x.parent.evicted:
+                heapq.heappush(leaves, x.parent)
+
+    def load_back(
+        self, node: TreeNode, mem_quota: Optional[int] = None
+    ) -> Optional[torch.Tensor]:
+        # todo: more loading policies
+
+        last_hit_node = node
+        nodes_to_load = []
+        while node.evicted:
+            assert (
+                node.backuped
+            ), "No backup available on evicted nodes, should not happen"
+            nodes_to_load.insert(0, node)
+            node = node.parent
+        else:
+            ancester_node = node
+
+        # protect the ancestor nodes from eviction
+        delta = self.inc_lock_ref(ancester_node)
+
+        # load it all or not at all
+        host_indices = torch.cat([n.host_value for n in nodes_to_load])
+        if len(host_indices) < self.load_back_threshold or (
+            len(host_indices) > mem_quota + delta if mem_quota is not None else False
+        ):
+            # skip loading back if the total size is too small or exceeding the memory quota
+            self.dec_lock_ref(ancester_node)
+            return None
+
+        device_indices = self.cache_controller.load(
+            host_indices=host_indices, node_id=last_hit_node.id
+        )
+        if device_indices is None:
+            self.evict(len(host_indices))
+            device_indices = self.cache_controller.load(
+                host_indices=host_indices, node_id=last_hit_node.id
+            )
+        self.dec_lock_ref(ancester_node)
+        if device_indices is None:
+            # no sufficient GPU memory to load back KV caches
+            return None
+
+        self.ongoing_load_back[last_hit_node.id] = last_hit_node
+        offset = 0
+        for node in nodes_to_load:
+            node.value = device_indices[offset : offset + len(node.host_value)]
+            offset += len(node.host_value)
+        self.evictable_size_ += len(device_indices)
+        self.inc_lock_ref(last_hit_node)
+
+        return device_indices
+
+    def init_load_back(
+        self,
+        last_node: TreeNode,
+        host_hit_length: int,
+        mem_quota: Optional[int] = None,
+    ):
+        _ = host_hit_length  # unused, but kept for compatibility
+        if last_node.evicted:
+            loading_values = self.load_back(last_node, mem_quota)
+            if loading_values is not None:
+                logger.debug(
+                    f"loading back {len(loading_values)} tokens for node {last_node.id}"
+                )
+                return loading_values, last_node
+
+            while last_node.evicted:
+                last_node = last_node.parent
+
+        return (
+            torch.empty((0,), dtype=torch.int64, device=self.device),
+            last_node,
+        )
+
+    def ready_to_load_host_cache(self) -> int:
+        """
+        Notify the cache controller to start the KV cache loading.
+        Return the consumer index for the schedule batch manager to track.
+        """
+        return self.cache_controller.start_loading()
+
+    def check_hicache_events(self):
+        self.writing_check()
+        self.loading_check()
+        if self.enable_storage:
+            self.drain_storage_control_queues()
+        if self.enable_storage_metrics:
+            self.metrics_collector.log_storage_metrics(
+                self.cache_controller.storage_backend.get_stats()
+            )
+
+    def drain_storage_control_queues(self):
+        """
+        Combine prefetch revoke, backup ack, and host mem release checks
+        to minimize TP synchronization and Python overhead.
+        """
+        cc = self.cache_controller
+
+        qsizes = torch.tensor(
+            [
+                cc.prefetch_revoke_queue.qsize(),
+                cc.ack_backup_queue.qsize(),
+                cc.host_mem_release_queue.qsize(),
+            ],
+            dtype=torch.int,
+        )
+        if self.tp_world_size > 1:
+            torch.distributed.all_reduce(
+                qsizes, op=torch.distributed.ReduceOp.MIN, group=self.tp_group
+            )
+
+        n_revoke, n_backup, n_release = map(int, qsizes.tolist())
+
+        # process prefetch revokes
+        for _ in range(n_revoke):
+            req_id = cc.prefetch_revoke_queue.get()
+            info = self.ongoing_prefetch.pop(req_id, None)
+            if info is not None:
+                last_host_node, token_ids, _, _ = info
+                last_host_node.release_host()
+                cc.prefetch_tokens_occupied -= len(token_ids)
+            # else: the revoked operation already got terminated, nothing to do
+
+        # process backup acks
+        for _ in range(n_backup):
+            operation = cc.ack_backup_queue.get()
+            ack_id = operation.id
+            entry = self.ongoing_backup.pop(ack_id, None)
+            if entry is not None:
+                entry.release_host()
+            if self.enable_storage_metrics:
+                self.metrics_collector.log_backuped_tokens(operation.completed_tokens)
+
+        # release host memory
+        host_indices_list = []
+        for _ in range(n_release):
+            host_indices_list.append(cc.host_mem_release_queue.get())
+        if host_indices_list:
+            host_indices = torch.cat(host_indices_list, dim=0)
+            cc.mem_pool_host.free(host_indices)
+
+    def can_terminate_prefetch(self, operation: PrefetchOperation):
+        can_terminate = True
+
+        if self.prefetch_stop_policy == "best_effort":
+            return can_terminate
+
+        if len(operation.hash_value) == 0:
+            completed = False
+        else:
+            completed = (
+                operation.completed_tokens == len(operation.hash_value) * self.page_size
+            )
+
+        if self.prefetch_stop_policy == "wait_complete":
+            can_terminate = completed
+        elif self.prefetch_stop_policy == "timeout":
+            can_terminate = completed or (
+                time.monotonic() - operation.start_time > self.prefetch_timeout
+            )
+        else:
+            # unknown prefetch stop policy, just return True
+            return True
+
+        operation_terminated = operation.is_terminated()
+        if self.tp_world_size > 1:
+            states = torch.tensor(
+                [1 - int(can_terminate), int(operation_terminated)],
+                dtype=torch.int,
+            )
+            torch.distributed.all_reduce(
+                states,
+                op=torch.distributed.ReduceOp.MAX,
+                group=self.tp_group,
+            )
+            can_terminate = states[0].item() == 0
+            operation_terminated = states[1].item() == 1
+        # the operation should be terminated if it is already terminated on any TP worker
+        # or it meets the termination condition on all TP workers
+        can_terminate = can_terminate or operation_terminated
+        return can_terminate
+
+    def check_prefetch_progress(self, req_id: str) -> bool:
+        if req_id not in self.ongoing_prefetch:
+            # there is no ongoing prefetch for this request or it has been revoked
+            return True
+
+        # todo: more policies for prefetch progress such as timeout
+        # the current policy is to prefetch with best effort and terminate when queuing is over
+        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch[
+            req_id
+        ]
+
+        if operation.host_indices is None:
+            # prefetch has not been issued due to insufficient host memory
+            return True
+
+        if not self.can_terminate_prefetch(operation):
+            return False
+
+        completed_tokens, hash_value = self.cache_controller.terminate_prefetch(
+            operation
+        )
+        logger.debug(f"Prefetch {req_id} completed with {completed_tokens} tokens")
+
+        min_completed_tokens = completed_tokens
+        if self.tp_world_size > 1:
+            # synchrnoize TP workers to make the same update to hiradix cache
+            completed_tokens_tensor = torch.tensor(
+                min_completed_tokens, dtype=torch.int
+            )
+            torch.distributed.all_reduce(
+                completed_tokens_tensor,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+            min_completed_tokens = completed_tokens_tensor.item()
+        fetched_token_ids = token_ids[:min_completed_tokens]
+        written_indices = host_indices[:min_completed_tokens]
+        matched_length = self._insert_helper_host(
+            last_host_node,
+            fetched_token_ids,
+            written_indices,
+            hash_value[: min_completed_tokens // self.page_size],
+        )
+        if len(written_indices):
+            self.cache_controller.mem_pool_host.update_prefetch(written_indices)
+
+        self.cache_controller.mem_pool_host.free(host_indices[:matched_length])
+        self.cache_controller.append_host_mem_release(
+            host_indices[min_completed_tokens:completed_tokens]
+        )
+        last_host_node.release_host()
+        del self.ongoing_prefetch[req_id]
+        self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
+
+        if self.enable_storage_metrics:
+            self.metrics_collector.log_prefetched_tokens(
+                min_completed_tokens - matched_length
+            )
+
+        return True
+
+    def match_prefix(self, key: List[int], **kwargs):
+        empty_value = torch.empty((0,), dtype=torch.int64, device=self.device)
+        if self.disable or len(key) == 0:
+            return MatchResult(
+                device_indices=empty_value,
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+                host_hit_length=0,
+            )
+
+        if self.page_size != 1:
+            page_aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:page_aligned_len]
+
+        value, last_node = self._match_prefix_helper(self.root_node, key)
+        if value:
+            value = torch.cat(value)
+        else:
+            value = empty_value
+
+        host_hit_length = 0
+        last_host_node = last_node
+        while last_node.evicted:
+            host_hit_length += len(last_node.host_value)
+            last_node = last_node.parent
+        while not last_host_node.backuped:
+            last_host_node = last_host_node.parent
+
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_host_node,
+            host_hit_length=host_hit_length,
+        )
+
+    def prefetch_from_storage(
+        self,
+        req_id: str,
+        last_host_node: TreeNode,
+        new_input_tokens: List[int],
+        last_hash: Optional[str] = None,
+    ):
+        # align the number of fetching tokens to the page size
+        prefetch_length = len(new_input_tokens) - (
+            len(new_input_tokens) % self.page_size
+        )
+        new_input_tokens = new_input_tokens[:prefetch_length]
+        if (
+            not self.enable_storage
+            or prefetch_length < self.prefetch_threshold
+            or self.cache_controller.prefetch_rate_limited()
+        ):
+            return
+
+        last_host_node.protect_host()
+        host_indices = self.cache_controller.mem_pool_host.alloc(prefetch_length)
+        if host_indices is None:
+            self.evict_host(prefetch_length)
+            host_indices = self.cache_controller.mem_pool_host.alloc(prefetch_length)
+        if host_indices is None:
+            last_host_node.release_host()
+            # no sufficient host memory for prefetch
+            return
+        operation = self.cache_controller.prefetch(
+            req_id, host_indices, new_input_tokens, last_hash
+        )
+        self.ongoing_prefetch[req_id] = (
+            last_host_node,
+            new_input_tokens,
+            host_indices,
+            operation,
+        )
+        self.cache_controller.prefetch_tokens_occupied += len(new_input_tokens)
+
+    def _insert_helper_host(self, node: TreeNode, key: List, host_value, hash_value):
+        node.last_access_time = time.monotonic()
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        matched_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(node.key, key)
+            key = key[prefix_len:]
+            host_value = host_value[prefix_len:]
+            hash_value = hash_value[prefix_len // self.page_size :]
+            matched_length += prefix_len
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = None
+            new_node.host_value = host_value
+            new_node.hash_value = hash_value
+            node.children[child_key] = new_node
+        return matched_length
+
+    def _match_prefix_helper(self, node: TreeNode, key: List):
+        node.last_access_time = time.monotonic()
+        child_key = self.get_child_key_fn(key)
+        value = []
+
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+            child.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                if not new_node.evicted:
+                    value.append(new_node.value)
+                node = new_node
+                break
+            else:
+                if not child.evicted:
+                    value.append(child.value)
+                node = child
+                key = key[prefix_len:]
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+
+        return value, node
+
+    def _split_node(self, key, child: TreeNode, split_len: int):
+        # child node split into new_node -> child
+        new_node = TreeNode()
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.lock_ref = child.lock_ref
+        new_node.key = child.key[:split_len]
+        new_node.hit_count = child.hit_count
+
+        # split value and host value if exists
+        if child.evicted:
+            new_node.value = None
+        else:
+            new_node.value = child.value[:split_len]
+            child.value = child.value[split_len:]
+        if child.backuped:
+            new_node.host_value = child.host_value[:split_len]
+            child.host_value = child.host_value[split_len:]
+
+        if child.hash_value:
+            new_node.hash_value = child.hash_value[: split_len // self.page_size]
+            child.hash_value = child.hash_value[split_len // self.page_size :]
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+        return new_node
+
+    def insert(self, key: List, value, chunked=False):
+        if len(key) == 0:
+            return 0
+
+        node = self.root_node
+        child_key = self.get_child_key_fn(key)
+        total_prefix_length = 0
+
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(node.key, key)
+
+            if prefix_len == len(node.key):
+                if node.evicted:
+                    # change the reference if the node is evicted
+                    # this often happens in the case of KV cache recomputation
+                    node.value = value[:prefix_len]
+                    self.token_to_kv_pool_host.update_synced(node.host_value)
+                    self.evictable_size_ += len(node.value)
+                else:
+                    self._inc_hit_count(node, chunked)
+                    total_prefix_length += prefix_len
+            else:
+                # partial match, split the node
+                new_node = self._split_node(node.key, node, prefix_len)
+                if new_node.evicted:
+                    new_node.value = value[:prefix_len]
+                    self.token_to_kv_pool_host.update_synced(new_node.host_value)
+                    self.evictable_size_ += len(new_node.value)
+                else:
+                    self._inc_hit_count(new_node, chunked)
+                    total_prefix_length += prefix_len
+                node = new_node
+
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value
+            node.children[child_key] = new_node
+            self.evictable_size_ += len(value)
+
+            if self.enable_storage:
+                last_hash = node.get_last_hash_value()
+                assert (node == self.root_node) or (
+                    last_hash is not None
+                ), "Parent node must have a hash value with storage enabled"
+                new_node.hash_value = []
+                for idx in range(0, len(key), self.page_size):
+                    new_node.hash_value.append(
+                        self.cache_controller.get_hash_str(
+                            key[idx : idx + self.page_size],
+                            prior_hash=last_hash,
+                        )
+                    )
+                    last_hash = new_node.hash_value[-1]
+
+            if self.cache_controller.write_policy != "write_back":
+                self._inc_hit_count(new_node, chunked)
+        return total_prefix_length
+
+    def _collect_leaves_device(self):
+        def is_leaf(node):
+            if node.evicted:
+                return False
+            if node == self.root_node:
+                return False
+            if len(node.children) == 0:
+                return True
+            for child in node.children.values():
+                if not child.evicted:
+                    return False
+            return True
+
+        ret_list = []
+        stack = [self.root_node]
+        while stack:
+            cur_node = stack.pop()
+            if is_leaf(cur_node):
+                ret_list.append(cur_node)
+            else:
+                for cur_child in cur_node.children.values():
+                    if not cur_child.evicted:
+                        stack.append(cur_child)
+        return ret_list
+
+    def release_aborted_request(self, rid: str):
+        if rid not in self.ongoing_prefetch:
+            return
+
+        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch[rid]
+        if operation.host_indices is None:
+            return
+
+        completed_tokens, _ = self.cache_controller.terminate_prefetch(operation)
+        if self.tp_world_size > 1:
+            torch.distributed.barrier(group=self.tp_group)
+        last_host_node.release_host()
+        del self.ongoing_prefetch[rid]
+        self.cache_controller.append_host_mem_release(host_indices[:completed_tokens])
+        self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
diff --git a/python/sglang/srt/mem_cache/lora_radix_cache.py b/python/sglang/srt/mem_cache/lora_radix_cache.py
new file mode 100644
index 000000000..32b115cb6
--- /dev/null
+++ b/python/sglang/srt/mem_cache/lora_radix_cache.py
@@ -0,0 +1,421 @@
+"""Radix cache for LoRA. It's modified based on RadixCache with lora_id added to the key of nodes."""
+
+import heapq
+import time
+from collections import defaultdict
+from typing import TYPE_CHECKING, Any, List, Optional
+
+import torch
+
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+else:
+    Req = Any  # Placeholder for Req type when not type checking
+
+
+class LoRAKey:
+
+    def __init__(self, lora_id: str, token_ids: List[int]):
+        self.lora_id = (
+            lora_id  # lora_id of adaptor, should be hash value of adaptor path
+        )
+        self.token_ids = token_ids  # token_ids of the key
+
+    def __len__(self):
+        return len(self.token_ids)
+
+
+def get_child_key(key: LoRAKey):
+    # Here the key of children dict is the hash of lora_id + str(token_ids[0])
+    # So the child key can be matched only when lora_id and token_ids[0] are the same
+    if key.lora_id is None:
+        return hash(str(key.token_ids[0]))
+    else:
+        return hash(key.lora_id + str(key.token_ids[0]))
+
+
+class LoRATreeNode:
+
+    counter = 0
+
+    def __init__(self, id: Optional[int] = None):
+        self.children = defaultdict(LoRATreeNode)
+        self.parent: LoRATreeNode = None
+        self.key: LoRAKey = None
+        self.value: Optional[torch.Tensor] = None
+        self.lock_ref = 0
+        self.last_access_time = time.monotonic()
+
+        self.id = LoRATreeNode.counter if id is None else id
+        LoRATreeNode.counter += 1
+
+    @property
+    def evicted(self):
+        return self.value is None
+
+    def __lt__(self, other: "LoRATreeNode"):
+        return self.last_access_time < other.last_access_time
+
+
+def _key_match(key0: LoRAKey, key1: LoRAKey):
+    if key0.lora_id != key1.lora_id:
+        raise ValueError(
+            f"_key_match should be run on the same lora_id, but got key0.lora_id={key0.lora_id} != key1.lora_id={key1.lora_id}"
+        )
+    i = 0
+    for k0, k1 in zip(key0.token_ids, key1.token_ids):
+        if k0 != k1:
+            break
+        i += 1
+    return i
+
+
+class LoRARadixCache(BasePrefixCache):
+
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        page_size: int,
+        disable: bool = False,
+    ):
+        if page_size > 1:
+            raise ValueError("LoRARadixCache currently only supports page_size = 1")
+
+        if token_to_kv_pool_allocator is None:
+            raise ValueError(
+                "token_to_kv_pool_allocator is required to run LoraRadixCache"
+            )
+
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.page_size = page_size
+        self.disable = disable
+        self.device = self.token_to_kv_pool_allocator.device
+
+        self.key_match_fn = _key_match
+        self.get_child_key_fn = get_child_key
+        self.reset()
+
+    def reset(self):
+        self.root_node = LoRATreeNode()
+        self.root_node.key = LoRAKey(lora_id="", token_ids=[])
+        self.root_node.value = None
+        self.evictable_size_ = 0
+        self.protected_size_ = 0
+
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
+        raise ValueError(
+            "LoRARadixCache needs both token ids and lora id as inputs for matching. Please use match_prefix_with_lora_id instead."
+        )
+
+    def match_prefix_with_lora_id(self, key: LoRAKey, **kwargs) -> MatchResult:
+        """Find the matching prefix from the lora radix tree.
+        Args:
+            key: A LoRAKey to find a matching prefix.
+        Returns:
+            A tuple of a tensor of matching prefix token IDs and
+            the last node that contains the prefix values. Note that
+            this API can modify the internal state of the Radix tree.
+            The last node create a new child if the prefix is shorter
+            than the last node's value.
+        """
+        if self.disable or len(key) == 0:
+            return MatchResult(
+                device_indices=torch.empty(
+                    (0,),
+                    dtype=torch.int64,
+                    device=self.device,
+                ),
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+            )
+
+        value, last_node = self._match_prefix_helper(self.root_node, key)
+        if value:
+            value = torch.cat(value)
+        else:
+            value = torch.empty((0,), dtype=torch.int64, device=self.device)
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_node,
+        )
+
+    def insert(self, key: LoRAKey, value=None):
+        if self.disable:
+            return 0
+
+        if value is None:
+            value = [x for x in key.token_ids]
+        return self._insert_helper(self.root_node, key, value)
+
+    def cache_finished_req(self, req: Req):
+        """Cache request when it finishes."""
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, : len(req.origin_input_ids) + len(req.output_ids) - 1
+            ]
+            self.token_to_kv_pool_allocator.free(kv_indices)
+            self.req_to_token_pool.free(req.req_pool_idx)
+            return
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        page_aligned_len = len(kv_indices)
+        page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
+
+        # Radix Cache takes one ref in memory pool
+        lora_key = LoRAKey(lora_id=req.lora_id, token_ids=token_ids[:page_aligned_len])
+        new_prefix_len = self.insert(lora_key, page_aligned_kv_indices)
+        self.token_to_kv_pool_allocator.free(
+            kv_indices[len(req.prefix_indices) : new_prefix_len]
+        )
+
+        # Remove req slot release the cache lock
+        self.req_to_token_pool.free(req.req_pool_idx)
+        self.dec_lock_ref(req.last_node)
+
+    def cache_unfinished_req(self, req: Req, chunked=False):
+        """Cache request when it is unfinished."""
+        if self.disable:
+            return
+
+        token_ids = req.fill_ids
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        page_aligned_len = len(kv_indices)
+        page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
+        page_aligned_token_ids = token_ids[:page_aligned_len]
+
+        # Radix Cache takes one ref in memory pool
+        inserted_key = LoRAKey(lora_id=req.lora_id, token_ids=page_aligned_token_ids)
+        new_prefix_len = self.insert(inserted_key, page_aligned_kv_indices)
+        self.token_to_kv_pool_allocator.free(
+            kv_indices[len(req.prefix_indices) : new_prefix_len]
+        )
+
+        # The prefix indices could be updated, reuse it
+        new_indices, new_last_node, _, _ = self.match_prefix_with_lora_id(inserted_key)
+        self.req_to_token_pool.write(
+            (req.req_pool_idx, slice(len(req.prefix_indices), len(new_indices))),
+            new_indices[len(req.prefix_indices) :],
+        )
+
+        self.dec_lock_ref(req.last_node)
+        self.inc_lock_ref(new_last_node)
+
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        req.prefix_indices = new_indices
+        req.last_node = new_last_node
+
+    def pretty_print(self):
+        self._print_helper(self.root_node, 0)
+        print(f"#tokens: {self.total_size()}")
+
+    def total_size(self):
+        return self._total_size_helper()
+
+    def evict(self, num_tokens: int):
+        if self.disable:
+            return
+
+        leaves = self._collect_leaves()
+        heapq.heapify(leaves)
+
+        num_evicted = 0
+        while num_evicted < num_tokens and len(leaves):
+            x = heapq.heappop(leaves)
+
+            if x == self.root_node:
+                break
+            if x.lock_ref > 0:
+                continue
+
+            self.token_to_kv_pool_allocator.free(x.value)
+            num_evicted += len(x.value)
+            self._delete_leaf(x)
+
+            if len(x.parent.children) == 0:
+                heapq.heappush(leaves, x.parent)
+
+    def inc_lock_ref(self, node: LoRATreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 0:
+                self.evictable_size_ -= len(node.value)
+                self.protected_size_ += len(node.value)
+                delta -= len(node.value)
+            node.lock_ref += 1
+            node = node.parent
+        return delta
+
+    def dec_lock_ref(self, node: LoRATreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 1:
+                self.evictable_size_ += len(node.value)
+                self.protected_size_ -= len(node.value)
+                delta += len(node.value)
+            node.lock_ref -= 1
+            node = node.parent
+        return delta
+
+    def evictable_size(self):
+        return self.evictable_size_
+
+    def protected_size(self):
+        # protected size refers to the size of the cache that is locked
+        return self.protected_size_
+
+    def all_values_flatten(self):
+        values = []
+
+        def _dfs_helper(node: LoRATreeNode):
+            for _, child in node.children.items():
+                values.append(child.value)
+                _dfs_helper(child)
+
+        _dfs_helper(self.root_node)
+        return torch.cat(values)
+
+    ##### Internal Helper Functions #####
+
+    def _match_prefix_helper(self, node: LoRATreeNode, key: LoRAKey):
+        node.last_access_time = time.monotonic()
+
+        child_key = self.get_child_key_fn(key)
+
+        value = []
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+            child.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                value.append(new_node.value)
+                node = new_node
+                break
+            else:
+                value.append(child.value)
+                node = child
+                key = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[prefix_len:])
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+
+        return value, node
+
+    def _split_node(self, key: LoRAKey, child: LoRATreeNode, split_len: int):
+        # new_node -> child
+        new_node = LoRATreeNode()
+        key_split_1 = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[:split_len])
+        key_split_2 = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[split_len:])
+        new_node.children = {self.get_child_key_fn(key_split_2): child}
+        new_node.parent = child.parent
+        new_node.lock_ref = child.lock_ref
+        new_node.key = key_split_1
+        new_node.value = child.value[:split_len]
+        child.parent = new_node
+        child.key = key_split_2
+        child.value = child.value[split_len:]
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        return new_node
+
+    def _insert_helper(self, node: LoRATreeNode, key: LoRAKey, value):
+        node.last_access_time = time.monotonic()
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        total_prefix_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(node.key, key)
+            total_prefix_length += prefix_len
+            key = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[prefix_len:])
+            value = value[prefix_len:]
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = LoRATreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value
+            node.children[child_key] = new_node
+            self.evictable_size_ += len(value)
+        return total_prefix_length
+
+    def _print_helper(self, node: LoRATreeNode, indent: int):
+        """Prints the radix tree in a human-readable format."""
+        stack = [(node, indent)]
+        while stack:
+            current_node, current_indent = stack.pop()
+            print(
+                " " * current_indent,
+                len(current_node.key),
+                current_node.key.token_ids[:10],
+                f"r={current_node.lock_ref}",
+            )
+            for key, child in current_node.children.items():
+                stack.append((child, current_indent + 2))
+
+                assert key == self.get_child_key_fn(
+                    child.key
+                ), f"{key=}, {self.get_child_key_fn(child.key)=}"
+
+    def _delete_leaf(self, node):
+        for k, v in node.parent.children.items():
+            if v == node:
+                break
+        del node.parent.children[k]
+        self.evictable_size_ -= len(node.key)
+
+    def _total_size_helper(self):
+        total_size = 0
+        stack = [self.root_node]
+        while stack:
+            current_node = stack.pop()
+            total_size += len(current_node.value)
+            for child in current_node.children.values():
+                if child.evicted:
+                    continue
+                stack.append(child)
+        return total_size
+
+    def _collect_leaves(self):
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if len(cur_node.children) == 0:
+                ret_list.append(cur_node)
+            else:
+                stack.extend(cur_node.children.values())
+
+        return ret_list
diff --git a/python/sglang/srt/mem_cache/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py
new file mode 100644
index 000000000..80c549033
--- /dev/null
+++ b/python/sglang/srt/mem_cache/memory_pool.py
@@ -0,0 +1,1423 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+
+"""
+Memory pool.
+
+SGLang has two levels of memory pool.
+ReqToTokenPool maps a request to its token locations.
+TokenToKVPoolAllocator manages the indices to kv cache data.
+KVCache actually holds the physical kv cache.
+"""
+
+import abc
+import logging
+from contextlib import nullcontext
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, next_power_of_2
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.cache_controller import LayerDoneCounter
+
+logger = logging.getLogger(__name__)
+
+GB = 1024 * 1024 * 1024
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+if _is_npu:
+    import torch_npu
+
+
+class ReqToTokenPool:
+    """A memory pool that maps a request to its token locations."""
+
+    def __init__(
+        self,
+        size: int,
+        max_context_len: int,
+        device: str,
+        enable_memory_saver: bool,
+    ):
+
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=enable_memory_saver
+        )
+
+        self.size = size
+        self.max_context_len = max_context_len
+        self.device = device
+        with memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            self.req_to_token = torch.zeros(
+                (size, max_context_len), dtype=torch.int32, device=device
+            )
+
+        self.free_slots = list(range(size))
+
+    def write(self, indices, values):
+        self.req_to_token[indices] = values
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def alloc(self, need_size: int) -> List[int]:
+        if need_size > len(self.free_slots):
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+
+        return select_index
+
+    def free(self, free_index: Union[int, List[int]]):
+        if isinstance(free_index, (int,)):
+            self.free_slots.append(free_index)
+        else:
+            self.free_slots.extend(free_index)
+
+    def clear(self):
+        self.free_slots = list(range(self.size))
+
+
+class MambaPool:
+    def __init__(
+        self,
+        size: int,
+        conv_dtype: torch.dtype,
+        ssm_dtype: torch.dtype,
+        num_mamba_layers: int,
+        conv_state_shape: Tuple[int, int],
+        temporal_state_shape: Tuple[int, int],
+        device: str,
+        speculative_num_draft_tokens: Optional[int] = None,
+    ):
+        conv_state = torch.zeros(
+            size=(num_mamba_layers, size + 1) + conv_state_shape,
+            dtype=conv_dtype,
+            device=device,
+        )
+        temporal_state = torch.zeros(
+            size=(num_mamba_layers, size + 1) + temporal_state_shape,
+            dtype=ssm_dtype,
+            device=device,
+        )
+        if speculative_num_draft_tokens is not None:
+            # Cache intermediate SSM states per draft token during target verify
+            # Shape: [num_layers, size + 1, speculative_num_draft_tokens, HV, K, V]
+            intermediate_ssm_state_cache = torch.empty(
+                size=(
+                    num_mamba_layers,
+                    size + 1,
+                    speculative_num_draft_tokens,
+                    temporal_state_shape[0],
+                    temporal_state_shape[1],
+                    temporal_state_shape[2],
+                ),
+                dtype=ssm_dtype,
+                device="cuda",
+            )
+            # Cache intermediate conv windows (last K-1 inputs) per draft token during target verify
+            # Shape: [num_layers, size + 1, speculative_num_draft_tokens, dim, K-1]
+            intermediate_conv_window_cache = torch.empty(
+                size=(
+                    num_mamba_layers,
+                    size + 1,
+                    speculative_num_draft_tokens,
+                    conv_state_shape[0],
+                    conv_state_shape[1],
+                ),
+                dtype=conv_dtype,
+                device="cuda",
+            )
+            self.mamba_cache = (
+                conv_state,
+                temporal_state,
+                intermediate_ssm_state_cache,
+                intermediate_conv_window_cache,
+            )
+        else:
+            self.mamba_cache = (conv_state, temporal_state)
+        self.size = size
+        self.free_slots = list(range(size))
+        self.mem_usage = self.get_mamba_size() / GB
+        logger.info(
+            f"Mamba Cache is allocated. "
+            f"conv_state size: {conv_state.numel() * conv_state.itemsize / GB:.2f}GB, "
+            f"ssm_state size: {temporal_state.numel() * temporal_state.itemsize / GB:.2f}GB "
+        )
+
+    def get_mamba_params_all_layers(self):
+        return [self.mamba_cache[i] for i in range(len(self.mamba_cache))]
+
+    def get_mamba_params(self, layer_id: int):
+        return [self.mamba_cache[i][layer_id] for i in range(len(self.mamba_cache))]
+
+    def get_mamba_size(self):
+        return (
+            np.prod(self.mamba_cache[0].shape) * self.mamba_cache[0].dtype.itemsize
+            + np.prod(self.mamba_cache[1].shape) * self.mamba_cache[1].dtype.itemsize
+        )
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def alloc(self, need_size: int) -> Optional[List[int]]:
+        if need_size > len(self.free_slots):
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+
+        return select_index
+
+    def free(self, free_index: Union[int, List[int]]):
+        if isinstance(free_index, (int,)):
+            self.free_slots.append(free_index)
+        else:
+            self.free_slots.extend(free_index)
+        self.mamba_cache[0][:, free_index] = self.mamba_cache[1][:, free_index] = 0
+
+    def clear(self):
+        self.free_slots = list(range(self.size))
+
+
+class HybridReqToTokenPool(ReqToTokenPool):
+    """A memory pool that maps a request to its token locations."""
+
+    def __init__(
+        self,
+        size: int,
+        max_context_len: int,
+        device: str,
+        enable_memory_saver: bool,
+        conv_dtype: torch.dtype,
+        ssm_dtype: torch.dtype,
+        mamba_layers: List[int],
+        conv_state_shape: Tuple[int, int],
+        temporal_state_shape: Tuple[int, int],
+        speculative_num_draft_tokens: int,
+    ):
+        super().__init__(
+            size=size,
+            max_context_len=max_context_len,
+            device=device,
+            enable_memory_saver=enable_memory_saver,
+        )
+
+        self.mamba_pool = MambaPool(
+            size,
+            conv_dtype,
+            ssm_dtype,
+            len(mamba_layers),
+            conv_state_shape,
+            temporal_state_shape,
+            device,
+            speculative_num_draft_tokens,
+        )
+        self.mamba_map = {layer_id: i for i, layer_id in enumerate(mamba_layers)}
+
+        self.device = device
+        self.req_index_to_mamba_index_mapping: torch.Tensor = torch.empty(
+            size, dtype=torch.int32, device=self.device
+        )
+
+        self.rid_to_mamba_index_mapping: Dict[str, int] = {}
+        self.mamba_index_to_rid_mapping: Dict[int, str] = {}
+
+    # For chunk prefill req, we do not need to allocate mamba cache,
+    # We could use allocated mamba cache instead.
+    def alloc(
+        self, need_size: int, reqs: Optional[List["Req"]] = None
+    ) -> Optional[List[int]]:
+        select_index = super().alloc(need_size)
+        if select_index == None:
+            return None
+
+        mamba_index = []
+        for req in reqs:
+            rid = req.rid
+            if rid in self.rid_to_mamba_index_mapping:
+                mid = self.rid_to_mamba_index_mapping[rid]
+            elif (mid := self.mamba_pool.alloc(1)) is not None:
+                mid = mid[0]
+                self.rid_to_mamba_index_mapping[rid] = mid
+                self.mamba_index_to_rid_mapping[mid] = rid
+            mamba_index.append(mid)
+        assert len(select_index) == len(
+            mamba_index
+        ), f"Not enough space for mamba cache, try to increase --max-mamba-cache-size."
+        self.req_index_to_mamba_index_mapping[select_index] = torch.tensor(
+            mamba_index, dtype=torch.int32, device=self.device
+        )
+        return select_index
+
+    def get_mamba_indices(self, req_indices: torch.Tensor) -> torch.Tensor:
+        return self.req_index_to_mamba_index_mapping[req_indices]
+
+    def get_mamba_params(self, layer_id: int):
+        assert layer_id in self.mamba_map
+        return self.mamba_pool.get_mamba_params(self.mamba_map[layer_id])
+
+    def get_mamba_params_all_layers(self):
+        return self.mamba_pool.get_mamba_params_all_layers()
+
+    # For chunk prefill, we can not free mamba cache, we need use it in the future
+    def free(self, free_index: Union[int, List[int]], free_mamba_cache: bool = True):
+        super().free(free_index)
+        if free_mamba_cache:
+            mamba_index = self.req_index_to_mamba_index_mapping[free_index]
+            mamba_index_list = mamba_index.tolist()
+            if isinstance(mamba_index_list, int):
+                mamba_index_list = [mamba_index_list]
+            self.mamba_pool.free(mamba_index_list)
+            for mid in mamba_index_list:
+                rid = self.mamba_index_to_rid_mapping[mid]
+                self.mamba_index_to_rid_mapping.pop(mid)
+                self.rid_to_mamba_index_mapping.pop(rid)
+
+    def clear(self):
+        super().clear()
+        self.mamba_pool.clear()
+
+
+class KVCache(abc.ABC):
+    @abc.abstractmethod
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        layer_num: int,
+        device: str,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+        self.size = size
+        self.page_size = page_size
+        self.dtype = dtype
+        self.device = device
+        if dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
+            # NOTE: Store as torch.uint8 because Tensor.index_put is not implemented for torch.float8_e5m2
+            self.store_dtype = torch.uint8
+        else:
+            self.store_dtype = dtype
+        self.layer_num = layer_num
+        self.start_layer = start_layer or 0
+        self.end_layer = end_layer or layer_num - 1
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=enable_memory_saver
+        )
+        self.mem_usage = 0
+
+        # used for chunked cpu-offloading
+        self.cpu_offloading_chunk_size = 8192
+
+        # default state for optional layer-wise transfer control
+        self.layer_transfer_counter = None
+
+    def _finalize_allocation_log(self, num_tokens: int):
+        """Common logging and mem_usage computation for KV cache allocation.
+        Supports both tuple (K, V) size returns and single KV size returns.
+        """
+        kv_size_bytes = self.get_kv_size_bytes()
+        if isinstance(kv_size_bytes, tuple):
+            k_size, v_size = kv_size_bytes
+            k_size_GB = k_size / GB
+            v_size_GB = v_size / GB
+            logger.info(
+                f"KV Cache is allocated. #tokens: {num_tokens}, K size: {k_size_GB:.2f} GB, V size: {v_size_GB:.2f} GB"
+            )
+            self.mem_usage = k_size_GB + v_size_GB
+        else:
+            kv_size_GB = kv_size_bytes / GB
+            logger.info(
+                f"KV Cache is allocated. #tokens: {num_tokens}, KV size: {kv_size_GB:.2f} GB"
+            )
+            self.mem_usage = kv_size_GB
+
+    @abc.abstractmethod
+    def get_key_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_value_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError()
+
+    def register_layer_transfer_counter(self, layer_transfer_counter: LayerDoneCounter):
+        self.layer_transfer_counter = layer_transfer_counter
+
+    def get_cpu_copy(self, indices):
+        raise NotImplementedError()
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        raise NotImplementedError()
+
+
+class MHATokenToKVPool(KVCache):
+
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        layer_num: int,
+        device: str,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+        super().__init__(
+            size,
+            page_size,
+            dtype,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+        )
+        self.head_num = head_num
+        self.head_dim = head_dim
+
+        # for disagg with nvlink
+        self.enable_custom_mem_pool = get_bool_env_var(
+            "SGLANG_MOONCAKE_CUSTOM_MEM_POOL", "false"
+        )
+        if self.enable_custom_mem_pool:
+            # TODO(shangming): abstract custom allocator class for more backends
+            from mooncake.allocator import NVLinkAllocator
+
+            allocator = NVLinkAllocator.get_allocator(self.device)
+            self.custom_mem_pool = torch.cuda.MemPool(allocator.allocator())
+        else:
+            self.custom_mem_pool = None
+
+        self._create_buffers()
+
+        self.device_module = torch.get_device_module(self.device)
+        self.alt_stream = self.device_module.Stream() if _is_cuda else None
+        self._finalize_allocation_log(size)
+
+    def _create_buffers(self):
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            with (
+                torch.cuda.use_mem_pool(self.custom_mem_pool)
+                if self.enable_custom_mem_pool
+                else nullcontext()
+            ):
+                # [size, head_num, head_dim] for each layer
+                # The padded slot 0 is used for writing dummy outputs from padded tokens.
+                self.k_buffer = [
+                    torch.zeros(
+                        (self.size + self.page_size, self.head_num, self.head_dim),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+                self.v_buffer = [
+                    torch.zeros(
+                        (self.size + self.page_size, self.head_num, self.head_dim),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+
+        self.k_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.k_buffer],
+            dtype=torch.uint64,
+            device=self.device,
+        )
+        self.v_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.v_buffer],
+            dtype=torch.uint64,
+            device=self.device,
+        )
+        self.data_ptrs = torch.cat([self.k_data_ptrs, self.v_data_ptrs], dim=0)
+        self.data_strides = torch.tensor(
+            [
+                np.prod(x.shape[1:]) * x.dtype.itemsize
+                for x in self.k_buffer + self.v_buffer
+            ],
+            device=self.device,
+        )
+
+    def _clear_buffers(self):
+        del self.k_buffer
+        del self.v_buffer
+
+    def get_kv_size_bytes(self):
+        assert hasattr(self, "k_buffer")
+        assert hasattr(self, "v_buffer")
+        k_size_bytes = 0
+        for k_cache in self.k_buffer:
+            k_size_bytes += np.prod(k_cache.shape) * k_cache.dtype.itemsize
+        v_size_bytes = 0
+        for v_cache in self.v_buffer:
+            v_size_bytes += np.prod(v_cache.shape) * v_cache.dtype.itemsize
+        return k_size_bytes, v_size_bytes
+
+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # layer_num x [seq_len, head_num, head_dim]
+        # layer_num x [page_num, page_size, head_num, head_dim]
+        kv_data_ptrs = [
+            self._get_key_buffer(i).data_ptr()
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self._get_value_buffer(i).data_ptr()
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        kv_data_lens = [
+            self._get_key_buffer(i).nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self._get_value_buffer(i).nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        kv_item_lens = [
+            self._get_key_buffer(i)[0].nbytes * self.page_size
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self._get_value_buffer(i)[0].nbytes * self.page_size
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
+    def maybe_get_custom_mem_pool(self):
+        return self.custom_mem_pool
+
+    def get_cpu_copy(self, indices):
+        torch.cuda.synchronize()
+        kv_cache_cpu = []
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            kv_cache_cpu.append([])
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                k_cpu = self.k_buffer[layer_id][chunk_indices].to(
+                    "cpu", non_blocking=True
+                )
+                v_cpu = self.v_buffer[layer_id][chunk_indices].to(
+                    "cpu", non_blocking=True
+                )
+                kv_cache_cpu[-1].append([k_cpu, v_cpu])
+        torch.cuda.synchronize()
+        return kv_cache_cpu
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        torch.cuda.synchronize()
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                k_cpu, v_cpu = (
+                    kv_cache_cpu[layer_id][i // chunk_size][0],
+                    kv_cache_cpu[layer_id][i // chunk_size][1],
+                )
+                assert k_cpu.shape[0] == v_cpu.shape[0] == len(chunk_indices)
+                k_chunk = k_cpu.to(self.k_buffer[0].device, non_blocking=True)
+                v_chunk = v_cpu.to(self.v_buffer[0].device, non_blocking=True)
+                self.k_buffer[layer_id][chunk_indices] = k_chunk
+                self.v_buffer[layer_id][chunk_indices] = v_chunk
+        torch.cuda.synchronize()
+
+    def _get_key_buffer(self, layer_id: int):
+        # for internal use of referencing
+        if self.store_dtype != self.dtype:
+            return self.k_buffer[layer_id - self.start_layer].view(self.dtype)
+        return self.k_buffer[layer_id - self.start_layer]
+
+    def get_key_buffer(self, layer_id: int):
+        # note: get_key_buffer is hooked with synchronization for layer-wise KV cache loading
+        # it is supposed to be used only by attention backend not for information purpose
+        # same applies to get_value_buffer and get_kv_buffer
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+        return self._get_key_buffer(layer_id)
+
+    def _get_value_buffer(self, layer_id: int):
+        # for internal use of referencing
+        if self.store_dtype != self.dtype:
+            return self.v_buffer[layer_id - self.start_layer].view(self.dtype)
+        return self.v_buffer[layer_id - self.start_layer]
+
+    def get_value_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+        return self._get_value_buffer(layer_id)
+
+    def get_kv_buffer(self, layer_id: int):
+        return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id)
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: Optional[float] = None,
+        v_scale: Optional[float] = None,
+        layer_id_override: Optional[int] = None,
+    ):
+        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+
+        if layer_id_override is not None:
+            layer_id = layer_id_override
+        else:
+            layer_id = layer.layer_id
+        if cache_k.dtype != self.dtype:
+            if k_scale is not None:
+                cache_k.div_(k_scale)
+            if v_scale is not None:
+                cache_v.div_(v_scale)
+            cache_k = cache_k.to(self.dtype)
+            cache_v = cache_v.to(self.dtype)
+
+        if self.store_dtype != self.dtype:
+            cache_k = cache_k.view(self.store_dtype)
+            cache_v = cache_v.view(self.store_dtype)
+
+        if get_is_capture_mode() and self.alt_stream is not None:
+            # Overlap the copy of K and V cache for small batch size
+            current_stream = self.device_module.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            self.k_buffer[layer_id - self.start_layer][loc] = cache_k
+            with self.device_module.stream(self.alt_stream):
+                self.v_buffer[layer_id - self.start_layer][loc] = cache_v
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            self.k_buffer[layer_id - self.start_layer][loc] = cache_k
+            self.v_buffer[layer_id - self.start_layer][loc] = cache_v
+
+    def move_kv_cache(self, tgt_loc: torch.Tensor, src_loc: torch.Tensor):
+        copy_all_layer_kv_cache[(len(self.data_ptrs),)](
+            self.data_ptrs,
+            self.data_strides,
+            tgt_loc,
+            src_loc,
+            len(tgt_loc),
+            next_power_of_2(len(tgt_loc)),
+        )
+
+
+class HybridLinearKVPool(KVCache):
+    """KV cache with separate pools for full and linear attention layers."""
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        full_attention_layer_ids: List[int],
+        enable_kvcache_transpose: bool,
+        device: str,
+    ):
+        self.size = size
+        self.dtype = dtype
+        self.device = device
+        self.full_layer_nums = len(full_attention_layer_ids)
+        self.page_size = 1
+        # TODO MHATransposedTokenToKVPool if enable_kvcache_transpose is True
+        assert not enable_kvcache_transpose
+        self.full_kv_pool = MHATokenToKVPool(
+            size=size,
+            page_size=self.page_size,
+            dtype=dtype,
+            head_num=head_num,
+            head_dim=head_dim,
+            layer_num=self.full_layer_nums,
+            device=device,
+            enable_memory_saver=False,
+        )
+        self.full_attention_layer_id_mapping = {
+            id: i for i, id in enumerate(full_attention_layer_ids)
+        }
+        k_size, v_size = self.get_kv_size_bytes()
+        self.mem_usage = (k_size + v_size) / GB
+
+    def get_kv_size_bytes(self):
+        return self.full_kv_pool.get_kv_size_bytes()
+
+    def get_contiguous_buf_infos(self):
+        return self.full_kv_pool.get_contiguous_buf_infos()
+
+    def _transfer_full_attention_id(self, layer_id: int):
+        if layer_id not in self.full_attention_layer_id_mapping:
+            raise ValueError(
+                f"{layer_id=} not in full attention layers: {self.full_attention_layer_id_mapping.keys()}"
+            )
+        return self.full_attention_layer_id_mapping[layer_id]
+
+    def get_key_buffer(self, layer_id: int):
+        layer_id = self._transfer_full_attention_id(layer_id)
+        return self.full_kv_pool.get_key_buffer(layer_id)
+
+    def get_value_buffer(self, layer_id: int):
+        layer_id = self._transfer_full_attention_id(layer_id)
+        return self.full_kv_pool.get_value_buffer(layer_id)
+
+    def get_kv_buffer(self, layer_id: int):
+        layer_id = self._transfer_full_attention_id(layer_id)
+        return self.full_kv_pool.get_kv_buffer(layer_id)
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
+    ):
+        layer_id = self._transfer_full_attention_id(layer.layer_id)
+        self.full_kv_pool.set_kv_buffer(
+            None,
+            loc,
+            cache_k,
+            cache_v,
+            k_scale,
+            v_scale,
+            layer_id_override=layer_id,
+        )
+
+
+class SWAKVPool(KVCache):
+    """KV cache with separate pools for full and SWA attention layers."""
+
+    def __init__(
+        self,
+        size: int,
+        size_swa: int,
+        swa_attention_layer_ids: List[int],
+        full_attention_layer_ids: List[int],
+        enable_kvcache_transpose: bool,
+        token_to_kv_pool_class: KVCache = MHATokenToKVPool,
+        **kwargs,
+    ):
+        self.size = size
+        self.size_swa = size_swa
+        self.swa_layer_nums = len(swa_attention_layer_ids)
+        self.full_layer_nums = len(full_attention_layer_ids)
+        kwargs["page_size"] = 1
+        kwargs["enable_memory_saver"] = False
+        # TODO MHATransposedTokenToKVPool if enable_kvcache_transpose is True
+        assert not enable_kvcache_transpose
+
+        self.swa_kv_pool = token_to_kv_pool_class(
+            size=size_swa,
+            layer_num=self.swa_layer_nums,
+            **kwargs,
+        )
+        self.full_kv_pool = token_to_kv_pool_class(
+            size=size,
+            layer_num=self.full_layer_nums,
+            **kwargs,
+        )
+        self.layers_mapping: Dict[int, Tuple[int, bool]] = {}
+        for full_attn_layer_id, global_layer_id in enumerate(full_attention_layer_ids):
+            self.layers_mapping[global_layer_id] = (full_attn_layer_id, False)
+        for swa_layer_id, global_layer_id in enumerate(swa_attention_layer_ids):
+            self.layers_mapping[global_layer_id] = (swa_layer_id, True)
+        self.full_to_swa_index_mapping: Optional[torch.Tensor] = None
+
+        k_size, v_size = self.get_kv_size_bytes()
+        self.mem_usage = (k_size + v_size) / GB
+
+    def get_kv_size_bytes(self):
+        k_size, v_size = self.full_kv_pool.get_kv_size_bytes()
+        k_size_swa, v_size_swa = self.swa_kv_pool.get_kv_size_bytes()
+        return k_size + k_size_swa, v_size + v_size_swa
+
+    def get_contiguous_buf_infos(self):
+        full_kv_data_ptrs, full_kv_data_lens, full_kv_item_lens = (
+            self.full_kv_pool.get_contiguous_buf_infos()
+        )
+        swa_kv_data_ptrs, swa_kv_data_lens, swa_kv_item_lens = (
+            self.swa_kv_pool.get_contiguous_buf_infos()
+        )
+
+        kv_data_ptrs = full_kv_data_ptrs + swa_kv_data_ptrs
+        kv_data_lens = full_kv_data_lens + swa_kv_data_lens
+        kv_item_lens = full_kv_item_lens + swa_kv_item_lens
+
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
+    def get_key_buffer(self, layer_id: int):
+        layer_id_pool, is_swa = self.layers_mapping[layer_id]
+        if is_swa:
+            return self.swa_kv_pool.get_key_buffer(layer_id_pool)
+        else:
+            return self.full_kv_pool.get_key_buffer(layer_id_pool)
+
+    def get_value_buffer(self, layer_id: int):
+        layer_id_pool, is_swa = self.layers_mapping[layer_id]
+        if is_swa:
+            return self.swa_kv_pool.get_value_buffer(layer_id_pool)
+        else:
+            return self.full_kv_pool.get_value_buffer(layer_id_pool)
+
+    def get_kv_buffer(self, layer_id: int):
+        layer_id_pool, is_swa = self.layers_mapping[layer_id]
+        if is_swa:
+            return self.swa_kv_pool.get_kv_buffer(layer_id_pool)
+        else:
+            return self.full_kv_pool.get_kv_buffer(layer_id_pool)
+
+    def translate_loc_from_full_to_swa(self, kv_indices: torch.Tensor):
+        assert self.full_to_swa_index_mapping is not None
+        return self.full_to_swa_index_mapping[kv_indices].to(torch.int32)
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
+    ):
+
+        layer_id = layer.layer_id
+        layer_id_pool, is_swa = self.layers_mapping[layer_id]
+        if is_swa:
+            if self.full_to_swa_index_mapping is not None:
+                loc = self.translate_loc_from_full_to_swa(loc)
+            self.swa_kv_pool.set_kv_buffer(
+                None,
+                loc,
+                cache_k,
+                cache_v,
+                k_scale,
+                v_scale,
+                layer_id_override=layer_id_pool,
+            )
+        else:
+            self.full_kv_pool.set_kv_buffer(
+                None,
+                loc,
+                cache_k,
+                cache_v,
+                k_scale,
+                v_scale,
+                layer_id_override=layer_id_pool,
+            )
+
+
+class AscendTokenToKVPool(MHATokenToKVPool):
+
+    def _create_buffers(self):
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            # [size, head_num, head_dim] for each layer
+            # The padded slot 0 is used for writing dummy outputs from padded tokens.
+            # Continuous memory improves the efficiency of Ascend`s transmission backend,
+            # while other backends remain unchanged.
+            self.kv_buffer = torch.zeros(
+                (
+                    2,
+                    self.layer_num,
+                    self.size // self.page_size + 1,
+                    self.page_size,
+                    self.head_num,
+                    self.head_dim,
+                ),
+                dtype=self.store_dtype,
+                device=self.device,
+            )
+            self.k_buffer = self.kv_buffer[0]
+            self.v_buffer = self.kv_buffer[1]
+
+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # layer_num x [seq_len, head_num, head_dim]
+        # layer_num x [page_num, page_size, head_num, head_dim]
+        kv_data_ptrs = [
+            self.get_key_buffer(i).data_ptr()
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self.get_value_buffer(i).data_ptr()
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        kv_data_lens = [
+            self.get_key_buffer(i).nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self.get_value_buffer(i).nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        kv_item_lens = [
+            self.get_key_buffer(i)[0].nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self.get_value_buffer(i)[0].nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: Optional[float] = None,
+        v_scale: Optional[float] = None,
+    ):
+        layer_id = layer.layer_id
+        if cache_k.dtype != self.dtype:
+            if k_scale is not None:
+                cache_k.div_(k_scale)
+            if v_scale is not None:
+                cache_v.div_(v_scale)
+            cache_k = cache_k.to(self.dtype)
+            cache_v = cache_v.to(self.dtype)
+
+        if self.store_dtype != self.dtype:
+            cache_k = cache_k.view(self.store_dtype)
+            cache_v = cache_v.view(self.store_dtype)
+
+        torch_npu._npu_reshape_and_cache(
+            key=cache_k,
+            value=cache_v,
+            key_cache=self.k_buffer[layer_id].view(
+                -1, self.page_size, self.head_num, self.head_dim
+            ),
+            value_cache=self.v_buffer[layer_id].view(
+                -1, self.page_size, self.head_num, self.head_dim
+            ),
+            slot_indices=loc,
+        )
+
+
+@triton.jit
+def set_mla_kv_buffer_kernel(
+    kv_buffer_ptr,
+    cache_k_nope_ptr,
+    cache_k_rope_ptr,
+    loc_ptr,
+    buffer_stride: tl.constexpr,
+    nope_stride: tl.constexpr,
+    rope_stride: tl.constexpr,
+    nope_dim: tl.constexpr,
+    rope_dim: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    pid_loc = tl.program_id(0)
+    pid_blk = tl.program_id(1)
+
+    base = pid_blk * BLOCK
+    offs = base + tl.arange(0, BLOCK)
+    total_dim = nope_dim + rope_dim
+    mask = offs < total_dim
+
+    loc = tl.load(loc_ptr + pid_loc)
+    dst_ptr = kv_buffer_ptr + loc * buffer_stride + offs
+
+    if base + BLOCK <= nope_dim:
+        src = tl.load(
+            cache_k_nope_ptr + pid_loc * nope_stride + offs,
+            mask=mask,
+        )
+    else:
+        offs_rope = offs - nope_dim
+        src = tl.load(
+            cache_k_rope_ptr + pid_loc * rope_stride + offs_rope,
+            mask=mask,
+        )
+
+    tl.store(dst_ptr, src, mask=mask)
+
+
+def set_mla_kv_buffer_triton(
+    kv_buffer: torch.Tensor,
+    loc: torch.Tensor,
+    cache_k_nope: torch.Tensor,
+    cache_k_rope: torch.Tensor,
+):
+    nope_dim = cache_k_nope.shape[-1]
+    rope_dim = cache_k_rope.shape[-1]
+    total_dim = nope_dim + rope_dim
+    BLOCK = 128
+    n_loc = loc.numel()
+    grid = (n_loc, triton.cdiv(total_dim, BLOCK))
+
+    set_mla_kv_buffer_kernel[grid](
+        kv_buffer,
+        cache_k_nope,
+        cache_k_rope,
+        loc,
+        kv_buffer.stride(0),
+        cache_k_nope.stride(0),
+        cache_k_rope.stride(0),
+        nope_dim,
+        rope_dim,
+        BLOCK=BLOCK,
+    )
+
+
+class MLATokenToKVPool(KVCache):
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        kv_lora_rank: int,
+        qk_rope_head_dim: int,
+        layer_num: int,
+        device: str,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+        super().__init__(
+            size,
+            page_size,
+            dtype,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+        )
+
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+
+        # for disagg with nvlink
+        self.enable_custom_mem_pool = get_bool_env_var(
+            "SGLANG_MOONCAKE_CUSTOM_MEM_POOL", "false"
+        )
+        if self.enable_custom_mem_pool:
+            # TODO(shangming): abstract custom allocator class for more backends
+            from mooncake.allocator import NVLinkAllocator
+
+            allocator = NVLinkAllocator.get_allocator(self.device)
+            self.custom_mem_pool = torch.cuda.MemPool(allocator.allocator())
+        else:
+            self.custom_mem_pool = None
+
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            with (
+                torch.cuda.use_mem_pool(self.custom_mem_pool)
+                if self.custom_mem_pool
+                else nullcontext()
+            ):
+                # The padded slot 0 is used for writing dummy outputs from padded tokens.
+                self.kv_buffer = [
+                    torch.zeros(
+                        (size + page_size, 1, kv_lora_rank + qk_rope_head_dim),
+                        dtype=self.store_dtype,
+                        device=device,
+                    )
+                    for _ in range(layer_num)
+                ]
+
+        self.data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.kv_buffer],
+            dtype=torch.uint64,
+            device=self.device,
+        )
+        self._finalize_allocation_log(size)
+
+    def get_kv_size_bytes(self):
+        assert hasattr(self, "kv_buffer")
+        kv_size_bytes = 0
+        for kv_cache in self.kv_buffer:
+            kv_size_bytes += np.prod(kv_cache.shape) * kv_cache.dtype.itemsize
+        return kv_size_bytes
+
+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # MLA has only one kv_buffer, so only the information of this buffer needs to be returned.
+        kv_data_ptrs = [self.kv_buffer[i].data_ptr() for i in range(self.layer_num)]
+        kv_data_lens = [self.kv_buffer[i].nbytes for i in range(self.layer_num)]
+        kv_item_lens = [
+            self.kv_buffer[i][0].nbytes * self.page_size for i in range(self.layer_num)
+        ]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
+    def maybe_get_custom_mem_pool(self):
+        return self.custom_mem_pool
+
+    def get_key_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+
+        if self.store_dtype != self.dtype:
+            return self.kv_buffer[layer_id - self.start_layer].view(self.dtype)
+        return self.kv_buffer[layer_id - self.start_layer]
+
+    def get_value_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+
+        if self.store_dtype != self.dtype:
+            return self.kv_buffer[layer_id - self.start_layer][
+                ..., : self.kv_lora_rank
+            ].view(self.dtype)
+        return self.kv_buffer[layer_id - self.start_layer][..., : self.kv_lora_rank]
+
+    def get_kv_buffer(self, layer_id: int):
+        return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id)
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ):
+        layer_id = layer.layer_id
+        if cache_k.dtype != self.dtype:
+            cache_k = cache_k.to(self.dtype)
+        if self.store_dtype != self.dtype:
+            self.kv_buffer[layer_id - self.start_layer][loc] = cache_k.view(
+                self.store_dtype
+            )
+        else:
+            self.kv_buffer[layer_id - self.start_layer][loc] = cache_k
+
+    def set_mla_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k_nope: torch.Tensor,
+        cache_k_rope: torch.Tensor,
+    ):
+        layer_id = layer.layer_id
+        if cache_k_nope.dtype != self.dtype:
+            cache_k_nope = cache_k_nope.to(self.dtype)
+            cache_k_rope = cache_k_rope.to(self.dtype)
+        if self.store_dtype != self.dtype:
+            cache_k_nope = cache_k_nope.view(self.store_dtype)
+            cache_k_rope = cache_k_rope.view(self.store_dtype)
+
+        set_mla_kv_buffer_triton(
+            self.kv_buffer[layer_id - self.start_layer], loc, cache_k_nope, cache_k_rope
+        )
+
+    def get_cpu_copy(self, indices):
+        torch.cuda.synchronize()
+        kv_cache_cpu = []
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            kv_cache_cpu.append([])
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                kv_cpu = self.kv_buffer[layer_id][chunk_indices].to(
+                    "cpu", non_blocking=True
+                )
+                kv_cache_cpu[-1].append(kv_cpu)
+        torch.cuda.synchronize()
+        return kv_cache_cpu
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        torch.cuda.synchronize()
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                kv_cpu = kv_cache_cpu[layer_id][i // chunk_size]
+                assert kv_cpu.shape[0] == len(chunk_indices)
+                kv_chunk = kv_cpu.to(self.kv_buffer[0].device, non_blocking=True)
+                self.kv_buffer[layer_id][chunk_indices] = kv_chunk
+        torch.cuda.synchronize()
+
+
+class AscendMLAPagedTokenToKVPool(MLATokenToKVPool):
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        kv_lora_rank: int,
+        qk_rope_head_dim: int,
+        layer_num: int,
+        device: str,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+        super(MLATokenToKVPool, self).__init__(
+            size,
+            page_size,
+            dtype,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+        )
+
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+
+        self.custom_mem_pool = None
+
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            # The padded slot 0 is used for writing dummy outputs from padded tokens.
+            self.k_buffer = torch.zeros(
+                (
+                    layer_num,
+                    self.size // self.page_size + 1,
+                    self.page_size,
+                    1,
+                    self.kv_lora_rank,
+                ),
+                dtype=self.store_dtype,
+                device=self.device,
+            )
+            self.v_buffer = torch.zeros(
+                (
+                    layer_num,
+                    self.size // self.page_size + 1,
+                    self.page_size,
+                    1,
+                    self.qk_rope_head_dim,
+                ),
+                dtype=self.store_dtype,
+                device=self.device,
+            )
+
+        self._finalize_allocation_log(size)
+
+    def get_kv_size_bytes(self):
+        assert hasattr(self, "k_buffer")
+        assert hasattr(self, "v_buffer")
+        kv_size_bytes = 0
+        for k_cache in self.k_buffer:
+            kv_size_bytes += np.prod(k_cache.shape) * k_cache.dtype.itemsize
+        for v_cache in self.v_buffer:
+            kv_size_bytes += np.prod(v_cache.shape) * v_cache.dtype.itemsize
+        return kv_size_bytes
+
+    def get_kv_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+        return (
+            self.k_buffer[layer_id - self.start_layer],
+            self.v_buffer[layer_id - self.start_layer],
+        )
+
+    def get_key_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+
+        if self.store_dtype != self.dtype:
+            return self.k_buffer[layer_id - self.start_layer].view(self.dtype)
+        return self.k_buffer[layer_id - self.start_layer]
+
+    def get_value_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+
+        if self.store_dtype != self.dtype:
+            return self.v_buffer[layer_id - self.start_layer].view(self.dtype)
+        return self.v_buffer[layer_id - self.start_layer]
+
+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # MLA has only one kv_buffer, so only the information of this buffer needs to be returned.
+        kv_data_ptrs = [self.k_buffer[i].data_ptr() for i in range(self.layer_num)] + [
+            self.v_buffer[i].data_ptr() for i in range(self.layer_num)
+        ]
+        kv_data_lens = [self.k_buffer[i].nbytes for i in range(self.layer_num)] + [
+            self.v_buffer[i].nbytes for i in range(self.layer_num)
+        ]
+        kv_item_lens = [self.k_buffer[i][0].nbytes for i in range(self.layer_num)] + [
+            self.v_buffer[i][0].nbytes for i in range(self.layer_num)
+        ]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ):
+        layer_id = layer.layer_id
+        if cache_k.dtype != self.dtype:
+            cache_k = cache_k.to(self.dtype)
+            cache_v = cache_v.to(self.dtype)
+
+        if self.store_dtype != self.dtype:
+            cache_k = cache_k.view(self.store_dtype)
+            cache_v = cache_v.view(self.store_dtype)
+
+        if cache_v is None:
+            cache_k, cache_v = cache_k.split(
+                [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+            )
+
+        torch_npu.npu_scatter_nd_update_(
+            self.k_buffer[layer_id - self.start_layer].view(-1, 1, self.kv_lora_rank),
+            loc.view(-1, 1),
+            cache_k.view(-1, 1, self.kv_lora_rank),
+        )
+        torch_npu.npu_scatter_nd_update_(
+            self.v_buffer[layer_id - self.start_layer].view(
+                -1, 1, self.qk_rope_head_dim
+            ),
+            loc.view(-1, 1),
+            cache_v.view(-1, 1, self.qk_rope_head_dim),
+        )
+
+
+class DoubleSparseTokenToKVPool(KVCache):
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        layer_num: int,
+        device: str,
+        heavy_channel_num: int,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+        super().__init__(
+            size,
+            page_size,
+            dtype,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+        )
+
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            # [size, head_num, head_dim] for each layer
+            self.k_buffer = [
+                torch.zeros(
+                    (size + page_size, head_num, head_dim), dtype=dtype, device=device
+                )
+                for _ in range(layer_num)
+            ]
+            self.v_buffer = [
+                torch.zeros(
+                    (size + page_size, head_num, head_dim), dtype=dtype, device=device
+                )
+                for _ in range(layer_num)
+            ]
+
+            # [size, head_num, heavy_channel_num] for each layer
+            self.label_buffer = [
+                torch.zeros(
+                    (size + 1, head_num, heavy_channel_num), dtype=dtype, device=device
+                )
+                for _ in range(layer_num)
+            ]
+
+    def get_key_buffer(self, layer_id: int):
+        return self.k_buffer[layer_id - self.start_layer]
+
+    def get_value_buffer(self, layer_id: int):
+        return self.v_buffer[layer_id - self.start_layer]
+
+    def get_label_buffer(self, layer_id: int):
+        return self.label_buffer[layer_id - self.start_layer]
+
+    def get_kv_buffer(self, layer_id: int):
+        return (
+            self.k_buffer[layer_id - self.start_layer],
+            self.v_buffer[layer_id - self.start_layer],
+        )
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        cache_label: torch.Tensor,
+    ):
+        # NOTE(Andy): ignore the dtype check
+        layer_id = layer.layer_id
+        self.k_buffer[layer_id - self.start_layer][loc] = cache_k
+        self.v_buffer[layer_id - self.start_layer][loc] = cache_v
+        self.label_buffer[layer_id - self.start_layer][loc] = cache_label
+
+
+@triton.jit
+def copy_all_layer_kv_cache(
+    data_ptrs,
+    strides,
+    tgt_loc_ptr,
+    src_loc_ptr,
+    num_locs,
+    num_locs_upper: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 128
+
+    bid = tl.program_id(0)
+    stride = tl.load(strides + bid)
+
+    data_ptr = tl.load(data_ptrs + bid)
+    data_ptr = tl.cast(data_ptr, tl.pointer_type(tl.uint8))
+
+    num_locs_offset = tl.arange(0, num_locs_upper)
+    tgt_locs = tl.load(tgt_loc_ptr + num_locs_offset, mask=num_locs_offset < num_locs)
+    src_locs = tl.load(src_loc_ptr + num_locs_offset, mask=num_locs_offset < num_locs)
+
+    # NOTE: we cannot parallelize over the tgt_loc_ptr dim with cuda blocks
+    # because this copy is an inplace operation.
+
+    num_loop = tl.cdiv(stride, BLOCK_SIZE)
+    for i in range(num_loop):
+        copy_offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = (num_locs_offset < num_locs)[:, None] & (copy_offset < stride)[None, :]
+        value = tl.load(
+            data_ptr + src_locs[:, None] * stride + copy_offset[None, :], mask=mask
+        )
+        tl.store(
+            data_ptr + tgt_locs[:, None] * stride + copy_offset[None, :],
+            value,
+            mask=mask,
+        )
diff --git a/python/sglang/srt/mem_cache/memory_pool_host.py b/python/sglang/srt/mem_cache/memory_pool_host.py
new file mode 100644
index 000000000..dc27eaa03
--- /dev/null
+++ b/python/sglang/srt/mem_cache/memory_pool_host.py
@@ -0,0 +1,747 @@
+import abc
+import logging
+import threading
+from enum import IntEnum
+from functools import wraps
+from typing import Optional
+
+import psutil
+import torch
+
+from sglang.srt.mem_cache.memory_pool import KVCache, MHATokenToKVPool, MLATokenToKVPool
+from sglang.srt.utils import is_npu, is_xpu
+
+_is_npu = is_npu()
+_is_xpu = is_xpu()
+if not (_is_npu or _is_xpu):
+    from sgl_kernel.kvcacheio import (
+        transfer_kv_all_layer,
+        transfer_kv_all_layer_lf_pf,
+        transfer_kv_all_layer_mla,
+        transfer_kv_all_layer_mla_lf_pf,
+        transfer_kv_direct,
+        transfer_kv_per_layer,
+        transfer_kv_per_layer_mla,
+        transfer_kv_per_layer_mla_pf_lf,
+        transfer_kv_per_layer_pf_lf,
+    )
+
+logger = logging.getLogger(__name__)
+
+
+class MemoryStateInt(IntEnum):
+    IDLE = 0
+    RESERVED = 1
+    PROTECTED = 2
+    SYNCED = 3
+    BACKUP = 4
+
+
+def synchronized(debug_only=False):
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if (not debug_only) or self.debug:
+                with self.lock:
+                    return func(self, *args, **kwargs)
+            else:
+                return True
+
+        return wrapper
+
+    return _decorator
+
+
+class HostKVCache(abc.ABC):
+
+    def __init__(
+        self,
+        device_pool: KVCache,
+        host_to_device_ratio: float,
+        host_size: int,
+        page_size: int,
+        layout: str,
+        pin_memory: bool,
+        device: str,
+    ):
+        self.device_pool = device_pool
+        self.page_size = page_size
+        self.layout = layout
+        self.pin_memory = pin_memory
+        self.device = device
+
+        self.dtype = device_pool.store_dtype
+        self.size_per_token = self.get_size_per_token()
+        if host_size > 0:
+            self.size = int(host_size * 1e9 // self.size_per_token)
+        else:
+            self.size = int(device_pool.size * host_to_device_ratio)
+        # Align the host memory pool size to the page size
+        self.size = self.size - (self.size % self.page_size)
+        self.start_layer = device_pool.start_layer
+        self.end_layer = device_pool.end_layer
+
+        assert (
+            self.size > device_pool.size
+        ), "The host memory should be larger than the device memory with the current protocol"
+
+        # Verify there is enough available host memory.
+        host_mem = psutil.virtual_memory()
+        requested_bytes = self.size * self.size_per_token
+        # preserve at least 10GB for other usage
+        ten_gb = 10 * (1024**3)
+        available_bytes = host_mem.available - ten_gb
+        if requested_bytes > available_bytes:
+            raise ValueError(
+                f"Not enough host memory available. Requesting "
+                f"{requested_bytes / 1e9:.2f} GB but only have "
+                f"{available_bytes / 1e9:.2f} GB free. Please reduce the "
+                f"size of the hierarchical cache."
+            )
+        else:
+            logger.info(
+                f"Allocating {requested_bytes / 1e9:.2f} GB host memory for hierarchical KV cache."
+            )
+
+        self.kv_buffer = self.init_kv_buffer()
+
+        # A lock for synchronized operations on memory allocation and state transitions.
+        self.lock = threading.RLock()
+        self.debug = logger.isEnabledFor(logging.DEBUG)
+        self.clear()
+
+    @abc.abstractmethod
+    def get_size_per_token(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def init_kv_buffer(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def load_to_device_per_layer(
+        self, device_pool, host_indices, device_indices, layer_id, io_backend
+    ) -> None:
+        """
+        Load KV data from the host memory pool to the device memory pool for a specific layer.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def backup_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ) -> None:
+        """
+        Backup KV data from the device memory pool to the host memory pool for all layers.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_flat_data_page(self, index) -> torch.Tensor:
+        """
+        Get a flat data page from the host memory pool.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        """
+        Get a dummy flat data page from the host memory pool.
+        This is used for prefetching or initializing empty pages.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        """
+        Set a flat data page to the host memory pool.
+        """
+        raise NotImplementedError()
+
+    @synchronized()
+    def clear(self):
+        # Initialize memory states and tracking structures.
+        self.mem_state = torch.zeros(
+            (self.size,), dtype=torch.uint8, device=self.device
+        )
+        self.free_slots = torch.arange(self.size, dtype=torch.int64)
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    @synchronized()
+    def alloc(self, need_size: int) -> Optional[torch.Tensor]:
+        assert (
+            need_size % self.page_size == 0
+        ), "The requested size should be a multiple of the page size."
+        if need_size > self.available_size():
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+
+        if self.debug:
+            self.mem_state[select_index] = MemoryStateInt.RESERVED
+
+        return select_index
+
+    @synchronized()
+    def free(self, indices: torch.Tensor) -> int:
+        self.free_slots = torch.cat([self.free_slots, indices])
+        if self.debug:
+            self.mem_state[indices] = MemoryStateInt.IDLE
+        return len(indices)
+
+    @synchronized(debug_only=True)
+    def get_state(self, indices: torch.Tensor) -> MemoryStateInt:
+        assert len(indices) > 0, "The indices should not be empty"
+        states = self.mem_state[indices]
+        assert (
+            states == states[0]
+        ).all(), "The memory slots should have the same state {}".format(states)
+        return MemoryStateInt(states[0].item())
+
+    @synchronized(debug_only=True)
+    def is_reserved(self, indices: torch.Tensor) -> bool:
+        return self.get_state(indices) == MemoryStateInt.RESERVED
+
+    @synchronized(debug_only=True)
+    def is_protected(self, indices: torch.Tensor) -> bool:
+        return self.get_state(indices) == MemoryStateInt.PROTECTED
+
+    @synchronized(debug_only=True)
+    def is_synced(self, indices: torch.Tensor) -> bool:
+        return self.get_state(indices) == MemoryStateInt.SYNCED
+
+    @synchronized(debug_only=True)
+    def is_backup(self, indices: torch.Tensor) -> bool:
+        return self.get_state(indices) == MemoryStateInt.BACKUP
+
+    @synchronized(debug_only=True)
+    def update_backup(self, indices: torch.Tensor):
+        if not self.is_synced(indices):
+            raise ValueError(
+                f"The host memory slots should be in SYNCED state before turning into BACKUP. "
+                f"Current state: {self.get_state(indices)}"
+            )
+        self.mem_state[indices] = MemoryStateInt.BACKUP
+
+    @synchronized(debug_only=True)
+    def update_prefetch(self, indices: torch.Tensor):
+        if not self.is_reserved(indices):
+            raise ValueError(
+                f"The host memory slots should be in RESERVED state before turning into BACKUP. "
+                f"Current state: {self.get_state(indices)}"
+            )
+        self.mem_state[indices] = MemoryStateInt.BACKUP
+
+    @synchronized(debug_only=True)
+    def update_synced(self, indices: torch.Tensor):
+        self.mem_state[indices] = MemoryStateInt.SYNCED
+
+    @synchronized(debug_only=True)
+    def protect_write(self, indices: torch.Tensor):
+        if not self.is_reserved(indices):
+            raise ValueError(
+                f"The host memory slots should be RESERVED before write operations. "
+                f"Current state: {self.get_state(indices)}"
+            )
+        self.mem_state[indices] = MemoryStateInt.PROTECTED
+
+    @synchronized(debug_only=True)
+    def protect_load(self, indices: torch.Tensor):
+        if not self.is_backup(indices):
+            raise ValueError(
+                f"The host memory slots should be in BACKUP state before load operations. "
+                f"Current state: {self.get_state(indices)}"
+            )
+        self.mem_state[indices] = MemoryStateInt.PROTECTED
+
+    @synchronized(debug_only=True)
+    def complete_io(self, indices: torch.Tensor):
+        if not self.is_protected(indices):
+            raise ValueError(
+                f"The host memory slots should be PROTECTED during I/O operations. "
+                f"Current state: {self.get_state(indices)}"
+            )
+        self.mem_state[indices] = MemoryStateInt.SYNCED
+
+
+class MHATokenToKVPoolHost(HostKVCache):
+    device_pool: MHATokenToKVPool
+
+    def __init__(
+        self,
+        device_pool: MHATokenToKVPool,
+        host_to_device_ratio: float,
+        host_size: int,
+        page_size: int,
+        layout: str,
+        pin_memory: bool = True,
+        device: str = "cpu",
+    ):
+        super().__init__(
+            device_pool,
+            host_to_device_ratio,
+            host_size,
+            page_size,
+            layout,
+            pin_memory,
+            device,
+        )
+        self.k_data_refs = [self.k_buffer[i] for i in range(self.layer_num)]
+        self.v_data_refs = [self.v_buffer[i] for i in range(self.layer_num)]
+        self.k_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.k_data_refs],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+        self.v_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.v_data_refs],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+
+    def get_size_per_token(self):
+        self.head_num = self.device_pool.head_num
+        self.head_dim = self.device_pool.head_dim
+        self.layer_num = self.device_pool.layer_num
+
+        return self.head_dim * self.head_num * self.layer_num * self.dtype.itemsize * 2
+
+    def get_ksize_per_token(self):
+        return self.get_size_per_token() // 2
+
+    def init_kv_buffer(self):
+        if self.layout == "layer_first":
+            dims = (2, self.layer_num, self.size, self.head_num, self.head_dim)
+        elif self.layout == "page_first":
+            dims = (2, self.size, self.layer_num, self.head_num, self.head_dim)
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        self.token_stride_size = self.head_num * self.head_dim * self.dtype.itemsize
+        self.layout_dim = self.token_stride_size * self.layer_num
+        return torch.empty(
+            dims,
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        )
+
+    @property
+    def k_buffer(self):
+        return self.kv_buffer[0]
+
+    @property
+    def v_buffer(self):
+        return self.kv_buffer[1]
+
+    def load_to_device_per_layer(
+        self,
+        device_pool,
+        host_indices,
+        device_indices,
+        layer_id,
+        io_backend,
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                transfer_kv_per_layer(
+                    src_k=self.k_buffer[layer_id],
+                    dst_k=device_pool.k_buffer[layer_id],
+                    src_v=self.v_buffer[layer_id],
+                    dst_v=device_pool.v_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    item_size=self.token_stride_size,
+                )
+            elif self.layout == "page_first":
+                transfer_kv_per_layer_pf_lf(
+                    src_k=self.k_buffer,
+                    dst_k=device_pool.k_buffer[layer_id],
+                    src_v=self.v_buffer,
+                    dst_v=device_pool.v_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    layer_id=layer_id,
+                    item_size=self.token_stride_size,
+                    src_layout_dim=self.layout_dim,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            assert (
+                self.layout == "layer_first"
+            ), f"Direct IO backend only supports layer_first layout."
+            transfer_kv_direct(
+                src_layers=[self.k_buffer[layer_id], self.v_buffer[layer_id]],
+                dst_layers=[
+                    device_pool.k_buffer[layer_id],
+                    device_pool.v_buffer[layer_id],
+                ],
+                src_indices=host_indices,
+                dst_indices=device_indices,
+                page_size=self.page_size,
+            )
+        else:
+            raise ValueError(f"Unsupported IO backend: {io_backend}")
+
+    def backup_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                transfer_kv_all_layer(
+                    src_k_layers=device_pool.k_data_ptrs,
+                    dst_k_layers=self.k_data_ptrs,
+                    src_v_layers=device_pool.v_data_ptrs,
+                    dst_v_layers=self.v_data_ptrs,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    num_layers=self.layer_num,
+                )
+            elif self.layout == "page_first":
+                transfer_kv_all_layer_lf_pf(
+                    src_k_layers=device_pool.k_data_ptrs,
+                    dst_k=self.k_buffer,
+                    src_v_layers=device_pool.v_data_ptrs,
+                    dst_v=self.v_buffer,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    dst_layout_dim=self.layout_dim,
+                    num_layers=self.layer_num,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            assert (
+                self.layout == "layer_first"
+            ), f"Direct IO backend only supports layer_first layout."
+            transfer_kv_direct(
+                src_layers=device_pool.k_buffer + device_pool.v_buffer,
+                dst_layers=self.k_data_refs + self.v_data_refs,
+                src_indices=device_indices,
+                dst_indices=host_indices,
+                page_size=self.page_size,
+            )
+        else:
+            raise ValueError(f"Unsupported IO backend: {io_backend}")
+
+    def get_flat_data_page(self, index) -> torch.Tensor:
+        if self.layout == "layer_first":
+            return self.kv_buffer[:, :, index : index + self.page_size, :, :].flatten()
+        elif self.layout == "page_first":
+            return self.kv_buffer[:, index : index + self.page_size, :, :, :].flatten()
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        return torch.zeros(
+            (2, self.layer_num, self.page_size, self.head_num, self.head_dim),
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        ).flatten()
+
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        if self.layout == "layer_first":
+            self.kv_buffer[:, :, index : index + self.page_size, :, :] = (
+                data_page.reshape(
+                    2,
+                    self.layer_num,
+                    self.page_size,
+                    self.head_num,
+                    self.head_dim,
+                )
+            )
+        elif self.layout == "page_first":
+            self.kv_buffer[:, index : index + self.page_size, :, :, :] = (
+                data_page.reshape(
+                    2, self.page_size, self.layer_num, self.head_num, self.head_dim
+                )
+            )
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+
+    def get_buffer_meta(self, keys, indices, local_rank):
+        ptr_list = []
+        key_list = []
+        kv_buffer_data_ptr = self.kv_buffer.data_ptr()
+        indices = indices.tolist()
+        v_offset = (
+            self.layer_num
+            * self.size
+            * self.head_num
+            * self.head_dim
+            * self.dtype.itemsize
+        )
+        for index in range(0, len(indices), self.page_size):
+            k_ptr = (
+                kv_buffer_data_ptr
+                + indices[index]
+                * self.layer_num
+                * self.head_num
+                * self.head_dim
+                * self.dtype.itemsize
+            )
+            v_ptr = k_ptr + v_offset
+            ptr_list.append(k_ptr)
+            ptr_list.append(v_ptr)
+            key_ = keys[index // self.page_size]
+            key_list.append(f"{key_}_{local_rank}_k")
+            key_list.append(f"{key_}_{local_rank}_v")
+        element_size = (
+            self.layer_num
+            * self.dtype.itemsize
+            * self.page_size
+            * self.head_num
+            * self.head_dim
+        )
+        element_size_list = [element_size] * len(key_list)
+        return key_list, ptr_list, element_size_list
+
+    def get_buffer_with_hash(self, keys, indices=None):
+        assert self.layout == "page_first"
+        assert indices is None or (len(keys) == (len(indices) // self.page_size))
+
+        key_list = []
+        buf_list = []
+
+        for i in range(len(keys)):
+            key = keys[i]
+            key_list.append(f"{key}-k")
+            key_list.append(f"{key}-v")
+            if indices is not None:
+                index = indices[i * self.page_size]
+                buf_list.append(self.k_buffer[index : index + self.page_size])
+                buf_list.append(self.v_buffer[index : index + self.page_size])
+
+        return key_list, buf_list, 2
+
+
+class MLATokenToKVPoolHost(HostKVCache):
+    device_pool: MLATokenToKVPool
+
+    def __init__(
+        self,
+        device_pool: MLATokenToKVPool,
+        host_to_device_ratio: float,
+        host_size: int,
+        page_size: int,
+        layout: str,
+        pin_memory: bool = True,
+        device: str = "cpu",
+    ):
+        super().__init__(
+            device_pool,
+            host_to_device_ratio,
+            host_size,
+            page_size,
+            layout,
+            pin_memory,
+            device,
+        )
+        self.data_refs = [self.kv_buffer[i] for i in range(self.layer_num)]
+        self.data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.data_refs],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+
+    def get_size_per_token(self):
+        self.kv_lora_rank = self.device_pool.kv_lora_rank
+        self.qk_rope_head_dim = self.device_pool.qk_rope_head_dim
+        self.layer_num = self.device_pool.layer_num
+
+        return (
+            (self.kv_lora_rank + self.qk_rope_head_dim)
+            * 1
+            * self.dtype.itemsize
+            * self.layer_num
+        )
+
+    def get_ksize_per_token(self):
+        return self.get_size_per_token()
+
+    def init_kv_buffer(self):
+        if self.layout == "layer_first":
+            dims = (
+                self.layer_num,
+                self.size,
+                1,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+            )
+        elif self.layout == "page_first":
+            dims = (
+                self.size,
+                self.layer_num,
+                1,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+            )
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        self.token_stride_size = (
+            self.kv_lora_rank + self.qk_rope_head_dim
+        ) * self.dtype.itemsize
+        self.layout_dim = self.token_stride_size * self.layer_num
+
+        return torch.empty(
+            dims,
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        )
+
+    def load_to_device_per_layer(
+        self, device_pool, host_indices, device_indices, layer_id, io_backend
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                transfer_kv_per_layer_mla(
+                    src=self.kv_buffer[layer_id],
+                    dst=device_pool.kv_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    item_size=self.token_stride_size,
+                )
+            elif self.layout == "page_first":
+                transfer_kv_per_layer_mla_pf_lf(
+                    src=self.kv_buffer,
+                    dst=device_pool.kv_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    layer_id=layer_id,
+                    item_size=self.token_stride_size,
+                    src_layout_dim=self.layout_dim,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            assert (
+                self.layout == "layer_first"
+            ), f"Direct IO backend only supports layer_first layout."
+            transfer_kv_direct(
+                src_layers=[self.kv_buffer[layer_id]],
+                dst_layers=[device_pool.kv_buffer[layer_id]],
+                src_indices=host_indices,
+                dst_indices=device_indices,
+                page_size=self.page_size,
+            )
+
+    def backup_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                transfer_kv_all_layer_mla(
+                    src_layers=device_pool.data_ptrs,
+                    dst_layers=self.data_ptrs,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    num_layers=self.layer_num,
+                )
+            elif self.layout == "page_first":
+                transfer_kv_all_layer_mla_lf_pf(
+                    src_layers=device_pool.data_ptrs,
+                    dst=self.kv_buffer,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    dst_layout_dim=self.layout_dim,
+                    num_layers=self.layer_num,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            assert (
+                self.layout == "layer_first"
+            ), f"Direct IO backend only supports layer_first layout."
+            transfer_kv_direct(
+                src_layers=device_pool.kv_buffer,
+                dst_layers=self.data_refs,
+                src_indices=device_indices,
+                dst_indices=host_indices,
+                page_size=self.page_size,
+            )
+        else:
+            raise ValueError(f"Unsupported IO backend: {io_backend}")
+
+    def get_flat_data_page(self, index) -> torch.Tensor:
+        if self.layout == "layer_first":
+            return self.kv_buffer[:, index : index + self.page_size, :, :].flatten()
+        elif self.layout == "page_first":
+            return self.kv_buffer[index : index + self.page_size, :, :, :].flatten()
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        return torch.zeros(
+            (
+                self.layer_num,
+                self.page_size,
+                1,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+            ),
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        ).flatten()
+
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        if self.layout == "layer_first":
+            self.kv_buffer[:, index : index + self.page_size, :, :] = data_page.reshape(
+                self.layer_num,
+                self.page_size,
+                1,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+            )
+        elif self.layout == "page_first":
+            self.kv_buffer[index : index + self.page_size, :, :, :] = data_page.reshape(
+                self.page_size,
+                self.layer_num,
+                1,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+            )
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+
+    def get_buffer_meta(self, keys, indices, local_rank):
+        ptr_list = []
+        key_list = []
+        kv_buffer_data_ptr = self.kv_buffer.data_ptr()
+        indices = indices.tolist()
+        for index in range(0, len(indices), self.page_size):
+            k_ptr = (
+                kv_buffer_data_ptr
+                + indices[index]
+                * self.layer_num
+                * (self.kv_lora_rank + self.qk_rope_head_dim)
+                * self.dtype.itemsize
+            )
+            ptr_list.append(k_ptr)
+            key_ = keys[index // self.page_size]
+            key_list.append(f"{key_}_k")
+        element_size = (
+            self.layer_num
+            * self.dtype.itemsize
+            * self.page_size
+            * (self.kv_lora_rank + self.qk_rope_head_dim)
+        )
+        element_size_list = [element_size] * len(key_list)
+        return key_list, ptr_list, element_size_list
+
+    def get_buffer_with_hash(self, keys, indices=None):
+        assert self.layout == "page_first"
+        assert indices is None or (len(keys) == (len(indices) // self.page_size))
+
+        buf_list = []
+
+        if indices is not None:
+            for i in range(len(keys)):
+                index = indices[i * self.page_size]
+                buf_list.append(self.kv_buffer[index : index + self.page_size])
+
+        return keys, buf_list, 1
diff --git a/python/sglang/srt/mem_cache/multimodal_cache.py b/python/sglang/srt/mem_cache/multimodal_cache.py
new file mode 100644
index 000000000..63a177543
--- /dev/null
+++ b/python/sglang/srt/mem_cache/multimodal_cache.py
@@ -0,0 +1,68 @@
+import logging
+from collections import OrderedDict
+from typing import Dict
+
+import torch
+
+# Set up logging for cache behavior
+logger = logging.getLogger(__name__)
+
+
+class MultiModalCache:
+    """MultiModalCache is used to store vlm encoder results with LRU eviction"""
+
+    def __init__(
+        self,
+        max_size: int,
+    ):
+        self.max_size = max_size
+        self.mm_cache: OrderedDict[int, torch.Tensor] = OrderedDict()
+        self.current_size = 0
+
+    def _allocate(self, embedding_size: int) -> bool:
+        """Allocate space by evicting least recently used entries"""
+        evictions = 0
+        while self.current_size + embedding_size > self.max_size and self.mm_cache:
+            _, old_embedding = self.mm_cache.popitem(last=False)
+            evicted_size = self._get_tensor_size(old_embedding)
+            self.current_size -= evicted_size
+            evictions += evicted_size
+
+        if evictions > 0:
+            logger.debug(
+                f"Cache eviction: evicted {evictions} bytes, remaining size: {self.current_size}/{self.max_size} bytes"
+            )
+
+        if self.current_size + embedding_size > self.max_size:
+            return False
+        return True
+
+    def put(self, mm_hash: int, embedding: torch.Tensor) -> bool:
+        data_size = self._get_tensor_size(embedding)
+        # Lazy free cache if not enough space
+        if not self._allocate(data_size):
+            return False
+        self.mm_cache[mm_hash] = embedding
+        self.current_size += data_size
+        return True
+
+    def has(self, mm_hash: int) -> bool:
+        return mm_hash in self.mm_cache
+
+    def get(self, mm_hash: int) -> torch.Tensor:
+        """Get embedding and update LRU order"""
+        if mm_hash in self.mm_cache:
+            # Move to end (most recently used)
+            self.mm_cache.move_to_end(mm_hash)
+            return self.mm_cache[mm_hash]
+        return None
+
+    def clear(self):
+        self.mm_cache.clear()
+        self.current_size = 0
+
+    def _get_tensor_size(self, embedding: torch.Tensor):
+        return embedding.element_size() * embedding.numel()
+
+    def __len__(self):
+        return len(self.mm_cache)
diff --git a/python/sglang/srt/mem_cache/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py
new file mode 100644
index 000000000..d8208e143
--- /dev/null
+++ b/python/sglang/srt/mem_cache/radix_cache.py
@@ -0,0 +1,572 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+The radix tree data structure for managing the KV cache.
+"""
+
+import heapq
+import time
+from collections import defaultdict
+from functools import partial
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+
+from sglang.srt.disaggregation.kv_events import (
+    AllBlocksCleared,
+    BlockRemoved,
+    BlockStored,
+)
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+
+class TreeNode:
+
+    counter = 0
+
+    def __init__(self, id: Optional[int] = None):
+        self.children = defaultdict(TreeNode)
+        self.parent: TreeNode = None
+        self.key: List[int] = None
+        self.value: Optional[torch.Tensor] = None
+        self.lock_ref = 0
+        self.last_access_time = time.monotonic()
+
+        self.hit_count = 0
+        # indicating the node is locked to protect from eviction
+        # incremented when the node is referenced by a storage operation
+        self.host_ref_counter = 0
+        # store the host indices of KV cache
+        self.host_value: Optional[torch.Tensor] = None
+        # store hash values of each pages
+        self.hash_value: Optional[List[str]] = None
+
+        self.id = TreeNode.counter if id is None else id
+        TreeNode.counter += 1
+
+    @property
+    def evicted(self):
+        return self.value is None
+
+    @property
+    def backuped(self):
+        return self.host_value is not None
+
+    def protect_host(self):
+        """Protect the host value from eviction."""
+        self.host_ref_counter += 1
+
+    def release_host(self):
+        """Release the host value, allowing it to be evicted."""
+        if self.host_ref_counter > 0:
+            self.host_ref_counter -= 1
+        else:
+            raise RuntimeError("Host reference counter is already zero.")
+
+    def get_last_hash_value(self) -> Optional[str]:
+        """Returns the hash value of the last page in this node."""
+        if self.hash_value is None or len(self.hash_value) == 0:
+            return None
+        return self.hash_value[-1]
+
+    def __lt__(self, other: "TreeNode"):
+        return self.last_access_time < other.last_access_time
+
+
+def _key_match_page_size1(key0: List, key1: List):
+    i = 0
+    for k0, k1 in zip(key0, key1):
+        if k0 != k1:
+            break
+        i += 1
+    return i
+
+
+def _key_match_paged(key0: List, key1: List, page_size: int):
+    min_len = min(len(key0), len(key1))
+
+    i = 0
+    while i < min_len:
+        if key0[i : i + page_size] != key1[i : i + page_size]:
+            break
+        i += page_size
+
+    return i
+
+
+class RadixCache(BasePrefixCache):
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        page_size: int,
+        disable: bool = False,
+        enable_kv_cache_events: bool = False,
+    ):
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.page_size = page_size
+        self.disable = disable
+        self.enable_kv_cache_events = enable_kv_cache_events
+        self.kv_event_queue = []
+
+        if self.token_to_kv_pool_allocator:
+            self.device = self.token_to_kv_pool_allocator.device
+        else:
+            self.device = torch.device("cpu")
+
+        if self.page_size == 1:
+            self.key_match_fn = _key_match_page_size1
+            self.get_child_key_fn = lambda key: key[0]
+        else:
+            self.key_match_fn = partial(_key_match_paged, page_size=page_size)
+            self.get_child_key_fn = lambda key: tuple(key[:page_size])
+        self.reset()
+
+    ##### Public API #####
+
+    def reset(self):
+        self.root_node = TreeNode()
+        self.root_node.key = []
+        self.root_node.value = []
+        self.root_node.host_value = []
+        self.root_node.lock_ref = 1
+        self.evictable_size_ = 0
+        self.protected_size_ = 0
+        self._record_all_cleared_event()
+
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
+        """Find the matching prefix from the radix tree.
+        Args:
+            key: A list of token IDs to find a matching prefix.
+        Returns:
+            A tuple of a tensor of matching prefix token IDs and
+            the last node that contains the prefix values. Note that
+            this API can modify the internal state of the Radix tree.
+            The last node create a new child if the prefix is shorter
+            than the last node's value.
+        """
+        if self.disable or len(key) == 0:
+            return MatchResult(
+                device_indices=torch.empty(
+                    (0,),
+                    dtype=torch.int64,
+                    device=self.device,
+                ),
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+            )
+
+        if self.page_size != 1:
+            page_aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:page_aligned_len]
+
+        value, last_node = self._match_prefix_helper(self.root_node, key)
+        if value:
+            value = torch.cat(value)
+        else:
+            value = torch.empty((0,), dtype=torch.int64, device=self.device)
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_node,
+        )
+
+    def insert(self, key: List, value=None, chunked=False):
+        if self.disable:
+            return 0
+
+        if value is None:
+            value = [x for x in key]
+        return self._insert_helper(self.root_node, key, value)
+
+    def cache_finished_req(self, req: Req):
+        """Cache request when it finishes."""
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, : len(req.origin_input_ids) + len(req.output_ids) - 1
+            ]
+            self.token_to_kv_pool_allocator.free(kv_indices)
+            self.req_to_token_pool.free(req.req_pool_idx)
+            return
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        if self.page_size != 1:
+            page_aligned_len = len(kv_indices) // self.page_size * self.page_size
+            page_aligned_kv_indices = kv_indices[:page_aligned_len].to(
+                dtype=torch.int64, copy=True
+            )
+            self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
+        else:
+            page_aligned_len = len(kv_indices)
+            page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
+
+        # Radix Cache takes one ref in memory pool
+        new_prefix_len = self.insert(
+            token_ids[:page_aligned_len], page_aligned_kv_indices
+        )
+        self.token_to_kv_pool_allocator.free(
+            kv_indices[len(req.prefix_indices) : new_prefix_len]
+        )
+
+        # Remove req slot release the cache lock
+        self.req_to_token_pool.free(req.req_pool_idx)
+        self.dec_lock_ref(req.last_node)
+
+    def cache_unfinished_req(self, req: Req, chunked=False):
+        """Cache request when it is unfinished."""
+        if self.disable:
+            return
+
+        token_ids = req.fill_ids
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        if self.page_size != 1:
+            page_aligned_len = len(kv_indices) // self.page_size * self.page_size
+            page_aligned_kv_indices = kv_indices[:page_aligned_len].to(
+                dtype=torch.int64, copy=True
+            )
+        else:
+            page_aligned_len = len(kv_indices)
+            page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
+        page_aligned_token_ids = token_ids[:page_aligned_len]
+
+        # Radix Cache takes one ref in memory pool
+        new_prefix_len = self.insert(
+            page_aligned_token_ids, page_aligned_kv_indices, chunked=chunked
+        )
+        self.token_to_kv_pool_allocator.free(
+            kv_indices[len(req.prefix_indices) : new_prefix_len]
+        )
+
+        # The prefix indices could be updated, reuse it
+        new_indices, new_last_node, _, _ = self.match_prefix(page_aligned_token_ids)
+        self.req_to_token_pool.write(
+            (req.req_pool_idx, slice(len(req.prefix_indices), len(new_indices))),
+            new_indices[len(req.prefix_indices) :],
+        )
+
+        self.dec_lock_ref(req.last_node)
+        self.inc_lock_ref(new_last_node)
+
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        if self.page_size != 1:
+            req.prefix_indices = torch.cat(
+                [new_indices, kv_indices[len(new_indices) :]]
+            )
+        else:
+            req.prefix_indices = new_indices
+        req.last_node = new_last_node
+
+    def pretty_print(self):
+        self._print_helper(self.root_node, 0)
+        print(f"#tokens: {self.total_size()}")
+
+    def total_size(self):
+        return self._total_size_helper()
+
+    def evict(self, num_tokens: int):
+        if self.disable:
+            return
+
+        leaves = self._collect_leaves()
+        heapq.heapify(leaves)
+
+        num_evicted = 0
+        while num_evicted < num_tokens and len(leaves):
+            x = heapq.heappop(leaves)
+
+            if x == self.root_node:
+                break
+            if x.lock_ref > 0:
+                continue
+
+            self.token_to_kv_pool_allocator.free(x.value)
+            num_evicted += len(x.value)
+            self._delete_leaf(x)
+
+            if len(x.parent.children) == 0:
+                heapq.heappush(leaves, x.parent)
+
+            self._record_remove_event(x)
+
+    def inc_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 0:
+                self.evictable_size_ -= len(node.value)
+                self.protected_size_ += len(node.value)
+                delta -= len(node.value)
+            node.lock_ref += 1
+            node = node.parent
+        return delta
+
+    def dec_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 1:
+                self.evictable_size_ += len(node.value)
+                self.protected_size_ -= len(node.value)
+                delta += len(node.value)
+            node.lock_ref -= 1
+            node = node.parent
+        return delta
+
+    def evictable_size(self):
+        return self.evictable_size_
+
+    def protected_size(self):
+        # protected size refers to the size of the cache that is locked
+        return self.protected_size_
+
+    def all_values_flatten(self):
+        values = []
+
+        def _dfs_helper(node: TreeNode):
+            for _, child in node.children.items():
+                values.append(child.value)
+                _dfs_helper(child)
+
+        _dfs_helper(self.root_node)
+        return torch.cat(values)
+
+    ##### Internal Helper Functions #####
+
+    def _match_prefix_helper(self, node: TreeNode, key: List):
+        node.last_access_time = time.monotonic()
+
+        child_key = self.get_child_key_fn(key)
+
+        value = []
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+            child.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                value.append(new_node.value)
+                node = new_node
+                break
+            else:
+                value.append(child.value)
+                node = child
+                key = key[prefix_len:]
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+
+        return value, node
+
+    def _split_node(self, key, child: TreeNode, split_len: int):
+        # new_node -> child
+        self._record_remove_event(child)
+        new_node = TreeNode()
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.lock_ref = child.lock_ref
+        new_node.key = child.key[:split_len]
+        new_node.value = child.value[:split_len]
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        child.value = child.value[split_len:]
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        self._record_store_event(new_node)
+        self._record_store_event(child)
+
+        return new_node
+
+    def _insert_helper(self, node: TreeNode, key: List, value):
+        node.last_access_time = time.monotonic()
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        total_prefix_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(node.key, key)
+            total_prefix_length += prefix_len
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value
+            node.children[child_key] = new_node
+            self.evictable_size_ += len(value)
+            self._record_store_event(new_node)
+        return total_prefix_length
+
+    def _print_helper(self, node: TreeNode, indent: int):
+        """Prints the radix tree in a human-readable format."""
+        stack = [(node, indent)]
+        while stack:
+            current_node, current_indent = stack.pop()
+            print(
+                " " * current_indent,
+                len(current_node.key),
+                current_node.key[:10],
+                f"r={current_node.lock_ref}",
+            )
+            for key, child in current_node.children.items():
+                stack.append((child, current_indent + 2))
+
+                assert key == self.get_child_key_fn(
+                    child.key
+                ), f"{key=}, {self.get_child_key_fn(child.key)=}"
+
+    def _delete_leaf(self, node):
+        for k, v in node.parent.children.items():
+            if v == node:
+                break
+        del node.parent.children[k]
+        self.evictable_size_ -= len(node.key)
+
+    def _total_size_helper(self):
+        total_size = 0
+        stack = [self.root_node]
+        while stack:
+            current_node = stack.pop()
+            total_size += len(current_node.value)
+            for child in current_node.children.values():
+                if child.evicted:
+                    continue
+                stack.append(child)
+        return total_size
+
+    def _collect_leaves(self):
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if len(cur_node.children) == 0:
+                ret_list.append(cur_node)
+            else:
+                stack.extend(cur_node.children.values())
+
+        return ret_list
+
+    def _record_store_event(self, node: TreeNode):
+        # One BlockStored per ``page_size`` chunk.
+        if self.enable_kv_cache_events:
+            # First chunk links to the last page of the parent node (if any).
+            if node.parent is None or node != self.root_node:
+                parent_block_hash = None
+            else:
+                last_page_start = (
+                    (len(node.parent.key) - 1) // self.page_size
+                ) * self.page_size
+                parent_parent_tokens = node.parent.key[last_page_start:]
+                parent_block_hash = hash(tuple(parent_parent_tokens))
+
+            for start in range(0, len(node.key), self.page_size):
+                page_tokens = node.key[start : start + self.page_size]
+                if not page_tokens:
+                    continue
+
+                block_hash = hash(tuple(page_tokens))
+
+                self.kv_event_queue.append(
+                    BlockStored(
+                        block_hashes=[block_hash],
+                        parent_block_hash=parent_block_hash,
+                        token_ids=page_tokens,
+                        block_size=len(page_tokens),
+                        lora_id=None,
+                    )
+                )
+
+                # Chain next chunk to this one.
+                parent_block_hash = block_hash
+
+    def _record_remove_event(self, node: TreeNode):
+        # One BlockRemoved per chunk.
+        if self.enable_kv_cache_events:
+            for start in range(0, len(node.key), self.page_size):
+                page_tokens = node.key[start : start + self.page_size]
+                if not page_tokens:
+                    continue
+                block_hash = hash(tuple(page_tokens))
+                self.kv_event_queue.append(BlockRemoved(block_hashes=[block_hash]))
+
+    def _record_all_cleared_event(self):
+        if self.enable_kv_cache_events:
+            self.kv_event_queue.append(AllBlocksCleared())
+
+    def take_events(self):
+        """Atomically takes all events and clears the queue.
+
+        Returns:
+            A list of KV cache events.
+        """
+        if not self.enable_kv_cache_events:
+            return []
+        events = self.kv_event_queue
+        self.kv_event_queue = []
+        return events
+
+
+if __name__ == "__main__":
+    tree = RadixCache(None, None, page_size=1, disable=False)
+
+    tree.insert("Hello")
+    tree.insert("Hello")
+    tree.insert("Hello_L.A.!")
+    # tree.insert("Hello_world! Happy")
+    # tree.insert("I love you!")
+    tree.pretty_print()
+
+    # print(tree.match_prefix("I love you! aha"))
+
+    # def evict_callback(x):
+    #    print("evict", x)
+    #    return len(x)
+
+    # tree.evict(5, evict_callback)
+    # tree.evict(10, evict_callback)
+    # tree.pretty_print()
diff --git a/python/sglang/srt/mem_cache/radix_cache_cpp.py b/python/sglang/srt/mem_cache/radix_cache_cpp.py
new file mode 100644
index 000000000..e9512e83f
--- /dev/null
+++ b/python/sglang/srt/mem_cache/radix_cache_cpp.py
@@ -0,0 +1,229 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, List, Set
+
+import torch
+
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
+from sglang.srt.mem_cache.cpp_radix_tree.radix_tree import (
+    IOHandle,
+    RadixTreeCpp,
+    TreeNodeCpp,
+)
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+
+logger = logging.getLogger(__name__)
+
+
+class RadixCacheCpp(BasePrefixCache):
+    def _merge_tensor(self, l: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Merge a list of tensors into a single tensor.
+        Args:
+            l (List[torch.Tensor]): List of tensors to merge.
+        Returns:
+            torch.Tensor: Merged tensor.
+        """
+        if len(l) == 0:
+            return torch.empty(0, dtype=torch.int64, device=self.device)
+        elif len(l) == 1:
+            return l[0]
+        else:
+            return torch.cat(l)
+
+    def __init__(
+        self,
+        disable: bool,
+        use_hicache: bool,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool: BaseTokenToKVPoolAllocator,
+        tp_cache_group: torch.distributed.ProcessGroup,
+        page_size: int,
+        hicache_ratio: float,
+        hicache_size: int,
+        hicache_write_policy: str,
+        enable_kv_cache_events: bool = False,
+        hicache_oracle: bool = False,
+        enable_write_cancel: bool = False,
+    ):
+        self.disable = disable
+        self.enable_write_cancel = enable_write_cancel
+
+        assert (
+            enable_kv_cache_events is False
+        ), "HiRadixCache does not support kv cache events yet"
+        self.kv_cache = token_to_kv_pool.get_kvcache()
+
+        # record the nodes with ongoing write through
+        self.ongoing_write_through: Set[IOHandle] = set()
+        # record the node segments with ongoing load back
+        self.ongoing_load_back: Set[IOHandle] = set()
+        # todo: dynamically adjust the threshold
+        self.write_through_threshold = (
+            1 if hicache_write_policy == "write_through" else 2
+        )
+        self.device = token_to_kv_pool.device
+        self.token_to_kv_pool = token_to_kv_pool
+        self.req_to_token_pool = req_to_token_pool
+        self.page_size = page_size
+
+        self.tp_group = tp_cache_group
+
+        if not use_hicache:
+            self.tree = RadixTreeCpp(
+                disabled=self.disable,
+                page_size=page_size,
+                host_size=None,  # no host cache, this should be removed in the future
+                write_through_threshold=self.write_through_threshold,
+            )
+            self.cache_controller = None
+            return  # early return if hicache is not used
+
+        raise NotImplementedError("Host cache is not supported yet")
+
+    def reset(self):
+        if self.cache_controller is not None:
+            # need to clear the acks before resetting the cache controller
+            raise NotImplementedError("Host cache is not supported yet")
+        self.tree.reset()
+
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
+        device_indices_vec, host_indices_length, node_gpu, node_cpu = (
+            self.tree.match_prefix(key)
+        )
+        return MatchResult(
+            device_indices=self._merge_tensor(device_indices_vec),
+            last_device_node=node_gpu,
+            last_host_node=node_cpu,
+            host_hit_length=host_indices_length,
+        )
+
+    def _insert(self, key: List[int], value: torch.Tensor) -> int:
+        """
+        Insert a key-value pair into the radix tree.
+        Args:
+            key (List[int]): The key to insert, represented as a list of integers.
+            value (torch.Tensor): The value to associate with the key.
+        Returns:
+            int: Number of device indices that were already present in the tree before the insertion.
+        """
+        ongoing_write, length = self.tree.writing_through(key, value)
+        if self.cache_controller is None:
+            assert len(ongoing_write) == 0, "Implementation error"
+            return length
+
+        raise NotImplementedError("Host cache is not supported yet")
+
+    def dec_lock_ref(self, node: TreeNodeCpp):
+        """
+        Decrement the reference count of a node to root of the radix tree.
+        Args:
+            node (TreeNodeCpp): The handle of the node to decrement the reference count for.
+        """
+        self.tree.lock_ref(node, False)  # do not increment
+
+    def inc_lock_ref(self, node: TreeNodeCpp):
+        """
+        Increment the reference count of from a node to root of the radix tree.
+        Args:
+            node (TreeNodeCpp): The handle of the node to increment the reference count for.
+        """
+        self.tree.lock_ref(node, True)
+
+    def evict(self, num_tokens: int):
+        evicted_device_indices = self.tree.evict(num_tokens)
+        for indice in evicted_device_indices:
+            self.token_to_kv_pool.free(indice)
+
+    def evictable_size(self):
+        return self.tree.evictable_size()
+
+    def protected_size(self):
+        return self.tree.protected_size()
+
+    def total_size(self):
+        return self.tree.total_size()
+
+    def cache_finished_req(self, req: Req):
+        """Cache request when it finishes."""
+        assert req.req_pool_idx is not None
+        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        overall_len = len(token_ids)  # prefill + decode
+        kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :overall_len]
+
+        # NOTE: our C++ implementation don't need `token_ids` and `kv_indices` to be page-aligned
+        # it will automatically align them, but length of them should be equal
+        old_prefix_len = len(req.prefix_indices) // self.page_size * self.page_size
+        new_prefix_len = self._insert(token_ids, kv_indices)
+
+        # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
+        assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
+
+        # KVCache between old & new is newly generated, but already exists in the pool
+        # we need to free this newly generated kv indices
+        if old_prefix_len < new_prefix_len:
+            self.token_to_kv_pool.free(kv_indices[old_prefix_len:new_prefix_len])
+
+        # need to free the unaligned part, since it cannot be inserted into the radix tree
+        if self.page_size != 1 and (  # unaligned tail only exists when page_size > 1
+            (unaligned_len := overall_len % self.page_size) > 0
+        ):
+            # NOTE: sglang PagedAllocator support unaligned free (which will automatically align it)
+            self.token_to_kv_pool.free(kv_indices[overall_len - unaligned_len :])
+
+        # Remove req slot release the cache lock
+        self.dec_lock_ref(req.last_node)
+        self.req_to_token_pool.free(req.req_pool_idx)
+
+    def cache_unfinished_req(self, req: Req, chunked=False):
+        """Cache request when it is unfinished."""
+        assert req.req_pool_idx is not None
+        token_ids = req.fill_ids
+        prefill_len = len(token_ids)  # prefill only (maybe chunked)
+        kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :prefill_len]
+
+        # NOTE: our C++ implementation don't need `token_ids` and `kv_indices` to be page-aligned
+        # it will automatically align them, but length of them should be equal
+        old_prefix_len = len(req.prefix_indices) // self.page_size * self.page_size
+        new_prefix_len = self._insert(token_ids, kv_indices)
+
+        # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
+        assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
+
+        # TODO(dark): optimize the `insert` and `match` (e.g. merge into 1 function)
+        # The prefix indices need to updated to reuse the kv indices in the pool
+        new_indices_vec, _, new_last_node, _ = self.tree.match_prefix(token_ids)
+        new_indices = self._merge_tensor(new_indices_vec)
+        assert new_prefix_len <= len(new_indices)
+
+        # KVCache between old & new is newly generated, but already exists in the pool
+        # we need to free this newly generated kv indices and reuse the indices in the pool
+        if old_prefix_len < new_prefix_len:
+            self.token_to_kv_pool.free(kv_indices[old_prefix_len:new_prefix_len])
+            reused_indices = new_indices[old_prefix_len:new_prefix_len]
+            self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, old_prefix_len:new_prefix_len
+            ] = reused_indices
+
+        if req.last_node != new_last_node:
+            self.dec_lock_ref(req.last_node)
+            self.inc_lock_ref(new_last_node)
+
+        # NOTE: there might be unaligned tail, so we may need to append it
+        assert len(new_indices) <= prefill_len < len(new_indices) + self.page_size
+        if self.page_size != 1 and len(new_indices) < prefill_len:
+            req.prefix_indices = torch.cat(
+                [new_indices, kv_indices[len(new_indices) :]]
+            )
+        else:
+            req.prefix_indices = new_indices
+        req.last_node = new_last_node
+
+    def pretty_print(self):
+        return self.tree.debug_print()
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/docs/README.md b/python/sglang/srt/mem_cache/storage/hf3fs/docs/README.md
new file mode 100644
index 000000000..480f431a8
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/docs/README.md
@@ -0,0 +1,35 @@
+# Using HF3FS as L3 Global KV Cache
+
+This document provides step-by-step instructions for setting up a k8s + 3FS + SGLang runtime environment from scratch, describing how to utilize deepseek-hf3fs as the L3 KV cache for SGLang.
+The process consists of five main steps:
+
+## Step 1: Install deepseek-3fs via 3fs-Operator
+Refer to the [3fs-operator documentation](https://github.com/aliyun/kvc-3fs-operator/blob/main/README_en.md) to deploy 3FS components in your Kubernetes environment using the Operator with one-click deployment.
+
+## Step 2: Launch SGLang Pod
+Start your SGLang Pod while specifying 3FS-related labels in the YAML configuration. Follow the [fuse-client-creation guide](https://github.com/aliyun/kvc-3fs-operator/blob/main/README_en.md#fuse-client-creation).
+
+## Step 3: Configure Usrbio Client in SGLang Pod
+The Usrbio client is required for accessing 3FS. Install it in your SGLang Pod using either method below:
+
+**Alternative 1 (Recommend):** Build from source (refer to [setup_usrbio_client.md](setup_usrbio_client.md))
+
+**Alternative 2:** Run `pip3 install hf3fs-py-usrbio` (Follow https://pypi.org/project/hf3fs-py-usrbio/#files)
+
+## Step 4: Deploy Model Serving
+
+### Single Node Deployment
+```bash
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+python3 -m sglang.launch_server \
+    --model-path /code/models/Qwen3-32B/ \
+    --host 0.0.0.0 --port 10000 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 0 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs
+```
+
+### Multi-Node Deployment (Shared KV Cache)
+Follow the [deploy_sglang_3fs_multinode.md](deploy_sglang_3fs_multinode.md) guide to deploy SGLang with 3FS across multiple nodes for shared KV caching.
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md b/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md
new file mode 100644
index 000000000..889f9ad85
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md
@@ -0,0 +1,65 @@
+# 1. Startup 3fs metadata service
+```bash
+nohup python3 -m sglang.srt.mem_cache.storage.hf3fs.mini_3fs_metadata_server > meta.out &
+```
+
+
+# 2. Startup sglang engine
+## HF3fs configures
+```bash
+vim /sgl-workspace/sglang/benchmark/hf3fs/hf3fs_config.json
+{
+    "file_path_prefix": "/data/hicache",
+    "file_size": 1099511627776,
+    "numjobs": 16,
+    "entries": 8,
+    "metadata_server_url": "http://metaServerIp:18000"
+}
+```
+
+## node1
+```bash
+export SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs_config.json
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+rm -rf instance1.out && \
+nohup python3 -m sglang.launch_server \
+    --model-path /code/models/Qwen3-32B/ \
+    --host 0.0.0.0 --port 10000 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 0 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs --tp 2 > instance1.out &
+```
+
+## node2
+```bash
+export SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs_config.json
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+rm -rf instance2.out && \
+nohup python3 -m sglang.launch_server \
+    --model-path /code/models/Qwen3-32B/ \
+    --host 0.0.0.0 --port 10000 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 0 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs --tp 2 > instance2.out &
+```
+
+# 3. Startup router
+```bash
+rm -rf router.out && \
+nohup python -m sglang_router.launch_router --worker-urls http://node1:10000 http://node2:10000 > router.out &
+```
+
+# 4. Startup multiturn benchmark
+```bash
+rm -rf bench_multiturn.out && \
+nohup python3 benchmark/hicache/bench_multiturn.py \
+    --model-path /code/models/Qwen3-32B \
+    --dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --port 30000 \
+    --request-length 2048 --num-clients 512 --num-rounds 5 --max-parallel 8 \
+    > bench_multiturn.out &
+```
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/docs/setup_usrbio_client.md b/python/sglang/srt/mem_cache/storage/hf3fs/docs/setup_usrbio_client.md
new file mode 100644
index 000000000..7c7c0bfb2
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/docs/setup_usrbio_client.md
@@ -0,0 +1,68 @@
+# HiCacheHF3FS Setup
+
+## Build & Package
+### Source Code
+https://github.com/deepseek-ai/3FS/blob/main/README.md#check-out-source-code
+```sh
+git clone https://github.com/deepseek-ai/3fs
+
+cd 3fs
+git submodule update --init --recursive
+./patches/apply.sh
+```
+
+### Build Dev Container
+https://github.com/deepseek-ai/3FS/blob/main/dockerfile/dev.dockerfile
+```sh
+cd 3fs/dockerfile
+docker build -t hf3fs:dev -f dev.dockerfile .
+```
+
+### Generate Python Wheel
+```sh
+docker run -it hf3fs:dev bash
+
+# Inside the development container
+git clone https://github.com/deepseek-ai/3fs
+
+cd 3fs
+git submodule update --init --recursive
+./patches/apply.sh
+
+apt-get update \
+&& apt-get install -y --no-install-recommends \
+python3 python3-pip \
+&& apt-get clean \
+&& rm -rf /var/lib/apt/lists/*
+# apt install python3.12 python3.12-venv python3.12-dev
+# curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+# python3.12 get-pip.py
+
+# Generated wheel location: dist/hf3fs_py_usrbio-1.2.9+2db69ce-cp310-cp310-linux_x86_64.whl
+python3 setup.py bdist_wheel
+```
+
+## Installation
+```sh
+# Install Dependencies
+# https://github.com/deepseek-ai/3FS/blob/main/dockerfile/dev.dockerfile
+apt update && apt install -y                            \
+  libaio-dev                                            \
+  libboost-all-dev                                      \
+  libdouble-conversion-dev                              \
+  libdwarf-dev                                          \
+  libgflags-dev                                         \
+  libgmock-dev                                          \
+  libgoogle-glog-dev                                    \
+  libgoogle-perftools-dev                               \
+  libgtest-dev                                          \
+  liblz4-dev                                            \
+  liblzma-dev                                           \
+  libssl-dev                                            \
+  libunwind-dev                                         \
+  libuv1-dev
+
+# Install Python Package
+pip install hf3fs_py_usrbio-1.2.9+394583d-cp312-cp312-linux_x86_64.whl
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+```
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py
new file mode 100644
index 000000000..c7a485fa0
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py
@@ -0,0 +1,164 @@
+import logging
+import os
+import threading
+from abc import ABC, abstractmethod
+from typing import List
+
+import torch
+
+
+class Hf3fsClient(ABC):
+    """Abstract interface for HF3FS clients."""
+
+    @abstractmethod
+    def __init__(self, path: str, size: int, bytes_per_page: int, entries: int):
+        """Initialize the HF3FS client.
+
+        Args:
+            path: File path for storage
+            size: Total size of storage file
+            bytes_per_page: Bytes per page
+            entries: Number of entries for batch operations
+        """
+        pass
+
+    @abstractmethod
+    def batch_read(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch read from storage."""
+        pass
+
+    @abstractmethod
+    def batch_write(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch write to storage."""
+        pass
+
+    @abstractmethod
+    def check(self, offsets: List[int], tensors: List[torch.Tensor]) -> None:
+        """Validate batch operation parameters."""
+        pass
+
+    @abstractmethod
+    def get_size(self) -> int:
+        """Get total storage size."""
+        pass
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close the client and cleanup resources."""
+        pass
+
+    @abstractmethod
+    def flush(self) -> None:
+        """Flush data to disk."""
+        pass
+
+
+logger = logging.getLogger(__name__)
+
+
+class Hf3fsMockClient(Hf3fsClient):
+    """Mock implementation of Hf3fsClient for CI testing purposes."""
+
+    def __init__(self, path: str, size: int, bytes_per_page: int, entries: int):
+        """Initialize mock HF3FS client."""
+        self.path = path
+        self.size = size
+        self.bytes_per_page = bytes_per_page
+        self.entries = entries
+
+        # Create directory if it doesn't exist
+        os.makedirs(os.path.dirname(self.path), exist_ok=True)
+
+        # Create and initialize the file
+        self.file = os.open(self.path, os.O_RDWR | os.O_CREAT)
+        os.ftruncate(self.file, size)
+
+        logger.info(
+            f"Hf3fsMockClient initialized: path={path}, size={size}, "
+            f"bytes_per_page={bytes_per_page}, entries={entries}"
+        )
+
+    def batch_read(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch read from mock storage."""
+        self.check(offsets, tensors)
+
+        results = []
+
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+
+            try:
+                os.lseek(self.file, offset, os.SEEK_SET)
+                bytes_read = os.read(self.file, size)
+
+                if len(bytes_read) == size:
+                    # Convert bytes to tensor and copy to target
+                    bytes_tensor = torch.frombuffer(bytes_read, dtype=torch.uint8)
+                    typed_tensor = bytes_tensor.view(tensor.dtype).view(tensor.shape)
+                    tensor.copy_(typed_tensor)
+                    results.append(size)
+                else:
+                    logger.warning(
+                        f"Short read: expected {size}, got {len(bytes_read)}"
+                    )
+                    results.append(len(bytes_read))
+
+            except Exception as e:
+                logger.error(f"Error reading from offset {offset}: {e}")
+                results.append(0)
+
+        return results
+
+    def batch_write(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch write to mock storage."""
+        self.check(offsets, tensors)
+
+        results = []
+
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+
+            try:
+                # Convert tensor to bytes and write directly to file
+                tensor_bytes = tensor.contiguous().view(torch.uint8).flatten()
+                data = tensor_bytes.numpy().tobytes()
+
+                os.lseek(self.file, offset, os.SEEK_SET)
+                bytes_written = os.write(self.file, data)
+
+                if bytes_written == size:
+                    results.append(size)
+                else:
+                    logger.warning(f"Short write: expected {size}, got {bytes_written}")
+                    results.append(bytes_written)
+
+            except Exception as e:
+                logger.error(f"Error writing to offset {offset}: {e}")
+                results.append(0)
+
+        return results
+
+    def check(self, offsets: List[int], tensors: List[torch.Tensor]) -> None:
+        """Validate batch operation parameters."""
+        pass
+
+    def get_size(self) -> int:
+        """Get total storage size."""
+        return self.size
+
+    def close(self) -> None:
+        """Close the mock client and cleanup resources."""
+        try:
+            if hasattr(self, "file") and self.file >= 0:
+                os.close(self.file)
+                self.file = -1  # Mark as closed
+            logger.info(f"MockHf3fsClient closed: {self.path}")
+        except Exception as e:
+            logger.error(f"Error closing MockHf3fsClient: {e}")
+
+    def flush(self) -> None:
+        """Flush data to disk."""
+        try:
+            os.fsync(self.file)
+        except Exception as e:
+            logger.error(f"Error flushing MockHf3fsClient: {e}")
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_usrbio_client.py b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_usrbio_client.py
new file mode 100644
index 000000000..480c18ed1
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_usrbio_client.py
@@ -0,0 +1,187 @@
+import logging
+import multiprocessing
+import os
+import threading
+from functools import wraps
+from pathlib import Path
+from typing import List
+
+import torch
+from torch.utils.cpp_extension import load
+
+from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsClient
+
+root = Path(__file__).parent.resolve()
+hf3fs_utils = load(name="hf3fs_utils", sources=[f"{root}/hf3fs_utils.cpp"])
+
+logger = logging.getLogger(__name__)
+
+HF3FS_AVAILABLE = True
+try:
+    from hf3fs_fuse.io import (
+        deregister_fd,
+        extract_mount_point,
+        make_ioring,
+        make_iovec,
+        register_fd,
+    )
+except ImportError:
+    HF3FS_AVAILABLE = False
+
+
+def rsynchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.rlock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+def wsynchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.wlock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+class Hf3fsUsrBioClient(Hf3fsClient):
+    """HF3FS client implementation using usrbio."""
+
+    def __init__(self, path: str, size: int, bytes_per_page: int, entries: int):
+        if not HF3FS_AVAILABLE:
+            raise ImportError(
+                "hf3fs_fuse.io is not available. Please install the hf3fs_fuse package."
+            )
+
+        self.path = path
+        self.size = size
+        self.bytes_per_page = bytes_per_page
+        self.entries = entries
+
+        self.file = os.open(self.path, os.O_RDWR | os.O_CREAT)
+        os.ftruncate(self.file, size)
+        register_fd(self.file)
+
+        self.hf3fs_mount_point = extract_mount_point(path)
+        self.bs = self.bytes_per_page
+        self.shm_r = multiprocessing.shared_memory.SharedMemory(
+            size=self.bs * self.entries, create=True
+        )
+        self.shm_w = multiprocessing.shared_memory.SharedMemory(
+            size=self.bs * self.entries, create=True
+        )
+
+        self.shm_r_tensor = torch.frombuffer(self.shm_r.buf, dtype=torch.uint8)
+        self.shm_w_tensor = torch.frombuffer(self.shm_w.buf, dtype=torch.uint8)
+
+        self.numa = -1
+        self.ior_r = make_ioring(
+            self.hf3fs_mount_point,
+            self.entries,
+            for_read=True,
+            timeout=1,
+            numa=self.numa,
+        )
+        self.ior_w = make_ioring(
+            self.hf3fs_mount_point,
+            self.entries,
+            for_read=False,
+            timeout=1,
+            numa=self.numa,
+        )
+        self.iov_r = make_iovec(self.shm_r, self.hf3fs_mount_point)
+        self.iov_w = make_iovec(self.shm_w, self.hf3fs_mount_point)
+        self.shm_r.unlink()
+        self.shm_w.unlink()
+
+        self.rlock = threading.RLock()
+        self.wlock = threading.RLock()
+
+    @rsynchronized()
+    def batch_read(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        self.check(offsets, tensors)
+
+        # prepare
+        current = 0
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+            self.ior_r.prepare(
+                self.iov_r[current : current + size], True, self.file, offset
+            )
+            current += size
+
+        # submit
+        ionum = len(offsets)
+        resv = self.ior_r.submit().wait(min_results=ionum)
+
+        # results
+        hf3fs_utils.read_shm(self.shm_r_tensor, tensors)
+        results = [res.result for res in resv]
+
+        return results
+
+    @wsynchronized()
+    def batch_write(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        self.check(offsets, tensors)
+
+        # prepare
+        hf3fs_utils.write_shm(tensors, self.shm_w_tensor)
+        current = 0
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+            self.ior_w.prepare(
+                self.iov_w[current : current + size], False, self.file, offset
+            )
+            current += size
+
+        # submit
+        ionum = len(offsets)
+        resv = self.ior_w.submit().wait(min_results=ionum)
+
+        # results
+        results = [res.result for res in resv]
+
+        return results
+
+    def check(self, offsets: List[int], tensors: List[torch.Tensor]) -> None:
+        sizes = [t.numel() * t.itemsize for t in tensors]
+        if any(
+            [
+                len(offsets) > self.entries,
+                len(offsets) != len(sizes),
+                all(
+                    [
+                        offset < 0 or offset + size > self.size
+                        for offset, size in zip(offsets, sizes)
+                    ]
+                ),
+                all([size > self.bytes_per_page for size in sizes]),
+            ]
+        ):
+            self.close()
+            raise ValueError(f"Hf3fsClient.check: {offsets=}, {sizes=}")
+
+    def get_size(self) -> int:
+        return self.size
+
+    def close(self) -> None:
+        deregister_fd(self.file)
+        os.close(self.file)
+        del self.ior_r
+        del self.ior_w
+        del self.iov_r
+        del self.iov_w
+        self.shm_r.close()
+        self.shm_w.close()
+
+    def flush(self) -> None:
+        os.fsync(self.file)
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp
new file mode 100644
index 000000000..3a4b7dcc0
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp
@@ -0,0 +1,35 @@
+#include <torch/extension.h>
+
+#include <cstring>
+#include <vector>
+
+void read_shm(const torch::Tensor &shm, std::vector<torch::Tensor> dst) {
+  py::gil_scoped_release release;
+  char *src_ptr = static_cast<char *>(shm.data_ptr());
+  size_t current = 0;
+  for (size_t i = 0; i < dst.size(); ++i) {
+    auto &t = dst[i];
+    size_t t_bytes = t.numel() * t.element_size();
+    char *dst_ptr = static_cast<char *>(t.data_ptr());
+    std::memcpy(dst_ptr, src_ptr + current, t_bytes);
+    current += t_bytes;
+  }
+}
+
+void write_shm(const std::vector<torch::Tensor> src, torch::Tensor &shm) {
+  py::gil_scoped_release release;
+  char *dst_ptr = static_cast<char *>(shm.data_ptr());
+  size_t current = 0;
+  for (size_t i = 0; i < src.size(); ++i) {
+    auto &t = src[i];
+    size_t t_bytes = t.numel() * t.element_size();
+    char *src_ptr = static_cast<char *>(t.data_ptr());
+    std::memcpy(dst_ptr + current, src_ptr, t_bytes);
+    current += t_bytes;
+  }
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("read_shm", &read_shm, "Read tensors from shared memory");
+  m.def("write_shm", &write_shm, "Write tensors to shared memory");
+}
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py b/python/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py
new file mode 100644
index 000000000..414d13adc
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py
@@ -0,0 +1,470 @@
+import argparse
+import atexit
+import json
+import logging
+import threading
+from pathlib import Path
+from typing import Dict, List, Optional, OrderedDict, Tuple
+
+import orjson
+import requests
+from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi.responses import ORJSONResponse
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import Hf3fsMetadataInterface
+
+# --- Configuration ---
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+# --- Data Models ---
+class RankMetadata:
+    """Holds all metadata for a single rank."""
+
+    def __init__(self, num_pages: int):
+        self.lock = threading.Lock()
+        self.num_pages = num_pages
+        self.free_pages: List[int] = list(range(num_pages))
+        self.key_to_index: OrderedDict[str, int] = OrderedDict()
+        # Todo: Support multi files for HF3FS
+
+    def exists_keys(self, keys: List[str]) -> List[bool]:
+        """Check if keys exist in metadata."""
+        with self.lock:
+            return [key in self.key_to_index for key in keys]
+
+    def reserve_and_allocate_page_indices(
+        self, keys: List[Tuple[str, str]]
+    ) -> List[Tuple[bool, int]]:
+        """Reserve and allocate page indices for keys."""
+        with self.lock:
+            results = [None] * len(keys)
+            new_keys_to_process = []
+
+            for i, (key, prefix_key) in enumerate(keys):
+                if key in self.key_to_index:
+                    results[i] = (True, self.key_to_index[key])
+                    self.key_to_index.move_to_end(key)
+                else:
+                    new_keys_to_process.append((i, key, prefix_key))
+
+            # Todo: Implementing data eviction logic after HiCache supports prefix information pass-through
+            for i, key, prefix_key in new_keys_to_process:
+                if len(self.free_pages) > 0:
+                    page_index = self.free_pages.pop()
+                else:
+                    page_index = self.key_to_index.popitem(last=False)[1]
+
+                results[i] = (False, page_index)
+
+            return results
+
+    def confirm_write(
+        self,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        """Confirm write operations and release pages."""
+        with self.lock:
+            for key, page_index in written_keys_to_confirm:
+                self.key_to_index[key] = page_index
+                self.key_to_index.move_to_end(key)
+
+            for page_index in pages_to_release:
+                if page_index not in self.free_pages:
+                    self.free_pages.append(page_index)
+
+    def delete_keys(self, keys: List[str]) -> int:
+        """Delete keys and return count of deleted keys."""
+        with self.lock:
+            count = 0
+            for key in keys:
+                if key in self.key_to_index:
+                    page_index = self.key_to_index.pop(key)
+                    if page_index not in self.free_pages:
+                        self.free_pages.append(page_index)
+                    count += 1
+            return count
+
+    def clear_all(self) -> None:
+        """Clear all metadata."""
+        with self.lock:
+            self.free_pages = list(range(self.num_pages))
+            self.key_to_index.clear()
+
+    def get_page_indices(self, keys: List[str]) -> List[Optional[int]]:
+        """Get page indices for keys."""
+        with self.lock:
+            results = []
+            for key in keys:
+                if key in self.key_to_index:
+                    results.append(self.key_to_index[key])
+                    self.key_to_index.move_to_end(key)
+                else:
+                    results.append(None)
+            return results
+
+
+class GlobalMetadataState:
+    """Manages the state for all ranks and persistence."""
+
+    def __init__(self, persistence_path: Optional[str], save_interval: int):
+        self.global_lock = threading.RLock()
+        self.ranks: Dict[int, RankMetadata] = {}
+        self.persistence_path = Path(persistence_path) if persistence_path else None
+        self.save_interval = save_interval
+        self.save_timer: Optional[threading.Timer] = None
+        self.is_shutting_down = False
+
+    def load_from_disk(self):
+        if not self.persistence_path or not self.persistence_path.exists():
+            logging.info("Persistence file not found. Starting with a clean state.")
+            return
+
+        logging.info(f"Loading state from {self.persistence_path}")
+        try:
+            with open(self.persistence_path, "r") as f:
+                persisted_data = json.load(f)
+
+            with self.global_lock:
+                for rank_id_str, data in persisted_data.items():
+                    rank_id = int(rank_id_str)
+                    num_pages = data["num_pages"]
+                    rank_meta = RankMetadata(num_pages)
+                    rank_meta.free_pages = data["free_pages"]
+                    rank_meta.key_to_index = dict(data["key_to_index"])
+                    self.ranks[rank_id] = rank_meta
+                logging.info(
+                    f"Successfully loaded metadata for {len(self.ranks)} ranks."
+                )
+        except (json.JSONDecodeError, KeyError, TypeError) as e:
+            logging.error(
+                f"Failed to load or parse persistence file: {e}. Starting fresh.",
+                exc_info=True,
+            )
+            self.ranks.clear()
+
+    def save_to_disk(self):
+        if not self.persistence_path:
+            return
+
+        logging.info("Persisting metadata to disk...")
+        with self.global_lock:
+            serializable_state = {}
+            for rank_id, rank_meta in self.ranks.items():
+                with rank_meta.lock:
+                    serializable_state[rank_id] = {
+                        "num_pages": rank_meta.num_pages,
+                        "free_pages": rank_meta.free_pages,
+                        "key_to_index": list(rank_meta.key_to_index.items()),
+                    }
+
+        try:
+            temp_path = self.persistence_path.with_suffix(".tmp")
+            with open(temp_path, "w") as f:
+                json.dump(serializable_state, f, indent=4)
+            temp_path.rename(self.persistence_path)
+            logging.info(f"Metadata successfully persisted to {self.persistence_path}")
+        except Exception as e:
+            logging.error(f"Failed to save metadata to disk: {e}", exc_info=True)
+
+    def schedule_save(self):
+        if self.is_shutting_down or not self.persistence_path:
+            return
+        self.save_to_disk()
+        self.save_timer = threading.Timer(self.save_interval, self.schedule_save)
+        self.save_timer.start()
+
+    def shutdown(self):
+        logging.info("Shutting down metadata server...")
+        self.is_shutting_down = True
+        if self.save_timer:
+            self.save_timer.cancel()
+        self.save_to_disk()
+        logging.info("Shutdown complete.")
+
+
+# --- Global MetadataServer implementation ---
+class Hf3fsMetadataServer:
+    """HF3FS Metadata Server that manages metadata for multiple ranks."""
+
+    def __init__(self, persistence_path: Optional[str] = None, save_interval: int = 60):
+        self.state = GlobalMetadataState(persistence_path, save_interval)
+        self.app = FastAPI(default_response_class=ORJSONResponse)
+
+        self._setup_routes()
+
+    def _setup_routes(self):
+        """Setup FastAPI routes."""
+        self.app.post("/{rank}/initialize")(self.initialize)
+        self.app.post("/{rank}/exists")(self.exists)
+        self.app.post("/{rank}/reserve_and_allocate_page_indices")(
+            self.reserve_and_allocate_page_indices
+        )
+        self.app.post("/{rank}/confirm_write")(self.confirm_write)
+        self.app.post("/{rank}/delete_keys")(self.delete_keys)
+        self.app.post("/{rank}/clear")(self.clear)
+        self.app.post("/{rank}/get_page_indices")(self.get_page_indices)
+
+    def get_rank_metadata(self, rank: int) -> RankMetadata:
+        """Get rank metadata with proper error handling."""
+        if rank not in self.state.ranks:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Rank {rank} not initialized. Please call /{rank}/initialize first.",
+            )
+        return self.state.ranks[rank]
+
+    async def _read_json(self, request: Request) -> dict:
+        """Parse request JSON using orjson if available."""
+        body = await request.body()
+        return orjson.loads(body)
+
+    def _json_response(self, content: dict):
+        """Return ORJSONResponse when available to bypass jsonable_encoder."""
+        return ORJSONResponse(content)
+
+    async def initialize(self, rank: int, request: Request):
+        """Initialize a rank with specified number of pages."""
+        data = await self._read_json(request)
+        num_pages = data["num_pages"]
+        with self.state.global_lock:
+            if rank in self.state.ranks:
+                logging.info(
+                    f"Rank {rank} already exists. Initialization request ignored."
+                )
+                if self.state.ranks[rank].num_pages != num_pages:
+                    logging.warning(
+                        f"Rank {rank} initialized with different num_pages. Existing: {self.state.ranks[rank].num_pages}, New: {num_pages}"
+                    )
+            else:
+                logging.info(f"Initializing new Rank {rank} with {num_pages} pages.")
+                self.state.ranks[rank] = RankMetadata(num_pages)
+        return Response(status_code=204)
+
+    async def exists(self, rank: int, request: Request):
+        """Check if keys exist in metadata."""
+        data = await self._read_json(request)
+        keys = data["keys"]
+        metadata = self.get_rank_metadata(rank)
+        results = metadata.exists_keys(keys)
+        return self._json_response({"exists": results})
+
+    async def reserve_and_allocate_page_indices(self, rank: int, request: Request):
+        """Reserve and allocate page indices for keys."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        keys = data["keys"]
+        results = metadata.reserve_and_allocate_page_indices(keys)
+        return self._json_response({"indices": results})
+
+    async def confirm_write(self, rank: int, request: Request):
+        """Confirm write operations and release pages."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        success_written_keys = data.get("written_keys_to_confirm", [])
+        released_pages = data.get("pages_to_release", [])
+
+        metadata.confirm_write(success_written_keys, released_pages)
+
+        return Response(status_code=204)
+
+    async def delete_keys(self, rank: int, request: Request):
+        """Delete keys from metadata."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        count = metadata.delete_keys(data["keys"])
+        return Response(status_code=204)
+
+    async def clear(self, rank: int):
+        """Clear all metadata for a rank."""
+        metadata = self.get_rank_metadata(rank)
+        metadata.clear_all()
+        return Response(status_code=204)
+
+    async def get_page_indices(self, rank: int, request: Request):
+        """Get page indices for keys."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        keys = data["keys"]
+        results = metadata.get_page_indices(keys)
+        return self._json_response({"indices": results})
+
+    def run(self, host: str = "0.0.0.0", port: int = 18000):
+        """Run the metadata server."""
+        self.state.load_from_disk()
+        if self.state.persistence_path:
+            self.state.schedule_save()
+            atexit.register(self.state.shutdown)
+
+        import uvicorn
+
+        logging.info(f"Starting metadata server on http://{host}:{port}")
+        if self.state.persistence_path:
+            logging.info(
+                f"Persistence is ENABLED. Saving to '{self.state.persistence_path}' every {self.state.save_interval} seconds."
+            )
+        else:
+            logging.info("Persistence is DISABLED.")
+
+        uvicorn.run(self.app, host=host, port=port)
+
+
+# --- Client implementation ---
+class Hf3fsGlobalMetadataClient(Hf3fsMetadataInterface):
+    """Global http metadata client for HF3FS."""
+
+    def __init__(self, base_url: str, max_retries: int = 3):
+        self.base_url = base_url.rstrip("/")
+        self._session = requests.Session()
+
+        retry_strategy = Retry(
+            total=max_retries,
+            backoff_factor=0.3,
+            status_forcelist=[500, 502, 503, 504],
+            allowed_methods=["GET", "POST"],
+        )
+        adapter = HTTPAdapter(
+            max_retries=retry_strategy, pool_connections=256, pool_maxsize=256
+        )
+        self._session.mount("http://", adapter)
+
+    def _post(self, endpoint: str, json_data: dict) -> dict:
+        try:
+            url = f"{self.base_url}/{endpoint}"
+            headers = {"Content-Type": "application/json"}
+            payload = orjson.dumps(json_data)  # type: ignore[union-attr]
+            response = self._session.post(url, data=payload, headers=headers)
+            response.raise_for_status()
+
+            if response.status_code == 204 or not response.content:
+                return {}
+            return orjson.loads(response.content)  # type: ignore[union-attr]
+        except requests.exceptions.RequestException as e:
+            logging.error(f"Failed to POST to {endpoint} after retries: {e}")
+            raise RuntimeError(f"Failed to connect to metadata server: {e}") from e
+
+    def initialize(self, rank: int, num_pages: int) -> None:
+        self._post(f"{rank}/initialize", {"num_pages": num_pages})
+
+    def reserve_and_allocate_page_indices(
+        self, rank: int, keys: List[Tuple[str, str]]
+    ) -> List[Tuple[bool, int]]:
+        response = self._post(
+            f"{rank}/reserve_and_allocate_page_indices", {"keys": keys}
+        )
+        return [tuple(item) for item in response.get("indices")]
+
+    def confirm_write(
+        self,
+        rank: int,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        self._post(
+            f"{rank}/confirm_write",
+            {
+                "written_keys_to_confirm": written_keys_to_confirm,
+                "pages_to_release": pages_to_release,
+            },
+        )
+
+    def delete_keys(self, rank: int, keys: List[str]) -> None:
+        self._post(f"{rank}/delete_keys", {"keys": keys})
+
+    def exists(self, rank: int, keys: List[str]) -> List[bool]:
+        response = self._post(f"{rank}/exists", {"keys": keys})
+        return response.get("exists", [])
+
+    def clear(self, rank: int) -> None:
+        self._post(f"{rank}/clear", {})
+
+    def get_page_indices(self, rank: int, keys: List[str]) -> List[Optional[int]]:
+        response = self._post(f"{rank}/get_page_indices", {"keys": keys})
+        return response.get("indices")
+
+
+class Hf3fsLocalMetadataClient(Hf3fsMetadataInterface):
+    """Local metadata client that directly operates on single RankMetadata in memory without metadata server."""
+
+    def __init__(self):
+        self.rank_metadata = None
+
+    def initialize(self, rank: int, num_pages: int) -> None:
+        self.rank_metadata = RankMetadata(num_pages)
+
+    def reserve_and_allocate_page_indices(
+        self, rank: int, keys: List[Tuple[str, str]]
+    ) -> List[Tuple[bool, int]]:
+        """Reserve and allocate page indices for keys."""
+        return self.rank_metadata.reserve_and_allocate_page_indices(keys)
+
+    def confirm_write(
+        self,
+        rank: int,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        """Confirm write operations."""
+        self.rank_metadata.confirm_write(written_keys_to_confirm, pages_to_release)
+
+    def delete_keys(self, rank: int, keys: List[str]) -> None:
+        """Delete keys."""
+        self.rank_metadata.delete_keys(keys)
+
+    def exists(self, rank: int, keys: List[str]) -> List[bool]:
+        """Check if keys exist."""
+        return self.rank_metadata.exists_keys(keys)
+
+    def clear(self, rank: int) -> None:
+        """Clear all metadata for rank."""
+        self.rank_metadata.clear_all()
+
+    def get_page_indices(self, rank: int, keys: List[str]) -> List[Optional[int]]:
+        """Get page indices for keys."""
+        return self.rank_metadata.get_page_indices(keys)
+
+
+def run_metadata_server(
+    host: str = "0.0.0.0",
+    port: int = 18000,
+    persistence_path: Optional[str] = None,
+    save_interval: int = 60,
+):
+    """Run the HF3FS metadata server."""
+    global server
+    server = Hf3fsMetadataServer(
+        persistence_path=persistence_path, save_interval=save_interval
+    )
+
+    server.run(host=host, port=port)
+
+
+# --- Main Execution ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="HF3FS Metadata Server")
+    parser.add_argument(
+        "--host", type=str, default="0.0.0.0", help="Host to bind the server to."
+    )
+    parser.add_argument(
+        "--port", type=int, default=18000, help="Port to run the server on."
+    )
+    parser.add_argument(
+        "--persistence-path",
+        type=str,
+        default=None,
+        help="Path to the file for persisting metadata. If not provided, persistence is disabled.",
+    )
+    parser.add_argument(
+        "--save-interval",
+        type=int,
+        default=60,
+        help="Interval in seconds for periodically saving metadata to disk.",
+    )
+    args = parser.parse_args()
+
+    run_metadata_server(args.host, args.port, args.persistence_path, args.save_interval)
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py b/python/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py
new file mode 100644
index 000000000..9595e7204
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py
@@ -0,0 +1,523 @@
+import atexit
+import concurrent.futures
+import json
+import logging
+import os
+import signal
+import threading
+import time
+from abc import ABC, abstractmethod
+from functools import wraps
+from typing import Any, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
+from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsClient
+from sglang.srt.metrics.collector import StorageMetrics
+
+logger = logging.getLogger(__name__)
+
+
+class Hf3fsMetadataInterface(ABC):
+    """Interface for HF3FS metadata operations."""
+
+    @abstractmethod
+    def initialize(self, rank: int, num_pages: int) -> None:
+        """Initialize the metadata service with specified number of pages."""
+        pass
+
+    @abstractmethod
+    def reserve_and_allocate_page_indices(
+        self,
+        rank: int,
+        keys: List[Tuple[str, str]],
+    ) -> List[Tuple[bool, int]]:
+        """
+        Reserve and allocate page indices for the specified keys.
+        Args:
+            rank: The rank of the process.
+            keys: The keys to reserve and allocate page indices for. Each tuple contains a key and the key of its prefix block.
+        Returns:
+            List[Tuple[bool, int]]: A list of tuples, where each tuple contains a boolean indicating whether the key has existed and an integer indicating the allocated page index.
+        """
+        pass
+
+    @abstractmethod
+    def confirm_write(
+        self,
+        rank: int,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        """
+        Confirm that key-value pairs have been successfully written to storage.
+        Args:
+            rank: The rank of the process.
+            written_keys_to_confirm: A list of tuples, where each tuple contains a key and its corresponding page index.
+            pages_to_release: A list of page indices to be released.
+        """
+        pass
+
+    @abstractmethod
+    def get_page_indices(self, rank: int, keys: List[str]) -> List[Optional[int]]:
+        """
+        Get page indices for the specified keys.
+        Args:
+            rank: The rank of the process.
+            keys: A list of keys.
+        Returns:
+            List[Optional[int]]: A list of integers representing the page indices for the specified keys.
+                                 If a key is not found, the corresponding index will be None.
+        """
+        pass
+
+    @abstractmethod
+    def delete_keys(self, rank: int, keys: List[str]) -> None:
+        """Delete specified keys and their associated pages."""
+        pass
+
+    @abstractmethod
+    def exists(self, rank: int, keys: List[str]) -> List[bool]:
+        """Check if the specified keys exist."""
+        pass
+
+    @abstractmethod
+    def clear(self, rank: int) -> None:
+        """Clear all key-value pairs and page allocations for the specified rank."""
+        pass
+
+
+class AtomicCounter:
+    def __init__(self, n: int):
+        assert n > 0
+        self.n = n
+        self._value = 0
+        self._lock = threading.Lock()
+
+    def next(self) -> int:
+        with self._lock:
+            current = self._value
+            self._value = (current + 1) % self.n
+            return current
+
+
+def synchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.lock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+def create_hf3fs_client(
+    path: str, size: int, bytes_per_page: int, entries: int, use_mock: bool = False
+) -> Hf3fsClient:
+    """Factory function to create appropriate HF3FS client.
+
+    Args:
+        path: File path for storage
+        size: Total size of storage file
+        bytes_per_page: Bytes per page
+        entries: Number of entries for batch operations
+        use_mock: Whether to use mock client instead of real usrbio client
+
+    Returns:
+    """
+    if use_mock:
+        from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsMockClient
+
+        logger.info(f"[Rank Using Hf3fsMockClient for testing")
+        return Hf3fsMockClient(path, size, bytes_per_page, entries)
+    else:
+        from sglang.srt.mem_cache.storage.hf3fs.hf3fs_usrbio_client import (
+            Hf3fsUsrBioClient,
+        )
+
+        return Hf3fsUsrBioClient(path, size, bytes_per_page, entries)
+
+
+class HiCacheHF3FS(HiCacheStorage):
+    """HiCache backend that stores KV cache pages in HF3FS files."""
+
+    default_env_var: str = "SGLANG_HICACHE_HF3FS_CONFIG_PATH"
+
+    def __init__(
+        self,
+        rank: int,
+        file_path: str,
+        file_size: int,
+        numjobs: int,
+        bytes_per_page: int,
+        entries: int,
+        dtype: torch.dtype,
+        metadata_client: Hf3fsMetadataInterface,
+        is_mla_model: bool = False,
+        is_page_first_layout: bool = False,
+        use_mock_client: bool = False,
+    ):
+        self.rank = rank
+        self.file_path = file_path
+        self.file_size = file_size
+        self.numjobs = numjobs
+        self.bytes_per_page = bytes_per_page
+        self.gb_per_page = bytes_per_page / (1 << 30)
+        self.entries = entries
+        self.dtype = dtype
+        self.metadata_client = metadata_client
+        self.is_mla_model = is_mla_model
+        self.is_page_first_layout = is_page_first_layout
+        self.numel = self.bytes_per_page // self.dtype.itemsize
+        self.num_pages = self.file_size // self.bytes_per_page
+        self.skip_backup = False
+        if self.is_mla_model and self.rank != 0:
+            self.skip_backup = True
+            self.rank = 0
+
+        logger.info(
+            f"[Rank {self.rank}] HiCacheHF3FS Client Initializing: "
+            f"file_path={self.file_path}, "
+            f"file_size={self.file_size / (2 ** 30):.2f} GB, "
+            f"num_pages={self.num_pages}"
+        )
+
+        self.ac = AtomicCounter(self.numjobs)
+        self.clients = [
+            create_hf3fs_client(
+                self.file_path,
+                self.file_size,
+                self.bytes_per_page,
+                self.entries,
+                use_mock_client,
+            )
+            for _ in range(numjobs)
+        ]
+        self.executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.numjobs, thread_name_prefix=f"HiCacheHF3FS-Rank{self.rank}"
+        )
+
+        self.metadata_client.initialize(self.rank, self.num_pages)
+        self.lock = threading.RLock()
+
+        atexit.register(self.close)
+
+        signal.signal(signal.SIGINT, lambda sig, frame: self.close())
+        signal.signal(signal.SIGTERM, lambda sig, frame: self.close())
+        signal.signal(signal.SIGQUIT, lambda sig, frame: self.close())
+
+        self.prefetch_pgs = []
+        self.backup_pgs = []
+        self.prefetch_bandwidth = []
+        self.backup_bandwidth = []
+
+    @staticmethod
+    def from_env_config(
+        bytes_per_page: int,
+        dtype: torch.dtype,
+        storage_config: HiCacheStorageConfig = None,
+    ) -> "HiCacheHF3FS":
+        """Create a HiCacheHF3FS instance from environment configuration.
+
+        Environment:
+            - Uses env var stored in `HiCacheHF3FS.default_env_var` to locate a JSON config.
+            - Falls back to a local single-machine config when the env var is not set.
+
+        Raises:
+            ValueError: If MLA Model is requested without global metadata server or required keys are missing.
+        """
+        from sglang.srt.mem_cache.storage.hf3fs.mini_3fs_metadata_server import (
+            Hf3fsGlobalMetadataClient,
+            Hf3fsLocalMetadataClient,
+        )
+
+        use_mock_client = False
+        if storage_config is not None:
+            rank, is_mla_model, is_page_first_layout = (
+                storage_config.tp_rank,
+                storage_config.is_mla_model,
+                storage_config.is_page_first_layout,
+            )
+
+            if storage_config.extra_config is not None:
+                use_mock_client = storage_config.extra_config.get(
+                    "use_mock_hf3fs_client", False
+                )
+        else:
+            rank, is_mla_model, is_page_first_layout = (
+                0,
+                False,
+                False,
+            )
+
+        mla_unsupported_msg = f"MLA model is not supported without global metadata server, please refer to https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md"
+
+        config_path = os.getenv(HiCacheHF3FS.default_env_var)
+        if not config_path:
+            if is_mla_model:
+                raise ValueError(mla_unsupported_msg)
+
+            return HiCacheHF3FS(
+                rank=rank,
+                file_path=f"/data/hicache.{rank}.bin",
+                file_size=1 << 40,
+                numjobs=16,
+                bytes_per_page=bytes_per_page,
+                entries=8,
+                dtype=dtype,
+                metadata_client=Hf3fsLocalMetadataClient(),
+                is_page_first_layout=is_page_first_layout,
+                use_mock_client=use_mock_client,
+            )
+
+        try:
+            with open(config_path, "r") as f:
+                config = json.load(f)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load config from {config_path}: {str(e)}")
+
+        # Check required keys (metadata_server_url is now optional)
+        required_keys = {
+            "file_path_prefix",
+            "file_size",
+            "numjobs",
+            "entries",
+        }
+        missing_keys = required_keys - set(config.keys())
+        if missing_keys:
+            raise ValueError(f"Missing required keys in config: {missing_keys}")
+
+        # Choose metadata client based on configuration
+        if config.get("metadata_server_url"):
+            # Use global metadata client to connect to metadata server
+            metadata_server_url = config["metadata_server_url"]
+            metadata_client = Hf3fsGlobalMetadataClient(metadata_server_url)
+
+            logger.info(
+                f"Using global metadata client with server url: {metadata_server_url}"
+            )
+        else:
+            # Enable MLA optimization only when using the global metadata client
+            if is_mla_model:
+                raise ValueError(mla_unsupported_msg)
+
+            # Use local metadata client for single-machine deployment
+            metadata_client = Hf3fsLocalMetadataClient()
+
+        rank_for_path = 0 if is_mla_model else rank
+        return HiCacheHF3FS(
+            rank=rank,
+            # Let all ranks use the same file path for MLA model
+            file_path=f"{config['file_path_prefix']}.{rank_for_path}.bin",
+            file_size=int(config["file_size"]),
+            numjobs=int(config["numjobs"]),
+            bytes_per_page=bytes_per_page,
+            entries=int(config["entries"]),
+            dtype=dtype,
+            metadata_client=metadata_client,
+            is_mla_model=is_mla_model,
+            is_page_first_layout=is_page_first_layout,
+            use_mock_client=use_mock_client,
+        )
+
+    def get(
+        self,
+        key: str,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        return self.batch_get(
+            [key],
+            [target_location] if target_location is not None else None,
+            [target_sizes] if target_sizes is not None else None,
+        )[0]
+
+    @synchronized()
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None]:
+        page_indices = self.metadata_client.get_page_indices(self.rank, keys)
+
+        batch_indices, file_offsets = [], []
+        for i, page_index in enumerate(page_indices):
+            if page_index is not None:
+                batch_indices.append(i)
+                file_offsets.append(page_index * self.bytes_per_page)
+
+        if target_locations is not None:
+            for target_location in target_locations:
+                assert target_location.is_contiguous()
+            file_results = target_locations
+        else:
+            file_results = [
+                torch.empty(self.numel, dtype=self.dtype)
+                for _ in range(len(batch_indices))
+            ]
+
+        start_time = time.perf_counter()
+
+        futures = [
+            self.executor.submit(
+                self.clients[self.ac.next()].batch_read,
+                file_offsets[i : i + self.entries],
+                file_results[i : i + self.entries],
+            )
+            for i in range(0, len(batch_indices), self.entries)
+        ]
+        read_results = [result for future in futures for result in future.result()]
+
+        end_time = time.perf_counter()
+        ionum = len(batch_indices)
+        self.prefetch_pgs.append(ionum)
+        self.prefetch_bandwidth.append(
+            ionum / (end_time - start_time) * self.gb_per_page
+        )
+
+        results = [None] * len(keys)
+        for batch_index, file_result, read_result in zip(
+            batch_indices, file_results, read_results
+        ):
+            if read_result == self.bytes_per_page:
+                results[batch_index] = file_result
+            else:
+                logger.error(
+                    f"[Rank {self.rank}] HiCacheHF3FS get {keys[batch_index]} failed"
+                )
+
+        return results
+
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        return self.batch_set(
+            [key],
+            [value] if value is not None else None,
+            [target_location] if target_location is not None else None,
+            [target_sizes] if target_sizes is not None else None,
+        )
+
+    @synchronized()
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        # In MLA backend, only one rank needs to backup the KV cache
+        if self.skip_backup:
+            return True
+
+        # Todo: Add prefix block's hash key
+        key_with_prefix = [(key, "") for key in keys]
+        indices = self.metadata_client.reserve_and_allocate_page_indices(
+            self.rank, key_with_prefix
+        )
+
+        batch_indices, file_offsets, file_values = [], [], []
+        pages_to_release = []
+
+        for i, (value, (is_written, page_index)) in enumerate(zip(values, indices)):
+            if is_written or page_index == -1:
+                continue
+
+            batch_indices.append(i)
+            file_offsets.append(page_index * self.bytes_per_page)
+            assert value.is_contiguous()
+            file_values.append(value)
+
+        start_time = time.perf_counter()
+
+        futures = [
+            self.executor.submit(
+                self.clients[self.ac.next()].batch_write,
+                file_offsets[i : i + self.entries],
+                file_values[i : i + self.entries],
+            )
+            for i in range(0, len(batch_indices), self.entries)
+        ]
+        write_results = [
+            result == self.bytes_per_page
+            for future in futures
+            for result in future.result()
+        ]
+
+        end_time = time.perf_counter()
+        ionum = len(batch_indices)
+        self.backup_pgs.append(ionum)
+        self.backup_bandwidth.append(ionum / (end_time - start_time) * self.gb_per_page)
+
+        written_keys_to_confirm = []
+        results = [index[0] for index in indices]
+        for batch_index, write_result in zip(batch_indices, write_results):
+            key = keys[batch_index]
+            page_index = indices[batch_index][1]
+            if write_result:
+                written_keys_to_confirm.append((key, page_index))
+            else:
+                logger.error(f"[Rank {self.rank}] HiCacheHF3FS set {key} failed")
+                pages_to_release.append(page_index)
+            results[batch_index] = write_result
+
+        if len(written_keys_to_confirm) > 0 or len(pages_to_release) > 0:
+            self.metadata_client.confirm_write(
+                self.rank, written_keys_to_confirm, pages_to_release
+            )
+
+        return all(results)
+
+    def delete(self, key: str) -> None:
+        self.metadata_client.delete_keys(self.rank, [key])
+
+    def exists(self, key: str) -> bool:
+        result = self.metadata_client.exists(self.rank, [key])
+        return result[0] if result else False
+
+    def batch_exists(self, keys: List[str]) -> int:
+        results = self.metadata_client.exists(self.rank, keys)
+        for i in range(len(keys)):
+            if not results[i]:
+                return i
+
+        return len(keys)
+
+    def clear(self) -> bool:
+        try:
+            self.metadata_client.clear(self.rank)
+            logger.info(f"Cleared HiCacheHF3FS for rank {self.rank}")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to clear HiCacheHF3FS: {e}")
+            return False
+
+    def close(self) -> None:
+        try:
+            for c in self.clients:
+                c.close()
+            self.executor.shutdown(wait=True)
+        except Exception as e:
+            logger.error(f"close HiCacheHF3FS: {e}")
+        logger.info("close HiCacheHF3FS")
+
+    @synchronized()
+    def get_stats(self):
+        storage_metrics = StorageMetrics()
+        storage_metrics.prefetch_pgs.extend(self.prefetch_pgs)
+        storage_metrics.backup_pgs.extend(self.backup_pgs)
+        storage_metrics.prefetch_bandwidth.extend(self.prefetch_bandwidth)
+        storage_metrics.backup_bandwidth.extend(self.backup_bandwidth)
+        self.prefetch_pgs.clear()
+        self.backup_pgs.clear()
+        self.prefetch_bandwidth.clear()
+        self.backup_bandwidth.clear()
+        return storage_metrics
diff --git a/python/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py b/python/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py
new file mode 100644
index 000000000..365effdef
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py
@@ -0,0 +1,43 @@
+import multiprocessing.shared_memory
+from pathlib import Path
+
+import pytest
+import torch
+from torch.utils.cpp_extension import load
+from tqdm import tqdm
+
+root = Path(__file__).parent.resolve()
+hf3fs_utils = load(
+    name="hf3fs_utils", sources=[f"{root}/hf3fs_utils.cpp"], verbose=True
+)
+
+
+def test_rw_shm():
+    numel = 8 << 20
+    dtype = torch.bfloat16
+    page_num = 128
+    page_bytes = numel * dtype.itemsize
+    shm = multiprocessing.shared_memory.SharedMemory(
+        size=page_num * page_bytes, create=True
+    )
+    tshm = torch.frombuffer(shm.buf, dtype=torch.uint8)
+    a = [
+        torch.randn(numel, dtype=dtype)
+        for _ in tqdm(range(page_num), desc="prepare input")
+    ]
+    b = [
+        torch.empty(numel, dtype=dtype)
+        for _ in tqdm(range(page_num), desc="prepare output")
+    ]
+    hf3fs_utils.write_shm(a, tshm)
+    hf3fs_utils.read_shm(tshm, b)
+    for _a, _b in tqdm(zip(a, b), desc="assert_close"):
+        torch.testing.assert_close(_a, _b)
+
+    del tshm
+    shm.close()
+    shm.unlink()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/python/sglang/srt/mem_cache/storage/lmcache/README.md b/python/sglang/srt/mem_cache/storage/lmcache/README.md
new file mode 100644
index 000000000..7177e21e5
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/lmcache/README.md
@@ -0,0 +1,43 @@
+# LMCache Connector for SGLang
+
+This document describes how to use LMCache as KV Cache Management Backend for SGLang engine.
+For more details about LMCache, please refer to: https://lmcache.ai
+
+## Install LMCache
+
+### Method 1: with pip
+
+```bash
+pip install lmcache
+```
+
+### Method 2: from source
+
+Clone LMCache project:
+
+```bash
+git clone https://github.com/LMCache/LMCache
+```
+
+Install:
+
+```bash
+cd LMCache
+pip install -e . --no-build-isolation
+```
+
+
+## Use LMCache
+
+Firstly, setup LMCache config. An example config is set at `example_config.yaml`. For more settings please refer to https://docs.lmcache.ai/api_reference/configurations.html.
+
+Secondly, setup SGLang serving engine with lmcache:
+
+```bash
+export LMCACHE_USE_EXPERIMENTAL=True
+export LMCACHE_CONFIG_FILE=example_config.yaml
+
+python -m sglang.launch_server \
+  --model-path MODEL \
+  --enable-lmcache
+```
diff --git a/python/sglang/srt/mem_cache/storage/lmcache/example_config.yaml b/python/sglang/srt/mem_cache/storage/lmcache/example_config.yaml
new file mode 100644
index 000000000..549110b7c
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/lmcache/example_config.yaml
@@ -0,0 +1,7 @@
+# Basic configurations
+chunk_size: 256
+
+# CPU offloading configurations
+local_cpu: true
+use_layerwise: true
+max_local_cpu_size: 10 # number of CPU backend GB
diff --git a/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py b/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py
new file mode 100644
index 000000000..f8690aec4
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py
@@ -0,0 +1,280 @@
+from __future__ import annotations
+
+import logging
+import threading
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import MatchResult
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
+
+try:
+    from lmcache.integration.sglang.sglang_adapter import (
+        LMCacheLayerwiseConnector,
+        LoadMetadata,
+        StoreMetadata,
+    )
+except ImportError as e:
+    raise RuntimeError(
+        "LMCache is not installed. Please install it by running `pip install lmcache`"
+    ) from e
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.managers.schedule_batch import Req
+
+logger = logging.getLogger(__name__)
+
+
+class LayerTransferCounter:
+    """Minimal adapter that lets the memory pool notify LMCache per-layer.
+
+    The KV pool calls `wait_until(layer_id)` after finishing a layer, which we
+    translate into a `load_kv_layerwise(layer_id)` call on the LMCache connector
+    within the provided CUDA stream.
+    """
+
+    def __init__(
+        self,
+        num_layers: int,
+        load_stream: torch.cuda.Stream,
+        lmc_connector: LMCacheLayerwiseConnector,
+        printable: bool = False,
+    ):
+        self.num_layers = num_layers
+        self.load_stream = load_stream
+        self.lmc_connector = lmc_connector
+
+    def wait_until(self, layer_id: int):
+        # Ensure ordering of the async loads wrt compute stream(s).
+        self.load_stream.synchronize()
+        with self.load_stream:
+            self.lmc_connector.load_kv_layerwise(layer_id)
+
+
+class LMCRadixCache(RadixCache):
+    """RadixCache + LMCache IO.
+
+    This subclass adds:
+      - LMCache connector setup (device/host buffers, TP rank/size)
+      - Two CUDA streams for async load/store
+      - Layer-wise transfer executor wiring to the KV cache
+      - Overridden `match_prefix` to fetch missing prefix chunks from LMCache
+      - Extended cache_finalization paths to store back into LMCache
+      - Eviction barrier that respects any in-flight host->device stores
+    """
+
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        page_size: int,
+        disable: bool = False,
+        enable_kv_cache_events: bool = False,
+        model_config: Optional["ModelConfig"] = None,
+        tp_size: int = 1,
+        rank: int = 0,
+        tp_group: Optional[torch.distributed.ProcessGroup] = None,
+    ):
+        super().__init__(
+            req_to_token_pool=req_to_token_pool,
+            token_to_kv_pool_allocator=token_to_kv_pool_allocator,
+            page_size=page_size,
+            disable=disable,
+            enable_kv_cache_events=enable_kv_cache_events,
+        )
+
+        kvcache = self.token_to_kv_pool_allocator.get_kvcache()
+        self.lmcache_connector = LMCacheLayerwiseConnector(
+            sgl_config=model_config,
+            tp_size=tp_size,
+            rank=rank,
+            # NOTE: The original implementation accessed private buffers via
+            # `_kvcache.k_buffer` / `.v_buffer`. We prefer public accessors when
+            # available; fall back to private fields if needed.
+            k_pool=getattr(
+                kvcache,
+                "k_buffer",
+                getattr(self.token_to_kv_pool_allocator._kvcache, "k_buffer"),
+            ),
+            v_pool=getattr(
+                kvcache,
+                "v_buffer",
+                getattr(self.token_to_kv_pool_allocator._kvcache, "v_buffer"),
+            ),
+            tp_group=tp_group,
+        )
+
+        self.load_stream = torch.cuda.Stream()
+        self.store_stream = torch.cuda.Stream()
+
+        self.layer_done_executor = LayerTransferCounter(
+            num_layers=(
+                model_config.num_hidden_layers if model_config is not None else 0
+            ),
+            load_stream=self.load_stream,
+            lmc_connector=self.lmcache_connector,
+        )
+        kvcache.register_layer_transfer_counter(self.layer_done_executor)
+
+        self._in_flight_nodes: list[TreeNode] = []
+        self._node_lock = threading.Lock()
+
+    def reset(self):  # type: ignore[override]
+        super().reset()
+        if hasattr(self, "_in_flight_nodes"):
+            with self._node_lock:
+                self._in_flight_nodes.clear()
+
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:  # type: ignore[override]
+        """Match cached prefix; if there's a tail miss, prefetch from LMCache.
+
+        Reuses the base matching logic to obtain (value, last_node). If there
+        remains a *page-aligned* uncached suffix and there is room (or after
+        eviction), we allocate token slots and trigger an async LMCache load
+        into those slots, then materialize a new child node for the retrieved
+        chunk.
+        """
+        if self.disable or not key:
+            return super().match_prefix(key, **kwargs)
+
+        if self.page_size != 1:
+            aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:aligned_len]
+
+        base_res = super().match_prefix(key, **kwargs)
+        value: torch.Tensor = base_res.device_indices
+        last_node: TreeNode = base_res.last_device_node
+
+        if value.numel() == len(key):
+            return base_res
+
+        uncached_len = len(key) - value.numel()
+        if uncached_len == 0:
+            return base_res
+
+        chunk_size = self.lmcache_connector.chunk_size()
+        prefix_pad = value.numel() % chunk_size
+
+        if self.token_to_kv_pool_allocator.available_size() < uncached_len:
+            self.evict(uncached_len)
+
+        token_slots = self.token_to_kv_pool_allocator.alloc(uncached_len)
+        if token_slots is None:
+            return base_res
+
+        slot_mapping = torch.cat(
+            [
+                torch.full((value.numel(),), -1, dtype=torch.int64, device=self.device),
+                token_slots.detach().clone().to(torch.int64).to(self.device),
+            ]
+        )
+
+        with torch.cuda.stream(self.load_stream):
+            num_retrieved = self.lmcache_connector.start_load_kv(
+                LoadMetadata(
+                    token_ids=key,  # full page-aligned key
+                    slot_mapping=slot_mapping,
+                    offset=value.numel() - prefix_pad,  # LMCache offset convention
+                )
+            )
+        logger.debug("num_retrieved_tokens: %s", num_retrieved)
+
+        if num_retrieved > 0:
+            self.token_to_kv_pool_allocator.free(
+                token_slots[(num_retrieved - prefix_pad) :]
+            )
+        else:
+            self.token_to_kv_pool_allocator.free(token_slots)
+
+        if num_retrieved > 0:
+            fetched = num_retrieved - prefix_pad
+            new_node = TreeNode()
+            start = value.numel()
+            end = start + fetched
+            new_node.key = key[start:end]
+            new_node.value = token_slots[:fetched]
+            new_node.parent = last_node
+            last_node.children[self.get_child_key_fn(new_node.key)] = new_node
+            last_node = new_node
+
+            value = torch.cat([value, token_slots[:fetched]])
+            self.evictable_size_ += fetched
+
+            self._record_store_event(new_node.parent)
+            self._record_store_event(new_node)
+
+            return MatchResult(
+                device_indices=value,
+                last_device_node=last_node,
+                last_host_node=last_node,
+            )
+
+        return base_res
+
+    def cache_finished_req(self, req: "Req") -> None:  # type: ignore[override]
+        """On request completion, insert device KV into radix and store to LMCache."""
+
+        super().cache_finished_req(req)
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        _, new_last_node, _, _ = self.match_prefix(token_ids)
+        assert new_last_node is not None
+
+        self.inc_lock_ref(new_last_node)
+        store_md = StoreMetadata(
+            last_node=new_last_node,
+            token_ids=token_ids,
+            kv_indices=kv_indices,
+            offset=0,
+        )
+        with torch.cuda.stream(self.store_stream):
+            self.lmcache_connector.store_kv(store_md)
+        with self._node_lock:
+            self._in_flight_nodes.append(new_last_node)
+
+    def evict(self, num_tokens: int) -> None:  # type: ignore[override]
+        """Before base eviction, wait for any outstanding stores and release locks."""
+        if self.disable:
+            return
+
+        self.store_stream.synchronize()
+        with self._node_lock:
+            for node in self._in_flight_nodes:
+                self.dec_lock_ref(node)
+            self._in_flight_nodes.clear()
+
+        super().evict(num_tokens)
+
+    def pretty_print(self):  # type: ignore[override]
+        super().pretty_print()
+        try:
+            logger.debug(
+                "evictable=%d protected=%d", self.evictable_size_, self.protected_size_
+            )
+        except Exception:  # pragma: no cover
+            pass
+
+
+if __name__ == "__main__":
+    cache = LMCRadixCache(
+        req_to_token_pool=None,
+        token_to_kv_pool_allocator=None,
+        page_size=1,
+        disable=False,
+        enable_kv_cache_events=False,
+        model_config=None,
+        tp_size=1,
+        rank=0,
+        tp_group=None,
+    )
+    cache.insert([1, 2, 3], torch.tensor([10, 11, 12], dtype=torch.int64))
+    cache.insert([1, 2, 3, 4], torch.tensor([10, 11, 12, 13], dtype=torch.int64))
+    cache.pretty_print()
diff --git a/python/sglang/srt/mem_cache/storage/lmcache/unit_test.py b/python/sglang/srt/mem_cache/storage/lmcache/unit_test.py
new file mode 100644
index 000000000..68dfe939d
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/lmcache/unit_test.py
@@ -0,0 +1,121 @@
+try:
+    from lmcache.integration.sglang.sglang_adapter import (
+        LMCacheLayerwiseConnector,
+        LoadMetadata,
+        StoreMetadata,
+    )
+except ImportError:
+    raise RuntimeError(
+        "LMCache is not installed. Please install it by running `pip install lmcache` in the root directory of LMCache"
+    )
+
+import os
+
+import torch
+
+from sglang.srt.configs.model_config import ModelConfig
+
+os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
+os.environ["LMCACHE_CONFIG_FILE"] = "example_config.yaml"
+
+
+def test_load_store_metadata():
+    model_config = ModelConfig(
+        model_path="Qwen/Qwen3-4B",
+    )
+
+    # Generate Dummy KV Cache
+    head_num = model_config.num_key_value_heads
+    head_dim = model_config.head_dim
+    layer_num = model_config.num_hidden_layers
+    buffer_size = 256
+    input_id_len = 16
+
+    k_buffer = [
+        torch.randn(buffer_size, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+    v_buffer = [
+        torch.randn(buffer_size, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+
+    connector = LMCacheLayerwiseConnector(model_config, 1, 0, k_buffer, v_buffer)
+
+    fake_token_ids = torch.randint(0, model_config.vocab_size, (input_id_len,)).tolist()
+    fake_kv_indices = torch.randint(0, buffer_size, (input_id_len,))
+    offset = 0
+
+    store_metadata = StoreMetadata(
+        last_node=None,
+        token_ids=fake_token_ids,
+        kv_indices=fake_kv_indices,
+        offset=offset,
+    )
+
+    load_metadata = LoadMetadata(
+        token_ids=fake_token_ids,
+        slot_mapping=fake_kv_indices,
+        offset=offset,
+    )
+
+    current_stream = torch.cuda.current_stream()
+
+    retrieve_token_num = connector.start_load_kv(load_metadata)
+    assert retrieve_token_num == 0
+
+    connector.store_kv(store_metadata)
+    current_stream.synchronize()
+
+    # check retrieve
+    gt_key_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+    gt_value_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+
+    for i in range(layer_num):
+        gt_key_buffer[i] = k_buffer[i][fake_kv_indices]
+        gt_value_buffer[i] = v_buffer[i][fake_kv_indices]
+
+    # clear the k_buffer and v_buffer
+    for _ in range(layer_num):
+        k_buffer[i].zero_()
+        v_buffer[i].zero_()
+
+    retrieve_token_num = connector.start_load_kv(load_metadata)
+    assert retrieve_token_num == input_id_len
+
+    for i in range(layer_num):
+        current_stream.synchronize()
+        connector.load_kv_layerwise(i)
+
+    current_stream.synchronize()
+    test_key_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+    test_value_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+
+    for i in range(layer_num):
+        test_key_buffer[i] = k_buffer[i][fake_kv_indices]
+        test_value_buffer[i] = v_buffer[i][fake_kv_indices]
+
+    for i in range(layer_num):
+        assert torch.allclose(test_key_buffer[i], gt_key_buffer[i])
+        assert torch.allclose(test_value_buffer[i], gt_value_buffer[i])
+
+    print("================================================")
+    print("TEST_LOAD_STORE_METADATA PASSED!")
+    print("================================================")
+    connector.close()
+
+
+if __name__ == "__main__":
+    test_load_store_metadata()
diff --git a/python/sglang/srt/mem_cache/storage/mooncake_store/README.md b/python/sglang/srt/mem_cache/storage/mooncake_store/README.md
new file mode 100644
index 000000000..e815122bd
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/mooncake_store/README.md
@@ -0,0 +1,167 @@
+# Mooncake as L3 KV Cache
+
+This document describes how to use Mooncake as the L3 KV cache for SGLang.
+
+## About Mooncake
+
+Mooncake aims to enhance the inference efficiency of large language models (LLMs), especially in slow object storage environments, by constructing a multi-level caching pool on high-speed interconnected DRAM/SSD resources. Compared to traditional caching systems, Mooncake utilizes (GPUDirect) RDMA technology to transfer data directly in a zero-copy manner, while maximizing the use of multi-NIC resources on a single machine.
+
+For more details about Mooncake, please refer to [Mooncake project](https://github.com/kvcache-ai/Mooncake) and [Mooncake documents](https://kvcache-ai.github.io/Mooncake/).
+
+## Install Mooncake
+
+### Method 1: with pip
+
+```bash
+pip install mooncake-transfer-engine
+```
+
+### Method 2: from source
+
+Clone Mooncake project:
+
+```bash
+git clone https://github.com/kvcache-ai/Mooncake --recursive
+```
+
+Install dependencies:
+
+```bash
+cd Mooncake
+bash dependencies.sh
+```
+
+Build the project. For additional build options, please refer to [the official guide](https://kvcache-ai.github.io/Mooncake/getting_started/build.html).
+
+```bash
+mkdir build
+cd build
+cmake ..
+make -j
+```
+
+Install Mooncake:
+
+```bash
+sudo make install
+```
+
+## Deploy Mooncake
+
+**Mooncake** is a distributed system that efficiently aggregates memory resources across multiple servers. It can also be deployed on a single server for simpler setups.
+
+When integrated with **SGLang**, the system conceptually consists of four key components: `the master service`, `metadata service`, `store service`, and the `SGLang server`. Among them, the `master service` and `metadata service` are responsible for object and metadata maintenance. The `store service` manages a contiguous memory segment that contributes to the distributed KV cache, making its memory accessible to both local and remote `SGLang servers`. Data transfer occurs directly between the `store service` and `SGLang servers`, bypassing the `master service`.
+
+### Single Server Deployment
+
+**Launch Mooncake `metadata service`:**
+
+```bash
+python -m mooncake.http_metadata_server
+```
+
+**Launch Mooncake `master service`:**
+
+```bash
+mooncake_master
+```
+
+**Launch Mooncake `store service`:**
+
+First, create and save a configuration file in JSON format. For example:
+
+```json
+{
+    "local_hostname": "localhost",
+    "metadata_server": "http://localhost:8080/metadata",
+    "master_server_address": "localhost:50051",
+    "protocol": "rdma",
+    "device_name": "mlx5_0,mlx5_1",
+    "global_segment_size": 2684354560,
+    "local_buffer_size": 0
+}
+```
+
+Parameter Explanation:
+
+* `local_hostname`: The hostname of the `store service`.
+* `metadata_server`: The network address of the `metadata service`. The default port is 8080.
+* `master_server_address`: The network address of the `master service`. The default port is 50051.
+* `protocol`: The protocol used by the Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended.
+* `device_name`: The RDMA devices used by Mooncake. This parameter is required only when the protocol is set to `"rdma"`. Available devices can be listed using the `ibv_devices` command.
+* `global_segment_size`: The amount of memory (in bytes) contributed to the global memory pool. A larger value allows Mooncake to cache more KV tensors.
+* `local_buffer_size`: Local buffer is used to do request operations such as `Get` or `Put`. In this case, it is set to 0 because the instance functions solely as a storage server, contributing memory to the global pool without issuing any request operations.
+
+Then start the `store service`:
+
+```bash
+python -m mooncake.mooncake_store_service --config=[config_path]
+```
+
+Note: To get started quickly, if `MOONCAKE_GLOBAL_SEGMENT_SIZE` is set to a non-zero value when starting the `SGLang server`, launching the `store service` can be skipped. In this case, the `SGLang server` also fulfills the role of the `store service`.
+
+**Start the `SGLang server` with Mooncake enabled:**
+Mooncake configuration can be provided via environment variables. Note that, for optimal performance, the Mooncake backend currently supports only the `page_first` layout (which optimizes memory access patterns for KV cache operations).
+
+There are two ways to configure Mooncake: 1. Using environment variables; 2. Using extra-config of sglang arguments.
+
+**Using env variables to configure Mooncake**
+
+```bash
+MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \
+MOONCAKE_MASTER=127.0.0.1:50051 \
+MOONCAKE_PROTOCOL="rdma" \
+MOONCAKE_DEVICE="mlx5_0,mlx5_1" \
+MOONCAKE_GLOBAL_SEGMENT_SIZE=4294967296 \
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake\
+    --model-path [model_path]
+```
+
+Parameter Explanation:
+
+* `MOONCAKE_TE_META_DATA_SERVER`: The network address of the `metadata service`. The default port is 8080.
+* `MOONCAKE_MASTER`: The network address of the `master service`. The default port is 50051.
+* `MOONCAKE_PROTOCOL`: The protocol used by Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended.
+* `MOONCAKE_DEVICE`: The RDMA devices used by Mooncake. This parameter is required only when the protocol is set to `"rdma"`. Available devices can be listed using the `ibv_devices` command.
+* `MOONCAKE_GLOBAL_SEGMENT_SIZE`: The amount of memory (in bytes) contributed to the global memory pool. If at least one `store service` is launched, then this value could be set to `0`. In this case, the `SGLang server` will not contribute any memory to the system. Note that KV tensors cached in the contributed memory will be lost once this process terminates; however, this will not cause any system errors.
+
+**Using extra-config of sglang arguments to configure Mooncake**
+
+```bash
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake \
+    --model-path [model_path] \
+    --hicache-storage-backend-extra-config '{"master_server_address": "127.0.0.1:50051", "local_hostname": "localhost", "metadata_server": "http://127.0.0.1:8080/metadata", "global_segment_size": 4294967296, "local_buffer_size": 16777216, "protocol": "rdma", "device_name": "mlx5_0,mlx5_1"}'
+```
+
+**Important: Understanding Global Segment Size**
+
+`global_segment_size` for `store service` and `MOONCAKE_GLOBAL_SEGMENT_SIZE` for `SGLang service`: This parameter specifies the amount of memory each instance contributes to the distributed memory pool. The total memory available for KV cache storage across the cluster is the sum of the memory contributed by all instances.
+
+Adjust this value according to system’s available memory and expected cache requirements.
+
+### Distributed Deployment
+
+Distributed deployment of Mooncake is straightforward. Similar to the single-node setup, start one `metadata service` and one `master service` for this cluster. Then start a `store service` on each server.
+
+Mooncake also supports high availability mode. This mode enhances fault tolerance by running the `master service` as a cluster of multiple master nodes coordinated through an `etcd` cluster. The master nodes use `etcd` to elect a leader, which is responsible for handling client requests. For more details about how to deploy in this mode, please refer to our [documents](https://kvcache-ai.github.io/Mooncake/) .
+
+## Test Mooncake Store
+
+This test is intended for developers to quickly verify that the MooncakeStore class interfaces are functioning correctly.
+
+First, start the `metadata service` and `master service`. Then run the `test_mooncake_store.py`. 16MB global segments size is enough to run this test.
+
+```bash
+MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \
+MOONCAKE_MASTER=127.0.0.1:50051 \
+MOONCAKE_PROTOCOL="rdma" \
+MOONCAKE_DEVICE="mlx5_0,mlx5_1" \
+MOONCAKE_GLOBAL_SEGMENT_SIZE=16777216 \
+python3 [path of test_mooncake_store.py]
+```
+
+If all tests pass, the message "✅ All tests passed" will be printed at the end.
diff --git a/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py b/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py
new file mode 100644
index 000000000..caab04b5c
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py
@@ -0,0 +1,324 @@
+import json
+import logging
+import os
+import uuid
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
+
+DEFAULT_GLOBAL_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024  # 4 GiB
+DEFAULT_LOCAL_BUFFER_SIZE = 16 * 1024 * 1024  # 16 MB
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class MooncakeStoreConfig:
+    local_hostname: str
+    metadata_server: str
+    global_segment_size: int
+    local_buffer_size: int
+    protocol: str
+    device_name: str
+    master_server_address: str
+
+    @staticmethod
+    def from_file() -> "MooncakeStoreConfig":
+        """Load the config from a JSON file."""
+        file_path = os.getenv("MOONCAKE_CONFIG_PATH")
+        if file_path is None:
+            raise ValueError(
+                "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
+            )
+        with open(file_path) as fin:
+            config = json.load(fin)
+        return MooncakeStoreConfig(
+            local_hostname=config.get("local_hostname"),
+            metadata_server=config.get("metadata_server"),
+            global_segment_size=config.get(
+                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
+            ),
+            # Zero copy interface does not need local buffer
+            local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE,
+            protocol=config.get("protocol", "tcp"),
+            device_name=config.get("device_name", "auto"),
+            master_server_address=config.get("master_server_address"),
+        )
+
+    @staticmethod
+    def load_from_env() -> "MooncakeStoreConfig":
+        """Load config from a file specified in the environment variable.
+        export MOONCAKE_MASTER=10.13.3.232:50051
+        export MOONCAKE_PROTOCOL="rdma"
+        export MOONCAKE_DEVICE="auto"
+        export MOONCAKE_TE_META_DATA_SERVER="P2PHANDSHAKE"
+        """
+        # other required environment variables...
+        if not os.getenv("MOONCAKE_MASTER"):
+            raise ValueError("The environment variable 'MOONCAKE_MASTER' is not set.")
+        return MooncakeStoreConfig(
+            local_hostname=os.getenv("LOCAL_HOSTNAME", "localhost"),
+            metadata_server=os.getenv("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE"),
+            global_segment_size=int(
+                os.getenv("MOONCAKE_GLOBAL_SEGMENT_SIZE", DEFAULT_GLOBAL_SEGMENT_SIZE)
+            ),
+            # Zero copy interface does not need local buffer
+            local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE,
+            protocol=os.getenv("MOONCAKE_PROTOCOL", "tcp"),
+            device_name=os.getenv("MOONCAKE_DEVICE", "auto"),
+            master_server_address=os.getenv("MOONCAKE_MASTER"),
+        )
+
+    @staticmethod
+    def load_from_extra_config(extra_config: dict) -> "MooncakeStoreConfig":
+        """Load config from extra_config dictionary."""
+        if "master_server_address" not in extra_config:
+            raise ValueError("master_server_address is required in extra_config")
+
+        return MooncakeStoreConfig(
+            local_hostname=extra_config.get("local_hostname", "localhost"),
+            metadata_server=extra_config.get("metadata_server", "P2PHANDSHAKE"),
+            global_segment_size=extra_config.get(
+                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
+            ),
+            local_buffer_size=extra_config.get(
+                "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE
+            ),
+            protocol=extra_config.get("protocol", "tcp"),
+            device_name=extra_config.get("device_name", "auto"),
+            master_server_address=extra_config["master_server_address"],
+        )
+
+    def __post_init__(self):
+        if self.device_name == "auto":
+            os.environ["MC_MS_AUTO_DISC"] = "1"
+            os.environ["MC_MS_FILTERS"] = (
+                "mlx5_bond_0, mlx5_bond_1, mlx5_bond_2, mlx5_bond_3"
+            )
+
+
+class MooncakeStore(HiCacheStorage):
+    def __init__(self, storage_config: HiCacheStorageConfig = None):
+        try:
+            from mooncake.store import MooncakeDistributedStore
+        except ImportError as e:
+            raise ImportError(
+                "Please install mooncake by following the instructions at "
+                "https://kvcache-ai.github.io/Mooncake/getting_started/build.html"
+                "to run SGLang with MooncakeConnector."
+            ) from e
+
+        try:
+            self.store = MooncakeDistributedStore()
+
+            extra_config = (
+                getattr(storage_config, "extra_config", None)
+                if storage_config
+                else None
+            )
+            # Load configuration with master_server_address prioritized from extra_config if available
+            if (
+                extra_config is not None
+                and extra_config.get("master_server_address") is not None
+            ):
+                # Load from extra_config
+                self.config = MooncakeStoreConfig.load_from_extra_config(extra_config)
+                logger.info(
+                    "Mooncake Configuration loaded from extra_config successfully."
+                )
+            else:
+                # Load from environment variables
+                self.config = MooncakeStoreConfig.load_from_env()
+                logger.info("Mooncake Configuration loaded from env successfully.")
+
+            ret_code = self.store.setup(
+                self.config.local_hostname,
+                self.config.metadata_server,
+                self.config.global_segment_size,
+                self.config.local_buffer_size,
+                self.config.protocol,
+                self.config.device_name,
+                self.config.master_server_address,
+            )
+            if ret_code:
+                logger.error(f"failed to setup mooncake store, error code: {ret_code}")
+
+            logger.info("Connect to Mooncake store successfully.")
+            self.warmup()
+            logger.info("Mooncake store warmup successfully.")
+
+            if storage_config is not None:
+                self.is_mla_backend = storage_config.is_mla_model
+                self.local_rank = storage_config.tp_rank
+            else:
+                self.is_mla_backend = False
+                self.local_rank = 0
+
+        except ValueError as e:
+            logger.error("Configuration loading failed: %s", e)
+            raise
+        except Exception as exc:
+            logger.error("An error occurred while loading the configuration: %s", exc)
+            raise
+
+    def warmup(self):
+        warmup_key = "sglang_mooncake_store_warmup_key" + uuid.uuid4().hex
+        warmup_value = bytes(4 * 1024)  # 4 KB
+        assert self.store.put(warmup_key, warmup_value) == 0
+        assert self.store.is_exist(warmup_key) == 1
+        assert self.store.get(warmup_key) == warmup_value
+
+    def register_buffer(self, buffer: torch.Tensor) -> None:
+        try:
+            buffer_ptr = buffer.data_ptr()
+            buffer_size = buffer.numel() * buffer.element_size()
+            ret_code = self.store.register_buffer(buffer_ptr, buffer_size)
+            if ret_code:
+                logger.error(f"failed to register buffer, error code: {ret_code}")
+        except TypeError as err:
+            logger.error("Failed to register buffer to Mooncake Store: %s", err)
+            raise TypeError("Mooncake Store Register Buffer Error.") from err
+
+    def set(
+        self,
+        key,
+        value: Optional[Any] = None,
+        target_location: Optional[List[int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> bool:
+        # Only support zero copy set for now
+        assert target_location is not None and target_sizes is not None
+        exist_result = self._batch_exist([key])
+        if exist_result[0] == 1:
+            return True
+        put_result = self._put_batch_zero_copy_impl(
+            [key], [target_location], [target_sizes]
+        )
+        return put_result[0] == 0
+
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[List[torch.Tensor]] = None,
+        target_locations: Optional[List[int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> bool:
+        # Only support zero copy set for now
+        assert target_locations is not None and target_sizes is not None
+        assert len(keys) == len(target_locations) == len(target_sizes)
+
+        if len(keys) == 0:
+            return False
+
+        for i in range(len(keys)):
+            if (
+                keys[i] is None
+                or target_locations[i] is None
+                or target_sizes[i] is None
+            ):
+                return False
+
+        exist_result = self._batch_exist(keys)
+        set_keys = []
+        set_target_locations = []
+        set_target_sizes = []
+        set_indices = []
+        for i in range(len(keys)):
+            if exist_result[i] != 1:
+                set_keys.append(keys[i])
+                set_target_locations.append(target_locations[i])
+                set_target_sizes.append(target_sizes[i])
+                set_indices.append(i)
+        # Only set non-existing keys to storage
+        put_result = self._put_batch_zero_copy_impl(
+            set_keys, set_target_locations, set_target_sizes
+        )
+        for i in range(len(set_indices)):
+            if put_result[i] == 0:
+                exist_result[set_indices[i]] = 1
+
+        success_count = 0
+        for i in range(len(keys)):
+            if exist_result[i] == 0:
+                break
+            success_count += 1
+        # TODO: return the number of consecutive successful operations from the start.
+        return success_count == len(keys)
+
+    def get(
+        self,
+        key,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        assert target_location is not None and target_sizes is not None
+        get_result = self._get_batch_zero_copy_impl(
+            [key], [target_location], [target_sizes]
+        )
+        return get_result[0] >= 0
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> int:
+        assert len(keys) == len(target_locations) == len(target_sizes)
+        if len(keys) == 0:
+            return 0
+        get_result = self._get_batch_zero_copy_impl(
+            keys, target_locations, target_sizes
+        )
+        if self.is_mla_backend:
+            key_multiplier = 1
+        else:
+            key_multiplier = 2
+        for i in range(len(keys)):
+            if get_result[i] < 0:
+                return i // key_multiplier
+        return len(keys) // key_multiplier
+
+    def exists(self, key) -> bool:
+        exist_result = self._batch_exist([key])
+        return exist_result[0] == 1
+
+    def batch_exists(self, keys) -> int:
+        if self.is_mla_backend:
+            query_keys = [f"{key}_k" for key in keys]
+            key_multiplier = 1
+        else:
+            query_keys = []
+            for key in keys:
+                query_keys.append(f"{key}_{self.local_rank}_k")
+                query_keys.append(f"{key}_{self.local_rank}_v")
+            key_multiplier = 2
+
+        exist_result = self._batch_exist(query_keys)
+        for i in range(len(query_keys)):
+            if exist_result[i] != 1:
+                return i // key_multiplier
+        return len(query_keys) // key_multiplier
+
+    def close(self):
+        # MooncakeDistributedStore will automatically call the destructor, so
+        # it is unnecessary to close it manually.
+        pass
+
+    def clear(self) -> None:
+        self.store.remove_all()
+
+    def _put_batch_zero_copy_impl(
+        self, key_strs: List[str], buffer_ptrs: List[int], buffer_sizes: List[int]
+    ) -> List[int]:
+        return self.store.batch_put_from(key_strs, buffer_ptrs, buffer_sizes)
+
+    def _get_batch_zero_copy_impl(
+        self, key_strs: List[str], buffer_ptrs: List[int], buffer_sizes: List[int]
+    ) -> List[int]:
+        return self.store.batch_get_into(key_strs, buffer_ptrs, buffer_sizes)
+
+    def _batch_exist(self, key_strs: List[str]) -> List[int]:
+        return self.store.batch_is_exist(key_strs)
diff --git a/python/sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py b/python/sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py
new file mode 100644
index 000000000..3083abe22
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py
@@ -0,0 +1,161 @@
+import logging
+import uuid
+
+import torch
+from mooncake_store import MooncakeStore
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_batch_query_keys(kv_num: int, config: HiCacheStorageConfig):
+    keys = []
+    for _ in range(kv_num):
+        key = "test_" + str(uuid.uuid4())
+        keys.append(key)
+    set_keys = []
+    for key in keys:
+        if config.is_mla_model:
+            set_keys.append(key + "_k")
+        else:
+            set_keys.append(key + f"_{config.tp_rank}_k")
+            set_keys.append(key + f"_{config.tp_rank}_v")
+    get_keys = set_keys
+    exist_keys = keys
+    return set_keys, get_keys, exist_keys
+
+
+def test_single_operation():
+    """Test the set API with a single key-value pair."""
+    print("=" * 100)
+    print("Testing single operation")
+
+    buffer_size = 1024 * 1024 * 16  # 16MB
+    value_elements = 1024
+    store = MooncakeStore()
+    buffer = torch.randn(buffer_size, dtype=torch.float32)
+    store.register_buffer(buffer)
+    value_size = value_elements * buffer.element_size()
+
+    key = str(uuid.uuid4())
+    set_slice = buffer[:value_elements]
+    get_slice = buffer[value_elements : 2 * value_elements]
+    set_location = set_slice.data_ptr()
+    get_location = get_slice.data_ptr()
+
+    # Test set operation
+    result = store.set(key, target_location=set_location, target_sizes=value_size)
+    assert result is True, f"❌set operation failed for key: {key}"
+
+    # Test exists operation
+    assert store.exists(key), f"❌key {key} should exist after set operation"
+
+    # Test get operation
+    result = store.get(key, target_location=get_location, target_sizes=value_size)
+    assert result is True, f"❌get operation failed for key: {key}"
+
+    # Compare the data using proper tensor indices
+    assert torch.allclose(
+        set_slice, get_slice, atol=1e-6
+    ), f"❌get operation failed for key: {key}"
+
+    logger.info(f"✅ Single operation passed")
+
+
+def test_batch_operation(config: HiCacheStorageConfig):
+    """Test the batch set/get APIs with multiple key-value pairs."""
+    print("=" * 100)
+    print(f"Testing batch operation with config: {config}")
+
+    buffer_size = 1024 * 1024 * 16  # 16MB
+    value_elements = 256
+    kv_num = 13
+    store = MooncakeStore(config)
+    buffer = torch.randn(buffer_size, dtype=torch.float32)
+    store.register_buffer(buffer)
+    value_size = value_elements * buffer.element_size()
+
+    set_keys, get_keys, exist_keys = generate_batch_query_keys(kv_num, config)
+    set_slices = [
+        buffer[i * value_elements : (i + 1) * value_elements]
+        for i in range(len(set_keys))
+    ]
+    set_locations = [set_slice.data_ptr() for set_slice in set_slices]
+    target_sizes = [value_size for _ in range(len(set_keys))]
+
+    # Test batch set operation
+    result = store.batch_set(
+        set_keys, target_locations=set_locations, target_sizes=target_sizes
+    )
+    assert result is True, f"❌batch set operation failed"
+
+    # Test batch exists operation
+    assert store.batch_exists(
+        exist_keys
+    ), f"❌keys should exist after batch set operation"
+
+    # Test batch get operation
+    get_slices = [
+        buffer[
+            (len(set_keys) + i)
+            * value_elements : (len(set_keys) + i + 1)
+            * value_elements
+        ]
+        for i in range(len(get_keys))
+    ]
+    get_locations = [get_slice.data_ptr() for get_slice in get_slices]
+    result = store.batch_get(
+        get_keys, target_locations=get_locations, target_sizes=target_sizes
+    )
+    assert result == kv_num, f"❌batch get operation failed"
+    for i in range(len(get_keys)):
+        assert torch.allclose(
+            set_slices[i], get_slices[i], atol=1e-6
+        ), f"❌batch get operation failed for key: {get_keys[i]}"
+
+    logger.info(f"✅ Batch operation passed")
+
+
+if __name__ == "__main__":
+    test_single_operation()
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=False,
+            tp_rank=0,
+            tp_size=1,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=True,
+            tp_rank=0,
+            tp_size=1,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=False,
+            tp_rank=1,
+            tp_size=4,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=True,
+            tp_rank=3,
+            tp_size=8,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    logger.info(f"✅ All tests passed")
diff --git a/python/sglang/srt/mem_cache/storage/nixl/README.md b/python/sglang/srt/mem_cache/storage/nixl/README.md
new file mode 100644
index 000000000..d33cd5d05
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/nixl/README.md
@@ -0,0 +1,172 @@
+# NIXL Integration for HiCache
+
+This directory contains the **NIXL (NVIDIA Inference Xfer Library)** integration for **HiCache**, enabling high-performance storage across multiple backends.
+
+NIXL provides a unified API for accessing various storage plugins, including but not limited to:
+
+- **Deepseek's 3FS APIs** for high-throughput file operations
+- **GPU Direct Storage (GDS)** for direct data movement between storage and GPU memory, bypassing CPU memory copies
+- **Amazon S3-compatible object storage** for key-value access patterns
+
+Additional backend integrations are planned for future releases.
+
+## NIXL Resources
+
+- **Project Repository**: [NIXL on GitHub](https://github.com/ai-dynamo/nixl)
+- **Documentation**: [NIXL Documentation](https://github.com/ai-dynamo/nixl/tree/main/docs)
+
+## Overview
+
+The NIXL integration consists of two main files:
+
+- **`hicache_nixl.py`** - Main HiCache storage connector using NIXL
+- **`nixl_utils.py`** - Utility classes for backend selection, registration, and file management
+
+## Components
+
+### HiCacheNixl
+The main storage connector that provides:
+- Single and batch tensor set/get operations
+- Automatic backend selection (3FS > POSIX > GDS_MT > GDS > OBJ)
+- High-performance file-based (or) object based storage access using NIXL
+
+### NixlUtils
+Consolidated utility classes:
+- **NixlBackendSelection** - Handles backend selection and creation
+- **NixlRegistration** - Manages memory registration for tensors, files and objects
+- **NixlFileManager** - Handles file system operations and NIXL tuple creation
+
+## Using NIXL for HiCache backend
+When running the SGLang server, indicate `nixl` for `hicache-storage-backend` parameter, for instance:
+
+```bash
+python3 -m sglang.launch_server --model-path <model> --host <ip> --port <port> --page-size 64 --enable-hierarchical-cache --hicache-ratio 2 --hicache-size 64 --hicache-write-policy write_through --hicache-storage-backend nixl
+```
+
+To customize the base directory for files, you can set the following environment variable:
+
+```bash
+export SGLANG_HICACHE_NIXL_BACKEND_STORAGE_DIR=/path/to/desired/dir
+```
+
+Selection of any storage backend like 3FS requires availability of that library on the system, and the backend is selected based on the priority mentioned above.
+
+## Running Unit Tests
+
+### Prerequisites
+- NIXL library installed and available (latest main required for supporting object query)
+- PyTorch installed
+- Python 3.8+
+
+### Unit tests from current directory
+From the current directory run:
+
+#### Run all NIXL tests:
+```bash
+PYTHONPATH=. python -m pytest test_hicache_nixl_storage.py -o asyncio_mode=strict
+```
+
+#### Run with verbose output:
+```bash
+PYTHONPATH=. python -m pytest test_hicache_nixl_storage.py -v -o asyncio_mode=strict
+```
+
+Note: The `-v` flag provides more detailed output, showing each test case name and its result.
+
+#### Run a specific test:
+```bash
+PYTHONPATH=. python -m pytest test_hicache_nixl_storage.py -v -k test_single_set_get -o asyncio_mode=strict
+```
+
+Note: The `-o asyncio_mode=strict` flag is added to suppress warnings about asyncio configuration. This is not required for test functionality but provides cleaner output.
+
+## Test Coverage
+
+Tests for this integration, a test suite can be found at `test_hicache_nixl_storage.py` which covers:
+
+### HiCache Integration Tests (4 tests)
+- Single tensor set/get operations
+- Batch tensor set/get operations
+- Mixed single and batch operations
+- Data integrity for various tensor types
+
+### File Management Tests (5 tests)
+- Basic file operations
+- NIXL tuple creation
+- Error handling in file operations
+
+### Registration Tests (2 tests)
+- Tensor registration with memory type detection
+- File registration using NIXL tuples
+
+## Expected Output
+
+When tests run successfully, you should see:
+- NIXL agent initialization messages
+- Backend selection messages (e.g., "Backend POSIX was instantiated")
+- Test results with "ok" for passed tests
+- Summary showing "Ran X tests in Y seconds" and "OK"
+
+## Troubleshooting
+
+### Import Errors
+If you encounter `ModuleNotFoundError`, ensure:
+- You're running from the correct directory
+- `PYTHONPATH` is set correctly
+- NIXL library is properly installed
+
+### NIXL Errors
+If NIXL operations fail:
+- Check that NIXL is properly installed
+- Verify that required plugins are available
+- Ensure file permissions are correct for test directories
+
+## File Structure
+
+```
+python/sglang/srt/mem_cache/nixl/
+├── hicache_nixl.py          # Main HiCache storage connector
+├── nixl_utils.py            # All NIXL utility classes
+├── README.md                # This file
+└── tests/
+    └── test_nixl_unified.py # All tests in one file
+```
+
+## Dependencies
+
+- **NIXL**: NVIDIA Inference Xfer Library (version 0.4 or later)
+  - Required plugins: POSIX (minimum), 3FS/GDS (optional for better performance)
+  - See [NIXL Installation Guide](https://github.com/ai-dynamo/nixl/blob/main/README.md#installation)
+- **PyTorch**: For tensor operations (version 1.8 or later)
+- **Python 3.8+**: For type hints and modern features
+
+## Supported Features
+
+### Memory Types
+- **Tensor side**: multi-dimensional tensors of all numeric types (int32, int64, float32, float64) are supported.
+  - Tensors can be on CPU or GPU (as long as a GPU capable backend such as GDS_MT is available).
+  - Currently each tensor is mapped to a file or key, but it can be extended to support multiple keys per file or key.
+
+- **Storage side**: file and object are supported through their relevant backends (e.g., 3FS or OBJ).
+
+### Backend Priority
+
+The NIXL backend selection follows this priority order:
+1. **3FS** - Highest performance (if available)
+    - Best for high-throughput file operations using Deepseek 3FS APIs
+2. **POSIX** - Standard file I/O (fallback)
+    - Universal compatibility
+    - Good for development and testing - Leverages both libaio/liburing
+3. **GDS_MT** - Multi-threaded GDS (if available)
+    - Optimized for concurrent operations
+    - Supports GPU Direct storage with multiple light weight threads
+4. **GDS** - GPU Direct Storage (if available)
+    - Direct GPU-storage data path
+    - Best for filesystems benefiting from batch operations and smaller IOs.
+5. **OBJ** - Amazon S3 based Object Storage
+    - Key-value based storage
+The system automatically selects the best available backend, with POSIX as the default fallback.
+
+## Note
+
+This is v0 of the NIXL connector. Future versions will focus on further performance optimizations such as memory pre-registration (pre-allocating and registering memory buffers to reduce registration overhead during transfers) and block merging (combining related blocks as offsets within the same file to reduce file operations and improve throughput). These optimizations require changes at a higher layer, as the current HiCache API doesn't expose information like block relationships or hash patterns that would enable these optimizations.
diff --git a/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py b/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py
new file mode 100644
index 000000000..327c90502
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py
@@ -0,0 +1,257 @@
+import hashlib
+import logging
+import os
+import time
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorage
+
+from .nixl_utils import NixlBackendSelection, NixlFileManager, NixlRegistration
+
+try:
+    from nixl._api import nixl_agent, nixl_agent_config
+except ImportError as e:
+    raise ImportError(
+        "Please install NIXL by following the instructions at "
+        "https://github.com/ai-dynamo/nixl/blob/main/README.md "
+        "to use HiCacheNixl storage backend."
+    ) from e
+
+logger = logging.getLogger(__name__)
+
+
+class HiCacheNixl(HiCacheStorage):
+    """HiCacheNixl provides high-performance storage using NIXL plugins."""
+
+    def __init__(self, file_path: str = "/tmp/hicache_storage", plugin: str = "auto"):
+        """Initialize NIXL storage connector."""
+        # Might be better to be unified across HiCache backends and moved to HiCacheController
+        file_path = os.getenv("SGLANG_HICACHE_NIXL_BACKEND_STORAGE_DIR", file_path)
+        self.file_manager = (
+            NixlFileManager(file_path)
+            if plugin not in NixlBackendSelection.OBJ_PLUGINS
+            else None
+        )
+
+        agent_config = nixl_agent_config(backends=[])
+        self.agent_name = f"hicache_nixl_{str(uuid.uuid4())}"
+        self.agent = nixl_agent(self.agent_name, agent_config)
+
+        self.backend_selector = NixlBackendSelection(plugin)
+        if not self.backend_selector.create_backend(self.agent):
+            raise RuntimeError("Failed to create NIXL backend")
+
+        self.registration = NixlRegistration(self.agent)
+
+    def register_buffers(
+        self, buffers: Union[torch.Tensor, List[torch.Tensor], List[tuple]]
+    ) -> Optional[Any]:
+        """Register tensor(s) or target locations in host memory (list of addr,len tuples) with NIXL."""
+        if isinstance(buffers[0], tuple):
+            tuples = [(x[0], x[1], 0, "") for x in buffers]
+            return self.registration._register_memory(tuples, "DRAM")
+        else:
+            return self.registration._register_memory(buffers)
+
+    def register_files(
+        self, file_paths: List[str], open_file: Optional[bool] = True
+    ) -> Optional[Any]:
+        """Register files with NIXL."""
+        tuples = self.file_manager.files_to_nixl_tuples(file_paths)
+        return self.registration._register_memory(tuples, "FILE")
+
+    def register_objects(
+        self, keys: List[str], sizes: Optional[List[int]] = None
+    ) -> Optional[Any]:
+        """Register objects with NIXL."""
+        if not keys:
+            return None
+        tuples = [(0, 0, key, "") for key in keys]
+        return self.registration._register_memory(tuples, "OBJ")
+
+    def _execute_transfer(
+        self,
+        buffers: Optional[List[torch.Tensor | tuple]],
+        keys: List[str],
+        direction: str,
+    ) -> bool:
+        if len(buffers) != len(keys):
+            logger.error("Mismatch between number of tensors/buffers and files/objects")
+            return False
+
+        # Registering file and object keys per transfer, to be updated when
+        # pre-registration for file and object is added to HiCache.
+        if self.backend_selector.mem_type == "FILE":
+            tuples = self.file_manager.files_to_nixl_tuples(keys)
+            if not tuples or not self.registration._register_memory(tuples, "FILE"):
+                logger.error("Failed to prepare files for transfer")
+                return False
+        else:  # mem_type == "OBJ"
+            tuples = [(0, 0, key, "") for key in keys]
+            if not tuples or not self.registration._register_memory(tuples, "OBJ"):
+                logger.error("Failed to register objects")
+                return False
+
+        # Prepare transfer descriptors
+        if isinstance(buffers[0], torch.Tensor):
+            tensor_sizes = [
+                tensor.element_size() * tensor.numel() for tensor in buffers
+            ]
+            storage_tuples = [(x[0], s, x[2]) for x, s in zip(tuples, tensor_sizes)]
+            host_descs = self.agent.get_xfer_descs(buffers)
+        elif isinstance(buffers[0], tuple):
+            storage_tuples = [(x[0], y[1], x[2]) for x, y in zip(tuples, buffers)]
+            host_descs = self.agent.get_xfer_descs(
+                [(x[0], x[1], 0) for x in buffers], "DRAM"
+            )
+        else:
+            return False
+
+        storage_descs = self.agent.get_xfer_descs(
+            storage_tuples, self.backend_selector.mem_type
+        )
+
+        if (host_descs is None) or (storage_descs is None):
+            logger.error("Failed to get transfer descriptors")
+            return False
+
+        # Initialize transfer, default assumption that tensor was registered
+        try:
+            xfer_req = self.agent.initialize_xfer(
+                direction, host_descs, storage_descs, self.agent_name
+            )
+        except Exception:
+            # Check if it was due to missing pre-registration
+            if not self.register_buffers(buffers):
+                logger.error("Failed to register tensors/buffers")
+                return False
+
+            try:
+                xfer_req = self.agent.initialize_xfer(
+                    direction, host_descs, storage_descs, self.agent_name
+                )
+            except Exception as e:
+                logger.error(f"Failed to create transfer request: {e}")
+                return False
+
+        # Execute transfer and wait for its completion
+        try:
+            state = self.agent.transfer(xfer_req)
+            while state != "DONE":
+                state = self.agent.check_xfer_state(xfer_req)
+                if state == "ERR":
+                    self.agent.release_xfer_handle(xfer_req)
+                    logger.error("Transfer failed")
+                    return False
+                time.sleep(0.0001)  # Can be changed to os.sched_yield() or parametrized
+
+            self.agent.release_xfer_handle(xfer_req)
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to execute transfer: {e}")
+            import traceback
+
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return False
+
+    def get(
+        self,
+        key: str,
+        target_location: Optional[torch.Tensor | int] = None,
+        target_sizes: Optional[int] = None,
+    ) -> torch.Tensor | None:
+        # To be removed, being compatible with the current API
+        if target_location is None:
+            return None
+        if target_sizes:
+            result = self.batch_get([key], [target_location], [target_sizes])
+        else:
+            result = self.batch_get([key], [target_location])
+        return result[0] if result else None
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[List[torch.Tensor | int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> List[torch.Tensor | None]:
+        if not keys:
+            return []
+
+        # To be removed, being compatible with the current API
+        if not target_locations:
+            return [None] * len(keys)
+
+        if target_sizes and (len(target_sizes) != len(target_locations)):
+            logger.error("Mismatch between number of target_locations and target_sizes")
+            return [None] * len(keys)
+        if target_sizes:
+            dest = list(zip(target_locations, target_sizes))
+        else:
+            dest = target_locations
+
+        if self.backend_selector.mem_type == "FILE":
+            file_paths = [self.file_manager.get_file_path(key) for key in keys]
+            success = self._execute_transfer(dest, file_paths, "READ")
+        else:
+            success = self._execute_transfer(dest, keys, "READ")
+        return target_locations if success and not target_sizes else [None] * len(keys)
+
+    def set(
+        self,
+        key: str,
+        value: Optional[torch.Tensor] = None,
+        target_location: Optional[int] = None,
+        target_sizes: Optional[int] = None,
+    ) -> bool:
+        if target_location and target_sizes:
+            return self.batch_set([key], None, [target_location], [target_sizes])
+        else:
+            return self.batch_set([key], [value])
+
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[List[torch.Tensor]] = None,
+        target_locations: Optional[List[int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> bool:
+        if not keys or (not values and (not target_locations or not target_sizes)):
+            logger.error("Keys or values were not passed")
+            return False
+
+        if not values:
+            values = list(zip(target_locations, target_sizes))
+
+        if self.backend_selector.mem_type == "FILE":
+            file_paths = []
+            for key in keys:
+                file_path = self.file_manager.get_file_path(key)
+                # New file per set, to be updated when partial writes is added to HiCache
+                if not self.file_manager.create_file(file_path):
+                    logger.error(f"Failed to create file {file_path}")
+                    return False
+                file_paths.append(file_path)
+            return self._execute_transfer(values, file_paths, "WRITE")
+        else:  # mem_type == "OBJ"
+            return self._execute_transfer(values, keys, "WRITE")
+
+    def exists(self, key: str) -> bool:
+        tuples = self.registration.create_query_tuples(
+            key,
+            self.backend_selector.mem_type,
+            self.file_manager if self.backend_selector.mem_type == "FILE" else None,
+        )
+        if not tuples:
+            return False
+
+        query_res = self.agent.query_memory(
+            tuples,
+            self.backend_selector.backend_name,
+            mem_type=self.backend_selector.mem_type,
+        )
+        return query_res[0] is not None  # can be expanded to multiple keys
diff --git a/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py b/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py
new file mode 100644
index 000000000..6e3d2a900
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py
@@ -0,0 +1,204 @@
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class NixlBackendSelection:
+    """Handles NIXL backend selection and creation."""
+
+    # Priority order for File-based plugins in case of auto selection
+    FILE_PLUGINS = ["3FS", "POSIX", "GDS_MT", "GDS"]
+    # Priority order for File-based plugins in case of auto selection (add more as needed)
+    OBJ_PLUGINS = ["OBJ"]  # Based on Amazon S3 SDK
+
+    def __init__(self, plugin: str = "auto"):
+        """Initialize backend selection.
+        Args:
+            plugin: Plugin to use (default "auto" selects best available).
+                   Can be a file plugin (3FS, POSIX, GDS, GDS_MT) or
+                   an object plugin (OBJ).
+        """
+        self.plugin = plugin
+        self.backend_name = None
+        self.mem_type = None
+
+    def set_bucket(self, bucket_name: str) -> None:
+        """Set AWS bucket name in environment variable."""
+        os.environ["AWS_DEFAULT_BUCKET"] = bucket_name
+        logger.debug(f"Set AWS bucket name to: {bucket_name}")
+
+    def create_backend(self, agent) -> bool:
+        """Create the appropriate NIXL backend based on configuration."""
+        try:
+            plugin_list = agent.get_plugin_list()
+            logger.debug(f"Available NIXL plugins: {plugin_list}")
+
+            # Handle explicit plugin selection or auto priority
+            if self.plugin == "auto":
+                # Try all file plugins first
+                for plugin in self.FILE_PLUGINS:
+                    if plugin in plugin_list:
+                        self.backend_name = plugin
+                        break
+                # If no file plugin found, try object plugins
+                if not self.backend_name:
+                    for plugin in self.OBJ_PLUGINS:
+                        if plugin in plugin_list:
+                            self.backend_name = plugin
+                            break
+            else:
+                # Use explicitly requested plugin
+                self.backend_name = self.plugin
+
+            if self.backend_name not in plugin_list:
+                logger.error(
+                    f"Backend {self.backend_name} not available in plugins: {plugin_list}"
+                )
+                return False
+
+            # Create backend and set memory type
+            if self.backend_name in self.OBJ_PLUGINS:
+                bucket = os.environ.get("AWS_DEFAULT_BUCKET")
+                if not bucket:
+                    logger.error(
+                        "AWS_DEFAULT_BUCKET environment variable must be set for object storage"
+                    )
+                    return False
+                agent.create_backend(self.backend_name, {"bucket": bucket})
+            else:
+                agent.create_backend(self.backend_name)
+
+            self.mem_type = "OBJ" if self.backend_name in self.OBJ_PLUGINS else "FILE"
+            logger.debug(
+                f"Created NIXL backend: {self.backend_name} with memory type: {self.mem_type}"
+            )
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to create NIXL backend: {e}")
+            return False
+
+
+class NixlRegistration:
+    """Handles NIXL memory registration."""
+
+    def __init__(self, agent):
+        self.agent = agent
+
+    def create_query_tuples(
+        self, key: str, mem_type: str, file_manager=None
+    ) -> List[Tuple]:
+        """Create NIXL tuples for querying memory.
+        Args:
+            key: Key to query (file path for FILE or object key for OBJ)
+            mem_type: Memory type ("FILE" or "OBJ")
+            file_manager: Optional NixlFileManager for FILE memory type
+        Returns:
+            List of NIXL tuples for querying
+        """
+        if mem_type == "FILE":
+            if file_manager is None:
+                logger.error("file_manager required for FILE memory type")
+                return []
+            return [(0, 0, 0, file_manager.get_file_path(key))]
+        else:  # OBJ
+            return [(0, 0, key)]
+
+    def _register_memory(
+        self,
+        items: Union[List[tuple], torch.Tensor, List[torch.Tensor]],
+        mem_type: Optional[str] = None,
+    ) -> Optional[Any]:
+        """Common registration logic for files, objects, and buffers.
+        Args:
+            items: List of tuples or tensors to register
+            mem_type: Memory type ("FILE", "OBJ") or None for tensor or list of tensors
+        """
+        if isinstance(items, list) and not items:
+            return None
+
+        reg_descs = self.agent.get_reg_descs(items, mem_type)
+        if reg_descs is None:
+            logger.error("Failed to create registration descriptors")
+            return None
+
+        try:
+            registered_memory = self.agent.register_memory(reg_descs)
+            return registered_memory  # Could be None in case of error
+        except Exception as e:
+            if not mem_type:
+                logger.error(f"Failed to register Tensors with NIXL: {e}")
+            else:
+                logger.error(
+                    f"Failed to register memory of type {mem_type} with NIXL: {e}"
+                )
+            return None
+
+
+class NixlFileManager:
+    """Handles file system operations for NIXL."""
+
+    def __init__(self, base_dir: str):
+        """
+        Initialize file manager.
+        Args:
+            base_dir: Base directory for storing tensor files
+        """
+        self.base_dir = base_dir
+        if base_dir == "":
+            logger.debug(f"Initialized file manager without a base directory")
+        else:
+            os.makedirs(base_dir, exist_ok=True)
+            logger.debug(f"Initialized file manager with base directory: {base_dir}")
+
+    def get_file_path(self, key: str) -> str:
+        """Get full file path for a given key."""
+        return os.path.join(self.base_dir, key)
+
+    def create_file(self, file_path: str) -> bool:
+        """Create a file if it doesn't exist."""
+        try:
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            if not os.path.exists(file_path):
+                with open(file_path, "wb") as f:
+                    pass  # Create empty file
+            return True
+        except Exception as e:
+            logger.error(f"Failed to create file {file_path}: {e}")
+            return False
+
+    def open_file(self, file_path: str) -> Optional[int]:
+        """Open a file and return its file descriptor."""
+        try:
+            fd = os.open(file_path, os.O_RDWR)
+            return fd
+        except Exception as e:
+            logger.error(f"Failed to open file {file_path}: {e}")
+            return None
+
+    def close_file(self, fd: int) -> bool:
+        """Close a file descriptor."""
+        try:
+            os.close(fd)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to close file descriptor {fd}: {e}")
+            return False
+
+    def files_to_nixl_tuples(
+        self, file_paths: List[str]
+    ) -> List[Tuple[int, int, int, str]]:
+        """Create NIXL tuples (offset, length, fd, file_path) for given files."""
+        tuples = []
+        for path in file_paths:
+            if (fd := self.open_file(path)) is None:
+                # Clean up on failure
+                for t in tuples:
+                    self.close_file(t[2])
+                return []
+            tuples.append((0, 0, fd, path))
+        return tuples
diff --git a/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py b/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py
new file mode 100755
index 000000000..951e5a4ea
--- /dev/null
+++ b/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+
+import os
+import unittest
+from typing import List, Optional
+from unittest.mock import MagicMock
+
+import torch
+
+from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
+from sglang.srt.mem_cache.storage.nixl.nixl_utils import (
+    NixlFileManager,
+    NixlRegistration,
+)
+
+
+class TestNixlUnified(unittest.TestCase):
+    """Unified test suite for all NIXL components."""
+
+    def setUp(self):
+        """Set up test environment."""
+        # Create test directories
+        self.test_dir = "/tmp/test_nixl_unified"
+        os.makedirs(self.test_dir, exist_ok=True)
+
+        # Mock NIXL agent for registration tests
+        self.mock_agent = MagicMock()
+        self.mock_agent.get_reg_descs.return_value = "mock_reg_descs"
+        self.mock_agent.register_memory.return_value = "mock_registered_memory"
+
+        # Create instances
+        self.file_manager = NixlFileManager(self.test_dir)
+        self.registration = NixlRegistration(self.mock_agent)
+        try:
+            self.hicache = HiCacheNixl(file_path=self.test_dir, plugin="POSIX")
+        except ImportError:
+            self.skipTest("NIXL not available, skipping NIXL storage tests")
+
+    def tearDown(self):
+        """Clean up test directories."""
+        if os.path.exists(self.test_dir):
+            import shutil
+
+            shutil.rmtree(self.test_dir)
+
+    def delete_test_file(self, file_path: str) -> bool:
+        """Helper method to delete a test file.
+
+        Args:
+            file_path: Path to the file to delete
+
+        Returns:
+            bool: True if file was deleted or didn't exist, False on error
+        """
+        try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+            return True
+        except Exception as e:
+            return False
+
+    def verify_tensors_equal(self, expected: torch.Tensor, actual: torch.Tensor):
+        """Helper to verify tensor equality."""
+        self.assertIsNotNone(actual, "Retrieved tensor is None")
+        self.assertTrue(
+            torch.allclose(expected, actual, atol=1e-6),
+            f"Tensors not equal:\nExpected: {expected}\nActual: {actual}",
+        )
+
+    def verify_tensor_lists_equal(
+        self, expected: List[torch.Tensor], actual: List[torch.Tensor]
+    ):
+        """Helper to verify lists of tensors are equal."""
+        self.assertEqual(len(expected), len(actual), "Lists have different lengths")
+        for exp, act in zip(expected, actual):
+            self.verify_tensors_equal(exp, act)
+
+    # ============================================================================
+    # HiCache Integration Tests
+    # ============================================================================
+
+    def test_single_set_get(self):
+        """Test single tensor set/get operations."""
+        key = "test_key"
+        value = torch.randn(10, 10, device="cpu")
+        dst_tensor = torch.zeros_like(value, device="cpu")
+
+        # Test set
+        self.assertTrue(self.hicache.set(key, value))
+        self.assertTrue(self.hicache.exists(key))
+
+        # Test get
+        retrieved = self.hicache.get(key, dst_tensor)
+        self.verify_tensors_equal(value, dst_tensor)
+        self.verify_tensors_equal(value, retrieved)
+
+        # Same test in addr,len mode with another key and dst_tensor
+        key2 = "test_key2"
+        dst_tensor2 = torch.zeros_like(value, device="cpu")
+        src_addr, src_len = value.data_ptr(), value.numel() * value.element_size()
+        dst_addr, dst_len = (
+            dst_tensor2.data_ptr(),
+            dst_tensor2.numel() * dst_tensor2.element_size(),
+        )
+
+        # Test set
+        self.assertTrue(self.hicache.set(key, None, src_addr, src_len))
+        self.assertTrue(self.hicache.exists(key))
+
+        # Test get
+        retrieved2 = self.hicache.get(key, dst_addr, dst_len)
+        self.assertTrue(retrieved2 == None)
+        self.verify_tensors_equal(value, dst_tensor2)
+
+    def test_batch_set_get(self):
+        """Test batch tensor set/get operations."""
+        keys = ["key1", "key2", "key3"]
+        values = [
+            torch.randn(5, 5, device="cpu"),
+            torch.randn(3, 3, device="cpu"),
+            torch.randn(7, 7, device="cpu"),
+        ]
+        dst_tensors = [torch.zeros_like(v, device="cpu") for v in values]
+
+        # Test batch set
+        self.assertTrue(self.hicache.batch_set(keys, values))
+        self.assertTrue(all(self.hicache.exists(key) for key in keys))
+
+        # Test batch get
+        retrieved = self.hicache.batch_get(keys, dst_tensors)
+        self.verify_tensor_lists_equal(values, retrieved)
+
+        # Same test in addr,len mode with another key and dst_tensor
+        keys2 = ["key4", "key5", "key6"]
+        dst_tensors2 = [torch.zeros_like(v, device="cpu") for v in values]
+        src_addrs = [v.data_ptr() for v in values]
+        src_lens = [v.numel() * v.element_size() for v in values]
+        dst_addrs = [dt.data_ptr() for dt in dst_tensors2]
+        dst_lens = [dt.numel() * dt.element_size() for dt in dst_tensors2]
+
+        # Test batch set
+        self.assertTrue(self.hicache.batch_set(keys2, None, src_addrs, src_lens))
+        self.assertTrue(all(self.hicache.exists(key) for key in keys2))
+
+        # Test batch get
+        retrieved2 = self.hicache.batch_get(keys, dst_addrs, dst_lens)
+        self.assertTrue(all(ret == None for ret in retrieved2))
+        self.verify_tensor_lists_equal(values, dst_tensors2)
+
+    def test_mixed_operations(self):
+        """Test mixing single and batch operations."""
+        # Test interleaved set/get operations
+        key1, key2 = "key1", "key2"
+        value1 = torch.randn(4, 4, device="cpu")
+        value2 = torch.randn(6, 6, device="cpu")
+        dst1 = torch.zeros_like(value1)
+        dst2 = torch.zeros_like(value2)
+
+        # Single set/get
+        self.assertTrue(self.hicache.set(key1, value1))
+        retrieved1 = self.hicache.get(key1, dst1)
+        self.verify_tensors_equal(value1, retrieved1)
+
+        # Batch set/get
+        self.assertTrue(self.hicache.batch_set([key2], [value2]))
+        retrieved2 = self.hicache.batch_get([key2], [dst2])
+        self.verify_tensors_equal(value2, retrieved2[0])
+
+    def test_data_integrity(self):
+        """Test data integrity across operations."""
+        # Test with various tensor types and sizes
+        test_cases = [
+            ("float32", torch.randn(10, 10, dtype=torch.float32)),
+            ("float64", torch.randn(5, 5, dtype=torch.float64)),
+            ("int32", torch.randint(-100, 100, (8, 8), dtype=torch.int32)),
+            ("int64", torch.randint(-100, 100, (6, 6), dtype=torch.int64)),
+            ("bool", torch.randint(0, 2, (4, 4)).bool()),
+        ]
+
+        for name, tensor in test_cases:
+            with self.subTest(tensor_type=name):
+                key = f"test_{name}"
+                dst_tensor = torch.zeros_like(tensor)
+
+                # Set and immediately get
+                self.assertTrue(self.hicache.set(key, tensor))
+                retrieved1 = self.hicache.get(key, dst_tensor)
+                self.verify_tensors_equal(tensor, retrieved1)
+
+                # Get again to verify persistence
+                dst_tensor.zero_()
+                retrieved2 = self.hicache.get(key, dst_tensor)
+                self.verify_tensors_equal(tensor, retrieved2)
+
+    def test_basic_file_operations(self):
+        """Test basic file operations."""
+        test_file = os.path.join(self.test_dir, "test_file.bin")
+        self.file_manager.create_file(test_file)
+        self.assertTrue(os.path.exists(test_file))
+        self.assertEqual(os.path.getsize(test_file), 0)  # Empty file
+
+        # Test file deletion
+        self.assertTrue(self.delete_test_file(test_file))
+        self.assertFalse(os.path.exists(test_file))
+
+    def test_create_nixl_tuples(self):
+        """Test creation of NIXL tuples."""
+        test_file = os.path.join(self.test_dir, "test_file.bin")
+        self.file_manager.create_file(test_file)
+
+        # Test tuple creation
+        tuples = self.file_manager.files_to_nixl_tuples([test_file])
+        self.assertIsNotNone(tuples)
+        self.assertTrue(len(tuples) > 0)
+
+    def test_error_handling(self):
+        """Test error handling in file operations."""
+        # Test non-existent file
+        self.assertTrue(
+            self.delete_test_file("nonexistent_file.bin")
+        )  # Returns True if file doesn't exist
+
+        # Test invalid file path
+        self.assertFalse(self.file_manager.create_file(""))  # Empty path should fail
+
+    def test_register_buffers(self):
+        """Test registration of memory buffers."""
+        # Create test tensor
+        tensor = torch.randn(10, 10)
+
+        # Test buffer registration
+        self.assertIsNotNone(self.hicache.register_buffers(tensor))
+
+        # Test batch registration
+        tensors = [torch.randn(5, 5) for _ in range(3)]
+        self.assertIsNotNone(self.hicache.register_buffers(tensors))
+
+    def test_register_files_with_tuples(self):
+        """Test registration of files using NIXL tuples."""
+        files = [os.path.join(self.test_dir, f"test_file_{i}.bin") for i in range(3)]
+        for file in files:
+            self.file_manager.create_file(file)
+
+        # Create tuples and register
+        tuples = self.file_manager.files_to_nixl_tuples(files)
+        self.hicache.register_files(tuples)
+
+        # Verify tuples
+        self.assertEqual(len(tuples), len(files))
+        for t, f in zip(tuples, files):
+            self.assertEqual(t[3], f)  # Check file path
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/sglang/srt/mem_cache/swa_radix_cache.py b/python/sglang/srt/mem_cache/swa_radix_cache.py
new file mode 100644
index 000000000..686fc6ab0
--- /dev/null
+++ b/python/sglang/srt/mem_cache/swa_radix_cache.py
@@ -0,0 +1,1023 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+The radix tree data structure for managing the hybrid (full and SWA) KV cache.
+"""
+
+import heapq
+import time
+from collections import defaultdict
+from functools import partial
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class TreeNode:
+
+    counter = 0
+    swa_uuid_counter = 1
+
+    def __init__(self, id: Optional[int] = None):
+        self.children = defaultdict(TreeNode)
+        self.parent: TreeNode = None
+        self.key: List[int] = None
+        self.value: Optional[torch.Tensor] = None
+        # swa_tombstone is used to indicate the kv indices have been freed for swa layers
+        self.swa_tombstone = False
+        # invariant: for any node, if swa_lock_ref is locked, full_lock_ref must be locked;
+        # if full_lock_ref is locked, swa_lock_ref doesn't need to be locked. So,
+        # full_lock_ref is always >= swa_lock_ref.
+        self.full_lock_ref = 0
+        self.swa_lock_ref = 0
+        # last access time is only used for sanity check. LRU is maintained by the lru list.
+        self.last_access_time = time.monotonic()
+
+        self.hit_count = 0
+        # store the host indices of KV cache
+        self.host_value = None
+
+        # for lru list, invariant:
+        # 1. prev has greater last_access_time
+        # 2. next has smaller last_access_time
+        self.prev = None
+        self.next = None
+        self.swa_prev = None
+        self.swa_next = None
+
+        self.id = TreeNode.counter if id is None else id
+        TreeNode.counter += 1
+        self.swa_uuid = None
+
+    @property
+    def evicted(self):
+        return self.value is None
+
+    @property
+    def backuped(self):
+        return self.host_value is not None
+
+    def __lt__(self, other: "TreeNode"):
+        return self.last_access_time < other.last_access_time
+
+
+def _key_match_page_size1(key0: List, key1: List):
+    i = 0
+    for k0, k1 in zip(key0, key1):
+        if k0 != k1:
+            break
+        i += 1
+    return i
+
+
+def _key_match_paged(key0: List, key1: List, page_size: int):
+    min_len = min(len(key0), len(key1))
+
+    i = 0
+    while i < min_len:
+        if key0[i : i + page_size] != key1[i : i + page_size]:
+            break
+        i += page_size
+
+    return i
+
+
+def gen_swa_uuid() -> int:
+    TreeNode.swa_uuid_counter += 1
+    return TreeNode.swa_uuid_counter
+
+
+class LRUList:
+    def __init__(self, swa: bool = False):
+        self.swa = swa
+        if self.swa:
+            self.prv = "swa_prev"
+            self.nxt = "swa_next"
+            self.lock_ref = "swa_lock_ref"
+        else:
+            self.prv = "prev"
+            self.nxt = "next"
+            self.lock_ref = "full_lock_ref"
+        # Initialize dummy head and tail nodes
+        self.head = TreeNode()  # Most recently used side
+        self.tail = TreeNode()  # Least recently used side
+        setattr(self.head, self.nxt, self.tail)  # self.head.next = self.tail
+        setattr(self.tail, self.prv, self.head)  # self.tail.prev = self.head
+        self.cache = {}
+
+    def _add_node(self, node):
+        """Helper to add node right after head (most recently used)"""
+        self._add_node_after(self.head, node)
+
+    def _add_node_after(self, old_node, new_node):
+        """Helper to add node right after old_node"""
+        setattr(new_node, self.prv, old_node)  # new_node.prev = old_node
+        setattr(
+            new_node, self.nxt, getattr(old_node, self.nxt)
+        )  # new_node.next = old_node.next
+        setattr(
+            getattr(old_node, self.nxt), self.prv, new_node
+        )  # old_node.next.prev = new_node
+        setattr(old_node, self.nxt, new_node)  # old_node.next = new_node
+
+    def _remove_node(self, node):
+        """Helper to remove node from linked list"""
+        setattr(
+            getattr(node, self.prv), self.nxt, getattr(node, self.nxt)
+        )  # node.prev.next = node.next
+        setattr(
+            getattr(node, self.nxt), self.prv, getattr(node, self.prv)
+        )  # node.next.prev = node.prev
+
+    def _get_lru(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used node
+        """
+        if len(self.cache) == 0:
+            return None
+        return getattr(self.tail, self.prv)
+
+    def reset_node_mru(self, node):
+        """
+        Move a (existing) node to most recently used position
+        """
+        assert node.id in self.cache, f"Resetting node {node.id=} not in lru list"
+        assert (
+            not self.swa or not node.swa_tombstone
+        ), f"Resetting swa tombstone node in swa lru list: {node.id=}"
+        self._remove_node(node)
+        self._add_node(node)
+
+    def reset_node_and_parents_mru(self, node, root_node):
+        """
+        Move an (existing) node and its parents to most recently used position. Child node is
+        more recently used than parent node.
+        """
+        prev_node = self.head
+        while node != root_node:
+            # for swa lru list, only reset non-tombstone nodes
+            if not self.swa or not node.swa_tombstone:
+                assert (
+                    node.id in self.cache
+                ), f"Resetting node {node.id=} not in lru list when resetting node and parents mru"
+                self._remove_node(node)
+                self._add_node_after(prev_node, node)
+                prev_node = node
+            node = node.parent
+
+    def insert_mru(self, node):
+        """
+        Insert a (new) node as most recently used
+        """
+        assert (
+            not self.swa or not node.swa_tombstone
+        ), f"Inserting swa tombstone node in swa lru list: {node.id=}"
+        assert (
+            node.id not in self.cache
+        ), f"Inserting node {node.id=} already in lru list, existing node: {self.cache[node.id].id=}"
+        self.cache[node.id] = node
+        self._add_node(node)
+
+    def remove_node(self, node: TreeNode):
+        """
+        Remove node from lru list
+        """
+        assert node.id in self.cache, f"Removing node {node.id=} not in lru list"
+        assert (
+            not self.swa or not node.swa_tombstone
+        ), f"Removing swa tombstone node from swa lru list: {node.id=}"
+        del self.cache[node.id]
+        self._remove_node(node)
+
+    def get_lru_no_lock(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used node that is not locked
+        """
+        return self.get_prev_no_lock(self.tail, check_id=False)
+
+    def get_leaf_lru_no_lock(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used leaf node that is not locked
+        """
+        return self.get_prev_leaf_no_lock(self.tail, check_id=False)
+
+    def get_prev_no_lock(
+        self, node: TreeNode, check_id: bool = True
+    ) -> Optional[TreeNode]:
+        """
+        Get the previous (i.e. more recently used) node that is not locked
+        """
+        if check_id:
+            assert (
+                node.id in self.cache
+            ), f"Getting prev of node {node.id=} not in lru list"
+        x = getattr(node, self.prv)  # x = node.prev
+        while getattr(x, self.lock_ref) > 0:
+            x = getattr(x, self.prv)  # x = x.prev
+        # if x is the head, it means there is no node in the lru list without lock
+        if x == self.head:
+            return None
+        return x
+
+    def get_prev_leaf_no_lock(self, node: TreeNode, check_id: bool = True):
+        """
+        Get the previous (i.e. more recently used) leaf node that is not locked
+        """
+        if check_id:
+            assert (
+                node.id in self.cache
+            ), f"Getting prev of node {node.id=} not in lru list"
+        x = getattr(node, self.prv)  # x = node.prev
+        while getattr(x, self.lock_ref) > 0 or len(x.children) > 0:
+            x = getattr(x, self.prv)  # x = x.prev
+        # if x is the head, it means there is no leaf node in the lru list without lock
+        if x == self.head:
+            return None
+        return x
+
+    def in_list(self, node: Optional[TreeNode]):
+        """
+        Check if the node is in the lru list
+        """
+        if not node:
+            return False
+        return node.id in self.cache
+
+    # Note: this is expensive, only use for debug
+    def sanity_check_evictable_size(self):
+        """
+        Check the evictable size (i.e. the size of the nodes that are not locked)
+        """
+        node = self.get_lru_no_lock()
+        evictable_size = 0
+        while self.in_list(node):
+            evictable_size += len(node.value)
+            node = self.get_prev_no_lock(node)
+        return evictable_size
+
+    # Note: this is expensive, only use for debug or idle check
+    def sanity_check(self, tree_cache: "SWARadixCache"):
+        """
+        Check if the lru list is valid by rebuilding the lru list from the tree, heapifying it, and
+        checking if the lru list is valid.
+        """
+        try:
+            if self.swa:
+                nodes = tree_cache._collect_nontombstone_nodes()
+            else:
+                nodes = tree_cache._collect_all_nodes()
+            total_nodes = len(nodes)
+            total_lru_plus_1 = len(self.cache) + 1
+            # heapify based on last_access_time
+            heapq.heapify(nodes)
+            # the root node is not in the lru list
+            assert (
+                len(nodes) == len(self.cache) + 1
+            ), f"len(nodes): {len(nodes)} != len(self.cache) + 1: {len(self.cache) + 1}"
+
+            x_lru = self._get_lru()
+            while len(nodes):
+                x = heapq.heappop(nodes)
+                if x == tree_cache.root_node:
+                    # root node is not in the lru list
+                    continue
+                assert (
+                    x == x_lru
+                ), f"Incorrect LRU list, {self.swa=}, x: {x.id=} != x_lru: {x_lru.id=}"
+                assert (
+                    x_lru.full_lock_ref == 0
+                ), f"x_lru should not be locked when idle, {x_lru.full_lock_ref=}, {x_lru.swa_uuid=}, {x_lru.id=}"
+                assert (
+                    x_lru.swa_lock_ref == 0
+                ), f"x_lru should not be locked when idle, {x_lru.swa_lock_ref=}, {x_lru.swa_uuid=}, {x_lru.id=}"
+                x_lru = getattr(x, self.prv)
+
+            if self.swa:
+                evictable_size = tree_cache.swa_evictable_size()
+                lru_list_evictable_size = tree_cache.swa_lru_list_evictable_size()
+            else:
+                evictable_size = tree_cache.full_evictable_size()
+                lru_list_evictable_size = tree_cache.full_lru_list_evictable_size()
+
+            assert (
+                evictable_size == lru_list_evictable_size
+            ), f"{self.swa=}, total nodes: {total_nodes}, total lru plus 1: {total_lru_plus_1}, evictable size: {evictable_size} != lru list evictable size: {lru_list_evictable_size}"
+        except Exception as e:
+            msg = f"SWA Radix tree sanity check failed, ping @hanming-lu: {e}"
+            logger.error(msg)
+            raise Exception(msg)
+
+
+class SWARadixCache(BasePrefixCache):
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: SWATokenToKVPoolAllocator,
+        sliding_window_size: int,
+        page_size: int,
+        disable: bool = False,
+    ):
+        assert isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator)
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.page_size = page_size
+        self.disable = disable
+
+        if self.token_to_kv_pool_allocator:
+            self.device = self.token_to_kv_pool_allocator.device
+        else:
+            self.device = torch.device("cpu")
+
+        if self.page_size == 1:
+            self.key_match_fn = _key_match_page_size1
+            self.get_child_key_fn = lambda key: key[0]
+        else:
+            self.key_match_fn = partial(_key_match_paged, page_size=page_size)
+            self.get_child_key_fn = lambda key: tuple(key[:page_size])
+
+        self.sliding_window_size = sliding_window_size
+        self.reset()
+
+    ##### Public API #####
+
+    def reset(self) -> None:
+        self.root_node = TreeNode()
+        self.root_node.key = []
+        self.root_node.value = []
+        self.root_node.full_lock_ref = 1
+        self.root_node.swa_lock_ref = 1
+        self.full_evictable_size_ = 0
+        self.swa_evictable_size_ = 0
+        self.full_protected_size_ = 0
+        self.swa_protected_size_ = 0
+        # LRU lists are used to maintain the order of eviction of the nodes in the tree
+        self.full_lru_list = LRUList(swa=False)
+        self.swa_lru_list = LRUList(swa=True)
+
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
+        """Find the matching prefix from the radix tree.
+        Args:
+            key: A list of token IDs to find a matching prefix.
+        Returns:
+            A tuple of a tensor of matching prefix token IDs and
+            the last node that contains the prefix values. Note that
+            this API can modify the internal state of the Radix tree.
+            The last node create a new child if the prefix is shorter
+            than the last node's value.
+        """
+        if self.disable or len(key) == 0:
+            return MatchResult(
+                device_indices=torch.empty(
+                    (0,),
+                    dtype=torch.int64,
+                    device=self.device,
+                ),
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+            )
+
+        if self.page_size != 1:
+            page_aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:page_aligned_len]
+
+        value, last_node = self._match_prefix_helper(key)
+        if value:
+            value = torch.cat(value)
+        else:
+            value = torch.empty((0,), dtype=torch.int64, device=self.device)
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_node,
+        )
+
+    def insert(self, key: List, value=None, prev_prefix_len: int = 0) -> int:
+        if self.disable:
+            return 0
+
+        if value is None:
+            value = [x for x in key]
+        return self._insert_helper(self.root_node, key, value, prev_prefix_len)
+
+    def cache_finished_req(self, req: Req) -> None:
+        """Cache request when it finishes."""
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx,
+                : len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0),
+            ]
+            self.token_to_kv_pool_allocator.free(kv_indices)
+            self.req_to_token_pool.free(req.req_pool_idx)
+            return
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        if self.page_size != 1:
+            page_aligned_len = len(kv_indices) // self.page_size * self.page_size
+            page_aligned_kv_indices = kv_indices[:page_aligned_len].clone()
+            self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
+        else:
+            page_aligned_len = len(kv_indices)
+            page_aligned_kv_indices = kv_indices.clone()
+
+        # Radix Cache takes one ref in memory pool
+        # insert the token_ids and kv_indices into the radix tree
+        # Note: the insert function already frees the overlapped kv_indices
+        new_prefix_len = self.insert(
+            token_ids[:page_aligned_len],
+            page_aligned_kv_indices,
+            len(req.prefix_indices),
+        )
+
+        # Remove req slot release the cache lock
+        self.req_to_token_pool.free(req.req_pool_idx)
+        self.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
+
+    def cache_unfinished_req(self, req: Req, chunked=False) -> None:
+        """Cache request when it is unfinished."""
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, : len(req.fill_ids)
+            ]
+
+            # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+            req.prefix_indices = kv_indices
+            return
+
+        token_ids = req.fill_ids
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        if self.page_size != 1:
+            page_aligned_len = len(kv_indices) // self.page_size * self.page_size
+            page_aligned_kv_indices = kv_indices[:page_aligned_len].clone()
+        else:
+            page_aligned_len = len(kv_indices)
+            page_aligned_kv_indices = kv_indices.clone()
+        page_aligned_token_ids = token_ids[:page_aligned_len]
+
+        # Radix Cache takes one ref in memory pool
+        # Note: the insert function already frees the overlapped kv_indices
+        new_prefix_len = self.insert(
+            page_aligned_token_ids, page_aligned_kv_indices, len(req.prefix_indices)
+        )
+
+        # The prefix indices could be updated, reuse it
+        new_indices, new_last_node, _, _ = self.match_prefix(page_aligned_token_ids)
+        assert len(req.prefix_indices) <= len(
+            new_indices
+        ), f"{req.prefix_indices=}, {new_indices=}"
+        assert new_prefix_len <= len(new_indices), f"{new_prefix_len=}, {new_indices=}"
+        self.req_to_token_pool.write(
+            (req.req_pool_idx, slice(len(req.prefix_indices), len(new_indices))),
+            new_indices[len(req.prefix_indices) :],
+        )
+
+        self.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
+        swa_uuid_for_lock = self.inc_lock_ref(new_last_node)
+
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        if self.page_size != 1:
+            req.prefix_indices = torch.cat(
+                [new_indices, kv_indices[len(new_indices) :]]
+            )
+        else:
+            req.prefix_indices = new_indices
+        req.last_node = new_last_node
+        req.swa_uuid_for_lock = swa_uuid_for_lock
+
+    def pretty_print(self) -> None:
+        self._print_helper(self.root_node, 0)
+        total_size, total_swa_size = self._total_size_helper()
+        print(f"#full_tokens: {total_size}, #swa_tokens: {total_swa_size}")
+
+    def total_size(self) -> Tuple[int, int]:
+        return self._total_size_helper()
+
+    def evict(self, full_num_tokens: int, swa_num_tokens: int = 0) -> None:
+        if self.disable:
+            return
+
+        full_num_evicted = 0
+        swa_num_evicted = 0
+        if full_num_tokens > 0:
+            # get the least recently used leaf node that is not locked
+            x = self.full_lru_list.get_leaf_lru_no_lock()
+
+            while full_num_evicted < full_num_tokens and self.full_lru_list.in_list(x):
+                assert (
+                    x != self.root_node
+                ), f"root node should not exist in full lru list, {x.id=}"
+                assert x.full_lock_ref == 0, f"node is in use, {x.id=}"
+
+                # 1. free node kv indices, evict full and swa tokens
+                self.token_to_kv_pool_allocator.free(x.value)
+                full_num_evicted += len(x.value)
+                swa_num_evicted += len(x.value)
+
+                # 2. get the next leaf, update the lru lists
+                x_next = self.full_lru_list.get_prev_leaf_no_lock(x)
+                self.full_lru_list.remove_node(x)
+                self.swa_lru_list.remove_node(x)
+
+                # 3. delete the leaf node
+                self._delete_leaf(x)
+
+                # 4. Iteratively delete tombstone leaves to maintain invariant that leaf nodes are not tombstone
+                x, leaf_full_num_evicted = self._iteratively_delete_tombstone_leaf(x)
+                full_num_evicted += leaf_full_num_evicted
+
+                # 5. if parent has no more children, it is a leaf. It is possible that this node is lru, so
+                # we need to get the first leaf node in the lru list
+                if len(x.parent.children) == 0:
+                    x_next = self.full_lru_list.get_leaf_lru_no_lock()
+
+                x = x_next
+
+        if swa_num_evicted < swa_num_tokens:
+            # get the least recently used node that is not locked, doesn't have to be a leaf
+            x = self.swa_lru_list.get_lru_no_lock()
+
+            # evict lru leaf nodes until swa_num_tokens is reached
+            while swa_num_evicted < swa_num_tokens and (self.swa_lru_list.in_list(x)):
+                assert not x.swa_tombstone, f"duplicate swa tombstone node, {x.id=}"
+                assert x != self.root_node, f"root node is not evictable, {x.id=}"
+                assert x.swa_lock_ref == 0, f"node is in use by swa kv indices, {x.id=}"
+
+                if len(x.children) > 0:
+                    # 1. an internal node, free swa tokens.
+                    self.token_to_kv_pool_allocator.free_swa(x.value)
+                    swa_num_evicted += len(x.value)
+
+                    # 2. get the next node, update the lru lists
+                    x_next = self.swa_lru_list.get_prev_no_lock(x)
+                    self.swa_lru_list.remove_node(x)
+
+                    # 3. tombstone the node
+                    self._tombstone_internal_node(x)
+                else:
+                    assert (
+                        x.full_lock_ref == 0
+                    ), f"leaf node with full lock must also have swa lock, {x.id=}"
+                    # 1. a leaf node, free full and swa tokens
+                    self.token_to_kv_pool_allocator.free(x.value)
+                    full_num_evicted += len(x.value)
+                    swa_num_evicted += len(x.value)
+
+                    # 2. get the next node, update the lru lists
+                    x_next = self.swa_lru_list.get_prev_no_lock(x)
+                    self.full_lru_list.remove_node(x)
+                    self.swa_lru_list.remove_node(x)
+
+                    # 3. delete the leaf node
+                    self._delete_leaf(x)
+
+                    # 4. Iteratively delete tombstone leaves to maintain invariant that leaf nodes are not tombstone
+                    self._iteratively_delete_tombstone_leaf(x)
+
+                x = x_next
+
+    def inc_lock_ref(self, node: TreeNode) -> Optional[int]:
+        """
+        Increment the lock reference count for the node. Returns the swa_uuid_for_lock, which needs
+        to be passed to dec_lock_ref.
+        It locks the full_lock_ref for nodes between the [last node, root), exclusive.
+        It locks the swa_lock_ref for nodes between the [last node, swa_uuid_for_lock], inclusive.
+        """
+        if self.disable:
+            return None
+
+        swa_lock_size = 0
+        swa_uuid_for_lock = None
+        while node != self.root_node:
+            # lock full from node to root
+            assert (
+                node.full_lock_ref >= 0
+            ), f"inc_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
+            if node.full_lock_ref == 0:
+                self.full_evictable_size_ -= len(node.value)
+                self.full_protected_size_ += len(node.value)
+            node.full_lock_ref += 1
+
+            # lock swa if we have not reached the sliding window size.
+            # When we reach the sliding window size, we will set the swa_uuid_for_lock.
+            # caller needs to pass the swa_uuid_for_lock to dec_lock_ref
+            if swa_lock_size < self.sliding_window_size:
+                assert (
+                    not node.swa_tombstone
+                ), f"inc_lock_swa on swa_tombstone node, {node.id=}"
+                if node.swa_lock_ref == 0:
+                    self.swa_evictable_size_ -= len(node.value)
+                    self.swa_protected_size_ += len(node.value)
+                node.swa_lock_ref += 1
+                swa_lock_size += len(node.value)
+                if swa_lock_size >= self.sliding_window_size:
+                    if node.swa_uuid is None:
+                        node.swa_uuid = gen_swa_uuid()
+                    swa_uuid_for_lock = node.swa_uuid
+            node = node.parent
+        return swa_uuid_for_lock
+
+    def dec_lock_ref(self, node: TreeNode, swa_uuid_for_lock: Optional[int] = None):
+        """
+        Decrement the lock reference count for the node.
+        It unlocks the full_lock_ref for nodes between the [last node, root), exclusive.
+        It unlocks the swa_lock_ref for nodes between the [last node, swa_uuid_for_lock], inclusive.
+        If swa_uuid_for_lock is None, it unlocks to the root, exclusive.
+        """
+        if self.disable:
+            return
+
+        dec_lock_swa = True
+        while node != self.root_node:
+            assert (
+                node.full_lock_ref > 0
+            ), f"dec_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
+            if node.full_lock_ref == 1:
+                self.full_evictable_size_ += len(node.value)
+                self.full_protected_size_ -= len(node.value)
+            node.full_lock_ref -= 1
+
+            if dec_lock_swa:
+                assert (
+                    not node.swa_tombstone
+                ), f"dec_lock_ref on swa_tombstone node, {node.id=}"
+                assert (
+                    node.swa_lock_ref > 0
+                ), f"dec_lock_ref on node with {node.swa_lock_ref=}, {node.id=}"
+
+                if node.swa_lock_ref == 1:
+                    self.swa_evictable_size_ += len(node.value)
+                    self.swa_protected_size_ -= len(node.value)
+                node.swa_lock_ref -= 1
+                if swa_uuid_for_lock and node.swa_uuid == swa_uuid_for_lock:
+                    dec_lock_swa = False
+
+            node = node.parent
+
+    def sanity_check(self):
+        self.full_lru_list.sanity_check(self)
+        self.swa_lru_list.sanity_check(self)
+
+    def evictable_size(self) -> Tuple[int, int]:
+        # Note: use full_evictable_size() and swa_evictable_size() instead.
+        raise NotImplementedError
+
+    def full_evictable_size(self) -> int:
+        return self.full_evictable_size_
+
+    def swa_evictable_size(self) -> int:
+        return self.swa_evictable_size_
+
+    # Note: this is expensive, only use for debug
+    def full_lru_list_evictable_size(self) -> int:
+        return self.full_lru_list.sanity_check_evictable_size()
+
+    # Note: this is expensive, only use for debug
+    def swa_lru_list_evictable_size(self) -> int:
+        return self.swa_lru_list.sanity_check_evictable_size()
+
+    def protected_size(self) -> Tuple[int, int]:
+        # Note: use full_protected_size() and swa_protected_size() instead.
+        raise NotImplementedError
+
+    def full_protected_size(self) -> int:
+        # protected size refers to the size of the full cache that is locked
+        return self.full_protected_size_
+
+    def swa_protected_size(self) -> int:
+        # protected size refers to the size of the swa cache that is locked
+        return self.swa_protected_size_
+
+    def all_values_flatten(self) -> torch.Tensor:
+        values = []
+
+        def _dfs_helper(node: TreeNode):
+            for _, child in node.children.items():
+                values.append(child.value)
+                _dfs_helper(child)
+
+        _dfs_helper(self.root_node)
+        return torch.cat(values)
+
+    ##### Internal Helper Functions #####
+
+    def _match_prefix_helper(self, key: List) -> Tuple[List[torch.Tensor], TreeNode]:
+        """
+        SWA prefix matching helper. It factors in the sliding window size such that
+        the matched node is guaranteed to either 1. connected to root without swa tombstone,
+        or 2. the number of matching tokens from the matched node to the last swa tombstone
+        node is greater than or equal to the sliding window size.
+        """
+        node = self.root_node
+        child_key = self.get_child_key_fn(key)
+
+        value = []
+        # for path connected to root without tombstone, always match, so set to inf
+        match_len_since_tombstone = float("inf")
+        best_value_len = 0
+        best_last_node = node
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+
+            # update best_value_len and best_last_node if needed
+            if (
+                child.swa_tombstone
+                and match_len_since_tombstone >= self.sliding_window_size
+            ):
+                best_value_len = len(value)
+                best_last_node = node
+                match_len_since_tombstone = 0
+
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                value.append(new_node.value)
+                if not new_node.swa_tombstone:
+                    match_len_since_tombstone += len(new_node.value)
+                node = new_node
+                break
+            else:
+                value.append(child.value)
+                if not child.swa_tombstone:
+                    match_len_since_tombstone += len(child.value)
+                node = child
+                key = key[prefix_len:]
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+
+        # handle best_value_len and best_last_node, for the case that last node is fully matched
+        if match_len_since_tombstone >= self.sliding_window_size:
+            best_value_len = len(value)
+            best_last_node = node
+
+        # update time for matched nodes, and make nodes closer to root to be least recently used
+        # this allows swa to evict nodes closer to root first
+        self.full_lru_list.reset_node_and_parents_mru(best_last_node, self.root_node)
+        self.swa_lru_list.reset_node_and_parents_mru(best_last_node, self.root_node)
+
+        # This last_access_time is for sanity check, can be deleted after validation in production
+        cur_time = time.monotonic()
+        while node:
+            node.last_access_time = cur_time
+            cur_time -= 0.0001
+            node = node.parent
+
+        return value[:best_value_len], best_last_node
+
+    def _split_node(self, key: List[int], child: TreeNode, split_len: int) -> TreeNode:
+        # new_node -> child
+        new_node = TreeNode()
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.swa_tombstone = child.swa_tombstone
+        new_node.full_lock_ref = child.full_lock_ref
+        new_node.swa_lock_ref = child.swa_lock_ref
+        new_node.key = child.key[:split_len]
+        new_node.value = child.value[:split_len]
+        # parent inherits the swa_uuid from child for swa lock ref
+        new_node.swa_uuid = child.swa_uuid
+        child.swa_uuid = None
+        # child time should be later than parent's time for swa tombstone
+        child.last_access_time = time.monotonic()
+
+        # remove the child from the lru lists because it is being split
+        self.full_lru_list.remove_node(child)
+        if not new_node.swa_tombstone:
+            self.swa_lru_list.remove_node(child)
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        child.value = child.value[split_len:]
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        # insert the new node and child into the lru lists, insert
+        # parent first so that parent is after child in the lru list
+        self.full_lru_list.insert_mru(new_node)
+        self.full_lru_list.insert_mru(child)
+        if not new_node.swa_tombstone:
+            self.swa_lru_list.insert_mru(new_node)
+            self.swa_lru_list.insert_mru(child)
+        return new_node
+
+    def _insert_helper(
+        self, node: TreeNode, key: List, value, update_kv_after_len: int
+    ) -> int:
+        # Update the last access time from root to leaf, so that
+        # swa will tombstone the node closer to root first
+        node.last_access_time = time.monotonic()
+        if node != self.root_node:
+            self.full_lru_list.reset_node_mru(node)
+            if not node.swa_tombstone:
+                self.swa_lru_list.reset_node_mru(node)
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        total_prefix_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            self.full_lru_list.reset_node_mru(node)
+            if not node.swa_tombstone:
+                self.swa_lru_list.reset_node_mru(node)
+            prefix_len = self.key_match_fn(node.key, key)
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            # if tombstone after update_kv_after_len, update node.value to be the input value.
+            # This is needed because it is possible that the last sliding window size tokens
+            # contains tombstone. If this is the case and we don't update the kv value, then
+            # the prefill prefix matching will stuck.
+            if update_kv_after_len < total_prefix_length + prefix_len:
+                first_diff_idx = max(0, update_kv_after_len - total_prefix_length)
+                if node.swa_tombstone:
+                    assert (
+                        node.swa_lock_ref == 0
+                    ), f"tombstone swa_lock_ref should always be 0, {node.full_lock_ref=}, {node.swa_lock_ref=}, {node.id=}"
+                    self.token_to_kv_pool_allocator.free(node.value[first_diff_idx:])
+                    node.value = value[:prefix_len]
+                    node.swa_tombstone = False
+
+                    # insert the node into the lru lists
+                    self.swa_lru_list.insert_mru(node)
+
+                    self.swa_evictable_size_ += len(node.value)
+                else:
+                    self.token_to_kv_pool_allocator.free(
+                        value[first_diff_idx:prefix_len]
+                    )
+
+            total_prefix_length += prefix_len
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value
+            self.full_lru_list.insert_mru(new_node)
+            self.swa_lru_list.insert_mru(new_node)
+            node.children[child_key] = new_node
+            self.full_evictable_size_ += len(value)
+            self.swa_evictable_size_ += len(value)
+        return total_prefix_length
+
+    def _iteratively_delete_tombstone_leaf(
+        self, node: TreeNode
+    ) -> Tuple[TreeNode, int]:
+        full_num_evicted = 0
+        while node.parent.swa_tombstone and len(node.parent.children) == 0:
+            # root node is not evictable
+            if node.parent == self.root_node:
+                break
+            # if locked, means node is in use, skip
+            if node.parent.full_lock_ref > 0:
+                break
+            assert (
+                node.parent.swa_lock_ref == 0
+            ), f"tombstone swa_lock_ref should always be 0, {node.parent.full_lock_ref=}, {node.parent.swa_lock_ref=}, {node.parent.id=}"
+            # delete tombstone node evicts full tokens
+            self.token_to_kv_pool_allocator.free(node.parent.value)
+            full_num_evicted += len(node.parent.value)
+            self.full_lru_list.remove_node(node.parent)
+            self._delete_tombstone_leaf(node.parent)
+            node = node.parent
+
+        return node, full_num_evicted
+
+    def _delete_leaf(self, node: TreeNode) -> None:
+        assert (
+            not node.swa_tombstone
+        ), f"Invariant violated: leaf node is a tombstone, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        for k, v in node.parent.children.items():
+            if v == node:
+                break
+        del node.parent.children[k]
+        self.full_evictable_size_ -= len(node.key)
+        self.swa_evictable_size_ -= len(node.key)
+
+    def _tombstone_internal_node(self, node: TreeNode) -> None:
+        assert len(node.children) != 0, f"Cannot tombstone a leaf node, {node.id=}"
+        node.swa_tombstone = True
+        self.swa_evictable_size_ -= len(node.key)
+
+    def _delete_tombstone_leaf(self, node: TreeNode) -> None:
+        assert (
+            node.swa_tombstone
+        ), f"Deleting a unexpected non-tombstone leaf node, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        for k, v in node.parent.children.items():
+            if v == node:
+                break
+        del node.parent.children[k]
+        self.full_evictable_size_ -= len(node.key)
+
+    def _collect_leaves(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if len(cur_node.children) == 0:
+                ret_list.append(cur_node)
+            else:
+                stack.extend(cur_node.children.values())
+
+        return ret_list
+
+    def _collect_nontombstone_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if not cur_node.swa_tombstone:
+                ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+
+        return ret_list
+
+    def _collect_all_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+        while stack:
+            cur_node = stack.pop()
+            ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+        return ret_list
+
+    def _print_helper(self, node: TreeNode, indent: int) -> None:
+        """Prints the radix tree in a human-readable format."""
+        stack = [(node, indent)]
+        while stack:
+            current_node, current_indent = stack.pop()
+            print(
+                " " * current_indent,
+                current_node.id,
+                len(current_node.key),
+                f"fr={current_node.full_lock_ref}",
+                f"sr={current_node.swa_lock_ref}",
+                f"fll={self.full_lru_list.in_list(current_node)}",
+                f"sll={self.swa_lru_list.in_list(current_node)}",
+                f"ts={current_node.swa_tombstone}",
+            )
+            for key, child in current_node.children.items():
+                stack.append((child, current_indent + 2))
+
+                assert key == self.get_child_key_fn(
+                    child.key
+                ), f"{key=}, {self.get_child_key_fn(child.key)=}"
+
+    def _total_size_helper(self) -> Tuple[int, int]:
+        total_size = 0
+        total_swa_size = 0
+        stack = [self.root_node]
+        while stack:
+            current_node = stack.pop()
+            total_size += len(current_node.value)
+            if not current_node.swa_tombstone:
+                total_swa_size += len(current_node.value)
+            for child in current_node.children.values():
+                if child.evicted:
+                    continue
+                stack.append(child)
+        return total_size, total_swa_size
diff --git a/python/sglang/srt/metrics/collector.py b/python/sglang/srt/metrics/collector.py
new file mode 100644
index 000000000..7cbcb6949
--- /dev/null
+++ b/python/sglang/srt/metrics/collector.py
@@ -0,0 +1,974 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for Prometheus Metrics Collection."""
+
+import time
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.metrics.utils import generate_buckets
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import get_bool_env_var
+
+SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
+
+
+@dataclass
+class TimeStats:
+    """
+    Store the timestamps for each stage of a request.
+
+    Unified: wait_queue -> forward -> completion
+    Prefill: bootstrap_queue -> wait_queue -> forward -> transfer_queue -> completion
+    Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
+    """
+
+    lb_entry_time: float = 0.0
+    wait_queue_entry_time: float = 0.0
+    forward_entry_time: float = 0.0
+    completion_time: float = 0.0
+    prefill_bootstrap_queue_entry_time: float = 0.0
+    prefill_transfer_queue_entry_time: float = 0.0
+    decode_prealloc_queue_entry_time: float = 0.0
+    decode_transfer_queue_entry_time: float = 0.0
+
+    class RequestType(Enum):
+        UNIFIED = "unified"
+        PREFILL = "prefill"
+        DECODE = "decode"
+        INVALID = "invalid"
+
+    def get_queueing_time(self) -> float:
+        return self.forward_entry_time - self.wait_queue_entry_time
+
+    def __str__(self) -> str:
+        # if unified
+        _type = self.get_type()
+
+        if _type == self.RequestType.UNIFIED:
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    queue_duration >= 0 and forward_duration >= 0
+                ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+
+            return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
+        elif _type == self.RequestType.PREFILL:
+            bootstrap_duration = (
+                self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
+            )
+
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    bootstrap_duration >= 0
+                    and queue_duration >= 0
+                    and forward_duration >= 0
+                ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+            return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
+        # if decode
+        elif _type == self.RequestType.DECODE:
+            prealloc_duration = (
+                self.decode_transfer_queue_entry_time
+                - self.decode_prealloc_queue_entry_time
+            )
+
+            transfer_duration = (
+                self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
+            )
+            queue_duration = self.forward_entry_time - self.wait_queue_entry_time
+            forward_duration = self.completion_time - self.forward_entry_time
+
+            if SGLANG_TEST_REQUEST_TIME_STATS:
+                assert (
+                    prealloc_duration >= 0
+                    and transfer_duration >= 0
+                    and queue_duration >= 0
+                    and forward_duration >= 0
+                ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
+
+            return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
+        else:
+            return "Invalid Time Stats"
+
+    def format_duration(self, duration: float) -> str:
+        return f"{duration * 1e3:.2f}ms"
+
+    def get_type(self) -> RequestType:
+        """Determine the type of request based on timestamp values."""
+        if (
+            self.prefill_bootstrap_queue_entry_time == 0.0
+            and self.prefill_transfer_queue_entry_time == 0.0
+            and self.decode_prealloc_queue_entry_time == 0.0
+            and self.decode_transfer_queue_entry_time == 0.0
+        ):
+            return self.RequestType.UNIFIED
+        elif (
+            self.prefill_bootstrap_queue_entry_time > 0.0
+            and self.prefill_transfer_queue_entry_time > 0.0
+        ):
+            return self.RequestType.PREFILL
+        elif (
+            self.decode_prealloc_queue_entry_time > 0.0
+            and self.decode_transfer_queue_entry_time > 0.0
+            and self.wait_queue_entry_time > 0.0
+        ):
+            return self.RequestType.DECODE
+        else:
+            return self.RequestType.INVALID
+
+
+@dataclass
+class SchedulerStats:
+    # Basics
+    num_running_reqs: int = 0
+    num_used_tokens: int = 0
+    token_usage: float = 0.0
+    swa_token_usage: float = 0.0
+    gen_throughput: float = 0.0
+    num_queue_reqs: int = 0
+    num_grammar_queue_reqs: int = 0
+    num_running_reqs_offline_batch: int = 0
+    avg_request_queue_latency: float = 0.0
+    cache_hit_rate: float = 0.0
+
+    # Speculative decoding
+    spec_accept_length: float = 0.0
+
+    # PD disaggregation
+    num_prefill_prealloc_queue_reqs: int = 0
+    num_prefill_inflight_queue_reqs: int = 0
+    num_decode_prealloc_queue_reqs: int = 0
+    num_decode_transfer_queue_reqs: int = 0
+    kv_transfer_speed_gb_s: float = 0.0
+    kv_transfer_latency_ms: float = 0.0
+
+    # Retract
+    total_retracted_reqs: int = 0
+    num_retracted_reqs: int = 0
+    num_paused_reqs: int = 0
+
+    # Utilization
+    utilization: float = 0.0
+    max_running_requests_under_SLO: Optional[int] = None
+
+    # Engine startup
+    engine_startup_time: float = 0.0
+    engine_load_weights_time: float = 0.0
+
+
+class SchedulerMetricsCollector:
+
+    def __init__(self, labels: Dict[str, str]) -> None:
+        # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+        from prometheus_client import Counter, Gauge, Histogram
+
+        self.labels = labels
+        self.last_log_time = time.perf_counter()
+
+        self.num_running_reqs = Gauge(
+            name="sglang:num_running_reqs",
+            documentation="The number of running requests.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_used_tokens = Gauge(
+            name="sglang:num_used_tokens",
+            documentation="The number of used tokens.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.token_usage = Gauge(
+            name="sglang:token_usage",
+            documentation="The token usage.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.swa_token_usage = Gauge(
+            name="sglang:swa_token_usage",
+            documentation="The token usage for SWA layers.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.gen_throughput = Gauge(
+            name="sglang:gen_throughput",
+            documentation="The generation throughput (token/s).",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_queue_reqs = Gauge(
+            name="sglang:num_queue_reqs",
+            documentation="The number of requests in the waiting queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_grammar_queue_reqs = Gauge(
+            name="sglang:num_grammar_queue_reqs",
+            documentation="The number of requests in the grammar waiting queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_running_reqs_offline_batch = Gauge(
+            name="sglang:num_running_reqs_offline_batch",
+            documentation="The number of running low-priority offline batch requests(label is 'batch').",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.avg_request_queue_latency = Gauge(
+            name="sglang:avg_request_queue_latency",
+            documentation="The average request queue latency for the last batch of requests in seconds.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.cache_hit_rate = Gauge(
+            name="sglang:cache_hit_rate",
+            documentation="The prefix cache hit rate.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        # Speculative decoding
+        self.spec_accept_length = Gauge(
+            name="sglang:spec_accept_length",
+            documentation="The average acceptance length of speculative decoding.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        # PD disaggregation
+        self.num_prefill_prealloc_queue_reqs = Gauge(
+            name="sglang:num_prefill_prealloc_queue_reqs",
+            documentation="The number of requests in the prefill prealloc queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_prefill_inflight_queue_reqs = Gauge(
+            name="sglang:num_prefill_inflight_queue_reqs",
+            documentation="The number of requests in the prefill inflight queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_decode_prealloc_queue_reqs = Gauge(
+            name="sglang:num_decode_prealloc_queue_reqs",
+            documentation="The number of requests in the decode prealloc queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_decode_transfer_queue_reqs = Gauge(
+            name="sglang:num_decode_transfer_queue_reqs",
+            documentation="The number of requests in the decode transfer queue.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_bootstrap_failed_reqs = Counter(
+            name="sglang:num_bootstrap_failed_reqs_total",
+            documentation="The number of bootstrap failed requests.",
+            labelnames=labels.keys(),
+        )
+        self.num_transfer_failed_reqs = Counter(
+            name="sglang:num_transfer_failed_reqs_total",
+            documentation="The number of transfer failed requests.",
+            labelnames=labels.keys(),
+        )
+        self.kv_transfer_speed_gb_s = Gauge(
+            name="sglang:kv_transfer_speed_gb_s",
+            documentation="The transfer speed of the KV cache in GB/s.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.kv_transfer_latency_ms = Gauge(
+            name="sglang:kv_transfer_latency_ms",
+            documentation="The transfer latency of the KV cache in ms.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        # Retract
+        self.total_retracted_reqs = Gauge(
+            name="sglang:total_retracted_reqs",
+            documentation="The total number of retracted requests due to kvcache full.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.num_retracted_reqs = Gauge(
+            name="sglang:num_retracted_reqs",
+            documentation="The number of retracted requests.",
+            labelnames=labels.keys(),
+        )
+        self.num_paused_reqs = Gauge(
+            name="sglang:num_paused_reqs",
+            documentation="The number of paused requests by async weight sync.",
+            labelnames=labels.keys(),
+        )
+
+        # Utilization
+        self.utilization = Gauge(
+            name="sglang:utilization",
+            documentation="The utilization.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.max_running_requests_under_SLO = Gauge(
+            name="sglang:max_running_requests_under_SLO",
+            documentation="The maximum number of running requests under SLO.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        # Engine startup
+        self.engine_startup_time = Gauge(
+            name="sglang:engine_startup_time",
+            documentation="The time taken for the engine to start up.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+        self.engine_load_weights_time = Gauge(
+            name="sglang:engine_load_weights_time",
+            documentation="The time taken for the engine to load weights.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
+        # Additional queueing time histogram
+        self.queue_time = Histogram(
+            name="sglang:queue_time_s",
+            documentation="Histogram of queueing time in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.0,
+                0.1,
+                0.2,
+                0.5,
+                1,
+                2,
+                3,
+                4,
+                5,
+                10,
+                15,
+                20,
+                30,
+                40,
+                50,
+                60,
+                70,
+                80,
+                90,
+                100,
+                200,
+                300,
+                400,
+                500,
+                600,
+                700,
+                800,
+                900,
+                1000,
+                1200,
+                1400,
+                1600,
+                1800,
+                2000,
+                2500,
+                3000,
+            ],
+        )
+
+        # Grammar metrics
+        self.grammar_compilation_time = Histogram(
+            name="sglang:grammar_compilation_time_seconds",
+            documentation="Histogram of grammar compilation time in seconds.",
+            labelnames=labels.keys(),
+            buckets=[
+                0.0,
+                0.01,
+                0.02,
+                0.05,
+                0.1,
+                0.2,
+                0.5,
+                1,
+                2,
+                5,
+                10,
+                20,
+                30,
+                60,
+                90,
+                120,
+                240,
+            ],
+        )
+        self.num_grammar_cache_hit = Counter(
+            name="sglang:num_grammar_cache_hit_total",
+            documentation="Number of grammar cache hits.",
+            labelnames=labels.keys(),
+        )
+        self.num_grammar_aborted = Counter(
+            name="sglang:num_grammar_aborted_total",
+            documentation="Number of grammar aborted requests.",
+            labelnames=labels.keys(),
+        )
+        self.num_grammar_total = Counter(
+            name="sglang:num_grammar_total",
+            documentation="Number of the total grammar requests.",
+            labelnames=labels.keys(),
+        )
+        self.grammar_schema_count = Histogram(
+            name="sglang:grammar_schema_count",
+            documentation="Histogram of grammar schema count.",
+            labelnames=labels.keys(),
+            buckets=[
+                0,
+                1,
+                2,
+                5,
+                10,
+                20,
+                30,
+                40,
+                60,
+                80,
+                100,
+                120,
+                140,
+                160,
+                180,
+                200,
+                300,
+                400,
+                500,
+                700,
+                1000,
+            ],
+        )
+        self.grammar_ebnf_size = Histogram(
+            name="sglang:grammar_ebnf_size",
+            documentation="Histogram of grammar EBNF size.",
+            labelnames=labels.keys(),
+            buckets=[
+                0,
+                50,
+                100,
+                200,
+                300,
+                500,
+                1000,
+                2000,
+                3000,
+                5000,
+                10000,
+                20000,
+                30000,
+                50000,
+                100000,
+            ],
+        )
+
+        tree_traversal_time_buckets = [
+            0.0,
+            0.01,
+            0.02,
+            0.05,
+            0.1,
+            0.2,
+            0.5,
+            1,
+            2,
+            5,
+            10,
+            15,
+            30,
+            60,
+            90,
+            120,
+            240,
+        ]
+        self.grammar_tree_traversal_time_avg = Histogram(
+            name="sglang:grammar_tree_traversal_time_avg",
+            documentation="Histogram of average grammar tree traversal time in seconds.",
+            labelnames=labels.keys(),
+            buckets=tree_traversal_time_buckets,
+        )
+        self.grammar_tree_traversal_time_max = Histogram(
+            name="sglang:grammar_tree_traversal_time_max",
+            documentation="Histogram of max grammar tree traversal time in seconds.",
+            labelnames=labels.keys(),
+            buckets=tree_traversal_time_buckets,
+        )
+
+    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
+        # Convenience function for logging to gauge.
+        gauge.labels(**self.labels).set(data)
+
+    def log_histogram(self, histogram, data: Union[int, float]) -> None:
+        histogram.labels(**self.labels).observe(data)
+
+    def increment_bootstrap_failed_reqs(self) -> None:
+        self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
+
+    def increment_transfer_failed_reqs(self) -> None:
+        self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
+
+    def log_stats(self, stats: SchedulerStats) -> None:
+        self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
+        self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
+        self._log_gauge(self.token_usage, stats.token_usage)
+        self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
+        self._log_gauge(self.gen_throughput, stats.gen_throughput)
+        self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
+        self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
+        self._log_gauge(
+            self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
+        )
+        self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
+
+        # Speculative decoding
+        self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
+
+        # PD disaggregation
+        self._log_gauge(
+            self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
+        )
+        self._log_gauge(
+            self.num_prefill_inflight_queue_reqs, stats.num_prefill_inflight_queue_reqs
+        )
+        self._log_gauge(
+            self.num_decode_prealloc_queue_reqs, stats.num_decode_prealloc_queue_reqs
+        )
+        self._log_gauge(
+            self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
+        )
+        self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
+        self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
+
+        # Retract
+        self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
+        self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
+        self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
+
+        # Utilization
+        self._log_gauge(self.utilization, stats.utilization)
+        if stats.max_running_requests_under_SLO is not None:
+            self._log_gauge(
+                self.max_running_requests_under_SLO,
+                stats.max_running_requests_under_SLO,
+            )
+
+        # Engine startup time
+        self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
+        if stats.engine_load_weights_time is not None:
+            self._log_gauge(
+                self.engine_load_weights_time, stats.engine_load_weights_time
+            )
+
+        self.last_log_time = time.perf_counter()
+
+    def log_grammar_stats(self, grammar_stats) -> None:
+        # Duck-typed GrammarStats to avoid cross-package dependency
+        if getattr(grammar_stats, "compilation_time", None) is not None:
+            self.log_histogram(
+                self.grammar_compilation_time, grammar_stats.compilation_time
+            )
+        if getattr(grammar_stats, "schema_count", None) is not None:
+            self.log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
+        if getattr(grammar_stats, "ebnf_size", None) is not None:
+            self.log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
+        tree_times = getattr(grammar_stats, "tree_traversal_time", None)
+        if tree_times:
+            max_time = max(tree_times)
+            avg_time = sum(tree_times) / len(tree_times)
+            self.log_histogram(self.grammar_tree_traversal_time_max, max_time)
+            self.log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
+        if getattr(grammar_stats, "is_cache_hit", False):
+            self.num_grammar_cache_hit.labels(**self.labels).inc(1)
+        if getattr(grammar_stats, "is_grammar_aborted", False):
+            self.num_grammar_aborted.labels(**self.labels).inc(1)
+        self.num_grammar_total.labels(**self.labels).inc(1)
+
+
+class TokenizerMetricsCollector:
+    def __init__(
+        self,
+        server_args: Optional[ServerArgs] = None,
+        labels: Dict[str, str] = None,
+        bucket_time_to_first_token: Optional[List[float]] = None,
+        bucket_inter_token_latency: Optional[List[float]] = None,
+        bucket_e2e_request_latency: Optional[List[float]] = None,
+        collect_tokens_histogram: bool = False,
+    ) -> None:
+        # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+        from prometheus_client import Counter, Histogram
+
+        self.labels = labels or {}
+        self.collect_tokens_histogram = collect_tokens_histogram
+
+        self.prompt_tokens_total = Counter(
+            name="sglang:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labels.keys(),
+        )
+
+        self.generation_tokens_total = Counter(
+            name="sglang:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labels.keys(),
+        )
+
+        if collect_tokens_histogram:
+            default_bucket_prompt_tokens = [
+                100,
+                300,
+                500,
+                700,
+                1000,
+                1500,
+                2000,
+                3000,
+                4000,
+                5000,
+                6000,
+                7000,
+                8000,
+                9000,
+                10000,
+                12000,
+                15000,
+                20000,
+                22000,
+                25000,
+                30000,
+                35000,
+                40000,
+                66000,
+                99000,
+                132000,
+                300000,
+                600000,
+                900000,
+                1100000,
+            ]
+            self.prompt_tokens_histogram = Histogram(
+                name="sglang:prompt_tokens_histogram",
+                documentation="Histogram of prompt token length.",
+                labelnames=labels.keys(),
+                buckets=generate_buckets(
+                    server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
+                ),
+            )
+            self.generation_tokens_histogram = Histogram(
+                name="sglang:generation_tokens_histogram",
+                documentation="Histogram of generation token length.",
+                labelnames=labels.keys(),
+                buckets=generate_buckets(
+                    server_args.generation_tokens_buckets,
+                    default_bucket_prompt_tokens,
+                ),
+            )
+
+        self.cached_tokens_total = Counter(
+            name="sglang:cached_tokens_total",
+            documentation="Number of cached prompt tokens.",
+            labelnames=labels.keys(),
+        )
+
+        self.num_requests_total = Counter(
+            name="sglang:num_requests_total",
+            documentation="Number of requests processed.",
+            labelnames=labels.keys(),
+        )
+
+        self.num_so_requests_total = Counter(
+            name="sglang:num_so_requests_total",
+            documentation="Number of structured output requests processed.",
+            labelnames=labels.keys(),
+        )
+
+        self.num_aborted_requests_total = Counter(
+            name="sglang:num_aborted_requests",
+            documentation="Number of requests aborted.",
+            labelnames=labels.keys(),
+        )
+
+        if bucket_time_to_first_token is None:
+            bucket_time_to_first_token = [
+                0.1,
+                0.2,
+                0.4,
+                0.6,
+                0.8,
+                1,
+                2,
+                4,
+                6,
+                8,
+                10,
+                20,
+                40,
+                60,
+                80,
+                100,
+                200,
+                400,
+            ]
+
+        if bucket_e2e_request_latency is None:
+            bucket_e2e_request_latency = [
+                0.1,
+                0.2,
+                0.4,
+                0.6,
+                0.8,
+                1,
+                2,
+                4,
+                6,
+                8,
+                10,
+                20,
+                40,
+                60,
+                80,
+                100,
+                200,
+                400,
+                600,
+                1200,
+                1800,
+                2400,
+            ]
+
+        if bucket_inter_token_latency is None:
+            bucket_inter_token_latency = [
+                0.002,
+                0.004,
+                0.006,
+                0.008,
+                0.010,
+                0.015,
+                0.020,
+                0.025,
+                0.030,
+                0.035,
+                0.040,
+                0.060,
+                0.080,
+                0.100,
+                0.200,
+                0.400,
+                0.600,
+                0.800,
+                1.000,
+                2.000,
+                4.000,
+                6.000,
+                8.000,
+            ]
+
+        self.histogram_time_to_first_token = Histogram(
+            name="sglang:time_to_first_token_seconds",
+            documentation="Histogram of time to first token in seconds.",
+            labelnames=labels.keys(),
+            buckets=bucket_time_to_first_token,
+        )
+
+        self.histogram_inter_token_latency_seconds = Histogram(
+            name="sglang:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
+            labelnames=labels.keys(),
+            buckets=bucket_inter_token_latency,
+        )
+
+        self.histogram_e2e_request_latency = Histogram(
+            name="sglang:e2e_request_latency_seconds",
+            documentation="Histogram of End-to-end request latency in seconds",
+            labelnames=labels.keys(),
+            buckets=bucket_e2e_request_latency,
+        )
+
+        # Offline batch specific TTFB histogram
+        self.histogram_time_to_first_token_offline_batch = Histogram(
+            name="sglang:time_to_first_token_seconds_offline_batch",
+            documentation="Histogram of time to first token in seconds for offline batch requests.",
+            labelnames=labels.keys(),
+            buckets=bucket_time_to_first_token,
+        )
+
+    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
+        histogram.labels(**self.labels).observe(data)
+
+    def observe_one_finished_request(
+        self,
+        prompt_tokens: int,
+        generation_tokens: int,
+        cached_tokens: int,
+        e2e_latency: float,
+        has_grammar: bool,
+    ):
+        self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
+        self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
+        if cached_tokens > 0:
+            self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
+        self.num_requests_total.labels(**self.labels).inc(1)
+        if has_grammar:
+            self.num_so_requests_total.labels(**self.labels).inc(1)
+        self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
+        if self.collect_tokens_histogram:
+            self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
+            self._log_histogram(self.generation_tokens_histogram, generation_tokens)
+
+    def observe_time_to_first_token(self, value: float, label: str = ""):
+        if label == "batch":
+            self.histogram_time_to_first_token_offline_batch.labels(
+                **self.labels
+            ).observe(value)
+        else:
+            self.histogram_time_to_first_token.labels(**self.labels).observe(value)
+
+    def check_time_to_first_token_straggler(self, value: float) -> bool:
+        his = self.histogram_time_to_first_token.labels(**self.labels)
+        total_observations = sum(bucket._value for bucket in his._buckets)
+        if total_observations < 100:
+            return False
+        p99_threshold = total_observations * 0.99
+        cumulative_count = 0
+        for i, bucket in enumerate(his._buckets):
+            cumulative_count += bucket._value
+            if cumulative_count > p99_threshold:
+                return value >= his._upper_bounds[i]
+        return False
+
+    def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
+        adjusted_interval = internval / num_new_tokens
+
+        # A faster version of the Histogram::observe which observes multiple values at the same time.
+        # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
+        his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
+        his._sum.inc(internval)
+
+        for i, bound in enumerate(his._upper_bounds):
+            if adjusted_interval <= bound:
+                his._buckets[i].inc(num_new_tokens)
+                break
+
+    def observe_one_aborted_request(self):
+        self.num_aborted_requests_total.labels(**self.labels).inc(1)
+
+
+@dataclass
+class StorageMetrics:
+    prefetch_pgs: List[int] = field(default_factory=list)
+    backup_pgs: List[int] = field(default_factory=list)
+    prefetch_bandwidth: List[float] = field(default_factory=list)
+    backup_bandwidth: List[float] = field(default_factory=list)
+
+
+class StorageMetricsCollector:
+    def __init__(
+        self,
+        labels: Dict[str, str],
+    ):
+        from prometheus_client import Counter, Histogram
+
+        self.labels = labels
+
+        self.prefetched_tokens_total = Counter(
+            name="sglang:prefetched_tokens_total",
+            documentation="Number of prefetched prompt tokens.",
+            labelnames=labels.keys(),
+        )
+
+        self.backuped_tokens_total = Counter(
+            name="sglang:backuped_tokens_total",
+            documentation="Number of backuped tokens.",
+            labelnames=labels.keys(),
+        )
+
+        bucket_io = [
+            1,
+            5,
+            10,
+            50,
+            100,
+        ]
+
+        bucket_bandwidth = [
+            0.1,
+            0.5,
+            1,
+            5,
+            10,
+            50,
+            100,
+        ]
+
+        self.histogram_prefetch_pgs = Histogram(
+            name="sglang:prefetch_pgs",
+            documentation="Histogram of prefetch pages of batches.",
+            labelnames=labels.keys(),
+            buckets=bucket_io,
+        )
+
+        self.histogram_backup_pgs = Histogram(
+            name="sglang:backup_pgs",
+            documentation="Histogram of backup pages of batches.",
+            labelnames=labels.keys(),
+            buckets=bucket_io,
+        )
+
+        self.histogram_prefetch_bandwidth = Histogram(
+            name="sglang:prefetch_bandwidth",
+            documentation="Histogram of prefetch bandwidth in GB/s.",
+            labelnames=labels.keys(),
+            buckets=bucket_bandwidth,
+        )
+
+        self.histogram_backup_bandwidth = Histogram(
+            name="sglang:backup_bandwidth",
+            documentation="Histogram of backup bandwidth in GB/s.",
+            labelnames=labels.keys(),
+            buckets=bucket_bandwidth,
+        )
+
+    def log_prefetched_tokens(self, prefetched_tokens: int):
+        if prefetched_tokens > 0:
+            self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)
+
+    def log_backuped_tokens(self, backuped_tokens: int):
+        if backuped_tokens > 0:
+            self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)
+
+    def _log_histogram(self, histogram, data: Union[int, float]):
+        histogram.labels(**self.labels).observe(data)
+
+    def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
+        if storage_metrics is None:
+            return
+
+        assert isinstance(storage_metrics, StorageMetrics)
+
+        for v in storage_metrics.prefetch_pgs:
+            self._log_histogram(self.histogram_prefetch_pgs, v)
+        for v in storage_metrics.backup_pgs:
+            self._log_histogram(self.histogram_backup_pgs, v)
+        for v in storage_metrics.prefetch_bandwidth:
+            self._log_histogram(self.histogram_prefetch_bandwidth, v)
+        for v in storage_metrics.backup_bandwidth:
+            self._log_histogram(self.histogram_backup_bandwidth, v)
diff --git a/python/sglang/srt/metrics/func_timer.py b/python/sglang/srt/metrics/func_timer.py
new file mode 100644
index 000000000..e965d25f8
--- /dev/null
+++ b/python/sglang/srt/metrics/func_timer.py
@@ -0,0 +1,106 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Records the latency of some functions
+"""
+
+import asyncio
+import time
+from functools import wraps
+from typing import Any, Callable, List, Optional
+
+enable_metrics = False
+
+
+def enable_func_timer():
+    # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+    from prometheus_client import Histogram
+
+    global enable_metrics, FUNC_LATENCY
+    enable_metrics = True
+
+    FUNC_LATENCY = Histogram(
+        "sglang:func_latency_seconds",
+        "Function latency in seconds",
+        # captures latency in range [50ms - ~50s]
+        buckets=exponential_buckets(start=0.05, width=1.5, length=18),
+        labelnames=["name"],
+    )
+
+
+FUNC_LATENCY = None
+
+
+def exponential_buckets(start: float, width: float, length: int) -> List[float]:
+    buckets = []
+    for i in range(length):
+        buckets.append(start * (width**i))
+    return buckets
+
+
+def time_func_latency(
+    func: Callable = None, name: Optional[str] = None
+) -> Callable[..., Any]:
+    """
+    A decorator to observe the latency of a function's execution. Supports both sync and async functions.
+
+    NOTE: We use our own implementation of a timer decorator since prometheus_client does not support async
+    context manager yet.
+
+    Overhead: The overhead introduced here in case of an async function could likely be because of `await` introduced
+    which will return in another coroutine object creation and under heavy load could see longer wall time
+    (scheduling delays due to introduction of another awaitable).
+    """
+
+    def measure(func: Callable[..., Any]) -> Callable[..., Any]:
+        nonlocal name
+
+        name = name or func.__name__
+
+        @wraps(func)
+        async def async_wrapper(*args, **kwargs):
+            if not enable_metrics:
+                return await func(*args, **kwargs)
+
+            metric = FUNC_LATENCY
+            start = time.monotonic()
+            ret = func(*args, **kwargs)
+            if isinstance(ret, asyncio.Future) or asyncio.iscoroutine(ret):
+                try:
+                    ret = await ret
+                finally:
+                    metric.labels(name=name).observe(time.monotonic() - start)
+            return ret
+
+        @wraps(func)
+        def sync_wrapper(*args, **kwargs):
+            if not enable_metrics:
+                return func(*args, **kwargs)
+
+            metric = FUNC_LATENCY
+            start = time.monotonic()
+            try:
+                ret = func(*args, **kwargs)
+            finally:
+                metric.labels(name=name).observe(time.monotonic() - start)
+            return ret
+
+        if asyncio.iscoroutinefunction(func):
+            return async_wrapper
+        return sync_wrapper
+
+    if func:
+        return measure(func)
+    else:
+        return measure
diff --git a/python/sglang/srt/metrics/startup_func_log_and_timer.py b/python/sglang/srt/metrics/startup_func_log_and_timer.py
new file mode 100644
index 000000000..752daccbd
--- /dev/null
+++ b/python/sglang/srt/metrics/startup_func_log_and_timer.py
@@ -0,0 +1,150 @@
+"""
+Records startup latency breakdown by context using gauge metrics in seconds
+"""
+
+import logging
+import time
+from contextlib import contextmanager
+from functools import wraps
+from typing import Any, Callable, Dict, Generator, Optional
+
+logger = logging.getLogger(__name__)
+
+enable_startup_metrics = False
+STARTUP_LATENCY_SECONDS = None
+# Track maximum durations for each context
+_max_durations: Dict[str, float] = {}
+
+
+def enable_startup_timer():
+    """Initialize startup latency metrics when metrics are enabled"""
+    # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+    from prometheus_client import Gauge
+
+    global enable_startup_metrics, STARTUP_LATENCY_SECONDS
+    enable_startup_metrics = True
+
+    STARTUP_LATENCY_SECONDS = Gauge(
+        "sglang:startup_latency_breakdown_seconds_max",
+        "Startup latency breakdown in seconds by context, only records the maximum duration if the context is called multiple times.",
+        labelnames=["context"],
+        multiprocess_mode="mostrecent",
+    )
+
+
+def set_startup_metric(context: str, value: float, should_log: bool = True):
+    """Set the startup metric for a given context"""
+    if should_log:
+        logger.info(f"Setting startup metric: {context} took {value:.3f}s")
+
+    if not enable_startup_metrics:
+        return
+    current_max = _max_durations.get(context, 0.0)
+    if value > current_max:
+        _max_durations[context] = value
+        STARTUP_LATENCY_SECONDS.labels(context=context).set(value)
+
+
+def reset_startup_timers():
+    """Reset all recorded maximum durations. Useful for testing or reinitialization."""
+    global _max_durations
+    _max_durations.clear()
+
+
+def get_max_duration(context: str) -> Optional[float]:
+    """Get the maximum recorded duration for a context name."""
+    return _max_durations.get(context)
+
+
+@contextmanager
+def startup_timer(name: str, log_only: bool = False) -> Generator[None, None, None]:
+    """
+    Context manager to measure startup latency for arbitrary code blocks.
+    Only records the maximum duration if the context is called multiple times.
+
+    Usage:
+        with startup_timer("model_loading"):
+            # model loading code
+            model = load_model()
+
+        with startup_timer("memory_allocation"):
+            # memory setup code
+            allocate_memory()
+    """
+    start_time = time.monotonic()
+    try:
+        yield
+    finally:
+        duration_seconds = time.monotonic() - start_time
+
+        # Track the maximum duration for this context name
+        current_max = _max_durations.get(name, 0.0)
+        is_new_max = duration_seconds > current_max
+
+        if is_new_max:
+            _max_durations[name] = duration_seconds
+
+            # Only update Prometheus gauge if this is a new maximum
+            if enable_startup_metrics and not log_only:
+                STARTUP_LATENCY_SECONDS.labels(context=name).set(duration_seconds)
+
+        # Log with indication if this was a new max
+        logger.info(f"Startup timing: {name} took {duration_seconds:.3f}s")
+
+
+def time_startup_latency(
+    func: Callable = None, name: Optional[str] = None, log_only: bool = False
+) -> Callable[..., Any]:
+    """
+    A decorator to measure startup context latency and record it in seconds.
+    Only records the maximum duration if the context is called multiple times.
+
+    Usage:
+        @time_startup_latency
+        def load_model():
+            # model loading code
+
+        @time_startup_latency(name="custom_init")
+        def initialize_something():
+            # initialization code
+
+        @time_startup_latency(name="debug_only", log_only=True)
+        def debug_function():
+            # This will only log, not record to Prometheus
+    """
+
+    def measure(func: Callable[..., Any]) -> Callable[..., Any]:
+        nonlocal name
+        name = name or func.__name__
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            start_time = time.monotonic()
+            try:
+                result = func(*args, **kwargs)
+                return result
+            finally:
+                duration_seconds = time.monotonic() - start_time
+
+                # Track the maximum duration for this context name
+                current_max = _max_durations.get(name, 0.0)
+                is_new_max = duration_seconds > current_max
+
+                if is_new_max:
+                    _max_durations[name] = duration_seconds
+
+                    # Only update Prometheus gauge if this is a new maximum
+                    if enable_startup_metrics and not log_only:
+                        STARTUP_LATENCY_SECONDS.labels(context=name).set(
+                            duration_seconds
+                        )
+
+                # Log the timing
+                logger.info(f"Startup timing: {name} took {duration_seconds:.3f}s")
+
+        return wrapper
+
+    if func:
+        return measure(func)
+    else:
+        return measure
diff --git a/python/sglang/srt/metrics/utils.py b/python/sglang/srt/metrics/utils.py
new file mode 100644
index 000000000..ffc7e1066
--- /dev/null
+++ b/python/sglang/srt/metrics/utils.py
@@ -0,0 +1,48 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for Prometheus Metrics."""
+import math
+from typing import List
+
+
+def two_sides_exponential_buckets(
+    middle: float, base: float, count: int
+) -> List[float]:
+    buckets = []
+    half_count = math.ceil(count / 2)
+    distance = 1
+    buckets.append(middle)
+    for i in range(half_count):
+        distance *= base
+        buckets.append(middle + distance)
+        buckets.append(max(0, middle - distance))
+    return sorted(set(buckets))
+
+
+def generate_buckets(
+    buckets_rule: List[str], default_buckets: List[float]
+) -> List[float]:
+    if not buckets_rule:
+        buckets_rule = ["default"]
+
+    assert len(buckets_rule) > 0
+    rule = buckets_rule[0]
+    if rule == "tse":
+        middle, base, count = buckets_rule[1:]
+        assert float(base) > 1.0, "Base must be greater than 1.0"
+        return two_sides_exponential_buckets(float(middle), float(base), int(count))
+    if rule == "default":
+        return sorted(set(default_buckets))
+    assert rule == "customer"
+    return sorted(set([float(x) for x in buckets_rule[1:]]))
diff --git a/python/sglang/srt/model_executor/cpu_graph_runner.py b/python/sglang/srt/model_executor/cpu_graph_runner.py
new file mode 100644
index 000000000..bc1e5c5b8
--- /dev/null
+++ b/python/sglang/srt/model_executor/cpu_graph_runner.py
@@ -0,0 +1,640 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with cpu torch compile."""
+
+# The implementation of CPUGraphRunner follows the CudaGraphRunner
+
+from __future__ import annotations
+
+import logging
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Optional, Union
+
+import psutil
+import torch
+import tqdm
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+)
+from sglang.srt.patch_torch import monkey_patch_torch_compile
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    log_info_on_rank0,
+    require_attn_tp_gather,
+    require_gathered_buffer,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+)
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+@contextmanager
+def patch_model(
+    model: torch.nn.Module,
+    enable_compile: bool,
+    num_tokens: int,
+    tp_group: GroupCoordinator,
+):
+    """Patch the model to make it compatible with torch.compile"""
+    backup_ca_comm = None
+
+    try:
+        if enable_compile:
+            backup_ca_comm = tp_group.ca_comm
+            # Use custom-allreduce here.
+            # We found the custom allreduce is much faster than the built-in allreduce in torch,
+            # even with ENABLE_INTRA_NODE_COMM=1.
+            # tp_group.ca_comm = None
+            yield torch.compile(
+                torch.no_grad()(model.forward),
+                dynamic=False,
+            )
+        else:
+            yield model.forward
+    finally:
+        if enable_compile:
+            tp_group.ca_comm = backup_ca_comm
+
+
+def set_torch_compile_config():
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+    torch._inductor.config.freezing = True
+    torch._dynamo.config.accumulated_cache_size_limit = 1024
+    if hasattr(torch._dynamo.config, "cache_size_limit"):
+        torch._dynamo.config.cache_size_limit = 1024
+    monkey_patch_torch_compile()
+
+
+def get_batch_sizes_to_capture(model_runner: ModelRunner):
+    server_args = model_runner.server_args
+    # cpu torch compile only speeds up decoding by
+    # reducing python overhead when bs is small
+    capture_bs = list(range(1, 17))
+    capture_bs = [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+    capture_bs = list(sorted(set(capture_bs)))
+    assert len(capture_bs) > 0 and capture_bs[0] > 0, f"{capture_bs=}"
+    return capture_bs
+
+
+def register_fake_ops():
+    """
+    Registers fake/meta implementations for all custom sgl_kernel CPU operators
+    using torch.library.register_fake to support torch.compile
+    """
+
+    none_return_ops = [
+        "shm_allreduce",
+        "bmm_cpu",
+        "fused_add_rmsnorm_cpu",
+        "decode_attention_cpu",
+        "extend_attention_cpu",
+    ]
+    for op in none_return_ops:
+
+        @torch.library.register_fake(f"sgl_kernel::{op}")
+        def _(*args, **kwargs):
+            return
+
+    for op in [
+        "rmsnorm_cpu",
+        "l2norm_cpu",
+        "fused_experts_cpu",
+        "shared_expert_cpu",
+    ]:
+
+        @torch.library.register_fake(f"sgl_kernel::{op}")
+        def _(input, *args, **kwargs):
+            return torch.empty_like(input)
+
+    @torch.library.register_fake("sgl_kernel::qkv_proj_with_rope")
+    def _(
+        hidden_states,
+        q_a_proj_weight,
+        q_b_proj_weight,
+        kv_a_proj_weight,
+        w_kc,
+        q_a_layernorm_weight,
+        kv_a_layernorm_weight,
+        positions,
+        cos_sin_cache,
+        eps,
+        use_int8_w8a8,
+        use_fp8_w8a16,
+        q_a_proj_scale,
+        q_b_proj_scale,
+        kv_a_proj_scale,
+        is_vnni,
+        block_size,
+    ):
+        num_seqs = hidden_states.shape[0]
+        num_heads = w_kc.shape[0]
+        kv_lora_rank = w_kc.shape[1]
+        qk_rope_head_dim = kv_a_proj_weight.shape[0] - kv_lora_rank
+        q_input = torch.empty(
+            num_seqs,
+            num_heads,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        k_input = torch.empty(
+            num_seqs,
+            1,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        v_input = k_input.narrow(-1, 0, kv_lora_rank)
+        return q_input, k_input, v_input
+
+    @torch.library.register_fake("sgl_kernel::rotary_embedding_cpu")
+    def _(positions, query, key, head_size, cos_sin_cache, is_neox):
+        if query.ndim == 2:
+            return query, key
+        else:
+            return torch.empty_like(query), torch.empty_like(key)
+
+    @torch.library.register_fake("sgl_kernel::qkv_proj_with_rope_fused_weight")
+    def _(
+        hidden_states,
+        q_a_proj_weight,
+        q_b_proj_weight,
+        w_kc,
+        q_a_layernorm_weight,
+        kv_a_layernorm_weight,
+        positions,
+        cos_sin_cache,
+        eps,
+        use_int8_w8a8,
+        use_fp8_w8a16,
+        qkv_a_proj_scale,
+        q_b_proj_scale,
+        is_vnni,
+        block_size,
+        q_lora_rank,
+        kv_lora_rank,
+        qk_rope_head_dim,
+    ):
+        num_seqs = hidden_states.shape[0]
+        num_heads = w_kc.shape[0]
+        kv_lora_rank = w_kc.shape[1]
+        weight_chunks = torch.split(
+            q_a_proj_weight, [q_lora_rank, kv_lora_rank + qk_rope_head_dim], dim=0
+        )
+        qk_rope_head_dim = weight_chunks[1].shape[0] - kv_lora_rank
+        q_input = torch.empty(
+            num_seqs,
+            num_heads,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        k_input = torch.empty(
+            num_seqs,
+            1,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        v_input = k_input.narrow(-1, 0, kv_lora_rank)
+        return q_input, k_input, v_input
+
+    @torch.library.register_fake("sgl_kernel::weight_packed_linear")
+    def _(x, weight, bias, is_vnni):
+        return x.new_empty(x.shape[0], weight.shape[0])
+
+    @torch.library.register_fake("sgl_kernel::per_token_quant_int8_cpu")
+    def _(input):
+        M = input.shape[0]
+        K = input.shape[1]
+        Aq = input.new_empty(M, K, dtype=torch.int8)
+        As = input.new_empty(M, dtype=torch.float32)
+        return Aq, As
+
+    @torch.library.register_fake("sgl_kernel::int8_scaled_mm_cpu")
+    def _(mat1, mat2, scales1, scales2, bias, out_dtype, is_vnni):
+        M = mat1.shape[0]
+        N = mat2.shape[0]
+        out = mat1.new_empty(M, N, dtype=out_dtype)
+        return out
+
+    @torch.library.register_fake("sgl_kernel::grouped_topk_cpu")
+    def _(
+        hidden_states,
+        gating_output,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    ):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        device = hidden_states.device
+        topk_weights = torch.empty(shape, device=device, dtype=torch.float32)
+        topk_ids = torch.empty(shape, device=device, dtype=torch.int)
+        return topk_weights, topk_ids
+
+    @torch.library.register_fake("sgl_kernel::biased_grouped_topk_cpu")
+    def _(
+        hidden_states,
+        gating_output,
+        correction_bias,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    ):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        device = hidden_states.device
+        topk_weights = torch.empty(shape, device=device, dtype=torch.float32)
+        topk_ids = torch.empty(shape, device=device, dtype=torch.int)
+        return topk_weights, topk_ids
+
+    @torch.library.register_fake("sgl_kernel::topk_sigmoid_cpu")
+    def _(hidden_states, gating_output, topk, renormalize):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        return (
+            torch.empty(shape, device=hidden_states.device, dtype=torch.float),
+            torch.empty(shape, device=hidden_states.device, dtype=torch.int),
+        )
+
+    @torch.library.register_fake("sgl_kernel::topk_softmax_cpu")
+    def _(
+        hidden_states,
+        gating_output,
+        topk,
+        renormalize,
+    ):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        return (
+            torch.empty(shape, device=hidden_states.device, dtype=torch.float),
+            torch.empty(shape, device=hidden_states.device, dtype=torch.int),
+        )
+
+    @torch.library.register_fake("sgl_kernel::silu_and_mul_cpu")
+    def _(input):
+        return input.new_empty(input.shape[0], input.shape[1] // 2)
+
+    @torch.library.register_fake("sgl_kernel::int8_scaled_mm_with_quant")
+    def _(
+        mat1,
+        mat2,
+        scales2,
+        bias,
+        out_dtype,
+        is_vnni,
+    ):
+        M = mat1.shape[0]
+        N = mat2.shape[0]
+        return mat1.new_empty(M, N, dtype=out_dtype)
+
+    @torch.library.register_fake("sgl_kernel::fp8_scaled_mm_cpu")
+    def _(
+        mat1,
+        mat2,
+        scales2,
+        block_size,
+        bias,
+        out_dtype,
+        is_vnni,
+    ):
+        M = mat1.shape[0]
+        N = mat2.shape[0]
+        return mat1.new_empty(M, N, dtype=out_dtype)
+
+
+# TODO Remove unnecessary settings for CPUGraphRunner.
+# Re-abstract the graph runner and restructure CPUGraphRunner to reuse the same logic.
+class CPUGraphRunner:
+    """A CPUGraphRunner runs the forward pass of a model with cpu torch.compile."""
+
+    def __init__(self, model_runner: ModelRunner):
+        # Parse args
+        self.model_runner = model_runner
+        self.device = model_runner.device
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
+        self.require_gathered_buffer = require_gathered_buffer(model_runner.server_args)
+        self.require_mlp_tp_gather = require_mlp_tp_gather(model_runner.server_args)
+        self.require_mlp_sync = require_mlp_sync(model_runner.server_args)
+        self.require_attn_tp_gather = require_attn_tp_gather(model_runner.server_args)
+        self.enable_two_batch_overlap = (
+            model_runner.server_args.enable_two_batch_overlap
+        )
+        self.speculative_algorithm = model_runner.server_args.speculative_algorithm
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
+        self.tp_size = model_runner.server_args.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.pp_size = model_runner.server_args.pp_size
+
+        self.capture_forward_mode = ForwardMode.DECODE
+        self.capture_hidden_mode = CaptureHiddenMode.NULL
+        self.num_tokens_per_bs = 1
+
+        # If returning hidden states is enabled, set initial capture hidden mode to full to avoid double-capture on startup
+        if model_runner.server_args.enable_return_hidden_states:
+            self.capture_hidden_mode = CaptureHiddenMode.FULL
+
+        assert (
+            not self.model_runner.server_args.enable_lora
+        ), "CPUGraphRunner does not support LoRA yet."
+        assert (
+            not self.enable_two_batch_overlap
+        ), "CPUGraphRunner does not support two batch overlap yet."
+        assert (
+            not self.require_mlp_tp_gather
+        ), "CPUGraphRunner does not support MLP TP gather yet."
+        assert (
+            not self.require_mlp_sync
+        ), "CPUGraphRunner does not support MLP sync yet."
+        assert (
+            not self.require_gathered_buffer
+        ), "CPUGraphRunner does not support gathered buffer yet."
+        assert (
+            model_runner.spec_algorithm == SpeculativeAlgorithm.NONE
+        ), "CPUGraphRunner does not support speculative inference yet."
+        # TODO add compile support for encoder-decoder models
+        assert (
+            not self.is_encoder_decoder
+        ), "CPUGraphRunner does not support encoder-decoder models yet."
+        assert self.dp_size == 1, "CPUGraphRunner does not support DP yet."
+        assert self.pp_size == 1, "CPUGraphRunner does not support PP yet."
+
+        # Batch sizes to capture
+        self.capture_bs = get_batch_sizes_to_capture(model_runner)
+        log_info_on_rank0(logger, f"Capture cpu graph bs {self.capture_bs}")
+        # Attention backend
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+
+        self.seq_len_fill_value = (
+            self.model_runner.attn_backend.get_graph_seq_len_fill_value()
+        )
+
+        if self.enable_torch_compile:
+            register_fake_ops()
+            set_torch_compile_config()
+
+        # Graph inputs
+        with torch.device(self.device):
+            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int64)
+            self.seq_lens = torch.full(
+                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int64
+            )
+            self.out_cache_loc = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int64)
+            self.num_token_non_padded = torch.zeros((1,), dtype=torch.int64)
+            self.custom_mask = torch.ones(
+                (
+                    (self.seq_lens.sum().item() + self.max_num_token)
+                    * self.num_tokens_per_bs
+                ),
+                dtype=torch.bool,
+                device=self.device,
+            )
+
+        # Capture
+        try:
+            self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture CPU graph failed: {e}\n{CPU_GRAPH_CAPTURE_FAILED_MSG}"
+            )
+
+    def can_run(self, forward_batch: ForwardBatch):
+        is_bs_supported = forward_batch.batch_size in self.graphs
+
+        requested_capture_hidden_mode = max(
+            forward_batch.capture_hidden_mode,
+            (
+                forward_batch.spec_info.capture_hidden_mode
+                if getattr(forward_batch.spec_info, "capture_hidden_mode", None)
+                is not None
+                else CaptureHiddenMode.NULL
+            ),
+        )
+        capture_hidden_mode_matches = (
+            requested_capture_hidden_mode == CaptureHiddenMode.NULL
+            or requested_capture_hidden_mode == self.capture_hidden_mode
+        )
+
+        return is_bs_supported and capture_hidden_mode_matches
+
+    def capture(self) -> None:
+        capture_range = (
+            tqdm.tqdm(list(reversed(self.capture_bs)))
+            if get_tensor_model_parallel_rank() == 0
+            else reversed(self.capture_bs)
+        )
+        for bs in capture_range:
+            if get_tensor_model_parallel_rank() == 0:
+                avail_mem = psutil.virtual_memory().available / (1 << 30)
+                capture_range.set_description(
+                    f"Capturing batches ({bs=} {avail_mem=:.2f} GB)"
+                )
+
+            with patch_model(
+                self.model_runner.model,
+                bs in self.capture_bs,
+                num_tokens=bs * self.num_tokens_per_bs,
+                tp_group=self.model_runner.tp_group,
+            ) as forward:
+                (
+                    graph,
+                    output_buffers,
+                ) = self.capture_one_batch_size(bs, forward)
+                self.graphs[bs] = graph
+                self.output_buffers[bs] = output_buffers
+
+    def capture_one_batch_size(self, bs: int, forward: Callable):
+        num_tokens = bs * self.num_tokens_per_bs
+
+        # Graph inputs
+        input_ids = self.input_ids[:num_tokens]
+        req_pool_indices = self.req_pool_indices[:bs]
+        seq_lens = self.seq_lens[:bs]
+        out_cache_loc = self.out_cache_loc[:num_tokens]
+        positions = self.positions[:num_tokens]
+        mrope_positions = self.mrope_positions[:, :bs]
+        self.num_token_non_padded[...] = num_tokens
+
+        spec_info = self.get_spec_info(num_tokens)
+        if self.capture_hidden_mode != CaptureHiddenMode.FULL:
+            self.capture_hidden_mode = (
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            )
+
+        forward_batch = ForwardBatch(
+            forward_mode=self.capture_forward_mode,
+            batch_size=bs,
+            input_ids=input_ids,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            attn_backend=self.model_runner.attn_backend,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum().item(),
+            return_logprob=False,
+            positions=positions,
+            mrope_positions=mrope_positions,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=self.capture_hidden_mode,
+            num_token_non_padded=self.num_token_non_padded,
+            global_forward_mode=self.capture_forward_mode,
+        )
+
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
+        # Do infernence to avoid setting attr at runtime, e.g.,
+        # self.attn_mha.kv_b_proj = self.kv_b_proj for full graph compile on CPU
+        self.model_runner.model.forward(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+        )
+
+        # Run and capture
+        def run_once():
+            # Clean intermediate result cache for DP attention
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            logits_output_or_pp_proxy_tensors = forward(
+                input_ids,
+                forward_batch.positions,
+                forward_batch,
+            )
+            return logits_output_or_pp_proxy_tensors
+
+        with torch.no_grad():
+            for _ in range(2):
+                self.model_runner.tp_group.barrier()
+                out = run_once()
+            return forward, out
+
+    def recapture_if_needed(self, forward_batch: ForwardBatch):
+
+        # If the required capture_hidden_mode changes, we need to recapture the graph
+
+        # These are the different factors that can influence the capture_hidden_mode
+        capture_hidden_mode_required_by_forward_batch = (
+            forward_batch.capture_hidden_mode
+        )
+        capture_hidden_mode_required_by_spec_info = getattr(
+            forward_batch.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL
+        )
+        capture_hidden_mode_required_for_returning_hidden_states = (
+            CaptureHiddenMode.FULL
+            if self.model_runner.server_args.enable_return_hidden_states
+            else CaptureHiddenMode.NULL
+        )
+
+        # Determine the highest capture_hidden_mode required
+        # (If we have FULL, we can emulate LAST or NULL)
+        # (If we have LAST, we can emulate NULL)
+        required_capture_hidden_mode = max(
+            capture_hidden_mode_required_by_forward_batch,
+            capture_hidden_mode_required_by_spec_info,
+            capture_hidden_mode_required_for_returning_hidden_states,
+        )
+
+        # If the current hidden mode is no longer aligned with the required hidden mode, we need to set it to what is required and re-capture
+        if self.capture_hidden_mode != required_capture_hidden_mode:
+            self.capture_hidden_mode = required_capture_hidden_mode
+            self.capture()
+
+    # TODO add padding support for CPUGraphRunner
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        assert (
+            pp_proxy_tensors is None
+        ), "PPProxyTensors is not supported in CPUGraphRunner yet."
+        self.recapture_if_needed(forward_batch)
+        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
+        output = self.graphs[forward_batch.batch_size](
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+        )
+        return output
+
+    def get_spec_info(self, num_tokens: int):
+        spec_info = None
+        if self.model_runner.spec_algorithm.is_eagle():
+            from sglang.srt.speculative.eagle_utils import EagleVerifyInput
+
+            if self.model_runner.is_draft_worker:
+                raise RuntimeError("This should not happen.")
+            else:
+                spec_info = EagleVerifyInput(
+                    draft_token=None,
+                    custom_mask=self.custom_mask,
+                    positions=None,
+                    retrive_index=None,
+                    retrive_next_token=None,
+                    retrive_next_sibling=None,
+                    retrive_cum_len=None,
+                    spec_steps=self.model_runner.server_args.speculative_num_steps,
+                    topk=self.model_runner.server_args.speculative_eagle_topk,
+                    draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
+                    capture_hidden_mode=CaptureHiddenMode.FULL,
+                    seq_lens_sum=None,
+                    seq_lens_cpu=None,
+                )
+
+        return spec_info
+
+
+CPU_GRAPH_CAPTURE_FAILED_MSG = (
+    "Possible solutions:\n"
+    "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+    "2. set --torch-compile-max-bs to a smaller value (e.g., 8)\n"
+    "3. disable torch compile by not using --enable-torch-compile\n"
+    "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+)
diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
new file mode 100644
index 000000000..14da84e42
--- /dev/null
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -0,0 +1,868 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with cuda graph and torch.compile."""
+
+from __future__ import annotations
+
+import bisect
+import gc
+import inspect
+import logging
+import os
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Optional, Union
+
+import torch
+import tqdm
+from torch.profiler import ProfilerActivity, profile
+
+from sglang.srt.custom_op import CustomOp
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    set_graph_pool_id,
+)
+from sglang.srt.distributed.parallel_state import GroupCoordinator, graph_capture
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    set_dp_buffer_len,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.torchao_utils import save_gemlite_cache
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+    enable_num_token_non_padded,
+)
+from sglang.srt.patch_torch import monkey_patch_torch_compile
+from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
+from sglang.srt.utils import (
+    empty_context,
+    get_available_gpu_memory,
+    get_device_memory_capacity,
+    log_info_on_rank0,
+    require_attn_tp_gather,
+    require_gathered_buffer,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+)
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+# Detect whether the current forward pass is in capture mode
+is_capture_mode = False
+
+
+def get_is_capture_mode():
+    return is_capture_mode
+
+
+@contextmanager
+def model_capture_mode():
+    global is_capture_mode
+    is_capture_mode = True
+
+    yield
+
+    is_capture_mode = False
+
+
+@contextmanager
+def freeze_gc(enable_cudagraph_gc: bool):
+    """
+    Optimize garbage collection during CUDA graph capture.
+    Clean up, then freeze all remaining objects from being included
+    in future collections if GC is disabled during capture.
+    """
+    gc.collect()
+    should_freeze = not enable_cudagraph_gc
+    if should_freeze:
+        gc.freeze()
+    try:
+        yield
+    finally:
+        if should_freeze:
+            gc.unfreeze()
+
+
+def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
+    for sub in model._modules.values():
+        if isinstance(sub, CustomOp):
+            if reverse:
+                sub.leave_torch_compile()
+            else:
+                sub.enter_torch_compile(num_tokens=num_tokens)
+        if isinstance(sub, torch.nn.Module):
+            _to_torch(sub, reverse, num_tokens)
+
+
+@contextmanager
+def patch_model(
+    model: torch.nn.Module,
+    enable_compile: bool,
+    num_tokens: int,
+    tp_group: GroupCoordinator,
+):
+    """Patch the model to make it compatible with with torch.compile"""
+    backup_ca_comm = None
+
+    try:
+        if enable_compile:
+            _to_torch(model, reverse=False, num_tokens=num_tokens)
+            backup_ca_comm = tp_group.ca_comm
+            # Use custom-allreduce here.
+            # We found the custom allreduce is much faster than the built-in allreduce in torch,
+            # even with ENABLE_INTRA_NODE_COMM=1.
+            # tp_group.ca_comm = None
+            yield torch.compile(
+                torch.no_grad()(model.forward),
+                mode=os.environ.get(
+                    "SGLANG_TORCH_COMPILE_MODE", "max-autotune-no-cudagraphs"
+                ),
+                dynamic=False,
+            )
+        else:
+            yield model.forward
+    finally:
+        if enable_compile:
+            _to_torch(model, reverse=True, num_tokens=num_tokens)
+            tp_group.ca_comm = backup_ca_comm
+
+
+def set_torch_compile_config():
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.coordinate_descent_tuning = True
+    torch._inductor.config.triton.unique_kernel_names = True
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+    # FIXME: tmp workaround
+    torch._dynamo.config.accumulated_cache_size_limit = 1024
+    if hasattr(torch._dynamo.config, "cache_size_limit"):
+        torch._dynamo.config.cache_size_limit = 1024
+
+    monkey_patch_torch_compile()
+
+
+def get_batch_sizes_to_capture(model_runner: ModelRunner):
+    server_args = model_runner.server_args
+    capture_bs = server_args.cuda_graph_bs
+
+    if capture_bs is None:
+        if server_args.speculative_algorithm is None:
+            if server_args.disable_cuda_graph_padding:
+                capture_bs = list(range(1, 33)) + list(range(48, 161, 16))
+            else:
+                capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
+        else:
+            # Since speculative decoding requires more cuda graph memory, we
+            # capture less.
+            capture_bs = (
+                list(range(1, 9))
+                + list(range(10, 33, 2))
+                + list(range(40, 64, 8))
+                + list(range(80, 161, 16))
+            )
+
+        gpu_mem = get_device_memory_capacity()
+        if gpu_mem is not None:
+            if gpu_mem > 90 * 1024:  # H200, H20
+                capture_bs += list(range(160, 257, 8))
+            if gpu_mem > 160 * 1000:  # B200, MI300
+                capture_bs += list(range(256, 513, 16))
+
+    if max(capture_bs) > model_runner.req_to_token_pool.size:
+        # In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests
+        # is very small. We add more values here to make sure we capture the maximum bs.
+        capture_bs += [model_runner.req_to_token_pool.size]
+
+    mul_base = 1
+
+    if server_args.enable_two_batch_overlap:
+        mul_base *= 2
+
+    if require_gathered_buffer(server_args):
+        mul_base *= get_attention_tp_size()
+
+    capture_bs = [bs for bs in capture_bs if bs % mul_base == 0]
+
+    if server_args.cuda_graph_max_bs:
+        capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
+        if max(capture_bs) < server_args.cuda_graph_max_bs:
+            capture_bs += list(
+                range(max(capture_bs), server_args.cuda_graph_max_bs + 1, 16)
+            )
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+    capture_bs = list(sorted(set(capture_bs)))
+    assert len(capture_bs) > 0 and capture_bs[0] > 0, f"{capture_bs=}"
+    compile_bs = (
+        [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
+        if server_args.enable_torch_compile
+        else []
+    )
+    return capture_bs, compile_bs
+
+
+# Reuse this memory pool across all cuda graph runners.
+global_graph_memory_pool = None
+
+
+def get_global_graph_memory_pool():
+    return global_graph_memory_pool
+
+
+def set_global_graph_memory_pool(val):
+    global global_graph_memory_pool
+    global_graph_memory_pool = val
+
+
+class CudaGraphRunner:
+    """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""
+
+    def __init__(self, model_runner: ModelRunner):
+        # Parse args
+        self.model_runner = model_runner
+        self.device = model_runner.device
+        self.device_module = torch.get_device_module(self.device)
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
+        self.require_gathered_buffer = require_gathered_buffer(model_runner.server_args)
+        self.require_mlp_tp_gather = require_mlp_tp_gather(model_runner.server_args)
+        self.require_mlp_sync = require_mlp_sync(model_runner.server_args)
+        self.require_attn_tp_gather = require_attn_tp_gather(model_runner.server_args)
+        self.enable_two_batch_overlap = (
+            model_runner.server_args.enable_two_batch_overlap
+        )
+        self.speculative_algorithm = model_runner.server_args.speculative_algorithm
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
+        self.tp_size = model_runner.server_args.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.pp_size = model_runner.server_args.pp_size
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        # Batch sizes to capture
+        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
+        log_info_on_rank0(logger, f"Capture cuda graph bs {self.capture_bs}")
+        self.capture_forward_mode = ForwardMode.DECODE
+        self.capture_hidden_mode = CaptureHiddenMode.NULL
+        self.num_tokens_per_bs = 1
+        if (
+            model_runner.spec_algorithm.is_eagle()
+            or model_runner.spec_algorithm.is_standalone()
+        ):
+            if self.model_runner.is_draft_worker:
+                raise RuntimeError("This should not happen")
+            else:
+                self.capture_forward_mode = ForwardMode.TARGET_VERIFY
+                self.num_tokens_per_bs = (
+                    self.model_runner.server_args.speculative_num_draft_tokens
+                )
+
+        # If returning hidden states is enabled, set initial capture hidden mode to full to avoid double-capture on startup
+        if model_runner.server_args.enable_return_hidden_states:
+            self.capture_hidden_mode = CaptureHiddenMode.FULL
+
+        # Attention backend
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+        self.model_runner.attn_backend.init_cuda_graph_state(
+            self.max_bs, self.max_num_token
+        )
+        self.seq_len_fill_value = (
+            self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
+        )
+
+        # FIXME(lsyin): leave it here for now, I don't know whether it is necessary
+        self.encoder_len_fill_value = 0
+        self.seq_lens_cpu = torch.full(
+            (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+        )
+
+        if self.enable_torch_compile:
+            set_torch_compile_config()
+
+        if self.model_runner.server_args.enable_lora:
+            self.model_runner.lora_manager.init_cuda_graph_batch_info(self.max_bs)
+
+        # Graph inputs
+        with torch.device(self.device):
+            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.seq_lens = torch.full(
+                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+            )
+            self.out_cache_loc = torch.zeros(
+                (self.max_num_token,), dtype=self._cache_loc_dtype()
+            )
+            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.mrope_positions = torch.zeros(
+                (3, self.max_num_token), dtype=torch.int64
+            )
+            self.num_token_non_padded = torch.zeros((1,), dtype=torch.int32)
+            self.tbo_plugin = TboCudaGraphRunnerPlugin()
+
+            # pipeline parallelism
+            if self.pp_size > 1:
+                self.pp_proxy_tensors = {
+                    "hidden_states": torch.zeros(
+                        (self.max_bs, self.model_runner.model_config.hidden_size),
+                        dtype=torch.bfloat16,
+                    ),
+                    "residual": torch.zeros(
+                        (self.max_bs, self.model_runner.model_config.hidden_size),
+                        dtype=torch.bfloat16,
+                    ),
+                }
+
+            # Speculative_inference
+            if model_runner.spec_algorithm.is_eagle3():
+                self.model_runner.model.set_eagle3_layers_to_capture()
+
+            if self.is_encoder_decoder:
+                # NOTE: encoder_lens can influence the full_text_row_masked_out_mask tensor when doing mixed batch
+                self.encoder_lens = torch.full(
+                    (self.max_bs,), self.encoder_len_fill_value, dtype=torch.int32
+                )
+            else:
+                self.encoder_lens = None
+
+            if self.require_gathered_buffer:
+                if self.require_mlp_tp_gather:
+                    self.global_num_tokens_gpu = torch.zeros(
+                        (self.dp_size,), dtype=torch.int32
+                    )
+                    self.global_num_tokens_for_logprob_gpu = torch.zeros(
+                        (self.dp_size,), dtype=torch.int32
+                    )
+                else:
+                    assert self.require_attn_tp_gather
+                    self.global_num_tokens_gpu = torch.zeros((1,), dtype=torch.int32)
+                    self.global_num_tokens_for_logprob_gpu = torch.zeros(
+                        (1,), dtype=torch.int32
+                    )
+            else:
+                self.global_num_tokens_gpu = None
+                self.global_num_tokens_for_logprob_gpu = None
+
+            self.custom_mask = torch.ones(
+                (
+                    (self.seq_lens.sum().item() + self.max_num_token)
+                    * self.num_tokens_per_bs
+                ),
+                dtype=torch.bool,
+                device=self.device,
+            )
+            self.next_token_logits_buffer = torch.zeros(
+                (self.max_num_token, self.model_runner.model_config.vocab_size),
+                dtype=torch.float,
+                device=self.device,
+            )
+
+        # Capture
+        try:
+            with model_capture_mode():
+                self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
+            )
+
+    def _cache_loc_dtype(self):
+        return torch.int64
+
+    def can_run(self, forward_batch: ForwardBatch):
+        if self.require_mlp_tp_gather:
+            cuda_graph_bs = (
+                max(forward_batch.global_num_tokens_cpu) // self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                else max(forward_batch.global_num_tokens_cpu)
+            )
+        else:
+            cuda_graph_bs = forward_batch.batch_size
+
+        is_bs_supported = (
+            cuda_graph_bs in self.graphs
+            if self.disable_padding
+            else cuda_graph_bs <= self.max_bs
+        )
+
+        if self.require_mlp_sync:
+            is_bs_supported = is_bs_supported and forward_batch.can_run_dp_cuda_graph
+
+        # NOTE: cuda graph cannot handle mixed batch (encoder_len = 0)
+        # If mixed batch cannot be supported, then encoder_lens can be removed in cuda graph
+        # because the full_text_row_masked_out_mask tensor will always be ones
+        is_encoder_lens_supported = (
+            torch.all(forward_batch.encoder_lens > 0)
+            if self.is_encoder_decoder
+            else True
+        )
+
+        requested_capture_hidden_mode = max(
+            forward_batch.capture_hidden_mode,
+            (
+                forward_batch.spec_info.capture_hidden_mode
+                if getattr(forward_batch.spec_info, "capture_hidden_mode", None)
+                is not None
+                else CaptureHiddenMode.NULL
+            ),
+        )
+        capture_hidden_mode_matches = (
+            requested_capture_hidden_mode == CaptureHiddenMode.NULL
+            or requested_capture_hidden_mode == self.capture_hidden_mode
+        )
+        is_tbo_supported = (
+            forward_batch.can_run_tbo if self.enable_two_batch_overlap else True
+        )
+
+        return (
+            is_bs_supported
+            and is_encoder_lens_supported
+            and is_tbo_supported
+            and capture_hidden_mode_matches
+        )
+
+    def capture(self) -> None:
+        profile_context = empty_context()
+        if self.enable_profile_cuda_graph:
+            profile_context = profile(
+                activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                record_shapes=True,
+            )
+
+        # Trigger CUDA graph capture for specific shapes.
+        # Capture the large shapes first so that the smaller shapes
+        # can reuse the memory pool allocated for the large shapes.
+        with freeze_gc(
+            self.model_runner.server_args.enable_cudagraph_gc
+        ), graph_capture() as graph_capture_context:
+            with profile_context as prof:
+                self.stream = graph_capture_context.stream
+                avail_mem = get_available_gpu_memory(
+                    self.model_runner.device,
+                    self.model_runner.gpu_id,
+                    empty_cache=False,
+                )
+                # Reverse the order to enable better memory sharing across cuda graphs.
+                capture_range = (
+                    tqdm.tqdm(list(reversed(self.capture_bs)))
+                    if get_tensor_model_parallel_rank() == 0
+                    else reversed(self.capture_bs)
+                )
+                for i, bs in enumerate(capture_range):
+                    if get_tensor_model_parallel_rank() == 0:
+                        avail_mem = get_available_gpu_memory(
+                            self.model_runner.device,
+                            self.model_runner.gpu_id,
+                            empty_cache=False,
+                        )
+                        capture_range.set_description(
+                            f"Capturing batches ({bs=} {avail_mem=:.2f} GB)"
+                        )
+
+                    with patch_model(
+                        self.model_runner.model,
+                        bs in self.compile_bs,
+                        num_tokens=bs * self.num_tokens_per_bs,
+                        tp_group=self.model_runner.tp_group,
+                    ) as forward:
+                        (
+                            graph,
+                            output_buffers,
+                        ) = self.capture_one_batch_size(bs, forward)
+                        self.graphs[bs] = graph
+                        self.output_buffers[bs] = output_buffers
+
+                    # Save gemlite cache after each capture
+                    save_gemlite_cache()
+
+        if self.enable_profile_cuda_graph:
+            log_message = (
+                "Sorted by CUDA Time:\n"
+                + prof.key_averages(group_by_input_shape=True).table(
+                    sort_by="cuda_time_total", row_limit=10
+                )
+                + "\n\nSorted by CPU Time:\n"
+                + prof.key_averages(group_by_input_shape=True).table(
+                    sort_by="cpu_time_total", row_limit=10
+                )
+            )
+            logger.info(log_message)
+
+    def _capture_graph(self, graph, pool, stream, run_once_fn):
+        with self.device_module.graph(graph, pool=pool, stream=stream):
+            out = run_once_fn()
+        return out
+
+    def _create_device_graph(self):
+        return torch.cuda.CUDAGraph()
+
+    def capture_one_batch_size(self, bs: int, forward: Callable):
+        graph = self._create_device_graph()
+        stream = self.stream
+        num_tokens = bs * self.num_tokens_per_bs
+
+        # Graph inputs
+        input_ids = self.input_ids[:num_tokens]
+        req_pool_indices = self.req_pool_indices[:bs]
+        seq_lens = self.seq_lens[:bs]
+        out_cache_loc = self.out_cache_loc[:num_tokens]
+        positions = self.positions[:num_tokens]
+        if self.is_encoder_decoder:
+            encoder_lens = self.encoder_lens[:bs]
+        else:
+            encoder_lens = None
+        mrope_positions = self.mrope_positions[:, :num_tokens]
+        next_token_logits_buffer = self.next_token_logits_buffer[:num_tokens]
+        self.num_token_non_padded[...] = num_tokens
+
+        # pipeline parallelism
+        if self.pp_size > 1:
+            pp_proxy_tensors = PPProxyTensors(
+                {k: v[:num_tokens] for k, v in self.pp_proxy_tensors.items()}
+            )
+
+        if self.require_mlp_tp_gather:
+            self.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.dp_size,
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            self.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.dp_size,
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens * self.dp_size
+        elif self.require_attn_tp_gather:
+            self.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            self.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens
+        else:
+            global_dp_buffer_len = None
+
+        spec_info = self.get_spec_info(num_tokens)
+        if self.capture_hidden_mode != CaptureHiddenMode.FULL:
+            self.capture_hidden_mode = (
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            )
+
+        if self.model_runner.server_args.enable_lora:
+            # It is safe to capture CUDA graph using empty LoRA id, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA id is empty for perf optimization).
+            lora_ids = [None] * bs
+        else:
+            lora_ids = None
+
+        forward_batch = ForwardBatch(
+            forward_mode=self.capture_forward_mode,
+            batch_size=bs,
+            input_ids=input_ids,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
+            orig_seq_lens=seq_lens,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            attn_backend=self.model_runner.attn_backend,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum().item(),
+            encoder_lens=encoder_lens,
+            return_logprob=False,
+            positions=positions,
+            global_num_tokens_gpu=self.global_num_tokens_gpu,
+            global_num_tokens_for_logprob_gpu=self.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=DpPaddingMode.get_default_mode_in_cuda_graph(),
+            global_dp_buffer_len=global_dp_buffer_len,
+            mrope_positions=mrope_positions,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=self.capture_hidden_mode,
+            num_token_non_padded=self.num_token_non_padded,
+            global_forward_mode=self.capture_forward_mode,
+            lora_ids=lora_ids,
+        )
+        self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)
+
+        if lora_ids is not None:
+            self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
+
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph(
+            bs,
+            num_tokens,
+            req_pool_indices,
+            seq_lens,
+            encoder_lens,
+            forward_batch.forward_mode,
+            forward_batch.spec_info,
+        )
+
+        # Run and capture
+        def run_once():
+            # Clean intermediate result cache for DP attention
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            set_dp_buffer_len(global_dp_buffer_len, num_tokens)
+
+            kwargs = {}
+            if (
+                self.pp_size > 1
+                and "pp_proxy_tensors" in inspect.signature(forward).parameters
+            ):
+                kwargs["pp_proxy_tensors"] = PPProxyTensors(
+                    {k: v.clone() for k, v in pp_proxy_tensors.tensors.items()}
+                )
+
+            logits_output_or_pp_proxy_tensors = forward(
+                input_ids,
+                forward_batch.positions,
+                forward_batch,
+                **kwargs,
+            )
+            return logits_output_or_pp_proxy_tensors
+
+        for _ in range(2):
+            self.device_module.synchronize()
+            self.model_runner.tp_group.barrier()
+            run_once()
+
+        if get_global_graph_memory_pool() is None:
+            set_global_graph_memory_pool(self.device_module.graph_pool_handle())
+        # Set graph pool id globally to be able to use symmetric memory
+        set_graph_pool_id(get_global_graph_memory_pool())
+        out = self._capture_graph(
+            graph, get_global_graph_memory_pool(), stream, run_once
+        )
+
+        return graph, out
+
+    def recapture_if_needed(self, forward_batch: ForwardBatch):
+
+        # If the required capture_hidden_mode changes, we need to recapture the graph
+
+        # These are the different factors that can influence the capture_hidden_mode
+        capture_hidden_mode_required_by_forward_batch = (
+            forward_batch.capture_hidden_mode
+        )
+        capture_hidden_mode_required_by_spec_info = getattr(
+            forward_batch.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL
+        )
+        capture_hidden_mode_required_for_returning_hidden_states = (
+            CaptureHiddenMode.FULL
+            if self.model_runner.server_args.enable_return_hidden_states
+            else CaptureHiddenMode.NULL
+        )
+
+        # Determine the highest capture_hidden_mode required
+        # (If we have FULL, we can emulate LAST or NULL)
+        # (If we have LAST, we can emulate NULL)
+        required_capture_hidden_mode = max(
+            capture_hidden_mode_required_by_forward_batch,
+            capture_hidden_mode_required_by_spec_info,
+            capture_hidden_mode_required_for_returning_hidden_states,
+        )
+
+        # If the current hidden mode is no longer aligned with the required hidden mode, we need to set it to what is required and re-capture
+        if self.capture_hidden_mode != required_capture_hidden_mode:
+            self.capture_hidden_mode = required_capture_hidden_mode
+            self.capture()
+
+    def replay_prepare(
+        self,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        self.recapture_if_needed(forward_batch)
+
+        raw_bs = forward_batch.batch_size
+        raw_num_token = raw_bs * self.num_tokens_per_bs
+
+        # Pad
+        if self.require_mlp_tp_gather:
+            max_num_tokens = max(forward_batch.global_num_tokens_cpu)
+            max_batch_size = (
+                max_num_tokens / self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                else max_num_tokens
+            )
+            index = bisect.bisect_left(self.capture_bs, max_batch_size)
+        else:
+            index = bisect.bisect_left(self.capture_bs, raw_bs)
+        bs = self.capture_bs[index]
+        if bs != raw_bs:
+            self.seq_lens.fill_(self.seq_len_fill_value)
+            self.out_cache_loc.zero_()
+
+        # Common inputs
+        self.input_ids[:raw_num_token].copy_(forward_batch.input_ids)
+        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
+        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
+        self.out_cache_loc[:raw_num_token].copy_(forward_batch.out_cache_loc)
+        self.positions[:raw_num_token].copy_(forward_batch.positions)
+
+        seq_lens_cpu = None
+        if forward_batch.seq_lens_cpu is not None:
+            if bs != raw_bs:
+                self.seq_lens_cpu.fill_(self.seq_len_fill_value)
+            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
+            seq_lens_cpu = self.seq_lens_cpu[:bs]
+
+        if pp_proxy_tensors:
+            for key in self.pp_proxy_tensors.keys():
+                dim = pp_proxy_tensors[key].shape[0]
+                self.pp_proxy_tensors[key][:dim].copy_(pp_proxy_tensors[key])
+
+        if self.is_encoder_decoder:
+            self.encoder_lens[:raw_bs].copy_(forward_batch.encoder_lens)
+        if forward_batch.mrope_positions is not None:
+            self.mrope_positions[:, :raw_num_token].copy_(forward_batch.mrope_positions)
+        if self.require_gathered_buffer:
+            self.global_num_tokens_gpu.fill_(bs * self.num_tokens_per_bs)
+            self.global_num_tokens_for_logprob_gpu.fill_(bs * self.num_tokens_per_bs)
+        if enable_num_token_non_padded(self.model_runner.server_args):
+            num_token_non_padded = forward_batch.num_token_non_padded
+            if self.require_gathered_buffer:
+                tokens_per_rank = bs // self.attn_tp_size * self.num_tokens_per_bs
+                num_local_token_non_padded = torch.clamp(
+                    num_token_non_padded - tokens_per_rank * self.attn_tp_rank,
+                    min=0,
+                    max=tokens_per_rank,
+                )
+                self.num_token_non_padded.copy_(num_local_token_non_padded)
+            else:
+                self.num_token_non_padded.copy_(num_token_non_padded)
+        if self.enable_two_batch_overlap:
+            self.tbo_plugin.replay_prepare(
+                forward_mode=self.capture_forward_mode,
+                bs=bs,
+                num_token_non_padded=len(forward_batch.input_ids),
+                spec_info=forward_batch.spec_info,
+            )
+        if forward_batch.forward_mode.is_idle() and forward_batch.spec_info is not None:
+            forward_batch.spec_info.custom_mask = self.custom_mask
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph(
+            bs,
+            self.req_pool_indices[:bs],
+            self.seq_lens[:bs],
+            forward_batch.seq_lens_sum + (bs - raw_bs) * self.seq_len_fill_value,
+            self.encoder_lens[:bs] if self.is_encoder_decoder else None,
+            self.capture_forward_mode,
+            forward_batch.spec_info,
+            seq_lens_cpu=seq_lens_cpu,
+        )
+
+        # Store fields
+        self.raw_bs = raw_bs
+        self.raw_num_token = raw_num_token
+        self.bs = bs
+
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        if not skip_attn_backend_init:
+            self.replay_prepare(forward_batch, pp_proxy_tensors)
+        else:
+            # In speculative decoding, these two fields are still needed.
+            self.input_ids[: self.raw_num_token].copy_(forward_batch.input_ids)
+            self.positions[: self.raw_num_token].copy_(forward_batch.positions)
+
+        # Replay
+        self.graphs[self.bs].replay()
+
+        output = self.output_buffers[self.bs]
+        if isinstance(output, LogitsProcessorOutput):
+            return LogitsProcessorOutput(
+                next_token_logits=output.next_token_logits[: self.raw_num_token],
+                hidden_states=(
+                    output.hidden_states[: self.raw_num_token]
+                    if output.hidden_states is not None
+                    else None
+                ),
+            )
+        else:
+            assert isinstance(output, PPProxyTensors)
+            return PPProxyTensors({k: v[: self.bs] for k, v in output.tensors.items()})
+
+    def get_spec_info(self, num_tokens: int):
+        spec_info = None
+        if (
+            self.model_runner.spec_algorithm.is_eagle()
+            or self.model_runner.spec_algorithm.is_standalone()
+        ):
+            from sglang.srt.speculative.eagle_utils import EagleVerifyInput
+
+            if self.model_runner.is_draft_worker:
+                raise RuntimeError("This should not happen.")
+            else:
+                spec_info = EagleVerifyInput(
+                    draft_token=None,
+                    custom_mask=self.custom_mask,
+                    positions=None,
+                    retrive_index=None,
+                    retrive_next_token=None,
+                    retrive_next_sibling=None,
+                    retrive_cum_len=None,
+                    spec_steps=self.model_runner.server_args.speculative_num_steps,
+                    topk=self.model_runner.server_args.speculative_eagle_topk,
+                    draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
+                    capture_hidden_mode=CaptureHiddenMode.FULL,
+                    seq_lens_sum=None,
+                    seq_lens_cpu=None,
+                )
+
+        return spec_info
+
+
+CUDA_GRAPH_CAPTURE_FAILED_MSG = (
+    "Possible solutions:\n"
+    "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+    "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+    "3. disable torch compile by not using --enable-torch-compile\n"
+    "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
+    "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+)
diff --git a/python/sglang/srt/model_executor/forward_batch_info.py b/python/sglang/srt/model_executor/forward_batch_info.py
new file mode 100644
index 000000000..e343d6b4f
--- /dev/null
+++ b/python/sglang/srt/model_executor/forward_batch_info.py
@@ -0,0 +1,1094 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Store information about a forward batch.
+
+The following is the flow of data structures for a batch:
+
+ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
+
+- ScheduleBatch is managed by `scheduler.py::Scheduler`.
+  It contains high-level scheduling data. Most of the data is on the CPU.
+- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
+  It will be transformed from CPU scheduler to GPU model runner.
+- ForwardBatch is managed by `model_runner.py::ModelRunner`.
+  It contains low-level tensor data. Most of the data consists of GPU tensors.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import IntEnum, auto
+from functools import total_ordering
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    get_attention_dp_rank,
+    get_attention_tp_size,
+    set_dp_buffer_len,
+)
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+from sglang.srt.utils import (
+    flatten_nested_list,
+    get_compiler_backend,
+    is_npu,
+    support_triton,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+    from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+    from sglang.srt.managers.schedule_batch import ModelWorkerBatch, MultimodalInputs
+    from sglang.srt.mem_cache.memory_pool import KVCache, ReqToTokenPool
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+
+_is_npu = is_npu()
+
+
+class ForwardMode(IntEnum):
+    # Extend a sequence. The KV cache of the beginning part of the sequence is already computed (e.g., system prompt).
+    # It is also called "prefill" in common terminology.
+    EXTEND = auto()
+    # Decode one token.
+    DECODE = auto()
+    # Contains both EXTEND and DECODE when doing chunked prefill.
+    MIXED = auto()
+    # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
+    IDLE = auto()
+
+    # Used in speculative decoding: verify a batch in the target model.
+    TARGET_VERIFY = auto()
+    # Used in speculative decoding: extend a batch in the draft model.
+    DRAFT_EXTEND = auto()
+
+    # A dummy first batch to start the pipeline for overlap scheduler.
+    # It is now used for triggering the sampling_info_done event for the first prefill batch.
+    DUMMY_FIRST = auto()
+
+    # Split Prefill for PD multiplexing
+    SPLIT_PREFILL = auto()
+
+    def is_prefill(self):
+        return self.is_extend()
+
+    def is_extend(self):
+        return (
+            self == ForwardMode.EXTEND
+            or self == ForwardMode.MIXED
+            or self == ForwardMode.DRAFT_EXTEND
+            or self == ForwardMode.TARGET_VERIFY
+        )
+
+    def is_decode(self):
+        return self == ForwardMode.DECODE
+
+    def is_mixed(self):
+        return self == ForwardMode.MIXED
+
+    def is_idle(self):
+        return self == ForwardMode.IDLE
+
+    def is_decode_or_idle(self):
+        return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+
+    def is_target_verify(self):
+        return self == ForwardMode.TARGET_VERIFY
+
+    def is_draft_extend(self):
+        return self == ForwardMode.DRAFT_EXTEND
+
+    def is_extend_or_draft_extend_or_mixed(self):
+        return (
+            self == ForwardMode.EXTEND
+            or self == ForwardMode.DRAFT_EXTEND
+            or self == ForwardMode.MIXED
+        )
+
+    def is_cuda_graph(self):
+        return (
+            self == ForwardMode.DECODE
+            or self == ForwardMode.TARGET_VERIFY
+            or self == ForwardMode.IDLE
+        )
+
+    def is_cpu_graph(self):
+        return self == ForwardMode.DECODE
+
+    def is_dummy_first(self):
+        return self == ForwardMode.DUMMY_FIRST
+
+    def is_split_prefill(self):
+        return self == ForwardMode.SPLIT_PREFILL
+
+
+@total_ordering
+class CaptureHiddenMode(IntEnum):
+    # Do not capture anything.
+    NULL = 0
+    # Capture a hidden state of the last token.
+    LAST = 1
+    # Capture hidden states of all tokens.
+    FULL = 2
+
+    def need_capture(self):
+        return self != CaptureHiddenMode.NULL
+
+    def is_full(self):
+        return self == CaptureHiddenMode.FULL
+
+    def is_last(self):
+        return self == CaptureHiddenMode.LAST
+
+    def __lt__(self, other):
+        return self.value < other.value
+
+
+@dataclass
+class ForwardBatch:
+    """Store all inputs of a forward pass."""
+
+    # The forward mode
+    forward_mode: ForwardMode
+    # The batch size
+    batch_size: int
+    # The input ids
+    input_ids: torch.Tensor
+    # The indices of requests in the req_to_token_pool
+    req_pool_indices: torch.Tensor
+    # The sequence length
+    seq_lens: torch.Tensor
+    # The indices of output tokens in the token_to_kv_pool
+    out_cache_loc: torch.Tensor
+
+    # The sum of all sequence lengths
+    seq_lens_sum: int
+
+    # The original sequence length without being chunked. Qwen-1M related.
+    orig_seq_lens: Optional[torch.Tensor] = None
+
+    # Optional seq_lens on cpu
+    seq_lens_cpu: Optional[torch.Tensor] = None
+
+    # For logprob
+    return_logprob: bool = False
+    top_logprobs_nums: Optional[List[int]] = None
+    token_ids_logprobs: Optional[List[List[int]]] = None
+
+    # For logits and logprobs post processing
+    next_token_logits_buffer: torch.Tensor = None
+    temp_scaled_logprobs: bool = False
+    temperature: torch.Tensor = None
+    top_p_normalized_logprobs: bool = False
+    top_p: torch.Tensor = None
+
+    # Position information
+    positions: torch.Tensor = None
+
+    # For extend
+    extend_num_tokens: Optional[int] = None
+    extend_seq_lens: Optional[torch.Tensor] = None
+    extend_prefix_lens: Optional[torch.Tensor] = None
+    extend_start_loc: Optional[torch.Tensor] = None
+    extend_prefix_lens_cpu: Optional[List[int]] = None
+    extend_seq_lens_cpu: Optional[List[int]] = None
+    extend_logprob_start_lens_cpu: Optional[List[int]] = None
+    extend_input_logprob_token_ids_gpu: Optional[torch.Tensor] = None
+
+    # For split prefill
+    # intermediate values for split prefill
+    hidden_states: torch.Tensor = None
+    residual: torch.Tensor = None
+    model_specific_states: Dict[str, any] = None
+    # current split index of layer
+    split_index: int = 0
+
+    # For MLA chunked prefix cache used in chunked prefill
+    # Tell attention backend whether the kv cache needs to be attended in current pass
+    attn_attend_prefix_cache: Optional[bool] = None
+    # Number of prefix cache chunks
+    num_prefix_chunks: Optional[int] = None
+    # Index of current chunk, used by attention backend
+    prefix_chunk_idx: Optional[int] = None
+    # Maximum number of tokens in each chunk per sequence. Computed from maximum chunk capacity
+    prefix_chunk_len: Optional[int] = None
+    # Start positions of prefix cache for each chunk, (num_prefix_chunks, batch_size)
+    prefix_chunk_starts: Optional[torch.Tensor] = None
+    # Lengths of prefix cache for each chunk, (num_prefix_chunks, batch_size)
+    prefix_chunk_seq_lens: Optional[torch.Tensor] = None
+    # Accumulated lengths of prefix cache for each chunk, (num_prefix_chunks, batch_size + 1)
+    prefix_chunk_cu_seq_lens: Optional[torch.Tensor] = None
+    # Max lengths of prefix cache for each chunk, (num_prefix_chunks,)
+    prefix_chunk_max_seq_lens: Optional[List[int]] = None
+    # Number of tokens in each prefix cache chunk, (num_prefix_chunks,)
+    prefix_chunk_num_tokens: Optional[List[int]] = None
+    # KV Indices for each chunk
+    prefix_chunk_kv_indices: Optional[List[torch.Tensor]] = None
+    # For MLA chunked prefix cache used in chunked prefill
+    # Tell attention backend whether lse needs to be returned
+    mha_return_lse: Optional[bool] = None
+
+    # For multimodal
+    mm_inputs: Optional[List[MultimodalInputs]] = None
+
+    # Encoder-decoder
+    encoder_cached: Optional[List[bool]] = None
+    encoder_lens: Optional[torch.Tensor] = None
+    encoder_lens_cpu: Optional[List[int]] = None
+    encoder_out_cache_loc: Optional[torch.Tensor] = None
+
+    # For LoRA
+    lora_ids: Optional[List[str]] = None
+
+    # For input embeddings
+    input_embeds: Optional[torch.Tensor] = None
+
+    # For cross-encoder model
+    token_type_ids: Optional[torch.Tensor] = None
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo = None
+
+    # Attention backend
+    req_to_token_pool: ReqToTokenPool = None
+    token_to_kv_pool: KVCache = None
+    attn_backend: AttentionBackend = None
+
+    # For DP attention
+    global_num_tokens_cpu: Optional[List[int]] = None
+    global_num_tokens_gpu: Optional[torch.Tensor] = None
+    # Has to be None when cuda graph is captured.
+    global_num_tokens_for_logprob_cpu: Optional[List[int]] = None
+    global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
+    # The padding mode for DP attention
+    dp_padding_mode: Optional[DpPaddingMode] = None
+    # for extend, local start pos and num tokens is different in logits processor
+    # this will be computed in get_dp_local_info
+    # this will be recomputed in LogitsMetadata.from_forward_batch
+    dp_local_start_pos: Optional[torch.Tensor] = None  # cached info at runtime
+    dp_local_num_tokens: Optional[torch.Tensor] = None  # cached info at runtime
+    global_dp_buffer_len: Optional[int] = None
+    is_extend_in_batch: bool = False
+    can_run_dp_cuda_graph: bool = False
+    global_forward_mode: Optional[ForwardMode] = None
+
+    # Speculative decoding
+    spec_info: Optional[Union[EagleVerifyInput, EagleDraftInput]] = None
+    spec_algorithm: SpeculativeAlgorithm = None
+    capture_hidden_mode: CaptureHiddenMode = None
+
+    # For padding
+    padded_static_len: int = -1  # -1 if not padded
+    num_token_non_padded: Optional[torch.Tensor] = None  # scalar tensor
+
+    # For Qwen2-VL
+    mrope_positions: torch.Tensor = None
+
+    # For two-batch overlap
+    tbo_split_seq_index: Optional[int] = None
+    tbo_parent_token_range: Optional[Tuple[int, int]] = None
+    tbo_children: Optional[List[ForwardBatch]] = None
+
+    @classmethod
+    def init_new(
+        cls,
+        batch: ModelWorkerBatch,
+        model_runner: ModelRunner,
+    ):
+        from sglang.srt.two_batch_overlap import TboForwardBatchPreparer
+
+        ret = cls(
+            forward_mode=batch.forward_mode,
+            batch_size=len(batch.seq_lens),
+            input_ids=batch.input_ids,
+            req_pool_indices=batch.req_pool_indices,
+            seq_lens=batch.seq_lens,
+            out_cache_loc=batch.out_cache_loc,
+            mm_inputs=batch.multimodal_inputs,
+            encoder_cached=batch.encoder_cached,
+            encoder_lens=batch.encoder_lens,
+            encoder_lens_cpu=batch.encoder_lens_cpu,
+            encoder_out_cache_loc=batch.encoder_out_cache_loc,
+            seq_lens_sum=batch.seq_lens_sum,
+            seq_lens_cpu=batch.seq_lens_cpu,
+            orig_seq_lens=batch.orig_seq_lens,
+            return_logprob=batch.return_logprob,
+            top_logprobs_nums=batch.top_logprobs_nums,
+            token_ids_logprobs=batch.token_ids_logprobs,
+            is_extend_in_batch=batch.is_extend_in_batch,
+            can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
+            global_forward_mode=batch.global_forward_mode,
+            lora_ids=batch.lora_ids,
+            sampling_info=batch.sampling_info,
+            req_to_token_pool=model_runner.req_to_token_pool,
+            token_to_kv_pool=model_runner.token_to_kv_pool,
+            attn_backend=model_runner.attn_backend,
+            spec_algorithm=batch.spec_algorithm,
+            spec_info=batch.spec_info,
+            capture_hidden_mode=batch.capture_hidden_mode,
+            input_embeds=batch.input_embeds,
+            token_type_ids=batch.token_type_ids,
+            tbo_split_seq_index=batch.tbo_split_seq_index,
+        )
+        device = model_runner.device
+
+        if batch.extend_input_logprob_token_ids is not None:
+            ret.extend_input_logprob_token_ids_gpu = (
+                batch.extend_input_logprob_token_ids.to(device, non_blocking=True)
+            )
+
+        if enable_num_token_non_padded(model_runner.server_args):
+            ret.num_token_non_padded = torch.tensor(
+                len(batch.input_ids), dtype=torch.int32
+            ).to(device, non_blocking=True)
+
+        # For MLP sync
+        if batch.global_num_tokens is not None:
+            from sglang.srt.speculative.eagle_utils import (
+                EagleDraftInput,
+                EagleVerifyInput,
+            )
+
+            assert batch.global_num_tokens_for_logprob is not None
+            # process global_num_tokens and global_num_tokens_for_logprob
+            if batch.spec_info is not None:
+                if isinstance(batch.spec_info, EagleDraftInput):
+                    global_num_tokens = [
+                        x * batch.spec_info.num_tokens_per_batch
+                        for x in batch.global_num_tokens
+                    ]
+                    global_num_tokens_for_logprob = [
+                        x * batch.spec_info.num_tokens_for_logprob_per_batch
+                        for x in batch.global_num_tokens_for_logprob
+                    ]
+                else:
+                    assert isinstance(batch.spec_info, EagleVerifyInput)
+                    global_num_tokens = [
+                        x * batch.spec_info.draft_token_num
+                        for x in batch.global_num_tokens
+                    ]
+                    global_num_tokens_for_logprob = [
+                        x * batch.spec_info.draft_token_num
+                        for x in batch.global_num_tokens_for_logprob
+                    ]
+            else:
+                global_num_tokens = batch.global_num_tokens
+                global_num_tokens_for_logprob = batch.global_num_tokens_for_logprob
+
+            ret.global_num_tokens_cpu = global_num_tokens
+            ret.global_num_tokens_gpu = torch.tensor(
+                global_num_tokens, dtype=torch.int64
+            ).to(device, non_blocking=True)
+
+            ret.global_num_tokens_for_logprob_cpu = global_num_tokens_for_logprob
+            ret.global_num_tokens_for_logprob_gpu = torch.tensor(
+                global_num_tokens_for_logprob, dtype=torch.int64
+            ).to(device, non_blocking=True)
+
+        if ret.forward_mode.is_idle():
+            ret.positions = torch.empty((0,), dtype=torch.int64, device=device)
+            TboForwardBatchPreparer.prepare(
+                ret, is_draft_worker=model_runner.is_draft_worker
+            )
+            return ret
+
+        # Override the positions with spec_info
+        if (
+            ret.spec_info is not None
+            and getattr(ret.spec_info, "positions", None) is not None
+        ):
+            ret.positions = ret.spec_info.positions
+
+        # Init position information
+        if ret.forward_mode.is_decode():
+            if ret.positions is None:
+                ret.positions = clamp_position(batch.seq_lens)
+        else:
+            ret.extend_seq_lens = torch.tensor(
+                batch.extend_seq_lens, dtype=torch.int32
+            ).to(device, non_blocking=True)
+            ret.extend_prefix_lens = torch.tensor(
+                batch.extend_prefix_lens, dtype=torch.int32
+            ).to(device, non_blocking=True)
+            ret.extend_num_tokens = batch.extend_num_tokens
+            positions, ret.extend_start_loc = compute_position(
+                model_runner.server_args.attention_backend,
+                ret.extend_prefix_lens,
+                ret.extend_seq_lens,
+                ret.extend_num_tokens,
+            )
+            if ret.positions is None:
+                ret.positions = positions
+            ret.extend_prefix_lens_cpu = batch.extend_prefix_lens
+            ret.extend_seq_lens_cpu = batch.extend_seq_lens
+            ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens
+
+        if model_runner.model_is_mrope:
+            if (
+                ret.spec_info is not None
+                and getattr(ret.spec_info, "positions", None) is not None
+            ):
+                ret._compute_spec_mrope_positions(model_runner, batch)
+            else:
+                ret._compute_mrope_positions(model_runner, batch)
+
+        # Init lora information
+        if model_runner.server_args.enable_lora:
+            model_runner.lora_manager.prepare_lora_batch(ret)
+
+        TboForwardBatchPreparer.prepare(
+            ret, is_draft_worker=model_runner.is_draft_worker
+        )
+
+        return ret
+
+    def merge_mm_inputs(self) -> Optional[MultimodalInputs]:
+        """
+        Merge all multimodal inputs in the batch into a single MultiModalInputs object.
+
+        Returns:
+            if none, current batch contains no multimodal input
+
+        """
+        if not self.mm_inputs or all(x is None for x in self.mm_inputs):
+            return None
+        # Filter out None values
+        valid_inputs = [x for x in self.mm_inputs if x is not None]
+
+        # TODO: is it expensive?
+        # a workaround to avoid importing `MultimodalInputs`
+        merged = valid_inputs[0].__class__(mm_items=[])
+
+        # Merge remaining inputs
+        for mm_input in valid_inputs:
+            merged.merge(mm_input)
+
+        return merged
+
+    def contains_image_inputs(self) -> bool:
+        if self.mm_inputs is None:
+            return False
+        return any(
+            mm_input is not None and mm_input.contains_image_inputs()
+            for mm_input in self.mm_inputs
+        )
+
+    def contains_audio_inputs(self) -> bool:
+        if self.mm_inputs is None:
+            return False
+        return any(
+            mm_input is not None and mm_input.contains_audio_inputs()
+            for mm_input in self.mm_inputs
+        )
+
+    def contains_video_inputs(self) -> bool:
+        if self.mm_inputs is None:
+            return False
+        return any(
+            mm_input is not None and mm_input.contains_video_inputs()
+            for mm_input in self.mm_inputs
+        )
+
+    def contains_mm_inputs(self) -> bool:
+        return (
+            self.contains_audio_inputs()
+            or self.contains_video_inputs()
+            or self.contains_image_inputs()
+        )
+
+    def _compute_spec_mrope_positions(
+        self, model_runner: ModelRunner, batch: ModelWorkerBatch
+    ):
+        # TODO support batched deltas
+        batch_size = self.seq_lens.shape[0]
+        device = model_runner.device
+        mm_inputs = batch.multimodal_inputs
+
+        if batch.forward_mode.is_draft_extend():  # draft_extend_after_decode
+            mrope_deltas = []
+            extend_lens = []
+            for batch_idx in range(batch_size):
+                extend_seq_len = batch.extend_seq_lens[batch_idx]
+                extend_lens.append(extend_seq_len)
+                mrope_delta = (
+                    torch.zeros(1, dtype=torch.int64)
+                    if mm_inputs[batch_idx] is None
+                    else mm_inputs[batch_idx].mrope_position_delta.squeeze(0)
+                )
+                mrope_deltas.append(mrope_delta.to(device=device))
+            position_chunks = torch.split(batch.spec_info.positions, extend_lens)
+            mrope_positions_list = [
+                pos_chunk + delta
+                for pos_chunk, delta in zip(position_chunks, mrope_deltas)
+            ]
+            next_input_positions = (
+                torch.cat(mrope_positions_list, dim=0).unsqueeze(0).repeat(3, 1)
+            )
+
+        else:  # target_verify or draft_decode
+            seq_positions = batch.spec_info.positions.view(batch_size, -1)
+            mrope_deltas = [
+                (
+                    torch.tensor([0], dtype=torch.int64)
+                    if mm_inputs[i] is None
+                    else mm_inputs[i].mrope_position_delta.squeeze(0)
+                )
+                for i in range(batch_size)
+            ]
+            mrope_delta_tensor = torch.stack(mrope_deltas, dim=0).to(device=device)
+            next_input_positions = (
+                (seq_positions + mrope_delta_tensor).flatten().unsqueeze(0).repeat(3, 1)
+            )
+
+        self.mrope_positions = next_input_positions
+
+    def _compute_mrope_positions(
+        self, model_runner: ModelRunner, batch: ModelWorkerBatch
+    ):
+        # batch_size * [3 * seq_len]
+        batch_size = self.seq_lens.shape[0]
+        mrope_positions_list = [[]] * batch_size
+        for batch_idx in range(batch_size):
+            mm_input = batch.multimodal_inputs[batch_idx]
+            if self.forward_mode.is_decode():
+                # 3 * N
+                if mm_input is None:
+                    mrope_positions_list[batch_idx] = torch.full(
+                        (3, 1),
+                        self.seq_lens[batch_idx] - 1,
+                        dtype=torch.int64,
+                        device=model_runner.device,
+                    )
+                else:
+                    mrope_position_deltas = mm_input.mrope_position_delta.flatten().to(
+                        model_runner.device, non_blocking=True
+                    )
+                    mrope_positions_list[batch_idx] = (
+                        (mrope_position_deltas + self.seq_lens[batch_idx] - 1)
+                        .unsqueeze(0)
+                        .repeat(3, 1)
+                    )
+            elif self.forward_mode.is_extend():
+                extend_seq_len, extend_prefix_len = (
+                    batch.extend_seq_lens[batch_idx],
+                    batch.extend_prefix_lens[batch_idx],
+                )
+                if mm_input is None:
+                    # text only
+                    mrope_positions = torch.tensor(
+                        [
+                            [
+                                pos
+                                for pos in range(
+                                    extend_prefix_len,
+                                    extend_prefix_len + extend_seq_len,
+                                )
+                            ]
+                        ]
+                        * 3
+                    )
+                else:
+                    mrope_positions = mm_input.mrope_positions[
+                        :,
+                        extend_prefix_len : extend_prefix_len + extend_seq_len,
+                    ]
+                mrope_positions_list[batch_idx] = mrope_positions
+
+        self.mrope_positions = torch.cat(
+            [pos.to(device=model_runner.device) for pos in mrope_positions_list],
+            dim=1,
+        ).to(dtype=torch.int64, device=model_runner.device)
+
+    def get_max_chunk_capacity(self):
+        # Maximum number of tokens in each chunk
+        # TODO: Should be changed to a better value, maybe passed through server args
+        return 128 * 1024
+
+    def set_prefix_chunk_idx(self, idx: int):
+        self.prefix_chunk_idx = idx
+
+    def set_attn_attend_prefix_cache(self, attn_attend_prefix_cache: bool):
+        self.attn_attend_prefix_cache = attn_attend_prefix_cache
+
+    def prepare_chunked_kv_indices(self, device: torch.device):
+        self.prefix_chunk_kv_indices = []
+        for idx in range(self.num_prefix_chunks):
+            chunk_starts = self.prefix_chunk_starts[idx]
+            chunk_seq_lens = self.prefix_chunk_seq_lens[idx]
+            chunk_cu_seq_lens = self.prefix_chunk_cu_seq_lens[idx]
+            num_chunk_tokens = self.prefix_chunk_num_tokens[idx]
+
+            chunk_kv_indices = torch.empty(
+                num_chunk_tokens, dtype=torch.int32, device=device
+            )
+
+            create_chunked_prefix_cache_kv_indices[(self.batch_size,)](
+                self.req_to_token_pool.req_to_token,
+                self.req_pool_indices,
+                chunk_starts,
+                chunk_seq_lens,
+                chunk_cu_seq_lens,
+                chunk_kv_indices,
+                self.req_to_token_pool.req_to_token.shape[1],
+            )
+            self.prefix_chunk_kv_indices.append(chunk_kv_indices)
+
+    def _pad_tensor_to_size(self, tensor: torch.Tensor, size: int, *, value: int = 0):
+        if value == 0:
+            return torch.cat(
+                [tensor, tensor.new_zeros(size - tensor.shape[0], *tensor.shape[1:])],
+                dim=0,
+            )
+        else:
+            return torch.cat(
+                [
+                    tensor,
+                    tensor.new_full((size - tensor.shape[0], *tensor.shape[1:]), value),
+                ],
+                dim=0,
+            )
+
+    def prepare_mlp_sync_batch(self, model_runner: ModelRunner):
+
+        from sglang.srt.speculative.eagle_utils import EagleDraftInput
+
+        assert self.global_num_tokens_cpu is not None
+        assert self.global_num_tokens_for_logprob_cpu is not None
+
+        global_num_tokens = self.global_num_tokens_cpu
+        sync_group_size = len(global_num_tokens)
+        attn_tp_size = get_attention_tp_size()
+
+        for i in range(sync_group_size):
+            # make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
+            # there is no reduce-scatter in LM logprob, so we do not need to adjust the padded length for logprob
+            global_num_tokens[i] = (
+                (global_num_tokens[i] - 1) // attn_tp_size + 1
+            ) * attn_tp_size
+
+        dp_padding_mode = DpPaddingMode.get_dp_padding_mode(global_num_tokens)
+        self.dp_padding_mode = dp_padding_mode
+
+        if dp_padding_mode.is_max_len():
+            # when DP gather mode is all gather, we will use
+            # all_gather_into_tensor to gather hidden states, where transferred
+            # tokens should be padded to the same length. We will also use
+            # reduce-scatter instead of all-reduce after MLP.
+            max_num_tokens = max(global_num_tokens)
+            global_num_tokens = [max_num_tokens] * sync_group_size
+            buffer_len = max_num_tokens * sync_group_size
+        else:
+            buffer_len = sum(global_num_tokens)
+
+        if len(global_num_tokens) > 1:
+            num_tokens = global_num_tokens[get_attention_dp_rank()]
+        else:
+            num_tokens = global_num_tokens[0]
+
+        self.global_dp_buffer_len = buffer_len
+        set_dp_buffer_len(buffer_len, num_tokens, global_num_tokens)
+
+        bs = self.batch_size
+
+        if self.forward_mode.is_decode():
+            if self.is_extend_in_batch and dp_padding_mode.is_max_len():
+                setattr(self, "_original_forward_mode", self.forward_mode)
+                self.forward_mode = ForwardMode.EXTEND
+                self.extend_num_tokens = bs
+                self.extend_seq_lens = torch.full_like(self.seq_lens, 1)
+                self.extend_prefix_lens = self.seq_lens - 1
+                self.extend_start_loc = torch.arange(
+                    bs, dtype=torch.int32, device=self.seq_lens.device
+                )
+                self.extend_prefix_lens_cpu = self.extend_prefix_lens.cpu()
+                self.extend_seq_lens_cpu = self.extend_seq_lens.cpu()
+                self.extend_logprob_start_lens_cpu = self.extend_prefix_lens_cpu
+            else:
+                setattr(self, "_original_batch_size", self.batch_size)
+                if self.spec_info is not None:
+                    bs = self.batch_size = (
+                        num_tokens // self.spec_info.num_tokens_per_batch
+                    )
+                else:
+                    bs = self.batch_size = num_tokens
+
+        # padding
+        self.input_ids = self._pad_tensor_to_size(self.input_ids, num_tokens)
+        self.req_pool_indices = self._pad_tensor_to_size(self.req_pool_indices, bs)
+
+        seq_len_fill_value = (
+            model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
+        )
+        self.seq_lens_sum = self.seq_lens_sum + seq_len_fill_value * (
+            bs - self.seq_lens.shape[0]
+        )
+        self.seq_lens = self._pad_tensor_to_size(
+            self.seq_lens, bs, value=seq_len_fill_value
+        )
+        if self.seq_lens_cpu is not None:
+            self.seq_lens_cpu = self._pad_tensor_to_size(
+                self.seq_lens_cpu, bs, value=seq_len_fill_value
+            )
+
+        self.out_cache_loc = self._pad_tensor_to_size(self.out_cache_loc, num_tokens)
+        if self.encoder_lens is not None:
+            self.encoder_lens = self._pad_tensor_to_size(self.encoder_lens, bs)
+        self.positions = self._pad_tensor_to_size(self.positions, num_tokens)
+        self.global_num_tokens_cpu = global_num_tokens
+        self.global_num_tokens_gpu = self.global_num_tokens_gpu.new_tensor(
+            global_num_tokens
+        )
+
+        if self.mrope_positions is not None:
+            self.mrope_positions = self._pad_tensor_to_size(self.mrope_positions, bs)
+
+        # TODO: check if we need to pad other tensors
+        if self.extend_seq_lens is not None:
+            self.extend_seq_lens = self._pad_tensor_to_size(self.extend_seq_lens, bs)
+
+        if self.spec_info is not None and isinstance(self.spec_info, EagleDraftInput):
+            spec_info = self.spec_info
+            self.output_cache_loc_backup = self.out_cache_loc
+            self.hidden_states_backup = spec_info.hidden_states
+            if spec_info.topk_p is not None:
+                spec_info.topk_p = self._pad_tensor_to_size(spec_info.topk_p, bs)
+            if spec_info.topk_index is not None:
+                spec_info.topk_index = self._pad_tensor_to_size(
+                    spec_info.topk_index, bs
+                )
+            if spec_info.accept_length is not None:
+                spec_info.accept_length = self._pad_tensor_to_size(
+                    spec_info.accept_length, bs
+                )
+            spec_info.hidden_states = self._pad_tensor_to_size(
+                spec_info.hidden_states, num_tokens
+            )
+
+    def post_forward_mlp_sync_batch(self, logits_output: LogitsProcessorOutput):
+
+        self.forward_mode = getattr(self, "_original_forward_mode", self.forward_mode)
+        self.batch_size = getattr(self, "_original_batch_size", self.batch_size)
+        bs = self.batch_size
+
+        if self.spec_info is not None:
+            if self.forward_mode.is_decode():  # draft
+                num_tokens = self.hidden_states_backup.shape[0]
+                self.positions = self.positions[:num_tokens]
+                self.seq_lens = self.seq_lens[:bs]
+                self.req_pool_indices = self.req_pool_indices[:bs]
+                if self.seq_lens_cpu is not None:
+                    self.seq_lens_cpu = self.seq_lens_cpu[:bs]
+                logits_output.next_token_logits = logits_output.next_token_logits[
+                    :num_tokens
+                ]
+                logits_output.hidden_states = logits_output.hidden_states[:num_tokens]
+            elif self.forward_mode.is_target_verify():  # verify
+                num_tokens = bs * self.spec_info.draft_token_num
+                logits_output.next_token_logits = logits_output.next_token_logits[
+                    :num_tokens
+                ]
+                logits_output.hidden_states = logits_output.hidden_states[:num_tokens]
+            elif self.forward_mode.is_draft_extend():  # draft extend
+                self.spec_info.accept_length = self.spec_info.accept_length[:bs]
+                logits_output.next_token_logits = logits_output.next_token_logits[:bs]
+                logits_output.hidden_states = logits_output.hidden_states[:bs]
+            elif self.forward_mode.is_extend() or self.forward_mode.is_idle():
+                logits_output.next_token_logits = logits_output.next_token_logits[:bs]
+                logits_output.hidden_states = logits_output.hidden_states[:bs]
+
+            if hasattr(self, "hidden_states_backup"):
+                self.spec_info.hidden_states = self.hidden_states_backup
+            if hasattr(self, "output_cache_loc_backup"):
+                self.out_cache_loc = self.output_cache_loc_backup
+
+        elif self.forward_mode.is_decode() or self.forward_mode.is_idle():
+            logits_output.next_token_logits = logits_output.next_token_logits[:bs]
+            if logits_output.hidden_states is not None:
+                logits_output.hidden_states = logits_output.hidden_states[:bs]
+        elif self.forward_mode.is_extend():
+            num_tokens = self.seq_lens_sum
+            logits_output.next_token_logits = logits_output.next_token_logits[
+                :num_tokens
+            ]
+            if logits_output.hidden_states is not None:
+                logits_output.hidden_states = logits_output.hidden_states[:num_tokens]
+
+    # Here we suppose the length of each chunk is equal
+    # For example, if we have 4 sequences with prefix length [256, 512, 768, 1024], prefix_chunk_len = 256
+    # num_prefix_chunks = cdiv(1024, 256) = 4
+    # prefix_chunk_starts = [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512], [768, 768, 768, 768]]
+    # prefix_chunk_ends = [[256, 256, 256, 256], [256, 512, 512, 512], [256, 512, 768, 768], [256, 512, 768, 1024]]
+    # prefix_chunk_seq_lens = [[256, 256, 256, 256], [0, 256, 256, 256], [0, 0, 256, 256], [0, 0, 0, 256]]
+    # TODO: Implement a better way to allocate chunk lengths that uses memory spaces more efficiently.
+    def get_prefix_chunk_seq_lens(
+        self, prefix_lens: torch.Tensor, num_prefix_chunks: int, prefix_chunk_len: int
+    ):
+        device = prefix_lens.device
+        prefix_chunk_starts = (
+            torch.arange(num_prefix_chunks, device=device, dtype=torch.int32)
+            .unsqueeze(1)
+            .expand(-1, self.batch_size)
+            * prefix_chunk_len
+        )
+        prefix_chunk_ends = torch.min(
+            prefix_lens.unsqueeze(0),
+            prefix_chunk_starts + prefix_chunk_len,
+        ).to(torch.int32)
+
+        prefix_chunk_seq_lens = (
+            (prefix_chunk_ends - prefix_chunk_starts).clamp(min=0).to(torch.int32)
+        )
+
+        return prefix_chunk_starts, prefix_chunk_seq_lens
+
+    # Called before each attention module if using chunked kv cache for prefill
+    # Some of the codes are adapted from https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/mla/common.py
+    def prepare_chunked_prefix_cache_info(self, device: torch.device):
+
+        from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
+
+        assert isinstance(
+            self.token_to_kv_pool, MLATokenToKVPool
+        ), "Currently chunked prefix cache can only be used by Deepseek models"
+
+        if self.prefix_chunk_len is not None:
+            # Chunked kv cache info already prepared by prior modules
+            return
+
+        self.prefix_chunk_idx = -1
+
+        # chunk_capacity is the maximum number of tokens in each chunk
+        chunk_capacity = self.get_max_chunk_capacity()
+        self.prefix_chunk_len = chunk_capacity // self.batch_size
+
+        self.num_prefix_chunks = (
+            max(self.extend_prefix_lens_cpu) + self.prefix_chunk_len - 1
+        ) // self.prefix_chunk_len
+
+        # Here we compute chunk lens twice to avoid stream sync, once on gpu and once on cpu.
+        prefix_chunk_starts_cuda, prefix_chunk_seq_lens_cuda = (
+            self.get_prefix_chunk_seq_lens(
+                self.extend_prefix_lens,
+                self.num_prefix_chunks,
+                self.prefix_chunk_len,
+            )
+        )
+        _, prefix_chunk_seq_lens_cpu = self.get_prefix_chunk_seq_lens(
+            torch.tensor(self.extend_prefix_lens_cpu),
+            self.num_prefix_chunks,
+            self.prefix_chunk_len,
+        )
+        self.prefix_chunk_starts = prefix_chunk_starts_cuda
+        self.prefix_chunk_seq_lens = prefix_chunk_seq_lens_cuda
+
+        # Metadata for attention backend
+        self.prefix_chunk_cu_seq_lens = torch.zeros(
+            self.num_prefix_chunks,
+            self.batch_size + 1,
+            device=device,
+            dtype=torch.int32,
+        )
+        self.prefix_chunk_cu_seq_lens[:, 1:] = prefix_chunk_seq_lens_cuda.cumsum(
+            dim=1
+        ).to(torch.int32)
+        self.prefix_chunk_max_seq_lens = prefix_chunk_seq_lens_cpu.max(
+            dim=1
+        ).values.tolist()
+
+        self.prefix_chunk_num_tokens = prefix_chunk_seq_lens_cpu.sum(dim=1).tolist()
+        assert max(self.prefix_chunk_num_tokens) <= self.get_max_chunk_capacity()
+
+        # Precompute the kv indices for each chunk
+        self.prepare_chunked_kv_indices(device)
+
+    @property
+    def can_run_tbo(self):
+        return self.tbo_split_seq_index is not None
+
+
+def enable_num_token_non_padded(server_args):
+    return get_moe_expert_parallel_world_size() > 1
+
+
+class PPProxyTensors:
+    # adapted from https://github.com/vllm-project/vllm/blob/d14e98d924724b284dc5eaf8070d935e214e50c0/vllm/sequence.py#L1103
+    tensors: Dict[str, torch.Tensor]
+
+    def __init__(self, tensors):
+        # manually define this function, so that
+        # Dynamo knows `IntermediateTensors()` comes from this file.
+        # Otherwise, dataclass will generate this function by evaluating
+        # a string, and we will lose the information about the source file.
+        self.tensors = tensors
+
+    def __getitem__(self, key: Union[str, slice]):
+        if isinstance(key, str):
+            return self.tensors[key]
+        elif isinstance(key, slice):
+            return self.__class__({k: v[key] for k, v in self.tensors.items()})
+
+    def __setitem__(self, key: str, value: torch.Tensor):
+        self.tensors[key] = value
+
+    def __len__(self):
+        return len(self.tensors)
+
+    def __eq__(self, other: object):
+        return isinstance(other, self.__class__) and self
+
+    def __repr__(self) -> str:
+        return f"PPProxyTensors(tensors={self.tensors})"
+
+
+def compute_position(
+    attn_backend: str,
+    extend_prefix_lens: torch.Tensor,
+    extend_seq_lens: torch.Tensor,
+    extend_seq_lens_sum: int,
+):
+    if support_triton(attn_backend):
+        positions, extend_start_loc = compute_position_triton(
+            extend_prefix_lens,
+            extend_seq_lens,
+            extend_seq_lens_sum,
+        )
+    else:
+        positions, extend_start_loc = compute_position_torch(
+            extend_prefix_lens, extend_seq_lens
+        )
+    return positions, extend_start_loc
+
+
+def compute_position_triton(
+    extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor, extend_seq_lens_sum
+):
+    """Compute positions. It is a fused version of `compute_position_torch`."""
+    batch_size = extend_seq_lens.shape[0]
+    has_prefix = extend_prefix_lens.shape[0] == batch_size
+
+    positions = torch.empty(
+        extend_seq_lens_sum, dtype=torch.int64, device=extend_seq_lens.device
+    )
+    extend_start_loc = torch.empty(
+        batch_size, dtype=torch.int32, device=extend_seq_lens.device
+    )
+
+    # Launch kernel
+    compute_position_kernel[(batch_size,)](
+        positions,
+        extend_start_loc,
+        extend_prefix_lens,
+        extend_seq_lens,
+        has_prefix,
+    )
+
+    return positions, extend_start_loc
+
+
+@triton.jit
+def compute_position_kernel(
+    positions,
+    extend_start_loc,
+    extend_prefix_lens,
+    extend_seq_lens,
+    has_prefix: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(0).to(tl.int64)
+
+    prefix_len = tl.load(extend_prefix_lens + pid) if has_prefix else 0
+    seq_len = tl.load(extend_seq_lens + pid)
+
+    # NOTE: This can be slow for large bs
+    cumsum_start = tl.cast(0, tl.int64)
+    for i in range(pid):
+        cumsum_start += tl.load(extend_seq_lens + i)
+
+    num_loop = tl.cdiv(seq_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        tl.store(
+            positions + cumsum_start + offset,
+            prefix_len + offset,
+            mask=offset < seq_len,
+        )
+    tl.store(extend_start_loc + pid, cumsum_start)
+
+
+def compute_position_torch(
+    extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor
+):
+    positions = torch.cat(
+        [
+            torch.arange(
+                prefix_len, prefix_len + extend_len, device=extend_prefix_lens.device
+            )
+            for prefix_len, extend_len in zip(extend_prefix_lens, extend_seq_lens)
+        ],
+        axis=0,
+    )
+    extend_start_loc = torch.zeros_like(extend_seq_lens)
+    extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0)
+    return positions.to(torch.int64), extend_start_loc
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def clamp_position(seq_lens):
+    return torch.clamp((seq_lens - 1), min=0).to(torch.int64)
+
+
+@triton.jit
+def create_chunked_prefix_cache_kv_indices(
+    req_to_token_ptr,  # (max_batch, max_context_len,)
+    req_pool_indices_ptr,  # (batch_size,)
+    chunk_start_idx_ptr,  # (batch_size,)
+    chunk_seq_lens_ptr,  # (batch_size,)
+    chunk_cu_seq_lens_ptr,  # (batch_size + 1,)
+    chunk_kv_indices_ptr,  # (num_chunk_tokens,)
+    req_to_token_ptr_stride: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(axis=0)
+
+    # find the req pool idx, this is for batch to token
+    req_pool_index = tl.load(req_pool_indices_ptr + pid)
+    chunk_kv_indices_offset = tl.load(chunk_cu_seq_lens_ptr + pid)
+
+    # get the token positions of current chunk
+    chunk_start_pos = tl.load(chunk_start_idx_ptr + pid).to(tl.int32)
+    chunk_seq_len = tl.load(chunk_seq_lens_ptr + pid).to(tl.int32)
+
+    num_loop = tl.cdiv(chunk_seq_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = offset < chunk_seq_len
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + chunk_start_pos
+            + offset,
+            mask=mask,
+        )
+        tl.store(
+            chunk_kv_indices_ptr + chunk_kv_indices_offset + offset, data, mask=mask
+        )
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
new file mode 100644
index 000000000..e6bf890f0
--- /dev/null
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -0,0 +1,2071 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ModelRunner runs the forward passes of the models."""
+
+import datetime
+import gc
+import inspect
+import json
+import logging
+import os
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+
+from sglang.srt.configs.device_config import DeviceConfig
+from sglang.srt.configs.load_config import LoadConfig
+from sglang.srt.configs.model_config import AttentionArch, ModelConfig
+from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp
+from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tp_group,
+    get_world_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+    set_custom_all_reduce,
+    set_mscclpp_all_reduce,
+)
+from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
+from sglang.srt.eplb.eplb_manager import EPLBManager
+from sglang.srt.eplb.expert_distribution import (
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+    set_global_expert_distribution_recorder,
+)
+from sglang.srt.eplb.expert_location import (
+    ExpertLocationMetadata,
+    compute_initial_expert_location_metadata,
+    get_global_expert_location_metadata,
+    set_global_expert_location_metadata,
+)
+from sglang.srt.eplb.expert_location_updater import ExpertLocationUpdater
+from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_group,
+    get_attention_tp_size,
+    initialize_dp_attention,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization import (
+    deep_gemm_wrapper,
+    monkey_patch_isinstance_for_vllm_base_layer,
+)
+from sglang.srt.layers.sampler import Sampler
+from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+from sglang.srt.lora.lora_manager import LoRAManager
+from sglang.srt.lora.lora_registry import LoRARef
+from sglang.srt.managers.schedule_batch import (
+    GLOBAL_SERVER_ARGS_KEYS,
+    global_server_args_dict,
+)
+from sglang.srt.mem_cache.allocator import (
+    BaseTokenToKVPoolAllocator,
+    PagedTokenToKVPoolAllocator,
+    SWATokenToKVPoolAllocator,
+    TokenToKVPoolAllocator,
+)
+from sglang.srt.mem_cache.allocator_ascend import AscendPagedTokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import (
+    AscendMLAPagedTokenToKVPool,
+    AscendTokenToKVPool,
+    DoubleSparseTokenToKVPool,
+    HybridLinearKVPool,
+    HybridReqToTokenPool,
+    MHATokenToKVPool,
+    MLATokenToKVPool,
+    ReqToTokenPool,
+    SWAKVPool,
+)
+from sglang.srt.model_executor.cpu_graph_runner import CPUGraphRunner
+from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_executor.npu_graph_runner import NPUGraphRunner
+from sglang.srt.model_loader import get_model
+from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
+from sglang.srt.model_loader.utils import set_default_torch_dtype
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.offloader import (
+    create_offloader_from_server_args,
+    get_offloader,
+    set_offloader,
+)
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils import (
+    MultiprocessingSerializer,
+    cpu_has_amx_support,
+    dynamic_import,
+    enable_show_time_cost,
+    get_available_gpu_memory,
+    get_bool_env_var,
+    get_cpu_ids_by_node,
+    init_custom_process_group,
+    is_fa3_default_architecture,
+    is_flashinfer_available,
+    is_hip,
+    is_hopper_with_cuda_12_3,
+    is_no_spec_infer_or_topk_one,
+    is_npu,
+    is_sm100_supported,
+    monkey_patch_p2p_access_check,
+    monkey_patch_vllm_gguf_config,
+    set_cuda_arch,
+)
+from sglang.srt.weight_sync.tensor_bucket import (
+    FlattenedTensorBucket,
+    FlattenedTensorMetadata,
+)
+
+_is_hip = is_hip()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+
+# Use a small KV cache pool size for tests in CI
+SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
+
+# Detect stragger ranks in model loading
+UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
+
+logger = logging.getLogger(__name__)
+
+
+class RankZeroFilter(logging.Filter):
+    """Filter that only allows INFO level logs from rank 0, but allows all other levels from any rank."""
+
+    def __init__(self, is_rank_zero):
+        super().__init__()
+        self.is_rank_zero = is_rank_zero
+
+    def filter(self, record):
+        if record.levelno == logging.INFO:
+            return self.is_rank_zero
+        return True
+
+
+class ModelRunner:
+    """ModelRunner runs the forward passes of the models."""
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        mem_fraction_static: float,
+        gpu_id: int,
+        tp_rank: int,
+        tp_size: int,
+        moe_ep_rank: int,
+        moe_ep_size: int,
+        pp_rank: int,
+        pp_size: int,
+        nccl_port: int,
+        server_args: ServerArgs,
+        dp_rank: Optional[int] = None,
+        is_draft_worker: bool = False,
+        req_to_token_pool: Optional[ReqToTokenPool] = None,
+        token_to_kv_pool_allocator: Optional[BaseTokenToKVPoolAllocator] = None,
+    ):
+        # Parse args
+        self.mem_fraction_static = mem_fraction_static
+        self.device = server_args.device
+        self.gpu_id = gpu_id
+        self.tp_rank = tp_rank
+        self.tp_size = tp_size
+        self.moe_ep_rank = moe_ep_rank
+        self.moe_ep_size = moe_ep_size
+        self.dp_size = server_args.dp_size
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
+        self.model_config = model_config
+        self.dist_port = nccl_port
+        self.server_args = server_args
+        self.is_draft_worker = is_draft_worker
+        self.is_generation = model_config.is_generation
+        self.is_multimodal = model_config.is_multimodal
+        self.is_multimodal_chunked_prefill_supported = (
+            model_config.is_multimodal_chunked_prefill_supported
+        )
+        self.spec_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
+        self.page_size = server_args.page_size
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.is_hybrid = model_config.is_hybrid
+        self.use_mla_backend = self.model_config.attention_arch == AttentionArch.MLA
+        self.attention_chunk_size = model_config.attention_chunk_size
+        self.forward_pass_id = 0
+
+        # Apply the rank zero filter to logger
+        if not any(isinstance(f, RankZeroFilter) for f in logger.filters):
+            logger.addFilter(RankZeroFilter(tp_rank == 0))
+        if server_args.show_time_cost:
+            enable_show_time_cost()
+
+        # Model-specific adjustment
+        self.model_specific_adjustment()
+
+        # Global vars
+        global_server_args_dict.update(
+            {k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
+            | {
+                # TODO it is indeed not a "server args"
+                "use_mla_backend": self.use_mla_backend,
+                "speculative_algorithm": self.spec_algorithm,
+            }
+        )
+
+        # Init OpenMP threads binding for CPU
+        if self.device == "cpu":
+            self.init_threads_binding()
+
+        # Get memory before model loading
+        min_per_gpu_memory = self.init_torch_distributed()
+
+        # CPU offload
+        set_offloader(create_offloader_from_server_args(server_args, dp_rank=dp_rank))
+
+        # Update deep gemm configure
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+            deep_gemm_wrapper.update_deep_gemm_config(gpu_id, server_args)
+
+        # Initialize the model runner
+        self.initialize(min_per_gpu_memory)
+
+        # Temporary cached values
+        self.support_pp = (
+            "pp_proxy_tensors" in inspect.signature(self.model.forward).parameters
+        )
+
+        # For weight updates
+        self._model_update_group = {}
+
+    def initialize(self, min_per_gpu_memory: float):
+        server_args = self.server_args
+
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=self.server_args.enable_memory_saver
+        )
+
+        if not self.is_draft_worker:
+            set_global_expert_location_metadata(
+                compute_initial_expert_location_metadata(server_args, self.model_config)
+            )
+            if self.tp_rank == 0 and get_bool_env_var(
+                "SGLANG_LOG_EXPERT_LOCATION_METADATA"
+            ):
+                logger.info(
+                    f"Initial expert_location_metadata: {get_global_expert_location_metadata()}"
+                )
+
+            set_global_expert_distribution_recorder(
+                ExpertDistributionRecorder.init_new(
+                    server_args,
+                    get_global_expert_location_metadata(),
+                    rank=self.tp_rank,
+                )
+            )
+
+        # Expert parallelism
+        self.eplb_manager = (
+            EPLBManager(self)
+            if self.server_args.enable_eplb and (not self.is_draft_worker)
+            else None
+        )
+        self.expert_location_updater = ExpertLocationUpdater()
+
+        # Load the model
+        self.sampler = Sampler()
+        self.load_model()
+
+        # Check if the model is using hybrid SWA
+        if (
+            not self.server_args.disable_hybrid_swa_memory
+            and self.sliding_window_size is not None
+            and self.sliding_window_size > 0
+        ):
+            architectures = self.model_config.hf_config.architectures
+            if architectures and not any("Llama4" in arch for arch in architectures):
+                self.is_hybrid = self.model_config.is_hybrid = True
+
+        if self.is_hybrid_gdn:
+            logger.warning("Hybrid GDN model detected, disable radix cache")
+            self.server_args.disable_radix_cache = True
+            self.server_args.attention_backend = "hybrid_linear_attn"
+            if self.server_args.max_mamba_cache_size is None:
+                if self.server_args.max_running_requests is not None:
+                    self.server_args.max_mamba_cache_size = (
+                        self.server_args.max_running_requests
+                    )
+                else:
+                    self.server_args.max_mamba_cache_size = 512
+            self.server_args.max_mamba_cache_size = (
+                self.server_args.max_mamba_cache_size
+                // (
+                    self.server_args.dp_size
+                    if self.server_args.enable_dp_attention
+                    else 1
+                )
+            )
+
+        # For MTP models like DeepSeek-V3 or GLM-4.5, the MTP layer(s) are used separately as draft
+        # models for speculative decoding. In those cases, `num_nextn_predict_layers` is used to
+        # determine the number of layers.
+        model_has_mtp_layers = self.model_config.num_nextn_predict_layers is not None
+        model_num_layers = (
+            self.model_config.num_nextn_predict_layers
+            if self.is_draft_worker and model_has_mtp_layers
+            else max(
+                self.model_config.num_hidden_layers,
+                self.model_config.num_attention_layers,
+            )
+        )
+        self.start_layer = getattr(self.model, "start_layer", 0)
+        self.end_layer = getattr(self.model, "end_layer", model_num_layers)
+        self.num_effective_layers = self.end_layer - self.start_layer
+        assert (
+            (not model_has_mtp_layers)
+            or (self.spec_algorithm.is_none())
+            or (
+                (not self.spec_algorithm.is_none())
+                and (self.num_effective_layers == model_num_layers)
+            )
+        ), "PP is not compatible with MTP models."
+
+        # Apply torchao quantization
+        torchao_applied = getattr(self.model, "torchao_applied", False)
+        # In layered loading, torchao may have been applied
+        if not torchao_applied:
+            apply_torchao_config_to_model(
+                self.model, global_server_args_dict["torchao_config"]
+            )
+
+        # Apply torch TP if the model supports it
+        supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
+        if self.tp_size > 1 and supports_torch_tp:
+            self.apply_torch_tp()
+
+        # Init lora
+        if server_args.enable_lora:
+            self.init_lora_manager()
+
+        # Init Double Sparsity
+        if server_args.enable_double_sparsity:
+            if server_args.ds_heavy_channel_type is None:
+                raise ValueError(
+                    "Please specify the heavy channel type for double sparsity optimization."
+                )
+            self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
+
+        # Init memory pool and attention backends
+        self.init_memory_pool(
+            min_per_gpu_memory,
+            server_args.max_running_requests,
+            server_args.max_total_tokens,
+        )
+        if self.device == "cuda":
+            self.init_cublas()
+            self.init_attention_backend()
+            self.init_device_graphs()
+        elif self.device in ["npu", "cpu"]:
+            self.init_attention_backend()
+            self.init_device_graphs()
+        else:
+            self.graph_runner = None
+            self.graph_mem_usage = 0
+            self.init_attention_backend()
+
+        # auxiliary hidden capture mode. TODO: expose this to server args?
+        if self.spec_algorithm.is_eagle3() and not self.is_draft_worker:
+            # load draft config
+            draft_model_config = ModelConfig.from_server_args(
+                server_args,
+                model_path=(server_args.speculative_draft_model_path),
+                is_draft_model=True,
+            )
+
+            try:
+                # get the aux layer from draft model config
+                eagle_config = getattr(
+                    draft_model_config.hf_config, "eagle_config", None
+                )
+                eagle_aux_hidden_state_layer_ids = eagle_config[
+                    "eagle_aux_hidden_state_layer_ids"
+                ]
+            except:
+                # if there is no aux layer, set to None
+                eagle_aux_hidden_state_layer_ids = None
+
+            self.model.set_eagle3_layers_to_capture(eagle_aux_hidden_state_layer_ids)
+
+    def model_specific_adjustment(self):
+        server_args = self.server_args
+
+        if (
+            server_args.attention_backend == "intel_amx"
+            and server_args.device == "cpu"
+            and not _is_cpu_amx_available
+        ):
+            logger.info(
+                "The current platform does not support Intel AMX, will fallback to torch_native backend."
+            )
+            server_args.attention_backend = "torch_native"
+
+        if server_args.prefill_attention_backend is not None and (
+            server_args.prefill_attention_backend
+            == server_args.decode_attention_backend
+        ):  # override the default attention backend
+            server_args.attention_backend = server_args.prefill_attention_backend
+
+        if (
+            getattr(self.model_config.hf_config, "dual_chunk_attention_config", None)
+            is not None
+        ):
+            if server_args.attention_backend is None:
+                server_args.attention_backend = "dual_chunk_flash_attn"
+                logger.info("Dual chunk attention is turned on by default.")
+            elif server_args.attention_backend != "dual_chunk_flash_attn":
+                raise ValueError(
+                    "Dual chunk attention is enabled, but attention backend is set to "
+                    f"{server_args.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
+                )
+
+        if server_args.attention_backend is None:
+            """
+            Auto select the fastest attention backend.
+
+            1. Models with MHA Architecture (e.g: Llama, QWen)
+                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+            2. Models with MLA Architecture and using FA3
+                2.1 We will use FA3 backend on hopper.
+                2.2 We will use Flashinfer backend on blackwell.
+                2.3 Otherwise, we will use triton backend.
+            """
+
+            if not self.use_mla_backend:
+                # MHA architecture
+                if (
+                    is_hopper_with_cuda_12_3()
+                    and is_no_spec_infer_or_topk_one(server_args)
+                    and is_fa3_default_architecture(self.model_config.hf_config)
+                ):
+                    server_args.attention_backend = "fa3"
+                elif _is_hip:
+                    # server_args.attention_backend = "triton"
+                    server_args.attention_backend = "flashattention"
+                elif _is_npu:
+                    server_args.attention_backend = "ascend"
+                else:
+                    server_args.attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+            else:
+                # MLA architecture
+                if is_hopper_with_cuda_12_3():
+                    server_args.attention_backend = "fa3"
+                elif is_sm100_supported():
+                    server_args.attention_backend = "flashinfer"
+                elif _is_hip:
+                    head_num = self.model_config.get_num_kv_heads(self.tp_size)
+                    # TODO current aiter only support head number 16 or 128 head number
+                    if (
+                        head_num == 128 or head_num == 16
+                    ) and self.spec_algorithm.is_none():
+                        server_args.attention_backend = "aiter"
+                    else:
+                        server_args.attention_backend = "triton"
+                elif _is_npu:
+                    server_args.attention_backend = "ascend"
+                else:
+                    server_args.attention_backend = "triton"
+            logger.info(
+                f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
+            )
+        elif self.use_mla_backend:
+            if server_args.device != "cpu":
+                if server_args.attention_backend in [
+                    "aiter",
+                    "flashinfer",
+                    "fa3",
+                    "triton",
+                    "flashmla",
+                    "cutlass_mla",
+                    "trtllm_mla",
+                    "ascend",
+                ]:
+                    logger.info(
+                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                    )
+                else:
+                    raise ValueError(
+                        f"Invalid attention backend for MLA: {server_args.attention_backend}"
+                    )
+            else:
+                if server_args.attention_backend != "intel_amx":
+                    raise ValueError(
+                        "MLA optimization not supported on CPU except for intel_amx backend."
+                    )
+
+        if (
+            server_args.attention_backend == "fa3"
+            and server_args.kv_cache_dtype == "fp8_e5m2"
+        ):
+            logger.warning(
+                "FlashAttention3 only supports fp8_e4m3 if using FP8; "
+                "Setting attention backend to triton."
+            )
+            server_args.attention_backend = "triton"
+
+        if server_args.enable_double_sparsity:
+            logger.info(
+                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+            )
+            server_args.attention_backend = "triton"
+            server_args.disable_cuda_graph = True
+
+        if self.is_multimodal:
+            if not self.is_multimodal_chunked_prefill_supported:
+                server_args.chunked_prefill_size = -1
+                logger.info(
+                    f"Automatically turn off --chunked-prefill-size as it is not supported for "
+                    f"{self.model_config.hf_config.model_type}"
+                )
+
+        if not self.use_mla_backend:
+            server_args.disable_chunked_prefix_cache = True
+        # TODO(kaixih@nvidia): remove this once we have a better solution for DP attention.
+        #  For more details, see: https://github.com/sgl-project/sglang/issues/8616
+        elif (
+            self.dp_size > 1
+            and is_sm100_supported()
+            and server_args.attention_backend != "triton"
+        ):
+            logger.info(
+                "Disable chunked prefix cache when dp size > 1 and attention backend is not triton."
+            )
+            server_args.disable_chunked_prefix_cache = True
+
+        if not server_args.disable_chunked_prefix_cache:
+            logger.info("Chunked prefix cache is turned on.")
+
+        if server_args.attention_backend == "aiter":
+            if self.model_config.context_len > 8192:
+                self.mem_fraction_static *= 0.85
+
+        if (
+            server_args.enable_hierarchical_cache
+            and server_args.hicache_io_backend == "kernel"
+        ):
+            # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
+            if server_args.decode_attention_backend is None:
+                if not self.use_mla_backend:
+                    server_args.decode_attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+                else:
+                    server_args.decode_attention_backend = (
+                        "flashinfer" if is_sm100_supported() else "triton"
+                    )
+            elif server_args.decode_attention_backend == "fa3":
+                server_args.hicache_io_backend = "direct"
+                logger.warning(
+                    "FlashAttention3 decode backend is not compatible with hierarchical cache. "
+                    f"Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
+                )
+
+    def init_torch_distributed(self):
+        logger.info("Init torch distributed begin.")
+
+        try:
+            torch.get_device_module(self.device).set_device(self.gpu_id)
+        except Exception:
+            logger.warning(
+                f"Context: {self.device=} {self.gpu_id=} {os.environ.get('CUDA_VISIBLE_DEVICES')=} {self.tp_rank=} {self.tp_size=}"
+            )
+            raise
+
+        if self.device == "cuda":
+            backend = "nccl"
+        elif self.device == "xpu":
+            backend = "xccl"
+        elif self.device == "hpu":
+            backend = "hccl"
+        elif self.device == "cpu":
+            backend = "gloo"
+        elif self.device == "npu":
+            backend = "hccl"
+
+        before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        if not self.server_args.enable_p2p_check:
+            monkey_patch_p2p_access_check()
+
+        if self.server_args.dist_init_addr:
+            dist_init_method = f"tcp://{self.server_args.dist_init_addr}"
+        else:
+            dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
+        set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
+        set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
+
+        if not self.is_draft_worker:
+            if self.device == "cpu":
+                if _is_cpu_amx_available:
+                    # Bind OpenMP threads to CPU cores
+                    torch.ops.sgl_kernel.init_cpu_threads_env(self.local_omp_cpuid)
+
+                    # Set local size to hint SGLang to use shared memory based AllReduce
+                    os.environ["LOCAL_SIZE"] = str(self.tp_size)
+                    torch.ops.sgl_kernel.initialize(self.tp_size, self.tp_rank)
+
+                    @torch.library.register_fake("sgl_kernel::shm_allgather")
+                    def _(data, dim):
+                        return torch.cat([data] * self.tp_size, dim=dim)
+
+                else:
+                    logger.warning(
+                        "init_cpu_threads_env and shared memory based AllReduce is disabled since intel amx backend is not available"
+                    )
+
+            # Only initialize the distributed environment on the target model worker.
+            init_distributed_environment(
+                backend=backend,
+                world_size=self.tp_size * self.pp_size,
+                rank=self.tp_size * self.pp_rank + self.tp_rank,
+                local_rank=self.gpu_id,
+                distributed_init_method=dist_init_method,
+                timeout=self.server_args.dist_timeout,
+            )
+            initialize_model_parallel(
+                tensor_model_parallel_size=self.tp_size,
+                pipeline_model_parallel_size=self.pp_size,
+                expert_model_parallel_size=self.moe_ep_size,
+                duplicate_tp_group=self.server_args.enable_pdmux,
+            )
+            initialize_dp_attention(
+                server_args=self.server_args,
+                model_config=self.model_config,
+            )
+
+        min_per_gpu_memory = get_available_gpu_memory(
+            self.device,
+            self.gpu_id,
+            distributed=get_world_group().world_size > 1,
+            cpu_group=get_world_group().cpu_group,
+        )
+        self.tp_group = get_tp_group()
+        self.pp_group = get_pp_group()
+        self.attention_tp_group = get_attention_tp_group()
+
+        # Check memory for tensor parallelism
+        local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        if self.tp_size > 1 and not self.is_draft_worker:
+            if min_per_gpu_memory < local_gpu_memory * 0.9:
+                if get_bool_env_var("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"):
+                    logger.warning(
+                        "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
+                        f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
+                    )
+                else:
+                    raise ValueError(
+                        "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
+                        f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
+                    )
+
+        logger.info(
+            f"Init torch distributed ends. mem usage={(before_avail_memory - local_gpu_memory):.2f} GB"
+        )
+        return min_per_gpu_memory
+
+    def load_model(self):
+        before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        logger.info(
+            f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        # This can reduce thread conflicts and speed up weight loading.
+        if self.device != "cpu":
+            torch.set_num_threads(1)
+        if self.device == "cuda":
+            if torch.cuda.get_device_capability()[0] < 8:
+                logger.info(
+                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                )
+                self.server_args.dtype = "float16"
+                self.model_config.dtype = torch.float16
+                if torch.cuda.get_device_capability()[1] < 5:
+                    raise RuntimeError("SGLang only supports sm75 and above.")
+
+        set_cuda_arch()
+
+        # Prepare the model config
+        self.load_config = LoadConfig(
+            load_format=self.server_args.load_format,
+            download_dir=self.server_args.download_dir,
+            model_loader_extra_config=self.server_args.model_loader_extra_config,
+        )
+        if self.device == "cpu":
+            self.model_config = adjust_config_with_unaligned_cpu_tp(
+                self.model_config, self.load_config, self.tp_size
+            )
+        if self.server_args.load_format == "gguf":
+            monkey_patch_vllm_gguf_config()
+
+        # Load the model
+        # Remove monkey_patch when linear.py quant remove dependencies with vllm
+        monkey_patch_vllm_parallel_state()
+        monkey_patch_isinstance_for_vllm_base_layer()
+
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_WEIGHTS):
+            self.model = get_model(
+                model_config=self.model_config,
+                load_config=self.load_config,
+                device_config=DeviceConfig(self.device),
+            )
+        monkey_patch_vllm_parallel_state(reverse=True)
+        monkey_patch_isinstance_for_vllm_base_layer(reverse=True)
+
+        get_offloader().post_init()
+
+        if self.server_args.kv_cache_dtype == "fp8_e4m3":
+            if self.server_args.quantization_param_path is not None:
+                if callable(getattr(self.model, "load_kv_cache_scales", None)):
+                    self.model.load_kv_cache_scales(
+                        self.server_args.quantization_param_path
+                    )
+                    logger.info(
+                        "Loaded KV cache scaling factors from %s",
+                        self.server_args.quantization_param_path,
+                    )
+                else:
+                    raise RuntimeError(
+                        "Using FP8 KV cache and scaling factors provided but "
+                        "model %s does not support loading scaling factors.",
+                        self.model.__class__,
+                    )
+            else:
+                logger.warning(
+                    "Using FP8 KV cache but no scaling factors "
+                    "provided. Defaulting to scaling factors of 1.0. "
+                    "This may lead to less accurate results!"
+                )
+
+        # Parse other args
+        self.sliding_window_size = None
+        if hasattr(self.model, "get_attention_sliding_window_size"):
+            self.sliding_window_size = self.model.get_attention_sliding_window_size()
+        elif self.model_config.attention_chunk_size is not None:
+            self.sliding_window_size = self.model_config.attention_chunk_size
+            logger.info(
+                f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}"
+            )
+
+        self.dtype = self.model_config.dtype
+
+        after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        self.weight_load_mem_usage = before_avail_memory - after_avail_memory
+        logger.info(
+            f"Load weight end. "
+            f"type={type(self.model).__name__}, "
+            f"dtype={self.dtype}, "
+            f"avail mem={after_avail_memory:.2f} GB, "
+            f"mem usage={self.weight_load_mem_usage:.2f} GB."
+        )
+
+        # Handle the case where some ranks do not finish loading.
+        try:
+            dist.monitored_barrier(
+                group=get_tp_group().cpu_group,
+                timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S),
+                wait_all_ranks=True,
+            )
+        except RuntimeError:
+            raise ValueError(
+                f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
+            ) from None
+
+    def update_expert_location(
+        self,
+        new_expert_location_metadata: ExpertLocationMetadata,
+        update_layer_ids: List[int],
+    ):
+        self.expert_location_updater.update(
+            self.model.routed_experts_weights_of_layer,
+            new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+            nnodes=self.server_args.nnodes,
+            rank=self.tp_rank,
+        )
+
+    def update_weights_from_disk(
+        self, model_path: str, load_format: str
+    ) -> tuple[bool, str]:
+        """Update engine weights in-place from the disk."""
+        logger.info(
+            f"Update engine weights online from disk begin. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        target_device = torch.device(self.device)
+        self.model_config.model_path = model_path
+        load_config = LoadConfig(load_format=load_format)
+
+        # Only support DefaultModelLoader for now
+        loader = get_model_loader(load_config)
+        if not isinstance(loader, DefaultModelLoader):
+            message = f"Failed to get model loader: {loader}."
+            return False, message
+
+        def get_weight_iter(config):
+            iter = loader._get_weights_iterator(
+                DefaultModelLoader.Source.init_new(config, self.model)
+            )
+            return iter
+
+        def model_load_weights(model, iter):
+            DefaultModelLoader.load_weights_and_postprocess(model, iter, target_device)
+            return model
+
+        with set_default_torch_dtype(self.model_config.dtype):
+            try:
+                iter = get_weight_iter(self.model_config)
+            except Exception as e:
+                message = f"Failed to get weights iterator: {e}."
+                return False, message
+            try:
+                model = model_load_weights(self.model, iter)
+            except Exception as e:
+                message = (
+                    f"Failed to update weights: {e}.\nRolling back to original weights."
+                )
+                del iter
+                gc.collect()
+                iter = get_weight_iter(self.model_config)
+                self.model = model_load_weights(self.model, iter)
+                return False, message
+
+        self.model = model
+        self.server_args.model_path = model_path
+        self.server_args.load_format = load_format
+        self.load_config = load_config
+
+        logger.info("Update weights end.")
+        return True, "Succeeded to update model weights."
+
+    def init_weights_update_group(
+        self,
+        master_address,
+        master_port,
+        rank_offset,
+        world_size,
+        group_name,
+        backend="nccl",
+    ):
+        """Initialize the Torch process group for model parameter updates.
+
+        `_model_update_group` is used in the RLHF workflow, where rank
+        0 is the actor model in the training engine, and the other ranks are
+        the inference engine, which is used for rollout.
+
+        In the RLHF workflow, the training engine updates the model
+        weights/parameters online, and broadcasts them to the inference
+        engine through the `_model_update_group` process group.
+        """
+        assert (
+            torch.distributed.is_initialized()
+        ), "Default torch process group must be initialized"
+        assert group_name != "", "Group name cannot be empty"
+
+        rank = rank_offset + self.tp_rank
+
+        logger.info(
+            f"init custom process group: master_address={master_address}, master_port={master_port}, "
+            f"rank_offset={rank_offset}, rank={rank}, world_size={world_size}, group_name={group_name}, backend={backend}"
+        )
+
+        try:
+            self._model_update_group[group_name] = init_custom_process_group(
+                backend=backend,
+                init_method=f"tcp://{master_address}:{master_port}",
+                world_size=world_size,
+                rank=rank,
+                group_name=group_name,
+            )
+            return True, "Succeeded to initialize custom process group."
+        except Exception as e:
+            message = f"Failed to initialize custom process group: {e}."
+            logger.error(message)
+            return False, message
+
+    def update_weights_from_distributed(self, names, dtypes, shapes, group_name):
+        """
+        Update specific parameter in the model weights online
+        through `_model_update_group` process group.
+
+        Args:
+            name: the name of the parameter to be updated.
+            dtype: the data type of the parameter to be updated.
+            shape: the shape of the parameter to be updated.
+        """
+
+        assert group_name in self._model_update_group, (
+            f"Group {group_name} not in {list(self._model_update_group.keys())}. "
+            "Please call `init_weights_update_group` first."
+        )
+
+        try:
+            weights = []
+            handles = []
+            for name, dtype, shape in zip(names, dtypes, shapes):
+                target_dtype = (
+                    dtype if isinstance(dtype, torch.dtype) else getattr(torch, dtype)
+                )
+                weight = torch.empty(shape, dtype=target_dtype, device=self.device)
+                handles.append(
+                    torch.distributed.broadcast(
+                        weight,
+                        src=0,
+                        group=self._model_update_group[group_name],
+                        async_op=True,
+                    )
+                )
+                weights.append((name, weight))
+            for handle in handles:
+                handle.wait()
+
+            self.model.load_weights(weights)
+            return True, f"Succeeded to update parameter online."
+
+        except Exception as e:
+            error_msg = (
+                f"Failed to update parameter online: {e}. "
+                f"The full weights of the ModelRunner are partially updated. "
+                f"Please discard the whole weights."
+            )
+            logger.error(error_msg)
+            return False, error_msg
+
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, Union[torch.Tensor, "LocalSerializedTensor"]]],
+        load_format: Optional[str] = None,
+    ):
+        monkey_patch_torch_reductions()
+        if load_format == "flattened_bucket":
+            # Handle flattened bucket format
+            return self._update_weights_from_flattened_bucket(
+                flattened_tensor_bucket_dict=named_tensors
+            )
+
+        # We need to get device after patch otherwise the device would be wrong
+        self.device_module = torch.get_device_module(self.device)
+        infered_device = self.device_module.current_device()
+
+        named_tensors = [
+            (name, _unwrap_tensor(tensor, tp_rank=self.tp_rank, device=infered_device))
+            for name, tensor in named_tensors
+        ]
+        if load_format == "direct":
+            _model_load_weights_direct(self.model, named_tensors)
+        elif load_format in self.server_args.custom_weight_loader:
+            custom_loader = dynamic_import(load_format)
+            custom_loader(self.model, named_tensors)
+        elif load_format is None:
+            self.model.load_weights(named_tensors)
+        else:
+            raise NotImplementedError(f"Unknown load_format={load_format}")
+        return True, "Success"
+
+    def _update_weights_from_flattened_bucket(
+        self,
+        flattened_tensor_bucket_dict,
+    ):
+        """Handle flattened bucket format for weight updates"""
+        flattened_tensor = flattened_tensor_bucket_dict["flattened_tensor"]
+        metadata = flattened_tensor_bucket_dict["metadata"]
+
+        # Convert metadata dict to our format
+        converted_metadata = []
+        for meta in metadata:
+            converted_meta = FlattenedTensorMetadata(
+                name=meta.name,
+                shape=meta.shape,
+                dtype=meta.dtype,
+                start_idx=meta.start_idx,
+                end_idx=meta.end_idx,
+                numel=meta.numel,
+            )
+            converted_metadata.append(converted_meta)
+
+        # Create bucket and reconstruct tensors
+        bucket = FlattenedTensorBucket(
+            flattened_tensor=flattened_tensor, metadata=converted_metadata
+        )
+        reconstructed_tensors = bucket.reconstruct_tensors()
+
+        # Load the reconstructed tensors using the standard method
+        self.model.load_weights(reconstructed_tensors)
+
+        return True, "Success"
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        # TODO: (chenyang) Add support for Qwen models.
+        try:
+            return self.model.get_weights_by_name(
+                name, truncate_size, tp_size=self.tp_size
+            )
+        except Exception as e:
+            logger.error(f"Error when getting parameter {name}: {e}")
+            return None
+
+    def init_lora_manager(self):
+        self.lora_manager = LoRAManager(
+            base_model=self.model,
+            base_hf_config=self.model_config.hf_config,
+            max_loras_per_batch=self.server_args.max_loras_per_batch,
+            load_config=self.load_config,
+            dtype=self.dtype,
+            lora_backend=self.server_args.lora_backend,
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+            max_lora_rank=self.server_args.max_lora_rank,
+            target_modules=self.server_args.lora_target_modules,
+            lora_paths=self.server_args.lora_paths,
+        )
+
+    def load_lora_adapter(self, lora_ref: LoRARef):
+        """Load a new lora adapter from disk or huggingface."""
+
+        logger.info(
+            f"LoRA adapter loading starts: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        result = self.lora_manager.load_lora_adapter(lora_ref)
+
+        logger.info(
+            f"LoRA adapter loading completes: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        return result
+
+    def unload_lora_adapter(self, lora_ref: LoRARef):
+        """Unload a lora adapter that was previously loaded during initialization or dynamic loading."""
+
+        logger.info(
+            f"LoRA adapter unloading starts: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        result = self.lora_manager.unload_lora_adapter(lora_ref)
+
+        logger.info(
+            f"LoRA adapter unloading completes: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        return result
+
+    def profile_max_num_token(self, total_gpu_memory: int):
+        available_gpu_memory = get_available_gpu_memory(
+            self.device,
+            self.gpu_id,
+            distributed=get_world_group().world_size > 1,
+            cpu_group=get_world_group().cpu_group,
+        )
+        if self.is_draft_worker:
+            num_layers = getattr(
+                self.model_config.hf_config,
+                "num_nextn_predict_layers",
+                self.num_effective_layers,
+            )
+        elif self.is_hybrid_gdn:
+            num_layers = len(self.model_config.hf_config.full_attention_layer_ids)
+        else:
+            num_layers = self.num_effective_layers
+        if self.use_mla_backend:
+            cell_size = (
+                (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
+                * num_layers
+                * torch._utils._element_size(self.kv_cache_dtype)
+            )
+        else:
+            cell_size = (
+                self.model_config.get_num_kv_heads(get_attention_tp_size())
+                * self.model_config.head_dim
+                * num_layers
+                * 2
+                * torch._utils._element_size(self.kv_cache_dtype)
+            )
+        rest_memory = available_gpu_memory - total_gpu_memory * (
+            1 - self.mem_fraction_static
+        )
+        if self.is_hybrid_gdn:
+            rest_memory -= (
+                self.server_args.max_mamba_cache_size
+                * self.model_config.hf_config.mamba_cache_per_req
+                / (1 << 30)
+            )
+        max_num_token = int(rest_memory * (1 << 30) // cell_size)
+        return max_num_token
+
+    @property
+    def is_hybrid_gdn(self):
+        return self.model_config.hf_config.architectures[0] in [
+            "Qwen3NextForCausalLM",
+            "Qwen3NextForCausalLMMTP",
+        ]
+
+    def set_num_token_hybrid(self):
+        if (
+            "Llama4ForConditionalGeneration"
+            in self.model_config.hf_config.architectures
+        ):
+            temp_ratio = (
+                (1 - self.is_hybrid)
+                + self.is_hybrid
+                * self.attention_chunk_size
+                / self.model_config.context_len
+            )
+            self.swa_max_total_num_tokens = (
+                4 * self.max_total_num_tokens * temp_ratio // (3 * temp_ratio + 1)
+            )
+            self.full_max_total_num_tokens = (
+                4 * self.max_total_num_tokens
+                - 12 * self.max_total_num_tokens * temp_ratio // (3 * temp_ratio + 1)
+            )
+            self.swa_max_total_num_tokens = int(
+                self.swa_max_total_num_tokens
+                // self.server_args.page_size
+                * self.server_args.page_size
+            )
+            self.full_max_total_num_tokens = int(
+                self.full_max_total_num_tokens
+                // self.server_args.page_size
+                * self.server_args.page_size
+            )
+            self.max_total_num_tokens = self.full_max_total_num_tokens
+        else:
+            assert self.sliding_window_size is not None and self.sliding_window_size > 0
+            full_attention_layer_ids = []
+            swa_attention_layer_ids = []
+
+            try:
+                layers = self.model.model.layers
+            except:
+                try:
+                    layers = self.model.language_model.model.layers
+                except:
+                    try:
+                        layers = self.model.language_model.layers
+                    except:
+                        self.is_hybrid = False
+                        return
+
+            for layer in layers:
+                if (
+                    layer.self_attn.attn.sliding_window_size is None
+                    or layer.self_attn.attn.sliding_window_size == -1
+                ):
+                    full_attention_layer_ids.append(layer.layer_id)
+                else:
+                    swa_attention_layer_ids.append(layer.layer_id)
+            self.model_config.swa_attention_layer_ids = swa_attention_layer_ids
+            self.model_config.full_attention_layer_ids = full_attention_layer_ids
+
+            # Algorithm:
+            # Existing max_total_num_tokens is per layer and assume all layers have the same number of tokens.
+            # - Find total # of tokens available across layers.
+            # - Calculate full_max_total_num_tokens and swa_max_total_num_tokens based on the given swa_full_tokens_ratio.
+            total_tokens = (
+                self.max_total_num_tokens * self.model_config.num_hidden_layers
+            )
+            full_layers_num = len(full_attention_layer_ids)
+            swa_layers_num = len(swa_attention_layer_ids)
+            swa_full_tokens_ratio = self.server_args.swa_full_tokens_ratio
+
+            # Solve the equations:
+            # 1. swa_max_total_num_tokens * swa_layers_num + full_max_total_num_tokens * full_layers_num == total_tokens
+            # 2. full_max_total_num_tokens * swa_full_tokens_ratio == swa_max_total_num_tokens
+            denominator = swa_full_tokens_ratio * swa_layers_num + full_layers_num
+            self.full_max_total_num_tokens = int(total_tokens / denominator)
+            self.swa_max_total_num_tokens = int(
+                self.full_max_total_num_tokens * swa_full_tokens_ratio
+            )
+            self.max_total_num_tokens = self.full_max_total_num_tokens
+
+            logger.info(
+                f"Use Sliding window memory pool. full_layer_tokens={self.full_max_total_num_tokens}, swa_layer_tokens={self.swa_max_total_num_tokens}"
+            )
+
+    def init_memory_pool(
+        self,
+        total_gpu_memory: int,
+        max_num_reqs: Optional[int] = None,
+        max_total_tokens: Optional[int] = None,
+    ):
+        # Determine the kv cache dtype
+        if self.server_args.kv_cache_dtype == "auto":
+            self.kv_cache_dtype = self.dtype
+        elif self.server_args.kv_cache_dtype == "fp8_e5m2":
+            if _is_hip:  # Using natively supported format
+                self.kv_cache_dtype = torch.float8_e5m2fnuz
+            else:
+                self.kv_cache_dtype = torch.float8_e5m2
+        elif self.server_args.kv_cache_dtype == "fp8_e4m3":
+            if _is_hip:  # Using natively supported format
+                self.kv_cache_dtype = torch.float8_e4m3fnuz
+            else:
+                self.kv_cache_dtype = torch.float8_e4m3fn
+        else:
+            raise ValueError(
+                f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."
+            )
+
+        self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
+        if SGLANG_CI_SMALL_KV_SIZE:
+            self.max_total_num_tokens = int(SGLANG_CI_SMALL_KV_SIZE)
+
+        if max_num_reqs is None:
+            max_num_reqs = min(
+                max(
+                    int(
+                        self.max_total_num_tokens / self.model_config.context_len * 512
+                    ),
+                    2048,
+                ),
+                4096,
+            )
+        if self.is_hybrid_gdn:
+            max_num_reqs = min(max_num_reqs, self.server_args.max_mamba_cache_size)
+
+        if not self.spec_algorithm.is_none():
+            if self.is_draft_worker:
+                self.max_total_num_tokens = self.server_args.draft_runner_cache_size
+                max_num_reqs = self.server_args.max_num_reqs
+            else:
+                # We are sharing the `token_to_kv_pool`, and both verify and draft tokens
+                # can be concurrently allocated, so we should give a headroom for it.
+                self.server_args.draft_runner_cache_size = (
+                    self.max_total_num_tokens
+                    # draft
+                    + max_num_reqs
+                    * self.server_args.speculative_num_steps
+                    * self.server_args.speculative_eagle_topk
+                    # verify
+                    + max_num_reqs * self.server_args.speculative_num_draft_tokens
+                    # buffer
+                    + 100
+                )
+                # Target worker and draft worker shares the same indices for the
+                # token_to_kv_pool, so we should make sure to match max_total_num_tokens.
+                self.max_total_num_tokens = self.server_args.draft_runner_cache_size
+                self.server_args.max_num_reqs = max_num_reqs
+
+        if max_total_tokens is not None:
+            if max_total_tokens > self.max_total_num_tokens:
+                logging.warning(
+                    f"max_total_tokens={max_total_tokens} is larger than the profiled value "
+                    f"{self.max_total_num_tokens}. "
+                    f"Use the profiled value instead."
+                )
+            self.max_total_num_tokens = min(self.max_total_num_tokens, max_total_tokens)
+
+        self.max_total_num_tokens = (
+            self.max_total_num_tokens
+            // self.server_args.page_size
+            * self.server_args.page_size
+        )
+        # different pp rank may have different num of layers, so we need to reduce the max_total_num_tokens
+        if self.pp_size > 1:
+            tensor = torch.tensor(self.max_total_num_tokens, dtype=torch.int64)
+            torch.distributed.all_reduce(
+                tensor,
+                op=torch.distributed.ReduceOp.MIN,
+                group=get_world_group().cpu_group,
+            )
+            self.max_total_num_tokens = tensor.item()
+
+        # create token size for hybrid cache
+        if self.is_hybrid:
+            self.set_num_token_hybrid()
+
+        if self.max_total_num_tokens <= 0:
+            raise RuntimeError(
+                "Not enough memory. Please try to increase --mem-fraction-static."
+            )
+
+        # Initialize req_to_token_pool
+        if self.req_to_token_pool is None:
+            # FIXME(lsyin): this is the temporary fix for the context length issue when using speculative decoding
+            extra_max_context_len = 4
+            if self.server_args.speculative_num_draft_tokens is not None:
+                extra_max_context_len += self.server_args.speculative_num_draft_tokens
+
+            if self.server_args.disaggregation_mode == "decode":
+                from sglang.srt.disaggregation.decode import DecodeReqToTokenPool
+
+                # subscribe memory for pre-allocated requests
+                # if max_num_reqs <= 32, we pre-allocate 2x requests
+                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
+                self.req_to_token_pool = DecodeReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len
+                    + extra_max_context_len,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    pre_alloc_size=pre_alloc_size,
+                )
+            elif self.is_hybrid_gdn:
+                config = self.model_config.hf_config
+                (
+                    conv_state_shape,
+                    temporal_state_shape,
+                    conv_dtype,
+                    ssm_dtype,
+                    mamba_layers,
+                ) = config.hybrid_gdn_params
+                self.req_to_token_pool = HybridReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len
+                    + extra_max_context_len,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    conv_state_shape=conv_state_shape,
+                    temporal_state_shape=temporal_state_shape,
+                    conv_dtype=conv_dtype,
+                    ssm_dtype=ssm_dtype,
+                    mamba_layers=mamba_layers,
+                    speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+                )
+            else:
+                self.req_to_token_pool = ReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len
+                    + extra_max_context_len,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                )
+        else:
+            # Draft worker shares req_to_token_pool with the target worker.
+            assert self.is_draft_worker
+
+        # Initialize token_to_kv_pool
+        if self.server_args.attention_backend == "ascend":
+            if self.use_mla_backend:
+                self.token_to_kv_pool = AscendMLAPagedTokenToKVPool(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    kv_lora_rank=self.model_config.kv_lora_rank,
+                    qk_rope_head_dim=self.model_config.qk_rope_head_dim,
+                    layer_num=self.num_effective_layers,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    start_layer=self.start_layer,
+                    end_layer=self.end_layer,
+                )
+            else:
+                self.token_to_kv_pool = AscendTokenToKVPool(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    head_num=self.model_config.get_num_kv_heads(
+                        get_attention_tp_size()
+                    ),
+                    head_dim=self.model_config.head_dim,
+                    layer_num=self.model_config.num_hidden_layers,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                )
+        elif self.use_mla_backend:
+            self.token_to_kv_pool = MLATokenToKVPool(
+                self.max_total_num_tokens,
+                page_size=self.page_size,
+                dtype=self.kv_cache_dtype,
+                kv_lora_rank=self.model_config.kv_lora_rank,
+                qk_rope_head_dim=self.model_config.qk_rope_head_dim,
+                layer_num=self.num_effective_layers,
+                device=self.device,
+                enable_memory_saver=self.server_args.enable_memory_saver,
+                start_layer=self.start_layer,
+                end_layer=self.end_layer,
+            )
+        elif self.server_args.enable_double_sparsity:
+            self.token_to_kv_pool = DoubleSparseTokenToKVPool(
+                self.max_total_num_tokens,
+                page_size=self.page_size,
+                dtype=self.kv_cache_dtype,
+                head_num=self.model_config.get_num_kv_heads(get_attention_tp_size()),
+                head_dim=self.model_config.head_dim,
+                layer_num=self.num_effective_layers,
+                device=self.device,
+                heavy_channel_num=self.server_args.ds_heavy_channel_num,
+                enable_memory_saver=self.server_args.enable_memory_saver,
+                start_layer=self.start_layer,
+                end_layer=self.end_layer,
+            )
+        else:
+            if self.is_hybrid:
+                self.token_to_kv_pool = SWAKVPool(
+                    size=self.full_max_total_num_tokens,
+                    size_swa=self.swa_max_total_num_tokens,
+                    dtype=self.kv_cache_dtype,
+                    head_num=self.model_config.get_num_kv_heads(
+                        get_attention_tp_size()
+                    ),
+                    head_dim=self.model_config.head_dim,
+                    swa_attention_layer_ids=self.model_config.swa_attention_layer_ids,
+                    full_attention_layer_ids=self.model_config.full_attention_layer_ids,
+                    enable_kvcache_transpose=False,
+                    device=self.device,
+                )
+            elif self.is_hybrid_gdn:
+                self.token_to_kv_pool = HybridLinearKVPool(
+                    size=self.max_total_num_tokens,
+                    dtype=self.kv_cache_dtype,
+                    head_num=self.model_config.get_num_kv_heads(
+                        get_attention_tp_size()
+                    ),
+                    head_dim=self.model_config.head_dim,
+                    # if draft worker, we only need 1 attention layer's kv pool
+                    full_attention_layer_ids=(
+                        [0]
+                        if self.is_draft_worker
+                        else self.model_config.hf_config.full_attention_layer_ids
+                    ),
+                    enable_kvcache_transpose=False,
+                    device=self.device,
+                )
+            else:
+                self.token_to_kv_pool = MHATokenToKVPool(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    head_num=self.model_config.get_num_kv_heads(
+                        get_attention_tp_size()
+                    ),
+                    head_dim=self.model_config.head_dim,
+                    layer_num=self.num_effective_layers,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    start_layer=self.start_layer,
+                    end_layer=self.end_layer,
+                )
+
+        # Initialize token_to_kv_pool_allocator
+        need_sort = self.server_args.disaggregation_mode in ("decode", "prefill")
+        if self.token_to_kv_pool_allocator is None:
+            if self.server_args.attention_backend == "ascend":
+                self.token_to_kv_pool_allocator = AscendPagedTokenToKVPoolAllocator(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    device=self.device,
+                    kvcache=self.token_to_kv_pool,
+                    need_sort=need_sort,
+                )
+            else:
+                if self.page_size == 1:
+                    if self.is_hybrid:
+                        self.token_to_kv_pool_allocator = SWATokenToKVPoolAllocator(
+                            self.full_max_total_num_tokens,
+                            self.swa_max_total_num_tokens,
+                            dtype=self.kv_cache_dtype,
+                            device=self.device,
+                            kvcache=self.token_to_kv_pool,
+                            need_sort=need_sort,
+                        )
+                    else:
+                        self.token_to_kv_pool_allocator = TokenToKVPoolAllocator(
+                            self.max_total_num_tokens,
+                            dtype=self.kv_cache_dtype,
+                            device=self.device,
+                            kvcache=self.token_to_kv_pool,
+                            need_sort=need_sort,
+                        )
+                else:
+                    assert not self.is_hybrid
+                    self.token_to_kv_pool_allocator = PagedTokenToKVPoolAllocator(
+                        self.max_total_num_tokens,
+                        page_size=self.page_size,
+                        dtype=self.kv_cache_dtype,
+                        device=self.device,
+                        kvcache=self.token_to_kv_pool,
+                        need_sort=need_sort,
+                    )
+        else:
+            assert self.is_draft_worker
+
+        logger.info(
+            f"Memory pool end. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+    def init_cublas(self):
+        """We need to run a small matmul to init cublas. Otherwise, it will raise some errors later."""
+        dtype = torch.float16
+        device = "cuda"
+        a = torch.ones((16, 16), dtype=dtype, device=device)
+        b = torch.ones((16, 16), dtype=dtype, device=device)
+        c = a @ b
+        return c
+
+    def init_attention_backend(self):
+        """Init attention kernel backend."""
+        if self.server_args.enable_two_batch_overlap and not self.is_draft_worker:
+            self.attn_backend = TboAttnBackend.init_new(self._get_attention_backend)
+        else:
+            self.attn_backend = self._get_attention_backend()
+
+    def _get_attention_backend(self):
+        """Init attention kernel backend."""
+        self.decode_attention_backend_str = (
+            self.server_args.decode_attention_backend
+            if self.server_args.decode_attention_backend
+            else self.server_args.attention_backend
+        )
+        self.prefill_attention_backend_str = (
+            self.server_args.prefill_attention_backend
+            if self.server_args.prefill_attention_backend
+            else self.server_args.attention_backend
+        )
+        if self.decode_attention_backend_str != self.prefill_attention_backend_str:
+            from sglang.srt.layers.attention.hybrid_attn_backend import (
+                HybridAttnBackend,
+            )
+
+            attn_backend = HybridAttnBackend(
+                self,
+                decode_backend=self._get_attention_backend_from_str(
+                    self.decode_attention_backend_str
+                ),
+                prefill_backend=self._get_attention_backend_from_str(
+                    self.prefill_attention_backend_str
+                ),
+            )
+            logger.info(
+                f"Using hybrid attention backend for decode and prefill: "
+                f"decode_backend={self.decode_attention_backend_str}, "
+                f"prefill_backend={self.prefill_attention_backend_str}."
+            )
+            logger.warning(
+                f"Warning: Attention backend specified by --attention-backend or default backend might be overridden."
+                f"The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
+            )
+        else:
+            attn_backend = self._get_attention_backend_from_str(
+                self.server_args.attention_backend
+            )
+
+        global_server_args_dict.update(
+            {
+                "decode_attention_backend": self.decode_attention_backend_str,
+                "prefill_attention_backend": self.prefill_attention_backend_str,
+            }
+        )
+        return attn_backend
+
+    def _get_attention_backend_from_str(self, backend_str: str):
+        if backend_str == "flashinfer":
+            if not self.use_mla_backend:
+                from sglang.srt.layers.attention.flashinfer_backend import (
+                    FlashInferAttnBackend,
+                )
+
+                # Init streams
+                if self.server_args.speculative_algorithm == "EAGLE":
+                    if (
+                        not hasattr(self, "plan_stream_for_flashinfer")
+                        or not self.plan_stream_for_flashinfer
+                    ):
+                        self.plan_stream_for_flashinfer = torch.cuda.Stream()
+                return FlashInferAttnBackend(self)
+            else:
+                from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                    FlashInferMLAAttnBackend,
+                )
+
+                return FlashInferMLAAttnBackend(self)
+        elif backend_str == "aiter":
+            from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
+
+            return AiterAttnBackend(self)
+        elif self.server_args.attention_backend == "wave":
+            from sglang.srt.layers.attention.wave_backend import WaveAttnBackend
+
+            return WaveAttnBackend(self)
+        elif backend_str == "ascend":
+            from sglang.srt.layers.attention.ascend_backend import AscendAttnBackend
+
+            return AscendAttnBackend(self)
+        elif backend_str == "triton":
+            assert not self.model_config.is_encoder_decoder, (
+                "Cross attention is not supported in the triton attention backend. "
+                "Please use `--attention-backend flashinfer`."
+            )
+            if self.server_args.enable_double_sparsity:
+                from sglang.srt.layers.attention.double_sparsity_backend import (
+                    DoubleSparseAttnBackend,
+                )
+
+                return DoubleSparseAttnBackend(self)
+            else:
+                from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
+
+                return TritonAttnBackend(self)
+        elif backend_str == "torch_native":
+            from sglang.srt.layers.attention.torch_native_backend import (
+                TorchNativeAttnBackend,
+            )
+
+            return TorchNativeAttnBackend(self)
+        elif backend_str == "flashmla":
+            from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
+
+            return FlashMLABackend(self)
+        elif backend_str == "fa3":
+            assert (
+                torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
+            ) or torch.cuda.get_device_capability()[0] == 9, (
+                "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
+                "Please use `--attention-backend flashinfer`."
+            )
+            from sglang.srt.layers.attention.flashattention_backend import (
+                FlashAttentionBackend,
+            )
+
+            return FlashAttentionBackend(self)
+        elif backend_str == "cutlass_mla":
+            from sglang.srt.layers.attention.cutlass_mla_backend import (
+                CutlassMLABackend,
+            )
+
+            return CutlassMLABackend(self)
+        elif backend_str == "trtllm_mla":
+            if not self.use_mla_backend:
+                raise ValueError("trtllm_mla backend can only be used with MLA models.")
+            from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend
+
+            return TRTLLMMLABackend(self)
+        elif backend_str == "trtllm_mha":
+            if self.use_mla_backend:
+                raise ValueError(
+                    "trtllm_mha backend can only be used with non-MLA models."
+                )
+            from sglang.srt.layers.attention.trtllm_mha_backend import (
+                TRTLLMHAAttnBackend,
+            )
+
+            return TRTLLMHAAttnBackend(self)
+        elif backend_str == "intel_amx":
+            from sglang.srt.layers.attention.intel_amx_backend import (
+                IntelAMXAttnBackend,
+            )
+
+            return IntelAMXAttnBackend(self)
+        elif backend_str == "dual_chunk_flash_attn":
+            from sglang.srt.layers.attention.dual_chunk_flashattention_backend import (
+                DualChunkFlashAttentionBackend,
+            )
+
+            return DualChunkFlashAttentionBackend(self)
+        elif backend_str == "hybrid_linear_attn":
+            assert (
+                self.is_hybrid_gdn
+            ), "hybrid_linear_attn backend can only be used with hybrid GDN models."
+            from sglang.srt.layers.attention.flashattention_backend import (
+                FlashAttentionBackend,
+            )
+            from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+                HybridLinearAttnBackend,
+                MambaAttnBackend,
+            )
+
+            full_attn_backend = FlashAttentionBackend(self)
+            linear_attn_backend = MambaAttnBackend(self)
+            full_attn_layers = self.model_config.hf_config.full_attention_layer_ids
+            return HybridLinearAttnBackend(
+                full_attn_backend, linear_attn_backend, full_attn_layers
+            )
+        else:
+            raise ValueError(f"Invalid attention backend: {backend_str}")
+
+    def init_double_sparsity_channel_config(self, selected_channel):
+        selected_channel = "." + selected_channel + "_proj"
+        self.sorted_channels = []
+        # load channel config
+        with open(self.server_args.ds_channel_config_path, "r") as f:
+            channel_config = json.load(f)
+
+        for i in range(self.start_layer, self.end_layer):
+            key = "model.layers." + str(i) + ".self_attn" + selected_channel
+            self.sorted_channels.append(
+                torch.tensor(channel_config[key])[
+                    :, : self.server_args.ds_heavy_channel_num
+                ]
+                .contiguous()
+                .cuda()
+            )
+
+    def init_device_graphs(self):
+        """Capture device graphs."""
+        self.graph_runner = None
+        self.graph_mem_usage = 0
+
+        if not self.is_generation:
+            # TODO: Currently, cuda graph only captures decode steps, which only exists for generation models
+            return
+
+        if self.device != "cpu" and self.server_args.disable_cuda_graph:
+            return
+
+        if self.device == "cpu" and not self.server_args.enable_torch_compile:
+            return
+
+        tic = time.perf_counter()
+        before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        logger.info(
+            f"Capture {'cpu graph' if self.device == 'cpu' else 'cuda graph'} begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+        )
+        graph_runners = defaultdict(
+            lambda: CudaGraphRunner,
+            {
+                "cpu": CPUGraphRunner,
+                "npu": NPUGraphRunner,
+            },
+        )
+        self.graph_runner = graph_runners[self.device](self)
+
+        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        self.graph_mem_usage = before_mem - after_mem
+        logger.info(
+            f"Capture {'cpu graph' if self.device == 'cpu' else 'cuda graph'} end. Time elapsed: {time.perf_counter() - tic:.2f} s. "
+            f"mem usage={self.graph_mem_usage:.2f} GB. avail mem={after_mem:.2f} GB."
+        )
+
+    def init_threads_binding(self):
+        omp_cpuids = os.environ.get("SGLANG_CPU_OMP_THREADS_BIND", "all")
+        cpu_ids_by_node = get_cpu_ids_by_node()
+        n_numa_node = len(cpu_ids_by_node)
+        if omp_cpuids == "all":
+            assert self.tp_size <= n_numa_node, (
+                f"SGLANG_CPU_OMP_THREADS_BIND is not set, in this case, "
+                f"tp_size {self.tp_size} should be smaller than or equal to number of numa node on the machine {n_numa_node}. "
+                f"If you need tp_size to be larger than number of numa node, please set the CPU cores for each tp rank via SGLANG_CPU_OMP_THREADS_BIND explicitly. "
+                f"For example, on a machine with 2 numa nodes, where core 0-31 are on numa node 0 and core 32-63 are on numa node 1, "
+                f"it is suggested to use -tp 2 and bind tp rank 0 to core 0-31 and tp rank 1 to core 32-63. "
+                f"This is the default behavior if SGLANG_CPU_OMP_THREADS_BIND is not set and it is the same as setting SGLANG_CPU_OMP_THREADS_BIND=0-31|32-63. "
+                f"If you do need tp_size to be larger than the number of numa nodes, you could set SGLANG_CPU_OMP_THREADS_BIND explicitly for example SGLANG_CPU_OMP_THREADS_BIND=0-15|16-31|32-47|48-63 and run with -tp 4. "
+                f"If you don't want each tp rank to use all the cores on one numa node, you could set for example SGLANG_CPU_OMP_THREADS_BIND=0-15|32-47 and run with -tp 2."
+            )
+            if self.tp_size < n_numa_node:
+                logger.warning(
+                    f"Detected the current machine has {n_numa_node} numa nodes available, but tp_size is set to {self.tp_size}, so only {self.tp_size} numa nodes are used."
+                )
+            self.local_omp_cpuid = cpu_ids_by_node[self.tp_rank]
+        else:
+            threads_bind_list = omp_cpuids.split("|")
+            assert self.tp_size == len(threads_bind_list), (
+                f"SGLANG_CPU_OMP_THREADS_BIND setting must be aligned with TP size parameter ({self.tp_size}). "
+                f"Please double check your settings."
+            )
+            self.local_omp_cpuid = threads_bind_list[self.tp_rank]
+            if self.tp_size > n_numa_node:
+                logger.warning(
+                    f"TP size ({self.tp_size})is larger than numa node number ({n_numa_node}), "
+                    f"in this case the available memory amount of each rank cannot be determined in prior. "
+                    f"Please set proper `--max-total-tokens` to avoid the out-of-memory error."
+                )
+
+    def apply_torch_tp(self):
+        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        from sglang.srt.layers.model_parallel import tensor_parallel
+
+        device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
+        tensor_parallel(self.model, device_mesh)
+
+    def forward_decode(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors=None,
+    ) -> LogitsProcessorOutput:
+        if not skip_attn_backend_init:
+            self.attn_backend.init_forward_metadata(forward_batch)
+        # FIXME: add pp_proxy_tensors arg to all models
+        kwargs = {}
+        if self.support_pp:
+            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+        return self.model.forward(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
+        )
+
+    def forward_extend(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors=None,
+    ) -> LogitsProcessorOutput:
+        if not skip_attn_backend_init:
+            self.attn_backend.init_forward_metadata(forward_batch)
+
+        kwargs = {}
+        if self.support_pp:
+            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+        if forward_batch.input_embeds is not None:
+            kwargs["input_embeds"] = forward_batch.input_embeds.bfloat16()
+        if not self.is_generation:
+            kwargs["get_embedding"] = True
+        return self.model.forward(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
+        )
+
+    def forward_idle(
+        self, forward_batch: ForwardBatch, pp_proxy_tensors=None
+    ) -> LogitsProcessorOutput:
+        kwargs = {}
+        if self.support_pp:
+            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+        return self.model.forward(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
+        )
+
+    def forward_split_prefill(
+        self,
+        forward_batch: ForwardBatch,
+        reinit_attn_backend: bool = False,
+        forward_count: int = 1,
+    ) -> LogitsProcessorOutput:
+        if forward_batch.split_index == 0 or reinit_attn_backend:
+            self.attn_backend.init_forward_metadata(forward_batch)
+        next_split_index = min(
+            forward_batch.split_index + forward_count,
+            self.model_config.num_hidden_layers,
+        )
+        ret = self.model.forward_split_prefill(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            (forward_batch.split_index, next_split_index),
+        )
+        forward_batch.split_index = next_split_index
+        return ret
+
+    def forward(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
+    ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
+        self.forward_pass_id += 1
+
+        with get_global_expert_distribution_recorder().with_forward_pass(
+            self.forward_pass_id,
+            forward_batch,
+        ):
+            output = self._forward_raw(
+                forward_batch,
+                skip_attn_backend_init,
+                pp_proxy_tensors,
+                reinit_attn_backend,
+                split_forward_count,
+            )
+
+        if self.eplb_manager is not None:
+            self.eplb_manager.on_forward_pass_end()
+
+        return output
+
+    def _forward_raw(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool,
+        pp_proxy_tensors: Optional[PPProxyTensors],
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
+    ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
+        mode_check = (
+            forward_batch.forward_mode.is_cpu_graph
+            if self.device == "cpu"
+            else forward_batch.forward_mode.is_cuda_graph
+        )
+        can_run_graph = bool(
+            mode_check()
+            and self.graph_runner
+            and self.graph_runner.can_run(forward_batch)
+        )
+
+        if can_run_graph:
+            ret = self.graph_runner.replay(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+            return ret, can_run_graph
+
+        # For MLP sync
+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.prepare_mlp_sync_batch(self)
+
+        if forward_batch.forward_mode.is_decode():
+            ret = self.forward_decode(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+        elif forward_batch.forward_mode.is_extend():
+            ret = self.forward_extend(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+        elif forward_batch.forward_mode.is_split_prefill():
+            ret = self.forward_split_prefill(
+                forward_batch,
+                reinit_attn_backend=reinit_attn_backend,
+                forward_count=split_forward_count,
+            )
+        elif forward_batch.forward_mode.is_idle():
+            ret = self.forward_idle(forward_batch, pp_proxy_tensors=pp_proxy_tensors)
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")
+
+        if (
+            forward_batch.global_num_tokens_cpu is not None
+            and self.pp_group.is_last_rank
+        ):
+            forward_batch.post_forward_mlp_sync_batch(ret)
+
+        return ret, can_run_graph
+
+    def _preprocess_logits(
+        self, logits_output: LogitsProcessorOutput, sampling_info: SamplingBatchInfo
+    ):
+        # Apply logit bias
+        if sampling_info.sampling_info_done:
+            # Overlap mode: the function update_regex_vocab_mask was executed
+            # in process_batch_result of the last batch.
+            if sampling_info.grammars:
+                sampling_info.sampling_info_done.wait()
+        else:
+            # Normal mode: Put CPU-heavy tasks here. They will be overlapped with the forward pass.
+            sampling_info.update_regex_vocab_mask()
+        sampling_info.apply_logits_bias(logits_output.next_token_logits)
+
+    def sample(
+        self,
+        logits_output: LogitsProcessorOutput,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        """Sample and compute logprobs and update logits_output.
+
+        Args:
+            logits_output: The logits output from the model forward
+            forward_batch: The forward batch that generates logits_output
+
+        Returns:
+            A list of next_token_ids
+        """
+        # For duplex models with multiple output streams.
+        if isinstance(logits_output, tuple):
+            return torch.stack(
+                [self.sample(values, forward_batch) for values in logits_output],
+                axis=-1,
+            )
+
+        self._preprocess_logits(logits_output, forward_batch.sampling_info)
+
+        # Sample the next tokens
+        next_token_ids = self.sampler(
+            logits_output,
+            forward_batch.sampling_info,
+            forward_batch.return_logprob,
+            forward_batch.top_logprobs_nums,
+            forward_batch.token_ids_logprobs,
+        )
+        return next_token_ids
+
+    @property
+    def model_is_mrope(self) -> bool:
+        """Detect if the model has "mrope" rope_scaling type.
+        mrope requires keep "rope_deltas" between prompt and decoding phases."""
+        rope_scaling = getattr(self.model_config.hf_text_config, "rope_scaling", {})
+        if rope_scaling is None:
+            return False
+        is_mrope_enabled = "mrope_section" in rope_scaling
+        return is_mrope_enabled
+
+    def save_remote_model(self, url: str):
+        from sglang.srt.model_loader.loader import RemoteModelLoader
+
+        logger.info(f"Saving model to {url}")
+        RemoteModelLoader.save_model(self.model, self.model_config.model_path, url)
+
+    def save_sharded_model(
+        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
+    ):
+        from sglang.srt.model_loader.loader import ShardedStateLoader
+
+        logger.info(
+            f"Save sharded model to {path} with pattern {pattern} and max_size {max_size}"
+        )
+        ShardedStateLoader.save_model(self.model, path, pattern, max_size)
+
+
+def _model_load_weights_direct(model, named_tensors: List[Tuple[str, torch.Tensor]]):
+    params_dict = dict(model.named_parameters())
+    for name, tensor in named_tensors:
+        default_weight_loader(params_dict[name], tensor)
+
+
+def _unwrap_tensor(tensor, tp_rank, device):
+    if isinstance(tensor, LocalSerializedTensor):
+        tensor = tensor.get(tp_rank)
+    return tensor.to(device)
+
+
+@dataclass
+class LocalSerializedTensor:
+    """torch.Tensor that gets serialized by MultiprocessingSerializer (which only serializes a pointer and not the data).
+    The i-th element in the list corresponds to i-th rank's GPU."""
+
+    values: List[bytes]
+
+    def get(self, rank: int):
+        return MultiprocessingSerializer.deserialize(self.values[rank])
diff --git a/python/sglang/srt/model_executor/npu_graph_runner.py b/python/sglang/srt/model_executor/npu_graph_runner.py
new file mode 100644
index 000000000..0ff19d582
--- /dev/null
+++ b/python/sglang/srt/model_executor/npu_graph_runner.py
@@ -0,0 +1,94 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with npu graph and torch.compile."""
+
+from __future__ import annotations
+
+import logging
+import threading
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+
+from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+
+
+class NPUGraphRunner(CudaGraphRunner):
+    """A NPUGraphRunner runs the forward pass of a model with npu graph and torch.compile."""
+
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__(model_runner)
+
+    def _create_device_graph(self):
+        return torch.npu.NPUGraph()
+
+    def _capture_graph(self, graph, pool, stream, run_once_fn):
+        with torch.npu.graph(
+            graph,
+            pool=pool,
+            stream=stream,
+            auto_dispatch_capture=True,
+        ):
+            out = run_once_fn()
+        return out
+
+    def _update_inputs(self, seq_lens):
+        self.graphs[self.bs].update(
+            cpu_update_input=[{"actual_seq_lengths_kv": seq_lens}]
+        )
+
+    def _cache_loc_dtype(self):
+        return torch.int32
+
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        if not skip_attn_backend_init:
+            self.replay_prepare(forward_batch, pp_proxy_tensors)
+        else:
+            # In speculative decoding, these two fields are still needed.
+            self.input_ids[: self.raw_num_token].copy_(forward_batch.input_ids)
+            self.positions[: self.raw_num_token].copy_(forward_batch.positions)
+
+        # Replay
+        seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (self.bs - self.raw_bs)
+        thread = threading.Thread(target=self._update_inputs, args=(seq_lens,))
+        thread.start()
+        self.graphs[self.bs].replay()
+        thread.join()
+
+        output = self.output_buffers[self.bs]
+        if isinstance(output, LogitsProcessorOutput):
+            return LogitsProcessorOutput(
+                next_token_logits=output.next_token_logits[: self.raw_num_token],
+                hidden_states=(
+                    output.hidden_states[: self.raw_num_token]
+                    if output.hidden_states is not None
+                    else None
+                ),
+            )
+        else:
+            assert isinstance(output, PPProxyTensors)
+            return PPProxyTensors({k: v[: self.bs] for k, v in output.tensors.items()})
diff --git a/python/sglang/srt/model_loader/__init__.py b/python/sglang/srt/model_loader/__init__.py
new file mode 100644
index 000000000..63f110204
--- /dev/null
+++ b/python/sglang/srt/model_loader/__init__.py
@@ -0,0 +1,40 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/__init__.py
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from torch import nn
+
+from sglang.srt.model_loader.loader import BaseModelLoader, get_model_loader
+from sglang.srt.model_loader.utils import (
+    get_architecture_class_name,
+    get_model_architecture,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.device_config import DeviceConfig
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import ModelConfig
+
+
+def get_model(
+    *,
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+    device_config: DeviceConfig,
+) -> nn.Module:
+    loader = get_model_loader(load_config)
+    return loader.load_model(
+        model_config=model_config,
+        device_config=device_config,
+    )
+
+
+__all__ = [
+    "get_model",
+    "get_model_loader",
+    "BaseModelLoader",
+    "get_architecture_class_name",
+    "get_model_architecture",
+]
diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py
new file mode 100644
index 000000000..d2b4c6bfc
--- /dev/null
+++ b/python/sglang/srt/model_loader/loader.py
@@ -0,0 +1,1584 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/model_loader/loader.py
+
+from __future__ import annotations
+
+# ruff: noqa: SIM117
+import collections
+import concurrent
+import dataclasses
+import fnmatch
+import glob
+import json
+import logging
+import math
+import os
+import time
+from abc import ABC, abstractmethod
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    cast,
+)
+
+import huggingface_hub
+import numpy as np
+import safetensors.torch
+import torch
+from huggingface_hub import HfApi, hf_hub_download
+from torch import nn
+from tqdm.auto import tqdm
+from transformers import AutoModelForCausalLM
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
+
+from sglang.srt.configs.load_config import LoadConfig, LoadFormat
+from sglang.srt.connector import (
+    ConnectorType,
+    create_remote_connector,
+    get_connector_type,
+)
+from sglang.srt.connector.utils import parse_model_name
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.model_loader.utils import (
+    get_model_architecture,
+    post_load_weights,
+    set_default_torch_dtype,
+)
+from sglang.srt.model_loader.weight_utils import (
+    _BAR_FORMAT,
+    download_safetensors_index_file_from_hf,
+    download_weights_from_hf,
+    filter_duplicate_safetensors_files,
+    filter_files_not_needed_for_inference,
+    get_gguf_extra_tensor_names,
+    get_quant_config,
+    gguf_quant_weights_iterator,
+    initialize_dummy_weights,
+    multi_thread_pt_weights_iterator,
+    multi_thread_safetensors_weights_iterator,
+    np_cache_weights_iterator,
+    pt_weights_iterator,
+    safetensors_weights_iterator,
+    set_runai_streamer_env,
+)
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_device_capability,
+    is_npu,
+    is_pin_memory_available,
+    set_weight_attrs,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.device_config import DeviceConfig
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+_is_npu = is_npu()
+
+
+@contextmanager
+def device_loading_context(module: torch.nn.Module, target_device: torch.device):
+    if target_device.type == "cpu":
+        # If target is CPU, no need to move anything
+        yield module
+        return
+
+    original_infos: Dict[str, Dict] = {}
+
+    # Store original device states and move parameters to GPU if they're on CPU
+    for name, p in module.named_parameters():
+        if p.device.type == "cpu":
+            original_data = p.data
+            device_data = p.data.to(target_device)
+            original_infos[name] = dict(
+                device=p.device,
+                original_data=original_data,
+                device_data=device_data,
+            )
+            p.data = device_data
+        # Parameters already on target device are not touched
+
+    try:
+        yield module
+
+    finally:
+        # Restore parameters to their original devices, ignoring new parameters
+        pin_memory = is_pin_memory_available()
+        for name, p in module.named_parameters():
+            if name in original_infos:
+                original_info = original_infos[name]
+                device_data = original_info["device_data"]
+                original_data = original_info["original_data"]
+                original_device: torch.device = original_info["device"]
+
+                if (
+                    (device_data.device == p.data.device)
+                    and (device_data.data_ptr() == p.data.data_ptr())
+                    and (device_data.shape == p.data.shape)
+                    and (device_data.dtype == p.data.dtype)
+                ):
+                    original_data.copy_(p.data.to(original_data.device))
+                    p.data = original_data
+                elif original_device.type == "cpu":
+                    # `torch.empty_like` does not support `pin_memory` argument
+                    cpu_data = torch.empty_strided(
+                        size=p.data.size(),
+                        stride=p.data.stride(),
+                        dtype=p.data.dtype,
+                        layout=p.data.layout,
+                        device="cpu",
+                        pin_memory=pin_memory,
+                    )
+                    cpu_data.copy_(p.data)
+                    p.data = cpu_data
+                else:
+                    p.data = p.data.to(original_device)
+        # New parameters or parameters already on target device are untouched
+
+
+logger = logging.getLogger(__name__)
+
+
+def _get_quantization_config(
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+    packed_modules_mapping: Dict[str, List[str]],
+) -> Optional[QuantizationConfig]:
+    """Get the quantization config."""
+    if model_config.quantization is not None:
+        quant_config = get_quant_config(
+            model_config, load_config, packed_modules_mapping
+        )
+        # (yizhang2077) workaround for nvidia/Llama-4-Maverick-17B-128E-Eagle3
+        if quant_config is None:
+            return None
+        if not _is_npu:
+            major, minor = get_device_capability()
+
+            if major is not None and minor is not None:
+                assert 0 <= minor < 10
+                capability = major * 10 + minor
+                if capability < quant_config.get_min_capability():
+                    raise ValueError(
+                        f"The quantization method {model_config.quantization} "
+                        "is not supported for the current GPU. "
+                        f"Minimum capability: {quant_config.get_min_capability()}. "
+                        f"Current capability: {capability}."
+                    )
+        supported_dtypes = quant_config.get_supported_act_dtypes()
+        if model_config.dtype not in supported_dtypes:
+            raise ValueError(
+                f"{model_config.dtype} is not supported for quantization "
+                f"method {model_config.quantization}. Supported dtypes: "
+                f"{supported_dtypes}"
+            )
+        return quant_config
+    return None
+
+
+def _initialize_model(
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+) -> nn.Module:
+    """Initialize a model with the given configurations."""
+    model_class, _ = get_model_architecture(model_config)
+    packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
+    if _is_npu:
+        packed_modules_mapping.update(
+            {
+                "visual": {"qkv_proj": ["qkv"]},
+                "vision_model": {
+                    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+                    "proj": ["out_proj"],
+                },
+                "model": {
+                    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+                    "gate_up_proj": ["gate_proj", "up_proj"],
+                    "fused_qkv_a_proj_with_mqa": [
+                        "q_a_proj",
+                        "kv_a_proj_with_mqa",
+                    ],
+                },
+            }
+        )
+
+    quant_config = _get_quantization_config(
+        model_config, load_config, packed_modules_mapping
+    )
+    return model_class(
+        config=model_config.hf_config,
+        quant_config=quant_config,
+    )
+
+
+class BaseModelLoader(ABC):
+    """Base class for model loaders."""
+
+    def __init__(self, load_config: LoadConfig):
+        self.load_config = load_config
+
+    @abstractmethod
+    def download_model(self, model_config: ModelConfig) -> None:
+        """Download a model so that it can be immediately loaded."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        """Load a model with the given configurations."""
+        raise NotImplementedError
+
+
+class DefaultModelLoader(BaseModelLoader):
+    """Model loader that can load different file types from disk."""
+
+    # default number of thread when enable multithread weight loading
+    DEFAULT_NUM_THREADS = 8
+
+    @dataclasses.dataclass
+    class Source:
+        """A source for weights."""
+
+        model_or_path: str
+        """The model ID or path."""
+
+        revision: Optional[str]
+        """The optional model revision."""
+
+        prefix: str = ""
+        """A prefix to prepend to all weights."""
+
+        fall_back_to_pt: bool = True
+        """Whether .pt weights can be used."""
+
+        @classmethod
+        def init_new(cls, model_config: ModelConfig, model):
+            return cls(
+                model_config.model_path,
+                model_config.revision,
+                prefix="",
+                fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
+            )
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        extra_config = load_config.model_loader_extra_config
+        allowed_keys = {"enable_multithread_load", "num_threads"}
+        unexpected_keys = set(extra_config.keys()) - allowed_keys
+
+        if unexpected_keys:
+            raise ValueError(
+                f"Unexpected extra config keys for load format "
+                f"{load_config.load_format}: "
+                f"{unexpected_keys}"
+            )
+
+    def _maybe_download_from_modelscope(
+        self, model: str, revision: Optional[str]
+    ) -> Optional[str]:
+        """Download model from ModelScope hub if SGLANG_USE_MODELSCOPE is True.
+
+        Returns the path to the downloaded model, or None if the model is not
+        downloaded from ModelScope."""
+        if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
+            # download model from ModelScope hub,
+            # lazy import so that modelscope is not required for normal use.
+            # pylint: disable=C.
+            from modelscope.hub.snapshot_download import snapshot_download
+
+            if not os.path.exists(model):
+                model_path = snapshot_download(
+                    model_id=model,
+                    cache_dir=self.load_config.download_dir,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    revision=revision,
+                    ignore_file_pattern=self.load_config.ignore_patterns,
+                )
+            else:
+                model_path = model
+            return model_path
+        return None
+
+    def _prepare_weights(
+        self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+    ) -> Tuple[str, List[str], bool]:
+        """Prepare weights for the model.
+
+        If the model is not local, it will be downloaded."""
+        model_name_or_path = (
+            self._maybe_download_from_modelscope(model_name_or_path, revision)
+            or model_name_or_path
+        )
+
+        is_local = os.path.isdir(model_name_or_path)
+        load_format = self.load_config.load_format
+        use_safetensors = False
+        index_file = SAFE_WEIGHTS_INDEX_NAME
+        # Some quantized models use .pt files for storing the weights.
+        if load_format == LoadFormat.AUTO:
+            allow_patterns = ["*.safetensors", "*.bin"]
+        elif load_format == LoadFormat.SAFETENSORS:
+            use_safetensors = True
+            allow_patterns = ["*.safetensors"]
+        elif load_format == LoadFormat.MISTRAL:
+            use_safetensors = True
+            allow_patterns = ["consolidated*.safetensors"]
+            index_file = "consolidated.safetensors.index.json"
+        elif load_format == LoadFormat.PT:
+            allow_patterns = ["*.pt"]
+        elif load_format == LoadFormat.NPCACHE:
+            allow_patterns = ["*.bin"]
+        else:
+            raise ValueError(f"Unknown load_format: {load_format}")
+
+        if fall_back_to_pt:
+            allow_patterns += ["*.pt"]
+
+        if not is_local:
+            hf_folder = download_weights_from_hf(
+                model_name_or_path,
+                self.load_config.download_dir,
+                allow_patterns,
+                revision,
+                ignore_patterns=self.load_config.ignore_patterns,
+            )
+        else:
+            hf_folder = model_name_or_path
+
+        hf_weights_files: List[str] = []
+        for pattern in allow_patterns:
+            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+            if len(hf_weights_files) > 0:
+                if pattern == "*.safetensors":
+                    use_safetensors = True
+                break
+
+        if use_safetensors:
+            # For models like Mistral-7B-Instruct-v0.3
+            # there are both sharded safetensors files and a consolidated
+            # safetensors file. Using both breaks.
+            # Here, we download the `model.safetensors.index.json` and filter
+            # any files not found in the index.
+            if not is_local:
+                download_safetensors_index_file_from_hf(
+                    model_name_or_path,
+                    index_file,
+                    self.load_config.download_dir,
+                    revision,
+                )
+            hf_weights_files = filter_duplicate_safetensors_files(
+                hf_weights_files, hf_folder, index_file
+            )
+        else:
+            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
+
+        if len(hf_weights_files) == 0:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
+
+        return hf_folder, hf_weights_files, use_safetensors
+
+    def _get_weights_iterator(
+        self, source: "Source"
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Get an iterator for the model weights based on the load format."""
+        extra_config = self.load_config.model_loader_extra_config
+        hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
+            source.model_or_path, source.revision, source.fall_back_to_pt
+        )
+        if self.load_config.load_format == LoadFormat.NPCACHE:
+            # Currently np_cache only support *.bin checkpoints
+            assert use_safetensors is False
+            weights_iterator = np_cache_weights_iterator(
+                source.model_or_path,
+                self.load_config.download_dir,
+                hf_folder,
+                hf_weights_files,
+            )
+        elif use_safetensors:
+            from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+            weight_loader_disable_mmap = global_server_args_dict.get(
+                "weight_loader_disable_mmap"
+            )
+
+            if extra_config.get("enable_multithread_load"):
+                weights_iterator = multi_thread_safetensors_weights_iterator(
+                    hf_weights_files,
+                    max_workers=extra_config.get(
+                        "num_threads", self.DEFAULT_NUM_THREADS
+                    ),
+                    disable_mmap=weight_loader_disable_mmap,
+                )
+            else:
+                weights_iterator = safetensors_weights_iterator(
+                    hf_weights_files, disable_mmap=weight_loader_disable_mmap
+                )
+
+        else:
+            if extra_config.get("enable_multithread_load"):
+                weights_iterator = multi_thread_pt_weights_iterator(
+                    hf_weights_files,
+                    max_workers=extra_config.get(
+                        "num_threads", self.DEFAULT_NUM_THREADS
+                    ),
+                )
+            else:
+                weights_iterator = pt_weights_iterator(hf_weights_files)
+
+        # Apply the prefix.
+        return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator)
+
+    def _get_all_weights(
+        self,
+        model_config: ModelConfig,
+        model: nn.Module,
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+
+        primary_weights = DefaultModelLoader.Source.init_new(model_config, model)
+        yield from self._get_weights_iterator(primary_weights)
+
+        secondary_weights = cast(
+            Iterable[DefaultModelLoader.Source], getattr(model, "secondary_weights", ())
+        )
+        for source in secondary_weights:
+            yield from self._get_weights_iterator(source)
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(
+            model_config.model_path, model_config.revision, fall_back_to_pt=True
+        )
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        target_device = torch.device(device_config.device)
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                )
+
+        self.load_weights_and_postprocess(
+            model, self._get_all_weights(model_config, model), target_device
+        )
+
+        return model.eval()
+
+    @staticmethod
+    def load_weights_and_postprocess(model, weights, target_device):
+        model.load_weights(weights)
+
+        for _, module in model.named_modules():
+            quant_method = getattr(module, "quant_method", None)
+            if quant_method is not None:
+                # When quant methods need to process weights after loading
+                # (for repacking, quantizing, etc), they expect parameters
+                # to be on the global target device. This scope is for the
+                # case where cpu offloading is used, where we will move the
+                # parameters onto device for processing and back off after.
+                with device_loading_context(module, target_device):
+                    quant_method.process_weights_after_loading(module)
+
+
+class LayeredModelLoader(DefaultModelLoader):
+    """Model loader that loads weights layer by layer so that one can quantize a
+    layer before loading another to make the peak memory envelope smaller."""
+
+    def __init__(self, load_config: LoadConfig):
+        # Back to the default load format
+        load_config.load_format = LoadFormat.AUTO
+        super().__init__(load_config)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+        from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+        torchao_config = global_server_args_dict.get("torchao_config")
+        target_device = torch.device(device_config.device)
+
+        with set_default_torch_dtype(model_config.dtype):
+            # Create model on meta device
+            with torch.device("meta"):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                )
+
+            # Check model's layered load support
+            if not hasattr(model, "load_weights_to_module"):
+                raise ValueError(
+                    "LayeredModelLoader requires the model to have a "
+                    "`load_weights_to_module` method. "
+                    f"{model_config.model_path} does not support it."
+                )
+
+            # Get all weights from disk
+            weights = self._get_all_weights(model_config, model)
+
+            # Helper function to recursively fill the weights of a module
+            def fill_module(module, fqn: List[str], weights):
+                """
+                fqn: list of strings representing the fully qualified name of `module`.
+                """
+                # Layer by layer
+                for name, submod in module.named_children():
+                    fill_module(submod, fqn + [name], weights)
+
+                # First materialize on target device
+                module.to_empty(device=target_device, recurse=False)
+                fqn_path = ".".join(fqn)
+                # Fill weights
+                model.load_weights_to_module(
+                    fqn_path,
+                    weights,
+                )
+                # Quantize weights if applicable
+                if torchao_config and "proj" in fqn_path:
+                    # Note: `None` here is needed to indicate no filter, see
+                    # `apply_torchao_config_to_model` for details.
+                    apply_torchao_config_to_model(module, torchao_config, None)
+
+            # Start calling on root module
+            fill_module(model, [], weights)
+
+        if torchao_config:
+            model.torchao_applied = True
+
+        return model.eval()
+
+
+class DummyModelLoader(BaseModelLoader):
+    """Model loader that will set model weights to random values."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        pass  # Nothing to download
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        if get_bool_env_var("SGL_CPU_QUANTIZATION"):
+            return load_model_with_cpu_quantization(
+                self, model_config=model_config, device_config=device_config
+            )
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                )
+
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    quant_method.process_weights_after_loading(module)
+
+            # NOTE(woosuk): For accurate performance evaluation, we assign
+            # random values to the weights.
+            initialize_dummy_weights(model)
+
+            post_load_weights(model, model_config)
+
+        return model.eval()
+
+
+class ShardedStateLoader(BaseModelLoader):
+    """
+    Model loader that directly loads each worker's model state dict, which
+    enables a fast load path for large tensor-parallel models where each worker
+    only needs to read its own shard rather than the entire checkpoint. See
+    `examples/runtime/engine/save_sharded_state.py` for creating a sharded checkpoint.
+    """
+
+    DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        extra_config = (
+            {}
+            if load_config.model_loader_extra_config is None
+            else load_config.model_loader_extra_config.copy()
+        )
+        self.pattern = extra_config.pop("pattern", self.DEFAULT_PATTERN)
+        if extra_config:
+            raise ValueError(
+                f"Unexpected extra config keys for load format "
+                f"{load_config.load_format}: "
+                f"{load_config.model_loader_extra_config.keys()}"
+            )
+
+    @staticmethod
+    def _filter_subtensors(tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """
+        Filter out all tensors that share the same memory or a subset of the
+        memory of another tensor.
+        """
+        same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = (
+            collections.defaultdict(list)
+        )
+        for key, tensor in tensors.items():
+            if tensor.numel():
+                ptr = tensor.untyped_storage().data_ptr()
+                same_storage_groups[tensor.device, ptr].append((key, tensor))
+
+        def get_end_ptr(tensor: torch.Tensor) -> int:
+            return tensor.view(-1)[-1].data_ptr() + tensor.element_size()
+
+        result: Dict[str, torch.Tensor] = {}
+        for group in same_storage_groups.values():
+            for k, t in group:
+                a, b = t.data_ptr(), get_end_ptr(t)
+                for k2, t2 in group:
+                    if not t2.is_contiguous():
+                        continue
+                    a2, b2 = t2.data_ptr(), get_end_ptr(t2)
+                    if a < a2 or b2 < b:
+                        continue
+                    if a2 < a or b < b2 or not t.is_contiguous():
+                        break  # t2 covers strictly more memory than t.
+                    if k2 < k:
+                        # Same tensors, keep the one with the smaller key.
+                        break
+                else:
+                    result[k] = t
+        return result
+
+    def _prepare_weights(self, model_name_or_path: str, revision: Optional[str]):
+        if os.path.isdir(model_name_or_path):
+            return model_name_or_path
+        else:
+            allow_patterns = ["*.safetensors"]
+            return download_weights_from_hf(
+                model_name_or_path,
+                self.load_config.download_dir,
+                allow_patterns,
+                revision,
+                ignore_patterns=self.load_config.ignore_patterns,
+            )
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model_path, model_config.revision)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        from safetensors.torch import safe_open
+
+        from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+        local_model_path = self._prepare_weights(
+            model_config.model_path, model_config.revision
+        )
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config)
+                for _, module in model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        quant_method.process_weights_after_loading(module)
+            rank = get_tensor_model_parallel_rank()
+            pattern = os.path.join(
+                local_model_path,
+                self.pattern.format(rank=rank, part="*"),
+            )
+            filepaths = glob.glob(pattern)
+            if not filepaths:
+                # TODO: support un-sharded checkpoints too
+                raise ValueError(
+                    f"Could not find checkpoint files '{pattern}', only "
+                    f"pre-sharded checkpoints are currently supported!"
+                )
+            state_dict = self._filter_subtensors(model.state_dict())
+            for path in filepaths:
+                with safe_open(path, framework="pt") as f:
+                    for key in f.keys():  # noqa: SIM118
+                        tensor = f.get_tensor(key)
+                        # If loading with LoRA enabled, additional padding may
+                        # be added to certain parameters. We only load into a
+                        # narrowed view of the parameter data.
+                        param_data = state_dict[key].data
+                        param_shape = state_dict[key].shape
+                        for dim, size in enumerate(tensor.shape):
+                            if size < param_shape[dim]:
+                                param_data = param_data.narrow(dim, 0, size)
+                        if tensor.shape != param_shape:
+                            logger.warning(
+                                "loading tensor of shape %s into "
+                                "parameter '%s' of shape %s",
+                                tensor.shape,
+                                key,
+                                param_shape,
+                            )
+                        param_data.copy_(tensor)
+                        state_dict.pop(key)
+            if state_dict:
+                raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+
+            post_load_weights(model, model_config)
+
+        return model.eval()
+
+    @staticmethod
+    def save_model(
+        model: torch.nn.Module,
+        path: str,
+        pattern: Optional[str] = None,
+        max_size: Optional[int] = None,
+    ) -> None:
+        from safetensors.torch import save_file
+
+        from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+        if pattern is None:
+            pattern = ShardedStateLoader.DEFAULT_PATTERN
+        rank = get_tensor_model_parallel_rank()
+        part_idx = 0
+        total_size = 0
+        state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
+        state_dict_part: Dict[str, torch.Tensor] = {}
+        for key, tensor in state_dict.items():
+            param_size = tensor.nelement() * tensor.element_size()
+            if max_size is not None and total_size + param_size > max_size:
+                filename = pattern.format(rank=rank, part=part_idx)
+                save_file(
+                    state_dict_part,
+                    os.path.join(path, filename),
+                )
+                part_idx += 1
+                total_size = 0
+                state_dict_part = {}
+            state_dict_part[key] = tensor
+            total_size += param_size
+        if len(state_dict_part) > 0:
+            filename = pattern.format(rank=rank, part=part_idx)
+            save_file(
+                state_dict_part,
+                os.path.join(path, filename),
+            )
+
+
+class BitsAndBytesModelLoader(BaseModelLoader):
+    """Model loader to load model weights with BitAndBytes quantization."""
+
+    possible_config_file_names = ["adapter_config.json"]
+
+    default_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+        ".fc1.",
+        ".fc2.",
+        ".dense.",
+        ".query_key_value.",
+        ".qkv_proj.",
+        ".dense_h_to_4h.",
+        ".dense_4h_to_h.",
+        ".out_proj.",
+    ]
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+
+        # we don't need to quantize the whole model, only the target modules
+        # that are specified in the adapter config file. If the adapter config
+        # file is not provided, we will quantize the default modules.
+        if (
+            not load_config.model_loader_extra_config
+            or "qlora_adapter_name_or_path" not in load_config.model_loader_extra_config
+        ):
+            self.target_modules = []
+            return
+
+        qlora_adapter = load_config.model_loader_extra_config[
+            "qlora_adapter_name_or_path"
+        ]
+
+        config_file_path = self._get_config_file(qlora_adapter)
+
+        with open(config_file_path, "r") as f:
+            config = json.load(f)
+            self.target_modules = config["target_modules"]
+
+    def _get_config_file(self, qlora_adapter: str) -> str:
+        is_local = os.path.isdir(qlora_adapter)
+        config_file_path = None
+        if is_local:
+            for file in self.possible_config_file_names:
+                config_file_path = os.path.join(qlora_adapter, file)
+                if os.path.exists(config_file_path):
+                    break
+        else:
+            hf_api = HfApi()
+            repo_files = hf_api.list_repo_files(repo_id=qlora_adapter)
+            for file in self.possible_config_file_names:
+                if file in repo_files:
+                    config_file_path = hf_hub_download(
+                        repo_id=qlora_adapter, filename=file
+                    )
+                    break
+
+        if not config_file_path:
+            raise ValueError(f"Cannot find adapter config file in {qlora_adapter}")
+
+        return config_file_path
+
+    def _get_weight_files(
+        self,
+        model_name_or_path: str,
+        allowed_patterns: List[str],
+        revision: Optional[str] = None,
+    ) -> Tuple[List[str], str]:
+        """Retrieve weight files. Download the files if necessary.
+
+        Return the weight files and the file pattern."""
+        is_local = os.path.isdir(model_name_or_path)
+
+        if is_local:
+            for pattern in allowed_patterns:
+                weight_files = glob.glob(os.path.join(model_name_or_path, pattern))
+                if weight_files:
+                    return weight_files, pattern
+        else:
+            hf_api = HfApi()
+            repo_files = hf_api.list_repo_files(repo_id=model_name_or_path)
+            for pattern in allowed_patterns:
+                matching_files = fnmatch.filter(repo_files, pattern)
+                if matching_files:
+                    hf_folder = download_weights_from_hf(
+                        model_name_or_path,
+                        self.load_config.download_dir,
+                        [pattern],
+                        revision,
+                        ignore_patterns=self.load_config.ignore_patterns,
+                    )
+                    return glob.glob(os.path.join(hf_folder, pattern)), pattern
+
+        raise RuntimeError(f"No model weights found in: `{model_name_or_path}`")
+
+    def _prepare_weights(
+        self, model_name_or_path: str, revision: Optional[str]
+    ) -> Tuple[List[str], bool]:
+        """Prepare weight files for the model."""
+
+        allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]
+
+        hf_weights_files, matched_pattern = self._get_weight_files(
+            model_name_or_path, allowed_patterns, revision
+        )
+
+        if matched_pattern != "*.safetensors":
+            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
+
+        if len(hf_weights_files) == 0:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
+
+        return hf_weights_files, matched_pattern == "*.safetensors"
+
+    def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
+        if use_safetensors:
+            return safetensors_weights_iterator(hf_weights_files)
+        else:
+            return pt_weights_iterator(hf_weights_files)
+
+    def _get_quantized_weights_iterator(
+        self,
+        model_name_or_path: str,
+        revision: Optional[str],
+        pre_quant: bool,
+        load_8bit: bool,
+    ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, Any]]:
+        """Get an iterator to the model weights with bitsandbytes quantization,
+        as well as the quantization state dictionary."""
+
+        # only load the bitsandbytes module when needed
+        try:
+            import bitsandbytes
+
+            if bitsandbytes.__version__ < "0.44.0":
+                raise ImportError(
+                    "bitsandbytes version is wrong. Please "
+                    "install bitsandbytes>=0.44.0."
+                )
+        except ImportError as err:
+            raise ImportError(
+                "Please install bitsandbytes>=0.44.0 via "
+                "`pip install bitsandbytes>=0.44.0` to use "
+                "bitsandbytes quantizer."
+            ) from err
+
+        hf_weights_files, use_safetensors = self._prepare_weights(
+            model_name_or_path, revision
+        )
+
+        quant_state_dict: Dict[str, Any] = {}
+
+        if pre_quant:
+            if load_8bit:
+                return (
+                    self._quantized_8bit_generator(
+                        hf_weights_files, use_safetensors, quant_state_dict
+                    ),
+                    quant_state_dict,
+                )
+            else:
+                return (
+                    self._quantized_4bit_generator(
+                        hf_weights_files, use_safetensors, quant_state_dict
+                    ),
+                    quant_state_dict,
+                )
+
+        return (
+            self._unquantized_generator(
+                hf_weights_files, use_safetensors, quant_state_dict
+            ),
+            quant_state_dict,
+        )
+
+    def _is_8bit_weight_name(self, weight_name: str):
+        quantized_suffix = {".scb", ".weight_format"}
+        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+    def _is_4bit_weight_name(self, weight_name: str):
+        quantized_suffix = {
+            "absmax",
+            "quant_map",
+            "nested_absmax",
+            "nested_quant_map",
+            "bitsandbytes",
+        }
+        suffix = weight_name.split(".")[-1]
+        return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
+    def _quantized_8bit_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+            if not weight_name.lower().endswith(".scb"):
+                continue
+
+            weight_key = weight_name.lower().replace(".scb", ".weight")
+            quant_state_dict[weight_key] = weight_tensor
+
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+            if self._is_8bit_weight_name(weight_name):
+                continue
+
+            if weight_name in quant_state_dict:
+                set_weight_attrs(weight_tensor, {"load_in_8bit": True})
+                yield weight_name, weight_tensor
+            else:
+                yield weight_name, weight_tensor
+
+    def _quantized_4bit_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
+        from bitsandbytes.functional import QuantState
+
+        # First iterate over all quant state weights
+        weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
+        temp_state_dict = {}
+        for weight_name, weight_tensor in weight_iterator:
+            if not self._is_4bit_weight_name(weight_name):
+                continue
+            # bitsandbytes library requires
+            # weight.quant_state.bitsandbytes__* in CPU
+            if "quant_state.bitsandbytes" in weight_name:
+                temp_state_dict[weight_name] = weight_tensor.cpu().data
+            else:
+                temp_state_dict[weight_name] = weight_tensor
+
+        # Closure to parse quant_state for each prequant weight
+        def _parse_quant_state(param_name: str, temp_state_dict: Dict) -> QuantState:
+            quant_state = {}
+            for k in temp_state_dict:
+                if param_name + "." in k:
+                    quant_state[k] = temp_state_dict[k]
+
+            return QuantState.from_dict(quant_state, device="cuda")
+
+        # Second iterate over all prequant and normal weights
+        # pre quantized weights would have a quant_state
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+
+            if self._is_4bit_weight_name(weight_name):
+                continue
+
+            if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
+                f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
+            ):
+                quant_state = _parse_quant_state(weight_name, temp_state_dict)
+                quant_state_dict[weight_name] = quant_state
+                yield weight_name, weight_tensor
+            else:
+                yield weight_name, weight_tensor
+
+    def _unquantized_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
+        from bitsandbytes.functional import quantize_4bit
+
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+
+            if any(
+                target_module in weight_name for target_module in self.target_modules
+            ) and weight_name.endswith(".weight"):
+                weight_name = weight_name.replace(".weight", ".qweight")
+
+                if any(
+                    module in weight_name
+                    for module in self.column_parallel_weights_modules
+                ):
+
+                    total_size = weight_tensor.size(-1)
+                    start_index = total_size // tp_size * tp_rank
+                    end_index = total_size // tp_size * (tp_rank + 1)
+                    weight_sub_tensor = weight_tensor[..., start_index:end_index]
+
+                else:
+                    total_size = weight_tensor.size(0)
+                    start_index = total_size // tp_size * tp_rank
+                    end_index = total_size // tp_size * (tp_rank + 1)
+                    weight_sub_tensor = weight_tensor[start_index:end_index, ...]
+
+                # bitsandbytes requires data in GPU
+                if weight_sub_tensor.is_cuda:
+                    loaded_weight = weight_sub_tensor
+                else:
+                    loaded_weight = weight_sub_tensor.cuda()
+
+                # remove the following after the issue is fixed:
+                # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
+                if loaded_weight.is_contiguous() is False:
+                    loaded_weight = loaded_weight.contiguous()
+
+                with set_default_torch_dtype(torch.float32):
+                    processed_weight, quant_state = quantize_4bit(
+                        loaded_weight, compress_statistics=True, quant_type="nf4"
+                    )
+
+                quant_state_dict[weight_name] = quant_state
+            else:
+                processed_weight = weight_tensor
+
+            yield weight_name, processed_weight
+
+    def _load_weights(self, model_config: ModelConfig, model: nn.Module) -> None:
+        if not hasattr(model, "load_weights"):
+            raise AttributeError(
+                "The required method 'load_weights' is not defined in class"
+                f" {type(model).__name__}."
+            )
+
+        if not hasattr(model, "bitsandbytes_stacked_params_mapping"):
+            raise AttributeError(
+                f"Model {type(model).__name__} does not support BitsAndBytes "
+                "quantization yet."
+            )
+
+        if len(self.target_modules) == 0:
+            if hasattr(model, "default_bitsandbytes_target_modules"):
+                self.target_modules = model.default_bitsandbytes_target_modules
+            else:
+                self.target_modules = self.default_target_modules
+
+        if hasattr(model, "column_parallel_weights_modules"):
+            self.column_parallel_weights_modules = model.column_parallel_weights_modules
+        else:
+            self.column_parallel_weights_modules = []
+
+        self.model_type = type(model).__name__
+
+        logger.info(
+            "Loading weights with BitsAndBytes quantization. " " May take a while ..."
+        )
+
+        quant_config = getattr(model_config.hf_config, "quantization_config", None)
+
+        pre_quant = False
+        if quant_config is not None:
+            quant_method = quant_config.get("quant_method")
+            if quant_method == "bitsandbytes":
+                pre_quant = True
+            else:
+                raise ValueError(
+                    f"BitsAndBytes loader does not support {quant_method} "
+                    "quantization"
+                )
+
+        # The quant_states in pre_quantized models cannot work with a split
+        # weight tensor. So TP does not work with pre_quantized bnb models.
+        if pre_quant and get_tensor_model_parallel_world_size() > 1:
+            raise ValueError(
+                "Prequant BitsAndBytes models with TP is not supported."
+                "Please try with PP."
+            )
+
+        load_8bit = False
+        if pre_quant:
+            load_8bit = quant_config.get("load_in_8bit", False)
+
+        qweight_iterator, quant_state_dict = self._get_quantized_weights_iterator(
+            model_config.model_path, model_config.revision, pre_quant, load_8bit
+        )
+
+        model.load_weights(qweight_iterator)
+
+        torch.cuda.empty_cache()
+
+        param_dict = dict(model.named_parameters())
+        stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {}
+        model_type = model_config.hf_config.model_type
+        for quant_param_name in quant_state_dict:
+            non_stacked_param_name = quant_param_name
+            if model_type == "mllama" and "vision_model" in quant_param_name:
+                # adapt to VisionAttention
+                quant_param_name = quant_param_name.replace(
+                    "self_attn.o_proj", "self_attn.proj"
+                )
+            shard_index = 0
+            for shard_name, (
+                weight_name,
+                index,
+            ) in model.bitsandbytes_stacked_params_mapping.items():
+                if (
+                    model_type in ["qwen2_vl", "qwen2_5_vl"]
+                    and "visual" in quant_param_name
+                ):
+                    break
+                if shard_name in quant_param_name:
+                    shard_index = index
+                    quant_param_name = quant_param_name.replace(shard_name, weight_name)
+                    break
+
+            if (
+                model_type in ["qwen2_vl", "qwen2_5_vl"]
+                and "visual" in quant_param_name
+            ):
+                quant_param_name = quant_param_name.replace(
+                    r"attn.qkv.", r"attn.qkv_proj."
+                )
+
+            if quant_param_name not in param_dict:
+                raise ValueError(
+                    f"Parameter {quant_param_name} not found in the model."
+                )
+
+            if quant_param_name not in stacked_quant_state_dict:
+                stacked_quant_state_dict[quant_param_name] = {}
+
+            stacked_quant_state_dict[quant_param_name][shard_index] = quant_state_dict[
+                non_stacked_param_name
+            ]
+
+        # save quant_states and offsets as the attributes of the parameters
+        for param_name, param in param_dict.items():
+            if param_name in stacked_quant_state_dict:
+                quant_states = stacked_quant_state_dict[param_name]
+                set_weight_attrs(param, {"bnb_quant_state": quant_states})
+
+                pack_ratio = getattr(param, "pack_factor", -1)
+                if pack_ratio == -1:
+                    raise ValueError(f"pack_factor not set for parameter {param_name}.")
+
+                num_elements = [0] * len(quant_states)
+                for seq, quant_state in quant_states.items():
+                    num_elements[seq] = math.prod(quant_state.shape) // pack_ratio
+
+                offsets = np.concatenate(([0], np.cumsum(num_elements)))
+                # Make torch infer_schema happy(Compatible with vLLM)
+                offsets = torch.tensor(offsets).cpu()
+                set_weight_attrs(param, {"bnb_shard_offsets": offsets})
+
+                if load_8bit:
+                    set_weight_attrs(
+                        param, {"matmul_state": [None] * len(quant_states)}
+                    )
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model_path, model_config.revision)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                )
+
+                self._load_weights(model_config, model)
+
+        return model.eval()
+
+
+class GGUFModelLoader(BaseModelLoader):
+    """
+    Model loader that can load GGUF files. This is useful for loading models
+    that are quantized with GGUF and saved in the GGUF format. This loader
+    supports loading both full models and sharded models.
+    """
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
+
+    def _prepare_weights(self, model_name_or_path: str):
+        if os.path.isfile(model_name_or_path):
+            return model_name_or_path
+        else:
+            raise ValueError(f"{model_name_or_path} is not a file.")
+
+    def _get_gguf_weights_map(self, model_config: ModelConfig):
+        """
+        GGUF uses this naming convention for their tensors from HF checkpoint:
+        `blk.N.BB.weight` and `blk.N.BB.bias`
+        where N signifies the block number of a layer, and BB signifies the
+        attention/mlp layer components.
+        See "Standardized tensor names" in
+        https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
+        """
+
+        # only load the gguf module when needed
+        try:
+            import gguf
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install gguf via `pip install gguf` to use gguf quantizer."
+            ) from err
+
+        config = model_config.hf_config
+        model_type = config.model_type
+        # hack: ggufs have a different name than transformers
+        if model_type == "cohere":
+            model_type = "command-r"
+        arch = None
+        for key, value in gguf.MODEL_ARCH_NAMES.items():
+            if value == model_type:
+                arch = key
+                break
+        if arch is None:
+            raise RuntimeError(f"Unknown gguf model_type: {model_type}")
+        num_layers = config.num_hidden_layers
+        name_map = gguf.get_tensor_name_map(arch, num_layers)
+        with torch.device("meta"):
+            dummy_model = AutoModelForCausalLM.from_config(config)
+        state_dict = dummy_model.state_dict()
+
+        gguf_to_hf_name_map = {}
+        for hf_name in state_dict:
+            name, suffix = hf_name.rsplit(".", 1)
+            gguf_name = name_map.get_name(name)
+            gguf_to_hf_name_map[f"{gguf_name}.{suffix}"] = hf_name
+        return gguf_to_hf_name_map
+
+    def _get_weights_iterator(
+        self, model_name_or_path: str, gguf_to_hf_name_map: Dict[str, str]
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model_path)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        local_model_path = self._prepare_weights(model_config.model_path)
+        gguf_weights_map = self._get_gguf_weights_map(model_config)
+        # we can only know if tie word embeddings after mapping weights
+        if "lm_head.weight" in get_gguf_extra_tensor_names(
+            local_model_path, gguf_weights_map
+        ):
+            model_config.hf_config.update({"tie_word_embeddings": True})
+
+        target_device = torch.device(device_config.device)
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = _initialize_model(model_config, self.load_config)
+            model.load_weights(
+                self._get_weights_iterator(local_model_path, gguf_weights_map)
+            )
+
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    with device_loading_context(module, target_device):
+                        quant_method.process_weights_after_loading(module)
+        return model
+
+
+class RemoteModelLoader(BaseModelLoader):
+    """Model loader that can load Tensors from remote database."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        # TODO @DellCurry: move to s3 connector only
+        set_runai_streamer_env(load_config)
+
+    def _get_weights_iterator_kv(
+        self,
+        client,
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Get an iterator for the model weights from remote storage."""
+        assert get_connector_type(client) == ConnectorType.KV
+        rank = get_tensor_model_parallel_rank()
+        return client.weight_iterator(rank)
+
+    def _get_weights_iterator_fs(
+        self,
+        client,
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Get an iterator for the model weights from remote storage."""
+        assert get_connector_type(client) == ConnectorType.FS
+        return client.weight_iterator()
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        pass
+
+    @staticmethod
+    def save_model(
+        model: torch.nn.Module,
+        model_path: str,
+        url: str,
+    ) -> None:
+        with create_remote_connector(url) as client:
+            assert get_connector_type(client) == ConnectorType.KV
+            model_name = parse_model_name(url)
+            rank = get_tensor_model_parallel_rank()
+            state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
+            for key, tensor in state_dict.items():
+                r_key = f"{model_name}/keys/rank_{rank}/{key}"
+                client.set(r_key, tensor)
+
+            for root, _, files in os.walk(model_path):
+                for file_name in files:
+                    # ignore hidden files
+                    if file_name.startswith("."):
+                        continue
+                    if os.path.splitext(file_name)[1] in (".json", ".py"):
+                        file_path = os.path.join(root, file_name)
+                        with open(file_path, encoding="utf-8") as file:
+                            file_content = file.read()
+                            f_key = f"{model_name}/files/{file_name}"
+                            client.setstr(f_key, file_content)
+
+    def _load_model_from_remote_kv(
+        self, model: nn.Module, model_config: ModelConfig, client
+    ):
+        for _, module in model.named_modules():
+            quant_method = getattr(module, "quant_method", None)
+            if quant_method is not None:
+                quant_method.process_weights_after_loading(module)
+        weights_iterator = self._get_weights_iterator_kv(client)
+        state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
+        for key, tensor in weights_iterator:
+            # If loading with LoRA enabled, additional padding may
+            # be added to certain parameters. We only load into a
+            # narrowed view of the parameter data.
+            param_data = state_dict[key].data
+            param_shape = state_dict[key].shape
+            for dim, size in enumerate(tensor.shape):
+                if size < param_shape[dim]:
+                    param_data = param_data.narrow(dim, 0, size)
+            if tensor.shape != param_shape:
+                logger.warning(
+                    "loading tensor of shape %s into " "parameter '%s' of shape %s",
+                    tensor.shape,
+                    key,
+                    param_shape,
+                )
+            param_data.copy_(tensor)
+            state_dict.pop(key)
+        if state_dict:
+            raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+
+        post_load_weights(model, model_config)
+
+    def _load_model_from_remote_fs(
+        self, model, client, model_config: ModelConfig, device_config: DeviceConfig
+    ) -> nn.Module:
+
+        target_device = torch.device(device_config.device)
+        with set_default_torch_dtype(model_config.dtype):
+            model.load_weights(self._get_weights_iterator_fs(client))
+
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    # When quant methods need to process weights after loading
+                    # (for repacking, quantizing, etc), they expect parameters
+                    # to be on the global target device. This scope is for the
+                    # case where cpu offloading is used, where we will move the
+                    # parameters onto device for processing and back off after.
+                    with device_loading_context(module, target_device):
+                        quant_method.process_weights_after_loading(module)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        logger.info("Loading weights from remote storage ...")
+        start = time.perf_counter()
+        load_config = self.load_config
+
+        assert load_config.load_format == LoadFormat.REMOTE, (
+            f"Model loader {self.load_config.load_format} is not supported for "
+            f"load format {load_config.load_format}"
+        )
+
+        model_weights = model_config.model_path
+        if hasattr(model_config, "model_weights"):
+            model_weights = model_config.model_weights
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config)
+
+            with create_remote_connector(
+                model_weights, device=device_config.device
+            ) as client:
+                connector_type = get_connector_type(client)
+                if connector_type == ConnectorType.KV:
+                    self._load_model_from_remote_kv(model, model_config, client)
+                elif connector_type == ConnectorType.FS:
+                    self._load_model_from_remote_fs(
+                        model, client, model_config, device_config
+                    )
+
+        end = time.perf_counter()
+        logger.info("Loaded weights from remote storage in %.2f seconds.", end - start)
+        return model.eval()
+
+
+def load_model_with_cpu_quantization(
+    self,
+    *,
+    model_config: ModelConfig,
+    device_config: DeviceConfig,
+) -> nn.Module:
+    target_device = torch.device(device_config.device)
+    with set_default_torch_dtype(model_config.dtype):
+        model = _initialize_model(
+            model_config,
+            self.load_config,
+        )
+
+        if not isinstance(self, DummyModelLoader):
+            model.load_weights(self._get_all_weights(model_config, model))
+
+        for _, module in model.named_modules():
+            quant_method = getattr(module, "quant_method", None)
+            if quant_method is not None:
+                # When quant methods need to process weights after loading
+                # (for repacking, quantizing, etc), they expect parameters
+                # to be on the global target device. This scope is for the
+                # case where cpu offloading is used, where we will move the
+                # parameters onto device for processing and back off after.
+                with device_loading_context(module, target_device):
+                    quant_method.process_weights_after_loading(module)
+
+        model.to(target_device)
+
+    return model.eval()
+
+
+def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+    """Get a model loader based on the load format."""
+
+    if isinstance(load_config.load_format, type):
+        return load_config.load_format(load_config)
+
+    if load_config.load_format == LoadFormat.DUMMY:
+        return DummyModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.SHARDED_STATE:
+        return ShardedStateLoader(load_config)
+
+    if load_config.load_format == LoadFormat.BITSANDBYTES:
+        return BitsAndBytesModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.GGUF:
+        return GGUFModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.LAYERED:
+        return LayeredModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.REMOTE:
+        return RemoteModelLoader(load_config)
+
+    return DefaultModelLoader(load_config)
diff --git a/python/sglang/srt/model_loader/utils.py b/python/sglang/srt/model_loader/utils.py
new file mode 100644
index 000000000..f6ad79010
--- /dev/null
+++ b/python/sglang/srt/model_loader/utils.py
@@ -0,0 +1,119 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/utils.py
+
+"""Utilities for selecting and loading models."""
+import contextlib
+import logging
+from typing import Tuple, Type
+
+import torch
+import transformers
+from torch import nn
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+from sglang.srt.configs.model_config import ModelConfig, ModelImpl
+
+logger = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def set_default_torch_dtype(dtype: torch.dtype):
+    """Sets the default torch dtype to the given dtype."""
+    old_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    yield
+    torch.set_default_dtype(old_dtype)
+
+
+def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str]):
+    for i, arch in enumerate(architectures):
+        if arch == "TransformersForCausalLM":
+            continue
+        auto_map: dict[str, str] = (
+            getattr(model_config.hf_config, "auto_map", None) or dict()
+        )
+        # Make sure that config class is always initialized before model class,
+        # otherwise the model class won't be able to access the config class,
+        # the expected auto_map should have correct order like:
+        # "auto_map": {
+        #     "AutoConfig": "<your-repo-name>--<config-name>",
+        #     "AutoModel": "<your-repo-name>--<config-name>",
+        #     "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
+        # },
+        auto_modules = {
+            name: get_class_from_dynamic_module(
+                module, model_config.model_path, revision=model_config.revision
+            )
+            for name, module in sorted(auto_map.items(), key=lambda x: x[0])
+        }
+        model_module = getattr(transformers, arch, None)
+        if model_module is None:
+            if "AutoModel" not in auto_map:
+                raise ValueError(
+                    f"Cannot find model module. '{arch}' is not a registered "
+                    "model in the Transformers library (only relevant if the "
+                    "model is meant to be in Transformers) and 'AutoModel' is "
+                    "not present in the model config's 'auto_map' (relevant "
+                    "if the model is custom)."
+                )
+            model_module = auto_modules["AutoModel"]
+        if model_config.model_impl == ModelImpl.TRANSFORMERS:
+            if not model_module.is_backend_compatible():
+                raise ValueError(
+                    f"The Transformers implementation of {arch} is not "
+                    "compatible with SGLang."
+                )
+            architectures[i] = "TransformersForCausalLM"
+        if model_config.model_impl == ModelImpl.AUTO:
+            if not model_module.is_backend_compatible():
+                raise ValueError(
+                    f"{arch} has no SGlang implementation and the Transformers "
+                    "implementation is not compatible with SGLang."
+                )
+            logger.warning(
+                "%s has no SGLang implementation, falling back to Transformers "
+                "implementation. Some features may not be supported and "
+                "performance may not be optimal.",
+                arch,
+            )
+            architectures[i] = "TransformersForCausalLM"
+    return architectures
+
+
+def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
+    from sglang.srt.models.registry import ModelRegistry
+
+    architectures = getattr(model_config.hf_config, "architectures", [])
+    # Special handling for quantized Mixtral.
+    # FIXME(woosuk): This is a temporary hack.
+    mixtral_supported = ["fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"]
+
+    if (
+        model_config.quantization is not None
+        and model_config.quantization not in mixtral_supported
+        and "MixtralForCausalLM" in architectures
+    ):
+        architectures = ["QuantMixtralForCausalLM"]
+
+    supported_archs = ModelRegistry.get_supported_archs()
+    is_native_supported = any(arch in supported_archs for arch in architectures)
+
+    if not is_native_supported or model_config.model_impl == ModelImpl.TRANSFORMERS:
+        architectures = resolve_transformers_arch(model_config, architectures)
+
+    return ModelRegistry.resolve_model_cls(architectures)
+
+
+def get_architecture_class_name(model_config: ModelConfig) -> str:
+    return get_model_architecture(model_config)[1]
+
+
+def post_load_weights(model: nn.Module, model_config: ModelConfig):
+    # Model weight loading consists of two stages:
+    # 1. Initial weight loading.
+    # 2. Post-processing of weights, including assigning specific member variables.
+    # For `dummy_init`, only the second stage is required.
+    if hasattr(model, "post_load_weights"):
+        if model_config.hf_config.architectures[0] == "DeepseekV3ForCausalLMNextN":
+            model.post_load_weights(is_nextn=True)
+        else:
+            model.post_load_weights()
diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py
new file mode 100644
index 000000000..397d9e913
--- /dev/null
+++ b/python/sglang/srt/model_loader/weight_utils.py
@@ -0,0 +1,1030 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/weight_utils.py
+
+"""Utilities for downloading and initializing model weights."""
+import concurrent.futures
+import fnmatch
+import glob
+import hashlib
+import json
+import logging
+import os
+import queue
+import tempfile
+from collections import defaultdict
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+import filelock
+import huggingface_hub.constants
+import numpy as np
+import safetensors.torch
+import torch
+from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
+from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
+from tqdm.auto import tqdm
+
+from sglang.srt.configs.load_config import LoadConfig
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.layers.dp_attention import get_attention_tp_rank
+from sglang.srt.layers.quantization import QuantizationConfig, get_quantization_config
+from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp4Config
+from sglang.srt.utils import print_warning_once
+
+logger = logging.getLogger(__name__)
+
+# use system-level temp directory for file locks, so that multiple users
+# can share the same lock without error.
+# lock files in the temp directory will be automatically deleted when the
+# system reboots, so users will not complain about annoying lock files
+temp_dir = tempfile.gettempdir()
+
+
+def enable_hf_transfer():
+    """automatically activates hf_transfer"""
+    if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
+        try:
+            # enable hf hub transfer if available
+            import hf_transfer  # type: ignore # noqa
+
+            huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
+        except ImportError:
+            pass
+
+
+enable_hf_transfer()
+
+
+class DisabledTqdm(tqdm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, disable=True)
+
+
+def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
+    lock_dir = cache_dir or temp_dir
+    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
+    model_name = model_name_or_path.replace("/", "-")
+    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+    # add hash to avoid conflict with old users' lock files
+    lock_file_name = hash_name + model_name + ".lock"
+    # mode 0o666 is required for the filelock to be shared across users
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666)
+    return lock
+
+
+def _shared_pointers(tensors):
+    ptrs = defaultdict(list)
+    for k, v in tensors.items():
+        ptrs[v.data_ptr()].append(k)
+    failing = []
+    for _, names in ptrs.items():
+        if len(names) > 1:
+            failing.append(names)
+    return failing
+
+
+def convert_bin_to_safetensor_file(
+    pt_filename: str,
+    sf_filename: str,
+) -> None:
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
+    if "state_dict" in loaded:
+        loaded = loaded["state_dict"]
+    shared = _shared_pointers(loaded)
+    for shared_weights in shared:
+        for name in shared_weights[1:]:
+            loaded.pop(name)
+
+    # For tensors to be contiguous
+    loaded = {k: v.contiguous() for k, v in loaded.items()}
+
+    dirname = os.path.dirname(sf_filename)
+    os.makedirs(dirname, exist_ok=True)
+    save_file(loaded, sf_filename, metadata={"format": "pt"})
+
+    # check file size
+    sf_size = os.stat(sf_filename).st_size
+    pt_size = os.stat(pt_filename).st_size
+    if (sf_size - pt_size) / pt_size > 0.01:
+        raise RuntimeError(
+            f"""The file size different is more than 1%:
+         - {sf_filename}: {sf_size}
+         - {pt_filename}: {pt_size}
+         """
+        )
+
+    # check if the tensors are the same
+    reloaded = safetensors.torch.load_file(sf_filename)
+    for k in loaded:
+        pt_tensor = loaded[k]
+        sf_tensor = reloaded[k]
+        if not torch.equal(pt_tensor, sf_tensor):
+            raise RuntimeError(f"The output tensors do not match for key {k}")
+
+
+# TODO(woosuk): Move this to other place.
+def get_quant_config(
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+    packed_modules_mapping: Dict[str, List[str]],
+) -> QuantizationConfig:
+    quant_cls = get_quantization_config(model_config.quantization)
+
+    # GGUF doesn't have config file
+    if model_config.quantization == "gguf":
+        return quant_cls.from_config({})
+
+    # Read the quantization config from the HF model config, if available.
+    hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
+    # some vision model may keep quantization_config in their text_config
+    hf_text_config = getattr(model_config.hf_config, "text_config", None)
+    if hf_quant_config is None and hf_text_config is not None:
+        hf_quant_config = getattr(hf_text_config, "quantization_config", None)
+    if hf_quant_config is None:
+        # compressed-tensors uses a compressions_config
+        hf_quant_config = getattr(model_config.hf_config, "compression_config", None)
+    if hf_quant_config is not None:
+        hf_quant_config["packed_modules_mapping"] = packed_modules_mapping
+        return quant_cls.from_config(hf_quant_config)
+    # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
+    if model_config.quantization == "bitsandbytes":
+        if (
+            not load_config.model_loader_extra_config
+            or "qlora_adapter_name_or_path" not in load_config.model_loader_extra_config
+        ):
+            return quant_cls.from_config({"adapter_name_or_path": ""})
+        model_name_or_path = load_config.model_loader_extra_config[
+            "qlora_adapter_name_or_path"
+        ]
+
+    else:
+        model_name_or_path = model_config.model_path
+    is_local = os.path.isdir(model_name_or_path)
+    if not is_local:
+        # Download the config files.
+        with get_lock(model_name_or_path, load_config.download_dir):
+            hf_folder = snapshot_download(
+                model_name_or_path,
+                revision=model_config.revision,
+                allow_patterns="*.json",
+                cache_dir=load_config.download_dir,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                tqdm_class=DisabledTqdm,
+            )
+    else:
+        hf_folder = model_name_or_path
+
+    possible_config_filenames = quant_cls.get_config_filenames()
+
+    # If the quantization config is not found, use the default config.
+    if not possible_config_filenames:
+        return quant_cls()
+
+    config_files = glob.glob(os.path.join(hf_folder, "*.json"))
+
+    quant_config_files = [
+        f for f in config_files if any(f.endswith(x) for x in possible_config_filenames)
+    ]
+    if len(quant_config_files) == 0:
+        raise ValueError(f"Cannot find the config file for {model_config.quantization}")
+    if len(quant_config_files) > 1:
+        raise ValueError(
+            f"Found multiple config files for {model_config.quantization}: "
+            f"{quant_config_files}"
+        )
+
+    quant_config_file = quant_config_files[0]
+    with open(quant_config_file) as f:
+        config = json.load(f)
+
+        if model_config.quantization == "bitsandbytes":
+            config["adapter_name_or_path"] = model_name_or_path
+        elif model_config.quantization == "modelopt":
+            if config["producer"]["name"] == "modelopt":
+                # (yizhang2077) workaround for nvidia/Llama-4-Maverick-17B-128E-Eagle3
+                if config["quantization"]["quant_algo"] is None:
+                    if (
+                        model_config.hf_config.architectures[0]
+                        != "LlamaForCausalLMEagle3"
+                    ):
+                        raise ValueError(
+                            f"Invalid quant_config, quantization method: {model_config.quantization},"
+                            f"hf architectures: {model_config.hf_config.architectures[0]}. "
+                        )
+                    return None
+                if "FP4" in config["quantization"]["quant_algo"]:
+                    return ModelOptFp4Config.from_config(config)
+                else:
+                    return quant_cls.from_config(config)
+            else:
+                raise ValueError(
+                    f"Unsupported quantization config"
+                    f" found for {model_config.quantization} in {f}."
+                )
+        elif model_config.quantization == "w8a8_int8":
+            config["packed_modules_mapping"] = packed_modules_mapping
+
+    return quant_cls.from_config(config)
+
+
+def download_weights_from_hf(
+    model_name_or_path: str,
+    cache_dir: Optional[str],
+    allow_patterns: List[str],
+    revision: Optional[str] = None,
+    ignore_patterns: Optional[Union[str, List[str]]] = None,
+) -> str:
+    """Download model weights from Hugging Face Hub.
+
+    Args:
+        model_name_or_path (str): The model name or path.
+        cache_dir (Optional[str]): The cache directory to store the model
+            weights. If None, will use HF defaults.
+        allow_patterns (List[str]): The allowed patterns for the
+            weight files. Files matched by any of the patterns will be
+            downloaded.
+        revision (Optional[str]): The revision of the model.
+        ignore_patterns (Optional[Union[str, List[str]]]): The patterns to
+            filter out the weight files. Files matched by any of the patterns
+            will be ignored.
+
+    Returns:
+        str: The path to the downloaded model weights.
+    """
+    if not huggingface_hub.constants.HF_HUB_OFFLINE:
+        # Before we download we look at that is available:
+        fs = HfFileSystem()
+        file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
+
+        # depending on what is available we download different things
+        for pattern in allow_patterns:
+            matching = fnmatch.filter(file_list, pattern)
+            if len(matching) > 0:
+                allow_patterns = [pattern]
+                break
+
+    logger.info("Using model weights format %s", allow_patterns)
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        hf_folder = snapshot_download(
+            model_name_or_path,
+            allow_patterns=allow_patterns,
+            ignore_patterns=ignore_patterns,
+            cache_dir=cache_dir,
+            tqdm_class=DisabledTqdm,
+            revision=revision,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+        )
+    return hf_folder
+
+
+def download_safetensors_index_file_from_hf(
+    model_name_or_path: str,
+    index_file: str,
+    cache_dir: Optional[str],
+    revision: Optional[str] = None,
+) -> None:
+    """Download hf safetensors index file from Hugging Face Hub.
+
+    Args:
+        model_name_or_path (str): The model name or path.
+        cache_dir (Optional[str]): The cache directory to store the model
+            weights. If None, will use HF defaults.
+        revision (Optional[str]): The revision of the model.
+    """
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        try:
+            # Download the safetensors index file.
+            hf_hub_download(
+                repo_id=model_name_or_path,
+                filename=index_file,
+                cache_dir=cache_dir,
+                revision=revision,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            )
+        # If file not found on remote or locally, we should not fail since
+        # only some models will have index_file.
+        except huggingface_hub.utils.EntryNotFoundError:
+            logger.info("No %s found in remote.", index_file)
+        except huggingface_hub.utils.LocalEntryNotFoundError:
+            logger.info("No %s found in local cache.", index_file)
+
+
+# For models like Mistral-7B-v0.3, there are both sharded
+# safetensors files and a consolidated safetensors file.
+# Passing both of these to the weight loader functionality breaks.
+# So, we use the index_file to
+# look up which safetensors files should be used.
+def filter_duplicate_safetensors_files(
+    hf_weights_files: List[str], hf_folder: str, index_file: str
+) -> List[str]:
+    # model.safetensors.index.json is a mapping from keys in the
+    # torch state_dict to safetensors file holding that weight.
+    index_file_name = os.path.join(hf_folder, index_file)
+    if not os.path.isfile(index_file_name):
+        return hf_weights_files
+
+    # Iterate through the weight_map (weight_name: safetensors files)
+    # to identify weights that we should use.
+    with open(index_file_name) as f:
+        weight_map = json.load(f)["weight_map"]
+    weight_files_in_index = set()
+    for weight_name in weight_map:
+        weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name]))
+    # Filter out any fields that are not found in the index file.
+    hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index]
+    return hf_weights_files
+
+
+def filter_files_not_needed_for_inference(hf_weights_files: List[str]) -> List[str]:
+    """
+    Exclude files that are not needed for inference.
+
+    See https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+    """
+    blacklist = [
+        "training_args.bin",
+        "optimizer.bin",
+        "optimizer.pt",
+        "scheduler.pt",
+        "scaler.pt",
+    ]
+    hf_weights_files = [
+        f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist)
+    ]
+    return hf_weights_files
+
+
+# explicitly use pure text format, with a newline at the end
+# this makes it impossible to see the animation in the progress bar
+# but will avoid messing up with ray or multiprocessing, which wraps
+# each line of output with some prefix.
+_BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]\n"  # noqa: E501
+
+
+def np_cache_weights_iterator(
+    model_name_or_path: str,
+    cache_dir: Optional[str],
+    hf_folder: str,
+    hf_weights_files: List[str],
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model np files.
+
+    Will dump the model weights to numpy files if they are not already dumped.
+    """
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+    # Convert the model weights from torch tensors to numpy arrays for
+    # faster loading.
+    np_folder = os.path.join(hf_folder, "np")
+    os.makedirs(np_folder, exist_ok=True)
+    weight_names_file = os.path.join(np_folder, "weight_names.json")
+    # Use file lock to prevent multiple processes from
+    # dumping the same model weights to numpy at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        if not os.path.exists(weight_names_file):
+            weight_names: List[str] = []
+            for bin_file in tqdm(
+                hf_weights_files,
+                desc="Loading np_cache checkpoint shards",
+                disable=not enable_tqdm,
+                bar_format=_BAR_FORMAT,
+            ):
+                state = torch.load(bin_file, map_location="cpu", weights_only=True)
+                for name, param in state.items():
+                    param_path = os.path.join(np_folder, name)
+                    with open(param_path, "wb") as f:
+                        np.save(f, param.cpu().detach().numpy())
+                    weight_names.append(name)
+            with open(weight_names_file, "w") as f:
+                json.dump(weight_names, f)
+
+    with open(weight_names_file) as f:
+        weight_names = json.load(f)
+
+    for name in weight_names:
+        param_path = os.path.join(np_folder, name)
+        with open(param_path, "rb") as f:
+            param = np.load(f)
+        yield name, torch.from_numpy(param)
+
+
+def decrypt(fn, key):
+    raise NotImplementedError()
+
+
+def safetensors_encrypted_weights_iterator(
+    hf_weights_files: List[str],
+    is_all_weights_sharded: bool = False,
+    decryption_key: Optional[str] = None,
+):
+    raise NotImplementedError()
+
+
+def safetensors_weights_iterator(
+    hf_weights_files: List[str],
+    is_all_weights_sharded: bool = False,
+    decryption_key: Optional[str] = None,
+    disable_mmap: bool = False,
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model safetensor files.
+
+    If is_all_weights_sharded is True, it uses more optimize read by reading an
+    entire file instead of reading each tensor one by one.
+    """
+    if decryption_key:
+        yield from safetensors_encrypted_weights_iterator(
+            hf_weights_files, is_all_weights_sharded, decryption_key
+        )
+        return
+
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+    for st_file in tqdm(
+        hf_weights_files,
+        desc="Loading safetensors checkpoint shards",
+        disable=not enable_tqdm,
+        bar_format=_BAR_FORMAT,
+    ):
+        if disable_mmap:
+            with open(st_file, "rb") as f:
+                result = safetensors.torch.load(f.read())
+                for name, param in result.items():
+                    yield name, param
+        else:
+            with safetensors.safe_open(st_file, framework="pt", device="cpu") as f:
+                for name in f.keys():
+                    yield name, f.get_tensor(name)
+
+
+def multi_thread_safetensors_weights_iterator(
+    hf_weights_files: List[str],
+    is_all_weights_sharded: bool = False,
+    decryption_key: Optional[str] = None,
+    max_workers: int = 4,
+    disable_mmap: bool = False,
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Multi-Thread iterate over the weights in the model safetensor files.
+
+    If is_all_weights_sharded is True, it uses more optimize read by reading an
+    entire file instead of reading each tensor one by one.
+    """
+    if decryption_key:
+        logger.warning(
+            "Multi-Thread loading is not working for encrypted safetensor weights."
+        )
+        yield from safetensors_encrypted_weights_iterator(
+            hf_weights_files, is_all_weights_sharded, decryption_key
+        )
+        return
+
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+
+    def _load_file(st_file: str):
+        if disable_mmap:
+            with open(st_file, "rb") as f:
+                result = safetensors.torch.load(f.read())
+        else:
+            with safetensors.safe_open(st_file, framework="pt", device="cpu") as f:
+                result = {k: f.get_tensor(k) for k in f.keys()}
+
+        return result
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(_load_file, st_file) for st_file in hf_weights_files]
+
+        if enable_tqdm:
+            futures_iter = tqdm(
+                concurrent.futures.as_completed(futures),
+                total=len(hf_weights_files),
+                desc="Multi-thread loading shards",
+                disable=not enable_tqdm,
+                bar_format=_BAR_FORMAT,
+            )
+        else:
+            futures_iter = concurrent.futures.as_completed(futures)
+
+        for future in futures_iter:
+            state_dict = future.result()
+            for name, param in state_dict.items():
+                yield name, param
+
+
+def pt_weights_iterator(
+    hf_weights_files: List[str],
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model bin/pt files."""
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+    for bin_file in tqdm(
+        hf_weights_files,
+        desc="Loading pt checkpoint shards",
+        disable=not enable_tqdm,
+        bar_format=_BAR_FORMAT,
+    ):
+        state = torch.load(bin_file, map_location="cpu", weights_only=True)
+        yield from state.items()
+        del state
+
+
+def multi_thread_pt_weights_iterator(
+    hf_weights_files: List[str],
+    max_workers: int = 4,
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Multi-Thread iterate over the weights in the model bin/pt files."""
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+
+    def _load_file(bin_file: str):
+        return torch.load(bin_file, map_location="cpu", weights_only=True)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(_load_file, bin_file) for bin_file in hf_weights_files
+        ]
+
+        if enable_tqdm:
+            futures_iter = tqdm(
+                concurrent.futures.as_completed(futures),
+                total=len(hf_weights_files),
+                desc="Multi-thread loading pt checkpoint shards",
+                disable=not enable_tqdm,
+                bar_format=_BAR_FORMAT,
+            )
+        else:
+            futures_iter = concurrent.futures.as_completed(futures)
+
+        for future in futures_iter:
+            state = future.result()
+            yield from state.items()
+
+
+def get_gguf_extra_tensor_names(
+    gguf_file: str, gguf_to_hf_name_map: Dict[str, str]
+) -> List[str]:
+    import gguf
+
+    reader = gguf.GGUFReader(gguf_file)
+    expected_gguf_keys = set(gguf_to_hf_name_map.keys())
+    exact_gguf_keys = set([tensor.name for tensor in reader.tensors])
+    extra_keys = expected_gguf_keys - exact_gguf_keys
+    return [gguf_to_hf_name_map[key] for key in extra_keys]
+
+
+def gguf_quant_weights_iterator(
+    gguf_file: str, gguf_to_hf_name_map: Dict[str, str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """
+    Iterate over the quant weights in the model gguf files and convert
+    them to torch tensors
+    """
+
+    import gguf
+
+    reader = gguf.GGUFReader(gguf_file)
+
+    for tensor in reader.tensors:
+        if tensor.name in gguf_to_hf_name_map:
+            weight_type = tensor.tensor_type
+            name = gguf_to_hf_name_map[tensor.name]
+
+            if weight_type.name != "F32":
+                weight_type_name = name.replace("weight", "qweight_type")
+                weight_type = torch.tensor(weight_type)
+                yield weight_type_name, weight_type
+
+    for tensor in reader.tensors:
+        if tensor.name in gguf_to_hf_name_map:
+            weight = tensor.data
+            weight_type = tensor.tensor_type
+            name = gguf_to_hf_name_map[tensor.name]
+
+            if weight_type.name != "F32":
+                name = name.replace("weight", "qweight")
+            param = torch.tensor(weight)
+            yield name, param
+
+
+def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
+    """convert PySafeSlice object from safetensors to torch.Tensor
+
+    PySafeSlice object supports indexing, which is done before loading the
+    actual tensor and can reduce the amount of memory being read into the
+    memory. However, it does not support more advanced functionalities
+    like `.view()` or `.t()`. Therefore, if we need to modify the loaded
+    tensor with these more complicated operators, we need to convert to
+    tensor first.
+    """
+    if not isinstance(x, torch.Tensor):
+        x = x[:]
+    return x
+
+
+def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+    """Default weight loader."""
+    try:
+        if param.numel() == 1 and loaded_weight.numel() == 1:
+            # Sometimes scalar values aren't considered tensors with shapes
+            # so if both param and loaded_weight are a scalar,
+            # "broadcast" instead of copy
+            param.data.fill_(loaded_weight.item())
+        else:
+            assert param.size() == loaded_weight.size(), (
+                f"Attempted to load weight ({loaded_weight.size()}) "
+                f"into parameter ({param.size()})"
+            )
+
+            param.data.copy_(loaded_weight)
+    except Exception:
+        # NOTE: This exception is added for the purpose of setting breakpoint to
+        # debug weight loading issues.
+        raise
+
+
+def row_parallel_weight_loader(
+    param: torch.Tensor, loaded_weight: torch.Tensor
+) -> None:
+    """Load weights that are row-parallelized."""
+    tp_rank = get_tensor_model_parallel_rank()
+    shard_dim = 0 if param.dim() != 1 else None
+
+    if shard_dim is not None:
+        shard_size = param.data.shape[shard_dim]
+        start_idx = tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(shard_dim, start_idx, shard_size)
+
+    return default_weight_loader(param, loaded_weight)
+
+
+LoaderFunction = Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
+
+
+def sharded_weight_loader(shard_axis: int) -> LoaderFunction:
+    """Create a weight loader that shards the weights along the given axis"""
+
+    def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+        tp_rank = get_attention_tp_rank()
+
+        shard_size = param.data.shape[shard_axis]
+        start_idx = tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(shard_axis, start_idx, shard_size)
+
+        return default_weight_loader(param, loaded_weight)
+
+    return loader
+
+
+def composed_weight_loader(
+    loader: LoaderFunction, fn: Callable[[torch.Tensor], torch.Tensor]
+) -> LoaderFunction:
+    """Create a weight loader that post-processes the weights after loading"""
+
+    def composed_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+        loader(param, loaded_weight)
+        param.data.copy_(fn(param))
+        return
+
+    return composed_loader
+
+
+def runai_safetensors_weights_iterator(
+    hf_weights_files: List[str],
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model safetensor files."""
+    from runai_model_streamer import SafetensorsStreamer
+
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+
+    with SafetensorsStreamer() as streamer:
+        for st_file in tqdm(
+            hf_weights_files,
+            desc="Loading safetensors using Runai Model Streamer",
+            disable=not enable_tqdm,
+            bar_format=_BAR_FORMAT,
+        ):
+            streamer.stream_file(st_file)
+            yield from streamer.get_tensors()
+
+
+def set_runai_streamer_env(load_config: LoadConfig):
+    if load_config.model_loader_extra_config:
+        extra_config = load_config.model_loader_extra_config
+
+        if "concurrency" in extra_config and isinstance(
+            extra_config.get("concurrency"), int
+        ):
+            os.environ["RUNAI_STREAMER_CONCURRENCY"] = str(
+                extra_config.get("concurrency")
+            )
+
+        if "memory_limit" in extra_config and isinstance(
+            extra_config.get("memory_limit"), int
+        ):
+            os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str(
+                extra_config.get("memory_limit")
+            )
+
+    runai_streamer_s3_endpoint = os.getenv("RUNAI_STREAMER_S3_ENDPOINT")
+    aws_endpoint_url = os.getenv("AWS_ENDPOINT_URL")
+    if runai_streamer_s3_endpoint is None and aws_endpoint_url is not None:
+        os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url
+
+
+def initialize_dummy_weights(
+    model: torch.nn.Module,
+    low: float = -1e-3,
+    high: float = 1e-3,
+    seed: int = 1234,
+) -> None:
+    """Initialize model weights with random values.
+
+    The model weights must be randomly initialized for accurate performance
+    measurements. Additionally, the model weights should not cause NaNs in the
+    forward pass. We empirically found that initializing the weights with
+    values between -1e-3 and 1e-3 works well for most models.
+
+    We use per-parameter random seed, so that dummy weights are consistent,
+    even if the model is partitioned across multiple devices. When the seed
+    is fixed, the random values generated by this function only depends on
+    the parameter's number of elements and its data type.
+    """
+    for param in model.state_dict().values():
+        if torch.is_floating_point(param):
+            generator = torch.Generator(device=param.data.device)
+            generator.manual_seed(seed)
+            if torch.finfo(param.data.dtype).bits < 16:
+                # uniform_ doesn't support < 16-bit datatypes (FP8)
+                dtype = param.data.dtype
+                tmp_param = param.data.to(torch.float16)
+                tmp_param = tmp_param.uniform_(low, high, generator=generator).to(dtype)
+                param.data.copy_(tmp_param)
+            else:
+                param.uniform_(low, high, generator=generator)
+
+
+def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
+    """Remap the name of FP8 k/v_scale parameters.
+
+    This function handles the remapping of FP8 k/v_scale parameter names.
+    It detects if the given name ends with a suffix and attempts to remap
+    it to the expected name format in the model. If the remapped name is not
+    found in the params_dict, a warning is printed and None is returned.
+
+    Args:
+        name (str): The original loaded checkpoint parameter name.
+        params_dict (dict): Dictionary containing the model's named parameters.
+
+    Returns:
+        str: The remapped parameter name if successful, or the original name
+             if no remapping is needed.
+        None: If the remapped name is not found in params_dict.
+    """
+    if name.endswith(".kv_scale"):
+        print_warning_once(
+            "DEPRECATED. Found kv_scale in the checkpoint. "
+            "This format is deprecated in favor of separate k_scale and "
+            "v_scale tensors and will be removed in a future release. "
+            "Functionally, we will remap kv_scale to k_scale and duplicate "
+            "k_scale to v_scale"
+        )
+        # NOTE: we remap the deprecated kv_scale to k_scale
+        remapped_name = name.replace(".kv_scale", ".attn.k_scale")
+        if remapped_name not in params_dict:
+            print_warning_once(
+                f"Found kv_scale in the checkpoint (e.g. {name}), "
+                "but not found the expected name in the model "
+                f"(e.g. {remapped_name}). kv_scale is "
+                "not loaded."
+            )
+            return None
+        return remapped_name
+
+    possible_scale_names = [".k_scale", ".v_scale"]
+    modelopt_scale_names = [".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale"]
+    for scale_name in possible_scale_names:
+        if name.endswith(scale_name):
+            # Check and remap the name based on modelopt scale names
+            if any(
+                modelopt_scale_name in name
+                for modelopt_scale_name in modelopt_scale_names
+            ):
+                remapped_name = name.replace(
+                    f".self_attn.{scale_name[1]}_proj{scale_name}",
+                    f".self_attn.attn{scale_name}",
+                )
+            else:
+                remapped_name = name.replace(scale_name, f".attn{scale_name}")
+            if remapped_name not in params_dict:
+                print_warning_once(
+                    f"Found {scale_name} in the checkpoint (e.g. {name}), "
+                    "but not found the expected name in the model "
+                    f"(e.g. {remapped_name}). {scale_name} is "
+                    "not loaded."
+                )
+                return None
+            return remapped_name
+
+    quark_scale_names = {
+        ".q_proj.output_scale": ".attn.q_scale",
+        ".k_proj.output_scale": ".attn.k_scale",
+        ".v_proj.output_scale": ".attn.v_scale",
+        "self_attn.prob_output_scale": ".attn.prob_scale",
+    }
+    for quark_scale_name, sglang_scale_name in quark_scale_names.items():
+        if name.endswith(quark_scale_name):
+            return name.replace(quark_scale_name, sglang_scale_name)
+
+    # If there were no matches, return the untouched param name
+    return name
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/68ad4e3a8d8a66fb2a43be57471ee13a8bec4ec0/vllm/model_executor/layers/quantization/schema.py
+class KVCacheQuantSchema(BaseModel):
+    dtype: str
+    # Each key is a TP rank. Each value is a dictionary mapping a TP rank's
+    # layer indices to their per-tensor KV cache scaling factor.
+    # TODO: Consider pulling this and its validation methods out into its
+    # own schema class (tricky as its members are variable)
+    scaling_factor: Dict[int, Dict[int, float]]
+
+    @model_validator(mode="after")
+    def check_is_fp8(self) -> "KVCacheQuantSchema":
+        assert self.dtype == "float8_e4m3fn", (
+            "Loaded scaling factors intended for KV cache dtype = "
+            f"{self.dtype} rather than float8_e4m3fn!"
+        )
+        return self
+
+    @model_validator(mode="after")
+    def check_tp_ranks(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+        context = info.context
+        if context:
+            tp_size = context["tp_size"]
+            num_hidden_layers = context["num_hidden_layers"]
+            assert len(self.scaling_factor) == tp_size, (
+                f"Loaded dictionary has TP size {len(self.scaling_factor)} "
+                f"but LLM engine is currently running with TP size {tp_size}."
+            )
+            for tp_rank, layer_maps in self.scaling_factor.items():
+                assert len(layer_maps) == num_hidden_layers, (
+                    f"KV cache scales map for TP rank {tp_rank} is malformed. "
+                    f"Expected {num_hidden_layers} layers, got "
+                    f"{len(layer_maps)}."
+                )
+            for i in range(tp_size):
+                assert (
+                    i in self.scaling_factor
+                ), f"KV cache scales map for TP rank {i} not found."
+        return self
+
+    @model_validator(mode="after")
+    def check_current_rank(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+        context = info.context
+        if context:
+            tp_rank = context["tp_rank"]
+            num_hidden_layers = context["num_hidden_layers"]
+            layer_scales_map = self.scaling_factor[tp_rank]
+            for i in range(num_hidden_layers):
+                assert i in layer_scales_map, (
+                    f"Could not find KV cache scales for layer {i} in "
+                    f"TP rank {tp_rank}."
+                )
+        return self
+
+
+class QuantParamSchema(BaseModel):
+    # TODO: Generalize and extend with more fields
+    # (e.g. weights/activations params) once functionality is enabled
+    model_config = ConfigDict(protected_namespaces=())
+    model_type: Optional[str]
+    kv_cache: KVCacheQuantSchema
+
+    @model_validator(mode="after")
+    def check_model_type(self, info: ValidationInfo) -> "QuantParamSchema":
+        context = info.context
+        if context:
+            model_type = context.get("model_type", None)
+            if model_type is not None:
+                assert model_type == self.model_type, (
+                    f"Model type is {model_type} but loaded "
+                    f"scaling factors belonging to different "
+                    f"model type {self.model_type}!"
+                )
+        return self
+
+
+def kv_cache_scales_loader(
+    filename: str,
+    tp_rank: int,
+    tp_size: int,
+    num_hidden_layers: int,
+    model_type: Optional[str],
+) -> Iterable[Tuple[int, float]]:
+    """
+    A simple utility to read in KV cache scaling factors that have been
+    previously serialized to disk. Used by the model to populate the appropriate
+    KV cache scaling factors. The serialization should represent a dictionary
+    whose keys are the TP ranks and values are another dictionary mapping layers
+    to their KV cache scaling factors.
+    """
+    try:
+        with open(filename) as f:
+            context = {
+                "model_type": model_type,
+                "num_hidden_layers": num_hidden_layers,
+                "tp_rank": tp_rank,
+                "tp_size": tp_size,
+            }
+            schema_dct = json.load(f)
+            schema = QuantParamSchema.model_validate(schema_dct, context=context)
+            layer_scales_map = schema.kv_cache.scaling_factor[tp_rank]
+            return layer_scales_map.items()
+    except FileNotFoundError:
+        logger.error("File or directory '%s' not found.", filename)
+    except json.JSONDecodeError:
+        logger.error("Error decoding JSON in file '%s'.", filename)
+    except Exception:
+        logger.error("An error occurred while reading '%s'.", filename)
+    # This section is reached if and only if any of the excepts are hit
+    # Return an empty iterable (list) => no KV cache scales are loaded
+    # which ultimately defaults to 1.0 scales
+    logger.warning(
+        "Defaulting to KV cache scaling factors = 1.0 for all "
+        "layers in TP rank %d as an error occurred during loading.",
+        tp_rank,
+    )
+    return []
+
+
+def get_actual_shard_size(shard_size, weight_start, weight_end):
+    if weight_end < weight_start:
+        return 0
+
+    return min(shard_size, weight_end - weight_start)
+
+
+def reset_param_data_if_needed(param_data, dim, start, length):
+    if length == 0:
+        return
+
+    assert length > 0, f"Length should be positive, but got {length}"
+
+    param_data.narrow(dim, start, length).zero_()
+    return
+
+
+def narrow_padded_param_and_loaded_weight(
+    param_data,
+    loaded_weight,
+    param_data_start,
+    weight_start,
+    dim,
+    shard_size,
+    narrow_weight=True,
+):
+    actual_shard_size = get_actual_shard_size(
+        shard_size, weight_start, loaded_weight.size(dim)
+    )
+
+    if narrow_weight:
+        if actual_shard_size > 0:
+            loaded_weight = loaded_weight.narrow(dim, weight_start, actual_shard_size)
+        else:
+            # No real data to load; create a dummy tensor filled with zeros
+            loaded_weight = torch.zeros_like(
+                param_data.narrow(dim, param_data_start, actual_shard_size)
+            )
+
+    # [Note] Reset padded weights to zero.
+    # If the actual shard size is less than the shard size, we need to reset
+    # the padded param_data to zero and then copy the loaded_weight into it.
+    reset_param_data_if_needed(
+        param_data,
+        dim,
+        param_data_start + actual_shard_size,
+        shard_size - actual_shard_size,
+    )
+
+    param_data = param_data.narrow(dim, param_data_start, actual_shard_size)
+
+    return param_data, loaded_weight
diff --git a/python/sglang/srt/models/arcee.py b/python/sglang/srt/models/arcee.py
new file mode 100644
index 000000000..f9ebfe19a
--- /dev/null
+++ b/python/sglang/srt/models/arcee.py
@@ -0,0 +1,532 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Arcee Foundational Model (AFM) compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+class ArceeMLP(nn.Module):
+    """
+    MLP block for the Arcee model, using a ReLU-squared activation function.
+    This differs from the Llama SwiGLU activation.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        # Arcee uses a single up-projection, not a merged gate/up projection.
+        self.up_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "relu2":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Arcee model in SGLang only supports 'relu2'."
+            )
+        # The activation function is relu(x)^2
+        self.act_fn = get_act_fn("relu2")
+
+    def forward(self, x, forward_batch=None):
+        x, _ = self.up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class ArceeAttention(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = getattr(config, "head_dim", None)
+        if self.head_dim is None:
+            self.head_dim = self.hidden_size // self.total_num_heads
+        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(self.partial_rotary_factor * self.head_dim)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class ArceeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        self.self_attn = ArceeAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=attention_bias,
+        )
+        self.mlp = ArceeMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class ArceeModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: ArceeDecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling factor attribute!"
+                )
+
+
+class ArceeForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        # Note: gate_proj is removed compared to Llama
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        # Note: gate_proj and up_proj are removed as they are not stacked in ArceeMLP
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+    }
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
+        # Arcee does not tie word embeddings
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        # Parameters that are stacked in a single tensor in this model
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+        ]
+        self.capture_aux_hidden_states = False
+
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return ArceeModel(config, quant_config=quant_config, prefix=prefix)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                continue
+
+            # Handle FP8 kv-scale remapping
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            is_stacked = False
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                is_stacked = True
+                break
+
+            if not is_stacked:
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in model.")
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = [ArceeForCausalLM]
diff --git a/python/sglang/srt/models/baichuan.py b/python/sglang/srt/models/baichuan.py
new file mode 100644
index 000000000..6e8d3b4a3
--- /dev/null
+++ b/python/sglang/srt/models/baichuan.py
@@ -0,0 +1,441 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only BaiChuan model compatible with HuggingFace weights."""
+import math
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
+        dtype=torch.float32,
+    )
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(
+            closest_power_of_2, total_num_heads - closest_power_of_2
+        )
+        extra_powers = torch.arange(
+            start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32
+        )
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes
+
+
+class BaiChuanMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class BaiChuanAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        position_embedding: str,
+        rope_theta: float = 10000,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = 0,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = hidden_size // self.total_num_heads
+        self.postion_embedding = position_embedding
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.total_num_kv_heads = self.num_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        # pylint: disable=invalid-name
+        self.W_pack = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_heads,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+        # Create the alibi slopes and slice them.
+        if self.postion_embedding == "ALIBI":
+            tp_rank = get_tensor_model_parallel_rank()
+            head_start = tp_rank * self.num_heads
+            head_end = (tp_rank + 1) * self.num_heads
+            alibi_slopes = _get_alibi_slopes(self.total_num_heads)
+            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+
+            scaling = self.head_dim**-0.5
+            self.attn = RadixAttention(
+                self.num_heads,
+                self.head_dim,
+                scaling,
+                num_kv_heads=self.num_kv_heads,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("attn", prefix),
+            )
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+            self.scaling = self.head_dim**-0.5
+            self.attn = RadixAttention(
+                self.num_heads,
+                self.head_dim,
+                self.scaling,
+                num_kv_heads=self.num_kv_heads,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("attn", prefix),
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.W_pack(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        if self.postion_embedding != "ALIBI":
+            q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class BaiChuanDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        position_embedding: str,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = BaiChuanAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            position_embedding=position_embedding,
+            rope_theta=rope_theta,
+            layer_id=layer_id,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = BaiChuanMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class BaiChuanModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        position_embedding: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                BaiChuanDecoderLayer(
+                    config,
+                    layer_id=i,
+                    position_embedding=position_embedding,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class BaiChuanBaseForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "W_pack": ["W_pack"],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "W_pack",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        position_embedding: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.quant_config = quant_config
+        self.model = BaiChuanModel(
+            config, position_embedding, quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name == "lm_head.weight":
+                # Unlike Baichuan, Baichuan2 normalizes the head weights.
+                # Refer to:
+                # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508
+                # Distinguish between Baichuan and Baichuan2 by checking the
+                # vocab size. This is suggested by
+                # https://github.com/vllm-project/vllm/pull/1022#discussion_r1325652704
+                is_baichuan2 = self.config.vocab_size == 125696
+                if is_baichuan2:
+                    loaded_weight = torch.nn.functional.normalize(loaded_weight)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
+    """Baichuan 13B and Baichuan2 7B/13B."""
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        if config.hidden_size == 4096:  # baichuan2 7b
+            super().__init__(config, "ROPE", quant_config, prefix=prefix)
+        else:  # baichuan 13b, baichuan2 13b
+            super().__init__(config, "ALIBI", quant_config, prefix=prefix)
+
+
+EntryClass = [BaichuanForCausalLM]
diff --git a/python/sglang/srt/models/bailing_moe.py b/python/sglang/srt/models/bailing_moe.py
new file mode 100644
index 000000000..73e5a9a16
--- /dev/null
+++ b/python/sglang/srt/models/bailing_moe.py
@@ -0,0 +1,425 @@
+# Copyright 2023-2024 SGLang Team
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/bailing_moe.py
+
+from collections.abc import Iterable
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class BailingAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = config.num_attention_heads
+        self.total_num_kv_heads = config.num_key_value_heads
+
+        assert self.total_num_heads % tp_size == 0
+        assert self.total_num_kv_heads % tp_size == 0
+
+        self.num_heads = self.total_num_heads // tp_size
+        self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+
+        self.num_kv_heads = self.total_num_kv_heads // tp_size
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scale = self.head_dim**-0.5
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=(config.use_bias or config.use_qkv_bias),
+            quant_config=quant_config,
+            prefix=add_prefix("query_key_value", prefix),
+        )
+
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("dense", prefix),
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=config.max_position_embeddings,
+            base=config.rope_theta,
+            is_neox_style=True,
+            rope_scaling=config.rope_scaling,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q, k = self.rotary_emb(position_ids, q, k)
+        context_layer = self.attn(q, k, v, forward_batch)
+        attn_output, _ = self.dense(context_layer)
+        return attn_output
+
+
+class BailingMLP(nn.Module):
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: Optional[bool] = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [intermediate_size] * 2,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            config.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        x, _ = self.gate_up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class BailingMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+        self.hidden_size = config.hidden_size
+        self.num_shared_experts = config.num_shared_experts
+        self.norm_expert_prob = config.norm_topk_prob
+        self.moe_intermediate_size = config.moe_intermediate_size
+
+        self.gate = ReplicatedLinear(
+            self.hidden_size, self.num_experts, bias=False, quant_config=None
+        )
+
+        self.topk = TopK(top_k=self.top_k, renormalize=self.norm_expert_prob)
+
+        self.experts = FusedMoE(
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            layer_id=layer_id,
+            hidden_size=self.hidden_size,
+            intermediate_size=self.moe_intermediate_size,
+            reduce_results=False,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        if self.num_shared_experts > 0:
+            shared_intermediate_size = (
+                self.moe_intermediate_size * self.num_shared_experts
+            )
+            self.shared_experts = BailingMLP(
+                intermediate_size=shared_intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+        else:
+            self.shared_experts = None
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        orig_shape = hidden_states.shape
+        hidden_states_flat = hidden_states.view(-1, self.hidden_size)
+
+        shared_output = None
+        if self.shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states_flat)
+
+        router_logits, _ = self.gate(hidden_states_flat)
+        topk_output = self.topk(hidden_states_flat, router_logits)
+        final_hidden_states = self.experts(hidden_states_flat, topk_output)
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(orig_shape)
+
+
+class BailingMoeBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.attention = BailingAttention(
+            config, layer_id, quant_config, prefix=add_prefix("attention", prefix)
+        )
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.mlp = BailingMoE(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Pre-normalization and residual connection for the attention block
+        if residual is None:
+            residual = hidden_states
+            normed_hidden_states = self.input_layernorm(hidden_states)
+        else:
+            normed_hidden_states, residual = self.input_layernorm(
+                hidden_states, residual
+            )
+
+        attn_output = self.attention(
+            hidden_states=normed_hidden_states,
+            position_ids=position_ids,
+            forward_batch=forward_batch,
+        )
+
+        # Pre-normalization and residual connection for the MLP block
+        normed_hidden_states, residual = self.post_attention_layernorm(
+            attn_output, residual
+        )
+        mlp_output = self.mlp(normed_hidden_states)
+
+        return mlp_output, residual
+
+
+class BailingMoeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_dim = config.hidden_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.embedding_dropout = torch.nn.Dropout(config.embedding_dropout)
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: BailingMoeBlock(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+
+        self.norm = RMSNorm(self.embed_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                hidden_states,
+                position_ids,
+                residual,
+                forward_batch,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class BailingMoeForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.model = BailingMoeModel(config=config, quant_config=quant_config)
+        self.lm_head = ParallelLMHead(
+            num_embeddings=config.vocab_size,
+            embedding_dim=config.hidden_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        stacked_params_mapping = [
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+
+            if (
+                hasattr(self.config, "norm_head")
+                and self.config.norm_head
+                and "lm_head.weight" in name
+            ):
+                loaded_weight = F.normalize(loaded_weight, dim=0, p=2, eps=1e-7)
+
+            if "model.word_embeddings.weight" == name:
+                name = "model.embed_tokens.weight"
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name in name and "mlp.experts" not in name:
+                    full_param_name = name.replace(weight_name, param_name)
+                    param = params_dict[full_param_name]
+                    param.weight_loader(param, loaded_weight, shard_id)
+                    break
+            else:
+                for p_name, w_name, e_id, s_id in expert_params_mapping:
+                    if w_name in name and "mlp.experts" in name:
+                        full_param_name = name.replace(w_name, p_name)
+                        param = params_dict[full_param_name]
+                        param.weight_loader(
+                            param,
+                            loaded_weight,
+                            full_param_name,
+                            shard_id=s_id,
+                            expert_id=e_id,
+                        )
+                        break
+                else:
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = BailingMoeForCausalLM
diff --git a/python/sglang/srt/models/bert.py b/python/sglang/srt/models/bert.py
new file mode 100644
index 000000000..d7f3301c6
--- /dev/null
+++ b/python/sglang/srt/models/bert.py
@@ -0,0 +1,498 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Dict, Iterable, Optional, Set, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.pooler import CrossEncodingPooler, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import AttentionType, RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+BertConfig = None
+
+
+class BertEmbedding(nn.Module):
+
+    def __init__(self, config: BertConfig):
+
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.position_embeddings = VocabParallelEmbedding(
+            config.max_position_embeddings, config.hidden_size
+        )
+        self.token_type_embeddings = VocabParallelEmbedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+
+        # Input embeddings.
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(positions)
+
+        token_type_ids = forward_batch.token_type_ids
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=inputs_embeds.device
+            )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class BertPooler(nn.Module):
+
+    def __init__(self, config: BertConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        # simply taking the hidden state corresponding
+        first_token_tensor = hidden_states[0, :]
+
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+
+        return pooled_output
+
+
+class BertEncoder(nn.Module):
+
+    def __init__(
+        self,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.layer = nn.ModuleList(
+            [
+                BertLayer(
+                    config=config,
+                    layer_id=layer_idx,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layer.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        for layer in self.layer:
+            hidden_states = layer(hidden_states, forward_batch)
+        return hidden_states
+
+
+class BertLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: BertConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.layer_id = layer_id
+
+        self.attention = BertAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            layer_id=layer_id,
+            layer_norm_eps=config.layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attention",
+        )
+
+        self.intermediate = BertIntermediate(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.intermediate",
+        )
+
+        self.output = BertOutput(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_norm_eps=config.layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+    def forward(self, hidden_states: torch.Tensor, forward_batch: ForwardBatch):
+        attn_output = self.attention(hidden_states, forward_batch)
+        intermediate_output = self.intermediate(attn_output)
+        output = self.output(intermediate_output, attn_output)
+
+        return output
+
+
+class BertAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        layer_norm_eps: float,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.self_attn = BertSelfAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+        self.output = BertSelfOutput(
+            hidden_size=hidden_size,
+            layer_norm_eps=layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        self_output = self.self_attn(hidden_states, forward_batch)
+        return self.output(self_output, hidden_states)
+
+
+class BertSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = self.total_num_heads
+        self.head_dim = self.hidden_size // self.total_num_heads
+        assert self.head_dim * self.total_num_heads == self.hidden_size
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.attn = RadixAttention(
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+            scaling=self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+            attn_type=AttentionType.ENCODER_ONLY,
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        output = self.attn(q, k, v, forward_batch)
+        return output
+
+
+class BertSelfOutput(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        layer_norm_eps: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.dense = RowParallelLinear(
+            input_size=hidden_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertIntermediate(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.dense = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+        self.intermediate_act_fn = get_act_fn(hidden_act)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class BertOutput(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_norm_eps: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.dense = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertModel(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_bert_pooler: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.use_bert_pooler = use_bert_pooler
+        self.config = config
+        self.embeddings = BertEmbedding(config)
+        self.encoder = BertEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+        self.pooler = (
+            BertPooler(config)
+            if self.use_bert_pooler
+            else Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+        # Your tokenized IDs
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+
+        if not self.use_bert_pooler:
+            hidden_states = self.pooler(hidden_states, forward_batch)
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if not self.use_bert_pooler and "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class Contriever(BertModel):
+    pass
+
+
+class BertForSequenceClassification(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.num_labels = config.num_labels
+        self.bert = BertModel(
+            config=config,
+            quant_config=quant_config,
+            use_bert_pooler=True,
+            prefix=add_prefix("bert", prefix),
+        )
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.pooler = CrossEncodingPooler(config, self.classifier, self.bert.pooler)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self_weights = []
+
+        def weight_filter():
+            for name, weight in weights:
+                if name.startswith("bert."):
+                    yield (name[len("bert.") :], weight)
+                else:
+                    self_weights.append((name, weight))
+
+        self.bert.load_weights(weight_filter())
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in self_weights:
+            if name.startswith("classifier"):
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+
+        hidden_states = self.bert(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+            get_embedding=get_embedding,
+        )
+        return self.pooler(hidden_states, forward_batch)
+
+
+EntryClass = [BertModel, Contriever, BertForSequenceClassification]
diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py
new file mode 100644
index 000000000..9cf585a02
--- /dev/null
+++ b/python/sglang/srt/models/chatglm.py
@@ -0,0 +1,427 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+"""Inference-only ChatGLM model compatible with THUDM weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn import LayerNorm
+
+from sglang.srt.configs import ChatGLMConfig
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+LoraConfig = None
+
+
+class GLMAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.multi_query_attention = config.multi_query_attention
+        self.total_num_kv_heads = (
+            config.multi_query_group_num
+            if config.multi_query_attention
+            else config.num_attention_heads
+        )
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.add_bias_linear or config.add_qkv_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("query_key_value", prefix),
+        )
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=add_prefix("dense", prefix),
+        )
+
+        # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
+        rope_ratio = getattr(config, "rope_ratio", 1.0)
+        max_positions = getattr(config, "seq_length", 8192)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim // 2,
+            max_position=max_positions,
+            base=10000 * rope_ratio,
+            is_neox_style=False,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        context_layer = self.attn(
+            q,
+            k,
+            v,
+            forward_batch,
+        )
+        attn_output, _ = self.dense(context_layer)
+        return attn_output
+
+
+class GLMMLP(nn.Module):
+    """MLP.
+
+    MLP will take the input with h hidden state, project it to 4*h
+    hidden dimension, perform nonlinear transformation, and project the
+    state back into h hidden dimension.
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.add_bias = config.add_bias_linear
+
+        # Project to 4h.
+        self.dense_h_to_4h = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.ffn_hidden_size] * 2,
+            bias=config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=add_prefix("dense_h_to_4h", prefix),
+        )
+
+        self.activation_func = SiluAndMul()
+
+        # Project back to h.
+        self.dense_4h_to_h = RowParallelLinear(
+            config.ffn_hidden_size,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=add_prefix("dense_4h_to_h", prefix),
+        )
+
+    def forward(self, hidden_states):
+        # [s, b, 4hp]
+        intermediate_parallel, _ = self.dense_h_to_4h(hidden_states)
+        intermediate_parallel = self.activation_func(intermediate_parallel)
+        # [s, b, h]
+        output, _ = self.dense_4h_to_h(intermediate_parallel)
+        return output
+
+
+class GLMBlock(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.apply_residual_connection_post_layernorm = (
+            config.apply_residual_connection_post_layernorm
+        )
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+
+        layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
+        # Layernorm on the input data.
+        self.input_layernorm = layer_norm_func(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
+
+        # Self attention.
+        self.self_attention = GLMAttention(
+            config, layer_id, quant_config, prefix=add_prefix("self_attention", prefix)
+        )
+        self.hidden_dropout = config.hidden_dropout
+
+        # Layernorm on the attention output
+        self.post_attention_layernorm = layer_norm_func(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
+
+        # MLP
+        self.mlp = GLMMLP(config, quant_config, prefix=add_prefix("mlp", prefix))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # hidden_states: [num_tokens, h]
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output = self.self_attention(
+            hidden_states=layernorm_output,
+            position_ids=position_ids,
+            forward_batch=forward_batch,
+        )
+
+        # Residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        layernorm_input = residual + attention_output
+
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+
+        # Second residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = layernorm_input
+
+        output = self.mlp(layernorm_output) + residual
+
+        return output
+
+
+class GLMTransformer(nn.Module):
+    """Transformer class."""
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.post_layer_norm = config.post_layer_norm
+
+        # Number of layers.
+        self.num_layers = config.num_layers
+
+        # Transformer layers.
+        self.layers = nn.ModuleList(
+            [
+                GLMBlock(
+                    config,
+                    i,
+                    quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(self.num_layers)
+            ]
+        )
+
+        if self.post_layer_norm:
+            layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
+            # Final layer norm before output.
+            self.final_layernorm = layer_norm_func(
+                config.hidden_size, eps=config.layernorm_epsilon
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            hidden_states = layer(
+                hidden_states=hidden_states,
+                position_ids=position_ids,
+                forward_batch=forward_batch,
+            )
+        # Final layer norm.
+        if self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states
+
+
+class ChatGLMM(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.embedding = VocabParallelEmbedding(
+            config.padded_vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embedding", prefix),
+        )
+
+        self.num_layers = config.num_layers
+        self.multi_query_group_num = config.multi_query_group_num
+        self.kv_channels = config.kv_channels
+        self.encoder = GLMTransformer(
+            config, quant_config, add_prefix("encoder", prefix)
+        )
+
+        self.output_layer = ParallelLMHead(
+            config.padded_vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("output_layer", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        inputs_embeds = self.embedding(input_ids)
+
+        # Run encoder.
+        hidden_states = self.encoder(
+            hidden_states=inputs_embeds,
+            position_ids=position_ids,
+            forward_batch=forward_batch,
+        )
+        return hidden_states
+
+
+class ChatGLMForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "query_key_value": ["query_key_value"],
+        "dense_h_to_4h": ["dense_h_to_4h"],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "query_key_value",
+        "dense",
+        "dense_h_to_4h",
+        "dense_4h_to_h",
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: ChatGLMConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config: ChatGLMConfig = config
+        self.quant_config = quant_config
+        self.max_position_embeddings = getattr(config, "max_sequence_length", 8192)
+        self.transformer = ChatGLMM(
+            config, quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = self.transformer.output_layer
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_pos_emb.inv_freq" in name:
+                continue
+            if "word_embeddings" in name:
+                name = name.replace(".word_embeddings", "")
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+class ChatGLMModel(ChatGLMForCausalLM):
+    pass
+
+
+EntryClass = [ChatGLMModel]
diff --git a/python/sglang/srt/models/clip.py b/python/sglang/srt/models/clip.py
new file mode 100644
index 000000000..ea9fee9ac
--- /dev/null
+++ b/python/sglang/srt/models/clip.py
@@ -0,0 +1,572 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/af9b2eaa54c150741f298d6db939af6328e1dc38/src/transformers/models/clip/modeling_clip.py
+
+from functools import partial
+from typing import Iterable, List, Optional, Tuple, Type, Union
+
+import torch
+import torch.nn as nn
+from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+from transformers.modeling_attn_mask_utils import _create_4d_causal_attention_mask
+
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.model_executor.model_runner import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, flatten_nested_list
+
+
+class CLIPVisionEmbeddings(nn.Module):
+
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        assert self.image_size % self.patch_size == 0
+
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+
+        return embeddings
+
+
+class CLIPTextEmbeddings(nn.Module):
+    def __init__(self, config: CLIPTextConfig):
+        super().__init__()
+        embed_dim = config.hidden_size
+
+        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
+        self.position_embedding = nn.Embedding(
+            config.max_position_embeddings, embed_dim
+        )
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids",
+            torch.arange(config.max_position_embeddings).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        seq_length = (
+            input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+        )
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+
+        return embeddings
+
+
+class CLIPMLP(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+class CLIPEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        attn_implementation: Optional[str] = "sdpa",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layer_norm1 = norm_layer(config.hidden_size)
+        self.layer_norm2 = norm_layer(config.hidden_size)
+        if attn_implementation == "sdpa":
+            qkv_backend = "sdpa"
+            softmax_in_single_precision = False
+        elif attn_implementation == "flash_attention_2":
+            qkv_backend = "triton_attn"
+            softmax_in_single_precision = False
+        elif attn_implementation == "eager":
+            qkv_backend = "sdpa"
+            softmax_in_single_precision = True
+        self.self_attn = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            use_qkv_parallel=True,
+            qkv_backend=qkv_backend,
+            softmax_in_single_precision=softmax_in_single_precision,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = CLIPMLP(
+            config,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        # CLIP text model uses both `causal_attention_mask` and `attention_mask`
+        if attention_mask is not None and causal_attention_mask is not None:
+            attn_mask = attention_mask + causal_attention_mask
+        elif causal_attention_mask is not None:
+            attn_mask = causal_attention_mask
+        else:
+            attn_mask = attention_mask
+        hidden_states = self.self_attn(
+            hidden_states,
+            attention_mask=attn_mask,
+            # causal_attention_mask=causal_attention_mask,
+        )
+
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class CLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self
+    attention layers. Each layer is a [`CLIPEncoderLayer`].
+
+    Args:
+        config: CLIPConfig
+    """
+
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        num_hidden_layers = config.num_hidden_layers
+        norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layers = nn.ModuleList(
+            [
+                CLIPEncoderLayer(
+                    config=config,
+                    norm_layer=norm_layer,
+                    attn_implementation="sdpa",
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_idx}", prefix),
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        causal_attention_mask: torch.Tensor = None,
+        return_all_hidden_states: bool = False,
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        hidden_states_pool = [inputs_embeds]
+        hidden_states = inputs_embeds
+
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states, attention_mask, causal_attention_mask
+            )
+            if return_all_hidden_states:
+                hidden_states_pool.append(hidden_states)
+        if return_all_hidden_states:
+            return hidden_states_pool
+        return hidden_states
+
+
+class CLIPTextTransformer(nn.Module):
+    def __init__(
+        self,
+        config: CLIPTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = CLIPTextEmbeddings(config)
+        self.encoder = CLIPEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @property
+    def device(self) -> torch.device:
+        return self.encoder.layers[0].layer_norm1.weight.device
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+        hidden_states = self.embeddings(input_ids, position_ids)
+        causal_attention_mask = _create_4d_causal_attention_mask(
+            input_ids.shape, hidden_states.dtype, device=hidden_states.device
+        )
+        encoder_outputs = self.encoder(
+            hidden_states, attention_mask, causal_attention_mask
+        )
+        last_hidden_state = self.final_layer_norm(encoder_outputs)
+        return last_hidden_state
+
+
+class CLIPTextModel(nn.Module):
+    def __init__(
+        self,
+        config: CLIPTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.text_model = CLIPTextTransformer(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("text_model", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+    ):
+        return self.text_model(input_ids, position_ids)
+
+
+class CLIPVisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = CLIPVisionEmbeddings(config)
+
+        # NOTE: This typo of "layrnorm" is not fixed on purpose to match
+        # the original transformers code and name of the model weights.
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.encoder = CLIPEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.encoder.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.encoder.layers)} layers."
+            )
+
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @property
+    def device(self) -> torch.device:
+        return self.encoder.layers[0].layer_norm1.weight.device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(pixel_values.to(self.device))
+        hidden_states = self.pre_layrnorm(hidden_states)
+
+        return_all_hidden_states = False
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            return_all_hidden_states=return_all_hidden_states,
+        )
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        return last_hidden_state
+
+
+class CLIPVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.vision_model = CLIPVisionTransformer(
+            config, quant_config, prefix=add_prefix("vision_model", prefix)
+        )
+
+    @property
+    def device(self) -> torch.device:
+        return self.vision_model.device
+
+    def forward(self, pixel_values: torch.Tensor):
+        return self.vision_model(pixel_values)
+
+
+class CLIPModel(nn.Module):
+    def __init__(
+        self,
+        config: CLIPConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        if not isinstance(config.text_config, CLIPTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type CLIPTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPVisionConfig):
+            raise TypeError(
+                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+        self.visual_projection = nn.Linear(
+            self.vision_embed_dim, self.projection_dim, bias=False
+        )
+        self.text_projection = nn.Linear(
+            self.text_embed_dim, self.projection_dim, bias=False
+        )
+        self.logit_scale = nn.Parameter(
+            torch.tensor(self.config.logit_scale_init_value)
+        )
+
+        text_model = CLIPTextModel(
+            text_config, quant_config, prefix=add_prefix("text_model", prefix)
+        )
+        vision_model = CLIPVisionModel(
+            vision_config, quant_config, prefix=add_prefix("vision_model", prefix)
+        )
+        self.text_model = text_model.text_model
+        self.vision_model = vision_model.vision_model
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        monkey_patch_weight_loader()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = True,
+    ):
+        assert get_embedding, "CLIPEmbeddingModel is only used for embedding"
+        mm_inputs = []
+        if forward_batch.mm_inputs is not None:
+            mm_inputs = forward_batch.mm_inputs
+        pixel_values_list = [
+            item.feature
+            for item in flatten_nested_list(
+                [mm_input.mm_items for mm_input in mm_inputs if mm_input is not None]
+            )
+        ]
+        if len(pixel_values_list) != 0:
+            pixel_values = torch.concat(pixel_values_list)
+            vision_outputs = self.vision_model(pixel_values)
+            pooled_output = vision_outputs[:, 0, :]
+            image_embeds = self.visual_projection(pooled_output)
+            image_embeds = nn.functional.normalize(image_embeds, p=2, dim=1)
+            return EmbeddingPoolerOutput(embeddings=image_embeds)
+
+        else:
+            text_outputs = self.text_model(input_ids, position_ids=positions)
+            pooled_output = self.pooler(text_outputs[0], forward_batch)
+            return EmbeddingPoolerOutput(
+                embeddings=self.text_projection(pooled_output.embeddings)
+            )
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Clip embeddings models handle text/image separately, so we don't need to pad input ids
+        return input_ids
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "position_ids" in name:
+                continue
+            if "out_proj" in name:
+                name = name.replace("out_proj", "proj")
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+# monkey patch weight loader to remove open_clip file
+def monkey_patch_weight_loader():
+    import glob
+    import os
+
+    from sglang.srt.model_loader.loader import DefaultModelLoader
+    from sglang.srt.model_loader.weight_utils import (
+        download_weights_from_hf,
+        filter_files_not_needed_for_inference,
+    )
+
+    def prepare_weights(
+        self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+    ) -> Tuple[str, List[str], bool]:
+        model_name_or_path = (
+            self._maybe_download_from_modelscope(model_name_or_path, revision)
+            or model_name_or_path
+        )
+
+        is_local = os.path.isdir(model_name_or_path)
+        use_safetensors = False
+        allow_patterns = ["*.bin"]
+
+        if not is_local:
+            hf_folder = download_weights_from_hf(
+                model_name_or_path,
+                self.load_config.download_dir,
+                allow_patterns,
+                revision,
+                ignore_patterns=self.load_config.ignore_patterns,
+            )
+        else:
+            hf_folder = model_name_or_path
+
+        hf_weights_files: List[str] = []
+        for pattern in allow_patterns:
+            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+
+        hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
+
+        # remove open_clip file
+        hf_weights_files = [
+            file for file in hf_weights_files if "open_clip" not in file
+        ]
+
+        if len(hf_weights_files) == 0:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
+
+        return hf_folder, hf_weights_files, use_safetensors
+
+    setattr(DefaultModelLoader, "_prepare_weights", prepare_weights)
+
+
+EntryClass = CLIPModel
diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py
new file mode 100644
index 000000000..ebbf8ed64
--- /dev/null
+++ b/python/sglang/srt/models/commandr.py
@@ -0,0 +1,418 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/commandr.py#L1
+
+# This file is based on the LLama model definition file in transformers
+"""PyTorch Cohere model."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn.parameter import Parameter
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, get_compiler_backend, set_weight_attrs
+
+
+@torch.compile(backend=get_compiler_backend())
+def layer_norm_func(hidden_states, weight, variance_epsilon):
+    input_dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+    mean = hidden_states.mean(-1, keepdim=True)
+    variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
+    hidden_states = (hidden_states - mean) * torch.rsqrt(variance + variance_epsilon)
+    hidden_states = weight.to(torch.float32) * hidden_states
+    return hidden_states.to(input_dtype)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, param_shape=None, eps=1e-5):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(param_shape))
+        self.variance_epsilon = eps
+        set_weight_attrs(self.weight, {"weight_loader": self.weight_loader})
+
+    def forward(self, hidden_states, residuals=None):
+        hidden_states = layer_norm_func(
+            hidden_states, self.weight, self.variance_epsilon
+        )
+        return hidden_states, residuals
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        tp_rank = get_tensor_model_parallel_rank()
+        shard_dim = 0 if param.dim() != 1 else None
+        param_data = param.data
+        if shard_dim is not None:
+            shard_size = param_data.shape[shard_dim]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(shard_dim, start_idx, shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere
+class CohereMLP(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class CohereAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        tp_size = get_tensor_model_parallel_world_size()
+        self.config = config
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = getattr(
+            config, "model_max_length", None
+        ) or getattr(config, "max_position_embeddings", 8192)
+        self.rope_theta = config.rope_theta
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            is_neox_style=False,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        if self.use_qk_norm:
+            self.q_norm = LayerNorm(
+                param_shape=(self.num_heads, self.head_dim), eps=config.layer_norm_eps
+            )
+            self.k_norm = LayerNorm(
+                param_shape=(self.num_kv_heads, self.head_dim),
+                eps=config.layer_norm_eps,
+            )
+
+    def _apply_qk_norm(self, q, k):
+        q = q.view(*q.shape[:-1], -1, self.head_dim)
+        k = k.view(*k.shape[:-1], -1, self.head_dim)
+        q, _ = self.q_norm(q)
+        k, _ = self.k_norm(k)
+        q = q.view(*q.shape[:-2], -1)
+        k = k.view(*k.shape[:-2], -1)
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class CohereDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = CohereAttention(
+            config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.mlp = CohereMLP(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = LayerNorm(
+            param_shape=(config.hidden_size), eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states_attention = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states_mlp = self.mlp(hidden_states)
+        # Add everything together
+        hidden_states = residual + hidden_states_attention + hidden_states_mlp
+
+        return hidden_states, residual
+
+
+class CohereModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.layers = nn.ModuleList(
+            [
+                CohereDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = LayerNorm(
+            param_shape=(config.hidden_size), eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class CohereForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.logits_processor = LogitsProcessor(config)
+        self.model = CohereModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+
+class Cohere2ForCausalLM(CohereForCausalLM):
+    pass
+
+
+EntryClass = [CohereForCausalLM, Cohere2ForCausalLM]
diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py
new file mode 100644
index 000000000..74de384b3
--- /dev/null
+++ b/python/sglang/srt/models/dbrx.py
@@ -0,0 +1,460 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/dbrx.py#L1
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs import DbrxConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, set_weight_attrs
+
+
+class DbrxRouter(nn.Module):
+    """A Router implementation for DBRX that returns logits for each expert
+    per token.
+    """
+
+    def __init__(
+        self,
+        config: DbrxConfig,
+        params_dtype: Optional[torch.dtype] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_total_experts = config.ffn_config.moe_num_experts
+        self.d_model = config.d_model
+        self.layer = ReplicatedLinear(
+            self.d_model,
+            self.num_total_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        router_logits, _ = self.layer(hidden_states)
+        return router_logits
+
+
+class DbrxExperts(nn.Module):
+    """A tensor-parallel MoE implementation for DBRX.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        config: DbrxConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: Optional[torch.dtype] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_total_experts = config.ffn_config.moe_num_experts
+        self.top_k = config.ffn_config.moe_top_k
+        self.d_model = config.d_model
+        self.intermediate_size = config.ffn_config.ffn_hidden_size // self.tp_size
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+
+        self.router = DbrxRouter(config, self.params_dtype)
+        self.topk = TopK(
+            self.top_k,
+            renormalize=True,
+        )
+        self.moe_runner_config = MoeRunnerConfig(inplace=True)
+        self.ws = nn.Parameter(
+            torch.empty(
+                self.num_total_experts,
+                2 * self.intermediate_size,
+                self.d_model,
+                device="cuda",
+                dtype=self.params_dtype,
+            )
+        )
+        self.w2s = nn.Parameter(
+            torch.empty(
+                self.num_total_experts,
+                self.d_model,
+                self.intermediate_size,
+                device="cuda",
+                dtype=self.params_dtype,
+            )
+        )
+
+        set_weight_attrs(
+            self.ws,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+        set_weight_attrs(
+            self.w2s,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+
+    def weight_loader(
+        self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str
+    ):
+        tp_rank = get_tensor_model_parallel_rank()
+        param_data = param.data
+        shard_size = self.intermediate_size
+        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+        # DBRX uses GLU for each experts.
+        # GLU has 3 linear layers: w1, v1 and w2.
+        if weight_name.endswith("w1"):
+            loaded_weight = torch.reshape(
+                loaded_weight,
+                [-1, self.intermediate_size * self.tp_size, self.d_model],
+            )
+            param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]
+        if weight_name.endswith("v1"):
+            loaded_weight = torch.reshape(
+                loaded_weight,
+                [-1, self.intermediate_size * self.tp_size, self.d_model],
+            )
+            param_data[:, shard_size : 2 * shard_size, :] = loaded_weight[:, shard, :]
+        if weight_name.endswith("w2"):
+            loaded_weight = torch.reshape(
+                loaded_weight,
+                [-1, self.intermediate_size * self.tp_size, self.d_model],
+            ).transpose(1, 2)
+            param_data[:] = loaded_weight[:, :, shard]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.d_model)
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.router(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = fused_moe(
+            hidden_states,
+            self.ws,
+            self.w2s,
+            topk_output,
+            self.moe_runner_config,
+        )
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+
+class DbrxAttention(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.d_model = config.d_model
+        self.total_num_heads = config.n_heads
+        self.head_dim = self.d_model // self.total_num_heads
+        self.total_num_kv_heads = config.attn_config.kv_n_heads
+        self.clip_qkv = config.attn_config.clip_qkv
+        self.rope_theta = config.attn_config.rope_theta
+        self.max_position = config.max_seq_len
+
+        # pylint: disable=invalid-name
+        self.Wqkv = QKVParallelLinear(
+            self.d_model,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("Wqkv", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            self.d_model,
+            self.d_model,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+
+        tp_world_size = get_tensor_model_parallel_world_size()
+        self.tp_size = tp_world_size
+        assert self.total_num_heads % tp_world_size == 0
+        self.num_heads = self.total_num_heads // tp_world_size
+        if self.total_num_kv_heads >= tp_world_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_world_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_world_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.Wqkv(hidden_states)
+        if self.clip_qkv is not None:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        hidden_states, _ = self.out_proj(attn_output)
+        return hidden_states
+
+
+class DbrxFusedNormAttention(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.d_model = config.d_model
+        self.attn = DbrxAttention(
+            config,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.norm_1 = nn.LayerNorm(self.d_model)
+        self.norm_2 = nn.LayerNorm(self.d_model)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.norm_1(hidden_states)
+        x = self.attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + x
+        residual = hidden_states
+        hidden_states = self.norm_2(hidden_states)
+        return hidden_states, residual
+
+
+class DbrxBlock(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.norm_attn_norm = DbrxFusedNormAttention(
+            config,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("norm_attn_norm", prefix),
+        )
+        self.ffn = DbrxExperts(config, quant_config=quant_config)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states, residual = self.norm_attn_norm(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = hidden_states + residual
+        return hidden_states
+
+
+class DbrxModel(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            config.d_model,
+        )
+        self.blocks = nn.ModuleList(
+            [
+                DbrxBlock(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{i}", prefix),
+                )
+                for i in range(config.n_layers)
+            ]
+        )
+        self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5)
+        for module in self.modules():
+            if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
+                # Remove the bias term in Linear and LayerNorm.
+                module.register_parameter("bias", None)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.wte(input_ids)
+        else:
+            hidden_states = input_embeds
+        for i in range(len(self.blocks)):
+            block = self.blocks[i]
+            hidden_states = block(position_ids, hidden_states, forward_batch)
+        hidden_states = self.norm_f(hidden_states)
+        return hidden_states
+
+
+class DbrxForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.unpadded_vocab_size = config.vocab_size
+        self.transformer = DbrxModel(
+            config, quant_config=quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.d_model,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        expert_params_mapping = [
+            (
+                "ws" if weight_name in ["w1", "v1"] else "w2s",
+                f"experts.mlp.{weight_name}",
+            )
+            for weight_name in ["w1", "v1", "w2"]
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            for param_name, weight_name in expert_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, weight_name)
+                break
+            else:
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = DbrxForCausalLM
diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py
new file mode 100644
index 000000000..ef431e00d
--- /dev/null
+++ b/python/sglang/srt/models/deepseek.py
@@ -0,0 +1,489 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/14f91fe67c2342f2fe859dc6a5c40810df0e1c61/vllm/model_executor/models/deepseek.py
+"""Inference-only Deepseek model."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import fused_moe
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class DeepseekMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class DeepseekMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.n_routed_experts = config.n_routed_experts
+        self.top_k = config.num_experts_per_tok
+        if self.tp_size > self.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.n_routed_experts}."
+            )
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=config.norm_topk_prob,
+        )
+        self.experts = nn.ModuleList(
+            [
+                DeepseekMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.moe_intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=quant_config,
+                    reduce_results=False,
+                    prefix=add_prefix(f"{idx}.experts", prefix),
+                )
+                for idx in range(self.n_routed_experts)
+            ]
+        )
+        self.pack_params()
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if config.n_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = DeepseekMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+
+    def pack_params(self):
+        w1 = []
+        w2 = []
+        for expert in self.experts:
+            w1.append(expert.gate_up_proj.weight)
+            w2.append(expert.down_proj.weight)
+        self.w1 = torch._utils._flatten_dense_tensors(w1)
+        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
+        for data, param in zip(w1s, w1):
+            param.data = data
+        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
+
+        self.w2 = torch._utils._flatten_dense_tensors(w2)
+        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
+        for data, param in zip(w2s, w2):
+            param.data = data
+
+        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        if self.config.n_shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = fused_moe.fused_moe(
+            hidden_states,
+            w1=self.w1,
+            w2=self.w2,
+            topk_output=topk_output,
+            moe_runner_config=MoeRunnerConfig(inplace=True),
+        )
+
+        if self.config.n_shared_experts is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class DeepseekAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class DeepseekDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = DeepseekAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        if (
+            config.n_routed_experts is not None
+            and layer_id >= config.first_k_dense_replace
+            and layer_id % config.moe_layer_freq == 0
+        ):
+            self.mlp = DeepseekMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = DeepseekMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class DeepseekModel(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                DeepseekDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class DeepseekForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = DeepseekModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = DeepseekForCausalLM
diff --git a/python/sglang/srt/models/deepseek_janus_pro.py b/python/sglang/srt/models/deepseek_janus_pro.py
new file mode 100644
index 000000000..fe1c833f7
--- /dev/null
+++ b/python/sglang/srt/models/deepseek_janus_pro.py
@@ -0,0 +1,2065 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Copied and Adapted from:
+# https://github.com/deepseek-ai/Janus
+
+
+import collections
+import math
+import os
+from dataclasses import field
+from enum import Enum
+from functools import partial
+from itertools import repeat
+from typing import (
+    Callable,
+    Final,
+    Iterable,
+    Literal,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import Tensor, _assert, nn
+from torch.nn.init import trunc_normal_
+from transformers import AutoModel, PreTrainedModel
+
+from sglang.srt.configs.janus_pro import *
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.utils import logger
+
+#################################################################################
+#                              VQ Model Configs                                 #
+#################################################################################
+
+
+# Copied from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models/vq_model.py
+@dataclass
+class ModelArgs:
+    codebook_size: int = 16384
+    codebook_embed_dim: int = 8
+    codebook_l2_norm: bool = True
+    codebook_show_usage: bool = True
+    commit_loss_beta: float = 0.25
+    entropy_loss_ratio: float = 0.0
+
+    encoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
+    decoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
+    z_channels: int = 256
+    dropout_p: float = 0.0
+
+
+def named_apply(
+    fn: Callable,
+    module: nn.Module,
+    name="",
+    depth_first: bool = True,
+    include_root: bool = False,
+) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(
+            fn=fn,
+            module=child_module,
+            name=child_name,
+            depth_first=depth_first,
+            include_root=True,
+        )
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+
+
+def VQ_16(**kwargs):
+    return VQModel(
+        ModelArgs(
+            encoder_ch_mult=[1, 1, 2, 2, 4], decoder_ch_mult=[1, 1, 2, 2, 4], **kwargs
+        )
+    )
+
+
+VQ_models = {"VQ-16": VQ_16}
+
+import collections.abc
+
+
+# From PyTorch internals
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return tuple(x)
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        logger.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    if tensor.dtype in [torch.float16, torch.bfloat16]:
+        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
+        og_dtype = tensor.dtype
+        tensor = tensor.to(torch.float32)
+        tensor.erfinv_()
+        tensor = tensor.to(og_dtype)
+    else:
+        tensor.erfinv_()
+
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+
+    # Clamp to ensure it's in the proper range
+    if tensor.dtype == torch.float16:
+        # The `clamp_` op is not (yet?) defined in float16+cpu
+        tensor = tensor.to(torch.float32)
+        tensor.clamp_(min=a, max=b)
+    else:
+        tensor.clamp_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+    tensor: torch.Tensor,
+    mean: float = 0.0,
+    std: float = 1.0,
+    a: float = -2.0,
+    b: float = 2.0,
+):
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \text{mean} \\leq b`.
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsequently scaled and shifted by the mean and std args.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with torch.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.mul_(std).add_(mean)
+
+
+to_2tuple = _ntuple(2)
+
+
+class Format(str, Enum):
+    NCHW = "NCHW"
+    NHWC = "NHWC"
+    NCL = "NCL"
+    NLC = "NLC"
+
+
+def nchw_to(x: torch.Tensor, fmt: Format):
+    if fmt == Format.NHWC:
+        x = x.permute(0, 2, 3, 1)
+    elif fmt == Format.NLC:
+        x = x.flatten(2).transpose(1, 2)
+    elif fmt == Format.NCL:
+        x = x.flatten(2)
+    return x
+
+
+def resample_patch_embed(
+    patch_embed,
+    new_size: List[int],
+    interpolation: str = "bicubic",
+    antialias: bool = True,
+    verbose: bool = False,
+):
+    """Resample the weights of the patch embedding kernel to target resolution.
+    We resample the patch embedding kernel by approximately inverting the effect
+    of patch resizing.
+
+    Code based on:
+      https://github.com/google-research/big_vision/blob/b00544b81f8694488d5f36295aeb7972f3755ffe/big_vision/models/proj/flexi/vit.py
+
+    With this resizing, we can for example load a B/8 filter into a B/16 model
+    and, on 2x larger input image, the result will match.
+
+    Args:
+        patch_embed: original parameter to be resized.
+        new_size (tuple(int, int): target shape (height, width)-only.
+        interpolation (str): interpolation for resize
+        antialias (bool): use anti-aliasing filter in resize
+        verbose (bool): log operation
+    Returns:
+        Resized patch embedding kernel.
+    """
+    import numpy as np
+
+    try:
+        from torch import vmap
+    except ImportError:
+        from torch.func import vmap
+
+    assert len(patch_embed.shape) == 4, "Four dimensions expected"
+    assert len(new_size) == 2, "New shape should only be hw"
+    old_size = patch_embed.shape[-2:]
+    if tuple(old_size) == tuple(new_size):
+        return patch_embed
+
+    if verbose:
+        logger.info(
+            f"Resize patch embedding {patch_embed.shape} to {new_size}, w/ {interpolation} interpolation."
+        )
+
+    def resize(x_np, _new_size):
+        x_tf = torch.Tensor(x_np)[None, None, ...]
+        x_upsampled = F.interpolate(
+            x_tf, size=_new_size, mode=interpolation, antialias=antialias
+        )[0, 0, ...].numpy()
+        return x_upsampled
+
+    def get_resize_mat(_old_size, _new_size):
+        mat = []
+        for i in range(np.prod(_old_size)):
+            basis_vec = np.zeros(_old_size)
+            basis_vec[np.unravel_index(i, _old_size)] = 1.0
+            mat.append(resize(basis_vec, _new_size).reshape(-1))
+        return np.stack(mat).T
+
+    resize_mat = get_resize_mat(old_size, new_size)
+    resize_mat_pinv = torch.tensor(
+        np.linalg.pinv(resize_mat.T), device=patch_embed.device
+    )
+
+    def resample_kernel(kernel):
+        resampled_kernel = resize_mat_pinv @ kernel.reshape(-1)
+        return resampled_kernel.reshape(new_size)
+
+    v_resample_kernel = vmap(vmap(resample_kernel, 0, 0), 1, 1)
+    orig_dtype = patch_embed.dtype
+    patch_embed = patch_embed.float()
+    patch_embed = v_resample_kernel(patch_embed)
+    patch_embed = patch_embed.to(orig_dtype)
+    return patch_embed
+
+
+# Copied from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models/siglip_vit.py
+class PatchEmbed(nn.Module):
+    """2D Image to Patch Embedding"""
+
+    output_fmt: Format
+    dynamic_img_pad: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        img_size: Optional[int] = 224,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten: bool = True,
+        output_fmt: Optional[str] = None,
+        bias: bool = True,
+        strict_img_size: bool = True,
+        dynamic_img_pad: bool = False,
+    ):
+        super().__init__()
+        self.patch_size = tuple(to_2tuple(patch_size))
+        self.img_size, self.grid_size, self.num_patches = self._init_img_size(img_size)
+
+        if output_fmt is not None:
+            self.flatten = False
+            self.output_fmt = Format(output_fmt)
+        else:
+            # flatten spatial dim and transpose to channels last, kept for bwd compat
+            self.flatten = flatten
+            self.output_fmt = Format.NCHW
+        self.strict_img_size = strict_img_size
+        self.dynamic_img_pad = dynamic_img_pad
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias
+        )
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+    def _init_img_size(self, img_size: Union[int, Tuple[int, int]]):
+        assert self.patch_size
+        if img_size is None:
+            return None, None, None
+        img_size = to_2tuple(img_size)
+        grid_size = tuple([s // p for s, p in zip(img_size, self.patch_size)])
+        num_patches = grid_size[0] * grid_size[1]
+        return img_size, grid_size, num_patches
+
+    def set_input_size(
+        self,
+        img_size: Optional[Union[int, Tuple[int, int]]] = None,
+        patch_size: Optional[Union[int, Tuple[int, int]]] = None,
+    ):
+        new_patch_size = None
+        if patch_size is not None:
+            new_patch_size = to_2tuple(patch_size)
+        if new_patch_size is not None and new_patch_size != self.patch_size:
+            with torch.no_grad():
+                new_proj = nn.Conv2d(
+                    self.proj.in_channels,
+                    self.proj.out_channels,
+                    kernel_size=new_patch_size,
+                    stride=new_patch_size,
+                    bias=self.proj.bias is not None,
+                )
+                new_proj.weight.copy_(
+                    resample_patch_embed(self.proj.weight, new_patch_size, verbose=True)
+                )
+                if self.proj.bias is not None:
+                    new_proj.bias.copy_(self.proj.bias)
+                self.proj = new_proj
+            self.patch_size = new_patch_size
+        img_size = img_size or self.img_size
+        if img_size != self.img_size or new_patch_size is not None:
+            self.img_size, self.grid_size, self.num_patches = self._init_img_size(
+                img_size
+            )
+
+    def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]:
+        if as_scalar:
+            return max(self.patch_size)
+        else:
+            return self.patch_size
+
+    def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]:
+        """Get grid (feature) size for given image size taking account of dynamic padding.
+        NOTE: must be torchscript compatible so using fixed tuple indexing
+        """
+        if self.dynamic_img_pad:
+            return math.ceil(img_size[0] / self.patch_size[0]), math.ceil(
+                img_size[1] / self.patch_size[1]
+            )
+        else:
+            return img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1]
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        if self.img_size is not None:
+            if self.strict_img_size:
+                _assert(
+                    H == self.img_size[0],
+                    f"Input height ({H}) doesn't match model ({self.img_size[0]}).",
+                )
+                _assert(
+                    W == self.img_size[1],
+                    f"Input width ({W}) doesn't match model ({self.img_size[1]}).",
+                )
+            elif not self.dynamic_img_pad:
+                _assert(
+                    H % self.patch_size[0] == 0,
+                    f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).",
+                )
+                _assert(
+                    W % self.patch_size[1] == 0,
+                    f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).",
+                )
+        if self.dynamic_img_pad:
+            pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0]
+            pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
+            x = F.pad(x, (0, pad_w, 0, pad_h))
+        x = self.proj(x)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # NCHW -> NLC
+        elif self.output_fmt != Format.NCHW:
+            x = nchw_to(x, self.output_fmt)
+        x = self.norm(x)
+        return x
+
+
+class Mlp(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks
+
+    NOTE: When use_conv=True, expects 2D NCHW tensors, otherwise N*C expected.
+    """
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        norm_layer=None,
+        bias=True,
+        drop=0.0,
+        use_conv=False,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
+
+        self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.norm = (
+            norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
+        )
+        self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.norm(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+
+
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
+
+
+class VisionTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        qk_norm: bool = False,
+        proj_drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values: Optional[float] = None,
+        drop_path: float = 0.0,
+        act_layer: nn.Module = nn.GELU,
+        norm_layer: nn.Module = nn.LayerNorm,
+        mlp_layer: nn.Module = Mlp,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            qkv_backend="sdpa",
+            softmax_in_single_precision=False,
+            dropout=attn_drop,
+        )
+
+        self.ls1 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = mlp_layer(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=proj_drop,
+        )
+        self.ls2 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+LayerType = Union[str, Callable, Type[torch.nn.Module]]
+
+
+class PatchDropout(nn.Module):
+    """
+    https://arxiv.org/abs/2212.00794 and https://arxiv.org/pdf/2208.07220
+    """
+
+    return_indices: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        prob: float = 0.5,
+        num_prefix_tokens: int = 1,
+        ordered: bool = False,
+        return_indices: bool = False,
+    ):
+        super().__init__()
+        assert 0 <= prob < 1.0
+        self.prob = prob
+        self.num_prefix_tokens = (
+            num_prefix_tokens  # exclude CLS token (or other prefix tokens)
+        )
+        self.ordered = ordered
+        self.return_indices = return_indices
+
+    def forward(
+        self, x
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]:
+        if not self.training or self.prob == 0.0:
+            if self.return_indices:
+                return x, None
+            return x
+
+        if self.num_prefix_tokens:
+            prefix_tokens, x = (
+                x[:, : self.num_prefix_tokens],
+                x[:, self.num_prefix_tokens :],
+            )
+        else:
+            prefix_tokens = None
+
+        B = x.shape[0]
+        L = x.shape[1]
+        num_keep = max(1, int(L * (1.0 - self.prob)))
+        keep_indices = torch.argsort(torch.randn(B, L, device=x.device), dim=-1)[
+            :, :num_keep
+        ]
+        if self.ordered:
+            # NOTE does not need to maintain patch order in typical transformer use,
+            # but possibly useful for debug / visualization
+            keep_indices = keep_indices.sort(dim=-1)[0]
+        x = x.gather(1, keep_indices.unsqueeze(-1).expand((-1, -1) + x.shape[2:]))
+
+        if prefix_tokens is not None:
+            x = torch.cat((prefix_tokens, x), dim=1)
+
+        if self.return_indices:
+            return x, keep_indices
+        return x
+
+
+def resample_abs_pos_embed(
+    posemb: torch.Tensor,
+    new_size: List[int],
+    old_size: Optional[List[int]] = None,
+    num_prefix_tokens: int = 1,
+    interpolation: str = "bicubic",
+    antialias: bool = True,
+    verbose: bool = False,
+):
+    # sort out sizes, assume square if old size not provided
+    num_pos_tokens = posemb.shape[1]
+    num_new_tokens = new_size[0] * new_size[1] + num_prefix_tokens
+    if num_new_tokens == num_pos_tokens and new_size[0] == new_size[1]:
+        return posemb
+
+    if old_size is None:
+        hw = int(math.sqrt(num_pos_tokens - num_prefix_tokens))
+        old_size = hw, hw
+
+    if num_prefix_tokens:
+        posemb_prefix, posemb = (
+            posemb[:, :num_prefix_tokens],
+            posemb[:, num_prefix_tokens:],
+        )
+    else:
+        posemb_prefix, posemb = None, posemb
+
+    # do the interpolation
+    embed_dim = posemb.shape[-1]
+    orig_dtype = posemb.dtype
+    posemb = posemb.float()  # interpolate needs float32
+    posemb = posemb.reshape(1, old_size[0], old_size[1], -1).permute(0, 3, 1, 2)
+    posemb = F.interpolate(
+        posemb, size=new_size, mode=interpolation, antialias=antialias
+    )
+    posemb = posemb.permute(0, 2, 3, 1).reshape(1, -1, embed_dim)
+    posemb = posemb.to(orig_dtype)
+
+    # add back extra (class, etc) prefix tokens
+    if posemb_prefix is not None:
+        posemb = torch.cat([posemb_prefix, posemb], dim=1)
+
+    if not torch.jit.is_scripting() and verbose:
+        logger.info(f"Resized position embedding: {old_size} to {new_size}.")
+
+    return posemb
+
+
+def init_weights(self):
+    if self.pos_embed is not None:
+        trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5)
+    trunc_normal_(self.latent, std=self.latent_dim**-0.5)
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = "") -> None:
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, "init_weights"):
+        module.init_weights()
+
+
+class VisionTransformer(nn.Module):
+    """Vision Transformer
+
+    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
+        - https://arxiv.org/abs/2010.11929
+    """
+
+    dynamic_img_size: Final[bool]
+
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        num_classes: int = 1000,
+        global_pool: Literal["", "avg", "token", "map"] = "token",
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        init_values: Optional[float] = None,
+        class_token: bool = True,
+        no_embed_class: bool = False,
+        reg_tokens: int = 0,
+        pre_norm: bool = False,
+        fc_norm: Optional[bool] = None,
+        dynamic_img_size: bool = False,
+        dynamic_img_pad: bool = False,
+        drop_rate: float = 0.0,
+        pos_drop_rate: float = 0.0,
+        patch_drop_rate: float = 0.0,
+        proj_drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        drop_path_rate: float = 0.0,
+        weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "",
+        embed_layer: Callable = PatchEmbed,
+        _norm_layer: Optional[LayerType] = None,
+        _act_layer: Optional[LayerType] = None,
+        block_fn: Type[nn.Module] = VisionTransformerBlock,
+        mlp_layer: Type[nn.Module] = Mlp,
+        ignore_head: bool = False,
+    ) -> None:
+        """
+        Args:
+            img_size: Input image size.
+            patch_size: Patch size.
+            in_chans: Number of image input channels.
+            num_classes: Number of classes for classification head.
+            global_pool: Type of global pooling for final sequence (default: 'token').
+            embed_dim: Transformer embedding dimension.
+            depth: Depth of transformer.
+            num_heads: Number of attention heads.
+            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
+            qkv_bias: Enable bias for qkv projections if True.
+            init_values: Layer-scale init values (layer-scale enabled if not None).
+            class_token: Use class token.
+            no_embed_class: Don't include position embeddings for class (or reg) tokens.
+            reg_tokens: Number of register tokens.
+            fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'.
+            drop_rate: Head dropout rate.
+            pos_drop_rate: Position embedding dropout rate.
+            attn_drop_rate: Attention dropout rate.
+            drop_path_rate: Stochastic depth rate.
+            weight_init: Weight initialization scheme.
+            embed_layer: Patch embedding layer.
+            _norm_layer: Normalization layer.
+            _act_layer: MLP activation layer.
+            block_fn: Transformer block layer.
+        """
+        super().__init__()
+        assert global_pool in ("", "avg", "token", "map")
+        assert class_token or global_pool != "token"
+        use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
+        # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6)
+        # act_layer = get_act_layer(act_layer) or nn.GELU
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        act_layer = nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = (
+            embed_dim  # num_features for consistency with other models
+        )
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.num_prefix_tokens += reg_tokens
+        self.num_reg_tokens = reg_tokens
+        self.has_class_token = class_token
+        self.no_embed_class = (
+            no_embed_class  # don't embed prefix positions (includes reg)
+        )
+        self.dynamic_img_size = dynamic_img_size
+        self.grad_checkpointing = False
+        self.ignore_head = ignore_head
+
+        embed_args = {}
+        if dynamic_img_size:
+            # flatten deferred until after pos embed
+            embed_args.update(dict(strict_img_size=False, output_fmt="NHWC"))
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+            dynamic_img_pad=dynamic_img_pad,
+            **embed_args,
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = (
+            nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
+        )
+        self.reg_token = (
+            nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None
+        )
+        embed_len = (
+            num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        )
+        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
+        self.pos_drop = nn.Dropout(p=pos_drop_rate)
+        if patch_drop_rate > 0:
+            self.patch_drop = PatchDropout(
+                patch_drop_rate,
+                num_prefix_tokens=self.num_prefix_tokens,
+            )
+        else:
+            self.patch_drop = nn.Identity()
+        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
+
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, depth)
+        ]  # stochastic depth decay rule
+        self.blocks = nn.Sequential(
+            *[
+                block_fn(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_norm=qk_norm,
+                    init_values=init_values,
+                    proj_drop=proj_drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                    act_layer=act_layer,
+                    mlp_layer=mlp_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+        self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
+
+        # Classifier Head
+        if global_pool == "map":
+            AttentionPoolLatent.init_weights = init_weights
+            self.attn_pool = AttentionPoolLatent(
+                self.embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                norm_layer=norm_layer,
+            )
+        else:
+            self.attn_pool = None
+        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
+        self.head_drop = nn.Dropout(drop_rate)
+        self.head = (
+            nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        )
+
+        if weight_init != "skip":
+            self.init_weights(weight_init)
+
+    def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None:
+        assert mode in ("jax", "jax_nlhb", "moco", "")
+        # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0
+        trunc_normal_(self.pos_embed, std=0.02)
+        if self.cls_token is not None:
+            nn.init.normal_(self.cls_token, std=1e-6)
+        named_apply(init_weights_vit_timm, self)
+
+    @torch.jit.ignore
+    def no_weight_decay(self) -> Set:
+        return {"pos_embed", "cls_token", "dist_token"}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse: bool = False) -> Dict:
+        return dict(
+            stem=r"^cls_token|pos_embed|patch_embed",  # stem and embed
+            blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
+        )
+
+    @torch.jit.ignore
+    def get_classifier(self) -> nn.Module:
+        return self.head
+
+    def reset_classifier(self, num_classes: int, global_pool=None) -> None:
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ("", "avg", "token", "map")
+            if global_pool == "map" and self.attn_pool is None:
+                assert (
+                    False
+                ), "Cannot currently add attention pooling in reset_classifier()."
+            elif global_pool != "map " and self.attn_pool is not None:
+                self.attn_pool = None  # remove attention pooling
+            self.global_pool = global_pool
+        self.head = (
+            nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        )
+
+    def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
+        if self.dynamic_img_size:
+            B, H, W, C = x.shape
+            pos_embed = resample_abs_pos_embed(
+                self.pos_embed,
+                [H, W],
+                num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens,
+            )
+            x = x.view(B, -1, C)
+        else:
+            pos_embed = self.pos_embed
+
+        to_cat = []
+        if self.cls_token is not None:
+            to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
+        if self.reg_token is not None:
+            to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
+
+        if self.no_embed_class:
+            # deit-3, updated JAX (big vision)
+            # position embedding does not overlap with class token, add then concat
+            x = x + pos_embed
+            if to_cat:
+                x = torch.cat(to_cat + [x], dim=1)
+        else:
+            # original timm, JAX, and deit vit impl
+            # pos_embed has entry for class token, concat then add
+            if to_cat:
+                x = torch.cat(to_cat + [x], dim=1)
+            x = x + pos_embed
+
+        return self.pos_drop(x)
+
+    def _intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,
+    ) -> List[torch.Tensor]:
+        outputs, num_blocks = [], len(self.blocks)
+        take_indices = set(
+            range(num_blocks - n, num_blocks) if isinstance(n, int) else n
+        )
+
+        # forward pass
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.patch_drop(x)
+        x = self.norm_pre(x)
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in take_indices:
+                outputs.append(x)
+
+        return outputs
+
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.patch_drop(x)
+        x = self.norm_pre(x)
+        x = self.blocks(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
+        if self.attn_pool is not None:
+            x = self.attn_pool(x)
+        elif self.global_pool == "avg":
+            x = x[:, self.num_prefix_tokens :].mean(dim=1)
+        elif self.global_pool:
+            x = x[:, 0]  # class token
+        x = self.fc_norm(x)
+        x = self.head_drop(x)
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        if not self.ignore_head:
+            x = self.forward_head(x)
+        return x
+
+
+def model_name_to_cls(cls_name):
+    if "MlpProjector" in cls_name:
+        cls = MlpProjector
+
+    elif "CLIPVisionTower" in cls_name:
+        cls = CLIPVisionTower
+
+    elif "VQ" in cls_name:
+
+        cls = VQ_models[cls_name]
+    elif "vision_head" in cls_name:
+        cls = vision_head
+    else:
+        raise ValueError(f"class_name {cls_name} is invalid.")
+
+    return cls
+
+
+class vision_head(torch.nn.Module):
+    def __init__(self, params):
+        super().__init__()
+        self.output_mlp_projector = torch.nn.Linear(
+            params["n_embed"], params["image_token_embed"]
+        )
+        self.vision_activation = torch.nn.GELU()
+        self.vision_head = torch.nn.Linear(
+            params["image_token_embed"], params["image_token_size"]
+        )
+
+    def forward(self, x):
+        x = self.output_mlp_projector(x)
+        x = self.vision_activation(x)
+        x = self.vision_head(x)
+        return x
+
+
+SigLIP_MODEL_CONFIG = {
+    "siglip_so400m_patch14_384": {
+        "image_size": 336,
+        "patch_size": 14,
+        "width": 1152,
+        "layers": 27,
+        "heads": 16,
+        "mlp_ratio": 3.7362,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+    "siglip_so400m_patch14_224": {
+        "image_size": 224,
+        "patch_size": 14,
+        "width": 1152,
+        "layers": 27,
+        "heads": 16,
+        "mlp_ratio": 3.7362,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+    "siglip_large_patch16_384": {
+        "image_size": 384,
+        "patch_size": 16,
+        "width": 1024,
+        "layers": 24,
+        "heads": 16,
+        "mlp_ratio": 4,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+}
+
+
+def create_siglip_vit(
+    model_name: str = "siglip_so400m_patch14_384",
+    image_size: int = 384,
+    select_layer: int = -1,
+    ckpt_path: str = "",
+    **kwargs,
+):
+    assert (
+        model_name in SigLIP_MODEL_CONFIG.keys()
+    ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}"
+
+    vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name])
+
+    if select_layer <= 0:
+        layers = min(vision_cfg.layers, vision_cfg.layers + select_layer + 1)
+    else:
+        layers = min(vision_cfg.layers, select_layer)
+
+    model = VisionTransformer(
+        img_size=image_size,
+        patch_size=vision_cfg.patch_size,
+        embed_dim=vision_cfg.width,
+        depth=layers,
+        num_heads=vision_cfg.heads,
+        mlp_ratio=vision_cfg.mlp_ratio,
+        class_token=vision_cfg.class_token,
+        global_pool=vision_cfg.global_pool,
+        ignore_head=kwargs.get("ignore_head", True),
+        weight_init=kwargs.get("weight_init", "skip"),
+        num_classes=0,
+    )
+
+    if ckpt_path:
+        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+
+        incompatible_keys = model.load_state_dict(state_dict, strict=False)
+        print(
+            f"SigLIP-ViT restores from {ckpt_path},\n"
+            f"\tincompatible_keys:', {incompatible_keys}."
+        )
+
+    return model
+
+
+class Normalize(torch.nn.Module):
+    """Normalize a tensor image with mean and standard deviation.
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace(bool,optional): Bool to make this operation in-place.
+
+    """
+
+    def __init__(self, mean, std, inplace=False):
+        super().__init__()
+        # _log_api_usage_once(self)
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def forward(self, tensor: Tensor) -> Tensor:
+        """
+        Args:
+            tensor (Tensor): Tensor image to be normalized.
+
+        Returns:
+            Tensor: Normalized Tensor image.
+        """
+        return F.normalize(tensor, self.mean, self.std, self.inplace)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std})"
+
+
+class CLIPVisionTower(nn.Module):
+    def __init__(
+        self,
+        model_name: str = "siglip_large_patch16_384",
+        image_size: Union[Tuple[int, int], int] = 336,
+        select_feature: str = "patch",
+        select_layer: int = -2,
+        select_layers: list = None,
+        ckpt_path: str = "",
+        pixel_mean: Optional[List[float]] = None,
+        pixel_std: Optional[List[float]] = None,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.model_name = model_name
+        self.select_feature = select_feature
+        self.select_layer = select_layer
+        self.select_layers = select_layers
+
+        vision_tower_params = {
+            "model_name": model_name,
+            "image_size": image_size,
+            "ckpt_path": ckpt_path,
+            "select_layer": select_layer,
+        }
+        vision_tower_params.update(kwargs)
+        self.vision_tower, self.forward_kwargs = self.build_vision_tower(
+            vision_tower_params
+        )
+
+        if pixel_mean is not None and pixel_std is not None:
+            image_norm = Normalize(mean=pixel_mean, std=pixel_std)
+        else:
+            image_norm = None
+
+        self.image_norm = image_norm
+
+    @property
+    def device(self) -> torch.device:
+        return next(self.vision_tower.parameters()).device
+
+    @property
+    def dtype(self):
+        return next(self.vision_tower.parameters()).dtype
+
+    def build_vision_tower(self, vision_tower_params):
+        if self.model_name.startswith("siglip"):
+            self.select_feature = "same"
+            vision_tower = create_siglip_vit(**vision_tower_params)
+            forward_kwargs = dict()
+
+        elif self.model_name.startswith("sam"):
+            # vision_tower = create_sam_vit(**vision_tower_params)
+            forward_kwargs = dict()
+
+        else:  # huggingface
+            from transformers import CLIPVisionModel
+
+            vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params)
+            forward_kwargs = dict(output_hidden_states=True)
+
+        return vision_tower, forward_kwargs
+
+    def feature_select(self, image_forward_outs):
+        if isinstance(image_forward_outs, torch.Tensor):
+            # the output has been the self.select_layer"s features
+            image_features = image_forward_outs
+        else:
+            image_features = image_forward_outs.hidden_states[self.select_layer]
+
+        if self.select_feature == "patch":
+            # if the output has cls_token
+            image_features = image_features[:, 1:]
+        elif self.select_feature == "cls_patch":
+            image_features = image_features
+        elif self.select_feature == "same":
+            image_features = image_features
+
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+        return image_features
+
+    def forward(self, images):
+        """
+
+        Args:
+            images (torch.Tensor): [b, 3, H, W]
+
+        Returns:
+            image_features (torch.Tensor): [b, n_patch, d]
+        """
+
+        if self.image_norm is not None:
+            images = self.image_norm(images)
+
+        image_forward_outs = self.vision_tower(images, **self.forward_kwargs)
+        image_features = self.feature_select(image_forward_outs)
+        return image_features
+
+
+class MlpProjector(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.cfg = cfg
+
+        if cfg["projector_type"] == "identity":
+            modules = nn.Identity()
+
+        elif cfg["projector_type"] == "linear":
+            modules = nn.Linear(cfg["input_dim"], cfg["n_embed"])
+
+        elif cfg["projector_type"] == "mlp_gelu":
+            mlp_depth = cfg.get("depth", 1)
+            modules = [nn.Linear(cfg["input_dim"], cfg["n_embed"])]
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(cfg["n_embed"], cfg["n_embed"]))
+            modules = nn.Sequential(*modules)
+
+        elif cfg["projector_type"] == "low_high_hybrid_split_mlp_gelu":
+            mlp_depth = cfg.get("depth", 1)
+            self.high_up_proj = nn.Linear(cfg["input_dim"], cfg["n_embed"] // 2)
+            self.low_up_proj = nn.Linear(cfg["input_dim"], cfg["n_embed"] // 2)
+
+            modules = []
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(cfg["n_embed"], cfg["n_embed"]))
+            modules = nn.Sequential(*modules)
+
+        else:
+            raise ValueError(f"Unknown projector type: {cfg['projector_type']}")
+
+        self.layers = modules
+
+    def forward(
+        self, x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]
+    ):
+        """
+
+        Args:
+            x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:  if it is a tuple of torch.Tensor,
+                then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x);
+                otherwise it is the feature from the single vision encoder.
+
+        Returns:
+            x (torch.Tensor): [b, s, c]
+        """
+
+        if isinstance(x_or_tuple, tuple):
+            # self.cfg.projector_type == "low_high_hybrid_split_mlp_gelu":
+            high_x, low_x = x_or_tuple
+            high_x = self.high_up_proj(high_x)
+            low_x = self.low_up_proj(low_x)
+            x = torch.cat([high_x, low_x], dim=-1)
+        else:
+            x = x_or_tuple
+
+        return self.layers(x)
+
+
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: float = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+# use torch.scaled_dot_product_attention where possible
+_HAS_FUSED_ATTN = hasattr(torch.nn.functional, "scaled_dot_product_attention")
+if "TIMM_FUSED_ATTN" in os.environ:
+    _USE_FUSED_ATTN = int(os.environ["TIMM_FUSED_ATTN"])
+else:
+    _USE_FUSED_ATTN = (
+        1  # 0 == off, 1 == on (for tested use), 2 == on (for experimental use)
+    )
+
+# Set to True if exporting a model with Same padding via ONNX
+_EXPORTABLE = False
+
+
+def use_fused_attn(experimental: bool = False) -> bool:
+    # NOTE: ONNX export cannot handle F.scaled_dot_product_attention as of pytorch 2.0
+    if not _HAS_FUSED_ATTN or _EXPORTABLE:
+        return False
+    if experimental:
+        return _USE_FUSED_ATTN > 1
+    return _USE_FUSED_ATTN > 0
+
+
+class AttentionPoolLatent(nn.Module):
+    """Attention pooling w/ latent query"""
+
+    fused_attn: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int = None,
+        embed_dim: int = None,
+        num_heads: int = 8,
+        feat_size: Optional[int] = None,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        latent_len: int = 1,
+        latent_dim: int = None,
+        pos_embed: str = "",
+        pool_type: str = "token",
+        norm_layer: Optional[nn.Module] = None,
+        drop: float = 0.0,
+    ):
+        super().__init__()
+        embed_dim = embed_dim or in_features
+        out_features = out_features or in_features
+        assert embed_dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.feat_size = feat_size
+        self.scale = self.head_dim**-0.5
+        self.pool = pool_type
+        self.fused_attn = use_fused_attn()
+
+        if pos_embed == "abs":
+            assert feat_size is not None
+            self.pos_embed = nn.Parameter(torch.zeros(feat_size, in_features))
+        else:
+            self.pos_embed = None
+
+        self.latent_dim = latent_dim or embed_dim
+        self.latent_len = latent_len
+        self.latent = nn.Parameter(torch.zeros(1, self.latent_len, embed_dim))
+
+        self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+        self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias)
+        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.proj = nn.Linear(embed_dim, embed_dim)
+        self.proj_drop = nn.Dropout(drop)
+
+        self.norm = (
+            norm_layer(out_features) if norm_layer is not None else nn.Identity()
+        )
+        self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio))
+
+        self.init_weights()
+
+    def init_weights(self):
+        if self.pos_embed is not None:
+            trunc_normal_tf_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5)
+        trunc_normal_tf_(self.latent, std=self.latent_dim**-0.5)
+
+    def forward(self, x):
+        B, N, C = x.shape
+
+        if self.pos_embed is not None:
+            # FIXME interpolate
+            x = x + self.pos_embed.unsqueeze(0).to(x.dtype)
+
+        q_latent = self.latent.expand(B, -1, -1)
+        q = (
+            self.q(q_latent)
+            .reshape(B, self.latent_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )
+
+        kv = (
+            self.kv(x)
+            .reshape(B, N, 2, self.num_heads, self.head_dim)
+            .permute(2, 0, 3, 1, 4)
+        )
+        k, v = kv.unbind(0)
+
+        q, k = self.q_norm(q), self.k_norm(k)
+
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(q, k, v)
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            attn = attn.softmax(dim=-1)
+            x = attn @ v
+        x = x.transpose(1, 2).reshape(B, self.latent_len, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        x = x + self.mlp(self.norm(x))
+
+        # optional pool if latent seq_len > 1 and pooled output is desired
+        if self.pool == "token":
+            x = x[:, 0]
+        elif self.pool == "avg":
+            x = x.mean(1)
+
+
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        in_channels=3,
+        ch=128,
+        ch_mult=(1, 1, 2, 2, 4),
+        num_res_blocks=2,
+        norm_type="group",
+        dropout=0.0,
+        resamp_with_conv=True,
+        z_channels=256,
+    ):
+        super().__init__()
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.conv_in = nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1)
+
+        # downsampling
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.conv_blocks = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            conv_block = nn.Module()
+            # res & attn
+            res_block = nn.ModuleList()
+            attn_block = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                res_block.append(
+                    ResnetBlock(
+                        block_in, block_out, dropout=dropout, norm_type=norm_type
+                    )
+                )
+                block_in = block_out
+                if i_level == self.num_resolutions - 1:
+                    attn_block.append(AttnBlock(block_in, norm_type))
+            conv_block.res = res_block
+            conv_block.attn = attn_block
+            # downsample
+            if i_level != self.num_resolutions - 1:
+                conv_block.downsample = Downsample(block_in, resamp_with_conv)
+            self.conv_blocks.append(conv_block)
+
+        # middle
+        self.mid = nn.ModuleList()
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+        self.mid.append(AttnBlock(block_in, norm_type=norm_type))
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+
+        # end
+        self.norm_out = Normalize(block_in, norm_type)
+        self.conv_out = nn.Conv2d(
+            block_in, z_channels, kernel_size=3, stride=1, padding=1
+        )
+
+    def forward(self, x):
+        h = self.conv_in(x)
+        # downsampling
+        for i_level, block in enumerate(self.conv_blocks):
+            for i_block in range(self.num_res_blocks):
+                h = block.res[i_block](h)
+                if len(block.attn) > 0:
+                    h = block.attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = block.downsample(h)
+
+        # middle
+        for mid_block in self.mid:
+            h = mid_block(h)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        z_channels=256,
+        ch=128,
+        ch_mult=(1, 1, 2, 2, 4),
+        num_res_blocks=2,
+        norm_type="group",
+        dropout=0.0,
+        resamp_with_conv=True,
+        out_channels=3,
+    ):
+        super().__init__()
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        # z to block_in
+        self.conv_in = nn.Conv2d(
+            z_channels, block_in, kernel_size=3, stride=1, padding=1
+        )
+
+        # middle
+        self.mid = nn.ModuleList()
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+        self.mid.append(AttnBlock(block_in, norm_type=norm_type))
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+
+        # upsampling
+        self.conv_blocks = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            conv_block = nn.Module()
+            # res & attn
+            res_block = nn.ModuleList()
+            attn_block = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                res_block.append(
+                    ResnetBlock(
+                        block_in, block_out, dropout=dropout, norm_type=norm_type
+                    )
+                )
+                block_in = block_out
+                if i_level == self.num_resolutions - 1:
+                    attn_block.append(AttnBlock(block_in, norm_type))
+            conv_block.res = res_block
+            conv_block.attn = attn_block
+            # downsample
+            if i_level != 0:
+                conv_block.upsample = Upsample(block_in, resamp_with_conv)
+            self.conv_blocks.append(conv_block)
+
+        # end
+        self.norm_out = Normalize(block_in, norm_type)
+        self.conv_out = nn.Conv2d(
+            block_in, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+    @property
+    def last_layer(self):
+        return self.conv_out.weight
+
+    def forward(self, z):
+        # z to block_in
+        h = self.conv_in(z)
+
+        # middle
+        for mid_block in self.mid:
+            h = mid_block(h)
+
+        # upsampling
+        for i_level, block in enumerate(self.conv_blocks):
+            for i_block in range(self.num_res_blocks + 1):
+                h = block.res[i_block](h)
+                if len(block.attn) > 0:
+                    h = block.attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = block.upsample(h)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class VectorQuantizer(nn.Module):
+    def __init__(self, n_e, e_dim, beta, entropy_loss_ratio, l2_norm, show_usage):
+        super().__init__()
+        self.n_e = n_e
+        self.e_dim = e_dim
+        self.beta = beta
+        self.entropy_loss_ratio = entropy_loss_ratio
+        self.l2_norm = l2_norm
+        self.show_usage = show_usage
+
+        self.embedding = nn.Embedding(self.n_e, self.e_dim)
+        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+        if self.l2_norm:
+            self.embedding.weight.data = F.normalize(
+                self.embedding.weight.data, p=2, dim=-1
+            )
+        if self.show_usage:
+            # self.register_buffer("codebook_used", nn.Parameter(torch.zeros(65536)))
+            self.codebook_used = nn.Parameter(torch.zeros(65536))
+
+    def forward(self, z):
+        # reshape z -> (batch, height, width, channel) and flatten
+        z = torch.einsum("b c h w -> b h w c", z).contiguous()
+        z_flattened = z.view(-1, self.e_dim)
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+
+        if self.l2_norm:
+            z = F.normalize(z, p=2, dim=-1)
+            z_flattened = F.normalize(z_flattened, p=2, dim=-1)
+            embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
+        else:
+            embedding = self.embedding.weight
+
+        d = (
+            torch.sum(z_flattened**2, dim=1, keepdim=True)
+            + torch.sum(embedding**2, dim=1)
+            - 2
+            * torch.einsum(
+                "bd,dn->bn", z_flattened, torch.einsum("n d -> d n", embedding)
+            )
+        )
+
+        min_encoding_indices = torch.argmin(d, dim=1)
+        z_q = embedding[min_encoding_indices].view(z.shape)
+        perplexity = None
+        min_encodings = None
+        vq_loss = None
+        commit_loss = None
+        entropy_loss = None
+
+        # compute loss for embedding
+        if self.training:
+            vq_loss = torch.mean((z_q - z.detach()) ** 2)
+            commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2)
+            entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d)
+
+        # preserve gradients
+        z_q = z + (z_q - z).detach()
+
+        # reshape back to match original input shape
+        z_q = torch.einsum("b h w c -> b c h w", z_q)
+
+        return (
+            z_q,
+            (vq_loss, commit_loss, entropy_loss),
+            (perplexity, min_encodings, min_encoding_indices),
+        )
+
+    def get_codebook_entry(self, indices, shape=None, channel_first=True):
+        # shape = (batch, channel, height, width) if channel_first else (batch, height, width, channel)
+        if self.l2_norm:
+            embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
+        else:
+            embedding = self.embedding.weight
+        z_q = embedding[indices]  # (b*h*w, c)
+
+        if shape is not None:
+            if channel_first:
+                z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1])
+                # reshape back to match original input shape
+                z_q = z_q.permute(0, 3, 1, 2).contiguous()
+            else:
+                z_q = z_q.view(shape)
+        return z_q
+
+
+class ResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout=0.0,
+        norm_type="group",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        self.norm1 = Normalize(in_channels, norm_type)
+        self.conv1 = nn.Conv2d(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+        self.norm2 = Normalize(out_channels, norm_type)
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2d(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = nn.Conv2d(
+                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
+                )
+            else:
+                self.nin_shortcut = nn.Conv2d(
+                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
+                )
+
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x + h
+
+
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels, norm_type="group"):
+        super().__init__()
+        self.norm = Normalize(in_channels, norm_type)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+
+        # compute attention
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)  # b,hw,c
+        k = k.reshape(b, c, h * w)  # b,c,hw
+        w_ = torch.bmm(q, k)  # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        w_ = w_ * (int(c) ** (-0.5))
+        w_ = F.softmax(w_, dim=2)
+
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)  # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v, w_)  # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b, c, h, w)
+
+        h_ = self.proj_out(h_)
+
+        return x + h_
+
+
+def nonlinearity(x):
+    # swish
+    return x * torch.sigmoid(x)
+
+
+def Normalize(in_channels, norm_type="group"):
+    assert norm_type in ["group", "batch"]
+    if norm_type == "group":
+        return nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
+    elif norm_type == "batch":
+        return nn.SyncBatchNorm(in_channels)
+
+
+class Upsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = nn.Conv2d(
+                in_channels, in_channels, kernel_size=3, stride=1, padding=1
+            )
+
+    def forward(self, x):
+        if x.dtype != torch.float32:
+            x = F.interpolate(x.to(torch.float), scale_factor=2.0, mode="nearest").to(
+                torch.bfloat16
+            )
+        else:
+            x = F.interpolate(x, scale_factor=2.0, mode="nearest")
+
+        if self.with_conv:
+            x = self.conv(x)
+        return x
+
+
+class Downsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = nn.Conv2d(
+                in_channels, in_channels, kernel_size=3, stride=2, padding=0
+            )
+
+    def forward(self, x):
+        if self.with_conv:
+            pad = (0, 1, 0, 1)
+            x = F.pad(x, pad, mode="constant", value=0)
+            x = self.conv(x)
+        else:
+            x = F.avg_pool2d(x, kernel_size=2, stride=2)
+        return x
+
+
+def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01):
+    flat_affinity = affinity.reshape(-1, affinity.shape[-1])
+    flat_affinity /= temperature
+    probs = F.softmax(flat_affinity, dim=-1)
+    log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1)
+    if loss_type == "softmax":
+        target_probs = probs
+    else:
+        raise ValueError("Entropy loss {} not supported".format(loss_type))
+    avg_probs = torch.mean(target_probs, dim=0)
+    avg_entropy = -torch.sum(avg_probs * torch.log(avg_probs + 1e-5))
+    sample_entropy = -torch.mean(torch.sum(target_probs * log_probs, dim=-1))
+    loss = sample_entropy - avg_entropy
+    return loss
+
+
+class VQModel(nn.Module):
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.encoder = Encoder(
+            ch_mult=config.encoder_ch_mult,
+            z_channels=config.z_channels,
+            dropout=config.dropout_p,
+        )
+        self.decoder = Decoder(
+            ch_mult=config.decoder_ch_mult,
+            z_channels=config.z_channels,
+            dropout=config.dropout_p,
+        )
+
+        self.quantize = VectorQuantizer(
+            config.codebook_size,
+            config.codebook_embed_dim,
+            config.commit_loss_beta,
+            config.entropy_loss_ratio,
+            config.codebook_l2_norm,
+            config.codebook_show_usage,
+        )
+        self.quant_conv = nn.Conv2d(config.z_channels, config.codebook_embed_dim, 1)
+        self.post_quant_conv = nn.Conv2d(
+            config.codebook_embed_dim, config.z_channels, 1
+        )
+
+    def encode(self, x):
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        quant, emb_loss, info = self.quantize(h)
+        return quant, emb_loss, info
+
+    def decode(self, quant):
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+
+    def decode_code(self, code_b, shape=None, channel_first=True):
+        quant_b = self.quantize.get_codebook_entry(code_b, shape, channel_first)
+        dec = self.decode(quant_b)
+        return dec
+
+    def forward(self, input):
+        quant, diff, _ = self.encode(input)
+        dec = self.decode(quant)
+        return dec, diff
+
+
+class MultiModalityPreTrainedModel(PreTrainedModel):
+    config_class = MultiModalityConfig
+    base_model_prefix = "multi_modality"
+    _no_split_modules = []
+    _skip_keys_device_placement = "past_key_values"
+
+
+# Copied and adapted from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models/modeling_vlm.py
+class MultiModalityCausalLM(MultiModalityPreTrainedModel):
+
+    def __init__(
+        self,
+        config: MultiModalityConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__(config)
+
+        vision_config = config.vision_config
+        vision_cls = model_name_to_cls(vision_config.cls)
+        self.vision_model = vision_cls(**vision_config.params)
+
+        aligner_config = config.aligner_config
+        aligner_cls = model_name_to_cls(aligner_config.cls)
+        self.aligner = aligner_cls(aligner_config.params)
+
+        gen_vision_config = config.gen_vision_config
+        gen_vision_cls = model_name_to_cls(gen_vision_config.cls)
+        self.gen_vision_model = gen_vision_cls()
+
+        gen_aligner_config = config.gen_aligner_config
+        gen_aligner_cls = model_name_to_cls(gen_aligner_config.cls)
+        self.gen_aligner = gen_aligner_cls(gen_aligner_config.params)
+
+        gen_head_config = config.gen_head_config
+        gen_head_cls = model_name_to_cls(gen_head_config.cls)
+        self.gen_head = gen_head_cls(gen_head_config.params)
+
+        self.gen_embed = torch.nn.Embedding(
+            gen_vision_config.params["image_token_size"],
+            gen_vision_config.params["n_embed"],
+        )
+
+        language_config = config.language_config
+        self.language_model = LlamaForCausalLM(
+            language_config, quant_config=quant_config
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = torch.concat([item.feature for item in items], dim=0)
+        bs, n = pixel_values.shape[0:2]
+        pixel_values = pixel_values.to(
+            device=self.vision_model.device, dtype=self.vision_model.dtype
+        )
+        images = rearrange(pixel_values, "b n c h w -> (b n) c h w")
+
+        # [b x n, T2, D]
+        images_embeds = self.aligner(self.vision_model(images))
+
+        # [b x n, T2, D] -> [b, n x T2, D]
+        images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n)
+
+        return images_embeds
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.language_model.get_input_embeddings()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.language_model,
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def prepare_gen_img_embeds(self, image_ids: torch.LongTensor):
+        return self.gen_aligner(self.gen_embed(image_ids))
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        im_start_id = image_inputs.im_start_id
+        im_end_id = image_inputs.im_end_id
+        media_token_pairs = [(im_start_id, im_end_id)]
+
+        helper = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return helper.pad_input_tokens(input_ids, image_inputs)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq~" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            # skip generation sub model
+            if "gen" in name:
+                continue
+
+            # adapt to VisionAttention
+            name = name.replace(r"self_attn.out_proj", r"self_attn.proj")
+            if "vision_model.vision_tower" in name:
+                name = name.replace("attn.qkv", "attn.qkv_proj")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # replace the name and load with customized loader
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", None)
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+AutoModel.register(config_class=MultiModalityConfig, model_class=MultiModalityCausalLM)
+EntryClass = [MultiModalityCausalLM]
diff --git a/python/sglang/srt/models/deepseek_nextn.py b/python/sglang/srt/models/deepseek_nextn.py
new file mode 100644
index 000000000..0d2283078
--- /dev/null
+++ b/python/sglang/srt/models/deepseek_nextn.py
@@ -0,0 +1,170 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only DeepSeek NextN Speculative Decoding."""
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV3ForCausalLM
+from sglang.srt.utils import BumpAllocator, add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class DeepseekModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
+            logger.warning(
+                "Overriding DeepseekV3ForCausalLMNextN quant config for modelopt_fp4 Deepseek model."
+            )
+            quant_config = None
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+
+        self.decoder = DeepseekV2DecoderLayer(
+            config,
+            0,
+            quant_config=quant_config,
+            is_nextn=True,
+            prefix=add_prefix("decoder", prefix),
+        )
+
+        self.shared_head = nn.Module()
+        self.shared_head.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        zero_allocator = BumpAllocator(
+            buffer_size=2,
+            dtype=torch.float32,
+            device=(
+                input_embeds.device if input_embeds is not None else input_ids.device
+            ),
+        )
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        residual = None
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states, residual = self.decoder(
+                positions, hidden_states, forward_batch, residual, zero_allocator
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.shared_head.norm(hidden_states, residual)
+            else:
+                hidden_states = self.shared_head.norm(hidden_states)
+
+        return hidden_states
+
+
+class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        # if not set, model load will be broken in DeepseekV3ForCausalLM load_weights()
+        self.pp_group = get_pp_group()
+        self.determine_num_fused_shared_experts("DeepseekV3ForCausalLMNextN")
+
+        self.model = DeepseekModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        super().load_weights(weights, is_nextn=True)
+
+
+EntryClass = [DeepseekV3ForCausalLMNextN]
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
new file mode 100644
index 000000000..b5535f6d3
--- /dev/null
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -0,0 +1,2980 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/fb6af8bc086328ca6659e72d11ffd4309ce4de22/vllm/model_executor/models/deepseek_v2.py
+"""Inference-only DeepseekV2 model."""
+
+import concurrent.futures
+import logging
+import os
+from enum import IntEnum, auto
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from tqdm import tqdm
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    parallel_state,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.amx_utils import PackWeightMethod
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_deepep_mode,
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import (
+    FusedMoE,
+    _is_fp4_quantization_enabled,
+)
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import (
+    is_fp8_fnuz,
+    per_tensor_quant_mla_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
+)
+from sglang.srt.layers.quantization.fp8_utils import (
+    block_quant_dequant,
+    block_quant_to_tensor_quant,
+    channel_quant_to_tensor_quant,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requant_weight_ue8m0_inplace,
+)
+from sglang.srt.layers.quantization.int8_utils import (
+    block_dequant as int8_block_dequant,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope_wrapper
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.two_batch_overlap import (
+    MaybeTboDeepEPDispatcher,
+    model_forward_maybe_tbo,
+)
+from sglang.srt.utils import (
+    BumpAllocator,
+    LazyValue,
+    add_prefix,
+    bind_or_assign,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    get_int_env_var,
+    is_cpu,
+    is_cuda,
+    is_flashinfer_available,
+    is_gfx95_supported,
+    is_hip,
+    is_non_idle_and_non_empty,
+    is_npu,
+    is_sm100_supported,
+    log_info_on_rank0,
+    make_layers,
+    use_intel_amx_backend,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+_is_gfx95_supported = is_gfx95_supported()
+
+_use_aiter_gfx95 = _use_aiter and _is_gfx95_supported
+
+if _use_aiter_gfx95:
+    from sglang.srt.layers.quantization.quark.utils import quark_post_load_weights
+    from sglang.srt.layers.quantization.rocm_mxfp4_utils import (
+        batched_gemm_afp4wfp4_pre_quant,
+        fused_flatten_mxfp4_quant,
+        fused_rms_mxfp4_quant,
+    )
+    from sglang.srt.layers.rocm_linear_utils import (
+        aiter_dsv3_router_gemm,
+        fused_qk_rope_cat,
+        get_dsv3_gemm_output_zero_allocator_size,
+    )
+
+if _is_cuda:
+    from sgl_kernel import (
+        awq_dequantize,
+        bmm_fp8,
+        dsv3_fused_a_gemm,
+        dsv3_router_gemm,
+        merge_state_v2,
+    )
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+else:
+    from vllm._custom_ops import awq_dequantize
+
+if _is_hip:
+    from sglang.srt.layers.attention.triton_ops.rocm_mla_decode_rope import (
+        decode_attention_fwd_grouped_rope,
+    )
+
+_is_flashinfer_available = is_flashinfer_available()
+_is_sm100_supported = is_cuda() and is_sm100_supported()
+
+
+logger = logging.getLogger(__name__)
+
+
+class AttnForwardMethod(IntEnum):
+    # Use multi-head attention
+    MHA = auto()
+
+    # Use absorbed multi-latent attention
+    MLA = auto()
+
+    # Use multi-head attention, but with KV cache chunked.
+    # This method can avoid OOM when prefix lengths are long.
+    MHA_CHUNKED_KV = auto()
+
+    # Use MLA but with fused RoPE
+    MLA_FUSED_ROPE = auto()
+
+    # Use MLA with fused RoPE kernel for CPU
+    MLA_FUSED_ROPE_CPU = auto()
+
+
+class DeepseekV2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ):
+        if (self.tp_size == 1) and x.shape[0] == 0:
+            return x
+
+        if (
+            gemm_output_zero_allocator is not None
+            and x.shape[0] <= 256
+            and self.gate_up_proj.weight.dtype == torch.uint8
+        ):
+            y = gemm_output_zero_allocator.allocate(
+                x.shape[0] * self.gate_up_proj.output_size_per_partition
+            ).view(x.shape[0], self.gate_up_proj.output_size_per_partition)
+            x = (x, None, y)
+
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
+        return x
+
+
+class MoEGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        prefix: str = "",
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.is_nextn = is_nextn
+        self.weight = nn.Parameter(
+            torch.empty((config.n_routed_experts, config.hidden_size))
+        )
+        if config.topk_method == "noaux_tc":
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty((config.n_routed_experts), dtype=torch.float32)
+            )
+        else:
+            self.e_score_correction_bias = None
+        if _is_cpu and _is_cpu_amx_available:
+            self.quant_method = PackWeightMethod(weight_names=["weight"])
+
+    def forward(self, hidden_states, gemm_output_zero_allocator: BumpAllocator = None):
+        if use_intel_amx_backend(self):
+            return torch.ops.sgl_kernel.weight_packed_linear(
+                hidden_states,
+                self.weight,
+                None,  # bias
+                True,  # is_vnni
+            )
+
+        # NOTE: For some unknown reason, router_gemm seems degrade accept length.
+        if (
+            _is_cuda
+            and hidden_states.shape[0] <= 16
+            and hidden_states.shape[1] == 7168
+            and self.weight.shape[0] == 256
+            and _device_sm >= 90
+        ):
+            # router gemm output float32
+            logits = dsv3_router_gemm(
+                hidden_states, self.weight, out_dtype=torch.float32
+            )
+        elif _use_aiter_gfx95 and hidden_states.shape[0] <= 256:
+            logits = aiter_dsv3_router_gemm(
+                hidden_states, self.weight, gemm_output_zero_allocator
+            )
+        else:
+            logits = F.linear(hidden_states, self.weight, None)
+
+        return logits
+
+
+class DeepseekV2MoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_shared_experts = config.n_shared_experts
+        self.num_fused_shared_experts = (
+            0
+            if global_server_args_dict["disable_shared_experts_fusion"]
+            else config.n_shared_experts
+        )
+        self.config = config
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = MoEGate(
+            config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.n_routed_experts
+            + self.num_fused_shared_experts
+            + global_server_args_dict["ep_num_redundant_experts"],
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        correction_bias = self.gate.e_score_correction_bias
+        if _is_fp4_quantization_enabled():
+            correction_bias = correction_bias.to(torch.bfloat16)
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            topk_group=config.topk_group,
+            correction_bias=correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=self.experts.should_fuse_routed_scaling_factor_in_topk(),
+            force_topk=quant_config is None,
+        )
+
+        self.shared_experts_is_int8 = False
+        self.shared_experts_is_fp8 = False
+        self.shared_experts_weight_block_size = None
+        if config.n_shared_experts is not None and self.num_fused_shared_experts == 0:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            # disable tp for shared experts when enable deepep moe, or with fp4 allgather
+            self.shared_experts = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                    else {}
+                ),
+            )
+            is_packed_weight = hasattr(
+                self.shared_experts.gate_up_proj.quant_method, "quant_config"
+            ) and self.shared_experts.gate_up_proj.quant_method.quant_config.get_name() in {
+                "awq",
+                "awq_marlin",
+                "moe_wna16",
+            }
+            self.shared_experts_is_int8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.int8
+            )
+            self.shared_experts_is_fp8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            )
+            if self.shared_experts_is_fp8:
+                assert (
+                    self.shared_experts.gate_up_proj.quant_method.quant_config.weight_block_size
+                    == self.shared_experts.down_proj.quant_method.quant_config.weight_block_size
+                )
+                self.shared_experts_weight_block_size = (
+                    self.shared_experts.gate_up_proj.quant_method.quant_config.weight_block_size
+                )
+
+        self.top_k = config.num_experts_per_tok
+
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.n_routed_experts
+                + global_server_args_dict["ep_num_redundant_experts"]
+            )
+            self.renormalize = config.norm_topk_prob
+            self.topk_group = config.topk_group
+            self.num_expert_group = config.n_group
+            self.correction_bias = (
+                self.gate.e_score_correction_bias.data
+                if self.gate.e_score_correction_bias is not None
+                else None
+            )
+
+            self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
+                group=parallel_state.get_tp_group().device_group,
+                router_topk=self.top_k,
+                permute_fusion=True,
+                num_experts=self.num_experts,
+                num_local_experts=config.n_routed_experts // self.tp_size,
+                hidden_size=config.hidden_size,
+                params_dtype=config.torch_dtype,
+                deepep_mode=get_deepep_mode(),
+                async_finish=True,
+                return_recv_hook=True,
+            )
+
+        self._enable_deepep_moe = get_moe_a2a_backend().is_deepep()
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+        if not self._enable_deepep_moe:
+            DUAL_STREAM_TOKEN_THRESHOLD = 1024
+            if (
+                self.alt_stream is not None
+                and self.num_fused_shared_experts == 0
+                and hidden_states.shape[0] > 0
+                and hidden_states.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD
+            ):
+                return self.forward_normal_dual_stream(
+                    hidden_states,
+                    should_allreduce_fusion,
+                    use_reduce_scatter,
+                    gemm_output_zero_allocator,
+                )
+            else:
+                return self.forward_normal(
+                    hidden_states,
+                    should_allreduce_fusion,
+                    use_reduce_scatter,
+                    gemm_output_zero_allocator,
+                )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(
+            hidden_states, gemm_output_zero_allocator
+        )
+
+        with torch.cuda.stream(self.alt_stream):
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
+            topk_output = self.topk(hidden_states, router_logits)
+            final_hidden_states = self.experts(hidden_states, topk_output)
+            if not _is_cuda:
+                final_hidden_states *= self.routed_scaling_factor
+
+        current_stream.wait_stream(self.alt_stream)
+        with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
+            final_hidden_states_out = torch.empty_like(final_hidden_states)
+
+        torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
+        final_hidden_states = final_hidden_states_out
+        sm.tag(final_hidden_states)
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+        if hasattr(self, "shared_experts") and use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ):
+            return self.forward_cpu(hidden_states, should_allreduce_fusion)
+
+        if hidden_states.shape[0] > 0:
+            shared_output = self._forward_shared_experts(
+                hidden_states, gemm_output_zero_allocator
+            )
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
+            topk_output = self.topk(hidden_states, router_logits)
+        else:
+            shared_output = None
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if not _is_cuda and not _use_aiter:
+            # fused in biased_grouped_topk so we can skip here
+            final_hidden_states *= self.routed_scaling_factor
+        if shared_output is not None:
+            with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
+                final_hidden_states_out = torch.empty_like(final_hidden_states)
+            torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
+            final_hidden_states = final_hidden_states_out
+            sm.tag(final_hidden_states)
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_cpu(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+    ) -> torch.Tensor:
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        fused_experts_out = self.experts(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+
+        assert use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ) == use_intel_amx_backend(self.shared_experts.down_proj)
+        # [Note] inplace should be False in fused_experts.
+        # If inplace is True in fused_experts (self.experts), hidden_states will be changed after fused_experts
+        # While hidden_states is still needed in shared_expert.
+        final_hidden_states = torch.ops.sgl_kernel.shared_expert_cpu(
+            hidden_states,
+            self.shared_experts.gate_up_proj.weight,
+            self.shared_experts.down_proj.weight,
+            fused_experts_out,
+            self.routed_scaling_factor,
+            True,  # inplace
+            self.shared_experts_is_int8,  # use_int8_w8a8
+            self.shared_experts_is_fp8,  # use_fp8_w8a16
+            (
+                self.shared_experts.gate_up_proj.weight_scale
+                if self.shared_experts_is_int8
+                else (
+                    self.shared_experts.gate_up_proj.weight_scale_inv
+                    if self.shared_experts_is_fp8
+                    else None
+                )
+            ),  # w1_scale
+            (
+                self.shared_experts.down_proj.weight_scale
+                if self.shared_experts_is_int8
+                else (
+                    self.shared_experts.down_proj.weight_scale_inv
+                    if self.shared_experts_is_fp8
+                    else None
+                )
+            ),  # w2_scale
+            (
+                self.shared_experts_weight_block_size
+                if self.shared_experts_is_fp8
+                else None
+            ),  # block_size
+            None,  # a1_scale
+            None,  # a2_scale
+            True,  # is_vnni
+        )
+        if self.tp_size > 1 and not should_allreduce_fusion:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        shared_output = None
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            shared_output = self._forward_shared_experts(hidden_states)
+            topk_weights, topk_idx, _ = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_weights, topk_idx, _ = self.topk.empty_topk_output(
+                hidden_states.device
+            )
+
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            forward_batch=forward_batch,
+        )
+
+        if shared_output is not None:
+            x = shared_output
+            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        else:
+            final_hidden_states *= self.routed_scaling_factor
+
+        return final_hidden_states
+
+    def _forward_shared_experts(
+        self, hidden_states, gemm_output_zero_allocator: BumpAllocator = None
+    ):
+        if self.num_fused_shared_experts == 0:
+            return self.shared_experts(
+                hidden_states, gemm_output_zero_allocator=gemm_output_zero_allocator
+            )
+        else:
+            return None
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_shared_experts(self, state):
+        hidden_states_mlp_input = state.pop("hidden_states_mlp_input")
+        if (self.num_fused_shared_experts == 0) and is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, hidden_states_mlp_input
+        ):
+            state.shared_output = self.shared_experts(hidden_states_mlp_input)
+        else:
+            state.shared_output = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_weights_local, state.topk_idx_local, _ = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_idx_local = torch.full(
+                (0, self.top_k), -1, dtype=torch.int, device=hidden_states.device
+            )
+            state.topk_weights_local = torch.empty(
+                (0, self.top_k), dtype=torch.float32, device=hidden_states.device
+            )
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            self.experts.deepep_dispatcher.dispatch_a(
+                hidden_states=state.hidden_states_mlp_input,
+                topk_idx=state.pop("topk_idx_local"),
+                topk_weights=state.pop("topk_weights_local"),
+                forward_batch=state.forward_batch,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.hidden_states_experts_output = self.experts.moe_impl(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.experts.deepep_dispatcher.combine_a(
+                hidden_states=state.pop("hidden_states_experts_output"),
+                topk_idx=state.dispatch_output.topk_idx,
+                topk_weights=state.dispatch_output.topk_weights,
+                forward_batch=state.forward_batch,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = (
+                self.experts.deepep_dispatcher.combine_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+            )
+
+    def op_output(self, state):
+        final_hidden_states = state.pop("hidden_states_after_combine")
+
+        if (shared_output := state.pop("shared_output")) is not None:
+            x = shared_output
+            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        else:
+            final_hidden_states *= self.routed_scaling_factor
+
+        state.hidden_states_mlp_output = final_hidden_states
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    import math
+
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+class DeepseekV2AttentionMLA(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        layer_id: int = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.num_heads = num_heads
+        assert num_heads % attn_tp_size == 0
+        self.num_local_heads = num_heads // attn_tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        # For tensor parallel attention
+        if self.q_lora_rank is not None:
+            self.fused_qkv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank + self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("fused_qkv_a_proj_with_mqa", prefix),
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_b_proj", prefix),
+                tp_rank=attn_tp_rank,
+                tp_size=attn_tp_size,
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_proj", prefix),
+                tp_rank=attn_tp_rank,
+                tp_size=attn_tp_size,
+            )
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("kv_a_proj_with_mqa", prefix),
+            )
+
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("kv_b_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+        # O projection.
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("o_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+
+        if rope_scaling:
+            rope_scaling["rope_type"] = "deepseek_yarn"
+
+        self.rotary_emb = get_rope_wrapper(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+            device=global_server_args_dict["device"],
+        )
+
+        if rope_scaling:
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+            scaling_factor = rope_scaling["factor"]
+            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+        else:
+            self.rotary_emb.forward = self.rotary_emb.forward_native
+
+        self.attn_mqa = RadixAttention(
+            self.num_local_heads,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=1,
+            layer_id=layer_id,
+            v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
+            prefix=add_prefix("attn_mqa", prefix),
+        )
+
+        self.attn_mha = RadixAttention(
+            self.num_local_heads,
+            self.qk_nope_head_dim + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=self.num_local_heads,
+            layer_id=layer_id,
+            v_head_dim=self.v_head_dim,
+            quant_config=quant_config,
+            prefix=add_prefix("attn_mha", prefix),
+        )
+
+        self.alt_stream = alt_stream
+        self.attn_mha.kv_b_proj = None
+
+        self.w_kc = None
+        self.w_vc = None
+        self.w_scale = 1.0
+
+        self.w_scale_k = None
+        self.w_scale_v = None
+        self.use_deep_gemm_bmm = False
+
+        self.flashinfer_mla_disable_ragged = global_server_args_dict[
+            "flashinfer_mla_disable_ragged"
+        ]
+        self.disable_chunked_prefix_cache = global_server_args_dict[
+            "disable_chunked_prefix_cache"
+        ]
+
+        self.current_attention_backend = (
+            None  # Attention backend used by current forward batch
+        )
+        self.rocm_fused_decode_mla = get_bool_env_var(
+            "SGLANG_ROCM_FUSED_DECODE_MLA", "false"
+        )
+
+        # TODO: Design a finer way to determine the threshold
+        self.chunked_prefix_cache_threshold = get_int_env_var(
+            "SGL_CHUNKED_PREFIX_CACHE_THRESHOLD", 8192
+        )
+
+        # If we have self.fused_qkv_a_proj_with_mqa and we're running on CPU, we will choose the torch.ops.sgl_kernel.qkv_proj_with_rope_fused_weight kernel
+        # which requires self.w_kc and self.w_vc to be packed.
+        # If not, we will use torch.bmm and weight shouldn't be packed in this case
+        has_fused_proj = hasattr(self, "fused_qkv_a_proj_with_mqa")
+        if has_fused_proj and _is_cpu and _is_cpu_amx_available:
+            self.quant_method = PackWeightMethod(
+                weight_names=["w_kc", "w_vc"], transpose_dims=[[1, 2], [1, 2]]
+            )
+
+        is_packed_weight = (
+            has_fused_proj
+            and hasattr(self.fused_qkv_a_proj_with_mqa.quant_method, "quant_config")
+            and self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.get_name()
+            in {"awq", "awq_marlin", "moe_wna16"}
+        )
+        self.use_min_latency_fused_a_gemm = (
+            has_fused_proj
+            and not is_packed_weight
+            and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.bfloat16
+            and self.fused_qkv_a_proj_with_mqa.weight.shape[0] == 2112
+            and self.fused_qkv_a_proj_with_mqa.weight.shape[1] == 7168
+            and _is_cuda
+            and _device_sm >= 90
+        )
+
+        self.qkv_proj_with_rope_is_int8 = (
+            has_fused_proj
+            and not is_packed_weight
+            and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.int8
+        )
+        self.qkv_proj_with_rope_is_fp8 = (
+            has_fused_proj
+            and not is_packed_weight
+            and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.float8_e4m3fn
+        )
+
+        self.weight_block_size = None
+        if self.qkv_proj_with_rope_is_fp8 and _is_cpu and _is_cpu_amx_available:
+            assert getattr(
+                self.fused_qkv_a_proj_with_mqa.quant_method, "block_quant", False
+            ) == getattr(self.q_b_proj.quant_method, "block_quant", False)
+            use_block_quant = getattr(
+                self.fused_qkv_a_proj_with_mqa.quant_method, "block_quant", False
+            )
+
+            if use_block_quant:
+                assert (
+                    self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.weight_block_size
+                    == self.q_b_proj.quant_method.quant_config.weight_block_size
+                )
+                self.weight_block_size = (
+                    self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.weight_block_size
+                )
+
+    def dispatch_attn_forward_method(
+        self, forward_batch: ForwardBatch
+    ) -> AttnForwardMethod:
+        def _dispatch_mla_subtype():
+            if _is_hip:
+                if (
+                    self.rocm_fused_decode_mla
+                    and forward_batch.forward_mode.is_decode()
+                ):
+                    return AttnForwardMethod.MLA_FUSED_ROPE
+                else:
+                    return AttnForwardMethod.MLA
+            else:
+                if hasattr(self, "fused_qkv_a_proj_with_mqa") and use_intel_amx_backend(
+                    self
+                ):
+                    return AttnForwardMethod.MLA_FUSED_ROPE_CPU
+                else:
+                    return AttnForwardMethod.MLA
+
+        # Determine attention backend used by current forward batch
+        if forward_batch.forward_mode.is_decode_or_idle():
+            attention_backend = global_server_args_dict["decode_attention_backend"]
+        elif (
+            forward_batch.forward_mode.is_target_verify()
+            or forward_batch.forward_mode.is_draft_extend()
+        ):
+            # Use the specified backend for speculative operations (both verify and draft extend)
+            if global_server_args_dict["speculative_attention_mode"] == "decode":
+                attention_backend = global_server_args_dict["decode_attention_backend"]
+            else:  # default to prefill
+                attention_backend = global_server_args_dict["prefill_attention_backend"]
+        else:
+            attention_backend = global_server_args_dict["prefill_attention_backend"]
+        self.current_attention_backend = attention_backend
+
+        if attention_backend == "ascend":
+            if (
+                forward_batch.forward_mode.is_extend()
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+            ):
+                return AttnForwardMethod.MHA
+            else:
+                return AttnForwardMethod.MLA
+        elif (
+            attention_backend == "flashinfer"
+            or attention_backend == "fa3"
+            or attention_backend == "flashmla"
+            or attention_backend == "cutlass_mla"
+        ):
+            # Use MHA with chunked KV cache when prefilling on long sequences.
+            sum_extend_prefix_lens = (
+                sum(forward_batch.extend_prefix_lens_cpu)
+                if forward_batch.extend_prefix_lens_cpu is not None
+                else 0
+            )
+            # Flashinfer MLA: Do not absorb when enabling ragged prefill
+            disable_ragged = (
+                attention_backend == "flashinfer" or attention_backend == "flashmla"
+            ) and self.flashinfer_mla_disable_ragged
+            if (
+                not disable_ragged
+                and forward_batch.forward_mode.is_extend()
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+                and (
+                    (
+                        sum_extend_prefix_lens >= self.chunked_prefix_cache_threshold
+                        and not self.disable_chunked_prefix_cache
+                    )
+                    or sum_extend_prefix_lens == 0
+                )
+            ):
+                return AttnForwardMethod.MHA_CHUNKED_KV
+            else:
+                return _dispatch_mla_subtype()
+        elif attention_backend == "trtllm_mla":
+            if (
+                forward_batch.forward_mode.is_extend()
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+            ):
+                return AttnForwardMethod.MHA_CHUNKED_KV
+            else:
+                return _dispatch_mla_subtype()
+        elif attention_backend == "aiter":
+            if (
+                forward_batch.forward_mode.is_extend()
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+            ):
+                if is_dp_attention_enabled():
+                    if sum(forward_batch.extend_prefix_lens_cpu) == 0:
+                        return AttnForwardMethod.MHA
+                    else:
+                        return AttnForwardMethod.MLA
+                else:
+                    return AttnForwardMethod.MHA
+            else:
+                return AttnForwardMethod.MLA
+        else:
+            # Triton: Use normal computation for prefill and use weight absorption for extend/decode
+            if (
+                forward_batch.forward_mode.is_extend()
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+                and sum(forward_batch.extend_prefix_lens_cpu) == 0
+            ):
+                return AttnForwardMethod.MHA
+            else:
+                return _dispatch_mla_subtype()
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+            zero_allocator=state.zero_allocator,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            zero_allocator=zero_allocator,
+        )
+        return self.forward_core(s)
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        if self.attn_mha.kv_b_proj is None:
+            self.attn_mha.kv_b_proj = self.kv_b_proj
+
+        # when hidden_states is a tuple of tensors, the tuple will include quantized weight and scale tensor
+        if isinstance(hidden_states, tuple):
+            if hidden_states[0].shape[0] == 0:
+                assert (
+                    not self.o_proj.reduce_results
+                ), "short-circuiting allreduce will lead to hangs"
+                return hidden_states[0]
+        else:
+            if hidden_states.shape[0] == 0:
+                assert (
+                    not self.o_proj.reduce_results
+                ), "short-circuiting allreduce will lead to hangs"
+                return hidden_states, None, forward_batch, None
+
+        attn_forward_method = self.dispatch_attn_forward_method(forward_batch)
+
+        if attn_forward_method == AttnForwardMethod.MHA:
+            inner_state = self.forward_normal_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MHA_CHUNKED_KV:
+            inner_state = self.forward_normal_chunked_kv_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MLA:
+            inner_state = self.forward_absorb_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE:
+            inner_state = self.forward_absorb_fused_mla_rope_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE_CPU:
+            inner_state = self.forward_absorb_fused_mla_rope_cpu_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        else:
+            raise NotImplementedError
+        return None, attn_forward_method, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, attn_forward_method, forward_batch, inner_state = (
+            intermediate_state
+        )
+        if inner_state is None:
+            return hidden_states
+
+        if attn_forward_method == AttnForwardMethod.MHA:
+            return self.forward_normal_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MHA_CHUNKED_KV:
+            return self.forward_normal_chunked_kv_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MLA:
+            return self.forward_absorb_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE:
+            return self.forward_absorb_fused_mla_rope_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE_CPU:
+            return self.forward_absorb_fused_mla_rope_cpu_core(*inner_state)
+        else:
+            raise NotImplementedError
+
+    def forward_normal_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        if self.q_lora_rank is not None:
+            q, latent_cache = self.fused_qkv_a_proj_with_mqa(hidden_states)[0].split(
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], dim=-1
+            )
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+            latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+
+        _, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        latent_cache = latent_cache.unsqueeze(1)
+        kv_a = self.kv_a_layernorm(kv_a)
+        kv = self.kv_b_proj(kv_a)[0]
+        kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim)
+        k_nope = kv[..., : self.qk_nope_head_dim]
+        v = kv[..., self.qk_nope_head_dim :]
+        k_pe = latent_cache[:, :, self.kv_lora_rank :]
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        q[..., self.qk_nope_head_dim :] = q_pe
+        k = torch.empty_like(q)
+        k[..., : self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim :] = k_pe
+
+        if not _is_npu:
+            latent_cache[:, :, : self.kv_lora_rank] = kv_a.unsqueeze(1)
+            latent_cache[:, :, self.kv_lora_rank :] = k_pe
+
+            # Save latent cache
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                self.attn_mha, forward_batch.out_cache_loc, latent_cache, None
+            )
+        else:
+            # To reduce a time-costing split operation
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                self.attn_mha, forward_batch.out_cache_loc, kv_a.unsqueeze(1), k_pe
+            )
+
+        return q, k, v, forward_batch
+
+    def forward_normal_core(self, q, k, v, forward_batch):
+        attn_output = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
+        attn_output = attn_output.reshape(-1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def _fuse_rope_for_trtllm_mla(self, forward_batch: ForwardBatch) -> bool:
+        """
+        Check if we should skip rope and do fused rope+quantize for TRTLLM MLA decode in fp8_e4m3 path.
+        """
+        return (
+            self.current_attention_backend == "trtllm_mla"
+            and forward_batch.forward_mode.is_decode_or_idle()
+            and forward_batch.attn_backend.data_type == torch.float8_e4m3fn
+        )
+
+    def forward_absorb_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+
+        if self.q_lora_rank is not None:
+            if (
+                (not isinstance(hidden_states, tuple))
+                and hidden_states.shape[0] <= 16
+                and self.use_min_latency_fused_a_gemm
+            ):
+                fused_qkv_a_proj_out = dsv3_fused_a_gemm(
+                    hidden_states, self.fused_qkv_a_proj_with_mqa.weight.T
+                )
+            else:
+                fused_qkv_a_proj_out = self.fused_qkv_a_proj_with_mqa(hidden_states)[0]
+            q, latent_cache = fused_qkv_a_proj_out.split(
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], dim=-1
+            )
+            k_nope = latent_cache[..., : self.kv_lora_rank]
+
+            # overlap qk norm
+            if self.alt_stream is not None and get_is_capture_mode():
+                current_stream = torch.cuda.current_stream()
+                self.alt_stream.wait_stream(current_stream)
+                q = self.q_a_layernorm(q)
+                with torch.cuda.stream(self.alt_stream):
+                    k_nope = self.kv_a_layernorm(k_nope)
+                current_stream.wait_stream(self.alt_stream)
+            else:
+                if _use_aiter_gfx95 and self.q_b_proj.weight.dtype == torch.uint8:
+                    q, k_nope = fused_rms_mxfp4_quant(
+                        q,
+                        self.q_a_layernorm.weight,
+                        self.q_a_layernorm.variance_epsilon,
+                        k_nope,
+                        self.kv_a_layernorm.weight,
+                        self.kv_a_layernorm.variance_epsilon,
+                    )
+                else:
+                    q = self.q_a_layernorm(q)
+                    k_nope = self.kv_a_layernorm(k_nope)
+
+            k_nope = k_nope.unsqueeze(1)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+            latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+            k_nope = latent_cache[..., : self.kv_lora_rank]
+            k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)
+
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        k_pe = latent_cache[..., self.kv_lora_rank :].unsqueeze(1)
+
+        if self.use_deep_gemm_bmm:
+            q_nope_val, q_nope_scale, masked_m, expected_m, aligned_m = (
+                per_token_group_quant_mla_deep_gemm_masked_fp8(q_nope.transpose(0, 1))
+            )
+            q_nope_out = q_nope.new_empty(
+                (self.num_local_heads, aligned_m, self.kv_lora_rank)
+            )
+            deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+                (q_nope_val, q_nope_scale),
+                (self.w_kc, self.w_scale_k),
+                q_nope_out,
+                masked_m,
+                expected_m,
+            )
+            q_nope_out = q_nope_out[:, :expected_m, :]
+        elif _is_hip:
+            # TODO(haishaw): add bmm_fp8 to ROCm
+            if _use_aiter_gfx95 and self.w_kc.dtype == torch.uint8:
+                x = q_nope.transpose(0, 1)
+                q_nope_out = torch.empty(
+                    x.shape[0],
+                    x.shape[1],
+                    self.w_kc.shape[2],
+                    device=x.device,
+                    dtype=torch.bfloat16,
+                )
+                batched_gemm_afp4wfp4_pre_quant(
+                    x,
+                    self.w_kc.transpose(-2, -1),
+                    self.w_scale_k.transpose(-2, -1),
+                    torch.bfloat16,
+                    q_nope_out,
+                )
+            else:
+                q_nope_out = torch.bmm(
+                    q_nope.to(torch.bfloat16).transpose(0, 1),
+                    self.w_kc.to(torch.bfloat16) * self.w_scale,
+                )
+        elif self.w_kc.dtype == torch.float8_e4m3fn:
+            q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
+                q_nope.transpose(0, 1),
+                zero_allocator.allocate(1),
+            )
+            q_nope_out = bmm_fp8(
+                q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
+            )
+        else:
+            q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
+
+        q_nope_out = q_nope_out.transpose(0, 1)
+
+        if not self._fuse_rope_for_trtllm_mla(forward_batch) and (
+            not _use_aiter or not _is_gfx95_supported
+        ):
+            q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        return q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator, positions
+
+    def forward_absorb_core(
+        self, q_pe, k_pe, q_nope_out, k_nope, forward_batch, zero_allocator, positions
+    ):
+        if (
+            self.current_attention_backend == "fa3"
+            or self.current_attention_backend == "flashinfer"
+            or self.current_attention_backend == "cutlass_mla"
+            or self.current_attention_backend == "trtllm_mla"
+            or self.current_attention_backend == "ascend"
+        ):
+            extra_args = {}
+            if self._fuse_rope_for_trtllm_mla(forward_batch):
+                extra_args = {
+                    "cos_sin_cache": self.rotary_emb.cos_sin_cache,
+                    "is_neox": self.rotary_emb.is_neox_style,
+                }
+            attn_output = self.attn_mqa(
+                q_nope_out,
+                k_nope,
+                k_nope,
+                forward_batch,
+                q_rope=q_pe,
+                k_rope=k_pe,
+                **extra_args,
+            )
+        else:
+            if _use_aiter_gfx95:
+                cos = self.rotary_emb.cos_cache
+                sin = self.rotary_emb.sin_cache
+                q, k = fused_qk_rope_cat(
+                    q_nope_out,
+                    q_pe,
+                    k_nope,
+                    k_pe,
+                    positions,
+                    cos,
+                    sin,
+                    self.rotary_emb.is_neox_style,
+                )
+            else:
+                q = torch.cat([q_nope_out, q_pe], dim=-1)
+                k = torch.cat([k_nope, k_pe], dim=-1)
+
+            attn_output = self.attn_mqa(q, k, k_nope, forward_batch)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        if self.use_deep_gemm_bmm:
+            attn_output_val, attn_output_scale, masked_m, expected_m, aligned_m = (
+                per_token_group_quant_mla_deep_gemm_masked_fp8(
+                    attn_output.transpose(0, 1)
+                )
+            )
+            attn_bmm_output = attn_output.new_empty(
+                (self.num_local_heads, aligned_m, self.v_head_dim)
+            )
+            deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
+                (attn_output_val, attn_output_scale),
+                (self.w_vc, self.w_scale_v),
+                attn_bmm_output,
+                masked_m,
+                expected_m,
+            )
+            attn_bmm_output = (
+                attn_bmm_output[:, :expected_m, :].transpose(0, 1).flatten(1, 2)
+            )
+        elif _is_hip:
+            # TODO(haishaw): add bmm_fp8 to ROCm
+            if _use_aiter_gfx95 and self.w_vc.dtype == torch.uint8:
+                x = attn_output.transpose(0, 1)
+                attn_bmm_output = torch.empty(
+                    x.shape[0],
+                    x.shape[1],
+                    self.w_vc.shape[2],
+                    device=x.device,
+                    dtype=torch.bfloat16,
+                )
+                batched_gemm_afp4wfp4_pre_quant(
+                    x,
+                    self.w_vc.transpose(-2, -1),
+                    self.w_scale_v.transpose(-2, -1),
+                    torch.bfloat16,
+                    attn_bmm_output,
+                )
+            else:
+                attn_bmm_output = torch.bmm(
+                    attn_output.to(torch.bfloat16).transpose(0, 1),
+                    self.w_vc.to(torch.bfloat16) * self.w_scale,
+                )
+
+            if self.o_proj.weight.dtype == torch.uint8:
+                attn_bmm_output = attn_bmm_output.transpose(0, 1)
+                attn_bmm_output = fused_flatten_mxfp4_quant(attn_bmm_output)
+            else:
+                attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+
+        elif self.w_vc.dtype == torch.float8_e4m3fn:
+            attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
+                attn_output.transpose(0, 1),
+                zero_allocator.allocate(1),
+            )
+            attn_bmm_output = bmm_fp8(
+                attn_output_val,
+                self.w_vc,
+                attn_output_scale,
+                self.w_scale,
+                torch.bfloat16,
+            )
+            attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+        else:
+            attn_bmm_output = torch.empty(
+                (attn_output.shape[0], self.num_local_heads * self.v_head_dim),
+                dtype=attn_output.dtype,
+                device=attn_output.device,
+            )
+            torch.bmm(
+                attn_output.transpose(0, 1),
+                self.w_vc,
+                out=attn_bmm_output.view(
+                    -1, self.num_local_heads, self.v_head_dim
+                ).transpose(0, 1),
+            )
+        output, _ = self.o_proj(attn_bmm_output)
+
+        return output
+
+    def forward_absorb_fused_mla_rope_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        enable_rope_fusion = (
+            os.getenv("SGLANG_FUSED_MLA_ENABLE_ROPE_FUSION", "1") == "1"
+        )
+        q_len = hidden_states.shape[0]
+        q_input = hidden_states.new_empty(
+            q_len, self.num_local_heads, self.kv_lora_rank + self.qk_rope_head_dim
+        )
+        if self.q_lora_rank is not None:
+            q, latent_cache = self.fused_qkv_a_proj_with_mqa(hidden_states)[0].split(
+                [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], dim=-1
+            )
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+            latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        if _is_hip:
+            # TODO(haishaw): add bmm_fp8 to ROCm
+            q_nope_out = torch.bmm(
+                q_nope.to(torch.bfloat16).transpose(0, 1),
+                self.w_kc.to(torch.bfloat16) * self.w_scale,
+            )
+        elif self.w_kc.dtype == torch.float8_e4m3fn:
+            q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
+                q_nope.transpose(0, 1),
+                zero_allocator.allocate(1),
+                dtype=torch.float8_e4m3fn,
+            )
+            q_nope_out = bmm_fp8(
+                q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
+            )
+        else:
+            q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
+        q_input[..., : self.kv_lora_rank] = q_nope_out.transpose(0, 1)
+        v_input = latent_cache[..., : self.kv_lora_rank]
+        v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
+        k_input = latent_cache.unsqueeze(1)
+        k_input[..., : self.kv_lora_rank] = v_input
+
+        if not enable_rope_fusion:
+            k_pe = k_input[..., self.kv_lora_rank :]
+            q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+            q_input[..., self.kv_lora_rank :] = q_pe
+            k_input[..., self.kv_lora_rank :] = k_pe
+            k_pe_output = None
+        else:
+            k_pe_output = torch.empty_like(k_input[..., self.kv_lora_rank :])
+
+        q_input[..., self.kv_lora_rank :] = q_pe
+
+        # attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
+        # Use Fused ROPE with use_rope=OFF.
+        attn_output = torch.empty(
+            (q_len, self.num_local_heads, self.kv_lora_rank),
+            dtype=q.dtype,
+            device=q.device,
+        )
+        attn_logits, _, kv_indptr, kv_indices, _, _, _ = (
+            forward_batch.attn_backend.forward_metadata
+        )
+        cos_sin_cache = self.rotary_emb.cos_sin_cache
+        num_kv_split = forward_batch.attn_backend.num_kv_splits
+        sm_scale = self.attn_mqa.scaling
+        if attn_logits is None:
+            attn_logits = torch.empty(
+                (
+                    forward_batch.batch_size,
+                    self.num_local_heads,
+                    num_kv_split,
+                    self.kv_lora_rank + 1,
+                ),
+                dtype=torch.float32,
+                device=q.device,
+            )
+
+        # save current latent cache.
+        forward_batch.token_to_kv_pool.set_kv_buffer(
+            self.attn_mqa, forward_batch.out_cache_loc, k_input, None
+        )
+        key_cache_buf = forward_batch.token_to_kv_pool.get_key_buffer(
+            self.attn_mqa.layer_id
+        )
+        val_cache_buf = key_cache_buf[..., : self.kv_lora_rank]
+
+        return (
+            q_input,
+            key_cache_buf,
+            val_cache_buf,
+            attn_output,
+            kv_indptr,
+            kv_indices,
+            k_pe_output,
+            cos_sin_cache,
+            positions,
+            attn_logits,
+            num_kv_split,
+            sm_scale,
+            enable_rope_fusion,
+            k_input,
+            forward_batch,
+            zero_allocator,
+        )
+
+    def forward_absorb_fused_mla_rope_cpu_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        assert self.q_lora_rank is not None and use_intel_amx_backend(
+            self
+        ), "forward_absorb_fused_mla_rope_cpu_prepare requires q_lora_rank is not None and use_intel_amx_backend"
+
+        q_input, k_input, v_input = (
+            torch.ops.sgl_kernel.qkv_proj_with_rope_fused_weight(
+                hidden_states,
+                self.fused_qkv_a_proj_with_mqa.weight,
+                self.q_b_proj.weight,
+                self.w_kc,
+                self.q_a_layernorm.weight,
+                self.kv_a_layernorm.weight,
+                positions,
+                self.rotary_emb.cos_sin_cache,
+                self.kv_a_layernorm.variance_epsilon,
+                self.qkv_proj_with_rope_is_int8,
+                self.qkv_proj_with_rope_is_fp8,
+                (
+                    self.fused_qkv_a_proj_with_mqa.weight_scale
+                    if self.qkv_proj_with_rope_is_int8
+                    else (
+                        self.fused_qkv_a_proj_with_mqa.weight_scale_inv
+                        if self.qkv_proj_with_rope_is_fp8
+                        else None
+                    )
+                ),
+                (
+                    self.q_b_proj.weight_scale
+                    if self.qkv_proj_with_rope_is_int8
+                    else (
+                        self.q_b_proj.weight_scale_inv
+                        if self.qkv_proj_with_rope_is_fp8
+                        else None
+                    )
+                ),
+                True,  # is_vnni
+                self.weight_block_size,
+                self.q_lora_rank,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+            )
+        )
+        return (q_input, k_input, v_input, forward_batch, zero_allocator)
+
+    def forward_absorb_fused_mla_rope_core(
+        self,
+        q_input,
+        key_cache_buf,
+        val_cache_buf,
+        attn_output,
+        kv_indptr,
+        kv_indices,
+        k_pe_output,
+        cos_sin_cache,
+        positions,
+        attn_logits,
+        num_kv_split,
+        sm_scale,
+        enable_rope_fusion,
+        k_input,
+        forward_batch,
+        zero_allocator,
+    ):
+        decode_attention_fwd_grouped_rope(
+            q_input,
+            key_cache_buf,
+            val_cache_buf,
+            attn_output,
+            kv_indptr,
+            kv_indices,
+            k_pe_output,
+            self.kv_lora_rank,
+            self.rotary_emb.rotary_dim,
+            cos_sin_cache,
+            positions,
+            attn_logits,
+            num_kv_split,
+            sm_scale,
+            logit_cap=self.attn_mqa.logit_cap,
+            use_rope=enable_rope_fusion,
+            is_neox_style=self.rotary_emb.is_neox_style,
+        )
+
+        if enable_rope_fusion:
+            k_input[..., self.kv_lora_rank :] = k_pe_output
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                self.attn_mqa, forward_batch.out_cache_loc, k_input, None
+            )
+
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        if _is_hip:
+            # TODO(haishaw): add bmm_fp8 to ROCm
+            attn_bmm_output = torch.bmm(
+                attn_output.to(torch.bfloat16).transpose(0, 1),
+                self.w_vc.to(torch.bfloat16) * self.w_scale,
+            )
+        elif self.w_vc.dtype == torch.float8_e4m3fn:
+            attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
+                attn_output.transpose(0, 1),
+                zero_allocator.allocate(1),
+                dtype=torch.float8_e4m3fn,
+            )
+            attn_bmm_output = bmm_fp8(
+                attn_output_val,
+                self.w_vc,
+                attn_output_scale,
+                self.w_scale,
+                torch.bfloat16,
+            )
+        else:
+            attn_bmm_output = torch.bmm(attn_output.transpose(0, 1), self.w_vc)
+        attn_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+        output, _ = self.o_proj(attn_output)
+
+        return output
+
+    def forward_absorb_fused_mla_rope_cpu_core(
+        self, q_input, k_input, v_input, forward_batch, zero_allocator
+    ):
+        assert self.q_lora_rank is not None and use_intel_amx_backend(
+            self
+        ), "forward_absorb_fused_mla_rope_cpu_core requires q_lora_rank is not None and use_intel_amx_backend"
+
+        attn_output = self.attn_mqa(q_input, k_input, v_input, forward_batch)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        # [Note] Align shapes of bmm inputs.
+        # Shapes of inputs:
+        #   q_nope: [M, B, K]
+        #   original self.w_kc: [B, K, N]
+        #   current self.w_kc (which has been converted in PackWeightMethod): [B, N, K]
+
+        # Shapes of inputs to sgl_kernel.cpu.bmm:
+        #   out: [B, M, N]
+        #   mat1: [B, M, K]
+        #   mat2: [B, N, K]
+        B = self.w_vc.size(0)
+        N = self.w_vc.size(1)
+        M = attn_output.size(0)
+        output = torch.empty([M, int(B * N)], dtype=attn_output.dtype)
+        attn_bmm_output = output.view([M, B, N]).transpose_(0, 1)
+        torch.ops.sgl_kernel.bmm_cpu(
+            attn_bmm_output,
+            attn_output.transpose(0, 1),
+            self.w_vc,
+            True,  # is_vnni
+            None,  # scale
+        )
+        attn_output = output
+        output, _ = self.o_proj(attn_output)
+
+        return output
+
+    def _chunked_prefix_attn_mha(
+        self,
+        q: torch.Tensor,
+        accum_output: torch.Tensor,
+        accum_lse: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+
+        assert forward_batch.num_prefix_chunks is not None
+        for i in range(forward_batch.num_prefix_chunks):
+            forward_batch.set_prefix_chunk_idx(i)
+
+            # Fetch latent cache from memory pool with precomputed chunked kv indices
+            latent_cache_buf = forward_batch.token_to_kv_pool.get_key_buffer(
+                self.attn_mha.layer_id
+            )
+            latent_cache = (
+                latent_cache_buf[forward_batch.prefix_chunk_kv_indices[i]]
+                .contiguous()
+                .to(q.dtype)
+            )
+
+            kv_a_normed, k_pe = latent_cache.split(
+                [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+            )
+            kv_a_normed = kv_a_normed.squeeze(1).contiguous()
+            kv = self.kv_b_proj(kv_a_normed)[0]
+            kv = kv.view(
+                -1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim
+            )
+            v = kv[..., self.qk_nope_head_dim :]
+            k_nope = kv[..., : self.qk_nope_head_dim]
+
+            k = torch.empty(
+                (
+                    k_nope.shape[0],
+                    self.num_local_heads,
+                    self.qk_nope_head_dim + self.qk_rope_head_dim,
+                ),
+                dtype=v.dtype,
+                device=v.device,
+            )
+            k[..., : self.qk_nope_head_dim] = k_nope
+            k[..., self.qk_nope_head_dim :] = k_pe
+
+            output, lse = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
+            tmp_output = torch.empty_like(accum_output)
+            tmp_lse = torch.empty_like(accum_lse)
+            merge_state_v2(output, lse, accum_output, accum_lse, tmp_output, tmp_lse)
+            accum_output, accum_lse = tmp_output, tmp_lse
+
+        return accum_output
+
+    def forward_normal_chunked_kv_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ):
+        # In normal mha, the k and v tensors will become overly large when the prefix length is long.
+        # To avoid this, we split the kv cache into chunks and process them one after another.
+        # Since mha is compute friendly, the for loop induced here will not introduce significant overhead.
+        # The top comments in https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/mla/common.py
+        # will be helpful for understanding the purpose of this function.
+
+        # First do normal mha forward to get output for extended part
+        return self.forward_normal_prepare(
+            positions, hidden_states, forward_batch, zero_allocator
+        )
+
+    def forward_normal_chunked_kv_core(self, q, k, v, forward_batch):
+        has_extend_prefix = any(forward_batch.extend_prefix_lens_cpu)
+        # Only initialize the info once
+        if has_extend_prefix and forward_batch.num_prefix_chunks is None:
+            forward_batch.prepare_chunked_prefix_cache_info(q.device)
+            if hasattr(forward_batch.attn_backend, "init_mha_chunk_metadata"):
+                forward_batch.attn_backend.init_mha_chunk_metadata(forward_batch)
+
+        forward_batch.mha_return_lse = has_extend_prefix
+        # Do mha for extended part without prefix
+        forward_batch.set_attn_attend_prefix_cache(False)
+        attn_output = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
+
+        # Do mha attention with chunked prefix cache if there are any sequence with prefix
+        if has_extend_prefix:
+            attn_output, lse = attn_output
+            forward_batch.set_attn_attend_prefix_cache(True)
+            attn_output = self._chunked_prefix_attn_mha(
+                q=q,
+                accum_output=attn_output,
+                accum_lse=lse,
+                forward_batch=forward_batch,
+            )
+
+        attn_output = attn_output.reshape(-1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class DeepseekV2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        is_nextn: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.config = config
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.speculative_algorithm = global_server_args_dict["speculative_algorithm"]
+        self.layer_id = layer_id
+        self.is_nextn = is_nextn
+        self.self_attn = DeepseekV2AttentionMLA(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=(
+                config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+            ),
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            reduce_results=False,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+
+        self.is_layer_sparse = self._is_layer_sparse(layer_id, is_nextn=is_nextn)
+        is_previous_layer_sparse = self._is_layer_sparse(layer_id - 1, is_nextn=False)
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=1 if is_nextn else config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = DeepseekV2MoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                layer_id=self.layer_id,
+                alt_stream=alt_stream,
+                is_nextn=is_nextn,
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(
+                is_nextn or (self.layer_id == self.config.num_hidden_layers - 1)
+            ),
+        )
+
+    def _is_layer_sparse(self, layer_id: int, is_nextn: bool) -> bool:
+        return is_nextn or (
+            self.config.n_routed_experts is not None
+            and layer_id >= self.config.first_k_dense_replace
+            and layer_id % self.config.moe_layer_freq == 0
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+
+        quant_format = (
+            "mxfp4"
+            if _is_gfx95_supported
+            and getattr(self.self_attn, "fused_qkv_a_proj_with_mqa", None) is not None
+            and getattr(self.self_attn.fused_qkv_a_proj_with_mqa, "weight", None)
+            is not None
+            and self.self_attn.fused_qkv_a_proj_with_mqa.weight.dtype == torch.uint8
+            else ""
+        )
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states,
+            residual,
+            forward_batch,
+            quant_format,
+        )
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            zero_allocator=zero_allocator,
+        )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        if isinstance(self.mlp, DeepseekV2MLP):
+            gemm_output_zero_allocator = None
+
+        hidden_states = self.mlp(
+            hidden_states,
+            forward_batch,
+            should_allreduce_fusion,
+            use_reduce_scatter,
+            gemm_output_zero_allocator,
+        )
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+
+        if not should_allreduce_fusion:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                zero_allocator=zero_allocator,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        if not (
+            enable_moe_dense_fully_dp()
+            and (not self.is_layer_sparse)
+            and hidden_states.shape[0] == 0
+        ):
+            state.hidden_states_mlp_output = self.mlp(
+                hidden_states, state.forward_batch
+            )
+        else:
+            state.hidden_states_mlp_output = hidden_states
+
+    def op_comm_postprocess_layer(self, state):
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            zero_allocator=state.zero_allocator,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+
+        state.clear(
+            expect_keys={
+                "positions",
+                "forward_batch",
+                "zero_allocator",
+                "tbo_subbatch_index",
+            }
+        )
+        return output
+
+
+class DeepseekV2Model(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_id = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.first_k_dense_replace = config.first_k_dense_replace
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                enable_tp=not is_dp_attention_enabled(),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: DeepseekV2DecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=self.alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+            offloader_kwargs=dict(
+                submodule_accessor=lambda layer: (
+                    layer.mlp.experts
+                    if isinstance(layer.mlp, DeepseekV2MoE)
+                    else layer.mlp
+                ),
+                whitelist_param_names_creator=lambda module: (
+                    [
+                        "w13_weight",
+                        "w2_weight",
+                        "w13_blockscale_swizzled",
+                        "w2_blockscale_swizzled",
+                    ]
+                    if isinstance(module, FusedMoE)
+                    else []
+                ),
+            ),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        self.gemm_output_zero_allocator_size = 0
+        if (
+            _use_aiter_gfx95
+            and config.n_routed_experts == 256
+            and self.embed_tokens.embedding_dim == 7168
+        ):
+            num_moe_layers = sum(
+                [
+                    1
+                    for i in range(len(self.layers))
+                    if isinstance(self.layers[i].mlp, DeepseekV2MoE)
+                ]
+            )
+
+            allocate_size = 0
+            for i in range(len(self.layers)):
+                if isinstance(self.layers[i].mlp, DeepseekV2MoE):
+                    allocate_size = self.layers[
+                        i
+                    ].mlp.shared_experts.gate_up_proj.output_size_per_partition
+                    break
+
+            self.gemm_output_zero_allocator_size = (
+                get_dsv3_gemm_output_zero_allocator_size(
+                    config.n_routed_experts,
+                    num_moe_layers,
+                    allocate_size,
+                    self.embed_tokens.embedding_dim,
+                )
+            )
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        total_num_layers = self.end_layer - self.start_layer
+        device = input_embeds.device if input_embeds is not None else input_ids.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+            dtype=torch.float32,
+            device=device,
+        )
+
+        has_gemm_output_zero_allocator = hasattr(
+            self, "gemm_output_zero_allocator_size"
+        )
+
+        gemm_output_zero_allocator = (
+            BumpAllocator(
+                buffer_size=self.gemm_output_zero_allocator_size,
+                dtype=torch.float32,
+                device=device,
+            )
+            if has_gemm_output_zero_allocator
+            and self.gemm_output_zero_allocator_size > 0
+            else None
+        )
+
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        normal_start_layer = self.start_layer
+        normal_end_layer = self.end_layer
+        if forward_batch.can_run_tbo:
+            if (
+                self.first_k_dense_replace > normal_start_layer
+                and self.first_k_dense_replace < normal_end_layer
+            ):
+                normal_end_layer = self.first_k_dense_replace
+            elif self.first_k_dense_replace < normal_start_layer:
+                normal_end_layer = normal_start_layer = 0
+
+        for i in range(normal_start_layer, normal_end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions,
+                    hidden_states,
+                    forward_batch,
+                    residual,
+                    zero_allocator,
+                    gemm_output_zero_allocator,
+                )
+
+        if normal_end_layer != self.end_layer:
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers[normal_end_layer : self.end_layer],
+                enable_tbo=True,
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+                input_data_scatter_mode=self.layers[
+                    normal_end_layer - 1
+                ].layer_scatter_modes.layer_output_mode,
+                zero_allocator=zero_allocator,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if not forward_batch.forward_mode.is_idle():
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class DeepseekV2ForCausalLM(nn.Module):
+    # for quark model load
+    packed_modules_mapping = {}
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        # for quark model load
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.determine_num_fused_shared_experts()
+        self.model = DeepseekV2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: layer.mlp.get_moe_weights()
+                for layer_id, layer in enumerate(self.model.layers)
+                if isinstance(layer.mlp, DeepseekV2MoE)
+            }
+        )
+
+    @property
+    def routed_experts_weights_of_layer(self):
+        return self._routed_experts_weights_of_layer.value
+
+    def determine_num_fused_shared_experts(
+        self, architecture: str = "DeepseekV3ForCausalLM"
+    ):
+        self.num_fused_shared_experts = 0
+        if global_server_args_dict["disable_shared_experts_fusion"]:
+            return
+
+        # Only Deepseek V3/R1 can use shared experts fusion optimization now.
+        disable_reason = None
+        if (
+            not _is_cuda
+            or torch.cuda.get_device_capability("cuda") < (8, 0)
+            or self.config.architectures[0] != architecture
+            or self.config.n_routed_experts != 256
+            or self.config.n_shared_experts != 1
+        ):
+            disable_reason = "Only Deepseek V3/R1 on NV-platform with capability >= 80 can use shared experts fusion optimization."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek V3/R1 can not use shared experts fusion optimization under expert parallelism."
+        elif self.quant_config.get_name() == "w4afp8":
+            disable_reason = "Deepseek V3/R1 W4AFP8 model uses different quant method for routed experts and shared experts."
+
+        if disable_reason is not None:
+            global_server_args_dict["disable_shared_experts_fusion"] = True
+            self.num_fused_shared_experts = 0
+            log_info_on_rank0(
+                logger,
+                f"{disable_reason} Shared experts fusion optimization is disabled.",
+            )
+            return
+
+        self.num_fused_shared_experts = self.config.n_shared_experts
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids, positions, forward_batch, input_embeds, pp_proxy_tensors
+        )
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def post_load_weights(self, is_nextn=False, weight_names=None):
+
+        # Perform post-processing after loading weights
+        if is_nextn:
+            layer_ids = [self.config.num_hidden_layers]
+        else:
+            if weight_names is None:
+                layer_ids = range(self.model.start_layer, self.model.end_layer)
+            else:
+                layer_ids = set()
+                for name in weight_names:
+                    if "kv_b_proj" in name:
+                        layer_id = int(name.split(".")[2])
+                        if layer_id < self.config.num_hidden_layers:
+                            layer_ids.add(layer_id)
+
+        for layer_id in layer_ids:
+            self_attn = (
+                self.model.layers[layer_id].self_attn
+                if not is_nextn
+                else self.model.decoder.self_attn
+            )
+            if hasattr(self_attn.kv_b_proj, "qweight"):
+                # AWQ compatible
+                if _is_cuda or _is_hip:
+                    w = awq_dequantize(
+                        self_attn.kv_b_proj.qweight,
+                        self_attn.kv_b_proj.scales,
+                        self_attn.kv_b_proj.qzeros,
+                    ).T
+                else:
+                    w = awq_dequantize(
+                        self_attn.kv_b_proj.qweight,
+                        self_attn.kv_b_proj.scales,
+                        self_attn.kv_b_proj.qzeros,
+                        0,
+                        0,
+                        0,
+                    ).T
+            else:
+                w = self_attn.kv_b_proj.weight
+            # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
+            # This may affect the accuracy of fp8 model.
+            # Fix deepseek v3 blockwise bmm by using deep_gemm
+            use_deep_gemm_bmm = False
+
+            if w.dtype in (
+                torch.float8_e4m3fn,
+                torch.float8_e4m3fnuz,
+            ):
+                if (
+                    hasattr(self.quant_config, "weight_block_size")
+                    and self.quant_config.weight_block_size is not None
+                ):
+                    weight_block_size = self.quant_config.weight_block_size
+                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                    if _is_fp8_fnuz:
+                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                            weight=w,
+                            weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                            input_scale=None,
+                        )
+                    else:
+                        weight = w
+                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
+                    if (
+                        _is_cuda
+                        and weight_block_size[0] == 128
+                        and weight_block_size[1] == 128
+                    ):
+                        if (
+                            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                            and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+                            and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+                        ):
+                            block_scale = weight_scale
+                            use_deep_gemm_bmm = True
+                        else:
+                            w = block_quant_dequant(
+                                weight,
+                                weight_scale,
+                                weight_block_size,
+                                torch.bfloat16,
+                            )
+                    else:
+                        w, scale = block_quant_to_tensor_quant(
+                            weight, weight_scale, weight_block_size
+                        )
+                        self_attn.w_scale = scale
+                else:
+                    if _is_fp8_fnuz:
+                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                            weight=w,
+                            weight_scale=self_attn.kv_b_proj.weight_scale,
+                            input_scale=None,
+                        )
+                    else:
+                        weight = w
+                        weight_scale = self_attn.kv_b_proj.weight_scale
+
+                    w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                    self_attn.w_scale = scale
+
+            if w.dtype == torch.int8:
+                if hasattr(self.quant_config, "weight_block_size"):
+                    # block-wise int8 need it
+                    weight_block_size = self.quant_config.weight_block_size
+                    if weight_block_size is not None:
+                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        weight = w
+                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                        w = int8_block_dequant(
+                            weight, weight_scale, weight_block_size
+                        ).to(torch.bfloat16)
+                else:
+                    # channel-wise int8 need it
+                    w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                        torch.bfloat16
+                    )
+
+            w_kc, w_vc = w.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+
+            if (
+                _use_aiter_gfx95
+                and self.quant_config is not None
+                and self.quant_config.get_name() == "quark"
+            ):
+                w_kc, self_attn.w_scale_k, w_vc, self_attn.w_scale_v = (
+                    quark_post_load_weights(self_attn, w, "mxfp4")
+                )
+
+            if not use_deep_gemm_bmm:
+                self_attn.w_kc = bind_or_assign(
+                    self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+                )
+                self_attn.w_vc = bind_or_assign(
+                    self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+                )
+                if (
+                    hasattr(self_attn.kv_b_proj, "weight_scale")
+                    and self_attn.w_scale is None
+                ):
+                    self_attn.w_scale = bind_or_assign(
+                        self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+                    )
+                    if _is_hip:
+                        self_attn.w_scale *= 2.0
+                # TODO: remove this after adding FP8 support in bmm cpu kernel
+                if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
+                    self_attn.w_kc = (
+                        self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+                    )
+                    self_attn.w_vc = (
+                        self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+                    )
+            else:
+                num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+                num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+                ws_kc, ws_vc = block_scale.unflatten(
+                    0, (-1, (num_tiles_k + num_tiles_n))
+                ).split([num_tiles_k, num_tiles_n], dim=1)
+                self_attn.w_scale_k = bind_or_assign(
+                    self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+                )
+                self_attn.w_scale_v = bind_or_assign(
+                    self_attn.w_scale_v, ws_vc.contiguous()
+                )
+                self_attn.w_kc = bind_or_assign(
+                    self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+                )
+                self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+                self_attn.use_deep_gemm_bmm = True
+
+        if (
+            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+            and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+            and hasattr(self.quant_config, "weight_block_size")
+            and self.quant_config.weight_block_size is not None
+        ):
+            self._weight_requant_ue8m0(is_nextn)
+
+    def _weight_requant_ue8m0(self, is_nextn=False):
+        weight_block_size = self.quant_config.weight_block_size
+
+        moe_layers = list(
+            range(
+                self.config.first_k_dense_replace,
+                self.config.num_hidden_layers,
+                self.config.moe_layer_freq,
+            )
+        )
+
+        num_hidden_layers = 1 if is_nextn else self.config.num_hidden_layers
+
+        for layer_id in range(num_hidden_layers):
+            if is_nextn:
+                layer = self.model.decoder
+            else:
+                layer = self.model.layers[layer_id]
+
+            module_list = [
+                layer.self_attn.kv_b_proj,
+                layer.self_attn.o_proj,
+            ]
+
+            if self.config.q_lora_rank is not None:
+                module_list.append(layer.self_attn.fused_qkv_a_proj_with_mqa)
+                module_list.append(layer.self_attn.q_b_proj)
+            else:
+                module_list.append(layer.self_attn.kv_a_proj_with_mqa)
+                module_list.append(layer.self_attn.q_proj)
+
+            for module in module_list:
+                requant_weight_ue8m0_inplace(
+                    module.weight, module.weight_scale_inv, weight_block_size
+                )
+
+            if layer_id in moe_layers or is_nextn:
+                shared_experts = getattr(layer.mlp, "shared_experts", None)
+                if shared_experts is not None:
+                    for module in [
+                        shared_experts.gate_up_proj,
+                        shared_experts.down_proj,
+                    ]:
+                        requant_weight_ue8m0_inplace(
+                            module.weight, module.weight_scale_inv, weight_block_size
+                        )
+
+                experts = layer.mlp.experts
+                if isinstance(experts, DeepEPMoE):
+                    for w in [
+                        experts.w13_weight_fp8,
+                        experts.w2_weight_fp8,
+                    ]:
+                        requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
+            else:
+                mlp = layer.mlp
+                assert isinstance(mlp, DeepseekV2MLP)
+                for module in [
+                    mlp.gate_up_proj,
+                    mlp.down_proj,
+                ]:
+                    requant_weight_ue8m0_inplace(
+                        module.weight, module.weight_scale_inv, weight_block_size
+                    )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+        )
+        # Params for special naming rules in mixed-precision models, for example:
+        # model.layers.xx.mlp.experts.xx.w1.input_scale. For details,
+        # see https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/blob/main.
+        if self.quant_config and self.quant_config.get_name() == "w4afp8":
+            expert_params_mapping += FusedMoE.make_expert_input_scale_params_mapping(
+                num_experts=self.config.n_routed_experts
+            )
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
+        if self.num_fused_shared_experts > 0:
+            assert self.num_fused_shared_experts == 1
+            log_info_on_rank0(logger, "Shared experts fusion optimization enabled.")
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = []
+            params_dict = dict(self.named_parameters())
+            weight_names = []
+            for name, loaded_weight in weights:
+                layer_id = get_layer_id(name)
+                if (
+                    layer_id is not None
+                    and hasattr(self.model, "start_layer")
+                    and (
+                        layer_id < self.model.start_layer
+                        or layer_id >= self.model.end_layer
+                    )
+                ):
+                    continue
+                if self.num_fused_shared_experts > 0 and "mlp.shared_experts" in name:
+                    name = name.replace(
+                        "mlp.shared_experts",
+                        f"mlp.experts.{self.config.n_routed_experts}",
+                    )
+
+                weight_names.append(name)
+
+                if not is_nextn:
+                    if hasattr(self.config, "num_nextn_predict_layers"):
+                        num_nextn_layers = self.config.num_nextn_predict_layers
+                        if num_nextn_layers > 0 and name.startswith("model.layers"):
+                            name_list = name.split(".")
+                            if (
+                                len(name_list) >= 3
+                                and int(name_list[2]) >= self.config.num_hidden_layers
+                            ):
+                                continue
+                else:
+                    if not name.startswith(nextn_layer_prefix):
+                        continue
+
+                    # Use shared head and embed weights from target model
+                    if "shared_head.head" in name or "embed_tokens" in name:
+                        continue
+
+                    is_decoder = True
+                    # For nextn specific weights
+                    for weight_name in nextn_spec_weight_names:
+                        if weight_name in name:
+                            name = name.replace(nextn_layer_prefix, "model")
+                            is_decoder = False
+                            break
+                    # For decoder layer weights
+                    if is_decoder:
+                        name = name.replace(nextn_layer_prefix, "model.decoder")
+
+                if "rotary_emb.inv_freq" in name:
+                    continue
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    # Skip non-stacked layers and experts (experts handled below).
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    futures.append(
+                        executor.submit(weight_loader, param, loaded_weight, shard_id)
+                    )
+                    break
+                else:
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if weight_name not in name:
+                            continue
+                        name = name.replace(weight_name, param_name)
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        futures.append(
+                            executor.submit(
+                                weight_loader,
+                                param,
+                                loaded_weight,
+                                name,
+                                shard_id=shard_id,
+                                expert_id=expert_id,
+                            )
+                        )
+                        break
+                    else:
+                        # Skip loading extra bias for GPTQ models.
+                        if name.endswith(".bias") and name not in params_dict:
+                            continue
+                        # Skip loading embed_tokens if not first rank in pipeline parallelism
+                        if ".embed_tokens." in name and not self.pp_group.is_first_rank:
+                            continue
+                        # Skip loading norm if not last rank in pipeline parallelism
+                        if ".norm." in name and not self.pp_group.is_last_rank:
+                            continue
+                        if fuse_qkv_a_proj and (
+                            "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                        ):
+                            cached_a_proj[name] = loaded_weight
+                            q_a_proj_name = (
+                                name
+                                if "q_a_proj" in name
+                                else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                            )
+                            kv_a_proj_name = (
+                                name
+                                if "kv_a_proj_with_mqa" in name
+                                else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                            )
+
+                            # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                            if (
+                                q_a_proj_name in cached_a_proj
+                                and kv_a_proj_name in cached_a_proj
+                            ):
+                                q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                                kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                                cat_dim = 0
+                                if self.quant_config is not None and (
+                                    self.quant_config.get_name() == "awq"
+                                    or self.quant_config.get_name() == "awq_marlin"
+                                    or self.quant_config.get_name() == "moe_wna16"
+                                ):
+                                    cat_dim = 1
+                                fused_weight = torch.cat(
+                                    [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+                                )
+                                param_name = (
+                                    name.replace(
+                                        "q_a_proj", "fused_qkv_a_proj_with_mqa"
+                                    )
+                                    if "q_a_proj" in name
+                                    else name.replace(
+                                        "kv_a_proj_with_mqa",
+                                        "fused_qkv_a_proj_with_mqa",
+                                    )
+                                )
+                                param = params_dict[param_name]
+
+                                weight_loader = getattr(
+                                    param, "weight_loader", default_weight_loader
+                                )
+                                futures.append(
+                                    executor.submit(weight_loader, param, fused_weight)
+                                )
+                                cached_a_proj.pop(q_a_proj_name)
+                                cached_a_proj.pop(kv_a_proj_name)
+                        else:
+                            if (
+                                "k_scale" in name or "v_scale" in name
+                            ) and name not in params_dict:
+                                # modelopt attn kv scale is named differently
+                                for scale in ["k_scale", "v_scale"]:
+                                    if scale in name:
+                                        name = name.replace(
+                                            f"{scale[0]}_proj", "attn_mqa"
+                                        )
+                                        break
+                            if name not in params_dict:
+                                # modelopt ckpt contains not needed weights for MTP module:
+                                # model.decoder.self_attn.attn_mqa.v_scale and
+                                # model.decoder.self_attn.attn_mqa.k_scale
+                                logger.warning(f"{name} not found in params_dict.")
+                                continue
+                            param = params_dict[name]
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            futures.append(
+                                executor.submit(weight_loader, param, loaded_weight)
+                            )
+
+            # Wait for all tasks to complete and raise any exceptions.
+            for future in concurrent.futures.as_completed(futures):
+                future.result()
+
+        self.post_load_weights(is_nextn=is_nextn, weight_names=weight_names)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.n_routed_experts,
+            num_groups=config.n_group,
+        )
+
+
+class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
+    pass
+
+
+EntryClass = [DeepseekV2ForCausalLM, DeepseekV3ForCausalLM]
diff --git a/python/sglang/srt/models/deepseek_vl2.py b/python/sglang/srt/models/deepseek_vl2.py
new file mode 100644
index 000000000..3fba37008
--- /dev/null
+++ b/python/sglang/srt/models/deepseek_vl2.py
@@ -0,0 +1,362 @@
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch import nn
+
+from sglang.srt.configs.deepseekvl2 import (
+    DeepseekVL2Config,
+    DeepseekVL2MlpProjectorConfig,
+)
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek import DeepseekForCausalLM
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+
+
+class DeepseekVL2MlpProjector(nn.Module):
+    def __init__(
+        self,
+        config: DeepseekVL2MlpProjectorConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+
+        super().__init__()
+
+        self.config = config
+
+        if config.projector_type == "identity":
+            modules = nn.Identity()
+
+        elif config.projector_type == "linear":
+            self.layers = nn.ModuleList(
+                [
+                    ReplicatedLinear(
+                        config.input_dim,
+                        config.n_embed,
+                        quant_config=quant_config,
+                    )
+                ]
+            )
+
+        elif config.projector_type == "mlp_gelu":
+            mlp_depth = config.depth
+            self.layers = nn.ModuleList(
+                [
+                    ReplicatedLinear(
+                        config.input_dim,
+                        config.n_embed,
+                        quant_config=quant_config,
+                    )
+                ]
+            )
+            for _ in range(1, mlp_depth):
+                self.layers.append(nn.GELU())
+                self.layers.append(
+                    ReplicatedLinear(
+                        config.n_embed,
+                        config.n_embed,
+                        quant_config=quant_config,
+                    )
+                )
+
+        elif config.projector_type == "downsample_mlp_gelu":
+            mlp_depth = config.depth
+            mlp_ratio = config.mlp_ratio
+            self.layers = nn.ModuleList(
+                [
+                    ReplicatedLinear(
+                        config.input_dim
+                        * config.downsample_ratio
+                        * config.downsample_ratio,
+                        config.n_embed * mlp_ratio,
+                        quant_config=quant_config,
+                    )
+                ]
+            )
+            for _ in range(1, mlp_depth - 1):
+                self.layers.append(nn.GELU())
+                self.layers.append(
+                    ReplicatedLinear(
+                        config.n_embed * mlp_ratio,
+                        config.n_embed * mlp_ratio,
+                        quant_config=quant_config,
+                    )
+                )
+            self.layers.append(nn.GELU())
+            self.layers.append(
+                ReplicatedLinear(
+                    config.n_embed * mlp_ratio,
+                    config.n_embed,
+                    quant_config=quant_config,
+                )
+            )
+
+        else:
+            raise ValueError(f"Unknown projector type: {config.projector_type}")
+
+        if config.token_pooling:
+            self.token_pooling_layer = ReplicatedLinear(
+                config.input_dim * 4, config.input_dim, quant_config=quant_config
+            )
+
+    def forward(self, x):
+        if self.config.token_pooling:
+            batch_size, wxh, channels = x.shape
+            w = h = int(wxh**0.5)
+            x = x.view(batch_size, w, h, channels)
+            x = x.permute(0, 3, 1, 2)
+
+            patches = x.unfold(2, 2, 2).unfold(3, 2, 2)
+            batch_size, channels, h_patches, w_patches, _, _ = patches.size()
+            patches = patches.contiguous().view(
+                batch_size, channels, h_patches * w_patches, -1
+            )
+            patches = patches.permute(0, 2, 1, 3).contiguous()
+            patches = patches.view(batch_size, h_patches * w_patches, channels * 4)
+
+            x = self.token_pooling_layer(patches)[0]
+
+        elif self.config.projector_type == "downsample_mlp_gelu":
+            bs, hw, input_dim = x.shape
+            h = w = int((hw) ** 0.5)
+
+            """compute padding"""
+            if h % self.config.downsample_ratio:
+                pad = self.config.downsample_ratio - h % self.config.downsample_ratio
+            else:
+                pad = 0
+            x = x.reshape(bs, h, w, input_dim)
+            if pad > 0:
+                x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
+
+            """4 to 1 concat"""
+            x = x.permute(0, 3, 1, 2)  # B, C, H, W
+            x = F.unfold(
+                x,
+                kernel_size=self.config.downsample_ratio,
+                stride=self.config.downsample_ratio,
+                padding=0,
+            )  # B, C*4, HW // 4
+            x = x.permute(0, 2, 1)
+
+        for layer in self.layers:
+            x = layer(x)
+            if isinstance(x, tuple):
+                x = x[0]
+        return x
+
+
+class DeepseekVL2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: DeepseekVL2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+
+        # ----------- vision encoder ------------
+        vision_config = config.vision_config
+        self.vision = self._init_vision_module(vision_config, quant_config)
+
+        # ----------- vl projector ------------
+        projector_config = config.projector_config
+        self.projector = DeepseekVL2MlpProjector(projector_config, quant_config)
+
+        self.tile_tag = config.tile_tag
+        self.global_view_pos = config.global_view_pos
+
+        embed_std = 1 / torch.sqrt(
+            torch.tensor(projector_config.n_embed, dtype=torch.float32)
+        )
+        if self.tile_tag == "2D":
+            self.image_newline = nn.Parameter(
+                torch.randn(projector_config.n_embed) * embed_std
+            )
+            self.view_seperator = nn.Parameter(
+                torch.randn(projector_config.n_embed) * embed_std
+            )
+        else:
+            raise ValueError(f"tile tag should be 2D, but got {self.tile_tag}")
+
+        # ----------- language model ------------
+        language_config = config.language_config
+        if language_config.use_mla:
+            self.language_model = DeepseekV2ForCausalLM(language_config)
+        else:
+            # deepseek-vl2-tiny forbids mla
+            self.language_model = DeepseekForCausalLM(language_config)
+
+    def _init_vision_module(
+        self, vision_config, quant_config: Optional[QuantizationConfig]
+    ) -> nn.Module:
+        # TODO: refactor vision model through timm wrapper from transformers
+        try:
+            import timm
+        except ImportError:
+            raise ImportError("Please install timm") from ImportError
+
+        model = timm.create_model(
+            "vit_so400m_patch14_siglip_384.webli",
+            pretrained=False,
+            num_classes=0,
+            dynamic_img_size=True,
+            dynamic_img_pad=True,
+        )
+
+        model = model.to(dtype=torch.get_default_dtype())
+        return model
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ):
+        hs = general_mm_embed_routine(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.language_model,
+        )
+
+        return hs
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters())
+        weights = list(weights)
+        for name, loaded_weight in weights:
+            if "language" in name:
+                name = name.replace("language.", "")
+                self.language_model.load_weights([(name, loaded_weight)])
+            else:
+                param = params_dict[name]
+                weights_loader = getattr(param, "weight_loader", default_weight_loader)
+                weights_loader(param, loaded_weight)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+
+        images_spatial_crop = torch.cat(
+            [item.images_spatial_crop for item in items], dim=0
+        )
+
+        assert images_spatial_crop.dim() == 3
+
+        # TODO: can it be batched ?
+        images_in_this_batch = []
+        for item in items:
+            assert item.feature.dim() == 4
+            image_feature = self.vision.forward_features(
+                item.feature.type(next(self.vision.parameters()).dtype).to(
+                    device=next(self.vision.parameters()).device
+                )
+            )
+            images_embeds = self.projector(image_feature)
+            _, hw, n_dim = images_embeds.shape
+            h = w = int(hw**0.5)
+            tile_index = 0
+            for jdx in range(item.images_spatial_crop.shape[1]):
+                num_width_tiles, num_height_tiles = item.images_spatial_crop[0, jdx]
+                if num_width_tiles == 0 or num_height_tiles == 0:
+                    break
+                num_tiles_in_image = num_width_tiles * num_height_tiles
+
+                # [hw, D]
+                global_features = images_embeds[tile_index]
+
+                # [num_height_tiles * num_width_tiles, hw, D]
+                local_features = images_embeds[
+                    tile_index + 1 : tile_index + 1 + num_tiles_in_image
+                ]
+                tile_index += num_tiles_in_image + 1
+
+                # format global and local features
+                # ----------------- global view add newline -----------------
+                # [hw, D] -> [h, w, D]
+                global_features = global_features.view(h, w, n_dim)
+
+                # [D]     -> [h, 1, D]
+                new_lines_in_global = repeat(self.image_newline, "d -> h 1 d", h=h)
+
+                # cat([h, w, D], [h, 1, D], dim=1) -> [h, w + 1, D]
+                global_features = torch.cat(
+                    [global_features, new_lines_in_global], dim=1
+                )
+
+                # [h, w + 1, D] -> [h * (w + 1), D]
+                global_features = global_features.view(-1, n_dim)
+
+                # ----------------- local view add newline -----------------
+                # [num_height_tiles * num_width_tiles, h * w, D] ->
+                # [num_height_tiles * h, num_width_tiles * w, D]
+                local_features = rearrange(
+                    local_features,
+                    "(th tw) (h w) d -> (th h) (tw w) d",
+                    th=num_height_tiles,
+                    tw=num_width_tiles,
+                    h=h,
+                    w=w,
+                )
+
+                # [D] -> [num_height_tiles * h, 1, D]
+                new_lines_in_local = repeat(
+                    self.image_newline,
+                    "d -> (th h) 1 d",
+                    th=num_height_tiles,
+                    h=h,
+                )
+
+                # [num_height_tiles * h, num_width_tiles * w + 1, D]
+                local_features = torch.cat([local_features, new_lines_in_local], dim=1)
+
+                # [num_height_tiles * h, num_width_tiles * w + 1, D]
+                #   --> [(num_height_tiles * h) * (num_width_tiles * w + 1), D]
+                local_features = local_features.view(-1, n_dim)
+
+                # merge global and local tiles
+                if self.global_view_pos == "head":
+                    global_local_features = torch.cat(
+                        [
+                            global_features,
+                            self.view_seperator[None, :],
+                            local_features,
+                        ]
+                    )
+                else:
+                    global_local_features = torch.cat(
+                        [
+                            local_features,
+                            self.view_seperator[None, :],
+                            global_features,
+                        ]
+                    )
+
+                images_in_this_batch.append(global_local_features)
+
+        return torch.cat(images_in_this_batch, dim=0)
+
+
+EntryClass = DeepseekVL2ForCausalLM
diff --git a/python/sglang/srt/models/ernie4.py b/python/sglang/srt/models/ernie4.py
new file mode 100644
index 000000000..78a7b4b94
--- /dev/null
+++ b/python/sglang/srt/models/ernie4.py
@@ -0,0 +1,426 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+""" Inference-only Ernie4.5 model compatible with baidu/ERNIE-4.5-*-PT weights. """
+
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import (
+    Ernie4_5_MoeConfig,
+)
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.communicator import enable_moe_dense_fully_dp
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2MLP as Ernie4MLP
+from sglang.srt.models.llama import LlamaAttention as Ernie4Attention
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class MoEGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.weight = nn.Parameter(
+            torch.empty((config.moe_num_experts, config.hidden_size))
+        )
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty((1, config.moe_num_experts))
+        )
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states, self.weight, None)
+        return logits
+
+
+class Ernie4Moe(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", 0)
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = MoEGate(config=config, prefix=add_prefix("gate", prefix))
+
+        self.topk = TopK(
+            top_k=config.moe_k,
+            renormalize=True,
+            use_grouped_topk=False,
+            correction_bias=self.gate.e_score_correction_bias,
+        )
+
+        self.experts = get_moe_impl_class()(
+            num_experts=config.moe_num_experts,
+            top_k=config.moe_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        if self.moe_num_shared_experts > 0:
+            intermediate_size = (
+                config.moe_intermediate_size * config.moe_num_shared_experts
+            )
+            # disable tp for shared experts when enable deepep moe
+            self.shared_experts = Ernie4MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.forward_normal(hidden_states)
+
+    def forward_normal(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        shared_output = (
+            self.shared_experts(hidden_states)
+            if self.moe_num_shared_experts > 0
+            else None
+        )
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+
+class Ernie4DecoderLayer(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        is_mtp: bool = False,
+    ):
+        super().__init__()
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", False)
+        # Self attention.
+        self.self_attn = Ernie4Attention(
+            config=config,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=config.use_bias,
+        )
+        moe_layer_start_index = getattr(
+            config, "moe_layer_start_index", config.num_hidden_layers
+        )
+        moe_layer_end_index = getattr(
+            config, "moe_layer_end_index", config.num_hidden_layers - 1
+        )
+        # MLP
+        if (not is_mtp) and (
+            moe_layer_start_index <= layer_id <= moe_layer_end_index
+            and (layer_id - moe_layer_start_index) % config.moe_layer_interval == 0
+        ):
+            self.mlp = Ernie4Moe(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = Ernie4MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+class Ernie4Model(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Ernie4DecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            prefix="model.layers",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class Ernie4_5_ForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+    stacked_params_mapping = [
+        # (param_name, weight_name, shard_id)
+        (".qkv_proj", ".q_proj", "q"),
+        (".qkv_proj", ".k_proj", "k"),
+        (".qkv_proj", ".v_proj", "v"),
+        (".gate_up_proj", ".gate_proj", 0),
+        (".gate_up_proj", ".up_proj", 1),
+    ]
+
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config: Ernie4_5_MoeConfig = config
+        self.quant_config = quant_config
+        self.model = Ernie4Model(config, quant_config, add_prefix("model", prefix))
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix="lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    raise KeyError(f"Parameter '{name}' not found in model.")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+
+class Ernie4_5_MoeForCausalLM(Ernie4_5_ForCausalLM):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.moe_num_experts,
+        )
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.mtp_"):
+                continue
+            if "moe_statics.e_score_correction_bias" in name:
+                name = name.replace("moe_statics", "gate")
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    else:
+                        raise KeyError(
+                            f"Parameter '{name}'(replaced) not found in model."
+                        )
+                    break
+                else:
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        raise KeyError(f"Parameter '{name}' not found in model.")
+
+
+EntryClass = [Ernie4_5_MoeForCausalLM, Ernie4_5_ForCausalLM]
diff --git a/python/sglang/srt/models/ernie4_eagle.py b/python/sglang/srt/models/ernie4_eagle.py
new file mode 100644
index 000000000..bf62d515c
--- /dev/null
+++ b/python/sglang/srt/models/ernie4_eagle.py
@@ -0,0 +1,203 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+""" Ernie4.5 MTP model compatible with baidu/ERNIE-4.5-*-PT weights. """
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import (
+    Ernie4_5_MoeConfig,
+)
+
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.ernie4 import Ernie4_5_ForCausalLM, Ernie4DecoderLayer
+from sglang.srt.utils import add_prefix
+
+
+class Ernie4ModelMTP(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        layer_id: int,
+        prefix: str,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.mtp_emb_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mtp_hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mtp_linear_proj = nn.Linear(
+            config.hidden_size * 2, config.hidden_size, bias=config.use_bias
+        )
+        self.mtp_block = Ernie4DecoderLayer(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("mtp_block", prefix),
+            is_mtp=True,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        # masking inputs at position 0, as not needed by MTP
+        hidden_states[positions == 0] = 0
+
+        hidden_states = self.mtp_linear_proj(
+            torch.cat(
+                (
+                    self.mtp_emb_norm(hidden_states),
+                    self.mtp_hidden_norm(forward_batch.spec_info.hidden_states),
+                ),
+                dim=-1,
+            )
+        )
+        residual = None
+        hidden_states, residual = self.mtp_block(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            residual=residual,
+        )
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Ernie4_5_MoeForCausalLMMTP(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        mtp_layer_id: int = 0,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.mtp_layer_id = mtp_layer_id
+
+        self.model = Ernie4ModelMTP(
+            config=config,
+            layer_id=self.mtp_layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix="lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        mtp_layer_found = False
+        mtp_weight_patterns = [
+            f"mtp_block.{self.mtp_layer_id}",
+            f"mtp_emb_norm.{self.mtp_layer_id}",
+            f"mtp_hidden_norm.{self.mtp_layer_id}",
+            f"mtp_linear_proj.{self.mtp_layer_id}",
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            # Only name matched patterns should be loaded
+            for layer_pattern in mtp_weight_patterns:
+                if layer_pattern in name:
+                    mtp_layer_found = True
+                    break
+            else:
+                continue
+            # But strip mtp_layer_id before loading, because each MTP layer is a MTP model.
+            name = name.replace(f".{self.mtp_layer_id}.", ".")
+            for (
+                param_name,
+                weight_name,
+                shard_id,
+            ) in Ernie4_5_ForCausalLM.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    raise KeyError(f"Parameter '{name}' not found in MTP model.")
+        if not mtp_layer_found:
+            raise KeyError(
+                f"MTP layers 'mtp_*.{self.mtp_layer_id}.*' not found in weights."
+            )
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            del self.lm_head.weight
+            self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = [Ernie4_5_MoeForCausalLMMTP]
diff --git a/python/sglang/srt/models/exaone.py b/python/sglang/srt/models/exaone.py
new file mode 100644
index 000000000..1e4dfb3df
--- /dev/null
+++ b/python/sglang/srt/models/exaone.py
@@ -0,0 +1,377 @@
+# Copyright 2024 The LGcns AI Engineering Team
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from llama2.py
+"""Inference-only Exaone model compatible with HuggingFace weights."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class ExaoneGatedMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.c_proj(x)
+        return x
+
+
+class ExaoneAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 4096,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.rotary_dim = int(
+            self.head_dim * getattr(config, "partial_rotary_factor", 1)
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class ExaoneDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 500000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
+        self.self_attn = ExaoneAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = ExaoneGatedMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.activation_function,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        rms_norm_eps = config.layer_norm_epsilon
+        self.ln_1 = RMSNorm(config.hidden_size, eps=rms_norm_eps)
+        self.ln_2 = RMSNorm(config.hidden_size, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.ln_1(hidden_states)
+        else:
+            hidden_states, residual = self.ln_1(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.ln_2(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class ExaoneModel(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.h = nn.ModuleList(
+            [
+                ExaoneDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"h.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        rms_norm_eps = config.layer_norm_epsilon
+        self.ln_f = RMSNorm(config.hidden_size, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.wte(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.ln_f(hidden_states, residual)
+        return hidden_states
+
+
+class ExaoneForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.transformer = ExaoneModel(
+            config, quant_config=quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.transformer.wte
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.transformer(
+            input_ids, positions, forward_batch, input_embeds
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "c_fc_0", 0),
+            ("gate_up_proj", "c_fc_1", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            name = name.replace("attn.attention", "self_attn")
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = ExaoneForCausalLM
diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py
new file mode 100644
index 000000000..1ecb5011f
--- /dev/null
+++ b/python/sglang/srt/models/gemma.py
@@ -0,0 +1,410 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/gemma.py#L1
+"""Inference-only Gemma model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class GemmaMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = GeluAndMul("none")
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class GemmaAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        layer_id: int = 0,
+        max_position_embeddings: int = 8192,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=self.rope_theta,
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GemmaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = GemmaAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=config.head_dim,
+            layer_id=layer_id,
+            max_position_embeddings=config.max_position_embeddings,
+            rope_theta=config.rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = GemmaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class GemmaModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                GemmaDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Normalize the embedding by sqrt(hidden_size)
+        hidden_states *= self.config.hidden_size**0.5
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class GemmaForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = GemmaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+            # Normalize the embedding by sqrt(hidden_size)
+            forward_batch.hidden_states *= self.model.config.hidden_size**0.5
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+
+            # logits process
+            result = self.logits_processor(
+                input_ids,
+                forward_batch.hidden_states,
+                self.model.embed_tokens,
+                forward_batch,
+            )
+        else:
+            result = None
+
+        return result
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # GemmaRMSNorm is different from Llama's in that it multiplies
+                # (1 + weight) to the output, instead of just weight.
+                if "norm.weight" in name:
+                    loaded_weight += 1.0
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+
+EntryClass = GemmaForCausalLM
diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py
new file mode 100644
index 000000000..b9b4e4cec
--- /dev/null
+++ b/python/sglang/srt/models/gemma2.py
@@ -0,0 +1,480 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/56b325e977435af744f8b3dca7af0ca209663558/vllm/model_executor/models/gemma2.py
+
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+class Gemma2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        hidden_activation: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if not (hidden_act == hidden_activation == "gelu_pytorch_tanh"):
+            raise ValueError(
+                "Gemma2 uses `gelu_pytorch_tanh` as the hidden activation "
+                "function. Please set `hidden_act` and `hidden_activation` to "
+                "`gelu_pytorch_tanh`."
+            )
+        self.act_fn = GeluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Gemma2Attention(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        max_position_embeddings: int,
+        rope_theta: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = config.query_pre_attn_scalar**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=self.rope_theta,
+            is_neox_style=True,
+        )
+
+        use_sliding_window = layer_id % 2 == 0 and hasattr(config, "sliding_window")
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            logit_cap=self.config.attn_logit_softcapping,
+            sliding_window_size=(
+                get_attention_sliding_window_size(config)
+                if use_sliding_window
+                else None
+            ),
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        self.self_attn = Gemma2Attention(
+            layer_id=layer_id,
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=config.head_dim,
+            max_position_embeddings=config.max_position_embeddings,
+            rope_theta=config.rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.hidden_size = config.hidden_size
+        self.mlp = Gemma2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            hidden_activation=config.hidden_activation,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        hidden_states, residual = self.pre_feedforward_layernorm(
+            hidden_states, residual
+        )
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        return hidden_states, residual
+
+
+class Gemma2Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Gemma2DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # Normalize the embedding by sqrt(hidden_size)
+        # The normalizer's data type should be downcasted to the model's
+        # data type such as bfloat16, not float32.
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = self.config.hidden_size**0.5
+        self.register_buffer("normalizer", torch.tensor(normalizer))
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=torch.float16)
+        hidden_states *= normalizer
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class Gemma2ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Gemma2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+            # Normalize
+            normalizer = torch.tensor(
+                self.model.config.hidden_size**0.5, dtype=torch.float16
+            )
+            forward_batch.hidden_states *= normalizer
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+
+            # logits process
+            result = self.logits_processor(
+                input_ids,
+                forward_batch.hidden_states,
+                self.model.embed_tokens,
+                forward_batch,
+            )
+        else:
+            result = None
+
+        return result
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+
+EntryClass = Gemma2ForCausalLM
diff --git a/python/sglang/srt/models/gemma2_reward.py b/python/sglang/srt/models/gemma2_reward.py
new file mode 100644
index 000000000..03bea4d10
--- /dev/null
+++ b/python/sglang/srt/models/gemma2_reward.py
@@ -0,0 +1,70 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Gemma2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.gemma2 import Gemma2ForCausalLM, Gemma2Model
+from sglang.srt.utils import add_prefix
+
+
+class Gemma2ForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: Gemma2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.num_labels = config.num_labels
+        self.model = Gemma2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "Gemma2ForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.score(last_token_hidden)
+
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        Gemma2ForCausalLM.load_weights(self, weights)
+
+
+EntryClass = [Gemma2ForSequenceClassification]
diff --git a/python/sglang/srt/models/gemma3_causal.py b/python/sglang/srt/models/gemma3_causal.py
new file mode 100644
index 000000000..5b6145aff
--- /dev/null
+++ b/python/sglang/srt/models/gemma3_causal.py
@@ -0,0 +1,764 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import copy
+from typing import Iterable, Optional, Set, Tuple
+
+import einops
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import (
+    ROPE_INIT_FUNCTIONS,
+    AutoModel,
+    Gemma3TextConfig,
+    PretrainedConfig,
+    PreTrainedModel,
+)
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import Gemma3RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3.py
+def extract_layer_index(prefix: str) -> int:
+    """Extract the layer index from a prefix string."""
+    parts = prefix.split(".")
+    for part in parts:
+        if part.startswith("layers."):
+            layer_str = part.split(".")[-1]
+            try:
+                return int(layer_str)
+            except ValueError:
+                continue
+    return -1
+
+
+class Gemma3MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_activation: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_activation != "gelu_pytorch_tanh":
+            raise ValueError(
+                "Gemma3 uses `gelu_pytorch_tanh` as the hidden activation "
+                "function. Please set `hidden_activation` to "
+                "`gelu_pytorch_tanh`."
+            )
+        self.act_fn = GeluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Gemma3Attention(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: Gemma3TextConfig,
+        max_position_embeddings: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+
+        hidden_size = config.hidden_size
+
+        head_dim = getattr(
+            config, "head_dim", hidden_size // config.num_attention_heads
+        )
+        self.head_dim = head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = config.query_pre_attn_scalar**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.is_sliding = config.layer_types[layer_id] == "sliding_attention"
+
+        # Initialize the rotary embedding.
+        if self.is_sliding:
+            # Local attention. Override the values in config.json.
+            self.rope_theta = config.rope_local_base_freq
+            self.rope_scaling = {"rope_type": "default"}
+            # FIXME(mick): idk why vllm does this
+            # self.sliding_window = config.interleaved_sliding_window
+            self.sliding_window = get_attention_sliding_window_size(config)
+        else:
+            # Global attention. Use the values in config.json.
+            self.rope_theta = config.rope_theta
+            self.rope_scaling = config.rope_scaling
+            self.sliding_window = None
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            logit_cap=0.0,
+            # Module must also define `get_attention_sliding_window_size` to correctly initialize
+            # attention backend in `ForwardBatch`.
+            sliding_window_size=self.sliding_window,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Gemma3 adds normalization for q and k
+        self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+
+    def naive_attn_with_masks(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        out: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        q = q.view(-1, self.num_heads, self.head_dim)
+        # Expand the key and value to handle GQA.
+        num_queries_per_kv = self.num_heads // self.num_kv_heads
+        k = k.view(-1, self.num_kv_heads, self.head_dim)
+        k = k.repeat_interleave(num_queries_per_kv, dim=-2)
+        v = v.view(-1, self.num_kv_heads, self.head_dim)
+        v = v.repeat_interleave(num_queries_per_kv, dim=-2)
+
+        if self.is_sliding:
+            attn_masks = kwargs["local_attn_masks"]
+        else:
+            attn_masks = kwargs["global_attn_masks"]
+
+        seq_lens = kwargs["seq_lens"]
+        start_idx = 0
+        for seq_len, attn_mask in zip(seq_lens, attn_masks):
+            end_idx = start_idx + seq_len
+            query = q[start_idx:end_idx].unsqueeze(0)
+            key = k[start_idx:end_idx].unsqueeze(0)
+            value = v[start_idx:end_idx].unsqueeze(0)
+
+            # Transpose.
+            query = query.transpose(1, 2)
+            key = key.transpose(1, 2)
+            value = value.transpose(1, 2)
+
+            output = F.scaled_dot_product_attention(
+                query,
+                key,
+                value,
+                attn_mask,
+                self.scaling,
+            )
+            output = output.transpose(1, 2).flatten(-2, -1)
+            out[start_idx:end_idx] = output
+            start_idx = end_idx
+        return out
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        # [s, h * head_dim]
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # [s, h, head_dim]
+        q = q.unflatten(-1, (self.num_heads, self.head_dim))
+        # -> [h, s, head_dim]
+        q = q.transpose(0, 1).unsqueeze(0)
+        q = self.q_norm(q)
+        k = k.unflatten(-1, (self.num_kv_heads, self.head_dim))
+        # -> [h, s, head_dim]
+        k = k.transpose(0, 1).unsqueeze(0)
+        k = self.k_norm(k)
+
+        # q, k = self.rotary_emb(positions, q, k)
+        cos, sin = position_embeddings
+        q, k = apply_rotary_pos_emb(q, k, cos, sin)
+
+        # [b, h, s, head_dim] ->  [b, s, h, head_dim]
+        q = q.permute(0, 2, 1, 3)
+        k = k.permute(0, 2, 1, 3)
+
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+
+        # Compatible with triton backend which returns [1, s, h, head_dim]
+        if attn_output.dim() == 4 and attn_output.shape[0] == 1:
+            attn_output = attn_output.squeeze(0)
+            attn_output = attn_output.flatten(-2, -1)
+        # [s, h * head_dim]
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma3DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Gemma3Attention(
+            layer_id=layer_id,
+            config=config,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.hidden_size = config.hidden_size
+        self.mlp = Gemma3MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_activation=config.hidden_activation,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.is_sliding = self.self_attn.is_sliding
+        self.layer_id = layer_id
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        position_embeddings_global: torch.Tensor,
+        position_embeddings_local: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> tuple[
+        torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]
+    ]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # apply global RoPE to non-sliding layer only
+        if self.self_attn.is_sliding:
+            position_embeddings = position_embeddings_local
+        else:
+            position_embeddings = position_embeddings_global
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            forward_batch=forward_batch,
+            **kwargs,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        return outputs
+
+
+class Gemma3RotaryEmbedding(nn.Module):
+    def __init__(self, config: Gemma3TextConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type")
+            )
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len
+            )
+            self.register_buffer(
+                "inv_freq", inv_freq, persistent=False
+            )  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+
+        if (
+            seq_len < self.original_max_seq_len
+            and self.max_seq_len_cached > self.original_max_seq_len
+        ):  # reset
+            # This .to() is needed if the model has been moved to a device after being initialized (because
+            # the buffer is automatically moved, but not the original copy)
+            self.original_inv_freq = self.original_inv_freq.to(device)
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
+        inv_freq_expanded = (
+            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        )
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = (
+            device_type
+            if isinstance(device_type, str) and device_type != "mps"
+            else "cpu"
+        )
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (
+                inv_freq_expanded.float().to(x.device) @ position_ids_expanded.float()
+            ).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Gemma3TextScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: int,
+        embed_scale: Optional[float] = 1.0,
+    ):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.embed_scale = embed_scale
+
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale
+
+
+class Gemma3TextModel(PreTrainedModel):
+    def __init__(
+        self,
+        config: Gemma3TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # Gemma3 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5. See https://github.com/huggingface/transformers/pull/29402
+        self.embed_tokens = Gemma3TextScaledWordEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            self.padding_idx,
+            embed_scale=self.config.hidden_size**0.5,
+        )
+
+        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Gemma3RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # when we want to create a local RoPE layer. Config defaults should hold values for global RoPE
+        config = copy.deepcopy(config)
+        config.rope_theta = config.rope_local_base_freq
+        config.rope_scaling = {"rope_type": "default"}
+        self.rotary_emb_local = Gemma3RotaryEmbedding(config=config)
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Gemma3DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if positions.dim() == 1:
+            positions = einops.rearrange(positions, "s -> 1 s")
+
+        position_embeddings_global = self.rotary_emb(hidden_states, positions)
+        position_embeddings_local = self.rotary_emb_local(hidden_states, positions)
+        for layer in self.layers:
+            layer_outputs = layer(
+                positions=positions,
+                position_embeddings_global=position_embeddings_global,
+                position_embeddings_local=position_embeddings_local,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+        hidden_states = self.norm(hidden_states)
+
+        return hidden_states
+
+
+class Gemma3ForCausalLM(PreTrainedModel):
+    config_class = Gemma3TextConfig
+
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config_class = Gemma3TextConfig
+    base_model_prefix = "language_model"
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Gemma3TextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs,
+    ) -> LogitsProcessor:
+        hidden_states = self.model(
+            input_ids, positions, forward_batch, input_embeds, **kwargs
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+
+            if positions.dim() == 1:
+                positions = einops.rearrange(positions, "s -> 1 s")
+            position_embeddings_global = self.model.rotary_emb(hidden_states, positions)
+            position_embeddings_local = self.model.rotary_emb_local(
+                hidden_states, positions
+            )
+
+            forward_batch.hidden_states = hidden_states
+            forward_batch.model_specific_states = {
+                "positions": positions,
+                "position_embeddings_global": position_embeddings_global,
+                "position_embeddings_local": position_embeddings_local,
+            }
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            layer_output = layer(
+                positions=forward_batch.model_specific_states["positions"],
+                position_embeddings_global=forward_batch.model_specific_states[
+                    "position_embeddings_global"
+                ],
+                position_embeddings_local=forward_batch.model_specific_states[
+                    "position_embeddings_local"
+                ],
+                hidden_states=forward_batch.hidden_states,
+                forward_batch=forward_batch,
+            )
+            forward_batch.hidden_states = layer_output[0]
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states = self.model.norm(forward_batch.hidden_states)
+
+            # logits process
+            result = self.logits_processor(
+                input_ids,
+                forward_batch.hidden_states,
+                self.model.embed_tokens,
+                forward_batch,
+            )
+        else:
+            result = None
+
+        return result
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                # if param_name in name:
+                # print(f"{param_name} is already in {name}")
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        # unloaded_params = params_dict.keys() - loaded_params
+        # if unloaded_params:
+        #     logger.warning(
+        #         "Some weights are not initialized from checkpoints: %s", unloaded_params
+        #     )
+        return loaded_params
+
+
+EntryClass = Gemma3ForCausalLM
+AutoModel.register(Gemma3TextConfig, Gemma3ForCausalLM, exist_ok=True)
diff --git a/python/sglang/srt/models/gemma3_mm.py b/python/sglang/srt/models/gemma3_mm.py
new file mode 100644
index 000000000..527a11b69
--- /dev/null
+++ b/python/sglang/srt/models/gemma3_mm.py
@@ -0,0 +1,444 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3_mm.py
+
+import logging
+from functools import lru_cache
+from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict
+
+import torch
+from torch import nn
+from transformers import Gemma3Config, PreTrainedModel
+
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.layernorm import Gemma3RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    MultimodalDataItem,
+    MultimodalInputs,
+    flatten_nested_list,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.gemma3_causal import Gemma3ForCausalLM
+from sglang.srt.models.siglip import SiglipVisionModel
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Gemma3ImagePixelInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+
+
+class Gemma3MultiModalProjector(nn.Module):
+    """Projector for Gemma3 multimodal."""
+
+    def __init__(self, config: Gemma3Config):
+        super().__init__()
+
+        self.mm_input_projection_weight = nn.Parameter(
+            torch.zeros(
+                config.vision_config.hidden_size, config.text_config.hidden_size
+            )
+        )
+
+        self.mm_soft_emb_norm = Gemma3RMSNorm(
+            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
+        )
+
+        self.patches_per_image = int(
+            config.vision_config.image_size // config.vision_config.patch_size
+        )
+        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
+        self.kernel_size = self.patches_per_image // self.tokens_per_side
+        self.avg_pool = nn.AvgPool2d(
+            kernel_size=self.kernel_size, stride=self.kernel_size
+        )
+
+    def forward(self, vision_outputs: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_length, hidden_size = vision_outputs.shape
+
+        # Reshape for pooling
+        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
+        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
+            batch_size, hidden_size, self.patches_per_image, self.patches_per_image
+        )
+        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
+
+        # Apply pooling
+        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
+        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
+        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)
+
+        # Apply normalization
+        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
+
+        # Project to text embedding space
+        projected_vision_outputs = torch.matmul(
+            normed_vision_outputs, self.mm_input_projection_weight
+        )
+
+        return projected_vision_outputs.type_as(vision_outputs)
+
+
+class Gemma3ForConditionalGeneration(PreTrainedModel):
+    config_class = Gemma3Config
+    """Gemma3 multimodal model for conditional generation."""
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+        ".out_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+        "out_proj": ("proj", 0),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+
+        self.vision_tower = SiglipVisionModel(
+            config=config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_tower", prefix),
+        )
+
+        self.multi_modal_projector = Gemma3MultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Text model
+        self.language_model = Gemma3ForCausalLM(
+            config.text_config,
+            quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if self.language_model.logits_processor.logit_scale:
+            logit_scale = getattr(config, "logit_scale", 1.0)
+            self.language_model.logits_processor.logit_scale *= logit_scale
+        self.post_init()
+
+    def pad_input_ids(
+        self, input_ids: List[int], image_inputs: MultimodalInputs
+    ) -> List[int]:
+        """Pad input IDs with image tokens."""
+        # Get special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+        ids = pattern.pad_input_tokens(input_ids, image_inputs)
+        return ids
+
+    def prepare_attn_masks(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        mask_dtype: torch.dtype,
+        **kwargs,
+    ) -> Dict:
+        """Prepare attention masks for multimodal inputs."""
+        kwargs["has_images"] = True
+
+        # Distinguish sequences by position id 0
+        start_indices = (positions == 0).cpu().nonzero()
+        num_seqs = len(start_indices)
+        seq_lens = []
+
+        for i in range(num_seqs):
+            start_idx = start_indices[i].item()
+            if i < num_seqs - 1:
+                end_idx = start_indices[i + 1].item()
+            else:
+                end_idx = len(input_ids)
+            seq_lens.append(end_idx - start_idx)
+
+        kwargs["seq_lens"] = seq_lens
+
+        # Create attention masks
+        global_attn_masks = []
+        local_attn_masks = []
+        sliding_window = self.config.text_config.interleaved_sliding_window
+
+        start_idx = 0
+        for seq_len in seq_lens:
+            end_idx = start_idx + seq_len
+            input_token_ids = input_ids[start_idx:end_idx]
+            start_idx = end_idx
+
+            # Create global causal mask
+            global_attn_mask = torch.empty(
+                1,
+                1,
+                seq_len,
+                seq_len,
+                dtype=mask_dtype,
+                device=input_ids.device,
+            )
+            global_attn_mask.fill_(float("-inf"))
+            global_attn_mask = global_attn_mask.triu(diagonal=1)
+
+            # Consider bidirectional attention between image tokens
+            img_mask = torch.zeros_like(global_attn_mask)
+            img_pos = input_token_ids == self.config.image_token_index
+            img_mask[:, :, :, img_pos] += 1
+            img_mask[:, :, img_pos, :] += 1
+            global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask)
+            global_attn_masks.append(global_attn_mask)
+
+            # Create local causal mask with sliding window
+            local_attn_mask = torch.ones_like(global_attn_mask)
+            local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window)
+            local_attn_mask = torch.where(
+                local_attn_mask == 0, global_attn_mask, float("-inf")
+            )
+            local_attn_masks.append(local_attn_mask)
+
+        kwargs["global_attn_masks"] = global_attn_masks
+        kwargs["local_attn_masks"] = local_attn_masks
+        return kwargs
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.language_model.get_input_embeddings()
+
+    def get_attention_sliding_window_size(self):
+        """
+        This value is used to initialize attention backends in `ForwardBatch`.
+        """
+        return self.language_model.get_attention_sliding_window_size()
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        # Process images one by one to handle flatten_batch=True constraint in vision_tower
+        all_pixel_values = flatten_nested_list([item.feature for item in items])
+        vision_outputs_list = []
+
+        for pixel_values_batch in all_pixel_values:
+            # Normalize input shape to [batch_size, channels, height, width]
+            if pixel_values_batch.dim() == 5:
+                pixel_values_batch = pixel_values_batch.squeeze(0)
+            elif pixel_values_batch.dim() == 3:
+                pixel_values_batch = pixel_values_batch.unsqueeze(0)
+            elif pixel_values_batch.dim() != 4:
+                raise ValueError(
+                    f"Unexpected pixel_values shape: {pixel_values_batch.shape}"
+                )
+
+            # Process each image in the batch
+            batch_size = pixel_values_batch.shape[0]
+            for i in range(batch_size):
+                pixel_value = pixel_values_batch[i : i + 1]  # Keep batch dimension as 1
+                pixel_value = pixel_value.to(
+                    device=self.vision_tower.device, dtype=self.language_model.dtype()
+                )
+                vision_output = self.vision_tower(pixel_values=pixel_value)
+                vision_outputs_list.append(vision_output)
+
+        # Concatenate all vision outputs
+        vision_outputs = torch.cat(vision_outputs_list, dim=0)
+        image_features = self.multi_modal_projector(vision_outputs)
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs: object,
+    ) -> LogitsProcessor:
+        r"""
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+            logits_to_keep (`int` or `torch.Tensor`, *optional*):
+                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+                This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf")
+        >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf")
+
+        >>> prompt = "answer en Where is the cow standing?"
+        >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "answer en Where is the cow standing?\nbeach"
+        ```"""
+
+        # Important: position_ids in Gemma3 are 1-indexed
+        # This really does cost me sometime
+        positions += 1
+
+        # Replace image id with PAD if the image token if OOV, to avoid index-errors
+        if input_ids is not None and self.config.image_token_index >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_index
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+
+        hs = general_mm_embed_routine(
+            input_ids=llm_input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        return hs
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        """Load weights for the model."""
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "language_model" in name:
+                # Gemma3ForCausalLM.load_weights(self, [(name.replace("language_model.", ""), loaded_weight)])
+                causal_loaded_params = Gemma3ForCausalLM.load_weights(
+                    self, [(name, loaded_weight)]
+                )
+                loaded_params.update(causal_loaded_params)
+                continue
+            else:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id)
+                    break
+                else:
+                    if "vision_model" in name:
+                        # adapt to VisionAttention
+                        name = name.replace(".self_attn.out_proj", ".self_attn.proj")
+                    # Skip loading extra bias for GPTQ models
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # Remapping the name of FP8 kv-scale
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            pass
+            # raise RuntimeError(
+            #     f"Some weights are not initialized from checkpoints: {unloaded_params}")
+        return loaded_params
+
+
+EntryClass = Gemma3ForConditionalGeneration
diff --git a/python/sglang/srt/models/gemma3n_audio.py b/python/sglang/srt/models/gemma3n_audio.py
new file mode 100644
index 000000000..57a743c9f
--- /dev/null
+++ b/python/sglang/srt/models/gemma3n_audio.py
@@ -0,0 +1,949 @@
+import math
+from typing import Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import Gemma3nAudioConfig, PreTrainedModel
+
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.models.gemma3n_causal import Gemma3nRMSNorm
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class Gemma3nCumulativeGroupNorm(nn.Module):
+    """Applies Group Normalization cumulatively over the time dimension.
+
+    This layer normalizes the input by calculating the mean and variance
+    cumulatively over the time dimension (dim 1). The statistics are computed
+    over all feature dimensions (specified by `feature_dims` and `num_channels`)
+    for elements marked as valid by the optional `mask`.
+
+    If a `mask` is provided (True for valid, False for invalid/padded),
+    invalid time steps do not contribute to the statistics calculation, and
+    their corresponding output values are zeroed out.
+
+    Scale and bias, if enabled, are applied per-channel (last dimension).
+    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
+    and `cumulative=True`.
+    """
+
+    def __init__(
+        self,
+        num_channels: int,  # Number of channels (size of the last dimension)
+        feature_dims: Sequence[
+            int
+        ],  # Sizes of non-channel feature dimensions, e.g., (H, W) for input [B,T,H,W,C]
+        eps: float = 1e-3,
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.feature_dims = tuple(feature_dims)
+        self.eps = eps
+
+        # Scale parameter depends only on the channel dimension
+        self.weight = nn.Parameter(torch.ones(num_channels))
+
+        # Axes for normalization: all dimensions except Batch (0) and Time (1).
+        # For input [B, T, *feature_dims, C], these are dims from 2 onwards.
+        self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1))
+
+    def forward(
+        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Applies cumulative group norm, optionally using a mask.
+
+        Args:
+          x: Input tensor, shape [B, T, *feature_dims, C].
+          mask: Optional boolean mask, shape [B, T]. True indicates a valid
+            (non-padded) time step. If None, all time steps are considered valid.
+
+        Returns:
+          Normalized tensor with the same shape as x.
+        """
+        expected_input_suffix = self.feature_dims + (self.num_channels,)
+        if x.shape[2:] != expected_input_suffix:
+            raise ValueError(
+                f"Input tensor shape suffix {x.shape[2:]} does not match expected"
+                f" suffix (feature_dims + num_channels) {expected_input_suffix}"
+            )
+
+        input_dtype = x.dtype
+        # Calculations are performed in float32 for numerical stability.
+        calc_dtype = torch.float32
+        x_calc = x.to(calc_dtype)
+
+        # Prepare a broadcastable mask (`mask_calc`).
+        # If no mask is provided, treat all elements as valid
+        # (mask_calc is all ones).
+        # Otherwise, expand the [B, T] mask to [B, T, 1, ..., 1] for broadcasting.
+        mask_calc = torch.ones_like(x_calc, dtype=calc_dtype)
+
+        # Cumulative Statistics Calculation
+        # 1. Sum of values over reduction axes at each time step.
+        sum_values_at_t = torch.sum(x_calc, dim=self.reduction_axes, keepdim=True)
+        # 2. Cumulative sum of values over time.
+        cum_sum_values = torch.cumsum(sum_values_at_t, dim=1)
+
+        # 3. Count of valid elements in the normalization group at each time step.
+        #    (A "group" here consists of all features at a given Batch, Time).
+        elements_in_group_at_t = torch.sum(
+            mask_calc, dim=self.reduction_axes, keepdim=True
+        )
+        # 4. Cumulative count of valid elements over time.
+        cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1)
+        # Avoid division by zero if all preceding elements were masked.
+        safe_cum_count_elements = torch.clamp(cum_count_elements, min=1.0)
+
+        # 5. Cumulative mean.
+        cum_mean = cum_sum_values / safe_cum_count_elements
+
+        # 6. Sum of squared differences from the cumulative mean.
+        #    Only sum for valid elements: (x_calc - cum_mean)^2 * mask_calc.
+        #    Using x_calc here for the difference, as cum_mean already accounts for masking.
+        squared_diff_from_mean = (x_calc - cum_mean).pow(2)
+        sum_sq_diff_at_t = torch.sum(
+            squared_diff_from_mean, dim=self.reduction_axes, keepdim=True
+        )
+
+        # 7. Cumulative sum of squared differences over time.
+        cum_sum_sq_diff = torch.cumsum(sum_sq_diff_at_t, dim=1)
+
+        # 8. Cumulative variance.
+        cum_variance = cum_sum_sq_diff / safe_cum_count_elements
+
+        # Normalize the input using the calculated cumulative statistics:
+        # (x - E[x]) / sqrt(Var[x] + eps)
+        normalized_x = (x_calc - cum_mean) * torch.rsqrt(cum_variance + self.eps)
+
+        # Apply affine transformation (scale and bias) if enabled.
+        # Scale and bias are applied per-channel (last dimension).
+        scale = self.weight.to(calc_dtype)
+        # Reshape for broadcasting: [C] -> [1, ..., 1, C]
+        scale_view_shape = [1] * (x.dim() - 1) + [self.num_channels]
+        normalized_x = normalized_x * scale.view(scale_view_shape)
+
+        # Zero out outputs for time steps that were originally masked (where mask_calc is 0).
+        # This ensures padded/invalid positions in the input result in zero output.
+        final_output = normalized_x * mask_calc
+
+        return final_output.to(input_dtype)
+
+
+class Gemma3nAudioRelativePositionEmbedding(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.channels = self.config.hidden_size
+        self.head_dim = self.channels // self.num_heads
+        self.max_backward = max(0, self.config.conf_attention_context_left - 1)
+        self.max_forward = self.config.conf_attention_context_right
+
+        self.pos_proj = ColumnParallelLinear(
+            self.channels,
+            self.num_heads * self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("pos_proj", prefix),
+        )
+
+        min_timescale = 1.0
+        max_timescale = 1.0e4
+        num_timescales = self.channels // 2
+        log_timescale_increment = math.log(
+            float(max_timescale) / float(min_timescale)
+        ) / max(num_timescales - 1, 1)
+        inv_timescales = min_timescale * torch.exp(
+            torch.arange(num_timescales) * -log_timescale_increment
+        )
+        self.register_buffer(
+            "inv_timescales",
+            inv_timescales.float().unsqueeze(0).unsqueeze(0),
+            persistent=False,
+        )
+
+    def _get_timing_signal_1d_pos(
+        self, position: torch.Tensor, dtype: torch.dtype
+    ) -> torch.Tensor:
+        assert position.ndim == 2
+        position = position.float().unsqueeze(-1)
+        scaled_time = position * self.inv_timescales.to(
+            device=position.device, dtype=torch.float32
+        )
+        timing_signal = torch.cat(
+            [torch.sin(scaled_time), torch.cos(scaled_time)], dim=-1
+        )
+        return timing_signal.type(dtype)
+
+    def _relative_shift(
+        self,
+        term_bd_before_shift: torch.Tensor,
+        batch_size: int,
+        num_heads: int,
+        num_query_blocks: int,
+        query_block_size: int,
+        key_context_size: int,
+        max_span_plus_1: int,
+    ) -> torch.Tensor:
+        """Performs the relative shift."""
+        pad_amount_last_dim = (key_context_size + 1) - max_span_plus_1
+        padding_tuple = (0, pad_amount_last_dim)
+
+        term_bd_padded = F.pad(term_bd_before_shift, padding_tuple)
+        term_bd_reshaped = term_bd_padded.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size * (key_context_size + 1),
+            )
+        )
+        term_bd_sliced = term_bd_reshaped[
+            :, :, :, : query_block_size * key_context_size
+        ]
+        term_bd_shifted = term_bd_sliced.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size,
+                key_context_size,
+            )
+        )
+        return term_bd_shifted
+
+    def forward(self, queries: torch.Tensor, keys: torch.Tensor) -> torch.Tensor:
+        batch_size, num_query_blocks, query_block_size, num_heads, head_dim = (
+            queries.shape
+        )
+        _, _, key_context_size, _, _ = keys.shape
+
+        pos_indices = torch.arange(
+            self.max_backward, -self.max_forward - 1, -1, device=queries.device
+        ).unsqueeze(0)
+        max_span_plus_1 = pos_indices.shape[1]
+
+        sin_emb_timing_signal = self._get_timing_signal_1d_pos(
+            pos_indices, dtype=queries.dtype
+        )
+        projected_sin_emb, _ = self.pos_proj(sin_emb_timing_signal)
+        sin_emb = projected_sin_emb.reshape(
+            1, max_span_plus_1, self.num_heads, self.head_dim
+        ).squeeze(0)
+
+        queries_p = queries.permute(0, 3, 1, 2, 4)
+        keys_p_t = keys.permute(0, 3, 1, 4, 2)
+        term_ac = torch.matmul(queries_p, keys_p_t)
+
+        q_permuted = queries.permute(0, 3, 1, 2, 4)
+        s_permuted = sin_emb.permute(1, 2, 0)
+        q_reshaped = q_permuted.reshape(
+            batch_size, num_heads, num_query_blocks * query_block_size, head_dim
+        )
+        term_bd_unshifed_matmul = torch.matmul(q_reshaped, s_permuted)
+        term_bd_unshifed = term_bd_unshifed_matmul.reshape(
+            batch_size,
+            num_heads,
+            num_query_blocks,
+            query_block_size,
+            max_span_plus_1,
+        )
+
+        term_bd_shifted = self._relative_shift(
+            term_bd_unshifed,
+            batch_size,
+            num_heads,
+            num_query_blocks,
+            query_block_size,
+            key_context_size,
+            max_span_plus_1,
+        )
+
+        return term_ac + term_bd_shifted
+
+
+class Gemma3nAudioAttention(nn.Module):
+    """Local dot product self-attention for audio."""
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.hidden_size = self.config.hidden_size
+        self.head_dim = self.hidden_size // self.num_heads
+
+        self.chunk_size = self.config.conf_attention_chunk_size
+        self.max_future_horizon = self.config.conf_attention_context_right
+        self.max_past_horizon = max(0, self.config.conf_attention_context_left - 1)
+        self.attention_logits_soft_cap = self.config.conf_attention_logit_cap
+        self.context_size = (
+            self.chunk_size + self.max_past_horizon + self.max_future_horizon
+        )
+
+        self.relative_position_embedding = Gemma3nAudioRelativePositionEmbedding(
+            config,
+            quant_config,
+            prefix=add_prefix("relative_position_embedding", prefix),
+        )
+        self.per_dim_scale = nn.Parameter(torch.zeros((self.head_dim,)))
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        q_scale = self.head_dim**-0.5
+        r_softplus_0 = 1.0 / F.softplus(torch.tensor(0.0))
+        self.register_buffer(
+            "q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False
+        )
+
+        # Create local causal mask
+        lower_causal_mask = torch.tril(
+            torch.ones((self.context_size, self.chunk_size), dtype=torch.bool),
+            diagonal=0,
+        ).T
+        upper_causal_mask = torch.tril(
+            torch.ones((self.chunk_size, self.context_size), dtype=torch.bool),
+            diagonal=self.max_past_horizon + self.max_future_horizon,
+        )
+        local_causal_valid_mask = torch.ones(
+            (self.chunk_size, self.context_size), dtype=torch.bool
+        )
+        local_causal_valid_mask = (
+            local_causal_valid_mask * lower_causal_mask * upper_causal_mask
+        )
+        self.register_buffer(
+            "local_causal_valid_mask", local_causal_valid_mask, persistent=False
+        )
+
+        self.register_buffer(
+            "softcap",
+            torch.tensor(self.attention_logits_soft_cap).float(),
+            persistent=False,
+        )
+
+    def _pad_dim1(
+        self, x: torch.Tensor, dim10_val: int, dim11_val: int
+    ) -> torch.Tensor:
+        padding_tuple = [0] * x.ndim * 2
+        dim_idx_from_end = x.ndim - 2
+        start_idx_for_dim = 2 * dim_idx_from_end
+        padding_tuple[start_idx_for_dim] = dim10_val
+        padding_tuple[start_idx_for_dim + 1] = dim11_val
+        return F.pad(x, tuple(padding_tuple))
+
+    def _convert_to_block(self, x: torch.Tensor) -> torch.Tensor:
+        """Turns a sequence to non overlapping blocks."""
+        shape = x.shape
+        b, t = shape[:2]
+        num_blocks = (t + self.chunk_size - 1) // self.chunk_size
+
+        if (padding_len := num_blocks * self.chunk_size - t) > 0:
+            x = self._pad_dim1(x, 0, padding_len)
+
+        permute_dims = (b, num_blocks, self.chunk_size) + shape[2:]
+        x = x.reshape(permute_dims).contiguous()
+        return x
+
+    def _extract_block_context(self, x: torch.Tensor) -> torch.Tensor:
+        """Extracts temporal context for every block."""
+        pad_left = self.max_past_horizon
+        pad_right = self.max_future_horizon + self.chunk_size - 1
+        x = self._pad_dim1(x, pad_left, pad_right)
+
+        frame_len = self.context_size
+        frame_step = self.chunk_size
+
+        x_unfolded = x.unfold(dimension=1, size=frame_len, step=frame_step)
+
+        if x.ndim > 2 and x_unfolded.ndim > 3:
+            x_unfolded = torch.movedim(x_unfolded, source=-1, destination=2)
+
+        return x_unfolded.contiguous()
+
+    def forward(self, x: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
+        # Project to Q, K, V
+        qkv, _ = self.qkv_proj(x)
+        query_states, key_states, value_states = qkv.chunk(chunks=3, dim=-1)
+
+        # Reshape
+        query_states = query_states.reshape(
+            *x.shape[:-1], self.num_heads, self.head_dim
+        ).contiguous()
+        key_states = key_states.reshape(
+            *x.shape[:-1], self.num_heads, self.head_dim
+        ).contiguous()
+        value_states = value_states.reshape(
+            *x.shape[:-1], self.num_heads, self.head_dim
+        ).contiguous()
+
+        # Apply per-dim scale
+        per_dim_scale_sp = F.softplus(self.per_dim_scale)
+        broadcast_shape = (1, 1, 1, self.head_dim)
+        per_dim_scale_sp_broadcast = per_dim_scale_sp.view(broadcast_shape)
+        query_states = query_states * self.q_scale * per_dim_scale_sp_broadcast
+
+        batch_size, q_time = query_states.shape[:2]
+
+        # Convert to blocks
+        query_blocks = self._convert_to_block(query_states)
+        key_blocks = self._extract_block_context(key_states)
+        value_blocks = self._extract_block_context(value_states)
+        num_query_blocks = query_blocks.shape[1]
+
+        # Create mask for valid positions
+        original_valid_mask = ~mask
+        extracted_valid_mask_blocks = self._extract_block_context(original_valid_mask)
+
+        if (
+            extracted_valid_mask_blocks.ndim == 4
+            and extracted_valid_mask_blocks.shape[0] == batch_size
+            and extracted_valid_mask_blocks.shape[1] == num_query_blocks
+            and extracted_valid_mask_blocks.shape[2]
+            * extracted_valid_mask_blocks.shape[3]
+            == self.context_size
+        ):
+            extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape(
+                batch_size, num_query_blocks, self.context_size
+            )
+
+        condition_from_input_validity = extracted_valid_mask_blocks.unsqueeze(
+            1
+        ).unsqueeze(-2)
+        condition_from_causality = (
+            self.local_causal_valid_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+        )
+
+        final_condition_for_where = torch.logical_and(
+            condition_from_input_validity,
+            condition_from_causality.to(condition_from_input_validity.device),
+        )
+
+        # Compute attention scores
+        logits = self.relative_position_embedding(query_blocks, key_blocks)
+
+        # Apply attention logit softcap
+        softcap_val = self.softcap.to(logits.device)
+        logits = logits / softcap_val
+        logits = torch.tanh(logits)
+        logits = logits * softcap_val
+
+        # Apply the combined mask.
+        # final_condition_for_where will broadcast with logits [B,N,U,W,C]
+        logits = torch.where(
+            final_condition_for_where, logits, torch.finfo(logits.dtype).min
+        )
+
+        probabilities = F.softmax(logits, dim=-1, dtype=torch.float32).to(
+            dtype=value_blocks.dtype
+        )
+
+        # context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...)
+        b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape
+        h_dim = value_blocks.shape[-1]
+        prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim)
+        v_bun = value_blocks.permute(0, 1, 3, 2, 4).reshape(-1, c_dim, h_dim)
+        result_bmm = torch.bmm(prob_bun, v_bun)
+        context_vectors = result_bmm.reshape(b_dim, u_dim, n_dim, w_dim, h_dim).permute(
+            0, 1, 3, 2, 4
+        )
+        context_vectors = context_vectors.reshape(
+            (
+                batch_size,
+                num_query_blocks * self.chunk_size,
+                self.num_heads,
+                self.head_dim,
+            )
+        )
+        context_vectors = context_vectors[:, :q_time]
+
+        return context_vectors
+
+
+class Gemma3nAudioSSCPConvBlock(nn.Module):
+    """A single convolution block for the SubSampleConvProjection."""
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        idx: int,
+        input_freq_dim: int,
+        manual_padding: Tuple[int, int, int, int] = (0, 0, 0, 0),
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.manual_padding = manual_padding
+
+        in_channels = 1 if idx == 0 else self.config.sscp_conv_channel_size[idx - 1]
+        out_channels = self.config.sscp_conv_channel_size[idx]
+        kernel_h, kernel_w = self.config.sscp_conv_kernel_size[idx]
+        stride_h, stride_w = self.config.sscp_conv_stride_size[idx]
+
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(kernel_h, kernel_w),
+            stride=(stride_h, stride_w),
+            padding=(0, 0),  # Manual padding is used
+            bias=False,
+        )
+
+        f_in_padded = input_freq_dim + self.manual_padding[0] + self.manual_padding[1]
+        f_out_conv = (f_in_padded - kernel_w) // stride_w + 1
+
+        self.norm = Gemma3nCumulativeGroupNorm(
+            num_channels=out_channels,
+            feature_dims=(f_out_conv,),
+            eps=self.config.sscp_conv_group_norm_eps,
+        )
+
+        self.activation = nn.ReLU()
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        audio_encodings_padded = F.pad(
+            audio_encodings, self.manual_padding, mode="constant", value=0.0
+        )
+        audio_encodings_conv = self.conv(audio_encodings_padded)
+        x_for_norm = audio_encodings_conv.permute(0, 2, 3, 1).contiguous()
+        x_normed = self.norm(x_for_norm)
+        audio_encodings_normed = x_normed.permute(0, 3, 1, 2).contiguous()
+        return self.activation(audio_encodings_normed)
+
+
+class Gemma3nAudioSubSampleConvProjection(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        current_f_for_block_input = config.input_feat_size
+        calculated_block_padding = []
+        calculated_f_out_dims = []
+
+        for i in range(2):  # Assuming 2 conv layers
+            kernel_h, kernel_w = config.sscp_conv_kernel_size[i]
+            stride_h, stride_w = config.sscp_conv_stride_size[i]
+
+            # Padding for Time (Height for Conv2d) - REVERSE_CAUSAL like
+            pad_t_top = 0
+            pad_t_bottom = kernel_h - 1
+
+            # Frequency Padding (Width for Conv2d)
+            pad_f_left = 1
+            pad_f_right = 1
+
+            manual_padding_tuple = (pad_f_left, pad_f_right, pad_t_top, pad_t_bottom)
+            calculated_block_padding.append(manual_padding_tuple)
+
+            f_in_padded = current_f_for_block_input + pad_f_left + pad_f_right
+            f_out_after_conv = (f_in_padded - kernel_w) // stride_w + 1
+            calculated_f_out_dims.append(f_out_after_conv)
+            current_f_for_block_input = f_out_after_conv
+
+        self.conv_0 = Gemma3nAudioSSCPConvBlock(
+            idx=0,
+            input_freq_dim=config.input_feat_size,
+            config=config,
+            manual_padding=calculated_block_padding[0],
+            quant_config=quant_config,
+            prefix=add_prefix("conv_0", prefix),
+        )
+        self.conv_1 = Gemma3nAudioSSCPConvBlock(
+            idx=1,
+            input_freq_dim=calculated_f_out_dims[0],
+            config=config,
+            manual_padding=calculated_block_padding[1],
+            quant_config=quant_config,
+            prefix=add_prefix("conv_1", prefix),
+        )
+
+        final_c_out = config.sscp_conv_channel_size[-1]
+        final_f_out = calculated_f_out_dims[-1]
+        self.input_proj_in_features = final_c_out * final_f_out
+
+        self.input_proj_linear = RowParallelLinear(
+            self.input_proj_in_features,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("input_proj_linear", prefix),
+        )
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        audio_encodings_reshaped = audio_encodings.unsqueeze(1)
+        x = self.conv_0(audio_encodings_reshaped)
+        x = self.conv_1(x)
+        b, c_out, t_out, f_out = x.shape
+        x_permuted = x.permute(0, 2, 3, 1).contiguous()
+        output_flattened = x_permuted.view(b, t_out, f_out * c_out)
+        output, _ = self.input_proj_linear(output_flattened)
+        return output
+
+
+class Gemma3nAudioConformerAttention(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        head_dim = self.config.hidden_size // self.config.conf_num_attention_heads
+        self.post_in_shape = (self.config.conf_num_attention_heads, head_dim)
+        self.post_in_features = self.config.hidden_size
+
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+
+        self.pre_attn_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.attn = Gemma3nAudioAttention(
+            config, quant_config, prefix=add_prefix("attn", prefix)
+        )
+        self.post = RowParallelLinear(
+            self.post_in_features,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("post", prefix),
+        )
+        self.post_norm = Gemma3nRMSNorm(self.config.hidden_size)
+
+    def forward(
+        self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> torch.Tensor:
+        audio_encodings_input_to_attn = audio_encodings
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings_norm = self.pre_attn_norm(audio_encodings)
+        audio_encodings_attn_out = self.attn(audio_encodings_norm, audio_mel_mask)
+
+        b, t, num_heads, head_dim = audio_encodings_attn_out.shape
+        audio_encodings_reshaped = audio_encodings_attn_out.reshape(
+            b, t, num_heads * head_dim
+        )
+
+        audio_encodings, _ = self.post(audio_encodings_reshaped)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        return audio_encodings_input_to_attn + self.post_norm(audio_encodings)
+
+
+class Gemma3nAudioConformerFeedForward(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+
+        self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.ffw_layer_1 = ColumnParallelLinear(
+            self.config.hidden_size,
+            self.config.hidden_size * 4,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("ffw_layer_1", prefix),
+        )
+        self.ffw_layer_2 = RowParallelLinear(
+            self.config.hidden_size * 4,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("ffw_layer_2", prefix),
+        )
+        self.post_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.post_layer_scale = torch.tensor(self.config.conf_residual_weight)
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        residual = audio_encodings
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings = self.pre_layer_norm(audio_encodings)
+        audio_encodings, _ = self.ffw_layer_1(audio_encodings)
+        audio_encodings = F.silu(audio_encodings)
+        audio_encodings, _ = self.ffw_layer_2(audio_encodings)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings = self.post_layer_norm(audio_encodings)
+        return residual + (audio_encodings * self.post_layer_scale)
+
+
+class Gemma3nAudioConformerLightConv1d(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.pre_layer_norm = Gemma3nRMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
+        self.linear_start = ColumnParallelLinear(
+            self.config.hidden_size,
+            self.config.hidden_size * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_start", prefix),
+        )
+
+        self.depthwise_conv1d = nn.Conv1d(
+            in_channels=self.config.hidden_size,
+            out_channels=self.config.hidden_size,
+            kernel_size=self.config.conf_conv_kernel_size,
+            stride=1,
+            padding=0,  # Manual causal padding
+            groups=self.config.hidden_size,  # Depthwise
+            bias=False,
+        )
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+        self.conv_norm = Gemma3nRMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
+        self.linear_end = RowParallelLinear(
+            self.config.hidden_size,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_end", prefix),
+        )
+
+        self.causal_padding = self.config.conf_conv_kernel_size - 1
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        audio_encodings_residual = audio_encodings  # Save for residual connection
+
+        audio_encodings = self.pre_layer_norm(audio_encodings)
+        audio_encodings, _ = self.linear_start(audio_encodings)
+        audio_encodings = F.glu(audio_encodings, dim=-1)
+
+        # Permute for Conv1d: [B, T, D] -> [B, D, T]
+        audio_encodings_permuted = audio_encodings.permute(0, 2, 1)
+        # Apply manual causal padding
+        audio_encodings_permuted_padded = F.pad(
+            audio_encodings_permuted, (self.causal_padding, 0)
+        )
+        audio_encodings = self.depthwise_conv1d(audio_encodings_permuted_padded)
+        # Permute back: [B, D, T_out] -> [B, T_out, D]
+        audio_encodings = audio_encodings.permute(0, 2, 1)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings = self.conv_norm(audio_encodings)
+        audio_encodings = F.silu(audio_encodings)
+        audio_encodings, _ = self.linear_end(audio_encodings)
+        output = audio_encodings + audio_encodings_residual
+        return output
+
+
+class Gemma3nAudioConformerBlock(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.ffw_layer_start = Gemma3nAudioConformerFeedForward(
+            config, quant_config, prefix=add_prefix("ffw_layer_start", prefix)
+        )
+        self.attention = Gemma3nAudioConformerAttention(
+            config, quant_config, prefix=add_prefix("attention", prefix)
+        )
+        self.lconv1d = Gemma3nAudioConformerLightConv1d(
+            config, quant_config, prefix=add_prefix("lconv1d", prefix)
+        )
+        self.ffw_layer_end = Gemma3nAudioConformerFeedForward(
+            config, quant_config, prefix=add_prefix("ffw_layer_end", prefix)
+        )
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+        self.norm = Gemma3nRMSNorm(self.config.hidden_size)
+
+    def forward(
+        self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> torch.Tensor:
+        audio_encodings = self.ffw_layer_start(audio_encodings)
+        audio_encodings = self.attention(audio_encodings, audio_mel_mask)
+        validity_mask_for_lconv = ~audio_mel_mask  # True for valid
+        audio_encodings_for_lconv_input = (
+            audio_encodings
+            * validity_mask_for_lconv.unsqueeze(-1).to(audio_encodings.dtype)
+        )
+        audio_encodings = self.lconv1d(audio_encodings_for_lconv_input)
+
+        audio_encodings = self.ffw_layer_end(audio_encodings)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        output = self.norm(audio_encodings)
+        return output
+
+
+class Gemma3nAudioEncoder(PreTrainedModel):
+    """A Universal Speech Encoder -- https://arxiv.org/abs/2303.01037"""
+
+    config_class = Gemma3nAudioConfig
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config)
+        self.config = config
+
+        self.subsample_conv_projection = Gemma3nAudioSubSampleConvProjection(
+            config, quant_config, prefix=add_prefix("subsample_conv_projection", prefix)
+        )
+        self.conformer = make_layers(
+            config.conf_num_hidden_layers,
+            lambda idx, prefix: Gemma3nAudioConformerBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("conformer", prefix),
+        )
+
+    def forward(
+        self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> Tuple[torch.Tensor, torch.BoolTensor]:
+        """Encodes a batch of MELs.
+
+        Args:
+            audio_mel: a torch.Tensor of shape [batch, num_frames, mel_bins].
+            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
+
+        Returns:
+            audio_encodings: a torch.Tensor of shape
+                `[batch_size, reduced_time_frames, hidden_size]`
+            audio_mel_mask: a torch.BoolTensor of shape [batch, reduced_time_frames].
+        """
+        audio_encodings = self.subsample_conv_projection(
+            audio_mel
+        )  # audio_encodings: [B, T_sub, D]
+
+        # Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub)
+        t_sub = audio_encodings.shape[1]
+
+        time_stride_product = 1
+        for stride_pair_idx in range(len(self.config.sscp_conv_stride_size)):
+            time_stride_product *= self.config.sscp_conv_stride_size[stride_pair_idx][0]
+
+        # Create indices for gathering from the original mask.
+        # These indices map to original time steps corresponding to the start of each
+        # receptive field in the subsampled output.
+        indices = (
+            torch.arange(t_sub, device=audio_mel_mask.device) * time_stride_product
+        )
+        indices = torch.clamp(indices, max=audio_mel_mask.shape[1] - 1)
+
+        # Expand indices for batch compatibility if B > 1 and indices is 1D.
+        if audio_mel_mask.ndim > 1 and indices.ndim == 1:
+            indices = indices.unsqueeze(0).expand(
+                audio_mel_mask.shape[0], -1
+            )  # [B, T_sub]
+        elif (
+            audio_mel_mask.ndim == indices.ndim
+            and audio_mel_mask.shape[0] == 1
+            and indices.shape[0] != 1
+            and t_sub == indices.shape[0]
+        ):
+            # Handle case where B=1 but indices became [T_sub] instead of [1, T_sub]
+            indices = indices.unsqueeze(0)
+
+        current_mask = torch.gather(audio_mel_mask, 1, indices)  # [B, T_sub]
+
+        # Fallback: Ensure mask length matches feature length after gather.
+        if current_mask.shape[1] != t_sub:
+            if current_mask.shape[1] > t_sub:
+                current_mask = current_mask[:, :t_sub]
+            else:  # current_mask.shape[1] < t_sub
+                padding_needed = t_sub - current_mask.shape[1]
+                current_mask = F.pad(
+                    current_mask, (0, padding_needed), value=True
+                )  # Pad with True (masked)
+
+        for i, block in enumerate(self.conformer):
+            audio_encodings = block(
+                audio_encodings, current_mask
+            )  # Pass the processed mask
+
+        if self.config.conf_reduction_factor > 1:
+            audio_encodings = audio_encodings[:, :: self.config.conf_reduction_factor]
+            # Reduce the mask as well
+            current_mask = current_mask[:, :: self.config.conf_reduction_factor]
+
+        # Final masking of audio_encodings based on the final current_mask
+        # Ensure current_mask length matches the finally reduced audio_encodings length
+        if current_mask.shape[1] != audio_encodings.shape[1]:
+            target_len = audio_encodings.shape[1]
+            mask_current_len = current_mask.shape[1]
+            if target_len > mask_current_len:
+                padding_needed = target_len - mask_current_len
+                current_mask = F.pad(current_mask, (0, padding_needed), value=True)
+            elif mask_current_len > target_len:  # mask is longer
+                current_mask = current_mask[:, :target_len]
+
+        audio_encodings = audio_encodings.masked_fill(current_mask.unsqueeze(-1), 0.0)
+        return audio_encodings, current_mask
diff --git a/python/sglang/srt/models/gemma3n_causal.py b/python/sglang/srt/models/gemma3n_causal.py
new file mode 100644
index 000000000..0f710b0f8
--- /dev/null
+++ b/python/sglang/srt/models/gemma3n_causal.py
@@ -0,0 +1,1010 @@
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import AutoModel, Gemma3nTextConfig, PretrainedConfig, PreTrainedModel
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.gemma3_causal import Gemma3TextScaledWordEmbedding
+from sglang.srt.utils import add_prefix, make_layers
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+class Gemma3nRMSNorm(RMSNorm):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-6,
+        with_scale: bool = True,
+    ) -> None:
+        super().__init__(dim, eps=eps)
+        if not with_scale:
+            del self.weight
+            self.register_buffer(
+                "weight",
+                torch.ones(dim, dtype=torch.get_default_dtype()),
+                persistent=False,
+            )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        original_shape = x.shape
+        x_2d = x.contiguous().reshape(-1, original_shape[-1])
+        x_2d = super().forward(x_2d)
+        x = x_2d.reshape(original_shape)
+        return x
+
+
+class Gemma3nTextScaledWordEmbedding(Gemma3TextScaledWordEmbedding):
+    pass
+
+
+class Gemma3nTextMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_activation: str,
+        activation_sparsity: float = 0.0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_activation != "gelu_pytorch_tanh":
+            raise ValueError(
+                "Gemma3n uses `gelu_pytorch_tanh` as the hidden activation "
+                "function. Please set `hidden_activation` to "
+                "`gelu_pytorch_tanh`."
+            )
+        # Use proper GELU with tanh approximation as specified
+        self.act_fn = GeluAndMul()
+        self.activation_sparsity = activation_sparsity
+        self.register_buffer(
+            "target_sparsity_tensor",
+            torch.tensor(self.activation_sparsity, dtype=torch.float32),
+            persistent=False,
+        )  # moved from _gaussian_topk for cuda graph
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+
+        # Split gate and up projections
+        gate_proj, up_proj = gate_up.chunk(2, dim=-1)
+
+        # Apply activation sparsity if needed
+        if self.activation_sparsity > 0.0:
+            gate_proj = self._gaussian_topk(gate_proj)
+
+        gate_up = torch.cat([gate_proj, up_proj], dim=-1)
+
+        # Apply GELU activation to gate projection and multiply with up projection
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+    def _gaussian_topk(self, inputs: torch.Tensor) -> torch.Tensor:
+        normal_dist = torch.distributions.normal.Normal(0, 1)
+        std_multiplier = normal_dist.icdf(self.target_sparsity_tensor)
+        std_multiplier = std_multiplier.type(inputs.dtype)
+        inputs_mean = torch.mean(inputs, dim=-1, keepdim=True)
+        inputs_std = torch.std(inputs, dim=-1, keepdim=True, unbiased=False)
+        cutoff_x = inputs_mean + inputs_std * std_multiplier
+        return F.relu(inputs - cutoff_x)
+
+
+class Gemma3nLaurelBlock(nn.Module):
+    """Learned Augmented Residual Layer"""
+
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.linear_left = ColumnParallelLinear(
+            config.hidden_size,
+            config.laurel_rank,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_left", prefix),
+        )
+        self.linear_right = RowParallelLinear(
+            config.laurel_rank,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_right", prefix),
+        )
+        self.post_laurel_norm = Gemma3nRMSNorm(
+            dim=config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # [num_tokens, hidden_size]
+        laurel_x, _ = self.linear_left(x)
+        laurel_x, _ = self.linear_right(laurel_x)
+        normed_laurel_x = self.post_laurel_norm(laurel_x)
+        return x + normed_laurel_x
+
+
+class Gemma3nAltUp(nn.Module):
+    """Alternating Updates (AltUp)"""
+
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.correct_output_scale = nn.Parameter(
+            torch.zeros(config.hidden_size, dtype=torch.float32)
+        )
+        self.correction_coefs = ColumnParallelLinear(
+            config.altup_num_inputs,
+            config.altup_num_inputs,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("correction_coefs", prefix),
+        )
+        self.prediction_coefs = ColumnParallelLinear(
+            config.altup_num_inputs,
+            config.altup_num_inputs**2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("prediction_coefs", prefix),
+        )
+        self.modality_router = ColumnParallelLinear(
+            config.hidden_size,
+            config.altup_num_inputs,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("modality_router", prefix),
+        )
+
+        self.router_norm = Gemma3nRMSNorm(
+            dim=config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+        self.register_buffer(
+            "router_input_scale",
+            torch.tensor(config.hidden_size**-1.0),
+            persistent=False,
+        )
+
+    def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor:
+        # x  : [num_tokens, hidden_size]
+        router_inputs = self.router_norm(x) * self.router_input_scale.to(
+            self.router_norm.weight.dtype
+        )
+        # router_inputs : [num_tokens, hidden_size]
+        routed, _ = self.modality_router(router_inputs)
+
+        # routed : [num_tokens, altup_num_inputs]
+        return torch.tanh(routed.float()).type_as(routed)
+
+    def predict(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Predicts the output of a layer using a trainable map.
+        hidden_states: [num_altup_inputs, num_tokens, hidden_size]
+        """
+        modalities = self.compute_router_modalities(
+            hidden_states[self.config.altup_active_idx]
+        )  # (n_tokens, altup_num_inputs)
+        # TODO: CHECK DO WE NEED THIS: self.prediction_coefs.float()  # Force computation in float32, in-place operation
+
+        if self.config.altup_coef_clip is not None:
+            self.prediction_coefs.weight.data.clamp_(
+                -self.config.altup_coef_clip, self.config.altup_coef_clip
+            )
+
+        all_coefs, _ = self.prediction_coefs(
+            modalities
+        )  # (n_tokens, altup_num_inputs) -> (n_tokens, altup_num_inputs**2)
+
+        all_coefs = all_coefs.reshape(
+            *modalities.shape[:-1],
+            self.config.altup_num_inputs,
+            self.config.altup_num_inputs,
+        ).permute(0, 2, 1)
+
+        # permute hidden_states from [num_altup_inputs, num_tokens, hidden_size] to [num_tokens, hidden_size, altup_num_inputs]
+        predictions = torch.matmul(hidden_states.permute(1, 2, 0), all_coefs)
+        predictions = predictions.permute(2, 0, 1)  # undo the permute
+        predictions += hidden_states  # add the original input
+        return predictions.contiguous().type_as(
+            hidden_states
+        )  # [num_altup_inputs, num_tokens, hidden_size]
+
+    def correct(
+        self, predictions: torch.Tensor, activated: torch.Tensor
+    ) -> torch.Tensor:
+        """Corrects the predictions relative to the activated inputs."""
+        # prediction : [num_altup_inputs, num_tokens, hidden_size]
+        # activated  : [num_tokens, hidden_size]
+        modalities = self.compute_router_modalities(
+            activated
+        )  # [num_tokens, altup_num_inputs]
+        innovation = (
+            activated - predictions[self.config.altup_active_idx]
+        )  # [num_tokens, hidden_size]
+        innovation = innovation.repeat(
+            self.config.altup_num_inputs, 1, 1
+        )  # (self.config.altup_num_inputs, num_tokens, hidden_size)
+
+        if self.config.altup_coef_clip is not None:
+            self.correction_coefs.weight.data.clamp_(
+                -self.config.altup_coef_clip, self.config.altup_coef_clip
+            )
+
+        all_coefs, _ = self.correction_coefs(
+            modalities
+        )  # [num_tokens, altup_num_inputs]
+        all_coefs = (all_coefs + 1.0).permute(1, 0).unsqueeze(-1)
+        # # [num_tokens, altup_num_inputs, 1]
+
+        corrected = torch.mul(innovation, all_coefs)
+        corrected += predictions
+        return corrected.contiguous().type_as(activated)
+
+    def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
+        """Scales the provided 3D tensor."""
+        return corrected * self.correct_output_scale.to(corrected.dtype)
+
+    def forward(
+        self, hidden_states: torch.Tensor, activated: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Predicts, correct, and optionally scales the output of a layer using trainable maps.
+
+        hidden_states: [num_altup_inputs, num_tokens, hidden_size]
+        """
+
+        predictions = self.predict(hidden_states)
+        corrected = self.correct(predictions=predictions, activated=activated)
+        output = corrected[self.config.altup_active_idx]
+        if self.config.altup_correct_scale:
+            output = self.scale_corrected_output(output)
+        return corrected, output
+
+
+class Gemma3nAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: Gemma3nTextConfig,
+        max_position_embeddings: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+
+        hidden_size = config.hidden_size
+        head_dim = getattr(
+            config, "head_dim", hidden_size // config.num_attention_heads
+        )
+        self.head_dim = head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        # self.scaling = config.query_rescale_scalar / config.query_pre_attn_scalar
+        self.scaling = 1.0
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        # Determine if layer uses sliding window based on pattern
+        self.is_sliding = config.layer_types[layer_id] == "sliding_attention"
+
+        # Check if this is a KV shared layer
+        first_kv_shared_layer_idx = (
+            config.num_hidden_layers - config.num_kv_shared_layers
+        )
+        self.is_kv_shared_layer = layer_id >= first_kv_shared_layer_idx
+
+        # Compute the layer index from which shared KV cache values will be retrieved
+        if not self.is_kv_shared_layer:
+            self.kv_shared_layer_index = None
+        elif self.is_sliding:
+            self.kv_shared_layer_index = first_kv_shared_layer_idx - 2
+        else:
+            self.kv_shared_layer_index = first_kv_shared_layer_idx - 1
+
+        if self.is_sliding:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=config.max_position_embeddings,
+                base=config.rope_local_base_freq,
+                rope_scaling={"rope_type": "default"},
+            )
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=config.max_position_embeddings,
+                base=config.rope_theta,
+                rope_scaling=config.rope_scaling,
+            )
+
+        self.sliding_window = config.sliding_window if self.is_sliding else None
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=(
+                layer_id if not self.is_kv_shared_layer else self.kv_shared_layer_index
+            ),
+            logit_cap=0.0,
+            sliding_window_size=self.sliding_window,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Gemma3n adds normalization for q, k, v
+        self.q_norm = Gemma3nRMSNorm(
+            dim=config.head_dim,
+            eps=config.rms_norm_eps,
+        )
+        self.k_norm = Gemma3nRMSNorm(
+            dim=config.head_dim,
+            eps=config.rms_norm_eps,
+        )
+        self.v_norm = Gemma3nRMSNorm(
+            dim=config.head_dim,
+            eps=config.rms_norm_eps,
+            with_scale=False,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: Tuple[torch.Tensor, torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+
+        qkv, _ = self.qkv_proj(hidden_states)
+        # TODO: for first 20 layers, we use QKVParallelLinear
+        #       for others, we only calc Q.
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # Apply normalization to q, k, v
+        q = q.unflatten(-1, (self.num_heads, self.head_dim))
+        q = self.q_norm(q)
+
+        # Check if we should use shared KV cache
+        if self.is_kv_shared_layer and self.kv_shared_layer_index is not None:
+            # For KV shared layers, we skip K/V computation and normalization
+            # The RadixAttention will handle retrieving shared KV from cache
+            k = None
+            v = None
+        else:
+            k = k.unflatten(-1, (self.num_kv_heads, self.head_dim))
+            k = self.k_norm(k)
+
+            v = v.unflatten(-1, (self.num_kv_heads, self.head_dim))
+            v = self.v_norm(v)
+
+        # Flatten back for rotary embedding
+        q = q.flatten(-2, -1)
+
+        # Apply rotary embedding
+        if k is not None:
+            k = k.flatten(-2, -1)
+            q, k = self.rotary_emb(positions, q, k)
+            # Reshape k back to head format for attention
+            k = k.unflatten(-1, (self.num_kv_heads, self.head_dim))
+        else:
+            # For shared KV layers, create a dummy key for rotary embedding and discard it
+            dummy_k = torch.zeros_like(
+                q[:, : self.kv_size]
+            )  # Create dummy key with same shape as needed
+            q, _ = self.rotary_emb(positions, q, dummy_k)
+
+        # Reshape q back to head format for attention
+        q = q.unflatten(-1, (self.num_heads, self.head_dim))
+
+        attn_output = self.attn(
+            q,
+            k,
+            v,
+            forward_batch=forward_batch,
+            save_kv_cache=not self.is_kv_shared_layer,
+        )
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma3nDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+        self.attention_type = config.layer_types[layer_id]
+        self.config = config
+
+        self.self_attn = Gemma3nAttention(
+            layer_id=layer_id,
+            config=config,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        intermediate_size = config.intermediate_size[layer_id]
+        activation_sparsity = config.activation_sparsity_pattern[layer_id]
+        self.mlp = Gemma3nTextMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=intermediate_size,
+            hidden_activation=config.hidden_activation,
+            activation_sparsity=activation_sparsity,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+
+        self.altup = Gemma3nAltUp(
+            config, quant_config, prefix=add_prefix("altup", prefix)
+        )
+        self.laurel = Gemma3nLaurelBlock(
+            config, quant_config, prefix=add_prefix("laurel", prefix)
+        )
+
+        self.per_layer_input_gate = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_size_per_layer_input,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("per_layer_input_gate", prefix),
+        )
+        self.per_layer_projection = RowParallelLinear(
+            self.hidden_size_per_layer_input,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("per_layer_projection", prefix),
+        )
+        self.post_per_layer_input_norm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+        self.is_sliding = self.self_attn.is_sliding
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        per_layer_input: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+        predictions = self.altup.predict(
+            hidden_states
+        )  # [num_altup_inputs, num_tokens, hidden_size]
+        active_prediction = predictions[self.config.altup_active_idx]
+
+        active_prediction_normed = self.input_layernorm(active_prediction)
+        laurel_output = self.laurel(
+            active_prediction_normed
+        )  # laurel_output: [num_tokens, hidden_size]
+        # active_prediction: [num_tokens, hidden_size]
+
+        attn = self.self_attn(
+            positions=positions,
+            hidden_states=active_prediction_normed,
+            forward_batch=forward_batch,
+            **kwargs,
+        )
+        attn = self.post_attention_layernorm(attn)  # [num_tokens, hidden_size]
+
+        attn_gated = active_prediction + attn  # [num_tokens, hidden_size]
+        attn_laurel = (attn_gated + laurel_output) / torch.sqrt(torch.tensor(2.0))
+
+        attn_norm = self.pre_feedforward_layernorm(
+            attn_laurel
+        )  # [num_tokens, hidden_size]
+        attn_ffw = self.mlp(attn_norm)  # [num_tokens, hidden_size]
+        attn_ffw_norm = self.post_feedforward_layernorm(
+            attn_ffw
+        )  # [num_tokens, hidden_size]
+        attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm  # [num_tokens, hidden_size]
+        corrected_predictions = self.altup.correct(
+            predictions, attn_ffw_laurel_gated
+        )  # prediction : [num_altup_inputs, num_tokens, hidden_size]
+        # attn_ffw_laurel_gated: [num_tokens, hidden_size]
+        first_prediction = corrected_predictions[self.config.altup_active_idx]
+
+        if self.config.altup_correct_scale:
+            first_prediction = self.altup.scale_corrected_output(first_prediction)
+
+        # per_layer_input_gate
+        first_prediction = first_prediction.to(self.per_layer_input_gate.weight.dtype)
+        first_prediction, _ = self.per_layer_input_gate(first_prediction)
+        first_prediction = F.gelu(first_prediction, approximate="tanh")
+        first_prediction = torch.multiply(first_prediction, per_layer_input)
+
+        # per_layer_projection
+        first_prediction, _ = self.per_layer_projection(first_prediction)
+        first_prediction = self.post_per_layer_input_norm(first_prediction)
+        corrected_predictions[1:] += first_prediction
+
+        return corrected_predictions
+
+
+class Gemma3nTextModel(PreTrainedModel):
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.padding_idx = config.pad_token_id
+
+        # Gemma3n downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
+        self.embed_tokens = Gemma3nTextScaledWordEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            self.padding_idx,
+            embed_scale=self.config.hidden_size**0.5,
+        )
+
+        self.norm = Gemma3nRMSNorm(
+            config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Gemma3nDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+
+        # Per-layer input embeddings
+        self.hidden_size = config.hidden_size
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+
+        self.embed_tokens_per_layer = Gemma3nTextScaledWordEmbedding(
+            config.vocab_size_per_layer_input,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            self.padding_idx,
+            embed_scale=self.config.hidden_size_per_layer_input**0.5,
+        )
+
+        self.per_layer_model_projection = ColumnParallelLinear(
+            self.hidden_size,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("per_layer_model_projection", prefix),
+        )
+
+        self.per_layer_projection_norm = Gemma3nRMSNorm(
+            dim=config.hidden_size_per_layer_input,
+            eps=config.rms_norm_eps,
+        )
+
+        self.altup_projections = make_layers(
+            self.config.altup_num_inputs - 1,
+            lambda idx, prefix: ColumnParallelLinear(
+                self.hidden_size,
+                self.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("altup_projections", prefix),
+        )
+
+        self.altup_unembed_projections = make_layers(
+            self.config.altup_num_inputs - 1,
+            lambda idx, prefix: ColumnParallelLinear(
+                self.hidden_size,
+                self.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("altup_unembed_projections", prefix),
+        )
+
+        self.register_buffer(
+            "per_layer_projection_scale",
+            torch.tensor(self.hidden_size**-0.5),
+            persistent=False,
+        )
+        self.register_buffer(
+            "per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False
+        )
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor:
+        embeddings = self.embed_tokens_per_layer(input_ids)
+        return embeddings.reshape(
+            *input_ids.shape,
+            self.config.num_hidden_layers,
+            self.hidden_size_per_layer_input,
+        )
+
+    def project_per_layer_inputs(
+        self,
+        inputs_embeds: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        per_layer_projection, _ = self.per_layer_model_projection(inputs_embeds)
+        per_layer_projection *= self.per_layer_projection_scale.type(
+            inputs_embeds.dtype
+        )
+        per_layer_projection = per_layer_projection.reshape(
+            *inputs_embeds.shape[:-1],
+            self.config.num_hidden_layers,
+            self.hidden_size_per_layer_input,
+        )
+        per_layer_projection = self.per_layer_projection_norm(per_layer_projection)
+
+        if per_layer_inputs is None:
+            return per_layer_projection
+
+        if per_layer_projection.shape != per_layer_inputs.shape:
+            # per-layer inputs are sometimes padded with zeros, slice the relevant embeddings
+            per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :]
+
+        return (
+            per_layer_projection + per_layer_inputs
+        ) * self.per_layer_input_scale.type(inputs_embeds.dtype)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if (input_ids is None) ^ (input_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        if input_ids is not None:
+            input_embeds = self.embed_tokens(input_ids)
+            per_layer_inputs = self.get_per_layer_inputs(input_ids)
+
+        per_layer_inputs = self.project_per_layer_inputs(input_embeds, per_layer_inputs)
+
+        if positions.dim() == 1:
+            positions = positions.unsqueeze(0)
+
+        # Expand hidden_states to support per-layer inputs
+        target_magnitude = torch.mean(input_embeds**2, dim=-1, keepdim=True) ** 0.5
+        epsilon_tensor = torch.tensor(torch.finfo(input_embeds.dtype).min)
+
+        # embed positions
+        hidden_states_0 = input_embeds
+        temp_hidden_states = [hidden_states_0]
+
+        for i in range(1, self.config.altup_num_inputs):
+            altup_proj, _ = self.altup_projections[i - 1](hidden_states_0)
+            current_hidden_state = altup_proj.type(hidden_states_0.dtype)
+            new_magnitude = (
+                torch.mean(current_hidden_state**2, dim=-1, keepdim=True) ** 0.5
+            )
+            current_hidden_state = current_hidden_state * (
+                target_magnitude / torch.maximum(new_magnitude, epsilon_tensor)
+            )
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(
+            temp_hidden_states, dim=0
+        )  # [num_altup_inputs, n_tokens, hidden_size]
+
+        for layer_idx, layer in enumerate(self.layers):
+            per_layer_input = per_layer_inputs[:, layer_idx, :]
+            hidden_states = layer(
+                positions=positions,
+                per_layer_input=per_layer_input,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                **kwargs,
+            )
+
+        # Per-layer inputs to single output
+        target_magnitude = (
+            torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
+        )
+
+        temp_hidden_states = [hidden_states[0]]
+
+        for i in range(1, self.config.altup_num_inputs):
+            # altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...)
+            altup_unemb_proj, _ = self.altup_unembed_projections[i - 1](
+                hidden_states[i]
+            )
+            current_hidden_state = altup_unemb_proj.type(hidden_states_0.dtype)
+            new_magnitude = (
+                torch.mean(current_hidden_state**2, dim=-1, keepdim=True) ** 0.5
+            )
+            current_hidden_state = current_hidden_state * (
+                target_magnitude / torch.maximum(new_magnitude, epsilon_tensor)
+            )
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(temp_hidden_states)
+        hidden_states = torch.mean(hidden_states, dim=0)
+        hidden_states = self.norm(hidden_states)
+
+        return hidden_states
+
+
+class Gemma3nForCausalLM(PreTrainedModel):
+    config_class = Gemma3nTextConfig
+
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config_class = Gemma3nTextConfig
+    base_model_prefix = "language_model"
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+        ".gate_proj": (".gate_up_proj", 0),
+        ".up_proj": (".gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        ".qkv_proj": [
+            ".q_proj",
+            ".k_proj",
+            ".v_proj",
+        ],
+        ".gate_up_proj": [
+            ".gate_proj",
+            ".up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        ".qkv_proj",
+        ".o_proj",
+        ".gate_up_proj",
+        ".down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Gemma3nTextModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> LogitsProcessor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            per_layer_inputs,
+            **kwargs,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            name = name.replace("model.language_model.", "model.")
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    # Skip loading weights that are not in the model
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                if name not in params_dict:
+                    # Skip loading weights that are not in the model
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+EntryClass = Gemma3nForCausalLM
+AutoModel.register(Gemma3nTextConfig, Gemma3nForCausalLM, exist_ok=True)
diff --git a/python/sglang/srt/models/gemma3n_mm.py b/python/sglang/srt/models/gemma3n_mm.py
new file mode 100644
index 000000000..995db2602
--- /dev/null
+++ b/python/sglang/srt/models/gemma3n_mm.py
@@ -0,0 +1,534 @@
+import logging
+import re
+from functools import lru_cache
+from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union
+
+import torch
+from torch import nn
+from transformers import (
+    Gemma3nAudioConfig,
+    Gemma3nConfig,
+    Gemma3nTextConfig,
+    Gemma3nVisionConfig,
+    PreTrainedModel,
+)
+from transformers.models.auto.modeling_auto import AutoModel
+
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+    flatten_nested_list,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.gemma3n_audio import Gemma3nAudioEncoder
+from sglang.srt.models.gemma3n_causal import Gemma3nRMSNorm, Gemma3nTextModel
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Gemma3nImagePixelInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+
+
+class Gemma3nAudioInputs(TypedDict):
+    input_features: torch.Tensor
+    """Shape: `(batch_size * num_audio, seq_length, num_features)`"""
+    input_features_mask: torch.Tensor
+    """Shape: `(batch_size * num_audio, seq_length)`"""
+
+
+class Gemma3nMultimodalEmbedder(nn.Module):
+    """Embeds token ids or soft tokens for multimodal content into language model space."""
+
+    def __init__(
+        self,
+        multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig],
+        text_config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.multimodal_hidden_size = multimodal_config.hidden_size
+        self.eps = multimodal_config.rms_norm_eps
+        self.vocab_offset = multimodal_config.vocab_offset
+        self.vocab_size = multimodal_config.vocab_size
+        self.text_hidden_size = text_config.hidden_size
+
+        self.embedding = VocabParallelEmbedding(
+            self.vocab_size,
+            self.multimodal_hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embedding", prefix),
+        )
+
+        self.hard_embedding_norm = Gemma3nRMSNorm(
+            self.multimodal_hidden_size,
+            eps=self.eps,
+        )
+
+        self.soft_embedding_norm = Gemma3nRMSNorm(
+            self.multimodal_hidden_size,
+            eps=self.eps,
+        )
+
+        self.embedding_projection = RowParallelLinear(
+            self.multimodal_hidden_size,
+            self.text_hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("embedding_projection", prefix),
+        )
+
+        self.embedding_post_projection_norm = Gemma3nRMSNorm(
+            self.text_hidden_size,
+            eps=self.eps,
+            with_scale=False,
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Embeds token ids or soft tokens for multimodal content into language model space.
+
+        Args:
+            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
+                `[vocab_offset, vocab_offset + vocab_size)`.
+            inputs_embeds: A torch.Tensor containing the soft tokens to embed.
+
+        Returns:
+            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        if inputs_embeds is not None:
+            emb_norm = self.soft_embedding_norm(inputs_embeds)
+        else:
+            # Handle out of vocab ids to prevent CUDA assertion failures
+            out_of_vocab_id = self.vocab_size - 1
+            adjusted_ids = input_ids - self.vocab_offset
+            adjusted_ids = torch.where(adjusted_ids < 0, out_of_vocab_id, adjusted_ids)
+            adjusted_ids = torch.where(
+                adjusted_ids >= self.vocab_size, out_of_vocab_id, adjusted_ids
+            )
+            hard_emb = self.embedding(adjusted_ids)
+            emb_norm = self.hard_embedding_norm(hard_emb)
+
+        emb_norm_proj, _ = self.embedding_projection(emb_norm)
+        return self.embedding_post_projection_norm(emb_norm_proj)
+
+
+class Gemma3nForConditionalGeneration(PreTrainedModel):
+    config_class = Gemma3nConfig
+    """Gemma3n multimodal model for conditional generation."""
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+        ".out_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+        "out_proj": ("proj", 0),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3nConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+
+        prefix = add_prefix("model", prefix)
+
+        # Vision components
+        # TODO: Use sglang's vision model
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+
+        self.embed_vision = Gemma3nMultimodalEmbedder(
+            config.vision_config,
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_vision", prefix),
+        )
+
+        # Audio components
+        self.embed_audio = Gemma3nMultimodalEmbedder(
+            config.audio_config,
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_audio", prefix),
+        )
+
+        self.audio_tower = Gemma3nAudioEncoder(
+            config.audio_config,
+            quant_config=quant_config,
+            prefix=add_prefix("audio_tower", prefix),
+        )
+
+        self.vocab_size = config.text_config.vocab_size
+        self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input
+
+        # Text model
+        self.language_model = Gemma3nTextModel(
+            config.text_config,
+            quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+        # Create logits processor for the multimodal model
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+        self.post_init()
+
+    def pad_input_ids(
+        self,
+        input_ids: List[int],
+        mm_inputs: MultimodalInputs,
+    ) -> List[int]:
+        """Pad input IDs with image and audio tokens."""
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.language_model.get_input_embeddings()
+
+    def get_attention_sliding_window_size(self):
+        return self.config.text_config.sliding_window - 1
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        # Process images one by one to handle flatten_batch=True constraint in vision_tower
+        all_pixel_values = flatten_nested_list([item.feature for item in items])
+        vision_outputs_list = []
+
+        for pixel_values_batch in all_pixel_values:
+            # Normalize input shape to [batch_size, channels, height, width]
+            if pixel_values_batch.dim() == 5:
+                pixel_values_batch = pixel_values_batch.squeeze(0)
+            elif pixel_values_batch.dim() == 3:
+                pixel_values_batch = pixel_values_batch.unsqueeze(0)
+            elif pixel_values_batch.dim() != 4:
+                raise ValueError(
+                    f"Unexpected pixel_values shape: {pixel_values_batch.shape}"
+                )
+
+            # Process each image in the batch
+            batch_size = pixel_values_batch.shape[0]
+            for i in range(batch_size):
+                pixel_value = pixel_values_batch[i : i + 1]  # Keep batch dimension as 1
+                pixel_value = pixel_value.to(
+                    device=self.vision_tower.device, dtype=self.language_model.dtype()
+                )
+                vision_outputs = self.vision_tower(
+                    pixel_values=pixel_value, do_pooling=False, return_dict=True
+                ).last_hidden_state
+                vision_outputs_list.append(vision_outputs)
+
+        # Concatenate all vision outputs
+        vision_outputs = torch.cat(vision_outputs_list, dim=0)
+
+        # Convert from (batch, channels, height, width) to (batch, height * width, channels)
+        vision_outputs = vision_outputs.reshape(
+            vision_outputs.shape[0],
+            self.config.vision_config.hidden_size,
+            self.config.vision_soft_tokens_per_image,
+        ).permute(0, 2, 1)
+
+        # Normalize and embed the soft tokens into language model space
+        vision_outputs *= self.config.vision_config.hidden_size**0.5
+        return self.embed_vision(inputs_embeds=vision_outputs)
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """
+        Projects the last hidden state from the audio encoder into language model space.
+
+        Args:
+            items: List of multimodal data items containing audio data.
+
+        Returns:
+            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_audios, audio_length, embed_dim)`).
+        """
+        # Extract audio features and masks from items
+        all_input_features = flatten_nested_list([item.feature for item in items])
+        all_input_features_mask = flatten_nested_list(
+            [~item.input_features_mask for item in items]
+        )  # Note(Xinyuan): reverse the mask according to the HF implementation
+
+        # Process audio features one by one
+        audio_features_list = []
+
+        for input_features, input_features_mask in zip(
+            all_input_features, all_input_features_mask
+        ):
+            # Ensure proper tensor format
+            if input_features.dim() == 2:
+                input_features = input_features.unsqueeze(0)
+            if input_features_mask.dim() == 1:
+                input_features_mask = input_features_mask.unsqueeze(0)
+
+            # Move to device and dtype
+            input_features = input_features.to(
+                device=next(self.audio_tower.parameters()).device,
+                dtype=self.language_model.dtype(),
+            )
+            input_features_mask = input_features_mask.to(device=input_features.device)
+
+            # Process through audio tower
+            audio_outputs, audio_mask = self.audio_tower(
+                input_features, input_features_mask
+            )
+
+            # Embed the audio outputs
+            audio_embeds = self.embed_audio(inputs_embeds=audio_outputs)
+            audio_features_list.append(audio_embeds)
+
+        # Concatenate all audio features
+        if audio_features_list:
+            audio_features = torch.cat(audio_features_list, dim=0)
+
+            # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the
+            # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will
+            # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens
+            # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
+            # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab.
+            audio_padding_toks = torch.tensor(
+                [[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device
+            )
+            audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks)
+            audio_features = torch.where(
+                audio_mask.unsqueeze(-1), audio_padding_embs, audio_features
+            )
+
+            audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
+            extra_padding_tokens = (
+                self.config.audio_soft_tokens_per_image - audio_seq_len
+            )
+            extra_padding_features = audio_padding_embs.expand(
+                audio_batch_size, extra_padding_tokens, audio_embed_dim
+            )
+
+            audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
+            return audio_features
+        else:
+            return torch.empty(
+                0,
+                0,
+                self.language_model.config.hidden_size,
+                device=next(self.parameters()).device,
+                dtype=self.language_model.dtype(),
+            )
+
+    def get_per_layer_inputs(
+        self, input_ids: torch.LongTensor
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.get_per_layer_inputs(input_ids)
+
+    def project_per_layer_inputs(
+        self,
+        inputs_embeds: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.language_model.project_per_layer_inputs(
+            inputs_embeds, per_layer_inputs
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs: object,
+    ) -> LogitsProcessor:
+        """Forward pass for multimodal Gemma3n."""
+        if (input_ids is None) ^ (input_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        positions += 1
+        if input_ids is not None:
+            # Prepare per-layer inputs from inputs_ids
+            per_layer_inputs_mask = torch.logical_and(
+                input_ids >= 0, input_ids < self.vocab_size_per_layer_input
+            )
+            per_layer_inputs_tokens = torch.where(
+                per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids)
+            )
+            per_layer_inputs = self.language_model.get_per_layer_inputs(
+                per_layer_inputs_tokens
+            )
+
+        # Use general_mm_embed_routine for handling multimodal data
+        # This will automatically handle text, image, and audio embeddings
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.AUDIO: self.get_audio_feature,
+            },
+            positions=positions,
+            per_layer_inputs=per_layer_inputs,
+        )
+
+        # Process hidden states through logits processor
+        return self.logits_processor(
+            input_ids, hidden_states, self.language_model.embed_tokens, forward_batch
+        )
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".up_proj", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+        ]
+        """Load weights for the model."""
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            name = re.sub(r"^model\.", "", name)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(".self_attn.out_proj", ".self_attn.proj")
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    lora_pattern = re.compile(
+        r"^language_model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self.lora_pattern.match(module_name))
+
+    def get_hidden_dim(self, module_name, layer_idx):
+        # return input_dim, output_dim
+        if module_name == "qkv_proj":
+            return (
+                self.config.hidden_size,
+                self.config.head_dim
+                * (
+                    self.config.num_attention_heads
+                    + self.config.num_key_value_heads * 2
+                ),
+            )
+        elif module_name == "o_proj":
+            return (
+                self.config.head_dim * self.config.num_attention_heads,
+                self.config.hidden_size,
+            )
+        elif module_name == "gate_up_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.hidden_size, self.config.intermediate_size[0] * 2
+        elif module_name == "down_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.intermediate_size[0], self.config.hidden_size
+        else:
+            raise NotImplementedError()
+
+
+EntryClass = Gemma3nForConditionalGeneration
diff --git a/python/sglang/srt/models/glm4.py b/python/sglang/srt/models/glm4.py
new file mode 100644
index 000000000..7357e5f82
--- /dev/null
+++ b/python/sglang/srt/models/glm4.py
@@ -0,0 +1,318 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Modeling from:
+# ./llama.py and
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4/modular_glm4.py
+"""Inference-only GLM4 model compatible with THUDM weights."""
+
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Glm4Config
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaMLP as Glm4MLP
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class Glm4Attention(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = getattr(config, "rope_theta", 1000000)
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=config.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            partial_rotary_factor=partial_rotary_factor,
+            is_neox_style=False,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        context_layer = self.attn(
+            q,
+            k,
+            v,
+            forward_batch,
+        )
+        attn_output, _ = self.o_proj(context_layer)
+        return attn_output
+
+
+class Glm4DecoderLayer(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # Self attention.
+        self.self_attn = Glm4Attention(
+            config, layer_id, quant_config, prefix=add_prefix("self_attn", prefix)
+        )
+
+        # MLP
+        self.mlp = Glm4MLP(
+            config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_self_attn_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+
+        return hidden_states, residual
+
+
+class Glm4Model(nn.Module):
+    def __init__(
+        self,
+        config: Glm4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Glm4DecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            prefix="model.layers",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class Glm4ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: Glm4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config: Glm4Config = config
+        self.quant_config = quant_config
+        self.model = Glm4Model(config, quant_config, add_prefix("model", prefix))
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix="lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, weight_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    raise KeyError(f"Parameter '{name}' not found in model.")
+
+
+EntryClass = [Glm4ForCausalLM]
diff --git a/python/sglang/srt/models/glm4_moe.py b/python/sglang/srt/models/glm4_moe.py
new file mode 100644
index 000000000..5ae5b0af6
--- /dev/null
+++ b/python/sglang/srt/models/glm4_moe.py
@@ -0,0 +1,1083 @@
+# Copyright 2025-2026 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only GLM-4.5 model compatible with HuggingFace weights"""
+
+import logging
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    parallel_state,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.amx_utils import PackWeightMethod
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_deepep_mode, get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import (
+    is_fp8_fnuz,
+    per_tensor_quant_mla_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import (
+    DeepseekV2DecoderLayer,
+    DeepseekV2ForCausalLM,
+    DeepseekV2Model,
+    DeepseekV2MoE,
+)
+from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
+from sglang.srt.utils import (
+    BumpAllocator,
+    LazyValue,
+    add_prefix,
+    bind_or_assign,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    get_int_env_var,
+    is_cpu,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+    is_non_idle_and_non_empty,
+    log_info_on_rank0,
+    use_intel_amx_backend,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+
+if _is_cuda:
+    from sgl_kernel import dsv3_router_gemm
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+class Glm4MoeMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion=False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ):
+        if (self.tp_size == 1) and x.shape[0] == 0:
+            return x
+
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x, skip_all_reduce=should_allreduce_fusion)
+        return x
+
+
+class Glm4MoeAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        partial_rotary_factor: float = 0.5,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-05,
+        attention_bias: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_qk_norm: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.use_qk_norm = use_qk_norm
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            partial_rotary_factor=partial_rotary_factor,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        if self.use_qk_norm:
+            self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+            self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.alt_stream = alt_stream
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # overlap qk norm
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            with torch.cuda.stream(self.alt_stream):
+                k_by_head = k.reshape(-1, self.head_dim)
+                k_by_head = self.k_norm(k_by_head)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            k_by_head = k.reshape(-1, self.head_dim)
+            k_by_head = self.k_norm(k_by_head)
+        q = q_by_head.view(q.shape)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+        attn_output = self.attn(*inner_state)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
+
+class Glm4MoeGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        prefix: str = "",
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.is_nextn = is_nextn
+        self.weight = nn.Parameter(
+            torch.empty((config.n_routed_experts, config.hidden_size))
+        )
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
+        )
+        if _is_cpu and _is_cpu_amx_available:
+            self.quant_method = PackWeightMethod(weight_names=["weight"])
+
+    def forward(self, hidden_states):
+        if use_intel_amx_backend(self):
+            return torch.ops.sgl_kernel.weight_packed_linear(
+                hidden_states,
+                self.weight,
+                None,  # bias
+                True,  # is_vnni
+            )
+
+        # NOTE: For some unknown reason, router_gemm seems degrade accept length.
+        if (
+            _is_cuda
+            and not self.is_nextn
+            and hidden_states.shape[0] < 4
+            and hidden_states.shape[1] == 7168
+            and self.weight.shape[0] == 256
+            and _device_sm >= 90
+        ):
+            logits = dsv3_router_gemm(hidden_states, self.weight).to(
+                hidden_states.dtype
+            )
+        else:
+            logits = F.linear(hidden_states, self.weight, None)
+
+        return logits
+
+
+class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        is_nextn: bool = False,
+    ):
+        nn.Module.__init__(self)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.ep_size = get_moe_expert_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_shared_experts = config.n_shared_experts
+        self.num_fused_shared_experts = (
+            0
+            if global_server_args_dict["disable_shared_experts_fusion"]
+            else config.n_shared_experts
+        )
+        self.config = config
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = Glm4MoeGate(
+            config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            topk_group=config.topk_group,
+            correction_bias=self.gate.e_score_correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
+        )
+
+        self.experts = get_moe_impl_class()(
+            num_experts=config.n_routed_experts
+            + self.num_fused_shared_experts
+            + global_server_args_dict["ep_num_redundant_experts"],
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.shared_experts_is_int8 = False
+        self.shared_experts_is_fp8 = False
+        # self.shared_experts_weight_block_size = None
+        if config.n_shared_experts is not None and self.num_fused_shared_experts == 0:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = Glm4MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(dict(tp_rank=0, tp_size=1) if self.ep_size > 1 else {}),
+            )
+            is_packed_weight = hasattr(
+                self.shared_experts.gate_up_proj.quant_method, "quant_config"
+            )
+            self.shared_experts_is_int8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.int8
+            )
+            self.shared_experts_is_fp8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            )
+
+        self.top_k = config.num_experts_per_tok
+
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.n_routed_experts
+                + global_server_args_dict["ep_num_redundant_experts"]
+            )
+            self.renormalize = config.norm_topk_prob
+            self.topk_group = config.topk_group
+            self.num_expert_group = config.n_group
+            self.correction_bias = (
+                self.gate.e_score_correction_bias.data
+                if self.gate.e_score_correction_bias is not None
+                else None
+            )
+
+            self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
+                group=parallel_state.get_tp_group().device_group,
+                router_topk=self.top_k,
+                permute_fusion=True,
+                num_experts=self.num_experts,
+                num_local_experts=config.n_routed_experts // self.tp_size,
+                hidden_size=config.hidden_size,
+                params_dtype=config.torch_dtype,
+                deepep_mode=get_deepep_mode(),
+                async_finish=True,
+                return_recv_hook=True,
+            )
+
+        self._enable_deepep_moe = get_moe_a2a_backend().is_deepep()
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states)
+
+        with torch.cuda.stream(self.alt_stream):
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+            final_hidden_states = self.experts(hidden_states, topk_output)
+            if not _is_cuda:
+                final_hidden_states *= self.routed_scaling_factor
+        current_stream.wait_stream(self.alt_stream)
+
+        if self.ep_size > 1:
+            if (
+                self.tp_size > 1
+                and not should_allreduce_fusion
+                and not use_reduce_scatter
+            ):
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            final_hidden_states += shared_output
+        else:
+            final_hidden_states += shared_output
+            if (
+                self.tp_size > 1
+                and not should_allreduce_fusion
+                and not use_reduce_scatter
+            ):
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+        if hasattr(self, "shared_experts") and use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ):
+            return self.forward_cpu(hidden_states, should_allreduce_fusion)
+
+        shared_output = self._forward_shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if not _is_cuda and not _use_aiter:
+            # fused in biased_grouped_topk so we can skip here
+            final_hidden_states *= self.routed_scaling_factor
+        if self.ep_size > 1:
+            if self.tp_size > 1 and not should_allreduce_fusion:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+            if shared_output is not None:
+                final_hidden_states += shared_output
+        else:
+            if shared_output is not None:
+                final_hidden_states += shared_output
+            if self.tp_size > 1 and not should_allreduce_fusion:
+                final_hidden_states = tensor_model_parallel_all_reduce(
+                    final_hidden_states
+                )
+        return final_hidden_states
+
+
+class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        is_nextn: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.hidden_size = config.hidden_size
+        self.config = config
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        rms_norm_eps = config.rms_norm_eps
+        attention_bias = config.attention_bias
+        self.layer_id = layer_id
+        self.self_attn = Glm4MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            partial_rotary_factor=partial_rotary_factor,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            attention_bias=attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            use_qk_norm=config.use_qk_norm,
+        )
+
+        self.is_layer_sparse = self._is_layer_sparse(layer_id, is_nextn=is_nextn)
+        is_previous_layer_sparse = self._is_layer_sparse(layer_id - 1, is_nextn=False)
+
+        num_layers = 1 if is_nextn else config.num_hidden_layers
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=num_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Glm4MoeSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                layer_id=self.layer_id,
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = Glm4MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        hidden_states = self.mlp(hidden_states, forward_batch)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Glm4MoeModel(DeepseekV2Model):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.padding_id = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.first_k_dense_replace = config.first_k_dense_replace
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+        )
+        self.alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.layers = nn.ModuleList(
+            [
+                Glm4MoeDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                    alt_stream=self.alt_stream,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.pp_group = get_pp_group()
+        self.start_layer = 0
+        self.end_layer = config.num_hidden_layers
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+
+class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        config.moe_layer_freq = 1
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+        self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
+        self.model = Glm4MoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: layer.mlp.get_moe_weights()
+                for layer_id, layer in enumerate(self.model.layers)
+                if isinstance(layer.mlp, DeepseekV2MoE)
+            }
+        )
+
+    def determine_num_fused_shared_experts(
+        self, architecture: str = "Glm4MoeForCausalLM"
+    ):
+        self.num_fused_shared_experts = 0
+        if global_server_args_dict["disable_shared_experts_fusion"]:
+            return
+
+        # Only Deepseek V3/R1 can use shared experts fusion optimization now.
+        disable_reason = None
+        if (
+            not _is_cuda
+            or torch.cuda.get_device_capability("cuda") < (8, 0)
+            or self.config.architectures[0] != architecture
+            or self.config.n_shared_experts != 1
+        ):
+            disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
+
+        if disable_reason is not None:
+            global_server_args_dict["disable_shared_experts_fusion"] = True
+            self.num_fused_shared_experts = 0
+            log_info_on_rank0(
+                logger,
+                f"{disable_reason} Shared experts fusion optimization is disabled.",
+            )
+            return
+
+        self.num_fused_shared_experts = self.config.n_shared_experts
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        if self.num_fused_shared_experts > 0:
+            assert self.num_fused_shared_experts == 1
+            weights_list = list(weights)
+            weights_dict = dict(weights_list)
+            if self.quant_config is not None:
+                if self.quant_config.get_name() == "w8a8_int8":
+                    suffix_list = [
+                        "down_proj.weight",
+                        "down_proj.weight_scale",
+                        "gate_proj.weight",
+                        "gate_proj.weight_scale",
+                        "up_proj.weight",
+                        "up_proj.weight_scale",
+                    ]
+                elif (
+                    self.quant_config.get_name() == "fp8"
+                    or self.quant_config.get_name() == "blockwise_int8"
+                    or self.quant_config.get_name() == "compressed_tensors"
+                ):
+                    suffix_list = [
+                        "down_proj.weight",
+                        "down_proj.weight_scale",
+                        "gate_proj.weight",
+                        "gate_proj.weight_scale",
+                        "up_proj.weight",
+                        "up_proj.weight_scale",
+                    ]
+                elif self.quant_config.get_name() == "awq":
+                    suffix_list = [
+                        "down_proj.qweight",
+                        "down_proj.qzeros",
+                        "down_proj.scales",
+                        "gate_proj.qweight",
+                        "gate_proj.qzeros",
+                        "gate_proj.scales",
+                        "up_proj.qweight",
+                        "up_proj.qzeros",
+                        "up_proj.scales",
+                    ]
+                elif self.quant_config.get_name() == "modelopt_fp4":
+                    suffix_list = [
+                        "down_proj.weight",
+                        "down_proj.weight_scale",
+                        "down_proj.weight_scale_2",
+                        "down_proj.input_scale",
+                        "gate_proj.weight",
+                        "gate_proj.weight_scale",
+                        "gate_proj.weight_scale_2",
+                        "gate_proj.input_scale",
+                        "up_proj.weight",
+                        "up_proj.weight_scale",
+                        "up_proj.weight_scale_2",
+                        "up_proj.input_scale",
+                    ]
+                else:
+                    raise ValueError(
+                        f"Unsupported shared expert fusion for quantization: {self.quant_config.get_name()}."
+                    )
+            else:
+                suffix_list = [
+                    "down_proj.weight",
+                    "gate_proj.weight",
+                    "up_proj.weight",
+                ]
+            names_to_remove = []
+
+            moe_layers = (
+                range(
+                    self.config.first_k_dense_replace,
+                    self.config.num_hidden_layers,
+                    self.config.moe_layer_freq,
+                )
+                if not is_nextn
+                else [nextn_layer_id]
+            )
+
+            for moe_layer in moe_layers:
+                for suffix in suffix_list:
+                    shared_expert_weight_name = (
+                        f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
+                    )
+                    # online fp8 quantization does not load weight_scale
+                    if shared_expert_weight_name not in weights_dict:
+                        continue
+                    weights_list.append(
+                        (
+                            f"model.layers.{moe_layer}."
+                            f"mlp.experts."
+                            f"{self.config.n_routed_experts + 0}"
+                            f".{suffix}",
+                            weights_dict[shared_expert_weight_name],
+                        )
+                    )
+                    names_to_remove += [shared_expert_weight_name]
+            weights = [w for w in weights_list if w[0] not in names_to_remove]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+        )
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
+        params_dict = dict(self.named_parameters())
+        weight_names = []
+        for name, loaded_weight in weights:
+            weight_names.append(name)
+
+            if not is_nextn:
+                if hasattr(self.config, "num_nextn_predict_layers"):
+                    num_nextn_layers = self.config.num_nextn_predict_layers
+                    if num_nextn_layers > 0 and name.startswith("model.layers"):
+                        name_list = name.split(".")
+                        if (
+                            len(name_list) >= 3
+                            and int(name_list[2]) >= self.config.num_hidden_layers
+                        ):
+                            continue
+            else:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if fuse_qkv_a_proj and (
+                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                    ):
+                        cached_a_proj[name] = loaded_weight
+                        q_a_proj_name = (
+                            name
+                            if "q_a_proj" in name
+                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                        )
+                        kv_a_proj_name = (
+                            name
+                            if "kv_a_proj_with_mqa" in name
+                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                        )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                        if (
+                            q_a_proj_name in cached_a_proj
+                            and kv_a_proj_name in cached_a_proj
+                        ):
+                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            fused_weight = torch.cat(
+                                [q_a_proj_weight, kv_a_proj_weight], dim=0
+                            )
+                            param_name = (
+                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
+                                if "q_a_proj" in name
+                                else name.replace(
+                                    "kv_a_proj_with_mqa", "fused_qkv_a_proj_with_mqa"
+                                )
+                            )
+                            param = params_dict[param_name]
+
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, fused_weight)
+                            cached_a_proj.pop(q_a_proj_name)
+                            cached_a_proj.pop(kv_a_proj_name)
+                    else:
+                        if (
+                            "k_scale" in name or "v_scale" in name
+                        ) and name not in params_dict:
+                            # modelopt attn kv scale is named differently
+                            if any(scale in name for scale in ["k_scale", "v_scale"]):
+                                name = name.replace("_proj", "attn_mqa")
+                            else:
+                                logger.warning(
+                                    f"Unknown scale found in checkpoint: {name}"
+                                )
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+
+
+EntryClass = [Glm4MoeForCausalLM]
diff --git a/python/sglang/srt/models/glm4_moe_nextn.py b/python/sglang/srt/models/glm4_moe_nextn.py
new file mode 100644
index 000000000..399f0f4e0
--- /dev/null
+++ b/python/sglang/srt/models/glm4_moe_nextn.py
@@ -0,0 +1,168 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only GLM-4.5 NextN Speculative Decoding."""
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.glm4_moe import Glm4MoeDecoderLayer, Glm4MoeForCausalLM
+from sglang.srt.utils import BumpAllocator, add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Glm4MoeModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
+            logger.warning(
+                "Overriding Glm4MoeForCausalLMNextN quant config for modelopt_fp4 GLM-4.5 model."
+            )
+            quant_config = None
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+
+        self.decoder = Glm4MoeDecoderLayer(
+            config,
+            0,
+            quant_config=quant_config,
+            is_nextn=True,
+            prefix=add_prefix("decoder", prefix),
+        )
+
+        self.shared_head = nn.Module()
+        self.shared_head.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        zero_allocator = BumpAllocator(
+            buffer_size=2,
+            dtype=torch.float32,
+            device=(
+                input_embeds.device if input_embeds is not None else input_ids.device
+            ),
+        )
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        residual = None
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states, residual = self.decoder(
+                positions, hidden_states, forward_batch, residual, zero_allocator
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.shared_head.norm(hidden_states, residual)
+            else:
+                hidden_states = self.shared_head.norm(hidden_states)
+
+        return hidden_states
+
+
+class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.determine_num_fused_shared_experts("Glm4MoeForCausalLMNextN")
+
+        self.model = Glm4MoeModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        super().load_weights(weights, is_nextn=True)
+
+
+EntryClass = [Glm4MoeForCausalLMNextN]
diff --git a/python/sglang/srt/models/glm4v.py b/python/sglang/srt/models/glm4v.py
new file mode 100644
index 000000000..63c955a72
--- /dev/null
+++ b/python/sglang/srt/models/glm4v.py
@@ -0,0 +1,642 @@
+import logging
+from functools import lru_cache, partial
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
+
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.schedule_batch import MultimodalDataItem
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.glm4 import Glm4Model
+from sglang.srt.models.qwen2_5_vl import (
+    Qwen2_5_VisionBlock,
+    Qwen2_5_VLForConditionalGeneration,
+)
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Glm4vRMSNorm(RMSNorm):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        original_shape = x.shape
+        x_2d = x.contiguous().reshape(-1, original_shape[-1])
+        x_2d = super().forward(x_2d)
+        x = x_2d.reshape(original_shape)
+        return x
+
+
+class Glm4vVisionMLP(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int,
+        bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=in_features,
+            output_sizes=[hidden_features] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Glm4vVisionBlock(Qwen2_5_VisionBlock):
+    def __init__(
+        self,
+        config: Glm4vVisionConfig,
+        norm_layer: Optional[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            dim=config.hidden_size,
+            intermediate_dim=config.out_hidden_size,
+            num_heads=config.num_heads,
+            hidden_act=config.hidden_act,
+            norm_layer=norm_layer,
+            quant_config=quant_config,
+            prefix=prefix,
+            num_dummy_heads=config.num_dummy_heads,
+            rms_norm_eps=config.rms_norm_eps,
+        )
+
+        self.mlp = Glm4vVisionMLP(
+            config.hidden_size,
+            config.out_hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+
+class Glm4vVisionPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_channels: int = 3,
+        hidden_size: int = 1536,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.hidden_size = hidden_size
+        self.in_channels = in_channels
+
+        kernel_size = (temporal_patch_size, patch_size, patch_size)
+        self.proj = nn.Conv3d(
+            in_channels,
+            hidden_size,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=True,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.view(
+            -1,
+            self.in_channels,
+            self.temporal_patch_size,
+            self.patch_size,
+            self.patch_size,
+        )
+        x = self.proj(x).view(-1, self.hidden_size)
+        return x
+
+
+class Glm4vPatchMerger(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = d_model
+        self.proj = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("proj", prefix),
+            gather_output=True,
+        )
+        self.post_projection_norm = nn.LayerNorm(self.hidden_size)
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_sizes=[context_dim] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            context_dim,
+            self.hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.extra_activation_func = nn.GELU()
+
+    def forward(self, x: torch.Tensor):
+        x, _ = self.proj(x)
+        x = self.extra_activation_func(self.post_projection_norm(x))
+        gate_up, _ = self.gate_up_proj(x)
+        gate, up = gate_up.chunk(2, dim=-1)
+        x = F.silu(gate) * up
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Glm4vVisionEmbeddings(nn.Module):
+    def __init__(self, config: Glm4vVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(
+        self, embeddings, lengths, image_shapes, h_coords, w_coords
+    ) -> torch.Tensor:
+        pos_embed_weight = self.position_embedding.weight
+        hidden_size = pos_embed_weight.shape[1]
+        total_seq = h_coords.shape[0]
+        device = pos_embed_weight.device
+
+        # Move coordinates to correct device
+        h_coords, w_coords = h_coords.to(device), w_coords.to(device)
+
+        # Handle empty sequence case
+        if total_seq == 0:
+            adapted_pos_embed = torch.empty(
+                0, hidden_size, device=device, dtype=pos_embed_weight.dtype
+            )
+        else:
+            # Convert inputs to tensors if needed
+            if isinstance(lengths, list):
+                lengths = torch.tensor(lengths, device=device, dtype=torch.long)
+            if not isinstance(image_shapes, torch.Tensor):
+                image_shapes = torch.tensor(
+                    image_shapes, device=device, dtype=torch.long
+                )
+
+            # Prepare 2D position embedding
+            orig_size_sq = pos_embed_weight.shape[0]
+            orig_size = int(orig_size_sq**0.5)
+            pos_embed_2d = (
+                pos_embed_weight.view(orig_size, orig_size, hidden_size)
+                .permute(2, 0, 1)
+                .unsqueeze(0)
+                .to(device=device, dtype=torch.float32)
+            )
+
+            # Calculate target dimensions for each patch
+            target_h = torch.cat(
+                [image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]
+            ).to(device=device, dtype=torch.float32)
+            target_w = torch.cat(
+                [image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]
+            ).to(device=device, dtype=torch.float32)
+
+            # Normalize coordinates to [-1, 1] range for grid_sample
+            h_coords = h_coords.to(device=device, dtype=torch.float32)
+            w_coords = w_coords.to(device=device, dtype=torch.float32)
+            norm_w = ((w_coords + 0.5) / target_w) * 2 - 1
+            norm_h = ((h_coords + 0.5) / target_h) * 2 - 1
+
+            # Create sampling grid
+            grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2)
+
+            # Perform bicubic interpolation
+            interpolated_embed_fp32 = F.grid_sample(
+                pos_embed_2d,
+                grid,
+                mode="bicubic",
+                align_corners=False,
+                padding_mode="border",
+            )
+
+            # Reshape and convert back to original dtype
+            adapted_pos_embed_fp32 = (
+                interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0)
+            )
+            adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(
+                embeddings.device
+            )
+
+        # Add adapted position encoding to embeddings
+        embeddings = embeddings + adapted_pos_embed
+        return embeddings
+
+
+class Glm4vVisionRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            seqlen *= 2
+            self._seq_len_cached = seqlen
+            self.inv_freq = 1.0 / (
+                self.theta
+                ** (
+                    torch.arange(
+                        0,
+                        self.dim,
+                        2,
+                        dtype=torch.float,
+                        device=self.inv_freq.device,
+                    )
+                    / self.dim
+                )
+            )
+            seq = torch.arange(
+                seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+            )
+            freqs = torch.outer(seq, self.inv_freq)
+            self._freqs_cached = freqs
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
+
+
+class Glm4vVisionModel(nn.Module):
+    def __init__(
+        self,
+        vision_config: Glm4vVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        patch_size = vision_config.patch_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        in_channels = vision_config.in_channels
+        depth = vision_config.depth
+        self.hidden_size = vision_config.hidden_size
+        self.num_heads = vision_config.num_heads
+
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.out_hidden_size = vision_config.out_hidden_size
+
+        self.patch_embed = Glm4vVisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_channels=in_channels,
+            hidden_size=self.hidden_size,
+        )
+
+        norm_layer = partial(Glm4vRMSNorm, eps=norm_eps)
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList(
+            [
+                Glm4vVisionBlock(
+                    config=vision_config,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{layer_idx}", prefix),
+                )
+                for layer_idx in range(depth)
+            ]
+        )
+
+        self.merger = Glm4vPatchMerger(
+            d_model=vision_config.out_hidden_size,
+            context_dim=vision_config.intermediate_size,
+            quant_config=quant_config,
+            bias=False,
+            prefix=add_prefix("merger", prefix),
+        )
+
+        self.embeddings = Glm4vVisionEmbeddings(vision_config)
+
+        self.post_conv_layernorm = Glm4vRMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
+        self.downsample = nn.Conv2d(
+            in_channels=vision_config.hidden_size,
+            out_channels=vision_config.out_hidden_size,
+            kernel_size=vision_config.spatial_merge_size,
+            stride=vision_config.spatial_merge_size,
+        )
+        self.post_layernorm = Glm4vRMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb, pos_ids
+
+    def forward(self, x: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+        x = self.post_conv_layernorm(x)
+
+        # compute position embedding
+        rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw)
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        x = self.embeddings(
+            x, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]
+        )
+
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        rotary_pos_emb_tuple = (emb.cos(), emb.sin())
+
+        # x.shape: (s, b, d) where b=1 for vision processing
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=rotary_pos_emb_tuple)
+
+        # adapter
+        x = self.post_layernorm(x)
+        x = x.view(-1, self.spatial_merge_size, self.spatial_merge_size, x.shape[-1])
+        x = x.permute(0, 3, 1, 2)
+        x = self.downsample(x).view(-1, self.out_hidden_size)
+        x = self.merger(x)
+
+        return x
+
+
+class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
+    def __init__(
+        self,
+        config: Glm4vConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+
+        self.config = config
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+        self.model = Glm4Model(
+            config,
+            quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+        self.visual = Glm4vVisionModel(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-5),
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = torch.cat(
+            [item.feature.squeeze(0) for item in items], dim=0
+        ).type(self.visual.dtype)
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        # For multi-image, pixel_values is [num_of_images, L, C] shape
+        # assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        split_sizes = (
+            image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2
+        ).tolist()
+        image_embeds = torch.split(image_embeds, split_sizes)
+        return torch.cat(image_embeds)
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values_videos = torch.cat(
+            [item.feature.squeeze(0) for item in items], dim=0
+        ).type(self.visual.dtype)
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+        # For multi-video, pixel_values_videos is [num_of_videos, L, C] shape
+        # assert pixel_values_videos.dim() == 2, pixel_values_videos.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+
+        # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
+        temp_frames_hw = []
+        for t, h, w in video_grid_thw:
+            repeated_row = (
+                torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
+            )
+            temp_frames_hw.append(repeated_row)
+        flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
+        video_embeds = self.visual(
+            pixel_values_videos, grid_thw=flattened_video_grid_thw
+        )
+        split_sizes = (
+            video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2
+        ).tolist()
+        video_embeds = torch.split(video_embeds, split_sizes)
+        return torch.cat(video_embeds)
+
+    def _update_hf_config(self):
+        """update hf config to ensure vision attention num_attention_heads is divisible by tp_size"""
+        tp_size = get_attention_tp_size()
+        num_heads = self.config.vision_config.num_heads
+        head_dim = self.config.vision_config.hidden_size // num_heads
+        num_dummy_heads = 0
+
+        if num_heads % tp_size != 0:
+            num_dummy_heads = (
+                (num_heads + tp_size - 1) // tp_size
+            ) * tp_size - num_heads
+
+        setattr(self.config.vision_config, "head_dim", head_dim)
+        setattr(self.config.vision_config, "num_dummy_heads", num_dummy_heads)
+
+    def _pad_vit_attn_dummy_heads(self, name: str, loaded_weight: torch.Tensor):
+        """pad attn qkv weights for dummy heads"""
+        num_dummy_heads = self.config.vision_config.num_dummy_heads
+        if num_dummy_heads == 0:
+            return loaded_weight
+        head_dim = self.config.vision_config.head_dim
+
+        if "attn.qkv_proj" in name:
+            wq, wk, wv = loaded_weight.chunk(3, dim=0)
+            if name.endswith(".weight"):
+                dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+            elif name.endswith(".bias"):
+                dummy_shape = [num_dummy_heads, head_dim]
+            else:
+                raise RuntimeError(f"Unsupported weight with name={name}")
+            pad_func = lambda x: torch.cat(
+                [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+            ).flatten(0, 1)
+            wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+            loaded_weight = torch.cat([wq, wk, wv], dim=0)
+        elif "attn.proj.weight" in name:
+            padded_weight = loaded_weight.new_zeros(
+                loaded_weight.shape[0], head_dim * num_dummy_heads
+            )
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+        elif "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+            padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+        return loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".up_proj", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "language_model." in name:
+                name = name.replace("language_model.", "")
+            if "model.visual." in name:
+                name = name.replace("model.visual.", "visual.")
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if "visual" in name:
+                    loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                        self.config, name, loaded_weight
+                    )
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [Glm4vForConditionalGeneration]
diff --git a/python/sglang/srt/models/glm4v_moe.py b/python/sglang/srt/models/glm4v_moe.py
new file mode 100644
index 000000000..86cca4ab2
--- /dev/null
+++ b/python/sglang/srt/models/glm4v_moe.py
@@ -0,0 +1,397 @@
+import logging
+from functools import lru_cache
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers.models.glm4v_moe.configuration_glm4v_moe import Glm4vMoeConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.glm4_moe import Glm4MoeModel
+from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
+from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
+
+_is_cuda = is_cuda()
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
+    def __init__(
+        self,
+        config: Glm4vMoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+
+        config.moe_layer_freq = 1
+        self.config = config
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
+        self.num_fused_shared_experts = (
+            0
+            if global_server_args_dict["disable_shared_experts_fusion"]
+            else config.n_shared_experts
+        )
+
+        self.model = Glm4MoeModel(
+            config,
+            quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.visual = Glm4vVisionModel(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-5),
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+        )
+
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+    def determine_num_fused_shared_experts(
+        self, architecture: str = "Glm4MoeForCausalLM"
+    ):
+        self.num_fused_shared_experts = 0
+        if global_server_args_dict["disable_shared_experts_fusion"]:
+            return
+
+        # Only Deepseek V3/R1 can use shared experts fusion optimization now.
+        disable_reason = None
+        if (
+            not _is_cuda
+            or torch.cuda.get_device_capability("cuda") < (8, 0)
+            or self.config.architectures[0] != architecture
+            or self.config.n_shared_experts != 1
+        ):
+            disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
+
+        if disable_reason is not None:
+            global_server_args_dict["disable_shared_experts_fusion"] = True
+            self.num_fused_shared_experts = 0
+            log_info_on_rank0(
+                logger,
+                f"{disable_reason} Shared experts fusion optimization is disabled.",
+            )
+            return
+
+        self.num_fused_shared_experts = self.config.n_shared_experts
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        if self.num_fused_shared_experts > 0:
+            assert self.num_fused_shared_experts == 1
+            weights_list = list(weights)
+            weights_dict = dict(weights_list)
+            if self.quant_config is not None:
+                if self.quant_config.get_name() == "w8a8_int8":
+                    suffix_list = [
+                        "down_proj.weight",
+                        "down_proj.weight_scale",
+                        "gate_proj.weight",
+                        "gate_proj.weight_scale",
+                        "up_proj.weight",
+                        "up_proj.weight_scale",
+                    ]
+                elif (
+                    self.quant_config.get_name() == "fp8"
+                    or self.quant_config.get_name() == "blockwise_int8"
+                    or self.quant_config.get_name() == "compressed_tensors"
+                ):
+                    suffix_list = [
+                        "down_proj.weight",
+                        "down_proj.weight_scale",
+                        "gate_proj.weight",
+                        "gate_proj.weight_scale",
+                        "up_proj.weight",
+                        "up_proj.weight_scale",
+                    ]
+                elif self.quant_config.get_name() == "awq":
+                    suffix_list = [
+                        "down_proj.qweight",
+                        "down_proj.qzeros",
+                        "down_proj.scales",
+                        "gate_proj.qweight",
+                        "gate_proj.qzeros",
+                        "gate_proj.scales",
+                        "up_proj.qweight",
+                        "up_proj.qzeros",
+                        "up_proj.scales",
+                    ]
+                elif self.quant_config.get_name() == "modelopt_fp4":
+                    suffix_list = [
+                        "down_proj.weight",
+                        "down_proj.weight_scale",
+                        "down_proj.weight_scale_2",
+                        "down_proj.input_scale",
+                        "gate_proj.weight",
+                        "gate_proj.weight_scale",
+                        "gate_proj.weight_scale_2",
+                        "gate_proj.input_scale",
+                        "up_proj.weight",
+                        "up_proj.weight_scale",
+                        "up_proj.weight_scale_2",
+                        "up_proj.input_scale",
+                    ]
+                else:
+                    raise ValueError(
+                        f"Unsupported shared expert fusion for quantization: {self.quant_config.get_name()}."
+                    )
+            else:
+                suffix_list = [
+                    "down_proj.weight",
+                    "gate_proj.weight",
+                    "up_proj.weight",
+                ]
+            names_to_remove = []
+
+            moe_layers = (
+                range(
+                    self.config.first_k_dense_replace,
+                    self.config.num_hidden_layers,
+                    self.config.moe_layer_freq,
+                )
+                if not is_nextn
+                else [nextn_layer_id]
+            )
+
+            for moe_layer in moe_layers:
+                for suffix in suffix_list:
+                    shared_expert_weight_name = (
+                        f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
+                    )
+                    # online fp8 quantization does not load weight_scale
+                    if shared_expert_weight_name not in weights_dict:
+                        continue
+                    weights_list.append(
+                        (
+                            f"model.layers.{moe_layer}."
+                            f"mlp.experts."
+                            f"{self.config.n_routed_experts + 0}"
+                            f".{suffix}",
+                            weights_dict[shared_expert_weight_name],
+                        )
+                    )
+                    names_to_remove += [shared_expert_weight_name]
+            weights = [w for w in weights_list if w[0] not in names_to_remove]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+        )
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
+        params_dict = dict(self.named_parameters())
+        weight_names = []
+        for name, loaded_weight in weights:
+            weight_names.append(name)
+
+            if not is_nextn:
+                if hasattr(self.config, "num_nextn_predict_layers"):
+                    num_nextn_layers = self.config.num_nextn_predict_layers
+                    if num_nextn_layers > 0 and name.startswith("model.layers"):
+                        name_list = name.split(".")
+                        if (
+                            len(name_list) >= 3
+                            and int(name_list[2]) >= self.config.num_hidden_layers
+                        ):
+                            continue
+            else:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            if "language_model." in name:
+                name = name.replace("language_model.", "")
+            if "model.visual." in name:
+                name = name.replace("model.visual.", "visual.")
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if "visual" in name:
+                        # adapt to VisionAttention
+                        name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if fuse_qkv_a_proj and (
+                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                    ):
+                        cached_a_proj[name] = loaded_weight
+                        q_a_proj_name = (
+                            name
+                            if "q_a_proj" in name
+                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                        )
+                        kv_a_proj_name = (
+                            name
+                            if "kv_a_proj_with_mqa" in name
+                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                        )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                        if (
+                            q_a_proj_name in cached_a_proj
+                            and kv_a_proj_name in cached_a_proj
+                        ):
+                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            fused_weight = torch.cat(
+                                [q_a_proj_weight, kv_a_proj_weight], dim=0
+                            )
+                            param_name = (
+                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
+                                if "q_a_proj" in name
+                                else name.replace(
+                                    "kv_a_proj_with_mqa", "fused_qkv_a_proj_with_mqa"
+                                )
+                            )
+                            param = params_dict[param_name]
+
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, fused_weight)
+                            cached_a_proj.pop(q_a_proj_name)
+                            cached_a_proj.pop(kv_a_proj_name)
+                    else:
+                        if (
+                            "k_scale" in name or "v_scale" in name
+                        ) and name not in params_dict:
+                            # modelopt attn kv scale is named differently
+                            if any(scale in name for scale in ["k_scale", "v_scale"]):
+                                name = name.replace("_proj", "attn_mqa")
+                            else:
+                                logger.warning(
+                                    f"Unknown scale found in checkpoint: {name}"
+                                )
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        if "visual" in name:
+                            loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                                self.config, name, loaded_weight
+                            )
+                        weight_loader(param, loaded_weight)
+
+
+EntryClass = [Glm4vMoeForConditionalGeneration]
diff --git a/python/sglang/srt/models/gpt2.py b/python/sglang/srt/models/gpt2.py
new file mode 100644
index 000000000..1ec33406f
--- /dev/null
+++ b/python/sglang/srt/models/gpt2.py
@@ -0,0 +1,288 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
+# Copyright 2023 The vLLM team.
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GPT-2 model compatible with HuggingFace weights."""
+from typing import Iterable, Optional, Tuple, Type
+
+import torch
+from torch import nn
+from transformers import GPT2Config
+
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import NewGELU
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class GPT2Attention(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPT2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.c_attn = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_attn", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            scaling=self.scale,
+            num_kv_heads=total_num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        attn_output = self.attn(q, k, v, forward_batch)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPT2MLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPT2Config,
+        act_layer: Type[nn.Module] = NewGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_fc", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.act = act_layer()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPT2Block(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPT2Config,
+        act_layer: Type[nn.Module] = NewGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPT2Attention(
+            layer_id, config, quant_config, prefix=add_prefix("attn", prefix)
+        )
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPT2MLP(
+            inner_dim,
+            config,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPT2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        assert not config.add_cross_attention
+        assert not config.scale_attn_by_inverse_layer_idx
+        assert not config.reorder_and_upcast_attn
+        self.embed_dim = config.hidden_size
+        self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList(
+            [
+                GPT2Block(
+                    i,
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"h.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(hidden_states, forward_batch)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPT2LMHeadModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.transformer = GPT2Model(
+            config, quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = self.transformer.wte
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "lm_head.weight" in name:
+                # GPT-2 ties the weights of the embedding layer and the final
+                # linear layer.
+                continue
+            if ".attn.bias" in name or ".attn.masked_bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+            if not name.startswith("transformer."):
+                name = "transformer." + name
+
+            param = params_dict[name]
+            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+            # Because of this, we need to transpose the weights.
+            # Note(zhuohan): the logic below might break quantized models.
+            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+                if conv1d_weight_name not in name:
+                    continue
+                if not name.endswith(".weight"):
+                    continue
+                loaded_weight = loaded_weight.t()
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = GPT2LMHeadModel
diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py
new file mode 100644
index 000000000..f49a5d304
--- /dev/null
+++ b/python/sglang/srt/models/gpt_bigcode.py
@@ -0,0 +1,304 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/07eb6f19f3b0ee9f7adf6eb689607028aa40bfd5/vllm/model_executor/models/gpt_bigcode.py
+"""Inference-only GPTBigCode model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPTBigCodeConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class GPTBigCodeAttention(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        self.tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert total_num_heads % self.tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // self.tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.multi_query = config.multi_query
+        if self.multi_query:
+            total_num_kv_heads = 1
+            self.num_kv_heads = 1
+        else:
+            total_num_kv_heads = total_num_heads
+            self.num_kv_heads = self.num_heads
+        self.kv_dim = self.head_dim * self.num_kv_heads
+        self.c_attn = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            total_num_heads,
+            total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_attn", prefix),
+        )
+
+        self.c_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            scaling=self.scale,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.split(
+            [
+                self.hidden_size // self.tensor_model_parallel_world_size,
+                self.kv_dim,
+                self.kv_dim,
+            ],
+            dim=-1,
+        )
+        attn_output = self.attn(q, k, v, forward_batch)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPTBigMLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_fc", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.act = get_act_fn(
+            config.activation_function, quant_config, intermediate_size
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeBlock(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPTBigCodeAttention(
+            layer_id, config, quant_config, prefix=add_prefix("attn", prefix)
+        )
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPTBigMLP(
+            inner_dim, config, quant_config, prefix=add_prefix("mlp", prefix)
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states, forward_batch=forward_batch
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPTBigCodeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        assert not config.add_cross_attention
+
+        self.embed_dim = config.hidden_size
+        lora_vocab = 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.wte = VocabParallelEmbedding(
+            self.vocab_size,
+            self.embed_dim,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("wte", prefix),
+        )
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList(
+            [
+                GPTBigCodeBlock(
+                    i, config, quant_config, prefix=add_prefix(f"h.{i}", prefix)
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(hidden_states, forward_batch)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeForCausalLM(nn.Module):
+    packed_modules_mapping = {"c_attn": ["c_attn"]}
+
+    supported_lora_modules = ["c_fc", "c_proj", "wte", "c_attn"]
+
+    embedding_modules = {
+        "wte": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.quant_config = quant_config
+        self.transformer = GPTBigCodeModel(
+            config, quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = self.transformer.wte
+        self.unpadded_vocab_size = config.vocab_size
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "lm_head.weight" in name:
+                continue
+            if ".attn.bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method
+            if "c_attn.input_scale" in name or "c_attn.weight_scale" in name:
+                weight_loader(param, loaded_weight, "q")
+                weight_loader(param, loaded_weight, "k")
+                weight_loader(param, loaded_weight, "v")
+            else:
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = GPTBigCodeForCausalLM
diff --git a/python/sglang/srt/models/gpt_oss.py b/python/sglang/srt/models/gpt_oss.py
new file mode 100644
index 000000000..64efff14b
--- /dev/null
+++ b/python/sglang/srt/models/gpt_oss.py
@@ -0,0 +1,1204 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+"""Inference-only GptOss model compatible with HuggingFace weights."""
+
+import logging
+import math
+from collections.abc import Iterable
+from functools import partial
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_rank,
+    get_moe_expert_parallel_world_size,
+    get_moe_tensor_parallel_rank,
+    get_moe_tensor_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_utils import dequant_mxfp4
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import (
+    LazyValue,
+    add_prefix,
+    is_cuda,
+    is_flashinfer_available,
+    is_sm100_supported,
+    make_layers,
+)
+
+_is_cuda = is_cuda()
+_is_flashinfer_available = is_flashinfer_available()
+_is_sm100_supported = is_cuda() and is_sm100_supported()
+
+
+if _is_cuda:
+    from sgl_kernel import FusedSetKVBufferArg
+
+
+class GptOssConfig(PretrainedConfig):
+    model_type = "gpt_oss"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+logger = logging.getLogger(__name__)
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+class GptOssSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: GptOssConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        self.activation = config.hidden_act
+        self.gemm1_alpha = getattr(config, "hidden_act_alpha", 1.702)
+        self.gemm1_clamp_limit = config.swiglu_limit
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=True,
+        )
+
+        self.top_k = config.num_experts_per_tok
+        experts_type = get_moe_impl_class()
+        extra_kwargs = {}
+        if experts_type.__name__ == "FusedMoE":
+            quant_config_name = (
+                quant_config.get_name() if quant_config is not None else None
+            )
+            extra_kwargs = {
+                # for moe gate_up_proj and down_proj and their bias loading
+                "use_weight_loader_fused": quant_config_name
+                != "mxfp4"
+            }
+        self.experts = experts_type(
+            num_experts=config.num_local_experts
+            + global_server_args_dict["ep_num_redundant_experts"],
+            top_k=config.num_experts_per_tok,
+            layer_id=layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            quant_config=quant_config,
+            activation=self.activation,
+            gemm1_alpha=self.gemm1_alpha,
+            gemm1_clamp_limit=self.gemm1_clamp_limit,
+            with_bias=True,
+            prefix=add_prefix("experts", prefix),
+            **extra_kwargs,
+        )
+
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=True,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+            params_dtype=config.torch_dtype,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+    ) -> torch.Tensor:
+        if not get_moe_a2a_backend().is_deepep():
+            return self.forward_normal(hidden_states, should_allreduce_fusion)
+        else:
+            raise Exception("forward_deepep branch not implemented yet")
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+
+        router_logits, _ = self.router(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+
+        if self.tp_size > 1 and not should_allreduce_fusion:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        ans = final_hidden_states.view(num_tokens, hidden_dim)
+        return ans
+
+
+def _enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
+    """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
+    return _is_cuda and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+
+
+# TODO maybe move to a model-common utils
+def _create_fused_set_kv_buffer_arg(
+    value: torch.Tensor,
+    layer: RadixAttention,
+    forward_batch: ForwardBatch,
+):
+    layer_id = layer.layer_id
+    token_to_kv_pool = forward_batch.token_to_kv_pool
+
+    k_buffer = token_to_kv_pool.get_key_buffer(layer_id)
+    v_buffer = token_to_kv_pool.get_value_buffer(layer_id)
+
+    return FusedSetKVBufferArg(
+        value=value,
+        k_buffer=k_buffer.view(k_buffer.shape[0], -1),
+        v_buffer=v_buffer.view(v_buffer.shape[0], -1),
+        k_scale=layer.k_scale,
+        v_scale=layer.v_scale,
+        cache_loc=forward_batch.out_cache_loc,
+    )
+
+
+class GptOssAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-06,
+        attention_bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        sliding_window_size: int = -1,  # if -1, normal attention, else, window attention.
+        layer_type: str = "",
+        params_dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.sliding_window_size = sliding_window_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        # Choose dtype of sinks based on attention backend: trtllm_mha requires float32,
+        # others can use bfloat16
+        attn_backend = global_server_args_dict.get("attention_backend")
+        sinks_dtype = torch.float32 if attn_backend == "trtllm_mha" else torch.bfloat16
+        self.sinks = nn.Parameter(
+            torch.empty(self.num_heads, dtype=sinks_dtype), requires_grad=False
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            params_dtype=params_dtype,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+
+        assert layer_type in {"sliding_attention", "full_attention"}
+        use_sliding_window = layer_type == "sliding_attention"
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+            sliding_window_size=(sliding_window_size if use_sliding_window else -1),
+        )
+        self.layer_id = layer_id
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q, k = self.rotary_emb(
+            positions,
+            q,
+            k,
+            fused_set_kv_buffer_arg=(
+                _create_fused_set_kv_buffer_arg(
+                    value=v,
+                    layer=self.attn,
+                    forward_batch=forward_batch,
+                )
+                if _enable_fused_set_kv_buffer(forward_batch)
+                else None
+            ),
+        )
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+        attn_output = self.attn(
+            *inner_state,
+            sinks=self.sinks,
+            save_kv_cache=not _enable_fused_set_kv_buffer(forward_batch),
+        )
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
+
+class GptOssDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: GptOssConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        sliding_window_size: int | None = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        rms_norm_eps = config.rms_norm_eps
+        attention_bias = config.attention_bias
+
+        if sliding_window_size is None:
+            self.sliding_window_size = get_attention_sliding_window_size(self.config)
+        else:
+            self.sliding_window_size = sliding_window_size
+
+        self.self_attn = GptOssAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            attention_bias=attention_bias,
+            prefix=add_prefix("self_attn", prefix),
+            sliding_window_size=self.sliding_window_size,
+            layer_type=config.layer_types[layer_id],
+            params_dtype=config.torch_dtype,
+        )
+
+        self.layer_id = layer_id
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        # GptOss all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        self.is_nextn = False
+        is_previous_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = GptOssSparseMoeBlock(
+                layer_id=self.layer_id,
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            raise NotImplementedError(
+                "Dense MLP is not implemented for GptOssDecoderLayer. "
+                "Please use GptOssSparseMoeBlock instead."
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            is_last_layer=(
+                self.is_nextn or (self.layer_id == self.config.num_hidden_layers - 1)
+            ),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        hidden_states = self.mlp(hidden_states, forward_batch, should_allreduce_fusion)
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+
+        if not should_allreduce_fusion:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+
+class GptOssModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = GptOssDecoderLayer,
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                enable_tp=not is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to GptOssDecoderLayer
+        decoder_layer_type = decoder_layer_type or GptOssDecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                if i in self.layers_to_capture:
+                    aux_hidden_states.append(hidden_states + residual)
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class GptOssForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: GptOssConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = GptOssModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            # quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                for layer_id in range(self.start_layer, self.end_layer)
+                if isinstance(self.model.layers[layer_id].mlp, GptOssSparseMoeBlock)
+            }
+        )
+
+    @property
+    def routed_experts_weights_of_layer(self):
+        return self._routed_experts_weights_of_layer.value
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids,
+                hidden_states,
+                self.lm_head,
+                forward_batch,
+                aux_hidden_states,
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def _get_default_weight_mapping(self):
+        """Generate default weight name mapping for GptOss safetensors."""
+        weight_mapping = {}
+
+        # Map router weights to gate
+        weight_mapping["embedding.weight"] = "model.embed_tokens.weight"
+        weight_mapping["unembedding.weight"] = "lm_head.weight"
+        weight_mapping["norm.scale"] = "model.norm.weight"
+        for layer_id in range(self.config.num_hidden_layers):
+            weight_mapping[f"block.{layer_id}.attn.q_proj.weight"] = (
+                f"model.layers.{layer_id}.self_attn.q_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.q_proj.bias"] = (
+                f"model.layers.{layer_id}.self_attn.q_proj.bias"
+            )
+
+            weight_mapping[f"block.{layer_id}.attn.k_proj.weight"] = (
+                f"model.layers.{layer_id}.self_attn.k_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.k_proj.bias"] = (
+                f"model.layers.{layer_id}.self_attn.k_proj.bias"
+            )
+
+            weight_mapping[f"block.{layer_id}.attn.v_proj.weight"] = (
+                f"model.layers.{layer_id}.self_attn.v_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.v_proj.bias"] = (
+                f"model.layers.{layer_id}.self_attn.v_proj.bias"
+            )
+
+            weight_mapping[f"block.{layer_id}.attn.out.weight"] = (
+                f"model.layers.{layer_id}.self_attn.o_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.out.bias"] = (
+                f"model.layers.{layer_id}.self_attn.o_proj.bias"
+            )
+            weight_mapping[f"block.{layer_id}.attn.sinks"] = (
+                f"model.layers.{layer_id}.self_attn.sinks"
+            )
+            weight_mapping[f"block.{layer_id}.attn.norm.scale"] = (
+                f"model.layers.{layer_id}.input_layernorm.weight"
+            )
+
+            weight_mapping[f"block.{layer_id}.mlp.gate.weight"] = (
+                f"model.layers.{layer_id}.mlp.router.weight"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.gate.bias"] = (
+                f"model.layers.{layer_id}.mlp.router.bias"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.norm.scale"] = (
+                f"model.layers.{layer_id}.post_attention_layernorm.weight"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.experts.gate_up_proj"] = (
+                f"model.layers.{layer_id}.mlp.experts.gate_up_proj"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.gate_up_proj_bias"] = (
+                f"model.layers.{layer_id}.mlp.experts.gate_up_proj_bias"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.down_proj"] = (
+                f"model.layers.{layer_id}.mlp.experts.mlp2_weight"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.down_proj_bias"] = (
+                f"model.layers.{layer_id}.mlp.experts.mlp2_bias"
+            )
+
+        return weight_mapping
+
+    # TODO beautify code
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+        is_nextn: bool = False,
+        weight_name_mapping: dict = None,
+    ):
+        quant_config_name = (
+            self.quant_config.get_name() if self.quant_config is not None else None
+        )
+        if quant_config_name != "mxfp4":
+            self._load_normal_weights(
+                weights, is_nextn=is_nextn, weight_name_mapping=weight_name_mapping
+            )
+        else:
+            self._load_weights_mxfp4(
+                weights, is_nextn=is_nextn, weight_name_mapping=weight_name_mapping
+            )
+
+    def _load_weights_mxfp4(self, weights, is_nextn, weight_name_mapping):
+        mxfp4_weights = []
+        normal_weights = []
+
+        for name, weight in weights:
+            if (
+                ".experts" in name
+                and self.quant_config is not None
+                and self.quant_config.get_name() == "mxfp4"
+            ):
+                mxfp4_weights.append((name, weight))
+            else:
+                normal_weights.append((name, weight))
+
+        mxfp4_loaded_params = self._load_mxfp4_experts_weights(mxfp4_weights)
+        self._load_normal_weights(
+            normal_weights,
+            is_nextn=is_nextn,
+            weight_name_mapping=weight_name_mapping,
+            other_loaded_param_names=mxfp4_loaded_params,
+        )
+
+    def _load_mxfp4_experts_weights(self, weights):
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        mxfp4_block = 32
+
+        moe_tp_rank = get_moe_tensor_parallel_rank()
+        moe_tp_size = get_moe_tensor_parallel_world_size()
+        moe_ep_rank = get_moe_expert_parallel_rank()
+        moe_ep_size = get_moe_expert_parallel_world_size()
+
+        intermediate_size = self.config.intermediate_size
+        assert (
+            intermediate_size % mxfp4_block == 0
+        ), f"{intermediate_size=} must be divisible by {mxfp4_block=}"
+        intermediate_size_block = intermediate_size // mxfp4_block
+
+        per_rank_intermediate_size_block = math.ceil(
+            intermediate_size_block / moe_tp_size
+        )
+
+        per_rank_intermediate_size = per_rank_intermediate_size_block * mxfp4_block
+
+        # Calculate common slicing bounds for current rank
+        assert self.config.num_local_experts % moe_ep_size == 0
+        moe_num_global_experts = self.config.num_local_experts
+        moe_num_local_experts = self.config.num_local_experts // moe_ep_size
+
+        moe_tp_rank_start = moe_tp_rank * per_rank_intermediate_size
+        moe_tp_rank_end = min(
+            (moe_tp_rank + 1) * per_rank_intermediate_size, intermediate_size
+        )
+
+        moe_ep_rank_start = moe_ep_rank * moe_num_local_experts
+        moe_ep_rank_end = (moe_ep_rank + 1) * moe_num_local_experts
+
+        for name, weight in weights:
+            weight = weight.cuda()
+
+            if "gate_up_proj_blocks" in name:
+                # Handle MLP gate and up projection weights
+                new_name = name.replace("gate_up_proj_blocks", "w13_weight")
+
+                # flat weight from (E, 2 * N, block_size, entry_per_block)
+                # to (E, 2 * N, -1), shouldn't trigger copy for contiguous
+                weight = weight.view(
+                    moe_num_global_experts, 2 * intermediate_size, -1
+                ).contiguous()
+
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    2 * moe_tp_rank_start : 2 * moe_tp_rank_end,
+                    ...,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "down_proj_blocks" in name:
+                # Handle MLP down projection weights
+                new_name = name.replace("down_proj_blocks", "w2_weight")
+                # same flatten here, but since 2 mx4 value are packed in 1
+                # uint8, divide by 2
+                weight = weight.view(
+                    moe_num_global_experts, -1, intermediate_size // 2
+                ).contiguous()
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    ...,
+                    moe_tp_rank_start // 2 : moe_tp_rank_end // 2,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "gate_up_proj_scales" in name:
+                # Handle MLP gate and up projection weights scale
+                new_name = name.replace("gate_up_proj_scales", "w13_weight_scale")
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    2 * moe_tp_rank_start : 2 * moe_tp_rank_end,
+                    ...,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "down_proj_scales" in name:
+                # Handle MLP down projection weights
+                new_name = name.replace("down_proj_scales", "w2_weight_scale")
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    ...,
+                    moe_tp_rank_start // mxfp4_block : moe_tp_rank_end // mxfp4_block,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+            elif "gate_up_proj_bias" in name:
+                # Handle MLP gate and up projection biases
+                new_name = name.replace("gate_up_proj_bias", "w13_weight_bias")
+
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    2 * moe_tp_rank_start : 2 * moe_tp_rank_end,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "down_proj_bias" in name:
+                narrow_weight = weight[moe_ep_rank_start:moe_ep_rank_end, ...]
+                if moe_tp_rank != 0:
+                    narrow_weight = torch.zeros_like(narrow_weight)
+
+                # Handle MLP down projection bias
+                new_name = name.replace("down_proj_bias", "w2_weight_bias")
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+        return loaded_params
+
+    def _load_normal_weights(
+        self,
+        weights,
+        is_nextn: bool,
+        weight_name_mapping: dict,
+        other_loaded_param_names=[],
+    ):
+        tp_rank = get_tensor_model_parallel_rank()
+        if is_nextn:
+            logging.warning(
+                "Loading weights for nextn is currently not supported in GptOssForCausalLM. "
+            )
+            return
+        weights = _canonicalize_weights(self.config, weights)
+        weights = sorted(weights, key=lambda x: x[0])  # Sort by name for consistency
+
+        new_weights = []
+        for name, p in weights:
+            if "qkv.weight" in name:
+                q_proj, k_proj, v_proj = p.split(
+                    [
+                        self.config.num_attention_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                    ],
+                    dim=0,
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.weight', 'q_proj.weight')}", q_proj)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.weight', 'k_proj.weight')}", k_proj)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.weight', 'v_proj.weight')}", v_proj)
+                )
+            elif "qkv.bias" in name:
+                q_bias, k_bias, v_bias = p.split(
+                    [
+                        self.config.num_attention_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                    ],
+                    dim=0,
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.bias', 'q_proj.bias')}", q_bias)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.bias', 'k_proj.bias')}", k_bias)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.bias', 'v_proj.bias')}", v_bias)
+                )
+            else:
+                new_weights.append((name, p))
+        weights = new_weights
+
+        # Use provided weight name mapping if available, otherwise use default
+        if weight_name_mapping is None:
+            weight_name_mapping = self._get_default_weight_mapping()
+        else:
+            # Merge with default mapping
+            default_mapping = self._get_default_weight_mapping()
+            default_mapping.update(weight_name_mapping)
+            weight_name_mapping = default_mapping
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        expert_params_mapping = FusedMoE.make_expert_params_mapping_fused(
+            ckpt_gate_up_proj_name="gate_up_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_gate_up_proj_bias_name="gate_up_proj_bias",
+            ckpt_down_proj_bias_name="down_proj_bias",
+        )
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            loaded_weight = _WeightCreator.maybe_materialize(loaded_weight)
+
+            # Apply weight name mapping if provided
+            if weight_name_mapping and name in weight_name_mapping:
+                name = weight_name_mapping[name]
+
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    if "bias" not in name:
+                        loaded_weight = loaded_weight.transpose(-2, -1)
+                    if "w2_weight_bias" in name and get_moe_tensor_parallel_rank() != 0:
+                        loaded_weight = loaded_weight.zero_()
+
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                    )
+                    break
+                else:
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        if "sinks" in name:
+                            start = get_attention_tp_rank() * param.numel()
+                            param.data.copy_(
+                                loaded_weight[start : start + param.numel()]
+                            )
+                        else:
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            # we plus 1 here because in sglang, for the ith layer, it takes the output
+            # of the (i-1)th layer as aux hidden state
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_local_experts,
+            num_groups=None,
+        )
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+
+def _canonicalize_weights(config, weights_in: Iterable[Tuple[str, torch.Tensor]]):
+    weights_out_dict = dict(weights_in)
+
+    for layer_id in range(config.num_hidden_layers):
+        for name_chunk in ["mlp1_weight", "mlp2_weight"]:
+            name_prefix = f"block.{layer_id}.mlp.{name_chunk}"
+            w_blocks = weights_out_dict.pop(f"{name_prefix}.blocks", None)
+            w_scales = weights_out_dict.pop(f"{name_prefix}.scales", None)
+            if w_blocks is not None:
+                weights_out_dict[name_prefix] = _WeightCreator(
+                    partial(
+                        _dequant_mlp_weight,
+                        debug_name=name_prefix,
+                        w_blocks=w_blocks,
+                        w_scales=w_scales,
+                    )
+                )
+
+    return list(weights_out_dict.items())
+
+
+def _dequant_mlp_weight(debug_name, w_blocks, w_scales):
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(f"Dequantize {debug_name} start")
+
+    original_device = w_blocks.device
+
+    w_blocks = w_blocks.cuda()
+    w_scales = w_scales.cuda()
+
+    w_bf16 = dequant_mxfp4(w_block=w_blocks, w_scale=w_scales, out_dtype=torch.bfloat16)
+    w_bf16 = w_bf16.transpose(-2, -1).contiguous()
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(
+            f"Dequantize {debug_name} end {w_blocks.shape=} {w_scales.shape=} {w_bf16.shape=}"
+        )
+
+    return w_bf16.to(original_device)
+
+
+class _WeightCreator:
+    def __init__(self, fn):
+        self._fn = fn
+
+    @staticmethod
+    def maybe_materialize(obj):
+        if isinstance(obj, _WeightCreator):
+            output = obj._fn()
+            obj._fn = None
+            return output
+
+        return obj
+
+
+EntryClass = GptOssForCausalLM
diff --git a/python/sglang/srt/models/granite.py b/python/sglang/srt/models/granite.py
new file mode 100644
index 000000000..19252dc8d
--- /dev/null
+++ b/python/sglang/srt/models/granite.py
@@ -0,0 +1,505 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""Inference-only Granite model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GraniteConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class GraniteMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class GraniteAttention(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = config.attention_multiplier
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GraniteDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.residual_multiplier = config.residual_multiplier
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = GraniteAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = GraniteMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = (
+            self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+            * self.residual_multiplier
+        )  # multiplier for Maximal Update Parameterization
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states) * self.residual_multiplier
+        return hidden_states, residual
+
+
+class GraniteModel(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.layers = nn.ModuleList(
+            [
+                GraniteDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        hidden_states *= self.config.embedding_multiplier
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class GraniteForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = GraniteModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # If tie_word_embeddings == True, then input and output embeddings are
+        # the same tensor. Enforce during object creation so that weights will
+        # load correctly even if the LM head weights don't have a separate entry
+        # in the state dict.
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.tie_weights(self.model.embed_tokens)
+
+        # Granite logit scaling factors are applied via division, but
+        # LogitsProcessor expects a multiplicative factor.
+        if hasattr(config, "logits_scaling"):
+            logit_scale = 1.0 / config.logits_scaling
+        else:
+            logit_scale = None
+        self.logits_processor = LogitsProcessor(config, logit_scale=logit_scale)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        if not get_embedding:
+            logits_processor_output: LogitsProcessorOutput = self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+            return logits_processor_output
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def get_module_name_from_weight_name(self, name):
+        for param_name, weight_name, shard_id, num_shard in self.stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if "lm_head.weight" in name and self.config.tie_word_embeddings:
+                # Input and output embeddings are tied, so the output embeddings
+                # may not be present in the checkpoint. We assume that the input
+                # embeddings are always present in the checkpoint.
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # This block only runs if the preceding for loop doesn't find
+                # a match for `name` in `stacked_params_mapping`.
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading kv_scale from ckpts towards new design.
+                if name.endswith(".kv_scale") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100, tp_size: int = 1
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        try:
+            if name == "lm_head.weight" and self.config.tie_word_embeddings:
+                logger.info(
+                    "word embedding is tied for this model, return embed_tokens.weight as lm_head.weight."
+                )
+                return (
+                    self.model.embed_tokens.weight.cpu()
+                    .to(torch.float32)
+                    .numpy()
+                    .tolist()[:truncate_size]
+                )
+
+            mapped_name = name
+            mapped_shard_id = None
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name in name:
+                    mapped_name = name.replace(weight_name, param_name)
+                    mapped_shard_id = shard_id
+                    break
+            params_dict = dict(self.named_parameters())
+            param = params_dict[mapped_name]
+            if mapped_shard_id is not None:
+                if mapped_shard_id in ["q", "k", "v"]:
+                    num_heads = self.config.num_attention_heads // tp_size
+                    num_kv_heads = self.config.num_key_value_heads // tp_size
+                    head_dim = (
+                        self.config.hidden_size // self.config.num_attention_heads
+                    )
+                    if mapped_shard_id == "q":
+                        offset = 0
+                        size = num_heads * head_dim
+                    elif mapped_shard_id == "k":
+                        offset = num_heads * head_dim
+                        size = num_kv_heads * head_dim
+                    elif mapped_shard_id == "v":
+                        offset = (num_heads + num_kv_heads) * head_dim
+                        size = num_kv_heads * head_dim
+                    weight = param.data.narrow(0, offset, size)
+                elif mapped_shard_id in [0, 1]:
+                    intermediate_size = self.config.intermediate_size
+                    slice_size = intermediate_size // tp_size
+                    if mapped_shard_id == 0:  # gate_proj
+                        offset = 0
+                        size = slice_size
+                    elif mapped_shard_id == 1:  # up_proj
+                        offset = slice_size
+                        size = slice_size
+
+                    weight = param.data.narrow(0, offset, size)
+                else:
+                    weight = param.data
+            else:
+                weight = param.data
+            if tp_size > 1 and ("o_proj" in name or "down_proj" in name):
+                gathered_weights = [torch.zeros_like(weight) for _ in range(tp_size)]
+                torch.distributed.all_gather(gathered_weights, weight)
+                weight = torch.cat(gathered_weights, dim=1)
+            return weight.cpu().to(torch.float32).numpy().tolist()[:truncate_size]
+
+        except Exception:
+            logger.error(
+                f"Error getting weights by name {name} in GraniteForCausalLM: {get_exception_traceback()}"
+            )
+            return None
+
+
+EntryClass = [GraniteForCausalLM]
diff --git a/python/sglang/srt/models/granitemoe.py b/python/sglang/srt/models/granitemoe.py
new file mode 100644
index 000000000..d65b9ec06
--- /dev/null
+++ b/python/sglang/srt/models/granitemoe.py
@@ -0,0 +1,387 @@
+"""Inference-only GraniteMoe model."""
+
+from typing import Iterable, Optional
+
+import torch
+from torch import nn
+from transformers import GraniteConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models import mixtral
+from sglang.srt.utils import add_prefix
+
+
+class GraniteMoeMoE(nn.Module):
+    """A tensor-parallel MoE implementation for GraniteMoe that shards each
+    expert across all ranks.
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=True,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            layer_id=layer_id,
+            params_dtype=params_dtype,
+            reduce_results=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        return final_hidden_states.view(orig_shape)
+
+
+class GraniteMoeAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position: int = 4096 * 32,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        attention_multiplier: Optional[float] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = (
+            attention_multiplier
+            if attention_multiplier is not None
+            else self.head_dim**-1
+        )
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GraniteMoeDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: GraniteConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = GraniteMoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            attention_multiplier=config.attention_multiplier,
+        )
+        self.block_sparse_moe = GraniteMoeMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.block_sparse_moe",
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.residual_multiplier = config.residual_multiplier
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states * self.residual_multiplier
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.block_sparse_moe(hidden_states)
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        return hidden_states
+
+
+class GraniteMoeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+        self.embedding_multiplier = config.embedding_multiplier
+
+        self.layers = nn.ModuleList(
+            [
+                GraniteMoeDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        hidden_states *= self.embedding_multiplier
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class GraniteMoeForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = GraniteMoeModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        # Granite logit scaling factors are applied via division, but
+        # LogitsProcessor expects a multiplicative factor.
+        if hasattr(config, "logits_scaling"):
+            logit_scale = 1.0 / config.logits_scaling
+        else:
+            logit_scale = None
+        self.logits_processor = LogitsProcessor(config, logit_scale=logit_scale)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        if not get_embedding:
+            logits_processor_output: LogitsProcessorOutput = self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+            return logits_processor_output
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        new_weights = {}
+        for n, p in weights:
+            if n.endswith(".block_sparse_moe.input_linear.weight"):
+                for e in range(p.size(0)):
+                    w1_name = n.replace(
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w1.weight",
+                    )
+                    w3_name = n.replace(
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w3.weight",
+                    )
+                    w1_param, w3_param = p[e].chunk(2, dim=0)
+                    assert w1_name not in new_weights
+                    assert w3_name not in new_weights
+                    new_weights[w1_name] = w1_param
+                    new_weights[w3_name] = w3_param
+            elif n.endswith(".block_sparse_moe.output_linear.weight"):
+                for e in range(p.size(0)):
+                    w2_name = n.replace(
+                        ".block_sparse_moe.output_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w2.weight",
+                    )
+                    w2_param = p[e]
+                    assert w2_name not in new_weights
+                    new_weights[w2_name] = w2_param
+            elif n.endswith(".block_sparse_moe.router.layer.weight"):
+                gate_name = n.replace(
+                    ".block_sparse_moe.router.layer.weight",
+                    ".block_sparse_moe.gate.weight",
+                )
+                assert gate_name not in new_weights
+                new_weights[gate_name] = p
+            else:
+                new_weights[n] = p
+        mixtral.MixtralForCausalLM.load_weights(self, new_weights.items())
+
+
+EntryClass = [GraniteMoeForCausalLM]
diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py
new file mode 100644
index 000000000..a35420993
--- /dev/null
+++ b/python/sglang/srt/models/grok.py
@@ -0,0 +1,1132 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1
+"""Inference-only Grok1 model."""
+import functools
+import logging
+import math
+import os
+import warnings
+from typing import Iterable, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.elementwise import (
+    experts_combine_triton,
+    fused_dual_residual_rmsnorm,
+    fused_rmsnorm,
+    gelu_and_mul_triton,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.router import fused_moe_router_shim
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import (
+    RotaryEmbedding,
+    _yarn_find_correction_range,
+    _yarn_get_mscale,
+    get_rope,
+)
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.loader import DefaultModelLoader
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, dispose_tensor, dump_to_file
+
+logger = logging.getLogger(__name__)
+
+
+# Dump tensors for debugging
+debug_tensor_dump_output_folder = None
+debug_tensor_dump_prefill_only = False
+# Skip all the other tensor dumps, only dump the target logits
+debug_tensor_dump_only_target_logprobs = False
+debug_tensor_dump_inject = False
+debug_tensor_dump_layers = None
+debug_tensor_dump_test = False
+
+
+class Grok1MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results=True,
+        use_presharded_weights: bool = False,
+        split_gate_up: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.act_fn = GeluAndMul(approximate="tanh")
+        self.layer_id = layer_id
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x, _ = gelu_and_mul_triton(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Grok1MoE(nn.Module):
+    """A tensor-parallel MoE implementation for Grok1 that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        reduce_results: bool = True,
+        use_presharded_weights: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        # Gate always runs at full precision for stability (see https://arxiv.org/pdf/2101.03961)
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=torch.float32,
+            quant_config=None,
+        )
+
+        self.router_logit_softcapping = getattr(
+            config, "router_logit_softcapping", 30.0
+        )
+        custom_routing_function = functools.partial(
+            fused_moe_router_shim, self.router_logit_softcapping
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=False,
+            custom_routing_function=custom_routing_function,
+        )
+
+        kwargs = {}
+        if get_moe_expert_parallel_world_size() > 1:
+            MoEImpl = EPMoE
+        else:
+            MoEImpl = FusedMoE
+            kwargs["reduce_results"] = reduce_results
+            kwargs["use_presharded_weights"] = use_presharded_weights
+            kwargs["inplace"] = inplace
+            kwargs["no_combine"] = no_combine
+
+        self.experts = MoEImpl(
+            num_experts=num_experts,
+            top_k=top_k,
+            layer_id=layer_id,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            activation="gelu",
+            **kwargs,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # need to assert self.gate.quant_method is unquantized
+        topk_output = self.topk(hidden_states, self.gate.weight)
+        return self.experts(hidden_states, topk_output)
+
+
+def _yarn_linear_ramp_mask(
+    low: float, high: float, dim: int, dtype: torch.dtype
+) -> torch.Tensor:
+    if low == high:
+        low -= 0.001  # Prevent singularity
+
+    linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+
+
+def get_rope_scaling(config):
+    rope_type = getattr(config, "rope_type", None)
+    if rope_type:
+        original_max_position_embeddings = getattr(
+            config, "original_max_position_embeddings", None
+        )
+        scaling_factor = getattr(config, "scaling_factor", None)
+        extrapolation_factor = getattr(config, "extrapolation_factor", 1.0)
+        attn_factor = getattr(config, "attn_factor", 1.0)
+        beta_fast = getattr(config, "beta_fast", 32)
+        beta_slow = getattr(config, "beta_slow", 1)
+        rope_scaling = {
+            "extra_method": rope_type,
+            "max_position_embeddings": original_max_position_embeddings,
+            "scaling_factor": scaling_factor,
+            "extrapolation_factor": extrapolation_factor,
+            "attn_factor": attn_factor,
+            "beta_fast": beta_fast,
+            "beta_slow": beta_slow,
+            "dtype": torch.float,
+        }
+        return rope_scaling
+    else:
+        return None
+
+
+class ScalingRotaryEmbedding(RotaryEmbedding):
+    """Scale the RotaryEmbedding in a way similar to YaRN method. https://arxiv.org/pdf/2309.00071."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        extra_method: str = "yarn_log",
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extra_method = extra_method
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        # Get n-d magnitude scaling corrected for interpolation
+        self.mscale = float(_yarn_get_mscale(self.scaling_factor) * attn_factor)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+        )
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+
+        low, high = _yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+        )
+        # Get n-d rotational scaling corrected for extrapolation
+        inv_freq_mask = (
+            1
+            - _yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+        ) * self.extrapolation_factor
+        if self.extra_method in ["original"]:
+            inv_freq = inv_freq_extrapolation
+        elif self.extra_method in ["yarn", "yarn_linear"]:
+            inv_freq = (
+                inv_freq_interpolation * (1 - inv_freq_mask)
+                + inv_freq_extrapolation * inv_freq_mask
+            )
+        elif self.extra_method == "yarn_log":
+            inv_freq = torch.exp(
+                torch.log(inv_freq_extrapolation) * inv_freq_mask
+                + torch.log(inv_freq_interpolation) * (1.0 - inv_freq_mask)
+            )
+        elif self.extra_method == "theta_scale":
+            exponents = torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
+            theta_scale_exponent = self.base ** (
+                math.log(
+                    self.max_position_embeddings * self.scaling_factor / (2 * math.pi)
+                )
+                / math.log(self.max_position_embeddings / (2 * math.pi))
+            )
+            inv_freq = torch.tensor(
+                1.0 / (theta_scale_exponent ** (exponents / self.rotary_dim)),
+                dtype=torch.float32,
+            )
+        else:
+            raise ValueError(f"Unknown extrapolation method: {self.extra_method}")
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
+        )
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        # cos = freqs.cos() * self.mscale
+        # sin = freqs.sin() * self.mscale
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
+class Grok1Attention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        load_presharded_attn: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        attn_tp_rank = get_tensor_model_parallel_rank()
+        attn_tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = getattr(config, "head_dim", 128)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        rope_scaling = get_rope_scaling(config)
+        self.load_presharded_attn = load_presharded_attn
+        self.alt_stream = alt_stream or torch.cuda.Stream()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            load_presharded_attn=self.load_presharded_attn,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            use_presharded_weights=self.load_presharded_attn,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+
+        self.rope_rotate_half_dims = getattr(config, "rope_rotate_half_dims", False)
+
+        if rope_scaling is not None:
+            self.rotary_emb = ScalingRotaryEmbedding(
+                self.head_dim,
+                rotary_dim=(
+                    self.head_dim
+                    if not self.rope_rotate_half_dims
+                    else self.head_dim // 2
+                ),
+                base=int(self.rope_theta),
+                is_neox_style=True,
+                **rope_scaling,
+            )
+            pos_encoding_mode = "NONE"
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=(
+                    self.head_dim
+                    if not self.rope_rotate_half_dims
+                    else self.head_dim // 2
+                ),
+                max_position=max_position,
+                base=int(self.rope_theta),
+                is_neox_style=True,
+            )
+            pos_encoding_mode = "NONE"
+
+        logit_cap = max(getattr(config, "attn_logit_softcapping", 30.0), 0.0)
+        logit_capping_method = getattr(config, "attn_logit_softcapping_method", "tanh")
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            logit_cap=logit_cap,
+            quant_config=quant_config,
+            pos_encoding_mode=pos_encoding_mode,
+            logit_capping_method=logit_capping_method,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.attn.xai_temperature_len = getattr(self.config, "attn_temperature_len", -1)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] == 0:
+            assert (
+                not self.o_proj.reduce_results
+            ), "short-circuiting allreduce will lead to hangs"
+            return hidden_states
+        if debug_tensor_dump_output_folder:
+            dump_to_file(
+                debug_tensor_dump_output_folder,
+                f"attn_input_{self.layer_id}",
+                hidden_states,
+            )
+
+            if debug_tensor_dump_inject:
+                name = os.path.join(
+                    debug_tensor_dump_output_folder,
+                    f"jax_dump_attn_input_{self.layer_id}.npy",
+                )
+                logger.info(f"Load {name} from jax.")
+                x = np.load(name)
+                hidden_states = torch.tensor(x[0, : hidden_states.shape[0]]).to(
+                    hidden_states
+                )
+
+        qkv, _ = self.qkv_proj(hidden_states)
+        dispose_tensor(hidden_states)
+
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+
+        if debug_tensor_dump_output_folder:
+            num_tokens = q.shape[0]
+            num_heads_q = self.num_heads
+            head_dim = self.head_dim
+            num_heads_kv = k.numel() // (num_tokens * head_dim)
+
+            dump_to_file(
+                debug_tensor_dump_output_folder,
+                f"q_{self.layer_id}",
+                tensor_model_parallel_all_gather(
+                    q.reshape(num_tokens, num_heads_q, head_dim).contiguous(), dim=1
+                ).contiguous(),
+            )
+            dump_to_file(
+                debug_tensor_dump_output_folder,
+                f"k_{self.layer_id}",
+                tensor_model_parallel_all_gather(
+                    k.reshape(num_tokens, num_heads_kv, head_dim).contiguous(), dim=1
+                ).contiguous(),
+            )
+            dump_to_file(
+                debug_tensor_dump_output_folder,
+                f"v_{self.layer_id}",
+                tensor_model_parallel_all_gather(
+                    v.reshape(num_tokens, num_heads_kv, head_dim).contiguous(), dim=1
+                ).contiguous(),
+            )
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        del q, k, v, qkv
+
+        if debug_tensor_dump_output_folder:
+            dump_to_file(
+                debug_tensor_dump_output_folder,
+                f"attn_output_{self.layer_id}",
+                tensor_model_parallel_all_gather(
+                    attn_output.reshape(num_tokens, num_heads_q, head_dim).contiguous(),
+                    dim=1,
+                ).contiguous(),
+            )
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Grok1DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        load_presharded_moe: bool = False,
+        load_presharded_attn: bool = False,
+        load_presharded_mlp: bool = False,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        skip_moe: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.num_experts = config.num_local_experts
+        self.hidden_size = config.hidden_size
+        self.residual_moe = getattr(config, "residual_moe", False)
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream or torch.cuda.Stream()
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = Grok1Attention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=(
+                config.context_len
+                if hasattr(config, "context_len")
+                else config.max_position_embeddings
+            ),
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            quant_config=quant_config,
+            reduce_results=False,
+            alt_stream=self.alt_stream,
+            load_presharded_attn=load_presharded_attn,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        split_gate_up = not getattr(config, "merge_gate_up", True)
+        if self.num_experts > 0:
+            self.block_sparse_moe = Grok1MoE(
+                config=config,
+                layer_id=layer_id,
+                num_experts=config.num_local_experts,
+                top_k=config.num_experts_per_tok,
+                hidden_size=config.hidden_size,
+                intermediate_size=getattr(
+                    config,
+                    "moe_intermediate_size",
+                    getattr(config, "intermediate_size", None),
+                ),
+                quant_config=quant_config,
+                reduce_results=not self.residual_moe,
+                use_presharded_weights=load_presharded_moe,
+                inplace=False,  # not self.residual_moe,
+                no_combine=False,  # self.residual_moe,  # just a suggestion to not combine topk
+                prefix=add_prefix("block_sparse_moe", prefix),
+            )
+            if self.residual_moe:
+                self.mlp = Grok1MLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    quant_config=quant_config,
+                    reduce_results=False,
+                    use_presharded_weights=load_presharded_mlp,
+                    layer_id=layer_id,
+                    split_gate_up=split_gate_up,
+                )
+        else:
+            raise NotImplementedError()
+
+        self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        if self.num_experts > 0:
+            if self.residual_moe:
+                # NOTE: self.block_sparse_moe modifies the input in-place,
+                # so we have to call it later. Be aware of any possible related errors.
+                if get_tensor_model_parallel_world_size() > 1:
+                    self.ffn = lambda x: tensor_model_parallel_all_reduce(
+                        self.moe_with_rmoe(x)
+                    )
+                else:
+                    self.ffn = self.moe_with_rmoe
+            else:
+                self.ffn = self.block_sparse_moe
+        else:
+            raise NotImplementedError()
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor] = None,
+        deferred_norm: Optional[RMSNorm] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, RMSNorm]:
+
+        hidden_states_original = hidden_states
+        residual_original = residual
+
+        # Self Attention
+        if deferred_norm is not None:
+            assert residual is not None
+            # here hidden_states is output of ffn, residual is residual from after previous attn layer
+            hidden_states, residual = fused_dual_residual_rmsnorm(
+                hidden_states,
+                residual,
+                deferred_norm.weight,
+                self.pre_attn_norm.weight,
+                deferred_norm.variance_epsilon,
+            )
+        else:
+            # here hidden_states is the residual
+            hidden_states, residual = (
+                fused_rmsnorm(
+                    hidden_states,
+                    self.pre_attn_norm.weight,
+                    self.pre_attn_norm.variance_epsilon,
+                ),
+                hidden_states,
+            )
+
+        if residual_original is not None:
+            dispose_tensor(residual_original)
+
+        dispose_flag = False
+        if residual is not hidden_states_original:
+            dispose_flag = True
+            dispose_tensor(hidden_states_original)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        if get_tensor_model_parallel_world_size() > 1:
+            hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+
+        hidden_states, residual = fused_dual_residual_rmsnorm(
+            hidden_states,
+            residual,
+            self.post_attn_norm.weight,
+            self.pre_moe_norm.weight,
+            self.post_attn_norm.variance_epsilon,
+        )
+
+        if not dispose_flag:
+            dispose_tensor(hidden_states_original)
+
+        # Fully Connected
+        hidden_states = self.ffn(hidden_states)
+        return hidden_states, residual, self.post_moe_norm  # defer layernorm
+
+    def moe_with_rmoe(self, x):
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        mlp_result = self.mlp(x)
+        with torch.cuda.stream(self.alt_stream):
+            # moe should not be inplace because of stream race condition
+            moe_result = self.block_sparse_moe(x)
+        current_stream.wait_stream(self.alt_stream)
+        return (mlp_result + moe_result) / 1.4142135623730951
+
+
+class Grok1Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        load_presharded_moe: bool = False,
+        load_presharded_embedding: bool = False,
+        load_presharded_attn: bool = False,
+        load_presharded_mlp: bool = False,
+        replicate_embedding: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_presharded_weights=load_presharded_embedding,
+            enable_tp=not replicate_embedding,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.alt_stream = torch.cuda.Stream()
+        self.layers = nn.ModuleList(
+            [
+                Grok1DecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    load_presharded_moe=load_presharded_moe,
+                    load_presharded_attn=load_presharded_attn,
+                    load_presharded_mlp=load_presharded_mlp,
+                    alt_stream=self.alt_stream,
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+            hidden_states.mul_(self.config.embedding_multiplier_scale)
+        else:
+            hidden_states = input_embeds
+
+        residual, deferred_norm = None, None
+        for i in range(len(self.layers)):
+            hidden_states, residual, deferred_norm = self.layers[i](
+                positions, hidden_states, forward_batch, residual, deferred_norm
+            )
+
+        if debug_tensor_dump_output_folder:
+            hidden_states = (
+                fused_rmsnorm(
+                    hidden_states,
+                    deferred_norm.weight,
+                    deferred_norm.variance_epsilon,
+                )
+                + residual
+            )
+
+            dump_to_file(
+                debug_tensor_dump_output_folder,
+                "last_hidden_before_norm",
+                hidden_states,
+            )
+
+            hidden_states = fused_rmsnorm(
+                hidden_states,
+                self.norm.weight,
+                self.norm.variance_epsilon,
+            )
+
+            dump_to_file(
+                debug_tensor_dump_output_folder,
+                "last_hidden_after_norm",
+                hidden_states,
+            )
+        else:
+            hidden_states, _ = fused_dual_residual_rmsnorm(
+                hidden_states,
+                residual,
+                deferred_norm.weight,
+                self.norm.weight,
+                deferred_norm.variance_epsilon,
+            )
+
+        return hidden_states
+
+
+class Grok1ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        # Get presharded weights.
+        self.load_presharded_mlp = getattr(config, "load_presharded_mlp", False)
+        self.load_presharded_moe = (
+            getattr(config, "load_presharded_moe", True)
+            and self.config.num_local_experts > 0
+            and get_tensor_model_parallel_world_size() > 1
+        )
+        self.load_presharded_attn = getattr(config, "load_presharded_attn", False)
+        self.load_presharded_embedding = getattr(
+            config, "load_presharded_embedding", False
+        )
+
+        self.is_weights_presharded = (
+            self.load_presharded_mlp
+            or self.load_presharded_moe
+            or self.load_presharded_attn
+            or self.load_presharded_embedding
+        )
+
+        default_replicate_lm_head = False
+        self.replicate_lm_head = getattr(
+            config, "replicate_lm_head", default_replicate_lm_head
+        )
+
+        if self.is_weights_presharded:
+            setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
+
+        self.replicate_embedding = getattr(config, "replicate_embedding", False)
+
+        self.model = Grok1Model(
+            config,
+            quant_config=quant_config,
+            load_presharded_moe=self.load_presharded_moe,
+            load_presharded_embedding=self.load_presharded_embedding,
+            load_presharded_attn=self.load_presharded_attn,
+            load_presharded_mlp=self.load_presharded_mlp,
+            replicate_embedding=self.replicate_embedding,
+            prefix=add_prefix("model", prefix),
+        )
+
+        lm_head_params_dtype = None
+        if self.replicate_lm_head:
+            self.lm_head = ReplicatedLinear(
+                config.hidden_size,
+                config.vocab_size,
+                bias=False,
+                params_dtype=lm_head_params_dtype,
+                prefix=add_prefix("lm_head", prefix),
+            )
+            self.logits_processor = LogitsProcessor(config, skip_all_gather=True)
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                use_presharded_weights=self.load_presharded_embedding,
+                params_dtype=lm_head_params_dtype,
+                prefix=add_prefix("lm_head", prefix),
+            )
+            self.logits_processor = LogitsProcessor(config)
+
+        # Dump tensors for debugging
+        global debug_tensor_dump_output_folder, debug_tensor_dump_inject
+        debug_tensor_dump_output_folder = global_server_args_dict[
+            "debug_tensor_dump_output_folder"
+        ]
+        debug_tensor_dump_inject = global_server_args_dict["debug_tensor_dump_inject"]
+        warnings.filterwarnings("ignore", category=FutureWarning)
+
+        if get_tensor_model_parallel_rank() == 0:
+            logger.info(
+                f"#parameters (analytical): {self.get_num_params_analytical() / 1e9:.2f} B, "
+                f"#parameters (actual): {self.get_num_params_torch() / 1e9:.2f} B"
+            )
+        self.loaded_param_names = set()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if debug_tensor_dump_output_folder:
+            dump_to_file(debug_tensor_dump_output_folder, "input_ids", input_ids)
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+        ignore_parent_name: bool = False,
+        check_hit_names: bool = True,
+        model_config: PretrainedConfig | None = None,
+    ) -> dict[str, torch.Tensor]:
+        if model_config is None:
+            model_config = self.config
+
+        stacked_params_mapping = []
+        stacked_params_mapping += [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        stacked_params_mapping += [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        num_experts = model_config.num_local_experts
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        all_names = set(params_dict.keys())
+        hit_names = set()
+
+        def load_weight_wrapper(
+            name: str, loaded_weight: torch.Tensor, *args, **kwargs
+        ):
+            # Fuse constant multipliers into the weights
+            if "lm_head" in name:
+                loaded_weight = (
+                    loaded_weight.to(torch.float32)
+                    * model_config.output_multiplier_scale
+                )
+
+            original_name = name
+            if ignore_parent_name:
+                name = name.split(".")[-1]
+
+            if name not in params_dict:
+                logger.info(f"Skipping {name=} in load_weights_wrapper")
+                return
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight, *args, **kwargs)
+            hit_names.add(name)
+            self.loaded_param_names.add(original_name)
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                load_weight_wrapper(name, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    load_weight_wrapper(
+                        name,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name is None:
+                        continue
+
+                    load_weight_wrapper(name=name, loaded_weight=loaded_weight)
+
+        if check_hit_names:
+            if len(hit_names) > 5:
+                missing = all_names - hit_names
+                missing_exclude_scales = {x for x in missing if "scale" not in x}
+                logger.info(
+                    f"#all_names: {len(all_names)}, #hit_names: {len(hit_names)}, #missing_exclude_scales: {len(missing_exclude_scales)}",
+                )
+                if len(missing_exclude_scales) > 0:
+                    raise ValueError(
+                        f"load_weights failed because some weights are missing: {missing_exclude_scales=}."
+                    )
+
+            elif len(hit_names) == 0:
+                raise ValueError(
+                    f"load_weights failed because it did not hit any names. {all_names=} {hit_names=}"
+                )
+
+        return hit_names
+
+    def get_num_params_analytical(self):
+        cfg = self.config
+        moe_intermediate_size = getattr(
+            cfg,
+            "moe_intermediate_size",
+            getattr(cfg, "intermediate_size", None),
+        )
+        residual_moe = getattr(cfg, "residual_moe", False)
+        if cfg.num_local_experts > 0:
+            num_experts = cfg.num_local_experts + (1 if residual_moe else 0)
+        else:
+            num_experts = 1
+
+        wq = (
+            cfg.num_hidden_layers
+            * cfg.hidden_size
+            * cfg.num_attention_heads
+            * cfg.head_dim
+        )
+        wkv = (
+            cfg.num_hidden_layers
+            * cfg.hidden_size
+            * cfg.num_key_value_heads
+            * cfg.head_dim
+            * 2
+        )
+        out = (
+            cfg.num_hidden_layers
+            * cfg.hidden_size
+            * cfg.num_attention_heads
+            * cfg.head_dim
+        )
+        ffn1 = (
+            cfg.num_hidden_layers
+            * num_experts
+            * cfg.hidden_size
+            * moe_intermediate_size
+            * 2
+        )
+        ffn2 = (
+            cfg.num_hidden_layers
+            * num_experts
+            * cfg.hidden_size
+            * moe_intermediate_size
+        )
+        embed = cfg.hidden_size * cfg.vocab_size * 2
+        return wq + wkv + out + ffn1 + ffn2 + embed
+
+    def get_num_params_torch(self):
+        return (
+            sum(p.numel() for p in self.parameters())
+            * get_tensor_model_parallel_world_size()
+        )
+
+
+old_prepare_weights = getattr(DefaultModelLoader, "_prepare_weights")
+
+
+def _prepare_presharded_weights(
+    self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+) -> Tuple[str, list[str], bool]:
+    import glob
+    import os
+
+    if get_tensor_model_parallel_world_size() == 1:
+        return old_prepare_weights(self, model_name_or_path, revision, fall_back_to_pt)
+
+    if not os.path.isdir(model_name_or_path):
+        from sglang.srt.model_loader.weight_utils import download_weights_from_hf
+
+        allow_patterns = ["*.safetensors", "*.bin"]
+        hf_folder = download_weights_from_hf(
+            model_name_or_path,
+            self.load_config.download_dir,
+            allow_patterns,
+            revision,
+            ignore_patterns=self.load_config.ignore_patterns,
+        )
+    else:
+        hf_folder = model_name_or_path
+
+    tp_rank = get_tensor_model_parallel_rank()
+
+    # The old format
+    allow_patterns = [f"*-{tp_rank:03d}.bin"]
+
+    # The new format
+    allow_patterns += [f"*-TP-{tp_rank:03d}.safetensors", "*-TP-common.safetensors"]
+
+    hf_weights_files = []
+    for pattern in allow_patterns:
+        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+
+    if hf_weights_files[0].endswith("safetensors"):
+        use_safetensors = True
+    else:
+        use_safetensors = False
+
+    return hf_folder, hf_weights_files, use_safetensors
+
+
+class Grok1ModelForCausalLM(Grok1ForCausalLM):
+    """An alias for backward-compatbility."""
+
+    pass
+
+
+EntryClass = [Grok1ForCausalLM, Grok1ModelForCausalLM]
diff --git a/python/sglang/srt/models/hunyuan.py b/python/sglang/srt/models/hunyuan.py
new file mode 100644
index 000000000..c1ed2543c
--- /dev/null
+++ b/python/sglang/srt/models/hunyuan.py
@@ -0,0 +1,819 @@
+# coding=utf-8
+# Copyright 2024 The HunYuan team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only HunYuan model compatible with HuggingFace weights."""
+import logging
+import re
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.sampler import Sampler
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, is_hip
+
+expert_distribution_recorder = ExpertDistributionRecorder()
+
+
+def _is_moe(config: PretrainedConfig) -> bool:
+    if getattr(config, "num_experts", None) and (
+        (isinstance(config.num_experts, int) and config.num_experts > 1)
+        or (isinstance(config.num_experts, list) and max(config.num_experts) > 1)
+    ):
+        return True
+    else:
+        return False
+
+
+def _get_cla_factor(config: PretrainedConfig) -> int:
+    if not getattr(config, "use_cla", False):
+        return 1
+    return getattr(config, "cla_share_factor", 1)
+
+
+class HunYuanMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class HunYuanSparseMoeBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = -1,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        # Get layer_id topk if config.moe_topk is a list
+        if isinstance(config.moe_topk, list):
+            assert layer_id >= 0
+            assert len(config.moe_topk) > layer_id
+            top_k = config.moe_topk[layer_id]
+        else:
+            top_k = config.moe_topk
+
+        # If it is moe, moe_intermediate_size is preferred
+        intermediate_size = config.intermediate_size
+        if config.moe_intermediate_size is not None:
+            intermediate_size = (
+                config.moe_intermediate_size
+                if isinstance(config.moe_intermediate_size, int)
+                else config.moe_intermediate_size[layer_id]
+            )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=True if top_k > 1 else False,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=False,
+            layer_id=layer_id,
+            quant_config=quant_config,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size, config.num_experts, bias=False, quant_config=None
+        )
+        if config.use_mixed_mlp_moe > 0:
+            # Get layer_id num_shared_expert if config.num_shared_expert is a list
+            if isinstance(config.num_shared_expert, list):
+                assert layer_id >= 0
+                assert len(config.num_shared_expert) > layer_id
+                num_shared_expert = config.num_shared_expert[layer_id]
+            else:
+                num_shared_expert = config.num_shared_expert
+
+            self.shared_mlp = HunYuanMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size * num_shared_expert,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+            )
+        else:
+            self.shared_mlp = None
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        shared_output = None
+        if self.shared_mlp is not None:
+            shared_output = self.shared_mlp(hidden_states)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(orig_shape)
+
+
+def get_head_dim(config):
+    if hasattr(config, "head_dim"):
+        return int(config.head_dim)
+    if hasattr(config, "attention_head_dim"):
+        return int(config.attention_head_dim)
+
+    # since some hunyuan model don't follow the self.hidden_size // self.total_num_heads rule
+    # wrong setting may cause runtime error, just throw error if this field is missing.
+    raise ValueError("Missing head dim config, try set head_dim in config.json")
+
+
+def check_head_dim(config):
+    # Some models may lack `head_dim` and use `attention_head_dim` instead.
+    # This attribute is also used by flashinfer_backend.py, so we check for
+    # consistency and raise an error if it's not met to avoid silent failures.
+    # Although we could adapt the HunYuan model to use `attention_head_dim`,
+    # flashinfer expects `head_dim`, so we enforce its presence for correctness.
+    calc_head_dim = config.hidden_size // config.num_attention_heads
+
+    if hasattr(config, "attention_head_dim"):
+        if calc_head_dim != config.attention_head_dim and not hasattr(
+            config, "head_dim"
+        ):
+            # in this case, flash infer(and other components may calculate wrong value.)
+            raise ValueError(
+                f"HunYuan model config error: calculated head_dim {calc_head_dim} != attention_head_dim {config.attention_head_dim}"
+                + f"\nPlease Add head_dim:{config.attention_head_dim} in config.json to make sure correctly inference."
+            )
+
+        if hasattr(config, "head_dim") and config.attention_head_dim != config.head_dim:
+            raise ValueError(
+                f"HunYuan model config error: head_dim({config.head_dim}) != attention_head_dim({config.attention_head_dim})"
+                + f"\nPlease change head_dim:{config.attention_head_dim} in config.json to make sure correctly inference."
+            )
+
+
+class HunYuanAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        attention_type: str = "self",
+        layer_id: int = -1,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        # Prioritize `head_dim` but fall back to `attention_head_dim` for Hunyuan models.
+        self.head_dim = get_head_dim(config)
+
+        check_head_dim(config)
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+        self.attention_type = attention_type
+        self.layer_id = layer_id
+
+        if attention_type == "self":
+            self.qkv_proj = QKVParallelLinear(
+                hidden_size=hidden_size,
+                head_size=self.head_dim,
+                total_num_heads=self.total_num_heads,
+                total_num_kv_heads=self.total_num_kv_heads,
+                bias=bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+        elif attention_type == "cross":
+            self.q_proj = ColumnParallelLinear(
+                hidden_size,
+                hidden_size,
+                bias=bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+        else:
+            raise RuntimeError("Not support attnention type")
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        is_neox_style = True
+        if quant_config is not None and quant_config.get_name() == "gguf":
+            is_neox_style = False
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+        )
+
+        if self.use_qk_norm:
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        kv_states: Optional[Tuple[torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        if self.attention_type == "self":
+            qkv, _ = self.qkv_proj(hidden_states)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+            q, k = self.rotary_emb(positions, q, k)
+            ori_k = k
+            if self.use_qk_norm:
+                # q = self.query_layernorm(q.view(-1, self.num_heads, self.head_dim).contiguous())
+                # k = self.key_layernorm(k.view(-1, self.num_kv_heads, self.head_dim).contiguous())
+                q = self.query_layernorm(q.reshape(-1, self.head_dim).contiguous())
+                k = self.key_layernorm(k.reshape(-1, self.head_dim).contiguous())
+        elif self.attention_type == "cross":
+            assert kv_states is not None
+            ori_k, v = kv_states  # use last layer kv,
+            k = ori_k
+            q, _ = self.q_proj(hidden_states)
+            k_tmp = torch.empty_like(k)  # Todo: reduant rotary embedding
+            q, _ = self.rotary_emb(positions, q, k_tmp)
+            if self.use_qk_norm:
+                q = self.query_layernorm(
+                    q.view(-1, self.num_heads, self.head_dim).contiguous()
+                )
+                k = self.key_layernorm(
+                    k.view(-1, self.num_kv_heads, self.head_dim).contiguous()
+                )
+        else:
+            raise RuntimeError("Not support attnention type")
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output, (ori_k, v)
+
+
+class HunYuanDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = -1,
+    ) -> None:
+        super().__init__()
+        assert layer_id >= 0
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = (
+            config.intermediate_size
+            if isinstance(config.intermediate_size, int)
+            else config.intermediate_size[layer_id]
+        )
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support abacusai/Smaug-72B-v0.1 with attention_bias
+        # Support internlm/internlm-7b with bias
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        cla_factor = _get_cla_factor(config)
+        attention_type = (
+            "cross" if layer_id >= 0 and layer_id % cla_factor != 0 else "self"
+        )
+        self.self_attn = HunYuanAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            prefix=f"{prefix}.self_attn",
+            attention_type=attention_type,
+            layer_id=layer_id,
+        )
+        if _is_moe(config):
+            self.mlp = HunYuanSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+            )
+        else:
+            self.mlp = HunYuanMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=self.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                bias=getattr(config, "mlp_bias", False),
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        kv_states: Optional[Tuple[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states, ori_kv_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            kv_states=kv_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual, ori_kv_states
+
+
+class HunYuanModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.org_vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                HunYuanDecoderLayer(
+                    config=config,
+                    layer_id=layer_id,
+                    quant_config=quant_config,
+                    # prefix=prefix
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if input_embeds is not None:
+            hidden_states = input_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        residual = None
+
+        prev_kv_states = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual, kv_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+                prev_kv_states,
+            )
+
+            if False:  # (i - self.start_layer) % cla_factor == 0:
+                prev_kv_states = kv_states
+            else:
+                prev_kv_states = None
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class HunYuanMoEV1ForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.model = HunYuanModel(config, quant_config, prefix="model")
+        self.unpadded_vocab_size = config.vocab_size
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+
+        self.hidden_size = config.hidden_size
+        self.head_dim = get_head_dim(config)
+
+        check_head_dim(config)
+
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(config, logit_scale=logit_scale)
+        self.sampler = Sampler()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def _split_qkv_weight(self, qkv: torch.Tensor):
+        num_attention_heads = self.config.num_attention_heads
+        num_kv_heads = getattr(
+            self.config, "num_key_value_heads", self.config.num_attention_heads
+        )
+        num_key_value_groups = num_attention_heads // num_kv_heads
+
+        qkv = qkv.reshape(
+            num_kv_heads, num_key_value_groups + 2, self.head_dim, self.hidden_size
+        )
+        q, k, v = torch.split(qkv, (num_key_value_groups, 1, 1), dim=1)
+        q = q.reshape(-1, self.hidden_size)
+        k = k.reshape(-1, self.hidden_size)
+        v = v.reshape(-1, self.hidden_size)
+        return torch.concat((q, k, v))
+        # return qkv.reshape((num_kv_heads, num_key_value_groups+2 , attention_head_dim, hidden_size)).permute((1,0,2,3)).reshape((-1, hidden_size)),
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        cla_factor = _get_cla_factor(self.config)
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        num_attention_heads = self.config.num_attention_heads
+        num_kv_heads = getattr(
+            self.config, "num_key_value_heads", self.config.num_attention_heads
+        )
+        split_params_mapping = [
+            (".gate_up_proj", ".gate_and_up_proj", 2, [(1, 1), (0, 1)], None),
+            (
+                ".qkv_proj",
+                ".qkv_proj",
+                num_attention_heads + num_kv_heads * 2,
+                [("q", num_attention_heads), ("k", num_kv_heads), ("v", num_kv_heads)],
+                self._split_qkv_weight,
+            ),
+        ]
+
+        if _is_moe(self.config):
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.num_experts,
+            )
+        else:
+            expert_params_mapping = {}
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "gate_proj_bias" in name:
+                name = name.replace("gate_proj_bias", "gate_proj.bias")
+            if "up_proj_bias" in name:
+                name = name.replace("up_proj_bias", "up_proj.bias")
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            is_found = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+                # cross layer only have q_proj, skip qkv pack
+                if weight_name == ".q_proj":
+                    match = re.search(r"layers\.\d+", name)
+                    if match:
+                        layer_id = int(match.group(0).split(".")[-1])
+                        if cla_factor > 1 and layer_id % cla_factor != 0:
+                            continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+
+                is_found = True
+                break
+            if is_found:
+                continue
+
+            for param_name, weight_name, den, split_param, func in split_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                assert loaded_weight.shape[0] % den == 0
+                units = loaded_weight.shape[0] // den
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                offset = 0
+                for shard_id, num in split_param:
+                    new_offset = offset + num * units
+                    if func:
+                        weight_loader(
+                            param, func(loaded_weight)[offset:new_offset], shard_id
+                        )
+                    else:
+                        weight_loader(param, loaded_weight[offset:new_offset], shard_id)
+                    offset = new_offset
+
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if "mlp.gate.wg." in name:
+                        name = name.replace("wg.", "")
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.model.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.model.layers[layer_idx].self_attn
+
+            if is_hip():
+                # The scaling factor convention we are assuming is
+                # quantized_value * scaling_factor ~= true_value
+                # which is consistent with the practice of setting
+                # scaling_factor = tensor_amax / FPtype_max
+                scaling_factor *= 2
+            if hasattr(layer_self_attn, "kv_scale"):
+                layer_self_attn.attn._kv_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class HunYuanDenseV1ForCausalLM(HunYuanMoEV1ForCausalLM):
+    pass
+
+
+EntryClass = [HunYuanMoEV1ForCausalLM, HunYuanDenseV1ForCausalLM]
diff --git a/python/sglang/srt/models/idefics2.py b/python/sglang/srt/models/idefics2.py
new file mode 100644
index 000000000..75922d05c
--- /dev/null
+++ b/python/sglang/srt/models/idefics2.py
@@ -0,0 +1,342 @@
+# Copyright 2023 The SGLang team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.utils import add_prefix
+
+
+class Idefics2VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class Idefics2EncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.self_attn = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=self.num_heads,
+            projection_size=config.intermediate_size,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            dropout=config.attention_dropout,
+            qkv_backend="sdpa",
+            softmax_in_single_precision=True,
+            flatten_batch=False,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Idefics2VisionMLP(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+
+        """
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
+
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Idefics2Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention
+    layers. Each layer is a
+    [`Idefics2EncoderLayer`].
+
+    Args:
+        config: Idefics2Config
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                Idefics2EncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            inputs_embeds (torch.Tensor):
+                Optionally, instead of passing `input_ids` you can choose to
+                directly pass an embedded representation.
+                This is useful if you want more control over how to convert
+                `input_ids` indices into associated vectorsthan the model's
+                internal embedding lookup matrix.
+        """
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+            )
+            hidden_states = layer_outputs
+        return hidden_states
+
+
+class Idefics2VisionEmbeddings(nn.Module):
+    """
+    This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings
+    ` to enable images of variable
+    resolution.
+
+    The modifications are adapted from [Patch n' Pack: NaViT, a Vision
+    Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
+    which allows treating images in their native aspect ratio and without the
+    need to resize them to the same fixed size. In particular, we start from the
+    original pre-trained SigLIP model(which uses images of fixed-size square
+    images) and adapt it by training on images of variable resolutions.
+    """
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+    def get_position_ids(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ):
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        max_nb_patches_h, max_nb_patches_w = (
+            max_im_h // self.patch_size,
+            max_im_w // self.patch_size,
+        )
+        boundaries = torch.arange(
+            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
+        )
+        position_ids = torch.full(
+            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
+        )
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+
+            if tgt_sizes is not None:
+                nb_patches_h = tgt_sizes[batch_idx][0]
+                nb_patches_w = tgt_sizes[batch_idx][1]
+            else:
+                nb_patches_h = p_attn_mask[:, 0].sum()
+                nb_patches_w = p_attn_mask[0].sum()
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+            bucket_coords_h = torch.bucketize(
+                fractional_coords_h, boundaries, right=True
+            )
+            bucket_coords_w = torch.bucketize(
+                fractional_coords_w, boundaries, right=True
+            )
+            pos_ids = (
+                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
+            ).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+        position_ids = position_ids.to(self.position_embedding.weight.device)
+        return position_ids
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        pixel_values = pixel_values.to(
+            device=self.patch_embedding.weight.device, dtype=target_dtype
+        )
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        position_ids = self.get_position_ids(
+            pixel_values, patch_attention_mask, tgt_sizes
+        )
+
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+
+class Idefics2VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        require_post_norm: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        embed_dim = config.hidden_size
+        self.config = config
+        self.embeddings = Idefics2VisionEmbeddings(config)
+        self.encoder = Idefics2Encoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+        self.post_layernorm = (
+            nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+            if require_post_norm
+            else nn.Identity()
+        )
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embeddings
+
+    def compute_cu_seqlens(
+        self,
+        tgt_sizes: Optional[torch.Tensor] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # shape: (batch_size,)
+        if tgt_sizes is not None:
+            seqlen = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+        elif input_embeds is not None:
+            seqlen = torch.full(
+                size=(input_embeds.shape[0],),
+                fill_value=input_embeds.shape[1],
+                dtype=torch.int32,
+                device=input_embeds.device,
+            )
+        else:
+            raise ValueError(
+                "Either `tgt_sizes` or `input_embeds` must be provided to compute cu_seqlens."
+            )
+
+        cu_seqlens = torch.cat(
+            [
+                torch.tensor([0], device=seqlen.device, dtype=torch.int32),
+                torch.cumsum(seqlen, dim=0, dtype=torch.int32),
+            ],
+            dim=0,
+        ).to(seqlen.device)
+        return cu_seqlens
+
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(
+            pixel_values=pixel_values,
+            patch_attention_mask=patch_attention_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        cu_seqlens = self.compute_cu_seqlens(tgt_sizes, hidden_states)
+        encoder_outputs = self.encoder(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+        )
+        last_hidden_state = self.post_layernorm(encoder_outputs)
+        return last_hidden_state
diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py
new file mode 100644
index 000000000..7e0956ff0
--- /dev/null
+++ b/python/sglang/srt/models/internlm2.py
@@ -0,0 +1,357 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/7f62077af5159c625fe3ad1c812e6c1a2b93ba3b/vllm/model_executor/models/internlm2.py
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class InternLM2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.w2 = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w2", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.w2(x)
+        return x
+
+
+class InternLM2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.wqkv = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("wqkv", prefix),
+        )
+        self.wo = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("wo", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            self.num_kv_heads,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.wqkv(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.wo(attn_output)
+        return output
+
+
+class InternLMDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.attention = InternLM2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attention", prefix),
+        )
+        self.feed_forward = InternLM2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("feed_forward", prefix),
+        )
+        self.attention_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.attention_norm(hidden_states)
+        else:
+            hidden_states, residual = self.attention_norm(hidden_states, residual)
+        hidden_states = self.attention(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.ffn_norm(hidden_states, residual)
+        hidden_states = self.feed_forward(hidden_states)
+        return hidden_states, residual
+
+
+class InternLM2Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.tok_embeddings = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("tok_embeddings", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                InternLMDecoderLayer(
+                    config, i, quant_config, prefix=add_prefix(f"layers.{i}", prefix)
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.tok_embeddings(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class InternLM2ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = InternLM2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.output = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("output", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.tok_embeddings
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.output, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "w1", 0),
+            ("gate_up_proj", "w3", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                if "wqkv" in name:
+                    config = self.config
+                    kv_groups = config.num_attention_heads // config.num_key_value_heads
+                    head_dim = config.hidden_size // config.num_attention_heads
+                    loaded_weight = loaded_weight.view(
+                        -1, 2 + kv_groups, head_dim, loaded_weight.shape[-1]
+                    )
+                    wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], dim=1)
+                    wq = wq.reshape(-1, wq.shape[-1])
+                    wk = wk.reshape(-1, wk.shape[-1])
+                    wv = wv.reshape(-1, wv.shape[-1])
+                    weight_loader = param.weight_loader
+                    weight_loader(param, wq, "q")
+                    weight_loader(param, wk, "k")
+                    weight_loader(param, wv, "v")
+                else:
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = InternLM2ForCausalLM
diff --git a/python/sglang/srt/models/internlm2_reward.py b/python/sglang/srt/models/internlm2_reward.py
new file mode 100644
index 000000000..68be8d001
--- /dev/null
+++ b/python/sglang/srt/models/internlm2_reward.py
@@ -0,0 +1,64 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.internlm2 import InternLM2ForCausalLM, InternLM2Model
+from sglang.srt.utils import add_prefix
+
+
+class InternLM2ForRewardModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.model = InternLM2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.v_head = nn.Linear(config.hidden_size, 1, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert get_embedding, "InternLM2ForRewardModel is only used for embedding"
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.v_head(last_token_hidden)
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        return InternLM2ForCausalLM.load_weights(self, weights)
+
+
+EntryClass = InternLM2ForRewardModel
diff --git a/python/sglang/srt/models/interns1.py b/python/sglang/srt/models/interns1.py
new file mode 100644
index 000000000..c7383ed25
--- /dev/null
+++ b/python/sglang/srt/models/interns1.py
@@ -0,0 +1,293 @@
+from typing import Iterable, List, Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.internvl import InternVisionModel
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM
+from sglang.srt.models.qwen3_moe import Qwen3MoeForCausalLM
+from sglang.utils import logger
+
+
+class InternS1ForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_flash_attn=True,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+        image_size = (
+            getattr(config, "force_image_size", None) or config.vision_config.image_size
+        )
+        patch_size = config.vision_config.patch_size
+        if isinstance(image_size, list):
+            image_size = image_size[0]
+        if isinstance(patch_size, list):
+            patch_size = patch_size[0]
+        self.patch_size = patch_size
+        self.select_layer = config.vision_feature_layer
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
+        self.downsample_ratio = config.downsample_ratio
+        self.ps_version = getattr(config, "ps_version", "v1")
+        # self.template = getattr(config, 'template', 'internvl2_5')
+
+        config.vision_config.use_flash_attn = True if use_flash_attn else False
+        config.text_config._attn_implementation = (
+            "flash_attention_2" if use_flash_attn else "eager"
+        )
+
+        logger.info(f"num_image_token: {self.num_image_token}")
+        logger.info(f"ps_version: {self.ps_version}")
+
+        self.vision_model = InternVisionModel(config.vision_config)
+        if config.text_config.architectures[0] == "Qwen2ForCausalLM":
+            self.language_model = Qwen2ForCausalLM(
+                config=config.text_config, quant_config=quant_config
+            )
+        elif config.text_config.architectures[0] == "Qwen3MoeForCausalLM":
+            self.language_model = Qwen3MoeForCausalLM(
+                config=config.text_config, quant_config=quant_config
+            )
+        elif config.text_config.architectures[0] == "Qwen3ForCausalLM":
+            self.language_model = Qwen3ForCausalLM(
+                config=config.text_config, quant_config=quant_config
+            )
+        else:
+            raise NotImplementedError(
+                f"{config.text_config.architectures[0]} is not implemented."
+            )
+
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.text_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size
+            ),
+            nn.GELU(),
+            nn.Linear(llm_hidden_size, llm_hidden_size),
+        )
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.ps_version == "v1":
+            logger.warn(
+                "In ps_version 'v1', the height and width have not been swapped back, "
+                "which results in a transposed image."
+            )
+        else:
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=False, return_dict=True
+            ).last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=True, return_dict=True
+            ).hidden_states[self.select_layer]
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        pixel_values = torch.cat([item.feature for item in items])
+        image_features = self.extract_feature(pixel_values)
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        hs = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hs
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = mm_inputs.im_start_id
+        im_end_id: int = mm_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        helper = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return helper.pad_input_tokens(input_ids, mm_inputs)
+
+    def _mapping_interns1_name(self, name):
+        names_map = {
+            "lm_head.weight": "language_model.lm_head.weight",
+            "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
+            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
+            "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
+            "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
+            "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
+            "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
+            "model.vision_tower.embeddings.cls_token": "vision_model.embeddings.class_embedding",
+            "model.vision_tower.embeddings.patch_embeddings.projection.bias": "vision_model.embeddings.patch_embedding.bias",
+            "model.vision_tower.embeddings.patch_embeddings.projection.weight": "vision_model.embeddings.patch_embedding.weight",
+            "model.vision_tower.embeddings.position_embeddings": "vision_model.embeddings.position_embedding",
+        }
+        if name in names_map:
+            name = names_map[name]
+        elif name.startswith("model.language_model."):
+            name = "language_model.model." + name[len("model.language_model.") :]
+        elif name.startswith("model.vision_tower."):
+            name = "vision_model." + name[len("model.vision_tower.") :]
+
+        if name.startswith("vision_model.encoder.layer"):
+
+            name = name.replace(r".layer.", r".layers.")
+            name = name.replace(r".attention.", r".attn.attn.")
+            name = name.replace(r".projection_layer.", r".proj.")
+            name = name.replace(r".lambda_1", r".ls1")
+            name = name.replace(r".lambda_2", r".ls2")
+            name = name.replace(r".layernorm_before.", r".norm1.")
+            name = name.replace(r".layernorm_after.", r".norm2.")
+        return name
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = []
+        if "Qwen3MoeForCausalLM" in self.config.text_config.architectures:
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.num_experts,
+            )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            name = self._mapping_interns1_name(name)
+            if "vision_model" in name:
+                loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                    self.config, name, loaded_weight
+                )
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+            loaded_params.add(name)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            raise RuntimeError(
+                f"Some weights are not initialized from checkpoints: {unloaded_params}"
+            )
+        return loaded_params
+
+
+EntryClass = [InternS1ForConditionalGeneration]
diff --git a/python/sglang/srt/models/internvl.py b/python/sglang/srt/models/internvl.py
new file mode 100644
index 000000000..b146da0e5
--- /dev/null
+++ b/python/sglang/srt/models/internvl.py
@@ -0,0 +1,699 @@
+from typing import Iterable, List, Optional, Set, Tuple, Union
+
+import torch
+
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/7f62077af5159c625fe3ad1c812e6c1a2b93ba3b/vllm/model_executor/models/internlm2.py
+# Adapted from https://raw.githubusercontent.com/hehesangsj/sglang/refs/heads/internvl/python/sglang/srt/models/internvl.py
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.attention.vision import SingletonCache, VisionAttention
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_janus_pro import DropPath
+from sglang.srt.models.gpt_oss import GptOssForCausalLM
+from sglang.srt.models.internlm2 import InternLM2ForCausalLM
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM
+from sglang.srt.models.qwen3_moe import Qwen3MoeForCausalLM
+from sglang.utils import logger
+
+
+class InternAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.attn = VisionAttention(
+            qkv_backend="fa3",
+            embed_dim=self.embed_dim,
+            num_heads=self.num_heads,
+            projection_size=self.embed_dim,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            dropout=getattr(config, "dropout", 0.0),
+            qkv_bias=getattr(config, "qkv_bias", False)
+            or getattr(config, "attention_bias", False),
+            num_dummy_heads=getattr(config, "num_dummy_heads", 0),
+            qk_normalization=getattr(config, "qk_normalization", False)
+            or getattr(config, "use_qk_norm", False),
+            flatten_batch=False,
+        )
+
+        self.proj_drop = nn.Dropout(config.dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        out = self.attn(hidden_states, cu_seqlens=cu_seqlens)
+        outs = self.proj_drop(out)
+        return outs
+
+
+class InternVisionEmbeddings(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = (
+            config.image_size
+            if isinstance(config.image_size, int)
+            else config.image_size[0]
+        )
+        self.patch_size = (
+            config.patch_size
+            if isinstance(config.patch_size, int)
+            else config.patch_size[0]
+        )
+
+        self.class_embedding = nn.Parameter(
+            torch.randn(1, 1, self.embed_dim),
+        )
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+
+        self.position_embedding = nn.Parameter(
+            torch.randn(1, self.num_positions, self.embed_dim)
+        )
+
+    def _get_pos_embed(self, pos_embed, H, W):
+        target_dtype = pos_embed.dtype
+        pos_embed = (
+            pos_embed.float()
+            .reshape(
+                1,
+                self.image_size // self.patch_size,
+                self.image_size // self.patch_size,
+                -1,
+            )
+            .permute(0, 3, 1, 2)
+        )
+        pos_embed = (
+            F.interpolate(pos_embed, size=(H, W), mode="bicubic", align_corners=False)
+            .reshape(1, -1, H * W)
+            .permute(0, 2, 1)
+            .to(target_dtype)
+        )
+        return pos_embed
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(
+            pixel_values
+        )  # shape = [*, channel, width, height]
+        batch_size, _, height, width = patch_embeds.shape
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        position_embedding = torch.cat(
+            [
+                self.position_embedding[:, :1, :],
+                self._get_pos_embed(self.position_embedding[:, 1:, :], height, width),
+            ],
+            dim=1,
+        )
+        embeddings = embeddings + position_embedding.to(target_dtype)
+        return embeddings
+
+
+class InternRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+class InternMLP(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.act = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+NORM2FN = {
+    "rms_norm": InternRMSNorm,
+    "layer_norm": nn.LayerNorm,
+}
+
+
+class InternVisionEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        drop_path_rate: float,
+        quant_config: QuantizationConfig = None,
+    ):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.norm_type = config.norm_type
+        self.attn = InternAttention(config=config, quant_config=quant_config)
+        self.mlp = InternMLP(config)
+        self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+        self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+
+        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.drop_path1 = (
+            DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
+        self.drop_path2 = (
+            DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> Tuple[
+        torch.FloatTensor,
+        Optional[torch.FloatTensor],
+        Optional[Tuple[torch.FloatTensor]],
+    ]:
+        """
+        Args:
+            hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
+        """
+
+        hidden_states = hidden_states + self.drop_path1(
+            self.attn(
+                self.norm1(hidden_states).to(hidden_states.dtype), cu_seqlens=cu_seqlens
+            )
+            * self.ls1
+        )
+
+        hidden_states = hidden_states + self.drop_path2(
+            self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2
+        )
+
+        return hidden_states
+
+
+class InternVisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`InternEncoderLayer`].
+
+    Args:
+        config (`InternConfig`):
+            The corresponding vision configuration for the `InternEncoder`.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        # stochastic depth decay rule
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
+        ]
+        self.layers = nn.ModuleList(
+            [
+                InternVisionEncoderLayer(config, dpr[idx], quant_config)
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        encoder_states = () if output_hidden_states else None
+        hidden_states = inputs_embeds
+
+        cu_seqlens = SingletonCache()
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(hidden_states, cu_seqlens=cu_seqlens)
+            hidden_states = layer_outputs
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states
+        )
+
+
+class InternVisionModel(PreTrainedModel):
+    main_input_name = "pixel_values"
+    _supports_flash_attn_2 = True
+    config_class = PretrainedConfig
+    _no_split_modules = ["InternVisionEncoderLayer"]
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = InternVisionEmbeddings(
+            config,
+        )
+        self.encoder = InternVisionEncoder(config, quant_config)
+
+    def resize_pos_embeddings(self, old_size, new_size, patch_size):
+        pos_emb = self.embeddings.position_embedding
+        _, num_positions, embed_dim = pos_emb.shape
+        cls_emb = pos_emb[:, :1, :]
+        pos_emb = (
+            pos_emb[:, 1:, :]
+            .reshape(1, old_size // patch_size, old_size // patch_size, -1)
+            .permute(0, 3, 1, 2)
+        )
+        pos_emb = F.interpolate(
+            pos_emb.float(),
+            size=new_size // patch_size,
+            mode="bicubic",
+            align_corners=False,
+        )
+        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
+        pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
+        self.embeddings.position_embedding = nn.Parameter(pos_emb)
+        self.embeddings.image_size = new_size
+        logger.info(
+            "Resized position embeddings from {} to {}".format(old_size, new_size)
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        pixel_embeds: Optional[torch.FloatTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        if pixel_values is None and pixel_embeds is None:
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
+
+        if pixel_embeds is not None:
+            hidden_states = pixel_embeds
+        else:
+            if len(pixel_values.shape) == 4:
+                hidden_states = self.embeddings(pixel_values)
+            else:
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs.last_hidden_state
+        pooled_output = last_hidden_state[:, 0, :]
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class InternVLChatModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_flash_attn=True,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+        image_size = config.force_image_size or config.vision_config.image_size
+        patch_size = config.vision_config.patch_size
+        self.patch_size = patch_size
+        self.select_layer = config.select_layer
+        self.template = config.template
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
+        self.downsample_ratio = config.downsample_ratio
+        self.ps_version = config.ps_version
+
+        config.vision_config.use_flash_attn = True if use_flash_attn else False
+        config.llm_config._attn_implementation = (
+            "flash_attention_2" if use_flash_attn else "eager"
+        )
+
+        logger.info(f"num_image_token: {self.num_image_token}")
+        logger.info(f"ps_version: {self.ps_version}")
+
+        self.vision_model = InternVisionModel(config.vision_config)
+        if config.llm_config.architectures[0] == "Qwen2ForCausalLM":
+            self.language_model = Qwen2ForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "InternLM2ForCausalLM":
+            self.language_model = InternLM2ForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "Qwen3MoeForCausalLM":
+            self.language_model = Qwen3MoeForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "GptOssForCausalLM":
+            self.language_model = GptOssForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "Qwen3ForCausalLM":
+            self.language_model = Qwen3ForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        else:
+            raise NotImplementedError(
+                f"{config.llm_config.architectures[0]} is not implemented."
+            )
+
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.llm_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size
+            ),
+            nn.GELU(),
+            nn.Linear(llm_hidden_size, llm_hidden_size),
+        )
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.ps_version == "v1":
+            logger.warn(
+                "In ps_version 'v1', the height and width have not been swapped back, "
+                "which results in a transposed image."
+            )
+        else:
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=False, return_dict=True
+            ).last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=True, return_dict=True
+            ).hidden_states[self.select_layer]
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        pixel_values = torch.cat([item.feature for item in items])
+        image_features = self.extract_feature(pixel_values)
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        hs = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hs
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = mm_inputs.im_start_id
+        im_end_id: int = mm_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        helper = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return helper.pad_input_tokens(input_ids, mm_inputs)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        expert_params_mapping = []
+        if "InternLM2ForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("gate_up_proj", "w1", 0),
+                ("gate_up_proj", "w3", 1),
+            ]
+        elif "Qwen2ForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("qkv_proj", "q_proj", "q"),
+                ("qkv_proj", "k_proj", "k"),
+                ("qkv_proj", "v_proj", "v"),
+                ("gate_up_proj", "gate_proj", 0),
+                ("gate_up_proj", "up_proj", 1),
+            ]
+        elif "Qwen3MoeForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("qkv_proj", "q_proj", "q"),
+                ("qkv_proj", "k_proj", "k"),
+                ("qkv_proj", "v_proj", "v"),
+                ("gate_up_proj", "gate_proj", 0),
+                ("gate_up_proj", "up_proj", 1),
+            ]
+
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.num_experts,
+            )
+        elif "Qwen3ForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("qkv_proj", "q_proj", "q"),
+                ("qkv_proj", "k_proj", "k"),
+                ("qkv_proj", "v_proj", "v"),
+                ("gate_up_proj", "gate_proj", 0),
+                ("gate_up_proj", "up_proj", 1),
+            ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.", r"attn.attn.")
+                    name = name.replace(r"qkv.", r"qkv_proj.")
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    if "wqkv" in name:
+                        config = self.config
+                        kv_groups = (
+                            config.num_attention_heads // config.num_key_value_heads
+                        )
+                        head_dim = config.hidden_size // config.num_attention_heads
+                        loaded_weight = loaded_weight.view(
+                            -1, 2 + kv_groups, head_dim, loaded_weight.shape[-1]
+                        )
+                        wq, wk, wv = torch.split(
+                            loaded_weight, [kv_groups, 1, 1], dim=1
+                        )
+                        wq = wq.reshape(-1, wq.shape[-1])
+                        wk = wk.reshape(-1, wk.shape[-1])
+                        wv = wv.reshape(-1, wv.shape[-1])
+                        weight_loader = param.weight_loader
+                        weight_loader(param, wq, "q")
+                        weight_loader(param, wk, "k")
+                        weight_loader(param, wv, "v")
+                    else:
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        if "vision_model" in name:
+                            loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                                self.config, name, loaded_weight
+                            )
+                        weight_loader(param, loaded_weight)
+
+            loaded_params.add(name)
+        unloaded_params = params_dict.keys() - loaded_params
+        # Skip params that are created by quantization wrappers and are not expected in the ckpt
+        _quant_only_fragments = (
+            "weight_scale",  # per-matrix FP8 scales (e.g., w2_weight_scale, w13_weight_scale)
+        )
+        unloaded_params = {
+            n
+            for n in unloaded_params
+            if not any(frag in n for frag in _quant_only_fragments)
+        }
+        if unloaded_params:
+            raise RuntimeError(
+                f"Some weights are not initialized from checkpoints: {unloaded_params}"
+            )
+        return loaded_params
+
+
+EntryClass = InternVLChatModel
diff --git a/python/sglang/srt/models/kimi_vl.py b/python/sglang/srt/models/kimi_vl.py
new file mode 100644
index 000000000..68ed47b2e
--- /dev/null
+++ b/python/sglang/srt/models/kimi_vl.py
@@ -0,0 +1,313 @@
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: E501
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
+# Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved.
+#
+# The code is based on llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py), but modified for KimiVL.
+#
+# Licensing Information:
+# - Code derived from llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py) is licensed under the Apache License, Version 2.0.
+# - Other parts of the code are licensed under the MIT License.
+#
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import copy
+import logging
+import math
+from collections.abc import Mapping
+from dataclasses import dataclass
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.activations import GELUActivation
+
+from sglang.srt.configs import KimiVLConfig
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+from sglang.srt.models.kimi_vl_moonvit import MoonVitPretrainedModel
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+# For dummy input only
+@dataclass
+class MaxImageTokenMeta:
+    width: int = 1024
+    height: int = 1024
+
+
+class KimiVLMultiModalProjector(nn.Module):
+
+    def __init__(self, config: KimiVLConfig):
+        super().__init__()
+
+        self.hidden_size = (
+            config.vision_config.hidden_size
+            * config.vision_config.merge_kernel_size[0]
+            * config.vision_config.merge_kernel_size[1]
+        )
+
+        self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-5)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = GELUActivation()
+        self.act = QuickGELU()
+        self.linear_2 = nn.Linear(
+            self.hidden_size, config.text_config.hidden_size, bias=True
+        )
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class KimiVLForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: KimiVLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        **kwargs,  # fix init_tts argument error
+    ) -> None:
+        super().__init__()
+        self.config = config
+        assert isinstance(config.vision_config, MoonViTConfig)
+
+        self.vision_tower = MoonVitPretrainedModel(config.vision_config)
+
+        self.multi_modal_projector = KimiVLMultiModalProjector(config=config)
+        self.quant_config = quant_config
+        text_config = copy.deepcopy(config.text_config)
+        text_config.architectures = ["DeepseekV2ForCausalLM"]
+        self.language_model = DeepseekV2ForCausalLM(
+            config=text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = (
+            torch.cat([item.feature for item in items], dim=0)
+            .type(self.vision_tower.dtype)
+            .to(self.vision_tower.device)
+        )
+        image_grid_hws = torch.cat([item.image_grid_hws for item in items], dim=0).to(
+            self.vision_tower.device
+        )
+        image_features = self.vision_tower(pixel_values, image_grid_hws)
+        assert isinstance(image_features, list)
+        # lengths = [x.shape[0] for x in image_features]
+        res = self.multi_modal_projector(torch.cat(image_features))  # .split(lengths)
+        return res
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        config = self.config.text_config
+        _KEYS_TO_MODIFY_MAPPING = {
+            # "language_model.lm_head": "lm_head",
+            # "language_model.model": "language_model",
+        }
+        # only doing this for language model part for now.
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        if not config.use_mla:
+            stacked_params_mapping += [
+                (".qkv_proj", ".q_proj", "q"),
+                (".qkv_proj", ".k_proj", "k"),
+                (".qkv_proj", ".v_proj", "v"),
+            ]
+        if getattr(config, "n_routed_experts", None):
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=config.n_routed_experts,
+            )
+        else:
+            expert_params_mapping = []
+
+        params_dict = dict(self.named_parameters())
+        for args in weights:
+            name, loaded_weight = args[:2]
+            kwargs = args[2] if len(args) > 2 else {}
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
+                if key_to_modify in name:
+                    name = name.replace(key_to_modify, new_key)
+            use_default_weight_loading = False
+            if "vision" in name:
+                if self.vision_tower is not None:
+                    # We only do sharding for language model and
+                    # not vision model for now.
+                    use_default_weight_loading = True
+            else:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id, **kwargs)
+                    break
+                else:
+                    for idx, (
+                        param_name,
+                        weight_name,
+                        expert_id,
+                        shard_id,
+                    ) in enumerate(expert_params_mapping):
+                        if weight_name not in name:
+                            continue
+                        name = name.replace(weight_name, param_name)
+
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name,
+                            expert_id=expert_id,
+                            shard_id=shard_id,
+                            **kwargs,
+                        )
+                        break
+                    else:
+                        use_default_weight_loading = True
+            if use_default_weight_loading:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, **kwargs)
+        self.language_model.post_load_weights()
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: DeepseekV2Config, weight_name: str
+) -> Optional[int]:
+    if hasattr(config, "num_nextn_predict_layers") and (
+        config.num_nextn_predict_layers > 0
+    ):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if weight_name.startswith(f"model.layers.{layer_idx+i}."):
+                return layer_idx + i
+    return None
+
+
+EntryClass = [KimiVLForConditionalGeneration]
diff --git a/python/sglang/srt/models/kimi_vl_moonvit.py b/python/sglang/srt/models/kimi_vl_moonvit.py
new file mode 100644
index 000000000..a16ee5923
--- /dev/null
+++ b/python/sglang/srt/models/kimi_vl_moonvit.py
@@ -0,0 +1,639 @@
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: E501
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
+# This file is meant to be used in kimi_vl.py only
+# Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved.
+#
+# The code is based on llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py), but modified for KimiVL.
+#
+# Licensing Information:
+# - Code derived from llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py) is licensed under the Apache License, Version 2.0.
+# - Other parts of the code are licensed under the MIT License.
+#
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import math
+from copy import deepcopy
+from functools import cached_property
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.activations import ACT2FN, PytorchGELUTanh
+from transformers.modeling_utils import PreTrainedModel
+
+try:
+    from flash_attn.flash_attn_interface import flash_attn_varlen_func
+except ImportError:
+    flash_attn_varlen_func = None
+
+from sglang.srt.configs import MoonViTConfig
+
+
+def multihead_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    q_cu_seqlens: Optional[torch.Tensor] = None,
+    k_cu_seqlens: Optional[torch.Tensor] = None,
+):
+    """Multi-head attention using flash attention 2.
+    This function is used to handle the case where the query, key, and value are packed.
+    Args:
+        q, k, v: tensor of shape (tot_seqlens, num_heads, head_dim).
+        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
+            The first element should be 0 and the last element should be q.shape[0].
+        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
+            The first element should be 0 and the last element should be k.shape[0].
+
+    Returns:
+        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
+            where dim = num_heads * head_dim
+    """
+    if flash_attn_varlen_func is None:
+        raise ImportError(
+            "flash_attn is not installed, this function needs flash_attn_varlen_func from flash_attn"
+        )
+    # Unified format legal check
+    assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
+    assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
+    assert (
+        k_cu_seqlens[-1] == k.shape[0] == v.shape[0]
+    ), "k_cu_seqlens must sum to k.shape[0]"
+    assert q.dtype in [
+        torch.bfloat16,
+        torch.float16,
+    ], f"unsupported dtype {q.dtype} for multihead attn"
+
+    max_seqlen_q = (q_cu_seqlens[1:] - q_cu_seqlens[:-1]).max().item()
+    max_seqlen_k = (k_cu_seqlens[1:] - k_cu_seqlens[:-1]).max().item()
+    attn_out = flash_attn_varlen_func(
+        q,
+        k,
+        v,
+        q_cu_seqlens,
+        k_cu_seqlens,
+        max_seqlen_q,
+        max_seqlen_k,
+        causal=False,
+    )
+    attn_out = attn_out.flatten(start_dim=-2)
+
+    return attn_out
+
+
+def sdpa_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    q_cu_seqlens: Optional[torch.Tensor] = None,
+    k_cu_seqlens: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Multi-head attention using torch scaled dot product attention.
+    This function is used to handle the case where the query, key, and value are packed.
+    Args:
+        q, k, v: tensor of shape (tot_seqlens, num_heads, head_dim).
+        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
+            The first element should be 0 and the last element should be q.shape[0].
+        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
+            The first element should be 0 and the last element should be k.shape[0].
+
+    Returns:
+        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
+            where dim = num_heads * head_dim
+    """
+    # Unified format legal check
+    assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
+    assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
+    seq_length = q.shape[0]
+    attention_mask = torch.zeros(
+        [1, seq_length, seq_length], device=q.device, dtype=torch.bool
+    )
+    for i in range(1, len(q_cu_seqlens)):
+        attention_mask[
+            ...,
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+        ] = True
+    q = q.transpose(0, 1)
+    k = k.transpose(0, 1)
+    v = v.transpose(0, 1)
+    attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
+    attn_output = attn_output.transpose(0, 1)
+    attn_output = attn_output.reshape(seq_length, -1)
+    return attn_output
+
+
+VL_VISION_ATTENTION_FUNCTIONS = {
+    "flash_attention_2": multihead_attention,
+    "sdpa": sdpa_attention,
+}
+
+
+def _apply_rope_input_validation(x, freqs_cis):
+    assert x.ndim == freqs_cis.ndim + 1, (x.shape, freqs_cis.shape)
+    assert x.shape[:-2] == freqs_cis.shape[:-1], (x.shape, freqs_cis.shape)
+    assert x.shape[-1] == 2 * freqs_cis.shape[-1], (x.shape, freqs_cis.shape)
+    assert freqs_cis.dtype == torch.complex64, freqs_cis.dtype
+
+
+def apply_rope(
+    xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Args: (The leading dimensions of all inputs should be the same)
+        xq: query, tensor of shape (..., num_heads, head_dim)
+        xk: key, tensor of shape (..., num_heads, head_dim)
+        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
+    Returns:
+        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
+    """
+    _apply_rope_input_validation(xq, freqs_cis)
+    _apply_rope_input_validation(xk, freqs_cis)
+
+    freqs_cis = freqs_cis.unsqueeze(-2)  # ..., 1, head_dim/2
+    # ..., num_heads, head_dim/2
+    xq_ = torch.view_as_complex(xq.float().view(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().view(*xq.shape[:-1], -1, 2))
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+class Learnable2DInterpPosEmb(nn.Module):
+
+    def __init__(
+        self, height: int, width: int, dim: int, interpolation_mode: str = "bicubic"
+    ) -> None:
+        super().__init__()
+        self.height = height
+        self.width = width
+        self.interpolation_mode = interpolation_mode
+        self.weight = nn.Parameter(torch.empty(height, width, dim))
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.normal_(self.weight)
+
+    def forward(self, x: torch.Tensor, grid_hws: torch.Tensor) -> torch.Tensor:
+        pos_embs = []
+        for shape in grid_hws.tolist():
+            if shape == self.weight.shape[:-1]:
+                pos_embs.append(self.weight.flatten(end_dim=1))
+            else:
+                pos_embs.append(
+                    F.interpolate(
+                        self.weight.permute((2, 0, 1)).unsqueeze(0),
+                        size=shape,
+                        mode=self.interpolation_mode,
+                    )
+                    .squeeze(0)
+                    .permute((1, 2, 0))
+                    .flatten(end_dim=1)
+                )
+        out = x + torch.cat(pos_embs)
+        return out
+
+
+class MoonVisionPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        out_dim: int,
+        in_dim: int = 3,
+        patch_size: Union[int, Tuple[int, int]] = (14, 14),
+        pos_emb_height: int = 14,
+        pos_emb_width: int = 14,
+    ):
+        super().__init__()
+        assert isinstance(
+            patch_size, (int, Sequence)
+        ), f"Invalid patch_size type: {type(patch_size)}"
+        if isinstance(patch_size, int):
+            patch_size = (patch_size, patch_size)
+        assert (
+            len(patch_size) == 2
+        ), f"Expected patch_size to be a tuple of 2, got {patch_size}"
+        self.patch_size = patch_size
+
+        self.proj = nn.Conv2d(
+            in_dim, out_dim, kernel_size=patch_size, stride=patch_size
+        )
+
+        self.pos_emb = Learnable2DInterpPosEmb(
+            height=pos_emb_height, width=pos_emb_width, dim=out_dim
+        )
+
+    def forward(self, x: torch.Tensor, grid_hw: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (L, Channels): input tensor
+            grid_hw (N, 2): grid height and width
+
+        Returns:
+            (L, Cout) tensor
+        """
+        x = self.proj(x).view(x.size(0), -1)
+        # apply positional embedding
+        x = self.pos_emb(x, grid_hw)
+        return x
+
+
+class Rope2DPosEmb(nn.Module):
+    """2D rotary position embedding with multi-resolution support.
+
+    This class is intended to be used in the following way:
+    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
+    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
+    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
+        The rope is shared across all attention layers and all heads.
+
+    Refs:
+    - RoFormer: https://arxiv.org/abs/2104.09864
+    - VisionLLaMA: https://arxiv.org/abs/2403.00522
+    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py
+
+    Args:
+        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
+        max_height (int): the maximum height of the 2D grid
+        max_width (int): the maximum width of the 2D grid
+        theta_base (float): the base of the theta
+        device (str): the device to store the precomputed cis
+    """
+
+    def __init__(
+        self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda"
+    ):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 4 == 0, "dim must be divisible by 4"
+        self.max_height = max_height
+        self.max_width = max_width
+        self.theta_base = theta_base
+        self.device = device
+
+    def extra_repr(self):
+        return f"dim={self.dim}, max_height={self.max_height}, max_width={self.max_width}, theta_base={self.theta_base}"
+
+    @cached_property
+    def precomputed_freqs_cis(self) -> torch.Tensor:
+        """Calculate the cis(freqs) for each position in the 2D grid.
+
+        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
+            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
+            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
+            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
+        """
+        N = self.max_height * self.max_width
+        flat_pos = torch.arange(0, N).float().to(self.device)
+        x_pos = flat_pos % self.max_width
+        y_pos = flat_pos // self.max_width
+        dim_range = (
+            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(self.device)
+        )  # C/4
+        freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
+        x_freqs = torch.outer(x_pos, freqs).float()  # N, C/4
+        y_freqs = torch.outer(y_pos, freqs).float()  # N, C/4
+        x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)  # N, C/4
+        y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)  # N, C/4
+        # N, C/4, 2
+        freqs_cis = torch.cat(
+            [x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1
+        )
+        # max_height, max_width, C/2
+        freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
+        return freqs_cis
+
+    def get_freqs_cis_by_seqlens(self, grid_hws: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            grid_hws (torch.Tensor): containing list of (height, width) or (t, height, width) tuples.
+        Returns:
+            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
+        """
+        shapes = grid_hws.tolist()
+        assert all(
+            1 <= h <= self.max_height and 1 <= w <= self.max_width for h, w in shapes
+        ), (
+            shapes,
+            self.max_height,
+            self.max_width,
+        )
+        freqs_cis = torch.cat(
+            [
+                self.precomputed_freqs_cis[:h, :w].reshape(-1, self.dim // 2)
+                for h, w in shapes
+            ],
+            dim=0,
+        )
+        return freqs_cis
+
+    def get_freqs_cis_by_idx(
+        self, pos_idx: torch.Tensor, pos_idx_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
+            pos_idx_mask: a mask of shape (...), the leading dimensions should be the same as pos_idx.
+                Rope will only be applied to the tokens with True mask. `freqs_cis` for the tokens with False mask with be ones.
+        Return:
+            freqs_cis: tensor of shape (..., dim//2)
+        """
+        assert (
+            pos_idx.shape[:-1] == pos_idx_mask.shape
+            and pos_idx.shape[-1] == 2
+            and pos_idx.ndim == pos_idx_mask.ndim + 1
+        ), (pos_idx.shape, pos_idx_mask.shape)
+        assert pos_idx_mask.dtype == torch.bool, pos_idx_mask.dtype
+
+        shp = pos_idx_mask.shape + (self.dim // 2,)  # ..., head_dim/2
+        freqs_cis = torch.ones(
+            shp, dtype=torch.complex64, device=self.device
+        )  # ..., head_dim/2
+        freqs_cis[pos_idx_mask] = self.precomputed_freqs_cis[
+            pos_idx[..., 0][pos_idx_mask], pos_idx[..., 1][pos_idx_mask]
+        ]
+        return freqs_cis
+
+
+class MLP2(nn.Module):
+    """
+    Args:
+        dims: [in_dim, hidden_dim, out_dim]
+        bias: whether to use bias in linear layer.
+    """
+
+    def __init__(self, dims: list[int], activation, bias=True):
+        super().__init__()
+        assert len(dims) == 3
+        self.fc0 = nn.Linear(dims[0], dims[1], bias=bias)
+        self.fc1 = nn.Linear(dims[1], dims[2], bias=bias)
+        self.activation = activation
+        for m in [self.fc0, self.fc1]:
+            nn.init.trunc_normal_(m.weight, std=math.sqrt(2 / m.in_features))
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.fc0(x)
+        x = self.activation(x)
+        return self.fc1(x)
+
+
+class MoonVitEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        *,
+        attn_implementation: str = "flash_attention_2",  # use fa2 in sglang by default
+        activation=F.gelu,
+        attn_bias: bool = False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads
+        self.attn_implementation = attn_implementation
+
+        self.norm0 = nn.LayerNorm(hidden_dim)
+        self.norm1 = nn.LayerNorm(hidden_dim)
+        self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim], activation)
+        self.wqkv = nn.Linear(hidden_dim, hidden_dim * 3, bias=attn_bias)
+        self.wo = nn.Linear(hidden_dim, hidden_dim, bias=attn_bias)
+
+    def attention_qkvpacked(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rope_freqs_cis: Optional[torch.Tensor] = None,
+    ):
+        """
+        Args:
+            x (torch.Tensor): (batch_size, seqlen, hidden_dim)
+            cu_seqlens (torch.Tensor):
+        """
+        xqkv = self.wqkv(x)
+
+        qkv_shape = xqkv.size()[:-1] + (
+            3,
+            self.num_heads,
+            self.hidden_size_per_attention_head,
+        )
+        # xqkv: (batch_size, seqlen, 3, nheads, headdim)
+        xqkv = xqkv.view(*qkv_shape)
+        xq, xk, xv = torch.unbind(xqkv, dim=-3)
+
+        xq, xk = apply_rope(xq, xk, rope_freqs_cis)
+
+        attn_func = VL_VISION_ATTENTION_FUNCTIONS[self.attn_implementation]
+        attn_out = attn_func(
+            xq, xk, xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens
+        )
+
+        attn_out = self.wo(attn_out)
+        return attn_out
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rope_freqs_cis: Union[torch.Tensor, None] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: non-packed (B, N, D) or packed (L, D). if non-packed, seqlens should be None, if packed, seqlens should be set
+
+        Returns:
+            output: same shape of input, non-packed (B, N, D) for non-packed input, (L, D) for packed input
+        """
+        residual = hidden_states
+        hidden_states = self.norm0(hidden_states)
+        attn_out = self.attention_qkvpacked(
+            hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+        )
+        hidden_states = residual + attn_out
+
+        residual = hidden_states
+        hidden_states = self.mlp(self.norm1(hidden_states))
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class MoonVitEncoder(nn.Module):
+
+    def __init__(
+        self,
+        hidden_dim: int,
+        num_layers: int,
+        block_cfg: dict,
+    ) -> None:
+        super().__init__()
+
+        self.rope_2d = Rope2DPosEmb(
+            block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512
+        )
+        self.blocks = nn.ModuleList(
+            [MoonVitEncoderLayer(**block_cfg) for _ in range(num_layers)]
+        )
+        self.final_layernorm = nn.LayerNorm(hidden_dim)
+
+    def forward(
+        self, hidden_states: torch.Tensor, grid_hw: torch.Tensor
+    ) -> torch.Tensor:
+        rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens(grid_hws=grid_hw)
+
+        lengths = torch.cat(
+            (
+                torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
+                grid_hw[:, 0] * grid_hw[:, 1],
+            )
+        )
+        cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)
+
+        for _, block in enumerate(self.blocks):
+            hidden_states = block(
+                hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+            )
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states
+
+
+def patch_merger(
+    x: torch.Tensor,
+    grid_hw: torch.Tensor,
+    merge_kernel_size: list[int, int] = (2, 2),
+) -> List[torch.Tensor]:
+    d_model = x.size(-1)
+
+    outputs = []
+    pre_sum = 0
+    for x_shape in grid_hw.tolist():
+        height, width = x_shape[0], x_shape[1]
+        # Get the current sequence
+        seq = x[pre_sum : pre_sum + height * width]
+        # Reshape along self.merge_kernel_size and concat to the last dimension
+        kernel_height, kernel_width = merge_kernel_size
+        new_height, new_width = height // kernel_height, width // kernel_width
+        reshaped_seq = seq.view(
+            new_height, kernel_height, new_width, kernel_width, d_model
+        )
+        reshaped_seq = reshaped_seq.permute(0, 2, 1, 3, 4).contiguous()
+        padded_seq = reshaped_seq.view(
+            new_height * new_width, kernel_height * kernel_width, -1
+        )
+        outputs.append(padded_seq)
+        pre_sum += height * width
+
+    return outputs
+
+
+class MoonVitVLProjector(nn.Module):
+
+    def __init__(
+        self,
+        in_channels: int,
+        merge_kernel_size: list[int, int],
+        hidden_act: str = "gelu",
+        ln_eps: float = 1e-5,
+        out_dim: int = 4096,
+    ):
+        super().__init__()
+        self.hidden_size = in_channels * merge_kernel_size[0] * merge_kernel_size[1]
+
+        self.pre_norm = nn.nn.LayerNorm(in_channels, eps=ln_eps)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = ACT2FN[hidden_act]
+        self.linear_2 = nn.Linear(self.hidden_size, out_dim, bias=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(hidden_states).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class MoonVitPretrainedModel(PreTrainedModel):
+    config_class = MoonViTConfig
+    model_type = "moonvit"
+    _no_split_modules = ["PackingTransformer"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+
+    def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        config = deepcopy(config)
+        self.merge_kernel_size = config.merge_kernel_size
+        self.patch_size = config.patch_size
+        self.patch_embed = MoonVisionPatchEmbed(
+            out_dim=config.hidden_size,
+            patch_size=config.patch_size,
+            pos_emb_height=config.init_pos_emb_height,
+            pos_emb_width=config.init_pos_emb_width,
+        )
+
+        self.encoder = MoonVitEncoder(
+            hidden_dim=config.hidden_size,
+            num_layers=config.num_hidden_layers,
+            block_cfg={
+                "num_heads": config.num_attention_heads,
+                "hidden_dim": config.hidden_size,
+                "mlp_dim": config.intermediate_size,
+                "activation": PytorchGELUTanh(),
+                "attn_bias": True,
+                "attn_implementation": config._attn_implementation,
+            },
+        )
+
+    def forward(
+        self, pixel_values: torch.Tensor, grid_hw: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            pixel_values (torch.Tensor): The input pixel values.
+            grid_hw (torch.Tensor): The grid height and width.
+
+        Returns:
+            torch.Tensor: The output tokens.
+        """
+        hidden_states = self.patch_embed(pixel_values, grid_hw)
+        hidden_states = self.encoder(hidden_states, grid_hw)
+        hidden_states = patch_merger(
+            hidden_states, grid_hw, merge_kernel_size=self.merge_kernel_size
+        )
+        return hidden_states
diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py
new file mode 100644
index 000000000..fc0ce930a
--- /dev/null
+++ b/python/sglang/srt/models/llama.py
@@ -0,0 +1,749 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class LlamaMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        use_reduce_scatter: bool = False,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=use_reduce_scatter,
+        )
+        return x
+
+
+class LlamaAttention(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class LlamaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support llamafy/Qwen-Qwen2.5-7B-Instruct-llamafied with attention_bias
+        # Support internlm/internlm-7b with bias
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        self.self_attn = LlamaAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=attention_bias,
+        )
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: LlamaDecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            # FIXME(@ying): reduce the number of proxy tensors by not fusing layer norms
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+            deferred_norm = None
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class LlamaForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+        ".gate_proj": (".gate_up_proj", 0),
+        ".up_proj": (".gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        self.capture_aux_hidden_states = False
+
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return LlamaModel(config, quant_config=quant_config, prefix=prefix)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ) -> Optional[LogitsProcessorOutput]:
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_module_name_from_weight_name(self, name):
+        for param_name, weight_name, shard_id, num_shard in self.stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            # Handle FP8 kv-scale remapping
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading kv_scale from ckpts towards new design.
+                if name.endswith(".kv_scale") and name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100, tp_size: int = 1
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        try:
+            if name == "lm_head.weight" and self.config.tie_word_embeddings:
+                logger.info(
+                    "word embedding is tied for this model, return embed_tokens.weight as lm_head.weight."
+                )
+                return (
+                    self.model.embed_tokens.weight.cpu()
+                    .to(torch.float32)
+                    .numpy()
+                    .tolist()[:truncate_size]
+                )
+
+            mapped_name = name
+            mapped_shard_id = None
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name in name:
+                    mapped_name = name.replace(weight_name, param_name)
+                    mapped_shard_id = shard_id
+                    break
+            params_dict = dict(self.named_parameters())
+            param = params_dict[mapped_name]
+            if mapped_shard_id is not None:
+                if mapped_shard_id in ["q", "k", "v"]:
+                    num_heads = self.config.num_attention_heads // tp_size
+                    num_kv_heads = self.config.num_key_value_heads // tp_size
+                    head_dim = (
+                        self.config.hidden_size // self.config.num_attention_heads
+                    )
+                    if mapped_shard_id == "q":
+                        offset = 0
+                        size = num_heads * head_dim
+                    elif mapped_shard_id == "k":
+                        offset = num_heads * head_dim
+                        size = num_kv_heads * head_dim
+                    elif mapped_shard_id == "v":
+                        offset = (num_heads + num_kv_heads) * head_dim
+                        size = num_kv_heads * head_dim
+                    weight = param.data.narrow(0, offset, size)
+                elif mapped_shard_id in [0, 1]:
+                    intermediate_size = self.config.intermediate_size
+                    slice_size = intermediate_size // tp_size
+                    if mapped_shard_id == 0:  # gate_proj
+                        offset = 0
+                        size = slice_size
+                    elif mapped_shard_id == 1:  # up_proj
+                        offset = slice_size
+                        size = slice_size
+
+                    weight = param.data.narrow(0, offset, size)
+                else:
+                    weight = param.data
+            else:
+                weight = param.data
+            if tp_size > 1 and ("o_proj" in name or "down_proj" in name):
+                gathered_weights = [torch.zeros_like(weight) for _ in range(tp_size)]
+                torch.distributed.all_gather(gathered_weights, weight)
+                weight = torch.cat(gathered_weights, dim=1)
+            return weight.cpu().to(torch.float32).numpy().tolist()[:truncate_size]
+
+        except Exception:
+            logger.error(
+                f"Error getting weights by name {name} in LlamaForCausalLM: {get_exception_traceback()}"
+            )
+            return None
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            # we plus 1 here because in sglang, for the ith layer, it takes the output
+            # of the (i-1)th layer as aux hidden state
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+class Phi3ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class InternLM3ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+EntryClass = [LlamaForCausalLM, Phi3ForCausalLM, InternLM3ForCausalLM]
diff --git a/python/sglang/srt/models/llama4.py b/python/sglang/srt/models/llama4.py
new file mode 100644
index 000000000..2d2a60730
--- /dev/null
+++ b/python/sglang/srt/models/llama4.py
@@ -0,0 +1,561 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/v0.8.3/vllm/model_executor/models/llama4.py
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Llama4TextConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import (
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+)
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaMLP
+from sglang.srt.utils import (
+    add_prefix,
+    fast_topk,
+    get_compiler_backend,
+    is_cuda,
+    make_layers,
+)
+
+_is_cuda = is_cuda()
+
+logger = logging.getLogger(__name__)
+
+
+class Llama4MoE(nn.Module):
+
+    @torch.compile(dynamic=True, backend=get_compiler_backend())
+    @staticmethod
+    def custom_routing_function(
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        router_scores_aK, router_indices_aK = fast_topk(gating_output, topk, dim=-1)
+        router_scores_aK = torch.sigmoid(router_scores_aK.float()).to(
+            hidden_states.dtype
+        )
+        return (
+            router_scores_aK.view(-1).reshape(router_scores_aK.shape),
+            router_indices_aK.to(torch.int32),
+        )
+
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.top_k = config.num_experts_per_tok
+        self.device_module = torch.get_device_module()
+
+        intermediate_size_moe = config.intermediate_size
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("router", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=False,
+            custom_routing_function=Llama4MoE.custom_routing_function,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_local_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size_moe,
+            layer_id=layer_id,
+            reduce_results=False,
+            quant_config=quant_config,
+            apply_router_weight_on_input=True,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.shared_expert = LlamaMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size_moe,
+            hidden_act="silu",
+            quant_config=quant_config,
+            prefix=add_prefix("shared_expert", prefix),
+            reduce_results=False,  # We need to do scatter before reduce
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        forward_batch: ForwardBatch,
+        use_reduce_scatter: bool = False,
+    ):
+        shared_out, routed_out = self._forward_core(
+            hidden_states, forward_batch.forward_mode
+        )
+
+        out_aD = routed_out + shared_out
+
+        if self.tp_size > 1 and not use_reduce_scatter:
+            out_aD = tensor_model_parallel_all_reduce(out_aD)
+
+        return out_aD
+
+    def _forward_core(self, hidden_states, forward_mode: ForwardMode):
+        if hidden_states.shape[0] < 4 and _is_cuda:
+            return self._forward_core_shared_routed_overlap(hidden_states)
+        else:
+            return self._forward_core_normal(hidden_states)
+
+    def _forward_core_normal(self, hidden_states):
+        # router_scores: [num_tokens, num_experts]
+        router_logits, _ = self.router(hidden_states)
+        shared_out = self.shared_expert(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        routed_out = self.experts(hidden_states, topk_output)
+        return shared_out, routed_out
+
+    def _forward_core_shared_routed_overlap(self, hidden_states):
+        alt_stream = _get_or_create_alt_stream(self.device_module)
+
+        alt_stream.wait_stream(self.device_module.current_stream())
+
+        shared_out = self.shared_expert(hidden_states)
+
+        with self.device_module.stream(alt_stream):
+            # router_scores: [num_tokens, num_experts]
+            router_logits, _ = self.router(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+            routed_out = self.experts(hidden_states, topk_output)
+        self.device_module.current_stream().wait_stream(alt_stream)
+
+        return shared_out, routed_out
+
+
+_alt_stream = None
+
+
+def _get_or_create_alt_stream(device_module):
+    global _alt_stream
+    if _alt_stream is None:
+        _alt_stream = device_module.Stream()
+    return _alt_stream
+
+
+class Llama4Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        layer_id: int,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.use_rope = (layer_id + 1) % 4 != 0
+        self.use_qk_norm = config.use_qk_norm and self.use_rope
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = config.head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.attn_temperature_tuning = config.attn_temperature_tuning
+        self.floor_scale = config.floor_scale
+        self.attn_scale = config.attn_scale
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.n_rep = self.num_heads // self.num_kv_heads
+        self.qk_norm = (
+            RMSNorm(
+                hidden_size=self.head_dim,
+                eps=config.rms_norm_eps,
+            )
+            if self.use_qk_norm
+            else None
+        )
+
+        qkv_quant_config = quant_config
+        o_quant_config = quant_config
+        if quant_config and hasattr(quant_config, "ignore") and quant_config.ignore:
+            if add_prefix("q_proj", prefix) in quant_config.ignore:
+                qkv_quant_config = None
+            if add_prefix("o_proj", prefix) in quant_config.ignore:
+                o_quant_config = None
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=qkv_quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias_o_proj,
+            quant_config=o_quant_config,
+            prefix=add_prefix("o_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+        )
+        is_neox_style = True
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type in ["llama", "llama4"]:
+            is_neox_style = False
+
+        self.rotary_emb = (
+            get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=int(rope_theta),
+                rope_scaling=rope_scaling if rope_scaling != "default" else None,
+                is_neox_style=is_neox_style,
+            )
+            if self.use_rope
+            else None
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+            use_irope=self.use_rope,
+        )
+
+    def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
+        floor = torch.floor((positions + 1.0) / self.floor_scale)
+        attn_scale = torch.log(floor + 1.0) * self.attn_scale + 1.0
+        return attn_scale.unsqueeze(-1)
+
+    @torch.compile(dynamic=True, backend=get_compiler_backend())
+    def _mul_attn_scale(self, positions, q):
+        attn_scale = self._get_attn_scale(positions)
+        return (q * attn_scale).to(q.dtype)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
+
+        if self.rotary_emb is not None:
+            q_view, k_view = qk.split([self.q_size, self.kv_size], dim=-1)
+            q_out_unused, k_out_unused = self.rotary_emb(positions, q_view, k_view)
+            del q_view, k_view, q_out_unused, k_out_unused
+
+        if self.qk_norm is not None:
+            # TODO there are still 2 redundant direct_copy_kernel_cuda for this `reshape` and (in attn backend) q.contiguous(), maybe we can fuse them later
+            qk = qk.reshape(-1, self.head_dim).contiguous().bfloat16()
+            qk = self.qk_norm(qk).to(torch.bfloat16)
+            qk = qk.reshape(-1, self.q_size + self.kv_size)
+
+        q, k = qk.split([self.q_size, self.kv_size], dim=-1)
+
+        # We are applying temperature tuning (https://arxiv.org/abs/2501.19399) to NoPE layers, where
+        # the inference-time temperature tuning function is customized to not affect short context
+        # while working at very long context
+        # https://arxiv.org/abs/2501.19399
+        if self.attn_temperature_tuning and not self.use_rope:
+            q = self._mul_attn_scale(positions=positions, q=q)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Llama4DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        rope_theta = config.rope_theta
+        rope_scaling = config.rope_scaling
+        max_position_embeddings = config.max_position_embeddings
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.self_attn = Llama4Attention(
+            config=config,
+            layer_id=layer_id,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=False,
+            bias_o_proj=False,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.config = config
+        is_moe_layer = self._is_moe_layer(layer_id)
+        is_previous_moe_layer = self._is_moe_layer(layer_id - 1)
+
+        if is_moe_layer:
+            self.feed_forward = Llama4MoE(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("feed_forward", prefix),
+            )
+        else:
+            self.feed_forward = LlamaMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size_mlp,
+                hidden_act="silu",
+                quant_config=quant_config,
+                prefix=add_prefix("feed_forward", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=is_moe_layer,
+            is_previous_layer_sparse=is_previous_moe_layer,
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def _is_moe_layer(self, layer_id: int) -> bool:
+        if self.config.interleave_moe_layer_step == 0:
+            return self.config.num_local_experts > 0
+        return (layer_id + 1) % self.config.interleave_moe_layer_step == 0
+
+    def get_intermediate_size(self) -> int:
+        if isinstance(self.feed_forward, Llama4MoE):
+            return self.config.intermediate_size
+        else:
+            return self.config.intermediate_size_mlp
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        # Fully Connected
+        hidden_states = self.feed_forward(
+            hidden_states, forward_batch, use_reduce_scatter
+        )
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Llama4Model(nn.Module):
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+            enable_tp=not is_dp_attention_enabled(),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Llama4DecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        aux_hidden_states = []
+        for i in range(len(self.layers)):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class Llama4ForCausalLM(LlamaForCausalLM):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config, quant_config, prefix)
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def get_layers(self):
+        return self.model.layers
+
+    def _init_model(
+        self,
+        config: Llama4TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return Llama4Model(config, quant_config=quant_config, prefix=prefix)
+
+
+EntryClass = [Llama4ForCausalLM]
diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py
new file mode 100644
index 000000000..8387d2030
--- /dev/null
+++ b/python/sglang/srt/models/llama_classification.py
@@ -0,0 +1,81 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
+from sglang.srt.utils import add_prefix
+
+
+class LlamaForClassification(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        self.classification_head = nn.Linear(
+            config.hidden_size, config.classification_out_size, bias=False
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForClassification is only used for embedding. Please add --is-embedding when you launch the server."
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.classification_head(last_token_hidden)
+
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "classification_head" in name:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            elif "lm_head" in name:
+                continue
+            else:
+                LlamaForCausalLM.load_weights(self, [(name, loaded_weight)])
+
+
+EntryClass = LlamaForClassification
diff --git a/python/sglang/srt/models/llama_eagle.py b/python/sglang/srt/models/llama_eagle.py
new file mode 100644
index 000000000..881731fdf
--- /dev/null
+++ b/python/sglang/srt/models/llama_eagle.py
@@ -0,0 +1,149 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from sglang.srt.utils import add_prefix
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix)
+
+        # Skip the input_layernorm
+        # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+        if layer_id == 0:
+            del self.input_layernorm
+            setattr(self, "input_layernorm", lambda x: x)
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        hidden_states = self.fc(
+            torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        return hidden_states + residual
+
+
+class LlamaForCausalLMEagle(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                getattr(config, "hot_vocab_size", config.vocab_size),
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        for name, loaded_weight in weights:
+            if "lm_head" not in name:
+                name = "model." + name
+                super().load_weights([(name, loaded_weight)])
+
+
+EntryClass = [LlamaForCausalLMEagle]
diff --git a/python/sglang/srt/models/llama_eagle3.py b/python/sglang/srt/models/llama_eagle3.py
new file mode 100644
index 000000000..87ae7ade5
--- /dev/null
+++ b/python/sglang/srt/models/llama_eagle3.py
@@ -0,0 +1,266 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from sglang.srt.utils import add_prefix
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaMLP
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix)
+
+        # override qkv
+        self.self_attn.qkv_proj = QKVParallelLinear(
+            2 * self.hidden_size,
+            self.self_attn.head_dim,
+            self.self_attn.total_num_heads,
+            self.self_attn.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        if config.model_type == "llama4_text":
+            inter_size = config.intermediate_size_mlp
+        else:
+            inter_size = config.intermediate_size
+
+        self.mlp = LlamaMLP(
+            config.hidden_size, inter_size, config.hidden_act, quant_config, prefix
+        )
+
+        self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        embeds: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        residual = hidden_states
+        embeds = self.input_layernorm(embeds)
+        hidden_states = self.hidden_norm(hidden_states)
+
+        hidden_states = torch.cat([embeds, hidden_states], dim=-1)
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+
+        # Fully Connected
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.is_mrope_enabled = (
+            hasattr(config, "rope_scaling")
+            and config.rope_scaling is not None
+            and "mrope_section" in config.rope_scaling
+        )
+        # fix rope_scaling for qwen2.5-vl
+        if self.is_mrope_enabled:
+            config.rope_scaling["rope_type"] = "default"
+
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        if hasattr(config, "target_hidden_size"):
+            self.hidden_size_in = config.target_hidden_size
+        else:
+            self.hidden_size_in = config.hidden_size
+
+        self.fc = torch.nn.Linear(
+            self.hidden_size_in * 3,
+            config.hidden_size,
+            bias=getattr(config, "bias", False),
+        )
+
+        self.midlayer = LlamaDecoderLayer(config, 0, quant_config, prefix)
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            embeds = self.embed_tokens(input_ids)
+        else:
+            embeds = input_embeds
+
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        hidden_states = forward_batch.spec_info.hidden_states
+        if hidden_states.shape[-1] != embeds.shape[-1]:
+            hidden_states = self.fc(hidden_states)
+
+        residual = None
+        hidden_states, residual = self.midlayer(
+            positions,
+            embeds,
+            hidden_states,
+            forward_batch,
+            residual,
+        )
+
+        hidden_states_to_logits, hidden_states_to_aux = self.norm(
+            hidden_states, residual
+        )
+
+        # For draft decode, we capture the hidden state before norm
+        return hidden_states_to_logits, [hidden_states_to_aux]
+
+
+class LlamaForCausalLMEagle3(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+
+        if self.config.num_hidden_layers != 1:
+            raise ValueError("EAGLE3 currently only supports 1 layer")
+
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        self.load_lm_head_from_target = False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            if config.draft_vocab_size is None:
+                self.load_lm_head_from_target = True
+                config.draft_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                config.draft_vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = True
+        self.hot_token_id = None
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
+        params_dict = dict(self.named_parameters())
+        # Define the parameter mapping for stacked parameters
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        for name, loaded_weight in weights:
+            if "d2t" in name:
+                # d2t stores diffs between draft id and target id
+                self.hot_token_id = loaded_weight + torch.arange(loaded_weight.shape[0])
+                continue
+
+            if "t2d" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param_name = f"model.{name}" if name not in params_dict else name
+                if param_name in params_dict:
+                    param = params_dict[param_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Handle regular parameters
+                param_name = name if name in params_dict else f"model.{name}"
+                if param_name in params_dict:
+                    param = params_dict[param_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+    def get_hot_token_id(self):
+        return self.hot_token_id
+
+
+EntryClass = [LlamaForCausalLMEagle3]
diff --git a/python/sglang/srt/models/llama_embedding.py b/python/sglang/srt/models/llama_embedding.py
new file mode 100644
index 000000000..ba448f7fc
--- /dev/null
+++ b/python/sglang/srt/models/llama_embedding.py
@@ -0,0 +1,87 @@
+from typing import Iterable, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.model_executor.model_runner import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaModel
+from sglang.srt.utils import add_prefix
+
+
+class LlamaEmbeddingModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config=None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaEmbeddingModel / MistralModel is only used for embedding"
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.model.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                return
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                return
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                return
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    return
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class MistralModel(LlamaEmbeddingModel):
+    pass
+
+
+EntryClass = [LlamaEmbeddingModel, MistralModel]
diff --git a/python/sglang/srt/models/llama_reward.py b/python/sglang/srt/models/llama_reward.py
new file mode 100644
index 000000000..2f78dfa1b
--- /dev/null
+++ b/python/sglang/srt/models/llama_reward.py
@@ -0,0 +1,126 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
+from sglang.srt.utils import add_prefix
+
+
+class LlamaForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.num_labels = config.num_labels
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.score(last_token_hidden)
+
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        return LlamaForCausalLM.load_weights(self, weights)
+
+
+class LlamaForSequenceClassificationWithNormal_Weights(LlamaForSequenceClassification):
+    class Weights(torch.nn.Module):
+        def __init__(self, hidden_size, num_label):
+            super().__init__()
+            self.fc = torch.nn.Sequential(
+                torch.nn.Linear(hidden_size, hidden_size, dtype=torch.float16),
+                torch.nn.SELU(),
+                torch.nn.Linear(hidden_size, hidden_size, dtype=torch.float16),
+                torch.nn.SELU(),
+                torch.nn.Linear(hidden_size, num_label // 2, dtype=torch.float16),
+            )
+
+        def forward(self, x):
+            return self.fc(x.to(torch.float16))
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix=prefix)
+        self.weights = self.Weights(config.hidden_size, self.num_labels)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForSequenceClassification is only used for embedding"
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        weights = self.weights(hidden_states)
+
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+        pooled_weights = self.pooler(weights, forward_batch).embeddings
+
+        rews = pooled_logits.view(-1, self.num_labels // 2, 2)[:, :, 0].view(
+            -1, self.num_labels // 2
+        )
+        scores = (rews * pooled_weights).sum(dim=-1).view(-1, 1)
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        return super().load_weights(weights)
+
+
+EntryClass = [
+    LlamaForSequenceClassification,
+    LlamaForSequenceClassificationWithNormal_Weights,
+]
diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py
new file mode 100644
index 000000000..2fbbe5590
--- /dev/null
+++ b/python/sglang/srt/models/llava.py
@@ -0,0 +1,843 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only LLaVa model compatible with HuggingFace weights."""
+
+import math
+import re
+from functools import lru_cache
+from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import (
+    CLIPVisionConfig,
+    CLIPVisionModel,
+    LlavaConfig,
+    MistralConfig,
+    Qwen2Config,
+    SiglipVisionModel,
+)
+from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+
+# leave till last and symbol only in case circular import
+import sglang.srt.models as sgl_models
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import general_mm_embed_routine
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.srt.models.mistral import MistralForCausalLM
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.multimodal.mm_utils import (
+    get_anyres_image_grid_shape,
+    unpad_image,
+    unpad_image_shape,
+)
+from sglang.srt.utils import add_prefix, flatten_nested_list, logger
+
+
+class LlavaBaseForCausalLM(nn.Module):
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        image_sizes = flatten_nested_list(
+            [item.image_sizes for item in image_inputs.mm_items]
+        )
+
+        pad_values = [item.pad_value for item in image_inputs.mm_items]
+
+        # hardcode for spatial_unpad + anyres
+        if any(
+            item.modality == Modality.MULTI_IMAGES or item.modality == Modality.VIDEO
+            for item in image_inputs.mm_items
+        ):
+            image_aspect_ratio = "pad"
+        else:
+            image_aspect_ratio = "anyres"
+        offset_list = []
+        image_inputs.image_pad_len = []
+        for image_idx, image_s in enumerate(image_sizes):
+            if len(image_sizes) > 16:
+                # 2x2 pooling with stride 2
+                new_image_feature_len = (
+                    math.ceil(self.image_size / self.patch_size / 2) ** 2
+                )
+            else:
+                new_image_feature_len = self.image_feature_len  # multi-image
+
+            height = width = self.num_patches_per_side
+            if "anyres" in image_aspect_ratio:
+                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                    image_s,
+                    self.image_grid_pinpoints,
+                    self.vision_tower.config.image_size,
+                )
+                h = num_patch_height * height
+                w = num_patch_width * width
+                new_h, new_w = unpad_image_shape(h, w, image_s)
+
+                if "anyres_max" in self.config.image_aspect_ratio:
+                    matched_anyres_max_num_patches = re.match(
+                        r"anyres_max_(\d+)", self.config.image_aspect_ratio
+                    )
+                    if matched_anyres_max_num_patches:
+                        max_num_patches = int(matched_anyres_max_num_patches.group(1))
+                    # times = math.sqrt(h * w / (max_num_patches * unit**2))
+                    times = math.sqrt(
+                        new_h * new_w / (max_num_patches * self.image_feature_len)
+                    )
+                    if times > 1.1:
+                        new_h = int(new_h // times)
+                        new_w = int(new_w // times)
+                new_image_feature_len += new_h * (new_w + 1)
+
+            try:
+                offset = input_ids.index(self.config.image_token_index)
+            except ValueError:
+                offset = 0
+            # old_len + pad_len - 1, because we need to remove image_token_id
+            input_ids = (
+                input_ids[:offset]
+                + [pad_values[image_idx % len(pad_values)]] * new_image_feature_len
+                + input_ids[offset + 1 :]
+            )
+            offset_list.append(offset)
+            image_inputs.image_pad_len.append(new_image_feature_len)
+
+        image_inputs.image_offsets = offset_list
+        return input_ids
+
+    def encode_images(
+        self, pixel_values: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> torch.Tensor:
+        """
+        encode images by vision tower and multimodal projector
+        Args:
+            pixel_values: torch.Tensor or List[torch.Tensor]: each tensor for an input image
+        Returns:
+            torch.Tensor: encoded image features from the input image; if multiple, flattened by seq_len axis
+        """
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
+        selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
+        if self.vision_feature_select_strategy in ["default", "patch"]:
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif self.vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(
+                f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
+            )
+        image_features = self.multi_modal_projector(selected_image_feature)
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        image_inputs = forward_batch.mm_inputs
+
+        if forward_batch.forward_mode.is_extend():
+            # Clamp input ids. This is because the input_ids for the image tokens are
+            # filled with the hash values of the image for the prefix matching in the radix attention.
+            # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+            input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
+            # Embed text inputs
+            input_embeds = self.language_model.model.embed_tokens(input_ids)
+
+            # Got List[List[str]] extend it to List[str]
+            # The length of the List should be equal to batch size
+            modalities_list = []
+            max_image_offset = []
+            for im in image_inputs:
+                if im:
+                    modalities_list.extend([item.modality for item in im.mm_items])
+                if im and im.image_offsets:
+                    max_image_offset.append(
+                        np.max(np.array(im.image_offsets) + np.array(im.image_pad_len))
+                    )
+                else:
+                    max_image_offset.append(-1)
+
+            start_positions = positions[forward_batch.extend_start_loc].cpu().numpy()
+            need_vision = start_positions <= np.array(max_image_offset)
+
+            if need_vision.any():
+                bs = forward_batch.batch_size
+                pixel_values = flatten_nested_list(
+                    [
+                        [item.feature for item in image_inputs[i].mm_items]
+                        for i in range(bs)
+                        if need_vision[i]
+                    ]
+                )
+                image_sizes = [
+                    flatten_nested_list(
+                        [item.image_sizes for item in image_inputs[i].mm_items]
+                    )
+                    for i in range(bs)
+                    if need_vision[i]
+                ]
+
+                ########## Encode Image ########
+
+                if pixel_values[0].ndim == 4:
+                    # llava-hd: BS, num_patch, C=3, H=336, W=336, num_patch obtained from process_images
+                    np.concatenate(pixel_values, axis=0)
+                    # ndim=4
+                    concat_images = torch.tensor(
+                        np.concatenate(pixel_values, axis=0),
+                        device=self.vision_tower.device,
+                    )
+                    image_features = self.encode_images(concat_images)
+                    split_sizes = [image.shape[0] for image in pixel_values]
+                    image_features = torch.split(image_features, split_sizes, dim=0)
+                    # hd image_features: BS, num_patch, 576, 4096
+                else:
+                    # normal pixel: BS, C=3, H=336, W=336
+                    pixel_values = torch.tensor(
+                        np.array(pixel_values), device=self.vision_tower.device
+                    )
+                    image_features = self.encode_images(pixel_values)
+                    # image_features: BS, 576, 4096
+
+                if self.mm_patch_merge_type.startswith("spatial"):
+                    new_image_features = []
+                    height = width = self.num_patches_per_side
+                    for image_idx, image_feature in enumerate(image_features):
+                        if modalities_list[image_idx] == Modality.IMAGE:
+                            image_aspect_ratio = (
+                                self.config.image_aspect_ratio
+                            )  # single image
+                        elif (
+                            modalities_list[image_idx] == Modality.MULTI_IMAGES
+                            or modalities_list[image_idx] == Modality.VIDEO
+                        ):
+                            image_aspect_ratio = "pad"  # multi image
+                        # image_aspect_ratio = (
+                        #     "anyres" if len(image_sizes[image_idx]) == 1 else "pad"
+                        # )
+                        if (
+                            image_feature.shape[0] > 1
+                            and "anyres" in image_aspect_ratio
+                            and modalities_list[image_idx] == Modality.IMAGE
+                        ):
+                            base_image_feature = image_feature[0]
+                            image_feature = image_feature[1:]
+                            assert height * width == base_image_feature.shape[0]
+
+                            if "anyres_max" in image_aspect_ratio:
+                                matched_anyres_max_num_patches = re.match(
+                                    r"anyres_max_(\d+)", image_aspect_ratio
+                                )
+                                if matched_anyres_max_num_patches:
+                                    max_num_patches = int(
+                                        matched_anyres_max_num_patches.group(1)
+                                    )
+
+                            if (
+                                image_aspect_ratio == "anyres"
+                                or "anyres_max" in image_aspect_ratio
+                            ):
+                                vision_tower_image_size = self.image_size
+                                try:
+                                    num_patch_width, num_patch_height = (
+                                        get_anyres_image_grid_shape(
+                                            image_sizes[image_idx][0],
+                                            self.config.image_grid_pinpoints,
+                                            vision_tower_image_size,
+                                        )
+                                    )
+                                except Exception as e:
+                                    print(f"Error: {e}")
+                                    num_patch_width, num_patch_height = 2, 2
+                                image_feature = image_feature.view(
+                                    num_patch_height, num_patch_width, height, width, -1
+                                )
+                            else:
+                                image_feature = image_feature.view(
+                                    2, 2, height, width, -1
+                                )
+
+                            # (
+                            #     num_patch_width,
+                            #     num_patch_height,
+                            # ) = get_anyres_image_grid_shape(
+                            #     image_sizes[image_idx][0],
+                            #     self.image_grid_pinpoints,
+                            #     self.vision_tower.config.image_size,
+                            # )
+
+                            # image_feature = image_feature.view(
+                            #     num_patch_height, num_patch_width, height, width, -1
+                            # )
+
+                            if "unpad" in self.mm_patch_merge_type:
+                                unit = image_feature.shape[2]
+                                image_feature = image_feature.permute(
+                                    4, 0, 2, 1, 3
+                                ).contiguous()
+                                image_feature = image_feature.flatten(1, 2).flatten(
+                                    2, 3
+                                )
+                                image_feature = unpad_image(
+                                    image_feature, image_sizes[image_idx][0]
+                                )
+                                if (
+                                    "anyres_max" in image_aspect_ratio
+                                    and matched_anyres_max_num_patches
+                                ):
+                                    c, h, w = image_feature.shape
+                                    times = math.sqrt(
+                                        h * w / (max_num_patches * unit**2)
+                                    )
+                                    if times > 1.1:
+                                        image_feature = image_feature[None]
+                                        image_feature = nn.functional.interpolate(
+                                            image_feature,
+                                            [int(h // times), int(w // times)],
+                                            mode="bilinear",
+                                        )[0]
+                                image_feature = torch.cat(
+                                    (
+                                        image_feature,
+                                        self.language_model.model.image_newline[
+                                            :, None, None
+                                        ].expand(*image_feature.shape[:-1], 1),
+                                    ),
+                                    dim=-1,
+                                )
+                                image_feature = image_feature.flatten(1, 2).transpose(
+                                    0, 1
+                                )
+                            else:
+                                image_feature = image_feature.permute(
+                                    0, 2, 1, 3, 4
+                                ).contiguous()
+                                image_feature = image_feature.flatten(0, 3)
+                            image_feature = torch.cat(
+                                (base_image_feature, image_feature), dim=0
+                            )
+                            image_feature = image_feature.unsqueeze(0)
+                        else:
+                            if modalities_list[image_idx] == Modality.VIDEO:  # video
+                                # 2x2 pooling
+                                num_of_frames = image_feature.shape[0]
+                                image_feature = image_feature.view(
+                                    num_of_frames, height, width, -1
+                                )
+                                image_feature = image_feature.permute(
+                                    0, 3, 1, 2
+                                ).contiguous()  # N, C, H, W
+                                height, weight = image_feature.shape[2:]
+                                scaled_shape = [
+                                    math.ceil(height / 2),
+                                    math.ceil(weight / 2),
+                                ]
+                                image_feature = nn.functional.interpolate(
+                                    image_feature, size=scaled_shape, mode="bilinear"
+                                )
+                                image_feature = (
+                                    image_feature.flatten(2)
+                                    .transpose(1, 2)
+                                    .contiguous()
+                                )  # N, C, H*W
+                            if "unpad" in self.mm_patch_merge_type:
+                                image_feature = torch.cat(
+                                    (
+                                        image_feature,
+                                        # Expand to (bs, 1, hidden_dim) and concat at the end of the image tokens
+                                        self.language_model.model.image_newline[
+                                            None, None
+                                        ].expand(
+                                            image_feature.shape[0],
+                                            1,
+                                            image_feature.shape[-1],
+                                        ),
+                                    ),
+                                    dim=1,
+                                )
+
+                        new_image_features.append(image_feature)
+                    image_features = new_image_features
+
+                # Fill in the placeholder for the image
+                extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
+                extend_seq_lens = forward_batch.extend_seq_lens.cpu().numpy()
+                prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
+                pt = 0
+                for i in range(bs):
+                    if not need_vision[i]:
+                        continue
+
+                    start_idx = extend_start_loc_cpu[i]
+                    seq_len = extend_seq_lens[i]
+                    prefix_len = prefix_lens_cpu[i]
+
+                    # Multiple images
+                    for image_idx, image_offset in enumerate(
+                        image_inputs[i].image_offsets
+                    ):
+                        if (
+                            image_offset + image_inputs[i].image_pad_len[image_idx]
+                            <= prefix_len
+                        ):
+                            continue
+                        if image_offset >= prefix_len + seq_len:
+                            break
+
+                        tmp_image_feature = image_features[pt][image_idx]
+                        pad_len = tmp_image_feature.shape[0]
+
+                        input_offset = image_offset - prefix_len
+                        left_idx = start_idx + input_offset
+                        right_idx = left_idx + pad_len
+                        assert right_idx > start_idx
+                        if input_offset < 0:
+                            left_idx = start_idx
+                            tmp_image_feature = tmp_image_feature[-input_offset:]
+                        if right_idx > start_idx + seq_len:
+                            tmp_image_feature = tmp_image_feature[
+                                : start_idx + seq_len - right_idx
+                            ]
+                            right_idx = start_idx + seq_len
+                        try:
+                            input_embeds[left_idx:right_idx] = tmp_image_feature
+                        except RuntimeError as e:
+                            print(f"RuntimeError in image encoding: {e}")
+                            print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}")
+                            print(
+                                f"{start_idx=}, {image_offset=}, {prefix_len=}, {pad_len=}"
+                            )
+                    pt += 1
+
+            return self.language_model(
+                input_ids, positions, forward_batch, input_embeds=input_embeds
+            )
+        elif forward_batch.forward_mode.is_decode():
+            return self.language_model(input_ids, positions, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Load clip vision model by cfg['mm_vision_tower']:
+        # huggingface_name or path_of_clip_relative_to_llava_model_dir
+        # We put the initialization here instead of __init__ to allow it being reused by other subclasses.
+        vision_path = self.config.mm_vision_tower
+        if "clip" in vision_path:
+            self.vision_tower = CLIPVisionModel.from_pretrained(
+                vision_path, torch_dtype=torch.float16
+            ).cuda()
+        elif "siglip" in vision_path:
+            self.vision_tower = SiglipVisionModel.from_pretrained(
+                vision_path, torch_dtype=torch.float16
+            ).cuda()
+            # Siglip needs all feature tokens
+            self.config.mm_vision_select_feature = "full"
+        self.vision_tower.eval()
+
+        self.vision_feature_layer = self.config.mm_vision_select_layer
+        self.vision_feature_select_strategy = self.config.mm_vision_select_feature
+        self.image_size = self.vision_tower.config.image_size
+        self.patch_size = self.vision_tower.config.patch_size
+
+        self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+        self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+        self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
+
+        self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
+        if (
+            self.vision_feature_select_strategy == "patch"
+            or self.vision_feature_select_strategy == "full"
+        ):
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+
+        # load mm_projector
+        projector_weights = {
+            "model.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.mm_projector.2": "multi_modal_projector.linear_2",
+            "model.vision_tower.vision_tower": "vision_tower",
+            # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
+        }
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
+                for weight_name, param_name in projector_weights.items():
+                    if weight_name in name:
+                        name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])
+
+    @property
+    def num_patches_per_side(self):
+        return self.image_size // self.patch_size
+
+
+class LlavaLlamaForCausalLM(LlavaBaseForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vision_tower = None
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = LlamaForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+class LlavaQwenForCausalLM(LlavaBaseForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vision_tower = None
+
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = Qwen2Config(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 151646
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = Qwen2ForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+class LlavaMistralForCausalLM(LlavaBaseForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vision_tower = None
+
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = MistralConfig(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 32000
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = MistralForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
+    """
+    An adaptor class to enable support for multiple mmlm such as mistral-community/pixtral-12b
+    It follows the structure of (vision_tower, multi_modal_projector, language_model)
+
+    Once a model config is loaded, text_config and vision_config will be extracted, and
+    LlavaForConditionalGeneration will load the language_model and vision_tower models
+    according to config.
+    """
+
+    MULTIMODAL_PROJECTOR_TYPE = LlavaMultiModalProjector
+
+    @property
+    def dtype(self):
+        return self.torch_dtype
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        if hasattr(self.vision_tower, "pad_input_ids"):
+            return self.vision_tower.pad_input_ids(input_ids, image_inputs)
+        else:
+            return super().pad_input_ids(input_ids, image_inputs)
+
+    def _get_sgl_model_cls(self, config, auto_model_type: Type[AutoModel] = AutoModel):
+        """
+        Get the SGLang model implementation class according to config.
+
+        Args:
+            config: The config object of the model.
+            auto_model_type: The type of the auto model.
+
+        Returns:
+            The SGLang model implementation class.
+        """
+        config_cls_name = config.__class__.__name__
+        arch_name_mapping = self._config_cls_name_to_arch_name_mapping(auto_model_type)
+        if arch := arch_name_mapping.get(config_cls_name):
+            if isinstance(arch, tuple):
+                arch = arch[0]
+                logger.warning(
+                    f"Multiple {auto_model_type.__name__} models found for submodule config `{config_cls_name}`, defaulting to [0]: {arch.__name__}"
+                )
+            try:
+                return sgl_models.registry.ModelRegistry.resolve_model_cls(arch)[0]
+            except Exception as e:
+                raise ValueError(
+                    f"{auto_model_type.__name__} found a corresponding model `{arch}` for config class `{config_cls_name}`, but failed to load it from SGLang ModelRegistry. \n{e}"
+                )
+        else:
+            raise ValueError(
+                f"{auto_model_type.__name__} cannot find a corresponding model for config class `{config_cls_name}`"
+            )
+
+    @lru_cache
+    def _config_cls_name_to_arch_name_mapping(
+        self, auto_model_type: Type[AutoModel]
+    ) -> Dict[str, str]:
+        mapping = {}
+        for config_cls in auto_model_type._model_mapping.keys():
+            archs = auto_model_type._model_mapping.get(config_cls, None)
+            if archs is not None:
+                if isinstance(archs, tuple):
+                    mapping[config_cls.__name__] = tuple(
+                        arch.__name__ for arch in archs
+                    )
+                else:
+                    mapping[config_cls.__name__] = archs.__name__
+        return mapping
+
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        assert hasattr(config, "text_config")
+        assert hasattr(config, "vision_config")
+        self.config = config
+        self.text_config = self.config.text_config
+        self.vision_config = self.config.vision_config
+        self.torch_dtype = getattr(self.config, "torch_dtype")
+
+        if not getattr(self.text_config, "torch_dtype"):
+            self.text_config.torch_dtype = self.torch_dtype
+        if not getattr(self.vision_config, "torch_dtype"):
+            self.vision_config.torch_dtype = self.torch_dtype
+
+        if not hasattr(self.config, "vocab_size"):
+            self.config.vocab_size = self.text_config.vocab_size
+        if not hasattr(self.config, "image_aspect_ratio"):
+            self.config.image_aspect_ratio = "anyres"
+        if not hasattr(self.config, "image_grid_pinpoints"):
+            # from transformers.models.llava_onevision.configuration_llava_onevision import LlavaOnevisionConfig
+            # self.config.image_grid_pinpoints = LlavaOnevisionConfig().image_grid_pinpoints
+            self.config.image_grid_pinpoints = [
+                [96, 96],
+                [224, 224],
+                [384, 384],
+                [512, 512],
+                [768, 768],
+                [1024, 1024],
+            ]
+        if not hasattr(self.config, "mm_patch_merge_type"):
+            self.config.mm_patch_merge_type = "flat"
+        if not hasattr(self.config, "image_token_index"):
+            self.config.image_token_index = 10
+        if not hasattr(self.config, "projector_hidden_act"):
+            self.config.projector_hidden_act = "gelu"
+
+        self.vision_feature_layer = getattr(self.config, "vision_feature_layer", -1)
+        self.vision_feature_select_strategy = getattr(
+            self.config, "vision_feature_select_strategy", "full"
+        )
+        self.image_size = self.vision_config.image_size
+        self.patch_size = self.vision_config.patch_size
+
+        self.mm_patch_merge_type = self.config.mm_patch_merge_type
+        self.image_aspect_ratio = self.config.image_aspect_ratio
+        self.image_grid_pinpoints = self.config.image_grid_pinpoints
+
+        self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
+
+        self.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(config)
+
+        language_model_cls = self._get_sgl_model_cls(
+            self.text_config, AutoModelForCausalLM
+        )
+        vision_model_cls = self._get_sgl_model_cls(self.vision_config, AutoModel)
+        self.language_model = language_model_cls(
+            self.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.vision_tower = vision_model_cls(
+            self.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_tower", prefix),
+        )
+
+        if "unpad" in getattr(self.config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(self.text_config.hidden_size, dtype=self.torch_dtype)
+            )
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Extract features from image inputs.
+
+        Args:
+            items: List of MultimodalDataItem objects containing image data
+                Note that an item can be either "image" or "multi-images"
+
+        Returns:
+            torch.Tensor: features from image inputs, concatenated
+        """
+        features = []
+        for item in items:
+            # in each item, we assume pixel_values is always batched
+            pixel_values, image_sizes = item.feature, item.image_sizes
+            image_outputs = self.vision_tower(
+                pixel_values, image_sizes, output_hidden_states=True
+            )
+            selected_image_feature = image_outputs.hidden_states[
+                self.vision_feature_layer
+            ]
+
+            if self.vision_feature_select_strategy in ["default", "patch"]:
+                selected_image_feature = selected_image_feature[:, 1:]
+            elif self.vision_feature_select_strategy == "full":
+                selected_image_feature = selected_image_feature
+            else:
+                raise ValueError(
+                    f"Unexpected select feature: {self.vision_feature_select_strategy}"
+                )
+            features.append(
+                self.multi_modal_projector(selected_image_feature.squeeze(0))
+            )
+        ret = torch.cat(features, dim=0)
+        return ret
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            get_embedding=get_embedding,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            placeholder_tokens=None,  # using mm_item.pad_value
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for LlavaForConditionalGeneration.
+
+        Unlike the base class implementation, this one doesn't need to handle
+        weight name remapping as the weights are already properly structured with
+        'language_model' and 'vision_tower' prefixes in the safetensors files.
+        """
+        if (
+            self.vision_feature_select_strategy == "patch"
+            or self.vision_feature_select_strategy == "full"
+        ):
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(
+                f"Unexpected select feature: {self.vision_feature_select_strategy}"
+            )
+
+        # Create dictionaries for direct parameter loading
+        params_dict = dict(self.named_parameters())
+
+        # Load weights directly without remapping
+        for name, loaded_weight in weights:
+            for part in ("language_model", "vision_tower"):
+                if name.startswith(part):
+                    name = name[len(part + ".") :]
+                    getattr(self, part).load_weights([(name, loaded_weight)])
+                    break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [
+    LlavaLlamaForCausalLM,
+    LlavaQwenForCausalLM,
+    LlavaMistralForCausalLM,
+    LlavaForConditionalGeneration,
+]
diff --git a/python/sglang/srt/models/llavavid.py b/python/sglang/srt/models/llavavid.py
new file mode 100644
index 000000000..e5d6aa72b
--- /dev/null
+++ b/python/sglang/srt/models/llavavid.py
@@ -0,0 +1,283 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only LLaVa video model compatible with HuggingFace weights."""
+
+from typing import Iterable, List, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import CLIPVisionModel, LlavaConfig
+from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.schedule_batch import MultimodalInputs, flatten_nested_list
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.srt.utils import add_prefix
+
+
+class LlavaVidForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vision_tower = None
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.mm_spatial_pool_stride = getattr(self.config, "mm_spatial_pool_stride", 2)
+        self.resampler = nn.AvgPool2d(
+            kernel_size=self.mm_spatial_pool_stride, stride=self.mm_spatial_pool_stride
+        )
+        self.language_model = LlamaForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.num_frames = getattr(self.config, "num_frames", 16)
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        pad_values = [item.pad_value for item in image_inputs.mm_items]
+        new_image_feature_len = self.image_feature_len
+
+        pad_ids = pad_values * (
+            (new_image_feature_len + len(pad_values)) // len(pad_values)
+        )
+        offset = input_ids.index(self.config.image_token_index)
+        # old_len + pad_len - 1, because we need to remove image_token_id
+        new_input_ids = (
+            input_ids[:offset]
+            + pad_ids[:new_image_feature_len]
+            + input_ids[offset + 1 :]
+        )
+        image_inputs.image_offsets = [offset]
+        return new_input_ids
+
+    def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
+
+        selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
+        if self.vision_feature_select_strategy in ["default", "patch"]:
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif self.vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(
+                f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
+            )
+
+        height = width = self.num_patches_per_side
+        num_of_frames = selected_image_feature.shape[0]
+        selected_image_feature = selected_image_feature.view(
+            num_of_frames, height, width, -1
+        )
+        selected_image_feature = selected_image_feature.permute(0, 3, 1, 2).contiguous()
+        selected_image_feature = (
+            self.resampler(selected_image_feature)
+            .flatten(2)
+            .transpose(1, 2)
+            .contiguous()
+        )
+
+        image_features = self.multi_modal_projector(selected_image_feature)
+
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        image_inputs = forward_batch.mm_inputs
+        if forward_batch.forward_mode.is_extend():
+            bs = forward_batch.batch_size
+
+            # Clamp input ids. See llava.py for more details
+            input_ids = input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
+            # Embed text inputs
+            input_embeds = self.language_model.model.embed_tokens(input_ids)
+
+            # Whether the requests need vision inputs
+            max_image_offset = []
+            for im in image_inputs:
+                if im and im.image_offsets:
+                    max_image_offset.append(max(im.image_offsets))
+                else:
+                    max_image_offset.append(-1)
+            start_positions = positions[forward_batch.extend_start_loc].cpu().numpy()
+            need_vision = start_positions <= np.array(max_image_offset)
+
+            if need_vision.any():
+                pixel_values = flatten_nested_list(
+                    [
+                        [item.feature for item in image_inputs[i].mm_items]
+                        for i in range(bs)
+                        if need_vision[i]
+                    ]
+                )
+                image_offsets = [
+                    flatten_nested_list(
+                        [item.offsets for item in image_inputs[i].mm_items]
+                    )
+                    for i in range(bs)
+                    if need_vision[i]
+                ]
+
+                ########## Encode Image ########
+
+                if pixel_values[0].ndim == 4:
+                    # llava-hd: BS, num_patch, C=3, H=336, W=336, num_patch obtained from process_images
+                    np.concatenate(pixel_values, axis=0)
+                    # ndim=4
+                    concat_images = torch.tensor(
+                        np.concatenate(pixel_values, axis=0),
+                        device=self.vision_tower.device,
+                    )
+                    # image_features = self.encode_images(concat_images)
+                    # split_sizes = [image.shape[0] for image in pixel_values]
+                    # image_features = torch.split(image_features, split_sizes, dim=0)
+                    image_features = self.encode_images(
+                        concat_images
+                    )  # , prompts)#, image_counts, long_video=long_video)
+                    split_sizes = [image.shape[0] for image in pixel_values]
+                    image_features = torch.split(image_features, split_sizes, dim=0)
+
+                    # hd image_features: BS, num_patch, 576, 4096
+                else:
+                    # normal pixel: BS, C=3, H=336, W=336
+                    pixel_values = torch.tensor(
+                        np.array(pixel_values), device=self.vision_tower.device
+                    )
+                    image_features = self.encode_images(pixel_values)
+                    # image_features: BS, 576, 4096
+
+                new_image_features = []
+                for image_idx, image_feature in enumerate(image_features):
+                    new_image_features.append(image_feature.flatten(0, 1))
+                image_features = new_image_features
+
+                # Fill in the placeholder for the image
+                extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
+                prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
+                pt = 0
+                for i in range(bs):
+                    if not need_vision[i]:
+                        continue
+
+                    start_idx = extend_start_loc_cpu[i]
+                    prefix_len = prefix_lens_cpu[i]
+
+                    # Multiple images
+                    for image_offset in image_offsets[i]:
+                        if image_offset < prefix_len:
+                            continue
+
+                        tmp_image_feature = image_features[pt]
+                        pad_len = tmp_image_feature.shape[0]
+
+                        left_idx = start_idx + (image_offset - prefix_len)
+                        right_idx = start_idx + (image_offset - prefix_len) + pad_len
+                        try:
+                            input_embeds[left_idx:right_idx] = tmp_image_feature
+                        except RuntimeError as e:
+                            print(f"RuntimeError in image encoding: {e}")
+                            print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}")
+                            print(
+                                f"{start_idx=}, {image_offset=}, {prefix_len=}, {pad_len=}"
+                            )
+                        pt += 1
+
+            return self.language_model(
+                input_ids, positions, forward_batch, input_embeds=input_embeds
+            )
+        elif forward_batch.forward_mode.is_decode():
+            return self.language_model(input_ids, positions, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Load clip vision model by cfg['mm_vision_tower']:
+        # huggingface_name or path_of_clip_relative_to_llava_model_dir
+        # We put the initialization here instead of __init__ to allow it being reused by other subclasses.
+        vision_path = self.config.mm_vision_tower
+        self.vision_tower = CLIPVisionModel.from_pretrained(
+            vision_path, torch_dtype=torch.float16
+        ).cuda()
+        self.vision_tower.eval()
+
+        self.vision_feature_layer = self.config.mm_vision_select_layer
+        self.vision_feature_select_strategy = self.config.mm_vision_select_feature
+        self.image_size = self.vision_tower.config.image_size
+        self.patch_size = self.vision_tower.config.patch_size
+
+        self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+        self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+        self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
+
+        print(f"target_frames: {self.num_frames}")
+        self.image_feature_len = self.num_frames * int(
+            (self.image_size / self.patch_size / self.mm_spatial_pool_stride) ** 2
+        )
+        if self.vision_feature_select_strategy == "patch":
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+
+        # load mm_projector
+        projector_weights = {
+            "model.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.mm_projector.2": "multi_modal_projector.linear_2",
+            "model.vision_resampler.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.vision_resampler.mm_projector.2": "multi_modal_projector.linear_2",
+            "model.vision_tower.vision_tower": "vision_tower",
+            # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
+        }
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            # FIXME: why projector weights read two times?
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
+                for weight_name, param_name in projector_weights.items():
+                    if weight_name in name:
+                        name = name.replace(weight_name, param_name)
+                if name in params_dict:
+                    param = params_dict[name]
+                else:
+                    print(f"Warning: {name} not found in the model")
+                    continue
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])
+
+    @property
+    def num_patches_per_side(self):
+        return self.image_size // self.patch_size
+
+
+EntryClass = LlavaVidForCausalLM
diff --git a/python/sglang/srt/models/longcat_flash.py b/python/sglang/srt/models/longcat_flash.py
new file mode 100644
index 000000000..9531cb83e
--- /dev/null
+++ b/python/sglang/srt/models/longcat_flash.py
@@ -0,0 +1,1026 @@
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import concurrent.futures
+import logging
+import os
+from enum import IntEnum, auto
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from tqdm import tqdm
+
+from sglang.srt.configs import LongcatFlashConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.amx_utils import PackWeightMethod
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.kernels import zero_experts_compute_triton
+from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    block_quant_dequant,
+    block_quant_to_tensor_quant,
+    channel_quant_to_tensor_quant,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requant_weight_ue8m0_inplace,
+)
+from sglang.srt.layers.quantization.int8_utils import (
+    block_dequant as int8_block_dequant,
+)
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+from sglang.srt.utils import (
+    BumpAllocator,
+    LazyValue,
+    add_prefix,
+    bind_or_assign,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    get_int_env_var,
+    is_cpu,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+    is_non_idle_and_non_empty,
+    is_npu,
+    is_sm100_supported,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+
+if _is_cuda:
+    from sgl_kernel import (
+        awq_dequantize,
+        bmm_fp8,
+        dsv3_fused_a_gemm,
+        dsv3_router_gemm,
+        merge_state_v2,
+    )
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+else:
+    from vllm._custom_ops import awq_dequantize
+
+logger = logging.getLogger(__name__)
+
+
+class LongcatFlashMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class LongcatFlashRouter(nn.Module):
+    def __init__(
+        self,
+        config,
+        zero_expert_num=0,
+        rounter_params_dtype=torch.float32,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.n_routed_experts = config.n_routed_experts
+        self.n_routed_experts = self.n_routed_experts + zero_expert_num
+        self.rounter_params_dtype = rounter_params_dtype
+        self.classifier = ReplicatedLinear(
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=config.router_bias,
+            params_dtype=rounter_params_dtype,
+            quant_config=None,
+            prefix=add_prefix("classifier", prefix),
+        )
+        self.e_score_correction_bias = nn.Parameter(
+            torch.zeros((self.n_routed_experts), dtype=rounter_params_dtype)
+        )
+
+    def forward(self, hidden_states):
+        logits, _ = self.classifier(hidden_states.to(self.rounter_params_dtype))
+        return logits
+
+
+class LongcatFlashMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.layer_id = layer_id
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.num_experts = config.n_routed_experts
+        self.top_k = config.moe_topk
+        self.zero_expert_num = config.zero_expert_num
+        self.zero_expert_type = config.zero_expert_type
+
+        if config.rounter_params_dtype == "float32":
+            self.rounter_params_dtype = torch.float32
+        else:
+            self.rounter_params_dtype = torch.bfloat16
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.router = LongcatFlashRouter(
+            config=self.config,
+            zero_expert_num=self.zero_expert_num,
+            rounter_params_dtype=self.rounter_params_dtype,
+            prefix=add_prefix("router", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=False,
+            use_grouped_topk=False,
+            correction_bias=self.router.e_score_correction_bias.data,
+        )
+        self.topk.forward = self.topk.forward_native
+
+        self.experts = get_moe_impl_class()(
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            layer_id=self.layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.router(hidden_states)
+        topk_weights, topk_idx, _ = self.topk(
+            hidden_states,
+            router_logits,
+        )
+        if self.zero_expert_type is not None:
+            zero_expert_result = zero_experts_compute_triton(
+                expert_indices=topk_idx,
+                expert_scales=topk_weights,
+                num_experts=self.num_experts,
+                zero_expert_type=self.zero_expert_type,
+                hidden_states=hidden_states,
+            )
+        topk_output = StandardTopKOutput(topk_weights, topk_idx, _)
+
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        final_hidden_states *= self.routed_scaling_factor
+
+        if self.zero_expert_type is not None and hidden_states.shape[0] > 0:
+            final_hidden_states += zero_expert_result.to(final_hidden_states.device)
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+
+class LongcatFlashDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.self_attn = nn.ModuleList(
+            [
+                DeepseekV2AttentionMLA(
+                    config=config,
+                    hidden_size=config.hidden_size,
+                    num_heads=config.num_attention_heads,
+                    qk_nope_head_dim=config.qk_nope_head_dim,
+                    qk_rope_head_dim=config.qk_rope_head_dim,
+                    v_head_dim=config.v_head_dim,
+                    q_lora_rank=config.q_lora_rank,
+                    kv_lora_rank=config.kv_lora_rank,
+                    rope_theta=config.rope_theta,
+                    rope_scaling=None,
+                    max_position_embeddings=config.max_position_embeddings,
+                    quant_config=(
+                        None
+                        if "self_attn" in getattr(config, "disable_quant_module", [])
+                        else quant_config
+                    ),
+                    layer_id=layer_id * 2 + i,
+                    reduce_results=False,
+                    prefix=add_prefix(f"self_attn.{i}", prefix),
+                    alt_stream=self.alt_stream,
+                )
+                for i in range(2)
+            ]
+        )
+
+        self.input_layernorm = nn.ModuleList(
+            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
+        )
+        self.post_attention_layernorm = nn.ModuleList(
+            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
+        )
+
+        self.mlps = nn.ModuleList(
+            [
+                LongcatFlashMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=(
+                        None
+                        if "mlps" in getattr(config, "disable_quant_module", [])
+                        else quant_config
+                    ),
+                    prefix=add_prefix(f"mlps.{i}", prefix),
+                )
+                for i in range(2)
+            ]
+        )
+
+        self.mlp = LongcatFlashMoE(
+            layer_id=self.layer_id,
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.mlp_layer_scatter_modes = [
+            LayerScatterModes.init_new(
+                layer_id=self.layer_id * 2 + i,
+                num_layers=config.num_hidden_layers,
+                is_layer_sparse=False,
+                is_previous_layer_sparse=False,
+            )
+            for i in range(2)
+        ]
+        self.mlp_layer_communicator = [
+            LayerCommunicator(
+                layer_scatter_modes=self.mlp_layer_scatter_modes[i],
+                input_layernorm=self.input_layernorm[i],
+                post_attention_layernorm=self.post_attention_layernorm[i],
+            )
+            for i in range(2)
+        ]
+
+        self.moe_layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=self.layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=True,
+            is_previous_layer_sparse=True,
+        )
+        self.moe_layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.moe_layer_scatter_modes,
+            input_layernorm=self.input_layernorm[0],
+            post_attention_layernorm=self.post_attention_layernorm[0],
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+    ) -> torch.Tensor:
+        # first_attn
+        hidden_states, residual = self.moe_layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn[0](
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                zero_allocator=zero_allocator,
+            )
+
+        # moe
+        hidden_states, residual = self.moe_layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        moe_hidden_states = hidden_states.clone()
+        moe_residual = residual.clone()
+        moe_hidden_states = self.mlp(moe_hidden_states)
+        moe_hidden_states, moe_residual = self.moe_layer_communicator.postprocess_layer(
+            moe_hidden_states, moe_residual, forward_batch
+        )
+
+        hidden_states, residual = self.forward_mlp(
+            hidden_states, positions, residual, forward_batch, zero_allocator
+        )
+
+        hidden_states = moe_hidden_states + hidden_states
+        return hidden_states, residual
+
+    def forward_mlp(
+        self, hidden_states, positions, residual, forward_batch, zero_allocator
+    ):
+        # first_mlp
+        hidden_states = self.mlps[0](hidden_states)
+        # TP all_reduce
+        hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+
+        # second_attn
+        hidden_states, residual = self.mlp_layer_communicator[1].prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn[1](
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                zero_allocator=zero_allocator,
+            )
+
+        # second_mlp
+        hidden_states, residual = self.mlp_layer_communicator[1].prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        hidden_states = self.mlps[1](hidden_states)
+        # TP all_reduce
+        hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+
+        hidden_states, residual = self.mlp_layer_communicator[1].postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class LongcatFlashModel(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+        )
+
+        self.alt_stream = torch.cuda.Stream()
+        self.layers = nn.ModuleList(
+            [
+                LongcatFlashDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                    alt_stream=self.alt_stream,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        total_num_layers = len(self.layers)
+        device = input_embeds.device if input_embeds is not None else input_ids.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+            dtype=torch.float32,
+            device=device,
+        )
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        residual = None
+
+        for i in range(total_num_layers):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual, zero_allocator
+                )
+
+        if hidden_states.shape[0] != 0:
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class LongcatFlashForCausalLM(nn.Module):
+    # for quark model load
+    packed_modules_mapping = {}
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        # for quark model load
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.model = LongcatFlashModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def post_load_weights(self, weight_names=None):
+
+        # Perform post-processing after loading weights
+        if weight_names is None:
+            layer_ids = range(self.config.num_hidden_layers)
+        else:
+            layer_ids = set()
+            for name in weight_names:
+                if "kv_b_proj" in name:
+                    layer_id = int(name.split(".")[2])
+                    if layer_id < self.config.num_hidden_layers:
+                        layer_ids.add(layer_id)
+
+        for layer_id in layer_ids:
+            for i in range(2):
+                self_attn = self.model.layers[layer_id].self_attn[i]
+                if hasattr(self_attn.kv_b_proj, "qweight"):
+                    # AWQ compatible
+                    if _is_cuda or _is_hip:
+                        w = awq_dequantize(
+                            self_attn.kv_b_proj.qweight,
+                            self_attn.kv_b_proj.scales,
+                            self_attn.kv_b_proj.qzeros,
+                        ).T
+                    else:
+                        w = awq_dequantize(
+                            self_attn.kv_b_proj.qweight,
+                            self_attn.kv_b_proj.scales,
+                            self_attn.kv_b_proj.qzeros,
+                            0,
+                            0,
+                            0,
+                        ).T
+                else:
+                    w = self_attn.kv_b_proj.weight
+                use_deep_gemm_bmm = False
+
+                if w.dtype in (
+                    torch.float8_e4m3fn,
+                    torch.float8_e4m3fnuz,
+                ):
+                    if (
+                        hasattr(self.quant_config, "weight_block_size")
+                        and self.quant_config.weight_block_size is not None
+                    ):
+                        weight_block_size = self.quant_config.weight_block_size
+                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        if _is_fp8_fnuz:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=w,
+                                weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                                input_scale=None,
+                            )
+                        else:
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
+                        if (
+                            _is_cuda
+                            and weight_block_size[0] == 128
+                            and weight_block_size[1] == 128
+                        ):
+                            if (
+                                deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                                and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+                                and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+                            ):
+                                block_scale = weight_scale
+                                use_deep_gemm_bmm = True
+                            else:
+                                w = block_quant_dequant(
+                                    weight,
+                                    weight_scale,
+                                    weight_block_size,
+                                    torch.bfloat16,
+                                )
+                        else:
+                            w, scale = block_quant_to_tensor_quant(
+                                weight, weight_scale, weight_block_size
+                            )
+                            self_attn.w_scale = scale
+                    else:
+                        if _is_fp8_fnuz:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=w,
+                                weight_scale=self_attn.kv_b_proj.weight_scale,
+                                input_scale=None,
+                            )
+                        else:
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale
+
+                        w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                        self_attn.w_scale = scale
+
+                if w.dtype == torch.int8:
+                    if hasattr(self.quant_config, "weight_block_size"):
+                        # block-wise int8 need it
+                        weight_block_size = self.quant_config.weight_block_size
+                        if weight_block_size is not None:
+                            assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                            w = int8_block_dequant(
+                                weight, weight_scale, weight_block_size
+                            ).to(torch.bfloat16)
+                    else:
+                        # channel-wise int8 need it
+                        w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                            torch.bfloat16
+                        )
+
+                w_kc, w_vc = w.unflatten(
+                    0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+                ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+                if not use_deep_gemm_bmm:
+                    self_attn.w_kc = bind_or_assign(
+                        self_attn.w_kc,
+                        w_kc.transpose(1, 2).contiguous().transpose(1, 2),
+                    )
+                    self_attn.w_vc = bind_or_assign(
+                        self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+                    )
+                    if (
+                        hasattr(self_attn.kv_b_proj, "weight_scale")
+                        and self_attn.w_scale is None
+                    ):
+                        self_attn.w_scale = bind_or_assign(
+                            self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+                        )
+                        if _is_hip:
+                            self_attn.w_scale *= 2.0
+                    # TODO: remove this after adding FP8 support in bmm cpu kernel
+                    if (
+                        _is_cpu
+                        and _is_cpu_amx_available
+                        and w.dtype == torch.float8_e4m3fn
+                    ):
+                        self_attn.w_kc = (
+                            self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+                        )
+                        self_attn.w_vc = (
+                            self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+                        )
+                else:
+                    num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+                    num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+                    ws_kc, ws_vc = block_scale.unflatten(
+                        0, (-1, (num_tiles_k + num_tiles_n))
+                    ).split([num_tiles_k, num_tiles_n], dim=1)
+                    self_attn.w_scale_k = bind_or_assign(
+                        self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+                    )
+                    self_attn.w_scale_v = bind_or_assign(
+                        self_attn.w_scale_v, ws_vc.contiguous()
+                    )
+                    self_attn.w_kc = bind_or_assign(
+                        self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+                    )
+                    self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+                    self_attn.use_deep_gemm_bmm = True
+
+                if self.config.mla_scale_q_lora:
+                    self_attn.q_a_layernorm.weight.data *= (
+                        self.config.hidden_size / self.config.q_lora_rank
+                    ) ** 0.5
+                if self.config.mla_scale_kv_lora:
+                    self_attn.kv_a_layernorm.weight.data *= (
+                        self.config.hidden_size / self.config.kv_lora_rank
+                    ) ** 0.5
+
+        # TODO(linguoyuan) EPMoE not support DEEPGEMM_BLACKWELL, DeepEP needs to be supported in the future
+        deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0 = False
+
+        if (
+            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+            and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+            and hasattr(self.quant_config, "weight_block_size")
+            and self.quant_config.weight_block_size is not None
+        ):
+            self._weight_requant_ue8m0()
+
+    def _weight_requant_ue8m0(self):
+        weight_block_size = self.quant_config.weight_block_size
+
+        for layer_id in range(self.config.num_hidden_layers):
+            layer = self.model.layers[layer_id]
+            for i in range(2):
+                self_attn = layer.self_attn[i]
+                module_list = [
+                    self_attn.kv_b_proj,
+                    self_attn.o_proj,
+                ]
+
+                if self.config.q_lora_rank is not None:
+                    module_list.append(self_attn.fused_qkv_a_proj_with_mqa)
+                    module_list.append(self_attn.q_b_proj)
+                else:
+                    module_list.append(self_attn.kv_a_proj_with_mqa)
+                    module_list.append(self_attn.q_proj)
+
+                for module in module_list:
+                    if hasattr(module, "weight_scale_inv"):
+                        requant_weight_ue8m0_inplace(
+                            module.weight, module.weight_scale_inv, weight_block_size
+                        )
+
+                mlp = layer.mlps[i]
+                assert isinstance(mlp, LongcatFlashMLP)
+                for module in [
+                    mlp.gate_up_proj,
+                    mlp.down_proj,
+                ]:
+                    if hasattr(module, "weight_scale_inv"):
+                        requant_weight_ue8m0_inplace(
+                            module.weight, module.weight_scale_inv, weight_block_size
+                        )
+
+        for layer_id in range(self.config.num_hidden_layers):
+            experts = layer.mlp.experts
+            if isinstance(experts, DeepEPMoE):
+                for w in [
+                    experts.w13_weight_fp8,
+                    experts.w2_weight_fp8,
+                ]:
+                    requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = get_moe_impl_class().make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+        )
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = []
+            params_dict = dict(self.named_parameters())
+            weight_names = []
+            for name, loaded_weight in weights:
+                if "mtp" in name:
+                    continue
+                weight_names.append(name)
+                if "rotary_emb.inv_freq" in name:
+                    continue
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    # Skip non-stacked layers and experts (experts handled below).
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    futures.append(
+                        executor.submit(weight_loader, param, loaded_weight, shard_id)
+                    )
+                    break
+                else:
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if weight_name not in name:
+                            continue
+                        name = name.replace(weight_name, param_name)
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        futures.append(
+                            executor.submit(
+                                weight_loader,
+                                param,
+                                loaded_weight,
+                                name,
+                                shard_id=shard_id,
+                                expert_id=expert_id,
+                            )
+                        )
+                        break
+                    else:
+                        # Skip loading extra bias for GPTQ models.
+                        if name.endswith(".bias") and name not in params_dict:
+                            continue
+                        if fuse_qkv_a_proj and (
+                            "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                        ):
+                            cached_a_proj[name] = loaded_weight
+                            q_a_proj_name = (
+                                name
+                                if "q_a_proj" in name
+                                else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                            )
+                            kv_a_proj_name = (
+                                name
+                                if "kv_a_proj_with_mqa" in name
+                                else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                            )
+
+                            # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                            if (
+                                q_a_proj_name in cached_a_proj
+                                and kv_a_proj_name in cached_a_proj
+                            ):
+                                q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                                kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                                cat_dim = 0
+                                if self.quant_config is not None and (
+                                    self.quant_config.get_name() == "awq"
+                                    or self.quant_config.get_name() == "awq_marlin"
+                                    or self.quant_config.get_name() == "moe_wna16"
+                                ):
+                                    cat_dim = 1
+                                fused_weight = torch.cat(
+                                    [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+                                )
+                                param_name = (
+                                    name.replace(
+                                        "q_a_proj", "fused_qkv_a_proj_with_mqa"
+                                    )
+                                    if "q_a_proj" in name
+                                    else name.replace(
+                                        "kv_a_proj_with_mqa",
+                                        "fused_qkv_a_proj_with_mqa",
+                                    )
+                                )
+                                param = params_dict[param_name]
+
+                                weight_loader = getattr(
+                                    param, "weight_loader", default_weight_loader
+                                )
+                                futures.append(
+                                    executor.submit(weight_loader, param, fused_weight)
+                                )
+                                cached_a_proj.pop(q_a_proj_name)
+                                cached_a_proj.pop(kv_a_proj_name)
+                        else:
+                            if (
+                                "k_scale" in name or "v_scale" in name
+                            ) and name not in params_dict:
+                                # modelopt attn kv scale is named differently
+                                for scale in ["k_scale", "v_scale"]:
+                                    if scale in name:
+                                        name = name.replace(
+                                            f"{scale[0]}_proj", "attn_mqa"
+                                        )
+                                        break
+                            if name not in params_dict:
+                                # modelopt ckpt contains not needed weights for MTP module:
+                                # model.decoder.self_attn.attn_mqa.v_scale and
+                                # model.decoder.self_attn.attn_mqa.k_scale
+                                logger.warning(f"{name} not found in params_dict.")
+                                continue
+                            param = params_dict[name]
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            futures.append(
+                                executor.submit(weight_loader, param, loaded_weight)
+                            )
+
+            # Wait for all tasks to complete and raise any exceptions.
+            for future in concurrent.futures.as_completed(futures):
+                future.result()
+
+        self.post_load_weights(weight_names=weight_names)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.n_routed_experts,
+        )
+
+
+EntryClass = [LongcatFlashForCausalLM]
diff --git a/python/sglang/srt/models/longcat_flash_nextn.py b/python/sglang/srt/models/longcat_flash_nextn.py
new file mode 100644
index 000000000..64a4265c5
--- /dev/null
+++ b/python/sglang/srt/models/longcat_flash_nextn.py
@@ -0,0 +1,699 @@
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import concurrent.futures
+import logging
+import os
+from enum import IntEnum, auto
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from tqdm import tqdm
+
+from sglang.srt.configs import LongcatFlashConfig
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    block_quant_dequant,
+    block_quant_to_tensor_quant,
+    channel_quant_to_tensor_quant,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requant_weight_ue8m0_inplace,
+)
+from sglang.srt.layers.quantization.int8_utils import (
+    block_dequant as int8_block_dequant,
+)
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP
+from sglang.srt.utils import (
+    BumpAllocator,
+    LazyValue,
+    add_prefix,
+    bind_or_assign,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+
+if _is_cuda:
+    from sgl_kernel import (
+        awq_dequantize,
+        bmm_fp8,
+        dsv3_fused_a_gemm,
+        dsv3_router_gemm,
+        merge_state_v2,
+    )
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+else:
+    from vllm._custom_ops import awq_dequantize
+
+
+logger = logging.getLogger(__name__)
+
+
+class LongcatFlashDenseDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+
+        self.self_attn = DeepseekV2AttentionMLA(
+            config=config,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=config.q_lora_rank,
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=config.rope_theta,
+            rope_scaling=None,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            reduce_results=False,
+            prefix=add_prefix(f"self_attn", prefix),
+            alt_stream=self.alt_stream,
+        )
+
+        self.mlp = LongcatFlashMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix(f"mlps", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=self.layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=False,
+            is_previous_layer_sparse=False,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+    ) -> torch.Tensor:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                zero_allocator=zero_allocator,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        hidden_states = self.mlp(hidden_states)
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+        return hidden_states, residual
+
+
+class LongcatFlashModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.alt_stream = torch.cuda.Stream()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = ReplicatedLinear(
+            2 * config.hidden_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("eh_proj", ""),
+        )
+        self.decoder = LongcatFlashDenseDecoderLayer(
+            config, 0, quant_config=quant_config, alt_stream=self.alt_stream
+        )
+
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        total_num_layers = 1
+        device = input_embeds.device if input_embeds is not None else input_ids.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+            dtype=torch.float32,
+            device=device,
+        )
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states, _ = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        residual = None
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states, residual = self.decoder(
+                positions, hidden_states, forward_batch, residual, zero_allocator
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.final_layernorm(hidden_states, residual)
+            else:
+                hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class LongcatFlashForCausalLMNextN(LongcatFlashForCausalLM):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = (
+            None
+            if "mtp" in getattr(config, "disable_quant_module", [])
+            else quant_config
+        )
+        self.model = LongcatFlashModelNextN(config, self.quant_config)
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=self.quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def post_load_weights(self):
+        self_attn = self.model.decoder.self_attn
+        if hasattr(self_attn.kv_b_proj, "qweight"):
+            # AWQ compatible
+            if _is_cuda or _is_hip:
+                w = awq_dequantize(
+                    self_attn.kv_b_proj.qweight,
+                    self_attn.kv_b_proj.scales,
+                    self_attn.kv_b_proj.qzeros,
+                ).T
+            else:
+                w = awq_dequantize(
+                    self_attn.kv_b_proj.qweight,
+                    self_attn.kv_b_proj.scales,
+                    self_attn.kv_b_proj.qzeros,
+                    0,
+                    0,
+                    0,
+                ).T
+        else:
+            w = self_attn.kv_b_proj.weight
+        use_deep_gemm_bmm = False
+        if w.dtype in (
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+        ):
+            if (
+                hasattr(self.quant_config, "weight_block_size")
+                and self.quant_config.weight_block_size is not None
+            ):
+                weight_block_size = self.quant_config.weight_block_size
+                assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                if _is_fp8_fnuz:
+                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=w,
+                        weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                        input_scale=None,
+                    )
+                else:
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                if (
+                    _is_cuda
+                    and weight_block_size[0] == 128
+                    and weight_block_size[1] == 128
+                ):
+                    if (
+                        deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                        and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+                        and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+                    ):
+                        block_scale = weight_scale
+                        use_deep_gemm_bmm = True
+                    else:
+                        w = block_quant_dequant(
+                            weight,
+                            weight_scale,
+                            weight_block_size,
+                            torch.bfloat16,
+                        )
+                else:
+                    w, scale = block_quant_to_tensor_quant(
+                        weight, weight_scale, weight_block_size
+                    )
+                    self_attn.w_scale = scale
+            else:
+                if _is_fp8_fnuz:
+                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=w,
+                        weight_scale=self_attn.kv_b_proj.weight_scale,
+                        input_scale=None,
+                    )
+                else:
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale
+                w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                self_attn.w_scale = scale
+        if w.dtype == torch.int8:
+            if hasattr(self.quant_config, "weight_block_size"):
+                # block-wise int8 need it
+                weight_block_size = self.quant_config.weight_block_size
+                if weight_block_size is not None:
+                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                    w = int8_block_dequant(weight, weight_scale, weight_block_size).to(
+                        torch.bfloat16
+                    )
+            else:
+                # channel-wise int8 need it
+                w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                    torch.bfloat16
+                )
+        w_kc, w_vc = w.unflatten(
+            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+        ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+        if not use_deep_gemm_bmm:
+            self_attn.w_kc = bind_or_assign(
+                self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            )
+            self_attn.w_vc = bind_or_assign(
+                self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+            )
+            if (
+                hasattr(self_attn.kv_b_proj, "weight_scale")
+                and self_attn.w_scale is None
+            ):
+                self_attn.w_scale = bind_or_assign(
+                    self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+                )
+                if _is_hip:
+                    self_attn.w_scale *= 2.0
+            # TODO: remove this after adding FP8 support in bmm cpu kernel
+            if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
+                self_attn.w_kc = self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+                self_attn.w_vc = self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+        else:
+            num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+            num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+            ws_kc, ws_vc = block_scale.unflatten(
+                0, (-1, (num_tiles_k + num_tiles_n))
+            ).split([num_tiles_k, num_tiles_n], dim=1)
+            self_attn.w_scale_k = bind_or_assign(
+                self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+            )
+            self_attn.w_scale_v = bind_or_assign(
+                self_attn.w_scale_v, ws_vc.contiguous()
+            )
+            self_attn.w_kc = bind_or_assign(
+                self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+            )
+            self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+            self_attn.use_deep_gemm_bmm = True
+
+        if self.config.mla_scale_q_lora:
+            self_attn.q_a_layernorm.weight.data *= (
+                self.config.hidden_size / self.config.q_lora_rank
+            ) ** 0.5
+        if self.config.mla_scale_kv_lora:
+            self_attn.kv_a_layernorm.weight.data *= (
+                self.config.hidden_size / self.config.kv_lora_rank
+            ) ** 0.5
+
+        if (
+            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+            and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+            and hasattr(self.quant_config, "weight_block_size")
+            and self.quant_config.weight_block_size is not None
+        ):
+            self._weight_requant_ue8m0()
+
+    def _weight_requant_ue8m0(self):
+        weight_block_size = self.quant_config.weight_block_size
+        layer = self.model.decoder
+        self_attn = layer.self_attn
+        module_list = [
+            self_attn.kv_b_proj,
+            self_attn.o_proj,
+        ]
+
+        if self.config.q_lora_rank is not None:
+            module_list.append(self_attn.fused_qkv_a_proj_with_mqa)
+            module_list.append(self_attn.q_b_proj)
+        else:
+            module_list.append(self_attn.kv_a_proj_with_mqa)
+            module_list.append(self_attn.q_proj)
+
+        for module in module_list:
+            if hasattr(module, "weight_scale_inv"):
+                requant_weight_ue8m0_inplace(
+                    module.weight, module.weight_scale_inv, weight_block_size
+                )
+
+        mlp = layer.mlps
+        assert isinstance(mlp, LongcatFlashMLP)
+        for module in [
+            mlp.gate_up_proj,
+            mlp.down_proj,
+        ]:
+            if hasattr(module, "weight_scale_inv"):
+                requant_weight_ue8m0_inplace(
+                    module.weight, module.weight_scale_inv, weight_block_size
+                )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        nextn_layer_prefix = "model.layers.0"
+        nextn_spec_weight_names = [
+            "shared_head.norm",
+            "eh_proj",
+            "enorm",
+            "hnorm",
+            "final_layernorm",
+        ]
+
+        weight_names_mapping = {
+            "model.mtp.embed_tokens.weight": "embed_tokens.weight",
+            "model.mtp.layers.0.eh_proj.weight": "eh_proj.weight",
+            "model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
+            "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
+            "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
+            "model.mtp.layers.0.input_layernorm.weight": "layers.0.input_layernorm.weight",
+            "model.mtp.layers.0.post_attention_layernorm.weight": "layers.0.post_attention_layernorm.weight",
+            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "layers.0.self_attn.kv_a_layernorm.weight",
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "layers.0.self_attn.kv_a_proj_with_mqa.weight",
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight": "layers.0.self_attn.kv_b_proj.weight",
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "layers.0.self_attn.kv_b_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.o_proj.weight": "layers.0.self_attn.o_proj.weight",
+            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "layers.0.self_attn.o_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "layers.0.self_attn.q_a_layernorm.weight",
+            "model.mtp.layers.0.self_attn.q_a_proj.weight": "layers.0.self_attn.q_a_proj.weight",
+            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "layers.0.self_attn.q_a_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.q_b_proj.weight": "layers.0.self_attn.q_b_proj.weight",
+            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "layers.0.self_attn.q_b_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "layers.0.mlp.down_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "layers.0.mlp.down_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "layers.0.mlp.gate_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "layers.0.mlp.gate_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "layers.0.mlp.up_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "layers.0.mlp.up_proj.weight_scale_inv",
+            "model.mtp.norm.weight": "layers.0.final_layernorm.weight",
+        }
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = []
+            params_dict = dict(self.named_parameters())
+            weight_names = []
+            for name, loaded_weight in weights:
+                if ".mtp." not in name:
+                    continue
+                if name in weight_names_mapping:
+                    name = weight_names_mapping[name]
+                if name.startswith("layers.0"):
+                    name = "model." + name
+                if (
+                    name.startswith("enorm")
+                    or name.startswith("hnorm")
+                    or name.startswith("eh_proj")
+                ):
+                    name = nextn_layer_prefix + "." + name
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+                weight_names.append(name)
+                if "rotary_emb.inv_freq" in name:
+                    continue
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    # Skip non-stacked layers and experts (experts handled below).
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    futures.append(
+                        executor.submit(weight_loader, param, loaded_weight, shard_id)
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if fuse_qkv_a_proj and (
+                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                    ):
+                        cached_a_proj[name] = loaded_weight
+                        q_a_proj_name = (
+                            name
+                            if "q_a_proj" in name
+                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                        )
+                        kv_a_proj_name = (
+                            name
+                            if "kv_a_proj_with_mqa" in name
+                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                        )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                        if (
+                            q_a_proj_name in cached_a_proj
+                            and kv_a_proj_name in cached_a_proj
+                        ):
+                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            cat_dim = 0
+                            if self.quant_config is not None and (
+                                self.quant_config.get_name() == "awq"
+                                or self.quant_config.get_name() == "awq_marlin"
+                                or self.quant_config.get_name() == "moe_wna16"
+                            ):
+                                cat_dim = 1
+                            fused_weight = torch.cat(
+                                [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+                            )
+                            param_name = (
+                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
+                                if "q_a_proj" in name
+                                else name.replace(
+                                    "kv_a_proj_with_mqa",
+                                    "fused_qkv_a_proj_with_mqa",
+                                )
+                            )
+                            param = params_dict[param_name]
+
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            futures.append(
+                                executor.submit(weight_loader, param, fused_weight)
+                            )
+                            cached_a_proj.pop(q_a_proj_name)
+                            cached_a_proj.pop(kv_a_proj_name)
+                    else:
+                        if (
+                            "k_scale" in name or "v_scale" in name
+                        ) and name not in params_dict:
+                            # modelopt attn kv scale is named differently
+                            for scale in ["k_scale", "v_scale"]:
+                                if scale in name:
+                                    name = name.replace(f"{scale[0]}_proj", "attn_mqa")
+                                    break
+                        if name not in params_dict:
+                            # modelopt ckpt contains not needed weights for MTP module:
+                            # model.decoder.self_attn.attn_mqa.v_scale and
+                            # model.decoder.self_attn.attn_mqa.k_scale
+                            logger.warning(f"{name} not found in params_dict.")
+                            continue
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        futures.append(
+                            executor.submit(weight_loader, param, loaded_weight)
+                        )
+        self.post_load_weights()
+
+
+EntryClass = [LongcatFlashForCausalLMNextN]
diff --git a/python/sglang/srt/models/mimo.py b/python/sglang/srt/models/mimo.py
new file mode 100644
index 000000000..2a89e7706
--- /dev/null
+++ b/python/sglang/srt/models/mimo.py
@@ -0,0 +1,171 @@
+# Adapted from qwen2.py
+
+from functools import partial
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP, Qwen2Model
+from sglang.srt.utils import add_prefix
+
+MiMoConfig = None
+
+
+class MiMoModel(Qwen2Model):
+    def __init__(
+        self,
+        config: MiMoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=Qwen2DecoderLayer,
+        )
+
+
+class MiMoForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: MiMoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MiMoModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if (
+                "rotary_emb.inv_freq" in name
+                or "projector" in name
+                or "mtp_layers" in name
+            ):
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = MiMoForCausalLM
diff --git a/python/sglang/srt/models/mimo_mtp.py b/python/sglang/srt/models/mimo_mtp.py
new file mode 100644
index 000000000..89e8c02cd
--- /dev/null
+++ b/python/sglang/srt/models/mimo_mtp.py
@@ -0,0 +1,204 @@
+# Adapted from https://github.com/vllm-project/vllm/pull/17433/files  and deepseek_nextn.py
+
+from functools import partial
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2DecoderLayer
+
+
+class MiMoMultiTokenPredictorLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.token_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hidden_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_proj = nn.Linear(
+            config.hidden_size * 2, config.hidden_size, bias=False
+        )
+        self.mtp_block = Qwen2DecoderLayer(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        # masking inputs at position 0, as not needed by MTP
+        hidden_states[positions == 0] = 0
+
+        hidden_states = self.input_proj(
+            torch.cat(
+                (
+                    self.hidden_layernorm(forward_batch.spec_info.hidden_states),
+                    self.token_layernorm(hidden_states),
+                ),
+                dim=-1,
+            )
+        )
+
+        hidden_states, residual = self.mtp_block(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            residual=None,
+        )
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class MiMoMTP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+
+        self.model = MiMoMultiTokenPredictorLayer(
+            config,
+            prefix,
+            quant_config,
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            name = self.map_model_name_to_mtp_param_name(name)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mtp_block" not in name:
+                    break
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if "mtp_block" not in name and (
+                    "embed_tokens" not in name
+                    and "lm_head" not in name
+                    and "token_layernorm" not in name
+                    and "hidden_layernorm" not in name
+                    and "input_proj" not in name
+                    and "final_layernorm" not in name
+                ):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def map_model_name_to_mtp_param_name(self, name: str) -> str:
+        import re
+
+        name_without_prefix = [
+            "token_layernorm",
+            "hidden_layernorm",
+            "input_proj",
+            "final_layernorm",
+        ]
+        pattern = r"model.mtp_layers.(\d+)."
+        group = re.match(pattern, name)
+        if group is not None:
+            for sub_name in name_without_prefix:
+                if sub_name in name:
+                    name = name.replace(group.group(), "model.")
+                    return name
+            name = name.replace(group.group(), "model.mtp_block.")
+        return name
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = MiMoMTP
diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py
new file mode 100644
index 000000000..e7c94c85d
--- /dev/null
+++ b/python/sglang/srt/models/minicpm.py
@@ -0,0 +1,399 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only MiniCPM model compatible with HuggingFace weights."""
+
+import math
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class MiniCPMMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class MiniCPMAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        orig_dtype = q.dtype
+        q, k = q.float(), k.float()
+        q, k = self.rotary_emb(positions, q, k)
+        q, k = q.to(orig_dtype), k.to(orig_dtype)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiniCPMDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = MiniCPMAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = MiniCPMMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        return hidden_states, None
+
+
+class MiniCPMModel(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                MiniCPMDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids) * self.config.scale_emb
+        else:
+            hidden_states = input_embeds
+        residual = None
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class MiniCPMForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        self.quant_config = quant_config
+        self.model = MiniCPMModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        if not self.config.tie_word_embeddings:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.scale_width = self.config.hidden_size / self.config.dim_model_base
+
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is not None:
+            input_embeds = input_embeds * self.config.scale_emb
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        hidden_states = hidden_states / self.scale_width
+        if self.config.tie_word_embeddings:
+            lm_head = self.model.embed_tokens
+        else:
+            lm_head = self.lm_head
+        return self.logits_processor(input_ids, hidden_states, lm_head, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = [
+            # (param_name, weight_name, expert_id)
+            (
+                "ws" if weight_name in ["w1", "w3"] else "w2s",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+            )
+            for expert_id in range(self.num_experts)
+            for weight_name in ["w1", "w2", "w3"]
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param, loaded_weight, weight_name, expert_id=expert_id
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = MiniCPMForCausalLM
diff --git a/python/sglang/srt/models/minicpm3.py b/python/sglang/srt/models/minicpm3.py
new file mode 100644
index 000000000..821dfa98a
--- /dev/null
+++ b/python/sglang/srt/models/minicpm3.py
@@ -0,0 +1,517 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only MiniCPM3 model compatible with HuggingFace weights."""
+
+import math
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, is_cuda
+
+if is_cuda():
+    from sgl_kernel import bmm_fp8
+
+
+class MiniCPM3MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+def input_to_float8(x, dtype=torch.float8_e4m3fn):
+    finfo = torch.finfo(dtype)
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    scale = finfo.max / amax
+    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+
+
+class MiniCPM3AttentionMLA(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id=None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_a_proj", prefix),
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_b_proj", prefix),
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_proj", prefix),
+            )
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("kv_a_proj_with_mqa", prefix),
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("kv_b_proj", prefix),
+        )
+        # O projection.
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+
+        self.attn = RadixAttention(
+            self.num_local_heads,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=1,
+            layer_id=layer_id,
+            v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.w_kc = None
+        self.w_vc = None
+        self.w_scale = None
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        q_len = hidden_states.shape[0]
+        q_input = hidden_states.new_empty(
+            q_len, self.num_local_heads, self.kv_lora_rank + self.qk_rope_head_dim
+        )
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        if self.w_kc.dtype == torch.float8_e4m3fn:
+            q_nope_val, q_nope_scale = input_to_float8(
+                q_nope.transpose(0, 1), torch.float8_e4m3fn
+            )
+            q_nope_out = bmm_fp8(
+                q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
+            )
+        else:
+            q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
+        q_input[..., : self.kv_lora_rank] = q_nope_out.transpose(0, 1)
+
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        v_input = latent_cache[..., : self.kv_lora_rank]
+        v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
+        k_input = latent_cache.unsqueeze(1)
+        k_input[..., : self.kv_lora_rank] = v_input
+        k_pe = k_input[..., self.kv_lora_rank :]
+
+        original_shapes = [q_pe.shape, k_pe.shape]
+        q_pe, k_pe = self.rotary_emb(
+            positions, q_pe.reshape(q_pe.shape[0], -1), k_pe.reshape(k_pe.shape[0], -1)
+        )
+        q_pe, k_pe = q_pe.view(original_shapes[0]), k_pe.view(original_shapes[1])
+        q_input[..., self.kv_lora_rank :] = q_pe
+        k_input[..., self.kv_lora_rank :] = k_pe
+
+        attn_output = self.attn(q_input, k_input, v_input, forward_batch)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        if self.w_vc.dtype == torch.float8_e4m3fn:
+            attn_output_val, attn_output_scale = input_to_float8(
+                attn_output.transpose(0, 1), torch.float8_e4m3fn
+            )
+            attn_bmm_output = bmm_fp8(
+                attn_output_val,
+                self.w_vc,
+                attn_output_scale,
+                self.w_scale,
+                torch.bfloat16,
+            )
+        else:
+            attn_bmm_output = torch.bmm(attn_output.transpose(0, 1), self.w_vc)
+        attn_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+        output, _ = self.o_proj(attn_output)
+
+        return output
+
+
+class MiniCPM3DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = MiniCPM3AttentionMLA(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=self.hidden_size // config.num_attention_heads,
+            q_lora_rank=(
+                config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+            ),
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.mlp = MiniCPM3MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        return hidden_states, None
+
+
+class MiniCPM3Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                MiniCPM3DecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids) * self.config.scale_emb
+        else:
+            hidden_states = input_embeds
+        residual = None
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class MiniCPM3ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        self.quant_config = quant_config
+        self.model = MiniCPM3Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        if not self.config.tie_word_embeddings:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.scale_width = self.config.hidden_size / self.config.dim_model_base
+
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is not None:
+            input_embeds = input_embeds * self.config.scale_emb
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        hidden_states = hidden_states / self.scale_width
+        if self.config.tie_word_embeddings:
+            lm_head = self.model.embed_tokens
+        else:
+            lm_head = self.lm_head
+        return self.logits_processor(input_ids, hidden_states, lm_head, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = [
+            # (param_name, weight_name, expert_id)
+            (
+                "ws" if weight_name in ["w1", "w3"] else "w2s",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+            )
+            for expert_id in range(self.num_experts)
+            for weight_name in ["w1", "w2", "w3"]
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param, loaded_weight, weight_name, expert_id=expert_id
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+        for layer_id in range(self.config.num_hidden_layers):
+            self_attn = self.model.layers[layer_id].self_attn
+            w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
+            if hasattr(self_attn.kv_b_proj, "weight_scale"):
+                self_attn.w_scale = self_attn.kv_b_proj.weight_scale
+            del self_attn.kv_b_proj
+
+
+EntryClass = MiniCPM3ForCausalLM
diff --git a/python/sglang/srt/models/minicpmo.py b/python/sglang/srt/models/minicpmo.py
new file mode 100644
index 000000000..2ce575411
--- /dev/null
+++ b/python/sglang/srt/models/minicpmo.py
@@ -0,0 +1,1914 @@
+# Copied and adapted from: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/modeling_minicpmo.py
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only MiniCPM-o model compatible with HuggingFace weights."""
+
+import math
+from dataclasses import dataclass
+from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.nn.utils.parametrize as P
+import torch.types
+from torch import nn
+from torch.nn.utils import parametrizations
+from tqdm import tqdm
+from transformers import LlamaConfig, LlamaModel, PretrainedConfig, PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.cache_utils import DynamicCache, EncoderDecoderCache
+from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from transformers.models.whisper.modeling_whisper import (
+    WhisperAttention,
+    WhisperConfig,
+    WhisperEncoder,
+)
+
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+    flatten_nested_list,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.utils import set_default_torch_dtype
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.minicpmv import MiniCPMBaseModel, Resampler2_5
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.utils import logger
+
+try:
+    from transformers import LogitsWarper
+    from vector_quantize_pytorch import GroupedResidualFSQ
+    from vocos import Vocos
+    from vocos.pretrained import instantiate_class
+
+    _tts_deps = True
+except:
+    LogitsWarper = None
+    _tts_deps = False
+
+
+def apply_spk_emb(
+    input_ids: torch.Tensor = None,
+    spk_emb: torch.Tensor = None,
+    input_embeds: torch.Tensor = None,
+    spk_emb_token_id: int = 0,
+    num_spk_embs: int = 1,
+):
+    """
+    Replace consecutive `num_spk_embs` speaker embedding placeholders in input_embeds with pre-prepared speaker embeddings. This is an in-place replacement, no new tensor is created, so no value is returned.
+
+    Args:
+        input_ids (torch.Tensor): Input ID tensor, shape [batch_size, seq_len_max]
+        spk_emb (torch.Tensor): Speaker embedding tensor, shape [batch_size, num_spk_emb, hidden_dim]
+        input_embeds (torch.Tensor): Input embedding tensor, shape [batch_size, seq_len_max, hidden_dim]
+        spk_emb_token_id (int): ID of the speaker embedding token
+        num_spk_embs (int): Number of speaker embeddings
+
+    Returns:
+        None
+    """
+
+    batch_size = input_ids.shape[0]
+
+    for idx in range(batch_size):
+        input_ids_ = input_ids[idx]  # [seq_len_max]
+        spk_emb_ = spk_emb[idx]  # [num_spk_emb]
+        mask_ = input_ids_ == spk_emb_token_id  # [batch_size, seq_len_max]
+        nonzero_position_idx = mask_.nonzero(as_tuple=False)  # [num_spk_emb, 1]
+        assert nonzero_position_idx.shape[0] == num_spk_embs
+        begin_idx = nonzero_position_idx.min()
+        end_idx = nonzero_position_idx.max()
+        input_embeds[idx, begin_idx : end_idx + 1, :] = spk_emb_
+
+    return
+
+
+@dataclass
+class ConditionalChatTTSGenerationOutput(ModelOutput):
+    """
+    Output class for ConditionalChatTTS generation.
+
+    Args:
+        new_ids (torch.LongTensor): Newly generated audio code sequence, shape (batch_size, sequence_length, num_vq).
+        audio_input_ids (torch.LongTensor): Updated input IDs including condition and generated audio codes, shape (batch_size, full_sequence_length, num_vq).
+        past_key_values (Tuple[Tuple[torch.FloatTensor]]): Tuple containing pre-computed keys and values used for attention mechanism. Each element has shape (batch_size, num_heads, sequence_length, embed_size_per_head).
+        finished (bool): Boolean indicating whether generation is complete.
+
+    """
+
+    new_ids: torch.LongTensor = None
+    audio_input_ids: torch.LongTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    finished: bool = None
+
+
+def make_streaming_chunk_mask_generation(
+    inputs_embeds: torch.Tensor,
+    past_seen_tokens: int,
+    streaming_tts_text_mask: torch.Tensor,
+    streaming_reserved_length: int = 300,
+    streaming_audio_chunk_size: int = 50,
+    streaming_text_chunk_size: int = 10,
+    num_spk_emb: int = 1,
+    use_spk_emb: bool = True,
+) -> torch.Tensor:
+    """
+    In streaming audio generation, determine which `text` positions the TTS model can attend to when generating each chunk of `audio` tokens.
+
+    This function creates a mask that allows the model to attend to a specific chunk of text
+    tokens when generating each chunk of audio tokens, enabling streaming TTS generation.
+
+    Args:
+        inputs_embeds (torch.Tensor): Input embeddings tensor.
+        past_seen_tokens (int): Number of tokens already seen by the model.
+        streaming_tts_text_mask (torch.Tensor): Mask for the text tokens.
+        streaming_reserved_length (int, optional): Number of reserved tokens for streaming. Defaults to 300.
+        streaming_text_chunk_size (int, optional): Size of each text chunk. Defaults to 7.
+
+    Returns:
+        torch.Tensor: Causal mask for streaming TTS generation, shape is [batch_size=1, 1, seq_len=1, past_seen_tokens+1]
+
+    Raises:
+        AssertionError: If the batch size is not 1 (only supports batch size of 1 for inference).
+    """
+    assert inputs_embeds.shape[0] == 1
+
+    dtype = inputs_embeds.dtype
+    device = inputs_embeds.device
+    min_dtype = torch.finfo(dtype).min
+
+    # Add `1` to the past seen tokens to account for new `tokens` during `generate`
+    causal_mask = torch.full(
+        (1, past_seen_tokens + inputs_embeds.shape[1]),
+        fill_value=0,
+        dtype=dtype,
+        device=device,
+    )
+
+    # Calculate the start of invisible text tokens
+    invisible_text_tokens_start = (
+        min(
+            math.ceil(
+                (past_seen_tokens - streaming_reserved_length)
+                / streaming_audio_chunk_size
+            )
+            * streaming_text_chunk_size,
+            streaming_reserved_length,
+        )
+        + 1
+        + num_spk_emb * use_spk_emb
+    )  # Add 1 for [Stts] and N for [spk_emb] tokens if `use_spk_emb` is True
+
+    invisible_text_tokens_end = (
+        streaming_reserved_length + 1 + num_spk_emb * use_spk_emb + 1
+    )  # Add 1 for [Ptts] (aka `audio_bos_token_id`)
+
+    # Set invisible text tokens to min_dtype (effectively -inf)
+    causal_mask[0, invisible_text_tokens_start:invisible_text_tokens_end] = min_dtype
+
+    # Mask padding positions in the text mask
+    causal_mask[
+        0, 0 : 1 + num_spk_emb * use_spk_emb + streaming_reserved_length + 1
+    ].masked_fill_(streaming_tts_text_mask == 0, min_dtype)
+
+    # Add extra dimensions for batch and heads
+    causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)
+
+    return causal_mask
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class ConvNeXtBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        kernel: int,
+        dilation: int,
+        layer_scale_init_value: float = 1e-6,
+    ):
+        # ConvNeXt Block copied from Vocos.
+        super().__init__()
+        self.dwconv = nn.Conv1d(
+            dim,
+            dim,
+            kernel_size=kernel,
+            padding=dilation * (kernel // 2),
+            dilation=dilation,
+            groups=dim,
+        )
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.coef = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value > 0
+            else None
+        )
+
+    def forward(self, x: torch.Tensor, cond=None) -> torch.Tensor:
+        residual = x
+
+        y = self.dwconv(x)
+        y.transpose_(1, 2)  # (B, C, T) -> (B, T, C)
+        x = self.norm(y)
+        del y
+        y = self.pwconv1(x)
+        del x
+        x = self.act(y)
+        del y
+        y = self.pwconv2(x)
+        del x
+        if self.coef is not None:
+            y *= self.coef
+        y.transpose_(1, 2)  # (B, T, C) -> (B, C, T)
+
+        x = y + residual
+        del y
+
+        return x
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class DVAEDecoder(nn.Module):
+    def __init__(
+        self,
+        idim: int,
+        odim: int,
+        n_layer=12,
+        bn_dim=64,
+        hidden=256,
+        kernel=7,
+        dilation=2,
+        up=False,
+    ):
+        super().__init__()
+        self.up = up
+        self.conv_in = nn.Sequential(
+            nn.Conv1d(idim, bn_dim, 3, 1, 1),
+            nn.GELU(),
+            nn.Conv1d(bn_dim, hidden, 3, 1, 1),
+        )
+        self.decoder_block = nn.ModuleList(
+            [
+                ConvNeXtBlock(
+                    hidden,
+                    hidden * 4,
+                    kernel,
+                    dilation,
+                )
+                for _ in range(n_layer)
+            ]
+        )
+        self.conv_out = nn.Conv1d(hidden, odim, kernel_size=1, bias=False)
+
+    def forward(self, x: torch.Tensor, conditioning=None) -> torch.Tensor:
+        # B, C, T
+        y = self.conv_in(x)
+        del x
+        for f in self.decoder_block:
+            y = f(y, conditioning)
+
+        x = self.conv_out(y)
+        del y
+        return x
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class GFSQ(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        levels: List[int],
+        G: int,
+        R: int,
+        eps=1e-5,
+        transpose=True,
+    ):
+        super(GFSQ, self).__init__()
+        self.quantizer = GroupedResidualFSQ(
+            dim=dim,
+            levels=list(levels),
+            num_quantizers=R,
+            groups=G,
+        )
+        self.n_ind = math.prod(levels)
+        self.eps = eps
+        self.transpose = transpose
+        self.G = G
+        self.R = R
+
+    def _embed(self, x: torch.Tensor):
+        if self.transpose:
+            x = x.transpose(1, 2)
+        x = x.view(x.size(0), x.size(1), self.G, self.R).permute(2, 0, 1, 3)
+        feat = self.quantizer.get_output_from_indices(x)
+        return feat.transpose_(1, 2) if self.transpose else feat
+
+    def __call__(self, x: torch.Tensor) -> torch.Tensor:
+        return super().__call__(x)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.transpose:
+            x.transpose_(1, 2)
+        _, ind = self.quantizer(x)
+        ind = ind.permute(1, 2, 0, 3).contiguous()
+        ind = ind.view(ind.size(0), ind.size(1), -1)
+        return ind.transpose_(1, 2) if self.transpose else ind
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class DVAE(nn.Module):
+    def __init__(
+        self,
+    ):
+        super().__init__()
+
+        coef = torch.rand(100)
+        self.coef = nn.Parameter(coef.unsqueeze(0).unsqueeze_(2))
+
+        self.downsample_conv = nn.Sequential(
+            nn.Conv1d(100, 512, 3, 1, 1),
+            nn.GELU(),
+            nn.Conv1d(512, 512, 4, 2, 1),
+            nn.GELU(),
+        )
+
+        self.encoder = DVAEDecoder(
+            idim=512,
+            odim=1024,
+            hidden=256,
+            n_layer=12,
+            bn_dim=128,
+        )
+
+        self.decoder = DVAEDecoder(
+            idim=512,
+            odim=512,
+            hidden=256,
+            n_layer=12,
+            bn_dim=128,
+        )
+
+        self.out_conv = nn.Conv1d(512, 100, 3, 1, 1, bias=False)
+
+        self.vq_layer = GFSQ(
+            dim=1024,
+            levels=(5, 5, 5, 5),
+            G=2,
+            R=2,
+        )
+
+    @torch.inference_mode()
+    def forward(
+        self, inp: torch.Tensor, mode: Literal["encode", "decode"] = "decode"
+    ) -> torch.Tensor:
+        if mode == "encode" and hasattr(self, "encoder") and self.vq_layer is not None:
+            mel = inp.clone()
+            x: torch.Tensor = self.downsample_conv(
+                torch.div(mel, self.coef.view(100, 1).expand(mel.shape), out=mel),
+            ).unsqueeze_(0)
+            del mel
+            x = self.encoder(x)
+            ind = self.vq_layer(x)
+            del x
+            return ind
+
+        if self.vq_layer is not None:
+            vq_feats = self.vq_layer._embed(inp)
+        else:
+            vq_feats = inp
+
+        vq_feats = (
+            vq_feats.view(
+                (vq_feats.size(0), 2, vq_feats.size(1) // 2, vq_feats.size(2)),
+            )
+            .permute(0, 2, 3, 1)
+            .flatten(2)
+        )
+
+        dec_out = self.out_conv(
+            self.decoder(
+                x=vq_feats,
+            ),
+        )
+
+        del vq_feats
+
+        return torch.mul(dec_out, self.coef, out=dec_out)
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/processors.py`
+class CustomRepetitionPenaltyLogitsProcessorRepeat:
+    def __init__(self, penalty: float, max_input_ids: int, past_window: int):
+        if not isinstance(penalty, float) or not (penalty > 0):
+            raise ValueError(
+                f"`penalty` has to be a strictly positive float, but is {penalty}"
+            )
+
+        self.penalty = penalty
+        self.max_input_ids = max_input_ids
+        self.past_window = past_window
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        if input_ids.size(1) > self.past_window:
+            input_ids = input_ids.narrow(1, -self.past_window, self.past_window)
+        freq = F.one_hot(input_ids, scores.size(1)).sum(1)
+        if freq.size(0) > self.max_input_ids:
+            freq.narrow(
+                0, self.max_input_ids, freq.size(0) - self.max_input_ids
+            ).zero_()
+        alpha = torch.pow(self.penalty, freq)
+        scores = scores.contiguous()
+        inp = scores.multiply(alpha)
+        oth = scores.divide(alpha)
+        con = scores < 0
+        out = torch.where(con, inp, oth)
+        del inp, oth, scores, con, alpha
+        return out
+
+
+class ConditionalChatTTS(PreTrainedModel):
+    """A conditional text-to-speech model that can generate speech from text with speaker conditioning.
+
+    This model extends PreTrainedModel to provide text-to-speech capabilities with:
+    - LLM hidden state conditioning
+    - Streaming generation
+
+    The model uses a transformer architecture with LLM hidden states and can operate in both
+    streaming and non-streaming modes for flexible deployment.
+
+    The model process sequence in the following format:
+    | text bos token | LLM embedding projected to tts embedding space | text tokens (fixed length, reserved for future tokens) | audio bos token | audio tokens (audio token length is not fixed)| audio eos token |
+
+    The format is designed to support LLM-conditioned streaming audio generation.
+
+    Usage:
+    To support streaming generation, two global variables should be maintained outside of the model.
+        1. `audio_input_ids`: stores *discrete* audio codes. It is a tensor with shape [1, sequence length+1, num_vq].
+        2. `past_key_values`: stores the KV cache for both text tokens and audio codes. It is a list of tuples, each tuple contains two tensors with shape [1, num_attention_heads, sequence length, hidden_size // num_attention_heads]
+
+    where `num_vq` is the number of audio codebooks, in default setting, it is `4`.
+
+    1. Create an empty `past_key_values` with
+    ```python
+    initial_kv_cache_length = 1 + model.num_spk_embs + model.streaming_text_reserved_len # where `1` denotes the `bos` token
+    dtype = model.emb_text.weight.dtype
+    device = model.emb_text.weight.device
+    past_key_values = [
+        (
+            torch.zeros(1, model.config.num_attention_heads, initial_kv_cache_length, model.config.hidden_size // model.config.num_attention_heads, dtype=dtype, device=device),
+            torch.zeros(1, model.config.num_attention_heads, initial_kv_cache_length, model.config.hidden_size // model.config.num_attention_heads, dtype=dtype, device=device)
+        )
+        for _ in range(model.config.num_hidden_layers)
+    ]
+
+    2. At the same time, create an empty `audio_input_ids` with shape [1, sequence length, num_vq], `num_vq` denotes multiple layer audio codebooks. But here we also include text tokens in the sequence, but they will be zeros, and will not be used, just a placeholder.
+
+    ```python
+    initial_audio_input_ids_length = 1 + model.num_spk_embs + model.streaming_text_reserved_len + 1
+    # [bos token, speaker embeddings, text tokens, audio bos token]
+    audio_input_ids = torch.zeros(batch_size=1, initial_audio_input_ids_length, model.num_vq)
+    ```
+
+    2. Prefill some text tokens to TTS model (for example, 10 tokens) using `prefill_text` method.
+
+    ```python
+    outputs = llm.generate(**kwargs)
+    llm_tokens = some_function_to_extract_llm_tokens(outputs)
+    lm_spk_emb_last_hidden_states = some_function_to_extract_lm_spk_emb_last_hidden_states(outputs)
+    tts_text_input_ids = tts_tokenizer.encode(llm_tokenizer.decode(llm_tokens))
+    # here assume we are prefilling text token 0 to text token 9 (included), totally 10 tokens.
+    begin = 0
+    end = 9+1
+    position_ids = torch.arange(begin, end, dtype=torch.long, device=device)
+
+    past_key_values = model.prefill_text(
+        input_ids=tts_text_input_ids,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        lm_spk_emb_last_hidden_states=lm_spk_emb_last_hidden_states,
+    )
+    ```
+
+    3. Make a `streaming_tts_text_mask` to denote which position contains valid text tokens, similar to `attention_mask` in standard causal attention.
+
+    ```python
+    streaming_tts_text_mask = torch.zeros(model.streaming_reserved_length)
+    streaming_tts_text_mask[0:end] = 1 # denotes these post
+    ```
+
+    3. Generate audio codes using `generate` method.
+
+    ```python
+    outputs = model.generate(
+        input_ids=audio_input_ids,
+        past_key_values=past_key_values,
+        streaming_tts_text_mask=streaming_tts_text_mask,
+        max_new_token=50,
+    )
+
+    # update past_key_values and input_ids
+    past_key_values = outputs.past_key_values
+    audio_input_ids = outputs.input_ids
+    ```
+
+    The `past_key_values` is extended by `max_new_token=50`, and `audio_input_ids` is also extended by `max_new_token=50` after `generate` calling.
+
+    4. Notice that after prefilling `10` text tokens, the model can generate up to `50` audio tokens, if you want to generate more audio tokens, you need to prefill next `10` text tokens. And it is okay to only generate `25` audio tokens for faster initial response.
+
+    5. Repeat steps `2,3,4` as needed in your streaming audio generation cases, but ensure usage complies with the following guidelines discussed above.
+    """
+
+    config_class = PretrainedConfig
+    _no_split_modules = []
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+
+        self.use_speaker_embedding = config.use_speaker_embedding
+        self.use_llm_hidden_state = config.use_llm_hidden_state
+        self.num_spk_embs = config.num_spk_embs
+        self.spk_emb_token_id = config.spk_emb_token_id
+
+        self.use_text = config.use_text
+        self.streaming = config.streaming
+        self.streaming_text_chunk_size = config.streaming_text_chunk_size
+        self.streaming_audio_chunk_size = config.streaming_audio_chunk_size
+        self.streaming_text_reserved_len = config.streaming_text_reserved_len
+        self.audio_bos_token_id = config.audio_bos_token_id
+        self.num_mel_bins = config.num_mel_bins
+        self.num_vq = config.num_vq
+        self.num_audio_tokens = config.num_audio_tokens
+
+        self.top_p = config.top_p
+        self.top_k = config.top_k
+        self.repetition_penalty = config.repetition_penalty
+
+        if self.config.use_mlp:
+            self.projector = MultiModalProjector(config.llm_dim, config.hidden_size)
+        else:
+            self.projector = nn.Linear(config.llm_dim, config.hidden_size, bias=False)
+        self.emb_code = nn.ModuleList(
+            [
+                nn.Embedding(config.num_audio_tokens, config.hidden_size)
+                for _ in range(config.num_vq)
+            ]
+        )
+        self.emb_text = nn.Embedding(config.num_text_tokens, config.hidden_size)
+        self.head_code = nn.ModuleList(
+            [
+                parametrizations.weight_norm(
+                    nn.Linear(config.hidden_size, config.num_audio_tokens, bias=False),
+                    name="weight",
+                )
+                for _ in range(config.num_vq)
+            ]
+        )
+
+        dvae = DVAE()
+        self.dvae = dvae
+
+        model_config = LlamaConfig(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            num_attention_heads=config.num_attention_heads,
+            num_hidden_layers=config.num_hidden_layers,
+            max_position_embeddings=config.max_position_embeddings,
+            attn_implementation=config.attn_implementation,
+        )
+
+        model = LlamaModel(model_config)
+        self.model = model
+
+    @torch.inference_mode()
+    def merge_inputs_embeds(
+        self,
+        input_ids: torch.Tensor,
+        lm_spk_emb_last_hidden_states: Optional[torch.Tensor] = None,
+    ):
+        """Merge `input_ids` and `lm_spk_emb_last_hidden_states` to `inputs_embeds`.
+
+        Args:
+            input_ids (torch.Tensor): Input token IDs.
+            lm_spk_emb_last_hidden_states (Optional[torch.Tensor], optional): Last hidden states of speaker embeddings from the language model. Defaults to None.
+
+        Raises:
+            NotImplementedError: If speaker embedding is not used and language model hidden states are not implemented.
+
+        Returns:
+            torch.Tensor: Prepared input embeddings for the model.
+        """
+        assert input_ids.shape[0] == 1
+
+        # Embed input_ids to input_embeds
+        inputs_embeds = self.emb_text(input_ids)
+
+        # Inject speaker embedding to input_embeds if it exists
+        if self.use_speaker_embedding:
+            spk_emb_mask = input_ids == self.spk_emb_token_id
+            if spk_emb_mask.any():
+                assert lm_spk_emb_last_hidden_states is not None
+                # Project spk emb to tts hidden size first, [batch_size, num_spk_emb, llm_dim] -> [batch_size, num_spk_emb, self.hidden_size]
+                lm_spk_emb_last_hidden_states = lm_spk_emb_last_hidden_states.to(
+                    self.projector.linear1.weight.dtype
+                )
+                projected_spk_emb = self.projector(lm_spk_emb_last_hidden_states)
+                projected_spk_emb = F.normalize(projected_spk_emb, p=2, dim=-1)
+                apply_spk_emb(
+                    input_ids=input_ids,
+                    spk_emb=projected_spk_emb,
+                    input_embeds=inputs_embeds,
+                    spk_emb_token_id=self.spk_emb_token_id,
+                    num_spk_embs=self.num_spk_embs,
+                )
+        else:
+            raise NotImplementedError
+
+        return inputs_embeds
+
+    @torch.inference_mode()
+    def prefill_text(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.LongTensor,
+        past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
+        lm_spk_emb_last_hidden_states: Optional[torch.Tensor] = None,
+    ):
+        """Prefill a chunk of new text tokens in streaming setting.
+        Specifically speaking, update `past_key_values` using new text tokens, then the model will read the new text tokens.
+
+        Args:
+            input_ids (Tensor): Tensor of shape [batch_size, seq_len]
+            position_ids (LongTensor): Tensor of shape [batch_size, seq_len]
+            past_key_values (List[Tuple[Tensor]]): KV Cache of all layers, each layer is a tuple (Tensor, Tensor) denoting keys and values. Each tensor is of seq_len = `self.streaming_text_reserved_len`. `past_key_values` will be updated.
+            lm_spk_emb_last_hidden_states (Tensor, optional): Tensor of shape [batch_size, num_spk_emb, llm_dim]. Defaults to None.
+
+        Note that all `batch_size` should be `1`.
+        """
+        assert input_ids.shape[0] == 1
+        assert past_key_values is not None
+
+        # Merge text and LLM embeddings
+        inputs_embeds = self.merge_inputs_embeds(
+            input_ids=input_ids,
+            lm_spk_emb_last_hidden_states=lm_spk_emb_last_hidden_states,
+        )
+
+        # Clone KV Cache
+        past_key_values_for_prefill = []
+        for i in range(len(past_key_values)):
+            past_key_values_for_prefill.append(
+                (
+                    past_key_values[i][0][:, :, : position_ids[:, 0], :].clone(),
+                    past_key_values[i][1][:, :, : position_ids[:, 0], :].clone(),
+                )
+            )
+
+        # ModelMiniCPMVBaseModel
+        outputs_prefill: BaseModelOutputWithPast = self.model(
+            attention_mask=None,  # because for text, it is standard causal attention mask, do nothing
+            position_ids=position_ids,  # position_ids denotes the position of new text tokens in the sequence
+            past_key_values=past_key_values_for_prefill,  # `past_key_values` will be updated by the model
+            inputs_embeds=inputs_embeds,  # contains text and language model embedding
+            use_cache=True,
+            output_attentions=False,
+            cache_position=position_ids,  # which new positions will use this cache, basically the same as position_ids
+        )
+
+        # Get model updated KV Cache
+        past_key_values_for_prefill_updated = outputs_prefill.past_key_values
+
+        # Update generated KV Cache to input `past_key_values`
+        for layer_idx in range(len(past_key_values)):
+            # Update keys
+            past_key_values[layer_idx][0][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1, :
+            ] = past_key_values_for_prefill_updated[layer_idx][0][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1
+            ].clone()
+            # Update values
+            past_key_values[layer_idx][1][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1, :
+            ] = past_key_values_for_prefill_updated[layer_idx][1][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1
+            ].clone()
+
+        # TODO: del past_key_values_for_prefill_updated recursively
+        # TODO: del outputs_prefill recursively
+
+        return past_key_values
+
+    @torch.inference_mode()
+    def prefill_audio_ids(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
+        streaming_tts_text_mask=None,
+        add_audio_bos: bool = True,
+    ):
+        """Prefill a chunk of audio ids to the model. Used in sliding-window long audio generation.
+        Specifically, prefill many audio ids (typically from last window) to the model in the new window.
+
+        Args:
+            input_ids (torch.Tensor): (1, seq_len, num_vq) Audio input token ids.
+            past_key_values (List[Tuple[torch.Tensor, torch.Tensor]]): Past key values for attention mechanism.
+        """
+        assert input_ids.shape[0] == 1
+        assert past_key_values is not None
+
+        code_emb = [self.emb_code[i](input_ids[:, :, i]) for i in range(self.num_vq)]
+        inputs_embeds = torch.stack(code_emb, 3).sum(3)  # [1,seq_len,768]
+        input_len = input_ids.shape[1]
+
+        if add_audio_bos:
+            narrowed_input_ids = torch.tensor(
+                [[self.audio_bos_token_id]], dtype=torch.long, device=self.device
+            )
+            bos_inputs_embeds = self.emb_text(narrowed_input_ids)
+            inputs_embeds = torch.cat([bos_inputs_embeds, inputs_embeds], dim=1)
+            input_len += 1
+
+        past_key_values_length = past_key_values[0][0].shape[2]
+        position_ids = torch.arange(
+            past_key_values_length,
+            past_key_values_length + input_len,
+            dtype=torch.long,
+            device=self.device,
+        ).unsqueeze(0)
+
+        cache_position = position_ids.clone()
+        causal_mask = make_streaming_chunk_mask_generation(
+            inputs_embeds=inputs_embeds,
+            past_seen_tokens=past_key_values[0][0].shape[2],
+            streaming_tts_text_mask=streaming_tts_text_mask,
+            streaming_reserved_length=self.streaming_text_reserved_len,
+            streaming_text_chunk_size=self.streaming_text_chunk_size,
+        )  # [1, 1, 1, past_key_values_length + input_len]
+
+        # Model forward
+        outputs: BaseModelOutputWithPast = self.model(
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=True,
+            output_attentions=False,
+            cache_position=cache_position,
+        )
+        past_key_values = outputs.past_key_values
+        return past_key_values
+
+    @torch.inference_mode()
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
+        temperature: torch.Tensor,
+        eos_token: Union[int, torch.Tensor],
+        streaming_tts_text_mask=None,
+        force_no_stop=False,
+        min_new_token=10,
+        max_new_token=50,
+        logits_warpers: List[LogitsWarper] = [],
+        logits_processors: List[CustomRepetitionPenaltyLogitsProcessorRepeat] = [],
+        show_tqdm=False,
+    ):
+        """Generate audio codes in streaming setting or non-streaming setting.
+        Specifically speaking, generate audio codes when not all text tokens are prefilled.
+
+        Always pass a valid `past_key_values` to the method. The method does not do `prefill` by itself. It relies on `prefill_text` method to provide valid `past_key_values`. Please refer to docstring of this class for more details.
+
+        In this method, we borrowed a lot of codes from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/gpt.py`.
+
+        Args:
+            input_ids (torch.Tensor): Input token ids.
+            past_key_values (List[Tuple[torch.Tensor, torch.Tensor]]): Past key values for attention mechanism.
+            temperature (torch.Tensor): Temperature for sampling.
+            eos_token (Union[int, torch.Tensor]): End of sequence token.
+            streaming_tts_text_mask (Optional[torch.Tensor], optional): Mask for streaming TTS text. Defaults to None.
+            max_new_token (int, optional): Maximum number of new tokens to generate. Defaults to 50.
+            logits_warpers (List[LogitsWarper], optional): List of logits warpers. Defaults to [].
+            logits_processors (List[CustomRepetitionPenaltyLogitsProcessorRepeat], optional): List of logits processors. Defaults to [].
+            show_tqdm (bool, optional): Whether to show progress bar. Defaults to True.
+
+        Returns:
+            GenerationOutputs: Generation outputs.
+        """
+
+        # We only support batch size `1` for now
+        assert input_ids.shape[0] == 1
+        assert past_key_values is not None
+
+        # fix: this should not be `input_ids.shape[1]`
+        # start_idx = input_ids.shape[1]
+        start_idx = (
+            1
+            + self.num_spk_embs * self.use_speaker_embedding
+            + self.streaming_text_reserved_len
+            + 1
+        )
+
+        finish = torch.zeros(input_ids.shape[0], device=input_ids.device).bool()
+
+        temperature = (
+            temperature.unsqueeze(0)
+            .expand(input_ids.shape[0], -1)
+            .contiguous()
+            .view(-1, 1)
+        )
+
+        progress = input_ids.shape[1]
+
+        # Pre-allocate input_ids, shape is [batch_size=1, max_possible_seq_len, self.num_vqs]
+        input_ids_buf = torch.zeros(
+            input_ids.shape[0],  # batch_size
+            progress
+            + max_new_token,  # max_possible_seq_len = input_ids.shape[1] + max_new_token
+            input_ids.shape[2],  # self.num_vqs
+            dtype=input_ids.dtype,
+            device=input_ids.device,
+        )
+
+        # Copy existing `input_ids` to `input_ids_buf`
+        input_ids_buf.narrow(1, 0, progress).copy_(input_ids)
+
+        del input_ids
+        input_ids = input_ids_buf.narrow(1, 0, progress)
+
+        pbar: Optional[tqdm] = None
+        if show_tqdm:
+            pbar = tqdm(
+                total=max_new_token,
+                desc="code",
+                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}(max) [{elapsed}, {rate_fmt}{postfix}]",
+            )
+
+        condition_length = (
+            1
+            + self.num_spk_embs * self.use_speaker_embedding
+            + self.streaming_text_reserved_len
+            + 1
+        )
+
+        for i in range(max_new_token):
+            # Prepare generation inputs
+            audio_bos = False
+
+            # If this is the first audio token, the case is SPECIAL
+            if progress == condition_length:
+                audio_bos = True
+
+            assert progress == (
+                past_key_values[0][0].shape[2] + 1
+            )  # If you are using according to the guidelines, this should be passed.
+
+            if audio_bos:
+                # Generate the first token, activate the model with `self.audio_bos_token_id`, the model will predict
+                # a new audio token. This is a special case because without the `audio bos token`, it is impossible
+                # to generate the first audio token in our streaming setting.
+                narrowed_input_ids = torch.tensor(
+                    [[self.audio_bos_token_id]], dtype=torch.long, device=self.device
+                )
+                inputs_embeds = self.emb_text(narrowed_input_ids)
+                del narrowed_input_ids
+            else:
+                # Generate the following audio tokens, it is applicable to all other cases, including second and the
+                # following calling of `generate`.
+                narrowed_input_ids = input_ids.narrow(
+                    dim=1, start=input_ids.shape[1] - 1, length=1
+                )
+                code_emb = [
+                    self.emb_code[i](narrowed_input_ids[:, :, i])
+                    for i in range(self.num_vq)
+                ]
+                inputs_embeds = torch.stack(code_emb, 3).sum(3)
+
+            position_ids = torch.tensor(
+                [past_key_values[0][0].shape[2]], dtype=torch.long, device=self.device
+            ).unsqueeze(0)
+
+            cache_position = position_ids.clone()
+
+            # Make causal mask
+            causal_mask = make_streaming_chunk_mask_generation(
+                inputs_embeds=inputs_embeds,
+                past_seen_tokens=past_key_values[0][0].shape[2],
+                streaming_tts_text_mask=streaming_tts_text_mask,
+                streaming_reserved_length=self.streaming_text_reserved_len,
+                streaming_text_chunk_size=self.streaming_text_chunk_size,
+            )
+
+            # Model forward
+            outputs: BaseModelOutputWithPast = self.model(
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                output_attentions=False,
+                cache_position=cache_position,
+            )
+
+            del position_ids
+            del inputs_embeds
+            del cache_position
+            del causal_mask
+
+            hidden_states = outputs.last_hidden_state
+            past_key_values = outputs.past_key_values
+
+            with P.cached():
+                logits = torch.empty(
+                    hidden_states.size(0),
+                    hidden_states.size(1),
+                    self.num_audio_tokens,
+                    self.num_vq,
+                    dtype=torch.float,
+                    device=self.device,
+                )
+                for num_vq_iter in range(self.num_vq):
+                    x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
+                    logits[..., num_vq_iter] = x
+                    del x
+
+            del hidden_states
+
+            # logits = logits[:, -1].float()
+            logits = logits.narrow(1, -1, 1).squeeze_(1).float()
+
+            # logits = rearrange(logits, "b c n -> (b n) c")
+            logits = logits.permute(0, 2, 1)
+            logits = logits.reshape(-1, logits.size(2))
+            # logits_token = rearrange(input_ids[:, start_idx:], "b c n -> (b n) c")
+            input_ids_sliced = input_ids.narrow(
+                1,
+                start_idx,
+                input_ids.size(1) - start_idx,
+            ).permute(0, 2, 1)
+            logits_token = input_ids_sliced.reshape(
+                input_ids_sliced.size(0) * input_ids_sliced.size(1),
+                -1,
+            ).to(self.device)
+            del input_ids_sliced
+
+            logits /= temperature
+
+            if not audio_bos:
+                for logitsProcessors in logits_processors:
+                    logits = logitsProcessors(logits_token, logits)
+            if not audio_bos:
+                for logitsWarpers in logits_warpers:
+                    logits = logitsWarpers(logits_token, logits)
+
+            del logits_token
+
+            if i < min_new_token:
+                logits[:, eos_token] = -torch.inf
+
+            if force_no_stop:
+                logits[:, eos_token] = -torch.inf
+
+            scores = F.softmax(logits, dim=-1)
+
+            del logits
+            idx_next = torch.multinomial(scores, num_samples=1)  # .to(finish.device)
+
+            del scores
+
+            # idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
+            idx_next = idx_next.view(-1, self.num_vq)
+            finish_or = idx_next.eq(eos_token).any(1)
+            finish.logical_or_(finish_or)
+
+            del finish_or
+            # Store new `token` into `input_ids_buf`
+            input_ids_buf.narrow(1, progress, 1).copy_(idx_next.unsqueeze_(1))
+
+            if i == 0 and finish.any():
+                # raise Exception
+                break
+
+            del idx_next
+            progress += 1
+            input_ids = input_ids_buf.narrow(1, 0, progress)
+
+            if finish.all():
+                break
+
+            if pbar is not None:
+                pbar.update(1)
+
+        if pbar is not None:
+            pbar.close()
+
+        if not finish.all():
+            if show_tqdm:
+                logger.info(f"incomplete result. hit max_new_token: {max_new_token}")
+
+        del input_ids_buf
+
+        if finish.all():
+            # the last may contains eos token
+            genrated_input_ids = input_ids[:, condition_length:-1, :]
+        else:
+            # there is no eos token
+            genrated_input_ids = input_ids[:, condition_length:, :]
+
+        return ConditionalChatTTSGenerationOutput(
+            new_ids=genrated_input_ids,
+            audio_input_ids=input_ids,  # for update purpose
+            past_key_values=past_key_values,  # for update purpose
+            finished=finish.all(),
+        )
+
+    @torch.inference_mode()
+    def decode_to_mel_specs(
+        self,
+        result_list: List[torch.Tensor],
+    ):
+        """Decode discrete audio codes to mel spectrograms.
+
+        Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/core.py`
+
+        Args:
+            result_list (List[torch.Tensor]): Audio codes output from `generate`.
+
+        Returns:
+            torch.Tensor: Mel spectrograms.
+        """
+
+        decoder = self.dvae
+        max_x_len = -1
+        if len(result_list) == 0:
+            return np.array([], dtype=np.float32)
+        for result in result_list:
+            if result.size(0) > max_x_len:
+                max_x_len = result.size(0)
+        batch_result = torch.zeros(
+            (len(result_list), result_list[0].size(1), max_x_len),
+            dtype=result_list[0].dtype,
+            device=result_list[0].device,
+        )
+        for i in range(len(result_list)):
+            src = result_list[i]
+            batch_result[i].narrow(1, 0, src.size(0)).copy_(src.permute(1, 0))
+            del src
+
+        mel_specs = decoder(batch_result)
+        del batch_result
+        return mel_specs
+
+
+# Copied from transformers.models.whisper.modeling_whisper.WhisperEncoderLayer and add use_cache for streaming inference
+class MiniCPMWhisperEncoderLayer(nn.Module):
+    def __init__(self, config: WhisperConfig, layer_idx: int = None):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = False,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, embed_dim)`):
+                Hidden states to be fed into the encoder layer.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, 1, tgt_len, src_len)`):
+                Attention mask where padding elements are indicated by large negative values.
+            layer_head_mask (`torch.FloatTensor` of shape `(encoder_attention_heads,)`):
+                Mask to nullify selected heads of the attention modules.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attention weights.
+            past_key_values (`EncoderDecoderCache`, *optional*):
+                Past key-value pairs used for incremental decoding.
+            use_cache (`bool`, *optional*):
+                Whether or not to return updated `past_key_values` for caching.
+
+        Returns:
+            A tuple of shape `(hidden_states, optional(attn_weights), optional(past_key_values))`.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # TODO (lifuhuang): confirmed with Mick that the logic for past_key_values is copied from minicpmo official code,
+        # currently we are not using past_key_values at all. We need to redesign the caching logic when we support streaming
+        # in the future.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            past_key_value=past_key_values,
+        )
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=False
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.activation_dropout, training=False
+        )
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=False
+        )
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value
+            )
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        if use_cache:
+            outputs += (past_key_values,)
+
+        return outputs
+
+
+# Copied from from transformers.models.whisper.modeling_whisper.WhisperEncoder and add use_cache for streaming inference
+class MiniCPMWhisperEncoder(WhisperEncoder):
+
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [
+                MiniCPMWhisperEncoderLayer(config, layer_idx=i)
+                for i in range(config.encoder_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+    ):
+        r"""
+        Forward pass of the Whisper encoder.
+
+        Args:
+            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
+                Float values of log-mel features extracted from the raw audio waveform. Typically generated
+                by a feature extractor (e.g., `WhisperFeatureExtractor`) that processes `.flac` or `.wav`
+                files into padded 2D mel spectrogram frames. These features are projected via convolution layers
+                (`conv1` and `conv2`) and then transformed into embeddings for the encoder.
+
+            attention_mask (`torch.Tensor`, *optional*):
+                Not used by Whisper for masking `input_features`, but included for API compatibility with
+                other models. If provided, it is simply ignored within the model. By default, Whisper
+                effectively ignores silence in the input log-mel spectrogram.
+
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected attention heads. The elements should be either 1 or 0, where:
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked** (i.e., the attention head is dropped).
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attention tensors of all encoder layers. If set to `True`, the
+                returned tuple (or `BaseModelOutputWithPast`) will contain an additional element with
+                attention weights for each encoder layer.
+
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. If set to `True`, the returned
+                tuple (or `BaseModelOutputWithPast`) will contain a tuple of hidden states, including the
+                initial embedding output as well as the outputs of each layer.
+
+            return_dict (`bool`, *optional*):
+                Whether or not to return a `BaseModelOutputWithPast` (a subclass of `ModelOutput`) instead
+                of a plain tuple. If set to `True`, the output will be a `BaseModelOutputWithPast` object,
+                otherwise it will be a tuple.
+
+            past_key_values (`EncoderDecoderCache`, *optional*):
+                When using caching for faster inference, this is an object that stores the key-value pairs
+                for attention states. If provided, the model will append new states to the existing cache
+                and return the updated cache. This speeds up sequential decoding or chunked inference.
+
+                - If `past_key_values` is `None`, no past states are used or returned.
+                - If `past_key_values` is not `None` and `use_cache=True`, the model will use the provided
+                cache and return the updated cache (as `next_encoder_cache`).
+
+            use_cache (`bool`, *optional*):
+                Whether or not the model should use caching (`past_key_values`) to speed up processing
+                during inference. When set to `True`, the model will:
+                - Inspect and use `past_key_values` if provided.
+                - Return updated `past_key_values` (under the name `next_encoder_cache` in
+                    `BaseModelOutputWithPast`).
+
+        Returns:
+            `BaseModelOutputWithPast` or `tuple` (depending on `return_dict`):
+                If `return_dict=True`, a `BaseModelOutputWithPast` is returned, which contains:
+                - **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                The output of the final encoder layer.
+                - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned if `output_hidden_states=True`):
+                Hidden states of the model at each layer (including the initial projection).
+                - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned if `output_attentions=True`):
+                Attention weights from each encoder layer.
+                - **past_key_values** (an object of type `EncoderDecoderCache` or `None`, *optional*):
+                Updated cache of key-value pairs if `use_cache=True`.
+
+                If `return_dict=False`, a tuple is returned, where the format is:
+                `(last_hidden_state, hidden_states, attentions)`, with `hidden_states` and `attentions`
+                only present if their respective `output_*` arguments are set to `True`.
+
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        # Ignore copy
+        input_features = input_features.to(
+            dtype=self.conv1.weight.dtype, device=self.conv1.weight.device
+        )
+
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+
+        embed_pos = self.embed_positions.weight
+        past_key_values_length = 0
+        if use_cache:
+            if past_key_values is None:
+                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
+            elif isinstance(past_key_values, list):
+                past_key_values = EncoderDecoderCache(
+                    DynamicCache.from_legacy_cache(past_key_values), DynamicCache()
+                )
+            elif isinstance(past_key_values, DynamicCache):
+                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
+            else:
+                pass
+            past_key_values_length = (
+                past_key_values.self_attention_cache.get_usable_length(
+                    inputs_embeds.shape[1]
+                )
+            )
+            if inputs_embeds.shape[1] + past_key_values_length > embed_pos.shape[0]:
+                logger.warning(
+                    "seems the audio is longer than 30s. repeating the last part of the audio"
+                )
+                embed_pos_front = embed_pos[past_key_values_length:, :]
+                embed_pos = torch.cat(
+                    (
+                        embed_pos_front,
+                        torch.repeat_interleave(
+                            embed_pos[-1, :].unsqueeze(0),
+                            inputs_embeds.shape[1]
+                            - embed_pos.shape[0]
+                            + past_key_values_length,
+                            dim=0,
+                        ),
+                    )
+                )
+            else:
+                embed_pos = embed_pos[
+                    past_key_values_length : inputs_embeds.shape[1]
+                    + past_key_values_length,
+                    :,
+                ]
+        else:
+            embed_pos = embed_pos[: inputs_embeds.shape[1], :]
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=False
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+
+            # Ignore copy
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    output_attentions=output_attentions,
+                    past_key_values=past_key_values,
+                    use_cache=use_cache,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_encoder_cache = layer_outputs[2 if output_attentions else 1]
+            else:
+                next_encoder_cache = None
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, encoder_states, all_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+            past_key_values=next_encoder_cache,
+        )
+
+
+class MultiModalProjector(nn.Module):
+    def __init__(self, in_dim, out_dim):
+        super().__init__()
+        self.linear1 = nn.Linear(in_features=in_dim, out_features=out_dim, bias=True)
+        self.relu = nn.ReLU()
+        self.linear2 = nn.Linear(in_features=out_dim, out_features=out_dim, bias=True)
+
+    def forward(self, audio_features):
+        hidden_states = self.relu(self.linear1(audio_features))
+        hidden_states = self.linear2(hidden_states)
+        return hidden_states
+
+
+class MiniCPMO(MiniCPMBaseModel):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__(config=config, quant_config=quant_config)
+
+        self.llm = self.init_llm(config=config, quant_config=quant_config)
+
+        self.embed_dim = self.llm.config.hidden_size
+
+        # init vision module
+        if self.config.init_vision:
+            # print("vision-understanding enabled")
+            self.vpm = self.init_vision_module(config=config, quant_config=quant_config)
+            self.vision_dim = self.vpm.embed_dim
+            self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
+
+        # init audio module
+        self.config.init_audio = True
+        if self.config.init_audio:
+            # print("audio-understanding enabled")
+            self.apm = self.init_audio_module()
+            audio_output_dim = int(self.apm.config.encoder_ffn_dim // 4)
+            self.audio_avg_pooler = nn.AvgPool1d(
+                self.config.audio_pool_step, stride=self.config.audio_pool_step
+            )
+            self.audio_projection_layer = MultiModalProjector(
+                in_dim=audio_output_dim, out_dim=self.embed_dim
+            )
+            self.audio_encoder_layer = -1
+
+        # init tts module
+        self.config.init_tts = False
+        logger.info("TTS is disabled for now")
+        if self.config.init_tts:
+            # print("tts enabled")
+            assert (
+                _tts_deps
+            ), "please make sure vector_quantize_pytorch and vocos are installed."
+            self.tts = self.init_tts_module()
+
+    def init_tts_module(self):
+        model = ConditionalChatTTS(self.config.tts_config)
+        return model
+
+    def init_audio_module(self):
+        model = MiniCPMWhisperEncoder(self.config.audio_config)
+        return model
+
+    def init_llm(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return Qwen2ForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            self.config.vision_config._attn_implementation = "flash_attention_2"
+        else:
+            self.config.vision_config._attn_implementation = "eager"
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def pad_input_ids(self, input_ids: List[int], mm_input: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = mm_input.im_start_id
+        im_end_id: int = mm_input.im_end_id
+        slice_start_id: int = mm_input.slice_start_id
+        slice_end_id: int = mm_input.slice_end_id
+
+        data_token_pairs = [
+            (im_start_id, im_end_id),
+            (slice_start_id, slice_end_id),
+            (mm_input.audio_start_id, mm_input.audio_end_id),
+        ]
+        data_start_token_ids = [im_start_id, mm_input.audio_start_id]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(
+            data_token_pairs=data_token_pairs, data_start_token_ids=data_start_token_ids
+        )
+
+        return pattern.pad_input_tokens(input_ids, mm_input)
+
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers and the output length of the audio encoder
+        """
+        input_lengths_after_cnn = (input_lengths - 1) // 2 + 1
+        input_lengths_after_pooling = (
+            input_lengths_after_cnn - self.config.audio_pool_step
+        ) // self.config.audio_pool_step + 1
+        input_lengths_after_pooling = input_lengths_after_pooling.to(dtype=torch.int32)
+
+        return input_lengths_after_cnn, input_lengths_after_pooling
+
+    def get_audio_embedding_streaming(self, items: List[MultimodalDataItem]):
+        r"""
+        Extract audio embeddings in a streaming manner using cached key-value pairs.
+
+        This method processes incoming audio features incrementally and stores/updates `past_key_values`
+        for faster inference on subsequent audio frames. It only supports batch_size=1 and is intended
+        for streaming scenarios.
+
+        Returns:
+            List[List[torch.Tensor]]: audio embeddings
+        """
+        wavforms = flatten_nested_list([item.feature for item in items if item.feature])
+        # list, [[x1, x2], [y1], [z1]]
+        audio_feature_lens_raw = flatten_nested_list(
+            [item.audio_feature_lens for item in items if item.audio_feature_lens]
+        )
+
+        # exist audio
+        if len(wavforms) > 0:
+            audio_feature_lens = torch.hstack(audio_feature_lens_raw)
+            batch_size, _, max_mel_seq_len = wavforms.shape
+            assert batch_size == 1
+            max_seq_len = (max_mel_seq_len - 1) // 2 + 1
+
+            if self.audio_past_key_values is not None:
+                cache_length = self.audio_past_key_values[0][0].shape[2]
+                apm_max_len = self.apm.embed_positions.weight.shape[0]
+                if cache_length + max_seq_len >= apm_max_len:
+                    logger.warning(
+                        f"audio_past_key_values length {cache_length + max_seq_len} exceed {apm_max_len}, reset."
+                    )
+                    self.audio_past_key_values = None
+
+            audio_outputs = self.apm(
+                wavforms, past_key_values=self.audio_past_key_values, use_cache=True
+            )
+            audio_states = (
+                audio_outputs.last_hidden_state
+            )  # [:, :audio_feat_lengths, :]
+            self.audio_past_key_values = audio_outputs.past_key_values
+
+            audio_embeds = self.audio_projection_layer(audio_states)
+
+            audio_embeds = audio_embeds.transpose(1, 2)
+            audio_embeds = self.audio_avg_pooler(audio_embeds)
+            audio_embeds = audio_embeds.transpose(1, 2)
+
+            _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(
+                audio_feature_lens
+            )
+
+            num_audio_tokens = feature_lens_after_pooling
+
+            final_audio_embeds = []
+            idx = 0
+            for i in range(len(audio_feature_lens_raw)):
+                target_audio_embeds = []
+                for _ in range(len(audio_feature_lens_raw[i])):
+                    target_audio_embeds.append(
+                        audio_embeds[idx, : num_audio_tokens[idx], :]
+                    )
+                    idx += 1
+                final_audio_embeds.append(target_audio_embeds)
+            return final_audio_embeds
+        else:
+            return []
+
+    def subsequent_chunk_mask(
+        self,
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+        device: torch.device = torch.device("cpu"),
+        num_lookhead: int = 0,
+    ) -> torch.Tensor:
+        """Create mask for subsequent steps (size, size) with chunk size,
+        this is for streaming encoder
+
+        Args:
+            size (int): size of mask
+            chunk_size (int): size of chunk
+            num_left_chunks (int): number of left chunks
+                <0: use full chunk
+                >=0: use num_left_chunks
+            device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+
+        Returns:
+            torch.Tensor: mask
+
+        """
+        ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+        for i in range(size):
+            if num_left_chunks < 0:
+                start = 0
+            else:
+                start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
+            ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, size)
+            ret[i, start:ending] = True
+        return ret
+
+    def get_audio_embedding(self, items: List[MultimodalDataItem], chunk_length=-1):
+        r"""
+        Extract full audio embeddings with optional chunk-based attention.
+
+        This method computes embeddings for all audio frames at once, either using full attention (when
+        `chunk_length` is -1) or chunk-based attention (when `chunk_length` is a positive number). It does
+        not use key-value caching and is suitable for non-streaming inference.
+
+        Args:
+            chunk_length (int, optional): Determines whether to use full attention (-1) or chunk-based
+                attention (>0) during embedding computation.
+
+        Returns:
+            List[List[torch.Tensor]]: audio embeddings
+        """
+        # (bs, 80, frames) or [], multi audios need filled in advance
+        wavforms = flatten_nested_list([item.feature for item in items if item.feature])
+        # list, [[x1, x2], [y1], [z1]]
+        audio_feature_lens_raw = flatten_nested_list(
+            [item.audio_feature_lens for item in items if item.audio_feature_lens]
+        )
+
+        final_audio_embeds = []
+
+        assert isinstance(wavforms, list)
+        assert isinstance(wavforms[0], torch.Tensor)
+        # exist audio
+        for wavform in wavforms:
+            if len(wavform) > 0:
+                audio_feature_lens = torch.hstack(audio_feature_lens_raw)
+                batch_size, _, max_mel_seq_len = wavform.shape
+                max_seq_len = (max_mel_seq_len - 1) // 2 + 1
+
+                # Create a sequence tensor of shape (batch_size, max_seq_len)
+                seq_range = (
+                    torch.arange(
+                        0,
+                        max_seq_len,
+                        dtype=audio_feature_lens.dtype,
+                        device=audio_feature_lens.device,
+                    )
+                    .unsqueeze(0)
+                    .expand(batch_size, max_seq_len)
+                )
+                lengths_expand = audio_feature_lens.unsqueeze(1).expand(
+                    batch_size, max_seq_len
+                )
+                # Create mask
+                padding_mask = seq_range >= lengths_expand  # 1 for padded values
+
+                audio_attention_mask_ = padding_mask.view(
+                    batch_size, 1, 1, max_seq_len
+                ).expand(batch_size, 1, max_seq_len, max_seq_len)
+                audio_attention_mask = audio_attention_mask_.to(
+                    dtype=self.apm.conv1.weight.dtype,
+                    device=self.apm.conv1.weight.device,
+                )
+
+                if chunk_length > 0:
+                    chunk_num_frame = int(chunk_length * 50)
+                    chunk_mask = self.subsequent_chunk_mask(
+                        size=max_seq_len,
+                        chunk_size=chunk_num_frame,
+                        num_left_chunks=-1,
+                        device=audio_attention_mask_.device,
+                    )
+                    audio_attention_mask_ = torch.logical_or(
+                        audio_attention_mask_, torch.logical_not(chunk_mask)
+                    )
+
+                audio_attention_mask[audio_attention_mask_] = float("-inf")
+                audio_states = self.apm(
+                    wavform,
+                    output_hidden_states=True,
+                    attention_mask=audio_attention_mask,
+                ).hidden_states[self.audio_encoder_layer]
+                audio_embeds = self.audio_projection_layer(audio_states)
+
+                audio_embeds = audio_embeds.transpose(1, 2)
+                audio_embeds = self.audio_avg_pooler(audio_embeds)
+                audio_embeds = audio_embeds.transpose(1, 2)
+
+                _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(
+                    audio_feature_lens
+                )
+
+                num_audio_tokens = feature_lens_after_pooling
+
+                idx = 0
+                for i in range(len(audio_feature_lens_raw)):
+                    target_audio_embeds = []
+                    for _ in range(len(audio_feature_lens_raw[i])):
+                        target_audio_embeds.append(
+                            audio_embeds[idx, : num_audio_tokens[idx], :]
+                        )
+                        idx += 1
+                    final_audio_embeds.append(target_audio_embeds)
+            return final_audio_embeds
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        embedding = self.get_omni_embedding(
+            items=items,
+            chunk_length=self.config.audio_chunk_length,
+            stream_input=False,
+        )
+        return embedding
+
+    def get_omni_embedding(
+        self,
+        items: List[MultimodalDataItem],
+        chunk_length=-1,
+        stream_input=False,
+    ):
+        """
+        Args:
+            chunk_length: whisper use full attention or chunk attention
+            stream_input: use streaming audio embedding
+        Returns:
+            final embeddings with audio feature
+        """
+
+        if stream_input:
+            audio_embeddings = self.get_audio_embedding_streaming(items)
+        else:
+            audio_embeddings = self.get_audio_embedding(items, chunk_length)
+        bs = len(audio_embeddings)
+        # batch size
+        audio_embs = torch.cat(flatten_nested_list(audio_embeddings), dim=0)
+
+        return audio_embs
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            multimodal_model=self,
+            positions=positions,
+        )
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+
+            if "rotary_emb.inv_freq~" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+
+            # For weight_norm parametrization, handle both old and new formats
+            if self.config.init_tts and "tts" in name:
+                # Handle loading from older checkpoints with weight_g/weight_v format
+                if ".weight_g" in name or ".weight_v" in name:
+                    name = name.replace(
+                        ".weight_g", ".parametrizations.weight.original0"
+                    )
+                    name = name.replace(
+                        ".weight_v", ".parametrizations.weight.original1"
+                    )
+                elif ".weight" in name and name not in params_dict:
+                    param_name = name.replace(
+                        ".weight", ".parametrizations.weight.original0"
+                    )
+                    if param_name in params_dict:
+                        name = param_name
+
+            # adapt to VisionAttention
+            if "vpm" in name:
+                name = name.replace(r"self_attn.out_proj", r"self_attn.proj")
+
+            if not self.config.init_tts and "tts" in name:
+                continue
+            if not self.config.init_audio and ("apm" in name or "audio" in name):
+                continue
+            if not self.config.init_vision and "vpm" in name:
+                continue
+
+            if (
+                "sampler" in name
+                or "apm" in name
+                or ("tts" in name and "self_attn" in name)
+                or ("tts.model.layers" in name and ".mlp" in name)
+            ):
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # replace the name and load with customized loader
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [MiniCPMO]
diff --git a/python/sglang/srt/models/minicpmv.py b/python/sglang/srt/models/minicpmv.py
new file mode 100644
index 000000000..e621676fc
--- /dev/null
+++ b/python/sglang/srt/models/minicpmv.py
@@ -0,0 +1,1046 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The SGLang team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniCPM-V model compatible with HuggingFace weights."""
+
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    TypedDict,
+    Union,
+)
+
+import numpy as np
+import torch
+import torch.types
+from PIL import Image
+from torch import nn
+from torch.nn.init import trunc_normal_
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.utils import set_default_torch_dtype
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.llama import LlamaConfig, LlamaForCausalLM
+from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
+from sglang.srt.utils import add_prefix, flatten_nested_list
+
+RawImageType = Union[Image.Image, torch.Tensor]
+
+
+# sin/cos positional embedding helpers are adapted from:
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_1d_sincos_pos_embed_from_grid(
+    embed_dim: int, pos: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,) / (H, W)
+    out: (M, D) / (H, W, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    if version == (2, 0):
+        pos = pos.reshape(-1)  # (M,)
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+        emb_sin = np.sin(out)  # (M, D/2)
+        emb_cos = np.cos(out)  # (M, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    else:
+        out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+        emb_sin = np.sin(out)  # (H, W, D/2)
+        emb_cos = np.cos(out)  # (H, W, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed_from_grid(
+    embed_dim: int, grid: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[0], version
+    )  # (H*W, D/2) or (H, W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[1], version
+    )  # (H*W, D/2) or (H, W, D/2)
+
+    if version == (2, 0):
+        emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    else:
+        emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed(
+    embed_dim: int,
+    grid_size: Union[int, Tuple[int, int]],
+    cls_token: bool = False,
+    version: Tuple[int, int] = (2, 0),
+) -> torch.Tensor:
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or
+                [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size)
+
+    if version == (2, 0):
+        grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+        if cls_token:
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    else:
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+    return pos_embed
+
+
+class MiniCPMVImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: List[torch.Tensor]
+    """
+    Shape: `(batch_size * num_images, num_channels, height, width)`
+
+    Note that the image size may vary, so we pass it as a list
+    instead of a batched tensor.
+    """
+
+    image_bounds: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+
+    This should be in `(start, stop)` format.
+    """
+
+    tgt_sizes: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+
+    This should be in `(height, width)` format.
+    """
+
+
+class MiniCPMVImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    instead of a batched tensor.
+    """
+
+    image_bounds: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+
+    This should be in `(start, stop)` format.
+    """
+
+
+MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, MiniCPMVImageEmbeddingInputs]
+
+DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
+
+
+class BaseResampler(nn.Module):
+    """
+    A 2D perceiver-resampler network with one cross attention layers by
+        (grid_size**2) learnable queries and 2d sincos pos_emb.
+    Outputs:
+        A tensor with the shape of (grid_size**2, embed_dim)
+    """
+
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: Optional[int] = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        do_post_projection: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.num_queries = num_queries
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+
+        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
+        trunc_normal_(self.query, std=0.02)
+        if kv_dim is not None and kv_dim != embed_dim:
+            self.kv_proj = ReplicatedLinear(
+                kv_dim,
+                embed_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("kv_proj", prefix),
+            )
+        else:
+            # Maintain the same return value with ReplicatedLinear.forward
+            self.kv_proj = lambda *args, **kwargs: (  # type: ignore # noqa
+                nn.Identity()(*args, **kwargs),
+                None,
+            )
+        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
+        self.ln_q = norm_layer(embed_dim)
+        self.ln_kv = norm_layer(embed_dim)
+        self.do_post_projection = do_post_projection
+        self.ln_post = norm_layer(embed_dim) if do_post_projection else None
+        self.proj = (
+            nn.Parameter((embed_dim**-0.5) * torch.randn(embed_dim, embed_dim))
+            if do_post_projection
+            else None
+        )
+
+    def _init_weights(self, m: nn.Module) -> None:
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def _repeat(self, query, N: int):
+        return query.unsqueeze(1).repeat(1, N, 1)
+
+
+class Resampler2_5(BaseResampler):
+
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: Optional[int] = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        max_size: Tuple[int, int] = (70, 70),
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            num_queries,
+            embed_dim,
+            num_heads,
+            kv_dim,
+            norm_layer,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+
+        self.max_size = max_size
+        self._set_2d_pos_cache(self.max_size)
+
+        self.apply(self._init_weights)
+
+    def _set_2d_pos_cache(
+        self, max_size: Tuple[int, int], device: torch.types.Device = "cpu"
+    ) -> None:
+        pos_embed_arr = get_2d_sincos_pos_embed(
+            self.embed_dim, max_size, version=(2, 5)
+        )
+        pos_embed = torch.from_numpy(pos_embed_arr).float().to(device)
+        self.register_buffer("pos_embed", pos_embed, persistent=False)
+
+    def _adjust_pos_cache(
+        self, tgt_sizes: torch.Tensor, device: torch.types.Device
+    ) -> None:
+        max_h = tgt_sizes[:, 0].max().item()
+        max_w = tgt_sizes[:, 1].max().item()
+        assert isinstance(max_h, int) and isinstance(max_w, int)
+
+        if max_h > self.max_size[0] or max_w > self.max_size[1]:
+            self.max_size = (
+                max(max_h, self.max_size[0]),
+                max(max_w, self.max_size[1]),
+            )
+            self._set_2d_pos_cache(self.max_size, device)
+
+    def forward(self, x: torch.Tensor, tgt_sizes: torch.Tensor) -> torch.Tensor:
+        assert x.shape[0] == tgt_sizes.shape[0]
+        bs = x.shape[0]
+
+        device = x.device
+        dtype = x.dtype
+
+        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+
+        self._adjust_pos_cache(tgt_sizes, device=device)
+
+        max_patch_len = patch_len.max().item()
+        assert isinstance(max_patch_len, int)
+
+        key_padding_mask = torch.zeros(
+            (bs, max_patch_len), dtype=torch.bool, device=device
+        )
+
+        pos_embed = []
+        for i in range(bs):
+            tgt_h, tgt_w = tgt_sizes[i].tolist()
+            pos_embed.append(
+                self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)
+            )  # patches * D
+            key_padding_mask[i, patch_len[i] :] = True
+        pos_embed = torch.nn.utils.rnn.pad_sequence(
+            pos_embed, batch_first=True, padding_value=0.0
+        ).permute(
+            1, 0, 2
+        )  # BLD => L * B * D
+        x, _ = self.kv_proj(x)  # B * L * D
+        x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
+
+        q = self.ln_q(self.query)  # Q * D
+
+        out = self.attn(
+            self._repeat(q, bs),  # Q * B * D
+            x + pos_embed,  # L * B * D +  L * B * D
+            x,
+            key_padding_mask=key_padding_mask,
+        )[0]
+        #  out: Q * B * D
+        x = out.permute(1, 0, 2)  # B * Q * D
+
+        x = self.ln_post(x)
+        x = x @ self.proj
+        return x
+
+
+def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
+    version_float = getattr(config, "version", None)
+
+    # The old configs do not include version number
+    # TODO: Remove this after the HF repos are updated
+    if version_float is None:
+        if config.hidden_size == 2304 and config.query_num == 64:
+            return 2, 0
+        return 2, 5
+
+    version_str = str(version_float)
+    return tuple(int(x) for x in version_str.split("."))
+
+
+class MiniCPMBaseModel(nn.Module):
+    """
+    The abstract class of MiniCPMV can only be inherited, but cannot be
+    instantiated.
+    """
+
+    def __init__(
+        self,
+        *,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # All MiniCPM-V models disable `tie_word_embeddings` but
+        # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
+        # check `tie_word_embeddings` until SGLang integrate MiniCPM-V model
+        # and config class
+        self.config = config
+
+        self.version = get_version_by_config(self.config)
+        self.llm = self.init_llm(
+            config=config, quant_config=quant_config, prefix=add_prefix("llm", prefix)
+        )
+        self.vpm = self.init_vision_module(
+            config, quant_config, add_prefix("vpm", prefix)
+        )
+        self.vision_dim = (
+            self.vpm.embed_dim
+            if self.version == (2, 0)
+            else self.vpm.embeddings.embed_dim
+        )
+        self.embed_dim = self.config.hidden_size
+
+        self.resampler = self.init_resampler(
+            self.embed_dim,
+            self.vision_dim,
+            quant_config=quant_config,
+            prefix=add_prefix("resampler", prefix),
+        )
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def _get_image_bounds(
+        self,
+        input_ids: torch.Tensor,
+        pad_values: List[int],
+        im_start_id: int,
+        im_end_id: int,
+        slice_start_id: Optional[int] = None,
+        slice_end_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Returns a tensor indicating the bounds (start and end token ids) of the images
+        """
+        # All the images in the batch should share the same special image
+        # bound token ids.
+        start_cond = input_ids == im_start_id
+        end_cond = input_ids == im_end_id
+        if slice_start_id is not None:
+            start_cond |= input_ids == slice_start_id
+            end_cond |= input_ids == slice_end_id
+
+        (image_start_tokens,) = torch.where(start_cond)
+        image_start_tokens += 1
+        (image_end_tokens,) = torch.where(end_cond)
+
+        # the im_start_id sometimes can be cached as prefix, but it is needed for the embedding of the images
+        if len(image_start_tokens) != len(image_end_tokens):
+            if (
+                len(image_start_tokens) + 1 == len(image_end_tokens)
+                and input_ids[0] in pad_values
+                and len(image_start_tokens) != 0
+                and len(image_end_tokens) != 0
+                and image_end_tokens[0] < image_start_tokens[0]
+            ):
+                image_start_tokens = torch.cat(
+                    [
+                        torch.tensor([0], device=image_start_tokens.device),
+                        image_start_tokens,
+                    ]
+                )
+        valid_image_nums = min(len(image_start_tokens), len(image_end_tokens))
+
+        if valid_image_nums == 0:
+            return torch.zeros((0, 2), device=input_ids.device)
+
+        # Filter out pairs where start_token >= end_token
+        valid_pairs = []
+        for i in range(valid_image_nums):
+            start_token = image_start_tokens[i]
+            end_token = image_end_tokens[i]
+            if start_token < end_token:
+                valid_pairs.append((start_token, end_token))
+
+        if not valid_pairs:
+            return torch.zeros((0, 2), device=input_ids.device)
+
+        # Convert valid pairs to tensor
+        valid_pairs_tensor = torch.tensor(valid_pairs, device=input_ids.device)
+        return valid_pairs_tensor
+
+    def _parse_and_validate_inputs(
+        self,
+        input_ids: torch.Tensor,
+        **kwargs: object,
+    ) -> Optional[MiniCPMVImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", [])
+        tgt_sizes = kwargs.pop("tgt_sizes", [])
+        im_start_id = kwargs.pop("im_start_id", None)
+        im_end_id = kwargs.pop("im_end_id", None)
+        slice_start_id = kwargs.pop("slice_start_id", None)
+        slice_end_id = kwargs.pop("slice_end_id", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        pad_values = kwargs.pop("pad_values", None)
+
+        if image_embeds is not None:
+            image_bounds = self._get_image_bounds(
+                input_ids=input_ids,
+                pad_values=pad_values,
+                im_start_id=im_start_id,
+                im_end_id=im_end_id,
+                slice_start_id=slice_start_id,
+                slice_end_id=slice_end_id,
+            )
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError(
+                    f"Incorrect type of image embeds. "
+                    f"Got type: {type(image_embeds)}"
+                )
+
+            if isinstance(image_embeds, list):
+                image_embeds = torch.cat(image_embeds)
+
+            return MiniCPMVImageEmbeddingInputs(
+                image_bounds=image_bounds,
+                data=image_embeds,
+                type="image_embeds",
+            )
+
+        image_bounds = self._get_image_bounds(
+            input_ids=input_ids,
+            pad_values=pad_values,
+            im_start_id=im_start_id,
+            im_end_id=im_end_id,
+            slice_start_id=slice_start_id,
+            slice_end_id=slice_end_id,
+        )
+        return MiniCPMVImagePixelInputs(
+            image_bounds=image_bounds.to(device=input_ids.device),
+            data=pixel_values,
+            tgt_sizes=tgt_sizes,
+            type="pixel_values",
+        )
+
+    def get_embedding(
+        self,
+        input_ids: torch.Tensor,
+        image_inputs: Optional[MiniCPMVImageInputs],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids)
+
+        if image_inputs is None:  # No image
+            vision_hidden_states = torch.tensor([], device=input_ids.device)
+        else:
+            if image_inputs["type"] == "image_embeds":
+                vision_hidden_states = (
+                    image_inputs["data"]
+                    .type(vlm_embedding.dtype)
+                    .to(vlm_embedding.device)
+                )
+            else:
+                vision_hidden_states = self.get_vision_hidden_states(image_inputs)
+            # See NOTE in _parse_and_validate_inputs
+            image_bounds = image_inputs["image_bounds"]
+            if len(image_bounds) > 0:
+                image_indices = torch.stack(
+                    [
+                        torch.arange(start, end, dtype=torch.long)
+                        for start, end in image_bounds.tolist()
+                    ]
+                ).to(vlm_embedding.device)
+
+                vlm_embedding.scatter_(
+                    0,
+                    image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]),
+                    vision_hidden_states.view(-1, vision_hidden_states.shape[-1]),
+                )
+
+        return vlm_embedding, vision_hidden_states
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.llm.get_input_embeddings()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.llm,
+            positions=positions,
+        )
+        return hidden_states
+
+    def init_llm(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        raise NotImplementedError
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        raise NotImplementedError
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        raise NotImplementedError
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        raise NotImplementedError
+
+
+class MiniCPMV2_6(MiniCPMBaseModel):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        assert self.version == (2, 6)
+
+    def init_llm(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return Qwen2ForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        vision_embedding = self.vpm(
+            pixel_values,
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return vision_embedding
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+        slice_start_id: int = image_inputs.slice_start_id
+        slice_end_id: int = image_inputs.slice_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+
+class MiniCPMV4_0(MiniCPMBaseModel):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        assert self.version == (4, 0)
+
+    def init_llm(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return LlamaForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        vision_embedding = self.vpm(
+            pixel_values,
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return vision_embedding
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+        slice_start_id: int = image_inputs.slice_start_id
+        slice_end_id: int = image_inputs.slice_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+
+_SUPPORT_VERSION = {
+    (2, 6): MiniCPMV2_6,
+    (4, 0): MiniCPMV4_0,
+}
+
+
+class MiniCPMV:
+    """
+    Different versions of MiniCPMV use different visual encoders and LLMs,
+    which is not conducive to the current integration logic of LoRA and
+    bitsandbytes in SGLang. Therefore, it is necessary to separate them.
+    """
+
+    # Ensure that the LoRA support check passes when the class is not
+    # initialized, but set all these attributes to empty.
+    packed_modules_mapping = {}
+    supported_lora_modules = []
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    minicpmv: nn.Module
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        if not hasattr(config, "version"):
+            version = (2, 6)
+        else:
+            version = str(config.version).split(".")
+            version = tuple([int(x) for x in version])
+        # Dispatch class based on version
+        instance_class = _SUPPORT_VERSION.get(version)
+        if instance_class is None:
+            raise ValueError("Currently, MiniCPMV only supports versions 2.6 and 4.0")
+
+        try:
+            minicpmv = instance_class(
+                config=config, quant_config=quant_config, prefix=prefix
+            )
+            self.minicpmv = minicpmv
+        except Exception as e:
+            print(f"Failed to instantiate MiniCPMV: {e}")
+            raise e
+        self.config = config
+
+    def __getattr__(self, name):
+        if name == "minicpmv":
+            return None
+        return getattr(self.minicpmv, name)
+
+    def __call__(self, *args, **kwargs):
+        return self.minicpmv(*args, **kwargs)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.minicpmv.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq~" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            # adapt to VisionAttention
+            name = name.replace(r"self_attn.out_proj", r"self_attn.proj")
+
+            if "sampler" in name:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # replace the name and load with customized loader
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = MiniCPMV
diff --git a/python/sglang/srt/models/mistral.py b/python/sglang/srt/models/mistral.py
new file mode 100644
index 000000000..632e857c2
--- /dev/null
+++ b/python/sglang/srt/models/mistral.py
@@ -0,0 +1,93 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Mistral model."""
+
+from typing import List
+
+import torch
+from transformers.models.mistral3.modeling_mistral3 import Mistral3MultiModalProjector
+
+from sglang.srt.managers.schedule_batch import MultimodalDataItem
+from sglang.srt.models.llama import LlamaForCausalLM
+
+
+class MistralForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class Mistral3ForConditionalGeneration:
+    MULTIMODAL_PROJECTOR_TYPE = Mistral3MultiModalProjector
+
+    def __init__(self, **kwargs):
+        # lazy load inner class
+        # to bypass circular import
+        from sglang.srt.models.llava import LlavaForConditionalGeneration
+
+        # override config: mistral's projector adds patchmerger that doesn't require padding
+        kwargs["config"].vision_config.pad_image_border = False
+
+        self.inner = LlavaForConditionalGeneration(**kwargs)
+        self.inner.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(
+            kwargs["config"]
+        )
+        self.inner.get_image_feature = self.get_image_feature
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Extract features from image inputs.
+
+        Args:
+            items: List of MultimodalDataItem objects containing image data
+                Note that an item can be either "image" or "multi-images"
+
+        Returns:
+            torch.Tensor: features from image inputs, concatenated
+        """
+        features = []
+        for item in items:
+            # in each item, we assume pixel_values is always batched
+            pixel_values, image_sizes = item.feature, item.image_sizes
+            image_outputs = self.vision_tower(
+                pixel_values, image_sizes, output_hidden_states=True
+            )
+            selected_image_feature = image_outputs.hidden_states[
+                self.vision_feature_layer
+            ]
+
+            if self.vision_feature_select_strategy in ["default", "patch"]:
+                selected_image_feature = selected_image_feature[:, 1:]
+            elif self.vision_feature_select_strategy == "full":
+                selected_image_feature = selected_image_feature
+            else:
+                raise ValueError(
+                    f"Unexpected select feature: {self.vision_feature_select_strategy}"
+                )
+            features.append(
+                self.multi_modal_projector(
+                    selected_image_feature.squeeze(0), image_sizes
+                )
+            )
+        ret = torch.cat(features, dim=0)
+        return ret
+
+    def __getattr__(self, name):
+        return getattr(self.inner, name)
+
+    def __hasattr__(self, name):
+        return hasattr(self.inner, name)
+
+    def __call__(self, *args, **kwargs):
+        return self.inner(*args, **kwargs)
+
+
+EntryClass = [MistralForCausalLM, Mistral3ForConditionalGeneration]
diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py
new file mode 100644
index 000000000..c5f04a4fc
--- /dev/null
+++ b/python/sglang/srt/models/mixtral.py
@@ -0,0 +1,480 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1
+"""Inference-only Mixtral model."""
+
+import logging
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import MixtralConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+class MixtralMoE(nn.Module):
+    """A tensor-parallel MoE implementation for Mixtral that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.hidden_size = hidden_size
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=True,
+        )
+
+        MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
+        self.experts = MoEImpl(
+            num_experts=num_experts,
+            top_k=top_k,
+            layer_id=layer_id,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(orig_shape)
+
+
+class MixtralAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MixtralDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Requires transformers > 4.32.0
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = MixtralAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.block_sparse_moe = MixtralMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("block_sparse_moe", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.block_sparse_moe(hidden_states)
+        return hidden_states, residual
+
+
+class MixtralModel(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: MixtralDecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="layers",
+            return_tuple=True,
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class MixtralForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MixtralModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    # Skip loading kv_scale from ckpts towards new design.
+                    if name.endswith(".kv_scale") and name not in params_dict:
+                        continue
+                    if name is None:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+
+EntryClass = MixtralForCausalLM
diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py
new file mode 100644
index 000000000..5b84c90dd
--- /dev/null
+++ b/python/sglang/srt/models/mixtral_quant.py
@@ -0,0 +1,430 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral_quant.py#L1
+"""Inference-only Mixtral model."""
+
+from typing import Iterable, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import MixtralConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class MixtralMLP(nn.Module):
+    def __init__(
+        self,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.num_experts = num_experts
+        self.ffn_dim = intermediate_size
+        self.hidden_dim = hidden_size
+
+        self.w1 = ReplicatedLinear(
+            self.hidden_dim,
+            self.ffn_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w1", prefix),
+        )
+        self.w2 = ReplicatedLinear(
+            self.ffn_dim,
+            self.hidden_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w2", prefix),
+        )
+        self.w3 = ReplicatedLinear(
+            self.hidden_dim,
+            self.ffn_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w3", prefix),
+        )
+
+        # TODO: Use vllm's SiluAndMul
+        self.act_fn = nn.SiLU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        w1_out, _ = self.w1(hidden_states)
+        w1_out = self.act_fn(w1_out)
+        w3_out, _ = self.w3(hidden_states)
+        current_hidden_states = w1_out * w3_out
+        current_hidden_states, _ = self.w2(current_hidden_states)
+        return current_hidden_states
+
+
+class MixtralMoE(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_total_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+        if self.tp_size > self.num_total_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.num_total_experts}."
+            )
+        # Split experts equally between ranks
+        self.expert_indicies = np.array_split(
+            range(self.num_total_experts), self.tp_size
+        )[self.rank].tolist()
+        if not self.expert_indicies:
+            raise ValueError(f"Rank {self.rank} has no experts assigned to it.")
+
+        self.experts = nn.ModuleList(
+            [
+                (
+                    MixtralMLP(
+                        self.num_total_experts,
+                        config.hidden_size,
+                        config.intermediate_size,
+                        quant_config=quant_config,
+                        prefix=add_prefix(f"experts.{idx}", prefix),
+                    )
+                    if idx in self.expert_indicies
+                    else None
+                )
+                for idx in range(self.num_total_experts)
+            ]
+        )
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            self.num_total_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        router_logits, _ = self.gate(hidden_states)
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(
+            routing_weights, self.top_k, dim=-1
+        )
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+
+        final_hidden_states = None
+        for expert_idx in self.expert_indicies:
+            expert_layer = self.experts[expert_idx]
+            expert_mask = selected_experts == expert_idx
+            expert_weights = (routing_weights * expert_mask).sum(dim=-1, keepdim=True)
+
+            current_hidden_states = expert_layer(hidden_states).mul_(expert_weights)
+            if final_hidden_states is None:
+                final_hidden_states = current_hidden_states
+            else:
+                final_hidden_states.add_(current_hidden_states)
+
+        return tensor_model_parallel_all_reduce(final_hidden_states)
+
+
+class MixtralAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MixtralDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Requires transformers > 4.32.0
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = MixtralAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.block_sparse_moe = MixtralMoE(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("block_sparse_moe", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.block_sparse_moe(hidden_states)
+        return hidden_states, residual
+
+
+class MixtralModel(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                MixtralDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class QuantMixtralForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MixtralModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if "block_sparse_moe.experts." in name and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = QuantMixtralForCausalLM
diff --git a/python/sglang/srt/models/mllama.py b/python/sglang/srt/models/mllama.py
new file mode 100644
index 000000000..fa294ddcd
--- /dev/null
+++ b/python/sglang/srt/models/mllama.py
@@ -0,0 +1,1062 @@
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/7193774b1ff8603ad5bf4598e5efba0d9a39b436/vllm/model_executor/models/mllama.py
+"""PyTorch Mllama model."""
+import math
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers.models.mllama.configuration_mllama as config_mllama
+from torch import nn
+from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
+from transformers.models.mllama.modeling_mllama import (
+    _prepare_aspect_ratio_attention_mask,
+)
+
+import sglang.srt.distributed.parallel_state as ps
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaMLP
+from sglang.srt.utils import add_prefix
+
+
+class ColumnParallelConv2dPatch(torch.nn.Module):
+    """Conv2D Patching layer with model parallelism.
+    Column parallel over unfolded input.
+    Arguments:
+        in_channels: Input channels.
+        out_channels: Output channels.
+        kernel_size: Size of convolution kernel.
+        stride (default 1): Stride for convolution.
+        bias (default False): Use bias in Conv2d.
+    Input: (bsz, in_channels, width, height)
+    Output: (bsz, num_tokens, out_channels)
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Union[int, Tuple[int, int]],
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size)
+        self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride)
+        self._linear = ColumnParallelLinear(
+            in_channels * kernel_size[0] * kernel_size[1],
+            out_channels,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self._unfold(x)
+        x = x.permute(0, 2, 1)
+        x, _ = self._linear(x)
+        return x
+
+
+class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
+
+    def __init__(self, config: config_mllama.MllamaVisionConfig, is_gated: bool = True):
+        super().__init__()
+        self.max_num_tiles = config.max_num_tiles
+        self.hidden_size = config.hidden_size
+        self.max_aspect_ratio_id = config.max_aspect_ratio_id
+        self.is_gated = is_gated
+
+        self.embedding = nn.Embedding(
+            self.max_aspect_ratio_id + 1, self.max_num_tiles * self.hidden_size
+        )
+        if is_gated:
+            self.gate = nn.Parameter(torch.zeros(1))
+
+    def forward(
+        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
+    ) -> torch.Tensor:
+        embeddings = self.embedding(aspect_ratio_ids)
+        embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size)
+
+        if self.is_gated:
+            embeddings = embeddings * self.gate.tanh()
+
+        hidden_state = hidden_state + embeddings
+        return hidden_state
+
+
+class MllamaPrecomputedPositionEmbedding(nn.Module):
+    def __init__(self, config: config_mllama.MllamaVisionConfig):
+        super().__init__()
+        self.max_num_tiles = config.max_num_tiles
+        self.max_aspect_ratio_id = config.max_aspect_ratio_id
+        self.num_patches = (config.image_size // config.patch_size) ** 2 + 1
+        self.hidden_size = config.hidden_size
+        self.scale = config.hidden_size**-0.5
+
+        self.gate = nn.Parameter(torch.zeros(1))
+
+        # position embedding
+        position_embedding = torch.randn(self.num_patches, self.hidden_size)
+        self.embedding = nn.Parameter(self.scale * position_embedding)
+
+        # tile position embedding
+        self.tile_embedding = nn.Embedding(
+            self.max_aspect_ratio_id + 1,
+            self.max_num_tiles * self.num_patches * self.hidden_size,
+        )
+
+    def forward(
+        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
+    ) -> torch.Tensor:
+        # position embeddings
+        gated_position_embedding = (1 - self.gate.tanh()) * self.embedding
+        hidden_state = hidden_state + gated_position_embedding.view(
+            1, 1, self.num_patches, self.hidden_size
+        )
+
+        # precomputed tile position embeddings
+        tile_position_embedding = self.tile_embedding(aspect_ratio_ids)
+        batch_size = hidden_state.shape[0]
+        tile_position_embedding = tile_position_embedding.reshape(
+            batch_size, self.max_num_tiles, self.num_patches, self.hidden_size
+        )
+        gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding
+        hidden_state = hidden_state + gated_tile_position_embedding
+
+        return hidden_state
+
+
+class MllamaVisionMLP(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+
+        return hidden_states
+
+
+class MllamaVisionEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: config_mllama.MllamaVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        is_gated: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.hidden_size = config.hidden_size
+        self.num_attention_heads = config.attention_heads
+        self.is_gated = is_gated
+        self.intermediate_size = config.intermediate_size
+
+        self.self_attn = VisionAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            dropout=0.0,
+            qkv_backend="sdpa",
+            softmax_in_single_precision=False,
+            flatten_batch=False,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = MllamaVisionMLP(
+            config, quant_config, prefix=add_prefix("mlp", prefix)
+        )
+
+        self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(
+            self.hidden_size, eps=config.norm_eps
+        )
+
+        # there used to be an if else here, no code path
+        if is_gated:
+            self.gate_attn = nn.Parameter(torch.ones(1) * math.pi / 4)
+            self.gate_ffn = nn.Parameter(torch.ones(1) * math.pi / 4)
+
+    def forward(
+        self,
+        hidden_state: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # Self Attention
+        residual = hidden_state
+        hidden_state = self.input_layernorm(hidden_state)
+        hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask)
+        gate_attn = 1 if not self.is_gated else self.gate_attn.tanh()
+        hidden_state = residual + gate_attn * hidden_state
+
+        # Feed forward
+        residual = hidden_state
+        hidden_state = self.post_attention_layernorm(hidden_state)
+        hidden_state = self.mlp(hidden_state)
+        gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh()
+        hidden_state = residual + gate_ffn * hidden_state
+
+        return hidden_state
+
+
+class MllamaVisionEncoder(nn.Module):
+    def __init__(
+        self,
+        config: config_mllama.MllamaVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        num_layers=32,
+        is_gated=False,
+        output_hidden_states=None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                MllamaVisionEncoderLayer(
+                    config,
+                    quant_config,
+                    is_gated,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(num_layers)
+            ]
+        )
+        self.output_hidden_states = output_hidden_states or []
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        encoder_states = ()
+
+        for i, encoder_layer in enumerate(self.layers):
+            if i in self.output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            hidden_states = encoder_layer(
+                hidden_states,
+                attention_mask,
+            )
+
+        if len(self.layers) - 1 in self.output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        return hidden_states, encoder_states
+
+
+class MllamaVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: config_mllama.MllamaVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.max_num_tiles = config.max_num_tiles
+        self.hidden_size = config.hidden_size
+        self.in_channels = config.num_channels
+        self.intermediate_layers_indices = config.intermediate_layers_indices
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
+        self.scale = config.hidden_size**-0.5
+
+        self.patch_embedding = ColumnParallelConv2dPatch(
+            in_channels=config.num_channels,
+            out_channels=self.hidden_size,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
+        self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding(config)
+
+        self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
+            config, is_gated=True
+        )
+        self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
+            config, is_gated=True
+        )
+
+        # layer norms
+        self.layernorm_pre = nn.LayerNorm(self.hidden_size)
+        self.layernorm_post = nn.LayerNorm(self.hidden_size)
+
+        # encoders
+        self.transformer = MllamaVisionEncoder(
+            config,
+            quant_config,
+            config.num_hidden_layers,
+            is_gated=False,
+            output_hidden_states=config.intermediate_layers_indices,
+            prefix=add_prefix("transformer", prefix),
+        )
+        self.global_transformer = MllamaVisionEncoder(
+            config,
+            quant_config,
+            config.num_global_layers,
+            is_gated=True,
+            prefix=add_prefix("global_transformer", prefix),
+        )
+
+    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        batch_size, _, hidden_size = hidden_state.shape
+        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
+        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
+        return hidden_state
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        aspect_ratio_ids: torch.Tensor,
+        aspect_ratio_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        batch_size, num_concurrent_media, num_tiles, num_channels, height, width = (
+            pixel_values.shape
+        )
+
+        pixel_values = pixel_values.reshape(
+            batch_size * num_concurrent_media * num_tiles, num_channels, height, width
+        )
+        aspect_ratio_ids = aspect_ratio_ids.reshape(
+            batch_size * num_concurrent_media, -1
+        )
+
+        # patch embedding
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(self.layernorm_pre.weight.dtype)
+        )
+        hidden_state = patch_embeds
+        hidden_state = ps.get_tp_group().all_gather(hidden_state)
+
+        # tile embeddings
+        _, num_patches, dim = hidden_state.shape
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media, num_tiles, -1, dim
+        )
+        hidden_state = self.pre_tile_positional_embedding(
+            hidden_state, aspect_ratio_ids
+        )
+
+        # apply cls token
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media * num_tiles, num_patches, dim
+        )
+        hidden_state = self.apply_class_embedding(hidden_state)
+        num_patches += 1
+
+        # apply position embeddings
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media, num_tiles, num_patches, dim
+        )
+        hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids)
+
+        # apply encoder
+        hidden_state = self.layernorm_pre(hidden_state)
+
+        # Compute the number of tokens to pad
+        num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8
+        # Compute padding tuple for pad function
+        padding = (
+            0,
+            0,
+            0,
+            num_padding_patches,
+        )  # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2)
+        # Pad the tensor
+        hidden_state = F.pad(hidden_state, padding, mode="constant", value=0)
+        slice_index = -num_padding_patches if num_padding_patches > 0 else None
+
+        attention_mask = aspect_ratio_mask.reshape(
+            batch_size * num_concurrent_media, -1
+        )
+        attention_mask = _prepare_aspect_ratio_attention_mask(
+            aspect_ratio_mask=attention_mask,
+            num_patches=self.num_patches,
+            target_length=hidden_state.shape[2],
+            dtype=self.layernorm_pre.weight.dtype,
+        )
+
+        hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim)
+        output = self.transformer(
+            hidden_state,
+            attention_mask=attention_mask,
+        )
+        hidden_state, intermediate_hidden_states = output[0], output[1]
+        intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1)
+
+        # apply global encoder
+        hidden_state = self.layernorm_post(hidden_state)
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles,
+            num_patches + num_padding_patches,
+            dim,
+        )
+        hidden_state = self.post_tile_positional_embedding(
+            hidden_state, aspect_ratio_ids
+        )
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles * (num_patches + num_padding_patches),
+            dim,
+        )
+        hidden_state = self.global_transformer(
+            hidden_state, attention_mask=attention_mask
+        )[0]
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles,
+            num_patches + num_padding_patches,
+            dim,
+        )
+        hidden_state = hidden_state[:, :, :slice_index]
+
+        # adding intermediate layer outputs
+        hidden_state = hidden_state.reshape(
+            batch_size, num_concurrent_media, num_tiles, num_patches, dim
+        )
+        intermediate_hidden_states = intermediate_hidden_states.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles,
+            num_patches + num_padding_patches,
+            -1,
+        )
+        intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index]
+        intermediate_hidden_states = intermediate_hidden_states.reshape(
+            batch_size, num_concurrent_media, num_tiles, num_patches, -1
+        )
+        hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1)
+        return hidden_state
+
+
+class MllamaTextRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class MllamaTextCrossAttention(nn.Module):
+    def __init__(
+        self,
+        config: Optional[config_mllama.MllamaTextConfig] = None,
+        layer_id: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.model_parallel_size = get_tensor_model_parallel_world_size()
+        self.num_heads = self.config.num_attention_heads
+        self.num_local_heads = self.num_heads // self.model_parallel_size
+        self.num_key_value_heads = self.config.num_key_value_heads
+        self.num_local_key_value_heads = (
+            self.num_key_value_heads // self.model_parallel_size
+        )
+        self.dropout = config.dropout
+        self.hidden_size = config.hidden_size
+        self.head_dim = config.hidden_size // self.num_heads
+        self.layer_id = layer_id
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.q_local_size = self.num_local_heads * self.head_dim
+        self.kv_local_size = self.num_local_key_value_heads * self.head_dim
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_key_value_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        # vllm.model_executor.layers.layernorm.RMSNorm has precision issue,
+        # use huggingface's instead
+        self.q_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.scaling = self.head_dim**-0.5
+
+        self.attn = RadixAttention(
+            self.num_local_heads,
+            self.head_dim,
+            self.scaling,
+            self.num_local_key_value_heads,
+            layer_id=layer_id,
+            is_cross_attention=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        cross_attention_states: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv_dec, _ = self.qkv_proj(hidden_states)
+        q, _, _ = qkv_dec.split(
+            [self.q_local_size, self.kv_local_size, self.kv_local_size], dim=-1
+        )
+        if cross_attention_states is None:
+            k = None
+            v = None
+        else:
+            qkv_enc, _ = self.qkv_proj(cross_attention_states)
+            _, k, v = qkv_enc.split(
+                [self.q_local_size, self.kv_local_size, self.kv_local_size], dim=-1
+            )
+            k = k.view(-1, self.num_local_key_value_heads, self.head_dim)
+            v = v.view(-1, self.num_local_key_value_heads, self.head_dim)
+            k = self.k_norm(k)
+        q = q.view(-1, self.num_local_heads, self.head_dim)
+        q = self.q_norm(q)
+
+        output = self.attn(q, k, v, forward_batch)
+        out, _ = self.o_proj(output)
+        return out
+
+
+class MllamaCrossAttentionDecoderLayer(torch.nn.Module):
+    """Cross-attention transformer block with tanh-gated attention
+    and feedforward."""
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaTextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.cross_attn = MllamaTextCrossAttention(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("cross_attn", prefix),
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.cross_attn_attn_gate = torch.nn.Parameter(torch.zeros(1))
+
+        self.mlp = LlamaMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.cross_attn_mlp_gate = torch.nn.Parameter(torch.zeros(1))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: torch.Tensor,
+        cross_attention_mask: torch.Tensor,
+        full_text_row_masked_out_mask: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.cross_attn(
+            hidden_states=hidden_states,
+            attention_mask=cross_attention_mask,
+            cross_attention_states=cross_attention_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = full_text_row_masked_out_mask * hidden_states
+        hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = full_text_row_masked_out_mask * hidden_states
+        hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states
+        return hidden_states
+
+
+class MllamaTextModel(nn.Module):
+    config_class = config_mllama.MllamaTextConfig
+    base_model_prefix = "model"
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaTextConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.padding_id = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size + 8,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.cross_attention_layers = config.cross_attention_layers
+
+        layers = []
+        for layer_id in range(config.num_hidden_layers):
+            if layer_id in self.cross_attention_layers:
+                layers.append(
+                    MllamaCrossAttentionDecoderLayer(
+                        config,
+                        layer_id,
+                        quant_config=quant_config,
+                        prefix=add_prefix(f"layers.{layer_id}", prefix),
+                    )
+                )
+            else:
+                # TODO: force LlamaDecoderLayer to config.attention_bias=False
+                layers.append(
+                    LlamaDecoderLayer(
+                        config,
+                        quant_config=quant_config,
+                        layer_id=layer_id,
+                        prefix=add_prefix(f"layers.{layer_id}", prefix),
+                    )
+                )
+
+        self.layers = nn.ModuleList(layers)
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        cross_attention_states: Optional[torch.LongTensor],
+        cross_attention_mask: Optional[torch.LongTensor],
+        full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        forward_batch: ForwardBatch,
+        skip_cross_attention: bool,
+    ) -> torch.Tensor:
+        inputs_embeds = self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
+
+        for _, decoder_layer in enumerate(self.layers):
+            if isinstance(decoder_layer, MllamaCrossAttentionDecoderLayer):
+                if not skip_cross_attention:
+                    hidden_states = decoder_layer(
+                        hidden_states=hidden_states,
+                        cross_attention_states=cross_attention_states,
+                        cross_attention_mask=cross_attention_mask,
+                        full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+                        forward_batch=forward_batch,
+                    )
+            elif isinstance(decoder_layer, LlamaDecoderLayer):
+                hidden_states, residual = decoder_layer(
+                    positions=positions,
+                    hidden_states=hidden_states,
+                    forward_batch=forward_batch,
+                    residual=None,
+                )
+                hidden_states = hidden_states + residual
+            else:
+                raise ValueError(f"Unknown decoder layer type {type(decoder_layer)}")
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class MllamaForCausalLM(nn.Module):
+    config_class = config_mllama.MllamaTextConfig
+    base_model_prefix = "language_model"
+    _no_split_modules = [
+        "MllamaCrossAttentionDecoderLayer",
+        "MllamaSelfAttentionDecoderLayer",
+    ]
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaTextConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.model = MllamaTextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        cross_attention_states: Optional[torch.LongTensor],
+        cross_attention_mask: Optional[torch.LongTensor],
+        full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        forward_batch: ForwardBatch,
+        skip_cross_attention: bool,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            cross_attention_states=cross_attention_states,
+            cross_attention_mask=cross_attention_mask,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            forward_batch=forward_batch,
+            skip_cross_attention=skip_cross_attention,
+        )
+        return hidden_states
+
+
+class MllamaForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.quant_config = quant_config
+        self.vocab_size = config.text_config.vocab_size
+        self.hidden_size = config.text_config.hidden_size
+        self.max_num_tiles = config.vision_config.max_num_tiles
+        self.vision_output_dim = config.vision_config.vision_output_dim
+        self.pad_token_id = (
+            config.pad_token_id if config.pad_token_id is not None else -1
+        )
+        self.image_size = config.vision_config.image_size
+
+        self.vision_model = MllamaVisionModel(
+            config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_model", prefix),
+        )
+        self.language_model = MllamaForCausalLM(
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.multi_modal_projector = ReplicatedLinear(
+            config.vision_config.vision_output_dim,
+            config.text_config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix="multi_modal_projector",
+        )
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pixel_values = torch.cat([item.feature for item in mm_inputs.mm_items], dim=0)
+        pad_values = [item.pad_value for item in mm_inputs.mm_items]
+
+        num_concurrent_media, num_tiles = pixel_values.shape[1:3]
+        num_patches = self.vision_model.num_patches
+        image_len = num_concurrent_media * num_tiles * num_patches
+        mm_inputs.num_image_tokens = image_len
+
+        pad_ids = pad_values * ((image_len + len(pad_values)) // len(pad_values))
+
+        return pad_ids[:image_len] + input_ids
+
+    def _batch_image_inputs(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode() or all(forward_batch.encoder_cached):
+            return None, None, None, None
+
+        # pixel_values: shape (bs, num_image, num_tiles, 3, image_res, image_res)
+        max_num_images = max_num_tiles = bs = 0
+        for i, mm_input in enumerate(forward_batch.mm_inputs):
+
+            if not forward_batch.encoder_cached[i] and mm_input is not None:
+                pixel_values = torch.cat(
+                    [item.feature for item in mm_input.mm_items], dim=0
+                )
+                max_num_images = max(max_num_images, pixel_values.shape[1])
+
+                max_num_tiles = max(max_num_tiles, pixel_values.shape[2])
+                bs += 1
+
+        if max_num_images * max_num_tiles * bs == 0:
+            return None, None, None, None
+
+        with forward_batch.out_cache_loc.device:
+            batched_images = torch.zeros(
+                bs,
+                max_num_images,
+                max_num_tiles,
+                3,
+                self.image_size,
+                self.image_size,
+                dtype=torch.float32,
+            )
+            batched_ar_ids = torch.ones(
+                bs, max_num_images, dtype=torch.int64, device="cuda"
+            )
+            batched_ar_mask = torch.zeros(
+                bs, max_num_images, max_num_tiles, dtype=torch.int64
+            )
+            i = 0
+            encoder_lens_need = []
+
+            for k, mm_input in enumerate(forward_batch.mm_inputs):
+                if forward_batch.encoder_cached[k] or mm_input is None:
+                    continue
+
+                encoder_lens_need.append(forward_batch.encoder_lens[k])
+                pixel_values = torch.cat(
+                    [item.feature for item in mm_input.mm_items], dim=0
+                )
+                for j in range(pixel_values.shape[1]):
+                    img = pixel_values[0, j]
+                    num_tiles = img.shape[0]
+                    batched_images[i, j, :num_tiles] = img
+                    batched_ar_ids[i, j] = mm_input.mm_items[0].aspect_ratio_id[0, j]
+
+                    batched_ar_mask[i, j, :num_tiles] = mm_input.mm_items[
+                        0
+                    ].aspect_ratio_mask[0, j]
+                i += 1
+
+        return batched_images, batched_ar_ids, batched_ar_mask, encoder_lens_need
+
+    def flat_encoder_result(
+        self, cross_attention_states: torch.Tensor, encoder_lens_need: List[int]
+    ):
+        # NOTE: not all encoders need computation, some are cached
+        head_dim = cross_attention_states.shape[-1]
+        total_encoder_len = sum(encoder_lens_need)
+        cross_attention_states_flat = torch.zeros(
+            total_encoder_len,
+            head_dim,
+            device=cross_attention_states.device,
+            dtype=cross_attention_states.dtype,
+        )
+
+        i = start_pos = 0
+        for encoder_len in encoder_lens_need:
+            if encoder_len == 0:
+                continue
+            end_pos = start_pos + encoder_len
+            cross_attention_states_flat[start_pos:end_pos] = cross_attention_states[i][
+                :encoder_len
+            ]
+            i += 1
+            start_pos += encoder_len
+
+        return cross_attention_states_flat
+
+    def get_full_text_row_masked_out_mask(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode():
+            full_text_row_masked_out_mask = forward_batch.encoder_lens != 0
+        else:
+            full_text_row_masked_out_mask = torch.ones(
+                forward_batch.extend_seq_lens.sum(), dtype=torch.bool
+            )
+            start_pos = 0
+
+            for seq_len, encoder_len in zip(
+                forward_batch.seq_lens.tolist(), forward_batch.encoder_lens_cpu
+            ):
+                if encoder_len == 0:
+                    full_text_row_masked_out_mask[start_pos : start_pos + seq_len] = (
+                        False
+                    )
+                start_pos += encoder_len
+
+            full_text_row_masked_out_mask = full_text_row_masked_out_mask.to(
+                forward_batch.seq_lens.device
+            )
+
+        return full_text_row_masked_out_mask.reshape(-1, 1)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+
+        batched_images, batched_ar_ids, batched_ar_mask, encoder_lens_need = (
+            self._batch_image_inputs(forward_batch)
+        )
+
+        # TODO: support multi-image by this mask
+        cross_attention_mask = None
+        cross_attention_states = None
+
+        if get_is_capture_mode():
+            # NOTE: when doing cuda graph capture, we do not want to skip cross attention
+            # Make is a constant value to avoid cuda graph capture issue
+            skip_cross_attention = False
+        else:
+            # NOTE: we do not need image_inputs when prefill
+            assert len(forward_batch.encoder_lens) == len(forward_batch.seq_lens)
+            assert len(forward_batch.encoder_lens_cpu) == len(forward_batch.seq_lens)
+            skip_cross_attention = forward_batch.encoder_lens.max() == 0
+
+        if not skip_cross_attention:
+            full_text_row_masked_out_mask = self.get_full_text_row_masked_out_mask(
+                forward_batch
+            )
+        else:
+            full_text_row_masked_out_mask = None
+
+        if batched_images is not None:
+            # NOTE: llama's reference implementation runs vision model on CPU
+            cross_attention_states = self.vision_model(
+                batched_images, batched_ar_ids, batched_ar_mask
+            )
+            cross_attention_states, _ = self.multi_modal_projector(
+                cross_attention_states
+            )
+
+            bs, _, _, _, image_token_dim = cross_attention_states.shape
+            cross_attention_states = cross_attention_states.view(
+                bs, -1, image_token_dim
+            )
+
+            cross_attention_states = self.flat_encoder_result(
+                cross_attention_states, encoder_lens_need
+            )
+
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            cross_attention_states=cross_attention_states,
+            cross_attention_mask=cross_attention_mask,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            forward_batch=forward_batch,
+            skip_cross_attention=skip_cross_attention,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.language_model.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        updated_params = set()
+        for name, loaded_weight in weights:
+            if "patch_embedding.weight" in name:
+                name = name.replace(
+                    "patch_embedding.weight", "patch_embedding._linear.weight"
+                )
+                loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                updated_params.add(name)
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace("self_attn.o_proj", "self_attn.proj")
+                param = params_dict.pop(name)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = MllamaForConditionalGeneration
diff --git a/python/sglang/srt/models/mllama4.py b/python/sglang/srt/models/mllama4.py
new file mode 100644
index 000000000..f0184390c
--- /dev/null
+++ b/python/sglang/srt/models/mllama4.py
@@ -0,0 +1,990 @@
+import json as json_lib
+import logging
+import math
+import os
+from collections.abc import Iterable
+from typing import List, Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import Llama4Config, Llama4VisionConfig
+from transformers.models.llama4.modeling_llama4 import (
+    Llama4MultiModalProjector,
+    vision_apply_rotary_emb,
+)
+
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+    global_server_args_dict,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import is_cpu
+
+_is_cpu = is_cpu()
+
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Llama4VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        input_size: int,
+        intermediate_size: int,
+        output_size: int,
+        bias: bool,
+        output_activation: bool,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        cls_fc1 = ReplicatedLinear if use_data_parallel else ColumnParallelLinear
+        self.fc1 = cls_fc1(
+            input_size=input_size,
+            output_size=intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear
+        self.fc2 = cls_fc2(
+            input_size=intermediate_size,
+            output_size=output_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+        self.activation_fn = nn.GELU()
+        self.output_activation = output_activation
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        if self.output_activation:
+            return self.activation_fn(hidden_states)
+        return hidden_states
+
+
+def pixel_shuffle(input_tensor, shuffle_ratio):
+    # input_tensor: [batch_size, num_patches, channels]
+    batch_size, num_patches, channels = input_tensor.shape
+    patch_size = int(math.sqrt(num_patches))
+
+    input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
+    batch_size, height, width, channels = input_tensor.size()
+
+    reshaped_tensor = input_tensor.view(
+        batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio)
+    )
+    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
+
+    reshaped_tensor = reshaped_tensor.view(
+        batch_size,
+        int(height * shuffle_ratio),
+        int(width * shuffle_ratio),
+        int(channels / (shuffle_ratio**2)),
+    )
+    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
+
+    output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
+    return output_tensor
+
+
+class Llama4VisionPixelShuffleMLP(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
+        self.mlp = Llama4VisionMLP(
+            input_size=config.intermediate_size,
+            intermediate_size=config.projector_input_dim,
+            output_size=config.projector_output_dim,
+            bias=config.multi_modal_projector_bias,
+            output_activation=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+
+    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
+        encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
+        return self.mlp(encoded_patches)
+
+
+def apply_position_embedding(q, k, freqs_ci, shape):
+    # [batch_size_times_num_tiles, num_channels]
+    input_shape = shape[:2]
+    # [batch_size_times_num_tiles, num_channels, num_heads, head_dim]
+    hidden_shape = (*input_shape, *q.shape[-2:])
+    q = q.view(hidden_shape)
+    k = k.view(hidden_shape)
+    q, k = vision_apply_rotary_emb(q, k, freqs_ci)
+    return q, k
+
+
+class Llama4VisionEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.intermediate_size = config.intermediate_size
+
+        self.self_attn = VisionAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size,
+            use_qkv_parallel=True,
+            # vision_model is explicitly ignored in Maverick-17B-128E-Instruct-FP8
+            quant_config=None,
+            dropout=0.0,
+            qkv_backend="sdpa",
+            softmax_in_single_precision=False,
+            flatten_batch=False,
+            prefix=add_prefix("self_attn", prefix),
+            qkv_bias=True,
+            customized_position_embedding_applier=apply_position_embedding,
+        )
+        self.mlp = Llama4VisionMLP(
+            input_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=True,
+            output_activation=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+
+        self.input_layernorm = nn.LayerNorm(config.hidden_size)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)
+
+    def forward(
+        self,
+        hidden_state: torch.Tensor,
+        freqs_ci: torch.Tensor,
+    ):
+        # Self Attention
+        residual = hidden_state
+        hidden_state = self.input_layernorm(hidden_state)
+        hidden_state = self.self_attn(hidden_state, position_embeddings=freqs_ci)
+        hidden_state = residual + hidden_state
+
+        # Feed forward
+        residual = hidden_state
+        hidden_state = self.post_attention_layernorm(hidden_state)
+        hidden_state = self.mlp(hidden_state)
+        hidden_state = residual + hidden_state
+
+        outputs = hidden_state
+        return outputs
+
+
+class Llama4VisionEncoder(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                Llama4VisionEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        freqs_ci: torch.Tensor,  # TODO: move this to an attribute instead of keeping it around
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            hidden_states (`torch.FloatTensor` of shape
+                    `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to
+                directly pass an embedded representation. This is useful if you
+                want more control over how to convert `input_ids` indices into
+                associated vectors than the model's internal embedding
+                lookup matrix.
+        """
+
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(hidden_states, freqs_ci=freqs_ci)
+            hidden_states = layer_outputs
+
+        return hidden_states
+
+
+class Llama4UnfoldConvolution(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        kernel_size = config.patch_size
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size)
+        self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size)
+        params = {
+            "input_size": config.num_channels * kernel_size[0] * kernel_size[1],
+            "output_size": config.hidden_size,
+            "bias": False,
+            "quant_config": quant_config,
+            "prefix": f"{prefix}.linear",
+        }
+        if use_data_parallel:
+            cls = ReplicatedLinear
+        else:
+            cls = ColumnParallelLinear
+            params["gather_output"] = True
+        self.linear = cls(**params)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.unfold(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states, _ = self.linear(hidden_states)
+        return hidden_states
+
+
+class Llama4VisionRotaryEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        idx = config.image_size // config.patch_size
+        img_idx = torch.arange(idx**2, dtype=torch.int32).reshape(idx**2, 1)
+        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
+        img_idx[-1, -1] = -2  # ID_CLS_TOKEN
+        frequencies_x = img_idx % idx  # get the coordinates of the 2d matrix along x
+        frequencies_y = img_idx // idx  # get the coordinates of the 2d matrix along y
+        freq_dim = config.hidden_size // config.num_attention_heads // 2
+        rope_freq = 1.0 / (
+            config.rope_theta
+            ** (torch.arange(0, freq_dim, 2)[: (freq_dim // 2)].float() / freq_dim)
+        )
+        freqs_x = (
+            (frequencies_x + 1)[..., None] * rope_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs_y = (
+            (frequencies_y + 1)[..., None] * rope_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
+        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
+        freq_cis = torch.view_as_complex(
+            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
+        )
+        self.freqs_ci = freq_cis  # idx**2, idx**2, idx * 2
+
+    def forward(self, hidden_states):
+        return self.freqs_ci.to(hidden_states.device)
+
+
+class Llama4VisionModel(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.hidden_size = config.hidden_size
+        self.num_channels = config.num_channels
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
+        self.scale = config.hidden_size**-0.5
+
+        self.patch_embedding = Llama4UnfoldConvolution(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.patch_embedding",
+        )
+
+        self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
+        self.positional_embedding_vlm = nn.Parameter(
+            self.scale * torch.randn(self.num_patches, self.hidden_size)
+        )
+
+        self.rotary_embedding = Llama4VisionRotaryEmbedding(config)
+
+        # layer norms
+        self.layernorm_pre = nn.LayerNorm(self.hidden_size, eps=1e-5)
+        self.layernorm_post = nn.LayerNorm(self.hidden_size, eps=1e-5)
+
+        # encoders
+        self.model = Llama4VisionEncoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.model",
+        )
+        self.vision_adapter = Llama4VisionPixelShuffleMLP(
+            config,
+            quant_config,
+            prefix=f"{prefix}.vision_adapter",
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+        # Patch embedding
+        hidden_state = self.patch_embedding(pixel_values)
+        num_tiles, num_patches, hidden_dim = hidden_state.shape
+
+        # Add cls token
+        class_embedding = self.class_embedding.expand(
+            hidden_state.shape[0], 1, hidden_state.shape[-1]
+        )
+        hidden_state = torch.cat([hidden_state, class_embedding], dim=1)
+        num_patches += 1
+
+        # Position embeddings
+        hidden_state = hidden_state.reshape(
+            num_tiles,
+            1,
+            num_patches,
+            hidden_dim,
+        )
+        positional_embedding = self.positional_embedding_vlm.to(
+            dtype=hidden_state.dtype, device=hidden_state.device
+        )
+        hidden_state = hidden_state + positional_embedding
+        hidden_state = self.layernorm_pre(hidden_state)
+        hidden_state = hidden_state.view(num_tiles, -1, hidden_dim)
+        freqs_ci = self.rotary_embedding(pixel_values)
+        # Apply encoder
+        hidden_state = self.model(hidden_state, freqs_ci=freqs_ci)
+        hidden_state = self.layernorm_post(hidden_state)
+
+        # Remove CLS token output
+        hidden_state = hidden_state[:, :-1, :]
+
+        # now, we use Llama4VisionPixelShuffle + mlp to project embeddings
+        hidden_state = self.vision_adapter(hidden_state)
+
+        return hidden_state
+
+
+class Llama4ForConditionalGeneration(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(
+        self,
+        config: Llama4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
+            logger.warning(
+                "No vision weights found in checkpoint. Model will run in text-only mode. "
+                "Multimodal capabilities (vision understanding) will be unavailable. "
+                "Please not that this warning might be inaccurate if the weights haven't been fully downloaded"
+            )
+
+        self.has_vision = (
+            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+        )
+
+        if self.has_vision:
+            self.vision_model = Llama4VisionModel(
+                config.vision_config,
+                quant_config=quant_config,
+                prefix=add_prefix("vision_model", prefix),
+            )
+
+            self.multi_modal_projector = Llama4MultiModalProjector(config)
+        else:
+            self.vision_model = None
+            self.multi_modal_projector = None
+
+        # Initialize the language model
+        from sglang.srt.models.llama4 import Llama4ForCausalLM
+
+        self.language_model = Llama4ForCausalLM(
+            config.text_config if hasattr(config, "text_config") else config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+        self.logits_processor = LogitsProcessor(
+            config.text_config if hasattr(config, "text_config") else config
+        )
+        self.padding_pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+
+    def _has_vision_weights(self, config) -> bool:
+        """Check if the model has vision components by examining the checkpoint."""
+        model_path = getattr(config, "_name_or_path", None)
+        if not model_path:
+            return False
+
+        # Check if this is a local path first
+        if os.path.isdir(model_path):
+            index_file = os.path.join(model_path, "model.safetensors.index.json")
+            if os.path.exists(index_file):
+                return self._check_vision_weights_in_index(index_file)
+
+        # For HuggingFace models, we need to check the actual checkpoint
+        # The config might say it's multimodal, but the checkpoint might be text-only
+        try:
+            # Try to access the HuggingFace cache directory
+            from huggingface_hub import try_to_load_from_cache
+
+            # Check if index file exists in cache
+            index_file_path = try_to_load_from_cache(
+                repo_id=model_path,
+                filename="model.safetensors.index.json",
+                cache_dir=None,
+            )
+            if index_file_path and os.path.exists(index_file_path):
+                return self._check_vision_weights_in_index(index_file_path)
+
+        except Exception:
+            # If we can't access the cache, fall back to config-based detection
+            pass
+
+        # Fallback, assume text-only
+        return False
+
+    def _check_vision_weights_in_index(self, index_file: str) -> bool:
+        """Check if the model.safetensors.index.json contains vision weights."""
+        try:
+            with open(index_file, "r") as f:
+                index_data = json_lib.load(f)
+
+            vision_patterns = ["vision_model", "vision_tower", "multi_modal_projector"]
+            weight_names = index_data.get("weight_map", {}).keys()
+            return any(
+                pattern in weight_name
+                for weight_name in weight_names
+                for pattern in vision_patterns
+            )
+        except (OSError, json_lib.JSONDecodeError, KeyError):
+            return False
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        return self.padding_pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(
+        self,
+        items: List[MultimodalDataItem],
+    ) -> torch.Tensor:
+        # For text-only models, return None or raise an error
+        if not self.has_vision or self.vision_model is None:
+            raise ValueError("Vision model not available for text-only checkpoint")
+        pixel_values = (
+            torch.concat([item.feature for item in items])
+            .to(next(self.vision_model.parameters()).device)
+            .type(next(self.vision_model.parameters()).dtype)
+        )
+        image_features = self.vision_model(pixel_values)
+
+        vision_flat = image_features.view(-1, image_features.size(-1))
+
+        projected_vision_flat = self.multi_modal_projector(vision_flat)
+
+        return projected_vision_flat
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ) -> torch.Tensor:
+
+        # For text-only models, pass None for image_data_embedding_func
+        image_embedding_func = self.get_image_feature if self.has_vision else None
+
+        hs = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hs
+
+    def permute_qk_weight_for_rotary(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+    ) -> Tuple[str, torch.Tensor]:
+
+        def permute(w: torch.Tensor, n_heads: int):
+            attn_in = self.language_model.config.head_dim * n_heads
+            attn_out = self.language_model.config.hidden_size
+
+            return (
+                w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
+                .transpose(1, 2)
+                .reshape(attn_in, attn_out)
+            )
+
+        modules = name.split(".")
+
+        # rotary embeds should be sliced
+        if ("wk" in modules or "k_proj" in modules) and modules[-1] == "weight":
+            if _is_cpu:
+                dim = self.language_model.config.original_total_num_kv_heads
+            else:
+                dim = self.language_model.config.num_key_value_heads
+            loaded_weight = permute(loaded_weight, dim)
+        elif ("wq" in modules or "q_proj" in modules) and modules[-1] == "weight":
+            if _is_cpu:
+                dim = self.language_model.config.original_num_attention_heads
+            else:
+                dim = self.language_model.config.num_attention_heads
+            loaded_weight = permute(loaded_weight, dim)
+
+        return name, loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+            (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0),
+            (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1),
+            (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0),
+            (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        num_experts = (
+            self.config.text_config.num_local_experts
+            if hasattr(self.config, "text_config")
+            else self.config.num_local_experts
+        )
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=num_experts,
+        )
+
+        loaded_params = set()
+
+        for name, loaded_weight in weights:
+            if self._should_skip_weight(name):
+                continue
+
+            name = self._transform_weight_name(name)
+
+            if "vision" in name:
+                name = name.replace(".self_attn.o_proj", ".self_attn.proj")
+            else:
+                name, loaded_weight = self.permute_qk_weight_for_rotary(
+                    name, loaded_weight
+                )
+
+            if self._handle_scale_remapping(name, params_dict):
+                loaded_params.add(name)
+                continue
+
+            if self._handle_stacked_params(
+                name, loaded_weight, stacked_params_mapping, params_dict, loaded_params
+            ):
+                continue
+
+            if self._handle_expert_weights(
+                name,
+                loaded_weight,
+                expert_params_mapping,
+                params_dict,
+                num_experts,
+                loaded_params,
+            ):
+                continue
+
+            loaded_params.add(name)
+            self._handle_default_weight(name, loaded_weight, params_dict)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            logger.warning(
+                f"Some weights are not initialized from checkpoints {unloaded_params}"
+            )
+
+    def _should_skip_weight(self, name: str) -> bool:
+        """Check if we should skip loading this weight."""
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )
+
+    def _transform_weight_name(self, name: str) -> str:
+        """Transform weight name by adding language_model prefix if needed."""
+        if (
+            not name.startswith("language_model.")
+            and "vision" not in name
+            and "multi_modal_projector" not in name
+        ):
+            return f"language_model.{name}"
+        return name
+
+    def _handle_scale_remapping(self, name: str, params_dict: dict) -> bool:
+        """Handle scale parameter remapping. Returns True if handled."""
+        if "scale" in name and "expert" not in name:
+            remapped_name = maybe_remap_kv_scale_name(name, params_dict)
+            return remapped_name is None
+        return False
+
+    def _handle_stacked_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        stacked_params_mapping: list,
+        params_dict: dict,
+        loaded_params: set,
+    ) -> bool:
+        """Handle stacked parameter loading. Returns True if handled."""
+        for param_name, weight_name, shard_id in stacked_params_mapping:
+            if weight_name in name:
+                transformed_name = name.replace(weight_name, param_name)
+                loaded_params.add(transformed_name)
+                param = params_dict[transformed_name]
+                param.weight_loader(param, loaded_weight, shard_id)
+                return True
+        return False
+
+    def _handle_expert_weights(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        expert_params_mapping: list,
+        params_dict: dict,
+        num_experts: int,
+        loaded_params: set,
+    ) -> bool:
+        """Handle expert weight loading for MoE (Mixture of Experts) layers.
+
+        Args:
+            name: Parameter name from the checkpoint
+            loaded_weight: The weight tensor to be loaded
+            expert_params_mapping: Mapping of parameter names to expert configurations
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts in the MoE layer
+
+        Returns:
+            bool: True if the parameter was handled (is an expert parameter), False otherwise
+        """
+        if ".experts" not in name:
+            return False
+
+        if "experts.gate_up_proj" not in name and "experts.down_proj" not in name:
+            return self._handle_other_expert_params(
+                name, loaded_weight, expert_params_mapping, params_dict, loaded_params
+            )
+
+        if "scale" in name:
+            return self._handle_expert_scale_params(
+                name, loaded_weight, params_dict, num_experts, loaded_params
+            )
+        else:
+            return self._handle_expert_weight_params(
+                name, loaded_weight, params_dict, num_experts, loaded_params
+            )
+
+    def _handle_other_expert_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        expert_params_mapping: list,
+        params_dict: dict,
+        loaded_params: set,
+    ) -> bool:
+        """Handle expert parameters that are not gate_up_proj or down_proj weights.
+
+        Args:
+            name: Parameter name from the checkpoint
+            loaded_weight: The weight tensor to be loaded
+            expert_params_mapping: List of tuples mapping checkpoint names to model parameters
+            params_dict: Dictionary of model parameters
+            loaded_params: Set of loaded parameter names
+
+        Returns:
+            bool: True if parameter was found and handled, False otherwise
+        """
+        for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
+            if weight_name in name:
+                transformed_name = name.replace(weight_name, param_name)
+                param = params_dict[transformed_name]
+                param.weight_loader(
+                    param, loaded_weight, name, shard_id=shard_id, expert_id=expert_id
+                )
+                loaded_params.add(transformed_name)
+                return True
+        return False
+
+    def _transform_expert_name(
+        self, name: str, is_weight: bool = False
+    ) -> Tuple[str, str, List[str]]:
+        """Transform expert parameter name and get shard information.
+
+        Args:
+            name: The original parameter name
+            is_weight: Whether this is a weight parameter (adds _weight suffix)
+
+        Returns:
+            Tuple of (transformed_name, shard_id, shard_id_list)
+        """
+        suffix = "_weight" if is_weight else ""
+
+        if ".gate_up_proj" in name:
+            transformed_name = name.replace(
+                ".experts.gate_up_proj", f".experts.w13{suffix}"
+            )
+            shard_id = "w13"
+            shard_id_list = ["w1", "w3"]
+        else:  # down_proj
+            transformed_name = name.replace(
+                ".experts.down_proj", f".experts.w2{suffix}"
+            )
+            shard_id = "w2"
+            shard_id_list = ["w2"]
+
+        return transformed_name, shard_id, shard_id_list
+
+    def _handle_expert_scale_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        params_dict: dict,
+        num_experts: int,
+        loaded_params: set,
+    ) -> bool:
+        """Handle quantization scale parameters for expert weights.
+
+        Args:
+            name: Parameter name containing scale information
+            loaded_weight: Scale tensor to be loaded
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts for broadcast operations
+            loaded_params: Set of loaded parameter names
+
+        Returns:
+            bool: True (always handles scale parameters)
+        """
+        import re
+
+        # Check if this matches the expert parameter pattern: experts.{expert_id}.{param_name}
+        expert_match = re.search(r"experts\.(\d+)\.", name)
+
+        # Transform name
+        transformed_name, _, _ = self._transform_expert_name(name)
+
+        if transformed_name not in params_dict:
+            return True
+
+        param = params_dict[transformed_name]
+
+        # Handle scale parameters
+        if expert_match:
+            # If we have a specific expert ID, only load for that expert
+            expert_id = int(expert_match.group(1))
+            # For scale parameters, we can directly set the value
+            param.data[expert_id] = loaded_weight
+        else:
+            # No expert ID found - this is a single scale for all experts
+            # Load the same scale for all experts
+            for expert_id in range(num_experts):
+                param.data[expert_id] = loaded_weight
+        loaded_params.add(transformed_name)
+
+        return True
+
+    def _handle_expert_weight_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        params_dict: dict,
+        num_experts: int,
+        loaded_params: set,
+    ) -> bool:
+        """Handle actual weight tensors for expert layers (gate_up_proj and down_proj).
+
+        Args:
+            name: Parameter name (should contain gate_up_proj or down_proj)
+            loaded_weight: Weight tensor(s) to be loaded
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts for tensor distribution
+            loaded_params: Set of loaded parameter names
+
+        Returns:
+            bool: True (always handles weight parameters)
+        """
+        # Transform name and get shard info
+        transformed_name, _, shard_id_list = self._transform_expert_name(
+            name, is_weight=True
+        )
+
+        if ".gate_up_proj" in name:
+            loaded_weight_list = loaded_weight.chunk(2, dim=-1)
+        else:  # down_proj
+            loaded_weight_list = [loaded_weight]
+
+        for param_name, weight_chunk, shard_id in zip(
+            [transformed_name] * len(shard_id_list), loaded_weight_list, shard_id_list
+        ):
+            if param_name not in params_dict:
+                continue
+
+            param = params_dict[param_name]
+            weight_loader = param.weight_loader
+            loaded_params.add(param_name)
+
+            # Handle the case where loaded_weight might be a single tensor for all experts
+            if weight_chunk.dim() == 2:
+                # Single tensor case - load for all experts
+                for expert_id in range(num_experts):
+                    weight_loader(
+                        param,
+                        weight_chunk.T,
+                        param_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+            else:
+                # Multiple experts case - load each expert's weights
+                for expert_id in range(num_experts):
+                    weight_loader(
+                        param,
+                        weight_chunk[expert_id].T,
+                        param_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+
+        return True
+
+    def _handle_default_weight(
+        self, name: str, loaded_weight: torch.Tensor, params_dict: dict
+    ):
+        """Handle default weight loading."""
+        # Skip loading extra bias for GPTQ models
+        if name.endswith(".bias") and name not in params_dict:
+            return
+
+        param = params_dict[name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if hasattr(self.language_model, "set_eagle3_layers_to_capture"):
+            self.language_model.set_eagle3_layers_to_capture(layer_ids)
+
+    def get_embed_and_head(self):
+        # For EAGLE3, we delegate to the language model which should have this method
+        # If the language model doesn't have lm_head (like EAGLE3), we return None for head
+        embed = self.language_model.get_embed()
+        if hasattr(self.language_model, "get_embed_and_head"):
+            return self.language_model.get_embed_and_head()
+        elif hasattr(self.language_model, "lm_head"):
+            return embed, self.language_model.lm_head.weight
+        else:
+            # For EAGLE3, head might not be needed
+            return embed, None
+
+    def set_embed_and_head(self, embed, head):
+        if hasattr(self.language_model, "set_embed_and_head"):
+            return self.language_model.set_embed_and_head(embed, head)
+        else:
+            # For EAGLE3, only set embed
+            return self.language_model.set_embed(embed)
+
+    def get_embed(self):
+        return self.language_model.get_embed()
+
+    def set_embed(self, embed):
+        return self.language_model.set_embed(embed)
+
+    def get_hidden_dim(self, module_name, layer_idx):
+        # return input_dim, output_dim
+        if module_name == "qkv_proj":
+            return (
+                self.config.hidden_size,
+                self.config.head_dim
+                * (
+                    self.config.num_attention_heads
+                    + self.config.num_key_value_heads * 2
+                ),
+            )
+        elif module_name == "o_proj":
+            return (
+                self.config.head_dim * self.config.num_attention_heads,
+                self.config.hidden_size,
+            )
+        elif module_name == "gate_up_proj":
+            return self.config.hidden_size, self.config.intermediate_size * 2
+        elif module_name == "down_proj":
+            decoder_layer = self.language_model.get_layers()[layer_idx]
+            intermediate_size = decoder_layer.get_intermediate_size()
+            return intermediate_size, self.config.hidden_size
+        else:
+            raise NotImplementedError()
+
+
+EntryClass = Llama4ForConditionalGeneration
diff --git a/python/sglang/srt/models/nemotron_nas.py b/python/sglang/srt/models/nemotron_nas.py
new file mode 100644
index 000000000..ebf49f95a
--- /dev/null
+++ b/python/sglang/srt/models/nemotron_nas.py
@@ -0,0 +1,435 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/nemotron_nas.py
+
+"""Inference-only deci model compatible with HuggingFace weights."""
+from typing import Iterable, Optional, Tuple, Type, Union
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.llama import LlamaAttention, LlamaMLP
+from sglang.srt.utils import add_prefix, make_layers
+from sglang.utils import logger
+
+
+def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+    # DeciLM-specific code
+    intermediate_size = int(2 * ffn_mult * n_embd / 3)
+    return _find_multiple(intermediate_size, 256)
+
+
+def _find_multiple(n: int, k: int) -> int:
+    # DeciLM-specific code
+    if n % k == 0:
+        return n
+    return n + k - (n % k)
+
+
+class DeciLMDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        block_config = config.block_configs[layer_idx]
+        self._is_no_op_attention = block_config.attention.no_op
+        self._is_no_op_ffn = block_config.ffn.no_op
+
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support abacusai/Smaug-72B-v0.1 with attention_bias
+        # Support internlm/internlm-7b with bias
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        # support internlm/internlm3-8b with qkv_bias
+        if hasattr(config, "qkv_bias"):
+            attention_bias = config.qkv_bias
+
+        if not self._is_no_op_attention:
+            num_kv_heads = (
+                config.num_attention_heads // block_config.attention.n_heads_in_group
+            )
+            self.self_attn = LlamaAttention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                num_kv_heads=num_kv_heads,
+                layer_id=layer_idx,
+                rope_theta=rope_theta,
+                rope_scaling=rope_scaling,
+                rope_is_neox_style=rope_is_neox_style,
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                prefix=add_prefix("self_attn", prefix),
+                bias=attention_bias,
+            )
+            self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        if not self._is_no_op_ffn:
+            ffn_mult = block_config.ffn.ffn_mult
+            intermediate_size = _ffn_mult_to_intermediate_size(
+                ffn_mult, config.hidden_size
+            )
+            self.mlp = LlamaMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+            self.post_attention_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+
+        if self._is_no_op_attention:
+            pass
+        else:
+            if residual is None:
+                residual = hidden_states
+                hidden_states = self.input_layernorm(hidden_states)
+            else:
+                hidden_states, residual = self.input_layernorm(hidden_states, residual)
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        # Fully Connected
+        if not self._is_no_op_ffn:
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual
+            )
+            hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class DeciModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_type: Type[DeciLMDecoderLayer] = DeciLMDecoderLayer,
+    ):
+        super().__init__()
+
+        lora_config = None
+        self.config = config
+        self.quant_config = quant_config
+        self.padding_idx = config.pad_token_id
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
+        vocab_size = config.vocab_size + lora_vocab
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        def get_layer(idx: int, prefix: str):
+            return layer_type(
+                config,
+                layer_idx=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            get_layer,
+            pp_rank=get_pp_group().rank_in_group,
+            pp_size=get_pp_group().world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        kv_cache_index = 0
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            if not layer._is_no_op_attention:
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+                kv_cache_index += 1
+            else:
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+
+        if not get_pp_group().is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class DeciLMForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    # Mistral/Llama models can also be loaded with --load-format mistral
+    # from consolidated.safetensors checkpoints
+    mistral_mapping = {
+        "layers": "model.layers",
+        "attention": "self_attn",
+        "wq": "q_proj",
+        "wk": "k_proj",
+        "wv": "v_proj",
+        "wo": "o_proj",
+        "attention_norm": "input_layernorm",
+        "feed_forward": "mlp",
+        "w1": "gate_proj",
+        "w2": "down_proj",
+        "w3": "up_proj",
+        "ffn_norm": "post_attention_layernorm",
+        "tok_embeddings": "model.embed_tokens",
+        "output": "lm_head",
+        "norm": "model.norm",
+    }
+
+    def __init__(
+        self,
+        *,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        lora_config = None
+        self.config = config
+        self.lora_config = lora_config
+
+        self.model = self._init_model(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            if lora_config:
+                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=(
+                    DEFAULT_VOCAB_PADDING_SIZE
+                    # We need bigger padding if using lora for kernel
+                    # compatibility
+                    if not lora_config
+                    else lora_config.lora_vocab_padding_size
+                ),
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return DeciModel(config=config, quant_config=quant_config, prefix=prefix)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            inputs_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        if get_pp_group().is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids, hidden_states, self.lm_head, forward_batch
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if self.model.quant_config is not None and (
+                scale_name := self.model.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                continue
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+
+EntryClass = [DeciLMForCausalLM]
diff --git a/python/sglang/srt/models/olmo.py b/python/sglang/srt/models/olmo.py
new file mode 100644
index 000000000..08239374d
--- /dev/null
+++ b/python/sglang/srt/models/olmo.py
@@ -0,0 +1,377 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/olmo.py#L1
+"""Inference-only OLMo model compatible with HuggingFace weights."""
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import OlmoConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class OlmoAttention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+
+        assert self.hidden_size % self.total_num_heads == 0
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.clip_qkv = config.clip_qkv
+
+        # Attention input projection. Projects x -> (q, k, v)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=config.attention_bias,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        # Rotary embeddings.
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Attention output projection.
+        self.o_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=config.attention_bias,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        if self.clip_qkv is not None:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class OlmoMLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as
+    ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        # Feed-forward input projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+
+        # Activation function.
+        self.act_fn = SiluAndMul()
+
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class OlmoDecoderLayer(nn.Module):
+    """
+    This is a typical transformer block where the output is
+    computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # Attention block.
+        self.self_attn = OlmoAttention(
+            config,
+            layer_id,
+            quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        # MLP block.
+        self.mlp = OlmoMLP(
+            config,
+            quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        # LayerNorm
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class OlmoModel(nn.Module):
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: OlmoDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Apply blocks one-by-one.
+        for layer_id, decoder_layer in enumerate(self.layers):
+            # shape: (batch_size, seq_len, d_model)
+            hidden_states = decoder_layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class OlmoForCausalLM(nn.Module):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.model = OlmoModel(config, quant_config, prefix=add_prefix("model", prefix))
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = OlmoForCausalLM
diff --git a/python/sglang/srt/models/olmo2.py b/python/sglang/srt/models/olmo2.py
new file mode 100644
index 000000000..75834e6fb
--- /dev/null
+++ b/python/sglang/srt/models/olmo2.py
@@ -0,0 +1,413 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/olmo2.py
+"""Inference-only OLMo2 model compatible with HuggingFace weights."""
+from functools import partial
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class Olmo2Attention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+
+        assert self.hidden_size % self.total_num_heads == 0
+        assert self.total_num_heads % self.tp_size == 0
+
+        self.num_heads = self.total_num_heads // self.tp_size
+        self.total_num_kv_heads = self.config.num_key_value_heads
+
+        if self.total_num_kv_heads >= self.tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % self.tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size)
+
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+
+        # Attention input projection. Projects x -> (q, k, v)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.k_norm = RMSNorm(
+            self.total_num_kv_heads * self.head_dim,
+            eps=self.config.rms_norm_eps,
+        )
+        self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        # Rotary embeddings.
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Attention output projection.
+        self.o_proj = RowParallelLinear(
+            self.head_dim * self.total_num_heads,
+            self.hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+        q = self.q_norm.forward_native(q)
+        k = self.k_norm.forward_native(k)
+        if self.tp_size > 1:
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Olmo2MLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as
+    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        # Feed-forward input projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+
+        # Activation function.
+        self.act_fn = SiluAndMul()
+
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Olmo2DecoderLayer(nn.Module):
+    """
+    This is a typical transformer block where the output is
+    computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # Attention block.
+        self.self_attn = Olmo2Attention(
+            config, layer_id, quant_config, prefix=add_prefix("self_attn", prefix)
+        )
+
+        # MLP block.
+        self.mlp = Olmo2MLP(config, quant_config, prefix=add_prefix("mlp", prefix))
+
+        # RMSNorm
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Olmo2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Olmo2DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Apply blocks one-by-one.
+        for layer_id, decoder_layer in enumerate(self.layers):
+            # shape: (batch_size, seq_len, d_model)
+            hidden_states = decoder_layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Olmo2ForCausalLM(nn.Module):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.model = Olmo2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Olmo2ForCausalLM
diff --git a/python/sglang/srt/models/olmoe.py b/python/sglang/srt/models/olmoe.py
new file mode 100644
index 000000000..a74a2968d
--- /dev/null
+++ b/python/sglang/srt/models/olmoe.py
@@ -0,0 +1,437 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/pull/7922
+
+"""Inference-only OLMoE model compatible with HuggingFace weights."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers, print_warning_once
+
+
+class OlmoeMoE(nn.Module):
+    """A tensor-parallel MoE implementation for Olmoe that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        layer_id: int = 0,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=False,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=True,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        return final_hidden_states.view(orig_shape)
+
+
+class OlmoeAttention(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 4096,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.q_norm = RMSNorm(hidden_size, eps=1e-5)
+        self.k_norm = RMSNorm(hidden_size, eps=1e-5)
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            layer_id=layer_id,
+            num_kv_heads=self.num_kv_heads,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.q_norm(q.contiguous()), self.k_norm(k.contiguous())
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class OlmoeDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
+
+        self.self_attn = OlmoeAttention(
+            layer_id,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.mlp = OlmoeMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class OlmoeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: OlmoeDecoderLayer(
+                config=config,
+                quant_config=quant_config,
+                layer_id=idx,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class OlmoeForCausalLM(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = OlmoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    if name.endswith("kv_scale"):
+                        remapped_kv_scale_name = name.replace(
+                            ".kv_scale", ".attn.kv_scale"
+                        )
+                        if remapped_kv_scale_name not in params_dict:
+                            print_warning_once(
+                                "Found kv scale in the checkpoint "
+                                f"(e.g. {name}), but not found the expected "
+                                f"name in the model "
+                                f"(e.g. {remapped_kv_scale_name}). "
+                                "kv-scale is not loaded."
+                            )
+                            continue
+                        else:
+                            name = remapped_kv_scale_name
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = OlmoeForCausalLM
diff --git a/python/sglang/srt/models/opt.py b/python/sglang/srt/models/opt.py
new file mode 100644
index 000000000..a571e8937
--- /dev/null
+++ b/python/sglang/srt/models/opt.py
@@ -0,0 +1,637 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only OPT model compatible with HuggingFace weights."""
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import OPTConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+def get_activation(name="relu"):
+    """Select an activation function by name
+
+    Args:
+        name: str
+            activation function name,
+            one of ["relu", "gelu", "swish", "sigmoid"],
+            default "relu".
+    """
+    name = name.lower()
+    if name == "relu":
+        return nn.ReLU()
+    if name == "gelu":
+        return nn.GELU()
+    if name == "sigmoid":
+        return torch.nn.Sigmoid()
+    return nn.Identity()
+
+
+class OPTLearnedPositionalEmbedding(nn.Embedding):
+
+    def __init__(self, num_embeddings: int, embedding_dim: int):
+        # OPT is set up so that if padding_idx is specified then offset the
+        # embedding ids by 2 and adjust num_embeddings appropriately. Other
+        # models don't have this hack
+        self.offset = 2
+        super().__init__(num_embeddings + self.offset, embedding_dim)
+
+    def forward(self, positions: torch.Tensor):
+        return super().forward(positions + self.offset)
+
+
+class OPTAttention(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        layer_id: int = 0,
+        bias: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.embed_dim = embed_dim
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        total_num_heads = num_heads
+        assert num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = embed_dim // total_num_heads
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            embed_dim,
+            self.head_dim,
+            total_num_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            embed_dim,
+            embed_dim,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class OPTDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.self_attn = OPTAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.num_attention_heads,
+            layer_id=layer_id,
+            bias=config.enable_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.do_layer_norm_before = config.do_layer_norm_before
+
+        self.self_attn_layer_norm = nn.LayerNorm(
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
+        )
+        self.fc1 = ColumnParallelLinear(
+            self.embed_dim,
+            config.ffn_dim,
+            bias=config.enable_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.activation_fn = get_activation(config.activation_function)
+        self.fc2 = RowParallelLinear(
+            config.ffn_dim,
+            self.embed_dim,
+            bias=config.enable_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+        self.final_layer_norm = nn.LayerNorm(
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
+        if self.do_layer_norm_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states, forward_batch=forward_batch
+        )
+        hidden_states = residual + hidden_states
+        # 350m applies layer norm AFTER attention
+        if not self.do_layer_norm_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
+        if self.do_layer_norm_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+        # 350m applies layer norm AFTER attention
+        if not self.do_layer_norm_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+        return hidden_states
+
+
+class OPTDecoder(nn.Module):
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.max_target_positions = config.max_position_embeddings
+        self.vocab_size = config.vocab_size
+
+        self.pp_group = get_pp_group()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.word_embed_proj_dim,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        # Positional embeddings are replicated (not sharded).
+        self.embed_positions = OPTLearnedPositionalEmbedding(
+            config.max_position_embeddings, config.hidden_size
+        )
+
+        # Project out & in will be replicated if they exist.
+        if config.word_embed_proj_dim != config.hidden_size:
+            self.project_out = ReplicatedLinear(
+                config.hidden_size,
+                config.word_embed_proj_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("project_out", prefix),
+            )
+        else:
+            self.project_out = None
+
+        if config.word_embed_proj_dim != config.hidden_size:
+            self.project_in = ReplicatedLinear(
+                config.word_embed_proj_dim,
+                config.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("project_in", prefix),
+            )
+        else:
+            self.project_in = None
+
+        # Note that the only purpose of `config._remove_final_layer_norm` is to
+        # keep backward compatibility with checkpoints that have been fine-tuned
+        # before transformers v4.20.1
+        # see https://github.com/facebookresearch/metaseq/pull/164
+        if config.do_layer_norm_before and not config._remove_final_layer_norm:
+            self.final_layer_norm = nn.LayerNorm(
+                config.hidden_size,
+                elementwise_affine=config.layer_norm_elementwise_affine,
+            )
+        else:
+            self.final_layer_norm = None
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: OPTDecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                input_embeds = self.embed_tokens(input_ids)
+            pos_embeds = self.embed_positions(positions)
+            if self.project_in is not None:
+                input_embeds, _ = self.project_in(input_embeds)
+            hidden_states = input_embeds + pos_embeds
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states = layer(
+                hidden_states=hidden_states, forward_batch=forward_batch
+            )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors({"hidden_states": hidden_states})
+        if self.final_layer_norm is not None:
+            hidden_states = self.final_layer_norm(hidden_states)
+            # 没有经过这里
+        if self.project_out is not None:
+            hidden_states, _ = self.project_out(hidden_states)
+        return hidden_states
+
+
+class OPTModel(nn.Module):
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        # config = vllm_config.model_config.hf_config
+        # quant_config = vllm_config.quant_config
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        self.decoder = OPTDecoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("decoder", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors],
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        return self.decoder(
+            input_ids,
+            positions,
+            pp_proxy_tensors=pp_proxy_tensors,
+            input_embeds=input_embeds,
+            forward_batch=forward_batch,
+        )
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.decoder.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.decoder.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class OPTForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = OPTModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.decoder.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.word_embed_proj_dim,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.capture_aux_hidden_states = False
+        self.pp_group = get_pp_group()
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+        ]
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states=aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in weights:
+            if name.startswith("decoder"):
+                name = name.replace("decoder.", "model.decoder.")
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                if name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_module_name_from_weight_name(self, name):
+        for param_name, weight_name, shard_id, num_shard in self.stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100, tp_size: int = 1
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        try:
+            if name == "lm_head.weight" and self.config.tie_word_embeddings:
+                logger.info(
+                    "word embedding is tied for this model, return embed_tokens.weight as lm_head.weight."
+                )
+                return (
+                    self.model.embed_tokens.weight.cpu()
+                    .to(torch.float32)
+                    .numpy()
+                    .tolist()[:truncate_size]
+                )
+
+            mapped_name = name
+            mapped_shard_id = None
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name in name:
+                    mapped_name = name.replace(weight_name, param_name)
+                    mapped_shard_id = shard_id
+                    break
+            params_dict = dict(self.named_parameters())
+            param = params_dict[mapped_name]
+            if mapped_shard_id is not None:
+                if mapped_shard_id in ["q", "k", "v"]:
+                    num_heads = self.config.num_attention_heads // tp_size
+                    num_kv_heads = self.config.num_attention_heads // tp_size
+                    head_dim = (
+                        self.config.hidden_size // self.config.num_attention_heads
+                    )
+                    if mapped_shard_id == "q":
+                        offset = 0
+                        size = num_heads * head_dim
+                    elif mapped_shard_id == "k":
+                        offset = num_heads * head_dim
+                        size = num_kv_heads * head_dim
+                    elif mapped_shard_id == "v":
+                        offset = (num_heads + num_kv_heads) * head_dim
+                        size = num_kv_heads * head_dim
+                    weight = param.data.narrow(0, offset, size)
+                elif mapped_shard_id in [0, 1]:
+                    intermediate_size = self.config.ffn_dim
+                    slice_size = intermediate_size // tp_size
+                    if mapped_shard_id == 0:  # gate_proj
+                        offset = 0
+                        size = slice_size
+                    elif mapped_shard_id == 1:  # up_proj
+                        offset = slice_size
+                        size = slice_size
+
+                    weight = param.data.narrow(0, offset, size)
+                else:
+                    weight = param.data
+            else:
+                weight = param.data
+            if tp_size > 1 and ("o_proj" in name or "down_proj" in name):
+                gathered_weights = [torch.zeros_like(weight) for _ in range(tp_size)]
+                torch.distributed.all_gather(gathered_weights, weight)
+                weight = torch.cat(gathered_weights, dim=1)
+            return weight.cpu().to(torch.float32).numpy().tolist()[:truncate_size]
+
+        except Exception:
+            logger.error(
+                f"Error getting weights by name {name} in OPTForCausalLM: {get_exception_traceback()}"
+            )
+            return None
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = [OPTForCausalLM]
diff --git a/python/sglang/srt/models/persimmon.py b/python/sglang/srt/models/persimmon.py
new file mode 100644
index 000000000..5f8885e71
--- /dev/null
+++ b/python/sglang/srt/models/persimmon.py
@@ -0,0 +1,330 @@
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import PersimmonConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class PersimmonMLP(nn.Module):
+
+    def __init__(
+        self, config: PersimmonConfig, quant_config: Optional[QuantizationConfig] = None
+    ):
+        super().__init__()
+        self.dense_h_to_4h = ColumnParallelLinear(
+            config.hidden_size, config.intermediate_size, quant_config=quant_config
+        )
+        self.dense_4h_to_h = RowParallelLinear(
+            config.intermediate_size, config.hidden_size, quant_config=quant_config
+        )
+        self.act = get_act_fn(config.hidden_act)
+
+    def forward(self, hidden_states) -> torch.Tensor:
+        hidden_states, _ = self.dense_h_to_4h(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.dense_4h_to_h(hidden_states)
+        return hidden_states
+
+
+class PersimmonAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = 0,
+    ):
+        super().__init__()
+        self.config = config
+        tensor_parallel_world_size = get_tensor_model_parallel_world_size()
+
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tensor_parallel_world_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.partial_rotary_factor = config.partial_rotary_factor
+        self.is_causal = True
+
+        assert (self.head_dim * self.total_num_heads) == self.hidden_size
+        assert self.total_num_heads % tensor_parallel_world_size == 0
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.is_qk_layernorm = config.qk_layernorm
+
+        if self.is_qk_layernorm:
+            self.q_layernorm = nn.LayerNorm(self.head_dim)
+            self.k_layernorm = nn.LayerNorm(self.head_dim)
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            partial_rotary_factor=self.partial_rotary_factor,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
+        seq_length = x.shape[0]
+        return x.view(seq_length, self.num_heads, self.head_dim)
+
+    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
+        seq_length = x.shape[0]
+        return x.view(seq_length, self.num_heads * self.head_dim)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+
+        if self.is_qk_layernorm:
+            q = self._split_heads(q)
+            k = self._split_heads(k)
+
+            q = self.q_layernorm(q)
+            k = self.k_layernorm(k)
+
+            q = self._merge_heads(q)
+            k = self._merge_heads(k)
+
+        q, k = self.rotary_emb(position_ids, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class PersimmonDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        idx: int = 0,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = PersimmonAttention(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            layer_id=idx,
+        )
+        self.mlp = PersimmonMLP(config, quant_config=quant_config)
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = hidden_states + residual
+
+        outputs = hidden_states
+        return outputs
+
+
+class PersimmonModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size, config.hidden_size
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: PersimmonDecoderLayer(
+                config, quant_config=quant_config, prefix=prefix, idx=idx
+            ),
+            prefix="model.layers",
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+        )
+
+        if self.pp_group.is_last_rank:
+            self.final_layernorm = nn.LayerNorm(
+                config.hidden_size, eps=config.layer_norm_eps
+            )
+        else:
+            self.final_layernorm = PPMissingLayer()
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        positions: torch.Tensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if self.pp_group.is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+        else:
+            hidden_states = forward_batch.pp_input_hidden
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(
+                position_ids=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+            )
+        return self.final_layernorm(hidden_states)
+
+
+class PersimmonForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = PersimmonModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name not in params_dict:
+                if name == "lm_head.weight":
+                    continue
+                print(f"Warning: weight {name} not found in model.")
+                continue
+            param = params_dict[name]
+            if "query_key_value" in name:
+                output_dim = getattr(param, "output_dim", None)
+                if output_dim is not None:
+                    loaded_weight_shape = loaded_weight.shape
+                    num_heads = self.config.num_attention_heads
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim]
+                        + (num_heads, 3, -1)
+                        + loaded_weight_shape[output_dim + 1 :]
+                    )
+                    loaded_weight = loaded_weight.transpose(output_dim, output_dim + 1)
+                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = PersimmonForCausalLM
diff --git a/python/sglang/srt/models/phi.py b/python/sglang/srt/models/phi.py
new file mode 100644
index 000000000..f48895c67
--- /dev/null
+++ b/python/sglang/srt/models/phi.py
@@ -0,0 +1,321 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/phi.py
+from typing import Iterable, Optional, Union
+
+import torch
+from torch import nn
+from transformers import PhiConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class PhiAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = 0,
+    ):
+        super().__init__()
+        self.total_num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.total_num_heads
+
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_size,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            quant_config=quant_config,
+        )
+
+        scaling = self.head_size**-0.5
+        rotary_dim = int(
+            config.partial_rotary_factor
+            * (config.hidden_size // config.num_attention_heads)
+        )
+        assert rotary_dim % 2 == 0
+
+        rope_theta = getattr(config, "rope_theta", 10000.0)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
+        self.rotary_emb = get_rope(
+            self.head_size,
+            rotary_dim=rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_size,
+            scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class PhiMLP(nn.Module):
+
+    def __init__(
+        self, config: PhiConfig, quant_config: Optional[QuantizationConfig] = None
+    ):
+        super().__init__()
+
+        n_inner = getattr(config, "n_inner", None)
+        n_inner = n_inner if n_inner is not None else 4 * config.hidden_size
+
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            n_inner,
+            quant_config=quant_config,
+        )
+        self.fc2 = RowParallelLinear(
+            n_inner,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        self.act = get_act_fn(config.hidden_act)
+
+    def forward(self, hidden_states):
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class PhiLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        idx: int = 0,
+    ):
+        super().__init__()
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.self_attn = PhiAttention(
+            config,
+            quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            layer_id=idx,
+        )
+        self.mlp = PhiMLP(config, quant_config)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        attn_outputs = self.self_attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        hidden_states = attn_outputs + feed_forward_hidden_states + residual
+        return hidden_states
+
+
+class PhiModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+
+        pp_group = get_pp_group()
+        pp_size = pp_group.world_size
+        pp_rank = pp_group.rank
+
+        self.start_layer = pp_rank * config.num_hidden_layers // pp_size
+        self.end_layer = (pp_rank + 1) * config.num_hidden_layers // pp_size
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: PhiLayer(
+                config, quant_config=quant_config, prefix=prefix, idx=idx
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+
+        self.final_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        positions: torch.Tensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+
+            hidden_states = layer(
+                position_ids=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+            )
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class PhiForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ]
+    }
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = PhiModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorOutput:
+
+        hidden_states = self.model(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        weights = dict(weights)
+        loaded_keys = set()
+
+        for name, param in params_dict.items():
+            if name in loaded_keys:
+                continue
+
+            # Handle packed weights
+            is_packed = False
+            for packed_name, src_names in self.packed_modules_mapping.items():
+                if packed_name not in name:
+                    continue
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                for src_name in src_names:
+                    full_src_name = name.replace(packed_name, src_name)
+                    if full_src_name in weights:
+                        loaded_weight = weights[full_src_name]
+                        # The shard_id for QKVParallelLinear is 'q', 'k', 'v'.
+                        shard_id = src_name.split("_")[0]
+                        weight_loader(param, loaded_weight, shard_id)
+                        loaded_keys.add(full_src_name)
+
+                loaded_keys.add(name)
+                is_packed = True
+                break
+            if is_packed:
+                continue
+
+            # Handle non-packed weights
+            if name not in weights:
+                # Redundant with the check in the loop, but good for safety
+                continue
+
+            loaded_weight = weights[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_keys.add(name)
+
+
+EntryClass = PhiForCausalLM
diff --git a/python/sglang/srt/models/phi3_small.py b/python/sglang/srt/models/phi3_small.py
new file mode 100644
index 000000000..9ac855c49
--- /dev/null
+++ b/python/sglang/srt/models/phi3_small.py
@@ -0,0 +1,476 @@
+import math
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Phi3Config
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+@torch.jit.script
+def quick_gelu(x):
+    return x * torch.sigmoid(1.702 * x)
+
+
+@torch.jit.script
+def gegelu(input, limit: Optional[float] = None):
+    a_gelu, a_linear = input[..., ::2], input[..., 1::2]
+    if limit is not None:
+        a_gelu = torch.where(
+            torch.isinf(a_gelu), a_gelu, a_gelu.clamp(min=None, max=limit)
+        )
+        a_linear = torch.where(
+            torch.isinf(a_linear),
+            a_linear,
+            a_linear.clamp(min=-limit, max=limit),
+        )
+    out_gelu = quick_gelu(a_gelu)
+    return out_gelu * (a_linear + 1)
+
+
+class Phi3SmallMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        assert (
+            self.config.hidden_act == "gegelu"
+        ), "Only `gegelu` is supported for the 4.7 series of models .."
+        self.hidden_size = config.hidden_size
+        self.gegelu_limit = config.gegelu_limit
+        self.intermediate_size = config.intermediate_size
+
+        self.up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            2 * [self.intermediate_size],
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(self, x):
+        gate_up, _ = self.up_proj(x)
+        x = gegelu(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Phi3SmallSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        self.sparse_block_size = config.blocksparse_block_size
+        self.homo_heads = config.blocksparse_homo_head_pattern
+        self.local_blocks = config.blocksparse_num_local_blocks
+        self.vert_stride = config.blocksparse_vert_stride
+
+        assert (
+            config.blocksparse_block_size == config.blocksparse_triton_kernel_block_size
+        )
+
+        self.hidden_size = config.hidden_size
+        # Number of Query Heads
+        self.num_heads = config.num_attention_heads
+
+        self.head_dim = self.hidden_size // self.num_heads
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # Number of total Key Value Heads before tensor parallel
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_q_per_kv = self.num_heads // self.num_key_value_heads
+        if self.tp_size > 1:
+            assert self.num_key_value_heads % self.tp_size == 0
+        self.num_kv_heads_per_partion = max(1, self.num_key_value_heads // self.tp_size)
+        self.num_heads_per_partition = self.num_heads // self.tp_size
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_embedding_base = config.rope_embedding_base
+        self.rope_position_scale = config.rope_position_scale
+        self.is_causal = True
+
+        norm_factor = None
+        if config.mup_use_scaling:
+            norm_factor = self.head_dim / config.mup_attn_multiplier
+        else:
+            norm_factor = math.sqrt(self.head_dim)
+        self.scale = 1 / norm_factor
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_key_value_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        if getattr(self.config, "rope_scaling", None) is not None:
+            rope_scaling = self.config.rope_scaling
+            for key in rope_scaling:
+                if isinstance(rope_scaling[key], list):
+                    rope_scaling[key] = tuple(rope_scaling[key])
+
+            if "factor" not in rope_scaling:
+                rope_scaling["factor"] = self.rope_position_scale
+        else:
+            rope_scaling = {
+                "rope_type": "linear",
+                "factor": self.rope_position_scale,
+            }
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_embedding_base,
+            rope_scaling=rope_scaling,
+        )
+
+        # blocksparse params
+        self.blocksparse_block_size = config.blocksparse_block_size
+        self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = config.blocksparse_vert_stride
+
+        use_dense_attn = (
+            getattr(self.config, "dense_attention_every_n_layers", None)
+            and (self.layer_id + 1) % self.config.dense_attention_every_n_layers == 0
+        )
+
+        bs_params = None
+        if not use_dense_attn:
+            bs_params = {
+                "max_seqlen": self.max_position_embeddings,
+                "num_heads": self.num_heads_per_partition,
+                "num_kv_heads": self.num_kv_heads_per_partion,
+                "block_size": self.sparse_block_size,
+                "local_blocks": self.local_blocks,
+                "vert_stride": self.vert_stride,
+                "homo_head": self.homo_heads,
+            }
+
+        self.attn = RadixAttention(
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads_per_partion,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        qkv, _ = self.query_key_value(hidden_states)
+
+        qkv = qkv.view(qkv.shape[:-1] + (-1, (self.num_q_per_kv + 2), self.head_dim))
+        q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2)
+
+        # NOTE: this is required by RotaryEmbed, which indeed does not have to
+        # TODO: allow 3D QK for rotary forward
+        q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
+        k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+        v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.dense(attn_output)
+
+        return output
+
+
+class Phi3SmallDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Phi3SmallSelfAttention(
+            config,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = Phi3SmallMLP(
+            config,
+            quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Phi3SmallModel(nn.Module):
+
+    def __init__(
+        self,
+        config: Phi3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.mup_embedding_multiplier = config.mup_embedding_multiplier
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Phi3SmallDecoderLayer(
+                config,
+                int(prefix.split(".")[-1]),
+                quant_config,
+                prefix=prefix,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+
+        self.final_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor],
+    ) -> Union[torch.Tensor]:
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        if (
+            self.mup_embedding_multiplier is not None
+            and self.mup_embedding_multiplier > 0.0
+        ):
+            hidden_states = hidden_states * self.mup_embedding_multiplier
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(positions, hidden_states, forward_batch=forward_batch)
+
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class Phi3SmallForCausalLM(nn.Module):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(
+        self,
+        config: Phi3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Phi3SmallModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+        self.vocab_size = config.vocab_size
+        self.mup_width_multiplier = config.mup_width_multiplier
+        self.lm_head = ParallelLMHead(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # tokens in tiktoken but not used
+        if hasattr(config, "dummy_token_indices"):
+            device = self.lm_head.weight.device
+            self.register_buffer(
+                "dummy_token_indices",
+                torch.LongTensor(config.dummy_token_indices).to(device),
+                persistent=False,
+            )
+        else:
+            self.dummy_token_indices = None
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, value):
+        self.lm_head = value
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def compute_logits(
+        self,
+        input_ids: torch.LongTensor,
+        hidden_states: torch.Tensor,
+        sampling_metadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(
+            input_ids, self.lm_head, hidden_states, sampling_metadata
+        )
+        if self.dummy_token_indices is not None and logits is not None:
+            logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
+        return logits
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            inputs_embeds=inputs_embeds,
+        )
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = Phi3SmallForCausalLM
diff --git a/python/sglang/srt/models/phi4mm.py b/python/sglang/srt/models/phi4mm.py
new file mode 100644
index 000000000..37a638acb
--- /dev/null
+++ b/python/sglang/srt/models/phi4mm.py
@@ -0,0 +1,538 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/6071e989df1531b59ef35568f83f7351afb0b51e/vllm/model_executor/models/phi4mm.py
+# https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+
+import logging
+import math
+import re
+from collections.abc import Iterable
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import PretrainedConfig, SiglipVisionConfig
+
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.srt.models.phi4mm_audio import AudioEmbedding
+
+logger = logging.getLogger(__name__)
+
+SIGLIP_NAME = "siglip-so400m-patch14-448"
+VISION_ENCODER_TO_PROCESSING_CONFIG = {
+    "siglip-so400m-patch14-448": {
+        "vit_image_size": 448,
+        "vit_patch_size": 14,
+        "token_compression_factor": 2,
+    },
+}
+
+
+class Phi4MMImageEncoder(nn.Module):
+    """Image embedding."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+        model_dir: str = "",
+    ) -> None:
+        super().__init__()
+
+        # n_embed or hidden_size
+        hidden_size = config.n_embd if hasattr(config, "n_embd") else config.hidden_size
+        self.type_feature = "patch"
+        self.img_processor = Idefics2VisionTransformer(
+            config=config.vision_config, require_post_norm=False
+        )
+
+        pe_weight = self.img_processor.embeddings.position_embedding.weight
+        L, D = pe_weight.size()
+        H = int(math.sqrt(L))
+        assert H**2 == L, f"position embedding size {L} is not square"
+        if H % 2 != 0:
+            self.img_processor_padding = nn.ReflectionPad2d((0, 1, 0, 1))
+            H += 1
+        image_dim_out = D
+        # ((448/14)//2)**2
+        self.num_img_tokens = (H // 2) ** 2
+        self.base_feat_height_target = H
+
+        self.image_dim_out = image_dim_out
+        self.img_sizes = None
+        self.image_attention_mask = None
+
+        # global_gn and sub_gn for hd transform, serves as line separator
+        self.use_hd_transform = True
+        self.with_learnable_separator = True
+        self.hd_transform_order = "sub_glb"
+        self.freeze_img_processor = False
+        self.crop_size = 448
+
+        # image token compression
+        self.image_token_compression_cls = "avg_pool_2d"
+        self.image_token_compression = nn.AvgPool2d(kernel_size=2, stride=2)
+        self.base_feat_height_reduction = 1
+        self.base_feat_height_target = self.base_feat_height_target // 2
+
+        # with_hd_transform and with_learnable_separator should have same value
+        assert (
+            self.use_hd_transform == self.with_learnable_separator
+        ), "use_hd_transform and with_learnable_separator should have same value"
+        assert self.use_hd_transform, "learnable separator is only for hd transform"
+        # 1024 * 4, merge spatial to channel dimension
+        self.glb_GN = nn.Parameter(
+            torch.zeros([1, 1, self.image_dim_out * self.base_feat_height_reduction**2])
+        )
+        self.sub_GN = nn.Parameter(
+            torch.zeros(
+                [1, 1, 1, self.image_dim_out * self.base_feat_height_reduction**2]
+            )
+        )
+
+        dim_projection = hidden_size
+        depth = 2
+        layers = [
+            nn.Linear(
+                image_dim_out * self.base_feat_height_reduction**2, dim_projection
+            )
+        ]
+        for _ in range(1, depth):
+            layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
+        self.img_projection = nn.Sequential(*layers)
+
+        self.vocab_size = config.vocab_size
+        self.img_features = None
+
+        self.use_out_place_operations = False
+
+    def get_img_features(
+        self, img_embeds: torch.FloatTensor, attention_mask=None
+    ) -> torch.FloatTensor:
+        img_feature = self.img_processor(
+            img_embeds, patch_attention_mask=attention_mask
+        )
+
+        patch_feature = img_feature
+
+        use_token_compression = self.image_token_compression is not None
+        use_padding = getattr(self, "img_processor_padding", None) is not None
+        if use_token_compression or use_padding:
+            # reshape to 2D tensor
+            width = int(math.sqrt(patch_feature.size(1)))
+            patch_feature = patch_feature.view(-1, width, width, patch_feature.size(-1))
+            # convert to NCHW
+            patch_feature = patch_feature.permute(0, 3, 1, 2)
+
+            if use_padding:
+                patch_feature = self.img_processor_padding(patch_feature)
+            if use_token_compression:
+                patch_feature = self.image_token_compression(patch_feature)
+
+            # convert to NHWC
+            patch_feature = patch_feature.permute(0, 2, 3, 1)
+            patch_feature = patch_feature.view(
+                -1,
+                patch_feature.size(1) * patch_feature.size(2),
+                patch_feature.size(-1),
+            )
+
+        return patch_feature
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        image_attention_mask: torch.Tensor,
+    ) -> list[torch.FloatTensor]:
+        """
+        process image and return vision embeddings.
+
+        pixel_values: (num_images, num_crops, c, h, w)
+        image_sizes: [[h1, w1], [h2, w2]]
+        image_attention_mask: num_images x num_crops x 32 x 32
+        output: (num_images, num_img_tokens, hidden_size)
+        """
+
+        # eg
+        # pixel_values: torch.Size([1, 7, 3, 448, 448])
+        # image_sizes: tensor([[ 896, 1344]], device='cuda:0')
+        # output: torch.Size([1, 1841, 3072])
+
+        img_projection_params = next(self.img_projection.parameters())
+        target_device = img_projection_params.device
+        target_dtype = img_projection_params.dtype
+
+        img_sizes = image_sizes
+        num_images, num_crops, c, h, w = pixel_values.shape
+        bs = num_images
+        pixel_values = pixel_values.flatten(0, 1)
+
+        img_features = self.get_img_features(
+            pixel_values,
+            image_attention_mask.type(torch.BoolTensor).flatten(0, 1).to(target_device),
+        )
+
+        base_feat_height_target = self.base_feat_height_target
+        base_resolution = self.crop_size
+        base_feat_height_reduction = self.base_feat_height_reduction
+
+        base_feat_height = base_feat_width = int(np.sqrt(img_features.shape[1]))
+        assert (
+            base_feat_height == base_feat_height_target
+            and base_feat_width == base_feat_height_target
+        ), f'base_feat_height: {base_feat_height},"\
+                f" base_feat_width: {base_feat_width}, "\
+                f"expect {base_feat_height_target} features for hd transform'
+
+        # bs x max_num_crops x (24x24) x C
+        img_features = img_features.view(
+            bs, -1, base_feat_height * base_feat_width, self.image_dim_out
+        )
+        C = self.image_dim_out
+        H = base_feat_height
+
+        output_imgs = []
+        output_len = []
+        # training is tensor, inference is list
+        if isinstance(img_sizes, torch.Tensor):
+            img_sizes = img_sizes.view(-1, 2)
+        for _bs in range(bs):
+            h, w = img_sizes[_bs]
+            h = h // base_resolution
+            w = w // base_resolution
+            B_ = h * w
+
+            # 1 x (24x24) x 1024
+            global_img_feature = img_features[_bs, :1]
+
+            # 1 x 12 x 12 x 4096
+            glb_img = (
+                global_img_feature.reshape(1, H, H, C)
+                .reshape(
+                    1,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction,
+                    C,
+                )
+                .contiguous()
+                .permute(0, 1, 3, 2, 4, 5)
+                .reshape(
+                    1,
+                    H // base_feat_height_reduction,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction * base_feat_height_reduction * C,
+                )
+                .contiguous()
+            )
+            temp_glb_GN = self.sub_GN.repeat(1, H // base_feat_height_reduction, 1, 1)
+
+            # 1 x 156 x 4096
+            glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(
+                1, -1, base_feat_height_reduction * base_feat_height_reduction * C
+            )
+
+            # (max_num_crops-1) x (12x12) x C
+            sub_img = img_features[_bs, 1:]
+            # 16x574x1024
+            # get rid of padding sub_img
+            sub_img = sub_img[:B_]
+
+            # (num_crops, 12, 2, 12, 2, 1024) ->
+            # (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
+            sub_img = (
+                sub_img.reshape(B_, H, H, C)
+                .reshape(
+                    B_,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction,
+                    H // base_feat_height_reduction,
+                    base_feat_height_reduction,
+                    C,
+                )
+                .contiguous()
+                .permute(0, 1, 3, 2, 4, 5)
+                .reshape(
+                    B_, -1, base_feat_height_reduction * base_feat_height_reduction * C
+                )
+                .contiguous()
+            )
+            sub_img = (
+                sub_img.reshape(
+                    1,
+                    h,
+                    w,
+                    base_feat_height // base_feat_height_reduction,
+                    base_feat_width // base_feat_height_reduction,
+                    -1,
+                )
+                .permute(0, 1, 3, 2, 4, 5)
+                .reshape(
+                    1,
+                    h * base_feat_height // base_feat_height_reduction,
+                    w * base_feat_width // base_feat_height_reduction,
+                    base_feat_height_reduction * base_feat_height_reduction * C,
+                )
+            )
+
+            if image_attention_mask is not None and len(image_attention_mask) > 0:
+                reshaped_image_attention_mask = (
+                    image_attention_mask[_bs, 1 : B_ + 1, 0::2, 0::2]
+                    .reshape(
+                        1,
+                        h,
+                        w,
+                        base_feat_height // base_feat_height_reduction,
+                        base_feat_width // base_feat_height_reduction,
+                    )
+                    .permute(0, 1, 3, 2, 4)
+                    .reshape(
+                        1,
+                        h * base_feat_height // base_feat_height_reduction,
+                        w * base_feat_width // base_feat_height_reduction,
+                    )
+                )
+                useful_height = int(reshaped_image_attention_mask[0, :, 0].sum().item())
+                useful_width = int(reshaped_image_attention_mask[0, 0, :].sum().item())
+                sub_img = sub_img[:, :useful_height, :useful_width]
+                temp_sub_GN = self.sub_GN.repeat(1, useful_height, 1, 1)
+                temp_len = (
+                    int(image_attention_mask[_bs, : B_ + 1, 0::2, 0::2].sum().item())
+                    + (useful_height + 1)
+                    + base_feat_height // base_feat_height_reduction
+                )
+            else:
+                temp_sub_GN = self.sub_GN.repeat(
+                    1, h * base_feat_height // base_feat_height_reduction, 1, 1
+                )
+                temp_len = int(
+                    (h * w + 1) * self.num_img_tokens
+                    + 1
+                    + (h + 1) * base_feat_height // base_feat_height_reduction
+                )
+
+            sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(
+                1, -1, base_feat_height_reduction * base_feat_height_reduction * C
+            )
+            # (1, num_img_tokens, 1024*4)
+
+            # glb + sub
+            if self.hd_transform_order == "glb_sub":
+                output_imgs.append(torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
+            elif self.hd_transform_order == "sub_glb":
+                output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
+            else:
+                raise NotImplementedError(
+                    f'hd_transform_order = {self.hd_transform_order}, "\
+                        "not implemented'
+                )
+
+            # temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
+            assert (
+                temp_len == output_imgs[-1].shape[1]
+            ), f'temp_len: {temp_len}, output_imgs[-1].shape[1]: "\
+                    "{output_imgs[-1].shape[1]}'
+
+            output_len.append(temp_len)
+
+        img_set_tensor = []
+        for _output_img in output_imgs:
+            img_feature_proj = self.img_projection(
+                _output_img.to(target_device).to(target_dtype)
+            )
+            img_set_tensor.append(img_feature_proj.squeeze(0))
+
+        return img_set_tensor
+
+
+class Phi4MMForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    lora_pattern = re.compile(
+        r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.language_model = LlamaForCausalLM(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+
+        self.vision_encoder = Phi4MMImageEncoder(
+            config,
+            quant_config,
+            prefix="model.vision_embed_tokens",
+            model_dir=config._name_or_path,
+        )
+
+        if isinstance(config.embd_layer["audio_embd_layer"], dict):
+            embedding_config = {
+                "embedding_cls": config.embd_layer["audio_embd_layer"]["embedding_cls"],
+                **config.embd_layer["audio_embd_layer"],
+            }
+        else:
+            embedding_config = {"embedding_cls": config.embd_layer["embedding_cls"]}
+
+        self.embed_tokens_extend = AudioEmbedding(config, **embedding_config)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        dtype = next(self.vision_encoder.parameters()).dtype
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(dtype)
+        image_attention_mask = torch.cat(
+            [
+                item.image_attention_mask
+                for item in items
+                if hasattr(item, "image_attention_mask")
+            ],
+            dim=0,
+        )
+        image_sizes = torch.cat([item.image_sizes for item in items], dim=0)
+        image_embeds = self.vision_encoder(
+            pixel_values, image_sizes, image_attention_mask
+        )
+        return torch.cat(image_embeds).type(dtype)
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # (e.g. multiple examples) and the second dim is the multi-audio dim
+        # (e.g. multiple audios in the same example)
+        embed_tokens_extend_param = next(self.embed_tokens_extend.parameters())
+        device = embed_tokens_extend_param.device
+        dtype = embed_tokens_extend_param.dtype
+        audio_embeds = [
+            self.embed_tokens_extend(
+                # item.feature: (num_audios_in_a_sequence, T, D)
+                # item.audio_attention_mask: (num_audios_in_a_sequence, T, D) BoolTensor or None
+                audio_features=item.feature.to(device).type(dtype),
+                audio_attention_mask=(
+                    item.audio_attention_mask.to(device)
+                    if hasattr(item, "audio_attention_mask")
+                    else None
+                ),
+            )
+            for item in items
+        ]
+        return torch.cat(audio_embeds).type(dtype)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.AUDIO: self.get_audio_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self.lora_pattern.match(module_name))
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+        ]
+        prefix_mapping = {
+            "model.embed_tokens_extend.audio_embed.audio_projection.vision.": "embed_tokens_extend.audio_projection_for_vision.",
+            "model.embed_tokens_extend.audio_embed.audio_projection.speech.": "embed_tokens_extend.audio_projection.",
+            "model.embed_tokens_extend.audio_embed.": "embed_tokens_extend.",
+            "model.embed_tokens_extend.image_embed.": "vision_encoder.",
+            "model.": "language_model.model.",
+        }
+
+        skip_list = [
+            "img_processor.encoder.layers.26",
+            "img_processor.head",
+            "img_processor.post_layernorm",
+        ]
+
+        def _should_skip(name: str) -> bool:
+            return any(substr in name for substr in skip_list)
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            # Skip the last layer
+            if _should_skip(name):
+                continue
+
+            for old_name, new_name in prefix_mapping.items():
+                if name.startswith(old_name):
+                    name = name.replace(old_name, new_name)
+                    break
+
+            # Adapt to VisionAttention
+            name = name.replace(r"self_attn.out_proj", r"self_attn.proj")
+            name = name.replace(r"base_layer.", r"")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict.get(name)
+                if param is None:
+                    if "lora" not in name:
+                        logger.warning("Warning: {name} not found in model parameters")
+                    continue
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [Phi4MMForCausalLM]
diff --git a/python/sglang/srt/models/phi4mm_audio.py b/python/sglang/srt/models/phi4mm_audio.py
new file mode 100644
index 000000000..fd199836e
--- /dev/null
+++ b/python/sglang/srt/models/phi4mm_audio.py
@@ -0,0 +1,1260 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#!/usr/bin/env python3
+import abc
+import math
+from typing import Literal, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    CheckpointWrapper,
+)
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
+from transformers import PretrainedConfig
+
+from sglang.srt.models.phi4mm_utils import (
+    AbsolutePositionalEncoding,
+    ConvModule,
+    FeedForward,
+    MeanVarianceNormLayer,
+    MultiHeadedAttention,
+    MultiSequential,
+    NemoConvSubsampling,
+    T5RelativeAttentionLogitBias,
+    adaptive_enc_mask,
+    get_offset,
+    unfold_tensor,
+)
+
+_AUDIO_PLACEHOLDER_TOKEN_ID = 200011  # <|endoftext11|>
+
+
+class ConformerEncoderLayer(nn.Module):
+    """ConformerEncoder Layer module.
+    for more details see conformer paper:
+        https://arxiv.org/abs/2005.08100
+    This module implement the Conformer block layer.
+
+    Args:
+        d_model: int
+            attention dim.
+        ext_pw_out_channel: int
+            if > 0, ext_pw_out_channel is a dim channel size
+             for the last pointwise conv after swish activation.
+        depthwise_seperable_out_channel: int
+            if set different to 0, the number of
+             depthwise_seperable_out_channel will be used as a
+             channel_out of the second conv1d layer.
+             otherwise, it equal to 0, the second conv1d layer is skipped.
+        depthwise_multiplier: int
+            number of input_dim channels duplication. this value
+             will be used to compute the hidden channels of the Conv1D.
+        n_head: int
+            the number of heads for multihead attention module.
+        d_ffn: int
+            output size of the feed_forward blocks.
+        ext_pw_kernel_size: int
+            kernel size of the conv pointwise of the conformer.
+        kernel_size: int
+            kernel size.
+        dropout_rate: float
+            dropout rate.
+        causal: bool, optional
+            if set to True, convolution have no access
+             to future frames. default False.
+        batch_norm: bool, optional
+            if set to True, apply batchnorm before activation
+            in ConvModule layer of the conformer.
+            default False
+        activation: str, optional
+            activation function name,
+            one of ["relu", "swish", "sigmoid"],
+            sigmoid activation is only used with "glu_in_fnn=True",
+            default "relu".
+        chunk_se: int, optional
+            0 for offline SE.
+            1 for streaming SE, where mean is computed
+             by accumulated history until current chunk_se.
+            2 for streaming SE, where mean is computed
+             by only the current chunk.
+            default 0.
+        chunk_size: int, optional
+            chunk_size for cnn. default 18
+        conv_activation: str, optional
+            activation function used in ConvModule part
+            of the conformer, default "relu".
+        conv_glu_type: str, optional
+            activation function used for the glu inside
+            the ConvModule part of the conformer.
+            default: "sigmoid".
+        bias_in_glu: bool, optional
+            if set to True, use additive bias in the weight module
+             before GLU.
+        linear_glu_in_convm: bool, optional
+            if set to True, use GLULinear module,
+             otherwise, used GLUPointWiseConv module.
+              default to False.
+        attention_inner_dim: int, optional
+            if equal to -1, attention dim for linears k/q/v is
+            equal to d_model. otherwise attention_inner_dim is used.
+            default -1.
+        attention_glu_type: str, optional
+            activation function for glu used in the multihead attention,
+             default "swish".
+        activation_checkpointing: str, optional
+            a dictionarry of {"module","interval","offload"}, where
+                "module": str
+                    accept ["transformer", "attention"] to select
+                    which module should do activation checkpointing.
+                "interval": int, default 1,
+                    interval of applying activation checkpointing,
+                    interval = 1 means that we apply checkpointing
+                    on every layer (if activation), otherwise,
+                    we apply it every x interval.
+                "offload": bool, default False,
+                    if set to True, we offload activation to cpu and
+                    reload it during backward, otherwise,
+                    we recalculate activation in backward.
+            default "".
+        export: bool, optional
+            if set to True, it remove the padding from convolutional layers
+             and allow the onnx conversion for inference.
+              default False.
+        use_pt_scaled_dot_product_attention: bool, optional
+            if set to True, use pytorch's scaled dot product attention
+            implementation in training.
+        attn_group_sizes: int, optional
+            the number of groups to use for attention, default 1
+            (Multi-Head Attention),
+            1 = typical Multi-Head Attention,
+            1 < attn_group_sizes < attention_heads = Grouped-Query Attention
+            attn_group_sizes = attention_heads = Multi-Query Attention
+    """
+
+    def __init__(
+        self,
+        d_model=512,
+        ext_pw_out_channel=0,
+        depthwise_seperable_out_channel=256,
+        depthwise_multiplier=1,
+        n_head=4,
+        d_ffn=2048,
+        ext_pw_kernel_size=1,
+        kernel_size=3,
+        dropout_rate=0.1,
+        causal=False,
+        batch_norm=False,
+        activation="relu",
+        chunk_se=0,
+        chunk_size=18,
+        conv_activation="relu",
+        conv_glu_type="sigmoid",
+        bias_in_glu=True,
+        linear_glu_in_convm=False,
+        attention_inner_dim=-1,
+        attention_glu_type="swish",
+        activation_checkpointing="",
+        export=False,
+        use_pt_scaled_dot_product_attention=False,
+        attn_group_sizes: int = 1,
+    ):
+        super().__init__()
+
+        self.feed_forward_in = FeedForward(
+            d_model=d_model,
+            d_inner=d_ffn,
+            dropout_rate=dropout_rate,
+            activation=activation,
+            bias_in_glu=bias_in_glu,
+        )
+
+        self.self_attn = MultiHeadedAttention(
+            n_head,
+            d_model,
+            dropout_rate,
+            attention_inner_dim,
+            attention_glu_type,
+            bias_in_glu,
+            use_pt_scaled_dot_product_attention=use_pt_scaled_dot_product_attention,
+            group_size=attn_group_sizes,
+        )
+        self.conv = ConvModule(
+            d_model,
+            ext_pw_out_channel,
+            depthwise_seperable_out_channel,
+            ext_pw_kernel_size,
+            kernel_size,
+            depthwise_multiplier,
+            dropout_rate,
+            causal,
+            batch_norm,
+            chunk_se,
+            chunk_size,
+            conv_activation,
+            conv_glu_type,
+            bias_in_glu,
+            linear_glu_in_convm,
+            export=export,
+        )
+
+        self.feed_forward_out = FeedForward(
+            d_model=d_model,
+            d_inner=d_ffn,
+            dropout_rate=dropout_rate,
+            activation=activation,
+            bias_in_glu=bias_in_glu,
+        )
+
+        self.layer_norm_att = nn.LayerNorm(d_model)
+        self.layer_norm = nn.LayerNorm(d_model)
+
+    def forward(
+        self,
+        x,
+        pos_k,
+        pos_v,
+        mask,
+        relative_attention_bias: Optional[Tensor] = None,
+    ):
+        """ConformerEncoder forward.
+
+        Args:
+            x: torch.Tensor
+                input feature of shape (batch, max_time_in, size)
+            pos_k: torch.Tensor
+                positional key embedding.
+            mask: torch.Tensor
+                mask for x (batch, max_time_in)
+            relative_attention_bias: Optional[torch.Tensor]
+                bias added to attention logits w.r.t. relative positions
+                (1, n_head, time1, time2)
+        """
+        x = x + 0.5 * self.feed_forward_in(x)
+        norm_x = self.layer_norm_att(x)
+
+        x = x + self.self_attn(
+            norm_x,
+            norm_x,
+            norm_x,
+            pos_k,
+            pos_v,
+            mask,
+            relative_attention_bias=relative_attention_bias,
+        )
+        x = x + self.conv(x)
+        x = x + 0.5 * self.feed_forward_out(x)
+
+        out = self.layer_norm(x)
+
+        return out, pos_k, pos_v, mask
+
+
+class TransformerEncoderBase(abc.ABC, nn.Module):
+    """The Base class for Transformer based encoders
+
+    Please set causal = True in streaming model
+    Args:
+        input_size: int
+            input feature dimension.
+        chunk_size: int, list(int)
+            Number of frames for each chunk
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training
+            Some examples for the 2 cases:
+            chunk_size = 12
+            chunk_size = [6, 8, 12, 24]
+        left_chunk: int, list(int)
+            Number of chunks used for masking in streaming mode.
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training. When
+            chunk_size is a list, left_chunk must be a list with same length.
+            Some examples for the 2 cases:
+            left_chunk = 6
+            left_chunk = [12, 9, 6, 3]
+        attention_dim: int, optional
+            attention dimension. default 256.
+        attention_heads: int, optional
+            the number of heads. default 4
+        input_layer: str, optional
+            input layer type before Conformer,
+            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
+            default "conv2d"
+        cnn_out: int, optional
+            the number of CNN channels before Conformer.
+            default -1.
+        cnn_layer_norm: bool, optional
+            layer norm between Conformer and the first CNN.
+            default False.
+        time_reduction: int, optional
+            time reduction factor
+            default 4
+        dropout_rate: float, optional
+            dropout rate. default 0.1
+        padding_idx: int, optional
+            padding index for input_layer=embed
+            default -1
+        relative_attention_bias_args: dict, optional
+            use more efficient scalar bias-based relative multihead attention
+            (Q*K^T + B) implemented in cmb.basics.embedding.
+            [T5/ALiBi]RelativeAttentionLogitBias
+            usage: relative_attention_bias_args={"type": t5/alibi}
+            additional method-specific arguments can be provided (see
+            transformer_base.py)
+        positional_dropout_rate: float, optional
+            dropout rate after positional encoding. default 0.0
+        nemo_conv_settings: dict, optional
+            A dictionary of settings for NeMo Subsampling.
+            default None
+        conv2d_extra_padding: str, optional
+            Add extra padding in conv2d subsampling layers. Choices are
+            (feat, feat_time, none, True).
+            if True or feat_time, the extra padding is added into non full
+            supraframe utts in batch.
+            Default: none
+        attention_group_size: int, optional
+            the number of groups to use for attention, default 1
+            (Multi-Head Attention),
+            1 = typical Multi-Head Attention,
+            1 < attention_group_size < attention_heads = Grouped-Query
+            Attention
+            attention_group_size = attention_heads = Multi-Query Attention
+    """
+
+    def __init__(
+        self,
+        input_size,
+        chunk_size,
+        left_chunk,
+        attention_dim=256,
+        attention_heads=4,
+        input_layer="nemo_conv",
+        cnn_out=-1,
+        cnn_layer_norm=False,
+        time_reduction=4,
+        dropout_rate=0.0,
+        padding_idx=-1,
+        relative_attention_bias_args=None,
+        positional_dropout_rate=0.0,
+        nemo_conv_settings=None,
+        conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
+        attention_group_size=1,
+        encoder_embedding_config=None,
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.input_layer = input_layer
+        self.chunk_size = chunk_size
+        self.left_chunk = left_chunk
+        self.attention_dim = attention_dim
+        self.num_heads = attention_heads
+        self.attention_group_size = attention_group_size
+        self.time_reduction = time_reduction
+        self.nemo_conv_settings = nemo_conv_settings
+        self.encoder_embedding_config = encoder_embedding_config
+
+        if self.input_layer == "nemo_conv":
+            default_nemo_conv_settings = {
+                "subsampling": "dw_striding",
+                "subsampling_factor": self.time_reduction,
+                "feat_in": input_size,
+                "feat_out": attention_dim,
+                "conv_channels": 256,
+                "subsampling_conv_chunking_factor": 1,
+                "activation": nn.ReLU(),
+                "is_causal": False,
+            }
+            # Override any of the defaults with the incoming, user settings
+            if nemo_conv_settings:
+                default_nemo_conv_settings.update(nemo_conv_settings)
+                for i in ["subsampling_factor", "feat_in", "feat_out"]:
+                    assert (
+                        i not in nemo_conv_settings
+                    ), "{i} should be specified outside of the NeMo dictionary"
+
+            self.embed = NemoConvSubsampling(
+                **default_nemo_conv_settings,
+            )
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+
+        self.pos_emb = AbsolutePositionalEncoding(
+            attention_dim, positional_dropout_rate
+        )
+
+        self.relative_attention_bias_type = (
+            relative_attention_bias_args.get("type")
+            if relative_attention_bias_args
+            else None
+        )
+        if self.relative_attention_bias_type == "t5":
+            assert (
+                self.num_heads % self.attention_group_size == 0
+            ), "attention_group_size must divide n_head"
+            self.relative_attention_bias_layer = T5RelativeAttentionLogitBias(
+                self.num_heads // self.attention_group_size,
+                max_distance=relative_attention_bias_args.get(
+                    "t5_bias_max_distance", 1000
+                ),
+                symmetric=relative_attention_bias_args.get("t5_bias_symmetric", False),
+            )
+        else:
+            raise NotImplementedError
+
+        self.encoder_embedding = MeanVarianceNormLayer(
+            self.encoder_embedding_config["input_size"]
+        )
+
+    def compute_lens_change(self, feature_lens):
+        """feature_lens: int
+        return updated feature lens.
+
+        This used to return a different lambda function for each case that
+        computed the right thing.  That does not work within Torchscript.
+        If you really need this to be faster, create nn.Module()-s for all
+        the cases and return one of them.  Torchscript does support that.
+        """
+        if self.input_layer == "nemo_conv":
+            # Handle the special causal case
+            subsampling_causal_cond = self.nemo_conv_settings.get(
+                "subsampling", "dw_striding"
+            ) in [
+                "dw_striding",
+                "striding",
+                "striding_conv1d",
+            ]
+            is_causal = self.nemo_conv_settings.get("is_causal", False)
+            if is_causal and subsampling_causal_cond:
+                lens_change = (
+                    torch.ceil(feature_lens / self.time_reduction).long()
+                    if isinstance(feature_lens, Tensor)
+                    else math.ceil(feature_lens / self.time_reduction)
+                )
+                feature_lens_remainder = feature_lens % self.time_reduction
+                if isinstance(feature_lens, Tensor):
+                    lens_change[feature_lens_remainder != 1] += 1
+                elif feature_lens_remainder != 1:
+                    lens_change += 1
+                return lens_change
+            ceil_func = math.ceil if isinstance(feature_lens, int) else torch.ceil
+            return ceil_func(feature_lens / self.time_reduction)
+
+    @abc.abstractmethod
+    def forward(self):
+        """Abstract forward method implementation."""
+
+    def _chunk_size_selection(self, chunk_size=None, left_chunk=None):
+        """If chunk size is a list, we will randomly select a chunk size."""
+
+        if chunk_size is None:
+            chunk_size = self.chunk_size
+        if left_chunk is None:
+            left_chunk = self.left_chunk
+        if isinstance(chunk_size, list):
+            # Variable chunk size during training
+            chunk_size_index = int(
+                torch.randint(low=0, high=len(chunk_size), size=(1,))
+            )
+            chunk_size_train_eff = chunk_size[chunk_size_index]
+            if not isinstance(left_chunk, list):
+                raise ValueError(
+                    "Since chunk_size is a list, left_chunk must be a list"
+                )
+            if len(left_chunk) != len(chunk_size):
+                raise ValueError(
+                    "The length of left_chunk must be the same as length of "
+                    "chunk_size."
+                )
+            left_chunk_train_eff = left_chunk[chunk_size_index]
+        else:
+            chunk_size_train_eff = chunk_size
+            left_chunk_train_eff = left_chunk
+
+        return chunk_size_train_eff, left_chunk_train_eff
+
+    def _get_embed_class(self, embed):
+        # pylint: disable=protected-access
+        is_embed_using_act_chkpt = isinstance(embed, CheckpointWrapper)
+        is_embed_fsdp_wrapped = isinstance(embed, FullyShardedDataParallel)
+        embed_class = embed
+        if is_embed_using_act_chkpt:
+            embed_class = embed._checkpoint_wrapped_module
+        if is_embed_fsdp_wrapped:
+            embed_class = embed.module
+        return embed_class
+
+    def _forward_embeddings_core(self, input_tensor, masks):
+        embed_class = self._get_embed_class(self.embed)
+        assert isinstance(embed_class, NemoConvSubsampling)
+        input_tensor, masks = self.embed(input_tensor, masks)
+        return input_tensor, masks
+
+    def _position_embedding(self, input_tensor):
+        pos_k = None
+        pos_v = None
+        if self.relative_attention_bias_layer is None:
+            input_tensor = self.pos_emb(
+                input_tensor
+            )  # default to add abs sinusoid embedding
+        return pos_k, pos_v
+
+    def _streaming_mask(self, seq_len, batch_size, chunk_size, left_chunk):
+        chunk_size_train_eff, left_chunk_train_eff = self._chunk_size_selection(
+            chunk_size, left_chunk
+        )
+
+        # Create mask matrix for streaming
+        # S stores start index. if chunksize is 18, s is [0,18,36,....]
+        chunk_start_idx = np.arange(0, seq_len, chunk_size_train_eff)
+
+        enc_streaming_mask = (
+            adaptive_enc_mask(
+                seq_len, chunk_start_idx, left_window=left_chunk_train_eff
+            )
+            .unsqueeze(0)
+            .expand([batch_size, -1, -1])
+        )
+        return enc_streaming_mask
+
+    def forward_embeddings(self, xs_pad, masks, chunk_size_nc=None, left_chunk_nc=None):
+        """Forwarding the inputs through the top embedding layers
+
+        Args:
+            xs_pad: torch.Tensor
+                input tensor
+            masks: torch.Tensor
+                input mask
+            chunk_size_nc: (optional, default is None) chunk size for
+                            non-causal layers
+            left_chunk_nc: (optional, default is None) # of left chunks for
+                            non-causal layers
+        """
+        # pylint: disable=R0915
+        # get new lens.
+        seq_len = int(self.compute_lens_change(xs_pad.shape[1]))
+        if seq_len <= 0:
+            raise ValueError(
+                f"""The sequence length after time reduction is invalid:
+                {seq_len}. Your input feature is too short. Consider
+                filtering out the very short sentence from data
+                loader""",
+            )
+
+        batch_size = xs_pad.shape[0]
+
+        enc_streaming_mask = self._streaming_mask(
+            seq_len, batch_size, self.chunk_size, self.left_chunk
+        )
+
+        if xs_pad.is_cuda:
+            enc_streaming_mask = enc_streaming_mask.cuda()
+            xs_pad = xs_pad.cuda()
+
+        input_tensor = xs_pad
+        input_tensor, masks = self._forward_embeddings_core(input_tensor, masks)
+
+        streaming_mask = enc_streaming_mask
+        if streaming_mask is not None and masks is not None:
+            hs_mask = masks & streaming_mask
+        elif masks is not None:
+            hs_mask = masks
+        else:
+            hs_mask = streaming_mask
+
+        if chunk_size_nc is not None:
+            enc_streaming_mask_nc = self._streaming_mask(
+                seq_len, batch_size, chunk_size_nc, left_chunk_nc
+            )
+            if xs_pad.is_cuda:
+                enc_streaming_mask_nc = enc_streaming_mask_nc.cuda()
+            if masks is not None:
+                hs_mask_nc = masks & enc_streaming_mask_nc
+            else:
+                hs_mask_nc = enc_streaming_mask_nc
+        else:
+            hs_mask_nc = None
+
+        pos_k, pos_v = self._position_embedding(input_tensor)
+
+        if chunk_size_nc is None:
+            return input_tensor, pos_k, pos_v, hs_mask, masks
+        return input_tensor, pos_k, pos_v, hs_mask, masks, hs_mask_nc
+
+    def get_offset(self):
+        """Returns offset used when retaining inputs for decoding.
+
+        This is essentially, how many additional frames have to be added to
+        the front-end CNN input to ensure it can produce a single output.
+        So if the "padding" parameter is 0, typically offset will be > 0.
+        """
+        return get_offset(self.input_layer, self.time_reduction)
+
+
+class ConformerEncoder(TransformerEncoderBase):
+    """ConformerEncoder module.
+    see original paper for more details:
+        https://arxiv.org/abs/2005.08100
+
+    Please set causal = True in streaming model
+    Args:
+        input_size: int
+            input feature dimension.
+        chunk_size: int, list(int)
+            Number of frames for each chunk
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training
+            Some examples for the 2 cases:
+            chunk_size = 12
+            chunk_size = [6, 8, 12, 24]
+        left_chunk: int, list(int)
+            Number of chunks used for masking in streaming mode.
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training. When
+            chunk_size is a list, left_chunk must be a list with same length.
+            Some examples for the 2 cases:
+            left_chunk = 6
+            left_chunk = [12, 9, 6, 3]
+        left_chunk: int
+            number of chunks used for masking in streaming mode.
+        num_lang: int
+            This parameter is used to store the number of languages in the
+            lang_dict, only used for multiseed/multilingual models.
+            default None.
+        attention_dim: int, optional
+            attention dimension. default 256.
+        attention_heads: int, optional
+            the number of heads. default 4
+        linear_units:
+            the number of units of position-wise feed forward.
+            default 2048
+        num_block:
+            number of Transformer layer. default 6
+        dropout_rate: float, optional
+            dropout rate. default 0.1
+        input_layer: str, optional
+            input layer type before Conformer,
+            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
+            default "conv2d"
+        causal: bool, optional
+            if set to True, convolution have no access
+             to future frames. default False.
+        batch_norm: bool, optional
+            if set to True, apply batchnorm before activation
+            in ConvModule layer of the conformer.
+            default False
+        cnn_out: int, optional
+            the number of CNN channels before Conformer.
+            default -1.
+        cnn_layer_norm: bool, optional
+            layer norm between Conformer and the first CNN.
+            default False.
+        ext_pw_out_channel: int, optional
+            the number of channel for CNN
+            before depthwise_seperable_CNN.
+            If 0 then use linear. default 0.
+        ext_pw_kernel_size: int, optional
+            kernel size of N before depthwise_seperable_CNN.
+            only work for ext_pw_out_channel > 0.
+            default 1
+        depthwise_seperable_out_channel: int, optional
+            the number of channel for
+            depthwise_seperable_CNN.
+            default 256.
+        depthwise_multiplier: int, optional
+            the number of multiplier for
+            depthwise_seperable_CNN.
+            default 1.
+        chunk_se: int, optional
+            0 for offline SE.
+            1 for streaming SE, where mean is computed
+             by accumulated history until current chunk_se.
+            2 for streaming SE, where mean is computed
+             by only the current chunk.
+            default 0.
+        kernel_size: int, optional
+            the number of kernels for depthwise_seperable_CNN.
+            default 3.
+        activation: str, optional
+            FeedForward block activation.
+            one of ["relu", "swish", "sigmoid"]
+            default "relu".
+        conv_activation: str, optional
+            activation function used in ConvModule part
+            of the conformer, default "relu".
+        conv_glu_type: str, optional
+            activation used use glu in depthwise_seperable_CNN,
+            default "sigmoid"
+        bias_in_glu: bool, optional
+            if set to True, use additive bias in the weight module
+             before GLU. default True
+        linear_glu_in_convm: bool, optional
+            if set to True, use GLULinear module,
+             otherwise, used GLUPointWiseConv module.
+              default to False.
+        attention_glu_type: str
+            only work for glu_in_attention !=0
+            default "swish".
+        export: bool, optional
+            if set to True, it remove the padding from convolutional layers
+             and allow the onnx conversion for inference.
+              default False.
+        activation_checkpointing: str, optional
+            a dictionarry of {"module","interval","offload"}, where
+                "module": str
+                    accept ["transformer", "attention"] to select
+                    which module should do activation checkpointing.
+                "interval": int, default 1,
+                    interval of applying activation checkpointing,
+                    interval = 1 means that we apply checkpointing
+                    on every layer (if activation), otherwise,
+                    we apply it every x interval.
+                "offload": bool, default False,
+                    if set to True, we offload activation to cpu and
+                    reload it during backward, otherwise,
+                    we recalculate activation in backward.
+            default "".
+        extra_layer_output_idx: int
+            the layer index to be exposed.
+        relative_attention_bias_args: dict, optional
+            use more efficient scalar bias-based relative multihead attention
+            (Q*K^T + B) implemented in cmb.basics.embedding.
+            [T5/ALiBi]RelativeAttentionLogitBias
+            usage: relative_attention_bias_args={"type": t5/alibi}
+            additional method-specific arguments can be provided (see
+            transformer_base.py)
+        time_reduction: int optional
+            time reduction factor
+            default 4
+        use_pt_scaled_dot_product_attention: whether to use pytorch scaled
+            dot product attention in training.
+            Default: False
+        nemo_conv_settings: dict, optional
+            A dictionary of settings for NeMo Subsampling.
+            default: None
+            usage: nemo_conv_settings=
+                {
+                    "subsampling":
+                    dw_striding/striding/dw_striding_conv1d/striding_conv1d,
+                    "conv_channels": int,
+                    "subsampling_conv_chunking_factor": int,
+                    "is_causal": True/False
+                }
+        conv2d_extra_padding: str, optional
+            Add extra padding in conv2d subsampling layers. Choices are
+            (feat, feat_time, none, True)
+            Default: none
+        replication_pad_for_subsample_embedding:  For batched-streaming
+            decoding, use "replication" padding for the cache at start of
+            utterance.
+            Default: False
+        attention_group_size: int, optional
+            the number of groups to use for attention, default 1
+            (Multi-Head Attention),
+            1 = typical Multi-Head Attention,
+            1 < attention_group_size < attention_heads = Grouped-Query
+            Attention
+            attention_group_size = attention_heads = Multi-Query Attention
+    """
+
+    extra_multi_layer_output_idxs: list[int]
+
+    def __init__(  # pylint: disable-all
+        self,
+        input_size,
+        chunk_size,
+        left_chunk,
+        num_lang=None,
+        attention_dim=256,
+        attention_heads=4,
+        linear_units=2048,
+        num_blocks=6,
+        dropout_rate=0.1,
+        input_layer="nemo_conv",
+        causal=True,
+        batch_norm=False,
+        cnn_out=-1,
+        cnn_layer_norm=False,
+        ext_pw_out_channel=0,
+        ext_pw_kernel_size=1,
+        depthwise_seperable_out_channel=256,
+        depthwise_multiplier=1,
+        chunk_se=0,
+        kernel_size=3,
+        activation="relu",
+        conv_activation="relu",
+        conv_glu_type="sigmoid",
+        bias_in_glu=True,
+        linear_glu_in_convm=False,
+        attention_glu_type="swish",
+        export=False,
+        extra_layer_output_idx=-1,
+        extra_multi_layer_output_idxs=[],  # noqa
+        activation_checkpointing="",
+        relative_attention_bias_args=None,
+        time_reduction=4,
+        use_pt_scaled_dot_product_attention=False,
+        nemo_conv_settings=None,
+        conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
+        replication_pad_for_subsample_embedding=False,
+        attention_group_size=1,
+        encoder_embedding_config=None,
+    ):
+        super().__init__(
+            input_size,
+            chunk_size,
+            left_chunk,
+            attention_dim,
+            attention_heads,
+            input_layer,
+            cnn_out,
+            cnn_layer_norm,
+            time_reduction,
+            dropout_rate=dropout_rate,
+            relative_attention_bias_args=relative_attention_bias_args,
+            positional_dropout_rate=0.0,
+            nemo_conv_settings=nemo_conv_settings,
+            conv2d_extra_padding=conv2d_extra_padding,
+            attention_group_size=attention_group_size,
+            encoder_embedding_config=encoder_embedding_config,
+        )
+        self.num_blocks = num_blocks
+        self.num_lang = num_lang
+        self.kernel_size = kernel_size
+        self.replication_pad_for_subsample_embedding: bool = (
+            replication_pad_for_subsample_embedding
+        )
+        assert (
+            self.num_heads % attention_group_size == 0
+        ), "attention_group_size must divide n_head"
+        self.num_heads_k = self.num_heads // attention_group_size
+
+        self.encoders = MultiSequential(
+            *[
+                ConformerEncoderLayer(
+                    d_model=attention_dim,
+                    ext_pw_out_channel=ext_pw_out_channel,
+                    depthwise_seperable_out_channel=depthwise_seperable_out_channel,
+                    depthwise_multiplier=depthwise_multiplier,
+                    n_head=attention_heads,
+                    d_ffn=linear_units,
+                    ext_pw_kernel_size=ext_pw_kernel_size,
+                    kernel_size=kernel_size,
+                    dropout_rate=dropout_rate,
+                    causal=causal,
+                    batch_norm=batch_norm,
+                    activation=activation,
+                    chunk_se=chunk_se,
+                    chunk_size=chunk_size,
+                    conv_activation=conv_activation,
+                    conv_glu_type=conv_glu_type,
+                    bias_in_glu=bias_in_glu,
+                    linear_glu_in_convm=linear_glu_in_convm,
+                    attention_glu_type=attention_glu_type,
+                    activation_checkpointing=activation_checkpointing,
+                    export=export,
+                    use_pt_scaled_dot_product_attention=use_pt_scaled_dot_product_attention,
+                    attn_group_sizes=attention_group_size,
+                )
+                for _ in range(num_blocks)
+            ]
+        )
+        self.extra_layer_output_idx = extra_layer_output_idx
+        self.extra_multi_layer_output_idxs = extra_multi_layer_output_idxs
+        # Make a zeros scalar we can use in get_initial_state to determine
+        # the device and the needed dtype:
+        self.register_buffer("dev_type", torch.zeros(()), persistent=False)
+
+    def init_relative_attention_bias(self, input_tensor):
+        if self.relative_attention_bias_layer:
+            return self.relative_attention_bias_layer(input_tensor)
+
+    def calculate_hs_mask(self, xs_pad, device, mask):
+        max_audio_length = xs_pad.shape[1]
+        batch_size = xs_pad.shape[0]
+        enc_streaming_mask = self._streaming_mask(
+            max_audio_length, batch_size, self.chunk_size, self.left_chunk
+        )
+        enc_streaming_mask = enc_streaming_mask.to(device)
+        if mask is None:
+            return enc_streaming_mask
+
+        feature_lens = mask.sum(1)
+        padding_length = feature_lens
+        pad_mask = torch.arange(0, max_audio_length, device=device).expand(
+            padding_length.size(0), -1
+        ) < padding_length.unsqueeze(1)
+        pad_mask = pad_mask.unsqueeze(1)
+        pad_mask = pad_mask & enc_streaming_mask
+        return pad_mask
+
+    @torch.jit.ignore
+    def forward(self, xs_pad, masks):
+        """Conformer Forward function
+
+        Args:
+            xs_pad: torch.Tensor
+                input tensor
+            masks: torch.Tensor
+                post-embedding input lengths
+        """
+        xs_pad = self.encoder_embedding(xs_pad)
+        input_tensor, pos_k, pos_v, hs_mask, masks = self.forward_embeddings(
+            xs_pad, masks
+        )
+
+        unfolded = False
+        ori_bz, seq_len, D = input_tensor.shape
+        max_seq_len = 500  # maximum position for absolute positional encoding
+        if seq_len > max_seq_len:
+            # audio sequence is longer than max_seq_len, unfold it into chunks
+            # of max_seq_len
+            unfolded = True
+            # the unfold op will drop residual frames, pad it to the multiple
+            # of max_seq_len
+            if seq_len % max_seq_len > 0:
+                chunk_pad_size = max_seq_len - (seq_len % max_seq_len)
+            else:
+                chunk_pad_size = 0
+            if chunk_pad_size > 0:
+                input_tensor_pad = F.pad(
+                    input_tensor, (0, 0, 0, chunk_pad_size), "constant", 0
+                )
+                input_tensor = input_tensor_pad.to(input_tensor.device)
+            input_tensor = unfold_tensor(input_tensor, max_seq_len)
+            if masks is not None:
+                # revise hs_mask here because the previous calculated hs_mask
+                # did not consider extra pad
+                subsampled_pad_mask = masks.squeeze(
+                    1
+                )  # [bz, subsampled_unmask_seq_len]
+                extra_padded_subsamlped_pad_mask = F.pad(
+                    subsampled_pad_mask, (0, chunk_pad_size), "constant", False
+                )  # extra padding to the pad mask
+                extra_padded_subsamlped_pad_mask = (
+                    extra_padded_subsamlped_pad_mask.unsqueeze(-1).float()
+                )
+                masks_unfold = unfold_tensor(
+                    extra_padded_subsamlped_pad_mask, max_seq_len
+                )  # unfold the pad mask like we did to the input tensor
+                masks_unfold = masks_unfold.squeeze(
+                    -1
+                ).bool()  # unfold op does not support bool tensor
+            else:
+                masks_unfold = None
+            hs_mask = self.calculate_hs_mask(
+                input_tensor, input_tensor.device, masks_unfold
+            )  # calculate hs_mask based on the unfolded pad mask
+
+        # layer_emb = None
+
+        relative_attention_bias = self.init_relative_attention_bias(input_tensor)
+
+        _simplified_path = (
+            self.extra_layer_output_idx == -1 and relative_attention_bias is None
+        )
+
+        if _simplified_path:
+            input_tensor, *_ = self.encoders(input_tensor, pos_k, pos_v, hs_mask)
+        else:
+            for i, layer in enumerate(self.encoders):
+                input_tensor, _, _, _ = layer(
+                    input_tensor,
+                    pos_k,
+                    pos_v,
+                    hs_mask,
+                    relative_attention_bias=relative_attention_bias,
+                )
+
+                # if i == self.extra_layer_output_idx:
+                #     layer_emb = input_tensor
+
+        if unfolded:
+            embed_dim = input_tensor.shape[-1]
+            input_tensor = input_tensor.reshape(ori_bz, -1, embed_dim)
+            # if we ever padded before unfolding, we need to remove the padding
+            if chunk_pad_size > 0:
+                input_tensor = input_tensor[:, :-chunk_pad_size, :]
+
+        return input_tensor, masks  # , layer_emb
+
+
+class WindowQformer(nn.Module):
+    """Window-level Qformer"""
+
+    def __init__(
+        self,
+        window_size: int = 8,
+        num_queries: int = 1,
+        num_blocks: int = 2,
+        attention_dim: int = 512,
+        attention_heads: int = 8,
+        linear_units: int = 2048,
+        dropout_rate: float = 0.0,
+        normalize_before: bool = True,
+    ):
+        super().__init__()
+
+        self.decoders = nn.ModuleList(
+            [
+                nn.TransformerDecoderLayer(
+                    d_model=attention_dim,
+                    nhead=attention_heads,
+                    dim_feedforward=linear_units,
+                    dropout=dropout_rate,
+                    activation="relu",
+                    batch_first=True,
+                    norm_first=normalize_before,  # TODO need to verify
+                )
+                for _ in range(num_blocks)
+            ]
+        )
+
+        self.queries = nn.Parameter(torch.zeros(1, num_queries, attention_dim))
+        self.after_norm = (
+            nn.LayerNorm(attention_dim, eps=1e-12) if normalize_before else None
+        )
+        self.window_size = window_size
+
+    def forward(self, audio_embed, mask, embed_len=None):
+        """forward decoder"""
+        # audio_embed: N x T x D => N x D x T
+
+        audio_embed = audio_embed.transpose(1, 2)
+        # audio_embed: N x D x 1 x T => N x DK x T'
+        padding = audio_embed.shape[-1] % self.window_size
+        if padding > 0:
+            audio_embed = F.pad(
+                audio_embed, (0, self.window_size - padding), "constant", 0
+            )
+
+        embed_chunk = F.unfold(
+            audio_embed[..., None, :],
+            kernel_size=(1, self.window_size),
+            stride=(1, self.window_size),
+        )
+        bsz, _, slen = embed_chunk.shape
+        # N x D x K x T'
+        embed_chunk = embed_chunk.view(bsz, -1, self.window_size, slen)
+        # N x T' x K x D
+        embed_chunk = embed_chunk.transpose(1, 3).contiguous()
+        # NT' x K x D
+        embed_chunk = embed_chunk.view(bsz * slen, self.window_size, -1)
+        # NT' x 1 x D
+        q = self.queries.expand(bsz * slen, -1, -1)
+        for layer in self.decoders:
+            q = layer(tgt=q, memory=embed_chunk, tgt_mask=None, memory_mask=mask)
+
+        if self.after_norm is not None:
+            q = self.after_norm(q)
+
+        if embed_len is not None:
+            embed_len = embed_len // self.window_size
+        # N x T' x D
+        out = q.view(bsz, slen, -1)
+
+        return out, embed_len
+
+
+class AudioEmbedding(nn.Module):
+    """Image embedding."""
+
+    def __init__(self, config: PretrainedConfig, **kwargs) -> None:
+        super().__init__()
+        self.config = config
+        # n_embed or hidden_size for text LM
+        hidden_size = config.n_embd if hasattr(config, "n_embd") else config.hidden_size
+
+        # self.wte = nn.Embedding(config.vocab_size, hidden_size)
+
+        audio_dim_out = (
+            None  # Set this variable according to the actual audio processor
+        )
+        self.layer_idx = -2
+
+        if (
+            isinstance(config.audio_processor, dict)
+            and config.audio_processor.get("name", None) == "cascades"
+        ):
+            encoder_config = config.audio_processor.get("config", None)
+            assert encoder_config is not None
+            self.encoder = ConformerEncoder(**encoder_config)
+
+            audio_dim_out = encoder_config["attention_dim"]
+            n_mels = encoder_config["input_size"]
+        else:
+            raise NotImplementedError("")
+
+        assert audio_dim_out is not None, "Remember to set values for audio_dim_out"
+        self.audio_dim_out = audio_dim_out
+        self.audio_dim_in = n_mels
+
+        self.freeze_audio_processor = kwargs.get("freeze_audio_processor", False)
+
+        self.downsample_rate = kwargs.get("downsample_rate", 1)
+
+        if kwargs.get("use_qformer", False):
+            qformer_config = kwargs.get("qformer_config", {})
+            qformer_config["attention_dim"] = audio_dim_out
+            self.qformer = WindowQformer(**qformer_config)
+        else:
+            self.qformer = None
+
+        if kwargs.get("use_conv_downsample", False):
+            assert (
+                self.qformer is None
+            ), "don't support use qformer and conv downsample together"
+            nemo_conv_settings = kwargs.get("nemo_conv_settings", {})
+            default_nemo_conv_settings = {
+                "subsampling": "dw_striding",
+                "subsampling_factor": self.downsample_rate,
+                "feat_in": audio_dim_out,
+                "feat_out": audio_dim_out,
+                "conv_channels": 256,
+                "subsampling_conv_chunking_factor": 1,
+                "activation": nn.ReLU(),
+                "is_causal": False,
+            }
+            # Override any of the defaults with the incoming, user settings
+            if nemo_conv_settings:
+                default_nemo_conv_settings.update(nemo_conv_settings)
+                for i in ["subsampling_factor", "feat_in", "feat_out"]:
+                    assert (
+                        i not in nemo_conv_settings
+                    ), "{i} should be specified outside of the NeMo dictionary"
+
+            self.conv_ds = NemoConvSubsampling(
+                **default_nemo_conv_settings,
+            )
+        else:
+            self.conv_ds = None
+
+        projection_cls = kwargs.get("projection_cls", "linear")
+        if projection_cls == "linear":
+            self.audio_projection = nn.Linear(audio_dim_out, hidden_size)
+        elif projection_cls == "mlp":
+            # follow llava-v1.5's implementation
+            # (do not use image_projection and image_proj_norm)
+            dim_projection = hidden_size
+            depth = 2
+            self.linear_downsample_rate = (
+                1 if (self.qformer or self.conv_ds) else self.downsample_rate
+            )
+            layers = [
+                nn.Linear(audio_dim_out * self.linear_downsample_rate, dim_projection)
+            ]
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
+            self.audio_projection = nn.Sequential(*layers)
+            # NOTE vision-speech tasks use a separate projection layer
+            layers = [
+                nn.Linear(audio_dim_out * self.linear_downsample_rate, dim_projection)
+            ]
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
+            self.audio_projection_for_vision = nn.Sequential(*layers)
+        else:
+            raise NotImplementedError(
+                f"projection_cls = {projection_cls}, not implemented"
+            )
+
+        # TODO: audio sequence compression - Qformer
+        self.vocab_size = config.vocab_size
+        self.input_embeds = None
+        self.audio_embed_sizes = None
+
+    def set_audio_embeds(self, input_embeds: torch.FloatTensor) -> None:
+        self.input_embeds = input_embeds
+
+    def set_audio_embed_sizes(self, audio_embed_sizes: torch.LongTensor) -> None:
+        self.audio_embed_sizes = audio_embed_sizes
+
+    def get_audio_features(
+        self,
+        input_embeds: torch.FloatTensor,
+        audio_attention_mask: torch.Tensor = None,
+        audio_projection_mode: str = "speech",
+    ) -> torch.FloatTensor:
+        """
+        arguments:
+            input_embeds: audio features (B, T, D)  B: num audios in a sequence
+        """
+        if self.freeze_audio_processor:
+            with torch.no_grad():
+                audio_features, masks = self.encoder(input_embeds, audio_attention_mask)
+        else:
+            audio_features, masks = self.encoder(input_embeds, audio_attention_mask)
+
+        if self.qformer is not None:
+            audio_features, _ = self.qformer(audio_features, mask=None)
+
+        if self.conv_ds is not None:
+            if masks is not None:
+                masks = masks.squeeze(1)
+
+            audio_features, masks = self.conv_ds(audio_features, mask=masks)
+
+        if self.linear_downsample_rate != 1:
+            bs, seq_len, feat_dim = audio_features.size()
+            padding = seq_len % self.linear_downsample_rate
+            if padding > 0:
+                audio_features = F.pad(
+                    audio_features,
+                    (0, 0, 0, self.linear_downsample_rate - padding),
+                    "constant",
+                    0,
+                )
+
+            seq_len = audio_features.size(1)
+            audio_features = audio_features.view(
+                bs,
+                seq_len // self.linear_downsample_rate,
+                feat_dim * self.linear_downsample_rate,
+            )
+
+        if audio_projection_mode == "speech":
+            audio_set_tensor = self.audio_projection(audio_features)
+        elif audio_projection_mode == "vision":
+            audio_set_tensor = self.audio_projection_for_vision(audio_features)
+        else:
+            raise ValueError(
+                f"audio_projection_mode = {audio_projection_mode} not " "implemented"
+            )
+
+        return audio_set_tensor
+
+    def forward(
+        self,
+        audio_features: torch.FloatTensor,
+        audio_attention_mask: torch.Tensor = None,
+        audio_projection_mode: str = "speech",
+    ) -> torch.FloatTensor:
+        """
+        arguments:
+            audio_features: audio features (num_audio_tokens, T, D)
+
+        returns:
+            audio_embeds: audio embeddings (num_audio_tokens, hidden_dim)
+        """
+        audio_embeds = self.get_audio_features(
+            audio_features,
+            audio_attention_mask=audio_attention_mask,
+            audio_projection_mode=audio_projection_mode,
+        )
+        return audio_embeds
diff --git a/python/sglang/srt/models/phi4mm_utils.py b/python/sglang/srt/models/phi4mm_utils.py
new file mode 100644
index 000000000..e6bf35ebf
--- /dev/null
+++ b/python/sglang/srt/models/phi4mm_utils.py
@@ -0,0 +1,1917 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#!/usr/bin/env python3
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+
+class BlockBase(nn.Module):
+    """Block abstract module"""
+
+    def __init__(self, input_size, output_size):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+
+
+def get_activation(name="relu"):
+    """Select an activation function by name
+
+    Args:
+        name: str
+            activation function name,
+            one of ["relu", "gelu", "swish", "sigmoid"],
+            default "relu".
+    """
+    name = name.lower()
+    if name == "relu":
+        return nn.ReLU(inplace=True)
+    if name == "gelu":
+        return nn.GELU()
+    if name == "swish":
+        return Swish()
+    if name == "sigmoid":
+        return torch.nn.Sigmoid()
+    return nn.Identity()
+
+
+def adaptive_enc_mask(x_len, chunk_start_idx, left_window=0, right_window=0):
+    """
+    The function is very important for Transformer Transducer Streaming mode
+    Args:
+        xs_len (int): sequence length
+        chunk_start_idx (list): first idx of each chunk, such as [0,18,36,48].
+        It also supports adaptive chunk size [0,10,15,45]
+        left_window (int): how many left chunks can be seen
+        right_window (int): how many right chunks can be seen. It is used for
+        chunk overlap model.
+        Returns:
+            mask (torch.Tensor): a mask tensor for streaming model
+            Torch 1.0.1
+            tensor([[1., 1., 0., 0.],
+                    [0., 1., 1., 0.],
+                    [0., 0., 1., 1.]])
+            Torch 1.4.1
+            tensor([[True., True., False., False.],
+                    [False., True., True., False.],
+                    [False., False., True., True.]])
+    """
+    chunk_start_idx = torch.Tensor(
+        chunk_start_idx
+    ).long()  # first idx of each chunk, such as [0,18,36,48].
+    start_pad = torch.nn.functional.pad(
+        chunk_start_idx, (1, 0)
+    )  # append 0 to the beginning, so it becomes [0, 0, 18, 36, 48]
+    end_pad = torch.nn.functional.pad(
+        chunk_start_idx, (0, 1), value=x_len
+    )  # append x_len to the end, so it becomes [0,18,36,48, x_len]
+    seq_range = torch.arange(0, x_len).unsqueeze(-1)  # seq_range size: [x_len, 1]
+    idx = ((seq_range < end_pad) & (seq_range >= start_pad)).nonzero()[
+        :, 1
+    ]  # idx size: [x_len]
+    # boundary = end_pad[idx]  # boundary size: [x_len]
+    seq_range_expand = (
+        torch.arange(0, x_len).unsqueeze(0).expand(x_len, -1)
+    )  # seq_range_expand size [x_len, x_len]
+    idx_left = idx - left_window
+    idx_left[idx_left < 0] = 0
+    boundary_left = start_pad[idx_left]
+    mask_left = seq_range_expand >= boundary_left.unsqueeze(-1)
+    idx_right = idx + right_window
+    idx_right[idx_right > len(chunk_start_idx)] = len(chunk_start_idx)
+    boundary_right = end_pad[idx_right]
+    mask_right = seq_range_expand < boundary_right.unsqueeze(-1)
+    return mask_left & mask_right
+
+
+class Swish(nn.Module):
+    """Implement Swish activation module.
+    From https://arxiv.org/pdf/2005.03191.pdf
+
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.act_fn = nn.Sigmoid()
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply Swish function
+
+        Args:
+            x: torch.Tensor
+                Input.
+        """
+        return x * self.act_fn(x)
+
+
+class GLU(nn.Module):
+    """Implement Gated Linear Unit (GLU) module"""
+
+    def __init__(self, dim: int = -1, act_name: str = "sigmoid") -> None:
+        super().__init__()
+        self.dim = dim
+        self.act_name = act_name.lower()
+
+        if self.act_name == "relu":
+            self.act_fn = nn.ReLU(inplace=True)
+        elif self.act_name == "gelu":
+            self.act_fn = nn.GELU()
+        elif self.act_name == "swish":
+            self.act_fn = Swish()
+        elif self.act_name == "sigmoid":
+            self.act_fn = nn.Sigmoid()
+        else:
+            self.act_fn = nn.Identity()
+
+    def forward(self, x: Tensor) -> Tensor:
+        """GLU forward
+        Apply Swish function on the first half of input matrices
+        with sigmoid of the second half.
+
+        Args:
+            x: torch.Tensor
+                Input.
+
+        """
+        half_x, gate = x.chunk(2, dim=self.dim)
+        return half_x * self.act_fn(gate)
+
+
+# TODO: Abdel, this can be improved using GLU module
+class GLUPointWiseConv(nn.Module):
+    """GLUPointWiseConv module
+    used for conformer architecture,
+    for more details see:
+    https://arxiv.org/pdf/2005.08100v1.pdf
+
+    Args:
+        input_dim: int
+            input channel size.
+        output_dim: int
+            output channel size.
+        kernel_size: int
+            kernel size
+        glu_type: str, optional
+            activation function one of
+             ["sigmoid", "relu", "gelu"]
+              default "sigmoid".
+        bias_in_glu: bool, optional
+            use addtive bias in glu
+        causal: bool, optional
+            if set to True, padding is set to the half of
+             kernel size, ie, convolution can't see future frames.
+              default False.
+
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        kernel_size,
+        glu_type="sigmoid",
+        bias_in_glu=True,
+        causal=False,
+    ):
+        super().__init__()
+
+        self.glu_type = glu_type
+        self.output_dim = output_dim
+        self.bias_in_glu = bias_in_glu
+        if causal:
+            self.ext_pw_conv_1d = nn.Conv1d(
+                input_dim,
+                output_dim * 2,
+                kernel_size,
+                1,
+                padding=(kernel_size - 1),
+            )
+        else:
+            self.ext_pw_conv_1d = nn.Conv1d(
+                input_dim,
+                output_dim * 2,
+                kernel_size,
+                1,
+                padding=(kernel_size - 1) // 2,
+            )
+
+        if glu_type == "sigmoid":
+            self.glu_act = nn.Sigmoid()
+        elif glu_type == "relu":
+            self.glu_act = nn.ReLU()
+        elif glu_type == "gelu":
+            self.glu_act = nn.GELU()
+        elif glu_type == "swish":
+            self.glu_act = Swish()
+        else:
+            raise ValueError(f"Unsupported activation type {self.glu_act}")
+
+        if bias_in_glu:
+            self.b1 = nn.Parameter(torch.zeros(1, output_dim, 1))
+            self.b2 = nn.Parameter(torch.zeros(1, output_dim, 1))
+
+    def forward(self, x):
+        """
+        Args:
+            x: torch.Tensor
+                input tensor
+        """
+        # to be consistent with GLULinear, we assume the input always has the
+        # #channel (#dim) in the last dimension of the tensor, so need to
+        # switch the dimension first for 1D-Conv case
+        x = x.permute([0, 2, 1])
+        x = self.ext_pw_conv_1d(x)
+        if self.glu_type == "bilinear":
+            if self.bias_in_glu:
+                x = (x[:, 0 : self.output_dim, :] + self.b1) * (
+                    x[:, self.output_dim : self.output_dim * 2, :] + self.b2
+                )
+            else:
+                x = (x[:, 0 : self.output_dim, :]) * (
+                    x[:, self.output_dim : self.output_dim * 2, :]
+                )
+        else:
+            if self.bias_in_glu:
+                x = (x[:, 0 : self.output_dim, :] + self.b1) * self.glu_act(
+                    x[:, self.output_dim : self.output_dim * 2, :] + self.b2
+                )
+            else:
+                x = (x[:, 0 : self.output_dim, :]) * self.glu_act(
+                    x[:, self.output_dim : self.output_dim * 2, :]
+                )
+
+        x = x.permute([0, 2, 1])
+        return x
+
+
+class DepthWiseSeperableConv1d(nn.Module):
+    """DepthWiseSeperableConv1d module used in Convnet module
+    for the conformer, for more details see:
+    https://arxiv.org/pdf/2005.08100v1.pdf
+
+    Args:
+        input_dim: int
+            input channel size.
+        depthwise_seperable_out_channel: int
+            if set different to 0, the number of
+             depthwise_seperable_out_channel will be used as a channel_out
+             of the second conv1d layer.
+             otherwise, it equal to 0, the second conv1d layer is skipped.
+        kernel_size: int
+            kernel_size
+        depthwise_multiplier: int
+            number of input_dim channels duplication. this value
+            will be used to compute the hidden channels of the Conv1D.
+        padding: int, optional
+            padding for the conv1d,
+             default: 0.
+
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        depthwise_seperable_out_channel,
+        kernel_size,
+        depthwise_multiplier,
+        padding=0,
+    ):
+        super().__init__()
+
+        self.dw_conv = nn.Conv1d(
+            input_dim,
+            input_dim * depthwise_multiplier,
+            kernel_size,
+            1,
+            padding=padding,
+            groups=input_dim,
+        )
+
+        if depthwise_seperable_out_channel != 0:
+            self.pw_conv = nn.Conv1d(
+                input_dim * depthwise_multiplier,
+                depthwise_seperable_out_channel,
+                1,
+                1,
+                0,
+            )
+        else:
+            self.pw_conv = nn.Identity()
+        self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
+
+    def forward(self, x):
+        """
+
+        Args:
+            x: torch.Tensor
+                input tensor
+        """
+        x = self.dw_conv(x)
+        if self.depthwise_seperable_out_channel != 0:
+            x = self.pw_conv(x)
+        return x
+
+
+class ConvModule(nn.Module):
+    """ConvModule Module for the conformer block.
+    for more details see:
+    https://arxiv.org/pdf/2005.08100v1.pdf
+
+    Args:
+        input_dim: int
+            input channel size.
+        ext_pw_out_channel: int
+            if > 0, ext_pw_out_channel is a dim channel size
+             for the last pointwise conv after swish activation.
+        depthwise_seperable_out_channel: int
+            if set different to 0, the number of
+             depthwise_seperable_out_channel
+             will be used as a channel_out of the second conv1d layer.
+             otherwise, it equal to 0, the second conv1d layer is skipped.
+        ext_pw_kernel_size: int
+            kernel size of the conv pointwise of the conformer.
+        kernel_size: int
+            kernel size.
+        depthwise_multiplier: int
+            number of input_dim channels duplication. this value
+             will be used to compute the hidden channels of the Conv1D.
+        dropout_rate: float
+            dropout rate.
+        causal: bool, optional
+            if set to True, convolution have no access
+             to future frames. default False.
+        batch_norm: bool, optional
+            if set to True, apply batchnorm before activation.
+            default False
+        chunk_se: int, optional
+            0 for offline SE.
+            1 for streaming SE, where mean is computed
+             by accumulated history until current chunk_se.
+            2 for streaming SE, where mean is computed
+             by only the current chunk.
+        chunk_size: int, optional
+            chunk size for cnn. default 18
+        activation: str, optional
+            activation function used in ConvModule,
+            default: "relu".
+        glu_type: str, optional
+            activation function used for the glu,
+            default: "sigmoid".
+        bias_in_glu: bool, optional
+            if set to True, use additive bias in the weight module
+             before GLU.
+        linear_glu_in_convm: bool, optional
+            if set to True, use GLULinear module,
+             otherwise, used GLUPointWiseConv module.
+              default to False.
+        export: bool, optional,
+            if set to True, padding is equal to 0.  This is for inference,
+             or onnx export.  Typically this is set by the export program or
+             the decoder program, and it isn't present in your config file.
+             default False
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        ext_pw_out_channel,
+        depthwise_seperable_out_channel,
+        ext_pw_kernel_size,
+        kernel_size,
+        depthwise_multiplier,
+        dropout_rate,
+        causal=False,
+        batch_norm=False,
+        chunk_se=0,
+        chunk_size=18,
+        activation="relu",
+        glu_type="sigmoid",
+        bias_in_glu=True,
+        linear_glu_in_convm=False,
+        export=False,
+    ):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(input_dim)
+        self.input_dim = input_dim
+        self.ext_pw_out_channel = ext_pw_out_channel
+        self.ext_pw_kernel_size = ext_pw_kernel_size
+        self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
+        self.glu_type = glu_type
+        self.bias_in_glu = bias_in_glu
+        self.linear_glu_in_convm = linear_glu_in_convm
+        self.causal = causal
+
+        self._add_ext_pw_layer()
+
+        self.batch_norm = batch_norm
+        self.kernel_size = kernel_size
+
+        if batch_norm:
+            self.bn_layer = nn.BatchNorm1d(input_dim)
+
+        self.act = get_activation(activation)
+        self.dropout = nn.Dropout(dropout_rate)
+        self.export = export
+
+        if causal:
+            padding = 0 if export else kernel_size - 1
+        else:
+            padding = (kernel_size - 1) // 2
+
+        self.dw_sep_conv_1d = DepthWiseSeperableConv1d(
+            input_dim,
+            depthwise_seperable_out_channel,
+            kernel_size,
+            depthwise_multiplier,
+            padding=padding,
+        )
+
+        if depthwise_seperable_out_channel != 0:
+            if input_dim != depthwise_seperable_out_channel:
+                self.ln2 = nn.Linear(depthwise_seperable_out_channel, input_dim)
+        else:
+            if depthwise_multiplier != 1:
+                self.ln2 = nn.Linear(input_dim * depthwise_multiplier, input_dim)
+
+    def _add_ext_pw_layer(self):
+        """
+        This function is an extension of __init__ function
+        and dedicated to the convolution module creation
+        of the conformer.
+        """
+        self.ln1 = self.glu = self.bn_layer = self.ext_pw_conv_1d = (
+            nn.Identity()
+        )  # jit hacks.
+        self.squeeze_excitation = nn.Identity()  # jit.
+        self.apply_ln1 = self.fix_len1 = False  # jit.
+
+        if self.ext_pw_out_channel != 0:
+            if self.causal:
+                self.ext_pw_conv_1d = nn.Conv1d(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.ext_pw_kernel_size,
+                    1,
+                    padding=(self.ext_pw_kernel_size - 1),
+                )
+                if self.ext_pw_kernel_size > 1:
+                    self.fix_len1 = True
+                else:
+                    self.fix_len1 = False
+            else:
+                self.ext_pw_conv_1d = nn.Conv1d(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.ext_pw_kernel_size,
+                    1,
+                    padding=(self.ext_pw_kernel_size - 1) // 2,
+                )
+                self.fix_len1 = False
+
+            if self.linear_glu_in_convm:
+                self.glu = GLULinear(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.glu_type,
+                    self.bias_in_glu,
+                )
+            else:
+                self.glu = GLUPointWiseConv(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.ext_pw_kernel_size,
+                    self.glu_type,
+                    self.bias_in_glu,
+                    self.causal,
+                )
+
+            if self.input_dim != self.ext_pw_out_channel:
+                self.apply_ln1 = True
+                self.ln1 = nn.Linear(self.ext_pw_out_channel, self.input_dim)
+            else:
+                self.apply_ln1 = False
+        else:
+            self.pw_conv_simplify_w = torch.nn.Parameter(torch.ones(3))
+            self.pw_conv_simplify_b = torch.nn.Parameter(torch.zeros(3))
+
+    def forward(self, x):
+        """ConvModule Forward.
+
+        Args:
+            x: torch.Tensor
+                input tensor.
+        """
+        x = self.layer_norm(x)
+
+        if self.ext_pw_out_channel != 0:
+            x = self.glu(x)
+            if self.causal and self.ext_pw_kernel_size > 1:
+                x = x[:, : -(self.ext_pw_kernel_size - 1), :]
+            if self.apply_ln1:
+                x = self.ln1(x)
+        else:
+            x_0 = x * self.pw_conv_simplify_w[0] + self.pw_conv_simplify_b[0]
+            x_1 = x * self.pw_conv_simplify_w[1] + self.pw_conv_simplify_b[1]
+            x = x_0 + x_1
+
+        x = x.permute([0, 2, 1])
+
+        x = self.dw_sep_conv_1d(x)
+        if self.causal and self.kernel_size > 1:
+            x = x[:, :, : -(self.kernel_size - 1)]
+        if hasattr(self, "ln2"):
+            x = x.permute([0, 2, 1])
+            x = self.ln2(x)
+            x = x.permute([0, 2, 1])
+        if self.batch_norm:
+            x = self.bn_layer(x)
+        x = self.act(x)
+
+        if self.ext_pw_out_channel != 0:
+            x = self.ext_pw_conv_1d(x)
+            if self.fix_len1:
+                x = x[:, :, : -(self.ext_pw_kernel_size - 1)]
+
+            if self.apply_ln1:
+                x = x.permute([0, 2, 1])
+                x = self.ln1(x)
+                x = x.permute([0, 2, 1])
+
+            x = x.permute([0, 2, 1])
+        else:
+            x = x.unsqueeze(1).permute([0, 1, 3, 2])
+            x = x * self.pw_conv_simplify_w[2] + self.pw_conv_simplify_b[2]
+            x = x.squeeze(1)
+
+        x = self.dropout(x)
+        return x
+
+
+class GLULinear(nn.Module):
+    """Linear + GLU module
+
+    Args:
+        input_dim: int
+            input size
+        output_dim: int
+            output size.
+        glu_type:
+            activation function name used in glu module.
+            default "sigmoid" (swish function).
+        bias_in_glu: bool, optional
+            If True, the addtive bias is added. Default False.
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        glu_type="sigmoid",
+        bias_in_glu=True,
+    ):
+        super().__init__()
+        self.linear = nn.Linear(input_dim, output_dim * 2, bias_in_glu)
+        self.glu_act = GLU(-1, glu_type)
+
+    def forward(self, x):
+        """GLULinear forward
+
+        Args:
+            x: torch.Tensor
+                inpute tensor.
+        """
+        x = self.linear(x)
+        return self.glu_act(x)
+
+
+class FeedForward(nn.Module):
+    """FeedForward Module.
+    For more details see Conformer paper:
+        https://arxiv.org/pdf/2005.08100.pdf
+
+    Args:
+        d_model: int
+            input size.
+        d_inner: int
+            output size.
+        dropout_rate: float,
+            dropout rate.
+        activation: str,
+            activation function name,
+            one of ["relu", "swish", "sigmoid"],
+            sigmoid activation is only used with "glu_in_fnn=True",
+            default "sigmoid".
+        bias_in_glu: bool, optional
+    """
+
+    def __init__(
+        self,
+        d_model,
+        d_inner,
+        dropout_rate,
+        activation="sigmoid",
+        bias_in_glu=True,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.d_inner = d_inner
+
+        self.layer_norm = nn.LayerNorm(d_model)
+        module = GLULinear(d_model, d_inner, activation, bias_in_glu)
+        self.net = nn.Sequential(
+            module,
+            nn.Dropout(dropout_rate),
+            nn.Linear(d_inner, d_model),
+            nn.Dropout(dropout_rate),
+        )
+
+    def forward(self, x):
+        """FeedForward forward function.
+
+        Args:
+            x: torch.Tensor
+                input tensor.
+        """
+        out = self.net(self.layer_norm(x))
+
+        return out
+
+
+#### positional encoding starts here
+def _pre_hook(
+    state_dict,
+    prefix,
+    local_metadata,
+    strict,
+    missing_keys,
+    unexpected_keys,
+    error_msgs,
+):
+    """Perform pre-hook in load_state_dict for backward compatibility.
+
+    Note:
+        We saved self.pe until v.0.5.2 but we have omitted it later.
+        Therefore, we remove the item "pe" from `state_dict` for backward
+        compatibility.
+
+    """
+    k = prefix + "pe"
+    if k in state_dict:
+        state_dict.pop(k)
+
+
+class T5RelativeAttentionLogitBias(nn.Module):
+    """
+    This module implements the relative position bias described in Section
+    2.1 of the T5 paper: https://arxiv.org/pdf/1910.10683.pdf
+
+    The Huggingface implementation is used as a reference
+    https://github.com/huggingface/transformers/blob/v4.30.0/src/
+    transformers/models/t5/modeling_t5.py#L435
+
+    Modifies attention as Q*K^T + B, where B is a learned scalar bias based
+    on relative position of the query and key. It is HxNxN, where H is the
+    number of heads, N is the sequence length.
+
+    I've made these modifications to the original T5 bias:
+    - Skipping of the bucketing step. Original T5 bias converted rel
+      position distances into logarithmically increasing buckets. This is
+      supposed to help with length generalization.
+    - I just directly use rel position index as bias values, as we don't
+      need length generalization (40s max is good enough for ASR encoder),
+      and it keeps ONNX export simple.
+    - I've also extended it so that biases can be asymmetric, the default
+      implementation treats L->R and R->L the same. Asymmetric was found to
+      yield better results in my experiments.
+
+    Args:
+        num_heads: int
+            Number of attention heads
+        num_buckets: int
+            Number of buckets to use for relative attention bias. This is the
+            size of the learnable bias parameter. Bucketing is not yet
+            supported, so this defaults to -1 which means no bucketing is
+            used (max_distance determines size of bias param).
+        max_distance: int
+            Maximum distance to use for relative attention bias. With
+            num_buckets=-1, this directly controls the max size of the bias
+            parameter. When num_buckets > 0 is supported, this will control
+            the maximum distance for logarithmic bucketing after which all
+            positions are in the same bucket.
+        symmetric: bool
+            Whether to use symmetric or asymmetric biases. symmetric=False uses
+            2x number of bias params to distinguish L->R from R->L. This was
+            found to be better for the encoder.
+    """
+
+    def __init__(self, num_heads, num_buckets=-1, max_distance=1000, symmetric=False):
+        super().__init__()
+        self.num_heads = num_heads
+        self.num_buckets = num_buckets
+        self.max_distance = max_distance
+        self.symmetric = symmetric
+        self._skip_bucketing = self.num_buckets < 0
+        if self._skip_bucketing:
+            self.num_buckets = max_distance
+        else:
+            raise NotImplementedError(
+                "T5 attention bias with bucketed positions is not yet tested"
+            )
+        if not self.symmetric:
+            self.num_buckets *= 2
+        self.bias_values = nn.Embedding(self.num_buckets, self.num_heads)
+
+    def forward(self, x):
+        # instantiate bias compatible with shape of x
+        maxpos = x.size(1)
+        context_position = torch.arange(maxpos, device=x.device, dtype=torch.long)[
+            :, None
+        ]
+        memory_position = torch.arange(maxpos, device=x.device, dtype=torch.long)[
+            None, :
+        ]
+        relative_position = memory_position - context_position
+        # clipping to a maximum distance using ops that play well with ONNX
+        # export
+        relative_position = relative_position.masked_fill(
+            relative_position < -self.max_distance, -self.max_distance
+        )
+        relative_position = relative_position.masked_fill(
+            relative_position > self.max_distance - 1, self.max_distance - 1
+        )
+
+        # mapping from relative position to index in the bias parameter
+        if self._skip_bucketing:
+            bias_idx = relative_position
+        else:
+            bias_idx = self._bucket_relative_position(relative_position)
+        if self.symmetric:
+            bias_idx = bias_idx.abs()
+        else:
+            bias_idx += self.num_buckets // 2
+
+        t5_rel_att_bias = self.bias_values(bias_idx)  # [L, L, H]
+        t5_rel_att_bias = t5_rel_att_bias.permute(2, 0, 1).unsqueeze(0)  # [1, H, L, L]
+
+        return t5_rel_att_bias
+
+    def _bucket_relative_position(self, relative_position):
+        # this is a placeholder (isn't tested, likely buggy) using HuggingFace
+        # implem as a reference this also needs to be extended to support
+        # asymmetric +/- ve positions
+        relative_buckets = 0
+        if not self.causal:
+            self.num_buckets //= 2
+            relative_buckets += (relative_position > 0).to(
+                torch.long
+            ) * self.num_buckets
+            relative_position = torch.abs(relative_position)
+        else:
+            relative_position = -torch.min(
+                relative_position, torch.zeros_like(relative_position)
+            )
+        # now relative_position is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = self.num_buckets // 2
+        is_small = relative_position < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in
+        # positions up to max_distance
+        relative_position_if_large = max_exact + (
+            torch.log(relative_position.float() / max_exact)
+            / math.log(self.max_distance / max_exact)
+            * (self.num_buckets - max_exact)
+        ).to(torch.long)
+        relative_position_if_large = torch.min(
+            relative_position_if_large,
+            torch.full_like(relative_position_if_large, self.num_buckets - 1),
+        )
+
+        relative_buckets += torch.where(
+            is_small, relative_position, relative_position_if_large
+        )
+        return relative_buckets
+
+
+class AbsolutePositionalEncoding(nn.Module):
+    """Absolute Positional encoding module.
+    This module implement Absolute sinusoidal positional encoding
+    from: https://arxiv.org/pdf/1706.03762.pdf
+
+    Args:
+        d_model: int
+            Input embedding size.
+        dropout_rate: float
+            dropout rate
+        max_len: int, optional
+            Maximum input length sequence, Default 5000
+
+    """
+
+    def __init__(self, d_model, dropout_rate, max_len=5000):
+        """Construct an PositionalEncoding object."""
+        super().__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+        self._register_load_state_dict_pre_hook(_pre_hook)
+
+    def extend_pe(self, x):
+        """Reset the positional encodings.
+
+        Args:
+            x: torch.Tensor
+        """
+        if self.pe is not None and self.pe.size(1) >= x.size(1):
+            if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+            return
+        pe = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.float32)
+            * -(math.log(10000.0) / self.d_model)
+        )
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, x: torch.Tensor):
+        """Add positional encoding.
+
+        Args:
+            x: torch.Tensor
+                Input tensor. shape is (batch, time, ...)
+
+        Returns:
+            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+
+        """
+        self.extend_pe(x)
+        x = x * self.xscale + self.pe[:, : x.size(1)]
+        return self.dropout(x)
+
+
+#### forward embedding layers starts here
+class MeanVarianceNormLayer(nn.Module):
+    """Mean/variance normalization layer.
+
+    Will subtract mean and multiply input by inverted standard deviation.
+    Typically used as a very first layer in a model.
+
+    Args:
+        input_size: int
+            layer input size.
+    """
+
+    def __init__(self, input_size):
+        super().__init__()
+        self.input_size = input_size
+        self.global_mean = nn.Parameter(torch.zeros(input_size))
+        self.global_invstd = nn.Parameter(torch.ones(input_size))
+
+    def forward(self, input_: Tensor) -> Tensor:
+        """MeanVarianceNormLayer Forward
+
+        Args:
+            input_: torch.Tensor
+                input tensor.
+        """
+        return (input_ - self.global_mean) * self.global_invstd
+
+
+class CausalConv1D(nn.Conv1d):
+    """
+    A causal version of nn.Conv1d where each step would have limited access to
+    locations on its right or left
+    All arguments are the same as nn.Conv1d except padding.
+
+    If padding is set None, then paddings are set automatically to make it a
+    causal convolution where each location would not see any steps on its right.
+
+    If padding is set as a list (size of 2), then padding[0] would be used as
+    left padding and padding[1] as right padding.
+    It would make it possible to control the number of steps to be accessible
+    on the right and left.
+    This mode is not supported when stride > 1. padding[0]+padding[1] should
+    be equal to (kernel_size - 1).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: Union[str, int] = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        self.cache_drop_size = None
+        if padding is None:
+            self._left_padding = kernel_size - 1
+            self._right_padding = stride - 1
+        else:
+            if stride != 1 and padding != kernel_size - 1:
+                raise ValueError("No striding allowed for non-symmetric convolutions!")
+            if isinstance(padding, int):
+                self._left_padding = padding
+                self._right_padding = padding
+            elif (
+                isinstance(padding, list)
+                and len(padding) == 2
+                and padding[0] + padding[1] == kernel_size - 1
+            ):
+                self._left_padding = padding[0]
+                self._right_padding = padding[1]
+            else:
+                raise ValueError(f"Invalid padding param: {padding}!")
+
+        self._max_cache_len = self._left_padding
+
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=0,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def update_cache(self, x, cache=None):
+        if cache is None:
+            new_x = F.pad(x, pad=(self._left_padding, self._right_padding))
+            next_cache = cache
+        else:
+            new_x = F.pad(x, pad=(0, self._right_padding))
+            new_x = torch.cat([cache, new_x], dim=-1)
+            if self.cache_drop_size > 0:
+                next_cache = new_x[:, :, : -self.cache_drop_size]
+            else:
+                next_cache = new_x
+            next_cache = next_cache[:, :, -cache.size(-1) :]
+        return new_x, next_cache
+
+    def forward(self, x, cache=None):
+        x, cache = self.update_cache(x, cache=cache)
+        x = super().forward(x)
+        if cache is None:
+            return x
+        else:
+            return x, cache
+
+
+class CausalConv2D(nn.Conv2d):
+    """
+    A causal version of nn.Conv2d where each location in the 2D matrix would
+    have no access to locations on its right or down
+    All arguments are the same as nn.Conv2d except padding which should be
+    set as None
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: Union[str, int] = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        if padding is not None:
+            raise ValueError("Argument padding should be set to None for CausalConv2D.")
+        self._left_padding = kernel_size - 1
+        self._right_padding = stride - 1
+
+        padding = 0
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            device,
+            dtype,
+        )
+
+    def forward(
+        self,
+        x,
+    ):
+        x = F.pad(
+            x,
+            pad=(self._left_padding, self._right_padding, 0, 0),
+        )
+        x = super().forward(x)
+        return x
+
+
+class NemoConvSubsampling(torch.nn.Module):
+    """Convlutional subsampling module, taken from NeMo ASR
+    (https://github.com/NVIDIA/NeMo/blob/b367413645d5c72db3c2c96e46e95a
+    34501479cf/nemo/collections/asr/parts/submodules/subsampling.py)
+
+    Striding Subsampling: "Speech-Transformer: A No-Recurrence
+    Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong
+    et al. (https://ieeexplore.ieee.org/document/8462506)
+
+
+    Compared with the EncoderConv2D (`input_layer: custom`), this is a
+    much simplified approach, and uses no LayerNorm and far fewer Conv2Ds.
+    Moreover, depthwise convolutions are used to reduce FLOPs, but the first
+      layer is kept as a regular convolution so as not to degrade accuracy.
+
+    `Striding` and `dw_striding` are the same except that the latter uses
+    depthwise convolutions after the first layer, whereas the former does not.
+
+    Args:
+        subsampling_factor (int): Time reduction factor
+        feat_in (int): size of the input features
+        feat_out (int): size of the output features
+        subsampling (str): The subsampling technique, choose from
+            {"striding", "dw-striding", "striding_conv1d",
+            "dw_striding_conv1d"}
+        conv_channels (int): Number of channels for the convolution layers,
+                            default is 256.
+        subsampling_conv_chunking_factor (int): Input chunking factor which
+            can be -1 (no chunking) 1 (auto) or a power of 2. Default is 1
+        activation (Module): activation function, default is nn.ReLU()
+        is_causal (bool): whether to use causal Conv1/2D, where each step will
+            have limited access to locations on its right or left
+    """
+
+    def __init__(
+        self,
+        feat_in,
+        feat_out,
+        subsampling_factor=4,
+        subsampling="dw_striding",
+        conv_channels=256,
+        subsampling_conv_chunking_factor=1,
+        activation=nn.ReLU(),  # noqa: B008
+        is_causal=False,
+    ):
+        super().__init__()
+        self._subsampling = subsampling
+        self._conv_channels = conv_channels
+        self._feat_in = feat_in
+        self._feat_out = feat_out
+
+        if subsampling_factor % 2 != 0:
+            raise ValueError("Sampling factor should be a multiply of 2!")
+        self._sampling_num = int(math.log(subsampling_factor, 2))
+        self.subsampling_factor = subsampling_factor
+        self.is_causal = is_causal
+        self.subsampling_causal_cond = subsampling in (
+            "dw_striding",
+            "striding",
+            "striding_conv1d",
+        )
+
+        if (
+            subsampling_conv_chunking_factor != -1
+            and subsampling_conv_chunking_factor != 1
+            and subsampling_conv_chunking_factor % 2 != 0
+        ):
+            raise ValueError(
+                "subsampling_conv_chunking_factor should be -1, 1, or a " "power of 2"
+            )
+        self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
+
+        in_channels = 1
+        layers = []
+
+        if subsampling == "dw_striding":
+            self._stride = 2
+            self._kernel_size = 3
+            self._ceil_mode = False
+
+            if self.is_causal:
+                self._left_padding = self._kernel_size - 1
+                self._right_padding = self._stride - 1
+                self._max_cache_len = subsampling_factor + 1
+            else:
+                self._left_padding = (self._kernel_size - 1) // 2
+                self._right_padding = (self._kernel_size - 1) // 2
+                self._max_cache_len = 0
+
+            # Layer 1
+            if self.is_causal:
+                layers.append(
+                    CausalConv2D(
+                        in_channels=in_channels,
+                        out_channels=conv_channels,
+                        kernel_size=self._kernel_size,
+                        stride=self._stride,
+                        padding=None,
+                    )
+                )
+            else:
+                layers.append(
+                    torch.nn.Conv2d(
+                        in_channels=in_channels,
+                        out_channels=conv_channels,
+                        kernel_size=self._kernel_size,
+                        stride=self._stride,
+                        padding=self._left_padding,
+                    )
+                )
+            in_channels = conv_channels
+            layers.append(activation)
+
+            for i in range(self._sampling_num - 1):
+                if self.is_causal:
+                    layers.append(
+                        CausalConv2D(
+                            in_channels=in_channels,
+                            out_channels=in_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=None,
+                            groups=in_channels,
+                        )
+                    )
+                else:
+                    layers.append(
+                        torch.nn.Conv2d(
+                            in_channels=in_channels,
+                            out_channels=in_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                            groups=in_channels,
+                        )
+                    )
+
+                layers.append(
+                    torch.nn.Conv2d(
+                        in_channels=in_channels,
+                        out_channels=conv_channels,
+                        kernel_size=1,
+                        stride=1,
+                        padding=0,
+                        groups=1,
+                    )
+                )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        elif subsampling == "striding":
+            self._stride = 2
+            self._kernel_size = 3
+            self._ceil_mode = False
+
+            if self.is_causal:
+                self._left_padding = self._kernel_size - 1
+                self._right_padding = self._stride - 1
+                self._max_cache_len = subsampling_factor + 1
+            else:
+                self._left_padding = (self._kernel_size - 1) // 2
+                self._right_padding = (self._kernel_size - 1) // 2
+                self._max_cache_len = 0
+
+            for i in range(self._sampling_num):
+                if self.is_causal:
+                    layers.append(
+                        CausalConv2D(
+                            in_channels=in_channels,
+                            out_channels=conv_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=None,
+                        )
+                    )
+                else:
+                    layers.append(
+                        torch.nn.Conv2d(
+                            in_channels=in_channels,
+                            out_channels=conv_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                        )
+                    )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        elif subsampling == "striding_conv1d":
+            in_channels = feat_in
+
+            self._stride = 2
+            self._kernel_size = 5
+            self._ceil_mode = False
+
+            if self.is_causal:
+                self._left_padding = self._kernel_size - 1
+                self._right_padding = self._stride - 1
+                self._max_cache_len = subsampling_factor + 1
+            else:
+                self._left_padding = (self._kernel_size - 1) // 2
+                self._right_padding = (self._kernel_size - 1) // 2
+                self._max_cache_len = 0
+
+            for i in range(self._sampling_num):
+                if self.is_causal:
+                    layers.append(
+                        CausalConv1D(
+                            in_channels=in_channels,
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 1
+                                else conv_channels
+                            ),
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=None,
+                        )
+                    )
+                else:
+                    layers.append(
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 1
+                                else conv_channels
+                            ),
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                        )
+                    )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        elif subsampling == "dw_striding_conv1d":
+            in_channels = feat_in
+
+            self._stride = 2
+            self._kernel_size = 5
+            self._ceil_mode = False
+
+            self._left_padding = (self._kernel_size - 1) // 2
+            self._right_padding = (self._kernel_size - 1) // 2
+
+            # Layer 1
+            layers.extend(
+                [
+                    torch.nn.Conv1d(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        kernel_size=self._kernel_size,
+                        stride=self._stride,
+                        padding=self._left_padding,
+                        groups=in_channels,
+                    ),
+                    torch.nn.Conv1d(
+                        in_channels=in_channels,
+                        out_channels=(
+                            feat_out if self._sampling_num == 1 else conv_channels
+                        ),
+                        kernel_size=1,
+                        stride=1,
+                        padding=0,
+                        groups=1,
+                    ),
+                ]
+            )
+            in_channels = conv_channels
+            layers.append(activation)
+
+            for i in range(self._sampling_num - 1):
+                layers.extend(
+                    [
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=in_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                            groups=in_channels,
+                        ),
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 2
+                                else conv_channels
+                            ),
+                            kernel_size=1,
+                            stride=1,
+                            padding=0,
+                            groups=1,
+                        ),
+                    ]
+                )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        else:
+            raise ValueError(f"Not valid sub-sampling: {subsampling}!")
+
+        if subsampling in ["dw_striding", "striding"]:
+            in_length = torch.tensor(feat_in, dtype=torch.float)
+            out_length = calc_length(
+                lengths=in_length,
+                all_paddings=self._left_padding + self._right_padding,
+                kernel_size=self._kernel_size,
+                stride=self._stride,
+                ceil_mode=self._ceil_mode,
+                repeat_num=self._sampling_num,
+            )
+            self.out = torch.nn.Linear(conv_channels * int(out_length), feat_out)
+            self.conv2d_subsampling = True
+        elif subsampling in ["striding_conv1d", "dw_striding_conv1d"]:
+            self.out = None
+            self.conv2d_subsampling = False
+        else:
+            raise ValueError(f"Not valid sub-sampling: {subsampling}!")
+
+        self.conv = torch.nn.Sequential(*layers)
+
+    def get_sampling_frames(self):
+        return [1, self.subsampling_factor]
+
+    def get_streaming_cache_size(self):
+        return [0, self.subsampling_factor + 1]
+
+    def forward(self, x, mask):
+        """
+        Forward method for NeMo subsampling.
+
+        Args:
+            x[Batch, Time, Filters]: torch.Tensor
+                input tensor
+            x_mask: torch.Tensor
+                input mask
+
+        Returns:
+            x: torch.Tensor
+                Resulting tensor from subsampling (B, T //
+                time_reduction_factor, feat_out)
+            pad_mask: torch.Tensor
+                tensor of padded hidden state sequences (B, 1, T //
+                time_reduction_factor)
+        """
+        x = x.unsqueeze(1) if self.conv2d_subsampling else x.transpose(1, 2)
+
+        # split inputs if chunking_factor is set
+        if self.subsampling_conv_chunking_factor != -1 and self.conv2d_subsampling:
+            if self.subsampling_conv_chunking_factor == 1:
+                # if subsampling_conv_chunking_factor is 1, we split only
+                # if needed.
+                # avoiding a bug / feature limiting indexing of tensors
+                # to 2**31.
+                # see https://github.com/pytorch/pytorch/issues/80020
+                x_ceil = 2**31 / self._conv_channels * self._stride * self._stride
+                need_to_split = torch.numel(x) > x_ceil
+            else:
+                # if subsampling_conv_chunking_factor > 1 we always split
+                need_to_split = True
+
+            if need_to_split:
+                x, success = self.conv_split_by_batch(x)
+                if not success:  # if unable to split by batch, try by channel
+                    if self._subsampling == "dw_striding":
+                        x = self.conv_split_by_channel(x)
+                    else:
+                        x = self.conv(x)  # try anyway
+            else:
+                x = self.conv(x)
+        else:
+            x = self.conv(x)
+
+        # Flatten Channel and Frequency Axes
+        if self.conv2d_subsampling:
+            b, c, t, f = x.size()
+            x = self.out(x.transpose(1, 2).reshape(b, t, -1))
+        # Transpose to Channel Last mode
+        else:
+            x = x.transpose(1, 2)
+
+        if mask is None:
+            return x, None
+
+        max_audio_length = x.shape[1]
+        feature_lens = mask.sum(1)
+        padding_length = torch.ceil(feature_lens / self.subsampling_factor)
+        if self.is_causal and self.subsampling_causal_cond:
+            feature_lens_remainder = feature_lens % self.subsampling_factor
+            padding_length[feature_lens_remainder != 1] += 1
+        pad_mask = torch.arange(0, max_audio_length, device=x.device).expand(
+            padding_length.size(0), -1
+        ) < padding_length.unsqueeze(1)
+        return x, pad_mask.unsqueeze(1)
+
+    def reset_parameters(self):
+        # initialize weights
+        if self._subsampling == "dw_striding":
+            with torch.no_grad():
+                # init conv
+                scale = 1.0 / self._kernel_size
+                dw_max = (self._kernel_size**2) ** -0.5
+                pw_max = self._conv_channels**-0.5
+
+                torch.nn.init.uniform_(self.conv[0].weight, -scale, scale)
+                torch.nn.init.uniform_(self.conv[0].bias, -scale, scale)
+
+                for idx in range(2, len(self.conv), 3):
+                    torch.nn.init.uniform_(self.conv[idx].weight, -dw_max, dw_max)
+                    torch.nn.init.uniform_(self.conv[idx].bias, -dw_max, dw_max)
+                    torch.nn.init.uniform_(self.conv[idx + 1].weight, -pw_max, pw_max)
+                    torch.nn.init.uniform_(self.conv[idx + 1].bias, -pw_max, pw_max)
+
+                # init fc (80 * 64 = 5120 from https://github.com/kssteven418/
+                # Squeezeformer/blob/13c97d6cf92f2844d2cb3142b4c5bfa9ad1a8951/
+                # src/models/conformer_encoder.py#L487
+                fc_scale = (self._feat_out * self._feat_in / self._sampling_num) ** -0.5
+                torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale)
+                torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale)
+
+    def conv_split_by_batch(self, x):
+        """Tries to split input by batch, run conv and concat results"""
+        b, _, _, _ = x.size()
+        if b == 1:  # can't split if batch size is 1
+            return x, False
+
+        if self.subsampling_conv_chunking_factor > 1:
+            cf = self.subsampling_conv_chunking_factor
+        else:
+            # avoiding a bug / feature limiting indexing of tensors to 2**31
+            # see https://github.com/pytorch/pytorch/issues/80020
+            x_ceil = 2**31 / self._conv_channels * self._stride * self._stride
+            p = math.ceil(math.log(torch.numel(x) / x_ceil, 2))
+            cf = 2**p
+
+        new_batch_size = b // cf
+        if new_batch_size == 0:  # input is too big
+            return x, False
+
+        return (
+            torch.cat(
+                [self.conv(chunk) for chunk in torch.split(x, new_batch_size, 0)]
+            ),
+            True,
+        )
+
+    def conv_split_by_channel(self, x):
+        """For dw convs, tries to split input by time, run conv and concat
+        results"""
+        x = self.conv[0](x)  # full conv2D
+        x = self.conv[1](x)  # activation
+
+        for i in range(self._sampling_num - 1):
+            _, c, t, _ = x.size()
+
+            if self.subsampling_conv_chunking_factor > 1:
+                cf = self.subsampling_conv_chunking_factor
+            else:
+                # avoiding a bug / feature limiting indexing of tensors
+                # to 2**31
+                # see https://github.com/pytorch/pytorch/issues/80020
+                p = math.ceil(math.log(torch.numel(x) / 2**31, 2))
+                cf = 2**p
+
+            new_c = int(c // cf)
+            if new_c == 0:
+                new_c = 1
+
+            new_t = int(t // cf)
+            if new_t == 0:
+                new_t = 1
+
+            x = self.channel_chunked_conv(
+                self.conv[i * 3 + 2], new_c, x
+            )  # conv2D, depthwise
+
+            # splitting pointwise convs by time
+            x = torch.cat(
+                [self.conv[i * 3 + 3](chunk) for chunk in torch.split(x, new_t, 2)],
+                2,
+            )  # conv2D, pointwise
+            x = self.conv[i * 3 + 4](x)  # activation
+        return x
+
+    def channel_chunked_conv(self, conv, chunk_size, x):
+        """Performs channel chunked convolution"""
+
+        ind = 0
+        out_chunks = []
+        for chunk in torch.split(x, chunk_size, 1):
+            step = chunk.size()[1]
+
+            if self.is_causal:
+                chunk = nn.functional.pad(
+                    chunk,
+                    pad=(
+                        self._kernel_size - 1,
+                        self._stride - 1,
+                        self._kernel_size - 1,
+                        self._stride - 1,
+                    ),
+                )
+                ch_out = nn.functional.conv2d(
+                    chunk,
+                    conv.weight[ind : ind + step, :, :, :],
+                    bias=conv.bias[ind : ind + step],
+                    stride=self._stride,
+                    padding=0,
+                    groups=step,
+                )
+            else:
+                ch_out = nn.functional.conv2d(
+                    chunk,
+                    conv.weight[ind : ind + step, :, :, :],
+                    bias=conv.bias[ind : ind + step],
+                    stride=self._stride,
+                    padding=self._left_padding,
+                    groups=step,
+                )
+            out_chunks.append(ch_out)
+            ind += step
+
+        return torch.cat(out_chunks, 1)
+
+    def change_subsampling_conv_chunking_factor(
+        self, subsampling_conv_chunking_factor: int
+    ):
+        if (
+            subsampling_conv_chunking_factor != -1
+            and subsampling_conv_chunking_factor != 1
+            and subsampling_conv_chunking_factor % 2 != 0
+        ):
+            raise ValueError(
+                "subsampling_conv_chunking_factor should be -1, 1, or a " "power of 2"
+            )
+        self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
+
+
+def calc_length(lengths, all_paddings, kernel_size, stride, ceil_mode, repeat_num=1):
+    """Calculates the output length of a Tensor passed through a convolution or
+    max pooling layer"""
+    add_pad: float = all_paddings - kernel_size
+    one: float = 1.0
+    for i in range(repeat_num):
+        lengths = torch.div(lengths.to(dtype=torch.float) + add_pad, stride) + one
+        lengths = torch.ceil(lengths) if ceil_mode else torch.floor(lengths)
+    return lengths.to(dtype=torch.int)
+
+
+####  multihead attention starts here
+class AttModule(nn.Module):
+    """Attention abstraction module"""
+
+    def __init__(self):
+        super().__init__()
+        self.export_mode = False
+
+    def set_export(self, mode=True):
+        """set the export mode"""
+        self.export_mode = mode
+
+    def forward(
+        self,
+        x: Tensor,
+        memory: Optional[Tensor] = None,
+        pos_emb: Optional[Tensor] = None,
+        att_mask: Optional[Tensor] = None,
+    ) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+        """AttModule forward
+
+        Args:
+            x: torch.Tensor
+                input tensor.
+            memory: torch.Tensor, optional
+                memory tensor.
+            pos_emb: torch.Tensor, optional
+                positional encoder embedding.
+            att_mask: torch.Tensor, optional
+                attention mask tensor.
+        """
+        return x, memory, pos_emb, att_mask
+
+
+class AttBlock(BlockBase, AttModule):
+    """Attention Block module to support both Attention and Block module."""
+
+    def memory_dims(self, max_len=False):
+        """memory dimensions"""
+        return (1, self.input_size)
+
+
+def masked_softmax(
+    scores,
+    mask: Optional[Tensor],
+):
+    if mask is not None:
+        mask = mask.unsqueeze(1).eq(0)  # (batch, 1, time1, time2)
+        scores = scores.masked_fill(mask, -torch.inf)
+        attn = torch.softmax(scores, dim=-1).masked_fill(
+            mask, 0.0
+        )  # (batch, head, time1, time2)
+    else:
+        attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+    return attn
+
+
+class MultiHeadedAttention(nn.Module):
+    """Multi-Head Attention layer with optional relative position embedding
+    and GLU.
+
+    Args:
+        n_head: int
+            the number of heads.
+        n_feat: int
+            input size features.
+        dropout_rate: float
+            dropout rate.
+        use_LN: bool
+            apply layer norm or not
+        dropout_at_output: bool
+            whether to apply dropout at output
+        attention_inner_dim: int, optional
+            the attention dimension used in the class,
+            it can be different from the input dimension n_feat.
+            default: -1 (equal to n_feat).
+        use_pt_scaled_dot_product_attention: bool, optional
+            if set True, use pytorch scaled dot product attention in training.
+            NOTE: this will NOT be used in ONNX decoding due to a lack of
+            support.  In that case, we use the original attention
+            implementation, which shows no regression.
+            default: False.
+        n_value: int, optional
+            if set to values other than -1, use a different dimension for
+            value. With the default value (i.e. -1), it is backward compatible.
+        group_size: int, optional. must divide `n_head`
+            if group_size > 1:       GQA
+            if group_size = 1:       MHA
+            if group_size = n_head:  MQA
+    """
+
+    inv_sqrt_d_k: torch.jit.Final[float]
+    h: torch.jit.Final[int]
+    h_k: torch.jit.Final[int]
+    g: torch.jit.Final[int]
+
+    def __init__(
+        self,
+        n_head,
+        n_feat,
+        dropout_rate,
+        attention_inner_dim=-1,
+        glu_type="swish",
+        bias_in_glu=True,
+        use_pt_scaled_dot_product_attention=False,
+        n_value=-1,
+        group_size: int = 1,
+    ):
+        super().__init__()
+        if n_value == -1:
+            n_value = n_feat
+        if attention_inner_dim == -1:
+            attention_inner_dim = n_feat
+        assert attention_inner_dim % n_head == 0
+
+        # We assume d_v always equals d_k
+        self.d_k = attention_inner_dim // n_head
+        self.inv_sqrt_d_k = 1.0 / math.sqrt(self.d_k)
+        self.h = n_head
+        assert n_head % group_size == 0, "group_size must divide n_head"
+        self.g = group_size
+        self.h_k = n_head // group_size
+
+        self.linear_q = nn.Linear(n_feat, attention_inner_dim)
+        self.linear_k = nn.Linear(n_feat, attention_inner_dim // group_size)
+        self.linear_v = nn.Linear(n_value, attention_inner_dim // group_size)
+        self.linear_out = nn.Linear(attention_inner_dim // group_size, n_value)
+
+        self.attn = torch.jit.Attribute(None, Optional[Tensor])
+        self.dropout = nn.Dropout(p=dropout_rate)
+        self.dropout_rate = dropout_rate
+        self.use_pt_scaled_dot_product_attention = use_pt_scaled_dot_product_attention
+
+        if use_pt_scaled_dot_product_attention and group_size > 1:
+            raise ValueError("Cannot use PT Scaled Attention with GQA")
+
+        # Torchscript eager quantization.  Note that these functions below are
+        # NOOPs and have very little impact on performance unless quantization
+        # is enabled.
+        self.quant_q = torch.ao.quantization.QuantStub()
+        self.quant_x = torch.ao.quantization.QuantStub()
+        self.dequant = torch.ao.quantization.DeQuantStub()
+        self.ffunc = torch.ao.nn.quantized.FloatFunctional()
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        pos_k: Tensor,
+        pos_v: Tensor,
+        mask: Optional[Tensor],
+        relative_attention_bias: Optional[Tensor] = None,
+    ):
+        """Compute 'Scaled Dot Product Attention'.
+
+        Args:
+            query: torch.Tensor
+                query tensor (batch, time1, size)
+            key: torch.Tensor
+                key tensor (batch, time2, size)
+            value: torch.Tensor
+                value tensor (batch, time1, size)
+            pos_k: torch.Tensor
+                key tensor used for relative positional embedding.
+            pos_v: torch.Tensor
+                value tensor used for relative positional embedding.
+            mask: torch.Tensor
+                mask tensor (batch, time1, time2)
+            relative_attention_bias: torch.Tensor
+                bias added to attention logits w.r.t. relative positions
+                (1, n_head, time1, time2)
+        """
+        n_batch = query.size(0)
+
+        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)  # (b, t, d)
+        k = self.linear_k(key).view(n_batch, -1, self.h_k, self.d_k)  # (b, t, d)
+        v = self.linear_v(value).view(n_batch, -1, self.h_k, self.d_k)
+        q = (
+            q.transpose(1, 2)
+            if self.use_pt_scaled_dot_product_attention and not torch.jit.is_scripting()
+            else q.transpose(1, 2) * self.inv_sqrt_d_k
+        )
+        k = k.transpose(1, 2)  # (batch, head_k, time2, d_k)
+        v = v.transpose(1, 2)  # (batch, head_k, time2, d_k)
+
+        if self.use_pt_scaled_dot_product_attention and not torch.jit.is_scripting():
+            attn_mask = None
+            if mask is not None:
+                mask = mask.unsqueeze(1)
+                if relative_attention_bias is not None:
+                    attn_mask = mask + relative_attention_bias
+                else:
+                    attn_mask = mask
+                if mask.dtype != q.dtype:
+                    attn_mask = attn_mask.to(q.dtype)
+
+            with torch.nn.attention.sdpa_kernel(
+                [
+                    torch.nn.attention.SDPBackend.FLASH_ATTENTION,
+                    torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
+                    torch.nn.attention.SDPBackend.MATH,
+                    torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
+                ]
+            ):
+                x = torch.nn.functional.scaled_dot_product_attention(
+                    q,
+                    k,
+                    v,
+                    attn_mask=attn_mask,
+                    dropout_p=self.dropout_rate,
+                )
+        else:
+            if self.h != self.h_k:
+                q = q.reshape(n_batch, self.g, self.h_k, -1, self.d_k)
+                A = torch.einsum("b g h t d, b h s d -> b h t s", q, k)
+            else:
+                A = torch.matmul(q, k.transpose(-2, -1))
+            if pos_k is not None:
+                if self.h != self.h_k:
+                    B = torch.einsum("b g h t d, t s d -> b h t s", q, pos_k)
+                else:
+                    reshape_q = (
+                        q.contiguous()
+                        .view(n_batch * self.h, -1, self.d_k)
+                        .transpose(0, 1)
+                    )  # (t1,nh,dk)
+                    B = torch.matmul(
+                        reshape_q, pos_k.transpose(-2, -1)
+                    )  # pos_k: (t1,dk,t2)
+                    B = B.transpose(0, 1).view(
+                        n_batch, self.h, pos_k.size(0), pos_k.size(1)
+                    )
+                scores = A + B
+            else:
+                scores = A
+
+            if relative_attention_bias is not None:
+                scores = scores + relative_attention_bias
+
+            attn = masked_softmax(scores, mask)  # (batch, head, time1, time2)
+
+            self.attn = attn
+
+            p_attn = self.dropout(attn)
+            x = torch.matmul(p_attn.to(v.dtype), v)  # (batch, head, time1, d_k)
+            if pos_v is not None:
+                reshape_attn = (
+                    p_attn.contiguous()
+                    .view(n_batch * self.h, pos_v.size(0), pos_v.size(1))
+                    .transpose(0, 1)
+                )  # (t1, bh, t2)
+
+                attn_v = (
+                    torch.matmul(reshape_attn, pos_v)
+                    .transpose(0, 1)
+                    .contiguous()
+                    .view(n_batch, self.h, pos_v.size(0), self.d_k)
+                )
+                x = x + attn_v
+        x = (
+            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h_k * self.d_k)
+        )  # (batch, time1, d_model)
+
+        return self.linear_out(x)  # (batch, time1, d_model)
+
+
+class MultiSequential(torch.nn.Sequential):
+    """Multi-input multi-output torch.nn.Sequential"""
+
+    @torch.jit.ignore
+    def forward(self, *args):
+        """Forward method implementation."""
+        for m in self:
+            args = m(*args)
+        return args
+
+
+def get_offset(input_layer: str, time_reduction: int):
+    """Get an offset. We will use the offset for determining #frames of a
+    subsampled feature.
+
+    Args:
+        input_layer (str): Type of an input layer
+        time_reduction (int): time reduction factor for downsampling a feature
+    Returns:
+        int: offset
+    """
+    if input_layer in ("conv2d", "nemo_conv") and time_reduction == 4:
+        return 3
+    if input_layer in ("conv2d",) and time_reduction == 6:
+        return 1
+    if input_layer in ("conv2d", "nemo_conv") and time_reduction == 8:
+        return 7
+    return 0
+
+
+def unfold_tensor(xs_pad, max_seq_len):
+    """
+    For a given tensor with shape of (N, T, D), if sequence length T is
+    longer than max_seq_len, this function unfold it to a
+    (NT', max_seq_len, D) where T' is T // max_seq_len.
+    Args:
+        xs_pad: N, T, D
+    """
+    _, _, D = xs_pad.shape
+    xs_pad = xs_pad.transpose(-1, -2)  # convert to N, D, T
+    # N x D x 1 x T => N x (D x max_seq_len) x T'
+    xs_pad = F.unfold(
+        xs_pad[..., None, :],
+        kernel_size=(1, max_seq_len),
+        stride=(1, max_seq_len),
+    )
+    new_bsz, _, slen = xs_pad.shape
+    # N x D x max_seq_len x T'
+    xs_pad = xs_pad.view(new_bsz, -1, max_seq_len, slen)
+    # N x T' x max_seq_len x D
+    xs_pad = xs_pad.permute(0, 3, 2, 1).contiguous()
+    # NT' x max_seq_len x D
+    xs_pad = xs_pad.view(-1, max_seq_len, D)
+    return xs_pad
diff --git a/python/sglang/srt/models/phimoe.py b/python/sglang/srt/models/phimoe.py
new file mode 100644
index 000000000..4604aeef9
--- /dev/null
+++ b/python/sglang/srt/models/phimoe.py
@@ -0,0 +1,560 @@
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class PhiMoEConfig(PretrainedConfig):
+
+    model_type = "phimoe"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        head_dim=None,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=1e6,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_local_experts=16,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        router_jitter_noise=0.0,
+        attention_bias=False,
+        lm_head_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.attention_bias = attention_bias
+        self.lm_head_bias = lm_head_bias
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        if head_dim is None:
+            head_dim = hidden_size // num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.router_jitter_noise = router_jitter_noise
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+def sparsemixer(scores, jitter_eps=0.01):
+    ################ Select first expert (topk=2) ################
+
+    # compute mask for sparsity
+    mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
+    factor = scores.abs().clamp(min=mask_logits_threshold)
+    mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (
+        2 * jitter_eps
+    )
+
+    # apply mask
+    masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
+    selected_experts = max_ind
+
+    # compute scores for gradients
+    masked_gates = torch.softmax(masked_gates, dim=-1)
+    multiplier_o = masked_gates.gather(dim=-1, index=selected_experts)
+
+    multiplier = multiplier_o
+
+    # masked out first expert
+    masked_scores = torch.scatter(
+        scores,
+        -1,
+        selected_experts,
+        float("-inf"),
+    )
+
+    ################ Select second expert (topk=2) ################
+    # compute mask for sparsity
+    mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True)
+    factor = scores.abs().clamp(min=mask_logits_threshold)
+    mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (
+        2 * jitter_eps
+    )
+
+    # apply mask
+    masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, float("-inf"))
+    selected_experts_top2 = max_ind
+    # compute scores for gradients
+    masked_gates_top2 = torch.softmax(masked_gates_top2, dim=-1)
+    multiplier_top2 = masked_gates_top2.gather(dim=-1, index=selected_experts_top2)
+
+    multiplier = torch.concat((multiplier, multiplier_top2), dim=-1)
+    selected_experts = torch.concat((selected_experts, selected_experts_top2), dim=-1)
+
+    return (
+        multiplier,
+        selected_experts,
+    )
+
+
+def phimoe_routing_function(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    assert topk == 2, "Only top-2 routing is supported"
+    assert renormalize is False, "Renormalization is not supported"
+
+    topk_weights, topk_ids = sparsemixer(gating_output)
+    return topk_weights, topk_ids
+
+
+class PhiMoE(nn.Module):
+    """A tensor-parallel MoE implementation for PhiMoE that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=False,
+            custom_routing_function=phimoe_routing_function,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            layer_id=layer_id,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=True,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
+    ) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        return final_hidden_states.view(orig_shape)
+
+
+class PhiMoEAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: Optional[int] = None,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        layer_id: int = 0,
+        attention_bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        rope_scaling: Optional[dict] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        if head_dim is None:
+            head_dim = hidden_size // num_heads
+        self.head_dim = head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            rope_scaling=self.rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class PhiMoEDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiMoEConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = PhiMoEAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=getattr(
+                config, "head_dim", self.hidden_size // config.num_attention_heads
+            ),
+            rope_theta=rope_theta,
+            layer_id=layer_id,
+            attention_bias=config.attention_bias,
+            quant_config=quant_config,
+            rope_scaling=config.rope_scaling,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.block_sparse_moe = PhiMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("block_sparse_moe", prefix),
+        )
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = hidden_states + residual
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.block_sparse_moe(
+            hidden_states, forward_batch=forward_batch
+        )
+
+        hidden_states = hidden_states + residual
+        return hidden_states, residual
+
+
+class PhiMoEModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiMoEConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: PhiMoEDecoderLayer(
+                config, int(prefix.split(".")[-1]), quant_config, prefix=prefix
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions, hidden_states, residual, forward_batch=forward_batch
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class PhiMoEForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiMoEConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = PhiMoEModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
+            bias=True,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = PhiMoEForCausalLM
diff --git a/python/sglang/srt/models/pixtral.py b/python/sglang/srt/models/pixtral.py
new file mode 100644
index 000000000..04a7362d8
--- /dev/null
+++ b/python/sglang/srt/models/pixtral.py
@@ -0,0 +1,463 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""
+Using mistral-community/pixtral-12b as reference.
+"""
+
+import logging
+import math
+from typing import Iterable, List, Optional, Set, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PixtralVisionConfig, PretrainedConfig
+from transformers.models.pixtral.modeling_pixtral import PixtralRotaryEmbedding
+from transformers.models.pixtral.modeling_pixtral import (
+    generate_block_attention_mask as _get_pixtral_attention_mask,
+)
+from transformers.models.pixtral.modeling_pixtral import position_ids_in_meshgrid
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import MergedColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
+from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+
+
+class PixtralHFMLP(nn.Module):
+    """MLP for PixtralHFVisionModel using SGLang components."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        assert config.intermediate_size is not None
+
+        # Use MergedColumnParallelLinear for gate_up_proj to handle combined weights
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_sizes=[config.intermediate_size, config.intermediate_size],
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+
+        self.down_proj = RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up_output, _ = self.gate_up_proj(x)
+
+        # Apply SiLU activation and multiply
+        gate_up = self.act_fn(gate_up_output)
+
+        # Project back to hidden size
+        out, _ = self.down_proj(gate_up)
+        return out
+
+
+class PixtralHFTransformerBlock(nn.Module):
+    """Transformer block for PixtralHFVisionModel using SGLang components."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.layer_id = layer_id
+        self.attention_norm = RMSNorm(config.hidden_size, eps=1e-5)
+
+        # Use SGLang's VisionAttention instead of vLLM's PixtralHFAttention
+        self.attention = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            dropout=0.0,
+            use_context_forward=False,
+            softmax_in_single_precision=False,
+            flatten_batch=False,
+            prefix=f"{prefix}.attention",
+        )
+
+        self.feed_forward = PixtralHFMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+        )
+
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> torch.Tensor:
+        # Ensure hidden_states has the batch dimension [batch, seq_len, hidden_dim]
+        batch_size, seq_len, hidden_dim = hidden_states.shape
+
+        # Apply attention norm - normalize along the last dimension
+        attn_normalized = self.attention_norm(hidden_states.view(-1, hidden_dim)).view(
+            batch_size, seq_len, hidden_dim
+        )
+
+        # Pass through attention layer
+        attention_output = self.attention(
+            attn_normalized,
+            attention_mask=attention_mask,
+            cu_seqlens=None,
+            position_embeddings=position_embeddings,
+        )
+
+        # Apply first residual connection
+        hidden_states = hidden_states + attention_output
+
+        # Apply feed-forward norm - normalize along the last dimension
+        ffn_normalized = self.ffn_norm(hidden_states.view(-1, hidden_dim)).view(
+            batch_size, seq_len, hidden_dim
+        )
+
+        # Pass through feed-forward layer
+        # First reshape to 2D for the feed-forward network, then reshape back
+        ffn_output = self.feed_forward(ffn_normalized)
+
+        # Apply second residual connection
+        output = hidden_states + ffn_output
+
+        return output
+
+
+class PixtralHFTransformer(nn.Module):
+    """Transformer for PixtralHFVisionModel using SGLang components."""
+
+    def __init__(
+        self,
+        config: PixtralVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        num_hidden_layers_override: Optional[int] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        num_hidden_layers = config.num_hidden_layers
+        if num_hidden_layers_override is not None:
+            num_hidden_layers = num_hidden_layers_override
+
+        self.layers = nn.ModuleList(
+            [
+                PixtralHFTransformerBlock(
+                    config=config,
+                    layer_id=layer_idx,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        return_all_hidden_states: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        """Forward pass through transformer layers.
+
+        Args:
+            x: Input tensor
+            attention_mask: Optional attention mask
+            position_embeddings: Optional position embeddings for rotary attention
+            return_all_hidden_states: Whether to return all hidden states
+
+        Returns:
+            Either the final hidden state, or a list of all hidden states if
+            return_all_hidden_states is True
+        """
+        # For HF model compatibility, always start with the input
+        hidden_states = x
+        all_hidden_states = [hidden_states] if return_all_hidden_states else None
+
+        for i, layer in enumerate(self.layers):
+            hidden_states = layer(hidden_states, attention_mask, position_embeddings)
+            if return_all_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+        if return_all_hidden_states:
+            return all_hidden_states
+        return hidden_states
+
+
+def resolve_visual_encoder_outputs(
+    outputs: Union[torch.Tensor, List[torch.Tensor]],
+    feature_sample_layers: Optional[List[int]],
+    post_norm: Optional[nn.Module],
+    num_hidden_layers: int,
+) -> torch.Tensor:
+    """Resolve outputs from visual encoder based on feature_sample_layers."""
+    if feature_sample_layers is None:
+        # Just use the last layer's output
+        if isinstance(outputs, list):
+            outputs = outputs[-1]
+        if post_norm is not None:
+            outputs = post_norm(outputs)
+        return outputs
+
+    # Handle the case where we want to use specific layers
+    if not isinstance(outputs, list):
+        raise ValueError(
+            "Expected outputs to be a list when feature_sample_layers is provided"
+        )
+
+    # Validate layer indices
+    for layer_idx in feature_sample_layers:
+        if layer_idx < 0 or layer_idx > num_hidden_layers:
+            raise ValueError(
+                f"Feature sample layer index {layer_idx} is out of range "
+                f"[0, {num_hidden_layers}]"
+            )
+
+    # Collect outputs from specified layers
+    selected_outputs = [outputs[layer_idx] for layer_idx in feature_sample_layers]
+
+    # Combine the outputs
+    combined_outputs = torch.cat(selected_outputs, dim=-1)
+
+    if post_norm is not None:
+        combined_outputs = post_norm(combined_outputs)
+
+    return combined_outputs
+
+
+class PixtralHFVisionModel(nn.Module):
+    """Hugging Face Pixtral Vision Model implemented using SGLang components."""
+
+    DEFAULT_IMAGE_TOKEN_ID = 10
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        return self.input_padder.pad_input_tokens(input_ids, mm_inputs)
+
+    def __init__(
+        self,
+        config: PixtralVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        num_hidden_layers_override: Optional[int] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_conv = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=config.hidden_size,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+
+        self.ln_pre = RMSNorm(config.hidden_size, eps=1e-5)
+
+        self.transformer = PixtralHFTransformer(
+            config,
+            quant_config,
+            num_hidden_layers_override=num_hidden_layers_override,
+            prefix=f"{prefix}.transformer",
+        )
+
+        # Check that num_hidden_layers is valid
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.transformer.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.transformer.layers)} "
+                "layers."
+            )
+
+        # Initialize patch position embedding
+        self.patch_positional_embedding = PixtralRotaryEmbedding(config)
+        self.input_padder = MultiModalityDataPaddingPatternMultimodalTokens()
+
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        image_sizes: list[tuple[int, int]],
+        output_hidden_states: bool = False,
+        feature_sample_layers: Optional[list[int]] = None,
+    ) -> Union[torch.Tensor, tuple]:
+        """
+        Args:
+            pixel_values: [batch_size, C, H, W], padded if multiple images
+            image_sizes: list of (H, W) for each image in the batch
+            output_hidden_states: Whether to return all hidden states.
+            feature_sample_layers: Layer indices whose features should be
+                concatenated and used as the visual encoder output. If none
+                are provided, the last layer is used.
+
+        Returns:
+            A tuple containing:
+              - hidden_states: Final model outputs (or selected layers if feature_sample_layers given)
+              - hidden_states tuple (optional): All hidden states if output_hidden_states=True
+        """
+        # batch patch images
+        embeds_orig = self.patch_conv(
+            pixel_values.to(device=self.device, dtype=self.dtype)
+        )
+        # crop the embeddings
+        embeds_2d = [
+            embed[..., : h // self.patch_size, : w // self.patch_size]
+            for embed, (h, w) in zip(embeds_orig, image_sizes)
+        ]
+
+        # flatten to sequence
+        embeds_1d = torch.cat([p.flatten(1).T for p in embeds_2d], dim=0)
+        embeds_featurized = self.ln_pre(embeds_1d).unsqueeze(0)
+
+        # positional embeddings
+        position_ids = position_ids_in_meshgrid(
+            embeds_2d,
+            max_width=self.image_size // self.patch_size,
+        ).to(self.device)
+
+        # The original PixtralRotaryEmbedding expects 2D input but returns a tuple of tensors (cos, sin)
+        # These tensors are used by apply_rotary_pos_emb in the transformer blocks
+        position_embedding = self.patch_positional_embedding(
+            embeds_featurized, position_ids
+        )
+        attention_mask = _get_pixtral_attention_mask(
+            [p.shape[-2] * p.shape[-1] for p in embeds_2d], embeds_featurized
+        )
+
+        return_all_hidden_states = (
+            output_hidden_states or feature_sample_layers is not None
+        )
+
+        transformer_outputs = self.transformer(
+            embeds_featurized,  # add batch dimension
+            attention_mask,
+            position_embedding,
+            return_all_hidden_states=return_all_hidden_states,
+        )
+
+        # Store all hidden states if requested
+        all_hidden_states = None
+        if isinstance(transformer_outputs, list):
+            all_hidden_states = transformer_outputs
+            # Use the last layer by default if feature_sample_layers is not specified
+            if feature_sample_layers is None:
+                out = transformer_outputs[-1]
+            else:
+                # Resolve outputs based on feature sample layers
+                out = resolve_visual_encoder_outputs(
+                    transformer_outputs,
+                    feature_sample_layers,
+                    None,
+                    self.config.num_hidden_layers,
+                )
+        else:
+            out = transformer_outputs
+
+        # Format return to be compatible with HuggingFace vision models
+        if output_hidden_states:
+            return type(
+                "VisualOutput",
+                (),
+                {
+                    "last_hidden_state": out,
+                    "hidden_states": all_hidden_states,
+                },
+            )
+        else:
+            return out
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        """Load weights from a HuggingFace checkpoint with proper parameter mapping."""
+        params_dict = dict(self.named_parameters())
+
+        # for (param, weight, shard_id): load weight into param as param's shard_id part
+        stacked_params_mapping = [
+            (".attention.qkv_proj", ".attention.q_proj", "q"),
+            (".attention.qkv_proj", ".attention.k_proj", "k"),
+            (".attention.qkv_proj", ".attention.v_proj", "v"),
+            (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0),
+            (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1),
+        ]
+
+        # Process each weight
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name in name:
+                    # Replace the weight name part with the combined parameter name
+                    transformed_name = name.replace(weight_name, param_name)
+                    if transformed_name in params_dict:
+                        param = params_dict[transformed_name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight, shard_id)
+                        break
+            else:
+                if ".attention.o_proj" in name:
+                    alt_name = name.replace(".attention.o_proj", ".attention.proj")
+                    if alt_name in params_dict:
+                        name = alt_name
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+class PixtralVisionModel(PixtralHFVisionModel):
+    pass
+
+
+# Register the model classes for external access
+EntryClass = [PixtralVisionModel]
diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py
new file mode 100644
index 000000000..009650411
--- /dev/null
+++ b/python/sglang/srt/models/qwen.py
@@ -0,0 +1,356 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/qwen.py#L1
+
+import time
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class QWenMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str = "silu",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            2 * [intermediate_size],
+            bias=False,
+            gather_output=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.c_proj(x)
+        return x
+
+
+class QWenAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        max_position_embeddings: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = hidden_size // self.total_num_heads
+
+        # pylint: disable=invalid-name
+        self.c_attn = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_attn", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.c_proj(attn_output)
+        return output
+
+
+class QWenBlock(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        self.attn = QWenAttention(
+            config.hidden_size,
+            config.num_attention_heads,
+            config.max_position_embeddings,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = QWenMLP(
+            config.hidden_size,
+            config.intermediate_size // 2,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        hidden_states = self.attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class QWenModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.wte = VocabParallelEmbedding(
+            vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("wte", prefix),
+        )
+        self.h = nn.ModuleList(
+            [
+                QWenBlock(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"h.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class QWenLMHeadModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.transformer = QWenModel(
+            config, quant_config=quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.lm_head = ParallelLMHead(
+            vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            forward_batch.hidden_states = self.transformer.wte(input_ids)
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.transformer.h[i]
+            forward_batch.hidden_states = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+            )
+
+        if end == self.transformer.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states = self.transformer.ln_f(
+                forward_batch.hidden_states
+            )
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "w2", 0),
+            ("gate_up_proj", "w1", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = QWenLMHeadModel
diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py
new file mode 100644
index 000000000..b3d5fb9ad
--- /dev/null
+++ b/python/sglang/srt/models/qwen2.py
@@ -0,0 +1,653 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from llama2.py
+# Modify details for the adaptation of Qwen2 model.
+"""Inference-only Qwen2 model compatible with HuggingFace weights."""
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+Qwen2Config = None
+
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Qwen2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: Optional[int] = None,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 32768,
+        quant_config: Optional[QuantizationConfig] = None,
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        if head_dim is not None:
+            self.head_dim = head_dim
+        else:
+            self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+        head_dim = getattr(config, "head_dim", None)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.self_attn = Qwen2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=head_dim,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = Qwen2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class Qwen2Model(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                enable_tp=not is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to Qwen2DecoderLayer
+        decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        # For EAGLE3 support
+        self.layers_to_capture = []
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        if hasattr(self.config, "scale_emb"):
+            return self.get_input_embeddings()(input_ids) * self.config.scale_emb
+        else:
+            return self.get_input_embeddings()(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(
+                    hidden_states + residual if residual is not None else hidden_states
+                )
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class Qwen2ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        # perform weight tying for PP
+        if self.pp_group.world_size > 1 and config.tie_word_embeddings:
+            if self.pp_group.is_first_rank:
+                self.pp_group.send(
+                    self.model.embed_tokens.weight, dst=self.pp_group.last_rank
+                )
+            else:
+                emb_token_weight = self.pp_group.recv(
+                    size=(config.vocab_size, config.hidden_size),
+                    dtype=next(self.model.parameters()).dtype,
+                    src=self.pp_group.first_rank,
+                )
+                self.lm_head.weight.copy_(emb_token_weight)
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embedding(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                if self.pp_group.world_size > 1 and self.pp_group.is_last_rank:
+                    # Handle pp weight tying here
+                    # find the embed_tokens.weight in the weights
+                    embed_token_weights = next(
+                        filter(lambda x: x[0] == "model.embed_tokens.weight", weights)
+                    )[1]
+                    loaded_weight = embed_token_weights
+                else:
+                    continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = Qwen2ForCausalLM
diff --git a/python/sglang/srt/models/qwen2_5_vl.py b/python/sglang/srt/models/qwen2_5_vl.py
new file mode 100644
index 000000000..9afb2b1ab
--- /dev/null
+++ b/python/sglang/srt/models/qwen2_5_vl.py
@@ -0,0 +1,671 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
+import logging
+from functools import lru_cache, partial
+from typing import Iterable, List, Optional, Tuple, Type
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers.activations import ACT2FN
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)
+from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VisionPatchEmbed,
+    Qwen2_5_VisionRotaryEmbedding,
+)
+
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2Model
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2_5_VLMLP(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        bias: bool = True,
+        hidden_act="silu",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=in_features,
+            output_sizes=[hidden_features] * 2,  # [gate_proj, up_proj]
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act = ACT2FN[hidden_act]
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        gate, up = gate_up.chunk(2, dim=-1)
+        x = self.act(gate) * up
+        x_down, _ = self.down_proj(x)
+        return x_down
+
+
+class Qwen2_5_VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        num_heads: int,
+        hidden_act="silu",
+        norm_layer: Type[nn.Module] = None,
+        attn_implementation: Optional[str] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        num_dummy_heads: int = 0,
+        rms_norm_eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = RMSNorm(dim, eps=rms_norm_eps)
+        self.norm2 = RMSNorm(dim, eps=rms_norm_eps)
+
+        if attn_implementation is None:
+            softmax_in_single_precision = False
+            qkv_backend = None
+            flatten_batch = True
+        elif attn_implementation == "sdpa":
+            softmax_in_single_precision = False
+            qkv_backend = "sdpa"
+            flatten_batch = True
+        elif attn_implementation == "flash_attention_2":
+            softmax_in_single_precision = False
+            qkv_backend = "triton_attn"
+            flatten_batch = True
+        elif attn_implementation == "eager":
+            softmax_in_single_precision = True
+            qkv_backend = "sdpa"
+            flatten_batch = True
+        elif attn_implementation == "flash_attention_3":
+            softmax_in_single_precision = False
+            qkv_backend = "fa3"
+            flatten_batch = True
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            rotary_embed="normal",
+            proj_bias=True,
+            qkv_backend=qkv_backend,
+            softmax_in_single_precision=softmax_in_single_precision,
+            flatten_batch=flatten_batch,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            num_dummy_heads=num_dummy_heads,
+        )
+        self.mlp = Qwen2_5_VLMLP(
+            dim,
+            intermediate_dim,
+            hidden_act=hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        position_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        S, B, H = x.shape
+        # norm1: flatten to 2D -> [S*B, H], then reshape back
+        x2d = x.reshape(-1, H)
+        hidden_states = self.norm1(x2d).reshape(S, B, H)
+
+        # Attention expects [B, S, H]
+        hidden_states = rearrange(hidden_states, "s b h -> b s h")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            position_embeddings=position_embeddings,
+        )
+        attn = rearrange(attn, "b s h -> s b h")
+
+        # norm2 with fused residual-add: also 2D
+        attn2d = attn.reshape(-1, H)
+        x_norm_2d, x_after_add_2d = self.norm2(x2d, residual=attn2d)
+        x_norm = x_norm_2d.reshape(S, B, H)
+        x_after_add = x_after_add_2d.reshape(S, B, H)
+
+        # MLP and final residual
+        mlp_out = self.mlp(x_norm)
+        x = x_after_add + mlp_out
+        return x
+
+
+class Qwen2_5_VisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        context_dim: int,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        self.ln_q = RMSNorm(context_dim, eps=1e-6)
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.0", prefix),
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    dim,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.2", prefix),
+                ),
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x expected shape: [S, B, context_dim]
+        S, B, D = x.shape
+        x2d = x.reshape(-1, D)
+        x2d = self.ln_q(x2d)  # RMSNorm expects 2D
+        x2d = x2d.view(-1, self.hidden_size)  # group into spatial_merge_unit
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x2d)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen2_5_VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        vision_config: Qwen2_5_VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        patch_size: int = vision_config.patch_size
+        temporal_patch_size: int = vision_config.temporal_patch_size
+        spatial_merge_size: int = vision_config.spatial_merge_size
+        self.spatial_merge_size = spatial_merge_size
+        self.spatial_merge_unit: int = spatial_merge_size * spatial_merge_size
+        in_channels: int = vision_config.in_channels
+        hidden_size: int = vision_config.hidden_size
+        depth: int = vision_config.depth
+        num_heads: int = vision_config.num_heads
+        self.fullatt_block_indexes = vision_config.fullatt_block_indexes
+        self.window_size = vision_config.window_size
+        self.patch_size = vision_config.patch_size
+        mlp_hidden_size: int = vision_config.intermediate_size
+        self.patch_embed = Qwen2_5_VisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_channels=in_channels,
+            embed_dim=hidden_size,
+        )
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = hidden_size // num_heads
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+        self.blocks = nn.ModuleList(
+            [
+                Qwen2_5_VisionBlock(
+                    dim=hidden_size,
+                    intermediate_dim=mlp_hidden_size,
+                    num_heads=num_heads,
+                    hidden_act=vision_config.hidden_act,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{i}", prefix),
+                )
+                for i in range(depth)
+            ]
+        )
+        self.merger = Qwen2_5_VisionPatchMerger(
+            dim=vision_config.out_hidden_size,
+            context_dim=hidden_size,
+            spatial_merge_size=spatial_merge_size,
+            quant_config=quant_config,
+            prefix=add_prefix("merger", prefix),
+        )
+
+    def get_window_index(self, grid_thw):
+        cu_window_seqlens: list = [0]
+        window_index_id = 0
+        vit_merger_window_size = (
+            self.window_size // self.spatial_merge_size // self.patch_size
+        )
+        window_index: list = []
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h, llm_grid_w = (
+                grid_h // self.spatial_merge_size,
+                grid_w // self.spatial_merge_size,
+            )
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
+                grid_t, llm_grid_h, llm_grid_w
+            )
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
+            index_padded = index_padded.reshape(
+                grid_t,
+                num_windows_h,
+                vit_merger_window_size,
+                num_windows_w,
+                vit_merger_window_size,
+            )
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t,
+                num_windows_h * num_windows_w,
+                vit_merger_window_size,
+                vit_merger_window_size,
+            )
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]
+            window_index.append(index_new + window_index_id)
+            cu_seqlens_tmp = (
+                seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            )
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+        window_index = torch.cat(window_index, dim=0)
+        return window_index, cu_window_seqlens
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for i in range(grid_thw.size(0)):
+            t, h, w = grid_thw[i].tolist()
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=x.device,
+            dtype=torch.int32,
+        )
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
+
+        # Move window_index to the same device as x before using it to index x
+        window_index = window_index.to(device=x.device)
+
+        # Ensure rotary_pos_emb is on the same device/dtype as x
+        rotary_pos_emb = rotary_pos_emb.to(device=x.device, dtype=x.dtype)
+
+        seq_len, _ = x.size()
+
+        x = x.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        x = x[window_index, :, :]
+        x = x.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+        # After building position_embeddings, make sure both cos and sin are on the same device/dtype as the attention input
+        position_embeddings = (
+            position_embeddings[0].to(x.device, x.dtype),
+            position_embeddings[1].to(x.device, x.dtype),
+        )
+
+        # compute cu_seqlens - move cu_seqlens to GPU and make it int32
+        cu_seqlens = torch.cat(
+            [
+                torch.tensor([0], device=x.device, dtype=torch.int32),
+                (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2])
+                .cumsum(dim=0)
+                .to(device=x.device, dtype=torch.int32),
+            ]
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+
+        # transformers
+        x = x.unsqueeze(1)
+        for layer_num, blk in enumerate(self.blocks):
+            if layer_num in self.fullatt_block_indexes:
+                cu_seqlens_now = cu_seqlens
+            else:
+                cu_seqlens_now = cu_window_seqlens
+            x = blk(
+                x, cu_seqlens=cu_seqlens_now, position_embeddings=position_embeddings
+            )
+
+        # adapter
+        x = self.merger(x)
+
+        reverse_indices = torch.argsort(window_index)
+        x = x[reverse_indices, :]
+
+        return x
+
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Qwen2_5_VLForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_up_proj.",
+        ".down_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen2_5_VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.visual = Qwen2_5_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            # NOTE: Qwen2_5-VL vision encoder currently supports BitsAndBytes 4-bit quantization.
+            # Other quantization methods (e.g., GPTQ, AWQ) are untested and may not be supported.
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+        )
+
+        self.model = Qwen2Model(
+            config,
+            quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        return image_embeds
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+        video_embeds = self.visual(pixel_values, grid_thw=video_grid_thw)
+        return video_embeds
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        """Run forward pass for Qwen2_5-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+                (Use input_metadata.mrope_positions to replace it)
+        """
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if (
+                    "visual" in name
+                    and "up_proj" not in name
+                    and "gate_proj" not in name
+                ):
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        self.capture_aux_hidden_states = True
+        self.model.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = [Qwen2_5_VLForConditionalGeneration]
diff --git a/python/sglang/srt/models/qwen2_audio.py b/python/sglang/srt/models/qwen2_audio.py
new file mode 100644
index 000000000..180ee801b
--- /dev/null
+++ b/python/sglang/srt/models/qwen2_audio.py
@@ -0,0 +1,201 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/1d45d90e5d1552eccb6d8cc9b7bba283ccefb808/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
+import logging
+import math
+from functools import lru_cache, partial
+from typing import Any, Iterable, List, Optional, Tuple, Type, TypedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers import AutoTokenizer, Qwen2AudioEncoderConfig, Qwen2Config
+from transformers.activations import ACT2FN
+from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioConfig
+from transformers.models.qwen2_audio.modeling_qwen2_audio import (
+    Qwen2AudioEncoder,
+    Qwen2AudioMultiModalProjector,
+)
+
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.utils import get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2AudioForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen2AudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        if getattr(self.config, "audio_config", None) is None:
+            self.config.audio_config = Qwen2AudioEncoderConfig(
+                self.config._name_or_path
+            )
+
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = Qwen2Config(self.config._name_or_path)
+
+        self.audio_tower = Qwen2AudioEncoder(
+            config.audio_config,
+        )
+        self.multi_modal_projector = Qwen2AudioMultiModalProjector(config)
+        self.language_model = Qwen2ForCausalLM(
+            config.text_config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        return self.pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # Extract audio features from input items
+        input_features = torch.cat([item.feature for item in items], dim=0).type(
+            self.audio_tower.dtype
+        )
+
+        audio_embeds = self.audio_tower(input_features).last_hidden_state
+        audio_embeds = self.multi_modal_projector(audio_embeds)
+
+        audio_feature_lens = torch.cat([item.audio_feature_lens for item in items])
+        new_embeds = []
+        for i, d in zip(audio_feature_lens, audio_embeds):
+            new_embeds.append(d[: i.item()])
+
+        return torch.cat(new_embeds, dim=0)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.AUDIO: self.get_audio_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+
+            if self.config.text_config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name or "audio_tower" in name:
+                    continue
+                name_tmp = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name_tmp.endswith(".bias") and name_tmp not in params_dict:
+                    continue
+                param = params_dict[name_tmp]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Qwen2AudioForConditionalGeneration
diff --git a/python/sglang/srt/models/qwen2_classification.py b/python/sglang/srt/models/qwen2_classification.py
new file mode 100644
index 000000000..366213e70
--- /dev/null
+++ b/python/sglang/srt/models/qwen2_classification.py
@@ -0,0 +1,75 @@
+﻿# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Qwen2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
+from sglang.srt.utils import add_prefix
+
+
+class Qwen2ForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "Qwen2ForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+
+        return EmbeddingPoolerOutput(pooled_logits)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Filter out lm_head weights of Qwen2ForCausalLM
+        filtered_weights = [
+            (name, w) for name, w in weights if not name.startswith("lm_head")
+        ]
+        return Qwen2ForCausalLM.load_weights(self, filtered_weights)
+
+
+EntryClass = [
+    Qwen2ForSequenceClassification,
+]
diff --git a/python/sglang/srt/models/qwen2_eagle.py b/python/sglang/srt/models/qwen2_eagle.py
new file mode 100644
index 000000000..4b4c0ec41
--- /dev/null
+++ b/python/sglang/srt/models/qwen2_eagle.py
@@ -0,0 +1,146 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from sglang.srt.utils import add_prefix
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2ForCausalLM
+
+Qwen2Config = None
+
+
+class Qwen2DecoderLayer(Qwen2DecoderLayer):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix=prefix)
+
+        # Skip the input_layernorm
+        # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+        if layer_id == 0:
+            del self.input_layernorm
+            setattr(self, "input_layernorm", lambda x: x)
+
+
+class Qwen2Model(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                Qwen2DecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        hidden_states = self.fc(
+            torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        return hidden_states + residual
+
+
+class Qwen2ForCausalLMEagle(Qwen2ForCausalLM):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        for name, loaded_weight in weights:
+            if "lm_head" not in name:
+                name = "model." + name
+                super().load_weights([(name, loaded_weight)])
+
+
+EntryClass = [Qwen2ForCausalLMEagle]
diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py
new file mode 100644
index 000000000..ffb619940
--- /dev/null
+++ b/python/sglang/srt/models/qwen2_moe.py
@@ -0,0 +1,774 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2_moe.py
+"""Inference-only Qwen2MoE model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    ScatterMode,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.two_batch_overlap import model_forward_maybe_tbo
+from sglang.srt.utils import add_prefix, is_cuda, make_layers
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+
+
+class Qwen2MoeMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
+        return x
+
+
+class Qwen2MoeSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=config.norm_topk_prob,
+        )
+
+        self.experts = get_moe_impl_class()(
+            layer_id=self.layer_id,
+            top_k=config.num_experts_per_tok,
+            num_experts=config.num_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+        if config.shared_expert_intermediate_size > 0:
+            self.shared_expert = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.shared_expert_intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_expert", prefix),
+            )
+        else:
+            self.shared_expert = None
+        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
+
+    def _forward_shared_experts(self, hidden_states: torch.Tensor):
+        shared_output = None
+        if self.shared_expert is not None:
+            shared_output = self.shared_expert(hidden_states)
+            if self.shared_expert_gate is not None:
+                shared_output = (
+                    F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_output
+                )
+        return shared_output
+
+    def _forward_router_experts(self, hidden_states: torch.Tensor):
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        return self.experts(hidden_states, topk_output)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states)
+
+        with torch.cuda.stream(self.alt_stream):
+            router_output = self._forward_router_experts(hidden_states)
+
+        current_stream.wait_stream(self.alt_stream)
+
+        return router_output, shared_output
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        DUAL_STREAM_TOKEN_THRESHOLD = 1024
+        if (
+            self.alt_stream is not None
+            and hidden_states.shape[0] > 0
+            and hidden_states.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD
+        ):
+            final_hidden_states, shared_output = self.forward_normal_dual_stream(
+                hidden_states
+            )
+        else:
+            shared_output = self._forward_shared_experts(hidden_states)
+            final_hidden_states = self._forward_router_experts(hidden_states)
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1 and not use_reduce_scatter:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class Qwen2MoeAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        qkv_bias: int = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen2MoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        qkv_bias = getattr(config, "qkv_bias", True)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.self_attn = Qwen2MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            qkv_bias=qkv_bias,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.layer_id = layer_id
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        # Qwen2MoE all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Qwen2MoeModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Qwen2MoeDecoderLayer,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                enable_tp=not is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to Qwen2MoeDecoderLayer
+        decoder_layer_type = decoder_layer_type or Qwen2MoeDecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        # For EAGLE3 support
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        if forward_batch.can_run_tbo:
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers,
+                enable_tbo=True,
+                input_data_scatter_mode=ScatterMode.model_input_output(),
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+        else:
+            for i in range(self.start_layer, self.end_layer):
+                if i in self.layers_to_capture:
+                    aux_hidden_states.append(
+                        hidden_states + residual
+                        if residual is not None
+                        else hidden_states
+                    )
+                with get_global_expert_distribution_recorder().with_current_layer(i):
+                    layer = self.layers[i]
+                    hidden_states, residual = layer(
+                        positions, hidden_states, forward_batch, residual
+                    )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class Qwen2MoeForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.model = Qwen2MoeModel(
+            config,
+            quant_config,
+            prefix=add_prefix("model", prefix),
+            alt_stream=alt_stream,
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+        # decoder layer
+        for i in range(start, end):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.model.layers[i]
+                forward_batch.hidden_states, forward_batch.residual = layer(
+                    positions,
+                    forward_batch.hidden_states,
+                    forward_batch,
+                    forward_batch.residual,
+                )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = Qwen2MoeForCausalLM
diff --git a/python/sglang/srt/models/qwen2_rm.py b/python/sglang/srt/models/qwen2_rm.py
new file mode 100644
index 000000000..f5ed9eae2
--- /dev/null
+++ b/python/sglang/srt/models/qwen2_rm.py
@@ -0,0 +1,78 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Qwen2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
+from sglang.srt.utils import add_prefix
+
+
+class Qwen2ForRewardModel(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.num_labels = 1
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.ReLU(),
+            nn.Linear(config.hidden_size, self.num_labels),
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert get_embedding, "Qwen2ForRewardModel is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+
+        return EmbeddingPoolerOutput(pooled_logits)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Filter out lm_head weights of Qwen2ForCausalLM
+        filtered_weights = [
+            (name, w) for name, w in weights if not name.startswith("lm_head")
+        ]
+        return Qwen2ForCausalLM.load_weights(self, filtered_weights)
+
+
+EntryClass = [
+    Qwen2ForRewardModel,
+]
diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py
new file mode 100644
index 000000000..55f325813
--- /dev/null
+++ b/python/sglang/srt/models/qwen2_vl.py
@@ -0,0 +1,609 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
+import logging
+from functools import lru_cache, partial
+from typing import Iterable, List, Optional, Tuple, Type, TypedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
+
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2Model
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+# === Vision Inputs === #
+
+
+class Qwen2VLImageInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape:
+    `(num_patches, num_channels * patch_size * patch_size)`
+    """
+
+    image_grid_thw: torch.Tensor
+    """Shape: `(num_images, 3)`
+
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+class Qwen2VLVideoInputs(TypedDict):
+    pixel_values_videos: torch.Tensor
+    """Shape:
+    `(num_patches,
+      num_channels * temporal_patch_size * patch_size * patch_size)`
+    """
+
+    video_grid_thw: torch.Tensor
+    """Shape: `(num_videos, 3)`
+
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+# === Vision Encoder === #
+
+
+class Qwen2VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+class Qwen2VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        attn_implementation: Optional[str] = "sdpa",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        if attn_implementation == "sdpa":
+            qkv_backend = "sdpa"
+            softmax_in_single_precision = False
+        elif attn_implementation == "flash_attention_2":
+            qkv_backend = "triton_attn"
+            softmax_in_single_precision = False
+        elif attn_implementation == "eager":
+            qkv_backend = "sdpa"
+            softmax_in_single_precision = True
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            qkv_backend=qkv_backend,
+            softmax_in_single_precision=softmax_in_single_precision,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.mlp = Qwen2VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        position_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.norm1(x)
+        hidden_states = rearrange(hidden_states, "s b ... -> b s ...")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            position_embeddings=position_embeddings,
+        )
+        attn = rearrange(attn, "b s ... -> s b ...")
+        x = x + attn
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class Qwen2VisionPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_chans: int = 3,
+        embed_dim: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.embed_dim = embed_dim
+
+        kernel_size = [temporal_patch_size, patch_size, patch_size]
+        self.proj = nn.Conv3d(
+            in_chans, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        L, C = x.shape
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
+        x = self.proj(x).view(L, self.embed_dim)
+        return x
+
+
+class Qwen2VisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Type[nn.Module] = None,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.ln_q = norm_layer(context_dim)
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.0", prefix),
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    d_model,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.2", prefix),
+                ),
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ln_q(x)
+        x = x.view(-1, self.hidden_size)
+
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen2VisionRotaryEmbedding(nn.Module):
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            seqlen *= 2
+            self._seq_len_cached = seqlen
+            self.inv_freq = 1.0 / (
+                self.theta
+                ** (
+                    torch.arange(
+                        0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device
+                    )
+                    / self.dim
+                )
+            )
+            seq = torch.arange(
+                seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+            )
+            freqs = torch.outer(seq, self.inv_freq)
+            self._freqs_cached = freqs
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
+
+
+class Qwen2VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        vision_config: Qwen2VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        patch_size: int = vision_config.patch_size
+        temporal_patch_size: int = vision_config.temporal_patch_size
+        spatial_merge_size: int = vision_config.spatial_merge_size
+        in_chans: int = vision_config.in_chans
+        hidden_size: int = vision_config.hidden_size
+        embed_dim: int = vision_config.embed_dim
+        depth: int = vision_config.depth
+        num_heads: int = vision_config.num_heads
+        mlp_ratio: float = vision_config.mlp_ratio
+
+        self.spatial_merge_size = spatial_merge_size
+
+        self.patch_embed = Qwen2VisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = embed_dim // num_heads
+        self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
+        self.blocks = nn.ModuleList(
+            [
+                Qwen2VisionBlock(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    norm_layer=norm_layer,
+                    attn_implementation="sdpa",
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{i}", prefix),
+                )
+                for i in range(depth)
+            ]
+        )
+        self.merger = Qwen2VisionPatchMerger(
+            d_model=hidden_size,
+            context_dim=embed_dim,
+            norm_layer=norm_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("merger", prefix),
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for i in range(grid_thw.size(0)):
+            t, h, w = grid_thw[i].tolist()
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings)
+
+        # adapter
+        x = self.merger(x)
+        return x
+
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Qwen2VLForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen2VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.visual = Qwen2VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            # NOTE: Qwen2-VL vision encoder currently supports BitsAndBytes 4-bit quantization.
+            # Other quantization methods (e.g., GPTQ, AWQ) are untested and may not be supported.
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+        )
+
+        self.model = Qwen2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        return image_embeds
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+        video_embeds = self.visual(pixel_values, grid_thw=video_grid_thw)
+        return video_embeds
+
+    def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor:
+        pixel_values_videos = video_input["pixel_values_videos"].type(self.visual.dtype)
+        video_embeds = self.visual(
+            pixel_values_videos, grid_thw=video_input["video_grid_thw"]
+        )
+        return video_embeds
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        """Run forward pass for Qwen2-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+                (Use input_metadata.mrope_positions to replace it)
+        """
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        if get_embedding:
+            return self.pooler(hidden_states, forward_batch)
+        else:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Qwen2VLForConditionalGeneration
diff --git a/python/sglang/srt/models/qwen3.py b/python/sglang/srt/models/qwen3.py
new file mode 100644
index 000000000..bc5f054d7
--- /dev/null
+++ b/python/sglang/srt/models/qwen3.py
@@ -0,0 +1,523 @@
+# Adapted from qwen2.py
+import logging
+from functools import partial
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.qwen2 import Qwen2MLP as Qwen3MLP
+from sglang.srt.models.qwen2 import Qwen2Model
+from sglang.srt.utils import add_prefix, is_cuda
+
+Qwen3Config = None
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+
+
+class Qwen3Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        head_dim: Optional[int] = None,
+        max_position_embeddings: int = 32768,
+        quant_config: Optional[QuantizationConfig] = None,
+        rms_norm_eps: float = None,
+        attention_bias: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.alt_stream = alt_stream
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # overlap qk norm
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            with torch.cuda.stream(self.alt_stream):
+                k_by_head = k.reshape(-1, self.head_dim)
+                k_by_head = self.k_norm(k_by_head)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            k_by_head = k.reshape(-1, self.head_dim)
+            k_by_head = self.k_norm(k_by_head)
+        q = q_by_head.view(q.shape)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen3DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+        head_dim = getattr(config, "head_dim", None)
+        self.self_attn = Qwen3Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            head_dim=head_dim,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            rms_norm_eps=config.rms_norm_eps,
+            attention_bias=config.attention_bias,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+        self.mlp = Qwen3MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=False,
+            is_previous_layer_sparse=False,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        hidden_states = self.mlp(hidden_states)
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+        return hidden_states, residual
+
+
+class Qwen3Model(Qwen2Model):
+    def __init__(
+        self,
+        config: Qwen3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=Qwen3DecoderLayer,
+            alt_stream=alt_stream,
+        )
+
+
+class Qwen3ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        # perform weight tying for PP
+        if self.pp_group.world_size > 1 and config.tie_word_embeddings:
+            if self.pp_group.is_first_rank:
+                self.pp_group.send(
+                    self.model.embed_tokens.weight, dst=self.pp_group.last_rank
+                )
+            else:
+                emb_token_weight = self.pp_group.recv(
+                    size=(config.vocab_size, config.hidden_size),
+                    dtype=next(self.model.parameters()).dtype,
+                    src=self.pp_group.first_rank,
+                )
+                self.lm_head.weight.copy_(emb_token_weight)
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.get_input_embeddings()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "Embedding" in self.config.name_or_path:
+                name = add_prefix(name, "model")
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                if self.pp_group.world_size > 1 and self.pp_group.is_last_rank:
+                    # Handle pp weight tying here
+                    # find the embed_tokens.weight in the weights
+                    embed_token_weights = next(
+                        filter(lambda x: x[0] == "model.embed_tokens.weight", weights)
+                    )[1]
+                    loaded_weight = embed_token_weights
+                else:
+                    continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = Qwen3ForCausalLM
diff --git a/python/sglang/srt/models/qwen3_classification.py b/python/sglang/srt/models/qwen3_classification.py
new file mode 100644
index 000000000..a59d6769b
--- /dev/null
+++ b/python/sglang/srt/models/qwen3_classification.py
@@ -0,0 +1,84 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Qwen2Config  # Qwen3 uses Qwen2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM, Qwen3Model
+from sglang.srt.utils import add_prefix
+
+
+class Qwen3ForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+        # Use normalize=True for qwen3 embedding based on official implementation
+        # Reference: https://github.com/QwenLM/Qwen3-Embedding/blob/main/examples/qwen3_embedding_transformers.py#L55
+        # Official code: output = F.normalize(output, p=2, dim=1)
+        normalize = True
+
+        # We don't want to normalize the embedding if we have a classification head
+        if config.id2label is not None or config.label2id is not None:
+            normalize = False
+
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=normalize)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "Qwen3ForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+
+        return EmbeddingPoolerOutput(pooled_logits)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Filter out lm_head weights of Qwen3ForCausalLM
+        filtered_weights = [
+            (name, w) for name, w in weights if not name.startswith("lm_head")
+        ]
+        return Qwen3ForCausalLM.load_weights(self, filtered_weights)
+
+
+EntryClass = [
+    Qwen3ForSequenceClassification,
+]
diff --git a/python/sglang/srt/models/qwen3_moe.py b/python/sglang/srt/models/qwen3_moe.py
new file mode 100644
index 000000000..c1c4c3638
--- /dev/null
+++ b/python/sglang/srt/models/qwen3_moe.py
@@ -0,0 +1,896 @@
+# Adapted from qwen2_moe.py
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+"""Inference-only Qwen3MoE model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
+from sglang.srt.models.qwen2_moe import Qwen2MoeModel
+from sglang.srt.utils import (
+    add_prefix,
+    is_cuda,
+    is_flashinfer_available,
+    is_non_idle_and_non_empty,
+)
+
+Qwen3MoeConfig = None
+
+_is_flashinfer_available = is_flashinfer_available()
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+
+
+class Qwen3MoeSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: Qwen3MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=False,
+        )
+
+        self.experts = get_moe_impl_class()(
+            num_experts=config.num_experts
+            + global_server_args_dict["ep_num_redundant_experts"],
+            top_k=config.num_experts_per_tok,
+            layer_id=layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
+            )
+            self.top_k = config.num_experts_per_tok
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+
+        if not get_moe_a2a_backend().is_deepep():
+            return self.forward_normal(
+                hidden_states, should_allreduce_fusion, use_reduce_scatter
+            )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            topk_weights, topk_idx, _ = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_idx = torch.full(
+                (0, self.top_k), -1, dtype=torch.int, device=hidden_states.device
+            )
+            topk_weights = torch.empty(
+                (0, self.top_k), dtype=torch.float32, device=hidden_states.device
+            )
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            forward_batch=forward_batch,
+        )
+        return final_hidden_states
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits, _ = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_weights_local, state.topk_idx_local, _ = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_idx_local = torch.full(
+                (0, self.top_k), -1, dtype=torch.int, device=hidden_states.device
+            )
+            state.topk_weights_local = torch.empty(
+                (0, self.top_k), dtype=torch.float32, device=hidden_states.device
+            )
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            self.experts.deepep_dispatcher.dispatch_a(
+                hidden_states=state.pop("hidden_states_mlp_input"),
+                topk_idx=state.pop("topk_idx_local"),
+                topk_weights=state.pop("topk_weights_local"),
+                forward_batch=state.forward_batch,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.hidden_states_experts_output = self.experts.moe_impl(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.experts.deepep_dispatcher.combine_a(
+                hidden_states=state.pop("hidden_states_experts_output"),
+                topk_idx=state.dispatch_output.topk_idx,
+                topk_weights=state.dispatch_output.topk_weights,
+                forward_batch=state.forward_batch,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = (
+                self.experts.deepep_dispatcher.combine_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+            )
+
+    def op_output(self, state):
+        state.hidden_states_mlp_output = state.pop("hidden_states_after_combine")
+
+
+class Qwen3MoeAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-06,
+        attention_bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.alt_stream = alt_stream
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # overlap qk norm
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            with torch.cuda.stream(self.alt_stream):
+                k_by_head = k.reshape(-1, self.head_dim)
+                k_by_head = self.k_norm(k_by_head)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            k_by_head = k.reshape(-1, self.head_dim)
+            k_by_head = self.k_norm(k_by_head)
+        q = q_by_head.view(q.shape)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+        attn_output = self.attn(*inner_state)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
+
+class Qwen3MoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3MoeConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        rms_norm_eps = config.rms_norm_eps
+        attention_bias = config.attention_bias
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.self_attn = Qwen3MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            attention_bias=attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            alt_stream=alt_stream,
+        )
+
+        self.layer_id = layer_id
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        # Qwen3MoE all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen3MoeSparseMoeBlock(
+                layer_id=self.layer_id,
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = Qwen3MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(self.layer_id == self.config.num_hidden_layers - 1),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(
+            hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
+        )
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        state.hidden_states_mlp_output = self.mlp(hidden_states, state.forward_batch)
+
+    def op_comm_postprocess_layer(self, state):
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+
+        state.clear(
+            expect_keys={
+                "positions",
+                "forward_batch",
+                "tbo_subbatch_index",
+            }
+        )
+        return output
+
+
+class Qwen3MoeModel(Qwen2MoeModel):
+    def __init__(
+        self,
+        config: Qwen3MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=Qwen3MoeDecoderLayer,
+            alt_stream=alt_stream,
+        )
+
+
+class Qwen3MoeForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: Qwen3MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3MoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+        # decoder layer
+        for i in range(start, end):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.model.layers[i]
+                forward_batch.hidden_states, forward_batch.residual = layer(
+                    positions,
+                    forward_batch.hidden_states,
+                    forward_batch,
+                    forward_batch.residual,
+                )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        # Cache params_dict to avoid repeated expensive traversal of model parameters
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Mark as expert weight regardless of whether we can process it
+                    is_expert_weight = True
+
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        # Expert weight not on this rank, will be skipped below
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+        # TODO mimic deepseek
+        # Lazy initialization of expert weights cache to avoid slowing down load_weights
+        if not hasattr(self, "routed_experts_weights_of_layer"):
+            self.routed_experts_weights_of_layer = {
+                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                for layer_id in range(self.start_layer, self.end_layer)
+                if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock)
+            }
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = Qwen3MoeForCausalLM
diff --git a/python/sglang/srt/models/qwen3_next.py b/python/sglang/srt/models/qwen3_next.py
new file mode 100644
index 000000000..6208ee2c6
--- /dev/null
+++ b/python/sglang/srt/models/qwen3_next.py
@@ -0,0 +1,1049 @@
+import enum
+import logging
+from typing import Any, Dict, Iterable, Optional, Set, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from sglang.srt.configs.qwen3_next import Qwen3NextConfig
+from sglang.srt.distributed import (
+    divide,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated
+from sglang.srt.layers.attention.mamba.mamba import mamba_v2_sharded_weight_loader
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    sharded_weight_loader,
+)
+from sglang.srt.models.qwen2_moe import Qwen2MoeMLP, Qwen2MoeSparseMoeBlock
+from sglang.srt.utils import add_prefix, is_cuda, make_layers, set_weight_attrs
+
+logger = logging.getLogger(__name__)
+def is_gpu_available():
+    try:
+        available = torch.cuda.is_available()
+    except Exception as e:
+        print(f"GPU检测失败: {e}")
+        return False
+    return False
+
+# 使用改进的检测
+_is_cuda = is_gpu_available()
+
+import triton
+import triton.language as tl
+
+
+
+@triton.jit
+def fused_qkvzba_split_reshape_cat_kernel(
+    mixed_qkv,
+    z,
+    b,
+    a,
+    mixed_qkvz,
+    mixed_ba,
+    NUM_HEADS_QK: tl.constexpr,
+    NUM_HEADS_V: tl.constexpr,
+    HEAD_QK: tl.constexpr,
+    HEAD_V: tl.constexpr,
+):
+    i_bs, i_qk = tl.program_id(0), tl.program_id(1)
+    QKVZ_DIM_T: tl.constexpr = HEAD_QK * 2 + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V * 2
+    BA_DIM_T: tl.constexpr = NUM_HEADS_V // NUM_HEADS_QK * 2
+    QKV_DIM_T: tl.constexpr = HEAD_QK * 2 + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V
+    q_end: tl.constexpr = HEAD_QK
+    blk_q_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(0, q_end)
+    )
+    k_end: tl.constexpr = q_end + HEAD_QK
+    blk_k_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(q_end, k_end)
+    )
+    v_end: tl.constexpr = k_end + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V
+    blk_v_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(k_end, v_end)
+    )
+    z_end: tl.constexpr = v_end + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V
+    blk_z_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(v_end, z_end)
+    )
+    blk_q_st_ptr = (
+        mixed_qkv
+        + i_bs * NUM_HEADS_QK * QKV_DIM_T
+        + i_qk * HEAD_QK
+        + tl.arange(0, HEAD_QK)
+    )
+    blk_k_st_ptr = (
+        mixed_qkv
+        + i_bs * NUM_HEADS_QK * QKV_DIM_T
+        + NUM_HEADS_QK * HEAD_QK
+        + i_qk * HEAD_QK
+        + tl.arange(0, HEAD_QK)
+    )
+    blk_v_st_ptr = (
+        mixed_qkv
+        + i_bs * NUM_HEADS_QK * QKV_DIM_T
+        + NUM_HEADS_QK * HEAD_QK * 2
+        + i_qk * HEAD_V * NUM_HEADS_V // NUM_HEADS_QK
+        + tl.arange(0, HEAD_V * NUM_HEADS_V // NUM_HEADS_QK)
+    )
+    blk_z_st_ptr = (
+        z
+        + i_bs * NUM_HEADS_V * HEAD_V
+        + i_qk * HEAD_V * NUM_HEADS_V // NUM_HEADS_QK
+        + tl.arange(0, HEAD_V * NUM_HEADS_V // NUM_HEADS_QK)
+    )
+    tl.store(blk_q_st_ptr, tl.load(blk_q_ptr))
+    tl.store(blk_k_st_ptr, tl.load(blk_k_ptr))
+    tl.store(blk_v_st_ptr, tl.load(blk_v_ptr))
+    tl.store(blk_z_st_ptr, tl.load(blk_z_ptr))
+    b_end: tl.constexpr = NUM_HEADS_V // NUM_HEADS_QK
+    a_end: tl.constexpr = b_end + NUM_HEADS_V // NUM_HEADS_QK
+    for i in tl.static_range(b_end):
+        blk_b_ptr = mixed_ba + i_bs * NUM_HEADS_QK * BA_DIM_T + i_qk * BA_DIM_T + i
+        blk_b_st_ptr = b + i_bs * NUM_HEADS_V + i_qk * NUM_HEADS_V // NUM_HEADS_QK + i
+        tl.store(blk_b_st_ptr, tl.load(blk_b_ptr))
+    for i in tl.static_range(b_end, a_end):
+        blk_a_ptr = mixed_ba + i_bs * NUM_HEADS_QK * BA_DIM_T + i_qk * BA_DIM_T + i
+        blk_a_st_ptr = (
+            a + i_bs * NUM_HEADS_V + i_qk * NUM_HEADS_V // NUM_HEADS_QK + (i - b_end)
+        )
+        tl.store(blk_a_st_ptr, tl.load(blk_a_ptr))
+
+
+def fused_qkvzba_split_reshape_cat(
+    mixed_qkvz,
+    mixed_ba,
+    num_heads_qk,
+    num_heads_v,
+    head_qk,
+    head_v,
+):
+    batch, seq_len = mixed_qkvz.shape[0], 1
+    qkv_dim_t = num_heads_qk * head_qk * 2 + num_heads_v * head_v
+    mixed_qkv = torch.empty(
+        [batch * seq_len, qkv_dim_t],
+        dtype=mixed_qkvz.dtype,
+        device=mixed_qkvz.device,
+    )
+    z = torch.empty(
+        [batch * seq_len, num_heads_v, head_v],
+        dtype=mixed_qkvz.dtype,
+        device=mixed_qkvz.device,
+    )
+    b = torch.empty(
+        [batch * seq_len, num_heads_v],
+        dtype=mixed_ba.dtype,
+        device=mixed_ba.device,
+    )
+    a = torch.empty_like(b)
+    grid = (batch * seq_len, num_heads_qk)
+    fused_qkvzba_split_reshape_cat_kernel[grid](
+        mixed_qkv,
+        z,
+        b,
+        a,
+        mixed_qkvz,
+        mixed_ba,
+        num_heads_qk,
+        num_heads_v,
+        head_qk,
+        head_v,
+        num_warps=1,
+        num_stages=3,
+    )
+    return mixed_qkv, z, b, a
+
+
+# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+@triton.jit
+def fused_gdn_gating_kernel(
+    g,
+    A_log,
+    a,
+    dt_bias,
+    seq_len,
+    NUM_HEADS: tl.constexpr,
+    beta: tl.constexpr,
+    threshold: tl.constexpr,
+    BLK_HEADS: tl.constexpr,
+):
+    i_b, i_s, i_d = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    head_off = i_d * BLK_HEADS + tl.arange(0, BLK_HEADS)
+    off = i_b * seq_len * NUM_HEADS + i_s * NUM_HEADS + head_off
+    mask = head_off < NUM_HEADS
+    blk_A_log = tl.load(A_log + head_off, mask=mask)
+    blk_a = tl.load(a + off, mask=mask)
+    blk_bias = tl.load(dt_bias + head_off, mask=mask)
+    x = blk_a.to(tl.float32) + blk_bias.to(tl.float32)
+    softplus_x = tl.where(
+        beta * x <= threshold, (1 / beta) * tl.log(1 + tl.exp(beta * x)), x
+    )
+    blk_g = -tl.exp(blk_A_log.to(tl.float32)) * softplus_x
+    tl.store(g + off, blk_g.to(g.dtype.element_ty), mask=mask)
+
+
+def fused_gdn_gating(
+    A_log: torch.Tensor,
+    a: torch.Tensor,
+    dt_bias: torch.Tensor,
+    beta: float = 1.0,
+    threshold: float = 20.0,
+) -> torch.Tensor:
+    batch, num_heads = a.shape
+    seq_len = 1
+    grid = (batch, seq_len, triton.cdiv(num_heads, 8))
+    g = torch.empty_like(a, dtype=torch.float32)
+    fused_gdn_gating_kernel[grid](
+        g, A_log, a, dt_bias, seq_len, num_heads, beta, threshold, 8, num_warps=1
+    )
+    return g
+
+
+class Qwen3GatedDeltaNet(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        layer_id: int,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.hidden_size = config.hidden_size
+        self.num_v_heads = config.linear_num_value_heads
+        self.num_k_heads = config.linear_num_key_heads
+        self.head_k_dim = config.linear_key_head_dim
+        self.head_v_dim = config.linear_value_head_dim
+        self.key_dim = self.head_k_dim * self.num_k_heads
+        self.value_dim = self.head_v_dim * self.num_v_heads
+        self.alt_stream = alt_stream
+
+        self.conv_kernel_size = config.linear_conv_kernel_dim
+        self.layer_id = layer_id
+        self.activation = config.hidden_act
+        self.layer_norm_epsilon = config.rms_norm_eps
+
+        # QKV
+        self.conv_dim = self.key_dim * 2 + self.value_dim
+        self.conv1d = ColumnParallelLinear(
+            input_size=self.conv_kernel_size,
+            output_size=self.conv_dim,
+            bias=False,
+            quant_config=None,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+        # projection of the input hidden states
+        projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2
+        projection_size_ba = self.num_v_heads * 2
+
+        self.in_proj_qkvz = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=projection_size_qkvz,
+            bias=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+        self.in_proj_ba = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=projection_size_ba,
+            bias=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+
+        query_key_settings = (self.key_dim, 0, False)
+        value_settings = (self.value_dim, 0, False)
+
+        delattr(self.conv1d.weight, "weight_loader")
+        set_weight_attrs(
+            self.conv1d.weight,
+            {
+                "weight_loader": mamba_v2_sharded_weight_loader(
+                    [
+                        query_key_settings,
+                        query_key_settings,
+                        value_settings,
+                    ],
+                    self.attn_tp_size,
+                    self.attn_tp_rank,
+                )
+            },
+        )
+
+        # selective projection used to make dt, B and C input dependent
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(torch.ones(self.num_v_heads // self.attn_tp_size))
+
+        A = torch.empty(
+            divide(self.num_v_heads, self.attn_tp_size), dtype=torch.float32
+        ).uniform_(0, 16)
+        self.A_log = nn.Parameter(torch.log(A))
+        self.A_log._no_weight_decay = True
+
+        set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
+
+        self.norm = RMSNormGated(
+            self.head_v_dim,
+            eps=self.layer_norm_epsilon,
+            group_size=None,
+            norm_before_gate=True,
+            device=torch.cuda.current_device(),
+            dtype=config.torch_dtype,
+        )
+
+        self.out_proj = RowParallelLinear(
+            self.value_dim,
+            self.hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            reduce_results=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+
+    def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba):
+        """
+        Derives `query`, `key` and `value` tensors from `mixed_qkvzba`.
+        """
+        new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
+            self.num_k_heads // self.attn_tp_size,
+            (
+                self.head_k_dim
+                + self.head_k_dim
+                + (self.head_v_dim + self.head_v_dim)
+                * self.num_v_heads
+                // self.num_k_heads
+            ),
+        )
+        new_tensor_shape_ba = mixed_ba.size()[:-1] + (
+            self.num_k_heads // self.attn_tp_size,
+            2 * self.num_v_heads // self.num_k_heads,
+        )
+
+        mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
+        mixed_ba = mixed_ba.view(*new_tensor_shape_ba)
+
+        split_arg_list_qkvz = [
+            self.head_k_dim,
+            self.head_k_dim,
+            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
+            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
+        ]
+        split_arg_list_ba = [
+            self.num_v_heads // self.num_k_heads,
+            self.num_v_heads // self.num_k_heads,
+        ]
+
+        # [b, sq, ng, (hn + hn + np/ng * hn + np/ng + np/ng)]
+        # --> [b, sq, ng, hn], [b, sq, ng, hn], [b, sq, ng, np/ng * hn], [b, sq, ng, np/ng * hn], [b, sq, ng, np/ng], [b, sq, ng, np/ng]
+        (query, key, value, z) = torch.split(mixed_qkvz, split_arg_list_qkvz, dim=2)
+        (b, a) = torch.split(mixed_ba, split_arg_list_ba, dim=2)
+
+        # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
+        value = value.reshape(value.size(0), -1, self.head_v_dim)
+        z = z.reshape(z.size(0), -1, self.head_v_dim)
+        b = b.reshape(b.size(0), self.num_v_heads // self.attn_tp_size)
+        a = a.reshape(a.size(0), self.num_v_heads // self.attn_tp_size)
+
+        return query, key, value, z, b, a
+
+    def _forward_input_proj(self, hidden_states: torch.Tensor):
+        DUAL_STREAM_TOKEN_THRESHOLD = 1024
+        seq_len, _ = hidden_states.shape
+        if seq_len < DUAL_STREAM_TOKEN_THRESHOLD and self.alt_stream is not None:
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+            with torch.cuda.stream(self.alt_stream):
+                projected_states_ba, _ = self.in_proj_ba(hidden_states)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+            projected_states_ba, _ = self.in_proj_ba(hidden_states)
+        return projected_states_qkvz, projected_states_ba
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        seq_len, _ = hidden_states.shape
+        is_cuda_graph = forward_batch.forward_mode.is_cuda_graph()
+
+        projected_states_qkvz, projected_states_ba = self._forward_input_proj(
+            hidden_states
+        )
+
+        if self.num_v_heads // self.num_k_heads in [1, 2, 4] and is_cuda_graph:
+            mixed_qkv, z, b, a = fused_qkvzba_split_reshape_cat(
+                projected_states_qkvz,
+                projected_states_ba,
+                triton.cdiv(self.num_k_heads, self.attn_tp_size),
+                triton.cdiv(self.num_v_heads, self.attn_tp_size),
+                self.head_k_dim,
+                self.head_v_dim,
+            )
+        else:
+            query, key, value, z, b, a = self.fix_query_key_value_ordering(
+                projected_states_qkvz, projected_states_ba
+            )
+            query, key, value = map(
+                lambda x: x.reshape(x.shape[0], -1), (query, key, value)
+            )
+            mixed_qkv = torch.cat((query, key, value), dim=-1)
+        # mixed_qkv = rearrange(mixed_qkv, "b l d -> b d l")
+
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(
+            self.conv1d.weight.size(0), self.conv1d.weight.size(2)
+        )
+
+        kwargs = {
+            "mixed_qkv": mixed_qkv,
+            "conv_weights": conv_weights,
+            "bias": self.conv1d.bias,
+            "activation": self.activation,
+            "key_dim": self.key_dim,
+            "value_dim": self.value_dim,
+            "attention_tp_size": self.attn_tp_size,
+            "head_k_dim": self.head_k_dim,
+            "head_v_dim": self.head_v_dim,
+            "a": a,
+            "b": b,
+            "A_log": self.A_log,
+            "dt_bias": self.dt_bias,
+            "layer_id": self.layer_id,
+            "seq_len": seq_len,
+            "z": z,
+        }
+
+        core_attn_out = forward_batch.attn_backend.forward(
+            q=None,
+            k=None,
+            v=None,
+            layer=None,
+            forward_batch=forward_batch,
+            **kwargs,
+        )
+
+        z_shape_og = z.shape
+        # reshape input data into 2D tensor
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+        core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = core_attn_out.reshape(z_shape_og)
+        core_attn_out = core_attn_out.reshape(*core_attn_out.shape[:-2], -1)
+
+        output, _ = self.out_proj(core_attn_out)
+        return output
+
+
+class Qwen3HybridLinearDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_attn = Qwen3GatedDeltaNet(config, layer_id, alt_stream)
+
+        # Qwen3Next all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+        self.layer_id = layer_id
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+            )
+        else:
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+            )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        forward_batch = kwargs.get("forward_batch", None)
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states = self.linear_attn(
+                hidden_states,
+                forward_batch,
+            )
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Qwen3HybridAttentionDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % self.attn_tp_size == 0
+        self.num_heads = self.total_num_heads // self.attn_tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= self.attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % self.attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.attn_tp_size)
+        self.head_dim = config.head_dim or (self.hidden_size // self.num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = getattr(config, "rope_theta", 10000)
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.partial_rotary_factor = config.partial_rotary_factor
+        self.layer_id = layer_id
+
+        self.attn_output_gate = getattr(config, "attn_output_gate", True)
+        if self.attn_output_gate:
+            logger.warning_once("using attn output gate!")
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            rope_scaling=self.rope_scaling,
+            base=self.rope_theta,
+            partial_rotary_factor=self.partial_rotary_factor,
+            is_neox_style=True,
+            dtype=torch.get_default_dtype(),  # see impl of get_rope
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads * (1 + self.attn_output_gate),
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Qwen3Next all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+            )
+        else:
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+            )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.q_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+        self.alt_stream = alt_stream
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # overlap qk norm
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            with torch.cuda.stream(self.alt_stream):
+                k_by_head = k.reshape(-1, self.head_dim)
+                k_by_head = self.k_norm(k_by_head)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            k_by_head = k.reshape(-1, self.head_dim)
+            k_by_head = self.k_norm(k_by_head)
+        q = q_by_head.view(q.shape)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def self_attention(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        if self.attn_output_gate:
+            q_gate, k, v = qkv.split(
+                [self.q_size * 2, self.kv_size, self.kv_size], dim=-1
+            )
+            orig_shape = q_gate.shape[:-1]
+            q_gate = q_gate.view(*orig_shape, self.num_heads, -1)
+            q, gate = torch.chunk(q_gate, 2, dim=-1)
+            q = q.reshape(*orig_shape, -1)
+            gate = gate.reshape(*orig_shape, -1)
+        else:
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q, k = self._apply_qk_norm(q, k)
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+
+        if self.attn_output_gate:
+            gate = torch.sigmoid(gate)
+            attn_output = attn_output * gate
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ):
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states = self.self_attention(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "attention": Qwen3HybridAttentionDecoderLayer,
+    "linear_attention": Qwen3HybridLinearDecoderLayer,
+}
+
+
+class Qwen3NextModel(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            enable_tp=not is_dp_attention_enabled(),
+        )
+
+        def get_layer(idx: int, prefix: str):
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[idx]]
+            return layer_class(
+                config,
+                idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            )
+
+        self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+
+        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.infer_count = 0
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        # mamba_cache_params: MambaCacheParams,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        # pass a sequence index tensor, that is required for
+        # proper continuous batching computation including
+        # chunked prefill
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_tokens(input_ids)
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                layer_id=i,
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+                forward_batch=forward_batch,
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class HybridLayerType(enum.Enum):
+    full_attention = "attention"
+    swa_attention = "swa_attention"
+    linear_attention = "linear_attention"
+    mamba2 = "mamba"
+
+
+class Qwen3NextForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+        assert self.pp_group.is_first_rank and self.pp_group.is_last_rank
+        self.quant_config = quant_config
+        self.model = Qwen3NextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.lm_head = self.lm_head.float()
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = get_moe_impl_class().make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+
+            if is_mtp:
+
+                if "mtp" not in name:
+                    continue
+
+                if name in [
+                    "mtp.fc.weight",
+                    "mtp.pre_fc_norm_embedding.weight",
+                    "mtp.pre_fc_norm_hidden.weight",
+                ]:
+                    name = name.replace("mtp.", "")
+                else:
+                    name = name.replace("mtp", "model")
+
+            if not is_mtp and "mtp" in name:
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                # TODO(fix mtp loading)
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    # if is_pp_missing_parameter(name, self):
+                    #     continue
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+
+                    weight_loader = getattr(param, "weight_loader")
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # if is_pp_missing_parameter(name, self):
+                    #     continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = Qwen3NextForCausalLM
diff --git a/python/sglang/srt/models/qwen3_next_mtp.py b/python/sglang/srt/models/qwen3_next_mtp.py
new file mode 100644
index 000000000..a9da0867d
--- /dev/null
+++ b/python/sglang/srt/models/qwen3_next_mtp.py
@@ -0,0 +1,109 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only Qwen3Next MTP Speculative Decoding."""
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen3_moe import Qwen3MoeModel
+from sglang.srt.models.qwen3_next import Qwen3NextForCausalLM, Qwen3NextModel
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3NextForCausalLMMTP(Qwen3NextForCausalLM):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        # if not set, model load will be broken in Qwen3NextForCausalLM load_weights()
+        self.pp_group = get_pp_group()
+        # self.determine_num_fused_shared_experts("Qwen3NextForCausalLMMTP")
+
+        # currently based on the provided ckpt, we:
+        # (1) do not use_dedicated_mtp_embeddings provided in ckpt since not provided and directly use the target model embeddings
+        # (2) hardcode bias=False since not provided
+        self.fc = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+        RMSNorm_cls = GemmaRMSNorm
+        self.pre_fc_norm_embedding = RMSNorm_cls(
+            config.hidden_size, config.rms_norm_eps
+        )
+        self.pre_fc_norm_hidden = RMSNorm_cls(config.hidden_size, config.rms_norm_eps)
+        config.num_hidden_layers = 1
+        config.full_attention_interval = 1
+        self.model = Qwen3NextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if input_embeds is None:
+            input_embeds = self.model.embed_tokens(input_ids)
+
+        input_embeds = self.pre_fc_norm_embedding(input_embeds)
+        hidden_states = self.pre_fc_norm_hidden(forward_batch.spec_info.hidden_states)
+        hidden_states = self.fc(torch.cat((input_embeds, hidden_states), dim=-1))
+
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            hidden_states,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ):
+        super().load_weights(weights, is_mtp=True)
+
+
+EntryClass = [Qwen3NextForCausalLMMTP]
diff --git a/python/sglang/srt/models/registry.py b/python/sglang/srt/models/registry.py
new file mode 100644
index 000000000..76e042a95
--- /dev/null
+++ b/python/sglang/srt/models/registry.py
@@ -0,0 +1,107 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/models/registry.py
+
+import importlib
+import logging
+import pkgutil
+from dataclasses import dataclass, field
+from functools import lru_cache
+from typing import AbstractSet, Dict, List, Optional, Tuple, Type, Union
+
+import torch.nn as nn
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class _ModelRegistry:
+    # Keyed by model_arch
+    models: Dict[str, Union[Type[nn.Module], str]] = field(default_factory=dict)
+
+    def get_supported_archs(self) -> AbstractSet[str]:
+        return self.models.keys()
+
+    def _raise_for_unsupported(self, architectures: List[str]):
+        all_supported_archs = self.get_supported_archs()
+
+        if any(arch in all_supported_archs for arch in architectures):
+            raise ValueError(
+                f"Model architectures {architectures} failed "
+                "to be inspected. Please check the logs for more details."
+            )
+
+        raise ValueError(
+            f"Model architectures {architectures} are not supported for now. "
+            f"Supported architectures: {all_supported_archs}"
+        )
+
+    def _try_load_model_cls(self, model_arch: str) -> Optional[Type[nn.Module]]:
+        if model_arch not in self.models:
+            return None
+
+        return self.models[model_arch]
+
+    def _normalize_archs(
+        self,
+        architectures: Union[str, List[str]],
+    ) -> List[str]:
+        if isinstance(architectures, str):
+            architectures = [architectures]
+        if not architectures:
+            logger.warning("No model architectures are specified")
+
+        # filter out support architectures
+        normalized_arch = list(
+            filter(lambda model: model in self.models, architectures)
+        )
+
+        # make sure Transformers backend is put at the last as a fallback
+        if len(normalized_arch) != len(architectures):
+            normalized_arch.append("TransformersForCausalLM")
+        return normalized_arch
+
+    def resolve_model_cls(
+        self,
+        architectures: Union[str, List[str]],
+    ) -> Tuple[Type[nn.Module], str]:
+        architectures = self._normalize_archs(architectures)
+
+        for arch in architectures:
+            model_cls = self._try_load_model_cls(arch)
+            if model_cls is not None:
+                return (model_cls, arch)
+
+        return self._raise_for_unsupported(architectures)
+
+
+@lru_cache()
+def import_model_classes():
+    model_arch_name_to_cls = {}
+    package_name = "sglang.srt.models"
+    package = importlib.import_module(package_name)
+    for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
+        if not ispkg:
+            try:
+                module = importlib.import_module(name)
+            except Exception as e:
+                logger.warning(f"Ignore import error when loading {name}: {e}")
+                continue
+            if hasattr(module, "EntryClass"):
+                entry = module.EntryClass
+                if isinstance(
+                    entry, list
+                ):  # To support multiple model classes in one module
+                    for tmp in entry:
+                        assert (
+                            tmp.__name__ not in model_arch_name_to_cls
+                        ), f"Duplicated model implementation for {tmp.__name__}"
+                        model_arch_name_to_cls[tmp.__name__] = tmp
+                else:
+                    assert (
+                        entry.__name__ not in model_arch_name_to_cls
+                    ), f"Duplicated model implementation for {entry.__name__}"
+                    model_arch_name_to_cls[entry.__name__] = entry
+
+    return model_arch_name_to_cls
+
+
+ModelRegistry = _ModelRegistry(import_model_classes())
diff --git a/python/sglang/srt/models/roberta.py b/python/sglang/srt/models/roberta.py
new file mode 100644
index 000000000..209be1296
--- /dev/null
+++ b/python/sglang/srt/models/roberta.py
@@ -0,0 +1,286 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import itertools
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.layers.pooler import CrossEncodingPooler, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.bert import BertEncoder
+
+RobertaConfig = None
+
+
+# Adapted from transformers
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config: RobertaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.out_proj(x)
+        return x
+
+
+class RobertaEmbedding(nn.Module):
+
+    def __init__(self, config: RobertaConfig):
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.token_type_embeddings = nn.Embedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        seq_lens: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+
+        pos_list = []
+        token_list = []
+        offset = 0
+        for seq_len in seq_lens:
+            pos_list.append(position_ids[offset : offset + seq_len])
+            token_list.append(input_ids[offset : offset + seq_len])
+            offset += seq_len
+
+        new_pos_list = []
+        for positions, tokens in zip(pos_list, token_list):
+            # Verify assumption that incoming position are
+            # always a sequence from 0 to N.
+            expected_pos = torch.arange(
+                positions.size()[0], dtype=torch.long, device=inputs_embeds.device
+            )
+            assert torch.equal(positions, expected_pos)
+            new_pos_list.append(
+                create_position_ids_from_input_ids(tokens, self.padding_idx)
+            )
+        position_ids = torch.cat(new_pos_list)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(position_ids)
+
+        token_type_ids = forward_batch.token_type_ids
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=inputs_embeds.device
+            )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class XLMRobertaBaseModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        add_pooling_layer: bool = False,
+    ):
+        super().__init__()
+
+        self.config = config
+        self.embeddings = RobertaEmbedding(config)
+        self.encoder = BertEncoder(config=config, quant_config=quant_config, prefix="")
+        self.pooler = (
+            Pooler(pooling_type=PoolingType.CLS, normalize=True)
+            if add_pooling_layer
+            else None
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+        # Your tokenized IDs
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=positions,
+            seq_lens=forward_batch.seq_lens,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if self.pooler is None and "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+# Adapted from transformers
+def create_position_ids_from_input_ids(
+    input_ids, padding_idx, past_key_values_length=0
+):
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (
+        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
+    ) * mask
+    return incremental_indices.long() + padding_idx
+
+
+class XLMRobertaModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.roberta = XLMRobertaBaseModel(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        hidden_states = self.roberta(
+            input_ids, positions, forward_batch, input_embeds, get_embedding
+        )
+        return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self.roberta.load_weights(weights)
+
+
+class XLMRobertaForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.roberta = XLMRobertaBaseModel(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        self.classifier = RobertaClassificationHead(config)
+        self.pooler = CrossEncodingPooler(config, self.classifier, self.roberta.pooler)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> torch.Tensor:
+        assert (
+            get_embedding
+        ), "XLMRobertaForSequenceClassification is only used for rerank"
+
+        hidden_states = self.roberta(
+            input_ids, positions, forward_batch, input_embeds, get_embedding
+        )
+        return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self_weights = []
+
+        def weight_filter():
+            for name, weight in weights:
+                if name.startswith("roberta."):
+                    yield (name[len("roberta.") :], weight)
+                else:
+                    self_weights.append((name, weight))
+
+        self.roberta.load_weights(weight_filter())
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in self_weights:
+            if name.startswith("classifier"):
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [XLMRobertaModel, XLMRobertaForSequenceClassification]
diff --git a/python/sglang/srt/models/siglip.py b/python/sglang/srt/models/siglip.py
new file mode 100644
index 000000000..2a76dc286
--- /dev/null
+++ b/python/sglang/srt/models/siglip.py
@@ -0,0 +1,294 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/af9b2eaa54c150741f298d6db939af6328e1dc38/src/transformers/models/siglip/modeling_siglip.py
+
+from functools import partial
+from typing import Optional, Type, Union
+
+import torch
+import torch.nn as nn
+from transformers import SiglipVisionConfig
+
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.utils import add_prefix
+
+
+# Adapted from transformers.models.siglip.modeling_siglip.SiglipVisionTransformer
+class SiglipVisionEmbeddings(nn.Module):
+
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.position_embedding = VocabParallelEmbedding(
+            self.num_positions, self.embed_dim
+        )
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        # interpolate_pos_encoding is never used in sglang
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+
+        return embeddings
+
+
+# Copied from sglang.srt.models.clip.CLIPMLP
+class SiglipMLP(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+# Copied from sglang.srt.models.clip.CLIPEncoderLayer
+class SiglipEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        attn_implementation: Optional[str] = "sdpa",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layer_norm1 = norm_layer(config.hidden_size)
+        self.layer_norm2 = norm_layer(config.hidden_size)
+        if attn_implementation == "sdpa":
+            qkv_backend = "sdpa"
+            softmax_in_single_precision = False
+        elif attn_implementation == "flash_attention_2":
+            qkv_backend = "triton_attn"
+            softmax_in_single_precision = False
+        elif attn_implementation == "eager":
+            qkv_backend = "sdpa"
+            softmax_in_single_precision = True
+        self.self_attn = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            use_qkv_parallel=True,
+            qkv_backend=qkv_backend,
+            softmax_in_single_precision=softmax_in_single_precision,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = SiglipMLP(
+            config,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        # Siglip text model uses both `causal_attention_mask` and `attention_mask`
+        if attention_mask is not None and causal_attention_mask is not None:
+            attn_mask = attention_mask + causal_attention_mask
+        elif causal_attention_mask is not None:
+            attn_mask = causal_attention_mask
+        else:
+            attn_mask = attention_mask
+        hidden_states = self.self_attn(
+            hidden_states,
+            attention_mask=attn_mask,
+            # causal_attention_mask=causal_attention_mask,
+        )
+
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+# Copied from sglang.srt.models.clip.CLIPEncoder
+class SiglipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self
+    attention layers. Each layer is a [`SiglipEncoderLayer`].
+
+    Args:
+        config: SiglipConfig
+    """
+
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        num_hidden_layers = config.num_hidden_layers
+        norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layers = nn.ModuleList(
+            [
+                SiglipEncoderLayer(
+                    config=config,
+                    norm_layer=norm_layer,
+                    attn_implementation="sdpa",
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_idx}", prefix),
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        causal_attention_mask: torch.Tensor = None,
+        return_all_hidden_states: bool = False,
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        hidden_states_pool = [inputs_embeds]
+        hidden_states = inputs_embeds
+
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states, attention_mask, causal_attention_mask
+            )
+            if return_all_hidden_states:
+                hidden_states_pool.append(hidden_states)
+        if return_all_hidden_states:
+            return hidden_states_pool
+        return hidden_states
+
+
+# Adapted from transformers.models.siglip.modeling_siglip.SiglipVisionTransformer
+class SiglipVisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(config)
+
+        self.encoder = SiglipEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.encoder.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.encoder.layers)} layers."
+            )
+
+        # VisionAttention in SiglipEncoderLayer is multihead attention
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @property
+    def device(self) -> torch.device:
+        return self.encoder.layers[0].layer_norm1.weight.device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(pixel_values.to(self.device))
+
+        return_all_hidden_states = False
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            return_all_hidden_states=return_all_hidden_states,
+        )
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        return last_hidden_state
+
+
+# Copied from sglang.srt.models.clip.CLIPVisionModel
+class SiglipVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.vision_model = SiglipVisionTransformer(
+            config, quant_config, prefix=add_prefix("vision_model", prefix)
+        )
+
+    @property
+    def device(self) -> torch.device:
+        return self.vision_model.device
+
+    def forward(self, pixel_values: torch.Tensor):
+        return self.vision_model(pixel_values)
diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py
new file mode 100644
index 000000000..1566893ee
--- /dev/null
+++ b/python/sglang/srt/models/stablelm.py
@@ -0,0 +1,331 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/stablelm.py#L1
+"""
+Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+model compatible with HuggingFace weights.
+"""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class StablelmMLP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class StablelmAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.total_num_key_value_heads = config.num_key_value_heads
+        if self.total_num_key_value_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_key_value_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_key_value_heads == 0
+        self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        rope_pct = getattr(
+            config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
+        )
+        self.rotary_ndims = int(self.head_dim * rope_pct)
+        self.scaling = self.head_dim**-0.5
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_key_value_heads * self.head_dim
+        self.qkv_bias = getattr(config, "use_qkv_bias", False)
+        if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads "
+                f"(got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_key_value_heads,
+            self.qkv_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_ndims,
+            max_position=self.config.max_position_embeddings,
+            base=self.config.rope_theta,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_key_value_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class StablelmDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.self_attn = StablelmAttention(
+            config, layer_id=layer_id, prefix=add_prefix("self_attn", prefix)
+        )
+        self.mlp = StablelmMLP(
+            config, quant_config=quant_config, prefix=add_prefix("mlp", prefix)
+        )
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, residual
+
+
+class StableLMEpochModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                StablelmDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class StableLmForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = StableLMEpochModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = StableLmForCausalLM
diff --git a/python/sglang/srt/models/step3_vl.py b/python/sglang/srt/models/step3_vl.py
new file mode 100644
index 000000000..a93bf69e7
--- /dev/null
+++ b/python/sglang/srt/models/step3_vl.py
@@ -0,0 +1,1005 @@
+import logging
+import math
+from collections.abc import Iterable
+from math import sqrt
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, TypedDict, Union
+
+import torch
+from torch import nn
+from torch.nn import LayerNorm
+from torch.nn import functional as F
+from transformers import PretrainedConfig
+from transformers.activations import ACT2FN
+
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+    global_server_args_dict,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, log_info_on_rank0, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+"""
+Text Model
+"""
+
+
+class Step3TextMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Step3TextMoEMLP(nn.Module):
+    # Native
+    def __init__(
+        self,
+        layer_id: int,
+        config: Step3TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        if self.tp_size > config.moe_num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.moe_num_experts}."
+            )
+
+        self.topk = TopK(
+            top_k=config.moe_top_k,
+            renormalize=config.norm_expert_weight,
+            use_grouped_topk=False,
+        )
+
+        self.experts = get_moe_impl_class()(
+            num_experts=config.moe_num_experts,
+            top_k=config.moe_top_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            output_size=config.moe_num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if get_moe_a2a_backend().is_deepep():
+            raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class Step3TextAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        share_q_dim: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        rms_norm_eps=None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.all_tp_rank = get_tensor_model_parallel_rank()
+        self.total_num_heads = num_heads
+        self.attn_tp_rank = attn_tp_rank
+        self.layer_id = layer_id
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim
+        self.q_size = share_q_dim if share_q_dim else head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [self.q_size, self.kv_size, self.kv_size],
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=0,  # In fact, we need a MergedReplicatedLinear
+            tp_size=1,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.inter_norm = RMSNorm(self.q_size, eps=rms_norm_eps)
+
+        self.wq = ColumnParallelLinear(
+            self.q_size,
+            self.head_dim * self.total_num_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("wq", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q = self.inter_norm(q.contiguous())
+        q, _ = self.wq(q)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Step3TextDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Step3TextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        # TODO: support shared experts fusion
+        # self.n_shared_experts = 1
+        # self.num_fused_shared_experts = (
+        #     0
+        #     if global_server_args_dict["disable_shared_experts_fusion"]
+        #     else self.n_shared_experts
+        # )
+        self.num_fused_shared_experts = 0
+        rms_norm_eps = config.rms_norm_eps
+        self.self_attn = Step3TextAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=1,
+            head_dim=head_dim,
+            share_q_dim=config.share_q_dim,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=rms_norm_eps,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        moe_layers_enum = getattr(config, "moe_layers_enum", None)
+        if moe_layers_enum is not None:
+            moe_layers_idx = [int(i) for i in moe_layers_enum.strip().split(",")]
+        else:
+            # Default to 1dense.
+            moe_layers_idx = [i for i in range(1, config.num_hidden_layers)]
+
+        self.use_moe = False
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_id = layer_id
+        self.is_layer_sparse = True if layer_id in moe_layers_idx else False
+        self.is_previous_layer_sparse = (
+            True if layer_id - 1 in moe_layers_idx else False
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=self.is_previous_layer_sparse,
+        )
+
+        if not self.is_layer_sparse:
+            self.mlp = Step3TextMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act="silu",
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.use_moe = True
+            if self.num_fused_shared_experts == 0:
+                self.moe = Step3TextMoEMLP(
+                    layer_id=layer_id,
+                    config=config,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp", prefix),
+                )
+                self.share_expert = Step3TextMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.share_expert_dim,
+                    hidden_act="silu",
+                    quant_config=quant_config,
+                    prefix=add_prefix("share_expert", prefix),
+                )
+            else:
+                self.moe = Step3TextMoEMLP(
+                    layer_id=layer_id,
+                    config=config,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp", prefix),
+                )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+    def moe_mlp_forward(self, hidden_states):
+        if not self.num_fused_shared_experts:
+            h = hidden_states.clone()
+            hidden_states = self.moe(hidden_states)
+            hidden_states += self.share_expert(h)
+        else:
+            hidden_states = self.moe(hidden_states)
+        return hidden_states
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        if self.use_moe:
+            hidden_states = self.moe_mlp_forward(hidden_states)
+        else:
+            hidden_states = self.mlp(hidden_states)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Step3TextModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Step3TextDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if hidden_states.shape[0] != 0:
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+"""
+Vision Model
+"""
+
+
+def get_abs_pos(abs_pos, tgt_size):
+    dim = abs_pos.size(-1)
+    abs_pos_new = abs_pos.squeeze(0)
+    cls_token, old_pos_embed = abs_pos_new[:1], abs_pos_new[1:]
+
+    src_size = int(math.sqrt(abs_pos_new.shape[0] - 1))
+    tgt_size = int(math.sqrt(tgt_size))
+    dtype = abs_pos.dtype
+
+    if src_size != tgt_size:
+        old_pos_embed = (
+            old_pos_embed.view(1, src_size, src_size, dim)
+            .permute(0, 3, 1, 2)
+            .contiguous()
+        )
+        old_pos_embed = old_pos_embed.to(torch.float32)
+        new_pos_embed = F.interpolate(
+            old_pos_embed,
+            size=(tgt_size, tgt_size),
+            mode="bicubic",
+            antialias=True,
+            align_corners=False,
+        ).to(dtype)
+        new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
+        new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim)
+        vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0)
+        vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, dim)
+        return vision_pos_embed
+    else:
+        return abs_pos
+
+
+class Step3VisionMLP(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        intermediate_size: int,
+        bias: bool = True,
+        hidden_act="quick_gelu",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # Since this is a dense model,
+        # the MLP component likewise adopts a DP-MLP approach modeled after DP Attention.
+        # This choice may not represent the optimal solution and remains open to further deliberation.
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        self.fc1 = ColumnParallelLinear(
+            dim,
+            intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("gate_proj", prefix),
+        )
+        self.act = ACT2FN[hidden_act]  # quick_gelu
+        self.fc2 = RowParallelLinear(
+            intermediate_size,
+            dim,
+            bias=bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(self, hidden_states) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class Step3VisionAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 16,
+        qkv_backend="fa3",
+        quant_config=None,
+        prefix: str = "",
+    ) -> None:
+
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.out_proj = RowParallelLinear(
+            dim,
+            dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+        self.scale = self.head_dim**-0.5
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            rotary_embed="normal",
+            proj_bias=True,
+            qkv_backend=qkv_backend,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        attn_output = self.attn(hidden_states)
+        return attn_output
+
+
+class Step3VisionEmbeddings(nn.Module):
+
+    def __init__(self, config: Step3VisionEncoderConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(1, self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=True,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.pad_tp_size = 4  # hard code for padding
+        # To load the pretrained weights, we still use P+1 as the seqlen
+        self.position_embedding = torch.nn.Embedding(
+            self.num_patches + 1, self.embed_dim
+        )
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_patches + 1).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        patch_embeds = self.patch_embedding(
+            pixel_values
+        )  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        # pad
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + get_abs_pos(
+            self.position_embedding(self.position_ids), patch_embeds.size(1)
+        )
+        embeddings = torch.cat(
+            [
+                embeddings[:, 0, :].unsqueeze(1).repeat(1, self.pad_tp_size - 1, 1),
+                embeddings,
+            ],
+            dim=1,
+        )
+        return embeddings
+
+
+class Step3VisionEncoderLayer(nn.Module):
+    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = LayerNorm(self.embed_dim, eps=1e-6)
+        self.layer_norm2 = LayerNorm(self.embed_dim, eps=1e-6)
+
+        self.self_attn = Step3VisionAttention(
+            self.embed_dim, num_heads=config.num_attention_heads
+        )
+        self.mlp = Step3VisionMLP(
+            dim=self.embed_dim,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+
+    def forward(self, hidden_states) -> torch.Tensor:
+        hidden_states = hidden_states + self.layer_norm1(self.self_attn(hidden_states))
+        hidden_states = hidden_states + self.layer_norm2(self.mlp(hidden_states))
+        return hidden_states
+
+
+class Step3VisionTransformer(nn.Module):
+    def __init__(self, config: Step3VisionEncoderConfig):
+        super().__init__()
+        self.config = config
+        self.image_size = config.image_size
+        self.embeddings = Step3VisionEmbeddings(config)
+        self.transformer = Step3VisionEncoder(config)
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.embeddings.patch_embedding.weight.dtype
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ):
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.transformer(inputs_embeds=hidden_states)
+        return hidden_states
+
+
+class Step3VisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`Step3VisionEncoderLayer`].
+
+    Args:
+        config: StepVisionEncoderConfig
+    """
+
+    def __init__(self, config: Step3VisionEncoderConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [Step3VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]
+        )
+
+    def forward(
+        self,
+        inputs_embeds,
+    ) -> torch.Tensor:
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states,
+            )
+
+        return hidden_states
+
+
+class Step3VLForConditionalGeneration(nn.Module):
+
+    def __init__(
+        self,
+        config: Step3VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Step3TextModel(
+            config.text_config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        self.vision_model = Step3VisionTransformer(config.vision_config)
+
+        self.vit_downsampler = nn.Conv2d(
+            config.vision_config.hidden_size,
+            config.vision_config.output_hidden_size,
+            kernel_size=2,
+            stride=config.understand_projector_stride,
+        )
+        self.vit_downsampler2 = nn.Conv2d(
+            config.vision_config.output_hidden_size,
+            config.vision_config.output_hidden_size * 2,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        )
+        self.vit_large_projector = nn.Linear(
+            config.vision_config.output_hidden_size * 2,
+            config.hidden_size,
+            bias=config.projector_bias,
+        )
+
+        # TODO: support shared experts fusion
+        # self.n_shared_experts = 1
+        # self.num_fused_shared_experts = (
+        #     0
+        #     if global_server_args_dict["disable_shared_experts_fusion"]
+        #     else self.n_shared_experts
+        # )
+        self.num_fused_shared_experts = 0
+        self.config.tie_word_embeddings = False
+        if getattr(self.config, "tie_word_embeddings", False):
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.text_config.vocab_size,
+                config.text_config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+    def _get_vision_model_output(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        return self.vision_model(input_tensor)[:, 4:]
+
+    def _flatten_embeddings(self, embeddings) -> torch.Tensor:
+
+        if isinstance(embeddings, torch.Tensor):
+            # Flatten all but the last dimension.
+            return embeddings.flatten(0, -2)
+
+        return torch.cat(tuple(self._flatten_embeddings(t) for t in embeddings))
+
+    def _process_image_features(self, image_features: torch.Tensor) -> torch.Tensor:
+        B, P = image_features.shape[:2]
+        HW = int(sqrt(P))
+        image_features = image_features.permute(0, 2, 1).view(B, -1, HW, HW)
+        image_features = self.vit_downsampler(image_features)
+        image_features = self.vit_downsampler2(image_features)
+        n_dim = image_features.size(1)
+        image_features = image_features.view(B, n_dim, -1).permute(0, 2, 1)
+        image_features = self.vit_large_projector(image_features)
+        return image_features
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        assert len(items) == 1  # We only have images.
+
+        item = items[0]
+        pixel_values = item.feature.type(self.vision_model.dtype)
+        num_patches = item.model_specific_data.get("num_patches")
+        patch_pixel_values = item.model_specific_data.get("patch_pixel_values", None)
+        if patch_pixel_values is not None:
+            patch_pixel_values = patch_pixel_values.type(self.vision_model.dtype)
+
+        if patch_pixel_values is not None:
+            patch_pixel_values = patch_pixel_values.to("cuda")
+
+        image_features = self._get_vision_model_output(pixel_values)
+        patch_image_features = (
+            self._get_vision_model_output(patch_pixel_values)
+            if patch_pixel_values is not None
+            else None
+        )
+
+        image_features = self._process_image_features(image_features)
+        patch_image_features = (
+            self._process_image_features(patch_image_features)
+            if patch_image_features is not None
+            else None
+        )
+
+        merged_image_features = []
+        cur_patch_idx = 0
+        for i, num_patch in enumerate(num_patches):
+            cur_feature = []
+            if num_patch > 0:
+                patch_slice = patch_image_features[
+                    cur_patch_idx : cur_patch_idx + num_patch
+                ]
+                cur_feature.append(patch_slice.view(-1, patch_slice.shape[-1]))
+            cur_feature.append(image_features[i].view(-1, image_features.shape[-1]))
+            cur_patch_idx += num_patch
+            merged_image_features.append(
+                torch.cat(cur_feature) if len(cur_feature) > 1 else cur_feature[0]
+            )
+        return self._flatten_embeddings(merged_image_features)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", 0),
+            (".qkv_proj", ".k_proj", 1),
+            (".qkv_proj", ".v_proj", 2),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        if self.num_fused_shared_experts > 0:
+            assert self.num_fused_shared_experts == 1
+            log_info_on_rank0(logger, "Shared experts fusion optimization enabled.")
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.text_config.moe_num_experts
+            + self.num_fused_shared_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+
+        def match_expert_and_shard_ids(name_path: str, weight_path: str) -> bool:
+            name_parts = name_path.split(".")
+            weight_parts = weight_path.split(".")
+            shard_id_matches = name_parts[4] == weight_parts[2]
+            return shard_id_matches
+
+        for name, loaded_weight in weights:
+            if "vision_model" in name:
+                name = name.replace("self_attn", "self_attn.attn")
+                name = name.replace("out_proj", "proj")
+
+            # TODO: support vision model
+            if self.num_fused_shared_experts > 0 and "share" in name:
+                # assert False
+                name = name.replace("share_expert", "moe")
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if (
+                        expert_id != self.config.text_config.moe_num_experts
+                        or not match_expert_and_shard_ids(name, weight_name)
+                    ):
+                        continue
+
+                    part_name = weight_name.split(".")[-2]
+                    fake_weight_name = name.replace(part_name, weight_name[:-1])
+                    actual_param_name = name.replace(part_name + ".", param_name)
+                    param = params_dict[actual_param_name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "gate." not in name and "moe" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name)
+                break
+            else:
+                if "moe" not in name:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(name)
+                else:
+                    if "gate." in name:
+                        name = name.replace(weight_name, param_name)
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(param, loaded_weight)
+                        loaded_params.add(name)
+                        continue
+
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if expert_id == self.config.text_config.moe_num_experts:
+                            continue
+                        if not match_expert_and_shard_ids(name, weight_name):
+                            continue
+                        part_name = weight_name.split(".")[-2]
+                        fake_weight_name = name.replace(part_name, weight_name[:-1])
+                        actual_param_name = name.replace(part_name + ".", param_name)
+                        param = params_dict[actual_param_name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight[expert_id],
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                        loaded_params.add(actual_param_name)
+                        # Don't break here, because this 'loaded_weight' includes all the weights for this layer
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config: Step3VLConfig):
+        return ModelConfigForExpertLocation(
+            num_layers=config.text_config.num_hidden_layers,
+            num_logical_experts=config.text_config.moe_num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = Step3VLForConditionalGeneration
diff --git a/python/sglang/srt/models/torch_native_llama.py b/python/sglang/srt/models/torch_native_llama.py
new file mode 100644
index 000000000..00499ce66
--- /dev/null
+++ b/python/sglang/srt/models/torch_native_llama.py
@@ -0,0 +1,500 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""
+Inference-only LLaMA model compatible with HuggingFace weights.
+
+This model supports tensor parallelism (TP) using the PyTorch tensor parallel package.
+Reference: https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+
+Here is a quick example to enable TP:
+```python
+from sglang.srt.layers.model_parallel import tensor_parallel
+
+device_mesh = torch.distributed.init_device_mesh("cuda", (tp_size,))
+tensor_parallel(model, device_mesh)
+```
+
+An end-to-end example can be found in `python/sglang/bench_one_batch.py`.
+You can run it with the following command:
+```bash
+$ python3 -m sglang.bench_one_batch --correct \
+  --model meta-llama/Meta-Llama-3-8B \
+  --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' \
+  --tensor-parallel-size 2 \
+  --disable-cuda-graph
+```
+We will enable CUDA Graph support soon.
+"""
+
+import types
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn.parameter import Parameter
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+tp_size = get_tensor_model_parallel_world_size()
+tp_rank = get_tensor_model_parallel_rank()
+
+
+def gate_up_proj_weight_loader(
+    self,
+    param: Parameter,
+    loaded_weight: torch.Tensor,
+    loaded_shard_id: int,
+):
+    # shard_id: (shard_offset, shard_size)
+    gate_up_offsets = {}
+    current_shard_offset = 0
+    for i, output_size in enumerate(self.output_sizes):
+        # Everything shrinks by tp_size if TP enabled
+        output_size = output_size // tp_size
+        gate_up_offsets[i] = (current_shard_offset, output_size)
+        current_shard_offset += output_size
+    # Re-size the param to the size after TP
+    if current_shard_offset != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, current_shard_offset).clone()
+
+    # Now load gate or up
+    assert loaded_shard_id < len(self.output_sizes)
+    param_data = param.data
+    shard_offset, shard_size = gate_up_offsets[loaded_shard_id]
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
+
+
+class LlamaMLP(nn.Module):
+    _tp_plan = {
+        "gate_up_proj": "Colwise_Sharded",
+        "down_proj": "Rowwise",
+    }
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = torch.nn.Linear(
+            hidden_size,
+            intermediate_size * 2,
+            bias=False,
+        )
+        self.gate_up_proj.output_sizes = [intermediate_size] * 2
+        self.gate_up_proj.weight_loader = types.MethodType(
+            gate_up_proj_weight_loader, self.gate_up_proj
+        )
+        self.gate_up_proj.weight.weight_loader = self.gate_up_proj.weight_loader
+        self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x = self.down_proj(x)
+        return x
+
+
+def qkv_proj_weight_loader(
+    self,
+    param: Parameter,
+    loaded_weight: torch.Tensor,
+    loaded_shard_id: str,
+):
+    num_heads = self.num_heads // tp_size
+    num_kv_heads = self.num_kv_heads // tp_size
+    # shard_id: (shard_offset, shard_size)
+    qkv_offsets = {
+        "q": (0, num_heads * self.head_size),
+        "k": (num_heads * self.head_size, num_kv_heads * self.head_size),
+        "v": (
+            (num_heads + num_kv_heads) * self.head_size,
+            num_kv_heads * self.head_size,
+        ),
+    }
+    total_size = qkv_offsets["v"][0] + qkv_offsets["v"][1]
+    # Re-size the param to the size after TP
+    if total_size != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, total_size).clone()
+
+    # Now load q, k or v
+    shard_offset, shard_size = qkv_offsets[loaded_shard_id]
+    param_data = param.data
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
+
+
+class LlamaAttention(nn.Module):
+    _tp_plan = {
+        "qkv_proj": "Colwise_Sharded",
+        "o_proj": "Rowwise",
+    }
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = torch.nn.Linear(
+            hidden_size,
+            (self.total_num_heads + 2 * self.total_num_kv_heads) * self.head_dim,
+            bias=False,
+        )
+        self.qkv_proj.head_size = self.head_dim
+        self.qkv_proj.num_heads = self.total_num_heads
+        self.qkv_proj.num_kv_heads = self.total_num_kv_heads
+        self.qkv_proj.weight_loader = types.MethodType(
+            qkv_proj_weight_loader, self.qkv_proj
+        )
+        self.qkv_proj.weight.weight_loader = self.qkv_proj.weight_loader
+        self.qkv_proj.weight.output_dim = 0
+        self.o_proj = torch.nn.Linear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output = self.o_proj(attn_output)
+        return output
+
+
+class LlamaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = LlamaAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class TorchNativeLlamaForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.supports_torch_tp = True
+        self.model = LlamaModel(config, quant_config=quant_config)
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config)
+
+        # turning off autotune for fp8dq since it doesn't give speedup and
+        # increases compile time significantly
+        torch._inductor.config.max_autotune_gemm_backends = "ATEN"
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def get_module_name_from_weight_name(self, name):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id, num_shard)
+            ("qkv_proj", "q_proj", "q", 3),
+            ("qkv_proj", "k_proj", "k", 3),
+            ("qkv_proj", "v_proj", "v", 3),
+            ("gate_up_proj", "gate_proj", 0, 2),
+            ("gate_up_proj", "up_proj", 1, 2),
+        ]
+        for param_name, weight_name, shard_id, num_shard in stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def load_weights_to_module(
+        self,
+        fqn: str,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+    ):
+        """Load weights onto submodule pointed by path `fqn`."""
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        module = self.get_submodule(fqn)
+        params_dict = dict(module.named_parameters(prefix=fqn, recurse=False))
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") or name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") or name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+    ):
+        """Load weights onto the full model."""
+        self.load_weights_to_module("", weights)
+
+
+class TorchNativePhi3ForCausalLM(TorchNativeLlamaForCausalLM):
+    pass
+
+
+EntryClass = [TorchNativeLlamaForCausalLM, TorchNativePhi3ForCausalLM]
diff --git a/python/sglang/srt/models/transformers.py b/python/sglang/srt/models/transformers.py
new file mode 100644
index 000000000..40e7edcaf
--- /dev/null
+++ b/python/sglang/srt/models/transformers.py
@@ -0,0 +1,288 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/a1a2aaadb9122f05667140e39cf67e5736c8b6d6/vllm/model_executor/models/transformers.py
+"""Wrapper around `transformers` models"""
+import logging
+import re
+from typing import Iterable, Literal, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import AutoModel, PretrainedConfig, PreTrainedModel
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+
+logger = logging.getLogger(__name__)
+
+
+def maybe_prefix(prefix: str, name: str) -> str:
+    """Add a prefix to a name if the prefix is non-empty.
+
+    Args:
+        prefix: The prefix to add. If empty, no prefix will be added.
+        name: The name to potentially prefix.
+
+    Returns:
+        The string "prefix.name" if prefix was non-empty, otherwise just "name".
+    """
+    return name if not prefix else f"{prefix}.{name}"
+
+
+def sglang_flash_attention_forward(
+    # Transformers args
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor,
+    # sglang kwargs
+    forward_batch: ForwardBatch,
+    # Transformers kwargs
+    scaling: float = None,
+    attention_instances: list[RadixAttention] = None,
+    **kwargs,
+):
+    self_attn: RadixAttention = attention_instances[module.layer_idx]
+    if scaling is not None:
+        self_attn.scaling = float(scaling)
+    hidden = query.shape[-2]
+    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
+    return self_attn.forward(query, key, value, forward_batch=forward_batch), None
+
+
+ALL_ATTENTION_FUNCTIONS["sglang"] = sglang_flash_attention_forward
+
+
+class HFColumnParallelLinear(ColumnParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+class HFRowParallelLinear(RowParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+def replace_linear_class(
+    linear: nn.Linear,
+    style: Literal["colwise", "rowwise"],
+    quant_config: QuantizationConfig,
+) -> Union[ColumnParallelLinear, RowParallelLinear]:
+    """
+    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
+
+    Args:
+        linear (nn.Linear): `nn.Linear` to be replaced.
+        style (str): Tensor parallel style of the new linear, e.g. "colwise".
+        quant_config (QuantConfig): Quantization config for the new linear.
+    Returns:
+        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
+    """
+
+    if not isinstance(style, str):
+        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")
+
+    sglang_linear_cls = {
+        "colwise": ColumnParallelLinear,
+        "rowwise": RowParallelLinear,
+    }.get(style, ReplicatedLinear)
+
+    class HFCompatibleLinear(sglang_linear_cls):
+        """
+        Wrapper class that removes `output_bias` from returned output.
+        """
+
+        @property
+        def parent_cls(self) -> type:
+            return sglang_linear_cls
+
+        def forward(self, input: torch.Tensor) -> torch.Tensor:
+            return super().forward(input)[0]
+
+    return HFCompatibleLinear(
+        input_size=linear.in_features,
+        output_size=linear.out_features,
+        bias=linear.bias is not None,
+        quant_config=quant_config,
+    )
+
+
+class TransformersForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        logger.info("Using Transformers backend.")
+
+        self.quant_config = quant_config
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
+
+        # model is loaded under set_default_torch_dtype(model_config.dtype)
+        self.model: PreTrainedModel = AutoModel.from_config(
+            self.config,
+            torch_dtype=torch.get_default_dtype(),
+            attn_implementation="sglang",
+            trust_remote_code=True,
+        )
+
+        # Attention modifications (assumes 1 attention op per hidden layer)
+        tp_size = get_tensor_model_parallel_world_size()
+
+        # MLP modifications
+        self.tensor_parallel(tp_size)
+
+        head_dim = (
+            (config.hidden_size // config.num_attention_heads)
+            if not hasattr(config, "head_dim")
+            else config.head_dim
+        )
+        self.attention_instances = [
+            RadixAttention(
+                num_heads=divide(config.num_attention_heads, tp_size),
+                head_dim=head_dim,
+                # NOTE: We use Llama scale as default, if it's set by
+                # Transformers, it's updated in sglang_flash_attention_forward
+                scaling=head_dim**-0.5,
+                num_kv_heads=divide(config.num_key_value_heads, tp_size),
+                layer_id=i,
+                quant_config=self.quant_config,
+                prefix=f"{i}.attn",
+            )
+            for i in range(config.num_hidden_layers)
+        ]
+
+        # Model modifications
+        self.replace_vocab_embed_class(self.model)
+
+        # ForCausalLM modifications
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.get_input_embeddings().weight
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def log_replacement(self, name: str, old_module: nn.Module, new_module: nn.Module):
+        logger.debug("%s: %s -> %s", name, old_module, new_module)
+
+    def tensor_parallel(self, tp_size: int):
+        """
+        Apply the model's tensor parallelization plan.
+        Currently only supports linear layers.
+        """
+        tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}
+
+        if not tp_plan and tp_size > 1:
+            raise ValueError(
+                f"{type(self.model)} does not support tensor parallel yet!"
+            )
+
+        def _tensor_parallel(module: nn.Module, prefix: str = ""):
+            for child_name, child_module in module.named_children():
+                qual_name = maybe_prefix(prefix, child_name)
+                for pattern, style in tp_plan.items():
+                    if re.match(pattern, qual_name) and isinstance(
+                        child_module, nn.Linear
+                    ):
+                        new_module = replace_linear_class(
+                            child_module, style, self.quant_config
+                        )
+                        setattr(module, child_name, new_module)
+                        self.log_replacement(qual_name, child_module, new_module)
+                else:
+                    _tensor_parallel(child_module, prefix=qual_name)
+
+        _tensor_parallel(self.model)
+
+    def replace_vocab_embed_class(self, module: nn.Module):
+        # Use native set input embeddings
+        new_module = VocabParallelEmbedding(
+            self.vocab_size,
+            self.config.hidden_size,
+            org_num_embeddings=self.config.vocab_size,
+            quant_config=None,
+        )
+        self.log_replacement(
+            "input embedding", self.model.get_input_embeddings(), new_module
+        )
+        self.model.set_input_embeddings(new_module)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        assert get_embedding is False, "embedding is not supported yet"
+        aux_hidden_states = None
+        hidden_states = self.model(
+            input_ids[None, ...],
+            use_cache=False,
+            position_ids=positions[None, ...],
+            forward_batch=forward_batch,
+            attention_instances=self.attention_instances,
+            return_dict=False,
+        )[0][
+            0, ...
+        ]  # we remove batch dimension for now
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                name = f"{self.model.base_model_prefix}.{name}"
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [TransformersForCausalLM]
diff --git a/python/sglang/srt/models/vila.py b/python/sglang/srt/models/vila.py
new file mode 100644
index 000000000..2bb0b2d35
--- /dev/null
+++ b/python/sglang/srt/models/vila.py
@@ -0,0 +1,306 @@
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, cast
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.siglip import SiglipVisionConfig, SiglipVisionModel
+
+import sglang.srt.managers.mm_utils as mm_utils
+import sglang.srt.model_loader.weight_utils as weight_utils
+import sglang.srt.utils as utils
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+
+logger = logging.getLogger(__name__)
+
+
+##### BEGIN COPY configuration.py #####
+
+
+class VILAConfig(PretrainedConfig):
+    # Class attributes.
+    model_type: str = "vila"
+    sub_configs: Dict[str, PretrainedConfig] = {
+        "text_config": Qwen2Config(),
+        "vision_config": SiglipVisionConfig(),
+    }
+    _auto_class: Optional[str] = "AutoConfig"
+
+    # Configuration for sub-modules.
+    text_config: Qwen2Config = Qwen2Config()
+    vision_config: SiglipVisionConfig = SiglipVisionConfig()
+
+    # Model configuration.
+    hidden_size: int
+    image_token_id: int
+    mm_hidden_size: int
+    mm_projector_type: str
+    mm_vision_select_feature: str
+    mm_vision_select_layer: int
+    video_token_id: int
+
+    def __init__(
+        self,
+        text_config: Optional[Dict[str, Any]] = None,
+        vision_config: Optional[Dict[str, Any]] = None,
+        *,
+        hidden_size: int = 1536,
+        image_token_id: int = 151649,
+        mm_hidden_size: int = 1152,
+        mm_projector_type: str = "mlp_downsample_3x3_fix",
+        mm_vision_select_feature: str = "cls_patch",
+        mm_vision_select_layer: int = -2,
+        video_token_id: int = 151650,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.text_config = Qwen2Config(**text_config) if text_config else Qwen2Config()
+        self.vision_config = (
+            SiglipVisionConfig(**vision_config)
+            if vision_config
+            else SiglipVisionConfig()
+        )
+
+        self.hidden_size = hidden_size
+        self.image_token_id = image_token_id
+        self.mm_hidden_size = mm_hidden_size
+        self.mm_projector_type = mm_projector_type
+        self.mm_vision_select_feature = mm_vision_select_feature
+        self.mm_vision_select_layer = mm_vision_select_layer
+        self.video_token_id = video_token_id
+
+
+##### END COPY configuration.py #####
+
+##### BEGIN COPY modeling_vila.py #####
+
+
+class DownSample3x3BlockFix(nn.Module):
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
+
+        Returns:
+            The output tensor of shape (batch_size, image_pad_len, mm_hidden_size * 9).
+        """
+
+        batch_size, sequence_length, hidden_size = x.shape
+
+        feat_size = int(sequence_length**0.5)
+        if feat_size**2 != sequence_length:
+            raise ValueError(
+                f"Cannot take square root: sequence_length {sequence_length} is not a perfect square"
+            )
+
+        features = x.reshape(batch_size, feat_size, feat_size, hidden_size)
+
+        pad_after = (3 - feat_size % 3) % 3
+        if pad_after > 0:
+            features = F.pad(features, (0, 0, 0, pad_after, 0, pad_after))
+            feat_size = feat_size + pad_after
+
+        features = features.reshape(
+            batch_size, feat_size // 3, 3, feat_size // 3, 3, hidden_size
+        )
+        features = features.permute(0, 1, 3, 2, 4, 5).contiguous()
+        features = features.reshape(batch_size, -1, 9 * hidden_size)
+
+        return features
+
+
+class MultimodalProjector(nn.Module):
+    layers: nn.Sequential
+
+    def __init__(
+        self,
+        config: VILAConfig,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+
+        if config.mm_projector_type == "mlp_downsample_3x3_fix":
+            self.layers = nn.Sequential(
+                DownSample3x3BlockFix(),
+                nn.LayerNorm(config.mm_hidden_size * 9),
+                nn.Linear(
+                    config.mm_hidden_size * 9,
+                    config.mm_hidden_size * 3,
+                ),
+                nn.GELU(),
+                nn.LayerNorm(config.vision_config.hidden_size * 3),
+                nn.Linear(config.vision_config.hidden_size * 3, config.hidden_size),
+                nn.GELU(),
+                nn.Linear(config.hidden_size, config.hidden_size),
+            )
+        else:
+            raise NotImplementedError(
+                f"Unsupported mm_projector_type: {config.mm_projector_type}"
+            )
+
+        self.layers.type(config.torch_dtype)
+
+    @property
+    def device(self) -> torch.device:
+        return next(self.parameters()).device
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
+
+        Returns:
+            The output tensor of shape (batch_size, image_pad_len, hidden_size).
+        """
+
+        return self.layers(x.to(device=self.device, dtype=self.dtype))
+
+
+##### END COPY modeling_vila.py #####
+
+
+class VILAForConditionalGeneration(nn.Module):
+    config: VILAConfig
+    quant_config: Optional[QuantizationConfig]
+
+    logits_processor: LogitsProcessor
+    pooler: Pooler
+
+    llm: Qwen2ForCausalLM
+    mm_projector: MultimodalProjector
+    vision_tower: SiglipVisionModel
+
+    def __init__(
+        self,
+        config: VILAConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        self.llm = Qwen2ForCausalLM(
+            config=config.text_config,
+            quant_config=quant_config,
+            prefix=utils.add_prefix("llm", prefix),
+        )
+        self.mm_projector = MultimodalProjector(config)
+        self.vision_tower = SiglipVisionModel(config.vision_config)
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.config.torch_dtype
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        positions: Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        output = mm_utils.general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            get_embedding=get_embedding,
+            positions=positions,
+        )
+
+        return cast(LogitsProcessorOutput, output)
+
+    def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor:
+        pixel_values = cast(Tensor, mm_input[0].feature)
+
+        ##### BEGIN COPY modeling_vila.py #####
+
+        vision_tower_output: BaseModelOutputWithPooling = self.vision_tower.__call__(
+            pixel_values.to(
+                device=self.vision_tower.device, dtype=self.vision_tower.dtype
+            ),
+            output_hidden_states=True,
+        )
+
+        mm_projector_input = self._vision_tower_output_to_mm_projector_input(
+            vision_tower_output
+        )
+
+        image_embedding: Tensor = self.mm_projector.__call__(
+            mm_projector_input.to(
+                device=self.mm_projector.device, dtype=self.mm_projector.dtype
+            )
+        )
+
+        ##### END COPY modeling_vila.py #####
+
+        return image_embedding
+
+    def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> None:
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if name.startswith("llm."):
+                self.llm.load_weights([(name[len("llm.") :], loaded_weight)])
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param, "weight_loader", weight_utils.default_weight_loader
+                )
+                weight_loader(param, loaded_weight)
+
+    def pad_input_ids(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    ##### BEGIN COPY modeling_vila.py #####
+
+    def _vision_tower_output_to_mm_projector_input(
+        self,
+        vision_tower_output: BaseModelOutputWithPooling,
+    ) -> Tensor:
+        assert vision_tower_output.hidden_states is not None
+
+        selected_layer_hidden_states = vision_tower_output.hidden_states[
+            self.config.mm_vision_select_layer
+        ]
+
+        if self.config.mm_vision_select_feature == "cls_patch":
+            return selected_layer_hidden_states
+        else:
+            raise NotImplementedError(
+                f"Unsupported mm_vision_select_feature: {self.config.mm_vision_select_feature}"
+            )
+
+    ##### END COPY modeling_vila.py #####
+
+
+EntryClass = [VILAForConditionalGeneration]
diff --git a/python/sglang/srt/models/xverse.py b/python/sglang/srt/models/xverse.py
new file mode 100644
index 000000000..f84755b03
--- /dev/null
+++ b/python/sglang/srt/models/xverse.py
@@ -0,0 +1,382 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/xverse.py#L1
+"""Inference-only XVERSE model compatible with HuggingFace weights."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.model_runner import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class XverseMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class XverseAttention(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class XverseDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        num_kv_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
+        self.self_attn = XverseAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=num_kv_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = XverseMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class XverseModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                XverseDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+            # print(f"layer[{i}].hidden_states: {hidden_states}")
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class XverseForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = XverseModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
+    ):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        def load_weights_per_param(name, loaded_weight):
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                return
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                return
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name.startswith("model.vision_tower") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    return
+                if name.startswith("model.vision_tower") and name not in params_dict:
+                    return
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+        if name is None or loaded_weight is None:
+            for name, loaded_weight in weights:
+                load_weights_per_param(name, loaded_weight)
+        else:
+            load_weights_per_param(name, loaded_weight)
+
+
+EntryClass = XverseForCausalLM
diff --git a/python/sglang/srt/models/xverse_moe.py b/python/sglang/srt/models/xverse_moe.py
new file mode 100644
index 000000000..6067acec6
--- /dev/null
+++ b/python/sglang/srt/models/xverse_moe.py
@@ -0,0 +1,478 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only XVERSE MoE model."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class XverseMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class XverseMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.n_routed_experts = config.num_experts
+        self.top_k = config.moe_top_k
+        if self.tp_size > self.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.n_routed_experts}."
+            )
+
+        self.experts = nn.ModuleList(
+            [
+                XverseMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=quant_config,
+                    reduce_results=False,
+                    prefix=add_prefix(f"experts.{i}", prefix),
+                )
+                for i in range(self.n_routed_experts)
+            ]
+        )
+        self.pack_params()
+        self.moe_runner_config = MoeRunnerConfig(inplace=True)
+
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("router", prefix),
+        )
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=getattr(self.config, "norm_topk_prob", False),
+        )
+
+        if config.num_shared_experts is not None:
+            intermediate_size = config.intermediate_size * config.num_shared_experts
+            self.shared_experts = XverseMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+
+    def pack_params(self):
+        w1 = []
+        w2 = []
+        for expert in self.experts:
+            w1.append(expert.gate_up_proj.weight)
+            w2.append(expert.down_proj.weight)
+        self.w1 = torch._utils._flatten_dense_tensors(w1)
+        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
+        for data, param in zip(w1s, w1):
+            param.data = data
+        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
+
+        self.w2 = torch._utils._flatten_dense_tensors(w2)
+        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
+        for data, param in zip(w2s, w2):
+            param.data = data
+
+        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        if self.config.num_shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.router(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = fused_moe(
+            hidden_states,
+            self.w1,
+            self.w2,
+            topk_output,
+            self.moe_runner_config,
+        )
+
+        if self.config.num_shared_experts is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class XverseAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class XverseDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        num_key_value_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
+        self.self_attn = XverseAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        if config.num_experts is not None:
+            self.mlp = XverseMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = XverseMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class XverseModel(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                XverseDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class XverseMoeForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = XverseModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = XverseMoeForCausalLM
diff --git a/python/sglang/srt/models/yivl.py b/python/sglang/srt/models/yivl.py
new file mode 100644
index 000000000..4c50b0d3c
--- /dev/null
+++ b/python/sglang/srt/models/yivl.py
@@ -0,0 +1,115 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Yi-VL model."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import CLIPVisionModel, LlavaConfig
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llava import LlavaLlamaForCausalLM
+
+
+class YiVLForCausalLM(LlavaLlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix=prefix)
+
+        self.multi_modal_projector = YiVLMultiModalProjector(self.config)
+        self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
+            "./", ""
+        )  # Everything after "./"
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # We have to use the subfolder of the main model directory (e.g. 01-ai/Yi-VL-6B)
+        self.vision_tower = CLIPVisionModel.from_pretrained(
+            self.config._name_or_path,
+            torch_dtype=torch.float16,
+            subfolder=self.vision_tower_subfolder,
+        ).to("cuda")
+
+        self.vision_tower.eval()
+
+        self.vision_feature_layer = self.config.mm_vision_select_layer
+        self.vision_feature_select_strategy = self.config.mm_vision_select_feature
+        self.image_size = self.vision_tower.config.image_size
+        self.patch_size = self.vision_tower.config.patch_size
+
+        self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+        self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+        self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
+
+        self.image_feature_len = int((self.image_size / self.patch_size) ** 2)
+        if self.vision_feature_select_strategy == "patch":
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+
+        # load mm_projector
+        # TODO: support TP?
+        projector_weights = {
+            "model.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.mm_projector.1": "multi_modal_projector.ln_1",
+            "model.mm_projector.3": "multi_modal_projector.linear_2",
+            "model.mm_projector.4": "multi_modal_projector.ln_2",
+            "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+        }
+        params_dict = dict(self.named_parameters())
+        weights = list(weights)
+        for name, loaded_weight in weights:
+            if "projector" in name or "vision_tower" in name:
+                for weight_name, param_name in projector_weights.items():
+                    if weight_name in name:
+                        name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+        # load language model
+        self.language_model.load_weights(weights)
+
+
+class YiVLMultiModalProjector(nn.Module):
+    def __init__(self, config: LlavaConfig):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size
+        )
+        self.ln_1 = nn.LayerNorm(config.text_config.hidden_size)
+        self.act = nn.GELU()
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size
+        )
+        self.ln_2 = nn.LayerNorm(config.text_config.hidden_size)
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.ln_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        hidden_states = self.ln_2(hidden_states)
+        return hidden_states
+
+
+EntryClass = YiVLForCausalLM
diff --git a/python/sglang/srt/multimodal/mm_utils.py b/python/sglang/srt/multimodal/mm_utils.py
new file mode 100644
index 000000000..c399be806
--- /dev/null
+++ b/python/sglang/srt/multimodal/mm_utils.py
@@ -0,0 +1,349 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Source: https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/llava/mm_utils.py
+"""
+Utilities for multi-modal models.
+
+This python file mainly contains utilities that were used in the
+image processing logic of llava-next including operations such as
+anyres and anyres_max
+
+Currently supports the anyres and anyres_max operation for CLIP and
+SigLip. For more information, you may refer to the paper or the blog
+
+LLaVA-NeXT : https://llava-vl.github.io/blog/2024-01-30-llava-next/
+LLaVA-Onevision : https://arxiv.org/pdf/2408.03326
+
+"""
+import ast
+import math
+import re
+from io import BytesIO
+
+import numpy as np
+import pybase64
+from PIL import Image
+
+from sglang.srt.utils import flatten_nested_list
+
+
+def has_valid_data(data) -> bool:
+    if data is None:
+        return False
+    if isinstance(data, list):
+        return any(has_valid_data(item) for item in flatten_nested_list(data))
+    return True
+
+
+def select_best_resolution(original_size, possible_resolutions):
+    """
+    Selects the best resolution from a list of possible resolutions based on the original size.
+
+    Args:
+        original_size (tuple): The original size of the image in the format (width, height).
+        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+
+    Returns:
+        tuple: The best fit resolution in the format (width, height).
+    """
+    original_width, original_height = original_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+
+    for width, height in possible_resolutions:
+        # Calculate the downscaled size to keep the aspect ratio
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(
+            original_height * scale
+        )
+
+        # Calculate effective and wasted resolutions
+        effective_resolution = min(
+            downscaled_width * downscaled_height, original_width * original_height
+        )
+        wasted_resolution = (width * height) - effective_resolution
+
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution
+            and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (width, height)
+
+    return best_fit
+
+
+def resize_and_pad_image(image, target_resolution):
+    """
+    Resize and pad an image to a target resolution while maintaining aspect ratio.
+
+    Args:
+        image (PIL.Image.Image): The input image.
+        target_resolution (tuple): The target resolution (width, height) of the image.
+
+    Returns:
+        PIL.Image.Image: The resized and padded image.
+    """
+    original_width, original_height = image.size
+    target_width, target_height = target_resolution
+
+    scale_w = target_width / original_width
+    scale_h = target_height / original_height
+
+    if scale_w < scale_h:
+        new_width = target_width
+        new_height = min(math.ceil(original_height * scale_w), target_height)
+    else:
+        new_height = target_height
+        new_width = min(math.ceil(original_width * scale_h), target_width)
+
+    # Resize the image
+    resized_image = image.resize((new_width, new_height))
+
+    new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
+    paste_x = (target_width - new_width) // 2
+    paste_y = (target_height - new_height) // 2
+    new_image.paste(resized_image, (paste_x, paste_y))
+
+    return new_image
+
+
+def divide_to_patches(image, patch_size):
+    """
+    Divides an image into patches of a specified size.
+
+    Args:
+        image (PIL.Image.Image): The input image.
+        patch_size (int): The size of each patch.
+
+    Returns:
+        list: A list of PIL.Image.Image objects representing the patches.
+    """
+    patches = []
+    width, height = image.size
+    for i in range(0, height, patch_size):
+        for j in range(0, width, patch_size):
+            box = (j, i, j + patch_size, i + patch_size)
+            patch = image.crop(box)
+            patches.append(patch)
+
+    return patches
+
+
+def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
+    """
+    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+    Args:
+        image_size (tuple): The size of the input image in the format (width, height).
+        grid_pinpoints (str): A string representation of a list of possible resolutions.
+        patch_size (int): The size of each image patch.
+
+    Returns:
+        tuple: The shape of the image patch grid in the format (width, height).
+    """
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        assert patch_size in [
+            224,
+            336,
+            384,
+            448,
+            512,
+        ], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [
+            (i, j)
+            for i in range(range_start[0], range_end[0] + 1)
+            for j in range(range_start[1], range_end[1] + 1)
+        ]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+    if type(grid_pinpoints) is list:
+        possible_resolutions = grid_pinpoints
+    else:
+        possible_resolutions = ast.literal_eval(grid_pinpoints)
+    width, height = select_best_resolution(image_size, possible_resolutions)
+    return width // patch_size, height // patch_size
+
+
+def process_anyres_image(image, processor, grid_pinpoints):
+    """
+    Process an image with variable resolutions.
+
+    Args:
+        image (PIL.Image.Image): The input image to be processed.
+        processor: The image processor object.
+        grid_pinpoints (str): A string representation of a list of possible resolutions.
+
+    Returns:
+        np.array: An np array containing the processed image patches.
+    """
+    if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
+        try:
+            patch_size = processor.size[0]
+        except Exception as e:
+            patch_size = processor.size["shortest_edge"]
+        assert patch_size in [
+            224,
+            336,
+            384,
+            448,
+            512,
+        ], "patch_size should be in [224, 336, 384, 448, 512]"
+        # Use regex to extract the range from the input string
+        matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
+        range_start = tuple(map(int, matches[0]))
+        range_end = tuple(map(int, matches[-1]))
+        # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
+        grid_pinpoints = [
+            (i, j)
+            for i in range(range_start[0], range_end[0] + 1)
+            for j in range(range_start[1], range_end[1] + 1)
+        ]
+        # Multiply all elements by patch_size
+        grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
+
+    if type(grid_pinpoints) is list:
+        possible_resolutions = grid_pinpoints
+    else:
+        possible_resolutions = ast.literal_eval(grid_pinpoints)
+    best_resolution = select_best_resolution(image.size, possible_resolutions)
+    image_padded = resize_and_pad_image(image, best_resolution)
+
+    # For Siglip processor, only have size but no crop size
+    crop_size = (
+        processor.crop_size["height"]
+        if "crop_size" in processor.__dict__
+        else processor.size["height"]
+    )
+    shortest_edge = (
+        processor.size["shortest_edge"]
+        if "shortest_edge" in processor.size
+        else processor.size["height"]
+    )
+    patches = divide_to_patches(image_padded, crop_size)
+
+    image_original_resize = image.resize((shortest_edge, shortest_edge))
+
+    image_patches = [image_original_resize] + patches
+    image_patches = [
+        processor.preprocess(image_patch.convert("RGB"))["pixel_values"][0]
+        for image_patch in image_patches
+    ]
+    return np.stack(image_patches, axis=0)
+
+
+def load_image_from_base64(image):
+    return Image.open(BytesIO(pybase64.b64decode(image, validate=True)))
+
+
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    if pil_img.mode == "L":
+        pil_img = pil_img.convert("RGB")
+    if width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+
+
+def unpad_image(tensor, original_size):
+    """
+    Unpads a PyTorch tensor of a padded and resized image.
+
+    Args:
+    tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
+    original_size (tuple): The original size of the image (height, width).
+
+    Returns:
+    torch.Tensor: The unpadded image tensor.
+    """
+    original_width, original_height = original_size
+    current_height, current_width = tensor.shape[1:]
+
+    original_aspect_ratio = original_width / original_height
+    current_aspect_ratio = current_width / current_height
+
+    if original_aspect_ratio > current_aspect_ratio:
+        scale_factor = current_width / original_width
+        new_height = int(original_height * scale_factor)
+        padding = (current_height - new_height) // 2
+        unpadded_tensor = tensor[:, padding : current_height - padding, :]
+    else:
+        scale_factor = current_height / original_height
+        new_width = int(original_width * scale_factor)
+        padding = (current_width - new_width) // 2
+        unpadded_tensor = tensor[:, :, padding : current_width - padding]
+
+    return unpadded_tensor
+
+
+def unpad_image_shape(current_height, current_width, original_size):
+    """
+    Unpads a PyTorch tensor of a padded and resized image
+    and returns the new shape.
+    """
+    original_width, original_height = original_size
+
+    original_aspect_ratio = original_width / original_height
+    current_aspect_ratio = current_width / current_height
+
+    if original_aspect_ratio > current_aspect_ratio:
+        scale_factor = current_width / original_width
+        new_height = int(original_height * scale_factor)
+        padding = (current_height - new_height) // 2
+        new_shape = (current_height - 2 * padding, current_width)
+    else:
+        scale_factor = current_height / original_height
+        new_width = int(original_width * scale_factor)
+        padding = (current_width - new_width) // 2
+        new_shape = (current_height, current_width - 2 * padding)
+
+    return new_shape
+
+
+def process_images(images, image_processor, model_cfg):
+    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
+    new_images = []
+    if image_aspect_ratio == "pad":
+        for image in images:
+            image = expand2square(
+                image, tuple(int(x * 255) for x in image_processor.image_mean)
+            )
+            image = image_processor.preprocess(image)["pixel_values"][0]
+            new_images.append(image)
+    elif "anyres" in image_aspect_ratio:
+        for image in images:
+            image = process_anyres_image(
+                image, image_processor, model_cfg.image_grid_pinpoints
+            )
+            new_images.append(image)
+    else:
+        return image_processor(images)["pixel_values"]
+    if all(x.shape == new_images[0].shape for x in new_images):
+        new_images = np.stack(new_images, axis=0)
+    return new_images
diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
new file mode 100644
index 000000000..cc14f691f
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -0,0 +1,653 @@
+import concurrent
+import concurrent.futures
+import dataclasses
+import multiprocessing as mp
+import os
+import re
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers import BaseImageProcessorFast
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.utils import is_npu, load_audio, load_image, load_video, logger
+
+_is_npu = is_npu()
+
+
+@dataclasses.dataclass
+class BaseMultiModalProcessorOutput:
+    # input_text, with each frame of video/image represented with a image_token
+    input_text: str
+
+    # frames loaded from image, in given order
+    images: Optional[list[Union[Image.Image, dict]]] = dataclasses.field(
+        default_factory=list
+    )
+
+    # videos
+    videos: Optional[list[Union[torch.Tensor, dict]]] = dataclasses.field(
+        default_factory=list
+    )
+
+    # audios
+    audios: Optional[list[Union[np.ndarray, dict]]] = dataclasses.field(
+        default_factory=list
+    )
+
+    def organize_results(self) -> List[Tuple[Modality, Any]]:
+        """
+
+        :return: a list of results, with their corresponding modalities
+        """
+        return (
+            [(Modality.IMAGE, data) for data in self.images]
+            + [(Modality.VIDEO, data) for data in self.videos]
+            + [(Modality.AUDIO, data) for data in self.audios]
+        )
+
+
+@dataclasses.dataclass
+class MultimodalSpecialTokens:
+    image_token: Optional[Union[str, List[str]]] = None
+    video_token: Optional[Union[str, List[str]]] = None
+    audio_token: Optional[Union[str, List[str]]] = None
+
+    image_token_id: Optional[int] = None
+    video_token_id: Optional[int] = None
+    audio_token_id: Optional[int] = None
+
+    image_token_regex: Optional[re.Pattern] = None
+    video_token_regex: Optional[re.Pattern] = None
+    audio_token_regex: Optional[re.Pattern] = None
+
+    combined_regex: Optional[re.Pattern] = None
+
+    def build(self, processor):
+        self.convert_to_strs(processor)
+        self.parse_regex()
+        self.get_combined_regex()
+        return self
+
+    def convert_to_str(self, token: Union[str, int], processor) -> str:
+        if token is None:
+            return token
+        if isinstance(token, str):
+            return token
+        return processor.tokenizer.convert_ids_to_tokens([token])[0]
+
+    def convert_to_strs(self, processor):
+        if not self.image_token:
+            self.image_token = self.convert_to_str(self.image_token_id, processor)
+        if not self.video_token:
+            self.video_token = self.convert_to_str(self.video_token_id, processor)
+        if not self.audio_token:
+            self.audio_token = self.convert_to_str(self.audio_token_id, processor)
+
+    def get_modality_of_token(self, token: str) -> Optional[Modality]:
+        """
+        :return: the modality associated with the given token, if the token is a special_token or matches with the multimodal token regex
+        """
+        modality = {
+            self.image_token: Modality.IMAGE,
+            self.video_token: Modality.VIDEO,
+            self.audio_token: Modality.AUDIO,
+        }.get(token)
+        if modality:
+            return modality
+
+        for regex, modality in [
+            (self.image_token_regex, Modality.IMAGE),
+            (self.video_token_regex, Modality.VIDEO),
+            (self.audio_token_regex, Modality.AUDIO),
+        ]:
+            if regex and regex.match(token):
+                return modality
+
+        return None
+
+    def get_token_id_by_modality(self, modality: Modality) -> Optional[int]:
+        return {
+            Modality.IMAGE: self.image_token_id,
+            Modality.MULTI_IMAGES: self.image_token_id,
+            Modality.VIDEO: self.video_token_id,
+            Modality.AUDIO: self.audio_token_id,
+        }.get(modality)
+
+    def parse_regex(self):
+        if self.image_token_regex is None and self.image_token is not None:
+            self.image_token_regex = re.compile(re.escape(self.image_token))
+        if self.video_token_regex is None and self.video_token is not None:
+            self.video_token_regex = re.compile(re.escape(self.video_token))
+        if self.audio_token_regex is None and self.audio_token is not None:
+            self.audio_token_regex = re.compile(re.escape(self.audio_token))
+
+    def get_combined_regex(self) -> re.Pattern:
+        """
+        Builds and returns a regex, used to split input str into tokens (with mm special tokens)
+        """
+        if self.combined_regex:
+            return self.combined_regex
+        tokens = [
+            self.image_token_regex,
+            self.video_token_regex,
+            self.audio_token_regex,
+        ]
+        patterns = []
+        flags = 0
+        for t in tokens:
+            if t is not None:
+                patterns.append(t.pattern)
+                flags |= t.flags
+        combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
+        self.combined_regex = re.compile(combined, flags)
+        return self.combined_regex
+
+
+class BaseMultimodalProcessor(ABC):
+    models = []
+
+    def __init__(
+        self, hf_config, server_args, _processor, transport_mode, *args, **kwargs
+    ):
+        self.hf_config = hf_config
+        self._processor = _processor
+        self.arch = hf_config.architectures[0]
+        self.server_args = server_args
+        self.transport_mode = transport_mode
+
+        # FIXME: not accurate, model and image specific
+        self.NUM_TOKEN_PER_FRAME = 330
+
+        self.io_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=int(os.environ.get("SGLANG_IO_WORKERS", 4))
+        )
+        self.cpu_executor = concurrent.futures.ProcessPoolExecutor(
+            mp_context=mp.get_context("fork"),
+            max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
+        )
+
+        # Mapping from attribute names to modality types
+        self.ATTR_NAME_TO_MODALITY = {
+            # Image-related attributes
+            "pixel_values": Modality.IMAGE,
+            "image_sizes": Modality.IMAGE,
+            "image_grid_thw": Modality.IMAGE,
+            "image_attention_mask": Modality.IMAGE,
+            "image_emb_mask": Modality.IMAGE,
+            "images_spatial_crop": Modality.IMAGE,
+            "tgt_size": Modality.IMAGE,
+            "image_grid_hws": Modality.IMAGE,
+            "aspect_ratio_ids": Modality.IMAGE,
+            "aspect_ratio_mask": Modality.IMAGE,
+            "num_patches": Modality.IMAGE,
+            "patch_pixel_values": Modality.IMAGE,
+            # Audio-related attributes
+            "audio_features": Modality.AUDIO,
+            "audio_feature_lens": Modality.AUDIO,
+            "input_features": Modality.AUDIO,
+            "input_features_mask": Modality.AUDIO,
+            "audio_attention_mask": Modality.AUDIO,
+            # Video-related attributes
+            "pixel_values_videos": Modality.VIDEO,
+            "second_per_grid_ts": Modality.VIDEO,
+            "video_grid_thw": Modality.VIDEO,
+            # Generic attributes that could apply to multiple modalities
+            # "precomputed_embeddings" - handled specially as it can be any modality
+        }
+
+        # name of the feature filed
+        # TODO: pass from processors
+        self.FEATURE_NAMES = [
+            "pixel_values",
+            "pixel_values_videos",
+            "audio_features",
+            "input_features",
+        ]
+
+    def process_mm_data(
+        self, input_text, images=None, videos=None, audios=None, **kwargs
+    ) -> dict:
+        """
+        process multimodal data with transformers AutoProcessor
+        """
+        if images:
+            kwargs["images"] = images
+        if videos:
+            kwargs["videos"] = videos
+        if audios:
+            if self._processor.__class__.__name__ in {
+                "Gemma3nProcessor",
+                "Qwen2AudioProcessor",
+            }:
+                # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
+                kwargs["audio"] = audios
+            else:
+                kwargs["audios"] = audios
+
+        processor = self._processor
+        if (
+            hasattr(processor, "image_processor")
+            and isinstance(processor.image_processor, BaseImageProcessorFast)
+            and not self.server_args.disable_fast_image_processor
+        ):
+            kwargs["device"] = "cuda" if not _is_npu else "npu"
+        result = processor.__call__(
+            text=[input_text],
+            padding=True,
+            return_tensors="pt",
+            **kwargs,
+        )
+        # move feature tensors to cpu
+        for feature_name in self.FEATURE_NAMES:
+            if feature_name in result and isinstance(
+                result[feature_name], torch.Tensor
+            ):
+                result[feature_name] = result[feature_name].to("cpu")
+
+        return result
+
+    @abstractmethod
+    async def process_mm_data_async(
+        self,
+        image_data,
+        audio_data,
+        input_text,
+        request_obj,
+        **kwargs,
+    ) -> Optional[Dict[str, Any]]:
+        pass
+
+    def get_estimated_frames_list(self, image_data):
+        """
+        estimate the total frame count from all visual input
+        """
+        # Lazy import because decord is not available on some arm platforms.
+        from decord import VideoReader, cpu
+
+        # Before processing inputs
+        if not image_data or len(image_data) == 0:
+            return []
+        estimated_frames_list = []
+        for image in image_data:
+            if isinstance(image, str) and image.startswith("video:"):
+                path = image[len("video:") :]
+                # Estimate frames for the video
+                vr = VideoReader(path, ctx=cpu(0))
+                num_frames = len(vr)
+            else:
+                # For images, each contributes one frame
+                num_frames = 1
+            estimated_frames_list.append(num_frames)
+
+        return estimated_frames_list
+
+    @staticmethod
+    def _load_single_item(
+        data,
+        modality: Modality,
+        frame_count_limit=None,
+        audio_sample_rate: Optional[int] = None,
+        discard_alpha_channel=True,
+    ):
+        """
+        Load a single multimodal data.
+
+        If data is precomputed, returns directly.
+
+        Static method that can be pickled for multiprocessing"""
+        if isinstance(data, dict):
+            return data
+        try:
+            if modality == Modality.IMAGE:
+                img, _ = load_image(data)
+                return img.convert("RGB") if discard_alpha_channel else img
+            elif modality == Modality.VIDEO:
+                return load_video(data, frame_count_limit)
+            elif modality == Modality.AUDIO:
+                return load_audio(data, audio_sample_rate)
+
+        except Exception as e:
+            raise RuntimeError(f"Error while loading data {data}: {e}")
+
+    def submit_data_loading_tasks(
+        self,
+        text_parts: List[str],
+        multimodal_tokens: MultimodalSpecialTokens,
+        data_iterators: dict[Modality, Iterator[Any]],
+        discard_alpha_channel: bool = True,
+        image_estimated_frames_iter: Optional[iter] = None,
+        image_scaling_factor: float = 1.0,
+        max_image_frames: int = 30,
+        audio_sample_rate: Optional[int] = None,
+    ) -> Tuple[List, List]:
+        """
+        load multimodal data parallelly using iterators.
+        """
+        futures = []
+        task_info = []
+
+        for text_part in text_parts:
+            modality = multimodal_tokens.get_modality_of_token(text_part)
+            if modality is not None:
+                data_iterator = data_iterators.get(modality)
+                if data_iterator is None:
+                    raise ValueError(f"No data iterator found for token: {text_part}")
+
+                try:
+                    data = next(data_iterator)
+                except StopIteration:
+                    raise ValueError(
+                        f"Mismatch: More '{text_part}' tokens found than corresponding data items provided."
+                    )
+
+                frame_count_limit = None
+                if modality == Modality.IMAGE and image_estimated_frames_iter:
+                    try:
+                        estimated_frames = next(image_estimated_frames_iter)
+                        # Use the pre-calculated scaling factor and max frames
+                        frame_count_limit = max(
+                            1, int(estimated_frames * image_scaling_factor)
+                        )
+                        # Ensure we don't exceed the absolute max (redundant if scaling_factor handles it)
+                        # frame_count_limit = min(frame_count_limit, max_image_frames)
+                    except StopIteration:
+                        raise ValueError(
+                            "Mismatch between image tokens and estimated frame counts."
+                        )
+
+                futures.append(
+                    self.io_executor.submit(
+                        BaseMultimodalProcessor._load_single_item,
+                        data,
+                        modality,
+                        frame_count_limit,
+                        audio_sample_rate,
+                        discard_alpha_channel,
+                    )
+                )
+                task_info.append((modality, data, frame_count_limit))
+
+        for modality, iterator in data_iterators.items():
+            try:
+                next(iterator)
+                logger.warning(
+                    f"Warning: More {modality.name.lower()} data items provided than corresponding tokens found in the prompt."
+                )
+            except StopIteration:
+                pass
+            except Exception:
+                pass
+
+        return futures, task_info
+
+    def load_mm_data(
+        self,
+        prompt: str,
+        multimodal_tokens: MultimodalSpecialTokens,
+        image_data: Optional[list] = None,
+        video_data: Optional[list] = None,
+        audio_data: Optional[list] = None,
+        return_text: Optional[bool] = True,
+        discard_alpha_channel: bool = True,
+        audio_sample_rate: Optional[int] = None,
+    ) -> BaseMultiModalProcessorOutput:
+        """
+        Each frame of video/image will be replaced by a single image token
+
+        Args:
+            multimodal_tokens (list[str]): list of special token which denoting a single multimodal data
+                e.g. image token or audio token
+            discard_alpha_channel: if True, discards the alpha channel in the returned images
+
+        """
+        multimodal_tokens_pattern = multimodal_tokens.get_combined_regex()
+
+        if isinstance(prompt, list) and return_text:
+            assert len(prompt) and isinstance(prompt[0], int)
+            prompt = self._processor.tokenizer.decode(prompt)
+        else:
+            prompt = prompt
+
+        assert isinstance(prompt, str)
+        # split text into list of normal text and special tokens
+        text_parts = re.split(multimodal_tokens_pattern, prompt)
+
+        # collect all data
+        data_iterators = {}
+        if multimodal_tokens.image_token and image_data:
+            data_iterators[Modality.IMAGE] = iter(image_data)
+        if multimodal_tokens.video_token and video_data:
+            data_iterators[Modality.VIDEO] = iter(video_data)
+        if multimodal_tokens.audio_token and audio_data:
+            data_iterators[Modality.AUDIO] = iter(audio_data)
+
+        # futures: the futures of loaded data
+        # task_info: modality, raw_data, and other metadata of each data
+        futures, task_info = self.submit_data_loading_tasks(
+            text_parts=text_parts,
+            multimodal_tokens=multimodal_tokens,
+            data_iterators=data_iterators,
+            discard_alpha_channel=discard_alpha_channel,
+            audio_sample_rate=audio_sample_rate,
+        )
+        task_info_iter = iter(task_info)
+        futures_iter = iter(futures)
+
+        # Process results
+        images, videos, audios = [], [], []
+        new_text_parts = []
+        for text_part in text_parts:
+            try:
+                if multimodal_tokens_pattern.match(text_part):
+                    modality, raw_data, frame_limit = next(task_info_iter)
+                    is_precomputed = isinstance(raw_data, dict)
+                    result = next(futures_iter).result()
+
+                    if modality == Modality.IMAGE:
+                        # If data is already processed it will be a
+                        # dictionary(precomputed). In this case we want to keep the
+                        # expanded tokens in text_part. Otherwise, we will
+                        # call the processor code, so keep only a single image
+                        # token.
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.image_token
+                        )
+                        frames = [result] if not isinstance(result, list) else result
+                        if frames:
+                            # only for minicpmv
+                            images += frames
+                            new_text_parts += mm_tokens * len(frames)
+                    elif modality == Modality.VIDEO:
+                        # load as video
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.video_token
+                        )
+                        videos += [result]
+                        new_text_parts += mm_tokens
+                    elif modality == Modality.AUDIO:
+                        # audio
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.audio_token
+                        )
+                        audios += [result]
+                        new_text_parts += mm_tokens
+                else:
+                    # normal text
+                    new_text_parts += [text_part]
+
+            except Exception as e:
+                raise RuntimeError(
+                    f"An exception occurred while loading multimodal data: {e}"
+                )
+        return BaseMultiModalProcessorOutput(
+            images=images,
+            audios=audios,
+            videos=videos,
+            input_text="".join(new_text_parts),
+        )
+
+    @staticmethod
+    def get_mm_items_offset(
+        input_ids: torch.Tensor, mm_token_id: int
+    ) -> List[Tuple[int, int]]:
+        """
+        Get a set of range for mm_items from input_ids
+        Example:
+            input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
+            mm_token_id = 3
+            return result = [(2,4),(6,7)]
+        """
+        mask = input_ids == mm_token_id
+        start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
+        end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
+
+        return list(zip(start_positions.tolist(), end_positions.tolist()))
+
+    @staticmethod
+    def get_mm_items_offset_by_pair(
+        input_ids: torch.Tensor, mm_start_id: int, mm_end_id: int
+    ) -> List[Tuple[int, int]]:
+        indices_start = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
+        indices_end = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
+
+        return list(zip(indices_start.tolist(), indices_end.tolist()))
+
+    def collect_mm_items_from_processor_output(
+        self, data_dict: dict
+    ) -> List[MultimodalDataItem]:
+        """Create mm_items directly from processor output."""
+        items: dict[Modality, MultimodalDataItem] = {}
+        for attr_name, value in data_dict.items():
+            if attr_name == "input_ids":
+                continue
+
+            # Get modality for this attribute
+            modality = self.ATTR_NAME_TO_MODALITY.get(attr_name)
+
+            if attr_name == "precomputed_embeddings":
+                modality_str = data_dict.get("modality")
+                modality = Modality.IMAGE
+                if modality_str:
+                    try:
+                        modality = Modality.from_str(modality_str)
+                    except ValueError:
+                        pass
+
+            if modality:
+                # Create item if needed
+                if modality not in items:
+                    items[modality] = MultimodalDataItem(modality=modality)
+
+                if attr_name in self.FEATURE_NAMES:
+                    attr_name = "feature"
+
+                items[modality].set(attr_name, value)
+
+        return list(items.values())
+
+    def _process_and_collect_mm_items(
+        self, input_text: str, images=None, audios=None, videos=None, **kwargs
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
+        """
+        Helper method to process multimodal data and create mm_items in one step.
+
+        Returns:
+            Tuple of (created mm_items, input_ids)
+        """
+        ret = self.process_mm_data(
+            input_text=input_text, images=images, audios=audios, videos=videos, **kwargs
+        )
+
+        input_ids = ret["input_ids"].flatten()
+        collected_items = self.collect_mm_items_from_processor_output(ret)
+
+        return collected_items, input_ids, ret
+
+    def process_and_combine_mm_data(
+        self,
+        base_output: BaseMultiModalProcessorOutput,
+        mm_tokens: MultimodalSpecialTokens,
+        **kwargs,
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
+        """
+        Process multimodal data and return the combined multimodal items and input_ids.
+        Supports mixed modalities (images and audio in the same request).
+
+        Returns:
+            Tuple of (list of mm_items, input_ids)
+        """
+        # Collect all items and categorize them
+        all_items = base_output.organize_results()
+        # Handle text-only case
+        if not all_items:
+            input_ids = self._processor.tokenizer(
+                base_output.input_text,
+                return_tensors="pt",
+                add_special_tokens=True,
+            ).input_ids.flatten()
+            return [], input_ids, {}
+
+        dict_items, raw_images, raw_audios, raw_videos = [], [], [], []
+        for modality, item in all_items:
+            if isinstance(item, dict):
+                dict_items.append(item)
+            elif modality == Modality.IMAGE:
+                raw_images.append(item)
+            elif modality == Modality.AUDIO:
+                raw_audios.append(item)
+            elif modality == Modality.VIDEO:
+                raw_videos.append(item)
+            else:
+                raise ValueError(f"Unknown multimodal item type: {type(item)}")
+        # Process items and get input_ids
+        all_collected_items: list[MultimodalDataItem] = []
+        input_ids = None
+
+        # Handle raw items (need processing)
+        if raw_images or raw_audios or raw_videos:
+            collected_items, input_ids, ret = self._process_and_collect_mm_items(
+                input_text=base_output.input_text,
+                images=raw_images,
+                audios=raw_audios,
+                videos=raw_videos,
+                **kwargs,
+            )
+            all_collected_items = collected_items
+        else:
+            ret = None
+
+        # Handle dict items (already processed)
+        for dict_item in dict_items:
+            all_collected_items.extend(
+                self.collect_mm_items_from_processor_output(dict_item)
+            )
+
+        # Fallback tokenization if no raw items were processed
+        if input_ids is None:
+            input_ids = self._processor.tokenizer(
+                base_output.input_text,
+                return_tensors="pt",
+                add_special_tokens=True,
+            ).input_ids.flatten()
+
+        # Add offsets to all items
+        for mm_item in all_collected_items:
+            mm_token_id = mm_tokens.get_token_id_by_modality(mm_item.modality)
+            if mm_token_id is None:
+                raise ValueError(f"No token id found for modality: {mm_item.modality}")
+            mm_item.offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=mm_token_id,
+            )
+
+        return all_collected_items, input_ids, ret
diff --git a/python/sglang/srt/multimodal/processors/clip.py b/python/sglang/srt/multimodal/processors/clip.py
new file mode 100644
index 000000000..19ff71e78
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/clip.py
@@ -0,0 +1,35 @@
+from typing import List, Union
+
+from sglang.srt.models.clip import CLIPModel
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class ClipImageProcessor(BaseMultimodalProcessor):
+    models = [CLIPModel]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(image_token="<image>").build(
+            _processor
+        )
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.mm_tokens,
+            image_data=image_data,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+        }
diff --git a/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
new file mode 100644
index 000000000..b09402d0b
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from typing import List, Union
+
+import torch
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
+    models = [DeepseekVL2ForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>", image_token_id=self._processor.image_token_id
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+        *args,
+        **kwargs
+    ):
+        base_output = self.load_mm_data(
+            input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output,
+            self.mm_tokens,
+            max_req_input_len=max_req_input_len,
+            conversations=base_output.input_text,
+        )
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self._processor.image_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/gemma3.py b/python/sglang/srt/multimodal/processors/gemma3.py
new file mode 100644
index 000000000..cbfb45e84
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/gemma3.py
@@ -0,0 +1,54 @@
+import re
+from typing import Dict, List, Union
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
+# will be removed in the future
+
+
+class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
+    models = [Gemma3ForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.IM_START_TOKEN_ID = hf_config.boi_token_index
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.mm_tokens = MultimodalSpecialTokens(
+            # The single, pre-expanded image token.
+            image_token="<start_of_image>",
+            image_token_id=hf_config.image_token_index,
+            # The regex that matches expanded image tokens.
+            image_token_regex=re.compile(
+                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+            ),
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes, Dict]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+            discard_alpha_channel=True,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+        }
diff --git a/python/sglang/srt/multimodal/processors/gemma3n.py b/python/sglang/srt/multimodal/processors/gemma3n.py
new file mode 100644
index 000000000..9ea8b8be3
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/gemma3n.py
@@ -0,0 +1,71 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Dict, List, Optional, Union
+
+from sglang.srt.managers.multimodal_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.models.gemma3n_mm import Gemma3nForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+
+class Gemma3nSGLangProcessor(SGLangBaseProcessor):
+    """Multimodal processor for Gemma3n supporting image and audio inputs."""
+
+    models = [Gemma3nForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        self.IM_START_TOKEN_ID = hf_config.boi_token_id
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_id
+
+        self.AUDIO_START_TOKEN_ID = hf_config.boa_token_id
+        self.AUDIO_END_TOKEN_ID = hf_config.eoa_token_id
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image_soft_token>",
+            image_token_id=hf_config.image_token_id,
+            audio_token="<audio_soft_token>",
+            audio_token_id=hf_config.audio_token_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        audio_data: Optional[List[Union[str, bytes, Dict]]] = None,
+        input_text: str = "",
+        request_obj=None,
+        *args,
+        **kwargs,
+    ):
+        """Process multimodal data including images and audio."""
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            audio_data=audio_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            # TODO(mick): could we return MultimodalSpecialTokens directly?
+            "im_token_id": self.mm_tokens.image_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/glm4v.py b/python/sglang/srt/multimodal/processors/glm4v.py
new file mode 100644
index 000000000..e3c8edc92
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/glm4v.py
@@ -0,0 +1,132 @@
+import re
+from typing import List, Union
+
+from decord import VideoReader
+
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+from sglang.srt.models.glm4v import Glm4vForConditionalGeneration
+from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultiModalProcessorOutput,
+    MultimodalSpecialTokens,
+)
+
+
+class Glm4vImageProcessor(SGLangBaseProcessor):
+    models = [Glm4vForConditionalGeneration, Glm4vMoeForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        # GLM-4.1V and GLM-4.5V specific tokens
+        self.IMAGE_TOKEN = "<|image|>"
+        self.VIDEO_TOKEN = "<|video|>"
+        self.IMAGE_START_TOKEN = "<|begin_of_image|>"
+        self.IMAGE_END_TOKEN = "<|end_of_image|>"
+        self.VIDEO_START_TOKEN = "<|begin_of_video|>"
+        self.VIDEO_END_TOKEN = "<|end_of_video|>"
+
+        # Token IDs
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
+        self.IMAGE_START_TOKEN_ID = hf_config.image_start_token_id
+        self.IMAGE_END_TOKEN_ID = hf_config.image_end_token_id
+        self.VIDEO_START_TOKEN_ID = hf_config.video_start_token_id
+        self.VIDEO_END_TOKEN_ID = hf_config.video_end_token_id
+
+        # Vision config
+        self.IMAGE_FACTOR = 28
+        self.MIN_PIXELS = 112 * 112
+        self.MAX_PIXELS = 30000 * 28 * 28 * 2
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+            video_token=self.VIDEO_TOKEN,
+            # Note: For GLM4v videos, it uses the video token before tokenization but uses image token after tokenization
+            video_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
+
+    # adapted from https://github.com/huggingface/transformers/blob/369c99d0cea403b77bd0aef818527106453fd9fc/src/transformers/video_utils.py#L312
+    async def preprocess_video(self, vr: VideoReader):
+        """
+        Preprocess video using VideoReader from Decord backend.
+
+        Args:
+            vr (VideoReader): VideoReader object from decord
+
+        Returns:
+            tuple: A tuple containing processed frames and metadata
+        """
+        video_fps = vr.get_avg_fps()
+        total_num_frames = len(vr)
+        duration = total_num_frames / video_fps if video_fps else 0
+
+        # Extract all frames
+        indices = list(range(total_num_frames))
+        frames = vr.get_batch(indices).asnumpy()
+
+        # Return metadata as dict so transformers can properly create VideoMetadata objects
+        metadata = {
+            "total_num_frames": int(total_num_frames),
+            "fps": float(video_fps),
+            "duration": float(duration),
+            "video_backend": "decord",
+            "frames_indices": indices,
+        }
+
+        return frames, metadata
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        video_metadata = None
+
+        if base_output.videos:
+            videos_processed = [
+                await self.preprocess_video(video) for video in base_output.videos
+            ]
+            base_output.videos, video_metadata = map(list, zip(*videos_processed))
+            # transformer requires the video inputs to be under this format
+            base_output.videos = [base_output.videos]
+            video_metadata = [video_metadata]
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens, video_metadata=video_metadata
+        )
+
+        input_ids = input_ids.flatten()
+        mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index_glm4v(
+            input_ids=input_ids.unsqueeze(0),
+            hf_config=self.hf_config,
+            image_grid_thw=getattr(ret, "image_grid_thw", None),
+            video_grid_thw=getattr(ret, "video_grid_thw", None),
+            attention_mask=getattr(ret, "attention_mask", None),
+        )
+        mrope_positions = mrope_positions.squeeze(1)
+
+        mm_inputs = {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "video_token_id": self.mm_tokens.video_token_id,
+            "mrope_positions": mrope_positions,
+            "mrope_position_delta": mrope_position_delta,
+        }
+
+        return mm_inputs
diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py
new file mode 100644
index 000000000..9c20664d6
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/internvl.py
@@ -0,0 +1,267 @@
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+
+import numpy as np
+import torch
+import torchvision.transforms as T
+from decord import VideoReader, cpu, gpu
+from PIL import Image
+from torchvision.transforms import InterpolationMode
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.interns1 import InternS1ForConditionalGeneration
+from sglang.srt.models.internvl import InternVLChatModel
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class InternVLImageProcessor(BaseMultimodalProcessor):
+    models = [InternVLChatModel, InternS1ForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
+        image_size = (
+            getattr(hf_config, "force_image_size", None)
+            or hf_config.vision_config.image_size
+        )
+        patch_size = hf_config.vision_config.patch_size
+        if isinstance(image_size, list):
+            image_size = image_size[0]
+        if isinstance(patch_size, list):
+            patch_size = patch_size[0]
+
+        self.IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+        self.IMG_START_TOKEN = "<img>"
+        self.IMG_END_TOKEN = "</img>"
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (hf_config.downsample_ratio**2)
+        )
+        if hasattr(self._processor, "tokenizer"):
+            tokenizer = self._processor.tokenizer
+        else:
+            tokenizer = self._processor
+        self.tokenizer = tokenizer
+
+        self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
+        self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<IMG_CONTEXT>",
+            image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
+        ).build(_image_processor)
+
+    @staticmethod
+    def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+        if bound:
+            start, end = bound[0], bound[1]
+        else:
+            start, end = -100000, 100000
+        start_idx = max(first_idx, round(start * fps))
+        end_idx = min(round(end * fps), max_frame)
+        seg_size = float(end_idx - start_idx) / num_segments
+        frame_indices = np.array(
+            [
+                int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+                for idx in range(num_segments)
+            ]
+        )
+        return frame_indices
+
+    @staticmethod
+    def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
+        try:
+            vr = VideoReader(video_path, ctx=gpu(0), num_threads=1)
+            use_gpu = True
+        except (RuntimeError, OSError) as e:
+            print(
+                f"[WARNING] Load video on gpu decoding failed: {e}. Falling back to CPU."
+            )
+            vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+            use_gpu = False
+
+        max_frame = len(vr) - 1
+        fps = float(vr.get_avg_fps())
+
+        pixel_values_list = []
+        num_patches_list = []
+        frame_indices = InternVLImageProcessor.get_index(
+            bound, fps, max_frame, first_idx=0, num_segments=num_segments
+        )
+
+        for frame_index in frame_indices:
+            # Load frame
+            frame = vr[frame_index]
+            if use_gpu:
+                img = frame.cuda().permute(2, 0, 1).float() / 255.0
+            else:
+                img_np = frame.asnumpy()
+                img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
+
+            # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
+            mean = img.mean(dim=[1, 2], keepdim=True)
+            # Prevent division by zero; clamp to minimum value of 1e-6
+            std = img.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
+            img = (img - mean) / std
+
+            tiles = InternVLImageProcessor.dynamic_preprocess(
+                img, image_size=input_size, max_num=max_num, use_thumbnail=True
+            )
+
+            pixel_values_list.append(tiles)
+            num_patches_list.append(tiles.shape[0])
+
+        pixel_values = torch.cat(pixel_values_list, dim=0)
+        return pixel_values, num_patches_list
+
+    @staticmethod
+    def dynamic_preprocess(tensor, image_size=448, max_num=12, use_thumbnail=False):
+        C, H, W = tensor.shape
+        aspect_ratio = W / H
+
+        # Generate all possible aspect ratios
+        target_ratios = set(
+            (i, j)
+            for n in range(1, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        # Find closest ratio
+        best_ratio_diff = float("inf")
+        best_ratio = (1, 1)
+
+        for x, y in target_ratios:
+            target_ar = x / y
+            diff = abs(aspect_ratio - target_ar)
+            blocks = x * y
+            best_blocks = best_ratio[0] * best_ratio[1]
+
+            if diff < best_ratio_diff:
+                best_ratio_diff = diff
+                best_ratio = (x, y)
+            elif diff == best_ratio_diff and blocks > best_blocks:
+                best_ratio = (x, y)
+
+        target_w, target_h = image_size * best_ratio[0], image_size * best_ratio[1]
+        blocks = best_ratio[0] * best_ratio[1]
+
+        # Resize on GPU
+        resized = torch.nn.functional.interpolate(
+            tensor.unsqueeze(0),
+            size=(target_h, target_w),
+            mode="bicubic",
+            align_corners=False,
+        ).squeeze(0)
+
+        # Split into tiles
+        tiles = []
+        for i in range(blocks):
+            x = (i % best_ratio[0]) * image_size
+            y = (i // best_ratio[0]) * image_size
+            tile = resized[:, y : y + image_size, x : x + image_size]
+            tiles.append(tile)
+
+        # Add thumbnail if needed
+        if use_thumbnail and len(tiles) > 1:
+            thumb = torch.nn.functional.interpolate(
+                tensor.unsqueeze(0),
+                size=(image_size, image_size),
+                mode="bicubic",
+                align_corners=False,
+            ).squeeze(0)
+            tiles.append(thumb)
+
+        return torch.stack(tiles).to(torch.bfloat16)
+
+    async def process_mm_data_async(
+        self, image_data, input_text, request_obj, **kwargs
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+            discard_alpha_channel=True,
+        )
+
+        num_patches_list = []
+        pixel_values = []
+
+        # Process each input with allocated frames
+        for image_index, image in enumerate(base_output.images):
+            try:
+                # TODO: video input
+                # Convert PIL to GPU tensor
+                if isinstance(image, Image.Image):
+                    img_np = np.array(image.convert("RGB"))
+                    tensor = (
+                        torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
+                    )
+                else:
+                    tensor = image.cuda()  # assume already tensor
+
+                # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
+                mean = tensor.mean(dim=[1, 2], keepdim=True)
+                # Prevent division by zero; clamp to minimum value of 1e-6
+                std = tensor.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
+                tensor = (tensor - mean) / std
+                tiles = self.dynamic_preprocess(
+                    tensor, image_size=448, max_num=12, use_thumbnail=True
+                )
+
+                pixel_values.append(tiles)
+                num_patches_list.append(tiles.shape[0])
+
+            except Exception as e:
+                print(f"[Error] Failed to process image {image_index}: {e}")
+                return None
+
+        # Concatenate all
+        pixel_values = torch.cat(pixel_values, dim=0)
+
+        original_placeholder = "<<<__IMG_CONTEXT_PLACEHOLDER__>>>"
+        input_text = input_text.replace(self.IMG_CONTEXT_TOKEN, original_placeholder)
+
+        input_text_updated = input_text
+        for num_patches in num_patches_list:
+            image_tokens = (
+                self.IMG_START_TOKEN
+                + self.IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
+                + self.IMG_END_TOKEN
+            )
+            input_text_updated = input_text_updated.replace(
+                original_placeholder, image_tokens, 1
+            )
+
+        input_text_updated = input_text_updated.replace(
+            original_placeholder, self.IMG_CONTEXT_TOKEN
+        )
+
+        # Tokenize
+        input_ids_tensor = self.tokenizer(input_text_updated, return_tensors="pt")[
+            "input_ids"
+        ].flatten()
+        input_ids = input_ids_tensor.tolist()
+
+        # Get image token offsets
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids_tensor.to("cuda"),
+            mm_token_id=self.mm_tokens.image_token_id,
+        )
+
+        items = [
+            MultimodalDataItem(
+                feature=pixel_values,
+                modality=Modality.IMAGE,
+                offsets=image_offsets,
+            )
+        ]
+
+        return {
+            "input_ids": input_ids,
+            "mm_items": items,
+            "im_start_id": self.img_start_token_id,
+            "im_end_id": self.img_end_token_id,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/janus_pro.py b/python/sglang/srt/multimodal/processors/janus_pro.py
new file mode 100644
index 000000000..54d6c1978
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/janus_pro.py
@@ -0,0 +1,45 @@
+from typing import List, Union
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class JanusProImageProcessor(BaseMultimodalProcessor):
+    models = [MultiModalityCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=_processor.image_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        **kwargs,
+    ):
+        base_out = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens, prompt=base_out.input_text
+        )
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_start_id": self._processor.image_start_id,
+            "im_end_id": self._processor.image_end_id,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/kimi_vl.py b/python/sglang/srt/multimodal/processors/kimi_vl.py
new file mode 100644
index 000000000..541ed5c9e
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/kimi_vl.py
@@ -0,0 +1,46 @@
+import re
+from typing import Dict, List, Union
+
+from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+
+# Compatible with KimiVLForConditionalGeneration
+class KimiVLImageProcessor(SGLangBaseProcessor):
+    models = [KimiVLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<|media_pad|>",
+            # TODO: could we convert in MultimodalSpecialTokens?
+            image_token_id=hf_config.media_placeholder_token_id,
+            image_token_regex=re.compile(r"(?:<\|media_pad\|>)+"),
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes, Dict]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/llava.py b/python/sglang/srt/multimodal/processors/llava.py
new file mode 100644
index 000000000..1647ea1e5
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/llava.py
@@ -0,0 +1,211 @@
+import asyncio
+from typing import List, Optional, Union
+
+import numpy as np
+from transformers.models.auto.processing_auto import (
+    PROCESSOR_MAPPING_NAMES as HF_MAPPING_NAMES,
+)
+
+import sglang.srt.managers.multimodal_processor as sgl_mm_processor_utils
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.llava import (
+    LlavaForConditionalGeneration,
+    LlavaLlamaForCausalLM,
+    LlavaMistralForCausalLM,
+    LlavaQwenForCausalLM,
+)
+from sglang.srt.models.llavavid import LlavaVidForCausalLM
+from sglang.srt.models.mistral import Mistral3ForConditionalGeneration
+from sglang.srt.multimodal.mm_utils import expand2square, process_anyres_image
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
+from sglang.srt.utils import ImageData, load_image, logger
+from sglang.utils import get_exception_traceback
+
+
+class LlavaImageProcessor(BaseMultimodalProcessor):
+    models = [
+        LlavaLlamaForCausalLM,
+        LlavaVidForCausalLM,
+        LlavaQwenForCausalLM,
+        LlavaMistralForCausalLM,
+    ]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+    @staticmethod
+    def _process_single_image_task(
+        image_data: Union[str, bytes, ImageData],
+        image_aspect_ratio: Optional[str] = None,
+        image_grid_pinpoints: Optional[str] = None,
+        processor=None,
+    ):
+
+        image_processor = processor.image_processor
+
+        try:
+            url = image_data.url if isinstance(image_data, ImageData) else image_data
+            image, image_size = load_image(url)
+            if image_size is not None:
+                # It is a video with multiple images
+                image_hash = hash(url)
+                pixel_values = image_processor(image)["pixel_values"]
+                for _ in range(len(pixel_values)):
+                    pixel_values[_] = pixel_values[_].astype(np.float16)
+                pixel_values = np.stack(pixel_values, axis=0)
+                return pixel_values, image_hash, image_size
+            else:
+                # It is an image
+                image_hash = hash(url)
+                if image_aspect_ratio == "pad":
+                    image = expand2square(
+                        image,
+                        tuple(int(x * 255) for x in image_processor.image_mean),
+                    )
+                    pixel_values = image_processor(image.convert("RGB"))[
+                        "pixel_values"
+                    ][0]
+                elif image_aspect_ratio == "anyres" or (
+                    image_aspect_ratio is not None
+                    and "anyres_max" in image_aspect_ratio
+                ):
+                    pixel_values = process_anyres_image(
+                        image, image_processor, image_grid_pinpoints
+                    )
+                else:
+                    pixel_values = image_processor(image)["pixel_values"][0]
+
+                if isinstance(pixel_values, np.ndarray):
+                    pixel_values = pixel_values.astype(np.float16)
+
+                return pixel_values, image_hash, image.size
+        except Exception:
+            logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
+
+    async def _process_single_image(
+        self,
+        image_data: Union[bytes, str, ImageData],
+        aspect_ratio: str,
+        grid_pinpoints: str,
+    ):
+        if self.cpu_executor is not None:
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(
+                self.cpu_executor,
+                LlavaImageProcessor._process_single_image_task,
+                image_data,
+                aspect_ratio,
+                grid_pinpoints,
+                self._processor,
+            )
+        else:
+            return self._process_single_image_task(
+                image_data,
+                aspect_ratio,
+                grid_pinpoints,
+                self._processor.image_processor,
+            )
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes, ImageData]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        modalities = request_obj.modalities or ["image"]
+        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
+        grid_pinpoints = (
+            self.hf_config.image_grid_pinpoints
+            if hasattr(self.hf_config, "image_grid_pinpoints")
+            and "anyres" in aspect_ratio
+            else None
+        )
+
+        if isinstance(image_data, list) and len(image_data) > 0:
+            if "multi-images" in modalities or "video" in modalities:
+                # Multiple images
+                aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
+                pixel_values, data_hashes, image_sizes = [], [], []
+                res = []
+                for img_data in image_data:
+                    res.append(
+                        self._process_single_image(
+                            img_data, aspect_ratio, grid_pinpoints
+                        )
+                    )
+
+                res = await asyncio.gather(*res)
+                for pixel_v, image_h, image_s in res:
+                    pixel_values.append(pixel_v)
+                    data_hashes.append(image_h)
+                    image_sizes.append(image_s)
+
+                if isinstance(pixel_values[0], np.ndarray):
+                    pixel_values = np.stack(pixel_values, axis=0)
+            else:
+                # A single image
+                pixel_values, image_hash, image_size = await self._process_single_image(
+                    image_data[0], aspect_ratio, grid_pinpoints
+                )
+                image_sizes = [image_size]
+        else:
+            raise ValueError(f"Invalid image data: {image_data}")
+        modality = Modality.IMAGE
+        if isinstance(request_obj.modalities, list):
+            if request_obj.modalities[0] == "multi-images":
+                modality = Modality.MULTI_IMAGES
+            elif request_obj.modalities[0] == "video":
+                modality = Modality.VIDEO
+
+        return {
+            "mm_items": [
+                MultimodalDataItem(
+                    feature=pixel_values,
+                    model_specific_data={
+                        "image_sizes": image_sizes,
+                    },
+                    modality=modality,
+                )
+            ],
+        }
+
+
+class LlavaMultimodalProcessor(BaseMultimodalProcessor):
+    """
+    This is a wrapper class used to identify the multimodal processor for Llava architectures' vision model.
+    """
+
+    models = [LlavaForConditionalGeneration, Mistral3ForConditionalGeneration]
+
+    def _get_sgl_processor_cls(self, model_type: str):
+        if hf_name := HF_MAPPING_NAMES.get(model_type):
+            sgl_mm_processor_set = sgl_mm_processor_utils.PROCESSOR_MAPPING.values()
+            sgl_processor_cls = list(
+                filter(lambda p: p.__name__ == hf_name, sgl_mm_processor_set)
+            )
+            if sgl_processor_cls:
+                return sgl_processor_cls[0]
+        raise ValueError(
+            f"Cannot find corresponding multimodal processor registered in sglang for model type `{model_type}`"
+        )
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        assert hasattr(hf_config, "vision_config")
+        assert hasattr(hf_config, "text_config")
+        self.vision_config = hf_config.vision_config
+        self.text_config = hf_config.text_config
+        self.hf_config = hf_config
+
+        if vision_type := getattr(self.vision_config, "model_type"):
+            self.inner = self._get_sgl_processor_cls(vision_type)(
+                hf_config, server_args, _processor, *args, **kwargs
+            )
+        else:
+            raise ValueError(
+                f"Required `vision_config.model_type` is not found in hf_config: `{hf_config}`"
+            )
+
+    async def process_mm_data_async(self, *args, **kwargs):
+        return await self.inner.process_mm_data_async(*args, **kwargs)
diff --git a/python/sglang/srt/multimodal/processors/minicpm.py b/python/sglang/srt/multimodal/processors/minicpm.py
new file mode 100644
index 000000000..9ddbf4fb6
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/minicpm.py
@@ -0,0 +1,144 @@
+from typing import List, Union
+
+import torch
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.minicpmo import MiniCPMO
+from sglang.srt.models.minicpmv import MiniCPMV
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+# Compatible with both 'O' and 'V'
+class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
+    models = [MiniCPMV, MiniCPMO]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        self.slice_start_id = getattr(tokenizer, "slice_start_id", None)
+        self.slice_end_id = getattr(tokenizer, "slice_end_id", None)
+        self.audio_start_id = getattr(tokenizer, "audio_start_id", None)
+        self.audio_end_id = getattr(tokenizer, "audio_end_id", None)
+        self.im_start_id = getattr(tokenizer, "im_start_id", None)
+        self.im_end_id = getattr(tokenizer, "im_end_id", None)
+        self.im_token_id = getattr(tokenizer, "unk_id", None)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="(<image>./</image>)",
+            audio_token="(<audio>./</audio>)",
+            video_token="(<video>./</video>)",
+            image_token_id=self.im_token_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        audio_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            audio_data=audio_data,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+        if base_output is None:
+            return None
+
+        res = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+            audios=base_output.audios,
+        )
+
+        pixel_values = res["pixel_values"]
+        tgt_sizes = res["tgt_sizes"]
+
+        if not isinstance(pixel_values, (torch.Tensor, list)):
+            raise ValueError(
+                "Incorrect type of pixel values. " f"Got type: {type(pixel_values)}"
+            )
+
+        if not isinstance(tgt_sizes, (torch.Tensor, list)):
+            raise ValueError(
+                "Incorrect type of target sizes. " f"Got type: {type(tgt_sizes)}"
+            )
+
+        if len(pixel_values) != len(tgt_sizes):
+            raise ValueError(
+                "Inconsistent batch lengths, found: "
+                f"{len(pixel_values)} vs. {len(tgt_sizes)}"
+            )
+
+        pixel_values_flat: List[torch.Tensor] = []
+        tgt_sizes_flat: List[torch.Tensor] = []
+        for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+            # per image
+            if len(pixel_b) != len(tgt_b):
+                raise ValueError(
+                    "Inconsistent N lengths, found: " f"{len(pixel_b)} vs {len(tgt_b)}"
+                )
+            for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                pixel_values_flat += [pixel_n]
+                tgt_sizes_flat += [tgt_n]
+
+        pixel_values = pixel_values_flat
+
+        items = []
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset_by_pair(
+            input_ids=input_ids, mm_start_id=self.im_start_id, mm_end_id=self.im_end_id
+        )
+        slice_offsets = self.get_mm_items_offset_by_pair(
+            input_ids=input_ids,
+            mm_start_id=self.slice_start_id,
+            mm_end_id=self.slice_end_id,
+        )
+        image_offsets.extend(slice_offsets)
+        image_offsets = sorted(image_offsets)
+
+        if len(pixel_values) != 0:
+            item = MultimodalDataItem(
+                feature=pixel_values,
+                offsets=image_offsets,
+                model_specific_data={"tgt_size": tgt_sizes_flat},
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
+        if (
+            "audio_features" in res
+            and res["audio_features"] is not None
+            and len(res["audio_features"]) != 0
+        ):
+            if self.audio_start_id is not None and self.audio_end_id is not None:
+                audio_offsets = self.get_mm_items_offset_by_pair(
+                    input_ids=input_ids,
+                    mm_start_id=self.audio_start_id,
+                    mm_end_id=self.audio_end_id,
+                )
+            else:
+                audio_offsets = None
+            item = MultimodalDataItem(
+                feature=[res["audio_features"]],
+                model_specific_data={"audio_feature_lens": res["audio_feature_lens"]},
+                offsets=audio_offsets,
+                modality=Modality.AUDIO,
+            )
+            items += [item]
+        return {
+            "mm_items": items,
+            "input_ids": input_ids.tolist(),
+            "audio_start_id": self.audio_start_id,
+            "audio_end_id": self.audio_end_id,
+            "im_token_id": self.im_token_id,
+            "im_start_id": self.im_start_id,
+            "im_end_id": self.im_end_id,
+            "slice_start_id": self.slice_start_id,
+            "slice_end_id": self.slice_end_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/mlama.py b/python/sglang/srt/multimodal/processors/mlama.py
new file mode 100644
index 000000000..432215a4f
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/mlama.py
@@ -0,0 +1,37 @@
+from typing import List, Union
+
+from sglang.srt.models.mllama import MllamaForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class MllamaImageProcessor(BaseMultimodalProcessor):
+    models = [MllamaForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self._processor.image_token,
+            image_token_id=self._processor.image_token_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        base_out = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_out, self.mm_tokens
+        )
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/mllama4.py b/python/sglang/srt/multimodal/processors/mllama4.py
new file mode 100644
index 000000000..6a01f2aeb
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/mllama4.py
@@ -0,0 +1,57 @@
+from typing import List, Union
+
+import torch
+from transformers.image_utils import SizeDict
+from transformers.models.llama4.image_processing_llama4_fast import (
+    find_supported_resolutions,
+    get_best_fit,
+)
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class Mllama4ImageProcessor(BaseMultimodalProcessor):
+    models = [Llama4ForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.vision_config = hf_config.vision_config
+        self.text_config = hf_config.text_config
+        self.IM_START_TOKEN_ID = hf_config.boi_token_index
+        self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        # Process the prompt and images
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "im_token_id": self.IM_TOKEN_ID,
+        }
diff --git a/python/sglang/srt/multimodal/processors/phi4mm.py b/python/sglang/srt/multimodal/processors/phi4mm.py
new file mode 100644
index 000000000..1487d2ca2
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/phi4mm.py
@@ -0,0 +1,101 @@
+import logging
+from typing import List, Union
+
+from transformers.processing_utils import ProcessorMixin
+
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# It is an adapter of hf phi4 mm processor to make it work for sglang
+# Ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py#L693
+class Phi4MMProcessorAdapter(ProcessorMixin):
+    def __init__(self, _processor) -> None:
+        self._processor = _processor
+
+    def __call__(self, **kwargs):
+        result = self._processor(**kwargs)
+
+        # Map HuggingFace output keys to sglang standard keys
+        key_mapping = {
+            "input_image_embeds": "pixel_values",
+            "input_audio_embeds": "audio_features",
+            "audio_embed_sizes": "audio_feature_lens",
+        }
+        for hf_key, sglang_key in key_mapping.items():
+            if hf_key in result:
+                result[sglang_key] = result[hf_key]
+                del result[hf_key]
+
+        # Filter out None or empty tensors from the result.
+        # This prevents the sglang function base_processor.collect_mm_items_from_processor_output()
+        # from misclassifying audio content as image content, and vice versa.
+        filtered_result = {
+            k: v
+            for k, v in result.items()
+            if v is not None and (not hasattr(v, "numel") or v.numel() > 0)
+        }
+        return filtered_result
+
+
+class Phi4MMMultimodalProcessor(BaseMultimodalProcessor):
+    models = [Phi4MMForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        self.processor = Phi4MMProcessorAdapter(_processor)
+        super().__init__(hf_config, server_args, self.processor, *args, **kwargs)
+
+        # the following CONSTANTS come from hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file
+        # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+        self.IMAGE_TOKEN = "<|endoftext10|>"
+        self.AUDIO_TOKEN = "<|endoftext11|>"
+        self.IM_TOKEN_ID = 200010
+        self.AUDIO_TOKEN_ID = 200011
+        self.AUDIO_SAMPLE_RATE = 16000
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+            audio_token=self.AUDIO_TOKEN,
+            audio_token_id=self.AUDIO_TOKEN_ID,
+        ).build(self.processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        audio_data,
+        input_text,
+        request_obj,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            audio_data=audio_data,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+            audio_sample_rate=self.AUDIO_SAMPLE_RATE,
+        )
+
+        if base_output.audios is not None:
+            # hugging-face microsoft/Phi-4-multimodal-instruct's processing_phi4mm.py file requires the audio input to be tuple of (audio, sample_rate)
+            # ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/processing_phi4mm.py
+            base_output.audios = [
+                (audio, self.AUDIO_SAMPLE_RATE) for audio in base_output.audios
+            ]
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/pixtral.py b/python/sglang/srt/multimodal/processors/pixtral.py
new file mode 100644
index 000000000..af5cedec9
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/pixtral.py
@@ -0,0 +1,100 @@
+import asyncio
+import math
+from typing import List, Union
+
+from transformers.models.pixtral.image_processing_pixtral import (
+    _num_image_tokens as _get_pixtral_hf_num_image_tokens,
+)
+
+from sglang.srt.models.pixtral import PixtralVisionModel
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class PixtralProcessor(BaseMultimodalProcessor):
+    models = [PixtralVisionModel]
+
+    PAD_TOKEN = "<pad>"
+    IMG_BREAK_TOKEN_ID = 12
+    IMG_END_TOKEN_ID = 13
+
+    def get_patch_grid_size(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> tuple[int, int]:
+        max_width = max_height = self.image_size
+        patch_width = patch_height = self.patch_size
+
+        ratio = max(image_width / max_width, image_height / max_height)
+
+        if ratio > 1:
+            image_width = int(math.floor(image_width / ratio))
+            image_height = int(math.floor(image_height / ratio))
+
+        nrows, ncols = _get_pixtral_hf_num_image_tokens(
+            (image_height, image_width),
+            (patch_height, patch_width),
+        )
+
+        return ncols, nrows
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.IM_TOKEN_ID = getattr(
+            hf_config, "image_token_index", PixtralVisionModel.DEFAULT_IMAGE_TOKEN_ID
+        )
+        # Instantiate the patcher logic helper using the class defined above
+
+        self.vision_config = hf_config.vision_config
+        self.image_size = self.vision_config.image_size
+        self.patch_size = self.vision_config.patch_size
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
+        _processor.tokenizer.add_special_tokens(
+            {
+                "pad_token": getattr(hf_config, "pad_token", self.PAD_TOKEN),
+            }
+        )
+
+    async def _resize(self, image):
+        num_w_tokens, num_h_tokens = self.get_patch_grid_size(
+            image_width=image.size[0],
+            image_height=image.size[1],
+        )
+        new_size = (num_w_tokens * self.patch_size, num_h_tokens * self.patch_size)
+        return image.resize(new_size)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        mm_data = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.mm_tokens,
+            image_data=image_data,
+            return_text=True,
+        )
+        if mm_data.images:
+            resize_tasks = [self._resize(image) for image in mm_data.images]
+            mm_data.images = await asyncio.gather(*resize_tasks)
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            mm_data, self.mm_tokens
+        )
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.IM_TOKEN_ID,
+            "im_token": self._processor.image_token,
+        }
diff --git a/python/sglang/srt/multimodal/processors/qwen_audio.py b/python/sglang/srt/multimodal/processors/qwen_audio.py
new file mode 100644
index 000000000..f9275feea
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/qwen_audio.py
@@ -0,0 +1,67 @@
+import re
+
+from sglang.srt.managers.schedule_batch import Modality
+from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
+    models = [Qwen2AudioForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
+        self.AUDIO_TOKEN_REGEX = re.compile(
+            r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
+        )
+        # Collect special token ids
+        tokenizer = self._processor.tokenizer
+        self.audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
+        self.audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
+        self.audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            audio_token=self.AUDIO_TOKEN,
+            audio_token_regex=self.AUDIO_TOKEN_REGEX,
+            audio_token_id=self.audio_token_id,
+        ).build(_processor)
+
+        self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
+
+    async def process_mm_data_async(
+        self,
+        audio_data,
+        input_text,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            audio_data=audio_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+        if base_output is None:
+            return None
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        assert (
+            "feature_attention_mask" in ret
+        ), "feature_attention_mask not found in processor output"
+        input_lengths = ret["feature_attention_mask"].sum(dim=-1)
+        input_lengths = (input_lengths - 1) // 2 + 1
+        output_lengths = (input_lengths - 2) // 2 + 1
+
+        mm_items[0].audio_feature_lens = output_lengths
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "audio_start_id": self.audio_start_id,
+            "audio_token_id": self.audio_token_id,
+            "audio_end_id": self.audio_end_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/qwen_vl.py b/python/sglang/srt/multimodal/processors/qwen_vl.py
new file mode 100644
index 000000000..f67f72b95
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/qwen_vl.py
@@ -0,0 +1,281 @@
+import asyncio
+import math
+import os
+import re
+from typing import List, Union
+
+import torch
+import torchvision
+from PIL import Image
+from torchvision.transforms import InterpolationMode
+
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+from sglang.utils import logger
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+VIDEO_TOTAL_PIXELS = int(
+    float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
+)
+
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+FRAME_FACTOR = 2
+FPS = 2.0
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 768
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def resize_image(image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    width, height = image.size
+    min_pixels = MIN_PIXELS
+    max_pixels = MAX_PIXELS
+    resized_height, resized_width = smart_resize(
+        height,
+        width,
+        factor=size_factor,
+        min_pixels=min_pixels,
+        max_pixels=max_pixels,
+    )
+    image = image.resize((resized_width, resized_height))
+    return image
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+async def resize_image_async(image):
+    return resize_image(image)
+
+
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """calculate the number of frames for video used for model inputs.
+
+    Args:
+        ele (dict): a dict contains the configuration of video.
+            support either `fps` or `nframes`:
+                - nframes: the number of frames to extract for model inputs.
+                - fps: the fps to extract frames for model inputs.
+                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
+        total_frames (int): the original total number of frames of the video.
+        video_fps (int | float): the original fps of the video.
+
+    Raises:
+        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
+
+    Returns:
+        int: the number of frames for video used for model inputs.
+    """
+    assert not (
+        "fps" in ele and "nframes" in ele
+    ), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+    else:
+        fps = ele.get("fps", FPS)
+        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+        max_frames = floor_by_factor(
+            ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR
+        )
+        nframes = total_frames / video_fps * fps
+        if nframes > total_frames:
+            logger.warning(
+                f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]"
+            )
+        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
+        nframes = floor_by_factor(nframes, FRAME_FACTOR)
+    if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+        raise ValueError(
+            f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}."
+        )
+    return nframes
+
+
+# process video, qwen-specific
+async def preprocess_video(
+    vr,
+    image_factor: int = IMAGE_FACTOR,
+    # vr: VideoReader, image_factor: int = IMAGE_FACTOR
+) -> torch.Tensor:
+    ele = {}
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    nframes, _, height, width = video.shape
+    min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+    total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+    max_pixels = max(
+        min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR),
+        int(min_pixels * 1.05),
+    )
+    max_pixels_supposed = ele.get("max_pixels", max_pixels)
+    if max_pixels_supposed > max_pixels:
+        logger.warning(
+            f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}]."
+        )
+    max_pixels = min(max_pixels_supposed, max_pixels)
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=image_factor,
+        )
+    else:
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=image_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    video = torchvision.transforms.functional.resize(
+        video,
+        [resized_height, resized_width],
+        interpolation=InterpolationMode.BICUBIC,
+        antialias=True,
+    ).float()
+    return video
+
+
+# Compatible with Qwen2VL and Qwen2_5VL
+class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
+    models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        # The regex that matches expanded image tokens.
+        self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
+        self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
+        self.vision_start_token_id = hf_config.vision_start_token_id
+        self.vision_end_token_id = hf_config.vision_end_token_id
+        self.NUM_TOKEN_PER_FRAME = 770
+        self.IMAGE_FACTOR = 28
+        self.MIN_PIXELS = 4 * 28 * 28
+        self.MAX_PIXELS = 16384 * 28 * 28
+        self.MAX_RATIO = 200
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<|vision_start|><|image_pad|><|vision_end|>",
+            image_token_id=hf_config.image_token_id,
+            image_token_regex=re.compile(
+                r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+            ),
+            video_token_id=hf_config.video_token_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        # Qwen-specific: resize images if they are raw Image objects
+        if base_output.images and isinstance(base_output.images[0], Image.Image):
+            resize_tasks = [resize_image_async(image) for image in base_output.images]
+            base_output.images = await asyncio.gather(*resize_tasks)
+
+        if base_output.videos:
+            base_output.videos = [
+                await preprocess_video(video) for video in base_output.videos
+            ]
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        input_ids = input_ids.flatten()
+        mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
+            spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
+            image_token_id=self.mm_tokens.image_token_id,
+            video_token_id=self.mm_tokens.video_token_id,
+            vision_start_token_id=self.vision_start_token_id,
+            model_type=self.hf_config.model_type,
+            tokens_per_second=getattr(
+                self.hf_config.vision_config, "tokens_per_second", None
+            ),
+            input_ids=input_ids.unsqueeze(0),
+            image_grid_thw=getattr(ret, "image_grid_thw", None),
+            video_grid_thw=getattr(ret, "video_grid_thw", None),
+            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
+        )
+        mrope_positions = mrope_positions.squeeze(1)
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "video_token_id": self.mm_tokens.video_token_id,
+            "mrope_positions": mrope_positions,
+            "mrope_position_delta": mrope_position_delta,
+        }
diff --git a/python/sglang/srt/multimodal/processors/step3_vl.py b/python/sglang/srt/multimodal/processors/step3_vl.py
new file mode 100644
index 000000000..ee537e68e
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/step3_vl.py
@@ -0,0 +1,517 @@
+import math
+import re
+from itertools import product
+from typing import List, Literal, Optional, TypedDict, Union
+
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+from transformers import BatchFeature, ProcessorMixin, TensorType
+
+from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+
+ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
+
+
+class GPUToTensor(torch.nn.Module):
+
+    def forward(self, raw_image: Union[np.ndarray, Image.Image]) -> torch.Tensor:
+        if isinstance(raw_image, Image.Image):
+            return transforms.ToTensor()(raw_image)
+        if raw_image.ndim == 2:
+            raw_image = raw_image[:, :, None].repeat(3, -1)
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+        else:
+            device = torch.device("cpu")
+        image_tensor = torch.from_numpy(raw_image).to(device)
+        image_tensor = torch.permute(image_tensor, (2, 0, 1)).contiguous()
+        if image_tensor.dtype == torch.uint8:
+            image_tensor = image_tensor.to(torch.float32).div(255)
+        return image_tensor
+
+
+class Step3VisionProcessor:
+    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+        patch_size = patch_size if patch_size is not None else size
+
+        self.transform = transforms.Compose(
+            [
+                GPUToTensor(),
+                transforms.Normalize(mean, std),
+                transforms.Resize(
+                    (size, size),
+                    interpolation=(
+                        InterpolationMode.BICUBIC
+                        if interpolation_mode == "bicubic"
+                        else InterpolationMode.BILINEAR
+                    ),
+                    antialias=True,
+                ),
+            ]
+        )
+
+        self.patch_transform = (
+            transforms.Compose(
+                [
+                    GPUToTensor(),
+                    transforms.Normalize(mean, std),
+                    transforms.Resize(
+                        (patch_size, patch_size),
+                        interpolation=(
+                            InterpolationMode.BICUBIC
+                            if interpolation_mode == "bicubic"
+                            else InterpolationMode.BILINEAR
+                        ),
+                        antialias=True,
+                    ),
+                ]
+            )
+            if patch_size is not None
+            else None
+        )
+
+    def __call__(self, image, is_patch=False):
+        if is_patch:
+            return {"pixel_values": self.patch_transform(image).unsqueeze(0)}
+        else:
+            return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+
+class ImagePatcher:
+
+    def determine_window_size(self, long: int, short: int) -> int:
+        if long <= 728:
+            return short if long / short > 1.5 else 0
+        return min(short, 504) if long / short > 4 else 504
+
+    def slide_window(
+        self,
+        width: int,
+        height: int,
+        sizes: list[tuple[int, int]],
+        steps: list[tuple[int, int]],
+        img_rate_thr: float = 0.6,
+    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
+        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
+        windows = []
+        # Sliding windows.
+        for size, step in zip(sizes, steps):
+            size_w, size_h = size
+            step_w, step_h = step
+
+            x_num = 1 if width <= size_w else math.ceil((width - size_w) / step_w + 1)
+            x_start = [step_w * i for i in range(x_num)]
+            if len(x_start) > 1 and x_start[-1] + size_w > width:
+                x_start[-1] = width - size_w
+
+            y_num = 1 if height <= size_h else math.ceil((height - size_h) / step_h + 1)
+            y_start = [step_h * i for i in range(y_num)]
+            if len(y_start) > 1 and y_start[-1] + size_h > height:
+                y_start[-1] = height - size_h
+
+            start = np.array(list(product(y_start, x_start)), dtype=int)
+            start[:, [0, 1]] = start[:, [1, 0]]
+            windows.append(np.concatenate([start, start + size], axis=1))
+        windows = np.concatenate(windows, axis=0)
+
+        return [
+            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
+            for box in windows
+        ], (x_num, y_num)
+
+    def square_pad(self, img: Image.Image) -> Image.Image:
+        w, h = img.size
+        if w == h:
+            return img
+        size = max(w, h)
+        padded = Image.new(img.mode, (size, size), 0)
+        padded.paste(img, (0, 0))
+        return padded
+
+    def get_image_size_for_padding(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+        ratio = img_width / img_height
+        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
+            new_size = max(img_height, img_width)
+            return new_size, new_size
+        return img_width, img_height
+
+    def get_image_size_for_preprocess(
+        self, img_width: int, img_height: int
+    ) -> tuple[int, int]:
+
+        if max(img_height, img_width) > 3024:
+            scale_factor = 3024 / max(img_height, img_width)
+            img_width = int(img_width * scale_factor)
+            img_height = int(img_height * scale_factor)
+            return img_width, img_height
+        else:
+            return img_width, img_height
+
+    def get_image_size_for_crop(
+        self, img_width: int, img_height: int, window_size: int
+    ):
+        w_ratio = img_width / window_size
+        h_ratio = img_height / window_size
+
+        if w_ratio < 1:
+            width_new = img_width
+        else:
+            decimal_w = w_ratio - img_width // window_size
+            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
+            width_new = window_size * w_ratio
+        if h_ratio < 1:
+            height_new = img_height
+        else:
+            decimal_h = h_ratio - img_height // window_size
+            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
+            height_new = window_size * h_ratio
+        return int(width_new), int(height_new)
+
+    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
+        target = img.crop((j, i, j + tw, i + th))
+        return target
+
+    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
+        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
+        img_width, img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        window_size = self.determine_window_size(
+            max(img_height, img_width), min(img_height, img_width)
+        )
+        if window_size == 0:
+            return 0, 0
+        else:
+            img_width, img_height = self.get_image_size_for_crop(
+                img_width, img_height, window_size
+            )
+            center_list, (x_num, y_num) = self.slide_window(
+                img_width,
+                img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            full_rows = (len(center_list) - 1) // x_num + 1
+            if len(center_list) > 0 and len(center_list) % x_num == 0:
+                full_rows -= 1
+            return len(center_list), full_rows
+
+    def __call__(
+        self, img: Image.Image
+    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
+        img_width, img_height = img.size
+        new_img_width, new_img_height = self.get_image_size_for_padding(
+            img_width, img_height
+        )
+        if new_img_width != img_width or new_img_height != img_height:
+            img = self.square_pad(img)
+            img_width, img_height = img.size
+
+        new_img_width, new_img_height = self.get_image_size_for_preprocess(
+            img_width, img_height
+        )
+        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
+        window_size = self.determine_window_size(
+            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
+        )
+        if window_size == 0:
+            return img, [], None
+        else:
+            new_img_width, new_img_height = self.get_image_size_for_crop(
+                new_img_width, new_img_height, window_size
+            )
+            if (new_img_width, new_img_height) != (img_width, img_height):
+                img_for_crop = img.resize(
+                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
+                )
+            else:
+                img_for_crop = img
+
+            patches = []
+            newlines = []
+            center_list, (x_num, y_num) = self.slide_window(
+                new_img_width,
+                new_img_height,
+                [(window_size, window_size)],
+                [(window_size, window_size)],
+            )
+            for patch_id, center_lf_point in enumerate(center_list):
+                x, y, patch_w, patch_h = center_lf_point
+                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
+                patches.append(big_patch)
+                if (patch_id + 1) % x_num == 0:
+                    newlines.append(patch_id)
+
+            if newlines and newlines[-1] == len(patches) - 1:
+                newlines.pop()
+
+            return (
+                img,
+                patches,
+                (
+                    [i in newlines for i in range(len(patches))]
+                    if len(patches) > 0
+                    else None
+                ),
+            )
+
+
+class Step3VLProcessor:
+    def __init__(
+        self,
+        config,
+        tokenizer,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        if isinstance(tokenizer, ProcessorMixin):
+            tokenizer = tokenizer.tokenizer
+        self.tokenizer = tokenizer
+
+        self.image_size = 728
+        self.patch_size = 504
+        self.image_preprocessor = Step3VisionProcessor(
+            self.image_size, "bilinear", self.patch_size
+        )
+
+        self.num_image_feature_size = 169
+        self.num_patch_feature_size = 81
+        self.image_token = "<im_patch>"
+        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
+        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
+
+        self.patcher = ImagePatcher()
+
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[self.image_token]
+
+    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
+        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
+
+        return (
+            num_patches * (self.num_patch_feature_size + 2)
+            + self.num_image_feature_size
+            + 2
+            + num_newlines
+        )
+
+    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
+        result = []
+        for img in images:
+            result.append(self.patcher(img))
+        return result
+
+    def _convert_images_to_pixel_values(
+        self,
+        images: list[Image.Image],
+        is_patch: bool = False,
+    ) -> list[torch.Tensor]:
+        return [
+            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
+            for img in images
+        ]
+
+    def _get_patch_repl(
+        self,
+        num_patches: int,
+        patch_newline_mask: list[bool] | None,
+    ) -> tuple[str, list[int]]:
+        text = ""
+        token_ids = []
+        for i in range(num_patches):
+            assert len(patch_newline_mask) == num_patches
+            text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
+            token_ids.extend(
+                [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
+                + [self.image_token_id] * self.num_patch_feature_size
+                + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
+            )
+            if patch_newline_mask and patch_newline_mask[i]:
+                text += "<patch_newline>"
+                token_ids.append(
+                    self.tokenizer.convert_tokens_to_ids("<patch_newline>")
+                )
+        return text, token_ids
+
+    def _get_image_repl(
+        self,
+        num_images: int,
+    ) -> tuple[str, list[int]]:
+        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
+        token_ids = (
+            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
+            + [self.image_token_id] * self.num_image_feature_size
+            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
+        )
+        return text * num_images, token_ids * num_images
+
+    def _get_image_repl_features(
+        self,
+        num_images: int,
+        num_patches: int,
+        patch_new_line_idx: Optional[list[bool]],
+    ) -> tuple[str, list[int]]:
+        if num_patches > 0:
+            patch_repl, patch_repl_ids = self._get_patch_repl(
+                num_patches, patch_new_line_idx
+            )
+        else:
+            patch_repl = ""
+            patch_repl_ids = []
+        image_repl, image_repl_ids = self._get_image_repl(num_images)
+        return patch_repl + image_repl, patch_repl_ids + image_repl_ids
+
+    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
+        parts = text.split(placeholder)
+
+        if len(parts) - 1 != len(repls):
+            raise ValueError(
+                "The number of placeholders does not match the number of replacements."  # noqa: E501
+            )
+
+        result = [parts[0]]
+        for i, repl in enumerate(repls):
+            result.append(repl)
+            result.append(parts[i + 1])
+
+        return "".join(result)
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        *args,
+        **kwargs,
+    ) -> BatchFeature:
+        if text is None:
+            text = []
+        if not isinstance(text, list):
+            text = [text]
+        if images is None:
+            images = []
+        if not isinstance(images, list):
+            images = [images]
+
+        if len(images) == 0:
+            image_inputs = {}
+            text_inputs = self.tokenizer(text)
+        else:
+            splitted_images_data = self._split_images(images)
+            pixel_values_lst = []
+            patch_pixel_values_lst = []
+            patch_newline_mask_lst = []
+            image_repl_str_lst = []
+            image_repl_ids_lst = []
+            num_patches = []
+            for (
+                raw_img,
+                img_patches,
+                patch_newline_mask,
+            ) in splitted_images_data:  # noqa: E501
+                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
+
+                if len(img_patches) > 0:
+                    patch_pixel_values_lst.extend(
+                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
+                    )
+                num_patches.append(len(img_patches))
+
+                image_repl_str, image_repl_ids = self._get_image_repl_features(
+                    1, len(img_patches), patch_newline_mask
+                )
+                image_repl_str_lst.append(image_repl_str)
+                image_repl_ids_lst.extend(image_repl_ids)
+
+                if patch_newline_mask is not None:
+                    patch_newline_mask_lst.extend(patch_newline_mask)
+
+            image_inputs = {
+                "pixel_values": torch.cat(pixel_values_lst),
+                "num_patches": num_patches,
+            }
+            if patch_pixel_values_lst:
+                image_inputs["patch_pixel_values"] = torch.cat(patch_pixel_values_lst)
+            if patch_newline_mask_lst:
+                image_inputs["patch_newline_mask"] = torch.tensor(
+                    patch_newline_mask_lst, dtype=torch.bool
+                )
+
+            text = [
+                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
+                for t in text
+            ]
+            text_inputs = self.tokenizer(text)
+
+        return BatchFeature(
+            {
+                **text_inputs,
+                **image_inputs,
+            },
+            tensor_type=return_tensors,
+        )
+
+
+################################################
+
+
+class Step3VLImageProcessor(SGLangBaseProcessor):
+    models = [Step3VLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # TODO, check _processor is tokenizer or processor.
+        processor = Step3VLProcessor(hf_config, _processor)
+        super().__init__(hf_config, server_args, processor, *args, **kwargs)
+        self.IM_TOKEN_ID = 128001
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<im_patch>",
+            image_token_id=128001,
+            image_token_regex=re.compile(r"(?:<im_patch>)"),
+        ).build(_processor)
+
+        mean = [0.48145466, 0.4578275, 0.40821073]
+        std = [0.26862954, 0.26130258, 0.27577711]
+
+    def preprocess(self, image):
+        return {"pixel_values": self.transform(image).unsqueeze(0)}
+
+    def __call__(self, image):
+        return self.preprocess(image)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text: str | List[int],
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
diff --git a/python/sglang/srt/multimodal/processors/vila.py b/python/sglang/srt/multimodal/processors/vila.py
new file mode 100644
index 000000000..5f9586b6c
--- /dev/null
+++ b/python/sglang/srt/multimodal/processors/vila.py
@@ -0,0 +1,69 @@
+from typing import Any, Dict, List, Optional, Type
+
+import torch.nn as nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+from sglang.srt.managers.io_struct import (
+    EmbeddingReqInput,
+    GenerateReqInput,
+    ImageDataInputItem,
+)
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.server_args import ServerArgs
+
+
+class VILAProcessor(ProcessorMixin):
+    """A stub class for the VILA processor."""
+
+    tokenizer: PreTrainedTokenizerBase
+
+
+class VILAMultimodalProcessor(BaseMultimodalProcessor):
+    models: List[Type[nn.Module]] = [VILAForConditionalGeneration]
+
+    _processor: VILAProcessor
+
+    def __init__(
+        self,
+        hf_config: PretrainedConfig,
+        server_args: ServerArgs,
+        _processor: VILAProcessor,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self._processor.tokenizer.image_token,
+            image_token_id=hf_config.image_token_id,
+            video_token_id=hf_config.video_token_id,
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
+        input_text: str | List[int],
+        request_obj: GenerateReqInput | EmbeddingReqInput,
+        **kwargs,
+    ) -> Optional[Dict[str, Any]]:
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.mm_tokens,
+            image_data=image_data,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+            "video_token_id": self.mm_tokens.video_token_id,
+        }
diff --git a/python/sglang/srt/offloader.py b/python/sglang/srt/offloader.py
new file mode 100644
index 000000000..aea7d7f23
--- /dev/null
+++ b/python/sglang/srt/offloader.py
@@ -0,0 +1,548 @@
+import logging
+import os
+from abc import ABC
+from typing import Callable, Generator, List, Optional
+
+import torch
+from torch.func import functional_call
+
+from sglang.srt.distributed.naive_distributed import (
+    NaiveDistributed,
+    get_naive_distributed,
+    set_naive_distributed,
+)
+from sglang.srt.host_shared_memory import (
+    HostSharedMemoryManager,
+    get_host_shared_memory_manager,
+    set_host_shared_memory_manager,
+)
+from sglang.srt.layers.parameter import ModelWeightParameter
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import MultiprocessingSerializer, is_pin_memory_available
+
+logger = logging.getLogger(__name__)
+
+_SubmoduleAccessor = Callable[[torch.nn.Module], torch.nn.Module]
+_WhitelistParamNamesCreator = Callable[[torch.nn.Module], List[str]]
+
+
+class BaseOffloader(ABC):
+    def wrap_modules(
+        self,
+        all_modules_generator: Generator[torch.nn.Module, None, None],
+        submodule_accessor: Optional[_SubmoduleAccessor] = None,
+        whitelist_param_names_creator: Optional[_WhitelistParamNamesCreator] = None,
+    ):
+        return list(all_modules_generator)
+
+    def post_init(self):
+        pass
+
+
+class NoopOffloader(BaseOffloader):
+    pass
+
+
+# For simplicity use singleton, but can surely support multi instance
+_instance: Optional[BaseOffloader] = NoopOffloader()
+
+
+def get_offloader():
+    assert _instance is not None
+    return _instance
+
+
+def set_offloader(instance: BaseOffloader):
+    global _instance
+    _instance = instance
+
+
+def create_offloader_from_server_args(server_args: ServerArgs, dp_rank: int):
+    if server_args.cpu_offload_gb > 0:
+        return OffloaderV1(
+            cpu_offload_max_bytes=int(server_args.cpu_offload_gb * 1024**3)
+        )
+    if server_args.offload_group_size > 0:
+        assert (
+            server_args.cpu_offload_gb == 0
+        ), "V2 offload does not support cpu_offload_gb yet"
+        return OffloaderV2(
+            group_size=server_args.offload_group_size,
+            num_in_group=server_args.offload_num_in_group,
+            prefetch_step=server_args.offload_prefetch_step,
+            mode=server_args.offload_mode,
+            dp_rank=dp_rank,
+            dp_size=server_args.dp_size,
+        )
+    return NoopOffloader()
+
+
+class OffloaderV1(BaseOffloader):
+    def __init__(self, cpu_offload_max_bytes: int):
+        self._cpu_offload_bytes = 0
+        self._cpu_offload_max_bytes = cpu_offload_max_bytes
+
+    def wrap_modules(
+        self,
+        all_modules_generator: Generator[torch.nn.Module, None, None],
+        submodule_accessor: Optional[_SubmoduleAccessor] = None,
+        whitelist_param_names_creator: Optional[_WhitelistParamNamesCreator] = None,
+    ):
+        return [self.maybe_offload_to_cpu(module) for module in all_modules_generator]
+
+    def maybe_offload_to_cpu(self, module: torch.nn.Module) -> torch.nn.Module:
+        if (params := next(module.parameters(), None)) is None:
+            return module
+
+        device = params.device
+
+        if device == torch.device("cpu"):
+            return module
+
+        if self._cpu_offload_bytes >= self._cpu_offload_max_bytes:
+            return module
+
+        pin_memory = is_pin_memory_available()
+        # offload parameters to CPU
+        # use pin_memory if possible, which helps cudagraph capture speed
+        offloaded_parameters = False
+        for p in module.parameters():
+            if self._cpu_offload_bytes >= self._cpu_offload_max_bytes:
+                # we use per-parameter offloading
+                # one module might have some parameters offloaded and some not
+                break
+
+            # `torch.empty_like` does not support `pin_memory` argument
+            cpu_data = torch.empty_strided(
+                size=p.data.size(),
+                stride=p.data.stride(),
+                dtype=p.data.dtype,
+                layout=p.data.layout,
+                device="cpu",
+                pin_memory=pin_memory,
+            )
+            cpu_data.copy_(p.data)
+            p.data = cpu_data
+            self._cpu_offload_bytes += p.data.numel() * p.data.element_size()
+            offloaded_parameters = True
+
+        if offloaded_parameters:
+            original_forward = module.forward
+
+            def forward(*args, **kwargs):
+                module.forward = original_forward
+                device_state = {
+                    # here we blindly call `to(device)`
+                    # if the parameter is already on the device, it will be a no-op
+                    k: v.to(device, non_blocking=True)
+                    for k, v in module.state_dict().items()
+                }
+                output = functional_call(module, device_state, args=args, kwargs=kwargs)
+                module.forward = forward
+                return output
+
+            module.forward = forward
+
+        return module
+
+
+class OffloaderV2(BaseOffloader):
+    def __init__(
+        self,
+        group_size: int,
+        num_in_group: int,
+        prefetch_step: int,
+        mode: str,
+        dp_rank: int,
+        dp_size: int,
+    ):
+        self.group_size = group_size
+        self.num_in_group = num_in_group
+        self.prefetch_step = prefetch_step
+        self.mode = mode
+
+        run_id = os.environ["SGLANG_RUN_ID"]
+
+        # Temporarily init inside Offloader, can move if other modules also need this
+        if self.mode in {"sharded_gpu", "shm_cpu"}:
+            from sglang.srt.distributed import get_tensor_model_parallel_world_size
+
+            assert (
+                get_tensor_model_parallel_world_size() == 1
+            ), "not yet support tp_size!=1"
+            set_naive_distributed(
+                NaiveDistributed(
+                    rank=dp_rank,
+                    world_size=dp_size,
+                    rendezvous=f"/tmp/{run_id}",
+                )
+            )
+        if self.mode in {"shm_cpu"}:
+            set_host_shared_memory_manager(
+                HostSharedMemoryManager(
+                    base_name=run_id,
+                )
+            )
+
+        self.offloaders = []
+
+    def wrap_modules(
+        self,
+        all_modules_generator: Generator[torch.nn.Module, None, None],
+        submodule_accessor: Optional[_SubmoduleAccessor] = None,
+        whitelist_param_names_creator: Optional[_WhitelistParamNamesCreator] = None,
+    ):
+        assert len(self.offloaders) == 0, "should only call wrap_modules once"
+
+        alt_stream = torch.cuda.Stream()
+
+        all_modules = []
+        offload_submodules = []
+        for module_index, module in enumerate(all_modules_generator):
+            all_modules.append(module)
+            if module_index % self.group_size >= self.group_size - self.num_in_group:
+                submodule = submodule_accessor(module)
+                whitelist_param_names = whitelist_param_names_creator(submodule)
+                logger.info(
+                    f"[offloader] offload {module_index=} submodule={type(submodule)} params={whitelist_param_names} memory_allocated={torch.cuda.memory_allocated()}"
+                )
+                offload_submodules.append(submodule)
+                self.offloaders.append(
+                    _ModuleOffloader(
+                        mode=self.mode,
+                        module=submodule,
+                        alt_stream=alt_stream,
+                        whitelist_param_names=whitelist_param_names,
+                    )
+                )
+
+        for index, module in enumerate(offload_submodules):
+            _hook_module_forward_for_offloader(
+                index=index,
+                module=module,
+                offloaders=self.offloaders,
+                prefetch_step=self.prefetch_step,
+            )
+
+        return all_modules
+
+    def post_init(self):
+        for offloader in self.offloaders:
+            offloader.post_init()
+
+        for i in range(self.prefetch_step):
+            self.offloaders[i].start_onload()
+
+
+def _hook_module_forward_for_offloader(index, module, offloaders, prefetch_step):
+    def _on_forward_end():
+        offloaders[(index + prefetch_step) % len(offloaders)].start_onload()
+        offloaders[index].offload()
+
+    _hook_module_forward_raw(
+        module,
+        on_forward_end=_on_forward_end,
+        get_parameter_and_buffer_dicts=lambda: offloaders[
+            index
+        ].wait_and_get_device_tensors(),
+    )
+
+
+def _hook_module_forward_raw(module, on_forward_end, get_parameter_and_buffer_dicts):
+    original_forward = module.forward
+
+    def forward(*args, **kwargs):
+        module.forward = original_forward
+        output = functional_call(
+            module, get_parameter_and_buffer_dicts(), args=args, kwargs=kwargs
+        )
+        on_forward_end()
+        module.forward = forward
+        return output
+
+    module.forward = forward
+
+
+class _ModuleOffloader(ABC):
+    def __init__(
+        self,
+        mode: str,
+        module: torch.nn.Module,
+        alt_stream: torch.cuda.Stream,
+        whitelist_param_names: List[str],
+    ):
+        self.mode = mode
+        self.module = module
+        self.device = next(module.parameters()).device
+        self.alt_stream = alt_stream
+
+        assert self.device != torch.device(
+            "cpu"
+        ), "not handled device=cpu case yet (should skip this tensor)"
+
+        self._device_tensors = None
+        self._load_event = None
+
+        param_dict = dict(self.module.named_parameters())
+        assert all(
+            name in param_dict for name in whitelist_param_names
+        ), f"{whitelist_param_names=} {list(param_dict.keys())=}"
+
+        self._param_offloaders = {
+            name: _BaseParamOffloader.create(mode, module=module, param_name=name)
+            for name in whitelist_param_names
+        }
+
+    def post_init(self):
+        for name, param_offloader in self._param_offloaders.items():
+            param_offloader.post_init()
+
+    def start_onload(self):
+        self.alt_stream.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(self.alt_stream):
+            self._device_tensors = self._create_device_tensors()
+            self._load_event = torch.cuda.Event()
+            self._load_event.record()
+
+    def offload(self):
+        self._device_tensors = None
+        self._load_event = None
+
+    def wait_and_get_device_tensors(self):
+        assert self._device_tensors is not None
+        self._load_event.wait()
+        return self._device_tensors
+
+    def _create_device_tensors(self):
+        return {k: v.create_device_tensor() for k, v in self._param_offloaders.items()}
+
+
+class _BaseParamOffloader(ABC):
+    @staticmethod
+    def create(mode: str, **kwargs) -> "_BaseParamOffloader":
+        return {
+            "meta": _MetaParamOffloader,
+            "cpu": _CpuParamOffloader,
+            "shm_cpu": _ShmCpuParamOffloader,
+            "sharded_gpu": _ShardedGpuParamOffloader,
+        }[mode](**kwargs)
+
+    def __init__(self, module, param_name):
+        self._module = module
+        self._param_name = param_name
+
+    @property
+    def _param(self):
+        return getattr(self._module, self._param_name)
+
+    def post_init(self):
+        pass
+
+    def create_device_tensor(self):
+        raise NotImplementedError
+
+
+class _MetaParamOffloader(_BaseParamOffloader):
+    """Usually used for debugging."""
+
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        _move_param_to_meta(module, param_name)
+
+    def create_device_tensor(self):
+        return torch.empty_like(self._param.data, device="cuda")
+
+
+class _CpuParamOffloader(_BaseParamOffloader):
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        _move_param_to_cpu(self._param, pin_memory=True)
+
+    def create_device_tensor(self):
+        return self._param.to("cuda", non_blocking=True)
+
+
+class _ShmCpuParamOffloader(_BaseParamOffloader):
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        self._rank = get_naive_distributed().get_rank()
+        self._world_size = get_naive_distributed().get_world_size()
+
+        from sglang.srt.distributed import get_tensor_model_parallel_world_size
+
+        assert get_tensor_model_parallel_world_size() == 1, "not yet support tp_size!=1"
+        assert (
+            self._param.data.is_contiguous()
+        ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+        self.shm_cpu_data = get_host_shared_memory_manager().malloc(
+            shape=self._param.shape, dtype=self._param.dtype
+        )
+
+        if self._rank == 0:
+            self.shm_cpu_data.copy_(self._param.data.to("cpu"))
+            self._param.data = self.shm_cpu_data
+        else:
+            _move_param_to_meta(self._module, self._param_name)
+        get_naive_distributed().barrier()
+
+    def post_init(self):
+        if self._rank == 0:
+            assert (
+                self.shm_cpu_data.data_ptr() == self._param.data.data_ptr()
+            ), f"{self.shm_cpu_data.data_ptr()=} {self._param.data.data_ptr()=} {self.shm_cpu_data=} {self._param.data=}"
+
+        _move_param_to_meta(self._module, self._param_name)
+
+    def create_device_tensor(self):
+        return self.shm_cpu_data.to("cuda", non_blocking=True)
+
+
+def _move_param_to_cpu(param, pin_memory: bool):
+    cpu_data = _empty_strided_like(
+        param.data,
+        device="cpu",
+        pin_memory=pin_memory,
+    )
+    cpu_data.copy_(param.data)
+    param.data = cpu_data
+
+
+def _move_param_to_meta(module, param_name):
+    old_param = getattr(module, param_name)
+    old_param_type = type(old_param)
+
+    new_data = old_param.data.to("meta")
+
+    if old_param_type == ModelWeightParameter:
+        # manually checked how `w13_weight` and `w2_weight` are constructed
+        new_param = ModelWeightParameter(
+            data=new_data,
+            **{
+                k: getattr(old_param, k)
+                for k in ["input_dim", "output_dim", "weight_loader"]
+            },
+        )
+    elif old_param_type == torch.nn.Parameter:
+        new_param = torch.nn.Parameter(
+            data=new_data,
+            requires_grad=False,
+        )
+    else:
+        raise ValueError(f"Unknown {old_param_type=} {old_param=}")
+
+    setattr(module, param_name, new_param)
+
+
+def _empty_strided_like(x: torch.Tensor, device, pin_memory=False):
+    return torch.empty_strided(
+        size=x.size(),
+        stride=x.stride(),
+        dtype=x.dtype,
+        layout=x.layout,
+        device=device,
+        pin_memory=pin_memory,
+    )
+
+
+# ----------------------------------------- ShardedGpu ------------------------------------------------------
+
+
+# TODO unify with ShmCpu mode
+class _ShardedGpuParamOffloader(_BaseParamOffloader):
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        self._rank = get_naive_distributed().get_rank()
+        self._world_size = get_naive_distributed().get_world_size()
+
+        from sglang.srt.distributed import get_tensor_model_parallel_world_size
+
+        assert get_tensor_model_parallel_world_size() == 1, "not yet support tp_size!=1"
+        assert (
+            self._param.data.is_contiguous()
+        ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+        if self._rank == 0:
+            _move_param_to_cpu(self._param, pin_memory=True)
+        else:
+            _move_param_to_meta(self._module, self._param_name)
+
+        self.sharded_param_handles = None
+
+    def post_init(self):
+        # check again since it may be changed
+        assert (
+            self._param.data.is_contiguous()
+        ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+        scatter_src = self._param.data
+
+        logger.info(
+            f"[offloader] post_init {scatter_src.nbytes=} {scatter_src.dtype=} {scatter_src.shape=} {torch.cuda.memory_allocated()=}"
+        )
+
+        if self._rank == 0:
+            scatter_src = scatter_src.to("cuda")
+        scatter_list = _even_chunk(scatter_src, self._world_size)
+
+        sharded_param = torch.empty(
+            scatter_list[0].shape, dtype=scatter_list[0].dtype, device="cuda"
+        )
+        self.sharded_param_handles = _create_shared_buffer_tensors(
+            local_tensor=sharded_param
+        )
+
+        get_naive_distributed().scatter(
+            sharded_param, scatter_list if self._rank == 0 else None
+        )
+
+        _move_param_to_meta(self._module, self._param_name)
+
+    def create_device_tensor(self):
+        output = _empty_strided_like(self._param, device="cuda")
+        output_chunks = output.chunk(self._world_size)
+
+        for index in range(self._world_size):
+            src_rank = (self._rank + index) % self._world_size
+            src_buf = self.sharded_param_handles[src_rank]
+            output_chunks[src_rank].copy_(src_buf)
+
+        return output
+
+
+def _even_chunk(x: torch.Tensor, chunks: int):
+    assert x.shape[0] % chunks == 0, f"{x.shape=} {chunks=}"
+    return list(x.chunk(chunks))
+
+
+def _create_shared_buffer_tensors(local_tensor: torch.Tensor) -> List[torch.Tensor]:
+    self_rank = get_naive_distributed().get_rank()
+    world_size = get_naive_distributed().get_world_size()
+
+    object_list = get_naive_distributed().all_gather_object(
+        dict(
+            dup_serialized_local_tensor=[
+                (
+                    None
+                    if interesting_rank == self_rank
+                    else MultiprocessingSerializer.serialize(local_tensor)
+                )
+                for interesting_rank in range(world_size)
+            ]
+        )
+    )
+
+    output_tensors = []
+    for output_rank in range(world_size):
+        remote_serialized_tensor = object_list[output_rank][
+            "dup_serialized_local_tensor"
+        ][self_rank]
+        if output_rank == self_rank:
+            assert remote_serialized_tensor is None
+            output_tensors.append(local_tensor)
+        else:
+            output_tensors.append(
+                MultiprocessingSerializer.deserialize(remote_serialized_tensor)
+            )
+
+    return output_tensors
diff --git a/python/sglang/srt/operations.py b/python/sglang/srt/operations.py
new file mode 100644
index 000000000..f8730cd77
--- /dev/null
+++ b/python/sglang/srt/operations.py
@@ -0,0 +1,209 @@
+from __future__ import annotations
+
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Sequence, Union
+
+import torch
+
+from sglang.srt.layers.dp_attention import set_dp_buffer_len
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+_ENABLE_PROFILE = bool(int(os.environ.get("SGLANG_OPERATIONS_ENABLE_PROFILE", "0")))
+
+if _ENABLE_PROFILE:
+    import nvtx
+
+
+def execute_operations(inputs, operations):
+    stages = _convert_operations_to_stages(operations)
+    executor = _StageExecutor("primary", stages, inputs=inputs)
+    for _ in range(executor.num_stages):
+        executor.next()
+    assert executor.done
+    return executor.output
+
+
+def execute_overlapped_operations(
+    inputs_arr: Sequence,
+    operations_arr: Sequence,
+    delta_stages: Sequence[int],
+) -> Sequence:
+    # Make it explicit for clarity; if we need multi-batch overlap, this can be generalized
+    inputs_a, inputs_b = inputs_arr
+    operations_a, operations_b = operations_arr
+    delta_stage_a, delta_stage_b = delta_stages
+    assert delta_stage_a == 0
+    delta_stage = delta_stage_b
+
+    stages_a = _convert_operations_to_stages(operations_a)
+    stages_b = _convert_operations_to_stages(operations_b)
+    executor_a = _StageExecutor("a", stages_a, inputs=inputs_a)
+    executor_b = _StageExecutor("b", stages_b, inputs=inputs_b)
+
+    for _ in range(delta_stage):
+        executor_a.next()
+
+    for _ in range(executor_a.num_stages - delta_stage):
+        executor_a.next()
+        executor_b.next()
+
+    for _ in range(delta_stage):
+        executor_b.next()
+
+    assert executor_a.done and executor_b.done
+    return [executor_a.output, executor_b.output]
+
+
+class YieldOperation:
+    pass
+
+
+@dataclass
+class ExecutionOperation:
+    debug_name: str
+    fn: Callable
+
+
+Operation = Union[YieldOperation, ExecutionOperation, Callable]
+Stage = List[ExecutionOperation]
+
+
+class _StageExecutor:
+    def __init__(self, debug_name: str, stages: List[Stage], inputs: dict):
+        self._debug_name = debug_name
+        self._stages = stages
+        self._index = 0
+        self._stage_state = _StateDict()
+        self._stage_output = inputs
+
+        # handling DP attention
+        forward_batch: ForwardBatch = inputs["forward_batch"]
+        self._global_dp_buffer_len = forward_batch.global_dp_buffer_len
+        self._local_dp_buffer_len = forward_batch.input_ids.shape[0]
+        self._global_num_tokens = forward_batch.global_num_tokens_cpu
+
+    def next(self):
+        assert not self.done
+
+        stage = self._stages[self._index]
+
+        if self._global_dp_buffer_len is not None:
+            set_dp_buffer_len(
+                self._global_dp_buffer_len,
+                self._local_dp_buffer_len,
+                self._global_num_tokens,
+            )
+
+        with _annotate_region(debug_name=f"{self._debug_name}{self._index}"):
+            for op in stage:
+                with _annotate_region(debug_name=op.debug_name):
+                    self._stage_output = op.fn(
+                        state=self._stage_state,
+                        **(
+                            self._stage_output if self._stage_output is not None else {}
+                        ),
+                    )
+
+        self._index += 1
+
+    @property
+    def output(self):
+        assert self.done
+        return self._stage_output
+
+    @property
+    def done(self):
+        return self._index >= self.num_stages
+
+    @property
+    def num_stages(self):
+        return len(self._stages)
+
+
+@contextmanager
+def _annotate_region(debug_name):
+    if _ENABLE_PROFILE:
+        with torch.autograd.profiler.record_function(debug_name):
+            with nvtx.annotate(debug_name):
+                yield
+    else:
+        yield
+
+
+class _StateDict:
+    def __init__(self):
+        self._data = {}
+
+    def __setattr__(self, key, value):
+        if key == "_data":
+            super().__setattr__(key, value)
+            return
+        assert (
+            key not in self._data
+        ), f"`{key}` already exist, are you sure you want to override it?"
+        self._data[key] = value
+
+    def __getattr__(self, item):
+        return self._data[item]
+
+    def __delattr__(self, item):
+        del self._data[item]
+
+    def pop(self, item):
+        return self._data.pop(item)
+
+    def update(self, values: Dict[str, Any]):
+        for k, v in values.items():
+            setattr(self, k, v)
+
+    def get(self, item):
+        return self._data.get(item)
+
+    def clear(self, expect_keys: Sequence[str]):
+        if set(self._data.keys()) != set(expect_keys):
+            raise Exception(
+                f"Unexpected keys when clearning. This may indicate you do not release memory early enough but leave it to here. {list(self._data.keys())=} {expect_keys=}"
+            )
+
+        self._data.clear()
+
+
+def _convert_operations_to_stages(operations: List[Operation]) -> List[Stage]:
+    operations = _decorate_operations(operations)
+    operation_chunks = list(
+        _chunk_by_separator(operations, lambda op: isinstance(op, YieldOperation))
+    )
+    assert all(len(chunk) > 0 for chunk in operation_chunks)
+    return operation_chunks
+
+
+def _chunk_by_separator(
+    items: List[Any], is_separator: Callable[[Any], bool]
+) -> Generator[List[Any], None, None]:
+    pending_items = []
+    for item in items:
+        if is_separator(item):
+            yield pending_items
+            pending_items = []
+        else:
+            pending_items.append(item)
+    if len(pending_items) > 0:
+        yield pending_items
+
+
+def _decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
+    return [_decorate_operation(op, debug_name_prefix) for op in operations]
+
+
+def _decorate_operation(operation: Operation, debug_name_prefix: str):
+    if isinstance(operation, YieldOperation):
+        return operation
+    return ExecutionOperation(
+        debug_name=debug_name_prefix
+        + getattr(operation, "__name__", "unknown").replace("op_", ""),
+        fn=operation,
+    )
diff --git a/python/sglang/srt/operations_strategy.py b/python/sglang/srt/operations_strategy.py
new file mode 100644
index 000000000..cbed560e3
--- /dev/null
+++ b/python/sglang/srt/operations_strategy.py
@@ -0,0 +1,211 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+import torch
+
+from sglang.srt import operations
+from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.operations import Operation
+
+
+@dataclass
+class OperationsStrategy:
+    operations: List[Operation]
+    deep_gemm_num_sms: Optional[int] = None
+    tbo_delta_stages: Optional[int] = None
+
+    @classmethod
+    def concat(cls, items: List["OperationsStrategy"]) -> "OperationsStrategy":
+        return OperationsStrategy(
+            operations=[x for item in items for x in item.operations],
+            deep_gemm_num_sms=_assert_all_same(
+                [item.deep_gemm_num_sms for item in items]
+            ),
+            tbo_delta_stages=_assert_all_same(
+                [item.tbo_delta_stages for item in items]
+            ),
+        )
+
+    @staticmethod
+    def init_new_tbo(
+        layers: torch.nn.ModuleList,
+        forward_mode: ForwardMode,
+    ) -> "OperationsStrategy":
+        layer_name = layers[0].__class__.__name__
+        if layer_name == "DeepseekV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_deepseek_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "Qwen3MoeDecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_qwen3_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        else:
+            raise NotImplementedError
+
+
+def _assert_all_same(items: List):
+    assert all(item == items[0] for item in items)
+    return items[0]
+
+
+# -------------------------------- Strategy for DeepSeek ---------------------------------------
+
+
+# TODO can refactor to make it more fancy if we have more complex strategies
+def _compute_moe_deepseek_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_deepseek_blog_prefill(layer)
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
+        return _compute_moe_deepseek_blog_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_deepseek_blog_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_shared_experts,
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_deepseek_blog_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            layer.mlp.op_shared_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            operations.YieldOperation(),
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+# -------------------------------- Strategy for Qwen3 ---------------------------------------
+
+
+# TODO: unstable, current strategy is almost the same as DeepSeek, keep redundant code here for
+# convenience to adjust strategy
+def _compute_moe_qwen3_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_qwen3_prefill(layer)
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
+        return _compute_moe_qwen3_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_qwen3_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_qwen3_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+            operations.YieldOperation(),
+        ],
+    )
diff --git a/python/sglang/srt/parser/code_completion_parser.py b/python/sglang/srt/parser/code_completion_parser.py
new file mode 100644
index 000000000..0067ac471
--- /dev/null
+++ b/python/sglang/srt/parser/code_completion_parser.py
@@ -0,0 +1,132 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Completion templates."""
+
+
+import dataclasses
+import logging
+from enum import auto
+
+from sglang.srt.entrypoints.openai.protocol import CompletionRequest
+
+logger = logging.getLogger(__name__)
+completion_template_name = None
+
+
+class FimPosition:
+    """Position of fim middle token."""
+
+    MIDDLE = auto()
+    END = auto()
+
+
+@dataclasses.dataclass
+class CompletionTemplate:
+    """A class that manages completion prompt templates. only for code completion currently."""
+
+    # The name of this template
+    name: str
+
+    # the fim begin token
+    fim_begin_token: str
+
+    # The fim middle token
+    fim_middle_token: str
+
+    # The fim end token
+    fim_end_token: str
+
+    # The position of the fim middle token
+    fim_position: FimPosition
+
+
+# A global registry for all completion templates
+completion_templates: dict[str, CompletionTemplate] = {}
+
+
+def register_completion_template(template: CompletionTemplate, override: bool = False):
+    """Register a new completion template."""
+    if not override:
+        assert (
+            template.name not in completion_templates
+        ), f"{template.name} has been registered."
+
+    completion_templates[template.name] = template
+
+
+def completion_template_exists(template_name: str) -> bool:
+    return template_name in completion_templates
+
+
+def is_completion_template_defined() -> bool:
+    global completion_template_name
+    return completion_template_name is not None
+
+
+def generate_completion_prompt_from_request(request: CompletionRequest) -> str:
+    global completion_template_name
+    if request.suffix == "":
+        return request.prompt
+
+    return generate_completion_prompt(
+        request.prompt, request.suffix, completion_template_name
+    )
+
+
+def generate_completion_prompt(prompt: str, suffix: str, template_name: str) -> str:
+
+    completion_template = completion_templates[template_name]
+    fim_begin_token = completion_template.fim_begin_token
+    fim_middle_token = completion_template.fim_middle_token
+    fim_end_token = completion_template.fim_end_token
+    fim_position = completion_template.fim_position
+
+    if fim_position == FimPosition.MIDDLE:
+        prompt = f"{fim_begin_token}{prompt}{fim_middle_token}{suffix}{fim_end_token}"
+    elif fim_position == FimPosition.END:
+        prompt = f"{fim_begin_token}{prompt}{fim_end_token}{suffix}{fim_middle_token}"
+
+    return prompt
+
+
+register_completion_template(
+    CompletionTemplate(
+        name="deepseek_coder",
+        fim_begin_token="<｜fim▁begin｜>",
+        fim_middle_token="<｜fim▁hole｜>",
+        fim_end_token="<｜fim▁end｜>",
+        fim_position=FimPosition.MIDDLE,
+    )
+)
+
+
+register_completion_template(
+    CompletionTemplate(
+        name="star_coder",
+        fim_begin_token="<fim_prefix>",
+        fim_middle_token="<fim_middle>",
+        fim_end_token="<fim_suffix>",
+        fim_position=FimPosition.END,
+    )
+)
+
+register_completion_template(
+    CompletionTemplate(
+        name="qwen_coder",
+        fim_begin_token="<|fim_prefix|>",
+        fim_middle_token="<|fim_middle|>",
+        fim_end_token="<|fim_suffix|>",
+        fim_position=FimPosition.END,
+    )
+)
diff --git a/python/sglang/srt/parser/conversation.py b/python/sglang/srt/parser/conversation.py
new file mode 100644
index 000000000..8a2fe4e7f
--- /dev/null
+++ b/python/sglang/srt/parser/conversation.py
@@ -0,0 +1,1040 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Conversation chat templates.
+
+This module provides conversation template definitions, data structures, and utilities
+for managing chat templates across different model types in SGLang.
+
+Key components:
+- Conversation class: Defines the structure and behavior of chat templates
+- SeparatorStyle enum: Different conversation formatting styles
+- Template registry: Functions to register and retrieve templates by name or model path
+- Built-in templates: Pre-defined templates for popular models
+"""
+
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+import dataclasses
+import json
+import os
+import re
+from enum import IntEnum, auto
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+from typing_extensions import Literal
+
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
+from sglang.srt.utils import ImageData, read_system_prompt_from_file
+
+
+class SeparatorStyle(IntEnum):
+    """Separator styles."""
+
+    ADD_COLON_SINGLE = auto()
+    ADD_COLON_TWO = auto()
+    ADD_COLON_SPACE_SINGLE = auto()
+    NO_COLON_SINGLE = auto()
+    NO_COLON_TWO = auto()
+    ADD_NEW_LINE_SINGLE = auto()
+    LLAMA2 = auto()
+    LLAMA3 = auto()
+    LLAMA4 = auto()
+    CHATGLM = auto()
+    CHATML = auto()
+    CHATINTERN = auto()
+    DOLLY = auto()
+    RWKV = auto()
+    PHOENIX = auto()
+    ROBIN = auto()
+    FALCON_CHAT = auto()
+    CHATGLM3 = auto()
+    DEEPSEEK_CHAT = auto()
+    METAMATH = auto()
+    DeepSeekVL2 = auto()
+    QWEN2_VL_EMBED = auto()
+    QWEN2_AUDIO = auto()
+    GEMMA3 = auto()
+    MPT = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+    """A class that manages prompt templates and keeps all conversation history."""
+
+    # The name of this template
+    name: str
+    # The template of the system prompt
+    system_template: str = "{system_message}"
+    # The system message
+    system_message: str = ""
+    # The names of two roles
+    roles: Tuple[str] = ("USER", "ASSISTANT")
+    # All messages. Each item is (role, message).
+    messages: List[List[str]] = ()
+    # The number of few shot examples
+    offset: int = 0
+    # The separator style and configurations
+    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
+    sep: str = "\n"
+    sep2: str = None
+    # Stop criteria (the default one is EOS token)
+    stop_str: Union[str, List[str]] = None
+    # The string that represents an image token in the prompt
+    image_token: str = "<image>"
+    video_token: str = "<video>"
+    audio_token: str = "<audio>"
+
+    image_data: Optional[List[ImageData]] = None
+    video_data: Optional[List[str]] = None
+    modalities: Optional[List[str]] = None
+    stop_token_ids: Optional[int] = None
+
+    audio_data: Optional[List[str]] = None
+
+    def get_prompt(self) -> str:
+        """Get the prompt for generation."""
+        system_prompt = self.system_template.format(system_message=self.system_message)
+        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ": "  # must be end with a space
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_VL_EMBED:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            ret += self.stop_str
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + message + seps[i % 2]
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.RWKV:
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += (
+                        role
+                        + ": "
+                        + message.replace("\r\n", "\n").replace("\n\n", "\n")
+                    )
+                    ret += "\n\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA4:
+            # begin_of_text is added by default
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+                    ret += f"{message.strip()}<|eot|>"
+                else:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA3:
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+                    ret += f"{message.strip()}<|eot_id|>"
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA2:
+            seps = [self.sep, self.sep2]
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = "[INST] "
+            for i, (role, message) in enumerate(self.messages):
+                tag = self.roles[i % 2]
+                if message:
+                    if i == 0:
+                        ret += message + " "
+                    else:
+                        ret += tag + " " + message + seps[i % 2]
+                else:
+                    ret += tag
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM:
+            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
+            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
+            round_add_n = 1 if self.name == "chatglm2" else 0
+            if system_prompt:
+                ret = system_prompt + self.sep
+            else:
+                ret = ""
+
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += f"[Round {i // 2 + round_add_n}]{self.sep}"
+
+                if message:
+                    ret += f"{role}：{message}{self.sep}"
+                else:
+                    ret += f"{role}："
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATML:
+            ret = "" if system_prompt == "" else system_prompt + self.sep + "\n"
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM3:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATINTERN:
+            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += "<s>"
+                if message:
+                    ret += role + ":" + message + seps[i % 2] + "\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DOLLY:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ":\n" + message + seps[i % 2]
+                    if i % 2 == 1:
+                        ret += "\n\n"
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.PHOENIX:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + "<s>" + message + "</s>"
+                else:
+                    ret += role + ": " + "<s>"
+            return ret
+        elif self.sep_style == SeparatorStyle.ROBIN:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ":\n" + message + self.sep
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.FALCON_CHAT:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.METAMATH:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for i, (role, message) in enumerate(self.messages):
+                # For MetaMath, sep2 is used to prefix the message.
+                starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2
+                ending_sep = self.sep if i % 2 == 0 else ""
+                if message:
+                    ret += role + starting_sep + message + ending_sep
+                else:
+                    ret += role + starting_sep
+            return ret
+        elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DeepSeekVL2:
+            seps = [self.sep, self.sep2]
+            if system_prompt == "" or system_prompt is None:
+                ret = ""
+            else:
+                ret = system_prompt + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.GEMMA3:
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    if i == 0:
+                        ret += message + self.sep
+                    else:
+                        ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_AUDIO:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+
+            counter = 1
+            for role, message in self.messages:
+                if message:
+                    while self.audio_token in message:
+                        message = message.replace(
+                            self.audio_token, self.audio_token.format(idx=counter), 1
+                        )
+                        counter += 1
+
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+    def set_system_message(self, system_message: str):
+        """Set the system message."""
+        self.system_message = system_message
+
+    def append_message(self, role: str, message: str):
+        """Append a new message."""
+        self.messages.append([role, message])
+
+    def append_image(self, image: str, detail: Literal["auto", "low", "high"]):
+        """Append a new image."""
+        self.image_data.append(ImageData(url=image, detail=detail))
+
+    def append_video(self, video: str):
+        """Append a new video."""
+        self.video_data.append(video)
+
+    def append_audio(self, audio: str):
+        """Append a new audio."""
+        self.audio_data.append(audio)
+
+    def update_last_message(self, message: str):
+        """Update the last output.
+
+        The last message is typically set to be None when constructing the prompt,
+        so we need to update it in-place after getting the response from a model.
+        """
+        self.messages[-1][1] = message
+
+    def to_gradio_chatbot(self):
+        """Convert the conversation to gradio chatbot format."""
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+
+    def to_openai_api_messages(self):
+        """Convert the conversation to OpenAI chat completion format."""
+        if self.system_message == "":
+            ret = []
+        else:
+            ret = [{"role": "system", "content": self.system_message}]
+
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append({"role": "user", "content": msg})
+            else:
+                if msg is not None:
+                    ret.append({"role": "assistant", "content": msg})
+        return ret
+
+    def copy(self):
+        return Conversation(
+            name=self.name,
+            system_template=self.system_template,
+            system_message=self.system_message,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            stop_str=self.stop_str,
+            image_token=self.image_token,
+            video_token=self.video_token,
+            audio_token=self.audio_token,
+        )
+
+    def dict(self):
+        return {
+            "template_name": self.name,
+            "system_message": self.system_message,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+        }
+
+
+# A global registry for all conversation templates
+chat_templates: Dict[str, Conversation] = {}
+matching_function_registry: List[Callable] = []
+
+
+def register_conv_template(template: Conversation, override: bool = False):
+    """Register a new conversation template."""
+    if not override:
+        assert (
+            template.name not in chat_templates
+        ), f"{template.name} has been registered."
+
+    chat_templates[template.name] = template
+
+
+def register_conv_template_matching_function(func):
+    matching_function_registry.append(func)
+
+
+def get_conv_template_by_model_path(model_path):
+    for matching_func in matching_function_registry:
+        conv_name = matching_func(model_path)
+        if conv_name is not None:
+            return conv_name
+    return None
+
+
+def chat_template_exists(template_name: str) -> bool:
+    return template_name in chat_templates
+
+
+def generate_embedding_convs(
+    texts: List[str], images: List[str], template_name: str
+) -> List[Conversation]:
+    conv_template = chat_templates[template_name].copy()
+    convs = []
+    for text, image in zip(texts, images):
+        conv = Conversation(
+            name=conv_template.name,
+            system_template=conv_template.system_template,
+            system_message=conv_template.system_message,
+            roles=conv_template.roles,
+            messages=list(conv_template.messages),  # prevent in-place modification
+            offset=conv_template.offset,
+            sep_style=SeparatorStyle(conv_template.sep_style),
+            sep=conv_template.sep,
+            sep2=conv_template.sep2,
+            stop_str=conv_template.stop_str,
+            image_data=[],
+            video_data=[],
+            audio_data=[],
+            modalities=[],
+            image_token=conv_template.image_token,
+            video_token=conv_template.video_token,
+            audio_token=conv_template.audio_token,
+        )
+        real_content = ""
+
+        if image is not None:
+            image_token = (
+                conv.image_token + "\n"
+                if conv.name != "gme-qwen2-vl"
+                else conv.image_token
+            )
+            real_content += image_token
+        if text is not None:
+            real_content += text
+        conv.append_message(conv.roles[0], real_content)
+        # Add a blank message for the assistant.
+        conv.append_message(conv.roles[1], None)
+        convs.append(conv)
+
+    return convs
+
+
+# Models in which system adds modality tokens at prompt start automatically
+# when media inputs exceed modality tokens in prompt (e.g. 3 images but 2 <image> tokens)
+_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
+def _get_full_multimodal_text_prompt(
+    modality_token: str, modality_count: int, text_prompt: str
+) -> str:
+    """Combine multimodal prompts for a multimodal language model."""
+
+    # For any existing placeholder in the text prompt, we leave it as is
+    left: int = modality_count - text_prompt.count(modality_token)
+    if left < 0:
+        raise ValueError(
+            f"Found more '{modality_token}' placeholders in input prompt than "
+            "actual multimodal data items."
+        )
+
+    # NOTE: For now we always add missing modality_token at the front of
+    # the prompt. This may change to be customizable in the future.
+    return "\n".join([modality_token] * left + [text_prompt])
+
+
+def generate_chat_conv(
+    request: ChatCompletionRequest, template_name: str
+) -> Conversation:
+    conv = chat_templates[template_name].copy()
+    conv = Conversation(
+        name=conv.name,
+        system_template=conv.system_template,
+        system_message=conv.system_message,
+        roles=conv.roles,
+        messages=list(conv.messages),  # prevent in-place modification
+        offset=conv.offset,
+        sep_style=SeparatorStyle(conv.sep_style),
+        sep=conv.sep,
+        sep2=conv.sep2,
+        stop_str=conv.stop_str,
+        image_data=[],
+        video_data=[],
+        audio_data=[],
+        modalities=[],
+        image_token=conv.image_token,
+        audio_token=conv.audio_token,
+        video_token=conv.video_token,
+    )
+
+    if isinstance(request.messages, str):
+        raise ValueError("The messages should be a list of dict.")
+    for message in request.messages:
+        msg_role = message.role
+        if msg_role == "system":
+            if isinstance(message.content, str):
+                conv.system_message = message.content
+            elif isinstance(message.content, list):
+                if (
+                    len(message.content) != 1
+                    or getattr(message.content[0], "type", None) != "text"
+                ):
+                    raise ValueError("The system message should be a single text.")
+                else:
+                    conv.system_message = getattr(message.content[0], "text", "")
+        elif msg_role == "user":
+            # Handle the various types of Chat Request content types here.
+            if isinstance(message.content, str):
+                conv.append_message(conv.roles[0], message.content)
+            else:
+                real_content = ""
+                # calculate number of image_url
+                num_image_url = 0
+                for content in message.content:
+                    if content.type == "image_url":
+                        num_image_url += 1
+                        conv.modalities.append(content.modalities)
+                image_token = (
+                    conv.image_token + "\n"
+                    if conv.name != "qwen2-vl"
+                    else conv.image_token
+                )
+                add_token_as_needed: bool = (
+                    conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
+                )
+                if add_token_as_needed:
+                    image_token = ""
+
+                audio_token = conv.audio_token
+                video_token = conv.video_token
+                for content in message.content:
+                    if content.type == "text":
+                        if num_image_url > 16:
+                            real_content += "\n"  # for video
+                        real_content += content.text
+                    elif content.type == "image_url":
+                        # NOTE: works for llava and intervl2_5
+                        if conv.name in ["internvl-2-5"]:
+                            real_content = image_token + real_content
+                        else:
+                            real_content += image_token
+                        conv.append_image(
+                            content.image_url.url, content.image_url.detail
+                        )
+                    elif content.type == "video_url":
+                        real_content += video_token
+                        conv.append_video(content.video_url.url)
+                    elif content.type == "audio_url":
+                        real_content += audio_token
+                        conv.append_audio(content.audio_url.url)
+                if add_token_as_needed:
+                    real_content = _get_full_multimodal_text_prompt(
+                        conv.image_token, num_image_url, real_content
+                    )
+                conv.append_message(conv.roles[0], real_content)
+        elif msg_role == "assistant":
+            parsed_content = ""
+            if isinstance(message.content, str):
+                parsed_content = message.content
+            elif isinstance(message.content, list):
+                if (
+                    len(message.content) != 1
+                    or getattr(message.content[0], "type", None) != "text"
+                ):
+                    raise ValueError(
+                        "The assistant's response should be a single text."
+                    )
+                else:
+                    parsed_content = getattr(message.content[0], "text", "")
+            conv.append_message(conv.roles[1], parsed_content)
+        else:
+            raise ValueError(f"Unknown role: {msg_role}")
+
+    # Add a blank message for the assistant.
+    conv.append_message(conv.roles[1], None)
+    return conv
+
+
+# llama2 template
+# reference: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
+register_conv_template(
+    Conversation(
+        name="llama-2",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"],
+    )
+)
+
+# reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_conv_template(
+    Conversation(
+        name="mistral",
+        system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
+        image_token="[IMG]",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="devstral",
+        system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
+        system_message=read_system_prompt_from_file("mistralai/Devstral-Small-2505"),
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
+        image_token="[IMG]",
+    )
+)
+
+# reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
+register_conv_template(
+    Conversation(
+        name="llama-4",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot|>", "<|eom|>"],
+        image_token="<|image|>",
+    )
+)
+
+# TODO (lifuhuang): Refactor BaseMultimodalProcessor to support the default image token "<|image_{index}|>" in the future.
+register_conv_template(
+    Conversation(
+        name="phi-4-mm",
+        system_message="",
+        system_template="{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="<|end|>",
+        stop_str="<|end|>",
+        image_token="<|endoftext10|>",
+        audio_token="<|endoftext11|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="chatml",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str=["<|endoftext|>", "<|im_end|>"],
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="chatml-llava",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str=["<|endoftext|>", "<|im_end|>"],
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="vicuna_v1.1",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="llama_3_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+        image_token="<|image|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="llava_llama_3",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+    )
+)
+# Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
+register_conv_template(
+    Conversation(
+        name="internlm2-chat",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="internvl-2-5",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+        sep_style=SeparatorStyle.MPT,
+        sep="<|im_end|>\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+        image_token="<IMG_CONTEXT>",
+    )
+)
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_conv_template(
+    Conversation(
+        name="qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="deepseek-vl2",
+        system_template="{system_message}",
+        # system_message="You are a helpful assistant. Please answer truthfully and write out your "
+        # "thinking step by step to be sure you get the right answer.",
+        system_message="",
+        roles=("<|User|>", "<|Assistant|>"),
+        messages=(),
+        offset=0,
+        sep_style=SeparatorStyle.DeepSeekVL2,
+        sep="\n\n",
+        sep2="<｜end▁of▁sentence｜>",
+        stop_str=["User:", "<｜end▁of▁sentence｜>"],
+    )
+)
+
+# Reference: https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json
+register_conv_template(
+    Conversation(
+        name="gemma-it",
+        system_message="You are a helpful assistant.",
+        system_template="<start_of_turn>user\n{system_message}\n\n",
+        roles=("<start_of_turn>user\n", "<start_of_turn>model\n"),
+        sep="<end_of_turn>\n",
+        sep_style=SeparatorStyle.GEMMA3,
+        stop_str=["<end_of_turn>"],
+        image_token="<start_of_image>",
+        audio_token="<start_of_audio>",
+    )
+)
+
+# Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
+register_conv_template(
+    Conversation(
+        name="gme-qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_VL_EMBED,
+        stop_str="<|endoftext|>",
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
+# Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
+register_conv_template(
+    Conversation(
+        name="minicpmv",
+        system_message="You are a helpful assistant",
+        system_template="<|im_start|>system\n{system_message}.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+        video_token="(<video>./</video>)",
+    )
+)
+
+# Reference: https://github.com/deepseek-ai/Janus?tab=readme-ov-file#janus-pro
+register_conv_template(
+    Conversation(
+        name="janus-pro",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language",
+        system_template="{system_message}.",
+        roles=("User", "Assistant"),
+        sep="\n\n",
+        sep2="<｜end▁of▁sentence｜>",
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        stop_str=["<|User|>", "<｜end▁of▁sentence｜>"],
+        image_token="<image_placeholder>",
+    )
+)
+
+# Reference: https://huggingface.co/openbmb/MiniCPM-o-2_6#usage
+register_conv_template(
+    Conversation(
+        name="minicpmo",
+        system_message="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+        audio_token="(<audio>./</audio>)",
+    )
+)
+
+# Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
+register_conv_template(
+    Conversation(
+        name="kimi-vl",
+        system_message="You are a helpful assistant",
+        system_template="<|im_system|>system<|im_middle|>{system_message}",
+        roles=(
+            "<|im_user|>user<|im_middle|>",
+            "<|im_assistant|>assistant<|im_middle|>",
+        ),
+        messages=[],
+        sep="<|im_end|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|im_end|>",
+        image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="qwen2-audio",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_AUDIO,
+        stop_str=["<|im_end|>"],
+        audio_token="Audio {idx}: <|audio_bos|><|AUDIO|><|audio_eos|>\n",
+    )
+)
+
+
+MODEL_TYPE_TO_TEMPLATE = {
+    "internvl_chat": "internvl-2-5",
+    "deepseek_vl_v2": "deepseek-vl2",
+    "multi_modality": "janus-pro",
+    "phi4mm": "phi-4-mm",
+    "minicpmv": "minicpmv",
+    "minicpmo": "minicpmo",
+}
+
+
+def get_model_type(model_path: str) -> Optional[str]:
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("model_type")
+    except (IOError, json.JSONDecodeError):
+        return None
+
+
+@register_conv_template_matching_function
+def match_internvl(model_path: str):
+    if re.search(r"internvl", model_path, re.IGNORECASE):
+        return "internvl-2-5"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_vicuna(model_path: str):
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
+
+
+@register_conv_template_matching_function
+def match_deepseek_vl(model_path: str):
+    if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
+        return "deepseek-vl2"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_qwen_chat_ml(model_path: str):
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "chatml-llava"
+
+
+@register_conv_template_matching_function
+def match_minicpm(model_path: str):
+    match = re.search(r"minicpm-(v|o)", model_path, re.IGNORECASE)
+    if match:
+        return f"minicpm{match.group(1).lower()}"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_phi_4_mm(model_path: str):
+    if "phi-4-multimodal" in model_path.lower():
+        return "phi-4-mm"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
diff --git a/python/sglang/srt/parser/harmony_parser.py b/python/sglang/srt/parser/harmony_parser.py
new file mode 100644
index 000000000..ffc0be95e
--- /dev/null
+++ b/python/sglang/srt/parser/harmony_parser.py
@@ -0,0 +1,588 @@
+import re
+from dataclasses import dataclass
+from typing import Iterator, List, Optional, Tuple
+
+
+@dataclass
+class Event:
+    """Represents a parsed event from the Harmony stream."""
+
+    event_type: str
+    content: str
+    raw_text: str = None  # Original text including structural markers
+
+
+@dataclass
+class Token:
+    """A structural token in the Harmony format."""
+
+    type: str
+    start: int
+    end: int
+
+
+def prefix_hold(text: str, tokens: List[str]) -> Tuple[str, str]:
+    """
+    Holds back the longest suffix of `text` that could be a prefix of any token.
+    Returns (emit_now, keep_for_later).
+    """
+    if not text:
+        return "", ""
+    max_hold = 0
+    for tok in tokens:
+        if not tok:
+            continue
+        # Check for prefixes of tok in the suffix of text
+        L = min(len(tok) - 1, len(text))
+        for k in range(L, 0, -1):
+            if tok.startswith(text[-k:]):
+                max_hold = max(max_hold, k)
+                break
+    if max_hold == 0:
+        return text, ""
+    return text[:-max_hold], text[-max_hold:]
+
+
+def iter_tokens(text: str, start_pos: int = 0) -> Iterator[Token]:
+    """Iterate over structural tokens in left-to-right order."""
+    TOKENS = {
+        "<|start|>": "START",
+        "<|channel|>": "CHANNEL",
+        "<|message|>": "MESSAGE",
+        "<|constrain|>": "CONSTRAIN",
+        "<|end|>": "END",
+        "<|call|>": "CALL",
+        "<|return|>": "RETURN",
+    }
+
+    pos = start_pos
+    has_unknown_tokens = False
+    while pos < len(text):
+        # Find next "<|"
+        marker_pos = text.find("<|", pos)
+        if marker_pos == -1:
+            break
+
+        # Emit any text before the marker
+        if marker_pos > pos:
+            yield Token("TEXT", pos, marker_pos)
+
+        # Check which token it is
+        found_token = False
+
+        for literal, token_type in TOKENS.items():
+            if text.startswith(literal, marker_pos):
+                yield Token(token_type, marker_pos, marker_pos + len(literal))
+                pos = marker_pos + len(literal)
+                found_token = True
+                break
+        if not found_token:
+            tail = text[marker_pos:]
+            is_partial = any(lit.startswith(tail) for lit in TOKENS)
+            if is_partial:
+                # Hold whole tail (partial token)
+                yield Token("TEXT", marker_pos, len(text))
+                pos = len(text)
+                break
+            else:
+                # Unknown token like <|weird|> ...
+                has_unknown_tokens = True
+                # Emit the "<|" as a TEXT token first
+                yield Token("TEXT", marker_pos, marker_pos + 2)
+
+                # Try to find a closing "|>" for this unknown token
+                close_pos = text.find("|>", marker_pos + 2)
+                if close_pos != -1:
+                    # Look ahead to the next structural token after the unknown close
+                    next_marker = text.find("<|", close_pos + 2)
+                    if next_marker != -1:
+                        # Emit the unknown body + any following plain text up to next marker
+                        yield Token("TEXT", marker_pos + 2, next_marker)
+                        pos = next_marker
+                    else:
+                        # Emit until the end
+                        yield Token("TEXT", marker_pos + 2, len(text))
+                        pos = len(text)
+                        break
+                else:
+                    # No closing; advance past "<|" and continue scanning
+                    pos = marker_pos + 2
+
+    # Emit any remaining text
+    if pos < len(text):
+        yield Token("TEXT", pos, len(text))
+    elif pos == len(text) and has_unknown_tokens:
+        # Add an empty trailing TEXT token only when we encountered unknown tokens
+        # and the text ends with a known structural token. This matches expected tests.
+        for literal in TOKENS.keys():
+            if text.endswith(literal):
+                yield Token("TEXT", pos, pos)
+                break
+
+
+class CanonicalStrategy:
+    """Parses the canonical Harmony format with channel markers."""
+
+    def __init__(self):
+        self.guard_tokens = [
+            "<|start|>",
+            "<|channel|>",
+            "<|message|>",
+            "<|constrain|>",
+            "<|end|>",
+            "<|call|>",
+            "<|return|>",
+        ]
+
+    def parse(self, text: str) -> Tuple[List[Event], str]:
+        events = []
+        tokens = list(iter_tokens(text))
+
+        if not tokens:
+            return events, ""
+
+        pos = 0
+        while pos < len(tokens):
+            token = tokens[pos]
+
+            if token.type == "TEXT":
+                # Check if this might be incomplete
+                if pos == len(tokens) - 1:  # Last token
+                    emit, hold = prefix_hold(
+                        text[token.start : token.end], self.guard_tokens
+                    )
+                    if emit:
+                        events.append(Event("normal", emit))
+                    return events, hold
+                else:
+                    # Check if this might be commentary filler between blocks
+                    if self._is_commentary_filler_between_blocks(text, tokens, pos):
+                        # Skip this filler text - don't emit as normal content
+                        pos += 1
+                    else:
+                        content = text[token.start : token.end]
+                        # Skip standalone structural tokens that shouldn't be emitted as normal text
+                        if not self._is_standalone_structural_token(content):
+                            events.append(Event("normal", content))
+                        pos += 1
+
+            elif token.type in ("START", "CHANNEL"):
+                # Parse a channel block starting here
+                block_result = self._parse_block(text, tokens, pos)
+                if block_result is None:
+                    # Incomplete block - check if we can emit partial reasoning content
+                    partial_result = self._parse_partial_analysis(text, tokens, pos)
+                    if partial_result:
+                        event, remaining_text = partial_result
+                        events.append(event)
+                        return events, remaining_text
+                    # No partial content, hold entire remaining text
+                    remaining_start = tokens[pos].start
+                    return events, text[remaining_start:]
+                event, new_pos = block_result
+                if event:
+                    events.append(event)
+                pos = new_pos
+
+            else:
+                # Check if this might be commentary filler between blocks
+                if self._is_commentary_filler_between_blocks(text, tokens, pos):
+                    # Skip this filler text - don't emit as normal content
+                    pos += 1
+                else:
+                    # Unexpected token - only emit as text if it's not a standalone structural token
+                    content = text[token.start : token.end]
+                    if not self._is_standalone_structural_token(content):
+                        events.append(Event("normal", content))
+                    pos += 1
+
+        return events, ""
+
+    def _parse_partial_analysis(
+        self, text: str, tokens: List[Token], start_pos: int
+    ) -> Optional[Tuple[Event, str]]:
+        """Try to parse partial analysis content for incremental streaming."""
+        pos = start_pos
+
+        # Skip <|start|> if present
+        if pos < len(tokens) and tokens[pos].type == "START":
+            pos += 1
+
+        # Look for <|channel|> followed by analysis
+        channel_pos = None
+        message_pos = None
+
+        for i in range(pos, len(tokens)):
+            if tokens[i].type == "CHANNEL" and channel_pos is None:
+                channel_pos = i
+            elif tokens[i].type == "MESSAGE":
+                message_pos = i
+                break
+
+        if channel_pos is None or message_pos is None:
+            return None
+
+        # Extract channel type
+        channel_start = (
+            tokens[channel_pos + 1].start
+            if channel_pos + 1 < len(tokens)
+            else tokens[channel_pos].end
+        )
+        channel_end = tokens[message_pos].start
+        channel_header = text[channel_start:channel_end]
+
+        channel_type = self._extract_channel_type(channel_header)
+        if channel_type != "analysis":
+            return None  # Only stream analysis content - tool calls wait for completion
+
+        # Extract partial content after <|message|>
+        content_start = tokens[message_pos].end
+        content = text[content_start:]
+
+        # Return partial reasoning content and preserve the channel structure for next parse
+        remaining_text = text[tokens[start_pos].start : content_start]
+        return Event("reasoning", content), remaining_text
+
+    def _extract_channel_type(self, header_text: str) -> Optional[str]:
+        """Extract channel type from header, ignoring other attributes like to=... or <|constrain|>..."""
+        # Look for channel type at the start of the header (case insensitive)
+        header_clean = header_text.strip()
+
+        if header_clean.lower().startswith("analysis"):
+            return "analysis"
+        elif header_clean.lower().startswith("commentary"):
+            return "commentary"
+        elif header_clean.lower().startswith("final"):
+            return "final"
+        else:
+            return None  # Unknown channel type
+
+    def _parse_block(
+        self, text: str, tokens: List[Token], start_pos: int
+    ) -> Optional[Tuple[Optional[Event], int]]:
+        """Parse a channel block. Returns (event, next_pos) or None if incomplete."""
+        pos = start_pos
+
+        # Skip <|start|> if present
+        if pos < len(tokens) and tokens[pos].type == "START":
+            pos += 1
+
+        # Look for <|channel|> or <|message|> (tool responses go direct to message)
+        channel_pos = None
+        message_pos = None
+
+        for i in range(pos, len(tokens)):
+            if tokens[i].type == "CHANNEL" and channel_pos is None:
+                channel_pos = i
+            elif tokens[i].type == "MESSAGE":
+                message_pos = i
+                break
+
+        if message_pos is None:
+            return None  # No message token found
+
+        # If no channel found, this is a tool response - treat as normal text
+        if channel_pos is None:
+            content_start = tokens[message_pos].end
+            # Find end token after message
+            end_token_pos = None
+            for i in range(message_pos + 1, len(tokens)):
+                if tokens[i].type in ("END", "CALL", "RETURN"):
+                    end_token_pos = i
+                    break
+            if end_token_pos is None:
+                return None  # Incomplete
+            content = text[content_start : tokens[end_token_pos].start]
+            return Event("normal", content), end_token_pos + 1
+
+        # Standard channel block processing - message_pos is already found above
+        pos = channel_pos + 1  # Skip CHANNEL token
+
+        # Extract channel type from header (ignoring other attributes like to=... or <|constrain|>...)
+        channel_start = tokens[pos].start if pos < len(tokens) else tokens[pos - 1].end
+        channel_end = tokens[message_pos].start
+        channel_header = text[channel_start:channel_end]
+
+        channel_type = self._extract_channel_type(channel_header)
+        if not channel_type:
+            return None  # Unknown or malformed channel
+
+        pos = message_pos + 1  # Skip MESSAGE token
+
+        # Find content and end token
+        content_start = tokens[message_pos].end
+        end_pos = pos
+
+        # Each channel type has specific valid end tokens
+        if channel_type == "final":
+            while end_pos < len(tokens) and tokens[end_pos].type != "RETURN":
+                end_pos += 1
+        elif channel_type == "analysis":
+            while end_pos < len(tokens) and tokens[end_pos].type not in ("END", "CALL"):
+                end_pos += 1
+        else:  # commentary
+            while end_pos < len(tokens) and tokens[end_pos].type not in ("END", "CALL"):
+                end_pos += 1
+
+        if end_pos >= len(tokens):
+            # No end token found
+            if channel_type == "final":
+                # Final blocks can end at end of input without requiring <|return|>
+                content = text[content_start:]
+                return Event("normal", content), end_pos
+            return None  # Analysis and commentary need proper end tokens
+
+        end_token = tokens[end_pos]
+        content = text[content_start : end_token.start]
+
+        # Create event based on channel and end token
+        if channel_type == "analysis":
+            if end_token.type == "CALL":
+                # Built-in tools (browser, python) use analysis channel with <|call|>
+                raw_text = text[tokens[start_pos].start : end_token.end]
+                return Event("tool_call", content.strip(), raw_text), end_pos + 1
+            else:
+                return Event("reasoning", content), end_pos + 1
+        elif channel_type == "commentary":
+            if end_token.type == "CALL":
+                raw_text = text[tokens[start_pos].start : end_token.end]
+                return Event("tool_call", content.strip(), raw_text), end_pos + 1
+            else:
+                return Event("normal", content), end_pos + 1
+        elif channel_type == "final":
+            # For final blocks, include any trailing TEXT immediately after <|return|>
+            final_content = content
+            if end_token.type == "RETURN" and end_pos + 1 < len(tokens):
+                next_token = tokens[end_pos + 1]
+                if next_token.type == "TEXT":
+                    final_content += text[next_token.start : next_token.end]
+                    return Event("normal", final_content), end_pos + 2
+            return Event("normal", final_content), end_pos + 1
+
+        return None, end_pos + 1
+
+    def _is_commentary_filler_between_blocks(
+        self, text: str, tokens: List[Token], pos: int
+    ) -> bool:
+        """Check if this is commentary filler text or problematic structural tokens in malformed sequences."""
+        current_token = tokens[pos]
+        current_text = text[current_token.start : current_token.end].strip()
+
+        # Check for commentary filler between CALL and CHANNEL
+        if pos > 0 and pos + 1 < len(tokens):
+            prev_token = tokens[pos - 1]
+            next_token = tokens[pos + 1]
+
+            # Check if we have CALL -> TEXT("commentary") -> CHANNEL pattern
+            if (
+                prev_token.type == "CALL"
+                and next_token.type == "CHANNEL"
+                and current_text.lower() == "commentary"
+            ):
+                return True
+
+        # Check for problematic patterns after CALL tokens (malformed sequences)
+        if pos > 0:
+            prev_token = tokens[pos - 1]
+
+            # Only filter structural tokens that appear immediately after CALL in malformed sequences
+            # These patterns indicate the content is malformed and the structural tokens are noise
+            if prev_token.type == "CALL":
+                # Filter MESSAGE tokens after CALL (should not happen in well-formed content)
+                if current_token.type == "MESSAGE":
+                    return True
+
+                # Filter standalone "commentary" text after CALL
+                if (
+                    current_token.type == "TEXT"
+                    and current_text.lower() == "commentary"
+                ):
+                    return True
+
+        return False
+
+    def _is_standalone_structural_token(self, content: str) -> bool:
+        """Check if content is just a standalone structural token that should be filtered."""
+        content_stripped = content.strip()
+        structural_tokens = [
+            "<|start|>",
+            "<|channel|>",
+            "<|message|>",
+            "<|constrain|>",
+            "<|end|>",
+            "<|call|>",
+            "<|return|>",
+        ]
+        return content_stripped in structural_tokens
+
+
+class TextStrategy:
+    """Parses the text-based Harmony fallback format."""
+
+    def __init__(self):
+        self.buffer_context = ""
+        self.patterns = {
+            "analysis_then_final": re.compile(
+                r"^\s*(?:assistant)?\s*(analysis|commentary)(.*?)\s*assistantfinal\s*(.*)\s*$",
+                re.IGNORECASE | re.DOTALL,
+            ),
+            "final_only": re.compile(
+                r"^\s*assistantfinal\s*(.*)\s*$", re.IGNORECASE | re.DOTALL
+            ),
+            "analysis_only": re.compile(
+                r"^\s*(?:assistant)?\s*(analysis|commentary)(.*)\s*$",
+                re.IGNORECASE | re.DOTALL,
+            ),
+        }
+
+    def set_buffer_context(self, buffer: str):
+        self.buffer_context = buffer
+
+    def parse(self, text: str) -> Tuple[List[Event], str]:
+        events = []
+
+        m = self.patterns["analysis_then_final"].match(text)
+        if m:
+            channel, reasoning, final = m.groups()
+            if channel.lower() == "analysis" and reasoning.strip():
+                events.append(Event("reasoning", reasoning.strip()))
+            elif channel.lower() == "commentary" and reasoning.strip():
+                events.append(Event("normal", reasoning.strip()))
+            if final.strip():
+                events.append(Event("normal", final.strip()))
+            return events, ""
+
+        # If assistantfinal appears to be incomplete (e.g., 'assistantfin'), hold entire buffer
+        if re.search(
+            r"(?:^|\s)(?:assistant)?\s*(analysis|commentary)", text, re.IGNORECASE
+        ):
+            low = text.lower()
+            if "assistantfin" in low and "assistantfinal" not in low:
+                return events, text
+
+        m = self.patterns["final_only"].match(text)
+        if m:
+            final = m.group(1)
+            if final.strip():
+                events.append(Event("normal", final.strip()))
+            return events, ""
+
+        m = self.patterns["analysis_only"].match(text)
+        if m:
+            channel, content = m.groups()
+            emit, hold = prefix_hold(content, ["assistantfinal"])
+            if channel.lower() == "analysis" and emit:
+                # Stream reasoning content as-is based on structural markers only.
+                events.append(Event("reasoning", emit))
+                # Keep the channel header in the remaining buffer to continue parsing
+                # subsequent chunks in the text fallback format. Preserve any held
+                # prefix that may complete into "assistantfinal".
+                if hold:
+                    return events, text[: m.start(2)] + hold
+                else:
+                    return events, channel
+            elif channel.lower() == "commentary" and emit:
+                # For commentary, stream as normal text. Preserve spaces unless holding.
+                content_out = emit if hold else emit.strip()
+                events.append(Event("normal", content_out))
+                if hold:
+                    return events, text[: m.start(2)] + hold
+                else:
+                    return events, ""
+            # If no emit, just return the held content
+            return events, text[: m.start(2)] + hold
+
+        emit, hold = prefix_hold(text, ["analysis", "commentary", "assistantfinal"])
+        if emit:
+            events.append(Event("normal", emit))
+        return events, hold
+
+
+class HarmonyParser:
+    """Facade for parsing Harmony format, switching between strategies."""
+
+    def __init__(self):
+        self.strategy = None
+        self._buffer = ""
+        self._should_filter_commentary = (
+            False  # Track if we should filter commentary in next chunks
+        )
+        self._partial_commentary = (
+            ""  # Track partial commentary being built across chunks
+        )
+
+    def parse(self, chunk: str) -> List[Event]:
+        self._buffer += chunk
+
+        if self.strategy is None:
+            if "<|channel|>" in self._buffer or "<|start|>" in self._buffer:
+                self.strategy = CanonicalStrategy()
+            elif re.search(
+                r"(?:^|\s)(?:assistant)?\s*(analysis|commentary|assistantfinal)",
+                self._buffer,
+                re.IGNORECASE,
+            ):
+                self.strategy = TextStrategy()
+            else:
+                # Not yet determined, hold
+                return []
+
+        if hasattr(self.strategy, "set_buffer_context"):
+            # Provide full buffer context to strategy for smarter whitespace handling
+            self.strategy.set_buffer_context(self._buffer)
+
+        events, remaining = self.strategy.parse(self._buffer)
+
+        # Check if we should start filtering commentary (after <|call|> token or tool_call event)
+        buffer_has_call_token = self._buffer.rstrip().endswith("<|call|>")
+
+        self._buffer = remaining
+
+        # Filter events for streaming case
+        filtered_events = []
+        for event in events:
+            should_filter = False
+
+            if event.event_type == "normal":
+                # Check if we're in a commentary filtering state
+                if self._should_filter_commentary or self._partial_commentary:
+                    # Try to build partial commentary
+                    potential_commentary = (
+                        self._partial_commentary + event.content.strip().lower()
+                    )
+
+                    if potential_commentary == "commentary":
+                        # Complete commentary found - filter it
+                        should_filter = True
+                        self._partial_commentary = ""  # Reset
+                        self._should_filter_commentary = False  # Done filtering
+                    elif "commentary".startswith(potential_commentary):
+                        # Partial match - accumulate and filter this chunk
+                        should_filter = True
+                        self._partial_commentary = potential_commentary
+                    else:
+                        # Not commentary - reset and keep the event
+                        self._partial_commentary = ""
+                        self._should_filter_commentary = False
+                else:
+                    # Not in commentary filtering state - reset partial state
+                    self._partial_commentary = ""
+
+            if should_filter:
+                # Skip this commentary filler
+                continue
+
+            # Update filtering state based on events and buffer state
+            if event.event_type == "tool_call":
+                self._should_filter_commentary = (
+                    True  # Filter commentary after tool calls
+                )
+                self._partial_commentary = ""  # Reset on tool call
+            elif buffer_has_call_token:
+                self._should_filter_commentary = (
+                    True  # Filter commentary after <|call|> token
+                )
+
+            filtered_events.append(event)
+
+        return filtered_events
diff --git a/python/sglang/srt/parser/jinja_template_utils.py b/python/sglang/srt/parser/jinja_template_utils.py
new file mode 100644
index 000000000..be7d44097
--- /dev/null
+++ b/python/sglang/srt/parser/jinja_template_utils.py
@@ -0,0 +1,197 @@
+"""Template utilities for Jinja template processing.
+
+This module provides utilities for analyzing and processing Jinja chat templates,
+including content format detection and message processing.
+"""
+
+import logging
+
+import jinja2
+import transformers.utils.chat_template_utils as hf_chat_utils
+
+from sglang.srt.utils import ImageData
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# JINJA TEMPLATE CONTENT FORMAT DETECTION
+# ============================================================================
+#
+# This adapts vLLM's approach for detecting chat template content format:
+# https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
+# - Analyzes Jinja template AST to detect content iteration patterns
+# - 'openai' format: templates with {%- for content in message['content'] -%} loops
+# - 'string' format: templates that expect simple string content
+# - Processes content accordingly to match template expectations
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Check if node is a variable access like {{ varname }}"""
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}"""
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str = None,
+) -> bool:
+    """Check if node accesses varname or varname[key] with filters/tests"""
+    if isinstance(node, jinja2.nodes.Filter):
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key
+        )
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+        node.arg, jinja2.nodes.Slice
+    ):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
+
+
+def _try_extract_ast(chat_template: str):
+    """Try to parse the Jinja template into an AST"""
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception as e:
+        logger.debug(f"Error when compiling Jinja template: {e}")
+        return None
+
+
+def detect_jinja_template_content_format(chat_template: str) -> str:
+    """
+    Detect whether a chat template expects 'string' or 'openai' content format.
+
+    - 'string': content is a simple string (like DeepSeek templates)
+    - 'openai': content is a list of structured dicts (like Llama4 templates)
+
+    Detection logic:
+    - If template has loops like {%- for content in message['content'] -%} → 'openai'
+    - Otherwise → 'string'
+    """
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return "string"
+
+    try:
+        # Look for patterns like: {%- for content in message['content'] -%}
+        for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
+            loop_iter = loop_ast.iter
+
+            # Check if iterating over message['content'] or similar
+            if _is_var_or_elems_access(loop_iter, "message", "content"):
+                return "openai"  # Found content iteration → openai format
+
+            # Also check for patterns like: {%- for item in msg.content -%} or {%- for item in m.content -%}
+            if _is_var_or_elems_access(
+                loop_iter, "msg", "content"
+            ) or _is_var_or_elems_access(loop_iter, "m", "content"):
+                return "openai"  # Found content iteration → openai format (glm4v)
+
+        return "string"  # No content loops found → string format
+    except Exception as e:
+        logger.debug(f"Error when parsing AST of Jinja template: {e}")
+        return "string"
+
+
+def process_content_for_template_format(
+    msg_dict: dict,
+    content_format: str,
+    image_data: list,
+    video_data: list,
+    audio_data: list,
+    modalities: list,
+) -> dict:
+    """
+    Process message content based on detected template format.
+
+    Args:
+        msg_dict: Message dictionary with content
+        content_format: 'string' or 'openai' (detected via AST analysis)
+        image_data: List to append extracted image URLs
+        video_data: List to append extracted video URLs
+        audio_data: List to append extracted audio URLs
+        modalities: List to append modalities
+
+    Returns:
+        Processed message dictionary
+    """
+    if not isinstance(msg_dict.get("content"), list):
+        # Already a string or None, no processing needed
+        return {k: v for k, v in msg_dict.items() if v is not None}
+
+    if content_format == "openai":
+        # OpenAI format: preserve structured content list, normalize types
+        processed_content_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict):
+                chunk_type = chunk.get("type")
+
+                if chunk_type == "image_url":
+                    image_data.append(
+                        ImageData(
+                            url=chunk["image_url"]["url"],
+                            detail=chunk["image_url"].get("detail", "auto"),
+                        )
+                    )
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'image' type for template compatibility
+                    processed_content_parts.append({"type": "image"})
+                elif chunk_type == "video_url":
+                    video_data.append(chunk["video_url"]["url"])
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'video' type for template compatibility
+                    processed_content_parts.append({"type": "video"})
+                elif chunk_type == "audio_url":
+                    audio_data.append(chunk["audio_url"]["url"])
+                    # Normalize to simple 'audio' type
+                    processed_content_parts.append({"type": "audio"})
+                else:
+                    # Keep other content as-is (text, etc.)
+                    processed_content_parts.append(chunk)
+
+        new_msg = {
+            k: v for k, v in msg_dict.items() if v is not None and k != "content"
+        }
+        new_msg["content"] = processed_content_parts
+        return new_msg
+
+    elif content_format == "string":
+        # String format: flatten to text only (for templates like DeepSeek)
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk["text"])
+            # Note: For string format, we ignore images/audio since the template
+            # doesn't expect structured content - multimodal placeholders would
+            # need to be inserted differently
+
+        new_msg = msg_dict.copy()
+        new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        new_msg = {k: v for k, v in new_msg.items() if v is not None}
+        return new_msg
+
+    else:
+        raise ValueError(f"Invalid content format: {content_format}")
diff --git a/python/sglang/srt/parser/reasoning_parser.py b/python/sglang/srt/parser/reasoning_parser.py
new file mode 100644
index 000000000..f50368aed
--- /dev/null
+++ b/python/sglang/srt/parser/reasoning_parser.py
@@ -0,0 +1,309 @@
+import re
+from typing import Dict, Optional, Tuple, Type
+
+from sglang.srt.parser.harmony_parser import HarmonyParser
+
+
+class StreamingParseResult:
+    """Result of streaming incremental parsing."""
+
+    def __init__(
+        self,
+        normal_text: Optional[str] = None,
+        reasoning_text: Optional[str] = None,
+    ):
+        self.normal_text = normal_text or ""
+        self.reasoning_text = reasoning_text or ""
+
+
+class BaseReasoningFormatDetector:
+    """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+    def __init__(
+        self,
+        think_start_token: str,
+        think_end_token: str,
+        force_reasoning: bool = False,
+        stream_reasoning: bool = True,
+    ):
+        self.think_start_token = think_start_token
+        self.think_end_token = think_end_token
+        self._in_reasoning = force_reasoning
+        self.stream_reasoning = stream_reasoning
+
+        self._buffer = ""
+        self.stripped_think_start = False
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses reasoning sections in the provided text.
+        Returns both reasoning content and normal text separately.
+        """
+        in_reasoning = self._in_reasoning or self.think_start_token in text
+
+        if not in_reasoning:
+            return StreamingParseResult(normal_text=text)
+
+        # The text is considered to be in a reasoning block.
+        processed_text = text.replace(self.think_start_token, "").strip()
+
+        if self.think_end_token not in processed_text:
+            # Assume reasoning was truncated before `</think>` token
+            return StreamingParseResult(reasoning_text=processed_text)
+
+        # Extract reasoning content
+        splits = processed_text.split(self.think_end_token, maxsplit=1)
+        reasoning_text = splits[0]
+        normal_text = splits[1].strip()
+
+        return StreamingParseResult(
+            normal_text=normal_text, reasoning_text=reasoning_text
+        )
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for reasoning content.
+        Handles partial reasoning tags and content.
+
+        If stream_reasoning is False:
+            Accumulates reasoning content until the end tag is found
+        If stream_reasoning is True:
+            Streams reasoning content as it arrives
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # If the current text is a prefix of the think token, keep buffering
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in [self.think_start_token, self.think_end_token]
+        ):
+            return StreamingParseResult()
+
+        # Strip `<think>` token if present
+        if not self.stripped_think_start and self.think_start_token in current_text:
+            current_text = current_text.replace(self.think_start_token, "")
+            self.stripped_think_start = True
+            self._in_reasoning = True
+
+        # Handle end of reasoning block
+        if self._in_reasoning and self.think_end_token in current_text:
+            end_idx = current_text.find(self.think_end_token)
+
+            reasoning_text = current_text[:end_idx]
+
+            self._buffer = ""
+            self._in_reasoning = False
+            normal_text = current_text[end_idx + len(self.think_end_token) :]
+
+            return StreamingParseResult(
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+            )
+
+        # Continue with reasoning content
+        if self._in_reasoning:
+            if self.stream_reasoning:
+                # Stream the content immediately
+                self._buffer = ""
+                return StreamingParseResult(reasoning_text=current_text)
+            else:
+                return StreamingParseResult()
+
+        # If we're not in a reasoning block return as normal text
+        if not self._in_reasoning:
+            self._buffer = ""
+            return StreamingParseResult(normal_text=current_text)
+
+        return StreamingParseResult()
+
+
+class DeepSeekR1Detector(BaseReasoningFormatDetector):
+    """
+    Detector for DeepSeek-R1 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the </think> tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+
+    Supported models:
+      - DeepSeek-R1: Always generates thinking content without <think> start tag
+      - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+      - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+      - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+        # DeepSeek-R1 is assumed to be reasoning until `</think>` token
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+        # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
+
+
+class Qwen3Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+      - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+      - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+
+
+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+        )
+
+
+class GptOssDetector(BaseReasoningFormatDetector):
+    """
+    Detector for T4-style reasoning format (GPT-OSS), using the HarmonyParser.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
+        super().__init__(
+            "<|channel|>analysis<|message|>",
+            "<|end|>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+        self.parser = HarmonyParser()
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        events = self.parser.parse(text)
+        # Flush the buffer for one-shot parsing
+        events += self.parser.parse("")
+
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
+        )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+        # Tool call events preserve raw text with structural markers
+
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        events = self.parser.parse(new_text)
+
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
+        )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )
+
+
+class ReasoningParser:
+    """
+    Parser that handles both streaming and non-streaming scenarios for extracting
+    reasoning content from model outputs.
+
+    Args:
+        model_type (str): Type of model to parse reasoning from
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
+            If True, streams reasoning content as it arrives.
+    """
+
+    DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
+        "deepseek-r1": DeepSeekR1Detector,
+        "deepseek-v3": Qwen3Detector,
+        "glm45": Qwen3Detector,
+        "gpt-oss": GptOssDetector,
+        "kimi": KimiDetector,
+        "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3Detector,
+        "step3": DeepSeekR1Detector,
+    }
+
+    def __init__(
+        self,
+        model_type: Optional[str] = None,
+        stream_reasoning: bool = True,
+        force_reasoning: Optional[bool] = None,
+    ):
+        if not model_type:
+            raise ValueError("Model type must be specified")
+
+        detector_class = self.DetectorMap.get(model_type.lower())
+        if not detector_class:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+        # Special cases where we override force_reasoning
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
+            force_reasoning = True
+
+        # Only pass force_reasoning if explicitly set, let detectors use their defaults
+        kwargs = {"stream_reasoning": stream_reasoning}
+        if force_reasoning is not None:
+            kwargs["force_reasoning"] = force_reasoning
+
+        self.detector = detector_class(**kwargs)
+
+    def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
+        """Non-streaming call: one-time parsing"""
+        ret = self.detector.detect_and_parse(full_text)
+        return ret.reasoning_text, ret.normal_text
+
+    def parse_stream_chunk(
+        self, chunk_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Streaming call: incremental parsing"""
+        ret = self.detector.parse_streaming_increment(chunk_text)
+        return ret.reasoning_text, ret.normal_text
diff --git a/python/sglang/srt/patch_torch.py b/python/sglang/srt/patch_torch.py
new file mode 100644
index 000000000..8d90ce4c0
--- /dev/null
+++ b/python/sglang/srt/patch_torch.py
@@ -0,0 +1,82 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from typing import Callable, Union
+
+import torch
+from packaging import version
+from torch.multiprocessing import reductions
+
+
+def monkey_patch_torch_reductions():
+    """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
+
+    if hasattr(reductions, "_reduce_tensor_original"):
+        return
+
+    reductions._reduce_tensor_original = reductions.reduce_tensor
+    reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor
+
+    reductions.reduce_tensor = _reduce_tensor_modified
+    reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified
+
+    reductions.init_reductions()
+
+
+# The signature has not been changed for years, and we will not need this when the next version is released,
+# so it looks safe to use a constant.
+_REDUCE_TENSOR_ARG_DEVICE_INDEX = 6
+
+
+def _reduce_tensor_modified(*args, **kwargs):
+    output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs)
+    output_args = _modify_tuple(
+        output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid
+    )
+    return output_fn, output_args
+
+
+def _rebuild_cuda_tensor_modified(*args):
+    args = _modify_tuple(args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_from_maybe_uuid)
+    return reductions._rebuild_cuda_tensor_original(*args)
+
+
+def _device_to_uuid(device: int) -> str:
+    return str(torch.cuda.get_device_properties(device).uuid)
+
+
+def _device_from_maybe_uuid(device_maybe_uuid: Union[int, str]) -> int:
+    if isinstance(device_maybe_uuid, int):
+        return device_maybe_uuid
+
+    if isinstance(device_maybe_uuid, str):
+        for device in range(torch.cuda.device_count()):
+            if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid:
+                return device
+        raise Exception("Invalid device_uuid=" + device_maybe_uuid)
+
+    raise Exception(f"Unknown type: {device_maybe_uuid=}")
+
+
+def _modify_tuple(t, index: int, modifier: Callable):
+    return *t[:index], modifier(t[index]), *t[index + 1 :]
+
+
+def monkey_patch_torch_compile():
+    if version.parse(torch.__version__) < version.parse("2.8.0"):
+        # These things are cacheable by torch.compile. torch.compile just doesn't know it.
+        # This was fixed in PyTorch 2.8, but until then, we monkey patch.
+        import torch._higher_order_ops.auto_functionalize as af
+
+        af.auto_functionalized_v2._cacheable = True
+        af.auto_functionalized._cacheable = True
diff --git a/python/sglang/srt/poll_based_barrier.py b/python/sglang/srt/poll_based_barrier.py
new file mode 100644
index 000000000..db1d22763
--- /dev/null
+++ b/python/sglang/srt/poll_based_barrier.py
@@ -0,0 +1,31 @@
+import torch
+
+from sglang.srt.distributed import get_world_group
+
+
+class PollBasedBarrier:
+    def __init__(self, noop: bool = False):
+        self._noop = noop
+        self._local_arrived = False
+
+    def local_arrive(self):
+        assert not self._local_arrived
+        self._local_arrived = True
+
+    def poll_global_arrived(self) -> bool:
+        global_arrived = self._compute_global_arrived()
+        output = self._local_arrived and global_arrived
+        if output:
+            self._local_arrived = False
+        return output
+
+    def _compute_global_arrived(self) -> bool:
+        local_arrived = self._noop or self._local_arrived
+        global_arrived = torch.tensor(local_arrived)
+        # Can optimize if bottleneck
+        torch.distributed.all_reduce(
+            global_arrived,
+            torch.distributed.ReduceOp.MIN,
+            group=get_world_group().cpu_group,
+        )
+        return global_arrived.item()
diff --git a/python/sglang/srt/sampling/custom_logit_processor.py b/python/sglang/srt/sampling/custom_logit_processor.py
new file mode 100644
index 000000000..67514819c
--- /dev/null
+++ b/python/sglang/srt/sampling/custom_logit_processor.py
@@ -0,0 +1,53 @@
+import json
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import Any, Dict, List, Optional
+
+import dill
+import torch
+
+
+@lru_cache(maxsize=None)
+def _cache_from_str(json_str: str):
+    """Deserialize a json string to a Callable object.
+    This function is cached to avoid redundant deserialization.
+    """
+    data = json.loads(json_str)
+    return dill.loads(bytes.fromhex(data["callable"]))
+
+
+class CustomLogitProcessor(ABC):
+    """Abstract base class for callable functions."""
+
+    @abstractmethod
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        """Define the callable behavior."""
+        raise NotImplementedError
+
+    @classmethod
+    def to_str(cls) -> str:
+        """Serialize the callable function to a JSON-compatible string."""
+        return json.dumps({"callable": dill.dumps(cls).hex()})
+
+    @classmethod
+    def from_str(cls, json_str: str):
+        """Deserialize a callable function from a JSON string."""
+        return _cache_from_str(json_str)()
+
+
+class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        disallowed_token_ids = custom_param_list[0]["token_ids"]
+        assert all(
+            disallowed_token_ids == c["token_ids"] for c in custom_param_list
+        ), f"{custom_param_list=}"
+        logits[..., disallowed_token_ids] = -float("inf")
+        return logits
diff --git a/python/sglang/srt/sampling/penaltylib/__init__.py b/python/sglang/srt/sampling/penaltylib/__init__.py
new file mode 100644
index 000000000..26a780517
--- /dev/null
+++ b/python/sglang/srt/sampling/penaltylib/__init__.py
@@ -0,0 +1,11 @@
+from sglang.srt.sampling.penaltylib.frequency_penalty import BatchedFrequencyPenalizer
+from sglang.srt.sampling.penaltylib.min_new_tokens import BatchedMinNewTokensPenalizer
+from sglang.srt.sampling.penaltylib.orchestrator import BatchedPenalizerOrchestrator
+from sglang.srt.sampling.penaltylib.presence_penalty import BatchedPresencePenalizer
+
+__all__ = [
+    "BatchedFrequencyPenalizer",
+    "BatchedMinNewTokensPenalizer",
+    "BatchedPresencePenalizer",
+    "BatchedPenalizerOrchestrator",
+]
diff --git a/python/sglang/srt/sampling/penaltylib/frequency_penalty.py b/python/sglang/srt/sampling/penaltylib/frequency_penalty.py
new file mode 100644
index 000000000..893a1c377
--- /dev/null
+++ b/python/sglang/srt/sampling/penaltylib/frequency_penalty.py
@@ -0,0 +1,65 @@
+import torch
+
+from sglang.srt.sampling.penaltylib.orchestrator import (
+    BatchedPenalizerOrchestrator,
+    _BatchedPenalizer,
+)
+
+
+class BatchedFrequencyPenalizer(_BatchedPenalizer):
+    """
+    Frequency penalizer penalizes tokens based on their frequency in the output.
+    """
+
+    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
+        self.orchestrator = orchestrator
+        self._is_prepared = False
+
+    def _is_required(self) -> bool:
+        return any(
+            req.sampling_params.frequency_penalty != 0.0
+            for req in self.orchestrator.reqs()
+        )
+
+    def _prepare(self):
+        self.cumulated_frequency_penalties = torch.zeros(
+            (len(self.orchestrator.reqs()), self.orchestrator.vocab_size),
+            dtype=torch.float32,
+            device=self.orchestrator.device,
+        )
+
+        self.frequency_penalties = (
+            torch.tensor(
+                data=[
+                    req.sampling_params.frequency_penalty
+                    for req in self.orchestrator.reqs()
+                ],
+                dtype=torch.float32,
+                device=self.orchestrator.device,
+            )
+        ).unsqueeze_(1)
+
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        self.cumulated_frequency_penalties.scatter_add_(
+            dim=1,
+            index=output_ids.unsqueeze(1),
+            src=self.frequency_penalties,
+        )
+
+    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+        logits.sub_(self.cumulated_frequency_penalties)
+
+    def _filter(self, keep_indices: torch.Tensor):
+        self.frequency_penalties = self.frequency_penalties[keep_indices]
+        self.cumulated_frequency_penalties = self.cumulated_frequency_penalties[
+            keep_indices
+        ]
+
+    def _merge(self, their: "BatchedFrequencyPenalizer"):
+        self.frequency_penalties = torch.cat(
+            [self.frequency_penalties, their.frequency_penalties], dim=0
+        )
+        self.cumulated_frequency_penalties = torch.cat(
+            [self.cumulated_frequency_penalties, their.cumulated_frequency_penalties],
+            dim=0,
+        )
diff --git a/python/sglang/srt/sampling/penaltylib/min_new_tokens.py b/python/sglang/srt/sampling/penaltylib/min_new_tokens.py
new file mode 100644
index 000000000..da06265d9
--- /dev/null
+++ b/python/sglang/srt/sampling/penaltylib/min_new_tokens.py
@@ -0,0 +1,94 @@
+import torch
+
+from sglang.srt.sampling.penaltylib.orchestrator import (
+    BatchedPenalizerOrchestrator,
+    _BatchedPenalizer,
+)
+
+
+class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
+    """
+    Min new tokens penalizer penalizes tokens based on the length of the output.
+    """
+
+    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
+        self.orchestrator = orchestrator
+        self._is_prepared = False
+
+    def _is_required(self) -> bool:
+        return any(
+            req.sampling_params.min_new_tokens > 0 for req in self.orchestrator.reqs()
+        )
+
+    def _prepare(self):
+        self.min_new_tokens = torch.tensor(
+            data=[
+                req.sampling_params.min_new_tokens for req in self.orchestrator.reqs()
+            ],
+            dtype=torch.int32,
+            device=self.orchestrator.device,
+        ).unsqueeze_(1)
+
+        padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
+            sequences=[
+                torch.tensor(
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
+                    ),
+                    dtype=torch.int64,
+                    device=self.orchestrator.device,
+                )
+                for req in self.orchestrator.reqs()
+            ],
+            batch_first=True,
+            padding_value=self.orchestrator.vocab_size,
+        )
+        self.stop_token_penalties = torch.zeros(
+            size=(len(self.orchestrator.reqs()), self.orchestrator.vocab_size + 1),
+            dtype=torch.float32,
+            device=self.orchestrator.device,
+        ).scatter_add_(
+            dim=1,
+            index=padded_stop_token_ids,
+            src=torch.full_like(
+                input=padded_stop_token_ids,
+                dtype=torch.float32,
+                fill_value=float("-inf"),
+                device=self.orchestrator.device,
+            ),
+        )[
+            :, : self.orchestrator.vocab_size
+        ]
+
+        self.len_output_tokens = torch.zeros(
+            size=(len(self.orchestrator.reqs()), 1),
+            dtype=torch.int32,
+            device=self.orchestrator.device,
+        )
+
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        self.len_output_tokens += 1
+
+    def _apply(self, logits: torch.Tensor):
+        mask = (self.len_output_tokens < self.min_new_tokens).expand_as(logits)
+        logits[mask] += self.stop_token_penalties[mask]
+
+    def _filter(self, keep_indices: torch.Tensor):
+        self.min_new_tokens = self.min_new_tokens[keep_indices]
+        self.stop_token_penalties = self.stop_token_penalties[keep_indices]
+        self.len_output_tokens = self.len_output_tokens[keep_indices]
+
+    def _merge(self, their: "BatchedMinNewTokensPenalizer"):
+        self.min_new_tokens = torch.cat(
+            [self.min_new_tokens, their.min_new_tokens], dim=0
+        )
+        self.stop_token_penalties = torch.cat(
+            [self.stop_token_penalties, their.stop_token_penalties], dim=0
+        )
+        self.len_output_tokens = torch.cat(
+            [self.len_output_tokens, their.len_output_tokens], dim=0
+        )
diff --git a/python/sglang/srt/sampling/penaltylib/orchestrator.py b/python/sglang/srt/sampling/penaltylib/orchestrator.py
new file mode 100644
index 000000000..1abd255cb
--- /dev/null
+++ b/python/sglang/srt/sampling/penaltylib/orchestrator.py
@@ -0,0 +1,209 @@
+from __future__ import annotations
+
+import abc
+import weakref
+from typing import TYPE_CHECKING, Optional, Set, Type
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+
+
+class BatchedPenalizerOrchestrator:
+    def __init__(
+        self,
+        vocab_size: int,
+        batch: ScheduleBatch,
+        penalizers: Set[Type["_BatchedPenalizer"]],
+    ):
+        self.vocab_size = vocab_size
+        self._batch_ref = weakref.ref(batch)
+        self.device = batch.device
+        self.penalizers = {Penalizer: Penalizer(self) for Penalizer in penalizers}
+
+        is_required = False
+        for penalizer in self.penalizers.values():
+            pen_is_required = penalizer.prepare_if_required()
+            is_required |= pen_is_required
+        self.is_required = is_required
+
+    @property
+    def batch(self) -> ScheduleBatch | None:
+        return self._batch_ref()
+
+    @batch.setter
+    def batch(self, value: Optional[ScheduleBatch]):
+        if value is None:
+            self._batch_ref = lambda: None
+        else:
+            self._batch_ref = weakref.ref(value)
+
+    def reqs(self):
+        return self.batch.reqs
+
+    def cumulate_output_tokens(self, output_ids: torch.Tensor):
+        """
+        Feed the output tokens to the penalizers.
+
+        Args:
+            output_ids (torch.Tensor): The output tokens.
+        """
+        for penalizer in self.penalizers.values():
+            penalizer.cumulate_output_tokens(output_ids=output_ids)
+
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the penalizers to the logits.
+        Note that it may apply the penalizers in-place.
+
+        Args:
+            logits (torch.Tensor): The logits to apply the penalizers to.
+
+        Returns:
+            torch.Tensor: The logits after applying the penalizers.
+        """
+        for penalizer in self.penalizers.values():
+            penalizer.apply(logits)
+
+    def filter(self, keep_indices: torch.Tensor):
+        """
+        Filter the penalizers based on the indices to keep in the batch.
+
+        Args:
+            keep_indices (torch.Tensor): Tensor of indices to keep in the batch.
+        """
+        if not self.is_required:
+            return
+
+        if len(keep_indices) == 0:
+            self.is_required = False
+            for penalizer in self.penalizers.values():
+                penalizer.teardown()
+            return
+
+        is_required = False
+        for penalizer in self.penalizers.values():
+            tmp_is_required = penalizer.is_required()
+            is_required |= tmp_is_required
+            if tmp_is_required:
+                penalizer.filter(keep_indices=keep_indices)
+            else:
+                penalizer.teardown()
+        self.is_required = is_required
+
+    def merge(self, their: "BatchedPenalizerOrchestrator"):
+        """
+        Merge the penalizers of another orchestrator into this one.
+
+        Note that this function **must** be called _before_ self.batch.reqs is updated (filtered).
+        Each unprepared penalizers would have to be prepared (creating tensors, etc.) first before merging.
+        This step requires the original batch.reqs, before it gets merged with other batch.reqs.
+
+        Args:
+            their (BatchedPenalizerOrchestrator): The orchestrator to merge into this one.
+        """
+        if not self.is_required and not their.is_required:
+            return
+
+        self.is_required = True
+        for penalizer, their_penalizer in their.penalizers.items():
+            self.penalizers[penalizer].merge(their_penalizer)
+
+
+class _BatchedPenalizer(abc.ABC):
+    """
+    An abstract class for a batched penalizer.
+    """
+
+    def is_prepared(self) -> bool:
+        return self._is_prepared
+
+    def is_required(self) -> bool:
+        return self._is_required()
+
+    def prepare(self):
+        if not self._is_prepared:
+            self._prepare()
+            self._is_prepared = True
+
+    def prepare_if_required(self):
+        if self._is_required():
+            self.prepare()
+            return True
+        else:
+            return False
+
+    def teardown(self):
+        self._is_prepared = False
+
+    def cumulate_output_tokens(self, output_ids: torch.Tensor):
+        if not self._is_prepared:
+            return
+
+        self._cumulate_output_tokens(output_ids=output_ids)
+
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        if not self._is_prepared:
+            return
+
+        self._apply(logits=logits)
+
+    def filter(self, keep_indices: torch.Tensor):
+        if not self._is_prepared:
+            return
+
+        self._filter(keep_indices=keep_indices)
+
+    def merge(self, their: "_BatchedPenalizer"):
+        if not self._is_prepared and not their._is_prepared:
+            return
+
+        self.prepare()
+        their.prepare()
+        self._merge(their)
+
+    @abc.abstractmethod
+    def _is_required(self) -> bool:
+        """
+        Check if the penalizer is required to be prepared.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _prepare(self):
+        """
+        Prepare the penalizer.
+        Usually, this is where the penalizer initializes its tensors.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        """
+        Cumulate the output tokens.
+        Orchestrator will call this function to feed the output tokens to the penalizer.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the penalizer to the logits.
+        Penalizers can modify the logits in-place if needed.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _filter(self, keep_indices: torch.Tensor):
+        """
+        Filter the penalizer (tensors or underlying data) based on the indices to keep in the batch.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _merge(self, their: "_BatchedPenalizer"):
+        """
+        Merge the penalizer with another penalizer.
+        """
+        pass
diff --git a/python/sglang/srt/sampling/penaltylib/presence_penalty.py b/python/sglang/srt/sampling/penaltylib/presence_penalty.py
new file mode 100644
index 000000000..4f3a6ace3
--- /dev/null
+++ b/python/sglang/srt/sampling/penaltylib/presence_penalty.py
@@ -0,0 +1,65 @@
+import torch
+
+from sglang.srt.sampling.penaltylib.orchestrator import (
+    BatchedPenalizerOrchestrator,
+    _BatchedPenalizer,
+)
+
+
+class BatchedPresencePenalizer(_BatchedPenalizer):
+    """
+    Presence penalizer penalizes tokens based on their presence in the output.
+    """
+
+    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
+        self.orchestrator = orchestrator
+        self._is_prepared = False
+
+    def _is_required(self) -> bool:
+        return any(
+            req.sampling_params.presence_penalty != 0.0
+            for req in self.orchestrator.reqs()
+        )
+
+    def _prepare(self):
+        self.cumulated_presence_penalties = torch.zeros(
+            (len(self.orchestrator.reqs()), self.orchestrator.vocab_size),
+            dtype=torch.float32,
+            device=self.orchestrator.device,
+        )
+
+        self.presence_penalties = (
+            torch.tensor(
+                data=[
+                    req.sampling_params.presence_penalty
+                    for req in self.orchestrator.reqs()
+                ],
+                dtype=torch.float32,
+                device=self.orchestrator.device,
+            )
+        ).unsqueeze_(1)
+
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        self.cumulated_presence_penalties.scatter_(
+            dim=1,
+            index=output_ids.unsqueeze(1),
+            src=self.presence_penalties,
+        )
+
+    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+        logits.sub_(self.cumulated_presence_penalties)
+
+    def _filter(self, keep_indices: torch.Tensor):
+        self.presence_penalties = self.presence_penalties[keep_indices]
+        self.cumulated_presence_penalties = self.cumulated_presence_penalties[
+            keep_indices
+        ]
+
+    def _merge(self, their: "BatchedPresencePenalizer"):
+        self.presence_penalties = torch.cat(
+            [self.presence_penalties, their.presence_penalties], dim=0
+        )
+        self.cumulated_presence_penalties = torch.cat(
+            [self.cumulated_presence_penalties, their.cumulated_presence_penalties],
+            dim=0,
+        )
diff --git a/python/sglang/srt/sampling/sampling_batch_info.py b/python/sglang/srt/sampling/sampling_batch_info.py
new file mode 100644
index 000000000..6ba8a7777
--- /dev/null
+++ b/python/sglang/srt/sampling/sampling_batch_info.py
@@ -0,0 +1,392 @@
+from __future__ import annotations
+
+import dataclasses
+import logging
+import threading
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+
+import sglang.srt.sampling.penaltylib as penaltylib
+from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.sampling_params import TOP_K_ALL
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class SamplingBatchInfo:
+    # Basic batched sampling params
+    temperatures: torch.Tensor
+    top_ps: torch.Tensor
+    top_ks: torch.Tensor
+    min_ps: torch.Tensor
+
+    # Whether all requests use greedy sampling
+    is_all_greedy: bool
+
+    # Whether any requests use top_p sampling
+    need_top_p_sampling: bool
+
+    # Whether any requests use top_k sampling
+    need_top_k_sampling: bool
+
+    # Whether any request needs min_p sampling
+    need_min_p_sampling: bool
+
+    # Masking tensors for grammar-guided structured outputs
+    vocab_size: int
+    grammars: Optional[List] = None
+    vocab_mask: Optional[torch.Tensor] = None
+    apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
+
+    # An event used for overlap schedule
+    sampling_info_done: Optional[threading.Event] = None
+
+    # Penalizer
+    penalizer_orchestrator: Optional[penaltylib.BatchedPenalizerOrchestrator] = None
+    linear_penalty: torch.Tensor = None
+
+    # Whether any request has custom logit processor
+    has_custom_logit_processor: bool = False
+    # Custom parameters
+    custom_params: Optional[List[Optional[Dict[str, Any]]]] = None
+    # Custom logit processor
+    custom_logit_processor: Optional[
+        Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]
+    ] = None
+
+    # Device
+    device: str = "cuda"
+
+    # Handle logit bias
+    logit_bias: Optional[torch.Tensor] = None
+
+    @classmethod
+    def _get_global_server_args_dict(cls):
+        from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+        return global_server_args_dict
+
+    @classmethod
+    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+        global_server_args_dict = cls._get_global_server_args_dict()
+
+        reqs = batch.reqs
+        device = batch.device
+        temperatures = torch.tensor(
+            [r.sampling_params.temperature for r in reqs],
+            dtype=torch.float,
+            device=device,
+        ).view(-1, 1)
+        top_ps = torch.tensor(
+            [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
+        )
+        top_ks = torch.tensor(
+            [r.sampling_params.top_k for r in reqs], dtype=torch.int32, device=device
+        )
+        min_ps = torch.tensor(
+            [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
+        )
+
+        logit_bias = None
+        if any(r.sampling_params.logit_bias is not None for r in reqs):
+            logit_bias = torch.zeros(len(reqs), vocab_size, device=device)
+            for i, r in enumerate(reqs):
+                if r.sampling_params.logit_bias is not None:
+                    for key, value in r.sampling_params.logit_bias.items():
+                        logit_bias[i, int(key)] = value
+
+        # Check if any request has custom logit processor
+        has_custom_logit_processor = global_server_args_dict[
+            "enable_custom_logit_processor"
+        ] and any(  # check the flag first.
+            r.custom_logit_processor for r in reqs
+        )  # then check the requests.
+
+        if has_custom_logit_processor:
+            # Merge the same type of custom logit processors together
+            processor_dict = {}
+            for i, r in enumerate(reqs):
+                if r.custom_logit_processor is None:
+                    continue
+                processor_str = r.custom_logit_processor
+                if processor_str not in processor_dict:
+                    processor_dict[processor_str] = []
+                processor_dict[processor_str].append(i)
+
+            merged_custom_logit_processor = {
+                hash(processor_str): (
+                    # The deserialized custom logit processor object
+                    CustomLogitProcessor.from_str(processor_str),
+                    # The mask tensor for the requests that use this custom logit processor
+                    torch.zeros(len(reqs), dtype=torch.bool)
+                    .scatter_(0, torch.tensor(true_indices), True)
+                    .to(device, non_blocking=True),
+                )
+                for processor_str, true_indices in processor_dict.items()
+            }
+            custom_params = [r.sampling_params.custom_params for r in reqs]
+        else:
+            merged_custom_logit_processor = None
+            custom_params = None
+
+        # Each penalizers will do nothing if they evaluate themselves as not required by looking at
+        # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
+        # should not add hefty computation overhead other than simple checks.
+        #
+        # While we can choose not to even create the class instances if they are not required, this
+        # could add additional complexity to the {ScheduleBatch} class, especially we need to
+        # handle {filter_batch()} and {merge_batch()} cases as well.
+        penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
+            vocab_size=vocab_size,
+            batch=batch,
+            penalizers={
+                penaltylib.BatchedFrequencyPenalizer,
+                penaltylib.BatchedMinNewTokensPenalizer,
+                penaltylib.BatchedPresencePenalizer,
+            },
+        )
+
+        ret = cls(
+            temperatures=temperatures,
+            top_ps=top_ps,
+            top_ks=top_ks,
+            min_ps=min_ps,
+            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
+            need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
+            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
+            vocab_size=vocab_size,
+            penalizer_orchestrator=penalizer_orchestrator,
+            has_custom_logit_processor=has_custom_logit_processor,
+            custom_params=custom_params,
+            custom_logit_processor=merged_custom_logit_processor,
+            device=device,
+            logit_bias=logit_bias,
+        )
+        return ret
+
+    def __len__(self):
+        return len(self.temperatures)
+
+    def update_regex_vocab_mask(self):
+        if not self.grammars:
+            self.vocab_mask = None
+            self.apply_mask_func = None
+            return
+
+        # Find a grammar from the list
+        first_grammar = next(grammar for grammar in self.grammars if grammar)
+
+        # TODO(lianmin): Maybe we can reuse the existing mask?
+        self.vocab_mask = first_grammar.allocate_vocab_mask(
+            vocab_size=self.vocab_size,
+            batch_size=len(self.temperatures),
+            device=self.device,
+        )
+        self.apply_mask_func = (
+            first_grammar.apply_vocab_mask
+        )  # force to use static method
+
+        # Apply the mask
+        for i, grammar in enumerate(self.grammars):
+            if grammar and not grammar.finished and not grammar.is_terminated():
+                grammar.fill_vocab_mask(self.vocab_mask, i)
+
+        # Move the mask to the device if needed
+        self.vocab_mask = first_grammar.move_vocab_mask(self.vocab_mask, self.device)
+
+    def update_penalties(self):
+        if self.penalizer_orchestrator.is_required:
+            self.linear_penalty = torch.zeros(
+                (len(self.temperatures), self.vocab_size),
+                dtype=torch.float32,
+                device=self.temperatures.device,
+            )
+            self.penalizer_orchestrator.apply(self.linear_penalty)
+        else:
+            self.linear_penalty = None
+
+    def apply_logits_bias(self, logits: torch.Tensor):
+        if self.linear_penalty is not None:
+            # Used in the overlap mode
+            logits.add_(self.linear_penalty)
+
+        if self.penalizer_orchestrator and self.penalizer_orchestrator.is_required:
+            # Used in the non-overlap mode
+            self.penalizer_orchestrator.apply(logits)
+
+        if self.vocab_mask is not None:
+            self.apply_mask_func(logits=logits, vocab_mask=self.vocab_mask)
+
+        if self.logit_bias is not None:
+            logits.add_(self.logit_bias)
+
+    def filter_batch(self, keep_indices: List[int], keep_indices_device: torch.Tensor):
+        self.penalizer_orchestrator.filter(keep_indices_device)
+
+        if self.has_custom_logit_processor:
+            self._filter_batch_custom_logit_processor(keep_indices, keep_indices_device)
+
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+        ]:
+            value = getattr(self, item, None)
+            setattr(self, item, value[keep_indices_device])
+
+        if self.logit_bias is not None:
+            self.logit_bias = self.logit_bias[keep_indices_device]
+
+    def _filter_batch_custom_logit_processor(
+        self, keep_indices: List[int], keep_indices_device: torch.Tensor
+    ):
+        """Filter the custom logit processor and custom params"""
+        self.custom_logit_processor = {
+            k: (p, mask[keep_indices_device])
+            for k, (p, mask) in self.custom_logit_processor.items()
+            if torch.any(
+                mask[keep_indices_device]
+            )  # ignore the custom logit processor whose mask is all False
+        }
+        self.custom_params = [self.custom_params[i] for i in keep_indices]
+
+        # If the custom logit processor is an empty dict, set the flag to False,
+        # and set the custom logit processor and custom params to None.
+        if len(self.custom_logit_processor) == 0:
+            self.custom_logit_processor = None
+            self.custom_params = None
+            self.has_custom_logit_processor = False
+
+    @staticmethod
+    def merge_custom_logit_processor(
+        lhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+        rhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+        bs1: int,
+        bs2: int,
+        device: str,
+    ):
+        if lhs is None and rhs is None:
+            return None
+        lhs, rhs = lhs or {}, rhs or {}
+
+        keys = set(lhs.keys()).union(set(rhs.keys()))
+        merged_dict = {}
+
+        for k in keys:
+            # Get the logit processor object
+            processor = lhs[k][0] if k in lhs else rhs[k][0]
+            # Get and merge the mask tensors from the two dicts
+            left_mask = (
+                lhs[k][1]
+                if k in lhs
+                else torch.zeros(bs1, dtype=torch.bool, device=device)
+            )
+            right_mask = (
+                rhs[k][1]
+                if k in rhs
+                else torch.zeros(bs2, dtype=torch.bool, device=device)
+            )
+            merged_dict[k] = (processor, torch.cat([left_mask, right_mask]))
+
+            assert merged_dict[k][1].shape[0] == bs1 + bs2, (
+                f"The batch size of merged mask ({merged_dict[k][1].shape[0]}) does not match "
+                f"the sum of the batch sizes of the two masks ({bs1 + bs2})"
+                f"\n{left_mask=}\n{right_mask=}\n{bs1=}\n{bs2=}"
+                f"\n{lhs=}\n{rhs=}"
+            )
+
+        return merged_dict
+
+    def merge_batch(self, other: "SamplingBatchInfo"):
+        self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+
+        # Merge the custom logit processors and custom params lists
+        if self.has_custom_logit_processor or other.has_custom_logit_processor:
+            # Merge the custom logit processors
+            self.custom_logit_processor = (
+                SamplingBatchInfo.merge_custom_logit_processor(
+                    self.custom_logit_processor,
+                    other.custom_logit_processor,
+                    len(self),
+                    len(other),
+                    self.device,
+                )
+            )
+            # Merge the custom params lists
+            self.custom_params = self.custom_params or [None] * len(self)
+            other.custom_params = other.custom_params or [None] * len(other)
+            self.custom_params.extend(other.custom_params)
+
+            # Set the flag to True if any of the two has custom logit processor
+            self.has_custom_logit_processor = True
+
+        # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+        # See note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
+        # Note: because the __len()__ operator is defined on the temperatures tensor,
+        # please make sure any merge operation with len(self) or len(other) is done before
+        # the merge operation of the temperatures tensor below.
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+        ]:
+            self_val = getattr(self, item, None)
+            other_val = getattr(other, item, None)
+            setattr(self, item, torch.cat([self_val, other_val]))
+
+        self.is_all_greedy &= other.is_all_greedy
+        self.need_top_p_sampling |= other.need_top_p_sampling
+        self.need_top_k_sampling |= other.need_top_k_sampling
+        self.need_min_p_sampling |= other.need_min_p_sampling
+
+
+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
diff --git a/python/sglang/srt/sampling/sampling_params.py b/python/sglang/srt/sampling/sampling_params.py
new file mode 100644
index 000000000..b7d1a6d6e
--- /dev/null
+++ b/python/sglang/srt/sampling/sampling_params.py
@@ -0,0 +1,165 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sampling parameters for text generation."""
+
+from typing import Any, Dict, List, Optional, Union
+
+_SAMPLING_EPS = 1e-6
+TOP_K_ALL = 1 << 30
+
+
+class SamplingParams:
+    """
+    The sampling parameters.
+
+    See docs/backend/sampling_params.md or
+    https://docs.sglang.ai/backend/sampling_params.html
+    for the documentation.
+    """
+
+    def __init__(
+        self,
+        max_new_tokens: int = 128,
+        stop: Optional[Union[str, List[str]]] = None,
+        stop_token_ids: Optional[List[int]] = None,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        repetition_penalty: float = 1.0,
+        min_new_tokens: int = 0,
+        n: int = 1,
+        json_schema: Optional[str] = None,
+        regex: Optional[str] = None,
+        ebnf: Optional[str] = None,
+        structural_tag: Optional[str] = None,
+        ignore_eos: bool = False,
+        skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
+        no_stop_trim: bool = False,
+        custom_params: Optional[Dict[str, Any]] = None,
+        stream_interval: Optional[int] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+    ) -> None:
+        self.max_new_tokens = max_new_tokens
+        self.stop_strs = stop
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
+        self.temperature = temperature
+        self.top_p = top_p
+        self.top_k = top_k
+        self.min_p = min_p
+        self.frequency_penalty = frequency_penalty
+        self.presence_penalty = presence_penalty
+        self.repetition_penalty = repetition_penalty
+        self.min_new_tokens = min_new_tokens
+        self.regex = regex
+        self.n = n
+        self.json_schema = json_schema
+        self.ebnf = ebnf
+        self.structural_tag = structural_tag
+        self.ignore_eos = ignore_eos
+        self.skip_special_tokens = skip_special_tokens
+        self.spaces_between_special_tokens = spaces_between_special_tokens
+        self.no_stop_trim = no_stop_trim
+        self.custom_params = custom_params
+        self.stream_interval = stream_interval
+        self.logit_bias = logit_bias
+
+        # Process some special cases
+        if 0 <= self.temperature < _SAMPLING_EPS:
+            # top_k = 1 means greedy sampling
+            self.temperature = 1.0
+            self.top_k = 1
+        if self.top_k == -1:
+            self.top_k = TOP_K_ALL  # whole vocabulary
+
+    def verify(self, vocab_size):
+        if self.temperature < 0.0:
+            raise ValueError(
+                f"temperature must be non-negative, got {self.temperature}."
+            )
+        if not 0.0 < self.top_p <= 1.0:
+            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
+        if not 0.0 <= self.min_p <= 1.0:
+            raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
+        if self.top_k < 1 or self.top_k == -1:
+            raise ValueError(
+                f"top_k must be -1 (disable) or at least 1, got {self.top_k}."
+            )
+        if not -2.0 <= self.frequency_penalty <= 2.0:
+            raise ValueError(
+                "frequency_penalty must be in [-2, 2], got "
+                f"{self.frequency_penalty}."
+            )
+        if not -2.0 <= self.presence_penalty <= 2.0:
+            raise ValueError(
+                "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
+            )
+        if not 0.0 <= self.repetition_penalty <= 2.0:
+            raise ValueError(
+                "repetition_penalty must be in [0, 2], got "
+                f"{self.repetition_penalty}."
+            )
+        if not 0 <= self.min_new_tokens:
+            raise ValueError(
+                f"min_new_tokens must be in [0, max_new_tokens], got "
+                f"{self.min_new_tokens}."
+            )
+        if self.max_new_tokens is not None:
+            if self.max_new_tokens < 0:
+                raise ValueError(
+                    f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
+                )
+            if not self.min_new_tokens <= self.max_new_tokens:
+                raise ValueError(
+                    f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
+                    f"{self.min_new_tokens}."
+                )
+        if self.logit_bias is not None:
+            for token_id in self.logit_bias:
+                if not 0 <= int(token_id) < vocab_size:
+                    raise ValueError(
+                        f"logit_bias must has keys in [0, {vocab_size - 1}], got "
+                        f"{token_id}."
+                    )
+        grammars = [
+            self.json_schema,
+            self.regex,
+            self.ebnf,
+        ]  # since mutually exclusive, only one can be set
+        if sum(x is not None for x in grammars) > 1:
+            raise ValueError("Only one of regex, json_schema, or ebnf can be set.")
+
+    def normalize(self, tokenizer):
+        # Process stop strings
+        if self.stop_strs is None:
+            self.stop_strs = []
+            self.stop_str_max_len = 0
+        else:
+            if isinstance(self.stop_strs, str):
+                self.stop_strs = [self.stop_strs]
+
+            stop_str_max_len = 0
+            for stop_str in self.stop_strs:
+                if tokenizer is not None:
+                    stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
+                    stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
+                else:
+                    stop_str_max_len = max(stop_str_max_len, len(stop_str))
+            self.stop_str_max_len = stop_str_max_len
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
new file mode 100644
index 000000000..17371a66b
--- /dev/null
+++ b/python/sglang/srt/server_args.py
@@ -0,0 +1,2714 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The arguments of the server."""
+
+import argparse
+import dataclasses
+import json
+import logging
+import os
+import random
+import sys
+import tempfile
+from typing import List, Literal, Optional, Union
+
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.lora.lora_registry import LoRARef
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.utils import (
+    LORA_TARGET_ALL_MODULES,
+    SUPPORTED_LORA_TARGET_MODULES,
+    configure_ipv6,
+    get_device,
+    get_device_memory_capacity,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+    is_port_available,
+    is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
+    is_triton_kernels_available,
+    is_valid_ipv6_address,
+    nullable_str,
+)
+from sglang.utils import is_in_ci
+
+logger = logging.getLogger(__name__)
+
+
+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
+@dataclasses.dataclass
+class ServerArgs:
+    # Model and tokenizer
+    model_path: str
+    tokenizer_path: Optional[str] = None
+    tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
+    skip_tokenizer_init: bool = False
+    load_format: str = "auto"
+    model_loader_extra_config: str = "{}"
+    trust_remote_code: bool = False
+    context_length: Optional[int] = None
+    is_embedding: bool = False
+    enable_multimodal: Optional[bool] = None
+    revision: Optional[str] = None
+    model_impl: str = "auto"
+
+    # HTTP server
+    host: str = "127.0.0.1"
+    port: int = 30000
+    skip_server_warmup: bool = False
+    warmups: Optional[str] = None
+    nccl_port: Optional[int] = None
+
+    # Quantization and data type
+    dtype: str = "auto"
+    quantization: Optional[str] = None
+    quantization_param_path: Optional[str] = None
+    kv_cache_dtype: str = "auto"
+
+    # Memory and scheduling
+    mem_fraction_static: Optional[float] = None
+    max_running_requests: Optional[int] = None
+    max_queued_requests: Optional[int] = sys.maxsize
+    max_total_tokens: Optional[int] = None
+    chunked_prefill_size: Optional[int] = None
+    max_prefill_tokens: int = 16384
+    schedule_policy: str = "fcfs"
+    schedule_conservativeness: float = 1.0
+    page_size: Optional[int] = None
+    hybrid_kvcache_ratio: Optional[float] = None
+    swa_full_tokens_ratio: float = 0.8
+    disable_hybrid_swa_memory: bool = False
+
+    # Runtime options
+    device: Optional[str] = None
+    tp_size: int = 1
+    pp_size: int = 1
+    max_micro_batch_size: Optional[int] = None
+    stream_interval: int = 1
+    stream_output: bool = False
+    random_seed: Optional[int] = None
+    constrained_json_whitespace_pattern: Optional[str] = None
+    watchdog_timeout: float = 300
+    dist_timeout: Optional[int] = None  # timeout for torch.distributed
+    download_dir: Optional[str] = None
+    base_gpu_id: int = 0
+    gpu_id_step: int = 1
+    sleep_on_idle: bool = False
+
+    # Logging
+    log_level: str = "info"
+    log_level_http: Optional[str] = None
+    log_requests: bool = False
+    log_requests_level: int = 2
+    crash_dump_folder: Optional[str] = None
+    show_time_cost: bool = False
+    enable_metrics: bool = False
+    enable_metrics_for_all_schedulers: bool = False
+    bucket_time_to_first_token: Optional[List[float]] = None
+    bucket_inter_token_latency: Optional[List[float]] = None
+    bucket_e2e_request_latency: Optional[List[float]] = None
+    collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
+    decode_log_interval: int = 40
+    enable_request_time_stats_logging: bool = False
+    kv_events_config: Optional[str] = None
+    gc_warning_threshold_secs: float = 0.0
+
+    # API related
+    api_key: Optional[str] = None
+    served_model_name: Optional[str] = None
+    weight_version: str = "default"
+    chat_template: Optional[str] = None
+    completion_template: Optional[str] = None
+    file_storage_path: str = "sglang_storage"
+    enable_cache_report: bool = False
+    reasoning_parser: Optional[str] = None
+    tool_call_parser: Optional[str] = None
+    tool_server: Optional[str] = None
+
+    # Data parallelism
+    dp_size: int = 1
+    load_balance_method: str = "round_robin"
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
+
+    # Multi-node distributed serving
+    dist_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: int = 0
+
+    # Model override args in JSON
+    json_model_override_args: str = "{}"
+    preferred_sampling_params: Optional[str] = None
+
+    # LoRA
+    enable_lora: Optional[bool] = None
+    max_lora_rank: Optional[int] = None
+    lora_target_modules: Optional[Union[set[str], List[str]]] = None
+    lora_paths: Optional[
+        Union[dict[str, str], List[dict[str, str]], List[str], List[LoRARef]]
+    ] = None
+    max_loaded_loras: Optional[int] = None
+    max_loras_per_batch: int = 8
+    lora_backend: str = "triton"
+
+    # Kernel backend
+    attention_backend: Optional[str] = None
+    decode_attention_backend: Optional[str] = None
+    prefill_attention_backend: Optional[str] = None
+    sampling_backend: Optional[str] = None
+    grammar_backend: Optional[str] = None
+    mm_attention_backend: Optional[str] = None
+
+    # Speculative decoding
+    speculative_algorithm: Optional[str] = None
+    speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
+    speculative_num_steps: Optional[int] = None
+    speculative_eagle_topk: Optional[int] = None
+    speculative_num_draft_tokens: Optional[int] = None
+    speculative_accept_threshold_single: float = 1.0
+    speculative_accept_threshold_acc: float = 1.0
+    speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
+
+    # Expert parallelism
+    ep_size: int = 1
+    moe_a2a_backend: Literal["none", "deepep"] = "none"
+    moe_runner_backend: Literal[
+        "auto",
+        "triton",
+        "triton_kernel",
+        "flashinfer_trtllm",
+        "flashinfer_cutlass",
+        "flashinfer_mxfp4",
+    ] = "auto"
+    flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
+    enable_flashinfer_allreduce_fusion: bool = False
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
+    eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
+
+    # Hierarchical cache
+    enable_hierarchical_cache: bool = False
+    hicache_ratio: float = 2.0
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through"
+    hicache_io_backend: str = "kernel"
+    hicache_mem_layout: str = "layer_first"
+    hicache_storage_backend: Optional[str] = None
+    hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
+
+    # Double Sparsity
+    enable_double_sparsity: bool = False
+    ds_channel_config_path: Optional[str] = None
+    ds_heavy_channel_num: int = 32
+    ds_heavy_token_num: int = 256
+    ds_heavy_channel_type: str = "qk"
+    ds_sparse_decode_threshold: int = 4096
+
+    # Offloading
+    cpu_offload_gb: int = 0
+    offload_group_size: int = -1
+    offload_num_in_group: int = 1
+    offload_prefetch_step: int = 1
+    offload_mode: str = "cpu"
+
+    # Optimization/debug options
+    disable_radix_cache: bool = False
+    cuda_graph_max_bs: Optional[int] = None
+    cuda_graph_bs: Optional[List[int]] = None
+    disable_cuda_graph: bool = False
+    disable_cuda_graph_padding: bool = False
+    enable_profile_cuda_graph: bool = False
+    enable_cudagraph_gc: bool = False
+    enable_nccl_nvls: bool = False
+    enable_symm_mem: bool = False
+    disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
+    enable_tokenizer_batch_encode: bool = False
+    disable_outlines_disk_cache: bool = False
+    disable_custom_all_reduce: bool = False
+    enable_mscclpp: bool = False
+    disable_overlap_schedule: bool = False
+    enable_mixed_chunk: bool = False
+    enable_dp_attention: bool = False
+    enable_dp_lm_head: bool = False
+    enable_two_batch_overlap: bool = False
+    tbo_token_distribution_threshold: float = 0.48
+    enable_torch_compile: bool = False
+    torch_compile_max_bs: int = 32
+    torchao_config: str = ""
+    enable_nan_detection: bool = False
+    enable_p2p_check: bool = False
+    triton_attention_reduce_in_fp32: bool = False
+    triton_attention_num_kv_splits: int = 8
+    num_continuous_decode_steps: int = 1
+    delete_ckpt_after_loading: bool = False
+    enable_memory_saver: bool = False
+    allow_auto_truncate: bool = False
+    enable_custom_logit_processor: bool = False
+    flashinfer_mla_disable_ragged: bool = False
+    disable_shared_experts_fusion: bool = False
+    disable_chunked_prefix_cache: bool = False
+    disable_fast_image_processor: bool = False
+    enable_return_hidden_states: bool = False
+    scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+
+    # Debug tensor dumps
+    debug_tensor_dump_output_folder: Optional[str] = None
+    debug_tensor_dump_input_file: Optional[str] = None
+    debug_tensor_dump_inject: bool = False
+    debug_tensor_dump_prefill_only: bool = False
+
+    # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
+    disaggregation_mode: str = "null"
+    disaggregation_transfer_backend: str = "mooncake"
+    disaggregation_bootstrap_port: int = 8998
+    disaggregation_decode_tp: Optional[int] = None
+    disaggregation_decode_dp: Optional[int] = None
+    disaggregation_prefill_pp: Optional[int] = 1
+    disaggregation_ib_device: Optional[str] = None
+    num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
+
+    # For model weight update
+    custom_weight_loader: Optional[List[str]] = None
+    weight_loader_disable_mmap: bool = False
+
+    # For PD-Multiplexing
+    enable_pdmux: bool = False
+    sm_group_num: int = 3
+
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
+    enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
+
+    def __post_init__(self):
+        # Check deprecated arguments
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+        if self.enable_triton_kernel_moe:
+            self.moe_runner_backend = "triton_kernel"
+            print_deprecated_warning(
+                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
+            )
+        if self.enable_flashinfer_cutlass_moe:
+            self.moe_runner_backend = "flashinfer_cutlass"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
+            )
+        if self.enable_flashinfer_trtllm_moe:
+            self.moe_runner_backend = "flashinfer_trtllm"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
+            )
+        if self.enable_flashinfer_mxfp4_moe:
+            self.moe_runner_backend = "flashinfer_mxfp4"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
+            )
+
+        # Set missing default values
+        if self.tokenizer_path is None:
+            self.tokenizer_path = self.model_path
+        if self.served_model_name is None:
+            self.served_model_name = self.model_path
+        if self.device is None:
+            self.device = get_device()
+        if self.random_seed is None:
+            self.random_seed = random.randint(0, 1 << 30)
+
+        gpu_mem = get_device_memory_capacity(self.device)
+
+        # Set mem fraction static
+        if self.mem_fraction_static is None:
+            if gpu_mem is not None:
+                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
+
+                # We want mem_fraction_static to be as large as possible but still has enough room
+                # for activations and cuda graph buffers. We use the following heuristic to
+                # compute the needed size for activations and cuda graph buffers:
+                # - The size of the activation depends on the chunked_prefill_size and model size.
+                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
+                # For GPUs with more memory, we use a larger chunked_prefill_size and
+                # capture more cuda graphs, so they need to reserve more memory.
+                parallel_size = self.tp_size * self.pp_size
+
+                if gpu_mem < 20 * 1024:
+                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 35 * 1024:
+                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                    reserved_mem = (2.8 + parallel_size / 10) * 1024
+                elif gpu_mem < 90 * 1024:
+                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
+                    reserved_mem = (9.5 + parallel_size / 2) * 1024
+                elif gpu_mem < 100 * 1024:
+                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
+                elif gpu_mem < 160 * 1024:
+                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
+                    reserved_mem = (12 + parallel_size / 2) * 1024
+                else:
+                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                    reserved_mem = 32 * 1024
+
+                # draft model and larger cuda graph buffers
+                if self.speculative_algorithm is not None:
+                    if self.speculative_algorithm == "STANDALONE":
+                        # Standalone speculative decoding needs more memory than other speculative
+                        # decoding algorithms since the draft model is typically larger.
+                        reserved_mem += 6 * 1024
+                    else:
+                        reserved_mem += 2 * 1024
+                if self.enable_dp_attention:
+                    reserved_mem += 4 * 1024
+
+                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
+            else:
+                self.mem_fraction_static = 0.88
+
+            # Lazy init to avoid circular import
+            # Multimodal models need more memory for the image processor
+            from sglang.srt.configs.model_config import ModelConfig
+
+            model_config = ModelConfig.from_server_args(self)
+            if model_config.is_multimodal:
+                self.adjust_mem_fraction_for_vlm(model_config)
+
+        # Set chunked prefill size, which depends on the gpu memory capacity
+        if self.chunked_prefill_size is None:
+            if gpu_mem is not None:
+                if gpu_mem < 35 * 1024:  # A10, L40, 4090
+                    self.chunked_prefill_size = 2048
+                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
+                    self.chunked_prefill_size = 8192
+                else:  # B200, MI300
+                    self.chunked_prefill_size = 16384
+            else:
+                self.chunked_prefill_size = 4096
+
+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            if gpu_mem is not None and gpu_mem < 35 * 1024:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 8
+                else:
+                    self.cuda_graph_max_bs = 80
+
+        # Set kernel backends for hpu device
+        if self.device == "hpu":
+            self.attention_backend = "torch_native"
+            self.sampling_backend = "pytorch"
+
+        # Model-specific adjustments
+        self.model_specific_adjustments()
+
+        # Set kernel backends
+        if self.device == "cpu":
+            if self.attention_backend is None:
+                self.attention_backend = "intel_amx"
+            self.sampling_backend = "pytorch"
+
+        if self.sampling_backend is None:
+            self.sampling_backend = (
+                "flashinfer" if is_flashinfer_available() else "pytorch"
+            )
+
+        if self.attention_backend == "torch_native":
+            logger.warning(
+                "Cuda graph is disabled because of using torch native attention backend"
+            )
+            self.disable_cuda_graph = True
+
+        if self.attention_backend == "ascend":
+            logger.warning(
+                "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
+        if (
+            self.attention_backend == "flashmla"
+            or self.decode_attention_backend == "flashmla"
+        ):
+            logger.warning(
+                "FlashMLA only supports a page_size of 64, change page_size to 64."
+            )
+            self.page_size = 64
+
+        if (
+            self.attention_backend == "cutlass_mla"
+            or self.decode_attention_backend == "cutlass_mla"
+        ):
+            logger.warning(
+                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
+        if (
+            self.attention_backend == "trtllm_mla"
+            or self.decode_attention_backend == "trtllm_mla"
+        ):
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+
+            if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
+                raise ValueError(
+                    "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
+                )
+
+        if (
+            self.attention_backend == "trtllm_mha"
+            or self.decode_attention_backend == "trtllm_mha"
+            or self.prefill_attention_backend == "trtllm_mha"
+        ):
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MHA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [16, 32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+
+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_cuda_graph = True
+            self.disable_radix_cache = True
+
+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
+        # Choose grammar backend
+        if self.grammar_backend is None:
+            self.grammar_backend = "xgrammar"
+
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+
+        # Data parallelism attention
+        if self.enable_dp_attention:
+            self.schedule_conservativeness = self.schedule_conservativeness * 0.3
+            assert self.tp_size % self.dp_size == 0
+            self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
+            logger.warning(
+                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
+            )
+
+        if self.enable_dp_lm_head:
+            assert (
+                self.enable_dp_attention
+            ), "Please enable dp attention when setting enable_dp_lm_head. "
+
+        # MoE kernel
+        if self.moe_runner_backend == "flashinfer_cutlass":
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            assert self.ep_size in [
+                1,
+                self.tp_size,
+            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
+
+        if self.moe_runner_backend == "flashinfer_trtllm":
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )
+
+        # DeepEP MoE
+        if self.moe_a2a_backend == "deepep":
+            if self.deepep_mode == "normal":
+                logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
+                self.disable_cuda_graph = True
+            self.ep_size = self.tp_size
+            logger.warning(
+                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
+        if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
+            self.expert_distribution_recorder_mode = "stat"
+            logger.warning(
+                "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+            )
+
+        if (self.enable_eplb or (self.init_expert_location is not None)) and (
+            self.ep_dispatch_algorithm is None
+        ):
+            self.ep_dispatch_algorithm = "static"
+
+        if self.enable_eplb:
+            assert self.ep_size > 1
+
+        if self.enable_expert_distribution_metrics and (
+            self.expert_distribution_recorder_mode is None
+        ):
+            self.expert_distribution_recorder_mode = "stat"
+
+        if self.expert_distribution_recorder_buffer_size is None:
+            if (x := self.eplb_rebalance_num_iterations) is not None:
+                self.expert_distribution_recorder_buffer_size = x
+            elif self.expert_distribution_recorder_mode is not None:
+                self.expert_distribution_recorder_buffer_size = 1000
+
+        # Pipeline parallelism
+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Pipeline parallelism is incompatible with overlap schedule."
+            )
+
+        # Hicache
+        if self.hicache_storage_backend == "mooncake":
+            # to use mooncake storage backend, the following conditions must be met:
+            self.hicache_io_backend = "kernel"
+            self.hicache_mem_layout = "page_first"
+
+        # Speculative Decoding
+        if self.speculative_algorithm == "NEXTN":
+            # NEXTN shares the same implementation of EAGLE
+            self.speculative_algorithm = "EAGLE"
+
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
+            if self.max_running_requests is None:
+                self.max_running_requests = 48
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Overlap scheduler is disabled because of using "
+                "eagle speculative decoding."
+            )
+            if self.enable_mixed_chunk:
+                self.enable_mixed_chunk = False
+                logger.warning(
+                    "Mixed chunked prefill is disabled because of using "
+                    "eagle speculative decoding."
+                )
+
+            model_arch = self.get_hf_config().architectures[0]
+            if model_arch in ["DeepseekV3ForCausalLM", "Glm4MoeForCausalLM"]:
+                # Auto set draft_model_path DeepSeek-V3/R1
+                if self.speculative_draft_model_path is None:
+                    self.speculative_draft_model_path = self.model_path
+                else:
+                    logger.warning(
+                        "DeepSeek MTP does not require setting speculative_draft_model_path."
+                    )
+
+            # Auto choose parameters
+            if self.speculative_num_steps is None:
+                assert (
+                    self.speculative_eagle_topk is None
+                    and self.speculative_num_draft_tokens is None
+                )
+                (
+                    self.speculative_num_steps,
+                    self.speculative_eagle_topk,
+                    self.speculative_num_draft_tokens,
+                ) = auto_choose_speculative_params(self)
+
+            if (
+                self.attention_backend == "trtllm_mha"
+                or self.decode_attention_backend == "trtllm_mha"
+                or self.prefill_attention_backend == "trtllm_mha"
+            ):
+                if self.speculative_eagle_topk > 1:
+                    raise ValueError(
+                        "trtllm_mha backend only supports topk = 1 for speculative decoding."
+                    )
+
+            if (
+                self.speculative_eagle_topk == 1
+                and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
+            ):
+                logger.warning(
+                    "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
+                )
+                self.speculative_num_draft_tokens = self.speculative_num_steps + 1
+
+            if (
+                self.speculative_eagle_topk > 1
+                and self.page_size > 1
+                and self.attention_backend != "flashinfer"
+            ):
+                raise ValueError(
+                    "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+                )
+
+            # The token generated from the verify step is counted.
+            # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
+            # assert self.speculative_num_steps < self.speculative_num_draft_tokens
+
+        # GGUF
+        if (
+            self.load_format == "auto" or self.load_format == "gguf"
+        ) and check_gguf_file(self.model_path):
+            self.quantization = self.load_format = "gguf"
+
+        # Model loading
+        if is_remote_url(self.model_path):
+            self.load_format = "remote"
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
+
+        # PD disaggregation
+        if self.disaggregation_mode == "decode":
+            assert (
+                self.disaggregation_decode_tp is None
+            ), "Cannot set --disaggregation-decode-tp for the decode engine."
+            assert (
+                self.disaggregation_decode_dp is None
+            ), "Cannot set --disaggregation-decode-dp for the decode engine."
+
+            self.disable_radix_cache = True
+            logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
+        elif self.disaggregation_mode == "prefill":
+            if self.disaggregation_decode_tp is None:
+                self.disaggregation_decode_tp = self.tp_size
+            if self.disaggregation_decode_dp is None:
+                self.disaggregation_decode_dp = self.dp_size
+
+            self.disaggregation_prefill_pp = self.pp_size
+            self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
+
+            self.disable_cuda_graph = True
+            logger.warning("Cuda graph is disabled for prefill server")
+
+        # Propagate env vars
+        os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
+            "1" if self.enable_torch_compile else "0"
+        )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
+        # Set env var before grammar backends init
+        os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
+            "1" if self.disable_outlines_disk_cache else "0"
+        )
+
+        if self.enable_hierarchical_cache and self.disable_radix_cache:
+            raise ValueError(
+                "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
+                "and cannot be used at the same time. Please use only one of them."
+            )
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        # Model and tokenizer
+        parser.add_argument(
+            "--model-path",
+            "--model",
+            type=str,
+            help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
+            required=True,
+        )
+        parser.add_argument(
+            "--tokenizer-path",
+            type=str,
+            default=ServerArgs.tokenizer_path,
+            help="The path of the tokenizer.",
+        )
+        parser.add_argument(
+            "--tokenizer-mode",
+            type=str,
+            default=ServerArgs.tokenizer_mode,
+            choices=["auto", "slow"],
+            help="Tokenizer mode. 'auto' will use the fast "
+            "tokenizer if available, and 'slow' will "
+            "always use the slow tokenizer.",
+        )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
+        parser.add_argument(
+            "--skip-tokenizer-init",
+            action="store_true",
+            help="If set, skip init tokenizer and pass input_ids in generate request.",
+        )
+        parser.add_argument(
+            "--load-format",
+            type=str,
+            default=ServerArgs.load_format,
+            choices=LOAD_FORMAT_CHOICES,
+            help="The format of the model weights to load. "
+            '"auto" will try to load the weights in the safetensors format '
+            "and fall back to the pytorch bin format if safetensors format "
+            "is not available. "
+            '"pt" will load the weights in the pytorch bin format. '
+            '"safetensors" will load the weights in the safetensors format. '
+            '"npcache" will load the weights in pytorch format and store '
+            "a numpy cache to speed up the loading. "
+            '"dummy" will initialize the weights with random values, '
+            "which is mainly for profiling."
+            '"gguf" will load the weights in the gguf format. '
+            '"bitsandbytes" will load the weights using bitsandbytes '
+            "quantization."
+            '"layered" loads weights layer by layer so that one can quantize a '
+            "layer before loading another to make the peak memory envelope "
+            "smaller.",
+        )
+        parser.add_argument(
+            "--model-loader-extra-config",
+            type=str,
+            help="Extra config for model loader. "
+            "This will be passed to the model loader corresponding to the chosen load_format.",
+            default=ServerArgs.model_loader_extra_config,
+        )
+        parser.add_argument(
+            "--trust-remote-code",
+            action="store_true",
+            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
+        )
+        parser.add_argument(
+            "--context-length",
+            type=int,
+            default=ServerArgs.context_length,
+            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
+        )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
+        parser.add_argument(
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
+            action="store_true",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
+        )
+        parser.add_argument(
+            "--revision",
+            type=str,
+            default=None,
+            help="The specific model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
+        parser.add_argument(
+            "--model-impl",
+            type=str,
+            default=ServerArgs.model_impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )
+
+        # HTTP server
+        parser.add_argument(
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
+        )
+        parser.add_argument(
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
+        )
+        parser.add_argument(
+            "--skip-server-warmup",
+            action="store_true",
+            help="If set, skip warmup.",
+        )
+        parser.add_argument(
+            "--warmups",
+            type=str,
+            required=False,
+            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
+            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
+        )
+        parser.add_argument(
+            "--nccl-port",
+            type=int,
+            default=ServerArgs.nccl_port,
+            help="The port for NCCL distributed environment setup. Defaults to a random port.",
+        )
+
+        # Quantization and data type
+        parser.add_argument(
+            "--dtype",
+            type=str,
+            default=ServerArgs.dtype,
+            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
+            help="Data type for model weights and activations.\n\n"
+            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+            "BF16 precision for BF16 models.\n"
+            '* "half" for FP16. Recommended for AWQ quantization.\n'
+            '* "float16" is the same as "half".\n'
+            '* "bfloat16" for a balance between precision and range.\n'
+            '* "float" is shorthand for FP32 precision.\n'
+            '* "float32" for FP32 precision.',
+        )
+        parser.add_argument(
+            "--quantization",
+            type=str,
+            default=ServerArgs.quantization,
+            choices=QUANTIZATION_CHOICES,
+            help="The quantization method.",
+        )
+        parser.add_argument(
+            "--quantization-param-path",
+            type=nullable_str,
+            default=None,
+            help="Path to the JSON file containing the KV cache "
+            "scaling factors. This should generally be supplied, when "
+            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
+            "default to 1.0, which may cause accuracy issues. ",
+        )
+        parser.add_argument(
+            "--kv-cache-dtype",
+            type=str,
+            default=ServerArgs.kv_cache_dtype,
+            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
+            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
+        )
+
+        # Memory and scheduling
+        parser.add_argument(
+            "--mem-fraction-static",
+            type=float,
+            default=ServerArgs.mem_fraction_static,
+            help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
+        )
+        parser.add_argument(
+            "--max-running-requests",
+            type=int,
+            default=ServerArgs.max_running_requests,
+            help="The maximum number of running requests.",
+        )
+        parser.add_argument(
+            "--max-queued-requests",
+            type=int,
+            default=ServerArgs.max_queued_requests,
+            help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
+        )
+        parser.add_argument(
+            "--max-total-tokens",
+            type=int,
+            default=ServerArgs.max_total_tokens,
+            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. "
+            "This option is typically used for development and debugging purposes.",
+        )
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill.",
+        )
+        parser.add_argument(
+            "--max-prefill-tokens",
+            type=int,
+            default=ServerArgs.max_prefill_tokens,
+            help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
+        )
+        parser.add_argument(
+            "--schedule-policy",
+            type=str,
+            default=ServerArgs.schedule_policy,
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
+            help="The scheduling policy of the requests.",
+        )
+        parser.add_argument(
+            "--schedule-conservativeness",
+            type=float,
+            default=ServerArgs.schedule_conservativeness,
+            help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
+        )
+        parser.add_argument(
+            "--page-size",
+            type=int,
+            default=ServerArgs.page_size,
+            help="The number of tokens in a page.",
+        )
+        parser.add_argument(
+            "--hybrid-kvcache-ratio",
+            nargs="?",
+            const=0.5,
+            type=float,
+            default=ServerArgs.hybrid_kvcache_ratio,
+            help=(
+                "Mix ratio in [0,1] between uniform and hybrid kv buffers "
+                "(0.0 = pure uniform: swa_size / full_size = 1)"
+                "(1.0 = pure hybrid: swa_size / full_size = local_attention_size / context_length)"
+            ),
+        )
+        parser.add_argument(
+            "--swa-full-tokens-ratio",
+            type=float,
+            default=ServerArgs.swa_full_tokens_ratio,
+            help="The ratio of SWA layer KV tokens / full layer KV tokens, regardless of the number of swa:full layers. It should be between 0 and 1. "
+            "E.g. 0.5 means if each swa layer has 50 tokens, then each full layer has 100 tokens.",
+        )
+        parser.add_argument(
+            "--disable-hybrid-swa-memory",
+            action="store_true",
+            help="Disable the hybrid SWA memory.",
+        )
+
+        # Runtime options
+        parser.add_argument(
+            "--device",
+            type=str,
+            default=ServerArgs.device,
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
+        )
+        parser.add_argument(
+            "--tensor-parallel-size",
+            "--tp-size",
+            type=int,
+            default=ServerArgs.tp_size,
+            help="The tensor parallelism size.",
+        )
+        parser.add_argument(
+            "--pipeline-parallel-size",
+            "--pp-size",
+            type=int,
+            default=ServerArgs.pp_size,
+            help="The pipeline parallelism size.",
+        )
+        parser.add_argument(
+            "--max-micro-batch-size",
+            type=int,
+            default=ServerArgs.max_micro_batch_size,
+            help="The maximum micro batch size in pipeline parallelism.",
+        )
+        parser.add_argument(
+            "--stream-interval",
+            type=int,
+            default=ServerArgs.stream_interval,
+            help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
+        )
+        parser.add_argument(
+            "--stream-output",
+            action="store_true",
+            help="Whether to output as a sequence of disjoint segments.",
+        )
+        parser.add_argument(
+            "--random-seed",
+            type=int,
+            default=ServerArgs.random_seed,
+            help="The random seed.",
+        )
+        parser.add_argument(
+            "--constrained-json-whitespace-pattern",
+            type=str,
+            default=ServerArgs.constrained_json_whitespace_pattern,
+            help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+        )
+        parser.add_argument(
+            "--watchdog-timeout",
+            type=float,
+            default=ServerArgs.watchdog_timeout,
+            help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
+        )
+        parser.add_argument(
+            "--dist-timeout",
+            type=int,
+            default=ServerArgs.dist_timeout,
+            help="Set timeout for torch.distributed initialization.",
+        )
+        parser.add_argument(
+            "--download-dir",
+            type=str,
+            default=ServerArgs.download_dir,
+            help="Model download directory for huggingface.",
+        )
+        parser.add_argument(
+            "--base-gpu-id",
+            type=int,
+            default=ServerArgs.base_gpu_id,
+            help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
+        )
+        parser.add_argument(
+            "--gpu-id-step",
+            type=int,
+            default=ServerArgs.gpu_id_step,
+            help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
+        )
+        parser.add_argument(
+            "--sleep-on-idle",
+            action="store_true",
+            help="Reduce CPU usage when sglang is idle.",
+        )
+
+        # Logging
+        parser.add_argument(
+            "--log-level",
+            type=str,
+            default=ServerArgs.log_level,
+            help="The logging level of all loggers.",
+        )
+        parser.add_argument(
+            "--log-level-http",
+            type=str,
+            default=ServerArgs.log_level_http,
+            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
+        )
+        parser.add_argument(
+            "--log-requests",
+            action="store_true",
+            help="Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level",
+        )
+        parser.add_argument(
+            "--log-requests-level",
+            type=int,
+            default=ServerArgs.log_requests_level,
+            help="0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.",
+            choices=[0, 1, 2, 3],
+        )
+        parser.add_argument(
+            "--crash-dump-folder",
+            type=str,
+            default=ServerArgs.crash_dump_folder,
+            help="Folder path to dump requests from the last 5 min before a crash (if any). If not specified, crash dumping is disabled.",
+        )
+        parser.add_argument(
+            "--show-time-cost",
+            action="store_true",
+            help="Show time cost of custom marks.",
+        )
+        parser.add_argument(
+            "--enable-metrics",
+            action="store_true",
+            help="Enable log prometheus metrics.",
+        )
+        parser.add_argument(
+            "--enable-metrics-for-all-schedulers",
+            action="store_true",
+            help="Enable --enable-metrics-for-all-schedulers when you want schedulers on all TP ranks (not just TP 0) "
+            "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
+            "otherwise all metrics appear to come from TP 0.",
+        )
+        parser.add_argument(
+            "--bucket-time-to-first-token",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_time_to_first_token,
+            help="The buckets of time to first token, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-inter-token-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_inter_token_latency,
+            help="The buckets of inter-token latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-e2e-request-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_e2e_request_latency,
+            help="The buckets of end-to-end request latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--collect-tokens-histogram",
+            action="store_true",
+            default=ServerArgs.collect_tokens_histogram,
+            help="Collect prompt/generation tokens histogram.",
+        )
+        bucket_rule = (
+            "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
+            "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
+        )
+        parser.add_argument(
+            "--prompt-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.prompt_tokens_buckets,
+            help=f"The buckets rule of prompt tokens. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--generation-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.generation_tokens_buckets,
+            help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--gc-warning-threshold-secs",
+            type=float,
+            default=ServerArgs.gc_warning_threshold_secs,
+            help="The threshold for long GC warning. If a GC takes longer than this, a warning will be logged. Set to 0 to disable.",
+        )
+        parser.add_argument(
+            "--decode-log-interval",
+            type=int,
+            default=ServerArgs.decode_log_interval,
+            help="The log interval of decode batch.",
+        )
+        parser.add_argument(
+            "--enable-request-time-stats-logging",
+            action="store_true",
+            default=ServerArgs.enable_request_time_stats_logging,
+            help="Enable per request time stats logging",
+        )
+        parser.add_argument(
+            "--kv-events-config",
+            type=str,
+            default=None,
+            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
+        )
+
+        # API related
+        parser.add_argument(
+            "--api-key",
+            type=str,
+            default=ServerArgs.api_key,
+            help="Set API key of the server. It is also used in the OpenAI API compatible server.",
+        )
+        parser.add_argument(
+            "--served-model-name",
+            type=str,
+            default=ServerArgs.served_model_name,
+            help="Override the model name returned by the v1/models endpoint in OpenAI API server.",
+        )
+        parser.add_argument(
+            "--weight-version",
+            type=str,
+            default=ServerArgs.weight_version,
+            help="Version identifier for the model weights. Defaults to 'default' if not specified.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
+        parser.add_argument(
+            "--completion-template",
+            type=str,
+            default=ServerArgs.completion_template,
+            help="The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.",
+        )
+        parser.add_argument(
+            "--file-storage-path",
+            type=str,
+            default=ServerArgs.file_storage_path,
+            help="The path of the file storage in backend.",
+        )
+        parser.add_argument(
+            "--enable-cache-report",
+            action="store_true",
+            help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
+        )
+        parser.add_argument(
+            "--reasoning-parser",
+            type=str,
+            choices=list(ReasoningParser.DetectorMap.keys()),
+            default=ServerArgs.reasoning_parser,
+            help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
+        )
+        tool_call_parser_choices = list(FunctionCallParser.ToolCallParserEnum.keys())
+        parser.add_argument(
+            "--tool-call-parser",
+            type=str,
+            choices=tool_call_parser_choices,
+            default=ServerArgs.tool_call_parser,
+            help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
+        )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )
+
+        # Data parallelism
+        parser.add_argument(
+            "--data-parallel-size",
+            "--dp-size",
+            type=int,
+            default=ServerArgs.dp_size,
+            help="The data parallelism size.",
+        )
+        parser.add_argument(
+            "--load-balance-method",
+            type=str,
+            default=ServerArgs.load_balance_method,
+            help="The load balancing strategy for data parallelism.",
+            choices=[
+                "round_robin",
+                "shortest_queue",
+                "minimum_tokens",
+            ],
+        )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )
+
+        # Multi-node distributed serving
+        parser.add_argument(
+            "--dist-init-addr",
+            "--nccl-init-addr",  # For backward compatibility. This will be removed in the future.
+            type=str,
+            help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
+        )
+        parser.add_argument(
+            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
+        )
+        parser.add_argument(
+            "--node-rank", type=int, default=ServerArgs.node_rank, help="The node rank."
+        )
+
+        # Model override args
+        parser.add_argument(
+            "--json-model-override-args",
+            type=str,
+            help="A dictionary in JSON string format used to override default model configurations.",
+            default=ServerArgs.json_model_override_args,
+        )
+        parser.add_argument(
+            "--preferred-sampling-params",
+            type=str,
+            help="json-formatted sampling settings that will be returned in /get_model_info",
+        )
+
+        # LoRA
+        parser.add_argument(
+            "--enable-lora",
+            default=ServerArgs.enable_lora,
+            action="store_true",
+            help="Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.",
+        )
+        parser.add_argument(
+            "--max-lora-rank",
+            default=ServerArgs.max_lora_rank,
+            type=int,
+            help="The maximum rank of LoRA adapters. If not specified, it will be automatically inferred from the adapters provided in --lora-paths.",
+        )
+        parser.add_argument(
+            "--lora-target-modules",
+            type=str,
+            choices=SUPPORTED_LORA_TARGET_MODULES + [LORA_TARGET_ALL_MODULES],
+            nargs="*",
+            default=None,
+            help="The union set of all target modules where LoRA should be applied. If not specified, "
+            "it will be automatically inferred from the adapters provided in --lora-paths. If 'all' is specified, "
+            "all supported modules will be targeted.",
+        )
+        parser.add_argument(
+            "--lora-paths",
+            type=str,
+            nargs="*",
+            default=None,
+            action=LoRAPathAction,
+            help='The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool}',
+        )
+        parser.add_argument(
+            "--max-loras-per-batch",
+            type=int,
+            default=8,
+            help="Maximum number of adapters for a running batch, include base-only request.",
+        )
+        parser.add_argument(
+            "--max-loaded-loras",
+            type=int,
+            default=ServerArgs.max_loaded_loras,
+            help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
+        )
+        parser.add_argument(
+            "--lora-backend",
+            type=str,
+            default="triton",
+            help="Choose the kernel backend for multi-LoRA serving.",
+        )
+
+        # Kernel backend
+        parser.add_argument(
+            "--attention-backend",
+            type=str,
+            choices=ATTENTION_BACKEND_CHOICES,
+            default=ServerArgs.attention_backend,
+            help="Choose the kernels for attention layers.",
+        )
+        parser.add_argument(
+            "--prefill-attention-backend",
+            type=str,
+            choices=ATTENTION_BACKEND_CHOICES,
+            default=ServerArgs.prefill_attention_backend,
+            help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
+        )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=ATTENTION_BACKEND_CHOICES,
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
+        parser.add_argument(
+            "--sampling-backend",
+            type=str,
+            choices=["flashinfer", "pytorch"],
+            default=ServerArgs.sampling_backend,
+            help="Choose the kernels for sampling layers.",
+        )
+        parser.add_argument(
+            "--grammar-backend",
+            type=str,
+            choices=GRAMMAR_BACKEND_CHOICES,
+            default=ServerArgs.grammar_backend,
+            help="Choose the backend for grammar-guided decoding.",
+        )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
+        # Speculative decoding
+        parser.add_argument(
+            "--speculative-algorithm",
+            type=str,
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
+            help="Speculative algorithm.",
+        )
+        parser.add_argument(
+            "--speculative-draft-model-path",
+            "--speculative-draft-model",
+            type=str,
+            help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
+        )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
+        parser.add_argument(
+            "--speculative-num-steps",
+            type=int,
+            help="The number of steps sampled from draft model in Speculative Decoding.",
+            default=ServerArgs.speculative_num_steps,
+        )
+        parser.add_argument(
+            "--speculative-eagle-topk",
+            type=int,
+            help="The number of tokens sampled from the draft model in eagle2 each step.",
+            default=ServerArgs.speculative_eagle_topk,
+        )
+        parser.add_argument(
+            "--speculative-num-draft-tokens",
+            type=int,
+            help="The number of tokens sampled from the draft model in Speculative Decoding.",
+            default=ServerArgs.speculative_num_draft_tokens,
+        )
+        parser.add_argument(
+            "--speculative-accept-threshold-single",
+            type=float,
+            help="Accept a draft token if its probability in the target model is greater than this threshold.",
+            default=ServerArgs.speculative_accept_threshold_single,
+        )
+        parser.add_argument(
+            "--speculative-accept-threshold-acc",
+            type=float,
+            help="The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc).",
+            default=ServerArgs.speculative_accept_threshold_acc,
+        )
+        parser.add_argument(
+            "--speculative-token-map",
+            type=str,
+            help="The path of the draft model's small vocab table.",
+            default=ServerArgs.speculative_token_map,
+        )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
+
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            "--ep",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
+        parser.add_argument(
+            "--moe-a2a-backend",
+            type=str,
+            choices=["none", "deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
+        )
+        parser.add_argument(
+            "--moe-runner-backend",
+            type=str,
+            choices=[
+                "auto",
+                "triton",
+                "triton_kernel",
+                "flashinfer_trtllm",
+                "flashinfer_cutlass",
+                "flashinfer_mxfp4",
+            ],
+            default=ServerArgs.moe_runner_backend,
+            help="Choose the runner backend for MoE.",
+        )
+        parser.add_argument(
+            "--flashinfer-mxfp4-moe-precision",
+            type=str,
+            choices=["default", "bf16"],
+            default=ServerArgs.flashinfer_mxfp4_moe_precision,
+            help="Choose the computation precision of flashinfer mxfp4 moe",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-allreduce-fusion",
+            action="store_true",
+            help="Enable FlashInfer allreduce fusion with Residual RMSNorm.",
+        )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            default="auto",
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )
+
+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
+        # Hierarchical cache
+        parser.add_argument(
+            "--enable-hierarchical-cache",
+            action="store_true",
+            help="Enable hierarchical cache",
+        )
+        parser.add_argument(
+            "--hicache-ratio",
+            type=float,
+            default=ServerArgs.hicache_ratio,
+            help="The ratio of the size of host KV cache memory pool to the size of device pool.",
+        )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
+        parser.add_argument(
+            "--hicache-io-backend",
+            type=str,
+            choices=["direct", "kernel"],
+            default=ServerArgs.hicache_io_backend,
+            help="The IO backend for KV cache transfer between CPU and GPU",
+        )
+        parser.add_argument(
+            "--hicache-mem-layout",
+            type=str,
+            choices=["layer_first", "page_first"],
+            default=ServerArgs.hicache_mem_layout,
+            help="The layout of host memory pool for hierarchical cache.",
+        )
+        parser.add_argument(
+            "--hicache-storage-backend",
+            type=str,
+            choices=["file", "mooncake", "hf3fs", "nixl"],
+            default=ServerArgs.hicache_storage_backend,
+            help="The storage backend for hierarchical KV cache.",
+        )
+        parser.add_argument(
+            "--hicache-storage-prefetch-policy",
+            type=str,
+            choices=["best_effort", "wait_complete", "timeout"],
+            default=ServerArgs.hicache_storage_prefetch_policy,
+            help="Control when prefetching from the storage backend should stop.",
+        )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )
+
+        # Double Sparsity
+        parser.add_argument(
+            "--enable-double-sparsity",
+            action="store_true",
+            help="Enable double sparsity attention",
+        )
+        parser.add_argument(
+            "--ds-channel-config-path",
+            type=str,
+            default=ServerArgs.ds_channel_config_path,
+            help="The path of the double sparsity channel config",
+        )
+        parser.add_argument(
+            "--ds-heavy-channel-num",
+            type=int,
+            default=ServerArgs.ds_heavy_channel_num,
+            help="The number of heavy channels in double sparsity attention",
+        )
+        parser.add_argument(
+            "--ds-heavy-token-num",
+            type=int,
+            default=ServerArgs.ds_heavy_token_num,
+            help="The number of heavy tokens in double sparsity attention",
+        )
+        parser.add_argument(
+            "--ds-heavy-channel-type",
+            type=str,
+            default=ServerArgs.ds_heavy_channel_type,
+            help="The type of heavy channels in double sparsity attention",
+        )
+        parser.add_argument(
+            "--ds-sparse-decode-threshold",
+            type=int,
+            default=ServerArgs.ds_sparse_decode_threshold,
+            help="The type of heavy channels in double sparsity attention",
+        )
+
+        # Offloading
+        parser.add_argument(
+            "--cpu-offload-gb",
+            type=int,
+            default=ServerArgs.cpu_offload_gb,
+            help="How many GBs of RAM to reserve for CPU offloading.",
+        )
+        parser.add_argument(
+            "--offload-group-size",
+            type=int,
+            default=ServerArgs.offload_group_size,
+            help="Number of layers per group in offloading.",
+        )
+        parser.add_argument(
+            "--offload-num-in-group",
+            type=int,
+            default=ServerArgs.offload_num_in_group,
+            help="Number of layers to be offloaded within a group.",
+        )
+        parser.add_argument(
+            "--offload-prefetch-step",
+            type=int,
+            default=ServerArgs.offload_prefetch_step,
+            help="Steps to prefetch in offloading.",
+        )
+        parser.add_argument(
+            "--offload-mode",
+            type=str,
+            default=ServerArgs.offload_mode,
+            help="Mode of offloading.",
+        )
+
+        # Optimization/debug options
+        parser.add_argument(
+            "--disable-radix-cache",
+            action="store_true",
+            help="Disable RadixAttention for prefix caching.",
+        )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+        )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
+        parser.add_argument(
+            "--disable-cuda-graph",
+            action="store_true",
+            help="Disable cuda graph.",
+        )
+        parser.add_argument(
+            "--disable-cuda-graph-padding",
+            action="store_true",
+            help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
+        )
+        parser.add_argument(
+            "--enable-profile-cuda-graph",
+            action="store_true",
+            help="Enable profiling of cuda graph capture.",
+        )
+        parser.add_argument(
+            "--enable-cudagraph-gc",
+            action="store_true",
+            help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
+        )
+        parser.add_argument(
+            "--enable-nccl-nvls",
+            action="store_true",
+            help="Enable NCCL NVLS for prefill heavy requests when available.",
+        )
+        parser.add_argument(
+            "--enable-symm-mem",
+            action="store_true",
+            help="Enable NCCL symmetric memory for fast collectives.",
+        )
+        parser.add_argument(
+            "--disable-flashinfer-cutlass-moe-fp4-allgather",
+            action="store_true",
+            help="Disables quantize before all-gather for flashinfer cutlass moe.",
+        )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
+        parser.add_argument(
+            "--disable-outlines-disk-cache",
+            action="store_true",
+            help="Disable disk cache of outlines to avoid possible crashes related to file system or high concurrency.",
+        )
+        parser.add_argument(
+            "--disable-custom-all-reduce",
+            action="store_true",
+            help="Disable the custom all-reduce kernel and fall back to NCCL.",
+        )
+        parser.add_argument(
+            "--enable-mscclpp",
+            action="store_true",
+            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
+        )
+        parser.add_argument(
+            "--disable-overlap-schedule",
+            action="store_true",
+            help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
+        )
+        parser.add_argument(
+            "--enable-mixed-chunk",
+            action="store_true",
+            help="Enabling mixing prefill and decode in a batch when using chunked prefill.",
+        )
+        parser.add_argument(
+            "--enable-dp-attention",
+            action="store_true",
+            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported.",
+        )
+        parser.add_argument(
+            "--enable-dp-lm-head",
+            action="store_true",
+            help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
+        )
+        parser.add_argument(
+            "--enable-two-batch-overlap",
+            action="store_true",
+            help="Enabling two micro batches to overlap.",
+        )
+        parser.add_argument(
+            "--tbo-token-distribution-threshold",
+            type=float,
+            default=ServerArgs.tbo_token_distribution_threshold,
+            help="The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap.",
+        )
+        parser.add_argument(
+            "--enable-torch-compile",
+            action="store_true",
+            help="Optimize the model with torch.compile. Experimental feature.",
+        )
+        parser.add_argument(
+            "--torch-compile-max-bs",
+            type=int,
+            default=ServerArgs.torch_compile_max_bs,
+            help="Set the maximum batch size when using torch compile.",
+        )
+        parser.add_argument(
+            "--torchao-config",
+            type=str,
+            default=ServerArgs.torchao_config,
+            help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_row",
+        )
+        parser.add_argument(
+            "--enable-nan-detection",
+            action="store_true",
+            help="Enable the NaN detection for debugging purposes.",
+        )
+        parser.add_argument(
+            "--enable-p2p-check",
+            action="store_true",
+            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
+        )
+        parser.add_argument(
+            "--triton-attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
+            "This only affects Triton attention kernels.",
+        )
+        parser.add_argument(
+            "--triton-attention-num-kv-splits",
+            type=int,
+            default=ServerArgs.triton_attention_num_kv_splits,
+            help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
+        )
+        parser.add_argument(
+            "--num-continuous-decode-steps",
+            type=int,
+            default=ServerArgs.num_continuous_decode_steps,
+            help="Run multiple continuous decoding steps to reduce scheduling overhead. "
+            "This can potentially increase throughput but may also increase time-to-first-token latency. "
+            "The default value is 1, meaning only run one decoding step at a time.",
+        )
+        parser.add_argument(
+            "--delete-ckpt-after-loading",
+            action="store_true",
+            help="Delete the model checkpoint after loading the model.",
+        )
+        parser.add_argument(
+            "--enable-memory-saver",
+            action="store_true",
+            help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
+        )
+        parser.add_argument(
+            "--allow-auto-truncate",
+            action="store_true",
+            help="Allow automatically truncating requests that exceed the maximum input length instead of returning an error.",
+        )
+        parser.add_argument(
+            "--enable-custom-logit-processor",
+            action="store_true",
+            help="Enable users to pass custom logit processors to the server (disabled by default for security)",
+        )
+        parser.add_argument(
+            "--flashinfer-mla-disable-ragged",
+            action="store_true",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
+        )
+        parser.add_argument(
+            "--disable-shared-experts-fusion",
+            action="store_true",
+            help="Disable shared experts fusion optimization for deepseek v3/r1.",
+        )
+        parser.add_argument(
+            "--disable-chunked-prefix-cache",
+            action="store_true",
+            help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
+        )
+        parser.add_argument(
+            "--disable-fast-image-processor",
+            action="store_true",
+            help="Adopt base image processor instead of fast image processor.",
+        )
+        parser.add_argument(
+            "--enable-return-hidden-states",
+            action="store_true",
+            help="Enable returning hidden states with responses.",
+        )
+        parser.add_argument(
+            "--scheduler-recv-interval",
+            type=int,
+            default=ServerArgs.scheduler_recv_interval,
+            help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
+        )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )
+
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-prefill-only",
+            action="store_true",
+            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+        )
+
+        # PD disaggregation
+        parser.add_argument(
+            "--disaggregation-mode",
+            type=str,
+            default="null",
+            choices=["null", "prefill", "decode"],
+            help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
+        )
+        parser.add_argument(
+            "--disaggregation-transfer-backend",
+            type=str,
+            default=ServerArgs.disaggregation_transfer_backend,
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
+            help="The backend for disaggregation transfer. Default is mooncake.",
+        )
+        parser.add_argument(
+            "--disaggregation-bootstrap-port",
+            type=int,
+            default=ServerArgs.disaggregation_bootstrap_port,
+            help="Bootstrap server port on the prefill server. Default is 8998.",
+        )
+        parser.add_argument(
+            "--disaggregation-decode-tp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_tp,
+            help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-decode-dp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_dp,
+            help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-prefill-pp",
+            type=int,
+            default=ServerArgs.disaggregation_prefill_pp,
+            help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
+        )
+        parser.add_argument(
+            "--disaggregation-ib-device",
+            type=str,
+            default=ServerArgs.disaggregation_ib_device,
+            help="The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) "
+            "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
+        )
+        parser.add_argument(
+            "--num-reserved-decode-tokens",
+            type=int,
+            default=ServerArgs.num_reserved_decode_tokens,
+            help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+        )
+
+        # Custom weight loader
+        parser.add_argument(
+            "--custom-weight-loader",
+            type=str,
+            nargs="*",
+            default=None,
+            help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
+        )
+        parser.add_argument(
+            "--weight-loader-disable-mmap",
+            action="store_true",
+            help="Disable mmap while loading weight using safetensors.",
+        )
+
+        # For PD-Multiplexing
+        parser.add_argument(
+            "--enable-pdmux",
+            action="store_true",
+            help="Enable PD-Multiplexing, PD running on greenctx stream.",
+        )
+
+        parser.add_argument(
+            "--sm-group-num",
+            type=int,
+            default=ServerArgs.sm_group_num,
+            help="Number of sm partition groups.",
+        )
+
+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-cutlass-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="(Deprecated) Use triton moe grouped gemm kernel.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        args.tp_size = args.tensor_parallel_size
+        args.pp_size = args.pipeline_parallel_size
+        args.dp_size = args.data_parallel_size
+        args.ep_size = args.expert_parallel_size
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+    def url(self):
+        if is_valid_ipv6_address(self.host):
+            return f"http://[{self.host}]:{self.port}"
+        else:
+            return f"http://{self.host}:{self.port}"
+
+    def get_hf_config(self):
+        kwargs = {}
+        hf_config = get_config(
+            self.model_path,
+            trust_remote_code=self.trust_remote_code,
+            revision=self.revision,
+            model_override_args=json.loads(self.json_model_override_args),
+            **kwargs,
+        )
+        return hf_config
+
+    def check_server_args(self):
+        # Check parallel size constraints
+        assert (
+            self.tp_size * self.pp_size
+        ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+        if self.pp_size > 1:
+            assert (
+                self.disable_overlap_schedule
+                and self.speculative_algorithm is None
+                and not self.enable_mixed_chunk
+            ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
+        assert not (
+            self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
+        ), "multi-node data parallel is not supported unless dp attention!"
+
+        assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
+        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
+
+        assert self.moe_dense_tp_size in {
+            1,
+            None,
+        }, "moe_dense_tp_size only support 1 and None currently"
+
+        # Check LoRA
+        self.check_lora_server_args()
+
+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        # Skip validation if chunked prefill is disabled (i.e., size <= 0).
+        if self.chunked_prefill_size > 0:
+            assert (
+                self.chunked_prefill_size % self.page_size == 0
+            ), "chunked_prefill_size must be divisible by page_size"
+
+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )
+
+    def check_lora_server_args(self):
+        assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
+
+        # Enable LoRA if any LoRA paths are provided for backward compatibility.
+        if self.lora_paths:
+            if self.enable_lora is None:
+                self.enable_lora = True
+                logger.warning(
+                    "--enable-lora is set to True because --lora-paths is provided."
+                )
+            elif self.enable_lora is False:
+                logger.warning(
+                    "--enable-lora is set to False, any provided lora_paths will be ignored."
+                )
+
+        if self.enable_lora:
+            if isinstance(self.lora_paths, list):
+                lora_paths = self.lora_paths
+                self.lora_paths = []
+                for lora_path in lora_paths:
+                    if isinstance(lora_path, str):
+                        if "=" in lora_path:
+                            name, path = lora_path.split("=", 1)
+                            lora_ref = LoRARef(
+                                lora_name=name, lora_path=path, pinned=False
+                            )
+                        else:
+                            lora_ref = LoRARef(
+                                lora_name=lora_path, lora_path=lora_path, pinned=False
+                            )
+                    elif isinstance(lora_path, dict):
+                        assert (
+                            "lora_name" in lora_path and "lora_path" in lora_path
+                        ), f"When providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: {lora_path}"
+                        lora_ref = LoRARef(
+                            lora_name=lora_path["lora_name"],
+                            lora_path=lora_path["lora_path"],
+                            pinned=lora_path.get("pinned", False),
+                        )
+                    else:
+                        raise ValueError(
+                            f"Invalid type for item in --lora-paths list: {type(lora_path)}. "
+                            "Expected a string or a dictionary."
+                        )
+                    self.lora_paths.append(lora_ref)
+            elif isinstance(self.lora_paths, dict):
+                self.lora_paths = [
+                    LoRARef(lora_name=k, lora_path=v, pinned=False)
+                    for k, v in self.lora_paths.items()
+                ]
+            elif self.lora_paths is None:
+                self.lora_paths = []
+            else:
+                raise ValueError(
+                    f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
+                    "Expected a list or a dictionary."
+                )
+
+            # Expand target modules
+            if self.lora_target_modules:
+                self.lora_target_modules = set(self.lora_target_modules)
+                if "all" in self.lora_target_modules:
+                    assert (
+                        len(self.lora_target_modules) == 1
+                    ), "If 'all' is specified in --lora-target-modules, it should be the only module specified."
+                    self.lora_target_modules = set(SUPPORTED_LORA_TARGET_MODULES)
+
+            # Ensure sufficient information is provided for LoRA initialization.
+            assert self.lora_paths or (
+                self.max_lora_rank and self.lora_target_modules
+            ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
+
+            # Validate max_loaded_loras
+            if self.max_loaded_loras is not None:
+                assert self.max_loaded_loras >= self.max_loras_per_batch, (
+                    "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
+                    f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
+                )
+                assert len(self.lora_paths) <= self.max_loaded_loras, (
+                    "The number of LoRA paths should not exceed max_loaded_loras. "
+                    f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
+                )
+
+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return
+
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "customer",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
+
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
+
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "customer":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} customer rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} customer rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} customer rule bucket values should be non-negative"
+
+    def model_specific_adjustments(self):
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and is_sm90_supported():
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
+            supported_backends = ["triton", "trtllm_mha", "fa3"]
+            logger.info(
+                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
+            )
+            assert (
+                self.attention_backend in supported_backends
+            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+
+            if is_sm100_supported():
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.moe_runner_backend = "flashinfer_mxfp4"
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.moe_runner_backend == "triton_kernel":
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+        elif "Llama4" in model_arch:
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
+    def adjust_mem_fraction_for_vlm(self, model_config):
+        vision_config = getattr(model_config.hf_config, "vision_config", None)
+        if vision_config is None:
+            return
+
+        # roughly reduce the mem_fraction_static base on params of Vit
+        original_server_arg_mem_fraction = self.mem_fraction_static
+        # a base mem_fraction_static factor for regular Vit
+        base_mem_fraction_reduction_ratio = 0.95
+
+        vit_num_layers = getattr(vision_config, "num_hidden_layers", 24)
+        vit_hidden_size = getattr(vision_config, "hidden_size", 1024)
+
+        # baseline ViT params (ViT-L/14)
+        baseline_vit_layers = 24
+        baseline_vit_hidden_size = 1024
+
+        # weight params count
+        current_complexity_score = vit_num_layers * (vit_hidden_size**2)
+        baseline_complexity_score = baseline_vit_layers * (baseline_vit_hidden_size**2)
+        complexity_ratio = (
+            current_complexity_score / baseline_complexity_score
+            if baseline_complexity_score > 0
+            else 1.0
+        )
+
+        # every time the complexity grows 100%, adjust final factor for 10%
+        sensitivity_scale = 0.1
+        dynamic_adjustment_factor = 1.0 - sensitivity_scale * (complexity_ratio - 1.0)
+        dynamic_adjustment_factor = max(0.8, min(1.05, dynamic_adjustment_factor))
+
+        final_overall_factor = (
+            base_mem_fraction_reduction_ratio * dynamic_adjustment_factor
+        )
+        self.mem_fraction_static = (
+            original_server_arg_mem_fraction * final_overall_factor
+        )
+
+
+def prepare_server_args(argv: List[str]) -> ServerArgs:
+    """
+    Prepare the server arguments from the command line arguments.
+
+    Args:
+        args: The command line arguments. Typically, it should be `sys.argv[1:]`
+            to ensure compatibility with `parse_args` when no arguments are passed.
+
+    Returns:
+        The server arguments.
+    """
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    raw_args = parser.parse_args(argv)
+    server_args = ServerArgs.from_cli_args(raw_args)
+    return server_args
+
+
+ZMQ_TCP_PORT_DELTA = 233
+
+
+@dataclasses.dataclass
+class PortArgs:
+    # The ipc filename for tokenizer to receive inputs from detokenizer (zmq)
+    tokenizer_ipc_name: str
+    # The ipc filename for scheduler (rank 0) to receive inputs from tokenizer (zmq)
+    scheduler_input_ipc_name: str
+    # The ipc filename for detokenizer to receive inputs from scheduler (zmq)
+    detokenizer_ipc_name: str
+
+    # The port for nccl initialization (torch.dist)
+    nccl_port: int
+
+    # The ipc filename for rpc call between Engine and Scheduler
+    rpc_ipc_name: str
+
+    # The ipc filename for Scheduler to send metrics
+    metrics_ipc_name: str
+
+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
+    @staticmethod
+    def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
+        if server_args.nccl_port is None:
+            nccl_port = server_args.port + random.randint(100, 1000)
+            while True:
+                if is_port_available(nccl_port):
+                    break
+                if nccl_port < 60000:
+                    nccl_port += 42
+                else:
+                    nccl_port -= 43
+        else:
+            nccl_port = server_args.nccl_port
+
+        if not server_args.enable_dp_attention:
+            # Normal case, use IPC within a single node
+            return PortArgs(
+                tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                scheduler_input_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                detokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                nccl_port=nccl_port,
+                rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                tokenizer_worker_ipc_name=None,
+            )
+        else:
+            # DP attention. Use TCP + port to handle both single-node and multi-node.
+            if server_args.nnodes == 1 and server_args.dist_init_addr is None:
+                dist_init_addr = ("127.0.0.1", server_args.port + ZMQ_TCP_PORT_DELTA)
+            elif server_args.dist_init_addr.startswith("["):  # ipv6 address
+                port_num, host = configure_ipv6(server_args.dist_init_addr)
+                dist_init_addr = (host, str(port_num))
+            else:
+                dist_init_addr = server_args.dist_init_addr.split(":")
+
+            assert (
+                len(dist_init_addr) == 2
+            ), "please provide --dist-init-addr as host:port of head node"
+
+            dist_init_host, dist_init_port = dist_init_addr
+            port_base = int(dist_init_port) + 1
+            detokenizer_port = port_base + 1
+            rpc_port = port_base + 2
+            metrics_ipc_name = port_base + 3
+            if dp_rank is None:
+                # TokenizerManager to DataParallelController
+                scheduler_input_port = port_base + 4
+            else:
+                scheduler_input_port = port_base + 4 + 1 + dp_rank
+
+            return PortArgs(
+                tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
+                scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
+                detokenizer_ipc_name=f"tcp://{dist_init_host}:{detokenizer_port}",
+                nccl_port=nccl_port,
+                rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
+                metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+                tokenizer_worker_ipc_name=None,
+            )
+
+
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        lora_paths = []
+        if values:
+            assert isinstance(values, list), "Expected a list of LoRA paths."
+            for lora_path in values:
+                lora_path = lora_path.strip()
+                if lora_path.startswith("{") and lora_path.endswith("}"):
+                    obj = json.loads(lora_path)
+                    assert "lora_path" in obj and "lora_name" in obj, (
+                        f"{repr(lora_path)} looks like a JSON str, "
+                        "but it does not contain 'lora_name' and 'lora_path' keys."
+                    )
+                    lora_paths.append(obj)
+                else:
+                    lora_paths.append(lora_path)
+
+        setattr(namespace, self.dest, lora_paths)
+
+
+class DeprecatedAction(argparse.Action):
+    def __init__(self, option_strings, dest, nargs=0, **kwargs):
+        super(DeprecatedAction, self).__init__(
+            option_strings, dest, nargs=nargs, **kwargs
+        )
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        raise ValueError(self.help)
+
+
+def print_deprecated_warning(message: str):
+    logger.warning(f"\033[33m{message}\033[0m")
+
+
+def auto_choose_speculative_params(self: ServerArgs):
+    """
+    Automatically choose the parameters for speculative decoding.
+
+    You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
+    """
+    hf_config = self.get_hf_config()
+    arch = hf_config.architectures[0]
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
+    if arch in ["LlamaForCausalLM"]:
+        # The default value for llama
+        return (5, 4, 8)
+    elif arch in [
+        "DeepseekV3ForCausalLM",
+        "DeepseekV2ForCausalLM",
+        "GptOssForCausalLM",
+    ]:
+        # The default value for deepseek and gpt-oss
+        return (3, 1, 4)
+    elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
+        return (5, 4, 8)
+    else:
+        # The default value for all other models
+        return (5, 4, 8)
diff --git a/python/sglang/srt/speculative/build_eagle_tree.py b/python/sglang/srt/speculative/build_eagle_tree.py
new file mode 100644
index 000000000..fd27f414c
--- /dev/null
+++ b/python/sglang/srt/speculative/build_eagle_tree.py
@@ -0,0 +1,427 @@
+# NOTE: Please run this file to make sure the test cases are correct.
+
+import math
+from enum import IntEnum
+from typing import List, Optional
+
+import torch
+
+from sglang.srt.utils import is_cuda, is_hip
+
+if is_cuda() or is_hip():
+    from sgl_kernel import (
+        build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
+    )
+
+
+def build_tree_kernel_efficient_preprocess(
+    verified_id: torch.Tensor,
+    score_list: List[torch.Tensor],
+    token_list: List[torch.Tensor],
+    parents_list: List[torch.Tensor],
+    num_verify_tokens: int,
+):
+    score_list = torch.cat(score_list, dim=1).flatten(
+        1
+    )  # b, n, topk; n= 1 + (num_steps-1) * self.topk
+    ss_token_list = torch.cat(
+        token_list, dim=1
+    )  # b, (self.topk + (num_steps-1) * self.topk)
+    top_scores = torch.topk(score_list, num_verify_tokens - 1, dim=-1)
+    top_scores_index = top_scores.indices
+    top_scores_index = torch.sort(top_scores_index).values
+    draft_tokens = torch.gather(ss_token_list, index=top_scores_index, dim=1)
+    draft_tokens = torch.cat((verified_id.unsqueeze(1), draft_tokens), dim=1).flatten()
+
+    if len(parents_list) > 1:
+        parent_list = torch.cat(parents_list[:-1], dim=1)
+    else:
+        batch_size = parents_list[0].shape[0]
+        parent_list = torch.empty(batch_size, 0, device=parents_list[0].device)
+
+    return parent_list, top_scores_index, draft_tokens
+
+
+class TreeMaskMode(IntEnum):
+    FULL_MASK = 0
+    QLEN_ONLY = 1
+    QLEN_ONLY_BITPACKING = 2
+
+
+def build_tree_kernel_efficient(
+    verified_id: torch.Tensor,
+    score_list: List[torch.Tensor],
+    token_list: List[torch.Tensor],
+    parents_list: List[torch.Tensor],
+    seq_lens: torch.Tensor,
+    seq_lens_sum: int,
+    topk: int,
+    spec_steps: int,
+    num_verify_tokens: int,
+    tree_mask_mode: TreeMaskMode = TreeMaskMode.FULL_MASK,
+    tree_mask_buf: Optional[torch.Tensor] = None,
+    position_buf: Optional[torch.Tensor] = None,
+):
+    parent_list, top_scores_index, draft_tokens = (
+        build_tree_kernel_efficient_preprocess(
+            verified_id,
+            score_list,
+            token_list,
+            parents_list,
+            num_verify_tokens,
+        )
+    )
+
+    # seq_lens_sum == sum(seq_lens); seq_lens: sequence length without draft tokens
+    bs = seq_lens.numel()
+    device = seq_lens.device
+    # e.g. for bs=1, tree_mask: num_draft_token, seq_lens_sum + num_draft_token (flattened)
+    # where each row indicates the attending pattern of each draft token
+    # if use_partial_packed_tree_mask is True, tree_mask: num_draft_token (flattened, packed)
+    if tree_mask_buf is not None:
+        tree_mask = tree_mask_buf
+    elif tree_mask_mode == TreeMaskMode.QLEN_ONLY:
+        tree_mask = torch.full(
+            (num_verify_tokens * bs * num_verify_tokens,),
+            True,
+            dtype=torch.bool,
+            device=device,
+        )
+    elif tree_mask_mode == TreeMaskMode.QLEN_ONLY_BITPACKING:
+        packed_dtypes = [torch.uint8, torch.uint16, torch.uint32]
+        packed_dtype_idx = int(math.ceil(math.log2((num_verify_tokens + 7) // 8)))
+        tree_mask = torch.zeros(
+            (num_verify_tokens * bs,),
+            dtype=packed_dtypes[packed_dtype_idx],
+            device=device,
+        )
+    elif tree_mask_mode == TreeMaskMode.FULL_MASK:
+        tree_mask = torch.full(
+            (
+                seq_lens_sum * num_verify_tokens
+                + num_verify_tokens * num_verify_tokens * bs,
+            ),
+            True,
+            device=device,
+        )
+    else:
+        raise NotImplementedError(f"Invalid tree mask: {tree_mask_mode=}")
+
+    # TODO: make them torch.empty and fuse them into `sgl_build_tree_kernel`
+    retrive_index = torch.full(
+        (bs, num_verify_tokens), -1, device=device, dtype=torch.long
+    )
+    retrive_next_token = torch.full(
+        (bs, num_verify_tokens), -1, device=device, dtype=torch.long
+    )
+    retrive_next_sibling = torch.full(
+        (bs, num_verify_tokens), -1, device=device, dtype=torch.long
+    )
+    # position: where each token belongs to
+    # e.g. if depth of each draft token is [0, 1, 1, 2] and the prompt length is 7
+    # then, positions = [7, 8, 8, 9]
+    if position_buf is not None:
+        positions = position_buf
+    else:
+        positions = torch.empty(
+            (bs * num_verify_tokens,), device=device, dtype=torch.long
+        )
+
+    sgl_build_tree_kernel_efficient(
+        parent_list,
+        top_scores_index,
+        seq_lens,
+        tree_mask,
+        positions,
+        retrive_index,
+        retrive_next_token,
+        retrive_next_sibling,
+        topk,
+        spec_steps,
+        num_verify_tokens,
+        tree_mask_mode,
+    )
+    return (
+        tree_mask,
+        positions,
+        retrive_index,
+        retrive_next_token,
+        retrive_next_sibling,
+        draft_tokens,
+    )
+
+
+def test_build_tree_kernel_efficient():
+    verified_id = torch.tensor([29974, 13], device="cuda", dtype=torch.int32)
+    score_list = [
+        torch.tensor(
+            [
+                [[7.1127e-01, 2.8292e-01, 2.2995e-03, 1.7357e-03]],
+                [[9.7476e-01, 2.2219e-02, 6.5031e-04, 1.3212e-04]],
+            ],
+            dtype=torch.float32,
+            device="cuda",
+        ),
+        torch.tensor(
+            [
+                [
+                    [6.9142e-01, 1.2863e-02, 1.6873e-03, 1.1871e-03],
+                    [2.4787e-01, 1.8818e-02, 1.4204e-02, 9.2235e-04],
+                    [2.2971e-03, 1.6700e-06, 1.8737e-07, 8.3146e-08],
+                    [1.2771e-03, 2.4374e-04, 1.7832e-04, 1.1947e-05],
+                ],
+                [
+                    [8.4832e-02, 6.6068e-02, 5.8304e-02, 5.7851e-02],
+                    [2.3616e-03, 1.1243e-03, 5.4368e-04, 2.7768e-04],
+                    [2.5286e-04, 1.5578e-04, 2.8817e-05, 1.2888e-05],
+                    [1.2834e-04, 2.5417e-06, 1.1279e-06, 1.6088e-08],
+                ],
+            ],
+            dtype=torch.float32,
+            device="cuda",
+        ),
+        torch.tensor(
+            [
+                [
+                    [6.6438e-01, 2.6997e-02, 2.4236e-05, 4.0821e-06],
+                    [2.4402e-01, 2.8409e-03, 5.0935e-04, 2.9022e-04],
+                    [1.6178e-02, 2.0567e-03, 4.5892e-04, 3.0034e-05],
+                    [1.3023e-02, 5.0497e-04, 3.6371e-04, 8.7750e-05],
+                ],
+                [
+                    [2.3263e-02, 2.0054e-02, 9.3990e-03, 2.7783e-03],
+                    [6.4156e-02, 5.5506e-04, 1.0429e-04, 9.7211e-05],
+                    [4.9950e-02, 5.0630e-03, 9.0068e-04, 3.3656e-04],
+                    [7.5817e-03, 8.5731e-04, 6.9972e-04, 6.0793e-04],
+                ],
+            ],
+            dtype=torch.float32,
+            device="cuda",
+        ),
+        torch.tensor(
+            [
+                [
+                    [6.6420e-01, 1.0525e-04, 6.5864e-05, 1.2253e-06],
+                    [1.3019e-01, 1.0461e-01, 5.2083e-03, 1.6777e-03],
+                    [2.0103e-02, 6.7335e-03, 1.2625e-04, 1.0364e-05],
+                    [1.5142e-02, 7.0819e-04, 9.6595e-05, 8.7951e-05],
+                ],
+                [
+                    [5.8608e-02, 1.8840e-03, 7.8535e-04, 4.4400e-04],
+                    [1.2185e-02, 2.0684e-03, 1.7418e-03, 1.4327e-03],
+                    [6.2455e-03, 6.1487e-03, 2.6862e-03, 1.8034e-03],
+                    [1.8590e-03, 1.6151e-03, 1.2481e-03, 3.6038e-04],
+                ],
+            ],
+            dtype=torch.float32,
+            device="cuda",
+        ),
+    ]
+    token_list = [
+        torch.tensor(
+            [[29896, 29906, 29900, 29945], [13, 2, 29871, 28956]],
+            dtype=torch.int64,
+            device="cuda",
+        ),
+        torch.tensor(
+            [
+                [
+                    29889,
+                    29974,
+                    29945,
+                    29900,
+                    29974,
+                    29922,
+                    29930,
+                    29958,
+                    29889,
+                    29974,
+                    29930,
+                    29945,
+                    29974,
+                    29922,
+                    29930,
+                    29958,
+                ],
+                [
+                    22550,
+                    4136,
+                    16492,
+                    8439,
+                    29871,
+                    2,
+                    3001,
+                    13,
+                    2,
+                    13,
+                    29906,
+                    29946,
+                    2,
+                    13,
+                    29871,
+                    259,
+                ],
+            ],
+            device="cuda",
+        ),
+        torch.tensor(
+            [
+                [
+                    29946,
+                    29945,
+                    29953,
+                    29906,
+                    29896,
+                    29945,
+                    29900,
+                    29906,
+                    29896,
+                    29945,
+                    29906,
+                    29953,
+                    29896,
+                    29945,
+                    29906,
+                    29946,
+                ],
+                [
+                    29871,
+                    2,
+                    29901,
+                    29889,
+                    29871,
+                    2,
+                    395,
+                    259,
+                    29901,
+                    29871,
+                    2,
+                    29889,
+                    3001,
+                    1234,
+                    7146,
+                    2186,
+                ],
+            ],
+            device="cuda",
+        ),
+        torch.tensor(
+            [
+                [
+                    29946,
+                    29974,
+                    29945,
+                    29930,
+                    29889,
+                    29922,
+                    29974,
+                    29930,
+                    29974,
+                    29946,
+                    29930,
+                    29922,
+                    29889,
+                    29974,
+                    29945,
+                    29922,
+                ],
+                [
+                    29941,
+                    29906,
+                    2,
+                    29946,
+                    29871,
+                    450,
+                    319,
+                    14990,
+                    29946,
+                    29941,
+                    2,
+                    29906,
+                    29871,
+                    2,
+                    3001,
+                    13,
+                ],
+            ],
+            device="cuda",
+        ),
+    ]
+    parents_list = [
+        torch.tensor(
+            [[-1, 0, 1, 2, 3], [-1, 0, 1, 2, 3]], dtype=torch.int64, device="cuda"
+        ),
+        torch.tensor([[4, 8, 9, 10], [4, 5, 6, 7]], dtype=torch.int64, device="cuda"),
+        torch.tensor(
+            [[20, 24, 21, 28], [24, 28, 20, 21]], dtype=torch.int64, device="cuda"
+        ),
+        torch.tensor(
+            [[36, 40, 41, 44], [36, 40, 44, 45]], dtype=torch.int64, device="cuda"
+        ),
+    ]
+    seq_lens = torch.tensor([5, 10], dtype=torch.int64, device="cuda")
+    topk = 4
+    depth = 4
+    num_draft_token = 8
+
+    (
+        tree_mask,
+        position,
+        retrive_index,
+        retrive_next_token,
+        retrive_next_sibling,
+        draft_tokens,
+    ) = build_tree_kernel_efficient(
+        verified_id=verified_id,
+        score_list=score_list,
+        token_list=token_list,
+        parents_list=parents_list,
+        seq_lens=seq_lens,
+        seq_lens_sum=torch.sum(seq_lens).item(),
+        topk=topk,
+        spec_steps=depth,
+        num_verify_tokens=num_draft_token,
+    )
+
+    print("=========== build tree kernel efficient ==========")
+    print(f"{tree_mask=}")
+    print(f"{position=}")
+    print(f"{retrive_index=}")
+    print(f"{retrive_next_token=}")
+    print(f"{retrive_next_sibling=}")
+    print(f"{draft_tokens=}")
+    assert position.tolist() == [5, 6, 6, 7, 7, 8, 8, 9, 10, 11, 12, 12, 12, 12, 13, 14]
+    assert retrive_index.tolist() == [
+        [0, 1, 2, 3, 4, 5, 6, 7],
+        [8, 9, 10, 11, 12, 13, 14, 15],
+    ]
+    assert retrive_next_token.tolist() == [
+        [1, 3, 4, 5, 6, 7, -1, -1],
+        [1, 2, -1, 6, -1, -1, 7, -1],
+    ]
+    assert retrive_next_sibling.tolist() == [
+        [-1, 2, -1, -1, -1, -1, -1, -1],
+        [-1, -1, 3, 4, 5, -1, -1, -1],
+    ]
+    assert draft_tokens.tolist() == [
+        29974,
+        29896,
+        29906,
+        29889,
+        29974,
+        29946,
+        29896,
+        29946,
+        13,
+        13,
+        22550,
+        4136,
+        16492,
+        8439,
+        29871,
+        29941,
+    ]
+
+
+if __name__ == "__main__":
+    test_build_tree_kernel_efficient()
diff --git a/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
new file mode 100644
index 000000000..66d2d5a34
--- /dev/null
+++ b/python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
@@ -0,0 +1,355 @@
+from __future__ import annotations
+
+import bisect
+from typing import TYPE_CHECKING, Callable
+
+import torch
+
+from sglang.srt.layers.dp_attention import DpPaddingMode, set_dp_buffer_len
+from sglang.srt.model_executor.cuda_graph_runner import (
+    CUDA_GRAPH_CAPTURE_FAILED_MSG,
+    CudaGraphRunner,
+    get_batch_sizes_to_capture,
+    get_global_graph_memory_pool,
+    model_capture_mode,
+    set_global_graph_memory_pool,
+    set_torch_compile_config,
+)
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.speculative.eagle_utils import EagleDraftInput
+from sglang.srt.utils import (
+    require_attn_tp_gather,
+    require_gathered_buffer,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class EAGLEDraftCudaGraphRunner:
+    def __init__(self, eagle_worker: EAGLEWorker):
+        # Parse args
+        self.eagle_worker = eagle_worker
+        self.model_runner = model_runner = eagle_worker.model_runner
+        self.model_runner: EAGLEWorker
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
+        self.require_gathered_buffer = require_gathered_buffer(model_runner.server_args)
+        self.require_mlp_tp_gather = require_mlp_tp_gather(model_runner.server_args)
+        self.require_mlp_sync = require_mlp_sync(model_runner.server_args)
+        self.require_attn_tp_gather = require_attn_tp_gather(model_runner.server_args)
+        self.dp_size = self.model_runner.dp_size
+        self.tp_size = self.model_runner.tp_size
+        self.topk = model_runner.server_args.speculative_eagle_topk
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
+        server_args = model_runner.server_args
+
+        # Batch sizes to capture
+        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
+        self.num_tokens_per_bs = server_args.speculative_eagle_topk
+
+        # Attention backend
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+        self.model_runner.draft_attn_backend.init_cuda_graph_state(
+            self.max_bs, self.max_num_token
+        )
+        self.seq_len_fill_value = self.model_runner.draft_attn_backend.attn_backends[
+            0
+        ].get_cuda_graph_seq_len_fill_value()
+        self.seq_lens_cpu = torch.full(
+            (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+        )
+
+        if self.enable_torch_compile:
+            set_torch_compile_config()
+
+        # Graph inputs
+        with torch.device("cuda"):
+            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.seq_lens = torch.full(
+                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+            )
+            self.out_cache_loc = torch.zeros(
+                (self.max_num_token * self.speculative_num_steps,), dtype=torch.int64
+            )
+            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.mrope_positions = torch.zeros(
+                (3, self.max_num_token), dtype=torch.int64
+            )
+            self.topk_p = torch.zeros((self.max_bs, self.topk), dtype=torch.float32)
+            self.topk_index = torch.zeros((self.max_bs, self.topk), dtype=torch.int64)
+            self.hidden_states = torch.zeros(
+                (self.max_bs, self.model_runner.model_config.hidden_size),
+                dtype=self.model_runner.dtype,
+            )
+
+            if self.require_gathered_buffer:
+                if self.require_mlp_tp_gather:
+                    self.global_num_tokens_gpu = torch.zeros(
+                        (self.dp_size,), dtype=torch.int32
+                    )
+                    self.global_num_tokens_for_logprob_gpu = torch.zeros(
+                        (self.dp_size,), dtype=torch.int32
+                    )
+                else:
+                    assert self.require_attn_tp_gather
+                    self.global_num_tokens_gpu = torch.zeros((1,), dtype=torch.int32)
+                    self.global_num_tokens_for_logprob_gpu = torch.zeros(
+                        (1,), dtype=torch.int32
+                    )
+            else:
+                self.global_num_tokens_gpu = None
+                self.global_num_tokens_for_logprob_gpu = None
+
+        # Capture
+        try:
+            with model_capture_mode():
+                self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
+            )
+
+    def can_run(self, forward_batch: ForwardBatch):
+        if self.require_mlp_tp_gather:
+            cuda_graph_bs = (
+                max(forward_batch.global_num_tokens_cpu) // self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                else max(forward_batch.global_num_tokens_cpu)
+            )
+        else:
+            cuda_graph_bs = forward_batch.batch_size
+
+        is_bs_supported = (
+            cuda_graph_bs in self.graphs
+            if self.disable_padding
+            else cuda_graph_bs <= self.max_bs
+        )
+
+        if self.require_mlp_sync:
+            is_bs_supported = is_bs_supported and forward_batch.can_run_dp_cuda_graph
+
+        return is_bs_supported
+
+    def capture(self):
+        CudaGraphRunner.capture(self)
+
+    def capture_one_batch_size(self, num_seqs: int, forward: Callable):
+        graph = torch.cuda.CUDAGraph()
+        stream = self.stream
+        num_tokens = num_seqs * self.num_tokens_per_bs
+
+        # Graph inputs
+        req_pool_indices = self.req_pool_indices[:num_seqs]
+        seq_lens = self.seq_lens[:num_seqs]
+        out_cache_loc = self.out_cache_loc[: num_tokens * self.speculative_num_steps]
+        positions = self.positions[:num_tokens]
+        mrope_positions = self.mrope_positions[:, :num_tokens]
+        topk_p = self.topk_p[:num_seqs]
+        topk_index = self.topk_index[:num_seqs]
+        hidden_states = self.hidden_states[:num_seqs]
+
+        if self.require_mlp_tp_gather:
+            self.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.dp_size,
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            self.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.dp_size,
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            global_num_tokens = self.global_num_tokens_gpu
+            global_dp_buffer_len = num_tokens * self.dp_size
+            global_num_tokens_for_logprob = self.global_num_tokens_for_logprob_gpu
+        elif self.require_attn_tp_gather:
+            self.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            self.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            global_num_tokens = self.global_num_tokens_gpu
+            global_dp_buffer_len = num_tokens
+            global_num_tokens_for_logprob = self.global_num_tokens_for_logprob_gpu
+        else:
+            global_num_tokens = None
+            global_dp_buffer_len = None
+            global_num_tokens_for_logprob = None
+
+        spec_info = EagleDraftInput(
+            topk_p=topk_p,
+            topk_index=topk_index,
+            hidden_states=hidden_states,
+            capture_hidden_mode=CaptureHiddenMode.LAST,
+        )
+
+        # Forward batch
+        forward_batch = ForwardBatch(
+            forward_mode=ForwardMode.DECODE,
+            batch_size=num_seqs,
+            input_ids=None,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum().item(),
+            return_logprob=False,
+            positions=positions,
+            mrope_positions=mrope_positions,
+            global_num_tokens_gpu=global_num_tokens,
+            dp_padding_mode=DpPaddingMode.get_default_mode_in_cuda_graph(),
+            global_dp_buffer_len=global_dp_buffer_len,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=(
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            ),
+            global_num_tokens_for_logprob_gpu=global_num_tokens_for_logprob,
+        )
+
+        # Attention backend
+        self.model_runner.draft_attn_backend.init_forward_metadata_capture_cuda_graph(
+            forward_batch
+        )
+
+        # Run and capture
+        def run_once():
+            # Clean intermediate result cache for DP attention
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            set_dp_buffer_len(global_dp_buffer_len, num_tokens)
+
+            # Backup two fields, which will be modified in-place in `draft_forward`.
+            output_cache_loc_backup = forward_batch.out_cache_loc
+            hidden_states_backup = forward_batch.spec_info.hidden_states
+
+            ret = self.eagle_worker.draft_forward(forward_batch)
+
+            forward_batch.out_cache_loc = output_cache_loc_backup
+            forward_batch.spec_info.hidden_states = hidden_states_backup
+            return ret
+
+        for _ in range(2):
+            torch.cuda.synchronize()
+            self.model_runner.tp_group.barrier()
+
+            run_once()
+
+        with torch.cuda.graph(
+            graph, pool=get_global_graph_memory_pool(), stream=stream
+        ):
+            out = run_once()
+
+        set_global_graph_memory_pool(graph.pool())
+        return graph, out
+
+    def _postprocess_output_to_raw_bs(self, out, raw_bs):
+        score_list, token_list, parents_list = out
+        score_list = [x[:raw_bs] for x in score_list]
+        token_list = [x[:raw_bs] for x in token_list]
+        parents_list = [x[:raw_bs] for x in parents_list]
+        return (score_list, token_list, parents_list)
+
+    def replay(self, forward_batch: ForwardBatch):
+        assert forward_batch.out_cache_loc is not None
+        raw_bs = forward_batch.batch_size
+        raw_num_token = raw_bs * self.num_tokens_per_bs
+
+        # Pad
+        if self.require_mlp_tp_gather:
+            max_num_tokens = max(forward_batch.global_num_tokens_cpu)
+            max_batch_size = (
+                max_num_tokens // self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                else max_num_tokens
+            )
+            index = bisect.bisect_left(self.capture_bs, max_batch_size)
+        else:
+            index = bisect.bisect_left(self.capture_bs, raw_bs)
+        bs = self.capture_bs[index]
+        if bs != raw_bs:
+            self.seq_lens.fill_(self.seq_len_fill_value)
+            self.out_cache_loc.zero_()
+
+        num_tokens = bs * self.num_tokens_per_bs
+
+        # Common inputs
+        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
+        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
+        self.out_cache_loc[: raw_num_token * self.speculative_num_steps].copy_(
+            forward_batch.out_cache_loc
+        )
+        self.positions[:raw_num_token].copy_(forward_batch.positions)
+        self.topk_p[:raw_bs].copy_(forward_batch.spec_info.topk_p)
+        self.topk_index[:raw_bs].copy_(forward_batch.spec_info.topk_index)
+        self.hidden_states[:raw_bs].copy_(forward_batch.spec_info.hidden_states)
+
+        # TODO(ch-wan): support num_token_non_padded
+        if self.require_gathered_buffer:
+            self.global_num_tokens_gpu.fill_(bs * self.num_tokens_per_bs)
+            self.global_num_tokens_for_logprob_gpu.fill_(bs * self.num_tokens_per_bs)
+
+        # Attention backend
+        if bs != raw_bs:
+            forward_batch.batch_size = bs
+            forward_batch.seq_lens = self.seq_lens[:bs]
+            forward_batch.req_pool_indices = self.req_pool_indices[:bs]
+            forward_batch.positions = self.positions[:num_tokens]
+
+        if forward_batch.seq_lens_cpu is not None:
+            if bs != raw_bs:
+                self.seq_lens_cpu.fill_(self.seq_len_fill_value)
+            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
+            forward_batch.seq_lens_cpu = self.seq_lens_cpu[:bs]
+
+        self.model_runner.draft_attn_backend.init_forward_metadata_replay_cuda_graph(
+            forward_batch, bs
+        )
+        # TODO: The forward_batch.seq_len_sum might need to be updated to reflect the padding in the cuda graph
+
+        # Replay
+        self.graphs[bs].replay()
+        out = self.output_buffers[bs]
+
+        if bs != raw_bs:
+            out = self._postprocess_output_to_raw_bs(out, raw_bs)
+            forward_batch.batch_size = raw_bs
+            forward_batch.positions = self.positions[:raw_num_token]
+            forward_batch.seq_lens = self.seq_lens[:raw_bs]
+            forward_batch.req_pool_indices = self.req_pool_indices[:raw_bs]
+            if forward_batch.seq_lens_cpu is not None:
+                forward_batch.seq_lens_cpu = self.seq_lens_cpu[:raw_bs]
+
+        return out
diff --git a/python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py b/python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
new file mode 100644
index 000000000..8340b0ca8
--- /dev/null
+++ b/python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
@@ -0,0 +1,391 @@
+from __future__ import annotations
+
+import bisect
+from typing import TYPE_CHECKING, Callable
+
+import torch
+
+from sglang.srt.layers.dp_attention import DpPaddingMode, set_dp_buffer_len
+from sglang.srt.model_executor.cuda_graph_runner import (
+    CUDA_GRAPH_CAPTURE_FAILED_MSG,
+    CudaGraphRunner,
+    LogitsProcessorOutput,
+    get_batch_sizes_to_capture,
+    get_global_graph_memory_pool,
+    model_capture_mode,
+    set_global_graph_memory_pool,
+    set_torch_compile_config,
+)
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, fast_topk
+from sglang.srt.utils import (
+    require_attn_tp_gather,
+    require_gathered_buffer,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+
+class EAGLEDraftExtendCudaGraphRunner:
+    def __init__(self, eagle_worker: EAGLEWorker):
+        # Parse args
+        self.eagle_worker = eagle_worker
+        self.model_runner = model_runner = eagle_worker.model_runner
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.require_gathered_buffer = require_gathered_buffer(model_runner.server_args)
+        self.require_mlp_tp_gather = require_mlp_tp_gather(model_runner.server_args)
+        self.require_mlp_sync = require_mlp_sync(model_runner.server_args)
+        self.require_attn_tp_gather = require_attn_tp_gather(model_runner.server_args)
+        self.tp_size = self.model_runner.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.topk = model_runner.server_args.speculative_eagle_topk
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
+        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
+        self.padded_static_len = -1
+
+        # Attention backend
+        self.num_tokens_per_bs = self.speculative_num_steps + 1
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+
+        self.eagle_worker.draft_extend_attn_backend.init_cuda_graph_state(
+            self.max_bs, self.max_num_token
+        )
+        self.seq_len_fill_value = (
+            self.eagle_worker.draft_extend_attn_backend.get_cuda_graph_seq_len_fill_value()
+        )
+        self.seq_lens_cpu = torch.full(
+            (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+        )
+
+        if self.enable_torch_compile:
+            set_torch_compile_config()
+
+        # Graph inputs
+        with torch.device("cuda"):
+            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.out_cache_loc = torch.ones((self.max_num_token,), dtype=torch.int64)
+            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.mrope_positions = torch.zeros(
+                (3, self.max_num_token), dtype=torch.int64
+            )
+
+            if self.eagle_worker.speculative_algorithm.is_eagle3():
+                self.hidden_states = torch.zeros(
+                    (
+                        self.max_num_token,
+                        (
+                            self.model_runner.model_config.hf_config.target_hidden_size
+                            * 3
+                            if hasattr(
+                                self.model_runner.model_config.hf_config,
+                                "target_hidden_size",
+                            )
+                            else self.model_runner.model_config.hidden_size * 3
+                        ),
+                    ),
+                    dtype=self.model_runner.dtype,
+                )
+            else:
+                self.hidden_states = torch.zeros(
+                    (self.max_num_token, self.model_runner.model_config.hidden_size),
+                    dtype=self.model_runner.dtype,
+                )
+
+            self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
+            self.extend_seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
+            self.accept_length = torch.full(
+                (self.max_bs,), self.num_tokens_per_bs, dtype=torch.int32
+            )
+
+            if self.require_gathered_buffer:
+                if self.require_mlp_tp_gather:
+                    self.global_num_tokens_gpu = torch.zeros(
+                        (self.dp_size,), dtype=torch.int32
+                    )
+                    self.global_num_tokens_for_logprob_gpu = torch.zeros(
+                        (self.dp_size,), dtype=torch.int32
+                    )
+                else:
+                    assert self.require_attn_tp_gather
+                    self.global_num_tokens_gpu = torch.zeros((1,), dtype=torch.int32)
+                    self.global_num_tokens_for_logprob_gpu = torch.zeros(
+                        (1,), dtype=torch.int32
+                    )
+            else:
+                self.global_num_tokens_gpu = None
+                self.global_num_tokens_for_logprob_gpu = None
+
+            if hasattr(
+                self.model_runner.model_config.hf_config, "draft_vocab_size"
+            ):  # llama_eagle
+                vocab_size = self.model_runner.model_config.hf_config.draft_vocab_size
+            elif hasattr(
+                self.model_runner.model_config.hf_config, "hot_vocab_size"
+            ):  # llama_eagle3
+                vocab_size = self.model_runner.model_config.hf_config.hot_vocab_size
+            else:
+                vocab_size = self.model_runner.model_config.vocab_size
+
+            self.next_token_logits_buffer = torch.zeros(
+                (self.max_bs, vocab_size),
+                dtype=torch.float,
+            )
+
+        # Capture
+        try:
+            with model_capture_mode():
+                self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
+            )
+
+    def can_run(self, forward_batch: ForwardBatch):
+        if self.require_mlp_tp_gather:
+            cuda_graph_bs = (
+                max(forward_batch.global_num_tokens_cpu) // self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                else max(forward_batch.global_num_tokens_cpu)
+            )
+        else:
+            cuda_graph_bs = forward_batch.seq_lens.numel()
+
+        is_bs_supported = (
+            cuda_graph_bs in self.graphs
+            if self.disable_padding
+            else cuda_graph_bs <= self.max_bs
+        )
+
+        if self.require_mlp_sync:
+            is_bs_supported = is_bs_supported and forward_batch.can_run_dp_cuda_graph
+
+        return is_bs_supported
+
+    def capture(self):
+        CudaGraphRunner.capture(self)
+
+    def capture_one_batch_size(self, bs: int, forward: Callable):
+        graph = torch.cuda.CUDAGraph()
+        stream = self.stream
+        num_tokens = bs * self.num_tokens_per_bs
+
+        # Graph inputs
+        input_ids = self.input_ids[:num_tokens]
+        req_pool_indices = self.req_pool_indices[:bs]
+        seq_lens = self.seq_lens[:bs]
+        extend_seq_lens = self.extend_seq_lens[:bs]
+        accept_length = self.accept_length[:bs]
+        out_cache_loc = self.out_cache_loc[:num_tokens]
+        positions = self.positions[:num_tokens]
+        mrope_positions = self.mrope_positions[:, :num_tokens]
+        hidden_states = self.hidden_states[:num_tokens]
+        next_token_logits_buffer = self.next_token_logits_buffer[:bs]
+
+        if self.require_mlp_tp_gather:
+            self.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.dp_size,
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            self.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [bs] * self.dp_size,
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens * self.dp_size
+        elif self.require_attn_tp_gather:
+            self.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            self.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [bs],
+                    dtype=torch.int32,
+                    device=self.input_ids.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens
+        else:
+            global_dp_buffer_len = None
+
+        spec_info = EagleDraftInput(
+            hidden_states=hidden_states,
+            accept_length=accept_length,
+        )
+        spec_info.positions = None
+
+        # Forward batch
+        forward_batch = ForwardBatch(
+            forward_mode=ForwardMode.DRAFT_EXTEND,
+            batch_size=bs,
+            input_ids=input_ids,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum().item(),
+            return_logprob=False,
+            positions=positions,
+            mrope_positions=mrope_positions,
+            global_num_tokens_gpu=self.global_num_tokens_gpu,
+            global_num_tokens_for_logprob_gpu=self.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=DpPaddingMode.get_default_mode_in_cuda_graph(),
+            global_dp_buffer_len=global_dp_buffer_len,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=CaptureHiddenMode.LAST,
+            attn_backend=self.eagle_worker.draft_extend_attn_backend,
+            extend_seq_lens=extend_seq_lens,
+            padded_static_len=self.padded_static_len,
+        )
+
+        self.eagle_worker.draft_extend_attn_backend.init_forward_metadata_capture_cuda_graph(
+            bs=bs,
+            num_tokens=num_tokens,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=None,
+            forward_mode=ForwardMode.DRAFT_EXTEND,
+            spec_info=spec_info,
+        )
+
+        # Run and capture
+        def run_once():
+            # Clean intermediate result cache for DP attention
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            set_dp_buffer_len(global_dp_buffer_len, num_tokens)
+
+            # Backup two fields, which will be modified in-place in `draft_forward`.
+            output_cache_loc_backup = forward_batch.out_cache_loc
+            hidden_states_backup = forward_batch.spec_info.hidden_states
+
+            ret = self.eagle_worker.draft_model_runner.model.forward(
+                forward_batch.input_ids,
+                forward_batch.positions,
+                forward_batch,
+            )
+            probs = torch.softmax(ret.next_token_logits, dim=-1)
+            ret.topk_p, ret.topk_index = fast_topk(probs, self.topk, dim=-1)
+
+            forward_batch.out_cache_loc = output_cache_loc_backup
+            forward_batch.spec_info.hidden_states = hidden_states_backup
+            return ret
+
+        for _ in range(2):
+            torch.cuda.synchronize()
+            self.model_runner.tp_group.barrier()
+
+            run_once()
+
+        with torch.cuda.graph(
+            graph, pool=get_global_graph_memory_pool(), stream=stream
+        ):
+            out = run_once()
+
+        set_global_graph_memory_pool(graph.pool())
+        return graph, out
+
+    def replay(self, forward_batch: ForwardBatch):
+        assert forward_batch.out_cache_loc is not None
+        # batch_size and num_seqs can be different in case there are finished examples
+        # in the batch, which will not be counted as num_seqs
+        raw_bs = forward_batch.batch_size
+        num_tokens = forward_batch.input_ids.shape[0]
+        if self.require_mlp_tp_gather:
+            max_num_tokens = max(forward_batch.global_num_tokens_cpu)
+            max_batch_size = (
+                max_num_tokens // self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                else max_num_tokens
+            )
+            index = bisect.bisect_left(self.capture_bs, max_batch_size)
+        else:
+            index = bisect.bisect_left(self.capture_bs, raw_bs)
+
+        bs = self.capture_bs[index]
+        if bs * self.num_tokens_per_bs != num_tokens:
+            self.seq_lens.fill_(self.seq_len_fill_value)
+            self.out_cache_loc.zero_()
+            self.accept_length.fill_(1)
+            self.extend_seq_lens.fill_(1)
+
+        # Common inputs
+        self.input_ids[:num_tokens].copy_(forward_batch.input_ids)
+        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
+        if forward_batch.extend_seq_lens is not None:
+            self.extend_seq_lens[:raw_bs].copy_(forward_batch.extend_seq_lens)
+        self.out_cache_loc[:num_tokens].copy_(forward_batch.out_cache_loc)
+        self.positions[:num_tokens].copy_(forward_batch.positions)
+        if (
+            forward_batch.spec_info.hidden_states.shape[1]
+            == self.hidden_states.shape[1]
+        ):
+            self.hidden_states[:num_tokens].copy_(forward_batch.spec_info.hidden_states)
+        if forward_batch.spec_info.accept_length is not None:
+            self.accept_length[:raw_bs].copy_(forward_batch.spec_info.accept_length)
+        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
+
+        # TODO(ch-wan): support num_token_non_padded
+        if self.require_gathered_buffer:
+            self.global_num_tokens_gpu.fill_(bs * self.num_tokens_per_bs)
+            self.global_num_tokens_for_logprob_gpu.fill_(bs)
+
+        if forward_batch.seq_lens_cpu is not None:
+            if bs != raw_bs:
+                self.seq_lens_cpu.fill_(self.seq_len_fill_value)
+            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
+
+        if bs != raw_bs:
+            forward_batch.spec_info.positions = self.positions[:num_tokens]
+            forward_batch.spec_info.accept_length = self.accept_length[:bs]
+
+        self.eagle_worker.draft_extend_attn_backend.init_forward_metadata_replay_cuda_graph(
+            bs=bs,
+            req_pool_indices=self.req_pool_indices,
+            seq_lens=self.seq_lens,
+            seq_lens_sum=forward_batch.seq_lens_sum
+            + (bs - raw_bs) * self.seq_len_fill_value,
+            encoder_lens=None,
+            forward_mode=ForwardMode.DRAFT_EXTEND,
+            spec_info=forward_batch.spec_info,
+            seq_lens_cpu=self.seq_lens_cpu,
+        )
+
+        # Replay
+        self.graphs[bs].replay()
+        out = self.output_buffers[bs]
+        if bs != raw_bs:
+            forward_batch.spec_info.accept_length = self.accept_length[:raw_bs]
+            out_copy = out
+            out = LogitsProcessorOutput(
+                next_token_logits=out.next_token_logits[:raw_bs],
+                hidden_states=out.hidden_states[:raw_bs],
+            )
+            out.topk_p = out_copy.topk_p[:raw_bs]
+            out.topk_index = out_copy.topk_index[:raw_bs]
+        return out
diff --git a/python/sglang/srt/speculative/eagle_utils.py b/python/sglang/srt/speculative/eagle_utils.py
new file mode 100644
index 000000000..14450e9b1
--- /dev/null
+++ b/python/sglang/srt/speculative/eagle_utils.py
@@ -0,0 +1,1297 @@
+from __future__ import annotations
+
+import copy
+import logging
+import os
+import time
+from dataclasses import dataclass
+from typing import List, Optional
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.sampler import apply_custom_logit_processor
+from sglang.srt.managers.schedule_batch import (
+    Req,
+    ScheduleBatch,
+    get_last_loc,
+    global_server_args_dict,
+)
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
+from sglang.srt.utils import is_cuda, is_hip, next_power_of_2
+
+logger = logging.getLogger(__name__)
+
+if is_cuda():
+    from sgl_kernel import (
+        fast_topk,
+        top_k_renorm_prob,
+        top_p_renorm_prob,
+        tree_speculative_sampling_target_only,
+        verify_tree_greedy,
+    )
+elif is_hip():
+    from sgl_kernel import fast_topk, verify_tree_greedy
+
+
+logger = logging.getLogger(__name__)
+
+
+# Simulate acceptance length for benchmarking purposes
+SIMULATE_ACC_LEN = os.environ.get("SIMULATE_ACC_LEN")
+SIMULATE_ACC_METHOD = os.environ.get("SIMULATE_ACC_METHOD", "multinomial")
+
+TREE_TRAVERSE_TIME_THRESHOLD = 1  # TODO: set this properly
+
+TREE_SPEC_KERNEL_AVAILABLE = "tree_speculative_sampling_target_only" in globals()
+
+
+@dataclass
+class EagleDraftInput:
+    # The inputs for decode
+    # shape: (b, topk)
+    topk_p: torch.Tensor = None
+    topk_index: torch.Tensor = None
+    # shape: (b, hidden_size)
+    hidden_states: torch.Tensor = None
+    capture_hidden_mode: CaptureHiddenMode = CaptureHiddenMode.FULL
+
+    # Inputs for extend
+    # shape: (b,)
+    verified_id: torch.Tensor = None
+    accept_length: torch.Tensor = None
+    accept_length_cpu: List[int] = None
+
+    # Inputs for the attention backends
+    # shape: (b + 1,)
+    kv_indptr: torch.Tensor = None
+    kv_indices: torch.Tensor = None
+
+    # Shape info for padding
+    num_tokens_per_batch: int = -1
+    num_tokens_for_logprob_per_batch: int = -1
+
+    # Inputs for draft extend
+    # shape: (b,)
+    seq_lens_for_draft_extend: torch.Tensor = None
+    req_pool_indices_for_draft_extend: torch.Tensor = None
+
+    def prepare_for_extend(self, batch: ScheduleBatch):
+
+        if batch.forward_mode.is_idle():
+            return
+
+        # Prefill only generate 1 token.
+        assert len(self.verified_id) == len(batch.seq_lens)
+
+        pt = 0
+        for i, extend_len in enumerate(batch.extend_lens):
+            input_ids = batch.input_ids[pt : pt + extend_len]
+            batch.input_ids[pt : pt + extend_len] = torch.cat(
+                (input_ids[1:], self.verified_id[i].reshape(1))
+            )
+            pt += extend_len
+
+    @classmethod
+    def create_idle_input(
+        cls,
+        device: torch.device,
+        hidden_size: int,
+        dtype: torch.dtype,
+        topk: int,
+        capture_hidden_mode: CaptureHiddenMode,
+    ):
+        return cls(
+            verified_id=torch.empty((0,), device=device, dtype=torch.int32),
+            hidden_states=torch.empty((0, hidden_size), device=device, dtype=dtype),
+            topk_p=torch.empty((0, topk), device=device, dtype=torch.float32),
+            topk_index=torch.empty((0, topk), device=device, dtype=torch.int64),
+            capture_hidden_mode=capture_hidden_mode,
+            accept_length=torch.empty((0,), device=device, dtype=torch.int32),
+            accept_length_cpu=[],
+        )
+
+    def prepare_extend_after_decode(
+        self,
+        batch: ScheduleBatch,
+        speculative_num_steps: int,
+    ):
+
+        if batch.forward_mode.is_idle():
+            return
+
+        batch.input_ids = self.verified_id
+        batch.extend_lens = [x + 1 for x in batch.spec_info.accept_length_cpu]
+        batch.extend_num_tokens = sum(batch.extend_lens)
+        batch.seq_lens = batch.spec_info.seq_lens_for_draft_extend
+        batch.req_pool_indices = batch.spec_info.req_pool_indices_for_draft_extend
+        batch.return_logprob = False
+        batch.return_hidden_states = False
+
+        self.capture_hidden_mode = CaptureHiddenMode.LAST
+        self.accept_length.add_(1)
+        self.positions = torch.empty_like(batch.input_ids, dtype=torch.long)
+        self.verified_id = torch.empty_like(self.accept_length, dtype=torch.int32)
+
+        create_extend_after_decode_spec_info[(len(batch.seq_lens),)](
+            batch.input_ids,
+            batch.seq_lens,
+            self.accept_length,
+            self.positions,
+            self.verified_id,
+            next_power_of_2(max(speculative_num_steps + 1, len(batch.seq_lens))),
+        )
+
+    def generate_attn_arg_prefill(
+        self,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        req_to_token: torch.Tensor,
+    ):
+        bs = self.accept_length.numel()
+        qo_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr[1:] = torch.cumsum(self.accept_length, dim=0)
+        cum_kv_seq_len = torch.zeros((bs + 1,), dtype=torch.int32, device="cuda")
+        cum_kv_seq_len[1:] = torch.cumsum(paged_kernel_lens, dim=0)
+
+        if paged_kernel_lens_sum is None:
+            paged_kernel_lens_sum = cum_kv_seq_len[-1]
+
+        kv_indices = torch.empty(
+            paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
+        )
+
+        create_flashinfer_kv_indices_triton[(bs,)](
+            req_to_token,
+            req_pool_indices,
+            paged_kernel_lens,
+            cum_kv_seq_len,
+            None,
+            kv_indices,
+            req_to_token.size(1),
+        )
+        return kv_indices, cum_kv_seq_len, qo_indptr, None
+
+    def filter_batch(self, new_indices: torch.Tensor, has_been_filtered: bool = True):
+        if has_been_filtered:
+            # in eagle_utils.py:verify, we have already filtered the batch by `unfinished_index`
+            # therefore, we don't need to filter the batch again in scheduler
+            if len(new_indices) != len(self.topk_p):
+                logger.warning(
+                    f"length of new_indices: {len(new_indices)} != length of topk_p: {len(self.topk_p)}, this should not happen"
+                )
+            self.topk_p = self.topk_p[: len(new_indices)]
+            self.topk_index = self.topk_index[: len(new_indices)]
+            self.hidden_states = self.hidden_states[: len(new_indices)]
+            self.verified_id = self.verified_id[: len(new_indices)]
+        else:
+            # in some cases(e.g draft_extend), we have not filtered the batch by `unfinished_index`
+            self.topk_p = self.topk_p[new_indices]
+            self.topk_index = self.topk_index[new_indices]
+            self.hidden_states = self.hidden_states[new_indices]
+            self.verified_id = self.verified_id[new_indices]
+
+    def merge_batch(self, spec_info: EagleDraftInput):
+        if self.hidden_states is None:
+            self.hidden_states = spec_info.hidden_states
+            self.verified_id = spec_info.verified_id
+            self.topk_p = spec_info.topk_p
+            self.topk_index = spec_info.topk_index
+            return
+        if spec_info.hidden_states is None:
+            return
+        self.hidden_states = torch.cat(
+            [self.hidden_states, spec_info.hidden_states], axis=0
+        )
+        self.verified_id = torch.cat([self.verified_id, spec_info.verified_id], axis=0)
+        self.topk_p = torch.cat([self.topk_p, spec_info.topk_p])
+        self.topk_index = torch.cat([self.topk_index, spec_info.topk_index])
+
+
+@dataclass
+class EagleVerifyOutput:
+    # Draft input batch
+    draft_input: EagleDraftInput
+    # Logit outputs from target worker
+    logits_output: LogitsProcessorOutput
+    # Accepted token ids including the bonus token
+    verified_id: torch.Tensor
+    # Accepted token length per sequence in a batch in CPU.
+    accept_length_per_req_cpu: List[int]
+    # Accepted indices from logits_output.next_token_logits
+    accepted_indices: torch.Tensor
+
+
+@dataclass
+class EagleVerifyInput:
+    draft_token: torch.Tensor
+    custom_mask: torch.Tensor
+    positions: torch.Tensor
+    retrive_index: torch.Tensor
+    retrive_next_token: torch.Tensor
+    retrive_next_sibling: torch.Tensor
+    retrive_cum_len: torch.Tensor
+    spec_steps: int
+    topk: int
+    draft_token_num: int
+    capture_hidden_mode: CaptureHiddenMode
+    seq_lens_sum: int
+    seq_lens_cpu: torch.Tensor
+    grammar: BaseGrammarObject = None
+
+    @classmethod
+    def create_idle_input(cls, topk: int, spec_steps: int, num_verify_tokens: int):
+        return cls(
+            draft_token=torch.empty((0,), dtype=torch.long, device="cuda"),
+            custom_mask=torch.full((0,), True, dtype=torch.bool, device="cuda"),
+            positions=torch.empty((0,), dtype=torch.int64, device="cuda"),
+            retrive_index=torch.full(
+                (0, num_verify_tokens), -1, dtype=torch.long, device="cuda"
+            ),
+            retrive_next_token=torch.full(
+                (0, num_verify_tokens), -1, dtype=torch.long, device="cuda"
+            ),
+            retrive_next_sibling=torch.full(
+                (0, num_verify_tokens), -1, dtype=torch.long, device="cuda"
+            ),
+            retrive_cum_len=None,
+            topk=topk,
+            draft_token_num=num_verify_tokens,
+            spec_steps=spec_steps,
+            capture_hidden_mode=CaptureHiddenMode.FULL,
+            seq_lens_sum=0,
+            seq_lens_cpu=torch.empty((0,), dtype=torch.int32),
+        )
+
+    def prepare_for_verify(self, batch: ScheduleBatch, page_size: int):
+
+        if batch.forward_mode.is_idle():
+            return
+
+        batch.input_ids = self.draft_token
+
+        if page_size == 1:
+            batch.out_cache_loc = batch.alloc_token_slots(len(batch.input_ids))
+            end_offset = batch.seq_lens + self.draft_token_num
+        else:
+            prefix_lens = batch.seq_lens
+            end_offset = prefix_lens + self.draft_token_num
+            last_loc = get_last_loc(
+                batch.req_to_token_pool.req_to_token,
+                batch.req_pool_indices,
+                prefix_lens,
+            )
+            batch.out_cache_loc = batch.alloc_paged_token_slots_extend(
+                prefix_lens, end_offset, last_loc, len(batch.input_ids)
+            )
+            self.last_loc = last_loc
+
+        bs = batch.batch_size()
+        assign_req_to_token_pool[(bs,)](
+            batch.req_pool_indices,
+            batch.req_to_token_pool.req_to_token,
+            batch.seq_lens,
+            end_offset,
+            batch.out_cache_loc,
+            batch.req_to_token_pool.req_to_token.shape[1],
+            next_power_of_2(bs),
+        )
+
+    def generate_attn_arg_prefill(
+        self,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        req_to_token: torch.Tensor,
+    ):
+        batch_size = len(req_pool_indices)
+        qo_indptr = torch.arange(
+            0,
+            (1 + batch_size) * self.draft_token_num,
+            step=self.draft_token_num,
+            dtype=torch.int32,
+            device="cuda",
+        )
+        cum_kv_seq_len = torch.zeros(
+            (batch_size + 1,), dtype=torch.int32, device="cuda"
+        )
+
+        paged_kernel_lens = paged_kernel_lens + self.draft_token_num
+        cum_kv_seq_len[1:] = torch.cumsum(paged_kernel_lens, dim=0)
+
+        kv_indices = torch.empty(
+            paged_kernel_lens_sum + self.draft_token_num * batch_size,
+            dtype=torch.int32,
+            device="cuda",
+        )
+        create_flashinfer_kv_indices_triton[(batch_size,)](
+            req_to_token,
+            req_pool_indices,
+            paged_kernel_lens,
+            cum_kv_seq_len,
+            None,
+            kv_indices,
+            req_to_token.size(1),
+        )
+        return kv_indices, cum_kv_seq_len, qo_indptr, self.custom_mask
+
+    def verify(
+        self,
+        batch: ScheduleBatch,
+        logits_output: LogitsProcessorOutput,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        page_size: int,
+        vocab_mask: Optional[torch.Tensor] = None,  # For grammar
+    ) -> torch.Tensor:
+        """
+        Verify and find accepted tokens based on logits output and batch
+        (which contains spec decoding information).
+
+        WARNING: This API in-place modifies the states of logits_output
+
+        This API updates values inside logits_output based on the accepted
+        tokens. I.e., logits_output.next_token_logits only contains
+        accepted token logits.
+        """
+        if batch.forward_mode.is_idle():
+            return EagleVerifyOutput(
+                draft_input=EagleDraftInput.create_idle_input(
+                    device=batch.device,
+                    hidden_size=batch.model_config.hidden_size,
+                    dtype=batch.model_config.dtype,
+                    topk=self.topk,
+                    capture_hidden_mode=CaptureHiddenMode.LAST,
+                ),
+                logits_output=logits_output,
+                verified_id=torch.empty(0, dtype=torch.long, device=batch.device),
+                accept_length_per_req_cpu=[],
+                accepted_indices=torch.full(
+                    (0, self.spec_steps + 1),
+                    -1,
+                    dtype=torch.int32,
+                    device=batch.device,
+                ),
+            )
+
+        bs = self.retrive_index.shape[0]
+        candidates = self.draft_token.reshape(bs, self.draft_token_num)
+        sampling_info = batch.sampling_info
+
+        predict_shape = list(logits_output.next_token_logits.shape)[:-1]
+        predict_shape[-1] += 1
+        predict = torch.empty(predict_shape, dtype=torch.int32, device="cuda")
+        accept_index = torch.full(
+            (bs, self.spec_steps + 1), -1, dtype=torch.int32, device="cuda"
+        )
+        accept_length = torch.empty((bs,), dtype=torch.int32, device="cuda")
+
+        if bs != len(sampling_info):
+            sampling_info = copy.deepcopy(sampling_info)
+            # NOTE: retrive_index are the indices of the requests that are kept.
+            sampling_info.filter_batch(self.retrive_index.tolist(), self.retrive_index)
+
+        # Apply the custom logit processors if registered in the sampling info.
+        if sampling_info.has_custom_logit_processor:
+            apply_custom_logit_processor(
+                logits_output.next_token_logits,
+                sampling_info,
+                num_tokens_in_batch=self.draft_token_num,
+            )
+
+        # Apply penalty
+        if sampling_info.penalizer_orchestrator.is_required:
+            # This is a relaxed version of penalties for speculative decoding.
+            linear_penalty = torch.zeros(
+                (bs, logits_output.next_token_logits.shape[1]),
+                dtype=torch.float32,
+                device="cuda",
+            )
+            sampling_info.apply_logits_bias(linear_penalty)
+            logits_output.next_token_logits.add_(
+                torch.repeat_interleave(linear_penalty, self.draft_token_num, dim=0)
+            )
+
+        # Apply grammar mask
+        if vocab_mask is not None:
+            assert self.grammar is not None
+            self.grammar.apply_vocab_mask(
+                logits=logits_output.next_token_logits, vocab_mask=vocab_mask
+            )
+
+        # Sample tokens. Force greedy sampling on AMD
+        is_all_greedy = sampling_info.is_all_greedy
+        if (not is_all_greedy) and (not TREE_SPEC_KERNEL_AVAILABLE):
+            logger.warning(
+                "Tree speculative sampling kernel unavailable (likely AMD/HIP build). "
+                "Falling back to greedy verification."
+            )
+
+        if is_all_greedy or not TREE_SPEC_KERNEL_AVAILABLE:
+            target_predict = torch.argmax(logits_output.next_token_logits, dim=-1)
+            target_predict = target_predict.reshape(bs, self.draft_token_num)
+
+            verify_tree_greedy(
+                predicts=predict,  # mutable
+                accept_index=accept_index,  # mutable
+                accept_token_num=accept_length,  # mutable
+                candidates=candidates,
+                retrive_index=self.retrive_index,
+                retrive_next_token=self.retrive_next_token,
+                retrive_next_sibling=self.retrive_next_sibling,
+                target_predict=target_predict,
+            )
+        else:
+            # apply temperature and get target probs
+            expanded_temperature = torch.repeat_interleave(
+                sampling_info.temperatures, self.draft_token_num, dim=0
+            )  # (bs * draft_token_num, 1)
+
+            target_probs = F.softmax(
+                logits_output.next_token_logits / expanded_temperature, dim=-1
+            )  # (bs * draft_token_num, vocab_size)
+            target_probs = top_k_renorm_prob(
+                target_probs,
+                torch.repeat_interleave(
+                    sampling_info.top_ks, self.draft_token_num, dim=0
+                ),
+            )  # (bs * draft_token_num, vocab_size)
+            if not torch.all(sampling_info.top_ps == 1.0):
+                target_probs = top_p_renorm_prob(
+                    target_probs,
+                    torch.repeat_interleave(
+                        sampling_info.top_ps, self.draft_token_num, dim=0
+                    ),
+                )
+            target_probs = target_probs.reshape(bs, self.draft_token_num, -1)
+
+            draft_probs = torch.zeros(
+                target_probs.shape, dtype=torch.float32, device="cuda"
+            )
+
+            # coins for rejection sampling
+            coins = torch.rand_like(candidates, dtype=torch.float32, device="cuda")
+            # coins for final sampling
+            coins_for_final_sampling = torch.rand(
+                (bs,), dtype=torch.float32, device="cuda"
+            )
+            tree_speculative_sampling_target_only(
+                predicts=predict,  # mutable
+                accept_index=accept_index,  # mutable
+                accept_token_num=accept_length,  # mutable
+                candidates=candidates,
+                retrive_index=self.retrive_index,
+                retrive_next_token=self.retrive_next_token,
+                retrive_next_sibling=self.retrive_next_sibling,
+                uniform_samples=coins,
+                uniform_samples_for_final_sampling=coins_for_final_sampling,
+                target_probs=target_probs,
+                draft_probs=draft_probs,
+                threshold_single=global_server_args_dict[
+                    "speculative_accept_threshold_single"
+                ],
+                threshold_acc=global_server_args_dict[
+                    "speculative_accept_threshold_acc"
+                ],
+                deterministic=True,
+            )
+
+        if SIMULATE_ACC_LEN:
+            # Do simulation
+            accept_index = _generate_simulated_accept_index(
+                accept_index=accept_index,
+                predict=predict,  # mutable
+                accept_length=accept_length,  # mutable
+                simulate_acc_len=SIMULATE_ACC_LEN,
+                bs=bs,
+                spec_steps=self.spec_steps,
+            )
+
+        unfinished_index = []
+        unfinished_accept_index = []
+        accept_index_cpu = accept_index.tolist()
+        predict_cpu = predict.tolist()
+        has_finished = False
+
+        # Iterate every accepted token and check if req has finished after append the token
+        # should be checked BEFORE free kv cache slots
+        for i, (req, accept_index_row) in enumerate(zip(batch.reqs, accept_index_cpu)):
+            for j, idx in enumerate(accept_index_row):
+                if idx == -1:
+                    break
+                id = predict_cpu[idx]
+                req.output_ids.append(id)
+                req.check_finished()
+                if req.finished():
+                    has_finished = True
+                    # set all tokens after finished token to -1 and break
+                    accept_index[i, j + 1 :] = -1
+                    break
+                else:
+                    if req.grammar is not None:
+                        try:
+                            req.grammar.accept_token(id)
+                        except ValueError as e:
+                            logger.info(
+                                f"{i=}, {req=}\n" f"{accept_index=}\n" f"{predict=}\n"
+                            )
+                            raise e
+            if not req.finished():
+                unfinished_index.append(i)
+                if idx == -1:
+                    unfinished_accept_index.append(accept_index[i, :j])
+                else:
+                    unfinished_accept_index.append(accept_index[i])
+            req.spec_verify_ct += 1
+
+        if has_finished:
+            accept_length = (accept_index != -1).sum(dim=1) - 1
+
+        # Free the KV cache for unaccepted tokens
+        # TODO: fuse them
+        accept_index = accept_index[accept_index != -1]
+        verified_id = predict[accept_index]
+        evict_mask = torch.full_like(self.draft_token, True, dtype=torch.bool)
+        evict_mask[accept_index] = False
+
+        if page_size == 1:
+            # TODO: boolean array index leads to a device sync. Remove it.
+            token_to_kv_pool_allocator.free(batch.out_cache_loc[evict_mask])
+        else:
+            if self.topk == 1:
+                # Only evict full empty page. Do not evict partial empty page
+                align_evict_mask_to_page_size[len(batch.seq_lens),](
+                    batch.seq_lens,
+                    evict_mask,
+                    page_size,
+                    self.draft_token_num,
+                    next_power_of_2(self.draft_token_num),
+                )
+                token_to_kv_pool_allocator.free(batch.out_cache_loc[evict_mask])
+            else:
+                # Shift the accepted tokens to the beginning.
+                # Only evict the last part
+                src_cache_loc, tgt_cache_loc, to_free_num_slots = get_src_tgt_cache_loc(
+                    batch.seq_lens,
+                    batch.out_cache_loc,
+                    accept_index,
+                    accept_length,
+                    self.draft_token_num,
+                    page_size,
+                )
+                to_free_slots = torch.empty(
+                    (to_free_num_slots.sum().item(),),
+                    dtype=torch.int64,
+                    device=to_free_num_slots.device,
+                )
+
+                # out_cache_loc: [0  1  2,  3  4  5,  6  7  8]
+                # accept_index:  [0 -1  2,  3  4 -1,  6 -1 -1]
+                # tgt_cache_loc: [0  1   ,  3  4   ,  6      ]
+                # to_free_slots: [      2,        5,     7  8]
+                # to_free_slots also needs to be page-aligned without the first partial page
+                #
+                # split each row of out_cache_loc into two parts.
+                # 1. the first part goes to tgt_cache_loc. length = accept_length[i] + 1
+                # 2. the second part goes to to_free_slots.
+                get_target_cache_loc[(bs,)](
+                    tgt_cache_loc,
+                    to_free_slots,
+                    accept_length,
+                    to_free_num_slots,
+                    batch.out_cache_loc,
+                    self.draft_token_num,
+                    next_power_of_2(self.draft_token_num),
+                    next_power_of_2(bs),
+                )
+
+                # Free the kv cache
+                token_to_kv_pool_allocator.free(to_free_slots)
+
+                # Copy the kv cache
+                batch.token_to_kv_pool_allocator.get_kvcache().move_kv_cache(
+                    tgt_cache_loc, src_cache_loc
+                )
+
+        # Construct EagleVerifyOutput
+        if not has_finished:
+            if page_size == 1 or self.topk == 1:
+                batch.out_cache_loc = batch.out_cache_loc[accept_index]
+                assign_req_to_token_pool[(bs,)](
+                    batch.req_pool_indices,
+                    batch.req_to_token_pool.req_to_token,
+                    batch.seq_lens,
+                    batch.seq_lens + accept_length + 1,
+                    batch.out_cache_loc,
+                    batch.req_to_token_pool.req_to_token.shape[1],
+                    next_power_of_2(bs),
+                )
+            else:
+                batch.out_cache_loc = tgt_cache_loc
+            batch.seq_lens.add_(accept_length + 1)
+
+            draft_input = EagleDraftInput(
+                hidden_states=batch.spec_info.hidden_states[accept_index],
+                verified_id=verified_id,
+                accept_length=accept_length,
+                accept_length_cpu=accept_length.tolist(),
+                seq_lens_for_draft_extend=batch.seq_lens,
+                req_pool_indices_for_draft_extend=batch.req_pool_indices,
+            )
+
+            return EagleVerifyOutput(
+                draft_input=draft_input,
+                logits_output=logits_output,
+                verified_id=verified_id,
+                accept_length_per_req_cpu=draft_input.accept_length_cpu,
+                accepted_indices=accept_index,
+            )
+        else:
+            if page_size == 1 or self.topk == 1:
+                assign_req_to_token_pool[(bs,)](
+                    batch.req_pool_indices,
+                    batch.req_to_token_pool.req_to_token,
+                    batch.seq_lens,
+                    batch.seq_lens + accept_length + 1,
+                    batch.out_cache_loc[accept_index],
+                    batch.req_to_token_pool.req_to_token.shape[1],
+                    next_power_of_2(bs),
+                )
+                batch.seq_lens.add_(accept_length + 1)
+
+            accept_length_cpu = accept_length.tolist()
+            if len(unfinished_accept_index) > 0:
+                unfinished_accept_index = torch.cat(unfinished_accept_index)
+                unfinished_index_device = torch.tensor(
+                    unfinished_index, dtype=torch.int64, device=predict.device
+                )
+                draft_input_accept_length_cpu = [
+                    accept_length_cpu[i] for i in unfinished_index
+                ]
+                if page_size == 1 or self.topk == 1:
+                    batch.out_cache_loc = batch.out_cache_loc[unfinished_accept_index]
+                else:
+                    batch.out_cache_loc = torch.empty(
+                        len(unfinished_index) + sum(draft_input_accept_length_cpu),
+                        dtype=torch.int64,
+                        device=predict.device,
+                    )
+                    accept_length_filter = create_accept_length_filter(
+                        accept_length,
+                        unfinished_index_device,
+                        batch.seq_lens,
+                    )
+                    filter_finished_cache_loc_kernel[(bs,)](
+                        batch.out_cache_loc,
+                        tgt_cache_loc,
+                        accept_length,
+                        accept_length_filter,
+                        next_power_of_2(bs),
+                        next_power_of_2(self.draft_token_num),
+                    )
+
+                draft_input = EagleDraftInput(
+                    hidden_states=batch.spec_info.hidden_states[
+                        unfinished_accept_index
+                    ],
+                    verified_id=predict[unfinished_accept_index],
+                    accept_length_cpu=draft_input_accept_length_cpu,
+                    accept_length=accept_length[unfinished_index_device],
+                    seq_lens_for_draft_extend=batch.seq_lens[unfinished_index_device],
+                    req_pool_indices_for_draft_extend=batch.req_pool_indices[
+                        unfinished_index_device
+                    ],
+                )
+            else:
+                draft_input = EagleDraftInput.create_idle_input(
+                    device=batch.device,
+                    hidden_size=batch.model_config.hidden_size,
+                    dtype=batch.model_config.dtype,
+                    topk=self.topk,
+                    capture_hidden_mode=CaptureHiddenMode.LAST,
+                )
+
+            return EagleVerifyOutput(
+                draft_input=draft_input,
+                logits_output=logits_output,
+                verified_id=verified_id,
+                accept_length_per_req_cpu=accept_length_cpu,
+                accepted_indices=accept_index,
+            )
+
+
+@triton.jit
+def create_extend_after_decode_spec_info(
+    verified_id,
+    seq_lens,
+    accept_lens,
+    positions,
+    new_verified_id,
+    bs_upper: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    offsets = tl.arange(0, bs_upper)
+    seq_length = tl.load(seq_lens + pid)
+    accept_length = tl.load(accept_lens + pid)
+
+    accept_len_cumsum = tl.sum(
+        tl.load(accept_lens + offsets, mask=offsets < pid, other=0)
+    )
+    positions_ptr = positions + accept_len_cumsum
+    mask = offsets < accept_length
+    tl.store(positions_ptr + offsets, seq_length - accept_length + offsets, mask)
+
+    accept_len_cumsum += accept_length - 1
+    verified_id_data = tl.load(verified_id + accept_len_cumsum)
+    tl.store(new_verified_id + pid, verified_id_data)
+
+
+@triton.jit
+def assign_req_to_token_pool(
+    req_pool_indices,
+    req_to_token,
+    start_offset,
+    end_offset,
+    out_cache_loc,
+    pool_len: tl.constexpr,
+    bs_upper: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 32
+    pid = tl.program_id(axis=0)
+    kv_start = tl.load(start_offset + pid)
+    kv_end = tl.load(end_offset + pid)
+    token_pool = req_to_token + tl.load(req_pool_indices + pid) * pool_len
+
+    length_offset = tl.arange(0, bs_upper)
+    start = tl.load(start_offset + length_offset, mask=length_offset < pid, other=0)
+    end = tl.load(end_offset + length_offset, mask=length_offset < pid, other=0)
+    out_offset = tl.sum(end - start, axis=0)
+
+    out_cache_ptr = out_cache_loc + out_offset
+
+    save_offset = tl.arange(0, BLOCK_SIZE) + kv_start
+    load_offset = tl.arange(0, BLOCK_SIZE)
+
+    num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
+    for _ in range(num_loop):
+        mask = save_offset < kv_end
+        data = tl.load(out_cache_ptr + load_offset, mask=mask)
+        tl.store(token_pool + save_offset, data, mask=mask)
+        save_offset += BLOCK_SIZE
+        load_offset += BLOCK_SIZE
+
+
+@triton.jit
+def assign_draft_cache_locs(
+    req_pool_indices,
+    req_to_token,
+    seq_lens,
+    extend_lens,
+    num_new_pages_per_topk,
+    out_cache_loc,
+    pool_len: tl.constexpr,
+    topk: tl.constexpr,
+    speculative_num_steps: tl.constexpr,
+    page_size: tl.constexpr,
+    bs_upper: tl.constexpr,
+    iter_upper: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 128
+    pid = tl.program_id(axis=0)
+
+    if page_size == 1 or topk == 1:
+        copy_len = topk * speculative_num_steps
+        out_cache_ptr = out_cache_loc + pid * topk * speculative_num_steps
+    else:
+        bs_offset = tl.arange(0, bs_upper)
+        copy_len = tl.load(extend_lens + pid)
+        cum_copy_len = tl.sum(tl.load(extend_lens + bs_offset, mask=bs_offset < pid))
+        out_cache_ptr = out_cache_loc + cum_copy_len
+
+    # Part 1: Copy from out_cache_loc to req_to_token
+    kv_start = tl.load(seq_lens + pid)
+    token_pool = req_to_token + tl.load(req_pool_indices + pid) * pool_len
+    num_loop = tl.cdiv(copy_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        copy_offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = copy_offset < copy_len
+        data = tl.load(out_cache_ptr + copy_offset, mask=mask)
+        tl.store(token_pool + kv_start + copy_offset, data, mask=mask)
+
+    if page_size == 1 or topk == 1:
+        return
+
+    # Part 2: Copy the indices for the last partial page
+    prefix_len = tl.load(seq_lens + pid)
+    last_page_len = prefix_len % page_size
+    offsets = tl.arange(0, page_size)
+    mask = offsets < last_page_len
+    num_new_pages_per_topk_ = tl.load(num_new_pages_per_topk + pid)
+    prefix_base = token_pool + prefix_len - last_page_len
+
+    for topk_id in range(topk):
+        value = tl.load(prefix_base + offsets, mask=mask)
+        tl.store(
+            prefix_base + topk_id * num_new_pages_per_topk_ * page_size + offsets,
+            value,
+            mask=mask,
+        )
+
+    # Part 3: Remove the padding in out_cache_loc
+    iter_offest = tl.arange(0, iter_upper)
+    for topk_id in range(topk):
+        indices = tl.load(
+            prefix_base
+            + topk_id * num_new_pages_per_topk_ * page_size
+            + last_page_len
+            + iter_offest,
+            mask=iter_offest < speculative_num_steps,
+        )
+        tl.store(
+            out_cache_loc
+            + pid * topk * speculative_num_steps
+            + topk_id * speculative_num_steps
+            + iter_offest,
+            indices,
+            mask=iter_offest < speculative_num_steps,
+        )
+
+
+@triton.jit
+def generate_draft_decode_kv_indices(
+    req_pool_indices,
+    req_to_token,
+    paged_kernel_lens,
+    kv_indices,
+    kv_indptr,
+    positions,
+    pool_len: tl.constexpr,
+    kv_indices_stride: tl.constexpr,
+    kv_indptr_stride: tl.constexpr,
+    bs_upper: tl.constexpr,
+    iter_upper: tl.constexpr,
+    num_tokens_upper: tl.constexpr,
+    page_size: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 128
+    iters = tl.program_id(axis=0)
+    bid = tl.program_id(axis=1)
+    topk_id = tl.program_id(axis=2)
+
+    num_steps = tl.num_programs(axis=0)
+    num_seqs = tl.num_programs(axis=1)
+    topk = tl.num_programs(axis=2)
+
+    kv_indices += kv_indices_stride * iters
+    kv_indptr += kv_indptr_stride * iters
+    iters += 1
+
+    load_offset = tl.arange(0, bs_upper)
+    seq_lens = tl.load(paged_kernel_lens + load_offset, mask=load_offset < bid, other=0)
+    seq_len = tl.load(paged_kernel_lens + bid)
+    cum_seq_len = tl.sum(seq_lens)
+
+    # Update kv_indices
+    kv_offset = cum_seq_len * topk + bid * iters * topk + topk_id * (seq_len + iters)
+    kv_ptr = kv_indices + kv_offset
+    token_pool_ptr = req_to_token + tl.load(req_pool_indices + bid) * pool_len
+
+    kv_offset = tl.arange(0, BLOCK_SIZE)
+    num_loop = tl.cdiv(seq_len, BLOCK_SIZE)
+    for _ in range(num_loop):
+        mask = kv_offset < seq_len
+        data = tl.load(token_pool_ptr + kv_offset, mask=mask)
+        tl.store(kv_ptr + kv_offset, data, mask=mask)
+        kv_offset += BLOCK_SIZE
+
+    extend_offset = tl.arange(0, iter_upper)
+    if page_size == 1 or topk == 1:
+        extend_data = tl.load(
+            token_pool_ptr + seq_len + topk_id * num_steps + tl.arange(0, iter_upper),
+            mask=extend_offset < iters,
+        )
+    else:
+        prefix_len = seq_len
+        last_page_len = prefix_len % page_size
+        num_new_pages_per_topk = (
+            last_page_len + num_steps + page_size - 1
+        ) // page_size
+        prefix_base = seq_len // page_size * page_size
+        start = (
+            prefix_base + topk_id * num_new_pages_per_topk * page_size + last_page_len
+        )
+        extend_data = tl.load(
+            token_pool_ptr + start + extend_offset,
+            mask=extend_offset < iters,
+        )
+
+    tl.store(kv_ptr + seq_len + extend_offset, extend_data, mask=extend_offset < iters)
+
+    # Update kv_indptr
+    bs_offset = tl.arange(0, num_tokens_upper)
+
+    zid = bid * topk + topk_id
+    if zid == 0:
+        zid = num_seqs * topk
+    positions = tl.load(positions + bs_offset, mask=bs_offset < zid, other=0)
+    base = tl.sum(positions)
+    tl.store(kv_indptr + zid, base + zid * iters)
+
+
+@triton.jit
+def align_evict_mask_to_page_size(
+    seq_lens,
+    evict_mask,
+    page_size: tl.constexpr,
+    num_draft_tokens: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    t_range = tl.arange(0, BLOCK_SIZE)
+
+    bid = tl.program_id(axis=0)
+    seq_len = tl.load(seq_lens + bid)
+    io_mask = t_range < num_draft_tokens
+    mask_row = tl.load(
+        evict_mask + bid * num_draft_tokens + t_range, mask=io_mask, other=0
+    )
+
+    num_trues = tl.sum(mask_row)
+    num_false = num_draft_tokens - num_trues
+
+    start = (seq_len + num_false - 1) // page_size * page_size - seq_len
+    for i in range(max(start, 0), min(start + page_size, num_draft_tokens)):
+        tl.store(evict_mask + bid * num_draft_tokens + i, False)
+
+
+@triton.jit
+def get_target_cache_loc(
+    tgt_cache_loc,
+    to_free_slots,
+    accept_length,
+    to_free_num_slots,
+    out_cache_loc,
+    num_verify_tokens: tl.constexpr,
+    num_verify_tokens_upper: tl.constexpr,
+    bs_upper: tl.constexpr,
+):
+    bid = tl.program_id(axis=0)
+    offset = tl.arange(0, num_verify_tokens_upper)
+    bs_offset = tl.arange(0, bs_upper)
+
+    # write the first part to tgt_cache_loc
+    accept_len_all = tl.load(accept_length + bs_offset, mask=bs_offset < bid)
+    tgt_cache_loc_start = tl.sum(accept_len_all) + bid
+    copy_len = tl.load(accept_length + bid) + 1
+    out_cache_loc_row = tl.load(
+        out_cache_loc + bid * num_verify_tokens + offset, mask=offset < copy_len
+    )
+    tl.store(
+        tgt_cache_loc + tgt_cache_loc_start + offset,
+        out_cache_loc_row,
+        mask=offset < copy_len,
+    )
+
+    # write the second part to to_free_num_pages
+    to_free_num_slots_all = tl.load(to_free_num_slots + bs_offset, mask=bs_offset < bid)
+    to_free_num_slots_cur = tl.load(to_free_num_slots + bid)
+    out_cache_loc_start = num_verify_tokens - to_free_num_slots_cur
+    to_free_slots_start = tl.sum(to_free_num_slots_all)
+
+    copy_len = to_free_num_slots_cur
+    out_cache_loc_row = tl.load(
+        out_cache_loc + bid * num_verify_tokens + out_cache_loc_start + offset,
+        mask=offset < copy_len,
+    )
+    tl.store(
+        to_free_slots + to_free_slots_start + offset,
+        out_cache_loc_row,
+        mask=offset < copy_len,
+    )
+
+
+@torch.compile(dynamic=True)
+def get_src_tgt_cache_loc(
+    seq_lens: torch.Tensor,
+    out_cache_loc: torch.Tensor,
+    accept_index: torch.Tensor,
+    accept_length: torch.Tensor,
+    draft_token_num: int,
+    page_size: int,
+):
+    src_cache_loc = out_cache_loc[accept_index]
+    tgt_cache_loc = torch.empty_like(src_cache_loc)
+    extended_len = seq_lens + draft_token_num
+    keep_len = torch.minimum(
+        (seq_lens + accept_length + 1 + page_size - 1) // page_size * page_size,
+        extended_len,
+    )
+    to_free_num_slots = extended_len - keep_len
+    return src_cache_loc, tgt_cache_loc, to_free_num_slots
+
+
+@triton.jit
+def filter_finished_cache_loc_kernel(
+    out_cache_loc,
+    tgt_cache_loc,
+    accept_length,
+    accept_length_filter,
+    bs_upper: tl.constexpr,
+    num_verify_tokens_upper: tl.constexpr,
+):
+    bid = tl.program_id(0)
+    bs_offset = tl.arange(0, bs_upper)
+
+    accept_length_all = tl.load(accept_length + bs_offset, mask=bs_offset < bid)
+    old_start = tl.sum(accept_length_all) + bid
+
+    accept_length_filter_all = tl.load(
+        accept_length_filter + bs_offset, mask=bs_offset < bid
+    )
+    new_start = tl.sum(accept_length_filter_all)
+
+    copy_len = tl.load(accept_length_filter + bid)
+    copy_offset = tl.arange(0, num_verify_tokens_upper)
+    value = tl.load(
+        tgt_cache_loc + old_start + copy_offset, mask=copy_offset < copy_len
+    )
+    tl.store(
+        out_cache_loc + new_start + copy_offset, value, mask=copy_offset < copy_len
+    )
+
+
+@torch.compile(dynamic=True)
+def create_accept_length_filter(
+    accept_length: torch.Tensor,
+    unfinished_index_device: torch.Tensor,
+    seq_lens: torch.Tensor,
+):
+    accept_length_filter = torch.zeros_like(accept_length)
+    accept_length_filter[unfinished_index_device] = (
+        accept_length[unfinished_index_device] + 1
+    )
+    seq_lens.add_(accept_length + 1)
+    return accept_length_filter
+
+
+@torch.compile(dynamic=True)
+def select_top_k_tokens(
+    i: int,
+    topk_p: torch.Tensor,
+    topk_index: torch.Tensor,
+    hidden_states: torch.Tensor,
+    scores: torch.Tensor,
+    topk: int,
+):
+    if i == 0:
+        # The first step after extend
+        input_ids = topk_index.flatten()
+        hidden_states = hidden_states.repeat_interleave(topk, dim=0)
+        scores = topk_p  # shape: (b, topk)
+
+        tree_info = (
+            topk_p.unsqueeze(1),  # shape: (b, 1, topk)
+            topk_index,  # shape: (b, topk)
+            torch.arange(-1, topk, dtype=torch.long, device="cuda")
+            .unsqueeze(0)
+            .repeat(topk_p.shape[0], 1),  # shape: (b, topk + 1)
+        )
+    else:
+        # The later decode steps
+        expand_scores = torch.mul(
+            scores.unsqueeze(2), topk_p.reshape(-1, topk, topk)
+        )  # (b, topk, 1) x (b, topk ,topk) -> (b, topk, topk)
+        topk_cs_p, topk_cs_index = fast_topk(
+            expand_scores.flatten(start_dim=1), topk, dim=-1
+        )  # (b, topk)
+        scores = topk_cs_p  # shape: (b, topk)
+
+        topk_index = topk_index.reshape(-1, topk**2)
+        input_ids = torch.gather(topk_index, index=topk_cs_index, dim=1).flatten()
+
+        if hidden_states.shape[0] > 0:
+            selected_input_index = topk_cs_index.flatten() // topk + torch.arange(
+                0, hidden_states.shape[0], step=topk, device="cuda"
+            ).repeat_interleave(topk)
+            hidden_states = hidden_states[selected_input_index, :]
+
+        tree_info = (
+            expand_scores,  # shape: (b, topk, topk)
+            topk_index,  # shape: (b, topk * topk)
+            topk_cs_index + (topk**2 * (i - 1) + topk),  # shape: (b, topk)
+        )
+
+    return input_ids, hidden_states, scores, tree_info
+
+
+def _generate_simulated_accept_index(
+    accept_index,
+    predict,
+    accept_length,
+    simulate_acc_len,
+    bs,
+    spec_steps,
+):
+    simulate_acc_len_float = float(simulate_acc_len)
+    if SIMULATE_ACC_METHOD == "multinomial":
+        simulated_values = torch.normal(
+            mean=simulate_acc_len_float,
+            std=1.0,
+            size=(1,),
+            device="cpu",
+        )
+        # clamp simulated values to be between 1 and self.spec_steps
+        simulated_values = torch.clamp(simulated_values, min=1.0, max=spec_steps + 1)
+        simulate_acc_len = int(simulated_values.round().item())
+    elif SIMULATE_ACC_METHOD == "match-expected":
+        # multinomial sampling does not match the expected length
+        # we keep it for the sake of compatibility of existing tests
+        # but it's better to use "match-expected" for the cases that need to
+        # match the expected length, One caveat is that this will only sample
+        # either round down or round up of the expected length
+        simulate_acc_len_float = max(1.0, min(spec_steps + 1, simulate_acc_len_float))
+        lower = int(simulate_acc_len_float // 1)
+        upper = lower + 1 if lower < spec_steps + 1 else lower
+        if lower == upper:
+            simulate_acc_len = lower
+        else:
+            weight_upper = simulate_acc_len_float - lower
+            weight_lower = 1.0 - weight_upper
+            probs = torch.tensor([weight_lower, weight_upper], device="cpu")
+            sampled_index = torch.multinomial(probs, num_samples=1)
+            simulate_acc_len = lower if sampled_index == 0 else upper
+    else:
+        raise ValueError(f"Invalid simulate_acc_method: {SIMULATE_ACC_METHOD}")
+
+    accept_indx_first_col = accept_index[:, 0].view(-1, 1)
+    sim_accept_index = torch.full(
+        (bs, spec_steps + 1), -1, dtype=torch.int32, device="cuda"
+    )
+    sim_accept_index[:, :simulate_acc_len] = accept_indx_first_col + torch.arange(
+        simulate_acc_len, device=accept_index.device
+    )
+    accept_length.fill_(simulate_acc_len - 1)
+    predict.fill_(100)  # some legit token id
+    return sim_accept_index
+
+
+def traverse_tree(
+    retrieve_next_token: torch.Tensor,
+    retrieve_next_sibling: torch.Tensor,
+    draft_tokens: torch.Tensor,
+    grammar: BaseGrammarObject,
+    allocate_token_bitmask: torch.Tensor,
+):
+    """
+    Traverse the tree constructed by the draft model to generate the logits mask.
+    """
+    assert (
+        retrieve_next_token.shape == retrieve_next_sibling.shape == draft_tokens.shape
+    )
+
+    allocate_token_bitmask.fill_(0)
+
+    def dfs(
+        curr: int,
+        retrieve_next_token: torch.Tensor,
+        retrieve_next_sibling: torch.Tensor,
+        parent_pos: int,
+    ):
+        if curr == 0:
+            # the first token generated by the target model, and thus it is always
+            # accepted from the previous iteration
+            accepted = True
+        else:
+            parent_bitmask = allocate_token_bitmask[parent_pos]
+            curr_token_id = draft_tokens[curr]
+            # 32 boolean bitmask values are packed into 32-bit integers
+            accepted = (
+                parent_bitmask[curr_token_id // 32] & (1 << (curr_token_id % 32))
+            ) != 0
+
+        if accepted:
+            if curr != 0:
+                # Accept the current token
+                grammar.accept_token(draft_tokens[curr])
+            if not grammar.is_terminated():
+                # Generate the bitmask for the current token
+                grammar.fill_vocab_mask(allocate_token_bitmask, curr)
+                if retrieve_next_token[curr] != -1:
+                    # Visit the child node
+                    dfs(
+                        retrieve_next_token[curr],
+                        retrieve_next_token,
+                        retrieve_next_sibling,
+                        curr,
+                    )
+
+            if curr != 0:
+                # Rollback the current token
+                grammar.rollback(1)
+
+        if retrieve_next_sibling[curr] != -1:
+            # Visit the sibling node
+            dfs(
+                retrieve_next_sibling[curr],
+                retrieve_next_token,
+                retrieve_next_sibling,
+                parent_pos,
+            )
+
+    dfs(0, retrieve_next_token, retrieve_next_sibling, -1)
+
+
+def generate_token_bitmask(
+    reqs: List[Req],
+    verify_input: EagleVerifyInput,
+    retrieve_next_token_cpu: torch.Tensor,
+    retrieve_next_sibling_cpu: torch.Tensor,
+    draft_tokens_cpu: torch.Tensor,
+    vocab_size: int,
+):
+    """
+    Generate the logit mask for structured output.
+    Draft model's token can be either valid or invalid with respect to the grammar.
+    We need to perform DFS to
+    1. figure out which tokens are accepted by the grammar.
+    2. if so, what is the corresponding logit mask.
+    """
+
+    num_draft_tokens = draft_tokens_cpu.shape[-1]
+
+    allocate_token_bitmask = None
+    assert len(reqs) == retrieve_next_token_cpu.shape[0]
+    grammar = None
+    for i, req in enumerate(reqs):
+        if req.grammar is not None:
+            if allocate_token_bitmask is None:
+                allocate_token_bitmask = req.grammar.allocate_vocab_mask(
+                    vocab_size=vocab_size,
+                    batch_size=draft_tokens_cpu.numel(),
+                    device="cpu",
+                )
+            grammar = req.grammar
+            s = time.perf_counter()
+            traverse_tree(
+                retrieve_next_token_cpu[i],
+                retrieve_next_sibling_cpu[i],
+                draft_tokens_cpu[i],
+                req.grammar,
+                allocate_token_bitmask[
+                    i * num_draft_tokens : (i + 1) * num_draft_tokens
+                ],
+            )
+            tree_traverse_time = time.perf_counter() - s
+            if tree_traverse_time > TREE_TRAVERSE_TIME_THRESHOLD:
+                logger.warning(
+                    f"Bit mask generation took {tree_traverse_time} seconds with "
+                    f"grammar: {req.grammar}"
+                )
+
+    verify_input.grammar = grammar
+    return allocate_token_bitmask
diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py
new file mode 100644
index 000000000..f454971ca
--- /dev/null
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -0,0 +1,1142 @@
+import logging
+import os
+import time
+from contextlib import contextmanager
+from typing import List, Optional, Tuple
+
+import torch
+from huggingface_hub import snapshot_download
+
+from sglang.srt.distributed import (
+    GroupCoordinator,
+    get_tp_group,
+    patch_tensor_parallel_group,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.sampler import get_token_ids_logprobs, get_top_logprobs
+from sglang.srt.managers.mm_utils import embed_mm_inputs
+from sglang.srt.managers.schedule_batch import (
+    ScheduleBatch,
+    get_last_loc,
+    global_server_args_dict,
+)
+from sglang.srt.managers.tp_worker import TpModelWorker
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
+from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
+    EAGLEDraftCudaGraphRunner,
+)
+from sglang.srt.speculative.eagle_draft_extend_cuda_graph_runner import (
+    EAGLEDraftExtendCudaGraphRunner,
+)
+from sglang.srt.speculative.eagle_utils import (
+    EagleDraftInput,
+    EagleVerifyInput,
+    EagleVerifyOutput,
+    assign_draft_cache_locs,
+    fast_topk,
+    generate_token_bitmask,
+    select_top_k_tokens,
+)
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    empty_context,
+    get_available_gpu_memory,
+    get_bool_env_var,
+    is_cuda,
+    next_power_of_2,
+)
+
+if is_cuda():
+    from sgl_kernel import segment_packbits
+
+logger = logging.getLogger(__name__)
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
+
+
+@contextmanager
+def draft_tp_context(tp_group: GroupCoordinator):
+    # Draft model doesn't use dp and has its own tp group.
+    # We disable mscclpp now because it doesn't support 2 comm groups.
+    with patch_tensor_parallel_group(tp_group):
+        yield
+
+
+class EAGLEWorker(TpModelWorker):
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        dp_rank: Optional[int],
+        moe_ep_rank: int,
+        nccl_port: int,
+        target_worker: TpModelWorker,
+    ):
+        # Parse arguments
+        self.server_args = server_args
+        self.topk = server_args.speculative_eagle_topk
+        self.speculative_num_steps = server_args.speculative_num_steps
+        self.speculative_num_draft_tokens = server_args.speculative_num_draft_tokens
+        self.enable_nan_detection = server_args.enable_nan_detection
+        self.gpu_id = gpu_id
+        self.device = server_args.device
+        self.target_worker = target_worker
+        self.page_size = server_args.page_size
+        self.speculative_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
+        self.padded_static_len = -1
+
+        # Override the context length of the draft model to be the same as the target model.
+        server_args.context_length = target_worker.model_runner.model_config.context_len
+
+        # Do not capture cuda graph in `super().__init__()`
+        # It will be captured later.
+        backup_disable_cuda_graph = server_args.disable_cuda_graph
+        server_args.disable_cuda_graph = True
+        # Share the allocator with a target worker.
+        # Draft and target worker own their own KV cache pools.
+        self.req_to_token_pool, self.token_to_kv_pool_allocator = (
+            target_worker.get_memory_pool()
+        )
+
+        # Load hot token ids
+        if self.speculative_algorithm.is_eagle3():
+            if server_args.speculative_token_map is not None:
+                logger.warning(
+                    "Speculative token map specified, but EAGLE3 models already have this. Ignoring the specified token map."
+                )
+            self.hot_token_id = None
+        elif server_args.speculative_token_map is not None:
+            self.hot_token_id = load_token_map(server_args.speculative_token_map)
+            server_args.json_model_override_args = (
+                f'{{"hot_vocab_size": {len(self.hot_token_id)}}}'
+            )
+        else:
+            self.hot_token_id = None
+
+        # Init draft worker
+        with empty_context():
+            super().__init__(
+                server_args=server_args,
+                gpu_id=gpu_id,
+                tp_rank=tp_rank,
+                pp_rank=0,  # FIXME
+                dp_rank=dp_rank,
+                moe_ep_rank=moe_ep_rank,
+                nccl_port=nccl_port,
+                is_draft_worker=True,
+                req_to_token_pool=self.req_to_token_pool,
+                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+            )
+
+        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+
+        if self.speculative_algorithm.is_eagle3():
+            # most cases EAGLE3 models don't share lm_head
+            # but some models (e.g. nvidia/gpt-oss-120b-Eagle3) shares
+            if (
+                hasattr(self.draft_model_runner.model, "load_lm_head_from_target")
+                and self.draft_model_runner.model.load_lm_head_from_target
+            ):
+                self.draft_model_runner.model.set_embed_and_head(embed, head)
+            else:
+                self.draft_model_runner.model.set_embed(embed)
+
+            # grab hot token ids
+            if self.draft_model_runner.model.hot_token_id is not None:
+                self.hot_token_id = self.draft_model_runner.model.hot_token_id.to(
+                    embed.device
+                )
+
+        else:
+            if self.hot_token_id is not None:
+                head = head.clone()
+                self.hot_token_id = self.hot_token_id.to(head.device)
+                head.data = head.data[self.hot_token_id]
+
+            # Share the embedding and lm_head
+            self.draft_model_runner.model.set_embed_and_head(embed, head)
+
+        # Init attention backend and cuda graphs
+        self.draft_model_runner.server_args.disable_cuda_graph = (
+            backup_disable_cuda_graph
+        )
+        self.draft_tp_context = (
+            draft_tp_context if server_args.enable_dp_attention else empty_context
+        )
+        with self.draft_tp_context(self.draft_model_runner.tp_group):
+            self.init_attention_backend()
+            self.init_cuda_graphs()
+
+        # Some dummy tensors
+        self.num_new_pages_per_topk = torch.empty(
+            (), dtype=torch.int64, device=self.device
+        )
+        self.extend_lens = torch.empty((), dtype=torch.int64, device=self.device)
+
+    def init_attention_backend(self):
+        # Create multi-step attn backends and cuda graph runners
+
+        self.has_prefill_wrapper_verify = False
+        self.draft_extend_attn_backend = None
+
+        # Initialize decode attention backend
+        self.draft_attn_backend = self._create_decode_backend()
+
+        # Initialize draft extend attention backend (respects speculative_attention_mode setting)
+        self.draft_extend_attn_backend = self._create_draft_extend_backend()
+
+        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
+
+    def _create_backend(
+        self, backend_name: str, backend_map: dict, error_template: str
+    ):
+        backend_type = getattr(self.server_args, backend_name)
+        if backend_type is None:
+            backend_type = self.server_args.attention_backend
+
+        if backend_type not in backend_map:
+            raise ValueError(error_template.format(backend_type=backend_type))
+
+        return backend_map[backend_type]()
+
+    def _create_decode_backend(self):
+        backend_map = {
+            "flashinfer": self._create_flashinfer_decode_backend,
+            "triton": self._create_triton_decode_backend,
+            "aiter": self._create_aiter_decode_backend,
+            "fa3": self._create_fa3_decode_backend,
+            "hybrid_linear_attn": self._create_fa3_decode_backend,
+            "flashmla": self._create_flashmla_decode_backend,
+            "trtllm_mha": self._create_trtllm_mha_decode_backend,
+            "trtllm_mla": self._create_trtllm_mla_decode_backend,
+        }
+
+        return self._create_backend(
+            "decode_attention_backend",
+            backend_map,
+            "EAGLE is not supported in decode attention backend {backend_type}",
+        )
+
+    def _create_draft_extend_backend(self):
+        backend_map = {
+            "flashinfer": self._create_flashinfer_prefill_backend,
+            "triton": self._create_triton_prefill_backend,
+            "aiter": self._create_aiter_prefill_backend,
+            "fa3": self._create_fa3_prefill_backend,
+            "hybrid_linear_attn": self._create_fa3_prefill_backend,
+            "trtllm_mha": self._create_trtllm_mha_prefill_backend,
+            "trtllm_mla": self._create_trtllm_mla_prefill_backend,
+        }
+        backend_name = (
+            "decode_attention_backend"
+            if self.server_args.speculative_attention_mode == "decode"
+            else "prefill_attention_backend"
+        )
+        return self._create_backend(
+            backend_name,
+            backend_map,
+            "EAGLE is not supported in attention backend {backend_type}",
+        )
+
+    def _create_flashinfer_decode_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferMultiStepDraftBackend,
+            )
+
+            self.has_prefill_wrapper_verify = True
+            return FlashInferMultiStepDraftBackend(
+                self.draft_model_runner, self.topk, self.speculative_num_steps
+            )
+        else:
+            from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                FlashInferMLAMultiStepDraftBackend,
+            )
+
+            self.has_prefill_wrapper_verify = True
+            return FlashInferMLAMultiStepDraftBackend(
+                self.draft_model_runner, self.topk, self.speculative_num_steps
+            )
+
+    def _create_triton_decode_backend(self):
+        from sglang.srt.layers.attention.triton_backend import (
+            TritonMultiStepDraftBackend,
+        )
+
+        return TritonMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_aiter_decode_backend(self):
+        from sglang.srt.layers.attention.aiter_backend import AiterMultiStepDraftBackend
+
+        return AiterMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_fa3_decode_backend(self):
+        from sglang.srt.layers.attention.flashattention_backend import (
+            FlashAttentionMultiStepBackend,
+        )
+
+        return FlashAttentionMultiStepBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_flashmla_decode_backend(self):
+        from sglang.srt.layers.attention.flashmla_backend import (
+            FlashMLAMultiStepDraftBackend,
+        )
+
+        return FlashMLAMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_trtllm_mha_decode_backend(self):
+        from sglang.srt.layers.attention.trtllm_mha_backend import (
+            TRTLLMHAAttnMultiStepDraftBackend,
+        )
+
+        self.has_prefill_wrapper_verify = True
+        return TRTLLMHAAttnMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_trtllm_mla_decode_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            raise ValueError(
+                "trtllm_mla backend requires MLA model (use_mla_backend=True)."
+            )
+
+        from sglang.srt.layers.attention.trtllm_mla_backend import (
+            TRTLLMMLAMultiStepDraftBackend,
+        )
+
+        self.has_prefill_wrapper_verify = True
+        return TRTLLMMLAMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_flashinfer_prefill_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferAttnBackend,
+            )
+
+            return FlashInferAttnBackend(self.draft_model_runner, skip_prefill=False)
+        else:
+            from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                FlashInferMLAAttnBackend,
+            )
+
+            return FlashInferMLAAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_triton_prefill_backend(self):
+        from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
+
+        return TritonAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_aiter_prefill_backend(self):
+        from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
+
+        return AiterAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_fa3_prefill_backend(self):
+        from sglang.srt.layers.attention.flashattention_backend import (
+            FlashAttentionBackend,
+        )
+
+        return FlashAttentionBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_trtllm_mha_prefill_backend(self):
+        from sglang.srt.layers.attention.trtllm_mha_backend import TRTLLMHAAttnBackend
+
+        return TRTLLMHAAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_trtllm_mla_prefill_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            raise ValueError(
+                "trtllm_mla backend requires MLA model (use_mla_backend=True)."
+            )
+
+        from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend
+
+        return TRTLLMMLABackend(self.draft_model_runner, skip_prefill=False)
+
+    def init_cuda_graphs(self):
+        """Capture cuda graphs."""
+        self.cuda_graph_runner = None
+        self.cuda_graph_runner_for_draft_extend = None
+
+        if self.server_args.disable_cuda_graph:
+            return
+
+        # Capture draft
+        tic = time.perf_counter()
+        before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        logger.info(
+            f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+        )
+        self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
+        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        logger.info(
+            f"Capture draft cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
+        )
+
+        # Capture extend
+        if self.draft_extend_attn_backend:
+            tic = time.perf_counter()
+            before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+            logger.info(
+                f"Capture draft extend cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+            )
+            self.cuda_graph_runner_for_draft_extend = EAGLEDraftExtendCudaGraphRunner(
+                self
+            )
+            after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+            logger.info(
+                f"Capture draft extend cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
+            )
+
+    @property
+    def draft_model_runner(self):
+        return self.model_runner
+
+    def forward_batch_speculative_generation(
+        self, batch: ScheduleBatch
+    ) -> Tuple[LogitsProcessorOutput, torch.Tensor, int, int, bool]:
+        """Run speculative decoding forward.
+
+        NOTE: Many states of batch is modified as you go through. It is not guaranteed that
+        the final output batch have the same state as the input.
+
+        Args:
+            batch: The batch to run forward. The state of the batch is modified as it runs.
+        Returns:
+            A tuple of the final logit output of the target model, next tokens accepted,
+            the batch id (used for overlap schedule), and number of accepted tokens.
+        """
+        if batch.forward_mode.is_extend() or batch.is_extend_in_batch:
+            logits_output, next_token_ids, bid, seq_lens_cpu = (
+                self.forward_target_extend(batch)
+            )
+            with self.draft_tp_context(self.draft_model_runner.tp_group):
+                self.forward_draft_extend(
+                    batch, logits_output.hidden_states, next_token_ids, seq_lens_cpu
+                )
+            return logits_output, next_token_ids, bid, 0, False
+        else:
+            with self.draft_tp_context(self.draft_model_runner.tp_group):
+                spec_info = self.draft(batch)
+            logits_output, verify_output, model_worker_batch, can_run_cuda_graph = (
+                self.verify(batch, spec_info)
+            )
+
+            with self.draft_tp_context(self.draft_model_runner.tp_group):
+                # NOTE: We should use `check_forward_draft_extend_after_decode`
+                # when DP attention is enabled, but it is slow. Skip it for now.
+                if (
+                    self.server_args.enable_dp_attention
+                    or batch.spec_info.verified_id.shape[0] > 0
+                ):
+                    # decode is not finished
+                    self.forward_draft_extend_after_decode(batch)
+
+            return (
+                logits_output,
+                verify_output.verified_id,
+                model_worker_batch.bid,
+                sum(verify_output.accept_length_per_req_cpu),
+                can_run_cuda_graph,
+            )
+
+    def check_forward_draft_extend_after_decode(self, batch: ScheduleBatch):
+        local_need_forward = batch.spec_info.verified_id.shape[0] > 0
+        if not self.server_args.enable_dp_attention:
+            return local_need_forward
+
+        global_need_forward = torch.tensor(
+            [
+                (local_need_forward),
+            ],
+            dtype=torch.int64,
+        )
+        torch.distributed.all_reduce(
+            global_need_forward, group=get_tp_group().cpu_group
+        )
+        global_need_forward_cnt = global_need_forward[0].item()
+        need_forward = global_need_forward_cnt > 0
+        return need_forward
+
+    def forward_target_extend(
+        self, batch: ScheduleBatch
+    ) -> Tuple[LogitsProcessorOutput, torch.Tensor, int, Optional[torch.Tensor]]:
+        """Run the target extend.
+
+        Args:
+            batch: The batch to run. States could be modified.
+
+        Returns:
+            logits_output: The output of logits. It will contain the full hidden states.
+            next_token_ids: Next token ids generated.
+            bid: The model batch ID. Used for overlap schedule.
+        """
+        # Forward with the target model and get hidden states.
+        # We need the full hidden states to prefill the KV cache of the draft model.
+        model_worker_batch = batch.get_model_worker_batch()
+        model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
+        logits_output, next_token_ids, _ = self.target_worker.forward_batch_generation(
+            model_worker_batch
+        )
+        return (
+            logits_output,
+            next_token_ids,
+            model_worker_batch.bid,
+            model_worker_batch.seq_lens_cpu,
+        )
+
+    def _draft_preprocess_decode(self, batch: ScheduleBatch):
+        # Parse args
+        num_seqs = batch.batch_size()
+        spec_info = batch.spec_info
+
+        # Accumulate penalty
+        if batch.sampling_info.penalizer_orchestrator.is_required:
+            # This is a relaxed version of penalties for speculative decoding.
+            batch.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+                spec_info.verified_id.to(torch.int64)
+            )
+
+        # Allocate cache locations
+        # Layout of the out_cache_loc
+        # [       topk 0         ] [       topk 1         ]
+        # [iter=0, iter=1, iter=2] [iter=0, iter=1, iter=2]
+        if self.page_size == 1:
+            out_cache_loc, token_to_kv_pool_state_backup = batch.alloc_token_slots(
+                num_seqs * self.speculative_num_steps * self.topk, backup_state=True
+            )
+        else:
+            if self.topk == 1:
+                prefix_lens, seq_lens, last_loc = get_last_loc_large_page_size_top_k_1(
+                    batch.req_to_token_pool.req_to_token,
+                    batch.req_pool_indices,
+                    batch.seq_lens,
+                    self.speculative_num_steps,
+                )
+                extend_num_tokens = num_seqs * self.speculative_num_steps
+            else:
+                # In this case, the last partial page needs to be duplicated.
+                # KV cache layout in batch.req_to_token_pool.req_to_token:
+                #
+                # | -------- | -- xxxx .. | -- xxxx .. | -- xxxx .. |
+                #    prefix     top-k = 0    tok-k = 1    top-k = 2
+                #
+                #  "-" means prefix tokens
+                #  "x" means speculative draft tokens
+                #  "." means padded tokens
+
+                # TODO(lmzheng): The current implementation is still a fake support
+                # for page size > 1. In the `assign_draft_cache_locs` below,
+                # we directly move the indices instead of the real kv cache.
+                # This only works when the kernel backend runs with page size = 1.
+                # If the kernel backend runs with page size > 1, we need to
+                # duplicate the real KV cache. The overhead of duplicating KV
+                # cache seems okay because the draft KV cache only has one layer.
+                # see a related copy operation in MHATokenToKVPool::move_kv_cache.
+
+                (
+                    prefix_lens,
+                    seq_lens,
+                    last_loc,
+                    self.num_new_pages_per_topk,
+                    self.extend_lens,
+                ) = get_last_loc_large_page_size_large_top_k(
+                    batch.req_to_token_pool.req_to_token,
+                    batch.req_pool_indices,
+                    batch.seq_lens,
+                    self.speculative_num_steps,
+                    self.topk,
+                    self.page_size,
+                )
+
+                # TODO(lmzheng): remove this device sync
+                extend_num_tokens = torch.sum(self.extend_lens).item()
+
+            out_cache_loc, token_to_kv_pool_state_backup = (
+                batch.alloc_paged_token_slots_extend(
+                    prefix_lens,
+                    seq_lens,
+                    last_loc,
+                    extend_num_tokens,
+                    backup_state=True,
+                )
+            )
+
+        assign_draft_cache_locs[(num_seqs,)](
+            batch.req_pool_indices,
+            batch.req_to_token_pool.req_to_token,
+            batch.seq_lens,
+            self.extend_lens,
+            self.num_new_pages_per_topk,
+            out_cache_loc,
+            batch.req_to_token_pool.req_to_token.shape[1],
+            self.topk,
+            self.speculative_num_steps,
+            self.page_size,
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+        )
+
+        if self.page_size > 1 and self.topk > 1:
+            # Remove padded slots
+            out_cache_loc = out_cache_loc[
+                : num_seqs * self.topk * self.speculative_num_steps
+            ]
+
+        batch.out_cache_loc = out_cache_loc
+        batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
+        batch.return_hidden_states = False
+        spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
+        self.token_to_kv_pool_allocator.restore_state(token_to_kv_pool_state_backup)
+
+    def _draft_preprocess_idle(self, batch: ScheduleBatch):
+        batch.spec_info = EagleDraftInput.create_idle_input(
+            device=self.device,
+            hidden_size=self.model_config.hidden_size,
+            dtype=self.model_config.dtype,
+            topk=self.topk,
+            capture_hidden_mode=CaptureHiddenMode.LAST,
+        )
+
+    def draft(self, batch: ScheduleBatch):
+        # Parse args
+        if batch.forward_mode.is_idle():
+            self._draft_preprocess_idle(batch)
+        else:
+            self._draft_preprocess_decode(batch)
+
+        spec_info = batch.spec_info
+        assert isinstance(spec_info, EagleDraftInput)
+
+        spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+        spec_info.num_tokens_per_batch = self.topk
+        spec_info.num_tokens_for_logprob_per_batch = self.topk
+        batch.return_hidden_states = False
+
+        # Get forward batch
+        model_worker_batch = batch.get_model_worker_batch()
+        assert model_worker_batch.capture_hidden_mode == CaptureHiddenMode.LAST
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
+        can_cuda_graph = self.cuda_graph_runner and self.cuda_graph_runner.can_run(
+            forward_batch
+        )
+        if can_cuda_graph:
+            score_list, token_list, parents_list = self.cuda_graph_runner.replay(
+                forward_batch
+            )
+        else:
+            forward_batch.can_run_dp_cuda_graph = False
+            if not forward_batch.forward_mode.is_idle():
+                # Initialize attention backend
+                self.draft_attn_backend.init_forward_metadata(forward_batch)
+            # Run forward steps
+            score_list, token_list, parents_list = self.draft_forward(forward_batch)
+
+        if batch.forward_mode.is_idle():
+            return EagleVerifyInput.create_idle_input(
+                self.topk,
+                self.speculative_num_steps,
+                self.speculative_num_draft_tokens,
+            )
+
+        (
+            tree_mask,
+            position,
+            retrive_index,
+            retrive_next_token,
+            retrive_next_sibling,
+            draft_tokens,
+        ) = build_tree_kernel_efficient(
+            spec_info.verified_id,
+            score_list,
+            token_list,
+            parents_list,
+            batch.seq_lens,
+            batch.seq_lens_sum,
+            self.topk,
+            self.speculative_num_steps,
+            self.speculative_num_draft_tokens,
+        )
+
+        return EagleVerifyInput(
+            draft_token=draft_tokens,
+            custom_mask=tree_mask,
+            positions=position,
+            retrive_index=retrive_index,
+            retrive_next_token=retrive_next_token,
+            retrive_next_sibling=retrive_next_sibling,
+            retrive_cum_len=None,
+            spec_steps=self.speculative_num_steps,
+            topk=self.topk,
+            draft_token_num=self.server_args.speculative_num_draft_tokens,
+            capture_hidden_mode=CaptureHiddenMode.FULL,
+            seq_lens_sum=forward_batch.seq_lens_sum,
+            seq_lens_cpu=forward_batch.seq_lens_cpu,
+        )
+
+    def draft_forward(self, forward_batch: ForwardBatch):
+        # Parse args
+        spec_info = forward_batch.spec_info
+        assert isinstance(spec_info, EagleDraftInput)
+        out_cache_loc = forward_batch.out_cache_loc
+        topk_p, topk_index, hidden_states = (
+            spec_info.topk_p,
+            spec_info.topk_index,
+            spec_info.hidden_states,
+        )
+        if self.hot_token_id is not None:
+            topk_index = self.hot_token_id[topk_index]
+
+        out_cache_loc = out_cache_loc.reshape(
+            forward_batch.batch_size, self.topk, self.speculative_num_steps
+        )
+        out_cache_loc = out_cache_loc.permute((2, 0, 1)).reshape(
+            self.speculative_num_steps, -1
+        )
+
+        # Return values
+        score_list: List[torch.Tensor] = []
+        token_list: List[torch.Tensor] = []
+        parents_list: List[torch.Tensor] = []
+
+        # Forward multiple steps
+        scores = None
+        for i in range(self.speculative_num_steps):
+            input_ids, hidden_states, scores, tree_info = select_top_k_tokens(
+                i, topk_p, topk_index, hidden_states, scores, self.topk
+            )
+            score_list.append(tree_info[0])
+            token_list.append(tree_info[1])
+            parents_list.append(tree_info[2])
+
+            # We don't need to run the last forward. we get 1 token from draft prefill and (#spec steps - 1) tokens here
+            if i == self.speculative_num_steps - 1:
+                break
+
+            # Set inputs
+            forward_batch.input_ids = input_ids
+            # This is a temporary fix for the case that the user is using standalone
+            # speculative decoding and the draft model architecture is gpt-oss. gpt-oss
+            # rope kernel needs cache_loc to be contiguous.
+            if (
+                self.server_args.speculative_algorithm == "STANDALONE"
+                and self.model_config.hf_config.architectures[0] == "GptOssForCausalLM"
+            ):
+                out_cache_loc = out_cache_loc.contiguous()
+            forward_batch.out_cache_loc = out_cache_loc[i]
+            forward_batch.positions.add_(1)
+            forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
+            spec_info.hidden_states = hidden_states
+
+            # Run forward
+            logits_output, _ = self.draft_model_runner.forward(
+                forward_batch, skip_attn_backend_init=True
+            )
+            self._detect_nan_if_needed(logits_output)
+            probs = torch.softmax(logits_output.next_token_logits, dim=-1)
+            topk_p, topk_index = fast_topk(probs, self.topk, dim=-1)
+            if self.hot_token_id is not None:
+                topk_index = self.hot_token_id[topk_index]
+            hidden_states = logits_output.hidden_states
+
+        return score_list, token_list, parents_list
+
+    def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
+        spec_info.prepare_for_verify(batch, self.page_size)
+        batch.return_hidden_states = False
+        batch.forward_mode = (
+            ForwardMode.TARGET_VERIFY
+            if not batch.forward_mode.is_idle()
+            else ForwardMode.IDLE
+        )
+        batch.spec_info = spec_info
+
+        model_worker_batch = batch.get_model_worker_batch(
+            seq_lens_cpu_cache=spec_info.seq_lens_cpu
+        )
+        assert model_worker_batch.capture_hidden_mode == spec_info.capture_hidden_mode
+
+        if batch.has_grammar:
+            retrieve_next_token_cpu = spec_info.retrive_next_token.cpu()
+            retrieve_next_sibling_cpu = spec_info.retrive_next_sibling.cpu()
+            draft_tokens_cpu = spec_info.draft_token.view(
+                spec_info.retrive_next_token.shape
+            ).cpu()
+
+        # Forward
+        logits_output, _, can_run_cuda_graph = (
+            self.target_worker.forward_batch_generation(
+                model_worker_batch, skip_sample=True
+            )
+        )
+
+        vocab_mask = None
+        if batch.has_grammar:
+            # Generate the logit mask for structured output.
+            # Overlap the CPU operations for bitmask generation with the forward pass.
+            vocab_mask = generate_token_bitmask(
+                batch.reqs,
+                spec_info,
+                retrieve_next_token_cpu,
+                retrieve_next_sibling_cpu,
+                draft_tokens_cpu,
+                batch.sampling_info.vocab_size,
+            )
+
+            if vocab_mask is not None:
+                assert spec_info.grammar is not None
+                vocab_mask = vocab_mask.to(spec_info.retrive_next_token.device)
+                # NOTE (sk): otherwise, this vocab mask will be the one from the previous extend stage
+                # and will be applied to produce wrong results
+                batch.sampling_info.vocab_mask = None
+
+        self._detect_nan_if_needed(logits_output)
+        spec_info.hidden_states = logits_output.hidden_states
+        res: EagleVerifyOutput = spec_info.verify(
+            batch,
+            logits_output,
+            self.token_to_kv_pool_allocator,
+            self.page_size,
+            vocab_mask,
+        )
+
+        # Post process based on verified outputs.
+        # Pick indices that we care (accepted)
+        logits_output.next_token_logits = logits_output.next_token_logits[
+            res.accepted_indices
+        ]
+        logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
+
+        # QQ: can be optimized
+        if self.target_worker.model_runner.is_hybrid_gdn:
+            # res.draft_input.accept_length is on GPU but may be empty for last verify?
+            accepted_length = (
+                torch.tensor(
+                    res.accept_length_per_req_cpu,
+                    device=logits_output.hidden_states.device,
+                    dtype=torch.int32,
+                )
+                + 1
+            )
+            self.target_worker.model_runner.attn_backend.update_mamba_state_after_mtp_verify(
+                accepted_length, self.target_worker.model_runner.model
+            )
+
+        if batch.return_logprob:
+            self.add_logprob_values(batch, res, logits_output)
+
+        # Prepare the batch for the next draft forwards.
+        batch.forward_mode = (
+            ForwardMode.DECODE if not batch.forward_mode.is_idle() else ForwardMode.IDLE
+        )
+        batch.spec_info = res.draft_input
+
+        return logits_output, res, model_worker_batch, can_run_cuda_graph
+
+    def add_logprob_values(
+        self,
+        batch: ScheduleBatch,
+        res: EagleVerifyOutput,
+        logits_output: LogitsProcessorOutput,
+    ):
+        # Extract args
+        logits_output = res.logits_output
+        top_logprobs_nums = batch.top_logprobs_nums
+        token_ids_logprobs = batch.token_ids_logprobs
+        accepted_indices = res.accepted_indices
+        assert len(accepted_indices) == len(logits_output.next_token_logits)
+
+        temperatures = batch.sampling_info.temperatures
+        num_draft_tokens = batch.spec_info.draft_token_num
+        # acceptance indices are the indices in a "flattened" batch.
+        # dividing it to num_draft_tokens will yield the actual batch index.
+        temperatures = temperatures[accepted_indices // num_draft_tokens]
+        if RETURN_ORIGINAL_LOGPROB:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits, dim=-1
+            )
+        else:
+            logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits / temperatures, dim=-1
+            )
+        batch_next_token_ids = res.verified_id
+        num_tokens_per_req = [accept + 1 for accept in res.accept_length_per_req_cpu]
+
+        # We should repeat top_logprobs_nums to match num_tokens_per_req.
+        top_logprobs_nums_repeat_interleaved = []
+        token_ids_logprobs_repeat_interleaved = []
+        for num, num_tokens in zip(top_logprobs_nums, num_tokens_per_req):
+            top_logprobs_nums_repeat_interleaved.extend([num] * num_tokens)
+        for token_ids, num_tokens in zip(token_ids_logprobs, num_tokens_per_req):
+            token_ids_logprobs_repeat_interleaved.extend([token_ids] * num_tokens)
+
+        # Extract logprobs
+        if any(x > 0 for x in top_logprobs_nums):
+            (
+                logits_output.next_token_top_logprobs_val,
+                logits_output.next_token_top_logprobs_idx,
+            ) = get_top_logprobs(
+                logprobs,
+                top_logprobs_nums_repeat_interleaved,
+            )
+
+        if any(x is not None for x in token_ids_logprobs):
+            (
+                logits_output.next_token_token_ids_logprobs_val,
+                logits_output.next_token_token_ids_logprobs_idx,
+            ) = get_token_ids_logprobs(
+                logprobs,
+                token_ids_logprobs_repeat_interleaved,
+            )
+
+        logits_output.next_token_logprobs = logprobs[
+            torch.arange(len(batch_next_token_ids), device=batch.sampling_info.device),
+            batch_next_token_ids,
+        ]
+
+        # Add output logprobs to the request
+        pt = 0
+        next_token_logprobs = logits_output.next_token_logprobs.tolist()
+        verified_ids = batch_next_token_ids.tolist()
+        for req, num_tokens in zip(batch.reqs, num_tokens_per_req, strict=True):
+            for _ in range(num_tokens):
+                if req.return_logprob:
+                    req.output_token_logprobs_val.append(next_token_logprobs[pt])
+                    req.output_token_logprobs_idx.append(verified_ids[pt])
+                    if req.top_logprobs_num > 0:
+                        req.output_top_logprobs_val.append(
+                            res.logits_output.next_token_top_logprobs_val[pt]
+                        )
+                        req.output_top_logprobs_idx.append(
+                            res.logits_output.next_token_top_logprobs_idx[pt]
+                        )
+                pt += 1
+
+    def forward_draft_extend(
+        self,
+        batch: ScheduleBatch,
+        hidden_states: torch.Tensor,
+        next_token_ids: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Run draft model extend. This API modifies the states of the batch.
+
+        Args:
+            batch: The batch to run.
+            hidden_states: Hidden states from the target model forward
+            next_token_ids: Next token ids generated from the target forward.
+        """
+        batch.spec_info = EagleDraftInput(
+            hidden_states=hidden_states,
+            verified_id=next_token_ids,
+            num_tokens_per_batch=1,
+            num_tokens_for_logprob_per_batch=1,
+        )
+        batch.return_hidden_states = False
+        batch.spec_info.prepare_for_extend(batch)
+        batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+        model_worker_batch = batch.get_model_worker_batch(
+            seq_lens_cpu_cache=seq_lens_cpu
+        )
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
+        forward_batch.return_logprob = False
+        logits_output, _ = self.draft_model_runner.forward(forward_batch)
+        self._detect_nan_if_needed(logits_output)
+        assert isinstance(forward_batch.spec_info, EagleDraftInput)
+        assert forward_batch.spec_info is batch.spec_info
+        self.capture_for_decode(logits_output, forward_batch.spec_info)
+        has_finished, unfinished_req_index = False, []
+        for i, req in enumerate(batch.reqs):
+            if req.finished():
+                has_finished = True
+            else:
+                unfinished_req_index.append(i)
+        if has_finished:
+            unfinished_index_device = torch.tensor(
+                unfinished_req_index,
+                dtype=torch.int64,
+                device=batch.spec_info.topk_p.device,
+            )
+            batch.spec_info.filter_batch(
+                unfinished_index_device, has_been_filtered=False
+            )
+
+    def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
+        assert isinstance(batch.spec_info, EagleDraftInput)
+        # Backup fields that will be modified in-place
+        seq_lens_backup = batch.seq_lens.clone()
+        req_pool_indices_backup = batch.req_pool_indices
+        accept_length_backup = batch.spec_info.accept_length
+        return_logprob_backup = batch.return_logprob
+
+        input_is_idle = batch.forward_mode.is_idle()
+
+        if not input_is_idle and batch.spec_info.verified_id.numel() == 0:
+            batch = batch.copy()
+            batch.prepare_for_idle()
+            hidden_size = (
+                self.model_config.hidden_size * 3
+                if self.speculative_algorithm.is_eagle3()
+                else self.model_config.hidden_size
+            )
+            batch.spec_info = EagleDraftInput.create_idle_input(
+                device=self.device,
+                hidden_size=hidden_size,
+                dtype=self.model_config.dtype,
+                topk=self.topk,
+                capture_hidden_mode=CaptureHiddenMode.LAST,
+            )
+
+        batch.spec_info.num_tokens_per_batch = self.speculative_num_steps + 1
+        batch.spec_info.num_tokens_for_logprob_per_batch = 1
+        batch.spec_info.prepare_extend_after_decode(
+            batch,
+            self.speculative_num_steps,
+        )
+        batch.forward_mode = (
+            ForwardMode.DRAFT_EXTEND
+            if not batch.forward_mode.is_idle()
+            else ForwardMode.IDLE
+        )
+
+        batch.return_hidden_states = False
+        model_worker_batch = batch.get_model_worker_batch()
+        assert model_worker_batch.capture_hidden_mode == CaptureHiddenMode.LAST
+        forward_batch = ForwardBatch.init_new(
+            model_worker_batch, self.draft_model_runner
+        )
+        if forward_batch.seq_lens_cpu is not None:
+            forward_batch.seq_lens_sum = forward_batch.seq_lens_cpu.sum().item()
+        else:
+            forward_batch.seq_lens_sum = batch.seq_lens.sum().item()
+
+        # Run
+        can_cuda_graph = (
+            self.cuda_graph_runner_for_draft_extend
+            and self.cuda_graph_runner_for_draft_extend.can_run(forward_batch)
+        )
+        if can_cuda_graph:
+            logits_output = self.cuda_graph_runner_for_draft_extend.replay(
+                forward_batch
+            )
+            forward_batch.spec_info.topk_p, forward_batch.spec_info.topk_index = (
+                logits_output.topk_p,
+                logits_output.topk_index,
+            )
+            forward_batch.spec_info.hidden_states = logits_output.hidden_states
+        else:
+            forward_batch.can_run_dp_cuda_graph = False
+            if not forward_batch.forward_mode.is_idle():
+                self.draft_model_runner.attn_backend.init_forward_metadata(
+                    forward_batch
+                )
+            logits_output, _ = self.draft_model_runner.forward(
+                forward_batch, skip_attn_backend_init=True
+            )
+            self.capture_for_decode(logits_output, forward_batch.spec_info)
+
+        self._detect_nan_if_needed(logits_output)
+
+        # Restore backup.
+        # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
+        batch.forward_mode = (
+            ForwardMode.DECODE if not input_is_idle else ForwardMode.IDLE
+        )
+        batch.seq_lens = seq_lens_backup
+        batch.req_pool_indices = req_pool_indices_backup
+        batch.spec_info.accept_length = accept_length_backup
+        batch.return_logprob = return_logprob_backup
+
+    def capture_for_decode(
+        self, logits_output: LogitsProcessorOutput, draft_input: EagleDraftInput
+    ):
+        probs = torch.softmax(logits_output.next_token_logits, dim=-1)
+        draft_input.topk_p, draft_input.topk_index = fast_topk(probs, self.topk, dim=-1)
+        draft_input.hidden_states = logits_output.hidden_states
+
+    def _detect_nan_if_needed(self, logits_output: LogitsProcessorOutput):
+        if self.enable_nan_detection:
+            logits = logits_output.next_token_logits
+            if torch.any(torch.isnan(logits)):
+                logger.error("Detected errors during sampling! NaN in the logits.")
+                raise ValueError("Detected errors during sampling! NaN in the logits.")
+
+
+def load_token_map(token_map_path: str) -> List[int]:
+    if not os.path.exists(token_map_path):
+        cache_dir = snapshot_download(
+            os.path.dirname(token_map_path),
+            ignore_patterns=["*.bin", "*.safetensors"],
+        )
+        token_map_path = os.path.join(cache_dir, os.path.basename(token_map_path))
+    hot_token_id = torch.load(token_map_path, weights_only=True)
+    return torch.tensor(hot_token_id, dtype=torch.int64)
+
+
+@torch.compile(dynamic=True)
+def get_last_loc_large_page_size_top_k_1(
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    seq_lens,
+    speculative_num_steps: int,
+):
+    prefix_lens = seq_lens
+    seq_lens = prefix_lens + speculative_num_steps
+    last_loc = get_last_loc(
+        req_to_token,
+        req_pool_indices,
+        prefix_lens,
+    )
+    return prefix_lens, seq_lens, last_loc
+
+
+# Disable torch.compile for this function because it will be
+# even slower.
+# @torch.compile(dynamic=True)
+def get_last_loc_large_page_size_large_top_k(
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    seq_lens: torch.Tensor,
+    speculative_num_steps: int,
+    topk: int,
+    page_size: int,
+):
+    prefix_lens = seq_lens
+    last_page_lens = prefix_lens % page_size
+    num_new_pages_per_topk = (
+        last_page_lens + speculative_num_steps + page_size - 1
+    ) // page_size
+    seq_lens = prefix_lens // page_size * page_size + num_new_pages_per_topk * (
+        page_size * topk
+    )
+    extend_lens = seq_lens - prefix_lens
+    last_loc = get_last_loc(
+        req_to_token,
+        req_pool_indices,
+        prefix_lens,
+    )
+
+    return prefix_lens, seq_lens, last_loc, num_new_pages_per_topk, extend_lens
diff --git a/python/sglang/srt/speculative/spec_info.py b/python/sglang/srt/speculative/spec_info.py
new file mode 100644
index 000000000..a80963471
--- /dev/null
+++ b/python/sglang/srt/speculative/spec_info.py
@@ -0,0 +1,32 @@
+from enum import IntEnum, auto
+
+
+class SpeculativeAlgorithm(IntEnum):
+    NONE = auto()
+    EAGLE = auto()
+    EAGLE3 = auto()
+    STANDALONE = auto()
+
+    def is_none(self):
+        return self == SpeculativeAlgorithm.NONE
+
+    def is_eagle(self):
+        return self == SpeculativeAlgorithm.EAGLE or self == SpeculativeAlgorithm.EAGLE3
+
+    def is_eagle3(self):
+        return self == SpeculativeAlgorithm.EAGLE3
+
+    def is_standalone(self):
+        return self == SpeculativeAlgorithm.STANDALONE
+
+    @staticmethod
+    def from_string(name: str):
+        name_map = {
+            "EAGLE": SpeculativeAlgorithm.EAGLE,
+            "EAGLE3": SpeculativeAlgorithm.EAGLE3,
+            "STANDALONE": SpeculativeAlgorithm.STANDALONE,
+            None: SpeculativeAlgorithm.NONE,
+        }
+        if name is not None:
+            name = name.upper()
+        return name_map[name]
diff --git a/python/sglang/srt/speculative/standalone_worker.py b/python/sglang/srt/speculative/standalone_worker.py
new file mode 100644
index 000000000..b6004ea01
--- /dev/null
+++ b/python/sglang/srt/speculative/standalone_worker.py
@@ -0,0 +1,109 @@
+import logging
+from contextlib import contextmanager
+from typing import Optional
+
+import torch
+
+from sglang.srt.distributed import GroupCoordinator, patch_tensor_parallel_group
+from sglang.srt.managers.tp_worker import TpModelWorker
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.eagle_worker import EAGLEWorker, load_token_map
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import empty_context, get_bool_env_var, is_cuda
+
+if is_cuda():
+    from sgl_kernel import segment_packbits
+
+logger = logging.getLogger(__name__)
+RETURN_ORIGINAL_LOGPROB = get_bool_env_var("RETURN_ORIGINAL_LOGPROB")
+
+
+@contextmanager
+def draft_tp_context(tp_group: GroupCoordinator):
+    # Draft model doesn't use dp and has its own tp group.
+    # We disable mscclpp now because it doesn't support 2 comm groups.
+    with patch_tensor_parallel_group(tp_group):
+        yield
+
+
+class StandaloneWorker(EAGLEWorker):
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        dp_rank: Optional[int],
+        moe_ep_rank: int,
+        nccl_port: int,
+        target_worker: TpModelWorker,
+    ):
+        # Parse arguments
+        self.server_args = server_args
+        self.topk = server_args.speculative_eagle_topk
+        self.speculative_num_steps = server_args.speculative_num_steps
+        self.speculative_num_draft_tokens = server_args.speculative_num_draft_tokens
+        self.enable_nan_detection = server_args.enable_nan_detection
+        self.gpu_id = gpu_id
+        self.device = server_args.device
+        self.target_worker = target_worker
+        self.page_size = server_args.page_size
+        self.speculative_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
+        self.padded_static_len = -1
+
+        # Override the context length of the draft model to be the same as the target model.
+        server_args.context_length = target_worker.model_runner.model_config.context_len
+
+        # Do not capture cuda graph in `super().__init__()`
+        # It will be captured later.
+        backup_disable_cuda_graph = server_args.disable_cuda_graph
+        server_args.disable_cuda_graph = True
+        # Share the allocator with a target worker.
+        # Draft and target worker own their own KV cache pools.
+        self.req_to_token_pool, self.token_to_kv_pool_allocator = (
+            target_worker.get_memory_pool()
+        )
+
+        # Load hot token ids
+        if server_args.speculative_token_map is not None:
+            self.hot_token_id = load_token_map(server_args.speculative_token_map)
+            server_args.json_model_override_args = (
+                f'{{"hot_vocab_size": {len(self.hot_token_id)}}}'
+            )
+        else:
+            self.hot_token_id = None
+
+        # Init draft worker
+        with empty_context():
+            TpModelWorker.__init__(
+                self,
+                server_args=server_args,
+                gpu_id=gpu_id,
+                tp_rank=tp_rank,
+                pp_rank=0,  # FIXME
+                dp_rank=dp_rank,
+                moe_ep_rank=moe_ep_rank,
+                nccl_port=nccl_port,
+                is_draft_worker=True,
+                req_to_token_pool=self.req_to_token_pool,
+                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+            )
+
+        # Init attention backend and cuda graphs
+        self.draft_model_runner.server_args.disable_cuda_graph = (
+            backup_disable_cuda_graph
+        )
+        self.draft_tp_context = (
+            draft_tp_context if server_args.enable_dp_attention else empty_context
+        )
+        with self.draft_tp_context(self.draft_model_runner.tp_group):
+            self.init_attention_backend()
+            self.init_cuda_graphs()
+
+        # Some dummy tensors
+        self.num_new_pages_per_topk = torch.empty(
+            (), dtype=torch.int64, device=self.device
+        )
+        self.extend_lens = torch.empty((), dtype=torch.int64, device=self.device)
diff --git a/python/sglang/srt/tokenizer/tiktoken_tokenizer.py b/python/sglang/srt/tokenizer/tiktoken_tokenizer.py
new file mode 100644
index 000000000..98df443e5
--- /dev/null
+++ b/python/sglang/srt/tokenizer/tiktoken_tokenizer.py
@@ -0,0 +1,166 @@
+import functools
+import json
+from typing import AbstractSet, Collection, List, Literal, Union
+
+
+class TiktokenProcessor:
+    def __init__(self, name: str):
+        self.tokenizer = TiktokenTokenizer(name)
+
+    def image_processor(self, image):
+        return {"pixel_values": [image]}
+
+
+RESERVED_TOKEN_TEXTS = [f"<|reserved_{i}|>" for i in range(3, 128)]
+CONTROL_TOKEN_TEXTS = [f"<|control{i}|>" for i in range(1, 705)]
+
+
+PAD = "<|pad|>"
+EOS = "<|eos|>"
+SEP = "<|separator|>"
+
+DEFAULT_SPECIAL_TOKENS = [PAD, SEP, EOS]
+DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
+
+# default + separate each single digit
+PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+
+
+class TiktokenTokenizer:
+    def __init__(self, tokenizer_path):
+        import tiktoken
+        from jinja2 import Template
+
+        # Read the JSON
+        with open(tokenizer_path, "rb") as fin:
+            xtok_dict = json.load(fin)
+
+        # Copy from train/xlm/tokenizers/tiktoken_wrapper.py::Encoding::from_xtok_dict
+        mergeable_ranks = {
+            bytes(item["bytes"]): item["token"] for item in xtok_dict["regular_tokens"]
+        }
+        special_tokens = {
+            bytes(item["bytes"]).decode(): item["token"]
+            for item in xtok_dict["special_tokens"]
+        }
+        if xtok_dict["word_split"] == "V1":
+            pad_str = PAT_STR_B
+        else:
+            assert False, f"Unknown word_split: {xtok_dict['word_split']}"
+        pad_str = xtok_dict.get("pat_str", pad_str)
+
+        kwargs = {
+            "name": tokenizer_path,
+            "pat_str": pad_str,
+            "mergeable_ranks": mergeable_ranks,
+            "special_tokens": special_tokens,
+        }
+        if "default_allowed_special" in xtok_dict:
+            default_allowed_special = set(
+                [
+                    bytes(bytes_list).decode()
+                    for bytes_list in xtok_dict["default_allowed_special"]
+                ]
+            )
+        if "vocab_size" in xtok_dict:
+            kwargs["explicit_n_vocab"] = xtok_dict["vocab_size"]
+
+        # Copy from train/xlm/tokenizers/tiktoken_wrapper.py::Encoding::__init__
+        default_allowed_special = None
+        control_tokens = DEFAULT_CONTROL_TOKENS
+        tokenizer = tiktoken.Encoding(**kwargs)
+        tokenizer._default_allowed_special = default_allowed_special or set()
+        tokenizer._control_tokens = control_tokens
+
+        def encode_patched(
+            self,
+            text: str,
+            *,
+            allowed_special: Union[
+                Literal["all"], AbstractSet[str]
+            ] = set(),  # noqa: B006
+            disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        ) -> List[int]:
+            if isinstance(allowed_special, set):
+                allowed_special |= self._default_allowed_special
+            return tiktoken.Encoding.encode(
+                self,
+                text,
+                allowed_special=allowed_special,
+                disallowed_special=(),
+            )
+
+        tokenizer.encode = functools.partial(encode_patched, tokenizer)
+
+        # Allow more tokens to prevent crash
+        tokenizer._default_allowed_special |= set(DEFAULT_CONTROL_TOKENS.values())
+        tokenizer._default_allowed_special |= set(
+            CONTROL_TOKEN_TEXTS + RESERVED_TOKEN_TEXTS
+        )
+
+        # Convert to HF interface
+        self.tokenizer = tokenizer
+        self.bos_token_id = None
+        self.eos_token_id = tokenizer._special_tokens[EOS]
+        self.vocab_size = tokenizer.n_vocab
+        self.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: '  + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+        self.chat_template_jinja = Template(self.chat_template)
+        self.additional_stop_token_ids = None
+
+    def encode(self, x, add_special_tokens=False):
+        return self.tokenizer.encode(x)
+
+    def decode(self, x, *args, **kwargs):
+        return self.tokenizer.decode(x)
+
+    def batch_decode(
+        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
+    ):
+        if len(batch) > 0 and isinstance(batch[0], int):
+            batch = [[x] for x in batch]
+        return self.tokenizer.decode_batch(batch)
+
+    def apply_chat_template(
+        self,
+        messages,
+        tokenize,
+        add_generation_prompt,
+        tools=None,
+        reasoning_effort=None,
+    ):
+        ret = self.chat_template_jinja.render(
+            messages=messages, add_generation_prompt=add_generation_prompt
+        )
+        return self.encode(ret) if tokenize else ret
+
+    def __call__(self, text, **kwargs):
+        return {
+            "input_ids": self.encode(text),
+        }
+
+    def init_xgrammar(self):
+        from xgrammar import TokenizerInfo
+
+        XGRAMMAR_SPECIAL_TOKEN_TEMPLATE = "<|xg_special_token_{}|>"
+
+        enc = self.tokenizer
+        encoded_vocab = {**enc._mergeable_ranks, **enc._special_tokens}
+        encoded_vocab = [
+            token for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1])
+        ]
+        override_stop_tokens = [2]  # eos
+        # These are treated as special tokens in xgrammar; we want to avoid them
+        # For now, xgrammar treats anything starting with b'\x00' as a special token
+        xgrammar_special_token_ids = []
+        for i, token in enumerate(encoded_vocab):
+            if isinstance(token, bytes) and token.startswith(b"\x00"):
+                xgrammar_special_token_ids.append(i)
+
+        for i, id in enumerate(xgrammar_special_token_ids):
+            encoded_vocab[id] = XGRAMMAR_SPECIAL_TOKEN_TEMPLATE.format(i)
+        tokenizer_info = TokenizerInfo(
+            encoded_vocab, stop_token_ids=override_stop_tokens
+        )
+        assert len(tokenizer_info.special_token_ids) == 0
+
+        return tokenizer_info, override_stop_tokens
diff --git a/python/sglang/srt/torch_memory_saver_adapter.py b/python/sglang/srt/torch_memory_saver_adapter.py
new file mode 100644
index 000000000..a46151782
--- /dev/null
+++ b/python/sglang/srt/torch_memory_saver_adapter.py
@@ -0,0 +1,94 @@
+import logging
+import threading
+import time
+from abc import ABC
+from contextlib import contextmanager, nullcontext
+
+try:
+    import torch_memory_saver
+
+    _memory_saver = torch_memory_saver.torch_memory_saver
+    import_error = None
+except ImportError as e:
+    import_error = e
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+class TorchMemorySaverAdapter(ABC):
+    @staticmethod
+    def create(enable: bool):
+        if enable and import_error is not None:
+            logger.warning(
+                "enable_memory_saver is enabled, but "
+                "torch-memory-saver is not installed. Please install it "
+                "via `pip3 install torch-memory-saver`. "
+            )
+            raise import_error
+        return (
+            _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
+        )
+
+    def check_validity(self, caller_name):
+        if not self.enabled:
+            logger.warning(
+                f"`{caller_name}` will not save memory because torch_memory_saver is not enabled. "
+                f"Potential causes: `enable_memory_saver` is false, or torch_memory_saver has installation issues."
+            )
+
+    def configure_subprocess(self):
+        raise NotImplementedError
+
+    def region(self, tag: str):
+        raise NotImplementedError
+
+    def pause(self, tag: str):
+        raise NotImplementedError
+
+    def resume(self, tag: str):
+        raise NotImplementedError
+
+    @property
+    def enabled(self):
+        raise NotImplementedError
+
+
+class _TorchMemorySaverAdapterReal(TorchMemorySaverAdapter):
+    """Adapter for TorchMemorySaver with tag-based control"""
+
+    def configure_subprocess(self):
+        return torch_memory_saver.configure_subprocess()
+
+    def region(self, tag: str):
+        return _memory_saver.region(tag=tag)
+
+    def pause(self, tag: str):
+        return _memory_saver.pause(tag=tag)
+
+    def resume(self, tag: str):
+        return _memory_saver.resume(tag=tag)
+
+    @property
+    def enabled(self):
+        return _memory_saver is not None and _memory_saver.enabled
+
+
+class _TorchMemorySaverAdapterNoop(TorchMemorySaverAdapter):
+    @contextmanager
+    def configure_subprocess(self):
+        yield
+
+    @contextmanager
+    def region(self, tag: str):
+        yield
+
+    def pause(self, tag: str):
+        pass
+
+    def resume(self, tag: str):
+        pass
+
+    @property
+    def enabled(self):
+        return False
diff --git a/python/sglang/srt/two_batch_overlap.py b/python/sglang/srt/two_batch_overlap.py
new file mode 100644
index 000000000..e02bc1fd2
--- /dev/null
+++ b/python/sglang/srt/two_batch_overlap.py
@@ -0,0 +1,989 @@
+from __future__ import annotations
+
+import copy
+import dataclasses
+import logging
+from dataclasses import replace
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Union
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.communicator import (
+    CommunicateContext,
+    CommunicateSummableTensorPairFn,
+    ScatterMode,
+)
+from sglang.srt.layers.moe import (
+    get_deepep_mode,
+    get_moe_a2a_backend,
+    get_tbo_token_distribution_threshold,
+    is_tbo_enabled,
+)
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import (
+    ForwardBatch,
+    ForwardMode,
+    compute_position,
+)
+from sglang.srt.operations import execute_operations, execute_overlapped_operations
+from sglang.srt.operations_strategy import OperationsStrategy
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+from sglang.srt.utils import BumpAllocator, get_bool_env_var, is_hip
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
+
+_is_hip = is_hip()
+
+_tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
+
+logger = logging.getLogger(__name__)
+
+
+# -------------------------------- Compute Basic Info ---------------------------------------
+
+
+def get_token_num_per_seq(
+    forward_mode: ForwardMode,
+    spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]] = None,
+):
+    if forward_mode.is_target_verify():
+        return spec_info.draft_token_num
+    elif forward_mode.is_decode():
+        return 1
+    elif forward_mode.is_idle():
+        return 0
+    else:
+        # For extend, we should not use `token_num_per_seq`.
+        return None
+
+
+# TODO: may smartly disable TBO when batch size is too small b/c it will slow down
+def compute_split_seq_index(
+    forward_mode: "ForwardMode",
+    num_tokens: int,
+    extend_lens: Optional[Sequence[int]],
+    token_num_per_seq: Optional[int],
+) -> Optional[int]:
+    if forward_mode == ForwardMode.EXTEND:
+        assert extend_lens is not None
+        return _split_extend_seqs(extend_lens)
+    elif forward_mode.is_target_verify() or forward_mode.is_decode():
+        assert token_num_per_seq is not None
+        return (num_tokens // token_num_per_seq) // 2
+    elif forward_mode.is_idle():
+        assert num_tokens == 0
+        return 0
+    else:
+        raise NotImplementedError()
+
+
+def _is_two_chunk_split_enabled(extend_lens: Sequence[int]) -> bool:
+    if extend_lens is None:
+        return False
+
+    vanilla_split_seq_index = _split_array_by_balanced_sum(extend_lens)
+    left_sum = sum(extend_lens[:vanilla_split_seq_index])
+    overall_sum = sum(extend_lens)
+    threshold = get_tbo_token_distribution_threshold()
+    assert threshold <= 0.5, f"{threshold=}"
+    return left_sum < overall_sum * threshold or left_sum > overall_sum * (
+        1 - threshold
+    )
+
+
+def _split_extend_seqs(arr: Sequence[int]) -> int:
+    if _is_two_chunk_split_enabled(arr):
+        return _split_array_by_cum_less_than_half(arr)
+
+    return _split_array_by_balanced_sum(arr)
+
+
+def _split_array_by_cum_less_than_half(arr: Sequence[int]) -> int:
+    left_sum = 0
+    overall_sum = sum(arr)
+    half_sum = overall_sum // 2
+    chosen_index = 0
+
+    for i in range(len(arr)):
+        left_sum += arr[i]
+        if left_sum > half_sum:
+            chosen_index = i
+            break
+
+    return chosen_index
+
+
+def _split_array_by_balanced_sum(arr: Sequence[int]) -> int:
+    overall_sum = sum(arr)
+    left_sum = 0
+    min_diff = float("inf")
+    best_index = 0
+
+    for i in range(1, len(arr)):
+        left_sum += arr[i - 1]
+        right_sum = overall_sum - left_sum
+        diff = abs(left_sum - right_sum)
+        if diff <= min_diff:
+            min_diff = diff
+            best_index = i
+        else:
+            break
+
+    return best_index
+
+
+def _update_device_and_sum_field_from_cpu_field(
+    batch: ForwardBatch, cpu_field: str, device_field: str, sum_field: str = None
+):
+    cpu_value = getattr(batch, cpu_field, None)
+    old_device_value = getattr(batch, device_field, None)
+    if (
+        cpu_value is None
+        or old_device_value is None
+        or not (isinstance(cpu_value, torch.Tensor) or isinstance(cpu_value, list))
+    ):
+        return
+
+    new_device_value = (
+        cpu_value
+        if isinstance(cpu_value, torch.Tensor)
+        else torch.tensor(cpu_value, dtype=old_device_value.dtype)
+    ).to(device=global_server_args_dict["device"], non_blocking=True)
+    setattr(batch, device_field, new_device_value)
+
+    if sum_field is not None:
+        sum_value = (
+            cpu_value.sum().item()
+            if isinstance(cpu_value, torch.Tensor)
+            else sum(cpu_value)
+        )
+        setattr(batch, sum_field, sum_value)
+
+
+def _compute_mask_offset(seq_index: int, spec_info: Optional[EagleVerifyInput]) -> int:
+    if seq_index == 0:
+        return 0
+
+    offset = 0
+    max_seq_len = min(seq_index, spec_info.seq_lens_cpu.shape[0])
+    for i in range(max_seq_len):
+        offset += (
+            spec_info.seq_lens_cpu[i] + spec_info.draft_token_num
+        ) * spec_info.draft_token_num
+    return offset
+
+
+def split_spec_info(
+    spec_info: Optional[EagleVerifyInput],
+    start_seq_index: int,
+    end_seq_index: int,
+    start_token_index: int,
+    end_token_index: int,
+):
+    if spec_info is None:
+        return None
+    if spec_info.draft_token is not None:
+        draft_token = spec_info.draft_token[start_token_index:end_token_index]
+    else:
+        draft_token = None
+    if spec_info.custom_mask is not None and spec_info.draft_token is not None:
+        custom_mask_start = _compute_mask_offset(start_seq_index, spec_info)
+        if end_seq_index == spec_info.seq_lens_cpu.shape[0]:
+            custom_mask_end = spec_info.custom_mask.shape[0]
+        else:
+            custom_mask_end = _compute_mask_offset(end_seq_index, spec_info)
+
+        if custom_mask_end > custom_mask_start:
+            custom_mask = spec_info.custom_mask[custom_mask_start:custom_mask_end]
+        else:
+            custom_mask = spec_info.custom_mask
+    else:
+        custom_mask = spec_info.custom_mask
+    if spec_info.positions is not None:
+        positions = spec_info.positions[start_token_index:end_token_index]
+    else:
+        positions = None
+    if spec_info.retrive_index is not None:
+        retrive_index = spec_info.retrive_index[start_seq_index:end_seq_index]
+    else:
+        retrive_index = None
+    if spec_info.retrive_next_token is not None:
+        retrive_next_token = spec_info.retrive_next_token[start_seq_index:end_seq_index]
+    else:
+        retrive_next_token = None
+    if spec_info.retrive_next_sibling is not None:
+        retrive_next_sibling = spec_info.retrive_next_sibling[
+            start_seq_index:end_seq_index
+        ]
+    else:
+        retrive_next_sibling = None
+    if spec_info.retrive_cum_len is not None:
+        retrive_cum_len = spec_info.retrive_cum_len[start_seq_index:end_seq_index]
+    else:
+        retrive_cum_len = None
+
+    if spec_info.seq_lens_cpu is not None:
+        seq_lens_cpu = spec_info.seq_lens_cpu[start_seq_index:end_seq_index]
+    else:
+        seq_lens_cpu = None
+    if seq_lens_cpu is not None:
+        seq_lens_sum = seq_lens_cpu.sum()
+    else:
+        seq_lens_sum = None
+    output_spec_info = replace(
+        spec_info,
+        custom_mask=custom_mask,
+        draft_token=draft_token,
+        positions=positions,
+        retrive_index=retrive_index,
+        retrive_next_token=retrive_next_token,
+        retrive_next_sibling=retrive_next_sibling,
+        retrive_cum_len=retrive_cum_len,
+        seq_lens_cpu=seq_lens_cpu,
+        seq_lens_sum=seq_lens_sum,
+    )
+    return output_spec_info
+
+
+def compute_split_token_index(
+    split_seq_index: int,
+    forward_mode: "ForwardMode",
+    extend_seq_lens: Optional[Sequence[int]],
+    token_num_per_seq: Optional[int],
+) -> int:
+    if forward_mode == ForwardMode.EXTEND:
+        assert extend_seq_lens is not None
+        if _is_two_chunk_split_enabled(extend_seq_lens):
+            return sum(extend_seq_lens) // 2
+        return sum(extend_seq_lens[:split_seq_index])
+    elif forward_mode.is_target_verify() or forward_mode.is_decode():
+        assert token_num_per_seq is not None
+        return split_seq_index * token_num_per_seq
+    elif forward_mode.is_idle():
+        assert split_seq_index == 0
+        return 0
+    else:
+        raise NotImplementedError
+
+
+def compute_split_indices_for_cuda_graph_replay(
+    forward_mode: ForwardMode,
+    cuda_graph_num_tokens: int,
+    spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+):
+    forward_mode_for_tbo_split = (
+        forward_mode if forward_mode != ForwardMode.IDLE else ForwardMode.DECODE
+    )
+    token_num_per_seq = get_token_num_per_seq(
+        forward_mode=forward_mode, spec_info=spec_info
+    )
+    tbo_split_seq_index = compute_split_seq_index(
+        forward_mode=forward_mode_for_tbo_split,
+        num_tokens=cuda_graph_num_tokens,
+        extend_lens=None,
+        token_num_per_seq=token_num_per_seq,
+    )
+    tbo_split_token_index = compute_split_token_index(
+        split_seq_index=tbo_split_seq_index,
+        forward_mode=forward_mode_for_tbo_split,
+        extend_seq_lens=None,
+        token_num_per_seq=token_num_per_seq,
+    )
+    return tbo_split_seq_index, tbo_split_token_index
+
+
+# -------------------------------- Preparation ---------------------------------------
+
+
+class TboCudaGraphRunnerPlugin:
+    def __init__(self):
+        self._tbo_children_num_token_non_padded = torch.zeros((2,), dtype=torch.int32)
+
+    def capture_one_batch_size(self, batch: ForwardBatch, num_tokens: int):
+        if not is_tbo_enabled():
+            return
+        token_num_per_seq = get_token_num_per_seq(
+            forward_mode=batch.forward_mode, spec_info=batch.spec_info
+        )
+
+        batch.tbo_split_seq_index = compute_split_seq_index(
+            forward_mode=batch.forward_mode,
+            num_tokens=num_tokens,
+            extend_lens=None,
+            token_num_per_seq=token_num_per_seq,
+        )
+        # For simplicity, when two_batch_overlap is enabled, we only capture CUDA Graph for tbo=true
+        assert batch.tbo_split_seq_index is not None, f"{num_tokens=}"
+
+        self._tbo_children_num_token_non_padded[...] = (
+            TboForwardBatchPreparer.compute_tbo_children_num_token_non_padded(batch)
+        )
+
+        TboForwardBatchPreparer.prepare_raw(
+            batch,
+            tbo_children_num_token_non_padded=self._tbo_children_num_token_non_padded,
+        )
+
+    def replay_prepare(
+        self,
+        forward_mode: ForwardMode,
+        bs: int,
+        num_token_non_padded: int,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        token_num_per_seq = get_token_num_per_seq(
+            forward_mode=forward_mode, spec_info=spec_info
+        )
+        tbo_split_seq_index, tbo_split_token_index = (
+            compute_split_indices_for_cuda_graph_replay(
+                forward_mode=forward_mode,
+                cuda_graph_num_tokens=bs * token_num_per_seq,
+                spec_info=spec_info,
+            )
+        )
+
+        self._tbo_children_num_token_non_padded[...] = (
+            TboForwardBatchPreparer.compute_tbo_children_num_token_non_padded_raw(
+                tbo_split_token_index=tbo_split_token_index,
+                num_token_non_padded=num_token_non_padded,
+            )
+        )
+
+
+class TboDPAttentionPreparer:
+    def prepare_all_gather(
+        self,
+        local_batch: ScheduleBatch,
+    ):
+
+        deepep_mode = get_deepep_mode()
+        enable_deepep_moe = get_moe_a2a_backend().is_deepep()
+        enable_two_batch_overlap = is_tbo_enabled()
+
+        self.enable_two_batch_overlap = enable_two_batch_overlap
+
+        if local_batch is not None:
+            token_num_per_seq = get_token_num_per_seq(
+                forward_mode=local_batch.forward_mode, spec_info=local_batch.spec_info
+            )
+
+            if (
+                local_batch.forward_mode.is_target_verify()
+                or local_batch.forward_mode.is_decode()
+            ):
+                num_tokens = local_batch.batch_size() * token_num_per_seq
+            else:
+                num_tokens = local_batch.extend_num_tokens
+            self.local_tbo_split_seq_index = compute_split_seq_index(
+                forward_mode=local_batch.forward_mode,
+                num_tokens=num_tokens,
+                extend_lens=local_batch.extend_lens,
+                token_num_per_seq=token_num_per_seq,
+            )
+            resolved_deepep_mode = deepep_mode.resolve(local_batch.is_extend_in_batch)
+            local_can_run_tbo = (self.local_tbo_split_seq_index is not None) and not (
+                (
+                    local_batch.forward_mode.is_extend()
+                    and not local_batch.forward_mode.is_target_verify()
+                )
+                and enable_deepep_moe
+                and (resolved_deepep_mode.is_low_latency())
+            )
+        else:
+            self.local_tbo_split_seq_index = 0
+            local_can_run_tbo = True
+
+        local_forward_mode = self._compute_local_forward_mode(local_batch)
+
+        return local_can_run_tbo, local_forward_mode
+
+    def compute_output(self, partial_global_info):
+        local_can_run_tbo_aggregated = min(partial_global_info[:, 0, 0].tolist())
+        forward_modes = partial_global_info[:, 0, 1].tolist()
+
+        global_forward_mode, forward_mode_agree = self._compute_global_forward_mode(
+            forward_modes
+        )
+
+        can_run_tbo = (
+            self.enable_two_batch_overlap
+            and local_can_run_tbo_aggregated
+            and forward_mode_agree
+        )
+
+        tbo_split_seq_index = self.local_tbo_split_seq_index if can_run_tbo else None
+        global_forward_mode = global_forward_mode if can_run_tbo else None
+        return tbo_split_seq_index, global_forward_mode
+
+    @staticmethod
+    def _compute_local_forward_mode(local_batch):
+        return (
+            local_batch.forward_mode if local_batch is not None else ForwardMode.IDLE
+        ).value
+
+    @staticmethod
+    def _compute_global_forward_mode(forward_modes):
+        forward_modes_excluding_idle = [
+            x for x in forward_modes if x != ForwardMode.IDLE.value
+        ]
+
+        if not forward_modes_excluding_idle:
+            return ForwardMode.IDLE, False
+
+        forward_mode_agree = TboDPAttentionPreparer._is_all_same(
+            forward_modes_excluding_idle
+        )
+        global_forward_mode = (
+            ForwardMode(forward_modes_excluding_idle[0]) if forward_mode_agree else None
+        )
+        return global_forward_mode, forward_mode_agree
+
+    @staticmethod
+    def _is_all_same(x):
+        return all(value == x[0] for value in x)
+
+
+class TboForwardBatchPreparer:
+    @classmethod
+    def prepare(cls, batch: ForwardBatch, is_draft_worker: bool = False):
+        if batch.tbo_split_seq_index is None or is_draft_worker:
+            return
+
+        tbo_children_num_token_non_padded = (
+            cls.compute_tbo_children_num_token_non_padded(batch)
+        )
+        cls.prepare_raw(
+            batch, tbo_children_num_token_non_padded=tbo_children_num_token_non_padded
+        )
+
+    @classmethod
+    def prepare_raw(
+        cls, batch: ForwardBatch, tbo_children_num_token_non_padded: torch.Tensor
+    ):
+        from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
+
+        tbo_split_token_index = cls._compute_split_token_index(batch)
+
+        is_enable_two_chunk = (
+            batch.forward_mode == ForwardMode.EXTEND
+            and _is_two_chunk_split_enabled(batch.extend_seq_lens_cpu)
+        )
+
+        if _tbo_debug:
+            logger.info(
+                f"TboForwardBatchPreparer.prepare "
+                f"is_enable_two_chunk={is_enable_two_chunk} "
+                f"tbo_split_seq_index={batch.tbo_split_seq_index} "
+                f"tbo_split_token_index={tbo_split_token_index} "
+                f"extend_seq_lens={batch.extend_seq_lens_cpu} "
+                f"bs={batch.batch_size} "
+                f"forward_mode={batch.forward_mode}"
+            )
+
+        assert isinstance(batch.attn_backend, TboAttnBackend)
+        attn_backend_child_a, attn_backend_child_b = batch.attn_backend.children
+
+        [out_num_token_non_padded_a, out_num_token_non_padded_b] = (
+            tbo_children_num_token_non_padded
+        )
+
+        child_a = cls.filter_batch(
+            batch,
+            start_token_index=0,
+            end_token_index=tbo_split_token_index,
+            start_seq_index=0,
+            end_seq_index=(
+                batch.tbo_split_seq_index + 1
+                if is_enable_two_chunk
+                else batch.tbo_split_seq_index
+            ),
+            output_attn_backend=attn_backend_child_a,
+            out_num_token_non_padded=out_num_token_non_padded_a,
+        )
+        child_b = cls.filter_batch(
+            batch,
+            start_token_index=tbo_split_token_index,
+            end_token_index=batch.input_ids.shape[0],
+            start_seq_index=batch.tbo_split_seq_index,
+            end_seq_index=batch.batch_size,
+            output_attn_backend=attn_backend_child_b,
+            out_num_token_non_padded=out_num_token_non_padded_b,
+        )
+
+        if is_enable_two_chunk:
+            cls.derive_fields_related_to_seq_len_for_two_chunk(
+                batch,
+                child_a=child_a,
+                child_b=child_b,
+                tbo_split_seq_index=batch.tbo_split_seq_index,
+            )
+
+        assert batch.tbo_children is None
+        batch.tbo_children = [child_a, child_b]
+
+    @classmethod
+    def derive_fields_related_to_seq_len_for_two_chunk(
+        cls,
+        batch: ForwardBatch,
+        *,
+        child_a: ForwardBatch,
+        child_b: ForwardBatch,
+        tbo_split_seq_index: int,
+    ):
+        extend_seq_lens_cpu = batch.extend_seq_lens_cpu
+        overall_seq_lens_sum = sum(extend_seq_lens_cpu)
+        half_seq_lens_sum = overall_seq_lens_sum // 2
+        left_last_seq_token_num = half_seq_lens_sum - sum(
+            extend_seq_lens_cpu[:tbo_split_seq_index]
+        )
+        right_first_seq_token_num = (
+            extend_seq_lens_cpu[tbo_split_seq_index] - left_last_seq_token_num
+        )
+
+        # making deepcopy to be extra safe
+        child_a.extend_seq_lens_cpu = copy.deepcopy(child_a.extend_seq_lens_cpu)
+        child_a.extend_seq_lens_cpu[-1] = left_last_seq_token_num
+        child_b.extend_seq_lens_cpu = copy.deepcopy(child_b.extend_seq_lens_cpu)
+        child_b.extend_seq_lens_cpu[0] = right_first_seq_token_num
+        for child in [child_a, child_b]:
+            _update_device_and_sum_field_from_cpu_field(
+                batch=child,
+                cpu_field="extend_seq_lens_cpu",
+                device_field="extend_seq_lens",
+                sum_field="extend_num_tokens",
+            )
+
+        assert (
+            child_a.extend_num_tokens == half_seq_lens_sum
+        ), f"{child_a.extend_num_tokens=}, {half_seq_lens_sum=}"
+
+        child_a.seq_lens_cpu = copy.deepcopy(child_a.seq_lens_cpu)
+        child_a.seq_lens_cpu[-1] = (
+            child_a.extend_seq_lens_cpu[-1] + child_a.extend_prefix_lens_cpu[-1]
+        )
+        _update_device_and_sum_field_from_cpu_field(
+            batch=child_a,
+            cpu_field="seq_lens_cpu",
+            device_field="seq_lens",
+            sum_field="seq_lens_sum",
+        )
+
+        child_b.extend_prefix_lens_cpu = copy.deepcopy(child_b.extend_prefix_lens_cpu)
+        child_b.extend_prefix_lens_cpu[0] += left_last_seq_token_num
+        _update_device_and_sum_field_from_cpu_field(
+            batch=child_b,
+            cpu_field="extend_prefix_lens_cpu",
+            device_field="extend_prefix_lens",
+            sum_field=None,
+        )
+        _, child_b.extend_start_loc = compute_position(
+            global_server_args_dict["attention_backend"],
+            child_b.extend_prefix_lens,
+            child_b.extend_seq_lens,
+            child_b.extend_num_tokens,
+        )
+
+    @classmethod
+    def filter_batch(
+        cls,
+        batch: ForwardBatch,
+        *,
+        start_token_index: int,
+        end_token_index: int,
+        start_seq_index: int,
+        end_seq_index: int,
+        output_attn_backend: AttentionBackend,
+        out_num_token_non_padded: torch.Tensor,
+    ):
+        assert (
+            end_token_index >= start_token_index
+        ), f"{end_token_index=}, {start_token_index=}, batch={batch}"
+        num_tokens = batch.input_ids.shape[0]
+        num_seqs = batch.batch_size
+
+        output_dict = dict()
+
+        for key in [
+            "input_ids",
+            "positions",
+            "out_cache_loc",
+        ]:
+            old_value = getattr(batch, key)
+            assert (
+                old_value.shape[0] == num_tokens
+            ), f"{key=} {old_value=} {num_tokens=} {batch=}"
+            output_dict[key] = old_value[start_token_index:end_token_index]
+
+        for key in [
+            "req_pool_indices",
+            "seq_lens",
+            "seq_lens_cpu",
+            "extend_seq_lens",
+            "extend_prefix_lens",
+            "extend_start_loc",
+            "extend_prefix_lens_cpu",
+            "extend_seq_lens_cpu",
+            "extend_logprob_start_lens_cpu",
+            "lora_ids",
+        ]:
+            old_value = getattr(batch, key)
+            if old_value is None:
+                continue
+            elif batch.forward_mode.is_target_verify() and (
+                key == "extend_seq_lens"
+                or key == "extend_prefix_lens"
+                or key == "extend_start_loc"
+                or key == "extend_prefix_lens_cpu"
+                or key == "extend_seq_lens_cpu"
+                or key == "extend_logprob_start_lens_cpu"
+            ):
+                output_dict[key] = None
+                continue
+            assert (
+                len(old_value) == num_seqs
+            ), f"{key=} {old_value=} {num_seqs=} {batch=}"
+            output_dict[key] = old_value[start_seq_index:end_seq_index]
+
+        spec_info = getattr(batch, "spec_info")
+        output_spec_info = split_spec_info(
+            spec_info=spec_info,
+            start_token_index=start_token_index,
+            end_token_index=end_token_index,
+            start_seq_index=start_seq_index,
+            end_seq_index=end_seq_index,
+        )
+        output_dict["spec_info"] = output_spec_info
+        for key in [
+            "forward_mode",
+            "is_extend_in_batch",
+            "return_logprob",
+            "req_to_token_pool",
+            "token_to_kv_pool",
+            "can_run_dp_cuda_graph",
+            "dp_padding_mode",
+            "global_forward_mode",
+            "spec_algorithm",
+            "capture_hidden_mode",
+            "padded_static_len",
+            "mrope_positions",  # only used by qwen2-vl, thus not care
+            "split_index",  # for split prefill
+            "orig_seq_lens",  # only used by qwen-1m, thus not care
+        ]:
+            output_dict[key] = getattr(batch, key)
+        if not batch.forward_mode.is_target_verify():
+            assert (
+                _compute_extend_num_tokens(batch.input_ids, batch.forward_mode)
+                == batch.extend_num_tokens
+            ), f"{batch=}"
+        extend_num_tokens = _compute_extend_num_tokens(
+            output_dict["input_ids"], output_dict["forward_mode"]
+        )
+
+        # TODO improve, e.g. unify w/ `init_raw`
+        if (
+            global_server_args_dict["moe_dense_tp_size"] == 1
+            and batch.global_dp_buffer_len is not None
+        ):
+            sum_len = end_token_index - start_token_index
+            global_dp_buffer_len = sum_len
+        else:
+            global_dp_buffer_len = None
+
+        output_dict.update(
+            dict(
+                batch_size=end_seq_index - start_seq_index,
+                seq_lens_sum=(
+                    output_dict["seq_lens_cpu"].sum()
+                    if "seq_lens_cpu" in output_dict
+                    else None
+                ),
+                extend_num_tokens=extend_num_tokens,
+                attn_backend=output_attn_backend,
+                num_token_non_padded=out_num_token_non_padded,
+                tbo_split_seq_index=None,
+                tbo_parent_token_range=(start_token_index, end_token_index),
+                tbo_children=None,
+                global_num_tokens_gpu=None,
+                global_num_tokens_cpu=None,
+                global_dp_buffer_len=global_dp_buffer_len,
+                global_num_tokens_for_logprob_gpu=None,
+                global_num_tokens_for_logprob_cpu=None,
+                sampling_info=None,
+                # For logits and logprobs post processing, thus we do not care
+                temp_scaled_logprobs=False,
+                temperature=None,
+                top_p_normalized_logprobs=False,
+                top_p=None,
+                mm_inputs=None,
+                top_logprobs_nums=None,
+                token_ids_logprobs=None,
+                next_token_logits_buffer=None,
+            )
+        )
+
+        errors = []
+        for field in dataclasses.fields(ForwardBatch):
+            if getattr(batch, field.name) is not None and field.name not in output_dict:
+                errors.append(
+                    f"Field {field.name} has value, but is not yet supported (value={getattr(batch, field.name)} batch={batch})"
+                )
+        if len(errors) > 0:
+            raise Exception(f"{len(errors)} errors happen:\n" + "\n\n".join(errors))
+
+        return ForwardBatch(**output_dict)
+
+    @classmethod
+    def compute_tbo_children_num_token_non_padded(cls, batch: ForwardBatch):
+        return cls.compute_tbo_children_num_token_non_padded_raw(
+            tbo_split_token_index=cls._compute_split_token_index(batch),
+            num_token_non_padded=len(batch.input_ids),
+        )
+
+    @classmethod
+    def compute_tbo_children_num_token_non_padded_raw(
+        cls, tbo_split_token_index: int, num_token_non_padded: int
+    ):
+        # TODO we may make padding on both sub-batches to make it slightly more balanced
+        value_a = min(tbo_split_token_index, num_token_non_padded)
+        value_b = max(0, num_token_non_padded - tbo_split_token_index)
+        return torch.tensor([value_a, value_b], dtype=torch.int32).to(
+            device=global_server_args_dict["device"], non_blocking=True
+        )
+
+    @classmethod
+    def _compute_split_token_index(cls, batch: ForwardBatch):
+        token_num_per_seq = get_token_num_per_seq(
+            forward_mode=batch.forward_mode, spec_info=batch.spec_info
+        )
+        return compute_split_token_index(
+            split_seq_index=batch.tbo_split_seq_index,
+            forward_mode=batch.forward_mode,
+            extend_seq_lens=batch.extend_seq_lens_cpu,
+            token_num_per_seq=token_num_per_seq,
+        )
+
+
+def _compute_extend_num_tokens(input_ids, forward_mode: ForwardMode):
+    if (
+        forward_mode.is_decode()
+        or forward_mode.is_idle()
+        or forward_mode.is_target_verify()
+    ):
+        return None
+    elif forward_mode.is_extend():
+        return input_ids.shape[0]
+    raise NotImplementedError
+
+
+# -------------------------------- Execution ---------------------------------------
+
+
+def model_forward_maybe_tbo(
+    layers,
+    enable_tbo: bool,
+    positions: torch.Tensor,
+    forward_batch: ForwardBatch,
+    hidden_states: torch.Tensor,
+    input_data_scatter_mode: ScatterMode,
+    residual: Optional[torch.Tensor],
+    zero_allocator: Optional[BumpAllocator] = None,
+):
+    inputs = dict(
+        positions=positions,
+        hidden_states=hidden_states,
+        forward_batch=forward_batch,
+        residual=residual,
+        zero_allocator=zero_allocator,
+    )
+    layer_input_scatter_mode = layers[0].layer_scatter_modes.layer_input_mode
+    operations_strategy = OperationsStrategy.init_new_tbo(
+        layers, forward_batch.global_forward_mode
+    )
+    if enable_tbo:
+        return _model_forward_tbo(
+            inputs=inputs,
+            operations_strategy=operations_strategy,
+            input_data_scatter_mode=input_data_scatter_mode,
+            layer_input_scatter_mode=layer_input_scatter_mode,
+        )
+    else:
+        return _model_forward_non_tbo(inputs, operations_strategy)
+
+
+def _model_forward_tbo(
+    inputs,
+    operations_strategy: OperationsStrategy,
+    input_data_scatter_mode: ScatterMode,
+    layer_input_scatter_mode: ScatterMode,
+):
+    inputs_arr = _model_forward_tbo_split_inputs(
+        **inputs,
+        input_data_scatter_mode=input_data_scatter_mode,
+        layer_input_scatter_mode=layer_input_scatter_mode,
+    )
+    del inputs
+
+    context = (
+        empty_context()
+        if _is_hip
+        else deep_gemm_wrapper.configure_deep_gemm_num_sms(
+            operations_strategy.deep_gemm_num_sms
+        )
+    )
+
+    with context:
+        outputs_arr = execute_overlapped_operations(
+            inputs_arr=inputs_arr,
+            operations_arr=[operations_strategy.operations] * 2,
+            delta_stages=[0, operations_strategy.tbo_delta_stages],
+        )
+
+    return _model_forward_tbo_merge_outputs(*outputs_arr)
+
+
+def _model_forward_non_tbo(inputs, operations_strategy: OperationsStrategy):
+    outputs = execute_operations(inputs, operations_strategy.operations)
+    return outputs["hidden_states"], outputs["residual"]
+
+
+def _model_forward_tbo_split_inputs(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    positions: torch.Tensor,
+    forward_batch: ForwardBatch,
+    zero_allocator: Optional[BumpAllocator],
+    input_data_scatter_mode: ScatterMode,
+    layer_input_scatter_mode: ScatterMode,
+) -> List[Dict]:
+    tbo_splitter_scatter_mode = ScatterMode.TP_ATTN_FULL
+    context = CommunicateContext.init_new()
+
+    hidden_states, residual = CommunicateSummableTensorPairFn.execute(
+        hidden_states_input_mode=input_data_scatter_mode,
+        residual_input_mode=input_data_scatter_mode,
+        output_mode=tbo_splitter_scatter_mode,
+        hidden_states=hidden_states,
+        residual=residual,
+        forward_batch=forward_batch,
+        context=context,
+    )
+
+    inputs_arr = _model_forward_tbo_split_inputs_raw(
+        hidden_states=hidden_states,
+        residual=residual,
+        positions=positions,
+        forward_batch=forward_batch,
+        zero_allocator=zero_allocator,
+    )
+
+    def _post_transform(hidden_states, residual, forward_batch, **kwargs):
+        hidden_states, residual = CommunicateSummableTensorPairFn.execute(
+            hidden_states_input_mode=tbo_splitter_scatter_mode,
+            residual_input_mode=tbo_splitter_scatter_mode,
+            output_mode=layer_input_scatter_mode,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            context=context,
+        )
+        return dict(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            **kwargs,
+        )
+
+    return [_post_transform(**inputs) for inputs in inputs_arr]
+
+
+def _model_forward_tbo_split_inputs_raw(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    positions: torch.Tensor,
+    forward_batch: ForwardBatch,
+    zero_allocator: Optional[BumpAllocator],
+) -> List[Dict]:
+    return [
+        dict(
+            **_model_forward_filter_inputs(
+                hidden_states=hidden_states,
+                residual=residual,
+                positions=positions,
+                output_forward_batch=output_forward_batch,
+                tbo_subbatch_index=tbo_subbatch_index,
+            ),
+            **(
+                dict(zero_allocator=zero_allocator)
+                if zero_allocator is not None
+                else {}
+            ),
+        )
+        for tbo_subbatch_index, output_forward_batch in enumerate(
+            forward_batch.tbo_children
+        )
+    ]
+
+
+def _model_forward_filter_inputs(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    positions: torch.Tensor,
+    output_forward_batch: ForwardBatch,
+    tbo_subbatch_index: int,
+) -> Dict:
+    token_slice = slice(*output_forward_batch.tbo_parent_token_range)
+    return dict(
+        hidden_states=hidden_states[token_slice],
+        residual=None if residual is None else residual[token_slice],
+        positions=positions[token_slice],
+        forward_batch=output_forward_batch,
+        tbo_subbatch_index=tbo_subbatch_index,
+    )
+
+
+def _model_forward_tbo_merge_outputs(output_a, output_b):
+    def _handle_key(name):
+        value_a = output_a[name]
+        value_b = output_b[name]
+        assert (value_a is None) == (value_b is None)
+        if value_a is None:
+            return None
+        return torch.concat([value_a, value_b], dim=0)
+
+    return _handle_key("hidden_states"), _handle_key("residual")
+
+
+# -------------------------------- Utilities and wrappers ---------------------------------------
+
+
+class MaybeTboDeepEPDispatcher:
+    def __init__(self, **kwargs):
+        num_inner_dispatchers = 2 if is_tbo_enabled() else 1
+        self._inners = [
+            DeepEPDispatcher(**kwargs) for _ in range(num_inner_dispatchers)
+        ]
+
+    def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
+        return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)
+
+    def dispatch(self, **kwargs) -> DispatchOutput:
+        return self._execute("dispatch", **kwargs)
+
+    def dispatch_a(self, **kwargs):
+        return self._execute("dispatch_a", **kwargs)
+
+    def dispatch_b(self, **kwargs):
+        return self._execute("dispatch_b", **kwargs)
+
+    def combine(self, **kwargs) -> torch.Tensor:
+        return self._execute("combine", **kwargs)
+
+    def combine_a(self, **kwargs):
+        return self._execute("combine_a", **kwargs)
+
+    def combine_b(self, **kwargs):
+        return self._execute("combine_b", **kwargs)
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
new file mode 100644
index 000000000..0fe5158ff
--- /dev/null
+++ b/python/sglang/srt/utils.py
@@ -0,0 +1,3055 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common utilities."""
+
+from __future__ import annotations
+
+import asyncio
+import builtins
+import ctypes
+import dataclasses
+import functools
+import importlib
+import io
+import ipaddress
+import itertools
+import json
+import logging
+import os
+import pickle
+import platform
+import random
+import re
+import resource
+import shutil
+import signal
+import socket
+import subprocess
+import sys
+import tempfile
+import threading
+import time
+import traceback
+import uuid
+import warnings
+from collections import OrderedDict, defaultdict
+from contextlib import contextmanager
+from dataclasses import dataclass
+from functools import lru_cache
+from importlib.metadata import PackageNotFoundError, version
+from importlib.util import find_spec
+from io import BytesIO
+from json import JSONDecodeError
+from multiprocessing.reduction import ForkingPickler
+from pathlib import Path
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    Optional,
+    Protocol,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+)
+
+import numpy as np
+import psutil
+import pybase64
+import requests
+import torch
+import torch.distributed
+import torch.distributed as dist
+import triton
+import zmq
+from fastapi.responses import ORJSONResponse
+from packaging import version as pkg_version
+from PIL import Image
+from starlette.routing import Mount
+from torch import nn
+from torch.func import functional_call
+from torch.library import Library
+from torch.profiler import ProfilerActivity, profile, record_function
+from torch.utils._contextlib import _DecoratorContextManager
+from triton.runtime.cache import FileCacheManager
+from typing_extensions import Literal
+
+from sglang.srt.metrics.func_timer import enable_func_timer
+
+logger = logging.getLogger(__name__)
+
+show_time_cost = False
+time_infos = {}
+
+
+HIP_FP8_E4M3_FNUZ_MAX = 224.0
+
+
+# https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
+def is_hip() -> bool:
+    return torch.version.hip is not None
+
+
+if is_hip():
+    FP8_E4M3_MAX = HIP_FP8_E4M3_FNUZ_MAX
+else:
+    FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+FP8_E4M3_MIN = -FP8_E4M3_MAX
+
+builtins.FP8_E4M3_MAX = FP8_E4M3_MAX
+builtins.FP8_E4M3_MIN = FP8_E4M3_MIN
+
+
+def is_cuda():
+    return torch.cuda.is_available() and torch.version.cuda
+
+
+def is_cuda_alike():
+    return is_cuda() or is_hip()
+
+
+def is_hpu() -> bool:
+    return hasattr(torch, "hpu") and torch.hpu.is_available()
+
+
+def is_xpu() -> bool:
+    return hasattr(torch, "xpu") and torch.xpu.is_available()
+
+
+def is_npu() -> bool:
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
+def is_host_cpu_x86() -> bool:
+    machine = platform.machine().lower()
+    return (
+        machine in ("x86_64", "amd64", "i386", "i686")
+        and hasattr(torch, "cpu")
+        and torch.cpu.is_available()
+    )
+
+
+def is_cpu() -> bool:
+    return os.getenv("SGLANG_USE_CPU_ENGINE", "0") == "1" and is_host_cpu_x86()
+
+
+def get_cuda_version():
+    if torch.version.cuda:
+        return tuple(map(int, torch.version.cuda.split(".")))
+    return (0, 0)
+
+
+def _check(cc_major):
+    if not is_cuda():
+        return False
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)
+
+
+def is_blackwell():
+    if not is_cuda():
+        return False
+    return torch.cuda.get_device_capability()[0] == 10
+
+
+@lru_cache(maxsize=1)
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
+@lru_cache(maxsize=1)
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
+
+
+_warned_bool_env_var_keys = set()
+
+
+def get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    value = value.lower()
+
+    truthy_values = ("true", "1")
+    falsy_values = ("false", "0")
+
+    if (value not in truthy_values) and (value not in falsy_values):
+        if value not in _warned_bool_env_var_keys:
+            logger.warning(
+                f"get_bool_env_var({name}) see non-understandable value={value} and treat as false"
+            )
+        _warned_bool_env_var_keys.add(value)
+
+    return value in truthy_values
+
+
+def get_int_env_var(name: str, default: int = 0) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default
+
+
+def support_triton(backend: str) -> bool:
+    return backend not in ["torch_native", "intel_amx", "ascend"]
+
+
+try:
+    import sgl_kernel
+
+    is_intel_amx_backend_available = hasattr(
+        torch.ops.sgl_kernel, "convert_weight_packed"
+    )
+except:
+    is_intel_amx_backend_available = False
+
+
+try:
+    # move torch._C._cpu._is_amx_tile_supported() from cpu_has_amx_support
+    # to support torch compile
+    is_amx_tile_supported = torch._C._cpu._is_amx_tile_supported()
+except:
+    is_amx_tile_supported = False
+
+
+def cpu_has_amx_support():
+    try:
+        return (
+            torch._C._cpu._is_amx_tile_supported()  # 旧版没有就会抛 AttributeError
+            and is_intel_amx_backend_available
+        )
+    except Exception:
+        return False
+
+
+
+def use_intel_amx_backend(layer):
+    return getattr(layer, "use_intel_amx_backend", False)
+
+
+def is_flashinfer_available():
+    """
+    Check whether flashinfer is available.
+    As of Oct. 6, 2024, it is only available on NVIDIA GPUs.
+    """
+    if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
+        return False
+    return importlib.util.find_spec("flashinfer") is not None and is_cuda()
+
+
+def random_uuid() -> str:
+    return str(uuid.uuid4().hex)
+
+
+_ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
+    "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
+)
+
+
+class DynamicGradMode(_DecoratorContextManager):
+    """
+    A combination of torch.no_grad and torch.inference_mode,
+    with their behavior controlled by an environment variable. Just refer to them.
+    """
+
+    @staticmethod
+    def set_inference_mode(mode: bool):
+        if isinstance(mode, bool):
+            global _ENABLE_TORCH_INFERENCE_MODE
+
+            _ENABLE_TORCH_INFERENCE_MODE = mode
+        else:
+            logger.warning("mode is not a boolean object")
+
+    def __init__(self, mode=True):
+        if not torch._jit_internal.is_scripting():
+            super().__init__()
+        if _ENABLE_TORCH_INFERENCE_MODE:
+            self.mode = mode
+        else:
+            self.prev = False
+
+    def __new__(cls, mode_or_orig_func=True if _ENABLE_TORCH_INFERENCE_MODE else None):
+        if mode_or_orig_func is None or isinstance(mode_or_orig_func, bool):
+            return super().__new__(cls)
+        return cls()(mode_or_orig_func)
+
+    def __enter__(self) -> None:
+        if _ENABLE_TORCH_INFERENCE_MODE:
+            self._inference_mode_context = torch._C._InferenceMode(self.mode)
+            self._inference_mode_context.__enter__()
+        else:
+            self.prev = torch.is_grad_enabled()
+            torch.set_grad_enabled(False)
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        if _ENABLE_TORCH_INFERENCE_MODE:
+            self._inference_mode_context.__exit__(exc_type, exc_value, traceback)
+        else:
+            torch.set_grad_enabled(self.prev)
+
+    def clone(self) -> "DynamicGradMode":
+        r"""
+        Create a copy of this class
+        """
+        if _ENABLE_TORCH_INFERENCE_MODE:
+            return self.__class__(self.mode)
+        else:
+            return self.__class__()
+
+
+def enable_show_time_cost():
+    global show_time_cost
+    show_time_cost = True
+
+
+class TimeInfo:
+    def __init__(self, name, interval=0.1, color=0, indent=0):
+        self.name = name
+        self.interval = interval
+        self.color = color
+        self.indent = indent
+
+        self.acc_time = 0
+        self.last_acc_time = 0
+
+    def check(self):
+        if self.acc_time - self.last_acc_time > self.interval:
+            self.last_acc_time = self.acc_time
+            return True
+        return False
+
+    def pretty_print(self):
+        print(f"\x1b[{self.color}m", end="")
+        print("-" * self.indent * 2, end="")
+        print(f"{self.name}: {self.acc_time:.3f}s\x1b[0m")
+
+
+def mark_start(name, interval=0.1, color=0, indent=0):
+    global time_infos, show_time_cost
+    if not show_time_cost:
+        return
+    torch.cuda.synchronize()
+    if time_infos.get(name, None) is None:
+        time_infos[name] = TimeInfo(name, interval, color, indent)
+    time_infos[name].acc_time -= time.perf_counter()
+
+
+def mark_end(name):
+    global time_infos, show_time_cost
+    if not show_time_cost:
+        return
+    torch.cuda.synchronize()
+    time_infos[name].acc_time += time.perf_counter()
+    if time_infos[name].check():
+        time_infos[name].pretty_print()
+
+
+def calculate_time(show=False, min_cost_ms=0.0):
+    def wrapper(func):
+        def inner_func(*args, **kwargs):
+            torch.cuda.synchronize()
+            if show:
+                start_time = time.perf_counter()
+            result = func(*args, **kwargs)
+            torch.cuda.synchronize()
+            if show:
+                cost_time = (time.perf_counter() - start_time) * 1000
+                if cost_time > min_cost_ms:
+                    print(f"Function {func.__name__} took {cost_time} ms to run.")
+            return result
+
+        return inner_func
+
+    return wrapper
+
+
+def get_available_gpu_memory(
+    device, gpu_id, distributed=False, empty_cache=True, cpu_group=None
+):
+    """
+    Get available memory for cuda:gpu_id device.
+    When distributed is True, the available memory is the minimum available memory of all GPUs.
+    """
+    if device == "cuda":
+        num_gpus = torch.cuda.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.cuda.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
+                "which may cause useless memory allocation for torch CUDA context.",
+            )
+
+        if empty_cache:
+            torch.cuda.empty_cache()
+        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
+
+    elif device == "xpu":
+        num_gpus = torch.xpu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.xpu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
+                "which may cause useless memory allocation for torch XPU context.",
+            )
+
+        if empty_cache:
+            torch.xpu.empty_cache()
+        used_memory = torch.xpu.memory_allocated()
+        total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
+        free_gpu_memory = total_gpu_memory - used_memory
+
+    elif device == "hpu":
+        num_gpus = torch.hpu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.hpu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.hpu.current_device()}, ",
+                "which may cause useless memory allocation for torch HPU context.",
+            )
+
+        free_gpu_memory, total_gpu_memory = torch.hpu.mem_get_info()
+
+    elif device == "cpu":
+        # TODO: rename the variables in the current function to be not GPU specific
+        total_free_memory = psutil.virtual_memory().available
+        n_numa_node: int = len(get_cpu_ids_by_node())
+        free_gpu_memory = round(total_free_memory / n_numa_node, 3)
+    elif device == "npu":
+        num_gpus = torch.npu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.npu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.npu.current_device()}, ",
+                "which may cause useless memory allocation for torch NPU context.",
+            )
+        free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info()
+
+    if distributed:
+        tensor = torch.tensor(free_gpu_memory, dtype=torch.float32)
+        torch.distributed.all_reduce(
+            tensor, op=torch.distributed.ReduceOp.MIN, group=cpu_group
+        )
+        free_gpu_memory = tensor.item()
+
+    return free_gpu_memory / (1 << 30)
+
+
+def is_pin_memory_available() -> bool:
+    return torch.cuda.is_available()
+
+
+class LayerFn(Protocol):
+
+    def __call__(self, layer_id: int, prefix: str) -> torch.nn.Module: ...
+
+
+def make_layers(
+    num_hidden_layers: int,
+    layer_fn: LayerFn,
+    pp_rank: Optional[int] = None,
+    pp_size: Optional[int] = None,
+    prefix: str = "",
+    return_tuple: bool = False,
+    offloader_kwargs: Dict[str, Any] = {},
+) -> Tuple[int, int, torch.nn.ModuleList]:
+    """Make a list of layers with the given layer function"""
+    # circula imports
+    from sglang.srt.distributed import get_pp_indices
+    from sglang.srt.layers.utils import PPMissingLayer
+    from sglang.srt.offloader import get_offloader
+
+    assert not pp_size or num_hidden_layers >= pp_size
+    start_layer, end_layer = (
+        get_pp_indices(
+            num_hidden_layers,
+            pp_rank,
+            pp_size,
+        )
+        if pp_rank is not None and pp_size is not None
+        else (0, num_hidden_layers)
+    )
+    modules = torch.nn.ModuleList(
+        [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+        + get_offloader().wrap_modules(
+            (
+                layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
+                for idx in range(start_layer, end_layer)
+            ),
+            **offloader_kwargs,
+        )
+        + [
+            PPMissingLayer(return_tuple=return_tuple)
+            for _ in range(end_layer, num_hidden_layers)
+        ]
+    )
+    if pp_rank is None or pp_size is None:
+        return modules
+    return modules, start_layer, end_layer
+
+
+def set_random_seed(seed: int) -> None:
+    """Set the random seed for all libraries."""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+
+def find_process_using_port(port: int) -> Optional[psutil.Process]:
+    for conn in psutil.net_connections(kind="inet"):
+        if conn.laddr.port == port:
+            try:
+                return psutil.Process(conn.pid)
+            except psutil.NoSuchProcess:
+                # It could happen by race condition (the proc dies when psutil.Process is called).
+                pass
+
+    return None
+
+
+def wait_port_available(
+    port: int, port_name: str, timeout_s: int = 30, raise_exception: bool = True
+) -> bool:
+    for i in range(timeout_s):
+        if is_port_available(port):
+            return True
+
+        if i > 10 and i % 5 == 0:
+            process = find_process_using_port(port)
+            if process is None:
+                logger.warning(
+                    f"The port {port} is in use, but we could not find the process that uses it."
+                )
+
+            pid = process.pid
+            error_message = f"{port_name} is used by a process already. {process.name()=}' {process.cmdline()=} {process.status()=} {pid=}"
+            logger.info(
+                f"port {port} is in use. Waiting for {i} seconds for {port_name} to be available. {error_message}"
+            )
+        time.sleep(0.1)
+
+    if raise_exception:
+        raise ValueError(
+            f"{port_name} at {port} is not available in {timeout_s} seconds. {error_message}"
+        )
+    return False
+
+
+def is_port_available(port):
+    """Return whether a port is available."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        try:
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            s.bind(("", port))
+            s.listen(1)
+            return True
+        except socket.error:
+            return False
+        except OverflowError:
+            return False
+
+
+def get_free_port():
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def decode_video_base64(video_base64):
+    from PIL import Image
+
+    # Decode the base64 string
+    video_bytes = pybase64.b64decode(video_base64, validate=True)
+
+    # Placeholder for the start indices of each PNG image
+    img_starts = []
+
+    frame_format = "PNG"  # str(os.getenv('FRAME_FORMAT', "JPEG"))
+
+    assert frame_format in [
+        "PNG",
+        "JPEG",
+    ], "FRAME_FORMAT must be either 'PNG' or 'JPEG'"
+
+    if frame_format == "PNG":
+        # Find each PNG start signature to isolate images
+        i = 0
+        while i < len(video_bytes) - 7:  # Adjusted for the length of the PNG signature
+            # Check if we found the start of a PNG file
+            if (
+                video_bytes[i] == 0x89
+                and video_bytes[i + 1] == 0x50
+                and video_bytes[i + 2] == 0x4E
+                and video_bytes[i + 3] == 0x47
+                and video_bytes[i + 4] == 0x0D
+                and video_bytes[i + 5] == 0x0A
+                and video_bytes[i + 6] == 0x1A
+                and video_bytes[i + 7] == 0x0A
+            ):
+                img_starts.append(i)
+                i += 8  # Skip the PNG signature
+            else:
+                i += 1
+    else:
+        # Find each JPEG start (0xFFD8) to isolate images
+        i = 0
+        while (
+            i < len(video_bytes) - 1
+        ):  # Adjusted for the length of the JPEG SOI signature
+            # Check if we found the start of a JPEG file
+            if video_bytes[i] == 0xFF and video_bytes[i + 1] == 0xD8:
+                img_starts.append(i)
+                # Move to the next byte to continue searching for the next image start
+                i += 2
+            else:
+                i += 1
+
+    frames = []
+    for start_idx in img_starts:
+        # Assuming each image is back-to-back, the end of one image is the start of another
+        # The last image goes until the end of the byte string
+        end_idx = (
+            img_starts[img_starts.index(start_idx) + 1]
+            if img_starts.index(start_idx) + 1 < len(img_starts)
+            else len(video_bytes)
+        )
+        img_bytes = video_bytes[start_idx:end_idx]
+
+        # Convert bytes to a PIL Image
+        img = Image.open(BytesIO(img_bytes))
+
+        # Convert PIL Image to a NumPy array
+        frame = np.array(img)
+
+        # Append the frame to the list of frames
+        frames.append(frame)
+
+    # Ensure there's at least one frame to avoid errors with np.stack
+    if frames:
+        return np.stack(frames, axis=0), img.size
+    else:
+        return np.array([]), (
+            0,
+            0,
+        )  # Return an empty array and size tuple if no frames were found
+
+
+def load_audio(
+    audio_file: str, sr: Optional[int] = None, mono: bool = True
+) -> np.ndarray:
+    # Use soundfile here, since librosa use it under the hood,
+    # and librosa will not support audio loading in the future
+    import soundfile as sf
+    from scipy.signal import resample
+
+    if sr is None:
+        sr = 16000
+
+    # Load audio data
+    if isinstance(audio_file, bytes):
+        audio, original_sr = sf.read(BytesIO(audio_file))
+    elif audio_file.startswith("data:"):
+        audio_file = audio_file.split(",")[1]
+        audio, original_sr = sf.read(
+            BytesIO(pybase64.b64decode(audio_file, validate=True))
+        )
+    elif audio_file.startswith("http://") or audio_file.startswith("https://"):
+        timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
+        response = requests.get(audio_file, stream=True, timeout=timeout)
+        audio_file = BytesIO(response.content)
+        response.close()
+        audio, original_sr = sf.read(audio_file)
+    elif isinstance(audio_file, str):
+        audio, original_sr = sf.read(audio_file)
+    else:
+        raise ValueError(f"Invalid audio format: {audio_file}")
+
+    # Resample audio if the original sample rate is different from the desired sample rate
+    if original_sr != sr:
+        num_samples = int(len(audio) * float(sr) / original_sr)
+        audio = resample(audio, num_samples)
+
+    # Convert to mono if requested and audio is stereo
+    if mono and len(audio.shape) > 1:
+        audio = np.mean(audio, axis=1)
+
+    return audio
+
+
+@dataclass
+class ImageData:
+    url: str
+    detail: Optional[Literal["auto", "low", "high"]] = "auto"
+
+
+def load_image(
+    image_file: Union[Image.Image, str, ImageData, bytes],
+) -> tuple[Image.Image, tuple[int, int]]:
+    if isinstance(image_file, ImageData):
+        image_file = image_file.url
+
+    image = image_size = None
+    if isinstance(image_file, Image.Image):
+        image = image_file
+        image_size = (image.width, image.height)
+    elif isinstance(image_file, bytes):
+        image = Image.open(BytesIO(image_file))
+    elif image_file.startswith("http://") or image_file.startswith("https://"):
+        timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
+        response = requests.get(image_file, stream=True, timeout=timeout)
+        try:
+            response.raise_for_status()
+            image = Image.open(response.raw)
+            image.load()  # Force loading to avoid issues after closing the stream
+        finally:
+            response.close()
+    elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
+        image = Image.open(image_file)
+    elif image_file.startswith("data:"):
+        image_file = image_file.split(",")[1]
+        image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
+    elif isinstance(image_file, str):
+        image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
+    else:
+        raise ValueError(f"Invalid image: {image_file}")
+
+    return image, image_size
+
+
+def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
+    # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
+    from decord import VideoReader, cpu, gpu
+
+    try:
+        from decord.bridge import decord_bridge
+
+        ctx = gpu(0)
+        _ = decord_bridge.get_ctx_device(ctx)
+    except Exception:
+        ctx = cpu(0)
+
+    tmp_file = None
+    vr = None
+    try:
+        if isinstance(video_file, bytes):
+            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp_file.write(video_file)
+            tmp_file.close()
+            vr = VideoReader(tmp_file.name, ctx=ctx)
+        elif isinstance(video_file, str):
+            if video_file.startswith(("http://", "https://")):
+                timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
+                response = requests.get(video_file, stream=True, timeout=timeout)
+                response.raise_for_status()
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                for chunk in response.iter_content(chunk_size=8192):
+                    tmp_file.write(chunk)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif video_file.startswith("data:"):
+                _, encoded = video_file.split(",", 1)
+                video_bytes = pybase64.b64decode(encoded)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif os.path.isfile(video_file):
+                vr = VideoReader(video_file, ctx=ctx)
+            else:
+                video_bytes = pybase64.b64decode(video_file)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+        else:
+            raise ValueError(f"Unsupported video input type: {type(video_file)}")
+
+        return vr
+
+    finally:
+        if tmp_file and os.path.exists(tmp_file.name):
+            os.unlink(tmp_file.name)
+
+
+def suppress_other_loggers():
+    warnings.filterwarnings(
+        "ignore", category=UserWarning, message="The given NumPy array is not writable"
+    )
+
+    try:
+        from vllm.logger import logger as vllm_default_logger
+    except ImportError:
+        return
+
+    vllm_default_logger.setLevel(logging.WARN)
+    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
+        logging.WARN
+    )
+    logging.getLogger("vllm.distributed.device_communicators.shm_broadcast").setLevel(
+        logging.WARN
+    )
+    logging.getLogger("vllm.config").setLevel(logging.ERROR)
+
+
+def assert_pkg_version(pkg: str, min_version: str, message: str):
+    try:
+        installed_version = version(pkg)
+        if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
+            raise Exception(
+                f"{pkg} is installed with version {installed_version}, which "
+                f"is less than the minimum required version {min_version}. " + message
+            )
+    except PackageNotFoundError:
+        raise Exception(
+            f"{pkg} with minimum required version {min_version} is not installed. "
+            + message
+        )
+
+
+def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
+    """Kill the process and all its child processes."""
+    # Remove sigchld handler to avoid spammy logs.
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+    if parent_pid is None:
+        parent_pid = os.getpid()
+        include_parent = False
+
+    try:
+        itself = psutil.Process(parent_pid)
+    except psutil.NoSuchProcess:
+        return
+
+    children = itself.children(recursive=True)
+    for child in children:
+        if child.pid == skip_pid:
+            continue
+        try:
+            child.kill()
+        except psutil.NoSuchProcess:
+            pass
+
+    if include_parent:
+        try:
+            if parent_pid == os.getpid():
+                itself.kill()
+                sys.exit(0)
+
+            itself.kill()
+
+            # Sometime processes cannot be killed with SIGKILL (e.g, PID=1 launched by kubernetes),
+            # so we send an additional signal to kill them.
+            itself.send_signal(signal.SIGQUIT)
+        except psutil.NoSuchProcess:
+            pass
+
+
+def monkey_patch_p2p_access_check():
+    """
+    Monkey patch the slow p2p access check.
+    NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
+    """
+
+    import sglang.srt.distributed.device_communicators.custom_all_reduce_utils as tgt
+
+    setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
+
+    # Suppress the warnings from this delete function when using sglang.bench_one_batch
+    from sglang.srt.distributed.device_communicators.custom_all_reduce import (
+        CustomAllreduce,
+    )
+
+    setattr(CustomAllreduce, "__del__", lambda *args, **kwargs: None)
+
+
+def monkey_patch_vllm_gguf_config():
+    try:
+        from vllm.model_executor.layers.quantization.gguf import (
+            GGUFConfig,
+            GGUFEmbeddingMethod,
+            GGUFLinearMethod,
+        )
+    except ImportError:
+        return
+
+    from sglang.srt.layers.linear import LinearBase
+    from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+
+    def get_quant_method_with_embedding_replaced(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        if isinstance(layer, LinearBase):
+            return GGUFLinearMethod(self)
+        elif isinstance(layer, VocabParallelEmbedding):
+            # patch to own VocabParallelEmbedding
+            return GGUFEmbeddingMethod(self)
+        return None
+
+    setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
+
+
+def set_ulimit(target_soft_limit=65535):
+    # number of open files
+    resource_type = resource.RLIMIT_NOFILE
+    current_soft, current_hard = resource.getrlimit(resource_type)
+
+    if current_soft < target_soft_limit:
+        try:
+            resource.setrlimit(resource_type, (target_soft_limit, current_hard))
+        except ValueError as e:
+            logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")
+
+    # stack size
+    resource_type = resource.RLIMIT_STACK
+    current_soft, current_hard = resource.getrlimit(resource_type)
+    target_soft_limit_stack_size = 1024 * target_soft_limit
+    if current_soft < target_soft_limit_stack_size:
+        try:
+            resource.setrlimit(
+                resource_type, (target_soft_limit_stack_size, current_hard)
+            )
+        except ValueError as e:
+            logger.warning(f"Fail to set RLIMIT_STACK: {e}")
+
+
+def add_api_key_middleware(app, api_key: str):
+    @app.middleware("http")
+    async def authentication(request, call_next):
+        if request.method == "OPTIONS":
+            return await call_next(request)
+        if request.url.path.startswith("/health"):
+            return await call_next(request)
+        if request.url.path.startswith("/metrics"):
+            return await call_next(request)
+        if request.headers.get("Authorization") != "Bearer " + api_key:
+            return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
+        return await call_next(request)
+
+
+def prepare_model_and_tokenizer(model_path: str, tokenizer_path: str):
+    if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
+        if not os.path.exists(model_path):
+            from modelscope import snapshot_download
+
+            model_path = snapshot_download(model_path)
+            tokenizer_path = snapshot_download(
+                tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
+            )
+    return model_path, tokenizer_path
+
+
+def configure_logger(server_args, prefix: str = ""):
+    if SGLANG_LOGGING_CONFIG_PATH := os.getenv("SGLANG_LOGGING_CONFIG_PATH"):
+        if not os.path.exists(SGLANG_LOGGING_CONFIG_PATH):
+            raise Exception(
+                "Setting SGLANG_LOGGING_CONFIG_PATH from env with "
+                f"{SGLANG_LOGGING_CONFIG_PATH} but it does not exist!"
+            )
+        with open(SGLANG_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
+            custom_config = json.loads(file.read())
+        logging.config.dictConfig(custom_config)
+        return
+    format = f"[%(asctime)s{prefix}] %(message)s"
+    # format = f"[%(asctime)s.%(msecs)03d{prefix}] %(message)s"
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format=format,
+        datefmt="%Y-%m-%d %H:%M:%S",
+        force=True,
+    )
+
+
+# source: https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/vllm/lora/utils.py#L9
+def replace_submodule(
+    model: nn.Module, module_name: str, new_module: nn.Module
+) -> nn.Module:
+    """Replace a submodule in a model with a new module."""
+    parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
+    target_name = module_name.split(".")[-1]
+    setattr(parent, target_name, new_module)
+    return new_module
+
+
+def set_weight_attrs(
+    weight: torch.Tensor,
+    weight_attrs: Optional[Dict[str, Any]],
+):
+    """Set attributes on a weight tensor.
+
+    This method is used to set attributes on a weight tensor. This method
+    will not overwrite existing attributes.
+
+    Args:
+        weight: The weight tensor.
+        weight_attrs: A dictionary of attributes to set on the weight tensor.
+    """
+    if weight_attrs is None:
+        return
+    for key, value in weight_attrs.items():
+        assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
+        setattr(weight, key, value)
+
+
+def broadcast_pyobj(
+    data: List[Any],
+    rank: int,
+    dist_group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    force_cpu_device: bool = True,
+):
+    """Broadcast inputs from src rank to all other ranks with torch.dist backend.
+    The `rank` here refer to the source rank on global process group (regardless
+    of dist_group argument).
+    """
+    device = torch.device(
+        "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
+    )
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor([0], dtype=torch.long, device=device)
+            dist.broadcast(tensor_size, src=src, group=dist_group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            ).to(device)
+            tensor_size = torch.tensor([size], dtype=torch.long, device=device)
+
+            dist.broadcast(tensor_size, src=src, group=dist_group)
+            dist.broadcast(tensor_data, src=src, group=dist_group)
+        return data
+    else:
+        tensor_size = torch.tensor([0], dtype=torch.long, device=device)
+        dist.broadcast(tensor_size, src=src, group=dist_group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(size, dtype=torch.uint8, device=device)
+        dist.broadcast(tensor_data, src=src, group=dist_group)
+
+        serialized_data = bytes(tensor_data.cpu().numpy())
+        data = pickle.loads(serialized_data)
+        return data
+
+
+def point_to_point_pyobj(
+    data: List[Any],
+    rank: int,
+    group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
+    dst: int = 1,
+):
+    """Send data from src to dst in group using DeviceToDevice communication."""
+
+    if rank == src:
+        if len(data) == 0:
+            tensor_size = torch.tensor(
+                [0], dtype=torch.long, device=torch.cuda.current_device()
+            )
+            dist.send(tensor_size, dst=dst, group=group)
+        else:
+            serialized_data = pickle.dumps(data)
+            size = len(serialized_data)
+            tensor_data = torch.ByteTensor(
+                np.frombuffer(serialized_data, dtype=np.uint8)
+            ).cuda(
+                device=torch.cuda.current_device()
+            )  # Move to GPU
+            tensor_size = torch.tensor(
+                [size], dtype=torch.long, device=torch.cuda.current_device()
+            )
+
+            dist.send(tensor_size, dst=dst, group=group)
+            dist.send(tensor_data, dst=dst, group=group)
+        return data
+
+    elif rank == dst:
+        tensor_size = torch.tensor(
+            [0], dtype=torch.long, device=torch.cuda.current_device()
+        )
+        dist.recv(tensor_size, src=src, group=group)
+        size = tensor_size.item()
+
+        if size == 0:
+            return []
+
+        tensor_data = torch.empty(
+            size, dtype=torch.uint8, device=torch.cuda.current_device()
+        )
+        dist.recv(tensor_data, src=src, group=group)
+
+        serialized_data = bytes(
+            tensor_data.cpu().numpy()
+        )  # Move back to host for deserialization
+        data = pickle.loads(serialized_data)
+        return data
+
+    # Other ranks in pp_group do nothing
+    return []
+
+
+step_counter = 0
+
+
+def pytorch_profile(name, func, *args, data_size=-1):
+    """
+    Args:
+        name (string): the name of recorded function.
+        func: the function to be profiled.
+        args: the arguments of the profiled function.
+        data_size (int): some measurement of the computation complexity.
+            Usually, it could be the batch size.
+    """
+    global step_counter
+    os.makedirs("trace", exist_ok=True)
+    with profile(
+        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
+        # on_trace_ready=tensorboard_trace_handler('./log_dir'),
+        record_shapes=True,
+        profile_memory=True,
+        with_stack=True,
+    ) as prof:
+        with record_function(name):
+            with open(f"trace/size_{step_counter}.json", "w") as f:
+                json.dump({"size": data_size}, f)
+            result = func(*args)
+    prof.export_chrome_trace(f"trace/{name}_{step_counter}.json")
+    step_counter += 1
+    return result
+
+
+def get_zmq_socket(
+    context: zmq.Context, socket_type: zmq.SocketType, endpoint: str, bind: bool
+):
+    mem = psutil.virtual_memory()
+    total_mem = mem.total / 1024**3
+    available_mem = mem.available / 1024**3
+    if total_mem > 32 and available_mem > 16:
+        buf_size = int(0.5 * 1024**3)
+    else:
+        buf_size = -1
+
+    socket = context.socket(socket_type)
+    if endpoint.find("[") != -1:
+        socket.setsockopt(zmq.IPV6, 1)
+
+    def set_send_opt():
+        socket.setsockopt(zmq.SNDHWM, 0)
+        socket.setsockopt(zmq.SNDBUF, buf_size)
+
+    def set_recv_opt():
+        socket.setsockopt(zmq.RCVHWM, 0)
+        socket.setsockopt(zmq.RCVBUF, buf_size)
+
+    if socket_type == zmq.PUSH:
+        set_send_opt()
+    elif socket_type == zmq.PULL:
+        set_recv_opt()
+    elif socket_type == zmq.DEALER:
+        set_send_opt()
+        set_recv_opt()
+    else:
+        raise ValueError(f"Unsupported socket type: {socket_type}")
+
+    if bind:
+        socket.bind(endpoint)
+    else:
+        socket.connect(endpoint)
+
+    return socket
+
+
+def dump_to_file(dirpath, name, value):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() != 0:
+        return
+
+    os.makedirs(dirpath, exist_ok=True)
+    if value.dtype is torch.bfloat16:
+        value = value.float()
+    value = value.cpu().numpy()
+    output_filename = os.path.join(dirpath, f"pytorch_dump_{name}.npy")
+    logger.info(f"Dump a tensor to {output_filename}. Shape = {value.shape}")
+    np.save(output_filename, value)
+
+
+def is_triton_3():
+    return triton.__version__.startswith("3.")
+
+
+def maybe_torch_compile(*args, **kwargs):
+    """
+    torch.compile does not work for triton 2.2.0, which is needed in xlm1's jax.
+    Therefore, we disable it here.
+    """
+
+    def decorator(func):
+        if is_triton_3():
+            return torch.compile(*args, **kwargs)(func)
+        return func
+
+    return decorator
+
+
+def delete_directory(dirpath):
+    try:
+        # This will remove the directory and all its contents
+        shutil.rmtree(dirpath)
+    except OSError as e:
+        print(f"Warning: {dirpath} : {e.strerror}")
+
+
+# Temporary directory for prometheus multiprocess mode
+# Cleaned up automatically when this object is garbage collected
+prometheus_multiproc_dir: tempfile.TemporaryDirectory
+
+
+def set_prometheus_multiproc_dir():
+    # Set prometheus multiprocess directory
+    # sglang uses prometheus multiprocess mode
+    # we need to set this before importing prometheus_client
+    # https://prometheus.github.io/client_python/multiprocess/
+    global prometheus_multiproc_dir
+
+    if "PROMETHEUS_MULTIPROC_DIR" in os.environ:
+        logger.debug("User set PROMETHEUS_MULTIPROC_DIR detected.")
+        prometheus_multiproc_dir = tempfile.TemporaryDirectory(
+            dir=os.environ["PROMETHEUS_MULTIPROC_DIR"]
+        )
+    else:
+        prometheus_multiproc_dir = tempfile.TemporaryDirectory()
+        os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
+    logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")
+
+
+def add_prometheus_middleware(app):
+    # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
+    from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess
+
+    registry = CollectorRegistry()
+    multiprocess.MultiProcessCollector(registry)
+    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))
+
+    # Workaround for 307 Redirect for /metrics
+    metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
+    app.routes.append(metrics_route)
+
+
+def bind_port(port):
+    """Bind to a specific port, assuming it's available."""
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # Allows address reuse
+    sock.bind(("", port))
+    sock.listen(1)
+    return sock
+
+
+def get_amdgpu_memory_capacity():
+    try:
+        # Run rocm-smi and capture the output
+        result = subprocess.run(
+            [
+                "rocminfo | grep 'gfx' -A 100 | grep 'Pool 1' -A 5 | grep 'Size:' | awk '{print $2}'"
+            ],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            shell=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"rocm-smi error: {result.stderr.strip()}")
+
+        # Parse the output to extract memory values in MiB
+        memory_values = [
+            float(mem.split("(")[0].strip()) / 1024
+            for mem in result.stdout.strip().split("\n")
+        ]
+
+        if not memory_values:
+            raise ValueError("No GPU memory values found.")
+
+        # Return the minimum memory value
+        return min(memory_values)
+
+    except FileNotFoundError:
+        raise RuntimeError(
+            "rocm-smi not found. Ensure AMD ROCm drivers are installed and accessible."
+        )
+
+
+def get_device_sm():
+    if torch.cuda.is_available():
+        major, minor = torch.cuda.get_device_capability()
+        return major * 10 + minor
+    return 0
+
+
+def get_nvgpu_memory_capacity():
+    try:
+        # Run nvidia-smi and capture the output
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"nvidia-smi error: {result.stderr.strip()}")
+
+        # Parse the output to extract memory values
+        memory_values = [
+            float(mem)
+            for mem in result.stdout.strip().split("\n")
+            if re.match(r"^\d+(\.\d+)?$", mem.strip())
+        ]
+
+        if not memory_values:
+            # Fallback to torch.cuda.mem_get_info() when failed to get memory capacity from nvidia-smi,
+            # typically in NVIDIA MIG mode.
+            if torch.cuda.is_available():
+                logger.warning(
+                    "Failed to get GPU memory capacity from nvidia-smi, falling back to torch.cuda.mem_get_info()."
+                )
+                return torch.cuda.mem_get_info()[1] // 1024 // 1024  # unit: MB
+            raise ValueError("No GPU memory values found.")
+
+        # Return the minimum memory value
+        return min(memory_values)
+
+    except FileNotFoundError:
+        raise RuntimeError(
+            "nvidia-smi not found. Ensure NVIDIA drivers are installed and accessible."
+        )
+
+
+def get_hpu_memory_capacity():
+    try:
+        # Run hl-smi and capture the output
+        result = subprocess.run(
+            ["hl-smi --query | grep 'Total'"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            shell=True,
+            text=True,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"hl-smi error: {result.stderr.strip()}")
+
+        # Parse the output to extract memory values in MiB
+        memory_values = [
+            float(mem.split(" ")[-2]) for mem in result.stdout.strip().split("\n")
+        ]
+
+        if not memory_values:
+            raise ValueError("No GPU memory values found.")
+
+        # Return the minimum memory value
+        return min(memory_values)
+
+    except FileNotFoundError:
+        raise RuntimeError(
+            "hl-smi not found. Ensure Habana drivers are installed and accessible."
+        )
+
+
+def get_npu_memory_capacity():
+    try:
+        import torch_npu
+
+        return torch.npu.mem_get_info()[1] // 1024 // 1024  # unit: MB
+    except ImportError as e:
+        raise ImportError("torch_npu is required when run on npu device.")
+
+
+def get_device_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    elif device == "npu":
+        gpu_mem = get_npu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
+# Copy from pytorch and OpenRLHF to allow creating multiple main groups.
+# https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
+# https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py
+def init_custom_process_group(
+    backend=None,
+    init_method=None,
+    timeout=None,
+    world_size=-1,
+    rank=-1,
+    store=None,
+    group_name=None,
+    pg_options=None,
+):
+    from torch.distributed.distributed_c10d import (
+        Backend,
+        PrefixStore,
+        _new_process_group_helper,
+        _world,
+        default_pg_timeout,
+        rendezvous,
+    )
+
+    assert (store is None) or (
+        init_method is None
+    ), "Cannot specify both init_method and store."
+
+    if store is not None:
+        assert world_size > 0, "world_size must be positive if using store"
+        assert rank >= 0, "rank must be non-negative if using store"
+    elif init_method is None:
+        init_method = "env://"
+
+    if backend:
+        backend = Backend(backend)
+    else:
+        backend = Backend("undefined")
+
+    if timeout is None:
+        timeout = default_pg_timeout
+
+    # backward compatible API
+    if store is None:
+        rendezvous_iterator = rendezvous(init_method, rank, world_size, timeout=timeout)
+        store, rank, world_size = next(rendezvous_iterator)
+        store.set_timeout(timeout)
+
+        # Use a PrefixStore to avoid accidental overrides of keys used by
+        # different systems (e.g. RPC) in case the store is multi-tenant.
+        store = PrefixStore(group_name, store)
+
+    # NOTE: The pg_options parameter was renamed into backend_options in PyTorch 2.6.0
+    # https://github.com/pytorch/pytorch/commit/a0c7029a75628cd5fa8df83c0de0ea98ee7fd844
+    # We need to determine the appropriate parameter name based on PyTorch version
+    pg_options_param_name = (
+        "backend_options" if str(torch.__version__) >= "2.6" else "pg_options"
+    )
+    pg, _ = _new_process_group_helper(
+        world_size,
+        rank,
+        [],
+        backend,
+        store,
+        group_name=group_name,
+        **{pg_options_param_name: pg_options},
+        timeout=timeout,
+    )
+
+    _world.pg_group_ranks[pg] = {i: i for i in range(world_size)}
+
+    return pg
+
+
+def crash_on_warnings():
+    # Crash on warning if we are running CI tests
+    return get_bool_env_var("SGLANG_IS_IN_CI")
+
+
+def print_warning_once(msg: str) -> None:
+    # Set the stacklevel to 2 to print the caller's line info
+    logger.warning(msg, stacklevel=2)
+
+
+@functools.lru_cache(None)
+def print_info_once(msg: str) -> None:
+    logger.info(msg)
+
+
+def get_device_name(device_id: int = 0) -> str:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        return torch.cuda.get_device_name(device_id)
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        return torch.xpu.get_device_name(device_id)
+
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        return torch.hpu.get_device_name(device_id)
+
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        return torch.npu.get_device_name(device_id)
+
+
+@lru_cache(maxsize=1)
+def is_habana_available() -> bool:
+    return find_spec("habana_frameworks") is not None
+
+
+@lru_cache(maxsize=8)
+def get_device(device_id: Optional[int] = None) -> str:
+    if is_cpu():
+        if cpu_has_amx_support():
+            logger.info("Intel AMX is detected, using CPU with Intel AMX support.")
+        else:
+            logger.warning(
+                "CPU device enabled, using torch native backend, low performance expected."
+            )
+        return "cpu"
+
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        if device_id is None:
+            return "cuda"
+        return "cuda:{}".format(device_id)
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        if device_id == None:
+            return "xpu"
+        return "xpu:{}".format(device_id)
+
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        if device_id == None:
+            return "npu"
+        return "npu:{}".format(device_id)
+
+    if is_habana_available():
+        try:
+            import habana_frameworks.torch.hpu
+
+            if torch.hpu.is_available():
+                if device_id == None:
+                    return "hpu"
+                return "hpu:{}".format(device_id)
+        except ImportError as e:
+            raise ImportError(
+                "Habana frameworks detected, but failed to import 'habana_frameworks.torch.hpu'."
+            )
+
+    raise RuntimeError("No accelerator (CUDA, XPU, HPU) is available.")
+
+
+@lru_cache(maxsize=1)
+def get_device_count() -> int:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        try:
+            return torch.cuda.device_count()
+        except RuntimeError:
+            return 0
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        try:
+            return torch.xpu.device_count()
+        except RuntimeError:
+            return 0
+
+    if is_habana_available():
+        try:
+            import habana_frameworks.torch.hpu
+
+            if torch.hpu.is_available():
+                return torch.hpu.device_count()
+        except (ImportError, RuntimeError):
+            return 0
+
+    return 0  # No accelerators available
+
+
+def get_device_core_count(device_id: int = 0) -> int:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        return torch.cuda.get_device_properties(device_id).multi_processor_count
+
+    return 0
+
+
+def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
+    major, minor = None, None
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        major, minor = torch.cuda.get_device_capability(device_id)
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        major, minor, *_ = torch.xpu.get_device_capability(device_id)["version"].split(
+            "."
+        )
+        major, minor = int(major), int(minor)
+
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        try:
+            # TODO(HandH1998): `get_device_capability` is not supported by `torch.hpu` for now.
+            # Update this once the support is available.
+            # major, minor = torch.hpu.get_device_capability(device_id)
+            major, minor = None, None
+        except Exception as e:
+            raise RuntimeError(
+                f"An error occurred while getting device capability of hpu: {e}."
+            ) from e
+
+    return major, minor
+
+
+def get_npu_compiler_config():
+    config = {
+        "frozen_parameter": True,
+        "tiling_schedule_optimize": True,
+        "topology_sorting_strategy": "StableRDFS",
+    }
+    return config
+
+
+def get_compiler_backend() -> str:
+    if hasattr(torch, "hpu") and torch.hpu.is_available():
+        return "hpu_backend"
+
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        try:
+            import torchair
+            import torchair.ge_concrete_graph.ge_converter.experimental.patch_for_hcom_allreduce
+            from torchair.configs.compiler_config import CompilerConfig
+        except ImportError as e:
+            raise ImportError(
+                "NPU detected, but torchair package is not installed. "
+                "Please install torchair for torch.compile support on NPU."
+            )
+        compiler_config = CompilerConfig()
+        predefined_config = get_npu_compiler_config()
+        for k, v in predefined_config.items():
+            setattr(compiler_config.experimental_config, k, v)
+
+        npu_backend = torchair.get_npu_backend(compiler_config=compiler_config)
+        return npu_backend
+
+    return "inductor"
+
+
+sglang_lib = Library("sglang", "FRAGMENT")  # noqa
+
+
+# Some backends use pytorch version < 2.4.0 which doesn't
+# support `torch.library.custom_op`.
+def supports_custom_op() -> bool:
+    return hasattr(torch.library, "custom_op")
+
+
+def direct_register_custom_op(
+    op_name: str,
+    op_func: Callable,
+    mutates_args: List[str],
+    fake_impl: Optional[Callable] = None,
+    target_lib: Optional[Library] = None,
+):
+    """
+    `torch.library.custom_op` can have significant overhead because it
+    needs to consider complicated dispatching logic. This function
+    directly registers a custom op and dispatches it to the CUDA backend.
+    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
+    for more details.
+
+    By default, the custom op is registered to the vLLM library. If you
+    want to register it to a different library, you can pass the library
+    object to the `target_lib` argument.
+
+    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
+    library object. If you want to bind the operator to a different library,
+    make sure the library object is alive when the operator is used.
+
+    Note: This function will silently skip registration if the operator
+    with the same name is already registered to avoid RuntimeError in
+    multi-engine scenarios (e.g., VERL framework).
+    """
+    import torch.library
+
+    my_lib = target_lib or sglang_lib
+
+    # Check if operator is already registered to avoid duplicate registration
+    # This is important for scenarios where multiple SGLang engines run in the same process
+    try:
+        # Try to access the operator to see if it's already registered
+        lib_name = my_lib.m.name if hasattr(my_lib.m, "name") else "sglang"
+        if hasattr(torch.ops, lib_name) and hasattr(
+            getattr(torch.ops, lib_name), op_name
+        ):
+            # Operator already exists, skip registration
+            return
+    except (AttributeError, RuntimeError):
+        # Operator doesn't exist, proceed with registration
+        pass
+
+    if hasattr(torch.library, "infer_schema"):
+        schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+    else:
+        # for pytorch 2.4
+        import torch._custom_op.impl
+
+        schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
+
+    try:
+        my_lib.define(op_name + schema_str)
+        my_lib.impl(op_name, op_func, "CUDA")
+        if fake_impl is not None:
+            my_lib._register_fake(op_name, fake_impl)
+    except RuntimeError as error:
+        if "Tried to register an operator" in str(e) and "multiple times" in str(e):
+            # Silently ignore duplicate registration errors
+            # This can happen in multi-engine scenarios
+            pass
+        else:
+            # Re-raise other RuntimeErrors
+            raise error
+    except AttributeError as error:
+        # Always re-raise AttributeError as it indicates missing dependencies
+        raise error
+
+
+def set_gpu_proc_affinity(
+    tp_size: int,
+    nnodes: int,
+    gpu_id: int,
+):
+    # current process
+    pid = os.getpid()
+    p = psutil.Process(pid)
+
+    tp_size_per_node = tp_size // nnodes
+
+    # total physical cores
+    total_pcores = psutil.cpu_count(logical=False)
+    # physical cores per TP (N.B. more Cores than GPUs on node)
+    num_cores_bind = total_pcores // tp_size_per_node
+
+    # able to handle multiple DP per node
+    start_cpu_id = (gpu_id * num_cores_bind) % total_pcores
+    end_cpu_id = start_cpu_id + num_cores_bind
+
+    if psutil.cpu_count() != psutil.cpu_count(logical=False):
+        # HT on
+        lower_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+        upper_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
+        bind_cpu_ids = list(itertools.chain(lower_cpu_ids, upper_cpu_ids))
+    else:
+        # HT off
+        bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+
+    # set cpu_affinity to current process
+    p.cpu_affinity(bind_cpu_ids)
+    logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}")
+
+
+@lru_cache(maxsize=2)
+def disable_request_logging() -> bool:
+    return get_bool_env_var("SGLANG_DISABLE_REQUEST_LOGGING")
+
+
+def dataclass_to_string_truncated(
+    data, max_length=2048, skip_names: Optional[Set[str]] = None
+):
+    if skip_names is None:
+        skip_names = set()
+    if isinstance(data, str):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return f"{repr(data[:half_length])} ... {repr(data[-half_length:])}"
+        else:
+            return f"{repr(data)}"
+    elif isinstance(data, (list, tuple)):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return str(data[:half_length]) + " ... " + str(data[-half_length:])
+        else:
+            return str(data)
+    elif isinstance(data, dict):
+        return (
+            "{"
+            + ", ".join(
+                f"'{k}': {dataclass_to_string_truncated(v, max_length)}"
+                for k, v in data.items()
+                if k not in skip_names
+            )
+            + "}"
+        )
+    elif dataclasses.is_dataclass(data):
+        fields = dataclasses.fields(data)
+        return (
+            f"{data.__class__.__name__}("
+            + ", ".join(
+                f"{f.name}={dataclass_to_string_truncated(getattr(data, f.name), max_length)}"
+                for f in fields
+                if f.name not in skip_names
+            )
+            + ")"
+        )
+    else:
+        return str(data)
+
+
+def permute_weight(x: torch.Tensor) -> torch.Tensor:
+    b_ = x.shape[0]
+    n_ = x.shape[1]
+    k_ = x.shape[2]
+
+    x_ = x
+    if x.dtype == torch.bfloat16 or x.dtype == torch.float16:
+        x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 32), 4, 8)
+    elif x.dtype == torch.float8_e4m3fnuz or x.dtype == torch.int8:
+        x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 64), 4, 16)
+    else:
+        # return x_
+        x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 8), 2, 4)
+
+    x_ = x_.permute(0, 1, 3, 4, 2, 5)
+    x_ = x_.contiguous()
+    x_ = x_.view(*x.shape)
+    return x_
+
+
+class MultiprocessingSerializer:
+    @staticmethod
+    def serialize(obj, output_str: bool = False):
+        """
+        Serialize a Python object using ForkingPickler.
+
+        Args:
+            obj: The object to serialize.
+            output_str (bool): If True, return a base64-encoded string instead of raw bytes.
+
+        Returns:
+            bytes or str: The serialized object.
+        """
+        buf = io.BytesIO()
+        ForkingPickler(buf).dump(obj)
+        buf.seek(0)
+        output = buf.read()
+
+        if output_str:
+            # Convert bytes to base64-encoded string
+            output = pybase64.b64encode(output).decode("utf-8")
+
+        return output
+
+    @staticmethod
+    def deserialize(data):
+        """
+        Deserialize a previously serialized object.
+
+        Args:
+            data (bytes or str): The serialized data, optionally base64-encoded.
+
+        Returns:
+            The deserialized Python object.
+        """
+        if isinstance(data, str):
+            # Decode base64 string to bytes
+            data = pybase64.b64decode(data, validate=True)
+
+        return ForkingPickler.loads(data)
+
+
+def debug_timing(func):
+    # todo: replace with a more organized instrumentation
+    def wrapper(*args, **kwargs):
+        if logger.isEnabledFor(logging.DEBUG):
+            tic = torch.cuda.Event(enable_timing=True)
+            toc = torch.cuda.Event(enable_timing=True)
+            tic.record()
+            result = func(*args, **kwargs)
+            toc.record()
+            toc.synchronize()  # Wait for the function to complete without synchronizing all ops on the GPU
+            elapsed = tic.elapsed_time(toc)
+            indices = kwargs.get("indices", args[1] if len(args) > 1 else None)
+            num_tokens = len(indices) if indices is not None else 0
+            throughput = num_tokens / elapsed * 1000 if elapsed > 0 else 0
+            logger.debug(
+                f"Transfer time: {elapsed} ms, throughput: {throughput} tokens/s"
+            )
+            return result
+        else:
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+def nullable_str(val: str):
+    if not val or val == "None":
+        return None
+    return val
+
+
+def pyspy_dump_schedulers():
+    """py-spy dump on all scheduler in a local node."""
+    try:
+        pid = psutil.Process().pid
+        # Command to run py-spy with the PID
+        cmd = f"py-spy dump --pid {pid}"
+        result = subprocess.run(
+            cmd, shell=True, capture_output=True, text=True, check=True
+        )
+        logger.error(f"Pyspy dump for PID {pid}:\n{result.stdout}")
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Pyspy failed to dump PID {pid}. Error: {e.stderr}")
+
+
+def kill_itself_when_parent_died():
+    if sys.platform == "linux":
+        # sigkill this process when parent worker manager dies
+        PR_SET_PDEATHSIG = 1
+        libc = ctypes.CDLL("libc.so.6")
+        libc.prctl(PR_SET_PDEATHSIG, signal.SIGKILL)
+    else:
+        logger.warning("kill_itself_when_parent_died is only supported in linux.")
+
+
+def set_uvicorn_logging_configs():
+    from uvicorn.config import LOGGING_CONFIG
+
+    LOGGING_CONFIG["formatters"]["default"][
+        "fmt"
+    ] = "[%(asctime)s] %(levelprefix)s %(message)s"
+    LOGGING_CONFIG["formatters"]["default"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
+    LOGGING_CONFIG["formatters"]["access"][
+        "fmt"
+    ] = '[%(asctime)s] %(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s'
+    LOGGING_CONFIG["formatters"]["access"]["datefmt"] = "%Y-%m-%d %H:%M:%S"
+
+
+def get_ip() -> str:
+    # SGLANG_HOST_IP env can be ignore
+    host_ip = os.getenv("SGLANG_HOST_IP", "") or os.getenv("HOST_IP", "")
+    if host_ip:
+        return host_ip
+
+    # IP is not set, try to get it from the network interface
+
+    # try ipv4
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    try:
+        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    # try ipv6
+    try:
+        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+        # Google's public DNS server, see
+        # https://developers.google.com/speed/public-dns/docs/using#addresses
+        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    # try  using hostname
+    hostname = socket.gethostname()
+    try:
+        ip_addr = socket.gethostbyname(hostname)
+        warnings.warn("using local ip address: {}".format(ip_addr))
+        return ip_addr
+    except Exception:
+        pass
+
+    warnings.warn(
+        "Failed to get the IP address, using 0.0.0.0 by default."
+        "The value can be set by the environment variable"
+        " SGLANG_HOST_IP or HOST_IP.",
+        stacklevel=2,
+    )
+    return "0.0.0.0"
+
+
+def get_open_port() -> int:
+    port = os.getenv("SGLANG_PORT")
+    if port is not None:
+        port = int(port)
+        while True:
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.bind(("", port))
+                    return port
+            except OSError:
+                port += 1  # Increment port number if already in use
+                logger.info("Port %d is already in use, trying port %d", port - 1, port)
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def is_valid_ipv6_address(address: str) -> bool:
+    try:
+        ipaddress.IPv6Address(address)
+        return True
+    except ValueError:
+        return False
+
+
+def maybe_wrap_ipv6_address(address: str) -> str:
+    if is_valid_ipv6_address(address):
+        return f"[{address}]"
+    return address
+
+
+def format_tcp_address(ip: str, port: int) -> str:
+    return f"tcp://{maybe_wrap_ipv6_address(ip)}:{port}"
+
+
+def configure_ipv6(dist_init_addr):
+    addr = dist_init_addr
+    end = addr.find("]")
+    if end == -1:
+        raise ValueError("invalid IPv6 address format: missing ']'")
+
+    host = addr[: end + 1]
+
+    # this only validates the address without brackets: we still need the below checks.
+    # if it's invalid, immediately raise an error so we know it's not formatting issues.
+    if not is_valid_ipv6_address(host[1:end]):
+        raise ValueError(f"invalid IPv6 address: {host}")
+
+    port_str = None
+    if len(addr) > end + 1:
+        if addr[end + 1] == ":":
+            port_str = addr[end + 2 :]
+        else:
+            raise ValueError("received IPv6 address format: expected ':' after ']'")
+
+    if not port_str:
+        raise ValueError(
+            "a port must be specified in IPv6 address (format: [ipv6]:port)"
+        )
+
+    try:
+        port = int(port_str)
+    except ValueError:
+        raise ValueError(f"invalid port in IPv6 address: '{port_str}'")
+    return port, host
+
+
+def launch_dummy_health_check_server(host, port, enable_metrics):
+    import asyncio
+
+    import uvicorn
+    from fastapi import FastAPI, Response
+
+    app = FastAPI()
+
+    @app.get("/health")
+    async def health():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    @app.get("/health_generate")
+    async def health_generate():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    # Add prometheus middleware
+    if enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
+    config = uvicorn.Config(
+        app,
+        host=host,
+        port=port,
+        timeout_keep_alive=5,
+        loop="auto",
+        log_config=None,
+        log_level="warning",
+    )
+    server = uvicorn.Server(config=config)
+
+    try:
+        loop = asyncio.get_running_loop()
+        logger.info(
+            f"Dummy health check server scheduled on existing loop at {host}:{port}"
+        )
+        loop.create_task(server.serve())
+
+    except RuntimeError:
+        logger.info(f"Starting dummy health check server at {host}:{port}")
+        server.run()
+
+
+def create_checksum(directory: str):
+    raise NotImplementedError()
+
+
+def set_cuda_arch():
+    if is_flashinfer_available():
+        capability = torch.cuda.get_device_capability()
+        arch = f"{capability[0]}.{capability[1]}"
+        os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"
+
+
+def next_power_of_2(n: int):
+    return 1 << (n - 1).bit_length() if n > 0 else 1
+
+
+def round_up(x: int, y: int) -> int:
+    return ((x - 1) // y + 1) * y
+
+
+setattr(triton, "next_power_of_2", next_power_of_2)
+
+
+class EmptyContextManager:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        pass
+
+
+def empty_context(*args, **kwargs):
+    return EmptyContextManager()
+
+
+def add_prefix(name: str, prefix: str) -> str:
+    """Add a weight path prefix to a module name.
+
+    Args:
+        name: base module name.
+        prefix: weight prefix str to added to the front of `name` concatenated with `.`.
+
+    Returns:
+        The string `prefix.name` if prefix is non-empty, otherwise just `name`.
+    """
+    return name if not prefix else f"{prefix}.{name}"
+
+
+def is_remote_url(url: Union[str, Path]) -> bool:
+    """
+    Check if the URL is a remote URL of the format:
+    <connector_type>://<host>:<port>/<model_name>
+    """
+    if isinstance(url, Path):
+        return False
+
+    pattern = r"(.+)://(.*)"
+    m = re.match(pattern, url)
+    return m is not None
+
+
+def parse_connector_type(url: str) -> str:
+    """
+    Parse the connector type from the URL of the format:
+    <connector_type>://<path>
+    """
+    pattern = r"(.+)://(.*)"
+    m = re.match(pattern, url)
+    if m is None:
+        return ""
+
+    return m.group(1)
+
+
+def retry(
+    fn,
+    max_retry: int,
+    initial_delay: float = 2.0,
+    max_delay: float = 60.0,
+    should_retry: Callable[[Any], bool] = lambda e: True,
+):
+    for try_index in itertools.count():
+        try:
+            return fn()
+        except Exception as e:
+            if try_index >= max_retry:
+                raise Exception(f"retry() exceed maximum number of retries.")
+
+            if not should_retry(e):
+                raise Exception(f"retry() observe errors that should not be retried.")
+
+            delay = min(initial_delay * (2**try_index), max_delay) * (
+                0.75 + 0.25 * random.random()
+            )
+
+            logger.warning(
+                f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
+            )
+            traceback.print_exc()
+
+            time.sleep(delay)
+
+
+def flatten_nested_list(nested_list):
+    if isinstance(nested_list, list):
+        return [
+            item for sublist in nested_list for item in flatten_nested_list(sublist)
+        ]
+    else:
+        return [nested_list]
+
+
+def is_non_idle_and_non_empty(forward_mode, hidden_states):
+    return (
+        (forward_mode is not None)
+        and not forward_mode.is_idle()
+        and hidden_states.shape[0] > 0
+    )
+
+
+def fast_topk(values, topk, dim):
+    if topk == 1:
+        # Use max along the specified dimension to get both value and index
+        return torch.max(values, dim=dim, keepdim=True)
+    else:
+        # Use topk for efficiency with larger k values
+        return torch.topk(values, topk, dim=dim)
+
+
+def bind_or_assign(target, source):
+    if target is not None:
+        target.copy_(source)
+        return target
+    else:
+        return source
+
+
+def get_local_ip_auto() -> str:
+    interface = os.environ.get("SGLANG_LOCAL_IP_NIC", None)
+    return (
+        get_local_ip_by_nic(interface)
+        if interface is not None
+        else get_local_ip_by_remote()
+    )
+
+
+def get_local_ip_by_nic(interface: str) -> str:
+    try:
+        import netifaces
+    except ImportError as e:
+        raise ImportError(
+            "Environment variable SGLANG_LOCAL_IP_NIC requires package netifaces, please install it through 'pip install netifaces'"
+        ) from e
+
+    try:
+        addresses = netifaces.ifaddresses(interface)
+        if netifaces.AF_INET in addresses:
+            for addr_info in addresses[netifaces.AF_INET]:
+                ip = addr_info.get("addr")
+                if ip and ip != "127.0.0.1" and ip != "0.0.0.0":
+                    return ip
+        if netifaces.AF_INET6 in addresses:
+            for addr_info in addresses[netifaces.AF_INET6]:
+                ip = addr_info.get("addr")
+                if ip and not ip.startswith("fe80::") and ip != "::1":
+                    return ip.split("%")[0]
+    except (ValueError, OSError) as e:
+        raise ValueError(
+            "Can not get local ip from NIC. Please verify whether SGLANG_LOCAL_IP_NIC is set correctly."
+        )
+
+    # Fallback
+    return get_local_ip_by_remote()
+
+
+def get_local_ip_by_remote() -> str:
+    # try ipv4
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    try:
+        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    try:
+        hostname = socket.gethostname()
+        ip = socket.gethostbyname(hostname)
+        if ip and ip != "127.0.0.1" and ip != "0.0.0.0":
+            return ip
+    except Exception:
+        pass
+
+    # try ipv6
+    try:
+        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
+        # Google's public DNS server, see
+        # https://developers.google.com/speed/public-dns/docs/using#addresses
+        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
+        return s.getsockname()[0]
+    except Exception:
+        raise ValueError("Can not get local ip")
+
+
+def is_page_size_one(server_args):
+    return server_args.page_size == 1
+
+
+# TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
+# TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
+def is_no_spec_infer_or_topk_one(server_args):
+    return server_args.speculative_eagle_topk is None or (
+        server_args.speculative_eagle_topk is not None
+        and server_args.speculative_eagle_topk == 1
+        and is_page_size_one(server_args)
+    )
+
+
+def is_fa3_default_architecture(hf_config):
+    architectures = getattr(hf_config, "architectures", None)
+    if not isinstance(architectures, list) or not architectures:
+        return False
+    default_archs = {
+        "Qwen2ForCausalLM",
+        "Llama4ForConditionalGeneration",
+        "LlamaForCausalLM",
+        "Gemma2ForCausalLM",
+        "Gemma3ForConditionalGeneration",
+        "Qwen3ForCausalLM",
+        "Qwen3MoeForCausalLM",
+        "Glm4MoeForCausalLM",
+        "Glm4vMoeForConditionalGeneration",
+        "Step3VLForConditionalGeneration",
+    }
+    return architectures[0] in default_archs
+
+
+# Can be more general if it is used in multiple places (keep it simple and thus not general now)
+class BumpAllocator:
+    def __init__(self, buffer_size: int, dtype, device):
+        self._buffer = torch.zeros((buffer_size,), dtype=dtype, device=device)
+        self._pointer = 0
+
+    def allocate(self, size: int):
+        assert self._pointer + size <= len(self._buffer)
+        output = self._buffer[self._pointer : self._pointer + size]
+        self._pointer += size
+        return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
+
+
+def load_json_config(data: str):
+    try:
+        return json.loads(data)
+    except JSONDecodeError:
+        return json.loads(Path(data).read_text())
+
+
+def dispose_tensor(x: torch.Tensor):
+    x.set_(torch.empty((0,), device=x.device, dtype=x.dtype))
+
+
+T = TypeVar("T")
+
+
+class Withable(Generic[T]):
+    def __init__(self):
+        self._value: Optional[T] = None
+
+    @property
+    def value(self) -> T:
+        return self._value
+
+    @contextmanager
+    def with_value(self, new_value: T):
+        assert self._value is None
+        self._value = new_value
+        try:
+            yield
+        finally:
+            assert self._value is new_value
+            self._value = None
+
+
+def require_mlp_tp_gather(server_args):
+    """
+    Check if the input of MLP is obtained by all-gather rather than all-reduce. This only happens when each MLP TP group contains multiple attention DP groups.
+    """
+    if server_args.enable_dp_attention:
+        assert server_args.dp_size > 1, "dp_size must be greater than 1"
+        if (
+            server_args.moe_dense_tp_size is None
+        ):  # TODO(ch-wan): some MoE models do not have dense layers
+            return True
+        elif not server_args.enable_dp_lm_head:
+            return True
+        elif server_args.moe_a2a_backend == "none":
+            return True
+        else:
+            return (
+                server_args.moe_dense_tp_size
+                > server_args.tp_size // server_args.dp_size
+            )
+    else:
+        return False
+
+
+def require_attn_tp_gather(server_args):
+    """
+    Check if the input of attention is scattered.
+    """
+    assert server_args.moe_dense_tp_size in [1, None]
+    if server_args.moe_a2a_backend != "none" or server_args.moe_dense_tp_size == 1:
+        if server_args.enable_dp_attention:
+            return server_args.dp_size < server_args.tp_size
+        else:
+            return True
+    else:
+        return False
+
+
+def require_gathered_buffer(server_args):
+    return require_mlp_tp_gather(server_args) or require_attn_tp_gather(server_args)
+
+
+def require_mlp_sync(server_args):
+    return server_args.enable_dp_attention or require_gathered_buffer(server_args)
+
+
+def find_local_repo_dir(repo_id: str, revision: Optional[str] = None) -> Optional[str]:
+    import huggingface_hub as hf
+
+    # Build cache path
+    cache_path = os.path.join(
+        hf.constants.HF_HUB_CACHE,
+        hf.constants.REPO_ID_SEPARATOR.join(["models", *repo_id.split("/")]),
+    )
+
+    # Get revision from main ref if not specified
+    if not revision:
+        ref_path = os.path.join(cache_path, "refs", "main")
+        if os.path.isfile(ref_path):
+            with open(ref_path) as f:
+                revision = f.read().strip()
+
+    # List files from revision directory
+    if revision:
+        rev_dir = os.path.join(cache_path, "snapshots", revision)
+        if os.path.isdir(rev_dir):
+            return rev_dir
+
+    return None
+
+
+def read_system_prompt_from_file(model_name: str) -> str:
+    """Read system prompt from a file in the HuggingFace cache directory.
+
+    Args:
+        model_name: The model name to construct the file path
+
+    Returns:
+        The system prompt content from the file, or empty string if file not found
+    """
+    try:
+        local_repo_dir = find_local_repo_dir(model_name)
+        if local_repo_dir:
+            system_prompt_file = os.path.join(local_repo_dir, "SYSTEM_PROMPT.txt")
+            if os.path.exists(system_prompt_file):
+                with open(system_prompt_file, "r", encoding="utf-8") as f:
+                    return f.read()
+
+        return ""
+    except Exception:
+        # If anything fails, return empty string
+        return ""
+
+
+def bind_or_assign(target, source):
+    if target is not None:
+        target.copy_(source)
+        return target
+    else:
+        return source
+
+
+def prepack_weight_if_needed(weight):
+    if weight.device != torch.device("cpu"):
+        return weight
+    if not cpu_has_amx_support():
+        return weight
+
+    return torch.ops.sgl_kernel.convert_weight_packed(weight)
+
+
+# TODO: currently gemm kernel has the below requirements:
+# OC % TILE_N == 0, where TILE_N = 16
+# IC % TILE_K == 0, where TILE_K = 32
+def dim_is_supported(weight):
+    return weight.size(0) % 16 == 0 and weight.size(1) % 32 == 0
+
+
+def _process_weight_after_loading(module, weight_names, transpose_dims=None) -> None:
+    # Pack weight for get better performance on CPU
+    devices = {getattr(module, weight_name).device for weight_name in weight_names}
+    assert len(devices) == 1, f"Expects all weights to be on the same device"
+    device = devices.pop()
+
+    if transpose_dims:
+        assert len(weight_names) == len(
+            transpose_dims
+        ), "len(weight_names) should be equal to len(transpose_dims)"
+
+    for i, weight_name in enumerate(weight_names):
+        weight_tensor = getattr(module, weight_name)
+
+        # We don't pack weight or use intel amx backend if any weight of this module has unsupported dim.
+        if not dim_is_supported(weight_tensor):
+            logger.warning(
+                f"Expects weight.size(0) % 16 == 0 and weight.size(1) % 32 == 0 "
+                f"but {weight_tensor.size(0)=} and {weight_tensor.size(1)=} in {module}. "
+                f"{module} won't use intel amx backend."
+            )
+            module.use_intel_amx_backend = False
+            return
+
+        if transpose_dims and transpose_dims[i]:
+            weight_tensor = weight_tensor.transpose(*transpose_dims[i])
+
+        packed_weight = torch.nn.Parameter(
+            prepack_weight_if_needed(weight_tensor),
+            requires_grad=False,
+        )
+        packed_weight.__dict__ = weight_tensor.__dict__
+        setattr(module, weight_name, packed_weight)
+
+    module.use_intel_amx_backend = (
+        device == torch.device("cpu") and cpu_has_amx_support()
+    )
+
+    if (
+        module.use_intel_amx_backend
+        and hasattr(module, "bias")
+        and module.bias is not None
+    ):
+        module.bias = torch.nn.Parameter(module.bias.data.float(), requires_grad=False)
+
+
+class PackWeightMethod:
+    def __init__(self, weight_names, transpose_dims=None):
+        self.weight_names = weight_names
+        self.transpose_dims = transpose_dims
+
+    def process_weights_after_loading(self, module) -> None:
+        _process_weight_after_loading(module, self.weight_names, self.transpose_dims)
+
+
+class LazyValue:
+    def __init__(self, creator: Callable):
+        self._creator = creator
+        self._value = None
+
+    @property
+    def value(self):
+        if self._creator is not None:
+            self._value = self._creator()
+            self._creator = None
+        return self._value
+
+
+def dynamic_import(func_path: str):
+    parts = func_path.split(".")
+    if len(parts) < 2:
+        raise ValueError(
+            "func_path should contain both module name and func name (such as 'module.func')"
+        )
+    module_path = ".".join(parts[:-1])
+    func_name = parts[-1]
+    module = importlib.import_module(module_path)
+    func = getattr(module, func_name)
+    return func
+
+
+def gc_object_counts():
+    import gc
+
+    g0 = len(gc.get_objects(0))
+    g1 = len(gc.get_objects(1))
+    g2 = len(gc.get_objects(2))
+    return g0, g1, g2
+
+
+def configure_gc_warning(warn_threshold_secs):
+    import gc
+
+    gc_start_time = {}
+
+    def gc_callback(phase, info):
+        gen = info.get("generation", "?")
+        if phase == "start":
+            gc_start_time[gen] = time.time()
+        elif phase == "stop":
+            duration = time.time() - gc_start_time.get(gen, time.time())
+            if duration > warn_threshold_secs:
+                g0, g1, g2 = gc_object_counts()
+                logger.warn(
+                    f"LONG GARBAGE COLLECTION DETECTED | Generation {gen} | Duration: {duration:.4f}s | # Objects: gen0={g0}, gen1={g1}, gen2={g2} | "
+                    f"This may cause latency jitter. Consider calling the freeze_gc API after sending a few warmup requests."
+                )
+
+    gc.callbacks.append(gc_callback)
+
+
+def freeze_gc(context: str):
+    import gc
+
+    g0_before, g1_before, g2_before = gc_object_counts()
+    gc.freeze()
+    g0_after, g1_after, g2_after = gc_object_counts()
+    logger.info(
+        f"Freezing GC in {context} process. "
+        f"gen0: {g0_before}->{g0_after}, "
+        f"gen1: {g1_before}->{g1_after}, "
+        f"gen2: {g2_before}->{g2_after}"
+    )
+
+
+def configure_gc_logger():
+    logger.info("Enable GC Logger")
+
+    import gc
+
+    gc_start_time = {}
+
+    def gc_callback(phase, info):
+        gen = info.get("generation", "?")
+        if phase == "start":
+            gc_start_time[gen] = time.time()
+            logger.info(f"GC start: Time {time.time()} | Generation {gen}")
+        elif phase == "stop":
+            duration = time.time() - gc_start_time.get(gen, time.time())
+            collected = info.get("collected", "?")
+            uncollectable = info.get("uncollectable", "?")
+            logger.info(
+                f"GC end: Time {time.time()} | Generation {gen} | "
+                f"Duration: {duration:.4f}s | Collected: {collected} | Uncollectable: {uncollectable} "
+                f'{"(LONG GC)" if duration > 0.1 else ""}'
+            )
+
+    gc.callbacks.append(gc_callback)
+
+
+# COPIED FROM DeepGEMM
+def align(x: int, y: int) -> int:
+    return ceil_div(x, y) * y
+
+
+# COPIED FROM DeepGEMM
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def parse_lscpu_topology():
+    try:
+        # Get CPU topology: CPU,Core,Socket,Node
+        output = subprocess.check_output(
+            ["lscpu", "-p=CPU,Core,Socket,Node"], text=True
+        )
+    except Exception as e:
+        raise RuntimeError(f"Unexpected error running 'lscpu': {e}")
+
+    # Parse only data lines (skip comments)
+    cpu_info = []
+    for line in output.splitlines():
+        if not line.startswith("#"):
+            cpu, core, socket, node = map(int, line.strip().split(","))
+            cpu_info.append((cpu, core, socket, node))
+
+    # [(0,0,0,0),(1,1,0,0),...,(43,43,0,1),...,(256,0,0,0),...]
+    return cpu_info
+
+
+def get_physical_cpus_by_numa():
+    cpu_info = parse_lscpu_topology()
+
+    # Map NUMA node -> set of (core_id, socket) to avoid duplicates
+    # 0: {(0,0): 0, (1, 0): 1,...}
+    # ...
+    # 5: {(214,1): 214, (215,1): 215}
+    physical_by_node = defaultdict(dict)  # node -> core_id -> cpu_id
+
+    for cpu, core, socket, node in cpu_info:
+        key = (core, socket)
+        if key not in physical_by_node[node]:
+            physical_by_node[node][
+                key
+            ] = cpu  # pick first CPU seen for that physical core
+
+    # Retrieves CPUs that the current process is allowed to run on
+    cpus_allowed_list = psutil.Process().cpu_affinity()
+
+    # Convert to list of physical CPUs per node
+    # 0: [0,1,2,...,42]
+    # ...
+    # 2: [86,87,...,127]
+    # ...
+    # 5: [214,215,...,255]
+    node_to_cpus = {}
+    for node, core_to_cpu in physical_by_node.items():
+        cpus = sorted(core_to_cpu.values())
+        allowed_cpus = set(cpus).intersection(cpus_allowed_list)
+        node_to_cpus[node] = allowed_cpus
+
+    return node_to_cpus
+
+
+# Only physical cores are used. Logical cores are excluded.
+def get_cpu_ids_by_node():
+    node_to_cpus = get_physical_cpus_by_numa()
+    # Sort by NUMA node index
+    cpu_ids = [
+        ",".join(map(str, sorted(node_to_cpus[node]))) for node in sorted(node_to_cpus)
+    ]
+
+    # ['0,1,2,3', '4,5,6,7', '8,9,10,11', '12,13,14,15', '16,17,18,19', '20,21,22,23']
+    return cpu_ids
+
+
+def is_shm_available(dtype, world_size, local_size):
+    return (
+        cpu_has_amx_support()
+        and dtype in [torch.bfloat16, torch.float]
+        and world_size >= 1
+        and world_size == local_size
+    )
+
+
+def lru_cache_frozenset(maxsize=128):
+    def _to_hashable(o):
+        try:
+            hash(o)
+            return o
+        except TypeError:
+            # Not hashable; convert based on type
+            if isinstance(o, (dict)):
+                return frozenset(
+                    (_to_hashable(k), _to_hashable(v)) for k, v in o.items()
+                )
+            elif isinstance(o, set):
+                return frozenset(_to_hashable(v) for v in o)
+            elif isinstance(o, (list, tuple)) or (
+                isinstance(o, Sequence) and not isinstance(o, (str, bytes))
+            ):
+                return tuple(_to_hashable(v) for v in o)
+            else:
+                raise TypeError(f"Cannot make hashable: {type(o)}")
+
+    def decorator(func):
+        cache = OrderedDict()
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            h_args = tuple(_to_hashable(a) for a in args)
+            h_kwargs = frozenset(
+                (_to_hashable(k), _to_hashable(v)) for k, v in kwargs.items()
+            )
+            key = (h_args, h_kwargs)
+            if key in cache:
+                cache.move_to_end(key)
+                return cache[key]
+            result = func(*args, **kwargs)
+            cache[key] = result
+            if maxsize is not None and len(cache) > maxsize:
+                cache.popitem(last=False)
+            return result
+
+        wrapper.cache_clear = cache.clear  # For manual cache clearing
+        return wrapper
+
+    return decorator
+
+
+def get_origin_rid(rid):
+    return rid.split("_", 1)[1] if "_" in rid else rid
+
+
+def apply_module_patch(target_module, target_function, wrappers):
+    original_module, original_function = parse_module_path(
+        target_module, target_function, False
+    )
+
+    original_function_id = id(original_function)
+
+    candidate = original_function
+    for wrapper in wrappers:
+        candidate = wrapper(candidate)
+    if target_function is not None:
+        setattr(original_module, target_function, candidate)
+
+    for key, value in sys.modules.copy().items():
+        if (
+            target_function is not None
+            and hasattr(value, target_function)
+            and id(getattr(value, target_function)) == original_function_id
+        ):
+            setattr(value, target_function, candidate)
+
+
+def parse_module_path(module_path, function_name, create_dummy):
+    from importlib.machinery import ModuleSpec
+
+    def create_dummy_module(full_path, parent=None):
+        """Create and register a placeholder module"""
+        dummy = types.ModuleType(full_path)
+        dummy.__file__ = "vllm_ascend.dummy_module.py"
+        dummy.__spec__ = ModuleSpec(full_path, None)
+        sys.modules[full_path] = dummy
+        if parent:
+            setattr(parent, full_path.split(".")[-1], dummy)
+        return dummy
+
+    def create_placeholder_function(func_name):
+        """Create dummy function that raises when called"""
+
+        def placeholder(*args, **kwargs):
+            raise NotImplementedError(f"Function {func_name} is a placeholder")
+
+        placeholder.__name__ = func_name
+        return placeholder
+
+    modules = module_path.split(".")
+    current_module = None
+    processed_path = []
+
+    for idx, part in enumerate(modules):
+        current_path = ".".join(modules[: idx + 1])
+        parent_path = ".".join(modules[:idx]) if idx > 0 else None
+
+        try:
+            current_module = importlib.import_module(current_path)
+        except ModuleNotFoundError:
+            # Handle missing module
+            parent = importlib.import_module(parent_path) if parent_path else None
+            if parent and hasattr(parent, part):
+                # Use existing attribute from parent
+                current_module = getattr(parent, part)
+                # Check for early function resolution
+                if function_name and hasattr(current_module, function_name):
+                    return current_module, getattr(current_module, function_name)
+                if function_name and create_dummy:
+                    ph_func = create_placeholder_function(function_name)
+                    setattr(current_module, function_name, ph_func)
+                    return current_module, ph_func
+                if function_name:
+                    raise AttributeError(
+                        f"Function {function_name} missing in {current_path}"
+                    )
+            else:
+                if not create_dummy:
+                    raise
+                # Create and register dummy module
+                current_module = create_dummy_module(
+                    current_path,
+                    parent=(
+                        importlib.import_module(parent_path) if parent_path else None
+                    ),
+                )
+
+        processed_path.append(part)
+
+    # Final function handling
+    final_module = sys.modules[module_path]
+    if function_name is not None:
+        if not hasattr(final_module, function_name):
+            if create_dummy:
+                ph_func = create_placeholder_function(function_name)
+                setattr(final_module, function_name, ph_func)
+            else:
+                setattr(final_module, function_name, None)
+        return final_module, getattr(final_module, function_name)
+
+    return final_module, None
+
+
+def mxfp_supported():
+    """
+    Returns whether the current platform supports MX types.
+    """
+    if torch.version.hip:
+        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
+        return any(gfx in gcn_arch for gfx in ["gfx95"])
+    else:
+        return False
+
+
+@lru_cache(maxsize=1)
+def is_gfx95_supported():
+    """
+    Returns whether the current platform supports MX types.
+    """
+    if torch.version.hip:
+        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
+        return any(gfx in gcn_arch for gfx in ["gfx95"])
+    else:
+        return False
+
+
+# LoRA-related constants and utilities
+SUPPORTED_LORA_TARGET_MODULES = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "qkv_proj",
+    "gate_up_proj",
+]
+
+LORA_TARGET_ALL_MODULES = "all"
+
+
+class ConcurrentCounter:
+    """
+    An asynchronous counter for managing concurrent tasks that need
+    coordinated increments, decrements, and waiting until the count reaches zero.
+
+    This class is useful for scenarios like tracking the number of in-flight tasks
+    and waiting for them to complete.
+    """
+
+    def __init__(self, initial: int = 0):
+        """
+        Initialize the counter with an optional initial value.
+
+        Args:
+            initial (int): The initial value of the counter. Default is 0.
+        """
+        self._count = initial
+        self._condition = asyncio.Condition()
+
+    def value(self) -> int:
+        """
+        Return the current value of the counter.
+
+        Note:
+            This method is not synchronized. It may return a stale value
+            if other coroutines are concurrently modifying the counter.
+
+        Returns:
+            int: The current counter value.
+        """
+        return self._count
+
+    def __repr__(self) -> str:
+        """Return an informative string representation of the counter."""
+        return f"<ConcurrentCounter value={self.value()}>"
+
+    async def increment(self, n: int = 1, notify_all: bool = True):
+        """
+        Atomically increment the counter by a given amount and notify all waiters.
+
+        Args:
+            n (int): The amount to increment the counter by. Default is 1.
+            notify_all (bool): Whether to notify all waiters after incrementing. Default is True.
+        """
+        async with self._condition:
+            self._count += n
+            if notify_all:
+                self._condition.notify_all()
+
+    async def decrement(self, n: int = 1, notify_all: bool = True):
+        """
+        Atomically decrement the counter by a given amount and notify all waiters.
+
+        Args:
+            n (int): The amount to decrement the counter by. Default is 1.
+            notify_all (bool): Whether to notify all waiters after decrementing. Default is True.
+        """
+        async with self._condition:
+            self._count -= n
+            if notify_all:
+                self._condition.notify_all()
+
+    async def wait_for(self, condition: Callable[[int], bool]):
+        """
+        Asynchronously wait until the counter satisfies a given condition.
+
+        This suspends the calling coroutine without blocking the thread, allowing
+        other tasks to run while waiting. When the condition is met, the coroutine resumes.
+
+        Args:
+            condition (Callable[[int], bool]): A function that takes the current counter value
+                and returns True when the condition is satisfied.
+        """
+        async with self._condition:
+            await self._condition.wait_for(lambda: condition(self._count))
+
+    async def wait_for_zero(self):
+        """
+        Asynchronously wait until the counter reaches zero.
+
+        This suspends the calling coroutine without blocking the thread, allowing
+        other tasks to run while waiting. When the counter becomes zero, the coroutine resumes.
+        """
+        await self.wait_for(lambda count: count == 0)
+
+
+@lru_cache(maxsize=1)
+def is_triton_kernels_available() -> bool:
+    return importlib.util.find_spec("triton_kernels") is not None
+
+
+def check_cuda_result(raw_output):
+    import cuda.bindings.runtime as cuda_rt
+
+    err, *results = raw_output
+    if err != cuda_rt.cudaError_t.cudaSuccess:
+        raise Exception(f"CUDA error: {err}")
+
+    return results
+
+
+def numa_bind_to_node(node: int):
+    libnuma = ctypes.CDLL("libnuma.so")
+    if libnuma.numa_available() < 0:
+        raise SystemError("numa not available on this system")
+
+    libnuma.numa_run_on_node(ctypes.c_int(node))
+    libnuma.numa_set_localalloc()
diff --git a/python/sglang/srt/warmup.py b/python/sglang/srt/warmup.py
new file mode 100644
index 000000000..0bed9fb94
--- /dev/null
+++ b/python/sglang/srt/warmup.py
@@ -0,0 +1,56 @@
+import logging
+from typing import List
+
+import numpy as np
+import tqdm
+
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__file__)
+
+_warmup_registry = {}
+
+
+def warmup(name: str) -> callable:
+    def decorator(fn: callable):
+        _warmup_registry[name] = fn
+        return fn
+
+    return decorator
+
+
+async def execute_warmups(
+    disaggregation_mode: str,
+    warmup_names: List[str],
+    tokenizer_manager: TokenizerManager,
+):
+    for warmup_name in warmup_names:
+        if warmup_name not in _warmup_registry:
+            logger.warning(f"Could not find custom warmup {warmup_name}")
+            continue
+        logger.info(f"Running warmup {warmup_name}")
+        await _warmup_registry[warmup_name](disaggregation_mode, tokenizer_manager)
+
+
+@warmup("voice_chat")
+async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
+    # this warms up the fused_moe triton kernels and caches them
+    # if we don't do this we break real time inference for voice chat
+    for i in tqdm.trange(1, 512):
+        size = i * 4
+        generate_req_input = GenerateReqInput(
+            input_ids=(np.random.randint(2**16, size=[size])).tolist(),
+            sampling_params={
+                "max_new_tokens": 30,
+                "temperature": 0.8,
+                "stop_token_ids": [1],
+                "min_p": 0.0,
+            },
+        )
+        if disaggregation_mode != "null":
+            generate_req_input.bootstrap_room = 0
+            generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
+        await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
diff --git a/python/sglang/srt/weight_sync/tensor_bucket.py b/python/sglang/srt/weight_sync/tensor_bucket.py
new file mode 100644
index 000000000..44273713f
--- /dev/null
+++ b/python/sglang/srt/weight_sync/tensor_bucket.py
@@ -0,0 +1,106 @@
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import torch
+
+
+@dataclass
+class FlattenedTensorMetadata:
+    """Metadata for a tensor in a flattened bucket"""
+
+    name: str
+    shape: torch.Size
+    dtype: torch.dtype
+    start_idx: int
+    end_idx: int
+    numel: int
+
+
+class FlattenedTensorBucket:
+    """
+    A bucket that flattens multiple tensors into a single tensor for efficient processing
+    while preserving all metadata needed for reconstruction.
+    """
+
+    def __init__(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]] = None,
+        flattened_tensor: torch.Tensor = None,
+        metadata: List[FlattenedTensorMetadata] = None,
+    ):
+        """
+        Initialize a tensor bucket from a list of named tensors OR from pre-flattened data.
+        Args:
+            named_tensors: List of (name, tensor) tuples (for creating new bucket)
+            flattened_tensor: Pre-flattened tensor (for reconstruction)
+            metadata: Pre-computed metadata (for reconstruction)
+        """
+        if named_tensors is not None:
+            # Create bucket from named tensors
+            self.metadata: List[FlattenedTensorMetadata] = [None] * len(named_tensors)
+            self.flattened_tensor: torch.Tensor = None
+
+            if not named_tensors:
+                raise ValueError("Cannot create empty tensor bucket")
+
+            # Collect metadata and flatten tensors
+            current_idx = 0
+            flattened_tensors: List[torch.Tensor] = [None] * len(named_tensors)
+
+            for i, (name, tensor) in enumerate(named_tensors):
+                flattened = tensor.flatten()
+                flattened_tensors[i] = flattened
+
+                # Store metadata
+
+                numel = flattened.numel()
+                metadata_obj = FlattenedTensorMetadata(
+                    name=name,
+                    shape=tensor.shape,
+                    dtype=tensor.dtype,
+                    start_idx=current_idx,
+                    end_idx=current_idx + numel,
+                    numel=numel,
+                )
+                self.metadata[i] = metadata_obj
+                current_idx += numel
+
+            # Concatenate all flattened tensors
+            self.flattened_tensor = torch.cat(flattened_tensors, dim=0)
+        else:
+            # Initialize from pre-flattened data
+            if flattened_tensor is None or metadata is None:
+                raise ValueError(
+                    "Must provide either named_tensors or both flattened_tensor and metadata"
+                )
+            self.flattened_tensor = flattened_tensor
+            self.metadata = metadata
+
+    def get_flattened_tensor(self) -> torch.Tensor:
+        """Get the flattened tensor containing all bucket tensors"""
+        return self.flattened_tensor
+
+    def get_metadata(self) -> List[FlattenedTensorMetadata]:
+        """Get metadata for all tensors in the bucket"""
+        return self.metadata
+
+    def reconstruct_tensors(self) -> List[Tuple[str, torch.Tensor]]:
+        """
+        Reconstruct original tensors from flattened tensor with optimized performance.
+        Uses memory-efficient operations to minimize allocations and copies.
+        """
+        # preallocate the result list
+        reconstructed = [None] * len(self.metadata)
+
+        for i, meta in enumerate(self.metadata):
+            tensor = self.flattened_tensor[meta.start_idx : meta.end_idx].reshape(
+                meta.shape
+            )
+
+            # batch dtype conversion (if needed)
+            if tensor.dtype != meta.dtype:
+                tensor = tensor.to(meta.dtype)
+
+            reconstructed[i] = (meta.name, tensor)
+
+        return reconstructed
diff --git a/python/sglang/srt/weight_sync/utils.py b/python/sglang/srt/weight_sync/utils.py
new file mode 100644
index 000000000..f308207e2
--- /dev/null
+++ b/python/sglang/srt/weight_sync/utils.py
@@ -0,0 +1,119 @@
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.tensor import DTensor
+
+from sglang.srt.entrypoints.engine import Engine
+from sglang.srt.managers.io_struct import UpdateWeightsFromTensorReqInput
+from sglang.srt.model_executor.model_runner import LocalSerializedTensor
+from sglang.srt.utils import MultiprocessingSerializer
+
+
+async def update_weights(
+    engine: Engine,
+    params_batch: list[tuple[str, torch.Tensor]],
+    device_mesh_key: str,
+    device_mesh: DeviceMesh,
+    load_format: Optional[str] = None,
+):
+    """
+    Update weights for the inference engine.
+    This function is designed to be stateless, so that the caller process could keep the stateful engine.
+    Example Use Case:
+        - Multiple Producer Process will call this function in a SPMD style
+
+    Args:
+        engine: The inference engine created by the caller process.
+        params_batch: A list of (name, tensor) tuples. We batched the tensors to avoid the overhead of cpu call.
+        device_mesh_key: The key of the device mesh. Typically "tp" or "infer_tp"
+        device_mesh: The device mesh.
+        load_format: The format of the weights.
+    """
+    infer_tp_size = device_mesh[device_mesh_key].mesh.size()[0]
+    infer_tp_rank = device_mesh[device_mesh_key].get_local_rank()
+    from sglang.srt.patch_torch import monkey_patch_torch_reductions
+
+    monkey_patch_torch_reductions()
+
+    # [
+    #   (name0, ipc_tensor0_tp0),
+    #   (name1, ipc_tensor1_tp0),
+    # ]
+    named_tensors_batch = [
+        (
+            name,
+            MultiprocessingSerializer.serialize(
+                _preprocess_tensor_for_update_weights(tensor.detach())
+            ),
+        )
+        for name, tensor in params_batch
+    ]
+
+    if infer_tp_rank == 0:
+        gathered_serialized_batches = [None for _ in range(infer_tp_size)]
+    else:
+        gathered_serialized_batches = None
+
+    # [
+    #   [ (name0, ipc_tensor0_tp0), (name1, ipc_tensor1_tp0) ],
+    #   [ (name0, ipc_tensor0_tp1), (name1, ipc_tensor1_tp1) ],
+    # ]
+    dist.gather_object(
+        obj=named_tensors_batch,
+        object_gather_list=gathered_serialized_batches,
+        dst=device_mesh[device_mesh_key].mesh.tolist()[0],
+        group=device_mesh[device_mesh_key].get_group(),
+    )
+
+    if infer_tp_rank == 0:
+        # Use zip(*) to "transpose" the data structure.
+        # After transpose, the data structure is like:
+        # [
+        #   ( (name0, ipc_tensor0_tp0), (name0, ipc_tensor0_tp1) ),
+        #   ( (name1, ipc_tensor1_tp0), (name1, ipc_tensor1_tp1) ),
+        # ]
+        logical_tensors = zip(*gathered_serialized_batches, strict=True)
+
+        named_tensors = [
+            # [
+            #   (name0, LocalSerializedTensor(values=[ipc_tensor0_tp0, ipc_tensor0_tp1])),
+            #   (name1, LocalSerializedTensor(values=[ipc_tensor1_tp0, ipc_tensor1_tp1])),
+            # ]
+            (
+                tensor_group[0][0],
+                LocalSerializedTensor(
+                    values=[rank_part[1] for rank_part in tensor_group]
+                ),
+            )
+            for tensor_group in logical_tensors
+        ]
+
+        update_weights_request = UpdateWeightsFromTensorReqInput(
+            serialized_named_tensors=[
+                MultiprocessingSerializer.serialize(named_tensors)
+                for _ in range(infer_tp_size)
+            ],
+            load_format=load_format,
+        )
+
+        return await engine.update_weights_from_tensor(update_weights_request)
+
+
+def _preprocess_tensor_for_update_weights(tensor: torch.Tensor):
+    """
+    Preprocess the tensor for update weights.
+    Example Use Case:
+        - FSDP: we gather tensor by calling full_tensor in _preprocess_tensor_for_update_weights
+        - Megatron: we do nothing here, assuming it is gathered when feed into this func
+
+    Args:
+        tensor: The tensor to be preprocessed.
+
+    Returns:
+        The full tensor if it is a DTensor, otherwise the original tensor.
+    """
+    if isinstance(tensor, DTensor):
+        return tensor.full_tensor()
+    return tensor
diff --git a/python/sglang/test/__init__.py b/python/sglang/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/test/attention/__init__.py b/python/sglang/test/attention/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/sglang/test/attention/test_flashattn_backend.py b/python/sglang/test/attention/test_flashattn_backend.py
new file mode 100644
index 000000000..5e5ebbaf1
--- /dev/null
+++ b/python/sglang/test/attention/test_flashattn_backend.py
@@ -0,0 +1,350 @@
+import unittest
+
+import torch
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_executor.model_runner import ServerArgs
+from sglang.test.test_utils import CustomTestCase
+
+
+class MockModelRunner:
+    def __init__(
+        self,
+        page_size=1,
+        num_heads=2,
+        head_dim=8,
+    ):
+        self.device = "cuda"
+        self.dtype = torch.float16
+        attention_arch = AttentionArch.MHA
+        # Max batch size for the test.
+        max_batch_size = 160
+        # Total tokens(prefix + extend + decode) in the test should not exceed this length.
+        max_context_len = 2048
+        self.model_config = type(
+            "ModelConfig",
+            (),
+            {
+                "context_len": max_context_len,
+                "is_multimodal": False,
+                "attention_arch": attention_arch,
+            },
+        )
+        self.sliding_window_size = None
+        self.device = self.device
+        # Create a large enough req_to_token_pool to fit the test usage.
+        self.req_to_token_pool = type(
+            "TokenPool",
+            (),
+            {
+                # A typical max_bs * max_context_len for cuda graph decode
+                "size": max_batch_size,
+                # Add req_to_token attribute
+                "req_to_token": torch.zeros(
+                    max_batch_size,
+                    max_context_len,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            },
+        )
+        self.page_size = page_size
+        max_total_num_tokens = max_batch_size * max_context_len
+        self.token_to_kv_pool = MHATokenToKVPool(
+            size=max_total_num_tokens,
+            page_size=page_size,
+            dtype=self.dtype,
+            head_num=num_heads,
+            head_dim=head_dim,
+            layer_num=1,  # only consider layer=1 for unit test
+            device=self.device,
+            enable_memory_saver=False,
+        )
+        # Required by torch native backend
+        self.server_args = ServerArgs(model_path="fake_model_path")
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
+class TestFlashAttentionBackend(CustomTestCase):
+    def setUp(self):
+        # Test parameters
+        self.batch_size = 2
+        self.seq_len = 256
+        self.num_heads = 2
+        self.head_dim = 8
+        self.device = "cuda"
+        self.dtype = torch.float16
+
+    def _init_model_runner(self, page_size=1):
+        self.model_runner = MockModelRunner(
+            page_size=page_size,
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+        )
+        self.backend = FlashAttentionBackend(self.model_runner)
+        self.ref_backend = TorchNativeAttnBackend(self.model_runner)
+        self.model_runner.model_config.num_attention_heads = self.num_heads
+
+    def _mock_write_to_req_to_token_pool(self, batch_size, seq_len, page_size):
+        # if page_size > 1, the token pool stores the index to the page.
+        # so we need to multiply the index by page_size.
+        self.req_to_token = (
+            torch.arange(0, batch_size, dtype=torch.int32, device=self.device)[:, None]
+            * seq_len
+            + torch.arange(0, seq_len, dtype=torch.int32, device=self.device)[None, :]
+            + page_size
+        )
+        self.model_runner.req_to_token_pool.req_to_token[:batch_size, :seq_len] = (
+            self.req_to_token
+        )
+
+    def _create_attention_layer(self):
+        """Create attention layer for testing."""
+        return RadixAttention(
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+            scaling=1.0,
+            num_kv_heads=self.num_heads,
+            layer_id=0,
+        )
+
+    def _create_qkv_tensors(self, tokens_len):
+        """Create q, k, v tensors for testing."""
+        shape = (tokens_len, self.num_heads, self.head_dim)
+        return (
+            torch.randn(shape, dtype=self.dtype, device=self.device),
+            torch.randn(shape, dtype=self.dtype, device=self.device),
+            torch.randn(shape, dtype=self.dtype, device=self.device),
+        )
+
+    def _run_reference_forward(
+        self, mode, q, k, v, layer, forward_batch, expected_shape
+    ):
+        """Run reference forward pass using native backend."""
+        if mode == ForwardMode.EXTEND:
+            output = self.ref_backend.forward_extend(q, k, v, layer, forward_batch)
+        else:  # ForwardMode.DECODE
+            output = self.ref_backend.forward_decode(q, k, v, layer, forward_batch)
+        return output.view(expected_shape)
+
+    def _verify_output(self, output, expected_shape, output_ref=None):
+        """Verify output tensor shape, dtype, and values."""
+        self.assertEqual(
+            output.shape,
+            expected_shape,
+            f"Expected shape {expected_shape}, got {output.shape}",
+        )
+        self.assertEqual(output.dtype, self.dtype)
+        self.assertEqual(output.device.type, "cuda")
+        self.assertEqual(
+            torch.isnan(output).sum().item(), 0, "Output contains NaN values"
+        )
+
+        if output_ref is not None:
+            if not torch.allclose(output, output_ref, atol=1e-1, rtol=0.0):
+                # Check where the values differ beyond the given tolerances
+                diff_mask = ~torch.isclose(output, output_ref, atol=1e-1, rtol=0.0)
+
+                # Find the first index where the difference occurs
+                if diff_mask.any():
+                    first_mismatch_idx = diff_mask.nonzero()[0]
+                    print(
+                        "First mismatch at index:", tuple(first_mismatch_idx.tolist())
+                    )
+                    print("output:", output[tuple(first_mismatch_idx.tolist())])
+                    print("output_ref:", output_ref[tuple(first_mismatch_idx.tolist())])
+                raise AssertionError(
+                    "Attention output is not close to the torch native backend output"
+                )
+
+    def _create_forward_batch(self, mode, q_len=None, prefix_len=0, page_size=1):
+        """Create a forward batch for testing based on mode and lengths."""
+        self._init_model_runner(page_size=page_size)
+
+        # Default to self.seq_len if not specified
+        q_len = q_len or self.seq_len
+
+        if mode == ForwardMode.EXTEND:
+            total_len = prefix_len + q_len
+            out_cache_start = prefix_len * self.batch_size
+            out_cache_end = total_len * self.batch_size
+
+            forward_batch = ForwardBatch(
+                batch_size=self.batch_size,
+                input_ids=torch.randint(
+                    0, 100, (self.batch_size, q_len), device=self.device
+                ),
+                out_cache_loc=torch.arange(
+                    out_cache_start, out_cache_end, device=self.device
+                ),
+                seq_lens_sum=self.batch_size * total_len,
+                forward_mode=mode,
+                req_pool_indices=torch.arange(self.batch_size, device=self.device),
+                seq_lens=torch.tensor(
+                    [total_len] * self.batch_size, device=self.device
+                ),
+                seq_lens_cpu=torch.tensor([total_len] * self.batch_size, device="cpu"),
+                extend_prefix_lens=torch.tensor(
+                    [prefix_len] * self.batch_size, device=self.device
+                ),
+                extend_prefix_lens_cpu=torch.tensor(
+                    [prefix_len] * self.batch_size, device="cpu"
+                ),
+                extend_seq_lens=torch.tensor(
+                    [q_len] * self.batch_size, device=self.device
+                ),
+                extend_seq_lens_cpu=torch.tensor(
+                    [q_len] * self.batch_size, device="cpu"
+                ),
+                attn_backend=self.backend,
+            )
+        else:  # ForwardMode.DECODE
+            decode_len = q_len  # Assuming 1 for decode testing
+            total_len = self.seq_len + decode_len
+            if mode == ForwardMode.DECODE and page_size > 1:
+                # Get next page_size multiple of self.seq_len
+                out_cache_start = (
+                    self.batch_size * self.seq_len // page_size + 1
+                ) * page_size
+                # out_cache_end is the start of the next block
+                out_cache_end = out_cache_start + decode_len * page_size
+            else:
+                out_cache_start = self.batch_size * self.seq_len
+                out_cache_end = self.batch_size * total_len
+
+            forward_batch = ForwardBatch(
+                batch_size=self.batch_size,
+                input_ids=torch.randint(
+                    0, 100, (self.batch_size, decode_len), device=self.device
+                ),
+                out_cache_loc=torch.tensor(
+                    [out_cache_start, out_cache_end], device=self.device
+                ),
+                seq_lens_sum=self.batch_size * total_len,
+                forward_mode=mode,
+                req_pool_indices=torch.arange(self.batch_size, device=self.device),
+                seq_lens=torch.tensor(
+                    [total_len] * self.batch_size, device=self.device
+                ),
+                seq_lens_cpu=torch.tensor([total_len] * self.batch_size, device="cpu"),
+                attn_backend=self.backend,
+            )
+
+        # Add token pool
+        forward_batch.req_to_token_pool = self.model_runner.req_to_token_pool
+
+        # Write current batch's req_to_token to req_to_token_pool
+        self._mock_write_to_req_to_token_pool(self.batch_size, total_len, page_size)
+        # Add kv pool for this forward batch
+        forward_batch.token_to_kv_pool = self.model_runner.token_to_kv_pool
+
+        return forward_batch
+
+    def _setup_kv_cache(self, forward_batch, layer, cache_len):
+        # Create constant values for the prefix cache for easy debugging
+        cache_k = torch.ones(
+            self.batch_size * cache_len,
+            self.num_heads,
+            self.head_dim,
+            dtype=self.dtype,
+            device=self.device,
+        )
+        cache_v = (
+            torch.ones(
+                self.batch_size * cache_len,
+                self.num_heads,
+                self.head_dim,
+                dtype=self.dtype,
+                device=self.device,
+            )
+            * 2
+        )
+
+        # Set the prefix KV cache
+        forward_batch.token_to_kv_pool.set_kv_buffer(
+            layer,
+            torch.arange(self.batch_size * cache_len, device=self.device),
+            cache_k,
+            cache_v,
+            layer.k_scale,
+            layer.v_scale,
+        )
+
+    def _run_attention_test(self, mode, q_len, prefix_len=0, page_size=1):
+        """
+            Run an attention test with the specified parameters.
+        Args:
+            mode: ForwardMode.EXTEND or ForwardMode.DECODE
+            q_len: Length of the query sequence. For decode mode, q_len is 1.
+            prefix_len: Length of the prefix sequence for extend mode
+            page_size: Page size for the KV cache
+        """
+        layer = self._create_attention_layer()
+
+        # Create forward batch and set up
+        forward_batch = self._create_forward_batch(mode, q_len, prefix_len, page_size)
+
+        # Create QKV tensors for the input
+        q, k, v = self._create_qkv_tensors(self.batch_size * q_len)
+
+        # KV cache for prefixed extend is prefix_len
+        # KV cache for decode is same as seq_len
+        # No KV cache for extend without prefix
+        if mode == ForwardMode.EXTEND:
+            if prefix_len > 0:
+                self._setup_kv_cache(forward_batch, layer, prefix_len)
+        else:
+            self._setup_kv_cache(forward_batch, layer, self.seq_len)
+
+        self.backend.init_forward_metadata(forward_batch)
+
+        if mode == ForwardMode.EXTEND:
+            expected_shape = (
+                self.batch_size * q_len,
+                self.num_heads * self.head_dim,
+            )
+            output = self.backend.forward_extend(q, k, v, layer, forward_batch)
+        else:
+            expected_shape = (self.batch_size, self.num_heads * self.head_dim)
+            output = self.backend.forward_decode(q, k, v, layer, forward_batch)
+
+        output_ref = self._run_reference_forward(
+            mode, q, k, v, layer, forward_batch, expected_shape
+        )
+
+        self._verify_output(output, expected_shape, output_ref)
+
+        return output
+
+    def test_forward_extend(self):
+        """Test the standard extend operation."""
+        self._run_attention_test(ForwardMode.EXTEND, q_len=self.seq_len)
+
+    def test_forward_decode(self):
+        """Test the decode operation with cached tokens."""
+        self._run_attention_test(ForwardMode.DECODE, q_len=1)
+
+    def test_forward_extend_with_prefix(self):
+        """Test extending from cached prefix tokens."""
+        prefix_len = self.seq_len // 2
+        extend_len = self.seq_len - prefix_len
+        self._run_attention_test(
+            ForwardMode.EXTEND, q_len=extend_len, prefix_len=prefix_len
+        )
+
+    def test_forward_extend_with_page_size_greater_than_1(self):
+        """Test extending from cached prefix tokens with page size greater than 1."""
+        self._run_attention_test(ForwardMode.EXTEND, q_len=self.seq_len, page_size=64)
+
+    def test_forward_decode_with_page_size_greater_than_1(self):
+        """Test decode operation with page size greater than 1."""
+        self._run_attention_test(ForwardMode.DECODE, q_len=1, page_size=64)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/sglang/test/attention/test_flashattn_mla_backend.py b/python/sglang/test/attention/test_flashattn_mla_backend.py
new file mode 100644
index 000000000..ebfd0b395
--- /dev/null
+++ b/python/sglang/test/attention/test_flashattn_mla_backend.py
@@ -0,0 +1,285 @@
+import unittest
+
+import torch
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.test.test_utils import CustomTestCase
+
+
+class MockModelRunner:
+    def __init__(
+        self,
+        kv_lora_rank,
+        qk_rope_head_dim,
+    ):
+        attention_arch = AttentionArch.MLA
+        self.device = "cuda"
+        self.dtype = torch.float16
+        context_len = 2048
+        self.model_config = type(
+            "ModelConfig",
+            (),
+            {
+                "context_len": context_len,
+                "attention_arch": attention_arch,
+            },
+        )
+        self.sliding_window_size = None
+
+        batch_size = 160
+        # Create a proper req_to_token_pool with the req_to_token attribute
+        self.req_to_token_pool = type(
+            "TokenPool",
+            (),
+            {
+                # A typical max_bs * max_context_len for cuda graph decode
+                "size": batch_size,
+                # Add req_to_token attribute
+                "req_to_token": torch.zeros(
+                    batch_size, context_len, dtype=torch.int32, device=self.device
+                ),
+            },
+        )
+        self.page_size = 1
+        max_total_num_tokens = batch_size * context_len
+        self.token_to_kv_pool = MLATokenToKVPool(
+            size=max_total_num_tokens,
+            page_size=self.page_size,
+            dtype=self.dtype,
+            kv_lora_rank=kv_lora_rank,
+            qk_rope_head_dim=qk_rope_head_dim,
+            layer_num=1,  # only consider layer=1 for unit test
+            device=self.device,
+            enable_memory_saver=False,
+        )
+
+
+class MockReqToTokenPool:
+    def __init__(self, batch_size, seq_len, device):
+        self.req_to_token = (
+            torch.arange(batch_size * seq_len, device=device)
+            .reshape(batch_size, seq_len)
+            .to(torch.int32)
+        )
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
+class TestFlashAttentionMLABackend(CustomTestCase):
+    def setUp(self):
+        # Test parameters
+        self.batch_size = 2
+        self.seq_len = 360
+        self.num_heads = 2
+        self.device = "cuda"
+        self.dtype = torch.float16
+        self.kv_lora_rank = 512
+        self.q_lora_rank = 128
+        self.qk_rope_head_dim = 64
+        self.qk_head_dim = self.qk_rope_head_dim + self.kv_lora_rank
+        # Assume no rope scaling
+        self.scaling = self.qk_head_dim**-0.5
+        # Initialize model runner and backend
+        self._init_model_runner()
+        self.backend = FlashAttentionBackend(self.model_runner)
+        self.num_local_heads = 2
+
+    def _init_model_runner(self):
+        self.model_runner = MockModelRunner(
+            kv_lora_rank=self.kv_lora_rank,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+        )
+        self.backend = FlashAttentionBackend(self.model_runner)
+
+    def _create_attention_layer(self):
+        """Create attention layer for testing."""
+        self.attn_mqa = RadixAttention(
+            num_heads=self.num_local_heads,
+            head_dim=self.kv_lora_rank + self.qk_rope_head_dim,
+            scaling=self.scaling,
+            num_kv_heads=1,
+            layer_id=0,
+            v_head_dim=self.kv_lora_rank,
+            prefix="attn_mqa",
+        )
+        return self.attn_mqa
+
+    def _run_reference_forward(
+        self, mode, q, k, v, layer, forward_batch, expected_shape
+    ):
+        """Run reference forward pass using native backend."""
+        if mode == ForwardMode.EXTEND:
+            output = self.ref_backend.forward_extend(q, k, v, layer, forward_batch)
+        else:  # ForwardMode.DECODE
+            output = self.ref_backend.forward_decode(q, k, v, layer, forward_batch)
+        return output.view(expected_shape)
+
+    def _verify_output(self, output, expected_shape):
+        """Verify output tensor shape, dtype, and values."""
+        self.assertEqual(
+            output.shape,
+            expected_shape,
+            f"Expected shape {expected_shape}, got {output.shape}",
+        )
+        self.assertEqual(output.dtype, self.dtype)
+        self.assertEqual(output.device.type, "cuda")
+        self.assertEqual(
+            torch.isnan(output).sum().item(), 0, "Output contains NaN values"
+        )
+
+    def _create_forward_batch(self, mode, q_len=None, prefix_len=0):
+        """Create a forward batch for testing based on mode and lengths."""
+        # Default to self.seq_len if not specified
+        q_len = q_len or self.seq_len
+
+        if mode == ForwardMode.EXTEND:
+            total_len = prefix_len + q_len
+            out_cache_start = prefix_len * self.batch_size
+            out_cache_end = total_len * self.batch_size
+
+            forward_batch = ForwardBatch(
+                batch_size=self.batch_size,
+                input_ids=torch.randint(
+                    0, 100, (self.batch_size, q_len), device=self.device
+                ),
+                out_cache_loc=torch.arange(
+                    out_cache_start, out_cache_end, device=self.device
+                ),
+                seq_lens_sum=self.batch_size * total_len,
+                forward_mode=mode,
+                req_pool_indices=torch.arange(self.batch_size, device=self.device),
+                seq_lens=torch.tensor(
+                    [total_len] * self.batch_size, device=self.device
+                ),
+                seq_lens_cpu=torch.tensor([total_len] * self.batch_size, device="cpu"),
+                extend_prefix_lens=torch.tensor(
+                    [prefix_len] * self.batch_size, device=self.device
+                ),
+                extend_prefix_lens_cpu=torch.tensor(
+                    [prefix_len] * self.batch_size, device="cpu"
+                ),
+                extend_seq_lens=torch.tensor(
+                    [q_len] * self.batch_size, device=self.device
+                ),
+                extend_seq_lens_cpu=torch.tensor(
+                    [q_len] * self.batch_size, device="cpu"
+                ),
+                attn_backend=self.backend,
+            )
+
+        else:  # ForwardMode.DECODE
+            decode_len = q_len  # typically 1 for decode mode
+            total_len = self.seq_len + decode_len
+            out_cache_start = self.batch_size * self.seq_len
+            out_cache_end = self.batch_size * total_len
+
+            forward_batch = ForwardBatch(
+                batch_size=self.batch_size,
+                input_ids=torch.randint(
+                    0, 100, (self.batch_size, decode_len), device=self.device
+                ),
+                out_cache_loc=torch.arange(
+                    out_cache_start, out_cache_end, device=self.device
+                ),
+                seq_lens_sum=self.batch_size * total_len,
+                forward_mode=mode,
+                req_pool_indices=torch.arange(self.batch_size, device=self.device),
+                seq_lens=torch.tensor(
+                    [total_len] * self.batch_size, device=self.device
+                ),
+                seq_lens_cpu=torch.tensor([total_len] * self.batch_size, device="cpu"),
+                attn_backend=self.backend,
+            )
+
+        # Add token pool from model runner to forward batch
+        forward_batch.req_to_token_pool = self.model_runner.req_to_token_pool
+
+        # Add KV cache from model runner to forward batch
+        forward_batch.token_to_kv_pool = self.model_runner.token_to_kv_pool
+
+        return forward_batch
+
+    def _setup_kv_cache(self, forward_batch, layer, cache_len):
+        """Set up KV cache with prefix tokens."""
+        if cache_len <= 0:
+            return
+
+        # Create constant values for the prefix cache for easy debugging
+        latent_cache = torch.ones(
+            self.batch_size * cache_len,
+            1,  # latent cache has only one head in MQA
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            dtype=self.dtype,
+            device=self.device,
+        )
+
+        # Set the prefix KV cache
+        forward_batch.token_to_kv_pool.set_kv_buffer(
+            layer,
+            torch.arange(self.batch_size * cache_len, device=self.device),
+            latent_cache,
+            None,
+        )
+
+    def _run_attention_test(self, mode, q_len, prefix_len=0):
+        """
+            Run an attention test with the specified parameters.
+        Args:
+            mode: ForwardMode.EXTEND or ForwardMode.DECODE
+            q_len: Length of the query sequence. For decode mode, q_len is 1.
+            prefix_len: Length of the prefix sequence for extend mode
+        """
+        layer = self._create_attention_layer()
+
+        # Create forward batch and set up
+        forward_batch = self._create_forward_batch(mode, q_len, prefix_len)
+
+        # Create q, kv_compressed for testing
+        q_shape = (self.batch_size * q_len, self.num_heads, self.qk_head_dim)
+        kv_shape = (self.batch_size * q_len, self.qk_head_dim)
+        q = torch.randn(q_shape, dtype=self.dtype, device=self.device)
+        kv_compressed = torch.randn(kv_shape, dtype=self.dtype, device=self.device)
+        # v is not used for mqa, all values passed in through k
+        k = kv_compressed.unsqueeze(1)
+        v = torch.randn((1), dtype=self.dtype, device=self.device)
+
+        self._setup_kv_cache(forward_batch, layer, prefix_len)
+
+        self.backend.init_forward_metadata(forward_batch)
+
+        expected_shape = (
+            self.batch_size * q_len,
+            self.num_heads * self.kv_lora_rank,
+        )
+
+        if mode == ForwardMode.EXTEND:
+            output = self.backend.forward_extend(q, k, v, layer, forward_batch)
+        else:
+            output = self.backend.forward_decode(q, k, v, layer, forward_batch)
+
+        self._verify_output(output, expected_shape)
+        return output
+
+    def test_forward_extend(self):
+        """Test the standard extend operation."""
+        self._run_attention_test(ForwardMode.EXTEND, q_len=self.seq_len)
+
+    def test_forward_decode(self):
+        """Test the decode operation with cached tokens."""
+        self._run_attention_test(ForwardMode.DECODE, q_len=1)
+
+    def test_forward_extend_with_prefix(self):
+        """Test extending from cached prefix tokens."""
+        prefix_len = self.seq_len // 2
+        extend_len = self.seq_len - prefix_len
+        self._run_attention_test(
+            ForwardMode.EXTEND, q_len=extend_len, prefix_len=prefix_len
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/sglang/test/attention/test_prefix_chunk_info.py b/python/sglang/test/attention/test_prefix_chunk_info.py
new file mode 100644
index 000000000..c02d4d1d6
--- /dev/null
+++ b/python/sglang/test/attention/test_prefix_chunk_info.py
@@ -0,0 +1,226 @@
+import unittest
+
+import torch
+
+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.test.test_utils import CustomTestCase
+
+TEST_CASES = [
+    # Sequence with same prefix lens
+    {
+        "batch_size": 3,
+        "prefix_lens": [64, 64, 64],
+        "max_chunk_capacity": 48,
+        "prefix_chunk_len": 16,
+        "num_prefix_chunks": 4,
+        "prefix_chunk_starts": torch.tensor(
+            [
+                [0, 0, 0],
+                [16, 16, 16],
+                [32, 32, 32],
+                [48, 48, 48],
+            ],
+            dtype=torch.int32,
+        ),
+        "prefix_chunk_seq_lens": torch.tensor(
+            [
+                [16, 16, 16],
+                [16, 16, 16],
+                [16, 16, 16],
+                [16, 16, 16],
+            ],
+            dtype=torch.int32,
+        ),
+    },
+    # Sequence with different prefix lens
+    {
+        "batch_size": 4,
+        "prefix_lens": [16, 32, 48, 64],
+        "max_chunk_capacity": 64,
+        "prefix_chunk_len": 16,
+        "num_prefix_chunks": 4,
+        "prefix_chunk_starts": torch.tensor(
+            [
+                [0, 0, 0, 0],
+                [16, 16, 16, 16],
+                [32, 32, 32, 32],
+                [48, 48, 48, 48],
+            ],
+            dtype=torch.int32,
+        ),
+        "prefix_chunk_seq_lens": torch.tensor(
+            [
+                [16, 16, 16, 16],
+                [0, 16, 16, 16],
+                [0, 0, 16, 16],
+                [0, 0, 0, 16],
+            ],
+            dtype=torch.int32,
+        ),
+    },
+    # Sequence with irregular shapes
+    {
+        "batch_size": 2,
+        "prefix_lens": [1, 64],
+        "max_chunk_capacity": 31,
+        "prefix_chunk_len": 15,
+        "num_prefix_chunks": 5,
+        "prefix_chunk_starts": torch.tensor(
+            [
+                [0, 0],
+                [15, 15],
+                [30, 30],
+                [45, 45],
+                [60, 60],
+            ],
+            dtype=torch.int32,
+        ),
+        "prefix_chunk_seq_lens": torch.tensor(
+            [
+                [1, 15],
+                [0, 15],
+                [0, 15],
+                [0, 15],
+                [0, 4],
+            ],
+            dtype=torch.int32,
+        ),
+    },
+]
+
+
+class MockForwardBatch(ForwardBatch):
+    def __init__(self, max_chunk_capacity: int, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_chunk_capacity = max_chunk_capacity
+
+    def get_max_chunk_capacity(self):
+        return self.max_chunk_capacity
+
+
+class MockReqToTokenPool:
+    def __init__(self, batch_size, seq_len, device):
+        self.req_to_token = (
+            torch.arange(batch_size * seq_len, device=device)
+            .reshape(batch_size, seq_len)
+            .to(torch.int32)
+        )
+
+
+# Test correctness of triton kernel for computing kv indices
+def check_kv_indices(forward_batch):
+    for i in range(forward_batch.num_prefix_chunks):
+        computed_kv_indices = forward_batch.prefix_chunk_kv_indices[i]
+        req_to_token = forward_batch.req_to_token_pool.req_to_token[
+            : forward_batch.batch_size, :
+        ]
+        ref_kv_indices = torch.empty(
+            forward_batch.prefix_chunk_num_tokens[i],
+            dtype=torch.int32,
+            device=computed_kv_indices.device,
+        )
+        running_ptr = 0
+        for j in range(forward_batch.batch_size):
+            seq_start = forward_batch.prefix_chunk_starts[i, j].item()
+            seq_len = forward_batch.prefix_chunk_seq_lens[i, j].item()
+            ref_kv_indices[running_ptr : running_ptr + seq_len].copy_(
+                req_to_token[j, seq_start : seq_start + seq_len]
+            )
+            running_ptr += seq_len
+        assert torch.allclose(computed_kv_indices, ref_kv_indices)
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
+class TestPrefixChunkInfo(CustomTestCase):
+    def setUp(self):
+        # Common test parameters
+        self.num_local_heads = 128
+        self.kv_lora_rank = 512
+        self.qk_rope_head_dim = 64
+        self.device = torch.device("cuda")
+        self.dtype = torch.bfloat16
+        self.extend_len = 64
+        self.max_bs = 4
+        self.max_seq_len = 128
+
+        # req_to_token_pool
+        self.req_to_token_pool = MockReqToTokenPool(
+            self.max_bs,
+            self.max_seq_len,
+            self.device,
+        )
+
+        # token_to_kv_pool
+        self.token_to_kv_pool = MLATokenToKVPool(
+            size=self.max_bs * self.max_seq_len,
+            page_size=1,  # only consider page=1 for unit test
+            dtype=self.dtype,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            layer_num=1,  # only consider layer=1 for unit test
+            device=self.device,
+            enable_memory_saver=False,
+        )
+
+    def test_prefix_chunk_info(self):
+        """Test the standard extend operation."""
+
+        for test_case in TEST_CASES:
+            print(
+                f"Test case with batch_size={test_case['batch_size']}, prefix_lens={test_case['prefix_lens']}, max_chunk_capacity={test_case['max_chunk_capacity']}"
+            )
+            batch_size = test_case["batch_size"]
+            prefix_lens_cpu = test_case["prefix_lens"]
+            assert len(prefix_lens_cpu) == batch_size
+            prefix_lens = torch.tensor(prefix_lens_cpu, device=self.device)
+            max_chunk_capacity = test_case["max_chunk_capacity"]
+            seq_lens_cpu = [
+                self.extend_len + prefix_lens_cpu[i] for i in range(batch_size)
+            ]
+            seq_lens = torch.tensor(seq_lens_cpu, device=self.device)
+
+            # Create forward batch
+            # input_ids and out_cache_loc are dummy tensors in this test
+            forward_batch = MockForwardBatch(
+                max_chunk_capacity=max_chunk_capacity,
+                batch_size=batch_size,
+                input_ids=torch.randint(
+                    0, 100, (batch_size, self.extend_len), device=self.device
+                ),
+                out_cache_loc=torch.arange(
+                    self.max_bs * self.max_seq_len - batch_size * self.extend_len,
+                    self.max_bs * self.max_seq_len,
+                    device=self.device,
+                ),
+                seq_lens_sum=sum(seq_lens_cpu),
+                forward_mode=ForwardMode.EXTEND,
+                req_pool_indices=torch.arange(batch_size, device=self.device),
+                seq_lens=seq_lens,
+                seq_lens_cpu=seq_lens_cpu,
+                extend_prefix_lens=prefix_lens,
+                extend_prefix_lens_cpu=prefix_lens_cpu,
+            )
+            forward_batch.req_to_token_pool = self.req_to_token_pool
+            forward_batch.token_to_kv_pool = self.token_to_kv_pool
+
+            forward_batch.prepare_chunked_prefix_cache_info(self.device)
+            assert forward_batch.get_max_chunk_capacity() == max_chunk_capacity
+            assert forward_batch.prefix_chunk_len == test_case["prefix_chunk_len"]
+            assert forward_batch.num_prefix_chunks == test_case["num_prefix_chunks"]
+            assert torch.allclose(
+                forward_batch.prefix_chunk_starts,
+                test_case["prefix_chunk_starts"].to(self.device),
+            )
+            assert torch.allclose(
+                forward_batch.prefix_chunk_seq_lens,
+                test_case["prefix_chunk_seq_lens"].to(self.device),
+            )
+
+            check_kv_indices(forward_batch)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/sglang/test/attention/test_trtllm_mla_backend.py b/python/sglang/test/attention/test_trtllm_mla_backend.py
new file mode 100755
index 000000000..6f610baf0
--- /dev/null
+++ b/python/sglang/test/attention/test_trtllm_mla_backend.py
@@ -0,0 +1,1268 @@
+import math
+import unittest
+
+import numpy as np
+import torch
+
+from sglang.srt.layers import dp_attention as _dp_attn
+
+# Patch DP-attention globals before importing backends
+# TODO: change the interface of both trtllm_mla and flashinfer backends to take tp_size as an argument instead of patching
+_dp_attn.get_attention_tp_size = lambda: 1  # TP size = 1 for unit test
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
+from sglang.srt.layers.attention.trtllm_mla_backend import (
+    TRTLLMMLABackend,
+    TRTLLMMLADecodeMetadata,
+)
+from sglang.srt.layers.attention.utils import TRITON_PAD_NUM_PAGE_PER_BLOCK
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_flashinfer_available
+from sglang.test.test_utils import CustomTestCase
+
+# Global configuration for all tests
+DEFAULT_CONFIG = {
+    "device": "cuda",
+    "dtype": torch.bfloat16,
+    "kv_cache_dtype": torch.bfloat16,
+    "context_len": 2048,
+    "max_bs": 64,
+    "tolerance": 1e-2,
+    "seed_cache": 42,
+    "seed_qkv": 123,
+    # MLA model config (TRTLLM MLA has fixed constraints)
+    "num_attention_heads": 128,
+    "kv_lora_rank": 512,
+    "qk_nope_head_dim": 128,
+    "qk_rope_head_dim": 64,
+    "v_head_dim": 512,
+    "num_kv_heads": 1,
+    "layer_id": 0,
+    "tp_q_head_num": 128,
+    "tp_k_head_num": 128,
+    "prefill_head_dim": 192,
+    "prefill_v_head_dim": 128,
+}
+
+ROPE_BASE = 10000
+ROPE_SCALING_CONFIG = {
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 40,
+    "mscale": 1.0,
+    "mscale_all_dim": 1.0,
+    "original_max_position_embeddings": 4096,
+    "type": "yarn",
+    "rope_type": "deepseek_yarn",
+}
+
+
+def build_rotary_emb(config, device=None):
+    from sglang.srt.layers.rotary_embedding import get_rope_wrapper
+
+    dev = device or config["device"]
+    rope_scaling = config.get("rope_scaling", ROPE_SCALING_CONFIG)
+    rotary = get_rope_wrapper(
+        head_size=config["qk_rope_head_dim"],
+        rotary_dim=config["qk_rope_head_dim"],
+        max_position=config["context_len"],
+        base=ROPE_BASE,
+        rope_scaling=rope_scaling,
+        is_neox_style=False,
+        device=dev,
+    )
+    rotary.cos_sin_cache = rotary.cos_sin_cache.to(dev)
+    return rotary
+
+
+# Centralized test cases for different test scenarios
+TEST_CASES = {
+    "basic_functionality": [
+        {
+            "name": "single",
+            "batch_size": 1,
+            "max_seq_len": 32,
+            "page_size": 32,
+            "description": "Minimal smoke test",
+        },
+        {
+            "name": "batch",
+            "batch_size": 32,
+            "max_seq_len": 128,
+            "page_size": 32,
+            "description": "Medium-scale batch",
+        },
+    ],
+    "output_match": [
+        {
+            "name": "single_fp16",
+            "batch_size": 1,
+            "max_seq_len": 64,
+            "page_size": 32,
+            "description": "Single FP16 vs reference",
+        },
+        {
+            "name": "single_fp8",
+            "batch_size": 1,
+            "max_seq_len": 64,
+            "page_size": 64,
+            "tolerance": 1e-1,
+            "kv_cache_dtype": torch.float8_e4m3fn,
+            "description": "Single FP8 vs reference",
+        },
+        {
+            "name": "batch_fp16",
+            "batch_size": 32,
+            "max_seq_len": 64,
+            "page_size": 32,
+            "description": "Batch FP16 vs reference",
+        },
+        {
+            "name": "batch_fp8",
+            "batch_size": 32,
+            "max_seq_len": 64,
+            "page_size": 64,
+            "tolerance": 1e-1,
+            "kv_cache_dtype": torch.float8_e4m3fn,
+            "description": "Batch FP8 vs reference",
+        },
+    ],
+    "page_size_consistency": [
+        # Only 32 and 64 supported for now in flashinfer TRTLLM-GEN MLA kernel
+        {
+            "name": "page_32",
+            "batch_size": 8,
+            "max_seq_len": 128,
+            "page_size": 32,
+            "description": "32-token pages",
+        },
+        {
+            "name": "page_64",
+            "batch_size": 8,
+            "max_seq_len": 128,
+            "page_size": 64,
+            "description": "64-token pages",
+        },
+    ],
+    "shape_sanity_tests": [
+        {
+            "name": "basic",
+            "batch_size": 1,
+            "max_seq_len": 128,
+            "page_size": 32,
+            "description": "Single sequence",
+        },
+        {
+            "name": "basic_different_pagesize",
+            "batch_size": 1,
+            "max_seq_len": 128,
+            "page_size": 64,
+            "description": "Different page size",
+        },
+        {
+            "name": "batch",
+            "batch_size": 8,
+            "max_seq_len": 128,
+            "page_size": 32,
+            "description": "Batch shapes",
+        },
+    ],
+    "metadata_tests": [
+        {
+            "name": "single_sequence",
+            "batch_size": 1,
+            "max_seq_len": 64,
+            "page_size": 32,
+            "description": "Single sequence metadata",
+        },
+        {
+            "name": "batch_mixed_lengths",
+            "batch_size": 8,
+            "max_seq_len": 128,
+            "page_size": 32,
+            "description": "Mixed sequence lengths",
+        },
+        {
+            "name": "large_batch",
+            "batch_size": 32,
+            "max_seq_len": 256,
+            "page_size": 64,
+            "description": "Large batch stress test",
+        },
+        {
+            "name": "edge_case_short",
+            "batch_size": 4,
+            "max_seq_len": 16,
+            "page_size": 32,
+            "description": "Sub-page sequences",
+        },
+    ],
+}
+
+
+class MockModelRunner:
+    """Minimal fake ModelRunner for testing MLA backends."""
+
+    def __init__(self, config):
+        self.device = config["device"]
+        self.dtype = config["dtype"]
+        self.kv_cache_dtype = config["kv_cache_dtype"]
+        self.page_size = config["page_size"]
+
+        # Server args stub - needed by attention backends
+        self.server_args = type(
+            "ServerArgs",
+            (),
+            {
+                "enable_dp_attention": False,  # Default value for testing
+            },
+        )
+
+        # Model-config stub with MLA attributes
+        self.model_config = type(
+            "ModelConfig",
+            (),
+            {
+                "context_len": config["context_len"],
+                "attention_arch": AttentionArch.MLA,
+                "num_attention_heads": config["num_attention_heads"],
+                "kv_lora_rank": config["kv_lora_rank"],
+                "qk_nope_head_dim": config["qk_nope_head_dim"],
+                "qk_rope_head_dim": config["qk_rope_head_dim"],
+                "v_head_dim": config["v_head_dim"],
+                "scaling": 1.0
+                / ((config["qk_nope_head_dim"] + config["qk_rope_head_dim"]) ** 0.5),
+                "get_num_kv_heads": staticmethod(lambda _: config["num_kv_heads"]),
+            },
+        )
+
+        # Req-to-token pool
+        max_bs = config["max_bs"]
+        max_ctx = self.model_config.context_len
+        req_to_token = torch.arange(
+            max_bs * max_ctx, dtype=torch.int32, device=self.device
+        ).reshape(max_bs, max_ctx)
+        self.req_to_token_pool = type(
+            "TokenPool",
+            (),
+            {
+                "size": max_bs,
+                "req_to_token": req_to_token,
+            },
+        )
+
+        # KV-token pool (MLA)
+        self.token_to_kv_pool = MLATokenToKVPool(
+            size=max_bs * max_ctx,
+            page_size=config["page_size"],
+            dtype=self.kv_cache_dtype,
+            kv_lora_rank=config["kv_lora_rank"],
+            qk_rope_head_dim=config["qk_rope_head_dim"],
+            layer_num=1,
+            device=self.device,
+            enable_memory_saver=False,
+        )
+
+
+def compare_outputs(trtllm_out, reference_out, tolerance=1e-2):
+    """Compare outputs with detailed analysis."""
+
+    # Basic checks
+    assert (
+        trtllm_out.shape == reference_out.shape
+    ), f"Shape mismatch: {trtllm_out.shape} vs {reference_out.shape}"
+    assert (
+        trtllm_out.dtype == reference_out.dtype
+    ), f"Dtype mismatch: {trtllm_out.dtype} vs {reference_out.dtype}"
+
+    # Check for NaN/Inf
+    assert not torch.isnan(trtllm_out).any(), "TRTLLM output contains NaN"
+    assert not torch.isnan(reference_out).any(), "Reference output contains NaN"
+    assert not torch.isinf(trtllm_out).any(), "TRTLLM output contains Inf"
+    assert not torch.isinf(reference_out).any(), "Reference output contains Inf"
+
+    # Element-wise differences
+    diff = (trtllm_out - reference_out).abs()
+    max_diff = diff.max().item()
+    mean_diff = diff.mean().item()
+
+    # Check numerical equivalence
+    all_close = torch.allclose(
+        trtllm_out, reference_out, rtol=tolerance, atol=tolerance
+    )
+
+    if not all_close:
+        print(
+            f"Comparison failed: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}, tolerance={tolerance}"
+        )
+        # Find top differences for debugging
+        flat_diff = diff.flatten()
+        top_diff_indices = torch.topk(flat_diff, k=min(5, flat_diff.numel())).indices
+        print("Top 5 differences:")
+        for i, idx in enumerate(top_diff_indices):
+            idx_tuple = np.unravel_index(idx.cpu().numpy(), trtllm_out.shape)
+            trt_val = trtllm_out[idx_tuple].item()
+            ref_val = reference_out[idx_tuple].item()
+            print(
+                f"  [{idx_tuple}]: TRTLLM={trt_val:.6f}, Reference={ref_val:.6f}, diff={abs(trt_val-ref_val):.6f}"
+            )
+
+    return all_close
+
+
+@unittest.skipIf(
+    not torch.cuda.is_available() or not is_flashinfer_available(),
+    "CUDA + flashinfer required",
+)
+class TestTRTLLMMLA(CustomTestCase):
+    """Test suite for TRTLLM MLA backend with centralized configuration."""
+
+    def _merge_config(self, test_case):
+        """Merge test case with default configuration."""
+        config = DEFAULT_CONFIG.copy()
+        config.update(test_case)
+        return config
+
+    def _create_model_components(self, config, is_prefill=False):
+        """Create model runners, backends, and layer for testing."""
+        # Create model runners
+        model_runner_trtllm = MockModelRunner(config)
+        model_runner_reference = MockModelRunner(config)
+
+        # Create backends
+        trtllm_backend = TRTLLMMLABackend(model_runner_trtllm)
+        reference_backend = FlashInferMLAAttnBackend(model_runner_reference)
+
+        head_dim = (
+            config["kv_lora_rank"] + config["qk_rope_head_dim"]
+            if not is_prefill
+            else config["prefill_head_dim"]
+        )
+        v_head_dim = (
+            config["v_head_dim"] if not is_prefill else config["prefill_v_head_dim"]
+        )
+
+        # Create RadixAttention layer
+        layer = RadixAttention(
+            num_heads=config["num_attention_heads"],
+            head_dim=head_dim,
+            scaling=model_runner_trtllm.model_config.scaling,
+            num_kv_heads=config["num_kv_heads"],
+            layer_id=config["layer_id"],
+            v_head_dim=v_head_dim,
+            prefix="attn_mqa",
+        )
+
+        return (
+            model_runner_trtllm,
+            model_runner_reference,
+            trtllm_backend,
+            reference_backend,
+            layer,
+        )
+
+    def _create_qkv_tensors(self, batch_size, config, dtype_override=None):
+        """Create Q, K, V random tensors for given batch size with separate MLA components.
+
+        Args:
+            batch_size: Batch size.
+            config: Configuration dict with model dims and device.
+            dtype_override: Optional torch dtype to override config["dtype"].
+
+        Returns:
+            Tuple of (q_nope, q_rope, k_nope, k_rope, v, cos_sin_cache)
+        """
+        device = config["device"]
+        target_dtype = dtype_override or config["dtype"]
+
+        # Create separate nope and rope components for Q
+        q_nope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["kv_lora_rank"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+        q_rope = torch.randn(
+            (batch_size, config["num_attention_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+
+        # Create separate nope and rope components for K
+        k_nope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["kv_lora_rank"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+        k_rope = torch.randn(
+            (batch_size, config["num_kv_heads"], config["qk_rope_head_dim"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+
+        # V tensor (unchanged)
+        v = torch.randn(
+            (batch_size, config["num_kv_heads"], config["v_head_dim"]),
+            dtype=config["dtype"],
+            device=device,
+        )
+
+        return q_nope, q_rope, k_nope, k_rope, v
+
+    def _create_forward_batch(
+        self, batch_size, seq_lens, backend, model_runner, config
+    ):
+        """Create a forward batch for the given backend."""
+        fb = ForwardBatch(
+            batch_size=batch_size,
+            input_ids=torch.randint(0, 100, (batch_size, 1), device=config["device"]),
+            out_cache_loc=torch.arange(batch_size, device=config["device"]),
+            seq_lens_sum=int(seq_lens.sum().item()),
+            forward_mode=ForwardMode.DECODE,
+            req_pool_indices=torch.arange(batch_size, device=config["device"]),
+            seq_lens=seq_lens,
+            seq_lens_cpu=seq_lens.cpu(),
+            attn_backend=backend,
+        )
+        fb.req_to_token_pool = model_runner.req_to_token_pool
+        fb.token_to_kv_pool = model_runner.token_to_kv_pool
+
+        # Add position information for RoPE
+        fb.positions = torch.arange(batch_size, device=config["device"])
+
+        return fb
+
+    def _populate_kv_cache(self, batch_size, seq_lens, model_runners, layer, config):
+        """Populate KV cache with identical data for both backends."""
+        torch.manual_seed(config["seed_cache"])  # Fixed seed for reproducible cache
+
+        for model_runner in model_runners:
+            torch.manual_seed(config["seed_cache"])  # Reset seed for each backend
+            for i in range(batch_size):
+                seq_len = int(seq_lens[i].item())
+                for token_idx in range(seq_len - 1):
+                    # Create random K components for MLA
+                    cache_k_nope = torch.randn(
+                        (1, config["kv_lora_rank"]),
+                        dtype=config["dtype"],
+                        device=config["device"],
+                    )
+                    cache_k_rope = torch.randn(
+                        (1, config["qk_rope_head_dim"]),
+                        dtype=config["dtype"],
+                        device=config["device"],
+                    )
+
+                    # Calculate cache location
+                    cache_loc = model_runner.req_to_token_pool.req_to_token[
+                        i, token_idx
+                    ]
+
+                    # Save to KV cache
+                    model_runner.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc.unsqueeze(0),
+                        cache_k_nope.squeeze(0),
+                        cache_k_rope.squeeze(0),
+                    )
+
+    def test_basic_functionality(self):
+        """Test basic functionality with minimal setup."""
+        print(f"\nRunning basic functionality tests...")
+
+        for test_case in TEST_CASES["basic_functionality"]:
+            with self.subTest(test_case=test_case["name"]):
+                print(f"  Testing {test_case['name']}: {test_case['description']}")
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+
+                # Create components
+                model_runner_trtllm, _, trtllm_backend, _, layer = (
+                    self._create_model_components(config)
+                )
+
+                # Create sequence lengths - properly handle different batch sizes
+                if batch_size == 2:
+                    seq_lens = torch.tensor(
+                        [max_seq_len, max_seq_len // 2], device=config["device"]
+                    )
+                else:
+                    # For larger batch sizes, create varied sequence lengths
+                    torch.manual_seed(config["seed_cache"])
+                    seq_lens = torch.randint(
+                        max_seq_len // 2,
+                        max_seq_len + 1,
+                        (batch_size,),
+                        device=config["device"],
+                    )
+                    seq_lens[0] = max_seq_len  # Ensure at least one max length
+
+                # Create forward batch
+                fb = self._create_forward_batch(
+                    batch_size, seq_lens, trtllm_backend, model_runner_trtllm, config
+                )
+                trtllm_backend.init_forward_metadata(fb)
+
+                # Populate KV cache
+                self._populate_kv_cache(
+                    batch_size, seq_lens, [model_runner_trtllm], layer, config
+                )
+
+                # Create Q, K, V tensors with separate MLA components
+                torch.manual_seed(config["seed_qkv"])
+                q_nope, q_rope, k_nope, k_rope, v = self._create_qkv_tensors(
+                    batch_size, config
+                )
+
+                # Run forward decode with separate MLA components
+                output = trtllm_backend.forward_decode(
+                    q_nope, k_nope, None, layer, fb, q_rope=q_rope, k_rope=k_rope
+                )
+
+                # Basic checks
+                expected_shape = (
+                    batch_size,
+                    config["num_attention_heads"] * config["v_head_dim"],
+                )
+                self.assertEqual(output.shape, expected_shape)
+                self.assertEqual(output.dtype, config["dtype"])
+                self.assertFalse(torch.isnan(output).any())
+                self.assertFalse(torch.isinf(output).any())
+
+    def test_decode_output_match(self):
+        """Test that TRTLLM and FlashInfer MLA backends produce matching outputs."""
+        print(f"\nRunning decode output matching tests...")
+
+        for test_case in TEST_CASES["output_match"]:
+            with self.subTest(test_case=test_case["name"]):
+                print(f"  Testing {test_case['name']}: {test_case['description']}")
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+                use_fp8 = config["kv_cache_dtype"] == torch.float8_e4m3fn
+
+                # Create components
+                (
+                    model_runner_trtllm,
+                    model_runner_reference,
+                    trtllm_backend,
+                    reference_backend,
+                    layer,
+                ) = self._create_model_components(config)
+
+                # Create identical sequence lengths for both backends
+                torch.manual_seed(config["seed_cache"])
+                seq_lens = torch.randint(
+                    1, max_seq_len, (batch_size,), device=config["device"]
+                )
+                seq_lens[0] = max_seq_len  # Ensure at least one max length
+
+                # Create forward batches with identical inputs
+                fb_trtllm = self._create_forward_batch(
+                    batch_size,
+                    seq_lens.clone(),
+                    trtllm_backend,
+                    model_runner_trtllm,
+                    config,
+                )
+                fb_reference = self._create_forward_batch(
+                    batch_size,
+                    seq_lens.clone(),
+                    reference_backend,
+                    model_runner_reference,
+                    config,
+                )
+
+                # Initialize metadata for both backends
+                trtllm_backend.init_forward_metadata(fb_trtllm)
+                reference_backend.init_forward_metadata(fb_reference)
+
+                # Populate both KV caches identically
+                self._populate_kv_cache(
+                    batch_size,
+                    seq_lens,
+                    [model_runner_trtllm, model_runner_reference],
+                    layer,
+                    config,
+                )
+
+                # Create Q, K, V tensors for current decode step
+                torch.manual_seed(config["seed_qkv"])
+
+                q_nope_ref, q_rope_ref, k_nope_ref, k_rope_ref, v_ref = (
+                    self._create_qkv_tensors(batch_size, config)
+                )
+                q_nope_trt, q_rope_trt, k_nope_trt, k_rope_trt, v_trt = (
+                    q_nope_ref.clone(),
+                    q_rope_ref.clone(),
+                    k_nope_ref.clone(),
+                    k_rope_ref.clone(),
+                    v_ref.clone(),
+                )
+                tolerance = config["tolerance"]
+
+                extra_args = {}
+                if use_fp8:
+                    # TRT kernel applies RoPE + FP8 quantization internally
+                    # pre-apply RoPE on the reference (FlashInfer) path here so
+                    # both paths share the same rope params/cache while keeping
+                    # the TRT path unrotated.
+                    rotary_emb = build_rotary_emb(config)
+                    q_rope_ref, k_rope_ref = rotary_emb(
+                        fb_reference.positions, q_rope_ref, k_rope_ref
+                    )
+                    extra_args = {
+                        "cos_sin_cache": rotary_emb.cos_sin_cache,
+                        "is_neox": rotary_emb.is_neox_style,
+                    }
+
+                    dtype = q_rope_ref.dtype
+                    q_rope_ref = q_rope_ref.to(torch.float8_e4m3fn).to(dtype)
+                    q_nope_ref = q_nope_ref.to(torch.float8_e4m3fn).to(dtype)
+                    k_rope_ref = k_rope_ref.to(torch.float8_e4m3fn).to(dtype)
+                    k_nope_ref = k_nope_ref.to(torch.float8_e4m3fn).to(dtype)
+
+                # Run forward decode on both backends
+                out_trtllm = trtllm_backend.forward_decode(
+                    q_nope_trt,
+                    k_nope_trt,
+                    None,
+                    layer,
+                    fb_trtllm,
+                    q_rope=q_rope_trt,
+                    k_rope=k_rope_trt,
+                    **extra_args,
+                )
+
+                # Reference backend should also take separate components, not concatenated
+                out_reference = reference_backend.forward_decode(
+                    q_nope_ref,
+                    k_nope_ref,
+                    v_ref,
+                    layer,
+                    fb_reference,
+                    q_rope=q_rope_ref,
+                    k_rope=k_rope_ref,
+                )
+
+                # Compare outputs
+                comparison_passed = compare_outputs(
+                    out_trtllm, out_reference, tolerance=tolerance
+                )
+
+                self.assertTrue(
+                    comparison_passed,
+                    f"TRTLLM and Reference outputs differ beyond tolerance. "
+                    f"Config: {test_case['name']}, "
+                    f"Max diff: {(out_trtllm - out_reference).abs().max().item()}",
+                )
+
+    def test_page_size_consistency(self):
+        """Test output consistency across different page sizes."""
+        print(f"\nRunning page size consistency tests...")
+
+        for test_case in TEST_CASES["page_size_consistency"]:
+            with self.subTest(test_case=test_case["name"]):
+                print(f"  Testing {test_case['name']}: {test_case['description']}")
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+
+                # Create components
+                model_runner, _, backend, _, layer = self._create_model_components(
+                    config
+                )
+
+                # Create sequence lengths
+                torch.manual_seed(config["seed_cache"])
+                seq_lens = torch.randint(
+                    1, max_seq_len, (batch_size,), device=config["device"]
+                )
+                seq_lens[0] = max_seq_len
+
+                # Create forward batch
+                fb = self._create_forward_batch(
+                    batch_size, seq_lens, backend, model_runner, config
+                )
+                backend.init_forward_metadata(fb)
+
+                # Populate KV cache
+                self._populate_kv_cache(
+                    batch_size, seq_lens, [model_runner], layer, config
+                )
+
+                # Create Q, K, V tensors with separate MLA components
+                torch.manual_seed(config["seed_qkv"])
+                q_nope, q_rope, k_nope, k_rope, v = self._create_qkv_tensors(
+                    batch_size, config
+                )
+
+                # Run forward decode with separate MLA components
+                output = backend.forward_decode(
+                    q_nope, k_nope, None, layer, fb, q_rope=q_rope, k_rope=k_rope
+                )
+
+                expected_shape = (
+                    batch_size,
+                    config["num_attention_heads"] * config["v_head_dim"],
+                )
+                self.assertEqual(
+                    output.shape,
+                    expected_shape,
+                    f"Output shape mismatch: {output.shape} vs {expected_shape}",
+                )
+                self.assertFalse(torch.isnan(output).any(), "Output contains NaN")
+                self.assertFalse(torch.isinf(output).any(), "Output contains Inf")
+
+    def test_shape_sanity(self):
+        """Smoke test decode across several configurations."""
+        print(f"\nRunning shape sanity tests...")
+
+        for test_case in TEST_CASES["shape_sanity_tests"]:
+            with self.subTest(test_case=test_case["name"]):
+                print(f"  Testing {test_case['name']}: {test_case['description']}")
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+
+                model_runner, _, backend, _, layer = self._create_model_components(
+                    config
+                )
+
+                # Random seq lens (ensure one matches max)
+                torch.manual_seed(config["seed_cache"])
+                seq_lens = torch.randint(
+                    1, max_seq_len, (batch_size,), device=config["device"]
+                )
+                seq_lens[0] = max_seq_len
+
+                fb = self._create_forward_batch(
+                    batch_size, seq_lens, backend, model_runner, config
+                )
+                backend.init_forward_metadata(fb)
+
+                # Create Q, K, V tensors with separate MLA components
+                torch.manual_seed(config["seed_qkv"])
+                q_nope = torch.randn(
+                    (batch_size, config["num_attention_heads"], config["kv_lora_rank"]),
+                    dtype=config["dtype"],
+                    device=config["device"],
+                )
+                k_nope = torch.randn(
+                    (batch_size, config["num_kv_heads"], config["kv_lora_rank"]),
+                    dtype=config["dtype"],
+                    device=config["device"],
+                )
+                q_rope = torch.randn(
+                    (
+                        batch_size,
+                        config["num_attention_heads"],
+                        config["qk_rope_head_dim"],
+                    ),
+                    dtype=config["dtype"],
+                    device=config["device"],
+                )
+                k_rope = torch.randn(
+                    (batch_size, config["num_kv_heads"], config["qk_rope_head_dim"]),
+                    dtype=config["dtype"],
+                    device=config["device"],
+                )
+                v = None  # Test with None v
+
+                # Run forward decode
+                output = backend.forward_decode(
+                    q_nope, k_nope, v, layer, fb, q_rope=q_rope, k_rope=k_rope
+                )
+
+                # Shape and sanity checks
+                expected_shape = (
+                    batch_size,
+                    config["num_attention_heads"] * config["v_head_dim"],
+                )
+                self.assertEqual(
+                    output.shape,
+                    expected_shape,
+                    f"Output shape mismatch for {test_case['name']}",
+                )
+                self.assertEqual(output.dtype, config["dtype"])
+                self.assertEqual(output.device.type, "cuda")
+                self.assertFalse(
+                    torch.isnan(output).any(),
+                    f"Output contains NaN for {test_case['name']}",
+                )
+                self.assertFalse(
+                    torch.isinf(output).any(),
+                    f"Output contains Inf for {test_case['name']}",
+                )
+
+    def test_metadata_initialization(self):
+        """Test TRTLLM MLA metadata initialization and structure."""
+        print(f"\nRunning metadata initialization tests...")
+
+        for test_case in TEST_CASES["metadata_tests"]:
+            with self.subTest(test_case=test_case["name"]):
+                print(f"  Testing {test_case['name']}: {test_case['description']}")
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+
+                # Create components
+                model_runner, _, backend, _, layer = self._create_model_components(
+                    config
+                )
+
+                # Create varied sequence lengths
+                torch.manual_seed(config["seed_cache"])
+                if batch_size == 1:
+                    seq_lens = torch.tensor([max_seq_len], device=config["device"])
+                else:
+                    seq_lens = torch.randint(
+                        max(1, max_seq_len // 4),
+                        max_seq_len + 1,
+                        (batch_size,),
+                        device=config["device"],
+                    )
+                    seq_lens[0] = max_seq_len  # Ensure at least one max length
+
+                # Create forward batch
+                fb = self._create_forward_batch(
+                    batch_size, seq_lens, backend, model_runner, config
+                )
+
+                # Initialize metadata
+                backend.init_forward_metadata(fb)
+
+                # Verify metadata exists
+                self.assertIsNotNone(backend.forward_metadata)
+                self.assertIsInstance(backend.forward_metadata, TRTLLMMLADecodeMetadata)
+
+                # Test metadata structure
+                metadata = backend.forward_metadata
+                self.assertIsNotNone(
+                    metadata.workspace, "Workspace should be allocated"
+                )
+                self.assertIsNotNone(
+                    metadata.block_kv_indices, "Block KV indices should be created"
+                )
+
+                # Test workspace properties
+                self.assertEqual(metadata.workspace.device.type, "cuda")
+                self.assertEqual(metadata.workspace.dtype, torch.uint8)
+                self.assertGreater(
+                    metadata.workspace.numel(), 0, "Workspace should have non-zero size"
+                )
+
+                # Test block KV indices properties
+                self.assertEqual(metadata.block_kv_indices.device.type, "cuda")
+                self.assertEqual(metadata.block_kv_indices.dtype, torch.int32)
+                self.assertEqual(metadata.block_kv_indices.shape[0], batch_size)
+
+                # Verify block indices are valid (>= -1, since -1 is padding)
+                self.assertTrue(
+                    (metadata.block_kv_indices >= -1).all(),
+                    "All block indices should be >= -1 (with -1 as padding)",
+                )
+
+    def test_metadata_block_calculation(self):
+        """Test block count calculation logic."""
+        print(f"\nRunning metadata block calculation tests...")
+
+        test_scenarios = [
+            {"seq_len": 31, "page_size": 32, "expected_min_blocks": 1},
+            {"seq_len": 32, "page_size": 32, "expected_min_blocks": 1},
+            {"seq_len": 33, "page_size": 32, "expected_min_blocks": 2},
+            {"seq_len": 128, "page_size": 32, "expected_min_blocks": 4},
+            {"seq_len": 128, "page_size": 64, "expected_min_blocks": 2},
+        ]
+
+        for scenario in test_scenarios:
+            with self.subTest(scenario=scenario):
+                config = self._merge_config(
+                    {
+                        "batch_size": 1,
+                        "max_seq_len": scenario["seq_len"],
+                        "page_size": scenario["page_size"],
+                    }
+                )
+
+                model_runner, _, backend, _, _ = self._create_model_components(config)
+
+                # Test internal block calculation
+                calculated_blocks = backend._calc_padded_blocks(scenario["seq_len"])
+
+                # Should be at least the minimum required
+                self.assertGreaterEqual(
+                    calculated_blocks,
+                    scenario["expected_min_blocks"],
+                    f"Calculated blocks ({calculated_blocks}) should be >= minimum required ({scenario['expected_min_blocks']})",
+                )
+
+                # Should satisfy page_size constraint
+                total_tokens = calculated_blocks * scenario["page_size"]
+                self.assertGreaterEqual(
+                    total_tokens,
+                    scenario["seq_len"],
+                    f"Total tokens ({total_tokens}) should cover sequence length ({scenario['seq_len']})",
+                )
+
+                # Should satisfy TRT-LLM and Triton constraints
+                trtllm_constraint = 128 // scenario["page_size"]
+                constraint_lcm = math.lcm(
+                    trtllm_constraint, TRITON_PAD_NUM_PAGE_PER_BLOCK
+                )
+                self.assertEqual(
+                    calculated_blocks % constraint_lcm,
+                    0,
+                    f"Block count should be multiple of LCM of constraints ({constraint_lcm})",
+                )
+
+    def test_metadata_kv_indices_correctness(self):
+        """Test KV indices creation and correctness."""
+        print(f"\nRunning KV indices correctness tests...")
+
+        for test_case in TEST_CASES["metadata_tests"][
+            :2
+        ]:  # Test subset for performance
+            with self.subTest(test_case=test_case["name"]):
+                print(f"  Testing {test_case['name']}: {test_case['description']}")
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+
+                model_runner, _, backend, _, layer = self._create_model_components(
+                    config
+                )
+
+                # Create known sequence lengths
+                torch.manual_seed(config["seed_cache"])
+                if batch_size == 1:
+                    seq_lens = torch.tensor([max_seq_len], device=config["device"])
+                else:
+                    seq_lens = torch.randint(
+                        max_seq_len // 2,
+                        max_seq_len + 1,
+                        (batch_size,),
+                        device=config["device"],
+                    )
+
+                fb = self._create_forward_batch(
+                    batch_size, seq_lens, backend, model_runner, config
+                )
+
+                # Populate some KV cache to have valid indices
+                self._populate_kv_cache(
+                    batch_size, seq_lens, [model_runner], layer, config
+                )
+
+                # Initialize metadata
+                backend.init_forward_metadata(fb)
+                metadata = backend.forward_metadata
+
+                # Verify KV indices structure
+                block_kv_indices = metadata.block_kv_indices
+
+                for i in range(batch_size):
+                    seq_len = seq_lens[i].item()
+                    expected_blocks = backend._calc_padded_blocks(seq_len)
+
+                    # Count valid (non -1) indices for this sequence
+                    valid_indices = (block_kv_indices[i] >= 0).sum().item()
+
+                    # Should have at least enough blocks for the sequence
+                    min_required_blocks = (seq_len + config["page_size"] - 1) // config[
+                        "page_size"
+                    ]
+                    self.assertGreaterEqual(
+                        valid_indices,
+                        min_required_blocks,
+                        f"Sequence {i} should have at least {min_required_blocks} valid blocks, got {valid_indices}",
+                    )
+
+                    # Verify indices are within valid range
+                    valid_block_indices = block_kv_indices[i][block_kv_indices[i] >= 0]
+                    if len(valid_block_indices) > 0:
+                        max_possible_blocks = (
+                            model_runner.token_to_kv_pool.size // config["page_size"]
+                        )
+                        self.assertTrue(
+                            (valid_block_indices < max_possible_blocks).all(),
+                            f"All block indices should be < {max_possible_blocks}",
+                        )
+
+    def test_metadata_cuda_graph_compatibility(self):
+        """Test metadata compatibility with CUDA graph capture/replay."""
+        print(f"\nRunning CUDA graph compatibility tests...")
+
+        config = self._merge_config(
+            {"batch_size": 4, "max_seq_len": 64, "page_size": 32}
+        )
+
+        model_runner, _, backend, _, layer = self._create_model_components(config)
+        batch_size = config["batch_size"]
+
+        # Initialize CUDA graph state
+        backend.init_cuda_graph_state(
+            max_bs=batch_size, max_num_tokens=config["max_seq_len"] * batch_size
+        )
+
+        # Verify CUDA graph buffers are allocated
+        self.assertIsNotNone(backend.decode_cuda_graph_kv_indices)
+        self.assertIsNotNone(backend.decode_cuda_graph_workspace)
+
+        # Test capture metadata
+        seq_lens = torch.full(
+            (batch_size,), config["max_seq_len"], device=config["device"]
+        )
+        req_pool_indices = torch.arange(batch_size, device=config["device"])
+
+        backend.init_forward_metadata_capture_cuda_graph(
+            bs=batch_size,
+            num_tokens=batch_size,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=None,
+            forward_mode=ForwardMode.DECODE,
+            spec_info=None,
+        )
+
+        # Verify capture metadata
+        self.assertIn(batch_size, backend.decode_cuda_graph_metadata)
+        capture_metadata = backend.decode_cuda_graph_metadata[batch_size]
+
+        self.assertIsNotNone(capture_metadata.workspace)
+        self.assertIsNotNone(capture_metadata.block_kv_indices)
+
+        # Test replay with different sequence lengths
+        new_seq_lens = torch.randint(
+            config["max_seq_len"] // 2,
+            config["max_seq_len"] + 1,
+            (batch_size,),
+            device=config["device"],
+        )
+
+        backend.init_forward_metadata_replay_cuda_graph(
+            bs=batch_size,
+            req_pool_indices=req_pool_indices,
+            seq_lens=new_seq_lens,
+            seq_lens_sum=new_seq_lens.sum().item(),
+            encoder_lens=None,
+            forward_mode=ForwardMode.DECODE,
+            spec_info=None,
+            seq_lens_cpu=new_seq_lens.cpu(),
+        )
+
+        # Verify replay updated the metadata
+        replay_metadata = backend.forward_metadata
+        self.assertIsNotNone(replay_metadata)
+        self.assertEqual(
+            replay_metadata.workspace.data_ptr(), capture_metadata.workspace.data_ptr()
+        )
+
+    def test_metadata_consistency_across_calls(self):
+        """Test metadata consistency across multiple forward calls."""
+        print(f"\nRunning metadata consistency tests...")
+
+        config = self._merge_config(
+            {"batch_size": 2, "max_seq_len": 64, "page_size": 32}
+        )
+
+        model_runner, _, backend, _, layer = self._create_model_components(config)
+
+        # First call
+        seq_lens_1 = torch.tensor([32, 48], device=config["device"])
+        fb_1 = self._create_forward_batch(
+            config["batch_size"], seq_lens_1, backend, model_runner, config
+        )
+        backend.init_forward_metadata(fb_1)
+        metadata_1 = backend.forward_metadata
+
+        # Second call with same sequence lengths
+        seq_lens_2 = torch.tensor([32, 48], device=config["device"])
+        fb_2 = self._create_forward_batch(
+            config["batch_size"], seq_lens_2, backend, model_runner, config
+        )
+        backend.init_forward_metadata(fb_2)
+        metadata_2 = backend.forward_metadata
+
+        # Metadata structure should be consistent
+        self.assertEqual(metadata_1.workspace.shape, metadata_2.workspace.shape)
+        self.assertEqual(
+            metadata_1.block_kv_indices.shape, metadata_2.block_kv_indices.shape
+        )
+
+        # Third call with different sequence lengths
+        seq_lens_3 = torch.tensor([16, 64], device=config["device"])
+        fb_3 = self._create_forward_batch(
+            config["batch_size"], seq_lens_3, backend, model_runner, config
+        )
+        backend.init_forward_metadata(fb_3)
+        metadata_3 = backend.forward_metadata
+
+        # Should still have valid structure
+        self.assertIsNotNone(metadata_3.workspace)
+        self.assertIsNotNone(metadata_3.block_kv_indices)
+        self.assertEqual(metadata_3.block_kv_indices.shape[0], config["batch_size"])
+
+    def test_prefill_output_match_self_attention(self):
+        """Test prefill (forward) behavior of TRTLLM MLA backend vs reference."""
+        print(f"\nRunning prefill output tests...")
+
+        for test_case in TEST_CASES["output_match"][:2]:  # Just a subset for speed
+            with self.subTest(test_case=test_case["name"]):
+                print(
+                    f"Prefill Testing {test_case['name']}: {test_case['description']}"
+                )
+
+                config = self._merge_config(test_case)
+                batch_size = config["batch_size"]
+                max_seq_len = config["max_seq_len"]
+
+                # Create components
+                (
+                    model_runner_trtllm,
+                    model_runner_reference,
+                    trtllm_backend,
+                    reference_backend,
+                    layer,
+                ) = self._create_model_components(config, is_prefill=True)
+
+                # Prefill uses full sequences
+                seq_lens = torch.full(
+                    (batch_size,), max_seq_len, device=config["device"]
+                )
+
+                def _create_forward_batch_prefill(
+                    batch_size,
+                    seq_lens,
+                    extend_prefix_lens,
+                    backend,
+                    model_runner,
+                    config,
+                ):
+                    """Create a forward batch for the given backend."""
+
+                    fb = ForwardBatch(
+                        batch_size=batch_size,
+                        input_ids=torch.randint(
+                            0, 100, (batch_size, 1), device=config["device"]
+                        ),
+                        out_cache_loc=torch.arange(batch_size, device=config["device"]),
+                        seq_lens_sum=int(seq_lens.sum().item()),
+                        extend_prefix_lens=extend_prefix_lens,
+                        extend_prefix_lens_cpu=extend_prefix_lens.cpu().int().tolist(),
+                        extend_seq_lens_cpu=(seq_lens - extend_prefix_lens)
+                        .cpu()
+                        .int()
+                        .tolist(),
+                        forward_mode=ForwardMode.EXTEND,
+                        req_pool_indices=torch.arange(
+                            batch_size, device=config["device"]
+                        ),
+                        seq_lens=seq_lens,
+                        seq_lens_cpu=seq_lens.cpu(),
+                        attn_attend_prefix_cache=False,
+                        mha_return_lse=False,
+                        attn_backend=backend,
+                    )
+                    fb.req_to_token_pool = model_runner.req_to_token_pool
+                    fb.token_to_kv_pool = model_runner.token_to_kv_pool
+
+                    # Add position information for RoPE
+                    fb.positions = torch.arange(batch_size, device=config["device"])
+
+                    return fb
+
+                # Create forward batches
+                fb_trtllm = _create_forward_batch_prefill(
+                    batch_size,
+                    seq_lens.clone(),
+                    torch.zeros(batch_size, device=config["device"], dtype=torch.int32),
+                    trtllm_backend,
+                    model_runner_trtllm,
+                    config,
+                )
+                fb_reference = _create_forward_batch_prefill(
+                    batch_size,
+                    seq_lens.clone(),
+                    torch.zeros(batch_size, device=config["device"], dtype=torch.int32),
+                    reference_backend,
+                    model_runner_reference,
+                    config,
+                )
+
+                # Initialize metadata for both backends
+                trtllm_backend.init_forward_metadata(fb_trtllm)
+                reference_backend.init_forward_metadata(fb_reference)
+
+                # Create Q, K, V tensors for prefill
+                torch.manual_seed(config["seed_qkv"])
+
+                def _create_qkv_tensors_prefill(
+                    batch_size, seq_len, config, dtype_override=None
+                ):
+                    """Create Q, K, V tensors for prefill, using config for head_num and head_dim."""
+                    device = config["device"]
+                    dtype = dtype_override or config["dtype"]
+
+                    total_tokens = batch_size * seq_len
+
+                    tp_q_head_num = config["tp_q_head_num"]
+                    tp_k_head_num = config["tp_k_head_num"]
+                    head_dim = config["prefill_head_dim"]
+                    v_head_dim = config["prefill_v_head_dim"]
+
+                    q = torch.randn(
+                        (total_tokens, tp_q_head_num * head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+                    k = torch.randn(
+                        (total_tokens, tp_k_head_num * head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+                    v = torch.randn(
+                        (total_tokens, tp_k_head_num * v_head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+
+                    # Reshape as requested
+                    q = q.view(-1, tp_q_head_num, head_dim)
+                    k = k.view(-1, tp_k_head_num, head_dim)
+                    v = v.view(-1, tp_k_head_num, v_head_dim)
+
+                    return q, k, v
+
+                q, k, v = _create_qkv_tensors_prefill(batch_size, max_seq_len, config)
+                # Run prefill on both backends
+                out_trtllm = trtllm_backend.forward_extend(
+                    q, k, v, layer, fb_trtllm, False
+                ).view(-1, layer.tp_q_head_num * layer.v_head_dim)
+                out_reference = reference_backend.forward_extend(
+                    q, k, v, layer, fb_reference, False
+                )
+
+                tolerance = config.get("tolerance", 1e-2)
+                comparison_passed = compare_outputs(
+                    out_trtllm, out_reference, tolerance=tolerance
+                )
+                self.assertTrue(
+                    comparison_passed,
+                    f"TRTLLM and Reference prefill outputs differ beyond tolerance. "
+                    f"Config: {test_case['name']}, "
+                    f"Max diff: {(out_trtllm - out_reference).abs().max().item()}",
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/sglang/test/doc_patch.py b/python/sglang/test/doc_patch.py
new file mode 100644
index 000000000..0febb295c
--- /dev/null
+++ b/python/sglang/test/doc_patch.py
@@ -0,0 +1,59 @@
+"""
+Do some monkey patch to make the documentation compilation faster and more reliable.
+
+- Avoid port conflicts
+- Reduce the server launch time
+"""
+
+import weakref
+
+import nest_asyncio
+
+nest_asyncio.apply()
+
+import sglang.srt.server_args as server_args_mod
+from sglang.utils import execute_shell_command, reserve_port
+
+DEFAULT_MAX_RUNNING_REQUESTS = 128
+DEFAULT_MAX_TOTAL_TOKENS = 20480  # To allow multiple servers on the same machine
+
+_original_post_init = server_args_mod.ServerArgs.__post_init__
+
+
+def patched_post_init(self):
+    _original_post_init(self)
+    if self.max_running_requests is None:
+        self.max_running_requests = DEFAULT_MAX_RUNNING_REQUESTS
+    if self.max_total_tokens is None:
+        self.max_total_tokens = DEFAULT_MAX_TOTAL_TOKENS
+    self.cuda_graph_max_bs = 4
+
+
+server_args_mod.ServerArgs.__post_init__ = patched_post_init
+
+process_socket_map = weakref.WeakKeyDictionary()
+
+
+def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
+    """
+    Launch the server using the given command.
+    If no port is specified, a free port is reserved.
+    """
+    if port is None:
+        port, lock_socket = reserve_port(host)
+    else:
+        lock_socket = None
+
+    extra_flags = (
+        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
+        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
+        f"--cuda-graph-max-bs 4"
+    )
+
+    full_command = f"{command} --port {port} {extra_flags}"
+    process = execute_shell_command(full_command)
+
+    if lock_socket is not None:
+        process_socket_map[process] = lock_socket
+
+    return process, port
diff --git a/python/sglang/test/few_shot_gsm8k.py b/python/sglang/test/few_shot_gsm8k.py
new file mode 100644
index 000000000..7dafcd423
--- /dev/null
+++ b/python/sglang/test/few_shot_gsm8k.py
@@ -0,0 +1,149 @@
+"""
+Run few-shot GSM-8K evaluation.
+
+Usage:
+python3 -m sglang.test.few_shot_gsm8k --num-questions 200
+"""
+
+import argparse
+import ast
+import re
+import time
+
+import numpy as np
+
+from sglang.lang.api import set_default_backend
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_one_example(lines, i, include_answer):
+    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
+    if include_answer:
+        ret += " " + lines[i]["answer"]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+def run_eval(args):
+    # Select backend
+    set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
+
+    if args.data_path is None:
+        # Read data
+        url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+        filename = download_and_cache_file(url)
+    else:
+        filename = args.data_path
+
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q} for q in questions]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_gsm8k(s, question):
+        s += few_shot_examples + question
+        s += sgl.gen(
+            "answer",
+            max_tokens=args.max_new_tokens,
+            stop=["Question", "Assistant:", "<|separator|>"],
+        )
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.perf_counter()
+    states = few_shot_gsm8k.run_batch(
+        arguments,
+        temperature=args.temperature if hasattr(args, "temperature") else 0,
+        num_threads=args.parallel,
+        progress_bar=True,
+        return_logprob=getattr(args, "return_logprob", None),
+        logprob_start_len=getattr(args, "logprob_start_len", None),
+    )
+    latency = time.perf_counter() - tic
+
+    preds = []
+    for i in range(len(states)):
+        preds.append(get_answer_value(states[i]["answer"]))
+
+    # print(f"{preds=}")
+    # print(f"{labels=}")
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Compute speed
+    num_output_tokens = sum(
+        s.get_meta_info("answer")["completion_tokens"] for s in states
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    # Dump results
+    dump_state_text("tmp_output_gsm8k.txt", states)
+
+    return {
+        "accuracy": acc,
+        "invalid": invalid,
+        "latency": latency,
+        "output_throughput": output_throughput,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument("--data-path", type=str)
+    parser.add_argument("--num-questions", type=int, default=200)
+    parser.add_argument("--max-new-tokens", type=int, default=512)
+    parser.add_argument("--parallel", type=int, default=128)
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    parser.add_argument("--temperature", type=float, default=0.0)
+    args = parser.parse_args()
+    run_eval(args)
diff --git a/python/sglang/test/few_shot_gsm8k_engine.py b/python/sglang/test/few_shot_gsm8k_engine.py
new file mode 100644
index 000000000..05b095713
--- /dev/null
+++ b/python/sglang/test/few_shot_gsm8k_engine.py
@@ -0,0 +1,144 @@
+import argparse
+import ast
+import asyncio
+import json
+import re
+import time
+
+import numpy as np
+
+import sglang as sgl
+from sglang.lang.api import set_default_backend
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
+
+INVALID = -9999999
+
+
+def get_one_example(lines, i, include_answer):
+    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
+    if include_answer:
+        ret += " " + lines[i]["answer"]
+    return ret
+
+
+def get_few_shot_examples(lines, k):
+    ret = ""
+    for i in range(k):
+        ret += get_one_example(lines, i, True) + "\n\n"
+    return ret
+
+
+def get_answer_value(answer_str):
+    answer_str = answer_str.replace(",", "")
+    numbers = re.findall(r"\d+", answer_str)
+    if len(numbers) < 1:
+        return INVALID
+    try:
+        return ast.literal_eval(numbers[-1])
+    except SyntaxError:
+        return INVALID
+
+
+async def concurrent_generate(engine, prompts, sampling_param):
+    tasks = []
+    for prompt in prompts:
+        tasks.append(asyncio.create_task(engine.async_generate(prompt, sampling_param)))
+
+    outputs = await asyncio.gather(*tasks)
+    return outputs
+
+
+def run_eval(args):
+    # Select backend
+    engine = sgl.Engine(model_path=args.model_path, log_level="error")
+
+    if args.local_data_path is None:
+        # Read data
+        url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+        filename = download_and_cache_file(url)
+    else:
+        filename = args.local_data_path
+
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = args.num_questions
+    num_shots = args.num_shots
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        labels.append(get_answer_value(lines[i]["answer"]))
+    assert all(l != INVALID for l in labels)
+    arguments = [{"question": q} for q in questions]
+
+    # construct the prompts
+    prompts = []
+    for i, arg in enumerate(arguments):
+        q = arg["question"]
+        prompt = few_shot_examples + q
+        prompts.append(prompt)
+
+    sampling_param = {
+        "stop": ["Question", "Assistant:", "<|separator|>"],
+        "max_new_tokens": 512,
+        "temperature": 0,
+    }
+
+    # Run requests
+    tic = time.perf_counter()
+
+    loop = asyncio.get_event_loop()
+
+    outputs = loop.run_until_complete(
+        concurrent_generate(engine, prompts, sampling_param)
+    )
+
+    # End requests
+    latency = time.perf_counter() - tic
+
+    # Shutdown the engine
+    engine.shutdown()
+
+    # Parse output
+    preds = []
+
+    for output in outputs:
+        preds.append(get_answer_value(output["text"]))
+
+    # Compute accuracy
+    acc = np.mean(np.array(preds) == np.array(labels))
+    invalid = np.mean(np.array(preds) == INVALID)
+
+    # Compute speed
+    num_output_tokens = sum(
+        output["meta_info"]["completion_tokens"] for output in outputs
+    )
+    output_throughput = num_output_tokens / latency
+
+    # Print results
+    print(f"Accuracy: {acc:.3f}")
+    print(f"Invalid: {invalid:.3f}")
+    print(f"Latency: {latency:.3f} s")
+    print(f"Output throughput: {output_throughput:.3f} token/s")
+
+    return {
+        "accuracy": acc,
+        "latency": latency,
+        "output_throughput": output_throughput,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-path", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct"
+    )
+    parser.add_argument("--local-data-path", type=Optional[str], default=None)
+    parser.add_argument("--num-shots", type=int, default=5)
+    parser.add_argument("--num-questions", type=int, default=200)
+    args = parser.parse_args()
+    metrics = run_eval(args)
diff --git a/python/sglang/test/long_prompt.txt b/python/sglang/test/long_prompt.txt
new file mode 100644
index 000000000..8ec2b829a
--- /dev/null
+++ b/python/sglang/test/long_prompt.txt
@@ -0,0 +1 @@
+You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\nIntroduction\n\nThroughout U.S. history, Congress has created advisory commissions to assist in the development of public policy. Among other contexts, commissions have been used following crisis situations, including the September 11, 2001, terrorist attacks and the 2008 financial crisis. In such situations, advisory commissions may potentially provide Congress with a high-visibility forum to assemble expertise that might not exist within the legislative environment; allow for the in-depth examination of complex, cross-cutting policy issues; and lend bipartisan credibility to a set of findings and recommendations.\nAs Congress considers its range of responses to the coronavirus pandemic, the creation of one or more congressional advisory commissions is an option that could provide a platform for evaluating various pandemic-related policy issues over time. Past congressional advisory commissions have retrospectively evaluated policy responses, brought together diverse groups of experts, and supplemented existing congressional oversight mechanisms. Policymakers may determine that creating an advisory commission is unnecessary and instead prefer to utilize existing congressional oversight structures, such as standing or select committees, or already established oversight entities.\nThis report provides a comparative analysis of five proposed congressional advisory commissions that would investigate various aspects of the COVID-19 pandemic. The five proposed commissions are found in H.R. 6429 (the National Commission on COVID-19 Act, sponsored by Representative Stephanie Murphy), H.R. 6431 (the Made in America Emergency Preparedness Act, sponsored by Representative Brian Fitzpatrick), H.R. 6440 (the Pandemic Rapid Response Act, sponsored by Representative Rodney Davis), H.R. 6455 (the COVID-19 Commission Act, sponsored by Representative Bennie Thompson), and H.R. 6548 (the National Commission on the COVID-19 Pandemic in the United States Act, sponsored by Representative Adam Schiff). The overall structures of each of the proposed commissions are similar in many respects, both to each other and to previous independent advisory entities established by Congress. Specifically, the proposed commissions would (1) exist temporarily; (2) serve in an advisory capacity; and (3) report a work product detailing the commission\'s findings, conclusions, and recommendations. That said, each particular proposed commission has distinctive elements, particularly concerning its membership structure, appointment structure, and time line for reporting its work product to Congress.\nThis report compares the (1) membership structure, (2) appointment structure, (3) rules of procedure and operation, (4) duties and reporting requirements, (5) powers of the commission, (6) staffing issues, and (7) funding for each of the proposed COVID-19 commissions. Table 1 (at the end of this report) provides a side-by-side comparison of major provisions of the five proposals.\n\n	Membership Structure\n\nSeveral matters related to a commission\'s membership structure might be considered. They include the size of a commission, member qualifications, compensation of commission members, and requirements for partisan balance. \n\n		Size of Commission\n\nIn general, there is significant variation in the size of congressional advisory commissions. Among 155 identified congressional commissions created between the 101 st Congress and the 115 th Congress, the median size was 12 members, with the smallest commission having 5 members and the largest 33 members.\nThe membership structure of each of the five proposed commissions is similar to previous independent advisory entities created by Congress. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would each create a 10-member entity. H.R. 6455 would create a 25-member entity.\n\n		Qualifications\n\nPast legislation creating congressional commissions has often required or suggested that commission members possess certain substantive qualifications. Such provisions arguably make it more likely that the commission is populated with genuine experts in the policy area, which may improve the commission\'s final work product.\nH.R. 6455 would provide that commissioners \"shall be a United States person with significant expertise\" in a variety of fields related to public health and public administration. H.R. 6440 , H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide \"the sense of Congress\" that commission members should be \"prominent U.S. citizens\" who are nationally recognized experts in a variety of fields relevant to the pandemic and response efforts. In addition, H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 all prohibit the appointment of federal, state, and local government employees and officers. H.R. 6455 would prohibit federal employees from being commission members.\n\n		Compensation of Commission Members\n\nSome congressional commissions have compensated their members. For example, the National Commission on Terrorist Attacks Upon the United States (9/11 Commission) and the Financial Crisis Inquiry Commission provided that commission members could be compensated at a daily rate of basic pay. Nearly all have reimbursed members for travel expenses. Those that have provided for commissioner compensation most frequently provided compensation at the daily equivalent of level IV of the Executive Schedule.\nEach of the five proposals would provide that commission members be compensated at a rate \"not to exceed the daily equivalent of the annual rate of basic pay\" for level IV of the Executive Schedule, \"for each day during which that member is engaged in the actual performance of duties of the Commission.\" Members of three proposed commissions would receive travel expenses, including a per diem.\n\n		Partisan Limitations\n\nEach proposal provides a limit on the number of members appointed from the same political party. H.R. 6455 would provide that not more than 13 of its 25 members may be from the same party. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would provide that not more than 5 (of 10) members are from the same party. Most previous advisory entities created by Congress do not impose formal partisan restrictions on the membership structure. It may also be difficult to assess the political affiliation of potential members, who may have no formal affiliation (voter registration, for example) with a political party. Instead, most past advisory commissions usually achieve partisan balance through the appointment structure; for instance, by providing equal (or near-equal) numbers of appointments to congressional leaders of each party.\n\n	Appointment Structure\n\nPast congressional commissions have used a wide variety of appointment structures. Considerations regarding appointment structures include partisan balance, filling vacancies, and the time line for making commission appointments.\nThe statutory scheme may directly designate members of the commission, such as a specific cabinet official or a congressional leader. In other cases, selected congressional leaders, often with balance between the parties, appoint commission members. A third common statutory scheme is to have selected leaders, such as committee chairs and ranking members, recommend candidates for appointment to a commission. These selected leaders may act either in parallel or jointly, and the recommendation may be made either to other congressional leaders, such as the Speaker of the House and President pro tempore of the Senate, or to the President.\nEach of the five commission proposals would delegate most or all appointment authority to congressional leaders (including chamber, party, and committee leaders; see Table 1 for details). Additionally, H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 provide for one appointment to be made by the President. H.R. 6429 , H.R. 6431 , and H.R. 6548 would have the President appoint the commission\'s chair. H.R. 6455 has its membership appointed by the chairs and ranking members of designated House and Senate committees, and the Joint Economic Committee. H.R. 6455 does not provide any executive branch appointments.\nAttention to the proper balance between the number of members appointed by congressional leaders and by other individuals (such as the President), or to the number of Members of Congress required to be among the appointees, or to the qualifications of appointees, can be significant factors in enabling a commission to fulfill its congressional mandate.\nIn general, a commission\'s appointment scheme can impact both the commission\'s ability to fulfill its statutory duties and its final work product. For instance, if the scheme provides only for the appointment of Members of Congress to the commission, it arguably might not have the technical expertise or diversity of knowledge to complete its duties within the time given by statute. Similarly, if the appointment scheme includes qualifying provisos so specific that only a small set of private citizens could serve on the panel, the commission\'s final work product may arguably only represent a narrow range of viewpoints. None of the proposed COVID-19 commissions specify whether Members of Congress may serve on the commission.\n\n		Partisan Balance in Appointment Authority\n\nMost previous congressional advisory commissions have been structured to be bipartisan, with an even (or near-even) split of appointments between leaders of the two major parties. By achieving a nonpartisan or bipartisan character, congressional commissions may make their findings and recommendations more politically acceptable to diverse viewpoints. The bipartisan or nonpartisan arrangement can give recommendations strong credibility, both in Congress and among the public, even when dealing with divisive public policy issues. Similarly, commission recommendations that are perceived as partisan may have difficulty gaining support in Congress.\nIn some cases, however, bipartisanship also can arguably impede a commission\'s ability to complete its mandate. In situations where a commission is tasked with studying divisive or partisan issues, the appointment of an equal number of majority and minority commissioners may serve to promote partisanship within the commission rather than suppress it, raising the possibility of deadlock where neither side can muster a majority to act.\nEach of the five proposals employs a structure where leaders in both the majority and minority parties in Congress would make appointments. H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide for five majority and five minority appointments, including one for the President. H.R. 6440 would include two each by the Senate majority leader, the Senate minority leader, and the Speaker of the House, with one appointment by the House minority leader and one by the President, and the chair appointed by the Speaker and vice chair appointed by the Senate majority leader. H.R. 6455 would have 12 majority and 12 minority appointments made by the 12 committee chairs and ranking members and one member jointly appointed by the chair and vice chair of the Joint Economic Committee.\n\n		Vacancies\n\nAll five proposals provide that vacancies on the commission will not affect its powers and would be filled in the same manner as the original appointment.\n\n		Deadline for Appointments\n\nThree of the bills propose specific deadlines for the appointment of commissioners. H.R. 6429 and H.R. 6548 provide that appointments are made between specific dates in January or February 2021. Further, H.R. 6429 provides that commission members could be appointed in September 2020, if there is no longer a COVID-19 public health emergency in effectâas determined by the Secretary of Health and Human Servicesâas of August 31, 2020. H.R. 6440 would require all appointments be made by December 15, 2020. H.R. 6455 would require appointments to be made within 45 days after enactment. H.R. 6429 , H.R. 6440 , and H.R. 6548 would start the commission\'s work in early 2021, as the commission cannot operate without the appointment of members. H.R. 6429 , however would provide that the proposed commission\'s work would begin no later than October 31, 2020, if members are appointed in September 2020. H.R. 6431 does not specify a deadline for the appointment of members.\nTypically, deadlines for appointment can range from several weeks to several months. For example, the deadline for appointments to the Antitrust Modernization Commission was 60 days after the enactment of its establishing act. The deadline for appointment to the Commission on Wartime Contracting in Iraq and Afghanistan was 120 days from the date of enactment. The deadline for appointment to the 9/11 Commission was December 15, 2002, 18 days after enactment of the act.\n\n	Rules of Procedure and Operations\n\nWhile most statutes that authorize congressional advisory commissions do not provide detailed procedures for how the commission should conduct its business, the statutory language may provide a general structure, including a mechanism for selecting a chair and procedures for creating rules. None of the five COVID-19 commission proposals contain language that directs the process for potentially adopting rules of procedure. For a comparison of each proposed commission\'s specified rules of procedures and operations, see Table 1 .\n\n		Chair Selection\n\nEach bill provides for the selection of a chair and/or vice chair of the commission. H.R. 6429 , H.R. 6431 , and H.R. 6548 would have the chair appointed by the President and the vice chair appointed by congressional leaders of the political party opposite the President. H.R. 6440 would have the chair appointed by the Speaker of the House (in consultation with the Senate majority leader and the House minority leader) and the vice chair appointed by the Senate majority leader (in consultation with the Speaker of the House and the Senate minority leader). H.R. 6455 would have the chair and vice chair chosen from among commission members by a majority vote of the commission, and would require the chair and vice chair to have \"significant experience\" in areas to be studied by the commission.\n\n		Initial Meeting Deadline\n\nAs with the timing of commission appointments, some authorizing statutes are prescriptive in when the commission\'s first meeting should take place. Three of the bills analyzed here provide specific time lines for the commission\'s first meeting. H.R. 6429 would require the first meeting to be no later than March 15, 2021, unless members are appointed in September 2020 (if no public health emergency exists). H.R. 6455 would require the first meeting within 45 days after the appointment of all commission members, which isâgiven the 45-day deadline for appointmentâeffectively a maximum of 90 days after enactment. H.R. 6548 would direct the commission to hold its initial meeting \"as soon as practicable,\" but not later than March 5, 2021. H.R. 6431 and H.R. 6440 do not provide for an initial meeting deadline. Instead, they direct the commission to meet \"as soon as practicable.\" \n\n		Quorum\n\nMost commission statutes provide that a quorum will consist of a particular number of commissioners, usually a majority, but occasionally a supermajority. All five bills would provide for a quorum requirement. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would define a quorum as 6 (of 10) members. H.R. 6455 would provide that a quorum is 18 of 25 members (72%).\n\n		Public Access\n\nAll five commission bills would require commission meetings to be open to the public. Each bill would also require that reports be made publicly available.\n\n		Formulating Other Rules of Procedure and Operations\n\nAbsent statutory guidance (eithe r in general statutes or in individual statutes authorizing commissions), advisory entities vary widely in how they adopt their rules of procedure. In general, three models exist: formal written rules, informal rules, and the reliance on norms. Any individual advisory entity might make use of all three of these models for different types of decisionmaking. \nThe choice to adopt written rules or rely on informal norms to guide commission procedure may be based on a variety of factors, such as the entity\'s size, the frequency of meetings, member preferences regarding formality, the level of collegiality among members, and the amount of procedural guidance provided by the entity\'s authorizing statute. Regardless of how procedural issues are handled, protocol for decisionmaking regarding the following operational issues may be important for the commission to consider at the outset of its existence: eligibility to vote and proxy rules; staff hiring, compensation, and work assignments; hearings, meetings, and field visits; nonstaff expenditures and contracting; reports to Congress; budgeting; and procedures for future modification of rules. None of the five COVID-19 commission proposals specify that the proposed commission must adopt written rules.\n\n		FACA Applicability\n\nThe Federal Advisory Committee Act (FACA) mandates certain structural and operational requirements, including formal reporting and oversight procedures, for certain federal advisory bodies that advise the executive branch. Three proposals ( H.R. 6429 , H.R. 6431 , and H.R. 6548 ) specifically exempt the proposed commission from FACA. Of the remaining two, FACA would also likely not apply to the commission proposed in H.R. 6455 because it would be appointed entirely by Members of Congress, although it only specifies that its final report is public, not whether it is specifically sent to Congress and/or the President. It is not clear that FACA would apply to the commission proposed in H.R. 6440 . Although it includes a presidential appointment and its report would be sent to both Congress and the President, its establishment clause specifies that the commission \"is established in the legislative branch,\" and a super-majority of its members would be appointed by Congress.\n\n	Duties and Reporting Requirements\n\nMost congressional commissions are generally considered policy commissionsâtemporary bodies that study particular policy problems and report their findings to Congress or review a specific event. \n\n		General Duties\n\nAll five of the proposed commissions would be tasked with duties that are analogous to those of past policy commissions. While the specific mandates differ somewhat, all proposed commissions are tasked with investigating aspects of the COVID-19 pandemic and submitting one or more reports that include the commission\'s findings, conclusions, and recommendations for legislative action. H.R. 6440 would specifically require the commission to avoid unnecessary duplication of work being conducted by the Government Accountability Office (GAO), congressional committees, and executive branch agency and independent commission investigations.\n\n		Reports\n\nEach proposed commission would be tasked with issuing a final report detailing its findings, conclusions, and recommendations. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 would provide that the commission \"may submit\" interim reports to Congress and the President, but do not provide time lines on when those reports might be submitted. In each case, the interim report would need to be agreed to by a majority of commission members. H.R. 6431 would also require the commission to submit a report on actions taken by the states and a report on essential products, materials, ingredients, and equipment required to fight pandemics.\nH.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 also specify that final reports shall be agreed to by a majority of commission members. H.R. 6455 does not specify a vote threshold for approval of its report.\nNone of the bills make specific provisions for the inclusion of minority viewpoints. Presumably this would leave each commission with discretion on whether to include or exclude minority viewpoints. Past advisory entities have been proposed or established with a variety of statutory reporting conditions, including the specification of majority or super-majority rules for report adoption and provisions requiring the inclusion of minority viewpoints. In practice, advisory bodies that are not given statutory direction on these matters have tended to work under simple-majority rules for report adoption.\n\n		Report Deadlines\n\nH.R. 6429 would require a final report one year after the commission\'s initial meeting. H.R. 6431 and H.R. 6440 would require a final report not later than 18 months after enactment. H.R. 6455 would require a final report to be published not later than 18 months after the commission\'s first meeting. \nH.R. 6548 would require a final report by October 15, 2021. This deadline could be extended by 90 days upon a vote of no fewer than 8 (out of 10) commission members. The commission could vote to extend its final report deadline up to three times, and would be required to notify Congress, the President, and the public of any such extension.\nWhile such a deadline would potentially give the commission a defined period of time to complete its work, setting a particular date for report completion could potentially create unintended time constraints. Any delay in the passage of the legislation or in the appointment process would reduce the amount of time the commission has to complete its work, even with the opportunity for the commission to extend its own deadline up to three times.\nThe length of time a congressional commission has to complete its work is arguably one of the most consequential decisions when designing an advisory entity. If the entity has a short window of time, the quality of its work product may suffer or it may not be able to fulfill its statutory mandate on time.\nOn the other hand, if the commission is given a long period of time to complete its work, it may undermine one of a commission\'s primary legislative advantages, the timely production of expert advice on a current matter. A short deadline may also affect the process of standing up a new commission. The selection of commissioners, recruitment of staff, arrangement of office space, and other logistical matters may require expedited action if short deadlines need to be met.\n\n		Report Submission\n\nOf the five proposed commissions, four ( H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6548 ) are directed to submit their reports to both Congress and the President. H.R. 6455 requires that the report is made public.\nMost congressional advisory commissions are required to submit their reports to Congress, and sometimes to the President or an executive department or agency head. For example, the National Commission on Severely Distressed Public Housing\'s final report was submitted to both Congress and the Secretary of Housing and Urban Development.\n\n		Commission Termination\n\nCongressional commissions are usually statutorily mandated to terminate. Termination dates for most commissions are linked to either a fixed period of time after the establishment of the commission, the selection of members, or the date of submission of the commission\'s final report. Alternatively, some commissions are given fixed calendar termination dates.\nAll five commission proposals would provide for the commission to terminate within a certain period of time following submission of its final report. H.R. 6429 , H.R. 6431 , H.R. 6440 , and H.R. 6455 would each direct the commission to terminate 60 days after the submission; H.R. 6548 specifies a time line of 90 days after submission.\n\n	Commission Powers\n\nEach of the five proposals would provide the proposed commission with certain powers to carry out its mission (see Table 1 for specifics). One general issue for commissions is who is authorized to execute such powers. In some cases, the commission itself executes its powers, with the commission deciding whether to devise rules and procedures for the general use of such power. In other cases, the legislation specifically authorizes the commission to give discretionary power to subcommittees or individual commission members. Finally, the legislation itself might grant certain powers to individual members of the commission, such as the chair.\n\n		Hearings and Evidence\n\nAll five bills would provide the proposed commission with the power to hold hearings, take testimony, and receive evidence. All five commissions would also be provided the power to administer oaths to witnesses.\n\n		Subpoenas\n\nFour of the bills would provide the commission with subpoena power. H.R. 6440 would not provide subpoena power to the commission. H.R. 6429 , H.R. 6431 , and H.R. 6548 would provide that subpoenas could only be issued by either (1) agreement of the chair and vice chair, or (2) the affirmative vote of 6 (of 10) commission members. H.R. 6455 would require that a subpoena could only be issued by either agreement of the chair and vice chair or an affirmative vote of 18 (of 25) commission members. All four bills that would provide subpoena power contain substantially similar judicial methods of subpoena enforcement.\n\n		Administrative Support\n\nAll five of the bills would provide that the commission receive administrative support from the General Services Administration (GSA). The GSA provides administrative support to dozens of federal entities, including congressional advisory commissions. Each of the five bills would provide that GSA be reimbursed for its services by the commission. Each bill also provides that other departments or agencies may provide funds, facilities, staff, and other services to the commission.\n\n		Other Powers\n\nWithout explicit language authorizing certain activities, commissions often cannot gather information, enter into contracts, use the U.S. mail like an executive branch entity, or accept donations or gifts. \nAll five bills direct that federal agencies provide information to the commission upon request. H.R. 6429 , H.R. 6431 , and H.R. 6548 would also provide that the commission could use the U.S. mails in the same manner as any department or agency, enter into contracts, and accept gifts or donations of services or property.\n\n	Staffing\n\nThe proposed COVID-19 commissions contain staffing provisions commonly found in congressional advisory commission legislation. Congressional advisory commissions are usually authorized to hire staff. Most statutes specify that the commission may hire a lead staffer, often referred to as a \"staff director,\" \"executive director,\" or another similar title, in addition to additional staff as needed. Rather than mandate a specific staff size, many commissions are instead authorized to appoint a staff director and other personnel as necessary, subject to the limitations of available funds.\nMost congressional commissions are also authorized to hire consultants, procure intermittent services, and request that federal agencies detail personnel to aid the work of the commission.\n\n		Director and Commission Staff\n\nFour of the bills provide that the commission may hire staff without regard to certain laws regarding the competitive service; H.R. 6440 does not specifically exempt the commission from such laws. Four bills ( H.R. 6429 , H.R. 6431 , H.R. 6455 , and H.R. 6548 ) would authorize, but not require, the commission to hire a staff director and additional staff, as appropriate. Four proposals would limit staff salaries to level V of the executive schedule. Three of the bills would specifically designate staff as federal employees for the purposes of certain laws, such as workman\'s compensation, retirement, and other benefits.\n\n		Detailees\n\nWhen authorized, some commissions can have federal agency staff detailed to the commission. All five bills would provide that federal employees could be detailed to the commission. Four bills would provide that the detailee would be without reimbursement to his or her home agency. H.R. 6440 would allow detailees on a reimbursable basis. \n\n		Experts and Consultants\n\nAll five bills would provide the commission with the authority to hire experts and consultants. Four of the bills limit the rate of pay for consultants to level IV of the Executive Schedule. H.R. 6440 does not specify a specific limit.\n\n		Security Clearances\n\nFour bills would provide that federal agencies and departments shall cooperate with the commission to provide members and staff appropriate security clearances. H.R. 6440 does not contain a security clearance provision.\n\n	Funding and Costs\n\nCommissions generally require funding to help meet their statutory goals. When designing a commission, therefore, policymakers may consider both how the commission will be funded, and how much funding the commission will be authorized to receive. Four of the five proposals specify a funding mechanism for the commission.\nHow commissions are funded and the amounts that they receive vary considerably. Several factors can contribute to overall commission costs. These factors might include the cost of hiring staff, contracting with outside consultants, and engaging administrative support, among others. Additionally, most commissions reimburse the travel expenditures of commissioners and staff, and some compensate their members. The duration of a commission can also significantly affect its cost; past congressional commissions have been designed to last anywhere from several months to several years.\n\n		Costs\n\nIt is difficult to estimate or predict the potential overall cost of any commission. Annual budgets for congressional advisory entities range from several hundred thousand dollars to millions of dollars annually. Overall expenses for any individual advisory entity depend on a variety of factors, the most important of which are the number of paid staff and the commission\'s duration and scope. Some commissions have few full-time staff; others employ large numbers, such as the National Commission on Terrorist Attacks Upon the United States, which had a full-time paid staff of nearly 80. Secondary factors that can affect commission costs include the number of commissioners, how often the commission meets or holds hearings, whether or not the commission travels or holds field hearings, and the publications the commission produces.\n\n		Authorized Funding\n\nThree of the bills ( H.R. 6429 , H.R. 6440 , and H.R. 6548 ) would authorize the appropriation of \"such sums as may be necessary\" for the commission, to be derived in equal amounts from the contingent fund of the Senate and the applicable accounts of the House of Representatives. H.R. 6429 and H.R. 6548 would provide that funds are available until the commission terminates. H.R. 6455 would authorize the appropriation of $4 million for the commission, to remain available until the commission terminates. H.R. 6431 does not include an authorization of appropriations.\n\n	Comparison of Proposals to Create a COVID-19 Commission\n\n Table 1 provides a side-by-side comparison of major provisions of the five proposals. For each bill, the membership structure, appointment structure, rules of procedure and operation, duties and reporting requirements, proposed commission powers, staffing provisions, and funding are compared.\n\nSummary:\n
diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py
new file mode 100644
index 000000000..9b788cc0a
--- /dev/null
+++ b/python/sglang/test/run_eval.py
@@ -0,0 +1,129 @@
+"""
+Usage:
+python3 -m sglang.test.run_eval --port 30000 --eval-name mmlu --num-examples 10
+"""
+
+import argparse
+import json
+import os
+import time
+
+from sglang.test.simple_eval_common import (
+    ChatCompletionSampler,
+    make_report,
+    set_ulimit,
+)
+
+
+def run_eval(args):
+    set_ulimit()
+
+    if "OPENAI_API_KEY" not in os.environ:
+        os.environ["OPENAI_API_KEY"] = "EMPTY"
+
+    base_url = (
+        f"{args.base_url}/v1" if args.base_url else f"http://{args.host}:{args.port}/v1"
+    )
+
+    if args.eval_name == "mmlu":
+        from sglang.test.simple_eval_mmlu import MMLUEval
+
+        filename = "https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv"
+        eval_obj = MMLUEval(filename, args.num_examples, args.num_threads)
+    elif args.eval_name == "math":
+        from sglang.test.simple_eval_math import MathEval
+
+        equality_checker = ChatCompletionSampler(model="gpt-4-turbo")
+
+        filename = (
+            "https://openaipublic.blob.core.windows.net/simple-evals/math_test.csv"
+        )
+        eval_obj = MathEval(
+            filename, equality_checker, args.num_examples, args.num_threads
+        )
+    elif args.eval_name == "mgsm":
+        from sglang.test.simple_eval_mgsm import MGSMEval
+
+        eval_obj = MGSMEval(args.num_examples, args.num_threads)
+    elif args.eval_name == "mgsm_en":
+        from sglang.test.simple_eval_mgsm import MGSMEval
+
+        eval_obj = MGSMEval(args.num_examples, args.num_threads, languages=["en"])
+    elif args.eval_name == "gpqa":
+        from sglang.test.simple_eval_gpqa import GPQAEval
+
+        filename = (
+            "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
+        )
+        eval_obj = GPQAEval(filename, args.num_examples, args.num_threads)
+    elif args.eval_name == "humaneval":
+        from sglang.test.simple_eval_humaneval import HumanEval
+
+        eval_obj = HumanEval(args.num_examples, args.num_threads)
+    else:
+        raise ValueError(f"Invalid eval name: {args.eval_name}")
+
+    sampler = ChatCompletionSampler(
+        model=args.model,
+        max_tokens=getattr(args, "max_tokens", 2048),
+        base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
+        reasoning_effort=getattr(args, "reasoning_effort", None),
+    )
+
+    # Run eval
+    tic = time.perf_counter()
+    result = eval_obj(sampler)
+    latency = time.perf_counter() - tic
+
+    # Dump reports
+    metrics = result.metrics | {"score": result.score}
+    file_stem = f"{args.eval_name}_{sampler.model.replace('/', '_')}"
+    report_filename = f"/tmp/{file_stem}.html"
+    print(f"Writing report to {report_filename}")
+    with open(report_filename, "w") as fh:
+        fh.write(make_report(result))
+    metrics = result.metrics | {"score": result.score}
+    print(metrics)
+    result_filename = f"/tmp/{file_stem}.json"
+    with open(result_filename, "w") as f:
+        f.write(json.dumps(metrics, indent=2))
+    print(f"Writing results to {result_filename}")
+
+    # Print results
+    print(f"Total latency: {latency:.3f} s")
+    print(f"Score: {metrics['score']:.3f}")
+
+    return metrics
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--host", type=str, default="0.0.0.0", help="Default host is 0.0.0.0."
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
+    )
+    parser.add_argument("--eval-name", type=str, default="mmlu")
+    parser.add_argument("--num-examples", type=int)
+    parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--max-tokens", type=int, default=2048)
+    parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--reasoning-effort", type=str)
+    args = parser.parse_args()
+
+    run_eval(args)
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py
new file mode 100644
index 000000000..8ce2e2e20
--- /dev/null
+++ b/python/sglang/test/runners.py
@@ -0,0 +1,874 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import os
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForVision2Seq,
+    AutoProcessor,
+    GenerationConfig,
+)
+
+from sglang.srt.entrypoints.engine import Engine
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import load_image
+from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
+
+DEFAULT_PROMPTS = [
+    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
+    "The capital of the United Kingdom is",
+    "Today is a sunny day and I like",
+    "AI is a field of computer science focused on",
+    # the output of gemma-2-2b from SRT is unstable on the commented prompt
+    # "The capital of France is",
+]
+TEST_RERANK_QUERY_DOCS = [
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin is well known for its museums.",
+        ],
+    },
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
+            "Berlin is well known for its museums.",
+        ],
+    },
+]
+
+dirpath = os.path.dirname(__file__)
+with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
+    long_prompt = f.read()
+DEFAULT_PROMPTS.append(long_prompt)
+
+NUM_TOP_LOGPROBS = 5
+
+
+def get_dtype_str(torch_dtype):
+    if torch_dtype is torch.float16:
+        return "float16"
+    if torch_dtype is torch.float32:
+        return "float32"
+    else:
+        raise NotImplementedError()
+
+
+def get_top_logprobs(logits, k):
+    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    del logits
+    logprobs, top_indices = torch.topk(logprobs, k=k, dim=-1)
+    return logprobs
+
+
+def get_token_ids_logprobs(logits, token_ids):
+    logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    del logits
+    logprobs = logprobs[..., token_ids]
+    return logprobs
+
+
+def _get_sentence_transformer_embedding_model(model_path, torch_dtype):
+    from sentence_transformers import SentenceTransformer
+    from sentence_transformers.util import is_sentence_transformer_model
+
+    if is_sentence_transformer_model(model_path):
+        model = SentenceTransformer(
+            model_path,
+            model_kwargs={"torch_dtype": torch_dtype},
+        )
+    else:  # if no pre-trained sentence-transformers model
+        from sentence_transformers import models
+
+        word_embedding_model = models.Transformer(model_path).to(dtype=torch_dtype)
+        pooling_model = models.Pooling(
+            word_embedding_model.get_word_embedding_dimension(),
+            pooling_mode="lasttoken",
+        )
+        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+
+    return model.cuda()
+
+
+@dataclass
+class ModelOutput:
+    output_strs: List[str] = None
+    output_ids: List[int] = None
+    top_input_logprobs: List[torch.Tensor] = None
+    top_output_logprobs: List[torch.Tensor] = None
+    top_output_logprob_idx: List[List[int]] = None
+    embed_logits: List[torch.Tensor] = None
+    scores: List[float] = None
+    input_token_logprobs_lst: List[List[Tuple[float, int, None]]] = None
+    output_token_logprobs_lst: List[List[Tuple[float, int, None]]] = None
+    token_ids_input_logprobs: List[torch.Tensor] = None
+    token_ids_output_logprobs: List[torch.Tensor] = None
+
+
+class HFRunner:
+    def __init__(
+        self,
+        model_path: str,
+        torch_dtype: torch.dtype,
+        model_type: str = "generation",
+        output_str_only: bool = False,
+        trust_remote_code: bool = False,
+        patch_model_do_sample_false: bool = False,
+    ):
+        self.model_type = model_type
+        self.output_str_only = output_str_only
+        self.trust_remote_code = trust_remote_code
+        self.patch_model_do_sample_false = patch_model_do_sample_false
+
+        self.in_queue = mp.Queue()
+        self.out_queue = mp.Queue()
+
+        self.model_proc = mp.Process(
+            target=self.start_model_process,
+            args=(
+                self.in_queue,
+                self.out_queue,
+                model_path,
+                torch_dtype,
+            ),
+        )
+        self.model_proc.start()
+
+    def needs_trust_remote_code(self, model_path):
+        models_needs_trust_remote = [
+            "LxzGordon/URM-LLaMa-3.1-8B",
+        ]
+        if model_path in models_needs_trust_remote:
+            return True
+        return False
+
+    # copy from https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct/blob/main/gme_inference.py
+
+    def _get_gme_qwen2_vl_embeddings(
+        self, prompts, image_data: Optional[List[str]] = None
+    ):
+
+        images = None
+        if image_data is not None:
+            images = [load_image(image)[0] for image in image_data]
+
+        inputs = self.processor(
+            text=prompts,
+            images=images,
+            padding=True,
+            truncation=True,
+            max_length=1800,
+            return_tensors="pt",
+        )
+        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            embeddings = self._forward_gme_qwen2_vl(**inputs)
+        return embeddings.tolist()
+
+    def _forward_gme_qwen2_vl(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        pooling_mask: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.model.model.embed_tokens(input_ids)
+            if pixel_values is not None:
+                pixel_values = pixel_values.type(self.model.visual.get_dtype())
+                image_embeds = self.model.visual(
+                    pixel_values, grid_thw=image_grid_thw
+                ).to(inputs_embeds.device)
+                image_mask = input_ids == self.model.config.image_token_id
+                inputs_embeds[image_mask] = image_embeds
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(inputs_embeds.device)
+
+        outputs = self.model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+            return_dict=True,
+            inputs_embeds=inputs_embeds,
+            image_grid_thw=image_grid_thw,
+        )
+
+        embeddings = outputs.hidden_states[-1][:, -1]
+        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+        return embeddings.contiguous()
+
+    def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
+        # Apply model-specific patches
+        monkey_patch_gemma2_sdpa()
+
+        # Load the model and tokenizer
+        if self.model_type == "generation":
+            config = AutoConfig.from_pretrained(
+                model_path, trust_remote_code=self.trust_remote_code
+            )
+            if self.trust_remote_code:
+                model_cls = AutoModelForCausalLM
+            else:
+                model_arch = getattr(config, "architectures")[0]
+                model_cls = getattr(transformers, model_arch)
+            self.base_model = model_cls.from_pretrained(
+                model_path,
+                torch_dtype=torch_dtype,
+                trust_remote_code=self.trust_remote_code,
+                low_cpu_mem_usage=True,
+            ).cuda()
+        elif self.model_type == "embedding":
+            if "gme-qwen2-vl" in model_path.lower():
+                self.model = AutoModelForVision2Seq.from_pretrained(
+                    model_path,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=False,
+                    low_cpu_mem_usage=True,
+                ).cuda()
+                self.processor = AutoProcessor.from_pretrained(model_path)
+            elif "clip" in model_path.lower():
+                self.model = AutoModel.from_pretrained(model_path).cuda()
+                self.processor = AutoProcessor.from_pretrained(model_path)
+            else:
+                self.model = _get_sentence_transformer_embedding_model(
+                    model_path, torch_dtype
+                )
+        elif self.model_type == "reward" or self.model_type == "cross_encoder":
+            from transformers import AutoModelForSequenceClassification
+
+            self.model = AutoModelForSequenceClassification.from_pretrained(
+                model_path,
+                torch_dtype=torch_dtype,
+                trust_remote_code=self.needs_trust_remote_code(model_path),
+            ).cuda()
+        else:
+            raise Exception(f"Unrecognized model type {self.model_type}")
+        self.tokenizer = get_tokenizer(
+            model_path,
+            torch_dtype=torch.dtype,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+        # Run forward
+        while True:
+            prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob = (
+                in_queue.get()
+            )
+            if lora_paths is not None:
+                assert len(prompts) == len(lora_paths)
+
+            if prompts is not None:
+                if self.model_type == "generation":
+                    out_queue.put(
+                        self.forward_generation_raw(
+                            base_model=self.base_model,
+                            prompts=prompts,
+                            max_new_tokens=max_new_tokens,
+                            tokenizer=self.tokenizer,
+                            lora_paths=lora_paths,
+                            torch_dtype=torch_dtype,
+                            output_str_only=self.output_str_only,
+                            token_ids_logprob=token_ids_logprob,
+                            patch_model_do_sample_false=self.patch_model_do_sample_false,
+                        )
+                    )
+                elif self.model_type == "embedding":
+                    assert not self.output_str_only
+                    if "gme-qwen2-vl" in model_path.lower():
+                        logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
+                    elif "clip" in model_path.lower():
+                        if image_data is not None:
+                            image = load_image(image_data)
+                            inputs = self.processor(
+                                images=image[0], return_tensors="pt"
+                            )
+                            logits = self.model.get_image_features(
+                                pixel_values=inputs.data["pixel_values"].cuda(),
+                            ).tolist()
+                        else:
+                            inputs = self.tokenizer(
+                                prompts, padding=True, return_tensors="pt"
+                            )
+                            logits = self.model.get_text_features(
+                                input_ids=inputs.data["input_ids"].cuda(),
+                                attention_mask=inputs.data["attention_mask"].cuda(),
+                            ).tolist()
+                    else:
+                        logits = self.model.encode(prompts).tolist()
+                    out_queue.put(ModelOutput(embed_logits=logits))
+                elif self.model_type == "cross_encoder":
+                    inputs = self.tokenizer(
+                        prompts, padding=True, return_tensors="pt"
+                    ).to("cuda")
+                    scores = self.model(**inputs).logits
+                    scores = scores.squeeze().tolist()
+                    if not isinstance(scores, list):
+                        scores = [scores]
+                    out_queue.put(ModelOutput(scores=scores))
+
+                elif self.model_type == "reward":
+                    scores = []
+                    for conv in prompts:
+                        conv_formatted = self.tokenizer.apply_chat_template(
+                            conv, tokenize=False
+                        )
+                        conv_tokenized = self.tokenizer(
+                            conv_formatted, return_tensors="pt"
+                        ).to("cuda")
+                        scores.append(
+                            float(self.model(**conv_tokenized).logits[0][0].item())
+                        )
+                    out_queue.put(ModelOutput(scores=scores))
+                else:
+                    raise Exception(f"Unrecognized model type {self.model_type}")
+
+    def forward(
+        self,
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
+        max_new_tokens: int = 8,
+        lora_paths: Optional[List[str]] = None,
+        token_ids_logprob: Optional[int] = None,
+    ):
+        self.in_queue.put(
+            (prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob)
+        )
+        return self.out_queue.get()
+
+    def terminate(self):
+        self.model_proc.terminate()
+        self.in_queue = self.out_queue = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.model_proc.terminate()
+        self.in_queue = self.out_queue = None
+
+    @staticmethod
+    def forward_generation_raw(
+        base_model,
+        prompts: Union[List[str], List[torch.Tensor]],
+        max_new_tokens: int,
+        tokenizer,
+        torch_dtype: torch.dtype,
+        lora_paths: Optional[List[str]] = None,
+        output_str_only: bool = False,
+        token_ids_logprob: Optional[int] = None,
+        patch_model_do_sample_false: Optional[bool] = False,
+    ) -> ModelOutput:
+        output_strs = []
+        top_input_logprobs = []
+        top_output_logprobs = []
+        if token_ids_logprob is not None:
+            token_ids_input_logprobs = []
+            token_ids_output_logprobs = []
+        else:
+            token_ids_input_logprobs = token_ids_output_logprobs = None
+
+        for i, p in enumerate(prompts):
+            if isinstance(p, str):
+                input_ids = tokenizer.encode(p, return_tensors="pt").cuda()
+            else:
+                input_ids = torch.tensor([p], device="cuda")
+
+            if lora_paths is not None and lora_paths[i] is not None:
+                from peft import PeftModel
+
+                model = PeftModel.from_pretrained(
+                    base_model,
+                    lora_paths[i],
+                    torch_dtype=torch_dtype,
+                    is_trainable=False,
+                )
+            else:
+                model = base_model
+            if patch_model_do_sample_false:
+                model.generation_config.do_sample = False
+            outputs = model.generate(
+                input_ids=input_ids,
+                generation_config=GenerationConfig(
+                    do_sample=False,
+                    temperature=None,
+                    top_p=None,
+                    max_new_tokens=max_new_tokens,
+                    return_dict_in_generate=True,
+                    output_scores=(not output_str_only),
+                    # make sure to disable compile
+                    disable_compile=True,
+                ),
+            )
+
+            text = tokenizer.decode(
+                outputs[0][0][len(input_ids[0]) :], skip_special_tokens=True
+            )
+            # Check if the text is empty or only whitespace.
+            if not text.strip():
+                raise ValueError(
+                    "Received an empty text response. Please verify your input or model configuration."
+                )
+            output_strs.append(text)
+
+            if not output_str_only:
+                # outputs.scores: (num_token, 1, vocab_size)
+                top_output_logprobs.append(
+                    [
+                        get_top_logprobs(logits[0], NUM_TOP_LOGPROBS).tolist()
+                        for logits in outputs.scores
+                    ]
+                )
+                if token_ids_logprob is not None:
+                    token_ids_output_logprobs.append(
+                        [
+                            get_token_ids_logprobs(
+                                logits[0], token_ids_logprob
+                            ).tolist()
+                            for logits in outputs.scores
+                        ]
+                    )
+                del outputs
+
+                input_logits = model.forward(input_ids).logits[0]
+                top_input_logprobs.append(
+                    get_top_logprobs(input_logits, NUM_TOP_LOGPROBS).tolist()
+                )
+                if token_ids_logprob is not None:
+                    token_ids_input_logprobs.append(
+                        get_token_ids_logprobs(input_logits, token_ids_logprob).tolist()
+                    )
+                del input_logits
+
+            if lora_paths is not None and lora_paths[i] is not None:
+                # Unload the LoRA adapter if it is used
+                model.unload()
+
+        return ModelOutput(
+            output_strs=output_strs,
+            top_input_logprobs=top_input_logprobs,
+            top_output_logprobs=top_output_logprobs,
+            token_ids_input_logprobs=token_ids_input_logprobs,
+            token_ids_output_logprobs=token_ids_output_logprobs,
+        )
+
+
+class SRTRunner:
+    def __init__(
+        self,
+        model_path: str,
+        torch_dtype: torch.dtype,
+        model_type: str,
+        tp_size: int = 1,
+        model_impl: str = "auto",
+        port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
+        lora_paths: Optional[Union[List[str], List[dict[str, str]]]] = None,
+        max_loras_per_batch: int = 4,
+        attention_backend: Optional[str] = None,
+        prefill_attention_backend: Optional[str] = None,
+        decode_attention_backend: Optional[str] = None,
+        lora_backend: str = "triton",
+        disable_cuda_graph: bool = False,
+        disable_radix_cache: bool = False,
+        chunked_prefill_size: Optional[int] = None,
+        dp_size: int = 1,
+        tokenizer_path: Optional[str] = None,
+        mem_fraction_static: float = 0.65,
+        trust_remote_code: bool = False,
+        speculative_draft_model_path: Optional[str] = None,
+        speculative_draft_model_revision: Optional[str] = None,
+        speculative_algorithm: Optional[str] = None,
+        speculative_num_steps: Optional[int] = None,
+        speculative_eagle_topk: Optional[int] = None,
+        speculative_num_draft_tokens: Optional[int] = None,
+        disable_overlap_schedule: bool = False,
+        disable_custom_all_reduce: bool = False,
+        torchao_config: Optional[str] = None,
+        cuda_graph_max_bs: int = 4,
+        sleep_on_idle=False,
+        max_lora_rank: Optional[int] = None,
+        lora_target_modules: Optional[List[str]] = None,
+        enable_lora: Optional[bool] = None,
+        max_loaded_loras: Optional[int] = None,
+    ):
+        self.model_type = model_type
+        self.is_generation = model_type == "generation"
+        enable_dp_attention = dp_size > 1
+
+        spec_kwargs = {}
+        if speculative_draft_model_path:
+            spec_kwargs["speculative_draft_model_path"] = speculative_draft_model_path
+            spec_kwargs["speculative_draft_model_revision"] = (
+                speculative_draft_model_revision
+            )
+            spec_kwargs["speculative_algorithm"] = speculative_algorithm
+            spec_kwargs["speculative_num_steps"] = speculative_num_steps
+            spec_kwargs["speculative_eagle_topk"] = speculative_eagle_topk
+            spec_kwargs["speculative_num_draft_tokens"] = speculative_num_draft_tokens
+
+        self.engine = Engine(
+            model_path=model_path,
+            tp_size=tp_size,
+            dtype=get_dtype_str(torch_dtype),
+            port=port,
+            model_impl=model_impl,
+            torchao_config=torchao_config,
+            mem_fraction_static=mem_fraction_static,
+            trust_remote_code=trust_remote_code,
+            is_embedding=not self.is_generation,
+            lora_paths=lora_paths,
+            max_loras_per_batch=max_loras_per_batch,
+            lora_backend=lora_backend,
+            attention_backend=attention_backend,
+            prefill_attention_backend=prefill_attention_backend,
+            decode_attention_backend=decode_attention_backend,
+            disable_cuda_graph=disable_cuda_graph,
+            disable_radix_cache=disable_radix_cache,
+            chunked_prefill_size=chunked_prefill_size,
+            enable_dp_attention=enable_dp_attention,
+            dp_size=dp_size,
+            tokenizer_path=tokenizer_path,
+            disable_overlap_schedule=disable_overlap_schedule,
+            cuda_graph_max_bs=cuda_graph_max_bs,
+            disable_custom_all_reduce=disable_custom_all_reduce,
+            sleep_on_idle=sleep_on_idle,
+            max_lora_rank=max_lora_rank,
+            lora_target_modules=lora_target_modules,
+            enable_lora=enable_lora,
+            max_loaded_loras=max_loaded_loras,
+            **spec_kwargs,
+        )
+
+        if tokenizer_path is None:
+            self.tokenizer = get_tokenizer(
+                model_path, trust_remote_code=trust_remote_code
+            )
+        else:
+            self.tokenizer = None
+
+    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
+        return self.engine.load_lora_adapter(lora_name, lora_path, pinned)
+
+    def unload_lora_adapter(self, lora_name: str):
+        return self.engine.unload_lora_adapter(lora_name)
+
+    def forward(
+        self,
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
+        max_new_tokens: int = 8,
+        lora_paths: Optional[List[str]] = None,
+        logprob_start_len: int = 0,
+        top_k: Optional[int] = None,
+        token_ids_logprob: Optional[List[int]] = None,
+    ):
+        if self.is_generation:
+            return self.forward_generation_raw(
+                engine=self.engine,
+                prompts=prompts,
+                max_new_tokens=max_new_tokens,
+                lora_paths=lora_paths,
+                logprob_start_len=logprob_start_len,
+                top_k=top_k,
+                token_ids_logprob=token_ids_logprob,
+            )
+        else:
+            if self.model_type == "embedding":
+                response = self.engine.encode(prompt=prompts, image_data=image_data)
+                if isinstance(response, list):
+                    logits = [x["embedding"] for x in response]
+                else:
+                    logits = [response["embedding"]]
+                return ModelOutput(embed_logits=logits)
+            # cross encoder model
+            elif self.model_type == "cross_encoder":
+                response = self.engine.rerank(prompts)
+                if not isinstance(response, list):
+                    response = [response]
+                scores = [x["embedding"] for x in response]
+                return ModelOutput(scores=scores)
+            # reward model
+            else:
+                response = self.engine.encode(prompts)
+                scores = [x["embedding"][0] for x in response]
+                return ModelOutput(scores=scores)
+
+    def batch_forward(
+        self,
+        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
+        max_new_tokens=8,
+        lora_paths=None,
+    ):
+        """
+        testing serving by sending all prompts once
+        only return output strings and no logprobs
+        """
+        if self.is_generation:
+            return self.batch_forward_generation_raw(
+                engine=self.engine,
+                prompts=prompts,
+                max_new_tokens=max_new_tokens,
+                lora_paths=lora_paths,
+            )
+        else:
+            response = self.engine.encode(prompts, image_data)
+            if self.model_type == "embedding":
+                logits = [x["embedding"] for x in response]
+                return ModelOutput(embed_logits=logits)
+            else:
+                scores = [x["embedding"][0] for x in response]
+                return ModelOutput(scores=scores)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.engine.shutdown()
+        del self.engine
+
+    @staticmethod
+    def forward_generation_raw(
+        engine: Engine,
+        prompts: Union[List[str], List[torch.Tensor]],
+        max_new_tokens: int = 8,
+        lora_paths: Optional[List[str]] = None,
+        logprob_start_len: int = 0,
+        top_k: Optional[int] = None,
+        token_ids_logprob: Optional[List[int]] = None,
+    ):
+        # the return value contains logprobs from prefill
+        output_strs = []
+        output_ids = []
+        # Input logprobs. Note that the last item in input logprob is equivalent to
+        # the first item in the output logprob.
+        top_input_logprobs = []
+        input_token_logprobs_lst = []
+        top_output_logprobs = []
+        output_token_logprobs_lst = []
+        top_output_logprob_idx = []
+        if token_ids_logprob is not None:
+            token_ids_input_logprobs = []
+            token_ids_output_logprobs = []
+        else:
+            token_ids_input_logprobs = token_ids_output_logprobs = None
+
+        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+        if top_k:
+            sampling_params["top_k"] = top_k
+
+        for i, prompt in enumerate(prompts):
+            response = engine.generate(
+                prompt,
+                lora_path=lora_paths[i] if lora_paths else None,
+                sampling_params=sampling_params,
+                return_logprob=True,
+                logprob_start_len=logprob_start_len,
+                top_logprobs_num=NUM_TOP_LOGPROBS,
+                token_ids_logprob=token_ids_logprob,
+            )
+            text = response["text"]
+
+            # Check if the text is empty or only whitespace.
+            if not text.strip():
+                raise ValueError(
+                    "Received an empty text response. Please verify your input or model configuration."
+                )
+            output_strs.append(text)
+            # output_ids.append(response["output_ids"])
+
+            input_token_logprobs = response["meta_info"]["input_token_logprobs"]
+            output_token_logprobs = response["meta_info"]["output_token_logprobs"]
+            # print(i, input_token_logprobs)
+            # print(i, output_token_logprobs)
+            logprobs = response["meta_info"]["input_top_logprobs"]
+            if token_ids_logprob is not None:
+                input_token_ids_logprobs = response["meta_info"][
+                    "input_token_ids_logprobs"
+                ][1:]
+            else:
+                input_token_ids_logprobs = None
+
+            num_prompt_tokens = response["meta_info"]["prompt_tokens"]
+            assert len(input_token_logprobs) == num_prompt_tokens - logprob_start_len
+            assert len(logprobs) == num_prompt_tokens - logprob_start_len
+
+            # The first token logprob has no meaning in sglang.
+            input_token_logprobs = input_token_logprobs[1:]
+            logprobs = logprobs[1:]
+            assert len(input_token_logprobs) == len(logprobs)
+
+            input_token_logprobs_lst.append(
+                input_token_logprobs + [output_token_logprobs[0]]
+            )
+            output_token_logprobs_lst.append(output_token_logprobs)
+
+            top_input_logprobs.append(
+                [[tup[0] for tup in x[:NUM_TOP_LOGPROBS]] for x in logprobs]
+                + [
+                    [
+                        tup[0]
+                        for tup in response["meta_info"]["output_top_logprobs"][0][
+                            :NUM_TOP_LOGPROBS
+                        ]
+                    ]
+                ]
+            )
+            top_output_logprobs.append(
+                [
+                    [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
+                    for x in response["meta_info"]["output_top_logprobs"]
+                ]
+            )
+            top_output_logprob_idx.append(
+                [
+                    [tup[1] for tup in x[:NUM_TOP_LOGPROBS]]
+                    for x in response["meta_info"]["output_top_logprobs"]
+                ]
+            )
+            if token_ids_logprob is not None:
+                token_ids_input_logprobs.append(
+                    [[tup[0] for tup in x] for x in input_token_ids_logprobs]
+                    + [
+                        [
+                            tup[0]
+                            for tup in response["meta_info"][
+                                "output_token_ids_logprobs"
+                            ][0]
+                        ]
+                    ]
+                )
+                token_ids_output_logprobs.append(
+                    [
+                        [tup[0] for tup in x]
+                        for x in response["meta_info"]["output_token_ids_logprobs"]
+                    ]
+                )
+
+        return ModelOutput(
+            output_strs=output_strs,
+            output_ids=output_ids,
+            top_input_logprobs=top_input_logprobs,
+            top_output_logprobs=top_output_logprobs,
+            input_token_logprobs_lst=input_token_logprobs_lst,
+            output_token_logprobs_lst=output_token_logprobs_lst,
+            top_output_logprob_idx=top_output_logprob_idx,
+            token_ids_input_logprobs=token_ids_input_logprobs,
+            token_ids_output_logprobs=token_ids_output_logprobs,
+        )
+
+    @staticmethod
+    def batch_forward_generation_raw(
+        prompts: Union[List[str], List[torch.Tensor]],
+        max_new_tokens,
+        lora_paths,
+        engine,
+    ):
+        # the return value contains logprobs from prefill
+        output_strs = []
+        sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+        response = engine.generate(
+            prompts,
+            lora_path=lora_paths if lora_paths else None,
+            sampling_params=sampling_params,
+        )
+        output_strs = [r["text"] for r in response]
+
+        return ModelOutput(
+            output_strs=output_strs,
+        )
+
+
+def monkey_patch_gemma2_sdpa():
+    """
+    Use sdpa by default to fix the OOM issue.
+    Revert this commit:
+    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
+    """
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2PreTrainedModel
+
+    def _check_and_enable_sdpa(config, hard_check_only: bool = False):
+        config._attn_implementation = "sdpa"
+        return config
+
+    setattr(Gemma2PreTrainedModel, "_check_and_enable_sdpa", _check_and_enable_sdpa)
+
+
+def check_close_model_outputs(
+    hf_outputs: ModelOutput,
+    srt_outputs: ModelOutput,
+    prefill_tolerance: float,
+    decode_tolerance: float,
+    rouge_l_tolerance: float,
+    debug_text: str = "",
+    check_logprobs: bool = True,
+):
+    # Compare output strings
+    print(f"{hf_outputs.output_strs=}")
+    print(f"{srt_outputs.output_strs=}")
+    rouge_l_scores = calculate_rouge_l(hf_outputs.output_strs, srt_outputs.output_strs)
+    print(f"{rouge_l_scores=}")
+    assert all(
+        score >= rouge_l_tolerance for score in rouge_l_scores
+    ), f"Not all ROUGE-L scores are greater than rouge_l_tolerance={rouge_l_tolerance}"
+
+    if check_logprobs:
+        for i in range(len(hf_outputs.output_strs)):
+            # Compare input logprobs
+            hf_logprobs = torch.Tensor(hf_outputs.top_input_logprobs[i])
+            srt_logprobs = torch.Tensor(srt_outputs.top_input_logprobs[i])
+            input_len = hf_logprobs.shape[0]
+            print(
+                "prefill logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))
+            )
+            if input_len <= 100:
+                assert torch.all(abs(hf_logprobs - srt_logprobs) < prefill_tolerance), (
+                    f"prefill logprobs are not all close with {debug_text} "
+                    f"prefill_tolerance={prefill_tolerance}."
+                    f"{hf_logprobs=}, {srt_logprobs=}"
+                )
+
+            # Compare output logprobs
+            hf_logprobs = torch.Tensor(hf_outputs.top_output_logprobs[i])
+            srt_logprobs = torch.Tensor(srt_outputs.top_output_logprobs[i])
+
+            print(
+                "decode logprobs max_diff", torch.max(abs(hf_logprobs - srt_logprobs))
+            )
+            if input_len <= 100:
+                assert torch.all(abs(hf_logprobs - srt_logprobs) < decode_tolerance), (
+                    f"decode logprobs are not all close with {debug_text} "
+                    f"decode_tolerance={decode_tolerance}."
+                    f"{hf_logprobs=}, {srt_logprobs=}"
+                )
diff --git a/python/sglang/test/send_one.py b/python/sglang/test/send_one.py
new file mode 100644
index 000000000..a0aec308d
--- /dev/null
+++ b/python/sglang/test/send_one.py
@@ -0,0 +1,158 @@
+"""
+Run one test prompt.
+
+Usage:
+python3 -m sglang.test.send_one
+"""
+
+import argparse
+import dataclasses
+import json
+
+import requests
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    host: str = "localhost"
+    port: int = 30000
+    batch_size: int = 1
+    temperature: float = 0.0
+    max_new_tokens: int = 512
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    json: bool = False
+    return_logprob: bool = False
+    prompt: str = (
+        "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
+    )
+    image: bool = False
+    many_images: bool = False
+    stream: bool = False
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--host", type=str, default=BenchArgs.host)
+        parser.add_argument("--port", type=int, default=BenchArgs.port)
+        parser.add_argument("--batch-size", type=int, default=BenchArgs.batch_size)
+        parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+        parser.add_argument(
+            "--max-new-tokens", type=int, default=BenchArgs.max_new_tokens
+        )
+        parser.add_argument(
+            "--frequency-penalty", type=float, default=BenchArgs.frequency_penalty
+        )
+        parser.add_argument(
+            "--presence-penalty", type=float, default=BenchArgs.presence_penalty
+        )
+        parser.add_argument("--json", action="store_true")
+        parser.add_argument("--return-logprob", action="store_true")
+        parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
+        parser.add_argument("--image", action="store_true")
+        parser.add_argument("--many-images", action="store_true")
+        parser.add_argument("--stream", action="store_true")
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def send_one_prompt(args):
+    if args.image:
+        args.prompt = (
+            "Human: Describe this image in a very short sentence.\n\nAssistant:"
+        )
+        image_data = "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png"
+    elif args.many_images:
+        args.prompt = (
+            "Human: I have one reference image and many images."
+            "Describe their relationship in a very short sentence.\n\nAssistant:"
+        )
+        image_data = [
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+        ]
+    else:
+        image_data = None
+
+    prompt = args.prompt
+
+    if args.json:
+        prompt = (
+            "Human: What is the capital of France and how is that city like. "
+            "Give me 3 trivial information about that city. "
+            "Write in a format of json.\nAssistant:"
+        )
+        json_schema = "$$ANY$$"
+    else:
+        json_schema = None
+
+    if args.batch_size > 1:
+        prompt = [prompt] * args.batch_size
+
+    json_data = {
+        "text": prompt,
+        "image_data": image_data,
+        "sampling_params": {
+            "temperature": args.temperature,
+            "max_new_tokens": args.max_new_tokens,
+            "frequency_penalty": args.frequency_penalty,
+            "presence_penalty": args.presence_penalty,
+            "json_schema": json_schema,
+            "stop": ["Question", "Assistant:", "<|separator|>", "<|eos|>"],
+        },
+        "return_logprob": args.return_logprob,
+        "stream": args.stream,
+    }
+
+    response = requests.post(
+        f"http://{args.host}:{args.port}/generate",
+        json=json_data,
+        stream=args.stream,
+    )
+
+    if args.stream:
+        for chunk in response.iter_lines(decode_unicode=False):
+            chunk = chunk.decode("utf-8")
+            if chunk and chunk.startswith("data:"):
+                if chunk == "data: [DONE]":
+                    break
+                ret = json.loads(chunk[5:].strip("\n"))
+    else:
+        ret = response.json()
+
+    if args.batch_size > 1:
+        ret = ret[0]
+
+    if response.status_code != 200:
+        print(ret)
+        return 0, 0
+
+    latency = ret["meta_info"]["e2e_latency"]
+
+    if "spec_verify_ct" in ret["meta_info"]:
+        acc_length = (
+            ret["meta_info"]["completion_tokens"] / ret["meta_info"]["spec_verify_ct"]
+        )
+    else:
+        acc_length = 1.0
+
+    speed = ret["meta_info"]["completion_tokens"] / latency
+
+    print(ret["text"])
+    print()
+    print(f"{acc_length=:.2f}")
+    print(f"{speed=:.2f} token/s")
+
+    return acc_length, speed
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    send_one_prompt(args)
diff --git a/python/sglang/test/simple_eval_common.py b/python/sglang/test/simple_eval_common.py
new file mode 100644
index 000000000..1816a703e
--- /dev/null
+++ b/python/sglang/test/simple_eval_common.py
@@ -0,0 +1,472 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+import os
+import resource
+import time
+from collections import defaultdict
+from dataclasses import dataclass, field
+from multiprocessing.pool import ThreadPool
+from typing import Any, Dict, List, Optional, Tuple
+
+import httpx
+import jinja2
+import numpy as np
+import openai
+import requests
+from openai import OpenAI
+from tqdm import tqdm
+
+OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."
+OPENAI_SYSTEM_MESSAGE_CHATGPT = (
+    "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture."
+    + "\nKnowledge cutoff: 2023-12\nCurrent date: 2024-04-01"
+)
+
+
+Message = Dict[str, Any]  # keys role, content
+MessageList = List[Message]
+
+
+class SamplerBase:
+    """
+    Base class for defining a sampling model, which can be evaluated,
+    or used as part of the grading process.
+    """
+
+    def __call__(self, message_list: MessageList) -> str:
+        raise NotImplementedError()
+
+
+@dataclass
+class EvalResult:
+    """
+    Result of running an evaluation (usually consisting of many samples)
+    """
+
+    score: Optional[float]  # top-line metric
+    metrics: Optional[Dict[str, float]]  # other metrics
+    htmls: List[str]  # strings of valid HTML
+    convos: List[MessageList]  # sampled conversations
+
+
+@dataclass
+class SingleEvalResult:
+    """
+    Result of evaluating a single sample
+    """
+
+    score: Optional[float]
+    metrics: Dict[str, float] = field(default_factory=dict)
+    html: Optional[str] = None
+    convo: Optional[MessageList] = None  # sampled conversation
+
+
+class Eval:
+    """
+    Base class for defining an evaluation.
+    """
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        raise NotImplementedError()
+
+
+class LargerHttpxClient(httpx.Client):
+    def __init__(self):
+        timeout_config = httpx.Timeout(3600)
+        limits = httpx.Limits(
+            max_keepalive_connections=3600,
+            max_connections=3600,
+        )
+        super().__init__(timeout=timeout_config, limits=limits)
+
+
+class ChatCompletionSampler(SamplerBase):
+    """
+    Sample from OpenAI's chat completion API
+    """
+
+    def __init__(
+        self,
+        base_url: str = None,
+        model: Optional[str] = None,
+        system_message: Optional[str] = None,
+        temperature: float = 0.0,
+        reasoning_effort: Optional[str] = None,
+        max_tokens: int = 2048,
+    ):
+        self.client = OpenAI(base_url=base_url, http_client=LargerHttpxClient())
+
+        if model is None:
+            model = self.client.models.list().data[0].id
+
+        self.model = model
+        self.system_message = system_message
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.reasoning_effort = reasoning_effort
+        self.image_format = "url"
+        print(
+            f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=}"
+        )
+
+    def _handle_image(
+        self,
+        image: str,
+        encoding: str = "base64",
+        format: str = "png",
+        fovea: int = 768,
+    ):
+        new_image = {
+            "type": "image_url",
+            "image_url": {
+                "url": f"data:image/{format};{encoding},{image}",
+            },
+        }
+        return new_image
+
+    def _handle_text(self, text: str):
+        return {"type": "text", "text": text}
+
+    def _pack_message(self, role: str, content: Any):
+        return {"role": str(role), "content": content}
+
+    def __call__(self, message_list: MessageList) -> str:
+        if self.system_message:
+            message_list = [
+                self._pack_message("system", self.system_message)
+            ] + message_list
+        trial = 0
+        while True:
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=message_list,
+                    temperature=self.temperature,
+                    max_tokens=self.max_tokens,
+                    reasoning_effort=self.reasoning_effort,
+                )
+                return response.choices[0].message.content
+            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
+            except openai.BadRequestError as e:
+                print("Bad Request Error", e)
+                return ""
+            except Exception as e:
+                exception_backoff = 2**trial  # expontial back off
+                print(
+                    f"Rate limit exception so wait and retry {trial} after {exception_backoff} sec",
+                    e,
+                )
+                time.sleep(exception_backoff)
+                trial += 1
+            # unknown error shall throw exception
+
+
+QUERY_TEMPLATE_MULTICHOICE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
+
+{Question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([A-D])"
+ANSWER_PATTERN = r"(?i)Answer\s*:\s*([^\n]+)"
+
+
+EQUALITY_TEMPLATE = r"""
+Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
+
+Examples:
+
+    Expression 1: $2x+3$
+    Expression 2: $3+2x$
+
+Yes
+
+    Expression 1: 3/2
+    Expression 2: 1.5
+
+Yes
+
+    Expression 1: $x^2+2x+1$
+    Expression 2: $y^2+2y+1$
+
+No
+
+    Expression 1: $x^2+2x+1$
+    Expression 2: $(x+1)^2$
+
+Yes
+
+    Expression 1: 3245/5
+    Expression 2: 649
+
+No
+(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
+
+    Expression 1: 2/(-3)
+    Expression 2: -2/3
+
+Yes
+(trivial simplifications are allowed)
+
+    Expression 1: 72 degrees
+    Expression 2: 72
+
+Yes
+(give benefit of the doubt to units)
+
+    Expression 1: 64
+    Expression 2: 64 square feet
+
+Yes
+(give benefit of the doubt to units)
+
+---
+
+YOUR TASK
+
+
+Respond with only "Yes" or "No" (without quotes). Do not include a rationale.
+
+    Expression 1: %(expression1)s
+    Expression 2: %(expression2)s
+""".strip()
+
+
+HTML_JINJA = """
+<h3>Prompt conversation</h3>
+{% for message in prompt_messages %}
+{{ message_to_html(message) | safe }}
+{% endfor %}
+<h3>Sampled message</h3>
+{{ message_to_html(next_message) | safe }}
+<h3>Results</h3>
+<p>Correct Answer: {{ correct_answer }}</p>
+<p>Extracted Answer: {{ extracted_answer }}</p>
+<p>Score: {{ score }}</p>
+"""
+
+
+def format_multichoice_question(row):
+    return QUERY_TEMPLATE_MULTICHOICE.format(**row)
+
+
+def check_equality(sampler: SamplerBase, expr1: str, expr2: str):
+    prompt = EQUALITY_TEMPLATE % {"expression1": expr1, "expression2": expr2}
+    response = sampler([dict(content=prompt, role="user")])
+    return response.lower().strip() == "yes"
+
+
+def _compute_stat(values: list, stat: str):
+    if stat == "mean":
+        return np.mean(values)
+    elif stat == "std":
+        return np.std(values)
+    elif stat == "min":
+        return np.min(values)
+    elif stat == "max":
+        return np.max(values)
+    else:
+        raise ValueError(f"Unknown {stat =}")
+
+
+def aggregate_results(
+    single_eval_results: List[SingleEvalResult],
+    default_stats: Tuple[str] = ("mean", "std"),
+    name2stats: Optional[Dict[str, Tuple[str]]] = None,
+) -> EvalResult:
+    """
+    Aggregate results from multiple evaluations into a single EvalResult.
+    """
+    name2stats = name2stats or {}
+    name2values = defaultdict(list)
+    htmls = []
+    convos = []
+    for single_eval_result in single_eval_results:
+        for name, value in single_eval_result.metrics.items():
+            name2values[name].append(value)
+        if single_eval_result.score is not None:
+            name2values["score"].append(single_eval_result.score)
+        htmls.append(single_eval_result.html)
+        convos.append(single_eval_result.convo)
+    final_metrics = {}
+    for name, values in name2values.items():
+        stats = name2stats.get(name, default_stats)
+        for stat in stats:
+            key = name if stat == "mean" else f"{name}:{stat}"
+            final_metrics[key] = _compute_stat(values, stat)
+    return EvalResult(
+        score=final_metrics.pop("score", None),
+        metrics=final_metrics,
+        htmls=htmls,
+        convos=convos,
+    )
+
+
+def map_with_progress(f: callable, xs: List[Any], num_threads: int):
+    """
+    Apply f to each element of xs, using a ThreadPool, and show progress.
+    """
+    if os.getenv("debug"):
+        return list(map(f, tqdm(xs, total=len(xs))))
+    else:
+        with ThreadPool(min(num_threads, len(xs))) as pool:
+            return list(tqdm(pool.imap(f, xs), total=len(xs)))
+
+
+jinja_env = jinja2.Environment(
+    loader=jinja2.BaseLoader(),
+    undefined=jinja2.StrictUndefined,
+    autoescape=jinja2.select_autoescape(["html", "xml"]),
+)
+_message_template = """
+<div class="message {{ role }}">
+    <div class="role">
+    {{ role }}
+    {% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
+    </div>
+    <div class="content">
+    <pre>{{ content }}</pre>
+    </div>
+</div>
+"""
+
+
+def message_to_html(message: Message) -> str:
+    """
+    Generate HTML snippet (inside a <div>) for a message.
+    """
+    return jinja_env.from_string(_message_template).render(
+        role=message["role"],
+        content=message["content"],
+        variant=message.get("variant", None),
+    )
+
+
+jinja_env.globals["message_to_html"] = message_to_html
+
+
+_report_template = """<!DOCTYPE html>
+<html>
+    <head>
+        <style>
+            .message {
+                padding: 8px 16px;
+                margin-bottom: 8px;
+                border-radius: 4px;
+            }
+            .message.user {
+                background-color: #B2DFDB;
+                color: #00695C;
+            }
+            .message.assistant {
+                background-color: #B39DDB;
+                color: #4527A0;
+            }
+            .message.system {
+                background-color: #EEEEEE;
+                color: #212121;
+            }
+            .role {
+                font-weight: bold;
+                margin-bottom: 4px;
+            }
+            .variant {
+                color: #795548;
+            }
+            table, th, td {
+                border: 1px solid black;
+            }
+            pre {
+                white-space: pre-wrap;
+            }
+        </style>
+    </head>
+    <body>
+    {% if metrics %}
+    <h1>Metrics</h1>
+    <table>
+    <tr>
+        <th>Metric</th>
+        <th>Value</th>
+    </tr>
+    <tr>
+        <td><b>Score</b></td>
+        <td>{{ score | float | round(3) }}</td>
+    </tr>
+    {% for name, value in metrics.items() %}
+    <tr>
+        <td>{{ name }}</td>
+        <td>{{ value }}</td>
+    </tr>
+    {% endfor %}
+    </table>
+    {% endif %}
+    <h1>Examples</h1>
+    {% for html in htmls %}
+    {{ html | safe }}
+    <hr>
+    {% endfor %}
+    </body>
+</html>
+"""
+
+
+def make_report(eval_result: EvalResult) -> str:
+    """
+    Create a standalone HTML report from an EvalResult.
+    """
+    return jinja_env.from_string(_report_template).render(
+        score=eval_result.score,
+        metrics=eval_result.metrics,
+        htmls=eval_result.htmls,
+    )
+
+
+def make_report_from_example_htmls(htmls: List[str]):
+    """
+    Create a standalone HTML report from a list of example htmls
+    """
+    return jinja_env.from_string(_report_template).render(
+        score=None, metrics={}, htmls=htmls
+    )
+
+
+def download_dataset(path, url):
+    print(f"Downloading dataset {path} from {url}")
+    try:
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+
+        total_size = int(response.headers.get("content-length", 0))
+        block_size = 8192
+
+        with open(path, "wb") as f, tqdm(
+            desc="Downloading",
+            total=total_size,
+            unit="iB",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as progress_bar:
+            for data in response.iter_content(block_size):
+                size = f.write(data)
+                progress_bar.update(size)
+
+        print(f"Dataset downloaded and saved to {path}")
+    except requests.RequestException as e:
+        raise Exception(f"Failed to download dataset: {e}")
+
+
+def set_ulimit(target_soft_limit=65535):
+    resource_type = resource.RLIMIT_NOFILE
+    current_soft, current_hard = resource.getrlimit(resource_type)
+
+    if current_soft < target_soft_limit:
+        try:
+            resource.setrlimit(resource_type, (target_soft_limit, current_hard))
+        except ValueError as e:
+            print(f"Fail to set RLIMIT_NOFILE: {e}")
diff --git a/python/sglang/test/simple_eval_gpqa.py b/python/sglang/test/simple_eval_gpqa.py
new file mode 100644
index 000000000..b77ca773e
--- /dev/null
+++ b/python/sglang/test/simple_eval_gpqa.py
@@ -0,0 +1,95 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+"""
+GPQA: A Graduate-Level Google-Proof Q&A Benchmark
+David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman
+https://arxiv.org/abs/2311.12022
+"""
+
+import random
+import re
+from typing import Optional
+
+import pandas
+
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    ANSWER_PATTERN_MULTICHOICE,
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    MessageList,
+    SamplerBase,
+    SingleEvalResult,
+    format_multichoice_question,
+)
+
+
+class GPQAEval(Eval):
+    def __init__(
+        self,
+        filename: str,
+        num_examples: Optional[int],
+        num_threads: int,
+        n_repeats: int = 1,
+    ):
+        df = pandas.read_csv(filename)
+        examples = [row.to_dict() for _, row in df.iterrows()]
+        rng = random.Random(0)
+        if num_examples:
+            assert n_repeats == 1, "n_repeats only supported for num_examples"
+            examples = rng.sample(examples, num_examples)
+        examples = examples * n_repeats
+        examples = [
+            example | {"permutation": rng.sample(range(4), 4)} for example in examples
+        ]
+        self.examples = examples
+        self.n_repeats = n_repeats
+        self.num_threads = num_threads
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        def fn(row: dict):
+            choices = [
+                row["Correct Answer"],
+                row["Incorrect Answer 1"],
+                row["Incorrect Answer 2"],
+                row["Incorrect Answer 3"],
+            ]
+            choices = [choices[i] for i in row["permutation"]]
+            correct_index = choices.index(row["Correct Answer"])
+            correct_answer = "ABCD"[correct_index]
+            choices_dict = dict(
+                A=choices[0],
+                B=choices[1],
+                C=choices[2],
+                D=choices[3],
+                Question=row["Question"],
+            )
+            prompt_messages = [
+                sampler._pack_message(
+                    content=format_multichoice_question(choices_dict), role="user"
+                )
+            ]
+            response_text = sampler(prompt_messages)
+            if response_text is None:
+                response_text = ""
+            match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
+            extracted_answer = match.group(1) if match else None
+            score = 1.0 if extracted_answer == correct_answer else 0.0
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=response_text, role="assistant"),
+                score=score,
+                correct_answer=correct_answer,
+                extracted_answer=extracted_answer,
+            )
+            convo = prompt_messages + [dict(content=response_text, role="assistant")]
+            return SingleEvalResult(
+                html=html,
+                score=score,
+                convo=convo,
+                metrics={"chars": len(response_text)},
+            )
+
+        results = common.map_with_progress(fn, self.examples, self.num_threads)
+        return common.aggregate_results(results)
diff --git a/python/sglang/test/simple_eval_humaneval.py b/python/sglang/test/simple_eval_humaneval.py
new file mode 100644
index 000000000..25dcdd53a
--- /dev/null
+++ b/python/sglang/test/simple_eval_humaneval.py
@@ -0,0 +1,133 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+"""
+HumanEval: Evaluating Large Language Models Trained on Code
+Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba
+https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
+"""
+
+import random
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Dict, List, Optional
+
+import tqdm
+
+try:
+    from human_eval.data import read_problems
+    from human_eval.evaluation import estimate_pass_at_k
+    from human_eval.execution import check_correctness  # , unsafe_execute
+except (ImportError, ModuleNotFoundError):
+    print("\nPlease install human-eval at https://github.com/openai/human-eval.\n")
+    raise
+
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    SamplerBase,
+    SingleEvalResult,
+)
+
+
+def evaluate_functional_correctness(
+    sample: Dict[str, str],
+    completions: List[str],
+    n_workers: int = 4,
+    timeout: float = 3.0,
+):
+    """
+    Evaluates the functional correctness of generated samples, and writes
+    results to f"{sample_file}_results.jsonl.gz"
+    """
+    import copy
+
+    # Check the generated samples against test suites.
+    with ThreadPoolExecutor(max_workers=n_workers) as executor:
+        futures = []
+        for i, completion in enumerate(completions):
+            args = (sample, completion, timeout, i)
+            future = executor.submit(check_correctness, *args)
+            futures.append(future)
+        results = []
+        for future in as_completed(futures):
+            result = future.result()
+            results.append(result)
+    passed = [int(r["passed"]) for r in results]
+    return passed
+
+
+class HumanEval(Eval):
+    def __init__(
+        self,
+        num_examples: Optional[int],
+        num_threads: int,
+        num_samples_per_task: int = 5,
+        ks_passes: List[int] = [1, 2, 5],
+        timeout: int = 120,
+    ):
+        self.seed = 0
+        self.examples = read_problems()
+        self.examples = list(self.examples.values())
+
+        self._num_examples = num_examples
+        if self._num_examples:
+            self.examples = random.Random(self.seed).sample(self.examples, num_examples)
+        self._num_samples_per_task = num_samples_per_task
+        self._ks_passes = ks_passes
+        self._timeout = timeout
+        self._num_threads = num_threads
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        instruction = "Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n"
+
+        def find_code(completion):
+            pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
+            matches = pattern.findall(completion)
+            extracted_answer = matches[0] if len(matches) >= 1 else completion
+            extracted_answer = extracted_answer[
+                extracted_answer.find(":\n    ") + 2 :
+            ]  # remove signature
+            return extracted_answer
+
+        def fn(sample: Dict[str, str]):
+            prompt_messages = [
+                sampler._pack_message(
+                    role="user", content=instruction + sample["prompt"]
+                )
+            ]
+            completions = [
+                find_code(sampler(prompt_messages))
+                for _ in range(self._num_samples_per_task)
+            ]
+            results = evaluate_functional_correctness(sample, completions)
+            total = len(results)
+            correct = sum(results)
+            score = sum(results) / len(results)
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=completions[0], role="assistant"),
+                score=score,
+                correct_answer=[1] * len(results),
+                extracted_answer=results,
+            )
+            convo = prompt_messages + [
+                dict(content=completion, role="assistant") for completion in completions
+            ]
+            return SingleEvalResult(
+                html=html,
+                score=score,
+                convo=convo,
+                metrics={
+                    f"pass@{k}": estimate_pass_at_k([total], [correct], k)
+                    # this will be aggregated so no need of .mean()
+                    for k in self._ks_passes
+                    if total >= k
+                },
+            )
+
+        results = common.map_with_progress(
+            fn, self.examples, num_threads=self._num_threads
+        )
+        return common.aggregate_results(results)
diff --git a/python/sglang/test/simple_eval_math.py b/python/sglang/test/simple_eval_math.py
new file mode 100644
index 000000000..74c49abe5
--- /dev/null
+++ b/python/sglang/test/simple_eval_math.py
@@ -0,0 +1,73 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+"""
+Measuring Mathematical Problem Solving With the MATH Dataset
+Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, Jacob Steinhardt
+https://arxiv.org/abs/2103.03874
+"""
+
+import random
+import re
+from typing import Optional
+
+import pandas
+
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    ANSWER_PATTERN,
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    SamplerBase,
+    SingleEvalResult,
+    check_equality,
+)
+
+QUERY_TEMPLATE = """
+Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
+
+{Question}
+
+Remember to put your answer on its own line after "Answer:", and you do not need to use a \\boxed command.
+""".strip()
+
+
+class MathEval(Eval):
+    def __init__(
+        self,
+        filename: str,
+        equality_checker: SamplerBase,
+        num_examples: Optional[int],
+        num_threads: int,
+    ):
+        df = pandas.read_csv(filename)
+        examples = [row.to_dict() for _, row in df.iterrows()]
+        if num_examples:
+            examples = random.Random(0).sample(examples, num_examples)
+        self.examples = examples
+        self.equality_checker = equality_checker
+        self.num_threads = num_threads
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        def fn(row: dict):
+            prompt_messages = [
+                sampler._pack_message(content=QUERY_TEMPLATE.format(**row), role="user")
+            ]
+            response_text = sampler(prompt_messages)
+            match = re.search(ANSWER_PATTERN, response_text)
+            extracted_answer = match.group(1) if match else None
+            score = float(
+                check_equality(self.equality_checker, row["Answer"], extracted_answer)
+            )
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=response_text, role="assistant"),
+                score=score,
+                correct_answer=row["Answer"],
+                extracted_answer=extracted_answer,
+            )
+            convo = prompt_messages + [dict(content=response_text, role="assistant")]
+            return SingleEvalResult(html=html, score=score, convo=convo)
+
+        results = common.map_with_progress(fn, self.examples, self.num_threads)
+        return common.aggregate_results(results)
diff --git a/python/sglang/test/simple_eval_mgsm.py b/python/sglang/test/simple_eval_mgsm.py
new file mode 100644
index 000000000..0b0b72a20
--- /dev/null
+++ b/python/sglang/test/simple_eval_mgsm.py
@@ -0,0 +1,203 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+"""
+MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems.
+Language Models are Multilingual Chain-of-Thought Reasoners
+Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei
+https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp
+"""
+
+import re
+import urllib
+from typing import Optional
+
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    SamplerBase,
+    SingleEvalResult,
+)
+
+ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
+LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
+NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"]
+
+LANG_TO_FPATH = {
+    "bn": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_bn.tsv",
+    "de": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_de.tsv",
+    "en": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_en.tsv",
+    "es": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_es.tsv",
+    "fr": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_fr.tsv",
+    "ja": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ja.tsv",
+    "ru": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ru.tsv",
+    "sw": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_sw.tsv",
+    "te": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_te.tsv",
+    "th": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_th.tsv",
+    "zh": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_zh.tsv",
+}
+LANG_TO_INSTRUCTIONS = {
+    "en": """Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of "Answer:". Do not add anything other than the integer answer after "Answer:".
+
+{input}""",
+    "bn": """এই গণিতের সমস্যাটি সমাধান করুন। চূড়ান্ত উত্তর দেওয়ার আগে যুক্তিসম্পন্ন পদক্ষেপ প্রদান করুন। চূড়ান্ত উত্তরটি একক সংখ্যা হিসাবে "উত্তর:" এর পরে শেষ লাইনে দিন। "উত্তর:" এর পরে অন্য কিছু যুক্ত করবেন না।.
+
+{input}""",
+    "de": """Löse dieses Mathematikproblem. Gib die Schritte zur Begründung an, bevor du die endgültige Antwort in der letzten Zeile alleine im Format "Antwort:" gibst. Füge nichts anderes als die ganzzahlige Antwort nach "Antwort:" hinzu.
+
+{input}""",
+    "es": """Resuelve este problema matemático. Proporciona los pasos de razonamiento antes de dar la respuesta final en la última línea por sí misma en el formato de "Respuesta:". No añadas nada más que la respuesta entera después de "Respuesta:".
+
+{input}""",
+    "fr": """Résolvez ce problème de mathématiques. Donnez les étapes de raisonnement avant de fournir la réponse finale sur la dernière ligne elle-même dans le format de "Réponse:". N'ajoutez rien d'autre que la réponse entière après "Réponse:".
+
+{input}""",
+    "ja": """の数学の問題を解いてください。最終的な答えを出す前に、解答の推論過程を記述してください。そして最後の行には "答え:" の形式で答えを記述し、その後には整数の答え以外何も追加しないでください。
+
+{input}""",
+    "ru": """Решите эту математическую задачу. Объясните шаги рассуждения перед тем, как дать окончательный ответ в последней строке сам по себе в формате "Ответ:". Не добавляйте ничего, кроме целочисленного ответа после "Ответ:".
+
+{input}""",
+    "sw": """Suluhisha tatizo hili la hesabu. Toa hatua za mantiki kabla ya kutoa jibu la mwisho kwenye mstari wa mwisho peke yake katika muundo wa "Jibu:". Usiongeze chochote kingine isipokuwa jibu la integer baada ya "Jibu:".
+
+{input}""",
+    "te": """ఈ గణిత సమస్యను పరిష్కరించండి. చివరి సమాధానాన్ని ఇవ్వదానికి ముందు తర్కాత్మక అదుగులను ఇవ్వండి. చివరి పంక్తిలో మాత్రమే 'సమాధానం:' అనే ఆకారంలో చివరి సమాధానాద్ని ఇవ్వండి సమాధానం: తర్వాత పూర్ణాంక సమాధానానికి తప్పించి ఎదేనా చేర్చవద్దు.
+
+{input}""",
+    "th": """แก้ปัญหาคณิตศาสตร์นี้ ให้ให้ขั้นตอนการใช้เหตุผลก่อนที่จะให้คำตอบสุดท้ายในบรรทัดสุดท้ายโดยอยู่ในรูปแบบ "คำตอบ:" ไม่ควรเพิ่มอะไรนอกจากคำตอบที่เป็นจำนวนเต็มหลังจาก "คำตอบ:"
+
+{input}""",
+    "zh": """解决这个数学问题。在最后一行给出答案前，请提供推理步骤。最后一行应该以 "答案: " 的形式独立给出答案。在 "答案：" 后不要添加除整数答案之外的任何内容。
+
+{input}""",
+}
+
+LANG_TO_ANSWER_PREFIX = {
+    "en": "Answer",
+    "bn": "উত্তর",
+    "de": "Antwort",
+    "es": "Respuesta",
+    "fr": "Réponse",
+    "ja": "答え",
+    "ru": "Ответ",
+    "sw": "Jibu",
+    "te": "సమాధానం",
+    "th": "คำตอบ",
+    "zh": "答案",
+}
+
+
+def parse_answer(answer: str, answer_prefix: str) -> str:
+    if answer_prefix not in answer:
+        return ""
+
+    answer_text = answer.split(answer_prefix)[-1].strip()
+
+    # find all the numbers (including decimals) in the string
+    numbers = re.findall(r"\d+\.?\d*", answer_text.replace(",", ""))
+
+    # return the first number (removing trailing decimal point if present),
+    # or an empty string if there were no numbers
+    return numbers[-1].rstrip(".") if numbers else ""
+
+
+def score_mgsm(target: str, prediction: str) -> bool:
+    if "." in prediction:
+        prediction = prediction.rstrip("0").rstrip(".")
+
+    target = target.replace(",", "")
+    prediction = prediction.replace(",", "")
+
+    return target == prediction
+
+
+def get_lang_examples(lang: str) -> list[dict[str, str]]:
+    fpath = LANG_TO_FPATH[lang]
+    examples = []
+    with urllib.request.urlopen(fpath) as f:
+        for line in f.read().decode("utf-8").splitlines():
+            inputs, targets = line.strip().split("\t")
+            if "." in targets:
+                raise ValueError(f"targets {targets} contains a decimal point.")
+            # targets = int(targets.replace(",", ""))
+            examples.append({"inputs": inputs, "targets": targets, "lang": lang})
+    return examples
+
+
+def get_all_examples() -> list[dict[str, str]]:
+    examples = []
+    for lang in ALL_LANGUAGES:
+        if lang != "en":
+            continue
+        examples += get_lang_examples(lang)
+    return examples
+
+
+class MGSMEval(Eval):
+    def __init__(
+        self,
+        num_examples_per_lang: int = 250,  # restrict to a subset of the data for debugging
+        num_threads: int = 64,
+        languages: Optional[list[str]] = ALL_LANGUAGES,
+    ):
+        if languages is None:
+            languages = ALL_LANGUAGES
+        else:
+            for language in languages:
+                if language not in ALL_LANGUAGES:
+                    raise ValueError(
+                        f"language {language} is not a valid language. "
+                        f"It should be one in {ALL_LANGUAGES}"
+                    )
+        self._languages = languages
+        self._num_examples_per_lang = num_examples_per_lang
+        self._num_threads = num_threads
+
+        examples = []
+        for lang in self._languages:
+            lang_examples = get_lang_examples(lang)
+            examples.extend(lang_examples[: self._num_examples_per_lang])
+        self.examples = examples
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        def fn(example: dict[str, str]):
+            language = example["lang"]
+            latin_language = (
+                "group_latin" if language in LATIN_LANGUAGES else "group_non_latin"
+            )
+            correct_answer = example["targets"]
+            instructoin = LANG_TO_INSTRUCTIONS[language]
+            prompt_messages = [
+                sampler._pack_message(
+                    content=instructoin.format(input=example["inputs"]), role="user"
+                )
+            ]
+            try:
+                response_text = sampler(prompt_messages)
+            except Exception as e:
+                response_text = ""
+
+            answer_prefix = LANG_TO_ANSWER_PREFIX[language]
+            extracted_answer = parse_answer(response_text, answer_prefix)
+
+            score = score_mgsm(correct_answer, extracted_answer)
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=response_text, role="assistant"),
+                score=score,
+                correct_answer=correct_answer,
+                extracted_answer=extracted_answer,
+            )
+            convo = prompt_messages + [dict(content=response_text, role="assistant")]
+            return SingleEvalResult(
+                html=html,
+                score=score,
+                convo=convo,
+                metrics={language: score, latin_language: score},
+            )
+
+        results = common.map_with_progress(
+            fn, self.examples, num_threads=self._num_threads
+        )
+        return common.aggregate_results(results, default_stats=("mean", "std"))
diff --git a/python/sglang/test/simple_eval_mmlu.py b/python/sglang/test/simple_eval_mmlu.py
new file mode 100644
index 000000000..36a5c7fe3
--- /dev/null
+++ b/python/sglang/test/simple_eval_mmlu.py
@@ -0,0 +1,121 @@
+# Adapted from https://github.com/openai/simple-evals/
+
+"""
+Measuring Massive Multitask Language Understanding
+Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, Jacob Steinhardt
+https://arxiv.org/abs/2009.03300
+"""
+
+import random
+import re
+from typing import Optional
+
+import pandas
+
+from sglang.test import simple_eval_common as common
+from sglang.test.simple_eval_common import (
+    ANSWER_PATTERN_MULTICHOICE,
+    HTML_JINJA,
+    Eval,
+    EvalResult,
+    SamplerBase,
+    SingleEvalResult,
+    format_multichoice_question,
+)
+
+subject2category = {
+    "abstract_algebra": "stem",
+    "anatomy": "other",
+    "astronomy": "stem",
+    "business_ethics": "other",
+    "clinical_knowledge": "other",
+    "college_biology": "stem",
+    "college_chemistry": "stem",
+    "college_computer_science": "stem",
+    "college_mathematics": "stem",
+    "college_medicine": "other",
+    "college_physics": "stem",
+    "computer_security": "stem",
+    "conceptual_physics": "stem",
+    "econometrics": "social_sciences",
+    "electrical_engineering": "stem",
+    "elementary_mathematics": "stem",
+    "formal_logic": "humanities",
+    "global_facts": "other",
+    "high_school_biology": "stem",
+    "high_school_chemistry": "stem",
+    "high_school_computer_science": "stem",
+    "high_school_european_history": "humanities",
+    "high_school_geography": "social_sciences",
+    "high_school_government_and_politics": "social_sciences",
+    "high_school_macroeconomics": "social_sciences",
+    "high_school_mathematics": "stem",
+    "high_school_microeconomics": "social_sciences",
+    "high_school_physics": "stem",
+    "high_school_psychology": "social_sciences",
+    "high_school_statistics": "stem",
+    "high_school_us_history": "humanities",
+    "high_school_world_history": "humanities",
+    "human_aging": "other",
+    "human_sexuality": "social_sciences",
+    "international_law": "humanities",
+    "jurisprudence": "humanities",
+    "logical_fallacies": "humanities",
+    "machine_learning": "stem",
+    "management": "other",
+    "marketing": "other",
+    "medical_genetics": "other",
+    "miscellaneous": "other",
+    "moral_disputes": "humanities",
+    "moral_scenarios": "humanities",
+    "nutrition": "other",
+    "philosophy": "humanities",
+    "prehistory": "humanities",
+    "professional_accounting": "other",
+    "professional_law": "humanities",
+    "professional_medicine": "other",
+    "professional_psychology": "social_sciences",
+    "public_relations": "social_sciences",
+    "security_studies": "social_sciences",
+    "sociology": "social_sciences",
+    "us_foreign_policy": "social_sciences",
+    "virology": "other",
+    "world_religions": "humanities",
+}
+
+
+class MMLUEval(Eval):
+    def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
+        df = pandas.read_csv(filename)
+        examples = [row.to_dict() for _, row in df.iterrows()]
+        if num_examples:
+            examples = random.Random(0).sample(examples, num_examples)
+        self.examples = examples
+        self.num_threads = num_threads
+
+    def __call__(self, sampler: SamplerBase) -> EvalResult:
+        def fn(row: dict):
+            prompt_messages = [
+                sampler._pack_message(
+                    content=format_multichoice_question(row), role="user"
+                )
+            ]
+            response_text = sampler(prompt_messages)
+            match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
+            extracted_answer = match.group(1) if match else None
+            score = 1.0 if extracted_answer == row["Answer"] else 0.0
+            html = common.jinja_env.from_string(HTML_JINJA).render(
+                prompt_messages=prompt_messages,
+                next_message=dict(content=response_text, role="assistant"),
+                score=score,
+                correct_answer=row["Answer"],
+                extracted_answer=extracted_answer,
+            )
+            convo = prompt_messages + [dict(content=response_text, role="assistant")]
+            category = subject2category.get(row["Subject"], "other")
+            return SingleEvalResult(
+                html=html, score=score, metrics={category: score}, convo=convo
+            )
+
+        results = common.map_with_progress(fn, self.examples, self.num_threads)
+        return common.aggregate_results(results)
diff --git a/python/sglang/test/test_activation.py b/python/sglang/test/test_activation.py
new file mode 100644
index 000000000..dd5c668cf
--- /dev/null
+++ b/python/sglang/test/test_activation.py
@@ -0,0 +1,105 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import GeluAndMul, QuickGELU
+from sglang.srt.utils import is_hip
+from sglang.test.test_utils import CustomTestCase
+
+_is_hip = is_hip()
+
+
+class TestGeluAndMul(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = GeluAndMul().to(dtype=dtype)
+        x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x)
+            out = layer.forward_cuda(x)
+
+        if dtype == torch.bfloat16:
+            atol = rtol = 1e-2
+        else:
+            atol = rtol = 1e-3
+
+        self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+    def test_gelu_and_mul(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._run_gelu_and_mul_test(*params)
+
+
+class TestQuickGELU(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 2048]  # batch = sequence length
+    DIMS = [512, 4096, 5120, 13824]  # all multiples of 16 bytes
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gelu_quick_test(self, n_tok: int, dim: int, dtype: torch.dtype, seed: int):
+        torch.manual_seed(seed)
+
+        layer = QuickGELU().to(dtype=dtype)
+
+        x = torch.randn(n_tok, dim, dtype=dtype, device="cuda")
+
+        with torch.inference_mode():
+            ref = layer.forward_native(x)  # x * sigmoid(1.702 * x), fp32 math
+            if _is_hip:
+                out = layer.forward_hip(x)  # 128-bit vectorised kernel from sgl-kernel
+            else:
+                out = layer.forward_cuda(x)
+
+        tol = 1e-2 if dtype is torch.bfloat16 else 1e-3
+        self.assertTrue(
+            torch.allclose(out, ref, atol=tol, rtol=tol),
+            msg=f"Mismatch @ B={n_tok}, D={dim}, dtype={dtype}",
+        )
+        print(f"Match @ B={n_tok}, D={dim}, dtype={dtype}")
+
+    def test_quick_gelu(self):
+        for params in itertools.product(
+            self.NUM_TOKENS, self.DIMS, self.DTYPES, self.SEEDS
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                dim=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._run_gelu_quick_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/python/sglang/test/test_block_fp8.py b/python/sglang/test/test_block_fp8.py
new file mode 100644
index 000000000..45271e116
--- /dev/null
+++ b/python/sglang/test/test_block_fp8.py
@@ -0,0 +1,661 @@
+import itertools
+import os
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_tensor_quant_mla_fp8,
+    per_token_group_quant_fp8,
+    per_token_group_quant_mla_deep_gemm_masked_fp8,
+    static_quant_fp8,
+    w8a8_block_fp8_matmul,
+)
+from sglang.srt.layers.quantization.fp8_utils import input_to_float8
+from sglang.test.test_utils import CustomTestCase
+
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+# For test
+def native_per_token_group_quant_fp8(
+    x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn
+):
+    """Function to perform per-token-group quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    Note that only `torch.float8_e4m3fn` is supported for now.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // group_size, group_size)
+    amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    x_s = amax / fp8_max
+    x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
+
+    return x_q, x_s
+
+
+class TestPerTokenGroupQuantFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    GROUP_SIZE = [64, 128, 256, 512]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_fp8(self, num_tokens, d, dtype, group_size, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
+            out, scale = per_token_group_quant_fp8(x, group_size)
+
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20)
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                group_size=params[3],
+                seed=params[4],
+            ):
+                self._per_token_group_quant_fp8(*params)
+
+
+# For test
+def native_static_quant_fp8(x, x_s, dtype=torch.float8_e4m3fn):
+    """Function to perform static quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    """
+    assert x.is_contiguous(), "`x` is not contiguous"
+    assert x_s.numel() == 1, "only supports per-tensor scale"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // x.shape[-1], x.shape[-1])
+    x_s_inv = 1.0 / x_s
+    x_q = (x_ * x_s_inv).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+
+    return x_q, x_s
+
+
+class TestStaticQuantFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _static_quant_fp8(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(num_tokens, d, dtype=dtype)
+        fp8_max = torch.finfo(torch.float8_e4m3fn).max
+        x_s = x.max() / fp8_max
+
+        with torch.inference_mode():
+            ref_out, _ = native_static_quant_fp8(x, x_s)
+            out, _ = static_quant_fp8(x, x_s, repeat_scale=True)
+
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.50)
+        )
+
+    def test_static_quant_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._static_quant_fp8(*params)
+
+
+class TestPerTensorQuantMlaFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    LAST_D_EXT = [1024, 0]
+    LAST_D = [512]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_tensor_quant_mla_fp8(self, num_tokens, d, last_d_ext, last_d, dtype, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(
+            (num_tokens, d // last_d, last_d + last_d_ext),
+            dtype=dtype,
+        )
+        x_sub, _ = x.split([last_d, last_d_ext], dim=-1)
+
+        with torch.inference_mode():
+            ref_out, ref_s = input_to_float8(x_sub.transpose(0, 1))
+            out, out_s = per_tensor_quant_mla_fp8(x_sub.transpose(0, 1))
+
+        self.assertTrue(out.is_contiguous())
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.50)
+        )
+        self.assertTrue(
+            torch.allclose(out_s.to(torch.float32), ref_s.to(torch.float32))
+        )
+
+    def test_per_tensor_quant_mla_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.LAST_D_EXT,
+            self.LAST_D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                last_d_ext=params[2],
+                last_d=params[3],
+                dtype=params[4],
+                seed=params[5],
+            ):
+                self._per_tensor_quant_mla_fp8(*params)
+
+
+class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    B = [128]
+    NUM_TOKENS = [7, 83, 2048, 1024 * 16]
+    D = [512, 128]
+    GROUP_SIZE = [128]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_mla_deep_gemm_masked_fp8(
+        self, b, num_tokens, d, dtype, group_size, seed
+    ):
+        torch.manual_seed(seed)
+
+        x = torch.rand(b, num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
+            out, scale, _, _, _ = per_token_group_quant_mla_deep_gemm_masked_fp8(
+                x, group_size
+            )
+            out = out[:, :num_tokens, :]
+            scale = scale[:, :num_tokens, :]
+
+        self.assertTrue(
+            torch.allclose(
+                out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20, atol=1e-2
+            )
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_mla_deep_gemm_masked_fp8(self):
+        for params in itertools.product(
+            self.B,
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                b=params[0],
+                num_tokens=params[1],
+                d=params[2],
+                dtype=params[3],
+                group_size=params[4],
+                seed=params[5],
+            ):
+                self._per_token_group_quant_mla_deep_gemm_masked_fp8(*params)
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+class TestW8A8BlockFP8Matmul(CustomTestCase):
+
+    if not _is_cuda:
+        OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+        M = [1, 7, 83, 512, 2048]
+        NKs = [
+            (N, K)
+            for N in [128, 512, 1024, 4096, 7748, 13824]
+            for K in [256, 4096, 5120, 3884, 13824]
+        ]
+        # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+    else:
+        # use practical shape in DeepSeek V3 for test
+        OUT_DTYPES = [torch.bfloat16]
+        M = [64, 128, 512, 1024, 4096]
+        NKs = [
+            (2112, 7168),
+            (1536, 7168),
+            (3072, 1536),
+            (24576, 7168),
+            (4096, 512),
+            (7168, 2048),
+            (4608, 7168),
+            (512, 7168),
+            (7168, 2304),
+            (7168, 512),
+        ]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_matmul(self, M, NK, block_size, out_dtype, seed):
+        N, K = NK
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles = (N + block_n - 1) // block_n
+        k_tiles = (K + block_k - 1) // block_k
+
+        As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
+        Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_fp8, B_fp8, As, Bs, block_size, out_dtype
+            )
+            out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.001
+        )
+
+    def test_w8a8_block_fp8_matmul(self):
+        for params in itertools.product(
+            self.M,
+            self.NKs,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                NKs=params[1],
+                block_size=params[2],
+                out_dtype=params[3],
+                seed=params[4],
+            ):
+                self._w8a8_block_fp8_matmul(*params)
+
+
+# For test
+def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+    """This function performs fused moe with block-wise quantization using native torch."""
+
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    _, block_k = block_shape[0], block_shape[1]
+    a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
+    # NOTE(HandH1998): Since "index_cuda" not implemented for 'Float8_e4m3fn', we need to cast `float8`` to `float32``.
+    a_q = a_q.to(torch.float32)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            inter_out = native_w8a8_block_fp8_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+            )
+            act_out = SiluAndMul().forward_native(inter_out)
+            act_out_q, act_out_s = native_per_token_group_quant_fp8(act_out, block_k)
+            act_out = act_out.to(torch.float32)
+            out[mask] = native_w8a8_block_fp8_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+class TestW8A8BlockFP8FusedMoE(CustomTestCase):
+    DTYPES = [torch.float32, torch.half, torch.bfloat16]
+    M = [1, 33, 64, 222, 1024 * 128]
+    N = [128, 1024, 2048]
+    K = [256, 4096, 5120]
+    E = [8, 24]
+    TOP_KS = [2, 6]
+    BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    # BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w1 = w1_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w2 = w2_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w1 = (2 * N + block_n - 1) // block_n
+        n_tiles_w2 = (K + block_n - 1) // block_n
+        k_tiles_w1 = (K + block_k - 1) // block_k
+        k_tiles_w2 = (N + block_k - 1) // block_k
+
+        w1_s = (
+            torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+            * factor_for_scale
+        )
+        w2_s = (
+            torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+            * factor_for_scale
+        )
+
+        score = torch.randn((M, E), dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_moe(
+                a, w1, w2, w1_s, w2_s, score, topk, block_size
+            )
+            topk_output = select_experts(
+                hidden_states=a,
+                router_logits=score,
+                topk_config=TopKConfig(top_k=topk, renormalize=False),
+            )
+            out = fused_moe(
+                a,
+                w1,
+                w2,
+                topk_output,
+                use_fp8_w8a8=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=block_size,
+            )
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.02
+        )
+
+    def test_w8a8_block_fp8_fused_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+                block_size=params[5],
+                dtype=params[6],
+                seed=params[7],
+            ):
+                self._w8a8_block_fp8_fused_moe(*params)
+
+
+# For test
+def torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_shape, out_dtype):
+    """This function performs bmm with block-wise quantization using native torch."""
+
+    B, N, _ = w.shape
+    _, M, _ = a.shape
+    out = torch.empty((B, M, N), dtype=out_dtype, device=a.device)
+
+    for i in range(B):
+        out[i] = native_w8a8_block_fp8_matmul(
+            a[i], w[i], a_s[i], w_s[i], block_shape, output_dtype=out_dtype
+        )
+
+    return out
+
+
+class TestW8A8BlockFP8BatchedDeepGemm(CustomTestCase):
+    DTYPES = [torch.bfloat16]
+    M = [1, 33, 64, 222, 8192]
+    N = [128, 512]
+    K = [128, 512]
+    BATCH = [128]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        try:
+            import deep_gemm
+        except ImportError:
+            raise unittest.SkipTest("DeepGEMM is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_batched_deep_gemm(self, M, N, K, B, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a_fp32 = torch.randn((B, M, K), dtype=torch.float32) / 10
+        a = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w_fp32 = (torch.rand((B, N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w = w_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w = (N + block_n - 1) // block_n
+        k_tiles_w = (K + block_k - 1) // block_k
+
+        w_s = (
+            torch.rand((B, n_tiles_w, k_tiles_w), dtype=torch.float32)
+            * factor_for_scale
+        )
+        a_s = torch.rand((B, M, k_tiles_w), dtype=torch.float32) * factor_for_scale
+
+        ae = a.new_empty(B, (M + 255) // 256 * 256, K)
+        ae_s = a_s.new_empty(B, (M + 255) // 256 * 256, k_tiles_w)
+        oe = torch.empty((B, (M + 255) // 256 * 256, N), dtype=dtype)
+        ae[:, :M, :] = a
+        ae_s[:, :M, :] = a_s
+
+        masked_m = torch.full((B,), M, dtype=torch.int)
+        expected_m = M
+        lhs = (
+            ae,
+            ae_s,
+        )
+        rhs = (
+            w,
+            w_s,
+        )
+
+        from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_size, dtype)
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs, rhs, oe, masked_m, expected_m)
+            out = oe[:, :M, :]
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.0001
+        )
+
+    def test_w8a8_block_fp8_batched_deep_gemm(self):
+
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BATCH,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                B=params[3],
+                block_size=params[4],
+                dtype=params[5],
+                seed=params[6],
+            ):
+                self._w8a8_block_fp8_batched_deep_gemm(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py b/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py
new file mode 100644
index 000000000..36d7acddb
--- /dev/null
+++ b/python/sglang/test/test_block_fp8_deep_gemm_blackwell.py
@@ -0,0 +1,252 @@
+import itertools
+import os
+import unittest
+from typing import List, Tuple
+
+import torch
+from deep_gemm import fp8_gemm_nt
+
+from sglang.test.test_utils import CustomTestCase
+
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+# Modify form DeepGEMM Blackwell
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def align(x: int, y: int) -> int:
+    return ceil_div(x, y) * y
+
+
+def per_token_group_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = x_amax / 448.0
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = x_amax / 448.0
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def ceil_to_ue8m0(x: torch.Tensor):
+    assert x.view(-1).amax().item() > 0
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def per_token_group_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+    for j in range(n_tiles):
+        for i in range(k_tiles):
+            x_q_block_tile = x_q_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile = x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+    return x_dq_block
+
+
+class TestDeepGemmBlackwell(CustomTestCase):
+
+    if not _is_cuda:
+        OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+        M = [1, 7, 83, 512, 2048]
+        NKs = [
+            (N, K)
+            for N in [128, 512, 1024, 4096, 7748, 13824]
+            for K in [256, 4096, 5120, 3884, 13824]
+        ]
+        # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+    else:
+        # use practical shape in DeepSeek V3 for test
+        OUT_DTYPES = [torch.bfloat16]
+        M = [64, 128, 512, 1024, 4096]
+        NKs = [
+            (2112, 7168),
+            (1536, 7168),
+            # (3072, 1536),
+            # (24576, 7168),
+            # (4096, 512),
+            # (7168, 2048),
+            # (4608, 7168),
+            # (512, 7168),
+            # (7168, 2304),
+            # (7168, 512),
+        ]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _test_deep_gemm_blackwell(self, M, NK, block_size, out_dtype, seed):
+        N, K = NK
+        torch.manual_seed(seed)
+
+        A = torch.empty((M, K), dtype=torch.bfloat16).normal_(0, 0.2)
+        B = torch.empty((N, K), dtype=torch.bfloat16).normal_(0, 0.2)
+
+        A_q, A_s = per_token_group_quant_fp8(A)
+        B_q, B_s = per_block_quant_fp8(B)
+
+        A_dq = block_quant_dequant(A_q, A_s, [1, block_size[1]], out_dtype)
+        B_dq = block_quant_dequant(B_q, B_s, block_size, out_dtype)
+
+        A_qu = per_token_group_quant_mxfp8(A_dq)
+        B_qu = per_block_quant_mxfp8(B_dq)
+        out = None
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_q, B_q, A_s, B_s, block_size, out_dtype
+            )
+            out = torch.empty_like(ref_out)
+            fp8_gemm_nt(A_qu, B_qu, out)
+
+        torch.testing.assert_close(out, ref_out, atol=1e-1, rtol=1e-2)
+
+    def test_deep_gemm_blackwell(self):
+        for params in itertools.product(
+            self.M,
+            self.NKs,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                NKs=params[1],
+                block_size=params[2],
+                out_dtype=params[3],
+                seed=params[4],
+            ):
+                self._test_deep_gemm_blackwell(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/python/sglang/test/test_block_fp8_ep.py b/python/sglang/test/test_block_fp8_ep.py
new file mode 100644
index 000000000..670f2e0f8
--- /dev/null
+++ b/python/sglang/test/test_block_fp8_ep.py
@@ -0,0 +1,358 @@
+import itertools
+import random
+import unittest
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.moe.ep_moe.kernels import (
+    grouped_gemm_triton,
+    post_reorder_triton_kernel,
+    pre_reorder_triton_kernel,
+    run_moe_ep_preproess,
+    silu_and_mul_triton_kernel,
+)
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+from sglang.test.test_utils import CustomTestCase
+
+
+# For test
+def ep_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    router_logits: torch.Tensor,
+    topk_config: TopKConfig,
+    # ep config
+    num_experts: int = 256,
+    fp8_dtype: torch.types = torch.float8_e4m3fn,
+    num_experts_per_partition: int = 128,
+    start_expert_id: int = 0,
+    end_expert_id: int = 127,
+    use_fp8_w8a8: bool = False,
+    w1_scale_inv: Optional[torch.Tensor] = None,
+    w2_scale_inv: Optional[torch.Tensor] = None,
+    block_shape: Optional[List[int]] = None,
+):
+    use_blockwise_fp8 = block_shape is not None
+    top_k = topk_config.top_k
+    topk_output = select_experts(
+        hidden_states=hidden_states,
+        router_logits=router_logits,
+        topk_config=topk_config,
+    )
+    topk_weights, topk_ids, _ = topk_output
+
+    reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(topk_ids, num_experts)
+
+    gateup_input = torch.empty(
+        (int(hidden_states.shape[0] * top_k), hidden_states.shape[1]),
+        device=hidden_states.device,
+        dtype=(
+            fp8_dtype
+            if (use_fp8_w8a8 and not use_blockwise_fp8)
+            else hidden_states.dtype
+        ),
+    )
+
+    if use_fp8_w8a8 and not use_blockwise_fp8:
+        max_value = (
+            torch.max(hidden_states).repeat(num_experts_per_partition).to(torch.float32)
+        )
+        w1_input_scale = max_value / torch.finfo(fp8_dtype).max
+    else:
+        w1_input_scale = None
+
+    # PreReorder
+    pre_reorder_triton_kernel[(hidden_states.shape[0],)](
+        hidden_states,
+        gateup_input,
+        src2dst,
+        topk_ids,
+        w1_input_scale,
+        start_expert_id,
+        end_expert_id,
+        top_k,
+        hidden_states.shape[1],
+        BLOCK_SIZE=512,
+        use_per_token_if_dynamic=True,
+    )
+
+    seg_indptr_cur_rank = seg_indptr[start_expert_id : end_expert_id + 2]
+    weight_indices_cur_rank = torch.arange(
+        0,
+        num_experts_per_partition,
+        device=hidden_states.device,
+        dtype=torch.int64,
+    )
+
+    # GroupGemm-0
+    gateup_output = torch.empty(
+        gateup_input.shape[0],
+        w1.shape[1],
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    gateup_output = grouped_gemm_triton(
+        a=gateup_input,
+        b=w1,
+        c=gateup_output,
+        batch_size=num_experts_per_partition,
+        weight_column_major=True,
+        seg_indptr=seg_indptr_cur_rank,
+        weight_indices=weight_indices_cur_rank,
+        use_fp8_w8a8=use_fp8_w8a8,
+        scale_a=w1_input_scale,
+        scale_b=w1_scale_inv,
+        block_shape=block_shape,
+    )
+
+    # Act
+    down_input = torch.empty(
+        gateup_output.shape[0],
+        gateup_output.shape[1] // 2,
+        device=gateup_output.device,
+        dtype=(
+            fp8_dtype
+            if (use_fp8_w8a8 and not use_blockwise_fp8)
+            else hidden_states.dtype
+        ),
+    )
+    if use_fp8_w8a8 and not use_blockwise_fp8:
+        w2_input_scale = torch.ones(
+            num_experts_per_partition,
+            dtype=torch.float32,
+            device=hidden_states.device,
+        )
+    else:
+        w2_input_scale = None
+
+    silu_and_mul_triton_kernel[(gateup_output.shape[0],)](
+        gateup_output,
+        down_input,
+        gateup_output.shape[1],
+        reorder_topk_ids,
+        w2_input_scale,
+        start_expert_id,
+        end_expert_id,
+        BLOCK_SIZE=512,
+    )
+
+    # GroupGemm-1
+    down_output = torch.empty(
+        down_input.shape[0],
+        w2.shape[1],
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    down_output = grouped_gemm_triton(
+        a=down_input,
+        b=w2,
+        c=down_output,
+        batch_size=num_experts_per_partition,
+        weight_column_major=True,
+        seg_indptr=seg_indptr_cur_rank,
+        weight_indices=weight_indices_cur_rank,
+        use_fp8_w8a8=use_fp8_w8a8,
+        scale_a=w2_input_scale,
+        scale_b=w2_scale_inv,
+        block_shape=block_shape,
+    )
+
+    # PostReorder
+    output = torch.empty_like(hidden_states)
+    post_reorder_triton_kernel[(hidden_states.size(0),)](
+        down_output,
+        output,
+        src2dst,
+        topk_ids,
+        topk_weights,
+        start_expert_id,
+        end_expert_id,
+        top_k,
+        hidden_states.size(1),
+        0,
+        BLOCK_SIZE=512,
+    )
+    return output
+
+
+# test util
+def block_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function converts block-wise quantization to tensor-wise quantization.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The outputs are tensor-wise quantization tensor and tensor-wise quantization scale.
+    Note only float8 is supported for now.
+    """
+
+    # process 3D tensor
+    if x_q_block.dim() == 3:
+        batch_size = x_q_block.size(0)
+        return torch.stack(
+            [block_dequant(x_q_block[b], x_s[b], block_size) for b in range(batch_size)]
+        )
+
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = x_q_block.to(torch.float32)
+
+    x_dq_block_tiles = [
+        [
+            x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]
+
+    return x_dq_block
+
+
+class TestW8A8BlockFP8EPMoE(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    M = [1, 222, 1024, 2048]
+    N = [128, 1024, 2048]
+    K = [256, 4096, 5120]
+    E = [8, 16]
+    ep_size = [2, 4]
+    TOP_KS = [2, 4]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_ep_moe(
+        self, M, N, K, E, ep_size, topk, block_size, dtype, seed
+    ):
+        torch.manual_seed(seed)
+        random.seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=dtype) - 0.5) * 2 * fp8_max
+        w1 = w1_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=dtype) - 0.5) * 2 * fp8_max
+        w2 = w2_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w1 = (2 * N + block_n - 1) // block_n
+        n_tiles_w2 = (K + block_n - 1) // block_n
+        k_tiles_w1 = (K + block_k - 1) // block_k
+        k_tiles_w2 = (N + block_k - 1) // block_k
+
+        w1_s = (
+            torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+            * factor_for_scale
+        )
+        w2_s = (
+            torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+            * factor_for_scale
+        )
+
+        w1_ref = block_dequant(w1, w1_s, block_size).to(dtype)
+        w2_ref = block_dequant(w2, w2_s, block_size).to(dtype)
+
+        score = torch.randn((M, E), dtype=dtype)
+        num_experts_per_partition = E // ep_size
+        cur_rank = random.randint(0, ep_size - 1)
+        start_id = cur_rank * num_experts_per_partition
+        end_id = start_id + num_experts_per_partition - 1
+
+        topk_config = TopKConfig(
+            top_k=topk,
+            renormalize=False,
+        )
+
+        with torch.inference_mode():
+            out = ep_moe(
+                hidden_states=a,
+                w1=w1,
+                w2=w2,
+                router_logits=score,
+                topk_config=topk_config,
+                use_fp8_w8a8=True,
+                w1_scale_inv=w1_s,
+                w2_scale_inv=w2_s,
+                block_shape=block_size,
+                num_experts=E,
+                num_experts_per_partition=num_experts_per_partition,
+                start_expert_id=start_id,
+                end_expert_id=end_id,
+            )
+            ref_out = ep_moe(
+                hidden_states=a,
+                w1=w1_ref,
+                w2=w2_ref,
+                router_logits=score,
+                topk_config=topk_config,
+                use_fp8_w8a8=False,
+                w1_scale_inv=None,
+                w2_scale_inv=None,
+                block_shape=None,
+                num_experts=E,
+                num_experts_per_partition=num_experts_per_partition,
+                start_expert_id=start_id,
+                end_expert_id=end_id,
+            )
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / (torch.mean(torch.abs(ref_out.to(torch.float32))) + 1e-6)
+            < 0.06
+        )
+
+    def test_w8a8_block_fp8_ep_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.ep_size,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                ep_size=params[4],
+                topk=params[5],
+                block_size=params[6],
+                dtype=params[7],
+                seed=params[8],
+            ):
+                self._w8a8_block_fp8_ep_moe(*params)
+            torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/python/sglang/test/test_custom_ops.py b/python/sglang/test/test_custom_ops.py
new file mode 100644
index 000000000..c07c95db6
--- /dev/null
+++ b/python/sglang/test/test_custom_ops.py
@@ -0,0 +1,148 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/8ca7a71df787ad711ad3ac70a5bd2eb2bb398938/tests/quantization/test_fp8.py
+
+import pytest
+import torch
+
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_fp8_fnuz = is_fp8_fnuz()
+fp8_dtype = torch.float8_e4m3fnuz if _is_fp8_fnuz else torch.float8_e4m3fn
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_scaled_fp8_quant_per_tensor(dtype) -> None:
+
+    def quantize_ref_per_tensor(tensor, inv_scale):
+        # The reference implementation that fully aligns to
+        # the kernel being tested.
+        finfo = torch.finfo(fp8_dtype)
+        scale = inv_scale.reciprocal()
+        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
+        qweight = qweight.to(fp8_dtype)
+        return qweight
+
+    def dequantize_per_tensor(tensor, inv_scale, dtype):
+        fake_qweight = tensor.to(dtype)
+        dq_weight = fake_qweight * inv_scale
+        return dq_weight
+
+    # Note that we use a shape % 8 != 0 to cover edge cases,
+    # because scaled_fp8_quant is vectorized by 8.
+    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
+
+    # Test Per Tensor Dynamic quantization
+    # scale = max(abs(x)) / FP8_E4M3_MAX
+    y, scale = scaled_fp8_quant(x, None)
+    ref_y = quantize_ref_per_tensor(x, scale)
+    torch.testing.assert_close(y, ref_y)
+    torch.testing.assert_close(
+        dequantize_per_tensor(y, scale, dtype),
+        dequantize_per_tensor(ref_y, scale, dtype),
+    )
+
+    # Test Per Tensor Static quantization
+    y, _ = scaled_fp8_quant(x, scale)
+    ref_y = quantize_ref_per_tensor(x, scale)
+    torch.testing.assert_close(y, ref_y)
+    torch.testing.assert_close(
+        dequantize_per_tensor(y, scale, dtype),
+        dequantize_per_tensor(ref_y, scale, dtype),
+    )
+
+
+if _is_cuda or _is_hip:
+
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_scaled_fp8_quant_per_token_dynamic(dtype) -> None:
+        def quantize_ref_per_token(tensor, inv_scale):
+            # The reference implementation that fully aligns to
+            # the kernel being tested.
+            finfo = torch.finfo(fp8_dtype)
+            scale = inv_scale.reciprocal()
+            qweight = (tensor.to(torch.float32) * scale).clamp(
+                min=finfo.min, max=finfo.max
+            )
+            qweight = qweight.to(fp8_dtype)
+            return qweight
+
+        def dequantize_per_token(tensor, inv_scale, dtype):
+            fake_qweight = tensor.to(dtype)
+            dq_weight = fake_qweight * inv_scale
+            return dq_weight
+
+        # Note that we use a shape % 8 = 0,
+        # because per_token_quant_fp8 is vectorized by 8 elements.
+        x = (torch.randn(size=(11, 16), device="cuda") * 13).to(dtype)
+
+        # Test Per Tensor Dynamic quantization
+        # scale = max(abs(x)) / FP8_E4M3_MAX
+        y, scale = scaled_fp8_quant(x, None, use_per_token_if_dynamic=True)
+        ref_y = quantize_ref_per_token(x, scale)
+        torch.testing.assert_close(y, ref_y)
+        torch.testing.assert_close(
+            dequantize_per_token(y, scale, dtype),
+            dequantize_per_token(ref_y, scale, dtype),
+        )
+
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_scaled_fp8_quant_with_padding(dtype) -> None:
+        original_rows = 5
+        x = (torch.randn(size=(original_rows, 16), device="cuda") * 13).to(dtype)
+
+        padding_size = 10
+
+        # Test with dynamic quantization
+        y_dynamic, scale_dynamic = scaled_fp8_quant(
+            x, None, num_token_padding=padding_size
+        )
+
+        # Verify output shape has the padded size
+        assert y_dynamic.shape[0] == padding_size
+        assert y_dynamic.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_without_padding, scale_without_padding = scaled_fp8_quant(x, None)
+        torch.testing.assert_close(y_dynamic[:original_rows], y_without_padding)
+
+        # Test with static quantization
+        # First get a scale
+        _, scale = scaled_fp8_quant(x, None)
+
+        # Then use it for static quantization with padding
+        y_static, _ = scaled_fp8_quant(x, scale, num_token_padding=padding_size)
+
+        # Verify output shape has the padded size
+        assert y_static.shape[0] == padding_size
+        assert y_static.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_static_without_padding, _ = scaled_fp8_quant(x, scale)
+        torch.testing.assert_close(y_static[:original_rows], y_static_without_padding)
+
+        # Test with per-token dynamic quantization
+        y_per_token, scale_per_token = scaled_fp8_quant(
+            x, None, num_token_padding=padding_size, use_per_token_if_dynamic=True
+        )
+
+        # Verify output shape has the padded size
+        assert y_per_token.shape[0] == padding_size
+        assert y_per_token.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_per_token_without_padding, scale_per_token_without_padding = scaled_fp8_quant(
+            x, None, use_per_token_if_dynamic=True
+        )
+        torch.testing.assert_close(
+            y_per_token[:original_rows], y_per_token_without_padding
+        )
+        torch.testing.assert_close(
+            scale_per_token[:original_rows], scale_per_token_without_padding
+        )
+
+
+if __name__ == "__main__":
+    # Run the specific test function directly
+    pytest.main([__file__])
diff --git a/python/sglang/test/test_cutlass_moe.py b/python/sglang/test/test_cutlass_moe.py
new file mode 100755
index 000000000..56f276c81
--- /dev/null
+++ b/python/sglang/test/test_cutlass_moe.py
@@ -0,0 +1,299 @@
+import argparse
+import time
+
+import torch
+import triton  # Added import
+import triton.testing  # Added import
+from transformers import AutoConfig
+
+from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import StandardTopKOutput
+
+
+# Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+def calc_diff(x, y):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
+
+
+def get_model_config(tp_size: int):
+    config = AutoConfig.from_pretrained(
+        "deepseek-ai/Deepseek-R1", trust_remote_code=True
+    )
+    E = config.n_routed_experts
+    topk = config.num_experts_per_tok
+    intermediate_size = config.moe_intermediate_size
+    shard_intermediate_size = 2 * intermediate_size // tp_size
+
+    return {
+        "num_experts": E,
+        "topk": topk,
+        "hidden_size": config.hidden_size,
+        "shard_intermediate_size": shard_intermediate_size,
+        "dtype": config.torch_dtype,
+        "block_shape": config.quantization_config["weight_block_size"],
+    }
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    """Converts tensor to FP8 E4M3, scaling values to fit the range."""
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    # Calculate max absolute value safely
+    max_val = torch.max(torch.abs(tensor))
+    # Avoid division by zero if tensor is all zeros
+    if max_val == 0:
+        scale_factor = 1.0
+    else:
+        # Scale factor to bring the max value to finfo.max
+        scale_factor = finfo.max / max_val
+
+    # Apply scaling
+    scaled_tensor = tensor * scale_factor
+
+    # Clamp and convert
+    fp8_tensor = scaled_tensor.clamp(min=finfo.min, max=finfo.max).to(
+        dtype=torch.float8_e4m3fn
+    )
+    return fp8_tensor
+
+
+def run_test(tp_size, batch_size, model_config, check=False):
+    print(f"\n--- Batch Size: {batch_size} ---")
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(42)  # For reproducible random numbers
+
+    E = model_config["num_experts"]
+    topk = model_config["topk"]
+    H = model_config["hidden_size"]
+    I = model_config["shard_intermediate_size"]
+    block_shape = model_config["block_shape"]  # Tuple (BLOCK_N, BLOCK_K)
+    dtype = model_config["dtype"]  # e.g., torch.bfloat16
+
+    print(
+        f"Config: E={E}, topk={topk}, H={H}, I_shard={I}, dtype={dtype}, block_shape={block_shape}"
+    )
+
+    # --- Input Data ---
+    # Use bf16/fp16 for input activation based on model config
+    x = torch.randn((batch_size, H), device="cuda", dtype=dtype)
+    # --- Weights (Generate in higher precision, then convert to FP8) ---
+    # Generate weights suitable for FP8 conversion (e.g., scaled appropriately)
+    w1_hp = torch.randn((E, I, H), device="cuda", dtype=torch.float32)
+    w2_hp = torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32)
+
+    w1 = to_fp8(w1_hp)
+    w2 = to_fp8(w2_hp)
+
+    # --- Scales for FP8 Weights ---
+    block_n, block_k = block_shape
+    # Calculate number of blocks needed
+    w1_blocks_dim1 = (I + block_n - 1) // block_n
+    w1_blocks_dim2 = (H + block_k - 1) // block_k
+    w2_blocks_dim1 = (H + block_n - 1) // block_n
+    w2_blocks_dim2 = (I // 2 + block_k - 1) // block_k
+
+    # Scales are typically float32 or float16/bfloat16
+    scale_dtype = torch.float32  # Or dtype if scales match model dtype
+    w1_scale = torch.full(
+        (E, w1_blocks_dim1, w1_blocks_dim2), 1, device="cuda", dtype=scale_dtype
+    )  # Avoid zero scales
+    w2_scale = torch.full(
+        (E, w2_blocks_dim1, w2_blocks_dim2), 1, device="cuda", dtype=scale_dtype
+    )  # Avoid zero scales
+
+    # --- Routing Information ---
+    topk_weights = torch.softmax(
+        torch.rand(batch_size, topk, device="cuda", dtype=dtype), dim=-1
+    )
+    topk_ids = torch.randint(0, E, (batch_size, topk), dtype=torch.int32, device="cuda")
+
+    a1_strides = torch.full((E,), H, dtype=torch.int64, device="cuda")
+    c1_strides = torch.full((E,), I, dtype=torch.int64, device="cuda")
+    a2_strides = torch.full((E,), I // 2, dtype=torch.int64, device="cuda")
+    c2_strides = torch.full((E,), H, dtype=torch.int64, device="cuda")
+
+    workspace = torch.empty(
+        (7182 * 1024), device="cuda", dtype=torch.uint8
+    )  # Allocate sufficient workspace
+    # Pointer arrays (often filled by the kernel or a prep step, but needed as args)
+    a_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    b_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    out_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    a_scales_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    b_scales_ptrs = torch.empty((E,), dtype=torch.int64, device="cuda")
+    expert_offsets = torch.empty((E + 1,), dtype=torch.int32, device="cuda")
+    problem_sizes1 = torch.empty((E, 3), dtype=torch.int32, device="cuda")
+    problem_sizes2 = torch.empty((E, 3), dtype=torch.int32, device="cuda")
+
+    # --- Lambdas for Benchmarking ---
+    cutlass_lambda = lambda: cutlass_fused_experts_fp8(
+        x,
+        w1.transpose(1, 2),  # Transposed
+        w2.transpose(1, 2),  # Transposed
+        w1_scale.transpose(1, 2),
+        w2_scale.transpose(1, 2),
+        topk_weights,
+        topk_ids,
+        a1_strides,
+        c1_strides,
+        a2_strides,
+        c2_strides,
+        workspace,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+    )
+
+    topk_output = StandardTopKOutput(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        router_logits=torch.randn(
+            (batch_size, topk), device=topk_weights.device, dtype=dtype
+        ),
+    )
+
+    moe_runner_config = MoeRunnerConfig(
+        num_experts=E,
+        top_k=topk,
+        hidden_size=H,
+        intermediate_size_per_partition=I,
+        params_dtype=dtype,
+        activation="silu",
+        inplace=False,
+    )
+
+    # Note: Triton expects non-transposed weights
+    triton_lambda = lambda: fused_experts(
+        x,
+        w1,
+        w2,
+        topk_output,
+        moe_runner_config,
+        use_fp8_w8a8=True,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        block_shape=block_shape,
+    )
+
+    # --- Warmup ---
+    print("Warming up...")
+    for _ in range(10):
+        _ = cutlass_lambda()
+        _ = triton_lambda()
+    torch.cuda.synchronize()
+
+    # --- Benchmarking ---
+    quantiles = [0.5, 0.2, 0.8]
+    print(f"Benchmarking Cutlass fused_experts...")
+    cutlass_ms, cutlass_min, cutlass_max = triton.testing.do_bench_cudagraph(
+        cutlass_lambda, rep=1000, quantiles=quantiles
+    )
+
+    print(f"Benchmarking Triton fused_experts...")
+    triton_ms, triton_min, triton_max = triton.testing.do_bench_cudagraph(
+        triton_lambda, rep=1000, quantiles=quantiles
+    )
+    print(
+        f"Cutlass fused_experts time: {cutlass_ms:.3f} ms (median) [{cutlass_min:.3f} - {cutlass_max:.3f}]"
+    )
+    print(
+        f"Triton  fused_experts time: {triton_ms:.3f} ms (median) [{triton_min:.3f} - {triton_max:.3f}]"
+    )
+
+    # --- Correctness Check ---
+    if check:
+        print("Running correctness check...")
+        with torch.no_grad():
+            # Run CUTLASS version (requires transposed weights)
+            y_cutlass = cutlass_fused_experts_fp8(
+                x,
+                w1.transpose(1, 2),  # Transposed
+                w2.transpose(1, 2),  # Transposed
+                w1_scale.transpose(1, 2),
+                w2_scale.transpose(1, 2),
+                topk_weights,
+                topk_ids,
+                a1_strides,
+                c1_strides,
+                a2_strides,
+                c2_strides,
+                workspace,
+                a_ptrs,
+                b_ptrs,
+                out_ptrs,
+                a_scales_ptrs,
+                b_scales_ptrs,
+                expert_offsets,
+                problem_sizes1,
+                problem_sizes2,
+            )
+
+            # Run Triton version (requires original shape weights, use inplace=False)
+            y_triton = fused_experts(
+                x,
+                w1,  # Original shape
+                w2,  # Original shape
+                topk_output,
+                moe_runner_config,
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                block_shape=block_shape,
+            )
+
+        diff = calc_diff(y_cutlass, y_triton)
+        print(f"Diff: {diff:.6f}")
+
+        # Tolerance might need adjustment based on FP8 specifics and kernel differences
+        # FP8 comparisons often require higher tolerance than FP16/BF16
+        assert diff < 1e-4, f"Diff too high! {diff}"
+        print("Correctness check passed.")
+
+
+def main(tp_size=8, batch_sizes=[1, 4, 8, 16, 32, 64, 128, 256, 512], check=False):
+    model_config = get_model_config(tp_size)
+    print("Model Config:", model_config)
+    for batch_size in batch_sizes:
+        run_test(tp_size, batch_size, model_config, check)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tp-size", type=int, default=8, help="Tensor Parallel size")
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+        ],  # Adjusted default
+        help="List of batch sizes to test",
+    )
+    parser.add_argument("--check", action="store_true", help="Enable check mode")
+    args = parser.parse_args()
+
+    print(f"Running benchmarks with TP size: {args.tp_size}")
+    print(f"Testing batch sizes: {args.batch_sizes}")
+
+    main(tp_size=args.tp_size, batch_sizes=args.batch_sizes, check=args.check)
diff --git a/python/sglang/test/test_cutlass_w4a8_moe.py b/python/sglang/test/test_cutlass_w4a8_moe.py
new file mode 100644
index 000000000..6706fc962
--- /dev/null
+++ b/python/sglang/test/test_cutlass_w4a8_moe.py
@@ -0,0 +1,295 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Literal, Optional
+
+import pytest
+import torch
+
+from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+
+
+def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
+    if int4_values_interleaved.shape[-1] % 2 != 0:
+        raise ValueError(
+            "the last dim size of int4_values_interleaved tensor must be even."
+        )
+
+    input_tensor_int8 = int4_values_interleaved.to(torch.int8)
+
+    low_nibbles = input_tensor_int8[..., 0::2]
+    high_nibbles = input_tensor_int8[..., 1::2]
+
+    packed_tensor = (high_nibbles << 4) | (low_nibbles & 0x0F)
+
+    return packed_tensor.to(torch.int8)
+
+
+def pack_interleave(num_experts, ref_weight, ref_scale, alignment=4):
+    n, k = ref_weight.shape[1], ref_weight.shape[2]
+
+    weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
+    w_q = weight.view((num_experts, n, k // 2)).view(torch.int8)
+    w_q = w_q.contiguous()
+
+    scale_interleaved = ref_scale.reshape(
+        ref_scale.shape[0],
+        ref_scale.shape[1],
+        (ref_scale.shape[2] // alignment),
+        alignment,
+    )  # [E, N, K/4, 4]
+    scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
+    scale_interleaved = scale_interleaved.reshape(
+        ref_scale.shape[0],
+        ref_scale.shape[2] // alignment,
+        ref_scale.shape[1] * alignment,
+    )  # [E, K/4, N*4]
+    w_scale = scale_interleaved.contiguous()
+
+    return w_q, w_scale
+
+
+@pytest.mark.parametrize("M", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("N", [2048])
+@pytest.mark.parametrize("K", [7168])
+@pytest.mark.parametrize("E", [256])
+@pytest.mark.parametrize("tp_size", [8])
+@pytest.mark.parametrize("use_ep_moe", [True, False])
+@pytest.mark.parametrize("topk", [8])
+@pytest.mark.parametrize("group_size", [128])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dtype):
+    if use_ep_moe:
+        local_e = E // tp_size
+    else:  # tp mode
+        local_e = E
+        N = N // tp_size
+
+    debug = False
+    if debug:
+        a = torch.ones((M, K), dtype=dtype, device="cuda") * 0.001
+        ref_weight_1 = torch.ones((local_e, N * 2, K), dtype=torch.int8, device="cuda")
+        ref_weight_2 = torch.ones((local_e, K, N), dtype=torch.int8, device="cuda")
+        a1_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        scale_1 = torch.ones(
+            (local_e, N * 2, K // group_size), dtype=dtype, device="cuda"
+        )
+        scale_2 = torch.ones((local_e, K, N // group_size), dtype=dtype, device="cuda")
+    else:
+        a = torch.randn(M, K, dtype=dtype, device="cuda")
+        ref_weight_1 = torch.randint(
+            -8, 8, (local_e, N * 2, K), dtype=torch.int8, device="cuda"
+        )
+        ref_weight_2 = torch.randint(
+            -8, 8, (local_e, K, N), dtype=torch.int8, device="cuda"
+        )
+        affine_coeff = 0.005
+        a1_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        scale_1 = (
+            torch.randn(local_e, N * 2, K // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+        scale_2 = (
+            torch.randn(local_e, K, N // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+
+    w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
+    if use_ep_moe:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    else:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2, 1)
+
+    device = "cuda"
+    a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    c_strides1 = torch.full((local_e, 3), 2 * N, device=device, dtype=torch.int64)
+    a_strides2 = torch.full((local_e, 3), N, device=device, dtype=torch.int64)
+    c_strides2 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    b_strides1 = a_strides1
+    s_strides13 = c_strides1
+    b_strides2 = a_strides2
+    s_strides2 = c_strides2
+
+    score = torch.randn((M, E), dtype=dtype, device=device)
+    topk_output = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        topk_config=TopKConfig(top_k=topk, renormalize=False),
+    )
+    topk_weights, topk_ids, _ = topk_output
+    expert_map = torch.arange(E, dtype=torch.int32, device=device)
+    expert_map[local_e:] = E
+
+    output = cutlass_moe(
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        0,
+        local_e - 1,
+        E,
+        a1_scale,
+        a2_scale,
+        expert_map,
+    )
+
+    ref_output = ref(
+        a,
+        local_e,
+        topk_weights,
+        topk_ids,
+        ref_weight_1,
+        ref_weight_2,
+        scale_1,
+        scale_2,
+        has_pre_quant=True,
+        has_alpha=True,
+        pre_quant_scale_1=a1_scale,
+        pre_quant_scale_2=a2_scale,
+        alpha_1=a1_scale,
+        alpha_2=a2_scale,
+    )
+
+    # compare
+    torch.cuda.synchronize()
+
+    # compare final output
+    torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1)
+    print("SUCCESS: Final output tensors are close.")
+
+
+def cutlass_moe(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    start_expert_id: int,
+    end_expert_id: int,
+    E: int,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    expert_map: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+):
+    local_topk_ids = topk_ids_
+    local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
+    device = a.device
+
+    local_num_experts = end_expert_id - start_expert_id + 1
+    expert_offsets = torch.empty(
+        (local_num_experts + 1), dtype=torch.int32, device=device
+    )
+    problem_sizes1 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    problem_sizes2 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    return cutlass_w4a8_moe(
+        start_expert_id,
+        end_expert_id,
+        E,
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids_,
+        local_topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a1_scale,
+        a2_scale,
+        apply_router_weight_on_input,
+    )
+
+
+def ref(
+    x: torch.Tensor,
+    num_experts: int,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    ref_weight_1: torch.Tensor,
+    ref_weight_2: torch.Tensor,
+    ref_weight_scale_1: torch.Tensor,
+    ref_weight_scale_2: torch.Tensor,
+    has_pre_quant: bool = False,
+    has_alpha: bool = False,
+    pre_quant_scale_1: Optional[torch.Tensor] = None,
+    pre_quant_scale_2: Optional[torch.Tensor] = None,
+    alpha_1: Optional[torch.Tensor] = None,
+    alpha_2: Optional[torch.Tensor] = None,
+):
+    results = torch.zeros_like(x)
+    dtype = x.dtype
+    for e_idx in range(num_experts):
+        mask = topk_ids == e_idx
+        activated_tokens = mask.sum(1).bool()
+        act = x[activated_tokens, :]
+        if act.shape[0] == 0:
+            continue
+        final_scale = (topk_weights * mask).sum(1)[activated_tokens].unsqueeze(1)
+
+        act = (
+            torch.clamp((act / pre_quant_scale_1.float()), -448.0, 448.0)
+            .to(torch.float8_e4m3fn)
+            .to(dtype)
+        )
+        w3_w1 = ref_weight_1[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_1[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w3_w1 = (w3_w1.to(float) * ref_w_scale_repeat).to(dtype)
+        fc1 = ((torch.matmul(act, w3_w1.T)) * alpha_1).to(torch.float16)
+
+        gate, fc1 = fc1.chunk(2, dim=-1)
+        fc1 = fc1 * torch.nn.functional.silu(gate)
+        act = torch.clamp((fc1 / pre_quant_scale_2.float()), -448.0, 448.0).to(
+            torch.float8_e4m3fn
+        )
+        act = act.to(dtype)
+
+        w2 = ref_weight_2[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_2[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w2 = (w2.to(float) * ref_w_scale_repeat).to(dtype)
+        fc2 = (torch.matmul(act, w2.T) * alpha_2).to(torch.float16)
+
+        results[activated_tokens, :] += (fc2 * final_scale).to(results.dtype)
+
+    return results
diff --git a/python/sglang/test/test_deepep_utils.py b/python/sglang/test/test_deepep_utils.py
new file mode 100644
index 000000000..aa15b5a0b
--- /dev/null
+++ b/python/sglang/test/test_deepep_utils.py
@@ -0,0 +1,219 @@
+# Copy from deepseek-ai/DeepEP/tests/test_utils.py
+
+import os
+import sys
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def init_dist(local_rank: int, num_local_ranks: int):
+    # NOTES: you may rewrite this function with your own cluster settings
+    ip = os.getenv("MASTER_ADDR", "127.0.0.1")
+    port = int(os.getenv("MASTER_PORT", "8361"))
+    num_nodes = int(os.getenv("WORLD_SIZE", 1))
+    node_rank = int(os.getenv("RANK", 0))
+    assert (num_local_ranks < 8 and num_nodes == 1) or num_local_ranks == 8
+
+    dist.init_process_group(
+        backend="nccl",
+        init_method=f"tcp://{ip}:{port}",
+        world_size=num_nodes * num_local_ranks,
+        rank=node_rank * num_local_ranks + local_rank,
+    )
+    torch.set_default_dtype(torch.bfloat16)
+    torch.set_default_device("cuda")
+    torch.cuda.set_device(local_rank)
+
+    return (
+        dist.get_rank(),
+        dist.get_world_size(),
+        dist.new_group(list(range(num_local_ranks * num_nodes))),
+    )
+
+
+def calc_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double() + 1, y.double() + 1
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return (1 - sim).item()
+
+
+def per_token_cast_to_fp8(x: torch.Tensor):
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(
+        m, n
+    ), (x_amax / 448.0).view(m, -1)
+
+
+def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
+    x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
+    x_scales = x_scales.view(x_fp8.size(0), -1, 1)
+    return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)
+
+
+def inplace_unique(x: torch.Tensor, num_slots: int):
+    assert x.dim() == 2
+    mask = x < 0
+    x_padded = x.masked_fill(mask, num_slots)
+    bin_count = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
+    bin_count.scatter_add_(1, x_padded, torch.ones_like(x_padded))
+    bin_count = bin_count[:, :num_slots]
+    sorted_bin_count, sorted_bin_idx = torch.sort(bin_count, dim=-1, descending=True)
+    sorted_bin_idx.masked_fill_(sorted_bin_count == 0, -1)
+    sorted_bin_idx = torch.sort(sorted_bin_idx, descending=True, dim=-1).values
+    x[:, :].fill_(-1)
+    valid_len = min(num_slots, x.size(1))
+    x[:, :valid_len] = sorted_bin_idx[:, :valid_len]
+
+
+def create_grouped_scores(
+    scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int
+):
+    num_tokens, num_experts = scores.shape
+    scores = scores.view(num_tokens, num_groups, -1)
+    mask = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=scores.device)
+    mask = mask.scatter_(1, group_idx, True).unsqueeze(-1).expand_as(scores)
+    return (scores * mask).view(num_tokens, num_experts)
+
+
+def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
+    # Flush L2 cache with 256 MB data
+    torch.cuda.synchronize()
+    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+
+    # Warmup
+    for _ in range(num_warmups):
+        fn()
+
+    # Flush L2
+    cache.zero_()
+
+    # Testing
+    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
+    for i in range(num_tests):
+        # Record
+        start_events[i].record()
+        fn()
+        end_events[i].record()
+        if post_fn is not None:
+            post_fn()
+    torch.cuda.synchronize()
+
+    times = np.array(
+        [s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)]
+    )[1:]
+    return np.average(times), np.min(times), np.max(times)
+
+
+class empty_suppress:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_):
+        pass
+
+
+class suppress_stdout_stderr:
+    def __enter__(self):
+        self.outnull_file = open(os.devnull, "w")
+        self.errnull_file = open(os.devnull, "w")
+
+        self.old_stdout_fileno_undup = sys.stdout.fileno()
+        self.old_stderr_fileno_undup = sys.stderr.fileno()
+
+        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
+        self.old_stderr_fileno = os.dup(sys.stderr.fileno())
+
+        self.old_stdout = sys.stdout
+        self.old_stderr = sys.stderr
+
+        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
+        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+
+        sys.stdout = self.outnull_file
+        sys.stderr = self.errnull_file
+        return self
+
+    def __exit__(self, *_):
+        sys.stdout = self.old_stdout
+        sys.stderr = self.old_stderr
+
+        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+
+        os.close(self.old_stdout_fileno)
+        os.close(self.old_stderr_fileno)
+
+        self.outnull_file.close()
+        self.errnull_file.close()
+
+
+def bench_kineto(
+    fn,
+    kernel_names,
+    num_tests: int = 30,
+    suppress_kineto_output: bool = False,
+    trace_path: Optional[str] = None,
+    barrier_comm_profiling: bool = False,
+):
+    # Profile
+    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
+    with suppress():
+        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
+        with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule
+        ) as prof:
+            for i in range(2):
+                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
+                if barrier_comm_profiling:
+                    lhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    rhs = torch.randn((8192, 8192), dtype=torch.float, device="cuda")
+                    lhs @ rhs
+                    dist.all_reduce(torch.ones(1, dtype=torch.float, device="cuda"))
+                for _ in range(num_tests):
+                    fn()
+                prof.step()
+
+    # Parse the profiling table
+    assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple)
+    is_tupled = isinstance(kernel_names, tuple)
+    prof_lines = (
+        prof.key_averages()
+        .table(sort_by="cuda_time_total", max_name_column_width=100)
+        .split("\n")
+    )
+    kernel_names = (kernel_names,) if isinstance(kernel_names, str) else kernel_names
+    assert all([isinstance(name, str) for name in kernel_names])
+    for name in kernel_names:
+        assert (
+            sum([name in line for line in prof_lines]) == 1
+        ), f"Errors of the kernel {name} in the profiling table"
+
+    # Save chrome traces
+    if trace_path is not None:
+        prof.export_chrome_trace(trace_path)
+
+    # Return average kernel times
+    units = {"ms": 1e3, "us": 1e6}
+    kernel_times = []
+    for name in kernel_names:
+        for line in prof_lines:
+            if name in line:
+                time_str = line.split()[-2]
+                for unit, scale in units.items():
+                    if unit in time_str:
+                        kernel_times.append(float(time_str.replace(unit, "")) / scale)
+                        break
+                break
+    return tuple(kernel_times) if is_tupled else kernel_times[0]
+
+
+def hash_tensor(t: torch.Tensor):
+    return t.view(torch.int64).sum().item()
diff --git a/python/sglang/test/test_disaggregation_utils.py b/python/sglang/test/test_disaggregation_utils.py
new file mode 100644
index 000000000..f61b71a9d
--- /dev/null
+++ b/python/sglang/test/test_disaggregation_utils.py
@@ -0,0 +1,66 @@
+import time
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    CustomTestCase,
+    popen_with_error_check,
+)
+
+
+class TestDisaggregationBase(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.process_lb, cls.process_decode, cls.process_prefill = None, None, None
+        pass
+
+    @classmethod
+    def launch_lb(cls):
+        lb_command = [
+            "python3",
+            "-m",
+            "sglang_router.launch_router",
+            "--pd-disaggregation",
+            "--mini-lb",  # FIXME: remove this
+            "--prefill",
+            cls.prefill_url,
+            "--decode",
+            cls.decode_url,
+            "--host",
+            cls.base_host,
+            "--port",
+            cls.lb_port,
+        ]
+        print("Starting load balancer:", " ".join(lb_command))
+        cls.process_lb = popen_with_error_check(lb_command)
+        cls.wait_server_ready(cls.lb_url + "/health")
+
+    @classmethod
+    def wait_server_ready(cls, url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH):
+        start_time = time.perf_counter()
+        while True:
+            try:
+                response = requests.get(url)
+                if response.status_code == 200:
+                    print(f"Server {url} is ready")
+                    return
+            except Exception:
+                pass
+
+            if time.perf_counter() - start_time > timeout:
+                raise RuntimeError(f"Server {url} failed to start in {timeout}s")
+            time.sleep(1)
+
+    @classmethod
+    def tearDownClass(cls):
+        for process in [cls.process_lb, cls.process_decode, cls.process_prefill]:
+            if process:
+                try:
+                    kill_process_tree(process.pid)
+                except Exception as e:
+                    print(f"Error killing process {process.pid}: {e}")
+
+        # wait for 5 seconds
+        time.sleep(5)
diff --git a/python/sglang/test/test_dynamic_grad_mode.py b/python/sglang/test/test_dynamic_grad_mode.py
new file mode 100644
index 000000000..078c3139d
--- /dev/null
+++ b/python/sglang/test/test_dynamic_grad_mode.py
@@ -0,0 +1,58 @@
+import unittest
+
+import torch
+
+from sglang.srt.utils import DynamicGradMode
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestDynamicGradMode(CustomTestCase):
+    def test_inference(self):
+        # Test inference_mode
+        DynamicGradMode.set_inference_mode(True)
+
+        @DynamicGradMode()
+        def create_tensor_x():
+            return torch.empty(0)
+
+        X = create_tensor_x()
+        self.assertTrue(not X.requires_grad and X.is_inference())
+
+    def test_no_grad(self):
+        # Test no_grad
+        DynamicGradMode.set_inference_mode(False)
+
+        @DynamicGradMode()
+        def create_tensor_y():
+            return torch.empty(0)
+
+        Y = create_tensor_y()
+        self.assertTrue(not Y.requires_grad and not Y.is_inference())
+
+    def test_nested_inference(self):
+        # Test no_grad nested inference_mode, inference_mode should has higher priority
+        DynamicGradMode.set_inference_mode(False)
+
+        @DynamicGradMode()
+        def create_tensor_z():
+            with torch.inference_mode():
+                return torch.empty(0)
+
+        Z = create_tensor_z()
+        self.assertTrue(not Z.requires_grad and Z.is_inference())
+
+    def test_nested_no_grad(self):
+        # Test inference_mode nested no_grad, inference_mode should has higher priority
+        DynamicGradMode.set_inference_mode(True)
+
+        @DynamicGradMode()
+        def create_tensor_w():
+            with torch.no_grad():
+                return torch.empty(0)
+
+        W = create_tensor_w()
+        self.assertTrue(not W.requires_grad and W.is_inference())
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/python/sglang/test/test_fp4_moe.py b/python/sglang/test/test_fp4_moe.py
new file mode 100644
index 000000000..8f8c8e8a7
--- /dev/null
+++ b/python/sglang/test/test_fp4_moe.py
@@ -0,0 +1,329 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Callable
+
+import pytest
+import torch
+from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
+from sgl_kernel import scaled_fp4_quant
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+
+if torch.cuda.get_device_capability() < (10, 0):
+    pytest.skip(
+        reason="Nvfp4 Requires compute capability of 10 or above.",
+        allow_module_level=True,
+    )
+
+kE2M1ToFloat = torch.tensor(
+    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
+)
+
+FLOAT8_E4M3_MAX = 448.0
+FLOAT4_E2M1_MAX = 6.0
+
+
+def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size):
+    m_tiles = (m + 128 - 1) // 128
+    f = block_size * 4
+    k_tiles = (k + f - 1) // f
+    tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4))
+    tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
+    out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size)
+    return out[0:m, 0:k]
+
+
+def dequantize_nvfp4_to_dtype(
+    tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16
+):
+    """Dequantize the fp4 tensor back to high precision."""
+    # Two fp4 values are packed into one uint8.
+    assert tensor_fp4.dtype == torch.uint8
+    m, packed_k = tensor_fp4.shape
+    k = packed_k * 2
+    tensor_f32 = break_fp4_bytes(tensor_fp4, dtype)
+    tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size)
+    tensor_sf = tensor_sf.view(torch.float8_e4m3fn)
+    tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size)
+    tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale
+
+    # scale the tensor
+    out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k)
+    return out.to(dtype=dtype)
+
+
+def break_fp4_bytes(a, dtype):
+    assert a.dtype == torch.uint8
+    m, n = a.shape
+
+    # Vectorized nibble processing
+    a_flat = a.flatten()
+    high = (a_flat & 0xF0) >> 4  # Upper nibbles
+    low = a_flat & 0x0F  # Lower nibbles
+
+    # Combine nibbles for batch processing
+    combined = torch.stack((low, high), dim=1).flatten()
+
+    # Vectorized sign and magnitude extraction
+    signs = (combined & 0x08).to(torch.bool)  # Sign bits
+    abs_vals = (combined & 0x07).to(torch.long)  # Magnitude indices
+
+    # Device-aware lookup and sign application
+    kE2M1 = kE2M1ToFloat.to(device=a.device)
+    values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0)
+
+    # Reshape to final form
+    return values.reshape(m, n * 2).to(dtype=dtype)
+
+
+MNK_FACTORS = [
+    (2, 1024, 1024),
+    (2, 1024, 1536),
+    (2, 3072, 1024),
+    (2, 3072, 1536),
+    (64, 1024, 1024),
+    (64, 1024, 1536),
+    (64, 3072, 1024),
+    (64, 2048, 1024),
+    (224, 1024, 1024),
+    (224, 1024, 1536),
+]
+
+
+# Reference implementation of torch_moe
+def torch_moe(a, w1, w2, score, topk, expert_map):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    if expert_map is not None:
+        topk_ids = expert_map[topk_ids]
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(
+                0, 1
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+def check_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    moe_impl: Callable,
+    flip_w13: bool,
+):
+    torch.manual_seed(7)
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    quant_blocksize = 16
+    round_up = lambda x, y: (x + y - 1) // y * y
+    sf_w1_2n = round_up(2 * n, 128)
+    sf_w1_k = round_up(k // quant_blocksize, 4)
+    w1_blockscale = torch.empty(
+        (e, sf_w1_2n, sf_w1_k), device="cuda", dtype=torch.float8_e4m3fn
+    )
+
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    sf_w2_k = round_up(k, 128)
+    sf_w2_n = round_up(n // quant_blocksize, 4)
+    w2_blockscale = torch.empty(
+        (e, sf_w2_k, sf_w2_n), device="cuda", dtype=torch.float8_e4m3fn
+    )
+
+    w1_q = torch.empty((e, 2 * n, k // 2), device="cuda", dtype=torch.uint8)
+    w2_q = torch.empty((e, k, n // 2), device="cuda", dtype=torch.uint8)
+    w1_gs = torch.empty((e,), device="cuda", dtype=torch.float32)
+    w2_gs = torch.empty((e,), device="cuda", dtype=torch.float32)
+
+    for expert in range(e):
+        w1_amax = torch.abs(w1).max().to(torch.float32)
+        w2_amax = torch.abs(w2).max().to(torch.float32)
+        w1_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax
+        w2_gs[expert] = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax
+
+        w1_q[expert], w1_blockscale[expert] = scaled_fp4_quant(
+            w1[expert], w1_gs[expert]
+        )
+
+        w2_q[expert], w2_blockscale[expert] = scaled_fp4_quant(
+            w2[expert], w2_gs[expert]
+        )
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+
+    topk_output = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        topk_config=TopKConfig(top_k=topk, renormalize=False),
+    )
+    topk_weights, topk_ids, _ = topk_output
+
+    a1_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
+    a2_gs = torch.ones((e,), device="cuda", dtype=torch.float32)
+    test_output = moe_impl(
+        a=a,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        w1_q=w1_q,
+        w2_q=w2_q,
+        a1_gs=a1_gs,
+        w1_blockscale=w1_blockscale,
+        w1_alphas=(1 / w1_gs),
+        a2_gs=a2_gs,
+        w2_blockscale=w2_blockscale,
+        w2_alphas=(1 / w2_gs),
+    )
+
+    # Reference check:
+    a_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1)
+    ).to(torch.float32)
+    a_fp4, a_scale_interleaved = scaled_fp4_quant(a, a_global_scale)
+    _, m_k = a_fp4.shape
+    a_in_dtype = dequantize_nvfp4_to_dtype(
+        a_fp4,
+        a_scale_interleaved,
+        a_global_scale,
+        dtype=a.dtype,
+        device=a.device,
+        block_size=quant_blocksize,
+    )
+
+    w1_d = torch.empty((e, 2 * n, k), device="cuda", dtype=dtype)
+    w2_d = torch.empty((e, k, n), device="cuda", dtype=dtype)
+
+    for idx in range(0, e):
+        w1_d[idx] = dequantize_nvfp4_to_dtype(
+            w1_q[idx],
+            w1_blockscale[idx],
+            w1_gs[idx],
+            dtype=w1.dtype,
+            device=w1.device,
+            block_size=quant_blocksize,
+        )
+        w2_d[idx] = dequantize_nvfp4_to_dtype(
+            w2_q[idx],
+            w2_blockscale[idx],
+            w2_gs[idx],
+            dtype=w2.dtype,
+            device=w2.device,
+            block_size=quant_blocksize,
+        )
+
+    if flip_w13:
+        dim = -2
+        size = w1_d.size(dim)
+        assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
+        half = size // 2
+        # Reorder weight
+        w1, w3 = w1_d.split(half, dim=dim)
+        w1_d = torch.cat([w3, w1], dim=dim).contiguous()
+
+    torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk, None)
+
+    torch.testing.assert_close(torch_output, test_output, atol=1e-1, rtol=1e-1)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_cutlass_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def cutlass_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device=a.device,
+            num_experts=e,
+            intermediate_size_per_partition=n,  # n
+            hidden_size=k,
+        )  # k
+        return cutlass_moe_fp4(
+            a=a,
+            a1_gscale=a1_gs,
+            w1_fp4=w1_q,
+            w1_blockscale=w1_blockscale,
+            w1_alphas=w1_alphas,
+            a2_gscale=a2_gs,
+            w2_fp4=w2_q,
+            w2_blockscale=w2_blockscale,
+            w2_alphas=w2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=params,
+            apply_router_weight_on_input=False,
+        )
+
+    check_moe(m, n, k, e, topk, dtype, cutlass_moe_impl, flip_w13=False)
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [40, 64, 256])
+@pytest.mark.parametrize("topk", [1, 6, 8])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+@torch.inference_mode()
+def test_flashinfer_fp4_moe_no_graph(
+    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype
+):
+    def flashinfer_moe_impl(
+        a,
+        topk_weights,
+        topk_ids,
+        w1_q,
+        w2_q,
+        a1_gs,
+        w1_blockscale,
+        w1_alphas,
+        a2_gs,
+        w2_blockscale,
+        w2_alphas,
+    ):
+        return flashinfer_cutlass_fused_moe(
+            a,
+            topk_ids.to(torch.int),
+            topk_weights,
+            w1_q.view(torch.long),
+            w2_q.view(torch.long),
+            a.dtype,
+            quant_scales=[
+                a1_gs,
+                w1_blockscale.view(torch.int32),
+                w1_alphas,
+                a2_gs,
+                w2_blockscale.view(torch.int32),
+                w2_alphas,
+            ],
+        )[0]
+
+    check_moe(m, n, k, e, topk, dtype, flashinfer_moe_impl, flip_w13=True)
+
+
+if __name__ == "__main__":
+    test_cutlass_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
+    test_flashinfer_fp4_moe_no_graph(224, 1024, 1024, 256, 8, torch.half)
diff --git a/python/sglang/test/test_layernorm.py b/python/sglang/test/test_layernorm.py
new file mode 100644
index 000000000..05b6593eb
--- /dev/null
+++ b/python/sglang/test/test_layernorm.py
@@ -0,0 +1,113 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.layernorm import GemmaRMSNorm, RMSNorm
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestRMSNorm(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 4096]
+    HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+    ADD_RESIDUAL = [False, True]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_rms_norm_test(self, num_tokens, hidden_size, add_residual, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = RMSNorm(hidden_size).to(dtype=dtype)
+        layer.weight.data.normal_(mean=1.0, std=0.1)
+        scale = 1 / (2 * hidden_size)
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+        residual = torch.randn_like(x) * scale if add_residual else None
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x, residual)
+            out = layer(x, residual)
+
+        if add_residual:
+            self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2))
+            self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2))
+        else:
+            self.assertTrue(torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2))
+
+    def test_rms_norm(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.HIDDEN_SIZES,
+            self.ADD_RESIDUAL,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                hidden_size=params[1],
+                add_residual=params[2],
+                dtype=params[3],
+                seed=params[4],
+            ):
+                self._run_rms_norm_test(*params)
+
+
+class TestGemmaRMSNorm(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 4096]
+    HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199]
+    ADD_RESIDUAL = [False, True]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gemma_rms_norm_test(
+        self, num_tokens, hidden_size, add_residual, dtype, seed
+    ):
+        torch.manual_seed(seed)
+
+        layer = GemmaRMSNorm(hidden_size).to(dtype=dtype)
+        layer.weight.data.normal_(mean=1.0, std=0.1)
+        scale = 1 / (2 * hidden_size)
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype) * scale
+        residual = torch.randn_like(x) * scale if add_residual else None
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x, residual)
+            out = layer(x, residual)
+
+        if add_residual:
+            self.assertTrue(torch.allclose(out[0], ref_out[0], atol=1e-3, rtol=1e-3))
+            self.assertTrue(torch.allclose(out[1], ref_out[1], atol=1e-3, rtol=1e-3))
+        else:
+            self.assertTrue(torch.allclose(out, ref_out, atol=1e-3, rtol=1e-3))
+
+    def test_gemma_rms_norm(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.HIDDEN_SIZES,
+            self.ADD_RESIDUAL,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                hidden_size=params[1],
+                add_residual=params[2],
+                dtype=params[3],
+                seed=params[4],
+            ):
+                self._run_gemma_rms_norm_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/python/sglang/test/test_marlin_moe.py b/python/sglang/test/test_marlin_moe.py
new file mode 100644
index 000000000..77b0109df
--- /dev/null
+++ b/python/sglang/test/test_marlin_moe.py
@@ -0,0 +1,286 @@
+import types
+from typing import Optional
+
+import pytest
+import torch
+from sgl_kernel import fused_marlin_moe
+from sgl_kernel.scalar_type import ScalarType, scalar_types
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
+
+
+def stack_and_dev(tensors: list[torch.Tensor]):
+    dev = tensors[0].device
+    return torch.stack(tensors, dim=0).to(dev)
+
+
+def torch_experts(
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    quant_dtype: Optional[torch.dtype] = None,
+    apply_router_weights_on_input: bool = False,
+) -> torch.Tensor:
+    assert (
+        global_num_experts == -1
+        or (global_num_experts == w1.shape[0] and expert_map is None)
+        or (expert_map is not None and global_num_experts == expert_map.shape[0])
+    )
+
+    M, K = a.shape
+    topk = topk_ids.shape[1]
+    print("quant_dtype", quant_dtype)
+    # exit(0)
+    if apply_router_weights_on_input:
+        assert topk == 1
+        a = a * topk_weight.to(a.dtype)
+
+    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
+
+    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+
+    num_experts = w1.shape[0]
+
+    topk_ids = topk_ids.view(-1)
+    if expert_map is not None:
+        topk_ids = expert_map[topk_ids]
+
+    f32 = torch.float32
+
+    for i in range(num_experts):
+        mask = topk_ids == i
+        if mask.sum():
+            if quant_dtype is None:
+                tmp1 = a[mask] @ w1[i].transpose(0, 1)
+                tmp2 = SiluAndMul()(tmp1)
+                out[mask] = tmp2 @ w2[i].transpose(0, 1)
+
+    if apply_router_weights_on_input:
+        return out
+    else:
+        return (
+            (out.view(M, -1, w2.shape[1]).to(f32) * topk_weight.view(M, -1, 1))
+            .sum(dim=1)
+            .to(out.dtype)
+        )
+
+
+def torch_moe(
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    score: torch.Tensor,
+    topk: int,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    return torch_experts(
+        a, w1, w2, topk_weight, topk_ids, global_num_experts, expert_map
+    )
+
+
+def marlin_moe_generate_valid_test_cases():
+    import itertools
+
+    m_list = [1, 123, 666]
+    n_list = [128, 1024]
+    k_list = [256, 2048]
+    e_list = [4, 12]
+    topk_list = [2, 3]
+    dtype_list = [torch.half, torch.bfloat16]
+    group_size_list = [128]
+    act_order_list = [True, False]
+    quant_type_list = [
+        scalar_types.uint4,
+        scalar_types.uint4b8,
+    ]
+    is_k_full_list = [True, False]
+
+    all_combinations = itertools.product(
+        m_list,
+        n_list,
+        k_list,
+        e_list,
+        topk_list,
+        dtype_list,
+        group_size_list,
+        act_order_list,
+        quant_type_list,
+        is_k_full_list,
+    )
+
+    def is_invalid(
+        m, n, k, e, topk, dtype, group_size, act_order, quant_type, is_k_full
+    ):
+
+        # Filter act_order
+        if act_order:
+            if group_size in (-1, k, n):
+                return False
+            if quant_type not in [scalar_types.uint4b8]:
+                return False
+        elif not is_k_full:
+            return False
+
+        return True
+
+    cases = []
+    for case in all_combinations:
+        if is_invalid(*case):
+            cases.append(case)
+    return cases
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.parametrize(
+    ("m, n, k, e, topk, dtype, group_size," "act_order, quant_type, is_k_full"),
+    marlin_moe_generate_valid_test_cases(),
+)
+def test_fused_marlin_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    group_size: int,
+    act_order: bool,
+    quant_type: ScalarType,
+    is_k_full: bool,
+):
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA device not available")
+
+    torch.manual_seed(0)
+
+    has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
+
+    # Filter act_order
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size in (k, n):
+            return
+        if has_zp:
+            return
+    else:
+        if not is_k_full:
+            return
+
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 20
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 20
+
+    e_map = None
+
+    w_ref1_l = []
+    qweight1_l = []
+    scales1_l = []
+    zeros1_l = []
+    g_idx1_l = []
+    sort_indices1_l = []
+
+    for i in range(w1.shape[0]):
+        if has_zp:
+            w_ref1, qweight1, scales1, zeros1 = awq_marlin_quantize(
+                w1[i].transpose(1, 0), quant_type, group_size
+            )
+
+            w_ref1_l.append(w_ref1.T)
+            qweight1_l.append(qweight1)
+            scales1_l.append(scales1)
+            zeros1_l.append(zeros1)
+        else:
+            test_perm = torch.randperm(k)
+            w_ref1, qweight1, scales1, g_idx1, sort_indices1, _ = marlin_quantize(
+                w1[i].transpose(1, 0), quant_type, group_size, act_order, test_perm
+            )
+
+            w_ref1_l.append(w_ref1.T)
+            qweight1_l.append(qweight1)
+            scales1_l.append(scales1)
+            g_idx1_l.append(g_idx1)
+            sort_indices1_l.append(sort_indices1)
+
+    w_ref1 = stack_and_dev(w_ref1_l)
+    qweight1 = stack_and_dev(qweight1_l).contiguous()
+    scales1 = stack_and_dev(scales1_l)
+    g_idx1 = stack_and_dev(g_idx1_l) if g_idx1_l else None
+    zeros1 = stack_and_dev(zeros1_l) if zeros1_l else None
+    sort_indices1 = stack_and_dev(sort_indices1_l) if sort_indices1_l else None
+
+    w_ref2_l = []
+    qweight2_l = []
+    scales2_l = []
+    zeros2_l = []
+    g_idx2_l = []
+    sort_indices2_l = []
+
+    for i in range(w2.shape[0]):
+        if has_zp:
+            w_ref2, qweight2, scales2, zeros2 = awq_marlin_quantize(
+                w2[i].transpose(1, 0), quant_type, group_size
+            )
+
+            w_ref2_l.append(w_ref2.T)
+            qweight2_l.append(qweight2)
+            scales2_l.append(scales2)
+            zeros2_l.append(zeros2)
+        else:
+            test_perm = torch.randperm(n)
+            w_ref2, qweight2, scales2, g_idx2, sort_indices2, _ = marlin_quantize(
+                w2[i].transpose(1, 0), quant_type, group_size, act_order, test_perm
+            )
+
+            w_ref2_l.append(w_ref2.T)
+            qweight2_l.append(qweight2)
+            scales2_l.append(scales2)
+            g_idx2_l.append(g_idx2)
+            sort_indices2_l.append(sort_indices2)
+
+    w_ref2 = stack_and_dev(w_ref2_l)
+    qweight2 = stack_and_dev(qweight2_l).contiguous()
+    scales2 = stack_and_dev(scales2_l)
+    g_idx2 = stack_and_dev(g_idx2_l) if g_idx2_l else None
+    zeros2 = stack_and_dev(zeros2_l) if zeros2_l else None
+    sort_indices2 = stack_and_dev(sort_indices2_l) if sort_indices2_l else None
+
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    from sglang.srt.layers.moe.topk import fused_topk_torch_native
+
+    topk_weights, topk_ids = fused_topk_torch_native(a, score, topk, False)
+
+    torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map)
+
+    marlin_output = fused_marlin_moe(
+        a,
+        qweight1,
+        qweight2,
+        scales1,
+        scales2,
+        score,
+        topk_weights,
+        topk_ids,
+        g_idx1=g_idx1,
+        g_idx2=g_idx2,
+        sort_indices1=sort_indices1,
+        sort_indices2=sort_indices2,
+        w1_zeros=zeros1,
+        w2_zeros=zeros2,
+        num_bits=4,
+        is_k_full=is_k_full,
+    )
+
+    torch.testing.assert_close(marlin_output, torch_output, atol=5e-2, rtol=0)
+
+
+if __name__ == "__main__":
+    # Run the specific test function directly
+    pytest.main([__file__])
diff --git a/python/sglang/test/test_marlin_utils.py b/python/sglang/test/test_marlin_utils.py
new file mode 100644
index 000000000..0c0590077
--- /dev/null
+++ b/python/sglang/test/test_marlin_utils.py
@@ -0,0 +1,171 @@
+"""
+Adapted from
+https://github.com/vllm-project/vllm/blob/020f58abcdea65302225663130d08fd8f4dd755a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+"""
+
+# SPDX-License-Identifier: Apache-2.0
+"""Utility functions used for tests and benchmarks"""
+
+from typing import Optional
+
+import numpy as np
+import torch
+from sgl_kernel.scalar_type import ScalarType
+
+from sglang.srt.layers.quantization.marlin_utils import (
+    GPTQ_MARLIN_TILE,
+    marlin_permute_scales,
+    marlin_zero_points,
+)
+from sglang.srt.layers.quantization.utils import (
+    get_pack_factor,
+    gptq_quantize_weights,
+    quantize_weights,
+    sort_weights,
+)
+
+
+class MarlinWorkspace:
+
+    def __init__(self, out_features, min_thread_n, max_parallel):
+        assert (
+            out_features % min_thread_n == 0
+        ), "out_features = {} is undivisible by min_thread_n = {}".format(
+            out_features, min_thread_n
+        )
+
+        max_workspace_size = (out_features // min_thread_n) * max_parallel
+
+        self.scratch = torch.zeros(max_workspace_size, dtype=torch.int, device="cuda")
+
+
+def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
+    assert q_w.shape == (size_k, size_n)
+    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
+    assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"
+
+    # Permute weights to 16x64 marlin tiles
+    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
+    q_w = q_w.permute((0, 2, 1, 3))
+    q_w = q_w.reshape((size_k // tile, size_n * tile))
+
+    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)
+
+    return q_w
+
+
+def marlin_weights(q_w, size_k, size_n, num_bits, perm):
+    # Permute
+    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)
+
+    # Pack
+    pack_factor = get_pack_factor(num_bits)
+    orig_device = q_w.device
+
+    q_w = q_w.cpu().numpy().astype(np.uint32)
+
+    q_packed = np.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), dtype=np.uint32)
+    for i in range(pack_factor):
+        q_packed |= q_w[:, i::pack_factor] << num_bits * i
+
+    q_packed = torch.from_numpy(q_packed.astype(np.int32)).to(orig_device)
+
+    return q_packed
+
+
+def get_weight_perm(num_bits: int):
+    perm_list: list[int] = []
+    for i in range(32):
+        perm1: list[int] = []
+        col = i // 4
+        for block in [0, 1]:
+            for row in [
+                2 * (i % 4),
+                2 * (i % 4) + 1,
+                2 * (i % 4 + 4),
+                2 * (i % 4 + 4) + 1,
+            ]:
+                perm1.append(16 * row + col + 8 * block)
+        for j in range(4):
+            perm_list.extend([p + 256 * j for p in perm1])
+
+    perm = np.array(perm_list)
+
+    if num_bits == 4:
+        interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = np.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
+    perm = torch.from_numpy(perm)
+    return perm
+
+
+def marlin_quantize(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: int,
+    act_order: bool,
+    test_perm: Optional[torch.Tensor] = None,
+):
+    size_k, size_n = w.shape
+    num_bits = quant_type.size_bits
+
+    # Normalize group_size
+    if group_size == -1:
+        group_size = size_k
+    assert group_size <= size_k
+
+    # Quantize (and apply act_order if provided)
+    w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
+        w, quant_type, group_size, act_order, test_perm
+    )
+
+    # For act_order, sort the "weights" and "g_idx" so that group ids are
+    # increasing
+    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
+    if act_order:
+        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
+
+    # Reformat to marlin
+    weight_perm = get_weight_perm(num_bits)
+    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
+    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
+
+    # Create result
+    res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
+    for i in range(len(res_list)):
+        res_list[i] = res_list[i].to(w.device)
+
+    return res_list
+
+
+def awq_marlin_quantize(w: torch.Tensor, quant_type: ScalarType, group_size: int):
+    size_k, size_n = w.shape
+
+    # Normalize group_size
+    if group_size == -1:
+        group_size = size_k
+    assert group_size <= size_k
+
+    # Detect num groups
+    assert size_k % group_size == 0
+    num_groups = size_k // group_size
+
+    # Quantize with zp
+    w_ref, q_w, s, zp = quantize_weights(w, quant_type, group_size, zero_points=True)
+
+    # Reformat to marlin
+    weight_perm = get_weight_perm(quant_type.size_bits)
+    marlin_q_w = marlin_weights(q_w, size_k, size_n, quant_type.size_bits, weight_perm)
+    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
+    marlin_zp = marlin_zero_points(zp, num_groups, size_n, quant_type.size_bits)
+
+    # Create result
+    res_list = [w_ref, marlin_q_w, marlin_s, marlin_zp]
+    for i in range(len(res_list)):
+        res_list[i] = res_list[i].to(w.device)
+
+    return res_list
diff --git a/python/sglang/test/test_programs.py b/python/sglang/test/test_programs.py
new file mode 100644
index 000000000..6756f2dd7
--- /dev/null
+++ b/python/sglang/test/test_programs.py
@@ -0,0 +1,575 @@
+"""This file contains the SGL programs used for unit testing."""
+
+import json
+import re
+import time
+
+import numpy as np
+
+import sglang as sgl
+from sglang.utils import download_and_cache_file, read_jsonl
+
+
+def test_few_shot_qa():
+    @sgl.function
+    def few_shot_qa(s, question):
+        s += "The following are questions with answers.\n\n"
+        s += "Q: What is the capital of France?\n"
+        s += "A: Paris\n"
+        s += "Q: What is the capital of Germany?\n"
+        s += "A: Berlin\n"
+        s += "Q: What is the capital of Italy?\n"
+        s += "A: Rome\n"
+        s += "Q: " + question + "\n"
+        s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
+
+    ret = few_shot_qa.run(question="What is the capital of the United States?")
+    assert "washington" in ret["answer"].strip().lower(), f"answer: {ret['answer']}"
+
+    rets = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of Japan?"},
+            {"question": "What is the capital of the United Kingdom?"},
+            {"question": "What is the capital city of China?"},
+        ],
+        temperature=0.1,
+    )
+    answers = [x["answer"].strip().lower() for x in rets]
+    assert answers == ["tokyo", "london", "beijing"], f"answers: {answers}"
+
+
+def test_mt_bench():
+    @sgl.function
+    def answer_mt_bench(s, question_1, question_2):
+        s += sgl.system("You are a helpful assistant.")
+        s += sgl.user(question_1)
+        s += sgl.assistant(sgl.gen("answer_1"))
+        with s.user():
+            s += question_2
+        with s.assistant():
+            s += sgl.gen("answer_2")
+
+    question_1 = "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions."
+    question_2 = (
+        "Rewrite your previous response. Start every sentence with the letter A."
+    )
+    ret = answer_mt_bench.run(
+        question_1=question_1, question_2=question_2, temperature=0.7, max_new_tokens=64
+    )
+    assert len(ret.messages()) in [4, 5]
+
+
+def test_select(check_answer):
+    @sgl.function
+    def true_or_false(s, statement):
+        s += "Determine whether the statement below is True, False, or Unknown.\n"
+        s += "Statement: The capital of France is Pairs.\n"
+        s += "Answer: True\n"
+        s += "Statement: " + statement + "\n"
+        s += "Answer:" + sgl.select("answer", ["True", "False", "Unknown"])
+
+    ret = true_or_false.run(
+        statement="The capital of Germany is Berlin.",
+    )
+    if check_answer:
+        assert ret["answer"] == "True", ret.text()
+    else:
+        assert ret["answer"] in ["True", "False", "Unknown"]
+
+    ret = true_or_false.run(
+        statement="The capital of Canada is Tokyo.",
+    )
+    if check_answer:
+        assert ret["answer"] == "False", ret.text()
+    else:
+        assert ret["answer"] in ["True", "False", "Unknown"]
+
+    ret = true_or_false.run(
+        statement="Purple is a better color than green.",
+    )
+    if check_answer:
+        assert ret["answer"] == "Unknown", ret.text()
+    else:
+        assert ret["answer"] in ["True", "False", "Unknown"]
+
+
+def test_decode_int():
+    @sgl.function
+    def decode_int(s):
+        s += "The number of hours in a day is " + sgl.gen_int("hours") + "\n"
+        s += "The number of days in a year is " + sgl.gen_int("days") + "\n"
+
+    ret = decode_int.run(temperature=0.1)
+    assert int(ret["hours"]) == 24, ret.text()
+    assert int(ret["days"]) == 365, ret.text()
+
+
+def test_decode_json_regex():
+    @sgl.function
+    def decode_json(s):
+        from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
+
+        s += "Generate a JSON object to describe the basic city information of Paris.\n"
+        s += "Here are the JSON object:\n"
+
+        # NOTE: we recommend using dtype gen or whole regex string to control the output
+
+        with s.var_scope("json_output"):
+            s += "{\n"
+            s += '  "name": ' + sgl.gen(regex=REGEX_STR) + ",\n"
+            s += '  "population": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
+            s += '  "area": ' + sgl.gen(regex=REGEX_INT, stop=[" ", "\n"]) + ",\n"
+            s += '  "latitude": ' + sgl.gen(regex=REGEX_FLOAT, stop=[" ", "\n"]) + "\n"
+            s += "}"
+
+    ret = decode_json.run(temperature=0.0)
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print("JSONDecodeError", ret["json_output"])
+        raise
+    assert isinstance(js_obj["name"], str)
+    assert isinstance(js_obj["population"], int)
+
+
+def test_decode_json():
+    @sgl.function
+    def decode_json(s):
+        s += "Generate a JSON object to describe the basic city information of Paris.\n"
+
+        with s.var_scope("json_output"):
+            s += "{\n"
+            s += '  "name": ' + sgl.gen_string() + ",\n"
+            s += '  "population": ' + sgl.gen_int() + ",\n"
+            s += '  "area": ' + sgl.gen(dtype=int) + ",\n"
+            s += '  "country": ' + sgl.gen_string() + ",\n"
+            s += '  "timezone": ' + sgl.gen(dtype=str) + "\n"
+            s += "}"
+
+    ret = decode_json.run(max_new_tokens=64)
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print("JSONDecodeError", ret["json_output"])
+        raise
+    assert isinstance(js_obj["name"], str)
+    assert isinstance(js_obj["population"], int)
+
+
+def test_expert_answer(check_answer=True):
+    @sgl.function
+    def expert_answer(s, question):
+        s += "Question: " + question + "\n"
+        s += (
+            "A good person to answer this question is"
+            + sgl.gen("expert", stop=[".", "\n"])
+            + ".\n"
+        )
+        s += (
+            "For example,"
+            + s["expert"]
+            + " would answer that "
+            + sgl.gen("answer", stop=".")
+            + "."
+        )
+
+    ret = expert_answer.run(question="What is the capital of France?", temperature=0.1)
+
+    if check_answer:
+        assert "paris" in ret.text().lower(), f"Answer: {ret.text()}"
+
+
+def test_tool_use():
+    def calculate(expression):
+        return f"{eval(expression)}"
+
+    @sgl.function
+    def tool_use(s, lhs, rhs):
+        s += "Please perform computations using a calculator. You can use calculate(expression) to get the results.\n"
+        s += "For example,\ncalculate(1+2)=3\ncalculate(3*4)=12\n"
+        s += "Question: What is the product of " + str(lhs) + " and " + str(rhs) + "?\n"
+        s += (
+            "Answer: The answer is calculate("
+            + sgl.gen("expression", stop=")")
+            + ") = "
+        )
+        with s.var_scope("answer"):
+            s += calculate(s["expression"])
+
+    lhs, rhs = 257, 983
+    ret = tool_use(lhs=lhs, rhs=rhs, temperature=0)
+    assert int(ret["answer"]) == lhs * rhs
+
+
+def test_react():
+    @sgl.function
+    def react(s, question):
+        s += """
+Question: Which country does the founder of Microsoft live in?
+Thought 1: I need to search for the founder of Microsoft.
+Action 1: Search [Founder of Microsoft].
+Observation 1: The founder of Microsoft is Bill Gates.
+Thought 2: I need to search for the country where Bill Gates lives in.
+Action 2: Search [Where does Bill Gates live].
+Observation 2: Bill Gates lives in the United States.
+Thought 3: The answer is the United States.
+Action 3: Finish [United States].\n
+"""
+
+        s += "Question: " + question + "\n"
+
+        for i in range(1, 5):
+            s += f"Thought {i}:" + sgl.gen(stop=[".", "\n"]) + ".\n"
+            s += f"Action {i}: " + sgl.select(f"action_{i}", ["Search", "Finish"])
+
+            if s[f"action_{i}"] == "Search":
+                s += " [" + sgl.gen(stop="]") + "].\n"
+                s += f"Observation {i}:" + sgl.gen(stop=[".", "\n"]) + ".\n"
+            else:
+                s += " [" + sgl.gen("answer", stop="]") + "].\n"
+                break
+
+    ret = react.run(
+        question="What country does the creator of Linux live in?",
+        temperature=0.1,
+    )
+    answer = ret["answer"].lower()
+    assert "finland" in answer or "states" in answer
+
+
+def test_parallel_decoding():
+    max_tokens = 64
+    fork_size = 5
+
+    @sgl.function
+    def parallel_decoding(s, topic):
+        s += "Act as a helpful assistant.\n"
+        s += "USER: Give some tips for " + topic + ".\n"
+        s += (
+            "ASSISTANT: Okay. Here are "
+            + str(fork_size)
+            + " concise tips, each under 8 words:\n"
+        )
+
+        # Generate skeleton
+        for i in range(1, 1 + fork_size):
+            s += f"{i}." + sgl.gen(max_tokens=16, stop=[".", "\n"]) + ".\n"
+
+        # Generate detailed tips
+        forks = s.fork(fork_size)
+        for i in range(fork_size):
+            forks[
+                i
+            ] += f"Now, I expand tip {i+1} into a detailed paragraph:\nTip {i+1}:"
+            forks[i] += sgl.gen("detailed_tip", max_tokens, stop=["\n\n"])
+        forks.join()
+
+        # Concatenate tips and summarize
+        s += "Here are these tips with detailed explanation:\n"
+        for i in range(fork_size):
+            s += f"Tip {i+1}:" + forks[i]["detailed_tip"] + "\n"
+
+        s += "\nIn summary," + sgl.gen("summary", max_tokens=512)
+
+    ret = parallel_decoding.run(topic="writing a good blog post", temperature=0.3)
+    assert isinstance(ret["summary"], str)
+
+
+def test_parallel_encoding(check_answer=True):
+    max_tokens = 64
+
+    @sgl.function
+    def parallel_encoding(s, question, context_0, context_1, context_2):
+        s += "USER: I will ask a question based on some statements.\n"
+        s += "ASSISTANT: Sure. I will give the answer.\n"
+        s += "USER: Please memorize these statements.\n"
+
+        contexts = [context_0, context_1, context_2]
+
+        forks = s.fork(len(contexts))
+        forks += lambda i: f"Statement {i}: " + contexts[i] + "\n"
+        forks.join(mode="concate_and_append")
+
+        s += "Now, please answer the following question. " "Do not list options."
+        s += "\nQuestion: " + question + "\n"
+        s += "ASSISTANT:" + sgl.gen("answer", max_tokens=max_tokens)
+
+    ret = parallel_encoding.run(
+        question="Who is the father of Julian?",
+        context_0="Ethan is the father of Liam.",
+        context_1="Noah is the father of Julian.",
+        context_2="Oliver is the father of Carlos.",
+        temperature=0,
+    )
+    answer = ret["answer"]
+
+    if check_answer:
+        assert "Noah" in answer
+
+
+def test_image_qa():
+    @sgl.function
+    def image_qa(s, question):
+        s += sgl.user(sgl.image("example_image.png") + question)
+        s += sgl.assistant(sgl.gen("answer"))
+
+    state = image_qa.run(
+        question="Please describe this image in simple words.",
+        temperature=0,
+        max_new_tokens=64,
+    )
+
+    assert (
+        "taxi" in state.messages()[-1]["content"]
+        or "car" in state.messages()[-1]["content"]
+    ), f"{state.messages()[-1]['content']}"
+
+
+def test_stream():
+    @sgl.function
+    def qa(s, question):
+        s += sgl.system("You are a helpful assistant.")
+        s += sgl.user(question)
+        s += sgl.assistant(sgl.gen("answer"))
+
+    ret = qa(
+        question="Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.",
+        stream=True,
+    )
+    out = ""
+    for chunk in ret.text_iter():
+        out += chunk
+
+    ret = qa(
+        question="Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.",
+        stream=True,
+    )
+    out = ""
+    for chunk in ret.text_iter("answer"):
+        out += chunk
+
+
+def test_regex():
+    regex = r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
+
+    @sgl.function
+    def regex_gen(s):
+        s += "Q: What is the IP address of the Google DNS servers?\n"
+        s += "A: " + sgl.gen(
+            "answer",
+            temperature=0,
+            regex=regex,
+        )
+
+    state = regex_gen.run()
+    answer = state["answer"]
+    assert re.match(regex, answer)
+
+
+def test_dtype_gen():
+    @sgl.function
+    def dtype_gen(s):
+        s += "Q: What is the full name of DNS?\n"
+        s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
+        s += "Q: Which year was DNS invented?\n"
+        s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
+        s += "Q: What is the value of pi?\n"
+        s += "A: " + sgl.gen("float_res", dtype=float) + "\n"
+        s += "Q: Is the sky blue?\n"
+        s += "A: " + sgl.gen("bool_res", dtype=bool) + "\n"
+
+    state = dtype_gen.run()
+
+    try:
+        state["int_res"] = int(state["int_res"])
+        state["float_res"] = float(state["float_res"])
+        state["bool_res"] = bool(state["bool_res"])
+        # assert state["str_res"].startswith('"') and state["str_res"].endswith('"')
+    except ValueError:
+        print(state)
+        raise
+
+
+def test_completion_speculative():
+    @sgl.function(num_api_spec_tokens=64)
+    def gen_character_spec(s):
+        s += "Construct a character within the following format:\n"
+        s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+        s += "\nPlease generate new Name, Birthday and Job.\n"
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
+        s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
+
+    @sgl.function
+    def gen_character_no_spec(s):
+        s += "Construct a character within the following format:\n"
+        s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+        s += "\nPlease generate new Name, Birthday and Job.\n"
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
+        s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
+
+    token_usage = sgl.global_config.default_backend.token_usage
+
+    token_usage.reset()
+    gen_character_spec().sync()
+    usage_with_spec = token_usage.prompt_tokens
+
+    token_usage.reset()
+    gen_character_no_spec().sync()
+    usage_with_no_spec = token_usage.prompt_tokens
+
+    assert (
+        usage_with_spec < usage_with_no_spec
+    ), f"{usage_with_spec} vs {usage_with_no_spec}"
+
+
+def test_chat_completion_speculative():
+    @sgl.function(num_api_spec_tokens=256)
+    def gen_character_spec(s):
+        s += sgl.system("You are a helpful assistant.")
+        s += sgl.user("Construct a character within the following format:")
+        s += sgl.assistant(
+            "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+        )
+        s += sgl.user("Please generate new Name, Birthday and Job.\n")
+        s += sgl.assistant(
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+            + "\nJob:"
+            + sgl.gen("job", stop="\n")
+        )
+
+    gen_character_spec().sync()
+
+
+def test_hellaswag_select():
+    """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
+
+    def get_one_example(lines, i, include_answer):
+        ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
+        if include_answer:
+            ret += lines[i]["endings"][lines[i]["label"]]
+        return ret
+
+    def get_few_shot_examples(lines, k):
+        ret = ""
+        for i in range(k):
+            ret += get_one_example(lines, i, True) + "\n\n"
+        return ret
+
+    # Read data
+    url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+    filename = download_and_cache_file(url)
+    lines = list(read_jsonl(filename))
+
+    # Construct prompts
+    num_questions = 200
+    num_shots = 20
+    few_shot_examples = get_few_shot_examples(lines, num_shots)
+
+    questions = []
+    choices = []
+    labels = []
+    for i in range(len(lines[:num_questions])):
+        questions.append(get_one_example(lines, i, False))
+        choices.append(lines[i]["endings"])
+        labels.append(lines[i]["label"])
+    arguments = [{"question": q, "choices": c} for q, c in zip(questions, choices)]
+
+    #####################################
+    ######### SGL Program Begin #########
+    #####################################
+
+    import sglang as sgl
+
+    @sgl.function
+    def few_shot_hellaswag(s, question, choices):
+        s += few_shot_examples + question
+        s += sgl.select("answer", choices=choices)
+
+    #####################################
+    ########## SGL Program End ##########
+    #####################################
+
+    # Run requests
+    tic = time.perf_counter()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+        generator_style=False,
+    )
+    preds = []
+    for i, ret in enumerate(rets):
+        preds.append(choices[i].index(ret["answer"]))
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    accuracy = np.mean(np.array(preds) == np.array(labels))
+
+    # Test generator style of run_batch
+    tic = time.perf_counter()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+        generator_style=True,
+    )
+    preds_gen = []
+    for i, ret in enumerate(rets):
+        preds_gen.append(choices[i].index(ret["answer"]))
+    latency_gen = time.perf_counter() - tic
+
+    # Compute accuracy
+    accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
+    print(f"{accuracy=}, {accuracy_gen=}")
+    assert np.abs(accuracy_gen - accuracy) < 0.1
+    assert np.abs(latency_gen - latency) < 1
+
+    return accuracy, latency
+
+
+def test_gen_min_new_tokens():
+    """
+    Validate sgl.gen(min_tokens) functionality.
+
+    The test asks a question where, without a min_tokens constraint, the generated answer is expected to be short.
+    By enforcing the min_tokens parameter, we ensure the generated answer has at least the specified number of tokens.
+    We verify that the number of tokens in the answer is >= the min_tokens threshold.
+    """
+    import sglang as sgl
+    from sglang.srt.hf_transformers_utils import get_tokenizer
+
+    model_path = sgl.global_config.default_backend.endpoint.get_model_name()
+    MIN_TOKENS, MAX_TOKENS = 64, 128
+
+    @sgl.function
+    def convo_1(s):
+        s += sgl.user("What is the capital of the United States?")
+        s += sgl.assistant(
+            sgl.gen("answer", min_tokens=MIN_TOKENS, max_tokens=MAX_TOKENS)
+        )
+
+    def assert_min_tokens(tokenizer, text):
+        token_ids = tokenizer.encode(text)
+        assert (
+            len(token_ids) >= MIN_TOKENS
+        ), f"Generated {len(token_ids)} tokens, min required: {MIN_TOKENS}. Text: {text}"
+
+    tokenizer = get_tokenizer(model_path)
+
+    state = convo_1.run()
+    assert_min_tokens(tokenizer, state["answer"])
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
new file mode 100644
index 000000000..0d3d769f4
--- /dev/null
+++ b/python/sglang/test/test_utils.py
@@ -0,0 +1,1430 @@
+"""Common utilities for testing and benchmarking"""
+
+import argparse
+import asyncio
+import copy
+import json
+import logging
+import os
+import random
+import re
+import subprocess
+import threading
+import time
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from functools import partial
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Awaitable, Callable, List, Optional, Tuple
+
+import aiohttp
+import numpy as np
+import requests
+import torch
+import torch.nn.functional as F
+
+from sglang.bench_serving import run_benchmark
+from sglang.global_config import global_config
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_device,
+    is_port_available,
+    kill_process_tree,
+    retry,
+)
+from sglang.test.run_eval import run_eval
+from sglang.utils import get_exception_traceback
+
+# General test models
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
+
+# MLA test models
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST = "cross-encoder/ms-marco-MiniLM-L6-v2"
+DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
+
+# FP8 models
+DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+)
+DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
+    "nvidia/Llama-3.1-8B-Instruct-FP8"
+)
+DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
+
+# W8A8 models
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
+
+# EAGLE
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
+DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
+    "meta-llama/Llama-3.1-8B-Instruct"
+)
+DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+
+# Other use cases
+DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+)
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-V3-0324"
+DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
+    "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
+)
+DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
+DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST = "Barrrrry/DeepSeek-R1-W4AFP8"
+
+# Nightly tests
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,zai-org/GLM-4.5-Air-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
+DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
+
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+
+def is_in_ci():
+    """Return whether it is in CI runner."""
+    return get_bool_env_var("SGLANG_IS_IN_CI")
+
+
+def is_in_amd_ci():
+    """Return whether it is in an AMD CI runner."""
+    return get_bool_env_var("SGLANG_AMD_CI")
+
+
+def _use_cached_default_models(model_repo: str):
+    cache_dir = os.getenv("DEFAULT_MODEL_CACHE_DIR")
+    if cache_dir and model_repo:
+        model_path = os.path.join(cache_dir, model_repo)
+        if os.path.isdir(model_path):
+            return os.path.abspath(model_path)
+    return ""
+
+
+if is_in_ci():
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+        5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+    )
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+        7000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+    )
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+if is_in_amd_ci():
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 3000
+
+
+def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
+    assert url is not None
+
+    data = {
+        "inputs": prompt,
+        "parameters": {
+            "temperature": temperature,
+            "max_new_tokens": max_tokens,
+            "stop_sequences": stop,
+        },
+    }
+    res = requests.post(url, json=data)
+    assert res.status_code == 200
+    pred = res.json()["generated_text"][0]
+    return pred
+
+
+def find_available_port(base_port: int):
+    port = base_port + random.randint(100, 1000)
+    while True:
+        if is_port_available(port):
+            return port
+        if port < 60000:
+            port += 42
+        else:
+            port -= 43
+
+
+def call_generate_vllm(prompt, temperature, max_tokens, stop=None, n=1, url=None):
+    assert url is not None
+
+    data = {
+        "prompt": prompt,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        "stop": stop,
+        "n": n,
+    }
+    res = requests.post(url, json=data)
+    assert res.status_code == 200
+    if n == 1:
+        pred = res.json()["text"][0][len(prompt) :]
+    else:
+        pred = [x[len(prompt) :] for x in res.json()["text"]]
+    return pred
+
+
+def call_generate_outlines(
+    prompt, temperature, max_tokens, stop=None, regex=None, n=1, url=None
+):
+    assert url is not None
+
+    data = {
+        "prompt": prompt,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        "stop": stop,
+        "regex": regex,
+        "n": n,
+    }
+    res = requests.post(url, json=data)
+    assert res.status_code == 200
+    if n == 1:
+        pred = res.json()["text"][0][len(prompt) :]
+    else:
+        pred = [x[len(prompt) :] for x in res.json()["text"]]
+    return pred
+
+
+def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
+    assert url is not None
+
+    data = {
+        "text": prompt,
+        "sampling_params": {
+            "temperature": temperature,
+            "max_new_tokens": max_tokens,
+            "stop": stop,
+        },
+    }
+    res = requests.post(url, json=data)
+    assert res.status_code == 200
+    obj = res.json()
+    pred = obj["text"]
+    return pred
+
+
+def call_generate_guidance(
+    prompt, temperature, max_tokens, stop=None, n=1, regex=None, model=None
+):
+    assert model is not None
+    from guidance import gen
+
+    rets = []
+    for _ in range(n):
+        out = (
+            model
+            + prompt
+            + gen(
+                name="answer",
+                max_tokens=max_tokens,
+                temperature=temperature,
+                stop=stop,
+                regex=regex,
+            )
+        )
+        rets.append(out["answer"])
+    return rets if n > 1 else rets[0]
+
+
+def call_select_lightllm(context, choices, url=None):
+    assert url is not None
+
+    scores = []
+    for i in range(len(choices)):
+        data = {
+            "inputs": context + choices[i],
+            "parameters": {
+                "max_new_tokens": 1,
+            },
+        }
+        res = requests.post(url, json=data)
+        assert res.status_code == 200
+        scores.append(0)
+    return np.argmax(scores)
+
+
+def call_select_vllm(context, choices, url=None):
+    assert url is not None
+
+    scores = []
+    for i in range(len(choices)):
+        data = {
+            "prompt": context + choices[i],
+            "max_tokens": 1,
+            "prompt_logprobs": 1,
+        }
+        res = requests.post(url, json=data)
+        assert res.status_code == 200
+        scores.append(res.json().get("prompt_score", 0))
+    return np.argmax(scores)
+
+    """
+    Modify vllm/entrypoints/api_server.py
+
+    if final_output.prompt_logprobs is not None:
+        score = np.mean([prob[t_id] for t_id, prob in zip(final_output.prompt_token_ids[1:], final_output.prompt_logprobs[1:])])
+        ret["prompt_score"] = score
+    """
+
+
+def call_select_guidance(context, choices, model=None):
+    assert model is not None
+    from guidance import select
+
+    out = model + context + select(choices, name="answer")
+    return choices.index(out["answer"])
+
+
+def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
+    parser.add_argument("--parallel", type=int, default=64)
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=None)
+    parser.add_argument(
+        "--backend",
+        type=str,
+        required=True,
+        choices=[
+            "vllm",
+            "outlines",
+            "lightllm",
+            "gserver",
+            "guidance",
+            "srt-raw",
+            "llama.cpp",
+        ],
+    )
+    parser.add_argument("--n-ctx", type=int, default=4096)
+    parser.add_argument(
+        "--model-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
+    )
+    parser.add_argument("--result-file", type=str, default="result.jsonl")
+    args = parser.parse_args()
+
+    if args.port is None:
+        default_port = {
+            "vllm": 21000,
+            "outlines": 21000,
+            "lightllm": 22000,
+            "srt-raw": 30000,
+            "gserver": 9988,
+        }
+        args.port = default_port.get(args.backend, None)
+    return args
+
+
+def auto_config_device() -> str:
+    """Auto-config available device platform"""
+
+    try:
+        device = get_device()
+    except (RuntimeError, ImportError) as e:
+        print(f"Warning: {e} - Falling back to CPU")
+        device = "cpu"
+
+    return device
+
+
+def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
+    parser.add_argument("--parallel", type=int, default=64)
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    parser.add_argument("--backend", type=str, default="srt")
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        choices=["auto", "cuda", "rocm", "cpu"],
+        help="Device type (auto/cuda/rocm/cpu). Auto will detect available platforms",
+    )
+    parser.add_argument("--result-file", type=str, default="result.jsonl")
+    parser.add_argument("--raw-result-file", type=str)
+    args = parser.parse_args()
+
+    return args
+
+
+def select_sglang_backend(args: argparse.Namespace):
+    from sglang.lang.backend.openai import OpenAI
+    from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+
+    if args.backend.startswith("srt"):
+        if args.backend == "srt-no-parallel":
+            global_config.enable_parallel_encoding = False
+        backend = RuntimeEndpoint(f"{args.host}:{args.port}")
+    elif args.backend.startswith("gpt-"):
+        backend = OpenAI(args.backend)
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+    return backend
+
+
+def _get_call_generate(args: argparse.Namespace):
+    if args.backend == "lightllm":
+        return partial(call_generate_lightllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "vllm":
+        return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "srt-raw":
+        return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
+    elif args.backend == "outlines":
+        return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "guidance":
+        from guidance import models
+
+        model = models.LlamaCpp(args.model_path, n_gpu_layers=-1, n_ctx=args.n_ctx)
+        call_generate = partial(call_generate_guidance, model=model)
+        call_generate("Hello,", 1.0, 8, ".")
+        return call_generate
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+
+def _get_call_select(args: argparse.Namespace):
+    if args.backend == "lightllm":
+        return partial(call_select_lightllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "vllm":
+        return partial(call_select_vllm, url=f"{args.host}:{args.port}/generate")
+    elif args.backend == "guidance":
+        from guidance import models
+
+        model = models.LlamaCpp(args.model_path, n_gpu_layers=-1, n_ctx=args.n_ctx)
+        call_select = partial(call_select_guidance, model=model)
+
+        call_select("Hello,", ["world", "earth"])
+        return call_select
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+
+def get_call_generate(args: argparse.Namespace):
+    call_generate = _get_call_generate(args)
+
+    def func(*args, **kwargs):
+        try:
+            return call_generate(*args, **kwargs)
+        except Exception:
+            print("Exception in call_generate:\n" + get_exception_traceback())
+            raise
+
+    return func
+
+
+def get_call_select(args: argparse.Namespace):
+    call_select = _get_call_select(args)
+
+    def func(*args, **kwargs):
+        try:
+            return call_select(*args, **kwargs)
+        except Exception:
+            print("Exception in call_select:\n" + get_exception_traceback())
+            raise
+
+    return func
+
+
+def _get_default_models():
+    import inspect
+
+    current_module = inspect.getmodule(_get_default_models)
+    default_models = set()
+    for name, value in current_module.__dict__.items():
+        if (
+            isinstance(name, str)
+            and "DEFAULT_" in name
+            and "MODEL_" in name
+            and isinstance(value, str)
+        ):
+            if "," in value:
+                parts = [part.strip() for part in value.split(",")]
+                default_models.update(parts)
+            else:
+                default_models.add(value.strip())
+    return json.dumps(list(default_models))
+
+
+def try_cached_model(model_repo: str):
+    model_dir = _use_cached_default_models(model_repo)
+    return model_dir if model_dir else model_repo
+
+
+def popen_with_error_check(command: list[str], allow_exit: bool = False):
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    def _run_and_check():
+        stdout, stderr = process.communicate()
+
+        while process.poll() is None:
+            time.sleep(5)
+
+        if not allow_exit or process.returncode != 0:
+            raise Exception(
+                f"{command} exited with code {process.returncode}\n{stdout=}\n{stderr=}"
+            )
+
+    t = threading.Thread(target=_run_and_check)
+    t.start()
+    return process
+
+
+def popen_launch_server(
+    model: str,
+    base_url: str,
+    timeout: float,
+    api_key: Optional[str] = None,
+    other_args: list[str] = [],
+    env: Optional[dict] = None,
+    return_stdout_stderr: Optional[tuple] = None,
+    device: str = "auto",
+    pd_separated: bool = False,
+):
+    """Launch a server process with automatic device detection.
+
+    Args:
+        device: Device type ("auto", "cuda", "rocm" or "cpu").
+                If "auto", will detect available platforms automatically.
+    """
+    # Auto-detect device if needed
+    if device == "auto":
+        device = auto_config_device()
+        print(f"Auto-configed device: {device}", flush=True)
+        other_args = list(other_args)
+        other_args += ["--device", str(device)]
+
+    _, host, port = base_url.split(":")
+    host = host[2:]
+
+    if pd_separated:
+        command = "sglang.launch_pd_server"
+    else:
+        command = "sglang.launch_server"
+
+    command = [
+        "python3",
+        "-m",
+        command,
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    if pd_separated:
+        command.extend(
+            [
+                "--lb-host",
+                host,
+                "--lb-port",
+                port,
+            ]
+        )
+    else:
+        command.extend(
+            [
+                "--host",
+                host,
+                "--port",
+                port,
+            ]
+        )
+
+    if api_key:
+        command += ["--api-key", api_key]
+
+    print(f"command={' '.join(command)}")
+
+    if return_stdout_stderr:
+        process = subprocess.Popen(
+            command,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
+            env=env,
+            text=True,
+        )
+    else:
+        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
+
+    start_time = time.perf_counter()
+    with requests.Session() as session:
+        while time.perf_counter() - start_time < timeout:
+
+            return_code = process.poll()
+            if return_code is not None:
+                # Server failed to start (non-zero exit code) or crashed
+                raise Exception(
+                    f"Server process exited with code {return_code}. "
+                    "Check server logs for errors."
+                )
+
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health_generate",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
+                pass
+
+            return_code = process.poll()
+            if return_code is not None:
+                raise Exception(
+                    f"Server unexpectedly exits ({return_code=}). Usually there will be error logs describing the cause far above this line."
+                )
+
+            time.sleep(10)
+
+    kill_process_tree(process.pid)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def popen_launch_pd_server(
+    model: str,
+    base_url: str,
+    timeout: float,
+    api_key: Optional[str] = None,
+    other_args: list[str] = (),
+    env: Optional[dict] = None,
+):
+    _, host, port = base_url.split(":")
+    host = host[2:]
+
+    command = "sglang.launch_server"
+
+    command = [
+        "python3",
+        "-m",
+        command,
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    command.extend(
+        [
+            "--host",
+            host,
+            "--port",
+            port,
+        ]
+    )
+
+    if api_key:
+        command += ["--api-key", api_key]
+
+    print(f"command={' '.join(command)}")
+
+    process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
+
+    return process
+
+
+def run_with_timeout(
+    func: Callable,
+    args: tuple = (),
+    kwargs: Optional[dict] = None,
+    timeout: float = None,
+):
+    """Run a function with timeout."""
+    ret_value = []
+
+    def _target_func():
+        ret_value.append(func(*args, **(kwargs or {})))
+
+    t = threading.Thread(target=_target_func)
+    t.start()
+    t.join(timeout=timeout)
+    if t.is_alive():
+        raise TimeoutError()
+
+    if not ret_value:
+        raise RuntimeError()
+
+    return ret_value[0]
+
+
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
+def run_unittest_files(files: List[TestFile], timeout_per_file: float):
+    tic = time.perf_counter()
+    success = True
+
+    for i, file in enumerate(files):
+        filename, estimated_time = file.name, file.estimated_time
+        process = None
+
+        def run_one_file(filename):
+            nonlocal process
+
+            filename = os.path.join(os.getcwd(), filename)
+            print(
+                f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
+                flush=True,
+            )
+            tic = time.perf_counter()
+
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            elapsed = time.perf_counter() - tic
+
+            print(
+                f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+                flush=True,
+            )
+            return process.returncode
+
+        try:
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert (
+                ret_code == 0
+            ), f"expected return code 0, but {filename} returned {ret_code}"
+        except TimeoutError:
+            kill_process_tree(process.pid)
+            time.sleep(5)
+            print(
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
+            )
+            success = False
+            break
+
+    if success:
+        print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
+    else:
+        print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
+
+    return 0 if success else -1
+
+
+def get_similarities(vec1, vec2):
+    return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
+
+
+def get_benchmark_args(
+    base_url="",
+    dataset_name="",
+    dataset_path="",
+    tokenizer="",
+    num_prompts=500,
+    sharegpt_output_len=None,
+    random_input_len=4096,
+    random_output_len=2048,
+    sharegpt_context_len=None,
+    request_rate=float("inf"),
+    disable_stream=False,
+    disable_ignore_eos=False,
+    seed: int = 0,
+    device="auto",
+    pd_separated: bool = False,
+    lora_name=None,
+):
+    return SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name=dataset_name,
+        dataset_path=dataset_path,
+        model=None,
+        tokenizer=tokenizer,
+        num_prompts=num_prompts,
+        sharegpt_output_len=sharegpt_output_len,
+        sharegpt_context_len=sharegpt_context_len,
+        random_input_len=random_input_len,
+        random_output_len=random_output_len,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=disable_stream,
+        return_logprob=False,
+        seed=seed,
+        disable_ignore_eos=disable_ignore_eos,
+        extra_request_body=None,
+        apply_chat_template=False,
+        profile=None,
+        lora_name=lora_name,
+        prompt_suffix="",
+        device=device,
+        pd_separated=pd_separated,
+    )
+
+
+def run_bench_serving(
+    model,
+    num_prompts,
+    request_rate,
+    other_server_args,
+    dataset_name="random",
+    dataset_path="",
+    tokenizer=None,
+    random_input_len=4096,
+    random_output_len=2048,
+    sharegpt_context_len=None,
+    disable_stream=False,
+    disable_ignore_eos=False,
+    need_warmup=False,
+    seed: int = 0,
+    device="auto",
+    background_task: Optional[Callable[[str, asyncio.Event], Awaitable[None]]] = None,
+    lora_name: Optional[str] = None,
+):
+    if device == "auto":
+        device = auto_config_device()
+    # Launch the server
+    base_url = DEFAULT_URL_FOR_TEST
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # Run benchmark
+    args = get_benchmark_args(
+        base_url=base_url,
+        dataset_name=dataset_name,
+        dataset_path=dataset_path,
+        tokenizer=tokenizer,
+        num_prompts=num_prompts,
+        random_input_len=random_input_len,
+        random_output_len=random_output_len,
+        sharegpt_context_len=sharegpt_context_len,
+        request_rate=request_rate,
+        disable_stream=disable_stream,
+        disable_ignore_eos=disable_ignore_eos,
+        seed=seed,
+        device=device,
+        lora_name=lora_name,
+    )
+
+    async def _run():
+        if need_warmup:
+            warmup_args = copy.deepcopy(args)
+            warmup_args.num_prompts = 16
+            await asyncio.to_thread(run_benchmark, warmup_args)
+
+        start_event = asyncio.Event()
+        stop_event = asyncio.Event()
+        task_handle = (
+            asyncio.create_task(background_task(base_url, start_event, stop_event))
+            if background_task
+            else None
+        )
+
+        try:
+            start_event.set()
+            result = await asyncio.to_thread(run_benchmark, args)
+        finally:
+            if task_handle:
+                stop_event.set()
+                await task_handle
+
+        return result
+
+    try:
+        res = asyncio.run(_run())
+    finally:
+        kill_process_tree(process.pid)
+
+    assert res["completed"] == num_prompts
+    return res
+
+
+def run_bench_serving_multi(
+    model,
+    base_url,
+    other_server_args,
+    benchmark_args,
+    need_warmup=False,
+    pd_separated=False,
+):
+    # Launch the server
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+        pd_separated=pd_separated,
+    )
+
+    # run benchmark for all
+    res_l = []
+    try:
+        for args in benchmark_args:
+            if need_warmup:
+                warmup_args = copy.deepcopy(args)
+                warmup_args.num_prompts = 16
+                run_benchmark(warmup_args)
+
+            res = run_benchmark(args)
+            res_l.append((args, res))
+    finally:
+        kill_process_tree(process.pid)
+
+    return res_l
+
+
+def run_bench_one_batch(model, other_args):
+    """Launch a offline process with automatic device detection.
+
+    Args:
+        device: Device type ("auto", "cuda", "rocm" or "cpu").
+                If "auto", will detect available platforms automatically.
+    """
+    # Auto-detect device if needed
+
+    device = auto_config_device()
+    print(f"Auto-configed device: {device}", flush=True)
+    other_args += ["--device", str(device)]
+
+    command = [
+        "python3",
+        "-m",
+        "sglang.bench_one_batch",
+        "--batch-size",
+        "1",
+        "--input",
+        "128",
+        "--output",
+        "8",
+        *[str(x) for x in other_args],
+    ]
+    if model is not None:
+        command += ["--model-path", model]
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    try:
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}", flush=True)
+        print(f"Error: {error}", flush=True)
+
+        # Return prefill_latency, decode_throughput, decode_latency
+        prefill_line = output.split("\n")[-9]
+        decode_line = output.split("\n")[-3]
+        pattern = (
+            r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
+        )
+        match = re.search(pattern, prefill_line)
+        if match:
+            prefill_latency = float(match.group("latency"))
+        match = re.search(pattern, decode_line)
+        if match:
+            decode_latency = float(match.group("latency"))
+            decode_throughput = float(match.group("throughput"))
+    finally:
+        kill_process_tree(process.pid)
+
+    return prefill_latency, decode_throughput, decode_latency
+
+
+def run_bench_offline_throughput(model, other_args):
+    command = [
+        "python3",
+        "-m",
+        "sglang.bench_offline_throughput",
+        "--num-prompts",
+        "1",
+        "--dataset-name",
+        "random",
+        "--random-input-len",
+        "256",
+        "--random-output-len",
+        "256",
+        "--model-path",
+        model,
+        *[str(x) for x in other_args],
+    ]
+
+    print(f"{command=}")
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    try:
+        stdout, stderr = process.communicate()
+        output = stdout.decode()
+        error = stderr.decode()
+        print(f"Output: {output}", flush=True)
+        print(f"Error: {error}", flush=True)
+
+        output_throughput = -1
+        for line in output.split("\n"):
+            if "Last generation throughput (tok/s):" in line:
+                output_throughput = float(line.split(":")[-1])
+    finally:
+        kill_process_tree(process.pid)
+
+    return output_throughput
+
+
+def run_bench_one_batch_server(
+    model,
+    base_url,
+    server_args,
+    bench_args,
+    other_server_args,
+    simulate_spec_acc_lens=None,
+):
+    from sglang.bench_one_batch_server import run_benchmark
+
+    if simulate_spec_acc_lens is not None:
+        env = {**os.environ, "SIMULATE_ACC_LEN": str(simulate_spec_acc_lens)}
+    else:
+        env = None
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+        env=env,
+    )
+    try:
+        run_benchmark(server_args=server_args, bench_args=bench_args)
+    finally:
+        kill_process_tree(process.pid)
+
+
+def lcs(X, Y):
+    m = len(X)
+    n = len(Y)
+    L = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0 or j == 0:
+                L[i][j] = 0
+            elif X[i - 1] == Y[j - 1]:
+                L[i][j] = L[i - 1][j - 1] + 1
+            else:
+                L[i][j] = max(L[i - 1][j], L[i][j - 1])
+
+    return L[m][n]
+
+
+def calculate_rouge_l(output_strs_list1, output_strs_list2):
+    """calculate the ROUGE-L score"""
+    rouge_l_scores = []
+
+    for s1, s2 in zip(output_strs_list1, output_strs_list2):
+        lcs_len = lcs(s1, s2)
+        precision = lcs_len / len(s1) if len(s1) > 0 else 0
+        recall = lcs_len / len(s2) if len(s2) > 0 else 0
+        if precision + recall > 0:
+            fmeasure = (2 * precision * recall) / (precision + recall)
+        else:
+            fmeasure = 0.0
+        rouge_l_scores.append(fmeasure)
+
+    return rouge_l_scores
+
+
+STDERR_FILENAME = "/tmp/stderr.txt"
+STDOUT_FILENAME = "/tmp/stdout.txt"
+
+
+def read_output(output_lines: List[str], filename: str = STDERR_FILENAME):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(filename):
+        time.sleep(0.01)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(filename):
+            break
+        try:
+            lines = open(filename).readlines()
+        except FileNotFoundError:
+            print(f"{pt=}, {os.path.exists(filename)=}")
+            raise
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_and_check_memory_leak(
+    workload_func,
+    disable_radix_cache,
+    enable_mixed_chunk,
+    disable_overlap,
+    chunked_prefill_size,
+    assert_has_abort,
+):
+    other_args = [
+        "--chunked-prefill-size",
+        str(chunked_prefill_size),
+        "--log-level",
+        "debug",
+    ]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if disable_overlap:
+        other_args += ["--disable-overlap-schedule"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the workload
+    workload_func(base_url, model)
+
+    # Clean up everything
+    kill_process_tree(process.pid)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    kill_process_tree(process.pid)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    has_abort = False
+    for line in output_lines:
+        if "Uvicorn running" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+        if "Abort" in line:
+            has_abort = True
+
+    assert has_new_server
+    assert not has_leak
+    if assert_has_abort:
+        assert has_abort
+
+
+def run_command_and_capture_output(command, env: Optional[dict] = None):
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = subprocess.Popen(
+        command, stdout=stdout, stderr=stdout, env=env, text=True
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines, STDOUT_FILENAME))
+    t.start()
+
+    # Join the process
+    process.wait()
+
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    kill_process_tree(process.pid)
+    t.join()
+
+    return output_lines
+
+
+def run_mmlu_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    disable_overlap=False,
+    chunked_prefill_size=32,
+):
+    def workload_func(base_url, model):
+        # Run the eval
+        args = SimpleNamespace(
+            base_url=base_url,
+            model=model,
+            eval_name="mmlu",
+            num_examples=128,
+            num_threads=128,
+        )
+
+        try:
+            metrics = run_eval(args)
+            assert metrics["score"] >= 0.65, f"{metrics=}"
+        finally:
+            pass
+
+    run_and_check_memory_leak(
+        workload_func,
+        disable_radix_cache,
+        enable_mixed_chunk,
+        disable_overlap,
+        chunked_prefill_size,
+        assert_has_abort=False,
+    )
+
+
+def run_mulit_request_test(
+    disable_radix_cache=False,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    def workload_func(base_url, model):
+        def run_one(_):
+            prompt = """
+            System: You are a helpful assistant.
+            User: What is the capital of France?
+            Assistant: The capital of France is
+            """
+
+            response = requests.post(
+                f"{base_url}/generate",
+                json={
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 8,
+                    },
+                },
+            )
+            ret = response.json()
+
+        with ThreadPoolExecutor(2) as executor:
+            list(executor.map(run_one, list(range(4))))
+
+    run_and_check_memory_leak(
+        workload_func,
+        disable_radix_cache,
+        enable_mixed_chunk,
+        enable_overlap,
+        chunked_prefill_size,
+        assert_has_abort=False,
+    )
+
+
+def write_github_step_summary(content):
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):
+        logging.warning("GITHUB_STEP_SUMMARY environment variable not set")
+        return
+
+    with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
+        f.write(content)
+
+
+def run_logprob_check(self: unittest.TestCase, arg: Tuple):
+    (
+        input_len,
+        output_len,
+        temperature,
+        logprob_start_len,
+        return_logprob,
+        top_logprobs_num,
+    ) = arg
+    input_ids = list(range(input_len))
+
+    response = requests.post(
+        self.base_url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": temperature,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+            "return_logprob": return_logprob,
+            "logprob_start_len": logprob_start_len,
+            "top_logprobs_num": top_logprobs_num,
+        },
+    )
+    response_json = response.json()
+
+    res = response_json
+    self.assertEqual(res["meta_info"]["prompt_tokens"], input_len)
+    self.assertEqual(res["meta_info"]["completion_tokens"], output_len)
+
+    # Test the number of tokens are correct
+    if return_logprob:
+        self.assertEqual(
+            len(res["meta_info"]["input_token_logprobs"]) + logprob_start_len,
+            res["meta_info"]["prompt_tokens"],
+        )
+        self.assertEqual(len(res["meta_info"]["output_token_logprobs"]), output_len)
+
+        if top_logprobs_num:
+            self.assertEqual(
+                len(res["meta_info"]["input_top_logprobs"]) + logprob_start_len,
+                res["meta_info"]["prompt_tokens"],
+            )
+            self.assertEqual(len(res["meta_info"]["output_top_logprobs"]), output_len)
+
+            for i in range(output_len):
+                self.assertEqual(
+                    len(res["meta_info"]["output_top_logprobs"][i]),
+                    top_logprobs_num,
+                )
+
+                # Test the top-1 tokens are the same as output tokens if temperature == 0
+                if temperature == 0:
+                    rank = 0
+                    while rank < len(res["meta_info"]["output_top_logprobs"][i]):
+                        try:
+                            self.assertListEqual(
+                                res["meta_info"]["output_token_logprobs"][i],
+                                res["meta_info"]["output_top_logprobs"][i][rank],
+                            )
+                            break
+                        except AssertionError:
+                            # There's a tie. Allow the second item in this case.
+                            if (
+                                res["meta_info"]["output_top_logprobs"][i][rank][0]
+                                == res["meta_info"]["output_top_logprobs"][i][rank + 1][
+                                    0
+                                ]
+                            ):
+                                rank += 1
+                            else:
+                                raise
+
+
+def send_generate_requests(base_url: str, num_requests: int) -> List[str]:
+    """Sends generate request serially and returns status codes. Max concurrency is 1."""
+
+    def generate():
+        prompt = """
+        System: You are a helpful assistant.
+        User: What is the capital of France?
+        Assistant: The capital of France is
+        """
+        response = requests.post(
+            f"{base_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 50,
+                },
+            },
+        )
+        return response.status_code
+
+    return [generate() for _ in range(num_requests)]
+
+
+async def send_concurrent_generate_requests(
+    base_url: str, num_requests: int
+) -> List[str]:
+    """Sends generate request concurrently and returns status codes. Max concurrency is num_requests."""
+
+    async def async_generate():
+        async with aiohttp.ClientSession() as session:
+            prompt = """
+            System: You are a helpful assistant.
+            User: What is the capital of France?
+            Assistant: The capital of France is
+            """
+            async with session.post(
+                f"{base_url}/generate",
+                json={
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 50,
+                    },
+                },
+            ) as response:
+                return response.status
+
+    tasks = [asyncio.create_task(async_generate()) for _ in range(num_requests)]
+    return await asyncio.gather(*tasks)
+
+
+class CustomTestCase(unittest.TestCase):
+    def _callTestMethod(self, method):
+        max_retry = int(
+            os.environ.get("SGLANG_TEST_MAX_RETRY", "1" if is_in_ci() else "0")
+        )
+        retry(
+            lambda: super(CustomTestCase, self)._callTestMethod(method),
+            max_retry=max_retry,
+        )
+
+
+def dump_bench_raw_result(
+    path: str,
+    states,
+    preds,
+    labels,
+):
+    if not path:
+        return
+
+    rows = []
+    for i in range(len(states)):
+        state = states[i]
+        output = state["answer"]
+        prompt = _ensure_remove_suffix(state.text(), output)
+        rows.append(
+            dict(
+                prompt_id=i,
+                prompt=prompt,
+                output=output,
+                correct=bool(preds[i] == labels[i]),
+            )
+        )
+
+    print(f"BenchRawResultDumper save results to {path}")
+    Path(path).write_text("\n".join(json.dumps(row) for row in rows))
+
+
+def _ensure_remove_suffix(text: str, suffix: str):
+    assert text.endswith(suffix)
+    return text.removesuffix(suffix)
diff --git a/python/sglang/utils.py b/python/sglang/utils.py
new file mode 100644
index 000000000..f6bf20c42
--- /dev/null
+++ b/python/sglang/utils.py
@@ -0,0 +1,535 @@
+"""Common utilities"""
+
+import importlib
+import json
+import logging
+import os
+import random
+import socket
+import subprocess
+import sys
+import time
+import traceback
+import urllib.request
+import weakref
+from concurrent.futures import ThreadPoolExecutor
+from functools import wraps
+from io import BytesIO
+from json import dumps
+from typing import Any, Callable, List, Optional, Tuple, Type, Union
+
+import numpy as np
+import pybase64
+import requests
+from IPython.display import HTML, display
+from pydantic import BaseModel
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+def execute_once(func):
+    has_run = None
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        nonlocal has_run
+        if not has_run:
+            func(*args, **kwargs)
+            has_run = True
+
+    return wrapper
+
+
+@execute_once
+def info_once(message: str):
+    logger.info(message)
+
+
+def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str:
+    """Convert a JSON schema to a string.
+    Parameters
+    ----------
+    json_schema
+        The JSON schema.
+    Returns
+    -------
+    str
+        The JSON schema converted to a string.
+    Raises
+    ------
+    ValueError
+        If the schema is not a dictionary, a string or a Pydantic class.
+    """
+    if isinstance(json_schema, dict):
+        schema_str = json.dumps(json_schema)
+    elif isinstance(json_schema, str):
+        schema_str = json_schema
+    elif issubclass(json_schema, BaseModel):
+        schema_str = json.dumps(json_schema.model_json_schema())
+    else:
+        raise ValueError(
+            f"Cannot parse schema {json_schema}. The schema must be either "
+            + "a Pydantic class, a dictionary or a string that contains the JSON "
+            + "schema specification"
+        )
+    return schema_str
+
+
+def get_exception_traceback():
+    etype, value, tb = sys.exc_info()
+    err_str = "".join(traceback.format_exception(etype, value, tb))
+    return err_str
+
+
+def is_same_type(values: list):
+    """Return whether the elements in values are of the same type."""
+    if len(values) <= 1:
+        return True
+    else:
+        t = type(values[0])
+        return all(isinstance(v, t) for v in values[1:])
+
+
+def read_jsonl(filename: str):
+    """Read a JSONL file."""
+    with open(filename) as fin:
+        for line in fin:
+            if line.startswith("#"):
+                continue
+            yield json.loads(line)
+
+
+def dump_state_text(filename: str, states: list, mode: str = "w"):
+    """Dump program state in a text file."""
+    from sglang.lang.interpreter import ProgramState
+
+    with open(filename, mode) as fout:
+        for i, s in enumerate(states):
+            if isinstance(s, str):
+                pass
+            elif isinstance(s, ProgramState):
+                s = s.text()
+            else:
+                s = str(s)
+
+            fout.write(
+                "=" * 40 + f" {i} " + "=" * 40 + "\n" + s + "\n" + "=" * 80 + "\n\n"
+            )
+
+
+class HttpResponse:
+    def __init__(self, resp):
+        self.resp = resp
+
+    def json(self):
+        return json.loads(self.resp.read())
+
+    @property
+    def status_code(self):
+        return self.resp.status
+
+
+def http_request(
+    url,
+    json=None,
+    stream=False,
+    api_key=None,
+    verify=None,
+    method: Optional[str] = None,
+):
+    """A faster version of requests.post with low-level urllib API."""
+    headers = {"Content-Type": "application/json; charset=utf-8"}
+
+    # add the Authorization header if an api key is provided
+    if api_key is not None:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    if stream:
+        return requests.post(url, json=json, stream=True, headers=headers)
+    else:
+        req = urllib.request.Request(url, headers=headers, method=method)
+        if json is None:
+            data = None
+        else:
+            data = bytes(dumps(json), encoding="utf-8")
+
+        try:
+            resp = urllib.request.urlopen(req, data=data, cafile=verify)
+            return HttpResponse(resp)
+        except urllib.error.HTTPError as e:
+            return HttpResponse(e)
+
+
+def encode_image_base64(image_path: Union[str, bytes]):
+    """Encode an image in base64."""
+    if isinstance(image_path, str):
+        with open(image_path, "rb") as image_file:
+            data = image_file.read()
+            return pybase64.b64encode(data).decode("utf-8")
+    elif isinstance(image_path, bytes):
+        return pybase64.b64encode(image_path).decode("utf-8")
+    else:
+        # image_path is PIL.WebPImagePlugin.WebPImageFile
+        image = image_path
+        buffered = BytesIO()
+        image.save(buffered, format="PNG")
+        return pybase64.b64encode(buffered.getvalue()).decode("utf-8")
+
+
+def encode_frame(frame):
+    import cv2  # pip install opencv-python-headless
+    from PIL import Image
+
+    # Convert the frame to RGB (OpenCV uses BGR by default)
+    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+    # Convert the frame to PIL Image to easily convert to bytes
+    im_pil = Image.fromarray(frame)
+
+    # Convert to bytes
+    buffered = BytesIO()
+
+    # frame_format = str(os.getenv('FRAME_FORMAT', "JPEG"))
+
+    im_pil.save(buffered, format="PNG")
+
+    frame_bytes = buffered.getvalue()
+
+    # Return the bytes of the frame
+    return frame_bytes
+
+
+def encode_video_base64(video_path: str, num_frames: int = 16):
+    import cv2  # pip install opencv-python-headless
+
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise IOError(f"Could not open video file:{video_path}")
+
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    print(f"target_frames: {num_frames}")
+
+    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+
+    frames = []
+    for _ in range(total_frames):
+        ret, frame = cap.read()
+        if ret:
+            frames.append(frame)
+        else:
+            # Handle the case where the frame could not be read
+            # print(f"Warning: Could not read frame at index {i}.")
+            pass
+
+    cap.release()
+
+    # Safely select frames based on frame_indices, avoiding IndexError
+    frames = [frames[i] for i in frame_indices if i < len(frames)]
+
+    # If there are not enough frames, duplicate the last frame until we reach the target
+    while len(frames) < num_frames:
+        frames.append(frames[-1])
+
+    # Use ThreadPoolExecutor to process and encode frames in parallel
+    with ThreadPoolExecutor() as executor:
+        encoded_frames = list(executor.map(encode_frame, frames))
+
+    # encoded_frames = list(map(encode_frame, frames))
+
+    # Concatenate all frames bytes
+    video_bytes = b"".join(encoded_frames)
+
+    # Encode the concatenated bytes to base64
+    video_base64 = "video:" + pybase64.b64encode(video_bytes).decode("utf-8")
+
+    return video_base64
+
+
+def _is_chinese_char(cp: int):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and handled
+    # like the all of the other languages.
+    if (
+        (cp >= 0x4E00 and cp <= 0x9FFF)
+        or (cp >= 0x3400 and cp <= 0x4DBF)  #
+        or (cp >= 0x20000 and cp <= 0x2A6DF)  #
+        or (cp >= 0x2A700 and cp <= 0x2B73F)  #
+        or (cp >= 0x2B740 and cp <= 0x2B81F)  #
+        or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
+        or (cp >= 0xF900 and cp <= 0xFAFF)
+        or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
+    ):  #
+        return True
+
+    return False
+
+
+def find_printable_text(text: str):
+    """Returns the longest printable substring of text that contains only entire words."""
+    # Borrowed from https://github.com/huggingface/transformers/blob/061580c82c2db1de9139528243e105953793f7a2/src/transformers/generation/streamers.py#L99
+
+    # After the symbol for a new line, we flush the cache.
+    if text.endswith("\n"):
+        return text
+    # If the last token is a CJK character, we print the characters.
+    elif len(text) > 0 and _is_chinese_char(ord(text[-1])):
+        return text
+    # Otherwise if the penultimate token is a CJK character, we print the characters except for the last one.
+    elif len(text) > 1 and _is_chinese_char(ord(text[-2])):
+        return text[:-1]
+    # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
+    # which may change with the subsequent token -- there are probably smarter ways to do this!)
+    else:
+        return text[: text.rfind(" ") + 1]
+
+
+class LazyImport:
+    """Lazy import to make `import sglang` run faster."""
+
+    def __init__(self, module_name: str, class_name: str):
+        self.module_name = module_name
+        self.class_name = class_name
+        self._module = None
+
+    def _load(self):
+        if self._module is None:
+            module = importlib.import_module(self.module_name)
+            self._module = getattr(module, self.class_name)
+        return self._module
+
+    def __getattr__(self, name: str):
+        module = self._load()
+        return getattr(module, name)
+
+    def __call__(self, *args, **kwargs):
+        module = self._load()
+        return module(*args, **kwargs)
+
+
+def download_and_cache_file(url: str, filename: Optional[str] = None):
+    """Read and cache a file from a url."""
+    if filename is None:
+        filename = os.path.join("/tmp", url.split("/")[-1])
+
+    # Check if the cache file already exists
+    if os.path.exists(filename):
+        return filename
+
+    print(f"Downloading from {url} to {filename}")
+
+    # Stream the response to show the progress bar
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Check for request errors
+
+    # Total size of the file in bytes
+    total_size = int(response.headers.get("content-length", 0))
+    chunk_size = 1024  # Download in chunks of 1KB
+
+    # Use tqdm to display the progress bar
+    with open(filename, "wb") as f, tqdm(
+        desc=filename,
+        total=total_size,
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+    ) as bar:
+        for chunk in response.iter_content(chunk_size=chunk_size):
+            f.write(chunk)
+            bar.update(len(chunk))
+
+    return filename
+
+
+def is_in_ci():
+    from sglang.test.test_utils import is_in_ci
+
+    return is_in_ci()
+
+
+def print_highlight(html_content: str):
+    if is_in_ci():
+        html_content = str(html_content).replace("\n", "<br>")
+        display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
+    else:
+        print(html_content)
+
+
+process_socket_map = weakref.WeakKeyDictionary()
+
+
+def reserve_port(host, start=30000, end=40000):
+    """
+    Reserve an available port by trying to bind a socket.
+    Returns a tuple (port, lock_socket) where `lock_socket` is kept open to hold the lock.
+    """
+    candidates = list(range(start, end))
+    random.shuffle(candidates)
+
+    for port in candidates:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        try:
+            # Attempt to bind to the port on localhost
+            sock.bind((host, port))
+            return port, sock
+        except socket.error:
+            sock.close()  # Failed to bind, try next port
+            continue
+    raise RuntimeError("No free port available.")
+
+
+def release_port(lock_socket):
+    """
+    Release the reserved port by closing the lock socket.
+    """
+    try:
+        lock_socket.close()
+    except Exception as e:
+        print(f"Error closing socket: {e}")
+
+
+def execute_shell_command(command: str) -> subprocess.Popen:
+    """
+    Execute a shell command and return its process handle.
+    """
+    command = command.replace("\\\n", " ").replace("\\", " ")
+    parts = command.split()
+    return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
+
+
+def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
+    """
+    Launch the server using the given command.
+    If no port is specified, a free port is reserved.
+    """
+    if port is None:
+        port, lock_socket = reserve_port(host)
+    else:
+        lock_socket = None
+
+    full_command = f"{command} --port {port}"
+    process = execute_shell_command(full_command)
+
+    if lock_socket is not None:
+        process_socket_map[process] = lock_socket
+
+    return process, port
+
+
+def terminate_process(process):
+    """
+    Terminate the process and automatically release the reserved port.
+    """
+    from sglang.srt.utils import kill_process_tree
+
+    kill_process_tree(process.pid)
+
+    lock_socket = process_socket_map.pop(process, None)
+    if lock_socket is not None:
+        release_port(lock_socket)
+
+
+def wait_for_server(base_url: str, timeout: int = None) -> None:
+    """Wait for the server to be ready by polling the /v1/models endpoint.
+
+    Args:
+        base_url: The base URL of the server
+        timeout: Maximum time to wait in seconds. None means wait forever.
+    """
+    start_time = time.perf_counter()
+    while True:
+        try:
+            response = requests.get(
+                f"{base_url}/v1/models",
+                headers={"Authorization": "Bearer None"},
+            )
+            if response.status_code == 200:
+                time.sleep(5)
+                print_highlight(
+                    """\n
+                    NOTE: Typically, the server runs in a separate terminal.
+                    In this notebook, we run the server and notebook code together, so their outputs are combined.
+                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+                    To reduce the log length, we set the log level to warning for the server, the default log level is info.
+                    We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
+                    """
+                )
+                break
+
+            if timeout and time.perf_counter() - start_time > timeout:
+                raise TimeoutError("Server did not become ready within timeout period")
+        except requests.exceptions.RequestException:
+            time.sleep(1)
+
+
+class TypeBasedDispatcher:
+    def __init__(self, mapping: List[Tuple[Type, Callable]]):
+        self._mapping = mapping
+
+    def __iadd__(self, other: "TypeBasedDispatcher"):
+        self._mapping.extend(other._mapping)
+        return self
+
+    def __call__(self, obj: Any):
+        for ty, fn in self._mapping:
+            if isinstance(obj, ty):
+                return fn(obj)
+        raise ValueError(f"Invalid object: {obj}")
+
+
+def trim_overlap(existing_text, new_chunk):
+    """
+    Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk'
+    and removes that overlap from the start of 'new_chunk'.
+    """
+    max_overlap = 0
+    max_possible = min(len(existing_text), len(new_chunk))
+    for i in range(max_possible, 0, -1):
+        if existing_text.endswith(new_chunk[:i]):
+            max_overlap = i
+            break
+    return new_chunk[max_overlap:]
+
+
+def stream_and_merge(llm, prompt, sampling_params):
+    """
+    1) Streams the text,
+    2) Removes chunk overlaps,
+    3) Returns the merged text.
+    """
+    final_text = ""
+    for chunk in llm.generate(prompt, sampling_params, stream=True):
+        chunk_text = chunk["text"]
+        cleaned_chunk = trim_overlap(final_text, chunk_text)
+        final_text += cleaned_chunk
+    return final_text
+
+
+async def async_stream_and_merge(llm, prompt, sampling_params):
+    """
+    Streams tokens asynchronously, removes chunk overlaps,
+    and yields the cleaned chunk in real time for printing.
+    """
+    final_text = ""
+    generator = await llm.async_generate(prompt, sampling_params, stream=True)
+    async for chunk in generator:
+        chunk_text = chunk["text"]
+        cleaned_chunk = trim_overlap(final_text, chunk_text)
+        final_text += cleaned_chunk
+        yield cleaned_chunk  # yield the non-overlapping portion
+
+
+def resolve_obj_by_qualname(qualname: str) -> Any:
+    """
+    Resolve an object by its fully qualified name.
+    """
+    module_name, obj_name = qualname.rsplit(".", 1)
+    module = importlib.import_module(module_name)
+    return getattr(module, obj_name)
diff --git a/python/sglang/version.py b/python/sglang/version.py
new file mode 100644
index 000000000..722515271
--- /dev/null
+++ b/python/sglang/version.py
@@ -0,0 +1 @@
+__version__ = "0.5.2"
diff --git a/scripts/ci/amd_ci_exec.sh b/scripts/ci/amd_ci_exec.sh
new file mode 100755
index 000000000..3bd940eb1
--- /dev/null
+++ b/scripts/ci/amd_ci_exec.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+set -euo pipefail
+
+# Detect GPU family from hostname (e.g., linux-mi35x-gpu-1-xxxxx-runner-zzzzz)
+HOSTNAME_VALUE=$(hostname)
+GPU_FAMILY=""
+
+# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
+if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
+  GPU_FAMILY="${BASH_REMATCH[1]}"
+  echo "Detected GPU family from hostname: ${GPU_FAMILY}"
+else
+  echo "Warning: could not parse GPU family from '${HOSTNAME_VALUE}'"
+fi
+
+WORKDIR="/sglang-checkout/test/srt"
+declare -A ENV_MAP=(
+  [SGLANG_AMD_CI]=1
+  [SGLANG_IS_IN_CI]=1
+  [SGLANG_USE_AITER]=1
+)
+
+# Conditionally add GPU_ARCHS only for mi35x
+if [[ "${GPU_FAMILY}" == "mi35x" ]]; then
+  ENV_MAP[GPU_ARCHS]="gfx950"
+fi
+
+# Parse -w/--workdir and -e ENV=VAL
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -w|--workdir)
+      WORKDIR="$2"
+      shift 2
+      ;;
+    -e)
+      IFS="=" read -r key val <<< "$2"
+      ENV_MAP["$key"]="$val"
+      shift 2
+      ;;
+    --)
+      shift
+      break
+      ;;
+    *)
+      break
+      ;;
+  esac
+done
+
+# Build final ENV_ARGS
+ENV_ARGS=()
+for key in "${!ENV_MAP[@]}"; do
+  ENV_ARGS+=("-e" "$key=${ENV_MAP[$key]}")
+done
+
+# Run docker exec
+docker exec \
+  -w "$WORKDIR" \
+  "${ENV_ARGS[@]}" \
+  ci_sglang "$@"
diff --git a/scripts/ci/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh
new file mode 100755
index 000000000..518f0dde9
--- /dev/null
+++ b/scripts/ci/amd_ci_install_dependency.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -euo pipefail
+HOSTNAME_VALUE=$(hostname)
+GPU_ARCH="mi30x"   # default
+
+# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
+if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
+  GPU_ARCH="${BASH_REMATCH[1]}"
+  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
+else
+  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
+fi
+
+# Install the required dependencies in CI.
+docker exec ci_sglang pip install --upgrade pip
+docker exec ci_sglang pip uninstall sgl-kernel -y || true
+docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
+
+case "${GPU_ARCH}" in
+  mi35x)
+    echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
+    docker exec ci_sglang pip install -e "python[dev_hip]" --no-deps # TODO: only for mi35x
+    # For lmms_evals evaluating MMMU
+    docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+    docker exec -w /lmms-eval ci_sglang pip install -e . --no-deps # TODO: only for mi35x
+    ;;
+  mi30x|mi300|mi325)
+    echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
+    docker exec ci_sglang pip install -e "python[dev_hip]"
+    # For lmms_evals evaluating MMMU
+    docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+    docker exec -w /lmms-eval ci_sglang pip install -e .
+    ;;
+  *)
+    echo "Runner architecture '${GPU_ARCH}' unrecognised;" >&2
+    ;;
+esac
+
+docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
+docker exec -w /human-eval ci_sglang pip install -e .
+
+docker exec -w / ci_sglang mkdir -p /dummy-grok
+mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
+docker cp ./dummy-grok ci_sglang:/
+
+docker exec ci_sglang pip install huggingface_hub[hf_xet]
+docker exec ci_sglang pip install pytest
diff --git a/scripts/ci/amd_ci_start_container.sh b/scripts/ci/amd_ci_start_container.sh
new file mode 100755
index 000000000..a1f281c8d
--- /dev/null
+++ b/scripts/ci/amd_ci_start_container.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+set -euo pipefail
+
+# Get version from SGLang version.py file
+SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
+SGLANG_VERSION="v0.5.0rc0"   # Default version, will be overridden if version.py is found
+
+if [ -f "$SGLANG_VERSION_FILE" ]; then
+  VERSION_FROM_FILE=$(python3 -c '
+import re, sys
+with open(sys.argv[1], "r") as f:
+    content = f.read()
+    match = re.search(r"__version__\s*=\s*[\"'"'"'](.*?)[\"'"'"']", content)
+    if match:
+        print("v" + match.group(1))
+' "$SGLANG_VERSION_FILE" 2>/dev/null || echo "")
+
+  if [ -n "$VERSION_FROM_FILE" ]; then
+      SGLANG_VERSION="$VERSION_FROM_FILE"
+      echo "Using SGLang version from version.py: $SGLANG_VERSION"
+  else
+      echo "Warning: Could not parse version from $SGLANG_VERSION_FILE, using default: $SGLANG_VERSION" >&2
+  fi
+else
+  echo "Warning: version.py not found, using default version: $SGLANG_VERSION" >&2
+fi
+
+
+# Default base tags (can be overridden by command line arguments)
+DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm630-mi30x"
+DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"
+
+# Parse command line arguments
+MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
+MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;;
+    --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;;
+    -h|--help)
+      echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]"
+      exit 0
+      ;;
+    *) echo "Unknown option $1"; exit 1;;
+  esac
+done
+
+
+
+# Detect GPU architecture from the Kubernetes runner hostname
+HOSTNAME_VALUE=$(hostname)
+GPU_ARCH="mi30x"   # default
+
+# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
+if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
+  GPU_ARCH="${BASH_REMATCH[1]}"
+  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
+else
+  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
+fi
+
+# Normalise / collapse architectures we don’t yet build specifically for
+case "${GPU_ARCH}" in
+  mi35x)
+    echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
+    ;;
+  mi30x|mi300|mi325)
+    echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
+    GPU_ARCH="mi30x"
+    ;;
+  *)
+    echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
+    GPU_ARCH="mi30x"
+    ;;
+esac
+
+
+# Set up DEVICE_FLAG based on Kubernetes pod info
+if [[ -f /etc/podinfo/gha-render-devices ]]; then
+  DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+else
+  DEVICE_FLAG="--device /dev/dri"
+fi
+
+
+# Find the latest image
+find_latest_image() {
+  local gpu_arch=$1
+  local base_tag days_back image_tag
+
+  case "${gpu_arch}" in
+      mi30x) base_tag="${MI30X_BASE_TAG}" ;;
+      mi35x) base_tag="${MI35X_BASE_TAG}" ;;
+      *)     echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
+  esac
+
+  for days_back in {0..6}; do
+    image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
+    echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
+    if docker manifest inspect "rocm/sgl-dev:${image_tag}" >/dev/null 2>&1; then
+      echo "Found available image: rocm/sgl-dev:${image_tag}" >&2
+      echo "rocm/sgl-dev:${image_tag}"
+      return 0
+    fi
+  done
+
+  echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
+  echo "Using hard-coded fallback…" >&2
+  if [[ "${gpu_arch}" == "mi35x" ]]; then
+    echo "rocm/sgl-dev:v0.5.0rc0-rocm700-mi35x-20250812"
+  else
+    echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812"
+  fi
+}
+
+# Pull and run the latest image
+IMAGE=$(find_latest_image "${GPU_ARCH}")
+echo "Pulling Docker image: ${IMAGE}"
+docker pull "${IMAGE}"
+
+echo "Launching container: ci_sglang"
+docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
+  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
+  --ipc=host --group-add video \
+  --shm-size 32g \
+  --cap-add=SYS_PTRACE \
+  -e HF_TOKEN="${HF_TOKEN:-}" \
+  --security-opt seccomp=unconfined \
+  -w /sglang-checkout \
+  --name ci_sglang \
+  "${IMAGE}"
diff --git a/scripts/ci/ci_install_deepep.sh b/scripts/ci/ci_install_deepep.sh
new file mode 100755
index 000000000..4da81eec1
--- /dev/null
+++ b/scripts/ci/ci_install_deepep.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Install the dependency in CI.
+set -euxo pipefail
+
+bash scripts/ci/ci_install_dependency.sh
+
+export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
+export NVSHMEM_DIR=/opt/nvshmem/install
+export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
+export PATH="${NVSHMEM_DIR}/bin:$PATH"
+export CUDA_HOME=/usr/local/cuda
+
+if python3 -c "import deep_ep" >/dev/null 2>&1; then
+    echo "deep_ep is already installed or importable. Skipping installation."
+    exit 0
+fi
+
+# Install system dependencies
+apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake
+
+# Install GDRCopy
+rm -rf /opt/gdrcopy && mkdir -p /opt/gdrcopy
+rm -rf /opt/nvshmem && mkdir -p /opt/nvshmem
+cd /opt/gdrcopy
+git clone https://github.com/NVIDIA/gdrcopy.git .
+git checkout v2.4.4
+apt update
+apt install -y nvidia-dkms-535
+apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms
+apt install -y check libsubunit0 libsubunit-dev python3-venv
+cd packages
+CUDA=/usr/local/cuda ./build-deb-packages.sh
+dpkg -i gdrdrv-dkms_*.deb
+dpkg -i libgdrapi_*.deb
+dpkg -i gdrcopy-tests_*.deb
+dpkg -i gdrcopy_*.deb
+
+if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then
+    ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
+fi
+apt-get update && apt-get install -y libfabric-dev
+
+# Install NVSHMEM
+cd /opt/nvshmem
+wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+mv nvshmem_src nvshmem && cd nvshmem
+NVSHMEM_SHMEM_SUPPORT=0 \
+NVSHMEM_UCX_SUPPORT=0 \
+NVSHMEM_USE_NCCL=0 \
+NVSHMEM_MPI_SUPPORT=0 \
+NVSHMEM_IBGDA_SUPPORT=1 \
+NVSHMEM_PMIX_SUPPORT=0 \
+NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+NVSHMEM_USE_GDRCOPY=1 \
+cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/opt/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90
+cd build
+make -j$(nproc) install
+
+# Install DeepEP
+rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep && cd /root/.cache/deepep && git checkout b92d0d4860ce6866cd6d31bfbae937f9a7a3772b
+cd /root/.cache/deepep && python3 setup.py install
+
+# Verify configuration
+echo "=== Verify GDRCOPY ==="
+gdrcopy_copybw
+echo "=== Verify NVSHMEM ==="
+nvshmem-info -a
diff --git a/scripts/ci/ci_install_dependency.sh b/scripts/ci/ci_install_dependency.sh
new file mode 100755
index 000000000..e007121a3
--- /dev/null
+++ b/scripts/ci/ci_install_dependency.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# Install the dependency in CI.
+set -euxo pipefail
+
+IS_BLACKWELL=${IS_BLACKWELL:-0}
+
+if [ "$IS_BLACKWELL" = "1" ]; then
+    CU_VERSION="cu129"
+else
+    CU_VERSION="cu126"
+fi
+
+# Clear torch compilation cache
+python3 -c 'import os, shutil, tempfile, getpass; cache_dir = os.environ.get("TORCHINDUCTOR_CACHE_DIR") or os.path.join(tempfile.gettempdir(), "torchinductor_" + getpass.getuser()); shutil.rmtree(cache_dir, ignore_errors=True)'
+
+# Kill existing processes
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+bash "${SCRIPT_DIR}/../killall_sglang.sh"
+
+# Install apt packages
+apt install -y git libnuma-dev
+
+# Install uv
+if [ "$IS_BLACKWELL" = "1" ]; then
+    # The blackwell CI runner has some issues with pip and uv,
+    # so we can only use pip with `--break-system-packages`
+    PIP_CMD="pip"
+    PIP_INSTALL_SUFFIX="--break-system-packages"
+
+    # Clean up existing installations
+    $PIP_CMD uninstall -y flashinfer_python sgl-kernel sglang vllm $PIP_INSTALL_SUFFIX || true
+else
+    # In normal cases, we use uv, which is much faster than pip.
+    pip install --upgrade pip
+    pip install uv
+    export UV_SYSTEM_PYTHON=true
+
+    PIP_CMD="uv pip"
+    PIP_INSTALL_SUFFIX="--index-strategy unsafe-best-match"
+
+    # Clean up existing installations
+    $PIP_CMD uninstall flashinfer_python sgl-kernel sglang vllm || true
+fi
+
+# Install the main package
+$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
+
+# Install router for pd-disagg test
+SGLANG_ROUTER_BUILD_NO_RUST=1 $PIP_CMD install -e "sgl-router" $PIP_INSTALL_SUFFIX
+
+
+SGL_KERNEL_VERSION=0.3.9.post2
+if [ "$IS_BLACKWELL" = "1" ]; then
+    # TODO auto determine sgl-kernel version
+    $PIP_CMD install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu128-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
+else
+    $PIP_CMD install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
+fi
+
+# Show current packages
+$PIP_CMD list
+
+# Install additional dependencies
+$PIP_CMD install mooncake-transfer-engine==0.3.5 nvidia-cuda-nvrtc-cu12 py-spy huggingface_hub[hf_xet] $PIP_INSTALL_SUFFIX
+
+if [ "$IS_BLACKWELL" != "1" ]; then
+    # For lmms_evals evaluating MMMU
+    git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+    $PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX
+
+    # Install xformers
+    $PIP_CMD install xformers --index-url https://download.pytorch.org/whl/${CU_VERSION} --no-deps $PIP_INSTALL_SUFFIX
+fi
+
+# Install FlashMLA for attention backend tests
+# $PIP_CMD install git+https://github.com/deepseek-ai/FlashMLA.git $PIP_INSTALL_SUFFIX
+
+# Show current packages
+$PIP_CMD list
+
+echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
diff --git a/scripts/ci/ci_install_rust.sh b/scripts/ci/ci_install_rust.sh
new file mode 100755
index 000000000..ac042fc9a
--- /dev/null
+++ b/scripts/ci/ci_install_rust.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -euxo pipefail
+
+# Check if sudo is available
+if command -v sudo >/dev/null 2>&1; then
+    sudo apt-get update
+    sudo apt-get install -y libssl-dev pkg-config protobuf-compiler
+else
+    apt-get update
+    apt-get install -y libssl-dev pkg-config protobuf-compiler
+fi
+
+# Install rustup (Rust installer and version manager)
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+
+# Follow the installation prompts, then reload your shell
+. "$HOME/.cargo/env"
+source $HOME/.cargo/env
+
+# Verify installation
+rustc --version
+cargo --version
+protoc --version
diff --git a/scripts/ci/ci_start_disaggregation_servers.sh b/scripts/ci/ci_start_disaggregation_servers.sh
new file mode 100755
index 000000000..56490bb06
--- /dev/null
+++ b/scripts/ci/ci_start_disaggregation_servers.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+MODEL_PATH="/raid/models/meta-llama/Llama-3.1-8B-Instruct"
+
+# Function to find the first available active IB device
+find_active_ib_device() {
+    for device in mlx5_{0..11}; do
+        if ibv_devinfo $device >/dev/null 2>&1; then
+            state=$(ibv_devinfo $device | grep "state:" | head -1 | awk '{print $2}')
+            if [[ "$state" == "PORT_ACTIVE" ]]; then
+                echo "$device"
+                return 0
+            fi
+        fi
+    done
+    echo "No active IB device found" >&2
+    return 1
+}
+
+# Get the first available active IB device
+DEVICE=$(find_active_ib_device)
+echo "Using IB device: $DEVICE"
+
+# Launch prefill servers on GPU 0–3
+for i in {0..3}; do
+  PORT=$((30001 + i))
+  BOOTSTRAP_PORT=$((9001 + i))
+  HOST="127.0.0.$((i + 1))"
+  echo "Launching PREFILL server on GPU $i at $HOST:$PORT (bootstrap: $BOOTSTRAP_PORT)"
+  CUDA_VISIBLE_DEVICES=$i \
+  python3 -m sglang.launch_server \
+    --model-path "$MODEL_PATH" \
+    --disaggregation-mode prefill \
+    --host "$HOST" \
+    --port "$PORT" \
+    --disaggregation-ib-device "$DEVICE" \
+    --disaggregation-bootstrap-port "$BOOTSTRAP_PORT" &
+done
+
+# Launch decode servers on GPU 4–7
+for i in {4..7}; do
+  PORT=$((30001 + i))
+  HOST="127.0.0.$((i + 1))"
+  echo "Launching DECODE server on GPU $i at $HOST:$PORT"
+  CUDA_VISIBLE_DEVICES=$i \
+  python3 -m sglang.launch_server \
+    --model-path "$MODEL_PATH" \
+    --disaggregation-mode decode \
+    --host "$HOST" \
+    --port "$PORT" \
+    --disaggregation-ib-device "$DEVICE" \
+    --base-gpu-id 0 &
+done
+
+# Wait for disaggregation servers to initialize
+echo "Waiting for disaggregation servers to initialize..."
+
+# Health check with 5-minute timeout
+TIMEOUT=300
+START_TIME=$(date +%s)
+
+echo "Checking health of all 8 servers..."
+while true; do
+    CURRENT_TIME=$(date +%s)
+    ELAPSED=$((CURRENT_TIME - START_TIME))
+
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo "❌ Timeout: Servers did not become healthy within 5 minutes"
+        exit 1
+    fi
+
+    HEALTHY_COUNT=0
+    # Check all 8 servers (127.0.0.1-8:30001-30008)
+    for i in {1..8}; do
+        if curl -s -f "http://127.0.0.$i:$((30000 + i))/health" >/dev/null 2>&1; then
+            HEALTHY_COUNT=$((HEALTHY_COUNT + 1))
+        fi
+    done
+
+    echo "Healthy servers: $HEALTHY_COUNT/8 (elapsed: ${ELAPSED}s)"
+
+    if [ $HEALTHY_COUNT -eq 8 ]; then
+        echo "✅ All 8 servers are healthy!"
+        break
+    else
+        sleep 10  # Wait 10 seconds before next check
+    fi
+done
+
+# Don't launch router here - just keep servers running
+echo "✅ All disaggregation servers are ready and waiting for router connections"
+
+# Keep the script running
+wait
diff --git a/scripts/ci/npu_ci_install_dependency.sh b/scripts/ci/npu_ci_install_dependency.sh
new file mode 100755
index 000000000..71cf46f7f
--- /dev/null
+++ b/scripts/ci/npu_ci_install_dependency.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -euo pipefail
+
+PIP_INSTALL="pip install --no-cache-dir"
+
+
+# Install the required dependencies in CI.
+apt update -y && apt install -y \
+    build-essential \
+    cmake \
+    wget \
+    curl \
+    net-tools \
+    zlib1g-dev \
+    lld \
+    clang \
+    locales \
+    ccache \
+    ca-certificates
+update-ca-certificates
+python3 -m ${PIP_INSTALL} --upgrade pip
+
+
+### Download MemFabricV2
+MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
+MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
+wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}"
+
+
+### Install vLLM
+VLLM_TAG=v0.8.5
+git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
+(cd vllm && VLLM_TARGET_DEVICE="empty" ${PIP_INSTALL} -v -e .)
+
+
+### Install PyTorch and PTA
+PYTORCH_VERSION=2.6.0
+TORCHVISION_VERSION=0.21.0
+${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
+
+PTA_VERSION="v7.1.0.1-pytorch2.6.0"
+PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
+PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}"
+wget -O "${PTA_NAME}" "${PTA_URL}" && ${PIP_INSTALL} "./${PTA_NAME}"
+
+
+### Install Triton-Ascend
+TRITON_ASCEND_NAME="triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${TRITON_ASCEND_NAME}"
+${PIP_INSTALL} attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
+wget -O "${TRITON_ASCEND_NAME}" "${TRITON_ASCEND_URL}" && ${PIP_INSTALL} "./${TRITON_ASCEND_NAME}"
+
+
+### Install sgl-kernel-npu
+SGL_KERNEL_NPU_TAG="20250901"
+git clone --depth 1 https://github.com/sgl-project/sgl-kernel-npu.git --branch ${SGL_KERNEL_NPU_TAG}
+(cd sgl-kernel-npu && bash ./build.sh -a deepep && pip install output/deep_ep*.whl && cd "$(pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so)
+
+
+### Install SGLang
+${PIP_INSTALL} -v -e "python[srt_npu]"
diff --git a/scripts/code_sync/copy_from_oss.py b/scripts/code_sync/copy_from_oss.py
new file mode 100644
index 000000000..5590a73a0
--- /dev/null
+++ b/scripts/code_sync/copy_from_oss.py
@@ -0,0 +1,293 @@
+"""
+Sync code from OSS repo to the local repo and open a PR if changes exist.
+
+NOTE:
+1. You need to execute this script in the git root folder.
+2. A GH_TOKEN environment variable is required to create the pull request.
+  - see also https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens
+
+This script will:
+1. Clone the sgl-project/sglang repository (or use a local copy).
+2. Sync specified files and directories using rsync.
+3. Check if the sync operation resulted in any changes.
+4. If there are changes:
+   a. Create a new branch.
+   b. Commit and push the changes.
+   c. Open a pull request using the GitHub CLI (gh).
+
+Usage:
+# Run the full sync and PR creation process
+python3 scripts/copy_from_oss.py
+
+# Perform a dry run without making any actual changes
+python3 scripts/copy_from_oss.py --dry-run
+
+# Use a local directory as the source instead of cloning
+python3 scripts/copy_from_oss.py --local-dir ~/projects/sglang
+"""
+
+import argparse
+import datetime
+import os
+import shutil
+import subprocess
+import tempfile
+
+# --- Configuration Begin ---
+# List of folders and files to copy from the OSS repo.
+# Changes outside these paths will be ignored.
+folder_names = [
+    "3rdparty",
+    "assets",
+    "benchmark",
+    "docker",
+    "docs",
+    "examples",
+    "sgl-kernel",
+    "README.md",
+    "python/sglang/lang",
+    "python/sglang/srt",
+    "python/sglang/test",
+    "test/lang",
+    "test/srt",
+]
+
+private_repo = "your-org/sglang-private-repo"
+# --- Configuration End ---
+
+
+def write_github_step_summary(content):
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):
+        return
+
+    with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
+        f.write(content)
+
+
+def check_dependencies():
+    """Check for required command-line tools."""
+    if not shutil.which("git"):
+        raise EnvironmentError("git is not installed or not in PATH.")
+    if not shutil.which("gh"):
+        raise EnvironmentError("GitHub CLI (gh) is not installed or not in PATH.")
+    print("✅ All dependencies (git, gh) are available.")
+
+
+def checkout_main(dry_run):
+    """Checkout to the main branch."""
+    commands = [
+        "git checkout main",
+        "git reset --hard",
+    ]
+    for cmd in commands:
+        print(f"Run: {cmd}")
+        if not dry_run:
+            try:
+                subprocess.run(cmd, shell=True, check=True, capture_output=True)
+            except subprocess.CalledProcessError as e:
+                print(f"Git command failed: {e.stderr.decode()}")
+                raise
+    print("✅ Checkout the main branch.")
+
+
+def get_source_folder(args):
+    """
+    Prepare the source repository, either by cloning from GitHub or using a local directory.
+    Returns the path to the source repo root, a temporary directory path (if created),
+    and the short commit hash.
+    """
+    temp_dir = None
+    if args.local_dir:
+        oss_root = os.path.expanduser(args.local_dir)
+        if not os.path.exists(oss_root):
+            raise FileNotFoundError(
+                f"Specified local directory {oss_root} does not exist."
+            )
+        print(f"Using local directory as the source: {oss_root}")
+    else:
+        temp_dir = tempfile.mkdtemp()
+        oss_root = temp_dir
+        print(f"Created temporary directory: {oss_root}")
+
+        repo_url = "https://github.com/sgl-project/sglang.git"
+        try:
+            subprocess.run(
+                [
+                    "git",
+                    "clone",
+                    "--single-branch",
+                    "--branch",
+                    "main",
+                    repo_url,
+                    temp_dir,
+                ],
+                check=True,
+                capture_output=True,
+            )
+            print(f"Successfully cloned repository to {temp_dir}")
+        except subprocess.CalledProcessError as e:
+            print(f"Error cloning repository: {e.stderr.decode()}")
+            raise
+
+    commit_hash = subprocess.run(
+        ["git", "-C", oss_root, "rev-parse", "HEAD"],
+        capture_output=True,
+        text=True,
+        check=True,
+    ).stdout.strip()[:8]
+    print(f"✅ Get source OSS code at commit: {commit_hash}")
+    return oss_root, temp_dir, commit_hash
+
+
+def sync_directories(oss_root, folder_names, dry_run):
+    """Sync specified directories from oss_root to current working directory."""
+    rsync_commands = []
+    for folder_name in folder_names:
+        target_name = f"{oss_root}/{folder_name}"
+        src_name = "./" + "/".join(folder_name.split("/")[:-1])
+        cmd = f"rsync -r --delete {target_name} {src_name}"
+        rsync_commands.append(cmd)
+
+    for cmd in rsync_commands:
+        try:
+            print(f"Run: {cmd}")
+            if not dry_run:
+                subprocess.run(cmd, shell=True, check=True)
+        except subprocess.CalledProcessError as e:
+            print(f"Error executing command '{cmd}': {e}")
+            raise
+    print(f"✅ Sync all folders.")
+
+
+def check_for_changes():
+    """Check if there are any uncommitted git changes."""
+    # This command exits with 1 if there are changes, 0 otherwise.
+    result = subprocess.run(["git", "diff", "--quiet"])
+    return result.returncode != 0
+
+
+def create_and_push_branch(branch_name, commit_message, dry_run):
+    """Create a new branch, commit all changes, and push to origin."""
+    commands = [
+        f"git checkout -b {branch_name}",
+        "git config user.name 'github-actions[bot]'",
+        "git config user.email 'github-actions[bot]@users.noreply.github.com'",
+        "git add .",
+        f"git commit -m '{commit_message}'",
+        f"git push origin {branch_name} --force",
+    ]
+    print("\nCreating and pushing git branch...")
+    for cmd in commands:
+        print(f"Run: {cmd}")
+        if not dry_run:
+            try:
+                subprocess.run(cmd, shell=True, check=True, capture_output=True)
+            except subprocess.CalledProcessError as e:
+                print(f"Git command failed: {e.stderr.decode()}")
+                raise
+
+
+def create_pull_request(branch_name, title, body, dry_run):
+    """Create a pull request using the GitHub CLI."""
+    gh_token = os.getenv("GH_TOKEN")
+    if not gh_token:
+        print(
+            "\n⚠️ Warning: GH_TOKEN environment variable not set. Skipping PR creation."
+        )
+        if not dry_run:
+            return
+
+    print("\nCreating pull request...")
+    command = [
+        "gh",
+        "pr",
+        "create",
+        "--base",
+        "main",
+        "--head",
+        branch_name,
+        "--repo",
+        private_repo,
+        "--title",
+        title,
+        "--body",
+        body,
+    ]
+    print(f"Run: {' '.join(command)}")
+    if not dry_run:
+        env = os.environ.copy()
+        env["GH_TOKEN"] = gh_token
+        try:
+            result = subprocess.run(
+                command, check=True, capture_output=True, text=True, env=env
+            )
+            pr_url = result.stdout.strip()
+            msg = f"✅ Successfully created pull request: {pr_url}"
+            print(msg)
+            write_github_step_summary(msg)
+        except subprocess.CalledProcessError as e:
+            print(f"Error creating pull request: {e.stderr}")
+            raise
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Copy code from OSS and open a PR if changes are detected."
+    )
+    parser.add_argument(
+        "--local-dir",
+        type=str,
+        help="Path to local SGLang directory to use instead of cloning from GitHub.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Dry run the script without executing git, rsync, or gh commands.",
+    )
+    args = parser.parse_args()
+
+    check_dependencies()
+    checkout_main(args.dry_run)
+
+    oss_root, temp_dir, oss_commit = get_source_folder(args)
+
+    try:
+        # Sync directories
+        sync_directories(oss_root, folder_names, args.dry_run)
+
+        # Check for changes and create PR if necessary
+        if not check_for_changes():
+            msg = "😴 No changes detected. The code is already in sync."
+            print(msg)
+            write_github_step_summary(msg)
+            return
+
+        print("✅ Changes detected. Proceeding to create a PR.")
+
+        current_date = datetime.datetime.now().strftime("%Y%m%d")
+        branch_name = f"copy-from-oss-{oss_commit}-{current_date}"
+        commit_message = f"Copy OSS code from {oss_commit} on {current_date}"
+        pr_title = (
+            f"[Automated PR] Copy OSS code from commit {oss_commit} on {current_date}"
+        )
+        pr_body = (
+            f"Copy OSS code from https://github.com/sgl-project/sglang/commit/{oss_commit} on {current_date}."
+            "\n\n---\n\n"
+            "*This is an automated PR created by scripts/copy_from_oss.py.*"
+        )
+
+        create_and_push_branch(branch_name, commit_message, args.dry_run)
+        create_pull_request(branch_name, pr_title, pr_body, args.dry_run)
+
+    finally:
+        # Remove temporary directory if it was created
+        if temp_dir:
+            try:
+                shutil.rmtree(temp_dir)
+                print(f"\nRemoved temporary directory: {temp_dir}")
+            except OSError as e:
+                print(f"Error removing temporary directory {temp_dir}: {e}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/code_sync/copy_to_oss.py b/scripts/code_sync/copy_to_oss.py
new file mode 100644
index 000000000..cd931ffd6
--- /dev/null
+++ b/scripts/code_sync/copy_to_oss.py
@@ -0,0 +1,425 @@
+"""
+Sync a specific commit from the local private repo to the OSS upstream and open a PR.
+
+NOTE:
+1. You need to execute this script in the git root folder.
+2. A GH_TOKEN environment variable is required to create the pull request.
+  - see also https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens
+
+This script will:
+1. Take a commit hash as an argument (or use the latest commit by default).
+2. Create a patch for that commit.
+3. Filter the patch to only include changes in specified directories.
+4. Clone the sgl-project/sglang repository.
+5. Create a new branch in the OSS repo.
+6. Apply the filtered patch, commit, and force push.
+7. Open a pull request to the OSS repo using the GitHub CLI (gh).
+
+Usage:
+# Sync the latest commit from the current branch
+python3 scripts/copy_to_oss.py
+
+# Run the full sync and PR creation process for a given commit
+python3 scripts/copy_to_oss.py --commit <commit_hash>
+
+# Perform a dry run without making any actual changes
+python3 scripts/copy_to_oss.py --commit <commit_hash> --dry-run
+"""
+
+import argparse
+import datetime
+import os
+import shutil
+import subprocess
+import tempfile
+
+# --- Configuration Begin ---
+# List of folders and files to copy to the OSS repo.
+# Changes outside these paths will be ignored.
+folder_names = [
+    "3rdparty",
+    "assets",
+    "benchmark",
+    "docker",
+    "docs",
+    "examples",
+    "sgl-kernel",
+    "README.md",
+    "python/sglang/lang",
+    "python/sglang/srt",
+    "python/sglang/test",
+    "test/lang",
+    "test/srt",
+]
+
+# --- Configuration End ---
+
+
+def write_github_step_summary(content):
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):
+        return
+
+    with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
+        f.write(content)
+
+
+def get_commit_info(commit_ref):
+    """
+    Retrieves the hash and message of a specific commit.
+
+    Args:
+        commit_ref (str): The commit hash, tag, or branch to inspect (e.g., 'HEAD').
+
+    Returns:
+        A tuple containing the (commit_hash, commit_message),
+        or (None, None) if an error occurs.
+    """
+    try:
+        # Use a custom format to get the hash (%H) and the full message (%B)
+        # separated by a null character for safe parsing.
+        command = ["git", "log", "-1", f"--pretty=%H%x00%B", commit_ref]
+        result = subprocess.run(
+            command, capture_output=True, text=True, check=True, encoding="utf-8"
+        )
+
+        # Split the output by the null character separator
+        commit_hash, commit_message = result.stdout.strip().split("\x00", 1)
+        return commit_hash, commit_message
+
+    except FileNotFoundError:
+        print("❌ Error: 'git' command not found. Is Git installed and in your PATH?")
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error getting commit info for '{commit_ref}': {e.stderr.strip()}")
+        print(
+            "Hint: Make sure you are running this from within a Git repository and the commit exists."
+        )
+
+    return None, None
+
+
+def check_dependencies():
+    """Check for required command-line tools."""
+    if not shutil.which("git"):
+        raise EnvironmentError("git is not installed or not in PATH.")
+    if not shutil.which("gh"):
+        raise EnvironmentError("GitHub CLI (gh) is not installed or not in PATH.")
+    print("✅ All dependencies (git, gh) are available.")
+
+
+def create_filtered_patch(commit_hash, dry_run):
+    """
+    Create a patch file for the given commit, containing only changes
+    to files and directories specified in `folder_names`.
+    """
+    print(f"Creating a filtered patch for commit {commit_hash}")
+
+    try:
+        # Get the list of all files changed in the commit
+        changed_files_raw = subprocess.run(
+            ["git", "diff-tree", "--no-commit-id", "--name-only", "-r", commit_hash],
+            capture_output=True,
+            text=True,
+            check=True,
+        ).stdout
+        changed_files = changed_files_raw.strip().split("\n")
+
+        # Filter the list of files
+        relevant_files = [
+            f for f in changed_files if any(f.startswith(path) for path in folder_names)
+        ]
+
+        if not relevant_files:
+            msg = "\n😴 No relevant file changes found in this commit. Exiting."
+            print(msg)
+            write_github_step_summary(msg)
+            return None, None
+
+        print("Found relevant changes in the following files:")
+        for f in relevant_files:
+            print(f"  - {f}")
+
+        # Create a patch containing only the changes for the relevant files
+        patch_command = [
+            "git",
+            "format-patch",
+            "--stdout",
+            f"{commit_hash}^..{commit_hash}",
+            "--",
+        ] + relevant_files
+
+        print(f"Run: {' '.join(patch_command)}")
+
+        patch_content = subprocess.run(
+            patch_command, capture_output=True, text=True, check=True
+        ).stdout
+
+        # Save the patch to a temporary file
+        patch_file = tempfile.NamedTemporaryFile(
+            mode="w", delete=False, suffix=".patch", encoding="utf-8"
+        )
+        patch_file.write(patch_content)
+        patch_file.close()
+
+        print(f"✅ Filtered patch created successfully at: {patch_file.name}")
+        return patch_file.name, relevant_files
+
+    except subprocess.CalledProcessError as e:
+        print(f"Error creating patch: {e.stderr}")
+        raise
+
+
+def get_oss_repo(dry_run):
+    """
+    Clones the OSS repository into a temporary directory.
+    Returns the path to the repo root and the temp directory itself.
+    """
+    gh_token = os.getenv("GH_TOKEN")
+    if not gh_token:
+        print("⚠️ Warning: GH_TOKEN environment variable not set. Skipping PR creation.")
+        if not dry_run:
+            return
+
+    temp_dir = tempfile.mkdtemp()
+    oss_root = os.path.join(temp_dir, "sglang")
+    print(f"\nCreated temporary directory for OSS repo: {temp_dir}")
+
+    repo_url = f"https://{gh_token}@github.com/sgl-project/sglang.git"
+    command = ["git", "clone", "--branch", "main", repo_url, oss_root]
+
+    print(f"Run: {' '.join(command)}")
+    if not dry_run:
+        try:
+            subprocess.run(command, check=True, capture_output=True)
+            print(f"✅ Successfully cloned repository to {oss_root}")
+        except subprocess.CalledProcessError as e:
+            print(f"Error cloning repository: {e.stderr.decode()}")
+            shutil.rmtree(temp_dir)
+            raise
+
+    return oss_root, temp_dir
+
+
+def apply_patch_and_push(oss_root, patch_file, branch_name, commit_message, dry_run):
+    """
+    In the OSS repo, create a branch, apply the patch, commit, and push.
+    """
+    print("\nApplying patch and pushing to OSS repo...")
+
+    original_cwd = os.getcwd()
+    if not dry_run:
+        os.chdir(oss_root)
+
+    try:
+        # Define commands as lists to avoid shell injection issues
+        commands_to_run = [
+            ["git", "checkout", "-b", branch_name],
+            ["git", "apply", patch_file],
+            ["git", "config", "user.name", "github-actions[bot]"],
+            [
+                "git",
+                "config",
+                "user.email",
+                "github-actions[bot]@users.noreply.github.com",
+            ],
+            ["git", "add", "."],
+        ]
+
+        for cmd_list in commands_to_run:
+            print(f"Run: {' '.join(cmd_list)}")
+            if not dry_run:
+                subprocess.run(cmd_list, check=True, capture_output=True, text=True)
+
+        # Handle commit separately to pass multi-line message safely via stdin
+        commit_cmd = ["git", "commit", "-F", "-"]
+        print(f"Run: {' '.join(commit_cmd)}")
+        if not dry_run:
+            print(f"Commit Message:\n---\n{commit_message}\n---")
+            subprocess.run(
+                commit_cmd,
+                input=commit_message,
+                text=True,
+                check=True,
+                capture_output=True,
+            )
+
+        # Push the changes
+        push_cmd = ["git", "push", "origin", branch_name, "--force"]
+        print(f"Run: {' '.join(push_cmd)}")
+        if not dry_run:
+            subprocess.run(push_cmd, check=True, capture_output=True, text=True)
+
+    except subprocess.CalledProcessError as e:
+        print(f"Git command failed: {e.stderr}")
+        raise
+    finally:
+        if not dry_run:
+            os.chdir(original_cwd)
+
+    print("✅ Branch created, patch applied, and pushed successfully.")
+
+
+def create_pull_request(oss_root, branch_name, title, body, dry_run):
+    """Create a pull request in the OSS repo using the GitHub CLI."""
+    gh_token = os.getenv("GH_TOKEN")
+    if not gh_token:
+        print("⚠️ Warning: GH_TOKEN environment variable not set. Skipping PR creation.")
+        if not dry_run:
+            return
+
+    print("\nCreating pull request...")
+    command = [
+        "gh",
+        "pr",
+        "create",
+        "--base",
+        "main",
+        "--head",
+        branch_name,
+        "--repo",
+        "sgl-project/sglang",
+        "--title",
+        title,
+        "--body",
+        body,
+    ]
+
+    print(f"Run: {' '.join(command)}")
+    if not dry_run:
+        env = os.environ.copy()
+        env["GH_TOKEN"] = gh_token
+        try:
+            result = subprocess.run(
+                command,
+                check=True,
+                capture_output=True,
+                text=True,
+                env=env,
+                cwd=oss_root,
+            )
+            msg = f"✅ Successfully created pull request: {result.stdout.strip()}"
+            print(msg)
+            write_github_step_summary(msg)
+        except subprocess.CalledProcessError as e:
+            print(f"Error creating pull request: {e.stderr}")
+            # Check if a PR already exists
+            if "A pull request for" in e.stderr and "already exists" in e.stderr:
+                print("ℹ️ A PR for this branch likely already exists.")
+            else:
+                raise
+
+
+def get_commit_author(commit_hash):
+    """Get the author name and email of a commit."""
+    try:
+        author_name = subprocess.run(
+            ["git", "show", "-s", "--format=%an", commit_hash],
+            capture_output=True,
+            text=True,
+            check=True,
+        ).stdout.strip()
+        author_email = subprocess.run(
+            ["git", "show", "-s", "--format=%ae", commit_hash],
+            capture_output=True,
+            text=True,
+            check=True,
+        ).stdout.strip()
+        return author_name, author_email
+    except subprocess.CalledProcessError as e:
+        print(f"Error getting commit author for {commit_hash}: {e.stderr}")
+        raise
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Copy a commit from the private repo to OSS and open a PR."
+    )
+    parser.add_argument(
+        "--commit",
+        type=str,
+        default="LAST",
+        help="The commit hash to sync. Defaults to 'LAST' to use the latest commit.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Dry run the script without executing git, rsync, or gh commands.",
+    )
+    args = parser.parse_args()
+
+    check_dependencies()
+
+    commit_ref = "HEAD" if args.commit == "LAST" else args.commit
+    commit_hash, original_commit_message = get_commit_info(commit_ref)
+
+    if not commit_hash:
+        return  # Exit if we couldn't get commit info
+
+    # Display the details of the commit being processed
+    if args.commit == "LAST":
+        summary = (
+            f"\nℹ️ No commit specified. Using the last commit:\n"
+            f"  - **Hash:** `{commit_hash}`\n"
+            f"  - **Message:** {original_commit_message}\n\n"
+        )
+    else:
+        summary = (
+            f"\nℹ️ Using specified commit:\n"
+            f"  - **Hash:** `{commit_hash}`\n"
+            f"  - **Message:** {original_commit_message}\n\n"
+        )
+    print(summary)
+    write_github_step_summary(summary)
+
+    short_hash = commit_hash[:8]
+
+    patch_file = None
+    temp_dir = None
+    try:
+        # 1. Create a filtered patch from the local repo
+        patch_file, relevant_files = create_filtered_patch(commit_hash, args.dry_run)
+        if not patch_file:
+            return
+
+        # 2. Get the OSS repo
+        oss_root, temp_dir = get_oss_repo(args.dry_run)
+
+        # 3. Get original commit author for the co-author line
+        author_name, author_email = get_commit_author(commit_hash)
+
+        # 4. Prepare content for the commit and PR based on changed files
+        file_list_str = "\n".join([f"- {f}" for f in relevant_files])
+        filename_list_str = ", ".join([f.split("/")[-1] for f in relevant_files])
+        if len(filename_list_str) > 40:
+            filename_list_str = filename_list_str[:40] + "..."
+        current_date = datetime.datetime.now().strftime("%Y%m%d")
+        pr_title = f"[Auto Sync] Update {filename_list_str} ({current_date})"
+        pr_body = (
+            f"Sync changes from commit `{short_hash}`.\n\n"
+            f"**Relevant Files Changed:**\n{file_list_str}"
+            "\n\n---\n\n"
+            "*This is an automated PR created by a script.*"
+        )
+
+        # 5. Create branch, apply patch, and push
+        branch_name = f"sync-{short_hash}-{current_date}"
+        co_author_line = f"Co-authored-by: {author_name} <{author_email}>"
+        commit_message = f"{pr_title}\n\n{co_author_line}"
+        apply_patch_and_push(
+            oss_root, patch_file, branch_name, commit_message, args.dry_run
+        )
+
+        # 6. Create Pull Request
+        create_pull_request(oss_root, branch_name, pr_title, pr_body, args.dry_run)
+
+    finally:
+        # Cleanup temporary files
+        if patch_file and os.path.exists(patch_file):
+            os.remove(patch_file)
+            print(f"\nRemoved temporary patch file: {patch_file}")
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+            print(f"Removed temporary directory: {temp_dir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/code_sync/guideline.md b/scripts/code_sync/guideline.md
new file mode 100644
index 000000000..52f08eb4b
--- /dev/null
+++ b/scripts/code_sync/guideline.md
@@ -0,0 +1,27 @@
+### Sync Code Between OSS and Private Fork
+
+You can use the following principles and tools to sync the code between a private fork and the OSS repo [sgl-project/sglang](https://github.com/sgl-project/sglang/tree/main).
+It learns from [Copybara](https://github.com/google/copybara), a tool used at Google for maintaining open-source code synchronization.
+
+## Principals
+
+- The core folders (e.g., `python/sglang/srt`) are 100% mirrored between the private fork and OSS repo.
+- The OSS repo is the single source of truth. If one commit changes `python/sglang/srt` in the private repo, the change should be synced to the OSS repo as soon as possible with the action B below.
+- The common code (e.g., base classes, well-known techniques in the industry without private secrets) goes to `python/sglang/srt`. The private-specific code (e.g., with private-specific features, confidential info) goes to `python/sglang/private` .
+- Anytime you want to make private changes to a file or class under `python/sglang/srt`, duplicate the file and move it under `python/sglang/private`. You can achieve code reuse by importing and inheriting.
+
+## How to sync the code bidirectionally
+### Action A: Copy code from OSS to private
+
+- We can run this action: [Open A PR to Copy Code From OSS](https://github.com/sgl-project/sglang/tree/main/.github/workflows/open-pr-copy-from-oss.yml)
+    - It opens a PR to copy all files under certain folders (e.g., `python/sglang/srt` , `test/srt` , `sgl-kernel` ) from the OSS main branch to the private fork.
+    - Since the OSS repo is the single source of truth, this action copies files and overwrites any changes in the private fork. To prevent the private changes from being overwritten, you need to ensure all private changes are merged into the OSS repo before running this action.
+- This action will be run automatically every day and can also be triggered manually.
+
+### Action B: Copy diff from private to OSS
+
+- We can run this action: [Open A PR to Copy Code To OSS](https://github.com/sgl-project/sglang/tree/main/.github/workflows/open-pr-copy-to-oss.yml)
+    - It opens a PR to apply the diff of one specific commit of the private fork to the OSS main branch. It will only pick the changes under certain folders (e.g., `python/sglang/srt` , `test/srt` , `sgl-kernel` ) and ignore changes under private folders (e.g., `python/sglang/private` )
+    - For example, you can have a PR that changes both `python/sglang/srt` and `python/sglang/private/srt`. Once you merge the PR into the private repo, `python/sglang/srt` becomes desynced between the two repos. You need to run this action on your merge commit immediately to open a PR to send your diff to the OSS repo. Then, we need to merge the OSS PR as soon as possible. Once your OSS PR is merged, we can run action A again.
+    - Action A copies files directly, but Action B applies diff. This is because OSS is the source of truth; action A can just copy files. Action B cannot copy, so it uses diff instead.
+- This action currently needs a manual trigger in order to prevent incidental code leaks. One can also consider making it automatic.
diff --git a/scripts/code_sync/install_github_cli.sh b/scripts/code_sync/install_github_cli.sh
new file mode 100755
index 000000000..2ef1db023
--- /dev/null
+++ b/scripts/code_sync/install_github_cli.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Check if gh is installed before attempting to install it
+if ! command -v gh &> /dev/null
+then
+echo "GitHub CLI not found. Installing now..."
+(type -p wget >/dev/null || ( apt update &&  apt install wget -y)) \
+&&  mkdir -p -m 755 /etc/apt/keyrings \
+&& out=$(mktemp) && wget -nv -O$out https://cli.github.com/packages/githubcli-archive-keyring.gpg \
+&& cat $out |  tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \
+&&  chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
+&&  mkdir -p -m 755 /etc/apt/sources.list.d \
+&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" |  tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
+&&  apt update \
+&&  apt install gh -y
+else
+echo "GitHub CLI is already installed. Skipping installation."
+fi
diff --git a/scripts/deprecated/convert_yi_vl.py b/scripts/deprecated/convert_yi_vl.py
new file mode 100644
index 000000000..cdd8aebeb
--- /dev/null
+++ b/scripts/deprecated/convert_yi_vl.py
@@ -0,0 +1,38 @@
+"""
+Convert Yi-VL config into a format usable with SGLang
+
+Usage: python3 scripts/convert_yi_vl.py --model-path <path-to-model>
+"""
+
+import argparse
+import json
+import os
+
+from transformers import AutoConfig, AutoTokenizer
+
+
+def add_image_token(model_path: str):
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer.add_tokens(["<image_placeholder>"], special_tokens=True)
+
+    print(tokenizer)
+    tokenizer.save_pretrained(model_path)
+
+
+def edit_model_config(model_path):
+    config = AutoConfig.from_pretrained(model_path)
+
+    setattr(config, "architectures", ["YiVLForCausalLM"])
+    setattr(config, "image_token_index", 64002)
+
+    print(config)
+    config.save_pretrained(model_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str)
+    args = parser.parse_args()
+
+    add_image_token(args.model_path)
+    edit_model_config(args.model_path)
diff --git a/scripts/deprecated/convert_yi_vl.sh b/scripts/deprecated/convert_yi_vl.sh
new file mode 100644
index 000000000..07a7aeba8
--- /dev/null
+++ b/scripts/deprecated/convert_yi_vl.sh
@@ -0,0 +1,13 @@
+# For 34B Model
+mkdir ~/model_weights
+cd ~/model_weights
+git clone https://huggingface.co/01-ai/Yi-VL-34B
+cp ~/model_weights/Yi-VL-34B/vit/clip-vit-H-14-laion2B-s32B-b79K-yi-vl-34B-448/preprocessor_config.json ~/model_weights/Yi-VL-34B
+python3 convert_yi_vl.py --model-path ~/model_weights/Yi-VL-34B
+
+# For 6B Model
+mkdir ~/model_weights
+cd ~/model_weights
+git clone https://huggingface.co/01-ai/Yi-VL-6B
+cp ~/model_weights/Yi-VL-6B/vit/clip-vit-H-14-laion2B-s32B-b79K-yi-vl-6B-448/preprocessor_config.json ~/model_weights/Yi-VL-6B
+python3 convert_yi_vl.py --model-path ~/model_weights/Yi-VL-6B
diff --git a/scripts/deprecated/test_curl.sh b/scripts/deprecated/test_curl.sh
new file mode 100644
index 000000000..1c83208a7
--- /dev/null
+++ b/scripts/deprecated/test_curl.sh
@@ -0,0 +1,9 @@
+curl http://localhost:30000/generate \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "Once upon a time,",
+    "sampling_params": {
+      "max_new_tokens": 64,
+      "temperature": 0
+    }
+  }'
diff --git a/scripts/deprecated/test_flashinfer.py b/scripts/deprecated/test_flashinfer.py
new file mode 100644
index 000000000..d017d114e
--- /dev/null
+++ b/scripts/deprecated/test_flashinfer.py
@@ -0,0 +1,217 @@
+import pytest
+import torch
+from flashinfer import (
+    BatchDecodeWithPagedKVCacheWrapper,
+    BatchPrefillWithPagedKVCacheWrapper,
+)
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import decode_attention_fwd
+from sglang.srt.layers.attention.triton_ops.extend_attention import (
+    extend_attention_fwd,
+    redundant_attention,
+)
+from sglang.srt.utils import should_use_tensor_core
+
+flashinfer_prefill_wrapper = None
+flashinfer_decode_wrapper = None
+
+
+@pytest.mark.parametrize("batch_size", [12, 37, 67])
+@pytest.mark.parametrize("kv_len", [54, 97])
+@pytest.mark.parametrize("qo_len", [37, 17])
+@pytest.mark.parametrize("num_kv_heads", [4])
+@pytest.mark.parametrize("num_qo_heads", [32, 4])
+@pytest.mark.parametrize("head_dim", [128])
+def test_batch_prefill_with_paged_kv_cache(
+    batch_size,
+    kv_len,
+    qo_len,
+    num_kv_heads,
+    num_qo_heads,
+    head_dim,
+):
+    init_flashinfer(num_qo_heads, num_kv_heads)
+
+    q = torch.randn(batch_size * qo_len, num_qo_heads, head_dim).to(0).half()
+    qo_indptr = torch.arange(0, batch_size + 1).to(0).int() * qo_len
+    total_tokens = kv_len * batch_size
+    kv_data = torch.randn(total_tokens, 2, num_kv_heads, head_dim).to(0).half()
+    kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
+    kv_indices = torch.arange(0, total_tokens).to(0).int()
+    kv_last_page_len = torch.full((batch_size,), 1, dtype=torch.int32).to(0)
+
+    # init args for triton kernel
+    k_extend = (
+        kv_data.view(batch_size, kv_len, 2, -1)[:, -qo_len:, 0]
+        .contiguous()
+        .view(-1, num_kv_heads, head_dim)
+    )
+    v_extend = (
+        kv_data.view(batch_size, kv_len, 2, -1)[:, -qo_len:, 1]
+        .contiguous()
+        .view(-1, num_kv_heads, head_dim)
+    )
+    o_triton = torch.empty_like(q)
+    k_buffer = kv_data[:, 0].view(-1, num_kv_heads, head_dim).contiguous()
+    v_buffer = kv_data[:, 1].view(-1, num_kv_heads, head_dim).contiguous()
+    req_to_token = torch.arange(0, total_tokens).to(0).int().view(batch_size, kv_len)
+    b_req_idx = torch.arange(0, batch_size).to(0).int()
+    b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32).to(0)
+    b_start_loc_extend = torch.arange(0, batch_size).to(0).int() * qo_len
+    b_seq_len_extend = torch.full((batch_size,), qo_len, dtype=torch.int32).to(0)
+    max_len_in_batch = kv_len
+    max_len_extend = qo_len
+
+    extend_attention_fwd(
+        q,
+        k_extend,
+        v_extend,
+        o_triton,
+        k_buffer,
+        v_buffer,
+        req_to_token,
+        b_req_idx,
+        None,  # b_start_loc = None
+        b_seq_len,
+        None,  # b_seq_len_prefix = None
+        b_start_loc_extend,
+        b_seq_len_extend,
+        max_len_in_batch,
+        max_len_extend,
+    )
+
+    o_redundant = torch.empty_like(q)
+    b_start_loc = torch.zeros((batch_size,), dtype=torch.int32).to(0)
+    b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], dim=0)
+    b_seq_len_prefix = b_seq_len - b_seq_len_extend
+
+    redundant_attention(
+        q,
+        k_extend,
+        v_extend,
+        o_redundant,
+        k_buffer,
+        v_buffer,
+        req_to_token,
+        b_req_idx,
+        b_start_loc,
+        b_seq_len,
+        b_seq_len_prefix,
+        max_len_in_batch,
+    )
+    print("Mean: ", torch.mean(torch.abs(o_redundant - o_triton)))
+    print("Max: ", torch.max(torch.abs(o_redundant - o_triton)))
+    assert torch.allclose(o_redundant, o_triton, rtol=1e-2, atol=1e-3)
+
+    flashinfer_prefill_wrapper.end_forward()
+
+    flashinfer_prefill_wrapper.begin_forward(
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        kv_last_page_len,
+        num_qo_heads,
+        num_kv_heads,
+        head_dim,
+        1,
+    )
+    o = flashinfer_prefill_wrapper.forward(
+        q.contiguous().view(-1, num_qo_heads, head_dim), kv_data
+    )
+
+    print("Mean: ", torch.mean(torch.abs(o - o_triton)))
+    print("Max: ", torch.max(torch.abs(o - o_triton)))
+    assert torch.allclose(o, o_triton, rtol=1e-2, atol=1e-3)
+
+
+@pytest.mark.parametrize("batch_size", [12, 17, 37])
+@pytest.mark.parametrize("kv_len", [54, 127, 537])
+@pytest.mark.parametrize("num_kv_heads", [32])
+@pytest.mark.parametrize("num_qo_heads", [32])
+@pytest.mark.parametrize("head_dim", [128])
+def test_batch_decode_with_paged_kv_cache(
+    batch_size,
+    kv_len,
+    num_kv_heads,
+    num_qo_heads,
+    head_dim,
+):
+    # note(lsyin): when pytest, the number of heads cannot change, because triton kernel has a cache
+    # to test different shape of decode, change the parameters in the __main__, and run decode only once
+    init_flashinfer(num_qo_heads, num_kv_heads)
+
+    q = torch.randn(batch_size, num_qo_heads, head_dim).to(0).half()
+    total_tokens = kv_len * batch_size
+    kv_data = torch.randn(total_tokens, 2, num_kv_heads, head_dim).to(0).half()
+    kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
+    kv_indices = torch.arange(0, total_tokens).to(0).int()
+    kv_last_page_len = torch.full((batch_size,), 1, dtype=torch.int32).to(0)
+
+    # init args for triton kernel
+    k_buffer = kv_data[:, 0].view(-1, num_kv_heads, head_dim).contiguous()
+    v_buffer = kv_data[:, 1].view(-1, num_kv_heads, head_dim).contiguous()
+    o_triton = torch.empty_like(q)
+    req_to_token = (
+        torch.arange(0, kv_len * batch_size).to(0).int().view(batch_size, kv_len)
+    )
+    b_req_idx = torch.arange(0, batch_size).to(0).int()
+    b_start_loc = torch.arange(0, batch_size).to(0).int() * kv_len
+    b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32).to(0)
+    max_len_in_batch = kv_len
+    other_kv_index = 0
+    decode_attention_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        o_triton,
+        req_to_token,
+        b_req_idx,
+        b_start_loc,
+        b_seq_len,
+        max_len_in_batch,
+        other_kv_index,
+        total_tokens,
+    )
+
+    flashinfer_decode_wrapper.end_forward()
+    flashinfer_decode_wrapper.begin_forward(
+        kv_indptr,
+        kv_indices,
+        kv_last_page_len,
+        num_qo_heads,
+        num_kv_heads,
+        head_dim,
+        1,
+        pos_encoding_mode="NONE",
+        data_type="float16",
+    )
+    o = flashinfer_decode_wrapper.forward(
+        q.contiguous().view(-1, num_qo_heads, head_dim), kv_data
+    )
+
+    print("Mean: ", torch.mean(torch.abs(o - o_triton)))
+    print("Max: ", torch.max(torch.abs(o - o_triton)))
+    assert torch.allclose(o, o_triton, rtol=1e-2, atol=2e-3)
+
+
+def init_flashinfer(num_attention_heads, num_kv_heads):
+    use_tensor_cores = should_use_tensor_core(
+        torch.half, num_attention_heads, num_kv_heads
+    )
+
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
+
+    global flashinfer_prefill_wrapper, flashinfer_decode_wrapper
+
+    flashinfer_prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD"
+    )
+    flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores
+    )
+
+
+if __name__ == "__main__":
+    test_batch_prefill_with_paged_kv_cache(12, 54, 37, 8, 8, 128)
+    test_batch_prefill_with_paged_kv_cache(37, 1111, 456, 32, 32, 128)
+    test_batch_decode_with_paged_kv_cache(12, 54, 4, 32, 128)
diff --git a/scripts/deprecated/test_httpserver_concurrent.py b/scripts/deprecated/test_httpserver_concurrent.py
new file mode 100644
index 000000000..855e51f33
--- /dev/null
+++ b/scripts/deprecated/test_httpserver_concurrent.py
@@ -0,0 +1,56 @@
+"""
+python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --port 30000
+
+Output:
+The capital of France is Paris.\nThe capital of the United States is Washington, D.C.
+
+The capital of the United Kindom is London.\nThe capital of the United Kingdom is London.\nThe capital of
+"""
+
+import argparse
+import asyncio
+import json
+import time
+
+import aiohttp
+import requests
+
+
+async def send_request(url, data, delay=0):
+    await asyncio.sleep(delay)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            output = await resp.json()
+    return output
+
+
+async def main(args):
+    url = f"{args.host}:{args.port}"
+    task1 = send_request(
+        url + "/generate",
+        {
+            "text": "The capital of France is",
+            "sampling_params": {"temperature": 0, "max_new_tokens": 128},
+        },
+        delay=1,
+    )
+
+    task2 = send_request(
+        url + "/generate",
+        {
+            "text": "The capital of the United Kindom is",
+            "sampling_params": {"temperature": 0, "max_new_tokens": 128},
+        },
+    )
+
+    rets = await asyncio.gather(task1, task2)
+    print(rets)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+
+    asyncio.run(main(args))
diff --git a/scripts/deprecated/test_httpserver_decode.py b/scripts/deprecated/test_httpserver_decode.py
new file mode 100644
index 000000000..57517a15b
--- /dev/null
+++ b/scripts/deprecated/test_httpserver_decode.py
@@ -0,0 +1,55 @@
+"""
+Usage:
+python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --port 30000
+python3 test_httpserver_decode.py
+
+Output:
+The capital of France is Paris.\nThe capital of the United States is Washington, D.C.\nThe capital of Canada is Ottawa.\nThe capital of Japan is Tokyo
+"""
+
+import argparse
+import json
+
+import requests
+
+
+def test_decode(url, return_logprob=False, top_logprobs_num=0, return_text=False, n=1):
+    response = requests.post(
+        url + "/generate",
+        json={
+            "text": "The capital of France is",
+            "sampling_params": {
+                "temperature": 0 if n == 1 else 0.5,
+                "max_new_tokens": 32,
+                "n": n,
+            },
+            "stream": False,
+            "return_logprob": return_logprob,
+            "top_logprobs_num": top_logprobs_num,
+            "return_text_in_logprobs": return_text,
+            "logprob_start_len": 0,
+        },
+    )
+    print(json.dumps(response.json()))
+    print("=" * 100)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+
+    url = f"{args.host}:{args.port}"
+
+    test_decode(url)
+    test_decode(url, n=3)
+
+    for top_logprobs_num in [0, 3]:
+        for return_text in [True, False]:
+            test_decode(
+                url,
+                return_logprob=True,
+                top_logprobs_num=top_logprobs_num,
+                return_text=return_text,
+            )
diff --git a/scripts/deprecated/test_httpserver_decode_stream.py b/scripts/deprecated/test_httpserver_decode_stream.py
new file mode 100644
index 000000000..616eaf6c4
--- /dev/null
+++ b/scripts/deprecated/test_httpserver_decode_stream.py
@@ -0,0 +1,68 @@
+"""
+Usage:
+python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --port 30000
+python3 test_httpserver_decode_stream.py
+
+Output:
+The capital of France is Paris.\nThe capital of the United States is Washington, D.C.\nThe capital of Canada is Ottawa.\nThe capital of Japan is Tokyo
+"""
+
+import argparse
+import json
+
+import requests
+
+
+def test_decode_stream(url, return_logprob, top_logprobs_num):
+    response = requests.post(
+        url + "/generate",
+        json={
+            "text": "The capital of France is",
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": 128,
+            },
+            "stream": True,
+            "return_logprob": return_logprob,
+            "top_logprobs_num": top_logprobs_num,
+            "return_text_in_logprobs": True,
+            "logprob_start_len": 0,
+        },
+        stream=True,
+    )
+
+    prev = 0
+    for chunk in response.iter_lines(decode_unicode=False):
+        chunk = chunk.decode("utf-8")
+        if chunk and chunk.startswith("data:"):
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+
+            if return_logprob:
+                assert data["meta_info"]["input_token_logprobs"] is not None
+                assert data["meta_info"]["output_token_logprobs"] is not None
+                for logprob, token_id, token_text in data["meta_info"][
+                    "output_token_logprobs"
+                ][prev:]:
+                    print(f"{token_text:12s}\t{logprob}\t{token_id}", flush=True)
+                prev = len(data["meta_info"]["output_token_logprobs"])
+            else:
+                output = data["text"].strip()
+                print(output[prev:], end="", flush=True)
+                prev = len(output)
+
+    print("=" * 100)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+
+    url = f"{args.host}:{args.port}"
+
+    test_decode_stream(url, False, 0)
+    test_decode_stream(url, True, 0)
+    test_decode_stream(url, True, 3)
diff --git a/scripts/deprecated/test_httpserver_llava.py b/scripts/deprecated/test_httpserver_llava.py
new file mode 100644
index 000000000..791fc6deb
--- /dev/null
+++ b/scripts/deprecated/test_httpserver_llava.py
@@ -0,0 +1,88 @@
+"""
+Usage:
+python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
+python3 test_httpserver_llava.py
+
+Output:
+The image features a man standing on the back of a yellow taxi cab, holding
+"""
+
+import argparse
+import asyncio
+import json
+
+import aiohttp
+import requests
+
+
+async def send_request(url, data, delay=0):
+    await asyncio.sleep(delay)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(url, json=data) as resp:
+            output = await resp.json()
+    return output
+
+
+async def test_concurrent(args):
+    url = f"{args.host}:{args.port}"
+
+    response = []
+    for i in range(8):
+        response.append(
+            send_request(
+                url + "/generate",
+                {
+                    "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:",
+                    "image_data": "example_image.png",
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 64,
+                    },
+                },
+            )
+        )
+
+    rets = await asyncio.gather(*response)
+    for ret in rets:
+        print(ret["text"])
+
+
+def test_streaming(args):
+    url = f"{args.host}:{args.port}"
+
+    response = requests.post(
+        url + "/generate",
+        json={
+            "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:",
+            "image_data": "example_image.png",
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": 128,
+            },
+            "stream": True,
+        },
+        stream=True,
+    )
+
+    prev = 0
+    for chunk in response.iter_lines(decode_unicode=False):
+        chunk = chunk.decode("utf-8")
+        if chunk and chunk.startswith("data:"):
+            if chunk == "data: [DONE]":
+                break
+            data = json.loads(chunk[5:].strip("\n"))
+            output = data["text"].strip()
+            print(output[prev:], end="", flush=True)
+            prev = len(output)
+    print("")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+
+    asyncio.run(test_concurrent(args))
+
+    test_streaming(args)
diff --git a/scripts/deprecated/test_httpserver_reuse.py b/scripts/deprecated/test_httpserver_reuse.py
new file mode 100644
index 000000000..ef866afc6
--- /dev/null
+++ b/scripts/deprecated/test_httpserver_reuse.py
@@ -0,0 +1,42 @@
+"""
+python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --port 30000
+
+Output:
+The capital of France is Paris.\nThe capital of the United States is Washington, D.C.\nThe capital of Canada is Ottawa.\nThe capital of Japan is Tokyo
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="http://127.0.0.1")
+    parser.add_argument("--port", type=int, default=30000)
+    args = parser.parse_args()
+
+    url = f"{args.host}:{args.port}"
+
+    response = requests.post(
+        url + "/generate",
+        json={
+            "text": "The capital of France is",
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": 32,
+            },
+        },
+    )
+    print(response.json())
+
+    response = requests.post(
+        url + "/generate",
+        json={
+            "text": "The capital of France is Paris.\nThe capital of the United States is",
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": 32,
+            },
+        },
+    )
+    print(response.json())
diff --git a/scripts/deprecated/test_jump_forward.py b/scripts/deprecated/test_jump_forward.py
new file mode 100644
index 000000000..315a50b5b
--- /dev/null
+++ b/scripts/deprecated/test_jump_forward.py
@@ -0,0 +1,138 @@
+import argparse
+from enum import Enum
+
+from pydantic import BaseModel, constr
+
+import sglang as sgl
+from sglang.srt.constrained.outlines_backend import build_regex_from_object
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+
+IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
+
+ip_jump_forward = (
+    r"The google's DNS sever address is "
+    + IP_REGEX
+    + r" and "
+    + IP_REGEX
+    + r". "
+    + r"The google's website domain name is "
+    + r"www\.(\w)+\.(\w)+"
+    + r"."
+)
+
+
+# fmt: off
+@sgl.function
+def regex_gen(s):
+    s += "Q: What is the IP address of the Google DNS servers?\n"
+    s += "A: " + sgl.gen(
+        "answer",
+        max_tokens=128,
+        temperature=0,
+        regex=ip_jump_forward,
+    )
+# fmt: on
+
+json_jump_forward = (
+    r"""The information about Hogwarts is in the following JSON format\.\n"""
+    + r"""\n\{\n"""
+    + r"""  "name": "[\w\d\s]*",\n"""
+    + r"""  "country": "[\w\d\s]*",\n"""
+    + r"""  "latitude": [-+]?[0-9]*\.?[0-9]+,\n"""
+    + r"""  "population": [-+]?[0-9]+,\n"""
+    + r"""  "top 3 landmarks": \["[\w\d\s]*", "[\w\d\s]*", "[\w\d\s]*"\],\n"""
+    + r"""\}\n"""
+)
+
+# fmt: off
+@sgl.function
+def json_gen(s):
+    s += sgl.gen(
+        "json",
+        max_tokens=128,
+        temperature=0,
+        regex=json_jump_forward,
+    )
+# fmt: on
+
+
+class Weapon(str, Enum):
+    sword = "sword"
+    axe = "axe"
+    mace = "mace"
+    spear = "spear"
+    bow = "bow"
+    crossbow = "crossbow"
+
+
+class Armor(str, Enum):
+    leather = "leather"
+    chainmail = "chainmail"
+    plate = "plate"
+
+
+class Character(BaseModel):
+    name: constr(max_length=10)
+    age: int
+    armor: Armor
+    weapon: Weapon
+    strength: int
+
+
+@sgl.function
+def character_gen(s):
+    s += "Give me a character description who is a wizard.\n"
+    s += sgl.gen(
+        "character",
+        max_tokens=128,
+        temperature=0,
+        regex=build_regex_from_object(Character),
+    )
+
+
+def main(args):
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    state = regex_gen.run(temperature=0)
+
+    print("=" * 20, "IP TEST", "=" * 20)
+    print(state.text())
+
+    state = json_gen.run(temperature=0)
+
+    print("=" * 20, "JSON TEST", "=" * 20)
+    print(state.text())
+
+    state = character_gen.run(temperature=0)
+
+    print("=" * 20, "CHARACTER TEST", "=" * 20)
+    print(state.text())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
+
+# ==================== IP TEST ====================
+# Q: What is the IP address of the Google DNS servers?
+# A: The google's DNS sever address is 8.8.8.8 and 8.8.4.4. The google's website domain name is www.google.com.
+# ==================== JSON TEST ====================
+# The information about Hogwarts is in the following JSON format.
+
+# {
+#   "name": "Hogwarts School of Witchcraft and Wizardry",
+#   "country": "Scotland",
+#   "latitude": 55.566667,
+#   "population": 1000,
+#   "top 3 landmarks": ["Hogwarts Castle", "The Great Hall", "The Forbidden Forest"],
+# }
+
+# ==================== CHARACTER TEST ====================
+# Give me a character description who is a wizard.
+# { "name" : "Merlin", "age" : 500, "armor" : "chainmail" , "weapon" : "sword" , "strength" : 10 }
diff --git a/scripts/deprecated/test_robust.py b/scripts/deprecated/test_robust.py
new file mode 100644
index 000000000..633e2e649
--- /dev/null
+++ b/scripts/deprecated/test_robust.py
@@ -0,0 +1,132 @@
+import argparse
+import random
+import string
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+
+TOKENIZER = None
+RANDOM_PREFILL_LEN = None
+RANDOM_DECODE_LEN = None
+
+
+def gen_prompt(token_num):
+    if RANDOM_PREFILL_LEN:
+        token_num = random.randint(1, token_num)
+
+    cha_set = string.ascii_letters + string.digits
+    ret = "".join(random.choices(cha_set, k=token_num))
+    while len(TOKENIZER(ret).input_ids) < token_num:
+        ret += random.choice(cha_set)
+
+    return ret
+
+
+def robust_test_dfs(s, d, args, leaf_states):
+    if d == 0:
+        s += "END"
+        leaf_states.append(s)
+        return
+
+    s += gen_prompt(args.len_prefill)
+    forks = s.fork(args.num_fork)
+    for fork_s in forks:
+        fork_s += gen_prompt(args.len_prefill)
+        new_tokens = (
+            args.len_decode
+            if not RANDOM_DECODE_LEN
+            else random.randint(1, args.len_decode)
+        )
+        fork_s += sgl.gen(
+            max_tokens=new_tokens,
+            ignore_eos=True,
+        )
+
+    for fork_s in forks:
+        robust_test_dfs(fork_s, d - 1, args, leaf_states)
+
+
+def robust_test_bfs(s, args, leaf_states):
+    old_forks = [s]
+    new_forks = []
+    for _ in range(args.depth):
+        for old_fork in old_forks:
+            old_fork += gen_prompt(args.len_prefill)
+            forks = old_fork.fork(args.num_fork)
+            for fork_s in forks:
+                fork_s += gen_prompt(args.len_prefill)
+                new_tokens = (
+                    args.len_decode
+                    if not RANDOM_DECODE_LEN
+                    else random.randint(1, args.len_decode)
+                )
+                fork_s += sgl.gen(
+                    max_tokens=new_tokens,
+                    ignore_eos=True,
+                )
+            new_forks.extend(forks)
+
+        old_forks = new_forks
+        new_forks = []
+
+    for old_fork in old_forks:
+        old_fork += "END"
+        leaf_states.append(old_fork)
+
+
+@sgl.function
+def robust_test(s, args):
+    leaf_states = []
+    if args.mode == "bfs":
+        robust_test_bfs(s, args, leaf_states)
+    else:
+        robust_test_dfs(s, args.depth, args, leaf_states)
+    return leaf_states
+
+
+def main(args):
+    backend = select_sglang_backend(args)
+
+    arguments = [{"args": args} for _ in range(args.num_req)]
+
+    states = robust_test.run_batch(
+        arguments, temperature=0, backend=backend, num_threads=args.parallel
+    )
+
+    with open(f"tmp_robust_{args.mode}.txt", "w") as f:
+        for state in states:
+            leaf_states = state.ret_value
+            for leaf_state in leaf_states:
+                assert leaf_state.text()[-3:] == "END"
+                f.write(leaf_state.text()[:-3] + "\n")
+
+
+if __name__ == "__main__":
+    # fmt: off
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-req", type=int, default=2)
+    parser.add_argument("--depth", type=int, default=3)
+    parser.add_argument("--num-fork", type=int, default=2)
+    parser.add_argument("--len-prefill", type=int, default=128)
+    parser.add_argument("--len-decode", type=int, default=128)
+    parser.add_argument("--random-prefill-len", action="store_true")
+    parser.add_argument("--random-decode-len", action="store_true")
+    parser.add_argument("--mode", type=str, default="bfs", choices=["dfs", "bfs"])
+    parser.add_argument("--tokenizer", type=str, default = "meta-llama/Llama-2-7b-chat-hf")
+    parser.add_argument("--trust-remote-code", action="store_true")
+    parser.add_argument("--seed", type=int, default=42)
+    args = add_common_sglang_args_and_parse(parser)
+    # fmt: on
+
+    RANDOM_PREFILL_LEN = args.random_prefill_len
+    RANDOM_DECODE_LEN = args.random_decode_len
+    TOKENIZER = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+
+    random.seed(args.seed)
+
+    main(args)
diff --git a/scripts/export_deepseek_nextn.py b/scripts/export_deepseek_nextn.py
new file mode 100644
index 000000000..5da0e4bc3
--- /dev/null
+++ b/scripts/export_deepseek_nextn.py
@@ -0,0 +1,115 @@
+"""
+Export NextN layer for DeepSeek-V3/R1 model. The exported model can be used for speculative decoding.
+
+Usage:
+python3 export_deepseek_nextn.py --input-dir /path/to/DeepSeek-V3 --output-dir /path/to/DeepSeek-V3-NextN
+"""
+
+import argparse
+import json
+import os
+import shutil
+
+from safetensors import safe_open
+from safetensors.torch import save_file
+from transformers import AutoConfig
+
+
+def get_nextn_layer_id(config):
+    if not hasattr(config, "num_hidden_layers"):
+        raise ValueError("'num_hidden_layers' not found in model config.")
+    return config.num_hidden_layers
+
+
+def update_and_save_config(config, output_dir):
+    new_config = config.to_dict()
+    new_config.update(
+        {
+            "num_hidden_layers": 1,
+            "architectures": ["DeepseekV3ForCausalLMNextN"],
+        }
+    )
+    with open(os.path.join(output_dir, "config.json"), "w") as f:
+        json.dump(new_config, f, indent=2, ensure_ascii=False, sort_keys=True)
+
+
+def copy_non_safetensors_files(input_dir, output_dir):
+    for filename in os.listdir(input_dir):
+        src_file_path = os.path.join(input_dir, filename)
+        if os.path.isfile(src_file_path) and not filename.endswith(".safetensors"):
+            dst_file_path = os.path.join(output_dir, filename)
+            shutil.copy2(src_file_path, dst_file_path)
+    print(f"All non-safetensors files have been copied to {output_dir}")
+
+
+def export_nextn_layer_parameters(input_dir, output_dir, nextn_layer_id):
+    prefix = f"model.layers.{nextn_layer_id}"
+    output_path = os.path.join(output_dir, "nextn_layer_parameters.safetensors")
+    params = {}
+    for filename in os.listdir(input_dir):
+        if not filename.endswith(".safetensors"):
+            continue
+
+        file_path = os.path.join(input_dir, filename)
+        print(f"Processing: {filename}")
+
+        try:
+            with safe_open(file_path, framework="pt") as f:
+                matching_keys = [k for k in f.keys() if k.startswith(prefix)]
+
+                if not matching_keys:
+                    print(f"  No parameters starting with '{prefix}' found")
+                    continue
+
+                for key in matching_keys:
+                    if "embed_tokens" in key or "shared_head.head" in key:
+                        continue
+                    new_key = key.replace(prefix, "model.layers.0")
+                    params[new_key] = f.get_tensor(key)
+
+        except Exception as e:
+            print(f"  Error processing {filename}: {str(e)}")
+
+    if params:
+        print(f"Saving {len(params)} parameters to {output_path}")
+        save_file(params, output_path)
+    else:
+        print("No matching parameters found.")
+
+    # Update safetensors index
+    index_path = os.path.join(output_dir, "model.safetensors.index.json")
+    print(f"Updating safetensors index to {index_path}")
+    index_data = {"weight_map": {}}
+    for key in params:
+        index_data["weight_map"][key] = "nextn_layer_parameters.safetensors"
+    with open(index_path, "w") as f:
+        json.dump(index_data, f, indent=4)
+
+    print("All done.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Export NextN layer parameters for DeepSeek-V3/R1"
+    )
+    parser.add_argument(
+        "--input-dir",
+        type=str,
+        required=True,
+        help="Input HF model directory.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=True,
+        help="Output nextn model directory.",
+    )
+    args = parser.parse_args()
+
+    config = AutoConfig.from_pretrained(args.input_dir, trust_remote_code=True)
+    assert config.num_nextn_predict_layers == 1, "Only 1 nextn layer is supported."
+    nextn_layer_id = get_nextn_layer_id(config)
+    os.makedirs(args.output_dir, exist_ok=True)
+    copy_non_safetensors_files(args.input_dir, args.output_dir)
+    update_and_save_config(config, args.output_dir)
+    export_nextn_layer_parameters(args.input_dir, args.output_dir, nextn_layer_id)
diff --git a/scripts/killall_sglang.sh b/scripts/killall_sglang.sh
new file mode 100755
index 000000000..7d0fe8bca
--- /dev/null
+++ b/scripts/killall_sglang.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+if [ "$1" = "rocm" ]; then
+    echo "Running in ROCm mode"
+
+    # Clean SGLang processes
+    pgrep -f 'sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt' | xargs -r kill -9
+
+else
+    # Show current GPU status
+    nvidia-smi
+
+    # Clean SGLang processes
+    pgrep -f 'sglang::|sglang\.launch_server|sglang\.bench|sglang\.data_parallel|sglang\.srt' | xargs -r kill -9
+
+    # Clean all GPU processes if any argument is provided
+    if [ $# -gt 0 ]; then
+        # Check if sudo is available
+        if command -v sudo >/dev/null 2>&1; then
+            sudo apt-get update
+            sudo apt-get install -y lsof
+        else
+            apt-get update
+            apt-get install -y lsof
+        fi
+        kill -9 $(nvidia-smi | sed -n '/Processes:/,$p' | grep "   [0-9]" | awk '{print $5}') 2>/dev/null
+        lsof /dev/nvidia* | awk '{print $2}' | xargs kill -9 2>/dev/null
+    fi
+
+    # Show GPU status after clean up
+    nvidia-smi
+fi
diff --git a/scripts/playground/bench_speculative.py b/scripts/playground/bench_speculative.py
new file mode 100644
index 000000000..c89e99242
--- /dev/null
+++ b/scripts/playground/bench_speculative.py
@@ -0,0 +1,308 @@
+"""
+Usage:
+# single GPU
+python3 bench_speculative.py --model-path meta-llama/Llama-2-7b-chat-hf --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B
+
+# multiple GPU
+python3 bench_speculative.py --model-path deepseek-ai/DeepSeek-V3 --speculative-draft-model-path lmsys/DeepSeek-V3-NextN --tp-size 8 --trust-remote-code --batch-size 1 4 8 16 32 --steps 0 1 2 --topk 0 1 2 4 --num_draft_tokens 0 2 4 8
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import time
+from types import SimpleNamespace
+
+import numpy as np
+import requests
+from transformers import AutoTokenizer
+
+from sglang.bench_serving import (
+    DatasetRow,
+    benchmark,
+    sample_mmmu_requests,
+    set_global_args,
+)
+from sglang.srt.server_args import ServerArgs
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    kill_process_tree,
+    popen_launch_server,
+)
+
+
+def node0_print(msg):
+    if server_args.node_rank == 0:
+        print(msg)
+
+
+prompts = [
+    "Human: Give me a fully functional FastAPI server. Show the full, long python code without stop.\n\nAssistant:",
+    "Human: Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.\n\nAssistant:",
+    "Human: Write a travel blog post to Hawaii.\n\nAssistant:",
+    "Human: I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. My first sentence is 'istanbulu cok seviyom burada olmak cok guzel'. Answer in more than 5000 words.\n\nAssistant:",
+    "Human: I want you to act as a storyteller. You will come up with entertaining stories that are engaging, imaginative and captivating for the audience. It can be fairy tales, educational stories or any other type of stories which has the potential to capture people's attention and imagination. Depending on the target audience, you may choose specific themes or topics for your storytelling session e.g., if it’s children then you can talk about animals; If it’s adults then history-based tales might engage them better etc. Answer in more than 5000 words. My first request is 'I need an interesting story on perseverance.'\n\nAssistant:",
+    "Human: Solve x^2 = -1. Think step-by-step. Give me a long detailed explanation. \n\nAssistant:",
+    "Human: Tell me about the president of the USA in wikipedia style.\n\nAssistant:",
+    "Human: Hello? Who are you? Write code, math, and poem to explanin yourself.\n\nAssistant:",
+]
+
+
+class FakeTokenizer:
+    def encode(self, text: str, add_special_tokens: bool = False):
+        return []
+
+
+def send_one_batch(base_url, num_prompts, batch_size, tokenizer, is_multimodal):
+    # format: (prompt, input_len, output len). We set input_len as a dummy value 0.
+    if is_multimodal:
+        input_requests = sample_mmmu_requests(
+            num_prompts,
+            tokenizer,
+            512,
+            apply_chat_template=False,
+        )
+        backend = "sglang-oai-chat"
+        api_url = f"{base_url}/v1/chat/completions"
+    else:
+        padded_prompts = (prompts * ((num_prompts + len(prompts) - 1) // len(prompts)))[
+            :num_prompts
+        ]
+        input_requests: List[DatasetRow] = [
+            DatasetRow(p, 0, 512) for p in padded_prompts
+        ]
+        backend = "sglang"
+        api_url = f"{base_url}/generate"
+
+    # We need to set some dummy values in order to call `benchmark` below.
+    args = SimpleNamespace(
+        disable_ignore_eos=False,
+        disable_stream=False,
+        return_logprob=False,
+        backend=backend,
+        dataset_name="custom",
+        num_prompts=None,
+        sharegpt_output_len=None,
+        random_input_len=None,
+        random_output_len=None,
+        random_range_ratio=None,
+        output_file=None,
+        warmup_requests=1,
+        output_details=False,
+    )
+    set_global_args(args)
+
+    # Run benchmark
+    results = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id="default",
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=float("inf"),
+            max_concurrency=batch_size,
+            disable_tqdm=False,
+            lora_names=None,
+            extra_request_body={},
+            profile=None,
+        )
+    )
+
+    assert results["completed"] == len(input_requests)
+    acc_length = results["accept_length"] or 1.0
+    avg_output_token = results["total_output_tokens"] / results["completed"]
+
+    server_info = requests.get(base_url + "/get_server_info").json()
+    # We use 20% percentile instead of median on purpose
+    step_time = np.percentile(
+        server_info["internal_states"][0]["step_time_dict"][str(batch_size)], 20
+    )
+    speed = 1 / step_time * acc_length
+
+    return (
+        round(acc_length, 3),
+        round(step_time, 5),
+        round(speed, 3),
+        avg_output_token,
+    )
+
+
+def main(args, server_args):
+    base_url = "http://127.0.0.1:20000"
+
+    configs = []
+    for batch_size in args.batch_size:
+        for steps in args.steps:
+            for topk in args.topk:
+                for num_draft_tokens in args.num_draft_tokens:
+                    if steps * topk + 1 < num_draft_tokens:
+                        continue
+
+                    if (steps == 0 or topk == 0 or num_draft_tokens == 0) and (
+                        steps + topk + num_draft_tokens != 0
+                    ):
+                        # steps == 0 and topk == 0 and num_draft_tokens == 0 is a special case for non-speculative decoding.
+                        continue
+
+                    configs.append((batch_size, steps, topk, num_draft_tokens))
+
+    for i in range(args.start, args.end or len(configs)):
+        batch_size, steps, topk, num_draft_tokens = configs[i]
+
+        node0_print(
+            f"Start {i=}: {batch_size=}, {steps=}, {topk=}, {num_draft_tokens=}"
+        )
+
+        # Create an LLM.
+        if steps == 0:
+            other_args = []
+        else:
+            other_args = [
+                "--speculative-num-steps",
+                steps,
+                "--speculative-eagle-topk",
+                topk,
+                "--speculative-num-draft-tokens",
+                num_draft_tokens,
+            ]
+            if server_args.speculative_draft_model_path is not None:
+                other_args.extend(
+                    [
+                        "--speculative-draft-model-path",
+                        server_args.speculative_draft_model_path,
+                        "--speculative-algorithm",
+                        server_args.speculative_algorithm,
+                    ]
+                )
+
+        other_args.extend(
+            [
+                "--cuda-graph-max-bs",
+                batch_size,
+                "--mem-fraction-static",
+                server_args.mem_fraction_static,
+                "--tp-size",
+                server_args.tp_size,
+                "--max-running-requests",
+                batch_size,
+            ]
+        )
+
+        if server_args.trust_remote_code:
+            other_args.extend(
+                [
+                    "--trust-remote-code",
+                ]
+            )
+
+        if server_args.attention_backend:
+            other_args.extend(
+                [
+                    "--attention-backend",
+                    server_args.attention_backend,
+                ]
+            )
+
+        if server_args.quantization:
+            other_args.extend(
+                [
+                    "--quantization",
+                    server_args.quantization,
+                ]
+            )
+
+        process = popen_launch_server(
+            args.model_path,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+            env={
+                "SGLANG_RECORD_STEP_TIME": "1",
+                **os.environ,
+            },
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.model_path, trust_remote_code=server_args.trust_remote_code
+        )
+
+        try:
+            # Warmup
+            send_one_batch(
+                base_url, batch_size, batch_size, tokenizer, args.is_multimodal
+            )
+
+            # Benchmark
+            acc_length, step_time, speed, completion_tokens = send_one_batch(
+                base_url,
+                max(args.num_prompts, batch_size),
+                batch_size,
+                tokenizer,
+                args.is_multimodal,
+            )
+        finally:
+            kill_process_tree(process.pid)
+
+        node0_print(
+            f"Finish {i=}: {batch_size=}, {steps=}, {topk=}, {num_draft_tokens=}, {speed=:.2f} token/s, step_time={step_time * 1000:.2f} ms"
+        )
+
+        record = {
+            "batch_size": batch_size,
+            "steps": steps,
+            "topk": topk,
+            "num_draft_tokens": num_draft_tokens,
+            "acc_length": acc_length,
+            "step_time": step_time,
+            "speed": speed,
+            "completion_tokens": completion_tokens,
+        }
+
+        with open(args.output, "a") as fout:
+            fout.write(json.dumps(record) + "\n")
+
+        # Wait for the server to shutdown
+        time.sleep(5)
+
+
+# The __main__ condition is necessary here because we use "spawn" to create subprocesses
+# Spawn starts a fresh program every time, if there is no __main__, it will run into infinite loop to keep spawning processes from sgl.Engine
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        nargs="+",
+        default=(1, 2, 4, 8, 16),
+    )
+    parser.add_argument(
+        "--steps",
+        type=int,
+        nargs="+",
+        default=(0, 1, 3, 5, 7),  # use (0, 1, 2, 3, 4) for large batch size
+    )
+    parser.add_argument(
+        "--topk",
+        type=int,
+        nargs="+",
+        default=(0, 1, 2, 4, 8),
+    )
+    parser.add_argument(
+        "--num_draft_tokens",
+        type=int,
+        nargs="+",
+        default=(0, 2, 4, 8, 16, 32),  # use (0, 2, 4, 8) for large batch size
+    )
+    parser.add_argument("--num-prompts", type=int, default=16)
+    parser.add_argument("--start", type=int, default=0)
+    parser.add_argument("--end", type=int)
+    parser.add_argument("--output", type=str, default="output.jsonl")
+    parser.add_argument("--is-multimodal", action="store_true", default=False)
+    args = parser.parse_args()
+    server_args: ServerArgs = ServerArgs.from_cli_args(args)
+
+    main(args, server_args)
diff --git a/scripts/playground/disaggregation/cli-logprob.py b/scripts/playground/disaggregation/cli-logprob.py
new file mode 100644
index 000000000..4c69a055b
--- /dev/null
+++ b/scripts/playground/disaggregation/cli-logprob.py
@@ -0,0 +1,22 @@
+prompt = "The capital of france is "
+
+import json
+
+import requests
+
+response = requests.post(
+    "http://0.0.0.0:8000/generate",
+    json={
+        "text": prompt,
+        "sampling_params": {"temperature": 0},
+        "return_logprob": True,
+        "return_input_logprob": True,
+        "logprob_start_len": 0,
+    },
+)
+
+j = response.json()
+input_logprobs = j["meta_info"]["input_token_logprobs"]
+output_logprobs = j["meta_info"]["output_token_logprobs"]
+
+print(len(input_logprobs), len(output_logprobs))
diff --git a/scripts/playground/disaggregation/cli-so.py b/scripts/playground/disaggregation/cli-so.py
new file mode 100644
index 000000000..7ccafc7ed
--- /dev/null
+++ b/scripts/playground/disaggregation/cli-so.py
@@ -0,0 +1,34 @@
+import json
+
+import requests
+
+port = 8000
+
+json_schema = json.dumps(
+    {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string", "pattern": "^[\\w]+$"},
+            "population": {"type": "integer"},
+        },
+        "required": ["name", "population"],
+    }
+)
+
+# JSON
+response = requests.post(
+    f"http://localhost:{port}/generate",
+    json={
+        "text": "Here is the information of the capital of France in the JSON format.\n",
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 64,
+            "json_schema": json_schema,
+        },
+    },
+)
+
+print(response.json())
+
+
+# python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --trust-remote-code --disaggregation-mode prefill --tp 2 --disaggregation-ib-device mlx5_roce0,mlx5_roce1 --speculative-algorithm EAGLE --speculative-draft-model-path lmsys/sglang-EAGLE-llama2-chat-7B --speculative-num-steps 3 --speculative-eagle-topk 4 --speculative-num-draft-tokens 16 --cuda-graph-max-bs 8 --host 127.0.0.1 --port 8100
diff --git a/scripts/playground/disaggregation/cli.py b/scripts/playground/disaggregation/cli.py
new file mode 100644
index 000000000..721a6dd5e
--- /dev/null
+++ b/scripts/playground/disaggregation/cli.py
@@ -0,0 +1,29 @@
+import json
+
+import requests
+
+prompt = """
+According to CNBC's Faber, the investors present on the call interpreted this statement as an indication of an upcoming funding round. While speculative, Faber believes the funding round could be as large as $25 billion, and bestow a valuation of between $150 billion and $200 billion on xAI.
+
+For the benefit of those who might not be aware, xAI recently acquired the social media platform X in an all-stock deal that valued the former at $80 billion and the latter at $33 billion, inclusive of $12 billion in liabilities. This meant that the deal bestowed a gross valuation of $45 billion on X before factoring in its debt load of $12 billion.
+
+Bear in mind that Elon Musk took X (then called Twitter) private back in 2022 in a $44 billion deal. Since then, Musk has managed to stem X's cash bleed, with the company reportedly generating $1.2 billion in adjusted EBITDA in 2024.
+
+According to the investors present on the call, xAI is currently generating around $1 billion in annual revenue. This contrasts sharply with the erstwhile muted expectations of many investors, who did not expect the startup to generate any material revenue this year.
+
+Elsewhere, Faber also alludes to the fact that xAI is already working on its next big training supercluster, officially dubbed the Colossus 2, which is expected to eventually house as many as 1 million NVIDIA GPUs at a cost of between $35 billion and $40 billion.
+
+
+Even though xAI's Grok LLM is already largely comparable with OpenAI's cutting-edge models, the Colossus 2 would significantly up the ante, and could feasibly challenge OpenAI's apex position in the AI sphere.
+
+Give your honest take on the above text:
+"""
+
+response = requests.post(
+    "http://0.0.0.0:8000/generate",
+    json={"text": prompt, "sampling_params": {"temperature": 0}},
+)
+
+
+response_json = response.json()
+print(response_json["text"])
diff --git a/scripts/playground/frontend_reasoning.ipynb b/scripts/playground/frontend_reasoning.ipynb
new file mode 100644
index 000000000..fcdce25ab
--- /dev/null
+++ b/scripts/playground/frontend_reasoning.ipynb
@@ -0,0 +1,241 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Launch A Server\n",
+    "\n",
+    "Launch the server with a reasoning model (Qwen 3.5-4B) and reasoning parser."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sglang import separate_reasoning, assistant_begin, assistant_end\n",
+    "from sglang import assistant, function, gen, system, user\n",
+    "from sglang import image\n",
+    "from sglang import RuntimeEndpoint, set_default_backend\n",
+    "from sglang.srt.utils import load_image\n",
+    "from sglang.test.test_utils import is_in_ci\n",
+    "from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
+    "\n",
+    "\n",
+    "if is_in_ci():\n",
+    "    from patch import launch_server_cmd\n",
+    "else:\n",
+    "    from sglang.utils import launch_server_cmd\n",
+    "\n",
+    "\n",
+    "server_process, port = launch_server_cmd(\n",
+    "    \"python3 -m sglang.launch_server --model-path Qwen/Qwen3-4B --reasoning-parser qwen3 --host 0.0.0.0\"\n",
+    ")\n",
+    "\n",
+    "wait_for_server(f\"http://localhost:{port}\")\n",
+    "print(f\"Server started on http://localhost:{port}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the default backend. Note: you can set chat_template_name in RontimeEndpoint. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "set_default_backend(\n",
+    "    RuntimeEndpoint(f\"http://localhost:{port}\", chat_template_name=\"qwen\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's start with a basic question-answering task. And see how the reasoning content is generated."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def basic_qa(s, question):\n",
+    "    s += system(f\"You are a helpful assistant than can answer questions.\")\n",
+    "    s += user(question)\n",
+    "    s += assistant_begin()\n",
+    "    s += gen(\"answer\", max_tokens=512)\n",
+    "    s += assistant_end()\n",
+    "\n",
+    "\n",
+    "state = basic_qa(\"List 3 countries and their capitals.\")\n",
+    "print_highlight(state[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With `separate_reasoning`, you can move the reasoning content to `{param_name}_reasoning_content` in the state."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def basic_qa_separate_reasoning(s, question):\n",
+    "    s += system(f\"You are a helpful assistant than can answer questions.\")\n",
+    "    s += user(question)\n",
+    "    s += assistant_begin()\n",
+    "    s += separate_reasoning(gen(\"answer\", max_tokens=512), model_type=\"qwen3\")\n",
+    "    s += assistant_end()\n",
+    "\n",
+    "\n",
+    "reasoning_state = basic_qa_separate_reasoning(\"List 3 countries and their capitals.\")\n",
+    "print_highlight(reasoning_state.stream_executor.variable_event.keys())\n",
+    "print_highlight(\n",
+    "    f\"\\nSeparated Reasoning Content:\\n{reasoning_state['answer_reasoning_content']}\"\n",
+    ")\n",
+    "\n",
+    "print_highlight(f\"\\n\\nContent:\\n{reasoning_state['answer']}\")\n",
+    "print_highlight(f\"\\n\\nMessages:\\n{reasoning_state.messages()[-1]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`separate_reasoning` can also be used in multi-turn conversations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def multi_turn_qa(s):\n",
+    "    s += system(f\"You are a helpful assistant than can answer questions.\")\n",
+    "    s += user(\"Please give me a list of 3 countries and their capitals.\")\n",
+    "    s += assistant(\n",
+    "        separate_reasoning(gen(\"first_answer\", max_tokens=512), model_type=\"qwen3\")\n",
+    "    )\n",
+    "    s += user(\"Please give me another list of 3 countries and their capitals.\")\n",
+    "    s += assistant(\n",
+    "        separate_reasoning(gen(\"second_answer\", max_tokens=512), model_type=\"qwen3\")\n",
+    "    )\n",
+    "    return s\n",
+    "\n",
+    "\n",
+    "reasoning_state = multi_turn_qa()\n",
+    "print_highlight(f\"\\n\\nfirst_answer:\\n{reasoning_state['first_answer']}\")\n",
+    "print_highlight(\n",
+    "    f\"\\n\\nfirst_answer_reasoning_content:\\n{reasoning_state['first_answer_reasoning_content']}\"\n",
+    ")\n",
+    "print_highlight(f\"\\n\\nsecond_answer:\\n{reasoning_state['second_answer']}\")\n",
+    "print_highlight(\n",
+    "    f\"\\n\\nsecond_answer_reasoning_content:\\n{reasoning_state['second_answer_reasoning_content']}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using No thinking as Qwen 3's advanced feature \n",
+    "\n",
+    "sglang separate_reasoning is particularly useful when combined with Qwen 3's advanced feature.\n",
+    "\n",
+    "[Qwen 3's advanced usages](https://qwenlm.github.io/blog/qwen3/#advanced-usages)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reasoning_state = basic_qa_separate_reasoning(\n",
+    "    \"List 3 countries and their capitals. /no_think\"\n",
+    ")\n",
+    "print_highlight(f\"Reasoning Content:\\n{reasoning_state['answer_reasoning_content']}\")\n",
+    "print_highlight(f\"Content:\\n{reasoning_state['answer']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`separate_reasoning` can also be used in regular expression generation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@function\n",
+    "def regular_expression_gen(s):\n",
+    "    s += user(\n",
+    "        \"What is the IP address of the Google DNS servers? just provide the answer\"\n",
+    "    )\n",
+    "    s += assistant(\n",
+    "        separate_reasoning(\n",
+    "            gen(\n",
+    "                \"answer\",\n",
+    "                temperature=0,\n",
+    "                regex=r\"((25[0-5]|2[0-4]\\d|[01]?\\d\\d?).){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\",\n",
+    "                max_tokens=512,\n",
+    "            ),\n",
+    "            model_type=\"qwen3\",\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "reasoning_state = regular_expression_gen()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_highlight(f\"Answer:\\n{reasoning_state['answer']}\")\n",
+    "print_highlight(\n",
+    "    f\"\\n\\nReasoning Content:\\n{reasoning_state['answer_reasoning_content']}\"\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/scripts/playground/launch_tgi.sh b/scripts/playground/launch_tgi.sh
new file mode 100644
index 000000000..a32405cdd
--- /dev/null
+++ b/scripts/playground/launch_tgi.sh
@@ -0,0 +1,7 @@
+# Assuming the model is downdloaded at /home/ubuntu/model_weights/Llama-2-7b-chat-hf
+docker run --name tgi --rm -ti --gpus all --network host \
+  -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
+  ghcr.io/huggingface/text-generation-inference:1.1.0 \
+  --model-id /Llama-2-7b-chat-hf --num-shard 1  --trust-remote-code \
+  --max-input-length 2048 --max-total-tokens 4096 \
+  --port 24000
diff --git a/scripts/playground/load_tokenizer.py b/scripts/playground/load_tokenizer.py
new file mode 100644
index 000000000..94cf34bc7
--- /dev/null
+++ b/scripts/playground/load_tokenizer.py
@@ -0,0 +1,14 @@
+import argparse
+import code
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--name", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct"
+    )
+    args = parser.parse_args()
+
+    t = get_tokenizer(args.name)
+    code.interact(local=locals())
diff --git a/scripts/playground/long_context_example.py b/scripts/playground/long_context_example.py
new file mode 100644
index 000000000..c5e035d29
--- /dev/null
+++ b/scripts/playground/long_context_example.py
@@ -0,0 +1,36 @@
+from urllib.request import urlopen
+
+from openai import OpenAI
+
+test_cases = {
+    "64k": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt",
+    "200k": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt",
+    "600k": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt",
+    "1m": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt",
+}
+
+client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")
+
+for name, url in test_cases.items():
+    print(f"\n==== Running test case: {name} ====")
+    try:
+        with urlopen(url, timeout=10) as response:
+            prompt = response.read().decode("utf-8")
+    except Exception as e:
+        print(f"Failed to load prompt for {name}: {e}")
+        continue
+
+    try:
+        response = client.chat.completions.create(
+            model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            messages=[{"role": "user", "content": prompt}],
+            stream=True,
+            max_tokens=128,
+            temperature=0,
+        )
+
+        for chunk in response:
+            if chunk.choices and chunk.choices[0].delta.content is not None:
+                print(chunk.choices[0].delta.content, end="", flush=True)
+    except Exception as e:
+        print(f"\nError during completion for {name}: {e}")
diff --git a/scripts/playground/lora/analyzer.py b/scripts/playground/lora/analyzer.py
new file mode 100644
index 000000000..15568fc18
--- /dev/null
+++ b/scripts/playground/lora/analyzer.py
@@ -0,0 +1,77 @@
+import glob
+import json
+import os
+import re
+import sys
+
+from tqdm import tqdm
+
+sys.path.append("../../")
+from fix_corrupted_json import clean_json_file
+
+dirpath = "/Users/ying"
+output_file_prefix = "analyzed_log"
+
+time = {}
+tot_time = {}
+size = {}
+
+os.system(f"rm {output_file_prefix}*")
+
+for dirname in glob.glob(os.path.join(dirpath, "trace*")):
+    print(dirname)
+    trace_name = dirname.split("/")[-1]
+    time[trace_name] = {}
+    size[trace_name] = {}
+    total_time = 0
+    for filename in tqdm(glob.glob(os.path.join(dirname, "*.json"))):
+        step_name = filename.split("/")[-1].split(".")[0]
+        step_name = "_".join(step_name.split("_")[1:])
+        if "prefill" not in filename and "decode" not in filename:
+            continue
+
+        match = re.search(r"(prefill|decode)_step_(\d+)\.json", filename)
+        if match:
+            phase = match.group(1)
+            step = match.group(2)
+        else:
+            raise Exception(f"Cannot parse {filename}")
+
+        try:
+            with open(filename, "r") as f:
+                trace = json.load(f)
+        except:
+            clean_json_file(filename, filename)
+            with open(filename, "r") as f:
+                trace = json.load(f)
+
+        for event in trace["traceEvents"]:
+            name = event["name"]
+            if name in ["profile_prefill_step", "profile_decode_step"]:
+                dur = event["dur"] / 1e3
+                time[trace_name][step_name] = dur
+                break
+        total_time += dur
+
+        step = int(step_name.split("_")[-1])
+        with open(os.path.join(dirname, f"size_{step}.json"), "r") as f:
+            size_info = json.load(f)
+        size[trace_name][step_name] = size_info["size"]
+
+    tot_time[trace_name] = total_time
+    time[trace_name] = dict(
+        sorted(time[trace_name].items(), key=lambda x: int(x[0].split("_")[-1]))
+    )
+    size[trace_name] = dict(
+        sorted(size[trace_name].items(), key=lambda x: int(x[0].split("_")[-1]))
+    )
+
+    with open(f"{output_file_prefix}_{trace_name}", "a") as f:
+        for k, v in time[trace_name].items():
+            size_v = size[trace_name][k]
+            print(f"{k:>15}{v:10.2f}\t{size_v}")
+            f.write(f"{k:>15}{v:10.2f}\t{size_v}\n")
+
+with open(f"{output_file_prefix}_total_time", "w") as f:
+    print(tot_time)
+    json.dump(tot_time, f)
diff --git a/scripts/playground/lora/lora_hf_play.py b/scripts/playground/lora/lora_hf_play.py
new file mode 100644
index 000000000..0abddd2c1
--- /dev/null
+++ b/scripts/playground/lora/lora_hf_play.py
@@ -0,0 +1,62 @@
+import torch
+from peft import PeftModel
+from transformers import LlamaForCausalLM, LlamaTokenizer
+
+MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
+# ADAPTER = "winddude/wizardLM-LlaMA-LoRA-7B"
+ADAPTER = "/home/ying/test_lora"
+HF_TOKEN = "..."
+
+
+prompt = """
+### Instruction:
+Write a poem about the transformers Python library.
+Mention the word "large language models" in that poem.
+### Response:
+The Transformers are large language models,
+They're used to make predictions on text.
+"""
+
+
+tokenizer = LlamaTokenizer.from_pretrained(MODEL)
+
+base_model = LlamaForCausalLM.from_pretrained(
+    MODEL,
+    device_map="auto",
+    # load_in_8bit=True,
+    torch_dtype=torch.float16,
+    # use_auth_token=HF_TOKEN,
+).cuda()
+
+
+# base model generate
+with torch.no_grad():
+    output_tensors = base_model.generate(
+        input_ids=tokenizer(prompt, return_tensors="pt").input_ids.cuda(),
+        max_new_tokens=32,
+        do_sample=False,
+    )[0]
+
+output = tokenizer.decode(output_tensors, skip_special_tokens=True)
+print("======= base output ========")
+print(output)
+
+
+# peft model generate
+model = PeftModel.from_pretrained(
+    base_model,
+    ADAPTER,
+    torch_dtype=torch.float16,
+    is_trainable=False,
+)
+
+with torch.no_grad():
+    output_tensors = model.generate(
+        input_ids=tokenizer(prompt, return_tensors="pt").input_ids.cuda(),
+        max_new_tokens=32,
+        do_sample=False,
+    )[0]
+
+output = tokenizer.decode(output_tensors, skip_special_tokens=True)
+print("======= peft output ========")
+print(output)
diff --git a/scripts/playground/lora/lora_vllm_play.py b/scripts/playground/lora/lora_vllm_play.py
new file mode 100644
index 000000000..4f77d8bea
--- /dev/null
+++ b/scripts/playground/lora/lora_vllm_play.py
@@ -0,0 +1,30 @@
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+
+MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
+ADAPTER = "/home/ying/test_lora"
+prompt = """
+### Instruction:
+Write a poem about the transformers Python library.
+Mention the word "large language models" in that poem.
+### Response:
+The Transformers are large language models,
+They're used to make predictions on text.
+"""
+
+
+llm = LLM(model=MODEL, enable_lora=True)
+
+sampling_params = SamplingParams(
+    temperature=0,
+    max_tokens=32,
+)
+
+prompts = [prompt]
+
+outputs = llm.generate(
+    prompts, sampling_params, lora_request=LoRARequest("test_lora", 1, ADAPTER)
+)
+
+print(outputs[0].prompt)
+print(outputs[0].outputs[0].text)
diff --git a/scripts/playground/reference_hf.py b/scripts/playground/reference_hf.py
new file mode 100644
index 000000000..14d23fb76
--- /dev/null
+++ b/scripts/playground/reference_hf.py
@@ -0,0 +1,197 @@
+"""
+Usage: python3 scripts/playground/reference_hf.py --model-path MODEL_PATH --model-type {text,vlm} [--max-new-tokens NUM] [--dtype DTYPE]
+  --model-path MODEL_PATH: Path to model (default: TinyLlama/TinyLlama-1.1B-Chat-v0.4)
+  --model-type {text,vlm}: Model type, text or vlm (default: text)
+  --max-new-tokens NUM: Max new tokens to generate (default: 16)
+  --dtype DTYPE: Data type for computation (default: float16)
+Note: '--model' is deprecated; use '--model-path'. Runs normal_text() for text, vlm_text_with_image() for vlm.
+
+Reference output:
+========== Prompt 0 ==========
+prefill logits (final) tensor([-8.3125, -7.1172,  3.3398,  ..., -4.9531, -4.1328, -3.4141],
+       device='cuda:0')
+<s> The capital of France is Paris.
+The capital of the United States is Washington, D.C.
+
+========== Prompt 1 ==========
+prefill logits (final) tensor([-8.9062, -9.0156,  4.1484,  ..., -4.9922, -4.4961, -4.0742],
+       device='cuda:0')
+<s> The capital of the United Kindom is London.
+The capital of the United Kingdom is London.
+The capital of
+
+========== Prompt 2 ==========
+prefill logits (final) tensor([-9.6328, -9.0547,  4.0234,  ..., -5.3047, -4.7148, -4.4609],
+       device='cuda:0')
+<s> Today is a sunny day and I like to go for a walk in the park.
+I'm going to the
+"""
+
+import argparse
+
+import requests
+import torch
+from PIL import Image
+from transformers import (
+    AutoModelForCausalLM,
+    AutoModelForImageTextToText,
+    AutoProcessor,
+)
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+
+
+@torch.no_grad()
+def vlm_text_with_image(args):
+    # Load the processor and model for ImageTextToText tasks
+    processor = AutoProcessor.from_pretrained(args.model_path, trust_remote_code=True)
+    model = AutoModelForImageTextToText.from_pretrained(
+        args.model_path,
+        torch_dtype=args.dtype,
+        low_cpu_mem_usage=True,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+
+    torch.cuda.set_device(0)
+
+    # List of image URLs to process
+    image_urls = [
+        "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
+    ]
+
+    # Conversation template for the processor
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                },
+                {"type": "text", "text": "Describe this image."},
+            ],
+        }
+    ]
+
+    max_new_tokens = args.max_new_tokens
+
+    for i, url in enumerate(image_urls):
+        # Load the image from the URL
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        # Apply the chat template to the text prompt
+        # Notice that not all processors support chat templates.
+        # LLaVA and QWen are two processors that support chat templates.
+        if not hasattr(processor, "apply_chat_template"):
+            raise ValueError("The processor does not support chat templates.")
+        text_prompt = processor.apply_chat_template(
+            conversation, add_generation_prompt=True
+        )
+
+        # Prepare inputs for the model
+        inputs = processor(text=[text_prompt], images=[image], return_tensors="pt").to(
+            "cuda:0"
+        )
+
+        # Generate output from the model
+        output_ids = model.generate(
+            **inputs, do_sample=False, max_new_tokens=max_new_tokens
+        )
+        output_str = processor.decode(output_ids[0])
+
+        # Get the logits from the model's forward pass
+        outputs = model.forward(**inputs)
+        logits = outputs.logits[0, -1, :]
+
+        print(f"\n========== Image {i} ==========")
+        print("prefill logits (final)", logits)
+        # TODO(gaocegege): The output contains numerous <|image_pad|> tokens,
+        # making it cluttered and difficult to read.
+        # These tokens should be removed or cleaned up for better readability.
+        print(output_str)
+
+
+@torch.no_grad()
+def normal_text(args):
+    t = get_tokenizer(args.model_path, trust_remote_code=True)
+    m = AutoModelForCausalLM.from_pretrained(
+        args.model_path,
+        torch_dtype=args.dtype,
+        low_cpu_mem_usage=True,
+        device_map="auto",
+        trust_remote_code=True,
+    )
+
+    prompts = [
+        "The capital of France is",
+        "The capital of the United Kindom is",
+        "Today is a sunny day and I like",
+    ]
+    max_new_tokens = args.max_new_tokens
+
+    torch.cuda.set_device(0)
+
+    for i, p in enumerate(prompts):
+        if isinstance(p, str):
+            input_ids = t.encode(p, return_tensors="pt").to("cuda:0")
+        else:
+            input_ids = torch.tensor([p], device="cuda:0")
+
+        output_ids = m.generate(
+            input_ids, do_sample=False, max_new_tokens=max_new_tokens
+        )
+        output_str = t.decode(output_ids[0])
+
+        prefill_logits = m.forward(input_ids).logits[0][-1]
+
+        print(f"\n========== Prompt {i} ==========")
+        print("prefill logits (final)", prefill_logits)
+        print(output_str)
+
+
+@torch.no_grad()
+def synthetic_tokens(args):
+    m = AutoModelForCausalLM.from_pretrained(
+        args.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    m.cuda()
+    print(m)
+
+    input_len = 256
+    output_len = 8
+    prompts = [list(range(5, 5 + input_len))]
+
+    for p in prompts:
+        input_ids = p
+        for i in range(output_len + 1):
+            prefill_logits = m.forward(torch.tensor([input_ids], device="cuda")).logits[
+                0
+            ][-1]
+
+            if i == 0:
+                print("prefill logits", prefill_logits)
+            else:
+                print("decode", i - 1, prefill_logits)
+
+            input_ids.append(torch.argmax(prefill_logits).item())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="TinyLlama/TinyLlama-1.1B-Chat-v0.4",
+    )
+    parser.add_argument("--max-new-tokens", type=int, default=16)
+
+    parser.add_argument("--dtype", type=str, default="float16")
+
+    parser.add_argument("--model-type", type=str, default="text")
+
+    args = parser.parse_args()
+
+    if args.model_type == "vlm":
+        vlm_text_with_image(args)
+    else:
+        normal_text(args)
diff --git a/scripts/playground/replay_request_dump.py b/scripts/playground/replay_request_dump.py
new file mode 100644
index 000000000..301cf948e
--- /dev/null
+++ b/scripts/playground/replay_request_dump.py
@@ -0,0 +1,151 @@
+"""
+Usage:
+# replay from a folder
+python3 replay_request_dump.py --file-number 100 --parallel 512 --input-folder /data/lianmin/sglang_request_dump/grok-mini-0220-engine-5756f8f94-28bm6/
+
+# replay from a single file
+python3 replay_request_dump.py --parallel 512 --input-file /data/sglang_crash_dump/memx-cti-34-sr1.xpop.twttr.net/crash_dump_2025-06-04_20-13-18.pkl
+"""
+
+import argparse
+import glob
+import json
+import pickle
+import time
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict
+from datetime import datetime
+
+import requests
+
+from sglang.bench_serving import set_ulimit
+from sglang.utils import get_exception_traceback
+
+
+def read_records(files):
+    records = []
+    for f in files:
+        tmp = pickle.load(open(f, "rb"))
+        if isinstance(tmp, dict) and "requests" in tmp:
+            records.extend(tmp["requests"])
+        else:
+            records.extend(tmp)
+
+    return records
+
+
+def run_one_request_internal(record):
+    (req, output, replay_init_time, start_time, end_time, idx) = record
+    time.sleep(max(0, (start_time - (time.time() - replay_init_time)) / args.speed))
+
+    if "completion_tokens" in output.get("meta_info", {}):
+        recorded_completion_tokens = output["meta_info"]["completion_tokens"]
+    else:
+        recorded_completion_tokens = ""
+
+    json_data = asdict(req)
+    stream = json_data["stream"]
+
+    if args.ignore_eos:
+        json_data["sampling_params"]["ignore_eos"] = True
+        if recorded_completion_tokens:
+            json_data["sampling_params"]["max_new_tokens"] = recorded_completion_tokens
+
+    response = requests.post(
+        f"http://{args.host}:{args.port}/generate",
+        json=json_data,
+        stream=stream,
+    )
+
+    if stream:
+        for chunk in response.iter_lines(decode_unicode=False):
+            chunk = chunk.decode("utf-8")
+            if chunk and chunk.startswith("data:"):
+                if chunk == "data: [DONE]":
+                    break
+                ret = json.loads(chunk[5:].strip("\n"))
+    else:
+        ret = response.json()
+
+    prompt_tokens = ret["meta_info"]["prompt_tokens"]
+    completion_tokens = ret["meta_info"]["completion_tokens"]
+    print(
+        f"{idx=}, {start_time=:.2f}, {prompt_tokens=}, "
+        f"{completion_tokens=}, {recorded_completion_tokens=}"
+    )
+
+
+def run_one_request(record):
+    # global success_ct, error_ct
+
+    try:
+        run_one_request_internal(record)
+        # success_ct += 1
+    except Exception:
+        # error_ct += 1
+        traceback = get_exception_traceback()
+        print(f"Hit an exception: {traceback}")
+
+
+def main(records):
+    if len(records) == 0:
+        return
+
+    base_time = records[0][-2]
+    base_time_str = datetime.fromtimestamp(base_time).strftime("%y-%m-%d %H:%M:%S")
+    print(f"{base_time_str=}")
+    replay_init_time = time.time()
+
+    for i in range(len(records)):
+        req, output, start_time, end_time = records[i]
+        start_time -= base_time
+        records[i] = (req, output, replay_init_time, start_time, end_time, i)
+
+    with ThreadPoolExecutor(args.parallel) as executor:
+        executor.map(run_one_request, records)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=30000)
+    parser.add_argument(
+        "--input-folder", type=str, default=None, help="Folder containing pickle files"
+    )
+    parser.add_argument(
+        "--input-file", type=str, default=None, help="Single pickle file to process"
+    )
+    parser.add_argument("--file-number", type=int, default=1)
+    parser.add_argument("--req-number", type=int, default=1000000)
+    parser.add_argument("--req-start", type=int, default=0)
+    parser.add_argument("--parallel", type=int, default=512)
+    parser.add_argument("--idx", type=int, default=None)
+    parser.add_argument("--ignore-eos", action="store_true")
+    parser.add_argument("--speed", type=float, default=1)
+    args = parser.parse_args()
+
+    set_ulimit()
+
+    files = []
+    if args.input_file:
+        files = [args.input_file]
+        if args.file_number > 1:
+            print("Warning: --file-number is ignored when --input-file is provided.")
+    elif args.input_folder:
+        files = glob.glob(f"{args.input_folder}/*.pkl")
+        files = files[: args.file_number]
+    else:
+        print("Error: Either --input-folder or --input-file must be provided.")
+        exit(1)
+    print(f"{files=}")
+
+    records = read_records(files)
+    # Sort by the receive time, before filtering
+    records.sort(key=lambda x: x[-2])
+    records = records[args.req_start :]
+    if args.idx:
+        records = [records[args.idx]]
+        print(f"testing {args.idx=}")
+        print(f"{records[0]}")
+    print(f"{len(records)=}")
+    main(records)
diff --git a/scripts/playground/router/test_tree.py b/scripts/playground/router/test_tree.py
new file mode 100644
index 000000000..af41c738e
--- /dev/null
+++ b/scripts/playground/router/test_tree.py
@@ -0,0 +1,207 @@
+import random
+import string
+import time
+import unittest
+from typing import Dict, List, Tuple
+
+from tree import MultiTenantRadixTree
+
+
+class TestMultiTenantRadixTree(unittest.TestCase):
+    def setUp(self):
+        self.tree = MultiTenantRadixTree()
+
+    def test_insert_exact_match(self):
+        """Test 1: Basic insert and exact match operations"""
+        # Insert a single string for one tenant
+        self.tree.insert("hello", "tenant1")
+        matched, tenant = self.tree.prefix_match("hello")
+        self.assertEqual(matched, "hello")
+        self.assertEqual(tenant, "tenant1")
+
+        # Insert same string for different tenant
+        self.tree.insert("hello", "tenant2")
+        matched, tenant = self.tree.prefix_match("hello")
+        self.assertIn(tenant, ["tenant1", "tenant2"])
+
+        # Insert different string for same tenant
+        self.tree.insert("world", "tenant1")
+        matched, tenant = self.tree.prefix_match("world")
+        self.assertEqual(matched, "world")
+        self.assertEqual(tenant, "tenant1")
+
+        print(self.tree.pretty_print())
+
+    def test_insert_partial_match(self):
+        """Test 2: Insert with partial matching scenarios"""
+        # Test partial matches with common prefixes
+        self.tree.insert("hello", "tenant1")
+        print(self.tree.pretty_print())
+        self.tree.insert("help", "tenant2")
+        print(self.tree.pretty_print())
+
+        # Match exact strings
+        matched, tenant = self.tree.prefix_match("hello")
+        self.assertEqual(matched, "hello")
+        self.assertEqual(tenant, "tenant1")
+
+        matched, tenant = self.tree.prefix_match("help")
+        self.assertEqual(matched, "help")
+        self.assertEqual(tenant, "tenant2")
+
+        # Match partial string
+        matched, tenant = self.tree.prefix_match("hel")
+        self.assertEqual(matched, "hel")
+        self.assertIn(tenant, ["tenant1", "tenant2"])
+
+        # Match longer string
+        matched, tenant = self.tree.prefix_match("hello_world")
+        self.assertEqual(matched, "hello")
+        self.assertEqual(tenant, "tenant1")
+
+    def test_insert_edge_cases(self):
+        """Test 3: Edge cases for insert and match operations"""
+        # Empty string
+        self.tree.insert("", "tenant1")
+        matched, tenant = self.tree.prefix_match("")
+        self.assertEqual(matched, "")
+        self.assertEqual(tenant, "tenant1")
+
+        # Single character
+        self.tree.insert("a", "tenant1")
+        matched, tenant = self.tree.prefix_match("a")
+        self.assertEqual(matched, "a")
+        self.assertEqual(tenant, "tenant1")
+
+        # Very long string
+        long_str = "a" * 1000
+        self.tree.insert(long_str, "tenant1")
+        matched, tenant = self.tree.prefix_match(long_str)
+        self.assertEqual(matched, long_str)
+        self.assertEqual(tenant, "tenant1")
+
+        # Unicode characters
+        self.tree.insert("你好", "tenant1")
+        matched, tenant = self.tree.prefix_match("你好")
+        self.assertEqual(matched, "你好")
+        self.assertEqual(tenant, "tenant1")
+
+    def test_simple_eviction(self):
+        """Test 4: Simple eviction scenarios
+        Tenant1: limit 10 chars
+        Tenant2: limit 5 chars
+
+        Should demonstrate:
+        1. Basic eviction when size limit exceeded
+        2. Proper eviction based on last access time
+        3. Verification that shared nodes remain intact for other tenants
+        """
+        # Set up size limits
+        max_size = {"tenant1": 10, "tenant2": 5}
+
+        # Insert strings for both tenants
+        self.tree.insert("hello", "tenant1")  # size 5
+        self.tree.insert("hello", "tenant2")  # size 5
+        self.tree.insert("world", "tenant2")  # size 5, total for tenant2 = 10
+
+        # Verify initial sizes
+        sizes_before = self.tree.get_used_size_per_tenant()
+        self.assertEqual(sizes_before["tenant1"], 5)  # "hello" = 5
+        self.assertEqual(sizes_before["tenant2"], 10)  # "hello" + "world" = 10
+
+        # Evict - should remove "hello" from tenant2 as it's the oldest
+        self.tree.evict_tenant_data(max_size)
+
+        # Verify sizes after eviction
+        sizes_after = self.tree.get_used_size_per_tenant()
+        self.assertEqual(sizes_after["tenant1"], 5)  # Should be unchanged
+        self.assertEqual(sizes_after["tenant2"], 5)  # Only "world" remains
+
+        # Verify "world" remains for tenant2 (was accessed more recently)
+        matched, tenant = self.tree.prefix_match("world")
+        self.assertEqual(matched, "world")
+        self.assertEqual(tenant, "tenant2")
+
+    def test_medium_eviction(self):
+        """Test 5: Medium complexity eviction scenarios with shared prefixes
+        Tenant1: limit 10 chars
+        Tenant2: limit 7 chars (forces one string to be evicted)
+
+        Tree structure after inserts:
+        └── 'h' [t1, t2]
+            ├── 'i' [t1, t2]      # Oldest for t2
+            └── 'e' [t1, t2]
+                ├── 'llo' [t1, t2]
+                └── 'y' [t2]      # Newest for t2
+
+        Size calculations:
+        tenant1: "h"(1) + "i"(1) + "e"(1) + "llo"(3) = 6 chars
+        tenant2: "h"(1) + "i"(1) + "e"(1) + "llo"(3) + "y"(1) = 7 chars
+
+        After eviction (tenant2 exceeds limit by 1 char):
+        "hi" should be removed from tenant2 as it's the oldest access
+        """
+        max_size = {
+            "tenant1": 10,
+            "tenant2": 6,
+        }  # tenant2 will need to evict one string
+
+        # Create a tree with overlapping prefixes
+        self.tree.insert("hi", "tenant1")
+        self.tree.insert("hi", "tenant2")  # OLDEST for t2
+
+        self.tree.insert("hello", "tenant1")
+        self.tree.insert("hello", "tenant2")
+
+        self.tree.insert("hey", "tenant2")  # NEWEST for t2
+
+        # Verify initial sizes
+        sizes_before = self.tree.get_used_size_per_tenant()
+        self.assertEqual(sizes_before["tenant1"], 6)  # h(1) + i(1) + e(1) + llo(3) = 6
+        self.assertEqual(
+            sizes_before["tenant2"], 7
+        )  # h(1) + i(1) + e(1) + llo(3) + y(1) = 7
+
+        print("\nTree before eviction:")
+        print(self.tree.pretty_print())
+
+        # Evict - should remove "hi" from tenant2 as it's the oldest
+        self.tree.evict_tenant_data(max_size)
+
+        print("\nTree after eviction:")
+        print(self.tree.pretty_print())
+
+        # Verify sizes after eviction
+        sizes_after = self.tree.get_used_size_per_tenant()
+        self.assertEqual(sizes_after["tenant1"], 6)  # Should be unchanged
+        self.assertEqual(sizes_after["tenant2"], 6)  # h(1) + e(1) + llo(3) + y(1) = 6
+
+    def test_advanced_eviction(self):
+        ...
+        # Create 4 tenants
+        # Each tenants keeps adding strings with shared prefixes to thousands usage
+        # Set a strict limit for each tenant to only 100
+        # At the end, check whether all of the tenant is under 100 after eviction
+
+        max_size = {"tenant1": 100, "tenant2": 100, "tenant3": 100, "tenant4": 100}
+
+        prefixes = ["aqwefcisdf", "iajsdfkmade", "kjnzxcvewqe", "iejksduqasd"]
+        for i in range(100):
+            for j, prefix in enumerate(prefixes):
+                random_suffix = "".join(random.choices(string.ascii_letters, k=10))
+                self.tree.insert(prefix + random_suffix, f"tenant{j+1}")
+
+        sizes_before = self.tree.get_used_size_per_tenant()
+        print(sizes_before)
+
+        self.tree.evict_tenant_data(max_size)
+
+        sizes_after = self.tree.get_used_size_per_tenant()
+        print(sizes_after)
+        # ensure size_after is below max_size
+        for tenant, size in sizes_after.items():
+            self.assertLessEqual(size, max_size[tenant])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/scripts/playground/router/tree.py b/scripts/playground/router/tree.py
new file mode 100644
index 000000000..9cbfa7cfe
--- /dev/null
+++ b/scripts/playground/router/tree.py
@@ -0,0 +1,292 @@
+import time
+from collections import defaultdict
+from typing import Dict, List
+
+
+class Node:
+    def __init__(self):
+        self.children: Dict[str, Node] = dict()
+        # We choose to use text because most of the use cases are text-to-text,
+        # so we can save the tokenizing overhead.
+        self.text: str = ""
+        # Maps tenant_id to their last access timestamp
+        self.tenant_last_access_time: Dict[str, float] = dict()
+        self.parent = None
+
+
+def shared_prefix_length(s1, s2):
+    min_length = min(len(s1), len(s2))
+    for i in range(min_length):
+        if s1[i] != s2[i]:
+            return i
+    return min_length
+
+
+class MultiTenantRadixTree:
+    """
+    Python Reference of Rust implementation of MultiTenantRadixTree
+
+    MultiTenantRadixTree is the overlap of multiple radix trees by different tenant
+    Each node in the tree can be owned by multiple tenants, allowing for efficient storage of common prefixes
+    while maintaining tenant isolation.
+
+    Key concepts:
+    - Tenant: An entity that owns a subset of the stored strings
+    - Each node tracks which tenants have access to it via tenant_last_access_time
+    - The tree structure is shared, but queries can be filtered by tenant_id
+    """
+
+    def __init__(self):
+        self.root = Node()
+
+    def insert(self, s: str, tenant_id: str) -> None:
+        """
+        Insert string 's' and associate it with the given tenant_id.
+
+        Args:
+            s: The string to insert
+            tenant_id: The identifier of the tenant who owns this string
+        """
+        curr = self.root
+        curr_idx = 0
+        curr.tenant_last_access_time[tenant_id] = time.time()
+
+        while curr_idx < len(s):
+            matched_node = None
+            if s[curr_idx] in curr.children:
+                matched_node = curr.children[s[curr_idx]]
+
+            if matched_node is None:
+                # No match => create a new node
+                new_node = Node()
+                new_node.text = s[curr_idx:]
+                new_node.parent = curr
+
+                curr.children[s[curr_idx]] = new_node
+                curr_idx = len(s)
+                curr = new_node
+                curr.tenant_last_access_time[tenant_id] = time.time()
+            else:
+                shared_len = shared_prefix_length(s[curr_idx:], matched_node.text)
+
+                # 1. If the matched text is shorter than the node text => split the node
+                if shared_len < len(matched_node.text):
+                    # Split structure: [matched_node] => [new_node] -> [contracted_matched_node]
+
+                    matched_text = matched_node.text[:shared_len]
+                    unmatched_text = matched_node.text[shared_len:]
+
+                    new_node = Node()
+                    new_node.text = matched_text
+                    new_node.children = {unmatched_text[0]: matched_node}
+                    new_node.parent = curr
+                    new_node.parent.children[matched_text[0]] = new_node
+                    new_node.tenant_last_access_time = (
+                        matched_node.tenant_last_access_time.copy()
+                    )
+
+                    # Contract matched node
+                    matched_node.text = unmatched_text
+                    matched_node.parent = new_node
+
+                    curr_idx += shared_len
+                    curr = new_node
+                    curr.tenant_last_access_time[tenant_id] = time.time()
+                # 2. If the matched text is longer or equal to the node text => walk down the node
+                else:
+                    curr_idx += shared_len
+                    curr = matched_node
+                    curr.tenant_last_access_time[tenant_id] = time.time()
+
+    def prefix_match(self, s: str) -> tuple[str, int]:
+        """
+        Match string 's' with multiple tenants' trees in one operation.
+
+        Args:
+            s: The string to match
+
+        Returns:
+            Tuple(str, int): The longest prefix of 's' that matches the tree and the first tenant_id that own the matched prefix
+        """
+        curr = self.root
+        curr_idx = 0
+
+        ret_text = ""
+        ret_tenant = None
+
+        while curr_idx < len(s):
+            matched_node = None
+            if s[curr_idx] in curr.children:
+                matched_node = curr.children[s[curr_idx]]
+
+            if matched_node is None:
+                break
+
+            shared_len = shared_prefix_length(s[curr_idx:], matched_node.text)
+            if shared_len == len(matched_node.text):
+                curr_idx += shared_len
+                curr = matched_node
+            else:
+                curr_idx += shared_len
+                curr = matched_node
+                break
+
+        selected_tenant = list(curr.tenant_last_access_time.keys())[0]
+
+        # traverse back to the root to update last access time for the selected tenant
+        while curr != self.root:
+            curr.tenant_last_access_time[selected_tenant] = time.time()
+            curr = curr.parent
+
+        return s[:curr_idx], selected_tenant
+
+    def evict_tenant_data(self, max_size_per_tenant: Dict[str, int]) -> None:
+        """
+        Evict data for tenants that have exceeded their storage limits.
+
+        Args:
+            max_size_per_tenant: Dictionary mapping tenant_id to their maximum allowed storage size
+        """
+
+        def leaf_of(node):
+            """
+            If the node is a leaf for a tenant, add tenant_id to the return list
+            This will return list of tenant ids
+            If not a leaf for all tenants, return []
+            """
+            candidates = dict([(k, True) for k in node.tenant_last_access_time.keys()])
+
+            for n in node.children.values():
+                for c in n.tenant_last_access_time.keys():
+                    candidates[c] = False
+
+            return [k for k, v in candidates.items() if v]
+
+        # maintain a heap with (time, tenant, node) as the value
+        import heapq
+
+        # 1. traverse the tree to
+        #   a. add all the leaves into a heap (a node with N tenants will be added N times into the heap)
+        #   b. calculate the used size for each tenant
+        # do a dfs with stack
+        stack = [self.root]
+        pq = []
+        used_size_per_tenant = defaultdict(int)
+
+        while stack:
+            curr = stack.pop()
+            for t in curr.tenant_last_access_time.keys():
+                used_size_per_tenant[t] += len(curr.text)
+
+            for c in curr.children.values():
+                stack.append(c)
+
+            # if the node is a leaf for a tenant, add the tenant to the heap
+            tenants = leaf_of(curr)
+            for t in tenants:
+                heapq.heappush(pq, (curr.tenant_last_access_time[t], t, curr))
+
+        # 2. pop the heap
+        #   a. if the tenant's used size is less than the limit, continue
+        #   b. if the tenant's used size is greater than the limit, remove the leaf and update the used size, and add its parent to the heap
+        while len(pq) > 0:
+            time, tenant, node = heapq.heappop(pq)
+            if used_size_per_tenant[tenant] <= max_size_per_tenant[tenant]:
+                continue
+
+            # remove the leaf
+            used_size_per_tenant[tenant] -= len(node.text)
+            del node.tenant_last_access_time[tenant]
+            # if no children and no tenants, remove the node
+            if len(node.children) == 0 and len(node.tenant_last_access_time) == 0:
+                del node.parent.children[node.text[0]]
+
+            # add its parent to the heap
+            if tenant in leaf_of(node.parent):
+                heapq.heappush(
+                    pq,
+                    (node.parent.tenant_last_access_time[tenant], tenant, node.parent),
+                )
+
+    def get_used_size_per_tenant(self) -> Dict[str, int]:
+        """
+        Calculate the used storage size for each tenant.
+
+        Returns:
+            Dict[str, int]: A dictionary mapping tenant_id to their used storage size
+        """
+        used_size_per_tenant = defaultdict(int)
+
+        stack = [self.root]
+        while stack:
+            curr = stack.pop()
+            for t in curr.tenant_last_access_time.keys():
+                used_size_per_tenant[t] += len(curr.text)
+
+            for c in curr.children.values():
+                stack.append(c)
+
+        return used_size_per_tenant
+
+    def remove_tenant(self, tenant_id: str) -> None:
+        """
+        Remove all data associated with a specific tenant from the tree.
+        This operation maintains the integrity of the shared tree structure while
+        removing only the specified tenant's access information.
+
+        Args:
+            tenant_id: The identifier of the tenant whose data should be removed
+        """
+        # TODO: Implementation needed
+        pass
+
+    def pretty_print(self) -> str:
+        """
+        Returns a string representation of the tree showing the structure, tenant ownership,
+        and leaf status for each node.
+
+        Returns:
+            str: A formatted string showing the tree hierarchy with tenant information
+        """
+
+        def _node_to_str(node: Node, prefix: str = "", is_last: bool = True) -> str:
+            # Current node representation
+            node_str = prefix
+            node_str += "└── " if is_last else "├── "
+
+            # Add node text
+            node_str += f"'{node.text}' ["
+
+            # Add tenant information including both timestamp and leaf status
+            tenant_info = []
+            for tid, ts in node.tenant_last_access_time.items():
+                time_str = (
+                    time.strftime("%H:%M:%S.", time.localtime(ts))
+                    + f"{(ts % 1):0.3f}"[2:]
+                )
+                tenant_info.append(f"{tid} | {time_str}")
+
+            node_str += ", ".join(tenant_info)
+            node_str += "]\n"
+
+            # Handle children
+            children = list(node.children.items())
+            for i, (char, child) in enumerate(children):
+                is_last_child = i == len(children) - 1
+                # Adjust prefix for children based on whether this is the last child
+                new_prefix = prefix + ("    " if is_last else "│   ")
+                node_str += _node_to_str(child, new_prefix, is_last_child)
+
+            return node_str
+
+        if not self.root.children:
+            return "Empty tree"
+
+        # Start with root's children since root itself is just an empty node
+        result = ""
+        children = list(self.root.children.items())
+        for i, (char, child) in enumerate(children):
+            is_last = i == len(children) - 1
+            result += _node_to_str(child, "", is_last)
+
+        return result
diff --git a/scripts/update_kernel_whl_index.py b/scripts/update_kernel_whl_index.py
new file mode 100644
index 000000000..e6b2279b1
--- /dev/null
+++ b/scripts/update_kernel_whl_index.py
@@ -0,0 +1,33 @@
+# Reference: https://github.com/flashinfer-ai/flashinfer/blob/v0.2.0/scripts/update_whl_index.py
+
+import argparse
+import hashlib
+import pathlib
+import re
+
+
+def update_wheel_index(cuda_version="118"):
+    index_dir = pathlib.Path(f"sgl-whl/cu{cuda_version}/sgl-kernel")
+    index_dir.mkdir(exist_ok=True)
+    base_url = "https://github.com/sgl-project/whl/releases/download"
+
+    for path in sorted(pathlib.Path("sgl-kernel/dist").glob("*.whl")):
+        with open(path, "rb") as f:
+            sha256 = hashlib.sha256(f.read()).hexdigest()
+        ver = re.findall(
+            r"sgl_kernel-([0-9.]+(?:\.post[0-9]+)?)(?:\+cu[0-9]+)?-", path.name
+        )[0]
+        full_url = f"{base_url}/v{ver}/{path.name}#sha256={sha256}"
+        with (index_dir / "index.html").open("a") as f:
+            f.write(f'<a href="{full_url}">{path.name}</a><br>\n')
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--cuda", type=str, default="118")
+    args = parser.parse_args()
+    update_wheel_index(args.cuda)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/version_branch_to_tag.sh b/scripts/version_branch_to_tag.sh
new file mode 100755
index 000000000..9f587fb0b
--- /dev/null
+++ b/scripts/version_branch_to_tag.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -euxo pipefail
+
+# This script is used for release.
+# It tags all remote branches starting with 'v' with the same name as the branch,
+# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
+
+git fetch origin --prune
+
+# List all branches starting with 'v'
+branches=$(git branch -r | grep 'origin/v' | sed 's/origin\///')
+
+# Loop through each branch
+for branch in $branches; do
+    echo "Processing branch: $branch"
+
+    # Get the commit hash for the branch
+    commit_hash=$(git rev-parse origin/$branch)
+
+    # Create a tag with the same name as the branch using the commit hash
+    git tag $branch $commit_hash
+
+    # Delete the branch from the remote
+    git push origin --delete $branch
+done
+
+# Push all tags to the remote repository
+git push --tags
+
+echo "All branches starting with 'v' have been tagged, deleted from remote, and pushed to the remote repository."
diff --git a/sgl-kernel/.clang-format b/sgl-kernel/.clang-format
new file mode 100644
index 000000000..afbd654a7
--- /dev/null
+++ b/sgl-kernel/.clang-format
@@ -0,0 +1,15 @@
+BasedOnStyle: Google
+IndentWidth: 2
+ColumnLimit: 120
+AllowShortFunctionsOnASingleLine: Empty
+DerivePointerAlignment: false
+PointerAlignment: Left
+NamespaceIndentation: None
+SortIncludes: true
+AllowShortLoopsOnASingleLine: false
+BinPackParameters: false              # Prevents packing parameters in declarations
+BinPackArguments: false               # Prevents packing arguments in function calls
+AlignAfterOpenBracket: AlwaysBreak    # Forces a break after the opening parenthesis
+AlignOperands: Align                  # Aligns arguments vertically
+PenaltyBreakBeforeFirstCallParameter: 1  # Encourages breaking before the first argument
+PenaltyReturnTypeOnItsOwnLine: 100    # Keeps return type with function name
diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt
new file mode 100644
index 000000000..80f29921f
--- /dev/null
+++ b/sgl-kernel/CMakeLists.txt
@@ -0,0 +1,534 @@
+cmake_minimum_required(VERSION 3.26 FATAL_ERROR)
+project(sgl-kernel LANGUAGES CXX CUDA)
+
+# CMake
+cmake_policy(SET CMP0169 OLD)
+cmake_policy(SET CMP0177 NEW)
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+set(CMAKE_COLOR_DIAGNOSTICS ON)
+set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON")
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_SHARED_LIBRARY_PREFIX "")
+
+# Python
+find_package(Python COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT} REQUIRED)
+
+# CXX
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+
+# CUDA
+enable_language(CUDA)
+find_package(CUDAToolkit REQUIRED)
+set_property(GLOBAL PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
+message(STATUS "Detected CUDA_VERSION=${CUDA_VERSION}")
+if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "13.0")
+    message("CUDA_VERSION ${CUDA_VERSION} >= 13.0")
+elseif ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.8")
+    message("CUDA_VERSION ${CUDA_VERSION} >= 12.8")
+elseif ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.4")
+    message("CUDA_VERSION ${CUDA_VERSION} >= 12.4")
+elseif ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.1")
+    message("CUDA_VERSION ${CUDA_VERSION} >= 12.1")
+elseif ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "11.8")
+    message("CUDA_VERSION ${CUDA_VERSION} >= 11.8")
+endif()
+
+# Torch
+find_package(Torch REQUIRED)
+# clean Torch Flag
+clear_cuda_arches(CMAKE_FLAG)
+
+include(FetchContent)
+
+# cutlass
+FetchContent_Declare(
+    repo-cutlass
+    GIT_REPOSITORY https://github.com/NVIDIA/cutlass
+    GIT_TAG        a49a78ffefc86a87160dfe0ccc3a3a2d1622c918
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-cutlass)
+
+# DeepGEMM
+FetchContent_Declare(
+    repo-deepgemm
+    GIT_REPOSITORY https://github.com/sgl-project/DeepGEMM
+    GIT_TAG        sgl
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-deepgemm)
+
+FetchContent_Declare(
+    repo-fmt
+    GIT_REPOSITORY https://github.com/fmtlib/fmt
+    GIT_TAG        553ec11ec06fbe0beebfbb45f9dc3c9eabd83d28
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-fmt)
+
+# Triton
+FetchContent_Declare(
+    repo-triton
+    GIT_REPOSITORY "https://github.com/triton-lang/triton"
+    GIT_TAG        8f9f695ea8fde23a0c7c88e4ab256634ca27789f
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-triton)
+
+# flashinfer
+FetchContent_Declare(
+    repo-flashinfer
+    GIT_REPOSITORY https://github.com/flashinfer-ai/flashinfer.git
+    GIT_TAG        018b551825c8e5579206e6eb9d3229fa679202b3
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-flashinfer)
+
+# flash-attention
+FetchContent_Declare(
+    repo-flash-attention
+    GIT_REPOSITORY https://github.com/sgl-project/sgl-attn
+    GIT_TAG        sgl-kernel
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-flash-attention)
+
+# flash-attention origin
+FetchContent_Declare(
+    repo-flash-attention-origin
+    GIT_REPOSITORY https://github.com/Dao-AILab/flash-attention.git
+    GIT_TAG        203b9b3dba39d5d08dffb49c09aa622984dff07d
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-flash-attention-origin)
+
+# mscclpp
+FetchContent_Declare(
+    repo-mscclpp
+    GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
+    GIT_TAG        51eca89d20f0cfb3764ccd764338d7b22cd486a6
+    GIT_SHALLOW    OFF
+)
+FetchContent_Populate(repo-mscclpp)
+
+# ccache option
+option(ENABLE_CCACHE "Whether to use ccache" ON)
+find_program(CCACHE_FOUND ccache)
+if(CCACHE_FOUND AND ENABLE_CCACHE AND DEFINED ENV{CCACHE_DIR})
+    message(STATUS "Building with CCACHE enabled")
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache")
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "ccache")
+endif()
+
+# Enable gencode below SM90
+option(ENABLE_BELOW_SM90 "Enable below SM90" ON)
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    set(ENABLE_BELOW_SM90 OFF)
+    message(STATUS "For aarch64, disable gencode below SM90 by default")
+endif()
+
+include_directories(
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/csrc
+    ${repo-cutlass_SOURCE_DIR}/include
+    ${repo-cutlass_SOURCE_DIR}/tools/util/include
+    ${repo-flashinfer_SOURCE_DIR}/include
+    ${repo-flashinfer_SOURCE_DIR}/csrc
+    ${repo-mscclpp_SOURCE_DIR}/include
+)
+
+set(SGL_KERNEL_CUDA_FLAGS
+    "-DNDEBUG"
+    "-DOPERATOR_NAMESPACE=sgl-kernel"
+    "-O3"
+    "-Xcompiler"
+    "-fPIC"
+    "-gencode=arch=compute_90,code=sm_90"
+    "-std=c++17"
+    "-DFLASHINFER_ENABLE_F16"
+    "-DCUTE_USE_PACKED_TUPLE=1"
+    "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1"
+    "-DCUTLASS_VERSIONS_GENERATED"
+    "-DCUTLASS_TEST_LEVEL=0"
+    "-DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1"
+    "-DCUTLASS_DEBUG_TRACE_LEVEL=0"
+    "--expt-relaxed-constexpr"
+    "--expt-extended-lambda"
+    # The following flag leads to the CMAKE_BUILD_PARALLEL_LEVEL breaking,
+    # it triggers OOM with low memory host. Extract the threads number to
+    # option named SGL_KERNEL_COMPILE_THREADS, default value 32.
+    # "--threads=32"
+
+    # Supress warnings
+    "-Xcompiler=-Wno-clang-format-violations"
+    "-Xcompiler=-Wno-conversion"
+    "-Xcompiler=-Wno-deprecated-declarations"
+    "-Xcompiler=-Wno-terminate"
+    "-Xcompiler=-Wfatal-errors"
+    "-Xcompiler=-ftemplate-backtrace-limit=1"
+    "-Xcudafe=--diag_suppress=177"  # variable was declared but never referenced
+
+    # uncomment to debug
+    # "--ptxas-options=-v"
+    # "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage"
+)
+
+set(SGL_KERNEL_COMPILE_THREADS 32 CACHE STRING "Set compilation threads, default 32")
+
+# When SGL_KERNEL_COMPILE_THREADS value is less than 1, set it to 1
+if (NOT SGL_KERNEL_COMPILE_THREADS MATCHES "^[0-9]+$")
+    message(FATAL_ERROR "SGL_KERNEL_COMPILE_THREADS must be an integer, but was set to '${SGL_KERNEL_COMPILE_THREADS}'.")
+elseif (SGL_KERNEL_COMPILE_THREADS LESS 1)
+    message(STATUS "SGL_KERNEL_COMPILE_THREADS was set to a value less than 1. Using 1 instead.")
+    set(SGL_KERNEL_COMPILE_THREADS 1)
+endif()
+
+list(APPEND SGL_KERNEL_CUDA_FLAGS
+    "--threads=${SGL_KERNEL_COMPILE_THREADS}"
+)
+
+option(SGL_KERNEL_ENABLE_BF16             "Enable BF16"             ON)
+option(SGL_KERNEL_ENABLE_FP8              "Enable FP8"              ON)
+option(SGL_KERNEL_ENABLE_FP4              "Enable FP4"              OFF)
+option(SGL_KERNEL_ENABLE_FA3              "Enable FA3"              OFF)
+option(SGL_KERNEL_ENABLE_SM90A            "Enable SM90A"            OFF)
+option(SGL_KERNEL_ENABLE_SM100A           "Enable SM100A"           OFF)
+
+if (SGL_KERNEL_ENABLE_BF16)
+    list(APPEND SGL_KERNEL_CUDA_FLAGS
+        "-DFLASHINFER_ENABLE_BF16"
+    )
+endif()
+
+if (SGL_KERNEL_ENABLE_FP8)
+    list(APPEND SGL_KERNEL_CUDA_FLAGS
+        "-DFLASHINFER_ENABLE_FP8"
+        "-DFLASHINFER_ENABLE_FP8_E4M3"
+        "-DFLASHINFER_ENABLE_FP8_E5M2"
+    )
+endif()
+
+if (ENABLE_BELOW_SM90)
+    list(APPEND SGL_KERNEL_CUDA_FLAGS
+        "-gencode=arch=compute_80,code=sm_80"
+        "-gencode=arch=compute_89,code=sm_89"
+    )
+endif()
+
+if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.8" OR SGL_KERNEL_ENABLE_SM100A)
+    list(APPEND SGL_KERNEL_CUDA_FLAGS
+        "-gencode=arch=compute_100a,code=sm_100a"
+        "-gencode=arch=compute_120a,code=sm_120a"
+    )
+
+    # refer sm_121, sm_110 and sm_101 description  https://github.com/pytorch/pytorch/pull/156176
+    if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "13.0")
+        list(APPEND SGL_KERNEL_CUDA_FLAGS
+            "-gencode=arch=compute_103a,code=sm_103a"
+            "-gencode=arch=compute_110a,code=sm_110a"
+            "-gencode=arch=compute_121a,code=sm_121a"
+            "--compress-mode=size"
+        )
+    else()
+        list(APPEND SGL_KERNEL_CUDA_FLAGS
+            "-gencode=arch=compute_101a,code=sm_101a"
+        )
+    endif()
+
+else()
+    list(APPEND SGL_KERNEL_CUDA_FLAGS
+        "-use_fast_math"
+    )
+endif()
+
+if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.4" OR SGL_KERNEL_ENABLE_SM90A)
+    set(SGL_KERNEL_ENABLE_FA3 ON)
+    list(APPEND SGL_KERNEL_CUDA_FLAGS
+        "-gencode=arch=compute_90a,code=sm_90a"
+    )
+endif()
+
+if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.8" OR SGL_KERNEL_ENABLE_FP4)
+    list(APPEND SGL_KERNEL_CUDA_FLAGS
+        "-DENABLE_NVFP4=1"
+    )
+endif()
+
+set(SOURCES
+    "csrc/allreduce/custom_all_reduce.cu"
+    "csrc/allreduce/mscclpp_allreduce.cu"
+    "csrc/attention/cascade.cu"
+    "csrc/attention/cutlass_mla_kernel.cu"
+    "csrc/attention/lightning_attention_decode_kernel.cu"
+    "csrc/attention/merge_attn_states.cu"
+    "csrc/attention/vertical_slash_index.cu"
+    "csrc/elementwise/activation.cu"
+    "csrc/elementwise/cast.cu"
+    "csrc/elementwise/copy.cu"
+    "csrc/elementwise/concat_mla.cu"
+    "csrc/elementwise/fused_add_rms_norm_kernel.cu"
+    "csrc/elementwise/rope.cu"
+    "csrc/common_extension.cc"
+
+    "csrc/gemm/awq_kernel.cu"
+    "csrc/gemm/bmm_fp8.cu"
+    "csrc/gemm/dsv3_fused_a_gemm.cu"
+    "csrc/gemm/dsv3_router_gemm_bf16_out.cu"
+    "csrc/gemm/dsv3_router_gemm_entry.cu"
+    "csrc/gemm/dsv3_router_gemm_float_out.cu"
+    "csrc/gemm/fp8_blockwise_gemm_kernel.cu"
+    "csrc/gemm/fp8_gemm_kernel.cu"
+    "csrc/gemm/int8_gemm_kernel.cu"
+    "csrc/gemm/nvfp4_expert_quant.cu"
+    "csrc/gemm/nvfp4_quant_entry.cu"
+    "csrc/gemm/nvfp4_quant_kernels.cu"
+    "csrc/gemm/nvfp4_scaled_mm_entry.cu"
+    "csrc/gemm/nvfp4_scaled_mm_kernels.cu"
+    "csrc/gemm/per_tensor_quant_fp8.cu"
+    "csrc/gemm/per_token_group_quant_8bit.cu"
+    "csrc/gemm/per_token_quant_fp8.cu"
+    "csrc/gemm/qserve_w4a8_per_chn_gemm.cu"
+    "csrc/gemm/qserve_w4a8_per_group_gemm.cu"
+    "csrc/gemm/marlin/gptq_marlin.cu"
+    "csrc/gemm/marlin/gptq_marlin_repack.cu"
+    "csrc/gemm/marlin/awq_marlin_repack.cu"
+    "csrc/gemm/gptq/gptq_kernel.cu"
+
+    "csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
+
+    "csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu"
+    "csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu"
+    "csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu"
+    "csrc/moe/marlin_moe_wna16/ops.cu"
+    "csrc/mamba/causal_conv1d.cu"
+    "csrc/moe/moe_align_kernel.cu"
+    "csrc/moe/moe_fused_gate.cu"
+    "csrc/moe/moe_topk_softmax_kernels.cu"
+    "csrc/moe/nvfp4_blockwise_moe.cu"
+    "csrc/moe/fp8_blockwise_moe_kernel.cu"
+    "csrc/moe/prepare_moe_input.cu"
+
+    "csrc/memory/store.cu"
+    "csrc/kvcacheio/transfer.cu"
+
+    "csrc/speculative/eagle_utils.cu"
+    "csrc/speculative/packbit.cu"
+    "csrc/speculative/speculative_sampling.cu"
+
+    "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
+    "${repo-flashinfer_SOURCE_DIR}/csrc/renorm.cu"
+    "${repo-flashinfer_SOURCE_DIR}/csrc/sampling.cu"
+
+    "${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src/flash_fwd_sparse_hdim128_bf16_causal_sm80.cu"
+    "${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src/flash_fwd_sparse_hdim128_bf16_sm80.cu"
+    "${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src/flash_fwd_sparse_hdim128_fp16_causal_sm80.cu"
+    "${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src/flash_fwd_sparse_hdim128_fp16_sm80.cu"
+    "${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/flash_sparse_api.cpp"
+)
+
+Python_add_library(common_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SOURCES})
+
+target_compile_options(common_ops PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${SGL_KERNEL_CUDA_FLAGS}>)
+target_include_directories(common_ops PRIVATE
+    ${repo-cutlass_SOURCE_DIR}/examples/77_blackwell_fmha
+    ${repo-cutlass_SOURCE_DIR}/examples/common
+    ${repo-flash-attention_SOURCE_DIR}/csrc/flash_attn/src
+)
+
+find_package(Python3 COMPONENTS Interpreter REQUIRED)
+execute_process(
+    COMMAND ${Python3_EXECUTABLE} -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))"
+    OUTPUT_VARIABLE TORCH_CXX11_ABI
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if(TORCH_CXX11_ABI STREQUAL "0")
+    message(STATUS "Using old C++ ABI (-D_GLIBCXX_USE_CXX11_ABI=0)")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+else()
+    message(STATUS "Using new C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI=1)")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+endif()
+
+# mscclpp
+set(MSCCLPP_USE_CUDA ON)
+set(MSCCLPP_BYPASS_GPU_CHECK ON)
+set(MSCCLPP_BUILD_TESTS OFF)
+add_subdirectory(
+    ${repo-mscclpp_SOURCE_DIR}
+    ${CMAKE_CURRENT_BINARY_DIR}/mscclpp-build
+)
+target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda cublas cublasLt mscclpp_static)
+
+# flash attention
+target_compile_definitions(common_ops PRIVATE
+    FLASHATTENTION_DISABLE_BACKWARD
+    FLASHATTENTION_DISABLE_DROPOUT
+    FLASHATTENTION_DISABLE_UNEVEN_K
+)
+
+install(TARGETS common_ops LIBRARY DESTINATION sgl_kernel)
+
+# ============================ Optional Install ============================= #
+# set flash-attention sources file
+# Now FA3 support sm80/sm86/sm90
+if (SGL_KERNEL_ENABLE_FA3)
+    set(SGL_FLASH_KERNEL_CUDA_FLAGS
+        "-DNDEBUG"
+        "-DOPERATOR_NAMESPACE=sgl-kernel"
+        "-O3"
+        "-Xcompiler"
+        "-fPIC"
+        "-gencode=arch=compute_90a,code=sm_90a"
+        "-std=c++17"
+        "-DCUTE_USE_PACKED_TUPLE=1"
+        "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1"
+        "-DCUTLASS_VERSIONS_GENERATED"
+        "-DCUTLASS_TEST_LEVEL=0"
+        "-DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1"
+        "-DCUTLASS_DEBUG_TRACE_LEVEL=0"
+        "--expt-relaxed-constexpr"
+        "--expt-extended-lambda"
+        "--use_fast_math"
+        "-Xcompiler=-Wconversion"
+        "-Xcompiler=-fno-strict-aliasing"
+    )
+
+    if (ENABLE_BELOW_SM90)
+        list(APPEND SGL_FLASH_KERNEL_CUDA_FLAGS
+            "-gencode=arch=compute_80,code=sm_80"
+            "-gencode=arch=compute_86,code=sm_86"
+        )
+        # SM8X Logic
+        file(GLOB FA3_SM8X_GEN_SRCS
+            "${repo-flash-attention_SOURCE_DIR}/hopper/instantiations/flash_fwd_hdim*_sm80.cu")
+    endif()
+
+    file(GLOB FA3_BF16_GEN_SRCS
+        "${repo-flash-attention_SOURCE_DIR}/hopper/instantiations/flash_fwd_hdimall_bf16*_sm90.cu")
+    file(GLOB FA3_BF16_GEN_SRCS_
+        "${repo-flash-attention_SOURCE_DIR}/hopper/instantiations/flash_fwd_hdimdiff_bf16*_sm90.cu")
+    list(APPEND FA3_BF16_GEN_SRCS ${FA3_BF16_GEN_SRCS_})
+
+    # FP16 source files
+    file(GLOB FA3_FP16_GEN_SRCS
+        "${repo-flash-attention_SOURCE_DIR}/hopper/instantiations/flash_fwd_hdimall_fp16*_sm90.cu")
+    file(GLOB FA3_FP16_GEN_SRCS_
+        "${repo-flash-attention_SOURCE_DIR}/hopper/instantiations/flash_fwd_hdimdiff_fp16*_sm90.cu")
+    list(APPEND FA3_FP16_GEN_SRCS ${FA3_FP16_GEN_SRCS_})
+
+    # FP8 source files
+    file(GLOB FA3_FP8_GEN_SRCS
+        "${repo-flash-attention_SOURCE_DIR}/hopper/instantiations/flash_fwd_hdimall_e4m3*_sm90.cu")
+    file(GLOB FA3_FP8_GEN_SRCS_
+        "${repo-flash-attention_SOURCE_DIR}/hopper/instantiations/flash_fwd_hdimdiff_e4m3*_sm90.cu")
+    list(APPEND FA3_FP8_GEN_SRCS ${FA3_FP8_GEN_SRCS_})
+
+    set(FA3_GEN_SRCS ${FA3_BF16_GEN_SRCS} ${FA3_FP16_GEN_SRCS} ${FA3_FP8_GEN_SRCS} ${FA3_SM8X_GEN_SRCS})
+
+    set(FLASH_SOURCES
+        "csrc/flash_extension.cc"
+        "${repo-flash-attention_SOURCE_DIR}/hopper/flash_prepare_scheduler.cu"
+        "${repo-flash-attention_SOURCE_DIR}/hopper/flash_api.cpp"
+        "${repo-flash-attention_SOURCE_DIR}/hopper/flash_fwd_combine.cu"
+        "${FA3_GEN_SRCS}"
+    )
+
+    Python_add_library(flash_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${FLASH_SOURCES})
+
+    target_compile_options(flash_ops PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${SGL_FLASH_KERNEL_CUDA_FLAGS}>)
+    target_include_directories(flash_ops PRIVATE
+        ${repo-flash-attention_SOURCE_DIR}/hopper
+    )
+    target_link_libraries(flash_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda)
+
+    install(TARGETS flash_ops LIBRARY DESTINATION "sgl_kernel")
+    set(FLASH_OPS_COMPILE_DEFS
+        FLASHATTENTION_DISABLE_BACKWARD
+        FLASHATTENTION_DISABLE_DROPOUT
+        FLASHATTENTION_DISABLE_UNEVEN_K
+        FLASHATTENTION_VARLEN_ONLY
+    )
+
+    if(NOT ENABLE_BELOW_SM90)
+        list(APPEND FLASH_OPS_COMPILE_DEFS FLASHATTENTION_DISABLE_SM8x)
+    endif()
+    target_compile_definitions(flash_ops PRIVATE ${FLASH_OPS_COMPILE_DEFS})
+endif()
+
+# Build spatial_ops as a separate, optional extension for green contexts
+set(SPATIAL_SOURCES
+    "csrc/spatial/greenctx_stream.cu"
+    "csrc/spatial_extension.cc"
+)
+
+Python_add_library(spatial_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SPATIAL_SOURCES})
+target_compile_options(spatial_ops PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${SGL_KERNEL_CUDA_FLAGS}>)
+target_link_libraries(spatial_ops PRIVATE ${TORCH_LIBRARIES} c10 cuda)
+install(TARGETS spatial_ops LIBRARY DESTINATION sgl_kernel)
+
+
+# ============================ DeepGEMM (JIT) ============================= #
+# Create a separate library for DeepGEMM's Python API.
+# This keeps its compilation isolated from the main common_ops.
+set(DEEPGEMM_SOURCES
+    "${repo-deepgemm_SOURCE_DIR}/csrc/python_api.cpp"
+)
+
+Python_add_library(deep_gemm_cpp MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${DEEPGEMM_SOURCES})
+
+# Link against necessary libraries, including nvrtc for JIT compilation.
+target_link_libraries(deep_gemm_cpp PRIVATE ${TORCH_LIBRARIES} c10 cuda nvrtc mscclpp_static)
+
+# Add include directories needed by DeepGEMM.
+target_include_directories(deep_gemm_cpp PRIVATE
+    ${repo-deepgemm_SOURCE_DIR}/deep_gemm/include
+    ${repo-cutlass_SOURCE_DIR}/include
+    ${repo-fmt_SOURCE_DIR}/include
+)
+
+# Apply the same compile options as common_ops.
+target_compile_options(deep_gemm_cpp PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${SGL_KERNEL_CUDA_FLAGS}>)
+
+# Create an empty __init__.py to make `deepgemm` a Python package.
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/deepgemm_pkg_init.py "")
+install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/deepgemm_pkg_init.py
+    DESTINATION deep_gemm
+    RENAME __init__.py
+)
+
+# Install the compiled DeepGEMM API library.
+install(TARGETS deep_gemm_cpp LIBRARY DESTINATION deep_gemm)
+
+# Install the source files required by DeepGEMM for runtime JIT compilation.
+install(
+    DIRECTORY ${repo-deepgemm_SOURCE_DIR}/deep_gemm/
+    DESTINATION deep_gemm
+)
+
+install(DIRECTORY "${repo-cutlass_SOURCE_DIR}/include/cute/"
+        DESTINATION "deep_gemm/include/cute")
+
+install(DIRECTORY "${repo-cutlass_SOURCE_DIR}/include/cutlass/"
+        DESTINATION "deep_gemm/include/cutlass")
+
+# triton_kernels
+install(DIRECTORY "${repo-triton_SOURCE_DIR}/python/triton_kernels/triton_kernels/"
+        DESTINATION "triton_kernels"
+        PATTERN ".git*" EXCLUDE
+        PATTERN "__pycache__" EXCLUDE)
+
+# flash attention 4
+# TODO: find a better install condition.
+if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.8" OR SGL_KERNEL_ENABLE_SM100A)
+    # flash_attn/cute
+    install(DIRECTORY "${repo-flash-attention-origin_SOURCE_DIR}/flash_attn/cute/"
+            DESTINATION "flash_attn/cute"
+            PATTERN ".git*" EXCLUDE
+            PATTERN "__pycache__" EXCLUDE)
+    endif()
diff --git a/sgl-kernel/LICENSE b/sgl-kernel/LICENSE
new file mode 100644
index 000000000..9c422689c
--- /dev/null
+++ b/sgl-kernel/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023-2024 SGLang Team
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/sgl-kernel/Makefile b/sgl-kernel/Makefile
new file mode 100644
index 000000000..c40489800
--- /dev/null
+++ b/sgl-kernel/Makefile
@@ -0,0 +1,71 @@
+.PHONY: help check-deps install-deps tree ln submodule install build clean rebuild test format update
+
+# Show help for each target
+help: ## Show this help message
+	@echo "Available targets:"
+	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
+
+check-deps: ## Check and install required Python formatting dependencies
+	@command -v isort >/dev/null 2>&1 || (echo "Installing isort..." && pip install isort)
+	@command -v black >/dev/null 2>&1 || (echo "Installing black..." && pip install black)
+
+install-deps: ## Install Python formatting tools (isort and black)
+	pip install scikit-build-core isort black
+
+tree: ## Show project directory structure
+	@tree --prune -I "__pycache__|*.egg-info|*.so|build|3rdparty|dist"
+
+submodule: ## Initialize and update git submodules
+	@git submodule update --init --recursive
+
+ln: submodule ## Create compilation database
+	@rm -rf build && mkdir build && cd build && cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=YES -DCMAKE_POLICY_VERSION_MINIMUM=3.5
+
+install: submodule ## Install package in development mode
+	@pip install -e . --no-build-isolation
+
+build: install-deps submodule ## Build and install wheel package
+	@rm -rf dist/* || true && CMAKE_POLICY_VERSION_MINIMUM=3.5 MAX_JOBS=$(nproc) CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) uv build --wheel -Cbuild-dir=build . --verbose --color=always --no-build-isolation && pip3 install dist/*whl --force-reinstall --no-deps
+
+clean: ## Remove build artifacts
+	@rm -rf build dist *.egg-info
+
+rebuild: clean submodule build ## Clean and rebuild the project
+	@echo "Succeed to rebuild"
+
+test: ## Run all tests
+	@find tests -name "test_*.py" | xargs -n 1 python3
+
+format: check-deps ## Format all source files
+	@echo "Formatting source files..."
+	@find csrc tests -name '*.cc' -o -name '*.cu' -o -name '*.cuh' -o -name '*.h' -o -name '*.hpp' | xargs clang-format -i
+	@find python tests -name '*.py' | xargs isort
+	@find python tests -name '*.py' | xargs black
+	@pre-commit run --all-files
+
+FILES_TO_UPDATE = python/sgl_kernel/version.py \
+                 pyproject.toml \
+                 pyproject_rocm.toml \
+                 pyproject_cpu.toml \
+                 ../docker/Dockerfile \
+                 ../.github/workflows/pr-test-pd-router.yml
+
+update: ## Update version numbers across project files. Usage: make update <new_version>
+	@if [ -z "$(filter-out $@,$(MAKECMDGOALS))" ]; then \
+		echo "Version required. Usage: make update <new_version>"; \
+		exit 1; \
+	fi
+	@OLD_VERSION=$$(grep "version" python/sgl_kernel/version.py | cut -d '"' -f2); \
+	NEW_VERSION=$(filter-out $@,$(MAKECMDGOALS)); \
+	echo "Updating version from $$OLD_VERSION to $$NEW_VERSION"; \
+	for file in $(FILES_TO_UPDATE); do \
+		if [ "$(shell uname)" = "Darwin" ]; then \
+			sed -i '' -e "s/$$OLD_VERSION/$$NEW_VERSION/g" $$file; \
+		else \
+			sed -i -e "s/$$OLD_VERSION/$$NEW_VERSION/g" $$file; \
+		fi \
+	done; \
+	echo "Version update complete"
+
+%:
+	@:
diff --git a/sgl-kernel/README.md b/sgl-kernel/README.md
new file mode 100644
index 000000000..47f3dea54
--- /dev/null
+++ b/sgl-kernel/README.md
@@ -0,0 +1,263 @@
+# SGL Kernel
+
+[Kernel Library](https://github.com/sgl-project/sglang/tree/main/sgl-kernel) for SGLang
+
+[![PyPI](https://img.shields.io/pypi/v/sgl-kernel)](https://pypi.org/project/sgl-kernel)
+
+## Installation
+For CUDA 12.1 and above:
+
+```bash
+pip3 install sgl-kernel
+```
+
+For CUDA 11.8:
+
+```bash
+pip3 install sgl-kernel -i https://docs.sglang.ai/whl/cu118
+```
+
+## Build from source
+
+Development build:
+
+```bash
+make build
+```
+
+Note:
+
+The `sgl-kernel` is rapidly evolving. If you experience a compilation failure, try using `make rebuild`.
+
+### Build with [ccache](https://github.com/ccache/ccache)
+```bash
+# or `yum install -y ccache`.
+apt-get install -y ccache
+# Building with ccache is enabled when ccache is installed and CCACHE_DIR is set.
+export CCACHE_DIR=/path/to/your/ccache/dir
+export CCACHE_BACKEND=""
+export CCACHE_KEEP_LOCAL_STORAGE="TRUE"
+unset CCACHE_READONLY
+python -m uv build --wheel -Cbuild-dir=build --color=always .
+```
+
+### Configuring CMake Build Options
+Cmake options can be configuring by adding `-Ccmake.define.<option>=<value>` to the `uv build` flags.
+For example, to enable building FP4 kernels, use:
+```bash
+python -m uv build --wheel -Cbuild-dir=build -Ccmake.define.SGL_KERNEL_ENABLE_FP4=1 --color=always .
+```
+See CMakeLists.txt for more options.
+
+### Parallel Build
+
+We highly recommend you build sgl-kernel with Ninja. Ninja can automatically build sgl-kernel in parallel.
+And if you build the sgl-kernel with cmake, you need to add `CMAKE_BUILD_PARALLEL_LEVEL` and limit the
+nvcc threads to a single thread by setting `SGL_KERNEL_COMPILE_THREADS=1` for parallel build like:
+
+```bash
+CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) python -m uv build --wheel -Cbuild-dir=build \
+-Ccmake.define.SGL_KERNEL_COMPILE_THREADS=1 --color=always .
+```
+
+### ⚠️ Compilation Issue with `sgl-kernel` and CUDA 12.6
+
+When compiling `sgl-kernel` with FlashAttention on a Hopper GPU using CUDA 12.6, you may encounter a segmentation fault:
+
+```bash
+kernel/build/_deps/repo-flash-attention-src/hopper/instantiations/flash_fwd_hdimall_bf16_paged_softcap_sm90.cu -o CMakeFiles/flash_ops.dir/_deps/repo-flash-attention-src/hopper/instantiations/flash_fwd_hdimall_bf16_paged_softcap_sm90.cu.o
+Segmentation fault (core dumped)
+```
+
+⚠️ **Note**: To ensure that FlashAttention compiles correctly on Hopper GPU Architecture(sm90), it is strongly [recommended](https://github.com/Dao-AILab/flash-attention/issues/1453) to use:
+- nvcc version: 12.6
+- ptxas version: 12.8
+
+**1. Check Current Versions**
+
+Before proceeding, verify your current CUDA tool versions:
+```bash
+nvcc --version
+ptxas --version
+```
+**2. Update ptxas to 12.8 (if needed)**
+
+1. Save the following script to a file (e.g., `update_ptxas.sh`).
+```bash
+#!/usr/bin/env bash
+# Source: https://github.com/Dao-AILab/flash-attention/blob/7ff1b621112ba8b538e2fc6a316f2a6b6f22e518/hopper/setup.py#L404
+set -ex
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 <CUDA_VERSION>"
+    exit 1
+fi
+
+CUDA_VERSION=$1
+
+if awk "BEGIN {exit !("$CUDA_VERSION" >= 12.6 && "$CUDA_VERSION" < 12.8)}"; then
+    NVCC_ARCHIVE_VERSION="12.8.93"
+    NVCC_ARCHIVE_NAME="cuda_nvcc-linux-x86_64-${NVCC_ARCHIVE_VERSION}-archive"
+    NVCC_ARCHIVE_TAR="${NVCC_ARCHIVE_NAME}.tar.xz"
+    NVCC_ARCHIVE_URL="https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/linux-x86_64/${NVCC_ARCHIVE_TAR}"
+
+    wget "$NVCC_ARCHIVE_URL"
+    tar -xf "$NVCC_ARCHIVE_TAR"
+
+    mkdir -p /usr/local/cuda/bin
+    cp "${NVCC_ARCHIVE_NAME}/bin/ptxas" /usr/local/cuda/bin/
+
+    # Clean up temporary files
+    rm -f "${NVCC_ARCHIVE_TAR}"
+    rm -rf "${NVCC_ARCHIVE_NAME}"
+fi
+```
+2. Run the script with your CUDA version as the argument, using `sudo`:
+```bash
+sudo bash update_ptxas.sh 12.6
+# Check the version
+ptxas --version
+```
+
+# Developer Guide
+
+## Development Environment Setup
+
+Use Docker to set up the development environment. See [Docker setup guide](https://github.com/sgl-project/sglang/blob/main/docs/developer_guide/development_guide_using_docker.md#setup-docker-container).
+
+Create and enter development container:
+```bash
+docker run -itd --shm-size 32g --gpus all -v $HOME/.cache:/root/.cache --ipc=host --name sglang_zhyncs lmsysorg/sglang:dev /bin/zsh
+docker exec -it sglang_zhyncs /bin/zsh
+```
+
+## Project Structure
+
+### Dependencies
+
+Third-party libraries:
+
+- [CUTLASS](https://github.com/NVIDIA/cutlass)
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer)
+- [DeepGEMM](https://github.com/deepseek-ai/DeepGEMM)
+- [FlashAttention](https://github.com/Dao-AILab/flash-attention)
+
+### FlashAttention FYI
+
+  FA3 can fail without a enough shared memory for a some shapes, such as higher hidden_dim or some special cases. Right now, fa3 is supported for sm80/sm87 and sm86/sm89.
+
+  The main different Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
+
+  And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
+
+### Kernel Development
+
+Steps to add a new kernel:
+
+1. Implement the kernel in [csrc](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc)
+2. Expose the interface in [include/sgl_kernel_ops.h](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/include/sgl_kernel_ops.h)
+3. Create torch extension in [csrc/common_extension.cc](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/csrc/common_extension.cc)
+4. Update [CMakeLists.txt](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/CMakeLists.txt) to include new CUDA source
+5. Expose Python interface in [python](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/python/sgl_kernel)
+
+### Development Tips
+
+1. When implementing kernels in [csrc](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/csrc), only define pure CUDA files and C++ interfaces. If you need to use `Torch::tensor`, use `<torch/all.h>` instead of `<torch/extension.h>`. Using `<torch/extension.h>` will cause compilation errors when using SABI.
+
+2. When creating torch extensions, add the function definition with `m.def`, and device binding with `m.impl`:
+- Using torch.compile need `m.def` with schema, it helps auto capture the custom kernel. Reference: [How to add FakeTensor](https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit?tab=t.0#heading=h.ptttacy8y1u9)
+
+- How to write schema: [Schema reference](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#func)
+
+   ```cpp
+   // We need def with schema here for torch.compile
+   m.def(
+    "bmm_fp8(Tensor A, Tensor B, Tensor! D, Tensor A_scale, Tensor B_scale, Tensor workspace_buffer, int "
+    "cublas_handle, int cuda_stream) -> ()");
+   m.impl("bmm_fp8", torch::kCUDA, &bmm_fp8);
+   ```
+
+3. When exposing Python interfaces, avoid using kwargs in C++ interface kernels.
+
+    **Avoid this:**
+
+    ```cpp
+    torch.ops.sgl_kernel.apply_rope_pos_ids_cos_sin_cache.default(
+        q=query.view(query.shape[0], -1, head_size),
+        k=key.view(key.shape[0], -1, head_size),
+        q_rope=query.view(query.shape[0], -1, head_size),
+        k_rope=key.view(key.shape[0], -1, head_size),
+        cos_sin_cache=cos_sin_cache,
+        pos_ids=positions.long(),
+        interleave=(not is_neox),
+        cuda_stream=get_cuda_stream(),
+    )
+    ```
+
+    **Use this instead:**
+
+    ```cpp
+    torch.ops.sgl_kernel.apply_rope_pos_ids_cos_sin_cache.default(
+        query.view(query.shape[0], -1, head_size),
+        key.view(key.shape[0], -1, head_size),
+        query.view(query.shape[0], -1, head_size),
+        key.view(key.shape[0], -1, head_size),
+        cos_sin_cache,
+        positions.long(),
+        (not is_neox),
+        get_cuda_stream(),
+    )
+    ```
+
+### Integrating Third-Party Libraries with Data Type Conversion
+
+When integrating new third-party libraries like flash-attention, you may encounter data type compatibility issues between the C++ interface and PyTorch bindings. For example, the third-party code might use `float` or `int` types, while PyTorch requires `double` and `int64_t`.
+
+> The reason we need `double` and `int64_t` in torch binding is that TORCH_LIBRARY handles the `Python-to-C++` conversion process. Python's `float` data type actually corresponds to `double` in C++, while Python's `int` corresponds to `int64_t` in C++.
+
+To address this issue, we provide the `make_pytorch_shim` function in [sgl_kernel_torch_shim](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/include/sgl_kernel_torch_shim.h) that handles data type conversions automatically.
+
+When you need to support new data type conversions, you can easily add conversion functions like this:
+
+```cpp
+// Map `int` -> `int64_t`
+template <>
+struct pytorch_library_compatible_type<int> {
+  using type = int64_t;
+  static int convert_from_type(int64_t arg) {
+    TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted  to int");
+    TORCH_CHECK(arg >= std::numeric_limits<int>::min(), "int64_t value is too small to be converted to int");
+    return arg;
+  }
+};
+```
+
+To use this with your library functions, simply wrap them with make_pytorch_shim:
+
+```cpp
+/*
+ * From flash-attention
+ */
+ m.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));
+```
+
+### Testing & Benchmarking
+
+1. Add pytest tests in [tests/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/tests), if you need to skip some test, please use `@pytest.mark.skipif`
+
+```python
+@pytest.mark.skipif(
+    skip_condition, reason="Nvfp4 Requires compute capability of 10 or above."
+)
+```
+
+2. Add benchmarks using [triton benchmark](https://triton-lang.org/main/python-api/generated/triton.testing.Benchmark.html) in [benchmark/](https://github.com/sgl-project/sglang/tree/main/sgl-kernel/benchmark)
+3. Run test suite
+
+### FAQ
+
+- When encountering this error while compiling using ccache: `ImportError: /usr/local/lib/python3.10/dist-packages/sgl_kernel/common_ops.abi3.so: undefined symbol: _ZN3c108ListType3getERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEENS_4Type24SingletonOrSharedTypePtrIS9_EE`, please modify the last command as follows to resolve it: `python3 -m uv build --wheel -Cbuild-dir=build . --color=always --no-build-isolation` .
+
+### Release new version
+
+Update version in [pyproject.toml](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/pyproject.toml) and [version.py](https://github.com/sgl-project/sglang/blob/main/sgl-kernel/python/sgl_kernel/version.py)
diff --git a/sgl-kernel/THIRDPARTYNOTICES.txt b/sgl-kernel/THIRDPARTYNOTICES.txt
new file mode 100644
index 000000000..2a81411ae
--- /dev/null
+++ b/sgl-kernel/THIRDPARTYNOTICES.txt
@@ -0,0 +1,488 @@
+Notice for flashinfer-ai/flashinfer
+-------------------------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+-------------------------------------------------------------------------------------------------
+Some of the code in this project are adapted from other open-source projects with different
+licenses. This product also bundles some third-party components under other open source licenses.
+This section summarizes those components and their licenses.
+See licenses/ for text of these licenses.
+
+BSD 3-Clause License
+--------------------
+
+include/flashinfer/attention/hopper/epilogue.cuh
+include/flashinfer/attention/hopper/mainloop.cuh
+include/flashinfer/attention/hopper/kernel_traits.cuh
+include/flashinfer/attention/hopper/named_barrier.cuh
+include/flashinfer/attention/hopper/tile_scheduler.cuh
+include/flashinfer/attention/hopper/utils.cuh
+
+BSD 3-Clause "New" License
+--------------------------
+
+3rdparty/cutlass
+include/flashinfer/attention/hopper/block_sparse_gather.cuh
+
+Notice for NVIDIA/TensorRT-LLM
+-------------------------------
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+Notice for deepseek-ai/DeepGEMM
+-------------------------------
+
+MIT License
+
+Copyright (c) 2025 DeepSeek
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+Notice for Dao-AILab/flash-attention
+-------------------------------
+
+BSD 3-Clause License
+
+Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/sgl-kernel/benchmark/bench_activation.py b/sgl-kernel/benchmark/bench_activation.py
new file mode 100644
index 000000000..cfea78915
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_activation.py
@@ -0,0 +1,153 @@
+# Benchmarks SGLang kernels versus vLLM across
+# (kernel, dtype, batch_size, seq_len, dim) and prints speed-up.
+import argparse
+import itertools
+import re
+from typing import List, Tuple
+
+import sgl_kernel
+import torch
+import torch.nn.functional as F
+import triton
+import triton.testing
+from sgl_kernel import gelu_quick  # activation-only kernel
+from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+from vllm import _custom_ops as vllm_ops
+
+if not hasattr(vllm_ops, "silu_and_mul"):
+    vllm_ops = torch.ops._C
+
+
+def str2int_list(arg: str) -> List[int]:
+    if arg in ("", None):
+        return []
+    if re.fullmatch(r"\d+(,\d+)*", arg.strip()) is None:
+        raise argparse.ArgumentTypeError(f"Bad int list: {arg}")
+    return [int(x) for x in arg.split(",")]
+
+
+def calculate_diff(
+    kernel: str, dtype: torch.dtype, batch_size: int, seq_len: int, dim: int
+) -> bool:
+    """Compare vLLM with SGLang for one shape."""
+    device = torch.device("cuda")
+
+    # activation-only quick GELU
+    if kernel == "gelu_quick":
+        x = torch.randn(batch_size, seq_len, dim, dtype=dtype, device=device)
+        ref_out = torch.zeros_like(x)
+        getattr(vllm_ops, kernel)(ref_out, x)
+        test_out = getattr(sgl_kernel, kernel)(x)
+    # fused activation x mul kernels
+    else:
+        x = torch.randn(batch_size, seq_len, 2 * dim, dtype=dtype, device=device)
+        ref_out = torch.zeros(batch_size, seq_len, dim, dtype=dtype, device=device)
+        getattr(vllm_ops, kernel)(ref_out, x)
+        test_out = getattr(sgl_kernel, kernel)(x)
+
+    ok = torch.allclose(ref_out, test_out, rtol=1e-3, atol=1e-5)
+    tag = "✅ match" if ok else "❌ mismatch"
+    print(
+        f"[{kernel:14s} | {str(dtype):9s} | B={batch_size:3d} | "
+        f"L={seq_len:3d} | D={dim:5d}] {tag}"
+    )
+    return ok
+
+
+kernels = ["silu_and_mul", "gelu_and_mul", "gelu_tanh_and_mul", "gelu_quick"]
+dtypes = [torch.float16, torch.bfloat16]
+
+
+def make_configs(bsizes: List[int], slens: List[int], dims_: List[int]) -> List[Tuple]:
+    return list(itertools.product(kernels, dtypes, bsizes, slens, dims_))
+
+
+default_batch_sizes = [2**i for i in range(0, 5, 2)]  # 1,4,16
+default_seq_lens = [2**i for i in range(0, 8, 2)]  # 1,4,16,64
+default_dims = [2**i for i in range(7, 15)]  # 128...16384
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["kernel", "dtype", "batch_size", "seq_len", "dim"],
+        x_vals=[],
+        line_arg="provider",
+        line_vals=["vllm", "sglang", "speedup"],
+        line_names=["vLLM", "SGL Kernel", "Speed-up (x)"],
+        styles=[("blue", "-"), ("green", "-"), ("red", "--")],
+        ylabel="µs (median)  or  × (speed-up)",
+        plot_name="activation-performance",
+        args={},
+    )
+)
+def benchmark(kernel, dtype, batch_size, seq_len, dim, provider):
+    device = torch.device("cuda")
+    in_mult = 1 if kernel == "gelu_quick" else 2
+    x = torch.randn(batch_size, seq_len, in_mult * dim, dtype=dtype, device=device)
+    y0 = torch.zeros(batch_size, seq_len, dim, dtype=dtype, device=device)
+
+    vllm_kernel = getattr(vllm_ops, kernel)
+    sglang_kernel = getattr(sgl_kernel, kernel)
+
+    def baseline():
+        tmp = y0.clone()
+        vllm_kernel(tmp, x)
+        return tmp
+
+    def sglang():
+        return sglang_kernel(x)
+
+    # one-time correctness check
+    if provider == "vllm" and not calculate_diff(
+        kernel, dtype, batch_size, seq_len, dim
+    ):
+        raise ValueError("Mismatch – abort benchmark")
+
+    # timing helper
+    def timed(fn):
+        for _ in range(5):
+            fn()
+        torch.cuda.synchronize()
+        ms, qmin, qmax = triton.testing.do_bench(fn, quantiles=[0.5, 0.2, 0.8])
+        return 1000 * ms, 1000 * qmax, 1000 * qmin
+
+    if provider == "vllm":
+        return timed(baseline)
+    if provider == "sglang":
+        return timed(sglang)
+
+    # provider == "speedup"
+    t_ref, _, _ = timed(baseline)
+    t_sgl, _, _ = timed(sglang)
+    spd = t_ref / t_sgl
+    return (spd, spd, spd)
+
+
+if __name__ == "__main__":
+    p = argparse.ArgumentParser("Activation kernel benchmark")
+    p.add_argument("--batch_sizes", type=str2int_list, default=default_batch_sizes)
+    p.add_argument("--seq_lens", type=str2int_list, default=default_seq_lens)
+    p.add_argument("--dims", type=str2int_list, default=default_dims)
+    p.add_argument("--verify_only", action="store_true")
+    args = p.parse_args()
+
+    # coerce lists
+    if isinstance(args.batch_sizes, str):
+        args.batch_sizes = str2int_list(args.batch_sizes)
+    if isinstance(args.seq_lens, str):
+        args.seq_lens = str2int_list(args.seq_lens)
+    if isinstance(args.dims, str):
+        args.dims = str2int_list(args.dims)
+
+    # patch perf_report grid
+    benchmark_grid = make_configs(args.batch_sizes, args.seq_lens, args.dims)
+    if hasattr(benchmark, "benchmarks"):
+        benchmark.benchmarks.x_vals = benchmark_grid
+    else:
+        benchmark.benchmark.x_vals = benchmark_grid
+
+    if args.verify_only:
+        ok = calculate_diff("gelu_quick", torch.float16, 1, 1, args.dims[0])
+        print("✅ sanity pass" if ok else "❌ mismatch")
+    else:
+        benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_awq_dequant.py b/sgl-kernel/benchmark/bench_awq_dequant.py
new file mode 100644
index 000000000..22280c250
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_awq_dequant.py
@@ -0,0 +1,118 @@
+import itertools
+from typing import List, Tuple
+
+import torch
+import triton
+import triton.testing
+from sgl_kernel import awq_dequantize
+from vllm import _custom_ops as ops
+
+
+def vllm_awq_dequantize(
+    qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
+
+
+def sglang_awq_dequantize(
+    qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+
+    return awq_dequantize(qweight, scales, qzeros)
+
+
+def calculate_diff(qweight_row: int, qweight_col: int):
+    """Calculate difference between VLLM and SGLang implementations."""
+    device = torch.device("cuda")
+    qweight = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (qweight_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+    group_size = qweight_row
+    scales_row = qweight_row // group_size
+    scales_col = qweight_col * 8
+    scales = torch.rand(scales_row, scales_col, dtype=torch.float16, device=device)
+    qzeros = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (scales_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+
+    vllm_out = vllm_awq_dequantize(qweight, scales, qzeros)
+    sglang_out = sglang_awq_dequantize(qweight, scales, qzeros)
+
+    output_diff = torch.abs(vllm_out.float() - sglang_out.float()).mean().item()
+
+    if torch.allclose(
+        vllm_out.to(torch.float32), sglang_out.to(torch.float32), rtol=1e-3, atol=1e-5
+    ):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+qweight_row_range = [3584, 18944, 128, 256, 512, 1024]
+qweight_cols_range = [448, 576, 4736, 16, 32, 64, 128]
+
+configs = list(itertools.product(qweight_row_range, qweight_cols_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["qweight_row", "qweight_col"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["vllm", "sglang"],
+        line_names=["VLLM", "SGL Kernel"],
+        styles=[("blue", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="awq-dequantize-performance",
+        args={},
+    )
+)
+def benchmark(qweight_row, qweight_col, provider):
+    dtype = torch.float16
+    device = torch.device("cuda")
+    qweight = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (qweight_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+    group_size = qweight_row
+    scales_row = qweight_row // group_size
+    scales_col = qweight_col * 8
+    scales = torch.rand(scales_row, scales_col, dtype=torch.float16, device=device)
+    qzeros = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (scales_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "vllm":
+        fn = lambda: vllm_awq_dequantize(
+            qweight.clone(), scales.clone(), qzeros.clone()
+        )
+    elif provider == "sglang":
+        fn = lambda: sglang_awq_dequantize(
+            qweight.clone(), scales.clone(), qzeros.clone()
+        )
+
+    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    calculate_diff(qweight_row=3584, qweight_col=448)
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_cutlass_mla.py b/sgl-kernel/benchmark/bench_cutlass_mla.py
new file mode 100644
index 000000000..785e51033
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_cutlass_mla.py
@@ -0,0 +1,145 @@
+import argparse
+import copy
+import itertools
+
+import torch
+import triton
+from sgl_kernel import cutlass_mla_decode, cutlass_mla_get_workspace_size
+
+bs_range = [1, 8, 32, 64, 128, 256]
+qlen_range = [1, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
+
+configs = list(itertools.product(bs_range, qlen_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size", "seq_len"],
+        x_vals=configs,
+        x_log=False,
+        line_arg="provider",
+        line_vals=[
+            "128 heads",
+            "64 heads",
+            "32 heads",
+            "16 heads",
+        ],
+        line_names=[
+            "128 heads",
+            "64 heads",
+            "32 heads",
+            "16 heads",
+        ],
+        styles=[("green", "-"), ("green", "--"), ("blue", "-"), ("blue", "--")],
+        ylabel="GB/s",
+        plot_name="cutlass mla",
+        args={},
+    )
+)
+def benchmark(batch_size, seq_len, provider, block_size, num_kv_splits):
+    d = 576
+    dn = 64
+    dv = 512
+
+    h_q_map = {
+        "128": 128,
+        "64": 64,
+        "32": 32,
+        "16": 16,
+    }
+    parsed_h_q = next(
+        (value for key, value in h_q_map.items() if key in provider), None
+    )
+
+    if parsed_h_q is None:
+        raise ValueError(f"Unknown head configuration in provider: {provider}")
+    h_q = parsed_h_q
+
+    seq_lens = torch.full((batch_size,), seq_len, dtype=torch.int32, device="cuda")
+    max_seq_len = seq_lens.max().item()
+    block_num = (max_seq_len + block_size - 1) // block_size
+
+    # Pad block_num so that small blocks can be packed into full 128-sized CUTLASS tiles.
+    # One 128-wide tile can hold (128 // block_size) small blocks.
+    pack_factor = 128 // block_size
+    block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
+
+    qn = (
+        torch.randn(h_q, batch_size, d - dn, dtype=torch.bfloat16, device="cuda")
+        * 100.0
+    )
+    qr = torch.randn(batch_size, h_q, dn, dtype=torch.bfloat16, device="cuda") * 100.0
+    block_table = torch.randint(
+        0,
+        batch_size * block_num,
+        (batch_size, block_num),
+        dtype=torch.int32,
+        device="cuda",
+    )
+
+    kv_cache = torch.randn(
+        block_table.numel(), block_size, d, dtype=torch.bfloat16, device="cuda"
+    )
+
+    workspace_size = cutlass_mla_get_workspace_size(
+        block_num * block_size, batch_size, num_kv_splits=num_kv_splits
+    )
+    workspace = torch.empty(workspace_size, device="cuda", dtype=torch.uint8)
+
+    quantiles = [0.5, 0.2, 0.8]
+    ms, min_ms, max_ms = triton.testing.do_bench(
+        lambda: cutlass_mla_decode(
+            qn.transpose(0, 1),
+            qr,
+            kv_cache,
+            seq_lens,
+            block_table,
+            workspace,
+            1.44,
+            num_kv_splits,
+        ),
+        quantiles=quantiles,
+    )
+
+    q_size = qn.numel() * qn.element_size() + qr.numel() * qr.element_size()
+
+    gbps = (
+        lambda ms: (
+            q_size + q_size * dv / d + kv_cache.numel() * kv_cache.element_size()
+        )
+        * 1e-9
+        / (ms * 1e-3)
+    )
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--block-sizes",
+        nargs="+",
+        type=int,
+        default=[1, 32, 64, 128],
+        help="List of batch sizes",
+    )
+    parser.add_argument(
+        "--num-kv-splits",
+        nargs="+",
+        type=int,
+        default=[-1],
+        help="List of batch sizes",
+    )
+    args = parser.parse_args()
+
+    for block_size in args.block_sizes:
+        for kv_split in args.num_kv_splits:
+            print(f"block_size={block_size}, num_kv_splits={kv_split}: ")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path="bench_blackwell_mla_res",
+                block_size=block_size,
+                num_kv_splits=kv_split,
+            )
+
+    print("Benchmark finished!")
diff --git a/sgl-kernel/benchmark/bench_dsv3_fused_a_gemm.py b/sgl-kernel/benchmark/bench_dsv3_fused_a_gemm.py
new file mode 100644
index 000000000..8c1e29980
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_dsv3_fused_a_gemm.py
@@ -0,0 +1,57 @@
+import argparse
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.testing
+from sgl_kernel import dsv3_fused_a_gemm
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens"],
+        x_vals=[i + 1 for i in range(16)],
+        x_log=False,
+        line_arg="impl",
+        line_vals=["torch", "sgl-kernel"],
+        line_names=["torch (bf16)", "dsv3_fused_a_gemm"],
+        styles=[("blue", "-"), ("orange", "-")],
+        ylabel="TFLOPs",
+        plot_name="bf16 dsv3 fused a GEMM throughput",
+        args={},
+    )
+)
+def benchmark(num_tokens, impl):
+    kHdIn = 7168
+    kHdOut = 2112
+    M, K, N = num_tokens, kHdIn, kHdOut
+
+    mat_a = torch.randn((M, K), dtype=torch.bfloat16, device="cuda").contiguous()
+    mat_b = torch.randn((N, K), dtype=torch.bfloat16, device="cuda").transpose(0, 1)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if impl == "torch":
+
+        def runner():
+            F.linear(mat_a, mat_b.T)
+
+    elif impl == "sgl-kernel":
+
+        def runner():
+            dsv3_fused_a_gemm(mat_a, mat_b)
+
+    ms, min_ms, max_ms = triton.testing.do_bench(runner, quantiles=quantiles)
+
+    def tflops(t_ms):
+        flops = 2 * M * K * N
+        return flops / (t_ms * 1e-3) / 1e12
+
+    return tflops(ms), tflops(max_ms), tflops(min_ms)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    args = parser.parse_args()
+
+    benchmark.run(print_data=True, show_plots=True, save_path="bench_dsv3_gemm")
diff --git a/sgl-kernel/benchmark/bench_dsv3_router_gemm.py b/sgl-kernel/benchmark/bench_dsv3_router_gemm.py
new file mode 100644
index 000000000..dee090e21
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_dsv3_router_gemm.py
@@ -0,0 +1,127 @@
+import argparse
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.testing
+from sgl_kernel import dsv3_router_gemm
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens"],
+        x_vals=[i + 1 for i in range(16)],
+        x_log=False,
+        line_arg="impl",
+        line_vals=["torch-256", "sgl-kernel-256", "torch-384", "sgl-kernel-384"],
+        line_names=[
+            "torch-256",
+            "dsv3_router_gemm-256",
+            "torch-384",
+            "dsv3_router_gemm-384",
+        ],
+        styles=[("blue", "-"), ("orange", "-"), ("green", "-"), ("red", "-")],
+        ylabel="TFLOPs",
+        plot_name="input-bf16-output-bf16 dsv3 router gemm throughput",
+        args={},
+    )
+)
+def benchmark_bf16_output(num_tokens, impl):
+    # M: num_tokens, K: hidden_dim, N: num_experts
+    M, K = num_tokens, 7168
+
+    if impl == "torch-256" or impl == "sgl-kernel-256":
+        N = 256
+    elif impl == "torch-384" or impl == "sgl-kernel-384":
+        N = 384
+    else:
+        raise ValueError(f"Unknown impl: {impl}")
+
+    mat_a = torch.randn((M, K), dtype=torch.bfloat16, device="cuda").contiguous()
+    mat_b = torch.randn((N, K), dtype=torch.bfloat16, device="cuda").contiguous()
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if impl == "torch-256" or impl == "torch-384":
+
+        def runner():
+            F.linear(mat_a, mat_b)
+
+    elif impl == "sgl-kernel-256" or impl == "sgl-kernel-384":
+
+        def runner():
+            dsv3_router_gemm(mat_a, mat_b, out_dtype=torch.bfloat16)
+
+    ms, min_ms, max_ms = triton.testing.do_bench(runner, quantiles=quantiles)
+
+    def tflops(t_ms):
+        flops = 2 * M * K * N
+        return flops / (t_ms * 1e-3) / 1e12
+
+    return tflops(ms), tflops(max_ms), tflops(min_ms)
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens"],
+        x_vals=[i + 1 for i in range(16)],
+        x_log=False,
+        line_arg="impl",
+        line_vals=["torch-256", "sgl-kernel-256", "torch-384", "sgl-kernel-384"],
+        line_names=[
+            "torch-256",
+            "dsv3_router_gemm-256",
+            "torch-384",
+            "dsv3_router_gemm-384",
+        ],
+        styles=[("blue", "-"), ("orange", "-"), ("green", "-"), ("red", "-")],
+        ylabel="TFLOPs",
+        plot_name="input-bf16-output-fp32 dsv3 router gemm throughput",
+        args={},
+    )
+)
+def benchmark_float_output(num_tokens, impl):
+    # M: num_tokens, K: hidden_dim, N: num_experts
+    M, K = num_tokens, 7168
+
+    if impl == "torch-256" or impl == "sgl-kernel-256":
+        N = 256
+    elif impl == "torch-384" or impl == "sgl-kernel-384":
+        N = 384
+    else:
+        raise ValueError(f"Unknown impl: {impl}")
+
+    mat_a = torch.randn((M, K), dtype=torch.bfloat16, device="cuda").contiguous()
+    mat_b = torch.randn((N, K), dtype=torch.bfloat16, device="cuda").contiguous()
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if impl == "torch-256" or impl == "torch-384":
+
+        def runner():
+            F.linear(mat_a, mat_b).to(torch.float32)
+
+    elif impl == "sgl-kernel-256" or impl == "sgl-kernel-384":
+
+        def runner():
+            dsv3_router_gemm(mat_a, mat_b, out_dtype=torch.float32)
+
+    ms, min_ms, max_ms = triton.testing.do_bench(runner, quantiles=quantiles)
+
+    def tflops(t_ms):
+        flops = 2 * M * K * N
+        return flops / (t_ms * 1e-3) / 1e12
+
+    return tflops(ms), tflops(max_ms), tflops(min_ms)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    args = parser.parse_args()
+
+    benchmark_bf16_output.run(
+        print_data=True, show_plots=True, save_path="bench_dsv3_router_gemm"
+    )
+    benchmark_float_output.run(
+        print_data=True, show_plots=True, save_path="bench_dsv3_router_gemm"
+    )
diff --git a/sgl-kernel/benchmark/bench_fp4_gemm.py b/sgl-kernel/benchmark/bench_fp4_gemm.py
new file mode 100755
index 000000000..80773eb07
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_fp4_gemm.py
@@ -0,0 +1,210 @@
+import argparse
+import copy
+import csv
+import itertools
+
+import pytest
+import torch
+import triton
+from flashinfer import mm_fp4
+from sgl_kernel import cutlass_scaled_fp4_mm, scaled_fp4_quant
+
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+
+def get_weight_shapes(args):
+    models_tps = args.tp_sizes
+
+    if models_tps == [4]:
+        return [[1024, 3584], [7168, 256], [7168, 2304], [9216, 3584]]
+
+    if models_tps == [8]:
+        return [[512, 3584], [7168, 128], [7168, 1152], [4608, 3584]]
+    return [
+        [1024, 3584],
+        [7168, 256],
+        [7168, 2304],
+        [9216, 3584],
+        [512, 3584],
+        [7168, 128],
+        [7168, 1152],
+        [4608, 3584],
+    ]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            2,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            3072,
+            4096,
+            8192,
+            16384,
+        ],
+        # x_vals = [64],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["cutlass", "cudnn", "trtllm"],
+        line_names=["baseline cutlass fp4", "cudnn fp4", "trtllm fp4"],
+        styles=[("red", "solid"), ("blue", "solid"), ("green", "solid")],
+        ylabel="latency (ms)",
+        plot_name="fp4_gemm_benchmark",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, dtype, correctness, csv_file):
+    M = batch_size
+    packed_k = K
+    K = 2 * packed_k
+    a_dtype = torch.randn((M, K), dtype=dtype, device="cuda")
+    b_dtype = torch.randn((N, K), dtype=dtype, device="cuda")
+    a_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
+    b_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
+
+    alpha = 1.0 / (a_global_scale * b_global_scale)
+    a_fp4, a_scale_interleaved = scaled_fp4_quant(a_dtype, a_global_scale)
+    # print("a_fp4", a_fp4)
+    b_fp4, b_scale_interleaved = scaled_fp4_quant(b_dtype, b_global_scale)
+    res_fi = torch.empty((M, N), dtype=dtype, device="cuda")
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "cutlass":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: cutlass_scaled_fp4_mm(
+                a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, alpha, dtype
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "cudnn":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: mm_fp4(
+                a_fp4,
+                b_fp4.T,
+                a_scale_interleaved,
+                b_scale_interleaved.T,
+                alpha,
+                dtype,
+                res_fi,
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "trtllm":
+        a_scale_interleaved = a_scale_interleaved.to(torch.uint8)
+        b_scale_interleaved = b_scale_interleaved.to(torch.uint8)
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: mm_fp4(
+                a_fp4,
+                b_fp4.T,
+                a_scale_interleaved,
+                b_scale_interleaved.T,
+                alpha,
+                dtype,
+                res_fi,
+                backend="trtllm",
+            ),
+            quantiles=quantiles,
+        )
+    if correctness:
+        res_cutlass = cutlass_scaled_fp4_mm(
+            a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, alpha, dtype
+        )
+        mm_fp4(
+            a_fp4,
+            b_fp4.T,
+            a_scale_interleaved,
+            b_scale_interleaved.T,
+            alpha,
+            dtype,
+            res_fi,
+            backend="cudnn",
+        )
+        assert torch.allclose(
+            res_fi, res_cutlass, atol=1e-3, rtol=1e-3
+        ), "cudnn fp4 doesn't match cutlass fp4"
+        mm_fp4(
+            a_fp4,
+            b_fp4.T,
+            a_scale_interleaved,
+            b_scale_interleaved.T,
+            alpha,
+            dtype,
+            res_fi,
+            backend="trtllm",
+        )
+        assert torch.allclose(
+            res_fi, res_cutlass, atol=1e-3, rtol=1e-3
+        ), "trtllm fp4 doesn't match cutlass fp4"
+
+    if csv_file:
+        with open(csv_file, "a", newline="") as f:
+            writer = csv.writer(f)
+            writer.writerow([provider, M, N, K, ms])
+
+    return ms, min_ms, max_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=torch.dtype,
+        default=torch.bfloat16,
+        help="Data type",
+    )
+    parser.add_argument(
+        "--correctness",
+        action="store_true",
+        help="Check correctness",
+    )
+    parser.add_argument(
+        "--csv",
+        type=str,
+        default="results_cutlass_cudnn.csv",
+        help="CSV file to save results",
+    )
+    args = parser.parse_args()
+
+    if args.csv:
+        with open(args.csv, "w", newline="") as f:
+            writer = csv.writer(f)
+            writer.writerow(["provider", "m", "n", "k", "time_ms"])
+
+    NKs = get_weight_shapes(args)
+    for N, K in NKs:
+        print(f"DeepSeek-R1-0528-FP4 N={N} K={K}: ")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path="bench_fp4_res",
+            N=N,
+            K=K,
+            dtype=args.dtype,
+            correctness=args.correctness,
+            csv_file=args.csv,
+        )
+
+    print("Benchmark finished!")
diff --git a/sgl-kernel/benchmark/bench_fp8_blockwise_gemm.py b/sgl-kernel/benchmark/bench_fp8_blockwise_gemm.py
new file mode 100644
index 000000000..ed0410298
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_fp8_blockwise_gemm.py
@@ -0,0 +1,183 @@
+import argparse
+import copy
+import itertools
+
+import deep_gemm
+import torch
+import triton
+from deep_gemm import get_col_major_tma_aligned_tensor
+from sgl_kernel import fp8_blockwise_scaled_mm
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+
+from sglang.srt.layers.quantization.fp8_kernel import (
+    w8a8_block_fp8_matmul_triton as w8a8_block_fp8_matmul,
+)
+
+
+def get_weight_shapes(args):
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3. Modify them, if you tune for another different model.
+    # cannot TP
+    total = [
+        (512 + 64, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (7168, 16384),
+        (7168, 18432),
+    ]
+    # N can TP
+    n_tp = [
+        (18432 * 2, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (24576, 1536),
+        (4096, 7168),
+    ]
+    # K can TP
+    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
+    # only support Deepseek-V3
+    SUPPORT_MODEL = ["deepseek-ai/DeepSeek-V3"]
+
+    weight_shapes = []
+    for model, tp_size in models_tps:
+        assert model in SUPPORT_MODEL
+        for t in total:
+            new_t = [t[0], t[1], model]
+            weight_shapes.append(new_t)
+        for n_t in n_tp:
+            new_t = [n_t[0] // tp_size, n_t[1], model]
+            weight_shapes.append(new_t)
+        for k_t in k_tp:
+            new_t = [k_t[0], k_t[1] // tp_size, model]
+            weight_shapes.append(new_t)
+    return weight_shapes
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def fp8_gemm_deepgemm(
+    x_fp8: torch.Tensor,
+    x_scale: torch.Tensor,
+    y_fp8: torch.Tensor,
+    y_scale: torch.Tensor,
+    m: int,
+    n: int,
+    k: int,
+):
+    """DeepGEMM implementation of FP8 GEMM"""
+    out = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
+
+    # Run DeepGEMM kernel
+    deep_gemm.gemm_fp8_fp8_bf16_nt((x_fp8, x_scale), (y_fp8, y_scale), out)
+    return out
+
+
+def scale_shape(shape, group_shape):
+    assert len(shape) == len(group_shape)
+    return tuple(cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["vllm", "sgl-kernel", "triton", "deepgemm"],
+        line_names=["vllm", "sgl-kernel", "sglang triton", "deepgemm"],
+        styles=[("blue", "-"), ("orange", "-"), ("red", "-"), ("yellow", "-")],
+        ylabel="GB/s",
+        plot_name="fp8 blockwise scaled matmul",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    a_fp32 = (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+    a_fp8 = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    b_fp32 = (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+    b_fp8 = b_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    scale_a_group_shape = (1, 128)
+    scale_b_group_shape = (128, 128)
+    scale_a_shape = scale_shape(a_fp8.shape, scale_a_group_shape)
+    scale_b_shape = scale_shape(b_fp8.shape, scale_b_group_shape)
+
+    scale_a = torch.randn(scale_a_shape, device="cuda", dtype=torch.float32)
+    scale_b = torch.randn(scale_b_shape, device="cuda", dtype=torch.float32)
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "sgl-kernel":
+        scale_a = scale_a.t().contiguous().t()
+        b_fp8, scale_b = b_fp8.t(), scale_b.t()
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: fp8_blockwise_scaled_mm(
+                a_fp8, b_fp8, scale_a, scale_b, torch.float16
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "vllm":
+        scale_a = scale_a.t().contiguous().t()
+        b_fp8, scale_b = b_fp8.t(), scale_b.t()
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: vllm_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, torch.float16),
+            quantiles=quantiles,
+        )
+    if provider == "triton":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: w8a8_block_fp8_matmul(
+                a_fp8, b_fp8, scale_a, scale_b, [128, 128], torch.float16
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "deepgemm":
+        scale_a_col_major = get_col_major_tma_aligned_tensor(scale_a.clone())
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: fp8_gemm_deepgemm(
+                a_fp8, scale_a_col_major, b_fp8, scale_b, M, N, K
+            ),
+            quantiles=quantiles,
+        )
+    return ms * 1000, max_ms * 1000, min_ms * 1000  # convert to ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["deepseek-ai/DeepSeek-V3"],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    NK_model_names = get_weight_shapes(args)
+    for N, K, model_name in NK_model_names:
+        if N % 128 != 0 or K % 128 != 0:
+            print(f"Skip {N=}, {K=} now")
+            continue
+        print(f"{model_name} N={N} K={K}: ")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path="bench_fp8_blockwise_res",
+            N=N,
+            K=K,
+        )
+
+    print("Benchmark finished!")
diff --git a/sgl-kernel/benchmark/bench_fp8_blockwise_group_gemm.py b/sgl-kernel/benchmark/bench_fp8_blockwise_group_gemm.py
new file mode 100644
index 000000000..6aa131244
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_fp8_blockwise_group_gemm.py
@@ -0,0 +1,328 @@
+import argparse
+import random
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import deep_gemm
+import torch
+from sgl_kernel import fp8_blockwise_scaled_grouped_mm
+
+
+def get_m_alignment_for_contiguous_layout():
+    return 128
+
+
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    pad_size = (128 - (n % 128)) % 128
+    x = torch.nn.functional.pad(x, (0, pad_size), value=0) if pad_size > 0 else x
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
+    return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
+
+
+def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def construct_contiguous_grouped(
+    num_groups: int, expected_m_per_group: int, k: int, n: int
+) -> Tuple[
+    int,
+    Tuple[torch.Tensor, torch.Tensor],
+    Tuple[torch.Tensor, torch.Tensor],
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+]:
+    alignment = get_m_alignment_for_contiguous_layout()
+    group_ms = [int(expected_m_per_group) for _ in range(num_groups)]
+    m = sum([ceil_div(x, alignment) * alignment for x in group_ms])
+
+    x = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    y = torch.randn((num_groups, n, k), device="cuda", dtype=torch.bfloat16)
+    m_indices = torch.empty(m, device="cuda", dtype=torch.int32)
+    out = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
+
+    start = 0
+    for i, group_m in enumerate(group_ms):
+        actual_end = start + group_m
+        aligned_end = start + ceil_div(group_m, alignment) * alignment
+        m_indices[start:actual_end] = i
+        m_indices[actual_end:aligned_end] = -1
+        start = aligned_end
+
+    assert m % 4 == 0, f"TMA alignment error: {m}"
+    x_fp8 = per_token_cast_to_fp8(x)
+    y_fp8 = (
+        torch.empty_like(y, dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (num_groups, ceil_div(n, 128), k // 128), device="cuda", dtype=torch.float
+        ),
+    )
+    for i in range(num_groups):
+        y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i])
+
+    return m, x_fp8, y_fp8, m_indices, out
+
+
+def bench_deepgemm(
+    expected_m_per_group: int,
+    n: int,
+    k: int,
+    num_groups: int,
+    num_warmup: int,
+    num_run: int,
+) -> Tuple[float, int]:
+    # construct tensors
+    m, x_fp8, y_fp8, m_indices, out = construct_contiguous_grouped(
+        num_groups, expected_m_per_group, k, n
+    )
+
+    def run_deepgemm():
+        deep_gemm.m_grouped_fp8_gemm_nt_contiguous(x_fp8, y_fp8, out, m_indices)
+
+    # warmup
+    for _ in range(num_warmup):
+        run_deepgemm()
+    torch.cuda.synchronize()
+
+    # run
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    latencies: list[float] = []
+    start_event.record()
+    for _ in range(num_run):
+        run_deepgemm()
+    end_event.record()
+    end_event.synchronize()
+    torch.cuda.synchronize()
+    avg = start_event.elapsed_time(end_event) / num_run * 1000  # us
+
+    return avg, m
+
+
+def bench_cutlass(
+    expected_m_per_group: int,
+    n: int,
+    k: int,
+    num_groups: int,
+    num_warmup: int,
+    num_run: int,
+) -> Tuple[float, int]:
+    device = "cuda"
+    alignment = 16
+    n_g = ceil_div(n, alignment) * alignment
+    k_g = ceil_div(k, alignment) * alignment
+    out_dtype = torch.bfloat16
+
+    expert_offsets = torch.zeros((num_groups + 1), device=device, dtype=torch.int32)
+    problem_sizes = torch.zeros((num_groups, 3), device=device, dtype=torch.int32)
+    layout_sfa = torch.zeros((num_groups, 5), device=device, dtype=torch.int32)
+    layout_sfb = torch.zeros((num_groups, 5), device=device, dtype=torch.int32)
+
+    a_tensors = []
+    b_tensors = []
+    a_scales_tensors = []
+    b_scales_tensors = []
+
+    # TODO(@TianQiLin666666): Unique group_ms in all bench function
+    group_ms = [
+        alignment * ceil_div(int(expected_m_per_group), alignment)
+        for _ in range(num_groups)
+    ]
+    for g in range(num_groups):
+        m_g = group_ms[g]
+        expert_offsets[g + 1] = expert_offsets[g] + m_g
+        problem_sizes[g][:] = torch.tensor([m_g, n_g, k_g], device=device)
+
+        a_g, a_scale = per_token_cast_to_fp8(torch.randn((m_g, k_g), device=device))
+        b_g, b_scale = per_block_cast_to_fp8(torch.randn((n_g, k_g), device=device).t())
+        a_tensors.append(a_g)
+        b_tensors.append(b_g)
+        a_scales_tensors.append(a_scale)
+        b_scales_tensors.append(b_scale)
+
+    a_stack = torch.empty(
+        (expert_offsets[-1], k_g), device=device, dtype=torch.float8_e4m3fn
+    )
+    b_stack = torch.empty(
+        (num_groups, n_g, k_g), device=device, dtype=torch.float8_e4m3fn
+    )
+
+    for g in range(num_groups):
+        a_stack[expert_offsets[g] : expert_offsets[g + 1]] = a_tensors[g]
+        b_stack[g] = b_tensors[g].t()
+    b_stack = b_stack.transpose(1, 2)
+
+    a_scale_stack = torch.empty(
+        (expert_offsets[-1], k_g // 128), device=device, dtype=torch.float32
+    )
+    b_scale_stack = torch.empty(
+        (num_groups, n_g // 128, k_g // 128), device=device, dtype=torch.float32
+    )
+
+    for g in range(num_groups):
+        a_scale_stack[expert_offsets[g] : expert_offsets[g + 1]] = a_scales_tensors[g]
+        b_scale_stack[g] = b_scales_tensors[g].t()
+    b_scale_stack = b_scale_stack.transpose(1, 2)
+
+    c_out = torch.empty((expert_offsets[-1], n_g), device=device, dtype=out_dtype)
+    a_strides = torch.full(
+        (num_groups,), a_stack.stride(0), device=device, dtype=torch.int64
+    )
+    c_strides = torch.full(
+        (num_groups,), c_out.stride(0), device=device, dtype=torch.int64
+    )
+    workspace = torch.empty((1024 * 1024 * 1024), device=device, dtype=torch.uint8)
+    a_ptrs = torch.empty((num_groups,), device=device, dtype=torch.int64)
+    b_ptrs = torch.empty((num_groups,), device=device, dtype=torch.int64)
+    out_ptrs = torch.empty((num_groups,), device=device, dtype=torch.int64)
+    a_scales_ptrs = torch.empty((num_groups,), device=device, dtype=torch.int64)
+    b_scales_ptrs = torch.empty((num_groups,), device=device, dtype=torch.int64)
+
+    def run_cutlass():
+        fp8_blockwise_scaled_grouped_mm(
+            c_out,
+            a_ptrs,
+            b_ptrs,
+            out_ptrs,
+            a_scales_ptrs,
+            b_scales_ptrs,
+            a_stack,
+            b_stack,
+            a_scale_stack,
+            b_scale_stack,
+            a_strides,
+            a_strides,
+            c_strides,
+            layout_sfa,
+            layout_sfb,
+            problem_sizes,
+            expert_offsets[:-1],
+            workspace,
+        )
+
+    # warmup
+    for _ in range(num_warmup):
+        run_cutlass()
+    torch.cuda.synchronize()
+
+    # run
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(num_run):
+        run_cutlass()
+    end_event.record()
+    end_event.synchronize()
+    torch.cuda.synchronize()
+    avg = start_event.elapsed_time(end_event) / num_run * 1000  # us
+
+    return avg, expert_offsets[-1]
+
+
+def bench_sglang_triton(
+    expected_m_per_group: int,
+    n: int,
+    k: int,
+    num_groups: int,
+    num_warmup: int,
+    num_run: int,
+) -> Tuple[float, int]:
+    pass
+
+
+benchmark_kernels = {
+    "deepgemm": bench_deepgemm,
+    "cutlass": bench_cutlass,
+    # "triton": bench_sglang_triton,
+}
+
+
+@dataclass
+class ShapeArg:
+    expected_m_per_group: int
+    n: int
+    k: int
+    num_groups: int
+
+
+def benchmark_one_shape(
+    shape_args: List[ShapeArg],
+    num_warmup: int,
+    num_run: int,
+):
+    for shape in shape_args:
+        print(
+            f"\nBenchmark: expected_m_per_group={shape.expected_m_per_group}, n={shape.n}, k={shape.k}, num_groups={shape.num_groups}"
+        )
+        for kernel_name, kernel_func in benchmark_kernels.items():
+            average_time, m = kernel_func(
+                shape.expected_m_per_group,
+                shape.n,
+                shape.k,
+                shape.num_groups,
+                num_warmup,
+                num_run,
+            )
+            print(f"{kernel_name}: {average_time} us")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-run", type=int, default=10)
+    shape_args = [
+        # Prefill, DeepSeek-R1, gateup, chunk_size = 4096, TP = 8
+        ShapeArg(expected_m_per_group=128, n=512, k=7168, num_groups=256),
+        # Prefill, DeepSeek-R1, gateup, chunk_size = 8192, TP = 8
+        ShapeArg(expected_m_per_group=256, n=512, k=7168, num_groups=256),
+        # Prefill, DeepSeek-R1, gateup, chunk_size = 8192, TP = 16
+        ShapeArg(expected_m_per_group=256, n=256, k=7168, num_groups=256),
+        # Prefill, DeepSeek-R1, gateup, chunk_size = 16384, TP = 16
+        ShapeArg(expected_m_per_group=512, n=256, k=7168, num_groups=256),
+        # Decode, DeepSeek-R1, gateup, bs = 32, TP = 8
+        ShapeArg(expected_m_per_group=1, n=512, k=7168, num_groups=256),
+        # Decode, DeepSeek-R1, gateup, bs = 64, TP = 16
+        ShapeArg(expected_m_per_group=2, n=256, k=7168, num_groups=256),
+        # Prefill, DeepSeek-R1, gateup, chunk_size = 8192, EP = 8
+        ShapeArg(expected_m_per_group=256, n=4096, k=7168, num_groups=32),
+        # Prefill, DeepSeek-R1, gateup, chunk_size = 16384, EP = 16
+        ShapeArg(expected_m_per_group=512, n=4096, k=7168, num_groups=16),
+        # Decode, DeepSeek-R1, gateup, bs = 128, EP = 8
+        ShapeArg(expected_m_per_group=4, n=4096, k=7168, num_groups=32),
+        # Decode, DeepSeek-R1, gateup, bs = 256, EP = 16
+        ShapeArg(expected_m_per_group=8, n=4096, k=7168, num_groups=16),
+        # Prefill, Qwen3-235B-A22B-FP8, gateup, chunk_size = 16384, TP = 4
+        ShapeArg(expected_m_per_group=1024, n=768, k=4096, num_groups=128),
+        # Prefill, Qwen3-235B-A22B-FP8, down, chunk_size = 16384, TP = 4
+        ShapeArg(expected_m_per_group=1024, n=4096, k=384, num_groups=128),
+        # Decode, Qwen3-235B-A22B-FP8, gateup, bs = 256, TP = 4
+        ShapeArg(expected_m_per_group=16, n=768, k=4096, num_groups=128),
+        # Decode, Qwen3-235B-A22B-FP8, down, bs = 256, TP = 4
+        ShapeArg(expected_m_per_group=16, n=4096, k=384, num_groups=128),
+    ]
+    args = parser.parse_args()
+    benchmark_one_shape(shape_args, args.num_warmup, args.num_run)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sgl-kernel/benchmark/bench_fp8_gemm.py b/sgl-kernel/benchmark/bench_fp8_gemm.py
new file mode 100644
index 000000000..5f16ca028
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_fp8_gemm.py
@@ -0,0 +1,184 @@
+import argparse
+import copy
+import itertools
+from typing import Optional, Tuple
+
+import torch
+import triton
+from sgl_kernel import fp8_scaled_mm as sgl_scaled_mm
+from sgl_kernel import sgl_per_tensor_quant_fp8
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
+
+# Weight Shapes are in the format
+# ([K, N], TP_SPLIT_DIM)
+# Example:
+#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
+#   - TP1 : K = 14336, N = 4096
+#   - TP2 : K = 7168, N = 4096
+#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
+#   - TP1 : K = 4096, N = 6144
+#   - TP4 : K = 4096, N = 1536
+
+# TP1 shapes
+WEIGHT_SHAPES = {
+    "meta-llama/Llama-3.1-8B-Instruct": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-3.3-70B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "mistralai/Mistral-Large-Instruct-2407": [
+        ([12288, 14336], 1),
+        ([12288, 12288], 0),
+        ([12288, 57344], 1),
+        ([28672, 12288], 0),
+    ],
+    "Qwen/Qwen2.5-7B-Instruct": [
+        ([3584, 4608], 1),
+        ([3584, 3584], 0),
+        ([3584, 37888], 1),
+        ([18944, 3584], 0),
+    ],
+    "Qwen/Qwen2.5-32B-Instruct": [
+        ([5120, 7168], 1),
+        ([5120, 5120], 0),
+        ([5120, 55296], 1),
+        ([27648, 5120], 0),
+    ],
+    "Qwen/Qwen2.5-72B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 59136], 1),
+        ([29568, 8192], 0),
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        ([2048, 3072], 1),
+        ([2048, 4096], 1),
+        ([2048, 2048], 0),
+        ([2048, 576], 0),
+        ([2048, 21888], 1),
+        ([10944, 2048], 0),
+        ([2048, 2816], 1),
+        ([1408, 2048], 0),
+    ],
+}
+
+
+def sglang_scaled_fp8_quant(
+    input: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    fp8_type_: torch.dtype = torch.float8_e4m3fn
+    output = torch.empty_like(input, device=input.device, dtype=fp8_type_)
+    is_static = True
+    if scale is None:
+        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+        is_static = False
+    sgl_per_tensor_quant_fp8(input, output, scale, is_static)
+
+    return output, scale
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048],
+        x_log=False,
+        line_arg="provider",
+        line_vals=[
+            "vllm-fp8-fp16",
+            "vllm-fp8-bf16",
+            "sglang-fp8-fp16",
+            "sglang-fp8-bf16",
+        ],
+        line_names=[
+            "vllm-fp8-fp16",
+            "vllm-fp8-bf16",
+            "sglang-fp8-fp16",
+            "sglang-fp8-bf16",
+        ],
+        styles=[("green", "-"), ("green", "--"), ("blue", "-"), ("blue", "--")],
+        ylabel="GB/s",
+        plot_name="fp8 scaled matmul",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    # M, N, K = batch_size, 4096, 8192
+    M = batch_size
+    a = torch.ones((M, K), device="cuda") * 5.0
+    b = torch.ones((N, K), device="cuda") * 5.0
+    scale_a = torch.randn((M,), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((N,), device="cuda", dtype=torch.float32)
+    quantiles = [0.5, 0.2, 0.8]
+
+    dtype = torch.float16 if "fp16" in provider else torch.bfloat16
+
+    if "vllm-fp8" in provider:
+        a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
+        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+        b_fp8 = b_fp8.t()
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype),
+            quantiles=quantiles,
+        )
+    elif "sglang-fp8" in provider:
+        a_fp8, scale_a_fp8 = sglang_scaled_fp8_quant(a, scale_a)
+        b_fp8, scale_b_fp8 = sglang_scaled_fp8_quant(b, scale_b)
+        b_fp8 = b_fp8.t()
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: sgl_scaled_mm(
+                a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype, bias=None
+            ),
+            quantiles=quantiles,
+        )
+
+    gbps = lambda ms: (2 * M * N * K + M * N) * a.element_size() * 1e-9 / (ms * 1e-3)
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        assert model in WEIGHT_SHAPES
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    KN_model_names = prepare_shapes(args)
+    for K, N, model_name in KN_model_names:
+        print(f"{model_name} N={N} K={K}: ")
+        benchmark.run(
+            print_data=True, show_plots=True, save_path="bench_fp8_res", N=N, K=K
+        )
+
+    print("Benchmark finished!")
diff --git a/sgl-kernel/benchmark/bench_int8_gemm.py b/sgl-kernel/benchmark/bench_int8_gemm.py
new file mode 100644
index 000000000..c5a709393
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_int8_gemm.py
@@ -0,0 +1,146 @@
+import argparse
+import copy
+import itertools
+
+import torch
+import triton
+from sgl_kernel import int8_scaled_mm
+from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+WEIGHT_SHAPES = {
+    "meta-llama/Llama-3.1-8B-Instruct": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-3.3-70B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "mistralai/Mistral-Large-Instruct-2407": [
+        ([12288, 14336], 1),
+        ([12288, 12288], 0),
+        ([12288, 57344], 1),
+        ([28672, 12288], 0),
+    ],
+    "Qwen/Qwen2.5-7B-Instruct": [
+        ([3584, 4608], 1),
+        ([3584, 3584], 0),
+        ([3584, 37888], 1),
+        ([18944, 3584], 0),
+    ],
+    "Qwen/Qwen2.5-32B-Instruct": [
+        ([5120, 7168], 1),
+        ([5120, 5120], 0),
+        ([5120, 55296], 1),
+        ([27648, 5120], 0),
+    ],
+    "Qwen/Qwen2.5-72B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 59136], 1),
+        ([29568, 8192], 0),
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        ([2048, 3072], 1),
+        ([2048, 4096], 1),
+        ([2048, 2048], 0),
+        ([2048, 576], 0),
+        ([2048, 21888], 1),
+        ([10944, 2048], 0),
+        ([2048, 2816], 1),
+        ([1408, 2048], 0),
+    ],
+}
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["vllm", "sgl-kernel"],
+        line_names=["vllm int8 gemm", "sgl-kernel int8 gemm"],
+        styles=[("blue", "-"), ("orange", "-")],
+        ylabel="GB/s",
+        plot_name="int8 scaled matmul",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    a = to_int8(torch.randn((M, K), device="cuda") * 5)
+    b = to_int8(torch.randn((N, K), device="cuda").t() * 5)
+    scale_a = torch.randn((M,), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((N,), device="cuda", dtype=torch.float32)
+    bias = torch.randn((N,), device="cuda", dtype=torch.float16)
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "sgl-kernel":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: int8_scaled_mm(a, b, scale_a, scale_b, torch.float16, bias),
+            quantiles=quantiles,
+        )
+    if provider == "vllm":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: vllm_scaled_mm(a, b, scale_a, scale_b, torch.float16, bias),
+            quantiles=quantiles,
+        )
+    gbps = (
+        lambda ms: (
+            (2 * M * N * K - M * N) * a.element_size()
+            + (3 * M * N) * scale_a.element_size()
+        )
+        * 1e-9
+        / (ms * 1e-3)
+    )
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        assert model in WEIGHT_SHAPES
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    KN_model_names = prepare_shapes(args)
+    for K, N, model_name in KN_model_names:
+        print(f"{model_name} N={N} K={K}: ")
+        benchmark.run(
+            print_data=True, show_plots=True, save_path="bench_int8_res", N=N, K=K
+        )
+
+    print("Benchmark finished!")
diff --git a/sgl-kernel/benchmark/bench_lightning_attention_decode.py b/sgl-kernel/benchmark/bench_lightning_attention_decode.py
new file mode 100644
index 000000000..36bdccac0
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_lightning_attention_decode.py
@@ -0,0 +1,299 @@
+import itertools
+import math
+
+import torch
+import triton
+import triton.language as tl
+from sgl_kernel import lightning_attention_decode
+
+
+def next_power_of_2(n):
+    return 2 ** (int(math.ceil(math.log(n, 2))))
+
+
+@triton.jit
+def _decode_kernel(
+    Q,
+    K,
+    V,
+    KV,
+    Out,
+    S,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n: tl.constexpr,
+    d: tl.constexpr,
+    d_original: tl.constexpr,
+    e: tl.constexpr,
+    e_original: tl.constexpr,
+):
+    off_bh = tl.program_id(0)
+    off_h = off_bh % h
+
+    qk_offset = off_bh * n * d
+    v_offset = off_bh * n * e
+    o_offset = off_bh * n * e
+    kv_offset = off_bh * d * e
+
+    s = tl.load(S + off_h)
+    ratio = tl.exp(-s)
+
+    d_idx = tl.arange(0, d)
+    e_idx = tl.arange(0, e)
+
+    # Create masks for original dimensions
+    d_mask = d_idx < d_original
+    e_mask = e_idx < e_original
+
+    # Load with masking
+    q = tl.load(Q + qk_offset + d_idx, mask=d_mask, other=0.0)
+    k = tl.load(K + qk_offset + d_idx, mask=d_mask, other=0.0)
+    v = tl.load(V + v_offset + e_idx, mask=e_mask, other=0.0)
+
+    # Load KV with 2D masking
+    kv = tl.load(
+        KV + kv_offset + d_idx[:, None] * e + e_idx[None, :],
+        mask=(d_mask[:, None] & e_mask[None, :]),
+        other=0.0,
+    )
+
+    # Compute outer product using element-wise operations
+    k_v_prod = k[:, None] * v[None, :]
+    kv = ratio * kv + k_v_prod
+
+    # Store KV with 2D masking
+    tl.store(
+        KV + kv_offset + d_idx[:, None] * e + e_idx[None, :],
+        kv.to(KV.dtype.element_ty),
+        mask=(d_mask[:, None] & e_mask[None, :]),
+    )
+
+    # Compute matrix-vector multiplication using element-wise operations and reduction
+    o = tl.sum(q[:, None] * kv, axis=0)
+
+    # Store output with masking
+    tl.store(Out + o_offset + e_idx, o.to(Out.dtype.element_ty), mask=e_mask)
+
+
+def triton_lightning_attn_decode(q, k, v, kv, s):
+    """Triton implementation of Lightning Attention decode operation"""
+    b, h, n, d = q.shape
+    e = v.shape[-1]
+    assert n == 1, "Sequence length must be 1 in decode mode"
+
+    # Get padded dimensions (power of 2)
+    d_padded = next_power_of_2(d)
+    e_padded = next_power_of_2(e)
+
+    # Create output tensor (padded)
+    o_padded = torch.empty(b, h, n, e_padded, dtype=v.dtype, device=v.device)
+
+    # Create padded tensors without actually padding the data
+    q_padded = torch.empty(b, h, n, d_padded, dtype=q.dtype, device=q.device)
+    k_padded = torch.empty(b, h, n, d_padded, dtype=k.dtype, device=k.device)
+    v_padded = torch.empty(b, h, n, e_padded, dtype=v.dtype, device=v.device)
+    kv_padded = torch.empty(
+        b, h, d_padded, e_padded, dtype=torch.float32, device=kv.device
+    )
+
+    # Copy data to padded tensors
+    q_padded[..., :d] = q
+    k_padded[..., :d] = k
+    v_padded[..., :e] = v
+    kv_padded[..., :d, :e] = kv
+
+    # Launch kernel
+    grid = (b * h, 1)
+    _decode_kernel[grid](
+        q_padded,
+        k_padded,
+        v_padded,
+        kv_padded,
+        o_padded,
+        s,
+        b=b,
+        h=h,
+        n=n,
+        d=d_padded,
+        d_original=d,
+        e=e_padded,
+        e_original=e,
+    )
+
+    # Get unpadded outputs
+    o = o_padded[..., :e]
+    kv_out = kv_padded[..., :d, :e]
+
+    return o, kv_out
+
+
+def lightning_attention_decode_naive(q, k, v, past_kv, slope):
+    """Naive implementation of lightning attention decode"""
+    original_dtype = q.dtype
+    ratio = torch.exp(-slope)  # [h, 1, 1]
+
+    kv = past_kv
+    b, h, n, d = q.shape
+
+    output = []
+    for i in range(n):
+        kv = ratio * kv.to(torch.float32) + torch.einsum(
+            "... n d, ... n e -> ... d e",
+            k[:, :, i : i + 1],
+            v[:, :, i : i + 1],
+        )
+        qkv = torch.einsum(
+            "... n e, ... e d -> ... n d",
+            q[:, :, i : i + 1].to(torch.float32),
+            kv.to(torch.float32),
+        )
+        output.append(qkv)
+    output = torch.cat(output, dim=-2)
+
+    return output.to(original_dtype), kv
+
+
+def lightning_attention_decode_kernel(q, k, v, past_kv, slope, output, new_kv):
+    return lightning_attention_decode(q, k, v, past_kv, slope, output, new_kv)
+
+
+def calculate_diff(batch_size):
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    num_heads = 64
+    head_dim = 96
+    seq_len = 1
+
+    q = torch.randn(
+        batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
+    )
+    k = torch.randn(
+        batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
+    )
+    v = torch.randn(
+        batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
+    )
+    past_kv = torch.randn(batch_size, num_heads, head_dim, head_dim, device=device)
+    slope = torch.randn(num_heads, 1, 1, device=device)
+
+    output_naive, new_kv_naive = lightning_attention_decode_naive(
+        q.clone(), k.clone(), v.clone(), past_kv.clone(), slope.clone()
+    )
+
+    output_kernel = torch.empty_like(output_naive)
+    new_kv_kernel = torch.empty_like(new_kv_naive)
+    lightning_attention_decode_kernel(
+        q.clone(),
+        k.clone(),
+        v.clone(),
+        past_kv.clone(),
+        slope.clone(),
+        output_kernel,
+        new_kv_kernel,
+    )
+
+    output_triton, new_kv_triton = triton_lightning_attn_decode(
+        q.clone(), k.clone(), v.clone(), past_kv.clone(), slope.clone()
+    )
+
+    if (
+        torch.allclose(output_naive, output_kernel, atol=1e-2, rtol=1e-2)
+        and torch.allclose(output_naive, output_triton, atol=1e-2, rtol=1e-2)
+        and torch.allclose(new_kv_naive, new_kv_kernel, atol=1e-2, rtol=1e-2)
+        and torch.allclose(new_kv_naive, new_kv_triton, atol=1e-2, rtol=1e-2)
+    ):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+batch_size_range = [i for i in range(1, 65)]  # 1 to 128
+configs = [(bs,) for bs in batch_size_range]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[list(_) for _ in configs],
+        line_arg="provider",
+        line_vals=["naive", "kernel", "triton"],
+        line_names=["PyTorch Naive", "SGL Kernel", "Triton"],
+        styles=[("blue", "-"), ("red", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="lightning-attention-decode-performance",
+        args={},
+    )
+)
+def benchmark(batch_size, provider):
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    num_heads = 64
+    head_dim = 96
+    seq_len = 1
+
+    q = torch.randn(
+        batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
+    )
+    k = torch.randn(
+        batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
+    )
+    v = torch.randn(
+        batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
+    )
+    past_kv = torch.randn(batch_size, num_heads, head_dim, head_dim, device=device)
+    slope = torch.randn(num_heads, 1, 1, device=device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "naive":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: lightning_attention_decode_naive(
+                q.clone(), k.clone(), v.clone(), past_kv.clone(), slope.clone()
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "kernel":
+        output = torch.empty(
+            batch_size, num_heads, seq_len, head_dim, device=device, dtype=dtype
+        )
+        new_kv = torch.empty(batch_size, num_heads, head_dim, head_dim, device=device)
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: lightning_attention_decode_kernel(
+                q.clone(),
+                k.clone(),
+                v.clone(),
+                past_kv.clone(),
+                slope.clone(),
+                output,
+                new_kv,
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "triton":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: triton_lightning_attn_decode(
+                q.clone(), k.clone(), v.clone(), past_kv.clone(), slope.clone()
+            ),
+            quantiles=quantiles,
+        )
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/lightning_attention_decode_sgl/",
+        help="Path to save lightning attention decode benchmark results",
+    )
+    args = parser.parse_args()
+
+    # Run correctness test
+    calculate_diff(batch_size=4)
+
+    # Run performance benchmark
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_moe_align_block_size.py b/sgl-kernel/benchmark/bench_moe_align_block_size.py
new file mode 100644
index 000000000..ed8a7b8f3
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py
@@ -0,0 +1,401 @@
+import argparse
+import itertools
+
+import torch
+import triton
+import triton.language as tl
+from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+try:
+    from vllm import _custom_ops as ops
+except ImportError:
+    ops = None
+
+USE_RANDOM_PERM = False
+
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+@triton.jit
+def moe_align_block_size_stage1(
+    topk_ids_ptr,
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_idx = pid * tokens_per_thread
+    off_c = (pid + 1) * num_experts
+
+    for i in range(tokens_per_thread):
+        if start_idx + i < numel:
+            idx = tl.load(topk_ids_ptr + start_idx + i)
+            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
+            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
+
+
+@triton.jit
+def moe_align_block_size_stage2(
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    last_cnt = 0
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
+        last_cnt = last_cnt + token_cnt
+        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
+
+
+@triton.jit
+def moe_align_block_size_stage3(
+    total_tokens_post_pad_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+):
+    last_cumsum = 0
+    off_cnt = num_experts * num_experts
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
+        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
+        tl.store(cumsum_ptr + i, last_cumsum)
+    tl.store(total_tokens_post_pad_ptr, last_cumsum)
+
+
+@triton.jit
+def moe_align_block_size_stage4(
+    topk_ids_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_idx = tl.load(cumsum_ptr + pid)
+    end_idx = tl.load(cumsum_ptr + pid + 1)
+
+    for i in range(start_idx, end_idx, block_size):
+        tl.store(expert_ids_ptr + i // block_size, pid)
+
+    start_idx = pid * tokens_per_thread
+    off_t = pid * num_experts
+
+    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread, numel)):
+        expert_id = tl.load(topk_ids_ptr + i)
+        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
+        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
+        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
+        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
+
+
+def moe_align_block_size_triton(
+    topk_ids: torch.Tensor,
+    num_experts: int,
+    block_size: int,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+) -> None:
+    numel = topk_ids.numel()
+    grid = (num_experts,)
+    tokens_cnts = torch.zeros(
+        (num_experts + 1, num_experts), dtype=torch.int32, device=topk_ids.device
+    )
+    cumsum = torch.zeros((num_experts + 1,), dtype=torch.int32, device=topk_ids.device)
+    tokens_per_thread = ceil_div(numel, num_experts)
+
+    moe_align_block_size_stage1[grid](
+        topk_ids,
+        tokens_cnts,
+        num_experts,
+        numel,
+        tokens_per_thread,
+    )
+    moe_align_block_size_stage2[grid](
+        tokens_cnts,
+        num_experts,
+    )
+    moe_align_block_size_stage3[(1,)](
+        num_tokens_post_pad,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+    )
+    moe_align_block_size_stage4[grid](
+        topk_ids,
+        sorted_token_ids,
+        expert_ids,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+        numel,
+        tokens_per_thread,
+    )
+
+
+def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
+    topk_ids = torch.stack(
+        [
+            torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
+            for _ in range(num_tokens)
+        ]
+    )
+
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids_cuda = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
+    sorted_ids_cuda.fill_(topk_ids.numel())
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids_cuda = torch.zeros(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad_cuda = torch.empty(
+        (1), dtype=torch.int32, device=topk_ids.device
+    )
+    cumsum_buffer = torch.zeros(
+        num_experts + 1, dtype=torch.int32, device=topk_ids.device
+    )
+
+    sorted_ids_triton = torch.empty_like(sorted_ids_cuda)
+    sorted_ids_triton.fill_(topk_ids.numel())
+    expert_ids_triton = torch.zeros_like(expert_ids_cuda)
+    num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda)
+
+    sorted_ids_vllm = torch.empty_like(sorted_ids_cuda)
+    sorted_ids_vllm.fill_(topk_ids.numel())
+    expert_ids_vllm = torch.zeros_like(expert_ids_cuda)
+    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_cuda)
+
+    # compare the performance of cuda, triton and vllm implementation
+    sgl_moe_align_block_size(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_cuda,
+        expert_ids_cuda,
+        num_tokens_post_pad_cuda,
+        cumsum_buffer,
+    )
+    moe_align_block_size_triton(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_triton,
+        expert_ids_triton,
+        num_tokens_post_pad_triton,
+    )
+
+    try:
+        ops.moe_align_block_size(
+            topk_ids,
+            num_experts,
+            block_size,
+            sorted_ids_vllm,
+            expert_ids_vllm,
+            num_tokens_post_pad_vllm,
+        )
+        print(f"✅ VLLM implementation works with {num_experts} experts!")
+        vllm_works = True
+    except Exception as e:
+        print(f"❌ VLLM implementation failed with {num_experts} experts: {e}")
+        vllm_works = False
+
+    if torch.allclose(expert_ids_cuda, expert_ids_triton) and torch.allclose(
+        num_tokens_post_pad_cuda, num_tokens_post_pad_triton
+    ):
+        print("✅ SGL and Triton implementations match")
+    else:
+        print("❌ SGL and Triton implementations do not match")
+        print("SGL expert_ids:", expert_ids_cuda)
+        print("Triton expert_ids:", expert_ids_triton)
+        print("SGL num_tokens_post_pad:", num_tokens_post_pad_cuda)
+        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
+
+    if (
+        vllm_works
+        and torch.allclose(expert_ids_cuda, expert_ids_vllm)
+        and torch.allclose(num_tokens_post_pad_cuda, num_tokens_post_pad_vllm)
+    ):
+        print("✅ SGL and VLLM implementations match")
+    else:
+        if not vllm_works:
+            print("⚠️ VLLM comparison skipped due to failure")
+        else:
+            print("❌ SGL and VLLM implementations do not match")
+            print("SGL expert_ids:", expert_ids_cuda)
+            print("VLLM expert_ids:", expert_ids_vllm)
+            print("SGL num_tokens_post_pad:", num_tokens_post_pad_cuda)
+            print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
+
+
+# Test range
+num_tokens_range = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
+num_experts_range = [8, 32, 64, 128, 256]
+topk_range = [1, 2, 4, 8]
+
+configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+
+
+def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
+    topk_ids = torch.zeros((num_tokens, topk), dtype=torch.int32, device="cuda")
+    for i in range(num_tokens):
+        topk_ids[i, :] = torch.randperm(num_experts, dtype=torch.int32, device="cuda")[
+            :topk
+        ]
+    return topk_ids
+
+
+def sgl_moe_align_block_size_with_empty(
+    topk_ids,
+    num_experts,
+    block_size,
+    sorted_ids,
+    expert_ids,
+    num_tokens_post_pad,
+    pad_sorted_token_ids=False,
+):
+    if not pad_sorted_token_ids:
+        sorted_ids.fill_(topk_ids.numel())
+
+    cumsum_buffer = torch.empty(
+        num_experts + 1, dtype=torch.int32, device=topk_ids.device
+    )
+
+    sgl_moe_align_block_size(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids.clone(),
+        expert_ids.clone(),
+        num_tokens_post_pad.clone(),
+        cumsum_buffer,
+        pad_sorted_token_ids,
+    )
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens", "num_experts", "topk"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["sgl", "sgl_fusion", "triton"],
+        line_names=["SGL", "SGL Fusion", "Triton"],
+        styles=[("blue", "-"), ("red", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="moe-align-block-size-performance",
+        args={},
+    )
+)
+def benchmark(num_tokens, num_experts, topk, provider):
+    block_size = 128
+
+    if USE_RANDOM_PERM:
+        topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+    else:
+        topk_ids = torch.randint(
+            0,
+            num_experts,
+            (num_tokens, topk),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids = torch.empty(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "sgl":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: sgl_moe_align_block_size_with_empty(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "sgl_fusion":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: sgl_moe_align_block_size_with_empty(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+                pad_sorted_token_ids=True,
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "triton":
+        sorted_ids.fill_(topk_ids.numel())
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: moe_align_block_size_triton(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids.clone(),
+                expert_ids.clone(),
+                num_tokens_post_pad.clone(),
+            ),
+            quantiles=quantiles,
+        )
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--save_path",
+        type=str,
+        default="./configs/benchmark_ops/moe_align_blocks/",
+        help="Path to save moe align benchmark results",
+    )
+    parser.add_argument(
+        "--num_experts",
+        type=int,
+        default=256,
+        choices=[8, 16, 32, 64, 128, 256],
+        help="Number of experts for benchmark",
+    )
+    parser.add_argument(
+        "--topk",
+        type=int,
+        default=8,
+        choices=[2, 4, 8],
+        help="Top-k value for benchmark",
+    )
+    parser.add_argument(
+        "--skip_full_benchmark",
+        action="store_true",
+        help="Only run the calculate_diff function, skip full benchmarking",
+    )
+    args = parser.parse_args()
+
+    calculate_diff(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
+
+    if not args.skip_full_benchmark:
+        print(f"\n📊 Running performance benchmark for {args.num_experts} experts...")
+        benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_moe_ep_post_reorder.py b/sgl-kernel/benchmark/bench_moe_ep_post_reorder.py
new file mode 100644
index 000000000..faadd7698
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_moe_ep_post_reorder.py
@@ -0,0 +1,75 @@
+import torch
+import triton
+
+from sglang.srt.layers.moe.ep_moe.kernels import post_reorder_triton_kernel
+
+batch_sizes = [64, 128, 256, 512, 640, 768, 1024, 2048, 4096]
+configs = [(bs,) for bs in batch_sizes]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[list(_) for _ in configs],
+        line_arg="provider",
+        line_vals=["triton"],
+        line_names=["Triton Kernel"],
+        styles=[("orange", "-")],
+        ylabel="us",
+        plot_name="ep-moe-post-reorder-performance",
+        args={},
+    )
+)
+def benchmark(batch_size, provider):
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    hidden_size, topk, start_expert_id, end_expert_id, block_size = 4096, 8, 0, 255, 512
+
+    def alloc_tensors():
+        down_output = torch.randn(
+            batch_size * topk, hidden_size, dtype=dtype, device=device
+        )
+        output = torch.zeros(batch_size, hidden_size, dtype=dtype, device=device)
+        src2dst = torch.randint(
+            0, batch_size * topk, (batch_size, topk), dtype=torch.int32, device=device
+        )
+        topk_ids = torch.randint(
+            start_expert_id,
+            end_expert_id + 1,
+            (batch_size, topk),
+            dtype=torch.int32,
+            device=device,
+        )
+        topk_weights = torch.rand(batch_size, topk, dtype=dtype, device=device)
+        return down_output, output, src2dst, topk_ids, topk_weights
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "triton":
+        d_out, out, s2d, tk_ids, tk_weights = alloc_tensors()
+
+        def run_triton():
+            post_reorder_triton_kernel[(batch_size,)](
+                d_out.view(-1),
+                out.view(-1),
+                s2d.view(-1),
+                tk_ids.view(-1),
+                tk_weights.view(-1),
+                start_expert_id,
+                end_expert_id,
+                topk,
+                hidden_size,
+                0,
+                block_size,
+            )
+
+        ms, min_ms, max_ms = triton.testing.do_bench(run_triton, quantiles=quantiles)
+
+    else:
+        raise ValueError(f"Unknown provider: {provider}")
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_moe_fused_gate.py b/sgl-kernel/benchmark/bench_moe_fused_gate.py
new file mode 100644
index 000000000..36cc9c498
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_moe_fused_gate.py
@@ -0,0 +1,77 @@
+import itertools
+import math
+
+import torch
+import triton
+import triton.language as tl
+from sgl_kernel import moe_fused_gate
+
+from sglang.srt.layers.moe.topk import biased_grouped_topk
+
+
+def biased_grouped_topk_org(scores, bias, num_expert_group, topk_group, topk):
+    return biased_grouped_topk(
+        scores,
+        scores,
+        bias,
+        topk=topk,
+        renormalize=True,
+        num_expert_group=num_expert_group,
+        topk_group=topk_group,
+        routed_scaling_factor=2.5,  # DeepSeek-R1 : 2.5, Kimi K2: 2.872
+    )
+
+
+def biased_grouped_topk_org_fuse_kernel(
+    scores, bias, num_expert_group, topk_group, topk
+):
+    return moe_fused_gate(scores, bias, num_expert_group, topk_group, topk)
+
+
+seq_length_range = [5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000]
+configs = [(sq,) for sq in seq_length_range]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["seq_length"],
+        x_vals=[list(_) for _ in configs],
+        line_arg="provider",
+        line_vals=["original", "kernel"],
+        line_names=["Original", "SGL Kernel"],
+        styles=[("blue", "-"), ("red", "-")],
+        ylabel="us",
+        plot_name="moe-fused-gate-performance",
+        args={},
+    )
+)
+def benchmark(seq_length, provider):
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    num_experts, num_expert_group, topk_group, topk = 256, 8, 4, 8
+
+    scores = torch.randn((seq_length, num_experts), device=device, dtype=dtype)
+    bias = torch.rand(num_experts, device=device, dtype=dtype)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "original":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: biased_grouped_topk_org(
+                scores.clone(), bias.clone(), num_expert_group, topk_group, topk
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "kernel":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: biased_grouped_topk_org_fuse_kernel(
+                scores.clone(), bias.clone(), num_expert_group, topk_group, topk
+            ),
+            quantiles=quantiles,
+        )
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_moe_topk_softmax.py b/sgl-kernel/benchmark/bench_moe_topk_softmax.py
new file mode 100644
index 000000000..1d3e3e93f
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_moe_topk_softmax.py
@@ -0,0 +1,116 @@
+import itertools
+
+import pytest
+import torch
+import triton
+from sgl_kernel import topk_softmax
+from vllm import _custom_ops as vllm_custom_ops
+
+
+def vllm_topk_softmax(gating_output, topk):
+    num_tokens, num_experts = gating_output.shape
+
+    topk_weights = torch.empty(
+        (num_tokens, topk), device=gating_output.device, dtype=torch.float32
+    )
+    topk_indices = torch.empty(
+        (num_tokens, topk), dtype=torch.int32, device=gating_output.device
+    )
+    token_expert_indices = torch.empty(
+        (num_tokens, topk), dtype=torch.int32, device=gating_output.device
+    )
+    torch.ops._moe_C.topk_softmax(
+        topk_weights, topk_indices, token_expert_indices, gating_output
+    )
+    return topk_weights, topk_indices
+
+
+def sglang_topk_softmax(gating_output, topk):
+    num_tokens, num_experts = gating_output.shape
+
+    topk_weights = torch.empty(
+        (num_tokens, topk), device=gating_output.device, dtype=torch.float32
+    )
+    topk_indices = torch.empty(
+        (num_tokens, topk), dtype=torch.int32, device=gating_output.device
+    )
+
+    topk_softmax(
+        topk_weights=topk_weights,
+        topk_ids=topk_indices,
+        gating_output=gating_output,
+    )
+
+    return topk_weights, topk_indices
+
+
+def calculate_diff(num_tokens, num_experts, topk):
+    gating_output = torch.randn(
+        (num_tokens, num_experts), device="cuda", dtype=torch.float32
+    )
+    weights_vllm, indices_vllm = vllm_topk_softmax(gating_output.clone(), topk)
+    weights_sglang, indices_sglang = sglang_topk_softmax(gating_output.clone(), topk)
+
+    weights_diff = torch.abs(weights_vllm - weights_sglang).mean().item()
+    indices_match = torch.equal(indices_vllm, indices_sglang)
+
+    if (
+        torch.allclose(weights_vllm, weights_sglang, atol=1e-3, rtol=1e-3)
+        and indices_match
+    ):
+        print("✅ VLLM and SGLang topk_softmax implementations match")
+    else:
+        print(
+            f"❌ Implementations differ: Weights diff={weights_diff}, Indices match={indices_match}"
+        )
+
+
+num_tokens_range = [128, 512, 1024, 2048, 4096, 8192, 16384, 32768]
+num_experts_range = [32, 64, 128, 256, 12, 512]
+topk_range = [1, 2, 4, 8]
+
+configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens", "num_experts", "topk"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["sglang", "vllm"],
+        line_names=["SGLang", "VLLM"],
+        styles=[("blue", "-"), ("green", "-")],
+        ylabel="Latency (us)",
+        plot_name="topk-softmax-performance",
+        args={},
+    )
+)
+def benchmark(num_tokens, num_experts, topk, provider):
+
+    gating_output = torch.randn(
+        (num_tokens, num_experts), device="cuda", dtype=torch.float32
+    )
+
+    if provider == "vllm" or provider == "vllm1":
+        fn = lambda: vllm_topk_softmax(gating_output, topk)
+    elif provider == "sglang" or provider == "sglang1":
+        fn = lambda: sglang_topk_softmax(gating_output, topk)
+
+    quantiles = [0.5, 0.2, 0.8]
+    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    configs = [
+        (20, 256, 4),
+        (20, 256, 8),
+        (20, 12, 4),
+        (20, 12, 1),
+        (20, 512, 4),
+        (20, 512, 1),
+    ]
+    for num_tokens, num_experts, topk in configs:
+        calculate_diff(num_tokens, num_experts, topk)
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_nvfp4_scaled_gemm.py b/sgl-kernel/benchmark/bench_nvfp4_scaled_gemm.py
new file mode 100644
index 000000000..44498a3b4
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_nvfp4_scaled_gemm.py
@@ -0,0 +1,172 @@
+import argparse
+import copy
+import itertools
+
+import torch
+import triton
+from sgl_kernel import cutlass_scaled_fp4_mm, scaled_fp4_quant
+
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+# Weight Shapes are in the format
+# ([K, N], TP_SPLIT_DIM)
+# Example:
+#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
+#   - TP1 : K = 14336, N = 4096
+#   - TP2 : K = 7168, N = 4096
+#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
+#   - TP1 : K = 4096, N = 6144
+#   - TP4 : K = 4096, N = 1536
+
+# TP1 shapes
+WEIGHT_SHAPES = {
+    "meta-llama/Llama-3.1-8B-Instruct": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-3.3-70B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "mistralai/Mistral-Large-Instruct-2407": [
+        ([12288, 14336], 1),
+        ([12288, 12288], 0),
+        ([12288, 57344], 1),
+        ([28672, 12288], 0),
+    ],
+    "Qwen/Qwen2.5-7B-Instruct": [
+        ([3584, 4608], 1),
+        ([3584, 3584], 0),
+        ([3584, 37888], 1),
+        ([18944, 3584], 0),
+    ],
+    "Qwen/Qwen2.5-32B-Instruct": [
+        ([5120, 7168], 1),
+        ([5120, 5120], 0),
+        ([5120, 55296], 1),
+        ([27648, 5120], 0),
+    ],
+    "Qwen/Qwen2.5-72B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 59136], 1),
+        ([29568, 8192], 0),
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        ([2048, 3072], 1),
+        ([2048, 4096], 1),
+        ([2048, 2048], 0),
+        ([2048, 576], 0),
+        ([2048, 21888], 1),
+        ([10944, 2048], 0),
+        ([2048, 2816], 1),
+        ([1408, 2048], 0),
+    ],
+}
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048],
+        x_log=False,
+        line_arg="provider",
+        line_vals=[
+            "sglang-fp4-fp16",
+            "sglang-fp4-bf16",
+        ],
+        line_names=[
+            "sglang-fp4-fp16",
+            "sglang-fp4-bf16",
+        ],
+        styles=[("green", "-"), ("blue", "-")],
+        ylabel="TFLOPS",
+        plot_name="fp4 block scaled matmul",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    # M, N, K = batch_size, 4096, 8192
+    run_step = 100
+    dtype = torch.float16 if "fp16" in provider else torch.bfloat16
+    M = batch_size
+    a = torch.randn((M, K), dtype=dtype, device="cuda")
+    b = torch.randn((N, K), dtype=dtype, device="cuda")
+    a_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a.flatten(), dim=-1)
+    ).to(torch.float32)
+    b_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b.flatten(), dim=-1)
+    ).to(torch.float32)
+    alpha = 1.0 / (a_global_scale * b_global_scale)
+    a_fp4, a_scale_interleaved = scaled_fp4_quant(a, a_global_scale)
+    b_fp4, b_scale_interleaved = scaled_fp4_quant(b, b_global_scale)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    # Bridging the gap between CPU and GPU
+    for _ in range(25):
+        c = a @ b.t()
+    # Warmup
+    for _ in range(5):
+        cutlass_scaled_fp4_mm(
+            a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, alpha, dtype
+        )
+    start_event.record()
+    for _ in range(run_step):
+        cutlass_scaled_fp4_mm(
+            a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, alpha, dtype
+        )
+    end_event.record()
+    end_event.synchronize()
+    torch.cuda.synchronize()
+    ms = start_event.elapsed_time(end_event) / run_step
+
+    tflops = lambda ms: (2 * M * N * K) * 1e-9 / ms
+    return tflops(ms)
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        assert model in WEIGHT_SHAPES
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    KN_model_names = prepare_shapes(args)
+    for K, N, model_name in KN_model_names:
+        print(f"{model_name} N={N} K={K}: ")
+        benchmark.run(
+            print_data=True, show_plots=True, save_path="bench_fp4_res", N=N, K=K
+        )
+
+    print("Benchmark finished!")
diff --git a/sgl-kernel/benchmark/bench_per_tensor_quant_fp8.py b/sgl-kernel/benchmark/bench_per_tensor_quant_fp8.py
new file mode 100644
index 000000000..8bc7d1e01
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_per_tensor_quant_fp8.py
@@ -0,0 +1,98 @@
+import itertools
+import math
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import triton
+import triton.testing
+from sgl_kernel import sgl_per_tensor_quant_fp8
+from vllm import _custom_ops as ops
+
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+
+
+def vllm_scaled_fp8_quant(
+    input: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return ops.scaled_fp8_quant(input, scale)
+
+
+def sglang_scaled_fp8_quant(
+    input: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    fp8_type_: torch.dtype = torch.float8_e4m3fn
+    output = torch.empty_like(input, device=input.device, dtype=fp8_type_)
+    is_static = True
+    if scale is None:
+        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+        is_static = False
+    sgl_per_tensor_quant_fp8(input, output, scale, is_static)
+
+    return output, scale
+
+
+def calculate_diff(batch_size: int, seq_len: int):
+    """Calculate difference between VLLM and SGLang implementations."""
+    device = torch.device("cuda")
+    x = torch.rand((batch_size, seq_len), dtype=torch.float16, device=device)
+
+    vllm_out, vllm_scale = vllm_scaled_fp8_quant(x)
+    sglang_out, sglang_scale = sglang_scaled_fp8_quant(x)
+
+    scale_diff = torch.abs(vllm_scale - sglang_scale).item()
+    output_diff = torch.abs(vllm_out.float() - sglang_out.float()).mean().item()
+
+    if torch.allclose(
+        vllm_out.to(torch.float32), sglang_out.to(torch.float32), rtol=1e-3, atol=1e-5
+    ) and torch.allclose(vllm_scale, sglang_scale, rtol=1e-3, atol=1e-5):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+batch_size_range = [16, 32, 64, 128]
+seq_len_range = [64, 128, 256, 512, 1024, 2048]
+
+configs = list(itertools.product(batch_size_range, seq_len_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size", "seq_len"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["vllm", "sglang"],
+        line_names=["VLLM", "SGL Kernel"],
+        styles=[("blue", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="per-tensor-quant-fp8-performance",
+        args={},
+    )
+)
+def benchmark(batch_size, seq_len, provider):
+    dtype = torch.float16
+    device = torch.device("cuda")
+
+    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "vllm":
+        fn = lambda: vllm_scaled_fp8_quant(x.clone())
+    elif provider == "sglang":
+        fn = lambda: sglang_scaled_fp8_quant(x.clone())
+
+    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    calculate_diff(batch_size=4, seq_len=4096)
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py b/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py
new file mode 100644
index 000000000..3f37a3248
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py
@@ -0,0 +1,98 @@
+import itertools
+import time
+from functools import partial
+from pathlib import Path
+
+import torch
+import triton
+
+from sglang.srt.bench_utils import bench_kineto
+from sglang.srt.layers.quantization.fp8_kernel import (
+    create_per_token_group_quant_fp8_output_scale,
+)
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_8bit as triton_per_token_group_quant_8bit,
+)
+from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_8bit
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+
+
+num_tokens_range = [1, 4, 16, 64, 256, 768, 2048, 8192, 16384]
+hidden_dim_range = [1536, 7168, 18432]  # For DeepSeek V3/R1
+group_size_range = [128]  # For DeepSeek V3/R1
+# TODO test int8
+dst_dtype_range = [fp8_type_]
+flags_range = [
+    dict(
+        column_major_scales=False,
+        scale_tma_aligned=False,
+        scale_ue8m0=False,
+    ),
+    dict(
+        column_major_scales=True,
+        scale_tma_aligned=False,
+        scale_ue8m0=False,
+    ),
+    dict(
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=False,
+    ),
+    dict(
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=True,
+    ),
+]
+
+
+configs = list(
+    itertools.product(
+        num_tokens_range,
+        hidden_dim_range,
+        group_size_range,
+        dst_dtype_range,
+        flags_range,
+    )
+)
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens", "hidden_dim", "group_size", "dst_dtype", "flags"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["triton", "sglang"],
+        line_names=["Triton", "SGL Kernel"],
+        styles=[("blue", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="per-token-group-quant-8bit-performance",
+        args={},
+    )
+)
+def benchmark(num_tokens, hidden_dim, group_size, dst_dtype, flags, provider):
+    if flags["scale_ue8m0"] and group_size != 128:
+        return
+
+    device = torch.device("cuda")
+
+    x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16)
+
+    fn, kernel_names = {
+        "triton": (triton_per_token_group_quant_8bit, "_per_token_group_quant_fp8"),
+        "sglang": (
+            sglang_per_token_group_quant_8bit,
+            "per_token_group_quant_8bit_kernel",
+        ),
+    }[provider]
+    bench_fn = lambda: fn(x=x, group_size=group_size, dst_dtype=dst_dtype, **flags)
+
+    time_s = bench_kineto(bench_fn, kernel_names=kernel_names)
+    return time_s * 1e6
+
+
+if __name__ == "__main__":
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_per_token_quant_fp8.py b/sgl-kernel/benchmark/bench_per_token_quant_fp8.py
new file mode 100644
index 000000000..a72a1a3d0
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_per_token_quant_fp8.py
@@ -0,0 +1,177 @@
+import itertools
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.testing
+from sgl_kernel import sgl_per_token_quant_fp8
+from vllm import _custom_ops as ops
+
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+
+# Get correct FP8 E4M3 maximum value
+if _is_hip:
+    FP8_E4M3_MAX = 224.0  # ROCM uses 224.0
+else:
+    # For CUDA, get the actual max value from the type
+    FP8_E4M3_MAX = float(torch.finfo(fp8_type_).max)
+
+
+def torch_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Pure PyTorch reference implementation for per-token FP8 quantization."""
+    device = input.device
+    dtype = input.dtype
+
+    # Find max absolute value per token (row) - exactly like CUDA kernel
+    max_vals = torch.abs(input).max(dim=1)[0]  # [num_tokens]
+
+    # Calculate scale per token - exactly like CUDA kernel: scale = max_value / FP8_E4M3_MAX
+    scales = max_vals / FP8_E4M3_MAX  # [num_tokens]
+
+    # No special zero handling - directly compute 1.0 / scale like CUDA kernel
+    scale_inv = 1.0 / scales  # [num_tokens]
+
+    # Quantize: input * scale_inv, then clamp to FP8 range
+    quantized_float = input * scale_inv.unsqueeze(1)  # Broadcast scale_inv
+    quantized_float = torch.clamp(quantized_float, -FP8_E4M3_MAX, FP8_E4M3_MAX)
+
+    # Convert to FP8 - use more explicit conversion
+    quantized_fp8 = quantized_float.to(fp8_type_)
+
+    return quantized_fp8, scales
+
+
+def vllm_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return ops.scaled_fp8_quant(input, use_per_token_if_dynamic=True)
+
+
+def sglang_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    scale = torch.zeros(input.size(0), device=input.device, dtype=torch.float32)
+    output = torch.empty_like(input, device=input.device, dtype=fp8_type_)
+    sgl_per_token_quant_fp8(input, output, scale)
+
+    return output, scale
+
+
+def calculate_diff(batch_size: int, seq_len: int, hidden_dim: int):
+    """Compare Torch reference, VLLM, and SGLang implementations."""
+    device = torch.device("cuda")
+    x = torch.rand(
+        (batch_size * seq_len, hidden_dim), dtype=torch.float16, device=device
+    )
+
+    # Get all three implementations
+    torch_out, torch_scale = torch_per_token_quant_fp8(x)
+    vllm_out, vllm_scale = vllm_per_token_quant_fp8(x)
+    sglang_out, sglang_scale = sglang_per_token_quant_fp8(x)
+
+    print(f"\n=== Comparison for hidden_dim={hidden_dim} ===")
+
+    # Compare scales
+    torch_vllm_scale_diff = torch.abs(torch_scale - vllm_scale).mean().item()
+    torch_sglang_scale_diff = torch.abs(torch_scale - sglang_scale).mean().item()
+    vllm_sglang_scale_diff = torch.abs(vllm_scale - sglang_scale).mean().item()
+
+    print(f"Scale differences:")
+    print(f"  Torch vs VLLM:   {torch_vllm_scale_diff:.8f}")
+    print(f"  Torch vs SGLang: {torch_sglang_scale_diff:.8f}")
+    print(f"  VLLM vs SGLang:  {vllm_sglang_scale_diff:.8f}")
+
+    # Compare outputs
+    torch_vllm_out_diff = torch.abs(torch_out.float() - vllm_out.float()).mean().item()
+    torch_sglang_out_diff = (
+        torch.abs(torch_out.float() - sglang_out.float()).mean().item()
+    )
+    vllm_sglang_out_diff = (
+        torch.abs(vllm_out.float() - sglang_out.float()).mean().item()
+    )
+
+    print(f"Output differences:")
+    print(f"  Torch vs VLLM:   {torch_vllm_out_diff:.8f}")
+    print(f"  Torch vs SGLang: {torch_sglang_out_diff:.8f}")
+    print(f"  VLLM vs SGLang:  {vllm_sglang_out_diff:.8f}")
+
+    # Check tolerances
+    rtol, atol = 1e-3, 1e-5
+
+    torch_vllm_match = torch.allclose(
+        torch_out.float(), vllm_out.float(), rtol=rtol, atol=atol
+    ) and torch.allclose(torch_scale, vllm_scale, rtol=rtol, atol=atol)
+    torch_sglang_match = torch.allclose(
+        torch_out.float(), sglang_out.float(), rtol=rtol, atol=atol
+    ) and torch.allclose(torch_scale, sglang_scale, rtol=rtol, atol=atol)
+
+    if hidden_dim == 1368:
+        rtol = 1e-2
+        # we found vllm sglang has diff when hidden dim is not dividable by 16
+        # and we believe SGLang is closer to Torch implementation
+
+    vllm_sglang_match = torch.allclose(
+        vllm_out.float(), sglang_out.float(), rtol=rtol, atol=atol
+    ) and torch.allclose(vllm_scale, sglang_scale, rtol=rtol, atol=atol)
+
+    print(f"Matches (rtol={rtol}, atol={atol}):")
+    print(f"  Torch vs VLLM:   {'✅' if torch_vllm_match else '❌'}")
+    print(f"  Torch vs SGLang: {'✅' if torch_sglang_match else '❌'}")
+    print(f"  VLLM vs SGLang:  {'✅' if vllm_sglang_match else '❌'}")
+
+
+batch_size_range = [16, 32, 64, 128]
+seq_len_range = [64, 128, 256, 512, 1024, 2048, 4096]
+hidden_dim_range = [1368, 2048, 4096]
+
+configs = list(itertools.product(batch_size_range, seq_len_range, hidden_dim_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size", "seq_len", "hidden_dim"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["torch", "vllm", "sglang"],
+        line_names=["Torch Reference", "VLLM", "SGL Kernel"],
+        styles=[("red", "-"), ("blue", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="per-token-dynamic-quant-fp8-performance",
+        args={},
+    )
+)
+def benchmark_quantization(batch_size, seq_len, hidden_dim, provider):
+    dtype = torch.float16
+    device = torch.device("cuda")
+
+    x = torch.randn(batch_size * seq_len, hidden_dim, device=device, dtype=dtype)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch":
+        fn = lambda: torch_per_token_quant_fp8(x.clone())
+    elif provider == "vllm":
+        fn = lambda: vllm_per_token_quant_fp8(x.clone())
+    elif provider == "sglang":
+        fn = lambda: sglang_per_token_quant_fp8(x.clone())
+
+    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    # Test various hidden dimensions for correctness
+    test_dims = [1368, 2048, 4096]
+
+    for dim in test_dims:
+        calculate_diff(batch_size=4, seq_len=4096, hidden_dim=dim)
+
+    print("\n" + "=" * 60)
+    print("Starting performance benchmark...")
+    benchmark_quantization.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_qserve_w4a8_gemm.py b/sgl-kernel/benchmark/bench_qserve_w4a8_gemm.py
new file mode 100644
index 000000000..18fa4869d
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_qserve_w4a8_gemm.py
@@ -0,0 +1,198 @@
+import argparse
+import copy
+import itertools
+
+import torch
+import triton
+from sgl_kernel import (
+    int8_scaled_mm,
+    qserve_w4a8_per_chn_gemm,
+    qserve_w4a8_per_group_gemm,
+)
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+WEIGHT_SHAPES = {
+    "meta-llama/Llama-3.1-8B-Instruct": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-3.3-70B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "mistralai/Mistral-Large-Instruct-2407": [
+        ([12288, 14336], 1),
+        ([12288, 12288], 0),
+        ([12288, 57344], 1),
+        ([28672, 12288], 0),
+    ],
+    "Qwen/Qwen2.5-7B-Instruct": [
+        ([3584, 4608], 1),
+        ([3584, 3584], 0),
+        ([3584, 37888], 1),
+        ([18944, 3584], 0),
+    ],
+    "Qwen/Qwen2.5-32B-Instruct": [
+        ([5120, 7168], 1),
+        ([5120, 5120], 0),
+        ([5120, 55296], 1),
+        ([27648, 5120], 0),
+    ],
+    "Qwen/Qwen2.5-72B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 59136], 1),
+        ([29568, 8192], 0),
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        ([2048, 3072], 1),
+        ([2048, 4096], 1),
+        ([2048, 2048], 0),
+        ([2048, 576], 0),
+        ([2048, 21888], 1),
+        ([10944, 2048], 0),
+        ([2048, 2816], 1),
+        ([1408, 2048], 0),
+    ],
+}
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048],
+        x_log=False,
+        line_arg="provider",
+        line_vals=["FP16", "W8A8", "Qserve_W4A8_Per_Channel", "Qserve_W4A8_Per_Group"],
+        line_names=["FP16", "W8A8", "Qserve_W4A8_Per_Channel", "Qserve_W4A8_Per_Group"],
+        styles=[("blue", "-"), ("orange", "-"), ("green", "-"), ("red", "-")],
+        ylabel="ms",
+        plot_name="FP16_vs_W8A8_vs_Qserve_W4A8_GEMM",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K):
+    M = batch_size
+    # For W8A8
+    a = to_int8(torch.randn((M, K), device="cuda") * 5)
+    b = to_int8(torch.randn((N, K), device="cuda").t() * 5)
+    a_fp16 = a.to(torch.float16)
+    b_fp16 = b.to(torch.float16)
+    scale_a = torch.randn((M,), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((N,), device="cuda", dtype=torch.float32)
+
+    # For Qserve W4A8 per channel
+    a_qserve_chn = a
+    # two int4s pack into one int8
+    b_qserve_chn = to_int8(torch.randn((N, K // 2), device="cuda") * 5)
+    # b_qserve_chn = b.t().contiguous()
+    scale_a_qserve_chn = scale_a.to(torch.float16)
+    scale_b_qserve_chn = scale_b.to(torch.float16)
+    szero_b_qserve_chn = torch.randn((N,), device="cuda", dtype=torch.float16)
+    a_sum_qserve_chn = torch.randn((M,), device="cuda", dtype=torch.float16)
+
+    # For Qserve W4A8 per group
+    group_size = 128
+    assert K % group_size == 0, "K must be divisible by group_size"
+    a_qserve_group = a
+    # two int4s pack into one int8
+    b_qserve_group = to_int8(torch.randn((N, K // 2), device="cuda") * 5)
+    # b_qserve_group = b.t().contiguous()
+    scale_a_qserve_group = scale_a.to(torch.float16)
+    scale_b_qserve_group = scale_b.to(torch.float16)
+    scale_i8_b_qserve_group = to_int8(
+        torch.randn((K // group_size, N), device="cuda", dtype=torch.float16)
+    )
+    zero_i8_b_qserve_group = to_int8(
+        torch.randn((K // group_size, N), device="cuda", dtype=torch.float16)
+    )
+
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "FP16":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: torch.matmul(a_fp16, b_fp16),
+            quantiles=quantiles,
+        )
+    if provider == "W8A8":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: int8_scaled_mm(a, b, scale_a, scale_b, torch.float16),
+            quantiles=quantiles,
+        )
+    if provider == "Qserve_W4A8_Per_Channel":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: qserve_w4a8_per_chn_gemm(
+                a_qserve_chn,
+                b_qserve_chn,
+                scale_b_qserve_chn,
+                scale_a_qserve_chn,
+                szero_b_qserve_chn,
+                a_sum_qserve_chn,
+            ),
+            quantiles=quantiles,
+        )
+    if provider == "Qserve_W4A8_Per_Group":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: qserve_w4a8_per_group_gemm(
+                a_qserve_group,
+                b_qserve_group,
+                zero_i8_b_qserve_group,
+                scale_i8_b_qserve_group,
+                scale_b_qserve_group,
+                scale_a_qserve_group,
+            ),
+            quantiles=quantiles,
+        )
+
+    return ms, max_ms, min_ms
+
+
+def prepare_shapes(args):
+    KN_model_names = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        assert model in WEIGHT_SHAPES
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KN.append(model)
+            KN_model_names.append(KN)
+    return KN_model_names
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.1-8B-Instruct"],
+        help="List of models to benchmark",
+    )
+    parser.add_argument(
+        "--tp-sizes",
+        nargs="+",
+        type=int,
+        default=[1],
+        help="List of tensor parallel sizes",
+    )
+    args = parser.parse_args()
+
+    KN_model_names = prepare_shapes(args)
+    for K, N, model_name in KN_model_names:
+        print(f"{model_name} N={N} K={K}: ")
+        benchmark.run(
+            print_data=True,
+            show_plots=True,
+            save_path="bench_qserve_w4a8_gemm_res",
+            N=N,
+            K=K,
+        )
+
+    print("Benchmark finished!")
diff --git a/sgl-kernel/benchmark/bench_rotary_embedding.py b/sgl-kernel/benchmark/bench_rotary_embedding.py
new file mode 100644
index 000000000..b4e0f5e0b
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_rotary_embedding.py
@@ -0,0 +1,96 @@
+import itertools
+
+import torch
+import triton
+from sgl_kernel import FusedSetKVBufferArg
+from sgl_kernel.testing.rotary_embedding import (
+    FlashInferRotaryEmbedding,
+    MHATokenToKVPool,
+    RotaryEmbedding,
+    create_inputs,
+)
+
+from sglang.srt.bench_utils import bench_kineto
+
+configs = [
+    (batch_size, seq_len, save_kv_cache)
+    for batch_size, seq_len in (
+        (1, 1),
+        (32, 1),
+        (128, 1),
+        (512, 1),
+        (2, 512),
+        (4, 4096),
+    )
+    for save_kv_cache in (False, True)
+]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size", "seq_len", "save_kv_cache"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["sglang"],
+        line_names=["SGL Kernel"],
+        styles=[("green", "-")],
+        ylabel="us",
+        plot_name="bench_rotary_embedding",
+        args={},
+    )
+)
+def benchmark(batch_size, seq_len, save_kv_cache, provider):
+    device = torch.device("cuda")
+
+    num_q_heads = 32
+    num_kv_heads = 8
+    head_size = 64
+    dtype = torch.bfloat16
+
+    config = dict(
+        head_size=head_size,
+        rotary_dim=64,
+        max_position_embeddings=4096,
+        base=8000,
+        is_neox_style=True,
+        dtype=dtype,
+    )
+    rope_flashinfer = FlashInferRotaryEmbedding(**config).to(device)
+    pool_flashinfer = MHATokenToKVPool(head_num=num_kv_heads, head_dim=head_size)
+
+    inputs = create_inputs(
+        head_size=head_size,
+        batch_size=batch_size,
+        seq_len=seq_len,
+        device=device,
+        dtype=dtype,
+        num_q_heads=num_q_heads,
+        num_kv_heads=num_kv_heads,
+    )
+
+    query_flashinfer, key_flashinfer = inputs["query"].clone(), inputs["key"].clone()
+
+    bench_fn = lambda: rope_flashinfer.forward_cuda(
+        inputs["pos_ids"],
+        query_flashinfer,
+        key_flashinfer,
+        fused_set_kv_buffer_arg=(
+            FusedSetKVBufferArg(
+                value=inputs["value"],
+                k_buffer=pool_flashinfer.k_buffer[0].view(-1, num_kv_heads * head_size),
+                v_buffer=pool_flashinfer.v_buffer[0].view(-1, num_kv_heads * head_size),
+                k_scale=None,
+                v_scale=None,
+                cache_loc=inputs["out_cache_loc"],
+            )
+            if save_kv_cache
+            else None
+        ),
+    )
+
+    time_s = bench_kineto(bench_fn, kernel_names="BatchQKApplyRotaryPosIds")
+    return time_s * 1e6
+
+
+if __name__ == "__main__":
+    benchmark.run(print_data=True)
diff --git a/sgl-kernel/benchmark/bench_top_k_top_p_sampling.py b/sgl-kernel/benchmark/bench_top_k_top_p_sampling.py
new file mode 100644
index 000000000..3692b5b39
--- /dev/null
+++ b/sgl-kernel/benchmark/bench_top_k_top_p_sampling.py
@@ -0,0 +1,128 @@
+import itertools
+
+import sgl_kernel
+import torch
+import triton
+import triton.testing
+
+
+def torch_top_k_top_p_joint_sampling_from_probs(
+    normalized_prob, top_k, top_p, eps=1e-4
+):
+    """Reference PyTorch implementation of joint top-k top-p sampling."""
+    batch_size, vocab_size = normalized_prob.shape
+    samples = torch.empty(batch_size, dtype=torch.int64, device=normalized_prob.device)
+
+    for i in range(batch_size):
+        p_val = top_p[i].item()
+        k_val = top_k[i].item()
+
+        # top-p mask
+        sorted_prob, indices = torch.sort(normalized_prob[i], descending=False)
+        cdf = torch.cumsum(sorted_prob, dim=-1)
+        mask_top_p = torch.zeros(
+            vocab_size, dtype=torch.int32, device=normalized_prob.device
+        )
+        mask_top_p.scatter_add_(0, indices, (cdf > (1 - p_val) - eps).int())
+
+        # top-k mask
+        sorted_prob_desc, _ = torch.sort(normalized_prob[i], descending=True)
+        pivot = sorted_prob_desc[k_val - 1]
+        mask_top_k = (normalized_prob[i] >= pivot).int()
+
+        # joint mask
+        mask = torch.minimum(mask_top_p, mask_top_k).bool()
+
+        # sample from masked probs
+        masked_probs = normalized_prob[i] * mask
+        masked_probs = masked_probs / masked_probs.sum()
+        idx = torch.multinomial(masked_probs, 1)
+        samples[i] = idx
+
+    return samples
+
+
+def calculate_diff(batch_size, vocab_size, p):
+    """Compare Torch reference and SGLang kernel for correctness."""
+    torch.manual_seed(42)
+    if p == 0.1:
+        k = int(vocab_size * 0.5)
+    elif p == 0.5:
+        k = int(vocab_size * 0.1)
+    else:
+        raise ValueError("p not recognized")
+
+    device = torch.device("cuda")
+    pre_norm_prob = torch.rand(batch_size, vocab_size, device=device)
+    normalized_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
+
+    top_p_tensor = torch.full((batch_size,), p, device=device)
+    top_k_tensor = torch.full((batch_size,), k, device=device)
+
+    torch_samples = torch_top_k_top_p_joint_sampling_from_probs(
+        normalized_prob, top_k_tensor, top_p_tensor
+    )
+    sglang_samples = sgl_kernel.top_k_top_p_sampling_from_probs(
+        normalized_prob, top_k_tensor, top_p_tensor, filter_apply_order="joint"
+    )
+
+
+# parameter space
+batch_size_range = [16, 64, 128]
+vocab_size_range = [111, 32000]
+p_range = [0.1, 0.5]
+configs = list(itertools.product(batch_size_range, vocab_size_range, p_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size", "vocab_size", "p"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["torch", "sglang"],
+        line_names=["Torch Reference", "SGL Kernel"],
+        styles=[("red", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="top-k-top-p-joint-sampling-performance",
+        args={},
+    )
+)
+def benchmark_sampling(batch_size, vocab_size, p, provider):
+    torch.manual_seed(42)
+    if p == 0.1:
+        k = int(vocab_size * 0.5)
+    elif p == 0.5:
+        k = int(vocab_size * 0.1)
+    else:
+        raise ValueError("p not recognized")
+
+    device = torch.device("cuda")
+    pre_norm_prob = torch.rand(batch_size, vocab_size, device=device)
+    normalized_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
+    top_p_tensor = torch.full((batch_size,), p, device=device)
+    top_k_tensor = torch.full((batch_size,), k, device=device)
+
+    if provider == "torch":
+        fn = lambda: torch_top_k_top_p_joint_sampling_from_probs(
+            normalized_prob.clone(), top_k_tensor, top_p_tensor
+        )
+    elif provider == "sglang":
+        fn = lambda: sgl_kernel.top_k_top_p_sampling_from_probs(
+            normalized_prob.clone(),
+            top_k_tensor,
+            top_p_tensor,
+            filter_apply_order="joint",
+        )
+
+    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=[0.5, 0.2, 0.8])
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    # Correctness check
+    for cfg in configs:
+        calculate_diff(*cfg)
+
+    print("\n" + "=" * 60)
+    print("Starting performance benchmark...")
+    benchmark_sampling.run(print_data=True)
diff --git a/sgl-kernel/build.sh b/sgl-kernel/build.sh
new file mode 100755
index 000000000..0bf5a07ed
--- /dev/null
+++ b/sgl-kernel/build.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+set -ex
+
+PYTHON_VERSION=$1
+CUDA_VERSION=$2
+PYTHON_ROOT_PATH=/opt/python/cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}
+
+if [ -z "$3" ]; then
+   ARCH=$(uname -i)
+else
+   ARCH=$3
+fi
+
+echo "ARCH:  $ARCH"
+if [ ${ARCH} = "aarch64" ]; then
+   LIBCUDA_ARCH="sbsa"
+   BUILDER_NAME="pytorch/manylinuxaarch64-builder"
+else
+   LIBCUDA_ARCH=${ARCH}
+   BUILDER_NAME="pytorch/manylinux2_28-builder"
+fi
+
+if [ ${CUDA_VERSION} = "12.9" ]; then
+   DOCKER_IMAGE="${BUILDER_NAME}:cuda${CUDA_VERSION}"
+   TORCH_INSTALL="pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu129"
+elif [ ${CUDA_VERSION} = "12.8" ]; then
+   DOCKER_IMAGE="${BUILDER_NAME}:cuda${CUDA_VERSION}"
+   TORCH_INSTALL="pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128"
+else
+   DOCKER_IMAGE="${BUILDER_NAME}:cuda${CUDA_VERSION}"
+   TORCH_INSTALL="pip install --no-cache-dir torch==2.8.0 --index-url https://download.pytorch.org/whl/cu126"
+fi
+
+docker run --rm \
+   -v $(pwd):/sgl-kernel \
+   ${DOCKER_IMAGE} \
+   bash -c "
+   # Install CMake (version >= 3.26) - Robust Installation
+   export CMAKE_VERSION_MAJOR=3.31
+   export CMAKE_VERSION_MINOR=1
+   # Setting these flags to reduce OOM chance only on ARM
+   export CMAKE_BUILD_PARALLEL_LEVEL=$(( $(nproc)/3 < 48 ? $(nproc)/3 : 48 ))
+   if [ \"${ARCH}\" = \"aarch64\" ]; then
+      export CUDA_NVCC_FLAGS=\"-Xcudafe --threads=2\"
+      export MAKEFLAGS='-j2'
+      export CMAKE_BUILD_PARALLEL_LEVEL=2
+      export NINJAFLAGS='-j2'
+   fi
+   echo \"Downloading CMake from: https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz\"
+   wget https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
+   tar -xzf cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH}.tar.gz
+   mv cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-${ARCH} /opt/cmake
+   export PATH=/opt/cmake/bin:\$PATH
+   export LD_LIBRARY_PATH=/lib64:\$LD_LIBRARY_PATH
+
+   # Debugging CMake
+   echo \"PATH: \$PATH\"
+   which cmake
+   cmake --version
+
+   yum install numactl-devel -y && \
+   yum install libibverbs -y --nogpgcheck && \
+   ln -sv /usr/lib64/libibverbs.so.1 /usr/lib64/libibverbs.so && \
+   ${PYTHON_ROOT_PATH}/bin/${TORCH_INSTALL} && \
+   ${PYTHON_ROOT_PATH}/bin/pip install --no-cache-dir ninja setuptools==75.0.0 wheel==0.41.0 numpy uv scikit-build-core && \
+   export TORCH_CUDA_ARCH_LIST='8.0 8.9 9.0+PTX' && \
+   export CUDA_VERSION=${CUDA_VERSION} && \
+   mkdir -p /usr/lib/${ARCH}-linux-gnu/ && \
+   ln -s /usr/local/cuda-${CUDA_VERSION}/targets/${LIBCUDA_ARCH}-linux/lib/stubs/libcuda.so /usr/lib/${ARCH}-linux-gnu/libcuda.so && \
+
+   cd /sgl-kernel && \
+   ls -la ${PYTHON_ROOT_PATH}/lib/python${PYTHON_VERSION}/site-packages/wheel/ && \
+   PYTHONPATH=${PYTHON_ROOT_PATH}/lib/python${PYTHON_VERSION}/site-packages ${PYTHON_ROOT_PATH}/bin/python -m uv build --wheel -Cbuild-dir=build . --color=always --no-build-isolation && \
+   ./rename_wheels.sh
+   "
diff --git a/sgl-kernel/cmake/utils.cmake b/sgl-kernel/cmake/utils.cmake
new file mode 100644
index 000000000..0eaa7a61a
--- /dev/null
+++ b/sgl-kernel/cmake/utils.cmake
@@ -0,0 +1,21 @@
+# Adapt from: https://github.com/neuralmagic/vllm-flash-attention/blob/main/cmake/utils.cmake
+#
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
+# `CUDA_ARCH_FLAGS`.
+#
+# Example:
+#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
+#   clear_cuda_arches(CUDA_ARCH_FLAGS)
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   CMAKE_CUDA_FLAGS="-Wall"
+#
+macro(clear_cuda_arches CUDA_ARCH_FLAGS)
+    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
+    string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+
+    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
+    # and passed back via the `CUDA_ARCHITECTURES` property.
+    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+endmacro()
diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce.cu b/sgl-kernel/csrc/allreduce/custom_all_reduce.cu
new file mode 100644
index 000000000..a1f5ed53e
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.cu
@@ -0,0 +1,137 @@
+// Adapted from: https://github.com/vllm-project/vllm/blob/v0.8.2/csrc/custom_all_reduce.cu
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include <torch/all.h>
+
+#include "custom_all_reduce.cuh"
+
+// Fake pointer type, must match fptr_t type in ops.h.
+// We use this type alias to indicate when pointers are passed in as int64_t.
+using fptr_t = int64_t;
+static_assert(sizeof(void*) == sizeof(fptr_t));
+
+fptr_t
+init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs, torch::Tensor& rank_data, int64_t rank, bool full_nvlink) {
+  int world_size = fake_ipc_ptrs.size();
+  if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported");
+  if (world_size % 2 != 0) throw std::invalid_argument("Odd num gpus is not supported for now");
+  if (rank < 0 || rank >= world_size) throw std::invalid_argument("invalid rank passed in");
+
+  sglang::Signal* ipc_ptrs[8];
+  for (int i = 0; i < world_size; i++) {
+    ipc_ptrs[i] = reinterpret_cast<sglang::Signal*>(fake_ipc_ptrs[i]);
+  }
+  return (fptr_t) new sglang::CustomAllreduce(
+      ipc_ptrs, rank_data.data_ptr(), rank_data.numel(), rank, world_size, full_nvlink);
+}
+
+/**
+ * Make sure tensor t's data lies completely within ((char)t.data_ptr()) +
+ * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous()
+ * because it allows transpose of contiguous slice (i.e. slicing the first
+ * dimension). Currently, we require this because stride information is not
+ * passed into the kernels and we treat input tensors as flat.
+ *
+ * Examples
+ * A = torch.zeros(3, 3, 3)
+ * 1. A: OK
+ * 2. A[1:]: OK
+ * 3. A.permute(2, 0, 1): OK
+ * 4. A[1:].permute(2, 0, 1): OK
+ * 5. A[None].expand(2, -1, -1, -1): Not OK
+ * 6. A[:, 1:, 1:]: Not OK
+ */
+bool _is_weak_contiguous(torch::Tensor& t) {
+  return t.is_contiguous() ||
+         (t.storage().nbytes() - t.storage_offset() * t.element_size() == t.numel() * t.element_size());
+}
+
+/**
+ * Performs an out-of-place allreduce and stores result in out.
+ *
+ * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered.
+ * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first
+ * copied into _reg_buffer.
+ */
+void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
+  auto stream = c10::cuda::getCurrentCUDAStream().stream();
+
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK_EQ(inp.numel(), out.numel());
+  TORCH_CHECK(_is_weak_contiguous(out));
+  TORCH_CHECK(_is_weak_contiguous(inp));
+  auto input_size = inp.numel() * inp.element_size();
+  auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
+  if (reg_buffer) {
+    TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes);
+    AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size, cudaMemcpyDeviceToDevice, stream));
+  } else {
+    reg_buffer = inp.data_ptr();
+  }
+  switch (out.scalar_type()) {
+    case at::ScalarType::Float: {
+      fa->allreduce<float>(
+          stream, reinterpret_cast<float*>(reg_buffer), reinterpret_cast<float*>(out.data_ptr()), out.numel());
+      break;
+    }
+    case at::ScalarType::Half: {
+      fa->allreduce<half>(
+          stream, reinterpret_cast<half*>(reg_buffer), reinterpret_cast<half*>(out.data_ptr()), out.numel());
+      break;
+    }
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+    case at::ScalarType::BFloat16: {
+      fa->allreduce<nv_bfloat16>(
+          stream,
+          reinterpret_cast<nv_bfloat16*>(reg_buffer),
+          reinterpret_cast<nv_bfloat16*>(out.data_ptr()),
+          out.numel());
+      break;
+    }
+#endif
+    default:
+      throw std::runtime_error("custom allreduce only supports float32, float16 and bfloat16");
+  }
+}
+
+void dispose(fptr_t _fa) {
+  delete reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+}
+
+int64_t meta_size() {
+  return sizeof(sglang::Signal);
+}
+
+void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_);
+  void* ipc_ptrs[8];
+  for (int i = 0; i < fake_ipc_ptrs.size(); i++) {
+    ipc_ptrs[i] = reinterpret_cast<void*>(fake_ipc_ptrs[i]);
+  }
+  fa->register_buffer(ipc_ptrs);
+}
+
+// Use vector<int64_t> to represent byte data for python binding compatibility.
+std::tuple<std::vector<int64_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  auto [handle, offsets] = fa->get_graph_buffer_ipc_meta();
+  std::vector<int64_t> bytes(handle.begin(), handle.end());
+  return std::make_tuple(bytes, offsets);
+}
+
+// Use vector<int64_t> to represent byte data for python binding compatibility.
+void register_graph_buffers(
+    fptr_t _fa, const std::vector<std::vector<int64_t>>& handles, const std::vector<std::vector<int64_t>>& offsets) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  std::vector<std::string> bytes;
+  bytes.reserve(handles.size());
+  for (int i = 0; i < handles.size(); i++) {
+    bytes.emplace_back(handles[i].begin(), handles[i].end());
+  }
+  bytes.reserve(handles.size());
+  fa->register_graph_buffers(bytes, offsets);
+}
diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh b/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh
new file mode 100644
index 000000000..ec223bdeb
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh
@@ -0,0 +1,489 @@
+// Adapted from https://github.com/vllm-project/vllm/blob/v0.8.2/csrc/custom_all_reduce.cuh
+#pragma once
+
+#include <cuda.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <array>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#include "utils.h"
+
+namespace sglang {
+
+constexpr int kMaxBlocks = 36;
+// Counter may overflow, but it's fine since unsigned int overflow is
+// well-defined behavior.
+using FlagType = uint32_t;
+struct Signal {
+  alignas(128) FlagType self_counter[kMaxBlocks][8];
+  // Two sets of peer counters are needed for two syncs. The reason is that
+  // it's possible for peer GPU block to arrive at the second sync point while
+  // the current GPU block haven't passed the first sync point. Thus, peer GPU
+  // may write counter+1 while current GPU is busy waiting for counter. We use
+  // alternating counter array to avoid this possibility.
+  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
+};
+
+struct __align__(16) RankData {
+  const void* __restrict__ ptrs[8];
+};
+
+struct __align__(16) RankSignals {
+  Signal* signals[8];
+};
+
+// like std::array, but aligned
+template <typename T, int sz>
+struct __align__(alignof(T) * sz) array_t {
+  T data[sz];
+  using type = T;
+  static constexpr int size = sz;
+};
+
+// use packed type to maximize memory efficiency
+// goal: generate ld.128 and st.128 instructions
+template <typename T>
+struct packed_t {
+  // the (P)acked type for load/store
+  using P = array_t<T, 16 / sizeof(T)>;
+  // the (A)ccumulator type for reduction
+  using A = array_t<float, 16 / sizeof(T)>;
+};
+
+#define DINLINE __device__ __forceinline__
+
+// scalar cast functions
+DINLINE float upcast_s(half val) {
+  return __half2float(val);
+}
+
+template <typename T>
+DINLINE T downcast_s(float val);
+template <>
+DINLINE half downcast_s(float val) {
+  return __float2half(val);
+}
+
+// scalar add functions
+// for some reason when compiling with Pytorch, the + operator for half and
+// bfloat is disabled so we call the intrinsics directly
+DINLINE half& assign_add(half& a, half b) {
+  a = __hadd(a, b);
+  return a;
+}
+DINLINE float& assign_add(float& a, float b) {
+  return a += b;
+}
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+DINLINE float upcast_s(nv_bfloat16 val) {
+  return __bfloat162float(val);
+}
+template <>
+DINLINE nv_bfloat16 downcast_s(float val) {
+  return __float2bfloat16(val);
+}
+DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) {
+  a = __hadd(a, b);
+  return a;
+}
+#endif
+
+template <typename T, int N>
+DINLINE array_t<T, N>& packed_assign_add(array_t<T, N>& a, array_t<T, N> b) {
+#pragma unroll
+  for (int i = 0; i < N; i++) {
+    assign_add(a.data[i], b.data[i]);
+  }
+  return a;
+}
+
+template <typename T, int N>
+DINLINE array_t<float, N> upcast(array_t<T, N> val) {
+  if constexpr (std::is_same<T, float>::value) {
+    return val;
+  } else {
+    array_t<float, N> out;
+#pragma unroll
+    for (int i = 0; i < N; i++) {
+      out.data[i] = upcast_s(val.data[i]);
+    }
+    return out;
+  }
+}
+
+template <typename O>
+DINLINE O downcast(array_t<float, O::size> val) {
+  if constexpr (std::is_same<typename O::type, float>::value) {
+    return val;
+  } else {
+    O out;
+#pragma unroll
+    for (int i = 0; i < O::size; i++) {
+      out.data[i] = downcast_s<typename O::type>(val.data[i]);
+    }
+    return out;
+  }
+}
+
+static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+#else
+  asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+#endif
+}
+
+static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
+  FlagType flag;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("ld.acquire.sys.global.u32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+#else
+  asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" : "=r"(flag) : "l"(flag_addr));
+#endif
+  return flag;
+}
+
+static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) {
+  asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+}
+
+static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
+  FlagType flag;
+  asm volatile("ld.volatile.global.u32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+  return flag;
+}
+
+// is_start: whether this is the very first synchronization barrier.
+// need_fence: whether a memory fence is needed. If true, a release-acquire
+// semantic is used to enforce memory access order before and after this
+// barrier.
+template <int ngpus, bool is_start, bool need_fence = false>
+DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, int rank) {
+  if constexpr (!is_start) __syncthreads();
+  static_assert(!(is_start && need_fence));  // Start barrier shouldn't need fence.
+  if (threadIdx.x < ngpus) {
+    // Increment the counter. Technically we only need one counter, but we use
+    // multiple per block to eliminate the need to share the counter via smem.
+    auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
+    // Write the expected counter value to peer and wait for correct value from
+    // peer.
+    auto peer_counter_ptr = &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
+    auto self_counter_ptr = &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
+    if constexpr (need_fence) {
+      st_flag_release(peer_counter_ptr, val);
+      while (ld_flag_acquire(self_counter_ptr) != val)
+        ;
+    } else {
+      st_flag_volatile(peer_counter_ptr, val);
+      while (ld_flag_volatile(self_counter_ptr) != val)
+        ;
+    }
+  }
+  if constexpr (is_start || need_fence) __syncthreads();
+}
+
+template <typename P, int ngpus, typename A>
+DINLINE P packed_reduce(const P* ptrs[], int idx) {
+  A tmp = upcast(ptrs[0][idx]);
+#pragma unroll
+  for (int i = 1; i < ngpus; i++) {
+    packed_assign_add(tmp, upcast(ptrs[i][idx]));
+  }
+  return downcast<P>(tmp);
+}
+
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_1stage(
+    RankData* _dp, RankSignals sg, Signal* self_sg, T* __restrict__ result, int rank, int size) {
+  using P = typename packed_t<T>::P;
+  using A = typename packed_t<T>::A;
+  // note: we don't reorder the address so the accumulation order is the same
+  // for all ranks, ensuring bitwise identical results
+  auto dp = *_dp;
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
+  // do the actual reduction
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) {
+    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
+  }
+  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
+}
+
+template <typename P>
+DINLINE P* get_tmp_buf(Signal* sg) {
+  return (P*)(((Signal*)sg) + 1);
+}
+
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_2stage(
+    RankData* _dp, RankSignals sg, Signal* self_sg, T* __restrict__ result, int rank, int size) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = gridDim.x * blockDim.x;
+  using P = typename packed_t<T>::P;
+  using A = typename packed_t<T>::A;
+  int part = size / ngpus;
+  int start = rank * part;
+  int end = rank == ngpus - 1 ? size : start + part;
+  int largest_part = part + size % ngpus;
+  const P* ptrs[ngpus];
+  P* tmps[ngpus];
+#pragma unroll
+  for (int i = 0; i < ngpus; i++) {
+    int target = (rank + i) % ngpus;
+    ptrs[i] = (const P*)_dp->ptrs[target];
+    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
+  }
+  auto tmp_out = tmps[0];
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
+  // stage 1: reduce scatter
+  for (int idx = start + tid; idx < end; idx += stride) {
+    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
+  }
+  multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
+
+  // stage 2: allgather. Note: it's important to match the tid between
+  // the two stages, because visibility across devices is only guaranteed
+  // between threads that have the same tid. If thread i computes the sum of
+  // start + i in the first stage, then thread i also gathers start + i from all
+  // ranks.
+  for (int idx = tid; idx < largest_part; idx += stride) {
+#pragma unroll
+    for (int i = 0; i < ngpus; i++) {
+      int gather_from_rank = ((rank + i) % ngpus);
+      if (gather_from_rank == ngpus - 1 || idx < part) {
+        int dst_idx = gather_from_rank * part + idx;
+        ((P*)result)[dst_idx] = tmps[i][idx];
+      }
+    }
+  }
+}
+
+using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>;
+static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t));
+static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t));
+
+class CustomAllreduce {
+ public:
+  int rank_;
+  int world_size_;
+  bool full_nvlink_;
+
+  RankSignals sg_;
+  // Stores an map from a pointer to its peer pointters from all ranks.
+  std::unordered_map<void*, RankData*> buffers_;
+  Signal* self_sg_;
+
+  // Stores rank data from all ranks. This is mainly for cuda graph purposes.
+  // For cuda graph to work, all kernel arguments must be fixed during graph
+  // capture time. However, the peer pointers are not known during graph capture
+  // time. Therefore, during capture, we increment the rank data pointer and use
+  // that as the argument to the kernel. The kernel arguments are stored in
+  // graph_unreg_buffers_. The actual peer pointers will be filled in at the
+  // memory pointed to by the pointers in graph_unreg_buffers_ when
+  // the IPC handles are exchanged between ranks.
+  //
+  // The overall process looks like this:
+  // 1. Graph capture.
+  // 2. Each rank obtains the IPC handles for each addresses used during cuda
+  // graph capture using get_graph_buffer_ipc_meta.
+  // 3. (In Python) all gather the IPC handles.
+  // 4. Obtain the peer pointers by opening the IPC handles, and store them in
+  // the rank data array at corresponding positions.
+  RankData *d_rank_data_base_, *d_rank_data_end_;
+  std::vector<void*> graph_unreg_buffers_;
+  // a map from IPC handles to opened IPC pointers
+  std::map<IPC_KEY, char*> ipc_handles_;
+
+  /**
+   * Signals are an array of ipc-enabled buffers from all ranks.
+   * For each of the buffer, the layout is as follows:
+   * | -- sizeof(Signal) -- | ------ a few MB ----- |
+   * The first section is for allreduce synchronization, and the second section
+   * is for storing the intermediate results required by some allreduce algos.
+   *
+   * Note: this class does not own any device memory. Any required buffers
+   * are passed in from the constructor.
+   */
+  CustomAllreduce(
+      Signal** signals, void* rank_data, size_t rank_data_sz, int rank, int world_size, bool full_nvlink = true)
+      : rank_(rank),
+        world_size_(world_size),
+        full_nvlink_(full_nvlink),
+        self_sg_(signals[rank]),
+        d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
+        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
+    for (int i = 0; i < world_size_; i++) {
+      sg_.signals[i] = signals[i];
+    }
+  }
+
+  char* open_ipc_handle(const void* ipc_handle) {
+    auto [it, new_handle] = ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr});
+    if (new_handle) {
+      char* ipc_ptr;
+      CHECK_CUDA_SUCCESS(cudaIpcOpenMemHandle(
+          (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)ipc_handle), cudaIpcMemLazyEnablePeerAccess));
+      it->second = ipc_ptr;
+    }
+    return it->second;
+  }
+
+  std::pair<std::string, std::vector<int64_t>> get_graph_buffer_ipc_meta() {
+    auto num_buffers = graph_unreg_buffers_.size();
+    auto handle_sz = sizeof(cudaIpcMemHandle_t);
+    std::string handles(handle_sz * num_buffers, static_cast<char>(0));
+    std::vector<int64_t> offsets(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto ptr = graph_unreg_buffers_[i];
+      void* base_ptr;
+      // note: must share the base address of each allocation, or we get wrong
+      // address
+      if (cuPointerGetAttribute(&base_ptr, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr) != CUDA_SUCCESS)
+        throw std::runtime_error("failed to get pointer attr");
+      CHECK_CUDA_SUCCESS(cudaIpcGetMemHandle((cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
+      offsets[i] = ((char*)ptr) - ((char*)base_ptr);
+    }
+    return std::make_pair(handles, offsets);
+  }
+
+  void check_rank_data_capacity(size_t num = 1) {
+    if (d_rank_data_base_ + num > d_rank_data_end_)
+      throw std::runtime_error(
+          "Rank data buffer is overflowed by " + std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
+  }
+
+  /**
+   * Register already-shared IPC pointers.
+   */
+  void register_buffer(void** ptrs) {
+    check_rank_data_capacity();
+    RankData data;
+    for (int i = 0; i < world_size_; i++) {
+      data.ptrs[i] = ptrs[i];
+    }
+    auto d_data = d_rank_data_base_++;
+    CHECK_CUDA_SUCCESS(cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
+    buffers_[ptrs[rank_]] = d_data;
+  }
+
+  // Note: when registering graph buffers, we intentionally choose to not
+  // deduplicate the addresses. That means if the allocator reuses some
+  // addresses, they will be registered again. This is to account for the remote
+  // possibility of different allocation patterns between ranks. For example,
+  // rank 1 may get the same input address for the second allreduce, but rank 2
+  // got a different address. IPC handles have internal reference counting
+  // mechanism so overhead should be small.
+  void
+  register_graph_buffers(const std::vector<std::string>& handles, const std::vector<std::vector<int64_t>>& offsets) {
+    auto num_buffers = graph_unreg_buffers_.size();
+    check_rank_data_capacity(num_buffers);
+    std::vector<RankData> rank_data(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto self_ptr = graph_unreg_buffers_[i];
+      auto& rd = rank_data[i];
+      for (int j = 0; j < world_size_; j++) {
+        if (j != rank_) {
+          char* handle = open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]);
+          handle += offsets[j][i];
+          rd.ptrs[j] = handle;
+        } else {
+          rd.ptrs[j] = self_ptr;
+        }
+      }
+    }
+    CHECK_CUDA_SUCCESS(
+        cudaMemcpy(d_rank_data_base_, rank_data.data(), sizeof(RankData) * num_buffers, cudaMemcpyHostToDevice));
+    d_rank_data_base_ += num_buffers;
+    graph_unreg_buffers_.clear();
+  }
+
+  /**
+   * Performs allreduce, assuming input has already been registered.
+   *
+   * Block and grid default configs are results after careful grid search. Using
+   * 36 blocks give the best or close to the best runtime on the devices I
+   * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
+   * take a small amount of SMs. Not quite sure the underlying reason, but my
+   * guess is that too many SMs will cause contention on NVLink bus.
+   */
+  template <typename T>
+  void allreduce(cudaStream_t stream, T* input, T* output, int size, int threads = 512, int block_limit = 36) {
+    auto d = packed_t<T>::P::size;
+    if (size % d != 0)
+      throw std::runtime_error(
+          "custom allreduce currently requires input length to be multiple "
+          "of " +
+          std::to_string(d));
+    if (block_limit > kMaxBlocks)
+      throw std::runtime_error(
+          "max supported block limit is " + std::to_string(kMaxBlocks) + ". Got " + std::to_string(block_limit));
+
+    RankData* ptrs;
+    cudaStreamCaptureStatus status;
+    CHECK_CUDA_SUCCESS(cudaStreamIsCapturing(stream, &status));
+    if (status == cudaStreamCaptureStatusActive) {
+      ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
+      graph_unreg_buffers_.push_back(input);
+    } else {
+      auto it = buffers_.find(input);
+      if (it == buffers_.end())
+        throw std::runtime_error(
+            "buffer address " + std::to_string(reinterpret_cast<uint64_t>(input)) + " is not registered!");
+      ptrs = it->second;
+    }
+
+    size /= d;
+    auto bytes = size * sizeof(typename packed_t<T>::P);
+    int blocks = std::min(block_limit, (size + threads - 1) / threads);
+#define KL(ngpus, name) name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, rank_, size);
+    // TODO(hanzhi713): Threshold is different for A100 and H100.
+    // Add per device threshold.
+#define REDUCE_CASE(ngpus)                                                                        \
+  case ngpus: {                                                                                   \
+    if (world_size_ == 2) {                                                                       \
+      KL(ngpus, cross_device_reduce_1stage);                                                      \
+    } else if (full_nvlink_) {                                                                    \
+      if ((world_size_ <= 4 && bytes < 512 * 1024) || (world_size_ <= 8 && bytes < 256 * 1024)) { \
+        KL(ngpus, cross_device_reduce_1stage);                                                    \
+      } else {                                                                                    \
+        KL(ngpus, cross_device_reduce_2stage);                                                    \
+      }                                                                                           \
+    }                                                                                             \
+    break;                                                                                        \
+  }
+
+    switch (world_size_) {
+      REDUCE_CASE(2)
+      REDUCE_CASE(4)
+      REDUCE_CASE(6)
+      REDUCE_CASE(8)
+      default:
+        throw std::runtime_error(
+            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
+            "gpus = " +
+            std::to_string(world_size_));
+    }
+#undef REDUCE_CASE
+#undef KL
+  }
+
+  ~CustomAllreduce() {
+    for (auto [_, ptr] : ipc_handles_) {
+      CHECK_CUDA_SUCCESS(cudaIpcCloseMemHandle(ptr));
+    }
+  }
+};
+/**
+ * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
+ a template instantiation:
+ * template void sglang::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
+ half *, int, int, int);
+*/
+}  // namespace sglang
diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce.hip b/sgl-kernel/csrc/allreduce/custom_all_reduce.hip
new file mode 100644
index 000000000..7a1a586d4
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.hip
@@ -0,0 +1,180 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/hip/Exceptions.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <torch/all.h>
+
+#include "custom_all_reduce_hip.cuh"
+
+// fake pointer type, must match fptr_t type in ops.h
+using fptr_t = int64_t;
+static_assert(sizeof(void*) == sizeof(fptr_t));
+
+fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
+                      const std::vector<std::string>& handles,
+                      const std::vector<int64_t>& offsets, int64_t rank,
+                      bool full_nvlink) {
+  int world_size = offsets.size();
+  if (world_size > 8)
+    throw std::invalid_argument("world size > 8 is not supported");
+  if (world_size % 2 != 0)
+    throw std::invalid_argument("Odd num gpus is not supported for now");
+  if (world_size != handles.size())
+    throw std::invalid_argument(
+        "handles length should equal to offsets length");
+  if (rank < 0 || rank >= world_size)
+    throw std::invalid_argument("invalid rank passed in");
+
+  hipIpcMemHandle_t ipc_handles[8];
+  for (int i = 0; i < world_size; i++) {
+    std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(hipIpcMemHandle_t));
+  }
+  return (fptr_t) new sglang::CustomAllreduce(
+      reinterpret_cast<sglang::Signal*>(meta.data_ptr()), rank_data.data_ptr(),
+      rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
+}
+
+/**
+ * Make sure tensor t's data lies completely within ((char)t.data_ptr()) +
+ * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous()
+ * because it allows transpose of contiguous slice (i.e. slicing the first
+ * dimension). Currently, we require this because stride information is not
+ * passed into the kernels and we treat input tensors as flat.
+ *
+ * Examples
+ * A = torch.zeros(3, 3, 3)
+ * 1. A: OK
+ * 2. A[1:]: OK
+ * 3. A.permute(2, 0, 1): OK
+ * 4. A[1:].permute(2, 0, 1): OK
+ * 5. A[None].expand(2, -1, -1, -1): Not OK
+ * 6. A[:, 1:, 1:]: Not OK
+ */
+bool _is_weak_contiguous(torch::Tensor& t) {
+  return t.is_contiguous() ||
+         (t.storage().nbytes() - t.storage_offset() * t.element_size() ==
+          t.numel() * t.element_size());
+}
+
+void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
+                 hipStream_t stream) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  TORCH_CHECK(_is_weak_contiguous(out));
+  switch (out.scalar_type()) {
+    case at::ScalarType::Float: {
+      fa->allreduce<float>(stream, reinterpret_cast<float*>(inp.data_ptr()),
+                           reinterpret_cast<float*>(out.data_ptr()),
+                           out.numel());
+      break;
+    }
+    case at::ScalarType::Half: {
+      fa->allreduce<half>(stream, reinterpret_cast<half*>(inp.data_ptr()),
+                          reinterpret_cast<half*>(out.data_ptr()), out.numel());
+      break;
+    }
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+    case at::ScalarType::BFloat16: {
+      fa->allreduce<nv_bfloat16>(
+          stream, reinterpret_cast<nv_bfloat16*>(inp.data_ptr()),
+          reinterpret_cast<nv_bfloat16*>(out.data_ptr()), out.numel());
+      break;
+    }
+#endif
+    default:
+      throw std::runtime_error(
+          "custom allreduce only supports float32, float16 and bfloat16");
+  }
+}
+
+void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) {
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(inp));
+  auto stream = c10::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK_EQ(inp.numel(), out.numel());
+  _all_reduce(_fa, inp, out, stream);
+}
+
+void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
+                      torch::Tensor& out) {
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(inp));
+  auto stream = c10::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+
+  auto input_size = inp.numel() * inp.element_size();
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK_EQ(inp.numel(), out.numel());
+  TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(),
+              "registered buffer is too small to contain the input");
+  AT_CUDA_CHECK(hipMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(),
+                                input_size, hipMemcpyDeviceToDevice, stream));
+  _all_reduce(_fa, reg_buffer, out, stream);
+}
+
+void dispose(fptr_t _fa) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  delete fa;
+}
+
+int64_t meta_size() { return sizeof(sglang::Signal); }
+
+void register_buffer(fptr_t _fa, torch::Tensor& t,
+                     const std::vector<std::string>& handles,
+                     const std::vector<int64_t>& offsets) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  fa->register_buffer(handles, offsets, t.data_ptr());
+}
+
+std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+    fptr_t _fa) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto handles =
+      torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
+  std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
+  return {handles, std::move(offsets)};
+}
+
+void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
+                            const std::vector<std::vector<int64_t>>& offsets) {
+  auto fa = reinterpret_cast<sglang::CustomAllreduce*>(_fa);
+  fa->register_graph_buffers(handles, offsets);
+}
+
+void free_meta_buffer(void* buffer) { CUDACHECK(hipFree(buffer)); }
+
+torch::Tensor get_meta_buffer_ipc_handle(torch::Tensor& inp) {
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto data_handle =
+      torch::empty({static_cast<int64_t>(sizeof(hipIpcMemHandle_t))}, options);
+  CUDACHECK(hipIpcGetMemHandle((hipIpcMemHandle_t*)data_handle.data_ptr(),
+                                inp.data_ptr()));
+  return data_handle;
+}
+
+torch::Tensor allocate_meta_buffer(int64_t size) {
+  auto device_index = c10::hip::current_device();
+  at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
+  void* buffer;
+  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
+  auto stream = c10::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+  AT_CUDA_CHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  AT_CUDA_CHECK(
+      hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
+  AT_CUDA_CHECK(hipMemsetAsync(buffer, 0, size, stream));
+  AT_CUDA_CHECK(hipStreamSynchronize(stream));
+  AT_CUDA_CHECK(hipThreadExchangeStreamCaptureMode(&mode));
+  auto options = torch::TensorOptions()
+                     .dtype(torch::kI8)
+                     .device(torch::kCUDA, device_index);
+  return torch::from_blob(buffer, {size}, free_meta_buffer, options);
+}
+
+std::vector<uint8_t> get_device_bdf(int dev) {
+  char busIdStr[] = "0000:00:00.0";
+  std::vector<uint8_t> bdf(sizeof(busIdStr), 0);
+  CUDACHECK(hipDeviceGetPCIBusId((char*)bdf.data(), sizeof(busIdStr), dev));
+  bdf.resize(bdf.size() - 1);  // remove trailing NULL
+  return bdf;
+}
diff --git a/sgl-kernel/csrc/allreduce/custom_all_reduce_hip.cuh b/sgl-kernel/csrc/allreduce/custom_all_reduce_hip.cuh
new file mode 100644
index 000000000..db19a0c5f
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce_hip.cuh
@@ -0,0 +1,582 @@
+// !!! This is a file automatically generated by hipify!!!
+#pragma once
+
+#include <hip/hip_runtime.h>
+#ifdef USE_ROCM
+#include <hip/hip_bf16.h>
+typedef __hip_bfloat16 nv_bfloat16;
+#else
+#include <hip/hip_bf16.h>
+#endif
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+#include <iostream>
+#include <limits>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#define CUDACHECK(cmd)                                                                     \
+  do {                                                                                     \
+    hipError_t e = cmd;                                                                    \
+    if (e != hipSuccess) {                                                                 \
+      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, hipGetErrorString(e)); \
+      exit(EXIT_FAILURE);                                                                  \
+    }                                                                                      \
+  } while (0)
+
+namespace sglang {
+
+constexpr int kMaxBlocks = 64;
+// note: we don't want to use atomics for signals because peer atomics are no
+// supported on PCIe links
+struct Signal {
+  alignas(128) uint32_t start[kMaxBlocks][8];
+  alignas(128) uint32_t end[kMaxBlocks][8];
+  alignas(128) uint32_t _flag[kMaxBlocks];  // incremental flags for each rank
+};
+
+#ifdef USE_ROCM
+struct __align__(16) RankData {
+  const void* ptrs[8];
+};
+#else
+struct __align__(16) RankData {
+  const void* __restrict__ ptrs[8];
+};
+#endif
+
+struct __align__(16) RankSignals {
+#ifndef USE_ROCM
+  volatile
+#endif
+      Signal* signals[8];
+};
+
+// like std::array, but aligned
+template <typename T, int sz>
+struct __align__(alignof(T) * sz) array_t {
+  T data[sz];
+  using type = T;
+  static constexpr int size = sz;
+};
+
+// use packed type to maximize memory efficiency
+// goal: generate ld.128 and st.128 instructions
+template <typename T>
+struct packed_t {
+  // the (P)acked type for load/store
+  using P = array_t<T, 16 / sizeof(T)>;
+  // the (A)ccumulator type for reduction
+  using A = array_t<float, 16 / sizeof(T)>;
+};
+
+#define DINLINE __device__ __forceinline__
+
+// scalar cast functions
+DINLINE float upcast_s(half val) {
+  return __half2float(val);
+}
+
+template <typename T>
+DINLINE T downcast_s(float val);
+template <>
+DINLINE half downcast_s(float val) {
+  return __float2half(val);
+}
+
+// scalar add functions
+// for some reason when compiling with Pytorch, the + operator for half and
+// bfloat is disabled so we call the intrinsics directly
+DINLINE half& assign_add(half& a, half b) {
+  a = __hadd(a, b);
+  return a;
+}
+DINLINE float& assign_add(float& a, float b) {
+  return a += b;
+}
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+DINLINE float upcast_s(nv_bfloat16 val) {
+  return __bfloat162float(val);
+}
+template <>
+DINLINE nv_bfloat16 downcast_s(float val) {
+  return __float2bfloat16(val);
+}
+DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) {
+  a = __hadd(a, b);
+  return a;
+}
+#endif
+
+template <typename T, int N>
+DINLINE array_t<T, N>& packed_assign_add(array_t<T, N>& a, array_t<T, N> b) {
+#pragma unroll
+  for (int i = 0; i < N; i++) {
+    assign_add(a.data[i], b.data[i]);
+  }
+  return a;
+}
+
+template <typename T, int N>
+DINLINE array_t<float, N> upcast(array_t<T, N> val) {
+  if constexpr (std::is_same<T, float>::value) {
+    return val;
+  } else {
+    array_t<float, N> out;
+#pragma unroll
+    for (int i = 0; i < N; i++) {
+      out.data[i] = upcast_s(val.data[i]);
+    }
+    return out;
+  }
+}
+
+template <typename O>
+DINLINE O downcast(array_t<float, O::size> val) {
+  if constexpr (std::is_same<typename O::type, float>::value) {
+    return val;
+  } else {
+    O out;
+#pragma unroll
+    for (int i = 0; i < O::size; i++) {
+      out.data[i] = downcast_s<typename O::type>(val.data[i]);
+    }
+    return out;
+  }
+}
+
+// This function is meant to be used as the first synchronization in the all
+// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
+// prior memory accesses. Note: volatile writes will not be reordered against
+// other volatile writes.
+template <int ngpus>
+DINLINE void start_sync(
+    const RankSignals& sg,
+#ifndef USE_ROCM
+    volatile
+#endif
+    Signal* self_sg,
+    int rank) {
+#ifdef USE_ROCM
+  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
+  if (threadIdx.x < ngpus) {
+    // simultaneously write to the corresponding flag of all ranks.
+    // Latency = 1 p2p write
+    __hip_atomic_store(
+        &sg.signals[threadIdx.x]->start[blockIdx.x][rank], flag, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
+    // wait until we got true from all ranks
+    while (__hip_atomic_load(&self_sg->start[blockIdx.x][threadIdx.x], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT) <
+           flag)
+      ;
+  }
+  __syncthreads();
+  // use one thread to update flag
+  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+#else
+  if (threadIdx.x < ngpus) {
+    // reset flag for next time
+    self_sg->end[blockIdx.x][threadIdx.x] = 0;
+    // simultaneously write to the corresponding flag of all ranks.
+    // Latency = 1 p2p write
+    sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
+    // wait until we got true from all ranks
+    while (!self_sg->start[blockIdx.x][threadIdx.x])
+      ;
+  }
+  __syncthreads();
+#endif
+}
+
+// This function is meant to be used as the second or the final synchronization
+// barrier in the all reduce kernel. If it's the final synchronization barrier,
+// we don't need to make any visibility guarantees for prior memory accesses.
+template <int ngpus, bool final_sync = false>
+DINLINE void end_sync(
+    const RankSignals& sg,
+#ifndef USE_ROCM
+    volatile
+#endif
+    Signal* self_sg,
+    int rank) {
+#ifdef USE_ROCM
+  __syncthreads();
+  // eliminate the case that prior writes are not visible after signals become
+  // visible. Note that I did not managed to make this happen through a lot of
+  // testing. Might be the case that hardware provides stronger guarantee than
+  // the memory model.
+  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
+  if (threadIdx.x < ngpus) {
+    // simultaneously write to the corresponding flag of all ranks.
+    // Latency = 1 p2p write
+    __hip_atomic_store(
+        &sg.signals[threadIdx.x]->end[blockIdx.x][rank],
+        flag,
+        final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
+        __HIP_MEMORY_SCOPE_SYSTEM);
+    // wait until we got true from all ranks
+    while (__hip_atomic_load(
+               &self_sg->end[blockIdx.x][threadIdx.x],
+               final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
+               __HIP_MEMORY_SCOPE_AGENT) < flag)
+      ;
+  }
+  __syncthreads();
+  // use one thread to update flag
+  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+#else
+  __syncthreads();
+  // eliminate the case that prior writes are not visible after signals become
+  // visible. Note that I did not managed to make this happen through a lot of
+  // testing. Might be the case that hardware provides stronger guarantee than
+  // the memory model.
+  if constexpr (!final_sync) __threadfence_system();
+  if (threadIdx.x < ngpus) {
+    // reset flag for next time
+    self_sg->start[blockIdx.x][threadIdx.x] = 0;
+    // simultaneously write to the corresponding flag of all ranks.
+    // Latency = 1 p2p write
+    sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
+    // wait until we got true from all ranks
+    while (!self_sg->end[blockIdx.x][threadIdx.x])
+      ;
+  }
+  if constexpr (!final_sync) __syncthreads();
+#endif
+}
+
+template <typename P, int ngpus, typename A>
+DINLINE P packed_reduce(const P* ptrs[], int idx) {
+  A tmp = upcast(ptrs[0][idx]);
+#pragma unroll
+  for (int i = 1; i < ngpus; i++) {
+    packed_assign_add(tmp, upcast(ptrs[i][idx]));
+  }
+  return downcast<P>(tmp);
+}
+
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_1stage(
+    RankData* _dp,
+    RankSignals sg,
+#ifndef USE_ROCM
+    volatile
+#endif
+    Signal* self_sg,
+    T* __restrict__ result,
+    int rank,
+    int size) {
+  using P = typename packed_t<T>::P;
+  using A = typename packed_t<T>::A;
+  // note: we don't reorder the address so the accumulation order is the same
+  // for all ranks, ensuring bitwise identical results
+  auto dp = *_dp;
+  start_sync<ngpus>(sg, self_sg, rank);
+  // do the actual reduction
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += gridDim.x * blockDim.x) {
+    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
+  }
+  end_sync<ngpus, true>(sg, self_sg, rank);
+}
+
+template <typename P>
+#ifdef USE_ROCM
+DINLINE P* get_tmp_buf(Signal* sg) {
+#else
+DINLINE P* get_tmp_buf(volatile Signal* sg) {
+#endif
+  return (P*)(((Signal*)sg) + 1);
+}
+
+template <typename T, int ngpus>
+__global__ void __launch_bounds__(512, 1) cross_device_reduce_2stage(
+    RankData* _dp,
+    RankSignals sg,
+#ifndef USE_ROCM
+    volatile
+#endif
+    Signal* self_sg,
+    T* __restrict__ result,
+    int rank,
+    int size) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = gridDim.x * blockDim.x;
+  using P = typename packed_t<T>::P;
+  using A = typename packed_t<T>::A;
+  int part = size / ngpus;
+  int start = rank * part;
+  int end = rank == ngpus - 1 ? size : start + part;
+  int largest_part = part + size % ngpus;
+  const P* ptrs[ngpus];
+  P* tmps[ngpus];
+#pragma unroll
+  for (int i = 0; i < ngpus; i++) {
+    int target = (rank + i) % ngpus;
+    ptrs[i] = (const P*)_dp->ptrs[target];
+    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
+  }
+  auto tmp_out = tmps[0];
+  start_sync<ngpus>(sg, self_sg, rank);
+  // stage 1: reduce scatter
+  for (int idx = start + tid; idx < end; idx += stride) {
+    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
+  }
+  end_sync<ngpus>(sg, self_sg, rank);
+
+  // stage 2: allgather. Note: it's important to match the tid between
+  // the two stages, because visibility across devices is only guaranteed
+  // between threads that have the same tid. If thread i computes the sum of
+  // start + i in the first stage, then thread i also gathers start + i from all
+  // ranks.
+  for (int idx = tid; idx < largest_part; idx += stride) {
+#pragma unroll
+    for (int i = 0; i < ngpus; i++) {
+      int gather_from_rank = ((rank + i) % ngpus);
+      if (gather_from_rank == ngpus - 1 || idx < part) {
+        int dst_idx = gather_from_rank * part + idx;
+        ((P*)result)[dst_idx] = tmps[i][idx];
+      }
+    }
+  }
+}
+
+using IPC_KEY = std::array<uint8_t, sizeof(hipIpcMemHandle_t)>;
+static_assert(sizeof(IPC_KEY) == sizeof(hipIpcMemHandle_t));
+static_assert(alignof(IPC_KEY) == alignof(hipIpcMemHandle_t));
+
+class CustomAllreduce {
+ public:
+  int rank_;
+  int world_size_;
+  bool full_nvlink_;
+
+  // below are device pointers
+  RankSignals sg_;
+  std::unordered_map<void*, RankData*> buffers_;
+  Signal* self_sg_;
+
+  // stores the registered device pointers from all ranks
+  RankData *d_rank_data_base_, *d_rank_data_end_;
+  std::vector<void*> graph_unreg_buffers_;
+  // a map from IPC handles to opened IPC pointers
+  std::map<IPC_KEY, char*> ipc_handles_;
+
+  /**
+   * meta is a pointer to device metadata and temporary buffer for allreduce.
+   *
+   * There's a total of sizeof(Signal) of prefix before the actual data,
+   * so meta + 1 points to actual temporary buffer.
+   *
+   * note: this class does not own any device memory. Any required buffers
+   * are passed in from the constructor
+   */
+  CustomAllreduce(
+      Signal* meta,
+      void* rank_data,
+      size_t rank_data_sz,
+      const hipIpcMemHandle_t* handles,
+      const std::vector<int64_t>& offsets,
+      int rank,
+      bool full_nvlink = true)
+      : rank_(rank),
+        world_size_(offsets.size()),
+        full_nvlink_(full_nvlink),
+        self_sg_(meta),
+        d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
+        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
+    for (int i = 0; i < world_size_; i++) {
+      Signal* rank_sg;
+      if (i != rank_) {
+        char* handle = open_ipc_handle(&handles[i]);
+        handle += offsets[i];
+        rank_sg = (Signal*)handle;
+      } else {
+        rank_sg = self_sg_;
+      }
+      sg_.signals[i] = rank_sg;
+    }
+  }
+
+  char* open_ipc_handle(const void* ipc_handle) {
+    auto [it, new_handle] = ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr});
+    if (new_handle) {
+      char* ipc_ptr;
+      CUDACHECK(hipIpcOpenMemHandle(
+          (void**)&ipc_ptr, *((const hipIpcMemHandle_t*)ipc_handle), hipIpcMemLazyEnablePeerAccess));
+      it->second = ipc_ptr;
+    }
+    return it->second;
+  }
+
+  std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta() {
+    auto num_buffers = graph_unreg_buffers_.size();
+    auto handle_sz = sizeof(hipIpcMemHandle_t);
+    std::vector<uint8_t> handles(handle_sz * num_buffers, 0);
+    std::vector<int64_t> offsets(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto ptr = graph_unreg_buffers_[i];
+      void* base_ptr;
+      // note: must share the base address of each allocation, or we get wrong
+      // address
+      if (hipPointerGetAttribute(
+              &base_ptr,
+#ifdef USE_ROCM
+              HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+#else
+              CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+#endif
+              (hipDeviceptr_t)ptr) != hipSuccess)
+        throw std::runtime_error("failed to get pointer attr");
+      CUDACHECK(hipIpcGetMemHandle((hipIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
+      offsets[i] = ((char*)ptr) - ((char*)base_ptr);
+    }
+    return std::make_pair(handles, offsets);
+  }
+
+  void check_rank_data_capacity(size_t num = 1) {
+    if (d_rank_data_base_ + num > d_rank_data_end_)
+      throw std::runtime_error(
+          "Rank data buffer is overflowed by " + std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
+  }
+
+  void register_buffer(const std::vector<std::string>& handles, const std::vector<int64_t>& offsets, void* self) {
+    check_rank_data_capacity();
+    RankData data;
+    for (int i = 0; i < world_size_; i++) {
+      if (i != rank_) {
+        char* handle = open_ipc_handle(handles[i].data());
+        handle += offsets[i];
+        data.ptrs[i] = handle;
+      } else {
+        data.ptrs[i] = self;
+      }
+    }
+    auto d_data = d_rank_data_base_++;
+    CUDACHECK(hipMemcpy(d_data, &data, sizeof(RankData), hipMemcpyHostToDevice));
+    buffers_[self] = d_data;
+  }
+
+  // note: when registering graph buffers, we intentionally choose to not
+  // deduplicate the addresses. That means if the allocator reuses some
+  // addresses, they will be registered again. This is to account for the remote
+  // possibility of different allocation patterns between ranks. For example,
+  // rank 1 may get the same input address for the second allreduce, but rank 2
+  // got a different address. IPC handles have internal reference counting
+  // mechanism so overhead should be small.
+  void
+  register_graph_buffers(const std::vector<std::string>& handles, const std::vector<std::vector<int64_t>>& offsets) {
+    auto num_buffers = graph_unreg_buffers_.size();
+    check_rank_data_capacity(num_buffers);
+    std::vector<RankData> rank_data(num_buffers);
+    for (int i = 0; i < num_buffers; i++) {
+      auto self_ptr = graph_unreg_buffers_[i];
+      auto& rd = rank_data[i];
+      for (int j = 0; j < world_size_; j++) {
+        if (j != rank_) {
+          char* handle = open_ipc_handle(&handles[j][i * sizeof(hipIpcMemHandle_t)]);
+          handle += offsets[j][i];
+          rd.ptrs[j] = handle;
+        } else {
+          rd.ptrs[j] = self_ptr;
+        }
+      }
+    }
+    CUDACHECK(hipMemcpy(d_rank_data_base_, rank_data.data(), sizeof(RankData) * num_buffers, hipMemcpyHostToDevice));
+    d_rank_data_base_ += num_buffers;
+    graph_unreg_buffers_.clear();
+  }
+
+  /**
+   * This is the result after careful grid search. Using 36 blocks give the best
+   * or close to the best runtime on the devices I tried: A100, A10, A30, T4,
+   * V100. You'll notice that NCCL kernels also only take a small amount of SMs.
+   * Not quite sure the underlying reason, but my guess is that too many SMs
+   * will cause contention on NVLink bus.
+   */
+  template <typename T>
+  void allreduce(
+      hipStream_t stream,
+      T* input,
+      T* output,
+      int size,
+#ifndef USE_ROCM
+      int threads = 512,
+      int block_limit = 36){
+#else
+      int threads = 512,
+      int block_limit = 16) {
+#endif
+      auto d = packed_t<T>::P::size;
+  if (size % d != 0)
+    throw std::runtime_error(
+        "custom allreduce currently requires input length to be multiple "
+        "of " +
+        std::to_string(d));
+  if (block_limit > kMaxBlocks)
+    throw std::runtime_error(
+        "max supported block limit is " + std::to_string(kMaxBlocks) + ". Got " + std::to_string(block_limit));
+
+  RankData* ptrs;
+  hipStreamCaptureStatus status;
+  CUDACHECK(hipStreamIsCapturing(stream, &status));
+  if (status == hipStreamCaptureStatusActive) {
+    ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
+    graph_unreg_buffers_.push_back(input);
+  } else {
+    auto it = buffers_.find(input);
+    if (it == buffers_.end())
+      throw std::runtime_error(
+          "buffer address " + std::to_string(reinterpret_cast<uint64_t>(input)) + " is not registered!");
+    ptrs = it->second;
+  }
+
+  size /= d;
+  auto bytes = size * sizeof(typename packed_t<T>::P);
+  int blocks = ::min(block_limit, (size + threads - 1) / threads);
+#define KL(ngpus, name) \
+  hipLaunchKernelGGL(   \
+      (name<T, ngpus>), dim3(blocks), dim3(threads), 0, stream, ptrs, sg_, self_sg_, output, rank_, size);
+#define REDUCE_CASE(ngpus)                                                                        \
+  case ngpus: {                                                                                   \
+    if (world_size_ == 2) {                                                                       \
+      KL(ngpus, cross_device_reduce_1stage);                                                      \
+    } else if (full_nvlink_) {                                                                    \
+      if ((world_size_ <= 4 && bytes < 512 * 1024) || (world_size_ <= 8 && bytes < 256 * 1024)) { \
+        KL(ngpus, cross_device_reduce_1stage);                                                    \
+      } else {                                                                                    \
+        KL(ngpus, cross_device_reduce_2stage);                                                    \
+      }                                                                                           \
+    }                                                                                             \
+    break;                                                                                        \
+  }
+
+  switch (world_size_) {
+    REDUCE_CASE(2)
+    REDUCE_CASE(4)
+    REDUCE_CASE(6)
+    REDUCE_CASE(8)
+    default:
+      throw std::runtime_error(
+          "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
+          "gpus = " +
+          std::to_string(world_size_));
+  }
+#undef REDUCE_CASE
+#undef KL
+}
+
+~CustomAllreduce() {
+  for (auto [_, ptr] : ipc_handles_) {
+    CUDACHECK(hipIpcCloseMemHandle(ptr));
+  }
+}
+};  // namespace sglang
+/**
+ * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
+ a template instantiation:
+ * template void sglang::CustomAllreduce::allreduce<half>(hipStream_t, half *,
+ half *, int, int, int);
+*/
+}  // namespace sglang
diff --git a/sgl-kernel/csrc/allreduce/mscclpp_allreduce.cu b/sgl-kernel/csrc/allreduce/mscclpp_allreduce.cu
new file mode 100644
index 000000000..d9cda22f0
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/mscclpp_allreduce.cu
@@ -0,0 +1,140 @@
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include <torch/all.h>
+#include <torch/library.h>
+
+#include "mscclpp_allreduce.cuh"
+
+enum MscclContextSelection {
+  MSCCL1NODELL = 1,
+  MSCCL2NODELL = 2,
+};
+
+class MscclContext {
+ public:
+  MscclContextSelection selection_;
+  std::shared_ptr<sglang::Msccl1NodeLLcontext> msccl_1nodeLL_context;
+  std::shared_ptr<sglang::Msccl2NodeLLcontext> msccl_2nodeLL_context;
+  MscclContext(MscclContextSelection selection) : selection_(selection) {}
+  template <typename T>
+  void allreduce(
+      cudaStream_t stream, T* input, T* output, const size_t input_numel, int threads = 512, int block_limit = 21) {
+    if (selection_ == MSCCL1NODELL) {
+      msccl_1nodeLL_context->allreduce<T>(stream, input, output, input_numel, threads, block_limit);
+    } else if (selection_ == MSCCL2NODELL) {
+      msccl_2nodeLL_context->allreduce<T>(stream, input, output, input_numel, threads, block_limit);
+    }
+  }
+};
+
+using fptr_t = int64_t;
+static_assert(sizeof(void*) == sizeof(fptr_t));
+
+torch::Tensor _unique_id2tensor(const mscclpp::UniqueId& unique_id) {
+  auto options = torch::TensorOptions().dtype(torch::kByte).device(torch::kCPU);
+  auto tensor = torch::empty({static_cast<int64_t>(unique_id.size())}, options);
+  std::memcpy(tensor.data_ptr<uint8_t>(), unique_id.data(), unique_id.size());
+  return tensor;
+}
+
+// Function to convert vector of int32_t back to array of uint8_t
+mscclpp::UniqueId _tensor2unique_id(const torch::Tensor& tensor) {
+  mscclpp::UniqueId unique_id;
+  std::memcpy(unique_id.data(), tensor.data_ptr<uint8_t>(), unique_id.size());
+  return unique_id;
+}
+
+torch::Tensor mscclpp_generate_unique_id() {
+  mscclpp::UniqueId unique_id = mscclpp::TcpBootstrap::createUniqueId();
+  return _unique_id2tensor(unique_id);
+}
+
+fptr_t mscclpp_init_context(
+    const torch::Tensor& unique_id,
+    const int64_t rank,
+    const int64_t world_size,
+    torch::Tensor& scratch,
+    torch::Tensor& put_buffer,
+    const int64_t nranks_per_node,
+    const std::vector<int64_t>& rank_to_node,
+    const std::vector<int64_t>& rank_to_ib,
+    const int64_t context_selection) {
+  MscclContext* context_ptr = new MscclContext(static_cast<MscclContextSelection>(context_selection));
+  mscclpp::UniqueId uid = _tensor2unique_id(unique_id);
+  if (context_selection == MSCCL1NODELL) {
+    void* scratch_ptr = reinterpret_cast<void*>(scratch.data_ptr());
+    const size_t scratch_bytes = scratch.numel() * scratch.element_size();
+    context_ptr->msccl_1nodeLL_context = std::make_shared<sglang::Msccl1NodeLLcontext>(
+        uid, rank, world_size, scratch_ptr, scratch_bytes, nranks_per_node, rank_to_node, rank_to_ib);
+  } else if (context_selection == MSCCL2NODELL) {
+    void* scratch_ptr = reinterpret_cast<void*>(scratch.data_ptr());
+    const size_t scratch_bytes = scratch.numel() * scratch.element_size();
+    void* put_buffer_ptr = reinterpret_cast<void*>(put_buffer.data_ptr());
+    const size_t put_buffer_bytes = put_buffer.numel() * put_buffer.element_size();
+    context_ptr->msccl_2nodeLL_context = std::make_shared<sglang::Msccl2NodeLLcontext>(
+        uid,
+        rank,
+        world_size,
+        scratch_ptr,
+        scratch_bytes,
+        put_buffer_ptr,
+        put_buffer_bytes,
+        nranks_per_node,
+        rank_to_node,
+        rank_to_ib);
+  } else {
+    throw std::runtime_error("invalid context selection");
+  }
+  return (fptr_t)context_ptr;
+}
+
+bool _mscclpp_is_weak_contiguous(torch::Tensor& t) {
+  return t.is_contiguous() ||
+         (t.storage().nbytes() - t.storage_offset() * t.element_size() == t.numel() * t.element_size());
+}
+void mscclpp_allreduce(fptr_t _context, torch::Tensor& inp, torch::Tensor& out, int64_t nthreads, int64_t nblocks) {
+  MscclContext* context = reinterpret_cast<MscclContext*>(_context);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
+  auto stream = c10::cuda::getCurrentCUDAStream().stream();
+
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK_EQ(inp.numel(), out.numel());
+  TORCH_CHECK(_mscclpp_is_weak_contiguous(out));
+  TORCH_CHECK(_mscclpp_is_weak_contiguous(inp));
+  switch (out.scalar_type()) {
+    case at::ScalarType::Float: {
+      context->allreduce<float>(
+          stream,
+          reinterpret_cast<float*>(inp.data_ptr()),
+          reinterpret_cast<float*>(out.data_ptr()),
+          inp.numel(),
+          nthreads,
+          nblocks);
+      break;
+    }
+    case at::ScalarType::Half: {
+      context->allreduce<half>(
+          stream,
+          reinterpret_cast<half*>(inp.data_ptr()),
+          reinterpret_cast<half*>(out.data_ptr()),
+          inp.numel(),
+          nthreads,
+          nblocks);
+      break;
+    }
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+    case at::ScalarType::BFloat16: {
+      context->allreduce<__nv_bfloat16>(
+          stream,
+          reinterpret_cast<__nv_bfloat16*>(inp.data_ptr()),
+          reinterpret_cast<__nv_bfloat16*>(out.data_ptr()),
+          inp.numel(),
+          nthreads,
+          nblocks);
+      break;
+    }
+#endif
+    default:
+      throw std::runtime_error("custom allreduce only supports float32, float16 and bfloat16");
+  }
+}
diff --git a/sgl-kernel/csrc/allreduce/mscclpp_allreduce.cuh b/sgl-kernel/csrc/allreduce/mscclpp_allreduce.cuh
new file mode 100644
index 000000000..ba0bc33fd
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/mscclpp_allreduce.cuh
@@ -0,0 +1,779 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+#pragma once
+#ifdef USE_ROCM
+#include <hip/hip_fp16.h>
+#else
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#endif
+
+#include <mscclpp/concurrency_device.hpp>
+#include <mscclpp/core.hpp>
+#include <mscclpp/memory_channel.hpp>
+#include <mscclpp/memory_channel_device.hpp>
+#include <mscclpp/nvls_device.hpp>
+#include <mscclpp/port_channel.hpp>
+#include <mscclpp/port_channel_device.hpp>
+
+// comment this for test_mscclpp_allreduce.cu
+#include "utils.h"
+
+namespace sglang {
+
+__device__ mscclpp::DeviceSyncer deviceSyncer;
+__device__ mscclpp::DeviceSyncer allGatherDeviceSyncer;
+__device__ mscclpp::DeviceSyncer reduceScatterDeviceSyncer;
+__device__ mscclpp::DeviceSyncer ibDeviceSyncer;
+
+template <typename To, typename From>
+__forceinline__ __device__ To bit_cast(const From& src) {
+  static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast");
+
+  union {
+    From f;
+    To t;
+  } u;
+  u.f = src;
+  return u.t;
+}
+
+template <typename T>
+__forceinline__ __device__ T add_elements(T a, T b) {
+  return a + b;
+}
+
+template <>
+__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) {
+  return __hadd2(a, b);
+}
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+template <>
+__forceinline__ __device__ __nv_bfloat162 add_elements(__nv_bfloat162 a, __nv_bfloat162 b) {
+  return __hadd2(a, b);
+}
+#endif
+
+template <typename T>
+__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) {
+  int4 ret;
+  ret.w = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
+  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
+  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
+  ret.z = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
+  return ret;
+}
+
+template <typename T>
+__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+template <>
+__forceinline__ __device__ int4 add_vectors<__nv_bfloat16>(int4 a, int4 b) {
+  return add_vectors_helper<__nv_bfloat162>(a, b);
+}
+#endif
+
+template <>
+__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) {
+  uint2 ret;
+  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
+  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
+  return ret;
+}
+
+template <typename T>
+__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+template <>
+__forceinline__ __device__ uint2 add_vectors<__nv_bfloat16>(uint2 a, uint2 b) {
+  return add_vectors_helper<__nv_bfloat162>(a, b);
+}
+#endif
+
+template <>
+__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+template <typename T>
+__forceinline__ __device__ int add_vectors_helper(int a, int b) {
+  return bit_cast<int, T>(add_elements(bit_cast<T, int>(a), bit_cast<T, int>(b)));
+}
+
+template <typename T>
+__forceinline__ __device__ int add_vectors(int a, int b) {
+  return add_vectors_helper<T>(a, b);
+}
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+template <>
+__forceinline__ __device__ int add_vectors<__nv_bfloat16>(int a, int b) {
+  return add_vectors_helper<__nv_bfloat162>(a, b);
+}
+#endif
+
+template <>
+__forceinline__ __device__ int add_vectors<__half>(int a, int b) {
+  return add_vectors_helper<__half2>(a, b);
+}
+
+// -------------------------------------------------------
+// allreduce_LL_1node using LLPacket, origin allreduce2
+// -------------------------------------------------------
+
+__device__ uint64_t globalFlag = 1;
+
+template <typename TYPE>
+__global__ void __launch_bounds__(1024, 1) allreduce_LL_1node(
+    mscclpp::MemoryChannelDeviceHandle* memChans,
+    TYPE* buff,
+    TYPE* scratch,
+    void* resultBuff,
+    int rank,
+    int worldSize,
+    size_t nelems) {
+  nelems = nelems / (sizeof(int) / sizeof(TYPE));
+  // This version of allreduce only works for single nodes
+  const int nPeers = worldSize - 1;
+  const size_t nPkts = nelems / 2;
+  const int nelemsPerRank = nelems / worldSize;
+  const int nPktsPerRank = nelemsPerRank / 2;
+  // flag for packets. Initially 1
+  const uint32_t flag = (uint32_t)globalFlag;
+  // thread block & channel info
+  const int nBlocksPerPeer = gridDim.x / nPeers;
+  const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
+  const int peerIdx = blockIdx.x / nBlocksPerPeer;
+  const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1;
+  mscclpp::MemoryChannelDeviceHandle memChan = memChans[peerIdx];
+  const int tid = threadIdx.x + localBlockIdx * blockDim.x;
+  // double buffering
+  size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket);
+  void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset);
+  size_t scratchOffset = scratchBaseOffset + rank * nPktsPerRank * sizeof(mscclpp::LLPacket);
+  size_t scratchResultOffset =
+      (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket);
+  size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int);
+  uint2* src = (uint2*)((char*)buff + rank * nelemsPerRank * sizeof(int));
+  uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));
+
+  // step 1: write to scratch buffer
+  memChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag);
+  // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
+    uint2 data = make_uint2(0, 0);
+    for (int index = 0; index < nPeers; index++) {
+      const int remoteRank = index < rank ? index : index + 1;
+      mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
+      uint2 val = dstPkt[idx].read(flag);
+      data = add_vectors<TYPE>(val, data);
+    }
+    data = add_vectors<TYPE>(data, src[idx]);
+    dst[idx] = data;
+
+    mscclpp::LLPacket packet;
+    packet.data1 = data.x;
+    packet.flag1 = flag;
+    packet.data2 = data.y;
+    packet.flag2 = flag;
+    size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
+    for (int index = 0; index < nPeers; index++) {
+      memChans[index].write(offset, packet);
+    }
+  }
+  // step 3: get data result from scratch buffer
+  mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset);
+  const int dstOffset = remoteRank * nPktsPerRank;
+  uint2* result = (uint2*)((char*)resultBuff + remoteRank * nelemsPerRank * sizeof(int));
+  for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * nBlocksPerPeer) {
+    uint2 data = dstPkt[idx + dstOffset].read(flag);
+    result[idx].x = data.x;
+    result[idx].y = data.y;
+  }
+  if (threadIdx.x == 0 && blockIdx.x == 0) {
+    globalFlag += 1;
+  }
+}
+
+// -------------------------------------------------------
+// allreduce_LL_2node using LLPacket, origin allreduce5
+// -------------------------------------------------------
+
+template <typename TYPE>
+__global__ void __launch_bounds__(1024, 1) allreduce_LL_2node(
+    mscclpp::MemoryChannelDeviceHandle* memChans,
+    mscclpp::PortChannelDeviceHandle* portChans,
+    TYPE* buff,
+    TYPE* scratch,
+    TYPE* putBuff,
+    TYPE* resultBuff,
+    int rank,
+    int nRanksPerNode,
+    int worldSize,
+    size_t nelems) {
+  nelems = nelems / (sizeof(int) / sizeof(TYPE));
+  // This version of allreduce only works for single nodes
+  const int nPeersInNode = nRanksPerNode - 1;
+  const int nPkts = nelems / 2;
+  const int nelemsPerLocalRank = nelems / nRanksPerNode;
+  const int nPktsPerLocalRank = nelemsPerLocalRank / 2;
+  const int localRankId = rank % nRanksPerNode;
+  // flag for packets. Initially 1
+  const uint32_t flag = (uint32_t)globalFlag;
+  // thread block & channel info
+  const int nBlocksPerPeer = gridDim.x / nPeersInNode;
+  const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
+  const int peerIdx = blockIdx.x / nBlocksPerPeer;
+  const int remoteRankIdx = peerIdx < localRankId ? peerIdx : peerIdx + 1;
+  mscclpp::MemoryChannelDeviceHandle memChan = memChans[peerIdx];
+  mscclpp::PortChannelDeviceHandle portChan = portChans[localRankId];
+  const int tid = threadIdx.x + localBlockIdx * blockDim.x;
+  // double buffering
+  size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket);
+  size_t putBaseOffset = (flag & 1) ? 0 : nPktsPerLocalRank * sizeof(mscclpp::LLPacket);
+  void* scratchBuff = (void*)((char*)scratch + scratchBaseOffset);
+  size_t scratchOffset = scratchBaseOffset + localRankId * nPktsPerLocalRank * sizeof(mscclpp::LLPacket);
+  size_t scratchResultOffset =
+      (flag & 1) ? 2 * nPkts * sizeof(mscclpp::LLPacket) : 3 * nPkts * sizeof(mscclpp::LLPacket);
+  size_t srcOffset = remoteRankIdx * nelemsPerLocalRank * sizeof(int);
+  uint2* src = (uint2*)((char*)buff + localRankId * nelemsPerLocalRank * sizeof(int));
+  uint2* dst = (uint2*)((char*)resultBuff + localRankId * nelemsPerLocalRank * sizeof(int));
+
+  // step 1: write to scratch buffer
+  if (nRanksPerNode > 1) {
+    memChan.putPackets(
+        scratchOffset, srcOffset, nelemsPerLocalRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag);
+  }
+  // step 2: get data from scratch buffer, do local reduce-scatter in each node.
+  mscclpp::LLPacket* putPkt = (mscclpp::LLPacket*)((char*)putBuff + putBaseOffset);
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerLocalRank; idx += blockDim.x * gridDim.x) {
+    uint2 data = make_uint2(0, 0);
+    for (int index = 0; index < nPeersInNode; index++) {
+      const int remoteRank = index < localRankId ? index : index + 1;
+      mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerLocalRank;
+      uint2 val = dstPkt[idx].read(flag);
+      data = add_vectors<TYPE>(val, data);
+    }
+    data = add_vectors<TYPE>(data, src[idx]);
+    putPkt[idx].write(data.x, data.y, flag);
+    dst[idx] = data;
+  }
+  deviceSyncer.sync(gridDim.x);
+  // step 3. send local reduced data to remote node.
+  if (threadIdx.x == 0 && blockIdx.x == 0) {
+    portChan.put(scratchOffset, putBaseOffset, nPktsPerLocalRank * sizeof(mscclpp::LLPacket));
+    if ((flag & 63) == 0) {
+      portChan.flush();
+    }
+  }
+  // step 4. try to read the data from scratch buffer and write to local peers
+  mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + localRankId * nPktsPerLocalRank;
+  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerLocalRank; idx += blockDim.x * gridDim.x) {
+    uint2 res = dst[idx];
+    uint2 val = dstPkt[idx].read(flag);
+    res = add_vectors<TYPE>(res, val);
+
+    mscclpp::LLPacket packet;
+    packet.data1 = res.x;
+    packet.flag1 = flag;
+    packet.data2 = res.y;
+    packet.flag2 = flag;
+    size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + localRankId * nPktsPerLocalRank);
+    for (int index = 0; index < nPeersInNode; index++) {
+      memChans[index].write(offset, packet);
+    }
+    dst[idx] = res;
+  }
+
+  // step 5: get data result from scratch buffer
+  dstPkt = (mscclpp::LLPacket*)((char*)scratch + scratchResultOffset);
+  const int dstOffset = remoteRankIdx * nPktsPerLocalRank;
+  uint2* result = (uint2*)((char*)resultBuff + remoteRankIdx * nelemsPerLocalRank * sizeof(int));
+  if (nRanksPerNode > 1) {
+    for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerLocalRank;
+         idx += blockDim.x * nBlocksPerPeer) {
+      uint2 data = dstPkt[idx + dstOffset].read(flag);
+      result[idx] = data;
+    }
+  }
+  if (threadIdx.x == 0 && blockIdx.x == 0) {
+    globalFlag += 1;
+  }
+}
+
+static const mscclpp::Transport IBs[] = {
+    mscclpp::Transport::IB0,
+    mscclpp::Transport::IB1,
+    mscclpp::Transport::IB2,
+    mscclpp::Transport::IB3,
+    mscclpp::Transport::IB4,
+    mscclpp::Transport::IB5,
+    mscclpp::Transport::IB6,
+    mscclpp::Transport::IB7};
+
+class MscclCommGroup {
+ public:
+  std::shared_ptr<mscclpp::Communicator> comm_;
+  const size_t rank_;
+  const size_t world_size_;
+  const std::vector<int64_t> rank_to_node_;
+  const std::vector<int64_t> rank_to_ib_;
+  MscclCommGroup(
+      mscclpp::UniqueId unique_id,
+      const size_t rank,
+      const size_t world_size,
+      const std::vector<int64_t>& rank_to_node,
+      const std::vector<int64_t>& rank_to_ib)
+      : rank_(rank), world_size_(world_size), rank_to_node_(rank_to_node), rank_to_ib_(rank_to_ib) {
+    auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, world_size);
+    bootstrap->initialize(unique_id);
+    comm_ = std::make_shared<mscclpp::Communicator>(bootstrap);
+  }
+  template <typename T>
+  void allreduce(cudaStream_t stream, T* output, size_t input_numel, int threads = 512, int block_limit = 21) {
+    throw std::runtime_error("you should not call allreduce of a base context");
+  }
+  bool is_same_node(int r1, int r2) {
+    return rank_to_node_[r1] == rank_to_node_[r2];
+  }
+
+  void make_connection(
+      std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>& same_node_connections,
+      std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>& cross_node_connections) {
+    same_node_connections.clear();
+    cross_node_connections.clear();
+    std::unordered_map<int, mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> conn_futures;
+    for (int r = 0; r < world_size_; ++r) {
+      if (r == rank_) continue;
+      mscclpp::Transport transport = is_same_node(r, rank_) ? mscclpp::Transport::CudaIpc : IBs[rank_to_ib_[r]];
+      conn_futures.emplace(r, comm_->connectOnSetup(r, 0, transport));
+    }
+    comm_->setup();
+    for (int r = 0; r < world_size_; ++r) {
+      if (r == rank_) continue;
+      if (is_same_node(r, rank_)) {
+        same_node_connections.emplace(r, conn_futures[r].get());
+      } else {
+        cross_node_connections.emplace(r, conn_futures[r].get());
+      }
+    }
+  }
+
+  void make_memory_channels_with_scratch(
+      void* tensor_ptr,
+      const size_t tensor_bytes,
+      void* scratch_ptr,
+      const size_t scratch_bytes,
+      const std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>& connections,
+      std::unordered_map<int, std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>>& semaphores,
+      std::unordered_map<int, mscclpp::RegisteredMemory>& registered_memories,
+      std::unordered_map<int, mscclpp::MemoryChannel>& channels) {
+    channels.clear();
+    make_semaphores<mscclpp::MemoryDevice2DeviceSemaphore>(connections, semaphores);
+    register_tensor_with_connections(scratch_ptr, scratch_bytes, connections, registered_memories);
+    for (const auto& [peer, _] : connections) {
+      channels.emplace(
+          peer, mscclpp::MemoryChannel(semaphores[peer], registered_memories[peer], tensor_ptr, scratch_ptr));
+    }
+  }
+  void make_port_channels_with_scratch(
+      std::shared_ptr<mscclpp::ProxyService> proxyService,
+      void* tensor_ptr,
+      const size_t tensor_bytes,
+      void* scratch_ptr,
+      const size_t scratch_bytes,
+      const std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>& connections,
+      std::unordered_map<int, std::shared_ptr<mscclpp::Host2DeviceSemaphore>>& semaphores,
+      std::unordered_map<int, mscclpp::RegisteredMemory>& registered_memories,
+      std::unordered_map<int, mscclpp::PortChannel>& channels) {
+    channels.clear();
+    make_semaphores<mscclpp::Host2DeviceSemaphore>(connections, semaphores);
+
+    mscclpp::TransportFlags flags;
+    for (const auto& [_, conn] : connections) {
+      flags |= conn->transport();
+    }
+    auto local_reg_memory = comm_->registerMemory(tensor_ptr, tensor_bytes, flags);
+
+    register_tensor_with_connections(scratch_ptr, scratch_bytes, connections, registered_memories);
+    std::unordered_map<int, mscclpp::SemaphoreId> semaphore_ids;
+    std::unordered_map<int, size_t> memory_ids;
+    memory_ids[rank_] = proxyService->addMemory(local_reg_memory);
+    for (const auto& [peer, memory] : registered_memories) {
+      if (peer == rank_) continue;
+      memory_ids[peer] = proxyService->addMemory(memory);
+    }
+    for (const auto& [peer, semaphore] : semaphores) {
+      semaphore_ids[peer] = proxyService->addSemaphore(semaphore);
+    }
+
+    for (const auto& [peer, _] : connections) {
+      channels.emplace(peer, proxyService->portChannel(semaphore_ids[peer], memory_ids[peer], memory_ids[rank_]));
+    }
+  }
+
+  template <typename SemaphoreType>
+  void make_semaphores(
+      const std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>& connections,
+      std::unordered_map<int, std::shared_ptr<SemaphoreType>>& semaphores) {
+    semaphores.clear();
+    for (const auto& [peer, conn] : connections) {
+      semaphores[peer] = std::make_shared<SemaphoreType>(*comm_, conn);
+    }
+    comm_->setup();
+  }
+
+  void register_tensor_with_connections(
+      void* tensor_ptr,
+      size_t tensor_bytes,
+      const std::unordered_map<int, std::shared_ptr<mscclpp::Connection>>& connections,
+      std::unordered_map<int, mscclpp::RegisteredMemory>& registered_memories) {
+    registered_memories.clear();
+    mscclpp::TransportFlags all_transports;
+    for (const auto& [_, connection] : connections) {
+      all_transports |= connection->transport();
+    }
+    mscclpp::RegisteredMemory buf_reg_mem = comm_->registerMemory(tensor_ptr, tensor_bytes, all_transports);
+    registered_memories[rank_] = buf_reg_mem;
+
+    std::unordered_map<int, mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remote_mem_futures;
+    for (const auto& [r, connection] : connections) {
+      comm_->sendMemoryOnSetup(buf_reg_mem, r, 0);
+      auto remoteMemory = comm_->recvMemoryOnSetup(r, 0);
+      remote_mem_futures.emplace(r, remoteMemory);
+    }
+    comm_->setup();
+    for (auto& [r, mem_feature] : remote_mem_futures) {
+      registered_memories.emplace(r, mem_feature.get());
+    }
+  }
+
+  void make_device_memory_handle_base_on_new_ptr(
+      const std::unordered_map<int, mscclpp::MemoryChannel>& old_memory_channels,
+      std::unordered_map<int, mscclpp::RegisteredMemory>& registered_sm_memories,
+      std::unordered_map<int, std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>>& memory_semaphores,
+      std::unordered_map<int, mscclpp::MemoryChannel>& memory_channels,
+      mscclpp::GpuBuffer<mscclpp::MemoryChannelDeviceHandle>& device_memory_handle,
+      void* input,
+      void* scratch,
+      const cudaStream_t stream) {
+    memory_channels.clear();
+    for (const auto& [peer, channel] : old_memory_channels) {
+      memory_channels.emplace(
+          peer, mscclpp::MemoryChannel(memory_semaphores[peer], registered_sm_memories[peer], input, scratch));
+    }
+    std::vector<mscclpp::MemoryChannel> memory_channels_list;
+    for (int r = 0; r < world_size_; r++) {
+      if (r == rank_) continue;
+      if (is_same_node(r, rank_)) {
+        memory_channels_list.push_back(memory_channels[r]);
+      }
+    }
+    std::vector<mscclpp::MemoryChannelDeviceHandle> memory_channel_handlers(memory_channels_list.size());
+    std::transform(
+        memory_channels_list.begin(),
+        memory_channels_list.end(),
+        memory_channel_handlers.begin(),
+        [](const mscclpp::MemoryChannel& channel) { return channel.deviceHandle(); });
+    mscclpp::gpuMemcpyAsync<mscclpp::MemoryChannelDeviceHandle>(
+        device_memory_handle.data(),
+        memory_channel_handlers.data(),
+        memory_channel_handlers.size(),
+        stream,
+        cudaMemcpyHostToDevice);
+  }
+};
+
+class Msccl1NodeLLcontext {
+ private:
+  std::shared_ptr<MscclCommGroup> comm_group_ = nullptr;
+  void* scratch_;
+  const size_t scratch_bytes_;
+  std::unordered_map<int, std::shared_ptr<mscclpp::Connection>> same_node_connections_;
+  std::unordered_map<int, std::shared_ptr<mscclpp::Connection>> cross_node_connections_;
+
+  std::unordered_map<int, mscclpp::RegisteredMemory> registered_sm_memories_;
+  std::unordered_map<int, std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memory_semaphores_;
+  std::unordered_map<int, mscclpp::MemoryChannel> memory_channels_;
+  mscclpp::GpuBuffer<mscclpp::MemoryChannelDeviceHandle> d_memHandles_;
+  std::unordered_map<void*, std::unordered_map<int, mscclpp::MemoryChannel>> input_ptr2memory_channels_;
+  std::unordered_map<void*, mscclpp::GpuBuffer<mscclpp::MemoryChannelDeviceHandle>> input_ptr2d_memHandles_;
+  cudaStream_t h2d_stream;
+  const size_t nranks_per_node_;
+
+ public:
+  Msccl1NodeLLcontext(
+      mscclpp::UniqueId unique_id,
+      const size_t rank,
+      const size_t world_size,
+      void* scratch,
+      const size_t scratch_bytes,
+      const size_t nranks_per_node,
+      const std::vector<int64_t>& rank_to_node,
+      const std::vector<int64_t>& rank_to_ib)
+      : scratch_(scratch),
+        scratch_bytes_(scratch_bytes),
+        nranks_per_node_(nranks_per_node),
+        d_memHandles_(nranks_per_node - 1) {
+    CHECK_CUDA_SUCCESS(cudaStreamCreateWithFlags(&h2d_stream, cudaStreamNonBlocking));
+    comm_group_ = std::make_shared<MscclCommGroup>(unique_id, rank, world_size, rank_to_node, rank_to_ib);
+    comm_group_->make_connection(same_node_connections_, cross_node_connections_);
+    comm_group_->make_memory_channels_with_scratch(
+        scratch_,
+        scratch_bytes_,
+        scratch_,
+        scratch_bytes_,
+        same_node_connections_,
+        memory_semaphores_,
+        registered_sm_memories_,
+        memory_channels_);
+    std::vector<mscclpp::MemoryChannel> memory_channels_list;
+    for (int r = 0; r < comm_group_->world_size_; r++) {
+      if (r == comm_group_->rank_) continue;
+      memory_channels_list.push_back(memory_channels_[r]);
+    }
+    std::vector<mscclpp::MemoryChannelDeviceHandle> memory_channel_handlers(memory_channels_list.size());
+    std::transform(
+        memory_channels_list.begin(),
+        memory_channels_list.end(),
+        memory_channel_handlers.begin(),
+        [](const mscclpp::MemoryChannel& channel) { return channel.deviceHandle(); });
+    mscclpp::gpuMemcpy<mscclpp::MemoryChannelDeviceHandle>(
+        d_memHandles_.data(), memory_channel_handlers.data(), memory_channel_handlers.size(), cudaMemcpyHostToDevice);
+  }
+
+  ~Msccl1NodeLLcontext() {
+    CHECK_CUDA_SUCCESS(cudaStreamDestroy(h2d_stream));
+  }
+
+  template <typename T>
+  void allreduce(cudaStream_t stream, T* input, T* output, size_t input_numel, int nthreads = 512, int nblocks = 21) {
+    dim3 nthrs(nthreads);
+    dim3 nblks(nblocks);
+    cudaStreamCaptureStatus capturing_status;
+    CHECK_CUDA_SUCCESS(cudaStreamIsCapturing(stream, &capturing_status));
+    mscclpp::MemoryChannelDeviceHandle* memChans;
+    if (capturing_status != cudaStreamCaptureStatusActive) {
+      std::unordered_map<int, mscclpp::MemoryChannel> memory_channels;
+      comm_group_->make_device_memory_handle_base_on_new_ptr(
+          memory_channels_,
+          registered_sm_memories_,
+          memory_semaphores_,
+          memory_channels,
+          d_memHandles_,
+          input,
+          scratch_,
+          h2d_stream);
+      CHECK_CUDA_SUCCESS(cudaStreamSynchronize(h2d_stream));
+      memChans = d_memHandles_.data();
+    } else {
+      void* input_void_ptr = reinterpret_cast<void*>(input);
+      if (input_ptr2d_memHandles_.find(input_void_ptr) == input_ptr2d_memHandles_.end()) {
+        std::unordered_map<int, mscclpp::MemoryChannel> memory_channels;
+        mscclpp::GpuBuffer<mscclpp::MemoryChannelDeviceHandle> device_memory_handle(comm_group_->world_size_ - 1);
+        comm_group_->make_device_memory_handle_base_on_new_ptr(
+            memory_channels_,
+            registered_sm_memories_,
+            memory_semaphores_,
+            memory_channels,
+            device_memory_handle,
+            input,
+            scratch_,
+            h2d_stream);
+        input_ptr2memory_channels_.emplace(input_void_ptr, memory_channels);
+        input_ptr2d_memHandles_.emplace(input_void_ptr, device_memory_handle);
+      }
+      auto it = input_ptr2d_memHandles_.find(input_void_ptr);
+      memChans = it->second.data();
+    }
+    allreduce_LL_1node<T><<<nblks, nthrs, 0, stream>>>(
+        memChans, (T*)input, (T*)scratch_, output, comm_group_->rank_, comm_group_->world_size_, input_numel);
+
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) {
+      printf("rank: %lu failed to launch allreduce_LL_1node: %s\n", comm_group_->rank_, cudaGetErrorString(status));
+    }
+  }
+};
+
+class Msccl2NodeLLcontext {
+ private:
+  std::shared_ptr<MscclCommGroup> comm_group_ = nullptr;
+  void* scratch_;
+  const size_t scratch_bytes_;
+  void* put_buffer_;
+  const size_t put_buffer_bytes_;
+  std::unordered_map<int, std::shared_ptr<mscclpp::Connection>> same_node_connections_;
+  std::unordered_map<int, std::shared_ptr<mscclpp::Connection>> cross_node_connections_;
+
+  std::unordered_map<int, mscclpp::RegisteredMemory> registered_sm_memories_;
+  std::unordered_map<int, mscclpp::RegisteredMemory> registered_port_memories_;
+
+  std::unordered_map<int, std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memory_semaphores_;
+  std::unordered_map<int, std::shared_ptr<mscclpp::Host2DeviceSemaphore>> port_semaphores_;
+
+  std::unordered_map<int, mscclpp::MemoryChannel> memory_channels_;
+  std::unordered_map<int, mscclpp::PortChannel> port_channels_;
+
+  mscclpp::GpuBuffer<mscclpp::MemoryChannelDeviceHandle> d_memHandles_;
+  mscclpp::GpuBuffer<mscclpp::PortChannelDeviceHandle> d_portHandles_;
+
+  std::shared_ptr<mscclpp::ProxyService> proxyService;
+  cudaStream_t h2d_stream;
+  const size_t nranks_per_node_;
+
+  std::unordered_map<void*, std::unordered_map<int, mscclpp::MemoryChannel>> input_ptr2memory_channels_;
+  std::unordered_map<void*, mscclpp::GpuBuffer<mscclpp::MemoryChannelDeviceHandle>> input_ptr2d_memHandles_;
+
+ public:
+  Msccl2NodeLLcontext(
+      mscclpp::UniqueId unique_id,
+      const size_t rank,
+      const size_t world_size,
+      void* scratch,
+      const size_t scratch_bytes,
+      void* put_buffer,
+      const size_t put_buffer_bytes,
+      const size_t nranks_per_node,
+      const std::vector<int64_t>& rank_to_node,
+      const std::vector<int64_t>& rank_to_ib)
+      : scratch_(scratch),
+        scratch_bytes_(scratch_bytes),
+        put_buffer_(put_buffer),
+        put_buffer_bytes_(put_buffer_bytes),
+        nranks_per_node_(nranks_per_node),
+        d_memHandles_(nranks_per_node - 1),
+        d_portHandles_(world_size - nranks_per_node) {
+    CHECK_CUDA_SUCCESS(cudaStreamCreateWithFlags(&h2d_stream, cudaStreamNonBlocking));
+    comm_group_ = std::make_shared<MscclCommGroup>(unique_id, rank, world_size, rank_to_node, rank_to_ib);
+    proxyService = std::make_shared<mscclpp::ProxyService>();
+    proxyService->startProxy();
+    comm_group_->make_connection(same_node_connections_, cross_node_connections_);
+    comm_group_->make_memory_channels_with_scratch(
+        scratch_,
+        scratch_bytes_,
+        scratch_,
+        scratch_bytes_,
+        same_node_connections_,
+        memory_semaphores_,
+        registered_sm_memories_,
+        memory_channels_);
+    comm_group_->make_port_channels_with_scratch(
+        proxyService,
+        put_buffer_,
+        put_buffer_bytes_,
+        scratch_,
+        scratch_bytes_,
+        cross_node_connections_,
+        port_semaphores_,
+        registered_port_memories_,
+        port_channels_);
+    std::vector<mscclpp::MemoryChannel> memory_channels_list;
+    std::vector<mscclpp::PortChannel> port_channels_list;
+    for (int r = 0; r < comm_group_->world_size_; r++) {
+      if (r == comm_group_->rank_) continue;
+      if (comm_group_->is_same_node(r, comm_group_->rank_)) {
+        memory_channels_list.push_back(memory_channels_[r]);
+      } else {
+        port_channels_list.push_back(port_channels_[r]);
+      }
+    }
+    std::vector<mscclpp::MemoryChannelDeviceHandle> memory_channel_handlers(memory_channels_list.size());
+    std::transform(
+        memory_channels_list.begin(),
+        memory_channels_list.end(),
+        memory_channel_handlers.begin(),
+        [](const mscclpp::MemoryChannel& channel) { return channel.deviceHandle(); });
+    mscclpp::gpuMemcpy<mscclpp::MemoryChannelDeviceHandle>(
+        d_memHandles_.data(), memory_channel_handlers.data(), memory_channel_handlers.size(), cudaMemcpyHostToDevice);
+
+    std::vector<mscclpp::PortChannelDeviceHandle> port_channel_handlers(port_channels_list.size());
+    std::transform(
+        port_channels_list.begin(),
+        port_channels_list.end(),
+        port_channel_handlers.begin(),
+        [](const mscclpp::PortChannel& channel) { return channel.deviceHandle(); });
+    mscclpp::gpuMemcpy<mscclpp::PortChannelDeviceHandle>(
+        d_portHandles_.data(), port_channel_handlers.data(), port_channel_handlers.size(), cudaMemcpyHostToDevice);
+  }
+
+  ~Msccl2NodeLLcontext() {
+    CHECK_CUDA_SUCCESS(cudaStreamDestroy(h2d_stream));
+    if (proxyService) {
+      proxyService->stopProxy();
+    }
+  }
+
+  template <typename T>
+  void
+  allreduce(cudaStream_t stream, T* input, T* output, const size_t input_numel, int nthreads = 512, int nblocks = 21) {
+    dim3 nthrs(nthreads);
+    dim3 nblks(nblocks);
+    cudaStreamCaptureStatus capturing_status;
+    CHECK_CUDA_SUCCESS(cudaStreamIsCapturing(stream, &capturing_status));
+    mscclpp::MemoryChannelDeviceHandle* memChans;
+    if (capturing_status != cudaStreamCaptureStatusActive) {
+      std::unordered_map<int, mscclpp::MemoryChannel> memory_channels;
+      comm_group_->make_device_memory_handle_base_on_new_ptr(
+          memory_channels_,
+          registered_sm_memories_,
+          memory_semaphores_,
+          memory_channels,
+          d_memHandles_,
+          input,
+          scratch_,
+          h2d_stream);
+      CHECK_CUDA_SUCCESS(cudaStreamSynchronize(h2d_stream));
+      memChans = d_memHandles_.data();
+    } else {
+      void* input_void_ptr = reinterpret_cast<void*>(input);
+      if (input_ptr2d_memHandles_.find(input_void_ptr) == input_ptr2d_memHandles_.end()) {
+        std::unordered_map<int, mscclpp::MemoryChannel> memory_channels;
+        mscclpp::GpuBuffer<mscclpp::MemoryChannelDeviceHandle> device_memory_handle(7);
+        comm_group_->make_device_memory_handle_base_on_new_ptr(
+            memory_channels_,
+            registered_sm_memories_,
+            memory_semaphores_,
+            memory_channels,
+            device_memory_handle,
+            input,
+            scratch_,
+            h2d_stream);
+        input_ptr2memory_channels_.emplace(input_void_ptr, memory_channels);
+        input_ptr2d_memHandles_.emplace(input_void_ptr, device_memory_handle);
+      }
+      auto it = input_ptr2d_memHandles_.find(input_void_ptr);
+      memChans = it->second.data();
+    }
+    allreduce_LL_2node<T><<<nblks, nthrs, 0, stream>>>(
+        memChans,
+        d_portHandles_.data(),
+        (T*)input,
+        (T*)scratch_,
+        (T*)put_buffer_,
+        output,
+        comm_group_->rank_,
+        nranks_per_node_,
+        comm_group_->world_size_,
+        input_numel);
+
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) {
+      printf("rank: %lu failed to launch allreduce_LL_2node: %s\n", comm_group_->rank_, cudaGetErrorString(status));
+    }
+  }
+};
+
+}  // namespace sglang
diff --git a/sgl-kernel/csrc/allreduce/quick_all_reduce.cu b/sgl-kernel/csrc/allreduce/quick_all_reduce.cu
new file mode 100644
index 000000000..757c05d2b
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/quick_all_reduce.cu
@@ -0,0 +1,111 @@
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include <torch/all.h>
+
+#ifdef USE_ROCM
+
+#include "quick_all_reduce.h"
+
+quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size, std::optional<int64_t> qr_max_size) {
+  if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported");
+  if (world_size == 6) throw std::invalid_argument("world size == 6 is not supported");
+  if (world_size % 2 != 0) throw std::invalid_argument("Odd num gpus is not supported for now");
+  if (rank < 0 || rank >= world_size) throw std::invalid_argument("invalid rank passed in");
+  quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms();
+  fptr->init(world_size, rank, qr_max_size);
+  return (quickreduce::fptr_t)fptr;
+}
+
+void qr_destroy(quickreduce::fptr_t _fa) {
+  if (_fa) {
+    auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+    fa->destroy();
+    delete fa;
+  }
+}
+
+torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  hipIpcMemHandle_t handle = fa->get_handle();
+  auto options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto data_handle = torch::empty({static_cast<int64_t>(sizeof(hipIpcMemHandle_t))}, options);
+  std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t));
+  return data_handle;
+}
+
+void qr_open_handles(quickreduce::fptr_t _fa, const std::vector<torch::Tensor>& handles) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  std::vector<hipIpcMemHandle_t> ipc_handles;
+  ipc_handles.reserve(handles.size());
+  for (auto& handle : handles) {
+    // Ensure the tensor is on the same device as the current device.
+    hipIpcMemHandle_t ipc_handle;
+    std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t));
+    ipc_handles.push_back(ipc_handle);
+  }
+  fa->open_ipc_handles(ipc_handles);
+}
+
+void qr_all_reduce(
+    quickreduce::fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, int64_t quant_level, bool cast_bf2half) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
+  auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA();
+
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK_EQ(inp.numel(), out.numel());
+  TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize);
+  if (out.scalar_type() == at::ScalarType::Half) {
+    fa->allreduce<half, false>(
+        reinterpret_cast<half*>(inp.data_ptr()),
+        reinterpret_cast<half*>(out.data_ptr()),
+        out.numel(),
+        quant_level,
+        stream);
+  } else if (out.scalar_type() == at::ScalarType::BFloat16) {
+    if (cast_bf2half) {
+      fa->allreduce<half, true>(
+          reinterpret_cast<half*>(inp.data_ptr()),
+          reinterpret_cast<half*>(out.data_ptr()),
+          out.numel(),
+          quant_level,
+          stream);
+    } else {
+      fa->allreduce<quickreduce::nv_bfloat16, false>(
+          reinterpret_cast<quickreduce::nv_bfloat16*>(inp.data_ptr()),
+          reinterpret_cast<quickreduce::nv_bfloat16*>(out.data_ptr()),
+          out.numel(),
+          quant_level,
+          stream);
+    }
+  } else {
+    throw std::runtime_error("quick allreduce only supports float16 and bfloat16");
+  }
+}
+
+int64_t qr_max_size() {
+  // The default is 2GB (2,147,483,648 bytes)
+  return static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
+}
+
+#define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half)                      \
+  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 2>, cast_bf2half>; \
+  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 4>, cast_bf2half>; \
+  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 8>, cast_bf2half>;
+
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true)
+
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false)
+
+#endif  // USE_ROCM
diff --git a/sgl-kernel/csrc/allreduce/quick_all_reduce.cuh b/sgl-kernel/csrc/allreduce/quick_all_reduce.cuh
new file mode 100644
index 000000000..bd9e7b10f
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/quick_all_reduce.cuh
@@ -0,0 +1,633 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include "quick_all_reduce_base.h"
+
+namespace quickreduce {
+
+struct CodecBase {
+  const int thread;
+  const int rank;
+  const int group_leader;
+  __quickreduce_device_inline__ CodecBase(int thread, int rank)
+      : thread(thread), rank(rank), group_leader((threadIdx.x / kThreadGroupSize) * kThreadGroupSize) {
+    set_fp16_ovfl(true);
+  }
+};
+
+// Default full precision codec.
+template <typename T, int world_size>
+struct CodecFP : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+
+  // Codec tile size process by this workgroup.
+  // Each thread processes atoms of f16x8_t (16B).
+  static constexpr int kRankTransmittedTileSize = kBlockSize * kRankAtoms * sizeof(int32x4_t);
+  static_assert(kRankTransmittedTileSize % 16 == 0, "kRankTransmittedTileSize must be 16B aligned.");
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize = kRankTransmittedTileSize * kWorldSize;
+
+  __quickreduce_device_inline__ CodecFP(int thread, int rank) : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer, const int32x4_t* __restrict__ data) {
+    for (int i = 0; i < kRankAtoms; i++) {
+      __builtin_nontemporal_store(data[i], send_buffer + thread);
+      send_buffer += kAtomStride;
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer, int32x4_t* __restrict__ data) {
+    for (int i = 0; i < kRankAtoms; i++) {
+      data[i] = __builtin_nontemporal_load(*recv_buffer + thread);
+      *recv_buffer += kAtomStride;
+    }
+  }
+};
+
+// Int4 symmetric quantization codec.
+// We quantize the FP16 data to block-scaled Int4 in blocks of 4 *
+// kThreadGroupSize.
+template <typename T, int world_size>
+struct CodecQ4 : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+
+  // Codec tile size process by this workgroup.
+  // Each threads processes a fragment of fp16x8_t (16B),
+  // into a int4x8_t (4B) and a fp16 scale shared among 32 values.
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+  static constexpr int kRankTileStride = 1152;
+  static constexpr int kRankTileScaleOffset = 1024;
+  static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
+  static_assert(kRankTransmittedTileSize % 16 == 0, "kRankTransmittedTileSize must be 16B aligned.");
+
+  static constexpr int kRankBufferTileStride = kRankTileStride / sizeof(int32x4_t);
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize = kRankTransmittedTileSize * kWorldSize;
+
+  // Constants configuration
+
+  // {-1/8.0h, -1/8.0h}, f16x2_t
+  static constexpr int kScaleFactor = std::is_same<T, half>::value ? 0xB000B000 : 0xBE00BE00;
+
+  // {1e-7, 1e-7}, f16x2_t
+  static constexpr int kScaleEpsilon = std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
+
+  // {-8, -8}, f16x2_t
+  static constexpr int kRangeMin = std::is_same<T, half>::value ? 0xC800C800 : 0xC100C100;
+
+  // {+7, +7}, f16x2_t
+  static constexpr int kRangeMax = std::is_same<T, half>::value ? 0x47004700 : 0x40E040E0;
+
+  // {+8, +8}, int16x2_t
+  static constexpr int kRangeBias = 0x00080008;
+
+  __quickreduce_device_inline__ CodecQ4(int thread, int rank) : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer, const int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      int32x4_t const atom = data[k];
+
+      // Compute the absolute maximum of the atom in the thread group
+      // In 2 blocks of values, upper/lower halves of the f16x2_t
+      int wblockmax = group_abs_max<T>(atom);
+
+      // Derive scales
+      int decoding_scale;
+      int encoding_scale;
+      decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
+      encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
+      encoding_scale = packed_rcp<T>(encoding_scale);
+
+      // Apply scales to get quantized values
+      int32x4_t w;
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(atom[i], encoding_scale);
+        w[i] = packed_max<T>(w[i], kRangeMin);
+        w[i] = packed_min<T>(w[i], kRangeMax);
+      }
+
+      // Convert from f16x2_t to uint16x2_t
+      int32x4_t q;
+      {
+        int16_t* qi = reinterpret_cast<int16_t*>(&q);
+        T* wh = reinterpret_cast<T*>(&w);
+        for (int i = 0; i < 8; i++)
+          qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
+
+        for (int i = 0; i < 4; i++) {
+          q[i] = packed_add<int16_t>(q[i], kRangeBias);
+        }
+      }
+
+      // Pack 8 x q4 into int32_t
+      int qw = q[0] | (q[1] << 4) | (q[2] << 8) | (q[3] << 12);
+
+      // Write quantized atom to send_buffer
+      // note: only the group leader stores the scale
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
+      int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) + (thread / 8);
+
+      __builtin_nontemporal_store(qw, qw_ptr);
+      if (threadIdx.x == group_leader) {
+        __builtin_nontemporal_store(decoding_scale, qs_ptr);
+      }
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer, int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      // Directly read quantized atom from recv_buffer
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
+      int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) + (thread / 8);
+
+      int32_t qw = __builtin_nontemporal_load(qw_ptr);
+      int qs = __builtin_nontemporal_load(qs_ptr);
+
+      *recv_buffer += kRankBufferTileStride;
+
+      // Unpack q4 into f16x8_t
+      int32x4_t w;
+      {
+        static constexpr uint kMask000F = 0x000F000F;
+        static constexpr uint kHalf2_1024 = 0x64006400;  // {1024.0, 1024.0}, fp16x2_t
+        static uint constexpr kHalf2_1032 = 0xE408E408;  // {-1032.0, -1032.0}, fp16x2_t
+
+        for (int i = 0; i < 4; i++) {
+          if constexpr (std::is_same<T, half>::value) {
+            int32_t q4 = ((qw >> (i * 4)) & kMask000F) | kHalf2_1024;
+            w[i] = packed_add<half>(q4, kHalf2_1032);
+          } else {
+            int32_t int16_2 = (qw >> (i * 4)) & kMask000F;
+            int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
+            int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
+            nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
+            nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
+            nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
+            int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
+            w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
+          }
+        }
+      }
+
+      // Apply decoding scales
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(w[i], qs);
+      }
+
+      data[k] = w;
+    }
+  }
+};
+
+// Int6 symmetric quantization codec.
+// We quantize the FP16 data to block-scaled Int6 in blocks of 4 *
+// kThreadGroupSize.
+template <typename T, int world_size>
+struct CodecQ6 : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+
+  // Codec tile size process by this workgroup.
+  // Each threads processes a fragment of fp16x8_t (16B),
+  // into a int6x8_t (4B + 2B) and a fp16 scale shared among 32 values.
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+  static constexpr int kRankTileStride = 1664;
+  static constexpr int kRankTileQ2Offset = 1024;
+  static constexpr int kRankTileScaleOffset = 1536;
+  static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
+  static_assert(kRankTransmittedTileSize % 16 == 0, "kRankTransmittedTileSize must be 16B aligned.");
+
+  static constexpr int kRankBufferTileStride = kRankTileStride / sizeof(int32x4_t);
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize = kRankTransmittedTileSize * kWorldSize;
+
+  // Constants configuration
+
+  // {-1/32.0h, -1/32.0h}, fp16x2_t
+  static constexpr int kScaleFactor = std::is_same<T, half>::value ? 0xA800A800 : 0xBD00BD00;
+
+  // {1e-7, 1e-7}, fp16x2_t
+  static constexpr int kScaleEpsilon = std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
+
+  // {-32, -32}, fp16x2_t
+  static constexpr int kRangeMin = std::is_same<T, half>::value ? 0xD000D000 : 0xC200C200;
+
+  // {+31, +31}, fp16x2_t
+  static constexpr int kRangeMax = std::is_same<T, half>::value ? 0x4FC04FC0 : 0x41F841F8;
+
+  // {+32, +32}, int16x2_t
+  static constexpr int kRangeBias = 0x00200020;
+
+  __quickreduce_device_inline__ CodecQ6(int thread, int rank) : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer, const int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      int32x4_t const atom = data[k];
+
+      // Compute the absolute maximum of the atom in the thread group
+      // In 2 blocks of values, upper/lower halves of the f16x2_t
+      int wblockmax = group_abs_max<T>(atom);
+
+      // Derive scales
+      int decoding_scale;
+      int encoding_scale;
+      decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
+      encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
+      encoding_scale = packed_rcp<T>(encoding_scale);
+
+      // Apply scales to get quantized values
+      int32x4_t w;
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(atom[i], encoding_scale);
+        w[i] = packed_max<T>(w[i], kRangeMin);
+        w[i] = packed_min<T>(w[i], kRangeMax);
+      }
+
+      // Convert from f16x2_t to uint16x2_t
+      int32x4_t q;
+      {
+        int16_t* qi = reinterpret_cast<int16_t*>(&q);
+        T* wh = reinterpret_cast<T*>(&w);
+        for (int i = 0; i < 8; i++)
+          qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
+
+        for (int i = 0; i < 4; i++) {
+          q[i] = packed_add<int16_t>(q[i], kRangeBias);
+        }
+      }
+
+      // Pack 8 x q6 into int32_t + int16_t
+      uint32_t q4w;
+      uint16_t q2w = 0;
+      q4w = (q[0] & 0x000F000F) | ((q[1] & 0x000F000F) << 4) | ((q[2] & 0x000F000F) << 8) | ((q[3] & 0x000F000F) << 12);
+      {
+        int16_t* tw = reinterpret_cast<int16_t*>(&q);
+#pragma unroll
+        for (int i = 0; i < 8; i++) {
+          q2w |= (tw[i] >> 4) << (i * 2);
+        }
+      }
+      // Write quantized atom to send_buffer
+      // note: only the group leader stores the scale
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
+      uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
+      uint16_t* q2w_ptr = reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) + (thread / 8);
+
+      __builtin_nontemporal_store(q4w, q4w_ptr);
+      __builtin_nontemporal_store(q2w, q2w_ptr);
+      if (threadIdx.x == group_leader) {
+        __builtin_nontemporal_store(decoding_scale, qs_ptr);
+      }
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer, int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      // Directly read quantized atom from recv_buffer
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
+      uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
+      uint16_t* q2w_ptr = reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) + (thread / 8);
+
+      uint32_t q4w = __builtin_nontemporal_load(q4w_ptr);
+      uint16_t q2w = __builtin_nontemporal_load(q2w_ptr);
+      int qs = __builtin_nontemporal_load(qs_ptr);
+
+      *recv_buffer += kRankBufferTileStride;
+
+      // Unpack q6 into fp16x8_t
+      int32x4_t w;
+      {
+        static uint constexpr kMask000F = 0x000F000F;
+        static uint constexpr kHalf2_1024 = 0x64006400;  // {1024.0, 1024.0}, fp16x2_t
+        static uint constexpr kHalf2_1056 = 0xE420E420;  // {-1056.0, -1056.0}, fp16x2_t
+
+#pragma unroll
+        for (int i = 0; i < 4; i++) {
+          int32_t q4 = q4w & kMask000F;
+          int32_t q2 = (q2w & 0x3) | ((q2w & 0xC) << 14);
+          q4w >>= 4;
+          q2w >>= 4;
+          if constexpr (std::is_same<T, half>::value) {
+            int32_t q6 = q4 | (q2 << 4) | kHalf2_1024;
+            asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(w[i]) : "v"(q6), "v"(kHalf2_1056));
+          } else {
+            int32_t int16_2 = q4 | (q2 << 4);
+            int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
+            int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
+
+            nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
+            nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
+            nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
+            int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
+            w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
+          }
+        }
+      }
+
+      // Apply decoding scales
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(w[i], qs);
+      }
+
+      // That's pretty much it...
+      data[k] = w;
+    }
+  }
+};
+
+// Int8 symmetric quantization codec.
+// We quantize the FP16 data to block-scaled Int8 in blocks of 4 *
+// kThreadGroupSize.
+template <typename T, int world_size>
+struct CodecQ8 : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+
+  // Codec tile size process by this workgroup.
+  // Each threads processes a fragment of f16x8_t (16B),
+  // into a int8x8_t (8B) and a f16 scale shared among 32 values.
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+  static constexpr int kRankTileStride = 2176;
+  static constexpr int kRankTileScaleOffset = 2048;
+  static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
+  static_assert(kRankTransmittedTileSize % 16 == 0, "kRankTileSize must be 16B aligned.");
+
+  static constexpr int kRankBufferTileStride = kRankTileStride / sizeof(int32x4_t);
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize = kRankTransmittedTileSize * kWorldSize;
+
+  // Constants configuration
+
+  // {-1/128.0h, -1/128.0h}, f16x2_t
+  static constexpr int kScaleFactor = std::is_same<T, half>::value ? 0xA000A000 : 0xBC00BC00;
+
+  // {1e-7, 1e-7}, f16x2_t
+  static constexpr int kScaleEpsilon = std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
+
+  // {-128, -128}, f16x2_t
+  static constexpr int kRangeMin = std::is_same<T, half>::value ? 0xD800D800 : 0xC300C300;
+  // {+127, +127}, f16x2_t
+  static constexpr int kRangeMax = std::is_same<T, half>::value ? 0x57F057F0 : 0x42FE42FE;
+
+  // {+128, +128}, int16x2_t
+  static constexpr int kRangeBias = 0x00800080;
+
+  __quickreduce_device_inline__ CodecQ8(int thread, int rank) : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer, int32x4_t const* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      int32x4_t const atom = data[k];
+      // Compute the absolute maximum of the atom in the thread group
+      // In 2 blocks of values, upper/lower halves of the f16x2_t
+      int wblockmax = group_abs_max<T>(atom);
+
+      // Derive scales
+      int decoding_scale;
+      int encoding_scale;
+      decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
+      encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
+      encoding_scale = packed_rcp<T>(encoding_scale);
+
+      // Apply scales to get quantized values
+      int32x4_t w;
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(atom[i], encoding_scale);
+        w[i] = packed_max<T>(w[i], kRangeMin);
+        w[i] = packed_min<T>(w[i], kRangeMax);
+      }
+
+      // Convert from f16x2_t to uint16x2_t
+      int32x4_t q;
+      {
+        int16_t* qi = reinterpret_cast<int16_t*>(&q);
+        T* wh = reinterpret_cast<T*>(&w);
+        for (int i = 0; i < 8; i++)
+          qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
+
+        for (int i = 0; i < 4; i++) {
+          q[i] = packed_add<int16_t>(q[i], kRangeBias);
+        }
+      }
+
+      // Pack 8 x q8 into int32x2_t
+      int32x2_t qw;
+      qw[0] = q[0] | (q[1] << 8);
+      qw[1] = q[2] | (q[3] << 8);
+
+      // Write quantized atom to send_buffer
+      // note: only the group leader stores the scale
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
+      int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) + (thread / 8);
+
+      __builtin_nontemporal_store(qw, qw_ptr);
+      if (threadIdx.x == group_leader) {
+        __builtin_nontemporal_store(decoding_scale, qs_ptr);
+      }
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer, int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      // Directly read quantized atom from recv_buffer
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
+      int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) + (thread / 8);
+
+      int32x2_t qw = __builtin_nontemporal_load(qw_ptr);
+      int qs = __builtin_nontemporal_load(qs_ptr);
+
+      *recv_buffer += kRankBufferTileStride;
+
+      // Unpack q8 into fp16x8_t
+      int32x4_t w;
+      {
+        static uint constexpr kMask00FF = 0x00FF00FF;
+
+        // {1024.0, 1024.0}, fp16x2_t
+        static uint constexpr kHalf2_1024 = 0x64006400;
+
+        // {-1152.0, -1152.0}, fp16x2_t
+        static uint constexpr kHalf2_1152 = 0xE480E480;
+
+#pragma unroll
+        for (int i = 0; i < 4; i++) {
+          if constexpr (std::is_same<T, half>::value) {
+            int32_t q8 = ((qw[i / 2] >> ((i % 2) * 8)) & kMask00FF) | kHalf2_1024;
+            w[i] = packed_add<half>(q8, kHalf2_1152);
+          } else {
+            int32_t int16_2 = (qw[i / 2] >> ((i % 2) * 8)) & kMask00FF;
+            int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
+            int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
+            nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
+            nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
+            nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
+            int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
+            w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
+          }
+        }
+      }
+
+      // Apply decoding scales
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(w[i], qs);
+      }
+
+      data[k] = w;
+    }
+  }
+};
+
+// Twoshot All Reduce
+template <typename T, class Codec, bool cast_bf2half>
+struct AllReduceTwoshot {
+  static_assert(sizeof(T) == 2);
+
+  static constexpr int kWorldSize = Codec::kWorldSize;
+
+  __device__ static void
+  run(T const* __restrict__ input,
+      T* __restrict__ output,
+      uint32_t const N,                    // number of elements
+      int const block,                     // block index
+      int const rank,                      // rank index
+      uint8_t** __restrict__ buffer_list,  // communication buffers
+      uint32_t const data_offset,          // offset to start of the data buffer
+      uint32_t flag_color) {
+    // Topology
+    int thread = threadIdx.x + threadIdx.y * kWavefront;
+    uint8_t* rank_buffer = buffer_list[rank];
+    Codec codec(thread, rank);
+    int block_id = blockIdx.x;
+    int grid_size = gridDim.x;
+    // --------------------------------------------------------
+    // Read input into registers
+    int32x4_t tA[kAtoms];
+
+    BufferResource src_buffer(const_cast<T*>(input), N * sizeof(T));
+    uint32_t src_offset = block * kTileSize + thread * sizeof(int32x4_t);
+
+    for (int i = 0; i < kAtoms; i++) {
+      tA[i] = buffer_load_dwordx4(src_buffer.descriptor, src_offset, 0, 0);
+      src_offset += kAtomStride * sizeof(int32x4_t);
+      if constexpr (cast_bf2half) {
+        const nv_bfloat162* bf_buf = reinterpret_cast<const nv_bfloat162*>(&tA[i]);
+        half2 half_buf[4];
+#pragma unroll
+        for (int j = 0; j < 4; ++j) {
+          float2 f = __bfloat1622float2(bf_buf[j]);
+          half_buf[j] = __float22half2_rn(f);
+        }
+        tA[i] = *reinterpret_cast<const int32x4_t*>(half_buf);
+      }
+    }
+
+    // --------------------------------------------------------
+    // Phase-1A: Write segment data into the communication buffer of the target
+    // rank responsible for this segment.
+    uint32_t comm_data0_offset = data_offset + block_id * Codec::kTransmittedTileSize;
+    uint32_t comm_data1_offset = grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
+
+    uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
+    uint32_t comm_flags1_offset = grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
+
+    for (int r = 0; r < kWorldSize; r++) {
+      int32x4_t* send_buffer =
+          reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data0_offset + rank * Codec::kRankTransmittedTileSize);
+      codec.send(send_buffer, &tA[r * Codec::kRankAtoms]);
+    }
+
+    __syncthreads();
+    if (thread < kWorldSize) {
+      int r = thread;
+      uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(buffer_list[r] + comm_flags0_offset + rank * sizeof(uint32_t));
+      set_sync_flag(flag_ptr, flag_color);
+    }
+    // --------------------------------------------------------
+    // Phase-1B: Reduce the segment data from the communication buffers.
+    int32x4_t tR[Codec::kRankAtoms] = {};
+    {
+      // Read the data from the communication buffer.
+      int32x4_t* recv_buffer = reinterpret_cast<int32x4_t*>(rank_buffer + comm_data0_offset);
+      uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(rank_buffer + comm_flags0_offset);
+
+      for (int r = 0; r < kWorldSize; r++) {
+        // Wait for the flags to be set.
+        if (thread == 0) {
+          wait_sync_flag(&flag_ptr[r], flag_color);
+        }
+        __syncthreads();
+
+        // note: we reuse tA as temp buffer here
+        codec.recv(&recv_buffer, tA);
+
+        for (int i = 0; i < Codec::kRankAtoms; i++) {
+          packed_assign_add<T>(&tR[i], &tA[i]);
+        }
+      }
+    }
+
+    // Phase-2: Write the reduced segment to every other rank
+    for (int r = 0; r < kWorldSize; r++) {
+      int32x4_t* send_buffer =
+          reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data1_offset + rank * Codec::kRankTransmittedTileSize);
+      codec.send(send_buffer, tR);
+    }
+
+    __syncthreads();
+    if (thread < kWorldSize) {
+      int r = thread;
+      uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(buffer_list[r] + comm_flags1_offset + rank * sizeof(uint32_t));
+      set_sync_flag(flag_ptr, flag_color);
+    }
+
+    // Phase-2: Read the gather segments from the rank's communication buffer.
+    {
+      // Read the data from the communication buffer.
+      int32x4_t* recv_buffer = reinterpret_cast<int32x4_t*>(rank_buffer + comm_data1_offset);
+      uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(rank_buffer + comm_flags1_offset);
+
+      for (int r = 0; r < kWorldSize; r++) {
+        // Wait for the flags to be set.
+        if (thread == 0) {
+          wait_sync_flag(&flag_ptr[r], flag_color);
+        }
+        __syncthreads();
+
+        // Gather all reduced and final rank segments into tA.
+        codec.recv(&recv_buffer, &tA[r * Codec::kRankAtoms]);
+      }
+    }
+
+    // --------------------------------------------------------
+    // Write the result to output.
+    BufferResource dst_buffer(output, N * sizeof(T));
+    uint32_t dst_offset = block * kTileSize + thread * sizeof(int32x4_t);
+
+    for (int i = 0; i < kAtoms; i++) {
+      if constexpr (cast_bf2half) {
+        const half2* half_buf = reinterpret_cast<const half2*>(&tA[i]);
+        nv_bfloat162 bf16_buf[4];
+#pragma unroll
+        for (int j = 0; j < 4; ++j) {
+          float2 f = __half22float2(half_buf[j]);
+          bf16_buf[j] = __float22bfloat162_rn(f);
+        }
+        buffer_store_dwordx4(*reinterpret_cast<const int32x4_t*>(bf16_buf), dst_buffer.descriptor, dst_offset, 0, 0);
+      } else {
+        buffer_store_dwordx4(tA[i], dst_buffer.descriptor, dst_offset, 0, 0);
+      }
+      dst_offset += kAtomStride * sizeof(int32x4_t);
+    }
+  }
+};
+
+}  // namespace quickreduce
diff --git a/sgl-kernel/csrc/allreduce/quick_all_reduce.h b/sgl-kernel/csrc/allreduce/quick_all_reduce.h
new file mode 100644
index 000000000..1d629e018
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/quick_all_reduce.h
@@ -0,0 +1,233 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include <vector>
+
+#include "quick_all_reduce.cuh"
+
+#define HIP_CHECK(err)                                                                               \
+  do {                                                                                               \
+    hipError_t err_ = (err);                                                                         \
+    if (err_ != hipSuccess) {                                                                        \
+      std::printf("HIP error %d at %s:%d. %s\n", err_, __FILE__, __LINE__, hipGetErrorString(err_)); \
+      throw std::runtime_error("HIP error");                                                         \
+    }                                                                                                \
+  } while (0)
+
+namespace quickreduce {
+using fptr_t = int64_t;
+static_assert(sizeof(void*) == sizeof(fptr_t));
+
+template <typename AllReduceKernel, typename T>
+__global__ __quickreduce_launch_bounds_two_shot__ static void allreduce_prototype_twoshot(
+    T const* A,
+    T* B,
+    uint32_t N,
+    uint32_t num_blocks,
+    int rank,
+    uint8_t** dbuffer_list,
+    uint32_t data_offset,
+    uint32_t flag_color) {
+  int block = blockIdx.x;
+  int grid = gridDim.x;
+
+  while (block < num_blocks) {
+    AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset, flag_color);
+    block += grid;
+    flag_color++;
+  }
+}
+
+#define TWOSHOT_DISPATCH(__codec)                                         \
+  if (world_size == 2) {                                                  \
+    using LineCodec = __codec<T, 2>;                                      \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
+    hipLaunchKernelGGL(                                                   \
+        (allreduce_prototype_twoshot<AllReduceKernel, T>),                \
+        dim3(grid),                                                       \
+        dim3(kBlockTwoShot),                                              \
+        0,                                                                \
+        stream,                                                           \
+        A,                                                                \
+        B,                                                                \
+        N,                                                                \
+        num_blocks,                                                       \
+        rank,                                                             \
+        dbuffer_list,                                                     \
+        data_offset,                                                      \
+        flag_color);                                                      \
+  } else if (world_size == 4) {                                           \
+    using LineCodec = __codec<T, 4>;                                      \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
+    hipLaunchKernelGGL(                                                   \
+        (allreduce_prototype_twoshot<AllReduceKernel, T>),                \
+        dim3(grid),                                                       \
+        dim3(kBlockTwoShot),                                              \
+        0,                                                                \
+        stream,                                                           \
+        A,                                                                \
+        B,                                                                \
+        N,                                                                \
+        num_blocks,                                                       \
+        rank,                                                             \
+        dbuffer_list,                                                     \
+        data_offset,                                                      \
+        flag_color);                                                      \
+  } else if (world_size == 8) {                                           \
+    using LineCodec = __codec<T, 8>;                                      \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
+    hipLaunchKernelGGL(                                                   \
+        (allreduce_prototype_twoshot<AllReduceKernel, T>),                \
+        dim3(grid),                                                       \
+        dim3(kBlockTwoShot),                                              \
+        0,                                                                \
+        stream,                                                           \
+        A,                                                                \
+        B,                                                                \
+        N,                                                                \
+        num_blocks,                                                       \
+        rank,                                                             \
+        dbuffer_list,                                                     \
+        data_offset,                                                      \
+        flag_color);                                                      \
+  }
+
+enum QuickReduceQuantLevel {
+  F16 = 0,
+  INT8 = 1,
+  INT6 = 2,
+  INT4 = 3,
+};
+
+struct DeviceComms {
+  // Max problem size is 2GB (in bytes) or half of uint32_t max value.
+  int64_t kMaxProblemSize = static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
+
+  // Max TP-8
+  static int constexpr kMaxWorldSize = 8;
+
+  bool initialized = false;
+  uint32_t flag_color = 1;
+  int world_size;
+  int rank;
+
+  uint8_t* dbuffer;
+  uint8_t** dbuffer_list;
+  hipIpcMemHandle_t buffer_ipc_handle;
+  std::vector<hipIpcMemHandle_t> all_buffer_ipc_handles;
+  std::vector<uint8_t*> buffer_list;
+  uint32_t data_offset;
+
+  DeviceComms() : initialized(false), world_size(1), rank(0) {}
+  ~DeviceComms() {
+    destroy();
+  }
+
+  void init(int world_size, int rank, std::optional<int64_t> max_problem_size = std::nullopt) {
+    destroy();
+    this->world_size = world_size;
+    this->rank = rank;
+    if (max_problem_size.has_value() && max_problem_size.value() > 0) {
+      this->kMaxProblemSize = max_problem_size.value();
+    }
+    // Allocate buffer size for worst case: F16 2-stage buffer.
+    uint32_t flags_buffer_size = 2 * world_size * kMaxNumBlocks * sizeof(uint32_t);
+    static int64_t data_buffer_size = 2 * this->kMaxProblemSize;
+    int64_t total_buffer_size = flags_buffer_size + data_buffer_size;
+    data_offset = flags_buffer_size;
+    HIP_CHECK(hipExtMallocWithFlags((void**)&dbuffer, total_buffer_size, hipDeviceMallocUncached));
+
+    // Clear the flags buffer.
+    HIP_CHECK(hipMemset(dbuffer, 0, flags_buffer_size));
+
+    // Device-side list of IPC buffers.
+    buffer_list.resize(world_size);
+    HIP_CHECK(hipMalloc(&dbuffer_list, world_size * sizeof(uint8_t*)));
+
+    // Create IPC handles for rank's communication buffer.
+    all_buffer_ipc_handles.resize(world_size);
+    HIP_CHECK(hipIpcGetMemHandle(&buffer_ipc_handle, dbuffer));
+
+    initialized = true;
+  }
+  int get_world_size() {
+    return world_size;
+  }
+  int get_rank() {
+    return rank;
+  }
+  bool status() {
+    return initialized;
+  }
+  hipIpcMemHandle_t const get_handle() {
+    return buffer_ipc_handle;
+  }
+
+  void destroy() {
+    if (initialized) {
+      for (int i = 0; i < world_size; i++) {
+        if (i != rank) {
+          HIP_CHECK(hipIpcCloseMemHandle(dbuffer_list[i]));
+        }
+      }
+
+      HIP_CHECK(hipFree(dbuffer));
+      HIP_CHECK(hipFree(dbuffer_list));
+
+      initialized = false;
+    }
+  }
+
+  void open_ipc_handles(std::vector<hipIpcMemHandle_t> const& ipc_handles) {
+    assert(ipc_handles.size() == all_buffer_ipc_handles.size());
+    for (int i = 0; i < world_size; i++) {
+      all_buffer_ipc_handles[i] = ipc_handles[i];
+    }
+
+    // Open device memory access to the IPC communication buffers.
+    // Note: For our own rank, we do not need to open a handle.
+    for (int i = 0; i < world_size; i++) {
+      if (i != rank) {
+        HIP_CHECK(
+            hipIpcOpenMemHandle((void**)&buffer_list[i], all_buffer_ipc_handles[i], hipIpcMemLazyEnablePeerAccess));
+      } else {
+        buffer_list[i] = dbuffer;
+      }
+    }
+
+    HIP_CHECK(hipMemcpy(dbuffer_list, buffer_list.data(), world_size * sizeof(uint8_t*), hipMemcpyHostToDevice));
+  }
+
+  template <typename T, bool cast_bf2half>
+  void allreduce(T const* A, T* B, uint32_t N, int quant_level, hipStream_t stream) {
+    if (world_size != 2 && world_size != 4 && world_size != 8) {
+      throw std::runtime_error("All Reduce not supported for world_size = " + std::to_string(world_size));
+    }
+
+    // Configuration.
+    uint32_t msg_size = N * sizeof(T);
+    uint32_t num_blocks = divceil(msg_size, kTileSize);
+    uint32_t grid = min(kMaxNumBlocks, num_blocks);
+    auto quant_level_ = static_cast<QuickReduceQuantLevel>(quant_level);
+    switch (quant_level_) {
+      case QuickReduceQuantLevel::INT8:
+        TWOSHOT_DISPATCH(CodecQ8)
+        break;
+      case QuickReduceQuantLevel::INT6:
+        TWOSHOT_DISPATCH(CodecQ6)
+        break;
+      case QuickReduceQuantLevel::INT4:
+        TWOSHOT_DISPATCH(CodecQ4)
+        break;
+      default:
+        TWOSHOT_DISPATCH(CodecFP)
+        break;
+    }
+    HIP_CHECK(cudaGetLastError());
+    // Rotate the flag color.
+    flag_color += divceil(N, grid);
+  }
+};
+
+}  // namespace quickreduce
diff --git a/sgl-kernel/csrc/allreduce/quick_all_reduce.hip b/sgl-kernel/csrc/allreduce/quick_all_reduce.hip
new file mode 100644
index 000000000..9934383e1
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/quick_all_reduce.hip
@@ -0,0 +1,113 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include <ATen/hip/Exceptions.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <torch/all.h>
+
+#ifdef USE_ROCM
+
+#include "quick_all_reduce_hip.h"
+
+quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size, std::optional<int64_t> qr_max_size) {
+  if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported");
+  if (world_size == 6) throw std::invalid_argument("world size == 6 is not supported");
+  if (world_size % 2 != 0) throw std::invalid_argument("Odd num gpus is not supported for now");
+  if (rank < 0 || rank >= world_size) throw std::invalid_argument("invalid rank passed in");
+  quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms();
+  fptr->init(world_size, rank, qr_max_size);
+  return (quickreduce::fptr_t)fptr;
+}
+
+void qr_destroy(quickreduce::fptr_t _fa) {
+  if (_fa) {
+    auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+    fa->destroy();
+    delete fa;
+  }
+}
+
+torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  hipIpcMemHandle_t handle = fa->get_handle();
+  auto options = torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto data_handle = torch::empty({static_cast<int64_t>(sizeof(hipIpcMemHandle_t))}, options);
+  std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t));
+  return data_handle;
+}
+
+void qr_open_handles(quickreduce::fptr_t _fa, const std::vector<torch::Tensor>& handles) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  std::vector<hipIpcMemHandle_t> ipc_handles;
+  ipc_handles.reserve(handles.size());
+  for (auto& handle : handles) {
+    // Ensure the tensor is on the same device as the current device.
+    hipIpcMemHandle_t ipc_handle;
+    std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t));
+    ipc_handles.push_back(ipc_handle);
+  }
+  fa->open_ipc_handles(ipc_handles);
+}
+
+void qr_all_reduce(
+    quickreduce::fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, int64_t quant_level, bool cast_bf2half) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(inp));
+  auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA();
+
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK_EQ(inp.numel(), out.numel());
+  TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize);
+  if (out.scalar_type() == at::ScalarType::Half) {
+    fa->allreduce<half, false>(
+        reinterpret_cast<half*>(inp.data_ptr()),
+        reinterpret_cast<half*>(out.data_ptr()),
+        out.numel(),
+        quant_level,
+        stream);
+  } else if (out.scalar_type() == at::ScalarType::BFloat16) {
+    if (cast_bf2half) {
+      fa->allreduce<half, true>(
+          reinterpret_cast<half*>(inp.data_ptr()),
+          reinterpret_cast<half*>(out.data_ptr()),
+          out.numel(),
+          quant_level,
+          stream);
+    } else {
+      fa->allreduce<quickreduce::nv_bfloat16, false>(
+          reinterpret_cast<quickreduce::nv_bfloat16*>(inp.data_ptr()),
+          reinterpret_cast<quickreduce::nv_bfloat16*>(out.data_ptr()),
+          out.numel(),
+          quant_level,
+          stream);
+    }
+  } else {
+    throw std::runtime_error("quick allreduce only supports float16 and bfloat16");
+  }
+}
+
+int64_t qr_max_size() {
+  // The default is 2GB (2,147,483,648 bytes)
+  return static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
+}
+
+#define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half)                      \
+  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 2>, cast_bf2half>; \
+  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 4>, cast_bf2half>; \
+  template struct quickreduce::AllReduceTwoshot<T, Codec<T, 8>, cast_bf2half>;
+
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true)
+
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false)
+
+#endif  // USE_ROCM
diff --git a/sgl-kernel/csrc/allreduce/quick_all_reduce_base.h b/sgl-kernel/csrc/allreduce/quick_all_reduce_base.h
new file mode 100644
index 000000000..759b28f38
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/quick_all_reduce_base.h
@@ -0,0 +1,318 @@
+#pragma once
+
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+#include <cstdint>
+
+#define __quickreduce_device_inline__ __device__ __forceinline__
+#define __quickreduce_launch_bounds_two_shot__ __launch_bounds__(256, 4)
+#define __quickreduce_launch_bounds_one_shot__ __launch_bounds__(512, 4)
+
+namespace quickreduce {
+
+typedef __hip_bfloat16 nv_bfloat16;
+typedef __hip_bfloat162 nv_bfloat162;
+
+using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
+using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+
+// Setup acquire-release semantics for vector memory reads (mubuf instruction)
+// as per architecture.
+#if defined(__gfx942__)
+// CDNA3: Scope bits sc0, sc1
+#define MUBUF_ACQUIRE 16
+#define MUBUF_RELEASE 16
+#elif (defined(__gfx908__) || defined(__gfx90a__))
+// CDNA1 and CDNA2 - glc bit
+#define MUBUF_ACQUIRE 1
+#define MUBUF_RELEASE 0
+#endif
+
+static constexpr int kNegOne = 0xBC00BC00;  // {-1, -1}, fp16x2_t
+
+// Number of atoms (4xf16x2_t) processed by a single thread
+static constexpr int kAtoms = 8;
+
+// We use a workgroup of 256 threads
+static constexpr int kBlockSize = 256;
+static constexpr int kAtomStride = kBlockSize;
+
+// Size and atom stride of source/destination data that the block will
+// process.
+// Workgroup scope = Tile = (256 threads x 8 atoms x 16B)
+static constexpr int kTileSize = kBlockSize * kAtoms * sizeof(int32x4_t);
+
+// Max number of blocks. 304 CUs on MI300
+static constexpr int kMaxNumBlocks = 304 * 4;
+
+// Standard CDNA wavefront size.
+static constexpr int kWavefront = 64;
+
+// 256 thread, 4 wavefronts.
+static dim3 constexpr kBlockTwoShot = {kWavefront, kBlockSize / kWavefront, 1};
+
+// Number of threads in a group for quantization
+// It corresponds to 32 F16 elements in quantization block
+static constexpr int kThreadGroupSize = 8;
+
+// Methods
+__quickreduce_device_inline__ __host__ unsigned long divceil(unsigned long x, unsigned long y) {
+  return ((x + y - 1) / y);
+}
+
+union BufferResource {
+  __quickreduce_device_inline__ constexpr BufferResource() : config(0x00020000U) {}
+
+  __quickreduce_device_inline__ constexpr BufferResource(void* buffer_address, uint32_t buffer_size)
+      : address(buffer_address), range(buffer_size), config(0x00020000U) {}
+
+  int32x4_t descriptor;
+  struct {
+    void* address;  // 8B, out of which first 48b is address, and 16b is stride
+    // (unused)
+    uint32_t range;   // Byte range for the buffer resource
+    uint32_t config;  // Constant, DFMT=32b
+  };
+};
+
+__quickreduce_device_inline__ static int32x4_t buffer_load_dwordx4(
+    int32x4_t srsrc, int32_t voffset, int32_t soffset, int32_t aux) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
+
+__quickreduce_device_inline__ static void
+buffer_store_dwordx4(int32x4_t data, int32x4_t srsrc, int32_t voffset, int32_t soffset, int32_t aux) __asm(
+    "llvm.amdgcn.raw.buffer.store.v4i32");
+
+__quickreduce_device_inline__ static void set_fp16_ovfl(bool const value) {
+#if defined(__gfx942__)
+  if (value) {
+    asm volatile("s_setreg_imm32_b32 0xdc1, 1;" ::);
+  } else {
+    asm volatile("s_setreg_imm32_b32 0xdc1, 0;" ::);
+  }
+#endif
+}
+union bf162_int_union {
+  int i;
+  nv_bfloat162 bf2;
+};
+
+template <typename T>
+__quickreduce_device_inline__ void packed_assign_add(int32x4_t* A, int32x4_t* B);
+
+template <>
+__quickreduce_device_inline__ void packed_assign_add<half>(int32x4_t* A, int32x4_t* B) {
+  int32x4_t& tR_fragment = A[0];
+  int32x4_t& tA_fragment = B[0];
+
+  asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(tR_fragment[0]) : "v"(tR_fragment[0]), "v"(tA_fragment[0]));
+  asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(tR_fragment[1]) : "v"(tR_fragment[1]), "v"(tA_fragment[1]));
+  asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(tR_fragment[2]) : "v"(tR_fragment[2]), "v"(tA_fragment[2]));
+  asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(tR_fragment[3]) : "v"(tR_fragment[3]), "v"(tA_fragment[3]));
+}
+
+template <>
+__quickreduce_device_inline__ void packed_assign_add<nv_bfloat16>(int32x4_t* A, int32x4_t* B) {
+  nv_bfloat162* tA = reinterpret_cast<nv_bfloat162*>(A);
+  nv_bfloat162* tB = reinterpret_cast<nv_bfloat162*>(B);
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    tA[i] = __hadd2(tA[i], tB[i]);
+  }
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_max(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_max<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_max_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_max<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hmax2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_min(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_min<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_min_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_min<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hmin2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_abs_max(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_abs_max<half>(int a, int b) {
+  half2 wmaxh2 = __builtin_bit_cast(half2, a);
+  half2 wminh2 = __builtin_bit_cast(half2, b);
+  half2 wblockmaxh2;
+
+  wblockmaxh2.x = __hgt(__habs(wmaxh2.x), __habs(wminh2.x)) ? wmaxh2.x : wminh2.x;
+  wblockmaxh2.y = __hgt(__habs(wmaxh2.y), __habs(wminh2.y)) ? wmaxh2.y : wminh2.y;
+  return __builtin_bit_cast(int, wblockmaxh2);
+}
+
+template <>
+__quickreduce_device_inline__ int packed_abs_max<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2.x = __hgt(__habs(A.bf2.x), __habs(B.bf2.x)) ? A.bf2.x : B.bf2.x;
+  R.bf2.y = __hgt(__habs(A.bf2.y), __habs(B.bf2.y)) ? A.bf2.y : B.bf2.y;
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_add(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_add<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_add<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hadd2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_add<int16_t>(int a, int b) {
+  int result;
+  asm volatile("v_pk_add_i16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_sub(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_sub<half>(int a, int b) {
+  int result;
+
+  // MI300 lacks packed fp16 sub instruction. So we do -1 * min + max
+  asm volatile("v_pk_fma_f16 %0, %1, %2 %3" : "=v"(result) : "v"(kNegOne), "v"(b), "v"(a));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_sub<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hsub2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_mul(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_mul<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_mul<nv_bfloat16>(int a, int b) {
+  nv_bfloat162* tA = reinterpret_cast<nv_bfloat162*>(&a);
+  nv_bfloat162* tB = reinterpret_cast<nv_bfloat162*>(&b);
+  nv_bfloat162 tR = __hmul2(*tA, *tB);
+  return *(reinterpret_cast<int*>(&tR));
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_rcp(int a);
+
+template <>
+__quickreduce_device_inline__ int packed_rcp<half>(int a) {
+  return __builtin_bit_cast(int, h2rcp(__builtin_bit_cast(half2, a)));
+}
+
+template <>
+__quickreduce_device_inline__ int packed_rcp<nv_bfloat16>(int a) {
+  bf162_int_union A, R;
+  A.i = a;
+  R.bf2 = h2rcp(A.bf2);
+  return R.i;
+}
+
+// changes dtype
+__quickreduce_device_inline__ float T2float_cast(half a) {
+  return __half2float(a);
+}
+
+__quickreduce_device_inline__ float T2float_cast(nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+
+template <typename T>
+__quickreduce_device_inline__ int group_abs_max(int32x4_t atom) {
+  const int group_leader = (threadIdx.x / kThreadGroupSize) * kThreadGroupSize;
+
+  int wmax, wmin, wblockmax;
+  int a, b;
+  a = packed_max<T>(atom[0], atom[1]);
+  b = packed_max<T>(atom[2], atom[3]);
+
+  wmax = packed_max<T>(a, b);
+
+  a = packed_min<T>(atom[0], atom[1]);
+  b = packed_min<T>(atom[2], atom[3]);
+
+  wmin = packed_min<T>(a, b);
+
+  // Reduce the max among a group of threads
+  // Note: This is basically 2 blocks of values setup as the
+  // upper/lower halves of the f16x2_t
+  for (int i = 1; i < kThreadGroupSize; i <<= 1) {
+    int x = __shfl_down(wmax, i);
+    wmax = packed_max<T>(wmax, x);
+
+    int y = __shfl_down(wmin, i);
+    wmin = packed_min<T>(wmin, y);
+  }
+  wblockmax = packed_abs_max<T>(wmax, wmin);
+  // Share with the cohort
+  wblockmax = __shfl(wblockmax, group_leader);
+  return wblockmax;
+}
+
+__quickreduce_device_inline__ void set_sync_flag(uint32_t* flag_ptr, uint32_t flag) {
+  __atomic_store_n(flag_ptr, flag, __ATOMIC_RELEASE);
+}
+
+__quickreduce_device_inline__ void wait_sync_flag(uint32_t* flag_ptr, uint32_t flag) {
+  while (__atomic_load_n(flag_ptr, __ATOMIC_RELAXED) != flag) {
+  }
+}
+
+}  // namespace quickreduce
diff --git a/sgl-kernel/csrc/allreduce/quick_all_reduce_hip.h b/sgl-kernel/csrc/allreduce/quick_all_reduce_hip.h
new file mode 100644
index 000000000..a0a5e3e54
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/quick_all_reduce_hip.h
@@ -0,0 +1,235 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include <vector>
+
+#include "quick_all_reduce.cuh"
+
+#define HIP_CHECK(err)                                                                               \
+  do {                                                                                               \
+    hipError_t err_ = (err);                                                                         \
+    if (err_ != hipSuccess) {                                                                        \
+      std::printf("HIP error %d at %s:%d. %s\n", err_, __FILE__, __LINE__, hipGetErrorString(err_)); \
+      throw std::runtime_error("HIP error");                                                         \
+    }                                                                                                \
+  } while (0)
+
+namespace quickreduce {
+using fptr_t = int64_t;
+static_assert(sizeof(void*) == sizeof(fptr_t));
+
+template <typename AllReduceKernel, typename T>
+__global__ __quickreduce_launch_bounds_two_shot__ static void allreduce_prototype_twoshot(
+    T const* A,
+    T* B,
+    uint32_t N,
+    uint32_t num_blocks,
+    int rank,
+    uint8_t** dbuffer_list,
+    uint32_t data_offset,
+    uint32_t flag_color) {
+  int block = blockIdx.x;
+  int grid = gridDim.x;
+
+  while (block < num_blocks) {
+    AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset, flag_color);
+    block += grid;
+    flag_color++;
+  }
+}
+
+#define TWOSHOT_DISPATCH(__codec)                                         \
+  if (world_size == 2) {                                                  \
+    using LineCodec = __codec<T, 2>;                                      \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
+    hipLaunchKernelGGL(                                                   \
+        (allreduce_prototype_twoshot<AllReduceKernel, T>),                \
+        dim3(grid),                                                       \
+        dim3(kBlockTwoShot),                                              \
+        0,                                                                \
+        stream,                                                           \
+        A,                                                                \
+        B,                                                                \
+        N,                                                                \
+        num_blocks,                                                       \
+        rank,                                                             \
+        dbuffer_list,                                                     \
+        data_offset,                                                      \
+        flag_color);                                                      \
+  } else if (world_size == 4) {                                           \
+    using LineCodec = __codec<T, 4>;                                      \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
+    hipLaunchKernelGGL(                                                   \
+        (allreduce_prototype_twoshot<AllReduceKernel, T>),                \
+        dim3(grid),                                                       \
+        dim3(kBlockTwoShot),                                              \
+        0,                                                                \
+        stream,                                                           \
+        A,                                                                \
+        B,                                                                \
+        N,                                                                \
+        num_blocks,                                                       \
+        rank,                                                             \
+        dbuffer_list,                                                     \
+        data_offset,                                                      \
+        flag_color);                                                      \
+  } else if (world_size == 8) {                                           \
+    using LineCodec = __codec<T, 8>;                                      \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>; \
+    hipLaunchKernelGGL(                                                   \
+        (allreduce_prototype_twoshot<AllReduceKernel, T>),                \
+        dim3(grid),                                                       \
+        dim3(kBlockTwoShot),                                              \
+        0,                                                                \
+        stream,                                                           \
+        A,                                                                \
+        B,                                                                \
+        N,                                                                \
+        num_blocks,                                                       \
+        rank,                                                             \
+        dbuffer_list,                                                     \
+        data_offset,                                                      \
+        flag_color);                                                      \
+  }
+
+enum QuickReduceQuantLevel {
+  F16 = 0,
+  INT8 = 1,
+  INT6 = 2,
+  INT4 = 3,
+};
+
+struct DeviceComms {
+  // Max problem size is 2GB (in bytes) or half of uint32_t max value.
+  int64_t kMaxProblemSize = static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
+
+  // Max TP-8
+  static int constexpr kMaxWorldSize = 8;
+
+  bool initialized = false;
+  uint32_t flag_color = 1;
+  int world_size;
+  int rank;
+
+  uint8_t* dbuffer;
+  uint8_t** dbuffer_list;
+  hipIpcMemHandle_t buffer_ipc_handle;
+  std::vector<hipIpcMemHandle_t> all_buffer_ipc_handles;
+  std::vector<uint8_t*> buffer_list;
+  uint32_t data_offset;
+
+  DeviceComms() : initialized(false), world_size(1), rank(0) {}
+  ~DeviceComms() {
+    destroy();
+  }
+
+  void init(int world_size, int rank, std::optional<int64_t> max_problem_size = std::nullopt) {
+    destroy();
+    this->world_size = world_size;
+    this->rank = rank;
+    if (max_problem_size.has_value() && max_problem_size.value() > 0) {
+      this->kMaxProblemSize = max_problem_size.value();
+    }
+    // Allocate buffer size for worst case: F16 2-stage buffer.
+    uint32_t flags_buffer_size = 2 * world_size * kMaxNumBlocks * sizeof(uint32_t);
+    static int64_t data_buffer_size = 2 * this->kMaxProblemSize;
+    int64_t total_buffer_size = flags_buffer_size + data_buffer_size;
+    data_offset = flags_buffer_size;
+    HIP_CHECK(hipExtMallocWithFlags((void**)&dbuffer, total_buffer_size, hipDeviceMallocUncached));
+
+    // Clear the flags buffer.
+    HIP_CHECK(hipMemset(dbuffer, 0, flags_buffer_size));
+
+    // Device-side list of IPC buffers.
+    buffer_list.resize(world_size);
+    HIP_CHECK(hipMalloc(&dbuffer_list, world_size * sizeof(uint8_t*)));
+
+    // Create IPC handles for rank's communication buffer.
+    all_buffer_ipc_handles.resize(world_size);
+    HIP_CHECK(hipIpcGetMemHandle(&buffer_ipc_handle, dbuffer));
+
+    initialized = true;
+  }
+  int get_world_size() {
+    return world_size;
+  }
+  int get_rank() {
+    return rank;
+  }
+  bool status() {
+    return initialized;
+  }
+  hipIpcMemHandle_t const get_handle() {
+    return buffer_ipc_handle;
+  }
+
+  void destroy() {
+    if (initialized) {
+      for (int i = 0; i < world_size; i++) {
+        if (i != rank) {
+          HIP_CHECK(hipIpcCloseMemHandle(dbuffer_list[i]));
+        }
+      }
+
+      HIP_CHECK(hipFree(dbuffer));
+      HIP_CHECK(hipFree(dbuffer_list));
+
+      initialized = false;
+    }
+  }
+
+  void open_ipc_handles(std::vector<hipIpcMemHandle_t> const& ipc_handles) {
+    assert(ipc_handles.size() == all_buffer_ipc_handles.size());
+    for (int i = 0; i < world_size; i++) {
+      all_buffer_ipc_handles[i] = ipc_handles[i];
+    }
+
+    // Open device memory access to the IPC communication buffers.
+    // Note: For our own rank, we do not need to open a handle.
+    for (int i = 0; i < world_size; i++) {
+      if (i != rank) {
+        HIP_CHECK(
+            hipIpcOpenMemHandle((void**)&buffer_list[i], all_buffer_ipc_handles[i], hipIpcMemLazyEnablePeerAccess));
+      } else {
+        buffer_list[i] = dbuffer;
+      }
+    }
+
+    HIP_CHECK(hipMemcpy(dbuffer_list, buffer_list.data(), world_size * sizeof(uint8_t*), hipMemcpyHostToDevice));
+  }
+
+  template <typename T, bool cast_bf2half>
+  void allreduce(T const* A, T* B, uint32_t N, int quant_level, hipStream_t stream) {
+    if (world_size != 2 && world_size != 4 && world_size != 8) {
+      throw std::runtime_error("All Reduce not supported for world_size = " + std::to_string(world_size));
+    }
+
+    // Configuration.
+    uint32_t msg_size = N * sizeof(T);
+    uint32_t num_blocks = divceil(msg_size, kTileSize);
+    uint32_t grid = min(kMaxNumBlocks, num_blocks);
+    auto quant_level_ = static_cast<QuickReduceQuantLevel>(quant_level);
+    switch (quant_level_) {
+      case QuickReduceQuantLevel::INT8:
+        TWOSHOT_DISPATCH(CodecQ8)
+        break;
+      case QuickReduceQuantLevel::INT6:
+        TWOSHOT_DISPATCH(CodecQ6)
+        break;
+      case QuickReduceQuantLevel::INT4:
+        TWOSHOT_DISPATCH(CodecQ4)
+        break;
+      default:
+        TWOSHOT_DISPATCH(CodecFP)
+        break;
+    }
+    HIP_CHECK(hipGetLastError());
+    // Rotate the flag color.
+    flag_color += divceil(N, grid);
+  }
+};
+
+}  // namespace quickreduce
diff --git a/sgl-kernel/csrc/allreduce/test_mscclpp_allreduce.cu b/sgl-kernel/csrc/allreduce/test_mscclpp_allreduce.cu
new file mode 100644
index 000000000..4ca0c5739
--- /dev/null
+++ b/sgl-kernel/csrc/allreduce/test_mscclpp_allreduce.cu
@@ -0,0 +1,153 @@
+/*
+ * this file is used to test mscclpp_allreduce.cu using mpirun
+ * this file is adapted from https://github.com/flashinfer-ai/flashinfer/blob/v0.2.5/src/test_sum_all_reduce.cu
+usage:
+cd PATH-TO-THIS-FILE
+export MPI_HOME=/usr/local/mpi
+# export MPI_HOME=/opt/hpcx/ompi/
+export MSCCLPP_HOME=/workspace/test/mscclpp
+nvcc -O2 -arch=native -std=c++17 test_mscclpp_allreduce.cu \
+  -o test_mscclpp_allreduce -D_GLIBCXX_USE_CXX11_ABI=0 \
+  -I${MSCCLPP_HOME}/include -L${MSCCLPP_HOME}/build -lmscclpp \
+  -lnccl -I${MPI_HOME}/include -L${MPI_HOME}/lib -lmpi
+
+/opt/hpcx/ompi/bin/
+mpirun --allow-run-as-root -H 127.0.0.1:8 -np 8 \
+  --map-by ppr:8:node \
+  --mca btl_openib_warn_no_device_params_found 0 \
+  --mca btl_tcp_if_include bond0 \
+  --allow-run-as-root -np 8 \
+  -x NCCL_RUNTIME_CONNECT=0 -x NCCL_IB_GID_INDEX=3 -x NCCL_DEBUG=WARN \
+  -x LD_PRELOAD=${MSCCLPP_HOME}/build/libmscclpp.so ./test_mscclpp_allreduce
+ */
+#include <mpi.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#ifndef CHECK_CUDA_SUCCESS
+#define CHECK_CUDA_SUCCESS(cmd)                                                             \
+  do {                                                                                      \
+    cudaError_t e = cmd;                                                                    \
+    if (e != cudaSuccess) {                                                                 \
+      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
+      exit(EXIT_FAILURE);                                                                   \
+    }                                                                                       \
+  } while (0)
+#endif
+
+#include <cstdint>
+
+#include "mscclpp_allreduce.cuh"
+
+template <typename T>
+bool isclose(T a, T b, float rtol = 1e-5, float atol = 1e-8) {
+  return fabs(a - b) <= (atol + rtol * fabs(b));
+}
+
+int main(int argc, char* argv[]) {
+  // init mpi
+  MPI_Init(&argc, &argv);
+  printf("MPI Initialized.\n");
+  int nranks, rank;
+
+  // get work size and rank id
+  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  cudaSetDevice(rank);
+  printf("nranks: %d, rank: %d\n", nranks, rank);
+
+  // init host and device buffers
+  using T = float;
+  using ReduceT = float;
+  const size_t num_elems = 2 * 1024 * 1024;
+  std::vector<T> host_buf(num_elems);
+  for (uint32_t i = 0; i < num_elems; ++i) {
+    host_buf[i] = T(i + rank);
+  }
+  thrust::device_vector<T> device_buf(host_buf);
+  const size_t buf_size_in_bytes = num_elems * sizeof(T);
+  std::vector<T> host_result_buf(num_elems);
+  thrust::device_vector<T> device_result_buf(host_result_buf);
+
+  std::vector<T> host_scratch_buf(num_elems * 8);
+  for (uint32_t i = 0; i < num_elems; ++i) {
+    host_scratch_buf[i] = 1;
+  }
+  thrust::device_vector<T> device_scratch_buf(host_scratch_buf);
+  std::vector<T> host_put_buf(num_elems);
+  thrust::device_vector<T> device_put_buf(host_put_buf);
+
+  mscclpp::UniqueId unique_id;
+  if (rank == 0) unique_id = mscclpp::TcpBootstrap::createUniqueId();
+  MPI_Bcast(&unique_id, sizeof(unique_id), MPI_BYTE, 0, MPI_COMM_WORLD);
+
+  std::vector<int64_t> rank_to_node(nranks);
+  std::vector<int64_t> rank_to_ib(nranks);
+  for (int i = 0; i < nranks; i++) {
+    rank_to_node[i] = i / 8;
+    rank_to_ib[i] = i % 8;
+  }
+
+  cudaStream_t s;
+  CHECK_CUDA_SUCCESS(cudaStreamCreate(&s));
+  CHECK_CUDA_SUCCESS(cudaStreamSynchronize(s));
+  if (nranks == 8) {
+    auto context = std::make_shared<sglang::Msccl1NodeLLcontext>(
+        unique_id,
+        rank,
+        nranks,
+        thrust::raw_pointer_cast(device_scratch_buf.data()),
+        buf_size_in_bytes * 8,
+        rank_to_node,
+        rank_to_ib);
+    printf("rank: %d, Msccl1NodeLLcontext setup.\n", rank);
+    MPI_Barrier(MPI_COMM_WORLD);
+    context->allreduce<T>(
+        s,
+        thrust::raw_pointer_cast(device_buf.data()),
+        thrust::raw_pointer_cast(device_result_buf.data()),
+        device_buf.size());
+  } else if (nranks == 16) {
+    // TODO: this branch is untested since there is something wrong with mpirun in my test machince
+    auto context = std::make_shared<sglang::Msccl2NodeLLcontext>(
+        unique_id,
+        rank,
+        nranks,
+        thrust::raw_pointer_cast(device_scratch_buf.data()),
+        buf_size_in_bytes * 8,
+        thrust::raw_pointer_cast(device_put_buf.data()),
+        buf_size_in_bytes,
+        rank_to_node,
+        rank_to_ib);
+    printf("rank: %d, Msccl2NodeLLcontext setup.\n", rank);
+    MPI_Barrier(MPI_COMM_WORLD);
+    context->allreduce<T>(
+        s,
+        thrust::raw_pointer_cast(device_buf.data()),
+        thrust::raw_pointer_cast(device_result_buf.data()),
+        device_buf.size());
+  }
+
+  // check result correctness
+  thrust::host_vector<T> host_buf_result = device_result_buf;
+  size_t num_results_error_atol_1e_3_rtol_1e_3 = 0;
+  bool nan_detected = false;
+
+  for (uint32_t i = 0; i < num_elems; ++i) {
+    T expected = T(i * nranks + (nranks - 1) * nranks / 2);
+    if (std::isnan(float(host_buf_result[i]))) {
+      nan_detected = true;
+    }
+    if (!isclose(float(host_buf_result[i]), float(expected), 1e-3, 1e-3)) {
+      num_results_error_atol_1e_3_rtol_1e_3++;
+    }
+  }
+  float result_accuracy = 1. - float(num_results_error_atol_1e_3_rtol_1e_3) / float(num_elems);
+
+  printf("rank: %d, nan_detected: %d accuracy: %f\n", rank, nan_detected, result_accuracy);
+
+  CHECK_CUDA_SUCCESS(cudaStreamDestroy(s));
+  MPI_Finalize();
+  return 0;
+}
diff --git a/sgl-kernel/csrc/attention/cascade.cu b/sgl-kernel/csrc/attention/cascade.cu
new file mode 100644
index 000000000..9d49360dd
--- /dev/null
+++ b/sgl-kernel/csrc/attention/cascade.cu
@@ -0,0 +1,55 @@
+// Adapted from
+// https://github.com/flashinfer-ai/flashinfer/blob/55576c626421b5ee7e7ebe74afd26465c8ae863f/csrc/cascade.cu
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <flashinfer/attention/cascade.cuh>
+
+#include "pytorch_extension_utils.h"
+
+using namespace flashinfer;
+
+void merge_state(
+    at::Tensor v_a, at::Tensor s_a, at::Tensor v_b, at::Tensor s_b, at::Tensor v_merged, at::Tensor s_merged) {
+  CHECK_INPUT(v_a);
+  CHECK_INPUT(s_a);
+  CHECK_INPUT(v_b);
+  CHECK_INPUT(s_b);
+  auto device = v_a.device();
+  CHECK_EQ(s_a.device(), device);
+  CHECK_EQ(v_b.device(), device);
+  CHECK_EQ(s_b.device(), device);
+  CHECK_DIM(3, v_a);
+  CHECK_DIM(2, s_a);
+  CHECK_DIM(3, v_b);
+  CHECK_DIM(2, s_b);
+  CHECK_SHAPE(v_a, v_b);
+  CHECK_SHAPE(s_a, s_b);
+  CHECK_EQ(v_a.size(0), s_a.size(0));
+  CHECK_EQ(v_a.size(1), s_b.size(1));
+  unsigned int seq_len = v_a.size(0);
+  unsigned int num_heads = v_a.size(1);
+  unsigned int head_dim = v_a.size(2);
+
+  const c10::cuda::OptionalCUDAGuard device_guard(v_a.device());
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  bool success = DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(v_a.scalar_type(), c_type, [&] {
+    cudaError_t status = MergeState(
+        static_cast<c_type*>(v_a.data_ptr()),
+        static_cast<float*>(s_a.data_ptr()),
+        static_cast<c_type*>(v_b.data_ptr()),
+        static_cast<float*>(s_b.data_ptr()),
+        static_cast<c_type*>(v_merged.data_ptr()),
+        static_cast<float*>(s_merged.data_ptr()),
+        seq_len,
+        num_heads,
+        head_dim,
+        stream);
+    TORCH_CHECK(status == cudaSuccess, "MergeState kernel launch failed: ", cudaGetErrorString(status));
+    return true;
+  });
+
+  TORCH_CHECK(success, "MergeState kernel launch failed: unsupported data type");
+}
diff --git a/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu b/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
new file mode 100644
index 000000000..a41779c1b
--- /dev/null
+++ b/sgl-kernel/csrc/attention/cutlass_mla_kernel.cu
@@ -0,0 +1,274 @@
+/*
+Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/kernel_hardware_info.h>
+#include <torch/all.h>
+
+#include <cute/tensor.hpp>
+#include <iostream>
+
+#include "cutlass_sm100_mla/device/sm100_mla.hpp"
+#include "cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp"
+#include "utils.h"
+
+// clang-format off
+#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
+void cutlass_mla_decode(
+    torch::Tensor const& out,
+    torch::Tensor const& q_nope,
+    torch::Tensor const& q_pe,
+    torch::Tensor const& kv_c_and_k_pe_cache,
+    torch::Tensor const& seq_lens,
+    torch::Tensor const& page_table,
+    torch::Tensor const& workspace,
+    int64_t num_kv_splits) {
+  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
+}
+int64_t cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
+  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_get_workspace_size");
+}
+#else
+
+#define CUTLASS_CHECK(status)                                                       \
+  {                                                                                 \
+    cutlass::Status error = status;                                                 \
+    TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \
+  }
+
+using namespace cute;
+using namespace cutlass::fmha::kernel;
+
+template <bool v>
+struct IsPersistent {
+  static const bool value = v;
+};
+
+template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+struct MlaSm100 {
+  using Element = T;
+  using ElementAcc = float;
+  using ElementOut = T;
+
+  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
+  using TileShapeH = cute::tuple_element_t<0, TileShape>;
+  using TileShapeD = cute::tuple_element_t<2, TileShape>;
+
+  // H K (D_latent D_rope) B
+  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
+
+  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
+  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
+  using StrideO = StrideK;                            // H D B
+  using StrideLSE = cute::tuple<_1, int>;             // H B
+
+  using TileScheduler =
+      std::conditional_t<PersistenceOption::value, Sm100MlaPersistentTileScheduler, Sm100MlaIndividualTileScheduler>;
+
+  using FmhaKernel = cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
+      TileShape,
+      Element,
+      ElementAcc,
+      ElementOut,
+      ElementAcc,
+      TileScheduler,
+      /*kIsCpAsync=*/!IsPaged128>;
+  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
+};
+
+template <typename T>
+typename T::Fmha::Arguments args_from_options(
+    at::Tensor const& out,
+    at::Tensor const& q_nope,
+    at::Tensor const& q_pe,
+    at::Tensor const& kv_c_and_k_pe_cache,
+    at::Tensor const& seq_lens,
+    at::Tensor const& page_table,
+    double sm_scale,
+    int64_t num_kv_splits) {
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = q_nope.device().index();
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  int batches = q_nope.size(0);
+  int page_count_per_seq = page_table.size(1);
+  int page_count_total = kv_c_and_k_pe_cache.size(0);
+  int page_size = kv_c_and_k_pe_cache.size(1);
+  int max_seq_len = page_size * page_count_per_seq;
+  using TileShapeH = typename T::TileShapeH;
+  using TileShapeD = typename T::TileShapeD;
+  auto problem_shape = cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
+
+  auto [H, K, D, B] = problem_shape;
+  auto [D_latent, D_rope] = D;
+
+  float scale = float(sm_scale);
+
+  using StrideQ = typename T::StrideQ;
+  using StrideK = typename T::StrideK;
+  using StrideO = typename T::StrideO;
+  using StrideLSE = typename T::StrideLSE;
+
+  StrideQ stride_Q_nope = cute::make_tuple(
+      static_cast<int64_t>(q_nope.stride(1)), _1{}, static_cast<int64_t>(q_nope.stride(0)));
+  StrideQ stride_Q_pe = cute::make_tuple(
+      static_cast<int64_t>(q_pe.stride(1)), _1{}, static_cast<int64_t>(q_pe.stride(0)));
+
+  StrideK stride_C = cute::make_tuple(
+      static_cast<int64_t>(0 + D_latent + D_rope), _1{}, static_cast<int64_t>(page_size * (D_latent + D_rope)));
+  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
+  StrideLSE stride_LSE = cute::make_tuple(_1{}, 0 + H);
+  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(0 + D_latent), _1{}, static_cast<int64_t>(0 + H * D_latent));
+
+  using Element = typename T::Element;
+  using ElementOut = typename T::ElementOut;
+  using ElementAcc = typename T::ElementAcc;
+  auto Q_nope_ptr = static_cast<Element*>(q_nope.data_ptr());
+  auto Q_pe_ptr = static_cast<Element*>(q_pe.data_ptr());
+  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
+  typename T::Fmha::Arguments arguments{
+      problem_shape,
+      {scale,
+       Q_nope_ptr,
+       stride_Q_nope,
+       Q_pe_ptr,
+       stride_Q_pe,
+       C_ptr,
+       stride_C,
+       C_ptr + D_latent,
+       stride_C,
+       static_cast<int*>(seq_lens.data_ptr()),
+       static_cast<int*>(page_table.data_ptr()),
+       stride_PT,
+       page_count_total,
+       page_size},
+      {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
+      hw_info,
+      // TODO(trevor-m): Change split_kv back to -1 when
+      // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
+      // perform worse with larger context length and smaller batch sizes.
+      static_cast<int>(num_kv_splits), // split_kv
+      nullptr,       // is_var_split_kv
+  };
+  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
+  // split_kv automatically based on batch size and sequence length to balance
+  // workload across available SMs. Consider using var_split_kv for manual
+  // control if needed.
+  T::Fmha::set_split_kv(arguments);
+  return arguments;
+}
+
+template <typename Element, bool IsPaged128, typename PersistenceOption>
+void runMla(
+    at::Tensor const& out,
+    at::Tensor const& q_nope,
+    at::Tensor const& q_pe,
+    at::Tensor const& kv_c_and_k_pe_cache,
+    at::Tensor const& seq_lens,
+    at::Tensor const& page_table,
+    at::Tensor const& workspace,
+    double sm_scale,
+    int64_t num_kv_splits,
+    cudaStream_t stream) {
+  using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
+  typename MlaSm100Type::Fmha fmha;
+  auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+
+  CUTLASS_CHECK(fmha.can_implement(arguments));
+
+  CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
+}
+
+#define DISPATCH_BOOL(expr, const_expr, ...) \
+  [&]() -> bool {                            \
+    if (expr) {                              \
+      constexpr bool const_expr = true;      \
+      return __VA_ARGS__();                  \
+    } else {                                 \
+      constexpr bool const_expr = false;     \
+      return __VA_ARGS__();                  \
+    }                                        \
+  }()
+
+void cutlass_mla_decode(
+    torch::Tensor const& out,
+    torch::Tensor const& q_nope,
+    torch::Tensor const& q_pe,
+    torch::Tensor const& kv_c_and_k_pe_cache,
+    torch::Tensor const& seq_lens,
+    torch::Tensor const& page_table,
+    torch::Tensor const& workspace,
+    double sm_scale,
+    int64_t num_kv_splits) {
+  auto sm_version = getSMVersion();
+  // On SM103a, half of the accuracy tests are failing.
+  TORCH_CHECK(sm_version == 100, "cutlass_mla_decode is only supported on compute capability 10.0, but found sm version ", sm_version);
+
+  auto in_dtype = q_nope.dtype();
+  at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(q_nope.get_device());
+  const int page_size = kv_c_and_k_pe_cache.size(1);
+
+  // NOTE(alcanderian): IsPersistent has bug with manual split_kv.
+  // Kernel will hang if batch is too large with large num_kv_splits. (for example bs=8, num_kv_splits=8)
+  // Maybe per batch split kv will fix this.
+  DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
+    DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
+      if (in_dtype == at::ScalarType::Half) {
+        runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+      } else if (in_dtype == at::ScalarType::BFloat16) {
+        runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+      } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
+        runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+      } else {
+        TORCH_CHECK(false, "Unsupported input data type of MLA");
+      }
+      return true;
+    });
+    return true;
+  });
+}
+
+int64_t cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
+  // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
+  // which are float, so Element type here doesn't matter.
+  using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
+
+  // Get split kv. Requires problem shape and sm_count only.
+  typename MlaSm100Type::Fmha::Arguments arguments;
+  using TileShapeH = typename MlaSm100Type::TileShapeH;
+  using TileShapeD = typename MlaSm100Type::TileShapeD;
+  arguments.problem_shape =
+      cute::make_tuple(TileShapeH{}, static_cast<int>(max_seq_len), TileShapeD{}, static_cast<int>(num_batches));
+  // Assumes device 0 when getting sm_count.
+  arguments.hw_info.sm_count =
+      sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count;
+  arguments.split_kv = static_cast<int>(num_kv_splits);
+  MlaSm100Type::Fmha::set_split_kv(arguments);
+
+  return MlaSm100Type::Fmha::get_workspace_size(arguments);
+}
+
+#endif
+// clang-format on
diff --git a/sgl-kernel/csrc/attention/cutlass_sm100_mla/device/sm100_mla.hpp b/sgl-kernel/csrc/attention/cutlass_sm100_mla/device/sm100_mla.hpp
new file mode 100644
index 000000000..dd4ed231b
--- /dev/null
+++ b/sgl-kernel/csrc/attention/cutlass_sm100_mla/device/sm100_mla.hpp
@@ -0,0 +1,358 @@
+/***************************************************************************************************
+ * Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief An universal device layer for cutlass 3.x-style kernels.
+*/
+
+// clang-format off
+#pragma once
+
+// common
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+
+#if !defined(__CUDACC_RTC__)
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/trace.h"
+#endif // !defined(__CUDACC_RTC__)
+
+#include "../kernel/sm100_fmha_mla_tma_warpspecialized.hpp"
+#include "../kernel/sm100_fmha_mla_reduction.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::fmha::device {
+
+using namespace cute;
+using namespace cutlass::fmha::kernel;
+
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////// CUTLASS 3.x API /////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template<
+    class Kernel_
+>
+class MLA {
+public:
+
+  using Kernel = Kernel_;
+
+  using ReductionKernel = cutlass::fmha::kernel::Sm100FmhaMlaReductionKernel<
+      typename Kernel::ElementOut,
+      typename Kernel::ElementAcc,
+      typename Kernel::ElementAcc,
+      Kernel::TileShapeH::value,
+      Kernel::TileShapeL::value,
+      256 /*Max split*/
+  >;
+
+  /// Argument structure: User API
+  using KernelArguments = typename Kernel::Arguments;
+  using ReductionArguments = typename ReductionKernel::Arguments;
+
+  using Arguments = KernelArguments;
+
+  /// Argument structure: Kernel API
+  using KernelParams = typename Kernel::Params;
+  using ReductionParams = typename ReductionKernel::Params;
+  struct Params {
+    KernelParams fmha_params;
+    ReductionParams reduction_params;
+  };
+
+private:
+
+  /// Kernel API parameters object
+  Params params_;
+
+  bool is_initialized(bool set = false) {
+    static bool initialized = false;
+    if (set) initialized = true;
+    return initialized;
+  }
+
+  static ReductionArguments to_reduction_args(Arguments const& args) {
+    auto [H, K, D, B] = args.problem_shape;
+    return ReductionArguments{
+      nullptr, args.epilogue.ptr_o, nullptr, args.epilogue.ptr_lse,
+      args.mainloop.softmax_scale, B, args.split_kv, K, args.mainloop.ptr_seq,
+      args.ptr_split_kv, Kernel::TileShapeS::value
+    };
+  }
+
+public:
+
+  /// Access the Params structure
+  Params const& params() const {
+    return params_;
+  }
+
+  static void set_split_kv (KernelArguments& args) {
+    if (args.split_kv >= 1) return;
+    auto [H, K, D, B] = args.problem_shape;
+    int sm_count = args.hw_info.sm_count;
+    int max_splits = ceil_div(K, 128);
+    int sms_per_batch = max(1, sm_count / B);
+    int split_heur = min(max_splits, sms_per_batch);
+    int waves = ceil_div(B * split_heur, sm_count);
+    int k_waves = ceil_div(max_splits, split_heur);
+    int split_wave_aware = ceil_div(max_splits, k_waves);
+    args.split_kv = split_wave_aware;
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status
+  can_implement(Arguments const& args) {
+    if (! Kernel::can_implement(args)) {
+      return Status::kInvalid;
+    }
+    if (! ReductionKernel::can_implement(to_reduction_args(args))) {
+      return Status::kInvalid;
+    }
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    size_t workspace_bytes = 0;
+    workspace_bytes += Kernel::get_workspace_size(args);
+    workspace_bytes += ReductionKernel::get_workspace_size(to_reduction_args(args));
+    return workspace_bytes;
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int /* smem_capacity */ = -1) {
+    CUTLASS_TRACE_HOST("MLA::maximum_active_blocks()");
+    int max_active_blocks = -1;
+    int smem_size = Kernel::SharedStorageSize;
+
+    // first, account for dynamic smem capacity if needed
+    cudaError_t result;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      result = cudaFuncSetAttribute(
+          device_kernel<Kernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST(
+          "  cudaFuncSetAttribute() returned error: "
+          << cudaGetErrorString(result));
+        return -1;
+      }
+    }
+
+    // query occupancy after setting smem size
+    result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        device_kernel<Kernel>,
+        Kernel::MaxThreadsPerBlock,
+        smem_size);
+
+    if (cudaSuccess != result) {
+      result = cudaGetLastError(); // to clear the error bit
+      CUTLASS_TRACE_HOST(
+        "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: "
+        << cudaGetErrorString(result));
+      return -1;
+    }
+
+    CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+    return max_active_blocks;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status
+  initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("MLA::initialize() - workspace "
+      << workspace << ", stream: " << (stream ? "non-null" : "null"));
+
+    // Initialize the workspace
+    Status status = Kernel::initialize_workspace(args, workspace, stream);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    status = ReductionKernel::initialize_workspace(to_reduction_args(args), workspace, stream);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    KernelParams kernel_params = Kernel::to_underlying_arguments(args, workspace);
+
+    ReductionArguments reduction_args = to_reduction_args(args);
+    if (reduction_args.split_kv > 1) {
+      reduction_args.ptr_oaccum   = kernel_params.epilogue.ptr_o_acc;
+      reduction_args.ptr_lseaccum = kernel_params.epilogue.ptr_lse_acc;
+    }
+    ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
+    // Initialize the Params structure
+    params_ = Params {kernel_params, reduction_params};
+
+    if (is_initialized()) return Status::kSuccess;
+
+    // account for dynamic smem capacity if needed
+    // no dynamic smem is needed for reduction kernel
+    int smem_size = Kernel::SharedStorageSize;
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      cudaError_t result = cudaFuncSetAttribute(
+          device_kernel<Kernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
+        return Status::kErrorInternal;
+      }
+    }
+
+    is_initialized(true);
+
+    return Status::kSuccess;
+  }
+
+  /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params.
+  Status
+  update(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("MLA()::update() - workspace: " << workspace);
+
+    size_t workspace_bytes = get_workspace_size(args);
+    if (workspace_bytes > 0 && nullptr == workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    auto fmha_params = Kernel::to_underlying_arguments(args, workspace);
+
+    ReductionArguments reduction_args = to_reduction_args(args);
+    if (reduction_args.split_kv > 1) {
+      reduction_args.ptr_oaccum   = fmha_params.epilogue.ptr_o_acc;
+      reduction_args.ptr_lseaccum = fmha_params.epilogue.ptr_lse_acc;
+    }
+    ReductionParams reduction_params = ReductionKernel::to_underlying_arguments(reduction_args, workspace);
+    // Initialize the Params structure
+    params_ = Params {fmha_params, reduction_params};
+
+    return Status::kSuccess;
+  }
+
+  /// Primary run() entry point API that is static allowing users to create and manage their own params.
+  /// Supplied params struct must be construct by calling Kernel::to_underling_arguments()
+  static Status
+  run(Params& params, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("MLA::run()");
+    dim3 const block = Kernel::get_block_shape();
+    dim3 const grid = Kernel::get_grid_shape(params.fmha_params);
+
+    // configure smem size and carveout
+    int smem_size = Kernel::SharedStorageSize;
+
+    Status launch_result;
+    // Use extended launch API only for mainloops that use it
+    if constexpr(Kernel::ArchTag::kMinComputeCapability >= 90) {
+      dim3 cluster(cute::size<0>(typename Kernel::ClusterShape{}),
+                   cute::size<1>(typename Kernel::ClusterShape{}),
+                   cute::size<2>(typename Kernel::ClusterShape{}));
+      void const* kernel = (void const*) device_kernel<Kernel>;
+      void* kernel_params[] = {&params.fmha_params};
+      launch_result = ClusterLauncher::launch(grid, cluster, block, smem_size, stream, kernel, kernel_params);
+    }
+    else {
+      launch_result = Status::kSuccess;
+      device_kernel<Kernel><<<grid, block, smem_size, stream>>>(params.fmha_params);
+    }
+
+    cudaError_t result = cudaGetLastError();
+    if (cudaSuccess != result or Status::kSuccess != launch_result) {
+      //return Status::kSuccess;
+      CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
+      return Status::kErrorInternal;
+    }
+    if (params.reduction_params.split_kv > 1) {
+      // launch reduction kernel
+      dim3 const block = ReductionKernel::get_block_shape();
+      dim3 const grid  = ReductionKernel::get_grid_shape(params.reduction_params);
+      device_kernel<ReductionKernel><<<grid, block, 0, stream>>>(params.reduction_params);
+      cudaError_t result = cudaGetLastError();
+      if (cudaSuccess == result) {
+        return Status::kSuccess;
+      }
+      else {
+        CUTLASS_TRACE_HOST("  Kernel launch failed. Reason: " << result);
+        return Status::kErrorInternal;
+      }
+    }
+    else {
+      return Status::kSuccess;
+    }
+  }
+
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  run(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    Status status = initialize(args, workspace, stream);
+    if (Status::kSuccess == status) {
+      status = run(params_, stream);
+    }
+    return status;
+  }
+
+  /// Launches the kernel after first constructing Params internal state from supplied arguments.
+  Status
+  operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    return run(args, workspace, stream);
+  }
+
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  run(cudaStream_t stream = nullptr) {
+    return run(params_, stream);
+  }
+
+  /// Overload that allows a user to re-launch the same kernel without updating internal params struct.
+  Status
+  operator()(cudaStream_t stream = nullptr) {
+    return run(params_, stream);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::fmha::device
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp b/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp
new file mode 100644
index 000000000..b75870d0c
--- /dev/null
+++ b/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_fmha_mla_reduction.hpp
@@ -0,0 +1,198 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// clang-format off
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/arch.h"
+#include "cute/tensor.hpp"
+
+namespace cutlass::fmha::kernel {
+
+using namespace cute;
+template<
+    class ElementOut,
+    class ElementAcc,
+    class ElementScale,
+    size_t kNumHeads,
+    size_t kHeadDimLatent,
+    int kMaxSplits
+>
+struct Sm100FmhaMlaReductionKernel {
+
+  static const int SharedStorageSize = 0;
+  static const int MaxThreadsPerBlock = 128;
+  static const int MinBlocksPerMultiprocessor = 1;
+
+  using ArchTag = cutlass::arch::Sm100;
+
+  static_assert(kHeadDimLatent % MaxThreadsPerBlock == 0);
+  struct Arguments {
+    ElementAcc* ptr_oaccum = nullptr;
+    ElementOut* ptr_o = nullptr;
+    ElementAcc* ptr_lseaccum = nullptr;
+    ElementAcc* ptr_lse = nullptr;
+    ElementScale scale = 1.f;
+    int num_batches = 0;
+    int split_kv = -1;
+    int dim_k = -1;
+    int* ptr_seq = nullptr;
+    int* ptr_split_kv = nullptr;
+    int tile_shape_s = 128;
+  };
+  using Params = Arguments;
+
+  static Params to_underlying_arguments(Arguments const& args, void* workspace) {
+    return {args.ptr_oaccum, args.ptr_o, args.ptr_lseaccum, args.ptr_lse,
+	    args.scale, args.num_batches, args.split_kv, args.dim_k, args.ptr_seq,
+	    args.ptr_split_kv, args.tile_shape_s};
+  }
+
+  static size_t get_workspace_size(Arguments const& /*args*/) {
+    return 0;
+  }
+
+  static Status initialize_workspace(
+      Arguments const& /*args*/, void* /*ws*/, cudaStream_t /*stream*/) {
+    return Status::kSuccess;
+  }
+
+  static dim3 get_grid_shape(Params const& params) {
+    return dim3(kNumHeads, 1, params.num_batches);
+  }
+
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  static bool can_implement(Arguments const& args) {
+    if (args.num_batches <= 0) return false;
+    if (args.split_kv <= 0) return false;
+    return true;
+  }
+
+  CUTLASS_DEVICE void operator() (Params const& params, char* smem_raw) {
+    if (params.split_kv <= 1) return;
+    auto blk_coord = make_coord(blockIdx.x, _0{}, blockIdx.z);
+
+    __shared__ ElementAcc sLseScale[kMaxSplits];
+    const size_t offset_lseaccum = get<0>(blk_coord) + kNumHeads * params.split_kv * get<2>(blk_coord);
+    const size_t offset_lse = get<0>(blk_coord) + kNumHeads * get<2>(blk_coord);
+
+    Tensor gLSEaccum = make_tensor(make_gmem_ptr(params.ptr_lseaccum + offset_lseaccum),
+                                   make_shape(params.split_kv), Stride<Int<kNumHeads>>{});
+
+    Tensor gLSE = make_tensor(make_gmem_ptr(params.ptr_lse + offset_lse),
+                              Shape<_1>{}, Stride<_1>{});
+
+    auto dim_k = params.ptr_seq == nullptr ?  params.dim_k : params.ptr_seq[get<2>(blk_coord)];
+    auto local_split_kv = params.ptr_split_kv == nullptr ? params.split_kv : params.ptr_split_kv[get<2>(blk_coord)];
+    auto k_tile_total = ceil_div(dim_k, params.tile_shape_s);
+    auto k_tile_per_cta = ceil_div(k_tile_total, local_split_kv);
+    local_split_kv = ceil_div(k_tile_total, k_tile_per_cta);
+
+    int warp_idx = cutlass::canonical_warp_idx_sync();
+    if (warp_idx == 0) {
+      constexpr int kNLsePerThread = cute::ceil_div(kMaxSplits, 32);
+
+      ElementAcc local_lse[kNLsePerThread];
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        const int split = i * 32 + threadIdx.x;
+        local_lse[i] = split < local_split_kv ? gLSEaccum(split) : -std::numeric_limits<ElementAcc>::infinity();
+      }
+
+      ElementAcc lse_max = -std::numeric_limits<ElementAcc>::infinity();
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        lse_max = max(lse_max, local_lse[i]);
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int offset = 16; offset >= 1; offset /= 2) {
+        lse_max = max(lse_max, __shfl_xor_sync(0xffffffff, lse_max, offset));
+      }
+      lse_max = lse_max == -std::numeric_limits<ElementAcc>::infinity() ? 0.0f : lse_max;  // In case all local LSEs are -inf
+      lse_max = __shfl_sync(0xffffffff, lse_max, 0);
+
+      ElementAcc sum_lse = 0;
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        sum_lse = sum_lse + expf(local_lse[i] - lse_max);
+      }
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int offset = 16; offset >= 1; offset /= 2) {
+        sum_lse = sum_lse + __shfl_xor_sync(0xffffffff, sum_lse, offset);
+      }
+
+      sum_lse = __shfl_sync(0xffffffff, sum_lse, 0);
+
+      ElementAcc global_lse = (sum_lse == 0.f || sum_lse != sum_lse) ? std::numeric_limits<ElementAcc>::infinity() : logf(sum_lse) + lse_max;
+      if (threadIdx.x == 0 and params.ptr_lse != nullptr) {
+        gLSE(0) = global_lse;
+      }
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kNLsePerThread; ++i) {
+        const int split = i * 32 + threadIdx.x;
+        if (split < local_split_kv) {
+          sLseScale[split] = expf(local_lse[i] - global_lse);
+        }
+      }
+    }
+    __syncthreads();
+
+    constexpr int Elements = kHeadDimLatent / MaxThreadsPerBlock;
+    const size_t offset_oaccum = kHeadDimLatent * params.split_kv * (get<0>(blk_coord) + kNumHeads * get<2>(blk_coord));
+    Tensor gOaccum = make_tensor(make_gmem_ptr(params.ptr_oaccum + offset_oaccum),
+                               Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
+    ElementAcc local_val[Elements] = {0};
+    for (int split = 0; split < local_split_kv; ++split) {
+      ElementAcc lse_scale = sLseScale[split];
+      CUTLASS_PRAGMA_UNROLL
+      for(int i = 0; i < Elements; ++i) {
+        local_val[i] += lse_scale * gOaccum(threadIdx.x + MaxThreadsPerBlock * i);
+      }
+      gOaccum.data() = gOaccum.data() + kHeadDimLatent;
+    }
+    auto ptr_o_local = params.ptr_o + (get<0>(blk_coord) + get<2>(blk_coord) * kNumHeads) * kHeadDimLatent;
+    Tensor gO = make_tensor(make_gmem_ptr(ptr_o_local), Shape<Int<kHeadDimLatent>>{}, Stride<_1>{});
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int i = 0; i < Elements; ++i) {
+      gO(threadIdx.x + MaxThreadsPerBlock * i) = static_cast<ElementOut>(local_val[i]);
+    }
+  }
+};
+
+}  // namespace cutlass::fmha::kernel
diff --git a/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp b/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
new file mode 100644
index 000000000..9809db84e
--- /dev/null
+++ b/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@@ -0,0 +1,2018 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// clang-format off
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cute/arch/simd_sm100.hpp"
+
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory_sm80.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+
+#include "gather_tensor.hpp"  // from examples/common
+#include "common/pow_2.hpp"
+
+namespace cutlass::fmha::kernel {
+
+using namespace cute;
+
+template<
+    class TileShape,
+    class Element_,
+    class ElementAcc_,
+    class ElementOut_,
+    class ElementLSE_,
+    class TileScheduler,
+#ifdef CPASYNC
+    bool kIsCpAsync = true
+#else
+    bool kIsCpAsync = false
+#endif
+>
+struct Sm100FmhaMlaKernelTmaWarpspecialized {
+
+  using Element = Element_;
+  using ElementAcc = ElementAcc_;
+  using ElementOut = ElementOut_;
+  using ElementLSE = ElementLSE_;
+
+  // only 2Sm mode is supported
+  static const bool kIs2Sm = true;
+  static const int MaxThreadsPerBlock = 256;
+  static const int MinBlocksPerMultiprocessor = 1;
+  static const int TotalSNum = 2;
+  static const int TotalPNum = 2;
+  using ArchTag = cutlass::arch::Sm100;
+
+  using ClusterShape = cute::conditional_t<kIs2Sm, Shape<_2, _1, _1>, Shape<_1, _1, _1>>;
+
+  using TileShapeH = tuple_element_t<0, TileShape>;
+  using TileShapeS = tuple_element_t<1, TileShape>;
+  using TileShapeD = tuple_element_t<2, TileShape>;
+
+  using TileShapeL = tuple_element_t<0, TileShapeD>;
+  using TileShapeR = tuple_element_t<1, TileShapeD>;
+  static_assert(TileShapeL{} % TileShapeR{} == 0, "Rope head dim must divide latent head dim");
+
+  using ProblemShape = Shape<TileShapeH, int, TileShapeD, int>;
+  using TensorStride   = Stride<int64_t, _1, int64_t>;
+  using TmemAllocator = cute::conditional_t<kIs2Sm, cute::TMEM::Allocator2Sm, cute::TMEM::Allocator1Sm>;
+
+  static_assert(TileShapeH{} == 128);
+  static const int kWarpsInN = kIs2Sm ? 2 : 1;
+
+  static const int kNumComputeWarps = 4;
+  static const int kNumLoadWarps = kIsCpAsync ? 2 : 1;
+
+  enum class WarpRole {
+    kMma = 0x1, kLoad = 0x2, kCompute = 0x3, kLoadPageTable = 0x4, kEmpty=0x0
+  };
+
+  static const long long unsigned int kWarpAssignment = kIsCpAsync ? 0x4221'3333ull : 0x0021'3333ull;
+
+  static CUTLASS_DEVICE WarpRole warp_idx_to_role(int warp_idx) {
+      return static_cast<WarpRole>((kWarpAssignment >> (4 * warp_idx)) & 0xF);
+  }
+
+  static const int Alignment = 128 / sizeof_bits_v<Element>;
+  static const int AlignmentOut = 128 / sizeof_bits_v<ElementOut>;
+
+  using TileShapeQK = Shape<TileShapeH, TileShapeS, decltype(TileShapeR{} / _1{})>;
+  static const int StagesQK = 24 / sizeof(Element);  // free parameter
+  static const int IterationsQKLatent = decltype(TileShapeL{} / get<2>(TileShapeQK{}))::value;
+  static const int IterationsQKRope = decltype(TileShapeR{} / get<2>(TileShapeQK{}))::value;
+  static const int IterationsQK = IterationsQKLatent + IterationsQKRope;
+
+  using Schedule = cute::conditional_t<kIs2Sm, cutlass::gemm::KernelTmaWarpSpecialized2SmSm100, cutlass::gemm::KernelTmaWarpSpecialized1SmSm100>;
+  using CollectiveMmaQK = typename cutlass::gemm::collective::CollectiveBuilder<
+      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
+      Element, TensorStride, Alignment,
+      Element, TensorStride, Alignment,
+      ElementAcc,
+      TileShapeQK, ClusterShape, cutlass::gemm::collective::StageCount<StagesQK>,
+      Schedule>::CollectiveOp;
+  using TiledMmaQK = typename CollectiveMmaQK::TiledMma;
+  using CtaShapeQK = typename CollectiveMmaQK::CtaShape_MNK;
+
+  // chosen for unified smem staging between K and V
+  using TileShapePV = Shape<TileShapeH, _256, _32>;
+  using TransposeTensorStride = decltype(select<1,0,2>(TensorStride{}));
+  static const int StagesPV = StagesQK;  // not sure why, but must be at least two. check pipes
+  static const int IterationsPV_K = decltype(TileShapeS{} / get<2>(TileShapePV{}))::value;
+  static const int IterationsPV_N = decltype(TileShapeL{} / get<1>(TileShapePV{}))::value;
+
+  using CollectiveMmaPV = typename cutlass::gemm::collective::CollectiveBuilder<
+      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
+      Element, TensorStride, Alignment,
+      Element, TransposeTensorStride, Alignment,
+      ElementAcc,
+      TileShapePV, ClusterShape, cutlass::gemm::collective::StageCount<StagesPV>,
+      Schedule>::CollectiveOp;
+  using CtaShapePV = typename CollectiveMmaPV::CtaShape_MNK;
+  static_assert(std::is_same_v<TransposeTensorStride, typename CollectiveMmaPV::StrideB>);
+
+  using TiledMmaPV = typename CollectiveMmaPV::TiledMma;
+
+  using AtomThrShapeMNK = typename CollectiveMmaQK::AtomThrShapeMNK;
+  static_assert(typename CollectiveMmaQK::AtomThrShapeMNK{} == typename CollectiveMmaPV::AtomThrShapeMNK{}, "schedule must match");
+
+  static const int StagesPageTable = kIsCpAsync ? StagesPV : 1;
+
+  // pipelines from load to mma, PipelineTmaUmmaAsync, stages tbd
+  // use expect_tx for Q load
+  using PipelineLoadQK = cute::conditional_t<kIsCpAsync, PipelineUmmaConsumerAsync<StagesQK, AtomThrShapeMNK>, PipelineTmaUmmaAsync<StagesQK, ClusterShape, AtomThrShapeMNK>>;
+  using PipelineLoadPV = PipelineLoadQK;
+  // pipeline from mma (Q@K) to softmax, PipelineUmmaAsync, 2 stages
+  using PipelineS = PipelineUmmaAsync<TotalSNum, AtomThrShapeMNK>;
+  // pipeline from softmax (P) to mma (bmm2), PipelineUmmaAsync, 2 stages
+  using PipelineP = PipelineUmmaConsumerAsync<TotalPNum, AtomThrShapeMNK>;
+  // pipeline from mma to softmax (for rescale), PipelineUmmaAsync, 1 stage
+  using PipelineO = PipelineUmmaAsync<1, AtomThrShapeMNK>;
+
+  using PipelinePT = PipelineAsync<StagesPageTable>;
+
+  struct PipelineStorage {
+    alignas(16) typename PipelineLoadQK::SharedStorage load_qk;
+    alignas(16) typename PipelineS::SharedStorage mma_s;
+    alignas(16) typename PipelineP::SharedStorage p_mma;
+    alignas(16) typename PipelineO::SharedStorage mma_o;
+    alignas(16) typename PipelinePT::SharedStorage load_page_table;
+  };
+
+  template<class Layout, class Stages = _1>
+  static CUTE_DEVICE constexpr auto unstageSmemLayout(Layout const& layout, Stages stages = {}) {
+      return composition(layout, make_tuple(_, _, _, make_layout(stages)));
+  }
+
+  using SmemLayoutQ = decltype(unstageSmemLayout(typename CollectiveMmaQK::SmemLayoutA{}, Int<IterationsQK>{}));
+  using SmemLayoutKC = typename CollectiveMmaQK::SmemLayoutB;
+  using SmemLayoutVC = typename CollectiveMmaPV::SmemLayoutB;
+  using SmemLayoutP = decltype(unstageSmemLayout(typename CollectiveMmaPV::SmemLayoutA{}, make_shape(Int<IterationsPV_K>{}, _2{})));
+
+  static const int kBytesLoadQ  = size(AtomThrShapeMNK{}) * cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutQ{})) * cute::sizeof_bits_v<Element>);
+  static const int kBytesLoadKC = size(AtomThrShapeMNK{}) * cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutKC{})) * cute::sizeof_bits_v<Element>);
+  static const int kBytesLoadVC = size(AtomThrShapeMNK{}) * cutlass::bits_to_bytes(cosize(take<0,3>(SmemLayoutVC{})) * cute::sizeof_bits_v<Element>);
+  // pre-condition for overlapped smem staging
+  static_assert(kBytesLoadKC == kBytesLoadVC);
+  static_assert(StagesQK == StagesPV);
+
+  static const int kTransactionsBytesLoadQK = kBytesLoadKC;
+  static const int kTransactionsBytesLoadExtraQ = kBytesLoadQ;
+  static const int kTransactionsBytesLoadPV = kBytesLoadVC;
+
+  static const int kNamedBarrierExchange = (int) cutlass::arch::ReservedNamedBarriers::TransformBarrier;
+  // This Named Barrier is introduced to solve Q tile loading overwritten issue when enable persistent
+  // tile scheduler for FP8 MLA.
+  static const int kNamedBarrierEpilogue = (int) cutlass::arch::ReservedNamedBarriers::EpilogueBarrier;
+  //
+  static const int kNamedBarrierTmemDealloc = (int) cutlass::arch::ReservedNamedBarriers::TmemAllocBarrier;
+
+  enum class TmemAllocation : uint32_t {
+    kSizeS = TileShapeS::value / kWarpsInN,
+    // Overall
+    kSizeO = TileShapeL::value / kWarpsInN,
+    // Between accumulators we loop over
+    kSizeAccO = decltype(get<1>(TileShapePV{}))::value / kWarpsInN,
+    kNumS = TotalSNum,
+    kNumP = TotalPNum,
+    kNumO = 1,
+    kS0 = 0,
+    kS1 = kS0 + kSizeS,
+    kO0 = kS1 + kSizeS,
+    kTotal = kO0 + kSizeO
+  };
+
+  static_assert(static_cast<int>(TmemAllocation::kTotal) <= TmemAllocator::Sm100TmemCapacityColumns, "using too much tmem");
+
+  struct TensorStorage {
+    // to communicate max and row_sum
+    cute::array<ElementAcc, kNumComputeWarps * cutlass::NumThreadsPerWarp> smem_exchange;
+    cute::array<int, StagesPageTable * TileShapeS::value> smem_page_table;
+    alignas(2048) cute::array<Element, cute::cosize_v<SmemLayoutQ>> smem_q;
+    union {
+      alignas(2048) cute::array<Element, cute::cosize_v<SmemLayoutKC>> smem_kc;
+      alignas(2048) cute::array<Element, cute::cosize_v<SmemLayoutVC>> smem_vc;
+    };
+    alignas(2048) cute::array<Element, cute::cosize_v<SmemLayoutP>> smem_p;
+  };
+
+  struct SharedStorage {
+    PipelineStorage pipelines;
+    TensorStorage tensors;
+    uint32_t tmem_base_ptr;
+  };
+
+  static const int SharedStorageSize = sizeof(SharedStorage);
+  static_assert(SharedStorageSize <= cutlass::arch::sm100_smem_capacity_bytes, "using too much smem");
+
+  struct MainloopArguments {
+    ElementAcc softmax_scale;
+
+    // all tensors strides are (num_heads or seqlen, head_dim, batch)
+    // head_dim stride is always 1
+    Element* ptr_q_latent;
+    TensorStride stride_q_latent;
+    Element* ptr_q_rope;
+    TensorStride stride_q_rope;
+
+    Element* ptr_c_latent;
+    TensorStride stride_c_latent;
+    Element* ptr_k_rope;
+    TensorStride stride_k_rope;
+
+    // for paged attention, we interpret what was previously [batch, seqlen]
+    // as [page_count, page_size], and index according to page_table
+    int* ptr_seq = nullptr;
+    int* ptr_page_table = nullptr;
+    // page table is [batch, seqlen or similar]
+    Stride<_1, int> stride_page_table = {};
+    int page_count = 0;
+    int page_size = TileShapeS{};  // powers of two if kIsCpAsync, otherwise TileShapeS
+  };
+
+  struct EpilogueArguments {
+    ElementOut* ptr_o = nullptr;
+    TensorStride stride_o;
+    ElementLSE* ptr_lse = nullptr;
+    Stride<_1, int> stride_lse;
+    ElementAcc output_scale = 1.0f;
+  };
+
+  struct Arguments {
+    // (num_heads=128, seqlen, (d_latent=512, d_rope=64), batch_count)
+    // for paged attention, seqlen is max seqlen
+    ProblemShape problem_shape;
+    MainloopArguments mainloop;
+    EpilogueArguments epilogue;
+    KernelHardwareInfo hw_info;
+    int split_kv = -1;
+    int* ptr_split_kv = nullptr;
+  };
+
+  using TmaLoadQLatent = typename CollectiveMmaQK::Params::TMA_A;
+  using TmaLoadQRope = typename CollectiveMmaQK::Params::TMA_A;
+  using TmaLoadCLatent = typename CollectiveMmaQK::Params::TMA_B;
+  using TmaLoadKRope = typename CollectiveMmaQK::Params::TMA_B;
+  using TmaLoadCLatentTranspose = typename CollectiveMmaPV::Params::TMA_B;
+
+  struct MainloopParams {
+    TmaLoadQLatent tma_load_q_latent;
+    TmaLoadQRope tma_load_q_rope;
+    TmaLoadCLatent tma_load_c_latent;
+    TmaLoadKRope tma_load_k_rope;
+    TmaLoadCLatentTranspose tma_load_c_latent_transpose;
+  };
+
+  struct EpilogueParams {
+    ElementOut* ptr_o = nullptr;
+    ElementAcc* ptr_o_acc = nullptr;
+    TensorStride stride_o;
+    TensorStride stride_o_acc;
+    ElementLSE* ptr_lse = nullptr;
+    ElementLSE* ptr_lse_acc = nullptr;
+    Stride<_1, int> stride_lse;
+    Stride<_1, int> stride_lse_acc;
+    ElementAcc output_scale = 1.0f;
+  };
+
+  struct Params {
+    ProblemShape problem_shape;
+    MainloopArguments mainloop;
+    EpilogueParams epilogue;
+    MainloopParams mainloop_params;
+    typename TileScheduler::Params tile_scheduler;
+    int split_kv = -1;
+    int* ptr_split_kv = nullptr;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args, void* workspace) {
+    //workspace = nullptr;  // let's get an error if one of these needs workspace
+
+    auto [H, K, D, B] = args.problem_shape;
+    auto [L, R] = D;
+
+    int paged_B = B;
+    int paged_K = K;
+    if (args.mainloop.ptr_page_table != nullptr) {
+      paged_B = args.mainloop.page_count;
+      paged_K = args.mainloop.page_size;
+    }
+
+    auto params_qk_latent = CollectiveMmaQK::to_underlying_arguments(
+        make_shape(H, K, L, B),
+        typename CollectiveMmaQK::Arguments {
+          args.mainloop.ptr_q_latent, args.mainloop.stride_q_latent,
+          args.mainloop.ptr_c_latent, args.mainloop.stride_c_latent,
+        }, nullptr);
+
+    auto params_qk_latent_paged = CollectiveMmaQK::to_underlying_arguments(
+        make_shape(H, paged_K, L, paged_B),
+        typename CollectiveMmaQK::Arguments {
+          args.mainloop.ptr_q_latent, args.mainloop.stride_q_latent,
+          args.mainloop.ptr_c_latent, args.mainloop.stride_c_latent,
+        }, nullptr);
+
+    auto params_qk_rope = CollectiveMmaQK::to_underlying_arguments(
+        make_shape(H, K, R, B),
+        typename CollectiveMmaQK::Arguments {
+          args.mainloop.ptr_q_rope, args.mainloop.stride_q_rope,
+          args.mainloop.ptr_k_rope, args.mainloop.stride_k_rope,
+        }, nullptr);
+
+    auto params_qk_rope_paged = CollectiveMmaQK::to_underlying_arguments(
+        make_shape(H, paged_K, R, paged_B),
+        typename CollectiveMmaQK::Arguments {
+          args.mainloop.ptr_q_rope, args.mainloop.stride_q_rope,
+          args.mainloop.ptr_k_rope, args.mainloop.stride_k_rope,
+        }, nullptr);
+
+
+    auto stride_c_latent_transpose = select<1,0,2>(args.mainloop.stride_c_latent);
+    auto params_pv_latent = CollectiveMmaPV::to_underlying_arguments(
+        make_shape(H, L, paged_K, paged_B),
+        typename CollectiveMmaPV::Arguments {
+          args.mainloop.ptr_q_latent, args.mainloop.stride_q_latent,  // dummy, never used
+          args.mainloop.ptr_c_latent, stride_c_latent_transpose,
+        }, nullptr);
+
+    MainloopParams mainloop_params {
+      params_qk_latent.tma_load_a,
+      params_qk_rope.tma_load_a,
+      params_qk_latent_paged.tma_load_b,
+      params_qk_rope_paged.tma_load_b,
+      params_pv_latent.tma_load_b
+    };
+
+    EpilogueParams epilogue_params;
+
+    epilogue_params.ptr_o = args.epilogue.ptr_o;
+    epilogue_params.stride_o = args.epilogue.stride_o;
+    epilogue_params.ptr_lse = args.epilogue.ptr_lse;
+    epilogue_params.stride_lse = args.epilogue.stride_lse;
+    epilogue_params.output_scale = args.epilogue.output_scale;
+
+    if (args.split_kv > 1) {
+      ElementAcc* ptr_o_acc   = reinterpret_cast<ElementAcc*>(workspace);
+      ElementLSE* ptr_lse_acc = reinterpret_cast<ElementLSE*>(ptr_o_acc + H * L * args.split_kv * B);
+      epilogue_params.ptr_o_acc   = ptr_o_acc;
+      epilogue_params.ptr_lse_acc = ptr_lse_acc;
+
+      epilogue_params.stride_o_acc = make_tuple(static_cast<int64_t>(0 + L) * args.split_kv, _1{}, static_cast<int64_t>(0 + H * L) * args.split_kv);
+      epilogue_params.stride_lse_acc = make_tuple(_1{}, (0 + H) * args.split_kv);
+    }
+
+    return {args.problem_shape, args.mainloop, epilogue_params, mainloop_params,
+            TileScheduler::to_underlying_arguments(args.problem_shape, args.hw_info, ClusterShape{}, args.split_kv), args.split_kv, args.ptr_split_kv};
+  }
+
+  static size_t get_workspace_size(Arguments const& args) {
+    ProblemShape problem_shape = args.problem_shape;
+    auto [H, K, D, B] = problem_shape;
+    auto [D_latent, D_rope] = D;
+    auto split_kv = args.split_kv;
+    return (sizeof(ElementAcc) * D_latent + sizeof(ElementLSE)) * H * split_kv * B;
+  }
+  static Status initialize_workspace(
+      Arguments const& /*args*/, void* /*ws*/, cudaStream_t /*stream*/) {
+    return Status::kSuccess;
+  }
+
+  static dim3 get_grid_shape(Params const& params) {
+    return TileScheduler::get_grid_shape(params.tile_scheduler);
+  }
+
+  static dim3 get_block_shape() {
+    dim3 block(MaxThreadsPerBlock, 1, 1);
+    return block;
+  }
+
+  static bool can_implement(Arguments const& args) {
+    if (kIsCpAsync) {
+      if ((args.mainloop.page_size & (args.mainloop.page_size - 1)) != 0) {
+        return false;
+      }
+      if (args.mainloop.page_size > TileShapeS{}) {
+        return false;
+      }
+    }
+    else {
+      if (args.mainloop.ptr_page_table != nullptr && args.mainloop.page_size != TileShapeS{}) {
+        return false;
+      }
+    }
+    if (get<0>(args.problem_shape) != 128) {
+      return false;
+    }
+    if (get<1>(args.problem_shape) <= 0) {
+      return false;
+    }
+    if (args.split_kv <= 0) {
+      return false;
+    }
+    return true;
+  }
+
+
+  CUTLASS_DEVICE void operator()(Params const& params, char* smem_raw) {
+
+    TileScheduler tile_scheduler(params.tile_scheduler);
+
+    int warp_idx = cutlass::canonical_warp_idx_sync();
+    auto role = warp_idx_to_role(warp_idx);
+    uint32_t lane_predicate = cute::elect_one_sync();
+
+    uint32_t cta_rank_in_cluster = cute::block_rank_in_cluster();
+    int cta_coord_v = cta_rank_in_cluster % size<0>(AtomThrShapeMNK{});
+    bool is_mma_leader_cta = cta_coord_v == 0;
+
+    if (role == WarpRole::kLoad && lane_predicate && ! kIsCpAsync) {
+      prefetch_tma_descriptor(params.mainloop_params.tma_load_q_latent.get_tma_descriptor());
+      prefetch_tma_descriptor(params.mainloop_params.tma_load_c_latent.get_tma_descriptor());
+      prefetch_tma_descriptor(params.mainloop_params.tma_load_q_rope.get_tma_descriptor());
+      prefetch_tma_descriptor(params.mainloop_params.tma_load_k_rope.get_tma_descriptor());
+      prefetch_tma_descriptor(params.mainloop_params.tma_load_c_latent_transpose.get_tma_descriptor());
+    }
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_raw);
+
+    typename PipelineLoadQK::Params pipeline_load_qk_params;
+    if (role == WarpRole::kLoad) {
+      pipeline_load_qk_params.role = PipelineLoadQK::ThreadCategory::Producer;
+    }
+    if (role == WarpRole::kMma) {
+      pipeline_load_qk_params.role = PipelineLoadQK::ThreadCategory::Consumer;
+    }
+    if constexpr (kIsCpAsync) {
+      // we can make our life easier by unconditionally loading blocks
+      // since we know it'll always be legal
+      pipeline_load_qk_params.producer_arv_count = kNumLoadWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{});
+    }
+    else {
+      pipeline_load_qk_params.is_leader = lane_predicate && (role == WarpRole::kLoad) && is_mma_leader_cta;
+      pipeline_load_qk_params.transaction_bytes = kTransactionsBytesLoadQK;
+    }
+    pipeline_load_qk_params.initializing_warp = 0;
+    PipelineLoadQK pipeline_load_qk(shared_storage.pipelines.load_qk, pipeline_load_qk_params,
+      ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{});
+
+    typename PipelineS::Params pipeline_mma_s_params;
+    if (role == WarpRole::kMma) {
+      pipeline_mma_s_params.role = PipelineS::ThreadCategory::Producer;
+    }
+    if (role == WarpRole::kCompute) {
+      pipeline_mma_s_params.role = PipelineS::ThreadCategory::Consumer;
+    }
+    pipeline_mma_s_params.consumer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{});
+    pipeline_mma_s_params.initializing_warp = 1;
+    PipelineS pipeline_mma_s(
+      shared_storage.pipelines.mma_s,
+      pipeline_mma_s_params,
+      ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{});
+
+    typename PipelineP::Params pipeline_p_mma_params;
+    if (role == WarpRole::kMma) {
+      pipeline_p_mma_params.role = PipelineP::ThreadCategory::Consumer;
+    }
+    if (role == WarpRole::kCompute) {
+      pipeline_p_mma_params.role = PipelineP::ThreadCategory::Producer;
+    }
+    pipeline_p_mma_params.producer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{});
+    pipeline_p_mma_params.consumer_arv_count = 1;
+    pipeline_p_mma_params.initializing_warp = 2;
+    PipelineP pipeline_p_mma(
+      shared_storage.pipelines.p_mma,
+      pipeline_p_mma_params,
+      ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{});
+
+    typename PipelineO::Params pipeline_mma_o_params;
+    if (role == WarpRole::kMma) {
+      pipeline_mma_o_params.role = PipelineO::ThreadCategory::Producer;
+    }
+    if (role == WarpRole::kCompute) {
+      pipeline_mma_o_params.role = PipelineO::ThreadCategory::Consumer;
+    }
+    pipeline_mma_o_params.consumer_arv_count = kNumComputeWarps * cutlass::NumThreadsPerWarp * size(AtomThrShapeMNK{});
+    pipeline_mma_o_params.initializing_warp = 3;
+    PipelineO pipeline_mma_o(
+      shared_storage.pipelines.mma_o,
+      pipeline_mma_o_params,
+      ClusterShape{}, /*barrier init*/ cute::true_type{}, /*mask calc*/cute::false_type{});
+
+    typename PipelinePT::Params pipeline_pt_params;
+    if (role == WarpRole::kLoad) {
+      pipeline_pt_params.role = PipelinePT::ThreadCategory::Consumer;
+    }
+    if (role == WarpRole::kLoadPageTable) {
+      pipeline_pt_params.role = PipelinePT::ThreadCategory::Producer;
+    }
+    pipeline_pt_params.consumer_arv_count = kNumLoadWarps * cutlass::NumThreadsPerWarp;
+    pipeline_pt_params.producer_arv_count = cutlass::NumThreadsPerWarp;
+    pipeline_pt_params.initializing_warp = 4;
+    PipelinePT pipeline_page_table(
+      shared_storage.pipelines.load_page_table,
+      pipeline_pt_params);
+
+    TmemAllocator tmem_allocator;
+
+    pipeline_init_arrive_relaxed(size(ClusterShape{}));
+
+    pipeline_load_qk.init_masks(ClusterShape{});  // do we need an update here for 2Sm?
+    pipeline_mma_s.init_masks(ClusterShape{});
+    pipeline_p_mma.init_masks(ClusterShape{});
+    pipeline_mma_o.init_masks(ClusterShape{});
+
+    typename PipelineLoadQK::PipelineState pipeline_load_qk_consumer_state;
+    typename PipelineLoadQK::PipelineState pipeline_load_qk_producer_state = cutlass::make_producer_start_state<PipelineLoadQK>();
+
+    typename PipelineS::PipelineState pipeline_mma_s_consumer_state;
+    typename PipelineS::PipelineState pipeline_mma_s_producer_state = cutlass::make_producer_start_state<PipelineS>();
+
+    typename PipelineP::PipelineState pipeline_p_mma_consumer_state;
+    typename PipelineP::PipelineState pipeline_p_mma_producer_state = cutlass::make_producer_start_state<PipelineP>();
+
+    typename PipelineO::PipelineState pipeline_mma_o_consumer_state;
+    typename PipelineO::PipelineState pipeline_mma_o_producer_state = cutlass::make_producer_start_state<PipelineO>();
+
+    typename PipelinePT::PipelineState pipeline_pt_consumer_state;
+    typename PipelinePT::PipelineState pipeline_pt_producer_state = cutlass::make_producer_start_state<PipelinePT>();
+
+    pipeline_init_wait(size(ClusterShape{}));
+
+    if (role == WarpRole::kLoadPageTable) {
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+        auto blk_coord = tile_scheduler.get_block_coord();
+        auto problem_shape = params.problem_shape;
+	auto local_split_kv = params.split_kv;
+        if (params.mainloop.ptr_seq != nullptr) {
+          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+	  if (params.ptr_split_kv != nullptr) {
+            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+          }
+        }
+	if (local_split_kv <= get<3>(blk_coord))
+	  continue;
+        load_page_table(
+          blk_coord,
+          problem_shape,
+          params.mainloop,
+          shared_storage.tensors,
+          pipeline_page_table, pipeline_pt_producer_state,
+	  local_split_kv
+        );
+      }
+    }
+    else if (role == WarpRole::kLoad) {
+      if constexpr (kIsCpAsync) {
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+          auto blk_coord = tile_scheduler.get_block_coord();
+	  auto problem_shape = params.problem_shape;
+	  auto local_split_kv = params.split_kv;
+          if (params.mainloop.ptr_seq != nullptr) {
+            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+	    if (params.ptr_split_kv != nullptr) {
+              local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+            }
+          }
+	  if (local_split_kv <= get<3>(blk_coord))
+            continue;
+          load_cpasync(
+            blk_coord,
+            problem_shape,
+            params.mainloop,
+            params.mainloop_params,
+            shared_storage.tensors,
+            pipeline_load_qk, pipeline_load_qk_producer_state,
+	    local_split_kv,
+            /* must be shared pipe */
+            pipeline_page_table, pipeline_pt_consumer_state
+          );
+          cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
+        }
+      }
+      else {
+        if (params.mainloop.ptr_page_table != nullptr) {
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+            auto blk_coord = tile_scheduler.get_block_coord();
+	    auto problem_shape = params.problem_shape;
+	    auto local_split_kv = params.split_kv;
+            if (params.mainloop.ptr_seq != nullptr) {
+              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+	      if (params.ptr_split_kv != nullptr) {
+	        local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+	      }
+            }
+	    if (local_split_kv <= get<3>(blk_coord))
+              continue;
+            load_tma</* paged= */ true>(
+              blk_coord,
+              problem_shape,
+              params.mainloop,
+              params.mainloop_params,
+              shared_storage.tensors,
+              pipeline_load_qk, pipeline_load_qk_producer_state,
+              pipeline_load_qk, pipeline_load_qk_producer_state,
+	      local_split_kv
+            );
+            cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
+          }
+        }
+        else {
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+            auto blk_coord = tile_scheduler.get_block_coord();
+	    auto problem_shape = params.problem_shape;
+	    auto local_split_kv = params.split_kv;
+            if (params.mainloop.ptr_seq != nullptr) {
+              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+	      if (params.ptr_split_kv != nullptr) {
+                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+	      }
+            }
+	    if (local_split_kv <= get<3>(blk_coord))
+              continue;
+            load_tma<false>(
+              blk_coord,
+              problem_shape,
+              params.mainloop,
+              params.mainloop_params,
+              shared_storage.tensors,
+              pipeline_load_qk, pipeline_load_qk_producer_state,
+              pipeline_load_qk, pipeline_load_qk_producer_state,
+	      local_split_kv
+            );
+            cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait();
+          }
+        }
+      }
+    }
+    else if (role == WarpRole::kMma) {
+      tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr);
+      __syncwarp();
+
+      if (is_mma_leader_cta) {
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+          auto blk_coord = tile_scheduler.get_block_coord();
+          auto problem_shape = params.problem_shape;
+	  auto local_split_kv = params.split_kv;
+          if (params.mainloop.ptr_seq != nullptr) {
+            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+            if (params.ptr_split_kv != nullptr) {
+                local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+            }
+          }
+	  if (local_split_kv <= get<3>(blk_coord))
+            continue;
+          mma(blk_coord,
+            problem_shape,
+            shared_storage.tensors,
+            pipeline_load_qk, pipeline_load_qk_consumer_state,
+            pipeline_load_qk, pipeline_load_qk_consumer_state,
+            pipeline_mma_s, pipeline_mma_s_producer_state,
+            pipeline_p_mma, pipeline_p_mma_consumer_state,
+            pipeline_mma_o, pipeline_mma_o_producer_state,
+	    local_split_kv
+          );
+        }
+      }
+
+      //cutlass::arch::NamedBarrier((kNumComputeWarps + 1) * NumThreadsPerWarp, kNamedBarrierTmemDealloc).arrive_and_wait();
+
+      //uint32_t free_stage_ptr = shared_storage.tmem_base_ptr;
+      //tmem_allocator.free(free_stage_ptr, TmemAllocator::Sm100TmemCapacityColumns);
+    }
+    else if (role == WarpRole::kCompute) {
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; tile_scheduler.is_valid(); ++tile_scheduler) {
+        auto blk_coord = tile_scheduler.get_block_coord();
+        auto problem_shape = params.problem_shape;
+	auto split_kv = params.split_kv;
+	auto local_split_kv = split_kv;
+        if (params.mainloop.ptr_seq != nullptr) {
+          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+	  if (params.ptr_split_kv != nullptr) {
+            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
+          }
+        }
+	if (local_split_kv <= get<3>(blk_coord))
+          continue;
+        compute(
+          blk_coord,
+          problem_shape,
+          params.mainloop,         // for softmax_scale
+          params.epilogue,
+          shared_storage.tensors,  // for smem_comm
+          pipeline_mma_s, pipeline_mma_s_consumer_state,
+          pipeline_p_mma, pipeline_p_mma_producer_state,
+          pipeline_mma_o, pipeline_mma_o_consumer_state,
+	  local_split_kv
+        );
+      }
+
+      //cutlass::arch::NamedBarrier((kNumComputeWarps + 1) * NumThreadsPerWarp, kNamedBarrierTmemDealloc).arrive();
+    }
+
+    cute::cluster_sync();
+    cutlass::arch::NamedBarrier((kNumComputeWarps + 1) * NumThreadsPerWarp, kNamedBarrierTmemDealloc).arrive();
+    if (role == WarpRole::kMma) {
+      uint32_t free_stage_ptr = shared_storage.tmem_base_ptr;
+      tmem_allocator.free(free_stage_ptr, TmemAllocator::Sm100TmemCapacityColumns);
+    }
+  }
+
+  template<class BlkCoord>
+  CUTLASS_DEVICE void load_page_table(
+      BlkCoord const& blk_coord,
+      ProblemShape const& problem_shape,
+      MainloopArguments const& mainloop_args,
+      TensorStorage& shared_tensors,
+      PipelinePT& pipeline_page_table,
+      typename PipelinePT::PipelineState& pipeline_pt_producer_state, int const& split_kv) {
+
+    auto [H, K, D, B] = problem_shape;
+    int batch_coord = get<2>(blk_coord);
+
+    auto mPT_l = make_tensor(make_gmem_ptr(mainloop_args.ptr_page_table),
+                            make_shape(mainloop_args.page_count, B),
+                            mainloop_args.stride_page_table);
+    auto mPT = mPT_l(_, batch_coord);
+
+    int k_tile_total = ceil_div(K, TileShapeS{});
+    int k_tile_per_cta = ceil_div(k_tile_total, split_kv);
+    int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit
+    int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index);
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    auto page_size = Pow2{mainloop_args.page_size};
+    auto pages_per_tile = Pow2{TileShapeS{} / page_size};
+    int thread_idx = threadIdx.x % cutlass::NumThreadsPerWarp;
+
+#if 1
+    for (; k_tile_count > 0; ++k_index, --k_tile_count) {
+      pipeline_page_table.producer_acquire(pipeline_pt_producer_state);
+
+      // assume a single warp
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < TileShapeS{}; i += cutlass::NumThreadsPerWarp) {
+        int idx = i + thread_idx;
+        bool guard = idx < pages_per_tile;
+        int smem_idx = pipeline_pt_producer_state.index() * TileShapeS::value + idx;
+        int pt_idx = pages_per_tile * k_index + idx;
+
+        cutlass::arch::cp_async_zfill<sizeof(int), cutlass::arch::CacheOperation::Always>(
+          &shared_tensors.smem_page_table[smem_idx], &mPT(pt_idx), guard
+        );
+      }
+
+      pipeline_page_table.producer_commit(pipeline_pt_producer_state, cutlass::arch::cpasync_barrier_arrive);
+      ++pipeline_pt_producer_state;
+    }
+#endif
+  }
+
+
+  struct Gather {
+    int& page_table_stage;
+    Pow2 pages_per_tile;
+    const int * __restrict__ smem_page_table;
+
+    CUTLASS_DEVICE int operator()(int idx) const {
+      return smem_page_table[page_table_stage * TileShapeS::value + idx % pages_per_tile];
+    }
+
+    CUTLASS_DEVICE friend void print(Gather const&) {
+      printf("<gather>");
+    }
+
+  };
+
+
+  template<class BlkCoord>
+  CUTLASS_DEVICE void load_cpasync(
+      BlkCoord const& blk_coord,
+      ProblemShape const& problem_shape,
+      MainloopArguments const& mainloop_args,
+      MainloopParams const& mainloop_params,
+      TensorStorage& shared_tensors,
+      PipelineLoadQK& pipeline_load,
+      typename PipelineLoadQK::PipelineState& pipeline_load_producer_state,
+      int const& split_kv,
+      PipelinePT& pipeline_page_table,
+      typename PipelinePT::PipelineState& pipeline_pt_consumer_state) {
+
+    auto [H, K, D, B] = problem_shape;
+    auto [D_latent, D_rope] = D;
+
+    using X = Underscore;
+
+    int k_tile_total = ceil_div(K, TileShapeS{});
+    int k_tile_per_cta = ceil_div(k_tile_total, split_kv);
+    int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit
+    int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index);
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    // partition all tensors
+    auto mQL = make_tensor(make_gmem_ptr(mainloop_args.ptr_q_latent), make_shape(H, D_latent, B), mainloop_args.stride_q_latent);
+    auto mQR = make_tensor(make_gmem_ptr(mainloop_args.ptr_q_rope), make_shape(H, D_rope, B), mainloop_args.stride_q_rope);
+
+    int  paged_B = mainloop_args.page_count;
+    auto paged_K = Pow2{mainloop_args.page_size};
+    auto mPT_l = make_tensor(make_gmem_ptr(mainloop_args.ptr_page_table), make_shape(paged_B, B), mainloop_args.stride_page_table);
+
+    int batch_coord = get<2>(blk_coord);
+    auto mPT = mPT_l(_, batch_coord);
+
+    auto gQL = local_tile(mQL, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{});
+    auto gQR = local_tile(mQR, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{});
+
+    ThrMMA cta_mma_qk = TiledMmaQK{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{}));
+    ThrMMA cta_mma_pv = TiledMmaPV{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{}));
+
+    auto tSgQL = cta_mma_qk.partition_A(gQL);
+    auto tSgQR = cta_mma_qk.partition_A(gQR);
+
+    Tensor sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{});
+    Tensor sKC = make_tensor(make_smem_ptr(shared_tensors.smem_kc.begin()), SmemLayoutKC{});
+    Tensor sVC = make_tensor(make_smem_ptr(shared_tensors.smem_vc.begin()), SmemLayoutVC{});
+
+    auto make_copy_for = [](auto sT) {
+      auto rT_a = sT.layout()(_, _, _, _0{});
+      auto rT = make_ordered_layout(shape(rT_a), stride(rT_a));
+      auto threads = Int<kNumLoadWarps * cutlass::NumThreadsPerWarp>{};
+      auto values = Int<sizeof(uint128_t) / sizeof(Element)>{};
+      return make_cotiled_copy(
+          Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, Element>{},
+          make_ordered_layout(
+              make_shape(threads, values),
+              make_stride(_1{}, _0{})),
+          rT);
+    };
+
+    // like cute::copy, but makes sure we do all page table lookups first
+    auto copy_split = [](auto atom, auto src, auto dst) {
+      auto src_v = group_modes<1, rank_v<decltype(src)>>(src);
+      auto dst_v = group_modes<1, rank_v<decltype(dst)>>(dst);
+
+      auto src_v_ptrs = make_tensor<Element*>(size<1>(src_v));
+      for (int i = 0; i < size<1>(src_v); i++) {
+        src_v_ptrs(i) = &src_v(_0{}, i);
+      }
+
+
+      for (int i = 0; i < size<1>(src_v); i++) {
+        auto src_v_i = make_tensor(
+            make_gmem_ptr(src_v_ptrs(i)),
+            make_shape(shape<0>(src_v)),
+            make_stride(make_stride(_1{}, _0{}))
+        );
+        atom.call(src_v_i, dst_v(_, i));
+      }
+    };
+
+    auto tiled_copy_q = make_copy_for(sQ);
+    auto tiled_copy_kc = make_copy_for(sKC);
+    auto tiled_copy_vc = make_copy_for(sVC);
+
+    auto thr_copy_q = tiled_copy_q.get_thread_slice(threadIdx.x % (kNumLoadWarps * cutlass::NumThreadsPerWarp));
+    auto thr_copy_kc = tiled_copy_kc.get_thread_slice(threadIdx.x % (kNumLoadWarps * cutlass::NumThreadsPerWarp));
+    auto thr_copy_vc = tiled_copy_vc.get_thread_slice(threadIdx.x % (kNumLoadWarps * cutlass::NumThreadsPerWarp));
+
+    auto tQsQ = thr_copy_q.partition_D(sQ);
+    auto tQgQL = thr_copy_q.partition_S(tSgQL);
+    auto tQgQR = thr_copy_q.partition_S(tSgQR);
+
+    auto tKCsKC = thr_copy_kc.partition_D(sKC);
+    auto tVCsVC = thr_copy_vc.partition_D(sVC);
+
+    auto pipeline_pt_release_state = pipeline_pt_consumer_state;
+
+    int page_table_stage = -1;
+    Pow2 pages_per_tile{TileShapeS{} / paged_K};
+    const int * __restrict__ smem_page_table = shared_tensors.smem_page_table.begin();
+    Gather gather{page_table_stage, pages_per_tile, smem_page_table};
+
+    auto mCL = make_tensor(
+        make_gmem_ptr(mainloop_args.ptr_c_latent),
+        ComposedLayout{
+            make_layout(
+                make_shape(make_shape(paged_K, paged_B), _1{}),
+                make_stride(make_stride(get<0>(mainloop_args.stride_c_latent), example::CustomStride(gather, get<2>(mainloop_args.stride_c_latent))), get<1>(mainloop_args.stride_c_latent))),
+            make_coord(_0{}, _0{}),
+            make_identity_layout(make_shape(paged_K * paged_B, D_latent))});
+
+    auto mKR = make_tensor(
+        make_gmem_ptr(mainloop_args.ptr_k_rope),
+        ComposedLayout{
+            make_layout(
+                make_shape(make_shape(paged_K, paged_B), _1{}),
+                make_stride(make_stride(get<0>(mainloop_args.stride_k_rope), example::CustomStride(gather, get<2>(mainloop_args.stride_k_rope))), get<1>(mainloop_args.stride_k_rope))),
+            make_coord(_0{}, _0{}),
+            make_identity_layout(make_shape(paged_K * paged_B, D_latent))});
+
+    auto mCLT = make_tensor(
+        make_gmem_ptr(mainloop_args.ptr_c_latent),
+        ComposedLayout{
+            make_layout(
+                make_shape(_1{}, make_shape(paged_K, paged_B)),
+                make_stride(get<1>(mainloop_args.stride_c_latent), make_stride(get<0>(mainloop_args.stride_c_latent), example::CustomStride(gather, get<2>(mainloop_args.stride_c_latent))))),
+            make_coord(_0{}, _0{}),
+            make_identity_layout(make_shape(D_latent, paged_K * paged_B))});
+
+    auto gCL = local_tile(mCL, TileShapeQK{}, make_coord(_,_,_), Step<X, _1, _1>{});
+    auto gKR = local_tile(mKR, TileShapeQK{}, make_coord(_,_,_), Step<X, _1, _1>{});
+    auto gCLT = local_tile(mCLT, TileShapePV{}, make_coord(_,_,_), Step<X, _1, _1>{});
+
+    auto tSgCL = cta_mma_qk.partition_B(gCL);
+    auto tSgKR = cta_mma_qk.partition_B(gKR);
+    auto tOgCLT = cta_mma_pv.partition_B(gCLT);
+
+    auto tKCgCL = thr_copy_kc.partition_S(tSgCL);
+    auto tKCgKR = thr_copy_kc.partition_S(tSgKR);
+    auto tVCgCLT = thr_copy_vc.partition_S(tOgCLT);
+
+    // latent is first in memory, so let's load it first always
+    // startup: alternate Q and K, set tx count appropriately, for k_idx = 0
+    auto& pipeline_acquire_state = pipeline_load_producer_state;
+    auto pipeline_commit_state = pipeline_acquire_state;
+    int pipeline_offset = 0;
+
+    for (int i = 0; i < StagesPV; i++) {
+      cutlass::arch::cp_async_fence();
+    }
+
+    auto load_stage = [&](auto fn) {
+      pipeline_load.producer_acquire(pipeline_acquire_state);
+      fn(pipeline_acquire_state.index());
+      cutlass::arch::cp_async_fence();
+
+      ++pipeline_acquire_state;
+      ++pipeline_offset;
+
+      if (pipeline_offset == StagesPV - 1) {
+        cutlass::arch::cp_async_wait<StagesPV - 1>();
+        pipeline_load.producer_commit(pipeline_commit_state);
+        ++pipeline_commit_state;
+        --pipeline_offset;
+      }
+    };
+
+    pipeline_page_table.consumer_wait(pipeline_pt_consumer_state);
+    page_table_stage = pipeline_pt_consumer_state.index();
+    ++pipeline_pt_consumer_state;
+
+    // each Q/K tile consists of rope and latent
+    for (int i = 0; i < IterationsQKLatent; i++) {
+      load_stage([&](int index) {
+        cute::copy(tiled_copy_q, tQgQL(_, _, _, _, _0{}, i, batch_coord),  tQsQ(_, _, _, _, i));
+        copy_split(tiled_copy_kc, tKCgCL(_, _, _, _, k_index, i),  tKCsKC(_, _, _, _, index));
+      });
+    }
+
+    for (int i = 0; i < IterationsQKRope; i++) {
+      load_stage([&](int index) {
+        cute::copy(tiled_copy_q, tQgQR(_, _, _, _, _0{}, i, batch_coord),  tQsQ(_, _, _, _, IterationsQKLatent + i));
+        copy_split(tiled_copy_kc, tKCgKR(_, _, _, _, k_index, i),  tKCsKC(_, _, _, _, index));
+      });
+    }
+
+    k_index += 1;
+    k_tile_count -= 1;
+
+    // assume k_tile_count >= 1
+    // perform K+Q load here
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > 0) {
+
+      pipeline_page_table.consumer_wait(pipeline_pt_consumer_state);
+      page_table_stage = pipeline_pt_consumer_state.index();
+      ++pipeline_pt_consumer_state;
+
+      for (int i = 0; i < IterationsQKLatent; i++) {
+        load_stage([&](int index) {
+          copy_split(tiled_copy_kc, tKCgCL(_, _, _, _, k_index, i),  tKCsKC(_, _, _, _, index));
+        });
+      }
+
+      for (int i = 0; i < IterationsQKRope; i++) {
+        load_stage([&](int index) {
+          copy_split(tiled_copy_kc, tKCgKR(_, _, _, _, k_index, i),  tKCsKC(_, _, _, _, index));
+        });
+      }
+
+      page_table_stage = pipeline_pt_release_state.index();
+
+      for (int i = 0; i < IterationsPV_K; i++) {
+        for (int j = 0; j < IterationsPV_N; j++) {
+          load_stage([&](int index) {
+            copy_split(tiled_copy_vc, tVCgCLT(_, _, _, _, j, IterationsPV_K * (k_index - 1) + i),  tVCsVC(_, _, _, _, index));
+          });
+        }
+      }
+
+      pipeline_page_table.consumer_release(pipeline_pt_release_state);
+      ++pipeline_pt_release_state;
+
+      k_index += 1;
+      k_tile_count -= 1;
+    }
+
+    page_table_stage = pipeline_pt_release_state.index();
+
+    for (int i = 0; i < IterationsPV_K; i++) {
+      for (int j = 0; j < IterationsPV_N; j++) {
+        load_stage([&](int index) {
+          copy_split(tiled_copy_vc, tVCgCLT(_, _, _, _, j, IterationsPV_K * (k_index - 1) + i),  tVCsVC(_, _, _, _, index));
+        });
+      }
+    }
+
+    pipeline_page_table.consumer_release(pipeline_pt_release_state);
+    ++pipeline_pt_release_state;
+
+    while (pipeline_offset > 0) {
+      cutlass::arch::cp_async_fence();
+
+      cutlass::arch::cp_async_wait<StagesPV - 1>();
+      pipeline_load.producer_commit(pipeline_commit_state);
+      ++pipeline_commit_state;
+      --pipeline_offset;
+    }
+
+    cutlass::arch::cp_async_wait<0>();
+
+  }
+
+
+  template<bool kIsPaged = false, class BlkCoord>
+  CUTLASS_DEVICE void load_tma(
+      BlkCoord const& blk_coord,
+      ProblemShape const& problem_shape,
+      MainloopArguments const& mainloop_args,
+      MainloopParams const& mainloop_params,
+      TensorStorage& shared_tensors,
+      PipelineLoadQK& pipeline_load_qk,
+      typename PipelineLoadQK::PipelineState& pipeline_load_qk_producer_state,
+      PipelineLoadPV& pipeline_load_pv,
+      typename PipelineLoadPV::PipelineState& pipeline_load_pv_producer_state,
+      int const& split_kv) {
+
+    auto [H, K, D, B] = problem_shape;
+    auto [D_latent, D_rope] = D;
+
+    int k_tile_total = ceil_div(K, TileShapeS{});
+    int k_tile_per_cta = ceil_div(k_tile_total, split_kv);
+    int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit
+    int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index);
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    using X = Underscore;
+
+    // partition all tensors
+    auto mQL = mainloop_params.tma_load_q_latent.get_tma_tensor(make_shape(H, D_latent, B));
+    auto mQR = mainloop_params.tma_load_q_rope.get_tma_tensor(make_shape(H, D_rope, B));
+
+    int paged_B = B;
+    int paged_K = K;
+    if constexpr (kIsPaged) {
+      paged_B = mainloop_args.page_count;
+      paged_K = mainloop_args.page_size;
+    }
+    auto mPT_l = make_tensor(make_gmem_ptr(mainloop_args.ptr_page_table), make_shape(paged_B, B), mainloop_args.stride_page_table);
+
+    auto mCL = mainloop_params.tma_load_c_latent.get_tma_tensor(make_shape(paged_K, D_latent, paged_B));
+    auto mKR = mainloop_params.tma_load_k_rope.get_tma_tensor(make_shape(paged_K, D_rope, paged_B));
+
+    auto mCLT = mainloop_params.tma_load_c_latent_transpose.get_tma_tensor(make_shape(D_latent, paged_K, paged_B));
+
+    auto gQL = local_tile(mQL, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{});
+    auto gQR = local_tile(mQR, TileShapeQK{}, make_coord(_,_,_), Step<_1, X, _1>{});
+
+    auto gCL = local_tile(mCL, TileShapeQK{}, make_coord(_,_,_), Step<X, _1, _1>{});
+    auto gKR = local_tile(mKR, TileShapeQK{}, make_coord(_,_,_), Step<X, _1, _1>{});
+    auto gCLT = local_tile(mCLT, TileShapePV{}, make_coord(_,_,_), Step<X, _1, _1>{});
+
+    ThrMMA cta_mma_qk = TiledMmaQK{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{}));
+    ThrMMA cta_mma_pv = TiledMmaPV{}.get_slice(get<0>(blk_coord) % size(AtomThrShapeMNK{}));
+
+    auto tSgQL = cta_mma_qk.partition_A(gQL);
+    auto tSgQR = cta_mma_qk.partition_A(gQR);
+
+    auto tSgCL = cta_mma_qk.partition_B(gCL);
+    auto tSgKR = cta_mma_qk.partition_B(gKR);
+
+    auto tOgCLT = cta_mma_pv.partition_B(gCLT);
+
+    Tensor sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{});
+    Tensor sKC = make_tensor(make_smem_ptr(shared_tensors.smem_kc.begin()), SmemLayoutKC{});
+    Tensor sVC = make_tensor(make_smem_ptr(shared_tensors.smem_vc.begin()), SmemLayoutVC{});
+
+    auto [tQLgQL_mkl, tQsQ] = tma_partition(
+        mainloop_params.tma_load_q_latent, _0{}, make_layout(_1{}),
+        group_modes<0,3>(sQ), group_modes<0,3>(tSgQL));
+
+    auto [tQRgQR_mkl, tQsQ_ignore] = tma_partition(
+        mainloop_params.tma_load_q_rope, _0{}, make_layout(_1{}),
+        group_modes<0,3>(sQ), group_modes<0,3>(tSgQR));
+
+    auto [tCLgCL_nkl, tKCsKC] = tma_partition(
+        mainloop_params.tma_load_c_latent, _0{}, make_layout(_1{}),
+        group_modes<0,3>(sKC), group_modes<0,3>(tSgCL));
+
+    auto [tKRgKR_nkl, tKCsKC_ignore] = tma_partition(
+        mainloop_params.tma_load_k_rope, _0{}, make_layout(_1{}),
+        group_modes<0,3>(sKC), group_modes<0,3>(tSgKR));
+
+    auto [tCLTgCLT_nkl, tVCsVC] = tma_partition(
+        mainloop_params.tma_load_c_latent_transpose, _0{}, make_layout(_1{}),
+        group_modes<0,3>(sVC), group_modes<0,3>(tOgCLT));
+
+    uint16_t mcast_mask = 0;
+
+    int batch_coord = get<2>(blk_coord);
+    Tensor tQLgQL = tQLgQL_mkl(_, _, _, batch_coord);
+    Tensor tQRgQR = tQRgQR_mkl(_, _, _, batch_coord);
+
+    auto mPT = mPT_l(_, batch_coord);
+
+    Tensor tCLgCL = tCLgCL_nkl(_, _, _, _);
+    Tensor tKRgKR = tKRgKR_nkl(_, _, _, _);
+
+    // careful: stage and k are swapped here!
+    Tensor tCLTgCLT = tCLTgCLT_nkl(_, _, _, _);
+
+    // latent is first in memory, so let's load it first always
+    // startup: alternate Q and K, set tx count appropriately, for k_idx = 0
+
+    // each Q/K tile consists of rope and latent
+    for (int i = 0; i < IterationsQKLatent; i++) {
+      pipeline_load_qk.producer_expect_transaction(pipeline_load_qk_producer_state, kTransactionsBytesLoadExtraQ);
+      pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state);
+      auto tma_barrier = pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state);
+
+      if (cute::elect_one_sync()) {
+        // expect the extra bytes
+        // load_qk ql
+        cute::copy(mainloop_params.tma_load_q_latent.with(*tma_barrier, mcast_mask), tQLgQL(_, _0{}, i), tQsQ(_, i));
+        // load_qk cl
+        if constexpr (kIsPaged) {
+          cute::copy(
+              mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask),
+              tCLgCL(_, _0{}, i, mPT(k_index)),
+              tKCsKC(_, pipeline_load_qk_producer_state.index())
+          );
+        }
+        else {
+          cute::copy(
+              mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask),
+              tCLgCL(_, k_index, i, batch_coord),
+              tKCsKC(_, pipeline_load_qk_producer_state.index()));
+        }
+      }
+      ++pipeline_load_qk_producer_state;
+    }
+
+    for (int i = 0; i < IterationsQKRope; i++) {
+      pipeline_load_qk.producer_expect_transaction(pipeline_load_qk_producer_state, kTransactionsBytesLoadExtraQ);
+      pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state);
+      auto tma_barrier = pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state);
+
+      if (cute::elect_one_sync()) {
+        // expect the extra bytes
+        // load_qk ql
+        cute::copy(mainloop_params.tma_load_q_rope.with(*tma_barrier, mcast_mask), tQRgQR(_, _0{}, i), tQsQ(_, i + IterationsQKLatent));
+        // load_qk cl
+        if constexpr (kIsPaged) {
+          cute::copy(
+              mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask),
+              tKRgKR(_, _0{}, i, mPT(k_index)),
+              tKCsKC(_, pipeline_load_qk_producer_state.index())
+          );
+        }
+        else {
+          cute::copy(
+              mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask),
+              tKRgKR(_, k_index, i, batch_coord),
+              tKCsKC(_, pipeline_load_qk_producer_state.index()));
+        }
+      }
+      ++pipeline_load_qk_producer_state;
+    }
+
+    k_index += 1;
+    k_tile_count -= 1;
+
+    // assume k_tile_count >= 1
+    // perform K+Q load here
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > 0) {
+
+      // perform K load
+      for (int i = 0; i < IterationsQKLatent; i++) {
+        pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state);
+        auto tma_barrier = pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state);
+
+        if (cute::elect_one_sync()) {
+          // load_qk cl
+          if constexpr (kIsPaged) {
+            cute::copy(
+                mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask),
+                tCLgCL(_, _0{}, i, mPT(k_index)),
+                tKCsKC(_, pipeline_load_qk_producer_state.index())
+            );
+          }
+          else {
+            cute::copy(
+                mainloop_params.tma_load_c_latent.with(*tma_barrier, mcast_mask),
+                tCLgCL(_, k_index, i, batch_coord),
+                tKCsKC(_, pipeline_load_qk_producer_state.index()));
+          }
+        }
+        ++pipeline_load_qk_producer_state;
+      }
+
+      for (int i = 0; i < IterationsQKRope; i++) {
+        pipeline_load_qk.producer_acquire(pipeline_load_qk_producer_state);
+        auto tma_barrier = pipeline_load_qk.producer_get_barrier(pipeline_load_qk_producer_state);
+
+        if (cute::elect_one_sync()) {
+          // load_qk cl
+          if constexpr (kIsPaged) {
+            cute::copy(
+                mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask),
+                tKRgKR(_, _0{}, i, mPT(k_index)),
+                tKCsKC(_, pipeline_load_qk_producer_state.index())
+            );
+          }
+          else {
+            cute::copy(
+                mainloop_params.tma_load_k_rope.with(*tma_barrier, mcast_mask),
+                tKRgKR(_, k_index, i, batch_coord),
+                tKCsKC(_, pipeline_load_qk_producer_state.index()));
+          }
+        }
+        ++pipeline_load_qk_producer_state;
+      }
+
+      // prefetch next K load to keep busy while we transpose-load from cache
+      const int kPrefetchDistance = 1;
+      for (int i = 0; i < IterationsQKLatent; i++) {
+        if (cute::elect_one_sync()) {
+          if constexpr (kIsPaged) {
+            if (k_tile_count > kPrefetchDistance) {
+              cute::prefetch(
+                  mainloop_params.tma_load_c_latent,
+                  tCLgCL(_, _0{}, i, mPT(k_index + kPrefetchDistance))
+              );
+            }
+          }
+          else {
+            cute::prefetch(
+                mainloop_params.tma_load_c_latent,
+                tCLgCL(_, k_index + kPrefetchDistance, i, batch_coord)
+            );
+          }
+        }
+      }
+
+      for (int i = 0; i < IterationsQKRope; i++) {
+        if (cute::elect_one_sync()) {
+          if constexpr (kIsPaged) {
+            if (k_tile_count > kPrefetchDistance) {
+              cute::prefetch(
+                  mainloop_params.tma_load_k_rope,
+                  tKRgKR(_, _0{}, i, mPT(k_index + kPrefetchDistance))
+              );
+            }
+          }
+          else {
+            cute::prefetch(
+                mainloop_params.tma_load_k_rope,
+                tKRgKR(_, k_index + kPrefetchDistance, i, batch_coord)
+            );
+          }
+        }
+      }
+
+      // perform V load (k_idx - 1)
+
+      for (int i = 0; i < IterationsPV_K; i++) {
+        for (int j = 0; j < IterationsPV_N; j++) {
+          pipeline_load_pv.producer_acquire(pipeline_load_pv_producer_state);
+          auto tma_barrier = pipeline_load_pv.producer_get_barrier(pipeline_load_pv_producer_state);
+
+          if (cute::elect_one_sync()) {
+            // load_pv cl
+            // note the transpose in indices!
+            // note we are off-by-one on k_index
+            if constexpr (kIsPaged) {
+              cute::copy(
+                  mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST),
+                  tCLTgCLT(_, j, i, mPT(k_index - 1)),
+                  tVCsVC(_, pipeline_load_pv_producer_state.index())
+              );
+            }
+            else {
+              cute::copy(
+                  mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST),
+                  tCLTgCLT(_, j, IterationsPV_K * (k_index - 1) + i, batch_coord),
+                  tVCsVC(_, pipeline_load_pv_producer_state.index())
+              );
+            }
+          }
+          ++pipeline_load_pv_producer_state;
+        }
+      }
+
+      k_index += 1;
+      k_tile_count -= 1;
+    }
+
+    for (int i = 0; i < IterationsPV_K; i++) {
+      for (int j = 0; j < IterationsPV_N; j++) {
+        pipeline_load_pv.producer_acquire(pipeline_load_pv_producer_state);
+        auto tma_barrier = pipeline_load_pv.producer_get_barrier(pipeline_load_pv_producer_state);
+
+        if (cute::elect_one_sync()) {
+          // load_pv cl
+          // note the transpose in indices
+          // note we are off-by-one on k_index
+
+          if constexpr (kIsPaged) {
+            cute::copy(
+                mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST),
+                tCLTgCLT(_, j, i, mPT(k_index - 1)),
+                tVCsVC(_, pipeline_load_pv_producer_state.index())
+            );
+          }
+          else {
+            cute::copy(
+                mainloop_params.tma_load_c_latent_transpose.with(*tma_barrier, mcast_mask, cute::TMA::CacheHintSm100::EVICT_FIRST),
+                tCLTgCLT(_, j, IterationsPV_K * (k_index - 1) + i, batch_coord),
+                tVCsVC(_, pipeline_load_pv_producer_state.index())
+            );
+          }
+        }
+        ++pipeline_load_pv_producer_state;
+      }
+    }
+  }
+
+  template<class BlkCoord>
+  CUTLASS_DEVICE void mma(
+      BlkCoord const& blk_coord,
+      ProblemShape const& problem_shape,
+      TensorStorage& shared_tensors,
+      PipelineLoadQK& pipeline_load_qk,
+      typename PipelineLoadQK::PipelineState& pipeline_load_qk_consumer_state,
+      PipelineLoadPV& pipeline_load_pv,
+      typename PipelineLoadPV::PipelineState& pipeline_load_pv_consumer_state,
+      PipelineS& pipeline_mma_s,
+      typename PipelineS::PipelineState& pipeline_mma_s_producer_state,
+      PipelineP& pipeline_p_mma,
+      typename PipelineP::PipelineState& pipeline_p_mma_consumer_state,
+      PipelineO& pipeline_mma_o,
+      typename PipelineO::PipelineState& pipeline_mma_o_producer_state,
+      int const& split_kv) {
+
+    auto [H, K, D, B] = problem_shape;
+
+    int k_tile_total = ceil_div(K, TileShapeS{});
+    int k_tile_per_cta = ceil_div(k_tile_total, split_kv);
+    int k_index = get<3>(blk_coord) * k_tile_per_cta; // lower limit
+    int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index);
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    // mma init
+    Tensor sQ = make_tensor(make_smem_ptr(shared_tensors.smem_q.begin()), SmemLayoutQ{});
+    Tensor sKC = make_tensor(make_smem_ptr(shared_tensors.smem_kc.begin()), SmemLayoutKC{});
+    Tensor sVC = make_tensor(make_smem_ptr(shared_tensors.smem_vc.begin()), SmemLayoutVC{});
+    Tensor sP = make_tensor(make_smem_ptr((Element*) shared_tensors.smem_p.begin()), SmemLayoutP{});
+
+    Tensor tSrQ = TiledMmaQK::make_fragment_A(sQ);
+    Tensor tSrKC = TiledMmaQK::make_fragment_B(sKC);
+    Tensor tOrP = TiledMmaPV::make_fragment_A(sP);
+    Tensor tOrVC = TiledMmaPV::make_fragment_B(sVC);
+
+    TiledMmaQK tiled_mma_qk;
+    TiledMmaPV tiled_mma_pv;
+
+    Tensor tStS =  partition_fragment_C(tiled_mma_qk, select<0,1>(TileShapeQK{}));
+    Tensor tOtO =  partition_fragment_C(tiled_mma_pv, select<0,1>(TileShapePV{}));
+
+    tiled_mma_pv.accumulate_ = UMMA::ScaleOut::Zero;
+
+    pipeline_mma_s.producer_acquire(pipeline_mma_s_producer_state);
+
+    // Mma           S0 S1 O0 S2 O1 ... Sn On-1 On
+    // S0 ownership  --    -----        --      --
+    // S1 ownership     --       -----     ----
+    // O ownership         --    --        ---- --
+
+    tiled_mma_qk.accumulate_ = UMMA::ScaleOut::Zero;
+    for (int i = 0; i < IterationsQK; i++) {
+      pipeline_load_qk.consumer_wait(pipeline_load_qk_consumer_state);
+      int read_stage = pipeline_load_qk_consumer_state.index();
+
+      tStS.data() = uint32_t(pipeline_mma_s_producer_state.index() == 0 ? TmemAllocation::kS0 : TmemAllocation::kS1);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tSrQ); ++k_block) {
+        cute::gemm(tiled_mma_qk,
+                   tSrQ(_,_,k_block,i),
+                   tSrKC(_,_,k_block,read_stage),
+                   tStS);
+        tiled_mma_qk.accumulate_ = UMMA::ScaleOut::One;
+      }
+
+      pipeline_load_qk.consumer_release(pipeline_load_qk_consumer_state);
+      ++pipeline_load_qk_consumer_state;
+    }
+
+    pipeline_mma_s.producer_commit(pipeline_mma_s_producer_state);
+    ++pipeline_mma_s_producer_state;
+
+    k_tile_count -= 1;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > 0) {
+
+      pipeline_mma_s.producer_acquire(pipeline_mma_s_producer_state);
+      tiled_mma_qk.accumulate_ = UMMA::ScaleOut::Zero;
+      for (int i = 0; i < IterationsQK; i++) {
+        pipeline_load_qk.consumer_wait(pipeline_load_qk_consumer_state);
+        int read_stage = pipeline_load_qk_consumer_state.index();
+
+        tStS.data() = uint32_t(pipeline_mma_s_producer_state.index() == 0 ? TmemAllocation::kS0 : TmemAllocation::kS1);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int k_block = 0; k_block < size<2>(tSrQ); ++k_block) {
+          cute::gemm(tiled_mma_qk,
+                     tSrQ(_,_,k_block,i),
+                     tSrKC(_,_,k_block,read_stage),
+                     tStS);
+          tiled_mma_qk.accumulate_ = UMMA::ScaleOut::One;
+        }
+
+        pipeline_load_qk.consumer_release(pipeline_load_qk_consumer_state);
+        ++pipeline_load_qk_consumer_state;
+      }
+
+      pipeline_mma_s.producer_commit(pipeline_mma_s_producer_state);
+      ++pipeline_mma_s_producer_state;
+
+      pipeline_mma_o.producer_acquire(pipeline_mma_o_producer_state);
+      pipeline_p_mma.consumer_wait(pipeline_p_mma_consumer_state);
+
+      for (int i = 0; i < IterationsPV_K; i++) {
+        auto acc_flag = tiled_mma_pv.accumulate_;
+        for (int j = 0; j < IterationsPV_N; j++) {
+          pipeline_load_pv.consumer_wait(pipeline_load_pv_consumer_state);
+
+          int read_stage = pipeline_load_pv_consumer_state.index();
+
+          tOtO.data() = uint32_t(TmemAllocation::kO0) + j * uint32_t(TmemAllocation::kSizeAccO);
+          tiled_mma_pv.accumulate_ = acc_flag;
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int k_block = 0; k_block < size<2>(tOrP); ++k_block) {
+            cute::gemm(tiled_mma_pv,
+                       tOrP(_,_,k_block, make_coord(i, pipeline_p_mma_consumer_state.index())),
+                       tOrVC(_,_,k_block,read_stage),
+                       tOtO);
+            tiled_mma_pv.accumulate_ = UMMA::ScaleOut::One;
+          }
+
+          pipeline_load_pv.consumer_release(pipeline_load_pv_consumer_state);
+          ++pipeline_load_pv_consumer_state;
+        }
+      }
+
+      pipeline_p_mma.consumer_release(pipeline_p_mma_consumer_state);
+      ++pipeline_p_mma_consumer_state;
+      pipeline_mma_o.producer_commit(pipeline_mma_o_producer_state);
+      ++pipeline_mma_o_producer_state;
+
+      --k_tile_count;
+    }
+
+    pipeline_mma_o.producer_acquire(pipeline_mma_o_producer_state);
+    pipeline_p_mma.consumer_wait(pipeline_p_mma_consumer_state);
+
+    for (int i = 0; i < IterationsPV_K; i++) {
+      auto acc_flag = tiled_mma_pv.accumulate_;
+      for (int j = 0; j < IterationsPV_N; j++) {
+        pipeline_load_pv.consumer_wait(pipeline_load_pv_consumer_state);
+
+        int read_stage = pipeline_load_pv_consumer_state.index();
+
+        tOtO.data() = uint32_t(TmemAllocation::kO0) + j * uint32_t(TmemAllocation::kSizeAccO);
+        tiled_mma_pv.accumulate_ = acc_flag;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int k_block = 0; k_block < size<2>(tOrP); ++k_block) {
+          cute::gemm(tiled_mma_pv,
+                     tOrP(_,_,k_block, make_coord(i, pipeline_p_mma_consumer_state.index())),
+                     tOrVC(_,_,k_block,read_stage),
+                     tOtO);
+          tiled_mma_pv.accumulate_ = UMMA::ScaleOut::One;
+        }
+
+        pipeline_load_pv.consumer_release(pipeline_load_pv_consumer_state);
+        ++pipeline_load_pv_consumer_state;
+      }
+    }
+
+    pipeline_p_mma.consumer_release(pipeline_p_mma_consumer_state);
+    ++pipeline_p_mma_consumer_state;
+    pipeline_mma_o.producer_commit(pipeline_mma_o_producer_state);
+    ++pipeline_mma_o_producer_state;
+  }
+
+
+  template<class IsLastTile>
+  CUTLASS_DEVICE void softmax(
+      IsLastTile const& is_last_tile,
+      ElementAcc& row_max,
+      ElementAcc& row_sum,
+      ElementAcc& correction_factor,
+      ProblemShape const& problem_shape,
+      MainloopArguments const& mainloop_args,
+      TensorStorage& shared_tensors,
+      int k_index,
+      uint32_t tmem_s,
+      int smem_p_index) {
+
+    auto load_op = cute::SM100_TMEM_LOAD_32dp32b32x{};
+
+    TiledMmaQK tiled_mma_qk;
+
+    Tensor tStS = partition_fragment_C(tiled_mma_qk, select<0,1>(TileShapeQK{}));
+    tStS.data() = tmem_s;
+
+    CUTE_STATIC_ASSERT_V(shape<1>(tStS) == _1{});
+    CUTE_STATIC_ASSERT_V(shape<2>(tStS) == _1{});
+    Tensor tAcc = tStS(make_coord(_,_),_0{},_0{});
+
+    Tensor cS = make_identity_tensor(take<0,2>(CtaShapeQK{}));
+
+    auto tiled_t2r = make_tmem_copy(load_op, tAcc);
+    auto thread_idx = threadIdx.x % size(tiled_t2r);
+
+    auto thread_t2r = tiled_t2r.get_slice(thread_idx);
+    Tensor tTR_cS   = thread_t2r.partition_D(cS);
+    Tensor tTR_rAcc = make_tensor<ElementAcc>(shape(tTR_cS));
+
+    Tensor tTR_rS_frag = make_tensor<Element>(shape(tTR_rAcc));
+    const int AlignmentS = 4;
+    Tensor tTR_tAcc = thread_t2r.partition_S(tAcc);
+    Tensor tTR_rAcc_vec = recast<Array<ElementAcc, AlignmentS>>(tTR_rAcc);
+    Tensor tTR_rS_vec = recast<Array<Element, AlignmentS>>(tTR_rS_frag);
+
+    // load s
+    copy(tiled_t2r, tTR_tAcc, tTR_rAcc);
+
+    if (is_last_tile) {
+      for (int i = 0; i < size(tTR_rAcc); i++) {
+        if (get<1>(tTR_cS(i)) + TileShapeS{} * k_index >= get<1>(problem_shape)) {
+          tTR_rAcc(i) = -std::numeric_limits<ElementAcc>::infinity();
+        }
+      }
+    }
+
+    // max
+    ElementAcc row_max_new = row_max;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(tTR_rAcc); i += 1) {
+      row_max_new = ::fmax(row_max_new, tTR_rAcc(i));
+    }
+
+    // for 2x2 dp, reduce here
+    if constexpr (kWarpsInN > 1) {
+      shared_tensors.smem_exchange[threadIdx.x] = row_max_new;
+      cutlass::arch::NamedBarrier(kNumComputeWarps*NumThreadsPerWarp, kNamedBarrierExchange).sync();
+      // (64, 2) shape
+      int peer_index = (threadIdx.x + 64) % 128;
+      row_max_new = cutlass::max(row_max_new, shared_tensors.smem_exchange[peer_index]);
+    }
+
+#ifndef B2B
+    // find correction factor
+    ElementAcc softmax_scale_log2 = mainloop_args.softmax_scale * static_cast<ElementAcc>(M_LOG2E);
+    correction_factor = ::exp2f(softmax_scale_log2 * (row_max - row_max_new));
+    row_max = row_max_new;
+
+    // softmax
+    ElementAcc row_max_scale_log2 = row_max * softmax_scale_log2;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(tTR_rAcc); i++) {
+      tTR_rAcc(i) = ::exp2f(softmax_scale_log2 * tTR_rAcc(i) - row_max_scale_log2);
+    }
+#endif
+
+    // quantize
+    cutlass::NumericArrayConverter<Element, ElementAcc, AlignmentS> epilogue_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(tTR_rAcc_vec); i++) {
+      tTR_rS_vec(i) = epilogue_op(tTR_rAcc_vec(i));
+    }
+
+    Tensor sP = make_tensor(make_smem_ptr((Element*) shared_tensors.smem_p.begin()), SmemLayoutP{})(_, _, _, make_coord(_, smem_p_index));
+
+    Tensor tOcP = TiledMmaPV{}.get_slice(_0{}).partition_A(cS);
+
+    // have a mapping for each thread to coord
+    // find identical mapping to coords for the MMA
+    auto l = make_ordered_layout(make_shape(make_shape(_64{}, _2{}), make_shape(_16{}, TileShapeS{} / _32{})), make_stride(make_stride(_0{}, _3{}), make_stride(_1{}, _2{})));
+    auto sP_ = as_position_independent_swizzle_tensor(sP);
+    copy_aligned(tTR_rS_frag, sP_.compose(l)(threadIdx.x, _));
+
+    // sum
+    row_sum *= correction_factor;
+
+    static_assert(cute::is_same_v<ElementAcc, float>);
+    auto tTR_rAcc_float2 = recast<float2>(tTR_rAcc);
+    auto sums = make_tensor<float2>(_4{});
+    static_assert(size(tTR_rAcc_float2) % size(sums) == 0);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(sums); i++) {
+      sums(i) = tTR_rAcc_float2(i);
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = size(sums); i < size(tTR_rAcc_float2); i += size(sums)) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < size(sums); j++) {
+        cute::add(sums(j), sums(j), tTR_rAcc_float2(i + j));
+      }
+    }
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < size(sums); i *= 2) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < size(sums); j += 2*i) {
+        cute::add(sums(j), sums(j), sums(j+i));
+      }
+    }
+    row_sum += sums(0).x + sums(0).y;
+  }
+
+
+  CUTLASS_DEVICE void rescale(
+      ElementAcc correction_factor,
+      uint32_t tmem_o) {
+
+    // for b2b gemm, do nothing
+#ifndef B2B
+    auto load_op = cute::SM100_TMEM_LOAD_32dp32b32x{};
+    auto store_op = TMEM::tmem_load_to_store(load_op);
+
+    TiledMmaPV tiled_mma_pv;
+
+    Tensor tOtO = partition_fragment_C(tiled_mma_pv, select<0,1>(TileShapePV{}));
+    tOtO.data() = tmem_o;
+
+    CUTE_STATIC_ASSERT_V(shape<1>(tOtO) == _1{});
+    CUTE_STATIC_ASSERT_V(shape<2>(tOtO) == _1{});
+    Tensor tAcc = tOtO(make_coord(_,_),_0{},_0{});
+
+    auto cta_tiler_pv = take<0,2>(typename CollectiveMmaPV::CtaShape_MNK{});
+    Tensor gO = make_tensor(make_gmem_ptr((ElementAcc*) nullptr), cta_tiler_pv, make_stride(0, 0));
+
+    auto tiled_t2r = make_tmem_copy(load_op, tAcc);
+    auto tiled_r2t = make_tmem_copy(store_op, tAcc);
+    auto thread_idx = threadIdx.x % size(tiled_t2r);
+
+    auto thread_t2r = tiled_t2r.get_slice(thread_idx);
+    auto thread_r2t = tiled_r2t.get_slice(thread_idx);
+    Tensor tTR_gO   = thread_t2r.partition_D(gO);
+    Tensor tTR_rAcc = make_tensor<ElementAcc>(shape(tTR_gO));
+
+    Tensor tTR_tAcc = thread_t2r.partition_S(tAcc);
+
+    // load o
+    copy(tiled_t2r, tTR_tAcc, tTR_rAcc);
+
+    // multiply by correction factor
+    float2 correction_factor_vec = make_float2(correction_factor, correction_factor);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(tTR_rAcc); i += 2) {
+      float2 in = make_float2(tTR_rAcc(i + 0), tTR_rAcc(i + 1));
+      float2 out;
+      cute::mul(out, in, correction_factor_vec);
+      tTR_rAcc(i + 0) = out.x;
+      tTR_rAcc(i + 1) = out.y;
+    }
+
+    // store o
+    copy(tiled_r2t, tTR_rAcc, tTR_tAcc);
+#endif
+ }
+
+
+  template<class BlkCoord>
+  CUTLASS_DEVICE void epilogue(
+      ElementAcc& row_max,
+      ElementAcc& row_sum,
+      BlkCoord const& cta_coord,
+      ProblemShape const& problem_shape,
+      MainloopArguments const& mainloop_args,
+      EpilogueParams const& epilogue_args,
+      TensorStorage& shared_tensors,
+      uint32_t tmem_o,
+      int const& split_kv) {
+
+    auto load_op = cute::SM100_TMEM_LOAD_32dp32b32x{};
+
+    TiledMmaPV tiled_mma_pv;
+
+    Tensor tOtO = TiledMmaPV::make_fragment_C(partition_shape_C(TiledMmaPV{}, take<0, 2>(TileShapePV{})));
+    tOtO.data() = tmem_o;
+
+    CUTE_STATIC_ASSERT_V(shape<1>(tOtO) == _1{});
+    CUTE_STATIC_ASSERT_V(shape<2>(tOtO) == _1{});
+    Tensor tAcc = tOtO(make_coord(_,_),_0{},_0{});
+
+    auto [H, K, D, B] = problem_shape;
+    auto [D_latent, D_rope] = D;
+    if (epilogue_args.ptr_o_acc != nullptr) {
+      using ElementOutAcc = ElementAcc;
+      constexpr auto AlignmentOutAcc = 128 / cute::sizeof_bits_v<ElementOutAcc>;
+      Tensor mO = make_tensor(make_gmem_ptr(epilogue_args.ptr_o_acc + get<3>(cta_coord) * D_latent), make_shape(H, D_latent, B), epilogue_args.stride_o_acc);
+      auto cta_tiler_pv = take<0,2>(typename CollectiveMmaPV::CtaShape_MNK{});
+      Tensor gO = local_tile(mO, cta_tiler_pv, take<0,3>(cta_coord));
+
+      auto tiled_t2r = make_tmem_copy(load_op, tAcc);
+      auto thread_idx = threadIdx.x % size(tiled_t2r);
+
+      auto thread_t2r = tiled_t2r.get_slice(thread_idx);
+      Tensor tTR_gO   = thread_t2r.partition_D(gO);
+      Tensor tTR_rAcc = make_tensor<ElementAcc>(shape(tTR_gO));
+
+      Tensor tTR_rO_frag = make_tensor<ElementOutAcc>(shape(tTR_rAcc));
+      Tensor tTR_rO_src = recast<Array<ElementOutAcc, AlignmentOutAcc>>(coalesce(tTR_rO_frag));
+      Tensor tR2G_rO_dst = recast<Array<ElementOutAcc, AlignmentOutAcc>>(coalesce(tTR_gO));
+      Tensor tTR_tAcc = thread_t2r.partition_S(tAcc);
+
+      copy(tiled_t2r, tTR_tAcc, tTR_rAcc);
+
+      cutlass::epilogue::thread::LinearCombination<ElementOutAcc, 1, ElementAcc, ElementAcc, cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling> epilogue_op({epilogue_args.output_scale / row_sum});
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(tTR_rAcc); i++) {
+        tTR_rO_frag(i) = epilogue_op(tTR_rAcc(i));
+      }
+
+      copy(tTR_rO_src, tR2G_rO_dst);
+
+#ifndef B2B
+
+      // compute LSE
+      ElementAcc lse = cutlass::fast_log(row_sum) + mainloop_args.softmax_scale * row_max;
+
+      // store LSE
+      Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_args.ptr_lse_acc + H * get<3>(cta_coord)), make_shape(H, B), epilogue_args.stride_lse_acc);
+      Tensor gLSE = local_tile(mLSE, append<3>(cta_tiler_pv, _1{}), take<0,3>(cta_coord), Step<_1, Underscore, _1>{});
+      // for 2x2 dp, this must be conditional and the index is wrong
+      if (! kIs2Sm || (threadIdx.x < 64))
+      {
+          gLSE(threadIdx.x) = lse;
+      }
+ #endif
+    }
+    else {
+      Tensor mO = make_tensor(make_gmem_ptr(epilogue_args.ptr_o), make_shape(H, D_latent, B), epilogue_args.stride_o);
+      auto cta_tiler_pv = take<0,2>(typename CollectiveMmaPV::CtaShape_MNK{});
+      Tensor gO = local_tile(mO, cta_tiler_pv, take<0,3>(cta_coord));
+
+      auto tiled_t2r = make_tmem_copy(load_op, tAcc);
+      auto thread_idx = threadIdx.x % size(tiled_t2r);
+
+      auto thread_t2r = tiled_t2r.get_slice(thread_idx);
+      Tensor tTR_gO   = thread_t2r.partition_D(gO);
+      Tensor tTR_rAcc = make_tensor<ElementAcc>(shape(tTR_gO));
+
+      Tensor tTR_rO_frag = make_tensor<ElementOut>(shape(tTR_rAcc));
+      Tensor tTR_rO_src = recast<Array<ElementOut, AlignmentOut>>(coalesce(tTR_rO_frag));
+      Tensor tR2G_rO_dst = recast<Array<ElementOut, AlignmentOut>>(coalesce(tTR_gO));
+      Tensor tTR_tAcc = thread_t2r.partition_S(tAcc);
+
+      copy(tiled_t2r, tTR_tAcc, tTR_rAcc);
+
+      cutlass::epilogue::thread::LinearCombination<ElementOut, 1, ElementAcc, ElementAcc, cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling> epilogue_op({epilogue_args.output_scale / row_sum});
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size(tTR_rAcc); i++) {
+        tTR_rO_frag(i) = epilogue_op(tTR_rAcc(i));
+      }
+
+      copy(tTR_rO_src, tR2G_rO_dst);
+
+#ifndef B2B
+      if (epilogue_args.ptr_lse != nullptr) {
+        // compute LSE
+        ElementAcc lse = cutlass::fast_log(row_sum) + mainloop_args.softmax_scale * row_max;
+
+        // store LSE
+        Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_args.ptr_lse), make_shape(H, B), epilogue_args.stride_lse);
+        Tensor gLSE = local_tile(mLSE, append<3>(cta_tiler_pv, _1{}), take<0,3>(cta_coord), Step<_1, Underscore, _1>{});
+
+        // for 2x2 dp, this must be conditional and the index is wrong
+        if (! kIs2Sm || (threadIdx.x < 64))
+        {
+          gLSE(threadIdx.x) = lse;
+        }
+      }
+#endif
+    }
+  }
+
+
+  template<class CtaCoord>
+  CUTLASS_DEVICE void compute(
+      CtaCoord const& cta_coord,
+      ProblemShape const& problem_shape,
+      MainloopArguments const& mainloop_args,
+      EpilogueParams const& epilogue_args,
+      TensorStorage& shared_tensors,
+      PipelineS& pipeline_mma_s,
+      typename PipelineS::PipelineState& pipeline_mma_s_consumer_state,
+      PipelineP& pipeline_p_mma,
+      typename PipelineP::PipelineState& pipeline_p_mma_producer_state,
+      PipelineO& pipeline_mma_o,
+      typename PipelineO::PipelineState& pipeline_mma_o_consumer_state,
+      int const& split_kv) {
+
+    auto [H, K, D, B] = problem_shape;
+
+    int k_tile_total = ceil_div(K, TileShapeS{});
+    int k_tile_per_cta = ceil_div(k_tile_total, split_kv);
+    int k_index = get<3>(cta_coord) * k_tile_per_cta; // lower limit
+    int k_tile_count = max(0, min(k_tile_total, k_index + k_tile_per_cta) - k_index);
+    if (k_tile_count == 0) {
+
+      // if we return early, we have to make sure we release the load warp
+      cutlass::arch::NamedBarrier(
+          (kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp,
+          kNamedBarrierEpilogue
+      ).arrive();
+
+      return;
+    }
+    int k_index_final = k_tile_total - 1;
+
+    ElementAcc row_max = -std::numeric_limits<ElementAcc>::infinity();
+    ElementAcc row_sum = 0;
+    ElementAcc correction_factor = 1;
+
+    pipeline_p_mma.producer_acquire(pipeline_p_mma_producer_state);
+    pipeline_mma_s.consumer_wait(pipeline_mma_s_consumer_state);
+
+    auto dispatch_bool = [](bool b, auto fn) {
+      if (b) {
+        fn(cute::true_type{});
+      }
+      else {
+        fn(cute::false_type{});
+      }
+    };
+
+    // softmax s0 -> p0
+    dispatch_bool(k_index == k_index_final, [&](auto is_last_tile) {
+      softmax(
+          is_last_tile,
+          row_max, row_sum, correction_factor,
+          problem_shape, mainloop_args, shared_tensors, k_index,
+          uint32_t(pipeline_mma_s_consumer_state.index() == 0 ? TmemAllocation::kS0 : TmemAllocation::kS1),
+          pipeline_p_mma_producer_state.index()
+      );
+    });
+
+    k_index += 1;
+
+    cutlass::arch::fence_view_async_tmem_load();
+    cutlass::arch::fence_view_async_shared();
+    pipeline_mma_s.consumer_release(pipeline_mma_s_consumer_state);
+    ++pipeline_mma_s_consumer_state;
+    pipeline_p_mma.producer_commit(pipeline_p_mma_producer_state);
+    ++pipeline_p_mma_producer_state;
+
+    k_tile_count -= 1;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    while (k_tile_count > 0) {
+      pipeline_p_mma.producer_acquire(pipeline_p_mma_producer_state);
+      pipeline_mma_s.consumer_wait(pipeline_mma_s_consumer_state);
+
+      // softmax s1 -> p1
+      dispatch_bool(k_index == k_index_final, [&](auto is_last_tile) {
+        softmax(
+            is_last_tile,
+            row_max, row_sum, correction_factor,
+            problem_shape, mainloop_args, shared_tensors, k_index,
+            uint32_t(pipeline_mma_s_consumer_state.index() == 0 ? TmemAllocation::kS0 : TmemAllocation::kS1),
+            pipeline_p_mma_producer_state.index()
+        );
+      });
+
+      cutlass::arch::fence_view_async_tmem_load();
+      cutlass::arch::fence_view_async_shared();
+      pipeline_mma_s.consumer_release(pipeline_mma_s_consumer_state);
+      ++pipeline_mma_s_consumer_state;
+      pipeline_p_mma.producer_commit(pipeline_p_mma_producer_state);
+      ++pipeline_p_mma_producer_state;
+
+      pipeline_mma_o.consumer_wait(pipeline_mma_o_consumer_state);
+
+      // rescale
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < IterationsPV_N; j++) {
+        rescale(correction_factor, uint32_t(TmemAllocation::kO0) + j * uint32_t(TmemAllocation::kSizeAccO));
+      }
+
+      cutlass::arch::fence_view_async_tmem_store();
+      pipeline_mma_o.consumer_release(pipeline_mma_o_consumer_state);
+      ++pipeline_mma_o_consumer_state;
+
+      --k_tile_count;
+      k_index += 1;
+    }
+
+    pipeline_mma_o.consumer_wait(pipeline_mma_o_consumer_state);
+
+#ifdef B2B
+    row_sum = 1;
+#else
+    if constexpr (kWarpsInN > 1) {
+      // reduce row_sum if needed (for 2x2 dp)
+      shared_tensors.smem_exchange[threadIdx.x] = row_sum;
+      cutlass::arch::NamedBarrier(kNumComputeWarps*NumThreadsPerWarp, kNamedBarrierExchange).sync();
+      // (64, 2) shape
+      int peer_index = (threadIdx.x + 64) % 128;
+      row_sum += shared_tensors.smem_exchange[peer_index];
+    }
+#endif
+
+    cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive();
+
+    // epilogue
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < IterationsPV_N; j++) {
+      epilogue(
+          row_max, row_sum,
+          replace<1>(cta_coord, j), problem_shape,
+          mainloop_args, epilogue_args, shared_tensors,
+          uint32_t(TmemAllocation::kO0) + j * uint32_t(TmemAllocation::kSizeAccO), split_kv
+      );
+    }
+
+    cutlass::arch::fence_view_async_tmem_load();
+    pipeline_mma_o.consumer_release(pipeline_mma_o_consumer_state);
+    ++pipeline_mma_o_consumer_state;
+  }
+
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::fmha::kernel
diff --git a/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp b/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp
new file mode 100644
index 000000000..30389e79f
--- /dev/null
+++ b/sgl-kernel/csrc/attention/cutlass_sm100_mla/kernel/sm100_mla_tile_scheduler.hpp
@@ -0,0 +1,160 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// clang-format off
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/kernel_hardware_info.h"
+
+namespace cutlass::fmha::kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct Sm100MlaIndividualTileScheduler {
+
+  struct Params {
+    dim3 grid;
+  };
+
+  bool valid_ = true;
+
+  CUTLASS_DEVICE
+  Sm100MlaIndividualTileScheduler(Params const&) {}
+
+  template<class ProblemShape, class ClusterShape>
+  static Params to_underlying_arguments(
+      ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
+      ClusterShape const& cluster_shape, int const& split_kv) {
+    using namespace cute;
+    dim3 grid(get<0>(cluster_shape), get<3>(problem_shape) /* Batch */, split_kv /*Maximum Split KV*/);
+    return Params{ grid };
+  }
+
+  static dim3 get_grid_shape(Params const& params) {
+    return params.grid;
+  }
+
+  CUTLASS_DEVICE
+  bool is_valid() {
+    return valid_;
+  }
+
+  CUTLASS_DEVICE
+  auto get_block_coord() {
+    using namespace cute;
+    return make_coord(blockIdx.x, _0{}, blockIdx.y, blockIdx.z);
+  }
+
+  CUTLASS_DEVICE
+  Sm100MlaIndividualTileScheduler& operator++() {
+    valid_ = false;
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct Sm100MlaPersistentTileScheduler {
+
+  struct Params {
+    int num_blocks;
+    FastDivmod divmod_m_block;
+    FastDivmod divmod_b;
+    FastDivmod divmod_split_kv;
+    KernelHardwareInfo hw_info;
+  };
+
+  int block_idx = 0;
+  Params params;
+
+  CUTLASS_DEVICE
+  Sm100MlaPersistentTileScheduler(Params const& params) : block_idx(blockIdx.x), params(params) {}
+
+  template<class ProblemShape, class ClusterShape>
+  static Params to_underlying_arguments(
+      ProblemShape const& problem_shape, KernelHardwareInfo hw_info,
+      ClusterShape const& cluster_shape, int const& split_kv) {
+    using namespace cute;
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = hw_info.sm_count;
+    if (sm_count <= 1 || sm_count % size<0>(cluster_shape) != 0) {
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
+    hw_info.sm_count = sm_count;
+
+    int num_m_blocks = size<0>(cluster_shape);
+    int num_blocks = num_m_blocks * get<3>(problem_shape)  /* Batch */;
+    num_blocks *= split_kv; /* Maximum Split KV*/
+
+    return Params {
+      num_blocks,
+      { num_m_blocks}, { get<3>(problem_shape) }, {split_kv},
+      hw_info
+    };
+  }
+
+  static dim3 get_grid_shape(Params const& params) {
+    dim3 grid(std::min(params.num_blocks, params.hw_info.sm_count), 1, 1);
+    return grid;
+  }
+
+  CUTLASS_DEVICE
+  bool is_valid() {
+    return block_idx < params.num_blocks;
+  }
+
+  CUTLASS_DEVICE
+  auto get_block_coord() {
+    using namespace cute;
+    int block_decode = block_idx;
+    int m_block, bidb, n_split_kv;
+    params.divmod_m_block(block_decode, m_block, block_decode);
+    params.divmod_b(block_decode, bidb, block_decode);
+    params.divmod_split_kv(block_decode, n_split_kv, block_decode);
+    return make_coord(m_block, _0{}, bidb, n_split_kv);
+  }
+
+  CUTLASS_DEVICE
+  Sm100MlaPersistentTileScheduler& operator++() {
+    block_idx += gridDim.x;
+    return *this;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::fmha::kernel
diff --git a/sgl-kernel/csrc/attention/lightning_attention_decode_kernel.cu b/sgl-kernel/csrc/attention/lightning_attention_decode_kernel.cu
new file mode 100644
index 000000000..01bd4797c
--- /dev/null
+++ b/sgl-kernel/csrc/attention/lightning_attention_decode_kernel.cu
@@ -0,0 +1,154 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <torch/all.h>
+
+#define THREADS_PER_BLOCK 128
+
+template <typename T>
+__global__ void lightning_attention_decode_kernel(
+    const T* __restrict__ q,            // [b, h, 1, d]
+    const T* __restrict__ k,            // [b, h, 1, d]
+    const T* __restrict__ v,            // [b, h, 1, e]
+    const float* __restrict__ past_kv,  // [b, h, d, e]
+    const float* __restrict__ slope,    // [h, 1, 1]
+    T* __restrict__ output,             // [b, h, 1, e]
+    float* __restrict__ new_kv,         // [b, h, d, e]
+    const int batch_size,
+    const int num_heads,
+    const int qk_dim,
+    const int v_dim) {
+  extern __shared__ char smem[];
+  T* __restrict__ q_shared = reinterpret_cast<T*>(smem);
+  T* __restrict__ k_shared = reinterpret_cast<T*>(smem + qk_dim * sizeof(T));
+  T* __restrict__ v_shared = reinterpret_cast<T*>(smem + 2 * qk_dim * sizeof(T));
+  float* __restrict__ new_kv_shared = reinterpret_cast<float*>(smem + (2 * qk_dim + v_dim) * sizeof(T));
+  T* __restrict__ output_shared =
+      reinterpret_cast<T*>(smem + (2 * qk_dim + v_dim) * sizeof(T) + qk_dim * (v_dim + 1) * sizeof(float));
+
+  const int32_t tid = threadIdx.x;
+  const int32_t current_head = blockIdx.x;
+  const int32_t b = current_head / num_heads;
+  const int32_t h = current_head % num_heads;
+
+  if (b >= batch_size) return;
+
+  const int32_t qk_offset = b * num_heads * qk_dim + h * qk_dim;
+  const int32_t v_offset = b * num_heads * v_dim + h * v_dim;
+  const int32_t kv_offset = b * num_heads * qk_dim * v_dim + h * qk_dim * v_dim;
+
+  // Load q, k, v into shared memory
+  for (int d = tid; d < qk_dim; d += blockDim.x) {
+    q_shared[d] = q[qk_offset + d];
+    k_shared[d] = k[qk_offset + d];
+  }
+  for (int e = tid; e < v_dim; e += blockDim.x) {
+    v_shared[e] = v[v_offset + e];
+  }
+
+  __syncthreads();
+
+  const float ratio = expf(-1.0f * slope[h]);
+
+  // Compute new_kv
+  for (int d = tid; d < qk_dim; d += blockDim.x) {
+    const T k_val = k_shared[d];
+    for (int e = 0; e < v_dim; ++e) {
+      const int past_kv_idx = kv_offset + d * v_dim + e;
+      const T v_val = v_shared[e];
+      const float new_val = ratio * past_kv[past_kv_idx] + k_val * v_val;
+      const int shared_idx = d * (v_dim + 1) + e;
+      new_kv_shared[shared_idx] = new_val;
+    }
+  }
+
+  __syncthreads();
+
+  // Store new_kv to global memory
+  for (int idx = tid; idx < qk_dim * v_dim; idx += blockDim.x) {
+    const int d = idx / v_dim;
+    const int e = idx % v_dim;
+    const int shared_idx = d * (v_dim + 1) + e;
+    const int global_idx = kv_offset + idx;
+    new_kv[global_idx] = new_kv_shared[shared_idx];
+  }
+
+  __syncthreads();
+
+  // Compute output
+  for (int e = tid; e < v_dim; e += blockDim.x) {
+    float sum = 0.0f;
+    for (int d = 0; d < qk_dim; ++d) {
+      const int shared_idx = d * (v_dim + 1) + e;
+      sum += q_shared[d] * new_kv_shared[shared_idx];
+    }
+    output_shared[e] = static_cast<T>(sum);
+  }
+
+  __syncthreads();
+
+  // Store output to global memory
+  if (tid == 0) {
+    for (int e = 0; e < v_dim; ++e) {
+      output[v_offset + e] = output_shared[e];
+    }
+  }
+}
+
+void lightning_attention_decode(
+    const torch::Tensor& q,
+    const torch::Tensor& k,
+    const torch::Tensor& v,
+    const torch::Tensor& past_kv,
+    const torch::Tensor& slope,
+    torch::Tensor output,
+    torch::Tensor new_kv) {
+  TORCH_CHECK(q.is_contiguous(), "q must be contiguous");
+  TORCH_CHECK(k.is_contiguous(), "k must be contiguous");
+  TORCH_CHECK(v.is_contiguous(), "v must be contiguous");
+  TORCH_CHECK(past_kv.is_contiguous(), "past_kv must be contiguous");
+
+  auto batch_size = q.size(0);
+  auto num_heads = q.size(1);
+  auto qk_dim = q.size(3);
+  auto v_dim = v.size(3);
+
+  dim3 block(THREADS_PER_BLOCK);
+  dim3 grid(batch_size * num_heads);
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half, at::ScalarType::BFloat16, q.scalar_type(), "lightning_attention_decode_kernel", ([&] {
+        size_t smem_size = (2 * qk_dim + 2 * v_dim) * sizeof(scalar_t) + qk_dim * (v_dim + 1) * sizeof(float);
+        lightning_attention_decode_kernel<scalar_t><<<grid, block, smem_size, stream>>>(
+            q.data_ptr<scalar_t>(),
+            k.data_ptr<scalar_t>(),
+            v.data_ptr<scalar_t>(),
+            past_kv.data_ptr<float>(),
+            slope.data_ptr<float>(),
+            output.data_ptr<scalar_t>(),
+            new_kv.data_ptr<float>(),
+            batch_size,
+            num_heads,
+            qk_dim,
+            v_dim);
+      }));
+}
diff --git a/sgl-kernel/csrc/attention/merge_attn_states.cu b/sgl-kernel/csrc/attention/merge_attn_states.cu
new file mode 100644
index 000000000..c719498b8
--- /dev/null
+++ b/sgl-kernel/csrc/attention/merge_attn_states.cu
@@ -0,0 +1,204 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <algorithm>
+#include <optional>
+
+#include "pytorch_extension_utils.h"
+
+// Helper functions to convert between different data types
+// (float, half, bfloat16) for the merge attention states kernel.
+inline __device__ float to_float(float u) {
+  return u;
+}
+inline __device__ float to_float(half u) {
+  return __half2float(u);
+}
+inline __device__ float to_float(__nv_bfloat16 u) {
+  return __bfloat162float(u);
+}
+inline __device__ void from_float(float& d, float s) {
+  d = s;
+}
+inline __device__ void from_float(half& d, float s) {
+  d = __float2half(s);
+}
+inline __device__ void from_float(__nv_bfloat16& d, float s) {
+  d = __float2bfloat16(s);
+}
+
+// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
+template <typename scalar_t, const uint NUM_THREADS>
+__global__ void merge_attn_states_kernel(
+    scalar_t* output,
+    float* output_lse,
+    const scalar_t* prefix_output,
+    const float* prefix_lse,
+    const scalar_t* suffix_output,
+    const float* suffix_lse,
+    const uint num_tokens,
+    const uint num_heads,
+    const uint head_size) {
+  using pack_128b_t = uint4;
+  const uint pack_size = 16 / sizeof(scalar_t);
+  const uint threads_per_head = head_size / pack_size;
+
+  const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
+  const uint token_head_threads = num_tokens * num_heads * threads_per_head;
+
+  if (global_idx >= token_head_threads) return;
+
+  // global_idx -> token_idx + head_idx + pack_idx
+  const uint token_head_idx = global_idx / threads_per_head;
+  const uint pack_idx = global_idx % threads_per_head;
+
+  const uint token_idx = token_head_idx / num_heads;
+  const uint head_idx = token_head_idx % num_heads;
+
+  const uint pack_offset = pack_idx * pack_size;  // (0~15)*8, etc.
+  const uint head_offset = token_idx * num_heads * head_size + head_idx * head_size;
+  const scalar_t* prefix_head_ptr = prefix_output + head_offset;
+  const scalar_t* suffix_head_ptr = suffix_output + head_offset;
+  scalar_t* output_head_ptr = output + head_offset;
+
+  // float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
+  // float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
+  float p_lse = prefix_lse[token_idx * num_heads + head_idx];
+  float s_lse = suffix_lse[token_idx * num_heads + head_idx];
+  p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
+  s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
+
+  const float max_lse = fmaxf(p_lse, s_lse);
+  p_lse = p_lse - max_lse;
+  s_lse = s_lse - max_lse;
+  const float p_se = expf(p_lse);
+  const float s_se = expf(s_lse);
+  const float out_se = p_se + s_se;
+  const float p_scale = p_se / out_se;
+  const float s_scale = s_se / out_se;
+
+  if (pack_offset < head_size) {
+    // Pack 128b load
+    pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(prefix_head_ptr)[pack_offset / pack_size];
+    pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(suffix_head_ptr)[pack_offset / pack_size];
+    pack_128b_t o_out_pack;
+
+#pragma unroll
+    for (uint i = 0; i < pack_size; ++i) {
+      // Always use float for FMA to keep high precision.
+      // half(uint16_t), bfloat16, float -> float.
+      const float p_out_f = to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
+      const float s_out_f = to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
+      // fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
+      const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
+      // float -> half(uint16_t), bfloat16, float.
+      from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
+    }
+
+    // Pack 128b storage
+    reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] = o_out_pack;
+  }
+  // We only need to write to output_lse once per head.
+  if (output_lse != nullptr && pack_idx == 0) {
+    float out_lse = logf(out_se) + max_lse;
+    output_lse[token_idx * num_heads + head_idx] = out_lse;
+  }
+}
+
+// The following macro is used to dispatch the conversion function based on
+// the output data type. The FN is a macro that calls a function with
+// template<typename scalar_t>.
+#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn)                      \
+  {                                                                     \
+    if (scalar_dtype == at::ScalarType::Float) {                        \
+      fn(float);                                                        \
+    } else if (scalar_dtype == at::ScalarType::Half) {                  \
+      fn(half);                                                         \
+    } else if (scalar_dtype == at::ScalarType::BFloat16) {              \
+      fn(__nv_bfloat16);                                                \
+    } else {                                                            \
+      TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
+    }                                                                   \
+  }
+
+#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS)                          \
+  {                                                                              \
+    merge_attn_states_kernel<scalar_t, NUM_THREADS><<<grid, block, 0, stream>>>( \
+        reinterpret_cast<scalar_t*>(output.data_ptr()),                          \
+        reinterpret_cast<float*>(output_lse.data_ptr()),                         \
+        reinterpret_cast<scalar_t*>(prefix_output.data_ptr()),                   \
+        reinterpret_cast<float*>(prefix_lse.data_ptr()),                         \
+        reinterpret_cast<scalar_t*>(suffix_output.data_ptr()),                   \
+        reinterpret_cast<float*>(suffix_lse.data_ptr()),                         \
+        num_tokens,                                                              \
+        num_heads,                                                               \
+        head_size);                                                              \
+  }
+
+/*@brief Merges the attention states from prefix and suffix
+ * into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
+ *
+ * @param output [n,h,d] The output tensor to store the merged attention states.
+ * @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
+ * @param prefix_output [n,h,d] The prefix attention states.
+ * @param prefix_lse [n,h] The log-sum-exp values for the prefix attention
+ * states.
+ * @param suffix_output [n,h,d] The suffix attention states.
+ * @param suffix_lse [n,h] The log-sum-exp values for the suffix attention
+ * states.
+ */
+template <typename scalar_t>
+void merge_attn_states_launcher(
+    const at::Tensor& prefix_output,  // [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    const at::Tensor& prefix_lse,     // [NUM_TOKENS, NUM_HEADS]
+    const at::Tensor& suffix_output,  // [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    const at::Tensor& suffix_lse,     // [NUM_TOKENS, NUM_HEADS]
+    at::Tensor& output,               // [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    at::Tensor& output_lse            // [NUM_TOKENS, NUM_HEADS]
+) {
+  constexpr uint NUM_THREADS = 128;
+  const uint num_tokens = output.size(0);
+  const uint num_heads = output.size(1);
+  const uint head_size = output.size(2);
+  const uint pack_size = 16 / sizeof(scalar_t);
+  TORCH_CHECK(head_size % pack_size == 0, "headsize must be multiple of pack_size:", pack_size);
+  // Process one pack elements per thread. for float, the
+  // pack_size is 4 for half/bf16, the pack_size is 8.
+  const uint threads_per_head = head_size / pack_size;
+  const uint total_threads = num_tokens * num_heads * threads_per_head;
+
+  dim3 block(NUM_THREADS);
+  dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
+
+  const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
+}
+
+#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
+  { merge_attn_states_launcher<scalar_t>(v_a, s_a, v_b, s_b, v_merged, s_merged); }
+
+void merge_state_v2(
+    at::Tensor v_a, at::Tensor s_a, at::Tensor v_b, at::Tensor s_b, at::Tensor v_merged, at::Tensor s_merged) {
+  // Input tensors must be contiguous
+  CHECK_INPUT(v_a);  // v_a prefix_output (seq_len, num_heads, head_dim)
+  CHECK_INPUT(s_a);  // s_a prefix_lse (seq_len, num_heads)
+  CHECK_INPUT(v_b);  // v_b suffix_output (seq_len, num_heads, head_dim)
+  CHECK_INPUT(s_b);  // s_b suffix_lse (seq_len, num_heads)
+  // v_merged output (seq_len, num_heads, head_dim)
+  // s_merged output_lse (seq_len, num_heads)
+  auto device = v_a.device();
+  CHECK_EQ(s_a.device(), device);
+  CHECK_EQ(v_b.device(), device);
+  CHECK_EQ(s_b.device(), device);
+  CHECK_DIM(3, v_a);
+  CHECK_DIM(2, s_a);
+  CHECK_DIM(3, v_b);
+  CHECK_DIM(2, s_b);
+  CHECK_SHAPE(v_a, v_b);
+  CHECK_SHAPE(s_a, s_b);
+  CHECK_EQ(v_a.size(0), s_a.size(0));
+  CHECK_EQ(v_a.size(1), s_b.size(1));
+  DISPATCH_BY_SCALAR_DTYPE(v_merged.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
+}
diff --git a/sgl-kernel/csrc/attention/vertical_slash_index.cu b/sgl-kernel/csrc/attention/vertical_slash_index.cu
new file mode 100644
index 000000000..118f780dd
--- /dev/null
+++ b/sgl-kernel/csrc/attention/vertical_slash_index.cu
@@ -0,0 +1,462 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+// This file is for blocksparse attention utils cuda kernel.
+
+#include <assert.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cuda.h>
+#include <torch/all.h>
+
+// Save the start index of each block in the given range into block_offset.
+// Returns the updated block count.
+__device__ int64_t save_blocks(
+    int* block_offset,
+    int64_t range_start,
+    int64_t range_end,
+    int64_t block_size,
+    int64_t input_block_count,
+    int64_t kv_seqlen) {
+  if (range_start >= kv_seqlen) {
+    return input_block_count;
+  }
+  if (range_end > kv_seqlen) {
+    range_end = kv_seqlen;
+  }
+  int64_t current_block_count = input_block_count;
+  for (int idx = range_start; idx < range_end; idx += block_size) {
+    block_offset[current_block_count++] = idx;
+  }
+  return current_block_count;
+}
+
+// CUDA kernel: convert sparse vertical/slash indices to block/column offsets.
+__global__ void convert_vertical_slash_indexes_kernel(
+    const int* q_seqlens,         // [BATCH, ]
+    const int* kv_seqlens,        // [BATCH, ]
+    const int* vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
+    const int* slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
+    int* block_count,             // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* block_offset,            // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
+    int* column_count,            // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* column_index,            // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
+    int64_t N_HEADS,
+    int64_t N_ROWS,
+    int64_t BLOCK_SIZE_M,
+    int64_t BLOCK_SIZE_N,
+    int64_t NNZ_V,
+    int64_t NNZ_S,
+    bool causal  // True for intra, False for succ
+) {
+  const int batch_idx = blockIdx.y;
+  const int head_idx = blockIdx.x;
+  const int group_idx = blockIdx.z;
+
+  int64_t q_seqlen = q_seqlens[batch_idx];
+  int64_t kv_seqlen = kv_seqlens[batch_idx];
+  int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x;
+  int64_t start_m = block_idx_m * BLOCK_SIZE_M;
+  if (start_m >= q_seqlen) {
+    return;
+  }
+  int64_t end_m = start_m + BLOCK_SIZE_M;
+  vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
+  slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
+  int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
+  block_count += row_offset;
+  block_offset += row_offset * NNZ_S;
+  column_count += row_offset;
+  column_index += row_offset * NNZ_V;
+
+  bool has_slash = true;
+  int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0;
+  int64_t s = 0, v = 0;
+  int64_t v_idx = vertical_indexes[v++];
+  int64_t s_idx = slash_indexes[s++];
+  if (causal) {
+    while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) {
+      s_idx = slash_indexes[s++];
+    }
+    if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false;
+    s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M);
+  } else {
+    while (s_idx >= end_m + kv_seqlen && s < NNZ_S) {
+      s_idx = slash_indexes[s++];
+    }
+    if (s_idx > end_m + kv_seqlen) has_slash = false;
+    s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M);
+  }
+
+  int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
+  if (!has_slash) {
+    if (causal) {
+      range_start = (kv_seqlen - q_seqlen) + end_m;
+      range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
+    } else {
+      range_start = kv_seqlen;
+      range_end = kv_seqlen + BLOCK_SIZE_N;
+    }
+  }
+
+  bool slash_finished = false;
+  while (1) {
+    if (v_idx < range_end) {
+      if (v_idx < range_start) {
+        column_index[tmp_col_cnt++] = v_idx;
+      }
+      if (v < NNZ_V) {
+        v_idx = vertical_indexes[v++];
+      } else {
+        if (causal)
+          v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen);
+        else
+          v_idx = end_m + BLOCK_SIZE_N + kv_seqlen;
+      }
+    } else {
+      if ((s < NNZ_S && causal) || (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) {
+        if (causal)
+          s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++], BLOCK_SIZE_M);
+        else
+          s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M);
+      } else {
+        if (v == NNZ_V || (v_idx > range_start && causal)) {
+          // add the last vertical if no more slash
+          if (v == NNZ_V && !causal && v_idx < kv_seqlen) {
+            column_index[tmp_col_cnt++] = v_idx;
+          }
+          tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
+          break;
+        } else {
+          if (causal) {
+            range_start = (kv_seqlen - q_seqlen) + end_m;
+            range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
+          } else {
+            // if slash_finished but there are vertical left, save current
+            // blocks
+            tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
+            range_start = kv_seqlen;
+            range_end = kv_seqlen + BLOCK_SIZE_N;
+          }
+          slash_finished = true;
+        }
+      }
+      if (!slash_finished) {
+        if (s_idx > range_end + BLOCK_SIZE_M) {
+          tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
+          range_start = s_idx - BLOCK_SIZE_M;
+          range_end = s_idx;
+        } else if (s_idx > range_end) {
+          range_end += BLOCK_SIZE_M;
+        }
+      }
+    }
+  }
+
+  block_count[0] = tmp_blk_cnt;
+  column_count[0] = tmp_col_cnt;
+}
+
+// Host function: launches the kernel with 64 threads per block.
+void convert_vertical_slash_indexes_64x64(
+    const int* q_seqlens,         // [BATCH, ]
+    const int* kv_seqlens,        // [BATCH, ]
+    const int* vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
+    const int* slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
+    int* block_count,             // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* block_offset,            // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
+    int* column_count,            // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* column_index,            // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
+    int64_t BATCH_SIZE,
+    int64_t N_HEADS,
+    int64_t N_ROWS,
+    int64_t BLOCK_SIZE_M,
+    int64_t BLOCK_SIZE_N,
+    int64_t NNZ_V,
+    int64_t NNZ_S,
+    bool causal) {
+  const int N_THREADS = 64;
+  const dim3 dimBlock((int32_t)N_THREADS);
+  const dim3 dimGrid(
+      (int32_t)N_HEADS, (int32_t)BATCH_SIZE, ((int32_t)N_ROWS + (int32_t)N_THREADS - 1) / (int32_t)N_THREADS);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  convert_vertical_slash_indexes_kernel<<<dimGrid, dimBlock, 0, stream>>>(
+      q_seqlens,
+      kv_seqlens,
+      vertical_indexes,
+      slash_indexes,
+      block_count,
+      block_offset,
+      column_count,
+      column_index,
+      N_HEADS,
+      N_ROWS,
+      BLOCK_SIZE_M,
+      BLOCK_SIZE_N,
+      NNZ_V,
+      NNZ_S,
+      causal);
+}
+
+// Host function: prepares tensor pointers and launches the CUDA kernel.
+void convert_vertical_slash_indexes(
+    torch::Tensor& block_count,      // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& block_offset,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
+    torch::Tensor& column_count,     // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& column_index,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
+    torch::Tensor q_seqlens,         // [BATCH, ]
+    torch::Tensor kv_seqlens,        // [BATCH, ]
+    torch::Tensor vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
+    torch::Tensor slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
+    int64_t context_size,
+    int64_t block_size_M,
+    int64_t block_size_N,
+    bool causal) {
+  cudaSetDevice(q_seqlens.get_device());
+
+  int64_t batch_size = slash_indexes.size(0);
+  int64_t num_heads = slash_indexes.size(1);
+  int64_t nnz_slash = slash_indexes.size(2);
+  int64_t nnz_vertical = vertical_indexes.size(2);
+  int64_t num_rows = (context_size + block_size_M - 1) / block_size_M;
+
+  convert_vertical_slash_indexes_64x64(
+      q_seqlens.data_ptr<int>(),
+      kv_seqlens.data_ptr<int>(),
+      vertical_indexes.data_ptr<int>(),
+      slash_indexes.data_ptr<int>(),
+      block_count.data_ptr<int>(),
+      block_offset.data_ptr<int>(),
+      column_count.data_ptr<int>(),
+      column_index.data_ptr<int>(),
+      batch_size,
+      num_heads,
+      num_rows,
+      block_size_M,
+      block_size_N,
+      nnz_vertical,
+      nnz_slash,
+      causal);
+}
+
+// --- mergehead kernels --- //
+
+// Kernel: like above, but supports per-head variable NNZ_V/NNZ_S.
+__global__ void convert_vertical_slash_indexes_kernel_mergehead(
+    const int* q_seqlens,         // [BATCH, ]
+    const int* kv_seqlens,        // [BATCH, ]
+    const int* vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
+    const int* slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
+    const int* per_head_vertical_topkv,
+    const int* per_head_slash_topkv,
+    int* block_count,   // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* block_offset,  // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
+    int* column_count,  // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* column_index,  // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
+    int64_t N_HEADS,
+    int64_t N_ROWS,
+    int64_t BLOCK_SIZE_M,
+    int64_t BLOCK_SIZE_N,
+    int64_t NNZ_V,
+    int64_t NNZ_S,
+    bool causal  // True for intra, False for succ
+) {
+  const int batch_idx = blockIdx.y;
+  const int head_idx = blockIdx.x;
+  const int group_idx = blockIdx.z;
+
+  int64_t q_seqlen = q_seqlens[batch_idx];
+  int64_t kv_seqlen = kv_seqlens[batch_idx];
+  int64_t block_idx_m = group_idx * blockDim.x + threadIdx.x;
+  int64_t start_m = block_idx_m * BLOCK_SIZE_M;
+  if (start_m >= q_seqlen) {
+    return;
+  }
+  int64_t end_m = start_m + BLOCK_SIZE_M;
+  vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
+  slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
+  int64_t row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
+  block_count += row_offset;
+  block_offset += row_offset * NNZ_S;
+  column_count += row_offset;
+  column_index += row_offset * NNZ_V;
+
+  // MergeHead: each head has it's unique max topk NNZ_V，NNZ_S. (NNZ_V，NNZ_S
+  // above is buffer size, use to compute offset)
+  NNZ_S = per_head_slash_topkv[head_idx];
+  NNZ_V = per_head_vertical_topkv[head_idx];
+
+  bool has_slash = true;
+  int64_t tmp_col_cnt = 0, tmp_blk_cnt = 0;
+  int64_t s = 0, v = 0;
+  int64_t v_idx = vertical_indexes[v++];
+  int64_t s_idx = slash_indexes[s++];
+  if (causal) {
+    while (s_idx >= end_m + (kv_seqlen - q_seqlen) && s < NNZ_S) {
+      s_idx = slash_indexes[s++];
+    }
+    if (s_idx > end_m + (kv_seqlen - q_seqlen)) has_slash = false;
+    s_idx = max((kv_seqlen - q_seqlen) + end_m - s_idx, BLOCK_SIZE_M);
+  } else {
+    while (s_idx >= end_m + kv_seqlen && s < NNZ_S) {
+      s_idx = slash_indexes[s++];
+    }
+    if (s_idx > end_m + kv_seqlen) has_slash = false;
+    s_idx = max(kv_seqlen + end_m - s_idx, BLOCK_SIZE_M);
+  }
+
+  int64_t range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
+  if (!has_slash) {
+    if (causal) {
+      range_start = (kv_seqlen - q_seqlen) + end_m;
+      range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
+    } else {
+      range_start = kv_seqlen;
+      range_end = kv_seqlen + BLOCK_SIZE_N;
+    }
+  }
+
+  bool slash_finished = false;
+  while (1) {
+    if (v_idx < range_end) {
+      if (v_idx < range_start) {
+        column_index[tmp_col_cnt++] = v_idx;
+      }
+      if (v < NNZ_V) {
+        v_idx = vertical_indexes[v++];
+      } else {
+        if (causal)
+          v_idx = end_m + BLOCK_SIZE_N + (kv_seqlen - q_seqlen);
+        else
+          v_idx = end_m + BLOCK_SIZE_N + kv_seqlen;
+      }
+    } else {
+      if ((s < NNZ_S && causal) || (s < NNZ_S && !causal && slash_indexes[s] >= start_m)) {
+        if (causal)
+          s_idx = max((kv_seqlen - q_seqlen) + end_m - slash_indexes[s++], BLOCK_SIZE_M);
+        else
+          s_idx = max(kv_seqlen + end_m - slash_indexes[s++], BLOCK_SIZE_M);
+      } else {
+        if (v == NNZ_V || (v_idx > range_start && causal)) {
+          // add the last vertical if no more slash
+          if (v == NNZ_V && !causal && v_idx < kv_seqlen) {
+            column_index[tmp_col_cnt++] = v_idx;
+          }
+          tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
+          break;
+        } else {
+          if (causal) {
+            range_start = (kv_seqlen - q_seqlen) + end_m;
+            range_end = (kv_seqlen - q_seqlen) + end_m + BLOCK_SIZE_N;
+          } else {
+            // if slash_finished but there are vertical left, save current
+            // blocks
+            tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
+            range_start = kv_seqlen;
+            range_end = kv_seqlen + BLOCK_SIZE_N;
+          }
+          slash_finished = true;
+        }
+      }
+      if (!slash_finished) {
+        if (s_idx > range_end + BLOCK_SIZE_M) {
+          tmp_blk_cnt = save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt, kv_seqlen);
+          range_start = s_idx - BLOCK_SIZE_M;
+          range_end = s_idx;
+        } else if (s_idx > range_end) {
+          range_end += BLOCK_SIZE_M;
+        }
+      }
+    }
+  }
+
+  block_count[0] = tmp_blk_cnt;
+  column_count[0] = tmp_col_cnt;
+}
+
+// Launch the mergehead kernel with 64 threads per block.
+void convert_vertical_slash_indexes_64x64_mergehead(
+    const int* q_seqlens,         // [BATCH, ]
+    const int* kv_seqlens,        // [BATCH, ]
+    const int* vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
+    const int* slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
+    int* per_head_vertical_topkv,
+    int* per_head_slash_topkv,
+    int* block_count,   // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* block_offset,  // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
+    int* column_count,  // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
+    int* column_index,  // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
+    int64_t BATCH_SIZE,
+    int64_t N_HEADS,
+    int64_t N_ROWS,
+    int64_t BLOCK_SIZE_M,
+    int64_t BLOCK_SIZE_N,
+    int64_t NNZ_V,
+    int64_t NNZ_S,
+    bool causal) {
+  const int N_THREADS = 64;
+  const dim3 dimBlock(N_THREADS);
+  const dim3 dimGrid(N_HEADS, BATCH_SIZE, (N_ROWS + N_THREADS - 1) / N_THREADS);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  convert_vertical_slash_indexes_kernel_mergehead<<<dimGrid, dimBlock, 0, stream>>>(
+      q_seqlens,
+      kv_seqlens,
+      vertical_indexes,
+      slash_indexes,
+      per_head_vertical_topkv,
+      per_head_slash_topkv,
+      block_count,
+      block_offset,
+      column_count,
+      column_index,
+      N_HEADS,
+      N_ROWS,
+      BLOCK_SIZE_M,
+      BLOCK_SIZE_N,
+      NNZ_V,
+      NNZ_S,
+      causal);
+}
+
+// Host wrapper for mergehead kernel.
+void convert_vertical_slash_indexes_mergehead(
+    torch::Tensor& block_count,            // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& block_offset,           // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
+    torch::Tensor& column_count,           // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& column_index,           // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
+    torch::Tensor q_seqlens,               // [BATCH, ]
+    torch::Tensor kv_seqlens,              // [BATCH, ]
+    torch::Tensor vertical_indexes,        // [BATCH, N_HEADS, NNZ_V]
+    torch::Tensor slash_indexes,           // [BATCH, N_HEADS, NNZ_S]
+    torch::Tensor vertical_indices_count,  // [N_HEADS, ]
+    torch::Tensor slash_indices_count,
+    int64_t context_size,
+    int64_t block_size_M,
+    int64_t block_size_N,
+    bool causal) {
+  cudaSetDevice(q_seqlens.get_device());
+
+  int batch_size = slash_indexes.size(0);
+  int num_heads = slash_indexes.size(1);
+  int nnz_slash = slash_indexes.size(2);
+  int nnz_vertical = vertical_indexes.size(2);
+  int num_rows = (context_size + block_size_M - 1) / block_size_M;
+
+  convert_vertical_slash_indexes_64x64_mergehead(
+      q_seqlens.data_ptr<int>(),
+      kv_seqlens.data_ptr<int>(),
+      vertical_indexes.data_ptr<int>(),
+      slash_indexes.data_ptr<int>(),
+      vertical_indices_count.data_ptr<int>(),
+      slash_indices_count.data_ptr<int>(),
+      block_count.data_ptr<int>(),
+      block_offset.data_ptr<int>(),
+      column_count.data_ptr<int>(),
+      column_index.data_ptr<int>(),
+      batch_size,
+      num_heads,
+      num_rows,
+      block_size_M,
+      block_size_N,
+      nnz_vertical,
+      nnz_slash,
+      causal);
+}
diff --git a/sgl-kernel/csrc/common_extension.cc b/sgl-kernel/csrc/common_extension.cc
new file mode 100644
index 000000000..4f95c9138
--- /dev/null
+++ b/sgl-kernel/csrc/common_extension.cc
@@ -0,0 +1,481 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <torch/all.h>
+#include <torch/library.h>
+
+#include "sgl_kernel_ops.h"
+
+TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
+  /*
+   * From csrc/allreduce
+   */
+  m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  m.def("register_graph_buffers", &register_graph_buffers);
+  m.def("dispose", &dispose);
+  m.def("meta_size", &meta_size);
+  m.def("register_buffer", &register_buffer);
+
+  m.def(
+      "init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
+      "int rank, bool full_nvlink) -> int");
+  m.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
+
+  m.def(
+      "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
+      "int reg_buffer_sz_bytes) -> ()");
+  m.impl("all_reduce", torch::kCUDA, &all_reduce);
+
+  m.def("mscclpp_generate_unique_id", &mscclpp_generate_unique_id);
+  m.def(
+      "mscclpp_init_context(Tensor unique_id, int rank, int world_size, Tensor scratch, Tensor put_buffer, "
+      "int nranks_per_node, int[] rank_to_node, int[] rank_to_ib, int context_selection) -> int");
+  m.impl("mscclpp_init_context", torch::kCUDA, &mscclpp_init_context);
+
+  m.def("mscclpp_allreduce(int context, Tensor inp, Tensor! out, int nthreads, int nblocks) -> ()");
+  m.impl("mscclpp_allreduce", torch::kCUDA, &mscclpp_allreduce);
+
+  /*
+   * From csrc/attention
+   */
+  m.def(
+      "lightning_attention_decode(Tensor q, Tensor k, Tensor v, Tensor past_kv, Tensor slope, Tensor! output, Tensor! "
+      "new_kv) -> ()");
+  m.impl("lightning_attention_decode", torch::kCUDA, &lightning_attention_decode);
+  m.def("merge_state(Tensor v_a, Tensor s_a, Tensor v_b, Tensor s_b, Tensor! v_merged, Tensor! s_merged) -> ()");
+  m.impl("merge_state", torch::kCUDA, &merge_state);
+  m.def("merge_state_v2(Tensor v_a, Tensor s_a, Tensor v_b, Tensor s_b, Tensor! v_merged, Tensor! s_merged) -> ()");
+  m.impl("merge_state_v2", torch::kCUDA, &merge_state_v2);
+  m.def(
+      "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe, Tensor kv_c_and_k_pe_cache, Tensor seq_lens, Tensor "
+      "page_table, Tensor! workspace, float sm_scale, int num_kv_splits) -> ()");
+  m.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
+  m.def("cutlass_mla_get_workspace_size", &cutlass_mla_get_workspace_size);
+
+  /*
+   * From csrc/elementwise
+   */
+  m.def("rmsnorm(Tensor! output, Tensor input, Tensor weight, float eps, bool enable_pdl) -> ()");
+  m.impl("rmsnorm", torch::kCUDA, &rmsnorm);
+
+  m.def("fused_add_rmsnorm(Tensor! input, Tensor! residual, Tensor weight, float eps, bool enable_pdl) -> ()");
+  m.impl("fused_add_rmsnorm", torch::kCUDA, &sgl_fused_add_rmsnorm);
+
+  m.def("gemma_rmsnorm(Tensor! output, Tensor input, Tensor weight, float eps, bool enable_pdl) -> ()");
+  m.impl("gemma_rmsnorm", torch::kCUDA, &gemma_rmsnorm);
+
+  m.def("gemma_fused_add_rmsnorm(Tensor! input, Tensor! residual, Tensor weight, float eps, bool enable_pdl) -> ()");
+  m.impl("gemma_fused_add_rmsnorm", torch::kCUDA, &gemma_fused_add_rmsnorm);
+
+  m.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
+  m.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
+
+  m.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
+  m.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul);
+
+  m.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
+  m.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);
+
+  m.def(
+      "apply_rope_pos_ids_cos_sin_cache(Tensor q, Tensor k, Tensor! q_rope, Tensor! k_rope, Tensor cos_sin_cache, "
+      "Tensor pos_ids, bool interleave, bool enable_pdl, int cuda_stream, "
+      "Tensor? v, Tensor!? k_buffer, Tensor!? v_buffer, Tensor? kv_cache_loc) -> ()");
+  m.impl("apply_rope_pos_ids_cos_sin_cache", torch::kCUDA, &apply_rope_pos_ids_cos_sin_cache);
+
+  m.def(
+      "downcast_fp8(Tensor k, Tensor v, Tensor k_out, Tensor v_out, Tensor k_scale, Tensor v_scale, Tensor loc, int "
+      "mult, int offset, int cuda_stream) -> ()");
+  m.impl("downcast_fp8", torch::kCUDA, &downcast_fp8);
+
+  /*
+   * From csrc/gemm
+   */
+  m.def("awq_dequantize(Tensor qweight, Tensor scales, Tensor qzeros) -> Tensor");
+  m.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
+
+  m.def(
+      "int8_scaled_mm(Tensor mat_a, Tensor mat_b, Tensor scales_a, Tensor scales_b, ScalarType out_dtype, Tensor? "
+      "bias) -> Tensor");
+  m.impl("int8_scaled_mm", torch::kCUDA, &int8_scaled_mm);
+
+  m.def(
+      "fp8_scaled_mm(Tensor mat_a, Tensor mat_b, Tensor scales_a, Tensor scales_b, ScalarType out_dtype, Tensor? "
+      "bias) -> Tensor");
+  m.impl("fp8_scaled_mm", torch::kCUDA, &fp8_scaled_mm);
+
+  m.def(
+      "fp8_blockwise_scaled_mm(Tensor mat_a, Tensor mat_b, Tensor scales_a, Tensor scales_b, ScalarType out_dtype) -> "
+      "Tensor");
+  m.impl("fp8_blockwise_scaled_mm", torch::kCUDA, &fp8_blockwise_scaled_mm);
+
+  m.def(
+      "sgl_per_token_group_quant_fp8(Tensor input, Tensor output_q, Tensor output_s, int group_size,"
+      " float eps, float fp8_min, float fp8_max, bool scale_ue8m0) -> ()");
+  m.impl("sgl_per_token_group_quant_fp8", torch::kCUDA, &sgl_per_token_group_quant_fp8);
+
+  m.def(
+      "sgl_per_token_group_quant_int8(Tensor input, Tensor output_q, Tensor output_s, int group_size,"
+      " float eps, float int8_min, float int8_max) -> ()");
+  m.impl("sgl_per_token_group_quant_int8", torch::kCUDA, &sgl_per_token_group_quant_int8);
+
+  m.def("sgl_per_tensor_quant_fp8(Tensor input, Tensor output_q, Tensor output_s, bool is_static) -> ()");
+  m.impl("sgl_per_tensor_quant_fp8", torch::kCUDA, &sgl_per_tensor_quant_fp8);
+
+  m.def("sgl_per_token_quant_fp8(Tensor input, Tensor output_q, Tensor output_s) -> ()");
+  m.impl("sgl_per_token_quant_fp8", torch::kCUDA, &sgl_per_token_quant_fp8);
+
+  m.def(
+      "cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b,"
+      "                      Tensor block_scale_a, Tensor block_scale_b,"
+      "                      Tensor alpha) -> ()");
+  m.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
+
+  m.def(
+      "scaled_fp4_quant(Tensor! output, Tensor! input,"
+      "                 Tensor! output_scale, Tensor! input_scale) -> ()");
+  m.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
+
+  m.def("dsv3_fused_a_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
+  m.impl("dsv3_fused_a_gemm", torch::kCUDA, &dsv3_fused_a_gemm);
+
+  // Compute NVFP4 experts quantization.
+  m.def(
+      "scaled_fp4_experts_quant(Tensor! output, Tensor! output_scale,"
+      "Tensor input, Tensor input_global_scale, Tensor input_offset_by_experts,"
+      "Tensor output_scale_offset_by_experts) -> ()");
+  m.impl("scaled_fp4_experts_quant", torch::kCUDA, &scaled_fp4_experts_quant);
+
+  m.def(
+      "silu_and_mul_scaled_fp4_experts_quant(Tensor! output, Tensor! output_scale,"
+      "Tensor input, Tensor input_global_scale, Tensor mask, bool use_silu_and_mul) -> ()");
+  m.impl("silu_and_mul_scaled_fp4_experts_quant", torch::kCUDA, &silu_and_mul_scaled_fp4_experts_quant);
+
+  m.def(
+      "cutlass_fp4_group_mm(Tensor! output, Tensor a, Tensor b,"
+      "Tensor a_blockscale, Tensor b_blockscale, Tensor alphas,"
+      "Tensor ab_strides, Tensor c_strides, Tensor problem_sizes,"
+      " Tensor expert_offsets, Tensor sf_offsets) -> ()");
+  m.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
+
+  m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
+  m.impl("dsv3_router_gemm", torch::kCUDA, &dsv3_router_gemm);
+
+  /*
+   * From csrc/gemm/gptq
+   */
+  m.def(
+      "gptq_marlin_gemm(Tensor! a, Tensor? c_or_none,"
+      "Tensor! b_q_weight, Tensor! b_scales, Tensor? global_scale_or_none,"
+      "Tensor? b_zeros_or_none, Tensor? g_idx_or_none, Tensor? perm_or_none,"
+      "Tensor! workspace, int b_q_type_id, int size_m, int size_n, int size_k,"
+      "bool is_k_full, bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
+  m.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
+
+  m.def(
+      "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, Tensor b_gptq_scales, Tensor b_g_idx, bool "
+      "use_shuffle, int bit) -> Tensor");
+  m.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
+
+  m.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
+  m.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
+
+  m.def("gptq_marlin_repack(Tensor! b_q_weight, Tensor! perm, int size_k, int size_n, int num_bits) -> Tensor");
+  m.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
+
+  m.def("awq_marlin_repack(Tensor! b_q_weight, int size_k, int size_n, int num_bits) -> Tensor");
+  m.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
+
+  /*
+   * From csrc/moe
+   */
+  m.def(
+      "moe_align_block_size(Tensor topk_ids, int num_experts, int block_size, Tensor! sorted_token_ids, Tensor! "
+      "experts_ids, Tensor! num_tokens_post_pad, Tensor! cumsum_buffer, bool "
+      "pad_sorted_token_ids) -> ()");
+  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
+
+  m.def("topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor gating_output, bool renormalize) -> ()");
+  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
+
+  m.def(
+      "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
+      "num_fused_shared_experts, float routed_scaling_factor, bool apply_routed_scaling_factor_on_output) -> "
+      "(Tensor[])");
+  m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
+  m.def(
+      "fp8_blockwise_scaled_grouped_mm(Tensor output, Tensor a_ptrs, Tensor b_ptrs, Tensor out_ptrs, Tensor "
+      "a_scales_ptrs, Tensor b_scales_ptrs, Tensor a, Tensor b, Tensor scales_a, Tensor scales_b, Tensor "
+      "stride_a, Tensor stride_b, Tensor stride_c, Tensor layout_sfa, Tensor layout_sfb, Tensor problem_sizes, Tensor "
+      "expert_offsets, Tensor workspace) -> ()");
+  m.impl("fp8_blockwise_scaled_grouped_mm", torch::kCUDA, &fp8_blockwise_scaled_grouped_mm);
+  m.def(
+      "prepare_moe_input(Tensor topk_ids, Tensor expert_offsets, Tensor? blockscale_offsets, Tensor problem_sizes1,"
+      " Tensor problem_sizes2, Tensor input_permutation, Tensor output_permutation, int num_experts, int n, int k) -> "
+      "()");
+  m.impl("prepare_moe_input", torch::kCUDA, &prepare_moe_input);
+
+  m.def("shuffle_rows(Tensor input, Tensor dst2src_map, Tensor output) -> ()");
+  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
+  m.def("apply_shuffle_mul_sum(Tensor input, Tensor output, Tensor permutation, Tensor? factors) -> ()");
+  m.impl("apply_shuffle_mul_sum", torch::kCUDA, &apply_shuffle_mul_sum);
+
+  /*
+   * From csrc/moe/marlin_moe_wna16
+   */
+  m.def(
+      "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
+      "Tensor! b_q_weight, Tensor! b_scales, Tensor? b_zeros_or_none,"
+      "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
+      "Tensor sorted_token_ids,"
+      "Tensor! expert_ids, Tensor! num_tokens_past_padded,"
+      "Tensor! topk_weights, int moe_block_size, int top_k, "
+      "bool mul_topk_weights, bool is_ep, int b_q_type_id,"
+      "int size_m, int size_n, int size_k,"
+      "bool is_k_full, bool use_atomic_add,"
+      "bool use_fp32_reduce, bool is_zp_float) -> Tensor");
+  m.impl("moe_wna16_marlin_gemm", torch::kCUDA, &moe_wna16_marlin_gemm);
+
+  /*
+   * From csrc/moe/cutlass_moe/w4a8
+   */
+  m.def(
+      "get_cutlass_w4a8_moe_mm_data(Tensor topk_ids, Tensor! expert_offsets, "
+      "                        Tensor! problem_sizes1, Tensor! problem_sizes2, "
+      "                        Tensor! input_permutation, "
+      "                        Tensor! output_permutation, int num_experts, "
+      "                        int n, int k) -> ()");
+  m.impl("get_cutlass_w4a8_moe_mm_data", torch::kCUDA, &get_cutlass_w4a8_moe_mm_data);
+
+  m.def(
+      "cutlass_w4a8_moe_mm(Tensor! d, Tensor a, Tensor b, "
+      "               Tensor a_scales, Tensor b_scales, Tensor expert_offsets, "
+      "               Tensor problem_sizes, Tensor a_strides, "
+      "               Tensor b_strides, Tensor d_strides, Tensor s_strides,"
+      "               int chunk_size, int topk) -> ()");
+  m.impl("cutlass_w4a8_moe_mm", torch::kCUDA, &cutlass_w4a8_moe_mm);
+
+  /*
+   * From csrc/speculative
+   */
+  m.def(
+      "tree_speculative_sampling_target_only(Tensor! predicts, Tensor! accept_index, Tensor! accept_token_num, "
+      "Tensor candidates, Tensor retrive_index, Tensor retrive_next_token, Tensor retrive_next_sibling, "
+      "Tensor uniform_samples, Tensor uniform_samples_for_final_sampling, Tensor target_probs, Tensor draft_probs, "
+      "float threshold_single, float threshold_acc, "
+      "bool deterministic, int cuda_stream) -> ()");
+  m.impl("tree_speculative_sampling_target_only", torch::kCUDA, &tree_speculative_sampling_target_only);
+
+  m.def(
+      "verify_tree_greedy(Tensor! predicts, Tensor! accept_index, Tensor! accept_token_num, "
+      "Tensor candidates, Tensor retrive_index, Tensor retrive_next_token, Tensor retrive_next_sibling, "
+      "Tensor target_predict, int cuda_stream) -> ()");
+  m.impl("verify_tree_greedy", torch::kCUDA, &verify_tree_greedy);
+
+  m.def(
+      "build_tree_kernel_efficient(Tensor parent_list, Tensor selected_index, Tensor verified_seq_len, "
+      "Tensor! tree_mask, Tensor! positions, Tensor! retrive_index, Tensor! retrive_next_token, "
+      "Tensor! retrive_next_sibling, int topk, int depth, int draft_token_num, int tree_mask_mode) -> "
+      "()");
+  m.impl("build_tree_kernel_efficient", torch::kCUDA, &build_tree_kernel_efficient);
+
+  m.def(
+      "segment_packbits(Tensor x, Tensor input_indptr, Tensor output_indptr, Tensor! y, int batch_size, "
+      "int cuda_stream) -> ()");
+  m.impl("segment_packbits", torch::kCUDA, &segment_packbits);
+
+  /*
+   * From csrc/kvcacheio
+   */
+  m.def(
+      "transfer_kv_per_layer(Tensor src_k, Tensor dst_k, Tensor src_v, Tensor dst_v, Tensor src_indices, Tensor "
+      "dst_indices, int item_size, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer", torch::kCUDA, &transfer_kv_per_layer);
+  m.def(
+      "transfer_kv_per_layer_pf_lf(Tensor src_k, Tensor dst_k, Tensor src_v, Tensor dst_v, Tensor src_indices, Tensor "
+      "dst_indices, int layer_id, int item_size, int src_layout_dim, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_pf_lf", torch::kCUDA, &transfer_kv_per_layer_pf_lf);
+  m.def(
+      "transfer_kv_all_layer(Tensor src_k_layers, Tensor dst_k_layers, Tensor src_v_layers, Tensor dst_v_layers, "
+      "Tensor src_indices, Tensor dst_indices, int item_size, int num_layers, int block_quota, int "
+      "num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer", torch::kCUDA, &transfer_kv_all_layer);
+  m.def(
+      "transfer_kv_all_layer_lf_pf(Tensor src_k_layers, Tensor dst_k, Tensor src_v_layers, Tensor dst_v, "
+      "Tensor src_indices, Tensor dst_indices, int item_size, int dst_layout_dim, int num_layers, int block_quota, int "
+      "num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_lf_pf", torch::kCUDA, &transfer_kv_all_layer_lf_pf);
+  m.def(
+      "transfer_kv_per_layer_mla(Tensor src, Tensor dst, Tensor src_indices, Tensor dst_indices, int item_size, int "
+      "block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_mla", torch::kCUDA, &transfer_kv_per_layer_mla);
+  m.def(
+      "transfer_kv_per_layer_mla_pf_lf(Tensor src, Tensor dst, Tensor src_indices, Tensor dst_indices, int layer_id, "
+      "int item_size, int src_layout_dim, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_mla_pf_lf", torch::kCUDA, &transfer_kv_per_layer_mla_pf_lf);
+  m.def(
+      "transfer_kv_all_layer_mla(Tensor src_layers, Tensor dst_layers, Tensor src_indices, Tensor dst_indices, int "
+      "item_size, int num_layers, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_mla", torch::kCUDA, &transfer_kv_all_layer_mla);
+  m.def(
+      "transfer_kv_all_layer_mla_lf_pf(Tensor src_layers, Tensor dst, Tensor src_indices, Tensor dst_indices, "
+      "int item_size, int dst_layout_dim, int num_layers, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_mla_lf_pf", torch::kCUDA, &transfer_kv_all_layer_mla_lf_pf);
+  m.def(
+      "transfer_kv_direct(Tensor[] src_layers, Tensor[] dst_layers, Tensor src_indices, Tensor dst_indices, int "
+      "page_size) -> ()");
+  m.impl("transfer_kv_direct", torch::kCUDA, &transfer_kv_direct);
+  m.def(
+      "transfer_kv_per_layer_direct_pf_lf(Tensor[] src_ptrs, Tensor[] dst_ptrs, Tensor src_indices, "
+      "Tensor dst_indices, int layer_id, int page_size)->() ");
+  m.impl("transfer_kv_per_layer_direct_pf_lf", torch::kCUDA, &transfer_kv_per_layer_direct_pf_lf);
+  m.def(
+      "transfer_kv_all_layer_direct_lf_pf(Tensor[] src_ptrs, Tensor[] dst_ptrs, Tensor src_indices, "
+      "Tensor dst_indices, int page_size) ->() ");
+  m.impl("transfer_kv_all_layer_direct_lf_pf", torch::kCUDA, &transfer_kv_all_layer_direct_lf_pf);
+
+  /*
+   * From csrc/memory
+   */
+  m.def("store_kv_cache(Tensor k_cache, Tensor v_cache, Tensor out_loc, Tensor k, Tensor v) -> ()");
+  m.impl("store_kv_cache", &store_kv_cache);
+
+  /*
+   * From FlashInfer
+   */
+  m.def(
+      "bmm_fp8(Tensor A, Tensor B, Tensor! D, Tensor A_scale, Tensor B_scale, Tensor workspace_buffer, int "
+      "cublas_handle, int cuda_stream) -> ()",
+      {at::Tag::needs_fixed_stride_order});
+  m.impl("bmm_fp8", torch::kCUDA, &bmm_fp8);
+
+  m.def(
+      "min_p_sampling_from_probs(Tensor probs, Tensor output, Tensor? maybe_indices, Tensor? maybe_min_p_arr, float "
+      "min_p_val, bool deterministic, Generator? gen) -> ()");
+  m.impl("min_p_sampling_from_probs", torch::kCUDA, &min_p_sampling_from_probs);
+
+  m.def("top_k_renorm_probs(Tensor probs, Tensor! renorm_probs, Tensor? maybe_top_k_arr, int top_k_val) -> ()");
+  m.impl("top_k_renorm_probs", torch::kCUDA, &top_k_renorm_probs);
+
+  m.def("top_p_renorm_probs(Tensor probs, Tensor! renorm_probs, Tensor? maybe_top_p_arr, float top_p_val) -> ()");
+  m.impl("top_p_renorm_probs", torch::kCUDA, &top_p_renorm_probs);
+
+  m.def(
+      "top_p_sampling_from_probs(Tensor probs, Tensor output, Tensor? maybe_indices, Tensor? "
+      "maybe_top_p_arr, float top_p_val, bool deterministic, Generator? gen) -> ()");
+  m.impl("top_p_sampling_from_probs", torch::kCUDA, &top_p_sampling_from_probs);
+
+  m.def(
+      "top_k_top_p_sampling_from_probs(Tensor probs, Tensor output, Tensor? maybe_indices, Tensor? maybe_top_k_arr, "
+      "float top_k_val, Tensor? maybe_top_p_arr, float top_p_val, bool deterministic, Generator? gen) -> ()");
+  m.impl("top_k_top_p_sampling_from_probs", torch::kCUDA, &top_k_top_p_sampling_from_probs);
+
+  m.def("top_k_mask_logits(Tensor logits, Tensor mask_logits, Tensor? maybe_top_k_arr, int top_k_val) -> ()");
+  m.impl("top_k_mask_logits", torch::kCUDA, &top_k_mask_logits);
+
+  /*
+   * From Sparse Flash Attention
+   */
+  m.def(
+      "fwd_sparse(Tensor! q, Tensor k, Tensor v, "
+      "Tensor block_count, Tensor block_offset, Tensor column_count, Tensor column_index, "
+      "Tensor!? out, Tensor? alibi_slopes, "
+      "float p_dropout, float softmax_scale, bool is_causal, "
+      "float softcap, bool return_softmax, Generator? gen)"
+      "-> Tensor[]");
+  m.impl("fwd_sparse", torch::kCUDA, &flash::mha_fwd_sparse);
+
+  m.def(
+      "varlen_fwd_sparse(Tensor! q, Tensor k, Tensor v, "
+      "Tensor block_count, Tensor block_offset, Tensor column_count, Tensor column_index, "
+      "Tensor!? out, Tensor cu_seqlens_q, "
+      "Tensor cu_seqlens_k, Tensor? seqused_k, Tensor? alibi_slopes, "
+      "int max_seqlen_q, int max_seqlen_k, float p_dropout, float softmax_scale, bool zero_tensors, "
+      "bool is_causal, float softcap, bool return_softmax, "
+      "Generator? gen) -> Tensor[]");
+  m.impl("varlen_fwd_sparse", torch::kCUDA, &flash::mha_varlen_fwd_sparse);
+
+  // Sparse Attention utils
+  m.def(
+      "convert_vertical_slash_indexes("
+      "   Tensor! block_count, Tensor! block_offset, "
+      "   Tensor! column_count, Tensor! column_index, "
+      "   Tensor q_seqlens, Tensor q_seqlens, "
+      "   Tensor vertical_indexes, Tensor slash_indexes, "
+      "   int context_size, int block_size_M, int block_size_N, "
+      "   bool causal) -> ()");
+  m.impl("convert_vertical_slash_indexes", torch::kCUDA, &convert_vertical_slash_indexes);
+
+  m.def(
+      "convert_vertical_slash_indexes_mergehead("
+      "   Tensor! block_count, Tensor! block_offset, "
+      "   Tensor! column_count, Tensor! column_index, "
+      "   Tensor q_seqlens, Tensor q_seqlens, "
+      "   Tensor vertical_indexes, Tensor slash_indexes, "
+      "   Tensor vertical_indices_count, Tensor slash_indices_count, "
+      "   int context_size, int block_size_M, int block_size_N, "
+      "   bool causal) -> ()");
+  m.impl("convert_vertical_slash_indexes_mergehead", torch::kCUDA, &convert_vertical_slash_indexes_mergehead);
+
+  /*
+   * From csrc/grammar
+   */
+  m.def("apply_token_bitmask_inplace_cuda(Tensor logits, Tensor bitmask, Tensor? indices=None) -> ()");
+  m.impl("apply_token_bitmask_inplace_cuda", &ApplyTokenBitmaskInplace);
+
+  /*
+   * From csrc/gemm (QServe)
+   */
+  m.def(
+      "qserve_w4a8_per_chn_gemm(Tensor _in_feats, Tensor _kernel, Tensor _wscales, Tensor _ascales, Tensor _w_szs, "
+      "Tensor _a_ssums, Tensor! _out_feats) -> ()");
+  m.impl("qserve_w4a8_per_chn_gemm", torch::kCUDA, &qserve_w4a8_per_chn_gemm);
+
+  m.def(
+      "qserve_w4a8_per_group_gemm(Tensor _in_feats, Tensor _kernel, Tensor _zeros, Tensor _scales_i8, Tensor _wscales, "
+      "Tensor _ascales, Tensor! _out_feats) -> ()");
+  m.impl("qserve_w4a8_per_group_gemm", torch::kCUDA, &qserve_w4a8_per_group_gemm);
+
+  m.def("copy_to_gpu_no_ce(Tensor input, Tensor! output) -> ()");
+  m.impl("copy_to_gpu_no_ce", torch::kCUDA, &copy_to_gpu_no_ce);
+  m.def("concat_mla_k(Tensor! k, Tensor k_nope, Tensor k_rope) -> ()");
+  m.impl("concat_mla_k", torch::kCUDA, &concat_mla_k);
+
+  /*
+   * From csrc/mamba
+   */
+  m.def(
+      "causal_conv1d_update(Tensor! x,"
+      "Tensor! conv_state,"
+      "Tensor! weight,"
+      "Tensor? bias_,"
+      "bool silu_activation,"
+      "Tensor? cache_seqlens_,"
+      "Tensor? conv_state_indices,"
+      "int pad_slot_id) -> ()");
+  m.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
+
+  m.def(
+      "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
+      "Tensor? bias_,"
+      "Tensor!? conv_states,"
+      "Tensor? query_start_loc,"
+      "Tensor? cache_indices,"
+      "Tensor? has_initial_state,"
+      "bool silu_activation,"
+      "int pad_slot_id) -> ()");
+  m.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
+}
+
+REGISTER_EXTENSION(common_ops)
diff --git a/sgl-kernel/csrc/common_extension_rocm.cc b/sgl-kernel/csrc/common_extension_rocm.cc
new file mode 100644
index 000000000..f4e14d0d5
--- /dev/null
+++ b/sgl-kernel/csrc/common_extension_rocm.cc
@@ -0,0 +1,176 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <torch/library.h>
+
+#include "sgl_kernel_ops.h"
+
+TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
+  /*
+   * From csrc/activation
+   */
+  m.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
+  m.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
+
+  m.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
+  m.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul);
+
+  m.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
+  m.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);
+
+  m.def("gelu_quick(Tensor! out, Tensor input) -> ()");
+  m.impl("gelu_quick", torch::kCUDA, &gelu_quick);
+
+  /*
+   * From csrc/allreduce
+   */
+  m.def(
+      "init_custom_ar(Tensor meta, Tensor rank_data, "
+      "str[] handles, int[] offsets, int rank, "
+      "bool full_nvlink) -> int");
+  m.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
+
+  m.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
+  m.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);
+
+  m.def(
+      "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> "
+      "()");
+  m.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
+
+  m.def("dispose", &dispose);
+
+  m.def("meta_size", &meta_size);
+
+  m.def(
+      "register_buffer(int fa, Tensor t, str[] handles, "
+      "int[] offsets) -> ()");
+  m.impl("register_buffer", torch::kCUDA, &register_buffer);
+
+  m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  m.def("register_graph_buffers", &register_graph_buffers);
+
+  m.def("allocate_meta_buffer", &allocate_meta_buffer);
+  m.impl("allocate_meta_buffer", torch::kCUDA, &allocate_meta_buffer);
+
+  m.def("get_meta_buffer_ipc_handle", &get_meta_buffer_ipc_handle);
+  m.impl("get_meta_buffer_ipc_handle", torch::kCPU, &get_meta_buffer_ipc_handle);
+
+  // quick allreduce
+#ifdef USE_ROCM
+  m.def(
+      "qr_all_reduce(int fa, Tensor inp, Tensor out, int quant_level, bool "
+      "cast_bf2half) -> ()");
+  m.impl("qr_all_reduce", torch::kCUDA, &qr_all_reduce);
+
+  m.def("init_custom_qr", &init_custom_qr);
+  m.def("qr_destroy", &qr_destroy);
+
+  m.def("qr_get_handle", &qr_get_handle);
+
+  m.def("qr_open_handles(int _fa, Tensor[](b!) handles) -> ()");
+  m.impl("qr_open_handles", torch::kCPU, &qr_open_handles);
+
+  // Max input size in bytes
+  m.def("qr_max_size", &qr_max_size);
+#endif
+
+  /*
+   * From csrc/moe
+   */
+  m.def(
+      "moe_align_block_size(Tensor topk_ids, int num_experts, int block_size, Tensor! sorted_token_ids, Tensor! "
+      "experts_ids, Tensor! num_tokens_post_pad, Tensor! cumsum_buffer, bool "
+      "pad_sorted_token_ids) -> ()");
+  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
+
+  m.def("topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor gating_output, bool renormalize) -> ()");
+  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
+
+  /*
+   * From csrc/speculative
+   */
+  m.def(
+      "verify_tree_greedy(Tensor! predicts, Tensor! accept_index, Tensor! accept_token_num, "
+      "Tensor candidates, Tensor retrive_index, Tensor retrive_next_token, Tensor retrive_next_sibling, "
+      "Tensor target_predict, int cuda_stream) -> ()");
+  m.impl("verify_tree_greedy", torch::kCUDA, &verify_tree_greedy);
+
+  m.def(
+      "build_tree_kernel_efficient(Tensor parent_list, Tensor selected_index, Tensor verified_seq_len, "
+      "Tensor! tree_mask, Tensor! positions, Tensor! retrive_index, Tensor! retrive_next_token, "
+      "Tensor! retrive_next_sibling, int topk, int depth, int draft_token_num, int tree_mask_mode) -> "
+      "()");
+  m.impl("build_tree_kernel_efficient", torch::kCUDA, &build_tree_kernel_efficient);
+
+  /*
+   * From XGrammar
+   */
+  m.def("apply_token_bitmask_inplace_cuda(Tensor logits, Tensor bitmask, Tensor? indices=None) -> ()");
+  m.impl("apply_token_bitmask_inplace_cuda", &ApplyTokenBitmaskInplace);
+
+  /*
+   * From csrc/kvcacheio
+   */
+  m.def(
+      "transfer_kv_per_layer(Tensor src_k, Tensor dst_k, Tensor src_v, Tensor dst_v, Tensor src_indices, Tensor "
+      "dst_indices, int item_size, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer", torch::kCUDA, &transfer_kv_per_layer);
+  m.def(
+      "transfer_kv_per_layer_pf_lf(Tensor src_k, Tensor dst_k, Tensor src_v, Tensor dst_v, Tensor src_indices, Tensor "
+      "dst_indices, int layer_id, int item_size, int src_layout_dim, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_pf_lf", torch::kCUDA, &transfer_kv_per_layer_pf_lf);
+  m.def(
+      "transfer_kv_all_layer(Tensor src_k_layers, Tensor dst_k_layers, Tensor src_v_layers, Tensor dst_v_layers, "
+      "Tensor src_indices, Tensor dst_indices, int item_size, int num_layers, int block_quota, int "
+      "num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer", torch::kCUDA, &transfer_kv_all_layer);
+  m.def(
+      "transfer_kv_all_layer_lf_pf(Tensor src_k_layers, Tensor dst_k, Tensor src_v_layers, Tensor dst_v, "
+      "Tensor src_indices, Tensor dst_indices, int item_size, int dst_layout_dim, int num_layers, int block_quota, int "
+      "num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_lf_pf", torch::kCUDA, &transfer_kv_all_layer_lf_pf);
+  m.def(
+      "transfer_kv_per_layer_mla(Tensor src, Tensor dst, Tensor src_indices, Tensor dst_indices, int item_size, int "
+      "block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_mla", torch::kCUDA, &transfer_kv_per_layer_mla);
+  m.def(
+      "transfer_kv_per_layer_mla_pf_lf(Tensor src, Tensor dst, Tensor src_indices, Tensor dst_indices, int layer_id, "
+      "int item_size, int src_layout_dim, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_per_layer_mla_pf_lf", torch::kCUDA, &transfer_kv_per_layer_mla_pf_lf);
+  m.def(
+      "transfer_kv_all_layer_mla(Tensor src_layers, Tensor dst_layers, Tensor src_indices, Tensor dst_indices, int "
+      "item_size, int num_layers, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_mla", torch::kCUDA, &transfer_kv_all_layer_mla);
+  m.def(
+      "transfer_kv_all_layer_mla_lf_pf(Tensor src_layers, Tensor dst, Tensor src_indices, Tensor dst_indices, "
+      "int item_size, int dst_layout_dim, int num_layers, int block_quota, int num_warps_per_block) -> ()");
+  m.impl("transfer_kv_all_layer_mla_lf_pf", torch::kCUDA, &transfer_kv_all_layer_mla_lf_pf);
+  m.def(
+      "transfer_kv_direct(Tensor[] src_layers, Tensor[] dst_layers, Tensor src_indices, Tensor dst_indices, int "
+      "page_size) -> ()");
+  m.impl("transfer_kv_direct", torch::kCUDA, &transfer_kv_direct);
+  m.def(
+      "transfer_kv_per_layer_direct_pf_lf(Tensor[] src_ptrs, Tensor[] dst_ptrs, Tensor src_indices, "
+      "Tensor dst_indices, int layer_id, int page_size)->() ");
+  m.impl("transfer_kv_per_layer_direct_pf_lf", torch::kCUDA, &transfer_kv_per_layer_direct_pf_lf);
+  m.def(
+      "transfer_kv_all_layer_direct_lf_pf(Tensor[] src_ptrs, Tensor[] dst_ptrs, Tensor src_indices, "
+      "Tensor dst_indices, int page_size) ->() ");
+  m.impl("transfer_kv_all_layer_direct_lf_pf", torch::kCUDA, &transfer_kv_all_layer_direct_lf_pf);
+}
+
+REGISTER_EXTENSION(common_ops)
diff --git a/sgl-kernel/csrc/cpu/CMakeLists.txt b/sgl-kernel/csrc/cpu/CMakeLists.txt
new file mode 100755
index 000000000..421fbedab
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/CMakeLists.txt
@@ -0,0 +1,83 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(sgl_kernel)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+find_package(Python COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT} REQUIRED)
+
+execute_process(
+    COMMAND ${Python_EXECUTABLE}
+            -c "import torch; print(torch.utils.cmake_prefix_path)"
+    OUTPUT_VARIABLE TORCH_PY_PREFIX
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+message(STATUS ${TORCH_PY_PREFIX})
+list(APPEND CMAKE_PREFIX_PATH ${TORCH_PY_PREFIX}/Torch)
+find_package(Torch REQUIRED)
+
+include_directories(
+    ${TORCH_INCLUDE_DIRS}
+    ${TORCH_INSTALL_PREFIX}/include
+    ${Python_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../csrc
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+)
+
+# Platform-specific library directory
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
+    set(PLAT_LIB_DIR "/usr/lib/x86_64-linux-gnu")
+    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+    set(PLAT_LIB_DIR "/usr/lib/aarch64-linux-gnu")
+    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le|ppc64")
+    set(PLAT_LIB_DIR "/usr/lib/powerpc64le-linux-gnu")
+else()
+    set(PLAT_LIB_DIR "/usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu")
+endif()
+link_directories(${PLAT_LIB_DIR})
+
+# Conda library path support
+if(DEFINED ENV{CONDA_PREFIX})
+    set(CONDA_LIB_DIR "$ENV{CONDA_PREFIX}/lib")
+    message(STATUS "Using Conda lib dir: ${CONDA_LIB_DIR}")
+    link_directories(${CONDA_LIB_DIR})
+    set(CONDA_INCLUDE_DIR "$ENV{CONDA_PREFIX}/include")
+    include_directories(${CONDA_INCLUDE_DIR})
+
+    # Look for libnuma in Conda's lib directory
+    find_library(NUMA_LIB numa HINTS "${CONDA_LIB_DIR}")
+    if(NUMA_LIB)
+        message(STATUS "Found libnuma: ${NUMA_LIB}")
+    else()
+        message(FATAL_ERROR "libnuma not found in Conda environment at ${CONDA_LIB_DIR}\n"
+                            "Please install it using: conda install libnuma numactl\n")
+    endif()
+endif()
+
+file(GLOB SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")
+
+if(NOT DEFINED ENV{SGLANG_CPU_FP8_CVT_FTZ})
+    set(ENV{SGLANG_CPU_FP8_CVT_FTZ} "1")
+endif()
+
+if("$ENV{SGLANG_CPU_FP8_CVT_FTZ}" STREQUAL "1")
+    message(STATUS "Enabling macro: SGLANG_CPU_FP8_CVT_FTZ")
+    add_compile_definitions(SGLANG_CPU_FP8_CVT_FTZ)
+endif()
+
+add_compile_options(
+    -O3
+    -Wno-unknown-pragmas
+    -march=native
+    -fopenmp
+)
+
+Python_add_library(common_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SOURCES})
+target_link_libraries(common_ops PRIVATE ${TORCH_LIBRARIES} ${NUMA_LIB})
+target_include_directories(common_ops PRIVATE ${TORCH_INCLUDE_DIRS})
+
+install(TARGETS common_ops
+    LIBRARY DESTINATION sgl_kernel
+)
diff --git a/sgl-kernel/csrc/cpu/activation.cpp b/sgl-kernel/csrc/cpu/activation.cpp
new file mode 100644
index 000000000..70756776b
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/activation.cpp
@@ -0,0 +1,135 @@
+#include "common.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t, typename func_t, typename vec_func_t>
+void act_and_mul_kernel_impl(
+    scalar_t* __restrict__ output,
+    const scalar_t* __restrict__ input,
+    int64_t num_tokens,
+    int64_t dim,
+    const func_t& f,
+    const vec_func_t& vf) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  constexpr int64_t kVecSize = bVec::size();
+  at::parallel_for(0, num_tokens, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t i = begin; i < end; ++i) {
+      // local ptrs
+      const scalar_t* __restrict__ input_ptr = input + i * 2 * dim;
+      const scalar_t* __restrict__ input_other_ptr = input_ptr + dim;
+      scalar_t* __restrict__ output_ptr = output + i * dim;
+
+      int64_t d;
+#pragma GCC unroll 4
+      for (d = 0; d <= dim - kVecSize; d += kVecSize) {
+        bVec x_bvec = bVec::loadu(input_ptr + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        bVec y_bvec = bVec::loadu(input_other_ptr + d);
+        fVec y_fvec0, y_fvec1;
+        std::tie(y_fvec0, y_fvec1) = at::vec::convert_to_float(y_bvec);
+
+        x_fvec0 = vf(x_fvec0);
+        x_fvec1 = vf(x_fvec1);
+
+        x_fvec0 = x_fvec0 * y_fvec0;
+        x_fvec1 = x_fvec1 * y_fvec1;
+
+        x_bvec = convert_from_float_ext<scalar_t>(x_fvec0, x_fvec1);
+        x_bvec.store(output_ptr + d);
+      }
+#pragma GCC unroll 4
+      for (; d < dim; ++d) {
+        float x_val = static_cast<float>(input_ptr[d]);
+        float y_val = static_cast<float>(input_other_ptr[d]);
+        output_ptr[d] = f(x_val) * y_val;
+      }
+    }
+  });
+}
+
+}  // anonymous namespace
+
+// input   : {num_tokens, 2 * d}
+// output  : {num_tokens, d}
+at::Tensor silu_and_mul_cpu(at::Tensor& input) {
+  RECORD_FUNCTION("sgl-kernel::silu_and_mul_cpu", std::vector<c10::IValue>({input}));
+  auto sizes = input.sizes().vec();
+  int64_t last_dim = input.ndimension() - 1;
+  int64_t d = sizes[last_dim] / 2;
+  sizes[last_dim] = d;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  at::Tensor out = at::empty(sizes, input.options());
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "silu_and_mul", [&] {
+    using Vec = at::vec::Vectorized<float>;
+    act_and_mul_kernel_impl(
+        out.data_ptr<scalar_t>(),
+        input.data_ptr<scalar_t>(),
+        num_tokens,
+        d,
+        [](float x) { return x / (1.f + std::exp(-x)); },
+        [](Vec x) { return x / (Vec(1.f) + x.neg().exp()); });
+  });
+  return out;
+}
+
+at::Tensor gelu_tanh_and_mul_cpu(const at::Tensor& input) {
+  RECORD_FUNCTION("sgl-kernel::gelu_tanh_and_mul_cpu", std::vector<c10::IValue>({input}));
+  auto sizes = input.sizes().vec();
+  int64_t last_dim = input.ndimension() - 1;
+  int64_t d = sizes[last_dim] / 2;
+  sizes[last_dim] = d;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  at::Tensor out = at::empty(sizes, input.options());
+  const float sqrt_2_div_pi = std::sqrt(2.f / M_PI);
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "gelu_tanh_and_mul", [&] {
+    using Vec = at::vec::Vectorized<float>;
+    act_and_mul_kernel_impl(
+        out.data_ptr<scalar_t>(),
+        input.data_ptr<scalar_t>(),
+        num_tokens,
+        d,
+        [sqrt_2_div_pi](float x) {
+          float x3 = x * x * x;
+          float tanh_arg = sqrt_2_div_pi * (x + 0.044715f * x3);
+          return 0.5f * x * (1.f + std::tanh(tanh_arg));
+        },
+        [sqrt_2_div_pi](Vec x) {
+          Vec x3 = x * x * x;
+          Vec tanh_arg = Vec(sqrt_2_div_pi) * (x + Vec(0.044715f) * x3);
+          return Vec(0.5f) * x * (Vec(1.f) + tanh_arg.tanh());
+        });
+  });
+
+  return out;
+}
+
+at::Tensor gelu_and_mul_cpu(const at::Tensor& input) {
+  RECORD_FUNCTION("sgl-kernel::gelu_and_mul_cpu", std::vector<c10::IValue>({input}));
+  auto sizes = input.sizes().vec();
+  int64_t last_dim = input.ndimension() - 1;
+  int64_t d = sizes[last_dim] / 2;
+  sizes[last_dim] = d;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  at::Tensor out = at::empty(sizes, input.options());
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul", [&] {
+    using Vec = at::vec::Vectorized<float>;
+    const float inv_sqrt2 = 1.0f / std::sqrt(2.0f);
+    act_and_mul_kernel_impl(
+        out.data_ptr<scalar_t>(),
+        input.data_ptr<scalar_t>(),
+        num_tokens,
+        d,
+        [inv_sqrt2](float x) { return 0.5f * x * (1.f + std::erf(x * inv_sqrt2)); },
+        [inv_sqrt2](Vec x) { return Vec(0.5f) * x * (Vec(1.f) + (x * Vec(inv_sqrt2)).erf()); });
+  });
+
+  return out;
+}
diff --git a/sgl-kernel/csrc/cpu/bmm.cpp b/sgl-kernel/csrc/cpu/bmm.cpp
new file mode 100644
index 000000000..9e809a464
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/bmm.cpp
@@ -0,0 +1,123 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t>
+void bmm_kernel_impl(
+    scalar_t* __restrict__ out,
+    const scalar_t* __restrict__ mat1,
+    const scalar_t* __restrict__ mat2,
+    int64_t B,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t mat1_strideB,
+    int64_t mat1_strideM,
+    int64_t out_strideB,
+    int64_t out_strideM,
+    float scale = 0.f) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  // mat2 contiguous in [B, N, K]
+  int64_t mat2_strideB = N * K;
+  int64_t mat2_strideN = K;
+
+  const bool use_brgemm = can_use_brgemm<scalar_t>(M);
+
+  // parallel on [B, MB, NB]
+  at::parallel_for(0, B * MB * NB, 0, [&](int64_t begin, int64_t end) {
+    int64_t bs{0}, mb{0}, nb{0};
+    data_index_init(begin, bs, B, mb, MB, nb, NB);
+
+    // for brgemm, use float32 for accumulate
+    alignas(64) float Ctmp[BLOCK_M * BLOCK_N];
+
+    for (int i = begin; i < end; ++i) {
+      UNUSED(i);
+      int mb_start = mb * BLOCK_M;
+      int mb_size = std::min(M - mb_start, BLOCK_M);
+      int nb_start = nb * BLOCK_N;
+      int nb_size = std::min(N - nb_start, BLOCK_N);
+
+      tinygemm_kernel<scalar_t>(
+          /*   A */ mat1 + bs * mat1_strideB + mb_start * mat1_strideM,
+          /*   B */ mat2 + bs * mat2_strideB + nb_start * mat2_strideN /* nb * BLOCK_N * K */,
+          /*   C */ out + bs * out_strideB + mb_start * out_strideM + nb_start,
+          /* Ctmp*/ Ctmp,
+          /*   M */ mb_size,
+          /*   N */ nb_size,
+          /*   K */ K,
+          /* lda */ mat1_strideM,
+          /* ldb */ nb_size,
+          /* ldc */ out_strideM,
+          /* brg */ use_brgemm);
+
+      // move to the next index
+      data_index_step(bs, B, mb, MB, nb, NB);
+    }
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+}
+
+}  // anonymous namespace
+
+// mat1 : [B, M, K]
+// mat2 : [B, N, K] or [B, OC, IC]
+// out  : [B, M, N]
+// scale: [] 0-dim tensor for per tensor quant
+//
+void bmm_cpu(
+    at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni, const std::optional<at::Tensor>& scale) {
+  RECORD_FUNCTION("sgl-kernel::bmm_cpu", std::vector<c10::IValue>({out, mat1, mat2}));
+
+  auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
+
+  // input and out could be non-contiguous
+  // weight needs to be contiguous in [OC, IC] order
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(out);
+  CHECK_INPUT(mat2);
+  CHECK_DIM(3, out);
+  CHECK_DIM(3, mat1);
+  CHECK_DIM(3, mat2);
+
+  int64_t B = mat1.size(0);
+  int64_t M = mat1.size(1);
+  int64_t N = mat2.size(1);
+  int64_t K = mat1.size(2);
+
+  TORCH_CHECK(!scale.has_value(), "bmm: do not support fp8 weight for now.")
+  TORCH_CHECK(N % 32 == 0, "tinygemm requires N to be 32x.");
+
+  int64_t mat1_strideB = mat1.stride(0);
+  int64_t mat1_strideM = mat1.stride(1);
+  int64_t out_strideB = out.stride(0);
+  int64_t out_strideM = out.stride(1);
+
+  // check shapes
+  TORCH_CHECK(mat2.size(0) == B && mat2.size(2) == K, "bmm: mat2 shape mismatch!");
+  TORCH_CHECK(out.size(0) == B && out.size(1) == M, "bmm: out shape mismatch!");
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(mat1.scalar_type(), "bmm_kernel_impl", [&] {
+    bmm_kernel_impl<scalar_t>(
+        out.data_ptr<scalar_t>(),
+        mat1.data_ptr<scalar_t>(),
+        packed_w.data_ptr<scalar_t>(),
+        B,
+        M,
+        N,
+        K,
+        mat1_strideB,
+        mat1_strideM,
+        out_strideB,
+        out_strideM);
+  });
+}
diff --git a/sgl-kernel/csrc/cpu/common.h b/sgl-kernel/csrc/cpu/common.h
new file mode 100644
index 000000000..0fb132607
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/common.h
@@ -0,0 +1,324 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/Parallel.h>
+#include <ATen/record_function.h>
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace {
+
+// dispatch bool
+#define AT_DISPATCH_BOOL(BOOL_V, BOOL_NAME, ...) \
+  [&] {                                          \
+    if (BOOL_V) {                                \
+      constexpr bool BOOL_NAME = true;           \
+      return __VA_ARGS__();                      \
+    } else {                                     \
+      constexpr bool BOOL_NAME = false;          \
+      return __VA_ARGS__();                      \
+    }                                            \
+  }()
+
+// dispatch: bfloat16, float16, int8_t, fp8_e4m3
+#define CPU_DISPATCH_PACKED_TYPES(TYPE, ...)                     \
+  [&] {                                                          \
+    switch (TYPE) {                                              \
+      case at::ScalarType::BFloat16: {                           \
+        using packed_t = at::BFloat16;                           \
+        return __VA_ARGS__();                                    \
+      }                                                          \
+      case at::ScalarType::Half: {                               \
+        using packed_t = at::Half;                               \
+        return __VA_ARGS__();                                    \
+      }                                                          \
+      case at::ScalarType::Char: {                               \
+        using packed_t = int8_t;                                 \
+        return __VA_ARGS__();                                    \
+      }                                                          \
+      case at::ScalarType::Float8_e4m3fn: {                      \
+        using packed_t = at::Float8_e4m3fn;                      \
+        return __VA_ARGS__();                                    \
+      }                                                          \
+      default:                                                   \
+        TORCH_CHECK(false, "Unsupported floating data type.\n"); \
+    }                                                            \
+  }()
+
+// dispatch with mixed dtypes (TYPE1, TYPE2):
+//   TYPE1: the primary dtype (input, output, weight);
+//   TYPE2: the secondary dtype (bias, etc.).
+#define CPU_DISPATCH_REDUCED_FLOATING_TYPES_EXT(TYPE1, TYPE2, ...) \
+  [&] {                                                            \
+    if (TYPE2 == at::kFloat) {                                     \
+      switch (TYPE1) {                                             \
+        case at::ScalarType::BFloat16: {                           \
+          using scalar_t = at::BFloat16;                           \
+          using param_t = float;                                   \
+          return __VA_ARGS__();                                    \
+        }                                                          \
+        case at::ScalarType::Half: {                               \
+          using scalar_t = at::Half;                               \
+          using param_t = float;                                   \
+          return __VA_ARGS__();                                    \
+        }                                                          \
+        default:                                                   \
+          TORCH_CHECK(false, "Unsupported floating data type.\n"); \
+      }                                                            \
+    } else {                                                       \
+      TORCH_CHECK(TYPE1 == TYPE2);                                 \
+      switch (TYPE1) {                                             \
+        case at::ScalarType::BFloat16: {                           \
+          using scalar_t = at::BFloat16;                           \
+          using param_t = at::BFloat16;                            \
+          return __VA_ARGS__();                                    \
+        }                                                          \
+        case at::ScalarType::Half: {                               \
+          using scalar_t = at::Half;                               \
+          using param_t = at::Half;                                \
+          return __VA_ARGS__();                                    \
+        }                                                          \
+        default:                                                   \
+          TORCH_CHECK(false, "Unsupported floating data type.\n"); \
+      }                                                            \
+    }                                                              \
+  }()
+
+#define UNUSED(x) (void)(x)
+
+#define CHECK_CPU(x) TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
+
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_LAST_DIM_CONTIGUOUS(x) \
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
+
+#define CHECK_INPUT(x) \
+  CHECK_CPU(x);        \
+  CHECK_CONTIGUOUS(x)
+#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \
+  CHECK_CPU(x);                            \
+  CHECK_LAST_DIM_CONTIGUOUS(x)
+
+#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")
+
+#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)
+
+// [NB] Parallel Routines
+//
+//  * at::parallel_for - applies for most of generic use cases, this will be compiled
+//                       against openmp in default torch release.
+//
+//  * parallel_for     - same function as above, can choose payload partition scheme in
+//                       balance211.
+//
+//  * parallel_2d      - parallel for 2 dimensions, used in GEMM, etc.
+//                       this one will do payload balance across 2 dimensions.
+//
+
+// grain size for each thread
+constexpr int GRAIN_SIZE = 1024;
+
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+inline T div_up(T x, T y) {
+  return (x + y - 1) / y;
+}
+
+// you can only use at::get_thread_num() with at::parallel_for()
+// as it is lazy initialized, otherwise it will always return 0.
+inline int get_thread_num() {
+#if defined(_OPENMP)
+  return omp_get_thread_num();
+#else
+  return 0;
+#endif
+}
+
+// balance payload across each thread
+template <typename T>
+inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
+#if 0
+    // onednn partition pattern
+    T& n_my = n_end;
+    if (nth <= 1 || n == 0) {
+        n_start = 0;
+        n_my = n;
+    } else {
+        T n1 = div_up(n, nth);
+        T n2 = n1 - 1;
+        T T1 = n - n2 * nth;
+        n_my = ith < T1 ? n1 : n2;
+        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
+    }
+    n_end += n_start;
+#else
+  // pytorch aten partition pattern
+  T n_my = div_up(n, nth);
+  n_start = ith * n_my;
+  n_end = std::min(n_start + n_my, n);
+#endif
+}
+
+template <typename func_t>
+inline void parallel_for(int n, const func_t& f) {
+#if defined(_OPENMP)
+#pragma omp parallel
+  {
+    int nth = omp_get_num_threads();
+    int ith = omp_get_thread_num();
+    int tbegin, tend;
+    balance211(n, nth, ith, tbegin, tend);
+    f(tbegin, tend);
+  }
+#else
+  f(0, n);
+#endif
+}
+
+// for 1d parallel, use `actual_nth`
+// for 2d parallel, use even nths, e.g. 43->42
+int inline adjust_num_threads(int m) {
+  int actual_nth = at::get_num_threads();
+  if (m == 1) {
+    return actual_nth;
+  }
+  return std::max(1, (actual_nth >> 1) * 2);
+}
+
+template <typename func_t>
+inline void parallel_2d(int m, int n, const func_t& f) {
+  // make sure we have even num_threads
+  int nth = adjust_num_threads(m);
+
+  // [NOTE] thread blocking:
+  //
+  //   1) prefer square block per thread
+  //   2) use even number of CPU cores
+  //   3) use all `num_threads` cores
+  //
+  //   we have:
+  //     TM * TN = T
+  //     BM / TM = BN / TN
+  //   then:
+  //     TM = ((BM / BN) * T) ^ 0.5
+  //
+  float r = float(m) / n;
+  int nth_m = std::ceil(std::sqrt(r * nth));
+  int nth_n = 1;
+  for (; nth_m > 0; --nth_m) {
+    nth_n = nth / nth_m;
+    if (nth_m * nth_n == nth) {
+      break;
+    }
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel num_threads(nth)
+  {
+    int ith = omp_get_thread_num();
+    int ith_m = ith / nth_n;
+    int ith_n = ith % nth_n;
+
+    int thread_block_m = div_up(m, nth_m);
+    int thread_block_n = div_up(n, nth_n);
+
+    int begin_m = ith_m * thread_block_m;
+    int end_m = std::min(m, begin_m + thread_block_m);
+    int begin_n = ith_n * thread_block_n;
+    int end_n = std::min(n, begin_n + thread_block_n);
+
+    f(begin_m, end_m, begin_n, end_n);
+  }
+#else
+  f(0, m, 0, n);
+#endif
+}
+
+// limit max cache blocks
+// when we need to do pre-unpack for weights, e.g. fp8
+#define MAX_CACHE_BLOCK_SIZE 4
+
+template <typename T>
+inline int get_cache_blocks(int chunk_size) {
+  // L2 2MB and ratio of 50%
+  const int L2_size = 2048 * 1024 >> 1;
+  return std::max(1, int(L2_size / (chunk_size * sizeof(T))));
+}
+
+template <>
+inline int get_cache_blocks<at::Float8_e4m3fn>(int chunk_size) {
+  // fp8 uses bf16 as accumulate type
+  int cache_block_size = get_cache_blocks<at::BFloat16>(chunk_size);
+  return std::min(MAX_CACHE_BLOCK_SIZE, cache_block_size);
+}
+
+// 2d sequential loop in range : [mb0, mb1), [nb0, nb1)
+template <typename T, typename func_t>
+inline void loop_2d(int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1, int64_t chunk_size, const func_t& f) {
+  // get number of blocks for L2 in most inner loop
+  int64_t cache_blocks_nb = get_cache_blocks<T>(chunk_size);
+
+  // loop order: [NB / cache_blocks_nb, MB, cache_blocks_nb]
+  // TODO: implement reverse order of [MB / cache_blocks_mb, NB, cache_blocks_mb]
+  for (int64_t nbb = nb0; nbb < nb1; nbb += cache_blocks_nb) {
+    for (int64_t mb = mb0; mb < mb1; ++mb) {
+      for (int64_t nb = nbb; nb < std::min(nbb + cache_blocks_nb, nb1); ++nb) {
+        f(mb, nb, nb - nbb);
+      }
+    }
+  }
+}
+
+// data indexing for dimension collapse
+template <typename T>
+inline T data_index_init(T offset) {
+  return offset;
+}
+
+template <typename T, typename... Args>
+inline T data_index_init(T offset, T& x, const T& X, Args&&... args) {
+  offset = data_index_init(offset, std::forward<Args>(args)...);
+  x = offset % X;
+  return offset / X;
+}
+
+inline bool data_index_step() {
+  return true;
+}
+
+template <typename T, typename... Args>
+inline bool data_index_step(T& x, const T& X, Args&&... args) {
+  if (data_index_step(std::forward<Args>(args)...)) {
+    x = ((x + 1) == X) ? 0 : (x + 1);
+    return x == 0;
+  }
+  return false;
+}
+
+// forced unroll for perf critical path
+
+#if __has_attribute(always_inline)
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
+#else
+#define ALWAYS_INLINE inline
+#endif
+
+template <int n>
+struct Unroll {
+  template <typename Func, typename... Args>
+  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
+    Unroll<n - 1>{}(f, args...);
+    f(std::integral_constant<int, n - 1>{}, args...);
+  }
+};
+
+template <>
+struct Unroll<1> {
+  template <typename Func, typename... Args>
+  ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
+    f(std::integral_constant<int, 0>{}, args...);
+  }
+};
+
+}  // anonymous namespace
diff --git a/sgl-kernel/csrc/cpu/decode.cpp b/sgl-kernel/csrc/cpu/decode.cpp
new file mode 100644
index 000000000..ae5ac51c8
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/decode.cpp
@@ -0,0 +1,1575 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+// [NOTE] TODO list for this kernel:
+//   1. tune the value for BLOCK_N
+//   2. planning for {batches, num_heads, num_kv_splits}
+//      and use actual num_kv_splits for small seq length
+//   3. try fast impl of `.tanh()`
+//   4. provide amx kernel for index_gemm_kernel_nn when M = 16
+//
+
+#if defined(CPU_CAPABILITY_AVX512)
+// key: from [N, 32] to [32/2, N, 2]
+// val: from [N, 32] to [N/2, 32, 2]
+template <typename scalar_t, typename index_t>
+inline void pack_vnni_Nx32(
+    scalar_t* __restrict__ dst0,
+    scalar_t* __restrict__ dst1,
+    const scalar_t* __restrict__ src,
+    const index_t* __restrict__ ind,
+    int N,
+    int ld_src,
+    int ld_dst0,
+    int ld_dst1,
+    bool convert_v) {
+  __m512i vinputs[16];
+  int n = 0;
+  for (; n < N; ++n) {
+    vinputs[n] = _mm512_loadu_si512(src + ind[n] * ld_src);
+  }
+  // padding with zero to avoid uninitialized vectors
+  for (; n < 16; ++n) {
+    vinputs[n] = _mm512_set1_epi32(0);
+  }
+
+  // pack value, skip 64 elems for deepseek
+  // handle 2 vectors at a time from [2, 32] to [32, 2]
+  if (convert_v) {
+    for (int n = 0; n < 16; n += 2) {
+      __m512i d0, d1;
+      std::tie(d0, d1) = transpose_2x32_16bit(vinputs[n], vinputs[n + 1]);
+      _mm512_storeu_si512(dst1 + (n >> 1) * ld_dst1 * 2, d0);
+      _mm512_storeu_si512(dst1 + (n >> 1) * ld_dst1 * 2 + 32, d1);
+    }
+  }
+
+  // pack key
+  transpose_16x16_32bit(vinputs);
+
+  const __mmask16 vmask = (1 << N) - 1;
+  for (int k = 0; k < 16; ++k) {
+    _mm512_mask_storeu_epi32(dst0 + k * ld_dst0 * 2, vmask, vinputs[k]);
+  }
+}
+#endif
+
+// [NOTE]: MLA vnni format conversion
+//
+//  here we apply same strategy as `FlashMLA`:
+//  each kv_cache is loaded once and packed twice (L2 cache hit)
+//
+//  * for   key: from [N, K/2, 2] to [K/2, N, 2]
+//  * for value: from [N/2, 2, Kv] to [N/2, Kv, 2]
+//
+template <typename scalar_t, typename index_t>
+void pack_vnni(
+    scalar_t* __restrict__ dst0,
+    scalar_t* __restrict__ dst1,
+    const scalar_t* __restrict__ src,
+    const index_t* __restrict__ ind,
+    int N,
+    int K,
+    int Kv,
+    int ld_src,
+    int ld_dst0,
+    int ld_dst1) {
+#if defined(CPU_CAPABILITY_AVX512)
+  const int NB = div_up(N, 16);
+  const int KB = K / 32;    // no remainder
+  const int KBv = Kv / 32;  // no remainder
+
+  for (int nb = 0; nb < NB; ++nb) {
+    for (int kb = 0; kb < KB; ++kb) {
+      // handle 16x512bits each block
+      int nb_size = std::min(N - nb * 16, 16);
+      pack_vnni_Nx32<scalar_t, index_t>(
+          /*    dst0 */ dst0 + ((kb * 32) >> 1) * ld_dst0 * 2 + nb * 16 * 2,
+          /*    dst1 */ dst1 + ((nb * 16) >> 1) * ld_dst1 * 2 + kb * 32 * 2,
+          /*     src */ src + kb * 32,
+          /*     ind */ ind + nb * 16,
+          /*       N */ nb_size,
+          /*  ld_src */ ld_src,
+          /* ld_dst0 */ ld_dst0,
+          /* ld_dst1 */ ld_dst1,
+          /*   cvt_v */ kb < KBv);
+    }
+  }
+#else
+  for (int n = 0; n < N; ++n) {
+    index_t index = ind[n];
+    for (int k = 0; k < K / 2; ++k) {
+      for (int d = 0; d < 2; ++d) {
+        dst0[k * ld_dst0 * 2 + n * 2 + d] = src[index * ld_src + k * 2 + d];
+      }
+    }
+  }
+  // from [N/2, 2, K] to [N/2, K, 2]
+  for (int n = 0; n < (N >> 1) * 2; n += 2) {
+    index_t index0 = ind[n + 0];
+    index_t index1 = ind[n + 1];
+    for (int k = 0; k < Kv; ++k) {
+      dst1[(n >> 1) * ld_dst1 * 2 + k * 2 + 0] = src[index0 * ld_src + k];
+      dst1[(n >> 1) * ld_dst1 * 2 + k * 2 + 1] = src[index1 * ld_src + k];
+    }
+  }
+  if (N % 2 != 0) {
+    index_t index = ind[N - 1];
+    for (int k = 0; k < Kv; ++k) {
+      dst1[(N >> 1) * ld_dst1 * 2 + k * 2 + 0] = src[index * ld_src + k];
+      dst1[(N >> 1) * ld_dst1 * 2 + k * 2 + 1] = 0;
+    }
+  }
+#endif
+}
+
+template <typename scalar_t>
+inline void fill_stub(scalar_t* __restrict__ out, float val, int64_t size) {
+  using Vec = at::vec::Vectorized<scalar_t>;
+  constexpr int kVecSize = Vec::size();
+  const Vec data_vec = Vec(static_cast<scalar_t>(val));
+  int64_t d = 0;
+#pragma GCC unroll 4
+  for (; d <= size - kVecSize; d += kVecSize) {
+    data_vec.store(out + d);
+  }
+  if (size - d > 0) {
+    data_vec.store(out + d, size - d);
+  }
+}
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ acc, float s, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec s_fvec = fVec(s);
+  int64_t d = 0;
+#pragma GCC unroll 4
+  for (; d <= size - kVecSize; d += kVecSize) {
+    fVec a_fvec0 = fVec::loadu(acc + d) * s_fvec;
+    fVec a_fvec1 = fVec::loadu(acc + d + fVec::size()) * s_fvec;
+    bVec out_bvec = convert_from_float_ext<scalar_t>(a_fvec0, a_fvec1);
+    out_bvec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(acc[d] * s);
+  }
+}
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ src, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  constexpr int kVecSize = bVec::size();
+  int64_t d = 0;
+#pragma GCC unroll 4
+  for (; d <= size - kVecSize; d += kVecSize) {
+    bVec out_bvec = bVec::loadu(src + d);
+    out_bvec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = src[d];
+  }
+}
+
+template <typename scalar_t, int BLOCK_N>
+inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input) {
+  static_assert(BLOCK_N % 32 == 0);
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  constexpr int COLS = BLOCK_N / 16;
+  auto store = [&](auto i) {
+    constexpr int col = i % COLS;
+    // for COLS = 2, 4 use 512bit store
+    if constexpr (col % 2 == 0) {
+      fVec a_fvec0 = fVec::loadu(input + col * 16);
+      fVec a_fvec1 = fVec::loadu(input + col * 16 + 16);
+      bVec out_bvec = convert_from_float_ext<scalar_t>(a_fvec0, a_fvec1);
+      out_bvec.store(out + col * 16);
+    }
+  };
+  Unroll<COLS>{}(store);
+}
+
+// GEMM handles query @ key (indexed) x scale
+//   A : [M, K]
+//   B : [N, K] indexed
+//   C : [M, N]
+//
+template <typename scalar_t, typename index_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nt {
+  static inline void apply(
+      const scalar_t* __restrict__ A,
+      const scalar_t* __restrict__ B,
+      float* __restrict__ C,
+      const index_t* __restrict__ indices,
+      float scale,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc,
+      int64_t K,
+      int64_t max_tokens) {
+    for (int64_t m = 0; m < BLOCK_M; ++m) {
+      for (int64_t n = 0; n < BLOCK_N; ++n) {
+        float sum = 0.f;
+        int64_t b_idx = indices[n];
+        TORCH_CHECK(b_idx < max_tokens, "token index out of scope!");
+        for (int64_t k = 0; k < K; ++k) {
+          sum += scale * static_cast<float>(A[m * lda + k]) * static_cast<float>(B[b_idx * ldb + k]);
+        }
+        C[m * ldc + n] = sum;
+      }
+    }
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <typename index_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nt<at::BFloat16, index_t, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const at::BFloat16* __restrict__ A,
+      const at::BFloat16* __restrict__ B,
+      float* __restrict__ C,
+      const index_t* __restrict__ indices,
+      float scale,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc,
+      int64_t K,
+      int64_t max_tokens) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N;
+
+    __m512bh va;
+    __m512bh vb[COLS];
+    __m512 vc[ROWS * COLS];
+    __m512 vscale = _mm512_set1_ps(scale);
+
+    auto loadc = [&](auto i) { vc[i] = _mm512_setzero_ps(); };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    // for main loop
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = (__m512bh)(_mm512_loadu_si512(A + row * lda + k));
+      }
+      if constexpr (row == 0) {
+        if constexpr (col + 1 < COLS) {
+          int64_t b_idx_prefetch = indices[col + 1];
+          _mm_prefetch(B + b_idx_prefetch * ldb + k, _MM_HINT_T0);
+        }
+        int64_t b_idx = indices[col];
+        TORCH_CHECK(b_idx < max_tokens, "token index out of scope!");
+        vb[col] = (__m512bh)(_mm512_loadu_si512(B + b_idx * ldb + k));
+      }
+      vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]);
+    };
+
+    // for remainder
+    auto compute2 = [&](auto i, int64_t k, __mmask32 mask) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = (__m512bh)(_mm512_maskz_loadu_epi16(mask, A + row * lda + k));
+      }
+      if constexpr (row == 0) {
+        int64_t b_idx = indices[col];
+        TORCH_CHECK(b_idx < max_tokens, "token index out of scope!");
+        vb[col] = (__m512bh)(_mm512_maskz_loadu_epi16(mask, B + b_idx * ldb + k));
+      }
+      vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]);
+    };
+
+    int64_t k = 0;
+    for (; k <= K - 32; k += 32) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+    int64_t count = K - k;
+    if (count > 0) {
+      __mmask32 mask = (1ULL << count) - 1;
+      Unroll<ROWS * COLS>{}(compute2, k, mask);
+    }
+
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+      C[row * ldc + col] = _mm512_reduce_add_ps(_mm512_mul_ps(vc[i], vscale));
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_NT(MB_SIZE, NB_SIZE)               \
+  tinygemm_kernel_nt<scalar_t, index_t, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda, B, C + mb_start * ldc + nb_start, indices + nb_start, scale, lda, ldb, ldc, K, max_tokens);
+
+// this is used when N isn't multiple of 16,
+// N corresponds to `head_size_v` which should be 16x
+template <typename scalar_t, typename index_t>
+inline void tinygemm_kernel_nn_scalar(
+    const float* __restrict__ A,
+    const scalar_t* __restrict__ B,
+    float* __restrict__ C,
+    const index_t* __restrict__ indices,
+    const float* __restrict__ scale,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    int64_t max_tokens) {
+  for (int64_t m = 0; m < M; ++m) {
+    for (int64_t n = 0; n < N; ++n) {
+      C[m * ldc + n] *= scale[m];
+      for (int64_t k = 0; k < K; ++k) {
+        int64_t b_idx = indices[k];
+        TORCH_CHECK(b_idx < max_tokens, "token index out of scope!");
+        C[m * ldc + n] += A[m * lda + k] * static_cast<float>(B[b_idx * ldb + n]);
+      }
+    }
+  }
+}
+
+// GEMM handles v' * scale + attn @ value (indexed)
+//   A : [M, K]
+//   B : [K, N] indexed
+//   C ：[M, N]
+//
+template <typename scalar_t, typename index_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn {
+  static inline void apply(
+      const float* __restrict__ A,
+      const scalar_t* __restrict__ B,
+      float* __restrict__ C,
+      const index_t* __restrict__ indices,
+      const float* __restrict__ scale,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc,
+      int64_t K,
+      int64_t max_tokens) {
+    tinygemm_kernel_nn_scalar(A, B, C, indices, scale, BLOCK_M, BLOCK_N, K, lda, ldb, ldc, max_tokens);
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <typename index_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn<at::BFloat16, index_t, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const float* __restrict__ A,
+      const at::BFloat16* __restrict__ B,
+      float* __restrict__ C,
+      const index_t* __restrict__ indices,
+      const float* __restrict__ scale,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc,
+      int64_t K,
+      int64_t max_tokens) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+
+    __m512 va;
+    __m512 vb[COLS];
+    __m512 vc[ROWS * COLS];
+    __m512 vscale;
+
+    auto loadc = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+      if constexpr (col == 0) {
+        vscale = _mm512_set1_ps(scale[row]);
+      }
+#pragma GCC diagnostic pop
+      vc[i] = _mm512_loadu_ps(C + row * ldc + col * 16);
+      vc[i] = _mm512_mul_ps(vc[i], vscale);
+    };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = _mm512_set1_ps(A[row * lda + k]);
+      }
+      if constexpr (row == 0) {
+        if (k + 1 < K) {
+          int64_t b_idx_prefetch = indices[k + 1];
+          _mm_prefetch(B + b_idx_prefetch * ldb + col * 16, _MM_HINT_T0);
+        }
+        int64_t b_idx = indices[k];
+        TORCH_CHECK(b_idx < max_tokens, "token index out of scope!");
+
+        // for COLS = 2, 4, 6, 8 use 512 bit load
+        // for COLS = 1, 3, 5, 7 use 256 bit load
+        if constexpr (COLS % 2 == 0) {
+          if constexpr (col % 2 == 0) {
+            __m512i b16 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(B + b_idx * ldb + col * 16));
+            vb[col + 0] = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(b16, 0));
+            vb[col + 1] = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(b16, 1));
+          }
+        } else {
+          __m256i b16 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(B + b_idx * ldb + col * 16));
+          vb[col] = CVT_BF16_TO_FP32(b16);
+        }
+      }
+      vc[i] = _mm512_fmadd_ps(va, vb[col], vc[i]);
+    };
+
+    for (int64_t k = 0; k < K; ++k) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+      _mm512_storeu_ps(C + row * ldc + col * 16, vc[i]);
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE)               \
+  tinygemm_kernel_nn<scalar_t, index_t, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda,                                         \
+      B + nb_start,                                               \
+      C + mb_start * ldc + nb_start,                              \
+      indices,                                                    \
+      scale + mb_start,                                           \
+      lda,                                                        \
+      ldb,                                                        \
+      ldc,                                                        \
+      K,                                                          \
+      max_tokens);
+
+template <typename scalar_t, typename index_t>
+void index_gemm_kernel_nt(
+    const scalar_t* __restrict__ A,
+    const scalar_t* __restrict__ B,
+    float* __restrict__ C,
+    const index_t* __restrict__ indices,
+    float scale,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    int64_t max_tokens) {
+  // pattern: 1-8-8
+  if (M == 1) {
+    constexpr int64_t BLOCK_N = 8;
+    const int64_t NB = div_up(N, BLOCK_N);
+    int64_t mb_start = 0, lda = 1, ldc = 1;
+
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (nb_size) {
+        case 1:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 1);
+          break;
+        case 2:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 2);
+          break;
+        case 3:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 3);
+          break;
+        case 4:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 4);
+          break;
+        case 5:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 5);
+          break;
+        case 6:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 6);
+          break;
+        case 7:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 7);
+          break;
+        case 8:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 8);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, 1x", "nb_size");
+      }
+    }
+    return;
+  }
+
+  // pattern: 1-6-24
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 6;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  for (int64_t mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size) {
+        // mb_size = 1
+        case 0x11:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 1);
+          break;
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 2);
+          break;
+        case 0x13:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 3);
+          break;
+        case 0x14:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 4);
+          break;
+        case 0x15:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 5);
+          break;
+        case 0x16:
+          LAUNCH_TINYGEMM_KERNEL_NT(1, 6);
+          break;
+        // mb_size = 2
+        case 0x21:
+          LAUNCH_TINYGEMM_KERNEL_NT(2, 1);
+          break;
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_NT(2, 2);
+          break;
+        case 0x23:
+          LAUNCH_TINYGEMM_KERNEL_NT(2, 3);
+          break;
+        case 0x24:
+          LAUNCH_TINYGEMM_KERNEL_NT(2, 4);
+          break;
+        case 0x25:
+          LAUNCH_TINYGEMM_KERNEL_NT(2, 5);
+          break;
+        case 0x26:
+          LAUNCH_TINYGEMM_KERNEL_NT(2, 6);
+          break;
+        // mb_size = 3
+        case 0x31:
+          LAUNCH_TINYGEMM_KERNEL_NT(3, 1);
+          break;
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_NT(3, 2);
+          break;
+        case 0x33:
+          LAUNCH_TINYGEMM_KERNEL_NT(3, 3);
+          break;
+        case 0x34:
+          LAUNCH_TINYGEMM_KERNEL_NT(3, 4);
+          break;
+        case 0x35:
+          LAUNCH_TINYGEMM_KERNEL_NT(3, 5);
+          break;
+        case 0x36:
+          LAUNCH_TINYGEMM_KERNEL_NT(3, 6);
+          break;
+        // mb_size = 4
+        case 0x41:
+          LAUNCH_TINYGEMM_KERNEL_NT(4, 1);
+          break;
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_NT(4, 2);
+          break;
+        case 0x43:
+          LAUNCH_TINYGEMM_KERNEL_NT(4, 3);
+          break;
+        case 0x44:
+          LAUNCH_TINYGEMM_KERNEL_NT(4, 4);
+          break;
+        case 0x45:
+          LAUNCH_TINYGEMM_KERNEL_NT(4, 5);
+          break;
+        case 0x46:
+          LAUNCH_TINYGEMM_KERNEL_NT(4, 6);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+template <typename scalar_t, typename index_t>
+void index_gemm_kernel_nn(
+    const float* __restrict__ A,
+    const scalar_t* __restrict__ B,
+    float* __restrict__ C,
+    const index_t* __restrict__ indices,
+    float* __restrict__ scale,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    int64_t max_tokens) {
+  constexpr int kVecSize = 16;
+  if ((N & (kVecSize - 1)) != 0) {
+    tinygemm_kernel_nn_scalar(A, B, C, indices, scale, M, N, K, lda, ldb, ldc, max_tokens);
+    return;
+  }
+
+  // pattern: 1-8-8
+  if (M == 1) {
+    constexpr int64_t BLOCK_N = 8 * kVecSize;
+    const int64_t NB = div_up(N, BLOCK_N);
+    int64_t mb_start = 0, lda = 1, ldc = 1;
+
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (nb_size >> 4) {
+        case 1:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 16);
+          break;
+        case 2:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 32);
+          break;
+        case 3:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 48);
+          break;
+        case 4:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 64);
+          break;
+        case 5:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 80);
+          break;
+        case 6:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 96);
+          break;
+        case 7:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 112);
+          break;
+        case 8:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 128);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, 1x", "nb_size");
+      }
+    }
+    return;
+  }
+
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 6 * kVecSize;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  for (int64_t mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        // mb_size = 1
+        case 0x11:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 16);
+          break;
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 32);
+          break;
+        case 0x13:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 48);
+          break;
+        case 0x14:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 64);
+          break;
+        case 0x15:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 80);
+          break;
+        case 0x16:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 96);
+          break;
+        // mb_size = 2
+        case 0x21:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 16);
+          break;
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 32);
+          break;
+        case 0x23:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 48);
+          break;
+        case 0x24:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 64);
+          break;
+        case 0x25:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 80);
+          break;
+        case 0x26:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 96);
+          break;
+        // mb_size = 3
+        case 0x31:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 16);
+          break;
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 32);
+          break;
+        case 0x33:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 48);
+          break;
+        case 0x34:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 64);
+          break;
+        case 0x35:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 80);
+          break;
+        case 0x36:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 96);
+          break;
+        // mb_size = 4
+        case 0x41:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 16);
+          break;
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 32);
+          break;
+        case 0x43:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 48);
+          break;
+        case 0x44:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 64);
+          break;
+        case 0x45:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 80);
+          break;
+        case 0x46:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 96);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+void decode_set_kv_buffer(
+    scalar_t* __restrict__ k_buffer,
+    scalar_t* __restrict__ v_buffer,
+    const scalar_t* __restrict__ key,
+    const scalar_t* __restrict__ value,
+    const int64_t* __restrict__ loc,
+    int64_t batches,
+    int64_t num_heads_kv,
+    int64_t head_size,
+    int64_t head_size_v,
+    int64_t k_strideN,
+    int64_t k_strideH,
+    int64_t v_strideN,
+    int64_t v_strideH,
+    int64_t nk_strideN,
+    int64_t nk_strideH,
+    int64_t nv_strideN,
+    int64_t nv_strideH,
+    bool is_mla) {
+  at::parallel_for(0, batches * num_heads_kv, 0, [&](int64_t begin, int64_t end) {
+    int64_t bs{0}, head_kv_id{0};
+    data_index_init(begin, bs, batches, head_kv_id, num_heads_kv);
+
+    for (int64_t i = begin; i < end; i++) {
+      int64_t loc_val = loc[bs];
+      scalar_t* k_buffer_ptr = k_buffer + loc_val * k_strideN + head_kv_id * k_strideH;
+      const scalar_t* new_key_ptr = key + bs * nk_strideN + head_kv_id * nk_strideH;
+      copy_stub<scalar_t>(k_buffer_ptr, new_key_ptr, head_size);
+      if (!is_mla) {
+        scalar_t* v_buffer_ptr = v_buffer + loc_val * v_strideN + head_kv_id * v_strideH;
+        const scalar_t* new_value_ptr = value + bs * nv_strideN + head_kv_id * nv_strideH;
+        copy_stub<scalar_t>(v_buffer_ptr, new_value_ptr, head_size_v);
+      }
+
+      // move to the next index
+      data_index_step(bs, batches, head_kv_id, num_heads_kv);
+    }
+  });
+}
+
+template <typename scalar_t>
+void decode_accumulate_kv_splits(
+    scalar_t* __restrict__ output,
+    float* __restrict__ attn_logits,
+    int64_t batches,
+    int64_t num_heads,
+    int64_t head_size_v,
+    int64_t num_kv_splits,
+    int64_t l_stride1,
+    int64_t l_stride2) {
+  using Vec = at::vec::Vectorized<float>;
+
+  // parallel on [batches, num_heads]
+  at::parallel_for(0, batches * num_heads, 0, [&](int64_t begin, int64_t end) {
+    // NB: here we use logits[b][h][0] as acc, since
+    // for the first kv split (kv_id == 0):
+    //   m_delta = std::exp(-inf) = 0
+    //   e_logic = std::exp(0) = 1
+    //   acc = acc * m_delta + tv * e_logic = tv
+    for (int64_t i = begin; i < end; ++i) {
+      float* __restrict__ acc = attn_logits + i * l_stride1;
+
+      float s_prime = 0.f;
+      float m_prime = -std::numeric_limits<scalar_t>::infinity();
+
+      // update acc with from each kv_split
+      for (int64_t kv_id = 0; kv_id < num_kv_splits; ++kv_id) {
+        float* __restrict__ tv = acc + kv_id * l_stride2;
+        const float tlogic = (acc + kv_id * l_stride2)[head_size_v];
+
+        float m_i = std::max(tlogic, m_prime);
+        float m_delta = std::exp(m_prime - m_i);
+        float e_logic = std::exp(tlogic - m_i);
+        if (kv_id != 0) {
+          at::vec::map2<float>(
+              [m_delta, e_logic](Vec x, Vec y) { return x * Vec(m_delta) + y * Vec(e_logic); },
+              acc,
+              acc,
+              tv,
+              head_size_v);
+        }
+
+        s_prime = s_prime * m_delta + e_logic;
+        m_prime = m_i;
+      }
+
+      copy_stub<scalar_t>(output + i * head_size_v, acc, 1 / s_prime, head_size_v);
+    }
+  });
+}
+
+template <typename scalar_t, typename index_t, int64_t BLOCK_N>
+void decode_attention_kernel_impl(
+    scalar_t* __restrict__ output,
+    float* __restrict__ attn_logits,
+    const scalar_t* __restrict__ query,
+    const scalar_t* __restrict__ k_buffer,
+    const scalar_t* __restrict__ v_buffer,
+    const index_t* __restrict__ req_to_token,
+    const int64_t* __restrict__ req_pool_indices,
+    const int64_t* __restrict__ seq_lens,
+    int64_t batches,
+    int64_t num_heads,
+    int64_t head_size,
+    int64_t head_size_v,
+    int64_t num_kv_splits,
+    int64_t q_strideM,
+    int64_t q_strideH,
+    int64_t k_strideN,
+    int64_t k_strideH,
+    int64_t v_strideN,
+    int64_t v_strideH,
+    float scaling,
+    float logit_cap,
+    int64_t max_num_reqs,
+    int64_t max_context_len,
+    int64_t max_total_num_tokens) {
+  using Vec = at::vec::Vectorized<float>;
+
+  // strides
+  const int64_t l_stride1 = num_kv_splits * (head_size_v + 1);
+  const int64_t l_stride2 = head_size_v + 1;
+
+  const bool has_logit_cap = logit_cap > 0;
+  float rlogit_cap = has_logit_cap ? 1 / logit_cap : 0.f;
+
+  // parallel on [batches, num_heads, num_kv_splits]
+  at::parallel_for(0, batches * num_heads * num_kv_splits, 0, [&](int64_t begin, int64_t end) {
+    int64_t bs{0}, head_id{0}, kv_id{0};
+    data_index_init(begin, bs, batches, head_id, num_heads, kv_id, num_kv_splits);
+
+    // s_prime and s_delta
+    alignas(64) float s_i[BLOCK_N];
+    float* __restrict__ s_delta = s_i;
+
+    for (int64_t i = begin; i < end; ++i) {
+      // get query
+      const scalar_t* __restrict__ q_ptr = query + bs * q_strideM + head_id * q_strideH;
+
+      // get key/value
+      int64_t seq_len_kv = seq_lens[bs];
+      int64_t req_pool_id = req_pool_indices[bs];
+      TORCH_CHECK(seq_len_kv <= max_context_len, "seq_len_kv out of scope!");
+      TORCH_CHECK(req_pool_id < max_num_reqs, "req_pool_id out of scope!");
+
+      const int64_t SPLIT_SIZE = div_up(seq_len_kv, num_kv_splits);
+      const int64_t kv_start = kv_id * SPLIT_SIZE;
+      const int64_t kv_end = std::min(kv_start + SPLIT_SIZE, seq_len_kv);
+
+      float m_prime = -std::numeric_limits<float>::infinity();
+      float s_prime = 0.f;
+
+      // get v_prime, and init to zero
+      float* __restrict__ v_prime = attn_logits + i * (head_size_v + 1);
+      fill_stub(v_prime, 0.f, head_size_v);
+
+      // loop over K and V sequence with BLOCK_N
+      for (int64_t n = kv_start; n < kv_end; n += BLOCK_N) {
+        int64_t n_size = std::min(BLOCK_N, kv_end - n);
+
+        // calculate s_i <- scale * Q @ K
+        index_gemm_kernel_nt<scalar_t, index_t>(
+            /* A   */ q_ptr,
+            /* B   */ k_buffer + head_id * k_strideH,
+            /* C   */ s_i,
+            /* ind */ req_to_token + req_pool_id * max_context_len + n,
+            /* scl */ scaling,
+            /* M   */ 1,
+            /* N   */ n_size,
+            /* K   */ head_size,
+            /* lda */ 1,
+            /* ldb */ k_strideN,
+            /* ldc */ 1,
+            /* mtt */ max_total_num_tokens);
+
+        // TODO: `tanh` from torch uses sleef u10, going to be slow
+        if (has_logit_cap) {
+          at::vec::map<float>(
+              [logit_cap, rlogit_cap](Vec x) { return Vec(logit_cap) * (x * Vec(rlogit_cap)).tanh(); },
+              s_i,
+              s_i,
+              n_size);
+        }
+
+        // m_i: max value per row
+        float m_i = at::vec::reduce_all<float>([](Vec& x, Vec& y) { return at::vec::maximum(x, y); }, s_i, n_size);
+        m_i = std::max(m_i, m_prime);
+
+        // m_delta <- exp(m' - m_i)
+        float m_delta = std::exp(m_prime - m_i);
+
+        // s_delta <- exp(s_i - m_i)
+        at::vec::map<float>([m_i](Vec x) { return (x - Vec(m_i)).exp_u20(); }, s_delta, s_i, n_size);
+
+        // s' <- s' * m_delta + sum(s_delta)
+        s_prime *= m_delta;
+        s_prime += at::vec::reduce_all<float>([](Vec& x, Vec& y) { return x + y; }, s_delta, n_size);
+
+        m_prime = m_i;
+
+        // calculate V' <- s_delta @ V + V' * m_delta
+        index_gemm_kernel_nn<scalar_t, index_t>(
+            /* A   */ s_delta,
+            /* B   */ v_buffer + head_id * v_strideH,
+            /* C   */ v_prime,
+            /* ind */ req_to_token + req_pool_id * max_context_len + n,
+            /* scl */ &m_delta,
+            /* M   */ 1,
+            /* N   */ head_size_v,
+            /* K   */ n_size,
+            /* lda */ 1,
+            /* ldb */ v_strideN,
+            /* ldc */ 1,
+            /* mtt */ max_total_num_tokens);
+      }  // loop with KV blocks
+
+      // only update v' when kv_split_size > 0
+      if (kv_end > kv_start) {
+        float s = 1 / s_prime;
+        at::vec::map<float>([s](Vec out) { return out * Vec(s); }, v_prime, v_prime, head_size_v);
+
+        v_prime[head_size_v] = m_prime + std::log(s_prime);
+      }
+
+      // move to the next index
+      data_index_step(bs, batches, head_id, num_heads, kv_id, num_kv_splits);
+    }
+  });
+
+  decode_accumulate_kv_splits(
+      output, attn_logits, batches, num_heads, head_size_v, num_kv_splits, l_stride1, l_stride2);
+}  // MHA
+
+template <typename scalar_t, typename index_t, int64_t BLOCK_N>
+void decode_attention_mla_kernel_impl(
+    scalar_t* __restrict__ output,
+    float* __restrict__ attn_logits,
+    const scalar_t* __restrict__ query,
+    const scalar_t* __restrict__ k_buffer,
+    const scalar_t* __restrict__ v_buffer,
+    const index_t* __restrict__ req_to_token,
+    const int64_t* __restrict__ req_pool_indices,
+    const int64_t* __restrict__ seq_lens,
+    scalar_t* __restrict__ buffer,
+    int64_t batches,
+    int64_t num_heads,
+    int64_t head_size,
+    int64_t head_size_v,
+    int64_t num_kv_splits,
+    int64_t q_strideM,
+    int64_t q_strideH,
+    int64_t k_strideN,
+    int64_t k_strideH,
+    int64_t v_strideN,
+    int64_t v_strideH,
+    float scaling,
+    float logit_cap,
+    int64_t max_num_reqs,
+    int64_t max_context_len,
+    int64_t max_total_num_tokens,
+    int64_t buffer_size_per_thread) {
+  using Vec = at::vec::Vectorized<float>;
+
+  // block length for heads
+  const int64_t BLOCK_H = batches == 1 ? 6 : (batches > 16 ? 22 : 11);
+
+  // strides
+  const int64_t l_stride0 = num_heads * num_kv_splits * (head_size_v + 1);
+  const int64_t l_stride1 = num_kv_splits * (head_size_v + 1);
+  const int64_t l_stride2 = head_size_v + 1;
+
+  TORCH_CHECK(logit_cap == 0.f, "decode MLA: expect no logit_cap.");
+
+  // partition the heads into blocks for parallel
+  const int64_t num_blocks = div_up(num_heads, BLOCK_H);
+
+  // parallel on [batches, num_blocks, num_kv_splits]
+  at::parallel_for(0, batches * num_blocks * num_kv_splits, 0, [&](int64_t begin, int64_t end) {
+    int64_t bs{0}, block_id{0}, kv_id{0};
+    data_index_init(begin, bs, batches, block_id, num_blocks, kv_id, num_kv_splits);
+
+    int tid = at::get_thread_num();
+    scalar_t* __restrict__ Btmp0 = buffer + tid * buffer_size_per_thread;
+    scalar_t* __restrict__ Btmp1 = Btmp0 + BLOCK_N * head_size;
+
+    // init Btmp1 just once for each thread to prevent NaN
+    // Btmp0 is not needed as it computes full K every single time
+    fill_stub(Btmp1, 0.f, BLOCK_N * head_size_v);
+
+    alignas(64) float s_i[BLOCK_H * BLOCK_N];
+    float* __restrict__ s_delta = s_i;
+    alignas(64) scalar_t s_delta2[BLOCK_H * BLOCK_N];
+
+    alignas(64) float s_prime[BLOCK_H];
+    alignas(64) float m_prime[BLOCK_H];
+    alignas(64) float m_delta[BLOCK_H];
+
+    for (int64_t i = begin; i < end; ++i) {
+      const int64_t h_start = block_id * BLOCK_H;
+      const int64_t h_end = std::min(block_id * BLOCK_H + BLOCK_H, num_heads);
+      const int64_t h_size = h_end - h_start;
+
+      // get query
+      const scalar_t* __restrict__ q_ptr = query + bs * q_strideM + h_start * q_strideH;
+
+      int64_t seq_len_kv = seq_lens[bs];
+      int64_t req_pool_id = req_pool_indices[bs];
+      TORCH_CHECK(seq_len_kv <= max_context_len, "seq_len_kv out of scope!");
+      TORCH_CHECK(req_pool_id < max_num_reqs, "req_pool_id out of scope!");
+
+      const int64_t SPLIT_SIZE = div_up(seq_len_kv, num_kv_splits);
+      const int64_t kv_start = kv_id * SPLIT_SIZE;
+      const int64_t kv_end = std::min(kv_start + SPLIT_SIZE, seq_len_kv);
+
+      fill_stub(s_prime, 0.f, BLOCK_H);
+      fill_stub(m_prime, -std::numeric_limits<float>::infinity(), BLOCK_H);
+
+      // get v_prime, and init to zero
+      float* __restrict__ v_prime = attn_logits + bs * l_stride0 + h_start * l_stride1 + kv_id * l_stride2;
+      for (int64_t h = 0; h < h_size; ++h) {
+        fill_stub(v_prime + h * l_stride1, 0.f, head_size_v);
+      }
+
+      // loop over K and V sequence with BLOCK_N
+      for (int64_t n = kv_start; n < kv_end; n += BLOCK_N) {
+        int64_t n_size = std::min(BLOCK_N, kv_end - n);
+        const int64_t padded_n_size = div_up(int(n_size), TILE_K) * TILE_K;
+
+        // get key and pack
+        pack_vnni<scalar_t, index_t>(
+            /*    dst0 */ Btmp0,
+            /*    dst1 */ Btmp1,
+            /*     src */ k_buffer + /* head_kv_id */ 0 * k_strideH,
+            /*     ind */ req_to_token + req_pool_id * max_context_len + n,
+            /*       N */ n_size,
+            /*       K */ head_size,
+            /*      Kv */ head_size_v,
+            /*  ld_src */ k_strideN,
+            /* ld_dst0 */ BLOCK_N,
+            /* ld_dst1 */ head_size_v);
+
+        // calculate s_i <- Q @ K
+        at::native::cpublas::brgemm(
+            /* M     */ h_size,
+            /* N     */ n_size,
+            /* K     */ head_size,
+            /* lda   */ q_strideH,
+            /* ldb   */ BLOCK_N,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ q_ptr,
+            /* B     */ Btmp0,
+            /* C     */ s_i);
+
+        const Vec scale_vec = Vec(scaling);
+        for (int64_t h = 0; h < h_size; ++h) {
+          // s_i <- s_i * scale
+          at::vec::map<float>(
+              [scale_vec](Vec x) { return x * scale_vec; }, s_i + h * BLOCK_N, s_i + h * BLOCK_N, n_size);
+
+          // m_i: max value per row
+          float m_i = at::vec::reduce_all<float>(
+              [](Vec& x, Vec& y) { return at::vec::maximum(x, y); }, s_i + h * BLOCK_N, n_size);
+          m_i = std::max(m_i, m_prime[h]);
+
+          // m_delta <- exp(m' - m_i)
+          m_delta[h] = std::exp(m_prime[h] - m_i);
+
+          // s_delta <- exp(s_i - m_i)
+          at::vec::map<float>(
+              [m_i](Vec x) { return (x - Vec(m_i)).exp_u20(); }, s_delta + h * BLOCK_N, s_i + h * BLOCK_N, n_size);
+
+          // s' <- s' * m_delta + sum(s_delta)
+          s_prime[h] *= m_delta[h];
+          s_prime[h] += at::vec::reduce_all<float>([](Vec& x, Vec& y) { return x + y; }, s_delta + h * BLOCK_N, n_size);
+
+          m_prime[h] = m_i;
+
+          // v' <- v' * m_delta
+          float scale_m = m_delta[h];
+          at::vec::map<float>(
+              [scale_m](Vec x) { return x * Vec(scale_m); },
+              v_prime + h * l_stride1,
+              v_prime + h * l_stride1,
+              head_size_v);
+
+          // pad s_delta with 0 first and then convert to scalar_t
+          fill_stub(s_delta + h * BLOCK_N + n_size, 0.f, padded_n_size - n_size);
+          copy_stub<scalar_t, BLOCK_N>(s_delta2 + h * BLOCK_N, s_delta + h * BLOCK_N);
+        }
+
+        // calculate V' <- s_delta @ V + V'
+        at::native::cpublas::brgemm(
+            /* M     */ h_size,
+            /* N     */ head_size_v,
+            /* K     */ padded_n_size,  // n_size
+            /* lda   */ BLOCK_N,
+            /* ldb   */ head_size_v,
+            /* ldc   */ l_stride1,
+            /* add_C */ true,
+            /* A     */ s_delta2,
+            /* B     */ Btmp1,
+            /* C     */ v_prime);
+      }  // loop with KV blocks
+
+      // only update v' when kv_split_size > 0
+      if (kv_end > kv_start) {
+        for (int64_t h = 0; h < h_size; ++h) {
+          float s = 1 / s_prime[h];
+          at::vec::map<float>(
+              [s](Vec out) { return out * Vec(s); }, v_prime + h * l_stride1, v_prime + h * l_stride1, head_size_v);
+          (v_prime + h * l_stride1)[head_size_v] = m_prime[h] + std::log(s_prime[h]);
+        }
+      }
+
+      // move to the next index
+      data_index_step(bs, batches, block_id, num_blocks, kv_id, num_kv_splits);
+    }
+    at::native::cpublas::brgemm_release();
+  });
+
+  decode_accumulate_kv_splits(
+      output, attn_logits, batches, num_heads, head_size_v, num_kv_splits, l_stride1, l_stride2);
+}  // MLA
+
+template <typename scalar_t, typename index_t, int64_t BLOCK_N>
+void decode_attention_grouped_kernel_impl(
+    scalar_t* __restrict__ output,
+    float* __restrict__ attn_logits,
+    const scalar_t* __restrict__ query,
+    const scalar_t* __restrict__ k_buffer,
+    const scalar_t* __restrict__ v_buffer,
+    const index_t* __restrict__ req_to_token,
+    const int64_t* __restrict__ req_pool_indices,
+    const int64_t* __restrict__ seq_lens,
+    int64_t batches,
+    int64_t num_heads,
+    int64_t num_heads_kv,
+    int64_t head_size,
+    int64_t head_size_v,
+    int64_t num_kv_splits,
+    int64_t q_strideM,
+    int64_t q_strideH,
+    int64_t k_strideN,
+    int64_t k_strideH,
+    int64_t v_strideN,
+    int64_t v_strideH,
+    float scaling,
+    float logit_cap,
+    int64_t max_num_reqs,
+    int64_t max_context_len,
+    int64_t max_total_num_tokens) {
+  using Vec = at::vec::Vectorized<float>;
+
+  // block length for heads
+  // we parallel on [batches, divup(num_heads, BLOCK_H), num_kv_splits]
+  // use smaller BLOCK_H when batches is small to utilize all cores
+  constexpr int64_t kBLOCK_H = 16;
+  const int64_t BLOCK_H = std::min(4 * batches, kBLOCK_H);
+
+  // strides
+  const int64_t l_stride0 = num_heads * num_kv_splits * (head_size_v + 1);
+  const int64_t l_stride1 = num_kv_splits * (head_size_v + 1);
+  const int64_t l_stride2 = head_size_v + 1;
+
+  const bool has_logit_cap = logit_cap > 0;
+  float rlogit_cap = has_logit_cap ? 1 / logit_cap : 0.f;
+
+  // partition the heads into blocks for parallel
+  const int64_t num_groups = num_heads / num_heads_kv;
+  const int64_t num_blocks = div_up(num_groups, BLOCK_H);
+
+  // parallel on [batches, num_heads_kv, num_blocks, num_kv_splits]
+  at::parallel_for(0, batches * num_heads_kv * num_blocks * num_kv_splits, 0, [&](int64_t begin, int64_t end) {
+    int64_t bs{0}, head_kv_id{0}, block_id{0}, kv_id{0};
+    data_index_init(begin, bs, batches, head_kv_id, num_heads_kv, block_id, num_blocks, kv_id, num_kv_splits);
+
+    alignas(64) float s_i[BLOCK_H * BLOCK_N];
+    float* __restrict__ s_delta = s_i;
+
+    alignas(64) float s_prime[BLOCK_H];
+    alignas(64) float m_prime[BLOCK_H];
+    alignas(64) float m_delta[BLOCK_H];
+
+    for (int64_t i = begin; i < end; ++i) {
+      const int64_t h_start = head_kv_id * num_groups + block_id * BLOCK_H;
+      const int64_t h_end = head_kv_id * num_groups + std::min(block_id * BLOCK_H + BLOCK_H, num_groups);
+      const int64_t h_size = h_end - h_start;
+
+      // get query
+      const scalar_t* __restrict__ q_ptr = query + bs * q_strideM + h_start * q_strideH;
+
+      int64_t seq_len_kv = seq_lens[bs];
+      int64_t req_pool_id = req_pool_indices[bs];
+      TORCH_CHECK(seq_len_kv <= max_context_len, "seq_len_kv out of scope!");
+      TORCH_CHECK(req_pool_id < max_num_reqs, "req_pool_id out of scope!");
+
+      const int64_t SPLIT_SIZE = div_up(seq_len_kv, num_kv_splits);
+      const int64_t kv_start = kv_id * SPLIT_SIZE;
+      const int64_t kv_end = std::min(kv_start + SPLIT_SIZE, seq_len_kv);
+
+      fill_stub(s_prime, 0.f, BLOCK_H);
+      fill_stub(m_prime, -std::numeric_limits<float>::infinity(), BLOCK_H);
+
+      // get v_prime, and init to zero
+      float* __restrict__ v_prime = attn_logits + bs * l_stride0 + h_start * l_stride1 + kv_id * l_stride2;
+      for (int64_t h = 0; h < h_size; ++h) {
+        fill_stub(v_prime + h * l_stride1, 0.f, head_size_v);
+      }
+
+      // loop over K and V sequence with BLOCK_N
+      for (int64_t n = kv_start; n < kv_end; n += BLOCK_N) {
+        int64_t n_size = std::min(BLOCK_N, kv_end - n);
+
+        // calculate Q @ K
+        index_gemm_kernel_nt<scalar_t, index_t>(
+            /* A   */ q_ptr,
+            /* B   */ k_buffer + head_kv_id * k_strideH,
+            /* C   */ s_i,
+            /* ind */ req_to_token + req_pool_id * max_context_len + n,
+            /* scl */ scaling,
+            /* M   */ h_size,
+            /* N   */ n_size,
+            /* K   */ head_size,
+            /* lda */ q_strideH,
+            /* ldb */ k_strideN,
+            /* ldc */ BLOCK_N,
+            /* mtt */ max_total_num_tokens);
+
+        if (has_logit_cap) {
+          at::vec::map<float>(
+              [logit_cap, rlogit_cap](Vec x) { return Vec(logit_cap) * (x * Vec(rlogit_cap)).tanh(); },
+              s_i,
+              s_i,
+              BLOCK_H * BLOCK_N);
+        }
+
+        // update the scaling coefficients
+        for (int64_t h = 0; h < h_size; ++h) {
+          // m_i: max value per row
+          float m_i = at::vec::reduce_all<float>(
+              [](Vec& x, Vec& y) { return at::vec::maximum(x, y); }, s_i + h * BLOCK_N, n_size);
+          m_i = std::max(m_i, m_prime[h]);
+
+          // m_delta <- exp(m' - m_i)
+          m_delta[h] = std::exp(m_prime[h] - m_i);
+
+          // s_delta <- exp(s_i - m_i)
+          at::vec::map<float>(
+              [m_i](Vec x) { return (x - Vec(m_i)).exp_u20(); }, s_delta + h * BLOCK_N, s_i + h * BLOCK_N, n_size);
+
+          // s' <- s' * m_delta + sum(s_delta)
+          s_prime[h] *= m_delta[h];
+          s_prime[h] += at::vec::reduce_all<float>([](Vec& x, Vec& y) { return x + y; }, s_delta + h * BLOCK_N, n_size);
+
+          m_prime[h] = m_i;
+        }
+
+        // calculate V' <- s_delta @ V + V' * m_delta
+        index_gemm_kernel_nn<scalar_t, index_t>(
+            /* A   */ s_delta,
+            /* B   */ v_buffer + head_kv_id * v_strideH,
+            /* C   */ v_prime,
+            /* ind */ req_to_token + req_pool_id * max_context_len + n,
+            /* scl */ m_delta,
+            /* M   */ h_size,
+            /* N   */ head_size_v,
+            /* K   */ n_size,
+            /* lda */ BLOCK_N,
+            /* ldb */ v_strideN,
+            /* ldc */ l_stride1,
+            /* mtt */ max_total_num_tokens);
+      }  // loop with KV blocks
+
+      // only update v' when kv_split_size > 0
+      if (kv_end > kv_start) {
+        for (int64_t h = 0; h < h_size; ++h) {
+          float s = 1 / s_prime[h];
+          at::vec::map<float>(
+              [s](Vec out) { return out * Vec(s); }, v_prime + h * l_stride1, v_prime + h * l_stride1, head_size_v);
+          (v_prime + h * l_stride1)[head_size_v] = m_prime[h] + std::log(s_prime[h]);
+        }
+      }
+
+      // move to the next index
+      data_index_step(bs, batches, head_kv_id, num_heads_kv, block_id, num_blocks, kv_id, num_kv_splits);
+    }
+  });
+
+  decode_accumulate_kv_splits(
+      output, attn_logits, batches, num_heads, head_size_v, num_kv_splits, l_stride1, l_stride2);
+}  // GQA/MQA
+
+}  // anonymous namespace
+
+// query:            [num_tokens, num_heads, head_size]
+// output:           [num_tokens, num_heads, head_size]
+// k_buffer:         [max_total_num_tokens, num_heads, head_size]
+// v_buffer:         [max_total_num_tokens, num_heads, head_size_v]
+// attn_logits:      [num_seqs, num_heads, num_kv_splits, head_size_v + 1]
+// req_to_token:     [max_num_reqs, max_context_len] int32 or int64
+// req_pool_indices: [num_seqs] int64
+// seq_lens:         [num_seqs] int64
+//
+void decode_attention_cpu(
+    at::Tensor& query,
+    at::Tensor& k_buffer,
+    at::Tensor& v_buffer,
+    at::Tensor& output,
+    at::Tensor& key,
+    at::Tensor& value,
+    at::Tensor& loc,
+    at::Tensor& attn_logits,
+    at::Tensor& req_to_token,
+    at::Tensor& req_pool_indices,
+    at::Tensor& seq_lens,
+    double sm_scale,
+    double logit_cap) {
+  RECORD_FUNCTION(
+      "sgl-kernel::decode_attention_cpu",
+      std::vector<c10::IValue>(
+          {query, output, k_buffer, v_buffer, attn_logits, req_to_token, req_pool_indices, seq_lens}));
+
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(query);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(k_buffer);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(v_buffer);
+  // for MLA, key and value shares the same storage and value could be non-contiguous
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(key);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(value);
+  CHECK_DIM(3, query);
+  CHECK_DIM(3, k_buffer);
+  CHECK_DIM(3, v_buffer);
+  CHECK_DIM(3, key);
+  CHECK_DIM(3, value);
+  CHECK_DIM(1, loc);
+
+  int64_t num_seqs = seq_lens.size(0);
+  int64_t max_num_reqs = req_to_token.size(0);
+  int64_t max_context_len = req_to_token.size(1);
+  int64_t max_total_num_tokens = k_buffer.size(0);
+
+  int64_t num_heads = query.size(1);
+  int64_t num_heads_kv = k_buffer.size(1);
+  int64_t head_size = query.size(2);
+  int64_t head_size_v = v_buffer.size(2);
+
+  int64_t num_kv_splits = attn_logits.size(2);
+
+  CHECK_EQ(loc.numel(), num_seqs);
+  CHECK_EQ(attn_logits.size(0), num_seqs);
+  CHECK_EQ(attn_logits.size(1), num_heads);
+  CHECK_EQ(attn_logits.size(3), head_size_v + 1);
+  CHECK_EQ(attn_logits.scalar_type(), at::kFloat);
+
+  // strides for query
+  int64_t q_strideM = query.stride(0);
+  int64_t q_strideH = query.stride(1);
+
+  // strides for k_buffer and v_buffer
+  int64_t k_strideN = k_buffer.stride(0);
+  int64_t k_strideH = k_buffer.stride(1);
+  int64_t v_strideN = v_buffer.stride(0);
+  int64_t v_strideH = v_buffer.stride(1);
+  // strides for new key and value
+  int64_t nk_strideN = key.stride(0);
+  int64_t nk_strideH = key.stride(1);
+  int64_t nv_strideN = value.stride(0);
+  int64_t nv_strideH = value.stride(1);
+
+  // check index data types
+  const auto index_dtype = req_to_token.scalar_type();
+  TORCH_CHECK(
+      index_dtype == at::kInt || index_dtype == at::kLong,
+      "decode: expect req_to_token to be int32 or int64, got ",
+      index_dtype);
+  TORCH_CHECK(seq_lens.scalar_type() == at::kLong, "decode: expect req_lens to be int64, got ", seq_lens.scalar_type());
+  TORCH_CHECK(
+      req_pool_indices.scalar_type() == at::kLong,
+      "decode: expect req_pool_indices to be int64, got ",
+      req_pool_indices.scalar_type());
+
+  // check if we have MLA here
+  void* k_buffer_data = k_buffer.data_ptr();
+  void* v_buffer_data = v_buffer.data_ptr();
+  const bool is_mla = (k_buffer_data == v_buffer_data) && (num_heads_kv == 1) && (head_size == head_size_v + 64);
+
+  // block length for k_buffer and v_buffer
+  constexpr int BLOCK_N = 256;
+
+  // buffer for packing k_cache and v_cache
+  int num_threads = at::get_num_threads();
+  int64_t size_per_thread = is_mla ? BLOCK_N * head_size + BLOCK_N * head_size_v : 0;
+  auto buffer = at::empty({num_threads, size_per_thread}, k_buffer.options());
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(query.scalar_type(), "decode_attention_kernel", [&] {
+    AT_DISPATCH_INDEX_TYPES(index_dtype, "decode_attention_indices", [&] {
+      // update the kv buffer
+      decode_set_kv_buffer(
+          (scalar_t*)k_buffer_data,
+          (scalar_t*)v_buffer_data,
+          key.data_ptr<scalar_t>(),
+          value.data_ptr<scalar_t>(),
+          loc.data_ptr<int64_t>(),
+          num_seqs,
+          num_heads_kv,
+          head_size,
+          head_size_v,
+          k_strideN,
+          k_strideH,
+          v_strideN,
+          v_strideH,
+          nk_strideN,
+          nk_strideH,
+          nv_strideN,
+          nv_strideH,
+          is_mla);
+
+      if (num_heads == num_heads_kv) {
+        // MHA
+        decode_attention_kernel_impl<scalar_t, index_t, BLOCK_N>(
+            output.data_ptr<scalar_t>(),
+            attn_logits.data_ptr<float>(),
+            query.data_ptr<scalar_t>(),
+            (const scalar_t*)k_buffer_data,
+            (const scalar_t*)v_buffer_data,
+            req_to_token.data_ptr<index_t>(),
+            req_pool_indices.data_ptr<int64_t>(),
+            seq_lens.data_ptr<int64_t>(),
+            num_seqs,
+            num_heads,
+            head_size,
+            head_size_v,
+            num_kv_splits,
+            q_strideM,
+            q_strideH,
+            k_strideN,
+            k_strideH,
+            v_strideN,
+            v_strideH,
+            sm_scale,
+            logit_cap,
+            max_num_reqs,
+            max_context_len,
+            max_total_num_tokens);
+      } else if (is_mla) {
+        // MLA
+        decode_attention_mla_kernel_impl<scalar_t, index_t, BLOCK_N>(
+            output.data_ptr<scalar_t>(),
+            attn_logits.data_ptr<float>(),
+            query.data_ptr<scalar_t>(),
+            (const scalar_t*)k_buffer_data,
+            (const scalar_t*)v_buffer_data,
+            req_to_token.data_ptr<index_t>(),
+            req_pool_indices.data_ptr<int64_t>(),
+            seq_lens.data_ptr<int64_t>(),
+            buffer.data_ptr<scalar_t>(),
+            num_seqs,
+            num_heads,
+            head_size,
+            head_size_v,
+            num_kv_splits,
+            q_strideM,
+            q_strideH,
+            k_strideN,
+            k_strideH,
+            v_strideN,
+            v_strideH,
+            sm_scale,
+            logit_cap,
+            max_num_reqs,
+            max_context_len,
+            max_total_num_tokens,
+            size_per_thread);
+      } else {
+        // GQA/MQA
+        decode_attention_grouped_kernel_impl<scalar_t, index_t, BLOCK_N>(
+            output.data_ptr<scalar_t>(),
+            attn_logits.data_ptr<float>(),
+            query.data_ptr<scalar_t>(),
+            (const scalar_t*)k_buffer_data,
+            (const scalar_t*)v_buffer_data,
+            req_to_token.data_ptr<index_t>(),
+            req_pool_indices.data_ptr<int64_t>(),
+            seq_lens.data_ptr<int64_t>(),
+            num_seqs,
+            num_heads,
+            num_heads_kv,
+            head_size,
+            head_size_v,
+            num_kv_splits,
+            q_strideM,
+            q_strideH,
+            k_strideN,
+            k_strideH,
+            v_strideN,
+            v_strideH,
+            sm_scale,
+            logit_cap,
+            max_num_reqs,
+            max_context_len,
+            max_total_num_tokens);
+      }
+    });
+  });
+}
diff --git a/sgl-kernel/csrc/cpu/extend.cpp b/sgl-kernel/csrc/cpu/extend.cpp
new file mode 100644
index 000000000..3162ccea8
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/extend.cpp
@@ -0,0 +1,723 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+// [NOTE]: extend attention for CPU
+//   1. tune BLOCK_M and BLOCK_N
+//   2. can handle non-contiguous k_exttend and v_extend
+//   3. computes attention for prefix and extend separately
+//   4. TODO: vectorize `pack_vnni` and `pack_vnni2`
+//
+
+template <typename index_t>
+inline index_t get_index(index_t* ind, int i) {
+  return (ind == nullptr) ? (index_t)i : ind[i];
+}
+
+#if defined(CPU_CAPABILITY_AVX512)
+// key: from [N, 32] to [32/2, N, 2]
+template <typename scalar_t, typename index_t>
+inline void pack_vnni_Nx32(
+    scalar_t* __restrict__ dst,
+    const scalar_t* __restrict__ src,
+    const index_t* __restrict__ ind,
+    int N,
+    int ld_src,
+    int ld_dst) {
+  __m512i vinputs[16];
+
+  int n = 0;
+  for (; n < N; ++n) {
+    index_t index = get_index(ind, n);
+    vinputs[n] = _mm512_loadu_si512(src + index * ld_src);
+  }
+  // padding with zero to avoid uninitialized vectors
+  for (; n < 16; ++n) {
+    vinputs[n] = _mm512_set1_epi32(0);
+  }
+
+  // pack key
+  transpose_16x16_32bit(vinputs);
+
+  const __mmask16 vmask = (1 << N) - 1;
+  for (int k = 0; k < 16; ++k) {
+    _mm512_mask_storeu_epi32(dst + k * ld_dst * 2, vmask, vinputs[k]);
+  }
+}
+
+// value: from [K, 32] to [K/2, 32, 2]
+template <typename scalar_t, typename index_t>
+inline void pack_vnni_Kx32(
+    scalar_t* __restrict__ dst,
+    const scalar_t* __restrict__ src,
+    const index_t* __restrict__ ind,
+    int K,
+    int ld_src,
+    int ld_dst) {
+  __m512i vinputs[2];
+
+  int k = 0;
+  for (; k < K; ++k) {
+    index_t index = get_index(ind, k);
+    vinputs[k] = _mm512_loadu_si512(src + index * ld_src);
+  }
+  // padding with zero to avoid uninitialized vectors
+  for (; k < 2; ++k) {
+    vinputs[k] = _mm512_set1_epi32(0);
+  }
+
+  // pack value
+  __m512i d0, d1;
+  std::tie(d0, d1) = transpose_2x32_16bit(vinputs[0], vinputs[1]);
+  _mm512_storeu_si512(dst + 0 * ld_dst * 2, d0);
+  _mm512_storeu_si512(dst + 0 * ld_dst * 2 + 32, d1);
+}
+#endif
+
+// convert to vnni format
+// from [N, K/2, 2] to [K/2, N, 2] for bfloat16 and float16
+template <typename scalar_t, typename index_t>
+void pack_vnni(
+    scalar_t* __restrict__ dst,
+    const scalar_t* __restrict__ src,
+    const index_t* __restrict__ ind,
+    int N,
+    int K,
+    int ld_src,
+    int ld_dst) {
+#if defined(CPU_CAPABILITY_AVX512)
+  const int NB = div_up(N, 16);
+  const int KB = K / 32;  // no remainder
+  const bool is_indexed = ind != nullptr;
+
+  for (int nb = 0; nb < NB; ++nb) {
+    for (int kb = 0; kb < KB; ++kb) {
+      // handle 16x512bits each block
+      int nb_size = std::min(N - nb * 16, 16);
+      pack_vnni_Nx32<scalar_t, index_t>(
+          /*    dst */ dst + ((kb * 32) >> 1) * ld_dst * 2 + nb * 16 * 2,
+          /*    src */ src + kb * 32 + (is_indexed ? 0 : nb * 16 * ld_src),
+          /*    ind */ is_indexed ? ind + nb * 16 : nullptr,
+          /*      N */ nb_size,
+          /* ld_src */ ld_src,
+          /* ld_dst */ ld_dst);
+    }
+  }
+#else
+  for (int n = 0; n < N; ++n) {
+    index_t index = get_index(ind, n);
+    for (int k = 0; k < K / 2; ++k) {
+      for (int d = 0; d < 2; ++d) {
+        dst[k * ld_dst * 2 + n * 2 + d] = src[index * ld_src + k * 2 + d];
+      }
+    }
+  }
+#endif
+}
+
+// convert to vnni format
+// from [K/2, 2, N] to [K/2, N, 2] for bfloat16 and float16
+template <typename scalar_t, typename index_t>
+void pack_vnni2(
+    scalar_t* __restrict__ dst,
+    const scalar_t* __restrict__ src,
+    const index_t* __restrict__ ind,
+    int K,
+    int N,
+    int ld_src,
+    int ld_dst) {
+#if defined(CPU_CAPABILITY_AVX512)
+  const int KB = div_up(K, 2);
+  const int NB = N / 32;  // no remainder
+  const bool is_indexed = ind != nullptr;
+
+  for (int kb = 0; kb < KB; ++kb) {
+    for (int nb = 0; nb < NB; ++nb) {
+      // handle 2x512bits each block
+      int kb_size = std::min(K - kb * 2, 2);
+      pack_vnni_Kx32<scalar_t, index_t>(
+          /*    dst */ dst + ((kb * 2) >> 1) * ld_dst * 2 + nb * 32 * 2,
+          /*    src */ src + (is_indexed ? 0 : kb * 2 * ld_src) + nb * 32,
+          /*    ind */ is_indexed ? ind + kb * 2 : nullptr,
+          /*      K */ kb_size,
+          /* ld_src */ ld_src,
+          /* ld_dst */ ld_dst);
+    }
+  }
+#else
+  int k = 0;
+  for (; k < (K >> 1) * 2; k += 2) {
+    index_t index0 = get_index(ind, k + 0);
+    index_t index1 = get_index(ind, k + 1);
+    for (int n = 0; n < N; ++n) {
+      dst[(k >> 1) * ld_dst * 2 + n * 2 + 0] = src[index0 * ld_src + n];
+      dst[(k >> 1) * ld_dst * 2 + n * 2 + 1] = src[index1 * ld_src + n];
+    }
+  }
+  if (K % 2 != 0) {
+    index_t index = get_index(ind, K - 1);
+    for (int n = 0; n < N; ++n) {
+      dst[(K >> 1) * ld_dst * 2 + n * 2 + 0] = src[index * ld_src + n];
+      dst[(K >> 1) * ld_dst * 2 + n * 2 + 1] = 0;
+    }
+    k += 2;
+  }
+#endif
+}
+
+template <typename scalar_t>
+inline void fill_stub(scalar_t* __restrict__ out, float val, int size) {
+  using Vec = at::vec::Vectorized<scalar_t>;
+  constexpr int kVecSize = Vec::size();
+  const Vec data_vec = Vec(static_cast<scalar_t>(val));
+  int d = 0;
+#pragma GCC unroll 4
+  for (; d <= size - kVecSize; d += kVecSize) {
+    data_vec.store(out + d);
+  }
+  if (size - d > 0) {
+    data_vec.store(out + d, size - d);
+  }
+}
+
+template <typename scalar_t, int BLOCK_N>
+inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input) {
+  static_assert(BLOCK_N % 32 == 0);
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  constexpr int COLS = BLOCK_N / 16;
+  auto store = [&](auto i) {
+    constexpr int col = i % COLS;
+    // for COLS = 2, 4 use 512bit store
+    if constexpr (col % 2 == 0) {
+      fVec a_fvec0 = fVec::loadu(input + col * 16);
+      fVec a_fvec1 = fVec::loadu(input + col * 16 + 16);
+      bVec out_bvec = convert_from_float_ext<scalar_t>(a_fvec0, a_fvec1);
+      out_bvec.store(out + col * 16);
+    }
+  };
+  Unroll<COLS>{}(store);
+}
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ acc, float s, int size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec s_fvec = fVec(s);
+  int d = 0;
+#pragma GCC unroll 4
+  for (; d <= size - kVecSize; d += kVecSize) {
+    fVec a_fvec0 = fVec::loadu(acc + d) * s_fvec;
+    fVec a_fvec1 = fVec::loadu(acc + d + fVec::size()) * s_fvec;
+    bVec out_bvec = convert_from_float_ext<scalar_t>(a_fvec0, a_fvec1);
+    out_bvec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(acc[d] * s);
+  }
+}
+
+template <typename scalar_t, typename index_t, int BLOCK_M, int BLOCK_N>
+void extend_attention_kernel_impl(
+    scalar_t* __restrict__ o_extend,
+    const scalar_t* __restrict__ q_extend,
+    const scalar_t* __restrict__ k_extend,
+    const scalar_t* __restrict__ v_extend,
+    const scalar_t* __restrict__ k_buffer,
+    const scalar_t* __restrict__ v_buffer,
+    const index_t* __restrict__ req_to_token,
+    const int64_t* __restrict__ req_pool_indices,
+    const int64_t* __restrict__ seq_lens,
+    const index_t* __restrict__ extend_seq_lens,
+    const index_t* __restrict__ extend_start_loc,
+    const void* __restrict__ buffer,
+    int batches,
+    int num_heads,
+    int num_heads_kv,
+    int head_size,
+    int head_size_v,
+    int q_strideM,
+    int q_strideH,
+    int ke_strideN,
+    int ke_strideH,
+    int ve_strideN,
+    int ve_strideH,
+    int k_strideN,
+    int k_strideH,
+    int v_strideN,
+    int v_strideH,
+    float scaling,
+    float logit_cap,
+    int max_num_reqs,
+    int max_context_len,
+    int max_total_num_tokens,
+    int max_len_extend,
+    int buffer_size_per_thread,
+    bool is_prefix_skipped) {
+  using Vec = at::vec::Vectorized<float>;
+
+  // strides
+  const int o_strideM = num_heads * head_size_v;
+  const int o_strideH = head_size_v;
+
+  // we use same buffer for packed key and value
+  const int ldb_tmp = std::max(head_size, head_size_v);
+
+  const bool has_logit_cap = logit_cap > 0;
+  float rlogit_cap = has_logit_cap ? 1 / logit_cap : 0.f;
+
+  const int num_groups = num_heads / num_heads_kv;
+  TORCH_CHECK(num_groups * num_heads_kv == num_heads);
+
+  // number of blocks along M
+  int MB = div_up(max_len_extend, BLOCK_M);
+
+  // parallel on [batches, num_heads, BM]
+  at::parallel_for(0, batches * num_heads * MB, 0, [&](int begin, int end) {
+    int bs{0}, head_id{0}, mb{0};
+    data_index_init(begin, bs, batches, head_id, num_heads, mb, MB);
+
+    int tid = at::get_thread_num();
+    // s_i and s_delta: [BLOCK_M, BLOCK_N]
+    float* __restrict__ s_i = reinterpret_cast<float*>((char*)(buffer) + tid * buffer_size_per_thread);
+    float* __restrict__ s_delta = s_i;
+
+    // v_prime: [BLOCK_M, head_size_v]
+    float* __restrict__ v_prime = s_i + BLOCK_M * BLOCK_N;
+
+    // s_delta2: [BLOCK_M, BLOCK_N]; copy of s_delta in scalar_t
+    scalar_t* __restrict__ s_delta2 = reinterpret_cast<scalar_t*>(v_prime + BLOCK_N * head_size_v);
+
+    // Btmp: [BLOCK_N, max(head_size, head_size_v)]
+    scalar_t* __restrict__ Btmp = s_delta2 + BLOCK_M * BLOCK_N;
+
+    // init Btmp just once for each thread to prevent NaN
+    fill_stub(Btmp, 0.f, BLOCK_N * ldb_tmp);
+
+    alignas(64) float s_prime[BLOCK_M];
+    alignas(64) float m_prime[BLOCK_M];
+
+    for (int i = begin; i < end; ++i) {
+      // seq_len = prefix + extend
+      int head_kv_id = head_id / num_groups;
+      int seq_len = seq_lens[bs];
+      int seq_len_extend = extend_seq_lens[bs];
+      int seq_len_prefix = seq_len - seq_len_extend;
+      int seq_extend_start_loc = extend_start_loc[bs];
+
+      int req_pool_id = req_pool_indices[bs];
+      TORCH_CHECK(seq_len_prefix >= 0, "prefix len < 0!");
+      TORCH_CHECK(seq_len <= max_context_len, "seq_len out of scope!");
+      TORCH_CHECK(req_pool_id < max_num_reqs, "req_pool_id out of scope!");
+
+      if (is_prefix_skipped) {
+        TORCH_CHECK(seq_len_prefix == 0, "extend attention: expect seq_len_prefix to be 0, got ", seq_len_prefix);
+      }
+
+      // offset and size in MB
+      int m = mb * BLOCK_N;
+      int m_size = std::min(BLOCK_M, seq_len_extend - m);
+
+      if (m_size <= 0) {
+        data_index_step(bs, batches, head_id, num_heads, mb, MB);
+        continue;
+      }
+
+      // get query
+      const scalar_t* __restrict__ q_ptr = q_extend + (seq_extend_start_loc + m) * q_strideM + head_id * q_strideH;
+
+      // init v', s' and m'
+      fill_stub(v_prime, 0.f, m_size * head_size_v);
+      fill_stub(s_prime, 0.f, m_size);
+      fill_stub(m_prime, -std::numeric_limits<scalar_t>::infinity(), m_size);
+
+      // stage 1: compute scores with prefix
+      for (int n = 0; n < seq_len_prefix; n += BLOCK_N) {
+        int n_size = std::min(BLOCK_N, seq_len_prefix - n);
+
+        // `n_size` is K in 2nd gemm, pad to TILE_K;
+        const int padded_n_size = div_up(n_size, TILE_K) * TILE_K;
+
+        // get key and pack
+        pack_vnni<scalar_t, index_t>(
+            /*    dst */ Btmp,
+            /*    src */ k_buffer + head_kv_id * k_strideH,
+            /*    ind */ req_to_token + req_pool_id * max_context_len + n,
+            /*     N  */ n_size,
+            /*     K  */ head_size,
+            /* ld_src */ k_strideN,
+            /* ld_dst */ BLOCK_N);
+
+        // calculate s_i <- Q @ K
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ head_size,
+            /* lda   */ q_strideM,
+            /* ldb   */ BLOCK_N,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ q_ptr,
+            /* B     */ Btmp,
+            /* C     */ s_i);
+
+        const Vec scale_vec = Vec(scaling);
+        for (int row = 0; row < m_size; ++row) {
+          // s_i <- s_i * scale
+          at::vec::map<float>(
+              [scale_vec](Vec x) { return x * scale_vec; }, s_i + row * BLOCK_N, s_i + row * BLOCK_N, n_size);
+
+          // TODO: `tanh` from torch uses sleef u10, going to be slow
+          if (has_logit_cap) {
+            at::vec::map<float>(
+                [logit_cap, rlogit_cap](Vec x) { return Vec(logit_cap) * (x * Vec(rlogit_cap)).tanh(); },
+                s_i + row * BLOCK_N,
+                s_i + row * BLOCK_N,
+                n_size);
+          }
+
+          // m_i: max value per row
+          float m_i = at::vec::reduce_all<float>(
+              [](Vec& x, Vec& y) { return at::vec::maximum(x, y); }, s_i + row * BLOCK_N, n_size);
+          m_i = std::max(m_i, m_prime[row]);
+
+          // m_delta <- exp(m' - m_i)
+          float m_delta = std::exp(m_prime[row] - m_i);
+
+          // s_delta <- exp(s_i - m_i)
+          at::vec::map<float>(
+              [m_i](Vec x) { return (x - Vec(m_i)).exp_u20(); }, s_delta + row * BLOCK_N, s_i + row * BLOCK_N, n_size);
+
+          // s' <- s' * m_delta + sum(s_delta)
+          s_prime[row] *= m_delta;
+          s_prime[row] +=
+              at::vec::reduce_all<float>([](Vec& x, Vec& y) { return x + y; }, s_delta + row * BLOCK_N, n_size);
+
+          m_prime[row] = m_i;
+
+          // v' <- v' * m_delta
+          at::vec::map<float>(
+              [m_delta](Vec x) { return x * Vec(m_delta); },
+              v_prime + row * head_size_v,
+              v_prime + row * head_size_v,
+              head_size_v);
+
+          // pad s_delta with 0 first and then convert to scalar_t
+          fill_stub(s_delta + row * BLOCK_N + n_size, 0.f, padded_n_size - n_size);
+          copy_stub<scalar_t, BLOCK_N>(s_delta2 + row * BLOCK_N, s_delta + row * BLOCK_N);
+        }
+
+        // get value and pack
+        pack_vnni2<scalar_t, index_t>(
+            /*    dst */ Btmp,
+            /*    src */ v_buffer + head_kv_id * v_strideH,
+            /*    ind */ req_to_token + req_pool_id * max_context_len + n,
+            /*     K  */ n_size,
+            /*     N  */ head_size_v,
+            /* ld_src */ v_strideN,
+            /* ld_dst */ head_size_v);
+
+        // calculate V' <- s_delta @ V + V'
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ head_size_v,
+            /* K     */ padded_n_size,  // n_size
+            /* lda   */ BLOCK_N,
+            /* ldb   */ head_size_v,
+            /* ldc   */ head_size_v,
+            /* add_C */ true,
+            /* A     */ s_delta2,
+            /* B     */ Btmp,
+            /* C     */ v_prime);
+      }  // loop with seq_len_prefix
+
+      // stage 2: compute the triangle part
+      int num_keys = std::min(seq_len_extend, m + BLOCK_M);
+      for (int n = 0; n < num_keys; n += BLOCK_N) {
+        int n_size = std::min(BLOCK_N, num_keys - n);
+
+        // `n_size` is K in 2nd gemm, pad to TILE_K;
+        const int padded_n_size = div_up(n_size, TILE_K) * TILE_K;
+
+        // get key and pack
+        pack_vnni<scalar_t, index_t>(
+            /*    dst */ Btmp,
+            /*    src */ k_extend + (seq_extend_start_loc + n) * ke_strideN + head_kv_id * ke_strideH,
+            /*    ind */ nullptr,
+            /*     N  */ n_size,
+            /*     K  */ head_size,
+            /* ld_src */ ke_strideN,
+            /* ld_dst */ BLOCK_N);
+
+        // calculate s_i <- Q @ K
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ head_size,
+            /* lda   */ q_strideM,
+            /* ldb   */ BLOCK_N,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ q_ptr,
+            /* B     */ Btmp,
+            /* C     */ s_i);
+
+        // apply causal mask
+        if (num_keys - n <= BLOCK_N) {
+          for (int row = 0; row < m_size; ++row) {
+            int last_col = m + row - n;
+            // fill [last_col + 1, n_size) to -inf
+            float* row_ptr = s_i + row * BLOCK_N;
+            fill_stub(row_ptr + last_col + 1, -std::numeric_limits<float>::infinity(), n_size - last_col - 1);
+          }
+        }
+
+        const Vec scale_vec = Vec(scaling);
+        for (int row = 0; row < m_size; ++row) {
+          // s_i <- s_i * scale
+          at::vec::map<float>(
+              [scale_vec](Vec x) { return x * scale_vec; }, s_i + row * BLOCK_N, s_i + row * BLOCK_N, n_size);
+
+          // TODO: `tanh` from torch uses sleef u10, going to be slow
+          if (has_logit_cap) {
+            at::vec::map<float>(
+                [logit_cap, rlogit_cap](Vec x) { return Vec(logit_cap) * (x * Vec(rlogit_cap)).tanh(); },
+                s_i + row * BLOCK_N,
+                s_i + row * BLOCK_N,
+                n_size);
+          }
+
+          // m_i: max value per row
+          float m_i = at::vec::reduce_all<float>(
+              [](Vec& x, Vec& y) { return at::vec::maximum(x, y); }, s_i + row * BLOCK_N, n_size);
+          m_i = std::max(m_i, m_prime[row]);
+
+          // m_delta <- exp(m' - m_i)
+          float m_delta = std::exp(m_prime[row] - m_i);
+
+          // s_delta <- exp(s_i - m_i)
+          at::vec::map<float>(
+              [m_i](Vec x) { return (x - Vec(m_i)).exp_u20(); }, s_delta + row * BLOCK_N, s_i + row * BLOCK_N, n_size);
+
+          // s' <- s' * m_delta + sum(s_delta)
+          s_prime[row] *= m_delta;
+          s_prime[row] +=
+              at::vec::reduce_all<float>([](Vec& x, Vec& y) { return x + y; }, s_delta + row * BLOCK_N, n_size);
+
+          m_prime[row] = m_i;
+
+          // v' <- v' * m_delta
+          at::vec::map<float>(
+              [m_delta](Vec x) { return x * Vec(m_delta); },
+              v_prime + row * head_size_v,
+              v_prime + row * head_size_v,
+              head_size_v);
+
+          // pad s_delta with 0 first and then convert to scalar_t
+          fill_stub(s_delta + row * BLOCK_N + n_size, 0.f, padded_n_size - n_size);
+          copy_stub<scalar_t, BLOCK_N>(s_delta2 + row * BLOCK_N, s_delta + row * BLOCK_N);
+        }
+
+        // get value and pack
+        pack_vnni2<scalar_t, index_t>(
+            /*    dst */ Btmp,
+            /*    src */ v_extend + (seq_extend_start_loc + n) * ve_strideN + head_kv_id * ve_strideH,
+            /*    ind */ nullptr,
+            /*     K  */ n_size,
+            /*     N  */ head_size_v,
+            /* ld_src */ ve_strideN,
+            /* ld_dst */ head_size_v);
+
+        // calculate V' <- s_delta @ V + V'
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ head_size_v,
+            /* K     */ padded_n_size,  // n_size
+            /* lda   */ BLOCK_N,
+            /* ldb   */ head_size_v,
+            /* ldc   */ head_size_v,
+            /* add_C */ true,
+            /* A     */ s_delta2,
+            /* B     */ Btmp,
+            /* C     */ v_prime);
+      }  // loop with seq_len_extend
+
+      scalar_t* __restrict__ out_ptr = o_extend + (seq_extend_start_loc + m) * o_strideM + head_id * o_strideH;
+      for (int row = 0; row < m_size; ++row) {
+        float s = 1 / s_prime[row];
+        copy_stub<scalar_t>(out_ptr + row * o_strideM, v_prime + row * head_size_v, s, head_size_v);
+      }
+
+      // move to the next index
+      data_index_step(bs, batches, head_id, num_heads, mb, MB);
+    }
+    at::native::cpublas::brgemm_release();
+  });
+}
+
+}  // anonymous namespace
+
+// q_extend, k_extend, v_extend, o_extend: contiguous tensors
+// k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
+//
+// q_extend: [num_tokens, num_heads, head_size]
+// k_extend: [num_extend_tokens, num_heads, head_size]
+// v_extend: [num_extend_tokens, num_heads, head_size]
+// o_extend: [num_tokens, num_heads, head_size]
+// k_buffer: [max_total_num_tokens, num_heads, head_size]
+// v_buffer: [max_total_num_tokens, num_heads, head_size]
+// req_to_token: [max_num_reqs, max_context_len] int32 or int64
+// req_pool_indices: [num_seqs] int64
+// seq_lens: [num_seqs] int64
+// extend_seq_lens: [num_seqs]
+// extend_start_loc: [num_seqs]
+//
+void extend_attention_cpu(
+    at::Tensor& q_extend,
+    at::Tensor& k_extend,
+    at::Tensor& v_extend,
+    at::Tensor& o_extend,
+    at::Tensor& k_buffer,
+    at::Tensor& v_buffer,
+    at::Tensor& req_to_token,
+    at::Tensor& req_pool_indices,
+    at::Tensor& seq_lens,
+    at::Tensor& extend_seq_lens,
+    at::Tensor& extend_start_loc,
+    int64_t max_len_extend,
+    double sm_scale,
+    double logit_cap) {
+  RECORD_FUNCTION(
+      "sgl-kernel::extend_attention_cpu",
+      std::vector<c10::IValue>(
+          {q_extend,
+           k_extend,
+           v_extend,
+           o_extend,
+           k_buffer,
+           v_buffer,
+           req_to_token,
+           req_pool_indices,
+           seq_lens,
+           extend_seq_lens,
+           extend_start_loc}));
+
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(q_extend);
+  CHECK_INPUT(o_extend);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(k_extend);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(v_extend);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(k_buffer);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(v_buffer);
+
+  int num_seqs = seq_lens.size(0);
+  int max_num_reqs = req_to_token.size(0);
+  int max_context_len = req_to_token.size(1);
+  int max_total_num_tokens = k_buffer.size(0);
+
+  int num_heads = q_extend.size(1);
+  int num_heads_kv = k_extend.size(1);
+  int head_size = q_extend.size(2);
+  int head_size_v = v_extend.size(2);
+
+  // strides for q_extend, k_extend and v_extend
+  int q_strideM = q_extend.stride(0);
+  int q_strideH = q_extend.stride(1);
+  int ke_strideN = k_extend.stride(0);
+  int ke_strideH = k_extend.stride(1);
+  int ve_strideN = v_extend.stride(0);
+  int ve_strideH = v_extend.stride(1);
+
+  // strides for k_buffer and v_buffer
+  int k_strideN = k_buffer.stride(0);
+  int k_strideH = k_buffer.stride(1);
+  int v_strideN = v_buffer.stride(0);
+  int v_strideH = v_buffer.stride(1);
+
+  // check sizes
+  CHECK_EQ(req_pool_indices.size(0), num_seqs);
+  CHECK_EQ(extend_seq_lens.size(0), num_seqs);
+  CHECK_EQ(extend_start_loc.size(0), num_seqs);
+  CHECK_EQ(v_extend.size(1), num_heads_kv);
+  CHECK_EQ(k_buffer.size(1), v_buffer.size(1));
+
+  // MLA will skip prefix part
+  const bool is_prefix_skipped = k_buffer.size(1) != num_heads_kv;
+
+  // check index data types
+  const auto index_dtype = req_to_token.scalar_type();
+  TORCH_CHECK(
+      index_dtype == at::kInt || index_dtype == at::kLong,
+      "extend: expect req_to_token to be int32 or int64, got ",
+      index_dtype);
+  TORCH_CHECK(seq_lens.scalar_type() == at::kLong, "extend: expect req_lens to be int64, got ", seq_lens.scalar_type());
+  TORCH_CHECK(
+      req_pool_indices.scalar_type() == at::kLong,
+      "extend: expect req_pool_indices to be int64, got ",
+      req_pool_indices.scalar_type());
+  TORCH_CHECK(
+      extend_seq_lens.scalar_type() == index_dtype && extend_start_loc.scalar_type() == index_dtype,
+      "extend: expect extend_seq_lens and extend_start_loc to have same dtype as req_to_token.");
+
+  // D and DV need to be 32x as we transpose by 512-bit
+  TORCH_CHECK(head_size % 32 == 0, "invalid head_size ", head_size);
+  TORCH_CHECK(head_size_v % 32 == 0, "invalid head_size_v ", head_size_v);
+
+  // block size for query seq length
+  constexpr int BLOCK_M = 32;
+  // block size for key/value seq length
+  constexpr int BLOCK_N = 32;
+
+  const int size_per_thread =
+      /* s_i     */ BLOCK_M * BLOCK_N * sizeof(float) +
+      /* v_prime */ BLOCK_M * head_size_v * sizeof(float) +
+      /* s_delta */ BLOCK_M * BLOCK_N * sizeof(uint16_t) +
+      /* Btmp    */ BLOCK_N * std::max(head_size, head_size_v) * sizeof(uint16_t);
+
+  int num_threads = at::get_num_threads();
+  auto buffer = at::empty({num_threads, size_per_thread}, q_extend.options().dtype(at::kChar));
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(q_extend.scalar_type(), "extend_attention_kernel", [&] {
+    AT_DISPATCH_INDEX_TYPES(index_dtype, "extend_attention_indices", [&] {
+      extend_attention_kernel_impl<scalar_t, index_t, BLOCK_M, BLOCK_N>(
+          o_extend.data_ptr<scalar_t>(),
+          q_extend.data_ptr<scalar_t>(),
+          k_extend.data_ptr<scalar_t>(),
+          v_extend.data_ptr<scalar_t>(),
+          k_buffer.data_ptr<scalar_t>(),
+          v_buffer.data_ptr<scalar_t>(),
+          req_to_token.data_ptr<index_t>(),
+          req_pool_indices.data_ptr<int64_t>(),
+          seq_lens.data_ptr<int64_t>(),
+          extend_seq_lens.data_ptr<index_t>(),
+          extend_start_loc.data_ptr<index_t>(),
+          buffer.data_ptr(),
+          num_seqs,
+          num_heads,
+          num_heads_kv,
+          head_size,
+          head_size_v,
+          q_strideM,
+          q_strideH,
+          ke_strideN,
+          ke_strideH,
+          ve_strideN,
+          ve_strideH,
+          k_strideN,
+          k_strideH,
+          v_strideN,
+          v_strideH,
+          sm_scale,
+          logit_cap,
+          max_num_reqs,
+          max_context_len,
+          max_total_num_tokens,
+          max_len_extend,
+          size_per_thread,
+          is_prefix_skipped);
+    });
+  });
+}
diff --git a/sgl-kernel/csrc/cpu/gemm.cpp b/sgl-kernel/csrc/cpu/gemm.cpp
new file mode 100644
index 000000000..48655b9f7
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/gemm.cpp
@@ -0,0 +1,525 @@
+#include "gemm.h"
+
+#include "common.h"
+#include "vec.h"
+
+namespace {
+
+// packed   layout:
+//   quants {N, K}  int8_t
+//   comp   {N}     int32_t
+template <int BLOCK_N>
+inline void s8s8_compensation(int8_t* __restrict__ packed, int K) {
+#if defined(CPU_CAPABILITY_AVX512)
+  constexpr int COLS = BLOCK_N / 16;
+  __m512i vcomp[COLS];
+
+  for (int col = 0; col < COLS; ++col) {
+    vcomp[col] = _mm512_setzero_si512();
+  }
+
+  const int64_t offset = BLOCK_N * K;
+  const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80));
+  for (int k = 0; k < K / 4; ++k) {
+    for (int col = 0; col < COLS; ++col) {
+      __m512i vb = _mm512_loadu_si512((const __m512i*)(packed + k * BLOCK_N * 4 + col * 64));
+      vcomp[col] = _mm512_dpbusd_epi32(vcomp[col], off, vb);
+    }
+  }
+
+  for (int col = 0; col < COLS; ++col) {
+    _mm512_storeu_si512((__m512i*)(packed + offset + col * 64), vcomp[col]);
+  }
+#else
+  TORCH_CHECK(false, "s8s8_compensation not implemented!");
+#endif
+}
+
+// convert to vnni format
+// from [N, K] to [K/2, N, 2] for bfloat16 and float16
+template <typename packed_t>
+inline void pack_vnni(packed_t* __restrict__ packed, const packed_t* __restrict__ weight, int N, int K) {
+  const int VNNI_BLK = 2;
+  for (int n = 0; n < N; ++n) {
+    for (int k = 0; k < K / VNNI_BLK; ++k) {
+      for (int d = 0; d < VNNI_BLK; ++d) {
+        packed[k * N * VNNI_BLK + n * VNNI_BLK + d] = weight[n * K + k * VNNI_BLK + d];
+      }
+    }
+  }
+}
+
+template <>
+inline void pack_vnni<int8_t>(int8_t* __restrict__ packed, const int8_t* __restrict__ weight, int N, int K) {
+  constexpr int BLOCK_N = block_size_n();
+  TORCH_CHECK(N == BLOCK_N);
+
+  const int VNNI_BLK = 4;
+  for (int n = 0; n < N; ++n) {
+    for (int k = 0; k < K / VNNI_BLK; ++k) {
+      for (int d = 0; d < VNNI_BLK; ++d) {
+        packed[k * N * VNNI_BLK + n * VNNI_BLK + d] = weight[n * K + k * VNNI_BLK + d];
+      }
+    }
+  }
+  s8s8_compensation<BLOCK_N>(packed, K);
+}
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec data0 = fVec::loadu(input + d);
+    fVec data1 = fVec::loadu(input + d + fVec::size());
+    bVec out_vec = convert_from_float_ext<scalar_t>(data0, data1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d]);
+  }
+}
+
+template <typename scalar_t>
+inline void copy_add_stub(
+    scalar_t* __restrict__ out, const float* __restrict__ input, const float* __restrict__ bias, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec data0 = fVec::loadu(input + d) + fVec::loadu(bias + d);
+    fVec data1 = fVec::loadu(input + d + fVec::size()) + fVec::loadu(bias + d + fVec::size());
+    bVec out_vec = convert_from_float_ext<scalar_t>(data0, data1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] + bias[d]);
+  }
+}
+
+template <typename scalar_t, bool has_bias, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn {
+  static inline void apply(
+      const scalar_t* __restrict__ A,
+      const scalar_t* __restrict__ B,
+      scalar_t* __restrict__ C,
+      const float* __restrict__ bias,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <bool has_bias, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const at::BFloat16* __restrict__ A,
+      const at::BFloat16* __restrict__ B,
+      at::BFloat16* __restrict__ C,
+      const float* __restrict__ bias,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+
+    // prefetch distance
+    constexpr int PREFETCH_SIZE_K = 0;
+
+    __m512bh va;
+    __m512bh vb[COLS];
+    __m512 vc[ROWS * COLS];
+
+    auto loadc = [&](auto i) {
+      constexpr int col = i % COLS;
+      if constexpr (has_bias) {
+        vc[i] = _mm512_loadu_ps(bias + col * 16);
+      } else {
+        vc[i] = _mm512_set1_ps(0.f);
+      }
+    };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    const int64_t K2 = K >> 1;
+    const int64_t lda2 = lda >> 1;
+    const int64_t ldb2 = ldb;  // ldb * 2 >> 1;
+    const float* a_ptr = reinterpret_cast<const float*>(A);
+    const float* b_ptr = reinterpret_cast<const float*>(B);
+
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k]));
+      }
+      if constexpr (row == 0) {
+        vb[col] = (__m512bh)(_mm512_loadu_si512(b_ptr + k * ldb2 + col * 16));
+        if constexpr (PREFETCH_SIZE_K > 0) {
+          _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0);
+        }
+      }
+      vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]);
+    };
+    for (int64_t k = 0; k < K2; ++k) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+      // for COLS = 2, 4 use 512bit store
+      // for COLS = 1, 3 use 256bit store
+      if constexpr (COLS % 2 == 0) {
+        if constexpr (col % 2 == 0) {
+          _mm512_storeu_si512(
+              reinterpret_cast<__m512i*>((C + row * ldc + col * 16)),
+              (__m512i)(_mm512_cvtne2ps_pbh(vc[row * COLS + col + 1], vc[row * COLS + col])));
+        }
+      } else {
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(C + row * ldc + col * 16), (__m256i)(_mm512_cvtneps_pbh(vc[i])));
+      }
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE)                \
+  tinygemm_kernel_nn<scalar_t, has_bias, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda,                                          \
+      B + nb_start * 2,                                            \
+      C + mb_start * ldc + nb_start,                               \
+      has_bias ? bias + nb_start : nullptr,                        \
+      K,                                                           \
+      lda,                                                         \
+      ldb,                                                         \
+      ldc);
+
+template <typename scalar_t, bool has_bias>
+struct brgemm {
+  static inline void apply(
+      const scalar_t* __restrict__ A,
+      const scalar_t* __restrict__ B,
+      scalar_t* __restrict__ C,
+      float* __restrict__ Ctmp,
+      const float* __restrict__ bias,
+      int64_t M,
+      int64_t N,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    constexpr int BLOCK_N = block_size_n();
+    at::native::cpublas::brgemm(M, N, K, lda, ldb, BLOCK_N, /* add_C */ false, A, B, Ctmp);
+
+    // copy from Ctmp to C
+    for (int64_t m = 0; m < M; ++m) {
+      if constexpr (has_bias) {
+        copy_add_stub(C + m * ldc, Ctmp + m * BLOCK_N, bias, N);
+      } else {
+        copy_stub(C + m * ldc, Ctmp + m * BLOCK_N, N);
+      }
+    }
+  }
+};
+
+template <typename scalar_t, bool has_bias>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const scalar_t* __restrict__ B,
+    scalar_t* __restrict__ C,
+    float* __restrict__ Ctmp,
+    const float* __restrict__ bias,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg) {
+  if (brg) {
+    brgemm<scalar_t, has_bias>::apply(A, B, C, Ctmp, bias, M, N, K, lda, ldb, ldc);
+    return;
+  }
+
+  // pattern: 1-4-16, N = 16, 32, 48, 64
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 64;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+  for (int mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        // mb_size = 1
+        case 0x11:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 16);
+          break;
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 32);
+          break;
+        case 0x13:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 48);
+          break;
+        case 0x14:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 64);
+          break;
+        // mb_size = 2
+        case 0x21:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 16);
+          break;
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 32);
+          break;
+        case 0x23:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 48);
+          break;
+        case 0x24:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 64);
+          break;
+        // mb_size = 3
+        case 0x31:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 16);
+          break;
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 32);
+          break;
+        case 0x33:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 48);
+          break;
+        case 0x34:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 64);
+          break;
+        // mb_size = 4
+        case 0x41:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 16);
+          break;
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 32);
+          break;
+        case 0x43:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 48);
+          break;
+        case 0x44:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 64);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, " x ", nb_size);
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+void weight_packed_linear_kernel_impl(
+    scalar_t* __restrict__ out,
+    const scalar_t* __restrict__ mat1,
+    const scalar_t* __restrict__ mat2,
+    const float* __restrict__ bias,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t mat1_strideM,
+    int64_t out_strideM) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  const bool use_brgemm = can_use_brgemm<scalar_t>(M);
+
+  // parallel on [MB, NB]
+  AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] {
+    parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+      // for brgemm, use float32 for accumulate
+      alignas(64) float Ctmp[BLOCK_M * BLOCK_N];
+
+      loop_2d<scalar_t>(mb0, mb1, nb0, nb1, BLOCK_N * K, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+        int64_t mb_start = mb * BLOCK_M;
+        int64_t mb_size = std::min(M - mb_start, BLOCK_M);
+        int64_t nb_start = nb * BLOCK_N;
+        int64_t nb_size = std::min(N - nb_start, BLOCK_N);
+
+        tinygemm_kernel<scalar_t, has_bias>(
+            /*   A */ mat1 + mb_start * mat1_strideM,
+            /*   B */ mat2 + nb_start * K /* nb * BLOCK_N * K */,
+            /*   C */ out + mb_start * out_strideM + nb_start,
+            /* Ctmp*/ Ctmp,
+            /* bias*/ bias + nb_start,
+            /*   M */ mb_size,
+            /*   N */ nb_size,
+            /*   K */ K,
+            /* lda */ mat1_strideM,
+            /* ldb */ nb_size,
+            /* ldc */ out_strideM,
+            /* brg */ use_brgemm);
+      });
+
+      if (use_brgemm) {
+        at::native::cpublas::brgemm_release();
+      }
+    });
+  });
+}
+
+}  // anonymous namespace
+
+// tinygemm interface
+template <typename scalar_t>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const scalar_t* __restrict__ B,
+    scalar_t* __restrict__ C,
+    float* __restrict__ Ctmp,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg) {
+  tinygemm_kernel<scalar_t, false>(A, B, C, Ctmp, nullptr, M, N, K, lda, ldb, ldc, brg);
+}
+
+#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \
+  template void tinygemm_kernel<TYPE>(      \
+      const TYPE* __restrict__ A,           \
+      const TYPE* __restrict__ B,           \
+      TYPE* __restrict__ C,                 \
+      float* __restrict__ Ctmp,             \
+      int64_t M,                            \
+      int64_t N,                            \
+      int64_t K,                            \
+      int64_t lda,                          \
+      int64_t ldb,                          \
+      int64_t ldc,                          \
+      bool brg)
+
+INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16);
+INSTANTIATE_TINYGEMM_TEMPLATE(at::Half);
+
+at::Tensor convert_weight_packed(at::Tensor& weight) {
+  // for 3d moe weights
+  // weight : [E, OC, IC]
+  //     w1 : [E, 2N,  K]
+  //     w2 : [E,  K,  N]
+  CHECK_INPUT(weight);
+
+  const int64_t ndim = weight.ndimension();
+  TORCH_CHECK(ndim == 2 || ndim == 3, "expect weight to be 2d or 3d, got ", ndim, "d tensor.");
+  const auto st = weight.scalar_type();
+  const int64_t E = ndim == 3 ? weight.size(0) : 1;
+  const int64_t OC = ndim == 3 ? weight.size(1) : weight.size(0);
+  const int64_t IC = ndim == 3 ? weight.size(2) : weight.size(1);
+
+  // we handle 2 TILE_N at a time.
+  TORCH_CHECK(OC % TILE_N == 0, "invalid weight out features ", OC);
+  TORCH_CHECK(IC % TILE_K == 0, "invalid weight input features ", IC);
+
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t NB = div_up(OC, BLOCK_N);
+
+  // use phony sizes here [E, OC, IC], for each [E], [OC, IC] -> [IC / 2, OC, 2]
+  auto packed_weight = at::empty({}, weight.options());
+  const int64_t stride = OC * IC;
+
+  TORCH_CHECK(
+      st == at::kBFloat16 || st == at::kHalf || st == at::kChar || st == at::kFloat8_e4m3fn,
+      "expect weight to be bfloat16, float16, int8 or fp8_e4m3.");
+
+  CPU_DISPATCH_PACKED_TYPES(st, [&] {
+    // adjust most inner dimension size
+    const int packed_row_size = get_row_size<packed_t>(IC);
+    auto sizes = weight.sizes().vec();
+    sizes[ndim - 1] = packed_row_size;
+    packed_weight.resize_(sizes);
+
+    const packed_t* w_data = weight.data_ptr<packed_t>();
+    packed_t* packed_data = packed_weight.data_ptr<packed_t>();
+
+    // parallel on {E, NB}
+    at::parallel_for(0, E * NB, 0, [&](int64_t begin, int64_t end) {
+      int64_t e{0}, nb{0};
+      data_index_init(begin, e, E, nb, NB);
+
+      for (int64_t i = begin; i < end; ++i) {
+        UNUSED(i);
+
+        int64_t n = nb * BLOCK_N;
+        int64_t n_size = std::min(BLOCK_N, OC - n);
+        pack_vnni<packed_t>(
+            packed_data + e * OC * packed_row_size + n * packed_row_size, w_data + e * stride + n * IC, n_size, IC);
+
+        // move to the next index
+        data_index_step(e, E, nb, NB);
+      }
+    });
+  });
+  return packed_weight;
+}
+
+// mat1 : [M, K]
+// mat2 : [N, K]
+// bias : [N]
+// out  : [M, N]
+//
+at::Tensor
+weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, const std::optional<at::Tensor>& bias, bool is_vnni) {
+  RECORD_FUNCTION("sgl-kernel::weight_packed_linear", std::vector<c10::IValue>({mat1, mat2, bias}));
+
+  auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
+
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1);
+  CHECK_INPUT(mat2);
+
+  int64_t M = mat1.size(0);
+  int64_t N = mat2.size(0);
+  int64_t K = mat2.size(1);
+  CHECK_EQ(mat1.size(1), K);
+  CHECK_DIM(2, mat1);
+  CHECK_DIM(2, mat2);
+
+  auto out = at::empty({M, N}, mat1.options());
+
+  // strides
+  int64_t mat1_strideM = mat1.stride(0);
+  int64_t out_strideM = out.stride(0);
+
+  const bool has_bias = bias.has_value();
+  const float* bias_data = nullptr;
+  if (has_bias) {
+    CHECK_EQ(bias.value().size(0), N);
+    bias_data = bias.value().data_ptr<float>();
+  }
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(mat1.scalar_type(), "weight_packed_linear_kernel_impl", [&] {
+    weight_packed_linear_kernel_impl<scalar_t>(
+        out.data_ptr<scalar_t>(),
+        mat1.data_ptr<scalar_t>(),
+        packed_w.data_ptr<scalar_t>(),
+        bias_data,
+        M,
+        N,
+        K,
+        mat1_strideM,
+        out_strideM);
+  });
+
+  return out;
+}
diff --git a/sgl-kernel/csrc/cpu/gemm.h b/sgl-kernel/csrc/cpu/gemm.h
new file mode 100644
index 000000000..6a16a2985
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/gemm.h
@@ -0,0 +1,202 @@
+#pragma once
+#include <ATen/native/CPUBlas.h>
+
+#include "common.h"
+
+// amx-bf16
+#define TILE_M 16
+#define TILE_N 16
+#define TILE_K 32
+
+// block size for AMX gemm
+constexpr int block_size_m() {
+  return 2 * TILE_M;
+}
+constexpr int block_size_n() {
+  return 2 * TILE_N;
+}
+
+// define threshold using brgemm (intel AMX)
+template <typename T>
+inline bool can_use_brgemm(int M);
+template <>
+inline bool can_use_brgemm<at::BFloat16>(int M) {
+  return M > 4;
+}
+template <>
+inline bool can_use_brgemm<at::Half>(int M) {
+  return true;
+}
+// this requires PyTorch 2.7 or above
+template <>
+inline bool can_use_brgemm<int8_t>(int M) {
+  return M > 4;
+}
+
+template <>
+inline bool can_use_brgemm<at::Float8_e4m3fn>(int M) {
+  return M > 4;
+}
+
+// work around compiler internal error
+#define BLOCK_K 128  // 4 * TILE_K
+
+// adjust leading dimension size for K
+template <typename T>
+inline int64_t get_row_size(int64_t K) {
+  return K;
+}
+
+template <>
+inline int64_t get_row_size<int8_t>(int64_t K) {
+  return K + sizeof(int32_t);
+}
+
+inline int64_t get_row_size(int64_t K, bool use_int8_w8a8) {
+  return use_int8_w8a8 ? K + sizeof(int32_t) : K;
+}
+
+// pack weight to vnni format
+at::Tensor convert_weight_packed(at::Tensor& weight);
+
+// moe implementations for int8 w8a8
+template <typename scalar_t>
+void fused_experts_int8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ ic2,
+    uint8_t* __restrict__ A_tmp,
+    float* __restrict__ C_tmp,
+    uint8_t* __restrict__ Aq_tmp,
+    float* __restrict__ As_tmp,
+    const scalar_t* __restrict__ input,
+    const int8_t* __restrict__ packed_w1,
+    const int8_t* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    const float* __restrict__ topk_weights,
+    const int32_t* __restrict__ sorted_ids,
+    const int32_t* __restrict__ expert_ids,
+    const int32_t* __restrict__ offsets,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t E,
+    int64_t topk,
+    int64_t num_tokens_post_pad);
+
+// moe implementations for fp8 w8a16
+template <typename scalar_t>
+void fused_experts_fp8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic0,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ ic2,
+    scalar_t* __restrict__ A_tmp,
+    scalar_t* __restrict__ B_tmp,
+    float* __restrict__ C_tmp,
+    const scalar_t* __restrict__ input,
+    const at::Float8_e4m3fn* __restrict__ packed_w1,
+    const at::Float8_e4m3fn* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    int64_t block_size_N,
+    int64_t block_size_K,
+    const float* __restrict__ topk_weights,
+    const int32_t* __restrict__ sorted_ids,
+    const int32_t* __restrict__ expert_ids,
+    const int32_t* __restrict__ offsets,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t E,
+    int64_t topk,
+    int64_t num_tokens_post_pad);
+
+// shared expert implementation for int8 w8a8
+template <typename scalar_t>
+void shared_expert_int8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic1,
+    float* __restrict__ C_tmp,
+    uint8_t* __restrict__ Aq_tmp,
+    float* __restrict__ As_tmp,
+    const scalar_t* __restrict__ input,
+    const int8_t* __restrict__ packed_w1,
+    const int8_t* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    const scalar_t* __restrict__ fused_experts_out,
+    float routed_scaling_factor,
+    int64_t M,
+    int64_t N,
+    int64_t K);
+
+template <typename scalar_t>
+void shared_expert_fp8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic0,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ B_tmp,
+    float* __restrict__ C_tmp,
+    const scalar_t* __restrict__ input,
+    const at::Float8_e4m3fn* __restrict__ packed_w1,
+    const at::Float8_e4m3fn* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    int64_t block_size_N,
+    int64_t block_size_K,
+    const scalar_t* __restrict__ fused_experts_out,
+    float routed_scaling_factor,
+    int64_t M,
+    int64_t N,
+    int64_t K);
+
+// tinygemm interface
+template <typename scalar_t>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const scalar_t* __restrict__ B,
+    scalar_t* __restrict__ C,
+    float* __restrict__ Ctmp,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg);
+
+template <typename scalar_t>
+void tinygemm_kernel(
+    const uint8_t* __restrict__ A,
+    const int8_t* __restrict__ B,
+    scalar_t* __restrict__ C,
+    int32_t* __restrict__ Ctmp,
+    const float* __restrict__ As,
+    const float* __restrict__ Bs,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg);
+
+template <typename scalar_t>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const at::Float8_e4m3fn* __restrict__ B,
+    scalar_t* __restrict__ C,
+    scalar_t* __restrict__ Btmp,
+    float* __restrict__ Ctmp,
+    const float* __restrict__ scale,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg,
+    int64_t block_size_K,
+    bool do_unpack = true);
diff --git a/sgl-kernel/csrc/cpu/gemm_fp8.cpp b/sgl-kernel/csrc/cpu/gemm_fp8.cpp
new file mode 100644
index 000000000..008f83298
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/gemm_fp8.cpp
@@ -0,0 +1,551 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const float* __restrict__ input, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec data0 = fVec::loadu(input + d);
+    fVec data1 = fVec::loadu(input + d + fVec::size());
+    bVec out_vec = convert_from_float_ext<scalar_t>(data0, data1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d]);
+  }
+}
+
+template <typename scalar_t>
+inline void copy_add_stub(
+    scalar_t* __restrict__ out, const float* __restrict__ input, const float* __restrict__ bias, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec data0 = fVec::loadu(input + d) + fVec::loadu(bias + d);
+    fVec data1 = fVec::loadu(input + d + fVec::size()) + fVec::loadu(bias + d + fVec::size());
+    bVec out_vec = convert_from_float_ext<scalar_t>(data0, data1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] + bias[d]);
+  }
+}
+
+inline void unpack_B(
+    at::BFloat16* __restrict__ Btmp,
+    const at::Float8_e4m3fn* __restrict__ packed_B,
+    int N,
+    int K,
+    int ldb,
+    int ldb_tmp,
+    float scale) {
+#if defined(CPU_CAPABILITY_AVX512)
+  // [K/2, N, 2]
+  const int K2 = K >> 1;
+  const int ldb2 = ldb;  // ldb * 2 >> 1;
+  const uint16_t* b_ptr = reinterpret_cast<const uint16_t*>(packed_B);
+  const __m512 vd = _mm512_set1_ps(scale);
+
+  constexpr int BLOCK_N = block_size_n();
+  static_assert(BLOCK_N == 32);
+
+  // prefetch distance
+  constexpr int PREFETCH_SIZE_K = 64;
+
+#pragma GCC unroll 4
+  for (int k = 0; k < K2; ++k) {
+    __m512i b8 = _mm512_loadu_si512(b_ptr + k * ldb2);
+    if constexpr (PREFETCH_SIZE_K > 0) {
+      _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2, _MM_HINT_T0);
+    }
+
+    __m256i b8_0 = _mm512_extracti32x8_epi32(b8, 0);
+    __m256i b8_1 = _mm512_extracti32x8_epi32(b8, 1);
+
+    __m512bh bf16_0 = CVT_FP8_TO_BF16(b8_0);
+    __m512bh bf16_1 = CVT_FP8_TO_BF16(b8_1);
+
+    // Apply scale
+    __m512 f0_lo = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_0, 0));
+    __m512 f0_hi = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_0, 1));
+    __m512 f1_lo = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_1, 0));
+    __m512 f1_hi = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32((__m512i)bf16_1, 1));
+
+    f0_lo = _mm512_mul_ps(f0_lo, vd);
+    f0_hi = _mm512_mul_ps(f0_hi, vd);
+    f1_lo = _mm512_mul_ps(f1_lo, vd);
+    f1_hi = _mm512_mul_ps(f1_hi, vd);
+
+    bf16_0 = _mm512_cvtne2ps_pbh(f0_hi, f0_lo);
+    bf16_1 = _mm512_cvtne2ps_pbh(f1_hi, f1_lo);
+
+    _mm512_storeu_si512(Btmp + k * ldb_tmp * 2 + 0, (__m512i)bf16_0);
+    _mm512_storeu_si512(Btmp + k * ldb_tmp * 2 + 32, (__m512i)bf16_1);
+  }
+#else
+  TORCH_CHECK(false, "unpack_B: scalar path not implemented!");
+#endif
+}
+
+template <typename scalar_t, typename packed_t, bool has_bias, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn {
+  static inline void apply(
+      const scalar_t* __restrict__ A,
+      const packed_t* __restrict__ B,
+      scalar_t* __restrict__ C,
+      const float* __restrict__ bias,
+      const float* __restrict__ scale,
+      int K,
+      int lda,
+      int ldb,
+      int ldc,
+      int64_t block_size_K) {
+    TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <bool has_bias, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn<at::BFloat16, at::Float8_e4m3fn, has_bias, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const at::BFloat16* __restrict__ A,
+      const at::Float8_e4m3fn* __restrict__ B,
+      at::BFloat16* __restrict__ C,
+      const float* __restrict__ bias,
+      const float* __restrict__ scale,
+      int K,
+      int lda,
+      int ldb,
+      int ldc,
+      int64_t block_size_K) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+
+    const int KB = div_up(K, BLOCK_K);
+
+    // prefetch distance
+    constexpr int PREFETCH_SIZE_K = 64;
+    constexpr int PREFETCH_SIZE_KB = 1;
+
+    __m512bh va;
+    __m512bh vb[COLS];
+    __m512 vc[ROWS * COLS];
+    __m512 vsum[ROWS * COLS];
+
+    // block quant scale
+    __m512 vscale;
+
+    auto loadc = [&](auto i) {
+      constexpr int col = i % COLS;
+      if constexpr (has_bias) {
+        vc[i] = _mm512_loadu_ps(bias + col * 16);
+      } else {
+        vc[i] = _mm512_setzero_ps();
+      }
+    };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    const int lda2 = lda >> 1;
+    const int ldb2 = ldb;  // ldb * 2 >> 1;
+    const float* a_ptr = reinterpret_cast<const float*>(A);
+    const uint16_t* b_ptr = reinterpret_cast<const uint16_t*>(B);
+
+    auto compute = [&](auto i, int k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k]));
+        if constexpr (PREFETCH_SIZE_K > 0) {
+          _mm_prefetch(a_ptr + row * lda2 + k + PREFETCH_SIZE_K, _MM_HINT_T0);
+        }
+      }
+      if constexpr (row == 0) {
+        if constexpr (col % 2 == 0) {
+          __m512i b8 = _mm512_loadu_si512(b_ptr + k * ldb2 + col * 16);
+          if constexpr (PREFETCH_SIZE_K > 0) {
+            _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0);
+          }
+          vb[col + 0] = CVT_FP8_TO_BF16(_mm512_extracti32x8_epi32(b8, 0));
+          vb[col + 1] = CVT_FP8_TO_BF16(_mm512_extracti32x8_epi32(b8, 1));
+        }
+      }
+      vsum[i] = _mm512_dpbf16_ps(vsum[i], va, vb[col]);
+    };
+
+    constexpr int BLOCK_K2 = BLOCK_K >> 1;
+    for (int kb = 0; kb < KB; ++kb) {
+      int kb_start = kb * BLOCK_K2;
+      int kb_end = std::min(K >> 1, kb_start + BLOCK_K2);
+      // 1. load scale vector
+      vscale = _mm512_set1_ps(scale[kb]);
+      if constexpr (PREFETCH_SIZE_KB > 0) {
+        _mm_prefetch(scale + kb + PREFETCH_SIZE_KB, _MM_HINT_T0);
+      }
+      // 2. zero vsum for each block
+      Unroll<ROWS * COLS>{}([&](auto i) { vsum[i] = _mm512_setzero_ps(); });
+      // 3. accumulate across each block
+      for (int k = kb_start; k < kb_end; ++k) {
+        Unroll<ROWS * COLS>{}(compute, k);
+      }
+      // 4. apply scale
+      Unroll<ROWS * COLS>{}([&](auto i) { vc[i] = _mm512_fmadd_ps(vsum[i], vscale, vc[i]); });
+    }
+
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+      // for COLS = 2,4 use 512bit store
+      if constexpr (col % 2 == 0) {
+        _mm512_storeu_si512(
+            reinterpret_cast<__m512i*>((C + row * ldc + col * 16)),
+            (__m512i)(_mm512_cvtne2ps_pbh(vc[row * COLS + col + 1], vc[row * COLS + col])));
+      }
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE)                                   \
+  tinygemm_kernel_nn<scalar_t, at::Float8_e4m3fn, has_bias, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda,                                                             \
+      B + nb_start * 2,                                                               \
+      C + mb_start * ldc + nb_start,                                                  \
+      has_bias ? bias + nb_start : nullptr,                                           \
+      scale,                                                                          \
+      K,                                                                              \
+      lda,                                                                            \
+      ldb,                                                                            \
+      ldc,                                                                            \
+      block_size_K);
+
+template <typename scalar_t, typename packed_t, bool has_bias>
+struct brgemm {
+  static inline void apply(
+      const scalar_t* __restrict__ A,
+      const packed_t* __restrict__ B,
+      scalar_t* __restrict__ C,
+      scalar_t* __restrict__ Btmp,
+      float* __restrict__ Ctmp,
+      const float* __restrict__ bias,
+      const float* __restrict__ scale,
+      int M,
+      int N,
+      int K,
+      int lda,
+      int ldb,
+      int ldc,
+      bool do_unpack = true) {
+    TORCH_CHECK(false, "struct brgemm: primary template not implemented!");
+  }
+};
+
+template <bool has_bias>
+struct brgemm<at::BFloat16, at::Float8_e4m3fn, has_bias> {
+  static inline void apply(
+      const at::BFloat16* __restrict__ A,
+      const at::Float8_e4m3fn* __restrict__ B,
+      at::BFloat16* __restrict__ C,
+      at::BFloat16* __restrict__ Btmp,
+      float* __restrict__ Ctmp,
+      const float* __restrict__ bias,
+      const float* __restrict__ scale,
+      int M,
+      int N,
+      int K,
+      int lda,
+      int ldb,
+      int ldc,
+      bool do_unpack = true) {
+    constexpr int BLOCK_N = block_size_n();
+
+    // [K, BLOCK_N] -> [K / 2, BLOCK_N * 2]
+    const int ldb_tmp = BLOCK_N;
+
+    if (do_unpack) {
+      for (int k = 0; k < K; k += BLOCK_K) {
+        int kb_size = std::min(BLOCK_K, K - k);
+
+        int idx = k >> 7;  // k / BLOCK_K where BLOCK_K = 128
+        unpack_B(Btmp + k * ldb_tmp, B + k * ldb, N, kb_size, ldb, ldb_tmp, scale[idx]);
+      }
+    }
+
+    at::native::cpublas::brgemm(M, N, K, lda, ldb_tmp, BLOCK_N, /* add_C */ false, A, Btmp, Ctmp);
+
+    // copy from Ctmp to C
+    for (int m = 0; m < M; ++m) {
+      if constexpr (has_bias) {
+        copy_add_stub(C + m * ldc, Ctmp + m * BLOCK_N, bias, N);
+      } else {
+        copy_stub(C + m * ldc, Ctmp + m * BLOCK_N, N);
+      }
+    }
+  }
+};
+
+template <typename scalar_t, bool has_bias>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const at::Float8_e4m3fn* __restrict__ B,
+    scalar_t* __restrict__ C,
+    scalar_t* __restrict__ Btmp,
+    float* __restrict__ Ctmp,
+    const float* __restrict__ scale,
+    const float* __restrict__ bias,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg,
+    int64_t block_size_K,
+    bool do_unpack = true) {
+  if (brg) {
+    brgemm<scalar_t, at::Float8_e4m3fn, has_bias>::apply(
+        A, B, C, Btmp, Ctmp, bias, scale, M, N, K, lda, ldb, ldc, do_unpack);
+    return;
+  }
+
+  // pattern: 1-4-16
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 64;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+  for (int mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 32);
+          break;
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 32);
+          break;
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 32);
+          break;
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 32);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+void fp8_scaled_mm_kernel_impl(
+    scalar_t* __restrict__ out,
+    const scalar_t* __restrict__ mat1,
+    const at::Float8_e4m3fn* __restrict__ mat2,
+    const float* __restrict__ scales2,
+    const float* __restrict__ bias,
+    scalar_t* __restrict__ buffer,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t mat1_strideM,
+    int64_t out_strideM,
+    int64_t block_size_N,
+    int64_t block_size_K,
+    int64_t buffer_size_per_thread) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  const int64_t scale_size_K = div_up(K, block_size_K);
+  const int64_t blocks_n_per_group = block_size_N / BLOCK_N;
+
+  const bool use_brgemm = can_use_brgemm<at::Float8_e4m3fn>(M);
+
+  // parallel on [MB, NB]
+  AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] {
+    parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+      int tid = get_thread_num();
+      scalar_t* __restrict__ Btmp = buffer + tid * buffer_size_per_thread;
+      float* __restrict__ Ctmp = (float*)((void*)(Btmp + MAX_CACHE_BLOCK_SIZE * BLOCK_N * K));
+
+      loop_2d<at::Float8_e4m3fn>(mb0, mb1, nb0, nb1, BLOCK_N * K, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+        const float* scale_ptr = scales2 + (nb / blocks_n_per_group) * scale_size_K;
+
+        int64_t mb_start = mb * BLOCK_M;
+        int64_t mb_size = std::min(M - mb_start, BLOCK_M);
+        int64_t nb_start = nb * BLOCK_N;
+        int64_t nb_size = std::min(N - nb_start, BLOCK_N);
+
+        // only do unpacking for the first row
+        bool do_unpack = (mb == mb0);
+
+        tinygemm_kernel<scalar_t, has_bias>(
+            /*   A            */ mat1 + mb_start * mat1_strideM,
+            /*   B            */ mat2 + nb_start * K,  // nb * BLOCK_N * K
+            /*   C            */ out + mb_start * out_strideM + nb_start,
+            /*   Btmp         */ Btmp + nb_offset * BLOCK_N * K,
+            /*   Ctmp         */ Ctmp,
+            /*   scale        */ scale_ptr,
+            /*   bias         */ bias + nb_start,
+            /*   M            */ mb_size,
+            /*   N            */ nb_size,
+            /*   K            */ K,
+            /*   lda          */ mat1_strideM,
+            /*   ldb          */ nb_size,
+            /*   ldc          */ out_strideM,
+            /*   brg          */ use_brgemm,
+            /*   block_size_K */ block_size_K,
+            /*   do_unpack    */ do_unpack);
+      });
+
+      if (use_brgemm) {
+        at::native::cpublas::brgemm_release();
+      }
+    });
+  });
+}
+
+}  // anonymous namespace
+
+// tinygemm interface
+template <typename scalar_t>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const at::Float8_e4m3fn* __restrict__ B,
+    scalar_t* __restrict__ C,
+    scalar_t* __restrict__ Btmp,
+    float* __restrict__ Ctmp,
+    const float* __restrict__ scale,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg,
+    int64_t block_size_K,
+    bool do_unpack) {
+  tinygemm_kernel<scalar_t, false>(
+      A, B, C, Btmp, Ctmp, scale, nullptr, M, N, K, lda, ldb, ldc, brg, block_size_K, do_unpack);
+}
+
+#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE)    \
+  template void tinygemm_kernel<TYPE>(         \
+      const TYPE* __restrict__ A,              \
+      const at::Float8_e4m3fn* __restrict__ B, \
+      TYPE* __restrict__ C,                    \
+      TYPE* __restrict__ Btmp,                 \
+      float* __restrict__ Ctmp,                \
+      const float* __restrict__ scale,         \
+      int64_t M,                               \
+      int64_t N,                               \
+      int64_t K,                               \
+      int64_t lda,                             \
+      int64_t ldb,                             \
+      int64_t ldc,                             \
+      bool brg,                                \
+      int64_t block_size_K,                    \
+      bool do_unpack)
+
+INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16);
+INSTANTIATE_TINYGEMM_TEMPLATE(at::Half);
+
+at::Tensor fp8_scaled_mm_cpu(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales2,
+    std::vector<int64_t> block_size,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni) {
+  RECORD_FUNCTION("sgl-kernel::fp8_scaled_mm_cpu", std::vector<c10::IValue>({mat1, mat2, scales2, block_size, bias}));
+
+  auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
+
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1);
+  CHECK_INPUT(mat2);
+  CHECK_INPUT(scales2);
+  TORCH_CHECK(scales2.scalar_type() == at::kFloat, "fp8_scaled_mm_cpu: expect scales2 to be float32.");
+
+  int64_t M = mat1.size(0);
+  int64_t N = mat2.size(0);
+  int64_t K = mat2.size(1);
+
+  CHECK_EQ(mat1.size(1), K);
+  CHECK_DIM(2, mat1);
+  CHECK_DIM(2, mat2);
+
+  TORCH_CHECK(block_size.size() == 2, "fp8_scaled_mm_cpu: expect block_size.size() to be 2.");
+
+  int64_t block_size_N = block_size[0];
+  int64_t block_size_K = block_size[1];
+
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  TORCH_CHECK(block_size_N % BLOCK_N == 0, "fp8_scaled_mm_cpu: expect block_size_N to be multiples of BLOCK_N");
+  TORCH_CHECK(block_size_K == BLOCK_K, "fp8_scaled_mm_cpu: expect block_size_K equals to BLOCK_K");
+  CHECK_EQ(scales2.size(0), div_up(N, block_size_N));
+  CHECK_EQ(scales2.size(1), div_up(K, block_size_K));
+
+  const auto st = mat1.scalar_type();
+  TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, "fp8_scaled_mm_cpu: expect A to be bfloat16 or half.");
+  TORCH_CHECK(st == out_dtype, "fp8_scaled_mm_cpu: expect A has same dtype with out_dtype.");
+  TORCH_CHECK(mat2.scalar_type() == at::kFloat8_e4m3fn, "fp8_scaled_mm_cpu: expect mat2 to be fp8_e4m3.");
+  TORCH_CHECK(scales2.scalar_type() == at::kFloat, "fp8_scaled_mm_cpu: expect scales to be float32.");
+  auto out = at::empty({M, N}, mat1.options().dtype(out_dtype));
+
+  // strides
+  int64_t mat1_strideM = mat1.stride(0);
+  int64_t out_strideM = out.stride(0);
+
+  const bool has_bias = bias.has_value();
+  const float* bias_data = nullptr;
+  if (has_bias) {
+    CHECK_EQ(bias.value().size(0), N);
+    bias_data = bias.value().data_ptr<float>();
+  }
+
+  // Btmp : [T, BLOCK_N * K]
+  // Ctmp : [T, BLOCK_M * BLOCK_N]
+  int num_threads = at::get_num_threads();
+  int64_t size_per_thread = MAX_CACHE_BLOCK_SIZE * BLOCK_N * K + BLOCK_M * BLOCK_N * 2;
+  auto buffer = at::empty({num_threads, size_per_thread}, mat1.options());
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "fp8_scaled_mm_kernel_impl", [&] {
+    fp8_scaled_mm_kernel_impl<scalar_t>(
+        out.data_ptr<scalar_t>(),
+        mat1.data_ptr<scalar_t>(),
+        packed_w.data_ptr<at::Float8_e4m3fn>(),
+        scales2.data_ptr<float>(),
+        bias_data,
+        buffer.data_ptr<scalar_t>(),
+        M,
+        N,
+        K,
+        mat1_strideM,
+        out_strideM,
+        block_size_N,
+        block_size_K,
+        size_per_thread);
+  });
+
+  return out;
+}
diff --git a/sgl-kernel/csrc/cpu/gemm_int8.cpp b/sgl-kernel/csrc/cpu/gemm_int8.cpp
new file mode 100644
index 000000000..cb6146607
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/gemm_int8.cpp
@@ -0,0 +1,547 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t, bool has_bias, int BLOCK_N>
+struct scale_C {
+  static inline void apply(
+      scalar_t* __restrict__ C,
+      const int32_t* __restrict__ Ctmp,
+      const int32_t* __restrict__ Bcomp,
+      const float* __restrict__ bias,
+      float As,
+      const float* __restrict__ Bs) {
+    TORCH_CHECK(false, "scale_C: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <bool has_bias, int BLOCK_N>
+struct scale_C<at::BFloat16, has_bias, BLOCK_N> {
+  static inline void apply(
+      at::BFloat16* __restrict__ C,
+      const int32_t* __restrict__ Ctmp,
+      const int32_t* __restrict__ Bcomp,
+      const float* __restrict__ bias,
+      float As,
+      const float* __restrict__ Bs) {
+    constexpr int COLS = BLOCK_N / 16;
+    static_assert(COLS % 2 == 0);
+
+    __m512 vc[COLS];
+    __m512 vd0 = _mm512_set1_ps(As);
+
+    auto compute = [&](auto col) {
+      __m512 vd1 = _mm512_loadu_ps(Bs + col * 16);
+      __m512i vcomp = _mm512_loadu_si512(Bcomp + col * 16);
+      __m512i vc32 = _mm512_loadu_si512(Ctmp + col * 16);
+      vc[col] = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc32, vcomp));
+      if constexpr (has_bias) {
+        __m512 vbias = _mm512_loadu_ps(bias + col * 16);
+        vc[col] = _mm512_fmadd_ps(_mm512_mul_ps(vc[col], vd0), vd1, vbias);
+      } else {
+        vc[col] = _mm512_mul_ps(_mm512_mul_ps(vc[col], vd0), vd1);
+      }
+    };
+    Unroll<COLS>{}(compute);
+
+    auto storec = [&](auto col) {
+      // for COLS = 2, 4 use 512bit store
+      if constexpr (col % 2 == 0) {
+        _mm512_storeu_si512(
+            reinterpret_cast<__m512i*>((C + col * 16)), (__m512i)(_mm512_cvtne2ps_pbh(vc[col + 1], vc[col + 0])));
+      }
+    };
+    Unroll<COLS>{}(storec);
+  }
+};
+#endif
+
+template <typename scalar_t, bool has_bias, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn {
+  static inline void apply(
+      const uint8_t* __restrict__ A,
+      const int8_t* __restrict__ B,
+      scalar_t* __restrict__ C,
+      const float* __restrict__ As,
+      const float* __restrict__ Bs,
+      const int32_t* __restrict__ Bcomp,
+      const float* __restrict__ bias,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <bool has_bias, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const uint8_t* __restrict__ A,
+      const int8_t* __restrict__ B,
+      at::BFloat16* __restrict__ C,
+      const float* __restrict__ As,
+      const float* __restrict__ Bs,
+      const int32_t* __restrict__ Bcomp,
+      const float* __restrict__ bias,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+    static_assert(COLS % 2 == 0);
+
+    // prefetch distance
+    constexpr int PREFETCH_SIZE_K = 0;
+
+    __m512i va;
+    __m512i vb[COLS];
+    __m512i vc[ROWS * COLS];
+    __m512i vcomp[COLS];
+    __m512 vd0;
+    __m512 vd1[COLS];
+
+    // oops! 4x4 spills but we use 4x2
+    __m512 vbias[COLS];
+
+    // [NOTE]: s8s8 igemm compensation in avx512-vnni
+    //
+    // avx512-vnni has no s8s8, so we need to change s8s8 to u8s8 with compensate:
+    //
+    //   a * b = (a + 128) * b - 128 * b
+    //   s   s       u       s    u    s
+    //
+    // 1) 128 * b is pre-computed when packing B to vnni formats
+    // 2) a + 128 is fused when dynamically quantize A
+    //
+    auto loadc = [&](auto i) { vc[i] = _mm512_set1_epi32(0); };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    const int64_t K4 = K >> 2;
+    const int64_t lda4 = lda >> 2;
+    const int64_t ldb4 = ldb;  // ldb * 4 >> 2;
+    const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A);
+    const int32_t* b_ptr = reinterpret_cast<const int32_t*>(B);
+
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = _mm512_set1_epi32(a_ptr[row * lda4 + k]);
+      }
+      if constexpr (row == 0) {
+        vb[col] = _mm512_loadu_si512(b_ptr + k * ldb4 + col * 16);
+        if constexpr (PREFETCH_SIZE_K > 0) {
+          _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb4 + col * 16, _MM_HINT_T0);
+        }
+      }
+      vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]);
+    };
+    for (int64_t k = 0; k < K4; ++k) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      // load a scale
+      if constexpr (col == 0) {
+        vd0 = _mm512_set1_ps(As[row]);
+      }
+      // load b scale and vcomp per 2 vectors
+      // also load bias if any
+      if constexpr (row == 0) {
+        if constexpr (col % 2 == 0) {
+          vd1[col + 0] = _mm512_loadu_ps(Bs + col * 16);
+          vd1[col + 1] = _mm512_loadu_ps(Bs + col * 16 + 16);
+          vcomp[col + 0] = _mm512_loadu_si512(Bcomp + col * 16);
+          vcomp[col + 1] = _mm512_loadu_si512(Bcomp + col * 16 + 16);
+          if constexpr (has_bias) {
+            vbias[col + 0] = _mm512_loadu_ps(bias + col * 16);
+            vbias[col + 1] = _mm512_loadu_ps(bias + col * 16 + 16);
+          }
+        }
+      }
+
+      // for COLS = 2, 4 use 512bit store
+      if constexpr (col % 2 == 0) {
+        __m512 vc0 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[row * COLS + col + 0], vcomp[col + 0]));
+        __m512 vc1 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[row * COLS + col + 1], vcomp[col + 1]));
+        if constexpr (has_bias) {
+          vc0 = _mm512_fmadd_ps(_mm512_mul_ps(vc0, vd0), vd1[col + 0], vbias[col + 0]);
+          vc1 = _mm512_fmadd_ps(_mm512_mul_ps(vc1, vd0), vd1[col + 1], vbias[col + 1]);
+        } else {
+          vc0 = _mm512_mul_ps(_mm512_mul_ps(vc0, vd0), vd1[col + 0]);
+          vc1 = _mm512_mul_ps(_mm512_mul_ps(vc1, vd0), vd1[col + 1]);
+        }
+
+        _mm512_storeu_si512(
+            reinterpret_cast<__m512i*>((C + row * ldc + col * 16)), (__m512i)(_mm512_cvtne2ps_pbh(vc1, vc0)));
+      }
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE)                \
+  tinygemm_kernel_nn<scalar_t, has_bias, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda,                                          \
+      B + nb_start * 4,                                            \
+      C + mb_start * ldc + nb_start,                               \
+      As + mb_start,                                               \
+      Bs + nb_start,                                               \
+      Bcomp + nb_start,                                            \
+      has_bias ? bias + nb_start : nullptr,                        \
+      K,                                                           \
+      lda,                                                         \
+      ldb,                                                         \
+      ldc);
+
+template <typename scalar_t, bool has_bias>
+void tinygemm_kernel(
+    const uint8_t* __restrict__ A,
+    const int8_t* __restrict__ B,
+    scalar_t* __restrict__ C,
+    int32_t* __restrict__ Ctmp,
+    const float* __restrict__ As,
+    const float* __restrict__ Bs,
+    const float* __restrict__ bias,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg) {
+  // B compensation
+  const int32_t* Bcomp = reinterpret_cast<const int32_t*>(B + block_size_n() * K);
+
+  if (brg) {
+    constexpr int BLOCK_N = block_size_n();
+    at::native::cpublas::brgemm(M, N, K, lda, ldb, BLOCK_N, /* add_C */ false, A, B, Ctmp);
+
+    // apply compensation and scale
+    for (int64_t m = 0; m < M; ++m) {
+      scale_C<scalar_t, has_bias, BLOCK_N>::apply(C + m * ldc, Ctmp + m * BLOCK_N, Bcomp, bias, As[m], Bs);
+    }
+    return;
+  }
+
+  // pattern: 1-4-16
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 64;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+  for (int64_t mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        // mb_size = 1
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 32);
+          break;
+        case 0x14:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 64);
+          break;
+        // mb_size = 2
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 32);
+          break;
+        case 0x24:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 64);
+          break;
+        // mb_size = 3
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 32);
+          break;
+        case 0x34:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 64);
+          break;
+        // mb_size = 4
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 32);
+          break;
+        case 0x44:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 64);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+void int8_scaled_mm_kernel_impl(
+    scalar_t* __restrict__ out,
+    const uint8_t* __restrict__ mat1,
+    const int8_t* __restrict__ mat2,
+    const float* __restrict__ scales1,
+    const float* __restrict__ scales2,
+    const float* __restrict__ bias,
+    int64_t M,
+    int64_t N,
+    int64_t K) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  const bool use_brgemm = can_use_brgemm<int8_t>(M);
+
+  // K + 4 after compensation
+  const int64_t packed_row_size = get_row_size<int8_t>(K);
+
+  AT_DISPATCH_BOOL(bias != nullptr, has_bias, [&] {
+    parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+      // for brgemm, use int32_t for accumulate
+      alignas(64) int32_t Ctmp[BLOCK_M * BLOCK_N];
+
+      loop_2d<int8_t>(mb0, mb1, nb0, nb1, BLOCK_N * K, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+        int mb_start = mb * BLOCK_M;
+        int mb_size = std::min(M - mb_start, BLOCK_M);
+        int nb_start = nb * BLOCK_N;
+        int nb_size = std::min(N - nb_start, BLOCK_N);
+
+        tinygemm_kernel<scalar_t, has_bias>(
+            /*   A */ mat1 + mb_start * K,
+            /*   B */ mat2 + nb_start * packed_row_size /* nb * BLOCK_N * (K + 4) */,
+            /*   C */ out + mb_start * N + nb_start,
+            /* Ctmp*/ Ctmp,
+            /*  As */ scales1 + mb_start,
+            /*  Bs */ scales2 + nb_start,
+            /* bias*/ bias + nb_start,
+            /*   M */ mb_size,
+            /*   N */ nb_size,
+            /*   K */ K,
+            /* lda */ K,
+            /* ldb */ nb_size,
+            /* ldc */ N,
+            /* brg */ use_brgemm);
+      });
+
+      if (use_brgemm) {
+        at::native::cpublas::brgemm_release();
+      }
+    });
+  });
+}
+
+}  // anonymous namespace
+
+// tinygemm interface
+template <typename scalar_t>
+void tinygemm_kernel(
+    const uint8_t* __restrict__ A,
+    const int8_t* __restrict__ B,
+    scalar_t* __restrict__ C,
+    int32_t* __restrict__ Ctmp,
+    const float* __restrict__ As,
+    const float* __restrict__ Bs,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc,
+    bool brg) {
+  tinygemm_kernel<scalar_t, false>(A, B, C, Ctmp, As, Bs, nullptr, M, N, K, lda, ldb, ldc, brg);
+}
+
+#define INSTANTIATE_TINYGEMM_TEMPLATE(TYPE) \
+  template void tinygemm_kernel<TYPE>(      \
+      const uint8_t* __restrict__ A,        \
+      const int8_t* __restrict__ B,         \
+      TYPE* __restrict__ C,                 \
+      int32_t* __restrict__ Ctmp,           \
+      const float* __restrict__ As,         \
+      const float* __restrict__ Bs,         \
+      int64_t M,                            \
+      int64_t N,                            \
+      int64_t K,                            \
+      int64_t lda,                          \
+      int64_t ldb,                          \
+      int64_t ldc,                          \
+      bool brg)
+
+INSTANTIATE_TINYGEMM_TEMPLATE(at::BFloat16);
+INSTANTIATE_TINYGEMM_TEMPLATE(at::Half);
+
+std::tuple<at::Tensor, at::Tensor> per_token_quant_int8_cpu(at::Tensor& A) {
+  RECORD_FUNCTION("sgl-kernel::per_token_quant_int8_cpu", std::vector<c10::IValue>({A}));
+
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(A);
+  CHECK_DIM(2, A);
+
+  int64_t M = A.size(0);
+  int64_t K = A.size(1);
+  int64_t lda = A.stride(0);
+
+  const auto st = A.scalar_type();
+  TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, "per_token_quant_int8: expect A to be bfloat16 or half.");
+
+  auto Aq = at::empty({M, K}, A.options().dtype(at::kByte));
+  auto As = at::empty({M}, A.options().dtype(at::kFloat));
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "per_token_quant_int8", [&] {
+    uint8_t* __restrict__ Aq_data = Aq.data_ptr<uint8_t>();
+    float* __restrict__ As_data = As.data_ptr<float>();
+    const scalar_t* __restrict__ A_data = A.data_ptr<scalar_t>();
+
+    at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+      for (int64_t m = begin; m < end; ++m) {
+        quantize_row_int8<scalar_t>(Aq_data + m * K, As_data[m], A_data + m * lda, K);
+      }
+    });
+  });
+  return std::make_tuple(Aq, As);
+}
+
+// weight     :  static, per-channel, symmetric
+// activation : dynamic,   per-token, symmetric
+//
+// mat1    : [M, K]
+// mat2    : [N, K]
+// scales1 : [M]
+// scales2 : [N]
+// bias    : [N]
+// out     : [M, N]
+//
+at::Tensor int8_scaled_mm_cpu(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales1,
+    at::Tensor& scales2,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni) {
+  RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector<c10::IValue>({mat1, mat2, scales1, scales2, bias}));
+
+  auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
+
+  CHECK_INPUT(mat1);
+  CHECK_INPUT(mat2);
+  CHECK_INPUT(scales1);
+  CHECK_INPUT(scales2);
+  CHECK_DIM(2, mat1);
+  CHECK_DIM(2, mat2);
+
+  int64_t M = mat1.size(0);
+  int64_t N = mat2.size(0);
+  int64_t K = mat1.size(1);
+
+  // see [NOTE]: s8s8 igemm compensation in avx512-vnni
+  CHECK_EQ(mat2.size(1), (int64_t)(is_vnni ? K + sizeof(int32_t) : K));
+  CHECK_EQ(scales1.numel(), M);
+  CHECK_EQ(scales2.numel(), N);
+
+  TORCH_CHECK(mat1.scalar_type() == at::kByte, "int8_scaled_mm: expect mat1 to be uint8.");
+  TORCH_CHECK(mat2.scalar_type() == at::kChar, "int8_scaled_mm: expect mat2 to be int8.");
+  TORCH_CHECK(
+      scales1.scalar_type() == at::kFloat && scales2.scalar_type() == at::kFloat,
+      "int8_scaled_mm: expect scales to be float32.");
+
+  auto out = at::empty({M, N}, mat1.options().dtype(out_dtype));
+
+  const bool has_bias = bias.has_value();
+  const float* bias_data = nullptr;
+  if (has_bias) {
+    CHECK_EQ(bias.value().size(0), N);
+    bias_data = bias.value().data_ptr<float>();
+  }
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "int8_scaled_mm_kernel_impl", [&] {
+    int8_scaled_mm_kernel_impl<scalar_t>(
+        out.data_ptr<scalar_t>(),
+        mat1.data_ptr<uint8_t>(),
+        packed_w.data_ptr<int8_t>(),
+        scales1.data_ptr<float>(),
+        scales2.data_ptr<float>(),
+        bias_data,
+        M,
+        N,
+        K);
+  });
+  return out;
+}
+
+// fused `per_token_quant_int8_cpu` and `int8_scaled_mm_cpu`
+at::Tensor int8_scaled_mm_with_quant(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales2,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni) {
+  RECORD_FUNCTION("sgl-kernel::int8_scaled_mm_cpu", std::vector<c10::IValue>({mat1, mat2, scales2, bias}));
+
+  auto packed_w = is_vnni ? mat2 : convert_weight_packed(mat2);
+
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(mat1);
+  CHECK_INPUT(mat2);
+  CHECK_INPUT(scales2);
+  CHECK_DIM(2, mat1);
+  CHECK_DIM(2, mat2);
+
+  int64_t M = mat1.size(0);
+  int64_t N = mat2.size(0);
+  int64_t K = mat1.size(1);
+  int64_t lda = mat1.stride(0);
+
+  // see [NOTE]: s8s8 igemm compensation in avx512-vnni
+  CHECK_EQ(mat2.size(1), (int64_t)(is_vnni ? K + sizeof(int32_t) : K));
+  CHECK_EQ(scales2.numel(), N);
+
+  const auto st = mat1.scalar_type();
+  TORCH_CHECK(st == at::kBFloat16 || st == at::kHalf, "int8_scaled_mm_with_quant: expect A to be bfloat16 or half.");
+  TORCH_CHECK(st == out_dtype, "int8_scaled_mm_with_quant: expect A has same dtype with out_dtype.");
+  TORCH_CHECK(mat2.scalar_type() == at::kChar, "int8_scaled_mm_with_quant: expect mat2 to be int8.");
+  TORCH_CHECK(scales2.scalar_type() == at::kFloat, "int8_scaled_mm_with_quant: expect scales to be float32.");
+
+  const int64_t buffer_size = M * K + M * sizeof(float);
+  auto buffer = at::empty({buffer_size}, mat1.options().dtype(at::kByte));
+  auto out = at::empty({M, N}, mat1.options().dtype(out_dtype));
+
+  const bool has_bias = bias.has_value();
+  const float* bias_data = nullptr;
+  if (has_bias) {
+    CHECK_EQ(bias.value().size(0), N);
+    bias_data = bias.value().data_ptr<float>();
+  }
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(out_dtype, "int8_scaled_mm_with_quant_kernel_impl", [&] {
+    uint8_t* __restrict__ Aq_data = buffer.data_ptr<uint8_t>();
+    float* __restrict__ As_data = (float*)((void*)(Aq_data + M * K));
+    const scalar_t* __restrict__ A_data = mat1.data_ptr<scalar_t>();
+
+    at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+      for (int64_t m = begin; m < end; ++m) {
+        quantize_row_int8<scalar_t>(Aq_data + m * K, As_data[m], A_data + m * lda, K);
+      }
+    });
+
+    int8_scaled_mm_kernel_impl<scalar_t>(
+        out.data_ptr<scalar_t>(),
+        Aq_data,
+        packed_w.data_ptr<int8_t>(),
+        As_data,
+        scales2.data_ptr<float>(),
+        bias_data,
+        M,
+        N,
+        K);
+  });
+  return out;
+}
diff --git a/sgl-kernel/csrc/cpu/interface.cpp b/sgl-kernel/csrc/cpu/interface.cpp
new file mode 100644
index 000000000..9057a50f4
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/interface.cpp
@@ -0,0 +1,74 @@
+#include <ATen/record_function.h>
+#include <torch/all.h>
+
+#include "shm.h"
+
+// Communication settings
+static int world_rank = -1;
+static int world_size = -1;
+
+static bool is_initialized = false;
+
+static bool all_ranks_local_p = false;
+
+void initialize(int64_t size, int64_t rank) {
+  if (is_initialized) {
+    return;
+  }
+
+  // Check whether all ranks is on the same physical machine.
+  // If true, we will use an SHM based low latency allreduce
+
+  auto ls_string = std::getenv("LOCAL_SIZE");
+  int ls = 0;
+  if (ls_string != NULL) {
+    ls = std::stoi(std::getenv("LOCAL_SIZE"));
+  }
+
+  if (size >= 1 && size == ls) {
+    all_ranks_local_p = true;
+  }
+
+  world_size = size;
+  world_rank = rank;
+  is_initialized = true;
+
+  const char* addr_string = std::getenv("MASTER_ADDR");
+  if (addr_string == NULL) {
+    addr_string = "";
+  }
+  const char* port_string = std::getenv("MASTER_PORT");
+  if (port_string == NULL) {
+    port_string = "";
+  }
+
+  if (all_ranks_local_p) {
+    shm_initialize(size, rank, addr_string, port_string);
+  }
+}
+
+void shm_allreduce(torch::Tensor& data, int64_t op) {
+  RECORD_FUNCTION("sgl-kernel::shm_allreduce", std::vector<c10::IValue>({data}));
+
+  TORCH_CHECK(op == c10d::ReduceOp::SUM, "Only torch.distributed.ReduceOp.SUM is supported");
+
+  auto numel = data.numel();
+  int data_size = numel * data.element_size();
+  all_reduce_outer_loop(data, numel, data_size);
+
+  return;
+}
+
+torch::Tensor shm_allgather(torch::Tensor& data, int64_t dim) {
+  RECORD_FUNCTION("sgl-kernel::shm_allgather", std::vector<c10::IValue>({data}));
+
+  auto numel = data.numel();
+  int data_size = numel * data.element_size();
+  if (dim < 0) {
+    dim += data.dim();
+  }
+  std::vector<int64_t> result_shape = data.sizes().vec();
+  result_shape[dim] *= world_size;
+  torch::Tensor result_tensor = torch::empty(result_shape, data.options());
+  return all_gather(result_tensor, data, dim, numel, data_size);
+}
diff --git a/sgl-kernel/csrc/cpu/moe.cpp b/sgl-kernel/csrc/cpu/moe.cpp
new file mode 100644
index 000000000..c3d66cec7
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/moe.cpp
@@ -0,0 +1,1322 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+// [NOTE]: Fused MoE kernel with AMX
+//
+//   This file contains implementations for
+//     * `moe_align_block_size`
+//     * `fused_moe`
+//
+//   The functionality is identical to triton kernel, excepts:
+//     * fuse silu_and_mul with gemm1, therefore this kernel
+//       allocates 2 intermediate_caches instead of 3
+//     * add `offsets` in `moe_align_block_size` which keeps track
+//       of starting offset for each M block. this is for keeping
+//       output of silu_and_mul in sorted order, thus load_A for
+//       the 2nd gemm would be contiguous, therefore we can directly
+//       load A from intermediate_cache1.
+//
+//  TODO:
+//     1. tune BLOCK_M and BLOCK_N (BLOCK_N * K fit L2)
+//     2. add prefetch for load A which is indexed access
+//     3. abstract at::native::cpublas::brgemm with WoQ gemm (M = 1 & M != 1)
+//
+
+template <typename scalar_t>
+inline void fill_stub(scalar_t* __restrict__ out, scalar_t val, int64_t size) {
+  using Vec = at::vec::Vectorized<scalar_t>;
+  const Vec data_vec(val);
+  at::vec::map<scalar_t>([data_vec](Vec out) { return out = data_vec; }, out, out, size);
+}
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) {
+  using Vec = at::vec::Vectorized<scalar_t>;
+// no remainder
+#pragma GCC unroll 4
+  for (int64_t d = 0; d < size; d += Vec::size()) {
+    Vec data = Vec::loadu(input + d);
+    data.store(out + d);
+  }
+}
+
+template <typename scalar_t>
+inline void copy_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, float weight, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec weight_vec = fVec(weight);
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec data0 = fVec::loadu(input + d) * weight_vec;
+    fVec data1 = fVec::loadu(input + d + fVec::size()) * weight_vec;
+    bVec out_vec = convert_from_float_ext<scalar_t>(data0, data1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] * weight);
+  }
+}
+
+// acc from [topk, K] to [K]
+template <typename scalar_t>
+inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  if (topk == 1) {
+    // do copy for topk = 1
+    copy_stub(out, input, K);
+  } else {
+    // do sum for topk != 1
+    int64_t d;
+#pragma GCC unroll 4
+    for (d = 0; d <= K - kVecSize; d += kVecSize) {
+      fVec sum_fvec0 = fVec(0.f);
+      fVec sum_fvec1 = fVec(0.f);
+      for (int t = 0; t < topk; ++t) {
+        bVec x_bvec = bVec::loadu(input + t * K + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        sum_fvec0 += x_fvec0;
+        sum_fvec1 += x_fvec1;
+      }
+      bVec out_bvec = convert_from_float_ext<scalar_t>(sum_fvec0, sum_fvec1);
+      out_bvec.store(out + d);
+    }
+    for (; d < K; ++d) {
+      float sum_val = 0.f;
+      for (int t = 0; t < topk; ++t) {
+        sum_val += static_cast<float>(input[t * K + d]);
+      }
+      out[d] = static_cast<scalar_t>(sum_val);
+    }
+  }
+}
+
+// out = input + input2 * scale
+template <typename scalar_t>
+inline void add_mul_stub(
+    scalar_t* __restrict__ out,
+    const float* __restrict__ input,
+    const scalar_t* __restrict__ input2,
+    float scale,
+    int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec s_vec = fVec(scale);
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec x0 = fVec::loadu(input + d);
+    fVec x1 = fVec::loadu(input + d + fVec::size());
+
+    bVec y_bvec = bVec::loadu(input2 + d);
+    fVec y0, y1;
+    std::tie(y0, y1) = at::vec::convert_to_float(y_bvec);
+
+    x0 = x0 + y0 * s_vec;
+    x1 = x1 + y1 * s_vec;
+    bVec out_vec = convert_from_float_ext<scalar_t>(x0, x1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] + float(input2[d]) * scale);
+  }
+}
+
+template <int BLOCK_M>
+int moe_align_block_size(
+    int32_t* __restrict__ sorted_ids,
+    int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ topk_ids,
+    int32_t* __restrict__ total_cnts,
+    int32_t* __restrict__ cumsums,
+    int32_t* __restrict__ offsets,
+    int num_experts,
+    int numel,
+    int num_threads) {
+#define T_INDEX(tt) total_cnts + (tt) * num_experts
+
+  // accumulate count of expert ids locally
+  at::parallel_for(0, numel, 0, [&](int begin, int end) {
+    int tid = at::get_thread_num();
+    int32_t* __restrict__ local_cnts = T_INDEX(tid + 1);
+
+    for (int i = begin; i < end; ++i) {
+      local_cnts[topk_ids[i]]++;
+    }
+  });
+
+  using iVec = at::vec::Vectorized<int32_t>;
+  for (int t = 0; t < num_threads; ++t) {
+    at::vec::map2<int32_t>(
+        [](iVec x, iVec y) { return x + y; }, T_INDEX(t + 1), T_INDEX(t + 1), T_INDEX(t), num_experts);
+  }
+
+  // the last row holds sums of each experts
+  int32_t* total_cnts_t_1 = T_INDEX(num_threads);
+
+  cumsums[0] = 0;
+  for (int e = 0; e < num_experts; ++e) {
+    // accumulate `num_tokens_post_pad`, also as the expert offset
+    cumsums[e + 1] = cumsums[e] + div_up(total_cnts_t_1[e], BLOCK_M) * BLOCK_M;
+
+    for (int k = cumsums[e]; k < cumsums[e + 1]; k += BLOCK_M) {
+      expert_ids[k / BLOCK_M] = e;
+    }
+  }
+  int num_tokens_post_pad = cumsums[num_experts];
+
+  at::parallel_for(0, numel, 0, [&](int begin, int end) {
+    int tid = at::get_thread_num();
+    // thread tid offsets in `total_cnts`
+    int32_t* __restrict__ offsets = T_INDEX(tid);
+
+    for (int i = begin; i < end; ++i) {
+      int32_t expert_id = topk_ids[i];
+      int32_t b_offset = cumsums[expert_id];
+      int32_t t_offset = offsets[expert_id];
+      sorted_ids[b_offset + t_offset] = i;
+      offsets[expert_id]++;
+    }
+  });
+
+  // debug: the offset for thread t_1 should be identical to t_2
+  int32_t* total_cnts_t_2 = T_INDEX(num_threads - 1);
+  for (int e = 0; e < num_experts; ++e) {
+    TORCH_CHECK(total_cnts_t_1[e] == total_cnts_t_2[e]);
+  }
+
+  // padding value for sorted_ids: numel
+  auto sorted_id_size = [=](const int32_t* sorted_ids_ptr) {
+    for (int d = 0; d < BLOCK_M; ++d) {
+      if (sorted_ids_ptr[d] == numel) {
+        return d;
+      }
+    }
+    return BLOCK_M;
+  };
+
+  // offsets holds starting offset for each valida M blocks
+  //   shape : [num_token_blocks + 1]
+  offsets[0] = 0;
+  const int num_token_blocks = num_tokens_post_pad / BLOCK_M;
+  at::parallel_for(0, num_token_blocks, GRAIN_SIZE / BLOCK_M, [&](int begin, int end) {
+    for (int mb = begin; mb < end; ++mb) {
+      offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
+    }
+  });
+  // TODO: do we need to vecterize this ?
+  for (int mb = 0; mb < num_token_blocks; ++mb) {
+    offsets[mb + 1] += offsets[mb];
+  }
+  // debug: the last value of offsets should be `numel`
+  TORCH_CHECK(offsets[num_token_blocks] == numel);
+
+  return num_tokens_post_pad;
+}
+
+//   silu :    shape          leading dimension
+//  input0  [m_size, BLOCK_N]    BLOCK_N
+//  input1  [m_size, BLOCK_N]    BLOCK_N
+//  output  [M * topk, N]          N
+template <typename scalar_t, int BLOCK_N>
+inline void silu_and_mul(
+    scalar_t* __restrict__ output,
+    const float* __restrict__ input0,  // x: x0, x1
+    const float* __restrict__ input1,  // y: y0, y1
+    int64_t m_size,
+    int64_t N) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  const fVec one = fVec(1.f);
+
+  // no remainder
+  for (int64_t m = 0; m < m_size; ++m) {
+    scalar_t* __restrict__ out = output + m * N;
+    const float* __restrict__ x = input0 + m * BLOCK_N;
+    const float* __restrict__ y = input1 + m * BLOCK_N;
+
+    for (int64_t d = 0; d < BLOCK_N; d += bVec::size()) {
+      fVec x0 = fVec::loadu(x + d);
+      fVec x1 = fVec::loadu(x + d + fVec::size());
+      fVec y0 = fVec::loadu(y + d);
+      fVec y1 = fVec::loadu(y + d + fVec::size());
+      // silu
+      x0 = x0 / (one + x0.neg().exp_u20());
+      x1 = x1 / (one + x1.neg().exp_u20());
+      // mul
+      x0 = x0 * y0;
+      x1 = x1 * y1;
+      // convert
+      bVec out_vec = convert_from_float_ext<scalar_t>(x0, x1);
+      out_vec.store(out + d);
+    }
+  }
+}
+
+template <typename scalar_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn2 {
+  static inline void apply(
+      const scalar_t* __restrict__ A,
+      const scalar_t* __restrict__ B0,
+      const scalar_t* __restrict__ B1,
+      scalar_t* __restrict__ C,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn2<at::BFloat16, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const at::BFloat16* __restrict__ A,
+      const at::BFloat16* __restrict__ B0,
+      const at::BFloat16* __restrict__ B1,
+      at::BFloat16* __restrict__ C,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+
+    static_assert(COLS % 2 == 0);
+
+    // prefetch distance
+    constexpr int PREFETCH_SIZE_K = 0;
+
+    __m512bh va;
+    __m512bh vb0[COLS];
+    __m512bh vb1[COLS];
+    __m512 vc0[ROWS * COLS];
+    __m512 vc1[ROWS * COLS];
+
+    auto loadc = [&](auto i) {
+      vc0[i] = _mm512_set1_ps(0.f);
+      vc1[i] = _mm512_set1_ps(0.f);
+    };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    const int64_t K2 = K >> 1;
+    const int64_t lda2 = lda >> 1;
+    const int64_t ldb2 = ldb;  // ldb * 2 >> 1;
+    const float* a_ptr = reinterpret_cast<const float*>(A);
+    const float* b0_ptr = reinterpret_cast<const float*>(B0);
+    const float* b1_ptr = reinterpret_cast<const float*>(B1);
+
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k]));
+      }
+      if constexpr (row == 0) {
+        vb0[col] = (__m512bh)(_mm512_loadu_si512(b0_ptr + k * ldb2 + col * 16));
+        vb1[col] = (__m512bh)(_mm512_loadu_si512(b1_ptr + k * ldb2 + col * 16));
+        if constexpr (PREFETCH_SIZE_K > 0) {
+          _mm_prefetch(b0_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0);
+          _mm_prefetch(b1_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0);
+        }
+      }
+      vc0[i] = _mm512_dpbf16_ps(vc0[i], va, vb0[col]);
+      vc1[i] = _mm512_dpbf16_ps(vc1[i], va, vb1[col]);
+    };
+    for (int64_t k = 0; k < K2; ++k) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+
+    using Vec = at::vec::Vectorized<float>;
+    const Vec one = Vec(1.f);
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+      // for COLS = 2, 4 use 512bit store
+      if constexpr (col % 2 == 0) {
+        Vec x0 = vc0[row * COLS + col + 0];
+        Vec x1 = vc0[row * COLS + col + 1];
+        Vec y0 = vc1[row * COLS + col + 0];
+        Vec y1 = vc1[row * COLS + col + 1];
+        // silu
+        x0 = x0 / (one + x0.neg().exp_u20());
+        x1 = x1 / (one + x1.neg().exp_u20());
+        // mul
+        x0 = x0 * y0;
+        x1 = x1 * y1;
+
+        _mm512_storeu_si512(
+            reinterpret_cast<__m512i*>((C + row * ldc + col * 16)),
+            (__m512i)(_mm512_cvtne2ps_pbh(__m512(x1), __m512(x0))));
+      }
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_NN(MB_SIZE, NB_SIZE)       \
+  tinygemm_kernel_nn2<scalar_t, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda, B0 + nb_start * 2, B1 + nb_start * 2, C + mb_start * ldc + nb_start, K, lda, ldb, ldc);
+
+template <typename scalar_t>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const scalar_t* __restrict__ B0,
+    const scalar_t* __restrict__ B1,
+    scalar_t* __restrict__ C,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc) {
+  // pattern: 1-(2+2)-(8+8)
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 32;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+  for (int mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        // mb_size = 1
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_NN(1, 32);
+          break;
+        // mb_size = 2
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_NN(2, 32);
+          break;
+        // mb_size = 3
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_NN(3, 32);
+          break;
+        // mb_size = 4
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_NN(4, 32);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+template <typename scalar_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn {
+  static inline void apply(
+      const scalar_t* __restrict__ A,
+      const scalar_t* __restrict__ B,
+      float* __restrict__ C,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_nn<at::BFloat16, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const at::BFloat16* __restrict__ A,
+      const at::BFloat16* __restrict__ B,
+      float* __restrict__ C,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+
+    static_assert(COLS % 2 == 0);
+
+    // prefetch distance
+    constexpr int PREFETCH_SIZE_K = 0;
+
+    __m512bh va;
+    __m512bh vb[COLS];
+    __m512 vc[ROWS * COLS];
+
+    auto loadc = [&](auto i) { vc[i] = _mm512_set1_ps(0.f); };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    const int64_t K2 = K >> 1;
+    const int64_t lda2 = lda >> 1;
+    const int64_t ldb2 = ldb;  // ldb * 2 >> 1;
+    const float* a_ptr = reinterpret_cast<const float*>(A);
+    const float* b_ptr = reinterpret_cast<const float*>(B);
+
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = (__m512bh)(_mm512_set1_ps(a_ptr[row * lda2 + k]));
+      }
+      if constexpr (row == 0) {
+        vb[col] = (__m512bh)(_mm512_loadu_si512(b_ptr + k * ldb2 + col * 16));
+        if constexpr (PREFETCH_SIZE_K > 0) {
+          _mm_prefetch(b_ptr + (k + PREFETCH_SIZE_K) * ldb2 + col * 16, _MM_HINT_T0);
+        }
+      }
+      vc[i] = _mm512_dpbf16_ps(vc[i], va, vb[col]);
+    };
+    for (int64_t k = 0; k < K2; ++k) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+      _mm512_storeu_ps(reinterpret_cast<__m512*>(C + row * ldc + col * 16), vc[i]);
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_NN2(MB_SIZE, NB_SIZE)     \
+  tinygemm_kernel_nn<scalar_t, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda, B + nb_start * 2, C + mb_start * ldc + nb_start, K, lda, ldb, ldc);
+
+template <typename scalar_t>
+void tinygemm_kernel(
+    const scalar_t* __restrict__ A,
+    const scalar_t* __restrict__ B,
+    float* __restrict__ C,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc) {
+  // pattern: 1-2-8
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 32;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+  for (int mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        // mb_size = 1
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_NN2(1, 32);
+          break;
+        // mb_size = 2
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_NN2(2, 32);
+          break;
+        // mb_size = 3
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_NN2(3, 32);
+          break;
+        // mb_size = 4
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_NN2(4, 32);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+void fused_experts_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ ic2,
+    scalar_t* __restrict__ A_tmp,
+    float* __restrict__ C_tmp,
+    const scalar_t* __restrict__ input,
+    const scalar_t* __restrict__ packed_w1,
+    const scalar_t* __restrict__ packed_w2,
+    const float* __restrict__ topk_weights,
+    const int32_t* __restrict__ sorted_ids,
+    const int32_t* __restrict__ expert_ids,
+    const int32_t* __restrict__ offsets,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t E,
+    int64_t topk,
+    int64_t num_tokens_post_pad) {
+  // handle 2 tiles per block
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  // stage 1: intermediate_cache1 = silu(hidden_states @ w1)
+  const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  // strides for w1: [E, 2N, K]
+  TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N);
+
+  const int64_t stride_e = 2 * N * K;
+  const int64_t stride_n = K;
+
+  int64_t avg_M = std::max(int64_t(1), M * topk / E);
+  const bool use_brgemm = can_use_brgemm<scalar_t>(avg_M);
+
+  // here we only parallel on half of 2N to fuse silu_and_mul with gemm
+  parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    scalar_t* __restrict__ A = A_tmp + tid * BLOCK_M * K;
+    float* __restrict__ C0 = C_tmp + tid * 2 * BLOCK_M * BLOCK_N;
+    float* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N;
+
+    loop_2d<scalar_t>(mb0, mb1, nb0, nb1, BLOCK_N * K * 2, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      // nb_upper from top half and nb_lower from bottom half
+      int64_t nb_upper = nb, nb_lower = nb + NB;
+      int64_t n_size = std::min(N - nb * BLOCK_N, BLOCK_N);
+
+      // B shape [K, n_size] in vnni format
+      int32_t expert_id = expert_ids[mb];
+      const scalar_t* __restrict__ B0 = packed_w1 + expert_id * stride_e + nb_upper * BLOCK_N * stride_n;
+      const scalar_t* __restrict__ B1 = packed_w1 + expert_id * stride_e + nb_lower * BLOCK_N * stride_n;
+
+      // 1.a load A
+      const int32_t* A_ids = sorted_ids + mb * BLOCK_M;
+      int64_t m_size = offsets[mb + 1] - offsets[mb];
+
+      for (int64_t m = 0; m < m_size; ++m) {
+        int32_t index = A_ids[m] / topk;
+        copy_stub(A + m * K, input + index * K, K);
+      }
+
+      if (use_brgemm) {
+        // 1.b gemm: C0 = A @ B0
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B0,
+            /* C     */ C0);
+
+        // 1.c gemm: C1 = A @ B1
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B1,
+            /* C     */ C1);
+
+        // 1.d silu and mul
+        const int64_t offset = offsets[mb];
+        silu_and_mul<scalar_t, BLOCK_N>(ic1 + offset * N + nb * BLOCK_N, C0, C1, m_size, N);
+      } else {
+        // fused 1.bcd: silu_and_mul(A @ B0, A @ B1)
+        const int64_t offset = offsets[mb];
+        tinygemm_kernel(
+            /* A     */ A,
+            /* B0    */ B0,
+            /* B1    */ B1,
+            /* C     */ ic1 + offset * N + nb * BLOCK_N,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ N);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 2: intermediate_cache2 = intermediate_cache1 @ w2
+  //   w2 : [E, K, N] as [E, OC, IC]
+  const int64_t OC = K;  // rename K as OC
+  const int64_t IC = N;  // rename N as IC
+  const int64_t MB2 = MB;
+  const int64_t NB2 = div_up(OC, BLOCK_N);
+  const int64_t stride_e2 = OC * IC;
+  const int64_t stride_oc = IC;
+
+  // parallel on [MB2, NB2]
+  parallel_2d(MB2, NB2, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    // we won't be using C1 for gemm2
+    float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N;
+
+    loop_2d<scalar_t>(mb0, mb1, nb0, nb1, BLOCK_N * IC, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t m_size = offsets[mb + 1] - offsets[mb];
+      int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N);
+
+      // A ptr from ic1 of [M * topk, N] in sorted order
+      // so as to avoid copy A to tmp buffer again
+      const scalar_t* __restrict__ A = ic1 + offsets[mb] * N;
+      const int32_t* A_ids = sorted_ids + mb * BLOCK_M;
+
+      // B shape [IC, n_size] in vnni format
+      int32_t expert_id = expert_ids[mb];
+      const scalar_t* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc;
+
+      // 2.a gemm: C = A @ B
+      if (use_brgemm) {
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C);
+      } else {
+        tinygemm_kernel(
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N);
+      }
+
+      // 2.b copy from C to ic2 in original order
+      //   and also mul topk_weights in float32
+      for (int64_t m = 0; m < m_size; ++m) {
+        int32_t index = A_ids[m];
+        float weight = topk_weights[index];
+        copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 3: out = intermediate_cache2.sum(dim=1)
+  //   from [M, topk, K] to [M, K]
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      sum_stub(output + m * K, ic2 + m * topk * K, topk, K);
+    }
+  });
+}
+
+template <typename scalar_t>
+void shared_expert_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic1,
+    float* __restrict__ C_tmp,
+    scalar_t* __restrict__ input,
+    const scalar_t* __restrict__ packed_w1,
+    const scalar_t* __restrict__ packed_w2,
+    const scalar_t* __restrict__ fused_experts_out,
+    float routed_scaling_factor,
+    int64_t M,
+    int64_t N,
+    int64_t K) {
+  // handle 2 tiles per block
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  // stage 1: intermediate_cache1 = silu(hidden_states @ w1)
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N);
+  const int64_t stride_n = K;
+
+  const bool use_brgemm = can_use_brgemm<scalar_t>(M);
+
+  // here we only parallel on half of 2N to fuse silu_and_mul with gemm
+  parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    float* __restrict__ C0 = C_tmp + tid * 2 * BLOCK_M * BLOCK_N;
+    float* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N;
+
+    loop_2d<scalar_t>(mb0, mb1, nb0, nb1, BLOCK_N * K * 2, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      // nb_upper from top half and nb_lower from bottom half
+      int64_t nb_upper = nb, nb_lower = nb + NB;
+      int64_t n_size = std::min(N - nb * BLOCK_N, BLOCK_N);
+      int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M);
+
+      // A shape [m_size, K]
+      const scalar_t* A = input + mb * BLOCK_M * K;
+
+      // B shape [K, n_size] in vnni format
+      const scalar_t* __restrict__ B0 = packed_w1 + nb_upper * BLOCK_N * stride_n;
+      const scalar_t* __restrict__ B1 = packed_w1 + nb_lower * BLOCK_N * stride_n;
+
+      if (use_brgemm) {
+        // 1.b gemm: C0 = A @ B0
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B0,
+            /* C     */ C0);
+
+        // 1.c gemm: C1 = A @ B1
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B1,
+            /* C     */ C1);
+
+        // 1.d silu and mul
+        silu_and_mul<scalar_t, BLOCK_N>(ic1 + mb * BLOCK_M * N + nb * BLOCK_N, C0, C1, m_size, N);
+      } else {
+        // fused 1.bcd: silu_and_mul(A @ B0, A @ B1)
+        tinygemm_kernel(
+            /* A     */ A,
+            /* B0    */ B0,
+            /* B1    */ B1,
+            /* C     */ ic1 + mb * BLOCK_M * N + nb * BLOCK_N,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ N);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 2: output = intermediate_cache1 @ w2
+  //   w2 : [K, N] as [OC, IC]
+  const int64_t OC = K;  // rename K as OC
+  const int64_t IC = N;  // rename N as IC
+  const int64_t MB2 = MB;
+  const int64_t NB2 = div_up(OC, BLOCK_N);
+  const int64_t stride_oc = IC;
+
+  // parallel on [MB2, NB2]
+  parallel_2d(MB2, NB2, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    // we won't be using C1 for gemm2
+    float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N;
+
+    loop_2d<scalar_t>(mb0, mb1, nb0, nb1, BLOCK_N * IC, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M);
+      int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N);
+
+      // A shape [m_size, IC]
+      const scalar_t* __restrict__ A = ic1 + mb * BLOCK_M * N;
+
+      // B shape [IC, n_size] in vnni format
+      const scalar_t* __restrict__ B = packed_w2 + nb * BLOCK_N * stride_oc;
+
+      // 2.a gemm: C = A @ B
+      if (use_brgemm) {
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C);
+      } else {
+        tinygemm_kernel(
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N);
+      }
+
+      // 2.b copy from C to output and add fused_experts_out
+      scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N;
+      const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N;
+      for (int64_t m = 0; m < m_size; ++m) {
+        add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+}
+
+}  // anonymous namespace
+
+// common checks
+static inline void check_moe_scales(
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    const std::optional<at::Tensor>& w1_scale,
+    const std::optional<at::Tensor>& w2_scale,
+    const std::optional<std::vector<int64_t>> block_size,
+    const std::optional<at::Tensor>& a1_scale,
+    const std::optional<at::Tensor>& a2_scale) {
+  if (use_int8_w8a8) {
+    TORCH_CHECK(w1_scale.has_value(), "missing w1_scale for int8 w8a8.");
+    TORCH_CHECK(w2_scale.has_value(), "missing w2_scale for int8 w8a8.");
+    TORCH_CHECK(!a1_scale.has_value(), "static quantization for activation not supported.");
+    TORCH_CHECK(!a2_scale.has_value(), "static quantization for activation not supported.");
+  }
+  if (use_fp8_w8a16) {
+    TORCH_CHECK(w1_scale.has_value(), "missing w1_scale for fp8 w8a16.");
+    TORCH_CHECK(w2_scale.has_value(), "missing w2_scale for fp8 w8a16.");
+    TORCH_CHECK(block_size.has_value(), "missing block_size for fp8 w8a16.");
+    TORCH_CHECK(block_size.value().size() == 2, "expect block_size.size() to be 2.");
+  }
+}
+
+#define CHECK_MOE_SCALES_FP8(DIM0, DIM1)                      \
+  auto w1s = w1_scale.value();                                \
+  auto w2s = w2_scale.value();                                \
+  auto block_size_val = block_size.value();                   \
+  int64_t block_size_N = block_size_val[0];                   \
+  int64_t block_size_K = block_size_val[1];                   \
+  TORCH_CHECK(w1s.size(DIM0) == div_up(2 * N, block_size_N)); \
+  TORCH_CHECK(w1s.size(DIM1) == div_up(K, block_size_K));     \
+  TORCH_CHECK(w2s.size(DIM0) == div_up(K, block_size_N));     \
+  TORCH_CHECK(w2s.size(DIM1) == div_up(N, block_size_K))
+
+// hidden_states: [M, K]
+// w1: [E, 2N, K]
+// w2: [E, K, N]
+// topk_weights: [M, topk]
+// topk_ids: [M, topk] (int32_t)
+//
+at::Tensor fused_experts_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& w1,
+    at::Tensor& w2,
+    at::Tensor& topk_weights,
+    at::Tensor& topk_ids,
+    bool inplace,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    const std::optional<at::Tensor>& w1_scale,
+    const std::optional<at::Tensor>& w2_scale,
+    const std::optional<std::vector<int64_t>> block_size,
+    const std::optional<at::Tensor>& a1_scale,
+    const std::optional<at::Tensor>& a2_scale,
+    bool is_vnni) {
+  RECORD_FUNCTION(
+      "sgl-kernel::fused_experts_cpu", std::vector<c10::IValue>({hidden_states, w1, w2, topk_weights, topk_ids}));
+
+  auto packed_w1 = is_vnni ? w1 : convert_weight_packed(w1);
+  auto packed_w2 = is_vnni ? w2 : convert_weight_packed(w2);
+
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  const auto st = hidden_states.scalar_type();
+  CHECK_INPUT(hidden_states);
+  CHECK_INPUT(w1);
+  CHECK_INPUT(w2);
+  CHECK_EQ(topk_weights.sizes(), topk_ids.sizes());
+  CHECK_DIM(2, hidden_states);
+  CHECK_DIM(3, w1);
+  CHECK_DIM(3, w2);
+  CHECK_DIM(2, topk_weights);
+  CHECK_DIM(2, topk_ids);
+
+  CHECK_EQ(topk_ids.scalar_type(), at::kInt);
+
+  // TODO: support topk_weights to be bf16 or fp16 in the kernel.
+  // The topk_weights of llama4 is computed via Llama4MoE:custom_routing_function and is bf16/fp16
+  // while the kernel currently only supports it to be float32
+  auto topk_weights_ = topk_weights.to(at::kFloat);
+  CHECK_EQ(topk_weights_.scalar_type(), at::kFloat);
+
+  int64_t M = hidden_states.size(0);
+  int64_t K = hidden_states.size(1);
+  int64_t N = w1.size(1) / 2;
+  int64_t E = w1.size(0);
+  int64_t topk = topk_weights_.size(1);
+
+  // we use int32_t compensation for int8 w8a8
+  int64_t packed_K = get_row_size(K, use_int8_w8a8);
+  int64_t packed_N = get_row_size(N, use_int8_w8a8);
+
+  // check weight shapes
+  CHECK_EQ(w2.size(0), E);
+  CHECK_EQ(w2.size(1), K);
+  CHECK_EQ(packed_w1.size(2), packed_K);
+  CHECK_EQ(packed_w2.size(2), packed_N);
+
+  // check scales
+  check_moe_scales(use_int8_w8a8, use_fp8_w8a16, w1_scale, w2_scale, block_size, a1_scale, a2_scale);
+
+  at::Tensor out_hidden_states = inplace ? hidden_states : at::empty_like(hidden_states);
+
+  // NB: worst case is each expert holds a block with remainder of 1
+  //   1. sorted_ids : [M * topk + E * (BLOCK_M - 1)]
+  //   2. expert_ids : [max_num_blocks]
+  //   3. total_cnts : [T + 1, E]
+  //   4. cumsums    : [E + 1]
+  //   5. offsets    : [max_num_blocks + 1]
+  //
+  int num_threads = at::get_num_threads();
+  int64_t max_num_tokens_padded = M * topk + E * (BLOCK_M - 1);
+  int64_t max_num_blocks = div_up(max_num_tokens_padded, BLOCK_M);
+  auto buffer = at::empty(
+      {max_num_tokens_padded + max_num_blocks + (num_threads + 1) * E + (E + 1) + (max_num_blocks + 1)},
+      topk_ids.options());
+
+  int32_t* __restrict__ sorted_ids = buffer.data_ptr<int32_t>();
+  int32_t* __restrict__ expert_ids = sorted_ids + max_num_tokens_padded;
+  int32_t* __restrict__ total_cnts = expert_ids + max_num_blocks;
+  int32_t* __restrict__ cumsums = total_cnts + (num_threads + 1) * E;
+  int32_t* __restrict__ offsets = cumsums + (E + 1);
+
+  // init sorted_ids with `numel` as the padding number
+  // init expert_ids with `num_experts`
+  int64_t numel = M * topk;
+  at::parallel_for(0, max_num_blocks, GRAIN_SIZE / BLOCK_M, [&](int64_t begin, int64_t end) {
+    int64_t m_start = begin * BLOCK_M;
+    int64_t m_size = std::min((end - begin) * BLOCK_M, max_num_tokens_padded - m_start);
+    fill_stub(sorted_ids + m_start, (int32_t)numel, m_size);
+    fill_stub(expert_ids + begin, (int32_t)E, end - begin);
+  });
+  // zero total_cnts and cumsums
+  at::parallel_for(0, (num_threads + 1) * E + (E + 1), GRAIN_SIZE, [&](int64_t begin, int64_t end) {
+    fill_stub(total_cnts + begin, 0, end - begin);
+  });
+
+  // align experts index
+  int64_t num_tokens_post_pad = moe_align_block_size<BLOCK_M>(
+      sorted_ids, expert_ids, topk_ids.data_ptr<int32_t>(), total_cnts, cumsums, offsets, E, numel, num_threads);
+
+  // unlike triton kernel, we fuse silu with gemm1 so only need 2 intermediate_caches:
+  //   1. intermediate_cache1 : [M * topk, N]
+  //   2. intermediate_cache2 : [M * topk, K]
+  //   3. A_tmp : [T, BLOCK_M * K]
+  //   4. C_tmp : [T, 2 * BLOCK_M * BLOCK_N]
+  //
+  // for int8 w8a8:
+  //   5. Aq_tmp : [M, K] or [M * topk, N]
+  //   6. As_tmp : [M * topk]
+  //
+  // for fp8 w8a16:
+  //   7. intermediate_cache0 : [M * topk, 2N]
+  //   8. B_tmp : [T, MAX_CACHE_BLOCK_SIZE, BLOCK_N, std::max(K, N)]
+  //
+  int64_t buffer_size_nbytes = M * topk * N * 2 + M * topk * K * 2 +
+                               num_threads * BLOCK_M * K * (use_int8_w8a8 ? 1 : 2) +
+                               num_threads * 2 * BLOCK_M * BLOCK_N * sizeof(float);
+
+  if (use_int8_w8a8) {
+    buffer_size_nbytes += std::max(M * K, M * topk * N) + M * topk * sizeof(float);
+  }
+  if (use_fp8_w8a16) {
+    buffer_size_nbytes += M * topk * 2 * N * 2 + num_threads * MAX_CACHE_BLOCK_SIZE * BLOCK_N * std::max(K, N) * 2;
+  }
+
+  auto buffer2 = at::empty({buffer_size_nbytes}, hidden_states.options().dtype(at::kChar));
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "fused_experts_kernel_impl", [&] {
+    scalar_t* __restrict__ intermediate_cache1 = (scalar_t*)((void*)(buffer2.data_ptr<int8_t>()));
+    scalar_t* __restrict__ intermediate_cache2 = intermediate_cache1 + M * topk * N;
+
+    if (use_int8_w8a8) {
+      uint8_t* __restrict__ A_tmp = (uint8_t*)((void*)(intermediate_cache2 + M * topk * K));
+      float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K));
+      uint8_t* __restrict__ Aq_tmp = (uint8_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N));
+      float* __restrict__ As_tmp = (float*)((void*)(Aq_tmp + std::max(M * K, M * topk * N)));
+
+      auto w1s = w1_scale.value();
+      auto w2s = w2_scale.value();
+      TORCH_CHECK(w1s.numel() == E * 2 * N);
+      TORCH_CHECK(w2s.numel() == E * K);
+
+      fused_experts_int8_kernel_impl<scalar_t>(
+          out_hidden_states.data_ptr<scalar_t>(),
+          intermediate_cache1,
+          intermediate_cache2,
+          A_tmp,
+          C_tmp,
+          Aq_tmp,
+          As_tmp,
+          hidden_states.data_ptr<scalar_t>(),
+          packed_w1.data_ptr<int8_t>(),
+          packed_w2.data_ptr<int8_t>(),
+          w1s.data_ptr<float>(),
+          w2s.data_ptr<float>(),
+          topk_weights_.data_ptr<float>(),
+          sorted_ids,
+          expert_ids,
+          offsets,
+          M,
+          N,
+          K,
+          E,
+          topk,
+          num_tokens_post_pad);
+    } else if (use_fp8_w8a16) {
+      // here we just ignore C_tmp as it is not used
+      scalar_t* __restrict__ A_tmp = (scalar_t*)((void*)(intermediate_cache2 + M * topk * K));
+      float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K));
+      scalar_t* __restrict__ intermediate_cache0 = (scalar_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N));
+      scalar_t* __restrict__ B_tmp = (scalar_t*)((void*)(intermediate_cache0 + M * topk * 2 * N));
+
+      CHECK_MOE_SCALES_FP8(1, 2);
+      fused_experts_fp8_kernel_impl(
+          out_hidden_states.data_ptr<scalar_t>(),
+          intermediate_cache0,
+          intermediate_cache1,
+          intermediate_cache2,
+          A_tmp,
+          B_tmp,
+          C_tmp,
+          hidden_states.data_ptr<scalar_t>(),
+          packed_w1.data_ptr<at::Float8_e4m3fn>(),
+          packed_w2.data_ptr<at::Float8_e4m3fn>(),
+          w1s.data_ptr<float>(),
+          w2s.data_ptr<float>(),
+          block_size_N,
+          block_size_K,
+          topk_weights_.data_ptr<float>(),
+          sorted_ids,
+          expert_ids,
+          offsets,
+          M,
+          N,
+          K,
+          E,
+          topk,
+          num_tokens_post_pad);
+    } else {
+      scalar_t* __restrict__ A_tmp = intermediate_cache2 + M * topk * K;
+      float* __restrict__ C_tmp = (float*)((void*)(A_tmp + num_threads * BLOCK_M * K));
+
+      fused_experts_kernel_impl<scalar_t>(
+          out_hidden_states.data_ptr<scalar_t>(),
+          intermediate_cache1,
+          intermediate_cache2,
+          A_tmp,
+          C_tmp,
+          hidden_states.data_ptr<scalar_t>(),
+          packed_w1.data_ptr<scalar_t>(),
+          packed_w2.data_ptr<scalar_t>(),
+          topk_weights_.data_ptr<float>(),
+          sorted_ids,
+          expert_ids,
+          offsets,
+          M,
+          N,
+          K,
+          E,
+          topk,
+          num_tokens_post_pad);
+    }
+  });
+  return out_hidden_states;
+}
+
+// shared expert kernel
+//
+// hidden_states: [M, K]
+// w1: [2N, K]
+// w2: [K, N]
+// fused_experts_out
+at::Tensor shared_expert_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& w1,
+    at::Tensor& w2,
+    at::Tensor& fused_experts_out,
+    double routed_scaling_factor,
+    bool inplace,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    const std::optional<at::Tensor>& w1_scale,
+    const std::optional<at::Tensor>& w2_scale,
+    const std::optional<std::vector<int64_t>> block_size,
+    const std::optional<at::Tensor>& a1_scale,
+    const std::optional<at::Tensor>& a2_scale,
+    bool is_vnni) {
+  RECORD_FUNCTION("sgl-kernel::shared_expert_cpu", std::vector<c10::IValue>({hidden_states, w1, w2}));
+
+  auto packed_w1 = is_vnni ? w1 : convert_weight_packed(w1);
+  auto packed_w2 = is_vnni ? w2 : convert_weight_packed(w2);
+
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  const auto st = hidden_states.scalar_type();
+  CHECK_INPUT(hidden_states);
+  CHECK_INPUT(fused_experts_out);
+  CHECK_INPUT(w1);
+  CHECK_INPUT(w2);
+  CHECK_DIM(2, hidden_states);
+  CHECK_DIM(2, w1);
+  CHECK_DIM(2, w2);
+  CHECK_EQ(hidden_states.sizes(), fused_experts_out.sizes());
+  CHECK_EQ(hidden_states.scalar_type(), st);
+
+  int64_t M = hidden_states.size(0);
+  int64_t K = hidden_states.size(1);
+  int64_t N = w1.size(0) / 2;
+
+  // we use int32_t compensation for int8 w8a8
+  int64_t packed_K = get_row_size(K, use_int8_w8a8);
+  int64_t packed_N = get_row_size(N, use_int8_w8a8);
+
+  // check weight shapes
+  CHECK_EQ(w2.size(0), K);
+  CHECK_EQ(packed_w1.size(1), packed_K);
+  CHECK_EQ(packed_w2.size(1), packed_N);
+
+  // check scales
+  check_moe_scales(use_int8_w8a8, use_fp8_w8a16, w1_scale, w2_scale, block_size, a1_scale, a2_scale);
+
+  at::Tensor out_hidden_states = inplace ? hidden_states : at::empty_like(hidden_states);
+
+  // unlike triton kernel, we fuse silu with gemm1 so only need 2 intermediate_caches:
+  //   1. intermediate_cache1 : [M, N]
+  //   2. C_tmp : [T, 2 * BLOCK_M * BLOCK_N]
+  //
+  // for int8 w8a8:
+  //   3. Aq_tmp : [M, K] or [M, N]
+  //   4. As_tmp : [M]
+  //
+  // for fp8 w8a16:
+  //   5. intermediate_cache0 : [M, 2N]
+  //   6. B_tmp: [T, MAX_CACHE_BLOCK_SIZE, BLOCK_M, max(K, N)]
+  //
+  int num_threads = at::get_num_threads();
+  int64_t buffer_size_nbytes = M * N * 2 + num_threads * 2 * BLOCK_M * BLOCK_N * sizeof(float);
+
+  if (use_int8_w8a8) {
+    buffer_size_nbytes += std::max(M * K, M * N) + M * sizeof(float);
+  }
+  if (use_fp8_w8a16) {
+    buffer_size_nbytes += M * 2 * N * 2 + num_threads * MAX_CACHE_BLOCK_SIZE * BLOCK_M * std::max(K, N) * 2;
+  }
+
+  auto buffer = at::empty({buffer_size_nbytes}, hidden_states.options().dtype(at::kChar));
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "share_experts_kernel_impl", [&] {
+    scalar_t* __restrict__ intermediate_cache1 = (scalar_t*)((void*)(buffer.data_ptr<int8_t>()));
+    float* __restrict__ C_tmp = (float*)((void*)(intermediate_cache1 + M * N));
+
+    if (use_int8_w8a8) {
+      uint8_t* __restrict__ Aq_tmp = (uint8_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N));
+      float* __restrict__ As_tmp = (float*)((void*)(Aq_tmp + std::max(M * K, M * N)));
+
+      auto w1s = w1_scale.value();
+      auto w2s = w2_scale.value();
+      TORCH_CHECK(w1s.numel() == 2 * N);
+      TORCH_CHECK(w2s.numel() == K);
+
+      shared_expert_int8_kernel_impl<scalar_t>(
+          out_hidden_states.data_ptr<scalar_t>(),
+          intermediate_cache1,
+          C_tmp,
+          Aq_tmp,
+          As_tmp,
+          hidden_states.data_ptr<scalar_t>(),
+          packed_w1.data_ptr<int8_t>(),
+          packed_w2.data_ptr<int8_t>(),
+          w1s.data_ptr<float>(),
+          w2s.data_ptr<float>(),
+          fused_experts_out.data_ptr<scalar_t>(),
+          routed_scaling_factor,
+          M,
+          N,
+          K);
+    } else if (use_fp8_w8a16) {
+      scalar_t* __restrict__ intermediate_cache0 = (scalar_t*)((void*)(C_tmp + num_threads * 2 * BLOCK_M * BLOCK_N));
+      scalar_t* __restrict__ B_tmp = (scalar_t*)((void*)(intermediate_cache0 + M * 2 * N));
+
+      CHECK_MOE_SCALES_FP8(0, 1);
+      shared_expert_fp8_kernel_impl<scalar_t>(
+          out_hidden_states.data_ptr<scalar_t>(),
+          intermediate_cache0,
+          intermediate_cache1,
+          B_tmp,
+          C_tmp,
+          hidden_states.data_ptr<scalar_t>(),
+          packed_w1.data_ptr<at::Float8_e4m3fn>(),
+          packed_w2.data_ptr<at::Float8_e4m3fn>(),
+          w1s.data_ptr<float>(),
+          w2s.data_ptr<float>(),
+          block_size_N,
+          block_size_K,
+          fused_experts_out.data_ptr<scalar_t>(),
+          routed_scaling_factor,
+          M,
+          N,
+          K);
+    } else {
+      shared_expert_kernel_impl<scalar_t>(
+          out_hidden_states.data_ptr<scalar_t>(),
+          intermediate_cache1,
+          C_tmp,
+          hidden_states.data_ptr<scalar_t>(),
+          packed_w1.data_ptr<scalar_t>(),
+          packed_w2.data_ptr<scalar_t>(),
+          fused_experts_out.data_ptr<scalar_t>(),
+          routed_scaling_factor,
+          M,
+          N,
+          K);
+    }
+  });
+  return out_hidden_states;
+}
diff --git a/sgl-kernel/csrc/cpu/moe_fp8.cpp b/sgl-kernel/csrc/cpu/moe_fp8.cpp
new file mode 100644
index 000000000..281c00897
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/moe_fp8.cpp
@@ -0,0 +1,491 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) {
+  using Vec = at::vec::Vectorized<scalar_t>;
+// no remainder
+#pragma GCC unroll 4
+  for (int64_t d = 0; d < size; d += Vec::size()) {
+    Vec data = Vec::loadu(input + d);
+    data.store(out + d);
+  }
+}
+
+template <typename scalar_t>
+inline void copy_mul_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, float weight, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec weight_vec = fVec(weight);
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    bVec x = bVec::loadu(input + d);
+    fVec x0, x1;
+    std::tie(x0, x1) = at::vec::convert_to_float(x);
+    x0 = x0 * weight_vec;
+    x1 = x1 * weight_vec;
+    bVec out_vec = convert_from_float_ext<scalar_t>(x0, x1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] * weight);
+  }
+}
+
+// acc from [topk, K] to [K]
+template <typename scalar_t>
+inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  if (topk == 1) {
+    // do copy for topk = 1
+    copy_stub(out, input, K);
+  } else {
+    // do sum for topk != 1
+    int64_t d;
+#pragma GCC unroll 4
+    for (d = 0; d <= K - kVecSize; d += kVecSize) {
+      fVec sum_fvec0 = fVec(0.f);
+      fVec sum_fvec1 = fVec(0.f);
+      for (int t = 0; t < topk; ++t) {
+        bVec x_bvec = bVec::loadu(input + t * K + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        sum_fvec0 += x_fvec0;
+        sum_fvec1 += x_fvec1;
+      }
+      bVec out_bvec = convert_from_float_ext<scalar_t>(sum_fvec0, sum_fvec1);
+      out_bvec.store(out + d);
+    }
+    for (; d < K; ++d) {
+      float sum_val = 0.f;
+      for (int t = 0; t < topk; ++t) {
+        sum_val += static_cast<float>(input[t * K + d]);
+      }
+      out[d] = static_cast<scalar_t>(sum_val);
+    }
+  }
+}
+
+// out = input + input2 * scale
+template <typename scalar_t>
+inline void add_mul_stub(
+    scalar_t* __restrict__ out,
+    const scalar_t* __restrict__ input,
+    const scalar_t* __restrict__ input2,
+    float scale,
+    int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec s_vec = fVec(scale);
+
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    bVec x_bvec = bVec::loadu(input + d);
+    fVec x0, x1;
+    std::tie(x0, x1) = at::vec::convert_to_float(x_bvec);
+
+    bVec y_bvec = bVec::loadu(input2 + d);
+    fVec y0, y1;
+    std::tie(y0, y1) = at::vec::convert_to_float(y_bvec);
+
+    x0 = x0 + y0 * s_vec;
+    x1 = x1 + y1 * s_vec;
+    bVec out_vec = convert_from_float_ext<scalar_t>(x0, x1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] + float(input2[d]) * scale);
+  }
+}
+
+template <typename scalar_t>
+inline void silu_and_mul_stub(
+    scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const scalar_t* __restrict__ input2, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  const fVec one = fVec(1.f);
+
+  // no remainder
+#pragma GCC unroll 4
+  for (int64_t d = 0; d < size; d += bVec::size()) {
+    bVec x = bVec::loadu(input + d);
+    fVec x0, x1;
+    std::tie(x0, x1) = at::vec::convert_to_float(x);
+    bVec y = bVec::loadu(input2 + d);
+    fVec y0, y1;
+    std::tie(y0, y1) = at::vec::convert_to_float(y);
+    x0 = x0 / (one + x0.neg().exp_u20());
+    x1 = x1 / (one + x1.neg().exp_u20());
+    x0 = x0 * y0;
+    x1 = x1 * y1;
+    bVec out_vec = convert_from_float_ext<scalar_t>(x0, x1);
+    out_vec.store(out + d);
+  }
+}
+
+}  // anonymous namespace
+
+template <typename scalar_t>
+void fused_experts_fp8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic0,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ ic2,
+    scalar_t* __restrict__ A_tmp,
+    scalar_t* __restrict__ B_tmp,
+    float* __restrict__ C_tmp,
+    const scalar_t* __restrict__ input,
+    const at::Float8_e4m3fn* __restrict__ packed_w1,
+    const at::Float8_e4m3fn* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    int64_t block_size_N,
+    int64_t block_size_K,
+    const float* __restrict__ topk_weights,
+    const int32_t* __restrict__ sorted_ids,
+    const int32_t* __restrict__ expert_ids,
+    const int32_t* __restrict__ offsets,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t E,
+    int64_t topk,
+    int64_t num_tokens_post_pad) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  // stage 1: intermediate_cache0 = hidden_states @ w1
+  const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M);
+  const int64_t NB = div_up(2 * N, BLOCK_N);
+  int64_t scale_size_N = div_up(2 * N, block_size_N);
+  int64_t scale_size_K = div_up(K, block_size_K);
+  int64_t blocks_n_per_group = block_size_N / BLOCK_N;
+
+  const int64_t stride_e = 2 * N * K;
+  const int64_t stride_n = K;
+
+  int64_t avg_M = std::max(int64_t(1), M * topk / E);
+  const bool use_brgemm = can_use_brgemm<at::Float8_e4m3fn>(avg_M);
+
+  int64_t B_tmp_size_per_thread = MAX_CACHE_BLOCK_SIZE * BLOCK_N * std::max(K, N);
+
+  // here we only parallel on half of 2N to fuse silu_and_mul with gemm
+  parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    scalar_t* __restrict__ A = A_tmp + tid * BLOCK_M * K;
+
+    loop_2d<at::Float8_e4m3fn>(mb0, mb1, nb0, nb1, BLOCK_N * K, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t n_size = std::min(2 * N - nb * BLOCK_N, BLOCK_N);
+
+      // B shape [K, n_size] in vnni format
+      int32_t expert_id = expert_ids[mb];
+      const at::Float8_e4m3fn* __restrict__ B = packed_w1 + expert_id * stride_e + nb * BLOCK_N * stride_n;
+      const float* __restrict__ Bs =
+          w1s + expert_id * scale_size_N * scale_size_K + (nb / blocks_n_per_group) * scale_size_K;
+
+      // do unpacking for the first row or a new expert
+      int32_t pre_expert_id = mb == 0 ? -1 : expert_ids[mb - 1];
+      bool do_unpack = (mb == mb0) || (expert_id != pre_expert_id);
+
+      // 1.a load A
+      const int32_t* A_ids = sorted_ids + mb * BLOCK_M;
+      int64_t m_size = offsets[mb + 1] - offsets[mb];
+
+      for (int64_t m = 0; m < m_size; ++m) {
+        int32_t index = A_ids[m] / topk;
+        copy_stub(A + m * K, input + index * K, K);
+      }
+
+      const int64_t offset = offsets[mb];
+      tinygemm_kernel<scalar_t>(
+          /*   A            */ A,
+          /*   B            */ B,
+          /*   C            */ ic0 + offset * 2 * N + nb * BLOCK_N,
+          /*   Btmp         */ B_tmp + tid * B_tmp_size_per_thread + nb_offset * BLOCK_N * K,
+          /*   Ctmp         */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N,
+          /*   scale        */ Bs,
+          /*   M            */ m_size,
+          /*   N            */ n_size,
+          /*   K            */ K,
+          /*   lda          */ K,
+          /*   ldb          */ n_size,
+          /*   ldc          */ 2 * N,
+          /*   brg          */ use_brgemm,
+          /*   block_size_K */ block_size_K,
+          /*   do_unpack    */ do_unpack);
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 1.5: intermediate_cache1 = silu(intermediate_cache0)
+  at::parallel_for(0, M * topk, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      silu_and_mul_stub(ic1 + m * N, ic0 + m * 2 * N, ic0 + m * 2 * N + N, N);
+    }
+  });
+
+  // stage 2: intermediate_cache2 = intermediate_cache1 @ w2
+  //   w2 : [E, K, N] as [E, OC, IC]
+  const int64_t OC = K;  // rename K as OC
+  const int64_t IC = N;  // rename N as IC
+  const int64_t MB2 = MB;
+  const int64_t NB2 = div_up(OC, BLOCK_N);
+  scale_size_N = div_up(K, block_size_N);
+  scale_size_K = div_up(N, block_size_K);
+  const int64_t stride_e2 = OC * IC;
+  const int64_t stride_oc = IC;
+
+  // parallel on [MB2, NB2]
+  parallel_2d(MB2, NB2, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    int tid = get_thread_num();
+    alignas(64) scalar_t C[BLOCK_M * BLOCK_K];
+
+    loop_2d<at::Float8_e4m3fn>(mb0, mb1, nb0, nb1, BLOCK_N * IC, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t m_size = offsets[mb + 1] - offsets[mb];
+      int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N);
+
+      // A ptr from ic1 of [M * topk, N] in sorted order
+      // so as to avoid copy A to tmp buffer again
+      const scalar_t* __restrict__ A = ic1 + offsets[mb] * N;
+      const int32_t* A_ids = sorted_ids + mb * BLOCK_M;
+
+      // B shape [IC, n_size] in vnni format
+      int32_t expert_id = expert_ids[mb];
+      const at::Float8_e4m3fn* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc;
+      const float* __restrict__ Bs =
+          w2s + expert_id * scale_size_N * scale_size_K + (nb / blocks_n_per_group) * scale_size_K;
+
+      // do unpacking for the first row or a new expert
+      int32_t pre_expert_id = mb == 0 ? -1 : expert_ids[mb - 1];
+      bool do_unpack = (mb == mb0) || (expert_id != pre_expert_id);
+
+      tinygemm_kernel<scalar_t>(
+          /*   A            */ A,
+          /*   B            */ B,
+          /*   C            */ C,
+          /*   Btmp         */ B_tmp + tid * B_tmp_size_per_thread + nb_offset * BLOCK_N * IC,
+          /*   Ctmp         */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N,
+          /*   scale        */ Bs,
+          /*   M            */ m_size,
+          /*   N            */ n_size,
+          /*   K            */ IC,
+          /*   lda          */ IC,
+          /*   ldb          */ n_size,
+          /*   ldc          */ BLOCK_N,
+          /*   brg          */ use_brgemm,
+          /*   block_size_K */ block_size_K,
+          /*   do_unpack    */ do_unpack);
+
+      // 2.b copy from C to ic2 in original order
+      //   and also mul topk_weights in float32
+      for (int64_t m = 0; m < m_size; ++m) {
+        int32_t index = A_ids[m];
+        float weight = topk_weights[index];
+        copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 3: out = intermediate_cache2.sum(dim=1)
+  //   from [M, topk, K] to [M, K]
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      sum_stub(output + m * K, ic2 + m * topk * K, topk, K);
+    }
+  });
+}
+
+#define INSTANTIATE_MOE_FP8_TEMPLATE(TYPE)             \
+  template void fused_experts_fp8_kernel_impl<TYPE>(   \
+      TYPE* __restrict__ output,                       \
+      TYPE* __restrict__ ic0,                          \
+      TYPE* __restrict__ ic1,                          \
+      TYPE* __restrict__ ic2,                          \
+      TYPE* __restrict__ A_tmp,                        \
+      TYPE* __restrict__ B_tmp,                        \
+      float* __restrict__ C_tmp,                       \
+      const TYPE* __restrict__ input,                  \
+      const at::Float8_e4m3fn* __restrict__ packed_w1, \
+      const at::Float8_e4m3fn* __restrict__ packed_w2, \
+      const float* __restrict__ w1s,                   \
+      const float* __restrict__ w2s,                   \
+      int64_t block_size_N,                            \
+      int64_t block_size_K,                            \
+      const float* __restrict__ topk_weights,          \
+      const int32_t* __restrict__ sorted_ids,          \
+      const int32_t* __restrict__ expert_ids,          \
+      const int32_t* __restrict__ offsets,             \
+      int64_t M,                                       \
+      int64_t N,                                       \
+      int64_t K,                                       \
+      int64_t E,                                       \
+      int64_t topk,                                    \
+      int64_t num_tokens_post_pad)
+
+INSTANTIATE_MOE_FP8_TEMPLATE(at::BFloat16);
+INSTANTIATE_MOE_FP8_TEMPLATE(at::Half);
+
+template <typename scalar_t>
+void shared_expert_fp8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic0,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ B_tmp,
+    float* __restrict__ C_tmp,
+    const scalar_t* __restrict__ input,
+    const at::Float8_e4m3fn* __restrict__ packed_w1,
+    const at::Float8_e4m3fn* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    int64_t block_size_N,
+    int64_t block_size_K,
+    const scalar_t* __restrict__ fused_experts_out,
+    float routed_scaling_factor,
+    int64_t M,
+    int64_t N,
+    int64_t K) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  // stage 1: intermediate_cache0 = hidden_states @ w1
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(2 * N, BLOCK_N);
+  int64_t scale_size_K = div_up(K, block_size_K);
+  int64_t blocks_n_per_group = block_size_N / BLOCK_N;
+
+  const bool use_brgemm = can_use_brgemm<at::Float8_e4m3fn>(M);
+
+  int64_t B_tmp_size_per_thread = MAX_CACHE_BLOCK_SIZE * BLOCK_N * std::max(K, N);
+
+  parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    int tid = get_thread_num();
+
+    loop_2d<at::Float8_e4m3fn>(mb0, mb1, nb0, nb1, BLOCK_N * K, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M);
+      int64_t n_size = std::min(2 * N - nb * BLOCK_N, BLOCK_N);
+
+      // do unpacking for the first row
+      bool do_unpack = (mb == mb0);
+
+      tinygemm_kernel<scalar_t>(
+          /*   A            */ input + mb * BLOCK_M * K,
+          /*   B            */ packed_w1 + nb * BLOCK_N * K,
+          /*   C            */ ic0 + mb * BLOCK_M * 2 * N + nb * BLOCK_N,
+          /*   Btmp         */ B_tmp + tid * B_tmp_size_per_thread + nb_offset * BLOCK_N * K,
+          /*   Ctmp         */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N,
+          /*   scale        */ w1s + (nb / blocks_n_per_group) * scale_size_K,
+          /*   M            */ m_size,
+          /*   N            */ n_size,
+          /*   K            */ K,
+          /*   lda          */ K,
+          /*   ldb          */ n_size,
+          /*   ldc          */ 2 * N,
+          /*   brg          */ use_brgemm,
+          /*   block_size_K */ block_size_K,
+          /*   do_unpack    */ do_unpack);
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 1.5: intermediate_cache1 = silu(intermediate_cache0)
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      silu_and_mul_stub(ic1 + m * N, ic0 + m * 2 * N, ic0 + m * 2 * N + N, N);
+    }
+  });
+
+  // stage 2: intermediate_cache2 = intermediate_cache1 @ w2
+  //   w2 : [K, N] as [OC, IC]
+  const int64_t OC = K;  // rename K as OC
+  const int64_t IC = N;  // rename N as IC
+  const int64_t MB2 = MB;
+  const int64_t NB2 = div_up(K, BLOCK_N);
+  scale_size_K = div_up(N, block_size_K);
+
+  // parallel on [MB2, NB2]
+  parallel_2d(MB2, NB2, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    int tid = get_thread_num();
+    alignas(64) scalar_t C[BLOCK_M * BLOCK_K];
+
+    loop_2d<at::Float8_e4m3fn>(mb0, mb1, nb0, nb1, BLOCK_N * IC, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M);
+      int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N);
+
+      // do unpacking for the first row
+      bool do_unpack = (mb == mb0);
+
+      // 2.a gemm: C = A @ B
+      tinygemm_kernel<scalar_t>(
+          /*   A            */ ic1 + mb * BLOCK_M * N,
+          /*   B            */ packed_w2 + nb * BLOCK_N * N,
+          /*   C            */ C,
+          /*   Btmp         */ B_tmp + tid * B_tmp_size_per_thread + nb_offset * BLOCK_N * IC,
+          /*   Ctmp         */ C_tmp + tid * 2 * BLOCK_M * BLOCK_N,
+          /*   scale        */ w2s + (nb / blocks_n_per_group) * scale_size_K,
+          /*   M            */ m_size,
+          /*   N            */ n_size,
+          /*   K            */ IC,
+          /*   lda          */ IC,
+          /*   ldb          */ n_size,
+          /*   ldc          */ BLOCK_N,
+          /*   brg          */ use_brgemm,
+          /*   block_size_K */ block_size_K,
+          /*   do_unpack    */ do_unpack);
+
+      // 2.b copy from C to output and add fused_experts_out
+      scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N;
+      const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N;
+      for (int64_t m = 0; m < m_size; ++m) {
+        add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size);
+      }
+    });
+  });
+
+  if (use_brgemm) {
+    at::native::cpublas::brgemm_release();
+  }
+}
+
+#define INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(TYPE)   \
+  template void shared_expert_fp8_kernel_impl<TYPE>(   \
+      TYPE* __restrict__ output,                       \
+      TYPE* __restrict__ ic0,                          \
+      TYPE* __restrict__ ic1,                          \
+      TYPE* __restrict__ B_tmp,                        \
+      float* __restrict__ C_tmp,                       \
+      const TYPE* __restrict__ input,                  \
+      const at::Float8_e4m3fn* __restrict__ packed_w1, \
+      const at::Float8_e4m3fn* __restrict__ packed_w2, \
+      const float* __restrict__ w1s,                   \
+      const float* __restrict__ w2s,                   \
+      int64_t block_size_N,                            \
+      int64_t block_size_K,                            \
+      const TYPE* __restrict__ fused_experts_out,      \
+      float routed_scaling_factor,                     \
+      int64_t M,                                       \
+      int64_t N,                                       \
+      int64_t K)
+
+INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(at::BFloat16);
+INSTANTIATE_SHARED_EXPERT_FP8_TEMPLATE(at::Half);
diff --git a/sgl-kernel/csrc/cpu/moe_int8.cpp b/sgl-kernel/csrc/cpu/moe_int8.cpp
new file mode 100644
index 000000000..8fbac902f
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/moe_int8.cpp
@@ -0,0 +1,1068 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t>
+inline void copy_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t size) {
+  using Vec = at::vec::Vectorized<scalar_t>;
+// no remainder
+#pragma GCC unroll 4
+  for (int64_t d = 0; d < size; d += Vec::size()) {
+    Vec data = Vec::loadu(input + d);
+    data.store(out + d);
+  }
+}
+
+template <>
+inline void copy_stub<uint8_t>(uint8_t* __restrict__ out, const uint8_t* __restrict__ input, int64_t size) {
+  // size might be 64x + 32
+  std::memcpy(out, input, size * sizeof(uint8_t));
+}
+
+template <typename scalar_t>
+inline void copy_mul_stub(scalar_t* __restrict__ out, const float* __restrict__ input, float weight, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec weight_vec = fVec(weight);
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec data0 = fVec::loadu(input + d) * weight_vec;
+    fVec data1 = fVec::loadu(input + d + fVec::size()) * weight_vec;
+    bVec out_vec = convert_from_float_ext<scalar_t>(data0, data1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] * weight);
+  }
+}
+
+// acc from [topk, K] to [K]
+template <typename scalar_t>
+inline void sum_stub(scalar_t* __restrict__ out, const scalar_t* __restrict__ input, int64_t topk, int64_t K) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  if (topk == 1) {
+    // do copy for topk = 1
+    copy_stub(out, input, K);
+  } else {
+    // do sum for topk != 1
+    int64_t d;
+#pragma GCC unroll 4
+    for (d = 0; d <= K - kVecSize; d += kVecSize) {
+      fVec sum_fvec0 = fVec(0.f);
+      fVec sum_fvec1 = fVec(0.f);
+      for (int t = 0; t < topk; ++t) {
+        bVec x_bvec = bVec::loadu(input + t * K + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        sum_fvec0 += x_fvec0;
+        sum_fvec1 += x_fvec1;
+      }
+      bVec out_bvec = convert_from_float_ext<scalar_t>(sum_fvec0, sum_fvec1);
+      out_bvec.store(out + d);
+    }
+    for (; d < K; ++d) {
+      float sum_val = 0.f;
+      for (int t = 0; t < topk; ++t) {
+        sum_val += static_cast<float>(input[t * K + d]);
+      }
+      out[d] = static_cast<scalar_t>(sum_val);
+    }
+  }
+}
+
+// out = input + input2 * scale
+template <typename scalar_t>
+inline void add_mul_stub(
+    scalar_t* __restrict__ out,
+    const float* __restrict__ input,
+    const scalar_t* __restrict__ input2,
+    float scale,
+    int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int kVecSize = bVec::size();
+  const fVec s_vec = fVec(scale);
+  int64_t d;
+#pragma GCC unroll 4
+  for (d = 0; d <= size - kVecSize; d += kVecSize) {
+    fVec x0 = fVec::loadu(input + d);
+    fVec x1 = fVec::loadu(input + d + fVec::size());
+
+    bVec y_bvec = bVec::loadu(input2 + d);
+    fVec y0, y1;
+    std::tie(y0, y1) = at::vec::convert_to_float(y_bvec);
+
+    x0 = x0 + y0 * s_vec;
+    x1 = x1 + y1 * s_vec;
+    bVec out_vec = convert_from_float_ext<scalar_t>(x0, x1);
+    out_vec.store(out + d);
+  }
+  for (; d < size; ++d) {
+    out[d] = static_cast<scalar_t>(input[d] + float(input2[d]) * scale);
+  }
+}
+
+template <typename scalar_t, int BLOCK_N>
+inline void silu_and_mul(
+    scalar_t* __restrict__ C,
+    const int32_t* __restrict__ C0,  // x: x0, x1
+    const int32_t* __restrict__ C1,  // y: y0, y1
+    const float* __restrict__ As,
+    const float* __restrict__ Bs0,
+    const float* __restrict__ Bs1,
+    const int32_t* __restrict__ Bcomp0,
+    const int32_t* __restrict__ Bcomp1,
+    int64_t m_size,
+    int64_t N) {
+#if defined(CPU_CAPABILITY_AVX512)
+  constexpr int COLS = BLOCK_N / 16;
+  static_assert(COLS % 2 == 0);
+
+  __m512 vc0[COLS];
+  __m512 vc1[COLS];
+  __m512i vcomp0[COLS];
+  __m512i vcomp1[COLS];
+  __m512 vas;
+  __m512 vbs0[COLS];
+  __m512 vbs1[COLS];
+
+  auto load_scale_and_comp = [&](auto col) {
+    vcomp0[col] = _mm512_loadu_si512(Bcomp0 + col * 16);
+    vcomp1[col] = _mm512_loadu_si512(Bcomp1 + col * 16);
+    vbs0[col] = _mm512_loadu_ps(Bs0 + col * 16);
+    vbs1[col] = _mm512_loadu_ps(Bs1 + col * 16);
+  };
+  Unroll<COLS>{}(load_scale_and_comp);
+
+  auto scalec = [&](auto col, int64_t m) {
+    // update As
+    vas = _mm512_set1_ps(As[m]);
+    // C = As * (C - Bcomp) * Bs
+    __m512i vc32_0 = _mm512_loadu_si512(C0 + m * BLOCK_N + col * 16);
+    __m512i vc32_1 = _mm512_loadu_si512(C1 + m * BLOCK_N + col * 16);
+    vc0[col] = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc32_0, vcomp0[col]));
+    vc1[col] = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc32_1, vcomp1[col]));
+    vc0[col] = _mm512_mul_ps(_mm512_mul_ps(vc0[col], vas), vbs0[col]);
+    vc1[col] = _mm512_mul_ps(_mm512_mul_ps(vc1[col], vas), vbs1[col]);
+  };
+
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  const fVec one = fVec(1.f);
+  auto silu_and_mul = [&](auto col) {
+    fVec x = fVec(vc0[col]);
+    fVec y = fVec(vc1[col]);
+    x = x / (one + x.neg().exp_u20());
+    vc0[col] = x * y;
+  };
+
+  auto storec = [&](auto col, int64_t m) {
+    if constexpr (col % 2 == 0) {
+      fVec x0 = fVec(vc0[col + 0]);
+      fVec x1 = fVec(vc0[col + 1]);
+      bVec out_vec = convert_from_float_ext<scalar_t>(x0, x1);
+      out_vec.store(C + m * N + col * 16);
+    }
+  };
+
+  for (int64_t m = 0; m < m_size; ++m) {
+    Unroll<COLS>{}(scalec, m);
+    Unroll<COLS>{}(silu_and_mul);
+    Unroll<COLS>{}(storec, m);
+  }
+#else
+  TORCH_CHECK(false, "silu_and_mul: scalar path not implemented!");
+#endif
+}
+
+template <int BLOCK_N>
+inline void scale_C(
+    float* __restrict__ C,
+    const int32_t* __restrict__ Ctmp,
+    const float* __restrict__ As,
+    const float* __restrict__ Bs,
+    const int32_t* __restrict__ Bcomp,
+    int64_t m_size) {
+#if defined(CPU_CAPABILITY_AVX512)
+  constexpr int COLS = BLOCK_N / 16;
+  static_assert(COLS % 2 == 0);
+
+  __m512 vc[COLS];
+  __m512i vcomp[COLS];
+  __m512 vas;
+  __m512 vbs[COLS];
+
+  auto load_scale_and_comp = [&](auto col) {
+    vcomp[col] = _mm512_loadu_si512(Bcomp + col * 16);
+    vbs[col] = _mm512_loadu_ps(Bs + col * 16);
+  };
+  Unroll<COLS>{}(load_scale_and_comp);
+
+  auto scalec = [&](auto col, int64_t m) {
+    // update As
+    vas = _mm512_set1_ps(As[m]);
+    // C = As * (C - Bcomp) * Bs
+    __m512i vc32 = _mm512_loadu_si512(Ctmp + m * BLOCK_N + col * 16);
+    vc[col] = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc32, vcomp[col]));
+    vc[col] = _mm512_mul_ps(_mm512_mul_ps(vc[col], vas), vbs[col]);
+    _mm512_storeu_ps(C + m * BLOCK_N + col * 16, vc[col]);
+  };
+
+  for (int64_t m = 0; m < m_size; ++m) {
+    Unroll<COLS>{}(scalec, m);
+  }
+#else
+  TORCH_CHECK(false, "scale_C: scalar path not implemented!");
+#endif
+}
+
+/// gemm for w13
+template <typename scalar_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_vnni {
+  static inline void apply(
+      const uint8_t* __restrict__ A,
+      const int8_t* __restrict__ B0,
+      const int8_t* __restrict__ B1,
+      scalar_t* __restrict__ C,
+      const float* __restrict__ As,
+      const float* __restrict__ Bs0,
+      const float* __restrict__ Bs1,
+      const int32_t* __restrict__ Bcomp0,
+      const int32_t* __restrict__ Bcomp1,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_vnni<at::BFloat16, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const uint8_t* __restrict__ A,
+      const int8_t* __restrict__ B0,
+      const int8_t* __restrict__ B1,
+      at::BFloat16* __restrict__ C,
+      const float* __restrict__ As,
+      const float* __restrict__ Bs0,
+      const float* __restrict__ Bs1,
+      const int32_t* __restrict__ Bcomp0,
+      const int32_t* __restrict__ Bcomp1,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+    static_assert(COLS % 2 == 0);
+
+    __m512i va;
+    __m512i vb0[COLS];
+    __m512i vb1[COLS];
+    __m512i vc0[ROWS * COLS];
+    __m512i vc1[ROWS * COLS];
+    __m512i vcomp0[COLS];
+    __m512i vcomp1[COLS];
+    __m512 vas;
+    __m512 vbs0[COLS];
+    __m512 vbs1[COLS];
+
+    auto loadc = [&](auto i) {
+      vc0[i] = _mm512_set1_epi32(0);
+      vc1[i] = _mm512_set1_epi32(0);
+    };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    const int64_t K4 = K >> 2;
+    const int64_t lda4 = lda >> 2;
+    const int64_t ldb4 = ldb;  // ldb * 4 >> 2;
+    const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A);
+    const int32_t* b0_ptr = reinterpret_cast<const int32_t*>(B0);
+    const int32_t* b1_ptr = reinterpret_cast<const int32_t*>(B1);
+
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = _mm512_set1_epi32(a_ptr[row * lda4 + k]);
+      }
+      if constexpr (row == 0) {
+        vb0[col] = _mm512_loadu_si512(b0_ptr + k * ldb4 + col * 16);
+        vb1[col] = _mm512_loadu_si512(b1_ptr + k * ldb4 + col * 16);
+      }
+      vc0[i] = _mm512_dpbusd_epi32(vc0[i], va, vb0[col]);
+      vc1[i] = _mm512_dpbusd_epi32(vc1[i], va, vb1[col]);
+    };
+    for (int64_t k = 0; k < K4; ++k) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+
+    auto scalec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      // load a scale
+      if constexpr (col == 0) {
+        vas = _mm512_set1_ps(As[row]);
+      }
+      // load b scale and vcomp
+      if constexpr (row == 0) {
+        vbs0[col] = _mm512_loadu_ps(Bs0 + col * 16);
+        vbs1[col] = _mm512_loadu_ps(Bs1 + col * 16);
+        vcomp0[col] = _mm512_loadu_si512(Bcomp0 + col * 16);
+        vcomp1[col] = _mm512_loadu_si512(Bcomp1 + col * 16);
+      }
+      __m512 c0 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc0[i], vcomp0[col]));
+      __m512 c1 = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc1[i], vcomp1[col]));
+      vc0[i] = _mm512_castps_si512(_mm512_mul_ps(_mm512_mul_ps(c0, vas), vbs0[col]));
+      vc1[i] = _mm512_castps_si512(_mm512_mul_ps(_mm512_mul_ps(c1, vas), vbs1[col]));
+    };
+    Unroll<ROWS * COLS>{}(scalec);
+
+    using Vec = at::vec::Vectorized<float>;
+    const Vec one = Vec(1.f);
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+      // for COLS = 2, 4 use 512bit store
+      if constexpr (col % 2 == 0) {
+        Vec x0 = _mm512_castsi512_ps(vc0[row * COLS + col + 0]);
+        Vec x1 = _mm512_castsi512_ps(vc0[row * COLS + col + 1]);
+        Vec y0 = _mm512_castsi512_ps(vc1[row * COLS + col + 0]);
+        Vec y1 = _mm512_castsi512_ps(vc1[row * COLS + col + 1]);
+        // silu
+        x0 = x0 / (one + x0.neg().exp_u20());
+        x1 = x1 / (one + x1.neg().exp_u20());
+        // mul
+        x0 = x0 * y0;
+        x1 = x1 * y1;
+
+        _mm512_storeu_si512(
+            reinterpret_cast<__m512i*>((C + row * ldc + col * 16)),
+            (__m512i)(_mm512_cvtne2ps_pbh(__m512(x1), __m512(x0))));
+      }
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_VNNI(MB_SIZE, NB_SIZE)      \
+  tinygemm_kernel_vnni<scalar_t, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda,                                  \
+      B0 + nb_start * 4,                                   \
+      B1 + nb_start * 4,                                   \
+      C + mb_start * ldc + nb_start,                       \
+      As + mb_start,                                       \
+      Bs0 + nb_start,                                      \
+      Bs1 + nb_start,                                      \
+      Bcomp0 + nb_start,                                   \
+      Bcomp1 + nb_start,                                   \
+      K,                                                   \
+      lda,                                                 \
+      ldb,                                                 \
+      ldc);
+
+template <typename scalar_t>
+void tinygemm_kernel(
+    const uint8_t* __restrict__ A,
+    const int8_t* __restrict__ B0,
+    const int8_t* __restrict__ B1,
+    scalar_t* __restrict__ C,
+    const float* __restrict__ As,
+    const float* __restrict__ Bs0,
+    const float* __restrict__ Bs1,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc) {
+  const int32_t* Bcomp0 = reinterpret_cast<const int32_t*>(B0 + block_size_n() * K);
+  const int32_t* Bcomp1 = reinterpret_cast<const int32_t*>(B1 + block_size_n() * K);
+
+  // pattern: 1-(2+2)-(8+8)
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 32;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+  for (int mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_VNNI(1, 32);
+          break;
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_VNNI(2, 32);
+          break;
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_VNNI(3, 32);
+          break;
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_VNNI(4, 32);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+/// gemm for w2
+template <typename scalar_t, int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_vnni2 {
+  static inline void apply(
+      const uint8_t* __restrict__ A,
+      const int8_t* __restrict__ B,
+      float* __restrict__ C,
+      const float* __restrict__ As,
+      const float* __restrict__ Bs,
+      const int32_t* __restrict__ Bcomp,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    TORCH_CHECK(false, "tinygemm_kernel_nn: scalar path not implemented!");
+  }
+};
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <int BLOCK_M, int BLOCK_N>
+struct tinygemm_kernel_vnni2<at::BFloat16, BLOCK_M, BLOCK_N> {
+  static inline void apply(
+      const uint8_t* __restrict__ A,
+      const int8_t* __restrict__ B,
+      float* __restrict__ C,
+      const float* __restrict__ As,
+      const float* __restrict__ Bs,
+      const int32_t* __restrict__ Bcomp,
+      int64_t K,
+      int64_t lda,
+      int64_t ldb,
+      int64_t ldc) {
+    constexpr int ROWS = BLOCK_M;
+    constexpr int COLS = BLOCK_N / 16;
+    static_assert(COLS % 2 == 0);
+
+    __m512i va;
+    __m512i vb[COLS];
+    __m512i vc[ROWS * COLS];
+    __m512i vcomp[COLS];
+    __m512 vas;
+    __m512 vbs[COLS];
+
+    auto loadc = [&](auto i) { vc[i] = _mm512_set1_epi32(0); };
+    Unroll<ROWS * COLS>{}(loadc);
+
+    const int64_t K4 = K >> 2;
+    const int64_t lda4 = lda >> 2;
+    const int64_t ldb4 = ldb;  // ldb * 4 >> 2;
+    const int32_t* a_ptr = reinterpret_cast<const int32_t*>(A);
+    const int32_t* b_ptr = reinterpret_cast<const int32_t*>(B);
+
+    auto compute = [&](auto i, int64_t k) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      if constexpr (col == 0) {
+        va = _mm512_set1_epi32(a_ptr[row * lda4 + k]);
+      }
+      if constexpr (row == 0) {
+        vb[col] = _mm512_loadu_si512(b_ptr + k * ldb4 + col * 16);
+      }
+      vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]);
+    };
+    for (int64_t k = 0; k < K4; ++k) {
+      Unroll<ROWS * COLS>{}(compute, k);
+    }
+
+    auto storec = [&](auto i) {
+      constexpr int row = i / COLS;
+      constexpr int col = i % COLS;
+
+      // load a scale
+      if constexpr (col == 0) {
+        vas = _mm512_set1_ps(As[row]);
+      }
+      // load b scale and vcomp per 2 vectors
+      // also load bias if any
+      if constexpr (row == 0) {
+        if constexpr (col % 2 == 0) {
+          vbs[col + 0] = _mm512_loadu_ps(Bs + col * 16);
+          vbs[col + 1] = _mm512_loadu_ps(Bs + col * 16 + 16);
+          vcomp[col + 0] = _mm512_loadu_si512(Bcomp + col * 16);
+          vcomp[col + 1] = _mm512_loadu_si512(Bcomp + col * 16 + 16);
+        }
+      }
+      __m512 x = _mm512_cvtepi32_ps(_mm512_sub_epi32(vc[i], vcomp[col]));
+      x = _mm512_mul_ps(_mm512_mul_ps(x, vas), vbs[col]);
+      _mm512_storeu_ps(reinterpret_cast<__m512*>(C + row * ldc + col * 16), x);
+    };
+    Unroll<ROWS * COLS>{}(storec);
+  }
+};
+#endif
+
+#define LAUNCH_TINYGEMM_KERNEL_VNNI2(MB_SIZE, NB_SIZE)      \
+  tinygemm_kernel_vnni2<scalar_t, MB_SIZE, NB_SIZE>::apply( \
+      A + mb_start * lda,                                   \
+      B + nb_start * 4,                                     \
+      C + mb_start * ldc + nb_start,                        \
+      As + mb_start,                                        \
+      Bs + nb_start,                                        \
+      Bcomp + nb_start,                                     \
+      K,                                                    \
+      lda,                                                  \
+      ldb,                                                  \
+      ldc);
+
+template <typename scalar_t>
+void tinygemm_kernel(
+    const uint8_t* __restrict__ A,
+    const int8_t* __restrict__ B,
+    float* __restrict__ C,
+    const float* __restrict__ As,
+    const float* __restrict__ Bs,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t lda,
+    int64_t ldb,
+    int64_t ldc) {
+  // B compensation
+  const int32_t* Bcomp = reinterpret_cast<const int32_t*>(B + block_size_n() * K);
+
+  // pattern: 1-4-16
+  constexpr int64_t BLOCK_M = 4;
+  constexpr int64_t BLOCK_N = 64;
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+  for (int64_t mb = 0; mb < MB; ++mb) {
+    int64_t mb_start = mb * BLOCK_M;
+    int64_t mb_size = std::min(BLOCK_M, M - mb_start);
+    for (int64_t nb = 0; nb < NB; ++nb) {
+      int64_t nb_start = nb * BLOCK_N;
+      int64_t nb_size = std::min(BLOCK_N, N - nb_start);
+
+      switch (mb_size << 4 | nb_size >> 4) {
+        case 0x12:
+          LAUNCH_TINYGEMM_KERNEL_VNNI2(1, 32);
+          break;
+        case 0x22:
+          LAUNCH_TINYGEMM_KERNEL_VNNI2(2, 32);
+          break;
+        case 0x32:
+          LAUNCH_TINYGEMM_KERNEL_VNNI2(3, 32);
+          break;
+        case 0x42:
+          LAUNCH_TINYGEMM_KERNEL_VNNI2(4, 32);
+          break;
+        default:
+          TORCH_CHECK(false, "Unexpected block size, ", mb_size, "x", "nb_size");
+      }
+    }
+  }
+}
+
+}  // anonymous namespace
+
+template <typename scalar_t>
+void fused_experts_int8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic1,
+    scalar_t* __restrict__ ic2,
+    uint8_t* __restrict__ A_tmp,
+    float* __restrict__ C_tmp,
+    uint8_t* __restrict__ Aq_tmp,
+    float* __restrict__ As_tmp,
+    const scalar_t* __restrict__ input,
+    const int8_t* __restrict__ packed_w1,
+    const int8_t* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    const float* __restrict__ topk_weights,
+    const int32_t* __restrict__ sorted_ids,
+    const int32_t* __restrict__ expert_ids,
+    const int32_t* __restrict__ offsets,
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t E,
+    int64_t topk,
+    int64_t num_tokens_post_pad) {
+  // handle 2 tiles per block
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  // stage 0: quantize input to uint8, [M, K]
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      quantize_row_int8<scalar_t>(Aq_tmp + m * K, As_tmp[m], input + m * K, K);
+    }
+  });
+
+  // stage 1: intermediate_cache1 = silu(hidden_states @ w1)
+  const int64_t MB = div_up(num_tokens_post_pad, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  // strides for w1: [E, 2N, K]
+  TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N);
+
+  // K and N are packed for int8
+  const int64_t packed_K = get_row_size<int8_t>(K);
+  const int64_t packed_N = get_row_size<int8_t>(N);
+
+  const int64_t stride_e = 2 * N * packed_K;
+  const int64_t stride_n = packed_K;
+
+  int64_t avg_M = std::max(int64_t(1), M * topk / E);
+  const bool use_brgemm = can_use_brgemm<int8_t>(avg_M);
+
+  // here we only parallel on half of 2N to fuse silu_and_mul with gemm
+  parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    uint8_t* __restrict__ A = A_tmp + tid * BLOCK_M * K;
+    int32_t* __restrict__ C0 = reinterpret_cast<int32_t*>(C_tmp) + tid * 2 * BLOCK_M * BLOCK_N;
+    int32_t* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N;
+
+    alignas(64) float As[BLOCK_M];
+
+    loop_2d<int8_t>(mb0, mb1, nb0, nb1, BLOCK_N * K * 2, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      // nb_upper from top half and nb_lower from bottom half
+      int64_t nb_upper = nb, nb_lower = nb + NB;
+      int64_t n_size = std::min(N - nb * BLOCK_N, BLOCK_N);
+
+      // B shape [K, n_size] in vnni format
+      int32_t expert_id = expert_ids[mb];
+      const int8_t* __restrict__ B0 = packed_w1 + expert_id * stride_e + nb_upper * BLOCK_N * stride_n;
+      const int8_t* __restrict__ B1 = packed_w1 + expert_id * stride_e + nb_lower * BLOCK_N * stride_n;
+      const float* __restrict__ Bs0 = w1s + expert_id * 2 * N + nb_upper * BLOCK_N;
+      const float* __restrict__ Bs1 = w1s + expert_id * 2 * N + nb_lower * BLOCK_N;
+
+      // 1.a load A
+      const int32_t* A_ids = sorted_ids + mb * BLOCK_M;
+      int64_t m_size = offsets[mb + 1] - offsets[mb];
+
+      for (int64_t m = 0; m < m_size; ++m) {
+        int32_t index = A_ids[m] / topk;
+        copy_stub(A + m * K, Aq_tmp + index * K, K);
+        As[m] = As_tmp[index];
+      }
+
+      if (use_brgemm) {
+        // 1.b gemm: C0 = A @ B0
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B0,
+            /* C     */ C0);
+
+        // 1.c gemm: C1 = A @ B1
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B1,
+            /* C     */ C1);
+
+        const int32_t* Bcomp0 = reinterpret_cast<const int32_t*>(B0 + block_size_n() * K);
+        const int32_t* Bcomp1 = reinterpret_cast<const int32_t*>(B1 + block_size_n() * K);
+
+        // 1.d silu and mul
+        const int64_t offset = offsets[mb];
+        silu_and_mul<scalar_t, BLOCK_N>(
+            ic1 + offset * N + nb * BLOCK_N, C0, C1, As, Bs0, Bs1, Bcomp0, Bcomp1, m_size, N);
+      } else {
+        // fused 1.bcd: silu_and_mul(A @ B0, A @ B1)
+        const int64_t offset = offsets[mb];
+        tinygemm_kernel(
+            /* A     */ A,
+            /* B0    */ B0,
+            /* B1    */ B1,
+            /* C     */ ic1 + offset * N + nb * BLOCK_N,
+            /* As    */ As,
+            /* Bs0   */ Bs0,
+            /* Bs1   */ Bs1,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ N);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 1.5: quantize ic1 to uint8, [M * topk, N]
+  at::parallel_for(0, M * topk, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      quantize_row_int8<scalar_t>(Aq_tmp + m * N, As_tmp[m], ic1 + m * N, N);
+    }
+  });
+
+  // stage 2: intermediate_cache2 = intermediate_cache1 @ w2
+  //   w2 : [E, K, N] as [E, OC, IC]
+  const int64_t OC = K;  // rename K as OC
+  const int64_t IC = N;  // rename N as IC
+  const int64_t MB2 = MB;
+  const int64_t NB2 = div_up(OC, BLOCK_N);
+  const int64_t stride_e2 = OC * packed_N;
+  const int64_t stride_oc = packed_N;
+
+  // parallel on [MB2, NB2]
+  parallel_2d(MB2, NB2, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N;
+    int32_t* __restrict__ C32 = reinterpret_cast<int32_t*>(C + BLOCK_M * BLOCK_N);
+
+    loop_2d<int8_t>(mb0, mb1, nb0, nb1, BLOCK_N * IC, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t m_size = offsets[mb + 1] - offsets[mb];
+      int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N);
+
+      // A ptr from ic1 of [M * topk, N] in sorted order
+      // so as to avoid copy A to tmp buffer again
+      const uint8_t* __restrict__ A = Aq_tmp + offsets[mb] * N;
+      const float* __restrict__ As = As_tmp + offsets[mb];
+      const int32_t* A_ids = sorted_ids + mb * BLOCK_M;
+
+      // B shape [IC, n_size] in vnni format
+      int32_t expert_id = expert_ids[mb];
+      const int8_t* __restrict__ B = packed_w2 + expert_id * stride_e2 + nb * BLOCK_N * stride_oc;
+      const float* __restrict__ Bs = w2s + expert_id * K + nb * BLOCK_N;
+
+      // 2.a gemm: C = A @ B
+      if (use_brgemm) {
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C32);
+
+        // apply scales
+        const int32_t* Bcomp = reinterpret_cast<const int32_t*>(B + block_size_n() * IC);
+        scale_C<BLOCK_N>(C, C32, As, Bs, Bcomp, m_size);
+      } else {
+        tinygemm_kernel<scalar_t>(
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C,
+            /* As    */ As,
+            /* Bs    */ Bs,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N);
+      }
+
+      // 2.b copy from C to ic2 in original order
+      //   and also mul topk_weights in float32
+      for (int64_t m = 0; m < m_size; ++m) {
+        int32_t index = A_ids[m];
+        float weight = topk_weights[index];
+        copy_mul_stub(ic2 + index * K + nb * BLOCK_N, C + m * BLOCK_N, weight, n_size);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 3: out = intermediate_cache2.sum(dim=1)
+  //   from [M, topk, K] to [M, K]
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      sum_stub(output + m * K, ic2 + m * topk * K, topk, K);
+    }
+  });
+}
+
+#define INSTANTIATE_MOE_INT8_TEMPLATE(TYPE)           \
+  template void fused_experts_int8_kernel_impl<TYPE>( \
+      TYPE* __restrict__ output,                      \
+      TYPE* __restrict__ ic1,                         \
+      TYPE* __restrict__ ic2,                         \
+      uint8_t* __restrict__ A_tmp,                    \
+      float* __restrict__ C_tmp,                      \
+      uint8_t* __restrict__ Aq_tmp,                   \
+      float* __restrict__ As_tmp,                     \
+      const TYPE* __restrict__ input,                 \
+      const int8_t* __restrict__ packed_w1,           \
+      const int8_t* __restrict__ packed_w2,           \
+      const float* __restrict__ w1s,                  \
+      const float* __restrict__ w2s,                  \
+      const float* __restrict__ topk_weights,         \
+      const int32_t* __restrict__ sorted_ids,         \
+      const int32_t* __restrict__ expert_ids,         \
+      const int32_t* __restrict__ offsets,            \
+      int64_t M,                                      \
+      int64_t N,                                      \
+      int64_t K,                                      \
+      int64_t E,                                      \
+      int64_t topk,                                   \
+      int64_t num_tokens_post_pad)
+
+INSTANTIATE_MOE_INT8_TEMPLATE(at::BFloat16);
+INSTANTIATE_MOE_INT8_TEMPLATE(at::Half);
+
+template <typename scalar_t>
+void shared_expert_int8_kernel_impl(
+    scalar_t* __restrict__ output,
+    scalar_t* __restrict__ ic1,
+    float* __restrict__ C_tmp,
+    uint8_t* __restrict__ Aq_tmp,
+    float* __restrict__ As_tmp,
+    const scalar_t* __restrict__ input,
+    const int8_t* __restrict__ packed_w1,
+    const int8_t* __restrict__ packed_w2,
+    const float* __restrict__ w1s,
+    const float* __restrict__ w2s,
+    const scalar_t* __restrict__ fused_experts_out,
+    float routed_scaling_factor,
+    int64_t M,
+    int64_t N,
+    int64_t K) {
+  // handle 2 tiles per block
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+
+  // stage 0: quantize input to uint8, [M, K]
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      quantize_row_int8<scalar_t>(Aq_tmp + m * K, As_tmp[m], input + m * K, K);
+    }
+  });
+
+  // stage 1: intermediate_cache1 = silu(hidden_states @ w1)
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB = div_up(N, BLOCK_N);
+
+  TORCH_CHECK(N % BLOCK_N == 0, "Fixme when N is not multiples of ", BLOCK_N);
+
+  // K and N are packed for int8
+  const int64_t packed_K = get_row_size<int8_t>(K);
+  const int64_t packed_N = get_row_size<int8_t>(N);
+  const int64_t stride_n = packed_K;
+
+  const bool use_brgemm = can_use_brgemm<int8_t>(M);
+
+  // here we only parallel on half of 2N to fuse silu_and_mul with gemm
+  parallel_2d(MB, NB, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    int32_t* __restrict__ C0 = reinterpret_cast<int32_t*>(C_tmp) + tid * 2 * BLOCK_M * BLOCK_N;
+    int32_t* __restrict__ C1 = C0 + BLOCK_M * BLOCK_N;
+
+    loop_2d<int8_t>(mb0, mb1, nb0, nb1, BLOCK_N * K * 2, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      // nb_upper from top half and nb_lower from bottom half
+      int64_t nb_upper = nb, nb_lower = nb + NB;
+      int64_t n_size = std::min(N - nb * BLOCK_N, BLOCK_N);
+      int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M);
+
+      // A shape [m_size, K]
+      const uint8_t* A = Aq_tmp + mb * BLOCK_M * K;
+      const float* As = As_tmp + mb * BLOCK_M;
+
+      // B shape [K, n_size] in vnni format
+      const int8_t* __restrict__ B0 = packed_w1 + nb_upper * BLOCK_N * stride_n;
+      const int8_t* __restrict__ B1 = packed_w1 + nb_lower * BLOCK_N * stride_n;
+      const float* __restrict__ Bs0 = w1s + nb_upper * BLOCK_N;
+      const float* __restrict__ Bs1 = w1s + nb_lower * BLOCK_N;
+
+      if (use_brgemm) {
+        // 1.b gemm: C0 = A @ B0
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B0,
+            /* C     */ C0);
+
+        // 1.c gemm: C1 = A @ B1
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B1,
+            /* C     */ C1);
+
+        const int32_t* Bcomp0 = reinterpret_cast<const int32_t*>(B0 + block_size_n() * K);
+        const int32_t* Bcomp1 = reinterpret_cast<const int32_t*>(B1 + block_size_n() * K);
+
+        // 1.d silu and mul
+        silu_and_mul<scalar_t, BLOCK_N>(
+            ic1 + mb * BLOCK_M * N + nb * BLOCK_N, C0, C1, As, Bs0, Bs1, Bcomp0, Bcomp1, m_size, N);
+      } else {
+        // fused 1.bcd: silu_and_mul(A @ B0, A @ B1)
+        tinygemm_kernel(
+            /* A     */ A,
+            /* B0    */ B0,
+            /* B1    */ B1,
+            /* C     */ ic1 + mb * BLOCK_M * N + nb * BLOCK_N,
+            /* As    */ As,
+            /* Bs0   */ Bs0,
+            /* Bs1   */ Bs1,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ K,
+            /* lda   */ K,
+            /* ldb   */ n_size,
+            /* ldc   */ N);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+
+  // stage 1.5: quantize ic1 to uint8, [M * topk, N]
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      quantize_row_int8<scalar_t>(Aq_tmp + m * N, As_tmp[m], ic1 + m * N, N);
+    }
+  });
+
+  // stage 2: intermediate_cache2 = intermediate_cache1 @ w2
+  //   w2 : [K, N] as [OC, IC]
+  const int64_t OC = K;  // rename K as OC
+  const int64_t IC = N;  // rename N as IC
+  const int64_t MB2 = MB;
+  const int64_t NB2 = div_up(OC, BLOCK_N);
+  const int64_t stride_oc = packed_N;
+
+  // parallel on [MB2, NB2]
+  parallel_2d(MB2, NB2, [&](int64_t mb0, int64_t mb1, int64_t nb0, int64_t nb1) {
+    // get local pointers
+    int tid = get_thread_num();
+    float* __restrict__ C = C_tmp + tid * 2 * BLOCK_M * BLOCK_N;
+    int32_t* __restrict__ C32 = reinterpret_cast<int32_t*>(C + BLOCK_M * BLOCK_N);
+
+    loop_2d<int8_t>(mb0, mb1, nb0, nb1, BLOCK_N * IC, [&](int64_t mb, int64_t nb, int64_t nb_offset) {
+      int64_t m_size = std::min(M - mb * BLOCK_M, BLOCK_M);
+      int64_t n_size = std::min(OC - nb * BLOCK_N, BLOCK_N);
+
+      // A shape [m_size, IC]
+      const uint8_t* __restrict__ A = Aq_tmp + mb * BLOCK_M * N;
+      const float* __restrict__ As = As_tmp + mb * BLOCK_M;
+
+      // B shape [IC, n_size] in vnni format
+      const int8_t* __restrict__ B = packed_w2 + nb * BLOCK_N * stride_oc;
+      const float* __restrict__ Bs = w2s + nb * BLOCK_N;
+
+      if (use_brgemm) {
+        at::native::cpublas::brgemm(
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N,
+            /* add_C */ false,
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C32);
+
+        // apply scales
+        const int32_t* Bcomp = reinterpret_cast<const int32_t*>(B + block_size_n() * IC);
+        scale_C<BLOCK_N>(C, C32, As, Bs, Bcomp, m_size);
+      } else {
+        // 2.a gemm: C = A @ B
+        tinygemm_kernel<scalar_t>(
+            /* A     */ A,
+            /* B     */ B,
+            /* C     */ C,
+            /* As    */ As,
+            /* Bs    */ Bs,
+            /* M     */ m_size,
+            /* N     */ n_size,
+            /* K     */ IC,
+            /* lda   */ IC,
+            /* ldb   */ n_size,
+            /* ldc   */ BLOCK_N);
+      }
+
+      // 2.b copy from C to output and add fused_experts_out
+      scalar_t* __restrict__ out = output + mb * BLOCK_M * K + nb * BLOCK_N;
+      const scalar_t* __restrict__ fused_out = fused_experts_out + mb * BLOCK_M * K + nb * BLOCK_N;
+      for (int64_t m = 0; m < m_size; ++m) {
+        add_mul_stub(out + m * K, C + m * BLOCK_N, fused_out + m * K, routed_scaling_factor, n_size);
+      }
+    });
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+}
+
+#define INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(TYPE) \
+  template void shared_expert_int8_kernel_impl<TYPE>( \
+      TYPE* __restrict__ output,                      \
+      TYPE* __restrict__ ic1,                         \
+      float* __restrict__ C_tmp,                      \
+      uint8_t* __restrict__ Aq_tmp,                   \
+      float* __restrict__ As_tmp,                     \
+      const TYPE* __restrict__ input,                 \
+      const int8_t* __restrict__ packed_w1,           \
+      const int8_t* __restrict__ packed_w2,           \
+      const float* __restrict__ w1s,                  \
+      const float* __restrict__ w2s,                  \
+      const TYPE* __restrict__ fused_experts_out,     \
+      float routed_scaling_factor,                    \
+      int64_t M,                                      \
+      int64_t N,                                      \
+      int64_t K)
+
+INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(at::BFloat16);
+INSTANTIATE_SHARED_EXPERT_INT8_TEMPLATE(at::Half);
diff --git a/sgl-kernel/csrc/cpu/norm.cpp b/sgl-kernel/csrc/cpu/norm.cpp
new file mode 100644
index 000000000..2c4e1f38d
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/norm.cpp
@@ -0,0 +1,304 @@
+#include "common.h"
+#include "vec.h"
+
+namespace {
+
+// NB: avoid using `at::vec::map<>` on bfloat16 or half
+// Llama4TextL2Norm
+template <typename scalar_t>
+void l2norm_kernel_impl(
+    scalar_t* __restrict__ output,
+    const scalar_t* __restrict__ input,
+    int64_t batch_size,
+    int64_t hidden_size,
+    float eps = 1e-5) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  constexpr int kVecSize = bVec::size();
+  at::parallel_for(0, batch_size, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t i = begin; i < end; ++i) {
+      // local ptrs
+      scalar_t* __restrict__ out_ptr = output + i * hidden_size;
+      const scalar_t* __restrict__ input_ptr = input + i * hidden_size;
+
+      fVec sum_fvec = fVec(float(0));
+      float sum_val = float(0);
+
+      int64_t d;
+#pragma GCC unroll 4
+      for (d = 0; d <= hidden_size - kVecSize; d += kVecSize) {
+        bVec x_bvec = bVec::loadu(input_ptr + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        sum_fvec += x_fvec0 * x_fvec0;
+        sum_fvec += x_fvec1 * x_fvec1;
+      }
+#pragma GCC unroll 4
+      for (; d < hidden_size; ++d) {
+        float x_val = static_cast<float>(input_ptr[d]);
+        sum_val += x_val * x_val;
+      }
+
+      sum_val += vec_reduce_sum(sum_fvec);
+      float rsqrt_var = float(1) / std::sqrt(sum_val / hidden_size + eps);
+      const fVec scale_fvec = fVec(rsqrt_var);
+
+#pragma GCC unroll 4
+      for (d = 0; d <= hidden_size - kVecSize; d += kVecSize) {
+        bVec x_bvec = bVec::loadu(input_ptr + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        x_fvec0 = x_fvec0 * scale_fvec;
+        x_fvec1 = x_fvec1 * scale_fvec;
+
+        bVec out_bvec = convert_from_float_ext<scalar_t>(x_fvec0, x_fvec1);
+        out_bvec.store(out_ptr + d);
+      }
+#pragma GCC unroll 4
+      for (; d < hidden_size; ++d) {
+        float x_val = static_cast<float>(input_ptr[d]);
+        out_ptr[d] = static_cast<scalar_t>(x_val * rsqrt_var);
+      }
+    }
+  });
+}
+template <typename scalar_t>
+void rmsnorm_kernel_impl(
+    scalar_t* __restrict__ output,
+    const scalar_t* __restrict__ input,
+    const scalar_t* __restrict__ weight,
+    int64_t batch_size,
+    int64_t hidden_size,
+    int64_t input_strideN,
+    float eps = 1e-5) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  constexpr int kVecSize = bVec::size();
+  at::parallel_for(0, batch_size, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t i = begin; i < end; ++i) {
+      // local ptrs
+      scalar_t* __restrict__ out_ptr = output + i * hidden_size;
+      const scalar_t* __restrict__ input_ptr = input + i * input_strideN;
+
+      fVec sum_fvec = fVec(float(0));
+      float sum_val = float(0);
+
+      int64_t d;
+#pragma GCC unroll 4
+      for (d = 0; d <= hidden_size - kVecSize; d += kVecSize) {
+        bVec x_bvec = bVec::loadu(input_ptr + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        sum_fvec += x_fvec0 * x_fvec0;
+        sum_fvec += x_fvec1 * x_fvec1;
+      }
+#pragma GCC unroll 4
+      for (; d < hidden_size; ++d) {
+        float x_val = static_cast<float>(input_ptr[d]);
+        sum_val += x_val * x_val;
+      }
+
+      sum_val += vec_reduce_sum(sum_fvec);
+      float rsqrt_var = float(1) / std::sqrt(sum_val / hidden_size + eps);
+      const fVec scale_fvec = fVec(rsqrt_var);
+
+#pragma GCC unroll 4
+      for (d = 0; d <= hidden_size - kVecSize; d += kVecSize) {
+        bVec x_bvec = bVec::loadu(input_ptr + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        bVec w_bvec = bVec::loadu(weight + d);
+        fVec w_fvec0, w_fvec1;
+        std::tie(w_fvec0, w_fvec1) = at::vec::convert_to_float(w_bvec);
+
+        x_fvec0 = x_fvec0 * scale_fvec * w_fvec0;
+        x_fvec1 = x_fvec1 * scale_fvec * w_fvec1;
+
+        bVec out_bvec = convert_from_float_ext<scalar_t>(x_fvec0, x_fvec1);
+        out_bvec.store(out_ptr + d);
+      }
+#pragma GCC unroll 4
+      for (; d < hidden_size; ++d) {
+        float x_val = static_cast<float>(input_ptr[d]);
+        float w_val = static_cast<float>(weight[d]);
+        out_ptr[d] = static_cast<scalar_t>(x_val * rsqrt_var * w_val);
+      }
+    }
+  });
+}
+
+template <typename scalar_t>
+void fused_add_rmsnorm_kernel_impl(
+    scalar_t* __restrict__ input,
+    scalar_t* __restrict__ residual,
+    const scalar_t* __restrict__ weight,
+    float* __restrict__ buffer,
+    int64_t batch_size,
+    int64_t hidden_size,
+    int64_t input_strideN,
+    float eps = 1e-5) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  constexpr int kVecSize = bVec::size();
+  at::parallel_for(0, batch_size, 0, [&](int64_t begin, int64_t end) {
+    int tid = at::get_thread_num();
+    float* __restrict__ buffer_ptr = buffer + tid * hidden_size;
+
+    for (int64_t i = begin; i < end; ++i) {
+      // local ptrs
+      scalar_t* __restrict__ input_ptr = input + i * input_strideN;
+      scalar_t* __restrict__ residual_ptr = residual + i * hidden_size;
+
+      fVec sum_fvec = fVec(float(0));
+      float sum_val = float(0);
+
+      int64_t d;
+#pragma GCC unroll 4
+      for (d = 0; d <= hidden_size - kVecSize; d += kVecSize) {
+        bVec x_bvec = bVec::loadu(input_ptr + d);
+        fVec x_fvec0, x_fvec1;
+        std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+        bVec r_bvec = bVec::loadu(residual_ptr + d);
+        fVec r_fvec0, r_fvec1;
+        std::tie(r_fvec0, r_fvec1) = at::vec::convert_to_float(r_bvec);
+
+        x_fvec0 += r_fvec0;
+        x_fvec1 += r_fvec1;
+
+        bVec out_bvec = convert_from_float_ext<scalar_t>(x_fvec0, x_fvec1);
+        out_bvec.store(residual_ptr + d);
+
+        sum_fvec += x_fvec0 * x_fvec0;
+        sum_fvec += x_fvec1 * x_fvec1;
+
+        x_fvec0.store(buffer_ptr + d);
+        x_fvec1.store(buffer_ptr + d + fVec::size());
+      }
+#pragma GCC unroll 4
+      for (; d < hidden_size; ++d) {
+        float x_val = static_cast<float>(input_ptr[d]);
+        float r_val = static_cast<float>(residual_ptr[d]);
+
+        x_val += r_val;
+        residual_ptr[d] = static_cast<scalar_t>(x_val);
+
+        sum_val += x_val * x_val;
+        buffer_ptr[d] = x_val;
+      }
+
+      sum_val += vec_reduce_sum(sum_fvec);
+      float rsqrt_var = float(1) / std::sqrt(sum_val / hidden_size + eps);
+      const fVec scale_fvec = fVec(rsqrt_var);
+
+#pragma GCC unroll 4
+      for (d = 0; d <= hidden_size - kVecSize; d += kVecSize) {
+        fVec x_fvec0 = fVec::loadu(buffer_ptr + d);
+        fVec x_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size());
+
+        bVec w_bvec = bVec::loadu(weight + d);
+        fVec w_fvec0, w_fvec1;
+        std::tie(w_fvec0, w_fvec1) = at::vec::convert_to_float(w_bvec);
+
+        x_fvec0 = x_fvec0 * scale_fvec * w_fvec0;
+        x_fvec1 = x_fvec1 * scale_fvec * w_fvec1;
+        bVec x_bvec = convert_from_float_ext<scalar_t>(x_fvec0, x_fvec1);
+        x_bvec.store(input_ptr + d);
+      }
+#pragma GCC unroll 4
+      for (; d < hidden_size; ++d) {
+        float x_val = buffer_ptr[d] * rsqrt_var * static_cast<float>(weight[d]);
+        input_ptr[d] = x_val;
+      }
+    }
+  });
+}
+
+}  // anonymous namespace
+
+// input : {batch_size, hidden_size}
+at::Tensor l2norm_cpu(at::Tensor& input, double eps) {
+  RECORD_FUNCTION("sgl-kernel::l2norm_cpu", std::vector<c10::IValue>({input}));
+
+  CHECK_INPUT(input);
+  CHECK_DIM(2, input);
+  int64_t batch_size = input.size(0);
+  int64_t hidden_size = input.size(1);
+  at::Tensor output = at::empty_like(input);
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "l2norm_kernel", [&] {
+    l2norm_kernel_impl<scalar_t>(output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), batch_size, hidden_size, eps);
+  });
+  return output;
+}
+
+// input : {batch_size, hidden_size}
+// weight: {hidden_size}
+at::Tensor rmsnorm_cpu(at::Tensor& input, at::Tensor& weight, double eps) {
+  RECORD_FUNCTION("sgl-kernel::rmsnorm_cpu", std::vector<c10::IValue>({input, weight}));
+
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(input);
+  CHECK_INPUT(weight);
+  CHECK_DIM(2, input);
+  CHECK_DIM(1, weight);
+  CHECK_EQ(input.size(1), weight.size(0));
+  int64_t batch_size = input.size(0);
+  int64_t hidden_size = input.size(1);
+  at::Tensor output = at::empty_like(input);
+  int64_t input_strideN = input.stride(0);
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "rmsnorm_kernel", [&] {
+    rmsnorm_kernel_impl<scalar_t>(
+        output.data_ptr<scalar_t>(),
+        input.data_ptr<scalar_t>(),
+        weight.data_ptr<scalar_t>(),
+        batch_size,
+        hidden_size,
+        input_strideN,
+        eps);
+  });
+  return output;
+}
+
+// input   : {batch_size, hidden_size}
+// residual: {batch_size, hidden_size}
+// weight  : {hidden_size}
+void fused_add_rmsnorm_cpu(at::Tensor& input, at::Tensor& residual, at::Tensor& weight, double eps) {
+  RECORD_FUNCTION("sgl-kernel::fused_add_rmsnorm_cpu", std::vector<c10::IValue>({input, residual, weight}));
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(input);
+  CHECK_INPUT(residual);
+  CHECK_INPUT(weight);
+  CHECK_DIM(2, input);
+  CHECK_DIM(2, residual);
+  CHECK_DIM(1, weight);
+  CHECK_EQ(input.size(0), residual.size(0));
+  CHECK_EQ(input.size(1), residual.size(1));
+  CHECK_EQ(input.size(1), weight.size(0));
+  int64_t batch_size = input.size(0);
+  int64_t hidden_size = input.size(1);
+  int64_t input_strideN = input.stride(0);
+
+  // allocate temp buffer to store x in float32 per thread
+  // TODO: implement a singleton for context
+  int64_t num_threads = at::get_num_threads();
+  at::Tensor buffer = at::empty({num_threads, hidden_size}, input.options().dtype(at::kFloat));
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "fused_add_rmsnorm_kernel", [&] {
+    fused_add_rmsnorm_kernel_impl<scalar_t>(
+        input.data_ptr<scalar_t>(),
+        residual.data_ptr<scalar_t>(),
+        weight.data_ptr<scalar_t>(),
+        buffer.data_ptr<float>(),
+        batch_size,
+        hidden_size,
+        input_strideN,
+        eps);
+  });
+}
diff --git a/sgl-kernel/csrc/cpu/numa_utils.cpp b/sgl-kernel/csrc/cpu/numa_utils.cpp
new file mode 100644
index 000000000..2699d0e23
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/numa_utils.cpp
@@ -0,0 +1,91 @@
+#include <numa.h>
+#include <sched.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "common.h"
+
+std::string init_cpu_threads_env(const std::string& cpu_ids) {
+  bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
+  TORCH_CHECK(omp_cpu_mask->size > 0);
+  std::vector<int> omp_cpu_ids;
+  omp_cpu_ids.reserve(omp_cpu_mask->size);
+
+  constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp);
+
+  for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) {
+    unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size];
+    int i = 0;
+    while (group_mask) {
+      if (group_mask & 1) {
+        omp_cpu_ids.emplace_back(offset + i);
+      }
+      ++i;
+      group_mask >>= 1;
+    }
+  }
+
+  // Memory node binding
+  if (numa_available() != -1) {
+    int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
+    bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str());
+    bitmask* src_mask = numa_get_membind();
+
+    int pid = getpid();
+
+    // move all existing pages to the specified numa node.
+    *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
+    int page_num = numa_migrate_pages(pid, src_mask, mask);
+    if (page_num == -1) {
+      TORCH_WARN(false, "numa_migrate_pages failed. errno: " + std::to_string(errno));
+    }
+
+    // restrict memory allocation node.
+    numa_set_membind(mask);
+    numa_set_strict(1);
+  }
+
+  // OMP threads binding
+  omp_set_num_threads((int)omp_cpu_ids.size());
+  at::set_num_threads((int)omp_cpu_ids.size());
+  TORCH_CHECK_EQ(omp_cpu_ids.size(), at::get_num_threads());
+  TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
+
+  std::vector<std::pair<int, int>> thread_core_mapping;
+  thread_core_mapping.reserve(omp_cpu_ids.size());
+  omp_lock_t writelock;
+  omp_init_lock(&writelock);
+
+#pragma omp parallel for schedule(static, 1)
+  for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(omp_cpu_ids[i], &mask);
+    int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
+    if (ret == -1) {
+      TORCH_CHECK(false, "sched_setaffinity failed. errno: " + std::to_string(errno));
+    }
+
+    omp_set_lock(&writelock);
+    thread_core_mapping.emplace_back(syscall(SYS_gettid), omp_cpu_ids[i]);
+    omp_unset_lock(&writelock);
+  }
+
+  omp_destroy_lock(&writelock);
+
+  numa_free_nodemask(omp_cpu_mask);
+
+  std::stringstream ss;
+  ss << "OMP threads binding of Process " << getpid() << ":\n";
+  std::sort(
+      thread_core_mapping.begin(), thread_core_mapping.end(), [](auto&& a, auto&& b) { return a.second < b.second; });
+  for (auto&& item : thread_core_mapping) {
+    ss << "\t"
+       << "OMP tid: " << item.first << ", core " << item.second << "\n";
+  }
+
+  return ss.str();
+}
diff --git a/sgl-kernel/csrc/cpu/qkv_proj.cpp b/sgl-kernel/csrc/cpu/qkv_proj.cpp
new file mode 100644
index 000000000..b3e2072e8
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/qkv_proj.cpp
@@ -0,0 +1,701 @@
+#include "common.h"
+#include "gemm.h"
+#include "vec.h"
+
+namespace {
+
+// [NOTE]: Fused kernel for QKV projection with weight absorption and RoPE
+//
+//   1. `q_a_proj` and `kv_a_proj_with_mqa` fused into one gemm,
+//      otherwise we need to split IC for the 2nd gemm.
+//   2. `q_a_layernorm` and `kv_a_layernorm` fused into one parallel loop.
+//   3. k_input and v_input share the same storage, the torch API did
+//      this in `set_kv_buffer`. No additional memory movement.
+//
+
+// [C0, C1] = A @ [B0, B1]
+template <typename scalar_t>
+void segment_gemm_kernel_impl(
+    scalar_t* __restrict__ C0,
+    scalar_t* __restrict__ C1,
+    const scalar_t* __restrict__ A,
+    const scalar_t* __restrict__ B0,
+    const scalar_t* __restrict__ B1,
+    int64_t M,
+    int64_t N0,
+    int64_t N1,
+    int64_t K) {
+  // convert_weight_packed make sure N0 and N1 are 32x
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB0 = div_up(N0, BLOCK_N);
+  const int64_t NB1 = div_up(N1, BLOCK_N);
+  const int64_t NB = NB0 + NB1;
+
+  const bool use_brgemm = can_use_brgemm<scalar_t>(M);
+
+  // parallel on [MB, NB0 + NB1]
+  at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
+    int64_t mb{0}, nb{0};
+    data_index_init(begin, mb, MB, nb, NB);
+
+    // for brgemm, use float32 for accumulate
+    alignas(64) float Ctmp[BLOCK_M * BLOCK_N];
+
+    for (int64_t i = begin; i < end; ++i) {
+      UNUSED(i);
+      int mb_start = mb * BLOCK_M;
+      int mb_size = std::min(M - mb_start, BLOCK_M);
+      int nb_start = nb * BLOCK_N;
+      int nb_size = BLOCK_N;
+
+      const scalar_t* __restrict__ B = nb < NB0 ? B0 : B1;
+      scalar_t* __restrict__ C = nb < NB0 ? C0 : C1;
+      int64_t ldc = nb < NB0 ? N0 : N1;
+      int64_t local_nb_start = nb < NB0 ? nb_start : nb_start - N0;
+
+      tinygemm_kernel<scalar_t>(
+          /*   A */ A + mb_start * K,
+          /*   B */ B + local_nb_start * K /* nb * BLOCK_N * K */,
+          /*   C */ C + mb_start * ldc + local_nb_start,
+          /* Ctmp*/ Ctmp,
+          /*   M */ mb_size,
+          /*   N */ nb_size,
+          /*   K */ K,
+          /* lda */ K,
+          /* ldb */ nb_size,
+          /* ldc */ ldc,
+          /* brg */ use_brgemm);
+
+      // move to the next index
+      data_index_step(mb, MB, nb, NB);
+    }
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+}
+
+// [C0, C1] = A @ [B0, B1]
+template <typename scalar_t>
+void segment_gemm_kernel_impl(
+    scalar_t* __restrict__ C0,
+    scalar_t* __restrict__ C1,
+    const uint8_t* __restrict__ A,
+    const int8_t* __restrict__ B0,
+    const int8_t* __restrict__ B1,
+    const float* __restrict__ As,
+    const float* __restrict__ Bs0,
+    const float* __restrict__ Bs1,
+    int64_t M,
+    int64_t N0,
+    int64_t N1,
+    int64_t K) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB0 = div_up(N0, BLOCK_N);
+  const int64_t NB1 = div_up(N1, BLOCK_N);
+  const int64_t NB = NB0 + NB1;
+
+  const bool use_brgemm = can_use_brgemm<int8_t>(M);
+
+  // K + 4 after compensation
+  const int64_t packed_row_size = get_row_size<int8_t>(K);
+
+  // parallel on [MB, NB0 + NB1]
+  at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
+    int64_t mb{0}, nb{0};
+    data_index_init(begin, mb, MB, nb, NB);
+
+    // for brgemm, use float32 for accumulate
+    alignas(64) int32_t Ctmp[BLOCK_M * BLOCK_N];
+
+    for (int64_t i = begin; i < end; ++i) {
+      UNUSED(i);
+      int mb_start = mb * BLOCK_M;
+      int mb_size = std::min(M - mb_start, BLOCK_M);
+      int nb_start = nb * BLOCK_N;
+      int nb_size = BLOCK_N;
+
+      const int8_t* __restrict__ B = nb < NB0 ? B0 : B1;
+      const float* __restrict__ Bs = nb < NB0 ? Bs0 : Bs1;
+      scalar_t* __restrict__ C = nb < NB0 ? C0 : C1;
+      int64_t ldc = nb < NB0 ? N0 : N1;
+      int64_t local_nb_start = nb < NB0 ? nb_start : nb_start - N0;
+
+      tinygemm_kernel<scalar_t>(
+          /*   A */ A + mb_start * K,
+          /*   B */ B + local_nb_start * packed_row_size /* nb * BLOCK_N * (K + 4) */,
+          /*   C */ C + mb_start * ldc + local_nb_start,
+          /* Ctmp*/ Ctmp,
+          /*  As */ As + mb_start,
+          /*  Bs */ Bs + local_nb_start,
+          /*   M */ mb_size,
+          /*   N */ nb_size,
+          /*   K */ K,
+          /* lda */ K,
+          /* ldb */ nb_size,
+          /* ldc */ ldc,
+          /* brg */ use_brgemm);
+
+      // move to the next index
+      data_index_step(mb, MB, nb, NB);
+    }
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+}
+
+// [C0, C1] = A @ [B0, B1]
+template <typename scalar_t>
+void segment_gemm_kernel_impl(
+    scalar_t* __restrict__ C0,
+    scalar_t* __restrict__ C1,
+    const scalar_t* __restrict__ A,
+    const at::Float8_e4m3fn* __restrict__ B0,
+    const at::Float8_e4m3fn* __restrict__ B1,
+    const float* __restrict__ Bs0,
+    const float* __restrict__ Bs1,
+    scalar_t* __restrict__ Btmp,
+    int64_t M,
+    int64_t N0,
+    int64_t N1,
+    int64_t K,
+    int64_t block_size_N,
+    int64_t block_size_K) {
+  constexpr int64_t BLOCK_M = block_size_m();
+  constexpr int64_t BLOCK_N = block_size_n();
+  const int64_t MB = div_up(M, BLOCK_M);
+  const int64_t NB0 = div_up(N0, BLOCK_N);
+  const int64_t NB1 = div_up(N1, BLOCK_N);
+  const int64_t NB = NB0 + NB1;
+
+  const int64_t scale_size_K = div_up(K, block_size_K);
+  const int64_t blocks_n_per_group = block_size_N / BLOCK_N;
+
+  const bool use_brgemm = can_use_brgemm<at::Float8_e4m3fn>(M);
+
+  // parallel on [MB, NB0 + NB1]
+  at::parallel_for(0, MB * NB, 0, [&](int64_t begin, int64_t end) {
+    int64_t mb{0}, nb{0};
+    data_index_init(begin, mb, MB, nb, NB);
+
+    int tid = at::get_thread_num();
+    // for brgemm, use float32 for accumulate
+    alignas(64) float Ctmp[BLOCK_M * BLOCK_N];
+
+    for (int64_t i = begin; i < end; ++i) {
+      UNUSED(i);
+
+      int mb_start = mb * BLOCK_M;
+      int mb_size = std::min(M - mb_start, BLOCK_M);
+      int nb_start = nb * BLOCK_N;
+      int nb_size = BLOCK_N;
+
+      const at::Float8_e4m3fn* __restrict__ B = nb < NB0 ? B0 : B1;
+      const float* __restrict__ Bs = nb < NB0 ? Bs0 : Bs1;
+      scalar_t* __restrict__ C = nb < NB0 ? C0 : C1;
+      int64_t ldc = nb < NB0 ? N0 : N1;
+      int64_t local_nb_start = nb < NB0 ? nb_start : nb_start - N0;
+      int64_t new_nb = nb < NB0 ? nb : nb - NB0;
+
+      tinygemm_kernel<scalar_t>(
+          /*   A */ A + mb_start * K,
+          /*   B */ B + local_nb_start * K /* nb * BLOCK_N * K */,
+          /*   C */ C + mb_start * ldc + local_nb_start,
+          /* Btmp*/ Btmp + tid * BLOCK_N * K,
+          /* Ctmp*/ Ctmp,
+          /*  Bs */ Bs + (new_nb / blocks_n_per_group) * scale_size_K,
+          /*   M */ mb_size,
+          /*   N */ nb_size,
+          /*   K */ K,
+          /* lda */ K,
+          /* ldb */ nb_size,
+          /* ldc */ ldc,
+          /* brg */ use_brgemm,
+          /* block_size_K */ block_size_K);
+
+      // move to the next index
+      data_index_step(mb, MB, nb, NB);
+    }
+
+    if (use_brgemm) {
+      at::native::cpublas::brgemm_release();
+    }
+  });
+}
+
+template <typename scalar_t>
+inline float reduce(const scalar_t* __restrict__ x, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  fVec sum_fvec = fVec(float(0));
+
+// no remainder
+#pragma GCC unroll 4
+  for (int64_t d = 0; d < size; d += bVec::size()) {
+    bVec x_bvec = bVec::loadu(x + d);
+    fVec x_fvec0, x_fvec1;
+    std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+    sum_fvec += x_fvec0 * x_fvec0;
+    sum_fvec += x_fvec1 * x_fvec1;
+  }
+  return vec_reduce_sum(sum_fvec);
+}
+
+// map2 from aten functional doesn't have fast bf16->fp32 conversion
+template <typename scalar_t>
+inline void map2(scalar_t* y, const scalar_t* x, const scalar_t* __restrict__ w, float scale, int64_t size) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  fVec scale_fvec = fVec(scale);
+
+// no remainder
+#pragma GCC unroll 4
+  for (int64_t d = 0; d < size; d += bVec::size()) {
+    bVec x_bvec = bVec::loadu(x + d);
+    fVec x_fvec0, x_fvec1;
+    std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+    bVec w_bvec = bVec::loadu(w + d);
+    fVec w_fvec0, w_fvec1;
+    std::tie(w_fvec0, w_fvec1) = at::vec::convert_to_float(w_bvec);
+    x_fvec0 = x_fvec0 * scale_fvec * w_fvec0;
+    x_fvec1 = x_fvec1 * scale_fvec * w_fvec1;
+    bVec out_bvec = convert_from_float_ext<scalar_t>(x_fvec0, x_fvec1);
+    out_bvec.store(y + d);
+  }
+}
+
+template <typename scalar_t>
+void rms_norm_kernel_impl(
+    scalar_t* __restrict__ input0,
+    scalar_t* __restrict__ input1,
+    const scalar_t* __restrict__ weight0,
+    const scalar_t* __restrict__ weight1,
+    int64_t M,
+    int64_t N0,
+    int64_t N1,
+    int64_t stride1,
+    float eps = 1e-5) {
+  at::parallel_for(0, M, 0, [&](int64_t begin, int64_t end) {
+    for (int64_t m = begin; m < end; ++m) {
+      scalar_t* x0 = input0 + m * N0;
+      scalar_t* x1 = input1 + m * stride1;
+      float scale0 = reduce(x0, N0);
+      float scale1 = reduce(x1, N1);
+      scale0 = float(1) / std::sqrt(scale0 / N0 + eps);
+      scale1 = float(1) / std::sqrt(scale1 / N1 + eps);
+      map2(x0, x0, weight0, scale0, N0);
+      map2(x1, x1, weight1, scale1, N1);
+    }
+  });
+}
+
+template <typename scalar_t>
+inline void rotary(const scalar_t* input, scalar_t* out, const scalar_t* cos, const scalar_t* sin, int64_t size) {
+  TORCH_CHECK(false, "rotary scalar path not implemented.");
+}
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <>
+inline void rotary<at::BFloat16>(
+    const at::BFloat16* input, at::BFloat16* out, const at::BFloat16* cos, const at::BFloat16* sin, int64_t size) {
+  // permute indices
+  const __m512i idx1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+  const __m512i idx2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+  const __m512i idy1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
+  const __m512i idy2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
+
+// rotary dim is 64, just 2 iters
+#pragma GCC unroll 2
+  for (int64_t d = 0; d < size; d += 32) {
+    int64_t d2 = d >> 1;
+    // load coefs
+    __m512 vcos = CVT_BF16_TO_FP32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(cos + d2)));
+    __m512 vsin = CVT_BF16_TO_FP32(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(sin + d2)));
+    // load input
+    __m512i a16 = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(input + d));
+    __m512 a = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(a16, 0));
+    __m512 b = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(a16, 1));
+    // from [16, 2] to [2, 16]
+    __m512 in1 = _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b);
+    __m512 in2 = _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b);
+    // out1 = in1 * cos - in2 * sin;
+    // out2 = in2 * cos + in1 * sin
+    __m512 out1 = _mm512_sub_ps(_mm512_mul_ps(in1, vcos), _mm512_mul_ps(in2, vsin));
+    __m512 out2 = _mm512_add_ps(_mm512_mul_ps(in2, vcos), _mm512_mul_ps(in1, vsin));
+    // from [2, 16] to [16, 2]
+    a = _mm512_mask_permutex2var_ps(out1, 0xffff, idy1, out2);
+    b = _mm512_mask_permutex2var_ps(out1, 0xffff, idy2, out2);
+
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>((out + d)), (__m512i)(_mm512_cvtne2ps_pbh(b, a)));
+  }
+}
+#endif
+
+template <typename scalar_t>
+void rotary_emb_kernel_impl(
+    scalar_t* q_pe_out,
+    scalar_t* k_pe_out,
+    const scalar_t* q_pe,
+    const scalar_t* k_pe,
+    const int64_t* pos,
+    const scalar_t* cos_sin,
+    int64_t num_seqs,
+    int64_t num_heads,
+    int64_t rotary_dim,
+    int64_t q_strideB,
+    int64_t q_strideH,
+    int64_t k_strideB,
+    int64_t oq_strideB,
+    int64_t oq_strideH,
+    int64_t ok_strideB) {
+  TORCH_CHECK(rotary_dim % 32 == 0, "rotary_dim is not 32x.");
+  const int64_t rotary_offset = rotary_dim / 2;
+
+  // parallel on [num_seqs, num_heads + 1]
+  // top [num_heads] handle q_pe and bottom [1] handle k_pe
+  at::parallel_for(0, num_seqs * (num_heads + 1), GRAIN_SIZE / rotary_dim, [&](int64_t begin, int64_t end) {
+    int64_t seq{0}, head_id{0};
+    data_index_init(begin, seq, num_seqs, head_id, num_heads + 1);
+
+    for (int64_t i = begin; i < end; ++i) {
+      UNUSED(i);
+      // get cos and sin cache ptr
+      int64_t index = pos[seq];
+      const scalar_t* cos = cos_sin + index * rotary_dim;
+      const scalar_t* sin = cos + rotary_offset;
+
+      const scalar_t* input =
+          (head_id < num_heads) ? q_pe + seq * q_strideB + head_id * q_strideH : k_pe + seq * k_strideB;
+      scalar_t* out =
+          (head_id < num_heads) ? q_pe_out + seq * oq_strideB + head_id * oq_strideH : k_pe_out + seq * ok_strideB;
+      rotary<scalar_t>(input, out, cos, sin, rotary_dim);
+
+      // move to the next index
+      data_index_step(seq, num_seqs, head_id, num_heads + 1);
+    }
+  });
+}
+
+}  // anonymous namespace
+
+extern at::Tensor
+weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, const std::optional<at::Tensor>& bias, bool is_vnni);
+
+extern at::Tensor int8_scaled_mm_with_quant(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales2,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni);
+
+extern void
+bmm_cpu(at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni, const std::optional<at::Tensor>& scale);
+
+extern at::Tensor fp8_scaled_mm_cpu(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales2,
+    std::vector<int64_t> block_size,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni);
+
+// NB: shapes in DeepDeek R1
+//
+//   hidden_states    : [num_seqs, hidden_size] [1, 7168]
+//   q_a_proj_weight  : [q_lora_rank, hidden_size] [1536, 7168]
+//   q_b_proj_weight  : [num_heads * qk_head_dim, q_lora_rank] [4224, 1536]
+//   kv_a_proj_weight : [kv_lora_rank + qk_rope_head_dim, hidden_size] [576, 7168]
+//   w_kc             : [num_heads, kv_lora_rank, qk_nope_head_dim] [22, 512, 128]
+//   q_a_layernorm_weight  : [q_lora_rank] [1536]
+//   kv_a_layernorm_weight : [kv_lora_rank] [512]
+//
+std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope(
+    at::Tensor& hidden_states,
+    at::Tensor& q_a_proj_weight,
+    at::Tensor& q_b_proj_weight,
+    at::Tensor& kv_a_proj_weight,
+    at::Tensor& w_kc,
+    at::Tensor& q_a_layernorm_weight,
+    at::Tensor& kv_a_layernorm_weight,
+    at::Tensor& positions,
+    at::Tensor& cos_sin_cache,
+    double eps,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    std::optional<at::Tensor> q_a_proj_scale,
+    std::optional<at::Tensor> q_b_proj_scale,
+    std::optional<at::Tensor> kv_a_proj_scale,
+    bool is_vnni,
+    std::optional<std::vector<int64_t>> block_size) {
+  RECORD_FUNCTION(
+      "sgl-kernel::qkv_proj_with_rope",
+      std::vector<c10::IValue>({hidden_states, q_a_proj_weight, q_b_proj_weight, kv_a_proj_weight, w_kc}));
+
+  const auto st = hidden_states.scalar_type();
+  CHECK_INPUT(hidden_states);
+  CHECK_INPUT(positions);
+  CHECK_INPUT(cos_sin_cache);
+  CHECK_EQ(q_a_layernorm_weight.scalar_type(), st);
+  CHECK_EQ(kv_a_layernorm_weight.scalar_type(), st);
+  CHECK_EQ(positions.scalar_type(), at::kLong);
+  CHECK_EQ(cos_sin_cache.scalar_type(), st);
+  CHECK_DIM(2, hidden_states);
+  CHECK_DIM(3, w_kc);
+  CHECK_DIM(1, q_a_layernorm_weight);
+  CHECK_DIM(1, kv_a_layernorm_weight);
+  CHECK_DIM(1, positions);
+  CHECK_DIM(2, cos_sin_cache);
+
+  // skip contiguous checks for weights, expect prepacked
+  TORCH_CHECK(is_vnni, "qkv_proj_with_rope: expect weights are prepacked!");
+
+  int64_t num_seqs = hidden_states.size(0);
+  int64_t hidden_size = hidden_states.size(1);
+  int64_t q_lora_rank = q_a_proj_weight.size(0);
+  int64_t num_heads = w_kc.size(0);
+  int64_t kv_lora_rank = w_kc.size(1);
+  int64_t qk_head_dim = q_b_proj_weight.size(0) / num_heads;
+  int64_t qk_nope_head_dim = w_kc.size(2);
+  int64_t qk_rope_head_dim = kv_a_proj_weight.size(0) - kv_lora_rank;
+  int64_t rotary_dim = cos_sin_cache.size(1);
+
+  CHECK_EQ(positions.numel(), num_seqs);
+  CHECK_EQ(rotary_dim, qk_rope_head_dim);
+  CHECK_EQ(q_a_layernorm_weight.numel(), q_lora_rank);
+  CHECK_EQ(kv_a_layernorm_weight.numel(), kv_lora_rank);
+
+  // check the packed dimension
+  CHECK_EQ(q_a_proj_weight.size(1), get_row_size(hidden_size, use_int8_w8a8));
+  CHECK_EQ(q_b_proj_weight.size(1), get_row_size(q_lora_rank, use_int8_w8a8));
+  CHECK_EQ(kv_a_proj_weight.size(1), get_row_size(hidden_size, use_int8_w8a8));
+
+  if (use_int8_w8a8) {
+    TORCH_CHECK(q_a_proj_scale.has_value(), "missing q_a_proj_scale for int8 w8a8.");
+    TORCH_CHECK(q_b_proj_scale.has_value(), "missing q_b_proj_scale for int8 w8a8.");
+    TORCH_CHECK(kv_a_proj_scale.has_value(), "missing kv_a_proj_scale for int8 w8a8.");
+  }
+  if (use_fp8_w8a16) {
+    TORCH_CHECK(q_a_proj_scale.has_value(), "missing q_a_proj_scale for fp8 w8a16.");
+    TORCH_CHECK(q_b_proj_scale.has_value(), "missing q_b_proj_scale for fp8 w8a16.");
+    TORCH_CHECK(kv_a_proj_scale.has_value(), "missing kv_a_proj_scale for fp8 w8a16.");
+    TORCH_CHECK(block_size.has_value(), "missing block_size for fp8 w8a16.");
+    TORCH_CHECK(block_size.value().size() == 2, "block_size should be 2D for fp8 w8a16.");
+  }
+  // outputs and temp buffer
+  const auto options = hidden_states.options();
+  auto q_input = at::empty({num_seqs, num_heads, kv_lora_rank + qk_rope_head_dim}, options);
+  auto k_input = at::empty({num_seqs, 1, kv_lora_rank + qk_rope_head_dim}, options);
+  auto v_input = k_input.narrow(-1, 0, kv_lora_rank);
+
+  // outputs of q_a_proj and q_b_proj
+  auto qa = at::empty({num_seqs, q_lora_rank}, options);
+
+  // stage 1: q_a_proj and kv_a_proj
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "qkv_proj_kernel_impl", [&] {
+    if (use_int8_w8a8) {
+      auto q_a_proj_s = q_a_proj_scale.value();
+      auto kv_a_proj_s = kv_a_proj_scale.value();
+      TORCH_CHECK(q_a_proj_s.numel() == q_lora_rank);
+      TORCH_CHECK(kv_a_proj_s.numel() == kv_lora_rank + qk_rope_head_dim);
+
+      auto buffer = at::empty({num_seqs * hidden_size + num_seqs * 4}, options.dtype(at::kByte));
+      uint8_t* __restrict__ Aq_data = buffer.data_ptr<uint8_t>();
+      float* __restrict__ As_data = (float*)((void*)(Aq_data + num_seqs * hidden_size));
+      const scalar_t* __restrict__ A_data = hidden_states.data_ptr<scalar_t>();
+
+      at::parallel_for(0, num_seqs, 0, [&](int64_t begin, int64_t end) {
+        for (int64_t m = begin; m < end; ++m) {
+          quantize_row_int8<scalar_t>(Aq_data + m * hidden_size, As_data[m], A_data + m * hidden_size, hidden_size);
+        }
+      });
+
+      segment_gemm_kernel_impl<scalar_t>(
+          qa.data_ptr<scalar_t>(),
+          k_input.data_ptr<scalar_t>(),
+          Aq_data,
+          q_a_proj_weight.data_ptr<int8_t>(),
+          kv_a_proj_weight.data_ptr<int8_t>(),
+          As_data,
+          q_a_proj_s.data_ptr<float>(),
+          kv_a_proj_s.data_ptr<float>(),
+          num_seqs,
+          q_lora_rank,
+          kv_lora_rank + qk_rope_head_dim,
+          hidden_size);
+    } else if (use_fp8_w8a16) {
+      int64_t block_size_N = block_size.value()[0];
+      int64_t block_size_K = block_size.value()[1];
+      auto q_a_proj_s = q_a_proj_scale.value();
+      auto kv_a_proj_s = kv_a_proj_scale.value();
+      CHECK_EQ(q_a_proj_s.size(0), div_up(q_lora_rank, block_size_N));
+      CHECK_EQ(q_a_proj_s.size(1), div_up(hidden_size, block_size_K));
+      CHECK_EQ(kv_a_proj_s.size(0), div_up(kv_lora_rank + qk_rope_head_dim, block_size_N));
+      CHECK_EQ(kv_a_proj_s.size(1), div_up(hidden_size, block_size_K));
+
+      const int BLOCK_N = block_size_n();
+      const int num_threads = at::get_num_threads();
+      auto buffer = at::empty({num_threads, BLOCK_N * hidden_size}, options);
+      segment_gemm_kernel_impl<scalar_t>(
+          qa.data_ptr<scalar_t>(),
+          k_input.data_ptr<scalar_t>(),
+          hidden_states.data_ptr<scalar_t>(),
+          q_a_proj_weight.data_ptr<at::Float8_e4m3fn>(),
+          kv_a_proj_weight.data_ptr<at::Float8_e4m3fn>(),
+          q_a_proj_s.data_ptr<float>(),
+          kv_a_proj_s.data_ptr<float>(),
+          buffer.data_ptr<scalar_t>(),
+          num_seqs,
+          q_lora_rank,
+          kv_lora_rank + qk_rope_head_dim,
+          hidden_size,
+          block_size_N,
+          block_size_K);
+    } else {
+      segment_gemm_kernel_impl<scalar_t>(
+          qa.data_ptr<scalar_t>(),
+          k_input.data_ptr<scalar_t>(),
+          hidden_states.data_ptr<scalar_t>(),
+          q_a_proj_weight.data_ptr<scalar_t>(),
+          kv_a_proj_weight.data_ptr<scalar_t>(),
+          num_seqs,
+          q_lora_rank,
+          kv_lora_rank + qk_rope_head_dim,
+          hidden_size);
+    }
+  });
+
+  // stage 2: apply rmsnorm inplace
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "rms_norm_kernel_impl", [&] {
+    rms_norm_kernel_impl<scalar_t>(
+        qa.data_ptr<scalar_t>(),
+        v_input.data_ptr<scalar_t>(),
+        q_a_layernorm_weight.data_ptr<scalar_t>(),
+        kv_a_layernorm_weight.data_ptr<scalar_t>(),
+        num_seqs,
+        q_lora_rank,
+        kv_lora_rank,
+        kv_lora_rank + qk_rope_head_dim,
+        eps);
+  });
+
+  // stage 3: q_b_proj
+  at::Tensor qb;
+  std::optional<at::Tensor> bias;
+  if (use_int8_w8a8) {
+    qb = int8_scaled_mm_with_quant(qa, q_b_proj_weight, q_b_proj_scale.value(), bias, at::kBFloat16, is_vnni);
+  } else if (use_fp8_w8a16) {
+    qb = fp8_scaled_mm_cpu(
+        qa, q_b_proj_weight, q_b_proj_scale.value(), block_size.value(), bias, at::kBFloat16, is_vnni);
+  } else {
+    qb = weight_packed_linear(qa, q_b_proj_weight, bias, is_vnni);
+  }
+  qb.as_strided_({num_seqs, num_heads, qk_head_dim}, {num_heads * qk_head_dim, qk_head_dim, 1});
+
+  // stage 4: bmm
+  std::optional<at::Tensor> scale;
+  auto q_nope = qb.narrow(2, 0, qk_nope_head_dim).transpose_(0, 1);
+  auto q_nope_out = q_input.narrow(2, 0, kv_lora_rank).transpose_(0, 1);
+  bmm_cpu(q_nope_out, q_nope, w_kc, is_vnni, scale);
+
+  // stage 5: rope
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "rotary_emb_kernel_impl", [&] {
+    rotary_emb_kernel_impl<scalar_t>(
+        q_input.data_ptr<scalar_t>() + kv_lora_rank,
+        k_input.data_ptr<scalar_t>() + kv_lora_rank,
+        qb.data_ptr<scalar_t>() + qk_nope_head_dim,
+        k_input.data_ptr<scalar_t>() + kv_lora_rank,
+        positions.data_ptr<int64_t>(),
+        cos_sin_cache.data_ptr<scalar_t>(),
+        num_seqs,
+        num_heads,
+        rotary_dim,
+        num_heads * qk_head_dim,
+        qk_head_dim,
+        kv_lora_rank + qk_rope_head_dim,
+        num_heads * (kv_lora_rank + qk_rope_head_dim),
+        kv_lora_rank + qk_rope_head_dim,
+        kv_lora_rank + qk_rope_head_dim);
+  });
+
+  return std::make_tuple(q_input, k_input, v_input);
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope_fused_weight(
+    at::Tensor& hidden_states,
+    at::Tensor& qkv_a_proj_weight,
+    at::Tensor& q_b_proj_weight,
+    at::Tensor& w_kc,
+    at::Tensor& q_a_layernorm_weight,
+    at::Tensor& kv_a_layernorm_weight,
+    at::Tensor& positions,
+    at::Tensor& cos_sin_cache,
+    double eps,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    std::optional<at::Tensor> qkv_a_proj_scale,
+    std::optional<at::Tensor> q_b_proj_scale,
+    bool is_vnni,
+    std::optional<std::vector<int64_t>> block_size,
+    int64_t q_lora_rank,
+    int64_t kv_lora_rank,
+    int64_t qk_rope_head_dim) {
+  RECORD_FUNCTION(
+      "sgl-kernel::qkv_proj_with_rope_fused_weight",
+      std::vector<c10::IValue>({hidden_states, qkv_a_proj_weight, q_b_proj_weight, w_kc}));
+
+  int64_t hidden_size = hidden_states.size(1);
+  CHECK_EQ(qkv_a_proj_weight.size(0), q_lora_rank + kv_lora_rank + qk_rope_head_dim);
+  CHECK_EQ(qkv_a_proj_weight.size(1), get_row_size(hidden_size, use_int8_w8a8));
+
+  std::vector<at::Tensor> weight_chunks =
+      at::split(qkv_a_proj_weight, {q_lora_rank, kv_lora_rank + qk_rope_head_dim}, 0);
+  at::Tensor q_a_proj_weight = weight_chunks[0];
+  at::Tensor kv_a_proj_weight = weight_chunks[1];
+  at::Tensor q_a_proj_s;
+  at::Tensor kv_a_proj_s;
+
+  if (use_int8_w8a8) {
+    TORCH_CHECK(qkv_a_proj_scale.has_value(), "missing qkv_a_proj_scale for int8 w8a8.");
+    std::vector<at::Tensor> scale_chunks =
+        at::split(qkv_a_proj_scale.value(), {q_lora_rank, kv_lora_rank + qk_rope_head_dim}, 0);
+    q_a_proj_s = scale_chunks[0];
+    kv_a_proj_s = scale_chunks[1];
+  }
+  if (use_fp8_w8a16) {
+    TORCH_CHECK(qkv_a_proj_scale.has_value(), "missing qkv_a_proj_scale for fp8 w8a16.");
+    int64_t block_size_N = block_size.value()[0];
+    int64_t q_a_proj_s_dim0 = div_up(q_lora_rank, block_size_N);
+    int64_t kv_a_proj_s_dim0 = div_up(kv_lora_rank + qk_rope_head_dim, block_size_N);
+    std::vector<at::Tensor> scale_chunks = at::split(qkv_a_proj_scale.value(), {q_a_proj_s_dim0, kv_a_proj_s_dim0}, 0);
+    q_a_proj_s = scale_chunks[0];
+    kv_a_proj_s = scale_chunks[1];
+  }
+
+  return qkv_proj_with_rope(
+      hidden_states,
+      q_a_proj_weight,
+      q_b_proj_weight,
+      kv_a_proj_weight,
+      w_kc,
+      q_a_layernorm_weight,
+      kv_a_layernorm_weight,
+      positions,
+      cos_sin_cache,
+      eps,
+      use_int8_w8a8,
+      use_fp8_w8a16,
+      q_a_proj_s,
+      q_b_proj_scale,
+      kv_a_proj_s,
+      is_vnni,
+      block_size);
+}
diff --git a/sgl-kernel/csrc/cpu/rope.cpp b/sgl-kernel/csrc/cpu/rope.cpp
new file mode 100644
index 000000000..1c6249466
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/rope.cpp
@@ -0,0 +1,346 @@
+#include "common.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t>
+void rotary_embedding_3D_kernel_impl(
+    scalar_t* __restrict__ query_out,
+    scalar_t* __restrict__ key_out,
+    int64_t* __restrict__ positions,
+    scalar_t* __restrict__ query,
+    scalar_t* __restrict__ key,
+    scalar_t* __restrict__ cos_sin_cache,
+    int64_t num_tokens,
+    int64_t num_heads,
+    int64_t num_kv_heads,
+    int64_t head_size,
+    int64_t rotary_dim,
+    int64_t query_stride_s,
+    int64_t query_out_stride_s,
+    int64_t key_out_stride_s,
+    int64_t key_stride_s,
+    int64_t query_stride_h,
+    int64_t query_out_stride_h) {
+  int64_t HR = rotary_dim;
+  int64_t HK = rotary_dim;
+  int64_t COFF = HR / 2;
+  at::parallel_for(0, num_tokens * num_heads, GRAIN_SIZE / rotary_dim, [&](int64_t begin, int64_t end) {
+    int64_t seq{0}, head_id{0};
+    data_index_init(begin, seq, num_tokens, head_id, num_heads);
+    for (int64_t i = begin; i < end; ++i) {
+      int64_t in_offset_q = seq * query_stride_s + head_id * query_stride_h;
+      int64_t out_offset_q = seq * query_out_stride_s + head_id * query_out_stride_h;
+      int64_t out_offset_k = seq * key_out_stride_s;
+      int64_t p = 0;
+      scalar_t* sin_start = nullptr;
+      scalar_t* cos_start = nullptr;
+      // step 0) get the rotary position embedding for the current position
+      p = positions[seq];
+      sin_start = cos_sin_cache + p * HR + COFF;
+      cos_start = cos_sin_cache + p * HR;
+      // step 1) apply_rotary_pos_emb for the rotary_dim elements in every
+      // head of query/key
+      for (int64_t h = 0; h < rotary_dim; h += 2) {
+        scalar_t cos = cos_start[h >> 1];
+        scalar_t sin = sin_start[h >> 1];
+        scalar_t in1 = query[in_offset_q + h];
+        scalar_t in2 = query[in_offset_q + h + 1];
+        scalar_t out1 = in1 * cos - in2 * sin;
+        scalar_t out2 = in2 * cos + in1 * sin;
+        query_out[out_offset_q + h] = out1;
+        query_out[out_offset_q + h + 1] = out2;
+      }
+      for (int64_t h = 0; h < HK; h += 2) {
+        scalar_t cos = cos_start[h >> 1];
+        scalar_t sin = sin_start[h >> 1];
+        int64_t k_pe_offset = seq * key_stride_s;
+        scalar_t in1_k = key[k_pe_offset + h];
+        scalar_t in2_k = key[k_pe_offset + h + 1];
+        scalar_t out1_k = in1_k * cos - in2_k * sin;
+        scalar_t out2_k = in2_k * cos + in1_k * sin;
+        key_out[out_offset_k + h] = out1_k;
+        key_out[out_offset_k + h + 1] = out2_k;
+      }
+      // move to the next index
+      data_index_step(seq, num_tokens, head_id, num_heads);
+    }
+  });
+}
+
+template <typename scalar_t>
+void rotary_embedding_neox_2D_kernel_impl(
+    int64_t* __restrict__ positions,
+    scalar_t* __restrict__ query,
+    scalar_t* __restrict__ key,
+    scalar_t* __restrict__ cos_sin_cache,
+    int64_t rotary_dim,
+    int64_t query_stride_s,
+    int64_t key_stride_s,
+    int64_t num_heads,
+    int64_t num_kv_heads,
+    int64_t head_size,
+    int64_t num_tokens) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  constexpr int64_t bVecSize = bVec::size();
+
+  int64_t embed_dim = rotary_dim / 2;
+  bool flag = (embed_dim % bVecSize == 0);
+  int64_t loop_upper = flag ? embed_dim : embed_dim - bVecSize;
+
+  auto compute_loop = [&](int64_t token_head, scalar_t* cache_ptr, scalar_t* qk) {
+    int64_t j = 0;
+    for (; j < loop_upper; j += bVecSize) {
+      int64_t rot_offset = j;
+      int64_t x_index = rot_offset;
+      int64_t y_index = embed_dim + rot_offset;
+
+      int64_t out_x = token_head + x_index;
+      int64_t out_y = token_head + y_index;
+
+      bVec _cos = bVec::loadu(cache_ptr + x_index);
+      bVec _sin = bVec::loadu(cache_ptr + y_index);
+
+      bVec _q_x = bVec::loadu(qk + out_x);
+      bVec _q_y = bVec::loadu(qk + out_y);
+      fVec _cos_0, _cos_1;
+      std::tie(_cos_0, _cos_1) = at::vec::convert_to_float(_cos);
+      fVec _sin_0, _sin_1;
+      std::tie(_sin_0, _sin_1) = at::vec::convert_to_float(_sin);
+      fVec _q_x_0, _q_x_1;
+      std::tie(_q_x_0, _q_x_1) = at::vec::convert_to_float(_q_x);
+      fVec _q_y_0, _q_y_1;
+      std::tie(_q_y_0, _q_y_1) = at::vec::convert_to_float(_q_y);
+
+      auto out1_0 = _q_x_0 * _cos_0 - _q_y_0 * _sin_0;
+      auto out1_1 = _q_x_1 * _cos_1 - _q_y_1 * _sin_1;
+      auto out1 = convert_from_float_ext<scalar_t>(out1_0, out1_1);
+      out1.store(qk + out_x);
+
+      auto out2_0 = _q_y_0 * _cos_0 + _q_x_0 * _sin_0;
+      auto out2_1 = _q_y_1 * _cos_1 + _q_x_1 * _sin_1;
+      auto out2 = convert_from_float_ext<scalar_t>(out2_0, out2_1);
+      out2.store(qk + out_y);
+    }
+    if (!flag) {
+      for (; j < embed_dim; ++j) {
+        int64_t x_index = j;
+        int64_t y_index = embed_dim + j;
+
+        int64_t out_x = token_head + x_index;
+        int64_t out_y = token_head + y_index;
+
+        float _cos = cache_ptr[x_index];
+        float _sin = cache_ptr[y_index];
+
+        float _q_x = qk[out_x];
+        float _q_y = qk[out_y];
+
+        qk[out_x] = _q_x * _cos - _q_y * _sin;
+        qk[out_y] = _q_y * _cos + _q_x * _sin;
+      }
+    }
+  };
+
+#pragma omp parallel for
+  for (int64_t token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    int64_t pos = positions[token_idx];
+    scalar_t* cache_ptr = cos_sin_cache + pos * rotary_dim;
+
+    for (int64_t i = 0; i < num_heads; ++i) {
+      int64_t head_idx = i;
+      int64_t token_head = token_idx * query_stride_s + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, query);
+    }
+
+    for (int64_t i = 0; i < num_kv_heads; ++i) {
+      int64_t head_idx = i;
+      int64_t token_head = token_idx * key_stride_s + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, key);
+    }
+  }
+}
+
+template <typename scalar_t>
+void rotary_embedding_2D_kernel_impl(
+    int64_t* __restrict__ positions,
+    scalar_t* __restrict__ query,
+    scalar_t* __restrict__ key,
+    scalar_t* __restrict__ cos_sin_cache,
+    int64_t rotary_dim,
+    int64_t query_stride_s,
+    int64_t key_stride_s,
+    int64_t num_heads,
+    int64_t num_kv_heads,
+    int64_t head_size,
+    int64_t num_tokens) {
+  int64_t embed_dim = rotary_dim / 2;
+
+  at::parallel_for(0, num_tokens * num_heads, GRAIN_SIZE / rotary_dim, [&](int64_t begin, int64_t end) {
+    int64_t token_idx = {0}, i = {0};
+    data_index_init(begin, token_idx, num_tokens, i, num_heads);
+    for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
+      int64_t pos = positions[token_idx];
+      scalar_t* cache_ptr = cos_sin_cache + pos * rotary_dim;
+      scalar_t* cos_cache_ptr = cache_ptr;
+      scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
+      int64_t head_idx = i;
+      int64_t token_head = token_idx * query_stride_s + head_idx * head_size;
+      scalar_t* head_query = token_head + query;
+      for (int64_t j = 0; j < embed_dim; j += 1) {
+        int64_t rot_offset = j;
+        int64_t x_index = 2 * rot_offset;
+        int64_t y_index = 2 * rot_offset + 1;
+
+        float cos = cos_cache_ptr[rot_offset];
+        float sin = sin_cache_ptr[rot_offset];
+
+        float x = head_query[x_index];
+        float y = head_query[y_index];
+
+        head_query[x_index] = x * cos - y * sin;
+        head_query[y_index] = y * cos + x * sin;
+      }
+      data_index_step(token_idx, num_tokens, i, num_heads);
+    }
+  });
+
+  at::parallel_for(0, num_tokens * num_kv_heads, GRAIN_SIZE / rotary_dim, [&](int64_t begin, int64_t end) {
+    int64_t token_idx{0}, i = {0};
+    data_index_init(begin, token_idx, num_tokens, i, num_kv_heads);
+    for ([[maybe_unused]] auto z : c10::irange(begin, end)) {
+      int64_t pos = positions[token_idx];
+      scalar_t* cache_ptr = cos_sin_cache + pos * rotary_dim;
+      scalar_t* cos_cache_ptr = cache_ptr;
+      scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
+      int64_t head_idx = i;
+      int64_t token_head = token_idx * key_stride_s + head_idx * head_size;
+      scalar_t* head_key = key + token_head;
+      for (int64_t j = 0; j < embed_dim; j += 1) {
+        int64_t rot_offset = j;
+        int64_t x_index = 2 * rot_offset;
+        int64_t y_index = 2 * rot_offset + 1;
+
+        float cos = cos_cache_ptr[rot_offset];
+        float sin = sin_cache_ptr[rot_offset];
+
+        float x = head_key[x_index];
+        float y = head_key[y_index];
+
+        head_key[x_index] = x * cos - y * sin;
+        head_key[y_index] = y * cos + x * sin;
+      }
+      data_index_step(token_idx, num_tokens, i, num_kv_heads);
+    }
+  });
+}
+
+}  // namespace
+
+std::tuple<at::Tensor, at::Tensor> rotary_embedding_cpu(
+    at::Tensor& positions,
+    at::Tensor& query,
+    at::Tensor& key,
+    int64_t head_size,
+    at::Tensor& cos_sin_cache,
+    bool is_neox) {
+  RECORD_FUNCTION("sgl-kernel::rotary_embedding_cpu", std::vector<c10::IValue>({query, key}));
+  CHECK_DIM(1, positions);
+  const auto input_dim = query.dim();
+  const auto input_dtype = query.scalar_type();
+  TORCH_CHECK(
+      input_dim == 2 || input_dim == 3,
+      " Query/Key must be 2D [num_tokens, num_heads*head_size] or 3D [num_tokens, num_heads, head_size] tensor");
+  CHECK_DIM(2, cos_sin_cache);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(query);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(key);
+
+  int64_t rotary_dim = cos_sin_cache.size(1);
+  if (input_dim == 3) {
+    // TODO: add support for head_dim != rotary_dim case when input_dim=3
+    CHECK_EQ(query.size(-1), rotary_dim);
+    // TODO: add support for kv_head != 1
+    CHECK_EQ(key.size(1), 1);
+  }
+
+  int64_t num_tokens = positions.numel();
+  CHECK_EQ(key.size(0), num_tokens);
+  CHECK_EQ(query.size(0), num_tokens);
+
+  TORCH_CHECK(positions.scalar_type() == at::kLong, "expect positions to be int64, got ", positions.scalar_type());
+  TORCH_CHECK(input_dtype == key.scalar_type(), "query and key must have the same data type");
+  TORCH_CHECK(input_dtype == cos_sin_cache.scalar_type(), "query and cos_sin_cache must have the same data type");
+
+  int64_t num_heads = input_dim == 2 ? query.size(-1) / head_size : query.size(1);
+  int64_t num_kv_heads = input_dim == 2 ? key.size(-1) / head_size : key.size(1);
+  int64_t key_stride_s = key.stride(0);
+  int64_t query_stride_s = query.stride(0);
+
+  // input stride of num head dim is meaningful only when input dim = 3
+  int64_t query_stride_h = input_dim == 3 ? query.stride(1) : -1;
+  at::Tensor query_out = at::empty_like(query);
+  at::Tensor key_out = at::empty_like(key);
+  int64_t query_out_stride_s = query_out.stride(0);
+  int64_t key_out_stride_s = key_out.stride(0);
+  // output stride of num head dim is meaningful only when input dim = 3
+  int64_t query_out_stride_h = input_dim == 3 ? query_out.stride(1) : -1;
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(input_dtype, "rotary_embedding_cpu", [&] {
+    if (input_dim == 2) {
+      if (is_neox) {
+        rotary_embedding_neox_2D_kernel_impl<scalar_t>(
+            positions.data_ptr<int64_t>(),
+            query.data_ptr<scalar_t>(),
+            key.data_ptr<scalar_t>(),
+            cos_sin_cache.data_ptr<scalar_t>(),
+            rotary_dim,
+            query_stride_s,
+            key_stride_s,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            num_tokens);
+      } else {
+        rotary_embedding_2D_kernel_impl<scalar_t>(
+            positions.data_ptr<int64_t>(),
+            query.data_ptr<scalar_t>(),
+            key.data_ptr<scalar_t>(),
+            cos_sin_cache.data_ptr<scalar_t>(),
+            rotary_dim,
+            query_stride_s,
+            key_stride_s,
+            num_heads,
+            num_kv_heads,
+            head_size,
+            num_tokens);
+      }
+      query_out = query;
+      key_out = key;
+
+    } else {
+      TORCH_CHECK(
+          is_neox == false, " Query/Key with 3D [num_tokens, num_heads, head_size] does not support neox rope yet");
+      // TODO: add neox style support for rope impl with 3D inputs
+      rotary_embedding_3D_kernel_impl<scalar_t>(
+          query_out.data_ptr<scalar_t>(),
+          key_out.data_ptr<scalar_t>(),
+          positions.data_ptr<int64_t>(),
+          query.data_ptr<scalar_t>(),
+          key.data_ptr<scalar_t>(),
+          cos_sin_cache.data_ptr<scalar_t>(),
+          num_tokens,
+          num_heads,
+          num_kv_heads,
+          head_size,
+          rotary_dim,
+          query_stride_s,
+          query_out_stride_s,
+          key_out_stride_s,
+          key_stride_s,
+          query_stride_h,
+          query_out_stride_h);
+    }
+  });
+  return std::make_tuple(query_out, key_out);
+}
diff --git a/sgl-kernel/csrc/cpu/shm.cpp b/sgl-kernel/csrc/cpu/shm.cpp
new file mode 100644
index 000000000..1bf65a9b2
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/shm.cpp
@@ -0,0 +1,666 @@
+#include "shm.h"
+
+#include <ATen/ATen.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <immintrin.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+// states for collectives
+enum coll_state {
+  coll_begin = 0,
+  coll_allreduce_naive__copy_in_done,
+  coll_allreduce_naive__reduce_done,
+  // alternative state when allreduce is working on alternative buffer
+  // of the double buffer.
+  coll_alt1_allreduce_naive__copy_in_done,
+  coll_alt2_allreduce_naive__copy_in_done,
+  coll_alt1_allreduce_naive__reduce_done,
+  coll_allgather_naive__copy_in_done,
+  coll_alt1_allgather_naive__copy_in_done,
+  coll_alt2_allgather_naive__copy_in_done,
+};
+
+// SHM building blocks
+struct SharedData {
+  const char* name;
+  int descriptor;
+  void* bytes;
+  size_t nbytes;
+};
+
+void shared_open(SharedData* data, const char* name, size_t nbytes) {
+  int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR);
+  if (d != -1) {
+    void* bytes = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0);
+    data->name = name;
+    data->descriptor = d;
+    data->bytes = bytes;
+    data->nbytes = nbytes;
+  } else {
+    if (errno != ENOENT) {
+      // don't print if shm can not be found because we want to loop over from
+      // caller again until the other ranks created the shm
+      printf("shared_open %s failed, errno=%d\n", name, errno);
+    }
+    data->descriptor = -1;
+  }
+}
+
+void shared_create(SharedData* data, const char* name, void* bytes, size_t nbytes) {
+  int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+  if (d != -1) {
+    nbytes = write(d, bytes, nbytes);
+    if (nbytes > 0) {
+      shared_open(data, name, nbytes);
+    }
+  } else {
+    printf("shared_create %s failed\n", name);
+  }
+}
+
+static int world_size;
+
+// SHM based allreduce helper functions
+// buffer that holds shm name
+#define NAME_BUF_SIZE 1000
+#define MAX_BUF_SIZE 1048576 * 32
+#define NAIVE_ALLREDUCE_THRESHOLD 1048576
+#define SHM_BUFFER_NAME "deepspeed_allreduce_buffer"
+struct allreduce_workspace {
+  enum coll_state states[2];  // idx=0 -- state for symmetric_naive_all_reduce
+                              // idx=1 -- state for distributed_naive_all_reduce
+  // double buffer to avoid syncing between rounds
+  // offset=0 -- 2*NAIVE_ALLREDUCE_THRESHOLD : buffer for
+  // symmetric_naive_all_reduce after that : buffer for
+  // distributed_naive_all_reduce
+  char buffer[2 * NAIVE_ALLREDUCE_THRESHOLD + 2 * MAX_BUF_SIZE];
+};
+
+#define BUFFER0_OFFSET(current_buffer) current_buffer* NAIVE_ALLREDUCE_THRESHOLD
+#define BUFFER1_OFFSET(current_buffer) 2 * NAIVE_ALLREDUCE_THRESHOLD + current_buffer* MAX_BUF_SIZE
+
+struct allreduce_workspace** workspace;
+
+// buffer for small messages, double buffer
+char** symmetric_buffer[2];
+// buffer for large messages, double buffer
+char** distributed_buffer[2];
+
+void wait_buffer_state_until_2(int index, enum coll_state state0, enum coll_state state1, int state_group) {
+  volatile enum coll_state* state_ptr = &(workspace[index]->states[state_group]);
+
+  while (1) {
+    volatile enum coll_state cur_state = *state_ptr;
+    if (cur_state == state0 || cur_state == state1) break;
+  }
+}
+
+__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw")));
+inline __m512 cvt_bf16_to_fp32(const __m256i src) {
+  auto y = _mm512_cvtepu16_epi32(src);
+  return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2));
+}
+
+inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw")));
+inline __m256i cvt_fp32_to_bf16(const __m512 src) {
+  __m512i value = _mm512_castps_si512(src);
+  __m512i nan = _mm512_set1_epi32(0xffff);
+  auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q);
+  __m512i ones = _mm512_set1_epi32(0x1);
+  __m512i vec_bias = _mm512_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_value = _mm512_add_epi32(t_value, vec_bias);
+  // input += rounding_bias;
+  t_value = _mm512_add_epi32(t_value, value);
+  // input = input >> 16;
+  t_value = _mm512_srli_epi32(t_value, 16);
+  // Check NaN before converting back to bf16
+  t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value);
+  return _mm512_cvtusepi32_epi16(t_value);
+}
+
+__m512 cvt_fp16_to_fp32(const __m256i src) __attribute__((target("avx512bw")));
+inline __m512 cvt_fp16_to_fp32(const __m256i src) {
+  return _mm512_cvtph_ps(src);
+}
+
+inline __m256i cvt_fp32_to_fp16(const __m512 src) __attribute__((target("avx512bw")));
+inline __m256i cvt_fp32_to_fp16(const __m512 src) {
+  return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+}
+
+void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers)
+    __attribute__((target("avx512bw")));
+
+void reduce_fp16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers)
+    __attribute__((target("avx512bw")));
+
+void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers)
+    __attribute__((target("avx512bw")));
+
+void reduce_all_buffers(
+    int start_elements,
+    int num_elements,
+    c10::ScalarType scalar_type,
+    int to_buffer_idx,
+    char* to_buffer,
+    char** buffers) {
+  switch (scalar_type) {
+    case c10::ScalarType::BFloat16:
+      reduce_bf16_buffers(start_elements, num_elements, to_buffer, buffers);
+      break;
+    case c10::ScalarType::Half:
+      reduce_fp16_buffers(start_elements, num_elements, to_buffer, buffers);
+      break;
+    case c10::ScalarType::Float:
+      reduce_fp32_buffers(start_elements, num_elements, to_buffer, buffers);
+      break;
+    default:
+      assert(!"Should not get here");
+  }
+}
+
+#define CVT_ADD_BF16(x)                                                                  \
+  do {                                                                                   \
+    auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[x] + i))); \
+    inout_val = _mm512_add_ps(inout_val, in##x##_val);                                   \
+  } while (0)
+
+// Reduce functions down below use vectorized algorithm, the number of bytes
+// processed each iteration depends on vector length.  256bit vector ==> 32
+// bytes, 512bit vector ==> 64 bytes If you change implementation of
+// reduce_bf16_buffers, etc. , check whether this number needs to be changed
+#define VECTOR_LENGTH_IN_BYTES 32
+
+void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) {
+  const int element_size = 2;
+  const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size;
+  int main_elements = num_elements - (num_elements % vector_length);
+  int remain_elements = num_elements % vector_length;
+
+  // process aligned part
+#pragma omp parallel for
+  for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size;
+       i += VECTOR_LENGTH_IN_BYTES) {
+    auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[0] + i)));
+    switch (world_size) {
+      case 16:
+        CVT_ADD_BF16(15);
+      case 15:
+        CVT_ADD_BF16(14);
+      case 14:
+        CVT_ADD_BF16(13);
+      case 13:
+        CVT_ADD_BF16(12);
+      case 12:
+        CVT_ADD_BF16(11);
+      case 11:
+        CVT_ADD_BF16(10);
+      case 10:
+        CVT_ADD_BF16(9);
+      case 9:
+        CVT_ADD_BF16(8);
+      case 8:
+        CVT_ADD_BF16(7);
+      case 7:
+        CVT_ADD_BF16(6);
+      case 6:
+        CVT_ADD_BF16(5);
+      case 5:
+        CVT_ADD_BF16(4);
+      case 4:
+        CVT_ADD_BF16(3);
+      case 3:
+        CVT_ADD_BF16(2);
+      case 2:
+        CVT_ADD_BF16(1);
+      case 1:
+        break;
+      default:
+        for (int j = 1; j < world_size; j++) {
+          auto in_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[j] + i)));
+          inout_val = _mm512_add_ps(inout_val, in_val);
+        }
+    }
+    _mm256_storeu_si256((__m256i*)(to_buffer + i), cvt_fp32_to_bf16(inout_val));
+  }
+
+  // process remaining part
+  int i = (start_elements + main_elements) * element_size;
+  while (remain_elements > 0) {
+    float val = 0.0f;
+    for (int j = 0; j < world_size; j++) {
+      val += *(at::BFloat16*)(buffers[j] + i);
+    }
+    *(at::BFloat16*)(to_buffer + i) = val;
+    remain_elements--;
+    i += element_size;
+  }
+}
+
+#define CVT_ADD_FP16(x)                                                                  \
+  do {                                                                                   \
+    auto in##x##_val = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[x] + i))); \
+    inout_val = _mm512_add_ps(inout_val, in##x##_val);                                   \
+  } while (0)
+
+void reduce_fp16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) {
+  const int element_size = 2;
+  const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size;
+  int main_elements = num_elements - (num_elements % vector_length);
+  int remain_elements = num_elements % vector_length;
+
+  // process aligned part
+#pragma omp parallel for
+  for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size;
+       i += VECTOR_LENGTH_IN_BYTES) {
+    auto inout_val = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[0] + i)));
+    switch (world_size) {
+      case 16:
+        CVT_ADD_FP16(15);
+      case 15:
+        CVT_ADD_FP16(14);
+      case 14:
+        CVT_ADD_FP16(13);
+      case 13:
+        CVT_ADD_FP16(12);
+      case 12:
+        CVT_ADD_FP16(11);
+      case 11:
+        CVT_ADD_FP16(10);
+      case 10:
+        CVT_ADD_FP16(9);
+      case 9:
+        CVT_ADD_FP16(8);
+      case 8:
+        CVT_ADD_FP16(7);
+      case 7:
+        CVT_ADD_FP16(6);
+      case 6:
+        CVT_ADD_FP16(5);
+      case 5:
+        CVT_ADD_FP16(4);
+      case 4:
+        CVT_ADD_FP16(3);
+      case 3:
+        CVT_ADD_FP16(2);
+      case 2:
+        CVT_ADD_FP16(1);
+      case 1:
+        break;
+      default:
+        for (int j = 1; j < world_size; j++) {
+          auto in_val = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[j] + i)));
+          inout_val = _mm512_add_ps(inout_val, in_val);
+        }
+    }
+    _mm256_storeu_si256((__m256i*)(to_buffer + i), cvt_fp32_to_fp16(inout_val));
+  }
+
+  // process remaining part
+  int i = (start_elements + main_elements) * element_size;
+  while (remain_elements > 0) {
+    float val = 0.0f;
+    for (int j = 0; j < world_size; j++) {
+      val += *(at::Half*)(buffers[j] + i);
+    }
+    *(at::Half*)(to_buffer + i) = val;
+    remain_elements--;
+    i += element_size;
+  }
+}
+
+#define CVT_ADD_F32(x)                                            \
+  do {                                                            \
+    auto in##x##_val = _mm256_loadu_ps((float*)(buffers[x] + i)); \
+    inout_val = _mm256_add_ps(inout_val, in##x##_val);            \
+  } while (0)
+
+void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) {
+  const int element_size = 4;
+  const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size;
+  int main_elements = num_elements - (num_elements % vector_length);
+  int remain_elements = num_elements % vector_length;
+
+  // process aligned part
+#pragma omp parallel for
+  for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size;
+       i += VECTOR_LENGTH_IN_BYTES) {
+    auto inout_val = _mm256_loadu_ps((float*)(buffers[0] + i));
+    switch (world_size) {
+      case 16:
+        CVT_ADD_F32(15);
+      case 15:
+        CVT_ADD_F32(14);
+      case 14:
+        CVT_ADD_F32(13);
+      case 13:
+        CVT_ADD_F32(12);
+      case 12:
+        CVT_ADD_F32(11);
+      case 11:
+        CVT_ADD_F32(10);
+      case 10:
+        CVT_ADD_F32(9);
+      case 9:
+        CVT_ADD_F32(8);
+      case 8:
+        CVT_ADD_F32(7);
+      case 7:
+        CVT_ADD_F32(6);
+      case 6:
+        CVT_ADD_F32(5);
+      case 5:
+        CVT_ADD_F32(4);
+      case 4:
+        CVT_ADD_F32(3);
+      case 3:
+        CVT_ADD_F32(2);
+      case 2:
+        CVT_ADD_F32(1);
+      case 1:
+        break;
+      default:
+        for (int j = 1; j < world_size; j++) {
+          auto in_val = _mm256_loadu_ps((float*)(buffers[j] + i));
+          inout_val = _mm256_add_ps(inout_val, in_val);
+        }
+    }
+    _mm256_storeu_ps((float*)(to_buffer + i), inout_val);
+  }
+
+  // process remaining part
+  int i = (start_elements + main_elements) * element_size;
+  while (remain_elements > 0) {
+    float val = 0.0f;
+    for (int j = 0; j < world_size; j++) {
+      val += *(float*)(buffers[j] + i);
+    }
+    *(float*)(to_buffer + i) = val;
+    remain_elements--;
+    i += element_size;
+  }
+}
+
+static bool is_initialized = false;
+static int world_rank;
+
+void shm_initialize(int size, int rank, const char* addr_string, const char* port_string) {
+  if (is_initialized) {
+    return;
+  }
+  is_initialized = true;
+
+  world_size = size;
+  world_rank = rank;
+
+  char shm_name_prefix[NAME_BUF_SIZE];
+  char shm_name[NAME_BUF_SIZE];
+  snprintf(shm_name_prefix, NAME_BUF_SIZE, "%s_%d_%s_%s", SHM_BUFFER_NAME, getuid(), addr_string, port_string);
+  // create shared workspace for SHM based allreduce
+  SharedData allreduce_buffer;
+  // allocate workspace_buf for current rank
+  struct allreduce_workspace* workspace_buf;
+  struct allreduce_workspace* workspace_buf_other;
+  workspace_buf = (struct allreduce_workspace*)malloc(sizeof(struct allreduce_workspace));
+  snprintf(shm_name, NAME_BUF_SIZE, "%.900s_%d", shm_name_prefix, rank);
+  shared_create(&allreduce_buffer, shm_name, workspace_buf, sizeof(struct allreduce_workspace));
+  workspace_buf = (struct allreduce_workspace*)allreduce_buffer.bytes;
+  workspace_buf->states[0] = coll_alt2_allreduce_naive__copy_in_done;
+  workspace_buf->states[1] = coll_begin;
+
+  // create the workspace pointer list
+  workspace = (struct allreduce_workspace**)malloc(size * sizeof(struct allreduce_workspace*));
+  symmetric_buffer[0] = (char**)malloc(size * sizeof(char**));
+  symmetric_buffer[1] = (char**)malloc(size * sizeof(char**));
+  distributed_buffer[0] = (char**)malloc(size * sizeof(char**));
+  distributed_buffer[1] = (char**)malloc(size * sizeof(char**));
+
+  // map shm of all ranks
+  for (int i = 0; i < size; i++) {
+    if (i != rank) {
+      snprintf(shm_name, NAME_BUF_SIZE, "%.900s_%d", shm_name_prefix, i);
+      // printf("open %s, %d\n", shm_name, rank);
+      do {
+        shared_open(&allreduce_buffer, shm_name, sizeof(struct allreduce_workspace));
+      } while (allreduce_buffer.descriptor == -1 && errno == ENOENT);
+      workspace_buf_other = (struct allreduce_workspace*)allreduce_buffer.bytes;
+      workspace[i] = workspace_buf_other;
+    } else {
+      workspace[i] = workspace_buf;
+    }
+    symmetric_buffer[0][i] = workspace[i]->buffer + BUFFER0_OFFSET(0);
+    symmetric_buffer[1][i] = workspace[i]->buffer + BUFFER0_OFFSET(1);
+    distributed_buffer[0][i] = workspace[i]->buffer + BUFFER1_OFFSET(0);
+    distributed_buffer[1][i] = workspace[i]->buffer + BUFFER1_OFFSET(1);
+  }
+}
+
+static void parallel_memcpy(void* to, void* from, size_t n_bytes) __attribute__((target("avx512bw")));
+static void parallel_memcpy(void* to, void* from, size_t n_bytes) {
+  auto aligned_bytes = n_bytes - (n_bytes % VECTOR_LENGTH_IN_BYTES);
+  // process aligned part
+#pragma omp parallel for
+  for (size_t i = 0; i < aligned_bytes; i += VECTOR_LENGTH_IN_BYTES) {
+    auto val = _mm256_loadu_si256((__m256i*)((char*)from + i));
+    _mm256_storeu_si256((__m256i*)((char*)to + i), val);
+  }
+
+  // process remaining part
+  for (size_t i = aligned_bytes; i < n_bytes; i++) {
+    *((char*)to + i) = *((char*)from + i);
+  }
+}
+
+#define positive_mod(num, mod) ((((num) % (mod)) + (mod)) % (mod))
+#define rank_mod(rank) positive_mod(rank, world_size)
+size_t slice_size(size_t chunk_el, int slice_idx) {
+  size_t slice_size = chunk_el / world_size;
+  return slice_idx == world_size - 1 ? slice_size + (chunk_el % world_size) : slice_size;
+}
+
+char* slice_data(char* data_ptr, size_t chunk_el, int el_size, int slice_idx) {
+  size_t slice_size = chunk_el / world_size;
+  size_t el_offset = slice_size * slice_idx;
+  return data_ptr + el_offset * el_size;
+}
+
+size_t slice_el_start(size_t chunk_el, int slice_idx) {
+  size_t slice_size = chunk_el / world_size;
+  return slice_size * slice_idx;
+}
+
+void symmetric_naive_all_reduce(char* data_ptr, c10::ScalarType scalar_type, size_t chunk_size, size_t chunk_el) {
+  const int state_group = 0;
+  static int current_buffer = 0;
+  static int state_idx = 0;
+
+  // init states to case 0 to get rid of "maybe-uninitialized" warning.
+  enum coll_state copy_current = coll_allreduce_naive__copy_in_done;
+  enum coll_state copy_next = coll_alt1_allreduce_naive__copy_in_done;
+
+  switch (state_idx) {
+    case 0:
+      copy_current = coll_allreduce_naive__copy_in_done;
+      copy_next = coll_alt1_allreduce_naive__copy_in_done;
+      break;
+    case 1:
+      copy_current = coll_alt1_allreduce_naive__copy_in_done;
+      copy_next = coll_alt2_allreduce_naive__copy_in_done;
+      break;
+    case 2:
+      copy_current = coll_alt2_allreduce_naive__copy_in_done;
+      copy_next = coll_allreduce_naive__copy_in_done;
+      break;
+    default:
+      assert(!"Should not get here.");
+  }
+  state_idx = (state_idx + 1) % 3;
+
+  parallel_memcpy(symmetric_buffer[current_buffer][world_rank], data_ptr, chunk_size);
+  std::atomic_thread_fence(std::memory_order_release);
+  workspace[world_rank]->states[state_group] = copy_current;
+
+  for (int i = 0; i < world_size; i++) {
+    // wait until the other rank copy the buffer
+    if (i != world_rank) {
+      wait_buffer_state_until_2(i, copy_current, copy_next, state_group);
+    }
+  }
+
+  // each rank reduce the buffer independently so therre is no need for
+  // synchronization afterward
+  reduce_all_buffers(0, chunk_el, scalar_type, world_rank, data_ptr, symmetric_buffer[current_buffer]);
+
+  // switch buffer
+  current_buffer = 1 - current_buffer;
+}
+
+// naive allreduce distributed, each rank do naive reduce on its slice
+void distributed_naive_reduce(char* data_ptr, c10::ScalarType scalar_type, size_t chunk_size, size_t chunk_el) {
+  const int state_group = 1;
+  static int current_buffer = 0;
+  static int state_idx = 0;
+
+  // init states to case 0 to get rid of "maybe-uninitialized" warning.
+  enum coll_state copy_current = coll_allreduce_naive__copy_in_done;
+  enum coll_state reduce_current = coll_allreduce_naive__reduce_done;
+  enum coll_state copy_next = coll_alt1_allreduce_naive__copy_in_done;
+
+  // similar to symmetric_naive_allreduce, but here we only need two sets of
+  // states, because distributed naive reduce has two barriers in the algorithm
+  switch (state_idx) {
+    case 0:
+      copy_current = coll_allreduce_naive__copy_in_done;
+      reduce_current = coll_allreduce_naive__reduce_done;
+      copy_next = coll_alt1_allreduce_naive__copy_in_done;
+      break;
+    case 1:
+      copy_current = coll_alt1_allreduce_naive__copy_in_done;
+      reduce_current = coll_alt1_allreduce_naive__reduce_done;
+      copy_next = coll_allreduce_naive__copy_in_done;
+      break;
+    default:
+      assert(!"Should not get here.");
+  }
+  state_idx = (state_idx + 1) % 2;
+
+  int data_size = chunk_size / chunk_el;
+  parallel_memcpy(distributed_buffer[current_buffer][world_rank], data_ptr, chunk_size);
+  std::atomic_thread_fence(std::memory_order_release);
+  workspace[world_rank]->states[state_group] = copy_current;
+
+  for (int i = 0; i < world_size; i++) {
+    // wait until all the other ranks copy the buffer
+    if (i != world_rank) wait_buffer_state_until_2(i, copy_current, reduce_current, state_group);
+  }
+
+  // reduce scatter
+  reduce_all_buffers(
+      slice_el_start(chunk_el, world_rank),
+      slice_size(chunk_el, world_rank),
+      scalar_type,
+      world_rank,
+      distributed_buffer[current_buffer][world_rank],
+      distributed_buffer[current_buffer]);
+  std::atomic_thread_fence(std::memory_order_release);
+  workspace[world_rank]->states[state_group] = reduce_current;
+
+  for (int i = 0; i < world_size; i++) {
+    // wait until all the other ranks reduce the buffer
+    if (i != world_rank) wait_buffer_state_until_2(i, reduce_current, copy_next, state_group);
+  }
+
+  for (int i = 0; i < world_size; i++) {
+    int rank = (i + world_rank) % world_size;
+    parallel_memcpy(
+        slice_data(data_ptr, chunk_el, data_size, rank),
+        slice_data(distributed_buffer[current_buffer][rank], chunk_el, chunk_size / chunk_el, rank),
+        slice_size(chunk_el, rank) * data_size);
+  }
+
+  current_buffer = 1 - current_buffer;
+}
+
+void all_reduce_outer_loop(torch::Tensor& data, size_t numel, int data_size) {
+  for (int offset = 0; offset < data_size; offset += MAX_BUF_SIZE) {
+    auto data_ptr = ((char*)(data.data_ptr()) + offset);
+    size_t chunk_size = data_size - offset > MAX_BUF_SIZE ? MAX_BUF_SIZE : data_size - offset;
+    size_t chunk_el = chunk_size / (data_size / numel);
+    if (chunk_size < NAIVE_ALLREDUCE_THRESHOLD) {
+      symmetric_naive_all_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el);
+    } else {
+      distributed_naive_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el);
+    }
+  }
+}
+
+void naive_all_gather(char* result_ptr, char* data_ptr, size_t res_stride, size_t chunk_size, size_t chunk_el) {
+  const int state_group = 1;
+  static int current_buffer = 0;
+  static int state_idx = 0;
+
+  // init states to case 0 to get rid of "maybe-uninitialized" warning.
+  enum coll_state copy_current = coll_allgather_naive__copy_in_done;
+  enum coll_state copy_next = coll_alt1_allgather_naive__copy_in_done;
+
+  switch (state_idx) {
+    case 0:
+      copy_current = coll_allgather_naive__copy_in_done;
+      copy_next = coll_alt1_allgather_naive__copy_in_done;
+      break;
+    case 1:
+      copy_current = coll_alt1_allgather_naive__copy_in_done;
+      copy_next = coll_alt2_allgather_naive__copy_in_done;
+      break;
+    case 2:
+      copy_current = coll_alt2_allgather_naive__copy_in_done;
+      copy_next = coll_allgather_naive__copy_in_done;
+      break;
+    default:
+      assert(!"Should not get here.");
+  }
+  state_idx = (state_idx + 1) % 3;
+
+  parallel_memcpy(distributed_buffer[current_buffer][world_rank], data_ptr, chunk_size);
+  std::atomic_thread_fence(std::memory_order_release);
+  workspace[world_rank]->states[state_group] = copy_current;
+
+  for (int i = 0; i < world_size; i++) {
+    // wait until all the other ranks copy the buffer
+    if (i != world_rank) wait_buffer_state_until_2(i, copy_current, copy_next, state_group);
+  }
+  for (int i = 0; i < world_size; i++) {
+    parallel_memcpy(result_ptr + i * res_stride, distributed_buffer[current_buffer][i], chunk_size);
+  }
+  current_buffer = 1 - current_buffer;
+}
+
+torch::Tensor& all_gather(torch::Tensor& result, torch::Tensor& data, int dim, size_t numel, int data_size) {
+  size_t dim_el = data.stride(dim) * data.size(dim);
+  int dtype_size = data_size / numel;
+  size_t dim_size = dim_el * dtype_size;
+  int dim_count = data_size / dim_size;
+  auto data_ptr = (char*)(data.data_ptr());
+  auto result_ptr = (char*)(result.data_ptr());
+  for (int i = 0; i < dim_count; i++) {
+    for (size_t offset = 0; offset < dim_size; offset += MAX_BUF_SIZE) {
+      size_t chunk_size = dim_size - offset > MAX_BUF_SIZE ? MAX_BUF_SIZE : dim_size - offset;
+      size_t chunk_el = chunk_size / dtype_size;
+      naive_all_gather(
+          result_ptr + i * dim_size * world_size + offset,
+          data_ptr + i * dim_size + offset,
+          dim_size,
+          chunk_size,
+          chunk_el);
+    }
+  }
+  return result;
+}
diff --git a/sgl-kernel/csrc/cpu/shm.h b/sgl-kernel/csrc/cpu/shm.h
new file mode 100644
index 000000000..3e903972c
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/shm.h
@@ -0,0 +1,11 @@
+#include <torch/all.h>
+
+#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
+
+#ifndef __SHM_COLLECTIVES__
+#define __SHM_COLLECTIVES__
+#define VECTOR_LENGTH_IN_BYTES 32
+void shm_initialize(int size, int rank, const char* addr_string, const char* port_string);
+void all_reduce_outer_loop(torch::Tensor& data, size_t numel, int data_size);
+torch::Tensor& all_gather(torch::Tensor& result, torch::Tensor& data, int dim, size_t numel, int data_size);
+#endif
diff --git a/sgl-kernel/csrc/cpu/topk.cpp b/sgl-kernel/csrc/cpu/topk.cpp
new file mode 100644
index 000000000..abc5a34fa
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/topk.cpp
@@ -0,0 +1,662 @@
+#include "common.h"
+#include "vec.h"
+
+namespace {
+
+template <typename scalar_t, int SIZE>
+inline void softmax(float* __restrict__ out, const scalar_t* __restrict__ input) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  constexpr int kVecSize = bVec::size();
+
+  // step 1: get max
+  fVec max_fvec = fVec(-std::numeric_limits<float>::infinity());
+  if constexpr (SIZE < kVecSize) {
+    // SIZE = 1, 2, 4, 8, 16; only the top half is used
+    bVec x_bvec = bVec::loadu(input, SIZE);
+    fVec x_fvec0, x_fvec1;
+    std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+    x_fvec0 = fVec::set(max_fvec, x_fvec0, SIZE);
+    max_fvec = at::vec::maximum(max_fvec, x_fvec0);
+    x_fvec0.store(out, SIZE);
+  } else {
+    for (int d = 0; d < SIZE; d += kVecSize) {
+      bVec x_bvec = bVec::loadu(input + d);
+      fVec x_fvec0, x_fvec1;
+      std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+      max_fvec = at::vec::maximum(max_fvec, x_fvec0);
+      max_fvec = at::vec::maximum(max_fvec, x_fvec1);
+      x_fvec0.store(out + d);
+      x_fvec1.store(out + d + fVec::size());
+    }
+  }
+  float max_val = vec_reduce_max(max_fvec);
+  max_fvec = fVec(max_val);
+
+  // step 2: sum of (x - max).exp()
+  fVec sum_fvec = fVec(float(0));
+  if constexpr (SIZE < fVec::size()) {
+    // SIZE = 1, 2, 4, 8
+    fVec x_fvec = (fVec::loadu(out, SIZE) - max_fvec).exp_u20();
+    x_fvec = fVec::set(sum_fvec, x_fvec, SIZE);
+    sum_fvec += x_fvec;
+    x_fvec.store(out, SIZE);
+  } else {
+    for (int d = 0; d < SIZE; d += fVec::size()) {
+      fVec x_fvec = (fVec::loadu(out + d) - max_fvec).exp_u20();
+      sum_fvec += x_fvec;
+      x_fvec.store(out + d);
+    }
+  }
+  float sum_val = vec_reduce_sum(sum_fvec);
+
+  // step 3: x * (1 / sum)
+  sum_fvec = fVec(1.f / sum_val);
+  if constexpr (SIZE < fVec::size()) {
+    // SIZE = 1, 2, 4, 8
+    fVec out_fvec = fVec::loadu(out, SIZE) * sum_fvec;
+    out_fvec.store(out, SIZE);
+  } else {
+    for (int d = 0; d < SIZE; d += fVec::size()) {
+      fVec out_fvec = fVec::loadu(out + d) * sum_fvec;
+      out_fvec.store(out + d);
+    }
+  }
+}
+
+template <typename scalar_t, int NUM_EXPERTS>
+void grouped_topk_kernel_impl(
+    float* __restrict__ topk_weights,
+    int32_t* __restrict__ topk_ids,
+    const scalar_t* __restrict__ gating_output,
+    int64_t num_tokens,
+    int64_t topk,
+    int64_t num_groups,
+    int64_t topk_group,
+    bool renormalize) {
+  const int64_t num_experts_per_group = NUM_EXPERTS / num_groups;
+  at::parallel_for(0, num_tokens, 0, [&](int64_t begin, int64_t end) {
+    alignas(64) float scores[NUM_EXPERTS];
+
+    using elem_t = std::pair<float, int32_t>;
+    std::vector<elem_t> queue(num_groups);
+    std::vector<elem_t> queue2(topk_group * num_experts_per_group);
+
+    for (int64_t i = begin; i < end; ++i) {
+      // do softmax to get scores
+      softmax<scalar_t, NUM_EXPERTS>(scores, gating_output + i * NUM_EXPERTS);
+
+      // find max score per group
+      for (int64_t g = 0; g < num_groups; ++g) {
+        float gmax = -std::numeric_limits<float>::infinity();
+        for (int64_t e = 0; e < num_experts_per_group; ++e) {
+          gmax = std::max(gmax, scores[g * num_experts_per_group + e]);
+        }
+        queue[g] = {gmax, g};
+      }
+
+      // find group topk
+      std::partial_sort(
+          queue.begin(), queue.begin() + topk_group, queue.end(), [](const elem_t& x, const elem_t& y) -> bool {
+            return x.first > y.first;
+          });
+
+      for (int64_t g = 0; g < topk_group; ++g) {
+        int32_t group_idx = queue[g].second;
+        for (int64_t e = 0; e < num_experts_per_group; ++e) {
+          int32_t expert_idx = group_idx * num_experts_per_group + e;
+          queue2[g * num_experts_per_group + e] = {scores[expert_idx], expert_idx};
+        }
+      }
+
+      // find global topk
+      std::partial_sort(
+          queue2.begin(), queue2.begin() + topk, queue2.end(), [](const elem_t& x, const elem_t& y) -> bool {
+            return x.first > y.first;
+          });
+
+      for (int64_t j = 0; j < topk; ++j) {
+        topk_weights[i * topk + j] = queue2[j].first;
+        topk_ids[i * topk + j] = queue2[j].second;
+      }
+
+      if (renormalize) {
+        float sum = 0.f;
+        for (int64_t j = 0; j < topk; ++j) {
+          sum += topk_weights[i * topk + j];
+        }
+        float scale = 1.f / sum;
+        for (int64_t j = 0; j < topk; ++j) {
+          topk_weights[i * topk + j] *= scale;
+        }
+      }
+    }
+  });
+}
+
+template <typename scalar_t, int SIZE>
+inline void sigmoid(float* __restrict__ out, const scalar_t* __restrict__ input) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+
+  const fVec one = fVec(1.f);
+
+  constexpr int kVecSize = bVec::size();
+  for (int d = 0; d < SIZE; d += kVecSize) {
+    bVec x_bvec = bVec::loadu(input + d);
+    fVec x_fvec0, x_fvec1;
+    std::tie(x_fvec0, x_fvec1) = at::vec::convert_to_float(x_bvec);
+
+    x_fvec0 = one / (one + x_fvec0.neg().exp_u20());
+    x_fvec1 = one / (one + x_fvec1.neg().exp_u20());
+
+    x_fvec0.store(out + d);
+    x_fvec1.store(out + d + fVec::size());
+  }
+}
+
+template <typename scalar_t, int NUM_EXPERTS>
+void topk_sigmoid_kernel_impl(
+    float* __restrict__ topk_weights,
+    int32_t* __restrict__ topk_ids,
+    const scalar_t* __restrict__ gating_output,
+    int64_t num_tokens,
+    int64_t topk,
+    bool renormalize) {
+  using Vec = at::vec::Vectorized<float>;
+  const int64_t num_experts_per_group = NUM_EXPERTS;
+  at::parallel_for(0, num_tokens, 0, [&](int64_t begin, int64_t end) {
+    alignas(64) float scores[NUM_EXPERTS];
+    using elem_t = std::pair<float, int32_t>;
+    std::vector<elem_t> queue(num_experts_per_group);
+
+    for (int64_t i = begin; i < end; ++i) {
+      at::vec::convert<scalar_t, float>(gating_output + i * NUM_EXPERTS, scores, NUM_EXPERTS);
+
+      float gmax = at::vec::reduce_all<float>(
+          [](Vec& x, Vec& y) { return at::vec::maximum(x, y); }, scores, num_experts_per_group);
+
+      // find position of first max,
+      // note that we may have multiple max values.
+      int first_max_idx = -1;
+      for (int64_t e = 0; e < num_experts_per_group; ++e) {
+        if (scores[e] == gmax) {
+          first_max_idx = e;
+          break;
+        }
+      }
+
+      // scalar sigmoid
+      topk_weights[i] = 1.0 / (1.0 + exp(0.0 - gmax));
+      topk_ids[i] = first_max_idx;
+
+      if (renormalize) {
+        float sum = 0.f;
+        for (int64_t j = 0; j < topk; ++j) {
+          sum += topk_weights[i * topk + j];
+        }
+        float scale = 1.f / sum;
+        for (int64_t j = 0; j < topk; ++j) {
+          topk_weights[i * topk + j] *= scale;
+        }
+      }
+    }
+  });
+}
+
+template <typename scalar_t, int NUM_EXPERTS>
+void topk_softmax_kernel_impl(
+    float* __restrict__ topk_weights,
+    int32_t* __restrict__ topk_ids,
+    const scalar_t* __restrict__ gating_output,
+    int64_t num_tokens,
+    int64_t topk,
+    bool renormalize) {
+  const int64_t num_experts_per_group = NUM_EXPERTS;
+  at::parallel_for(0, num_tokens, 0, [&](int64_t begin, int64_t end) {
+    alignas(64) float scores[NUM_EXPERTS];
+    using elem_t = std::pair<float, int32_t>;
+    std::vector<elem_t> queue(num_experts_per_group);
+
+    for (int64_t i = begin; i < end; ++i) {
+      softmax<scalar_t, NUM_EXPERTS>(scores, gating_output + i * NUM_EXPERTS);
+
+      for (int64_t e = 0; e < num_experts_per_group; ++e) {
+        queue[e] = {scores[e], e};
+      }
+
+      std::partial_sort(
+          queue.begin(),
+          queue.begin() + num_experts_per_group,
+          queue.end(),
+          [](const elem_t& x, const elem_t& y) -> bool { return x.first > y.first; });
+
+      for (int64_t j = 0; j < topk; ++j) {
+        topk_weights[i * topk + j] = queue[j].first;
+        topk_ids[i * topk + j] = queue[j].second;
+      }
+
+      if (renormalize) {
+        float sum = 0.f;
+        for (int64_t j = 0; j < topk; ++j) {
+          sum += topk_weights[i * topk + j];
+        }
+        float scale = 1.f / sum;
+        for (int64_t j = 0; j < topk; ++j) {
+          topk_weights[i * topk + j] *= scale;
+        }
+      }
+    }
+  });
+}
+
+template <typename scalar_t, typename param_t, int SIZE>
+inline void
+apply_bias(float* __restrict__ scores2, const float* __restrict__ scores, const param_t* __restrict__ bias) {
+  using fVec = at::vec::Vectorized<float>;
+  using bVec = at::vec::Vectorized<scalar_t>;
+  auto vec_size = bVec::size();
+  int d = 0;
+  for (; d <= SIZE - vec_size; d += vec_size) {
+    fVec bias0, bias1, x0, x1;
+    std::tie(bias0, bias1) = load_float_vec2(bias + d);
+    std::tie(x0, x1) = load_float_vec2(scores + d);
+    x0 = x0 + bias0;
+    x1 = x1 + bias1;
+    x0.store(scores2 + d);
+    x1.store(scores2 + d + fVec::size());
+  }
+  for (; d < SIZE; d++) {
+    scores2[d] = scores[d] + (float)bias[d];
+  }
+}
+
+template <typename scalar_t, typename param_t, int NUM_EXPERTS, int TOPK>
+void biased_grouped_topk_kernel_impl(
+    float* __restrict__ topk_weights,
+    int32_t* __restrict__ topk_ids,
+    const scalar_t* __restrict__ gating_output,
+    const param_t* __restrict__ bias,
+    int64_t num_tokens,
+    int64_t num_groups,
+    int64_t topk_group,
+    bool renormalize) {
+  using Vec = at::vec::Vectorized<float>;
+
+  const int64_t num_experts_per_group = NUM_EXPERTS / num_groups;
+  at::parallel_for(0, num_tokens, 0, [&](int64_t begin, int64_t end) {
+    // scores: sigmoid
+    alignas(64) float scores[NUM_EXPERTS];
+    // scores for choice: sigmoid + bias
+    alignas(64) float scores2[NUM_EXPERTS];
+
+    using elem_t = std::pair<float, int32_t>;
+    std::vector<elem_t> queue(num_groups);
+    std::vector<elem_t> queue2(topk_group * num_experts_per_group);
+
+    for (int64_t i = begin; i < end; ++i) {
+      // do sigmoid to get scores
+      sigmoid<scalar_t, NUM_EXPERTS>(scores, gating_output + i * NUM_EXPERTS);
+
+      apply_bias<scalar_t, param_t, NUM_EXPERTS>(scores2, scores, bias);
+
+      for (int64_t g = 0; g < num_groups; ++g) {
+        // find the max
+        float gmax = at::vec::reduce_all<float>(
+            [](Vec& x, Vec& y) { return at::vec::maximum(x, y); },
+            scores2 + g * num_experts_per_group,
+            num_experts_per_group);
+
+        // find position of first max,
+        // note that we may have multiple max values.
+        int first_max_idx = -1;
+        for (int64_t e = 0; e < num_experts_per_group; ++e) {
+          if (scores2[g * num_experts_per_group + e] == gmax) {
+            first_max_idx = g * num_experts_per_group + e;
+            break;
+          }
+        }
+
+        // find the 2nd max
+        scores2[first_max_idx] = -std::numeric_limits<float>::infinity();
+        float gmax2 = at::vec::reduce_all<float>(
+            [](Vec& x, Vec& y) { return at::vec::maximum(x, y); },
+            scores2 + g * num_experts_per_group,
+            num_experts_per_group);
+        // restore scores for choice
+        scores2[first_max_idx] = gmax;
+
+        queue[g] = {gmax + gmax2, g};
+      }
+
+      // find group topk
+      std::partial_sort(
+          queue.begin(), queue.begin() + topk_group, queue.end(), [](const elem_t& x, const elem_t& y) -> bool {
+            return x.first > y.first;
+          });
+
+      for (int64_t g = 0; g < topk_group; ++g) {
+        int32_t group_idx = queue[g].second;
+        for (int64_t e = 0; e < num_experts_per_group; ++e) {
+          int32_t expert_idx = group_idx * num_experts_per_group + e;
+          queue2[g * num_experts_per_group + e] = {scores2[expert_idx], expert_idx};
+        }
+      }
+
+      // find global topk
+      std::partial_sort(
+          queue2.begin(), queue2.begin() + TOPK, queue2.end(), [](const elem_t& x, const elem_t& y) -> bool {
+            return x.first > y.first;
+          });
+
+      for (int j = 0; j < TOPK; ++j) {
+        int32_t index = queue2[j].second;
+        topk_ids[i * TOPK + j] = index;
+        topk_weights[i * TOPK + j] = scores[index];
+      }
+
+#if defined(CPU_CAPABILITY_AVX512)
+      if (renormalize) {
+        __mmask16 mask = (1ULL << TOPK) - 1;
+        __m512 x = _mm512_maskz_loadu_ps(mask, topk_weights + i * TOPK);
+        float sum = _mm512_reduce_add_ps(x);
+        __m512 vscale = _mm512_set1_ps(1.f / sum);
+        __m512 y = _mm512_mul_ps(x, vscale);
+        _mm512_mask_storeu_ps(topk_weights + i * TOPK, mask, y);
+      }
+#else
+      if (renormalize) {
+        float sum = 0.f;
+        for (int64_t j = 0; j < TOPK; ++j) {
+          sum += topk_weights[i * TOPK + j];
+        }
+        float scale = 1.f / sum;
+        for (int64_t j = 0; j < TOPK; ++j) {
+          topk_weights[i * TOPK + j] *= scale;
+        }
+      }
+#endif
+    }
+  });
+}
+
+#define LAUNCH_GROUPED_TOPK_KERNEL(NE)    \
+  grouped_topk_kernel_impl<scalar_t, NE>( \
+      topk_weights.data_ptr<float>(),     \
+      topk_ids.data_ptr<int32_t>(),       \
+      gating_output.data_ptr<scalar_t>(), \
+      num_tokens,                         \
+      topk,                               \
+      num_expert_group,                   \
+      topk_group,                         \
+      renormalize);
+
+#define LAUNCH_TOPK_SIGMOID_KERNEL(NE)    \
+  topk_sigmoid_kernel_impl<scalar_t, NE>( \
+      topk_weights.data_ptr<float>(),     \
+      topk_ids.data_ptr<int32_t>(),       \
+      gating_output.data_ptr<scalar_t>(), \
+      num_tokens,                         \
+      topk,                               \
+      renormalize);
+
+#define LAUNCH_TOPK_SOFTMAX_KERNEL(NE)    \
+  topk_softmax_kernel_impl<scalar_t, NE>( \
+      topk_weights.data_ptr<float>(),     \
+      topk_ids.data_ptr<int32_t>(),       \
+      gating_output.data_ptr<scalar_t>(), \
+      num_tokens,                         \
+      topk,                               \
+      renormalize);
+
+#define LAUNCH_BIASED_GROUPED_TOPK_KERNEL(NE, NTOPK)             \
+  biased_grouped_topk_kernel_impl<scalar_t, param_t, NE, NTOPK>( \
+      topk_weights.data_ptr<float>(),                            \
+      topk_ids.data_ptr<int32_t>(),                              \
+      gating_output.data_ptr<scalar_t>(),                        \
+      correction_bias.data_ptr<param_t>(),                       \
+      num_tokens,                                                \
+      num_expert_group,                                          \
+      topk_group,                                                \
+      renormalize);
+
+}  // anonymous namespace
+
+std::tuple<at::Tensor, at::Tensor>
+topk_sigmoid_cpu(at::Tensor& hidden_states, at::Tensor& gating_output, int64_t topk, bool renormalize) {
+  RECORD_FUNCTION("sgl-kernel::topk_sigmoid_cpu", std::vector<c10::IValue>({hidden_states, gating_output}));
+  CHECK_INPUT(gating_output);
+
+  const auto st = hidden_states.scalar_type();
+  CHECK_EQ(gating_output.scalar_type(), st);
+
+  int64_t num_tokens = hidden_states.size(0);
+  int64_t num_experts = gating_output.size(1);
+  TORCH_CHECK(gating_output.size(0) == num_tokens, "Number of tokens mismatch");
+  TORCH_CHECK(topk == 1, "topk_sigmoid only supports topk=1 case");
+  at::Tensor topk_weights = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kFloat));
+  at::Tensor topk_ids = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kInt));
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "topk_sigmoid_kernel", [&] {
+    switch (num_experts) {
+      case 1:
+        LAUNCH_TOPK_SIGMOID_KERNEL(1);
+        break;
+      case 2:
+        LAUNCH_TOPK_SIGMOID_KERNEL(2);
+        break;
+      case 4:
+        LAUNCH_TOPK_SIGMOID_KERNEL(4);
+        break;
+      case 8:
+        LAUNCH_TOPK_SIGMOID_KERNEL(8);
+        break;
+      case 16:
+        LAUNCH_TOPK_SIGMOID_KERNEL(16);
+        break;
+      case 32:
+        LAUNCH_TOPK_SIGMOID_KERNEL(32);
+        break;
+      case 64:
+        LAUNCH_TOPK_SIGMOID_KERNEL(64);
+        break;
+      case 128:
+        LAUNCH_TOPK_SIGMOID_KERNEL(128);
+        break;
+      case 160:
+        LAUNCH_TOPK_SIGMOID_KERNEL(160);
+        break;
+      case 256:
+        LAUNCH_TOPK_SIGMOID_KERNEL(256);
+        break;
+      default:
+        TORCH_CHECK(false, "Unexpected num_experts: ", num_experts);
+    }
+  });
+  return std::make_tuple(topk_weights, topk_ids);
+}
+
+std::tuple<at::Tensor, at::Tensor>
+topk_softmax_cpu(at::Tensor& hidden_states, at::Tensor& gating_output, int64_t topk, bool renormalize) {
+  RECORD_FUNCTION("sgl-kernel::topk_softmax_cpu", std::vector<c10::IValue>({hidden_states, gating_output}));
+  CHECK_INPUT(gating_output);
+
+  const auto st = hidden_states.scalar_type();
+  CHECK_EQ(gating_output.scalar_type(), st);
+
+  int64_t num_tokens = hidden_states.size(0);
+  int64_t num_experts = gating_output.size(1);
+  TORCH_CHECK(gating_output.size(0) == num_tokens, "Number of tokens mismatch");
+
+  at::Tensor topk_weights = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kFloat));
+  at::Tensor topk_ids = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kInt));
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "topk_softmax_cpu", [&] {
+    switch (num_experts) {
+      case 1:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(1);
+        break;
+      case 2:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(2);
+        break;
+      case 4:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(4);
+        break;
+      case 8:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(8);
+        break;
+      case 16:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(16);
+        break;
+      case 32:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(32);
+        break;
+      case 64:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(64);
+        break;
+      case 128:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(128);
+        break;
+      case 160:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(160);
+        break;
+      case 256:
+        LAUNCH_TOPK_SOFTMAX_KERNEL(256);
+        break;
+      default:
+        TORCH_CHECK(false, "Unexpected num_experts: ", num_experts);
+    }
+  });
+  return std::make_tuple(topk_weights, topk_ids);
+}
+
+// grouped topk for DeepSeek V2
+std::tuple<at::Tensor, at::Tensor> grouped_topk_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& gating_output,
+    int64_t topk,
+    bool renormalize,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded) {
+  // TODO: Will support num_fused_shared_experts, routed_scaling_factor and num_token_non_padded.
+  // For now, we just check them as default value.
+  TORCH_CHECK(
+      num_fused_shared_experts == 0,
+      "num_fused_shared_experts must be 0 default value, got: ",
+      num_fused_shared_experts);
+  TORCH_CHECK(
+      !routed_scaling_factor.has_value() || routed_scaling_factor.value() == 1.0f,
+      "routed_scaling_factor must be None or 1.0f default value, got: ",
+      routed_scaling_factor.value());
+  TORCH_CHECK(
+      !num_token_non_padded.has_value(),
+      "num_token_non_padded must be None default value, got: ",
+      num_token_non_padded.value());
+
+  RECORD_FUNCTION("sgl-kernel::grouped_topk_cpu", std::vector<c10::IValue>({hidden_states, gating_output}));
+  CHECK_INPUT(gating_output);
+
+  const auto st = hidden_states.scalar_type();
+  CHECK_EQ(gating_output.scalar_type(), st);
+
+  int64_t num_tokens = hidden_states.size(0);
+  int64_t num_experts = gating_output.size(1);
+  TORCH_CHECK(gating_output.size(0) == num_tokens, "Number of tokens mismatch");
+  at::Tensor topk_weights = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kFloat));
+  at::Tensor topk_ids = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kInt));
+
+  AT_DISPATCH_REDUCED_FLOATING_TYPES(st, "grouped_topk_kernel", [&] {
+    switch (num_experts) {
+      case 1:
+        LAUNCH_GROUPED_TOPK_KERNEL(1);
+        break;
+      case 2:
+        LAUNCH_GROUPED_TOPK_KERNEL(2);
+        break;
+      case 4:
+        LAUNCH_GROUPED_TOPK_KERNEL(4);
+        break;
+      case 8:
+        LAUNCH_GROUPED_TOPK_KERNEL(8);
+        break;
+      case 16:
+        LAUNCH_GROUPED_TOPK_KERNEL(16);
+        break;
+      case 32:
+        LAUNCH_GROUPED_TOPK_KERNEL(32);
+        break;
+      case 64:
+        LAUNCH_GROUPED_TOPK_KERNEL(64);
+        break;
+      case 128:
+        LAUNCH_GROUPED_TOPK_KERNEL(128);
+        break;
+      case 160:
+        LAUNCH_GROUPED_TOPK_KERNEL(160);
+        break;
+      case 256:
+        LAUNCH_GROUPED_TOPK_KERNEL(256);
+        break;
+      default:
+        TORCH_CHECK(false, "Unexpected num_experts: ", num_experts);
+    }
+  });
+  return std::make_tuple(topk_weights, topk_ids);
+}
+
+// biased grouped topk DeepSeek V3/R1
+std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& gating_output,
+    at::Tensor& correction_bias,
+    int64_t topk,
+    bool renormalize,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded) {
+  // TODO: Will support num_fused_shared_experts, routed_scaling_factor and num_token_non_padded.
+  // For now, we just check them as default value.
+  TORCH_CHECK(
+      num_fused_shared_experts == 0,
+      "num_fused_shared_experts must be 0 default value, got: ",
+      num_fused_shared_experts);
+  TORCH_CHECK(
+      !num_token_non_padded.has_value(),
+      "num_token_non_padded must be None default value, got: ",
+      num_token_non_padded.value());
+
+  RECORD_FUNCTION(
+      "sgl-kernel::biased_grouped_topk_cpu", std::vector<c10::IValue>({hidden_states, gating_output, correction_bias}));
+
+  CHECK_INPUT(gating_output);
+  CHECK_INPUT(correction_bias);
+
+  const auto st = hidden_states.scalar_type();
+  CHECK_EQ(gating_output.scalar_type(), st);
+
+  int64_t num_tokens = hidden_states.size(0);
+  int64_t num_experts = gating_output.size(1);
+  TORCH_CHECK(gating_output.size(0) == num_tokens, "Number of tokens mismatch");
+  TORCH_CHECK(correction_bias.numel() == num_experts, "Bias shape mismatch");
+  at::Tensor topk_weights = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kFloat));
+  at::Tensor topk_ids = at::empty({num_tokens, topk}, hidden_states.options().dtype(at::kInt));
+
+  CPU_DISPATCH_REDUCED_FLOATING_TYPES_EXT(st, correction_bias.scalar_type(), "biased_grouped_topk_kernel", [&] {
+    TORCH_CHECK(topk == 8, "Unexpected topk: ", topk);
+    switch (num_experts) {
+      case 256:
+        LAUNCH_BIASED_GROUPED_TOPK_KERNEL(256, 8);
+        break;
+      default:
+        TORCH_CHECK(false, "Unexpected num_experts: ", num_experts);
+    }
+  });
+  return std::make_tuple(topk_weights, topk_ids);
+}
diff --git a/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp b/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp
new file mode 100644
index 000000000..2c8d9e3ec
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/torch_extension_cpu.cpp
@@ -0,0 +1,373 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/ATen.h>
+#include <torch/all.h>
+#include <torch/library.h>
+
+#include "sgl_kernel_ops.h"
+#include "shm.h"
+
+// silu_and_mul
+at::Tensor silu_and_mul_cpu(at::Tensor& input);
+
+// gelu_and_mul
+at::Tensor gelu_tanh_and_mul_cpu(const at::Tensor& input);
+at::Tensor gelu_and_mul_cpu(const at::Tensor& input);
+
+// l2norm
+at::Tensor l2norm_cpu(at::Tensor& input, double eps);
+
+// rmsnorm
+at::Tensor rmsnorm_cpu(at::Tensor& input, at::Tensor& weight, double eps);
+
+// fused_add_rmsnorm
+void fused_add_rmsnorm_cpu(at::Tensor& input, at::Tensor& residual, at::Tensor& weight, double eps);
+
+// topk
+std::tuple<at::Tensor, at::Tensor>
+topk_sigmoid_cpu(at::Tensor& hidden_states, at::Tensor& gating_output, int64_t topk, bool renormalize);
+std::tuple<at::Tensor, at::Tensor>
+topk_softmax_cpu(at::Tensor& hidden_states, at::Tensor& gating_output, int64_t topk, bool renormalize);
+
+std::tuple<at::Tensor, at::Tensor> grouped_topk_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& gating_output,
+    int64_t topk,
+    bool renormalize,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded);
+
+std::tuple<at::Tensor, at::Tensor> biased_grouped_topk_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& gating_output,
+    at::Tensor& correction_bias,
+    int64_t topk,
+    bool renormalize,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t num_fused_shared_experts,
+    std::optional<double> routed_scaling_factor,
+    std::optional<at::Tensor> num_token_non_padded);
+
+// attention
+void decode_attention_cpu(
+    at::Tensor& query,
+    at::Tensor& k_cache,
+    at::Tensor& v_cache,
+    at::Tensor& output,
+    at::Tensor& key,
+    at::Tensor& value,
+    at::Tensor& loc,
+    at::Tensor& attn_logits,
+    at::Tensor& req_to_token,
+    at::Tensor& req_pool_indices,
+    at::Tensor& seq_lens,
+    double sm_scale,
+    double logit_cap);
+
+void extend_attention_cpu(
+    at::Tensor& q_extend,
+    at::Tensor& k_extend,
+    at::Tensor& v_extend,
+    at::Tensor& o_extend,
+    at::Tensor& k_buffer,
+    at::Tensor& v_buffer,
+    at::Tensor& req_to_token,
+    at::Tensor& req_pool_indices,
+    at::Tensor& seq_lens,
+    at::Tensor& extend_seq_lens,
+    at::Tensor& extend_start_loc,
+    int64_t max_len_extend,
+    double sm_scale,
+    double logit_cap);
+
+// weight prepack
+at::Tensor convert_weight_packed(at::Tensor& weight);
+
+// quant
+std::tuple<at::Tensor, at::Tensor> per_token_quant_int8_cpu(at::Tensor& A);
+
+// gemm
+at::Tensor
+weight_packed_linear(at::Tensor& mat1, at::Tensor& mat2, const std::optional<at::Tensor>& bias, bool is_vnni);
+
+// igemm
+at::Tensor int8_scaled_mm_cpu(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales1,
+    at::Tensor& scales2,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni);
+
+// fp8 gemm
+at::Tensor fp8_scaled_mm_cpu(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales2,
+    std::vector<int64_t> block_size,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni);
+
+// quant + igemm
+at::Tensor int8_scaled_mm_with_quant(
+    at::Tensor& mat1,
+    at::Tensor& mat2,
+    at::Tensor& scales2,
+    const std::optional<at::Tensor>& bias,
+    at::ScalarType out_dtype,
+    bool is_vnni);
+
+// bmm
+void bmm_cpu(at::Tensor& out, at::Tensor& mat1, at::Tensor& mat2, bool is_vnni, const std::optional<at::Tensor>& scale);
+
+// fused moe
+at::Tensor fused_experts_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& w1,
+    at::Tensor& w2,
+    at::Tensor& topk_weights,
+    at::Tensor& topk_ids,
+    bool inplace,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    const std::optional<at::Tensor>& w1_scale,
+    const std::optional<at::Tensor>& w2_scale,
+    const std::optional<std::vector<int64_t>> block_size,
+    const std::optional<at::Tensor>& a1_scale,
+    const std::optional<at::Tensor>& a2_scale,
+    bool is_vnni);
+
+at::Tensor shared_expert_cpu(
+    at::Tensor& hidden_states,
+    at::Tensor& w1,
+    at::Tensor& w2,
+    at::Tensor& fused_experts_out,
+    double routed_scaling_factor,
+    bool inplace,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    const std::optional<at::Tensor>& w1_scale,
+    const std::optional<at::Tensor>& w2_scale,
+    const std::optional<std::vector<int64_t>> block_size,
+    const std::optional<at::Tensor>& a1_scale,
+    const std::optional<at::Tensor>& a2_scale,
+    bool is_vnni);
+
+// weight absorption
+std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope(
+    at::Tensor& hidden_states,
+    at::Tensor& q_a_proj_weight,
+    at::Tensor& q_b_proj_weight,
+    at::Tensor& kv_a_proj_weight,
+    at::Tensor& w_kc,
+    at::Tensor& q_a_layernorm_weight,
+    at::Tensor& kv_a_layernorm_weight,
+    at::Tensor& positions,
+    at::Tensor& cos_sin_cache,
+    double eps,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    std::optional<at::Tensor> q_a_proj_scale,
+    std::optional<at::Tensor> q_b_proj_scale,
+    std::optional<at::Tensor> kv_a_proj_scale,
+    bool is_vnni,
+    std::optional<std::vector<int64_t>> block_size);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> qkv_proj_with_rope_fused_weight(
+    at::Tensor& hidden_states,
+    at::Tensor& qkv_a_proj_weight,
+    at::Tensor& q_b_proj_weight,
+    at::Tensor& w_kc,
+    at::Tensor& q_a_layernorm_weight,
+    at::Tensor& kv_a_layernorm_weight,
+    at::Tensor& positions,
+    at::Tensor& cos_sin_cache,
+    double eps,
+    bool use_int8_w8a8,
+    bool use_fp8_w8a16,
+    std::optional<at::Tensor> qkv_a_proj_scale,
+    std::optional<at::Tensor> q_b_proj_scale,
+    bool is_vnni,
+    std::optional<std::vector<int64_t>> block_size,
+    int64_t q_lora_rank,
+    int64_t kv_lora_rank,
+    int64_t qk_rope_head_dim);
+
+// shared memory init
+void initialize(int64_t size, int64_t rank);
+
+// shared mmeory all_reduce
+void shm_allreduce(at::Tensor& data, int64_t op);
+
+// shared memory all_gather
+at::Tensor shm_allgather(at::Tensor& data, int64_t dim);
+
+// rope
+std::tuple<at::Tensor, at::Tensor> rotary_embedding_cpu(
+    at::Tensor& positions,
+    at::Tensor& query,
+    at::Tensor& key,
+    int64_t head_size,
+    at::Tensor& cos_sin_cache,
+    bool is_neox);
+
+// CPU and memory binding
+std::string init_cpu_threads_env(const std::string& cpu_ids);
+
+TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
+  // activation
+  m.def("silu_and_mul_cpu(Tensor input) -> Tensor");
+  m.impl("silu_and_mul_cpu", torch::kCPU, &silu_and_mul_cpu);
+  m.def("gelu_tanh_and_mul_cpu(Tensor input) -> Tensor");
+  m.impl("gelu_tanh_and_mul_cpu", torch::kCPU, &gelu_tanh_and_mul_cpu);
+  m.def("gelu_and_mul_cpu(Tensor input) -> Tensor");
+  m.impl("gelu_and_mul_cpu", torch::kCPU, &gelu_and_mul_cpu);
+
+  // norm
+  m.def("rmsnorm_cpu(Tensor input, Tensor weight, float eps) -> Tensor");
+  m.impl("rmsnorm_cpu", torch::kCPU, &rmsnorm_cpu);
+  m.def("l2norm_cpu(Tensor input, float eps) -> Tensor");
+  m.impl("l2norm_cpu", torch::kCPU, &l2norm_cpu);
+  m.def("fused_add_rmsnorm_cpu(Tensor(a!) input, Tensor residual, Tensor weight, float eps) -> ()");
+  m.impl("fused_add_rmsnorm_cpu", torch::kCPU, &fused_add_rmsnorm_cpu);
+
+  // topk
+  m.def("topk_sigmoid_cpu(Tensor hidden_states, Tensor gating_output, int topk, bool renormalize) -> (Tensor, Tensor)");
+  m.impl("topk_sigmoid_cpu", torch::kCPU, &topk_sigmoid_cpu);
+  m.def("topk_softmax_cpu(Tensor hidden_states, Tensor gating_output, int topk, bool renormalize) -> (Tensor, Tensor)");
+  m.impl("topk_softmax_cpu", torch::kCPU, &topk_softmax_cpu);
+  m.def(
+      "grouped_topk_cpu(Tensor hidden_states, Tensor gating_output, int topk, bool renormalize, int num_expert_group, "
+      "int topk_group, int num_fused_shared_experts, float? routed_scaling_factor, Tensor? num_token_non_padded) -> "
+      "(Tensor, Tensor)");
+  m.impl("grouped_topk_cpu", torch::kCPU, &grouped_topk_cpu);
+
+  // biased group topk
+  m.def(
+      "biased_grouped_topk_cpu(Tensor hidden_states, Tensor gating_output, Tensor correction_bias, int topk, bool "
+      "renormalize, int num_expert_group, int topk_group, int num_fused_shared_experts, float? routed_scaling_factor, "
+      "Tensor? num_token_non_padded) -> (Tensor, Tensor)");
+  m.impl("biased_grouped_topk_cpu", torch::kCPU, &biased_grouped_topk_cpu);
+
+  // decode
+  m.def(
+      "decode_attention_cpu(Tensor query, Tensor k_cache, Tensor v_cahce, Tensor(a!) output, Tensor key, Tensor value, "
+      "Tensor loc, Tensor attn_logits, Tensor req_to_token, Tensor req_pool_indices, Tensor seq_lens, float sm_scale, "
+      "float logit_cap) -> ()");
+  m.impl("decode_attention_cpu", torch::kCPU, &decode_attention_cpu);
+
+  // extend
+  m.def(
+      "extend_attention_cpu(Tensor q_extend, Tensor k_extend, Tensor v_extend, Tensor(a!) o_extend, Tensor k_buffer, "
+      "Tensor v_buffer, Tensor req_to_token, Tensor req_pool_indices, Tensor seq_lens, Tensor extend_seq_lens, Tensor "
+      "extend_start_loc, int max_len_extend, float sm_scale, float logit_cap) -> ()");
+  m.impl("extend_attention_cpu", torch::kCPU, &extend_attention_cpu);
+
+  // weight prepack
+  m.def("convert_weight_packed(Tensor weight) -> Tensor");
+  m.impl("convert_weight_packed", torch::kCPU, &convert_weight_packed);
+
+  // quant
+  m.def("per_token_quant_int8_cpu(Tensor A) -> (Tensor, Tensor)");
+  m.impl("per_token_quant_int8_cpu", torch::kCPU, &per_token_quant_int8_cpu);
+
+  // gemm
+  m.def("weight_packed_linear(Tensor mat1, Tensor mat2, Tensor? bias, bool is_vnni) -> Tensor");
+  m.impl("weight_packed_linear", torch::kCPU, &weight_packed_linear);
+
+  // igemm
+  m.def(
+      "int8_scaled_mm_cpu(Tensor mat1, Tensor mat2, Tensor scales1, Tensor scales2, Tensor? bias, ScalarType "
+      "out_dtype, bool is_vnni) -> Tensor");
+  m.impl("int8_scaled_mm_cpu", torch::kCPU, &int8_scaled_mm_cpu);
+
+  // fp8 gemm
+  m.def(
+      "fp8_scaled_mm_cpu(Tensor mat1, Tensor mat2, Tensor scales2, int[] block_size, Tensor? bias, ScalarType "
+      "out_dtype, bool is_vnni) -> Tensor");
+  m.impl("fp8_scaled_mm_cpu", torch::kCPU, &fp8_scaled_mm_cpu);
+
+  // quant + igemm
+  m.def(
+      "int8_scaled_mm_with_quant(Tensor mat1, Tensor mat2, Tensor scales2, Tensor? bias, ScalarType out_dtype, bool "
+      "is_vnni) -> Tensor");
+  m.impl("int8_scaled_mm_with_quant", torch::kCPU, &int8_scaled_mm_with_quant);
+
+  // bmm
+  m.def("bmm_cpu(Tensor(a!) out, Tensor mat1, Tensor mat2, bool is_vnni, Tensor? scale) -> ()");
+  m.impl("bmm_cpu", torch::kCPU, &bmm_cpu);
+
+  // moe
+  m.def(
+      "fused_experts_cpu(Tensor hidden_states, Tensor w1, Tensor w2, Tensor topk_weights, Tensor topk_ids, bool "
+      "inplace, bool use_int8_w8a8, bool use_fp8_w8a16, Tensor? w1_scale, Tensor? w2_scale, int[]? block_size, Tensor? "
+      "a1_scale, Tensor? a2_scale, bool "
+      "is_vnni) -> Tensor");
+  m.impl("fused_experts_cpu", torch::kCPU, &fused_experts_cpu);
+
+  // weight absorption
+  m.def(
+      "qkv_proj_with_rope(Tensor hidden_states, Tensor q_a_proj_weight, Tensor q_b_proj_weight, Tensor "
+      "kv_a_proj_weight, Tensor w_kc, Tensor q_a_layernorm_weight, Tensor kv_a_layernorm_weight, Tensor positions, "
+      "Tensor cos_sin_cache, float eps, bool use_int8_w8a8, bool use_fp8_w8a16, Tensor? q_a_proj_scale, Tensor? "
+      "q_b_proj_scale, Tensor? "
+      "kv_a_proj_scale, bool is_vnni, int[]? block_size) -> (Tensor, Tensor, Tensor)");
+  m.impl("qkv_proj_with_rope", torch::kCPU, &qkv_proj_with_rope);
+  m.def(
+      "qkv_proj_with_rope_fused_weight(Tensor hidden_states, Tensor qkv_a_proj_weight, Tensor q_b_proj_weight, "
+      "Tensor w_kc, Tensor q_a_layernorm_weight, Tensor kv_a_layernorm_weight, Tensor positions, "
+      "Tensor cos_sin_cache, float eps, bool use_int8_w8a8, bool use_fp8_w8a16, Tensor? qkv_a_proj_scale, Tensor? "
+      "q_b_proj_scale,"
+      "bool is_vnni, int[]? block_size, int q_lora_rank, int kv_lora_rank,"
+      "int qk_rope_head_dim) -> (Tensor, Tensor, Tensor)");
+  m.impl("qkv_proj_with_rope_fused_weight", torch::kCPU, &qkv_proj_with_rope_fused_weight);
+
+  // shared expert
+  m.def(
+      "shared_expert_cpu(Tensor hidden_states, Tensor w1, Tensor w2, Tensor fused_experts_out, float "
+      "routed_scaling_factor, bool inplace, bool use_int8_w8a8, bool use_fp8_w8a16, Tensor? w1_scale, Tensor? "
+      "w2_scale, int[]? block_size, Tensor? a1_scale, Tensor? a2_scale, bool is_vnni) -> Tensor");
+  m.impl("shared_expert_cpu", torch::kCPU, &shared_expert_cpu);
+
+  // all reduce
+  m.def("initialize(int size, int rank) -> ()");
+  m.def("shm_allreduce(Tensor(a!) data, int reduce_op) -> ()");
+  m.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
+  m.def("shm_allgather(Tensor data, int dim) -> Tensor");
+  m.impl("shm_allgather", torch::kCPU, &shm_allgather);
+
+  // rope
+  m.def(
+      "rotary_embedding_cpu(Tensor positions, Tensor query, Tensor key, int head_size, Tensor cos_sin_cache, "
+      "bool is_neox) -> (Tensor, Tensor)");
+  m.impl("rotary_embedding_cpu", torch::kCPU, &rotary_embedding_cpu);
+
+  // CPU and memory binding
+  m.def("init_cpu_threads_env(str cpu_ids) -> str");
+}
+
+TORCH_LIBRARY_IMPL(sgl_kernel, CatchAll, m) {
+  m.impl("init_cpu_threads_env", init_cpu_threads_env);
+  m.impl("initialize", &initialize);
+}
+
+REGISTER_EXTENSION(common_ops)
diff --git a/sgl-kernel/csrc/cpu/vec.h b/sgl-kernel/csrc/cpu/vec.h
new file mode 100644
index 000000000..d28124c1d
--- /dev/null
+++ b/sgl-kernel/csrc/cpu/vec.h
@@ -0,0 +1,308 @@
+#pragma once
+
+#if defined(__AVX512F__) && defined(__AVX512BF16__) && defined(__AMX_BF16__)
+#define CPU_CAPABILITY_AVX512
+#endif
+
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+
+namespace {
+
+using namespace at::vec;
+
+template <typename scalar_t, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline Vectorized<scalar_t> convert_from_float_ext(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return at::vec::convert_from_float<scalar_t>(a, b);
+}
+
+// allow f16, bf16
+template <typename scalar_t, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 1>
+inline std::tuple<Vectorized<float>, Vectorized<float>> load_float_vec2(const scalar_t* __restrict__ data) {
+  using bVec = at::vec::Vectorized<scalar_t>;
+  using fVec = at::vec::Vectorized<float>;
+  bVec x_vec = bVec::loadu(data);
+  fVec x0, x1;
+  std::tie(x0, x1) = at::vec::convert_to_float(x_vec);
+  return std::make_tuple(x0, x1);
+}
+
+// allow  f32
+inline std::tuple<Vectorized<float>, Vectorized<float>> load_float_vec2(const float* __restrict__ data) {
+  using fVec = at::vec::Vectorized<float>;
+  fVec x0 = fVec::loadu(data);
+  fVec x1 = fVec::loadu(data + fVec::size());
+  return std::make_tuple(x0, x1);
+}
+
+#if defined(CPU_CAPABILITY_AVX512)
+
+// `at::vec::convert_from_float<>` from PyTorch doesn't have avx512-bf16 intrinsics
+// use native instruction for bfloat16->float32 conversion
+template <>
+inline Vectorized<at::BFloat16>
+convert_from_float_ext<at::BFloat16>(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return (__m512i)(_mm512_cvtne2ps_pbh(__m512(b), __m512(a)));
+}
+
+#define CVT_BF16_TO_FP32(a) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16))
+
+#define CVT_FP16_TO_FP32(a) _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
+
+// this doesn't handle NaN.
+inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
+  const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);
+
+  const __m512i mant = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x07)), 4);
+  const __m512i raw_exp = _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x78)), 3);
+  const __m512i exp = _mm512_slli_epi16(_mm512_add_epi16(raw_exp, _mm512_set1_epi16(120)), 7);
+  const __m512i nonsign = _mm512_or_si512(exp, mant);
+
+  const __m512i sign = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(0x80)), 8);
+  const __m512i combined = _mm512_or_si512(nonsign, sign);
+
+  const __mmask32 is_nonzero = _mm512_cmpneq_epi16_mask(x, _mm512_setzero_si512());
+  return (__m512bh)_mm512_maskz_mov_epi16(is_nonzero, combined);
+}
+
+inline __m512bh cvt_e4m3_bf16_intrinsic_without_denorm(__m256i fp8_vec) {
+  // The following conversion is without denorm behavior, that is to say,
+  //   Max subnorm   : S.0000.111 = 0.875 ∗ 2**(−6)
+  //   Min subnorm   : S.0000.001 = 2**(−9)
+  // 0.0019 ~ 0.0137 cannot be converted correctly.
+  __m512i x = _mm512_cvtepu8_epi16(fp8_vec);
+  auto mask = _mm512_cmpneq_epi16_mask(
+      _mm512_and_si512(x, _mm512_set1_epi16(127)),
+      _mm512_setzero_si512());  // mask = x & 0x7f
+  auto mask_nan = _mm512_cmpneq_epi16_mask(
+      _mm512_and_si512(x, _mm512_set1_epi16(127)),
+      _mm512_set1_epi16(127));                                                      // mask_nan = x & 0x7f
+  auto mantissa = _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(7)), 4);  // mantissa = (x & 7) << 4
+  auto exponent = _mm512_add_epi16(
+      _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(120)), 3),
+      _mm512_set1_epi16(120));  // exponent = (((x >> 3) & 15) + 120)
+  auto nonsign = _mm512_maskz_mov_epi16(mask, _mm512_or_si512(mantissa, _mm512_slli_epi16(exponent, 7)));
+  nonsign = _mm512_mask_mov_epi16(_mm512_set1_epi16(0x7fff), mask_nan, nonsign);  // deal with Nan
+  return (__m512bh)(_mm512_or_si512(
+      nonsign,
+      _mm512_slli_epi16(
+          _mm512_and_si512(x, _mm512_set1_epi16(128)),
+          8)));  // add sign (x & 128) << 8
+}
+
+inline __m512bh cvt_e4m3_bf16_intrinsic_with_denorm(__m256i fp8_vec) {
+  __m512i x = _mm512_cvtepu8_epi16(fp8_vec);
+  __m512i lg2mant = _mm512_mask_mov_epi16(
+      _mm512_mask_mov_epi16(
+          _mm512_setzero_si512(), _mm512_test_epi16_mask(x, _mm512_set1_epi16(2)), _mm512_set1_epi16(1)),
+      _mm512_test_epi16_mask(x, _mm512_set1_epi16(4)),
+      _mm512_set1_epi16(2));
+  return (__m512bh)(_mm512_or_si512(
+      _mm512_maskz_mov_epi16(
+          _mm512_cmpneq_epi16_mask(_mm512_and_si512(x, _mm512_set1_epi16(127)), _mm512_setzero_si512()),
+          _mm512_mask_blend_epi16(
+              _mm512_test_epi16_mask(x, _mm512_set1_epi16(120)),
+              _mm512_or_si512(
+                  _mm512_and_si512(
+                      _mm512_sllv_epi16(
+                          _mm512_and_si512(x, _mm512_set1_epi16(3)), _mm512_sub_epi16(_mm512_set1_epi16(7), lg2mant)),
+                      _mm512_set1_epi16(0x007f)),
+                  _mm512_slli_epi16(_mm512_add_epi16(lg2mant, _mm512_set1_epi16(118)), 7)),
+              _mm512_or_si512(
+                  _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(7)), 4),
+                  _mm512_slli_epi16(
+                      _mm512_add_epi16(
+                          _mm512_srli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(120)), 3), _mm512_set1_epi16(120)),
+                      7)))),
+      _mm512_slli_epi16(_mm512_and_si512(x, _mm512_set1_epi16(128)), 8)));
+}
+
+inline __m512bh CVT_FP8_TO_BF16(__m256i a) {
+#ifdef SGLANG_CPU_FP8_CVT_FTZ
+  return cvt_e4m3_bf16_intrinsic_no_nan(a);
+#else
+  return cvt_e4m3_bf16_intrinsic_with_denorm(a);
+#endif
+}
+
+#endif
+
+// vector to scalar reduction
+#if defined(CPU_CAPABILITY_AVX512) && 0
+inline float vec_reduce_sum(const Vectorized<float>& a) {
+  return _mm512_reduce_add_ps(__m512(a));
+}
+
+inline float vec_reduce_max(const Vectorized<float>& a) {
+  return _mm512_reduce_max_ps(__m512(a));
+}
+#else
+inline float vec_reduce_sum(const Vectorized<float>& a) {
+  return vec_reduce_all([](Vectorized<float>& x, Vectorized<float>& y) { return x + y; }, a);
+}
+
+inline float vec_reduce_max(const Vectorized<float>& a) {
+  return vec_reduce_all([](Vectorized<float>& x, Vectorized<float>& y) { return maximum(x, y); }, a);
+}
+#endif
+
+// https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
+template <typename scalar_t>
+inline void
+quantize_row_int8(uint8_t* __restrict__ Aq, float& As, const scalar_t* __restrict__ A, int64_t K, float eps = 1e-7) {
+  float amax = 0.f;  // absolute max
+  for (int64_t k = 0; k < K; ++k) {
+    const float val = static_cast<float>(A[k]);
+    amax = std::max(amax, std::abs(val));
+  }
+
+  amax = std::max(amax, eps);
+  const float scale = amax / 127;
+  const float inv_scale = 127 / amax;
+
+  for (int64_t k = 0; k < K; ++k) {
+    const float val = static_cast<float>(A[k]) * inv_scale;
+    Aq[k] = (uint8_t)(std::round(val)) + 128;
+  }
+  As = scale;
+}
+
+#if defined(CPU_CAPABILITY_AVX512)
+template <>
+inline void quantize_row_int8<at::BFloat16>(
+    uint8_t* __restrict__ Aq, float& As, const at::BFloat16* __restrict__ A, int64_t K, float eps) {
+  const __m512 signBit = _mm512_set1_ps(-0.0f);
+  const __m512i off = _mm512_set1_epi32(128);
+
+  // K is 32x, no remainder
+  float amax = 0.f;
+  __m512 vamax0 = _mm512_set1_ps(0.f);
+  __m512 vamax1 = _mm512_set1_ps(0.f);
+  for (int64_t k = 0; k < K; k += 32) {
+    __m512i va = _mm512_loadu_si512((void*)(A + k));
+    __m512 va0 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 0));
+    __m512 va1 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 1));
+    vamax0 = _mm512_max_ps(vamax0, _mm512_andnot_ps(signBit, va0));
+    vamax1 = _mm512_max_ps(vamax1, _mm512_andnot_ps(signBit, va1));
+  }
+  amax = _mm512_reduce_max_ps(_mm512_max_ps(vamax0, vamax1));
+  amax = std::max(amax, eps);
+  const float scale = amax / 127;
+  const float inv_scale = 127 / amax;
+  const __m512 vd = _mm512_set1_ps(inv_scale);
+
+  for (int64_t k = 0; k < K; k += 32) {
+    __m512i va = _mm512_loadu_si512((void*)(A + k));
+    __m512 va0 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 0));
+    __m512 va1 = CVT_BF16_TO_FP32(_mm512_extracti32x8_epi32(va, 1));
+    va0 = _mm512_mul_ps(va0, vd);
+    va1 = _mm512_mul_ps(va1, vd);
+    va0 = _mm512_roundscale_ps(va0, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    va1 = _mm512_roundscale_ps(va1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    __m128i i0 = _mm512_cvtepi32_epi8(_mm512_add_epi32(_mm512_cvtps_epi32(va0), off));
+    __m128i i1 = _mm512_cvtepi32_epi8(_mm512_add_epi32(_mm512_cvtps_epi32(va1), off));
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(Aq + k), _mm256_set_m128i(i1, i0));
+  }
+  As = scale;
+}
+#endif
+
+// transpose utils
+// taken from my PR in ggml: https://github.com/ggml-org/llama.cpp/pull/8998
+#if defined(CPU_CAPABILITY_AVX512)
+inline void transpose_16x16_32bit(__m512i* v) {
+  __m512i v1[16];
+  v1[0] = _mm512_unpacklo_epi32(v[0], v[1]);
+  v1[1] = _mm512_unpackhi_epi32(v[0], v[1]);
+  v1[2] = _mm512_unpacklo_epi32(v[2], v[3]);
+  v1[3] = _mm512_unpackhi_epi32(v[2], v[3]);
+  v1[4] = _mm512_unpacklo_epi32(v[4], v[5]);
+  v1[5] = _mm512_unpackhi_epi32(v[4], v[5]);
+  v1[6] = _mm512_unpacklo_epi32(v[6], v[7]);
+  v1[7] = _mm512_unpackhi_epi32(v[6], v[7]);
+  v1[8] = _mm512_unpacklo_epi32(v[8], v[9]);
+  v1[9] = _mm512_unpackhi_epi32(v[8], v[9]);
+  v1[10] = _mm512_unpacklo_epi32(v[10], v[11]);
+  v1[11] = _mm512_unpackhi_epi32(v[10], v[11]);
+  v1[12] = _mm512_unpacklo_epi32(v[12], v[13]);
+  v1[13] = _mm512_unpackhi_epi32(v[12], v[13]);
+  v1[14] = _mm512_unpacklo_epi32(v[14], v[15]);
+  v1[15] = _mm512_unpackhi_epi32(v[14], v[15]);
+
+  v[0] = _mm512_unpacklo_epi64(v1[0], v1[2]);
+  v[1] = _mm512_unpackhi_epi64(v1[0], v1[2]);
+  v[2] = _mm512_unpacklo_epi64(v1[1], v1[3]);
+  v[3] = _mm512_unpackhi_epi64(v1[1], v1[3]);
+  v[4] = _mm512_unpacklo_epi64(v1[4], v1[6]);
+  v[5] = _mm512_unpackhi_epi64(v1[4], v1[6]);
+  v[6] = _mm512_unpacklo_epi64(v1[5], v1[7]);
+  v[7] = _mm512_unpackhi_epi64(v1[5], v1[7]);
+  v[8] = _mm512_unpacklo_epi64(v1[8], v1[10]);
+  v[9] = _mm512_unpackhi_epi64(v1[8], v1[10]);
+  v[10] = _mm512_unpacklo_epi64(v1[9], v1[11]);
+  v[11] = _mm512_unpackhi_epi64(v1[9], v1[11]);
+  v[12] = _mm512_unpacklo_epi64(v1[12], v1[14]);
+  v[13] = _mm512_unpackhi_epi64(v1[12], v1[14]);
+  v[14] = _mm512_unpacklo_epi64(v1[13], v1[15]);
+  v[15] = _mm512_unpackhi_epi64(v1[13], v1[15]);
+
+  v1[0] = _mm512_shuffle_i32x4(v[0], v[4], 0x88);
+  v1[1] = _mm512_shuffle_i32x4(v[1], v[5], 0x88);
+  v1[2] = _mm512_shuffle_i32x4(v[2], v[6], 0x88);
+  v1[3] = _mm512_shuffle_i32x4(v[3], v[7], 0x88);
+  v1[4] = _mm512_shuffle_i32x4(v[0], v[4], 0xdd);
+  v1[5] = _mm512_shuffle_i32x4(v[1], v[5], 0xdd);
+  v1[6] = _mm512_shuffle_i32x4(v[2], v[6], 0xdd);
+  v1[7] = _mm512_shuffle_i32x4(v[3], v[7], 0xdd);
+  v1[8] = _mm512_shuffle_i32x4(v[8], v[12], 0x88);
+  v1[9] = _mm512_shuffle_i32x4(v[9], v[13], 0x88);
+  v1[10] = _mm512_shuffle_i32x4(v[10], v[14], 0x88);
+  v1[11] = _mm512_shuffle_i32x4(v[11], v[15], 0x88);
+  v1[12] = _mm512_shuffle_i32x4(v[8], v[12], 0xdd);
+  v1[13] = _mm512_shuffle_i32x4(v[9], v[13], 0xdd);
+  v1[14] = _mm512_shuffle_i32x4(v[10], v[14], 0xdd);
+  v1[15] = _mm512_shuffle_i32x4(v[11], v[15], 0xdd);
+
+  v[0] = _mm512_shuffle_i32x4(v1[0], v1[8], 0x88);
+  v[1] = _mm512_shuffle_i32x4(v1[1], v1[9], 0x88);
+  v[2] = _mm512_shuffle_i32x4(v1[2], v1[10], 0x88);
+  v[3] = _mm512_shuffle_i32x4(v1[3], v1[11], 0x88);
+  v[4] = _mm512_shuffle_i32x4(v1[4], v1[12], 0x88);
+  v[5] = _mm512_shuffle_i32x4(v1[5], v1[13], 0x88);
+  v[6] = _mm512_shuffle_i32x4(v1[6], v1[14], 0x88);
+  v[7] = _mm512_shuffle_i32x4(v1[7], v1[15], 0x88);
+  v[8] = _mm512_shuffle_i32x4(v1[0], v1[8], 0xdd);
+  v[9] = _mm512_shuffle_i32x4(v1[1], v1[9], 0xdd);
+  v[10] = _mm512_shuffle_i32x4(v1[2], v1[10], 0xdd);
+  v[11] = _mm512_shuffle_i32x4(v1[3], v1[11], 0xdd);
+  v[12] = _mm512_shuffle_i32x4(v1[4], v1[12], 0xdd);
+  v[13] = _mm512_shuffle_i32x4(v1[5], v1[13], 0xdd);
+  v[14] = _mm512_shuffle_i32x4(v1[6], v1[14], 0xdd);
+  v[15] = _mm512_shuffle_i32x4(v1[7], v1[15], 0xdd);
+}
+
+// remove warning : ignoring attributes on template argument ‘__m512i’ [-Wignored-attributes]
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+
+// transpose from [2, 32] to [32, 2]
+inline std::tuple<__m512i, __m512i> transpose_2x32_16bit(__m512i r0, __m512i r1) {
+  // r0: {a0, a1, ..., a31}
+  // r1: {b0, b1, ..., b31}
+  //
+  // d0: {a0,   b0, ..., a15, b15}
+  // d1: {a16, b16, ..., a31, b31}
+  //
+  __m512i d0 = _mm512_unpacklo_epi16(r0, r1);
+  __m512i d1 = _mm512_unpackhi_epi16(r0, r1);
+  r0 = _mm512_shuffle_i32x4(d0, d1, 0x88);
+  r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd);
+  d0 = _mm512_shuffle_i32x4(r0, r1, 0x88);
+  d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd);
+  return std::make_tuple(d0, d1);
+}
+#pragma GCC diagnostic pop
+
+#endif
+
+}  // anonymous namespace
diff --git a/sgl-kernel/csrc/cutlass_extensions/common.hpp b/sgl-kernel/csrc/cutlass_extensions/common.hpp
new file mode 100644
index 000000000..912ef8b16
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/common.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "cuda_runtime.h"
+#include "cutlass/cutlass.h"
+
+/**
+ * A wrapper for a kernel that is used to guard against compilation on
+ * architectures that will never use the kernel. The purpose of this is to
+ * reduce the size of the compiled binary.
+ * __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
+ * into code that will be executed on the device where it is defined.
+ */
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
diff --git a/sgl-kernel/csrc/cutlass_extensions/detail/collective/mixed_input_utils.hpp b/sgl-kernel/csrc/cutlass_extensions/detail/collective/mixed_input_utils.hpp
new file mode 100644
index 000000000..fcba4d40c
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/detail/collective/mixed_input_utils.hpp
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cute/util/type_traits.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_conversion.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective::detail {
+
+template <class Collective>
+struct MixedGroupedGemmInputUtils {
+ private:
+  using KernelSchedule = typename Collective::KernelSchedule;
+  using ConversionMode = typename Collective::ConversionMode;
+  using SmemLayoutA = typename Collective::SmemLayoutA;
+  using SmemLayoutB = typename Collective::SmemLayoutB;
+  using SmemLayoutScale = typename Collective::SmemLayoutScale;
+  using SwappedElementA = typename Collective::SwappedElementA;
+  using SwappedElementB = typename Collective::SwappedElementB;
+  using RealSwappedElementA = typename Collective::RealSwappedElementA;
+  using RealSwappedElementB = typename Collective::RealSwappedElementB;
+  using ElementScale = typename Collective::ElementScale;
+  using ElementZero = typename Collective::ElementZero;
+  using SmemCopyAtomScale = typename Collective::SmemCopyAtomScale;
+  static constexpr auto KernelConversionMode = Collective::KernelConversionMode;
+  static constexpr auto ModeHasScales = Collective::ModeHasScales;
+  static constexpr auto UseScaleLookupTable = Collective::UseScaleLookupTable;
+
+ public:
+  static constexpr auto elements_per_smem_scale() {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return 0;
+    } else if constexpr (ModeHasScales) {
+      return cute::cosize_v<SmemLayoutScale>;
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in scale smem allocation.");
+    }
+  }
+
+  static constexpr auto elements_per_smem_zero() {
+    if constexpr (
+        KernelConversionMode == ConversionMode::DirectConvert ||
+        KernelConversionMode == ConversionMode::ConvertAndScale) {
+      return 0;
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      return cute::cosize_v<SmemLayoutScale>;
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in scale smem allocation.");
+    }
+  }
+
+  // These methods use some the public members of the class. For that reason, we define them after the public section.
+  static constexpr uint32_t compute_tma_transaction_bytes_mk() {
+    return cutlass::bits_to_bytes(
+        size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(cute::sizeof_bits_v<SwappedElementA>));
+  }
+
+  static constexpr uint32_t compute_tma_transaction_bytes_nk() {
+    return cutlass::bits_to_bytes(
+        size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(cute::sizeof_bits_v<SwappedElementB>));
+  }
+
+  static constexpr uint32_t compute_tma_transaction_bytes_extra() {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return 0;
+    } else if constexpr (ModeHasScales) {
+      constexpr uint32_t scale_tx_bytes = cutlass::bits_to_bytes(
+          size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) *
+          static_cast<uint32_t>(cute::sizeof_bits_v<ElementScale>));
+      static_assert(scale_tx_bytes % 128 == 0, "Each scale stage must be 128B aligned.");  // required by TMA
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return scale_tx_bytes;
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        // Scale and zero share smem layout
+        constexpr uint32_t zero_tx_bytes = cutlass::bits_to_bytes(
+            size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) *
+            static_cast<uint32_t>(cute::sizeof_bits_v<ElementZero>));
+        static_assert(zero_tx_bytes % 128 == 0, "Each zero stage must be 128B aligned.");  // required by TMA
+        return scale_tx_bytes + zero_tx_bytes;
+      } else {
+        static_assert(
+            cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in tma transaction bytes computation.");
+      }
+    } else {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>, "Type not handled in tma transaction bytes computation.");
+    }
+  }
+
+  /// Utilities to copy A and extra inputs from smem to RF
+  template <class SmemTiledCopyA, class TensorASmemView, class TensorACopyView, class... Ts, class... Us>
+  CUTLASS_DEVICE static void copy_tensors_MK(
+      SmemTiledCopyA const& smem_tiled_copy_A,
+      TensorASmemView const& tCsA,
+      TensorACopyView& tCrA_copy_view,
+      cute::tuple<Ts...> const& partitioned_mma_extra_info,
+      cute::tuple<Us...> const& tiled_copy_and_views,
+      int k_block,
+      int read_stage) {
+    copy(smem_tiled_copy_A, tCsA(_, _, k_block, read_stage), tCrA_copy_view(_, _, k_block));
+
+    if (k_block == 0) {
+      // We are starting a new k-tile so copy the scale
+      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+        // nothing to do
+      } else if constexpr (ModeHasScales) {
+        auto smem_tiled_copy_S = cute::get<0>(tiled_copy_and_views);
+        auto tCrS_copy_view = cute::get<1>(tiled_copy_and_views);
+        auto tCsS = cute::get<0>(partitioned_mma_extra_info);
+        copy(smem_tiled_copy_S, tCsS(_, _, k_block, read_stage), tCrS_copy_view(_, _, k_block));
+        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+          // Nothing extra to do
+        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+          auto tCsZ = cute::get<2>(partitioned_mma_extra_info);
+          auto tCrZ_copy_view = cute::get<2>(tiled_copy_and_views);
+          copy(smem_tiled_copy_S, tCsZ(_, _, k_block, read_stage), tCrZ_copy_view(_, _, k_block));
+        } else {
+          static_assert(
+              cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+        }
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+      }
+    }
+  }
+
+  // The core converter uses a lookup table to converts i4 -> 8 bit value.
+  template <
+      class EngineIn,
+      class LayoutIn,
+      class EngineOut,
+      class LayoutOut,
+      class EngineScale,
+      class LayoutScale>
+  CUTLASS_DEVICE static void lookup_table_convert(  // Accept mutable temporaries
+      Tensor<EngineIn, LayoutIn> const& src,
+      Tensor<EngineOut, LayoutOut>&& dst,
+      Tensor<EngineScale, LayoutScale> const& scales_neg,
+      Tensor<EngineScale, LayoutScale> const& scales_pos) {
+    lookup_table_convert(src, dst, scales_neg, scales_pos);
+  }
+  template <class EngineIn, class LayoutIn, class EngineOut, class LayoutOut, class EngineScale, class LayoutScale>
+  CUTLASS_DEVICE static void lookup_table_convert(
+      Tensor<EngineIn, LayoutIn> const& src,
+      Tensor<EngineOut, LayoutOut>& dst,
+      Tensor<EngineScale, LayoutScale> const& scales_neg,
+      Tensor<EngineScale, LayoutScale> const& scales_pos) {
+    constexpr int N = cute::cosize(LayoutIn{});
+    static_assert(N == 4 || N == 8);
+    static_assert(cosize(LayoutScale{}) <= N / 4, "at least 4 consecutive weights must share the same scale.");
+    using SrcArray = cutlass::Array<cutlass::int4b_t, 8>;
+    using DstArray = cutlass::Array<RealSwappedElementB, 8>;
+    using RegArray = cutlass::AlignedArray<uint32_t, N / 4, sizeof(DstArray)>;
+
+    // View the input as reg
+    auto&& src_reg = cute::recast<uint32_t>(src)(0);
+    auto&& r = cute::recast<RegArray>(dst)(0);
+
+    // Determines if to get from the signed or unsigned candidates
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
+    uint32_t sign;  // ((reg & 0x88888888) | 0x64206420) >> 1
+    asm volatile(
+        "{\n"
+        "  lop3.b32 %0, %1, %2, %3, %4;\n"
+        "}\n"
+        : "=r"(sign)
+        : "r"(src_reg), "n"(0x88888888), "n"(0x64206420), "n"(immLut));
+    sign = sign >> 1;
+
+    // Ignore sign bit when indexing into LUT
+    uint32_t lut_idx = src_reg & 0x77777777;
+    Tensor scales_neg_ = cute::filter(scales_neg);
+    Tensor scales_pos_ = cute::filter(scales_pos);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 4; ++i, lut_idx >>= 16, sign >>= 16) {
+      auto&& scale_neg_ = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(scales_neg_(i));
+      auto&& scale_pos_ = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(scales_pos_(i));
+      asm volatile(
+          "{\n"
+          "  .reg .b32 pos, neg                    ;\n"
+          "  prmt .b32 neg, %3, %4, %1             ;\n"
+          "  prmt .b32 pos, %5, %6, %1             ;\n"
+          "  prmt .b32 %0, pos, neg, %2            ;\n"
+          "}\n"
+          : "=r"(r[i])
+          : "r"(lut_idx), "r"(sign), "r"(scale_neg_[0]), "r"(scale_neg_[1]), "r"(scale_pos_[0]), "r"(scale_pos_[1]));
+    }
+  }
+
+  /// Utilities to dequantize A.
+  template <class Layout>
+  CUTLASS_DEVICE static void static_check_scale(Layout const& tensor) {
+    static_assert(
+        shape<0>(Layout{}) >= 4 && stride<0>(Layout{}) == 0,
+        "At least 4 adjacent weights in a thread must share the same scale.");
+  }
+  template <class Engine, class Layout>
+  CUTLASS_DEVICE static void static_check_scale(Tensor<Engine, Layout> const& tensor) {
+    static_check_scale(flatten(Layout{}));
+  }
+
+  template <class EngineIn, class EngineOut, class LayoutIn, class LayoutOut, class... Ts>
+  CUTLASS_DEVICE static void dequantize_A_kblock(
+      Tensor<EngineIn, LayoutIn> const& tCrA_load,
+      Tensor<EngineOut, LayoutOut>& tCrA_mma,
+      cute::tuple<Ts...>& partitioned_extra_info,
+      int const k_block) {
+    static_assert(is_rmem<EngineIn>::value, "Input tensor for A conversion must come from registers");
+    static_assert(is_rmem<EngineOut>::value, "Output tensor for A conversion must come from registers");
+    static_assert(cosize_v<LayoutIn> == cosize_v<LayoutOut>);
+    static_assert(size_v<LayoutIn> == cosize_v<LayoutIn>);
+    static_assert(size_v<LayoutOut> == cosize_v<LayoutOut>);
+    using SrcType = typename EngineIn::value_type;
+    using DstType = typename EngineOut::value_type;
+
+    Tensor src = tCrA_load(_, _, k_block);
+    Tensor dst = tCrA_mma(_, _, k_block);
+
+    CUTE_STATIC_ASSERT_V(
+        size(src(_, 0)) == cosize(src(_, 0).layout()), "The first mode of tensor src must be contiguous in memory");
+    // try to make the size of the first mode equal to 32bit
+    int constexpr NumValPerSrcReg = cute::min(decltype(size(src(_, 0)))::value, ceil_div(32, sizeof_bits_v<SrcType>));
+    Tensor src_vm = cute::group_modes<1, -1>(cute::zipped_divide(src, Int<NumValPerSrcReg>{}));
+    Tensor dst_vm = cute::group_modes<1, -1>(cute::zipped_divide(dst, Int<NumValPerSrcReg>{}));
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size<1>(dst_vm); ++i) {
+        LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));
+      }
+    } else if constexpr (UseScaleLookupTable) {
+      constexpr int num_elements = decltype(size(src))::value;
+      static_assert(
+          is_same_v<RealSwappedElementA, cutlass::int4b_t>,
+          "Lookup table only supports int4 being the quant type now.");
+      static_assert(sizeof_bits_v<ElementScale> == 64, "Lookup table only supports 8 8bit scale values now.");
+      static_assert(
+          num_elements % 4 == 0 && num_elements >= 4, "Lookup table requires a vector size of 4x when converting.");
+
+      Tensor tCrS_neg = cute::get<1>(partitioned_extra_info);
+      auto&& tCrS_pos = cute::get<2>(partitioned_extra_info);  // modification to its value is needed
+      Tensor scales_neg = tCrS_neg(_, _, k_block);
+      Tensor scales_pos = tCrS_pos(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(cute::size(src) == cute::size(scales_neg));
+
+      static_check_scale(scales_neg);
+      static_check_scale(scales_pos);
+      Tensor scales_neg_vm = cute::group_modes<1, -1>(cute::zipped_divide(scales_neg, Int<NumValPerSrcReg>{}));
+      Tensor scales_pos_vm = cute::group_modes<1, -1>(cute::zipped_divide(scales_pos, Int<NumValPerSrcReg>{}));
+
+      if (k_block == 0) {
+        Tensor scales_neg_vm_ = filter(scales_neg_vm);
+        Tensor scales_pos_vm_ = filter(scales_pos_vm);
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size(scales_neg_vm_.layout()); ++i) {
+          auto&& scale_neg_ = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(scales_neg_vm_(i));
+          auto&& scale_pos_ = reinterpret_cast<cutlass::Array<uint32_t, 2>&>(scales_pos_vm_(i));
+          constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
+          asm volatile(
+              "{\n"
+              "  lop3 .b32 %0, %2, %4, %5, %6;\n"
+              "  xor  .b32 %1, %3, %5;        \n"
+              "}\n"
+              : "=r"(scale_pos_[0]), "=r"(scale_pos_[1])
+              : "r"(scale_neg_[0]), "r"(scale_neg_[1]), "n"(0xFFFFFF00), "n"(0x80808080), "n"(immLut));
+        }
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size<1>(dst_vm); ++i) {
+        lookup_table_convert(src_vm(_, i), dst_vm(_, i), scales_neg_vm(_, i), scales_pos_vm(_, i));
+      }
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      Tensor scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
+      Tensor scales_vm = cute::group_modes<1, -1>(cute::zipped_divide(scales, Int<NumValPerSrcReg>{}));
+
+      if constexpr (is_same_v<DstType, ElementScale>) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            dst_vm(j, i) *= scales_vm(j, i);
+          }
+        }
+      } else {
+        auto stage = make_tensor_like<ElementScale>(src_vm(_, 0));
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), stage);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            stage(j) *= scales_vm(j, i);
+          }
+          LayoutAwareConvert(stage, dst_vm(_, i));
+        }
+      }
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      static_assert(is_same_v<ElementScale, ElementZero>, "ElementScale and ElementZero must be the same.");
+      Tensor scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
+      Tensor zeros = cute::get<3>(partitioned_extra_info)(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
+      CUTE_STATIC_ASSERT_V(size(src) == size(zeros));
+      Tensor scales_vm = cute::group_modes<1, -1>(cute::zipped_divide(scales, Int<NumValPerSrcReg>{}));
+      Tensor zeros_vm = cute::group_modes<1, -1>(cute::zipped_divide(zeros, Int<NumValPerSrcReg>{}));
+
+      if constexpr (is_same_v<DstType, ElementScale>) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            dst_vm(j, i) = dst_vm(j, i) * scales_vm(j, i) + zeros_vm(j, i);
+          }
+        }
+      } else {
+        auto stage = make_tensor_like<ElementScale>(src_vm(_, 0));
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), stage);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            stage(j) = stage(j) * scales_vm(j, i) + zeros_vm(j, i);
+          }
+          LayoutAwareConvert(stage, dst_vm(_, i));
+        }
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "No A data is loaded.");
+    }
+  }
+
+  template <class EngineIn, class EngineOut, class LayoutIn, class LayoutOut, class... Ts>
+  CUTLASS_DEVICE static void convert_A_kblock(
+      Tensor<EngineIn, LayoutIn> const& tCrA_load, Tensor<EngineOut, LayoutOut>& tCrA_mma, int const k_block) {
+    static_assert(is_rmem<EngineIn>::value, "Input tensor for A conversion must come from registers");
+    static_assert(is_rmem<EngineOut>::value, "Output tensor for A conversion must come from registers");
+    static_assert(cosize_v<LayoutIn> == cosize_v<LayoutOut>);
+    static_assert(size_v<LayoutIn> == cosize_v<LayoutIn>);
+    static_assert(size_v<LayoutOut> == cosize_v<LayoutOut>);
+    using SrcType = typename EngineIn::value_type;
+
+    Tensor src = tCrA_load(_, _, k_block);
+    Tensor dst = tCrA_mma(_, _, k_block);
+
+    CUTE_STATIC_ASSERT_V(
+        size(src(_, 0)) == cosize(src(_, 0).layout()), "The first mode of tensor src must be contiguous in memory");
+    // try to make the size of the first mode equal to 32bit
+    int constexpr NumValPerSrcReg = cute::min(decltype(size(src(_, 0)))::value, ceil_div(32, sizeof_bits_v<SrcType>));
+    Tensor src_vm = cute::group_modes<1, -1>(cute::zipped_divide(src, Int<NumValPerSrcReg>{}));
+    Tensor dst_vm = cute::group_modes<1, -1>(cute::zipped_divide(dst, Int<NumValPerSrcReg>{}));
+
+    // KernelConversionMode == ConversionMode::DirectConvert
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size<1>(dst_vm); ++i) {
+      LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));
+    }
+  }
+
+  /// Utilities for any additional inputs inside of the TMA load
+  template <class Params, class TensorStorage, class... Ts>
+  CUTLASS_DEVICE static auto partition_extra_tma_inputs(
+      Params const& mainloop_params,
+      cute::tuple<Ts...> const& load_inputs,
+      TensorStorage& shared_tensors,
+      uint2 const& cluster_local_block_id,
+      int const m_coord,
+      int const l_coord) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple();
+    } else if constexpr (ModeHasScales) {
+      Tensor sS =
+          make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});  // (BLK_M,BLK_K,PIPE)
+      Tensor gS_mkl = get<2>(load_inputs);
+      auto block_tma_s = mainloop_params.tma_load_scale.get_slice(cluster_local_block_id.y);
+      Tensor gS = gS_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+
+      Tensor tSgS = block_tma_s.partition_S(gS);  // (TMA,TMA_M,TMA_K,k)
+      Tensor tSsS = block_tma_s.partition_D(sS);  // (TMA,TMA_M,TMA_K,PIPE)
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(tSgS, tSsS);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor sZ =
+            make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{});  // (BLK_M,BLK_K,PIPE)
+        Tensor gZ_mkl = get<3>(load_inputs);
+        auto block_tma_z = mainloop_params.tma_load_zero.get_slice(cluster_local_block_id.y);
+        Tensor gZ = gZ_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+
+        Tensor tZgZ = block_tma_z.partition_S(gZ);  // (TMA,TMA_M,TMA_K,k)
+        Tensor tZsZ = block_tma_z.partition_D(sZ);  // (TMA,TMA_M,TMA_K,PIPE)
+        return cute::make_tuple(tSgS, tSsS, tZgZ, tZsZ);
+      } else {
+        static_assert(
+            cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for input partitioning.");
+      }
+    } else {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for input partitioning.");
+    }
+  }
+
+  /// Utilities for partitioning extra inputs for loading from smem in the mainloop.
+  template <class ThreadMma, class TensorStorage>
+  CUTLASS_DEVICE static auto
+  partition_extra_mma_info(ThreadMma const& mma_thread_slice, TensorStorage& shared_tensors) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // nothing to do
+      return cute::make_tuple();
+    } else if constexpr (UseScaleLookupTable) {
+      Tensor sS =
+          make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+      Tensor tCsS = mma_thread_slice.partition_A(sS);
+      Tensor tCrS = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_, _, Int<0>{})).layout());
+
+      return cute::make_tuple(tCsS, tCrS);
+    } else if constexpr (ModeHasScales) {
+      Tensor sS =
+          make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+      Tensor tCsS = mma_thread_slice.partition_A(sS);
+      Tensor tCrS = make_tensor<ElementScale>(mma_thread_slice.partition_fragment_A(sS(_, _, Int<0>{})).layout());
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(tCsS, tCrS);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor sZ = make_tensor(
+            make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+        Tensor tCsZ = mma_thread_slice.partition_A(sZ);
+        Tensor tCrZ = make_tensor<ElementZero>(mma_thread_slice.partition_fragment_A(sZ(_, _, Int<0>{})).layout());
+        return cute::make_tuple(tCsS, tCrS, tCsZ, tCrZ);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+    }
+  }
+
+  /// Returns the tiled copy and copy views for the extra inputs.
+  template <class TiledMma, class... Ts>
+  CUTLASS_DEVICE static auto retile_extra_mma_info(
+      TiledMma const& tiled_mma, cute::tuple<Ts...>& partitioned_extra_info, int const warp_group_thread_idx) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // nothing to do
+      return cute::make_tuple();
+    } else if constexpr (ModeHasScales) {
+      auto smem_tiled_copy_S = make_tiled_copy_A(SmemCopyAtomScale{}, tiled_mma);
+      auto smem_thr_copy_S = smem_tiled_copy_S.get_thread_slice(warp_group_thread_idx);
+      Tensor tCrS_copy_view = smem_thr_copy_S.retile_D(cute::get<1>(partitioned_extra_info));  // (CPY,CPY_M,CPY_K)
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor tCrZ_copy_view = smem_thr_copy_S.retile_D(cute::get<3>(partitioned_extra_info));  // (CPY,CPY_M,CPY_K)
+        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view, tCrZ_copy_view);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in A -> RF path.");
+    }
+  }
+};
+
+}  // namespace cutlass::gemm::collective::detail
diff --git a/sgl-kernel/csrc/cutlass_extensions/epilogue/epilogue_per_row_per_col_scale.h b/sgl-kernel/csrc/cutlass_extensions/epilogue/epilogue_per_row_per_col_scale.h
new file mode 100644
index 000000000..9f85bee28
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/epilogue/epilogue_per_row_per_col_scale.h
@@ -0,0 +1,309 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/be1788106245496872d18e702978e59b6bfd50e0/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h
+
+#pragma once
+
+#include <cutlass/arch/memory.h>
+#include <cutlass/numeric_conversion.h>
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+template <
+    typename ThreadblockShape_,
+    int ThreadCount,
+    typename ScaleTileIterator_,
+    typename OutputTileIterator_,
+    typename ElementAccumulator_,
+    typename ElementCompute_,
+    typename ElementwiseFunctor_,
+    bool UseMasking_ = false>
+class EpilogueVisitorPerRowPerCol {
+ public:
+  using ThreadblockShape = ThreadblockShape_;
+  static int const kThreadCount = ThreadCount;
+
+  using ScaleTileIterator = ScaleTileIterator_;
+  using OutputTileIterator = OutputTileIterator_;
+  using ElementwiseFunctor = ElementwiseFunctor_;
+
+  static int const kIterations = OutputTileIterator::kIterations;
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  using ElementOutput = typename OutputTileIterator::Element;
+  using LayoutOutput = cutlass::layout::RowMajor;
+  using ElementAccumulator = ElementAccumulator_;
+
+  using AlphaScaleElementType = typename ScaleTileIterator::Element;
+
+  using ElementCompute = ElementCompute_;
+  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
+  using ComputeFragment = Array<ElementCompute_, kElementsPerAccess>;
+  using OutputVector = Array<ElementOutput, kElementsPerAccess>;
+
+  static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth;
+  static bool const kHasMultiStepsInRow = (OutputTileIterator::ThreadMap::Iterations::kColumn > 1);
+
+  /// Argument structure
+  struct Arguments {
+    typename ElementwiseFunctor::Params elementwise;
+    int64_t batch_stride_alpha;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    //
+    // Methods
+    //
+    Arguments() : batch_stride_alpha(0), batch_stride_C(0), batch_stride_D(0) {}
+
+    Arguments(typename ElementwiseFunctor::Params elementwise_)
+        : elementwise(elementwise_), batch_stride_alpha(0), batch_stride_C(0), batch_stride_D(0) {}
+
+    Arguments(
+        typename ElementwiseFunctor::Params elementwise_,
+        int64_t batch_stride_alpha_,
+        int64_t batch_stride_C_,
+        int64_t batch_stride_D_)
+        : elementwise(elementwise_),
+          batch_stride_alpha(batch_stride_alpha_),
+          batch_stride_C(batch_stride_C_),
+          batch_stride_D(batch_stride_D_) {}
+  };
+
+  struct Params {
+    typename ElementwiseFunctor::Params elementwise;
+    int64_t batch_stride_alpha;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params() {}
+
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const& args)
+        : elementwise(args.elementwise),
+          batch_stride_alpha(args.batch_stride_alpha),
+          batch_stride_C(args.batch_stride_C),
+          batch_stride_D(args.batch_stride_D) {}
+  };
+
+  /// Shared storage
+  struct SharedStorage {};
+
+ private:
+  Params const& params_;
+  SharedStorage& shared_storage_;
+  MatrixCoord extent_;
+  MatrixCoord extent_real_;
+  ElementwiseFunctor elementwise_;
+
+  bool const with_bias_;
+  bool const per_token_quant_;
+  bool const per_channel_quant_;
+
+  AlphaScaleElementType* ptr_alpha_row_;
+  AlphaScaleElementType* ptr_alpha_col_;
+  ScaleTileIterator iterator_alpha_col_;
+  OutputTileIterator iterator_C_;
+  OutputTileIterator iterator_D_;
+
+  AlphaScaleElementType element_alpha_row_ = 1.0f;
+  AlphaScaleElementType element_alpha_col_ = 1.0f;
+  typename ScaleTileIterator::Fragment fragment_alpha_col_;
+  typename OutputTileIterator::Fragment fragment_C_;
+  typename OutputTileIterator::Fragment fragment_D_;
+
+  ElementAccumulator beta_;
+
+  int column_offset_;
+
+  MatrixCoord thread_offset_;
+
+ public:
+  CUTLASS_DEVICE
+  EpilogueVisitorPerRowPerCol(
+      Params const& params,
+      SharedStorage& shared_storage,
+      cutlass::MatrixCoord const& problem_size,
+      int thread_idx,
+      int warp_idx,
+      int lane_idx,
+      typename ScaleTileIterator::Params params_alpha_col,
+      typename OutputTileIterator::Params params_C,
+      typename OutputTileIterator::Params params_D,
+      bool with_bias,
+      bool per_token_quant,
+      bool per_channel_quant,
+      AlphaScaleElementType* ptr_alpha_row,
+      AlphaScaleElementType* ptr_alpha_col,
+      typename OutputTileIterator::Element* ptr_C,
+      typename OutputTileIterator::Element* ptr_D,
+      cutlass::MatrixCoord const& threadblock_offset = cutlass::MatrixCoord(0, 0),
+      int column_offset = 0,
+      cutlass::MatrixCoord const& problem_size_real = cutlass::MatrixCoord(0, 0))
+      : params_(params),
+        shared_storage_(shared_storage),
+        extent_(problem_size),
+        elementwise_(params.elementwise),
+        with_bias_(with_bias),
+        per_token_quant_(per_token_quant),
+        per_channel_quant_(per_channel_quant),
+        ptr_alpha_row_(ptr_alpha_row),
+        ptr_alpha_col_(ptr_alpha_col),
+        iterator_alpha_col_(params_alpha_col, ptr_alpha_col, problem_size, thread_idx, threadblock_offset),
+        iterator_C_(params_C, ptr_C, problem_size, thread_idx, threadblock_offset),
+        iterator_D_(params_D, ptr_D, problem_size, thread_idx, threadblock_offset),
+        extent_real_(problem_size_real) {
+    if (!per_channel_quant_ && (ptr_alpha_col_ != nullptr)) {
+      element_alpha_col_ = *ptr_alpha_col_;
+    }
+
+    if (!per_token_quant_ && (ptr_alpha_row_ != nullptr)) {
+      element_alpha_row_ = *ptr_alpha_row_;
+    }
+  }
+
+  /// Helper to indicate split-K behavior
+  CUTLASS_DEVICE
+  void set_k_partition(
+      int split_k_index,     ///< Index of this threadblock within split-K partitioned scheme
+      int split_k_slices) {  ///< Total number of split-K slices
+  }
+
+  /// Called to set the batch index
+  CUTLASS_DEVICE
+  void set_batch_index(int batch_idx) {
+    iterator_alpha_col_.add_pointer_offset(batch_idx * params_.batch_stride_alpha);
+    iterator_C_.add_pointer_offset(batch_idx * params_.batch_stride_C);
+    iterator_D_.add_pointer_offset(batch_idx * params_.batch_stride_D);
+  }
+
+  /// Called at the start of the epilogue just before iterating over accumulator slices
+  CUTLASS_DEVICE
+  void begin_epilogue() {
+    if (per_channel_quant_) {
+      iterator_alpha_col_.load(fragment_alpha_col_);
+    }
+
+    if (with_bias_) {
+      iterator_C_.load(fragment_C_);
+    }
+  }
+
+  /// Called at the start of one step before starting accumulator exchange
+  CUTLASS_DEVICE
+  void begin_step(int step_idx) {
+    fragment_D_.clear();
+  }
+
+  /// Called at the start of a row
+  CUTLASS_DEVICE
+  void begin_row(int row_idx) {
+    // load alpha_row in begin_step only when per token(row) scaling is used
+    if (per_token_quant_) {
+      int thread_offset_row =
+          iterator_D_.thread_start_row() + OutputTileIterator::ThreadMap::iteration_offset(row_idx).row();
+
+      arch::global_load<AlphaScaleElementType, sizeof(AlphaScaleElementType)>(
+          element_alpha_row_, ptr_alpha_row_ + thread_offset_row, thread_offset_row < extent_.row());
+    }
+  }
+
+  /// Called after accumulators have been exchanged for each accumulator vector
+  CUTLASS_DEVICE
+  void visit(int iter_idx, int row_idx, int column_idx, int frag_idx, AccumulatorFragment const& accum) {
+    NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess> source_converter;
+
+    ComputeFragment result = source_converter(accum);
+    if (per_channel_quant_) {
+      ComputeFragment alpha_col = reinterpret_cast<ComputeFragment*>(&fragment_alpha_col_)[column_idx];
+      result = per_token_channel_scale_accumulator_(result, alpha_col, element_alpha_row_);
+    } else {
+      result = per_token_scale_accumulator_(result, element_alpha_col_, element_alpha_row_);
+    }
+
+    if (with_bias_) {
+      NumericArrayConverter<ElementCompute, ElementOutput, kElementsPerAccess> bias_converter;
+      OutputVector bias = reinterpret_cast<OutputVector*>(&fragment_C_)[column_idx];
+      result = bias_accumulator_(result, bias_converter(bias));
+    }
+
+    // Convert to the output
+    NumericArrayConverter<ElementOutput, ElementCompute, kElementsPerAccess> output_converter;
+    OutputVector& output = reinterpret_cast<OutputVector*>(&fragment_D_)[frag_idx];
+    output = output_converter(result);
+  }
+
+  /// Called at the end of a row
+  CUTLASS_DEVICE
+  void end_row(int row_idx) {}
+
+  /// Called after all accumulator elements have been visited
+  CUTLASS_DEVICE
+  void end_step(int step_idx) {
+    iterator_D_.store(fragment_D_);
+    ++iterator_D_;
+  }
+
+  /// Called after all steps have been completed
+  CUTLASS_DEVICE
+  void end_epilogue() {}
+
+ private:
+  CUTLASS_DEVICE
+  ComputeFragment per_token_channel_scale_accumulator_(
+      ComputeFragment const& accum, ComputeFragment const& scale_col, AlphaScaleElementType const& scale_row) {
+    ComputeFragment result;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ComputeFragment::kElements; ++i) {
+      result[i] = accum[i] * (scale_col[i] * scale_row);
+    }
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  ComputeFragment per_token_scale_accumulator_(
+      ComputeFragment const& accum, AlphaScaleElementType const& scale_col, AlphaScaleElementType const& scale_row) {
+    ComputeFragment result;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < ComputeFragment::kElements; ++i) {
+      result[i] = accum[i] * (scale_col * scale_row);
+    }
+
+    return result;
+  }
+
+  CUTLASS_DEVICE
+  ComputeFragment bias_accumulator_(ComputeFragment const& accum, ComputeFragment const& bias) {
+    ComputeFragment result;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < OutputVector::kElements; ++i) {
+      result[i] = accum[i] + bias[i];
+    }
+    return result;
+  }
+};
+
+}  // namespace threadblock
+}  // namespace epilogue
+}  // namespace cutlass
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl
new file mode 100644
index 000000000..db1fdf1e7
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/collective/collective_builder_decl.hpp"
+#include "cutlass/gemm/collective/collective_mma_decl.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/pipeline/sm90_pipeline.hpp"
+
+// SM90 Collective Builders should be used only starting CUDA 12.0
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_RS
+template <
+    class ElementA_,
+    class GmemLayoutATag_,
+    int AlignmentA,
+    class ElementB_,
+    class GmemLayoutBTag_,
+    int AlignmentB,
+    class ElementAccumulator,
+    class TileShape_MNK,
+    class ClusterShape_MNK,
+    class StageCountType,
+    class KernelScheduleType>
+struct CollectiveBuilderMixedInput<
+    arch::Sm90,
+    arch::OpClassTensorOp,
+    ElementA_,
+    GmemLayoutATag_,
+    AlignmentA,
+    ElementB_,
+    GmemLayoutBTag_,
+    AlignmentB,
+    ElementAccumulator,
+    TileShape_MNK,
+    ClusterShape_MNK,
+    StageCountType,
+    KernelScheduleType,
+    cute::enable_if_t<
+        (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+         cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+         cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative> ||
+         cute::is_same_v<KernelScheduleType, KernelPtrArrayTmaWarpSpecializedCooperative> ||
+         cute::is_same_v<KernelScheduleType, KernelPtrArrayTmaWarpSpecializedPingpong>) &&
+        (detail::is_use_rmem_A<ElementA_, GmemLayoutATag_, ElementB_, GmemLayoutBTag_>() ||
+         // ConvertAndScale and ConvertAndScaleWithZero
+         cute::is_tuple<ElementA_>::value || cute::is_tuple<ElementB_>::value ||
+         // DirectConvert
+         sizeof_bits<ElementA_>::value != sizeof_bits<ElementB_>::value)>> {
+ private:
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementA_>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementB_>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementA_>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementB_>;
+  static constexpr bool NeitherIsTuple = !cute::is_tuple<ElementA_>::value && !cute::is_tuple<ElementB_>::value;
+  // Determine if mixed input types.
+  static constexpr bool IsMixedInput = cute::sizeof_bits_v<detail::deduce_mixed_width_dtype_t<0, ElementA_>> !=
+                                       cute::sizeof_bits_v<detail::deduce_mixed_width_dtype_t<0, ElementB_>>;
+  static constexpr bool IsArrayOfPointersGemm = cute::is_any_of_v<
+      KernelScheduleType,
+      KernelPtrArrayTmaWarpSpecializedCooperative,
+      KernelPtrArrayTmaWarpSpecializedPingpong>;
+  static_assert(IsMixedInput || !IsArrayOfPointersGemm, "Only mixed input grouped RS GEMM is supported.");
+
+ public:
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementA_>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementB_>;
+
+  static_assert(
+      !IsMixedInput || (cute::is_tuple<ElementA_>::value ^ cute::is_tuple<ElementB_>::value ||
+                        (NeitherIsTuple && (sizeof_bits<ElementA>::value != sizeof_bits<ElementB>::value))),
+      "Either A OR B must be a tuple or the widths of A and B must be different.");
+
+  static constexpr bool IsANarrow = sizeof_bits<ElementA>::value < sizeof_bits<ElementB>::value;
+
+  template <class T>
+  static auto get_stride(T const& t) {
+    if constexpr (not cute::is_layout<cute::remove_pointer_t<T>>::value) {
+      return t;
+    } else {
+      if constexpr (cute::is_pointer_v<T>) {
+        return &cute::stride(*t);
+      } else {
+        return cute::stride(t);
+      }
+    }
+  }
+
+  using GmemLayoutATag = decltype(get_stride(GmemLayoutATag_{}));
+  using GmemLayoutBTag = decltype(get_stride(GmemLayoutBTag_{}));
+
+  using ElementPairA =
+      cute::conditional_t<IsMixedInput && IsANarrow && NeitherIsTuple, cute::tuple<ElementA>, ElementA_>;
+  using ElementPairB =
+      cute::conditional_t<IsMixedInput && !IsANarrow && NeitherIsTuple, cute::tuple<ElementB>, ElementB_>;
+
+  static constexpr bool IsATransformed = cute::is_tuple<ElementPairA>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(
+      detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+      "Should meet TMA alignment requirement\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
+  // If A is scaled, then we don't need to swap. Otherwise, we must ensure B goes to rmem and we must swap the
+  // operands.
+  static constexpr bool SwapAB =
+      IsMixedInput ? !IsATransformed : detail::is_swapAB<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>();
+  static constexpr bool IsWarpSpecializedTransposeB =
+      detail::is_warpspecialized_transpose_B<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag, KernelScheduleType>();
+  static_assert(!IsMixedInput || !IsWarpSpecializedTransposeB, "Mixed input GEMM does not support WS transpose B.");
+
+  // When we relax the above assertion, we must handle setting the tile mma GmmaMajorB correctly.
+  static constexpr cute::GMMA::Major TiledMmaGmmaMajorB = SwapAB ? GmmaMajorA : GmmaMajorB;
+
+  // For fp32 types, map to tf32 MMA value type.
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  // Handle mixed dtypes and MMA.
+  using RealElementA = cute::conditional_t<SwapAB, ElementBMma, ElementAMma>;
+  using RealElementB = cute::conditional_t<SwapAB, ElementAMma, ElementBMma>;
+  using RealElementAMma = cute::conditional_t<IsMixedInput, RealElementB, RealElementA>;
+  // Always the same for element B.
+  using RealElementBMma = RealElementB;
+
+  static_assert(
+      !IsMixedInput || TiledMmaGmmaMajorB == GMMA::Major::K || sizeof_bits<RealElementB>::value == 16,
+      "Mixed input GEMM does not support MN major layout except for 16bit");
+
+  using AtomLayoutMNK = cute::conditional_t<
+      cute::is_any_of_v<
+          KernelScheduleType,
+          KernelTmaWarpSpecializedCooperative,
+          KernelPtrArrayTmaWarpSpecializedCooperative>,
+      Layout<Shape<_2, _1, _1>>,
+      Layout<Shape<_1, _1, _1>>>;
+
+  using TiledMma = decltype(cute::make_tiled_mma(
+      cute::GMMA::rs_op_selector<
+          RealElementAMma,
+          RealElementBMma,
+          ElementAccumulator,
+          TileShape_MNK,
+          GMMA::Major::K,
+          GMMA::Major::K>(),
+      AtomLayoutMNK{}));
+
+  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+
+  using SmemLayoutAtomA = decltype(detail::rs_smem_selector<
+                                   GmmaMajorA,
+                                   ElementAMma,
+                                   decltype(cute::get<0>(TileShape_MNK{})),
+                                   decltype(cute::get<2>(TileShape_MNK{})),
+                                   IsWarpSpecializedTransposeB>());
+  using SmemLayoutAtomB = decltype(detail::rs_smem_selector<
+                                   GmmaMajorB,
+                                   ElementBMma,
+                                   decltype(cute::get<1>(TileShape_MNK{})),
+                                   decltype(cute::get<2>(TileShape_MNK{})),
+                                   IsWarpSpecializedTransposeB>());
+
+  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutAtomA{});
+  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutAtomB{});
+  static constexpr int SmemAlignment = static_cast<int>(cute::max(SmemAlignmentA, SmemAlignmentB));
+
+  // Handle mixed dtype array GEMM's size of tensor map storage.
+  static constexpr size_t TensorMapStorage = sizeof(cute::TmaDescriptor) * size_t(IsMixedInput) * 4;
+  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
+  static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout;
+
+  static constexpr int PipelineStages =
+      IsMixedInput ? (IsArrayOfPointersGemm ? detail::compute_stage_count_or_override_single_affine_transformed_input<
+                                                  Sm90ReducedSmemCapacityBytes,
+                                                  RealElementA,
+                                                  RealElementB,
+                                                  ElementScale,
+                                                  ElementZero,
+                                                  TileShape_MNK,
+                                                  StageCountType::bytes,
+                                                  SmemAlignment>(StageCountType{})
+                                            : detail::compute_stage_count_or_override_single_affine_transformed_input<
+                                                  detail::sm90_smem_capacity_bytes,
+                                                  RealElementA,
+                                                  RealElementB,
+                                                  ElementScale,
+                                                  ElementZero,
+                                                  TileShape_MNK,
+                                                  StageCountType::bytes,
+                                                  SmemAlignment>(StageCountType{}))
+                   : detail::compute_stage_count_or_override<
+                         detail::sm90_smem_capacity_bytes,
+                         ElementAMma,
+                         ElementBMma,
+                         TileShape_MNK,
+                         StageCountType::bytes,
+                         SmemAlignment>(StageCountType{});
+
+  using DispatchPolicy = cute::conditional_t<
+      IsMixedInput,
+      cute::conditional_t<
+          IsArrayOfPointersGemm,
+          MainloopSm90ArrayTmaGmmaWarpSpecializedMixedInput<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+          MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<PipelineStages, ClusterShape_MNK, KernelScheduleType>>,
+      MainloopSm90TmaGmmaRmemAWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>>;
+
+  using SmemCopyAtomA = cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
+  using SmemCopyAtomB = cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
+
+  // We pack the scale data with the operand that will be optionally scaled and converted before MMA.
+  using StrideA = cute::conditional_t<
+      cute::is_layout<cute::remove_pointer_t<GmemLayoutATag_>>::value,
+      GmemLayoutATag_,
+      TagToStrideA_t<GmemLayoutATag>>;
+  using StrideB = cute::conditional_t<
+      cute::is_layout<cute::remove_pointer_t<GmemLayoutBTag_>>::value,
+      GmemLayoutBTag_,
+      TagToStrideB_t<GmemLayoutBTag>>;
+
+  using CollectiveOp = CollectiveMmaArrayMixedInput<
+      DispatchPolicy,
+      TileShape_MNK,
+      ElementPairA,
+      StrideA,
+      ElementPairB,
+      StrideB,
+      TiledMma,
+      GmemTiledCopyA,
+      SmemLayoutAtomA,
+      SmemCopyAtomA,
+      cute::identity,
+      GmemTiledCopyB,
+      SmemLayoutAtomB,
+      SmemCopyAtomB,
+      cute::identity>;
+
+  static_assert(
+      SmemAlignment == static_cast<int>(cute::max(CollectiveOp::SmemAlignmentA, CollectiveOp::SmemAlignmentB)));
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp
new file mode 100644
index 000000000..8288ace95
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp"
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    class ArchTag,
+    class OpClass,
+    class ElementA,
+    class GmemLayoutA,
+    int AlignmentA,
+    class ElementB,
+    class GmemLayoutB,
+    int AlignmentB,
+    class ElementAccumulator,
+    class TileShape_MNK,
+    class ClusterShape_MNK,
+    class StageCountType,
+    class KernelScheduleType,
+    class Enable = void>
+struct CollectiveBuilderMixedInput {
+  static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp
new file mode 100644
index 000000000..08afdffd4
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cutlass/detail/dependent_false.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    class DispatchPolicy,
+    class TileShape,
+    class ElementA,
+    class StrideA,
+    class ElementB,
+    class StrideB,
+    class TiledMma,
+    class GmemTiledCopyA,
+    class SmemLayoutAtomA,
+    class SmemCopyAtomA,
+    class TransformA,
+    class GmemTiledCopyB,
+    class SmemLayoutAtomB,
+    class SmemCopyAtomB,
+    class TransformB>
+struct CollectiveMmaArrayMixedInput {
+  static_assert(cutlass::detail::dependent_false<ElementA>, "Could not find a mainloop specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
new file mode 100644
index 000000000..b37d5696c
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
@@ -0,0 +1,1538 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+#include "cutlass_extensions/detail/collective/mixed_input_utils.hpp"
+
+#define GROUP_SIZE 128
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+    int Stages,
+    class ClusterShape,
+    class KernelSchedule_,
+    class TileShape_,
+    class ElementAOptionalTuple,
+    class StrideA_,
+    class ElementBOptionalTuple,
+    class StrideB_,
+    class TiledMma_,
+    class GmemTiledCopyA_,
+    class SmemLayoutAtomA_,
+    class SmemCopyAtomA_,
+    class TransformA_,
+    class GmemTiledCopyB_,
+    class SmemLayoutAtomB_,
+    class SmemCopyAtomB_,
+    class TransformB_>
+struct CollectiveMmaArrayMixedInput<
+    MainloopSm90ArrayTmaGmmaWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule_>,
+    TileShape_,
+    ElementAOptionalTuple,
+    StrideA_,
+    ElementBOptionalTuple,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_> {
+ public:
+  enum class ConversionMode { DirectConvert, ConvertAndScale, ConvertAndScaleWithZero };
+
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90ArrayTmaGmmaWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule_>;
+  using TileShape = TileShape_;
+  using KernelSchedule = KernelSchedule_;
+
+ private:
+  template <class T>
+  friend struct detail::MixedGroupedGemmInputUtils;
+  using CollectiveType = CollectiveMma<
+      DispatchPolicy,
+      TileShape_,
+      ElementAOptionalTuple,
+      StrideA_,
+      ElementBOptionalTuple,
+      StrideB_,
+      TiledMma_,
+      GmemTiledCopyA_,
+      SmemLayoutAtomA_,
+      SmemCopyAtomA_,
+      TransformA_,
+      GmemTiledCopyB_,
+      SmemLayoutAtomB_,
+      SmemCopyAtomB_,
+      TransformB_>;
+  using Utils = detail::MixedGroupedGemmInputUtils<CollectiveType>;
+
+  //
+  // Type Aliases
+  //
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementAOptionalTuple>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementBOptionalTuple>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementAOptionalTuple>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementBOptionalTuple>;
+
+ public:
+  static_assert(
+      cute::is_tuple<ElementAOptionalTuple>::value ^ cute::is_tuple<ElementBOptionalTuple>::value,
+      "Either A OR B must be a tuple. It must take the from {ElementOperand, [ElementScale], [ElementZero]}. Inputs in "
+      "[] are optional.");
+
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementAOptionalTuple>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementBOptionalTuple>;
+  static constexpr bool IsATransformed = cute::is_tuple<ElementAOptionalTuple>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+  // For cases where we can't have a void type, we can use this to allow the code to compile when the scale / zero is
+  // void.
+  using NonVoidElementScale = cute::conditional_t<cute::is_void_v<ElementScale>, float, ElementScale>;
+  using NonVoidElementZero = cute::conditional_t<cute::is_void_v<ElementZero>, float, ElementZero>;
+
+  using StrideA = StrideA_;
+  using InternalStrideA = cute::remove_pointer_t<StrideA>;
+  using StrideB = StrideB_;
+  using InternalStrideB = cute::remove_pointer_t<StrideB>;
+
+  using StrideScale = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  using NonVoidStrideScale =
+      cute::conditional_t<cute::is_void_v<StrideScale>, cute::Stride<_1, int64_t, int64_t>, StrideScale>;
+
+  static_assert(
+      (IsATransformed && (cutlass::gemm::detail::is_k_major<StrideA>() || is_layout<StrideA>::value ||
+                          is_layout<InternalStrideA>::value)) ||
+          (!IsATransformed && (cutlass::gemm::detail::is_k_major<StrideB>() || is_layout<StrideB>::value ||
+                               is_layout<InternalStrideB>::value)),
+      "The transformed type must be K-major.");
+
+  static_assert(
+      (IsATransformed && (sizeof(ElementB) == 2)) || (!IsATransformed && (sizeof(ElementA) == 2)) ||
+          ((cutlass::gemm::detail::is_k_major<StrideA>() || is_layout<StrideA>::value ||
+            is_layout<InternalStrideA>::value) &&
+           (cutlass::gemm::detail::is_k_major<StrideB>() || is_layout<StrideB>::value ||
+            is_layout<InternalStrideB>::value)),
+      "The unscaled element must be 2 bytes OR both inputs must be K-major");
+
+  static_assert(
+      cutlass::gemm::detail::is_mn_major<NonVoidStrideScale>(),
+      "Scale must be MN major [Col Major if A is scaled, Row Major if B is scaled].");
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using GmemTiledCopyScale = cute::SM90_TMA_LOAD;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using SmemCopyAtomScale = Copy_Atom<cute::AutoVectorizingCopy, NonVoidElementScale>;
+
+  // We must ensure the type to be scaled goes to RF
+  static constexpr bool SwapAB = !IsATransformed;
+  using SwappedStrideA = cute::conditional_t<!SwapAB, StrideA, StrideB>;
+  using SwappedStrideB = cute::conditional_t<!SwapAB, StrideB, StrideA>;
+  using InternalSwappedStrideA = cute::conditional_t<!SwapAB, InternalStrideA, InternalStrideB>;
+  using InternalSwappedStrideB = cute::conditional_t<!SwapAB, InternalStrideB, InternalStrideA>;
+  using SwappedSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
+  using SwappedSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
+  using SwappedSmemCopyAtomA = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
+  using SwappedSmemCopyAtomB = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using ConvertedElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using ConvertedElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  using RealSwappedElementA = cute::conditional_t<!SwapAB, ElementA, ElementB>;
+  using RealSwappedElementB = cute::conditional_t<!SwapAB, ElementB, ElementA>;
+  using SwappedElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
+  using SwappedElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using SwappedTransformA = cute::conditional_t<!SwapAB, TransformA, TransformB>;
+  using SwappedTransformB = cute::conditional_t<!SwapAB, TransformB, TransformA>;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static constexpr int IsSubbyteA = cute::sizeof_bits_v<SwappedElementA> < 8;
+  using TmaElementA = cute::conditional_t<IsSubbyteA, uint8_t, SwappedElementA>;
+  using TmaElementScale = uint_bit_t<sizeof_bits_v<NonVoidElementScale>>;  // in case we have array. translating to uint
+                                                                           // to satisfy tma descriptor's specialization
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  static constexpr int NumProducerThreadEvents = 1;
+
+  using SmemLayoutAtomScale = Layout<Shape<decltype(cute::shape<0>(SwappedSmemLayoutAtomA{})), cute::Int<1>>>;
+  using ScaleTileShape = decltype(make_shape(shape<0>(TileShape{}), shape<1>(SmemLayoutAtomScale{})));
+
+  static_assert(cute::rank(SwappedSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert(
+      (size<0>(TileShape{}) % size<0>(SwappedSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert(
+      (size<2>(TileShape{}) % size<1>(SwappedSmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SwappedSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert(
+      (size<1>(TileShape{}) % size<0>(SwappedSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert(
+      (size<2>(TileShape{}) % size<1>(SwappedSmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(rank(SmemLayoutAtomScale{}) == 2, "SmemLayoutAtomScale must be rank 2");
+  static_assert(
+      (size<0>(TileShape{}) % size<0>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must equal the tile shape.");
+  static_assert(
+      (size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0,
+      "SmemLayoutAtomScale must evenly divide tile k shape.");
+
+  /// Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(detail::get_smem_layout<DispatchPolicy::Stages>(
+      SwappedSmemLayoutAtomA{}, select<0, 2>(TileShape{}), InternalSwappedStrideA{}));
+  using SmemLayoutB = decltype(detail::get_smem_layout<DispatchPolicy::Stages>(
+      SwappedSmemLayoutAtomB{}, select<1, 2>(TileShape{}), InternalSwappedStrideB{}));
+
+  // It is assumed that the scales and zero-points share the same smem layout
+  using SmemLayoutScale = decltype(tile_to_shape(
+      SmemLayoutAtomScale{},
+      make_shape(shape<0>(ScaleTileShape{}), shape<1>(ScaleTileShape{}), Int<Stages>{}),
+      cute::conditional_t<
+          ::cutlass::gemm::detail::is_major<0, NonVoidStrideScale>(),
+          Step<_2, _1, _3>,
+          Step<_1, _2, _3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(
+      not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+          cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+      "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
+  static_assert(
+      cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(
+      cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // To relax them, we need to handle loading more than 1 row of scales for every main loop iteration.
+  // We must also handle updating the pipeline transaction bytes on the fly.
+  static_assert(size<1>(SmemLayoutAtomScale{}) == 1, "size<1>(SmemLayoutAtomScale) must be 1.");
+
+ private:
+  static constexpr ConversionMode get_conversion_mode() {
+    if constexpr (cute::is_void_v<ElementScale>) {
+      return ConversionMode::DirectConvert;
+    } else if constexpr (cute::is_void_v<ElementZero>) {
+      return ConversionMode::ConvertAndScale;
+    } else {
+      return ConversionMode::ConvertAndScaleWithZero;
+    }
+  }
+
+ public:
+  static constexpr ConversionMode KernelConversionMode = get_conversion_mode();
+  static constexpr bool ModeHasScales = KernelConversionMode == ConversionMode::ConvertAndScale ||
+                                        KernelConversionMode == ConversionMode::ConvertAndScaleWithZero;
+  static constexpr bool UseScaleLookupTable =
+      KernelConversionMode == ConversionMode::ConvertAndScale && cutlass::detail::is_Array_v<ElementScale>;
+  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{});
+  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{});
+  static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB);
+
+  static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment");
+
+  struct SharedStorage {
+    static constexpr int scale_elements = Utils::elements_per_smem_scale();
+    static constexpr int zero_elements = Utils::elements_per_smem_zero();
+    struct TensorStorage {
+      CUTE_ALIGNAS(SmemAlignmentA) cute::ArrayEngine<RealSwappedElementA, cute::cosize_v<SmemLayoutA>> smem_A;
+      CUTE_ALIGNAS(SmemAlignmentB) cute::ArrayEngine<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
+      cute::ArrayEngine<NonVoidElementScale, scale_elements> smem_scale;
+      cute::ArrayEngine<NonVoidElementZero, zero_elements> smem_zero;
+    } tensors;
+
+    struct TensorMapStorage {
+      cute::TmaDescriptor smem_tensormap_A;
+      cute::TmaDescriptor smem_tensormap_B;
+      cute::TmaDescriptor smem_tensormap_scale;
+      cute::TmaDescriptor smem_tensormap_zero;
+    };
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using TensorMapStorage = typename SharedStorage::TensorMapStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>;
+
+  // kernel Arguments
+  // Host side kernel arguments
+  struct Arguments {
+    ElementA const** ptr_A;
+    StrideA dA;
+    ElementB const** ptr_B;
+    StrideB dB;
+    ElementScale const** ptr_S = nullptr;
+    NonVoidStrideScale const* dS{};
+    int chunk_size = 0;
+    ElementZero const** ptr_Z = nullptr;
+  };
+
+  // Device side kernel params
+  struct Params {
+    // Assumption: StrideA is congruent with Problem_MK
+    using LayoutA =
+        decltype(detail::get_gmem_layout(repeat_like(InternalSwappedStrideA{}, int32_t(0)), InternalSwappedStrideA{}));
+    using LayoutB =
+        decltype(detail::get_gmem_layout(repeat_like(InternalSwappedStrideB{}, int32_t(0)), InternalSwappedStrideB{}));
+
+    using TMA_A = decltype(make_tma_copy<TmaElementA>(
+        GmemTiledCopyA{},
+        make_tensor(detail::get_logical_ptr(static_cast<SwappedElementA const*>(nullptr)), LayoutA{}),
+        SmemLayoutA{}(_, _, cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy(
+        GmemTiledCopyB{},
+        make_tensor(detail::get_logical_ptr(static_cast<SwappedElementB const*>(nullptr)), LayoutB{}),
+        SmemLayoutB{}(_, _, cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})));  // mcast along M mode for this N load, if any
+
+    using TMA_Scale = decltype(make_tma_copy<TmaElementScale>(
+        GmemTiledCopyScale{},
+        make_tensor(
+            detail::get_logical_ptr(static_cast<NonVoidElementScale const*>(nullptr)),
+            repeat_like(NonVoidStrideScale{}, int32_t(0)),
+            NonVoidStrideScale{}),
+        SmemLayoutScale{}(_, _, cute::Int<0>{}),
+        ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF kernel
+
+    using TMA_Zero = decltype(make_tma_copy(
+        GmemTiledCopyScale{},
+        make_tensor(
+            detail::get_logical_ptr(static_cast<NonVoidElementZero const*>(nullptr)),
+            repeat_like(NonVoidStrideScale{}, int32_t(0)),
+            NonVoidStrideScale{}),
+        SmemLayoutScale{}(_, _, cute::Int<0>{}),
+        ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF kernel
+
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+    TMA_Scale tma_load_scale;
+    TMA_Zero tma_load_zero;
+    void* tensormaps;
+    SwappedElementA const** ptr_A;
+    SwappedStrideA ptr_dA;
+    SwappedElementB const** ptr_B;
+    SwappedStrideB ptr_dB;
+    NonVoidElementScale const** ptr_S;
+    NonVoidStrideScale const* dS;
+    NonVoidElementZero const** ptr_Z;
+    int64_t scale_k;
+    int chunk_size;
+    int reload_factor = (chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{});
+    InternalSwappedStrideA dA;
+    InternalSwappedStrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params to_underlying_arguments(ProblemShape problem_shapes, Arguments const& args, void* workspace) {
+    // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc.
+    // These will be replaced with correct values before the initial tma load.
+    auto init_shape = repeat_like(typename ProblemShape::UnderlyingProblemShape{}, int32_t(1));
+    auto init_M = get<0>(init_shape);
+    auto init_N = get<1>(init_shape);
+    auto init_K = get<2>(init_shape);
+
+    if constexpr (SwapAB) {
+      init_M = get<1>(init_shape);
+      init_N = get<0>(init_shape);
+    }
+    // Batches/Groups are managed by using appropriate pointers to input matrices
+    const uint32_t mock_L = 1;
+    SwappedElementA const* ptr_A_first_batch;
+    SwappedElementB const* ptr_B_first_batch;
+    SwappedStrideA ptr_dA;
+    SwappedStrideB ptr_dB;
+    InternalSwappedStrideA dA;
+    InternalSwappedStrideB dB;
+
+    if constexpr (not SwapAB) {
+      ptr_A_first_batch = reinterpret_cast<SwappedElementA const*>(args.ptr_A);
+      ptr_B_first_batch = reinterpret_cast<SwappedElementB const*>(args.ptr_B);
+    } else {
+      ptr_A_first_batch = reinterpret_cast<SwappedElementA const*>(args.ptr_B);
+      ptr_B_first_batch = reinterpret_cast<SwappedElementB const*>(args.ptr_A);
+    }
+
+    if constexpr (IsGroupedGemmKernel) {
+      // Strides for Grouped Gemm will be replaced prior to the first access regardless.
+      if constexpr (not SwapAB) {
+        ptr_dA = args.dA;
+        ptr_dB = args.dB;
+      } else {
+        ptr_dA = args.dB;
+        ptr_dB = args.dA;
+      }
+      dA = InternalSwappedStrideA{};
+      if constexpr (is_layout<InternalSwappedStrideA>::value) {
+        dA = make_layout(
+            transform_leaf(
+                dA.shape(),
+                [](auto x) {
+                  if constexpr (not is_static_v<decltype(x)>) {
+                    return static_cast<decltype(x)>(1);
+                  } else {
+                    return x;
+                  }
+                }),
+            dA.stride());
+      }
+      dB = InternalSwappedStrideB{};
+    } else {
+      // Tensor shapes for Ptr-Array are initialized correctly only here.
+      auto problem_shape_MNK = problem_shapes.get_host_problem_shape(0);
+      init_M = get<0>(problem_shape_MNK);
+      init_N = get<1>(problem_shape_MNK);
+      init_K = get<2>(problem_shape_MNK);
+
+      if constexpr (not SwapAB) {
+        dA = args.dA;
+        dB = args.dB;
+      } else {
+        dA = args.dB;
+        dB = args.dA;
+      }
+      ptr_dA = SwappedStrideA{};
+      ptr_dB = SwappedStrideB{};
+    }
+    Tensor tensor_a = make_tensor(ptr_A_first_batch, detail::get_gmem_layout(make_shape(init_M, init_K, mock_L), dA));
+    Tensor tensor_b = make_tensor(ptr_B_first_batch, detail::get_gmem_layout(make_shape(init_N, init_K, mock_L), dB));
+
+    typename Params::TMA_A tma_load_a = make_tma_copy<TmaElementA>(
+        GmemTiledCopyA{},
+        tensor_a,
+        SmemLayoutA{}(_, _, cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{}));  // mcast along N mode for this M load, if any
+    typename Params::TMA_B tma_load_b = make_tma_copy(
+        GmemTiledCopyB{},
+        tensor_b,
+        SmemLayoutB{}(_, _, cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{}));  // mcast along M mode for this N load, if any
+    typename Params::TMA_Scale tma_load_scale{};
+    typename Params::TMA_Zero tma_load_zero{};
+
+    void* tensormaps = workspace;
+    auto args_setup =
+        [&](auto ptr_A, auto ptr_B, int64_t scale_k = 0, int chunk_size = 0, int reload_factor = 1) -> Params {
+      return {
+          tma_load_a,
+          tma_load_b,
+          TmaTransactionBytes,
+          tma_load_scale,
+          tma_load_zero,
+          tensormaps,
+          reinterpret_cast<SwappedElementA const**>(ptr_A),
+          ptr_dA,
+          reinterpret_cast<SwappedElementB const**>(ptr_B),
+          ptr_dB,
+          reinterpret_cast<NonVoidElementScale const**>(args.ptr_S),
+          args.dS,
+          reinterpret_cast<NonVoidElementZero const**>(args.ptr_Z),
+          scale_k,
+          chunk_size,
+          reload_factor,
+          dA,
+          dB};
+    };
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return SwapAB ? args_setup(args.ptr_B, args.ptr_A) : args_setup(args.ptr_A, args.ptr_B);
+    } else if constexpr (ModeHasScales) {
+      auto fake_scale_k = 1;
+      ElementScale const* ptr_S = reinterpret_cast<ElementScale const*>(args.ptr_S);
+      StrideScale dS{};
+      Tensor tensor_scale =
+          make_tensor(detail::get_logical_ptr(ptr_S), make_layout(make_shape(init_M, fake_scale_k, mock_L), dS));
+      tma_load_scale = make_tma_copy<TmaElementScale>(
+          GmemTiledCopyScale{},
+          tensor_scale,
+          SmemLayoutScale{}(_, _, cute::Int<0>{}),
+          ScaleTileShape{},
+          _1{});  // mcast along N mode for this M load, if any
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return SwapAB ? args_setup(
+                            args.ptr_B,
+                            args.ptr_A,
+                            fake_scale_k,
+                            args.chunk_size,
+                            (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}))
+                      : args_setup(
+                            args.ptr_A,
+                            args.ptr_B,
+                            fake_scale_k,
+                            args.chunk_size,
+                            (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}));
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        ElementZero const* ptr_Z = reinterpret_cast<ElementZero const*>(args.ptr_Z);
+        Tensor tensor_zero =
+            make_tensor(detail::get_logical_ptr(ptr_Z), make_layout(make_shape(init_M, fake_scale_k, mock_L), dS));
+        tma_load_zero = make_tma_copy(
+            GmemTiledCopyScale{},
+            tensor_zero,
+            SmemLayoutScale{}(_, _, cute::Int<0>{}),
+            ScaleTileShape{},
+            _1{});  // mcast along N mode for this M load, if any
+        return SwapAB ? args_setup(
+                            args.ptr_B,
+                            args.ptr_A,
+                            fake_scale_k,
+                            args.chunk_size,
+                            (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}))
+                      : args_setup(
+                            args.ptr_A,
+                            args.ptr_B,
+                            fake_scale_k,
+                            args.chunk_size,
+                            (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}));
+
+      } else {
+        static_assert(
+            cutlass::detail::dependent_false<KernelSchedule>,
+            "Conversion mode not handled in to_underlying_arguments.");
+      }
+    } else {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in to_underlying_arguments.");
+    }
+  }
+
+  template <class ProblemShape>
+  static size_t get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) {
+    constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor);
+
+    // Calculating workspace size
+    auto calculate_workspace_size = [SizeOfCuTensorMap, sm_count](uint32_t num_input_tensors) {
+      return num_input_tensors * SizeOfCuTensorMap * sm_count;
+    };
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies
+      return calculate_workspace_size(2);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies,
+      // followed by scale tensormap copies
+      return calculate_workspace_size(3);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies,
+      // followed by scale and zeros tensormap copies
+      return calculate_workspace_size(4);
+    } else {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in get_workspace_size.");
+    }
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status initialize_workspace(
+      ProblemShape const& problem_shape,
+      Arguments const& args,
+      void* workspace,
+      cudaStream_t stream,
+      CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  template <class ProblemShape>
+  CUTLASS_HOST_DEVICE static bool can_implement(ProblemShape problem_shapes, Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+
+    bool implementable = true;
+    if (problem_shapes.is_host_problem_shape_available()) {
+      // Check alignment for all problem sizes
+      for (int i = 0; i < problem_shapes.groups(); i++) {
+        auto problem_shape_MNKL = append<4>(problem_shapes.get_host_problem_shape(i), 1);
+        auto [M, N, K, L] = problem_shape_MNKL;
+        auto get_stride = [](auto stride) {
+          if constexpr (cute::is_pointer_v<cute::decay_t<decltype(stride)>>) {
+            return *stride;
+          } else {
+            return stride;
+          }
+        };
+        auto dA = get_stride(args.dA);
+        auto dB = get_stride(args.dB);
+        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(
+                                             detail::get_gmem_layout(cute::make_shape(M, K, L), dA));
+        implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(
+                                             detail::get_gmem_layout(cute::make_shape(N, K, L), dB));
+        if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+          implementable = implementable && (args.ptr_S == nullptr);
+          implementable = implementable && (args.ptr_Z == nullptr);
+        } else if constexpr (ModeHasScales) {
+          const int scale_mn = SwapAB ? N : M;
+          const int scale_k = (K + args.chunk_size - 1) / args.chunk_size;
+          constexpr int min_tma_aligned_elements_scale = tma_alignment_bits / cutlass::sizeof_bits<ElementScale>::value;
+          implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_scale>(
+                                               cute::make_shape(scale_mn, scale_k, L), StrideScale{});
+          implementable = implementable && (args.chunk_size == K || ((args.chunk_size % size<2>(TileShape{})) == 0));
+          implementable = implementable && args.chunk_size != 0;
+          implementable = implementable && (args.ptr_S != nullptr);
+          if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+            implementable = implementable && (args.ptr_Z == nullptr);
+          } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+            constexpr int min_tma_aligned_elements_zero = tma_alignment_bits / cutlass::sizeof_bits<ElementZero>::value;
+            implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_zero>(
+                                                 cute::make_shape(scale_mn, scale_k, L), StrideScale{});
+            implementable = implementable && (args.ptr_Z != nullptr);
+          } else {
+            static_assert(
+                cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in can_implement.");
+          }
+        } else {
+          static_assert(
+              cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in can_implement.");
+        }
+      }
+    }
+
+    if (!implementable) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+  static constexpr int K_PIPE_MMAS = 1;
+  static constexpr uint32_t TmaTransactionBytesMK = Utils::compute_tma_transaction_bytes_mk();
+  static constexpr uint32_t TmaTransactionBytesNK = Utils::compute_tma_transaction_bytes_nk();
+  static constexpr uint32_t TmaTransactionBytesExtra = Utils::compute_tma_transaction_bytes_extra();
+  static constexpr uint32_t TmaTransactionBytes =
+      TmaTransactionBytesMK + TmaTransactionBytesNK + TmaTransactionBytesExtra;
+
+  // Set up the data needed by this collective for load and mma.
+  // Returns a tuple of tensors. The collective and the kernel layer have the contract that the
+  // returned tuple must contain at least two elements, with the first two elements being:
+  // gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  // gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  // The rest of the tensors can be specified as needed by this collective.
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M, N, K, L] = problem_shape_MNKL;
+    const int32_t mock_L = 1;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(
+        shape(detail::get_gmem_layout(make_shape(M, K, mock_L), mainloop_params.dA)));  // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(
+        shape(detail::get_gmem_layout(make_shape(N, K, mock_L), mainloop_params.dB)));  // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_, _, _), Step<_1, X, _1>{});  // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_, _, _), Step<X, _1, _1>{});  // (BLK_N,BLK_K,n,k,l)
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple(gA_mkl, gB_nkl);
+    } else if constexpr (ModeHasScales) {
+      // The real scale_k that actually works
+      // auto scale_k = K / mainloop_params.chunk_size;
+      auto scale_k = K / GROUP_SIZE;
+
+      Tensor mS_mkl = mainloop_params.tma_load_scale.get_tma_tensor(make_shape(M, scale_k, L));  // (m,scale_k,l)
+      Tensor gS_mkl = local_tile(mS_mkl, ScaleTileShape{}, make_coord(_, _));  // (BLK_M,BLK_Scale_K,m,scale_k,l)
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor mZ_mkl = mainloop_params.tma_load_zero.get_tma_tensor(make_shape(M, scale_k, L));  // (m,scale_k,l)
+        Tensor gZ_mkl = local_tile(mZ_mkl, ScaleTileShape{}, make_coord(_, _));  // (BLK_M,BLK_Scale_K,m,scale_k,l)
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl, gZ_mkl);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in load_init.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in load_init.");
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Perform a collective-scoped matrix multiply-accumulate
+  // Producer Perspective
+  template <class... Ts, class... TMs, class KTileIterator, class BlockCoord>
+  CUTLASS_DEVICE void load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline,
+      PipelineState smem_pipe_write,
+      cute::tuple<Ts...> const& load_inputs,
+      cute::tuple<TMs...> const& input_tensormaps,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter,
+      int k_tile_count,
+      int thread_idx,
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      static_assert(sizeof...(Ts) == 2, "Direct convert needs two inputs");
+      static_assert(sizeof...(TMs) == 2, "Direct convert needs two tensormaps");
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      static_assert(sizeof...(Ts) == 3, "Scaled convert needs three inputs");
+      static_assert(sizeof...(TMs) == 3, "Scaled convert needs three tensormaps");
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      static_assert(sizeof...(Ts) == 4, "Scaled and zero convert needs four inputs");
+      static_assert(sizeof...(TMs) == 4, "Scaled and zero convert needs four tensormaps");
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in TMA load.");
+    }
+
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+    Tensor sA = as_position_independent_swizzle_tensor(sA_);                                // (BLK_M,BLK_K,PIPE)
+    Tensor sB = as_position_independent_swizzle_tensor(sB_);                                // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Prepare the TMA loads for A and B
+    //
+
+    constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
+    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
+
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+    // Partition the inputs based on the current block coordinates.
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+    Tensor gA = gA_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+    Tensor gB = gB_nkl(_, _, n_coord, _, l_coord);  // (BLK_N,BLK_K,k)
+
+    // Applies the mapping from block_tma_a
+    Tensor tAgA = block_tma_a.partition_S(gA);  // (TMA,TMA_M,TMA_K,k)
+    Tensor tAsA = block_tma_a.partition_D(sA);  // (TMA,TMA_M,TMA_K,PIPE)
+
+    Tensor tBgB = block_tma_b.partition_S(gB);  // (TMA,TMA_N,TMA_K,k)
+    Tensor tBsB = block_tma_b.partition_D(sB);  // (TMA,TMA_N,TMA_K,PIPE)
+
+    uint16_t mcast_mask_a = 0;
+    uint16_t mcast_mask_b = 0;
+    uint16_t mcast_mask_s = 0;
+
+    // Issue TmaLoads
+    // Maps the tile -> block, value
+    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+      for (int n = 0; n < size<1>(block_layout); ++n) {
+        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x, n, Int<0>{}));
+      }
+    }
+
+    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+      for (int m = 0; m < size<0>(block_layout); ++m) {
+        mcast_mask_b |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, Int<0>{}));
+      }
+    }
+
+    auto extra_input_partitions = Utils::partition_extra_tma_inputs(
+        mainloop_params, load_inputs, shared_tensors, cluster_local_block_id, m_coord, l_coord);
+
+    // Mainloop
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; k_tile_count > 0; --k_tile_count) {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+
+      //
+      // Copy gmem to smem for *k_tile_iter
+      //
+
+      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+      int write_stage = smem_pipe_write.index();
+      if (cute::elect_one_sync()) {
+        copy(
+            mainloop_params.tma_load_a.with(get<0>(input_tensormaps), *tma_barrier, mcast_mask_a),
+            tAgA(_, _, _, *k_tile_iter),
+            tAsA(_, _, _, write_stage));
+        copy(
+            mainloop_params.tma_load_b.with(get<1>(input_tensormaps), *tma_barrier, mcast_mask_b),
+            tBgB(_, _, _, *k_tile_iter),
+            tBsB(_, _, _, write_stage));
+      }
+      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+        // Nothing extra to do.
+      } else if constexpr (ModeHasScales) {
+        // scale copy
+        auto tSgS = get<0>(extra_input_partitions);
+        auto tSsS = get<1>(extra_input_partitions);
+
+        // Temporary factor which will determine which k tile to reload from gmem. Needed so we don't modify tma
+        // transaction bytes on the fly. We must do a ceiling divide here to correctly handle with chunk_size == K. In
+        // that case, we don't require that K is a multiple of the threadblock tile K
+        const int scale_load_k = *k_tile_iter / 1;
+        // const int scale_load_k = *k_tile_iter / mainloop_params.reload_factor; // This will always be 0 when
+        // chunk_size == K.
+        if (cute::elect_one_sync()) {
+          copy(
+              mainloop_params.tma_load_scale.with(get<2>(input_tensormaps), *tma_barrier, mcast_mask_s),
+              tSgS(_, _, _, scale_load_k),
+              tSsS(_, _, _, write_stage));
+        }
+
+        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+          // Nothing extra to do
+        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+          // zero copy
+          auto tZgZ = get<2>(extra_input_partitions);
+          auto tZsZ = get<3>(extra_input_partitions);
+          if (cute::elect_one_sync()) {
+            copy(
+                mainloop_params.tma_load_zero.with(get<3>(input_tensormaps), *tma_barrier, mcast_mask_s),
+                tZgZ(_, _, _, scale_load_k),
+                tZsZ(_, _, _, write_stage));
+          }
+        } else {
+          static_assert(
+              cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for TMA copy op.");
+        }
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled for TMA copy op.");
+      }
+      ++k_tile_iter;
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+  }
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    int lane_predicate = cute::elect_one_sync();
+
+    // Issue the epilogue waits
+    if (lane_predicate) {
+      // This helps avoid early exit of blocks in Cluster.
+      // Waits for all stages to either be released (all
+      // Consumer UNLOCKs), or if the stage was never used
+      // then it would just be acquired since the phase was
+      // still inverted from make_producer_start_state.
+      pipeline.producer_tail(smem_pipe_write);
+    }
+  }
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Consumer Perspective
+  template <class FrgTensorC>
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SwappedSmemLayoutAtomA{}) == 2, "SwappedSmemLayoutAtomA must be rank 2.");
+    static_assert(cute::rank(SwappedSmemLayoutAtomB{}) == 2, "SwappedSmemLayoutAtomB must be rank 2.");
+    static_assert(
+        !cute::is_void_v<SwappedSmemCopyAtomA>,
+        "SM90 GMMA mainloops must specify a non-void copy atom for smem sourced instructions.");
+    static_assert(
+        cute::is_void_v<SwappedSmemCopyAtomB>,
+        "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    // Obtain warp index
+    int warp_idx = canonical_warp_idx_sync();
+    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
+
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+    Tensor sA = as_position_independent_swizzle_tensor(sA_);                                // (BLK_M,BLK_K,PIPE)
+
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(
+        stride<0>(typename TiledMma::BLayout{}) == 0 and
+            size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup,
+        "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCsA = mma_thread_slice.partition_A(sA);
+    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    // Allocate fragments and descriptors
+    Tensor tCrA_mma = mma_thread_slice.partition_fragment_A(sA(_, _, Int<0>{}));  // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrA_load = [&] {
+      if constexpr (not is_layout<InternalSwappedStrideA>::value) {
+        // Make register tensor with MMA layout
+        return make_fragment_like<RealSwappedElementA>(tCrA_mma);
+      } else {
+        // Make register tensor matching smem layout, converter will take care of de-swizzling
+        return make_tensor_like<RealSwappedElementA>(tCsA(_, _, _, Int<0>{}));
+      }
+    }();
+    Tensor tCsB = mma_warpgroup_slice.partition_B(sB);        // (MMA,MMA_N,MMA_K,PIPE)
+    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);  // (MMA,MMA_N,MMA_K,PIPE)
+
+    //
+    // Copy Atom A retiling
+    //
+    auto smem_tiled_copy_A = make_tiled_copy_A(SwappedSmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A = smem_tiled_copy_A.get_thread_slice(warp_group_thread_idx);
+
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA_load);  // (CPY,CPY_M,CPY_K)
+
+    // Partition of thread -> shared and thread -> RF
+    auto partitioned_extra_info = Utils::partition_extra_mma_info(mma_thread_slice, shared_tensors);
+    auto copy_partitions_extra_info =
+        Utils::retile_extra_mma_info(tiled_mma, partitioned_extra_info, warp_group_thread_idx);
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));      // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));      // CPY_K
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA_mma) == size<1>(accum));           // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));               // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));  // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));  // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    multiply_add<ElementAccumulator> fma;
+
+    constexpr int NumMMAsPerChunk = GROUP_SIZE / cute::get<0, 1>(tCsB.shape())();
+    constexpr int NumChunksPerTileK = cute::size<1>(sA.shape())() / GROUP_SIZE;
+    cute::array<decltype(make_fragment_like(accum)), NumChunksPerTileK> intermediate_array;
+
+    constexpr int K_BLOCK_MAX = size<2>(tCrA_load);
+    constexpr int K_WAIT_MAX = cute::min(K_BLOCK_MAX - 1, 7);
+    static_assert(K_BLOCK_MAX >= 4, "Consider increasing TileShapeK");
+
+    ConsumerToken barrier_token = {BarrierStatus::WaitAgain};
+    // First k tile
+    {
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+
+      ++smem_pipe_read;
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+
+      // copy smem->rmem for A operand
+
+      Utils::copy_tensors_MK(
+          smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info, copy_partitions_extra_info, 0, read_stage);
+      if (K_BLOCK_MAX > 1) {
+        Utils::copy_tensors_MK(
+            smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info, copy_partitions_extra_info, 1, read_stage);
+      }
+
+      // src: tCrA_load, dst: tCrA_mma
+      Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0);
+
+      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int chunk_id = 0; chunk_id < NumChunksPerTileK; ++chunk_id) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_id = 0; mma_id < NumMMAsPerChunk; ++mma_id) {
+          int k_block = chunk_id * NumMMAsPerChunk + mma_id;
+
+          warpgroup_arrive();
+
+          // (V,M) x (V,N) => (V,M,N)
+          cute::gemm(tiled_mma, tCrA_mma(_, _, k_block), tCrB(_, _, k_block, read_stage), intermediate_array[chunk_id]);
+          tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+
+          warpgroup_commit_batch();
+
+          if (k_block < K_BLOCK_MAX - 2) {
+            Utils::copy_tensors_MK(
+                smem_tiled_copy_A,
+                tCsA,
+                tCrA_copy_view,
+                partitioned_extra_info,
+                copy_partitions_extra_info,
+                k_block + 2,
+                read_stage);
+          }
+          if (k_block < K_BLOCK_MAX - 1) {
+            Utils::convert_A_kblock(tCrA_load, tCrA_mma, k_block + 1);
+          }
+        }
+      }
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) {
+        warpgroup_fence_operand(intermediate_array[chunk_id_]);
+
+        // Apply the group-wise scaling
+        // tCrS  ((4, _2, _2), MMA_M, _1)
+        // accum ((2, _2, _2), MMA_M, _1)
+        auto tCrS = cute::get<1>(partitioned_extra_info);
+        for (int mma_m = 0; mma_m < size<1>(accum); mma_m++) {
+          for (int m = 0; m < size<0, 1>(accum); m++) {
+            for (int n = 0; n < size<0, 2>(accum); n++) {
+              for (int e = 0; e < size<0, 0>(accum); e++) {
+                auto accum_coord = make_coord(make_tuple(e, m, n), mma_m, 0);
+                auto scale_coord = make_coord(make_tuple(0, m, 0), mma_m, 0);
+
+                if (chunk_id_ == 0) {
+                  accum(accum_coord) =
+                      intermediate_array[chunk_id_](accum_coord) * static_cast<float>(tCrS(scale_coord)[0]);
+                } else {
+                  accum(accum_coord) =
+                      fma(intermediate_array[chunk_id_](accum_coord),
+                          static_cast<float>(tCrS(scale_coord)[chunk_id_]),
+                          accum(accum_coord));
+                }
+              }
+            }
+          }
+        }
+      }
+
+      --k_tile_count;
+      if (k_tile_count > 0) {
+        // Wait for K_BLOCK_MAX - 1 to be in flight to ensure that it is safe to overwrite the A registers for the first
+        // mma.
+        pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+        Utils::copy_tensors_MK(
+            smem_tiled_copy_A,
+            tCsA,
+            tCrA_copy_view,
+            partitioned_extra_info,
+            copy_partitions_extra_info,
+            0,
+            smem_pipe_read.index());
+
+        Utils::copy_tensors_MK(
+            smem_tiled_copy_A,
+            tCsA,
+            tCrA_copy_view,
+            partitioned_extra_info,
+            copy_partitions_extra_info,
+            1,
+            smem_pipe_read.index());
+
+        warpgroup_wait<K_WAIT_MAX>();
+        Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0);
+      }
+    }
+
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    // Mainloop GMMAs
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; k_tile_count > 1; --k_tile_count) {
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+      ++smem_pipe_read;
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int chunk_id = 0; chunk_id < NumChunksPerTileK; ++chunk_id) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_id = 0; mma_id < NumMMAsPerChunk; ++mma_id) {
+          int k_block = chunk_id * NumMMAsPerChunk + mma_id;
+
+          warpgroup_arrive();
+          // (V,M) x (V,N) => (V,M,N)
+          cute::gemm(tiled_mma, tCrA_mma(_, _, k_block), tCrB(_, _, k_block, read_stage), intermediate_array[chunk_id]);
+          tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+          warpgroup_commit_batch();
+
+          warpgroup_wait<K_WAIT_MAX>();  // We have K_BLOCK_MAX - 1 GMMA instructions pending for this stage, so we can
+                                         // release prior barrier
+          if (k_block == K_BLOCK_MAX - 1) {
+            pipeline.consumer_release(smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+            ++smem_pipe_release;
+          }
+
+          if (k_block == 0) {
+            barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+          }
+
+          if (k_block == K_BLOCK_MAX - 1) {
+            // The last k_block
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) {
+              warpgroup_fence_operand(intermediate_array[chunk_id_]);
+
+              // Apply the group-wise scaling
+              auto tCrS = cute::get<1>(partitioned_extra_info);
+              for (int mma_m = 0; mma_m < size<1>(accum); mma_m++) {
+                for (int m = 0; m < size<0, 1>(accum); m++) {
+                  for (int n = 0; n < size<0, 2>(accum); n++) {
+                    for (int e = 0; e < size<0, 0>(accum); e++) {
+                      auto accum_coord = make_coord(make_tuple(e, m, n), mma_m, 0);
+                      auto scale_coord = make_coord(make_tuple(0, m, 0), mma_m, 0);
+
+                      accum(accum_coord) =
+                          fma(intermediate_array[chunk_id_](accum_coord),
+                              static_cast<float>(tCrS(scale_coord)[chunk_id_]),
+                              accum(accum_coord));
+                    }
+                  }
+                }
+              }
+            }
+
+            pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+            // copy scales when passing k_block=0
+            Utils::copy_tensors_MK(
+                smem_tiled_copy_A,
+                tCsA,
+                tCrA_copy_view,
+                partitioned_extra_info,
+                copy_partitions_extra_info,
+                0,
+                smem_pipe_read.index());
+            Utils::copy_tensors_MK(
+                smem_tiled_copy_A,
+                tCsA,
+                tCrA_copy_view,
+                partitioned_extra_info,
+                copy_partitions_extra_info,
+                1,
+                smem_pipe_read.index());
+            Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0);
+          } else {
+            if (k_block < K_BLOCK_MAX - 2) {
+              Utils::copy_tensors_MK(
+                  smem_tiled_copy_A,
+                  tCsA,
+                  tCrA_copy_view,
+                  partitioned_extra_info,
+                  copy_partitions_extra_info,
+                  k_block + 2,
+                  read_stage);
+            }
+            Utils::convert_A_kblock(tCrA_load, tCrA_mma, k_block + 1);
+          }
+        }
+      }
+    }
+
+    {
+      //
+      // Last k tile
+      //
+      Tensor intermediate = make_fragment_like(accum);
+
+      int read_stage = smem_pipe_read.index();
+
+      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma(_, _, k_block), tCrB(_, _, k_block, read_stage), intermediate);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+
+        warpgroup_wait<K_WAIT_MAX>();
+        if (k_block == K_BLOCK_MAX - 1) {
+          // release prior barrier
+          pipeline.consumer_release(smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+
+        if (k_block < K_BLOCK_MAX - 2) {
+          Utils::copy_tensors_MK(
+              smem_tiled_copy_A,
+              tCsA,
+              tCrA_copy_view,
+              partitioned_extra_info,
+              copy_partitions_extra_info,
+              k_block + 2,
+              read_stage);
+        }
+        if (k_block < K_BLOCK_MAX - 1) {
+          Utils::convert_A_kblock(tCrA_load, tCrA_mma, k_block + 1);
+        }
+
+        if ((k_block + 1) % NumMMAsPerChunk == 0) {
+          tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+          warpgroup_fence_operand(intermediate);
+
+          // Apply the group-wise scaling
+          auto tCrS = cute::get<1>(partitioned_extra_info);
+          for (int mma_m = 0; mma_m < size<1>(accum); mma_m++) {
+            for (int m = 0; m < size<0, 1>(accum); m++) {
+              for (int n = 0; n < size<0, 2>(accum); n++) {
+                for (int e = 0; e < size<0, 0>(accum); e++) {
+                  auto accum_coord = make_coord(make_tuple(e, m, n), mma_m, 0);
+                  auto scale_coord = make_coord(make_tuple(0, m, 0), mma_m, 0);
+                  int scale_idx = k_block / NumMMAsPerChunk;
+
+                  accum(accum_coord) = fma(
+                      intermediate(accum_coord), static_cast<float>(tCrS(scale_coord)[scale_idx]), accum(accum_coord));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Perform a Consumer Epilogue to release all buffers
+  CUTLASS_DEVICE void mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Prologue GMMAs
+    int prologue_mma_count = 1;
+    k_tile_count -= prologue_mma_count;
+
+    smem_pipe_release.advance(k_tile_count);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int count = 0; count < prologue_mma_count; ++count) {
+      pipeline.consumer_release(smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  //
+  // Methods to perform different parts of TMA/Tensormap modifications
+  //
+  CUTLASS_DEVICE auto tensormaps_init(
+      Params const& mainloop_params, TensorMapStorage& shared_tensormaps, int32_t sm_count, int32_t sm_idx) {
+    cute::TmaDescriptor* gmem_tensormap = reinterpret_cast<cute::TmaDescriptor*>(mainloop_params.tensormaps);
+
+    cute::TmaDescriptor* tma_desc_a = &gmem_tensormap[sm_idx];
+    cute::TmaDescriptor* tma_desc_b = &gmem_tensormap[sm_idx + sm_count];
+    cute::TmaDescriptor* tma_desc_scale = &gmem_tensormap[sm_idx + 2 * sm_count];
+    cute::TmaDescriptor* tma_desc_zero = &gmem_tensormap[sm_idx + 3 * sm_count];
+
+    // Bringing tensormaps from params to smem for modification later
+    Tensor pA_tensormap = make_tensor(mainloop_params.tma_load_a.get_tma_descriptor(), Int<1>{}, Int<1>{});
+    Tensor sA_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_A), Int<1>{}, Int<1>{});
+    Tensor pB_tensormap = make_tensor(mainloop_params.tma_load_b.get_tma_descriptor(), Int<1>{}, Int<1>{});
+    Tensor sB_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_B), Int<1>{}, Int<1>{});
+
+    if (cute::elect_one_sync()) {
+      copy(recast<uint128_t>(pA_tensormap), recast<uint128_t>(sA_tensormap));
+      copy(recast<uint128_t>(pB_tensormap), recast<uint128_t>(sB_tensormap));
+    }
+
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      Tensor pS_tensormap = make_tensor(mainloop_params.tma_load_scale.get_tma_descriptor(), Int<1>{}, Int<1>{});
+      Tensor sS_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_scale), Int<1>{}, Int<1>{});
+      if (cute::elect_one_sync()) {
+        copy(recast<uint128_t>(pS_tensormap), recast<uint128_t>(sS_tensormap));
+      }
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      Tensor pZ_tensormap = make_tensor(mainloop_params.tma_load_zero.get_tma_descriptor(), Int<1>{}, Int<1>{});
+      Tensor sZ_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_zero), Int<1>{}, Int<1>{});
+      if (cute::elect_one_sync()) {
+        copy(recast<uint128_t>(pZ_tensormap), recast<uint128_t>(sZ_tensormap));
+      }
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in tensormaps_init.");
+    }
+
+    __syncwarp();
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple(tma_desc_a, tma_desc_b);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      return cute::make_tuple(tma_desc_a, tma_desc_b, tma_desc_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      return cute::make_tuple(tma_desc_a, tma_desc_b, tma_desc_scale, tma_desc_zero);
+    } else {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in tensormaps_init.");
+    }
+  }
+
+  // Replace address for the global tensor (to be done by single thread)
+  CUTLASS_DEVICE
+  void tensormaps_replace_global_address(
+      TensorMapStorage& shared_tensormaps, Params const& mainloop_params, int32_t next_batch) {
+    // Replacing global_address for the next batch
+    cute::tma_descriptor_replace_addr_in_shared_mem(
+        shared_tensormaps.smem_tensormap_A, mainloop_params.ptr_A[next_batch]);
+    cute::tma_descriptor_replace_addr_in_shared_mem(
+        shared_tensormaps.smem_tensormap_B, mainloop_params.ptr_B[next_batch]);
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      cute::tma_descriptor_replace_addr_in_shared_mem(
+          shared_tensormaps.smem_tensormap_scale, mainloop_params.ptr_S[next_batch]);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      cute::tma_descriptor_replace_addr_in_shared_mem(
+          shared_tensormaps.smem_tensormap_zero, mainloop_params.ptr_Z[next_batch]);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>,
+          "Conversion mode not handled in tensormaps_replace_global_address.");
+    }
+  }
+
+  // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread)
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE void tensormaps_replace_global_tensor_properties(
+      TensorMapStorage& shared_tensormaps,
+      Params const& mainloop_params,
+      int32_t next_group,
+      ProblemShape_MNKL problem_shape_mnkl) {
+    const uint32_t M = get<0>(problem_shape_mnkl);
+    const uint32_t N = get<1>(problem_shape_mnkl);
+    const uint32_t K = get<2>(problem_shape_mnkl);
+
+    // Replace all dims for consistency
+    constexpr int MaxTensorRank = 5;
+    cute::array<uint32_t, MaxTensorRank> prob_shape_A = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_A = {0, 0, 0, 0, 0};
+    cute::array<uint32_t, MaxTensorRank> prob_shape_B = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_B = {0, 0, 0, 0, 0};
+    cute::array<uint32_t, MaxTensorRank> prob_shape_scale = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_scale = {0, 0, 0, 0, 0};
+    cute::array<uint32_t, MaxTensorRank> prob_shape_zero = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_zero = {0, 0, 0, 0, 0};
+
+    SwappedElementA const* ptr_A = nullptr;
+    Tensor tensor_a =
+        make_tensor(ptr_A, detail::get_gmem_layout(make_shape(M, K, Int<1>{}), mainloop_params.ptr_dA[next_group]));
+
+    SwappedElementB const* ptr_B = nullptr;
+    Tensor tensor_b =
+        make_tensor(ptr_B, detail::get_gmem_layout(make_shape(N, K, Int<1>{}), mainloop_params.ptr_dB[next_group]));
+
+    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, prob_shape_A, prob_stride_A);
+    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, prob_shape_B, prob_stride_B);
+
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      NonVoidElementScale const* ptr_S = nullptr;
+      // auto scale_k = K / mainloop_params.chunk_size;
+      auto scale_k = K / GROUP_SIZE;
+      Tensor tensor_scale =
+          make_tensor(detail::get_logical_ptr(ptr_S), make_shape(M, scale_k, Int<1>{}), mainloop_params.dS[next_group]);
+      cute::detail::fill_tma_gmem_shape_stride(
+          mainloop_params.tma_load_scale, tensor_scale, prob_shape_scale, prob_stride_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      ElementZero const* ptr_Z = nullptr;
+      // auto scale_k = K / mainloop_params.chunk_size;
+      auto scale_k = K / GROUP_SIZE;
+      Tensor tensor_zero =
+          make_tensor(detail::get_logical_ptr(ptr_Z), make_shape(M, scale_k, Int<1>{}), mainloop_params.dS[next_group]);
+      cute::detail::fill_tma_gmem_shape_stride(
+          mainloop_params.tma_load_zero, tensor_zero, prob_shape_zero, prob_stride_zero);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>,
+          "Conversion mode not handled in tensormaps_replace_global_tensor_properties.");
+    }
+
+    // Convert strides to byte strides
+    for (uint64_t& stride : prob_stride_A) {
+      stride = (stride * sizeof_bits_v<SwappedElementA>) / 8;
+    }
+    for (uint64_t& stride : prob_stride_B) {
+      stride = (stride * sizeof_bits_v<SwappedElementB>) / 8;
+    }
+    for (uint64_t& stride : prob_stride_scale) {
+      stride = (stride * sizeof_bits_v<NonVoidElementScale>) / 8;
+    }
+    for (uint64_t& stride : prob_stride_zero) {
+      stride = (stride * sizeof_bits_v<NonVoidElementScale>) / 8;
+    }
+
+    cute::tma_descriptor_replace_dims_strides_in_shared_mem(
+        shared_tensormaps.smem_tensormap_A, prob_shape_A, prob_stride_A);
+    cute::tma_descriptor_replace_dims_strides_in_shared_mem(
+        shared_tensormaps.smem_tensormap_B, prob_shape_B, prob_stride_B);
+
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      cute::tma_descriptor_replace_dims_strides_in_shared_mem(
+          shared_tensormaps.smem_tensormap_scale, prob_shape_scale, prob_stride_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      cute::tma_descriptor_replace_dims_strides_in_shared_mem(
+          shared_tensormaps.smem_tensormap_zero, prob_shape_zero, prob_stride_zero);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>,
+          "Conversion mode not handled in tensormaps_replace_global_tensor_properties.");
+    }
+  }
+
+  template <class... TMs, class ProblemShape_MNKL>
+  CUTLASS_DEVICE void tensormaps_perform_update(
+      TensorMapStorage& shared_tensormaps,
+      Params const& mainloop_params,
+      cute::tuple<TMs...> const& input_tensormaps,
+      ProblemShape_MNKL problem_shape_mnkl,
+      int32_t next_batch) {
+    if (cute::elect_one_sync()) {
+      // Replacing global_address for the next batch
+      tensormaps_replace_global_address(shared_tensormaps, mainloop_params, next_batch);
+
+      if constexpr (IsGroupedGemmKernel) {
+        // Replacing global dims and strides for the next batch
+        tensormaps_replace_global_tensor_properties(shared_tensormaps, mainloop_params, next_batch, problem_shape_mnkl);
+      }
+    }
+  }
+
+  template <class... TMs>
+  CUTLASS_DEVICE void
+  tensormaps_cp_fence_release(TensorMapStorage& shared_tensormaps, cute::tuple<TMs...> const& input_tensormaps) {
+    if (cute::elect_one_sync()) {
+      cute::tma_desc_commit_group();
+      cute::tma_desc_wait_group();
+    }
+    // Entire warp must do this (i.e. it's aligned)
+    tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
+    tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      tma_descriptor_cp_fence_release(get<2>(input_tensormaps), shared_tensormaps.smem_tensormap_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      tma_descriptor_cp_fence_release(get<3>(input_tensormaps), shared_tensormaps.smem_tensormap_zero);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>,
+          "Conversion mode not handled in tensormaps_cp_fence_release.");
+    }
+  }
+
+  // The entire warp must call this function collectively (that is, the instructions are aligned)
+  template <class... TMs>
+  CUTLASS_DEVICE void tensormaps_fence_acquire(cute::tuple<TMs...> const& input_tensormaps) {
+    cute::tma_descriptor_fence_acquire(get<0>(input_tensormaps));
+    cute::tma_descriptor_fence_acquire(get<1>(input_tensormaps));
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      cute::tma_descriptor_fence_acquire(get<2>(input_tensormaps));
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      cute::tma_descriptor_fence_acquire(get<3>(input_tensormaps));
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(
+          cutlass::detail::dependent_false<KernelSchedule>, "Conversion mode not handled in tensormaps_fence_acquire.");
+    }
+  }
+
+  template <class InputTensors, class ProblemShape_MNKL>
+  CUTLASS_DEVICE InputTensors tensors_perform_update(
+      InputTensors const& input_tensors,
+      [[maybe_unused]] Params const& mainloop_params,
+      [[maybe_unused]] ProblemShape_MNKL problem_shape_mnkl,
+      [[maybe_unused]] int32_t next_batch) {
+    return input_tensors;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/cutlass_gemm_caller.cuh b/sgl-kernel/csrc/cutlass_extensions/gemm/cutlass_gemm_caller.cuh
new file mode 100644
index 000000000..737916d89
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/cutlass_gemm_caller.cuh
@@ -0,0 +1,62 @@
+// Adapted from
+// https://github.com/vllm-project/vllm/blob/main/csrc/quantization/cutlass_w8a8/c3x/cutlass_gemm_caller.cuh
+
+#pragma once
+
+// clang-format will break include orders
+// clang-format off
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cutlass/cutlass.h"
+
+#include "cute/tensor.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/util/packed_stride.hpp"
+
+// clang-format on
+
+/**
+ * Helper function for checking CUTLASS errors
+ */
+#define CUTLASS_CHECK(status)                                                       \
+  {                                                                                 \
+    cutlass::Status error = status;                                                 \
+    TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \
+  }
+
+template <typename GemmKernel>
+void cutlass_gemm_caller(
+    torch::Device device,
+    cute::Shape<int, int, int, int> prob_shape,
+    typename GemmKernel::MainloopArguments mainloop_args,
+    typename GemmKernel::EpilogueArguments epilogue_args,
+    typename GemmKernel::TileSchedulerArguments scheduler = {}) {
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = c10::cuda::current_device();
+  hw_info.sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGemm, prob_shape, mainloop_args, epilogue_args, hw_info, scheduler};
+
+  // Launch the CUTLASS GEMM kernel.
+  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  GemmOp gemm_op;
+  CUTLASS_CHECK(gemm_op.can_implement(args));
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(device);
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  auto stream = at::cuda::getCurrentCUDAStream(device.index());
+
+  cutlass::Status status = gemm_op.run(args, workspace.data_ptr(), stream);
+  CUTLASS_CHECK(status);
+}
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/dispatch_policy.hpp b/sgl-kernel/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
new file mode 100644
index 000000000..6019e4a53
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
@@ -0,0 +1,38 @@
+// Adapted from https://github.com/vllm-project/vllm/blob/main/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+
+#pragma once
+
+#include "cutlass/gemm/dispatch_policy.hpp"
+
+namespace cutlass::gemm {
+
+//////////////////////////////////////////////////////////////////////////////
+
+// FP8 related policies (including Blocked Scaled Accumulation)
+//  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
+//  `ScaleGranularityM` indicates that scaling granularity is
+//  `size<0>(TileShape_MNK{})` along M.
+template <int ScaleGranularityM = 0>
+struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum : KernelTmaWarpSpecializedCooperative {};
+
+// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
+// specialized dynamic schedule For FP8 kernels with Block Scaling
+template <
+    int Stages_,
+    class ClusterShape_ = Shape<_1, _1, _1>,
+    class KernelSchedule = KernelTmaWarpSpecialized,
+    int ScaleGranularityM = 0  // `ScaleGranularityM` specifies scaling granularity along M,
+                               // while zero-value `ScaleGranularityM` indicates that scaling
+                               // granularity is `size<0>(TileShape_MNK{})` along M.
+    >
+struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
+    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_, KernelSchedule> {
+  static_assert(
+      cute::
+          is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>,
+      "KernelSchedule must be one of the warp specialized policies");
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/fp8_blockwise_gemm_sm90_dispatch.cuh b/sgl-kernel/csrc/cutlass_extensions/gemm/fp8_blockwise_gemm_sm90_dispatch.cuh
new file mode 100644
index 000000000..8deda43e5
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/fp8_blockwise_gemm_sm90_dispatch.cuh
@@ -0,0 +1,197 @@
+// Adapted from
+// https://github.com/vllm-project/vllm/blob/main/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh
+#pragma once
+
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass_extensions/common.hpp"
+#include "cutlass_extensions/gemm/cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/gemm/dispatch_policy.hpp"
+
+using namespace cute;
+
+template <
+    typename SchedulerType,
+    typename OutType,
+    int GroupSizeM_,
+    int GroupSizeN_,
+    int GroupSizeK_,
+    int TileSizeM_ = 128,
+    class ClusterShape = Shape<_1, _2, _1>>
+struct cutlass_3x_gemm_fp8_blockwise {
+  using GroupSizeM = Int<GroupSizeM_>;
+  using GroupSizeN = Int<GroupSizeN_>;
+  using GroupSizeK = Int<GroupSizeK_>;
+  using TileSizeM = Int<TileSizeM_>;
+
+  static_assert(TileSizeM_ % GroupSizeM_ == 0, "TileSizeM must be a multiple of GroupSizeM");
+
+  using ElementAB = cutlass::float_e4m3_t;
+
+  // A matrix configuration
+  using ElementA = ElementAB;
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+
+  // B matrix configuration
+  using ElementB = ElementAB;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+
+  // C/D matrix configuration
+  using ElementC = void;
+  using LayoutC = cutlass::layout::RowMajor;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<OutType>::value;
+
+  using ElementD = OutType;
+  using LayoutD = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = AlignmentC;
+
+  using ScaleTileShape = Shape<_1, _128, _128>;
+  using ScaleConfig = decltype(cutlass::detail::sm90_trivial_blockwise_scale_config(ScaleTileShape{}));
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
+  // Multiply-accumulate blocking/pipelining details
+  using ElementAccumulator = float;                            // Element type for internal accumulation
+  using ElementCompute = float;                                // Element type for compute
+  using TileShape = Shape<TileSizeM, GroupSizeN, GroupSizeK>;  // Threadblock-level tile size
+
+  using ArchTag = cutlass::arch::Sm90;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
+  using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
+  using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT<cutlass::epilogue::fusion::Sm90AccFetch>;
+
+  using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      TileShape,
+      ClusterShape,
+      EpilogueTileType,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementD,
+      LayoutD,
+      AlignmentD,
+      EpilogueSchedule,
+      StoreEpilogueCompute>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,  // Indicates ProblemShape
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      SchedulerType>;
+};
+
+template <typename Gemm>
+void cutlass_gemm_caller_blockwise(
+    torch::Tensor& out,
+    torch::Tensor const& a,
+    torch::Tensor const& b,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales) {
+  using GemmKernel = typename Gemm::GemmKernel;
+  using ElementAB = typename Gemm::ElementAB;
+  using ElementA = ElementAB;
+  using ElementB = ElementAB;
+  using ElementD = typename Gemm::ElementD;
+  using ElementBlockScale = float;
+
+  using ScaleTileShape = Shape<_1, _128, _128>;
+  using ScaleConfig = decltype(cutlass::detail::sm90_trivial_blockwise_scale_config(ScaleTileShape{}));
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
+  int m = a.size(0);
+  int k = a.size(1);
+  int n = b.size(1);
+
+  auto a_ptr = static_cast<ElementA*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementB*>(b.data_ptr());
+
+  auto a_s_ptr = static_cast<ElementBlockScale*>(a_scales.data_ptr());
+  auto b_s_ptr = static_cast<ElementBlockScale*>(b_scales.data_ptr());
+
+  using StrideA = typename GemmKernel::StrideA;
+  using StrideB = typename GemmKernel::StrideB;
+  using StrideD = typename GemmKernel::StrideD;
+  using StrideC = typename GemmKernel::StrideC;
+
+  StrideA a_stride = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+  LayoutSFA layout_sfa = ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_sfb = ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+
+  typename GemmKernel::MainloopArguments mainloop_args{
+      a_ptr, a_stride, b_ptr, b_stride, a_s_ptr, layout_sfa, b_s_ptr, layout_sfb};
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+  typename GemmKernel::EpilogueArguments epilogue_args{{}, c_ptr, c_stride, c_ptr, c_stride};
+
+  typename GemmKernel::TileSchedulerArguments scheduler;
+
+  static constexpr bool UsesStreamKScheduler =
+      cute::is_same_v<typename GemmKernel::TileSchedulerTag, cutlass::gemm::StreamKScheduler>;
+
+  if constexpr (UsesStreamKScheduler) {
+    using DecompositionMode =
+        typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90StreamKParams::DecompositionMode;
+    using ReductionMode =
+        typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90StreamKParams::ReductionMode;
+
+    scheduler.decomposition_mode = DecompositionMode::StreamK;
+    scheduler.reduction_mode = ReductionMode::Nondeterministic;
+  }
+
+  cutlass_gemm_caller<GemmKernel>(a.device(), {m, n, k, 1}, mainloop_args, epilogue_args, scheduler);
+}
+
+template <typename OutType>
+void cutlass_gemm_blockwise_sm90_fp8_dispatch(
+    torch::Tensor& out,
+    torch::Tensor const& a,
+    torch::Tensor const& b,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales) {
+  auto k = a.size(1);
+  auto n = b.size(1);
+
+  if (k > 3 * n) {
+    cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<cutlass::gemm::StreamKScheduler, OutType, 1, 128, 128>>(
+        out, a, b, a_scales, b_scales);
+  } else {
+    cutlass_gemm_caller_blockwise<
+        cutlass_3x_gemm_fp8_blockwise<cutlass::gemm::PersistentScheduler, OutType, 1, 128, 128>>(
+        out, a, b, a_scales, b_scales);
+  }
+}
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_universal_base_compat.h b/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_universal_base_compat.h
new file mode 100644
index 000000000..b58d84318
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_universal_base_compat.h
@@ -0,0 +1,356 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/be1788106245496872d18e702978e59b6bfd50e0/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/device/gemm_universal_base_compat.h
+#pragma once
+
+#include <cutlass/cutlass.h>
+#include <cutlass/device_kernel.h>
+#include <cutlass/trace.h>
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+    This is the device layer from CUTLASS 2.10 (SHA - cc85b64cf676c45f98a17e3a47c0aafcf817f088)
+    It is replicated here since we needed to duplicate kernel level APIs for mixed dtype GEMMs
+    and SmoothQuant. The newer device layer is not compatible with these older kernel level APIs.
+
+    Note: While CUTLASS 3.x supports stream-k, none of the kernels in the extensions folder support
+          that feature at the moment.
+  */
+
+template <typename GemmKernel_>
+class GemmUniversalBaseCompat {
+ public:
+  using GemmKernel = GemmKernel_;
+  using ThreadblockShape = typename GemmKernel::Mma::Shape;
+
+  using ElementA = typename GemmKernel::ElementA;
+  using LayoutA = typename GemmKernel::LayoutA;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  static ComplexTransform const kTransformA = GemmKernel::kTransformA;
+
+  using ElementB = typename GemmKernel::ElementB;
+  using LayoutB = typename GemmKernel::LayoutB;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  static ComplexTransform const kTransformB = GemmKernel::kTransformB;
+
+  using ElementC = typename GemmKernel::ElementC;
+  using LayoutC = typename GemmKernel::LayoutC;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+
+  using ElementAccumulator = typename GemmKernel::Mma::Policy::Operator::ElementC;
+
+  using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp;
+  using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle;
+  using Operator = typename GemmKernel::Operator;
+
+  /// Argument structure
+  using Arguments = typename GemmKernel::Arguments;
+
+ protected:
+  /// Kernel parameters object
+  typename GemmKernel::Params params_;
+
+ protected:
+  /// Private helper to obtain the grid dimensions with fix-up for split-K
+  static void get_grid_shape_(gemm::GemmCoord& grid_tiled_shape, int& gemm_k_size, Arguments const& args) {
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count);
+
+    gemm_k_size = args.problem_size.k();
+
+    if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      int const kAlignK =
+          const_max(const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value), 1);
+
+      gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK);
+
+      if (gemm_k_size) {
+        grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size);
+      }
+    }
+  }
+
+ public:
+  /// Constructs the GEMM.
+  GemmUniversalBaseCompat() {}
+
+  /// Determines whether the GEMM can execute the given problem.
+  static Status can_implement(Arguments const& args) {
+    // Determine grid shape
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+
+    ThreadblockSwizzle threadblock_swizzle;
+    dim3 grid = threadblock_swizzle.get_grid_shape(grid_tiled_shape);
+
+    uint32_t const kGridYZMax = ((1 << (sizeof(uint16_t) * 8)) - 1);
+
+    if (!(grid.y <= kGridYZMax && grid.z <= kGridYZMax)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return GemmKernel::can_implement(args);
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const& args) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_workspace_size()");
+
+    size_t workspace_bytes = 0;
+
+    // Determine grid shape
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+
+    if (args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      // Split-K parallel always requires a temporary workspace
+      workspace_bytes = sizeof(ElementC) * size_t(args.batch_stride_D) * size_t(grid_tiled_shape.k());
+    } else if (args.mode == GemmUniversalMode::kGemm && grid_tiled_shape.k() > 1) {
+      // Serial split-K only requires a temporary workspace if the number of partitions along the
+      // GEMM K dimension is greater than one.
+      workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
+    }
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    workspace_bytes += GemmKernel::get_extra_workspace_size(args, grid_tiled_shape);
+
+    return workspace_bytes;
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const& args) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_grid_shape()");
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+    dim3 result = threadblock_swizzle.get_grid_shape(grid_tiled_shape);
+
+    CUTLASS_TRACE_HOST(
+        "  grid_tiled_shape: " << grid_tiled_shape << "\n"
+                               << "  result = {" << result << "}");
+
+    return result;
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::maximum_active_blocks()");
+
+    int max_active_blocks = -1;
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    CUTLASS_TRACE_HOST("  smem_size: " << smem_size << " bytes");
+
+    if (smem_size <= (48 << 10)) {
+      cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &max_active_blocks, Kernel<GemmKernel>, GemmKernel::kThreadCount, smem_size);
+
+      if (result == cudaSuccess) {
+        CUTLASS_TRACE_HOST("  max_active_blocks: " << max_active_blocks);
+        return max_active_blocks;
+      }
+    } else {
+      // Query assuming zero shared memory then compute occupancy limit based on SMEM
+      cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &max_active_blocks, Kernel<GemmKernel>, GemmKernel::kThreadCount, 0);
+
+      if (result != cudaSuccess) {
+        CUTLASS_TRACE_HOST(
+            "  cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error " << cudaGetErrorString(result));
+
+        return -1;
+      }
+
+      if (smem_capacity < 0) {
+        int device_idx = 0;
+        result = cudaGetDevice(&device_idx);
+
+        if (result != cudaSuccess) {
+          return -1;
+        }
+
+        cudaDeviceProp properties;
+        result = cudaGetDeviceProperties(&properties, device_idx);
+
+        if (result != cudaSuccess) {
+          return -1;
+        }
+
+        smem_capacity = static_cast<int>(properties.sharedMemPerMultiprocessor);
+      }
+
+      int occupancy = std::min(max_active_blocks, smem_capacity / smem_size);
+
+      CUTLASS_TRACE_HOST("  occupancy: " << occupancy);
+
+      return occupancy;
+    }
+
+    CUTLASS_TRACE_HOST("  returning internal error");
+
+    return -1;
+  }
+
+  /// Initializes GEMM state from arguments.
+  Status initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST(
+        "GemmUniversalBaseCompat::initialize() - workspace " << workspace
+                                                             << ", stream: " << (stream ? "non-null" : "null"));
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    CUTLASS_TRACE_HOST("  workspace_bytes: " << workspace_bytes);
+
+    if (workspace_bytes) {
+      if (!workspace) {
+        CUTLASS_TRACE_HOST("  error: device workspace must not be null");
+
+        return Status::kErrorWorkspaceNull;
+      }
+
+      if (args.mode == GemmUniversalMode::kGemm) {
+        CUTLASS_TRACE_HOST("  clearing device workspace");
+        cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_bytes, stream);
+
+        if (result != cudaSuccess) {
+          CUTLASS_TRACE_HOST("  cudaMemsetAsync() returned error " << cudaGetErrorString(result));
+
+          return Status::kErrorInternal;
+        }
+      }
+    }
+
+    // Get CUDA grid shape
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int gemm_k_size = 0;
+
+    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
+
+    // Initialize the Params structure
+    params_ = typename GemmKernel::Params(args, grid_tiled_shape, gemm_k_size, static_cast<int*>(workspace));
+
+    // Specify shared memory capacity for kernel.
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result =
+          cudaFuncSetAttribute(Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const& args, void* workspace = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat()::update() - workspace: " << workspace);
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::run()");
+
+    //
+    // Configure grid and block dimensions
+    //
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(GemmKernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    //
+    // Launch kernel
+    //
+
+    CUTLASS_TRACE_HOST("  grid: (" << grid << "),  block: (" << block << "),  SMEM: " << smem_size << " bytes");
+
+    // Launch
+    cutlass::Kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    //
+    // Query for errors
+    //
+    cudaError_t result = cudaGetLastError();
+
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST("  grid launch failed with error " << cudaGetErrorString(result));
+      return Status::kErrorInternal;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) {
+    Status status = initialize(args, workspace, stream);
+
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace device
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_with_epilogue_visitor.h b/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_with_epilogue_visitor.h
new file mode 100644
index 000000000..905d11ba2
--- /dev/null
+++ b/sgl-kernel/csrc/cutlass_extensions/gemm/gemm_with_epilogue_visitor.h
@@ -0,0 +1,492 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/be1788106245496872d18e702978e59b6bfd50e0/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h
+
+#pragma once
+
+#include <cutlass/complex.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/fast_math.h>
+#include <cutlass/matrix_coord.h>
+#include <cutlass/trace.h>
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename Mma_,                ///! Threadblock-scoped matrix multiply-accumulate
+    typename Epilogue_,           ///! Epilogue
+    typename ThreadblockSwizzle_  ///! Threadblock swizzling function
+    >
+struct GemmWithEpilogueVisitor {
+ public:
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueVisitor = typename Epilogue::Visitor;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using TensorRefA = TensorRef<ElementA, LayoutA>;
+
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using TensorRefB = TensorRef<ElementB, LayoutB>;
+
+  using ElementCompute = typename EpilogueVisitor::ElementCompute;
+  using LayoutAlphaCol = cutlass::layout::RowMajor;
+  using LayoutAlphaRow = cutlass::layout::ColumnMajor;
+  using TensorRefAlphaCol = TensorRef<ElementCompute, LayoutAlphaCol>;
+  using TensorRefAlphaRow = TensorRef<ElementCompute, LayoutAlphaRow>;
+
+  using ElementC = typename EpilogueVisitor::ElementOutput;
+  using LayoutC = typename Epilogue::Layout;
+  using TensorRefC = TensorRef<ElementC, LayoutC>;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+  using EpilogueOutputOp =
+      typename Epilogue::Visitor::ElementwiseFunctor;  // Define type so GemmUniversalBase doesn't complain
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode;
+    GemmCoord problem_size;
+    int batch_count;
+
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefAlphaCol ref_alpha_col;
+    TensorRefAlphaRow ref_alpha_row;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_D;
+
+    typename EpilogueVisitor::Arguments epilogue_visitor;
+
+    //
+    // Methods
+    //
+
+    Arguments() : mode(GemmUniversalMode::kGemm), batch_count(1) {}
+
+    /// constructs an arguments structure
+    Arguments(
+        GemmCoord problem_size_,
+        TensorRefA ref_A_,
+        TensorRefB ref_B_,
+        TensorRefAlphaCol ref_alpha_col_,
+        TensorRefAlphaRow ref_alpha_row_,
+        TensorRefC ref_C_,
+        TensorRefC ref_D_,
+        typename EpilogueVisitor::Arguments epilogue_visitor_)
+        : mode(GemmUniversalMode::kGemm),
+          problem_size(problem_size_),
+          batch_count(1),
+          ref_A(ref_A_),
+          ref_B(ref_B_),
+          ref_alpha_col(ref_alpha_col_),
+          ref_alpha_row(ref_alpha_row_),
+          ref_C(ref_C_),
+          ref_D(ref_D_),
+          batch_stride_A(0),
+          batch_stride_B(0),
+          batch_stride_D(0),
+          epilogue_visitor(epilogue_visitor_) {}
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_col;
+    typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_row;
+    typename EpilogueVisitor::OutputTileIterator::Params params_C;
+    typename EpilogueVisitor::OutputTileIterator::Params params_D;
+
+    GemmUniversalMode mode;
+    int batch_count;
+    int gemm_k_size;
+
+    void* ptr_A;
+    void* ptr_B;
+    typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_col;
+    typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_row;
+    ElementC* ptr_C;
+    ElementC* ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+
+    typename EpilogueVisitor::Params epilogue_visitor;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params()
+        : swizzle_log_tile(0),
+          params_A(0),
+          params_B(0),
+          params_alpha_col(0),
+          params_C(0),
+          params_D(0),
+          batch_count(0),
+          gemm_k_size(0),
+          mode(cutlass::gemm::GemmUniversalMode::kGemm),
+          ptr_A(nullptr),
+          ptr_B(nullptr),
+          ptr_alpha_col(nullptr),
+          ptr_alpha_row(nullptr),
+          ptr_C(nullptr),
+          ptr_D(nullptr),
+          batch_stride_A(0),
+          batch_stride_B(0) {}
+
+    Params(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape_, int gemm_k_size_, int* workspace_)
+        : problem_size(args.problem_size),
+          swizzle_log_tile(0),
+          params_A(args.ref_A.layout()),
+          params_B(args.ref_B.layout()),
+          params_alpha_col(args.ref_alpha_col.layout()),
+          params_alpha_row(args.ref_alpha_col.layout()),
+          params_C(args.ref_C.layout()),
+          params_D(args.ref_D.layout()),
+          mode(args.mode),
+          batch_count(args.batch_count),
+          gemm_k_size(args.problem_size.k()),
+          ptr_A(args.ref_A.data()),
+          ptr_B(args.ref_B.data()),
+          ptr_alpha_col(args.ref_alpha_col.data()),
+          ptr_alpha_row(args.ref_alpha_row.data()),
+          ptr_C(args.ref_C.data()),
+          ptr_D(args.ref_D.data()),
+          batch_stride_A(args.batch_stride_A),
+          batch_stride_B(args.batch_stride_B),
+          epilogue_visitor(args.epilogue_visitor) {
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+          args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count);
+
+      if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+        int const kAlignK =
+            const_max(const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value), 1);
+
+        gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK);
+
+        if (gemm_k_size) {
+          grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size);
+        }
+      }
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+
+    struct {
+      typename Epilogue::SharedStorage epilogue;
+      typename EpilogueVisitor::SharedStorage visitor;
+    } epilogue;
+  };
+
+ public:
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  GemmWithEpilogueVisitor() {}
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) {
+    CUTLASS_TRACE_HOST("GemmWithEpilogueVisitor::can_implement()");
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = EpilogueVisitor::OutputTileIterator::kElementsPerAccess;
+
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (
+        platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value ||
+        platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
+
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (
+        platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value ||
+        platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (
+        platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value ||
+        platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const& args) {
+    return can_implement(args.problem_size);
+  }
+
+  static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) {
+    return 0;
+  }
+
+#define SPLIT_K_ENABLED 1
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void run_kernel_(Params const& params, SharedStorage& shared_storage) {
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA* ptr_A = static_cast<ElementA*>(params.ptr_A);
+    ElementB* ptr_B = static_cast<ElementB*>(params.ptr_B);
+
+#if SPLIT_K_ENABLED
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    } else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    } else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA* const*>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB* const*>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+#endif
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+        threadblock_tile_offset.m() * Mma::Shape::kM,
+        offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{offset_k, threadblock_tile_offset.n() * Mma::Shape::kN};
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+        params.params_A, ptr_A, {params.problem_size.m(), problem_size_k}, thread_idx, tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+        params.params_B, ptr_B, {problem_size_k, params.problem_size.n()}, thread_idx, tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // assume identity swizzle
+    MatrixCoord threadblock_offset(
+        threadblock_tile_offset.m() * Mma::Shape::kM, threadblock_tile_offset.n() * Mma::Shape::kN);
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    //
+    // Construct the epilogue visitor
+    //
+
+    bool with_bias = true;
+    if (params.ptr_C == nullptr) {
+      with_bias = false;
+    }
+
+    EpilogueVisitor epilogue_visitor(
+        params.epilogue_visitor,
+        shared_storage.epilogue.visitor,
+        params.problem_size.mn(),
+        thread_idx,
+        warp_idx,
+        lane_idx,
+        params.params_alpha_col,
+        params.params_C,
+        params.params_D,
+        with_bias,
+        true,
+        true,
+        params.ptr_alpha_row,
+        params.ptr_alpha_col,
+        params.ptr_C,
+        params.ptr_D,
+        threadblock_offset,
+        blockIdx.y * params.problem_size.m());
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+      // Indicate which position in a serial reduction the output operator is currently updating
+      epilogue_visitor.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    } else if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) {
+      epilogue_visitor.set_batch_index(threadblock_tile_offset.k());
+    }
+
+    // Construct the epilogue
+    Epilogue epilogue(shared_storage.epilogue.epilogue, thread_idx, warp_idx, lane_idx);
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(epilogue_visitor, accumulators);
+  }
+
+  template <typename CompilationArch>
+  CUTLASS_DEVICE void run_kernel(Params const& params, SharedStorage& shared_storage) {
+    if constexpr (platform::is_same<ArchTag, CompilationArch>::value) {
+      run_kernel_(params, shared_storage);
+    } else {
+      CUTLASS_NOT_IMPLEMENTED();
+    }
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const& params, SharedStorage& shared_storage) {
+    run_kernel<ArchTag>(params, shared_storage);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/sgl-kernel/csrc/elementwise/activation.cu b/sgl-kernel/csrc/elementwise/activation.cu
new file mode 100644
index 000000000..43617f87f
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/activation.cu
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#ifndef USE_ROCM
+
+#include <flashinfer/activation.cuh>
+
+#include "utils.h"
+
+#else
+#include "hip/hip_act_and_mul.cuh"
+#endif
+
+// Adapted from flashinfer activation
+// https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/csrc/activation.cu#L44
+
+namespace detail {
+
+template <typename T>
+__device__ __forceinline__ float to_f32(const T& x) {
+#if USE_ROCM
+  return castToFloat(x);
+#else
+  return static_cast<float>(x);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T from_f32(float f32) {
+#if USE_ROCM
+  return castFromFloat<T>(f32);
+#else
+  return static_cast<T>(f32);
+#endif
+}
+
+}  // namespace detail
+
+template <typename T>
+__device__ __forceinline__ T silu(const T& x) {
+  float f32_val = detail::to_f32(x);
+  return detail::from_f32<T>(f32_val / (1.0f + expf(-f32_val)));
+}
+
+template <typename T>
+__device__ __forceinline__ T gelu(const T& x) {
+  constexpr float kAlpha = M_SQRT1_2;
+  float f32_val = detail::to_f32(x);
+  return detail::from_f32<T>(f32_val * (0.5f * (1.0f + erf(f32_val * kAlpha))));
+}
+
+// gelu_quick(x) = x * torch.sigmoid(1.702 * x)
+template <typename T>
+__device__ __forceinline__ T gelu_quick_act(const T& x) {
+  float f32_val = detail::to_f32(x);
+  return detail::from_f32<T>(f32_val / (1.0f + expf(-f32_val * 1.702f)));
+}
+
+template <typename T>
+__device__ __forceinline__ T gelu_tanh(const T& x) {
+  constexpr float kAlpha = 0.044715f;
+  constexpr float kBeta = 0.7978845608028654f;
+  float f32_val = detail::to_f32(x);
+  const float cdf = 0.5f * (1.0f + tanhf((kBeta * (f32_val + kAlpha * f32_val * f32_val * f32_val))));
+  return detail::from_f32<T>(f32_val * cdf);
+}
+
+void silu_and_mul(at::Tensor& out, at::Tensor& input) {
+  int d = input.size(-1) / 2;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(std::min(d / vec_size, 1024U));
+#if USE_ROCM
+    sgl_hip::activation::act_and_mul_kernel<c_type, silu>
+        <<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#else
+    flashinfer::activation::act_and_mul_kernel<c_type, silu>
+        <<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#endif
+    return true;
+  });
+}
+
+void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input) {
+  int d = input.size(-1) / 2;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(std::min(d / vec_size, 1024U));
+#if USE_ROCM
+    sgl_hip::activation::act_and_mul_kernel<c_type, gelu_tanh>
+        <<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#else
+    flashinfer::activation::act_and_mul_kernel<c_type, gelu_tanh>
+        <<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#endif
+    return true;
+  });
+}
+
+void gelu_and_mul(at::Tensor& out, at::Tensor& input) {
+  int d = input.size(-1) / 2;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(std::min(d / vec_size, 1024U));
+#if USE_ROCM
+    sgl_hip::activation::act_and_mul_kernel<c_type, gelu>
+        <<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#else
+    flashinfer::activation::act_and_mul_kernel<c_type, gelu>
+        <<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#endif
+
+    return true;
+  });
+}
+
+#if USE_ROCM
+void gelu_quick(at::Tensor& out, const at::Tensor& input) {
+  int d = input.size(-1);
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(std::min(d / vec_size, 1024U));
+    sgl_hip::activation::act_only_kernel<c_type, gelu_quick_act>
+        <<<grid, block, 0, stream>>>(static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+
+    return true;
+  });
+}
+#endif
diff --git a/sgl-kernel/csrc/elementwise/activation.hip b/sgl-kernel/csrc/elementwise/activation.hip
new file mode 100644
index 000000000..6ea8ca965
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/activation.hip
@@ -0,0 +1,173 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/hip/HIPContext.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+#include <torch/all.h>
+
+#ifndef USE_ROCM
+
+#include <flashinfer/activation.cuh>
+
+#include "utils_hip.h"
+
+#else
+#include "hip/hip_act_and_mul_hip.cuh"
+#endif
+
+// Adapted from flashinfer activation
+// https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/csrc/activation.cu#L44
+
+namespace detail {
+
+template <typename T>
+__device__ __forceinline__ float to_f32(const T& x) {
+#if USE_ROCM
+  return castToFloat(x);
+#else
+  return static_cast<float>(x);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T from_f32(float f32) {
+#if USE_ROCM
+  return castFromFloat<T>(f32);
+#else
+  return static_cast<T>(f32);
+#endif
+}
+
+}  // namespace detail
+
+template <typename T>
+__device__ __forceinline__ T silu(const T& x) {
+  float f32_val = detail::to_f32(x);
+  return detail::from_f32<T>(f32_val / (1.0f + expf(-f32_val)));
+}
+
+template <typename T>
+__device__ __forceinline__ T gelu(const T& x) {
+  constexpr float kAlpha = M_SQRT1_2;
+  float f32_val = detail::to_f32(x);
+  return detail::from_f32<T>(f32_val * (0.5f * (1.0f + erf(f32_val * kAlpha))));
+}
+
+// gelu_quick(x) = x * torch.sigmoid(1.702 * x)
+template <typename T>
+__device__ __forceinline__ T gelu_quick_act(const T& x) {
+  float f32_val = detail::to_f32(x);
+  return detail::from_f32<T>(f32_val / (1.0f + expf(-f32_val * 1.702f)));
+}
+
+template <typename T>
+__device__ __forceinline__ T gelu_tanh(const T& x) {
+  constexpr float kAlpha = 0.044715f;
+  constexpr float kBeta = 0.7978845608028654f;
+  float f32_val = detail::to_f32(x);
+  const float cdf = 0.5f * (1.0f + tanhf((kBeta * (f32_val + kAlpha * f32_val * f32_val * f32_val))));
+  return detail::from_f32<T>(f32_val * cdf);
+}
+
+void silu_and_mul(at::Tensor& out, at::Tensor& input) {
+  int d = input.size(-1) / 2;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(::min(d / vec_size, 1024U));
+#if USE_ROCM
+   hipLaunchKernelGGL(( sgl_hip::activation::act_and_mul_kernel<c_type, silu>)
+        , dim3(grid), dim3(block), 0, stream, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#else
+   hipLaunchKernelGGL(( flashinfer::activation::act_and_mul_kernel<c_type, silu>)
+        , dim3(grid), dim3(block), 0, stream, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#endif
+    return true;
+  });
+}
+
+void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input) {
+  int d = input.size(-1) / 2;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(::min(d / vec_size, 1024U));
+#if USE_ROCM
+   hipLaunchKernelGGL(( sgl_hip::activation::act_and_mul_kernel<c_type, gelu_tanh>)
+        , dim3(grid), dim3(block), 0, stream, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#else
+   hipLaunchKernelGGL(( flashinfer::activation::act_and_mul_kernel<c_type, gelu_tanh>)
+        , dim3(grid), dim3(block), 0, stream, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#endif
+    return true;
+  });
+}
+
+void gelu_and_mul(at::Tensor& out, at::Tensor& input) {
+  int d = input.size(-1) / 2;
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(::min(d / vec_size, 1024U));
+#if USE_ROCM
+   hipLaunchKernelGGL(( sgl_hip::activation::act_and_mul_kernel<c_type, gelu>)
+        , dim3(grid), dim3(block), 0, stream, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#else
+   hipLaunchKernelGGL(( flashinfer::activation::act_and_mul_kernel<c_type, gelu>)
+        , dim3(grid), dim3(block), 0, stream, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+#endif
+
+    return true;
+  });
+}
+
+#if USE_ROCM
+void gelu_quick(at::Tensor& out, const at::Tensor& input) {
+  int d = input.size(-1);
+  int64_t num_tokens = input.numel() / input.size(-1);
+  dim3 grid(num_tokens);
+
+  const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(input));
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    uint32_t vec_size = 16 / sizeof(c_type);
+    dim3 block(::min(d / vec_size, 1024U));
+   hipLaunchKernelGGL(( sgl_hip::activation::act_only_kernel<c_type, gelu_quick_act>)
+        , dim3(grid), dim3(block), 0, stream, static_cast<c_type*>(out.data_ptr()), static_cast<c_type*>(input.data_ptr()), d);
+
+    return true;
+  });
+}
+#endif
diff --git a/sgl-kernel/csrc/elementwise/cast.cu b/sgl-kernel/csrc/elementwise/cast.cu
new file mode 100644
index 000000000..a1ff8703f
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/cast.cu
@@ -0,0 +1,171 @@
+#include "pytorch_extension_utils.h"
+
+template <typename T>
+struct ConvertToFP8 {
+  static __device__ __nv_fp8_storage_t convert_to_fp8(T value) {
+    return 0;
+  }
+};
+
+template <>
+struct ConvertToFP8<__nv_bfloat16> {
+  static __device__ __nv_fp8_storage_t convert_to_fp8(__nv_bfloat16 value) {
+    return __nv_cvt_bfloat16raw_to_fp8(value, __NV_SATFINITE, __NV_E4M3);
+  }
+};
+
+template <>
+struct ConvertToFP8<half> {
+  static __device__ __nv_fp8_storage_t convert_to_fp8(half value) {
+    return __nv_cvt_halfraw_to_fp8(value, __NV_SATFINITE, __NV_E4M3);
+  }
+};
+
+template <typename T>
+struct ConvertFromFloat {
+  static __device__ T convert_from_float(float value) {
+    return 0;
+  }
+};
+
+template <>
+struct ConvertFromFloat<__nv_bfloat16> {
+  static __device__ __nv_bfloat16 convert_from_float(float value) {
+    return __float2bfloat16(value);
+  }
+};
+
+template <>
+struct ConvertFromFloat<half> {
+  static __device__ half convert_from_float(float value) {
+    return __float2half(value);
+  }
+};
+
+template <typename T>
+__global__ void fused_downcast_kernel(
+    const T* cache_k,
+    const T* cache_v,
+    const float* k_scale,
+    const float* v_scale,
+    __nv_fp8_storage_t* output_k,
+    __nv_fp8_storage_t* output_v,
+    const int input_sl,
+    const int head,
+    const int dim,
+    const T max_fp8,
+    const T min_fp8,
+    const int64_t mult,
+    const int64_t offset,
+    const int64_t* loc) {
+  // TODO: change name
+  int token_idx = blockIdx.x;
+  int thread_idx = threadIdx.x;
+  int total_threads = blockDim.x;
+
+  T k_scale_val = ConvertFromFloat<T>::convert_from_float(k_scale[0]);
+  T v_scale_val = ConvertFromFloat<T>::convert_from_float(v_scale[0]);
+
+  T k_scale_inv = static_cast<T>(1.f) / k_scale_val;
+  T v_scale_inv = static_cast<T>(1.f) / v_scale_val;
+
+  auto clamp = [&](T val) { return val > max_fp8 ? max_fp8 : (min_fp8 > val ? min_fp8 : val); };
+
+  if (token_idx < input_sl) {
+    int out_seq_idx = loc[token_idx];
+
+#pragma unroll
+    for (int i = thread_idx; i < head * dim; i += total_threads) {
+      int in_idx = token_idx * head * dim + i;
+      int out_idx = (out_seq_idx * mult + offset) * head * dim + i;
+
+      T k_val = cache_k[in_idx] * k_scale_inv;
+      k_val = clamp(k_val);
+      output_k[out_idx] = ConvertToFP8<T>::convert_to_fp8(k_val);
+
+      T v_val = cache_v[in_idx] * v_scale_inv;
+      v_val = clamp(v_val);
+      output_v[out_idx] = ConvertToFP8<T>::convert_to_fp8(v_val);
+    }
+  }
+}
+
+template <typename T>
+void downcast_fp8_impl(
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& k_out,
+    at::Tensor& v_out,
+    at::Tensor& k_scale,
+    at::Tensor& v_scale,
+    at::Tensor& loc,
+    int64_t mult,
+    int64_t offset,
+    cudaStream_t stream) {
+  CHECK_INPUT(k);
+  CHECK_INPUT(v);
+  CHECK_INPUT(k_out);
+  CHECK_INPUT(v_out);
+  CHECK_INPUT(k_scale);
+  CHECK_INPUT(v_scale);
+  CHECK_INPUT(loc);
+
+  int64_t input_sl = k.size(0);
+  int64_t head = k.size(1);
+  int64_t dim = k.size(2);
+
+  dim3 grid(input_sl * head);
+  int vec_size = 8;
+  dim3 block(std::min(int(dim) / vec_size, 1024));
+
+  const T max_fp8 = static_cast<T>(448.0f);
+  const T min_fp8 = static_cast<T>(-448.0f);
+
+  fused_downcast_kernel<T><<<grid, block, 0, stream>>>(
+      static_cast<const T*>(k.data_ptr()),
+      static_cast<const T*>(v.data_ptr()),
+      static_cast<const float*>(k_scale.data_ptr()),
+      static_cast<const float*>(v_scale.data_ptr()),
+      static_cast<__nv_fp8_storage_t*>(k_out.data_ptr()),
+      static_cast<__nv_fp8_storage_t*>(v_out.data_ptr()),
+      input_sl,
+      head,
+      dim,
+      max_fp8,
+      min_fp8,
+      mult,
+      offset,
+      static_cast<const int64_t*>(loc.data_ptr()));
+
+  cudaError_t status = cudaGetLastError();
+  TORCH_CHECK(status == cudaSuccess, "Kernel launch failed: " + std::string(cudaGetErrorString(status)));
+}
+
+void downcast_fp8(
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& k_out,
+    at::Tensor& v_out,
+    at::Tensor& k_scale,
+    at::Tensor& v_scale,
+    at::Tensor& loc,
+    int64_t mult,
+    int64_t offset,
+    int64_t cuda_stream) {
+  CHECK_INPUT(k);
+  CHECK_INPUT(v);
+  CHECK_INPUT(k_out);
+  CHECK_INPUT(v_out);
+
+  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
+  switch (k.scalar_type()) {
+    case at::ScalarType::BFloat16:
+      downcast_fp8_impl<__nv_bfloat16>(k, v, k_out, v_out, k_scale, v_scale, loc, mult, offset, stream);
+      break;
+    case at::ScalarType::Half:
+      downcast_fp8_impl<__half>(k, v, k_out, v_out, k_scale, v_scale, loc, mult, offset, stream);
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported input type for downcast_fp8. Expected bfloat16 or float16.");
+  }
+}
diff --git a/sgl-kernel/csrc/elementwise/concat_mla.cu b/sgl-kernel/csrc/elementwise/concat_mla.cu
new file mode 100644
index 000000000..b6c236333
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/concat_mla.cu
@@ -0,0 +1,117 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDADataType.h>
+#include <cuda_runtime.h>
+
+#include "pytorch_extension_utils.h"
+
+constexpr int NUM_LOCAL_HEADS = 128;
+constexpr int QK_NOPE_HEAD_DIM = 128;
+constexpr int QK_ROPE_HEAD_DIM = 64;
+constexpr int K_HEAD_DIM = QK_NOPE_HEAD_DIM + QK_ROPE_HEAD_DIM;
+
+constexpr int HEAD_CHUNK_SIZE = 16;
+constexpr int NUM_HEAD_CHUNKS = NUM_LOCAL_HEADS / HEAD_CHUNK_SIZE;
+
+__forceinline__ __device__ int get_lane_id() {
+  int lane_id;
+  asm("mov.s32 %0, %laneid;" : "=r"(lane_id));
+  return lane_id;
+}
+
+int ceil_div(int a, int b) {
+  return (a + b - 1) / b;
+}
+
+__global__ void concat_mla_k_kernel(
+    nv_bfloat16* k,
+    nv_bfloat16* k_nope,
+    nv_bfloat16* k_rope,
+    const int num_tokens,
+    const int k_stride_0,
+    const int k_stride_1,
+    const int k_nope_stride_0,
+    const int k_nope_stride_1,
+    const int k_rope_stride_0) {
+  const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+  const int token_id = flat_warp_id / NUM_HEAD_CHUNKS;
+  const int head_chunk_id = flat_warp_id % NUM_HEAD_CHUNKS;
+  const int lane_id = get_lane_id();
+
+  if (token_id >= num_tokens) {
+    return;
+  }
+
+  using KNopeBufType = int2;
+  static_assert(sizeof(KNopeBufType) == QK_NOPE_HEAD_DIM * sizeof(k[0]) / 32);
+  KNopeBufType k_nope_buf[HEAD_CHUNK_SIZE];
+
+  using KRopeBufType = int;
+  static_assert(sizeof(KRopeBufType) == QK_ROPE_HEAD_DIM * sizeof(k[0]) / 32);
+  KRopeBufType k_rope_buf;
+
+  {
+    const int* base_addr = reinterpret_cast<int*>(k_rope + token_id * k_rope_stride_0);
+    k_rope_buf = *(base_addr + lane_id);
+  }
+
+#pragma unroll
+  for (int i = 0; i < HEAD_CHUNK_SIZE; ++i) {
+    const int head_id = head_chunk_id * HEAD_CHUNK_SIZE + i;
+    const int2* base_addr = reinterpret_cast<int2*>(k_nope + token_id * k_nope_stride_0 + head_id * k_nope_stride_1);
+    k_nope_buf[i] = *(base_addr + lane_id);
+  }
+
+#pragma unroll
+  for (int i = 0; i < HEAD_CHUNK_SIZE; ++i) {
+    const int head_id = head_chunk_id * HEAD_CHUNK_SIZE + i;
+
+    {
+      int2* base_addr = reinterpret_cast<int2*>(k + token_id * k_stride_0 + head_id * k_stride_1);
+      *(base_addr + lane_id) = k_nope_buf[i];
+    }
+    {
+      int* base_addr = reinterpret_cast<int*>(k + token_id * k_stride_0 + head_id * k_stride_1 + QK_NOPE_HEAD_DIM);
+      *(base_addr + lane_id) = k_rope_buf;
+    }
+  }
+}
+
+inline void check_tensor(const at::Tensor& t, int64_t shape0, int64_t shape1, int64_t shape2, c10::ScalarType dtype) {
+  TORCH_CHECK_EQ(t.dim(), 3);
+  TORCH_CHECK_EQ(t.size(0), shape0);
+  TORCH_CHECK_EQ(t.size(1), shape1);
+  TORCH_CHECK_EQ(t.size(2), shape2);
+  TORCH_CHECK_EQ(t.dtype(), dtype);
+  TORCH_CHECK(t.device().is_cuda());
+  TORCH_CHECK_EQ(((int64_t)t.data_ptr()) % 16, 0);  // alignment
+}
+
+void concat_mla_k(at::Tensor k, at::Tensor k_nope, at::Tensor k_rope) {
+  const int num_tokens = k.size(0);
+
+  check_tensor(k, num_tokens, NUM_LOCAL_HEADS, K_HEAD_DIM, at::kBFloat16);
+  check_tensor(k_nope, num_tokens, NUM_LOCAL_HEADS, QK_NOPE_HEAD_DIM, at::kBFloat16);
+  check_tensor(k_rope, num_tokens, 1, QK_ROPE_HEAD_DIM, at::kBFloat16);
+  TORCH_CHECK_EQ(k.stride(2), 1);
+  TORCH_CHECK_EQ(k_nope.stride(2), 1);
+  TORCH_CHECK_EQ(k_rope.stride(2), 1);
+
+  const auto stream = at::cuda::getCurrentCUDAStream().stream();
+
+  constexpr int num_warps_per_block = 32;
+  const int grid_size = ceil_div(num_tokens * NUM_HEAD_CHUNKS, num_warps_per_block);
+  const int block_size = num_warps_per_block * 32;
+
+  concat_mla_k_kernel<<<grid_size, block_size, 0, stream>>>(
+      reinterpret_cast<nv_bfloat16*>(k.data_ptr()),
+      reinterpret_cast<nv_bfloat16*>(k_nope.data_ptr()),
+      reinterpret_cast<nv_bfloat16*>(k_rope.data_ptr()),
+      num_tokens,
+      k.stride(0),
+      k.stride(1),
+      k_nope.stride(0),
+      k_nope.stride(1),
+      k_rope.stride(0));
+  cudaError_t err = cudaGetLastError();
+  TORCH_CHECK(err == cudaSuccess, "CUDA kernel launch failed: ", cudaGetErrorString(err));
+}
diff --git a/sgl-kernel/csrc/elementwise/copy.cu b/sgl-kernel/csrc/elementwise/copy.cu
new file mode 100644
index 000000000..09719f510
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/copy.cu
@@ -0,0 +1,58 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include <vector>
+
+template <int N>
+struct InputArray {
+  int values[N];
+};
+
+template <int N>
+__global__ void copy_to_gpu_no_ce_kernel(const InputArray<N> input_array, int* output) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < N) {
+    output[idx] = input_array.values[idx];
+  }
+}
+
+template <int N>
+void copy_to_gpu_no_ce_impl(const at::Tensor& input, at::Tensor& output) {
+  TORCH_CHECK(input.dim() == 1, "input must be 1-D");
+  TORCH_CHECK(static_cast<int>(input.numel()) == N, "input numel must equal template N");
+  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
+  TORCH_CHECK(input.dtype() == torch::kInt32, "input dtype must be int32");
+
+  TORCH_CHECK(output.dim() == 1, "output dim");
+  TORCH_CHECK(static_cast<int>(output.numel()) == N, "output size");
+  TORCH_CHECK(output.is_contiguous(), "output contiguous");
+  TORCH_CHECK(output.dtype() == torch::kInt32, "output dtype");
+
+  TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor");
+  TORCH_CHECK(output.device().is_cuda(), "output must be a CUDA tensor");
+
+  InputArray<N> input_array;
+  const int* input_ptr = input.data_ptr<int>();
+  for (int i = 0; i < N; ++i)
+    input_array.values[i] = input_ptr[i];
+
+  // may use multi thread blocks if performance bottleneck
+  dim3 grid(1);
+  dim3 block(static_cast<int>(input.numel()));
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  copy_to_gpu_no_ce_kernel<<<grid, block, 0, stream>>>(input_array, output.data_ptr<int>());
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+void copy_to_gpu_no_ce(const at::Tensor& input, at::Tensor& output) {
+  int N = static_cast<int>(input.numel());
+  // Can use macro if there are more N needed
+  if (N == 72) {
+    copy_to_gpu_no_ce_impl<72>(input, output);
+  } else if (N == 64) {
+    copy_to_gpu_no_ce_impl<64>(input, output);
+  } else {
+    TORCH_CHECK(false, "unexpected N");
+  }
+}
diff --git a/sgl-kernel/csrc/elementwise/fused_add_rms_norm_kernel.cu b/sgl-kernel/csrc/elementwise/fused_add_rms_norm_kernel.cu
new file mode 100644
index 000000000..02d5b20c9
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/fused_add_rms_norm_kernel.cu
@@ -0,0 +1,59 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include <flashinfer/norm.cuh>
+
+#include "utils.h"
+
+using namespace flashinfer;
+
+void sgl_fused_add_rmsnorm(
+    torch::Tensor input, torch::Tensor residual, torch::Tensor weight, double eps, bool enable_pdl) {
+  CHECK_INPUT(input);
+  CHECK_INPUT(residual);
+  CHECK_INPUT(weight);
+  auto device = input.device();
+  CHECK_EQ(residual.device(), device);
+  CHECK_EQ(weight.device(), device);
+  CHECK_DIM(2, input);     // input: (batch_size, hidden_size)
+  CHECK_DIM(2, residual);  // residual: (batch_size, hidden_size)
+  CHECK_DIM(1, weight);    // weight: (hidden_size)
+  CHECK_EQ(input.size(0), residual.size(0));
+  CHECK_EQ(input.size(1), residual.size(1));
+  CHECK_EQ(input.size(1), weight.size(0));
+  unsigned int batch_size = input.size(0);
+  unsigned int hidden_size = input.size(1);
+
+  cudaStream_t torch_current_stream = at::cuda::getCurrentCUDAStream();
+  // support float16, bfloat16 and float32
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
+    cudaError_t status = norm::FusedAddRMSNorm(
+        static_cast<c_type*>(input.data_ptr()),
+        static_cast<c_type*>(residual.data_ptr()),
+        static_cast<c_type*>(weight.data_ptr()),
+        batch_size,
+        hidden_size,
+        input.stride(0),
+        residual.stride(0),
+        eps,
+        enable_pdl,
+        torch_current_stream);
+    TORCH_CHECK(
+        status == cudaSuccess, "FusedAddRMSNorm failed with error code " + std::string(cudaGetErrorString(status)));
+    return true;
+  });
+}
diff --git a/sgl-kernel/csrc/elementwise/pos_enc.cuh b/sgl-kernel/csrc/elementwise/pos_enc.cuh
new file mode 100644
index 000000000..a2e4e2ebb
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/pos_enc.cuh
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2023 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SGL_POS_ENC_CUH_
+#define SGL_POS_ENC_CUH_
+
+#include <flashinfer/pos_enc.cuh>  // upstream
+
+namespace flashinfer {
+
+namespace kv_buffer_saver {
+
+template <typename DType, typename IdType, uint32_t vec_size>
+__device__ __forceinline__ void prepare(
+    vec_t<float, vec_size>& v_vec,
+    IdType& kv_cache_offset,
+    DType* v,
+    IdType* kv_cache_loc,
+    uint32_t idx,
+    uint32_t tx,
+    uint32_t kv_head_idx,
+    size_t v_stride_n,
+    size_t v_stride_h) {
+  kv_cache_offset = kv_cache_loc[idx];
+
+  DType* v_ptr = v + get_elem_offset_impl(idx, kv_head_idx, 0, v_stride_n, v_stride_h);
+  v_vec.cast_load(v_ptr + tx * vec_size);
+}
+
+template <typename DType, typename IdType, uint32_t vec_size>
+__device__ __forceinline__ void save(
+    IdType& kv_cache_offset,
+    vec_t<float, vec_size>& k_vec,
+    vec_t<float, vec_size>& v_vec,
+    DType* k_buffer,
+    DType* v_buffer,
+    uint32_t idx,
+    uint32_t tx,
+    uint32_t kv_head_idx,
+    size_t k_buffer_stride_n,
+    size_t k_buffer_stride_h,
+    size_t v_buffer_stride_n,
+    size_t v_buffer_stride_h) {
+  DType* k_buffer_ptr =
+      k_buffer + get_elem_offset_impl(kv_cache_offset, kv_head_idx, 0, k_buffer_stride_n, k_buffer_stride_h);
+  DType* v_buffer_ptr =
+      v_buffer + get_elem_offset_impl(kv_cache_offset, kv_head_idx, 0, v_buffer_stride_n, v_buffer_stride_h);
+  k_vec.cast_store(k_buffer_ptr + tx * vec_size);
+  v_vec.cast_store(v_buffer_ptr + tx * vec_size);
+}
+
+}  // namespace kv_buffer_saver
+
+template <
+    bool save_kv_cache,
+    bool interleave,
+    uint32_t head_dim,
+    uint32_t vec_size,
+    uint32_t bdx,
+    typename DType,
+    typename IdType>
+__global__ void BatchQKApplyRotaryPosIdsCosSinCacheEnhancedHeadParallelismKernel(
+    DType* q,
+    DType* k,
+    DType* v,
+    DType* q_rope,
+    DType* k_rope,
+    DType* k_buffer,
+    DType* v_buffer,
+    float* __restrict__ cos_sin_cache,
+    IdType* __restrict__ pos_ids,
+    uint32_t nnz,
+    uint32_t num_qo_heads,
+    uint32_t num_kv_heads,
+    uint32_t rotary_dim,
+    size_t q_stride_n,
+    size_t q_stride_h,
+    size_t k_stride_n,
+    size_t k_stride_h,
+    size_t v_stride_n,
+    size_t v_stride_h,
+    size_t q_rope_stride_n,
+    size_t q_rope_stride_h,
+    size_t k_rope_stride_n,
+    size_t k_rope_stride_h,
+    size_t k_buffer_stride_n,
+    size_t k_buffer_stride_h,
+    size_t v_buffer_stride_n,
+    size_t v_buffer_stride_h,
+    IdType* __restrict__ kv_cache_loc) {
+  uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
+  uint32_t by = blockIdx.y;
+  const uint32_t bdy = blockDim.y;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+
+  vec_t<float, vec_size> cos, sin;
+  if (bx * bdy + ty < nnz) {
+    const uint32_t idx = bx * bdy + ty;
+    const IdType pos = pos_ids[idx];
+
+    const int half_rotary_dim = rotary_dim / 2;
+
+    // 1. if interleave:
+    //  - cos = cos_sin_cache[pos_id][tx * vec_size // 2]
+    //  - sin = cos_sin_cache[pos_id][(rot_dim // 2) + tx * vec_size // 2]
+    // 2. if not interleave
+    //  - cos = cos_cache[pos_id][(tx * vec_size) % (rot_dim // 2)]
+    //  - sin = sin_cache[pos_id][(rot_dim // 2) + (tx * vec_size) % (rot_dim // 2)]
+    if (tx * vec_size < rotary_dim) {
+      int sin_offset = rotary_dim / 2;
+      int vec_idx;
+      if constexpr (interleave) {
+        vec_idx = (tx * vec_size) / 2;  // Force integer division
+      } else {
+        vec_idx = (tx * vec_size) % half_rotary_dim;  // Use half_rotary_dim
+      }
+      cos.load(cos_sin_cache + (pos * rotary_dim) + vec_idx);
+      sin.load(cos_sin_cache + (pos * rotary_dim) + (sin_offset + vec_idx));
+    }
+
+    if (by < num_qo_heads) {
+      uint32_t qo_head_idx = by;
+      DType* q_ptr = q + get_elem_offset_impl(idx, qo_head_idx, 0, q_stride_n, q_stride_h);
+      DType* q_rope_ptr = q_rope + get_elem_offset_impl(idx, qo_head_idx, 0, q_rope_stride_n, q_rope_stride_h);
+      vec_t<float, vec_size> q_vec;
+      if constexpr (interleave) {
+        q_vec = vec_apply_llama_rope_cos_sin_interleave_reuse_half<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      } else {
+        q_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      }
+      q_vec.cast_store(q_rope_ptr + tx * vec_size);
+    } else {
+      uint32_t kv_head_idx = by - num_qo_heads;
+      DType* k_ptr = k + get_elem_offset_impl(idx, kv_head_idx, 0, k_stride_n, k_stride_h);
+
+      DType* k_rope_ptr = k_rope + get_elem_offset_impl(idx, kv_head_idx, 0, k_rope_stride_n, k_rope_stride_h);
+
+      vec_t<float, vec_size> v_vec;
+      IdType kv_cache_offset;
+      if constexpr (save_kv_cache) {
+        kv_buffer_saver::prepare<DType, IdType, vec_size>(
+            v_vec, kv_cache_offset, v, kv_cache_loc, idx, tx, kv_head_idx, v_stride_n, v_stride_h);
+      }
+
+      vec_t<float, vec_size> k_vec;
+      if constexpr (interleave) {
+        k_vec = vec_apply_llama_rope_cos_sin_interleave_reuse_half<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      } else {
+        k_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      }
+      k_vec.cast_store(k_rope_ptr + tx * vec_size);
+
+      if constexpr (save_kv_cache) {
+        kv_buffer_saver::save<DType, IdType, vec_size>(
+            kv_cache_offset,
+            k_vec,
+            v_vec,
+            k_buffer,
+            v_buffer,
+            idx,
+            tx,
+            kv_head_idx,
+            k_buffer_stride_n,
+            k_buffer_stride_h,
+            v_buffer_stride_n,
+            v_buffer_stride_h);
+      }
+    }
+  }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <
+    bool save_kv_cache,
+    bool interleave,
+    uint32_t head_dim,
+    uint32_t vec_size,
+    uint32_t bdx,
+    typename DType,
+    typename IdType>
+__global__ void BatchQKApplyRotaryPosIdsCosSinCacheEnhancedKernel(
+    DType* q,
+    DType* k,
+    DType* v,
+    DType* q_rope,
+    DType* k_rope,
+    DType* k_buffer,
+    DType* v_buffer,
+    float* __restrict__ cos_sin_cache,
+    IdType* __restrict__ pos_ids,
+    uint32_t nnz,
+    uint32_t num_qo_heads,
+    uint32_t num_kv_heads,
+    uint32_t rotary_dim,
+    size_t q_stride_n,
+    size_t q_stride_h,
+    size_t k_stride_n,
+    size_t k_stride_h,
+    size_t v_stride_n,
+    size_t v_stride_h,
+    size_t q_rope_stride_n,
+    size_t q_rope_stride_h,
+    size_t k_rope_stride_n,
+    size_t k_rope_stride_h,
+    size_t k_buffer_stride_n,
+    size_t k_buffer_stride_h,
+    size_t v_buffer_stride_n,
+    size_t v_buffer_stride_h,
+    IdType* __restrict__ kv_cache_loc) {
+  uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
+  const uint32_t bdy = blockDim.y;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+
+  vec_t<float, vec_size> cos, sin;
+  if (bx * bdy + ty < nnz) {
+    const uint32_t idx = bx * bdy + ty;
+    const IdType pos = pos_ids[idx];
+    const int half_rotary_dim = rotary_dim / 2;
+
+    // 1. if interleave:
+    //  - cos = cos_sin_cache[pos_id][tx * vec_size // 2]
+    //  - sin = cos_sin_cache[pos_id][(rot_dim // 2) + tx * vec_size // 2]
+    // 2. if not interleave
+    //  - cos = cos_cache[pos_id][(tx * vec_size) % (rot_dim // 2)]
+    //  - sin = sin_cache[pos_id][(rot_dim // 2) + (tx * vec_size) % (rot_dim // 2)]
+    if (tx * vec_size < rotary_dim) {
+      int sin_offset = rotary_dim / 2;
+      int vec_idx;
+      if constexpr (interleave) {
+        vec_idx = (tx * vec_size) / 2;  // Force integer division
+      } else {
+        vec_idx = (tx * vec_size) % half_rotary_dim;  // Use half_rotary_dim
+      }
+      cos.load(cos_sin_cache + (pos * rotary_dim) + vec_idx);
+      sin.load(cos_sin_cache + (pos * rotary_dim) + (sin_offset + vec_idx));
+    }
+
+    // not to unroll the loop, because num head might be large and might lead to worse performance
+#pragma unroll 1
+    for (uint32_t qo_head_idx = 0; qo_head_idx < num_qo_heads; ++qo_head_idx) {
+      DType* q_ptr = q + get_elem_offset_impl(idx, qo_head_idx, 0, q_stride_n, q_stride_h);
+      DType* q_rope_ptr = q_rope + get_elem_offset_impl(idx, qo_head_idx, 0, q_rope_stride_n, q_rope_stride_h);
+      vec_t<float, vec_size> q_vec;
+      if constexpr (interleave) {
+        q_vec = vec_apply_llama_rope_cos_sin_interleave_reuse_half<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      } else {
+        q_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(q_ptr, cos, sin, rotary_dim);
+      }
+      q_vec.cast_store(q_rope_ptr + tx * vec_size);
+    }
+
+#pragma unroll 1
+    for (uint32_t kv_head_idx = 0; kv_head_idx < num_kv_heads; ++kv_head_idx) {
+      DType* k_ptr = k + get_elem_offset_impl(idx, kv_head_idx, 0, k_stride_n, k_stride_h);
+
+      DType* k_rope_ptr = k_rope + get_elem_offset_impl(idx, kv_head_idx, 0, k_rope_stride_n, k_rope_stride_h);
+
+      vec_t<float, vec_size> v_vec;
+      IdType kv_cache_offset;
+      if constexpr (save_kv_cache) {
+        kv_buffer_saver::prepare<DType, IdType, vec_size>(
+            v_vec, kv_cache_offset, v, kv_cache_loc, idx, tx, kv_head_idx, v_stride_n, v_stride_h);
+      }
+
+      vec_t<float, vec_size> k_vec;
+      if constexpr (interleave) {
+        k_vec = vec_apply_llama_rope_cos_sin_interleave_reuse_half<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      } else {
+        k_vec = vec_apply_llama_rope_cos_sin<vec_size, bdx>(k_ptr, cos, sin, rotary_dim);
+      }
+      k_vec.cast_store(k_rope_ptr + tx * vec_size);
+
+      if constexpr (save_kv_cache) {
+        kv_buffer_saver::save<DType, IdType, vec_size>(
+            kv_cache_offset,
+            k_vec,
+            v_vec,
+            k_buffer,
+            v_buffer,
+            idx,
+            tx,
+            kv_head_idx,
+            k_buffer_stride_n,
+            k_buffer_stride_h,
+            v_buffer_stride_n,
+            v_buffer_stride_h);
+      }
+    }
+  }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+#define DISPATCH_SAVE_KV_CACHE(save_kv_cache, SAVE_KV_CACHE, ...) \
+  if (save_kv_cache) {                                            \
+    const bool SAVE_KV_CACHE = true;                              \
+    __VA_ARGS__                                                   \
+  } else {                                                        \
+    const bool SAVE_KV_CACHE = false;                             \
+    __VA_ARGS__                                                   \
+  }
+
+template <typename DType, typename IdType>
+cudaError_t BatchQKApplyRotaryPosIdsCosSinCacheEnhanced(
+    DType* q,
+    DType* k,
+    DType* v,
+    DType* q_rope,
+    DType* k_rope,
+    DType* k_buffer,
+    DType* v_buffer,
+    float* cos_sin_cache,
+    IdType* pos_ids,
+    uint32_t nnz,
+    uint32_t num_qo_heads,
+    uint32_t num_kv_heads,
+    uint32_t rotary_dim,
+    uint32_t head_dim,
+    size_t q_stride_n,
+    size_t q_stride_h,
+    size_t k_stride_n,
+    size_t k_stride_h,
+    size_t v_stride_n,
+    size_t v_stride_h,
+    size_t q_rope_stride_n,
+    size_t q_rope_stride_h,
+    size_t k_rope_stride_n,
+    size_t k_rope_stride_h,
+    size_t k_buffer_stride_n,
+    size_t k_buffer_stride_h,
+    size_t v_buffer_stride_n,
+    size_t v_buffer_stride_h,
+    IdType* kv_cache_loc,
+    bool interleave,
+    bool save_kv_cache,
+    bool enable_pdl,
+    cudaStream_t stream = nullptr) {
+  int dev_id = 0;
+  int num_sms = 0;
+  FLASHINFER_CUDA_CALL(cudaGetDevice(&dev_id));
+  FLASHINFER_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+
+#define LAUNCH_KERNEL_RAW(kernel_name)                                \
+  do {                                                                \
+    cudaLaunchConfig_t config = {};                                   \
+    config.gridDim = nblks;                                           \
+    config.blockDim = nthrs;                                          \
+    config.dynamicSmemBytes = 0;                                      \
+    config.stream = stream;                                           \
+    cudaLaunchAttribute attrs[1] = {};                                \
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; \
+    attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; \
+    config.numAttrs = 1;                                              \
+    config.attrs = attrs;                                             \
+                                                                      \
+    FLASHINFER_CUDA_CALL(cudaLaunchKernelEx(                          \
+        &config,                                                      \
+        kernel_name,                                                  \
+        q,                                                            \
+        k,                                                            \
+        v,                                                            \
+        q_rope,                                                       \
+        k_rope,                                                       \
+        k_buffer,                                                     \
+        v_buffer,                                                     \
+        cos_sin_cache,                                                \
+        pos_ids,                                                      \
+        nnz,                                                          \
+        num_qo_heads,                                                 \
+        num_kv_heads,                                                 \
+        rotary_dim,                                                   \
+        q_stride_n,                                                   \
+        q_stride_h,                                                   \
+        k_stride_n,                                                   \
+        k_stride_h,                                                   \
+        v_stride_n,                                                   \
+        v_stride_h,                                                   \
+        q_rope_stride_n,                                              \
+        q_rope_stride_h,                                              \
+        k_rope_stride_n,                                              \
+        k_rope_stride_h,                                              \
+        k_buffer_stride_n,                                            \
+        k_buffer_stride_h,                                            \
+        v_buffer_stride_n,                                            \
+        v_buffer_stride_h,                                            \
+        kv_cache_loc));                                               \
+  } while (0)
+
+  DISPATCH_SAVE_KV_CACHE(save_kv_cache, SAVE_KV_CACHE, {
+    DISPATCH_INTERLEAVE(interleave, INTERLEAVE, {
+      DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, {
+        // operate on 16 Bytes at a time
+        constexpr uint32_t vec_size = std::max(16 / sizeof(DType), HEAD_DIM / 32);
+        // how many threads needed per head_dim
+        constexpr uint32_t bdx = HEAD_DIM / vec_size;
+        // how many threads needed per block
+        uint32_t num_threads = std::max(128U, bdx);
+        // how many tokens can we process in a block
+        uint32_t bdy = num_threads / bdx;
+        // how many blocks needed to process all tokens
+        uint32_t nblks_x = (nnz + bdy - 1) / bdy;
+
+        auto kernel_0 = BatchQKApplyRotaryPosIdsCosSinCacheEnhancedKernel<
+            SAVE_KV_CACHE,
+            INTERLEAVE,
+            HEAD_DIM,
+            vec_size,
+            bdx,
+            DType,
+            IdType>;
+
+        int num_blocks_per_sm_0 = 0;
+        FLASHINFER_CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+            &num_blocks_per_sm_0, kernel_0, num_threads, /*smem_size=*/0));
+        uint32_t num_ctas_0 = num_blocks_per_sm_0 * num_sms;
+
+        if ((nnz + bdy - 1) / bdy >= num_ctas_0) {
+          dim3 nblks(nblks_x);
+          dim3 nthrs(bdx, bdy);
+          LAUNCH_KERNEL_RAW(kernel_0);
+        } else {
+          dim3 nblks(nblks_x, num_qo_heads + num_kv_heads);
+          dim3 nthrs(bdx, bdy);
+          auto kernel_1 = BatchQKApplyRotaryPosIdsCosSinCacheEnhancedHeadParallelismKernel<
+              SAVE_KV_CACHE,
+              INTERLEAVE,
+              HEAD_DIM,
+              vec_size,
+              bdx,
+              DType,
+              IdType>;
+          LAUNCH_KERNEL_RAW(kernel_1);
+        }
+      });
+    });
+  });
+#undef LAUNCH_KERNEL_RAW
+
+  return cudaSuccess;
+}
+
+}  // namespace flashinfer
+
+#endif  // SGL_POS_ENC_CUH_
diff --git a/sgl-kernel/csrc/elementwise/rope.cu b/sgl-kernel/csrc/elementwise/rope.cu
new file mode 100644
index 000000000..041558f61
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/rope.cu
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pos_enc.cuh"
+#include "pytorch_extension_utils.h"
+
+using namespace flashinfer;
+
+void apply_rope_pos_ids_cos_sin_cache(
+    at::Tensor q,
+    at::Tensor k,
+    at::Tensor q_rope,
+    at::Tensor k_rope,
+    at::Tensor cos_sin_cache,
+    at::Tensor pos_ids,
+    bool interleave,
+    bool enable_pdl,
+    int64_t cuda_stream,
+    const std::optional<at::Tensor>& v,
+    const std::optional<at::Tensor>& k_buffer,
+    const std::optional<at::Tensor>& v_buffer,
+    const std::optional<at::Tensor>& kv_cache_loc) {
+  CHECK_LAST_DIM_CONTIGUOUS(q);
+  CHECK_LAST_DIM_CONTIGUOUS(k);
+
+  const bool save_kv_cache = v.has_value();
+  if (save_kv_cache) {
+    TORCH_CHECK(v.has_value());
+    TORCH_CHECK(k_buffer.has_value());
+    TORCH_CHECK(v_buffer.has_value());
+    TORCH_CHECK(kv_cache_loc.has_value());
+    CHECK_LAST_DIM_CONTIGUOUS(v.value());
+    CHECK_LAST_DIM_CONTIGUOUS(k_buffer.value());
+    CHECK_LAST_DIM_CONTIGUOUS(v_buffer.value());
+    CHECK_DIM(3, k_buffer.value());      // k_buffer: (nnz, H_K, D)
+    CHECK_DIM(3, v_buffer.value());      // v_buffer: (nnz, H_V, D)
+    CHECK_DIM(3, v.value());             // v: (nnz, H_V, D)
+    CHECK_DIM(1, kv_cache_loc.value());  // v: (n)
+    CHECK_INPUT(kv_cache_loc.value());
+  }
+  size_t k_buffer_stride_n = save_kv_cache ? k_buffer->stride(0) : 0;
+  size_t k_buffer_stride_h = save_kv_cache ? k_buffer->stride(1) : 0;
+  size_t v_buffer_stride_n = save_kv_cache ? v_buffer->stride(0) : 0;
+  size_t v_buffer_stride_h = save_kv_cache ? v_buffer->stride(1) : 0;
+  size_t v_stride_n = save_kv_cache ? v->stride(0) : 0;
+  size_t v_stride_h = save_kv_cache ? v->stride(1) : 0;
+  auto kv_cache_loc_ptr = save_kv_cache ? static_cast<int64_t*>(kv_cache_loc->data_ptr()) : nullptr;
+
+  CHECK_INPUT(cos_sin_cache);
+  CHECK_INPUT(pos_ids);
+  auto device = q.device();
+  CHECK_EQ(k.device(), device);
+  CHECK_EQ(cos_sin_cache.device(), device);
+  CHECK_EQ(pos_ids.device(), device);
+  CHECK_DIM(3, q);  // q: (nnz, H_Q, D)
+  CHECK_DIM(3, k);  // k: (nnz, H_K, D)
+
+  // cos_sin_cache: (max_seq_len, R)
+  // First half of R is cos, second half is sin
+  CHECK_DIM(2, cos_sin_cache);
+  CHECK_EQ(q.size(0), k.size(0));
+  CHECK_EQ(q.size(2), k.size(2));
+  unsigned int rotary_dim = cos_sin_cache.size(1);
+  unsigned int num_qo_heads = q.size(1);
+  unsigned int num_kv_heads = k.size(1);
+  unsigned int head_dim = q.size(2);
+  unsigned int nnz = q.size(0);
+  size_t q_stride_n = q.stride(0);
+  size_t q_stride_h = q.stride(1);
+  size_t k_stride_n = k.stride(0);
+  size_t k_stride_h = k.stride(1);
+
+  size_t q_rope_stride_n = q_rope.stride(0);
+  size_t q_rope_stride_h = q_rope.stride(1);
+  size_t k_rope_stride_n = k_rope.stride(0);
+  size_t k_rope_stride_h = k_rope.stride(1);
+
+  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(q.scalar_type(), c_type, [&] {
+    // TODO temporarily only use `BatchQKApplyRotaryPosIdsCosSinCacheEnhanced` when save_kv_cache
+    // to avoid changing original code path; but this branch is feature-complete and should switch to this later
+    if (save_kv_cache) {
+      cudaError_t status = BatchQKApplyRotaryPosIdsCosSinCacheEnhanced(
+          static_cast<c_type*>(q.data_ptr()),
+          static_cast<c_type*>(k.data_ptr()),
+          save_kv_cache ? static_cast<c_type*>(v->data_ptr()) : nullptr,
+          static_cast<c_type*>(q_rope.data_ptr()),
+          static_cast<c_type*>(k_rope.data_ptr()),
+          save_kv_cache ? static_cast<c_type*>(k_buffer->data_ptr()) : nullptr,
+          save_kv_cache ? static_cast<c_type*>(v_buffer->data_ptr()) : nullptr,
+          static_cast<float*>(cos_sin_cache.data_ptr()),
+          static_cast<int64_t*>(pos_ids.data_ptr()),
+          nnz,
+          num_qo_heads,
+          num_kv_heads,
+          rotary_dim,
+          head_dim,
+          q_stride_n,
+          q_stride_h,
+          k_stride_n,
+          k_stride_h,
+          v_stride_n,
+          v_stride_h,
+          q_rope_stride_n,
+          q_rope_stride_h,
+          k_rope_stride_n,
+          k_rope_stride_h,
+          k_buffer_stride_n,
+          k_buffer_stride_h,
+          v_buffer_stride_n,
+          v_buffer_stride_h,
+          kv_cache_loc_ptr,
+          interleave,
+          save_kv_cache,
+          enable_pdl,
+          stream);
+      TORCH_CHECK(
+          status == cudaSuccess,
+          "BatchQKApplyRotaryPosIdsCosSinCacheEnhanced failed with error code " +
+              std::string(cudaGetErrorString(status)));
+    } else {
+      TORCH_CHECK(!enable_pdl);
+      cudaError_t status = BatchQKApplyRotaryPosIdsCosSinCache(
+          static_cast<c_type*>(q.data_ptr()),
+          static_cast<c_type*>(k.data_ptr()),
+          static_cast<c_type*>(q_rope.data_ptr()),
+          static_cast<c_type*>(k_rope.data_ptr()),
+          static_cast<float*>(cos_sin_cache.data_ptr()),
+          static_cast<int64_t*>(pos_ids.data_ptr()),
+          nnz,
+          num_qo_heads,
+          num_kv_heads,
+          rotary_dim,
+          head_dim,
+          q_stride_n,
+          q_stride_h,
+          k_stride_n,
+          k_stride_h,
+          q_rope_stride_n,
+          q_rope_stride_h,
+          k_rope_stride_n,
+          k_rope_stride_h,
+          interleave,
+          stream);
+      TORCH_CHECK(
+          status == cudaSuccess,
+          "BatchQKApplyRotaryPosIdsCosSinCache failed with error code " + std::string(cudaGetErrorString(status)));
+    }
+    return true;
+  });
+}
diff --git a/sgl-kernel/csrc/flash_extension.cc b/sgl-kernel/csrc/flash_extension.cc
new file mode 100644
index 000000000..f80db673f
--- /dev/null
+++ b/sgl-kernel/csrc/flash_extension.cc
@@ -0,0 +1,63 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <torch/all.h>
+#include <torch/library.h>
+
+#include "sgl_flash_kernel_ops.h"
+
+TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
+  /*
+   * From flash-attention
+   */
+  m.def(
+      "fwd(Tensor!  q,"
+      "    Tensor   k,"
+      "    Tensor   v,"
+      "    Tensor?  k_new,"
+      "    Tensor?  v_new,"
+      "    Tensor?  q_v,"
+      "    Tensor!? out,"
+      "    Tensor?  cu_seqlens_q,"
+      "    Tensor?  cu_seqlens_k,"
+      "    Tensor?  cu_seqlens_k_new,"
+      "    Tensor?  seqused_q,"
+      "    Tensor?  seqused_k,"
+      "    int?     max_seqlen_q,"
+      "    int?     max_seqlen_k,"
+      "    Tensor?  page_table,"
+      "    Tensor?  kv_batch_idx,"
+      "    Tensor?  leftpad_k,"
+      "    Tensor?  rotary_cos,"
+      "    Tensor?  rotary_sin,"
+      "    Tensor?  seqlens_rotary,"
+      "    Tensor?  q_descale,"
+      "    Tensor?  k_descale,"
+      "    Tensor?  v_descale,"
+      "    float    softmax_scale,"
+      "    bool     is_causal,"
+      "    int      window_size_left,"
+      "    int      window_size_right,"
+      "    float    softcap,"
+      "    bool     is_rotary_interleaved,"
+      "    Tensor?  scheduler_metadata,"
+      "    int      num_splits,"
+      "    bool?    pack_gqa,"
+      "    int      sm_margin,"
+      "    Tensor?  sinks) -> Tensor[]");
+  m.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));
+}
+
+REGISTER_EXTENSION(flash_ops)
diff --git a/sgl-kernel/csrc/gemm/awq_kernel.cu b/sgl-kernel/csrc/gemm/awq_kernel.cu
new file mode 100644
index 000000000..eec933689
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/awq_kernel.cu
@@ -0,0 +1,221 @@
+// Adapted from
+// https://github.com/vllm-project/vllm/blob/eb59b5a6cba6727d3727c0372258db9002f687c1/csrc/quantization/awq/gemm_kernels.cu#L350
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <torch/all.h>
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#include <cuda_bf16.h>
+#endif
+
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(res) : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
+  uint4 result;
+
+  uint32_t* h = reinterpret_cast<uint32_t*>(&result);
+  uint32_t const i4s = reinterpret_cast<uint32_t const&>(source);
+
+  // First, we extract the i4s and construct an intermediate fp16 number.
+  static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
+  static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
+  static constexpr uint32_t TOP_MASK = 0x00f000f0;
+  static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;
+
+  // Note that the entire sequence only requires 1 shift instruction. This is
+  // thanks to the register packing format and the fact that we force our
+  // integers to be unsigned, and account for this in the fp16 subtractions. In
+  // addition, I exploit the fact that sub and fma have the same throughput in
+  // order to convert elt_23 and elt_67 to fp16 without having to shift them to
+  // the bottom bits before hand.
+
+  // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW
+  // dependency if we issue immediately before required.
+  const uint32_t top_i4s = i4s >> 8;
+  // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[0])
+               : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
+  // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[1])
+               : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
+  // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[2])
+               : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
+  // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(h[3])
+               : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
+
+  // This is the half2 {1024, 1024} represented as an integer.
+  static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
+  // This is the half2 {1 / 16, 1 / 16} represented as an integer.
+  static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
+  // This is the half2 {-64, -64} represented as an integer.
+  static constexpr uint32_t NEG_64 = 0xd400d400;
+
+  // Finally, we construct the output numbers.
+  // Convert elt_01
+  asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
+  // Convert elt_23
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
+  // Convert elt_45
+  asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
+  // Convert elt_67
+  asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
+
+  return result;
+#else
+  assert(false);
+  return {};
+#endif
+}
+
+__device__ uint4 dequantize_s4_to_bf16x2(uint32_t const& source) {
+#if CUDA_VERSION >= 12000
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+  uint4 result;
+  uint32_t* h = reinterpret_cast<uint32_t*>(&result);
+  uint32_t const i4s = source;
+
+  // Define masks and constants
+  static constexpr uint32_t MASK = 0x000f000f;
+  static constexpr uint32_t EX = 0x43004300;
+  static constexpr uint32_t MUL = 0x3F803F80;
+  static constexpr uint32_t ADD = 0xC300C300;
+
+  int lo0 = lop3<(0xf0 & 0xcc) | 0xaa>(i4s, MASK, EX);
+  int hi0 = lop3<(0xf0 & 0xcc) | 0xaa>(i4s >> 4, MASK, EX);
+  int lo1 = lop3<(0xf0 & 0xcc) | 0xaa>(i4s >> 8, MASK, EX);
+  int hi1 = lop3<(0xf0 & 0xcc) | 0xaa>(i4s >> 12, MASK, EX);
+
+  nv_bfloat162* res = reinterpret_cast<nv_bfloat162*>(h);
+  res[0] = __hfma2(
+      *reinterpret_cast<nv_bfloat162*>(&lo0),
+      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  res[1] = __hfma2(
+      *reinterpret_cast<nv_bfloat162*>(&hi0),
+      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  res[2] = __hfma2(
+      *reinterpret_cast<nv_bfloat162*>(&lo1),
+      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  res[3] = __hfma2(
+      *reinterpret_cast<nv_bfloat162*>(&hi1),
+      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+
+  return result;
+#else
+  assert(false);
+  return {};
+#endif
+#endif
+}
+
+template <typename OutputT>
+__global__ void __launch_bounds__(256) dequantize_weights(
+    int* __restrict__ qweight,
+    OutputT* __restrict__ scales,
+    int* __restrict__ qzeros,
+    OutputT* __restrict__ output,
+    int group_size,
+    int qweight_cols,
+    int qweight_rows) {
+#if CUDA_VERSION >= 12000
+  int col = blockIdx.x * blockDim.x + threadIdx.x;
+  int row = blockIdx.y * blockDim.y + threadIdx.y;
+  if (col >= qweight_cols || row >= qweight_rows) return;
+
+  int group_idx = row / group_size;
+  int scale_offset = 8 * col + group_idx * qweight_cols * 8;
+  uint4 loaded_scale = *(uint4*)(scales + scale_offset);
+
+  // Handle different data types
+  if constexpr (std::is_same<OutputT, half>::value) {
+    // FP16 path
+    uint4 zeros = dequantize_s4_to_fp16x2(qzeros[col + group_idx * qweight_cols]);
+    uint4 weight_fp16 = dequantize_s4_to_fp16x2(qweight[col + row * qweight_cols]);
+
+    // Use PTX assembly for FP16 operations
+    asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.x) : "r"(weight_fp16.x), "r"(zeros.x));
+    asm volatile("mul.rn.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.x) : "r"(weight_fp16.x), "r"(loaded_scale.x));
+    asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.y) : "r"(weight_fp16.y), "r"(zeros.y));
+    asm volatile("mul.rn.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.y) : "r"(weight_fp16.y), "r"(loaded_scale.y));
+    asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.z) : "r"(weight_fp16.z), "r"(zeros.z));
+    asm volatile("mul.rn.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.z) : "r"(weight_fp16.z), "r"(loaded_scale.z));
+    asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.w) : "r"(weight_fp16.w), "r"(zeros.w));
+    asm volatile("mul.rn.f16x2 %0, %1, %2;\n" : "=r"(weight_fp16.w) : "r"(weight_fp16.w), "r"(loaded_scale.w));
+
+    OutputT* output_ptr = output + 8 * col + 8 * row * qweight_cols;
+    *(uint4*)output_ptr = weight_fp16;
+  } else if constexpr (std::is_same<OutputT, __nv_bfloat16>::value) {
+    uint4 weight_raw = dequantize_s4_to_bf16x2(qweight[col + row * qweight_cols]);
+    uint4 zero_raw = dequantize_s4_to_bf16x2(qzeros[col + group_idx * qweight_cols]);
+    uint4 scale_raw = *reinterpret_cast<uint4*>(scales + scale_offset);
+
+    // Vectorized processing (each uint4 contains 4 nv_bfloat162)
+    nv_bfloat162* weight_vec = reinterpret_cast<nv_bfloat162*>(&weight_raw);
+    nv_bfloat162* zero_vec = reinterpret_cast<nv_bfloat162*>(&zero_raw);
+    nv_bfloat162* scale_vec = reinterpret_cast<nv_bfloat162*>(&scale_raw);
+
+// Single instruction dual-channel operation
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {  // uint4 = 4 * nv_bfloat162
+      weight_vec[i] = __hmul2(__hsub2(weight_vec[i], zero_vec[i]), scale_vec[i]);
+    }
+
+    // Directly store to OutputT array (guaranteed contiguous memory)
+    OutputT* output_ptr = output + 8 * col + row * qweight_cols * 8;
+    static_assert(sizeof(uint4) == 8 * sizeof(OutputT), "Memory layout mismatch");
+    *reinterpret_cast<uint4*>(output_ptr) = weight_raw;
+  }
+#endif
+}
+
+torch::Tensor awq_dequantize(torch::Tensor qweight, torch::Tensor scales, torch::Tensor qzeros) {
+  int qweight_rows = qweight.size(0);
+  int qweight_cols = qweight.size(1);
+  int group_size = qweight_rows / scales.size(0);
+
+  int x_num_threads = 16;
+  int y_num_threads = 16;
+  int x_blocks = (qweight_cols + x_num_threads - 1) / x_num_threads;
+  int y_blocks = (qweight_rows + y_num_threads - 1) / y_num_threads;
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(qweight));
+
+  auto output_tensor_options = torch::TensorOptions().dtype(scales.dtype()).device(scales.device());
+  at::Tensor output = torch::empty({qweight_rows, qweight_cols * 8}, output_tensor_options);
+
+  auto _qweight = reinterpret_cast<int*>(qweight.data_ptr<int>());
+  auto _zeros = reinterpret_cast<int*>(qzeros.data_ptr<int>());
+
+  dim3 num_blocks(x_blocks, y_blocks);
+  dim3 threads_per_block(x_num_threads, y_num_threads);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (scales.scalar_type() == at::ScalarType::Half) {
+    auto _scales = reinterpret_cast<half*>(scales.data_ptr<at::Half>());
+    auto _output = reinterpret_cast<half*>(output.data_ptr<at::Half>());
+    dequantize_weights<half><<<num_blocks, threads_per_block, 0, stream>>>(
+        _qweight, _scales, _zeros, _output, group_size, qweight_cols, qweight_rows);
+  } else {
+    auto _scales = reinterpret_cast<__nv_bfloat16*>(scales.data_ptr<at::BFloat16>());
+    auto _output = reinterpret_cast<__nv_bfloat16*>(output.data_ptr<at::BFloat16>());
+    dequantize_weights<__nv_bfloat16><<<num_blocks, threads_per_block, 0, stream>>>(
+        _qweight, _scales, _zeros, _output, group_size, qweight_cols, qweight_rows);
+  }
+
+  return output;
+}
diff --git a/sgl-kernel/csrc/gemm/bmm_fp8.cu b/sgl-kernel/csrc/gemm/bmm_fp8.cu
new file mode 100644
index 000000000..4a82b4b27
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/bmm_fp8.cu
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2024 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <driver_types.h>
+
+#include <flashinfer/gemm/bmm_fp8.cuh>
+
+#include "pytorch_extension_utils.h"
+
+void bmm_fp8(
+    at::Tensor A,
+    at::Tensor B,
+    at::Tensor D,
+    at::Tensor A_scale,
+    at::Tensor B_scale,
+    at::Tensor workspace_buffer,
+    int64_t cublas_handle,
+    int64_t cuda_stream) {
+  TORCH_CHECK(A.is_cuda(), "A must be a CUDA tensor");
+  TORCH_CHECK(B.is_cuda(), "B must be a CUDA tensor");
+  TORCH_CHECK(D.is_cuda(), "D must be a CUDA tensor");
+  TORCH_CHECK(A.dim() == 3, "Expected 3D tensor for A");
+  TORCH_CHECK(B.dim() == 3, "Expected 3D tensor for B");
+  TORCH_CHECK(D.dim() == 3, "Expected 3D tensor for D");
+  TORCH_CHECK(A.size(0) == B.size(0) && A.size(0) == D.size(0), "Batch sizes must match");
+  TORCH_CHECK(A.size(2) == B.size(1), "Incompatible matrix sizes");
+  TORCH_CHECK(A.size(1) == D.size(1) && B.size(2) == D.size(2), "Result tensor has incorrect shape");
+
+  // PyTorch is row major by default. cuBLASLt is column major by default.
+  // We need row major D as expected.
+  // A ^ T * B = D, so D ^ T = B ^ T * A
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP8(B.scalar_type(), b_type, [&] {
+    return DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP8(A.scalar_type(), a_type, [&] {
+      return DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(D.scalar_type(), d_type, [&] {
+        auto batch_size = A.size(0);
+        auto m = A.size(1);
+        auto k = A.size(2);
+        auto n = B.size(2);
+
+        auto lt_handle = reinterpret_cast<cublasLtHandle_t>(cublas_handle);
+        auto stream = reinterpret_cast<cudaStream_t>(cuda_stream);
+
+        auto status = flashinfer::bmm_fp8::bmm_fp8_internal_cublaslt(
+            workspace_buffer.data_ptr(),
+            workspace_buffer.numel(),
+            static_cast<b_type*>(B.data_ptr()),
+            static_cast<a_type*>(A.data_ptr()),
+            static_cast<d_type*>(D.data_ptr()),
+            batch_size,
+            n,
+            m,
+            k,
+            static_cast<float*>(B_scale.data_ptr()),
+            static_cast<float*>(A_scale.data_ptr()),
+            lt_handle,
+            stream);
+        TORCH_CHECK(
+            status == CUBLAS_STATUS_SUCCESS, "bmm_fp8_internal_cublaslt failed: ", cublasGetStatusString(status));
+        return true;
+      });
+    });
+  });
+}
diff --git a/sgl-kernel/csrc/gemm/dsv3_fused_a_gemm.cu b/sgl-kernel/csrc/gemm/dsv3_fused_a_gemm.cu
new file mode 100644
index 000000000..37aff1b9a
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/dsv3_fused_a_gemm.cu
@@ -0,0 +1,673 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/619709fc33bd5dc268f19d6a741fe7ed51c0f8f5/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu
+ *
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2021, NAVER Corp.  Authored by CLOVA.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_bf16.h>
+#include <cuda_runtime.h>
+
+#include "utils.h"
+
+using bf16_t = __nv_bfloat16;
+
+__device__ void hmma_16_8_16_f32acc_bf16ab(
+    float (&d_reg)[4], const bf16_t (&a_reg)[8], const bf16_t (&b_reg)[4], float const (&c_reg)[4]) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  uint32_t a0 = *reinterpret_cast<uint32_t const*>(a_reg + 0);
+  uint32_t a1 = *reinterpret_cast<uint32_t const*>(a_reg + 2);
+  uint32_t a2 = *reinterpret_cast<uint32_t const*>(a_reg + 4);
+  uint32_t a3 = *reinterpret_cast<uint32_t const*>(a_reg + 6);
+  uint32_t b0 = *reinterpret_cast<uint32_t const*>(b_reg + 0);
+  uint32_t b1 = *reinterpret_cast<uint32_t const*>(b_reg + 2);
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+      "{%0,  %1,  %2,  %3},"
+      "{%4,  %5,  %6,  %7},"
+      "{%8,  %9},"
+      "{%10, %11, %12, %13};\n"
+      : "=f"(d_reg[0]), "=f"(d_reg[1]), "=f"(d_reg[2]), "=f"(d_reg[3])
+      : "r"(a0),
+        "r"(a1),
+        "r"(a2),
+        "r"(a3),
+        "r"(b0),
+        "r"(b1),
+        "f"(d_reg[0]),
+        "f"(d_reg[1]),
+        "f"(d_reg[2]),
+        "f"(d_reg[3]));
+#endif
+}
+
+extern "C" {
+__device__ uint32_t __nvvm_get_smem_pointer(void*);
+}
+
+__device__ void ldgsts_128(void const* gPtr, void* sPtr, uint32_t pred) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  if (pred) {
+    uint32_t smemPtrAsUint32 = __nvvm_get_smem_pointer(sPtr);
+    asm volatile("cp.async.cg.shared.global.L2::128B [%0], [%1], %2;\n" ::"r"(smemPtrAsUint32), "l"(gPtr), "n"(16));
+  }
+#endif
+}
+
+__device__ void ldsm_x4(void* smem_ptr, uint32_t* reg_ptr) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
+               : "=r"(reg_ptr[0]), "=r"(reg_ptr[1]), "=r"(reg_ptr[2]), "=r"(reg_ptr[3])
+               : "r"(__nvvm_get_smem_pointer(smem_ptr)));
+#endif
+}
+
+template <class Type>
+__device__ int apply_swizzle_343_on_elem_row_col(int row_idx_, int col_idx_) {
+  uint32_t row_idx = *reinterpret_cast<uint32_t*>(&row_idx_);
+  uint32_t col_idx = *reinterpret_cast<uint32_t*>(&col_idx_);
+  row_idx = row_idx % 8;
+  row_idx = row_idx * (16 / sizeof(Type));
+  col_idx = col_idx ^ row_idx;
+  return *reinterpret_cast<int*>(&col_idx);
+}
+
+__device__ void initialize_barrier(
+    uint64_t* smem_barrier,  // 64 bits user-manged barrier in smem
+    int thread_count = 1)    // Thread count expected to arrive/wait on this barrier
+{
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  uint32_t smem_int_ptr = __nvvm_get_smem_pointer(smem_barrier);
+  asm volatile("mbarrier.init.shared::cta.b64 [%0], %1;\n" ::"r"(smem_int_ptr), "r"(thread_count));
+#endif
+}
+
+// Barrier wait
+__device__ void wait_barrier(
+    uint64_t* smem_barrier,  // 64 bits user-manged barrier in smem
+    int phase_bit)           // Current phase bit the barrier waiting to flip
+{
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  uint32_t smem_int_ptr = __nvvm_get_smem_pointer(smem_barrier);
+  asm volatile(
+      "{\n"
+      ".reg .pred                P1;\n"
+      "LAB_WAIT:\n"
+      "mbarrier.try_wait.parity.shared::cta.b64 P1, [%0], %1;\n"
+      "@P1                       bra DONE;\n"
+      "bra                   LAB_WAIT;\n"
+      "DONE:\n"
+      "}\n" ::"r"(smem_int_ptr),
+      "r"(phase_bit));
+#endif
+}
+
+__device__ bool try_wait_barrier(uint64_t* smem_ptr, int phase_bit) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  uint32_t wait_complete;
+  uint32_t smem_int_ptr = __nvvm_get_smem_pointer(smem_ptr);
+  asm volatile(
+      "{\n\t"
+      ".reg .pred P1; \n\t"
+      "mbarrier.try_wait.parity.shared::cta.b64 P1, [%1], %2; \n\t"
+      "selp.b32 %0, 1, 0, P1; \n\t"
+      "}"
+      : "=r"(wait_complete)
+      : "r"(smem_int_ptr), "r"(phase_bit));
+  return static_cast<bool>(wait_complete);
+#endif
+  return false;
+}
+
+// Barrier arrive
+__device__ void arrive_barrier(uint64_t* smem_barrier)  // 64 bits user-manged barrier in smem
+{
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  uint32_t smem_int_ptr = __nvvm_get_smem_pointer(smem_barrier);
+  asm volatile(
+      "{\n"
+      ".reg .b64 state; \n"
+      "mbarrier.arrive.shared::cta.b64   state, [%0];\n"
+      "}\n" ::"r"(smem_int_ptr));
+#endif
+}
+
+__device__ void ldgsts_arrive(uint64_t* smem_barrier) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  uint32_t smem_int_ptr = __nvvm_get_smem_pointer(smem_barrier);
+  asm volatile("cp.async.mbarrier.arrive.noinc.shared.b64 [%0];" : : "r"(smem_int_ptr));
+#endif
+}
+
+template <int gemm_k, int tile_m, int tile_k, int stage_cnt>
+struct GmemLoaderA {
+  static constexpr int elem_bytes = 2;
+  static constexpr int vec_bytes = 16;
+  static constexpr int vec_elems = vec_bytes / elem_bytes;
+  static constexpr int thread_cnt = 64;
+  static_assert((tile_m * tile_k) % (vec_elems * thread_cnt) == 0);
+  static constexpr int a_inst_cnt_per_iter = (tile_m * tile_k) / (vec_elems * thread_cnt);
+  static_assert(gemm_k % tile_k == 0);
+  static constexpr int k_iter_cnt = gemm_k / tile_k;
+
+  // Extra params to keep the order of k reduction...
+  static constexpr int mma_warp_cnt = 4;
+  static constexpr int per_mma_warp_k = tile_k / mma_warp_cnt;
+  static constexpr int k_each_chunk = gemm_k / mma_warp_cnt;
+
+ private:
+  __device__ int k_project(int tile_k_idx) {
+    return (tile_k_idx / per_mma_warp_k * k_each_chunk) + (tile_k_idx % per_mma_warp_k);
+  }
+
+ public:
+  __device__ GmemLoaderA(bf16_t const* gmem_a_local_, bf16_t* smem_a_, uint64_t* smem_barrier_)
+      : gmem_a(gmem_a_local_), smem_a(smem_a_), smem_barrier(smem_barrier_), local_tid(threadIdx.x % thread_cnt) {}
+
+  __device__ void prepare() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+// swizzle, that's what we want.
+#pragma unroll
+    for (int i = 0; i < a_inst_cnt_per_iter; i++) {
+      int linear_idx = local_tid * vec_elems + i * thread_cnt * vec_elems;
+      int m_idx = linear_idx / tile_k;
+      int k_idx = linear_idx % tile_k;
+      k_idx = apply_swizzle_343_on_elem_row_col<bf16_t>(m_idx, k_idx);
+      a_smem_offsets[i] = m_idx * tile_k + k_idx;
+    }
+#endif
+  }
+
+  __device__ void issue_mainloop() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+#pragma unroll 1
+    for (int loop_idx = 0; loop_idx < k_iter_cnt; loop_idx++) {
+      if (need_wait) {
+        wait_barrier(smem_barrier + 1 + stage_idx * 2, phase_bit);
+      }
+      int next_stage_idx = stage_idx + 1;
+      int next_phase_bit = next_stage_idx == stage_cnt ? phase_bit ^ 1 : phase_bit;
+      next_stage_idx = next_stage_idx == stage_cnt ? 0 : next_stage_idx;
+      if (loop_idx != k_iter_cnt - 1) {
+        need_wait = !try_wait_barrier(smem_barrier + 1 + next_stage_idx * 2, next_phase_bit);
+      }
+
+#pragma unroll
+      for (int i = 0; i < a_inst_cnt_per_iter; i++) {
+        int smem_offset = a_smem_offsets[i];
+        bf16_t* smem_ptr_this_iter = smem_a + stage_idx * tile_m * tile_k + smem_offset;
+        int linear_idx = local_tid * vec_elems + i * thread_cnt * vec_elems;
+        int m_idx = linear_idx / tile_k;
+        int k_idx = linear_idx % tile_k;
+        int gmem_offset = m_idx * gemm_k + k_project(k_idx);
+        bf16_t const* gmem_ptr_this_iter = gmem_a + gmem_offset;
+        ldgsts_128(gmem_ptr_this_iter, smem_ptr_this_iter, true);
+      }
+      ldgsts_arrive(smem_barrier + stage_idx * 2);
+
+      stage_idx = next_stage_idx;
+      phase_bit = next_phase_bit;
+      gmem_a += per_mma_warp_k;
+    }
+#endif
+  }
+
+  bf16_t const* gmem_a;
+  bf16_t* smem_a;
+  uint64_t* smem_barrier;
+  int local_tid;
+  int stage_idx = 0;
+  int phase_bit = 1;
+  bool need_wait = true;
+
+  // per smem_stage, store with swizzle information
+  int a_smem_offsets[a_inst_cnt_per_iter];
+};
+
+template <int gemm_k, int tile_n, int tile_k, int stage_cnt>
+struct GmemLoaderB {
+  static constexpr int elem_bytes = 2;
+  static constexpr int vec_bytes = 16;
+  static constexpr int vec_elems = vec_bytes / elem_bytes;
+  static constexpr int thread_cnt = 64;
+  static_assert((tile_n * tile_k) % (vec_elems * thread_cnt) == 0);
+  static constexpr int b_inst_cnt_per_iter = (tile_n * tile_k) / (vec_elems * thread_cnt);
+  static_assert(gemm_k % tile_k == 0);
+  static constexpr int k_iter_cnt = gemm_k / tile_k;
+
+  // Extra params to keep the order of k reduction...
+  static constexpr int mma_warp_cnt = 4;
+  static constexpr int per_mma_warp_k = tile_k / mma_warp_cnt;
+  static constexpr int k_each_chunk = gemm_k / mma_warp_cnt;
+
+ private:
+  __device__ int k_project(int tile_k_idx) {
+    return (tile_k_idx / per_mma_warp_k * k_each_chunk) + (tile_k_idx % per_mma_warp_k);
+  }
+
+ public:
+  __device__ GmemLoaderB(bf16_t const* gmem_b_local_, bf16_t* smem_b_, uint64_t* smem_barrier_, int gemm_n_)
+      : gmem_b(gmem_b_local_),
+        smem_b(smem_b_),
+        smem_barrier(smem_barrier_),
+        gemm_n(gemm_n_),
+        local_tid(threadIdx.x % thread_cnt) {}
+
+  __device__ void prepare() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+// swizzle, that's what we want.
+#pragma unroll
+    for (int i = 0; i < b_inst_cnt_per_iter; i++) {
+      int linear_idx = local_tid * vec_elems + i * thread_cnt * vec_elems;
+      int n_idx = linear_idx / tile_k;
+      int k_idx = linear_idx % tile_k;
+      k_idx = apply_swizzle_343_on_elem_row_col<bf16_t>(n_idx, k_idx);
+      b_smem_offsets[i] = n_idx * tile_k + k_idx;
+      preds[i] = n_idx < gemm_n;
+    }
+#endif
+  }
+
+  __device__ void issue_mainloop() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    asm volatile("griddepcontrol.wait;");
+#pragma unroll 1
+    for (int loop_idx = 0; loop_idx < k_iter_cnt; loop_idx++) {
+      if (need_wait) {
+        wait_barrier(smem_barrier + 1 + stage_idx * 2, phase_bit);
+      }
+      int next_stage_idx = stage_idx + 1;
+      int next_phase_bit = next_stage_idx == stage_cnt ? phase_bit ^ 1 : phase_bit;
+      next_stage_idx = next_stage_idx == stage_cnt ? 0 : next_stage_idx;
+      if (loop_idx != k_iter_cnt - 1) {
+        need_wait = !try_wait_barrier(smem_barrier + 1 + next_stage_idx * 2, next_phase_bit);
+      }
+#pragma unroll
+      for (int i = 0; i < b_inst_cnt_per_iter; i++) {
+        int smem_offset = b_smem_offsets[i];
+        bf16_t* smem_ptr_this_iter = smem_b + stage_idx * tile_n * tile_k + smem_offset;
+        int linear_idx = local_tid * vec_elems + i * thread_cnt * vec_elems;
+        int n_idx = linear_idx / tile_k;
+        int k_idx = linear_idx % tile_k;
+        int gmem_offset = n_idx * gemm_k + k_project(k_idx);
+        bf16_t const* gmem_ptr_this_iter = gmem_b + gmem_offset;
+        ldgsts_128(gmem_ptr_this_iter, smem_ptr_this_iter, preds[i]);
+      }
+      ldgsts_arrive(smem_barrier + stage_idx * 2);
+
+      stage_idx = next_stage_idx;
+      phase_bit = next_phase_bit;
+      gmem_b += per_mma_warp_k;
+    }
+#endif
+  }
+
+  bf16_t const* gmem_b;
+  bf16_t* smem_b;
+  uint64_t* smem_barrier;
+  int gemm_n;
+  int local_tid;
+  int stage_idx = 0;
+  int phase_bit = 1;
+  bool need_wait = true;
+
+  // per smem_stage, store with swizzle information
+  int b_smem_offsets[b_inst_cnt_per_iter];
+  uint32_t preds[b_inst_cnt_per_iter];
+};
+
+template <int gemm_m, int gemm_k, int tile_m, int tile_n, int tile_k, int stage_cnt>
+struct MmaComputer {
+  static constexpr int elem_bytes = 2;
+  static constexpr int thread_cnt = 128;
+  static_assert(gemm_k % tile_k == 0);
+  static_assert(tile_k % (thread_cnt / 32) == 0);
+  static constexpr int per_warp_tile_k = tile_k / (thread_cnt / 32);
+  static constexpr int k_iter_cnt = gemm_k / tile_k;
+  static constexpr int k_phase_cnt = per_warp_tile_k / 16;
+  static constexpr int m_iter_cnt = (tile_m + 15) / 16;
+  static constexpr int n_iter_cnt = (tile_n + 7) / 8;  // Possible to have non-1 n_iter_cnt for ab_swap m16 case.
+  static_assert(m_iter_cnt == 1);
+  static_assert(n_iter_cnt == 1 || n_iter_cnt == 2);
+
+  __device__ MmaComputer(
+      bf16_t* gmem_c_local_, bf16_t* smem_a_, bf16_t* smem_b_, uint64_t* smem_barrier_, int warp_idx_, int gemm_n_)
+      : gmem_c(gmem_c_local_),
+        smem_a(smem_a_),
+        smem_b(smem_b_),
+        smem_barrier(smem_barrier_),
+        warp_idx(warp_idx_ - (thread_cnt / 32)),
+        gemm_n(gemm_n_) {}
+
+ private:
+  __device__ constexpr int internal_b_atom_func(int tid) {
+    if constexpr (tile_n < 8) {
+      return (tid % tile_n) + ((tid % 8) / tile_n * 0) + tid / 8 * 8 * tile_n;
+    } else {
+      return (tid % 8) + ((tid % 32) / 8 * (tile_n * 8));
+    }
+  }
+
+ public:
+  __device__ void prepare() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+#pragma unroll
+    for (int i = 0; i < k_phase_cnt; i++) {
+      int linear_idx = (lane_idx % 16) + (lane_idx / 16) * 128 + i * 256;
+      int m_idx = linear_idx % tile_m;
+      int k_idx = linear_idx / tile_m + warp_k_offset_in_tile_k;
+      k_idx = apply_swizzle_343_on_elem_row_col<bf16_t>(m_idx, k_idx);
+      a_smem_offsets[0][i] = m_idx * tile_k + k_idx;
+    }
+#pragma unroll
+    for (int n_iter_idx = 0; n_iter_idx < n_iter_cnt; n_iter_idx++) {
+#pragma unroll
+      for (int i = 0; i < k_phase_cnt; i += 2) {  // Special i+=2 for B.
+        int linear_idx = internal_b_atom_func(lane_idx) + i * tile_n * 16 + n_iter_idx * 8;
+        int n_idx = linear_idx % tile_n;
+        int k_idx = linear_idx / tile_n + warp_k_offset_in_tile_k;
+        k_idx = apply_swizzle_343_on_elem_row_col<bf16_t>(n_idx, k_idx);
+        b_smem_offsets[n_iter_idx][i] = n_idx * tile_k + k_idx;
+      }
+    }
+#endif
+  }
+
+  __device__ void issue_mainloop() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+#pragma unroll 1
+    for (int loop_idx = 0; loop_idx < k_iter_cnt; loop_idx++) {
+      wait_barrier(smem_barrier + 0 + stage_idx * 2, phase_bit);
+
+#pragma unroll
+      for (int i = 0; i < k_phase_cnt; i++) {
+        int smem_offset = a_smem_offsets[0][i];
+        bf16_t* smem_ptr_this_iter = smem_a + stage_idx * tile_m * tile_k + smem_offset;
+        ldsm_x4(smem_ptr_this_iter, reinterpret_cast<uint32_t*>(a_reg[0][i]));
+      }
+
+#pragma unroll
+      for (int n_iter_idx = 0; n_iter_idx < n_iter_cnt; n_iter_idx++) {
+#pragma unroll
+        for (int i = 0; i < k_phase_cnt; i += 2) {
+          int smem_offset = b_smem_offsets[n_iter_idx][i];
+          bf16_t* smem_ptr_this_iter = smem_b + stage_idx * tile_n * tile_k + smem_offset;
+          ldsm_x4(smem_ptr_this_iter, reinterpret_cast<uint32_t*>(b_reg[n_iter_idx][i]));
+        }
+      }
+
+#pragma unroll
+      for (int k_iter_idx = 0; k_iter_idx < k_phase_cnt; k_iter_idx++) {
+#pragma unroll
+        for (int n_iter_idx = 0; n_iter_idx < n_iter_cnt; n_iter_idx++) {
+          hmma_16_8_16_f32acc_bf16ab(
+              acc_reg[0][n_iter_idx], a_reg[0][k_iter_idx], b_reg[n_iter_idx][k_iter_idx], acc_reg[0][n_iter_idx]);
+        }
+      }
+      ::arrive_barrier(smem_barrier + 1 + stage_idx * 2);
+      stage_idx += 1;
+      phase_bit = stage_idx == stage_cnt ? phase_bit ^ 1 : phase_bit;
+      stage_idx = stage_idx == stage_cnt ? 0 : stage_idx;
+    }
+#endif
+  }
+
+  __device__ void epi() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    asm volatile("bar.sync %0, %1;" : : "r"(1), "r"(thread_cnt));
+    // reorganize the acc_reg
+    constexpr int thread_m = 2;
+    constexpr int thread_n = 2 * n_iter_cnt;
+    constexpr int cta_mma_n = n_iter_cnt * 8;
+    float acc_reg_reorg[thread_m][thread_n];
+
+    for (int i = 0; i < thread_m; i++) {
+      for (int j = 0; j < thread_n; j++) {
+        acc_reg_reorg[i][j] = acc_reg[0][j / 2][(j % 2) + (i * 2)];
+      }
+    }
+
+    // 4 x cosize(smem_c_layout)
+    float* smem_c = reinterpret_cast<float*>(smem_a);
+    // coord -> index
+    auto smem_c_index_func = [&](int m_idx, int n_idx) {
+      int group_rows = 32 / cta_mma_n;
+      int group_cnt = 2;
+      return (m_idx % group_rows * cta_mma_n) + (m_idx / group_rows * (32 + group_cnt)) + n_idx;
+    };
+    constexpr int cosize_smem_c = ((tile_m * cta_mma_n) / 32) * (32 + 2);
+
+// This should be optimized to STS.64 but can not be STS.128 due to the bank index.
+#pragma unroll
+    for (int m_idx_thread = 0; m_idx_thread < thread_m; m_idx_thread++) {
+#pragma unroll
+      for (int n_idx_thread = 0; n_idx_thread < thread_n; n_idx_thread++) {
+        int m_idx = (lane_idx / 4) + m_idx_thread * 8;
+        int n_idx = ((lane_idx % 4) * 2) + (n_idx_thread % 2) + (n_idx_thread / 2) * 8;
+        smem_c[cosize_smem_c * warp_idx + smem_c_index_func(m_idx, n_idx)] = acc_reg_reorg[m_idx_thread][n_idx_thread];
+      }
+    }
+    asm volatile("bar.sync %0, %1;" : : "r"(1), "r"(thread_cnt));
+
+    if (warp_idx == 0) {
+      constexpr int final_acc_reg_cnt = (tile_m * tile_n + 31) / 32;
+      float acc_final[final_acc_reg_cnt]{};
+
+#pragma unroll
+      for (int reg_idx = 0; reg_idx < final_acc_reg_cnt; reg_idx++) {
+        int linear_idx = reg_idx * 32 + lane_idx;
+        int m_idx = linear_idx % tile_m;
+        int n_idx = linear_idx / tile_m;
+        acc_final[reg_idx] += smem_c[smem_c_index_func(m_idx, n_idx) + 0 * cosize_smem_c] +
+                              smem_c[smem_c_index_func(m_idx, n_idx) + 1 * cosize_smem_c] +
+                              smem_c[smem_c_index_func(m_idx, n_idx) + 2 * cosize_smem_c] +
+                              smem_c[smem_c_index_func(m_idx, n_idx) + 3 * cosize_smem_c];
+      }
+
+#pragma unroll
+      for (int reg_idx = 0; reg_idx < final_acc_reg_cnt; reg_idx++) {
+        int linear_idx = reg_idx * 32 + lane_idx;
+        int m_idx = linear_idx % tile_m;
+        int n_idx = linear_idx / tile_m;
+        if (m_idx < tile_m && n_idx < gemm_n) {
+          gmem_c[n_idx * gemm_m + m_idx] = acc_final[reg_idx];
+        }
+      }
+    }
+#endif
+  }
+
+  bf16_t* gmem_c;
+  bf16_t* smem_a;
+  bf16_t* smem_b;
+  uint64_t* smem_barrier;
+  int warp_idx;
+  int gemm_n;
+  int stage_idx = 0;
+  int phase_bit = 0;
+  int lane_idx = threadIdx.x % 32;
+  int warp_k_offset_in_tile_k = warp_idx * per_warp_tile_k;
+
+  int a_smem_offsets[m_iter_cnt][k_phase_cnt];
+  int b_smem_offsets[n_iter_cnt][k_phase_cnt];
+
+  bf16_t a_reg[m_iter_cnt][k_phase_cnt][8];
+  bf16_t b_reg[n_iter_cnt][k_phase_cnt][4];
+  float acc_reg[m_iter_cnt][n_iter_cnt][4]{};
+};
+
+// AB swapped, kernel is k-major, k-major, m-major
+template <int batch_size, int gemm_m, int gemm_k, int tile_m, int tile_n, int tile_k, int stage_cnt>
+__global__ __launch_bounds__(256, 1) void fused_a_gemm_kernel(
+    bf16_t* output, bf16_t const* mat_a, bf16_t const* mat_b, int gemm_n) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  constexpr int load_thread_cnt = 128;
+  constexpr int compute_thread_cnt = 128;
+  constexpr int thread_cnt = load_thread_cnt + compute_thread_cnt;
+  (void)thread_cnt;
+  static_assert(gemm_m % 16 == 0);
+  static_assert(gemm_k % tile_k == 0);
+  static_assert(gemm_m % tile_m == 0);
+  static_assert(
+      tile_k == 128 || tile_k == 256 || tile_k == 512 ||
+      tile_k == 1024);  // tile_k must be larger than 64 since 4 warp splitK.
+  static_assert(tile_m == 16);
+  constexpr int g2s_vec_bytes = 16;
+  constexpr int a_elem_bytes = 2;
+  constexpr int b_elem_bytes = 2;
+  // constexpr int c_elem_bytes = 2;
+  static_assert((tile_m * a_elem_bytes + tile_n * b_elem_bytes) * tile_k * stage_cnt <= 225 * 1024);
+  static_assert((tile_m * tile_k * a_elem_bytes) % (load_thread_cnt * g2s_vec_bytes) == 0);
+  static_assert((tile_n * tile_k * b_elem_bytes) % (load_thread_cnt * g2s_vec_bytes) == 0);
+
+  extern __shared__ char smem[];
+  uint64_t* smem_barrier = reinterpret_cast<uint64_t*>(smem);  // producer,consumer; producer,consumer; ...
+  bf16_t* smem_a = reinterpret_cast<bf16_t*>(smem + (stage_cnt * 8 * 2 + 1024) / 1024 * 1024);
+  bf16_t* smem_b = smem_a + tile_m * tile_k * stage_cnt;
+
+  int cta_m_idx = tile_m * blockIdx.x;
+  int cta_n_idx = tile_n * blockIdx.y;
+  bf16_t const* gmem_a_local = mat_a + cta_m_idx * gemm_k;
+  bf16_t const* gmem_b_local = mat_b + cta_n_idx * gemm_k;
+  bf16_t* gmem_c_local = output + cta_n_idx * gemm_m + cta_m_idx;
+
+  int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+  if (warp_idx == 4) {
+    for (int i = 0; i < stage_cnt; i++) {
+      initialize_barrier(smem_barrier + i * 2 + 0, load_thread_cnt);     // producer
+      initialize_barrier(smem_barrier + i * 2 + 1, compute_thread_cnt);  // consumer
+    }
+  }
+  __syncthreads();
+
+  if (warp_idx < 2) {
+    GmemLoaderA<gemm_k, tile_m, tile_k, stage_cnt> a_loader(gmem_a_local, smem_a, smem_barrier);
+    a_loader.prepare();
+    a_loader.issue_mainloop();
+  } else if (warp_idx < 4) {
+    GmemLoaderB<gemm_k, tile_n, tile_k, stage_cnt> b_loader(gmem_b_local, smem_b, smem_barrier, gemm_n);
+    b_loader.prepare();
+    b_loader.issue_mainloop();
+  } else {
+    MmaComputer<gemm_m, gemm_k, tile_m, tile_n, tile_k, stage_cnt> mma_computer(
+        gmem_c_local, smem_a, smem_b, smem_barrier, warp_idx, gemm_n);
+    mma_computer.prepare();
+    mma_computer.issue_mainloop();
+    mma_computer.epi();
+  }
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename T, int kHdIn, int kHdOut, int kTileN>
+void invokeFusedAGemm(T* output, T const* mat_a, T const* mat_b, int num_tokens, cudaStream_t const stream) {
+  constexpr int gemm_m = kHdOut;  // 2112
+  int const gemm_n = num_tokens;  // 16
+  constexpr int gemm_k = kHdIn;   // 7168
+  constexpr int batch_size = 1;
+  std::swap(mat_a, mat_b);
+  constexpr int tile_m = 16;
+  constexpr int tile_n = kTileN;                        // 8 or 16
+  constexpr int tile_k = std::max(256, 1024 / tile_n);  // 256
+  constexpr int max_stage_cnt = 1024 * 192 / ((tile_m + tile_n) * tile_k * sizeof(bf16_t));
+  constexpr int k_iter_cnt = gemm_k / tile_k;
+  constexpr int stage_cnt =
+      k_iter_cnt > max_stage_cnt ? max_stage_cnt : k_iter_cnt;  // possible tunable for smallK > 1 wave n. // 22
+  int cta_m_cnt = gemm_m / tile_m;
+  int cta_n_cnt = (gemm_n + tile_n - 1) / tile_n;
+  constexpr int barrier_bytes = (stage_cnt * 16 + 1023) / 1024 * 1024;  // 4096
+  constexpr int smem_bytes = ((tile_m * 2 + tile_n * 2) * tile_k * stage_cnt + barrier_bytes + 1023) / 1024 * 1024;
+
+  dim3 grid(cta_m_cnt, cta_n_cnt, 1);
+  dim3 block_size(256);
+  cudaLaunchConfig_t config;
+  config.gridDim = grid;
+  config.blockDim = block_size;
+  config.dynamicSmemBytes = smem_bytes;
+  config.stream = stream;
+  cudaLaunchAttribute attrs[1];
+  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attrs[0].val.programmaticStreamSerializationAllowed = getEnvEnablePDL();
+  config.numAttrs = 1;
+  config.attrs = attrs;
+  if (smem_bytes >= (48 * 1024)) {
+    cudaFuncSetAttribute(
+        fused_a_gemm_kernel<batch_size, gemm_m, gemm_k, tile_m, tile_n, tile_k, stage_cnt>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        smem_bytes);
+  }
+  cudaLaunchKernelEx(
+      &config,
+      fused_a_gemm_kernel<batch_size, gemm_m, gemm_k, tile_m, tile_n, tile_k, stage_cnt>,
+      output,
+      mat_a,
+      mat_b,
+      gemm_n);
+}
+
+template void invokeFusedAGemm<__nv_bfloat16, 7168, 2112, 8>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, int num_tokens, cudaStream_t);
+
+template void invokeFusedAGemm<__nv_bfloat16, 7168, 2112, 16>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, int num_tokens, cudaStream_t);
+
+void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a, torch::Tensor const& mat_b) {
+  TORCH_CHECK(mat_a.dim() == 2 && mat_b.dim() == 2 && output.dim() == 2);
+  int const num_tokens = mat_a.size(0);
+  int const hd_in = mat_a.size(1);
+  int const hd_out = mat_b.size(1);
+
+  constexpr int kHdIn = 7168;
+  constexpr int kHdOut = 2112;
+  TORCH_CHECK(num_tokens >= 1 && num_tokens <= 16, "required 1 <= mat_a.shape[0] <= 16")
+  TORCH_CHECK(hd_in == kHdIn, "required mat_a.shape[1] == 7168")
+  TORCH_CHECK(hd_out == kHdOut, "required mat_b.shape[1] == 2112")
+  TORCH_CHECK(output.size(0) == num_tokens, "required output.shape[0] == mat_a.shape[0]")
+  TORCH_CHECK(output.size(1) == hd_out, "required output.shape[1] == mat_b.shape[1]")
+
+  TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor");     // Row-major
+  TORCH_CHECK(output.stride(1) == 1, "output must be a row major tensor");   // Row-major
+  TORCH_CHECK(mat_b.stride(0) == 1, "mat_b must be a column major tensor");  // Column-major
+
+  auto const data_type = mat_a.scalar_type();
+  TORCH_CHECK(
+      mat_a.scalar_type() == torch::kBFloat16 && mat_b.scalar_type() == torch::kBFloat16,
+      "Only BFloat16 input dtype is supported")
+  TORCH_CHECK(output.scalar_type() == torch::kBFloat16, "Only BFloat16 output dtype is supported")
+
+  auto const sm = getSMVersion();
+  TORCH_CHECK(sm >= 90, "required CUDA ARCH >= SM_90");
+
+  auto stream = at::cuda::getCurrentCUDAStream(mat_a.get_device());
+  if (num_tokens <= 8) {
+    invokeFusedAGemm<__nv_bfloat16, kHdIn, kHdOut, 8>(
+        reinterpret_cast<__nv_bfloat16*>(output.mutable_data_ptr()),
+        reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()),
+        reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()),
+        num_tokens,
+        stream);
+  } else {
+    invokeFusedAGemm<__nv_bfloat16, kHdIn, kHdOut, 16>(
+        reinterpret_cast<__nv_bfloat16*>(output.mutable_data_ptr()),
+        reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()),
+        reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()),
+        num_tokens,
+        stream);
+  }
+}
diff --git a/sgl-kernel/csrc/gemm/dsv3_router_gemm_bf16_out.cu b/sgl-kernel/csrc/gemm/dsv3_router_gemm_bf16_out.cu
new file mode 100644
index 000000000..e613bd75c
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/dsv3_router_gemm_bf16_out.cu
@@ -0,0 +1,284 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp
+ *
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cuda_bf16.h"
+#include "cuda_runtime.h"
+#include "utils.h"
+
+// Custom FMA implementation using PTX assembly instructions
+__device__ __forceinline__ void fma(float2& d, float2 const& a, float2 const& b, float2 const& c) {
+  asm volatile("fma.rn.f32x2 %0, %1, %2, %3;\n"
+               : "=l"(reinterpret_cast<uint64_t&>(d))
+               : "l"(reinterpret_cast<uint64_t const&>(a)),
+                 "l"(reinterpret_cast<uint64_t const&>(b)),
+                 "l"(reinterpret_cast<uint64_t const&>(c)));
+}
+
+// Convert 8 bfloat16 values from a uint4 to float array - optimized conversion
+template <int VPT>
+__device__ __forceinline__ void bf16_uint4_to_float8(uint4 const& vec, float* dst) {
+  __nv_bfloat16* bf16_ptr = reinterpret_cast<__nv_bfloat16*>(const_cast<uint4*>(&vec));
+
+#pragma unroll
+  for (int i = 0; i < VPT; i++) {
+    dst[i] = __bfloat162float(bf16_ptr[i]);
+  }
+}
+
+template <typename T, int kBlockSize, int VPT, int kNumTokens, int kNumExperts, int kHiddenDim>
+__global__
+__launch_bounds__(128, 1) void router_gemm_kernel_bf16_output(__nv_bfloat16* out, T const* mat_a, T const* mat_b) {
+  // Each block handles one expert column
+  int const n_idx = blockIdx.x;
+  int const tid = threadIdx.x;
+  constexpr int kWarpSize = 32;
+  constexpr int kNumWarps = kBlockSize / kWarpSize;
+  // Constants for this kernel
+  constexpr int k_elems_per_k_iteration = VPT * kBlockSize;
+  constexpr int k_iterations = kHiddenDim / k_elems_per_k_iteration;  // Total K iterations
+
+  // Initialize accumulators for all M rows
+  float acc[kNumTokens] = {};
+
+  // Shared memory for warp-level reduction
+  __shared__ float sm_reduction[kNumTokens][kNumWarps];  // kNumWarps
+
+  // B matrix is in column-major order, so we can directly load a column for the n_idx expert
+  T const* b_col = mat_b + n_idx * kHiddenDim;
+
+  // Pre-compute k_base values for each iteration to help compiler optimize
+  // int k_bases[k_iterations];
+  int k_bases[k_iterations];
+#pragma unroll
+  for (int ki = 0; ki < k_iterations; ki++) {
+    k_bases[ki] = ki * k_elems_per_k_iteration + tid * VPT;
+  }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+
+  // Process the GEMM in chunks
+  for (int ki = 0; ki < k_iterations; ki++) {
+    int const k_base = k_bases[ki];
+
+    // Load B matrix values using vector load (8 bf16 values)
+    uint4 b_vec = *reinterpret_cast<uint4 const*>(b_col + k_base);
+
+    // Convert B values to float
+    float b_float[VPT];
+    bf16_uint4_to_float8<VPT>(b_vec, b_float);
+
+// Process each token
+#pragma unroll
+    for (int m_idx = 0; m_idx < kNumTokens; m_idx++) {
+      // Load both rows of A matrix using vector loads
+      uint4 a_vec = *reinterpret_cast<uint4 const*>(mat_a + (m_idx * kHiddenDim) + k_base);
+
+      // Convert A values to float
+      float a_float[VPT];
+      bf16_uint4_to_float8<VPT>(a_vec, a_float);
+
+// Process elements in this chunk
+#pragma unroll
+      for (int k = 0; k < VPT; k++) {
+        float a = a_float[k];
+        float b = b_float[k];
+        acc[m_idx] += a * b;
+      }
+    }
+  }
+
+  // Perform warp-level reduction
+  int const warpSize = 32;
+  int const warpId = tid / warpSize;
+  int const laneId = tid % warpSize;
+
+  // Register for warp-level reduction results
+  float warp_result[kNumTokens];
+
+#pragma unroll
+  for (int m_idx = 0; m_idx < kNumTokens; m_idx++) {
+    warp_result[m_idx] = acc[m_idx];
+  }
+
+// Perform warp-level reduction using optimized butterfly pattern
+#pragma unroll
+  for (int m = 0; m < kNumTokens; m++) {
+    float sum = warp_result[m];
+
+    // Butterfly reduction pattern
+    sum += __shfl_xor_sync(0xffffffff, sum, 16);
+    sum += __shfl_xor_sync(0xffffffff, sum, 8);
+    sum += __shfl_xor_sync(0xffffffff, sum, 4);
+    sum += __shfl_xor_sync(0xffffffff, sum, 2);
+    sum += __shfl_xor_sync(0xffffffff, sum, 1);
+
+    // Only the first thread in each warp stores to shared memory
+    if (laneId == 0) {
+      sm_reduction[m][warpId] = sum;
+    }
+  }
+
+  __syncthreads();
+
+  // Final reduction across warps (only first thread)
+  if (tid == 0) {
+#pragma unroll
+    for (int m = 0; m < kNumTokens; m++) {
+      float final_sum = 0.0f;
+
+// Sum across the kNumWarps
+#pragma unroll
+      for (int w = 0; w < kNumWarps; w++) {
+        final_sum += sm_reduction[m][w];
+      }
+
+      // Write final result
+      out[m * kNumExperts + n_idx] = __float2bfloat16(final_sum);
+    }
+  }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename T, int kNumTokens, int kNumExperts, int kHiddenDim>
+void invokeRouterGemmBf16Output(__nv_bfloat16* output, T const* mat_a, T const* mat_b, cudaStream_t stream) {
+  constexpr int VPT = 16 / sizeof(T);
+  constexpr int kBlockSize = 128;
+  cudaLaunchConfig_t config;
+  config.gridDim = kNumExperts;
+  config.blockDim = kBlockSize;
+  config.dynamicSmemBytes = 0;
+  config.stream = stream;
+  cudaLaunchAttribute attrs[1];
+  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attrs[0].val.programmaticStreamSerializationAllowed = getEnvEnablePDL();
+  config.numAttrs = 1;
+  config.attrs = attrs;
+  cudaLaunchKernelEx(
+      &config,
+      router_gemm_kernel_bf16_output<T, kBlockSize, VPT, kNumTokens, kNumExperts, kHiddenDim>,
+      output,
+      mat_a,
+      mat_b);
+}
+
+// Template instantiations for DEFAULT_NUM_EXPERTS experts
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 1, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 2, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 3, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 4, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 5, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 6, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 7, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 8, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 9, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 10, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 11, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 12, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 13, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 14, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 15, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 16, 256, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+// Template instantiations for KIMI_K2_NUM_EXPERTS experts
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 1, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 2, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 3, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 4, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 5, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 6, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 7, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 8, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 9, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 10, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 11, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 12, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 13, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 14, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 15, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmBf16Output<__nv_bfloat16, 16, 384, 7168>(
+    __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
diff --git a/sgl-kernel/csrc/gemm/dsv3_router_gemm_entry.cu b/sgl-kernel/csrc/gemm/dsv3_router_gemm_entry.cu
new file mode 100644
index 000000000..4f09e6cf4
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/dsv3_router_gemm_entry.cu
@@ -0,0 +1,161 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp
+ *
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cuda_bf16.h"
+#include "cuda_runtime.h"
+#include "utils.h"
+
+static constexpr int DEFAULT_NUM_EXPERTS = 256;
+static constexpr int KIMI_K2_NUM_EXPERTS = 384;
+static constexpr int DEFAULT_HIDDEN_DIM = 7168;
+
+template <typename T, int kNumTokens, int kNumExperts, int kHiddenDim>
+void invokeRouterGemmFloatOutput(float* output, T const* mat_a, T const* mat_b, cudaStream_t stream);
+
+template <typename T, int kNumTokens, int kNumExperts, int kHiddenDim>
+void invokeRouterGemmBf16Output(__nv_bfloat16* output, T const* mat_a, T const* mat_b, cudaStream_t stream);
+
+template <int kBegin, int kEnd, int kNumExperts, int kHiddenDim>
+struct LoopUnroller {
+  static void unroll_float_output(
+      int num_tokens, float* output, __nv_bfloat16 const* input, __nv_bfloat16 const* weights, cudaStream_t stream) {
+    if (num_tokens == kBegin) {
+      invokeRouterGemmFloatOutput<__nv_bfloat16, kBegin, kNumExperts, kHiddenDim>(output, input, weights, stream);
+    } else {
+      LoopUnroller<kBegin + 1, kEnd, kNumExperts, kHiddenDim>::unroll_float_output(
+          num_tokens, output, input, weights, stream);
+    }
+  }
+
+  static void unroll_bf16_output(
+      int num_tokens,
+      __nv_bfloat16* output,
+      __nv_bfloat16 const* input,
+      __nv_bfloat16 const* weights,
+      cudaStream_t stream) {
+    if (num_tokens == kBegin) {
+      invokeRouterGemmBf16Output<__nv_bfloat16, kBegin, kNumExperts, kHiddenDim>(output, input, weights, stream);
+    } else {
+      LoopUnroller<kBegin + 1, kEnd, kNumExperts, kHiddenDim>::unroll_bf16_output(
+          num_tokens, output, input, weights, stream);
+    }
+  }
+};
+
+template <int kEnd, int kNumExperts, int kHiddenDim>
+struct LoopUnroller<kEnd, kEnd, kNumExperts, kHiddenDim> {
+  static void unroll_float_output(
+      int num_tokens, float* output, __nv_bfloat16 const* input, __nv_bfloat16 const* weights, cudaStream_t stream) {
+    if (num_tokens == kEnd) {
+      invokeRouterGemmFloatOutput<__nv_bfloat16, kEnd, kNumExperts, kHiddenDim>(output, input, weights, stream);
+    } else {
+      throw std::invalid_argument("Invalid num_tokens, only supports 1 to 16");
+    }
+  }
+
+  static void unroll_bf16_output(
+      int num_tokens,
+      __nv_bfloat16* output,
+      __nv_bfloat16 const* input,
+      __nv_bfloat16 const* weights,
+      cudaStream_t stream) {
+    if (num_tokens == kEnd) {
+      invokeRouterGemmBf16Output<__nv_bfloat16, kEnd, kNumExperts, kHiddenDim>(output, input, weights, stream);
+    } else {
+      throw std::invalid_argument("Invalid num_tokens, only supports 1 to 16");
+    }
+  }
+};
+
+void dsv3_router_gemm(
+    torch::Tensor& output,       // [num_tokens, num_experts]
+    const torch::Tensor& mat_a,  // [num_tokens, hidden_dim]
+    const torch::Tensor& mat_b   // [num_experts, hidden_dim]
+) {
+  TORCH_CHECK(output.dim() == 2 && mat_a.dim() == 2 && mat_b.dim() == 2);
+
+  const int num_tokens = mat_a.size(0);
+  const int num_experts = mat_b.size(0);
+  const int hidden_dim = mat_a.size(1);
+
+  TORCH_CHECK(mat_a.size(1) == mat_b.size(1), "mat_a and mat_b must have the same hidden_dim");
+  TORCH_CHECK(
+      hidden_dim == DEFAULT_HIDDEN_DIM,
+      "Expected hidden_dim=",
+      DEFAULT_HIDDEN_DIM,
+      ", but got hidden_dim=",
+      hidden_dim);
+  TORCH_CHECK(
+      num_experts == DEFAULT_NUM_EXPERTS || num_experts == KIMI_K2_NUM_EXPERTS,
+      "Expected num_experts=",
+      DEFAULT_NUM_EXPERTS,
+      " or num_experts=",
+      KIMI_K2_NUM_EXPERTS,
+      ", but got num_experts=",
+      num_experts);
+  TORCH_CHECK(
+      num_tokens >= 1 && num_tokens <= 16, "currently num_tokens must be less than or equal to 16 for router_gemm");
+  TORCH_CHECK(mat_a.dtype() == torch::kBFloat16, "mat_a must be bf16");
+  TORCH_CHECK(mat_b.dtype() == torch::kBFloat16, "mat_b must be bf16");
+  TORCH_CHECK(
+      output.dtype() == torch::kFloat32 || output.dtype() == torch::kBFloat16, "output must be float32 or bf16");
+
+  auto const sm = getSMVersion();
+  TORCH_CHECK(sm >= 90, "required CUDA ARCH >= SM_90");
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (output.dtype() == torch::kFloat32) {
+    if (num_experts == DEFAULT_NUM_EXPERTS) {
+      LoopUnroller<1, 16, DEFAULT_NUM_EXPERTS, DEFAULT_HIDDEN_DIM>::unroll_float_output(
+          num_tokens,
+          reinterpret_cast<float*>(output.mutable_data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()),
+          stream);
+    } else if (num_experts == KIMI_K2_NUM_EXPERTS) {
+      LoopUnroller<1, 16, KIMI_K2_NUM_EXPERTS, DEFAULT_HIDDEN_DIM>::unroll_float_output(
+          num_tokens,
+          reinterpret_cast<float*>(output.mutable_data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()),
+          stream);
+    }
+  } else if (output.dtype() == torch::kBFloat16) {
+    if (num_experts == DEFAULT_NUM_EXPERTS) {
+      LoopUnroller<1, 16, DEFAULT_NUM_EXPERTS, DEFAULT_HIDDEN_DIM>::unroll_bf16_output(
+          num_tokens,
+          reinterpret_cast<__nv_bfloat16*>(output.mutable_data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()),
+          stream);
+    } else if (num_experts == KIMI_K2_NUM_EXPERTS) {
+      LoopUnroller<1, 16, KIMI_K2_NUM_EXPERTS, DEFAULT_HIDDEN_DIM>::unroll_bf16_output(
+          num_tokens,
+          reinterpret_cast<__nv_bfloat16*>(output.mutable_data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()),
+          reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()),
+          stream);
+    }
+  }
+}
diff --git a/sgl-kernel/csrc/gemm/dsv3_router_gemm_float_out.cu b/sgl-kernel/csrc/gemm/dsv3_router_gemm_float_out.cu
new file mode 100644
index 000000000..88a364e2c
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/dsv3_router_gemm_float_out.cu
@@ -0,0 +1,283 @@
+/*
+ * Adapted from
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu
+ * https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp
+ *
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include "cuda_bf16.h"
+#include "cuda_runtime.h"
+#include "utils.h"
+
+// Custom FMA implementation using PTX assembly instructions
+__device__ __forceinline__ void fma(float2& d, float2 const& a, float2 const& b, float2 const& c) {
+  asm volatile("fma.rn.f32x2 %0, %1, %2, %3;\n"
+               : "=l"(reinterpret_cast<uint64_t&>(d))
+               : "l"(reinterpret_cast<uint64_t const&>(a)),
+                 "l"(reinterpret_cast<uint64_t const&>(b)),
+                 "l"(reinterpret_cast<uint64_t const&>(c)));
+}
+
+// Convert 8 bfloat16 values from a uint4 to float array - optimized conversion
+template <int VPT>
+__device__ __forceinline__ void bf16_uint4_to_float8(uint4 const& vec, float* dst) {
+  __nv_bfloat16* bf16_ptr = reinterpret_cast<__nv_bfloat16*>(const_cast<uint4*>(&vec));
+
+#pragma unroll
+  for (int i = 0; i < VPT; i++) {
+    dst[i] = __bfloat162float(bf16_ptr[i]);
+  }
+}
+
+template <typename T, int kBlockSize, int VPT, int kNumTokens, int kNumExperts, int kHiddenDim>
+__global__ __launch_bounds__(128, 1) void router_gemm_kernel_float_output(float* out, T const* mat_a, T const* mat_b) {
+  // Each block handles one expert column
+  int const n_idx = blockIdx.x;
+  int const tid = threadIdx.x;
+  constexpr int kWarpSize = 32;
+  constexpr int kNumWarps = kBlockSize / kWarpSize;
+  // Constants for this kernel
+  constexpr int k_elems_per_k_iteration = VPT * kBlockSize;
+  constexpr int k_iterations = kHiddenDim / k_elems_per_k_iteration;  // Total K iterations
+
+  // Initialize accumulators for all M rows
+  float acc[kNumTokens] = {};
+
+  // Shared memory for warp-level reduction
+  __shared__ float sm_reduction[kNumTokens][kNumWarps];  // kNumWarps
+
+  // B matrix is in column-major order, so we can directly load a column for the n_idx expert
+  T const* b_col = mat_b + n_idx * kHiddenDim;
+
+  // Pre-compute k_base values for each iteration to help compiler optimize
+  // int k_bases[k_iterations];
+  int k_bases[k_iterations];
+#pragma unroll
+  for (int ki = 0; ki < k_iterations; ki++) {
+    k_bases[ki] = ki * k_elems_per_k_iteration + tid * VPT;
+  }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+
+  // Process the GEMM in chunks
+  for (int ki = 0; ki < k_iterations; ki++) {
+    int const k_base = k_bases[ki];
+
+    // Load B matrix values using vector load (8 bf16 values)
+    uint4 b_vec = *reinterpret_cast<uint4 const*>(b_col + k_base);
+
+    // Convert B values to float
+    float b_float[VPT];
+    bf16_uint4_to_float8<VPT>(b_vec, b_float);
+
+// Process each token
+#pragma unroll
+    for (int m_idx = 0; m_idx < kNumTokens; m_idx++) {
+      // Load both rows of A matrix using vector loads
+      uint4 a_vec = *reinterpret_cast<uint4 const*>(mat_a + (m_idx * kHiddenDim) + k_base);
+
+      // Convert A values to float
+      float a_float[VPT];
+      bf16_uint4_to_float8<VPT>(a_vec, a_float);
+
+// Process elements in this chunk
+#pragma unroll
+      for (int k = 0; k < VPT; k++) {
+        float a = a_float[k];
+        float b = b_float[k];
+        acc[m_idx] += a * b;
+      }
+    }
+  }
+
+  // Perform warp-level reduction
+  int const warpSize = 32;
+  int const warpId = tid / warpSize;
+  int const laneId = tid % warpSize;
+
+  // Register for warp-level reduction results
+  float warp_result[kNumTokens];
+
+#pragma unroll
+  for (int m_idx = 0; m_idx < kNumTokens; m_idx++) {
+    warp_result[m_idx] = acc[m_idx];
+  }
+
+// Perform warp-level reduction using optimized butterfly pattern
+#pragma unroll
+  for (int m = 0; m < kNumTokens; m++) {
+    float sum = warp_result[m];
+
+    // Butterfly reduction pattern
+    sum += __shfl_xor_sync(0xffffffff, sum, 16);
+    sum += __shfl_xor_sync(0xffffffff, sum, 8);
+    sum += __shfl_xor_sync(0xffffffff, sum, 4);
+    sum += __shfl_xor_sync(0xffffffff, sum, 2);
+    sum += __shfl_xor_sync(0xffffffff, sum, 1);
+
+    // Only the first thread in each warp stores to shared memory
+    if (laneId == 0) {
+      sm_reduction[m][warpId] = sum;
+    }
+  }
+
+  __syncthreads();
+
+  // Final reduction across warps (only first thread)
+  if (tid == 0) {
+#pragma unroll
+    for (int m = 0; m < kNumTokens; m++) {
+      float final_sum = 0.0f;
+
+// Sum across the kNumWarps
+#pragma unroll
+      for (int w = 0; w < kNumWarps; w++) {
+        final_sum += sm_reduction[m][w];
+      }
+
+      // Write final result
+      out[m * kNumExperts + n_idx] = final_sum;
+    }
+  }
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename T, int kNumTokens, int kNumExperts, int kHiddenDim>
+void invokeRouterGemmFloatOutput(float* output, T const* mat_a, T const* mat_b, cudaStream_t stream) {
+  constexpr int VPT = 16 / sizeof(T);
+  constexpr int kBlockSize = 128;
+  cudaLaunchConfig_t config;
+  config.gridDim = kNumExperts;
+  config.blockDim = kBlockSize;
+  config.dynamicSmemBytes = 0;
+  config.stream = stream;
+  cudaLaunchAttribute attrs[1];
+  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attrs[0].val.programmaticStreamSerializationAllowed = getEnvEnablePDL();
+  config.numAttrs = 1;
+  config.attrs = attrs;
+  cudaLaunchKernelEx(
+      &config,
+      router_gemm_kernel_float_output<T, kBlockSize, VPT, kNumTokens, kNumExperts, kHiddenDim>,
+      output,
+      mat_a,
+      mat_b);
+}
+
+// Template instantiations for DEFAULT_NUM_EXPERTS experts
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 1, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 2, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 3, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 4, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 5, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 6, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 7, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 8, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 9, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 10, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 11, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 12, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 13, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 14, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 15, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 16, 256, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+// Template instantiations for KIMI_K2_NUM_EXPERTS experts
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 1, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 2, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 3, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 4, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 5, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 6, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 7, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 8, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 9, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 10, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 11, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 12, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 13, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 14, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 15, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
+
+template void invokeRouterGemmFloatOutput<__nv_bfloat16, 16, 384, 7168>(
+    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
diff --git a/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu b/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu
new file mode 100644
index 000000000..b8b23c427
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu
@@ -0,0 +1,465 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <cudaTypedefs.h>
+#include <cutlass/arch/arch.h>
+#include <cutlass/arch/memory.h>
+#include <cutlass/arch/mma.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/epilogue/thread/activation.h>
+#include <cutlass/epilogue/thread/linear_combination.h>
+#include <cutlass/epilogue/threadblock/default_thread_map_tensor_op.h>
+#include <cutlass/gemm/device/gemm.h>
+#include <cutlass/gemm/device/gemm_universal_adapter.h>
+#include <cutlass/gemm/gemm.h>
+#include <cutlass/gemm/kernel/default_gemm_universal_with_visitor.h>
+#include <cutlass/gemm/thread/mma.h>
+#include <cutlass/layout/matrix.h>
+#include <cutlass/matrix_coord.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/tensor_ref.h>
+#include <cutlass/util/host_tensor.h>
+#include <cutlass/util/tensor_view_io.h>
+#include <torch/all.h>
+
+#include <cute/tensor.hpp>
+#include <cutlass/epilogue/collective/collective_builder.hpp>
+#include <cutlass/epilogue/collective/default_epilogue.hpp>
+#include <cutlass/epilogue/threadblock/fusion/visitors.hpp>
+#include <cutlass/gemm/collective/collective_builder.hpp>
+#include <cutlass/gemm/dispatch_policy.hpp>
+#include <cutlass/gemm/kernel/gemm_universal.hpp>
+#include <cutlass/util/packed_stride.hpp>
+
+#include "cutlass_extensions/gemm/cutlass_gemm_caller.cuh"
+#include "cutlass_extensions/gemm/fp8_blockwise_gemm_sm90_dispatch.cuh"
+#include "utils.h"
+
+using namespace cute;
+
+template <
+    typename OutType,
+    typename MmaTileShape,
+    typename PerSmTileShape,
+    typename EpilogueTileShape,
+    typename ScalesPerTile,
+    int TileSizeM_ = 128,
+    class ClusterShape = Shape<_1, _1, _1>>
+void launch_sm100_fp8_blockwise_scaled_mm(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b) {
+  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
+  static constexpr int ScaleGranularityM = size<0>(MmaTileShape{}) / ScaleMsPerTile;
+  static constexpr int ScaleGranularityN = size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
+  static constexpr int ScaleGranularityK = size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
+
+  using ElementAB = cutlass::float_e4m3_t;
+  using ElementA = ElementAB;
+  using ElementB = ElementAB;
+  using ElementC = void;
+  using ElementD = OutType;
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutD = cutlass::layout::RowMajor;
+  using LayoutC = LayoutD;
+  // This means both SFA and SFB are column-major.
+  using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
+      ScaleGranularityM,
+      ScaleGranularityN,
+      ScaleGranularityK,
+      cute::UMMA::Major::MN,
+      cute::UMMA::Major::K>;
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentC = AlignmentD;
+
+  using ElementAccumulator = float;
+  using ElementBlockScale = float;
+  using ElementCompute = float;
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      cutlass::arch::OpClassTensorOp,
+      PerSmTileShape,
+      ClusterShape,
+      EpilogueTileShape,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementD,
+      LayoutD,
+      AlignmentD,
+      cutlass::epilogue::TmaWarpSpecialized1Sm>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      cutlass::gemm::PersistentScheduler>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  Gemm gemm_op;
+
+  int m = a.size(0);
+  int k = a.size(1);
+  int n = b.size(1);
+
+  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
+  auto scales_a_ptr = static_cast<float*>(scales_a.data_ptr());
+  auto scales_b_ptr = static_cast<float*>(scales_b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  using StrideA = typename GemmKernel::StrideA;
+  using StrideB = typename GemmKernel::StrideB;
+  using StrideD = typename GemmKernel::StrideD;
+  using StrideC = typename GemmKernel::StrideD;
+
+  StrideA a_stride = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB b_stride = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC c_stride = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+  LayoutSFA layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+
+  typename GemmKernel::MainloopArguments mainloop_args{
+      a_ptr, a_stride, b_ptr, b_stride, scales_a_ptr, layout_SFA, scales_b_ptr, layout_SFB};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{{}, c_ptr, c_stride, c_ptr, c_stride};
+  epilogue_args.thread.alpha = 1.0f;
+
+  typename GemmKernel::Arguments args = {
+      cutlass::gemm::GemmUniversalMode::kGemm, {m, n, k, 1}, mainloop_args, epilogue_args};
+
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement == cutlass::Status::kSuccess, cutlassGetStatusString(can_implement))
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  auto init_status = gemm_op.initialize(args, workspace.get());
+  TORCH_CHECK(init_status == cutlass::Status::kSuccess, cutlassGetStatusString(init_status));
+
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  auto status = gemm_op.run(stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, cutlassGetStatusString(status))
+}
+
+template <typename OutType>
+void sm100_fp8_blockwise_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b) {
+  if (a.size(0) <= 128) {
+    using MmaTileShape = Shape<_64, _128, _128>;
+    using PerSmTileShape = Shape<_64, _128, _128>;
+    using EpilogueTileShape = Shape<_64, _64>;
+    using ScalesPerTile = Shape<_64, _1, _1>;
+    launch_sm100_fp8_blockwise_scaled_mm<OutType, MmaTileShape, PerSmTileShape, EpilogueTileShape, ScalesPerTile>(
+        out, a, b, scales_a, scales_b);
+  } else {
+    using MmaTileShape = Shape<_128, _128, _128>;
+    using PerSmTileShape = Shape<_128, _128, _128>;
+    using EpilogueTileShape = Shape<_128, _64>;
+    using ScalesPerTile = Shape<_128, _1, _1>;
+    launch_sm100_fp8_blockwise_scaled_mm<OutType, MmaTileShape, PerSmTileShape, EpilogueTileShape, ScalesPerTile>(
+        out, a, b, scales_a, scales_b);
+  }
+}
+
+template <
+    typename OutType,
+    typename MmaTileShape,
+    typename PerSmTileShape,
+    typename EpilogueTileShape,
+    typename ScalesPerTile,
+    int TileSizeM_ = 128,
+    class ClusterShape = Shape<_1, _1, _1>>
+void launch_sm120_fp8_blockwise_scaled_mm(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b) {
+  using ElementBlockScale = float;
+
+  // A matrix configuration
+  using ElementA = cutlass::float_e4m3_t;        // Element type for A matrix operand
+  using LayoutATag = cutlass::layout::RowMajor;  // Layout type for A matrix operand
+  constexpr int AlignmentA =
+      128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A matrix in units of
+                                                    // elements (up to 16 bytes)
+
+  // B matrix configuration
+  using ElementB = cutlass::float_e4m3_t;           // Element type for B matrix operand
+  using LayoutBTag = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
+  constexpr int AlignmentB =
+      128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B matrix in units of
+                                                    // elements (up to 16 bytes)
+
+  // C/D matrix configuration
+  using ElementD = OutType;                      // Element type for D matrix operand
+  using ElementC = void;                         // Element type for C matrix operand
+  using LayoutCTag = cutlass::layout::RowMajor;  // Layout type for C matrix operand
+  using LayoutDTag = cutlass::layout::RowMajor;  // Layout type for D matrix operand
+  constexpr int AlignmentD =
+      128 / cutlass::sizeof_bits<ElementD>::value;  // Memory access granularity/alignment of C matrix in units of
+                                                    // elements (up to 16 bytes)
+  constexpr int AlignmentC =
+      AlignmentD;  // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+
+  // Kernel functional config
+  using ElementAccumulator = float;      // Element type for internal accumulation
+  using ArchTag = cutlass::arch::Sm120;  // Tag indicating the minimum SM that supports the intended feature
+  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag - changed from OpClassBlockScaledTensorOp
+
+  static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
+  static constexpr int ScaleGranularityM = size<0>(MmaTileShape{}) / ScaleMsPerTile;
+  static constexpr int ScaleGranularityN = size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
+  static constexpr int ScaleGranularityK = size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
+
+  using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<
+      ScaleGranularityM,
+      ScaleGranularityN,
+      ScaleGranularityK,
+      cute::UMMA::Major::MN,
+      cute::UMMA::Major::K>;
+  // FP8 Block-wise scaling configuration
+  using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());  // Layout type for SFA matrix operand
+  using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());  // Layout type for SFB matrix operand
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      PerSmTileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementAccumulator,
+      ElementC,
+      LayoutCTag,
+      AlignmentC,
+      ElementD,
+      LayoutDTag,
+      AlignmentD,
+      cutlass::epilogue::collective::EpilogueScheduleAuto  // Epilogue schedule policy
+      >::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutATag, LayoutSFA>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutBTag, LayoutSFB>,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      cutlass::gemm::collective::KernelScheduleAuto  // Kernel schedule policy. Auto defaults to cooperative kernel
+                                                     // schedule
+      >::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,  // Indicates ProblemShape
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      void>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  Gemm gemm_op;
+
+  int m = a.size(0);
+  int k = a.size(1);
+  int n = b.size(1);
+
+  auto a_ptr = static_cast<ElementA*>(a.data_ptr());
+  auto b_ptr = static_cast<ElementB*>(b.data_ptr());
+  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
+
+  auto scales_a_ptr = static_cast<ElementBlockScale*>(scales_a.data_ptr());
+  auto scales_b_ptr = static_cast<ElementBlockScale*>(scales_b.data_ptr());
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using StrideC = typename Gemm::GemmKernel::StrideD;
+
+  StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC stride_c = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+  LayoutSFA layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
+  LayoutSFB layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
+
+  typename GemmKernel::MainloopArguments mainloop_args{
+      a_ptr, stride_a, b_ptr, stride_b, scales_a_ptr, layout_SFA, scales_b_ptr, layout_SFB};
+
+  typename GemmKernel::EpilogueArguments epilogue_args{{}, c_ptr, stride_c, c_ptr, stride_c};
+  epilogue_args.thread.alpha = 1.0f;
+
+  typename Gemm::Arguments args = {
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      mainloop_args,
+      epilogue_args,
+  };
+
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement == cutlass::Status::kSuccess, cutlassGetStatusString(can_implement))
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  auto init_status = gemm_op.initialize(args, workspace.get());
+  TORCH_CHECK(init_status == cutlass::Status::kSuccess, cutlassGetStatusString(init_status));
+
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+  auto status = gemm_op.run(stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, cutlassGetStatusString(status))
+}
+
+template <typename OutType>
+void sm120_fp8_blockwise_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b) {
+  using MmaTileShape = Shape<_128, _128, _128>;
+  using PerSmTileShape = Shape<_128, _128, _128>;
+  using EpilogueTileShape = Shape<_128, _64>;
+  using ScalesPerTile = Shape<_128, _1, _1>;
+  launch_sm120_fp8_blockwise_scaled_mm<OutType, MmaTileShape, PerSmTileShape, EpilogueTileShape, ScalesPerTile>(
+      out, a, b, scales_a, scales_b);
+}
+
+torch::Tensor fp8_blockwise_scaled_mm(
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Dtype& out_dtype) {
+  TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor");
+  TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor");
+  TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor");
+  TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor");
+  TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor");
+  TORCH_CHECK(mat_b.stride(0) == 1, "mat_b must be a column major tensor");
+  TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied");
+
+  TORCH_CHECK(
+      (mat_a.size(1) * mat_a.element_size()) % 16 == 0, "mat_a must be multiple of 16 bytes for memory alignment");
+  TORCH_CHECK(
+      (mat_b.size(0) * mat_b.element_size()) % 16 == 0, "mat_b must be multiple of 16 bytes for memory alignment");
+  TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn");
+  TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn");
+  TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16");
+
+  auto is_contiguous_vector = [](const torch::Tensor& t) {
+    auto t_sizes = t.sizes();
+    return t.is_contiguous() &&
+           (t.dim() == 1 || (t.dim() == 2 && *std::min_element(t_sizes.begin(), t_sizes.end()) == 1));
+  };
+
+  TORCH_CHECK(mat_a.size(0) == scales_a.size(0), "size of scales_a is not matched");
+  TORCH_CHECK(mat_a.size(1) / 128 == scales_a.size(1), "size of scales_a is not matched");
+  TORCH_CHECK(scales_a.stride(0) == 1 || is_contiguous_vector(scales_a), "scales_a must be M major");
+  TORCH_CHECK(mat_b.size(0) / 128 == scales_b.size(0), "size of scales_b is not matched");
+  TORCH_CHECK(mat_b.size(1) / 128 == scales_b.size(1), "size of scales_b is not matched");
+  TORCH_CHECK(scales_b.stride(0) == 1 || is_contiguous_vector(scales_b), "scales_b must be K major");
+  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be Float32");
+  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32");
+
+  torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype));
+  TORCH_CHECK((out.size(1) * out.element_size()) % 16 == 0, "out must be multiple of 16 bytes for memory alignment");
+
+  auto sm_version = getSMVersion();
+
+  int64_t original_rows = mat_a.size(0);
+  torch::Tensor mat_a_padded = pad_tensor(mat_a, /*alignment=*/4);
+  torch::Tensor scales_a_padded = pad_tensor(scales_a, /*alignment=*/4, /*col_major=*/true);
+  torch::Tensor out_padded = torch::empty({mat_a_padded.size(0), mat_b.size(1)}, out.options());
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+  if (sm_version == 90) {
+    torch::Tensor scales_b_contiguous = scales_b.contiguous();
+    if (out_dtype == torch::kBFloat16) {
+      cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::bfloat16_t>(
+          out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b_contiguous);
+    } else {
+      cutlass_gemm_blockwise_sm90_fp8_dispatch<cutlass::half_t>(
+          out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b_contiguous);
+    }
+    return out_padded.slice(0, 0, original_rows);
+  }
+#endif
+#endif
+
+#if defined(CUTLASS_ARCH_MMA_SM100A_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+#if defined CUDA_VERSION && CUDA_VERSION >= 12080
+  if (sm_version == 100
+#if CUDA_VERSION >= 12090
+      || sm_version == 103
+#endif
+  ) {
+    if (out_dtype == torch::kBFloat16) {
+      sm100_fp8_blockwise_dispatch_shape<cutlass::bfloat16_t>(
+          out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b);
+    } else {
+      sm100_fp8_blockwise_dispatch_shape<cutlass::half_t>(out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b);
+    }
+    return out_padded.slice(0, 0, original_rows);
+  }
+#endif
+#endif
+
+#if defined(CUTLASS_ARCH_MMA_SM120A_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+  if (sm_version == 120) {
+    if (out_dtype == torch::kBFloat16) {
+      sm120_fp8_blockwise_dispatch_shape<cutlass::bfloat16_t>(
+          out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b);
+    } else {
+      sm120_fp8_blockwise_dispatch_shape<cutlass::half_t>(out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b);
+    }
+    return out_padded.slice(0, 0, original_rows);
+  }
+#endif
+#endif
+
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "No implemented fp8_blockwise_scaled_mm for current compute capability: ", sm_version);
+}
diff --git a/sgl-kernel/csrc/gemm/fp8_gemm_kernel.cu b/sgl-kernel/csrc/gemm/fp8_gemm_kernel.cu
new file mode 100644
index 000000000..0a9e6b7a5
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/fp8_gemm_kernel.cu
@@ -0,0 +1,1252 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h
+// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h
+// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.16.0/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h
+
+#include <ATen/cuda/CUDAContext.h>
+#include <cudaTypedefs.h>
+#include <cutlass/arch/arch.h>
+#include <cutlass/arch/memory.h>
+#include <cutlass/arch/mma.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/epilogue/thread/activation.h>
+#include <cutlass/epilogue/thread/linear_combination.h>
+#include <cutlass/epilogue/threadblock/default_thread_map_tensor_op.h>
+#include <cutlass/gemm/device/gemm.h>
+#include <cutlass/gemm/device/gemm_universal_adapter.h>
+#include <cutlass/gemm/gemm.h>
+#include <cutlass/gemm/kernel/default_gemm_universal_with_visitor.h>
+#include <cutlass/gemm/thread/mma.h>
+#include <cutlass/layout/matrix.h>
+#include <cutlass/matrix_coord.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/tensor_ref.h>
+#include <torch/all.h>
+
+#include <cute/tensor.hpp>
+#include <cutlass/epilogue/collective/collective_builder.hpp>
+#include <cutlass/epilogue/collective/default_epilogue.hpp>
+#include <cutlass/epilogue/threadblock/fusion/visitors.hpp>
+#include <cutlass/gemm/collective/collective_builder.hpp>
+#include <cutlass/gemm/dispatch_policy.hpp>
+#include <cutlass/gemm/kernel/gemm_universal.hpp>
+#include <cutlass/util/packed_stride.hpp>
+
+#include "math.hpp"
+#include "utils.h"
+
+using namespace cute;
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12040
+template <
+    typename ElementType,
+    typename OutElementType,
+    typename AccumElementType,
+    typename CtaShape,
+    typename WarpShape,
+    int Stages,
+    bool WithBias,
+    typename FP8MathOperator = cutlass::arch::OpMultiplyAdd,
+    template <typename...> typename EpilogueVisitor = cutlass::epilogue::threadblock::Sm80EVT,
+    typename ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>>
+struct DeviceGemmFp8RowwiseSm89 {
+  static_assert(std::is_same_v<ElementType, cutlass::float_e4m3_t>, "ElementType must be FP8(e4m3)");
+
+  using ElementA = ElementType;
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+
+  using ElementB = ElementType;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+
+  using ElementC = OutElementType;
+  using LayoutC = cutlass::layout::RowMajor;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+
+  using ElementOutput = OutElementType;
+  using LayoutOutput = cutlass::layout::RowMajor;
+  static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+
+  using ElementAccumulator = AccumElementType;
+  using ElementComputeEpilogue = float;
+  using ArchTag = cutlass::arch::Sm89;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+  // Number of epilogue stages in EVT
+  static constexpr int EVTEpilogueStages = 1;
+
+  using OutputTileThreadMap = cutlass::epilogue::threadblock::
+      OutputTileThreadLayout<CtaShape, WarpShape, ElementC, AlignmentC, EVTEpilogueStages>;
+
+  // Definition of EVT
+  using accSrc = cutlass::epilogue::threadblock::VisitorAccFetch;
+
+  using ComputeBScale = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiplies,
+      ElementComputeEpilogue,
+      ElementComputeEpilogue,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+  using bScaleSrc = cutlass::epilogue::threadblock::
+      VisitorRowBroadcast<OutputTileThreadMap, ElementComputeEpilogue, Stride<_0, _1, _0>>;
+  using EpilogueBScale = cutlass::epilogue::threadblock::Sm80EVT<ComputeBScale, accSrc, bScaleSrc>;
+
+  using ComputeAScale = cutlass::epilogue::threadblock::
+      VisitorCompute<cutlass::multiplies, ElementC, ElementComputeEpilogue, cutlass::FloatRoundStyle::round_to_nearest>;
+  using aScaleSrc = cutlass::epilogue::threadblock::
+      VisitorColBroadcast<OutputTileThreadMap, ElementComputeEpilogue, Stride<_1, _0, _0>>;
+  using EpilogueAScale = cutlass::epilogue::threadblock::Sm80EVT<ComputeAScale, EpilogueBScale, aScaleSrc>;
+
+  // With bias
+  using biasSrc =
+      cutlass::epilogue::threadblock::VisitorRowBroadcast<OutputTileThreadMap, ElementOutput, Stride<_0, _1, _0>>;
+  using ComputeAScaleWithBias = cutlass::epilogue::threadblock::VisitorCompute<
+      cutlass::multiply_add,
+      ElementC,
+      ElementComputeEpilogue,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+  using EpilogueAScaleWithBias =
+      cutlass::epilogue::threadblock::Sm80EVT<ComputeAScaleWithBias, EpilogueBScale, aScaleSrc, biasSrc>;
+
+  using dTar = cutlass::epilogue::threadblock::VisitorAuxStore<
+      OutputTileThreadMap,
+      ElementC,
+      cutlass::FloatRoundStyle::round_to_nearest,
+      Stride<int64_t, _1, _0>>;
+  using EpilogueStore = typename cutlass::platform::conditional<
+      WithBias,
+      cutlass::epilogue::threadblock::Sm80EVT<dTar, EpilogueAScaleWithBias>,
+      cutlass::epilogue::threadblock::Sm80EVT<dTar, EpilogueAScale>>::type;
+
+  using EpilogueOp = EpilogueStore;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
+      ElementA,
+      LayoutA,
+      cutlass::ComplexTransform::kNone,
+      AlignmentA,
+      ElementB,
+      LayoutB,
+      cutlass::ComplexTransform::kNone,
+      AlignmentB,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementAccumulator,
+      ElementComputeEpilogue,
+      OperatorClass,
+      ArchTag,
+      CtaShape,
+      WarpShape,
+      InstructionShape,
+      EpilogueOp,
+      ThreadblockSwizzle,
+      Stages,
+      FP8MathOperator,
+      EVTEpilogueStages>::GemmKernel;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+};
+
+template <typename Gemm, bool WithBias>
+typename Gemm::Arguments prepare_sm89_fp8_args(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  using ElementT = typename Gemm::ElementA;
+  using ElementOutput = typename Gemm::ElementD;
+  using ElementComputeEpilogue = float;
+
+  int32_t m = a.size(0);
+  int32_t n = b.size(1);
+  int32_t k = a.size(1);
+
+  int64_t lda = a.stride(0);
+  int64_t ldb = b.stride(1);
+  int64_t ldc = out.stride(0);
+
+  ElementT const* ptr_a = reinterpret_cast<ElementT const*>(a.data_ptr());
+  ElementT const* ptr_b = reinterpret_cast<ElementT const*>(b.data_ptr());
+  ElementOutput const* ptr_bias = nullptr;
+  if constexpr (WithBias) {
+    TORCH_CHECK(bias.has_value())
+    ptr_bias = reinterpret_cast<ElementOutput const*>(bias.value().data_ptr());
+  }
+  ElementOutput* ptr_d = reinterpret_cast<ElementOutput*>(out.data_ptr());
+  ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast<ElementComputeEpilogue const*>(scales_a.data_ptr());
+  ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast<ElementComputeEpilogue const*>(scales_b.data_ptr());
+
+  typename Gemm::Arguments args(
+      cutlass::gemm::GemmUniversalMode::kGemm,  // Mode
+      {m, n, k},                                // Problem size
+      1,                                        // Split-k factor
+      {},                                       // Epilogue args
+      ptr_a,                                    // a pointer
+      ptr_b,                                    // b pointer
+      nullptr,                                  // c pointer (unused)
+      nullptr,                                  // d pointer (unused)
+      m * k,                                    // batch stride a (unused)
+      n * k,                                    // batch stride b (unused)
+      m * n,                                    // batch stride c (unused)
+      m * n,                                    // batch stride d (unused)
+      lda,                                      // stride a
+      ldb,                                      // stride b
+      ldc,                                      // stride c (unused)
+      ldc);                                     // stride d (unused)
+  if constexpr (WithBias) {
+    args.epilogue = {
+        {
+            {
+                {},  // Accumulator
+                {ptr_scales_b, ElementComputeEpilogue(0), {_0{}, _1{}, _0{}}},
+                {}  // Multiplies
+            },
+            {ptr_scales_a, ElementComputeEpilogue(0), {_1{}, _0{}, _0{}}},
+            {ptr_bias, ElementOutput(0), {_0{}, _1{}, _0{}}},
+            {}  // Multiplies
+        },
+        {ptr_d, {n, _1{}, _0{}}}};
+  } else {
+    args.epilogue = {
+        {
+            {
+                {},  // Accumulator
+                {ptr_scales_b, ElementComputeEpilogue(0), {_0{}, _1{}, _0{}}},
+                {}  // Multiplies
+            },
+            {ptr_scales_a, ElementComputeEpilogue(0), {_1{}, _0{}, _0{}}},
+            {}  // Multiplies
+        },
+        {ptr_d, {n, _1{}, _0{}}}};
+  }
+
+  return args;
+}
+
+template <typename Gemm, bool WithBias>
+void launch_sm89_fp8_scaled_mm(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  auto args = prepare_sm89_fp8_args<Gemm, WithBias>(out, a, b, scales_a, scales_b, bias);
+  Gemm gemm_op;
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement == cutlass::Status::kSuccess)
+
+  auto status = gemm_op(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess)
+}
+
+template <typename OutType, typename CtaShape, typename WarpShape, int Stages>
+void sm89_fp8_dispatch_bias(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  using ElementInput = cutlass::float_e4m3_t;
+  using ElementOutput = OutType;
+  using AccumElementType = float;
+  if (bias) {
+    using Gemm = typename DeviceGemmFp8RowwiseSm89<
+        ElementInput,
+        ElementOutput,
+        AccumElementType,
+        CtaShape,
+        WarpShape,
+        Stages,
+        true>::Gemm;
+    return launch_sm89_fp8_scaled_mm<Gemm, true>(out, a, b, scales_a, scales_b, bias);
+  } else {
+    using Gemm = typename DeviceGemmFp8RowwiseSm89<
+        ElementInput,
+        ElementOutput,
+        AccumElementType,
+        CtaShape,
+        WarpShape,
+        Stages,
+        false>::Gemm;
+    return launch_sm89_fp8_scaled_mm<Gemm, false>(out, a, b, scales_a, scales_b, bias);
+  }
+}
+
+template <typename OutType>
+void sm89_fp8_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  uint32_t const m = a.size(0);
+  uint32_t const n = out.size(1);
+
+  if (m == 1) {
+    if (n <= 8192) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<16, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          7>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<32, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          5>(out, a, b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 16) {
+    // M in (1, 16]
+    if (n <= 8192) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<16, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          4>(out, a, b, scales_a, scales_b, bias);
+    } else if (n <= 16384) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<32, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          5>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<16, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          7>(out, a, b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 64) {
+    // M in (16, 64]
+    if (n <= 16384) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<32, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          7>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<16, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          7>(out, a, b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 128) {
+    // M in (64, 128]
+    if (n <= 8192) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<64, 64, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          4>(out, a, b, scales_a, scales_b, bias);
+    } else if (n <= 16384) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<64, 64, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          5>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<32, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          5>(out, a, b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 256) {
+    // M in (128, 256]
+    if (n <= 8192) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<128, 64, 64>,
+          cutlass::gemm::GemmShape<64, 32, 64>,
+          5>(out, a, b, scales_a, scales_b, bias);
+    } else if (n <= 16384) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<64, 128, 64>,
+          cutlass::gemm::GemmShape<64, 32, 64>,
+          7>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<128, 64, 128>,
+          cutlass::gemm::GemmShape<64, 32, 128>,
+          4>(out, a, b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 512) {
+    // M in (256, 512)
+    if (n <= 16384) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<128, 128, 64>,
+          cutlass::gemm::GemmShape<64, 32, 64>,
+          2>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<128, 128, 64>,
+          cutlass::gemm::GemmShape<64, 32, 64>,
+          4>(out, a, b, scales_a, scales_b, bias);
+    }
+  } else {
+    // M in (512, inf)
+    if (n <= 8192) {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<128, 128, 64>,
+          cutlass::gemm::GemmShape<64, 32, 64>,
+          3>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return sm89_fp8_dispatch_bias<
+          OutType,
+          cutlass::gemm::GemmShape<128, 128, 64>,
+          cutlass::gemm::GemmShape<64, 32, 64>,
+          2>(out, a, b, scales_a, scales_b, bias);
+    }
+  }
+}
+#endif
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+template <
+    typename ElementType,
+    typename OutElementType,
+    typename AccumElementType,
+    typename CTAShape,
+    typename ClusterShape,
+    typename MainloopScheduleType,
+    typename EpilogueScheduleType,
+    typename TileSchedulerType = void,
+    bool WithBias = false>
+struct DeviceGemmFp8RowwiseSm90 {
+  static_assert(std::is_same_v<ElementType, cutlass::float_e4m3_t>, "ElementType must be FP8(e4m3)");
+
+  // A matrix configuration
+  using ElementA = ElementType;               // Element type for A matrix operand
+  using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
+  static constexpr int AlignmentA =
+      128 / cutlass::sizeof_bits<ElementA>::value;  // Memory access granularity/alignment of A
+                                                    // matrix in units of elements (up to 16 bytes)
+
+  // B matrix configuration
+  using ElementB = ElementType;                  // Element type for B matrix operand
+  using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
+  static constexpr int AlignmentB =
+      128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B
+                                                    // matrix in units of elements (up to 16 bytes)
+
+  // C/D matrix configuration
+  using ElementC = void;                      // Element type for C matrix operands
+  using LayoutC = cutlass::layout::RowMajor;  // Layout type for C matrix operands
+  static constexpr int AlignmentC =
+      128 / cutlass::sizeof_bits<OutElementType>::value;  // Memory access granularity/alignment of C matrices in
+                                                          // units of elements (up to 16 bytes)
+
+  // Output matrix configuration
+  using ElementOutput = OutElementType;            // Element type for output matrix operands
+  using LayoutOutput = cutlass::layout::RowMajor;  // Layout type for output matrix operands
+  static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+
+  // // Auxiliary matrix configuration and other fusion types
+  // using ElementBias = float;
+
+  // Multiply-accumulate blocking/pipelining details
+  using ElementAccumulator = AccumElementType;  // Element type for internal accumulation
+  using ElementCompute = float;                 // Element type for compute
+  using ElementComputeEpilogue = float;
+  using ArchTag = cutlass::arch::Sm90;  // Tag indicating the minimum SM that supports the intended feature
+  using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
+  using TileShape = CTAShape;                            // Threadblock-level tile size
+
+  static constexpr bool PONG = false;
+  static constexpr bool FAST_ACCUM = true;
+  static constexpr bool USE_BIAS = false;
+
+  using StageCountType = cutlass::gemm::collective::StageCountAuto;      // Stage count maximized
+                                                                         // based on the tile size
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;  // Kernel to launch based on the default
+                                                                         // setting in the Collective Builder
+  // Implement rowwise scaling epilogue.
+  using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<
+      0,
+      TileShape,
+      ElementComputeEpilogue,
+      ElementComputeEpilogue,
+      cute::Stride<cute::Int<1>, cute::Int<0>, cute::Int<0>>>;
+
+  using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      0,
+      TileShape,
+      ElementComputeEpilogue,
+      ElementComputeEpilogue,
+      cute::Stride<cute::Int<0>, cute::Int<1>, cute::Int<0>>>;
+
+  using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      0,
+      TileShape,
+      ElementOutput,
+      ElementOutput,
+      cute::Stride<cute::Int<0>, cute::Int<1>, cute::Int<0>>>;
+
+  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
+
+  using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies,
+      ElementComputeEpilogue,  // First stage output type.
+      ElementComputeEpilogue,  // First stage input types.
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT<Compute0, WScale, Accum>;
+
+  using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiplies,
+      ElementOutput,
+      ElementComputeEpilogue,  // Second stage input types.
+      cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT<Compute1, XScale, EVTCompute0>;
+
+  // With bias
+  using ComputeWithBias = cutlass::epilogue::fusion::Sm90Compute<
+      cutlass::multiply_add,
+      ElementOutput,
+      ElementComputeEpilogue,
+      cutlass::FloatRoundStyle::round_to_nearest>;
+  using EVTComputeWithBias = cutlass::epilogue::fusion::Sm90EVT<ComputeWithBias, XScale, EVTCompute0, Bias>;
+
+  using EpilogueEVT = typename cutlass::platform::conditional<WithBias, EVTComputeWithBias, EVTCompute1>::type;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      cutlass::arch::Sm90,
+      cutlass::arch::OpClassTensorOp,
+      TileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementComputeEpilogue,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      ElementOutput,
+      LayoutOutput,
+      AlignmentOutput,
+      cutlass::epilogue::TmaWarpSpecialized,
+      EpilogueEVT>::CollectiveOp;
+
+  using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized;
+  using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
+  using FastDefaultSchedule = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using FastPongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+
+  using SlowAccum = DefaultSchedule;
+  using FastAccum = FastPongSchedule;  // Default apply Pingpong
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      LayoutA,
+      AlignmentA,
+      ElementB,
+      LayoutB,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduleType>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,  // Indicates ProblemShape
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      TileSchedulerType>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+};
+
+template <typename Gemm, bool WithBias>
+typename Gemm::Arguments prepare_sm90_fp8_args(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  using ElementT = typename Gemm::ElementA;
+  using ElementOutput = typename Gemm::ElementD;
+  using ElementComputeEpilogue = float;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  int32_t m = a.size(0);
+  int32_t n = b.size(1);
+  int32_t k = a.size(1);
+  ElementT const* ptr_a = reinterpret_cast<ElementT const*>(a.data_ptr());
+  ElementT const* ptr_b = reinterpret_cast<ElementT const*>(b.data_ptr());
+  ElementOutput const* ptr_bias = nullptr;
+  if constexpr (WithBias) {
+    TORCH_CHECK(bias.has_value())
+    ptr_bias = reinterpret_cast<ElementOutput const*>(bias.value().data_ptr());
+  }
+  ElementOutput* ptr_d = reinterpret_cast<ElementOutput*>(out.data_ptr());
+  ElementComputeEpilogue const* ptr_scales_a = reinterpret_cast<ElementComputeEpilogue const*>(scales_a.data_ptr());
+  ElementComputeEpilogue const* ptr_scales_b = reinterpret_cast<ElementComputeEpilogue const*>(scales_b.data_ptr());
+
+  StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, make_shape(m, k, 1));
+  StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, make_shape(n, k, 1));
+  StrideC stride_c;
+  StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, make_shape(m, n, 1));
+  typename Gemm::Arguments args = {
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      {ptr_a, stride_a, ptr_b, stride_b},
+      {{},  // epilogue.thread
+       nullptr,
+       stride_c,
+       ptr_d,
+       stride_d}};
+  if constexpr (WithBias) {
+    args.epilogue.thread = {
+        {ptr_scales_a},
+        {
+            {ptr_scales_b},
+            {},  // Accumulator
+            {}   // Multiplies
+        },
+        {ptr_bias},
+        {},  // Multiplies
+    };
+  } else {
+    args.epilogue.thread = {
+        {ptr_scales_a},
+        {
+            {ptr_scales_b},
+            {},  // Accumulator
+            {}   // Multiplies
+        },
+        {},  // Multiplies
+    };
+  }
+
+  return args;
+}
+
+template <typename Gemm, bool WithBias>
+void launch_sm90_fp8_scaled_mm(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  auto args = prepare_sm90_fp8_args<Gemm, WithBias>(out, a, b, scales_a, scales_b, bias);
+  Gemm gemm_op;
+
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement == cutlass::Status::kSuccess)
+
+  auto status = gemm_op.run(args, workspace.data_ptr(), stream);
+
+  TORCH_CHECK(status == cutlass::Status::kSuccess)
+}
+
+template <
+    typename OutType,
+    typename CTAShape,
+    typename ClusterShape,
+    typename MainloopScheduleType,
+    typename TileSchedulerType>
+void sm90_fp8_dispatch_bias(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias,
+    bool fast_accum = true,
+    bool use_persistent = false) {
+  using ElementInput = cutlass::float_e4m3_t;
+  using ElementOutput = OutType;
+  using AccumElementType = float;
+  using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized;
+
+  if (bias) {
+    using Gemm = typename DeviceGemmFp8RowwiseSm90<
+        ElementInput,
+        ElementOutput,
+        AccumElementType,
+        CTAShape,
+        ClusterShape,
+        MainloopScheduleType,
+        EpilogueScheduleType,
+        TileSchedulerType,
+        true>::Gemm;
+    return launch_sm90_fp8_scaled_mm<Gemm, true>(out, a, b, scales_a, scales_b, bias);
+  } else {
+    using Gemm = typename DeviceGemmFp8RowwiseSm90<
+        ElementInput,
+        ElementOutput,
+        AccumElementType,
+        CTAShape,
+        ClusterShape,
+        MainloopScheduleType,
+        EpilogueScheduleType,
+        TileSchedulerType,
+        false>::Gemm;
+    return launch_sm90_fp8_scaled_mm<Gemm, false>(out, a, b, scales_a, scales_b, bias);
+  }
+}
+
+template <typename OutType>
+void sm90_fp8_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  uint32_t const m = a.size(0);
+  using FastPingpongScheduler = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using FastBasicScheduler = cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
+  using PersistentTileScheduler = cutlass::gemm::PersistentScheduler;
+  using BasicTileScheduler = void;
+  if (m <= 1) {
+    return sm90_fp8_dispatch_bias<
+        OutType,
+        Shape<_64, _64, _128>,
+        Shape<_1, _8, _1>,
+        FastBasicScheduler,
+        BasicTileScheduler>(out, a, b, scales_a, scales_b, bias);
+  }
+  if (m <= 64) {
+    // m in [1, 64]
+    return sm90_fp8_dispatch_bias<
+        OutType,
+        Shape<_64, _64, _128>,
+        Shape<_1, _4, _1>,
+        FastPingpongScheduler,
+        PersistentTileScheduler>(out, a, b, scales_a, scales_b, bias);
+  } else if (m <= 256) {
+    // m in (64, 256]
+    return sm90_fp8_dispatch_bias<
+        OutType,
+        Shape<_64, _64, _128>,
+        Shape<_1, _1, _1>,
+        FastPingpongScheduler,
+        PersistentTileScheduler>(out, a, b, scales_a, scales_b, bias);
+  } else if (m <= 1024) {
+    // m in (256, 1024]
+    return sm90_fp8_dispatch_bias<
+        OutType,
+        Shape<_128, _128, _128>,
+        Shape<_1, _1, _1>,
+        FastPingpongScheduler,
+        PersistentTileScheduler>(out, a, b, scales_a, scales_b, bias);
+  } else {
+    // m in (1024, inf)
+    return sm90_fp8_dispatch_bias<
+        OutType,
+        Shape<_128, _128, _128>,
+        Shape<_2, _1, _1>,
+        FastPingpongScheduler,
+        PersistentTileScheduler>(out, a, b, scales_a, scales_b, bias);
+  }
+}
+#endif
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12080
+template <
+    typename ElementType,
+    typename OutElementType,
+    typename AccumElementType,
+    typename CTAShape,
+    typename ClusterShape,
+    typename MainloopScheduleType,
+    typename EpilogueScheduleType,
+    typename TileSchedulerType = void,
+    bool WithBias = false>
+struct DeviceGemmFp8RowwiseSm100 {
+  static_assert(std::is_same_v<ElementType, cutlass::float_e4m3_t>, "ElementType must be FP8(e4m3)");
+  using TileShape = CTAShape;
+  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
+
+  using ElementComputeEpilogue = float;
+  using ScaleA = cutlass::epilogue::fusion::Sm90ColBroadcast<
+      0,
+      TileShape,
+      ElementComputeEpilogue,
+      ElementComputeEpilogue,
+      cute::Stride<cute::Int<1>, cute::Int<0>, cute::Int<0>>>;
+
+  using ScaleB = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      0,
+      TileShape,
+      ElementComputeEpilogue,
+      ElementComputeEpilogue,
+      cute::Stride<cute::Int<0>, cute::Int<1>, cute::Int<0>>>;
+
+  using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<
+      0,
+      TileShape,
+      OutElementType,
+      OutElementType,
+      cute::Stride<cute::Int<0>, cute::Int<1>, cute::Int<0>>>;
+
+  using Compute0 = cutlass::epilogue::fusion::
+      Sm90Compute<cutlass::multiplies, float, float, cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
+
+  using LayoutA = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementType>::value;
+
+  using LayoutB = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementType>::value;
+
+  using ElementC = void;
+  using LayoutC = cutlass::layout::RowMajor;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<OutElementType>::value;
+
+  using LayoutD = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = AlignmentC;
+
+  using Compute1MulAdd = cutlass::epilogue::fusion::
+      Sm90Compute<cutlass::multiply_add, OutElementType, float, cutlass::FloatRoundStyle::round_to_nearest>;
+  using Compute1Mul = cutlass::epilogue::fusion::
+      Sm90Compute<cutlass::multiplies, OutElementType, float, cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute = typename std::conditional_t<
+      WithBias,
+      cutlass::epilogue::fusion::Sm90EVT<Compute1MulAdd, ScaleA, EVTCompute0, Bias>,
+      cutlass::epilogue::fusion::Sm90EVT<Compute1Mul, ScaleA, EVTCompute0>>;
+  using ArgumentType = typename EVTCompute::Arguments;
+  // MMA type
+  using ElementAccumulator = AccumElementType;
+
+  // Epilogue types
+  using ElementCompute = float;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      cutlass::arch::Sm100,
+      cutlass::arch::OpClassTensorOp,
+      TileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementC,
+      LayoutC,
+      AlignmentC,
+      OutElementType,
+      LayoutD,
+      AlignmentD,
+      EpilogueScheduleType,
+      EVTCompute>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      cutlass::arch::Sm100,
+      cutlass::arch::OpClassTensorOp,
+      ElementType,
+      LayoutA,
+      AlignmentA,
+      ElementType,
+      LayoutB,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopScheduleType>::CollectiveOp;
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  template <typename Descriptor, typename T>
+  static auto args_from_tensor(torch::Tensor const& tensor) {
+    using Arguments = typename Descriptor::Arguments;
+    auto* data_ptr = static_cast<T*>(tensor.data_ptr());
+    static_assert(
+        std::is_same_v<Descriptor, ScaleA> || std::is_same_v<Descriptor, ScaleB> || std::is_same_v<Descriptor, Bias>);
+    return Arguments{data_ptr};
+  }
+
+ public:
+  static ArgumentType prepare_args(
+      torch::Tensor const& a_scales,
+      torch::Tensor const& b_scales,
+      std::optional<torch::Tensor> const& bias = std::nullopt) {
+    auto a_args = args_from_tensor<ScaleA, float>(a_scales);
+    auto b_args = args_from_tensor<ScaleB, float>(b_scales);
+
+    typename EVTCompute0::Arguments evt0_args{b_args, {}, {}};
+
+    if constexpr (WithBias) {
+      auto bias_args = args_from_tensor<Bias, OutElementType>(bias.value());
+      return ArgumentType{a_args, evt0_args, bias_args, {}};
+    } else {
+      return ArgumentType{a_args, evt0_args, {}};
+    }
+  }
+};
+
+template <typename GemmType, bool WithBias>
+typename GemmType::Gemm::Arguments prepare_sm100_fp8_args(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  using Gemm = typename GemmType::Gemm;
+  using ElementT = typename Gemm::ElementA;
+  using ElementC = typename Gemm::ElementC;
+  using ElementOutput = typename Gemm::ElementD;
+  using ElementComputeEpilogue = float;
+  using GemmKernel = typename Gemm::GemmKernel;
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = StrideC;
+  using StrideAux = StrideC;
+
+  int32_t m = a.size(0);
+  int32_t n = b.size(1);
+  int32_t k = a.size(1);
+
+  ElementT const* ptr_a = reinterpret_cast<ElementT const*>(a.data_ptr());
+  ElementT const* ptr_b = reinterpret_cast<ElementT const*>(b.data_ptr());
+
+  StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+  StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+  StrideC stride_c = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
+  StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(m, n, 1));
+  StrideAux aux_stride = stride_d;
+
+  typename GemmKernel::MainloopArguments mainloop_args{ptr_a, stride_a, ptr_b, stride_b};
+
+  typename GemmKernel::ProblemShape prob_shape = {m, n, k, 1};
+  cutlass::KernelHardwareInfo hw_info;
+  typename GemmKernel::TileSchedulerArguments scheduler = {};
+
+  auto ptr_c = static_cast<ElementOutput*>(out.data_ptr());
+
+  auto prepare_epilogue_args = [&](const c10::optional<torch::Tensor>& bias = c10::nullopt) {
+    if constexpr (WithBias) {
+      TORCH_CHECK(bias.has_value(), "Bias tensor is required but not provided.");
+      return typename GemmKernel::EpilogueArguments{
+          GemmType::prepare_args(scales_a, scales_b, bias.value()), ptr_c, stride_c, ptr_c, stride_d};
+    } else {
+      return typename GemmKernel::EpilogueArguments{
+          GemmType::prepare_args(scales_a, scales_b), ptr_c, stride_c, ptr_c, stride_d};
+    }
+  };
+
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      prob_shape,
+      mainloop_args,
+      prepare_epilogue_args(bias),
+      hw_info,
+      scheduler};
+  return args;
+}
+
+template <typename Gemm, bool WithBias>
+void launch_sm100_fp8_scaled_mm(
+    torch::Tensor& out,
+    torch::Tensor const& a,
+    torch::Tensor const& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  auto args = prepare_sm100_fp8_args<Gemm, WithBias>(out, a, b, scales_a, scales_b, bias);
+
+  typename Gemm::Gemm gemm_op;
+  size_t workspace_size = gemm_op.get_workspace_size(args);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement == cutlass::Status::kSuccess)
+  auto status = gemm_op.run(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess)
+}
+
+template <typename OutType>
+void sm100_fp8_dispatch_bias(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  using CTAShapeDefault = Shape<_256, _128, _64>;
+  using ClusterShapeDefault = Shape<_2, _2, _1>;
+
+  using CTAShape256 = Shape<_128, _128, _128>;
+  using ClusterShape256 = Shape<_2, _1, _1>;
+
+  using CTAShape64 = Shape<_64, _64, _128>;
+  using ClusterShape64 = Shape<_1, _1, _1>;
+
+  using CTAShape16 = Shape<_64, _64, _128>;
+  using ClusterShape16 = Shape<_1, _4, _1>;
+
+  using MainloopScheduleType = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueScheduleType = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileSchedulerType = void;
+
+  using ElementInput = cutlass::float_e4m3_t;
+  using ElementOutput = OutType;
+  using AccumElementType = float;
+
+  // Gemm type with bias
+  using BiasGemmDefault = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShapeDefault,
+      ClusterShapeDefault,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      true>;
+  using BiasGemm256 = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShape256,
+      ClusterShape256,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      true>;
+  using BiasGemm64 = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShape64,
+      ClusterShape64,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      true>;
+  using BiasGemm16 = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShape16,
+      ClusterShape16,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      true>;
+
+  // Gemm type without bias
+  using GemmDefault = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShapeDefault,
+      ClusterShapeDefault,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      false>;
+  using Gemm256 = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShape256,
+      ClusterShape256,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      false>;
+  using Gemm64 = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShape64,
+      ClusterShape64,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      false>;
+  using Gemm16 = DeviceGemmFp8RowwiseSm100<
+      ElementInput,
+      ElementOutput,
+      AccumElementType,
+      CTAShape16,
+      ClusterShape16,
+      MainloopScheduleType,
+      EpilogueScheduleType,
+      TileSchedulerType,
+      false>;
+
+  // next power of 2 (minimum 16)
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 = std::max(static_cast<uint32_t>(16), next_pow_2(m));
+
+  if (bias) {
+    if (mp2 <= 16) {
+      // m in [1, 16]
+      return launch_sm100_fp8_scaled_mm<BiasGemm16, true>(out, a, b, scales_a, scales_b, bias);
+    } else if (mp2 <= 64) {
+      // m in (16, 64]
+      return launch_sm100_fp8_scaled_mm<BiasGemm64, true>(out, a, b, scales_a, scales_b, bias);
+    } else if (mp2 <= 256) {
+      // m in (64, 256]
+      return launch_sm100_fp8_scaled_mm<BiasGemm256, true>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      // m in (256, inf]
+      return launch_sm100_fp8_scaled_mm<BiasGemmDefault, true>(out, a, b, scales_a, scales_b, bias);
+    }
+  } else {
+    if (mp2 <= 16) {
+      // m in [1, 16]
+      return launch_sm100_fp8_scaled_mm<Gemm16, false>(out, a, b, scales_a, scales_b, bias);
+    } else if (mp2 <= 64) {
+      // m in (16, 64]
+      return launch_sm100_fp8_scaled_mm<Gemm64, false>(out, a, b, scales_a, scales_b, bias);
+    } else if (mp2 <= 256) {
+      // m in (64, 256]
+      return launch_sm100_fp8_scaled_mm<Gemm256, false>(out, a, b, scales_a, scales_b, bias);
+    } else {
+      return launch_sm100_fp8_scaled_mm<GemmDefault, false>(out, a, b, scales_a, scales_b, bias);
+    }
+  }
+}
+
+template <typename OutType>
+void sm100_fp8_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  return sm100_fp8_dispatch_bias<OutType>(out, a, b, scales_a, scales_b, bias);
+}
+#endif
+
+torch::Tensor fp8_scaled_mm(
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Dtype& out_dtype,
+    const c10::optional<torch::Tensor>& bias) {
+  TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor");
+  TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor");
+  TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor");
+  TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor");
+  TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor");
+  TORCH_CHECK(mat_b.stride(0) == 1, "mat_b must be a column major tensor");
+  TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied");
+
+  TORCH_CHECK(
+      (mat_a.size(1) * mat_a.element_size()) % 16 == 0, "mat_a must be multiple of 16 bytes for memory alignment");
+  TORCH_CHECK(
+      (mat_b.size(0) * mat_b.element_size()) % 16 == 0, "mat_b must be multiple of 16 bytes for memory alignment");
+  TORCH_CHECK(mat_a.scalar_type() == torch::kFloat8_e4m3fn, "mat_a must be Float8_e4m3fn");
+  TORCH_CHECK(mat_b.scalar_type() == torch::kFloat8_e4m3fn, "mat_b must be Float8_e4m3fn");
+  TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16");
+
+  TORCH_CHECK(scales_a.numel() == mat_a.size(0), "size of scales_a is not matched");
+  TORCH_CHECK(scales_b.numel() == mat_b.size(1), "size of scales_b is not matched");
+  TORCH_CHECK(scales_a.is_contiguous(), "scales_a must be contiguous");
+  TORCH_CHECK(scales_b.is_contiguous(), "scales_b msut be contiguous");
+  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be Float32");
+  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32");
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == mat_b.size(1), "size of bias is not matched");
+    TORCH_CHECK(bias->is_contiguous(), "bias must be contiguous");
+    TORCH_CHECK(bias->dtype() == out_dtype, "bias dtype must match output dtype");
+  }
+
+  torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype));
+  TORCH_CHECK((out.size(1) * out.element_size()) % 16 == 0, "out must be multiple of 16 bytes for memory alignment");
+
+  auto sm_version = getSMVersion();
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12080
+  if (sm_version == 100
+#if CUDA_VERSION >= 12090
+      || sm_version == 103
+#endif
+  ) {
+    if (out_dtype == torch::kBFloat16) {
+      sm100_fp8_dispatch_shape<cutlass::bfloat16_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      sm100_fp8_dispatch_shape<cutlass::half_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+    return out;
+  }
+#endif
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+  if (sm_version >= 90) {
+    if (out_dtype == torch::kBFloat16) {
+      sm90_fp8_dispatch_shape<cutlass::bfloat16_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      sm90_fp8_dispatch_shape<cutlass::half_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+    return out;
+  }
+#endif
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12040
+  if (sm_version == 89) {
+    if (out_dtype == torch::kBFloat16) {
+      sm89_fp8_dispatch_shape<cutlass::bfloat16_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      sm89_fp8_dispatch_shape<cutlass::half_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+    return out;
+  }
+#endif
+
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No implemented fp8_scaled_mm for current compute capability: ", sm_version);
+}
diff --git a/sgl-kernel/csrc/gemm/gptq/compat.cuh b/sgl-kernel/csrc/gemm/gptq/compat.cuh
new file mode 100644
index 000000000..506eeb769
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/compat.cuh
@@ -0,0 +1,62 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _compat_cuh
+#define _compat_cuh
+
+namespace sglang {
+namespace gptq {
+// atomicAdd for half types, to support CC < 7.x
+
+__device__ __forceinline__ void atomicAdd_half(half* address, half val) {
+  unsigned int* address_as_ui = (unsigned int*)((char*)address - ((size_t)address & 2));
+  unsigned int old = *address_as_ui;
+  unsigned int assumed;
+
+  do {
+    assumed = old;
+    __half_raw hsum;
+    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    half tmpres = __hadd(hsum, val);
+    hsum = __half_raw(tmpres);
+    old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+    old = atomicCAS(address_as_ui, assumed, old);
+  } while (assumed != old);
+}
+
+// atomicAdd for half2 types
+
+__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) {
+  unsigned int* address_as_ui = (unsigned int*)address;
+  unsigned int old = *address_as_ui;
+  unsigned int assumed;
+  do {
+    assumed = old;
+    half2 old_val = *((half2*)&old);
+    half2 new_val = __hadd2(old_val, val);
+    old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
+  } while (assumed != old);
+}
+
+//
+
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
+
+__device__ __forceinline__ void atomicAdd(half* address, half val) {
+  atomicAdd_half(address, val);
+}
+
+#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
+__device__ __forceinline__ void atomicAdd(half2* address, half2 val) {
+  atomicAdd_half2(address, val);
+}
+#endif
+
+#endif
+#endif
+
+}  // namespace gptq
+}  // namespace sglang
+#endif
diff --git a/sgl-kernel/csrc/gemm/gptq/gptq_kernel.cu b/sgl-kernel/csrc/gemm/gptq/gptq_kernel.cu
new file mode 100644
index 000000000..4dd5d8a24
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/gptq_kernel.cu
@@ -0,0 +1,1950 @@
+/*
+Adapted from https://github.com/turboderp/exllamav2 and
+https://github.com/qwopqwop200/GPTQ-for-LLaMa
+*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <torch/all.h>
+
+#include <cstdint>
+#include <cstdio>
+
+#include "compat.cuh"
+#include "matrix_view.cuh"
+#include "qdq_2.cuh"
+#include "qdq_3.cuh"
+#include "qdq_4.cuh"
+#include "qdq_8.cuh"
+
+namespace sglang {
+namespace gptq {
+
+#define BLOCK_KN_SIZE 128
+#define BLOCK_M_SIZE_MAX 8
+#define MAX_GROUPS_IN_BLOCK (BLOCK_KN_SIZE / 32)
+#define MAX_Q_GEMM_ROWS 50
+#define MAX_Q_GEMM_ROWS_8BIT 24
+#define MAX_ALT_GEMM_ROWS 8
+#define THREADS_X 32
+#define THREADS_Y 32
+#define DIVIDE(x, size) (((x) + (size) - 1) / (size))
+
+#if defined(USE_ROCM)
+#include <hipblas/hipblas.h>
+__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(
+    hipblasHandle_t handle,
+    hipblasOperation_t transA,
+    hipblasOperation_t transB,
+    int m,
+    int n,
+    int k,
+    const half* alpha,
+    const half* AP,
+    int lda,
+    const half* BP,
+    int ldb,
+    const half* beta,
+    half* CP,
+    int ldc) {
+  return hipblasHgemm(
+      handle,
+      transA,
+      transB,
+      m,
+      n,
+      k,
+      reinterpret_cast<const hipblasHalf*>(alpha),
+      reinterpret_cast<const hipblasHalf*>(AP),
+      lda,
+      reinterpret_cast<const hipblasHalf*>(BP),
+      ldb,
+      reinterpret_cast<const hipblasHalf*>(beta),
+      reinterpret_cast<hipblasHalf*>(CP),
+      ldc);
+}
+#define hipblasHgemm __compat_hipblasHgemm
+
+// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
+#define rocblas_operation_none HIPBLAS_OP_N
+#define rocblas_hgemm __compat_hipblasHgemm
+#endif
+
+__forceinline__ __device__ half2 dot22_8(half2 (&dq)[4], const half* a_ptr, const half2 g_result) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 4; i++)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  return __hadd2(result, g_result);
+}
+
+__forceinline__ __device__ float dot22_8_f(half2 (&dq)[4], const half* a_ptr) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 4; i++)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  return __half2float(__low2half(result)) + __half2float(__high2half(result));
+}
+
+__forceinline__ __device__ half2 dot22_8(half2 (&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 4; i++)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
+}
+
+__forceinline__ __device__ half2 dot22_16(half2 (&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 8; i++)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
+}
+
+__forceinline__ __device__ half2 dot22_32(half2 (&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 16; i += 1)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
+}
+
+__forceinline__ __device__ float dot22_8_f(half2 (&dq)[4], const half* a_ptr, const float g_result, const float qs_f) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 4; i++)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
+  return fma(result_f, qs_f, g_result);
+}
+
+__forceinline__ __device__ float dot22_16_f(half2 (&dq)[8], const half* a_ptr, const float g_result, const float qs_f) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 8; i++)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
+  return fma(result_f, qs_f, g_result);
+}
+
+__forceinline__ __device__ float
+dot22_32_f(half2 (&dq)[16], const half* a_ptr, const float g_result, const float qs_f) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 16; i += 1)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
+  return fma(result_f, qs_f, g_result);
+}
+
+__forceinline__ __device__ half dot22_8_h(half2 (&dq)[4], const half* a_ptr, const half g_result, const half qs_h) {
+  // Use FP32 accumulator to avoid potential overflow since unscaled weights are
+  // in the range -128..127
+
+  float result = {};
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    half2 w01 = dq[i];
+    float w0 = __low2float(w01);
+    float w1 = __high2float(w01);
+    float x0 = __half2float(*a_ptr++);
+    float x1 = __half2float(*a_ptr++);
+    result = fma(w0, x0, result);
+    result = fma(w1, x1, result);
+  }
+  float qs = __half2float(qs_h);
+  result *= qs;
+  half result_h = __float2half_rn(result);
+  return __hadd(result_h, g_result);
+}
+
+__forceinline__ __device__ half dot22_16_h(half2 (&dq)[8], const half* a_ptr, const half g_result, const half qs_h) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 8; i++)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  half result_h = __hadd(__low2half(result), __high2half(result));
+  return __hfma(result_h, qs_h, g_result);
+}
+
+__forceinline__ __device__ half dot22_32_h(half2 (&dq)[16], const half* a_ptr, const half g_result, const half qs_h) {
+  half2 result = {};
+  const half2* a2_ptr = (const half2*)a_ptr;
+#pragma unroll
+  for (int i = 0; i < 16; i += 1)
+    result = __hfma2(dq[i], *a2_ptr++, result);
+  half result_h = __hadd(__low2half(result), __high2half(result));
+  return __hfma(result_h, qs_h, g_result);
+}
+
+typedef void (*fp_gemm_half_q_half_gptq_kernel)(
+    const half*,
+    const uint32_t*,
+    const uint32_t*,
+    const half*,
+    half*,
+    const int,
+    const int,
+    const int,
+    const int,
+    const int*);
+
+template <bool first_block, int m_count>
+__global__ void gemm_half_q_half_gptq_4bit_kernel(
+    const half* __restrict__ a,
+    const uint32_t* __restrict__ b_q_weight,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    half* __restrict__ c,
+    const int size_m,
+    const int size_n,
+    const int size_k,
+    const int groups,
+    const int* __restrict__ b_q_perm) {
+  MatrixView_half a_(a, size_m, size_k);
+  MatrixView_half_rw c_(c, size_m, size_n);
+  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto t = threadIdx.x;
+
+  // Block
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  int n = offset_n + t * 4;
+
+  // Preload block_a
+  __shared__ half block_a[m_count][BLOCK_KN_SIZE];
+
+  if (offset_k + t < end_k) {
+    for (int m = 0; m < m_count; ++m) {
+      const half* a_ptr = a_.item_ptr(offset_m + m, 0);
+      half* block_a_ptr = block_a[m];
+
+      half a0;
+      if (b_q_perm)
+        a0 = a_ptr[b_q_perm[offset_k + t]];
+      else
+        a0 = a_ptr[offset_k + t];
+      block_a_ptr[t] = a0;
+    }
+  }
+
+  // Zero output
+  if (n >= size_n) return;
+
+  if (blockIdx.z == 0) {
+    for (int m = 0; m < m_count; m++)
+      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
+  }
+
+  __syncthreads();
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // a, b offset
+  int qk = offset_k / (32 / 4);
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+  const half* a_ptr = &block_a[0][0];
+  int a_stride = BLOCK_KN_SIZE;
+
+  // Initial group
+  int zeros[4];
+  float scales[4];
+  half2 z1z16[4][2];
+  half2 y1y16[4][2];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4_f(scales, group, n);
+  dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
+  dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
+  dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
+  dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+
+  // Column result
+  float block_c[m_count][4] = {};
+
+  // Dequantize and multiply
+  int k = offset_k;
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4_f(scales, group, n);
+      dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
+      dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
+      dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
+      dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+    }
+
+#pragma unroll
+    for (int j = 0; j < 4; j++) {
+      const int4* b_ptr4 = (int4*)b_ptr;
+      int4 load_int4 = *b_ptr4;
+
+      half2 dq[4][4];
+      dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false);
+      dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false);
+      dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false);
+      dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false);
+
+#pragma unroll
+      for (int m = 0; m < m_count; m++) {
+        block_c[m][0] = fma(dot22_8_f(dq[0], a_ptr + m * a_stride), scales[0], block_c[m][0]);
+        block_c[m][1] = fma(dot22_8_f(dq[1], a_ptr + m * a_stride), scales[1], block_c[m][1]);
+        block_c[m][2] = fma(dot22_8_f(dq[2], a_ptr + m * a_stride), scales[2], block_c[m][2]);
+        block_c[m][3] = fma(dot22_8_f(dq[3], a_ptr + m * a_stride), scales[3], block_c[m][3]);
+      }
+
+      b_ptr += size_n;
+      a_ptr += 8;
+    }
+
+    k += 32;
+  }
+
+  for (int m = 0; m < m_count; m++) {
+    half2* out = (half2*)c_.item_ptr(offset_m + m, n);
+    half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), __float2half_rn(block_c[m][1]));
+    half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), __float2half_rn(block_c[m][3]));
+    atomicAdd(out, result01);
+    atomicAdd(out + 1, result23);
+  }
+}
+
+template <bool first_block, int m_count>
+__global__ void gemm_half_q_half_gptq_2bit_kernel(
+    const half* __restrict__ a,
+    const uint32_t* __restrict__ b_q_weight,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    half* __restrict__ c,
+    const int size_m,
+    const int size_n,
+    const int size_k,
+    const int groups,
+    const int* __restrict__ b_q_perm) {
+  MatrixView_half a_(a, size_m, size_k);
+  MatrixView_half_rw c_(c, size_m, size_n);
+  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto t = threadIdx.x;
+
+  // Block
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  int n = offset_n + t * 4;
+
+  // Preload block_a
+  __shared__ half block_a[m_count][BLOCK_KN_SIZE];
+
+  if (offset_k + t < end_k) {
+    for (int m = 0; m < m_count; ++m) {
+      const half* a_ptr = a_.item_ptr(offset_m + m, 0);
+      half* block_a_ptr = block_a[m];
+
+      half a0;
+      if (b_q_perm)
+        a0 = a_ptr[b_q_perm[offset_k + t]];
+      else
+        a0 = a_ptr[offset_k + t];
+      block_a_ptr[t] = a0;
+    }
+  }
+
+  // Zero output
+  if (n >= size_n) return;
+
+  if (blockIdx.z == 0) {
+    for (int m = 0; m < m_count; m++)
+      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
+  }
+
+  __syncthreads();
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // a, b offset
+  int qk = offset_k / (32 / 2);
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+  const half* a_ptr = &block_a[0][0];
+  int a_stride = BLOCK_KN_SIZE;
+
+  // Initial group
+  int zeros[4];
+  half scales[4];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4(scales, group, n);
+  // Column result
+  half block_c[m_count][4] = {};
+
+  // Dequantize and multiply
+  int k = offset_k;
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4(scales, group, n);
+    }
+
+#pragma unroll
+    for (int j = 0; j < 1; j++) {
+      const int4* b_ptr4 = (int4*)b_ptr;
+      int4 load_int4 = *b_ptr4;
+
+      half2 dq[4][8];
+      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
+      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
+      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
+      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
+
+#pragma unroll
+      for (int m = 0; m < m_count; m++) {
+        block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]);
+        block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]);
+        block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]);
+        block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]);
+      }
+
+      b_ptr += size_n;
+      a_ptr += 16;
+    }
+
+    k += 16;
+  }
+
+  for (int m = 0; m < m_count; m++) {
+    half2* out = (half2*)c_.item_ptr(offset_m + m, n);
+    half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]);
+    half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]);
+    atomicAdd(out, result01);
+    atomicAdd(out + 1, result23);
+  }
+}
+
+template <bool first_block, int m_count>
+__global__ void gemm_half_q_half_gptq_3bit_kernel(
+    const half* __restrict__ a,
+    const uint32_t* __restrict__ b_q_weight,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    half* __restrict__ c,
+    const int size_m,
+    const int size_n,
+    const int size_k,
+    const int groups,
+    const int* __restrict__ b_q_perm) {
+  MatrixView_half a_(a, size_m, size_k);
+  MatrixView_half_rw c_(c, size_m, size_n);
+  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto t = threadIdx.x;
+
+  // Block
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  int n = offset_n + t * 4;
+
+  // Preload block_a
+  __shared__ half block_a[m_count][BLOCK_KN_SIZE];
+
+  if (offset_k + t < end_k) {
+    for (int m = 0; m < m_count; ++m) {
+      const half* a_ptr = a_.item_ptr(offset_m + m, 0);
+      half* block_a_ptr = block_a[m];
+
+      half a0;
+      if (b_q_perm)
+        a0 = a_ptr[b_q_perm[offset_k + t]];
+      else
+        a0 = a_ptr[offset_k + t];
+      block_a_ptr[t] = a0;
+    }
+  }
+
+  // Zero output
+  if (n >= size_n) return;
+
+  if (blockIdx.z == 0) {
+    for (int m = 0; m < m_count; m++)
+      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
+  }
+
+  __syncthreads();
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // a, b offset
+  int qk = offset_k / 32 * 3;
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+  const half* a_ptr = &block_a[0][0];
+  int a_stride = BLOCK_KN_SIZE;
+
+  // Initial group
+  int zeros[4];
+  half scales[4];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4(scales, group, n);
+  // Column result
+  half block_c[m_count][4] = {};
+
+  // Dequantize and multiply
+  int k = offset_k;
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4(scales, group, n);
+    }
+
+#pragma unroll
+    for (int j = 0; j < 1; j++) {
+      int4 load_int4[3];
+      load_int4[0] = *((int4*)b_ptr);
+      b_ptr += size_n;
+      load_int4[1] = *((int4*)b_ptr);
+      b_ptr += size_n;
+      load_int4[2] = *((int4*)b_ptr);
+      b_ptr += size_n;
+
+      half2 dq[4][16];
+      dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n, zeros[0] + 1);
+      dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1);
+      dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1);
+      dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1);
+
+#pragma unroll
+      for (int m = 0; m < m_count; m++) {
+        block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]);
+        block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]);
+        block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]);
+        block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]);
+      }
+      a_ptr += 32;
+    }
+
+    k += 32;
+  }
+
+  for (int m = 0; m < m_count; m++) {
+    half2* out = (half2*)c_.item_ptr(offset_m + m, n);
+    half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]);
+    half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]);
+    atomicAdd(out, result01);
+    atomicAdd(out + 1, result23);
+  }
+}
+
+template <bool first_block, int m_count>
+__global__ void gemm_half_q_half_gptq_8bit_kernel(
+    const half* __restrict__ a,
+    const uint32_t* __restrict__ b_q_weight,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    half* __restrict__ c,
+    const int size_m,
+    const int size_n,
+    const int size_k,
+    const int groups,
+    const int* __restrict__ b_q_perm) {
+  MatrixView_half a_(a, size_m, size_k);
+  MatrixView_half_rw c_(c, size_m, size_n);
+  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto t = threadIdx.x;
+
+  // Block
+  auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
+  auto offset_m = blockIdx.y * m_count;
+  auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  int n = offset_n + t * 4;
+
+  // Preload block_a
+  __shared__ half block_a[m_count][BLOCK_KN_SIZE];
+
+  if (offset_k + t < end_k) {
+    for (int m = 0; m < m_count; ++m) {
+      const half* a_ptr = a_.item_ptr(offset_m + m, 0);
+      half* block_a_ptr = block_a[m];
+
+      half a0;
+      if (b_q_perm)
+        a0 = a_ptr[b_q_perm[offset_k + t]];
+      else
+        a0 = a_ptr[offset_k + t];
+      block_a_ptr[t] = a0;
+    }
+  }
+
+  // Zero output
+  if (n >= size_n) return;
+
+  if (blockIdx.z == 0) {
+    for (int m = 0; m < m_count; m++)
+      *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
+  }
+
+  __syncthreads();
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // a, b offset
+  int qk = offset_k / (32 / 8);
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+  const half* a_ptr = &block_a[0][0];
+  int a_stride = BLOCK_KN_SIZE;
+
+  // Initial group
+  int zeros[4];
+  half scales[4];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4(scales, group, n);
+  // Column result
+  half block_c[m_count][4] = {};
+
+  // Dequantize and multiply
+  int k = offset_k;
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4(scales, group, n);
+    }
+
+#pragma unroll
+    for (int j = 0; j < 4; j++) {
+      int4 load_int4[2];
+      load_int4[0] = *((int4*)b_ptr);
+      b_ptr += size_n;
+      load_int4[1] = *((int4*)b_ptr);
+      b_ptr += size_n;
+
+      half2 dq[4][4];
+      dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1);
+      dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1);
+      dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1);
+      dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1);
+
+      for (int m = 0; m < m_count; m++) {
+        block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]);
+        block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]);
+        block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]);
+        block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]);
+      }
+      a_ptr += 8;
+    }
+    k += 32;
+  }
+
+  for (int m = 0; m < m_count; m++) {
+    half2* out = (half2*)c_.item_ptr(offset_m + m, n);
+    half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]);
+    half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]);
+    atomicAdd(out, result01);
+    atomicAdd(out + 1, result23);
+  }
+}
+
+fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(bool first_block, const int m_count, const int bit) {
+#define SELECT_KERNEL(M_COUNT)                                             \
+  if (m_count == M_COUNT) {                                                \
+    if (bit == 2) return gemm_half_q_half_gptq_2bit_kernel<true, M_COUNT>; \
+    if (bit == 3) return gemm_half_q_half_gptq_3bit_kernel<true, M_COUNT>; \
+    if (bit == 4) return gemm_half_q_half_gptq_4bit_kernel<true, M_COUNT>; \
+    if (bit == 8) return gemm_half_q_half_gptq_8bit_kernel<true, M_COUNT>; \
+  }
+#if BLOCK_M_SIZE_MAX >= 1
+  SELECT_KERNEL(1);
+#endif
+#if BLOCK_M_SIZE_MAX >= 2
+  SELECT_KERNEL(2);
+#endif
+#if BLOCK_M_SIZE_MAX >= 3
+  SELECT_KERNEL(3);
+#endif
+#if BLOCK_M_SIZE_MAX >= 4
+  SELECT_KERNEL(4);
+#endif
+#if BLOCK_M_SIZE_MAX >= 5
+  SELECT_KERNEL(5);
+#endif
+#if BLOCK_M_SIZE_MAX >= 6
+  SELECT_KERNEL(6);
+#endif
+#if BLOCK_M_SIZE_MAX >= 7
+  SELECT_KERNEL(7);
+#endif
+#if BLOCK_M_SIZE_MAX >= 8
+  SELECT_KERNEL(8);
+#endif
+  return NULL;
+}
+
+void gemm_half_q_half_cuda_part(
+    const half* a,
+    const uint32_t* b_q_weight,
+    const uint32_t* b_gptq_qzeros,
+    const half* b_gptq_scales,
+    const int* b_q_perm,
+    half* c,
+    int size_m,
+    int size_n,
+    int size_k,
+    int m_count,
+    int groups,
+    int bit) {
+  dim3 blockDim, gridDim;
+  blockDim.x = BLOCK_KN_SIZE;
+  blockDim.y = 1;
+  blockDim.z = 1;
+  gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4);
+  gridDim.y = DIVIDE(size_m, m_count);
+  gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE);
+
+  fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count, bit);
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  kernel<<<gridDim, blockDim, 0, stream>>>(
+      a, b_q_weight, b_gptq_qzeros, b_gptq_scales, c, size_m, size_n, size_k, groups, b_q_perm);
+}
+
+__global__ void reconstruct_exllama_8bit_kernel(
+    const uint32_t* __restrict__ b_q_weight,
+    const int* __restrict__ b_q_perm,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    const int size_k,
+    const int size_n,
+    const int groups,
+    half* __restrict__ b) {
+  MatrixView_half_rw b_(b, size_k, size_n);
+  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  // Preload remapping table
+  __shared__ int perm[BLOCK_KN_SIZE];
+  auto t = threadIdx.x;
+
+  if (b_q_perm) {
+    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
+  }
+
+  // Column
+  int n = offset_n + t * 4;
+  if (n >= size_n) return;
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // b offset
+  int qk = offset_k / (32 / 8);
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+
+  // Initial zeros/scale
+  int zeros[4];
+  half2 scales[4];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4_h2(scales, group, n);
+
+  __syncthreads();
+
+  int k = offset_k;
+  int lk = 0;
+
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4_h2(scales, group, n);
+    }
+
+    for (int p = 0; p < 4; p++) {
+      int4 load_int4[2];
+      load_int4[0] = *((int4*)b_ptr);
+      b_ptr += size_n;
+      load_int4[1] = *((int4*)b_ptr);
+      b_ptr += size_n;
+
+      half2 dq[4][4];
+      dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1);
+      dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1);
+      dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1);
+      dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1);
+
+      // half* dqh = (half*)dq;
+      if (b_q_perm) {
+        for (int j = 0; j < 4; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
+          b_.set4(
+              perm[lk++],
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      } else {
+        for (int j = 0; j < 4; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __low2half(dq[0][j]),
+              __low2half(dq[1][j]),
+              __low2half(dq[2][j]),
+              __low2half(dq[3][j]));
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      }
+    }
+    k += 32;
+  }
+}
+
+__global__ void reconstruct_exllama_4bit_kernel(
+    const uint32_t* __restrict__ b_q_weight,
+    const int* __restrict__ b_q_perm,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    const int size_k,
+    const int size_n,
+    const int groups,
+    half* __restrict__ b) {
+  MatrixView_half_rw b_(b, size_k, size_n);
+  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  // Preload remapping table
+  __shared__ int perm[BLOCK_KN_SIZE];
+  auto t = threadIdx.x;
+
+  if (b_q_perm) {
+    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
+  }
+
+  // Column
+  int n = offset_n + t * 4;
+  if (n >= size_n) return;
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // b offset
+  int qk = offset_k / (32 / 4);
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+
+  // Initial zeros/scale
+  int zeros[4];
+  half2 scales[4];
+  half2 z1z16[4][2];
+  half2 y1y16[4][2];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4_h2(scales, group, n);
+  dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
+  dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
+  dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
+  dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+
+  __syncthreads();
+
+  int k = offset_k;
+  int lk = 0;
+
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4_h2(scales, group, n);
+      dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
+      dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
+      dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
+      dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
+    }
+
+    for (int p = 0; p < 4; p++) {
+      half2 dq[4][4];
+      const int4* b_ptr4 = (int4*)b_ptr;
+      int4 load_int4 = *b_ptr4;
+
+      dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false);
+      dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false);
+      dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false);
+      dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false);
+
+      b_ptr += size_n;
+      // half* dqh = (half*)dq;
+      if (b_q_perm) {
+        for (int j = 0; j < 4; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
+          b_.set4(
+              perm[lk++],
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      } else {
+        for (int j = 0; j < 4; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __low2half(dq[0][j]),
+              __low2half(dq[1][j]),
+              __low2half(dq[2][j]),
+              __low2half(dq[3][j]));
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      }
+    }
+    k += 32;
+  }
+}
+
+__global__ void reconstruct_exllama_3bit_kernel(
+    const uint32_t* __restrict__ b_q_weight,
+    const int* __restrict__ b_q_perm,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    const int size_k,
+    const int size_n,
+    const int groups,
+    half* __restrict__ b) {
+  MatrixView_half_rw b_(b, size_k, size_n);
+  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  // Preload remapping table
+  __shared__ int perm[BLOCK_KN_SIZE];
+  auto t = threadIdx.x;
+
+  if (b_q_perm) {
+    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
+  }
+
+  // Column
+  int n = offset_n + t * 4;
+  if (n >= size_n) return;
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // b offset
+  int qk = offset_k / 32 * 3;
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+
+  // Initial zeros/scale
+  int zeros[4];
+  half2 scales[4];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4_h2(scales, group, n);
+
+  __syncthreads();
+
+  int k = offset_k;
+  int lk = 0;
+
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4_h2(scales, group, n);
+    }
+
+    for (int p = 0; p < 1; p++) {
+      int4 load_int4[3];
+      load_int4[0] = *((int4*)b_ptr);
+      b_ptr += size_n;
+      load_int4[1] = *((int4*)b_ptr);
+      b_ptr += size_n;
+      load_int4[2] = *((int4*)b_ptr);
+      b_ptr += size_n;
+
+      half2 dq[4][16];
+      dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n, zeros[0] + 1);
+      dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1);
+      dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1);
+      dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1);
+
+      if (b_q_perm) {
+        for (int j = 0; j < 16; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
+          b_.set4(
+              perm[lk++],
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      } else {
+        for (int j = 0; j < 16; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __low2half(dq[0][j]),
+              __low2half(dq[1][j]),
+              __low2half(dq[2][j]),
+              __low2half(dq[3][j]));
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      }
+    }
+    k += 32;
+  }
+}
+
+__global__ void reconstruct_exllama_2bit_kernel(
+    const uint32_t* __restrict__ b_q_weight,
+    const int* __restrict__ b_q_perm,
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    const int size_k,
+    const int size_n,
+    const int groups,
+    half* __restrict__ b) {
+  MatrixView_half_rw b_(b, size_k, size_n);
+  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
+  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+
+  int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+  // Preload remapping table
+  __shared__ int perm[BLOCK_KN_SIZE];
+  auto t = threadIdx.x;
+
+  if (b_q_perm) {
+    if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
+  }
+
+  // Column
+  int n = offset_n + t * 4;
+  if (n >= size_n) return;
+
+  // Find initial group
+  int groupsize = size_k / groups;
+  int group = offset_k / groupsize;
+  int nextgroup = offset_k + groupsize;
+
+  // b offset
+  int qk = offset_k / (32 / 2);
+
+  const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+
+  // Initial zeros/scale
+  int zeros[4];
+  half2 scales[4];
+  b_gptq_qzeros_.item4(zeros, group, n);
+  b_gptq_scales_.item4_h2(scales, group, n);
+
+  __syncthreads();
+
+  int k = offset_k;
+  int lk = 0;
+
+  while (k < end_k) {
+    if (k == nextgroup) {
+      group++;
+      nextgroup += groupsize;
+      b_gptq_qzeros_.item4(zeros, group, n);
+      b_gptq_scales_.item4_h2(scales, group, n);
+    }
+
+    for (int p = 0; p < 2; p++) {
+      const int4* b_ptr4 = (int4*)b_ptr;
+      int4 load_int4 = *b_ptr4;
+
+      half2 dq[4][8];
+      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
+      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
+      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
+      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
+
+      b_ptr += size_n;
+      // half* dqh = (half*)dq;
+      if (b_q_perm) {
+        for (int j = 0; j < 8; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
+          b_.set4(
+              perm[lk++],
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      } else {
+        for (int j = 0; j < 8; j++) {
+          for (int v = 0; v < 4; v++)
+            dq[v][j] = __hmul2(scales[v], dq[v][j]);
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __low2half(dq[0][j]),
+              __low2half(dq[1][j]),
+              __low2half(dq[2][j]),
+              __low2half(dq[3][j]));
+          b_.set4(
+              offset_k + lk++,
+              n,
+              __high2half(dq[0][j]),
+              __high2half(dq[1][j]),
+              __high2half(dq[2][j]),
+              __high2half(dq[3][j]));
+        }
+      }
+    }
+    k += 32;
+  }
+}
+
+void reconstruct_exllama(
+    const uint32_t* b_q_weight,
+    const uint32_t* b_gptq_qzeros,
+    const half* b_gptq_scales,
+    const int* b_q_perm,
+    half* out,
+    int height,
+    int width,
+    int groups,
+    int bit) {
+  dim3 blockDim, gridDim;
+  blockDim.x = BLOCK_KN_SIZE;
+  blockDim.y = 1;
+  gridDim.y = DIVIDE(height, BLOCK_KN_SIZE);
+  gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
+
+  auto reconstruct_exllama_kernel = reconstruct_exllama_4bit_kernel;
+  if (bit == 2) {
+    reconstruct_exllama_kernel = reconstruct_exllama_2bit_kernel;
+  } else if (bit == 3) {
+    reconstruct_exllama_kernel = reconstruct_exllama_3bit_kernel;
+  } else if (bit == 8) {
+    reconstruct_exllama_kernel = reconstruct_exllama_8bit_kernel;
+  }
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>(
+      b_q_weight, b_q_perm, b_gptq_qzeros, b_gptq_scales, height, width, groups, out);
+}
+
+__global__ void gemm_half_q_half_alt_4bit_kernel(
+    const half2* __restrict__ vec,
+    const uint32_t* __restrict__ mat,
+    half* __restrict__ mul,
+    const half* __restrict__ scales,
+    const uint32_t* __restrict__ zeros,
+    const int* __restrict__ g_idx,
+    int batch,
+    int height,
+    int width) {
+  int zero_width = width / 8;
+  int vec_height = height * 4;
+  const int blockwidth2 = BLOCK_KN_SIZE / 2;
+  auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
+  int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
+  auto h = BLOCK_KN_SIZE * blockIdx.z / 8;
+  int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
+  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+
+  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
+  if (threadIdx.x < h_end) {
+    for (int m = 0; m < b_end; ++m) {
+      blockvec[m][threadIdx.x] = vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + threadIdx.x];
+    }
+  }
+
+  __shared__ half2 deq2[256][8];
+  auto val = threadIdx.x / 8;
+  auto off = threadIdx.x % 8;
+  for (; val < 256; val += BLOCK_KN_SIZE / 8) {
+    deq2[val][off] = __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
+  }
+
+  if (blockIdx.z == 0) {
+    for (int m = 0; m < b_end; m++)
+      mul[(b + m) * width + w] = __int2half_rn(0);
+  }
+  __syncthreads();
+
+  int i = width * h + w;
+  int g_h = h * 8;
+  int k = 0;
+  int z_w = w / 8;
+  int z_mod = (w % 8) * 4;
+  half2 res2;
+  half res[BLOCK_M_SIZE_MAX] = {};
+
+  unsigned int tmp;
+  while (k < h_end) {
+    tmp = mat[i];
+    half2 scales_tmp[4];
+    half2 zeros_tmp[4];
+    for (int tmp_k = 0; tmp_k < 4; tmp_k++) {
+      int g = g_idx[g_h + (k + tmp_k) * 2];
+      int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1];
+      half scale_f = scales[g * width + w];
+      half scale_f2 = scales[g2 * width + w];
+      half2 scale = __halves2half2(scale_f, scale_f2);
+      half2 zero = __halves2half2(
+          __hmul(scale_f, __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) - 1)),
+          __hmul(scale_f2, __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)));
+      scales_tmp[tmp_k] = scale;
+      zeros_tmp[tmp_k] = zero;
+    }
+    for (int m = 0; m < b_end; m++) {
+#ifndef USE_ROCM
+      res2 = {};
+#else
+      res2.x = __half_as_ushort(__float2half(0));
+      res2.y = __half_as_ushort(__float2half(0));
+#endif
+      res2 = __hfma2(__hfma2(deq2[(tmp >> 0) & 0xff][off], scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2);
+      res2 = __hfma2(__hfma2(deq2[(tmp >> 8) & 0xff][off], scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2);
+      res2 = __hfma2(__hfma2(deq2[(tmp >> 16) & 0xff][off], scales_tmp[2], zeros_tmp[2]), blockvec[m][k + 2], res2);
+      res2 = __hfma2(__hfma2(deq2[(tmp >> 24) & 0xff][off], scales_tmp[3], zeros_tmp[3]), blockvec[m][k + 3], res2);
+#ifndef USE_ROCM
+      res[m] = __hadd(res[m], __hadd(res2.x, res2.y));
+#else
+      res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)));
+#endif
+    }
+    i += width;
+    k += 4;
+  }
+  for (int m = 0; m < b_end; m++) {
+    atomicAdd(&mul[(b + m) * width + w], res[m]);
+  }
+}
+
+__global__ void gemm_half_q_half_alt_8bit_kernel(
+    const half2* __restrict__ vec,
+    const uint32_t* __restrict__ mat,
+    half* __restrict__ mul,
+    const half* __restrict__ scales,
+    const uint32_t* __restrict__ zeros,
+    const int* __restrict__ g_idx,
+    int batch,
+    int height,
+    int width) {
+  int zero_width = width / 4;
+  int vec_height = height * 2;
+  const int blockwidth2 = BLOCK_KN_SIZE / 2;
+  auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
+  int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
+  auto h = BLOCK_KN_SIZE * blockIdx.z / 4;
+  int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
+  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+
+  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
+  if (threadIdx.x < h_end) {
+    for (int m = 0; m < b_end; ++m) {
+      blockvec[m][threadIdx.x] = vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + threadIdx.x];
+    }
+  }
+
+  if (blockIdx.z == 0) {
+    for (int m = 0; m < b_end; m++)
+      mul[(b + m) * width + w] = __int2half_rn(0);
+  }
+  __syncthreads();
+
+  int i = width * h + w;
+  int g_h = h * 4;
+  int k = 0;
+  int z_w = w / 4;
+  int z_mod = (w % 4) * 8;
+  half2 res2;
+  half res[BLOCK_M_SIZE_MAX] = {};
+
+  unsigned int tmp;
+  while (k < h_end) {
+    tmp = mat[i];
+    half2 scales_tmp[2];
+    half2 zeros_tmp[2];
+    for (int tmp_k = 0; tmp_k < 2; tmp_k++) {
+      int g = g_idx[g_h + (k + tmp_k) * 2];
+      int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1];
+      half scale_f = scales[g * width + w];
+      half scale_f2 = scales[g2 * width + w];
+      half2 scale = __halves2half2(scale_f, scale_f2);
+      half2 zero = __halves2half2(
+          __hmul(scale_f, __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)),
+          __hmul(scale_f2, __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)));
+      scales_tmp[tmp_k] = scale;
+      zeros_tmp[tmp_k] = zero;
+    }
+    for (int m = 0; m < b_end; m++) {
+#ifndef USE_ROCM
+      res2 = {};
+#else
+      res2.x = __half_as_ushort(__float2half(0));
+      res2.y = __half_as_ushort(__float2half(0));
+#endif
+      half2 v12 = __halves2half2(__int2half_rn(tmp & 0xFF), __int2half_rn((tmp >> 8) & 0xFF));
+      res2 = __hfma2(__hfma2(v12, scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2);
+      half2 v34 = __halves2half2(__int2half_rn((tmp >> 16) & 0xFF), __int2half_rn((tmp >> 24) & 0xFF));
+      res2 = __hfma2(__hfma2(v34, scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2);
+#ifndef USE_ROCM
+      res[m] = __hadd(res[m], __hadd(res2.x, res2.y));
+#else
+      res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)));
+#endif
+    }
+    i += width;
+    k += 2;
+  }
+  for (int m = 0; m < b_end; m++) {
+    atomicAdd(&mul[(b + m) * width + w], res[m]);
+  }
+}
+
+void gemm_half_q_half_alt(
+    const half* a,
+    const uint32_t* b_q_weight,
+    const uint32_t* b_gptq_qzeros,
+    const half* b_gptq_scales,
+    const int* b_g_idx,
+    half* c,
+    int size_m,
+    int size_n,
+    int size_k,
+    int bit) {
+  dim3 blockDim, gridDim;
+  blockDim.x = BLOCK_KN_SIZE;
+  blockDim.y = 1;
+  blockDim.z = 1;
+  gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE);
+  gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX);
+  gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE);
+
+  auto kernel = gemm_half_q_half_alt_4bit_kernel;
+  if (bit == 8) {
+    kernel = gemm_half_q_half_alt_8bit_kernel;
+  }
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  kernel<<<gridDim, blockDim, 0, stream>>>(
+      (const half2*)a, b_q_weight, c, b_gptq_scales, b_gptq_qzeros, b_g_idx, size_m, size_k / 32 * bit, size_n);
+}
+
+template <class T, int bit>
+__global__ void reconstruct_gptq_kernel(
+    const uint32_t* __restrict__ w,
+    const half* __restrict__ w_scales,
+    const uint32_t* __restrict__ w_zeros,
+    const int* __restrict__ g_idx,
+    const int height,
+    const int width,
+    const int group,
+    half* __restrict__ out) {
+  // Start of block
+
+  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto row = blockIdx.y * 32 / bit;
+  if (column >= width) return;
+
+  // Views
+
+  MatrixView_half_rw out_(out, height, width);
+  MatrixView_half w_scales_(w_scales, group, width);
+  T w_zeros_(w_zeros, group, width);
+
+  uint32_t w_read = w[blockIdx.y * width + column];
+  half* out_ptr = out_.item_ptr(row, column);
+
+#pragma unroll
+  for (int s = 0; s < 32; s += bit) {
+    int group = g_idx[row + s / bit];
+    half w_scale = w_scales_.item(group, column);
+    uint32_t w_zero = w_zeros_.item(group, column) + 1;
+    half w_item = __hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero), w_scale);
+    *out_ptr = w_item;
+    out_ptr += out_.width;
+  }
+}
+
+__global__ void reconstruct_gptq_3bit_kernel(
+    const uint32_t* __restrict__ w,
+    const half* __restrict__ w_scales,
+    const uint32_t* __restrict__ w_zeros,
+    const int* __restrict__ g_idx,
+    const int height,
+    const int width,
+    const int group,
+    half* __restrict__ out) {
+  // Start of block
+  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
+  auto row = blockIdx.y * 32;
+  if (column >= width) return;
+
+  // Views
+
+  MatrixView_half_rw out_(out, height, width);
+  MatrixView_half w_scales_(w_scales, group, width);
+  MatrixView_q3_row w_zeros_(w_zeros, group, width);
+
+  uint32_t w1 = w[(blockIdx.y * 3) * width + column];
+  uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column];
+  uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column];
+  half* out_ptr = out_.item_ptr(row, column);
+
+#pragma unroll
+  for (int i = 0; i < 32; i += 1) {
+    int group = g_idx[row + i];
+    half w_scale = w_scales_.item(group, column);
+    uint32_t w_zero = w_zeros_.item(group, column) + 1;
+    int w_item;
+    if (i == 10) {
+      w_item = (w1 >> 30) | ((w2 << 2) & 0x4);
+    } else if (i == 21) {
+      w_item = (w2 >> 31) | ((w3 << 1) & 0x6);
+    } else if (i < 10) {
+      w_item = ((w1 >> (i * 3)) & 0x7);
+    } else if (i < 21) {
+      w_item = ((w2 >> (i * 3 - 32)) & 0x7);
+    } else {
+      w_item = ((w3 >> (i * 3 - 64)) & 0x7);
+    }
+    *out_ptr = __hmul(__int2half_rn(w_item - w_zero), w_scale);
+    out_ptr += out_.width;
+  }
+}
+
+void reconstruct_gptq(
+    const uint32_t* b_q_weight,
+    const uint32_t* b_gptq_qzeros,
+    const half* b_gptq_scales,
+    const int* b_g_idx,
+    half* out,
+    int height,
+    int width,
+    int groups,
+    int bit) {
+  dim3 blockDim, gridDim;
+  blockDim.x = BLOCK_KN_SIZE;
+  blockDim.y = 1;
+  gridDim.y = DIVIDE(height, 32 / bit);
+  gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
+
+  auto kernel = reconstruct_gptq_kernel<MatrixView_q4_row, 4>;
+  if (bit == 2) {
+    kernel = reconstruct_gptq_kernel<MatrixView_q2_row, 2>;
+  } else if (bit == 8) {
+    kernel = reconstruct_gptq_kernel<MatrixView_q8_row, 8>;
+  } else if (bit == 3) {
+    kernel = reconstruct_gptq_3bit_kernel;
+    gridDim.y = DIVIDE(height, 32);
+  }
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  kernel<<<gridDim, blockDim, 0, stream>>>(
+      b_q_weight, b_gptq_scales, b_gptq_qzeros, b_g_idx, height, width, groups, out);
+}
+
+void gemm_half_q_half_cuda(
+    cublasHandle_t cublas_handle,
+    const half* a,
+    const uint32_t* b_q_weight,
+    const uint32_t* b_gptq_qzeros,
+    const half* b_gptq_scales,
+    const int* b_g_idx,
+    half* c,
+    half* temp_dq,
+    int size_m,
+    int size_n,
+    int size_k,
+    int groups,
+    bool use_shuffle,
+    int bit) {
+  bool use_reconstruct;
+  if (use_shuffle) {
+    use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) || (bit != 8 && size_m > MAX_Q_GEMM_ROWS));
+  } else {
+    // The 2/3-bit kernels are somehow slower than dequant + gemm baseline, so
+    // we disabled them for now.
+    use_reconstruct = (bit < 4 || size_m > MAX_ALT_GEMM_ROWS);
+  }
+  if (use_reconstruct) {
+    // Reconstruct FP16 matrix, then cuBLAS
+    if (use_shuffle) {
+      reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, temp_dq, size_k, size_n, groups, bit);
+    } else {
+      reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, temp_dq, size_k, size_n, groups, bit);
+    }
+
+    const half alpha = __float2half(1.0f);
+    const half beta = __float2half(0.0f);
+    cublasHgemm(
+        cublas_handle,
+        CUBLAS_OP_N,
+        CUBLAS_OP_N,
+        size_n,
+        size_m,
+        size_k,
+        &alpha,
+        temp_dq,
+        size_n,
+        a,
+        size_k,
+        &beta,
+        c,
+        size_n);
+  } else if (use_shuffle) {
+    // Quantized matmul
+    int max_chunks = size_m / BLOCK_M_SIZE_MAX;
+    int last_chunk = max_chunks * BLOCK_M_SIZE_MAX;
+    int last_chunk_size = size_m - last_chunk;
+
+    if (max_chunks) {
+      gemm_half_q_half_cuda_part(
+          a,
+          b_q_weight,
+          b_gptq_qzeros,
+          b_gptq_scales,
+          b_g_idx,
+          c,
+          last_chunk,
+          size_n,
+          size_k,
+          BLOCK_M_SIZE_MAX,
+          groups,
+          bit);
+    }
+
+    if (last_chunk_size) {
+      gemm_half_q_half_cuda_part(
+          a + last_chunk * size_k,
+          b_q_weight,
+          b_gptq_qzeros,
+          b_gptq_scales,
+          b_g_idx,
+          c + last_chunk * size_n,
+          last_chunk_size,
+          size_n,
+          size_k,
+          last_chunk_size,
+          groups,
+          bit);
+    }
+  } else {
+    gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, c, size_m, size_n, size_k, bit);
+  }
+}
+
+__global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) {
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
+  if (n >= size_n) return;
+  int k = 0;
+  uint32_t* b_ptr = b_q_weight + n;
+  while (k < size_k) {
+    shuffle_4bit_8(b_ptr, size_n);
+    b_ptr += 1 * size_n;
+    k += 8;
+  }
+}
+
+__global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) {
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
+  if (n >= size_n) return;
+  int k = 0;
+  uint32_t* b_ptr = b_q_weight + n;
+  while (k < size_k) {
+    shuffle_8bit_4(b_ptr, size_n);
+    b_ptr += 1 * size_n;
+    k += 4;
+  }
+}
+
+__global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) {
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
+  if (n >= size_n) return;
+  int k = 0;
+  uint32_t* b_ptr = b_q_weight + n;
+  while (k < size_k) {
+    shuffle_2bit_16(b_ptr, size_n);
+    b_ptr += 1 * size_n;
+    k += 16;
+  }
+}
+
+__global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) {
+  auto n = blockIdx.x * THREADS_X + threadIdx.x;
+  if (n >= size_n) return;
+  int k = 0;
+  uint32_t* b_ptr = b_q_weight + n;
+  while (k < size_k) {
+    shuffle_3bit_32(b_ptr, size_n);
+    b_ptr += 3 * size_n;
+    k += 32;
+  }
+}
+
+__global__ void make_sequential_4bit_kernel(
+    const uint32_t* __restrict__ w, uint32_t* __restrict__ w_new, const int* __restrict__ q_perm, const int w_width) {
+  const uint64_t* w2 = (uint64_t*)w;
+  uint64_t* w_new2 = (uint64_t*)w_new;
+  int w2_stride = w_width >> 1;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  if (w2_column >= w2_stride) return;
+  auto w_new2_row = blockIdx.y;
+  int q_perm_idx = w_new2_row << 3;
+  uint64_t dst = 0;
+
+#pragma unroll
+  for (int i = 0; i < 8; i++) {
+    int source_row = q_perm[q_perm_idx++];
+
+    int w2_row = source_row >> 3;
+    int w2_subrow = source_row & 0x07;
+    int w2_row_shift = w2_subrow << 2;
+    int wnew2_row_shift = i << 2;
+
+    uint64_t src = w2[w2_row * w2_stride + w2_column];
+    src >>= w2_row_shift;
+    src &= 0x0000000f0000000f;
+    src <<= wnew2_row_shift;
+    dst |= src;
+  }
+  w_new2[w_new2_row * w2_stride + w2_column] = dst;
+}
+
+__global__ void make_sequential_2bit_kernel(
+    const uint32_t* __restrict__ w, uint32_t* __restrict__ w_new, const int* __restrict__ q_perm, const int w_width) {
+  const uint64_t* w2 = (uint64_t*)w;
+  uint64_t* w_new2 = (uint64_t*)w_new;
+  int w2_stride = w_width >> 1;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  if (w2_column >= w2_stride) return;
+  auto w_new2_row = blockIdx.y;
+  int q_perm_idx = w_new2_row << 4;
+  uint64_t dst = 0;
+
+#pragma unroll
+  for (int i = 0; i < 16; i++) {
+    int source_row = q_perm[q_perm_idx++];
+
+    int w2_row = source_row >> 4;
+    int w2_subrow = source_row & 0x0f;
+    int w2_row_shift = w2_subrow << 1;
+    int wnew2_row_shift = i << 1;
+
+    uint64_t src = w2[w2_row * w2_stride + w2_column];
+    src >>= w2_row_shift;
+    src &= 0x0000000300000003;
+    src <<= wnew2_row_shift;
+    dst |= src;
+  }
+  w_new2[w_new2_row * w2_stride + w2_column] = dst;
+}
+
+__global__ void make_sequential_3bit_kernel(
+    const uint32_t* __restrict__ w, uint32_t* __restrict__ w_new, const int* __restrict__ q_perm, const int w_width) {
+  auto w_column = THREADS_X * blockIdx.x + threadIdx.x;
+  if (w_column >= w_width) return;
+  auto w_new_row = blockIdx.y * 3;
+  auto q_perm_idx = blockIdx.y << 5;
+  uint32_t dst[3] = {0, 0, 0};
+
+#pragma unroll
+  for (int i = 0; i < 32; i++) {
+    int source_row = q_perm[q_perm_idx++];
+    int z_w = (source_row / 32) * 3;
+    int z_mod = source_row % 32;
+    int z_bit;
+
+    if (z_mod != 10) {
+      if (z_mod != 21) {
+        z_bit = z_mod;
+        if (z_bit > 21) {
+          z_bit *= 3;
+          z_bit -= 64;
+          z_w += 2;
+        } else if (z_bit > 10) {
+          z_bit *= 3;
+          z_bit -= 32;
+          z_w += 1;
+        } else {
+          z_bit *= 3;
+        }
+      } else {
+        z_w += 1;
+      }
+    }
+
+    uint64_t src;
+    if (z_mod == 10) {
+      src = (w[z_w * w_width + w_column] >> 30) | ((w[(z_w + 1) * w_width + w_column] << 2) & 0x4);
+    } else if (z_mod == 21) {
+      src = (w[z_w * w_width + w_column] >> 31) | ((w[(z_w + 1) * w_width + w_column] << 1) & 0x6);
+    } else {
+      src = w[z_w * w_width + w_column];
+      src >>= z_bit;
+      src &= 0x07;
+    }
+
+    z_w = 0;
+    if (i != 10) {
+      if (i != 21) {
+        z_bit = i;
+        if (z_bit > 21) {
+          z_bit *= 3;
+          z_bit -= 64;
+          z_w += 2;
+        } else if (z_bit > 10) {
+          z_bit *= 3;
+          z_bit -= 32;
+          z_w += 1;
+        } else {
+          z_bit *= 3;
+        }
+      } else {
+        z_w += 1;
+      }
+    }
+    if (i == 10) {
+      dst[z_w] |= (src & 0x03) << 30;
+      dst[z_w + 1] |= ((src & 0x4) >> 2);
+    } else if (i == 21) {
+      dst[z_w] |= (src & 0x01) << 31;
+      dst[z_w + 1] |= ((src & 0x6) >> 1);
+    } else {
+      dst[z_w] |= (src << z_bit);
+    }
+  }
+  w_new[w_new_row * w_width + w_column] = dst[0];
+  w_new[(w_new_row + 1) * w_width + w_column] = dst[1];
+  w_new[(w_new_row + 2) * w_width + w_column] = dst[2];
+}
+
+__global__ void make_sequential_8bit_kernel(
+    const uint32_t* __restrict__ w, uint32_t* __restrict__ w_new, const int* __restrict__ q_perm, const int w_width) {
+  const uint64_t* w2 = (uint64_t*)w;
+  uint64_t* w_new2 = (uint64_t*)w_new;
+  int w2_stride = w_width >> 1;
+  auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
+  if (w2_column >= w2_stride) return;
+  auto w_new2_row = blockIdx.y;
+  int q_perm_idx = w_new2_row << 2;
+  uint64_t dst = 0;
+
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    int source_row = q_perm[q_perm_idx++];
+
+    int w2_row = source_row >> 2;
+    int w2_subrow = source_row & 0x03;
+    int w2_row_shift = w2_subrow << 3;
+    int wnew2_row_shift = i << 3;
+
+    uint64_t src = w2[w2_row * w2_stride + w2_column];
+    src >>= w2_row_shift;
+    src &= 0x000000ff000000ff;
+    src <<= wnew2_row_shift;
+    dst |= src;
+  }
+  w_new2[w_new2_row * w2_stride + w2_column] = dst;
+}
+
+void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height, int width, int bit) {
+  if (q_perm) {
+    uint32_t* new_qweight = NULL;
+    cudaMalloc(&new_qweight, height / 32 * bit * width * sizeof(uint32_t));
+
+    dim3 blockDim, gridDim;
+    blockDim.x = THREADS_X;
+    blockDim.y = 1;
+    gridDim.x = DIVIDE(width, THREADS_X);
+    gridDim.y = height / 32 * bit;
+
+    auto kernel = make_sequential_4bit_kernel;
+    if (bit == 2) {
+      kernel = make_sequential_2bit_kernel;
+    } else if (bit == 3) {
+      kernel = make_sequential_3bit_kernel;
+      gridDim.y = height / 32;
+    } else if (bit == 8) {
+      kernel = make_sequential_8bit_kernel;
+    }
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    kernel<<<gridDim, blockDim, 0, stream>>>(q_weight, new_qweight, q_perm, width);
+    // Replace qweights
+    cudaMemcpyAsync(q_weight, new_qweight, height / 32 * bit * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
+    // Cleanup
+    cudaDeviceSynchronize();
+    cudaFree(new_qweight);
+  }
+  dim3 blockDim, gridDim;
+  blockDim.x = THREADS_X;
+  blockDim.y = 1;
+  gridDim.x = DIVIDE(width, THREADS_X);
+  gridDim.y = 1;
+  auto shuffle_kernel = shuffle_4bit_kernel;
+  if (bit == 2) {
+    shuffle_kernel = shuffle_2bit_kernel;
+  } else if (bit == 3) {
+    shuffle_kernel = shuffle_3bit_kernel;
+  } else if (bit == 8) {
+    shuffle_kernel = shuffle_8bit_kernel;
+  }
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(q_weight, height, width);
+}
+
+}  // namespace gptq
+}  // namespace sglang
+
+torch::Tensor gptq_gemm(
+    torch::Tensor a,
+    torch::Tensor b_q_weight,
+    torch::Tensor b_gptq_qzeros,
+    torch::Tensor b_gptq_scales,
+    torch::Tensor b_g_idx,
+    bool use_shuffle,
+    int64_t bit) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
+  at::Tensor temp_dq = torch::empty({b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options);
+
+  sglang::gptq::gemm_half_q_half_cuda(
+      at::cuda::getCurrentCUDABlasHandle(),
+      (const half*)a.data_ptr(),
+      (const uint32_t*)b_q_weight.data_ptr(),
+      (const uint32_t*)b_gptq_qzeros.data_ptr(),
+      (const half*)b_gptq_scales.data_ptr(),
+      b_g_idx.device().is_meta() ? NULL : (const int*)b_g_idx.data_ptr(),
+      (half*)c.data_ptr(),
+      (half*)temp_dq.data_ptr(),
+      c.size(0),              // m
+      c.size(1),              // n
+      a.size(1),              // k
+      b_gptq_qzeros.size(0),  // group number
+      use_shuffle,
+      bit);
+  return c;
+}
+
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
+  sglang::gptq::shuffle_exllama_weight(
+      (uint32_t*)q_weight.data_ptr(),
+      q_perm.device().is_meta() || q_perm.numel() == 0 ? NULL : (int*)q_perm.data_ptr(),
+      q_weight.size(0) * 32 / bit,
+      q_weight.size(1),
+      bit);
+}
diff --git a/sgl-kernel/csrc/gemm/gptq/matrix_view.cuh b/sgl-kernel/csrc/gemm/gptq/matrix_view.cuh
new file mode 100644
index 000000000..3dfc8794c
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/matrix_view.cuh
@@ -0,0 +1,269 @@
+/*
+Adapted from https://github.com/turboderp/exllamav2 and
+https://github.com/turboderp/exllama
+*/
+
+#ifndef _matrix_view_cuh
+#define _matrix_view_cuh
+
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include "qdq_util.cuh"
+
+namespace sglang {
+namespace gptq {
+
+class MatrixView_half {
+ public:
+  const half* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ half item(int row, int column) const {
+    return data[row * width + column];
+  }
+  __device__ __forceinline__ half2 item_half2(int row, int column) const {
+    return ((half2*)data)[(row * width + column) / 2];
+  }
+  __device__ __forceinline__ half2 item_half2half2(int row, int column) const {
+    return __half2half2(data[row * width + column]);
+  }
+  __device__ __forceinline__ const half* item_ptr(int row, int column) const {
+    return &data[row * width + column];
+  }
+
+  __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const {
+    half2* ptr = (half2*)item_ptr(row, column);
+    half2 i01 = ptr[0];
+    half2 i23 = ptr[1];
+    items[0] = __low2half(i01);
+    items[1] = __high2half(i01);
+    items[2] = __low2half(i23);
+    items[3] = __high2half(i23);
+  }
+  __device__ __forceinline__ void item4_f(float (&items)[4], int row, int column) const {
+    half2* ptr = (half2*)item_ptr(row, column);
+    half2 i01 = ptr[0];
+    half2 i23 = ptr[1];
+    items[0] = __half2float(__low2half(i01));
+    items[1] = __half2float(__high2half(i01));
+    items[2] = __half2float(__low2half(i23));
+    items[3] = __half2float(__high2half(i23));
+  }
+
+  __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const {
+    half2* ptr = (half2*)item_ptr(row, column);
+    half2 i01 = ptr[0];
+    half2 i23 = ptr[1];
+    items[0] = __half2half2(__low2half(i01));
+    items[1] = __half2half2(__high2half(i01));
+    items[2] = __half2half2(__low2half(i23));
+    items[3] = __half2half2(__high2half(i23));
+  }
+};
+
+class MatrixView_half_rw {
+ public:
+  half* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ half item(int row, int column) const {
+    return data[row * width + column];
+  }
+  __device__ __forceinline__ half2 item_half2(int row, int column) const {
+    return ((half2*)data)[(row * width + column) / 2];
+  }
+  __device__ __forceinline__ half* item_ptr(int row, int column) {
+    return &data[row * width + column];
+  }
+  __device__ __forceinline__ void set(int row, int column, half value) {
+    data[row * width + column] = value;
+  }
+  __device__ __forceinline__ void set_half2(int row, int column, half2 value) {
+    ((half2*)data)[(row * width + column) / 2] = value;
+  }
+
+  __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3) {
+    half2 v01 = __halves2half2(v0, v1);
+    half2 v23 = __halves2half2(v2, v3);
+    half2* ptr = (half2*)item_ptr(row, column);
+    ptr[0] = v01;
+    ptr[1] = v23;
+  }
+};
+
+class MatrixView_q4_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (column & 0x07) * 4;
+    return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
+  }
+
+  __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const {
+    int shift = (column & 0x07) * 4;
+    uint32_t d = data[row * width / 8 + column / 8] >> shift;
+    items[0] = d & 0x0f;
+    items[1] = (d >> 4) & 0x0f;
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const {
+    int shift = (column & 0x07) * 4;
+    uint32_t d = data[row * width / 8 + column / 8] >> shift;
+    items[0] = d & 0x0f;
+    items[1] = (d >> 4) & 0x0f;
+    items[2] = (d >> 8) & 0x0f;
+    items[3] = (d >> 12) & 0x0f;
+  }
+};
+
+class MatrixView_q4_column {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (row & 0x07) * 4;
+    return (data[row / 8 * width + column] >> shift) & 0x0f;
+  }
+
+  __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) {
+    return data[row / 8 * width + column];
+  }
+  __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) {
+    return &data[row / 8 * width + column];
+  }
+};
+
+class MatrixView_q2_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q2_row(const uint32_t* data, const int height, const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (column & 0x0f) * 2;
+    return (data[row * width / 16 + column / 16] >> shift) & 0x03;
+  }
+
+  __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const {
+    int shift = (column & 0x0f) * 2;
+    uint32_t d = data[row * width / 16 + column / 16] >> shift;
+    items[0] = d & 0x03;
+    items[1] = (d >> 2) & 0x03;
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const {
+    int shift = (column & 0x0f) * 2;
+    uint32_t d = data[row * width / 16 + column / 16] >> shift;
+    items[0] = d & 0x03;
+    items[1] = (d >> 2) & 0x03;
+    items[2] = (d >> 4) & 0x03;
+    items[3] = (d >> 6) & 0x03;
+  }
+};
+
+class MatrixView_q3_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q3_row(const uint32_t* data, const int height, const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int z_w = column * 3 / 32;
+    int z_mod = column & 0x1f;
+
+    if (z_mod == 10) {
+      return (data[row * width * 3 / 32 + z_w] >> 30) | ((data[row * width * 3 / 32 + (z_w + 1)] << 2) & 0x4);
+    } else if (z_mod == 21) {
+      return (data[row * width * 3 / 32 + z_w] >> 31) | ((data[row * width * 3 / 32 + (z_w + 1)] << 1) & 0x6);
+    } else if (z_mod < 10) {
+      return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3)) & 0x07;
+    } else if (z_mod < 21) {
+      return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 32)) & 0x07;
+    } else {
+      return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 64)) & 0x07;
+    }
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const {
+    int shift = (column & 0x1f);
+    uint32_t d;
+    if (shift <= 4) {
+      d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3);
+    } else if (shift == 8) {
+      d = (data[row * width / 32 * 3 + column * 3 / 32] >> 24) |
+          ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0x0f) << 8);
+    } else if (shift <= 16) {
+      d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 32);
+    } else if (shift == 20) {
+      d = (data[row * width / 32 * 3 + column * 3 / 32] >> 28) |
+          ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0xff) << 4);
+    } else {
+      d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 64);
+    }
+    items[0] = d & 0x07;
+    items[1] = (d >> 3) & 0x07;
+    items[2] = (d >> 6) & 0x07;
+    items[3] = (d >> 9) & 0x07;
+  }
+};
+
+class MatrixView_q8_row {
+ public:
+  const uint32_t* data;
+  const int height;
+  const int width;
+
+  __device__ __forceinline__ MatrixView_q8_row(const uint32_t* data, const int height, const int width)
+      : data(data), height(height), width(width) {}
+
+  __device__ __forceinline__ int item(int row, int column) const {
+    int shift = (column & 0x03) * 8;
+    return (data[row * width / 4 + column / 4] >> shift) & 0xff;
+  }
+
+  __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const {
+    int shift = (column & 0x03) * 8;
+    uint32_t d = data[row * width / 4 + column / 4] >> shift;
+    items[0] = d & 0xff;
+    items[1] = (d >> 8) & 0xff;
+  }
+
+  __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const {
+    int shift = (column & 0x03) * 2;
+    uint32_t d = data[row * width / 4 + column / 4] >> shift;
+    items[0] = d & 0xff;
+    items[1] = (d >> 8) & 0xff;
+    items[2] = (d >> 16) & 0xff;
+    items[3] = (d >> 24) & 0xff;
+  }
+};
+
+}  // namespace gptq
+}  // namespace sglang
+#endif
diff --git a/sgl-kernel/csrc/gemm/gptq/qdq_2.cuh b/sgl-kernel/csrc/gemm/gptq/qdq_2.cuh
new file mode 100644
index 000000000..4a75d7b56
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/qdq_2.cuh
@@ -0,0 +1,74 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_2_cuh
+#define _qdq_2_cuh
+
+#include "qdq_util.cuh"
+
+namespace sglang {
+namespace gptq {
+
+// Permutation:
+//
+// ffddbb99 77553311  eeccaa88 66442200
+
+__forceinline__ __device__ void shuffle_2bit_16(uint32_t* q, int stride) {
+  uint32_t qa = q[0];
+  uint32_t qb = 0;
+
+#pragma unroll
+  for (int i = 0; i < 8; i++) {
+    uint32_t qa0 = qa & 0x03;
+    uint32_t qa1 = (qa & 0x0c) >> 2;
+    qa >>= 4;
+    qb |= (qa1 << (i * 2 + 16));
+    qb |= (qa0 << (i * 2));
+  }
+  q[0] = qb;
+}
+
+__forceinline__ __device__ void dequant_2bit_16(const uint32_t q_0, half2 (&dq)[8], int stride, const uint32_t zero) {
+  const uint32_t c0 = 0x64006400;
+  const half y4_ = __float2half_rn(1.0f / 4.0f);
+  const half y16_ = __float2half_rn(1.0f / 16.0f);
+  const half y64_ = __float2half_rn(1.0f / 64.0f);
+  const half2 y4 = __halves2half2(y4_, y4_);
+  const half2 y16 = __halves2half2(y16_, y16_);
+  const half2 y64 = __halves2half2(y64_, y64_);
+
+  const half_uint16 z1_(0xe400 | zero);  // half(-1024.0f - zero);
+  const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero));
+  const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+  const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero));
+  const half2 z1 = __half2half2(z1_.as_half);
+  const half2 z4 = __half2half2(z4_);
+  const half2 z16 = __half2half2(z16_);
+  const half2 z64 = __half2half2(z64_);
+
+  uint32_t qa = q_0;
+  half2_uint32 q0((qa & 0x00030003) | c0);  // half2(q[ 0], q[ 1])      + 1024
+  half2_uint32 q1((qa & 0x000c000c) | c0);  // half2(q[ 2], q[ 3]) *  4 + 1024
+  half2_uint32 q2((qa & 0x00300030) | c0);  // half2(q[ 4], q[ 5]) * 16 + 1024
+  half2_uint32 q3((qa & 0x00c000c0) | c0);  // half2(q[ 6], q[ 7]) * 64 + 1024
+  qa >>= 8;
+  half2_uint32 q4((qa & 0x00030003) | c0);  // half2(q[ 8], q[ 8])      + 1024
+  half2_uint32 q5((qa & 0x000c000c) | c0);  // half2(q[10], q[11]) *  4 + 1024
+  half2_uint32 q6((qa & 0x00300030) | c0);  // half2(q[12], q[13]) * 16 + 1024
+  half2_uint32 q7((qa & 0x00c000c0) | c0);  // half2(q[14], q[15]) * 64 + 1024
+
+  dq[0] = __hadd2(q0.as_half2, z1);
+  dq[1] = __hfma2(q1.as_half2, y4, z4);
+  dq[2] = __hfma2(q2.as_half2, y16, z16);
+  dq[3] = __hfma2(q3.as_half2, y64, z64);
+  dq[4] = __hadd2(q4.as_half2, z1);
+  dq[5] = __hfma2(q5.as_half2, y4, z4);
+  dq[6] = __hfma2(q6.as_half2, y16, z16);
+  dq[7] = __hfma2(q7.as_half2, y64, z64);
+}
+
+}  // namespace gptq
+}  // namespace sglang
+
+#endif
diff --git a/sgl-kernel/csrc/gemm/gptq/qdq_3.cuh b/sgl-kernel/csrc/gemm/gptq/qdq_3.cuh
new file mode 100644
index 000000000..5996f342d
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/qdq_3.cuh
@@ -0,0 +1,146 @@
+#ifndef _qdq_3_cuh
+#define _qdq_3_cuh
+
+#include "qdq_util.cuh"
+
+namespace sglang {
+namespace gptq {
+// Permutation:
+//
+// v9997775 55333111  u8886664 44222000  (u, v lsb)
+// vjjjhhhf ffdddbbb  uiiiggge eecccaaa
+// vtttrrrp ppnnnlll  usssqqqo oommmkkk
+
+__forceinline__ __device__ void shuffle_3bit_32(uint32_t* q, int stride) {
+  uint32_t qa = q[0 * stride];
+  uint32_t qb = q[1 * stride];
+  uint32_t qc = q[2 * stride];
+
+  // qa: aa999888 77766655  54443332 22111000
+  // qb: lkkkjjji iihhhggg  fffeeedd dcccbbba
+  // qc: vvvuuutt tsssrrrq  qqpppooo nnnmmmll
+
+  uint32_t qd = qc >> 26;
+  qc <<= 4;
+  qc |= qb >> 28;
+  qb <<= 2;
+  qb |= qa >> 30;
+
+  // qa: ..999888 77766655  54443332 22111000
+  // qb: ..jjjiii hhhgggff  feeedddc ccbbbaaa
+  // qc: ..tttsss rrrqqqpp  pooonnnm mmlllkkk
+  // qd:                               vvvuuu
+
+  uint32_t za = 0;
+  uint32_t zb = 0;
+  uint32_t zc = 0;
+
+  for (int i = 0; i < 5; i++) {
+    uint32_t t0 = qa & 0x07;
+    uint32_t t1 = (qa & 0x38) >> 3;
+    qa >>= 6;
+    za |= (t0 << (i * 3));
+    za |= (t1 << (i * 3 + 16));
+  }
+  for (int i = 0; i < 5; i++) {
+    uint32_t t0 = qb & 0x07;
+    uint32_t t1 = (qb & 0x38) >> 3;
+    qb >>= 6;
+    zb |= (t0 << (i * 3));
+    zb |= (t1 << (i * 3 + 16));
+  }
+  for (int i = 0; i < 5; i++) {
+    uint32_t t0 = qc & 0x07;
+    uint32_t t1 = (qc & 0x38) >> 3;
+    qc >>= 6;
+    zc |= (t0 << (i * 3));
+    zc |= (t1 << (i * 3 + 16));
+  }
+
+  // za:  9997775 55333111   8886664 44222000
+  // zb:  jjjhhhf ffdddbbb   iiiggge eecccaaa
+  // zc:  tttrrrp ppnnnlll   sssqqqo oommmkkk
+  // qd:                               vvvuuu
+
+  za |= ((qd & 0x01) >> 0) << 15;
+  zb |= ((qd & 0x02) >> 1) << 15;
+  zc |= ((qd & 0x04) >> 2) << 15;
+  za |= ((qd & 0x08) >> 3) << 31;
+  zb |= ((qd & 0x10) >> 4) << 31;
+  zc |= ((qd & 0x20) >> 5) << 31;
+
+  // za: v9997775 55333111  u8886664 44222000  (u, v lsb)
+  // zb: vjjjhhhf ffdddbbb  uiiiggge eecccaaa
+  // zc: vtttrrrp ppnnnlll  usssqqqo oommmkkk
+
+  q[0 * stride] = za;
+  q[1 * stride] = zb;
+  q[2 * stride] = zc;
+}
+
+__forceinline__ __device__ void dequant_3bit_32(
+    const uint32_t q_0, const uint32_t q_1, const uint32_t q_2, half2 (&dq)[16], int stride, const uint32_t zero) {
+  const uint32_t c0 = 0x64006400;
+  const half y8_ = __float2half_rn(1.0f / 8.0f);
+  const half y64_ = __float2half_rn(1.0f / 64.0f);
+  const half2 y8 = __halves2half2(y8_, y8_);
+  const half2 y64 = __halves2half2(y64_, y64_);
+  const half_uint16 z1_(0xe400 | zero);  // half(-1024.0f - zero);
+  const half z8_ = __hsub(__int2half_rn(-128), __int2half_rn(zero));
+  const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero));
+  const half2 z1 = __halves2half2(z1_.as_half, z1_.as_half);
+  const half2 z8 = __halves2half2(z8_, z8_);
+  const half2 z64 = __halves2half2(z64_, z64_);
+
+  uint32_t qa = q_0;
+  uint32_t qb = q_1;
+  uint32_t qc = q_2;
+
+  half2_uint32 q0((qa & 0x00070007) | c0);  // half2(q[ 0], q[ 1])      + 1024
+  half2_uint32 q1((qa & 0x00380038) | c0);  // half2(q[ 2], q[ 3]) *  8 + 1024
+  qa >>= 6;
+  half2_uint32 q2((qa & 0x00070007) | c0);  // half2(q[ 4], q[ 5])      + 1024
+  half2_uint32 q3((qa & 0x00380038) | c0);  // half2(q[ 6], q[ 7]) *  8 + 1024
+  half2_uint32 q4((qa & 0x01c001c0) | c0);  // half2(q[ 8], q[ 9]) * 64 + 1024
+  qa >>= 9;
+  qa &= 0x00010001;
+  half2_uint32 q5((qb & 0x00070007) | c0);  // half2(q[10], q[11])      + 1024
+  half2_uint32 q6((qb & 0x00380038) | c0);  // half2(q[12], q[13]) *  8 + 1024
+  qb >>= 6;
+  half2_uint32 q7((qb & 0x00070007) | c0);  // half2(q[14], q[15])      + 1024
+  half2_uint32 q8((qb & 0x00380038) | c0);  // half2(q[16], q[17]) *  8 + 1024
+  half2_uint32 q9((qb & 0x01c001c0) | c0);  // half2(q[18], q[19]) * 64 + 1024
+  qb >>= 8;
+  qb &= 0x00020002;
+  half2_uint32 q10((qc & 0x00070007) | c0);  // half2(q[20], q[21])      + 1024
+  half2_uint32 q11((qc & 0x00380038) | c0);  // half2(q[22], q[23]) *  8 + 1024
+  qc >>= 6;
+  half2_uint32 q12((qc & 0x00070007) | c0);  // half2(q[24], q[25])      + 1024
+  half2_uint32 q13((qc & 0x00380038) | c0);  // half2(q[26], q[27]) *  8 + 1024
+  half2_uint32 q14((qc & 0x01c001c0) | c0);  // half2(q[28], q[29]) * 64 + 1024
+  qc >>= 7;
+  qc &= 0x00040004;
+  half2_uint32 q15((qa | qb | qc) | c0);
+
+  dq[0] = __hadd2(q0.as_half2, z1);
+  dq[1] = __hfma2(q1.as_half2, y8, z8);
+  dq[2] = __hadd2(q2.as_half2, z1);
+  dq[3] = __hfma2(q3.as_half2, y8, z8);
+  dq[4] = __hfma2(q4.as_half2, y64, z64);
+  dq[5] = __hadd2(q5.as_half2, z1);
+  dq[6] = __hfma2(q6.as_half2, y8, z8);
+  dq[7] = __hadd2(q7.as_half2, z1);
+  dq[8] = __hfma2(q8.as_half2, y8, z8);
+  dq[9] = __hfma2(q9.as_half2, y64, z64);
+  dq[10] = __hadd2(q10.as_half2, z1);
+  dq[11] = __hfma2(q11.as_half2, y8, z8);
+  dq[12] = __hadd2(q12.as_half2, z1);
+  dq[13] = __hfma2(q13.as_half2, y8, z8);
+  dq[14] = __hfma2(q14.as_half2, y64, z64);
+  dq[15] = __hadd2(q15.as_half2, z1);
+}
+
+}  // namespace gptq
+}  // namespace sglang
+
+#endif
diff --git a/sgl-kernel/csrc/gemm/gptq/qdq_4.cuh b/sgl-kernel/csrc/gemm/gptq/qdq_4.cuh
new file mode 100644
index 000000000..c96af4718
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/qdq_4.cuh
@@ -0,0 +1,114 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_4_cuh
+#define _qdq_4_cuh
+
+#include "qdq_util.cuh"
+
+namespace sglang {
+namespace gptq {
+// Permutation:
+//
+// 77775555 33331111  66664444 22220000
+
+__forceinline__ __device__ void shuffle_4bit_8(uint32_t* q, int stride) {
+  uint32_t qa = q[0];
+  uint32_t qb = 0;
+
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    uint32_t qa0 = qa & 0x0f;
+    uint32_t qa1 = (qa & 0xf0) >> 4;
+    qa >>= 8;
+    qb |= (qa1 << (i * 4 + 16));
+    qb |= (qa0 << (i * 4));
+  }
+  q[0] = qb;
+}
+
+__forceinline__ __device__ void dequant_4bit_8(const uint32_t q_0, half2 (&dq)[4], int stride, const uint32_t zero) {
+  const uint32_t c0 = 0x64006400;
+  const half y16_ = __float2half_rn(1.0f / 16.0f);
+  const half2 y16 = __halves2half2(y16_, y16_);
+  const half_uint16 z1_(0xe400 | zero);  // half(-1024.0f - zero);
+  const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+  const half2 z1 = __half2half2(z1_.as_half);
+  const half2 z16 = __half2half2(z16_);
+
+  uint32_t qa = q_0;
+  half2_uint32 q0((qa & 0x000f000f) | c0);  // half2(q[ 0], q[ 1])      + 1024
+  half2_uint32 q1((qa & 0x00f000f0) | c0);  // half2(q[ 2], q[ 3]) * 16 + 1024
+  qa >>= 8;
+  half2_uint32 q2((qa & 0x000f000f) | c0);  // half2(q[ 4], q[ 5])      + 1024
+  half2_uint32 q3((qa & 0x00f000f0) | c0);  // half2(q[ 6], q[ 7]) * 16 + 1024
+
+  dq[0] = __hadd2(q0.as_half2, z1);
+  dq[1] = __hfma2(q1.as_half2, y16, z16);
+  dq[2] = __hadd2(q2.as_half2, z1);
+  dq[3] = __hfma2(q3.as_half2, y16, z16);
+}
+
+__forceinline__ __device__ void
+dequant_4bit_8_prep_zero_scale(const uint32_t zero, const half scale, half2 (&z1z16)[2], half2 (&y1y16)[2]) {
+  half_uint16 z1(0xe400 | zero);  // half(-1024.0f - zero);
+  half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+
+  half2 scale2 = __half2half2(scale);
+
+  z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half));
+  z1z16[1] = __hmul2(scale2, __half2half2(z16));
+
+  const half y1 = __float2half_rn(1.0f);
+  const half y16 = __float2half_rn(1.0f / 16.0f);
+
+  y1y16[0] = __hmul2(scale2, __half2half2(y1));
+  y1y16[1] = __hmul2(scale2, __half2half2(y16));
+}
+
+__forceinline__ __device__ void dequant_4bit_8_prep_zero(const uint32_t zero, half2 (&z1z16)[2], half2 (&y1y16)[2]) {
+  half_uint16 z1(0xe400 | zero);  // half(-1024.0f - zero);
+  half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+
+  z1z16[0] = __half2half2(z1.as_half);
+  z1z16[1] = __half2half2(z16);
+
+  const half y1 = __float2half_rn(1.0f);
+  const half y16 = __float2half_rn(1.0f / 16.0f);
+
+  y1y16[0] = __half2half2(y1);
+  y1y16[1] = __half2half2(y16);
+}
+
+__forceinline__ __device__ void
+dequant_4bit_8_gptq(const uint32_t q_0, half2 (&dq)[4], half2 (&z1z16)[2], half2 (&y1y16)[2], int stride, bool scaled) {
+  const uint32_t c0 = 0x64006400;
+
+  uint32_t qa = q_0;
+  half2_uint32 q0((qa & 0x000f000f) | c0);  // half2( q[0]      + 1024, q[1]      + 1024 )
+  half2_uint32 q1((qa & 0x00f000f0) | c0);  // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 )
+  qa >>= 8;
+  half2_uint32 q2((qa & 0x000f000f) | c0);  // half2( q[4]      + 1024, q[5]      + 1024 )
+  half2_uint32 q3((qa & 0x00f000f0) | c0);  // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 )
+
+  if (scaled) {
+    dq[0] = __hfma2(q0.as_half2, y1y16[0],
+                    z1z16[0]);  // half2( q[0] * s - z * s, q[1] * s - z * s)
+    dq[1] = __hfma2(q1.as_half2, y1y16[1],
+                    z1z16[1]);  // half2( q[2] * s - z * s, q[3] * s - z * s)
+    dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]);
+    dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);
+  } else {
+    dq[0] = __hadd2(q0.as_half2, z1z16[0]);  // half2( q[0] - z, q[1] - z )
+    dq[1] = __hfma2(q1.as_half2, y1y16[1],
+                    z1z16[1]);               // half2( q[2] - z, q[3] - z )
+    dq[2] = __hadd2(q2.as_half2, z1z16[0]);  // half2( q[4] - z, q[5] - z )
+    dq[3] = __hfma2(q3.as_half2, y1y16[1],
+                    z1z16[1]);  // half2( q[6] - z, q[7] - z )
+  }
+}
+}  // namespace gptq
+}  // namespace sglang
+
+#endif
diff --git a/sgl-kernel/csrc/gemm/gptq/qdq_8.cuh b/sgl-kernel/csrc/gemm/gptq/qdq_8.cuh
new file mode 100644
index 000000000..c6a49d6dc
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/qdq_8.cuh
@@ -0,0 +1,30 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_8_cuh
+#define _qdq_8_cuh
+
+#include "qdq_util.cuh"
+
+namespace sglang {
+namespace gptq {
+
+__forceinline__ __device__ void shuffle_8bit_4(uint32_t* q, int stride) {}
+
+__forceinline__ __device__ void
+dequant_8bit_8(const uint32_t q_0, const uint32_t q_1, half2 (&dq)[4], int stride, const uint32_t zero) {
+  half dqh[8];
+  for (int i = 0; i < 4; i++)
+    dqh[i] = dq_ns(exb(q_0, i * 8, 0xff), zero);
+  for (int i = 0; i < 4; i++)
+    dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero);
+
+  for (int i = 0; i < 4; i++)
+    dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
+}
+
+}  // namespace gptq
+}  // namespace sglang
+
+#endif
diff --git a/sgl-kernel/csrc/gemm/gptq/qdq_util.cuh b/sgl-kernel/csrc/gemm/gptq/qdq_util.cuh
new file mode 100644
index 000000000..0977269c3
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/gptq/qdq_util.cuh
@@ -0,0 +1,53 @@
+/*
+Copied from https://github.com/turboderp/exllamav2
+*/
+
+#ifndef _qdq_util_cuh
+#define _qdq_util_cuh
+
+namespace sglang {
+namespace gptq {
+
+union half2_uint32 {
+  uint32_t as_uint32;
+  half2 as_half2;
+  __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
+  __device__ half2_uint32(half2 val) : as_half2(val) {}
+};
+
+union half_uint16 {
+  uint16_t as_uint16;
+  half as_half;
+  __device__ half_uint16(uint16_t val) : as_uint16(val) {}
+  __device__ half_uint16(half val) : as_half(val) {}
+};
+
+// Max_scale premultiplied by 1/256
+
+__forceinline__ __device__ half dq_scale(const int qs, const half max_scale) {
+  int qs_i = qs + 1;
+  half qs_h = __int2half_rn(qs_i * qs_i);
+  qs_h = __hmul(qs_h, max_scale);
+  return qs_h;
+}
+
+__forceinline__ __device__ half dq(const int q, const int qzero, const half scale) {
+  return __hmul(__int2half_rn(q - qzero), scale);
+}
+
+__forceinline__ __device__ half dq_ns(const int q, const int qzero) {
+  // return __hsub(__int2half_rn(q), __int2half_rn(qzero));
+  return __int2half_rn(q - qzero);
+}
+
+__forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask) {
+  return (int)((q >> shift) & mask);
+}
+
+__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask) {
+  return (int)(__funnelshift_rc(q0, q1, shift) & mask);
+}
+
+}  // namespace gptq
+}  // namespace sglang
+#endif
diff --git a/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu b/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu
new file mode 100644
index 000000000..f18c81865
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/int8_gemm_kernel.cu
@@ -0,0 +1,747 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/epilogue/thread/linear_combination.h>
+#include <cutlass/epilogue/threadblock/epilogue_with_visitor.h>
+#include <cutlass/gemm/device/gemm.h>
+#include <cutlass/gemm/device/gemm_universal_adapter.h>
+#include <cutlass/numeric_types.h>
+
+#include <cute/atom/mma_atom.hpp>
+#include <cute/tensor.hpp>
+#include <cutlass/epilogue/collective/collective_builder.hpp>
+#include <cutlass/gemm/collective/collective_builder.hpp>
+#include <cutlass/gemm/kernel/gemm_universal.hpp>
+#include <cutlass/util/packed_stride.hpp>
+
+#include "cutlass_extensions/epilogue/epilogue_per_row_per_col_scale.h"
+#include "cutlass_extensions/gemm/gemm_universal_base_compat.h"
+#include "cutlass_extensions/gemm/gemm_with_epilogue_visitor.h"
+#include "utils.h"
+
+using namespace cute;
+
+template <
+    typename ElementOutput,
+    typename ArchTag,
+    typename ThreadblockShape,
+    typename WarpShape,
+    typename InstructionShape,
+    int NumStages>
+void cutlass_int8_scaled_mm(
+    torch::Tensor& out,
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+  using ElementInputA = int8_t;
+  using ElementInputB = int8_t;
+
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+  using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>;
+
+  using DefaultGemmConf = cutlass::gemm::device::
+      DefaultGemmConfiguration<OperatorClass, ArchTag, ElementInputA, ElementInputB, ElementOutput, ElementCompute>;
+  using EpilogueOutputOp = typename DefaultGemmConf::EpilogueOutputOp;
+
+  using GemmKernel_ = typename cutlass::gemm::kernel::DefaultGemm<
+      ElementInputA,
+      cutlass::layout::RowMajor,
+      DefaultGemmConf::kAlignmentA,
+      ElementInputB,
+      cutlass::layout::ColumnMajor,
+      DefaultGemmConf::kAlignmentB,
+      ElementOutput,
+      cutlass::layout::RowMajor,
+      ElementAccumulator,
+      OperatorClass,
+      ArchTag,
+      ThreadblockShape,
+      WarpShape,
+      InstructionShape,
+      EpilogueOutputOp,
+      ThreadblockSwizzle,
+      NumStages,
+      true,
+      typename DefaultGemmConf::Operator>::GemmKernel;
+
+  using AlphaColTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
+      cutlass::epilogue::threadblock::OutputTileOptimalThreadMap<
+          typename GemmKernel_::Epilogue::OutputTileIterator::ThreadMap::Shape,
+          typename GemmKernel_::Epilogue::OutputTileIterator::ThreadMap::Count,
+          GemmKernel_::Epilogue::OutputTileIterator::ThreadMap::kThreads,
+          GemmKernel_::Epilogue::OutputTileIterator::kElementsPerAccess,
+          cutlass::sizeof_bits<ElementOutput>::value>,
+      ElementCompute>;
+
+  using EpilogueVisitor = typename cutlass::epilogue::threadblock::EpilogueVisitorPerRowPerCol<
+      ThreadblockShape,
+      GemmKernel_::kThreadCount,
+      AlphaColTileIterator,
+      typename GemmKernel_::Epilogue::OutputTileIterator,
+      ElementAccumulator,
+      ElementCompute,
+      EpilogueOutputOp>;
+
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      EpilogueWithVisitorFromExistingEpilogue<EpilogueVisitor, typename GemmKernel_::Epilogue>::Epilogue;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmWithEpilogueVisitor<typename GemmKernel_::Mma, Epilogue, ThreadblockSwizzle>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalBaseCompat<GemmKernel>;
+
+  Gemm gemm_op;
+
+  int m = mat_a.size(0);
+  int k = mat_a.size(1);
+  int n = mat_b.size(1);
+
+  auto a_ptr = static_cast<ElementInputA*>(mat_a.data_ptr());
+  auto b_ptr = static_cast<ElementInputB*>(mat_b.data_ptr());
+  auto o_ptr = static_cast<ElementOutput*>(out.data_ptr());
+
+  auto a_s_ptr = static_cast<ElementCompute*>(scales_a.data_ptr());
+  auto b_s_ptr = static_cast<ElementCompute*>(scales_b.data_ptr());
+
+  int64_t lda = mat_a.stride(0);
+  int64_t ldb = mat_b.stride(1);
+  int64_t ldd = out.stride(0);
+
+  ElementOutput* bias_ptr = nullptr;
+  int64_t ldc = 0;
+  if (bias) {
+    bias_ptr = static_cast<ElementOutput*>(bias->data_ptr());
+  }
+
+  typename EpilogueOutputOp::Params linearScalingParams;
+  typename EpilogueVisitor::Arguments visitor_args{linearScalingParams};
+
+  typename Gemm::Arguments args{
+      {m, n, k}, {a_ptr, lda}, {b_ptr, ldb}, {b_s_ptr, 0}, {a_s_ptr, 0}, {bias_ptr, ldc}, {o_ptr, ldd}, visitor_args};
+
+  auto workspace = torch::empty(
+      gemm_op.get_workspace_size(args), torch::TensorOptions().dtype(torch::kUInt8).device(mat_a.device()));
+
+  auto stream = at::cuda::getCurrentCUDAStream(mat_a.get_device());
+
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(
+      can_implement == cutlass::Status::kSuccess,
+      "gemm cannot implement, error: ",
+      cutlassGetStatusString(can_implement));
+
+  auto status = gemm_op(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "gemm executioin failed, error: ", cutlassGetStatusString(status));
+}
+
+template <typename ElementOutput, typename ArchTag, typename InstructionShape>
+void sm75_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  int m = mat_a.size(0);
+  if (m <= 32) {
+    cutlass_int8_scaled_mm<
+        ElementOutput,
+        ArchTag,
+        cutlass::gemm::GemmShape<32, 128, 64>,
+        cutlass::gemm::GemmShape<32, 64, 64>,
+        InstructionShape,
+        2>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  } else if (m <= 64) {
+    cutlass_int8_scaled_mm<
+        ElementOutput,
+        ArchTag,
+        cutlass::gemm::GemmShape<64, 128, 128>,
+        cutlass::gemm::GemmShape<64, 64, 64>,
+        InstructionShape,
+        2>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  } else if (m <= 256) {
+    cutlass_int8_scaled_mm<
+        ElementOutput,
+        ArchTag,
+        cutlass::gemm::GemmShape<128, 128, 128>,
+        cutlass::gemm::GemmShape<64, 64, 64>,
+        InstructionShape,
+        2>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  } else {
+    cutlass_int8_scaled_mm<
+        ElementOutput,
+        ArchTag,
+        cutlass::gemm::GemmShape<128, 128, 64>,
+        cutlass::gemm::GemmShape<64, 64, 64>,
+        InstructionShape,
+        2>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  }
+}
+
+template <typename ElementOutput, typename ArchTag, typename InstructionShape>
+void sm80_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  int m = mat_a.size(0);
+  int n = mat_b.size(1);
+  if (m <= 16) {
+    if (n <= 4096) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<16, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          InstructionShape,
+          6>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<16, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 32) {
+    if (n <= 4096) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<32, 64, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          InstructionShape,
+          6>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<32, 64, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 64) {
+    if (n <= 4096) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<64, 64, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<64, 128, 128>,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 128 && n < 8192) {
+    cutlass_int8_scaled_mm<
+        ElementOutput,
+        ArchTag,
+        cutlass::gemm::GemmShape<64, 128, 128>,
+        cutlass::gemm::GemmShape<64, 64, 64>,
+        InstructionShape,
+        5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  } else {
+    cutlass_int8_scaled_mm<
+        ElementOutput,
+        ArchTag,
+        cutlass::gemm::GemmShape<128, 128, 64>,
+        cutlass::gemm::GemmShape<64, 64, 64>,
+        InstructionShape,
+        5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  }
+}
+
+// Dispatch shape for sm89 (L40S, L20, RTX 4090), according to:
+// https://github.com/vllm-project/vllm/blob/main/csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh
+template <typename ElementOutput, typename ArchTag, typename InstructionShape>
+void sm89_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  int m = mat_a.size(0);
+  int n = mat_b.size(1);
+  if (m <= 16) {
+    if (n <= 8192) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<16, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<16, 128, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          InstructionShape,
+          4>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 32) {
+    if (n <= 8192) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<32, 64, 128>,
+          cutlass::gemm::GemmShape<16, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<32, 128, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          InstructionShape,
+          4>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 64) {
+    if (n <= 8192) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<64, 64, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<64, 128, 128>,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          InstructionShape,
+          3>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 128) {
+    if (n <= 8192) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<64, 128, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          InstructionShape,
+          3>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else if (n <= 16384) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<128, 128, 64>,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<64, 64, 128>,
+          cutlass::gemm::GemmShape<32, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 256) {
+    if (n <= 4096) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<64, 128, 128>,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          InstructionShape,
+          3>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else if (n <= 8192) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<128, 128, 64>,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else if (n <= 16384) {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<256, 128, 64>,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          InstructionShape,
+          3>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      cutlass_int8_scaled_mm<
+          ElementOutput,
+          ArchTag,
+          cutlass::gemm::GemmShape<128, 128, 64>,
+          cutlass::gemm::GemmShape<64, 64, 64>,
+          InstructionShape,
+          5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else {
+    cutlass_int8_scaled_mm<
+        ElementOutput,
+        ArchTag,
+        cutlass::gemm::GemmShape<32, 64, 128>,
+        cutlass::gemm::GemmShape<16, 64, 64>,
+        InstructionShape,
+        5>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  }
+}
+
+template <
+    typename ElementOutput,
+    typename TileShape,
+    typename ClusterShape,
+    typename MainloopScheduleType,
+    bool WithBias>
+void cutlass_int8_scaled_mm_sm90(
+    torch::Tensor& out,
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  using ArchTag = cutlass::arch::Sm90;
+
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+  using ElementInputA = int8_t;
+  using ElementInputB = int8_t;
+
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementInputA>::value;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementInputB>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+  static constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits<ElementOutput>::value;
+
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+
+  using EpilogueScheduleType = cutlass::epilogue::TmaWarpSpecialized;
+  using TileSchedulerType = cutlass::gemm::PersistentScheduler;
+
+  using XScale = cutlass::epilogue::fusion::
+      Sm90ColBroadcast<0, TileShape, ElementCompute, ElementCompute, Stride<Int<1>, Int<0>, Int<0>>>;
+
+  using WScale = cutlass::epilogue::fusion::
+      Sm90RowBroadcast<0, TileShape, ElementCompute, ElementCompute, Stride<Int<0>, Int<1>, Int<0>>>;
+
+  using Bias = cutlass::epilogue::fusion::
+      Sm90RowBroadcast<0, TileShape, ElementOutput, ElementOutput, Stride<Int<0>, Int<1>, Int<0>>>;
+
+  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
+
+  // Scale
+  using Compute0 = cutlass::epilogue::fusion::
+      Sm90Compute<cutlass::multiplies, ElementCompute, ElementCompute, cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute0 = cutlass::epilogue::fusion::Sm90EVT<Compute0, WScale, Accum>;
+
+  using Compute1 = cutlass::epilogue::fusion::
+      Sm90Compute<cutlass::multiplies, ElementOutput, ElementCompute, cutlass::FloatRoundStyle::round_to_nearest>;
+
+  using EVTCompute1 = cutlass::epilogue::fusion::Sm90EVT<Compute1, XScale, EVTCompute0>;
+
+  // With bias
+  using ComputeWithBias = cutlass::epilogue::fusion::
+      Sm90Compute<cutlass::multiply_add, ElementOutput, ElementCompute, cutlass::FloatRoundStyle::round_to_nearest>;
+  using EVTComputeWithBias = cutlass::epilogue::fusion::Sm90EVT<ComputeWithBias, XScale, EVTCompute0, Bias>;
+
+  using EpilogueEVT = typename cutlass::platform::conditional<WithBias, EVTComputeWithBias, EVTCompute1>::type;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      TileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementCompute,
+      ElementOutput,
+      cutlass::layout::RowMajor,
+      AlignmentC,
+      ElementOutput,
+      cutlass::layout::RowMajor,
+      AlignmentOutput,
+      EpilogueScheduleType,
+      EpilogueEVT>::CollectiveOp;
+
+  using Stages = cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+      sizeof(typename CollectiveEpilogue::SharedStorage))>;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementInputA,
+      cutlass::layout::RowMajor,
+      AlignmentA,
+      ElementInputB,
+      cutlass::layout::ColumnMajor,
+      AlignmentB,
+      ElementAccumulator,
+      TileShape,
+      ClusterShape,
+      Stages,
+      MainloopScheduleType>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>,  // Indicates ProblemShape
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      TileSchedulerType>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  Gemm gemm_op;
+
+  int m = mat_a.size(0);
+  int k = mat_a.size(1);
+  int n = mat_b.size(1);
+
+  auto a_ptr = static_cast<ElementInputA*>(mat_a.data_ptr());
+  auto b_ptr = static_cast<ElementInputB*>(mat_b.data_ptr());
+  auto o_ptr = static_cast<ElementOutput*>(out.data_ptr());
+
+  auto a_s_ptr = static_cast<ElementCompute*>(scales_a.data_ptr());
+  auto b_s_ptr = static_cast<ElementCompute*>(scales_b.data_ptr());
+
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+
+  StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, make_shape(m, k, 1));
+  StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, make_shape(n, k, 1));
+  StrideC stride_c;
+  StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, make_shape(m, n, 1));
+
+  typename Gemm::Arguments args = {
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      {a_ptr, stride_a, b_ptr, stride_b},
+      {{},  // epilogue.thread
+       nullptr,
+       stride_c,
+       o_ptr,
+       stride_d}};
+
+  if constexpr (WithBias) {
+    ElementOutput* bias_ptr = static_cast<ElementOutput*>(bias->data_ptr());
+    args.epilogue.thread = {
+        {a_s_ptr},
+        {{b_s_ptr}, {}, {}},
+        {bias_ptr},
+        {},
+    };
+  } else {
+    args.epilogue.thread = {
+        {a_s_ptr},
+        {{b_s_ptr}, {}, {}},
+        {},
+    };
+  }
+
+  auto workspace = torch::empty(
+      gemm_op.get_workspace_size(args), torch::TensorOptions().dtype(torch::kUInt8).device(mat_a.device()));
+
+  auto stream = at::cuda::getCurrentCUDAStream(mat_a.get_device());
+
+  auto can_implement = gemm_op.can_implement(args);
+  TORCH_CHECK(
+      can_implement == cutlass::Status::kSuccess,
+      "gemm cannot implement, error: ",
+      cutlassGetStatusString(can_implement));
+
+  auto status = gemm_op(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "gemm executioin failed, error: ", cutlassGetStatusString(status));
+}
+
+template <typename ElementOutput, typename TileShape, typename ClusterShape, typename MainloopScheduleType>
+void sm90_dispatch_bias(
+    torch::Tensor& out,
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  if (bias) {
+    cutlass_int8_scaled_mm_sm90<ElementOutput, TileShape, ClusterShape, MainloopScheduleType, true>(
+        out, mat_a, mat_b, scales_a, scales_b, bias);
+  } else {
+    cutlass_int8_scaled_mm_sm90<ElementOutput, TileShape, ClusterShape, MainloopScheduleType, false>(
+        out, mat_a, mat_b, scales_a, scales_b, bias);
+  }
+}
+
+template <typename ElementOutput>
+void sm90_dispatch_shape(
+    torch::Tensor& out,
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const c10::optional<torch::Tensor>& bias) {
+  int m = mat_a.size(0);
+  int n = mat_b.size(1);
+  if (m <= 32) {
+    if (n < 8192) {
+      return sm90_dispatch_bias<
+          ElementOutput,
+          Shape<_64, _64, _128>,
+          Shape<_1, _8, _1>,
+          cutlass::gemm::KernelTmaWarpSpecialized>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      return sm90_dispatch_bias<
+          ElementOutput,
+          Shape<_64, _128, _128>,
+          Shape<_1, _8, _1>,
+          cutlass::gemm::KernelTmaWarpSpecialized>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 64) {
+    if (n < 8192) {
+      return sm90_dispatch_bias<
+          ElementOutput,
+          Shape<_64, _64, _128>,
+          Shape<_1, _4, _1>,
+          cutlass::gemm::KernelTmaWarpSpecialized>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      return sm90_dispatch_bias<
+          ElementOutput,
+          Shape<_64, _64, _256>,
+          Shape<_1, _1, _1>,
+          cutlass::gemm::KernelTmaWarpSpecialized>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else if (m <= 128) {
+    if (n <= 4096) {
+      return sm90_dispatch_bias<
+          ElementOutput,
+          Shape<_64, _64, _128>,
+          Shape<_2, _1, _1>,
+          cutlass::gemm::KernelTmaWarpSpecialized>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      return sm90_dispatch_bias<
+          ElementOutput,
+          Shape<_64, _128, _128>,
+          Shape<_2, _1, _1>,
+          cutlass::gemm::KernelTmaWarpSpecialized>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+  } else {
+    return sm90_dispatch_bias<
+        ElementOutput,
+        Shape<_128, _128, _128>,
+        Shape<_2, _1, _1>,
+        cutlass::gemm::KernelTmaWarpSpecializedPingpong>(out, mat_a, mat_b, scales_a, scales_b, bias);
+  }
+}
+
+torch::Tensor int8_scaled_mm(
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Dtype& out_dtype,
+    const c10::optional<torch::Tensor>& bias) {
+  TORCH_CHECK(mat_a.is_cuda(), "mat_a must be a CUDA tensor");
+  TORCH_CHECK(mat_b.is_cuda(), "mat_b must be a CUDA tensor");
+  TORCH_CHECK(mat_a.dim() == 2, "mat_a must be a 2D tensor");
+  TORCH_CHECK(mat_b.dim() == 2, "mat_b must be a 2D tensor");
+  TORCH_CHECK(mat_a.stride(1) == 1, "mat_a must be a row major tensor");
+  TORCH_CHECK(mat_b.stride(0) == 1, "mat_b must be a column major tensor");
+  TORCH_CHECK(mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied");
+  TORCH_CHECK(mat_a.size(1) % 16 == 0, "mat_a.size(1) must be multiple of 16 for memory alignment");
+  TORCH_CHECK(mat_b.size(0) % 16 == 0, "mat_b.size(0) must be multiple of 16 for memory alignment");
+  TORCH_CHECK(mat_b.size(1) % 8 == 0, "mat_b.size(1) must be multiple of 8 for memory alignment");  // out.stride(0)
+  TORCH_CHECK(mat_a.scalar_type() == torch::kInt8, "mat_a must be Int8");
+  TORCH_CHECK(mat_b.scalar_type() == torch::kInt8, "mat_b must be Int8");
+  TORCH_CHECK(out_dtype == torch::kHalf || out_dtype == torch::kBFloat16, "out_dtype must be Half or BFloat16");
+
+  TORCH_CHECK(scales_a.numel() == mat_a.size(0), "size of scales_a is not matched");
+  TORCH_CHECK(scales_b.numel() == mat_b.size(1), "size of scales_b is not matched");
+  TORCH_CHECK(scales_a.is_contiguous(), "scales_a must be contiguous");
+  TORCH_CHECK(scales_b.is_contiguous(), "scales_b msut be contiguous");
+  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be Float32");
+  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be Float32");
+
+  if (bias) {
+    TORCH_CHECK(bias->numel() == mat_b.size(1), "size of bias is not matched");
+    TORCH_CHECK(bias->is_contiguous(), "bias must be contiguous");
+    TORCH_CHECK(bias->dtype() == out_dtype, "bias dtype must match output dtype");
+  }
+
+  torch::Tensor out = torch::empty({mat_a.size(0), mat_b.size(1)}, mat_a.options().dtype(out_dtype));
+
+  auto sm_version = getSMVersion();
+
+  if (sm_version >= 75 && sm_version < 80) {
+    TORCH_CHECK(out_dtype == torch::kHalf, "out_dtype must be Half for SM75");
+    sm75_dispatch_shape<cutlass::half_t, cutlass::arch::Sm75, cutlass::gemm::GemmShape<8, 8, 16>>(
+        out, mat_a, mat_b, scales_a, scales_b, bias);
+  } else if (sm_version >= 80 && sm_version < 90) {
+    // sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)
+    if (sm_version == 86 || sm_version == 89) {
+      if (out_dtype == torch::kBFloat16) {
+        sm89_dispatch_shape<cutlass::bfloat16_t, cutlass::arch::Sm80, cutlass::gemm::GemmShape<16, 8, 32>>(
+            out, mat_a, mat_b, scales_a, scales_b, bias);
+      } else {
+        sm89_dispatch_shape<cutlass::half_t, cutlass::arch::Sm80, cutlass::gemm::GemmShape<16, 8, 32>>(
+            out, mat_a, mat_b, scales_a, scales_b, bias);
+      }
+    } else {
+      if (out_dtype == torch::kBFloat16) {
+        sm80_dispatch_shape<cutlass::bfloat16_t, cutlass::arch::Sm80, cutlass::gemm::GemmShape<16, 8, 32>>(
+            out, mat_a, mat_b, scales_a, scales_b, bias);
+      } else {
+        sm80_dispatch_shape<cutlass::half_t, cutlass::arch::Sm80, cutlass::gemm::GemmShape<16, 8, 32>>(
+            out, mat_a, mat_b, scales_a, scales_b, bias);
+      }
+    }
+  } else if (sm_version == 90) {
+#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+    // cutlass 3.x
+    if (out_dtype == torch::kBFloat16) {
+      sm90_dispatch_shape<cutlass::bfloat16_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      sm90_dispatch_shape<cutlass::half_t>(out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+#else
+    // fallback to cutlass 2.x
+    if (out_dtype == torch::kBFloat16) {
+      sm80_dispatch_shape<cutlass::bfloat16_t, cutlass::arch::Sm80, cutlass::gemm::GemmShape<16, 8, 32>>(
+          out, mat_a, mat_b, scales_a, scales_b, bias);
+    } else {
+      sm80_dispatch_shape<cutlass::half_t, cutlass::arch::Sm80, cutlass::gemm::GemmShape<16, 8, 32>>(
+          out, mat_a, mat_b, scales_a, scales_b, bias);
+    }
+#endif
+  } else {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "No implemented int8_scaled_mm for current compute capability.");
+  }
+
+  return out;
+}
diff --git a/sgl-kernel/csrc/gemm/marlin/awq_marlin_repack.cu b/sgl-kernel/csrc/gemm/marlin/awq_marlin_repack.cu
new file mode 100644
index 000000000..1c8de396d
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/awq_marlin_repack.cu
@@ -0,0 +1,253 @@
+#include "marlin.cuh"
+
+namespace marlin {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+template <int const num_threads, int const num_bits>
+__global__ void awq_marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr, int size_k, int size_n) {
+  return;
+}
+#else
+
+template <int const num_threads, int const num_bits>
+__global__ void awq_marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr, int size_k, int size_n) {
+  constexpr int pack_factor = 32 / num_bits;
+
+  int k_tiles = size_k / tile_k_size;
+  int n_tiles = size_n / tile_n_size;
+  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
+
+  auto start_k_tile = blockIdx.x * block_k_tiles;
+  if (start_k_tile >= k_tiles) {
+    return;
+  }
+
+  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<repack_stages - 2>();
+    __syncthreads();
+  };
+
+  extern __shared__ int4 sh[];
+
+  constexpr int tile_n_ints = tile_n_size / pack_factor;
+
+  constexpr int stage_n_threads = tile_n_ints / 4;
+  constexpr int stage_k_threads = tile_k_size;
+  constexpr int stage_size = stage_k_threads * stage_n_threads;
+
+  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      cp_async_fence();
+      return;
+    }
+
+    int first_n = n_tile_id * tile_n_size;
+    int first_n_packed = first_n / pack_factor;
+
+    int4* sh_ptr = sh + stage_size * pipe;
+
+    if (threadIdx.x < stage_size) {
+      auto k_id = threadIdx.x / stage_n_threads;
+      auto n_id = threadIdx.x % stage_n_threads;
+
+      int first_k = k_tile_id * tile_k_size;
+
+      cp_async4(
+          &sh_ptr[k_id * stage_n_threads + n_id],
+          reinterpret_cast<int4 const*>(
+              &(b_q_weight_ptr[(first_k + k_id) * (size_n / pack_factor) + first_n_packed + (n_id * 4)])));
+    }
+
+    cp_async_fence();
+  };
+
+  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      return;
+    }
+
+    auto warp_id = threadIdx.x / 32;
+    auto th_id = threadIdx.x % 32;
+
+    if (warp_id >= 4) {
+      return;
+    }
+
+    int tc_col = th_id / 4;
+    int tc_row = (th_id % 4) * 2;
+
+    constexpr int tc_offsets[4] = {0, 1, 8, 9};
+
+    int cur_n = warp_id * 16 + tc_col;
+    int cur_n_packed = cur_n / pack_factor;
+    int cur_n_pos = cur_n % pack_factor;
+
+    constexpr int sh_stride = tile_n_ints;
+    constexpr uint32_t mask = (1 << num_bits) - 1;
+
+    int4* sh_stage_ptr = sh + stage_size * pipe;
+    uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);
+
+    // Undo interleaving
+    int cur_n_pos_unpacked;
+    if constexpr (num_bits == 4) {
+      constexpr int undo_pack[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+      cur_n_pos_unpacked = undo_pack[cur_n_pos];
+    } else {
+      constexpr int undo_pack[4] = {0, 2, 1, 3};
+      cur_n_pos_unpacked = undo_pack[cur_n_pos];
+    }
+
+    uint32_t vals[8];
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+      int cur_elem = tc_row + tc_offsets[i];
+
+      int packed_src_0 = sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem];
+      int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) + sh_stride * cur_elem];
+
+      vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask;
+      vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask;
+    }
+
+    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
+    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+
+    // Result of:
+    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+    if constexpr (num_bits == 4) {
+      constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+
+      uint32_t res = 0;
+#pragma unroll
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
+
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+    } else {
+      constexpr int pack_idx[4] = {0, 2, 1, 3};
+
+      uint32_t res1 = 0;
+      uint32_t res2 = 0;
+#pragma unroll
+      for (int i = 0; i < 4; i++) {
+        res1 |= vals[pack_idx[i]] << (i * 8);
+        res2 |= vals[4 + pack_idx[i]] << (i * 8);
+      }
+
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
+    }
+  };
+
+  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
+#pragma unroll
+    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
+      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
+    }
+
+    wait_for_stage();
+  };
+#pragma unroll
+  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
+    int n_tile_id = 0;
+
+    start_pipes(k_tile_id, n_tile_id);
+
+    while (n_tile_id < n_tiles) {
+#pragma unroll
+      for (int pipe = 0; pipe < repack_stages; pipe++) {
+        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id, n_tile_id + pipe + repack_stages - 1);
+        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
+        wait_for_stage();
+      }
+      n_tile_id += repack_stages;
+    }
+  }
+}
+#endif
+}  // namespace marlin
+
+#define CALL_IF(NUM_BITS)                                                                                      \
+  else if (num_bits == NUM_BITS) {                                                                             \
+    cudaFuncSetAttribute(                                                                                      \
+        marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS>,                                    \
+        cudaFuncAttributeMaxDynamicSharedMemorySize,                                                           \
+        max_shared_mem);                                                                                       \
+    marlin::awq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS>                                         \
+        <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>(b_q_weight_ptr, out_ptr, size_k, size_n); \
+  }
+
+torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, int64_t size_n, int64_t num_bits) {
+  // Verify compatibility with marlin tile of 16x64
+  TORCH_CHECK(
+      size_k % marlin::tile_k_size == 0,
+      "size_k = ",
+      size_k,
+      " is not divisible by tile_k_size = ",
+      marlin::tile_k_size);
+  TORCH_CHECK(
+      size_n % marlin::tile_n_size == 0,
+      "size_n = ",
+      size_n,
+      " is not divisible by tile_n_size = ",
+      marlin::tile_n_size);
+
+  TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. Got = ", num_bits);
+  int const pack_factor = 32 / num_bits;
+
+  // Verify B
+  TORCH_CHECK(b_q_weight.size(0) == size_k, "b_q_weight.size(0) = ", b_q_weight.size(0), " is not size_k = ", size_k);
+  TORCH_CHECK(
+      (size_n / pack_factor) == b_q_weight.size(1),
+      "Shape mismatch: b_q_weight.size(1) = ",
+      b_q_weight.size(1),
+      ", size_n = ",
+      size_n,
+      ", pack_factor = ",
+      pack_factor);
+
+  // Verify device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
+  auto options = torch::TensorOptions().dtype(b_q_weight.dtype()).device(b_q_weight.device());
+  torch::Tensor out = torch::empty({size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor}, options);
+
+  // Get ptrs
+  uint32_t const* b_q_weight_ptr = reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
+  uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());
+
+  // Get dev info
+  int dev = b_q_weight.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+  int blocks;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  if (false) {
+  }
+  CALL_IF(4)
+  CALL_IF(8)
+  else {
+    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits);
+  }
+
+  return out;
+}
diff --git a/sgl-kernel/csrc/gemm/marlin/dequant.h b/sgl-kernel/csrc/gemm/marlin/dequant.h
new file mode 100644
index 000000000..e91b8b9f4
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/dequant.h
@@ -0,0 +1,459 @@
+/*
+Fast Dequantization (Converting INT4/INT8/FP4/FP8 to FP16/BF16)
+
+The process of fast dequantization can be summarized as a combination
+of bitwise operations and floating-point computations:
+
+weight =>(bit_op / bitwise operations)=>
+f16_value =>(flop / floating-point computation)=>
+dequantized_weight
+
+Since the dequantized weights typically require subtracting the zero point and
+applying a scale factor, the floating-point computation step can be fused with
+the zero-point subtraction and scaling operations.
+
+The following are the parts that need to be modified for the fused operation
+of zero-point subtraction and scaling.
+
+## INT4 => FP16/BF16 or INT8 => FP16
+
+The floating-point computation is `__hsub2`
+
+If has zero points:
+
+    flop(bit_op(weight)) - flop(bit_op(zp))
+  = sub(bit_op(weight), bias) - sub(bit_op(zp), bias)
+  = bit_op(weight) - bit_op(zp)
+
+so we don't need additional modification.
+
+If has float zero points:
+
+    flop(bit_op(weight)) - fzp
+  = sub(bit_op(weight), bias) - fzp
+  = bit_op(weight) - (fzp + bias)
+
+where the `fzp + bias` can be computed at weight loading. But this
+may have accuracy issue, so we should not use this in most cases.
+
+If has not zero points:
+
+    scale(flop(bit_op(weight)))
+  = scale(sub(bit_op(weight), bias))
+  = scale(bit_op(weight)) - scale(bias)
+  = fma(bit_op(weight), scale_factor, scale(bias))
+
+where the `scale(bias)` can be cached. But this may have accuracy issue,
+so we should not use this in most cases.
+
+
+## INT8 => BF16
+
+INT8 => BF16 is a special case, it use byte_perm instead of flop.
+We cannot fused byte_perm with scaling.
+
+
+## FP4/FP8 => FP16/BF16
+
+    scale(flop(bit_op(weight)))
+  = scale(mul(bit_op(weight), multiplier))
+  = mul(bit_op(weight), scale_factor * multiplier)
+
+where `scale_factor * multiplier` can be computed at weight loading.
+
+*/
+
+#include "marlin_dtypes.cuh"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+// Lookup-table based 3-input logical operation; explicitly used for
+// dequantization as the compiler does not seem to automatically recognize it in
+// all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(res) : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+// Constructs destination register by taking bytes from 2 sources (based on
+// mask)
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n" : "=r"(res) : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+template <typename scalar_t2, sglang::ScalarTypeId w_type_id, bool skip_flop = false>
+__device__ inline void dequant(int q, scalar_t2* frag_b);
+
+//
+// Efficiently dequantize 4bit values packed in an int32 value into a full
+// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
+// with some small changes:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
+//
+template <>
+__device__ inline void dequant<half2, sglang::kU4B8.id(), true>(int q, half2* frag_b) {
+  const int MASK = 0x000f000f;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  q >>= 4;
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+
+  frag_b[0] = *reinterpret_cast<half2*>(&lo);
+  frag_b[1] = *reinterpret_cast<half2*>(&hi);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kU4B8.id(), false>(int q, half2* frag_b) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // clang-format on
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64086408;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd480d480;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo), *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(
+      *reinterpret_cast<half2*>(&hi), *reinterpret_cast<const half2*>(&MUL), *reinterpret_cast<const half2*>(&ADD));
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kU4.id(), true>(int q, half2* frag_b) {
+  dequant<half2, sglang::kU4B8.id(), true>(q, frag_b);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kU4.id(), false>(int q, half2* frag_b) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // clang-format on
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64006400;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd400d400;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo), *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(
+      *reinterpret_cast<half2*>(&hi), *reinterpret_cast<const half2*>(&MUL), *reinterpret_cast<const half2*>(&ADD));
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kU4B8.id(), true>(int q, nv_bfloat162* frag_b) {
+  static constexpr uint32_t MASK = 0x000f000f;
+  static constexpr uint32_t EX = 0x43004300;
+
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  // clang-format off
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  q >>= 4;
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  // clang-format on
+
+  frag_b[0] = *reinterpret_cast<nv_bfloat162*>(&lo);
+  frag_b[1] = *reinterpret_cast<nv_bfloat162*>(&hi);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kU4B8.id(), false>(int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, sglang::kU4B8.id(), true>(q, frag_b);
+
+  static constexpr uint32_t SUB = 0x43084308;
+
+  frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+  frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kU4.id(), true>(int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, sglang::kU4B8.id(), true>(q, frag_b);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kU4.id(), false>(int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, sglang::kU4.id(), true>(q, frag_b);
+
+  static constexpr uint32_t SUB = 0x43004300;
+
+  frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+  frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const nv_bfloat162*>(&SUB));
+}
+
+//
+// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
+// bf16 Reference:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
+//
+template <>
+__device__ inline void dequant<half2, sglang::kU8B128.id(), true>(int q, half2* frag_b) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  frag_b[0] = *reinterpret_cast<half2*>(&lo);
+  frag_b[1] = *reinterpret_cast<half2*>(&hi);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kU8B128.id(), false>(int q, half2* frag_b) {
+  dequant<half2, sglang::kU8B128.id(), true>(q, frag_b);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+  frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kU8.id(), true>(int q, half2* frag_b) {
+  dequant<half2, sglang::kU8B128.id(), true>(q, frag_b);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kU8.id(), false>(int q, half2* frag_b) {
+  dequant<half2, sglang::kU8.id(), true>(q, frag_b);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400;
+  frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kU8B128.id(), false>(int q, nv_bfloat162* frag_b) {
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
+
+  fp32_intermediates[0] -= 8388736.f;
+  fp32_intermediates[1] -= 8388736.f;
+  fp32_intermediates[2] -= 8388736.f;
+  fp32_intermediates[3] -= 8388736.f;
+
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], fp32_intermediates_casted[3], 0x7632);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kU8.id(), false>(int q, nv_bfloat162* frag_b) {
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
+
+  fp32_intermediates[0] -= 8388608.f;
+  fp32_intermediates[1] -= 8388608.f;
+  fp32_intermediates[2] -= 8388608.f;
+  fp32_intermediates[3] -= 8388608.f;
+
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], fp32_intermediates_casted[3], 0x7632);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kFE4M3fn.id(), true>(int q, half2* frag_b) {
+  // Constants for FP8 (E4M3) and FP16 formats
+  constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5;
+  constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT;
+  constexpr int MASK = 0x7F007F00;
+
+  // Extract and shift FP8 values to FP16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 8;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kFE4M3fn.id(), false>(int q, half2* frag_b) {
+  dequant<half2, sglang::kFE4M3fn.id(), true>(q, frag_b);
+
+  // Constants for FP8 (E4M3) and FP16 formats
+  constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET = (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
+
+  // Convert to half2 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kFE4M3fn.id(), true>(int q, nv_bfloat162* frag_b) {
+  // Constants for FP8 (E4M3) and BF16 formats
+  constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
+
+  constexpr int MASK = 0x7F007F00;
+
+  // Extract and shift FP8 values to BF16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 8;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kFE4M3fn.id(), false>(int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, sglang::kFE4M3fn.id(), true>(q, frag_b);
+
+  // Constants for FP8 (E4M3) and BF16 formats
+  constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET = (1 << (BF16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1));
+  // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
+  // position
+  constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
+  const nv_bfloat162 bias_reg = __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
+
+  // Convert to bfloat162 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kFE2M1f.id(), true>(int q, half2* frag_b) {
+  // Constants for FP4 (E2M1) and FP16 formats
+  constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5;
+  constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP4_EXPONENT;
+  constexpr int MASK = 0x70007000;
+
+  // Extract and shift FP4 values to FP16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 4;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<half2, sglang::kFE2M1f.id(), false>(int q, half2* frag_b) {
+  dequant<half2, sglang::kFE2M1f.id(), true>(q, frag_b);
+
+  // Constants for FP4 (E2M1) and FP16 formats
+  constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET = (1 << (FP16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1));
+  const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET));
+
+  // Convert to half2 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kFE2M1f.id(), true>(int q, nv_bfloat162* frag_b) {
+  // Constants for FP4 (E2M1) and FP16 formats
+  constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP4_EXPONENT;
+  constexpr int MASK = 0x70007000;
+
+  // Extract and shift FP4 values to FP16 format
+  int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 4;
+  int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+}
+
+template <>
+__device__ inline void dequant<nv_bfloat162, sglang::kFE2M1f.id(), false>(int q, nv_bfloat162* frag_b) {
+  dequant<nv_bfloat162, sglang::kFE2M1f.id(), true>(q, frag_b);
+
+  // Constants for FP4 (E2M1) and BF16 formats
+  constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8;
+
+  // Construct and apply exponent bias
+  constexpr int BIAS_OFFSET = (1 << (BF16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1));
+  // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent
+  // position
+  constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23;
+  const nv_bfloat162 bias_reg = __float2bfloat162_rn(*reinterpret_cast<const float*>(&BIAS));
+
+  // Convert to half2 and apply bias
+  frag_b[1] = __hmul2(frag_b[1], bias_reg);
+  frag_b[0] = __hmul2(frag_b[0], bias_reg);
+}
+
+template <typename scalar_t2>
+__device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b);
+
+template <>
+__device__ inline void dequant_fp8_scales<half2>(int q, half2* frag_b) {
+  int Out1 = (q & 0xFF00FF00) >> 1;
+  ;
+  q <<= 8;
+  int Out2 = (q & 0xFF00FF00) >> 1;
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const half2*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const half2*>(&Out2);
+};
+
+template <>
+__device__ inline void dequant_fp8_scales<nv_bfloat162>(int q, nv_bfloat162* frag_b) {
+  constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
+  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
+  constexpr int MASK = 0x7F007F00;
+
+  // Extract and shift FP8 values to BF16 format
+  int Out1 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT);
+  q <<= 8;
+  int Out2 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT);
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+}
+
+#endif
+
+}  // namespace MARLIN_NAMESPACE_NAME
diff --git a/sgl-kernel/csrc/gemm/marlin/gptq_marlin.cu b/sgl-kernel/csrc/gemm/marlin/gptq_marlin.cu
new file mode 100644
index 000000000..b3437fc4b
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/gptq_marlin.cu
@@ -0,0 +1,1120 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                                        \
+  static_assert(                                                                         \
+      std::is_same<scalar_t, half>::value || std::is_same<scalar_t, nv_bfloat16>::value, \
+      "only float16 and bfloat16 is supported");
+
+namespace marlin {
+
+__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
+
+using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+__global__ void permute_cols_kernel(
+    int4 const* __restrict__ a_int4_ptr,
+    int const* __restrict__ perm_int_ptr,
+    int4* __restrict__ out_int4_ptr,
+    int size_m,
+    int size_k,
+    int lda,
+    int block_rows) {}
+
+}  // namespace marlin
+
+torch::Tensor gptq_marlin_gemm(
+    torch::Tensor& a,
+    std::optional<torch::Tensor> c_or_none,
+    torch::Tensor& b_q_weight,
+    torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none,
+    torch::Tensor& workspace,
+    sglang::ScalarTypeId const& b_q_type_id,
+    int64_t size_m,
+    int64_t size_n,
+    int64_t size_k,
+    bool is_k_full,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float) {
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});
+}
+
+#else
+
+// For a given "a" of size [M,K] performs a permutation of the K columns based
+// on the given "perm" indices.
+__global__ void permute_cols_kernel(
+    int4 const* __restrict__ a_int4_ptr,
+    int const* __restrict__ perm_int_ptr,
+    int4* __restrict__ out_int4_ptr,
+    int size_m,
+    int size_k,
+    int lda,
+    int block_rows) {
+  auto start_row = block_rows * blockIdx.x;
+  int finish_row = start_row + block_rows;
+  if (finish_row > size_m) {
+    finish_row = size_m;
+  }
+  int cur_block_rows = finish_row - start_row;
+
+  int input_row_stride = lda * sizeof(half) / 16;
+  int output_row_stride = size_k * sizeof(half) / 16;
+
+  auto permute_row = [&](int row) {
+    int iters = size_k / default_threads;
+    int rest = size_k % default_threads;
+
+    int input_offset = row * input_row_stride;
+    int output_offset = row * output_row_stride;
+
+    half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + input_offset);
+    half* out_half = reinterpret_cast<half*>(out_int4_ptr + output_offset);
+
+    int base_k = 0;
+
+    for (int i = 0; i < iters; i++) {
+      auto cur_k = base_k + threadIdx.x;
+      int src_pos = perm_int_ptr[cur_k];
+
+      out_half[cur_k] = a_row_half[src_pos];
+
+      base_k += default_threads;
+    }
+
+    if (rest) {
+      if (threadIdx.x < rest) {
+        auto cur_k = base_k + threadIdx.x;
+        int src_pos = perm_int_ptr[cur_k];
+
+        out_half[cur_k] = a_row_half[src_pos];
+      }
+    }
+  };
+
+  for (int i = 0; i < cur_block_rows; i++) {
+    int cur_row = start_row + i;
+    if (cur_row < size_m) {
+      permute_row(cur_row);
+    }
+  }
+}
+
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+
+thread_config_t small_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {128, 128, 256},
+    {64, 128, 128},
+    {128, 64, 128}};
+
+thread_config_t large_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},
+    {64, 128, 128},
+    {128, 64, 128}};
+
+typedef struct {
+  int blocks_per_sm;
+  thread_config_t tb_cfg;
+} exec_config_t;
+
+int get_scales_cache_size(
+    thread_config_t const& th_config,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full) {
+  bool cache_scales_chunk = has_act_order && !is_k_full;
+
+  int tb_n = th_config.thread_n;
+  int tb_k = th_config.thread_k;
+
+  // Get max scale groups per thread-block
+  int tb_groups;
+  if (group_size == -1) {
+    tb_groups = 1;
+  } else if (group_size == 0) {
+    tb_groups = div_ceil(tb_k, 32);  // Worst case is 32 group size
+  } else {
+    tb_groups = div_ceil(tb_k, group_size);
+  }
+
+  if (cache_scales_chunk) {
+    int load_groups = tb_groups * pipe_stages * 2;  // Chunk size is 2x pipeline over dim K
+    load_groups = max(load_groups, 32);             // We load at least 32 scale groups
+    return load_groups * tb_n * 2;
+  } else {
+    int tb_scales = tb_groups * tb_n * 2;
+
+    return tb_scales * pipe_stages;
+  }
+}
+
+int get_kernel_cache_size(
+    thread_config_t const& th_config,
+    int thread_m_blocks,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full,
+    int has_zp,
+    int is_zp_float) {
+  int pack_factor = 32 / num_bits;
+
+  // Get B size
+  int tb_k = th_config.thread_k;
+  int tb_n = th_config.thread_n;
+  int tb_m = thread_m_blocks * 16;
+  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
+  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_red_size = tb_m * (tb_n + 8);
+  int sh_s_size =
+      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full);
+  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+  int sh_zp_size = 0;
+  if (has_zp) {
+    if (is_zp_float)
+      sh_zp_size = sh_s_size;
+    else if (num_bits == 4)
+      sh_zp_size = sh_s_size / 4;
+    else if (num_bits == 8)
+      sh_zp_size = sh_s_size / 2;
+  }
+
+  int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size + sh_zp_size + sh_g_idx_size;
+
+  return total_size;
+}
+
+bool is_valid_config(
+    thread_config_t const& th_config,
+    int thread_m_blocks,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full,
+    int has_zp,
+    int is_zp_float,
+    int max_shared_mem) {
+  // Sanity
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 || th_config.num_threads == -1) {
+    return false;
+  }
+
+  // Verify K/N are divisible by thread K/N
+  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
+    return false;
+  }
+
+  // Verify min for thread K/N
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+
+  // Check that pipeline fits into cache
+  int cache_size = get_kernel_cache_size(
+      th_config,
+      thread_m_blocks,
+      prob_m,
+      prob_n,
+      prob_k,
+      num_bits,
+      group_size,
+      has_act_order,
+      is_k_full,
+      has_zp,
+      is_zp_float);
+  return cache_size <= max_shared_mem;
+}
+
+#define _GET_IF(                                                                                                       \
+    W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \
+  else if (                                                                                                            \
+      q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && thread_n_blocks == THREAD_N_BLOCKS &&                  \
+      thread_k_blocks == THREAD_K_BLOCKS && m_block_size_8 == M_BLOCK_SIZE_8 && group_blocks == GROUP_BLOCKS &&        \
+      num_threads == NUM_THREADS && is_zp_float == IS_ZP_FLOAT) {                                                      \
+    kernel = Marlin<                                                                                                   \
+        scalar_t,                                                                                                      \
+        W_TYPE.id(),                                                                                                   \
+        NUM_THREADS,                                                                                                   \
+        THREAD_M_BLOCKS,                                                                                               \
+        THREAD_N_BLOCKS,                                                                                               \
+        THREAD_K_BLOCKS,                                                                                               \
+        M_BLOCK_SIZE_8,                                                                                                \
+        pipe_stages,                                                                                                   \
+        GROUP_BLOCKS,                                                                                                  \
+        IS_ZP_FLOAT>;                                                                                                  \
+  }
+
+// COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
+//         this is the most common cases
+// BIGGROUP: cases for big group size (group_blocks in [-1, 8])
+// FZP: cases for float-zero-point (is_zp_float = true)
+// ACT: cases for act order case (group_blocks == 0)
+// FP4: cases for nvfp4(e2m1) (group_blocks == 1)
+#define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false)   \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false)   \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false)   \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
+
+#define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
+                                                                        \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
+                                                                        \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
+
+#define COMMON_GET_IF(W_TYPE)            \
+  COMMON_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+  COMMON_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+  COMMON_GET_IF_M1(W_TYPE, 4, 8, 128)    \
+  COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \
+  COMMON_GET_IF_M234(W_TYPE, 8, 4, 128)  \
+  COMMON_GET_IF_M234(W_TYPE, 4, 8, 128)
+
+#define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false)   \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
+
+#define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)   \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
+
+#define BIGGROUP_GET_IF(W_TYPE)            \
+  BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+  BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+  BIGGROUP_GET_IF_M1(W_TYPE, 4, 8, 128)    \
+  BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
+  BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)  \
+  BIGGROUP_GET_IF_M234(W_TYPE, 4, 8, 128)
+
+#define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)        \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
+
+#define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
+
+#define FP4_GET_IF(W_TYPE)            \
+  FP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+  FP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+  FP4_GET_IF_M1(W_TYPE, 4, 8, 128)    \
+  FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
+  FP4_GET_IF_M234(W_TYPE, 8, 4, 128)  \
+  FP4_GET_IF_M234(W_TYPE, 4, 8, 128)
+
+// We currently have 4-bit models only with group_blocks == 4
+#define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
+
+#define FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true)
+
+#define FZP_GET_IF(W_TYPE)            \
+  FZP_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+  FZP_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+  FZP_GET_IF_M1(W_TYPE, 4, 8, 128)    \
+  FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \
+  FZP_GET_IF_M234(W_TYPE, 8, 4, 128)  \
+  FZP_GET_IF_M234(W_TYPE, 4, 8, 128)
+
+// We currently have 4-bit models only with group_blocks == 4
+#define ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)        \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
+
+#define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
+  _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \
+  _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false)
+
+#define ACT_GET_IF(W_TYPE)            \
+  ACT_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+  ACT_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+  ACT_GET_IF_M1(W_TYPE, 4, 8, 128)    \
+  ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \
+  ACT_GET_IF_M234(W_TYPE, 8, 4, 128)  \
+  ACT_GET_IF_M234(W_TYPE, 4, 8, 128)
+
+template <typename scalar_t>
+MarlinFuncPtr get_marlin_kernel(
+    const sglang::ScalarType q_type,
+    int thread_m_blocks,
+    int thread_n_blocks,
+    int thread_k_blocks,
+    bool m_block_size_8,
+    bool has_act_order,
+    bool has_zp,
+    int group_blocks,
+    int num_threads,
+    bool is_zp_float) {
+  int num_bits = q_type.size_bits();
+  auto kernel = MarlinDefault;
+  if (false) {
+  }
+
+  COMMON_GET_IF(sglang::kU4)
+  COMMON_GET_IF(sglang::kU4B8)
+  COMMON_GET_IF(sglang::kU8B128)
+
+  FP4_GET_IF(sglang::kFE2M1f)
+
+  BIGGROUP_GET_IF(sglang::kFE4M3fn)
+
+  ACT_GET_IF(sglang::kU4B8)
+  ACT_GET_IF(sglang::kU8B128)
+
+  if (std::is_same<scalar_t, half>::value) {
+    if (false) {
+    }
+    FZP_GET_IF(sglang::kU4)
+  }
+
+  return kernel;
+}
+
+template <typename scalar_t>
+exec_config_t determine_exec_config(
+    const sglang::ScalarType& q_type,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int thread_m_blocks,
+    bool m_block_size_8,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full,
+    bool has_zp,
+    bool is_zp_float,
+    int max_shared_mem,
+    int sms) {
+  exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
+  thread_config_t* thread_configs = thread_m_blocks > 1 ? large_batch_thread_configs : small_batch_thread_configs;
+  int thread_configs_size = thread_m_blocks > 1 ? sizeof(large_batch_thread_configs) / sizeof(thread_config_t)
+                                                : sizeof(small_batch_thread_configs) / sizeof(thread_config_t);
+
+  for (int i = 0; i < thread_configs_size; i++) {
+    thread_config_t th_config = thread_configs[i];
+
+    if (!is_valid_config(
+            th_config,
+            thread_m_blocks,
+            prob_m,
+            prob_n,
+            prob_k,
+            num_bits,
+            group_size,
+            has_act_order,
+            is_k_full,
+            has_zp,
+            is_zp_float,
+            max_shared_mem)) {
+      continue;
+    }
+
+    int cache_size = get_kernel_cache_size(
+        th_config,
+        thread_m_blocks,
+        prob_m,
+        prob_n,
+        prob_k,
+        num_bits,
+        group_size,
+        has_act_order,
+        is_k_full,
+        has_zp,
+        is_zp_float);
+
+    int group_blocks = 0;
+    if (!has_act_order) {
+      group_blocks = group_size == -1 ? -1 : group_size / 16;
+    }
+
+    auto kernel = get_marlin_kernel<scalar_t>(
+        q_type,
+        thread_m_blocks,
+        th_config.thread_n / 16,
+        th_config.thread_k / 16,
+        m_block_size_8,
+        has_act_order,
+        has_zp,
+        group_blocks,
+        th_config.num_threads,
+        is_zp_float);
+
+    if (kernel == MarlinDefault) continue;
+
+    // int m_tiles = div_ceil(prob_m, thread_m_blocks * 16);
+    // int n_tiles = prob_n / th_config.thread_n;
+    // int k_tiles = prob_k / th_config.thread_k;
+
+    return {1, th_config};
+  }
+
+  return exec_cfg;
+}
+
+template <typename scalar_t>
+void marlin_mm(
+    const void* A,
+    const void* B,
+    void* C,
+    void* C_tmp,
+    void* s,
+    void* s2,
+    void* zp,
+    void* g_idx,
+    void* perm,
+    void* a_tmp,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int lda,
+    void* workspace,
+    sglang::ScalarType const& q_type,
+    bool has_act_order,
+    bool is_k_full,
+    bool has_zp,
+    int num_groups,
+    int group_size,
+    int dev,
+    cudaStream_t stream,
+    int thread_k_init,
+    int thread_n_init,
+    int sms,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float) {
+  if (has_zp) {
+    TORCH_CHECK(
+        q_type == sglang::kU4 || q_type == sglang::kU8,
+        "q_type must be u4 or u8 when has_zp = True. Got = ",
+        q_type.str());
+  } else {
+    TORCH_CHECK(
+        q_type == sglang::kU4B8 || q_type == sglang::kU8B128 || q_type == sglang::kFE4M3fn || q_type == sglang::kFE2M1f,
+        "q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when "
+        "has_zp = False. Got = ",
+        q_type.str());
+  }
+
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]");
+
+  int group_blocks = 0;
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(group_size != -1);
+      group_blocks = group_size / 16;
+      TORCH_CHECK(
+          prob_k % group_blocks == 0, "prob_k = ", prob_k, " is not divisible by group_blocks = ", group_blocks);
+    } else {
+      TORCH_CHECK(group_size == 0);
+      group_blocks = 0;
+    }
+  } else {
+    if (group_size == -1) {
+      group_blocks = -1;
+    } else {
+      group_blocks = group_size / 16;
+      TORCH_CHECK(
+          prob_k % group_blocks == 0, "prob_k = ", prob_k, " is not divisible by group_blocks = ", group_blocks);
+    }
+  }
+
+  int num_bits = q_type.size_bits();
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  int4* C_ptr = (int4*)C;
+  int4* C_tmp_ptr = (int4*)C_tmp;
+  const int4* s_ptr = (const int4*)s;
+  const uint16_t* s2_ptr = (const uint16_t*)s2;
+  const int4* zp_ptr = (const int4*)zp;
+  const int* g_idx_ptr = (const int*)g_idx;
+  const int* perm_ptr = (const int*)perm;
+  int4* a_tmp_ptr = (int4*)a_tmp;
+
+  int* locks = (int*)workspace;
+
+  if (has_act_order) {
+    // Permute A columns
+    int block_rows = div_ceil(prob_m, sms);
+    // avoid ">>>" being formatted to "> > >"
+    // clang-format off
+    permute_cols_kernel<<<sms, default_threads, 0, stream>>>(
+        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows);
+    // clang-format on
+    A_ptr = a_tmp_ptr;
+    lda = prob_k;
+
+    // If we have a full K, then we can run the non-act-order version of Marlin
+    // (since the weight rows are reordered by increasing group ids, and by
+    // having a full K, we have full original groups)
+    if (is_k_full) has_act_order = false;
+  }
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  int max_par = 16;
+  if (prob_n <= 4096) max_par = 16 * 8;
+  int max_shared_mem_new = max_shared_mem;
+  int rest_m = prob_m;
+  int max_thread_m_blocks = 4;
+  while (rest_m) {
+    int par_count = rest_m / (max_thread_m_blocks * 16);
+    if (par_count > max_par) par_count = max_par;
+    int prob_m_split = par_count > 0 ? (par_count * (max_thread_m_blocks * 16)) : rest_m;
+
+    int thread_k = thread_k_init;
+    int thread_n = thread_n_init;
+
+    int thread_m_blocks = min(div_ceil(prob_m_split, 16), max_thread_m_blocks);
+    int m_block_size_8 = prob_m_split <= 8;
+
+    // Set thread config
+    exec_config_t exec_cfg;
+    thread_config_t thread_tfg;
+    if (thread_k != -1 && thread_n != -1) {
+      thread_tfg = thread_config_t{thread_k, thread_n, default_threads};
+      exec_cfg = exec_config_t{1, thread_tfg};
+      TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, " is not divisible by thread_n = ", thread_n);
+      TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, " is not divisible by thread_k = ", thread_k);
+    } else {
+      // Auto config
+      exec_cfg = determine_exec_config<scalar_t>(
+          q_type,
+          prob_m_split,
+          prob_n,
+          prob_k,
+          thread_m_blocks,
+          m_block_size_8,
+          num_bits,
+          group_size,
+          has_act_order,
+          is_k_full,
+          has_zp,
+          is_zp_float,
+          max_shared_mem,
+          sms);
+      thread_tfg = exec_cfg.tb_cfg;
+      if (thread_tfg.thread_k == -1 && max_thread_m_blocks > 1) {
+        max_thread_m_blocks--;
+        continue;
+      }
+    }
+
+    int num_threads = thread_tfg.num_threads;
+    thread_k = thread_tfg.thread_k;
+    thread_n = thread_tfg.thread_n;
+    int blocks = sms * exec_cfg.blocks_per_sm;
+    if (exec_cfg.blocks_per_sm > 1) max_shared_mem_new = max_shared_mem / exec_cfg.blocks_per_sm - 1024;
+
+    int thread_k_blocks = thread_k / 16;
+    int thread_n_blocks = thread_n / 16;
+
+    TORCH_CHECK(
+        is_valid_config(
+            thread_tfg,
+            thread_m_blocks,
+            prob_m_split,
+            prob_n,
+            prob_k,
+            num_bits,
+            group_size,
+            has_act_order,
+            is_k_full,
+            has_zp,
+            is_zp_float,
+            max_shared_mem_new),
+        "Invalid thread config: thread_m_blocks = ",
+        thread_m_blocks,
+        ", thread_k = ",
+        thread_tfg.thread_k,
+        ", thread_n = ",
+        thread_tfg.thread_n,
+        ", num_threads = ",
+        thread_tfg.num_threads,
+        " for MKN = [",
+        prob_m,
+        ", ",
+        prob_k,
+        ", ",
+        prob_n,
+        "] and num_bits = ",
+        num_bits,
+        ", prob_m_split = ",
+        prob_m_split,
+        ", group_size = ",
+        group_size,
+        ", has_act_order = ",
+        has_act_order,
+        ", is_k_full = ",
+        is_k_full,
+        ", has_zp = ",
+        has_zp,
+        ", is_zp_float = ",
+        is_zp_float,
+        ", max_shared_mem_new = ",
+        max_shared_mem_new);
+
+    auto kernel = get_marlin_kernel<scalar_t>(
+        q_type,
+        thread_m_blocks,
+        thread_n_blocks,
+        thread_k_blocks,
+        m_block_size_8,
+        has_act_order,
+        has_zp,
+        group_blocks,
+        num_threads,
+        is_zp_float);
+
+    if (kernel == MarlinDefault) {
+      TORCH_CHECK(
+          false,
+          "Unsupported shapes: MNK = [",
+          prob_m,
+          ", ",
+          prob_n,
+          ", ",
+          prob_k,
+          "]",
+          ", has_act_order = ",
+          has_act_order,
+          ", num_groups = ",
+          num_groups,
+          ", group_size = ",
+          group_size,
+          ", prob_m_split = ",
+          prob_m_split,
+          ", thread_m_blocks = ",
+          thread_m_blocks,
+          ", thread_n_blocks = ",
+          thread_n_blocks,
+          ", thread_k_blocks = ",
+          thread_k_blocks,
+          ", num_threads = ",
+          num_threads,
+          ", num_bits = ",
+          num_bits);
+    }
+
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_new);
+
+    bool part_use_atomic_add = use_atomic_add && div_ceil(prob_m_split, 64) * prob_n <= 2048;
+
+    // avoid ">>>" being formatted to "> > >"
+    // clang-format off
+    kernel<<<blocks, num_threads, max_shared_mem_new, stream>>>(
+        A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, num_groups,
+        prob_m_split, prob_n, prob_k, lda, locks, part_use_atomic_add,
+        use_fp32_reduce, max_shared_mem_new);
+    // clang-format on
+
+    A_ptr += prob_m_split * (lda / 8);
+    C_ptr += prob_m_split * (prob_n / 8);
+    rest_m -= prob_m_split;
+  }
+}
+
+}  // namespace marlin
+
+torch::Tensor gptq_marlin_gemm(
+    torch::Tensor& a,
+    std::optional<torch::Tensor> c_or_none,
+    torch::Tensor& b_q_weight,
+    torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& global_scale_or_none,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none,
+    torch::Tensor& workspace,
+    sglang::ScalarTypeId const& b_q_type_id,
+    int64_t size_m,
+    int64_t size_n,
+    int64_t size_k,
+    bool is_k_full,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float) {
+  sglang::ScalarType const b_q_type = sglang::ScalarType::from_id(b_q_type_id);
+  int pack_factor = 32 / b_q_type.size_bits();
+
+  // Verify A
+  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), ", size_m = ", size_m);
+  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1), ", size_k = ", size_k);
+
+  // Verify B
+  TORCH_CHECK(
+      size_k % MARLIN_NAMESPACE_NAME::tile_size == 0,
+      "size_k = ",
+      size_k,
+      " is not divisible by tile_size = ",
+      MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK(
+      (size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(0),
+      "Shape mismatch: b_q_weight.size(0) = ",
+      b_q_weight.size(0),
+      ", size_k = ",
+      size_k,
+      ", tile_size = ",
+      MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK(
+      b_q_weight.size(1) % MARLIN_NAMESPACE_NAME::tile_size == 0,
+      "b_q_weight.size(1) = ",
+      b_q_weight.size(1),
+      " is not divisible by tile_size = ",
+      MARLIN_NAMESPACE_NAME::tile_size);
+  int actual_size_n = (b_q_weight.size(1) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor;
+  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n, ", actual_size_n = ", actual_size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.stride(1) == 1, "A.stride(1) is not 1");
+  // We use int4 (16 bytes) to load A, so A must aligned to 16 bytes
+  TORCH_CHECK(a.stride(0) % 8 == 0, "A.stride(0) must divisible by 8");
+  TORCH_CHECK(((uint64_t)a.data_ptr()) % 16 == 0, "A must aligned to 16 bytes");
+
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_k = -1;
+  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_n = -1;
+  // sms: number of SMs to use for the kernel
+  int sms = -1;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  torch::Tensor c;
+  if (c_or_none.has_value()) {
+    c = c_or_none.value();
+    TORCH_CHECK(c.device().is_cuda(), "c is not on GPU");
+    TORCH_CHECK(c.is_contiguous(), "c is not contiguous");
+    TORCH_CHECK(c.size(0) == size_m, "Shape mismatch: c.size(0) = ", c.size(0), ", size_m = ", size_m);
+    TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1), ", size_n = ", size_n);
+  } else {
+    c = torch::empty({size_m, size_n}, options);
+  }
+  if (size_m == 0) return c;
+
+  // Alloc C tmp buffer that is going to be used for the global reduce
+  torch::Tensor c_tmp;
+  auto options_fp32 = torch::TensorOptions().dtype(at::kFloat).device(a.device());
+  if (use_fp32_reduce) {
+    int max_m_block_size = (size_m + 16 - 1) / 16 * 16;
+    max_m_block_size = min(max_m_block_size, 64);
+    int max_c_tmp_size = sms * max_m_block_size * MARLIN_NAMESPACE_NAME::max_thread_n;
+    c_tmp = torch::empty({max_c_tmp_size}, options_fp32);
+  } else {
+    c_tmp = torch::empty({0}, options_fp32);
+  }
+
+  // Detect groupsize and act_order
+  int num_groups = -1;
+  int group_size = -1;
+
+  int rank = b_scales.sizes().size();
+  TORCH_CHECK(rank == 2, "b_scales rank = ", rank, " is not 2");
+  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1), " is not size_n = ", size_n);
+  num_groups = b_scales.size(0);
+
+  torch::Tensor g_idx, perm, a_tmp;
+  if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
+    g_idx = g_idx_or_none.value();
+    perm = perm_or_none.value();
+
+    TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
+    TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
+    TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+    TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+
+    // Verify g_idx and perm
+    TORCH_CHECK(
+        (g_idx.size(-1) == 0 && perm.size(-1) == 0) || (g_idx.size(-1) == size_k && perm.size(-1) == size_k),
+        "Unexpected g_idx.size(-1) = ",
+        g_idx.size(-1),
+        " and perm.size(-1) = ",
+        perm.size(-1),
+        ", where size_k = ",
+        size_k);
+  } else {
+    g_idx = torch::empty({0}, options);
+    perm = torch::empty({0}, options);
+    a_tmp = torch::empty({0}, options);
+  }
+  bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0;
+
+  if (has_act_order) {
+    a_tmp = torch::empty({size_m, size_k}, options);
+    if (is_k_full) {
+      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
+      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, ", is not divisible by num_groups = ", num_groups);
+      group_size = size_k / num_groups;
+    } else {
+      group_size = 0;
+    }
+
+  } else {
+    a_tmp = torch::empty({0}, options);
+    if (num_groups > 1) {
+      TORCH_CHECK(
+          size_k % num_groups == 0, "size_k = ", size_k, ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
+      group_size = size_k / num_groups;
+    } else {
+      group_size = -1;
+    }
+  }
+
+  torch::Tensor global_scale;
+  if (global_scale_or_none.has_value()) {
+    global_scale = global_scale_or_none.value();
+    TORCH_CHECK(b_q_type == sglang::kFE2M1f, "global_scale can only be used for float4_e2m1f.");
+  } else {
+    global_scale = torch::empty({0}, options);
+    TORCH_CHECK(!(b_q_type == sglang::kFE2M1f), "the global_scale parameter must be passed for float4_e2m1f.");
+  }
+
+  torch::Tensor b_zeros;
+  if (b_zeros_or_none.has_value()) {
+    b_zeros = b_zeros_or_none.value();
+    TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU");
+    TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous");
+  } else {
+    b_zeros = torch::empty({0}, options);
+  }
+  bool has_zp = b_zeros.size(-1) > 0;
+  if (has_zp) {
+    TORCH_CHECK(
+        b_q_type == sglang::kU4 || b_q_type == sglang::kU8,
+        "b_q_type must be u4 or u8 when has_zp = True. Got = ",
+        b_q_type.str());
+  } else {
+    TORCH_CHECK(
+        b_q_type == sglang::kU4B8 || b_q_type == sglang::kU8B128 || b_q_type == sglang::kFE4M3fn ||
+            b_q_type == sglang::kFE2M1f,
+        "b_q_type must be uint4b8, uint8b128, float8_e4m3fn or "
+        "float4_e2m1f when "
+        "has_zp = False. Got = ",
+        b_q_type.str());
+  }
+
+  if (has_zp && is_zp_float) {
+    TORCH_CHECK(
+        a.scalar_type() == at::ScalarType::Half,
+        "Computation type must be float16 (half) when using float zero "
+        "points.");
+  }
+
+  // Verify b_zeros
+  if (has_zp) {
+    int rank = b_zeros.sizes().size();
+    TORCH_CHECK(rank == 2, "b_zeros rank = ", rank, " is not 2");
+    if (is_zp_float) {
+      TORCH_CHECK(b_zeros.size(1) == size_n, "b_zeros dim 1 = ", b_zeros.size(1), " is not size_n = ", size_n);
+      TORCH_CHECK(
+          num_groups == b_zeros.size(0), "b_zeros dim 0 = ", b_zeros.size(0), " is not num_groups = ", num_groups);
+      TORCH_CHECK(num_groups != -1, "num_groups must be != -1");
+    } else {
+      TORCH_CHECK(
+          b_zeros.size(0) == num_groups, "b_zeros dim 0 = ", b_zeros.size(0), " is not num_groups = ", num_groups);
+      TORCH_CHECK(
+          b_zeros.size(1) == size_n / pack_factor,
+          "b_zeros dim 1 = ",
+          b_zeros.size(1),
+          " is not size_n / pack_factor = ",
+          size_n / pack_factor);
+    }
+  }
+
+  // Verify workspace size
+  TORCH_CHECK(
+      size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0,
+      "size_n = ",
+      size_n,
+      ", is not divisible by min_thread_n = ",
+      MARLIN_NAMESPACE_NAME::min_thread_n);
+
+  int min_workspace_size = sms;
+  TORCH_CHECK(
+      workspace.numel() >= min_workspace_size,
+      "workspace.numel = ",
+      workspace.numel(),
+      " is below min_workspace_size = ",
+      min_workspace_size);
+
+  int dev = a.get_device();
+  if (a.scalar_type() == at::ScalarType::Half) {
+    void* scales_ptr;
+    if (b_q_type == sglang::kFE2M1f) {
+      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+    } else {
+      scales_ptr = b_scales.data_ptr<at::Half>();
+    }
+
+    marlin::marlin_mm<half>(
+        a.data_ptr<at::Half>(),
+        b_q_weight.data_ptr(),
+        c.data_ptr<at::Half>(),
+        c_tmp.data_ptr<float>(),
+        scales_ptr,
+        global_scale.data_ptr<at::Half>(),
+        b_zeros.data_ptr(),
+        g_idx.data_ptr(),
+        perm.data_ptr(),
+        a_tmp.data_ptr<at::Half>(),
+        size_m,
+        size_n,
+        size_k,
+        a.stride(0),
+        workspace.data_ptr(),
+        b_q_type,
+        has_act_order,
+        is_k_full,
+        has_zp,
+        num_groups,
+        group_size,
+        dev,
+        at::cuda::getCurrentCUDAStream(dev),
+        thread_k,
+        thread_n,
+        sms,
+        use_atomic_add,
+        use_fp32_reduce,
+        is_zp_float);
+  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
+    void* scales_ptr;
+    if (b_q_type == sglang::kFE2M1f) {
+      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+    } else {
+      scales_ptr = b_scales.data_ptr<at::BFloat16>();
+    }
+
+    marlin::marlin_mm<nv_bfloat16>(
+        a.data_ptr<at::BFloat16>(),
+        b_q_weight.data_ptr(),
+        c.data_ptr<at::BFloat16>(),
+        c_tmp.data_ptr<float>(),
+        scales_ptr,
+        global_scale.data_ptr<at::BFloat16>(),
+        b_zeros.data_ptr(),
+        g_idx.data_ptr(),
+        perm.data_ptr(),
+        a_tmp.data_ptr<at::BFloat16>(),
+        size_m,
+        size_n,
+        size_k,
+        a.stride(0),
+        workspace.data_ptr(),
+        b_q_type,
+        has_act_order,
+        is_k_full,
+        has_zp,
+        num_groups,
+        group_size,
+        dev,
+        at::cuda::getCurrentCUDAStream(dev),
+        thread_k,
+        thread_n,
+        sms,
+        use_atomic_add,
+        use_fp32_reduce,
+        is_zp_float);
+  } else {
+    TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16");
+  }
+
+  return c;
+}
+
+#endif
diff --git a/sgl-kernel/csrc/gemm/marlin/gptq_marlin_repack.cu b/sgl-kernel/csrc/gemm/marlin/gptq_marlin_repack.cu
new file mode 100644
index 000000000..3bdf84161
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/gptq_marlin_repack.cu
@@ -0,0 +1,329 @@
+#include "marlin.cuh"
+
+namespace marlin {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+template <int const num_threads, int const num_bits, bool const has_perm>
+__global__ void gptq_marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr,
+    uint32_t const* __restrict__ perm_ptr,
+    uint32_t* __restrict__ out_ptr,
+    int size_k,
+    int size_n) {
+  return;
+}
+#else
+template <int const num_threads, int const num_bits, bool const has_perm>
+__global__ void gptq_marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr,
+    uint32_t const* __restrict__ perm_ptr,
+    uint32_t* __restrict__ out_ptr,
+    int size_k,
+    int size_n) {
+  constexpr int pack_factor = 32 / num_bits;
+
+  int k_tiles = size_k / tile_k_size;
+  int n_tiles = size_n / tile_n_size;
+  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
+
+  auto start_k_tile = blockIdx.x * block_k_tiles;
+  if (start_k_tile >= k_tiles) {
+    return;
+  }
+
+  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<repack_stages - 2>();
+    __syncthreads();
+  };
+
+  extern __shared__ int4 sh[];
+
+  constexpr int perm_size = tile_k_size / 4;
+
+  int4* sh_perm_ptr = sh;
+  int4* sh_pipe_ptr = sh_perm_ptr;
+  if constexpr (has_perm) {
+    sh_pipe_ptr += perm_size;
+  }
+
+  constexpr int tile_ints = tile_k_size / pack_factor;
+
+  constexpr int stage_n_threads = tile_n_size / 4;
+  constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
+  constexpr int stage_size = stage_k_threads * stage_n_threads;
+
+  auto load_perm_to_shared = [&](int k_tile_id) {
+    int first_k_int4 = (k_tile_id * tile_k_size) / 4;
+
+    int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr);
+
+    if (threadIdx.x < perm_size) {
+      sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
+    }
+    __syncthreads();
+  };
+
+  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      cp_async_fence();
+      return;
+    }
+
+    int first_n = n_tile_id * tile_n_size;
+
+    int4* sh_ptr = sh_pipe_ptr + stage_size * pipe;
+
+    if constexpr (has_perm) {
+      if (threadIdx.x < stage_size) {
+        auto k_id = threadIdx.x / stage_n_threads;
+        auto n_id = threadIdx.x % stage_n_threads;
+
+        uint32_t const* sh_perm_int_ptr = reinterpret_cast<uint32_t const*>(sh_perm_ptr);
+
+        int src_k = sh_perm_int_ptr[k_id];
+        int src_k_packed = src_k / pack_factor;
+
+        cp_async4(
+            &sh_ptr[k_id * stage_n_threads + n_id],
+            reinterpret_cast<int4 const*>(&(b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
+      }
+
+    } else {
+      if (threadIdx.x < stage_size) {
+        auto k_id = threadIdx.x / stage_n_threads;
+        auto n_id = threadIdx.x % stage_n_threads;
+
+        int first_k = k_tile_id * tile_k_size;
+        int first_k_packed = first_k / pack_factor;
+
+        cp_async4(
+            &sh_ptr[k_id * stage_n_threads + n_id],
+            reinterpret_cast<int4 const*>(&(b_q_weight_ptr[(first_k_packed + k_id) * size_n + first_n + (n_id * 4)])));
+      }
+    }
+
+    cp_async_fence();
+  };
+
+  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      return;
+    }
+
+    auto warp_id = threadIdx.x / 32;
+    auto th_id = threadIdx.x % 32;
+
+    if (warp_id >= 4) {
+      return;
+    }
+
+    int tc_col = th_id / 4;
+    int tc_row = (th_id % 4) * 2;
+
+    constexpr int tc_offsets[4] = {0, 1, 8, 9};
+
+    int cur_n = warp_id * 16 + tc_col;
+
+    constexpr int sh_stride = 64;
+    constexpr uint32_t mask = (1 << num_bits) - 1;
+
+    int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
+    uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);
+
+    uint32_t* sh_perm_int_ptr = reinterpret_cast<uint32_t*>(sh_perm_ptr);
+
+    uint32_t vals[8];
+
+    if constexpr (has_perm) {
+      for (int i = 0; i < 4; i++) {
+        int k_idx = tc_row + tc_offsets[i];
+
+        uint32_t src_k = sh_perm_int_ptr[k_idx];
+        uint32_t src_k_pos = src_k % pack_factor;
+
+        uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
+        uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;
+
+        uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
+        uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;
+
+        vals[i] = b1_cur_val;
+        vals[4 + i] = b2_cur_val;
+      }
+
+    } else {
+      uint32_t b1_vals[tile_ints];
+      uint32_t b2_vals[tile_ints];
+
+#pragma unroll
+      for (int i = 0; i < tile_ints; i++) {
+        b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
+        b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
+      }
+
+#pragma unroll
+      for (int i = 0; i < 4; i++) {
+        int cur_elem = tc_row + tc_offsets[i];
+        int cur_int = cur_elem / pack_factor;
+        int cur_pos = cur_elem % pack_factor;
+
+        vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
+        vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
+      }
+    }
+
+    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
+    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+
+    // Result of:
+    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+    if constexpr (num_bits == 4) {
+      constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+
+      uint32_t res = 0;
+#pragma unroll
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
+
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+    } else {
+      constexpr int pack_idx[4] = {0, 2, 1, 3};
+
+      uint32_t res1 = 0;
+      uint32_t res2 = 0;
+#pragma unroll
+      for (int i = 0; i < 4; i++) {
+        res1 |= vals[pack_idx[i]] << (i * 8);
+        res2 |= vals[4 + pack_idx[i]] << (i * 8);
+      }
+
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
+    }
+  };
+
+  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
+#pragma unroll
+    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
+      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
+    }
+
+    wait_for_stage();
+  };
+#pragma unroll
+  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
+    int n_tile_id = 0;
+
+    if constexpr (has_perm) {
+      load_perm_to_shared(k_tile_id);
+    }
+
+    start_pipes(k_tile_id, n_tile_id);
+
+    while (n_tile_id < n_tiles) {
+#pragma unroll
+      for (int pipe = 0; pipe < repack_stages; pipe++) {
+        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id, n_tile_id + pipe + repack_stages - 1);
+        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
+        wait_for_stage();
+      }
+      n_tile_id += repack_stages;
+    }
+  }
+}
+#endif
+}  // namespace marlin
+
+#define CALL_IF(NUM_BITS, HAS_PERM)                                                    \
+  else if (num_bits == NUM_BITS && has_perm == HAS_PERM) {                             \
+    cudaFuncSetAttribute(                                                              \
+        marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, HAS_PERM>, \
+        cudaFuncAttributeMaxDynamicSharedMemorySize,                                   \
+        max_shared_mem);                                                               \
+    marlin::gptq_marlin_repack_kernel<marlin::repack_threads, NUM_BITS, HAS_PERM>      \
+        <<<blocks, marlin::repack_threads, max_shared_mem, stream>>>(                  \
+            b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);                        \
+  }
+
+torch::Tensor
+gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits) {
+  // Verify compatibility with marlin tile of 16x64
+  TORCH_CHECK(
+      size_k % marlin::tile_k_size == 0,
+      "size_k = ",
+      size_k,
+      " is not divisible by tile_k_size = ",
+      marlin::tile_k_size);
+  TORCH_CHECK(
+      size_n % marlin::tile_n_size == 0,
+      "size_n = ",
+      size_n,
+      " is not divisible by tile_n_size = ",
+      marlin::tile_n_size);
+
+  TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. Got = ", num_bits);
+  int const pack_factor = 32 / num_bits;
+
+  // Verify B
+  TORCH_CHECK(
+      (size_k / pack_factor) == b_q_weight.size(0),
+      "Shape mismatch: b_q_weight.size(0) = ",
+      b_q_weight.size(0),
+      ", size_k = ",
+      size_k,
+      ", pack_factor = ",
+      pack_factor);
+  TORCH_CHECK(b_q_weight.size(1) == size_n, "b_q_weight.size(1) = ", b_q_weight.size(1), " is not size_n = ", size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
+
+  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+  TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
+  auto options = torch::TensorOptions().dtype(b_q_weight.dtype()).device(b_q_weight.device());
+  torch::Tensor out = torch::empty({size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor}, options);
+
+  // Detect if there is act_order
+  bool has_perm = perm.size(0) != 0;
+
+  // Get ptrs
+  uint32_t const* b_q_weight_ptr = reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
+  uint32_t const* perm_ptr = reinterpret_cast<uint32_t const*>(perm.data_ptr());
+  uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());
+
+  // Get dev info
+  int dev = b_q_weight.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+  int blocks;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  if (false) {
+  }
+  CALL_IF(4, false)
+  CALL_IF(4, true)
+  CALL_IF(8, false)
+  CALL_IF(8, true)
+  else {
+    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits, ", has_perm = ", has_perm);
+  }
+
+  return out;
+}
diff --git a/sgl-kernel/csrc/gemm/marlin/kernel.h b/sgl-kernel/csrc/gemm/marlin/kernel.h
new file mode 100644
index 000000000..72137b244
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/kernel.h
@@ -0,0 +1,36 @@
+
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+#include "marlin.cuh"
+#include "marlin_dtypes.cuh"
+#include "scalar_type.hpp"
+
+#define MARLIN_KERNEL_PARAMS                                                                                         \
+  const int4 *__restrict__ A, const int4 *__restrict__ B, int4 *__restrict__ C, int4 *__restrict__ C_tmp,            \
+      const int4 *__restrict__ scales_ptr, const uint16_t *__restrict__ scale2_ptr, const int4 *__restrict__ zp_ptr, \
+      const int *__restrict__ g_idx, int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks,        \
+      bool use_atomic_add, bool use_fp32_reduce, int max_shared_mem
+
+namespace MARLIN_NAMESPACE_NAME {
+template <
+    typename scalar_t,                     // compute dtype, half or nv_float16
+    const sglang::ScalarTypeId w_type_id,  // weight ScalarType id
+    const int threads,                     // number of threads in a threadblock
+    const int thread_m_blocks,             // number of 16x16 blocks in the m
+                                           // dimension (batchsize) of the
+                                           // threadblock
+    const int thread_n_blocks,             // same for n dimension (output)
+    const int thread_k_blocks,             // same for k dimension (reduction)
+    const bool m_block_size_8,             // whether m_block_size == 8
+                                           // only works when thread_m_blocks == 1
+    const int stages,                      // number of stages for the async global->shared
+                                           // fetch pipeline
+    const int group_blocks,                // number of consecutive 16x16 blocks
+                                           // with a separate quantization scale
+    const bool is_zp_float                 // is zero point of float16 type?
+    >
+__global__ void Marlin(MARLIN_KERNEL_PARAMS);
+
+}
diff --git a/sgl-kernel/csrc/gemm/marlin/marlin.cuh b/sgl-kernel/csrc/gemm/marlin/marlin.cuh
new file mode 100644
index 000000000..99903968d
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/marlin.cuh
@@ -0,0 +1,96 @@
+#pragma once
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <torch/all.h>
+
+#include <iostream>
+
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+namespace MARLIN_NAMESPACE_NAME {
+// Marlin params
+
+// 8 warps are a good choice since every SM has 4 schedulers and having more
+// than 1 warp per schedule allows some more latency hiding. At the same time,
+// we want relatively few warps to have many registers per warp and small tiles.
+static constexpr int default_threads = 256;
+
+static constexpr int pipe_stages = 4;  // 4 pipeline stages fit into shared memory
+
+static constexpr int min_thread_n = 64;
+static constexpr int min_thread_k = 64;
+static constexpr int max_thread_n = 256;
+
+static constexpr int tile_size = 16;
+static constexpr int max_par = 16;
+
+// Repack params
+static constexpr int repack_stages = 8;
+
+static constexpr int repack_threads = 256;
+
+static constexpr int tile_k_size = tile_size;
+static constexpr int tile_n_size = tile_k_size * 4;
+
+// Helpers
+template <typename T, int n>
+struct Vec {
+  T elems[n];
+  __device__ T& operator[](int i) {
+    return elems[i];
+  }
+};
+
+using I4 = Vec<int, 4>;
+
+constexpr int div_ceil(int a, int b) {
+  return (a + b - 1) / b;
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+// No support for async
+#else
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem),
+      "l"(glob_ptr),
+      "n"(BYTES));
+}
+
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr),
+      "n"(BYTES));
+}
+
+__device__ inline void cp_async_fence() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+
+#endif
+
+}  // namespace MARLIN_NAMESPACE_NAME
diff --git a/sgl-kernel/csrc/gemm/marlin/marlin_dtypes.cuh b/sgl-kernel/csrc/gemm/marlin/marlin_dtypes.cuh
new file mode 100644
index 000000000..f1b821f10
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/marlin_dtypes.cuh
@@ -0,0 +1,82 @@
+#ifndef _data_types_cuh
+#define _data_types_cuh
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include "marlin.cuh"
+
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template <typename scalar_t>
+class ScalarType {};
+
+template <>
+class ScalarType<half> {
+ public:
+  using scalar_t = half;
+  using scalar_t2 = half2;
+
+  // Matrix fragments for tensor core instructions; their precise layout is
+  // documented here:
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+  using FragA = Vec<half2, 4>;
+  using FragB = Vec<half2, 2>;
+  using FragC = Vec<float, 4>;
+  using FragS = Vec<half2, 1>;
+  using FragZP = Vec<half2, 4>;
+
+  static __device__ float inline num2float(const half x) {
+    return __half2float(x);
+  }
+
+  static __device__ half2 inline num2num2(const half x) {
+    return __half2half2(x);
+  }
+
+  static __device__ half2 inline nums2num2(const half x1, const half x2) {
+    return __halves2half2(x1, x2);
+  }
+
+  static __host__ __device__ half inline float2num(const float x) {
+    return __float2half(x);
+  }
+};
+
+template <>
+class ScalarType<nv_bfloat16> {
+ public:
+  using scalar_t = nv_bfloat16;
+  using scalar_t2 = nv_bfloat162;
+
+  using FragA = Vec<nv_bfloat162, 4>;
+  using FragB = Vec<nv_bfloat162, 2>;
+  using FragC = Vec<float, 4>;
+  using FragS = Vec<nv_bfloat162, 1>;
+  using FragZP = Vec<nv_bfloat162, 4>;
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
+  static __device__ float inline num2float(const nv_bfloat16 x) {
+    return __bfloat162float(x);
+  }
+
+  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
+    return __bfloat162bfloat162(x);
+  }
+
+  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1, const nv_bfloat16 x2) {
+    return __halves2bfloat162(x1, x2);
+  }
+
+  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
+    return __float2bfloat16(x);
+  }
+#endif
+};
+
+}  // namespace MARLIN_NAMESPACE_NAME
+
+#endif
diff --git a/sgl-kernel/csrc/gemm/marlin/marlin_template.h b/sgl-kernel/csrc/gemm/marlin/marlin_template.h
new file mode 100644
index 000000000..01eb33878
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/marlin/marlin_template.h
@@ -0,0 +1,1629 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+#include "dequant.h"
+#include "marlin.cuh"
+#include "marlin_dtypes.cuh"
+#include "scalar_type.hpp"
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                                        \
+  static_assert(                                                                         \
+      std::is_same<scalar_t, half>::value || std::is_same<scalar_t, nv_bfloat16>::value, \
+      "only float16 and bfloat16 is supported");
+
+namespace MARLIN_NAMESPACE_NAME {
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+template <
+    typename scalar_t,                     // compute dtype, half or nv_float16
+    const sglang::ScalarTypeId w_type_id,  // weight ScalarType id
+    const int threads,                     // number of threads in a threadblock
+    const int thread_m_blocks,             // number of 16x16 blocks in the m
+                                           // dimension (batchsize) of the
+                                           // threadblock
+    const int thread_n_blocks,             // same for n dimension (output)
+    const int thread_k_blocks,             // same for k dimension (reduction)
+    const bool m_block_size_8,             // whether m_block_size == 8
+                                           // only works when thread_m_blocks == 1
+    const int stages,                      // number of stages for the async global->shared
+                                           // fetch pipeline
+    const bool has_act_order,              // whether act_order is enabled
+    const int group_blocks,                // number of consecutive 16x16 blocks
+                                           // with a separate quantization scale
+    const bool is_zp_float                 // is zero point of float16 type?
+    >
+__global__ void Marlin(
+    const int4* __restrict__ A,           // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,           // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,                 // fp16 output buffer of shape mxn
+    int4* __restrict__ C_tmp,             // fp32 tmp output buffer (for reduce)
+    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
+                                          // (k/groupsize)xn
+    const int* __restrict__ g_idx,        // int32 group indices of shape k
+    int num_groups,                       // number of scale groups per output channel
+    int prob_m,                           // batch dimension m
+    int prob_n,                           // output dimension n
+    int prob_k,                           // reduction dimension k
+    int* locks,                           // extra global storage for barrier synchronization
+    bool use_fp32_reduce                  // whether to use fp32 global reduce
+) {}
+
+}  // namespace marlin
+
+#else
+
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+template <typename scalar_t>
+__device__ inline void
+mma(const typename ScalarType<scalar_t>::FragA& a_frag,
+    const typename ScalarType<scalar_t>::FragB& frag_b,
+    typename ScalarType<scalar_t>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  if constexpr (std::is_same<scalar_t, half>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else {
+    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+  }
+}
+
+template <typename scalar_t>
+__device__ inline void mma_trans(
+    const typename ScalarType<scalar_t>::FragA& a_frag,
+    const typename ScalarType<scalar_t>::FragB& frag_b,
+    const typename ScalarType<scalar_t>::FragB& frag_b2,
+    typename ScalarType<scalar_t>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  if constexpr (std::is_same<scalar_t, half>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(b[0]),
+          "r"(b2[0]),
+          "r"(b[1]),
+          "r"(b2[1]),
+          "r"(a[0]),
+          "r"(a[1]),
+          "f"(c[0]),
+          "f"(c[1]),
+          "f"(c[2]),
+          "f"(c[3]));
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(b[0]),
+          "r"(b2[0]),
+          "r"(b[1]),
+          "r"(b2[1]),
+          "r"(a[0]),
+          "r"(a[1]),
+          "f"(c[0]),
+          "f"(c[1]),
+          "f"(c[2]),
+          "f"(c[3]));
+  } else {
+    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+  }
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+template <int count, typename scalar_t>
+__device__ inline void ldsm(typename ScalarType<scalar_t>::FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  if constexpr (count == 4) {
+    asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+                 : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+                 : "r"(smem));
+  } else if constexpr (count == 2) {
+    asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(a[0]), "=r"(a[1]) : "r"(smem));
+  } else if constexpr (count == 1) {
+    asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" : "=r"(a[0]) : "r"(smem));
+  } else {
+    static_assert(count == 1 || count == 2 || count == 4, "invalid count");
+  }
+}
+
+// Multiply dequantized values by the corresponding quantization scale; used
+// only for grouped quantization.
+template <typename scalar_t>
+__device__ inline void
+scale(typename ScalarType<scalar_t>::FragB& frag_b, typename ScalarType<scalar_t>::FragS& frag_s, int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s = ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], s);
+  frag_b[1] = __hmul2(frag_b[1], s);
+}
+
+template <typename scalar_t>
+__device__ inline void scale_and_sub(typename ScalarType<scalar_t>::FragB& frag_b, scalar_t s, scalar_t zp) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s2 = ScalarType<scalar_t>::num2num2(s);
+  scalar_t2 zp2 = ScalarType<scalar_t>::num2num2(zp);
+  frag_b[0] = __hfma2(frag_b[0], s2, __hneg2(zp2));
+  frag_b[1] = __hfma2(frag_b[1], s2, __hneg2(zp2));
+}
+
+template <typename scalar_t>
+__device__ inline void
+sub_zp(typename ScalarType<scalar_t>::FragB& frag_b, typename ScalarType<scalar_t>::scalar_t2& frag_zp, int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 zp = ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_zp)[i]);
+  frag_b[0] = __hsub2(frag_b[0], zp);
+  frag_b[1] = __hsub2(frag_b[1], zp);
+}
+
+// Same as above, but for act_order (each K is multiplied individually)
+template <typename scalar_t>
+__device__ inline void scale4(
+    typename ScalarType<scalar_t>::FragB& frag_b,
+    typename ScalarType<scalar_t>::FragS& frag_s_1,
+    typename ScalarType<scalar_t>::FragS& frag_s_2,
+    typename ScalarType<scalar_t>::FragS& frag_s_3,
+    typename ScalarType<scalar_t>::FragS& frag_s_4,
+    int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s_val_1_2;
+  s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
+  s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];
+
+  scalar_t2 s_val_3_4;
+  s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
+  s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];
+
+  frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
+  frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
+}
+
+// Given 2 floats multiply by 2 scales (halves)
+template <typename scalar_t>
+__device__ inline void scale_float(float* c, typename ScalarType<scalar_t>::FragS& s) {
+  scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
+  c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
+  c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();
+}
+
+// Release barrier and increment visitation count.
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(lock), "r"(val));
+  }
+}
+
+// Wait until value of lock to be negative, and then add 1
+__device__ inline void wait_negative_and_add(int* lock) {
+  if (threadIdx.x == 0) {
+    int state = 0;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
+    while (state >= 0);
+    atomicAdd(lock, 1);
+  }
+  __syncthreads();
+}
+
+template <
+    typename scalar_t,                     // compute dtype, half or nv_float16
+    const sglang::ScalarTypeId w_type_id,  // weight ScalarType id
+    const int threads,                     // number of threads in a threadblock
+    const int thread_m_blocks,             // number of 16x16 blocks in the m
+                                           // dimension (batchsize) of the
+                                           // threadblock
+    const int thread_n_blocks,             // same for n dimension (output)
+    const int thread_k_blocks,             // same for k dimension (reduction)
+    const bool m_block_size_8,             // whether m_block_size == 8
+                                           // only works when thread_m_blocks == 1
+    const int stages,                      // number of stages for the async global->shared
+                                           // fetch pipeline
+    const int group_blocks,                // number of consecutive 16x16 blocks
+                                           // with a separate quantization scale
+    const bool is_zp_float                 // is zero point of float16 type?
+    >
+__global__ void Marlin(
+    const int4* __restrict__ A,               // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,               // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,                     // fp16 output buffer of shape mxn
+    int4* __restrict__ C_tmp,                 // fp32 tmp output buffer (for reduce)
+    const int4* __restrict__ scales_ptr,      // fp16 quantization scales of shape
+                                              // (k/groupsize)xn
+    const uint16_t* __restrict__ scale2_ptr,  // fp16 global scale (for nvfp4
+                                              // only)
+    const int4* __restrict__ zp_ptr,          // 4bit packed zero-points of shape
+                                              // (k/groupsize)x(n/pack_factor)
+    const int* __restrict__ g_idx,            // int32 group indices of shape k
+    int num_groups,                           // number of scale groups per output channel
+    int prob_m,                               // batch dimension m
+    int prob_n,                               // output dimension n
+    int prob_k,                               // reduction dimension k
+    int lda,                                  // A.stride(0), equal to prob_k is A is contiguous
+    int* locks,                               // extra global storage for barrier synchronization
+    bool use_atomic_add,                      // whether to use atomic add to reduce
+    bool use_fp32_reduce,                     // whether to use fp32 global reduce
+    int max_shared_mem) {
+  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
+  // same size, which might involve multiple column "slices" (of width 16 *
+  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
+  // example:
+  //   0 1 3
+  //   0 2 3
+  //   1 2 4
+  // While this kind of partitioning makes things somewhat more complicated, it
+  // ensures good utilization of all SMs for many kinds of shape and GPU
+  // configurations, while requiring as few slow global cross-threadblock
+  // reductions as possible.
+  using Dtype = ScalarType<scalar_t>;
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  using FragA = typename ScalarType<scalar_t>::FragA;
+  using FragB = typename ScalarType<scalar_t>::FragB;
+  using FragC = typename ScalarType<scalar_t>::FragC;
+  using FragS = typename ScalarType<scalar_t>::FragS;
+  using FragZP = typename ScalarType<scalar_t>::FragZP;
+
+  static constexpr auto w_type = sglang::ScalarType::from_id(w_type_id);
+  constexpr bool has_zp = w_type == sglang::kU4 || w_type == sglang::kU8;
+  constexpr bool is_int_type =
+      w_type == sglang::kU4 || w_type == sglang::kU8 || w_type == sglang::kU4B8 || w_type == sglang::kU8B128;
+  // see comments of dequant.h for more details
+  constexpr bool dequant_skip_flop = !is_int_type ||
+                                     has_zp && !is_zp_float && !std::is_same<scalar_t, nv_bfloat16>::value ||
+                                     has_zp && !is_zp_float && !(w_type == sglang::kU8);
+
+  scalar_t2 global_scale;
+
+  if constexpr (w_type == sglang::kFE2M1f) {
+    uint16_t val = scale2_ptr[0];
+    global_scale = Dtype::num2num2(*reinterpret_cast<scalar_t*>(&val));
+  }
+
+  constexpr bool has_act_order = group_blocks == 0;
+  constexpr int m_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
+
+  constexpr int pack_factor = 32 / w_type.size_bits();
+  static_assert(thread_m_blocks == 1 || !m_block_size_8);
+
+  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
+  // better partitioning with less reductions
+  int parallel = 1;
+  if (prob_m > m_block_size) {
+    parallel = prob_m / m_block_size;
+    prob_m = m_block_size;
+  }
+
+  int k_tiles = prob_k / 16 / thread_k_blocks;
+  int n_tiles = prob_n / 16 / thread_n_blocks;
+  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);
+
+  if constexpr (!has_act_order && group_blocks != -1) {
+    if (group_blocks >= thread_k_blocks) {
+      // Ensure that the number of tiles in each stripe is a multiple of the
+      // groupsize; this avoids an annoying special case where a stripe starts
+      // in the middle of group.
+      iters = (group_blocks / thread_k_blocks) * div_ceil(iters, (group_blocks / thread_k_blocks));
+    }
+  }
+
+  int slice_row = (iters * blockIdx.x) % k_tiles;
+  int slice_col_par = (iters * blockIdx.x) / k_tiles;
+  int slice_col = slice_col_par;
+  int slice_iters;      // number of threadblock tiles in the current slice
+  int slice_count = 0;  // total number of active threadblocks in the current slice
+  int slice_idx;        // index of threadblock in current slice; numbered bottom to
+                        // top
+
+  int par_id = 0;
+  int locks_off = 0;
+
+  // We can easily implement parallel problem execution by just remapping
+  // indices and advancing global pointers
+  if (slice_col_par >= n_tiles) {
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * lda / 8;
+    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
+    slice_col = slice_col_par % n_tiles;
+    par_id = slice_col_par / n_tiles;
+  }
+  if (parallel * n_tiles >= gridDim.x) {
+    // when parallel * n_tiles >= sms
+    // then there are at most $sms$ conflict tile blocks
+    locks_off = blockIdx.x;
+  } else {
+    locks_off = (iters * blockIdx.x) / k_tiles - 1;
+  }
+
+  // Compute all information about the current slice which is required for
+  // synchronization.
+  auto init_slice = [&](bool first_init = false) {
+    slice_iters = iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
+    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
+    if (slice_iters == 0) return;
+    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
+    slice_count = 1;
+    slice_idx = 0;
+    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
+    if (col_first <= k_tiles * (slice_col_par + 1)) {
+      int col_off = col_first - k_tiles * slice_col_par;
+      slice_count = div_ceil(k_tiles - col_off, iters);
+      if (col_off > 0) slice_count++;
+      int delta_first = iters * blockIdx.x - col_first;
+      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
+        slice_idx = slice_count - 1;
+      else {
+        slice_idx = slice_count - 1 - delta_first / iters;
+        if (col_off > 0) slice_idx--;
+      }
+    }
+    if (parallel * n_tiles >= gridDim.x) {
+      if (slice_count > 1 && slice_idx == slice_count - 1) {
+        locks_off++;
+      }
+    } else {
+      locks_off++;
+    }
+
+    if (first_init && use_atomic_add && slice_count > 1 && slice_idx == 0) {
+      constexpr int threads_per_m = 16 * thread_n_blocks / 8;
+      int m_per_thread = div_ceil(thread_m_blocks * 16, threads / threads_per_m);
+      if (m_block_size_8) m_per_thread = div_ceil(8, threads / threads_per_m);
+      for (int i = 0; i < m_per_thread; i++) {
+        int row = threads / threads_per_m * i + threadIdx.x / threads_per_m;
+        if (row < prob_m) {
+          int col = slice_col * 16 * thread_n_blocks / 8 + threadIdx.x % threads_per_m;
+          C[row * prob_n / 8 + col] = {0, 0, 0, 0};
+        }
+      }
+      // After write zero to output, write a negative value to lock.
+      // Every SM that processes the same slice would wait for
+      // the negative value, and then atomicAdd 1 to it.
+      // After all SMs are processed, the lock value would back to 0 again.
+      __syncthreads();
+      if (threadIdx.x == 0) locks[locks_off] = 1 - slice_count;
+    }
+
+    if (slice_col == n_tiles) {
+      A += 16 * thread_m_blocks * lda / 8;
+      C += 16 * thread_m_blocks * prob_n / 8;
+      slice_col = 0;
+      par_id++;
+    }
+  };
+  init_slice(true);
+
+  // A sizes/strides
+
+  // stride of the A matrix in global memory
+  int a_gl_stride = lda / 8;
+  // stride of an A matrix tile in shared memory
+  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
+  // delta between subsequent A tiles in global memory
+  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
+  // between subsequent accesses within a tile
+  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory writes
+  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory tile reads
+  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
+  // within a shared memory tile
+  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
+  // overall size of a tile
+  constexpr int a_sh_stage = a_sh_stride * m_block_size;
+  // number of shared write iterations for a tile
+  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);
+
+  // B sizes/strides
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+
+  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
+  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
+
+  // Scale sizes/strides without act_order
+  int s_gl_stride = prob_n / 8;
+  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+  constexpr int s_tb_groups = !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
+                                  ? thread_k_blocks / group_blocks / (w_type == sglang::kFE2M1f ? 2 : 1)
+                                  : 1;
+  constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
+  int s_gl_rd_delta = s_gl_stride;
+
+  // Scale size/strides with act_order
+  constexpr int tb_k = 16 * thread_k_blocks;
+  constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
+  // constexpr int act_s_row_stride      = 1;
+  // int           act_s_col_stride      = act_s_row_stride * num_groups;
+  constexpr int act_s_max_num_groups = 32;
+  int act_s_col_stride = 1;
+  int act_s_col_warp_stride = act_s_col_stride * 8;
+
+  int tb_n_warps = thread_n_blocks / 4;
+  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
+
+  // Zero-points sizes/strides
+  int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4;
+  constexpr int zp_sh_stride = is_zp_float ? 16 * thread_n_blocks / 8 : ((16 * thread_n_blocks) / pack_factor) / 4;
+  constexpr int zp_tb_groups = s_tb_groups;
+  constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0;
+  int zp_gl_rd_delta = zp_gl_stride;
+
+  // Global A read index of current thread.
+  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o);
+  a_gl_rd += a_gl_rd_delta_o * slice_row;
+  // Shared write index of current thread.
+  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o);
+  // Shared read index.
+  int a_sh_rd = a_sh_stride * ((threadIdx.x % 32) % (16 / (m_block_size_8 ? 2 : 1))) +
+                (threadIdx.x % 32) / (16 / (m_block_size_8 ? 2 : 1));
+  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
+  b_gl_rd += b_sh_stride * slice_col;
+  b_gl_rd += b_gl_rd_delta_o * slice_row;
+  auto b_sh_wr = threadIdx.x * b_thread_vecs;
+  auto b_sh_rd = threadIdx.x * b_thread_vecs;
+
+  // For act_order
+  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
+  int slice_k_start = tb_k * slice_row;
+  int slice_k_finish = slice_k_start + tb_k * slice_iters;
+  int slice_k_start_shared_fetch = slice_k_start;
+  int slice_n_offset = act_s_col_tb_stride * slice_col;
+
+  // No act_order
+  int s_gl_rd;
+  if constexpr (!has_act_order) {
+    if constexpr (group_blocks == -1) {
+      s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+    } else {
+      s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) / (w_type == sglang::kFE2M1f ? 2 : 1) +
+                s_sh_stride * slice_col + threadIdx.x;
+    }
+  }
+  auto s_sh_wr = threadIdx.x;
+  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+
+  // Zero-points
+  int zp_gl_rd;
+  if constexpr (has_zp) {
+    if constexpr (group_blocks == -1) {
+      zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
+    } else {
+      zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + zp_sh_stride * slice_col + threadIdx.x;
+    }
+  }
+  auto zp_sh_wr = threadIdx.x;
+  bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
+
+  // We use a different scale layout for grouped and column-wise quantization as
+  // we scale a `half2` tile in column-major layout in the former and in
+  // row-major in the latter case.
+  int s_sh_rd;
+  if constexpr (group_blocks != -1 && w_type == sglang::kFE2M1f) {
+    auto warp_id = threadIdx.x / 32;
+    int n_warps = thread_n_blocks / 4;
+    int warp_row = warp_id / n_warps;
+
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4;
+    s_sh_rd = s_sh_rd * 2 + warp_row % 2;
+
+  } else if constexpr (group_blocks != -1)
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4;
+  else if constexpr (group_blocks == -1 && (m_block_size_8 || (has_zp && !dequant_skip_flop)))
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 8;
+  else
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4;
+
+  // Zero-points have the same read layout as the scales
+  // (without column-wise case)
+  constexpr int num_col_threads = 8;
+  constexpr int num_row_threads = 4;
+  constexpr int num_ints_per_thread = 8 / pack_factor;
+  int zp_sh_rd;
+  if constexpr (has_zp) {
+    if constexpr (is_zp_float) {
+      if constexpr (group_blocks != -1) {
+        zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4;
+      }
+    } else {
+      zp_sh_rd = num_ints_per_thread * num_col_threads * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                 num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
+    }
+  }
+
+  // Precompute which thread should not read memory in which iterations; this is
+  // needed if there are more threads than required for a certain tilesize or
+  // when the batchsize is not a multiple of 16.
+  bool a_sh_wr_pred[a_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
+
+  // To ensure that writing and reading A tiles to/from shared memory, the
+  // latter in fragment format, is fully bank conflict free, we need to use a
+  // rather fancy XOR-based layout. The key here is that neither reads nor
+  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
+  // same shared memory banks. Further, it seems (based on NSight-Compute) that
+  // each warp must also write a consecutive memory segment?
+  auto transform_a = [&](int i) {
+    int row = i / a_gl_rd_delta_o;
+    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ (row % 8);
+  };
+  // Since the computation of this remapping is non-trivial and, due to our main
+  // loop unrolls, all shared memory accesses are static, we simply precompute
+  // both transformed reads and writes.
+  int a_sh_wr_trans[a_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
+  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
+#pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++) {
+#pragma unroll
+    for (int j = 0; j < thread_m_blocks; j++)
+      a_sh_rd_trans[i][j] = transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+  }
+
+  // Since B-accesses have non-constant stride they have to be computed at
+  // runtime; we break dependencies between subsequent accesses with a tile by
+  // maintining multiple pointers (we have enough registers), a tiny
+  // optimization.
+  const int4* B_ptr[b_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++)
+    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
+
+  extern __shared__ int4 sh[];
+  // Shared memory storage for global fetch pipelines.
+  constexpr int sh_red_size = (2 * thread_n_blocks + 1) * 16 * thread_m_blocks;
+  constexpr int sh_b_size = stages * b_sh_stage;
+  int4* sh_b = sh;
+  int4* sh_red = sh;
+  int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
+  int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
+  constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride) : (stages * s_sh_stage);
+  int4* sh_s = sh_zp + (stages * zp_sh_stage);
+  // shared memory reused by reduction should be smaller than
+  // shared memory used by weight.
+  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= stages * b_sh_stage);
+  int4* sh_a = sh_s + sh_s_size;
+  // constexpr int shm_size_used =
+  //     stages * (g_idx_stage + zp_sh_stage) + sh_s_size +
+  //     (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
+
+  // Register storage for double buffer of shared memory reads.
+  FragA frag_a[2][thread_m_blocks];
+  I4 frag_b_quant[2][b_thread_vecs];
+  FragC frag_c[thread_m_blocks][4][2];
+  FragS frag_s[2][4];                    // No act-order
+  FragS act_frag_s[2][4][4];             // For act-order
+  int frag_qzp[2][num_ints_per_thread];  // Zero-points
+  FragZP frag_zp;                        // Zero-points in fp16
+  FragZP frag_zpf[2];                    // Zero-points in fp16 in HQQ
+
+  // Zero accumulators.
+  auto zero_accums = [&]() {
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float*>(frag_c)[i] = 0;
+  };
+
+  int sh_first_group_id = -1;
+  int sh_num_groups = -1;
+
+  auto fetch_act_order_scales_to_shared = [&](bool is_async, int first_group_id, int last_group_id) {
+    sh_first_group_id = first_group_id;
+    sh_num_groups = last_group_id - first_group_id + 1;
+
+    if (sh_num_groups > act_s_max_num_groups) {
+      sh_num_groups = act_s_max_num_groups;
+    }
+
+    if (sh_first_group_id + sh_num_groups > num_groups) {
+      sh_num_groups = num_groups - sh_first_group_id;
+    }
+
+    int row_offset = first_group_id * s_gl_stride;
+
+    if (is_async) {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          cp_async4_pred(
+              &sh_s[(i * s_sh_stride) + threadIdx.x],
+              &scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + threadIdx.x]);
+        }
+      }
+    } else {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          sh_s[(i * s_sh_stride) + threadIdx.x] =
+              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + threadIdx.x];
+        }
+      }
+    }
+  };
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+#pragma unroll
+      for (int i = 0; i < a_sh_wr_iters; i++) {
+        cp_async4_pred(
+            &sh_a_stage[a_sh_wr_trans[i]],
+            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
+            a_sh_wr_pred[i]);
+      }
+      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+#pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+#pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
+        }
+
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+
+      if constexpr (has_act_order) {
+        // Fetch g_idx thread-block portion
+        int full_pipe = a_off;
+        int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
+        if (cur_k < prob_k && cur_k < slice_k_finish) {
+          int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+
+          int4 const* cur_g_idx_stage_ptr = reinterpret_cast<int4 const*>(&g_idx[cur_k]);
+
+          if (threadIdx.x < g_idx_stage) {
+            cp_async4_pred(&sh_g_idx_stage[threadIdx.x], &cur_g_idx_stage_ptr[threadIdx.x]);
+          }
+        }
+      } else {
+        if constexpr (group_blocks != -1) {
+          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+
+          if constexpr (group_blocks >= thread_k_blocks) {
+            // Only fetch scales if this tile starts a new group
+            if (pipe % (group_blocks / thread_k_blocks) == 0) {
+              if (s_sh_wr_pred) {
+                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          } else {
+            for (int i = 0; i < s_tb_groups; i++) {
+              if (s_sh_wr_pred) {
+                cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          }
+        }
+
+        if constexpr (has_zp && group_blocks != -1) {
+          int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+          if constexpr (group_blocks >= thread_k_blocks) {
+            // Only fetch zero-points if this tile starts a new group
+            if (pipe % (group_blocks / thread_k_blocks) == 0) {
+              if (zp_sh_wr_pred) {
+                cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]);
+              }
+              zp_gl_rd += zp_gl_rd_delta;
+            }
+          } else {
+            for (int i = 0; i < zp_tb_groups; i++) {
+              if (zp_sh_wr_pred) {
+                cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], &zp_ptr[zp_gl_rd]);
+              }
+              zp_gl_rd += zp_gl_rd_delta;
+            }
+          }
+        }
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+
+  auto fetch_col_zp_to_shared = [&]() {
+    if (zp_sh_wr_pred) {
+      cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]);
+    }
+  };
+
+  auto fetch_col_scale_to_shared = [&]() {
+    if (s_sh_wr_pred) {
+      cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+    }
+  };
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++)
+      ldsm<m_block_size_8 ? 2 : 4, scalar_t>(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
+    int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+
+#pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(&sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
+  };
+
+  bool is_same_group[stages];
+  int same_group_id[stages];
+
+  auto init_same_group = [&](int pipe) {
+    if constexpr (!has_act_order) {
+      return;
+    }
+
+    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
+
+    int group_id_1 = sh_g_idx_int_ptr[0];
+    int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];
+
+    is_same_group[pipe] = group_id_1 == group_id_2;
+    same_group_id[pipe] = group_id_1;
+  };
+
+  auto fetch_scales_to_registers = [&](int k, int full_pipe) {
+    int pipe = full_pipe % stages;
+
+    if constexpr (!has_act_order) {
+      // No act-order case
+      if constexpr (group_blocks == -1) {
+        // load only when starting a new slice
+        if (k == 0 && full_pipe == 0) {
+          reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd];
+          reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+        }
+      } else if constexpr (group_blocks != -1) {
+        if constexpr (group_blocks >= thread_k_blocks) {
+          if (k % b_sh_wr_iters == 0) {
+            int4* sh_s_stage =
+                sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks)));
+            reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+          } else {
+            reinterpret_cast<int4*>(&frag_s[1])[0] = reinterpret_cast<int4*>(&frag_s[0])[0];
+          }
+        } else {
+          auto warp_id = threadIdx.x / 32;
+          int n_warps = thread_n_blocks / 4;
+
+          int warp_row = warp_id / n_warps;
+
+          int cur_k = warp_row * 16;
+          cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+          int k_blocks = cur_k / 16;
+          int cur_group_id = k_blocks / (group_blocks * (w_type == sglang::kFE2M1f ? 2 : 1));
+
+          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+
+          if constexpr (w_type_id != sglang::kFE2M1f.id()) {
+            reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
+          } else {
+            reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
+                reinterpret_cast<int2*>(sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
+          }
+        }
+      }
+
+      return;
+    }
+
+    // Act-order case
+
+    // Determine K of the "current" thread-block
+    int cur_k = slice_k_start + tb_k * full_pipe;
+    if (cur_k >= prob_k || cur_k >= slice_k_finish) {
+      return;
+    }
+
+    // Reset (to current thread-block) since we read g_idx portion from the
+    // shared memory
+    cur_k = 0;
+
+    // Progress to current iteration
+    cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+    // Determine "position" inside the thread-block (based on warp and
+    // thread-id)
+    auto warp_id = threadIdx.x / 32;
+    int n_warps = thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N
+
+    int warp_row = warp_id / n_warps;
+    int warp_col = warp_id % n_warps;
+
+    cur_k += warp_row * 16;
+
+    auto th_id = threadIdx.x % 32;
+    cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix
+
+    int s_col_shift =
+        /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + (th_id / 4) * act_s_col_stride;
+
+    if (is_same_group[pipe]) {
+      if (k % 2 == 0) {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
+            sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + s_col_shift];
+      } else {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
+            *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
+      }
+
+      for (int i = 1; i < 4; i++) {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) = *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
+      }
+      return;
+    }
+
+    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
+
+    constexpr int k_frag_offsets[4] = {0, 1, 8, 9};  // Tensor core offsets per thread
+
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+      int actual_k = cur_k + k_frag_offsets[i];
+
+      int group_id = sh_g_idx_int_ptr[actual_k];
+      int rel_group_id = group_id - sh_first_group_id;
+
+      *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) = sh_s[rel_group_id * s_sh_stride + s_col_shift];
+    }
+  };
+
+  auto fetch_zp_to_registers = [&](int k, int full_pipe) {
+    // This code does not handle group_blocks == 0,
+    // which signifies act_order.
+    // has_zp implies AWQ, which doesn't have act_order,
+    static_assert(!has_zp || group_blocks != 0);
+
+    if constexpr (has_zp && !is_zp_float) {
+      int pipe = full_pipe % stages;
+
+      if constexpr (group_blocks == -1) {
+        // load only when starting a new slice
+        if (k == 0 && full_pipe == 0) {
+#pragma unroll
+          for (int i = 0; i < num_ints_per_thread; i++) {
+            frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp))[zp_sh_rd + i];
+          }
+        }
+
+      } else if constexpr (group_blocks >= thread_k_blocks) {
+        if (k % b_sh_wr_iters == 0) {
+          int4* sh_zp_stage =
+              sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks)));
+#pragma unroll
+          for (int i = 0; i < num_ints_per_thread; i++) {
+            frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
+          }
+        }
+      } else {
+        auto warp_id = threadIdx.x / 32;
+        int n_warps = thread_n_blocks / 4;
+
+        int warp_row = warp_id / n_warps;
+
+        int cur_k = warp_row * 16;
+        cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+        int k_blocks = cur_k / 16;
+        int cur_group_id = 0;
+
+        // Suppress bogus and persistent divide-by-zero warning
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress divide_by_zero
+        cur_group_id = k_blocks / group_blocks;
+#pragma nv_diagnostic pop
+
+        int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+        sh_zp_stage += cur_group_id * zp_sh_stride;
+
+#pragma unroll
+        for (int i = 0; i < num_ints_per_thread; i++) {
+          frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
+        }
+      }
+    }
+
+    else if constexpr (has_zp && is_zp_float) {
+      int pipe = full_pipe % stages;
+
+      if constexpr (group_blocks != -1) {
+        if constexpr (group_blocks >= thread_k_blocks) {
+          if (k % b_sh_wr_iters == 0) {
+            int4* sh_zp_stage =
+                sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks)));
+            reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd];
+          }
+        } else {
+          auto warp_id = threadIdx.x / 32;
+          int n_warps = thread_n_blocks / 4;
+
+          int warp_row = warp_id / n_warps;
+
+          int cur_k = warp_row * 16;
+          cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+          int k_blocks = cur_k / 16;
+          // Suppress bogus and persistent divide-by-zero warning
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress divide_by_zero
+          int cur_group_id = k_blocks / group_blocks;
+#pragma nv_diagnostic pop
+
+          int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+          reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride];
+        }
+      }
+    }
+  };
+
+  auto dequant_data = [&](int q, scalar_t2* frag_b_ptr) {
+    dequant<scalar_t2, w_type_id, dequant_skip_flop>(q, frag_b_ptr);
+  };
+
+  // Execute the actual tensor core matmul of a sub-tile.
+  bool is_first_matmul_in_slice = true;
+  auto matmul = [&](int k) {
+    int k2 = k % 2;
+    const bool is_new_zp = ((group_blocks != -1) && (group_blocks < thread_k_blocks || k == 0)) ||
+                           (group_blocks == -1 && is_first_matmul_in_slice);
+    if constexpr (has_zp && !is_zp_float) {
+      if (is_new_zp) {
+        if constexpr (group_blocks == -1) is_first_matmul_in_slice = false;
+        FragB frag_zp_0;
+        FragB frag_zp_1;
+        int zp_quant_0, zp_quant_1;
+
+        if constexpr (w_type.size_bits() == 4) {
+          zp_quant_0 = frag_qzp[k2][0];
+          zp_quant_1 = zp_quant_0 >> 8;
+        } else {
+          static_assert(w_type.size_bits() == 8);
+          zp_quant_0 = frag_qzp[k2][0];
+          zp_quant_1 = frag_qzp[k2][1];
+        }
+
+        dequant_data(zp_quant_0, reinterpret_cast<scalar_t2*>(&frag_zp));
+        dequant_data(zp_quant_1, reinterpret_cast<scalar_t2*>(&frag_zp) + 2);
+      }
+    }
+    if constexpr (!dequant_skip_flop && has_zp && is_zp_float) {
+      if (is_new_zp) {
+        reinterpret_cast<int4*>(&frag_zp)[0] = reinterpret_cast<int4*>(&frag_zpf[k2])[0];
+      }
+    }
+
+    if constexpr (w_type == sglang::kFE2M1f) {
+      int s_quant_0 = reinterpret_cast<int*>(frag_s[k2])[0];
+      int s_quant_1 = reinterpret_cast<int*>(frag_s[k2])[1];
+
+      dequant_fp8_scales<scalar_t2>(s_quant_0, reinterpret_cast<scalar_t2*>(&frag_s[k2]));
+      dequant_fp8_scales<scalar_t2>(s_quant_1, reinterpret_cast<scalar_t2*>(&frag_s[k2]) + 2);
+    }
+
+// We have the m dimension as the inner loop in order to encourage overlapping
+// dequantization and matmul operations.
+#pragma unroll
+    for (int j = 0; j < 4; j++) {
+      FragB frag_b0;
+      FragB frag_b1;
+      int b_quant_0, b_quant_1;
+
+      if constexpr (w_type_id == sglang::kFE2M1f.id()) {
+        b_quant_1 = frag_b_quant[k2][0][j];
+        b_quant_0 = b_quant_1 << 8;
+      } else if constexpr (w_type.size_bits() == 4) {
+        b_quant_0 = frag_b_quant[k2][0][j];
+        b_quant_1 = b_quant_0 >> 8;
+      } else {
+        static_assert(w_type.size_bits() == 8);
+        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k2]);
+        b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+        b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+      }
+
+      dequant_data(b_quant_0, reinterpret_cast<scalar_t2*>(&frag_b0));
+      dequant_data(b_quant_1, reinterpret_cast<scalar_t2*>(&frag_b1));
+
+      if constexpr (dequant_skip_flop && has_zp && !is_zp_float) {
+        sub_zp<scalar_t>(frag_b0, frag_zp[j], 0);
+        sub_zp<scalar_t>(frag_b1, frag_zp[j], 1);
+      }
+
+      // Apply scale to frag_b0
+      if constexpr (has_act_order) {
+        static_assert(group_blocks != -1);
+        scale4<scalar_t>(
+            frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j], act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0);
+        scale4<scalar_t>(
+            frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j], act_frag_s[k2][2][j], act_frag_s[k2][3][j], 1);
+      } else if constexpr (!dequant_skip_flop && has_zp && !is_zp_float && group_blocks == -1) {
+        int idx = (threadIdx.x / 4) % 2;
+        scalar_t2 s2 = Dtype::nums2num2(
+            reinterpret_cast<scalar_t*>(&frag_s[j / 2][j % 2 * 2 + 0])[idx],
+            reinterpret_cast<scalar_t*>(&frag_s[j / 2][j % 2 * 2 + 1])[idx]);
+        if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], s2);
+        scale_and_sub<scalar_t>(frag_b0, s2.x, frag_zp[j].x);
+        scale_and_sub<scalar_t>(frag_b1, s2.y, frag_zp[j].y);
+      } else if constexpr (!dequant_skip_flop && has_zp && group_blocks != -1) {
+        if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], *reinterpret_cast<scalar_t2*>(&frag_s[k2][j]));
+        scale_and_sub<scalar_t>(frag_b0, frag_s[k2][j][0].x, frag_zp[j].x);
+        scale_and_sub<scalar_t>(frag_b1, frag_s[k2][j][0].y, frag_zp[j].y);
+      } else if constexpr (group_blocks != -1) {
+        scale<scalar_t>(frag_b0, frag_s[k2][j], 0);
+        scale<scalar_t>(frag_b1, frag_s[k2][j], 1);
+      }
+
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        if constexpr (m_block_size_8) {
+          mma_trans<scalar_t>(frag_a[k2][i], frag_b0, frag_b1, frag_c[i][j][0]);
+        } else {
+          mma<scalar_t>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
+          mma<scalar_t>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
+        }
+      }
+    }
+  };
+
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location; which we have to reduce over in the end. We do in shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
+    if (red_off >= 1) {
+      auto red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + (threadIdx.x % b_sh_stride_threads);
+
+      // Parallel logarithmic shared memory reduction. We make sure to avoid any
+      // unnecessary read or write iterations, e.g., for two warps we write only
+      // once by warp 1 and read only once by warp 0.
+
+#pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+#pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+#pragma unroll
+            for (int j = 0; j < 4 * 2; j += (m_block_size_8 ? 2 : 1)) {
+              int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float* c_rd = reinterpret_cast<float*>(&sh_red[red_sh_delta * j + red_sh_rd]);
+                float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
+#pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k];
+              }
+              sh_red[red_sh_wr] = reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+#pragma unroll
+          for (int i = 0; i < 4 * 2; i += (m_block_size_8 ? 2 : 1)) {
+            float* c_rd = reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] += c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped
+  // partitioning minimizes the number of such reductions and our outputs are
+  // usually rather small, we perform this reduction serially in L2 cache.
+  auto global_reduce_fp16 = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    if (threadIdx.x < active_threads) {
+      int c_gl_stride = prob_n / 8;
+      int c_gl_wr_delta_o = 8 * c_gl_stride;
+      int c_gl_wr_delta_i = 4 * (active_threads / 32);
+      int c_gl_wr;
+      if constexpr (m_block_size_8) {
+        c_gl_wr = c_gl_stride * ((threadIdx.x % 4) * 2) + 4 * (threadIdx.x / 32) + (threadIdx.x % 32) / 8;
+        c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      } else {
+        c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + 4 * (threadIdx.x / 32) + threadIdx.x % 4;
+        c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      }
+      constexpr int c_sh_wr_delta = active_threads;
+      auto c_sh_wr = threadIdx.x;
+
+      int row = (threadIdx.x % 32) / 4;
+
+      if (!first) {
+// Interestingly, doing direct global accesses here really seems to mess up
+// the compiler and lead to slowdowns, hence we also use async-copies even
+// though these fetches are not actually asynchronous.
+#pragma unroll
+        for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) {
+          if constexpr (m_block_size_8) {
+            cp_async4_pred(
+                &sh_red[c_sh_wr + c_sh_wr_delta * i],
+                &C[c_gl_wr + i * c_gl_stride + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i],
+                (threadIdx.x % 4) * 2 + i < prob_m);
+          } else {
+            cp_async4_pred(
+                &sh_red[c_sh_wr + c_sh_wr_delta * i],
+                &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)],
+                i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
+          }
+        }
+        cp_async_fence();
+        cp_async_wait<0>();
+      }
+
+#pragma unroll
+      for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) {
+        bool mask = (!m_block_size_8) && (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) ||
+                    (m_block_size_8) && ((threadIdx.x % 4) * 2 + i < prob_m);
+        if (mask) {
+          if (!first) {
+            int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
+#pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              int delta = 0;
+              if constexpr (m_block_size_8) {
+                delta = j % 2 == 1 ? -2 : 0;
+              }
+              reinterpret_cast<float*>(&frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta] +=
+                  Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
+            }
+          }
+          if (!last) {
+            int4 c;
+#pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              int delta = 0;
+              if constexpr (m_block_size_8) {
+                delta = j % 2 == 1 ? -2 : 0;
+              }
+              reinterpret_cast<scalar_t*>(&c)[j] =
+                  Dtype::float2num(reinterpret_cast<float*>(&frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta]);
+            }
+            if constexpr (m_block_size_8)
+              C[c_gl_wr + i * c_gl_stride + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i] = c;
+            else
+              C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = c;
+          }
+        }
+      }
+    }
+  };
+
+  // Globally reduce over threadblocks that compute the same column block.
+  // We use a tmp C buffer to reduce in full fp32 precision.
+  auto global_reduce_fp32 = [&](bool first = false, bool last = false) {
+    constexpr int tb_m = thread_m_blocks * 16;
+    constexpr int tb_n = thread_n_blocks * 16;
+
+    constexpr int c_size = tb_m * tb_n * sizeof(float) / 16;
+
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    bool is_th_active = threadIdx.x < active_threads;
+
+    constexpr int num_floats = thread_m_blocks * 4 * 2 * 4;
+    constexpr int th_size = num_floats * sizeof(float) / 16;
+
+    int c_cur_offset = locks_off * c_size;
+
+    if (!is_th_active) {
+      return;
+    }
+
+    if (!first) {
+      float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
+#pragma unroll
+      for (int k = 0; k < th_size; k += (m_block_size_8 ? 2 : 1)) {
+        sh_red[threadIdx.x] = C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
+
+        float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
+#pragma unroll
+        for (int f = 0; f < 4; f++) {
+          frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
+        }
+      }
+    }
+
+    if (!last) {
+      int4* frag_c_ptr = reinterpret_cast<int4*>(&frag_c);
+#pragma unroll
+      for (int k = 0; k < th_size; k += (m_block_size_8 ? 2 : 1)) {
+        C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k];
+      }
+    }
+  };
+
+  // Write out the reduce final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step, the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+    constexpr int c_sh_rd_delta = c_sh_stride * (threads / (2 * thread_n_blocks));
+
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    int c_sh_wr;
+    if constexpr (m_block_size_8) {
+      c_sh_wr = (8 * c_sh_stride) * ((threadIdx.x % 32) % 4 * 2) + (threadIdx.x % 32) / 4;
+      c_sh_wr += 64 * (threadIdx.x / 32);
+    } else {
+      c_sh_wr = (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
+      c_sh_wr += 32 * (threadIdx.x / 32);
+    }
+
+    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + (threadIdx.x % (2 * thread_n_blocks));
+
+    int c_gl_wr_end = c_gl_stride * prob_m;
+    // We first reorder in shared memory to guarantee the most efficient final
+    // global write patterns
+    auto write = [&](int idx, float c0, float c1, FragS& s) {
+      scalar_t2 res = Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));
+
+      // For per-column quantization we finally apply the scale here (only for
+      // 4-bit)
+      if constexpr (
+          !has_act_order && group_blocks == -1 && w_type.size_bits() == 4 && (has_zp && dequant_skip_flop || !has_zp)) {
+        res = __hmul2(res, s[0]);
+      }
+
+      if constexpr (w_type == sglang::kFE2M1f) {
+        res = __hmul2(res, global_scale);
+      }
+
+      if constexpr (m_block_size_8) {
+        ((scalar_t*)sh_red)[idx] = res.x;
+        ((scalar_t*)sh_red)[idx + 8 * c_sh_stride] = res.y;
+      } else {
+        ((scalar_t2*)sh_red)[idx] = res;
+      }
+    };
+
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+#pragma unroll
+        for (int j = 0; j < 4; j++) {
+          if constexpr (m_block_size_8) {
+            int wr = c_sh_wr + 16 * j;
+            write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+            write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 1]);
+          } else {
+            int wr = c_sh_wr + 8 * j;
+            write(
+                wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+            write(
+                wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+            write(
+                wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+            write(
+                wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+          }
+        }
+        c_sh_wr += 16 * (4 * c_sh_stride);
+      }
+    }
+    __syncthreads();
+
+#pragma unroll
+    for (int i = 0; i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) {
+      if (c_gl_wr < c_gl_wr_end) {
+        if (use_atomic_add && slice_count > 1) {
+          scalar_t2* C_half2 = reinterpret_cast<scalar_t2*>(&C[c_gl_wr]);
+          scalar_t2* sh_red_half2 = reinterpret_cast<scalar_t2*>(&sh_red[c_sh_rd]);
+#pragma unroll
+          for (int a = 0; a < 4; a++) {
+            atomicAdd(&C_half2[a], sh_red_half2[a]);
+          }
+        } else {
+          C[c_gl_wr] = sh_red[c_sh_rd];
+        }
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+    __syncthreads();
+  };
+
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+
+#pragma unroll
+    for (int i = 0; i < stages - 1; i++) {
+      if (has_act_order && i == 0) {
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        fetch_act_order_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
+      }
+
+      if constexpr (has_zp && !is_zp_float && group_blocks == -1) {
+        if (i == 0) {
+          fetch_col_zp_to_shared();
+          if constexpr (!dequant_skip_flop) {
+            fetch_col_scale_to_shared();
+          }
+        }
+      }
+      fetch_to_shared(i, i, i < slice_iters);
+    }
+
+    zero_accums();
+    wait_for_stage();
+    init_same_group(0);
+    fetch_to_registers(0, 0);
+    fetch_scales_to_registers(0, 0);
+    fetch_zp_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+    if constexpr (has_act_order) {
+      slice_k_start_shared_fetch += tb_k * (stages - 1);
+    }
+  };
+  if (slice_iters) {
+    start_pipes();
+  }
+
+  // Main loop.
+  while (slice_iters) {
+    // We unroll over both the global fetch and the register load pipeline to
+    // ensure all shared memory accesses are static. Note that both pipelines
+    // have even length meaning that the next iteration will always start at
+    // index 0.
+
+#pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+#pragma unroll
+      for (int k = 0; k < b_sh_wr_iters; k++) {
+        fetch_to_registers(k + 1, pipe % stages);
+        fetch_scales_to_registers(k + 1, pipe);
+        fetch_zp_to_registers(k + 1, pipe);
+        if (k == b_sh_wr_iters - 2) {
+          fetch_to_shared((pipe + stages - 1) % stages, pipe, slice_iters >= stages);
+          pipe++;
+          wait_for_stage();
+          init_same_group(pipe % stages);
+        }
+        matmul(k);
+      }
+      slice_iters--;
+      if (slice_iters == 0) {
+        break;
+      }
+    }
+
+    a_gl_rd += a_gl_rd_delta_o * stages;
+
+    if constexpr (has_act_order) {
+      slice_k_start += tb_k * stages;
+
+      if (slice_k_start < prob_k) {
+        slice_k_start_shared_fetch += tb_k * stages;
+        int first_group_id = g_idx[slice_k_start];
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        int last_group_id = g_idx[last_g_idx];
+        if (last_group_id >= sh_first_group_id + sh_num_groups) {
+          fetch_act_order_scales_to_shared(false, first_group_id, last_group_id);
+          __syncthreads();
+        }
+      }
+    }
+
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to noticeably worse performance after compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
+      if constexpr (!has_act_order && group_blocks == -1 && (has_zp && dequant_skip_flop || !has_zp)) {
+        if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
+          if (s_sh_wr_pred) {
+            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+          }
+          cp_async_fence();
+        }
+      }
+
+      thread_block_reduce();
+      if constexpr (!has_act_order && group_blocks == -1 && (has_zp && dequant_skip_flop || !has_zp)) {
+        if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
+          cp_async_wait<0>();
+          __syncthreads();
+          if (threadIdx.x / 32 < thread_n_blocks / 4) {
+            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+            if constexpr (m_block_size_8) {
+              int idx = (threadIdx.x / 4) % 2;
+              scalar_t2* frag_s_half2 = reinterpret_cast<scalar_t2*>(frag_s);
+#pragma unroll
+              for (int i = 0; i < 8; i++) {
+                frag_s_half2[i] = Dtype::num2num2(reinterpret_cast<scalar_t*>(&frag_s_half2[i])[idx]);
+              }
+            }
+          }
+        }
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if constexpr (
+          !has_act_order && group_blocks == -1 && w_type.size_bits() == 8 && (has_zp && dequant_skip_flop || !has_zp)) {
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+#pragma unroll
+          for (int i = 0; i < thread_m_blocks; i++) {
+#pragma unroll
+            for (int j = 0; j < 4; j++) {
+              scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][0][0]), frag_s[j / 2][2 * (j % 2) + 0]);
+              scale_float<scalar_t>(
+                  reinterpret_cast<float*>(&frag_c[i][j][0][2]), frag_s[j / 2][2 * (j % 2) + (m_block_size_8 ? 1 : 0)]);
+
+              if constexpr (!m_block_size_8) {
+                scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][1][0]), frag_s[j / 2][2 * (j % 2) + 1]);
+                scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][1][2]), frag_s[j / 2][2 * (j % 2) + 1]);
+              }
+            }
+          }
+        }
+      }
+
+      if (slice_count > 1 && !use_atomic_add) {
+        // only globally reduce if there is more than one block in a slice
+        barrier_acquire(&locks[locks_off], slice_idx);
+        if (use_fp32_reduce) {
+          global_reduce_fp32(slice_idx == 0, last);
+        } else {
+          global_reduce_fp16(slice_idx == 0, last);
+        }
+        barrier_release(&locks[locks_off], last);
+      }
+      if (use_atomic_add && slice_count > 1 && slice_idx != 0) wait_negative_and_add(&locks[locks_off]);
+      if (last || use_atomic_add)
+        // only the last block in a slice actually writes the result
+        write_result();
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      is_first_matmul_in_slice = true;
+      init_slice();
+
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o);
+#pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+#pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++)
+            B_ptr[i] -= b_gl_stride;
+        }
+
+        // Update slice k/n for scales loading
+        if constexpr (has_act_order) {
+          slice_k_start = tb_k * slice_row;
+          slice_k_finish = slice_k_start + tb_k * slice_iters;
+          slice_k_start_shared_fetch = slice_k_start;
+          slice_n_offset = act_s_col_tb_stride * slice_col;
+
+        } else {
+          s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+          zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
+        }
+
+        start_pipes();
+      }
+    }
+  }
+}
+
+}  // namespace MARLIN_NAMESPACE_NAME
+
+#endif
diff --git a/sgl-kernel/csrc/gemm/math.hpp b/sgl-kernel/csrc/gemm/math.hpp
new file mode 100644
index 000000000..6764e1fd6
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/math.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <climits>
+#include <iostream>
+
+inline constexpr uint32_t next_pow_2(uint32_t const num) {
+  if (num <= 1) return num;
+  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
+
+template <typename A, typename B>
+static inline constexpr auto div_ceil(A a, B b) {
+  return (a + b - 1) / b;
+}
+
+// Round a down to the next multiple of b. The caller is responsible for making
+// sure that b is non-zero
+template <typename T>
+inline constexpr T round_to_previous_multiple_of(T a, T b) {
+  return a % b == 0 ? a : (a / b) * b;
+}
+
+// Round a up to the next multiple of b. The caller is responsible for making
+// sure that b is non-zero
+template <typename T>
+inline constexpr T round_to_next_multiple_of(T a, T b) {
+  return a % b == 0 ? a : ((a / b) + 1) * b;
+}
diff --git a/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
new file mode 100644
index 000000000..e18f2057b
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/nvfp4_expert_quant.cu
@@ -0,0 +1,728 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <torch/all.h>
+
+#include "nvfp4_quant.cuh"
+#include "utils.h"
+
+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(vec.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    // Extract the 8 exponent bits from float32.
+    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
+    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
+    fp8SFVal = tmp & 0xff;
+    // Convert back to fp32.
+    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
+    // Convert back to fp32.
+    SFValue = float(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal)) : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+#else
+  return 0;
+#endif
+}
+
+__device__ __forceinline__ float silu(const float& val) {
+  return val / (1.0f + __expf(-val));
+}
+
+template <class Type>
+inline __device__ void silu_and_mul(PackedVec<Type>& x_vec, const PackedVec<Type>& y_vec) {
+  float2 x[CVT_FP4_ELTS_PER_THREAD / 2];
+  float2 y[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      x[i] = __half22float2(x_vec.elts[i]);
+      y[i] = __half22float2(y_vec.elts[i]);
+      x[i].x = silu(x[i].x) * y[i].x;
+      x[i].y = silu(x[i].y) * y[i].y;
+      x_vec.elts[i] = __float22half2_rn(x[i]);
+    } else {
+      x[i] = __bfloat1622float2(x_vec.elts[i]);
+      y[i] = __bfloat1622float2(y_vec.elts[i]);
+      x[i].x = silu(x[i].x) * y[i].x;
+      x[i].y = silu(x[i].y) * y[i].y;
+      x_vec.elts[i] = __float22bfloat162_rn(x[i]);
+    }
+  }
+}
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
+__global__ void
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__launch_bounds__(512, 4) cvt_fp16_to_fp4(
+#else
+cvt_fp16_to_fp4(
+#endif
+    int32_t numRows,
+    int32_t numCols,
+    Type const* in,
+    float const* SFScale,
+    uint32_t* out,
+    uint32_t* SFout,
+    uint32_t* input_offset_by_experts,
+    uint32_t* output_scale_offset_by_experts,
+    int32_t* mask,
+    int n_experts,
+    bool low_latency) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched.");
+
+  // Input tensor row/col loops.
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
+  // TODO(kaixih@nvidia): For now, we assume mask is used together with
+  // silu_and_mal. Maybe we want a more general behavior of mask later. In the
+  // silu case, the input last dim doubles.
+  bool use_mask = mask != nullptr;
+  int actualColsPerRow = use_mask ? colsPerRow * 2 : colsPerRow;
+
+  // Each global thread processes one element
+  for (int globalIdx = tid; globalIdx < numRows * colsPerRow; globalIdx += gridDim.x * blockDim.x) {
+    // Calculate which row and column this global thread should process
+    int rowIdx = globalIdx / colsPerRow;
+    int colIdx = globalIdx % colsPerRow;
+
+    // Find index within the experts using different strategies based on expert
+    // count
+    int rowIdx_in_expert = 0;
+    int expert_idx = 0;
+
+    if constexpr (SMALL_NUM_EXPERTS) {
+      for (int i = 0; i < n_experts; i++) {
+        uint32_t current_offset = __ldca(&input_offset_by_experts[i]);
+        uint32_t next_offset = __ldca(&input_offset_by_experts[i + 1]);
+        if (rowIdx >= current_offset && rowIdx < next_offset) {
+          rowIdx_in_expert = rowIdx - current_offset;
+          expert_idx = i;
+          break;
+        }
+      }
+    } else {
+      // Load input offsets into registers first, then do the computation.
+      // Local array size set to 17 because of register limit.
+      uint32_t local_offsets[17];
+      for (int chunk_start = 0; chunk_start < n_experts; chunk_start += 16) {
+        *reinterpret_cast<int4*>(local_offsets) =
+            __ldca(reinterpret_cast<const int4*>(&input_offset_by_experts[chunk_start]));
+        *reinterpret_cast<int4*>(local_offsets + 4) =
+            __ldca(reinterpret_cast<const int4*>(&input_offset_by_experts[chunk_start + 4]));
+        *reinterpret_cast<int4*>(local_offsets + 8) =
+            __ldca(reinterpret_cast<const int4*>(&input_offset_by_experts[chunk_start + 8]));
+        *reinterpret_cast<int4*>(local_offsets + 12) =
+            __ldca(reinterpret_cast<const int4*>(&input_offset_by_experts[chunk_start + 12]));
+        local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]);
+
+// Check against the 16 loaded offsets
+#pragma unroll
+        for (int i = 0; i < 16; i++) {
+          if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) {
+            rowIdx_in_expert = rowIdx - local_offsets[i];
+            expert_idx = chunk_start + i;
+            break;
+          }
+        }
+      }
+    }
+
+    // Early exit when using masks.
+    if (use_mask && rowIdx_in_expert >= mask[expert_idx]) {
+      continue;
+    }
+
+    int64_t inOffset = rowIdx * actualColsPerRow + colIdx;
+    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+    if (use_mask) {
+      PackedVec in_vec_mul = reinterpret_cast<PackedVec const*>(in)[inOffset + colsPerRow];
+      silu_and_mul(in_vec, in_vec_mul);
+    }
+
+    // Get the output tensor offset.
+    // Same as inOffset because 8 elements are packed into one uint32_t.
+    int64_t outOffset = rowIdx * colsPerRow + colIdx;
+    auto& out_pos = out[outOffset];
+
+    // Get the global scaling factor, which will be applied to the SF.
+    // Note SFScale is the same as next GEMM's alpha, which is
+    // (448.f / (Alpha_A / 6.f)).
+    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
+
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    // The actual output_scales dim is computed from the padded numCols.
+    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
+    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
+    uint32_t* SFout_in_expert = SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+
+    auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF>(
+        rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+
+    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+  }
+#endif
+}
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false>
+__global__ void
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__launch_bounds__(512, 4) cvt_fp16_to_fp4_expert(
+#else
+cvt_fp16_to_fp4_expert(
+#endif
+    int32_t numRows,
+    int32_t numCols,
+    Type const* in,
+    float const* SFScale,
+    uint32_t* out,
+    uint32_t* SFout,
+    int32_t* mask,
+    bool use_silu_and_mul,
+    int n_experts) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched.");
+
+  // Input tensor row/col loops.
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = (gridDim.x * blockDim.x) / n_experts;
+  int remainder = (gridDim.x * blockDim.x) % n_experts;
+  int expert_idx;
+  int tid_in_expert;
+  int actual_stride;
+  if (remainder > 0) {
+    int bound = remainder * (stride + 1);
+    if (tid < bound) {
+      expert_idx = tid / (stride + 1);
+      tid_in_expert = tid % (stride + 1);
+      actual_stride = stride + 1;
+    } else {
+      expert_idx = remainder + (tid - bound) / stride;
+      tid_in_expert = (tid - bound) % stride;
+      actual_stride = stride;
+    }
+  } else {
+    expert_idx = tid / stride;
+    tid_in_expert = tid % stride;
+    actual_stride = stride;
+  }
+  int m = numRows / n_experts;
+  int padded_m = (m + (128 - 1)) / 128 * 128;
+
+  int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
+  // TODO(kaixih@nvidia): For now, we assume mask is used together with
+  // silu_and_mal. Maybe we want a more general behavior of mask later. In the
+  // silu case, the input last dim doubles.
+  bool use_mask = mask != nullptr;
+  int actualColsPerRow = use_silu_and_mul ? colsPerRow * 2 : colsPerRow;
+
+  // Each global thread processes one element
+  for (int globalIdx = tid_in_expert + expert_idx * m * colsPerRow; globalIdx < (expert_idx + 1) * m * colsPerRow;
+       globalIdx += actual_stride) {
+    // Calculate which row and column this global thread should process
+    int rowIdx = globalIdx / colsPerRow;
+    int colIdx = globalIdx % colsPerRow;
+
+    // Find index within the experts
+    int rowIdx_in_expert = rowIdx - expert_idx * m;
+
+    // Early exit when using masks.
+    if (use_mask && rowIdx_in_expert >= mask[expert_idx]) {
+      break;
+    }
+
+    int64_t inOffset = rowIdx * actualColsPerRow + colIdx;
+    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+    if (use_silu_and_mul) {
+      PackedVec in_vec_mul = reinterpret_cast<PackedVec const*>(in)[inOffset + colsPerRow];
+      silu_and_mul(in_vec, in_vec_mul);
+    }
+
+    // Get the output tensor offset.
+    // Same as inOffset because 8 elements are packed into one uint32_t.
+    int64_t outOffset = rowIdx * colsPerRow + colIdx;
+    auto& out_pos = out[outOffset];
+
+    // Get the global scaling factor, which will be applied to the SF.
+    // Note SFScale is the same as next GEMM's alpha, which is
+    // (448.f / (Alpha_A / 6.f)).
+    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
+
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    // The actual output_scales dim is computed from the padded numCols.
+    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
+    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
+    uint32_t* SFout_in_expert = SFout + expert_idx * padded_m * numCols_SFout;
+
+    auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF>(
+        rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+
+    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+  }
+#endif
+}
+
+// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
+template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
+__global__ void
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__launch_bounds__(1024, 4) cvt_fp16_to_fp4(
+#else
+cvt_fp16_to_fp4(
+#endif
+    int32_t numRows,
+    int32_t numCols,
+    Type const* in,
+    float const* SFScale,
+    uint32_t* out,
+    uint32_t* SFout,
+    uint32_t* input_offset_by_experts,
+    uint32_t* output_scale_offset_by_experts,
+    int32_t* mask,
+    int n_experts) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched.");
+  extern __shared__ uint32_t shared_input_offsets[];
+
+  // Load input offsets into shared memory.
+  // If n_experts is larger than 4, use vectorized int4 to save instructions.
+  // If n_experts is smaller than 4, read directly.
+  if constexpr (SMALL_NUM_EXPERTS) {
+    for (int i = threadIdx.x; i < n_experts + 1; i += blockDim.x) {
+      shared_input_offsets[i] = input_offset_by_experts[i];
+    }
+  } else {
+    for (int i = threadIdx.x * 4; i < n_experts; i += blockDim.x * 4) {
+      *reinterpret_cast<int4*>(&shared_input_offsets[i]) = *reinterpret_cast<const int4*>(&input_offset_by_experts[i]);
+    }
+    if (threadIdx.x == 0) {
+      shared_input_offsets[n_experts] = input_offset_by_experts[n_experts];
+    }
+  }
+
+  __syncthreads();
+
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD;
+  bool use_mask = mask != nullptr;
+  int actualColsPerRow = use_mask ? colsPerRow * 2 : colsPerRow;
+
+  // Each global thread processes one element
+  for (int globalIdx = tid; globalIdx < numRows * colsPerRow; globalIdx += gridDim.x * blockDim.x) {
+    // Calculate which row and column this global thread should process
+    int rowIdx = globalIdx / colsPerRow;
+    int colIdx = globalIdx % colsPerRow;
+
+    // Find expert using binary search for better performance with large m_topk
+    int rowIdx_in_expert = 0;
+    int expert_idx = 0;
+
+    // Binary search through experts using shared memory
+    int left = 0, right = n_experts - 1;
+    while (left <= right) {
+      int mid = (left + right) / 2;
+      // Get offsets: shared_input_offsets[i] corresponds to
+      // input_offset_by_experts[i]
+      uint32_t mid_offset = shared_input_offsets[mid];
+      uint32_t next_offset = shared_input_offsets[mid + 1];
+
+      if (rowIdx >= mid_offset && rowIdx < next_offset) {
+        rowIdx_in_expert = rowIdx - mid_offset;
+        expert_idx = mid;
+        break;
+      } else if (rowIdx < mid_offset) {
+        right = mid - 1;
+      } else {
+        left = mid + 1;
+      }
+    }
+
+    if (use_mask && rowIdx_in_expert >= mask[expert_idx]) {
+      continue;
+    }
+
+    int64_t inOffset = rowIdx * actualColsPerRow + colIdx;
+
+    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+    if (use_mask) {
+      PackedVec in_vec_mul = reinterpret_cast<PackedVec const*>(in)[inOffset + colsPerRow];
+      silu_and_mul(in_vec, in_vec_mul);
+    }
+
+    int64_t outOffset = rowIdx * colsPerRow + colIdx;
+    auto& out_pos = out[outOffset];
+
+    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx];
+
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    int32_t numCols_padded = (numCols + factor - 1) / factor * factor;
+    int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4;
+    uint32_t* SFout_in_expert = SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout;
+
+    auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF>(
+        rowIdx_in_expert, colIdx, numCols, SFout_in_expert);
+
+    out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+  }
+#endif
+}
+
+template <typename T>
+void quant_impl(
+    void* output,
+    void* output_scale,
+    void* input,
+    void* input_global_scale,
+    void* input_offset_by_experts,
+    void* output_scale_offset_by_experts,
+    void* mask,
+    bool use_silu_and_mul,
+    int m_topk,
+    int k,
+    int n_experts,
+    cudaStream_t stream) {
+  // TODO: this multiProcessorCount should be cached.
+  int device;
+  cudaGetDevice(&device);
+  int multiProcessorCount;
+  cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
+
+  // Grid, Block size.
+  // Each thread converts 8 values.
+  int const workSizePerRow = k / ELTS_PER_THREAD;
+  int const totalWorkSize = m_topk * workSizePerRow;
+  dim3 block(std::min(workSizePerRow, 512));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
+  dim3 grid(std::min(static_cast<int>((totalWorkSize + block.x - 1) / block.x), multiProcessorCount * numBlocksPerSM));
+  while (grid.x <= multiProcessorCount && block.x > 64) {
+    grid.x *= 2;
+    block.x = (block.x + 1) / 2;
+  }
+
+  // TODO(kaixih@nvidia): Should relax this to allow any grid size.
+  if (mask != nullptr) {
+    grid.x = (grid.x + n_experts - 1) / n_experts * n_experts;
+    cvt_fp16_to_fp4_expert<T, false><<<grid, block, 0, stream>>>(
+        m_topk,
+        k,
+        reinterpret_cast<T*>(input),
+        reinterpret_cast<float*>(input_global_scale),
+        reinterpret_cast<uint32_t*>(output),
+        reinterpret_cast<uint32_t*>(output_scale),
+        reinterpret_cast<int32_t*>(mask),
+        use_silu_and_mul,
+        n_experts);
+    return;
+  }
+
+  int const blockRepeat = (totalWorkSize + block.x * grid.x - 1) / (block.x * grid.x);
+  if (blockRepeat > 1) {
+    size_t shared_mem_size = (n_experts + 1) * sizeof(uint32_t);
+    if (n_experts >= 4) {
+      cvt_fp16_to_fp4<T, false, false><<<grid, block, shared_mem_size, stream>>>(
+          m_topk,
+          k,
+          reinterpret_cast<T*>(input),
+          reinterpret_cast<float*>(input_global_scale),
+          reinterpret_cast<uint32_t*>(output),
+          reinterpret_cast<uint32_t*>(output_scale),
+          reinterpret_cast<uint32_t*>(input_offset_by_experts),
+          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+          reinterpret_cast<int32_t*>(mask),
+          n_experts);
+    } else {
+      cvt_fp16_to_fp4<T, false, true><<<grid, block, shared_mem_size, stream>>>(
+          m_topk,
+          k,
+          reinterpret_cast<T*>(input),
+          reinterpret_cast<float*>(input_global_scale),
+          reinterpret_cast<uint32_t*>(output),
+          reinterpret_cast<uint32_t*>(output_scale),
+          reinterpret_cast<uint32_t*>(input_offset_by_experts),
+          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+          reinterpret_cast<int32_t*>(mask),
+          n_experts);
+    }
+  } else {
+    if (n_experts >= 16) {
+      cvt_fp16_to_fp4<T, false, false><<<grid, block, 0, stream>>>(
+          m_topk,
+          k,
+          reinterpret_cast<T*>(input),
+          reinterpret_cast<float*>(input_global_scale),
+          reinterpret_cast<uint32_t*>(output),
+          reinterpret_cast<uint32_t*>(output_scale),
+          reinterpret_cast<uint32_t*>(input_offset_by_experts),
+          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+          reinterpret_cast<int32_t*>(mask),
+          n_experts,
+          /* bool low_latency */ true);
+    } else {
+      cvt_fp16_to_fp4<T, false, true><<<grid, block, 0, stream>>>(
+          m_topk,
+          k,
+          reinterpret_cast<T*>(input),
+          reinterpret_cast<float*>(input_global_scale),
+          reinterpret_cast<uint32_t*>(output),
+          reinterpret_cast<uint32_t*>(output_scale),
+          reinterpret_cast<uint32_t*>(input_offset_by_experts),
+          reinterpret_cast<uint32_t*>(output_scale_offset_by_experts),
+          reinterpret_cast<int32_t*>(mask),
+          n_experts,
+          /* bool low_latency */ true);
+    }
+  }
+}
+
+// Avoid redefinition warnings
+#undef CHECK_CONTIGUOUS
+#undef CHECK_TH_CUDA
+#undef CHECK_INPUT
+
+/*Quantization entry for fp4 experts quantization*/
+#define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) TORCH_CHECK(x.is_contiguous(), m, "must be contiguous")
+#define CHECK_INPUT(x, m) \
+  CHECK_TH_CUDA(x, m);    \
+  CHECK_CONTIGUOUS(x, m);
+
+// constexpr auto FP8 = at::ScalarType::Float8_e4m3fn;
+constexpr auto HALF = at::ScalarType::Half;
+constexpr auto BF16 = at::ScalarType::BFloat16;
+constexpr auto FLOAT = at::ScalarType::Float;
+constexpr auto INT = at::ScalarType::Int;
+constexpr auto UINT8 = at::ScalarType::Byte;
+
+void scaled_fp4_experts_quant_sm100a(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+  auto sm_version = getSMVersion();
+  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+
+  CHECK_INPUT(output, "output must be a CUDA tensor");
+  CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
+  CHECK_INPUT(input, "input must be a CUDA tensor");
+  CHECK_INPUT(input_global_scale, "input_global_scale must be a CUDA tensor");
+  CHECK_INPUT(input_offset_by_experts, "input_offset_by_experts must be a CUDA tensor");
+  CHECK_INPUT(output_scale_offset_by_experts, "output_scale_offset_by_experts must be a CUDA tensor");
+
+  TORCH_CHECK(output.dim() == 2);
+  TORCH_CHECK(output_scale.dim() == 2);
+  TORCH_CHECK(input.dim() == 2);
+  TORCH_CHECK(input_global_scale.dim() == 1);
+  TORCH_CHECK(input_offset_by_experts.dim() == 1);
+  TORCH_CHECK(output_scale_offset_by_experts.dim() == 1);
+
+  TORCH_CHECK(input.scalar_type() == HALF || input.scalar_type() == BF16);
+  TORCH_CHECK(input_global_scale.scalar_type() == FLOAT);
+  TORCH_CHECK(input_offset_by_experts.scalar_type() == INT);
+  TORCH_CHECK(output_scale_offset_by_experts.scalar_type() == INT);
+  // output is uint8 (two nvfp4 values are packed into one uint8)
+  // output_scale is int32 (four fp8 values are packed into one int32)
+  TORCH_CHECK(output.scalar_type() == UINT8);
+  TORCH_CHECK(output_scale.scalar_type() == INT);
+
+  const int BLOCK_SIZE = 16;
+  auto m_topk = input.size(0);
+  auto k = input.size(1);
+  TORCH_CHECK(k % BLOCK_SIZE == 0, "k must be a multiple of 16");
+  auto n_experts = input_global_scale.size(0);
+  TORCH_CHECK(input_offset_by_experts.size(0) == n_experts + 1);
+  TORCH_CHECK(output_scale_offset_by_experts.size(0) == n_experts + 1);
+  TORCH_CHECK(output.size(0) == m_topk);
+  TORCH_CHECK(output.size(1) == k / 2);
+  int scales_k = k / BLOCK_SIZE;
+  // 4 means the swizzle requirement by nvidia nvfp4.
+  int padded_k = (scales_k + (4 - 1)) / 4 * 4;
+  // 4 means 4 fp8 values are packed into one int32
+  TORCH_CHECK(output_scale.size(1) * 4 == padded_k);
+
+  auto in_dtype = input.dtype();
+  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(input.get_device());
+  if (in_dtype == at::ScalarType::Half) {
+    quant_impl<half>(
+        output.data_ptr(),
+        output_scale.data_ptr(),
+        input.data_ptr(),
+        input_global_scale.data_ptr(),
+        input_offset_by_experts.data_ptr(),
+        output_scale_offset_by_experts.data_ptr(),
+        nullptr,  // mask
+        false,    // use_silu_and_mul
+        m_topk,
+        k,
+        n_experts,
+        stream);
+  } else if (in_dtype == at::ScalarType::BFloat16) {
+    quant_impl<__nv_bfloat16>(
+        output.data_ptr(),
+        output_scale.data_ptr(),
+        input.data_ptr(),
+        input_global_scale.data_ptr(),
+        input_offset_by_experts.data_ptr(),
+        output_scale_offset_by_experts.data_ptr(),
+        nullptr,  // mask
+        false,    // use_silu_and_mul
+        m_topk,
+        k,
+        n_experts,
+        stream);
+  } else {
+    TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
+  }
+}
+
+void silu_and_mul_scaled_fp4_experts_quant_sm100a(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& mask,
+    bool use_silu_and_mul) {
+  auto sm_version = getSMVersion();
+  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+
+  CHECK_INPUT(output, "output must be a CUDA tensor");
+  CHECK_INPUT(output_scale, "output_scale must be a CUDA tensor");
+  CHECK_INPUT(input, "input must be a CUDA tensor");
+  CHECK_INPUT(input_global_scale, "input_global_scale must be a CUDA tensor");
+  CHECK_INPUT(mask, "mask must be a CUDA tensor");
+
+  TORCH_CHECK(output.dim() == 2);
+  TORCH_CHECK(output_scale.dim() == 2);
+  TORCH_CHECK(input.dim() == 2);
+  TORCH_CHECK(input_global_scale.dim() == 1);
+
+  TORCH_CHECK(input.scalar_type() == HALF || input.scalar_type() == BF16);
+  TORCH_CHECK(input_global_scale.scalar_type() == FLOAT);
+  TORCH_CHECK(mask.scalar_type() == INT);
+  // output is uint8 (two nvfp4 values are packed into one uint8)
+  // output_scale is int32 (four fp8 values are packed into one int32)
+  TORCH_CHECK(output.scalar_type() == UINT8);
+  TORCH_CHECK(output_scale.scalar_type() == INT);
+
+  const int BLOCK_SIZE = 16;
+  auto m_topk = input.size(0);
+  auto k_by_2 = input.size(1);
+  auto k = k_by_2;
+  if (use_silu_and_mul) {
+    TORCH_CHECK(k_by_2 % 2 == 0, "k must be a multiple of 2");
+    k = k_by_2 / 2;
+  }
+  auto n_experts = input_global_scale.size(0);
+  TORCH_CHECK(mask.size(0) == n_experts);
+  TORCH_CHECK(output.size(0) == m_topk);
+  TORCH_CHECK(output.size(1) == k / 2);
+  int scales_k = k / BLOCK_SIZE;
+  // 4 means the swizzle requirement by nvidia nvfp4.
+  int padded_k = (scales_k + (4 - 1)) / 4 * 4;
+  // 4 means 4 fp8 values are packed into one int32
+  TORCH_CHECK(output_scale.size(1) * 4 == padded_k);
+
+  auto in_dtype = input.dtype();
+  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(input.get_device());
+  if (in_dtype == at::ScalarType::Half) {
+    quant_impl<half>(
+        output.data_ptr(),
+        output_scale.data_ptr(),
+        input.data_ptr(),
+        input_global_scale.data_ptr(),
+        nullptr,  // input_offset_by_experts
+        nullptr,  // output_scale_offset_by_experts
+        mask.data_ptr(),
+        use_silu_and_mul,
+        m_topk,
+        k,
+        n_experts,
+        stream);
+  } else if (in_dtype == at::ScalarType::BFloat16) {
+    quant_impl<__nv_bfloat16>(
+        output.data_ptr(),
+        output_scale.data_ptr(),
+        input.data_ptr(),
+        input_global_scale.data_ptr(),
+        nullptr,  // input_offset_by_experts
+        nullptr,  // output_scale_offset_by_experts
+        mask.data_ptr(),
+        use_silu_and_mul,
+        m_topk,
+        k,
+        n_experts,
+        stream);
+  } else {
+    TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
+  }
+}
diff --git a/sgl-kernel/csrc/gemm/nvfp4_quant.cuh b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
new file mode 100644
index 000000000..b2aa5f006
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant.cuh
@@ -0,0 +1,176 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cuda.h>
+#include <cuda_fp8.h>
+#include <cutlass/arch/config.h>
+
+// Get type2 from type or vice versa (applied to half and bfloat16)
+template <typename T>
+struct TypeConverter {
+  using Type = half2;
+};  // keep for generality
+
+template <>
+struct TypeConverter<half2> {
+  using Type = half;
+};
+
+template <>
+struct TypeConverter<half> {
+  using Type = half2;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat162> {
+  using Type = __nv_bfloat16;
+};
+
+template <>
+struct TypeConverter<__nv_bfloat16> {
+  using Type = __nv_bfloat162;
+};
+
+#define ELTS_PER_THREAD 8
+
+constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
+constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+
+// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
+  // PTX instructions used here requires sm100a/sm103a.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0]),
+        "f"(array[1]),
+        "f"(array[2]),
+        "f"(array[3]),
+        "f"(array[4]),
+        "f"(array[5]),
+        "f"(array[6]),
+        "f"(array[7]));
+  return val;
+#else
+  return 0;
+#endif
+}
+
+// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
+inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
+  // PTX instructions used here requires sm100a/sm103a.
+#if CUTLASS_ARCH_MMA_SM100A_ENABLED || CUTLASS_ARCH_MMA_SM103A_ENABLED
+  uint32_t val;
+  asm volatile(
+      "{\n"
+      ".reg .b8 byte0;\n"
+      ".reg .b8 byte1;\n"
+      ".reg .b8 byte2;\n"
+      ".reg .b8 byte3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte0, %2, %1;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte1, %4, %3;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte2, %6, %5;\n"
+      "cvt.rn.satfinite.e2m1x2.f32   byte3, %8, %7;\n"
+      "mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
+      "}"
+      : "=r"(val)
+      : "f"(array[0].x),
+        "f"(array[0].y),
+        "f"(array[1].x),
+        "f"(array[1].y),
+        "f"(array[2].x),
+        "f"(array[2].y),
+        "f"(array[3].x),
+        "f"(array[3].y));
+  return val;
+#else
+  return 0;
+#endif
+}
+
+// Fast reciprocal.
+inline __device__ float reciprocal_approximate_ftz(float a) {
+  float b;
+  asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
+  return b;
+}
+
+template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
+__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, int numCols, SFType* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2);
+
+  // One pair of threads write one SF to global memory.
+  // TODO: stage through smem for packed STG.32
+  // is it better than STG.8 from 4 threads ?
+  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
+    // SF vector index (16 elements share one SF in the K dimension).
+    int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+    int32_t mIdx = rowIdx;
+
+    // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
+    // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
+
+    int32_t mTileIdx = mIdx / (32 * 4);
+    // SF vector size 16.
+    int factor = CVT_FP4_SF_VEC_SIZE * 4;
+    int32_t numKTiles = (numCols + factor - 1) / factor;
+    int64_t mTileStride = numKTiles * 32 * 4 * 4;
+
+    int32_t kTileIdx = (kIdx / 4);
+    int64_t kTileStride = 32 * 4 * 4;
+
+    // M tile layout [32, 4] is column-major.
+    int32_t outerMIdx = (mIdx % 32);
+    int64_t outerMStride = 4 * 4;
+
+    int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
+    int64_t innerMStride = 4;
+
+    int32_t innerKIdx = (kIdx % 4);
+    int64_t innerKStride = 1;
+
+    // Compute the global offset.
+    int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + outerMIdx * outerMStride +
+                       innerMIdx * innerMStride + innerKIdx * innerKStride;
+
+    return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
+  }
+#endif
+  return nullptr;
+}
+
+// Define a 16 bytes packed data type.
+template <class Type>
+struct PackedVec {
+  typename TypeConverter<Type>::Type elts[4];
+};
+
+template <>
+struct PackedVec<__nv_fp8_e4m3> {
+  __nv_fp8x2_e4m3 elts[8];
+};
diff --git a/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu b/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
new file mode 100644
index 000000000..d960aa730
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_entry.cu
@@ -0,0 +1,74 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <torch/all.h>
+
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+void scaled_fp4_quant_sm100a(
+    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf);
+
+void scaled_fp4_experts_quant_sm100a(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts);
+
+void silu_and_mul_scaled_fp4_experts_quant_sm100a(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& mask,
+    bool use_silu_and_mul);
+
+#endif
+
+void scaled_fp4_quant(
+    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization");
+}
+
+void scaled_fp4_experts_quant(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return scaled_fp4_experts_quant_sm100a(
+      output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 experts quantization kernel");
+}
+
+void silu_and_mul_scaled_fp4_experts_quant(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& mask,
+    bool use_silu_and_mul) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return silu_and_mul_scaled_fp4_experts_quant_sm100a(
+      output, output_scale, input, input_global_scale, mask, use_silu_and_mul);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 experts quantization kernel");
+}
diff --git a/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
new file mode 100644
index 000000000..d307f5fb7
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
@@ -0,0 +1,239 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <torch/all.h>
+
+#include "nvfp4_quant.cuh"
+#include "utils.h"
+
+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF = false>
+__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(vec.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    __nv_fp8_e8m0 tmp;
+    tmp.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
+    SFValue = static_cast<float>(tmp);
+    fp8SFVal = tmp.__x;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    fp8SFVal = tmp.__x;
+    SFValue = static_cast<float>(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  //                       reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal)) : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+#else
+  return 0;
+#endif
+}
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF = false>
+__global__ void
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__launch_bounds__(512, 4) cvt_fp16_to_fp4(
+#else
+cvt_fp16_to_fp4(
+#endif
+    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched.");
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is
+  // (448.f / (Alpha_A / 6.f)).
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+
+  // Input tensor row/col loops.
+  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; colIdx += blockDim.x) {
+      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      // Get the output tensor offset.
+      // Same as inOffset because 8 elements are packed into one uint32_t.
+      int64_t outOffset = inOffset;
+      auto& out_pos = out[outOffset];
+
+      auto sf_out =
+          cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF>(rowIdx, colIdx, numCols, SFout);
+
+      out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+    }
+  }
+#endif
+}
+
+template <typename T>
+void invokeFP4Quantization(
+    int m,
+    int n,
+    T const* input,
+    float const* SFScale,
+    int64_t* output,
+    int32_t* SFOuput,
+    bool useUE8M0,
+    int multiProcessorCount,
+    cudaStream_t stream) {
+  // Grid, Block size.
+  // Each thread converts 8 values.
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
+  // Launch the cvt kernel.
+  if (useUE8M0) {
+    cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput));
+  } else {
+    cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput));
+  }
+}
+
+// Instantiate the function.
+template void invokeFP4Quantization(
+    int m,
+    int n,
+    half const* input,
+    float const* SFScale,
+    int64_t* output,
+    int32_t* SFOuput,
+    bool useUE8M0,
+    int multiProcessorCount,
+    cudaStream_t stream);
+
+template void invokeFP4Quantization(
+    int m,
+    int n,
+    __nv_bfloat16 const* input,
+    float const* SFScale,
+    int64_t* output,
+    int32_t* SFOuput,
+    bool useUE8M0,
+    int multiProcessorCount,
+    cudaStream_t stream);
+
+inline int getMultiProcessorCount() {
+  static int multi_processor_count = []() {
+    int device_id = 0;
+    int count = 0;
+
+    // Get the current CUDA device ID
+    CHECK_CUDA_SUCCESS(cudaGetDevice(&device_id));
+
+    // Get the number of multiprocessors for the current device
+    CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, device_id));
+
+    return count;  // Initialize the static variable
+  }();
+
+  return multi_processor_count;  // Return the cached value on subsequent calls
+}
+
+void scaled_fp4_quant_sm100a(
+    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) {
+  auto sm_version = getSMVersion();
+  TORCH_CHECK(sm_version == 100 || sm_version == 103, "fp4_quant is only supported on sm100a/sm103a");
+
+  int32_t m = input.size(0);
+  int32_t n = input.size(1);
+
+  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+
+  int multiProcessorCount = getMultiProcessorCount();
+
+  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
+  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
+  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
+  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(input.get_device());
+
+  // We don't support e8m0 scales at this moment.
+  bool useUE8M0 = false;
+
+  switch (input.scalar_type()) {
+    case torch::kHalf: {
+      auto input_ptr = reinterpret_cast<half const*>(input.data_ptr());
+      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, useUE8M0, multiProcessorCount, stream);
+      break;
+    }
+    case torch::kBFloat16: {
+      auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr());
+      invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, useUE8M0, multiProcessorCount, stream);
+      break;
+    }
+    default: {
+      std::cerr << "Observing: " << input.scalar_type() << " for the input datatype which is invalid";
+      throw std::runtime_error("Unsupported input data type for quantize_to_fp4.");
+    }
+  }
+}
diff --git a/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_entry.cu b/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_entry.cu
new file mode 100644
index 000000000..8bcd8c52b
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_entry.cu
@@ -0,0 +1,39 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <torch/all.h>
+
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+void cutlass_scaled_fp4_mm_sm100a(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha);
+#endif
+
+void cutlass_scaled_fp4_mm(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel.");
+}
diff --git a/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_kernels.cu b/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_kernels.cu
new file mode 100644
index 000000000..a103545dd
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/nvfp4_scaled_mm_kernels.cu
@@ -0,0 +1,453 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+// clang-format off
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/util/packed_stride.hpp"
+// clang-format on
+
+/**
+ * Helper function for checking CUTLASS errors
+ */
+#define CUTLASS_CHECK(status)                                                       \
+  {                                                                                 \
+    cutlass::Status error = status;                                                 \
+    TORCH_CHECK(error == cutlass::Status::kSuccess, cutlassGetStatusString(error)); \
+  }
+
+using namespace cute;
+
+#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+// Config(half_t/bfloat16_t) for M <= 128
+template <typename T>
+struct KernelConfigM128 {
+  using OutputType = T;
+  using MmaTileShape = Shape<_128, _256, _256>;
+  using ClusterShape = Shape<int, int, _1>;
+  using EpilogueTile = Shape<_128, _64>;  // Avoid register spilling
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
+  using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100;
+  const static dim3 preferred_cluster;
+  const static dim3 fallback_cluster;
+};
+template <typename T>
+const dim3 KernelConfigM128<T>::preferred_cluster(1, 4, 1);
+template <typename T>
+const dim3 KernelConfigM128<T>::fallback_cluster(1, 2, 1);
+
+// Config(half_t/bfloat16_t) for M <= 256
+template <typename T>
+struct KernelConfigM256 {
+  using OutputType = T;
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<int, int, _1>;
+  using EpilogueTile = Shape<_128, _64>;  // Avoid register spilling
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm;
+  using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100;
+  const static dim3 preferred_cluster;
+  const static dim3 fallback_cluster;
+};
+template <typename T>
+const dim3 KernelConfigM256<T>::preferred_cluster(2, 4, 1);
+template <typename T>
+const dim3 KernelConfigM256<T>::fallback_cluster(2, 1, 1);
+
+// Default config(half_t/bfloat16_t) for M > 256
+template <typename T>
+struct KernelConfigDefault {
+  using OutputType = T;
+  using MmaTileShape = Shape<_256, _256, _256>;
+  using ClusterShape = Shape<int, int, _1>;
+  using EpilogueTile = Shape<_128, _64>;  // Avoid register spilling
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm;
+  using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100;
+  const static dim3 preferred_cluster;
+  const static dim3 fallback_cluster;
+};
+template <typename T>
+const dim3 KernelConfigDefault<T>::preferred_cluster(4, 4, 1);
+template <typename T>
+const dim3 KernelConfigDefault<T>::fallback_cluster(2, 1, 1);
+
+struct KernelConfigFp32 {
+  using OutputType = float;
+  using MmaTileShape = Shape<_128, _128, _256>;
+  using ClusterShape = Shape<int, int, _1>;
+  using EpilogueTile = cutlass::epilogue::collective::EpilogueTileAuto;
+  using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm;
+  using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100;
+  const static dim3 preferred_cluster;
+  const static dim3 fallback_cluster;
+};
+const dim3 KernelConfigFp32::preferred_cluster = dim3(1, 4, 1);
+const dim3 KernelConfigFp32::fallback_cluster = dim3(1, 2, 1);
+
+template <typename KernelConfig>
+struct Fp4GemmSm100 {
+  using Config = KernelConfig;  // For generating args
+  using OutputType = typename KernelConfig::OutputType;
+  // A matrix configuration
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutATag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentA = 32;
+
+  // B matrix configuration
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using LayoutBTag = cutlass::layout::ColumnMajor;
+  static constexpr int AlignmentB = 32;
+
+  // C/D matrix configuration
+  using ElementD = OutputType;
+  using ElementC = OutputType;
+  using LayoutCTag = cutlass::layout::RowMajor;
+  using LayoutDTag = cutlass::layout::RowMajor;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+  // Kernel functional config
+  using ElementAccumulator = float;
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
+
+  // Kernel Perf config
+  using MmaTileShape = typename KernelConfig::MmaTileShape;
+  using ClusterShape = typename KernelConfig::ClusterShape;
+  using EpilogueTile = typename KernelConfig::EpilogueTile;
+  using EpilogueSchedule = typename KernelConfig::EpilogueSchedule;
+  using MainloopSchedule = typename KernelConfig::MainloopSchedule;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      MmaTileShape,
+      ClusterShape,
+      EpilogueTile,
+      ElementAccumulator,
+      ElementAccumulator,
+      void,
+      LayoutCTag,
+      AlignmentC,
+      ElementD,
+      LayoutDTag,
+      AlignmentD,
+      EpilogueSchedule,
+      cutlass::epilogue::fusion::LinearCombination<ElementD, float, void, float>>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      LayoutATag,
+      AlignmentA,
+      ElementB,
+      LayoutBTag,
+      AlignmentB,
+      ElementAccumulator,
+      MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      MainloopSchedule>::CollectiveOp;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmUniversal<Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using StrideA = typename Gemm::GemmKernel::StrideA;
+  using LayoutA = decltype(cute::make_layout(make_shape(0, 0, 0), StrideA{}));
+  using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;
+  using StrideB = typename Gemm::GemmKernel::StrideB;
+  using LayoutB = decltype(cute::make_layout(make_shape(0, 0, 0), StrideB{}));
+  using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;
+  using StrideC = typename Gemm::GemmKernel::StrideC;
+  using LayoutC = decltype(cute::make_layout(make_shape(0, 0, 0), StrideC{}));
+  using StrideD = typename Gemm::GemmKernel::StrideD;
+  using LayoutD = decltype(cute::make_layout(make_shape(0, 0, 0), StrideD{}));
+};
+
+template <typename T>
+typename T::Gemm::Arguments args_from_options(
+    at::Tensor& D,
+    at::Tensor const& A,
+    at::Tensor const& B,
+    at::Tensor const& A_sf,
+    at::Tensor const& B_sf,
+    at::Tensor const& alpha,
+    int64_t M,
+    int64_t N,
+    int64_t K) {
+  using ElementA = typename T::Gemm::ElementA;
+  using ElementB = typename T::Gemm::ElementB;
+  using ElementSFA = cutlass::float_ue4m3_t;
+  using ElementSFB = cutlass::float_ue4m3_t;
+  using ElementD = typename T::Gemm::ElementD;
+  using ElementCompute = float;
+  using StrideA = typename T::StrideA;
+  using StrideB = typename T::StrideB;
+  using StrideD = typename T::StrideD;
+  using Sm1xxBlkScaledConfig = typename T::Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+  int m = static_cast<int>(M);
+  int n = static_cast<int>(N);
+  int k = static_cast<int>(K);
+  auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {m, k, 1});
+  auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, 1});
+  auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {m, n, 1});
+
+  auto layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
+  auto layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
+
+  typename T::Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm,
+      {m, n, k, 1},
+      {// Mainloop arguments
+       static_cast<ElementA const*>(A.data_ptr()),
+       stride_A,
+       static_cast<ElementB const*>(B.data_ptr()),
+       stride_B,
+       static_cast<ElementSFA const*>(A_sf.data_ptr()),
+       layout_SFA,
+       static_cast<ElementSFB const*>(B_sf.data_ptr()),
+       layout_SFB},
+      {     // Epilogue arguments
+       {},  // epilogue.thread
+       nullptr,
+       stride_D,
+       static_cast<ElementD*>(D.data_ptr()),
+       stride_D}};
+  auto& fusion_args = arguments.epilogue.thread;
+  fusion_args.alpha_ptr = static_cast<ElementCompute const*>(alpha.data_ptr());
+  using KernelConfig = typename T::Config;
+  arguments.hw_info.cluster_shape = KernelConfig::preferred_cluster;
+  arguments.hw_info.cluster_shape_fallback = KernelConfig::fallback_cluster;
+  return arguments;
+}
+
+template <typename T>
+void runGemm(
+    at::Tensor& D,
+    at::Tensor const& A,
+    at::Tensor const& B,
+    at::Tensor const& A_sf,
+    at::Tensor const& B_sf,
+    at::Tensor const& alpha,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    cudaStream_t stream) {
+  typename T::Gemm gemm;
+  auto arguments = args_from_options<T>(D, A, B, A_sf, B_sf, alpha, m, n, k);
+
+  size_t workspace_size = T::Gemm::get_workspace_size(arguments);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(A.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  CUTLASS_CHECK(gemm.can_implement(arguments));
+
+  CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
+
+  CUTLASS_CHECK(gemm.run(arguments, workspace.data_ptr(), stream));
+}
+
+// Dispatch function to select appropriate config based on M
+template <typename OutType>
+void cutlassFp4GemmDispatch(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    cudaStream_t stream) {
+  if (m <= 128) {
+    // m in [1, 128]
+    runGemm<Fp4GemmSm100<KernelConfigM128<OutType>>>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (m <= 256) {
+    // m in (128, 256]
+    runGemm<Fp4GemmSm100<KernelConfigM256<OutType>>>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    // m in (256, inf)
+    runGemm<Fp4GemmSm100<KernelConfigDefault<OutType>>>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  }
+}
+
+// Dispatch function to select appropriate config based on M
+template <>
+void cutlassFp4GemmDispatch<float>(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    cudaStream_t stream) {
+  runGemm<Fp4GemmSm100<KernelConfigFp32>>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+}
+
+#else
+template <typename T>
+void cutlassFp4GemmDispatch(
+    at::Tensor& D,
+    at::Tensor const& A,
+    at::Tensor const& B,
+    at::Tensor const& A_sf,
+    at::Tensor const& B_sf,
+    at::Tensor const& alpha,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    cudaStream_t stream) {
+  TORCH_CHECK(
+      false,
+      "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
+      "a CUTLASS 3.8 source directory to enable support.");
+}
+#endif  // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+
+#define CHECK_TYPE(x, st, m) TORCH_CHECK(x.scalar_type() == st, "Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x, m) TORCH_CHECK(x.is_contiguous(), m, "must be contiguous")
+#define CHECK_INPUT(x, st, m) \
+  CHECK_TH_CUDA(x, m);        \
+  CHECK_CONTIGUOUS(x, m);     \
+  CHECK_TYPE(x, st, m)
+
+constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+
+void cutlass_scaled_fp4_mm_sm100a(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha) {
+  CHECK_INPUT(A, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(B, FLOAT4_E2M1X2, "b");
+
+  CHECK_INPUT(A_sf, SF_DTYPE, "scale_a");
+  CHECK_INPUT(B_sf, SF_DTYPE, "scale_b");
+
+  CHECK_INPUT(alpha, at::ScalarType::Float, "alpha");
+
+  TORCH_CHECK(A.dim() == 2, "a must be a matrix");
+  TORCH_CHECK(B.dim() == 2, "b must be a matrix");
+  TORCH_CHECK(
+      A.size(1) == B.size(1),
+      "a and b shapes cannot be multiplied (",
+      A.size(0),
+      "x",
+      A.size(1),
+      " and ",
+      B.size(0),
+      "x",
+      B.size(1),
+      ")");
+
+  auto const m = A.size(0);
+  auto const n = B.size(0);
+  auto const k = A.size(1) * 2;
+
+  constexpr int alignment = 32;
+  TORCH_CHECK(
+      k % alignment == 0,
+      "Expected k to be divisible by ",
+      alignment,
+      ", but got a shape: (",
+      A.size(0),
+      "x",
+      A.size(1),
+      "), k: ",
+      k,
+      ".");
+  TORCH_CHECK(
+      n % alignment == 0,
+      "Expected n to be divisible by ",
+      alignment,
+      ", but got b shape: (",
+      B.size(0),
+      "x",
+      B.size(1),
+      ").");
+
+  auto round_up = [](int x, int y) { return (x + y - 1) / y * y; };
+  int rounded_m = round_up(m, 128);
+  int rounded_n = round_up(n, 128);
+  // Since k is divisible by 32 (alignment), k / 16 is guaranteed to be an
+  // integer.
+  int rounded_k = round_up(k / 16, 4);
+
+  TORCH_CHECK(A_sf.dim() == 2, "scale_a must be a matrix");
+  TORCH_CHECK(B_sf.dim() == 2, "scale_b must be a matrix");
+  TORCH_CHECK(
+      A_sf.size(1) == B_sf.size(1),
+      "scale_a and scale_b shapes cannot be multiplied (",
+      A_sf.size(0),
+      "x",
+      A_sf.size(1),
+      " and ",
+      B_sf.size(0),
+      "x",
+      B_sf.size(1),
+      ")");
+  TORCH_CHECK(
+      A_sf.size(0) == rounded_m && A_sf.size(1) == rounded_k,
+      "scale_a must be padded and swizzled to a shape (",
+      rounded_m,
+      "x",
+      rounded_k,
+      "), but got a shape (",
+      A_sf.size(0),
+      "x",
+      A_sf.size(1),
+      ")");
+  TORCH_CHECK(
+      B_sf.size(0) == rounded_n && B_sf.size(1) == rounded_k,
+      "scale_b must be padded and swizzled to a shape (",
+      rounded_n,
+      "x",
+      rounded_k,
+      "), but got a shape (",
+      B_sf.size(0),
+      "x",
+      B_sf.size(1),
+      ")");
+
+  auto out_dtype = D.dtype();
+  at::cuda::CUDAGuard device_guard{(char)A.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());
+
+  if (out_dtype == at::ScalarType::Half) {
+    cutlassFp4GemmDispatch<cutlass::half_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (out_dtype == at::ScalarType::BFloat16) {
+    cutlassFp4GemmDispatch<cutlass::bfloat16_t>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else if (out_dtype == at::ScalarType::Float) {
+    cutlassFp4GemmDispatch<float>(D, A, B, A_sf, B_sf, alpha, m, n, k, stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported output data type of nvfp4 mm");
+  }
+}
diff --git a/sgl-kernel/csrc/gemm/per_tensor_quant_fp8.cu b/sgl-kernel/csrc/gemm/per_tensor_quant_fp8.cu
new file mode 100644
index 000000000..7afff7794
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/per_tensor_quant_fp8.cu
@@ -0,0 +1,123 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/util/Float8_e4m3fn.h>
+
+#include <cmath>
+#include <cub/block/block_reduce.cuh>
+#include <flashinfer/vec_dtypes.cuh>
+
+#include "utils.h"
+
+template <typename T>
+__global__ void
+per_tensor_absmax_kernel(const T* __restrict__ input, float* __restrict__ output_s, const int64_t num_elements) {
+  float max_value = 0.0f;
+  unsigned int tid = threadIdx.x;
+  unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int grid_size = blockDim.x * gridDim.x;
+
+  constexpr uint32_t vec_size = 16 / sizeof(T);
+  using vec_t = flashinfer::vec_t<T, vec_size>;
+
+  const int32_t num_vec_elems = num_elements / vec_size;
+
+  for (int32_t i = gid; i < num_vec_elems; i += grid_size) {
+    vec_t input_vec;
+    input_vec.cast_load(input + i * vec_size);
+
+#pragma unroll
+    for (uint32_t j = 0; j < vec_size; ++j) {
+      float val = static_cast<float>(input_vec[j]);
+      max_value = fmaxf(max_value, fabsf(val));
+    }
+  }
+
+  const int32_t remaining_start = num_vec_elems * vec_size;
+  for (int32_t idx = remaining_start + gid; idx < num_elements; idx += grid_size) {
+    float val = static_cast<float>(input[idx]);
+    max_value = fmaxf(max_value, fabsf(val));
+  }
+
+  max_value = blockReduceMax(max_value);
+
+  if (tid == 0) {
+    atomicMaxFloat(output_s, max_value / FP8_E4M3_MAX);
+  }
+}
+
+template <typename T, typename DST_DTYPE>
+__global__ void per_tensor_quant_fp8_kernel(
+    const T* __restrict__ input,
+    DST_DTYPE* __restrict__ output,
+    const float* __restrict__ scale,
+    const int64_t num_elements) {
+  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int grid_size = blockDim.x * gridDim.x;
+  const float scale_val = 1.0f / (*scale);
+
+  // We want to store 128 bits of data at a time. 16 = 128 / 8 bits
+  // Load is already vectorized, so 16 elements work for T.
+  const uint32_t VEC_SIZE = 16;
+  using vec_t = flashinfer::vec_t<T, VEC_SIZE>;
+
+  const int32_t num_vec_elems = num_elements / VEC_SIZE;
+
+  for (int32_t i = gid; i < num_vec_elems; i += grid_size) {
+    vec_t input_vec;
+    input_vec.cast_load(input + i * VEC_SIZE);
+
+    DST_DTYPE output_arr[VEC_SIZE];
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      float val = fmax(fmin(static_cast<float>(input_vec[j]) * scale_val, FP8_E4M3_MAX), -FP8_E4M3_MAX);
+#if !defined(USE_ROCM) || defined(HIP_FP8_TYPE_E4M3)
+      output_arr[j] = static_cast<DST_DTYPE>(val);
+#else
+      output_arr[j] = c10::Float8_e4m3fnuz(
+          __hip_cvt_float_to_fp8(val, fp8::fp8_type::__default_saturation, fp8::fp8_type::__default_interpret),
+          c10::Float8_e4m3fnuz::from_bits());
+#endif
+    }
+    *(uint4*)(output + i * VEC_SIZE) = *(uint4*)output_arr;
+  }
+
+  const int32_t remaining_start = num_vec_elems * VEC_SIZE;
+  for (int32_t idx = remaining_start + gid; idx < num_elements; idx += grid_size) {
+    float val = fmax(-FP8_E4M3_MAX, fmin(static_cast<float>(input[idx]) * scale_val, FP8_E4M3_MAX));
+#if !defined(USE_ROCM) || defined(HIP_FP8_TYPE_E4M3)
+    output[idx] = static_cast<DST_DTYPE>(val);
+#else
+    output[idx] = c10::Float8_e4m3fnuz(
+        __hip_cvt_float_to_fp8(val, fp8::fp8_type::__default_saturation, fp8::fp8_type::__default_interpret),
+        c10::Float8_e4m3fnuz::from_bits());
+#endif
+  }
+}
+
+void sgl_per_tensor_quant_fp8(torch::Tensor input, torch::Tensor output_q, torch::Tensor output_s, bool is_static) {
+  CHECK_INPUT(input);
+  CHECK_INPUT(output_q);
+  CHECK_INPUT(output_s);
+
+  const int block_size = 256;
+  const int num_elements = input.numel();
+  const int num_blocks = min((num_elements + block_size - 1) / block_size, 1024);
+
+  dim3 grid(num_blocks);
+  dim3 block(block_size);
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), scalar_t, [&] {
+    if (is_static == false) {
+      per_tensor_absmax_kernel<scalar_t><<<grid, block, 0, stream>>>(
+          static_cast<scalar_t*>(input.data_ptr()), static_cast<float*>(output_s.data_ptr()), num_elements);
+    }
+
+    per_tensor_quant_fp8_kernel<scalar_t, __nv_fp8_e4m3><<<grid, block, 0, stream>>>(
+        static_cast<scalar_t*>(input.data_ptr()),
+        static_cast<__nv_fp8_e4m3*>(output_q.data_ptr()),
+        static_cast<float*>(output_s.data_ptr()),
+        num_elements);
+    return true;
+  });
+}
diff --git a/sgl-kernel/csrc/gemm/per_token_group_quant_8bit.cu b/sgl-kernel/csrc/gemm/per_token_group_quant_8bit.cu
new file mode 100644
index 000000000..82daaef19
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/per_token_group_quant_8bit.cu
@@ -0,0 +1,240 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp8.h>
+
+#include <cmath>
+#include <flashinfer/vec_dtypes.cuh>
+
+#include "utils.h"
+
+__device__ __forceinline__ float GroupReduceMax(float val, const int tid) {
+  unsigned mask = threadIdx.x % 32 >= 16 ? 0xffff0000 : 0x0000ffff;
+
+  val = fmaxf(val, __shfl_xor_sync(mask, val, 8));
+  val = fmaxf(val, __shfl_xor_sync(mask, val, 4));
+  val = fmaxf(val, __shfl_xor_sync(mask, val, 2));
+  val = fmaxf(val, __shfl_xor_sync(mask, val, 1));
+  return val;
+}
+
+template <
+    typename T,
+    typename DST_DTYPE,
+    bool IS_COLUMN_MAJOR = false,
+    bool SCALE_UE8M0 = false,
+    typename scale_packed_t = std::conditional_t<SCALE_UE8M0, uint32_t, float>>
+__global__ void per_token_group_quant_8bit_kernel(
+    const T* __restrict__ input,
+    void* __restrict__ output_q,
+    scale_packed_t* __restrict__ output_s,
+    const int group_size,
+    const int num_groups,
+    const int groups_per_block,
+    const float eps,
+    const float min_8bit,
+    const float max_8bit,
+    const int num_groups_per_row = 0,
+    const int scale_stride = 0) {
+  const int threads_per_group = 16;
+  const int64_t local_group_id = threadIdx.x / threads_per_group;
+  const int lane_id = threadIdx.x % threads_per_group;
+
+  const int64_t block_group_id = blockIdx.x * groups_per_block;
+  const int64_t global_group_id = block_group_id + local_group_id;
+  const int64_t block_group_offset = global_group_id * group_size;
+
+  float local_absmax = eps;
+
+  using scale_element_t = std::conditional_t<SCALE_UE8M0, uint8_t, float>;
+  static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0);
+
+  const T* group_input = input + block_group_offset;
+  DST_DTYPE* group_output = static_cast<DST_DTYPE*>(output_q) + block_group_offset;
+  scale_element_t* scale_output;
+
+  if constexpr (IS_COLUMN_MAJOR) {
+    const int num_elems_per_pack = static_cast<int>(sizeof(scale_packed_t) / sizeof(scale_element_t));
+    const int row_idx = global_group_id / num_groups_per_row;
+    const int col_idx_unpacked = global_group_id % num_groups_per_row;
+    const int col_idx = col_idx_unpacked / num_elems_per_pack;
+    const int pack_idx = col_idx_unpacked % num_elems_per_pack;
+    scale_output = reinterpret_cast<scale_element_t*>(output_s) +
+                   (col_idx * scale_stride * num_elems_per_pack + row_idx * num_elems_per_pack + pack_idx);
+  } else {
+    static_assert(!SCALE_UE8M0);
+    scale_output = output_s + global_group_id;
+  }
+
+  constexpr uint32_t vec_size = 16 / sizeof(T);
+  using vec_t = flashinfer::vec_t<T, vec_size>;
+
+  const int32_t num_vec_elems = group_size / vec_size;
+
+  for (int32_t i = lane_id; i < num_vec_elems; i += 16) {
+    vec_t input_vec;
+    input_vec.cast_load(group_input + i * vec_size);
+
+#pragma unroll
+    for (uint32_t j = 0; j < vec_size; ++j) {
+      float val = static_cast<float>(input_vec[j]);
+      float abs_val = fabsf(val);
+      local_absmax = fmaxf(local_absmax, abs_val);
+    }
+  }
+
+  local_absmax = GroupReduceMax(local_absmax, lane_id);
+
+  float y_s = local_absmax / max_8bit;
+  if constexpr (SCALE_UE8M0) {
+    y_s = exp2f(ceilf(log2f(fmaxf(y_s, 1e-10f))));
+  }
+
+  // TODO can optimize
+  scale_element_t y_s_quant;
+  if constexpr (SCALE_UE8M0) {
+    y_s_quant = (uint8_t)(((int)log2f(y_s)) + 127);
+  } else {
+    y_s_quant = y_s;
+  }
+
+  if (lane_id == 0) {
+    *scale_output = y_s_quant;
+  }
+
+  for (int32_t i = lane_id; i < num_vec_elems; i += 16) {
+    vec_t input_vec;
+    input_vec.cast_load(group_input + i * vec_size);
+
+#pragma unroll
+    for (uint32_t j = 0; j < vec_size; ++j) {
+      float val = static_cast<float>(input_vec[j]);
+      float q_val = fminf(fmaxf(val / y_s, min_8bit), max_8bit);
+      group_output[i * vec_size + j] = DST_DTYPE(q_val);
+    }
+  }
+}
+
+void sgl_per_token_group_quant_8bit(
+    torch::Tensor input,
+    torch::Tensor output_q,
+    torch::Tensor output_s,
+    int64_t group_size,
+    double eps,
+    double min_8bit,
+    double max_8bit,
+    bool scale_ue8m0 = false) {
+  CHECK_INPUT(input);
+  CHECK_INPUT(output_q);
+
+  const int num_groups = input.numel() / group_size;
+
+  CHECK_EQ(input.numel() % group_size, 0);
+  CHECK_EQ(output_s.dim(), 2);
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  constexpr int THREADS_PER_GROUP = 16;
+
+  int groups_per_block = 1;
+
+  if (num_groups % 16 == 0) {
+    groups_per_block = 16;
+  } else if (num_groups % 8 == 0) {
+    groups_per_block = 8;
+  } else if (num_groups % 4 == 0) {
+    groups_per_block = 4;
+  } else if (num_groups % 2 == 0) {
+    groups_per_block = 2;
+  }
+
+  auto dst_type = output_q.scalar_type();
+  const int num_blocks = num_groups / groups_per_block;
+  const int num_threads = groups_per_block * THREADS_PER_GROUP;
+
+  const bool is_column_major = output_s.stride(0) < output_s.stride(1);
+  const int hidden_dim = input.size(input.dim() - 1);
+  const int num_groups_per_row = hidden_dim / group_size;
+  const int scale_stride = output_s.stride(1);
+
+#define LAUNCH_KERNEL(T, DST_DTYPE)                                                               \
+  do {                                                                                            \
+    dim3 grid(num_blocks);                                                                        \
+    dim3 block(num_threads);                                                                      \
+    if (is_column_major) {                                                                        \
+      if (scale_ue8m0) {                                                                          \
+        per_token_group_quant_8bit_kernel<T, DST_DTYPE, true, true><<<grid, block, 0, stream>>>(  \
+            static_cast<T*>(input.data_ptr()),                                                    \
+            output_q.data_ptr(),                                                                  \
+            static_cast<uint32_t*>(output_s.data_ptr()),                                          \
+            group_size,                                                                           \
+            num_groups,                                                                           \
+            groups_per_block,                                                                     \
+            (float)eps,                                                                           \
+            (float)min_8bit,                                                                      \
+            (float)max_8bit,                                                                      \
+            num_groups_per_row,                                                                   \
+            scale_stride);                                                                        \
+      } else {                                                                                    \
+        per_token_group_quant_8bit_kernel<T, DST_DTYPE, true, false><<<grid, block, 0, stream>>>( \
+            static_cast<T*>(input.data_ptr()),                                                    \
+            output_q.data_ptr(),                                                                  \
+            static_cast<float*>(output_s.data_ptr()),                                             \
+            group_size,                                                                           \
+            num_groups,                                                                           \
+            groups_per_block,                                                                     \
+            (float)eps,                                                                           \
+            (float)min_8bit,                                                                      \
+            (float)max_8bit,                                                                      \
+            num_groups_per_row,                                                                   \
+            scale_stride);                                                                        \
+      }                                                                                           \
+    } else {                                                                                      \
+      assert(!scale_ue8m0);                                                                       \
+      per_token_group_quant_8bit_kernel<T, DST_DTYPE, false><<<grid, block, 0, stream>>>(         \
+          static_cast<T*>(input.data_ptr()),                                                      \
+          output_q.data_ptr(),                                                                    \
+          static_cast<float*>(output_s.data_ptr()),                                               \
+          group_size,                                                                             \
+          num_groups,                                                                             \
+          groups_per_block,                                                                       \
+          (float)eps,                                                                             \
+          (float)min_8bit,                                                                        \
+          (float)max_8bit);                                                                       \
+    }                                                                                             \
+  } while (0)
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), scalar_t, [&] {
+    if (dst_type == at::ScalarType::Char) {
+      LAUNCH_KERNEL(scalar_t, int8_t);
+      return true;
+    } else if (dst_type == at::ScalarType::Float8_e4m3fn) {
+      LAUNCH_KERNEL(scalar_t, __nv_fp8_e4m3);
+      return true;
+    }
+    return false;
+  });
+
+#undef LAUNCH_KERNEL
+}
+
+void sgl_per_token_group_quant_int8(
+    torch::Tensor input,
+    torch::Tensor output_q,
+    torch::Tensor output_s,
+    int64_t group_size,
+    double eps,
+    double int8_min,
+    double int8_max) {
+  sgl_per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, int8_min, int8_max);
+}
+
+void sgl_per_token_group_quant_fp8(
+    torch::Tensor input,
+    torch::Tensor output_q,
+    torch::Tensor output_s,
+    int64_t group_size,
+    double eps,
+    double fp8_min,
+    double fp8_max,
+    bool scale_ue8m0) {
+  sgl_per_token_group_quant_8bit(input, output_q, output_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0);
+}
diff --git a/sgl-kernel/csrc/gemm/per_token_quant_fp8.cu b/sgl-kernel/csrc/gemm/per_token_quant_fp8.cu
new file mode 100644
index 000000000..e73716c86
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/per_token_quant_fp8.cu
@@ -0,0 +1,227 @@
+#include <ATen/cuda/CUDAContext.h>
+
+#include <cmath>
+#include <flashinfer/vec_dtypes.cuh>
+
+#include "utils.h"
+
+static constexpr int kWarpSize = 32;
+
+// ---------------------------------------------------------------------------
+// 1. Warp‑local, no shared memory
+//    • One warp handles one token.
+//    • Eight tokens per 256‑thread CTA.
+// ---------------------------------------------------------------------------
+template <typename T, typename DST_DTYPE, int kTokensPerCTA = 8, int kVecSize = 16>
+__global__ void per_token_quant_fp8_kernel(
+    const T* __restrict__ input,
+    DST_DTYPE* __restrict__ output_q,
+    float* __restrict__ output_s,
+    const int64_t hidden_dim,
+    const int64_t num_tokens) {
+  const int warp_id = threadIdx.x / kWarpSize;        // 0‑7  (8 warps)
+  const int lane_id = threadIdx.x & (kWarpSize - 1);  // 0‑31
+  const int token_id = blockIdx.x * kTokensPerCTA + warp_id;
+  if (token_id >= num_tokens) return;
+
+  // Global tensors for this token
+  const T* token_input = input + token_id * hidden_dim;
+  DST_DTYPE* token_output = output_q + token_id * hidden_dim;
+  float* token_scale = output_s + token_id;
+
+  //
+  // Pass-1: Perform a warp reduce to find the max_value of a token's hidden_dim
+  //
+  float max_value = 0.f;
+  using vec_t = flashinfer::vec_t<T, kVecSize>;
+  const int32_t num_vec_elems = hidden_dim / kVecSize;
+
+  for (int32_t i = lane_id; i < num_vec_elems; i += kWarpSize) {
+    vec_t input_vec;
+    input_vec.cast_load(token_input + i * kVecSize);
+
+#pragma unroll
+    for (uint32_t j = 0; j < kVecSize; ++j) {
+      max_value = fmaxf(max_value, fabsf(static_cast<float>(input_vec[j])));
+    }
+  }
+
+  float warp_max = warpReduceMax(max_value);
+
+  __shared__ float scale;
+  scale = warp_max / FP8_E4M3_MAX;
+  // Broadcast scale
+  if (lane_id == 0) {
+    token_scale[0] = scale;
+  }
+  float scale_inv = (scale == 0.f) ? 0.f : 1.0f / scale;
+
+  //
+  // Pass-2: quantize and write back
+  //
+  for (int i = lane_id; i < num_vec_elems; i += kWarpSize) {
+    vec_t input_vec;
+    input_vec.cast_load(token_input + i * kVecSize);
+    DST_DTYPE output_arr[kVecSize];
+#pragma unroll
+    for (uint32_t j = 0; j < kVecSize; ++j) {
+      float val = static_cast<float>(input_vec[j]) * scale_inv;
+      val = fmaxf(fminf(val, FP8_E4M3_MAX), -FP8_E4M3_MAX);
+#if !defined(USE_ROCM) || defined(HIP_FP8_TYPE_E4M3)
+      output_arr[j] = static_cast<DST_DTYPE>(val);
+#else
+      output_arr[j] = c10::Float8_e4m3fnuz(
+          __hip_cvt_float_to_fp8(val, fp8::fp8_type::__default_saturation, fp8::fp8_type::__default_interpret),
+          c10::Float8_e4m3fnuz::from_bits());
+#endif
+    }
+    if constexpr (kVecSize == 16) {
+      *(uint4*)(token_output + i * kVecSize) = *(uint4*)output_arr;
+    } else {
+      // Use element-wise copy for vector size 8 to ensure correctness
+      for (int k = 0; k < kVecSize; ++k) {
+        token_output[i * kVecSize + k] = output_arr[k];
+      }
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// 2.  Baseline kernel (1 token / CTA, CUB block reduce)
+// ---------------------------------------------------------------------------
+template <typename T, typename DST_DTYPE, int kVecSize = 16>
+__global__ void per_token_quant_fp8_small_batch_kernel(
+    const T* __restrict__ input,
+    DST_DTYPE* __restrict__ output_q,
+    float* __restrict__ output_s,
+    const int64_t hidden_dim,
+    const int64_t num_tokens) {
+  const int token_idx = blockIdx.x;
+  if (token_idx >= num_tokens) return;
+
+  const int tid = threadIdx.x;
+  const int block_dim = blockDim.x;
+
+  const T* token_input = input + token_idx * hidden_dim;
+  DST_DTYPE* token_output = output_q + token_idx * hidden_dim;
+
+  float max_value = 0.0f;
+
+  // Use template parameter for vector size
+  using vec_t = flashinfer::vec_t<T, kVecSize>;
+  const int32_t num_vec_elems = hidden_dim / kVecSize;
+
+  // Find max using vectorized loads
+  for (int32_t i = tid; i < num_vec_elems; i += block_dim) {
+    vec_t input_vec;
+    input_vec.cast_load(token_input + i * kVecSize);
+
+#pragma unroll
+    for (uint32_t j = 0; j < kVecSize; ++j) {
+      float val = static_cast<float>(input_vec[j]);
+      max_value = fmaxf(max_value, fabsf(val));
+    }
+  }
+
+  max_value = blockReduceMax(max_value);
+
+  __shared__ float scale;
+  if (tid == 0) {
+    scale = max_value / FP8_E4M3_MAX;
+    output_s[token_idx] = scale;
+  }
+  __syncthreads();
+
+  const float scale_inv = 1.0f / scale;
+
+  // Quantize using vectorized loads
+  for (int32_t i = tid; i < num_vec_elems; i += block_dim) {
+    vec_t input_vec;
+    input_vec.cast_load(token_input + i * kVecSize);
+
+    DST_DTYPE output_arr[kVecSize];
+#pragma unroll
+    for (uint32_t j = 0; j < kVecSize; ++j) {
+      float val = fmaxf(fminf(static_cast<float>(input_vec[j]) * scale_inv, FP8_E4M3_MAX), -FP8_E4M3_MAX);
+#if !defined(USE_ROCM) || defined(HIP_FP8_TYPE_E4M3)
+      output_arr[j] = static_cast<DST_DTYPE>(val);
+#else
+      output_arr[j] = c10::Float8_e4m3fnuz(
+          __hip_cvt_float_to_fp8(val, fp8::fp8_type::__default_saturation, fp8::fp8_type::__default_interpret),
+          c10::Float8_e4m3fnuz::from_bits());
+#endif
+    }
+
+    if constexpr (kVecSize == 16) {
+      *(uint4*)(token_output + i * kVecSize) = *(uint4*)output_arr;
+    } else {
+      // Use element-wise copy for vector size 8 to ensure correctness
+      for (int k = 0; k < kVecSize; ++k) {
+        token_output[i * kVecSize + k] = output_arr[k];
+      }
+    }
+  }
+}
+
+void sgl_per_token_quant_fp8(torch::Tensor input, torch::Tensor output_q, torch::Tensor output_s) {
+  CHECK_INPUT(input);
+  CHECK_INPUT(output_q);
+  CHECK_INPUT(output_s);
+  const auto input_sizes = input.sizes();
+  const int64_t num_tokens = input_sizes[0];
+  const int64_t hidden_dim = input_sizes[1];
+  TORCH_CHECK(hidden_dim % 8 == 0, "Hidden dimension must be divisible by 8, but got ", hidden_dim);
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  const int TOKENS_PER_CTA = 8;
+  const bool use_warp_kernel = (num_tokens >= sm_count * 2 * TOKENS_PER_CTA);
+  const bool use_vec16 = (hidden_dim % 16 == 0);
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), scalar_t, [&] {
+    if (use_warp_kernel) {
+      // -------- warp‑local ---------------------------------------------------
+      constexpr int THREADS = TOKENS_PER_CTA * kWarpSize;  // 256
+      dim3 grid((num_tokens + TOKENS_PER_CTA - 1) / TOKENS_PER_CTA);
+      dim3 block(THREADS);
+
+      if (use_vec16) {
+        per_token_quant_fp8_kernel<scalar_t, __nv_fp8_e4m3, TOKENS_PER_CTA, 16><<<grid, block, 0, stream>>>(
+            static_cast<const scalar_t*>(input.data_ptr()),
+            static_cast<__nv_fp8_e4m3*>(output_q.data_ptr()),
+            static_cast<float*>(output_s.data_ptr()),
+            hidden_dim,
+            num_tokens);
+      } else {
+        per_token_quant_fp8_kernel<scalar_t, __nv_fp8_e4m3, TOKENS_PER_CTA, 8><<<grid, block, 0, stream>>>(
+            static_cast<const scalar_t*>(input.data_ptr()),
+            static_cast<__nv_fp8_e4m3*>(output_q.data_ptr()),
+            static_cast<float*>(output_s.data_ptr()),
+            hidden_dim,
+            num_tokens);
+      }
+    } else {
+      // -------- baseline -----------------------------------------------------
+      constexpr int THREADS = 256;
+      dim3 grid(num_tokens);
+      dim3 block(THREADS);
+
+      if (use_vec16) {
+        per_token_quant_fp8_small_batch_kernel<scalar_t, __nv_fp8_e4m3, 16><<<grid, block, 0, stream>>>(
+            static_cast<const scalar_t*>(input.data_ptr()),
+            static_cast<__nv_fp8_e4m3*>(output_q.data_ptr()),
+            static_cast<float*>(output_s.data_ptr()),
+            hidden_dim,
+            num_tokens);
+      } else {
+        per_token_quant_fp8_small_batch_kernel<scalar_t, __nv_fp8_e4m3, 8><<<grid, block, 0, stream>>>(
+            static_cast<const scalar_t*>(input.data_ptr()),
+            static_cast<__nv_fp8_e4m3*>(output_q.data_ptr()),
+            static_cast<float*>(output_s.data_ptr()),
+            hidden_dim,
+            num_tokens);
+      }
+    }
+    return true;
+  });
+}
diff --git a/sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu b/sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu
new file mode 100644
index 000000000..79180737f
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/qserve_w4a8_per_chn_gemm.cu
@@ -0,0 +1,710 @@
+// Implemented by Haotian Tang and Shang Yang.
+// @article{lin2024qserve,
+//   title={QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving},
+//   author={Lin*, Yujun and Tang*, Haotian and Yang*, Shang and Zhang, Zhekai and Xiao, Guangxuan and Gan, Chuang and
+//   Han, Song}, journal={arXiv preprint arXiv:2405.04532}, year={2024}
+// }
+// @article{yang2025lserve,
+//   title={LServe: Efficient Long-sequence LLM Serving with Unified Sparse Attention},
+//   author={Yang*, Shang and Guo*, Junxian and Tang, Haotian and Hu, Qinghao and Xiao, Guangxuan and Tang, Jiaming and
+//   Lin, Yujun and Liu, Zhijian and Lu, Yao and Han, Song}, year={2025}
+// }
+
+// Adapted from https://github.com/mit-han-lab/omniserve/blob/main/kernels/csrc/qgemm/w4a8_per_chn/gemm_cuda.cu
+
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp16.h>
+#include <cuda_pipeline_primitives.h>
+#include <torch/all.h>
+
+#include "utils.h"
+
+#define OP_M 16
+#define OP_N 8
+#define OP_K 32
+#define INTRIN_M 16
+#define INTRIN_N 16
+#define INTRIN_K 32
+#define WARP_SIZE 32
+#define SMEM_PAD_A 0
+#define SMEM_PAD_B 0
+#define PACK_SIZE 16
+#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 4)
+#define L2_CACHEHINT(size) ".L2::" #size "B"
+#else
+#define L2_CACHEHINT(size)
+#endif
+
+#define KERNEL_LAUNCH_CODE                                                                                   \
+  constexpr int NUM_WARPS = (CTA_M / WARP_M) * (CTA_N / WARP_N) * (CTA_K / WARP_K);                          \
+  constexpr int SCALES_SMEM_SIZE = (G >= CTA_K) ? (CTA_N * STAGES * 2) : (CTA_N * (CTA_K / G) * STAGES * 2); \
+  constexpr int kSmemByteSize =                                                                              \
+      ((CTA_M * (CTA_K + SMEM_PAD_A) + CTA_N * (CTA_K + SMEM_PAD_B) / 2) * STAGES + SCALES_SMEM_SIZE) *      \
+      sizeof(int8_t);                                                                                        \
+  if (kSmemByteSize >= 99 * 1024) {                                                                          \
+    printf(                                                                                                  \
+        "This kernel requires %d Bytes of shared memory, which exceeds "                                     \
+        "device limit.\n",                                                                                   \
+        kSmemByteSize);                                                                                      \
+    return;                                                                                                  \
+  }                                                                                                          \
+  int num_blocks_m = (num_out_feats + CTA_M - 1) / CTA_M;                                                    \
+  int num_blocks_n = num_out_channels / CTA_N / 1;                                                           \
+  const int log_tile = get_log_tile<8>((num_out_feats + CTA_M - 1) / CTA_M);                                 \
+  const int tile_shift = 1 << log_tile;                                                                      \
+  dim3 num_blocks(num_blocks_n* tile_shift, (num_blocks_m + tile_shift - 1) / tile_shift);                   \
+  dim3 threads_per_block(WARP_SIZE, NUM_WARPS);                                                              \
+  auto kernel_func = dense_kernel0<CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, STAGES, G>;                  \
+  cudaFuncSetAttribute(kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemByteSize);             \
+  kernel_func<<<num_blocks, threads_per_block, kSmemByteSize, stream>>>(                                     \
+      in_feats, kernel, wscales, ascales, w_szs, a_ssums, out_feats, num_in_feats, num_out_channels, num_in_channels);
+
+template <int N>
+__inline__ __host__ __device__ int get_log_tile(int n) {
+  if (N >= 8 && n >= 6)
+    return 3;
+  else if (N >= 4 && n >= 3)
+    return 2;
+  else if (N >= 2 && n >= 2)
+    return 1;
+  else
+    return 0;
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+__inline__ __device__ uint2 get_block_idx_mapping(int blockIdx_x, int blockIdx_y, int log_tile) {
+  return make_uint2((blockIdx_x >> log_tile), (blockIdx_y << log_tile) + ((blockIdx_x) & ((1 << (log_tile)) - 1)));
+}
+
+__inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const* const ptr) {
+  uint32_t smem_int_ptr;
+
+  asm("{.reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, "
+      "smem_ptr; }\n"
+      : "=r"(smem_int_ptr)
+      : "l"(ptr));
+
+  return smem_int_ptr;
+}
+
+__inline__ __device__ void ldmatrix_m8n8_x4_b16(int8_t* shared_warp, int ax0_0, uint32_t addr) {
+  __asm__ __volatile__(
+      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
+      "{%0, %1, %2, %3}, [%4];"
+      : "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[0]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[1]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[2]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[3])
+      : "r"(addr));
+}
+
+__inline__ __device__ void ldmatrix_m8n8_x4_trans_b16(int8_t* shared_warp, int ax0_0, uint32_t addr) {
+  __asm__ __volatile__(
+      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
+      "{%0, %1, %2, %3}, [%4];"
+      : "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[0]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[1]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[2]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[3])
+      : "r"(addr));
+}
+
+// function from lmdeploy
+__inline__ __device__ void cp_async_cg_A(uint32_t smem_int_ptr, const uint4* __restrict__ src, bool mask) {
+  const int cp_size = 16;
+  asm volatile("{"
+                "  .reg .pred p;"
+                "  setp.ne.b32 p, %0, 0;"
+                "  @p cp.async.cg.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;"
+                "}" ::"r"((int)mask),
+                "r"(smem_int_ptr),
+                "l"(src),
+                "n"(cp_size));
+}
+
+__device__ __inline__ void mma_m16n8k32(void* C_warp, void* A_shared_warp, void* B_shared_warp) {
+  __asm__ __volatile__(
+      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32"
+      "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};"
+      : "=r"(((int*)C_warp)[0]), "=r"(((int*)C_warp)[1]), "=r"(((int*)C_warp)[2]), "=r"(((int*)C_warp)[3])
+      : "r"(((unsigned*)A_shared_warp)[0]),
+        "r"(((unsigned*)A_shared_warp)[1]),
+        "r"(((unsigned*)A_shared_warp)[2]),
+        "r"(((unsigned*)A_shared_warp)[3]),
+        "r"(((unsigned*)B_shared_warp)[0]),
+        "r"(((unsigned*)B_shared_warp)[1]),
+        "r"(((int*)C_warp)[0]),
+        "r"(((int*)C_warp)[1]),
+        "r"(((int*)C_warp)[2]),
+        "r"(((int*)C_warp)[3]));
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
+__device__ __inline__ void global_to_share_one_stage_A(
+    int8_t* src,
+    int8_t* dst,
+    int global_ncols,
+    int cta_offset_m,
+    int cta_offset_n,
+    int global_iter_k,
+    int shared_iter_k,
+    bool mask,
+    bool* preds) {
+  constexpr int total_global_iters = (CTA_M * CTA_K) / PACK_SIZE / CTA_SIZE;
+  constexpr int partial_global_iters = total_global_iters / SHARED_K_ITERS;
+  constexpr int cta_step_m_or_n = (CTA_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int warp_step_m_or_n = (WARP_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int threads_per_row = CTA_K / PACK_SIZE;
+  constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
+  int8_t* dst_hoisted = dst;
+  int8_t* src_hoisted = src + global_iter_k * CTA_K;
+
+  if (mask) {
+#pragma unroll
+    for (int _global_iter = 0; _global_iter < partial_global_iters; ++_global_iter) {
+      int global_iter = shared_iter_k * partial_global_iters + _global_iter;
+
+      void* dst_ptr = (void*)(dst_hoisted + global_iter * cta_step_m_or_n * kSmemCol);
+      uint4* src_ptr = (uint4*)(src_hoisted + global_iter * cta_step_m_or_n * global_ncols);
+      // *dst_ptr = *src_ptr;
+      if constexpr (STAGES > 1) {
+        uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
+        cp_async_cg_A(addr, src_ptr, preds[global_iter]);
+      } else {
+        if (preds[global_iter]) *(uint4*)dst_ptr = *src_ptr;
+      }
+    }
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
+__device__ __inline__ void global_to_share_one_stage_B(
+    int8_t* src,
+    int8_t* dst,
+    int global_ncols,
+    int cta_offset_m,
+    int cta_offset_n,
+    int global_iter_k,
+    int shared_iter_k,
+    bool mask) {
+  constexpr int total_global_iters = (CTA_N * CTA_K) / 32 / CTA_SIZE;
+  constexpr int NUM_WARPS = CTA_SIZE / WARP_SIZE;
+  constexpr int warps_per_row = CTA_K / 32;
+  constexpr int cta_step_m_or_n = NUM_WARPS / warps_per_row;
+  constexpr int kSmemCol = CTA_K;
+  int8_t* dst_hoisted = dst;
+  int8_t* src_hoisted = src + global_iter_k * CTA_K * PACK_SIZE;
+
+#pragma unroll
+  for (int global_iter = 0; global_iter < total_global_iters; ++global_iter) {
+    void* dst_ptr = (void*)(dst_hoisted + global_iter * cta_step_m_or_n * kSmemCol * PACK_SIZE);
+    uint4* src_ptr = (uint4*)(src_hoisted + global_iter * cta_step_m_or_n * global_ncols * PACK_SIZE);
+    if constexpr (STAGES > 1) {
+      uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
+      cp_async_cg_A(addr, src_ptr, mask);
+    } else {
+      if (mask) *(uint4*)dst_ptr = *src_ptr;
+    }
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES, int G>
+__device__ __inline__ void global_to_share_one_stage_zeros(
+    int8_t* src,
+    int8_t* dst,
+    int global_ncols,
+    int cta_offset_m,
+    int cta_offset_n,
+    int global_iter_k,
+    int shared_iter_k,
+    bool mask) {
+  constexpr int threads_needed = CTA_N / PACK_SIZE / 1;
+  constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
+  constexpr int total_global_iters = CTA_N / PACK_SIZE / threads_used;
+  constexpr int threads_per_row = CTA_N / PACK_SIZE;
+  constexpr int kSmemCol = CTA_N;
+  bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
+  int g_idx = global_iter_k * CTA_K / G;
+
+  void* dst_ptr = (void*)(dst + (threadIdx.x % threads_per_row) * PACK_SIZE);
+  uint4* src_ptr = (uint4*)(src + g_idx * global_ncols + cta_offset_n + (threadIdx.x % threads_per_row) * PACK_SIZE);
+  if (STAGES > 1) {
+    uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
+    cp_async_cg_A(addr, src_ptr, local_mask);
+  } else {
+    if (local_mask) {
+      *(uint4*)dst_ptr = *src_ptr;
+    }
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES>
+__device__ __inline__ void
+share_to_reg_one_stage_A(int8_t* src, int8_t* dst, int warp_offset_m, int warp_offset_n, int k_0_1, int shared_iters) {
+  constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
+  int ld_col = (k_0_1 * INTRIN_K + (threadIdx.x / 16) * 16) / PACK_SIZE;
+
+  for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter) {
+    int ld_row = warp_offset_m + shared_iter * INTRIN_M + (threadIdx.x % 16);
+    int ld_col_swizzled = ld_col ^ (ld_row / 2) & 3;
+    void* addr_ptr = (void*)(src + ld_row * kSmemCol + ld_col_swizzled * PACK_SIZE);
+    uint32_t addr = cast_smem_ptr_to_uint(addr_ptr);
+    ldmatrix_m8n8_x4_b16(dst, shared_iter, addr);
+  }
+}
+
+template <int WARP_K, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES, int G>
+__device__ __inline__ void share_to_reg_one_stage_B(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* zeros,
+    int8_t* scales_i8,
+    int warp_offset_m,
+    int warp_offset_n,
+    int k_0_0,
+    int k_0_1,
+    int shared_iters) {
+  constexpr int kSmemCol = CTA_K + SMEM_PAD_B;
+#pragma unroll
+  for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter) {
+    uint4 loaded =
+        *((uint4*)(src) + warp_offset_n / 32 * kSmemCol + shared_iter * 32 / 32 * kSmemCol + k_0_1 * INTRIN_K +
+          threadIdx.x);
+
+    auto ptr = (uint32_t*)dst + shared_iter * 8;
+    ptr[0] = loaded.x & 0x0F0F0F0F;
+    ptr[4] = (loaded.x & 0xF0F0F0F0) >> 4;
+    ptr[2] = loaded.y & 0x0F0F0F0F;
+    ptr[6] = (loaded.y & 0xF0F0F0F0) >> 4;
+    ptr[1] = loaded.z & 0x0F0F0F0F;
+    ptr[5] = (loaded.z & 0xF0F0F0F0) >> 4;
+    ptr[3] = loaded.w & 0x0F0F0F0F;
+    ptr[7] = (loaded.w & 0xF0F0F0F0) >> 4;
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int WARP_M, int WARP_N, int WARP_K, int STAGES, int G>
+__global__ void dense_kernel0(
+    int8_t* __restrict__ A,
+    int8_t* __restrict__ B,
+    half2* __restrict__ wscales,
+    half* __restrict__ ascales,
+    half2* __restrict__ w_szs,
+    half* __restrict__ a_ssums,
+    half* __restrict__ C,
+    int M,
+    int64_t N,
+    int64_t K) {
+  constexpr int SPLITK = 1;
+  constexpr int NUM_WARPS_MN = CTA_M / WARP_M * CTA_N / WARP_N;
+  constexpr int NUM_WARPS = NUM_WARPS_MN * CTA_K / WARP_K;
+  constexpr int CTA_SIZE = NUM_WARPS * WARP_SIZE;
+  constexpr int CTA_SIZE_MN = NUM_WARPS_MN * WARP_SIZE;
+  constexpr int SLICES = CTA_K / WARP_K;
+  int num_blocks_n = (N + CTA_N - 1) / CTA_N;
+  int num_blocks_m = (M + CTA_M - 1) / CTA_M;
+
+  int blockIdx_n = blockIdx.x;
+  int blockIdx_m = blockIdx.y;
+  const int log_tile = get_log_tile<8>((M + CTA_M - 1) / CTA_M);
+  const uint2 block_idx_mapping = get_block_idx_mapping(blockIdx_n, blockIdx_m, log_tile);
+  blockIdx_n = block_idx_mapping.x;
+  blockIdx_m = block_idx_mapping.y;
+
+  int C_warp[CTA_M * CTA_N / CTA_SIZE_MN];
+  constexpr int kSmemPadKA = CTA_K + SMEM_PAD_A;
+  constexpr int kSmemPadKB = CTA_K + SMEM_PAD_B;
+  constexpr int kSmemSizeAPerStage = CTA_M * kSmemPadKA;
+  constexpr int kSmemSizeBPerStage = CTA_N * kSmemPadKB / 2;
+  constexpr int kSmemSizeA = kSmemSizeAPerStage * STAGES;
+  constexpr int kSmemSizeB = kSmemSizeBPerStage * STAGES;
+
+  constexpr int scales_load_interval = G >= CTA_K ? G / CTA_K : 1;
+  constexpr int scales_per_load = G < CTA_K ? CTA_K / G : 1;
+  constexpr int kSmemSizeScales = CTA_N * STAGES;
+
+  extern __shared__ int8_t mem_shared[];
+  int8_t* A_shared = mem_shared;
+
+  int8_t* B_shared = mem_shared + kSmemSizeA;
+  int8_t* zeros_shared = mem_shared + kSmemSizeA + kSmemSizeB;
+  int8_t* scales_i8_shared = mem_shared + kSmemSizeA + kSmemSizeB + kSmemSizeScales;
+
+  int8_t A_shared_warp_[2][WARP_M * WARP_K / WARP_SIZE];
+  int8_t B_shared_warp_[2][WARP_N * WARP_K / WARP_SIZE];
+  constexpr int A_total_global_iters = (CTA_M * CTA_K) / PACK_SIZE / CTA_SIZE;
+  constexpr int B_total_global_iters = (CTA_N * CTA_K) / PACK_SIZE / CTA_SIZE;
+  constexpr int A_src_step_m = (CTA_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int A_warp_step_m = (WARP_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int A_threads_per_row = CTA_K / PACK_SIZE;
+
+  constexpr int B_warps_per_row = CTA_K / 32;
+  constexpr int B_src_step_n = NUM_WARPS / B_warps_per_row;
+
+  int cta_offset_m = blockIdx_m * CTA_M;
+  int cta_offset_n = blockIdx_n * CTA_N;
+  int warp_mn = threadIdx.y % NUM_WARPS_MN;
+  int slice_id = threadIdx.y / NUM_WARPS_MN;
+  int warp_offset_m = (warp_mn % (CTA_M / WARP_M)) * WARP_M;
+  int warp_offset_n = (warp_mn / (CTA_M / WARP_M)) * WARP_N;
+  int warp_offset_k = slice_id * WARP_K;
+
+  for (int i = 0; i < CTA_M * CTA_N / CTA_SIZE_MN; i++)
+    C_warp[i] = 0;
+
+  int gemm_iters = (K + CTA_K - 1) / CTA_K;
+  int k_0_0_ld = 0;
+  int k_0_0 = 0;
+  constexpr int prologue_stages = STAGES == 1 ? 1 : STAGES - 1;
+  int A_hoisted_row = threadIdx.y * A_warp_step_m + (threadIdx.x / A_threads_per_row);
+  int A_hoisted_col = (threadIdx.x % A_threads_per_row);
+  int A_hoisted_col_swizzled = A_hoisted_col ^ (A_hoisted_row / 2) & 3;
+
+  int8_t* A_shared_hoisted = A_shared + A_hoisted_row * kSmemPadKA + A_hoisted_col_swizzled * PACK_SIZE;
+  int8_t* B_shared_hoisted = B_shared + (threadIdx.y % B_warps_per_row) * 32 * PACK_SIZE +
+                             (threadIdx.y / B_warps_per_row) * kSmemPadKB * PACK_SIZE + threadIdx.x * PACK_SIZE;
+  int8_t* A_hoisted = A + cta_offset_m * K + A_hoisted_row * K + A_hoisted_col * PACK_SIZE;
+  int8_t* B_hoisted = B + cta_offset_n / 32 * K * PACK_SIZE + (threadIdx.y % B_warps_per_row) * 32 * PACK_SIZE +
+                      (threadIdx.y / B_warps_per_row) * K * PACK_SIZE + threadIdx.x * PACK_SIZE;
+
+  bool A_g2s_preds[A_total_global_iters];
+#pragma unroll
+  for (int i = 0; i < A_total_global_iters; i++) {
+    A_g2s_preds[i] = (cta_offset_m + A_hoisted_row + i * A_src_step_m) < M;
+  }
+
+  int* C_shared = reinterpret_cast<int*>(mem_shared);
+
+#pragma unroll
+  for (k_0_0_ld = 0; k_0_0_ld < prologue_stages; ++k_0_0_ld) {
+    global_to_share_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(
+        A_hoisted,
+        A_shared_hoisted + k_0_0_ld * kSmemSizeAPerStage,
+        K,
+        cta_offset_m,
+        cta_offset_n,
+        k_0_0_ld,
+        0,
+        true,
+        A_g2s_preds);
+    global_to_share_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(
+        B_hoisted, B_shared_hoisted + k_0_0_ld * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, k_0_0_ld, 0, true);
+
+    if constexpr (STAGES > 1) __pipeline_commit();
+  }
+  if constexpr (STAGES > 1) __pipeline_wait_prior(STAGES - 2);
+  __syncthreads();
+
+  share_to_reg_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES>(
+      A_shared + warp_offset_k, A_shared_warp_[0], warp_offset_m, warp_offset_n, 0, WARP_M / INTRIN_M);
+  share_to_reg_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+      B_shared + warp_offset_k * PACK_SIZE,
+      B_shared_warp_[0],
+      zeros_shared,
+      scales_i8_shared,
+      warp_offset_m,
+      warp_offset_n,
+      0,
+      0,
+      WARP_N / 32);
+  constexpr int SHARED_K_ITERS = WARP_K / INTRIN_K;
+
+  for (; k_0_0 < gemm_iters; ++k_0_0, ++k_0_0_ld) {
+    int ld_stage = k_0_0_ld % STAGES;
+    int compute_stage = k_0_0 % STAGES;
+    int8_t* A_shared_this_compute_stage;
+    int8_t* B_shared_this_compute_stage;
+    int8_t* zeros_shared_this_compute_stage;
+    int8_t* scales_i8_shared_this_compute_stage;
+
+    for (int iter_k = 0; iter_k < SHARED_K_ITERS; ++iter_k) {
+      A_shared_this_compute_stage = A_shared + compute_stage * kSmemSizeAPerStage + warp_offset_k;
+      B_shared_this_compute_stage = B_shared + compute_stage * kSmemSizeBPerStage + warp_offset_k * PACK_SIZE;
+      zeros_shared_this_compute_stage = zeros_shared + (compute_stage)*CTA_N;
+      scales_i8_shared_this_compute_stage = scales_i8_shared + (compute_stage)*CTA_N;
+
+      share_to_reg_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES>(
+          A_shared_this_compute_stage,
+          A_shared_warp_[(iter_k + 1) % 2],
+          warp_offset_m,
+          warp_offset_n,
+          (iter_k + 1) % SHARED_K_ITERS,
+          WARP_M / INTRIN_M);
+      share_to_reg_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+          B_shared_this_compute_stage,
+          B_shared_warp_[(iter_k + 1) % 2],
+          zeros_shared_this_compute_stage,
+          scales_i8_shared_this_compute_stage,
+          warp_offset_m,
+          warp_offset_n,
+          k_0_0 + (iter_k == SHARED_K_ITERS - 1),
+          (iter_k + 1) % SHARED_K_ITERS,
+          WARP_N / 32);
+      int8_t* A_shared_warp = A_shared_warp_[iter_k % 2];
+      int8_t* B_shared_warp = B_shared_warp_[iter_k % 2];
+
+      for (int j_0_4 = 0; j_0_4 < WARP_N / INTRIN_N; ++j_0_4) {
+        for (int i_0_3 = 0; i_0_3 < WARP_M / INTRIN_M; ++i_0_3) {
+          mma_m16n8k32(
+              (void*)(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8),
+              (void*)(A_shared_warp + i_0_3 * 16),
+              (void*)(B_shared_warp + j_0_4 * 16));
+          mma_m16n8k32(
+              (void*)(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8 + 4),
+              (void*)(A_shared_warp + i_0_3 * 16),
+              (void*)(B_shared_warp + j_0_4 * 16 + 8));
+        }
+      }
+
+      if (iter_k < SHARED_K_ITERS - 1) {
+        if constexpr (STAGES == 1) __syncthreads();
+        global_to_share_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            A_hoisted,
+            A_shared_hoisted + ld_stage * kSmemSizeAPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k,
+            k_0_0_ld < gemm_iters,
+            A_g2s_preds);
+        global_to_share_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            B_hoisted,
+            B_shared_hoisted + ld_stage * kSmemSizeBPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k,
+            k_0_0_ld < gemm_iters);
+      }
+
+      if (iter_k == SHARED_K_ITERS - 2) {
+        if constexpr (STAGES == 1 && SHARED_K_ITERS > 2) {
+          __syncthreads();
+        }
+        global_to_share_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            A_hoisted,
+            A_shared_hoisted + ld_stage * kSmemSizeAPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k + 1,
+            k_0_0_ld < gemm_iters,
+            A_g2s_preds);
+        global_to_share_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            B_hoisted,
+            B_shared_hoisted + ld_stage * kSmemSizeBPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k + 1,
+            k_0_0_ld < gemm_iters);
+        if constexpr (STAGES > 1) {
+          __pipeline_commit();
+          __pipeline_wait_prior(STAGES - 2);
+        }
+        compute_stage = (k_0_0 + 1) % STAGES;
+        __syncthreads();
+      }
+    }
+  }
+
+  __pipeline_commit();
+  __pipeline_wait_prior(0);
+  __syncthreads();
+
+  if constexpr (SLICES > 1) {
+#pragma unroll
+    for (int z = 0; z < SLICES; ++z) {
+      if (slice_id == z) {
+#pragma unroll
+        for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1) {
+#pragma unroll
+          for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1) {
+#pragma unroll
+            for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; ++local_id) {
+              if (z > 0) {
+                C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id] += C_shared
+                    [warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 +
+                     ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) +
+                     (threadIdx.x % 4) * 2];
+              }
+              C_shared
+                  [warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 +
+                   ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) +
+                   (threadIdx.x % 4) * 2] = C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id];
+            };
+          }
+        }
+      }
+      __syncthreads();
+    }
+    if (slice_id == 0) {
+#pragma unroll
+      for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1) {
+#pragma unroll
+        for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1) {
+#pragma unroll
+          for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; ++local_id) {
+            C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id] = C_shared
+                [warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 +
+                 ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) +
+                 (threadIdx.x % 4) * 2];
+          };
+        }
+      }
+    }
+  }
+
+  int row_wb_thd = cta_offset_m + warp_offset_m + (threadIdx.x / 4);
+  int col_wb_thd = cta_offset_n + warp_offset_n + (threadIdx.x % 4) * 2;
+  if (slice_id == 0) {
+    for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1) {
+      int row_wb_1 = row_wb_thd + ax0_0_1 * OP_M;
+      for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1) {
+        int col_wb_1 = col_wb_thd + ax1_0_1 * 16;
+        int* C_warp_local = C_warp + ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8;
+        for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; local_id += 2) {
+          int row_wb = row_wb_1 + (local_id % 4) / 2 * 8;
+          if (row_wb < M) {
+            int col_wb = col_wb_1 + (local_id / 4) * 8 + (local_id % 2);
+            float2 wscale = __half22float2(*(wscales + col_wb / 2));
+            float2 w_sz = __half22float2(*(w_szs + col_wb / 2));
+            float ascale = __half2float(ascales[row_wb]);
+            float a_ssum = __half2float(a_ssums[row_wb]);
+            float2 psums =
+                make_float2(__int2float_rn(C_warp_local[local_id]), __int2float_rn(C_warp_local[local_id + 1]));
+            psums.x = psums.x * wscale.x * ascale - w_sz.x * a_ssum;
+            psums.y = psums.y * wscale.y * ascale - w_sz.y * a_ssum;
+            *reinterpret_cast<half2*>(C + row_wb * N + col_wb) = __float22half2_rn(psums);
+          }
+        };
+      }
+    }
+  }
+}
+#else
+template <int CTA_M, int CTA_N, int CTA_K, int WARP_M, int WARP_N, int WARP_K, int STAGES, int G>
+__global__ void dense_kernel0(
+    int8_t* __restrict__ A,
+    int8_t* __restrict__ B,
+    half2* __restrict__ wscales,
+    half* __restrict__ ascales,
+    half2* __restrict__ w_szs,
+    half* __restrict__ a_ssums,
+    half* __restrict__ C,
+    int M,
+    int64_t N,
+    int64_t K) {
+  // Not implemented for SM < 800
+  assert(false);
+  return;
+}
+#endif
+
+void qserve_w4a8_per_chn_gemm(
+    const torch::Tensor& _in_feats,
+    const torch::Tensor& _kernel,
+    const torch::Tensor& _wscales,
+    const torch::Tensor& _ascales,
+    const torch::Tensor& _w_szs,
+    const torch::Tensor& _a_ssums,
+    torch::Tensor& _out_feats) {
+  // Check input tensor
+  TORCH_CHECK(_in_feats.is_cuda(), "_in_feats must be a CUDA tensor");
+  TORCH_CHECK(_in_feats.dim() == 2, "_in_feats must be a 2D tensor");
+  TORCH_CHECK(_in_feats.is_contiguous(), "_in_feats must be contiguous");
+  TORCH_CHECK(_in_feats.scalar_type() == torch::kInt8, "_in_feats must be int8");
+  // Check kernel tensor
+  TORCH_CHECK(_kernel.is_cuda(), "_kernel must be a CUDA tensor");
+  TORCH_CHECK(_kernel.dim() == 2, "_kernel must be a 2D tensor");
+  TORCH_CHECK(_kernel.is_contiguous(), "_kernel must be contiguous");
+  TORCH_CHECK(_kernel.scalar_type() == torch::kInt8, "_kernel must be int8");
+  // Check output tensor
+  TORCH_CHECK(_out_feats.is_cuda(), "_out_feats must be a CUDA tensor");
+  TORCH_CHECK(_out_feats.is_contiguous(), "_out_feats must be contiguous");
+  TORCH_CHECK(_out_feats.scalar_type() == torch::kHalf, "_out_feats must be half");
+
+  int num_in_feats = _in_feats.size(0);
+  int num_in_channels = _in_feats.size(1);
+  int num_out_feats = _out_feats.size(-2);
+  int num_out_channels = _out_feats.size(-1);
+
+  // Check matmul shape
+  TORCH_CHECK(num_out_channels == _kernel.size(0), "num_out_channels must be equal to _kernel.size(0)");
+  TORCH_CHECK(num_in_feats == num_out_feats, "num_in_feats must be equal to num_out_feats");
+
+  // Check _ascales
+  TORCH_CHECK(_ascales.is_cuda(), "_ascales must be a CUDA tensor");
+  TORCH_CHECK(_ascales.is_contiguous(), "_ascales must be contiguous");
+  TORCH_CHECK(_ascales.scalar_type() == torch::kHalf, "_ascales must be half");
+  TORCH_CHECK(_ascales.numel() == num_in_feats, "_ascales must have num_in_feats elements");
+
+  // Check _wscales
+  TORCH_CHECK(_wscales.is_cuda(), "_wscales must be a CUDA tensor");
+  TORCH_CHECK(_wscales.is_contiguous(), "_wscales must be contiguous");
+  TORCH_CHECK(_wscales.scalar_type() == torch::kHalf, "_wscales must be half");
+  TORCH_CHECK(_wscales.numel() == num_out_channels, "_wscales must have num_out_channels elements");
+
+  // Check _w_szs
+  TORCH_CHECK(_w_szs.is_cuda(), "_w_szs must be a CUDA tensor");
+  TORCH_CHECK(_w_szs.is_contiguous(), "_w_szs must be contiguous");
+  TORCH_CHECK(_w_szs.scalar_type() == torch::kHalf, "_w_szs must be half");
+  TORCH_CHECK(_w_szs.numel() == num_out_channels, "_w_szs must have num_out_channels elements");
+
+  // Check _a_ssums
+  TORCH_CHECK(_a_ssums.is_cuda(), "_a_ssums must be a CUDA tensor");
+  TORCH_CHECK(_a_ssums.is_contiguous(), "_a_ssums must be contiguous");
+  TORCH_CHECK(_a_ssums.scalar_type() == torch::kHalf, "_a_ssums must be half");
+  TORCH_CHECK(_a_ssums.numel() == num_in_feats, "_a_ssums must have num_in_feats elements");
+
+  auto in_feats = reinterpret_cast<int8_t*>(_in_feats.data_ptr<int8_t>());
+  auto kernel = reinterpret_cast<int8_t*>(_kernel.data_ptr<int8_t>());
+  auto w_szs = reinterpret_cast<half2*>(_w_szs.data_ptr());
+  auto a_ssums = reinterpret_cast<half*>(_a_ssums.data_ptr());
+  auto wscales = reinterpret_cast<half2*>(_wscales.data_ptr());
+  auto ascales = reinterpret_cast<half*>(_ascales.data_ptr());
+  auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr<at::Half>());
+  auto stream = at::cuda::getCurrentCUDAStream(_in_feats.get_device());
+
+  auto sm_version = getSMVersion();
+  if (sm_version >= 80) {
+    constexpr int G = 128;
+
+    if (num_out_feats > 256) {
+      constexpr int CTA_M = 128;
+      constexpr int CTA_N = 128;
+      constexpr int CTA_K = 64;
+      constexpr int WARP_M = 64;
+      constexpr int WARP_N = 32;
+      constexpr int WARP_K = 64;
+      constexpr int STAGES = 3;
+      KERNEL_LAUNCH_CODE
+    } else if (num_out_feats >= 128) {
+      constexpr int CTA_M = 64;
+      constexpr int CTA_N = 64;
+      constexpr int CTA_K = 64;
+      constexpr int WARP_M = 32;
+      constexpr int WARP_N = 32;
+      constexpr int WARP_K = 64;
+      constexpr int STAGES = 4;
+      KERNEL_LAUNCH_CODE
+    } else {
+      constexpr int CTA_M = 32;
+      constexpr int CTA_N = 64;
+      constexpr int CTA_K = 128;
+      constexpr int WARP_M = 32;
+      constexpr int WARP_N = 32;
+      constexpr int WARP_K = 64;
+      constexpr int STAGES = 3;
+      KERNEL_LAUNCH_CODE
+    }
+  } else {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false, "No implemented qserve_w4a8_per_chn_gemm for current compute capability: ", sm_version);
+  }
+  return;
+}
diff --git a/sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu b/sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu
new file mode 100644
index 000000000..a99a203e8
--- /dev/null
+++ b/sgl-kernel/csrc/gemm/qserve_w4a8_per_group_gemm.cu
@@ -0,0 +1,795 @@
+// Implemented by Haotian Tang and Shang Yang.
+// @article{lin2024qserve,
+//   title={QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving},
+//   author={Lin*, Yujun and Tang*, Haotian and Yang*, Shang and Zhang, Zhekai and Xiao, Guangxuan and Gan, Chuang and
+//   Han, Song}, journal={arXiv preprint arXiv:2405.04532}, year={2024}
+// }
+// @article{yang2025lserve,
+//   title={LServe: Efficient Long-sequence LLM Serving with Unified Sparse Attention},
+//   author={Yang*, Shang and Guo*, Junxian and Tang, Haotian and Hu, Qinghao and Xiao, Guangxuan and Tang, Jiaming and
+//   Lin, Yujun and Liu, Zhijian and Lu, Yao and Han, Song}, year={2025}
+// }
+
+// Adapted from https://github.com/mit-han-lab/omniserve/blob/main/kernels/csrc/qgemm/w4a8_per_group/gemm_cuda.cu
+
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp16.h>
+#include <cuda_pipeline_primitives.h>
+#include <torch/all.h>
+
+#include "utils.h"
+
+#define OP_M 16
+#define OP_N 8
+#define OP_K 32
+#define INTRIN_M 16
+#define INTRIN_N 16
+#define INTRIN_K 32
+#define WARP_SIZE 32
+#define SMEM_PAD_A 0
+#define SMEM_PAD_B 0
+#define PACK_SIZE 16
+#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 4)
+#define L2_CACHEHINT(size) ".L2::" #size "B"
+#else
+#define L2_CACHEHINT(size)
+#endif
+
+#define KERNEL_LAUNCH_CODE                                                                                   \
+  constexpr int NUM_WARPS = (CTA_M / WARP_M) * (CTA_N / WARP_N) * (CTA_K / WARP_K);                          \
+  constexpr int SCALES_SMEM_SIZE = (G >= CTA_K) ? (CTA_N * STAGES * 2) : (CTA_N * (CTA_K / G) * STAGES * 2); \
+  constexpr int kSmemByteSize =                                                                              \
+      ((CTA_M * (CTA_K + SMEM_PAD_A) + CTA_N * (CTA_K + SMEM_PAD_B) / 2) * STAGES + SCALES_SMEM_SIZE) *      \
+      sizeof(int8_t);                                                                                        \
+  if (kSmemByteSize >= 99 * 1024) {                                                                          \
+    printf(                                                                                                  \
+        "This kernel requires %d Bytes of shared memory, which exceeds "                                     \
+        "device limit.\n",                                                                                   \
+        kSmemByteSize);                                                                                      \
+    return;                                                                                                  \
+  }                                                                                                          \
+  int num_blocks_m = (num_out_feats + CTA_M - 1) / CTA_M;                                                    \
+  int num_blocks_n = num_out_channels / CTA_N / 1;                                                           \
+  const int log_tile = get_log_tile<8>((num_out_feats + CTA_M - 1) / CTA_M);                                 \
+  const int tile_shift = 1 << log_tile;                                                                      \
+  dim3 num_blocks(num_blocks_n* tile_shift, (num_blocks_m + tile_shift - 1) / tile_shift);                   \
+  dim3 threads_per_block(WARP_SIZE, NUM_WARPS);                                                              \
+  auto kernel_func = dense_kernel0<CTA_M, CTA_N, CTA_K, WARP_M, WARP_N, WARP_K, STAGES, G>;                  \
+  cudaFuncSetAttribute(kernel_func, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemByteSize);             \
+  kernel_func<<<num_blocks, threads_per_block, kSmemByteSize, stream>>>(                                     \
+      in_feats,                                                                                              \
+      kernel,                                                                                                \
+      zeros,                                                                                                 \
+      scales_i8,                                                                                             \
+      wscales,                                                                                               \
+      ascales,                                                                                               \
+      out_feats,                                                                                             \
+      num_in_feats,                                                                                          \
+      num_out_channels,                                                                                      \
+      num_in_channels);
+
+template <int N>
+__inline__ __host__ __device__ int get_log_tile(int n) {
+  if (N >= 8 && n >= 6)
+    return 3;
+  else if (N >= 4 && n >= 3)
+    return 2;
+  else if (N >= 2 && n >= 2)
+    return 1;
+  else
+    return 0;
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+__inline__ __device__ uint2 get_block_idx_mapping(int blockIdx_x, int blockIdx_y, int log_tile) {
+  return make_uint2((blockIdx_x >> log_tile), (blockIdx_y << log_tile) + ((blockIdx_x) & ((1 << (log_tile)) - 1)));
+}
+
+__inline__ __device__ uint32_t cast_smem_ptr_to_uint(void const* const ptr) {
+  uint32_t smem_int_ptr;
+
+  asm("{.reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, "
+      "smem_ptr; }\n"
+      : "=r"(smem_int_ptr)
+      : "l"(ptr));
+
+  return smem_int_ptr;
+}
+
+__inline__ __device__ void ldmatrix_m8n8_x4_b16(int8_t* shared_warp, int ax0_0, uint32_t addr) {
+  __asm__ __volatile__(
+      "ldmatrix.sync.aligned.m8n8.x4.shared.b16"
+      "{%0, %1, %2, %3}, [%4];"
+      : "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[0]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[1]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[2]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[3])
+      : "r"(addr));
+}
+
+__inline__ __device__ void ldmatrix_m8n8_x4_trans_b16(int8_t* shared_warp, int ax0_0, uint32_t addr) {
+  __asm__ __volatile__(
+      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16"
+      "{%0, %1, %2, %3}, [%4];"
+      : "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[0]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[1]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[2]),
+        "=r"(((unsigned*)(shared_warp + (ax0_0 * 16)))[3])
+      : "r"(addr));
+}
+
+// function from lmdeploy
+__inline__ __device__ void cp_async_cg_A(uint32_t smem_int_ptr, const uint4* __restrict__ src, bool mask) {
+  const int cp_size = 16;
+  asm volatile("{"
+                "  .reg .pred p;"
+                "  setp.ne.b32 p, %0, 0;"
+                "  @p cp.async.cg.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;"
+                "}" ::"r"((int)mask),
+                "r"(smem_int_ptr),
+                "l"(src),
+                "n"(cp_size));
+}
+
+__device__ __inline__ void mma_m16n8k32(void* C_warp, void* A_shared_warp, void* B_shared_warp) {
+  __asm__ __volatile__(
+      "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32"
+      "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};"
+      : "=r"(((int*)C_warp)[0]), "=r"(((int*)C_warp)[1]), "=r"(((int*)C_warp)[2]), "=r"(((int*)C_warp)[3])
+      : "r"(((unsigned*)A_shared_warp)[0]),
+        "r"(((unsigned*)A_shared_warp)[1]),
+        "r"(((unsigned*)A_shared_warp)[2]),
+        "r"(((unsigned*)A_shared_warp)[3]),
+        "r"(((unsigned*)B_shared_warp)[0]),
+        "r"(((unsigned*)B_shared_warp)[1]),
+        "r"(((int*)C_warp)[0]),
+        "r"(((int*)C_warp)[1]),
+        "r"(((int*)C_warp)[2]),
+        "r"(((int*)C_warp)[3]));
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
+__device__ __inline__ void global_to_share_one_stage_A(
+    int8_t* src,
+    int8_t* dst,
+    int global_ncols,
+    int cta_offset_m,
+    int cta_offset_n,
+    int global_iter_k,
+    int shared_iter_k,
+    bool mask,
+    bool* preds) {
+  constexpr int total_global_iters = (CTA_M * CTA_K) / PACK_SIZE / CTA_SIZE;
+  constexpr int partial_global_iters = total_global_iters / SHARED_K_ITERS;
+  constexpr int cta_step_m_or_n = (CTA_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int warp_step_m_or_n = (WARP_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int threads_per_row = CTA_K / PACK_SIZE;
+  constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
+  int8_t* dst_hoisted = dst;
+  int8_t* src_hoisted = src + global_iter_k * CTA_K;
+
+  if (mask) {
+#pragma unroll
+    for (int _global_iter = 0; _global_iter < partial_global_iters; ++_global_iter) {
+      int global_iter = shared_iter_k * partial_global_iters + _global_iter;
+      void* dst_ptr = (void*)(dst_hoisted + global_iter * cta_step_m_or_n * kSmemCol);
+      uint4* src_ptr = (uint4*)(src_hoisted + global_iter * cta_step_m_or_n * global_ncols);
+      if constexpr (STAGES > 1) {
+        uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
+        cp_async_cg_A(addr, src_ptr, preds[global_iter]);
+      } else {
+        if (preds[global_iter]) *(uint4*)dst_ptr = *src_ptr;
+      }
+    }
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int SHARED_K_ITERS, int STAGES>
+__device__ __inline__ void global_to_share_one_stage_B(
+    int8_t* src,
+    int8_t* dst,
+    int global_ncols,
+    int cta_offset_m,
+    int cta_offset_n,
+    int global_iter_k,
+    int shared_iter_k,
+    bool mask) {
+  constexpr int total_global_iters = (CTA_N * CTA_K) / 32 / CTA_SIZE;
+  constexpr int NUM_WARPS = CTA_SIZE / WARP_SIZE;
+  constexpr int warps_per_row = CTA_K / 32;
+  constexpr int cta_step_m_or_n = NUM_WARPS / warps_per_row;
+  constexpr int kSmemCol = CTA_K;
+  int8_t* dst_hoisted = dst;
+  int8_t* src_hoisted = src + global_iter_k * CTA_K * PACK_SIZE;
+
+#pragma unroll
+  for (int global_iter = 0; global_iter < total_global_iters; ++global_iter) {
+    void* dst_ptr = (void*)(dst_hoisted + global_iter * cta_step_m_or_n * kSmemCol * PACK_SIZE);
+    uint4* src_ptr = (uint4*)(src_hoisted + global_iter * cta_step_m_or_n * global_ncols * PACK_SIZE);
+    if constexpr (STAGES > 1) {
+      uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
+      cp_async_cg_A(addr, src_ptr, mask);
+    } else {
+      if (mask) *(uint4*)dst_ptr = *src_ptr;
+    }
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES, int G>
+__device__ __inline__ void global_to_share_one_stage_zeros(
+    int8_t* src,
+    int8_t* dst,
+    int global_ncols,
+    int cta_offset_m,
+    int cta_offset_n,
+    int global_iter_k,
+    int shared_iter_k,
+    bool mask) {
+  constexpr int threads_needed = CTA_N / PACK_SIZE / 1;
+  constexpr int threads_used = threads_needed < CTA_SIZE ? threads_needed : CTA_SIZE;
+  constexpr int total_global_iters = CTA_N / PACK_SIZE / threads_used;
+  constexpr int threads_per_row = CTA_N / PACK_SIZE;
+  constexpr int kSmemCol = CTA_N;
+  bool local_mask = mask & (threadIdx.y * WARP_SIZE + threadIdx.x < threads_used);
+  int g_idx = global_iter_k * CTA_K / G;
+
+  void* dst_ptr = (void*)(dst + (threadIdx.x % threads_per_row) * PACK_SIZE);
+  uint4* src_ptr = (uint4*)(src + g_idx * global_ncols + cta_offset_n + (threadIdx.x % threads_per_row) * PACK_SIZE);
+  if (STAGES > 1) {
+    uint32_t addr = cast_smem_ptr_to_uint(dst_ptr);
+    cp_async_cg_A(addr, src_ptr, local_mask);
+  } else {
+    if (local_mask) {
+      *(uint4*)dst_ptr = *src_ptr;
+    }
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES>
+__device__ __inline__ void
+share_to_reg_one_stage_A(int8_t* src, int8_t* dst, int warp_offset_m, int warp_offset_n, int k_0_1, int shared_iters) {
+  constexpr int kSmemCol = CTA_K + SMEM_PAD_A;
+  int ld_col = (k_0_1 * INTRIN_K + (threadIdx.x / 16) * 16) / PACK_SIZE;
+
+  for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter) {
+    int ld_row = warp_offset_m + shared_iter * INTRIN_M + (threadIdx.x % 16);
+    int ld_col_swizzled = ld_col ^ (ld_row / 2) & 3;
+    void* addr_ptr = (void*)(src + ld_row * kSmemCol + ld_col_swizzled * PACK_SIZE);
+    uint32_t addr = cast_smem_ptr_to_uint(addr_ptr);
+    ldmatrix_m8n8_x4_b16(dst, shared_iter, addr);
+  }
+}
+
+template <int WARP_K, int CTA_N, int CTA_K, int CTA_SIZE, int STAGES, int G>
+__device__ __inline__ void share_to_reg_one_stage_B(
+    int8_t* src,
+    int8_t* dst,
+    int8_t* zeros,
+    int8_t* scales_i8,
+    int warp_offset_m,
+    int warp_offset_n,
+    int k_0_0,
+    int k_0_1,
+    int shared_iters) {
+  constexpr int kSmemCol = CTA_K + SMEM_PAD_B;
+#pragma unroll
+  for (int shared_iter = 0; shared_iter < shared_iters; ++shared_iter) {
+    uint4 loaded =
+        *((uint4*)(src) + warp_offset_n / 32 * kSmemCol + shared_iter * 32 / 32 * kSmemCol + k_0_1 * INTRIN_K +
+          threadIdx.x);
+    uint32_t loaded_0 = loaded.x & 0x0F0F0F0F;
+    uint32_t loaded_4 = (loaded.x & 0xF0F0F0F0) >> 4;
+    uint32_t loaded_2 = loaded.y & 0x0F0F0F0F;
+    uint32_t loaded_6 = (loaded.y & 0xF0F0F0F0) >> 4;
+    uint32_t loaded_1 = loaded.z & 0x0F0F0F0F;
+    uint32_t loaded_5 = (loaded.z & 0xF0F0F0F0) >> 4;
+    uint32_t loaded_3 = loaded.w & 0x0F0F0F0F;
+    uint32_t loaded_7 = (loaded.w & 0xF0F0F0F0) >> 4;
+
+    auto ptr = (uint32_t*)dst + shared_iter * 8;
+    int scales_zeros_offset = warp_offset_n + (threadIdx.x / 4) * 4 + shared_iter * 32;
+    uint32_t packed_scales = *reinterpret_cast<uint32_t*>(scales_i8 + scales_zeros_offset);
+    uint32_t packed_zeros = *reinterpret_cast<uint32_t*>(zeros + scales_zeros_offset);
+
+    uint32_t scale_0 = packed_scales & 0xFF;
+    uint32_t zero_point_0 = __byte_perm(packed_zeros, 0, 0x00000000);
+    uint32_t ptr_0 = loaded_0 * scale_0;
+    uint32_t ptr_1 = loaded_1 * scale_0;
+    ptr[0] = __vadd4(ptr_0, zero_point_0);
+    ptr[1] = __vadd4(ptr_1, zero_point_0);
+
+    uint32_t scale_1 = (packed_scales & 0xFF00) >> 8;
+    uint32_t zero_point_1 = __byte_perm(packed_zeros, 0, 0x00001111);
+    uint32_t ptr_2 = loaded_2 * scale_1;
+    uint32_t ptr_3 = loaded_3 * scale_1;
+    ptr[2] = __vadd4(ptr_2, zero_point_1);
+    ptr[3] = __vadd4(ptr_3, zero_point_1);
+
+    uint32_t scale_2 = (packed_scales & 0xFF0000) >> 16;
+    uint32_t zero_point_2 = __byte_perm(packed_zeros, 0, 0x00002222);
+    uint32_t ptr_4 = loaded_4 * scale_2;
+    uint32_t ptr_5 = loaded_5 * scale_2;
+    ptr[4] = __vadd4(ptr_4, zero_point_2);
+    ptr[5] = __vadd4(ptr_5, zero_point_2);
+
+    uint32_t scale_3 = (packed_scales & 0xFF000000) >> 24;
+    uint32_t zero_point_3 = __byte_perm(packed_zeros, 0, 0x00003333);
+    uint32_t ptr_6 = loaded_6 * scale_3;
+    uint32_t ptr_7 = loaded_7 * scale_3;
+    ptr[6] = __vadd4(ptr_6, zero_point_3);
+    ptr[7] = __vadd4(ptr_7, zero_point_3);
+  }
+}
+
+template <int CTA_M, int CTA_N, int CTA_K, int WARP_M, int WARP_N, int WARP_K, int STAGES, int G>
+__global__ void dense_kernel0(
+    int8_t* __restrict__ A,
+    int8_t* __restrict__ B,
+    int8_t* __restrict__ zeros,
+    int8_t* __restrict__ scales_i8,
+    half2* __restrict__ wscales,
+    half* __restrict__ ascales,
+    half* __restrict__ C,
+    int M,
+    int64_t N,
+    int64_t K) {
+  constexpr int SPLITK = 1;
+  constexpr int NUM_WARPS_MN = CTA_M / WARP_M * CTA_N / WARP_N;
+  constexpr int NUM_WARPS = NUM_WARPS_MN * CTA_K / WARP_K;
+  constexpr int CTA_SIZE = NUM_WARPS * WARP_SIZE;
+  constexpr int CTA_SIZE_MN = NUM_WARPS_MN * WARP_SIZE;
+  constexpr int SLICES = CTA_K / WARP_K;
+  int num_blocks_n = (N + CTA_N - 1) / CTA_N;
+  int num_blocks_m = (M + CTA_M - 1) / CTA_M;
+
+  int blockIdx_n = blockIdx.x;
+  int blockIdx_m = blockIdx.y;
+  const int log_tile = get_log_tile<8>((M + CTA_M - 1) / CTA_M);
+  const uint2 block_idx_mapping = get_block_idx_mapping(blockIdx_n, blockIdx_m, log_tile);
+  blockIdx_n = block_idx_mapping.x;
+  blockIdx_m = block_idx_mapping.y;
+
+  int C_warp[CTA_M * CTA_N / CTA_SIZE_MN];
+  constexpr int kSmemPadKA = CTA_K + SMEM_PAD_A;
+  constexpr int kSmemPadKB = CTA_K + SMEM_PAD_B;
+  constexpr int kSmemSizeAPerStage = CTA_M * kSmemPadKA;
+  constexpr int kSmemSizeBPerStage = CTA_N * kSmemPadKB / 2;
+  constexpr int kSmemSizeA = kSmemSizeAPerStage * STAGES;
+  constexpr int kSmemSizeB = kSmemSizeBPerStage * STAGES;
+
+  constexpr int scales_load_interval = G >= CTA_K ? G / CTA_K : 1;
+  constexpr int scales_per_load = G < CTA_K ? CTA_K / G : 1;
+  constexpr int kSmemSizeScales = CTA_N * STAGES;
+
+  extern __shared__ int8_t mem_shared[];
+  int8_t* A_shared = mem_shared;
+
+  int8_t* B_shared = mem_shared + kSmemSizeA;
+  int8_t* zeros_shared = mem_shared + kSmemSizeA + kSmemSizeB;
+  int8_t* scales_i8_shared = mem_shared + kSmemSizeA + kSmemSizeB + kSmemSizeScales;
+
+  int8_t A_shared_warp_[2][WARP_M * WARP_K / WARP_SIZE];
+  int8_t B_shared_warp_[2][WARP_N * WARP_K / WARP_SIZE];
+  constexpr int A_total_global_iters = (CTA_M * CTA_K) / PACK_SIZE / CTA_SIZE;
+  constexpr int B_total_global_iters = (CTA_N * CTA_K) / PACK_SIZE / CTA_SIZE;
+  constexpr int A_src_step_m = (CTA_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int A_warp_step_m = (WARP_SIZE * PACK_SIZE) / CTA_K;
+  constexpr int A_threads_per_row = CTA_K / PACK_SIZE;
+
+  constexpr int B_warps_per_row = CTA_K / 32;
+  constexpr int B_src_step_n = NUM_WARPS / B_warps_per_row;
+
+  int cta_offset_m = blockIdx_m * CTA_M;
+  int cta_offset_n = blockIdx_n * CTA_N;
+  int warp_mn = threadIdx.y % NUM_WARPS_MN;
+  int slice_id = threadIdx.y / NUM_WARPS_MN;
+  int warp_offset_m = (warp_mn % (CTA_M / WARP_M)) * WARP_M;
+  int warp_offset_n = (warp_mn / (CTA_M / WARP_M)) * WARP_N;
+  int warp_offset_k = slice_id * WARP_K;
+
+  for (int i = 0; i < CTA_M * CTA_N / CTA_SIZE_MN; i++)
+    C_warp[i] = 0;
+
+  int gemm_iters = (K + CTA_K - 1) / CTA_K;
+
+  int k_0_0_ld = 0;
+  int k_0_0 = 0;
+  constexpr int prologue_stages = STAGES == 1 ? 1 : STAGES - 1;
+  int A_hoisted_row = threadIdx.y * A_warp_step_m + (threadIdx.x / A_threads_per_row);
+  int A_hoisted_col = (threadIdx.x % A_threads_per_row);
+  int A_hoisted_col_swizzled = A_hoisted_col ^ (A_hoisted_row / 2) & 3;
+
+  int8_t* A_shared_hoisted = A_shared + A_hoisted_row * kSmemPadKA + A_hoisted_col_swizzled * PACK_SIZE;
+  int8_t* B_shared_hoisted = B_shared + (threadIdx.y % B_warps_per_row) * 32 * PACK_SIZE +
+                             (threadIdx.y / B_warps_per_row) * kSmemPadKB * PACK_SIZE + threadIdx.x * PACK_SIZE;
+  int8_t* A_hoisted = A + cta_offset_m * K + A_hoisted_row * K + A_hoisted_col * PACK_SIZE;
+  int8_t* B_hoisted = B + cta_offset_n / 32 * K * PACK_SIZE + (threadIdx.y % B_warps_per_row) * 32 * PACK_SIZE +
+                      (threadIdx.y / B_warps_per_row) * K * PACK_SIZE + threadIdx.x * PACK_SIZE;
+
+  bool A_g2s_preds[A_total_global_iters];
+#pragma unroll
+  for (int i = 0; i < A_total_global_iters; i++) {
+    A_g2s_preds[i] = (cta_offset_m + A_hoisted_row + i * A_src_step_m) < M;
+  }
+
+  int* C_shared = reinterpret_cast<int*>(mem_shared);
+
+#pragma unroll
+  for (k_0_0_ld = 0; k_0_0_ld < prologue_stages; ++k_0_0_ld) {
+    global_to_share_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(
+        A_hoisted,
+        A_shared_hoisted + k_0_0_ld * kSmemSizeAPerStage,
+        K,
+        cta_offset_m,
+        cta_offset_n,
+        k_0_0_ld,
+        0,
+        true,
+        A_g2s_preds);
+    global_to_share_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, 1, STAGES>(
+        B_hoisted, B_shared_hoisted + k_0_0_ld * kSmemSizeBPerStage, K, cta_offset_m, cta_offset_n, k_0_0_ld, 0, true);
+    global_to_share_one_stage_zeros<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+        zeros, zeros_shared + (k_0_0_ld)*CTA_N, N, cta_offset_m, cta_offset_n, k_0_0_ld, 0, k_0_0_ld < gemm_iters);
+    global_to_share_one_stage_zeros<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+        scales_i8,
+        scales_i8_shared + (k_0_0_ld)*CTA_N,
+        N,
+        cta_offset_m,
+        cta_offset_n,
+        k_0_0_ld,
+        0,
+        k_0_0_ld < gemm_iters);
+
+    if constexpr (STAGES > 1) __pipeline_commit();
+  }
+  if constexpr (STAGES > 1) __pipeline_wait_prior(STAGES - 2);
+  __syncthreads();
+
+  share_to_reg_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES>(
+      A_shared + warp_offset_k, A_shared_warp_[0], warp_offset_m, warp_offset_n, 0, WARP_M / INTRIN_M);
+  share_to_reg_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+      B_shared + warp_offset_k * PACK_SIZE,
+      B_shared_warp_[0],
+      zeros_shared,
+      scales_i8_shared,
+      warp_offset_m,
+      warp_offset_n,
+      0,
+      0,
+      WARP_N / 32);
+  constexpr int SHARED_K_ITERS = WARP_K / INTRIN_K;
+
+  for (; k_0_0 < gemm_iters; ++k_0_0, ++k_0_0_ld) {
+    int ld_stage = k_0_0_ld % STAGES;
+    int compute_stage = k_0_0 % STAGES;
+    int8_t* A_shared_this_compute_stage;
+    int8_t* B_shared_this_compute_stage;
+    int8_t* zeros_shared_this_compute_stage;
+    int8_t* scales_i8_shared_this_compute_stage;
+
+    for (int iter_k = 0; iter_k < SHARED_K_ITERS; ++iter_k) {
+      A_shared_this_compute_stage = A_shared + compute_stage * kSmemSizeAPerStage + warp_offset_k;
+      B_shared_this_compute_stage = B_shared + compute_stage * kSmemSizeBPerStage + warp_offset_k * PACK_SIZE;
+      zeros_shared_this_compute_stage = zeros_shared + (compute_stage)*CTA_N;
+      scales_i8_shared_this_compute_stage = scales_i8_shared + (compute_stage)*CTA_N;
+
+      share_to_reg_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES>(
+          A_shared_this_compute_stage,
+          A_shared_warp_[(iter_k + 1) % 2],
+          warp_offset_m,
+          warp_offset_n,
+          (iter_k + 1) % SHARED_K_ITERS,
+          WARP_M / INTRIN_M);
+      share_to_reg_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+          B_shared_this_compute_stage,
+          B_shared_warp_[(iter_k + 1) % 2],
+          zeros_shared_this_compute_stage,
+          scales_i8_shared_this_compute_stage,
+          warp_offset_m,
+          warp_offset_n,
+          k_0_0 + (iter_k == SHARED_K_ITERS - 1),
+          (iter_k + 1) % SHARED_K_ITERS,
+          WARP_N / 32);
+      int8_t* A_shared_warp = A_shared_warp_[iter_k % 2];
+      int8_t* B_shared_warp = B_shared_warp_[iter_k % 2];
+
+      for (int j_0_4 = 0; j_0_4 < WARP_N / INTRIN_N; ++j_0_4) {
+        for (int i_0_3 = 0; i_0_3 < WARP_M / INTRIN_M; ++i_0_3) {
+          mma_m16n8k32(
+              (void*)(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8),
+              (void*)(A_shared_warp + i_0_3 * 16),
+              (void*)(B_shared_warp + j_0_4 * 16));
+          mma_m16n8k32(
+              (void*)(C_warp + i_0_3 * WARP_N / INTRIN_N * 8 + j_0_4 * 8 + 4),
+              (void*)(A_shared_warp + i_0_3 * 16),
+              (void*)(B_shared_warp + j_0_4 * 16 + 8));
+        }
+      }
+
+      if (iter_k < SHARED_K_ITERS - 1) {
+        if constexpr (STAGES == 1) __syncthreads();
+        global_to_share_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            A_hoisted,
+            A_shared_hoisted + ld_stage * kSmemSizeAPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k,
+            k_0_0_ld < gemm_iters,
+            A_g2s_preds);
+        global_to_share_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            B_hoisted,
+            B_shared_hoisted + ld_stage * kSmemSizeBPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k,
+            k_0_0_ld < gemm_iters);
+      }
+
+      if (iter_k == SHARED_K_ITERS - 2) {
+        if constexpr (STAGES == 1 && SHARED_K_ITERS > 2) {
+          __syncthreads();
+        }
+        global_to_share_one_stage_A<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            A_hoisted,
+            A_shared_hoisted + ld_stage * kSmemSizeAPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k + 1,
+            k_0_0_ld < gemm_iters,
+            A_g2s_preds);
+        global_to_share_one_stage_B<CTA_M, CTA_N, CTA_K, CTA_SIZE, WARP_K / INTRIN_K, STAGES>(
+            B_hoisted,
+            B_shared_hoisted + ld_stage * kSmemSizeBPerStage,
+            K,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k + 1,
+            k_0_0_ld < gemm_iters);
+        global_to_share_one_stage_zeros<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+            zeros,
+            zeros_shared + (ld_stage)*CTA_N,
+            N,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k,
+            k_0_0_ld < gemm_iters);
+        global_to_share_one_stage_zeros<CTA_M, CTA_N, CTA_K, CTA_SIZE, STAGES, G>(
+            scales_i8,
+            scales_i8_shared + (ld_stage)*CTA_N,
+            N,
+            cta_offset_m,
+            cta_offset_n,
+            k_0_0_ld,
+            iter_k,
+            k_0_0_ld < gemm_iters);
+        if constexpr (STAGES > 1) {
+          __pipeline_commit();
+          __pipeline_wait_prior(STAGES - 2);
+        }
+        compute_stage = (k_0_0 + 1) % STAGES;
+        __syncthreads();
+      }
+    }
+  }
+  __pipeline_commit();
+  __pipeline_wait_prior(0);
+  __syncthreads();
+
+  if constexpr (SLICES > 1) {
+#pragma unroll
+    for (int z = 0; z < SLICES; ++z) {
+      if (slice_id == z) {
+#pragma unroll
+        for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1) {
+#pragma unroll
+          for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1) {
+#pragma unroll
+            for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; ++local_id) {
+              if (z > 0) {
+                C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id] += C_shared
+                    [warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 +
+                     ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) +
+                     (threadIdx.x % 4) * 2];
+              }
+              C_shared
+                  [warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 +
+                   ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) +
+                   (threadIdx.x % 4) * 2] = C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id];
+            };
+          }
+        }
+      }
+      __syncthreads();
+    }
+    if (slice_id == 0) {
+#pragma unroll
+      for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1) {
+#pragma unroll
+        for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1) {
+#pragma unroll
+          for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; ++local_id) {
+            C_warp[ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8 + local_id] = C_shared
+                [warp_offset_m * CTA_N + ax0_0_1 * OP_M * CTA_N + warp_offset_n + ax1_0_1 * 16 +
+                 ((local_id % 4) / 2 * 8 + (threadIdx.x / 4)) * CTA_N + (local_id / 4) * 8 + (local_id % 2) +
+                 (threadIdx.x % 4) * 2];
+          };
+        }
+      }
+    }
+  }
+
+  int row_wb_thd = cta_offset_m + warp_offset_m + (threadIdx.x / 4);
+  int col_wb_thd = cta_offset_n + warp_offset_n + (threadIdx.x % 4) * 2;
+  if (slice_id == 0) {
+    for (int ax0_0_1 = 0; ax0_0_1 < WARP_M / INTRIN_M; ++ax0_0_1) {
+      int row_wb_1 = row_wb_thd + ax0_0_1 * OP_M;
+      for (int ax1_0_1 = 0; ax1_0_1 < WARP_N / INTRIN_N; ++ax1_0_1) {
+        int col_wb_1 = col_wb_thd + ax1_0_1 * 16;
+        int* C_warp_local = C_warp + ax0_0_1 * WARP_N / INTRIN_N * 8 + ax1_0_1 * 8;
+        for (int local_id = 0; local_id < OP_M * 16 / WARP_SIZE; local_id += 2) {
+          int row_wb = row_wb_1 + (local_id % 4) / 2 * 8;
+          if (row_wb < M) {
+            int col_wb = col_wb_1 + (local_id / 4) * 8 + (local_id % 2);
+            float2 wscale = __half22float2(*(wscales + col_wb / 2));
+            float ascale = __half2float(ascales[row_wb]);
+            float2 psums =
+                make_float2(__int2float_rn(C_warp_local[local_id]), __int2float_rn(C_warp_local[local_id + 1]));
+            psums.x *= wscale.x * ascale;
+            psums.y *= wscale.y * ascale;
+            *reinterpret_cast<half2*>(C + row_wb * N + col_wb) = __float22half2_rn(psums);
+          }
+        };
+      }
+    }
+  }
+}
+#else
+template <int CTA_M, int CTA_N, int CTA_K, int WARP_M, int WARP_N, int WARP_K, int STAGES, int G>
+__global__ void dense_kernel0(
+    int8_t* __restrict__ A,
+    int8_t* __restrict__ B,
+    int8_t* __restrict__ zeros,
+    int8_t* __restrict__ scales_i8,
+    half2* __restrict__ wscales,
+    half* __restrict__ ascales,
+    half* __restrict__ C,
+    int M,
+    int64_t N,
+    int64_t K) {
+  // Not implemented for SM < 800
+  assert(false);
+  return;
+}
+#endif
+
+void qserve_w4a8_per_group_gemm(
+    const torch::Tensor& _in_feats,
+    const torch::Tensor& _kernel,
+    const torch::Tensor& _zeros,
+    const torch::Tensor& _scales_i8,
+    const torch::Tensor& _wscales,
+    const torch::Tensor& _ascales,
+    torch::Tensor& _out_feats) {
+  // Check input tensor
+  TORCH_CHECK(_in_feats.is_cuda(), "_in_feats must be a CUDA tensor");
+  TORCH_CHECK(_in_feats.dim() == 2, "_in_feats must be a 2D tensor");
+  TORCH_CHECK(_in_feats.is_contiguous(), "_in_feats must be contiguous");
+  TORCH_CHECK(_in_feats.scalar_type() == torch::kInt8, "_in_feats must be int8");
+  // Check kernel tensor
+  TORCH_CHECK(_kernel.is_cuda(), "_kernel must be a CUDA tensor");
+  TORCH_CHECK(_kernel.dim() == 2, "_kernel must be a 2D tensor");
+  TORCH_CHECK(_kernel.is_contiguous(), "_kernel must be contiguous");
+  TORCH_CHECK(_kernel.scalar_type() == torch::kInt8, "_kernel must be int8");
+  // Check output tensor
+  TORCH_CHECK(_out_feats.is_cuda(), "_out_feats must be a CUDA tensor");
+  TORCH_CHECK(_out_feats.is_contiguous(), "_out_feats must be contiguous");
+  TORCH_CHECK(_out_feats.scalar_type() == torch::kHalf, "_out_feats must be half");
+
+  int num_in_feats = _in_feats.size(0);
+  int num_in_channels = _in_feats.size(1);
+  int num_out_feats = _out_feats.size(-2);
+  int num_out_channels = _out_feats.size(-1);
+
+  // Check matmul shape
+  TORCH_CHECK(num_out_channels == _kernel.size(0), "num_out_channels must be equal to _kernel.size(0)");
+  TORCH_CHECK(num_in_feats == num_out_feats, "num_in_feats must be equal to num_out_feats");
+
+  // Check _ascales
+  TORCH_CHECK(_ascales.is_cuda(), "_ascales must be a CUDA tensor");
+  TORCH_CHECK(_ascales.is_contiguous(), "_ascales must be contiguous");
+  TORCH_CHECK(_ascales.scalar_type() == torch::kHalf, "_ascales must be half");
+  TORCH_CHECK(_ascales.numel() == num_in_feats, "_ascales must have num_in_feats elements");
+
+  // Check _wscales
+  TORCH_CHECK(_wscales.is_cuda(), "_wscales must be a CUDA tensor");
+  TORCH_CHECK(_wscales.is_contiguous(), "_wscales must be contiguous");
+  TORCH_CHECK(_wscales.scalar_type() == torch::kHalf, "_wscales must be half");
+  TORCH_CHECK(_wscales.numel() == num_out_channels, "_wscales must have num_out_channels elements");
+
+  // Check _scales_i8
+  TORCH_CHECK(_scales_i8.is_cuda(), "_scales_i8 must be a CUDA tensor");
+  TORCH_CHECK(_scales_i8.dim() == 2, "_scales_i8 must be a 2D tensor");
+  TORCH_CHECK(_scales_i8.is_contiguous(), "_scales_i8 must be contiguous");
+  TORCH_CHECK(_scales_i8.scalar_type() == torch::kInt8, "_scales_i8 must be int8");
+  TORCH_CHECK(num_in_channels % _scales_i8.size(0) == 0, "num_in_channels must be divisible by _scales_i8.size(0)");
+  TORCH_CHECK(num_out_channels == _scales_i8.size(1), "num_out_channels must be equal to _scales_i8.size(1)");
+
+  // Check _zeros
+  TORCH_CHECK(_zeros.is_cuda(), "_zeros must be a CUDA tensor");
+  TORCH_CHECK(_zeros.dim() == 2, "_zeros must be a 2D tensor");
+  TORCH_CHECK(_zeros.is_contiguous(), "_zeros must be contiguous");
+  TORCH_CHECK(_zeros.scalar_type() == torch::kInt8, "_zeros must be int8");
+  TORCH_CHECK(num_in_channels % _zeros.size(0) == 0, "num_in_channels must be divisible by _zeros.size(0)");
+  TORCH_CHECK(num_out_channels == _zeros.size(1), "num_out_channels must be equal to _zeros.size(1)");
+
+  // Check group size
+  auto group_size = num_in_channels / _scales_i8.size(0);
+  TORCH_CHECK(group_size == 128, "group_size must be 128");
+
+  auto in_feats = reinterpret_cast<int8_t*>(_in_feats.data_ptr<int8_t>());
+  auto kernel = reinterpret_cast<int8_t*>(_kernel.data_ptr<int8_t>());
+  auto zeros = reinterpret_cast<int8_t*>(_zeros.data_ptr<int8_t>());
+  auto scales_i8 = reinterpret_cast<int8_t*>(_scales_i8.data_ptr<int8_t>());
+  auto wscales = reinterpret_cast<half2*>(_wscales.data_ptr());
+  auto ascales = reinterpret_cast<half*>(_ascales.data_ptr());
+  // auto options =
+  //     torch::TensorOptions().dtype(torch::kHalf).device(_in_feats.device());
+  auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr<at::Half>());
+  auto stream = at::cuda::getCurrentCUDAStream(_in_feats.get_device());
+  auto sm_version = getSMVersion();
+  if (sm_version >= 80) {
+    constexpr int G = 128;
+
+    if (num_out_feats > 128) {
+      constexpr int CTA_M = 128;
+      constexpr int CTA_N = 64;
+      constexpr int CTA_K = 64;
+      constexpr int WARP_M = 64;
+      constexpr int WARP_N = 32;
+      constexpr int WARP_K = 64;
+      constexpr int STAGES = 4;
+      KERNEL_LAUNCH_CODE
+    } else if (num_out_feats >= 128) {
+      if (num_in_channels <= 4096) {
+        constexpr int CTA_M = 64;
+        constexpr int CTA_N = 64;
+        constexpr int CTA_K = 64;
+        constexpr int WARP_M = 32;
+        constexpr int WARP_N = 32;
+        constexpr int WARP_K = 64;
+        constexpr int STAGES = 4;
+        KERNEL_LAUNCH_CODE
+      } else {
+        constexpr int CTA_M = 64;
+        constexpr int CTA_N = 64;
+        constexpr int CTA_K = 128;
+        constexpr int WARP_M = 32;
+        constexpr int WARP_N = 32;
+        constexpr int WARP_K = 64;
+        constexpr int STAGES = 3;
+        KERNEL_LAUNCH_CODE
+      }
+    } else {
+      constexpr int CTA_M = 32;
+      constexpr int CTA_N = 64;
+      constexpr int CTA_K = 128;
+      constexpr int WARP_M = 32;
+      constexpr int WARP_N = 32;
+      constexpr int WARP_K = 64;
+      constexpr int STAGES = 3;
+      KERNEL_LAUNCH_CODE
+    }
+  } else {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false, "No implemented qserve_w4a8_per_group_gemm for current compute capability: ", sm_version);
+  }
+  return;
+}
diff --git a/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.cu b/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.cu
new file mode 100644
index 000000000..84b678551
--- /dev/null
+++ b/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.cu
@@ -0,0 +1,271 @@
+// Adapted from
+// https://github.com/mlc-ai/xgrammar/blob/v0.1.18/python/xgrammar/kernels/apply_token_bitmask_inplace_cuda.cu
+
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+#if !defined(USE_ROCM) && (!defined(CUDA_VERSION) || CUDA_VERSION < 12040)
+void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt) {
+  TORCH_CHECK(false, "CUDA version must be >= 12.4 for ApplyTokenBitmaskInplace");
+}
+#else
+
+#ifndef CUDART_INF_FP16
+#ifndef USE_ROCM
+#define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U)
+#endif
+#endif
+
+#ifndef CUDART_INF_BF16
+#ifndef USE_ROCM
+#define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
+#endif
+#endif
+
+constexpr int32_t BITS_PER_BLOCK = 32;
+constexpr int32_t THREADS_PER_THREAD_BLOCK = 256;
+
+template <typename T>
+__device__ T NegativeInfinity() {
+  return -INFINITY;
+}
+
+template <>
+__device__ __half NegativeInfinity<__half>() {
+#ifdef USE_ROCM
+  return __float2half(-INFINITY);
+#else
+  return -CUDART_INF_FP16;
+#endif
+}
+
+template <>
+__device__ __nv_bfloat16 NegativeInfinity<__nv_bfloat16>() {
+#ifdef USE_ROCM
+  return __nv_bfloat16(-INFINITY);
+#else
+  return -CUDART_INF_BF16;
+#endif
+}
+
+template <typename T, typename PackedT>
+__device__ PackedT PackedNegativeInfinity() {
+  constexpr int kAlignment = sizeof(PackedT) / sizeof(T);
+  T packed[kAlignment];
+#pragma unroll
+  for (int i = 0; i < kAlignment; i++) {
+    packed[i] = NegativeInfinity<T>();
+  }
+  return *reinterpret_cast<PackedT*>(packed);
+}
+
+template <typename T, typename PackedT, int32_t kBitsPerThread>
+__global__ void __launch_bounds__(THREADS_PER_THREAD_BLOCK) LogitsBitmaskKernel(
+    T* __restrict__ logits,
+    const int32_t* __restrict__ bitmask,
+    const int32_t* __restrict__ indices,
+    int32_t vocab_size,
+    int32_t logits_stride,
+    int32_t bitmask_stride) {
+  constexpr int kAlignment = sizeof(PackedT) / sizeof(T);
+  constexpr uint32_t kPackedMask = (1 << kAlignment) - 1;
+
+  const int batch_idx = (indices == nullptr) ? blockIdx.y : indices[blockIdx.y];
+
+  const int block_offset = blockIdx.x * THREADS_PER_THREAD_BLOCK * kBitsPerThread;
+  T* logits_gmem_ptr = logits + batch_idx * logits_stride + block_offset;
+  const int32_t* bitmask_gmem_ptr = bitmask + batch_idx * bitmask_stride + block_offset / BITS_PER_BLOCK;
+  const int bitmask_inner_idx = threadIdx.x % (BITS_PER_BLOCK / kAlignment);
+  T logits_reg[kAlignment];
+
+#pragma unroll
+  for (int offset = threadIdx.x * kAlignment; offset < THREADS_PER_THREAD_BLOCK * kBitsPerThread;
+       offset += THREADS_PER_THREAD_BLOCK * kAlignment) {
+    if (block_offset + offset >= vocab_size) {
+      break;
+    }
+
+    const uint32_t bitmask_val =
+        (~bitmask_gmem_ptr[offset / BITS_PER_BLOCK] >> (bitmask_inner_idx * kAlignment)) & kPackedMask;
+
+    if (bitmask_val == 0) {
+      continue;
+    }
+
+    if (bitmask_val == kPackedMask) {
+      *reinterpret_cast<PackedT*>(logits_gmem_ptr + offset) = PackedNegativeInfinity<T, PackedT>();
+      continue;
+    }
+
+    *reinterpret_cast<PackedT*>(logits_reg) = *reinterpret_cast<PackedT*>(logits_gmem_ptr + offset);
+#pragma unroll
+    for (int i = 0; i < kAlignment; i++) {
+      if (((bitmask_val >> i) & 1)) {
+        logits_reg[i] = NegativeInfinity<T>();
+      }
+    }
+    *reinterpret_cast<PackedT*>(logits_gmem_ptr + offset) = *reinterpret_cast<PackedT*>(logits_reg);
+  }
+}
+
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+constexpr auto CeilDiv(T numerator, T denominator) {
+  return (numerator + denominator - 1) / denominator;
+}
+
+template <typename T, typename PackedT>
+void ApplyTokenBitmaskInplaceDispatchToBitsPerThread(
+    T* __restrict__ logits,
+    const int32_t* __restrict__ bitmask,
+    const int32_t* __restrict__ indices,
+    int32_t vocab_size,
+    int32_t logits_stride,
+    int32_t bitmask_stride,
+    int32_t num_rows) {
+  constexpr int kAlignment = sizeof(PackedT) / sizeof(T);
+  const int32_t num_blocks_per_row = CeilDiv(2048 / THREADS_PER_THREAD_BLOCK * 128, num_rows);
+  const int32_t num_bits_per_thread = CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * num_blocks_per_row);
+
+  const dim3 block(THREADS_PER_THREAD_BLOCK);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+
+  if (num_bits_per_thread <= 4 && kAlignment <= 4) {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 4), num_rows);
+    LogitsBitmaskKernel<T, PackedT, 4>
+        <<<grid, block, 0, stream>>>(logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  } else if (num_bits_per_thread <= 8 && kAlignment <= 8) {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 8), num_rows);
+    LogitsBitmaskKernel<T, PackedT, 8>
+        <<<grid, block, 0, stream>>>(logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  } else if (num_bits_per_thread <= 16 && kAlignment <= 16) {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 16), num_rows);
+    LogitsBitmaskKernel<T, PackedT, 16>
+        <<<grid, block, 0, stream>>>(logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  } else {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 32), num_rows);
+    LogitsBitmaskKernel<T, PackedT, 32>
+        <<<grid, block, 0, stream>>>(logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  }
+}
+
+template <typename T>
+void ApplyTokenBitmaskInplaceDispatchToPackedT(
+    T* __restrict__ logits,
+    const int32_t* __restrict__ bitmask,
+    const int32_t* __restrict__ indices,
+    int32_t vocab_size,
+    int32_t logits_stride,
+    int32_t bitmask_stride,
+    int32_t num_rows) {
+  if (logits_stride % (sizeof(float4) / sizeof(T)) == 0) {
+    ApplyTokenBitmaskInplaceDispatchToBitsPerThread<T, float4>(
+        logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride, num_rows);
+  } else {
+    ApplyTokenBitmaskInplaceDispatchToBitsPerThread<T, T>(
+        logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride, num_rows);
+  }
+}
+
+void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt) {
+  TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor.");
+  TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous.");
+  TORCH_CHECK(logits.dim() == 1 || logits.dim() == 2, "logits must be a 1D or 2D tensor.");
+  std::pair<int32_t, int32_t> logits_shape =
+      logits.dim() == 2 ? std::make_pair(static_cast<int32_t>(logits.size(0)), static_cast<int32_t>(logits.size(1)))
+                        : std::make_pair(1, static_cast<int32_t>(logits.size(0)));
+
+  TORCH_CHECK(bitmask.is_cuda(), "bitmask must be a CUDA tensor.");
+  TORCH_CHECK(bitmask.is_contiguous(), "bitmask must be contiguous.");
+  TORCH_CHECK(bitmask.dim() == 1 || bitmask.dim() == 2, "bitmask must be a 1D or 2D tensor.");
+  std::pair<int32_t, int32_t> bitmask_shape =
+      bitmask.dim() == 2 ? std::make_pair(static_cast<int32_t>(bitmask.size(0)), static_cast<int32_t>(bitmask.size(1)))
+                         : std::make_pair(1, static_cast<int32_t>(bitmask.size(0)));
+
+  TORCH_CHECK(bitmask.dtype() == torch::kInt32, "bitmask must be of type int32.");
+
+  TORCH_CHECK(
+      (logits_shape.second + BITS_PER_BLOCK - 1) / BITS_PER_BLOCK >= bitmask_shape.second,
+      "The provided logits's vocab size should be no less than the bitmask's vocab size "
+      "(converted from bitmask size). But got vocab size ",
+      logits_shape.second,
+      " vs bitmask size ",
+      bitmask_shape.second);
+
+  int vocab_size = std::min(logits_shape.second, bitmask_shape.second * BITS_PER_BLOCK);
+
+  int32_t num_rows = logits_shape.first;
+  int32_t* indices_ptr = nullptr;
+  if (indices) {
+    TORCH_CHECK(indices->is_cuda(), "indices must be a CUDA tensor.");
+    TORCH_CHECK(indices->is_contiguous(), "indices must be contiguous.");
+    TORCH_CHECK(indices->dim() == 1, "indices must be a 1D tensor.");
+    TORCH_CHECK(indices->dtype() == torch::kInt32, "indices must be of type int32.");
+    num_rows = indices->size(0);
+    indices_ptr = indices->data_ptr<int32_t>();
+  } else {
+    TORCH_CHECK(logits_shape.first == bitmask_shape.first, "logits and bitmask must have the same batch size.");
+  }
+
+  switch (logits.scalar_type()) {
+    case torch::kFloat32: {
+      ApplyTokenBitmaskInplaceDispatchToPackedT(
+          logits.data_ptr<float>(),
+          bitmask.data_ptr<int32_t>(),
+          indices_ptr,
+          vocab_size,
+          logits_shape.second,
+          bitmask_shape.second,
+          num_rows);
+      break;
+    }
+    case torch::kFloat16: {
+      ApplyTokenBitmaskInplaceDispatchToPackedT(
+          reinterpret_cast<__half*>(logits.data_ptr<torch::Half>()),
+          bitmask.data_ptr<int32_t>(),
+          indices_ptr,
+          vocab_size,
+          logits_shape.second,
+          bitmask_shape.second,
+          num_rows);
+      break;
+    }
+    case torch::kBFloat16: {
+      ApplyTokenBitmaskInplaceDispatchToPackedT(
+          reinterpret_cast<__nv_bfloat16*>(logits.data_ptr<torch::BFloat16>()),
+          bitmask.data_ptr<int32_t>(),
+          indices_ptr,
+          vocab_size,
+          logits_shape.second,
+          bitmask_shape.second,
+          num_rows);
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "logits dtype must be float, half or bfloat16.");
+      break;
+  }
+}
+#endif
+// clang-format on
diff --git a/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.hip b/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.hip
new file mode 100644
index 000000000..3b923bf39
--- /dev/null
+++ b/sgl-kernel/csrc/grammar/apply_token_bitmask_inplace_cuda.hip
@@ -0,0 +1,273 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+// Adapted from
+// https://github.com/mlc-ai/xgrammar/blob/v0.1.18/python/xgrammar/kernels/apply_token_bitmask_inplace_cuda.cu
+
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+#include <torch/all.h>
+#include <ATen/hip/HIPContext.h>
+
+
+#if !defined(USE_ROCM) && (!defined(DTK_VERSION) || DTK_VERSION < 12040)
+void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt) {
+  TORCH_CHECK(false, "CUDA version must be >= 12.4 for ApplyTokenBitmaskInplace");
+}
+#else
+
+#ifndef CUDART_INF_FP16
+#ifndef USE_ROCM
+#define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U)
+#endif
+#endif
+
+#ifndef CUDART_INF_BF16
+#ifndef USE_ROCM
+#define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
+#endif
+#endif
+
+constexpr int32_t BITS_PER_BLOCK = 32;
+constexpr int32_t THREADS_PER_THREAD_BLOCK = 256;
+
+template <typename T>
+__device__ T NegativeInfinity() {
+  return -INFINITY;
+}
+
+template <>
+__device__ __half NegativeInfinity<__half>() {
+#ifdef USE_ROCM
+  return __float2half(-INFINITY);
+#else
+  return -CUDART_INF_FP16;
+#endif
+}
+
+template <>
+__device__ __hip_bfloat16 NegativeInfinity<__hip_bfloat16>() {
+#ifdef USE_ROCM
+  return __hip_bfloat16(-INFINITY);
+#else
+  return -CUDART_INF_BF16;
+#endif
+}
+
+template <typename T, typename PackedT>
+__device__ PackedT PackedNegativeInfinity() {
+  constexpr int kAlignment = sizeof(PackedT) / sizeof(T);
+  T packed[kAlignment];
+#pragma unroll
+  for (int i = 0; i < kAlignment; i++) {
+    packed[i] = NegativeInfinity<T>();
+  }
+  return *reinterpret_cast<PackedT*>(packed);
+}
+
+template <typename T, typename PackedT, int32_t kBitsPerThread>
+__global__ void __launch_bounds__(THREADS_PER_THREAD_BLOCK) LogitsBitmaskKernel(
+    T* __restrict__ logits,
+    const int32_t* __restrict__ bitmask,
+    const int32_t* __restrict__ indices,
+    int32_t vocab_size,
+    int32_t logits_stride,
+    int32_t bitmask_stride) {
+  constexpr int kAlignment = sizeof(PackedT) / sizeof(T);
+  constexpr uint32_t kPackedMask = (1 << kAlignment) - 1;
+
+  const int batch_idx = (indices == nullptr) ? blockIdx.y : indices[blockIdx.y];
+
+  const int block_offset = blockIdx.x * THREADS_PER_THREAD_BLOCK * kBitsPerThread;
+  T* logits_gmem_ptr = logits + batch_idx * logits_stride + block_offset;
+  const int32_t* bitmask_gmem_ptr = bitmask + batch_idx * bitmask_stride + block_offset / BITS_PER_BLOCK;
+  const int bitmask_inner_idx = threadIdx.x % (BITS_PER_BLOCK / kAlignment);
+  T logits_reg[kAlignment];
+
+#pragma unroll
+  for (int offset = threadIdx.x * kAlignment; offset < THREADS_PER_THREAD_BLOCK * kBitsPerThread;
+       offset += THREADS_PER_THREAD_BLOCK * kAlignment) {
+    if (block_offset + offset >= vocab_size) {
+      break;
+    }
+
+    const uint32_t bitmask_val =
+        (~bitmask_gmem_ptr[offset / BITS_PER_BLOCK] >> (bitmask_inner_idx * kAlignment)) & kPackedMask;
+
+    if (bitmask_val == 0) {
+      continue;
+    }
+
+    if (bitmask_val == kPackedMask) {
+      *reinterpret_cast<PackedT*>(logits_gmem_ptr + offset) = PackedNegativeInfinity<T, PackedT>();
+      continue;
+    }
+
+    *reinterpret_cast<PackedT*>(logits_reg) = *reinterpret_cast<PackedT*>(logits_gmem_ptr + offset);
+#pragma unroll
+    for (int i = 0; i < kAlignment; i++) {
+      if (((bitmask_val >> i) & 1)) {
+        logits_reg[i] = NegativeInfinity<T>();
+      }
+    }
+    *reinterpret_cast<PackedT*>(logits_gmem_ptr + offset) = *reinterpret_cast<PackedT*>(logits_reg);
+  }
+}
+
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+constexpr auto CeilDiv(T numerator, T denominator) {
+  return (numerator + denominator - 1) / denominator;
+}
+
+template <typename T, typename PackedT>
+void ApplyTokenBitmaskInplaceDispatchToBitsPerThread(
+    T* __restrict__ logits,
+    const int32_t* __restrict__ bitmask,
+    const int32_t* __restrict__ indices,
+    int32_t vocab_size,
+    int32_t logits_stride,
+    int32_t bitmask_stride,
+    int32_t num_rows) {
+  constexpr int kAlignment = sizeof(PackedT) / sizeof(T);
+  const int32_t num_blocks_per_row = CeilDiv(2048 / THREADS_PER_THREAD_BLOCK * 128, num_rows);
+  const int32_t num_bits_per_thread = CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * num_blocks_per_row);
+
+  const dim3 block(THREADS_PER_THREAD_BLOCK);
+  hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA().stream();
+
+  if (num_bits_per_thread <= 4 && kAlignment <= 4) {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 4), num_rows);
+   hipLaunchKernelGGL(( LogitsBitmaskKernel<T, PackedT, 4>)
+        , dim3(grid), dim3(block), 0, stream, logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  } else if (num_bits_per_thread <= 8 && kAlignment <= 8) {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 8), num_rows);
+   hipLaunchKernelGGL(( LogitsBitmaskKernel<T, PackedT, 8>)
+        , dim3(grid), dim3(block), 0, stream, logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  } else if (num_bits_per_thread <= 16 && kAlignment <= 16) {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 16), num_rows);
+   hipLaunchKernelGGL(( LogitsBitmaskKernel<T, PackedT, 16>)
+        , dim3(grid), dim3(block), 0, stream, logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  } else {
+    const dim3 grid(CeilDiv(vocab_size, THREADS_PER_THREAD_BLOCK * 32), num_rows);
+   hipLaunchKernelGGL(( LogitsBitmaskKernel<T, PackedT, 32>)
+        , dim3(grid), dim3(block), 0, stream, logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride);
+  }
+}
+
+template <typename T>
+void ApplyTokenBitmaskInplaceDispatchToPackedT(
+    T* __restrict__ logits,
+    const int32_t* __restrict__ bitmask,
+    const int32_t* __restrict__ indices,
+    int32_t vocab_size,
+    int32_t logits_stride,
+    int32_t bitmask_stride,
+    int32_t num_rows) {
+  if (logits_stride % (sizeof(float4) / sizeof(T)) == 0) {
+    ApplyTokenBitmaskInplaceDispatchToBitsPerThread<T, float4>(
+        logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride, num_rows);
+  } else {
+    ApplyTokenBitmaskInplaceDispatchToBitsPerThread<T, T>(
+        logits, bitmask, indices, vocab_size, logits_stride, bitmask_stride, num_rows);
+  }
+}
+
+void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt) {
+  TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor.");
+  TORCH_CHECK(logits.is_contiguous(), "logits must be contiguous.");
+  TORCH_CHECK(logits.dim() == 1 || logits.dim() == 2, "logits must be a 1D or 2D tensor.");
+  std::pair<int32_t, int32_t> logits_shape =
+      logits.dim() == 2 ? std::make_pair(static_cast<int32_t>(logits.size(0)), static_cast<int32_t>(logits.size(1)))
+                        : std::make_pair(1, static_cast<int32_t>(logits.size(0)));
+
+  TORCH_CHECK(bitmask.is_cuda(), "bitmask must be a CUDA tensor.");
+  TORCH_CHECK(bitmask.is_contiguous(), "bitmask must be contiguous.");
+  TORCH_CHECK(bitmask.dim() == 1 || bitmask.dim() == 2, "bitmask must be a 1D or 2D tensor.");
+  std::pair<int32_t, int32_t> bitmask_shape =
+      bitmask.dim() == 2 ? std::make_pair(static_cast<int32_t>(bitmask.size(0)), static_cast<int32_t>(bitmask.size(1)))
+                         : std::make_pair(1, static_cast<int32_t>(bitmask.size(0)));
+
+  TORCH_CHECK(bitmask.dtype() == torch::kInt32, "bitmask must be of type int32.");
+
+  TORCH_CHECK(
+      (logits_shape.second + BITS_PER_BLOCK - 1) / BITS_PER_BLOCK >= bitmask_shape.second,
+      "The provided logits's vocab size should be no less than the bitmask's vocab size "
+      "(converted from bitmask size). But got vocab size ",
+      logits_shape.second,
+      " vs bitmask size ",
+      bitmask_shape.second);
+
+  int vocab_size = ::min(logits_shape.second, bitmask_shape.second * BITS_PER_BLOCK);
+
+  int32_t num_rows = logits_shape.first;
+  int32_t* indices_ptr = nullptr;
+  if (indices) {
+    TORCH_CHECK(indices->is_cuda(), "indices must be a CUDA tensor.");
+    TORCH_CHECK(indices->is_contiguous(), "indices must be contiguous.");
+    TORCH_CHECK(indices->dim() == 1, "indices must be a 1D tensor.");
+    TORCH_CHECK(indices->dtype() == torch::kInt32, "indices must be of type int32.");
+    num_rows = indices->size(0);
+    indices_ptr = indices->data_ptr<int32_t>();
+  } else {
+    TORCH_CHECK(logits_shape.first == bitmask_shape.first, "logits and bitmask must have the same batch size.");
+  }
+
+  switch (logits.scalar_type()) {
+    case torch::kFloat32: {
+      ApplyTokenBitmaskInplaceDispatchToPackedT(
+          logits.data_ptr<float>(),
+          bitmask.data_ptr<int32_t>(),
+          indices_ptr,
+          vocab_size,
+          logits_shape.second,
+          bitmask_shape.second,
+          num_rows);
+      break;
+    }
+    case torch::kFloat16: {
+      ApplyTokenBitmaskInplaceDispatchToPackedT(
+          reinterpret_cast<__half*>(logits.data_ptr<torch::Half>()),
+          bitmask.data_ptr<int32_t>(),
+          indices_ptr,
+          vocab_size,
+          logits_shape.second,
+          bitmask_shape.second,
+          num_rows);
+      break;
+    }
+    case torch::kBFloat16: {
+      ApplyTokenBitmaskInplaceDispatchToPackedT(
+          reinterpret_cast<__hip_bfloat16*>(logits.data_ptr<torch::BFloat16>()),
+          bitmask.data_ptr<int32_t>(),
+          indices_ptr,
+          vocab_size,
+          logits_shape.second,
+          bitmask_shape.second,
+          num_rows);
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "logits dtype must be float, half or bfloat16.");
+      break;
+  }
+}
+#endif
+// clang-format on
diff --git a/sgl-kernel/csrc/kvcacheio/transfer.cu b/sgl-kernel/csrc/kvcacheio/transfer.cu
new file mode 100644
index 000000000..bca9f326c
--- /dev/null
+++ b/sgl-kernel/csrc/kvcacheio/transfer.cu
@@ -0,0 +1,573 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/util/irange.h>
+
+#include <cstdint>
+
+#ifndef USE_ROCM
+#define WARP_SIZE 32
+#include "pytorch_extension_utils.h"
+#else
+#include "pytorch_extension_utils_rocm.h"
+#include "utils.h"  // WARP_SIZE
+#endif
+
+__device__ __forceinline__ void
+transfer_item_warp(int32_t lane_id, const void* src_addr, void* dst_addr, int64_t item_size_bytes) {
+  const uint64_t* __restrict__ src = static_cast<const uint64_t*>(src_addr);
+  uint64_t* __restrict__ dst = static_cast<uint64_t*>(dst_addr);
+  const int total_chunks = item_size_bytes / sizeof(uint64_t);
+
+#pragma unroll
+  for (int j = lane_id; j < total_chunks; j += WARP_SIZE) {
+#ifndef USE_ROCM
+    uint64_t tmp;
+    asm volatile("ld.global.nc.b64 %0,[%1];" : "=l"(tmp) : "l"(src + j) : "memory");
+    asm volatile("st.global.cg.b64 [%0],%1;" ::"l"(dst + j), "l"(tmp) : "memory");
+
+#else
+    uint64_t tmp = __builtin_nontemporal_load(src + j);
+    __builtin_nontemporal_store(tmp, dst + j);
+#endif
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ T* get_global_offset_lf(
+    T* base,
+    const uintptr_t* __restrict__ /*unused*/,
+    int64_t layer_id,
+    int64_t layer_dim,
+    int64_t page_id,
+    int64_t item_size_bytes) {
+  // layer first
+  return base + layer_id * layer_dim + page_id * item_size_bytes;
+}
+
+template <typename T>
+__device__ __forceinline__ T* get_global_offset_pf(
+    T* base,
+    const uintptr_t* __restrict__ /*unused*/,
+    int64_t layer_id,
+    int64_t page_dim,
+    int64_t page_id,
+    int64_t item_size_bytes) {
+  // page first
+  return base + page_id * page_dim + layer_id * item_size_bytes;
+}
+
+// get offset from layer base table when layers are not contiguous
+template <typename T>
+__device__ __forceinline__ T* get_global_offset_lf_tbl(
+    T* /*unused*/,
+    const uintptr_t* __restrict__ layer_base_tbl,
+    int64_t layer_id,
+    int64_t /*unused*/,
+    int64_t page_id,
+    int64_t item_size_bytes) {
+  return reinterpret_cast<T*>(layer_base_tbl[layer_id]) + page_id * item_size_bytes;
+}
+
+template <auto SrcOffsetFn, auto DstOffsetFn, bool IsMLA>
+__global__ void transfer_kernel_impl(
+    const void* __restrict__ src_k,
+    void* __restrict__ dst_k,
+    const void* __restrict__ src_v,
+    void* __restrict__ dst_v,
+    const int64_t* __restrict__ src_indices,
+    const int64_t* __restrict__ dst_indices,
+    int64_t start_layer_id,
+    int64_t num_layers_to_process,
+    int64_t num_items,
+    int64_t items_per_warp,
+    int64_t item_size_bytes,
+    int64_t src_layout_dim,
+    int64_t dst_layout_dim,
+    const uintptr_t* __restrict__ src_k_layer_tbl,
+    const uintptr_t* __restrict__ dst_k_layer_tbl,
+    const uintptr_t* __restrict__ src_v_layer_tbl,
+    const uintptr_t* __restrict__ dst_v_layer_tbl) {
+  int32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_t lane_id = tid % WARP_SIZE;
+  int32_t warp_id = tid / WARP_SIZE;
+
+  for (int i = 0; i < items_per_warp; ++i) {
+    int64_t item_id = warp_id * items_per_warp + i;
+    if (item_id >= num_items) {
+      break;
+    }
+    const int64_t src_page_id = src_indices[item_id];
+    const int64_t dst_page_id = dst_indices[item_id];
+
+    // Loop over layers if necessary
+    for (int64_t layer_id = start_layer_id; layer_id < start_layer_id + num_layers_to_process; ++layer_id) {
+      const char* src_ptr = SrcOffsetFn(
+          static_cast<const char*>(src_k), src_k_layer_tbl, layer_id, src_layout_dim, src_page_id, item_size_bytes);
+      char* dst_ptr = DstOffsetFn(
+          static_cast<char*>(dst_k), dst_k_layer_tbl, layer_id, dst_layout_dim, dst_page_id, item_size_bytes);
+      transfer_item_warp(lane_id, src_ptr, dst_ptr, item_size_bytes);
+
+      if constexpr (!IsMLA) {
+        const char* src_v_ptr = SrcOffsetFn(
+            static_cast<const char*>(src_v), src_v_layer_tbl, layer_id, src_layout_dim, src_page_id, item_size_bytes);
+        char* dst_v_ptr = DstOffsetFn(
+            static_cast<char*>(dst_v), dst_v_layer_tbl, layer_id, dst_layout_dim, dst_page_id, item_size_bytes);
+        transfer_item_warp(lane_id, src_v_ptr, dst_v_ptr, item_size_bytes);
+      }
+    }
+  }
+}
+
+template <auto SrcOffsetFn, auto DstOffsetFn, bool IsMLA>
+void transfer_kv_launcher(
+    const at::Tensor& src_k,
+    at::Tensor& dst_k,
+    const at::Tensor& src_v,
+    at::Tensor& dst_v,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t start_layer_id,
+    int64_t num_layers_to_process,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t dst_layout_dim,
+    const at::Tensor& src_k_layers,
+    const at::Tensor& dst_k_layers,
+    const at::Tensor& src_v_layers,
+    const at::Tensor& dst_v_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(src_indices.is_cuda(), "Source indices must be a CUDA tensor");
+  TORCH_CHECK(dst_indices.is_cuda(), "Destination indices must be a CUDA tensor");
+  TORCH_CHECK(src_indices.scalar_type() == at::kLong, "Source indices must be of type long");
+  TORCH_CHECK(dst_indices.scalar_type() == at::kLong, "Destination indices must be of type long");
+  TORCH_CHECK(src_indices.numel() == dst_indices.numel(), "Source and destination indices must have the same length");
+  TORCH_CHECK(item_size % 8 == 0, "Item byte size must be divisible by 8");
+
+  auto div_up = [](int64_t x, int64_t y) { return (x + y - 1) / y; };
+  const int64_t num_items = src_indices.numel();
+  const int64_t items_per_warp = div_up(num_items, block_quota * num_warps_per_block);
+  const int32_t num_blocks = div_up(num_items, items_per_warp * num_warps_per_block);
+  dim3 grid_dim(num_blocks, 1, 1);
+  const int32_t threads_per_block = num_warps_per_block * WARP_SIZE;
+
+  const void* src_k_ptr = src_k.defined() ? src_k.data_ptr() : nullptr;
+  void* dst_k_ptr = dst_k.defined() ? dst_k.data_ptr() : nullptr;
+  const void* src_v_ptr = IsMLA || !src_v.defined() ? nullptr : src_v.data_ptr();
+  void* dst_v_ptr = IsMLA || !dst_v.defined() ? nullptr : dst_v.data_ptr();
+  const uintptr_t* src_k_tbl_ptr = src_k_layers.defined() ? src_k_layers.data_ptr<uintptr_t>() : nullptr;
+  const uintptr_t* dst_k_tbl_ptr = dst_k_layers.defined() ? dst_k_layers.data_ptr<uintptr_t>() : nullptr;
+  const uintptr_t* src_v_tbl_ptr = IsMLA || !src_v_layers.defined() ? nullptr : src_v_layers.data_ptr<uintptr_t>();
+  const uintptr_t* dst_v_tbl_ptr = IsMLA || !dst_v_layers.defined() ? nullptr : dst_v_layers.data_ptr<uintptr_t>();
+
+  cudaStream_t torch_current_stream = at::cuda::getCurrentCUDAStream();
+  transfer_kernel_impl<SrcOffsetFn, DstOffsetFn, IsMLA><<<grid_dim, threads_per_block, 0, torch_current_stream>>>(
+      src_k_ptr,
+      dst_k_ptr,
+      src_v_ptr,
+      dst_v_ptr,
+      src_indices.data_ptr<int64_t>(),
+      dst_indices.data_ptr<int64_t>(),
+      start_layer_id,
+      num_layers_to_process,
+      num_items,
+      items_per_warp,
+      item_size,
+      src_layout_dim,
+      dst_layout_dim,
+      src_k_tbl_ptr,
+      dst_k_tbl_ptr,
+      src_v_tbl_ptr,
+      dst_v_tbl_ptr);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+void transfer_kv_per_layer(
+    const at::Tensor src_k,
+    at::Tensor dst_k,
+    const at::Tensor src_v,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf<const char>, get_global_offset_lf<char>, false>(
+      src_k,
+      dst_k,
+      src_v,
+      dst_v,
+      src_indices,
+      dst_indices,
+      0,
+      1,
+      item_size,
+      0,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_per_layer_pf_lf(
+    const at::Tensor src_k,
+    at::Tensor dst_k,
+    const at::Tensor src_v,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t layer_id,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_pf<const char>, get_global_offset_lf<char>, false>(
+      src_k,
+      dst_k,
+      src_v,
+      dst_v,
+      src_indices,
+      dst_indices,
+      layer_id,
+      1,
+      item_size,
+      src_layout_dim,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer(
+    const at::Tensor src_k_layers,
+    const at::Tensor dst_k_layers,
+    const at::Tensor src_v_layers,
+    const at::Tensor dst_v_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_k_layers.size(0), "Number of layers in source k tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_lf_tbl<char>, false>(
+      empty,
+      empty,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      0,
+      src_k_layers,
+      dst_k_layers,
+      src_v_layers,
+      dst_v_layers,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer_lf_pf(
+    const at::Tensor src_k_layers,
+    at::Tensor dst_k,
+    const at::Tensor src_v_layers,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t dst_layout_dim,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_k_layers.size(0), "Number of layers in source k tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_pf<char>, false>(
+      empty,
+      dst_k,
+      empty,
+      dst_v,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      dst_layout_dim,
+      src_k_layers,
+      empty,
+      src_v_layers,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_per_layer_mla(
+    const at::Tensor src,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf<const char>, get_global_offset_lf<char>, true>(
+      src,
+      dst,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      1,
+      item_size,
+      0,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_per_layer_mla_pf_lf(
+    const at::Tensor src,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t layer_id,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_pf<const char>, get_global_offset_lf<char>, true>(
+      src,
+      dst,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      layer_id,
+      1,
+      item_size,
+      src_layout_dim,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer_mla(
+    const at::Tensor src_layers,
+    const at::Tensor dst_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_layers.size(0), "Number of layers in source tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_lf_tbl<char>, true>(
+      empty,
+      empty,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      0,
+      src_layers,
+      dst_layers,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer_mla_lf_pf(
+    const at::Tensor src_layers,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t dst_layout_dim,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_layers.size(0), "Number of layers in source tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_pf<char>, true>(
+      empty,
+      dst,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      dst_layout_dim,
+      src_layers,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+inline void transfer_page_direct(
+    const at::Tensor src_buffer,
+    at::Tensor dst_buffer,
+    int64_t src_page_index,
+    int64_t dst_page_index,
+    int64_t page_size) {
+  dst_buffer.slice(0, dst_page_index, dst_page_index + page_size)
+      .copy_(
+          src_buffer.slice(0, src_page_index, src_page_index + page_size),
+          /* non_blocking= */ true);
+}
+
+void transfer_kv_direct(
+    const std::vector<at::Tensor>& src_layers,
+    std::vector<at::Tensor> dst_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t page_size) {
+  TORCH_CHECK(
+      src_layers.size() == dst_layers.size(), "Source and destination layers must have the same number of layers");
+  TORCH_CHECK(src_indices.numel() == dst_indices.numel(), "Source and destination indices must have the same length");
+  TORCH_CHECK(page_size > 0, "Page size must be positive");
+  TORCH_CHECK(src_indices.numel() % page_size == 0, "Source indices size must be divisible by page size");
+
+  auto src_indices_cpu = src_indices.cpu();
+  auto dst_indices_cpu = dst_indices.cpu();
+
+  const auto num_indices = src_indices_cpu.numel();
+  const int64_t num_layers = src_layers.size();
+  int64_t* src_indices_ptr = src_indices_cpu.data_ptr<int64_t>();
+  int64_t* dst_indices_ptr = dst_indices_cpu.data_ptr<int64_t>();
+
+  int64_t start_index = 0;
+  int64_t end_index = 0;
+
+  for (int64_t i = 0; i < num_indices; ++i) {
+    if (i < num_indices - 1) {
+      auto src_diff = src_indices_ptr[i + 1] - src_indices_ptr[i];
+      auto dst_diff = dst_indices_ptr[i + 1] - dst_indices_ptr[i];
+
+      if (src_diff == 1 && dst_diff == 1) {
+        continue;
+      }
+      end_index = i + 1;
+    } else {  // last batch
+      end_index = num_indices;
+    }
+    auto src_index = src_indices_ptr[start_index];
+    auto dst_index = dst_indices_ptr[start_index];
+    auto num_tokens = end_index - start_index;
+
+    for (int64_t j = 0; j < num_layers; ++j) {
+      transfer_page_direct(src_layers[j], dst_layers[j], src_index, dst_index, num_tokens);
+    }
+    start_index = end_index;
+  }
+}
+
+template <bool IsLf2Pf>
+inline void transfer_kv_page_first_direct_impl(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t start_layer_id,
+    int64_t page_size) {
+  TORCH_CHECK(src_indices.numel() == dst_indices.numel(), "Source and destination indices must have the same length");
+  TORCH_CHECK(page_size > 0, "Page size must be positive");
+  TORCH_CHECK(src_indices.numel() % page_size == 0, "Source indices size must be divisible by page size");
+
+  auto src_indices_cpu = src_indices.cpu();
+  auto dst_indices_cpu = dst_indices.cpu();
+  const int64_t num_pages = src_indices_cpu.size(0) / page_size;
+
+  if constexpr (IsLf2Pf) {
+    const bool is_mla = dst_ptrs.size() == 1;
+    const int64_t num_layers = is_mla ? src_ptrs.size() : src_ptrs.size() / 2;
+
+    for (const auto i : c10::irange(num_pages)) {
+      auto s_index = src_indices_cpu[i * page_size].item<int64_t>();
+      auto d_index = dst_indices_cpu[i * page_size].item<int64_t>() / page_size;
+      for (int64_t j = 0; j < num_layers; ++j) {
+        transfer_page_direct(
+            src_ptrs[j], dst_ptrs[0].select(0, d_index).select(0, start_layer_id + j), s_index, 0, page_size);
+        if (!is_mla) {
+          transfer_page_direct(
+              src_ptrs[j + num_layers],
+              dst_ptrs[1].select(0, d_index).select(0, start_layer_id + j),
+              s_index,
+              0,
+              page_size);
+        }
+      }
+    }
+  } else {
+    const bool is_mla = src_ptrs.size() == 1;
+    const int64_t num_layers = is_mla ? dst_ptrs.size() : dst_ptrs.size() / 2;
+
+    for (const auto i : c10::irange(num_pages)) {
+      auto s_index = src_indices_cpu[i * page_size].item<int64_t>() / page_size;
+      auto d_index = dst_indices_cpu[i * page_size].item<int64_t>();
+      for (int64_t j = 0; j < num_layers; ++j) {
+        transfer_page_direct(
+            src_ptrs[0].select(0, s_index).select(0, start_layer_id + j), dst_ptrs[j], 0, d_index, page_size);
+        if (!is_mla) {
+          transfer_page_direct(
+              src_ptrs[1].select(0, s_index).select(0, start_layer_id + j),
+              dst_ptrs[j + num_layers],
+              0,
+              d_index,
+              page_size);
+        }
+      }
+    }
+  }
+}
+
+void transfer_kv_per_layer_direct_pf_lf(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t layer_id,
+    int64_t page_size) {
+  transfer_kv_page_first_direct_impl<false>(src_ptrs, dst_ptrs, src_indices, dst_indices, layer_id, page_size);
+}
+
+void transfer_kv_all_layer_direct_lf_pf(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t page_size) {
+  transfer_kv_page_first_direct_impl<true>(src_ptrs, dst_ptrs, src_indices, dst_indices, 0, page_size);
+}
diff --git a/sgl-kernel/csrc/kvcacheio/transfer.hip b/sgl-kernel/csrc/kvcacheio/transfer.hip
new file mode 100644
index 000000000..065fbec99
--- /dev/null
+++ b/sgl-kernel/csrc/kvcacheio/transfer.hip
@@ -0,0 +1,576 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+#include <ATen/hip/HIPContext.h>
+#include <c10/hip/HIPException.h>
+#include <c10/util/irange.h>
+
+#include <cstdint>
+
+#ifndef USE_ROCM
+#define WARP_SIZE 32
+#include "pytorch_extension_utils.h"
+#else
+#include "pytorch_extension_utils_rocm.h"
+#include "utils_hip.h"  // WARP_SIZE
+#endif
+
+__device__ __forceinline__ void
+transfer_item_warp(int32_t lane_id, const void* src_addr, void* dst_addr, int64_t item_size_bytes) {
+  const uint64_t* __restrict__ src = static_cast<const uint64_t*>(src_addr);
+  uint64_t* __restrict__ dst = static_cast<uint64_t*>(dst_addr);
+  const int total_chunks = item_size_bytes / sizeof(uint64_t);
+
+#pragma unroll
+  for (int j = lane_id; j < total_chunks; j += WARP_SIZE) {
+#ifndef USE_ROCM
+    uint64_t tmp;
+    asm volatile("ld.global.nc.b64 %0,[%1];" : "=l"(tmp) : "l"(src + j) : "memory");
+    asm volatile("st.global.cg.b64 [%0],%1;" ::"l"(dst + j), "l"(tmp) : "memory");
+
+#else
+    uint64_t tmp = __builtin_nontemporal_load(src + j);
+    __builtin_nontemporal_store(tmp, dst + j);
+#endif
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ T* get_global_offset_lf(
+    T* base,
+    const uintptr_t* __restrict__ /*unused*/,
+    int64_t layer_id,
+    int64_t layer_dim,
+    int64_t page_id,
+    int64_t item_size_bytes) {
+  // layer first
+  return base + layer_id * layer_dim + page_id * item_size_bytes;
+}
+
+template <typename T>
+__device__ __forceinline__ T* get_global_offset_pf(
+    T* base,
+    const uintptr_t* __restrict__ /*unused*/,
+    int64_t layer_id,
+    int64_t page_dim,
+    int64_t page_id,
+    int64_t item_size_bytes) {
+  // page first
+  return base + page_id * page_dim + layer_id * item_size_bytes;
+}
+
+// get offset from layer base table when layers are not contiguous
+template <typename T>
+__device__ __forceinline__ T* get_global_offset_lf_tbl(
+    T* /*unused*/,
+    const uintptr_t* __restrict__ layer_base_tbl,
+    int64_t layer_id,
+    int64_t /*unused*/,
+    int64_t page_id,
+    int64_t item_size_bytes) {
+  return reinterpret_cast<T*>(layer_base_tbl[layer_id]) + page_id * item_size_bytes;
+}
+
+template <auto SrcOffsetFn, auto DstOffsetFn, bool IsMLA>
+__global__ void transfer_kernel_impl(
+    const void* __restrict__ src_k,
+    void* __restrict__ dst_k,
+    const void* __restrict__ src_v,
+    void* __restrict__ dst_v,
+    const int64_t* __restrict__ src_indices,
+    const int64_t* __restrict__ dst_indices,
+    int64_t start_layer_id,
+    int64_t num_layers_to_process,
+    int64_t num_items,
+    int64_t items_per_warp,
+    int64_t item_size_bytes,
+    int64_t src_layout_dim,
+    int64_t dst_layout_dim,
+    const uintptr_t* __restrict__ src_k_layer_tbl,
+    const uintptr_t* __restrict__ dst_k_layer_tbl,
+    const uintptr_t* __restrict__ src_v_layer_tbl,
+    const uintptr_t* __restrict__ dst_v_layer_tbl) {
+  int32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_t lane_id = tid % WARP_SIZE;
+  int32_t warp_id = tid / WARP_SIZE;
+
+  for (int i = 0; i < items_per_warp; ++i) {
+    int64_t item_id = warp_id * items_per_warp + i;
+    if (item_id >= num_items) {
+      break;
+    }
+    const int64_t src_page_id = src_indices[item_id];
+    const int64_t dst_page_id = dst_indices[item_id];
+
+    // Loop over layers if necessary
+    for (int64_t layer_id = start_layer_id; layer_id < start_layer_id + num_layers_to_process; ++layer_id) {
+      const char* src_ptr = SrcOffsetFn(
+          static_cast<const char*>(src_k), src_k_layer_tbl, layer_id, src_layout_dim, src_page_id, item_size_bytes);
+      char* dst_ptr = DstOffsetFn(
+          static_cast<char*>(dst_k), dst_k_layer_tbl, layer_id, dst_layout_dim, dst_page_id, item_size_bytes);
+      transfer_item_warp(lane_id, src_ptr, dst_ptr, item_size_bytes);
+
+      if constexpr (!IsMLA) {
+        const char* src_v_ptr = SrcOffsetFn(
+            static_cast<const char*>(src_v), src_v_layer_tbl, layer_id, src_layout_dim, src_page_id, item_size_bytes);
+        char* dst_v_ptr = DstOffsetFn(
+            static_cast<char*>(dst_v), dst_v_layer_tbl, layer_id, dst_layout_dim, dst_page_id, item_size_bytes);
+        transfer_item_warp(lane_id, src_v_ptr, dst_v_ptr, item_size_bytes);
+      }
+    }
+  }
+}
+
+template <auto SrcOffsetFn, auto DstOffsetFn, bool IsMLA>
+void transfer_kv_launcher(
+    const at::Tensor& src_k,
+    at::Tensor& dst_k,
+    const at::Tensor& src_v,
+    at::Tensor& dst_v,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t start_layer_id,
+    int64_t num_layers_to_process,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t dst_layout_dim,
+    const at::Tensor& src_k_layers,
+    const at::Tensor& dst_k_layers,
+    const at::Tensor& src_v_layers,
+    const at::Tensor& dst_v_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(src_indices.is_cuda(), "Source indices must be a CUDA tensor");
+  TORCH_CHECK(dst_indices.is_cuda(), "Destination indices must be a CUDA tensor");
+  TORCH_CHECK(src_indices.scalar_type() == at::kLong, "Source indices must be of type long");
+  TORCH_CHECK(dst_indices.scalar_type() == at::kLong, "Destination indices must be of type long");
+  TORCH_CHECK(src_indices.numel() == dst_indices.numel(), "Source and destination indices must have the same length");
+  TORCH_CHECK(item_size % 8 == 0, "Item byte size must be divisible by 8");
+
+  auto div_up = [](int64_t x, int64_t y) { return (x + y - 1) / y; };
+  const int64_t num_items = src_indices.numel();
+  const int64_t items_per_warp = div_up(num_items, block_quota * num_warps_per_block);
+  const int32_t num_blocks = div_up(num_items, items_per_warp * num_warps_per_block);
+  dim3 grid_dim(num_blocks, 1, 1);
+  const int32_t threads_per_block = num_warps_per_block * WARP_SIZE;
+
+  const void* src_k_ptr = src_k.defined() ? src_k.data_ptr() : nullptr;
+  void* dst_k_ptr = dst_k.defined() ? dst_k.data_ptr() : nullptr;
+  const void* src_v_ptr = IsMLA || !src_v.defined() ? nullptr : src_v.data_ptr();
+  void* dst_v_ptr = IsMLA || !dst_v.defined() ? nullptr : dst_v.data_ptr();
+  const uintptr_t* src_k_tbl_ptr = src_k_layers.defined() ? src_k_layers.data_ptr<uintptr_t>() : nullptr;
+  const uintptr_t* dst_k_tbl_ptr = dst_k_layers.defined() ? dst_k_layers.data_ptr<uintptr_t>() : nullptr;
+  const uintptr_t* src_v_tbl_ptr = IsMLA || !src_v_layers.defined() ? nullptr : src_v_layers.data_ptr<uintptr_t>();
+  const uintptr_t* dst_v_tbl_ptr = IsMLA || !dst_v_layers.defined() ? nullptr : dst_v_layers.data_ptr<uintptr_t>();
+
+  hipStream_t torch_current_stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+ hipLaunchKernelGGL(( transfer_kernel_impl<SrcOffsetFn, DstOffsetFn, IsMLA>), dim3(grid_dim), dim3(threads_per_block), 0, torch_current_stream, 
+      src_k_ptr,
+      dst_k_ptr,
+      src_v_ptr,
+      dst_v_ptr,
+      src_indices.data_ptr<int64_t>(),
+      dst_indices.data_ptr<int64_t>(),
+      start_layer_id,
+      num_layers_to_process,
+      num_items,
+      items_per_warp,
+      item_size,
+      src_layout_dim,
+      dst_layout_dim,
+      src_k_tbl_ptr,
+      dst_k_tbl_ptr,
+      src_v_tbl_ptr,
+      dst_v_tbl_ptr);
+  C10_HIP_KERNEL_LAUNCH_CHECK();
+}
+
+void transfer_kv_per_layer(
+    const at::Tensor src_k,
+    at::Tensor dst_k,
+    const at::Tensor src_v,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf<const char>, get_global_offset_lf<char>, false>(
+      src_k,
+      dst_k,
+      src_v,
+      dst_v,
+      src_indices,
+      dst_indices,
+      0,
+      1,
+      item_size,
+      0,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_per_layer_pf_lf(
+    const at::Tensor src_k,
+    at::Tensor dst_k,
+    const at::Tensor src_v,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t layer_id,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_pf<const char>, get_global_offset_lf<char>, false>(
+      src_k,
+      dst_k,
+      src_v,
+      dst_v,
+      src_indices,
+      dst_indices,
+      layer_id,
+      1,
+      item_size,
+      src_layout_dim,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer(
+    const at::Tensor src_k_layers,
+    const at::Tensor dst_k_layers,
+    const at::Tensor src_v_layers,
+    const at::Tensor dst_v_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_k_layers.size(0), "Number of layers in source k tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_lf_tbl<char>, false>(
+      empty,
+      empty,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      0,
+      src_k_layers,
+      dst_k_layers,
+      src_v_layers,
+      dst_v_layers,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer_lf_pf(
+    const at::Tensor src_k_layers,
+    at::Tensor dst_k,
+    const at::Tensor src_v_layers,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t dst_layout_dim,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_k_layers.size(0), "Number of layers in source k tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_pf<char>, false>(
+      empty,
+      dst_k,
+      empty,
+      dst_v,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      dst_layout_dim,
+      src_k_layers,
+      empty,
+      src_v_layers,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_per_layer_mla(
+    const at::Tensor src,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf<const char>, get_global_offset_lf<char>, true>(
+      src,
+      dst,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      1,
+      item_size,
+      0,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_per_layer_mla_pf_lf(
+    const at::Tensor src,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t layer_id,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_pf<const char>, get_global_offset_lf<char>, true>(
+      src,
+      dst,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      layer_id,
+      1,
+      item_size,
+      src_layout_dim,
+      0,
+      empty,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer_mla(
+    const at::Tensor src_layers,
+    const at::Tensor dst_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_layers.size(0), "Number of layers in source tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_lf_tbl<char>, true>(
+      empty,
+      empty,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      0,
+      src_layers,
+      dst_layers,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+void transfer_kv_all_layer_mla_lf_pf(
+    const at::Tensor src_layers,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t dst_layout_dim,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block) {
+  TORCH_CHECK(num_layers == src_layers.size(0), "Number of layers in source tensor does not match num_layers");
+  at::Tensor empty;
+  transfer_kv_launcher<get_global_offset_lf_tbl<const char>, get_global_offset_pf<char>, true>(
+      empty,
+      dst,
+      empty,
+      empty,
+      src_indices,
+      dst_indices,
+      0,
+      num_layers,
+      item_size,
+      0,
+      dst_layout_dim,
+      src_layers,
+      empty,
+      empty,
+      empty,
+      block_quota,
+      num_warps_per_block);
+}
+
+inline void transfer_page_direct(
+    const at::Tensor src_buffer,
+    at::Tensor dst_buffer,
+    int64_t src_page_index,
+    int64_t dst_page_index,
+    int64_t page_size) {
+  dst_buffer.slice(0, dst_page_index, dst_page_index + page_size)
+      .copy_(
+          src_buffer.slice(0, src_page_index, src_page_index + page_size),
+          /* non_blocking= */ true);
+}
+
+void transfer_kv_direct(
+    const std::vector<at::Tensor>& src_layers,
+    std::vector<at::Tensor> dst_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t page_size) {
+  TORCH_CHECK(
+      src_layers.size() == dst_layers.size(), "Source and destination layers must have the same number of layers");
+  TORCH_CHECK(src_indices.numel() == dst_indices.numel(), "Source and destination indices must have the same length");
+  TORCH_CHECK(page_size > 0, "Page size must be positive");
+  TORCH_CHECK(src_indices.numel() % page_size == 0, "Source indices size must be divisible by page size");
+
+  auto src_indices_cpu = src_indices.cpu();
+  auto dst_indices_cpu = dst_indices.cpu();
+
+  const auto num_indices = src_indices_cpu.numel();
+  const int64_t num_layers = src_layers.size();
+  int64_t* src_indices_ptr = src_indices_cpu.data_ptr<int64_t>();
+  int64_t* dst_indices_ptr = dst_indices_cpu.data_ptr<int64_t>();
+
+  int64_t start_index = 0;
+  int64_t end_index = 0;
+
+  for (int64_t i = 0; i < num_indices; ++i) {
+    if (i < num_indices - 1) {
+      auto src_diff = src_indices_ptr[i + 1] - src_indices_ptr[i];
+      auto dst_diff = dst_indices_ptr[i + 1] - dst_indices_ptr[i];
+
+      if (src_diff == 1 && dst_diff == 1) {
+        continue;
+      }
+      end_index = i + 1;
+    } else {  // last batch
+      end_index = num_indices;
+    }
+    auto src_index = src_indices_ptr[start_index];
+    auto dst_index = dst_indices_ptr[start_index];
+    auto num_tokens = end_index - start_index;
+
+    for (int64_t j = 0; j < num_layers; ++j) {
+      transfer_page_direct(src_layers[j], dst_layers[j], src_index, dst_index, num_tokens);
+    }
+    start_index = end_index;
+  }
+}
+
+template <bool IsLf2Pf>
+inline void transfer_kv_page_first_direct_impl(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t start_layer_id,
+    int64_t page_size) {
+  TORCH_CHECK(src_indices.numel() == dst_indices.numel(), "Source and destination indices must have the same length");
+  TORCH_CHECK(page_size > 0, "Page size must be positive");
+  TORCH_CHECK(src_indices.numel() % page_size == 0, "Source indices size must be divisible by page size");
+
+  auto src_indices_cpu = src_indices.cpu();
+  auto dst_indices_cpu = dst_indices.cpu();
+  const int64_t num_pages = src_indices_cpu.size(0) / page_size;
+
+  if constexpr (IsLf2Pf) {
+    const bool is_mla = dst_ptrs.size() == 1;
+    const int64_t num_layers = is_mla ? src_ptrs.size() : src_ptrs.size() / 2;
+
+    for (const auto i : c10::irange(num_pages)) {
+      auto s_index = src_indices_cpu[i * page_size].item<int64_t>();
+      auto d_index = dst_indices_cpu[i * page_size].item<int64_t>() / page_size;
+      for (int64_t j = 0; j < num_layers; ++j) {
+        transfer_page_direct(
+            src_ptrs[j], dst_ptrs[0].select(0, d_index).select(0, start_layer_id + j), s_index, 0, page_size);
+        if (!is_mla) {
+          transfer_page_direct(
+              src_ptrs[j + num_layers],
+              dst_ptrs[1].select(0, d_index).select(0, start_layer_id + j),
+              s_index,
+              0,
+              page_size);
+        }
+      }
+    }
+  } else {
+    const bool is_mla = src_ptrs.size() == 1;
+    const int64_t num_layers = is_mla ? dst_ptrs.size() : dst_ptrs.size() / 2;
+
+    for (const auto i : c10::irange(num_pages)) {
+      auto s_index = src_indices_cpu[i * page_size].item<int64_t>() / page_size;
+      auto d_index = dst_indices_cpu[i * page_size].item<int64_t>();
+      for (int64_t j = 0; j < num_layers; ++j) {
+        transfer_page_direct(
+            src_ptrs[0].select(0, s_index).select(0, start_layer_id + j), dst_ptrs[j], 0, d_index, page_size);
+        if (!is_mla) {
+          transfer_page_direct(
+              src_ptrs[1].select(0, s_index).select(0, start_layer_id + j),
+              dst_ptrs[j + num_layers],
+              0,
+              d_index,
+              page_size);
+        }
+      }
+    }
+  }
+}
+
+void transfer_kv_per_layer_direct_pf_lf(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t layer_id,
+    int64_t page_size) {
+  transfer_kv_page_first_direct_impl<false>(src_ptrs, dst_ptrs, src_indices, dst_indices, layer_id, page_size);
+}
+
+void transfer_kv_all_layer_direct_lf_pf(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t page_size) {
+  transfer_kv_page_first_direct_impl<true>(src_ptrs, dst_ptrs, src_indices, dst_indices, 0, page_size);
+}
diff --git a/sgl-kernel/csrc/mamba/causal_conv1d.cu b/sgl-kernel/csrc/mamba/causal_conv1d.cu
new file mode 100644
index 000000000..5eb3d9b1b
--- /dev/null
+++ b/sgl-kernel/csrc/mamba/causal_conv1d.cu
@@ -0,0 +1,669 @@
+// clang-format off
+// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_fwd.cu
+// and https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_update.cu
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "causal_conv1d.h"
+#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
+#include <c10/cuda/CUDAException.h>  // For C10_CUDA_CHECK and C10_CUDA_KERNEL_LAUNCH_CHECK
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+
+#define BOOL_SWITCH(COND, CONST_NAME, ...)                                           \
+    [&] {                                                                            \
+        if (COND) {                                                                  \
+            static constexpr bool CONST_NAME = true;                                 \
+            return __VA_ARGS__();                                                    \
+        } else {                                                                     \
+            static constexpr bool CONST_NAME = false;                                \
+            return __VA_ARGS__();                                                    \
+        }                                                                            \
+    }()
+
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...)              \
+    if (ITYPE == at::ScalarType::Half) {                                            \
+        using input_t = at::Half;                                                   \
+        using weight_t = at::Half;                                                  \
+        __VA_ARGS__();                                                              \
+    } else if (ITYPE == at::ScalarType::BFloat16) {                                 \
+        using input_t = at::BFloat16;                                               \
+        using weight_t = at::BFloat16;                                              \
+        __VA_ARGS__();                                                              \
+    } else if (ITYPE == at::ScalarType::Float)  {                                   \
+        using input_t = float;                                                      \
+        using weight_t = float;                                                     \
+        __VA_ARGS__();                                                              \
+    } else {                                                                        \
+        AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
+    }
+
+
+template<typename input_t, typename weight_t>
+void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream);
+
+template<typename input_t, typename weight_t>
+void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream);
+
+void set_conv_params_fwd(ConvParamsBase &params,
+                         // sizes
+                         const size_t batch,
+                         const size_t dim,
+                         const size_t seqlen,
+                         const size_t width,
+                         // device pointers
+                         const at::Tensor x,
+                         const at::Tensor weight,
+                         const at::Tensor out,
+                         const std::optional<at::Tensor>& bias,
+                         bool silu_activation,
+                         int64_t pad_slot_id,
+                         const std::optional<at::Tensor>& query_start_loc = std::nullopt,
+                         const std::optional<at::Tensor>& cache_indices = std::nullopt,
+                         const std::optional<at::Tensor>& has_initial_state = std::nullopt) {
+
+    // Reset the parameters
+    memset(&params, 0, sizeof(params));
+
+    params.batch = batch;
+    params.dim = dim;
+    params.seqlen = seqlen;
+    params.width = width;
+    params.pad_slot_id = pad_slot_id;
+
+    params.silu_activation = silu_activation;
+
+    // Set the pointers and strides.
+    params.x_ptr = x.data_ptr();
+    params.weight_ptr = weight.data_ptr();
+    params.bias_ptr = bias.has_value() ? bias.value().data_ptr() : nullptr;
+    params.out_ptr = out.data_ptr();
+    // All stride are in elements, not bytes.
+    params.query_start_loc_ptr = query_start_loc.has_value() ? query_start_loc.value().data_ptr() : nullptr;
+    params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
+    params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
+    const bool varlen = params.query_start_loc_ptr != nullptr;
+    params.x_batch_stride = x.stride(varlen ? 1 : 0);
+    params.x_c_stride = x.stride(varlen ? 0 : 1);
+    params.x_l_stride = x.stride(varlen ? 1 : -1);
+    params.weight_c_stride = weight.stride(0);
+    params.weight_width_stride = weight.stride(1);
+    params.out_batch_stride = out.stride(varlen ? 1 : 0);
+    params.out_c_stride = out.stride(varlen ? 0 : 1);
+    params.out_l_stride = out.stride(varlen ? 1 : -1);
+}
+
+
+void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
+                  const std::optional<at::Tensor> &bias_,
+                  const std::optional<at::Tensor> &conv_states,
+                  const std::optional<at::Tensor> &query_start_loc,
+                  const std::optional<at::Tensor> &cache_indices,
+                  const std::optional<at::Tensor> &has_initial_state,
+                  bool silu_activation,
+                 // used to identify padding entries if cache_indices provided
+                 // in case of padding, the kernel will return early
+                  int64_t pad_slot_id) {
+    auto input_type = x.scalar_type();
+    auto weight_type = weight.scalar_type();
+    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
+    TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16);
+
+    TORCH_CHECK(x.is_cuda());
+    TORCH_CHECK(weight.is_cuda());
+
+    const bool varlen = query_start_loc.has_value() ? true : false;
+    const auto sizes = x.sizes();
+    const int batch_size = varlen ? query_start_loc.value().sizes()[0] - 1 : sizes[0];
+    const int dim = varlen ? sizes[0] : sizes[1];
+    const int seqlen = varlen ? sizes[1] : sizes[2];
+    const int width = weight.size(-1);
+    if (varlen){
+        CHECK_SHAPE(x, dim, seqlen);
+    }
+    else {
+        CHECK_SHAPE(x, batch_size, dim, seqlen);
+    }
+    CHECK_SHAPE(weight, dim, width);
+
+
+
+    if (bias_.has_value()) {
+        auto bias = bias_.value();
+        TORCH_CHECK(bias.scalar_type() == weight_type);
+        TORCH_CHECK(bias.is_cuda());
+        TORCH_CHECK(bias.stride(-1) == 1);
+        CHECK_SHAPE(bias, dim);
+    }
+
+
+    if (has_initial_state.has_value()) {
+        auto has_initial_state_ = has_initial_state.value();
+        TORCH_CHECK(has_initial_state_.scalar_type() == at::ScalarType::Bool);
+        TORCH_CHECK(has_initial_state_.is_cuda());
+        CHECK_SHAPE(has_initial_state_, batch_size);
+    }
+
+
+    if (query_start_loc.has_value()) {
+        auto query_start_loc_ = query_start_loc.value();
+        TORCH_CHECK(query_start_loc_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(query_start_loc_.is_cuda());
+    }
+
+
+    if (cache_indices.has_value()) {
+        auto cache_indices_ = cache_indices.value();
+        TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
+        TORCH_CHECK(cache_indices_.is_cuda());
+        CHECK_SHAPE(cache_indices_, batch_size);
+    }
+
+    at::Tensor out = x;
+
+    ConvParamsBase params;
+    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
+                        bias_,
+                        silu_activation,
+                        pad_slot_id,
+                        query_start_loc,
+                        cache_indices,
+                        has_initial_state
+                        );
+
+    if (conv_states.has_value()) {
+        auto conv_states_ = conv_states.value();
+        TORCH_CHECK(conv_states_.scalar_type() == input_type);
+        TORCH_CHECK(conv_states_.is_cuda());
+        params.conv_states_ptr = conv_states_.data_ptr();
+        params.conv_states_batch_stride = conv_states_.stride(0);
+        params.conv_states_c_stride = conv_states_.stride(1);
+        params.conv_states_l_stride = conv_states_.stride(2);
+    } else {
+        params.conv_states_ptr = nullptr;
+    }
+
+    // Otherwise the kernel will be launched from cuda:0 device
+    // Cast to char to avoid compiler warning about narrowing
+    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
+            causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
+    });
+}
+
+
+void causal_conv1d_update(const at::Tensor &x,
+                     const at::Tensor &conv_state,
+                     const at::Tensor &weight,
+                     const std::optional<at::Tensor> &bias_,
+                     bool silu_activation,
+                     const std::optional<at::Tensor> &cache_seqlens_,
+                     const std::optional<at::Tensor> &conv_state_indices_,
+                     // used to identify padding entries if cache_indices provided
+                     // in case of padding, the kernel will return early
+                     int64_t pad_slot_id) {
+    auto input_type = x.scalar_type();
+    auto weight_type = weight.scalar_type();
+    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
+    TORCH_CHECK(weight_type == at::ScalarType::Float || weight_type == at::ScalarType::Half || weight_type == at::ScalarType::BFloat16);
+    TORCH_CHECK(weight_type == input_type, "weight type must equal to input type, other variations are disabled due to binary size limitations");
+    TORCH_CHECK(conv_state.scalar_type() == input_type);
+
+    TORCH_CHECK(x.is_cuda());
+    TORCH_CHECK(conv_state.is_cuda());
+    TORCH_CHECK(weight.is_cuda());
+
+    const auto sizes = x.sizes();
+    const int batch_size = sizes[0];
+    const int dim = sizes[1];
+    const int seqlen = sizes[2];
+    const int width = weight.size(-1);
+    const int conv_state_len = conv_state.size(2);
+    TORCH_CHECK(conv_state_len >= width - 1);
+
+    CHECK_SHAPE(x, batch_size, dim, seqlen);
+    CHECK_SHAPE(weight, dim, width);
+
+    TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
+
+    if (bias_.has_value()) {
+        auto bias = bias_.value();
+        TORCH_CHECK(bias.scalar_type() == weight_type);
+        TORCH_CHECK(bias.is_cuda());
+        TORCH_CHECK(bias.stride(-1) == 1);
+        CHECK_SHAPE(bias, dim);
+    }
+
+    at::Tensor out = x;
+
+    ConvParamsBase params;
+    set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out,
+                        bias_,
+                        silu_activation,
+                        pad_slot_id);
+    params.conv_state_ptr = conv_state.data_ptr();
+    params.conv_state_len = conv_state_len;
+    // All stride are in elements, not bytes.
+    params.conv_state_batch_stride = conv_state.stride(0);
+    params.conv_state_c_stride = conv_state.stride(1);
+    params.conv_state_l_stride = conv_state.stride(2);
+
+    if (cache_seqlens_.has_value()) {
+        auto cache_seqlens = cache_seqlens_.value();
+        TORCH_CHECK(cache_seqlens.scalar_type() == torch::kInt32);
+        TORCH_CHECK(cache_seqlens.is_cuda());
+        TORCH_CHECK(cache_seqlens.stride(-1) == 1);
+        CHECK_SHAPE(cache_seqlens, batch_size);
+        params.cache_seqlens = cache_seqlens.data_ptr<int32_t>();
+    } else {
+        params.cache_seqlens = nullptr;
+    }
+
+    if (conv_state_indices_.has_value()) {
+        auto conv_state_indices = conv_state_indices_.value();
+        TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32)
+        TORCH_CHECK(conv_state_indices.is_cuda());
+        TORCH_CHECK(conv_state_indices.stride(0) == 1)
+        CHECK_SHAPE(conv_state_indices, batch_size);
+
+        int conv_state_entries = conv_state.size(0);
+        CHECK_SHAPE(conv_state, conv_state_entries, dim, conv_state_len);
+
+        params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
+    } else {
+        CHECK_SHAPE(conv_state, batch_size, dim, conv_state_len);
+        params.conv_state_indices_ptr = nullptr;
+    }
+
+    // Otherwise the kernel will be launched from cuda:0 device
+    // Cast to char to avoid compiler warning about narrowing
+    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
+            causal_conv1d_update_cuda<input_t, weight_t>(params, stream);
+    });
+}
+
+template<int kNThreads_, int kWidth_, bool kIsVecLoad_, typename input_t_, typename weight_t_>
+struct Causal_conv1d_fwd_kernel_traits {
+    using input_t = input_t_;
+    using weight_t = weight_t_;
+    static constexpr int kNThreads = kNThreads_;
+    static constexpr int kWidth = kWidth_;
+    static constexpr int kNBytes = sizeof(input_t);
+    static_assert(kNBytes == 2 || kNBytes == 4);
+    static constexpr int kNElts = kNBytes == 4 ? 4 : 8;
+    static_assert(kWidth <= kNElts);
+    static constexpr bool kIsVecLoad = kIsVecLoad_;
+    using vec_t = typename BytesToType<kNBytes * kNElts>::Type;
+    using BlockLoadT = cub::BlockLoad<input_t, kNThreads, kNElts, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
+    using BlockLoadVecT = cub::BlockLoad<vec_t, kNThreads, 1, cub::BLOCK_LOAD_DIRECT>;
+    using BlockStoreT = cub::BlockStore<input_t, kNThreads, kNElts, cub::BLOCK_STORE_WARP_TRANSPOSE>;
+    using BlockStoreVecT = cub::BlockStore<vec_t, kNThreads, 1, cub::BLOCK_STORE_DIRECT>;
+    static constexpr int kSmemIOSize = kIsVecLoad
+        ? 0
+        : custom_max({sizeof(typename BlockLoadT::TempStorage), sizeof(typename BlockStoreT::TempStorage)});
+    static constexpr int kSmemExchangeSize = kNThreads * kNBytes * kNElts;
+    static constexpr int kSmemSize = kSmemIOSize + kSmemExchangeSize;
+};
+
+template<typename Ktraits>
+__global__ __launch_bounds__(Ktraits::kNThreads)
+void causal_conv1d_fwd_kernel(ConvParamsBase params) {
+    constexpr int kWidth = Ktraits::kWidth;
+    constexpr int kNThreads = Ktraits::kNThreads;
+    constexpr int kNElts = Ktraits::kNElts;
+    constexpr bool kIsVecLoad = Ktraits::kIsVecLoad;
+    using input_t = typename Ktraits::input_t;
+    using vec_t = typename Ktraits::vec_t;
+    using weight_t = typename Ktraits::weight_t;
+
+    // Shared memory.
+    extern __shared__ char smem_[];
+    auto& smem_load = reinterpret_cast<typename Ktraits::BlockLoadT::TempStorage&>(smem_);
+    auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_);
+    auto& smem_store = reinterpret_cast<typename Ktraits::BlockStoreT::TempStorage&>(smem_);
+    auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_);
+    vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_ + Ktraits::kSmemIOSize);
+
+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
+    const int tidx = threadIdx.x;
+    const int batch_id = blockIdx.x;
+    const int channel_id = blockIdx.y;
+    const int *query_start_loc = kVarlen ? reinterpret_cast<int *>(params.query_start_loc_ptr) : nullptr;
+    const int sequence_start_index = kVarlen ? query_start_loc[batch_id] : batch_id;
+    const int seqlen = kVarlen ? query_start_loc[batch_id + 1] - sequence_start_index : params.seqlen;
+
+    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + sequence_start_index * params.x_batch_stride
+        + channel_id * params.x_c_stride;
+    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
+    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
+        + channel_id * params.out_c_stride;
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+
+    bool has_initial_state = params.has_initial_state_ptr == nullptr ? false
+        : reinterpret_cast<bool *>(params.has_initial_state_ptr)[batch_id];
+
+    int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr
+        : reinterpret_cast<int *>(params.cache_indices_ptr);
+    int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+    // cache_index == params.pad_slot_id is defined as padding, so we exit early
+    if (cache_index == params.pad_slot_id){
+        return;
+    }
+    input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr
+        : reinterpret_cast<input_t *>(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride;
+
+    // Thread 0 will load the last elements of the previous chunk, so we initialize those to 0.
+    if (tidx == 0) {
+        input_t initial_state[kNElts] = {0};
+        if (has_initial_state) {
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){ initial_state[kNElts - 1 - (kWidth - 2) + w ] = conv_states[w]; }
+        }
+        smem_exchange[kNThreads - 1] = reinterpret_cast<vec_t *>(initial_state)[0];
+    }
+
+    float weight_vals[kWidth];
+    #pragma unroll
+    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
+
+    constexpr int kChunkSize = kNThreads * kNElts;
+    const int n_chunks = (seqlen + kChunkSize - 1) / kChunkSize;
+    for (int chunk = 0; chunk < n_chunks; ++chunk) {
+        input_t x_vals_load[2 * kNElts] = {0};
+        if constexpr(kIsVecLoad) {
+            typename Ktraits::BlockLoadVecT(smem_load_vec).Load(reinterpret_cast<vec_t*>(x), *reinterpret_cast<vec_t (*)[1]>(&x_vals_load[kNElts]), (seqlen - chunk * kChunkSize) / kNElts);
+        } else {
+            __syncthreads();
+            typename Ktraits::BlockLoadT(smem_load).Load(x, *reinterpret_cast<input_t (*)[kNElts]>(&x_vals_load[kNElts]), seqlen - chunk * kChunkSize);
+        }
+        x += kChunkSize;
+        __syncthreads();
+        // Thread kNThreads - 1 don't write yet, so that thread 0 can read
+        // the last elements of the previous chunk.
+        if (tidx < kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(x_vals_load)[1]; }
+        __syncthreads();
+        reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[tidx > 0 ? tidx - 1 : kNThreads - 1];
+        __syncthreads();
+        // Now thread kNThreads - 1 can write the last elements of the current chunk.
+        if (tidx == kNThreads - 1) { smem_exchange[tidx] = reinterpret_cast<vec_t *>(x_vals_load)[1]; }
+
+        float x_vals[2 * kNElts];
+        #pragma unroll
+        for (int i = 0; i < 2 * kNElts; ++i) { x_vals[i] = float(x_vals_load[i]); }
+
+        float out_vals[kNElts];
+        #pragma unroll
+        for (int i = 0; i < kNElts; ++i) {
+            out_vals[i] = bias_val;
+            #pragma unroll
+            for (int w = 0; w < kWidth; ++w) {
+                out_vals[i] += weight_vals[w] * x_vals[kNElts + i - (kWidth - w - 1)];
+            }
+        }
+
+        if (params.silu_activation) {
+            #pragma unroll
+            for (int i = 0; i < kNElts; ++i) {
+                out_vals[i] = out_vals[i] / (1 + expf(-out_vals[i]));
+            }
+        }
+
+        input_t out_vals_store[kNElts];
+        #pragma unroll
+        for (int i = 0; i < kNElts; ++i) { out_vals_store[i] = out_vals[i]; }
+        if constexpr(kIsVecLoad) {
+            typename Ktraits::BlockStoreVecT(smem_store_vec).Store(reinterpret_cast<vec_t*>(out), reinterpret_cast<vec_t (&)[1]>(out_vals_store), (seqlen - chunk * kChunkSize) / kNElts);
+        } else {
+            typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
+        }
+        out += kChunkSize;
+
+        int final_state_position =  ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
+        // in case the final state is separated between the last "smem_exchange" and
+        // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
+        // (which occurs when `final_state_position` is a non-positivie index)
+        // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
+        if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
+            input_t vals_load[kNElts] = {0};
+            if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
+                // chunk = n_chunks - 2, a segment of the final state sits in the last index
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[kNThreads - 1];
+                #pragma unroll
+                for (int w = 0; w < -final_state_position; ++w){
+                    conv_states[w] = vals_load[kNElts + final_state_position + w];
+                }
+            }
+            if ((chunk == n_chunks - 1) && tidx == 0){
+                // chunk = n_chunks - 1, the second segment of the final state first positions
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[0];
+                for (int w = -final_state_position; w < kWidth - 1; ++w){
+                    conv_states[w] = vals_load[w + final_state_position];
+                }
+                return;
+            }
+        }
+    }
+    // Final state is stored in the smem_exchange last token slot,
+    // in case seqlen < kWidth, we would need to take the final state from the
+    // initial state which is stored in conv_states
+    // in case seqlen > kWidth, we would need to load the last kWidth - 1 data
+    // and load it into conv_state accordingly
+    int last_thread =  ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize) / kNElts;
+    if (conv_states != nullptr && tidx == last_thread) {
+        input_t x_vals_load[kNElts * 2] = {0};
+        // in case we are on the first kWidth tokens
+        if (last_thread == 0 && seqlen < kWidth){
+            // Need to take the initial state
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[0];
+            const int offset = seqlen - (kWidth - 1);
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                // pad the existing state
+                if ((w - seqlen) >= 0 && has_initial_state) { conv_states[w - seqlen] = conv_states[w]; }
+                else if ((w - seqlen) >= 0 && !has_initial_state) { conv_states[w - seqlen] = input_t(0.0f); }
+            }
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                if (offset + w >= 0)
+                    conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+        else {
+            // in case the final state is in between the threads data
+            const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+            if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
+                // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in a
+                // illegal access error on H100.
+                // Therefore, we access last_thread + 1, only if the final state data sits there
+                reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
+            }
+            reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
+            #pragma unroll
+            for (int w = 0; w < kWidth - 1; ++w){
+                conv_states[w] = x_vals_load[offset + w ];
+            }
+        }
+
+    }
+}
+
+
+template<int kNThreads, int kWidth, typename input_t, typename weight_t>
+void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
+    static constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
+    const bool kVarlen = params.query_start_loc_ptr != nullptr;
+    BOOL_SWITCH(params.seqlen % kNElts == 0 && !kVarlen, kIsVecLoad, [&] {
+        using Ktraits = Causal_conv1d_fwd_kernel_traits<kNThreads, kWidth, kIsVecLoad, input_t, weight_t>;
+        constexpr int kSmemSize = Ktraits::kSmemSize;
+        dim3 grid(params.batch, params.dim);
+
+        auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
+
+        if (kSmemSize >= 48 * 1024) {
+            #ifndef USE_ROCM
+            C10_CUDA_CHECK(cudaFuncSetAttribute(
+                kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+            #else
+            // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
+            C10_CUDA_CHECK(cudaFuncSetAttribute(
+                (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+            std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
+            #endif
+        }
+        kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
+
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+}
+
+template<typename input_t, typename weight_t>
+void causal_conv1d_fwd_cuda(ConvParamsBase &params, cudaStream_t stream) {
+    if (params.width == 2) {
+        causal_conv1d_fwd_launch<128, 2, input_t, weight_t>(params, stream);
+    } else if (params.width == 3) {
+        causal_conv1d_fwd_launch<128, 3, input_t, weight_t>(params, stream);
+    } else if (params.width == 4) {
+        causal_conv1d_fwd_launch<128, 4, input_t, weight_t>(params, stream);
+    }
+}
+
+
+template void causal_conv1d_fwd_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_fwd_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_fwd_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
+
+
+
+
+template<int kNThreads_, int kWidth_, typename input_t_, typename weight_t_>
+struct Causal_conv1d_update_kernel_traits {
+    using input_t = input_t_;
+    using weight_t = weight_t_;
+    static constexpr int kNThreads = kNThreads_;
+    static constexpr int kWidth = kWidth_;
+    static constexpr int kNBytes = sizeof(input_t);
+    static_assert(kNBytes == 2 || kNBytes == 4);
+};
+
+template<typename Ktraits, bool kIsCircularBuffer>
+__global__ __launch_bounds__(Ktraits::kNThreads)
+void causal_conv1d_update_kernel(ConvParamsBase params) {
+    constexpr int kWidth = Ktraits::kWidth;
+    constexpr int kNThreads = Ktraits::kNThreads;
+    using input_t = typename Ktraits::input_t;
+    using weight_t = typename Ktraits::weight_t;
+
+    const int tidx = threadIdx.x;
+    const int batch_id = blockIdx.x;
+    const int channel_id = blockIdx.y * kNThreads + tidx;
+    if (channel_id >= params.dim) return;
+
+    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
+        + channel_id * params.x_c_stride;
+
+    // If params.conv_state_batch_indices is set, then the conv state is gathered from the conv state tensor
+    // along the batch axis. Otherwise, the conv state coordinate is the same as the batch id.
+    const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr
+        ? batch_id
+        : params.conv_state_indices_ptr[batch_id];
+    // conv_state_batch_coord == params.pad_slot_id is defined as padding so we exit early
+    if (conv_state_batch_coord == params.pad_slot_id){
+        return;
+    }
+    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr)
+        + conv_state_batch_coord * params.conv_state_batch_stride
+        + channel_id * params.conv_state_c_stride;
+
+    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
+    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
+        + channel_id * params.out_c_stride;
+    float bias_val = params.bias_ptr == nullptr ? 0.f : float(reinterpret_cast<weight_t *>(params.bias_ptr)[channel_id]);
+
+    int state_len = params.conv_state_len;
+    int advance_len = params.seqlen;
+    int cache_seqlen = kIsCircularBuffer ? params.cache_seqlens[batch_id] % state_len : 0;
+    int update_idx = cache_seqlen - (kWidth - 1);
+    update_idx = update_idx < 0 ? update_idx + state_len : update_idx;
+
+    float weight_vals[kWidth] = {0};
+    #pragma unroll
+    for (int i = 0; i < kWidth; ++i) { weight_vals[i] = float(weight[i * params.weight_width_stride]); }
+
+    float x_vals[kWidth] = {0};
+    if constexpr (!kIsCircularBuffer) {
+        #pragma unroll 2
+        for (int i = 0; i < state_len - advance_len - (kWidth - 1); ++i) {
+            conv_state[i * params.conv_state_l_stride] = conv_state[(i + advance_len) * params.conv_state_l_stride];
+        }
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) {
+            input_t state_val = conv_state[(state_len - (kWidth - 1) + i) * params.conv_state_l_stride];
+            if (i < advance_len + (kWidth - 1) && state_len - advance_len - (kWidth - 1) + i >= 0) {
+                conv_state[(state_len - advance_len - (kWidth - 1) + i) * params.conv_state_l_stride] = state_val;
+            }
+            x_vals[i] = float(state_val);
+        }
+    } else {
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i, update_idx = update_idx + 1 >= state_len ? update_idx + 1 - state_len : update_idx + 1) {
+            input_t state_val = conv_state[update_idx * params.conv_state_l_stride];
+            x_vals[i] = float(state_val);
+        }
+    }
+    #pragma unroll 2
+    for (int i = 0; i < params.seqlen; ++i) {
+        input_t x_val = x[i * params.x_l_stride];
+        if constexpr (!kIsCircularBuffer) {
+            if (i < advance_len && state_len - advance_len + i >= 0) {
+                conv_state[(state_len - advance_len + i) * params.conv_state_l_stride] = x_val;
+            }
+        } else {
+            conv_state[update_idx * params.conv_state_l_stride] = x_val;
+            ++update_idx;
+            update_idx = update_idx >= state_len ? update_idx - state_len : update_idx;
+        }
+        x_vals[kWidth - 1] = float(x_val);
+        float out_val = bias_val;
+        #pragma unroll
+        for (int j = 0; j < kWidth; ++j) { out_val += weight_vals[j] * x_vals[j]; }
+        if (params.silu_activation) { out_val = out_val / (1 + expf(-out_val)); }
+        out[i * params.out_l_stride] = input_t(out_val);
+        // Shift the input buffer by 1
+        #pragma unroll
+        for (int i = 0; i < kWidth - 1; ++i) { x_vals[i] = x_vals[i + 1]; }
+    }
+}
+
+template<int kNThreads, int kWidth, typename input_t, typename weight_t>
+void causal_conv1d_update_launch(ConvParamsBase &params, cudaStream_t stream) {
+    using Ktraits = Causal_conv1d_update_kernel_traits<kNThreads, kWidth, input_t, weight_t>;
+    dim3 grid(params.batch, (params.dim + kNThreads - 1) / kNThreads);
+    auto kernel = params.cache_seqlens == nullptr
+        ? &causal_conv1d_update_kernel<Ktraits, false>
+        : &causal_conv1d_update_kernel<Ktraits, true>;
+    kernel<<<grid, Ktraits::kNThreads, 0, stream>>>(params);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template<typename input_t, typename weight_t>
+void causal_conv1d_update_cuda(ConvParamsBase &params, cudaStream_t stream) {
+    if (params.width == 2) {
+        causal_conv1d_update_launch<64, 2, input_t, weight_t>(params, stream);
+    } else if (params.width == 3) {
+        causal_conv1d_update_launch<64, 3, input_t, weight_t>(params, stream);
+    } else if (params.width == 4) {
+        causal_conv1d_update_launch<64, 4, input_t, weight_t>(params, stream);
+    }
+}
+
+template void causal_conv1d_update_cuda<float, float>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_update_cuda<at::Half, at::Half>(ConvParamsBase &params, cudaStream_t stream);
+template void causal_conv1d_update_cuda<at::BFloat16, at::BFloat16>(ConvParamsBase &params, cudaStream_t stream);
diff --git a/sgl-kernel/csrc/mamba/causal_conv1d.h b/sgl-kernel/csrc/mamba/causal_conv1d.h
new file mode 100644
index 000000000..33f8e7432
--- /dev/null
+++ b/sgl-kernel/csrc/mamba/causal_conv1d.h
@@ -0,0 +1,159 @@
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+// clang-format off
+// adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d.h
+#pragma once
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct ConvParamsBase {
+    using index_t = uint32_t;
+
+    int batch, dim, seqlen, width;
+    int64_t pad_slot_id;
+    bool silu_activation;
+
+    index_t x_batch_stride;
+    index_t x_c_stride;
+    index_t x_l_stride;
+    index_t weight_c_stride;
+    index_t weight_width_stride;
+    index_t out_batch_stride;
+    index_t out_c_stride;
+    index_t out_l_stride;
+
+    int conv_state_len;
+    index_t conv_state_batch_stride;
+    index_t conv_state_c_stride;
+    index_t conv_state_l_stride;
+
+    // Common data pointers.
+    void *__restrict__ x_ptr;
+    void *__restrict__ weight_ptr;
+    void *__restrict__ bias_ptr;
+    void *__restrict__ out_ptr;
+
+    void *__restrict__ conv_state_ptr;
+    void *__restrict__ query_start_loc_ptr;
+    void *__restrict__ has_initial_state_ptr;
+    void *__restrict__ cache_indices_ptr;
+    int32_t *__restrict__ cache_seqlens;
+
+    // For the continuous batching case. Makes it so that the mamba state for
+    // the current batch doesn't need to be a contiguous tensor.
+    int32_t *__restrict__ conv_state_indices_ptr;
+
+    void *__restrict__ seq_idx_ptr;
+
+    // No __restrict__ since initial_states could be the same as final_states.
+    void * initial_states_ptr;
+    index_t initial_states_batch_stride;
+    index_t initial_states_l_stride;
+    index_t initial_states_c_stride;
+
+    void * final_states_ptr;
+    index_t final_states_batch_stride;
+    index_t final_states_l_stride;
+    index_t final_states_c_stride;
+
+    void *  conv_states_ptr;
+    index_t conv_states_batch_stride;
+    index_t conv_states_l_stride;
+    index_t conv_states_c_stride;
+};
+
+
+#ifndef USE_ROCM
+    #include <cuda_bf16.h>
+
+    template<typename T>
+    __device__ inline T shuffle_xor(T val, int offset) {
+        return __shfl_xor_sync(uint32_t(-1), val, offset);
+    }
+
+    constexpr size_t custom_max(std::initializer_list<size_t> ilist)
+    {
+        return std::max(ilist);
+    }
+
+    template<typename T>
+    constexpr T constexpr_min(T a, T b) {
+        return std::min(a, b);
+    }
+
+#else
+    #include <hip/hip_bf16.h>
+
+    template<typename T>
+    __device__ inline T shuffle_xor(T val, int offset) {
+        return __shfl_xor(val, offset);
+    }
+    constexpr size_t custom_max(std::initializer_list<size_t> ilist)
+    {
+        return *std::max_element(ilist.begin(), ilist.end());
+    }
+
+    template<typename T>
+    constexpr T constexpr_min(T a, T b) {
+        return a < b ? a : b;
+    }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<int BYTES> struct BytesToType {};
+
+template<> struct BytesToType<16> {
+    using Type = uint4;
+    static_assert(sizeof(Type) == 16);
+};
+
+template<> struct BytesToType<8> {
+    using Type = uint64_t;
+    static_assert(sizeof(Type) == 8);
+};
+
+template<> struct BytesToType<4> {
+    using Type = uint32_t;
+    static_assert(sizeof(Type) == 4);
+};
+
+template<> struct BytesToType<2> {
+    using Type = uint16_t;
+    static_assert(sizeof(Type) == 2);
+};
+
+template<> struct BytesToType<1> {
+    using Type = uint8_t;
+    static_assert(sizeof(Type) == 1);
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+struct SumOp {
+__device__ inline T operator()(T const & x, T const & y) { return x + y; }
+};
+
+template<int THREADS>
+struct Allreduce {
+    static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
+    template<typename T, typename Operator>
+    static __device__ inline T run(T x, Operator &op) {
+        constexpr int OFFSET = THREADS / 2;
+        x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
+        return Allreduce<OFFSET>::run(x, op);
+    }
+};
+
+template<>
+struct Allreduce<2> {
+template<typename T, typename Operator>
+static __device__ inline T run(T x, Operator &op) {
+    x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
+    return x;
+}
+};
diff --git a/sgl-kernel/csrc/memory/store.cu b/sgl-kernel/csrc/memory/store.cu
new file mode 100644
index 000000000..c6dd97ebd
--- /dev/null
+++ b/sgl-kernel/csrc/memory/store.cu
@@ -0,0 +1,147 @@
+#include <ATen/Dispatch.h>
+#include <ATen/core/TensorBody.h>
+#include <c10/cuda/CUDAStream.h>
+#include <c10/util/Exception.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace {
+
+using std::size_t;
+using std::uint64_t;
+
+// Each warp will process 256 bytes per loop iteration
+template <typename T>
+__global__ void store_kv_cache_256x1(
+    uint64_t* __restrict__ k_cache,
+    uint64_t* __restrict__ v_cache,
+    const T* __restrict__ out_loc,
+    const size_t length,
+    const uint64_t* __restrict__ k,
+    const uint64_t* __restrict__ v,
+    const size_t kv_cache_stride,
+    const size_t kv_input_stride,
+    const size_t num_items) {
+  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto warp_id = idx / 32;
+  const auto lane_id = idx % 32;
+  if (warp_id >= length) return;
+  const auto offset = out_loc[warp_id];
+  const auto k_dst = k_cache + offset * kv_cache_stride;
+  const auto v_dst = v_cache + offset * kv_cache_stride;
+  const auto k_src = k + warp_id * kv_input_stride;
+  const auto v_src = v + warp_id * kv_input_stride;
+  for (size_t i = 0; i < num_items; ++i) {
+    k_dst[lane_id + i * 32] = k_src[lane_id + i * 32];
+    v_dst[lane_id + i * 32] = v_src[lane_id + i * 32];
+  }
+}
+
+// Each warp will process 128 bytes per loop iteration
+template <typename T>
+__global__ void store_kv_cache_128x2(
+    uint64_t* __restrict__ k_cache,
+    uint64_t* __restrict__ v_cache,
+    const T* __restrict__ out_loc,
+    const size_t length,
+    const uint64_t* __restrict__ k,
+    const uint64_t* __restrict__ v,
+    const size_t kv_cache_stride,
+    const size_t kv_input_stride,
+    const size_t num_items) {
+  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto warp_id = idx / 32;
+  const auto lane_id = idx % 32;
+  if (warp_id >= length) return;
+  const auto offset = out_loc[warp_id];
+  const auto copy_k = lane_id < 16;
+  const auto copy_id = lane_id % 16;
+  const auto cache = copy_k ? k_cache : v_cache;
+  const auto input = copy_k ? k : v;
+  const auto dst = cache + offset * kv_cache_stride;
+  const auto src = input + warp_id * kv_input_stride;
+  for (size_t i = 0; i < num_items; ++i) {
+    dst[copy_id + i * 16] = src[copy_id + i * 16];
+  }
+}
+
+}  // namespace
+
+auto store_kv_cache(at::Tensor k_cache, at::Tensor v_cache, at::Tensor out_loc, at::Tensor k, at::Tensor v) -> void {
+  const auto max_tokens = k_cache.size(0);
+  const auto num_tokens = out_loc.size(0);
+  k_cache = k_cache.view({max_tokens, -1});
+  v_cache = v_cache.view({max_tokens, -1});
+  k = k.view({num_tokens, -1});
+  v = v.view({num_tokens, -1});
+
+  TORCH_CHECK(
+      k_cache.is_cuda() && v_cache.is_cuda() && out_loc.is_cuda() && k.is_cuda() && v.is_cuda(),
+      "All tensors must be CUDA tensors");
+  TORCH_CHECK(k_cache.sizes() == v_cache.sizes(), "k_cache and v_cache must have the same size");
+  TORCH_CHECK(k_cache.strides() == v_cache.strides(), "k_cache and v_cache must have the same strides");
+  TORCH_CHECK(k.sizes() == v.sizes(), "k and v must have the same size");
+  TORCH_CHECK(k.strides() == v.strides(), "k and v must have the same strides");
+  TORCH_CHECK(k.stride(-1) == 1 && k_cache.stride(-1) == 1, "k and k_cache must be contiguous in head.");
+  TORCH_CHECK(k.size(-1) == k_cache.size(-1), "k and k_cache must have the same head size");
+  TORCH_CHECK(out_loc.dim() == 1 && out_loc.is_contiguous(), "out_loc must be a 1D contiguous tensor");
+  static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes, our code assumes that");
+
+  const auto length = out_loc.size(0);
+  const auto elem_size = k.element_size();
+  const auto size_bytes = elem_size * k.size(-1);
+  const auto kv_cache_stride_bytes = elem_size * k_cache.stride(-2);
+  const auto kv_input_stride_bytes = elem_size * k.stride(-2);
+  const auto kv_cache_stride = kv_cache_stride_bytes / 8;
+  const auto kv_input_stride = kv_input_stride_bytes / 8;
+
+  const auto k_cache_ptr = static_cast<uint64_t*>(k_cache.data_ptr());
+  const auto v_cache_ptr = static_cast<uint64_t*>(v_cache.data_ptr());
+  const auto k_ptr = static_cast<const uint64_t*>(k.data_ptr());
+  const auto v_ptr = static_cast<const uint64_t*>(v.data_ptr());
+  const auto num_threads = 256;
+  const auto num_warps = num_threads / 32;
+  const auto num_blocks = (length + num_warps - 1) / num_warps;
+  const auto stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_INTEGRAL_TYPES(out_loc.scalar_type(), "store_kv_cache", [&] {
+    if constexpr (!std::is_same_v<scalar_t, int32_t> && !std::is_same_v<scalar_t, int64_t>) {
+      // do not instantiate the kernel if out_loc is not int32 or int64
+      TORCH_CHECK(false, "out_loc must be of type int32 or int64, got: ", out_loc.scalar_type());
+    } else {
+      if (size_bytes % 256 == 0) {
+        const auto items_per_warp = size_bytes / 256;
+        store_kv_cache_256x1<<<num_blocks, num_threads, 0, stream>>>(
+            k_cache_ptr,
+            v_cache_ptr,
+            out_loc.data_ptr<scalar_t>(),
+            length,
+            k_ptr,
+            v_ptr,
+            kv_cache_stride,
+            kv_input_stride,
+            items_per_warp);
+      } else if (size_bytes % 128 == 0) {
+        const auto items_per_warp = size_bytes / 128;
+        store_kv_cache_128x2<<<num_blocks, num_threads, 0, stream>>>(
+            k_cache_ptr,
+            v_cache_ptr,
+            out_loc.data_ptr<scalar_t>(),
+            length,
+            k_ptr,
+            v_ptr,
+            kv_cache_stride,
+            kv_input_stride,
+            items_per_warp);
+      } else {
+        TORCH_CHECK(
+            false,
+            "The last dimension size bytes of k and v must be"
+            " divisible by 128 at least, got: ",
+            size_bytes);
+      }
+    }
+  });
+}
diff --git a/sgl-kernel/csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu
new file mode 100644
index 000000000..28ed28c1e
--- /dev/null
+++ b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/scaled_mm_entry.cu
@@ -0,0 +1,91 @@
+#include <c10/cuda/CUDAGuard.h>
+#include <cudaTypedefs.h>
+#include <torch/all.h>
+
+int32_t get_sm_version_num() {
+  int32_t major_capability, minor_capability;
+  cudaDeviceGetAttribute(&major_capability, cudaDevAttrComputeCapabilityMajor, 0);
+  cudaDeviceGetAttribute(&minor_capability, cudaDevAttrComputeCapabilityMinor, 0);
+  int32_t version_num = major_capability * 10 + minor_capability;
+  return version_num;
+}
+
+void cutlass_w4a8_moe_mm_sm90(
+    torch::Tensor& d_tensors,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides,
+    torch::Tensor const& d_strides,
+    torch::Tensor const& s_strides,
+    int64_t chunk_size,
+    int64_t topk);
+
+void get_cutlass_w4a8_moe_mm_data_caller(
+    const torch::Tensor& topk_ids,
+    torch::Tensor& expert_offsets,
+    torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation,
+    torch::Tensor& output_permutation,
+    const int64_t num_experts,
+    const int64_t n,
+    const int64_t k);
+
+void cutlass_w4a8_moe_mm(
+    torch::Tensor& d_tensors,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides,
+    torch::Tensor const& d_strides,
+    torch::Tensor const& s_strides,
+    int64_t chunk_size,
+    int64_t topk) {
+  cutlass_w4a8_moe_mm_sm90(
+      d_tensors,
+      a_tensors,
+      b_tensors,
+      a_scales,
+      b_scales,
+      expert_offsets,
+      problem_sizes,
+      a_strides,
+      b_strides,
+      d_strides,
+      s_strides,
+      chunk_size,
+      topk);
+  return;
+}
+
+void get_cutlass_w4a8_moe_mm_data(
+    const torch::Tensor& topk_ids,
+    torch::Tensor& expert_offsets,
+    torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation,
+    torch::Tensor& output_permutation,
+    const int64_t num_experts,
+    const int64_t n,
+    const int64_t k) {
+  get_cutlass_w4a8_moe_mm_data_caller(
+      topk_ids,
+      expert_offsets,
+      problem_sizes1,
+      problem_sizes2,
+      input_permutation,
+      output_permutation,
+      num_experts,
+      n,
+      k);
+  return;
+}
diff --git a/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_get_group_starts.cuh b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_get_group_starts.cuh
new file mode 100644
index 000000000..8cd50c60c
--- /dev/null
+++ b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_get_group_starts.cuh
@@ -0,0 +1,92 @@
+#pragma once
+
+#include <c10/cuda/CUDAStream.h>
+#include <cuda.h>
+#include <torch/all.h>
+
+#include "cutlass/bfloat16.h"
+#include "cutlass/float8.h"
+
+template <typename ElementA, typename ElementB, typename ElementC, typename ElementAccumulator>
+__global__ void int4_fp8_get_group_gemm_starts(
+    int32_t* expert_offsets,
+    ElementA** a_offsets,
+    ElementB** b_offsets,
+    ElementC** out_offsets,
+    ElementAccumulator** a_scales_offsets,
+    cutlass::bfloat16_t** b_scales_offsets,
+    ElementA* a_base_as_int,
+    ElementB* b_base_as_int,
+    ElementC* out_base_as_int,
+    ElementAccumulator* a_scales_base_as_int,
+    cutlass::bfloat16_t* b_scales_base_as_int,
+    int64_t n,
+    int64_t k,
+    bool per_act_token,
+    bool per_out_ch) {
+  int expert_id = threadIdx.x;
+  int32_t expert_offset = expert_offsets[expert_id];
+
+  a_offsets[expert_id] = a_base_as_int + expert_offset * k;
+  b_offsets[expert_id] = b_base_as_int + expert_id * k * n / 2;
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  a_scales_offsets[expert_id] = a_scales_base_as_int + (per_act_token ? expert_offset : 0);
+  b_scales_offsets[expert_id] = b_scales_base_as_int + (per_out_ch ? expert_id * n * k / 128 : expert_id);
+}
+
+#define __CALL_W4A8_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE)                              \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                                        \
+    int4_fp8_get_group_gemm_starts<cutlass::float_e4m3_t, cutlass::int8_t, C_TYPE, float> \
+        <<<1, num_experts, 0, stream>>>(                                                  \
+            static_cast<int32_t*>(expert_offsets.data_ptr()),                             \
+            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),                      \
+            static_cast<cutlass::int8_t**>(b_ptrs.data_ptr()),                            \
+            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                                   \
+            static_cast<float**>(a_scales_ptrs.data_ptr()),                               \
+            static_cast<cutlass::bfloat16_t**>(b_scales_ptrs.data_ptr()),                 \
+            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),                    \
+            static_cast<cutlass::int8_t*>(b_tensors.data_ptr()),                          \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                                 \
+            static_cast<float*>(a_scales.data_ptr()),                                     \
+            static_cast<cutlass::bfloat16_t*>(b_scales.data_ptr()),                       \
+            out_tensors.size(1),                                                          \
+            a_tensors.size(1),                                                            \
+            per_act_token,                                                                \
+            per_out_ch);                                                                  \
+  }
+
+namespace {
+
+void run_int4_fp8_get_group_gemm_starts(
+    torch::Tensor const& expert_offsets,
+    torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs,
+    torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs,
+    torch::Tensor& b_scales_ptrs,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor& out_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales) {
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kInt8);
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kBFloat16);
+
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  bool per_act_token = a_scales.numel() != 1;
+  bool per_out_ch = b_scales.numel() != num_experts;
+
+  auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
+
+  if (false) {
+  }
+  __CALL_W4A8_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t)
+  __CALL_W4A8_GET_STARTS_KERNEL(torch::kFloat16, half)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+
+}  // namespace
diff --git a/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu
new file mode 100644
index 000000000..bd63d2ee1
--- /dev/null
+++ b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cu
@@ -0,0 +1,386 @@
+#include <c10/cuda/CUDAGuard.h>
+#include <cudaTypedefs.h>
+#include <torch/all.h>
+
+#include <type_traits>
+
+#include "cutlass/cutlass.h"
+#include "w4a8_grouped_mm_c3x.cuh"
+
+using namespace cute;
+
+namespace {
+
+enum class Sched { PP, CO };
+
+template <int M, int N, int K, int A, int B, int C, Sched S>
+struct SM90W4A8Config {
+  using KernelSchedule = std::conditional_t<
+      S == Sched::PP,
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpong,
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative>;
+
+  using EpilogueSchedule = std::conditional_t<
+      S == Sched::PP,
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong,
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative>;
+
+  using TileShape = cute::Shape<cute::Int<M>, cute::Int<N>, cute::Int<K>>;
+  using ClusterShape = cute::Shape<cute::Int<A>, cute::Int<B>, cute::Int<C>>;
+  using Cutlass3xW4A8Gemm = cutlass_3x_w4a8_group_gemm<TileShape, ClusterShape, KernelSchedule, EpilogueSchedule>;
+};
+
+template <int M, int N, int K, int A, int B, int C>
+using SM90_PP = SM90W4A8Config<M, N, K, A, B, C, Sched::PP>;
+
+template <int M, int N, int K, int A, int B, int C>
+using SM90_CO = SM90W4A8Config<M, N, K, A, B, C, Sched::CO>;
+
+template <typename Config>
+inline void invoke_gemm(
+    torch::Tensor& d_tensors,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides,
+    torch::Tensor const& d_strides,
+    torch::Tensor const& s_strides,
+    int64_t chunk_size) {
+  using GemmT = typename Config::Cutlass3xW4A8Gemm;
+  cutlass_w4a8_group_gemm_caller<GemmT>(
+      d_tensors,
+      a_tensors,
+      b_tensors,
+      a_scales,
+      b_scales,
+      expert_offsets,
+      problem_sizes,
+      a_strides,
+      b_strides,
+      d_strides,
+      s_strides,
+      chunk_size);
+}
+
+void dispatch_w4a8_moe_mm_sm90(
+    torch::Tensor& d_tensors,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides,
+    torch::Tensor const& d_strides,
+    torch::Tensor const& s_strides,
+    int64_t chunk_size,
+    int64_t topk) {
+  uint32_t const m = a_tensors.size(0) / topk;
+  uint32_t const n = d_tensors.size(1);
+  uint32_t const k = a_tensors.size(1);
+
+  if (n == 4096 && k == 7168) {
+    // group gemm 1
+    if (m <= 4) {
+      invoke_gemm<SM90_PP<64, 32, 512, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 16) {
+      invoke_gemm<SM90_CO<128, 16, 512, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 256) {
+      invoke_gemm<SM90_CO<128, 16, 512, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 1024) {
+      invoke_gemm<SM90_CO<128, 32, 512, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else {
+      invoke_gemm<SM90_CO<128, 64, 512, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    }
+  } else if (n == 7168 && k == 2048) {
+    // group gemm 2
+    if (m <= 8) {
+      invoke_gemm<SM90_PP<64, 16, 512, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 512) {
+      invoke_gemm<SM90_CO<128, 32, 512, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else {
+      invoke_gemm<SM90_CO<128, 64, 512, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    }
+  } else if (n == 512 && k == 7168) {
+    // group gemm 1 for tp
+    if (m <= 4) {
+      invoke_gemm<SM90_PP<64, 32, 512, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 16) {
+      invoke_gemm<SM90_CO<128, 16, 512, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 256) {
+      invoke_gemm<SM90_CO<128, 16, 512, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 1024) {
+      invoke_gemm<SM90_CO<128, 32, 512, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else {
+      invoke_gemm<SM90_CO<128, 64, 512, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    }
+  } else if (n == 7168 && k == 256) {
+    // group gemm 2 for tp
+    if (m <= 8) {
+      invoke_gemm<SM90_PP<64, 16, 128, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else if (m <= 512) {
+      invoke_gemm<SM90_PP<128, 32, 128, 2, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else {
+      invoke_gemm<SM90_PP<128, 64, 128, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    }
+  } else {
+    if (k % 512 == 0) {
+      invoke_gemm<SM90_CO<128, 32, 512, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    } else {
+      invoke_gemm<SM90_PP<128, 64, 128, 1, 1, 1>>(
+          d_tensors,
+          a_tensors,
+          b_tensors,
+          a_scales,
+          b_scales,
+          expert_offsets,
+          problem_sizes,
+          a_strides,
+          b_strides,
+          d_strides,
+          s_strides,
+          chunk_size);
+    }
+  }
+}
+
+}  // namespace
+
+void cutlass_w4a8_moe_mm_sm90(
+    torch::Tensor& d_tensors,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides,
+    torch::Tensor const& d_strides,
+    torch::Tensor const& s_strides,
+    int64_t chunk_size,
+    int64_t topk) {
+  dispatch_w4a8_moe_mm_sm90(
+      d_tensors,
+      a_tensors,
+      b_tensors,
+      a_scales,
+      b_scales,
+      expert_offsets,
+      problem_sizes,
+      a_strides,
+      b_strides,
+      d_strides,
+      s_strides,
+      chunk_size,
+      topk);
+}
diff --git a/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cuh b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cuh
new file mode 100644
index 000000000..d8b794997
--- /dev/null
+++ b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_grouped_mm_c3x.cuh
@@ -0,0 +1,277 @@
+#pragma once
+
+/**
+ * @file w4a8_grouped_mm_c3x.cuh
+ * @brief Implementation of grouped GEMM operation with int4 and fp8 mixed
+ * precision
+ *
+ * This file implements a grouped GEMM operation that multiplies FP8 matrices
+ * (A) with quantized INT4 matrices (B), applying per-block scaling factors.
+ * The implementation is optimized for NVIDIA Hopper GPUs, leveraging Tensor
+ * Cores for mixed precision arithmetic.
+ *
+ * Key features:
+ * - Supports grouped GEMM operations with multiple experts
+ * - Uses FP8 (e4m3) for matrix A
+ * - Uses INT4 quantization for matrix B with per-block scaling
+ * - Implements preprocessing for INT4 encoding and scale packing
+ * - Optimized for Hopper architecture with Tensor Core operations
+ */
+
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_fp8.h>
+#include <cuda_runtime.h>
+#include <torch/all.h>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp"
+#include "w4a8_get_group_starts.cuh"
+
+using namespace cute;
+
+namespace {
+
+// Type definitions
+using MmaType = cutlass::float_e4m3_t;     // FP8 e4m3 type
+using QuantType = cutlass::int4b_t;        // 4-bit integer type
+using ElementAccumulator = float;          // Accumulator type
+using ElementScale = cutlass::bfloat16_t;  // Scale type
+using ElementC = cutlass::bfloat16_t;      // Output type
+using ElementD = ElementC;                 // Output type
+using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
+
+// Architecture-specific configurations
+using ArchTag = cutlass::arch::Sm90;
+using OperatorClass = cutlass::arch::OpClassTensorOp;
+// constexpr int TileShapeK = 512;
+// using TileShape = Shape<_128, _32, cute::Int<TileShapeK>>;
+// using ClusterShape = Shape<_1, _1, _1>;
+
+// Layout configurations
+using LayoutA = cutlass::layout::RowMajor;
+using LayoutB = cutlass::layout::ColumnMajor;
+using LayoutC = cutlass::layout::RowMajor;
+using LayoutD = LayoutC;
+
+// Transposed layouts
+using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+using LayoutC_Transpose = typename cutlass::layout::LayoutTranspose<LayoutC>::type;
+using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
+
+// Alignments
+static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<MmaType>::value;
+static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<QuantType>::value;
+static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+template <typename TileShape, typename ClusterShape, typename KernelSchedule, typename EpilogueSchedule>
+struct cutlass_3x_w4a8_group_gemm {
+  static constexpr int GroupSize = 128;
+  static constexpr int PackedScalesNum = get<2>(TileShape{}) / GroupSize;
+  using ElementScalePacked = cutlass::Array<ElementScale, PackedScalesNum>;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      TileShape,
+      ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementAccumulator,
+      ElementC,
+      LayoutC_Transpose*,
+      AlignmentC,
+      ElementD,
+      LayoutD_Transpose*,
+      AlignmentD,
+      EpilogueSchedule>::CollectiveOp;
+
+  using CollectiveMainloopScaleOnly = typename cutlass::gemm::collective::CollectiveBuilderMixedInput<
+      ArchTag,
+      OperatorClass,
+      cute::tuple<QuantType, ElementScalePacked>,
+      LayoutB_Transpose*,
+      AlignmentB,
+      MmaType,
+      LayoutA_Transpose*,
+      AlignmentA,
+      ElementAccumulator,
+      TileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      KernelSchedule>::CollectiveOp;
+
+  // Define the final kernel and GEMM operation types
+  using GemmKernelScaleOnly =
+      cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloopScaleOnly, CollectiveEpilogue>;
+
+  using GemmScaleOnly = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelScaleOnly>;
+
+  using StrideA = cute::remove_pointer_t<cutlass::detail::TagToStrideA_t<LayoutA*>>;
+  using StrideB = cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
+  using StrideC = typename GemmKernelScaleOnly::InternalStrideC;
+  using StrideD = typename GemmKernelScaleOnly::InternalStrideD;
+  using StrideS = typename CollectiveMainloopScaleOnly::StrideScale;
+};
+
+/**
+ * @brief Main function to run int4 * fp8 grouped GEMM from PyTorch
+ *
+ * This function performs multiple GEMM operations in parallel where each
+ * operation multiplies an FP8 matrix (A) with a quantized INT4 matrix (B),
+ * applying per-channel scaling factors. It's designed for efficient execution
+ * on NVIDIA Hopper GPUs, leveraging Tensor Cores for optimal performance with
+ * mixed precision arithmetic.
+ *
+ * The function includes preprocessing steps for both INT4 tensors and scale
+ * factors to ensure optimal performance and correct operation.
+ *
+ * @param d_tensors Output tensor D with shape [total_m, total_n]
+ * @param a_tensors Tensor containing all A matrices (fp8_e4m3) with shape
+ * [total_m, K]
+ * @param b_tensors Tensor containing all B matrices (int4 packed as int8) with
+ * shape [E, N, K/2]
+ * @param a_scales Tensor containing A matrix scale factors
+ * @param b_scales Tensor containing B matrix scale factors with shape [E,
+ * K//512, N*4]
+ * @param expert_offsets Tensor containing expert offsets for determining group
+ * boundaries (int32)
+ * @param problem_sizes Tensor containing problem sizes with shape [num_experts,
+ * 3] (M, N, K for each group) (int32)
+ * @param a_strides Stride information for A tensors
+ * @param b_strides Stride information for B tensors
+ * @param d_strides Stride information for D tensors
+ * @param s_strides Stride information for scale tensors
+ * @param chunk_size Size of each chunk for scales (K / number of scale chunks)
+ */
+// template <typename TileShape, typename ClusterShape, typename KernelSchedule, typename EpilogueSchedule>
+template <typename Gemm>
+void cutlass_w4a8_group_gemm_caller(
+    torch::Tensor& d_tensors,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides,
+    torch::Tensor const& d_strides,
+    torch::Tensor const& s_strides,
+    int64_t chunk_size) {
+  //   using Gemm = cutlass_3x_w4a8_group_gemm<TileShape, ClusterShape, KernelSchedule, EpilogueSchedule>;
+  using Args = typename Gemm::GemmScaleOnly::Arguments;
+
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  bool per_act_token = a_scales.numel() != 1;
+  bool per_out_ch = b_scales.numel() != num_experts;
+
+  // Check inputs
+  TORCH_CHECK(a_tensors.dim() == 2, "A tensor must be 2D");
+  TORCH_CHECK(b_tensors.dim() == 3, "B tensor must be 3D [E, N, K/2]");
+  TORCH_CHECK(b_scales.dim() == 3, "Scale tensor must be 3D [E, K//512, N*4]");
+  TORCH_CHECK(a_scales.dim() == 1, "A Scale tensor must be 1D [1]");
+  TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be a 1D tensor");
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
+
+  // Check tensor shapes
+  TORCH_CHECK(problem_sizes.size(0) == num_experts, "problem_sizes must have num_experts rows");
+  TORCH_CHECK(problem_sizes.size(1) == 3, "problem_sizes must have 3 columns (N, M, K)");
+  TORCH_CHECK(b_tensors.size(0) == num_experts, "B tensor first dimension must match number of groups");
+  TORCH_CHECK(b_scales.size(0) == num_experts, "Scale tensor first dimension must match number of groups");
+  TORCH_CHECK(b_tensors.size(2) * 2 == a_tensors.size(1), "B tensor K/2 dimension must match A tensor K dimension");
+
+  // Check tensor types
+  TORCH_CHECK(a_tensors.scalar_type() == torch::kFloat8_e4m3fn, "A tensor must be fp8 (float_e4m3_t) type");
+  TORCH_CHECK(b_tensors.scalar_type() == torch::kInt8, "B tensor must contain packed int4 values (stored as int8)");
+  TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32, "Expert offsets must be int32 type");
+  TORCH_CHECK(problem_sizes.scalar_type() == torch::kInt32, "Problem sizes must be int32 type");
+
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+  auto options_int = torch::TensorOptions().dtype(torch::kInt64).device(a_tensors.device());
+
+  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = a_tensors.device().index();
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  Args arguments;
+  decltype(arguments.epilogue.thread) fusion_args;
+  fusion_args.alpha = 0;
+  fusion_args.beta = 0;
+  fusion_args.alpha_ptr = a_scales.data_ptr<float>();
+  ;
+  fusion_args.beta_ptr = nullptr;
+  fusion_args.alpha_ptr_array = nullptr;
+  fusion_args.beta_ptr_array = nullptr;
+  fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 0};
+  fusion_args.dBeta = {cute::_0{}, cute::_0{}, 0};
+
+  ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes =
+      static_cast<ProblemShape::UnderlyingProblemShape*>(problem_sizes.data_ptr());
+
+  run_int4_fp8_get_group_gemm_starts(
+      expert_offsets,
+      a_ptrs,
+      b_ptrs,
+      out_ptrs,
+      a_scales_ptrs,
+      b_scales_ptrs,
+      a_tensors,
+      b_tensors,
+      d_tensors,
+      a_scales,
+      b_scales);
+
+  arguments = Args{
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, problem_sizes_as_shapes, nullptr},
+      {static_cast<const QuantType**>(b_ptrs.data_ptr()),
+       static_cast<typename Gemm::StrideB*>(b_strides.data_ptr()),
+       static_cast<const MmaType**>(a_ptrs.data_ptr()),
+       static_cast<typename Gemm::StrideA*>(a_strides.data_ptr()),
+       static_cast<const typename Gemm::ElementScalePacked**>(b_scales_ptrs.data_ptr()),
+       static_cast<typename Gemm::StrideS*>(s_strides.data_ptr()),
+       static_cast<int>(chunk_size)},
+      {fusion_args,
+       nullptr,
+       nullptr,
+       static_cast<ElementD**>(out_ptrs.data_ptr()),
+       static_cast<typename Gemm::StrideD*>(d_strides.data_ptr())},
+      hw_info};
+
+  // Instantiate and run GEMM
+  typename Gemm::GemmScaleOnly gemm;
+  size_t workspace_size = Gemm::GemmScaleOnly::get_workspace_size(arguments);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a_tensors.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+
+  cutlass::Status status = gemm.can_implement(arguments);
+  if (status != cutlass::Status::kSuccess) {
+    TORCH_CHECK(false, "GEMM implementation not supported");
+  }
+
+  status = gemm.initialize(arguments, workspace.data_ptr(), stream);
+  if (status != cutlass::Status::kSuccess) {
+    TORCH_CHECK(false, "GEMM initialization failed");
+  }
+
+  status = gemm.run(stream);
+  if (status != cutlass::Status::kSuccess) {
+    TORCH_CHECK(false, "GEMM execution failed");
+  }
+}
+
+}  // namespace
diff --git a/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu
new file mode 100644
index 000000000..f30f7b025
--- /dev/null
+++ b/sgl-kernel/csrc/moe/cutlass_moe/w4a8/w4a8_moe_data.cu
@@ -0,0 +1,79 @@
+#include <c10/cuda/CUDAGuard.h>
+#include <cudaTypedefs.h>
+#include <torch/all.h>
+
+#include <iostream>
+
+constexpr uint64_t THREADS_PER_EXPERT = 512;
+
+__global__ void compute_problem_sizes_w4a8(
+    const int32_t* __restrict__ topk_ids,
+    int32_t* problem_sizes1,
+    int32_t* problem_sizes2,
+    int32_t* atomic_buffer,
+    const int topk_length,
+    const int n,
+    const int k) {
+  int expert_id = blockIdx.x;
+
+  int occurrences = 0;
+  for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
+    occurrences += (topk_ids[i] == expert_id);
+  }
+  atomicAdd(&atomic_buffer[expert_id], occurrences);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    int final_occurrences = atomic_buffer[expert_id];
+    problem_sizes1[expert_id * 3] = 2 * n;
+    problem_sizes1[expert_id * 3 + 1] = final_occurrences;
+    problem_sizes1[expert_id * 3 + 2] = k;
+    problem_sizes2[expert_id * 3] = k;
+    problem_sizes2[expert_id * 3 + 1] = final_occurrences;
+    problem_sizes2[expert_id * 3 + 2] = n;
+  }
+}
+
+__global__ void compute_expert_offsets_w4a8(
+    const int32_t* __restrict__ problem_sizes1,
+    int32_t* expert_offsets,
+    int32_t* atomic_buffer,
+    const int num_experts) {
+  int32_t tot_offset = 0;
+  expert_offsets[0] = 0;
+  for (int i = 0; i < num_experts; ++i) {
+    atomic_buffer[i] = tot_offset;
+    tot_offset += problem_sizes1[i * 3 + 1];
+    expert_offsets[i + 1] = tot_offset;
+  }
+}
+
+void get_cutlass_w4a8_moe_mm_data_caller(
+    const torch::Tensor& topk_ids,
+    torch::Tensor& expert_offsets,
+    torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation,
+    torch::Tensor& output_permutation,
+    const int64_t num_experts,
+    const int64_t n,
+    const int64_t k) {
+  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
+  auto options_int32 = torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
+  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
+
+  int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
+  compute_problem_sizes_w4a8<<<num_experts, num_threads, 0, stream>>>(
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(problem_sizes2.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()),
+      topk_ids.numel(),
+      n,
+      k);
+  compute_expert_offsets_w4a8<<<1, 1, 0, stream>>>(
+      static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(expert_offsets.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()),
+      num_experts);
+}
diff --git a/sgl-kernel/csrc/moe/cutlass_moe_helper.cu b/sgl-kernel/csrc/moe/cutlass_moe_helper.cu
new file mode 100644
index 000000000..576ad233b
--- /dev/null
+++ b/sgl-kernel/csrc/moe/cutlass_moe_helper.cu
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <c10/cuda/CUDAStream.h>
+#include <cuda.h>
+#include <torch/all.h>
+
+#include "cutlass/bfloat16.h"
+#include "cutlass/float8.h"
+
+template <
+    typename ElementAB,
+    typename ElementC,
+    typename ElementAccumulator,
+    typename LayoutSFA,
+    typename LayoutSFB,
+    typename ScaleConfig>
+__global__ void get_group_gemm_starts(
+    int32_t* expert_offsets,
+    ElementAB** a_offsets,
+    ElementAB** b_offsets,
+    ElementC** out_offsets,
+    ElementAccumulator** a_scales_offsets,
+    ElementAccumulator** b_scales_offsets,
+    ElementAB* a_base_as_int,
+    ElementAB* b_base_as_int,
+    ElementC* out_base_as_int,
+    ElementAccumulator* a_scales_base_as_int,
+    ElementAccumulator* b_scales_base_as_int,
+    LayoutSFA* layout_sfa_base_as_int,
+    LayoutSFB* layout_sfb_base_as_int,
+    int* problem_sizes,
+    int* problem_sizes_transpose,
+    bool transpose = false) {
+  int64_t expert_id = static_cast<int64_t>(threadIdx.x);
+
+  if (expert_id >= gridDim.x * blockDim.x) {
+    return;
+  }
+
+  int m = problem_sizes[expert_id * 3];
+  int n = problem_sizes[expert_id * 3 + 1];
+  int k = problem_sizes[expert_id * 3 + 2];
+  if (transpose) {
+    problem_sizes_transpose[expert_id * 3] = n;
+    problem_sizes_transpose[expert_id * 3 + 1] = m;
+    problem_sizes_transpose[expert_id * 3 + 2] = k;
+  }
+
+  int64_t expert_offset = static_cast<int64_t>(expert_offsets[expert_id]);
+  int64_t a_stride = 0;
+  int64_t b_stride = 0;
+  int64_t a_scale_stride = 0;
+  int64_t b_scale_stride = 0;
+  if (!transpose) {
+    a_stride = expert_offset * k;
+    b_stride = expert_id * k * n;
+    a_scale_stride = expert_offset * k / 128;
+    b_scale_stride = expert_id * k * n / 128 / 128;
+  } else {
+    a_stride = expert_id * k * n;
+    b_stride = expert_offset * k;
+    a_scale_stride = expert_id * k * n / 128 / 128;
+    b_scale_stride = expert_offset * k / 128;
+  }
+  a_offsets[expert_id] = a_base_as_int + a_stride;
+  b_offsets[expert_id] = b_base_as_int + b_stride;
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  a_scales_offsets[expert_id] = a_scales_base_as_int + a_scale_stride;
+  b_scales_offsets[expert_id] = b_scales_base_as_int + b_scale_stride;
+
+  LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
+  LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
+
+  if (!transpose) {
+    *layout_sfa_ptr = ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(m, n, k, 1));
+    *layout_sfb_ptr = ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(m, n, k, 1));
+  } else {
+    *layout_sfa_ptr = ScaleConfig::tile_atom_to_shape_SFA(cute::make_shape(n, m, k, 1));
+    *layout_sfb_ptr = ScaleConfig::tile_atom_to_shape_SFB(cute::make_shape(n, m, k, 1));
+  }
+}
+
+#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, ScaleConfig)         \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                                                 \
+    get_group_gemm_starts<cutlass::float_e4m3_t, C_TYPE, float, LayoutSFA, LayoutSFB, ScaleConfig> \
+        <<<1, num_experts, 0, stream>>>(                                                           \
+            static_cast<int32_t*>(expert_offsets.data_ptr()),                                      \
+            static_cast<cutlass::float_e4m3_t**>(a_ptrs.data_ptr()),                               \
+            static_cast<cutlass::float_e4m3_t**>(b_ptrs.data_ptr()),                               \
+            static_cast<C_TYPE**>(out_ptrs.data_ptr()),                                            \
+            static_cast<float**>(a_scales_ptrs.data_ptr()),                                        \
+            static_cast<float**>(b_scales_ptrs.data_ptr()),                                        \
+            static_cast<cutlass::float_e4m3_t*>(a_tensors.data_ptr()),                             \
+            static_cast<cutlass::float_e4m3_t*>(b_tensors.data_ptr()),                             \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                                          \
+            static_cast<float*>(a_scales.data_ptr()),                                              \
+            static_cast<float*>(b_scales.data_ptr()),                                              \
+            reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),                                   \
+            reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()),                                   \
+            static_cast<int*>(problem_sizes.data_ptr()),                                           \
+            static_cast<int*>(problem_sizes_transpose.data_ptr()),                                 \
+            transpose);                                                                            \
+  }
+
+namespace {
+template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
+void run_get_group_gemm_starts(
+    torch::Tensor const& expert_offsets,
+    torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs,
+    torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs,
+    torch::Tensor& b_scales_ptrs,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor& out_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& layout_sfa,
+    torch::Tensor const& layout_sfb,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor& problem_sizes_transpose,
+    bool transpose = false) {
+  TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b_tensors.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(out_tensors.size(1) % 128 == 0 or out_tensors.size(0) % 128 == 0);
+  TORCH_CHECK(a_tensors.size(1) % 128 == 0 or a_tensors.size(0) % 128 == 0);
+
+  int num_experts = (int)expert_offsets.size(0);
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  if (false) {
+  }
+  __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t, LayoutSFA, LayoutSFB, ScaleConfig)
+  __CALL_GET_STARTS_KERNEL(torch::kFloat16, half, LayoutSFA, LayoutSFB, ScaleConfig)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+}  // namespace
diff --git a/sgl-kernel/csrc/moe/fp8_blockwise_moe_kernel.cu b/sgl-kernel/csrc/moe/fp8_blockwise_moe_kernel.cu
new file mode 100644
index 000000000..b2e1fc83c
--- /dev/null
+++ b/sgl-kernel/csrc/moe/fp8_blockwise_moe_kernel.cu
@@ -0,0 +1,810 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cutlass/arch/arch.h>
+#include <torch/all.h>
+
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/gemm/kernel/tile_scheduler_params.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass_moe_helper.cu"
+#include "utils.h"
+
+using namespace cute;
+
+using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
+
+template <typename OutType, typename ScheduleConfig, typename LayoutD>
+void launch_sm90_fp8_blockwise_scaled_group_mm(
+    torch::Tensor& out_ptrs,
+    const torch::Tensor& a_ptrs,
+    const torch::Tensor& b_ptrs,
+    const torch::Tensor& a_scales_ptrs,
+    const torch::Tensor& b_scales_ptrs,
+    const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b,
+    const torch::Tensor& stride_c,
+    const torch::Tensor& layout_sfa,
+    const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& workspace) {
+  using ElementA = cutlass::float_e4m3_t;
+  using ElementB = cutlass::float_e4m3_t;
+  using ElementC = void;
+  using ElementD = OutType;
+  using ElementAccumulator = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = LayoutD;
+
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  using ArchTag = cutlass::arch::Sm90;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+  static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+  using CustomEVTIdentity =  // acc
+      cutlass::epilogue::fusion::Sm90EVT<
+          cutlass::epilogue::fusion::
+              Sm90Compute<cutlass::epilogue::thread::Identity, ElementD, ElementAccumulator, RoundStyle>,
+          cutlass::epilogue::fusion::Sm90AccFetch>;
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      typename ScheduleConfig::MmaTileShape,
+      typename ScheduleConfig::ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementAccumulator,
+      ElementC,  // Use void to avoid load Matrix C
+      LayoutC*,
+      AlignmentC,
+      ElementD,
+      LayoutC*,
+      AlignmentC,
+      typename ScheduleConfig::EpilogueSchedule,
+      CustomEVTIdentity>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
+      AlignmentB,
+      ElementAccumulator,
+      typename ScheduleConfig::MmaTileShape,
+      typename ScheduleConfig::ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      typename ScheduleConfig::KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue, void>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
+  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
+  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
+  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
+  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
+
+  int num_experts = (int)expert_offsets.size(0);
+  Gemm gemm_op;
+
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementA**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(stride_a.data_ptr()),
+      static_cast<const ElementB**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(stride_b.data_ptr()),
+      static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
+      reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(layout_sfa.data_ptr()),
+      static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
+      reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(layout_sfb.data_ptr())};
+
+  cutlass::KernelHardwareInfo hw_info;
+  hw_info.device_id = c10::cuda::current_device();
+  hw_info.sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {},
+      nullptr,
+      static_cast<StrideC*>(stride_c.data_ptr()),
+      static_cast<ElementD**>(out_ptrs.data_ptr()),
+      static_cast<StrideC*>(stride_c.data_ptr())};
+
+  UnderlyingProblemShape* problem_sizes_as_shapes = static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, problem_sizes_as_shapes, nullptr},
+      mainloop_args,
+      epilogue_args,
+      hw_info};
+
+  at::cuda::CUDAGuard device_guard{(char)a_ptrs.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
+
+  auto can_implement_status = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, "Failed to implement GEMM");
+
+  auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
+
+  status = gemm_op.run(stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
+}
+
+template <typename OutType, typename ScheduleConfig, typename LayoutD>
+void launch_sm100_fp8_blockwise_scaled_group_mm(
+    torch::Tensor& out_ptrs,
+    const torch::Tensor& a_ptrs,
+    const torch::Tensor& b_ptrs,
+    const torch::Tensor& a_scales_ptrs,
+    const torch::Tensor& b_scales_ptrs,
+    const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b,
+    const torch::Tensor& stride_c,
+    const torch::Tensor& layout_sfa,
+    const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& workspace) {
+  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int, int, int>>;
+  using ElementA = cutlass::float_e4m3_t;
+  using ElementB = cutlass::float_e4m3_t;
+  using ElementC = OutType;
+  using ElementD = ElementC;
+  using ElementAccumulator = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = LayoutD;
+
+  static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
+  static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+
+  using ArchTag = cutlass::arch::Sm100;
+  using OperatorClass = cutlass::arch::OpClassTensorOp;
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      typename ScheduleConfig::MmaTileShape,
+      typename ScheduleConfig::ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      ElementAccumulator,
+      ElementAccumulator,
+      void,
+      LayoutC*,
+      AlignmentC,
+      ElementD,
+      LayoutC*,
+      AlignmentC,
+      typename ScheduleConfig::EpilogueSchedule>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      OperatorClass,
+      ElementA,
+      cute::tuple<LayoutA*, typename ScheduleConfig::LayoutSFA*>,
+      AlignmentA,
+      ElementB,
+      cute::tuple<LayoutB*, typename ScheduleConfig::LayoutSFB*>,
+      AlignmentB,
+      ElementAccumulator,
+      typename ScheduleConfig::MmaTileShape,
+      typename ScheduleConfig::ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      typename ScheduleConfig::KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue, void>;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
+  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
+  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
+  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
+  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
+
+  int num_experts = (int)expert_offsets.size(0);
+  // Create an instance of the GEMM
+  Gemm gemm_op;
+
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementA**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(stride_a.data_ptr()),
+      static_cast<const ElementB**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(stride_b.data_ptr()),
+      static_cast<const ElementAccumulator**>(a_scales_ptrs.data_ptr()),
+      reinterpret_cast<typename ScheduleConfig::LayoutSFA*>(layout_sfa.data_ptr()),
+      static_cast<const ElementAccumulator**>(b_scales_ptrs.data_ptr()),
+      reinterpret_cast<typename ScheduleConfig::LayoutSFB*>(layout_sfb.data_ptr())};
+
+  cutlass::KernelHardwareInfo hw_info;
+
+  hw_info.device_id = 0;
+  // sm_count is the number of SMs on the current device, since we only support SM100 blackwell, so we set it to 148
+  hw_info.sm_count = 148;
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {},
+      nullptr,
+      static_cast<StrideC*>(stride_c.data_ptr()),
+      static_cast<ElementD**>(out_ptrs.data_ptr()),
+      static_cast<StrideC*>(stride_c.data_ptr())};
+
+  UnderlyingProblemShape* problem_sizes_as_shapes = static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, problem_sizes_as_shapes, nullptr},
+      mainloop_args,
+      epilogue_args,
+      hw_info};
+
+  at::cuda::CUDAGuard device_guard{(char)a_ptrs.get_device()};
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a_ptrs.get_device());
+
+  auto can_implement_status = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, "Failed to implement GEMM");
+
+  auto status = gemm_op.initialize(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
+
+  status = gemm_op.run(stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
+}
+
+template <typename OutType>
+void sm100_fp8_blockwise_group_mm_dispatch_shape(
+    torch::Tensor& output,
+    torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs,
+    torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs,
+    torch::Tensor& b_scales_ptrs,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b,
+    const torch::Tensor& stride_c,
+    const torch::Tensor& layout_sfa,
+    const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& workspace) {
+  // Check the first matrix size to decide on the configuration
+  // Assuming all matrices in the group have similar size characteristics
+  // bool use_small_config = a[0].size(0) <= 128;
+  struct MmaConfig1 {
+    using ElementA = cutlass::float_e4m3_t;
+    using MmaTileShape = Shape<_256, _32, _128>;
+    using ClusterShape = Shape<_2, _1, _1>;  // Layout type for SFB matrix operand
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise2SmSm100;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm;
+    using ScaleConfig =
+        cutlass::detail::Sm100BlockwiseScaleConfig<128, 1, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+  };
+  struct MmaConfig2 {
+    using ElementA = cutlass::float_e4m3_t;
+    using MmaTileShape = Shape<_128, _128, _128>;
+    using ClusterShape = Shape<_1, _1, _1>;  // Layout type for SFB matrix operand
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
+    using ScaleConfig =
+        cutlass::detail::Sm100BlockwiseScaleConfig<1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+  };
+  struct MmaConfig3 {
+    using ElementA = cutlass::float_e4m3_t;
+    using MmaTileShape = Shape<_64, _128, _128>;
+    using ClusterShape = Shape<_1, _1, _1>;  // Layout type for SFB matrix operand
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;
+    using ScaleConfig =
+        cutlass::detail::Sm100BlockwiseScaleConfig<1, 128, 128, cute::UMMA::Major::K, cute::UMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+  };
+  int num_experts = (int)expert_offsets.size(0);
+  torch::TensorOptions options_int = torch::TensorOptions().dtype(torch::kInt64).device(a.device());
+  torch::Tensor problem_sizes_transpose = torch::empty(num_experts * 3, options_int);
+  torch::Tensor output_t = output.t();
+  torch::Tensor a_t = a.t();
+  torch::Tensor b_t = b.transpose(1, 2);
+  torch::Tensor scales_a_t = scales_a.t();
+  torch::Tensor scales_b_t = scales_b.transpose(1, 2);
+
+  if (a.size(0) <= 2048 && a.size(1) >= 2048) {
+    run_get_group_gemm_starts<MmaConfig1::LayoutSFA, MmaConfig1::LayoutSFB, MmaConfig1::ScaleConfig>(
+        expert_offsets,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        b_t,
+        a_t,
+        output_t,
+        scales_b_t,
+        scales_a_t,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        problem_sizes_transpose,
+        true);
+    launch_sm100_fp8_blockwise_scaled_group_mm<OutType, MmaConfig1, cutlass::layout::ColumnMajor>(
+        out_ptrs,
+        a_ptrs,
+        b_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        stride_a,
+        stride_b,
+        stride_c,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes_transpose,
+        expert_offsets,
+        workspace);
+    output = output_t.t();
+  } else if (a.size(0) > 2048 && a.size(1) >= 2048) {
+    run_get_group_gemm_starts<MmaConfig2::LayoutSFA, MmaConfig2::LayoutSFB, MmaConfig2::ScaleConfig>(
+        expert_offsets,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        a,
+        b,
+        output,
+        scales_a,
+        scales_b,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        problem_sizes_transpose);
+    launch_sm100_fp8_blockwise_scaled_group_mm<OutType, MmaConfig2, cutlass::layout::RowMajor>(
+        out_ptrs,
+        a_ptrs,
+        b_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        stride_a,
+        stride_b,
+        stride_c,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        expert_offsets,
+        workspace);
+  } else {
+    run_get_group_gemm_starts<MmaConfig3::LayoutSFA, MmaConfig3::LayoutSFB, MmaConfig3::ScaleConfig>(
+        expert_offsets,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        a,
+        b,
+        output,
+        scales_a,
+        scales_b,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        problem_sizes_transpose);
+    launch_sm100_fp8_blockwise_scaled_group_mm<OutType, MmaConfig3, cutlass::layout::RowMajor>(
+        out_ptrs,
+        a_ptrs,
+        b_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        stride_a,
+        stride_b,
+        stride_c,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        expert_offsets,
+        workspace);
+  }
+}
+
+template <typename OutType>
+void sm90_fp8_blockwise_group_mm_dispatch_shape(
+    torch::Tensor& output,
+    torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs,
+    torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs,
+    torch::Tensor& b_scales_ptrs,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b,
+    const torch::Tensor& stride_c,
+    const torch::Tensor& layout_sfa,
+    const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& workspace) {
+  struct MmaConfigSmallM {
+    // Swap A/B
+    using ElementA = cutlass::float_e4m3_t;
+    using MmaTileShape = Shape<_128, _32, _128>;
+    using ClusterShape = Shape<_2, _1, _1>;
+    // TODO: Check Pingpong or Cooperative
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+    using ScaleConfig =
+        cutlass::detail::Sm90BlockwiseScaleConfig<128, 1, 128, cute::GMMA::Major::K, cute::GMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+  };
+
+  struct MmaConfigH20LargeK {
+    using ElementA = cutlass::float_e4m3_t;
+    using MmaTileShape = Shape<_64, _128, _128>;
+    using ClusterShape = Shape<_2, _1, _1>;
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8BlockScaledAccum;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+    using ScaleConfig =
+        cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128, cute::GMMA::Major::K, cute::GMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+  };
+
+  struct MmaConfigHx00AndH20SmallK {
+    using ElementA = cutlass::float_e4m3_t;
+    using MmaTileShape = Shape<_128, _128, _128>;
+    using ClusterShape = Shape<_1, _2, _1>;
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8BlockScaledAccum;
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative;
+    using ScaleConfig =
+        cutlass::detail::Sm90BlockwiseScaleConfig<1, 128, 128, cute::GMMA::Major::K, cute::GMMA::Major::K>;
+    using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
+    using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
+  };
+
+  int num_experts = (int)expert_offsets.size(0);
+  torch::TensorOptions options_int = torch::TensorOptions().dtype(torch::kInt64).device(a.device());
+  torch::Tensor problem_sizes_transpose = torch::empty(num_experts * 3, options_int);
+  torch::Tensor output_t = output.t();
+  torch::Tensor a_t = a.t();
+  torch::Tensor b_t = b.transpose(1, 2);
+  torch::Tensor scales_a_t = scales_a.t();
+  torch::Tensor scales_b_t = scales_b.transpose(1, 2);
+
+  const std::string H20_device_type_str("NVIDIA H20");
+  bool is_h20_device = std::string(at::cuda::getCurrentDeviceProperties()->name) == H20_device_type_str;
+
+  if (a.size(0) <= 2048) {
+    run_get_group_gemm_starts<MmaConfigSmallM::LayoutSFA, MmaConfigSmallM::LayoutSFB, MmaConfigSmallM::ScaleConfig>(
+        expert_offsets,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        b_t,
+        a_t,
+        output_t,
+        scales_b_t,
+        scales_a_t,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        problem_sizes_transpose,
+        true);
+    launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfigSmallM, cutlass::layout::ColumnMajor>(
+        out_ptrs,
+        a_ptrs,
+        b_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        stride_a,
+        stride_b,
+        stride_c,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes_transpose,
+        expert_offsets,
+        workspace);
+    output = output_t.t();
+  } else {
+    if (is_h20_device && a.size(1) > 128) {
+      // For H20 with K > 128, use Pingpong Schedule
+      run_get_group_gemm_starts<
+          MmaConfigH20LargeK::LayoutSFA,
+          MmaConfigH20LargeK::LayoutSFB,
+          MmaConfigH20LargeK::ScaleConfig>(
+          expert_offsets,
+          a_ptrs,
+          b_ptrs,
+          out_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          a,
+          b,
+          output,
+          scales_a,
+          scales_b,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          problem_sizes_transpose);
+      launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfigH20LargeK, cutlass::layout::RowMajor>(
+          out_ptrs,
+          a_ptrs,
+          b_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          stride_a,
+          stride_b,
+          stride_c,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          expert_offsets,
+          workspace);
+    } else {
+      // For H20 with K <= 128, and H100 & H200 & H800, use Cooperative Schedule
+      run_get_group_gemm_starts<
+          MmaConfigHx00AndH20SmallK::LayoutSFA,
+          MmaConfigHx00AndH20SmallK::LayoutSFB,
+          MmaConfigHx00AndH20SmallK::ScaleConfig>(
+          expert_offsets,
+          a_ptrs,
+          b_ptrs,
+          out_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          a,
+          b,
+          output,
+          scales_a,
+          scales_b,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          problem_sizes_transpose);
+      launch_sm90_fp8_blockwise_scaled_group_mm<OutType, MmaConfigHx00AndH20SmallK, cutlass::layout::RowMajor>(
+          out_ptrs,
+          a_ptrs,
+          b_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          stride_a,
+          stride_b,
+          stride_c,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          expert_offsets,
+          workspace);
+    }
+  }
+}
+
+/**
+ * @brief Performs blockwise grouped matrix multiplication on FP8 quantized inputs,
+ *        with per-block scaling.
+ *
+ * This function dispatches to hardware-specific implementations (e.g., SM100 FP8)
+ * to compute:
+ *     C_i = scale_a[i] * A_i * scale_b[i] * B_i
+ * for each expert group `i`, using input `problem_sizes` and `expert_offsets`
+ * to describe the individual matrix dimensions and their offsets.
+ *
+ * Input tensors A and B must be quantized to 8-bit formats and dequantized before multiplication.
+ * The output tensor is written with bfloat16 or half precision.
+ *
+ * @param output         Output tensor (must be of type bfloat16 or half).
+ * @param a              Input tensor A (must be kFloat8_e4m3fn).
+ * @param b              Input tensor B (must be kFloat8_e4m3fn).
+ * @param scales_a       Scaling factors for tensor A, float32 per expert group.
+ * @param scales_b       Scaling factors for tensor B, float32 per expert group.
+ * @param stride_a       Stride information for tensor A (int32).
+ * @param stride_b       Stride information for tensor B (int32).
+ * @param stride_c       Stride information for output tensor C (int32).
+ * @param layout_sfa     Layout descriptor for A (int32), e.g., row-major/column-major.
+ * @param layout_sfb     Layout descriptor for B (int32).
+ * @param problem_sizes  2D int32 tensor of shape (num_experts, 3), specifying (M, N, K)
+ *                       for each grouped matrix multiplication problem.
+ * @param expert_offsets 1D int32 tensor of size (num_experts), used to index into
+ *                       the grouped input tensors for dispatch.
+ *  @note Performance Optimization:
+ *       If the batch size (a.size(0)) is smaller than 512, the implementation
+ *       will internally transpose input matrices to align with the optimal memory access
+ *       pattern for better GPU efficiency. This transformation is done within the kernel.
+ */
+void fp8_blockwise_scaled_grouped_mm(
+    torch::Tensor& output,
+    torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs,
+    torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs,
+    torch::Tensor& b_scales_ptrs,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b,
+    const torch::Tensor& stride_c,
+    const torch::Tensor& layout_sfa,
+    const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& workspace) {
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be 2D tensor");
+  TORCH_CHECK(problem_sizes.size(1) == 3, "problem_sizes must have shape (num_experts, 3)");
+  TORCH_CHECK(
+      problem_sizes.size(0) == expert_offsets.size(0), "Number of experts in problem_sizes must match expert_offsets");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, "problem_sizes must be int32");
+  TORCH_CHECK(a.scalar_type() == torch::kFloat8_e4m3fn, "a must be kFloat8_e4m3fn");
+  TORCH_CHECK(b.scalar_type() == torch::kFloat8_e4m3fn, "b must be kFloat8_e4m3fn");
+  TORCH_CHECK(
+      output.scalar_type() == torch::kBFloat16 || output.scalar_type() == torch::kHalf,
+      "output must be bfloat16 or half");
+  TORCH_CHECK(scales_a.scalar_type() == torch::kFloat32, "scales_a must be float32");
+  TORCH_CHECK(scales_b.scalar_type() == torch::kFloat32, "scales_b must be float32");
+  TORCH_CHECK(stride_a.scalar_type() == torch::kInt64, "stride_a must be int64");
+  TORCH_CHECK(stride_b.scalar_type() == torch::kInt64, "stride_b must be int64");
+  TORCH_CHECK(stride_c.scalar_type() == torch::kInt64, "stride_c must be int64");
+  TORCH_CHECK(layout_sfa.scalar_type() == torch::kInt32, "layout_sfa must be int32");
+  TORCH_CHECK(layout_sfb.scalar_type() == torch::kInt32, "layout_sfb must be int32");
+  TORCH_CHECK(expert_offsets.scalar_type() == torch::kInt32, "expert_offsets must be int32");
+
+  TORCH_CHECK(output.dim() == 2, "output must be 2D tensor");
+  TORCH_CHECK(a.dim() == 2, "a must be 2D tensor");
+  TORCH_CHECK(b.dim() == 3, "b must be 3D tensor");
+  TORCH_CHECK(scales_a.dim() == 2, "scales_a must be 2D tensor");
+  TORCH_CHECK(scales_b.dim() == 3, "scales_b must be 3D tensor");
+  TORCH_CHECK(stride_a.dim() == 1, "stride_a must be 1D tensor");
+  TORCH_CHECK(stride_b.dim() == 1, "stride_b must be 1D tensor");
+  TORCH_CHECK(stride_c.dim() == 1, "stride_c must be 1D tensor");
+  TORCH_CHECK(layout_sfa.dim() == 2, "layout_sfa must be 1D tensor");
+  TORCH_CHECK(layout_sfb.dim() == 2, "layout_sfb must be 1D tensor");
+  TORCH_CHECK(a_ptrs.dim() == 1, "a_ptrs must be 1D tensor");
+  TORCH_CHECK(b_ptrs.dim() == 1, "b_ptrs must be 1D tensor");
+  TORCH_CHECK(out_ptrs.dim() == 1, "out_ptrs must be 1D tensor");
+  TORCH_CHECK(a_scales_ptrs.dim() == 1, "a_scales_ptrs must be 1D tensor");
+  TORCH_CHECK(b_scales_ptrs.dim() == 1, "b_scales_ptrs must be 1D tensor");
+  TORCH_CHECK(expert_offsets.dim() == 1, "expert_offsets must be 1D tensor");
+  TORCH_CHECK(workspace.dim() == 1, "workspace must be 1D tensor");
+
+  bool can_implement = false;
+  auto sm_version = getSMVersion();
+
+#if defined(CUTLASS_ARCH_MMA_SM100A_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
+#if defined CUDA_VERSION && CUDA_VERSION >= 12080
+  if (sm_version == 100
+#if CUDA_VERSION >= 12090
+      || sm_version == 103
+#endif
+  ) {
+    if (output.scalar_type() == torch::kBFloat16) {
+      sm100_fp8_blockwise_group_mm_dispatch_shape<cutlass::bfloat16_t>(
+          output,
+          a_ptrs,
+          b_ptrs,
+          out_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          a,
+          b,
+          scales_a,
+          scales_b,
+          stride_a,
+          stride_b,
+          stride_c,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          expert_offsets,
+          workspace);
+    } else {
+      sm100_fp8_blockwise_group_mm_dispatch_shape<cutlass::half_t>(
+          output,
+          a_ptrs,
+          b_ptrs,
+          out_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          a,
+          b,
+          scales_a,
+          scales_b,
+          stride_a,
+          stride_b,
+          stride_c,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          expert_offsets,
+          workspace);
+    }
+    can_implement = true;
+  }
+#endif
+#endif
+
+#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)
+  if (sm_version == 90) {
+    if (output.scalar_type() == torch::kBFloat16) {
+      sm90_fp8_blockwise_group_mm_dispatch_shape<cutlass::bfloat16_t>(
+          output,
+          a_ptrs,
+          b_ptrs,
+          out_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          a,
+          b,
+          scales_a,
+          scales_b,
+          stride_a,
+          stride_b,
+          stride_c,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          expert_offsets,
+          workspace);
+    } else {
+      sm90_fp8_blockwise_group_mm_dispatch_shape<cutlass::half_t>(
+          output,
+          a_ptrs,
+          b_ptrs,
+          out_ptrs,
+          a_scales_ptrs,
+          b_scales_ptrs,
+          a,
+          b,
+          scales_a,
+          scales_b,
+          stride_a,
+          stride_b,
+          stride_c,
+          layout_sfa,
+          layout_sfb,
+          problem_sizes,
+          expert_offsets,
+          workspace);
+    }
+    can_implement = true;
+  }
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      can_implement, "No implemented fp8_blockwise_scaled_grouped_mm for current compute capability: ", sm_version);
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/generate_kernels.py b/sgl-kernel/csrc/moe/marlin_moe_wna16/generate_kernels.py
new file mode 100644
index 000000000..b3ed863a3
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/generate_kernels.py
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: Apache-2.0
+import glob
+import itertools
+import os
+import subprocess
+
+import jinja2
+
+FILE_HEAD = """
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+""".strip()
+
+TEMPLATE = (
+    "template __global__ void Marlin<"
+    "{{scalar_t}}, "
+    "{{w_type_id}}, "
+    "{{threads}}, "
+    "{{thread_m_blocks}}, "
+    "{{thread_n_blocks}}, "
+    "{{thread_k_blocks}}, "
+    "{{'true' if m_block_size_8 else 'false'}}, "
+    "{{stages}}, "
+    "{{'true' if has_act_order else 'false'}}, "
+    "{{'true' if has_zp else 'false'}}, "
+    "{{group_blocks}}, "
+    "{{'true' if is_zp_float else 'false'}}>"
+    "( MARLIN_KERNEL_PARAMS );"
+)
+
+KERNEL_FILE_TEMPLATE = (
+    "// auto generated by generate.py\n"
+    "// clang-format off\n"
+    "#pragma once\n\n"
+    "{% for kernel_file in kernel_files %}"
+    '#include "{{ kernel_file }}"\n'
+    "{% endfor %}"
+)
+
+KERNEL_FILE_NAME = "kernel_marlin.cuh"
+
+# int8 with zero point case (sglang::kU8) is also supported,
+# we don't add it to reduce wheel size.
+SCALAR_TYPES = ["sglang::kU4", "sglang::kU4B8", "sglang::kU8B128"]
+THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
+
+THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
+# group_blocks:
+#   = 0 : act order case
+#   = -1 : channelwise quantization
+#   > 0 : group_size=16*group_blocks
+GROUP_BLOCKS = [0, -1, 2, 4, 8]
+DTYPES = ["fp16", "bf16"]
+
+
+def remove_old_kernels():
+    for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cuh"):
+        subprocess.call(["rm", "-f", filename])
+
+
+def generate_new_kernels():
+    kernel_files = set()
+    for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
+        has_zp = "B" not in scalar_type
+        all_template_str_list = []
+
+        for group_blocks, m_blocks, thread_configs in itertools.product(
+            GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS
+        ):
+
+            has_act_order = group_blocks == 0
+            if has_zp and has_act_order:
+                continue
+            if thread_configs[2] == 256:
+                if m_blocks <= 1 and thread_configs[0] != 128:
+                    continue
+                if m_blocks > 1 and thread_configs[0] != 64:
+                    continue
+
+            k_blocks = thread_configs[0] // 16
+            n_blocks = thread_configs[1] // 16
+            threads = thread_configs[2]
+
+            c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
+
+            template_str = jinja2.Template(TEMPLATE).render(
+                scalar_t=c_dtype,
+                w_type_id=scalar_type + ".id()",
+                threads=threads,
+                thread_m_blocks=max(m_blocks, 1),
+                thread_n_blocks=n_blocks,
+                thread_k_blocks=k_blocks,
+                m_block_size_8=m_blocks == 0.5,
+                stages="pipe_stages",
+                has_act_order=has_act_order,
+                has_zp=has_zp,
+                group_blocks=group_blocks,
+                is_zp_float=False,
+            )
+
+            all_template_str_list.append(template_str)
+
+        file_content = FILE_HEAD + "\n\n"
+        file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
+        filename = f"kernel_{dtype}_{scalar_type[8:].lower()}.cuh"
+
+        with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
+            f.write(file_content)
+            kernel_files.add(filename)
+
+    kernel_files = list(kernel_files)
+    kernel_files.sort()
+
+    file_content = jinja2.Template(KERNEL_FILE_TEMPLATE).render(
+        kernel_files=kernel_files
+    )
+    with open(os.path.join(os.path.dirname(__file__), KERNEL_FILE_NAME), "w") as f:
+        f.write(file_content)
+
+
+if __name__ == "__main__":
+    remove_old_kernels()
+    generate_new_kernels()
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel.h b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel.h
new file mode 100644
index 000000000..afa7c377b
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
+#endif
+
+#include "gemm/marlin/marlin.cuh"
+#include "gemm/marlin/marlin_dtypes.cuh"
+#include "scalar_type.hpp"
+
+#define MARLIN_KERNEL_PARAMS                                                                                         \
+  const int4 *__restrict__ A, const int4 *__restrict__ B, int4 *__restrict__ C, int4 *__restrict__ C_tmp,            \
+      const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,           \
+      const int32_t *__restrict__ sorted_token_ids_ptr, const int32_t *__restrict__ expert_ids_ptr,                  \
+      const int32_t *__restrict__ num_tokens_past_padded_ptr, const float *__restrict__ topk_weights_ptr, int top_k, \
+      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, int prob_n, int prob_k, int *locks,             \
+      bool use_atomic_add, bool use_fp32_reduce
+
+namespace MARLIN_NAMESPACE_NAME {
+template <
+    typename scalar_t,                     // compute dtype, half or nv_float16
+    const sglang::ScalarTypeId w_type_id,  // weight ScalarType id
+    const int threads,                     // number of threads in a threadblock
+    const int thread_m_blocks,             // number of 16x16 blocks in the m
+                                           // dimension (batchsize) of the
+                                           // threadblock
+    const int thread_n_blocks,             // same for n dimension (output)
+    const int thread_k_blocks,             // same for k dimension (reduction)
+    const bool m_block_size_8,             // whether m_block_size == 8
+                                           // only works when thread_m_blocks == 1
+    const int stages,                      // number of stages for the async global->shared
+                                           // fetch pipeline
+    const bool has_act_order,              // whether act_order is enabled
+    const bool has_zp,                     // whether zero-points are enabled
+    const int group_blocks,                // number of consecutive 16x16 blocks
+                                           // with a separate quantization scale
+    const bool is_zp_float                 // is zero point of float16 type?
+    >
+__global__ void Marlin(MARLIN_KERNEL_PARAMS);
+
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku4.cuh b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku4.cuh
new file mode 100644
index 000000000..7e83bed8f
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku4.cuh
@@ -0,0 +1,90 @@
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku4b8.cuh b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku4b8.cuh
new file mode 100644
index 000000000..60e2dea31
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku4b8.cuh
@@ -0,0 +1,110 @@
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku8b128.cuh b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku8b128.cuh
new file mode 100644
index 000000000..7eb6b18de
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_bf16_ku8b128.cuh
@@ -0,0 +1,110 @@
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<nv_bfloat16, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cuh b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cuh
new file mode 100644
index 000000000..ec41e018b
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku4.cuh
@@ -0,0 +1,90 @@
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, true, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, true, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 1, 8, 8, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 1, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 2, 16, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 2, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 3, 16, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 3, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 256, 4, 16, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4.id(), 128, 4, 8, 4, false, pipe_stages, false, true, 8, false>( MARLIN_KERNEL_PARAMS );
+
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cuh b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cuh
new file mode 100644
index 000000000..7df28701b
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku4b8.cuh
@@ -0,0 +1,110 @@
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU4B8.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cuh b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cuh
new file mode 100644
index 000000000..1150844e2
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_fp16_ku8b128.cuh
@@ -0,0 +1,110 @@
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel.h"
+#include "marlin_template.h"
+
+namespace MARLIN_NAMESPACE_NAME {
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, -1, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 2, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 4, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, true, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 1, 8, 8, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 1, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 2, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 2, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 3, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 3, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 256, 4, 16, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+template __global__ void Marlin<half, sglang::kU8B128.id(), 128, 4, 8, 4, false, pipe_stages, false, false, 8, false>( MARLIN_KERNEL_PARAMS );
+
+}
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_marlin.cuh b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_marlin.cuh
new file mode 100644
index 000000000..bb828dc5b
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/kernel_marlin.cuh
@@ -0,0 +1,10 @@
+// auto generated by generate.py
+// clang-format off
+#pragma once
+
+#include "kernel_bf16_ku4.cuh"
+#include "kernel_bf16_ku4b8.cuh"
+#include "kernel_bf16_ku8b128.cuh"
+#include "kernel_fp16_ku4.cuh"
+#include "kernel_fp16_ku4b8.cuh"
+#include "kernel_fp16_ku8b128.cuh"
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/marlin_template.h b/sgl-kernel/csrc/moe/marlin_moe_wna16/marlin_template.h
new file mode 100644
index 000000000..ade562af6
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/marlin_template.h
@@ -0,0 +1,1805 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+#pragma once
+
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
+#endif
+
+#include "gemm/marlin/marlin.cuh"
+#include "gemm/marlin/marlin_dtypes.cuh"
+#include "scalar_type.hpp"
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                                        \
+  static_assert(                                                                         \
+      std::is_same<scalar_t, half>::value || std::is_same<scalar_t, nv_bfloat16>::value, \
+      "only float16 and bfloat16 is supported");
+
+namespace MARLIN_NAMESPACE_NAME {
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+template <
+    typename scalar_t,                     // compute dtype, half or nv_float16
+    const sglang::ScalarTypeId w_type_id,  // weight ScalarType id
+    const int threads,                     // number of threads in a threadblock
+    const int thread_m_blocks,             // number of 16x16 blocks in the m
+                                           // dimension (batchsize) of the
+                                           // threadblock
+    const int thread_n_blocks,             // same for n dimension (output)
+    const int thread_k_blocks,             // same for k dimension (reduction)
+    const bool m_block_size_8,             // whether m_block_size == 8
+                                           // only works when thread_m_blocks == 1
+    const int stages,                      // number of stages for the async global->shared
+                                           // fetch pipeline
+    const bool has_act_order,              // whether act_order is enabled
+    const bool has_zp,                     // whether zero-points are enabled
+    const int group_blocks,                // number of consecutive 16x16 blocks
+                                           // with a separate quantization scale
+    const bool is_zp_float                 // is zero point of float16 type?
+    >
+__global__ void Marlin(
+    const int4* __restrict__ A,                              // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,                              // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,                                    // fp16 output buffer of shape mxn
+    int4* __restrict__ C_tmp,                                // fp32 tmp output buffer (for reduce)
+    const int4* __restrict__ scales_ptr,                     // fp16 quantization scales of shape
+                                                             // (k/groupsize)xn
+    const int4* __restrict__ zp_ptr,                         // 4bit packed zero-points of shape
+                                                             // (k/groupsize)x(n/pack_factor)
+    const int* __restrict__ g_idx,                           // int32 group indices of shape k
+    const int32_t* __restrict__ sorted_token_ids_ptr,        // moe sorted_ids
+    const int32_t* __restrict__ expert_ids_ptr,              // moe expert ids
+    const int32_t* __restrict__ num_tokens_past_padded_ptr,  // moe num tokens
+    const float* __restrict__ topk_weights_ptr,              // moe top weights
+    int top_k,                                               // num of experts per token
+    bool mul_topk_weights,                                   // mul topk weights or not
+    bool is_ep,                                              // expert parallelism
+    int num_groups,                                          // number of scale groups per output channel
+    int prob_m,                                              // batch dimension m
+    int prob_n,                                              // output dimension n
+    int prob_k,                                              // reduction dimension k
+    int* locks,                                              // extra global storage for barrier synchronization
+    bool use_atomic_add,                                     // whether to use atomic add to reduce
+    bool use_fp32_reduce                                     // whether to use fp32 global reduce
+) {}
+
+}  // namespace MARLIN_NAMESPACE_NAME
+
+#else
+
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32
+// output/accumulation.
+template <typename scalar_t>
+__device__ inline void
+mma(const typename ScalarType<scalar_t>::FragA& a_frag,
+    const typename ScalarType<scalar_t>::FragB& frag_b,
+    typename ScalarType<scalar_t>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  if constexpr (std::is_same<scalar_t, half>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else {
+    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+  }
+}
+
+template <typename scalar_t>
+__device__ inline void mma_trans(
+    const typename ScalarType<scalar_t>::FragA& a_frag,
+    const typename ScalarType<scalar_t>::FragB& frag_b,
+    const typename ScalarType<scalar_t>::FragB& frag_b2,
+    typename ScalarType<scalar_t>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  const uint32_t* b2 = reinterpret_cast<const uint32_t*>(&frag_b2);
+  float* c = reinterpret_cast<float*>(&frag_c);
+  if constexpr (std::is_same<scalar_t, half>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(b[0]),
+          "r"(b2[0]),
+          "r"(b[1]),
+          "r"(b2[1]),
+          "r"(a[0]),
+          "r"(a[1]),
+          "f"(c[0]),
+          "f"(c[1]),
+          "f"(c[2]),
+          "f"(c[3]));
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(b[0]),
+          "r"(b2[0]),
+          "r"(b[1]),
+          "r"(b2[1]),
+          "r"(a[0]),
+          "r"(a[1]),
+          "f"(c[0]),
+          "f"(c[1]),
+          "f"(c[2]),
+          "f"(c[3]));
+  } else {
+    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);
+  }
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+template <int count, typename scalar_t>
+__device__ inline void ldsm(typename ScalarType<scalar_t>::FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  if constexpr (count == 4) {
+    asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+                 : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+                 : "r"(smem));
+  } else if constexpr (count == 2) {
+    asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(a[0]), "=r"(a[1]) : "r"(smem));
+  } else if constexpr (count == 1) {
+    asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" : "=r"(a[0]) : "r"(smem));
+  } else {
+    static_assert(count == 1 || count == 2 || count == 4, "invalid count");
+  }
+}
+
+// Lookup-table based 3-input logical operation; explicitly used for
+// dequantization as the compiler does not seem to automatically recognize it in
+// all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(res) : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+// Constructs destination register by taking bytes from 2 sources (based on
+// mask)
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n" : "=r"(res) : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+template <typename scalar_t, int bit>
+__device__ inline typename ScalarType<scalar_t>::FragB dequant(int q, typename ScalarType<scalar_t>::FragB& frag_b);
+
+//
+// Efficiently dequantize 4bit values packed in an int32 value into a full
+// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below,
+// with some small changes:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
+//
+template <>
+__device__ inline typename ScalarType<half>::FragB dequant<half, 4>(int q, typename ScalarType<half>::FragB& frag_b) {
+  const int LO = 0x000f000f;
+  const int HI = 0x00f000f0;
+  const int EX = 0x64006400;
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64086408;
+  const int MUL = 0x2c002c00;
+  const int ADD = 0xd480d480;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo), *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(
+      *reinterpret_cast<half2*>(&hi), *reinterpret_cast<const half2*>(&MUL), *reinterpret_cast<const half2*>(&ADD));
+  return frag_b;
+}
+
+template <>
+__device__ inline typename ScalarType<nv_bfloat16>::FragB
+dequant<nv_bfloat16, 4>(int q, typename ScalarType<nv_bfloat16>::FragB& frag_b) {
+  static constexpr uint32_t MASK = 0x000f000f;
+  static constexpr uint32_t EX = 0x43004300;
+
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+  q >>= 4;
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+
+  static constexpr uint32_t MUL = 0x3F803F80;
+  static constexpr uint32_t ADD = 0xC308C308;
+
+  frag_b[0] = __hfma2(
+      *reinterpret_cast<nv_bfloat162*>(&lo),
+      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  frag_b[1] = __hfma2(
+      *reinterpret_cast<nv_bfloat162*>(&hi),
+      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  return frag_b;
+}
+
+//
+// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
+// bf16 Reference:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
+//
+template <>
+__device__ inline typename ScalarType<half>::FragB dequant<half, 8>(int q, typename ScalarType<half>::FragB& frag_b) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;
+
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo), *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi), *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  return frag_b;
+}
+
+template <>
+__device__ inline typename ScalarType<nv_bfloat16>::FragB
+dequant<nv_bfloat16, 8>(int q, typename ScalarType<nv_bfloat16>::FragB& frag_b) {
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);
+
+  fp32_intermediates[0] -= 8388736.f;
+  fp32_intermediates[1] -= 8388736.f;
+  fp32_intermediates[2] -= 8388736.f;
+  fp32_intermediates[3] -= 8388736.f;
+
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], fp32_intermediates_casted[3], 0x7632);
+
+  return frag_b;
+}
+
+// Multiply dequantized values by the corresponding quantization scale; used
+// only for grouped quantization.
+template <typename scalar_t>
+__device__ inline void
+scale(typename ScalarType<scalar_t>::FragB& frag_b, typename ScalarType<scalar_t>::FragS& frag_s, int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s = ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], s);
+  frag_b[1] = __hmul2(frag_b[1], s);
+}
+
+template <typename scalar_t>
+__device__ inline void scale_and_sub(typename ScalarType<scalar_t>::FragB& frag_b, scalar_t s, scalar_t zp) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s2 = ScalarType<scalar_t>::num2num2(s);
+  scalar_t2 zp2 = ScalarType<scalar_t>::num2num2(zp);
+  frag_b[0] = __hfma2(frag_b[0], s2, __hneg2(zp2));
+  frag_b[1] = __hfma2(frag_b[1], s2, __hneg2(zp2));
+}
+
+template <typename scalar_t>
+__device__ inline void
+sub_zp(typename ScalarType<scalar_t>::FragB& frag_b, typename ScalarType<scalar_t>::scalar_t2& frag_zp, int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 zp = ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_zp)[i]);
+  frag_b[0] = __hsub2(frag_b[0], zp);
+  frag_b[1] = __hsub2(frag_b[1], zp);
+}
+
+// Same as above, but for act_order (each K is multiplied individually)
+template <typename scalar_t>
+__device__ inline void scale4(
+    typename ScalarType<scalar_t>::FragB& frag_b,
+    typename ScalarType<scalar_t>::FragS& frag_s_1,
+    typename ScalarType<scalar_t>::FragS& frag_s_2,
+    typename ScalarType<scalar_t>::FragS& frag_s_3,
+    typename ScalarType<scalar_t>::FragS& frag_s_4,
+    int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s_val_1_2;
+  s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
+  s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];
+
+  scalar_t2 s_val_3_4;
+  s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
+  s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];
+
+  frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
+  frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
+}
+
+// Given 2 floats multiply by 2 scales (halves)
+template <typename scalar_t>
+__device__ inline void scale_float(float* c, typename ScalarType<scalar_t>::FragS& s) {
+  scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);
+  c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
+  c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();
+}
+
+// Release barrier and increment visitation count.
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(lock), "r"(val));
+  }
+}
+
+// Wait until value of lock to be negative, and then add 1
+__device__ inline void wait_negative_and_add(int* lock) {
+  if (threadIdx.x == 0) {
+    int state = 0;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
+    while (state >= 0);
+    atomicAdd(lock, 1);
+  }
+  __syncthreads();
+}
+
+template <
+    typename scalar_t,                     // compute dtype, half or nv_float16
+    const sglang::ScalarTypeId w_type_id,  // weight ScalarType id
+    const int threads,                     // number of threads in a threadblock
+    const int thread_m_blocks,             // number of 16x16 blocks in the m
+                                           // dimension (batchsize) of the
+                                           // threadblock
+    const int thread_n_blocks,             // same for n dimension (output)
+    const int thread_k_blocks,             // same for k dimension (reduction)
+    const bool m_block_size_8,             // whether m_block_size == 8
+                                           // only works when thread_m_blocks == 1
+    const int stages,                      // number of stages for the async global->shared
+                                           // fetch pipeline
+    const bool has_act_order,              // whether act_order is enabled
+    const bool has_zp,                     // whether zero-points are enabled
+    const int group_blocks,                // number of consecutive 16x16 blocks
+                                           // with a separate quantization scale
+    const bool is_zp_float                 // is zero point of float16 type?
+    >
+__global__ void Marlin(
+    const int4* __restrict__ A,                              // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,                              // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,                                    // fp16 output buffer of shape mxn
+    int4* __restrict__ C_tmp,                                // fp32 tmp output buffer (for reduce)
+    const int4* __restrict__ scales_ptr,                     // fp16 quantization scales of shape
+                                                             // (k/groupsize)xn
+    const int4* __restrict__ zp_ptr,                         // 4bit packed zero-points of shape
+                                                             // (k/groupsize)x(n/pack_factor)
+    const int* __restrict__ g_idx,                           // int32 group indices of shape k
+    const int32_t* __restrict__ sorted_token_ids_ptr,        // moe sorted_ids
+    const int32_t* __restrict__ expert_ids_ptr,              // moe expert ids
+    const int32_t* __restrict__ num_tokens_past_padded_ptr,  // moe num tokens
+    const float* __restrict__ topk_weights_ptr,              // moe top weights
+    int top_k,                                               // num of experts per token
+    bool mul_topk_weights,                                   // mul topk weights or not
+    bool is_ep,                                              // expert parallelism
+    int num_groups,                                          // number of scale groups per output channel
+    int prob_m,                                              // batch dimension m
+    int prob_n,                                              // output dimension n
+    int prob_k,                                              // reduction dimension k
+    int* locks,                                              // extra global storage for barrier synchronization
+    bool use_atomic_add,                                     // whether to use atomic add to reduce
+    bool use_fp32_reduce                                     // whether to use fp32 global reduce
+) {
+  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
+  // same size, which might involve multiple column "slices" (of width 16 *
+  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
+  // example:
+  //   0 1 3
+  //   0 2 3
+  //   1 2 4
+  // While this kind of partitioning makes things somewhat more complicated, it
+  // ensures good utilization of all SMs for many kinds of shape and GPU
+  // configurations, while requiring as few slow global cross-threadblock
+  // reductions as possible.
+  using Dtype = ScalarType<scalar_t>;
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  using FragA = typename ScalarType<scalar_t>::FragA;
+  using FragB = typename ScalarType<scalar_t>::FragB;
+  using FragC = typename ScalarType<scalar_t>::FragC;
+  using FragS = typename ScalarType<scalar_t>::FragS;
+  using FragZP = typename ScalarType<scalar_t>::FragZP;
+
+  extern __shared__ int4 sh[];
+  static constexpr auto w_type = sglang::ScalarType::from_id(w_type_id);
+
+  constexpr int pack_factor = 32 / w_type.size_bits();
+  static_assert(thread_m_blocks == 1 || !m_block_size_8);
+  constexpr int moe_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks);
+  const int group_size = (!has_act_order && group_blocks == -1) ? prob_k : prob_k / num_groups;
+  const int scales_expert_stride = prob_n * prob_k / group_size / 8;
+  const int zp_expert_stride =
+      is_zp_float ? prob_n * prob_k / group_size / 8 : prob_n * prob_k / group_size / (pack_factor * 4);
+
+  // parallel: num valid moe blocks
+  int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
+  int parallel = num_tokens_past_padded / moe_block_size;
+  int num_valid_blocks = parallel;
+  if (is_ep) {
+    for (int i = 0; i < parallel; i++) {
+      if (expert_ids_ptr[i] == -1) num_valid_blocks--;
+    }
+  }
+  int num_invalid_blocks = parallel - num_valid_blocks;
+  parallel = num_valid_blocks;
+
+  int k_tiles = prob_k / 16 / thread_k_blocks;
+  int n_tiles = prob_n / 16 / thread_n_blocks;
+  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);
+
+  if constexpr (!has_act_order && group_blocks != -1) {
+    if (group_blocks >= thread_k_blocks) {
+      // Ensure that the number of tiles in each stripe is a multiple of the
+      // groupsize; this avoids an annoying special case where a stripe starts
+      // in the middle of group.
+      iters = (group_blocks / thread_k_blocks) * div_ceil(iters, (group_blocks / thread_k_blocks));
+    }
+  }
+
+  int slice_row = (iters * blockIdx.x) % k_tiles;
+  int slice_col_par = (iters * blockIdx.x) / k_tiles;
+  int slice_col = slice_col_par;
+  int slice_iters;      // number of threadblock tiles in the current slice
+  int slice_count = 0;  // total number of active threadblocks in the current slice
+  int slice_idx;        // index of threadblock in current slice; numbered bottom to
+                        // top
+
+  int par_id = 0;
+  int block_id = -1;
+  int64_t expert_id = 0;  // use int64 to avoid computation result overflow
+  int old_expert_id = 0;
+  int64_t B_expert_off = 0;
+
+  int4* sh_block_sorted_ids_int4 = sh;
+  int32_t* sh_block_sorted_ids = reinterpret_cast<int*>(sh_block_sorted_ids_int4);
+  int4* sh_block_topk_weights_int4 = sh_block_sorted_ids_int4 + moe_block_size / 4;
+  scalar_t2* sh_block_topk_weights = reinterpret_cast<scalar_t2*>(sh_block_topk_weights_int4);
+  int4* sh_new = sh_block_topk_weights_int4 + moe_block_size / 4;
+
+  int32_t block_num_valid_tokens = 0;
+  int32_t locks_off = 0;
+
+  // We can easily implement parallel problem execution by just remapping
+  // indices and advancing global pointers
+  if (slice_col_par >= n_tiles) {
+    slice_col = slice_col_par % n_tiles;
+    par_id = slice_col_par / n_tiles;
+  }
+  if (parallel * n_tiles >= gridDim.x) {
+    // when parallel * n_tiles >= sms
+    // then there are at most $sms$ conflict tile blocks
+    locks_off = blockIdx.x;
+  } else {
+    locks_off = (iters * blockIdx.x) / k_tiles - 1;
+  }
+
+  // read moe block data given block_id
+  // block_sorted_ids / block_num_valid_tokens / block_topk_weights
+  auto read_moe_block_data = [&](int block_id) {
+    block_num_valid_tokens = moe_block_size;
+#pragma unroll
+    for (int i = 0; i < moe_block_size / 4; i++) {
+      int4 sorted_token_ids_int4 =
+          reinterpret_cast<const int4*>(sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i];
+      int* sorted_token_ids = reinterpret_cast<int*>(&sorted_token_ids_int4);
+#pragma unroll
+      for (int j = 0; j < 4; j++) {
+        if (sorted_token_ids[j] >= prob_m * top_k) {
+          block_num_valid_tokens = i * 4 + j;
+          break;
+        }
+      }
+      if (block_num_valid_tokens != moe_block_size) break;
+    }
+
+    __syncthreads();
+    int tid4 = threadIdx.x / 4;
+    if (threadIdx.x % 4 == 0 && threadIdx.x < block_num_valid_tokens) {
+      sh_block_sorted_ids_int4[tid4] =
+          reinterpret_cast<const int4*>(sorted_token_ids_ptr)[block_id * moe_block_size / 4 + tid4];
+
+      if (mul_topk_weights) {
+#pragma unroll
+        for (int i = 0; i < 4; i++) {
+          sh_block_topk_weights[tid4 * 4 + i] =
+              Dtype::num2num2(Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]]));
+        }
+      }
+    }
+    __syncthreads();
+  };
+
+  // when move to next moe block, find the next block_id and expert_id
+  // and then read moe block data
+  auto update_next_moe_block_data = [&]() {
+    if (par_id >= parallel) return;
+
+    old_expert_id = expert_id;
+    if (num_invalid_blocks > 0) {
+      int skip_count = block_id == -1 ? par_id : 0;
+      block_id++;
+      for (int i = block_id; i < num_tokens_past_padded / moe_block_size; i++) {
+        expert_id = expert_ids_ptr[i];
+        if (expert_id != -1) {
+          if (skip_count == 0) {
+            block_id = i;
+            break;
+          };
+          skip_count--;
+        };
+      }
+    } else {
+      block_id = par_id;
+      expert_id = expert_ids_ptr[block_id];
+    }
+
+    B_expert_off = expert_id * prob_n * prob_k / (pack_factor * 4);
+    scales_ptr += (expert_id - old_expert_id) * scales_expert_stride;
+    if constexpr (has_zp) {
+      zp_ptr += (expert_id - old_expert_id) * zp_expert_stride;
+    }
+    if constexpr (has_act_order) {
+      g_idx += (expert_id - old_expert_id) * prob_k;
+    }
+
+    read_moe_block_data(block_id);
+  };
+
+  // Compute all information about the current slice which is required for
+  // synchronization.
+  auto init_slice = [&](bool first_init = false) {
+    slice_iters = iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
+    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
+    if (slice_iters == 0) return;
+    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
+    slice_count = 1;
+    slice_idx = 0;
+    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
+    if (col_first <= k_tiles * (slice_col_par + 1)) {
+      int col_off = col_first - k_tiles * slice_col_par;
+      slice_count = div_ceil(k_tiles - col_off, iters);
+      if (col_off > 0) slice_count++;
+      int delta_first = iters * blockIdx.x - col_first;
+      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
+        slice_idx = slice_count - 1;
+      else {
+        slice_idx = slice_count - 1 - delta_first / iters;
+        if (col_off > 0) slice_idx--;
+      }
+    }
+    if (parallel * n_tiles >= gridDim.x) {
+      if (slice_count > 1 && slice_idx == slice_count - 1) {
+        locks_off++;
+      }
+    } else {
+      locks_off++;
+    }
+
+    if (first_init && use_atomic_add && slice_count > 1 && slice_idx == 0) {
+      constexpr int threads_per_m = 16 * thread_n_blocks / 8;
+      int m_per_thread = div_ceil(block_num_valid_tokens, threads / threads_per_m);
+      for (int i = 0; i < m_per_thread; i++) {
+        int row = threads / threads_per_m * i + threadIdx.x / threads_per_m;
+        if (row < block_num_valid_tokens) {
+          int64_t sorted_row = sh_block_sorted_ids[row];
+          int col = slice_col * 16 * thread_n_blocks / 8 + threadIdx.x % threads_per_m;
+          C[sorted_row * prob_n / 8 + col] = {0, 0, 0, 0};
+        }
+      }
+      // After write zero to output, write a negative value to lock.
+      // Every SM that processes the same slice would wait for
+      // the negative value, and then atomicAdd 1 to it.
+      // After all SMs are processed, the lock value would back to 0 again.
+      __syncthreads();
+      if (threadIdx.x == 0) locks[locks_off] = 1 - slice_count;
+    }
+
+    if (slice_col == n_tiles) {
+      slice_col = 0;
+      par_id++;
+      update_next_moe_block_data();
+    }
+  };
+
+  update_next_moe_block_data();
+  init_slice(true);
+
+  // A sizes/strides
+
+  // stride of the A matrix in global memory
+  int a_gl_stride = prob_k / 8;
+  // stride of an A matrix tile in shared memory
+  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
+  // delta between subsequent A tiles in global memory
+  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
+  // between subsequent accesses within a tile
+  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory writes
+  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory tile reads
+  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
+  // within a shared memory tile
+  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
+  // overall size of a tile
+  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
+  // number of shared write iterations for a tile
+  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);
+
+  // B sizes/strides
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+
+  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
+  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
+
+  // Scale sizes/strides without act_order
+  int s_gl_stride = prob_n / 8;
+  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+  constexpr int s_tb_groups =
+      !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks ? thread_k_blocks / group_blocks : 1;
+  constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
+  int s_gl_rd_delta = s_gl_stride;
+
+  // Scale size/strides with act_order
+  constexpr int tb_k = 16 * thread_k_blocks;
+  constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
+  // constexpr int act_s_row_stride      = 1;
+  // int           act_s_col_stride      = act_s_row_stride * num_groups;
+  int act_s_col_stride = 1;
+  int act_s_col_warp_stride = act_s_col_stride * 8;
+  int tb_n_warps = thread_n_blocks / 4;
+  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
+
+  // Zero-points sizes/strides
+  int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4;
+  constexpr int zp_sh_stride = is_zp_float ? 16 * thread_n_blocks / 8 : ((16 * thread_n_blocks) / pack_factor) / 4;
+  constexpr int zp_tb_groups = s_tb_groups;
+  constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0;
+  int zp_gl_rd_delta = zp_gl_stride;
+
+  // Global A read index of current thread.
+  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o);
+  a_gl_rd += a_gl_rd_delta_o * slice_row;
+  // Shared write index of current thread.
+  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o);
+  // Shared read index.
+  int a_sh_rd = a_sh_stride * ((threadIdx.x % 32) % (16 / (m_block_size_8 ? 2 : 1))) +
+                (threadIdx.x % 32) / (16 / (m_block_size_8 ? 2 : 1));
+  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
+  b_gl_rd += b_sh_stride * slice_col;
+  b_gl_rd += b_gl_rd_delta_o * slice_row;
+  int b_sh_wr = threadIdx.x * b_thread_vecs;
+  int b_sh_rd = threadIdx.x * b_thread_vecs;
+
+  // For act_order
+  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
+  int slice_k_start = tb_k * slice_row;
+  int slice_k_finish = slice_k_start + tb_k * slice_iters;
+  int slice_k_start_shared_fetch = slice_k_start;
+  int slice_n_offset = act_s_col_tb_stride * slice_col;
+
+  // No act_order
+  int s_gl_rd;
+  if constexpr (!has_act_order) {
+    if constexpr (group_blocks == -1) {
+      s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+    } else {
+      s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + s_sh_stride * slice_col + threadIdx.x;
+    }
+  }
+  int s_sh_wr = threadIdx.x;
+  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+
+  // Zero-points
+  int zp_gl_rd;
+  if constexpr (has_zp) {
+    if constexpr (group_blocks == -1) {
+      zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
+    } else {
+      zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + zp_sh_stride * slice_col + threadIdx.x;
+    }
+  }
+  int zp_sh_wr = threadIdx.x;
+  bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride;
+
+  // We use a different scale layout for grouped and column-wise quantization as
+  // we scale a `half2` tile in column-major layout in the former and in
+  // row-major in the latter case.
+  int s_sh_rd;
+  if constexpr (group_blocks != -1)
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4;
+  else if constexpr (group_blocks == -1 && (m_block_size_8 || has_zp))
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 8;
+  else
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4;
+
+  // Zero-points have the same read layout as the scales
+  // (without column-wise case)
+  constexpr int num_col_threads = 8;
+  constexpr int num_row_threads = 4;
+  constexpr int num_ints_per_thread = 8 / pack_factor;
+  int zp_sh_rd;
+  if constexpr (has_zp) {
+    if constexpr (is_zp_float) {
+      if constexpr (group_blocks != -1) {
+        zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4;
+      }
+    } else {
+      zp_sh_rd = num_ints_per_thread * num_col_threads * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                 num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads);
+    }
+  }
+
+  // To ensure that writing and reading A tiles to/from shared memory, the
+  // latter in fragment format, is fully bank conflict free, we need to use a
+  // rather fancy XOR-based layout. The key here is that neither reads nor
+  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
+  // same shared memory banks. Further, it seems (based on NSight-Compute) that
+  // each warp must also write a consecutive memory segment?
+  auto transform_a = [&](int i) {
+    int row = i / a_gl_rd_delta_o;
+    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
+  };
+  // Since the computation of this remapping is non-trivial and, due to our main
+  // loop unrolls, all shared memory accesses are static, we simply precompute
+  // both transformed reads and writes.
+  int a_sh_wr_trans[a_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
+  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
+#pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++) {
+#pragma unroll
+    for (int j = 0; j < thread_m_blocks; j++)
+      a_sh_rd_trans[i][j] = transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+  }
+
+  // Since B-accesses have non-constant stride they have to be computed at
+  // runtime; we break dependencies between subsequent accesses with a tile by
+  // maintining multiple pointers (we have enough registers), a tiny
+  // optimization.
+  const int4* B_ptr[b_sh_wr_iters];
+#pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++)
+    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
+
+  // Shared memory storage for global fetch pipelines.
+  int4* sh_a = sh_new;
+  int4* sh_b = sh_a + (stages * a_sh_stage);
+  int4* sh_g_idx = sh_b + (stages * b_sh_stage);
+  int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
+  int4* sh_s = sh_zp + (stages * zp_sh_stage);
+  int4* sh_red = sh_b;
+
+  // Register storage for double buffer of shared memory reads.
+  FragA frag_a[2][thread_m_blocks];
+  I4 frag_b_quant[2][b_thread_vecs];
+  FragC frag_c[thread_m_blocks][4][2];
+  FragS frag_s[2][4];                    // No act-order
+  FragS act_frag_s[2][4][4];             // For act-order
+  int frag_qzp[2][num_ints_per_thread];  // Zero-points
+  FragZP frag_zp;                        // Zero-points in fp16
+  FragZP frag_zpf[2];                    // Zero-points in fp16 in HQQ
+
+  // Zero accumulators.
+  auto zero_accums = [&]() {
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float*>(frag_c)[i] = 0;
+  };
+
+  int sh_first_group_id = -1;
+  int sh_num_groups = -1;
+  constexpr int sh_max_num_groups = 32;
+
+  auto fetch_act_order_scales_to_shared = [&](bool is_async, int first_group_id, int last_group_id) {
+    sh_first_group_id = first_group_id;
+    sh_num_groups = last_group_id - first_group_id + 1;
+
+    if (sh_num_groups < sh_max_num_groups) {
+      sh_num_groups = sh_max_num_groups;
+    }
+
+    if (sh_first_group_id + sh_num_groups > num_groups) {
+      sh_num_groups = num_groups - sh_first_group_id;
+    }
+
+    int row_offset = first_group_id * s_gl_stride;
+
+    if (is_async) {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          cp_async4_pred(
+              &sh_s[(i * s_sh_stride) + threadIdx.x],
+              &scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + threadIdx.x]);
+        }
+      }
+    } else {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          sh_s[(i * s_sh_stride) + threadIdx.x] =
+              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + threadIdx.x];
+        }
+      }
+    }
+  };
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  int a_remaining_load_count_in_slice = stages;
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+      if (prob_k > thread_k_blocks * 16 * stages || slice_col == 0 || a_remaining_load_count_in_slice > 0) {
+        a_remaining_load_count_in_slice--;
+#pragma unroll
+        for (int i = 0; i < a_sh_wr_iters; i++) {
+          int a_idx = a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off;
+          int row = a_idx / a_gl_stride;
+          int64_t sorted_row = 0;
+          if (!m_block_size_8 || row < 8) sorted_row = sh_block_sorted_ids[row] / top_k;
+          int64_t true_idx = sorted_row * a_gl_stride + a_idx % a_gl_stride;
+          cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[true_idx], row < block_num_valid_tokens);
+        }
+      }
+      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+#pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+#pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j + B_expert_off);
+        }
+
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+
+      if constexpr (has_act_order) {
+        // Fetch g_idx thread-block portion
+        int full_pipe = a_off;
+        int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
+        if (cur_k < prob_k && cur_k < slice_k_finish) {
+          int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+
+          int4 const* cur_g_idx_stage_ptr = reinterpret_cast<int4 const*>(&g_idx[cur_k]);
+
+          if (threadIdx.x < g_idx_stage) {
+            cp_async4_pred(&sh_g_idx_stage[threadIdx.x], &cur_g_idx_stage_ptr[threadIdx.x]);
+          }
+        }
+      } else {
+        if constexpr (group_blocks != -1) {
+          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+
+          if constexpr (group_blocks >= thread_k_blocks) {
+            // Only fetch scales if this tile starts a new group
+            if (pipe % (group_blocks / thread_k_blocks) == 0) {
+              if (s_sh_wr_pred) {
+                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          } else {
+            for (int i = 0; i < s_tb_groups; i++) {
+              if (s_sh_wr_pred) {
+                cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          }
+        }
+
+        if constexpr (has_zp && group_blocks != -1) {
+          int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+          if constexpr (group_blocks >= thread_k_blocks) {
+            // Only fetch zero-points if this tile starts a new group
+            if (pipe % (group_blocks / thread_k_blocks) == 0) {
+              if (zp_sh_wr_pred) {
+                cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]);
+              }
+              zp_gl_rd += zp_gl_rd_delta;
+            }
+          } else {
+            for (int i = 0; i < zp_tb_groups; i++) {
+              if (zp_sh_wr_pred) {
+                cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], &zp_ptr[zp_gl_rd]);
+              }
+              zp_gl_rd += zp_gl_rd_delta;
+            }
+          }
+        }
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+
+  auto fetch_col_zp_to_shared = [&]() {
+    if (zp_sh_wr_pred) {
+      cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]);
+    }
+  };
+
+  auto fetch_col_scale_to_shared = [&]() {
+    if (s_sh_wr_pred) {
+      cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+    }
+  };
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+#pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++)
+      ldsm<m_block_size_8 ? 2 : 4, scalar_t>(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
+    int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+
+#pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(&sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
+  };
+
+  bool is_same_group[stages];
+  int same_group_id[stages];
+
+  auto init_same_group = [&](int pipe) {
+    if constexpr (!has_act_order) {
+      return;
+    }
+
+    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
+
+    int group_id_1 = sh_g_idx_int_ptr[0];
+    int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];
+
+    is_same_group[pipe] = group_id_1 == group_id_2;
+    same_group_id[pipe] = group_id_1;
+  };
+
+  auto fetch_scales_to_registers = [&](int k, int full_pipe) {
+    int pipe = full_pipe % stages;
+
+    if constexpr (!has_act_order) {
+      // No act-order case
+      if constexpr (group_blocks == -1) {
+        // load only when starting a new slice
+        if (k == 0 && full_pipe == 0) {
+          reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd];
+          reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+        }
+      } else if constexpr (group_blocks != -1) {
+        if constexpr (group_blocks >= thread_k_blocks) {
+          int4* sh_s_stage =
+              sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks)));
+          reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+        } else {
+          int warp_id = threadIdx.x / 32;
+          int n_warps = thread_n_blocks / 4;
+
+          int warp_row = warp_id / n_warps;
+
+          int cur_k = warp_row * 16;
+          cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+          int k_blocks = cur_k / 16;
+          int cur_group_id = k_blocks / group_blocks;
+
+          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+
+          reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
+        }
+      }
+
+      return;
+    }
+
+    // Act-order case
+
+    // Determine K of the "current" thread-block
+    int cur_k = slice_k_start + tb_k * full_pipe;
+    if (cur_k >= prob_k || cur_k >= slice_k_finish) {
+      return;
+    }
+
+    // Reset (to current thread-block) since we read g_idx portion from the
+    // shared memory
+    cur_k = 0;
+
+    // Progress to current iteration
+    cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+    // Determine "position" inside the thread-block (based on warp and
+    // thread-id)
+    int warp_id = threadIdx.x / 32;
+    int n_warps = thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N
+
+    int warp_row = warp_id / n_warps;
+    int warp_col = warp_id % n_warps;
+
+    cur_k += warp_row * 16;
+
+    int th_id = threadIdx.x % 32;
+    cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix
+
+    int s_col_shift =
+        /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + (th_id / 4) * act_s_col_stride;
+
+    if (is_same_group[pipe]) {
+      if (k % 2 == 0) {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
+            sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + s_col_shift];
+      } else {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
+            *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
+      }
+
+      for (int i = 1; i < 4; i++) {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) = *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
+      }
+      return;
+    }
+
+    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
+
+    constexpr int k_frag_offsets[4] = {0, 1, 8, 9};  // Tensor core offsets per thread
+
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+      int actual_k = cur_k + k_frag_offsets[i];
+
+      int group_id = sh_g_idx_int_ptr[actual_k];
+      int rel_group_id = group_id - sh_first_group_id;
+
+      *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) = sh_s[rel_group_id * s_sh_stride + s_col_shift];
+    }
+  };
+
+  auto fetch_zp_to_registers = [&](int k, int full_pipe) {
+    // This code does not handle group_blocks == 0,
+    // which signifies act_order.
+    // has_zp implies AWQ, which doesn't have act_order,
+    static_assert(!has_zp || group_blocks != 0);
+
+    if constexpr (has_zp && !is_zp_float) {
+      int pipe = full_pipe % stages;
+
+      if constexpr (group_blocks == -1) {
+        // load only when starting a new slice
+        if (k == 0 && full_pipe == 0) {
+#pragma unroll
+          for (int i = 0; i < num_ints_per_thread; i++) {
+            frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp))[zp_sh_rd + i];
+          }
+        }
+
+      } else if constexpr (group_blocks >= thread_k_blocks) {
+        int4* sh_zp_stage =
+            sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks)));
+        for (int i = 0; i < num_ints_per_thread; i++) {
+          frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
+        }
+      } else {
+        int warp_id = threadIdx.x / 32;
+        int n_warps = thread_n_blocks / 4;
+
+        int warp_row = warp_id / n_warps;
+
+        int cur_k = warp_row * 16;
+        cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+        int k_blocks = cur_k / 16;
+        int cur_group_id = 0;
+
+        // Suppress bogus and persistent divide-by-zero warning
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress divide_by_zero
+        cur_group_id = k_blocks / group_blocks;
+#pragma nv_diagnostic pop
+
+        int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+        sh_zp_stage += cur_group_id * zp_sh_stride;
+
+        for (int i = 0; i < num_ints_per_thread; i++) {
+          frag_qzp[k % 2][i] = (reinterpret_cast<int*>(sh_zp_stage))[zp_sh_rd + i];
+        }
+      }
+    }
+
+    else if constexpr (has_zp && is_zp_float) {
+      int pipe = full_pipe % stages;
+
+      if constexpr (group_blocks != -1) {
+        if constexpr (group_blocks >= thread_k_blocks) {
+          int4* sh_zp_stage =
+              sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks)));
+          reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd];
+        } else {
+          int warp_id = threadIdx.x / 32;
+          int n_warps = thread_n_blocks / 4;
+
+          int warp_row = warp_id / n_warps;
+
+          int cur_k = warp_row * 16;
+          cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+          int k_blocks = cur_k / 16;
+          // Suppress bogus and persistent divide-by-zero warning
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress divide_by_zero
+          int cur_group_id = k_blocks / group_blocks;
+#pragma nv_diagnostic pop
+
+          int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe;
+
+          reinterpret_cast<int4*>(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride];
+        }
+      }
+    }
+  };
+
+  // Execute the actual tensor core matmul of a sub-tile.
+  bool is_first_matmul_in_slice = true;
+  auto matmul = [&](int k) {
+    int k2 = k % 2;
+    const bool is_new_zp = ((group_blocks != -1) && (group_blocks < thread_k_blocks || k == 0)) ||
+                           (group_blocks == -1 && is_first_matmul_in_slice);
+    if constexpr (has_zp && !is_zp_float) {
+      if (is_new_zp) {
+        if constexpr (group_blocks == -1) is_first_matmul_in_slice = false;
+        FragB frag_zp_0;
+        FragB frag_zp_1;
+        int zp_quant_0, zp_quant_1;
+
+        if constexpr (w_type.size_bits() == 4) {
+          zp_quant_0 = frag_qzp[k2][0];
+          zp_quant_1 = zp_quant_0 >> 8;
+        } else {
+          static_assert(w_type.size_bits() == 8);
+          zp_quant_0 = frag_qzp[k2][0];
+          zp_quant_1 = frag_qzp[k2][1];
+        }
+
+        dequant<scalar_t, w_type.size_bits()>(zp_quant_0, frag_zp_0);
+        dequant<scalar_t, w_type.size_bits()>(zp_quant_1, frag_zp_1);
+
+        frag_zp[0] = frag_zp_0[0];
+        frag_zp[1] = frag_zp_0[1];
+        frag_zp[2] = frag_zp_1[0];
+        frag_zp[3] = frag_zp_1[1];
+      }
+    }
+// We have the m dimension as the inner loop in order to encourage overlapping
+// dequantization and matmul operations.
+#pragma unroll
+    for (int j = 0; j < 4; j++) {
+      FragB frag_b0;
+      FragB frag_b1;
+      int b_quant_0, b_quant_1;
+
+      if constexpr (w_type.size_bits() == 4) {
+        b_quant_0 = frag_b_quant[k2][0][j];
+        b_quant_1 = b_quant_0 >> 8;
+      } else {
+        static_assert(w_type.size_bits() == 8);
+        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k2]);
+        b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+        b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+      }
+
+      dequant<scalar_t, w_type.size_bits()>(b_quant_0, frag_b0);
+      dequant<scalar_t, w_type.size_bits()>(b_quant_1, frag_b1);
+
+      // Apply scale to frag_b0
+      if constexpr (has_act_order) {
+        static_assert(group_blocks != -1);
+        scale4<scalar_t>(
+            frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j], act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0);
+        scale4<scalar_t>(
+            frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j], act_frag_s[k][2][j], act_frag_s[k2][3][j], 1);
+
+      } else if constexpr (has_zp && !is_zp_float && group_blocks == -1) {
+        int idx = (threadIdx.x / 4) % 2;
+        scalar_t2 s2 = Dtype::nums2num2(
+            reinterpret_cast<scalar_t*>(&frag_s[j / 2][j % 2 * 2 + 0])[idx],
+            reinterpret_cast<scalar_t*>(&frag_s[j / 2][j % 2 * 2 + 1])[idx]);
+        if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], s2);
+        scale_and_sub<scalar_t>(frag_b0, s2.x, frag_zp[j].x);
+        scale_and_sub<scalar_t>(frag_b1, s2.y, frag_zp[j].y);
+      } else if constexpr (has_zp && !is_zp_float && group_blocks != -1) {
+        if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], *reinterpret_cast<scalar_t2*>(&frag_s[k2][j]));
+        scale_and_sub<scalar_t>(frag_b0, frag_s[k % 2][j][0].x, frag_zp[j].x);
+        scale_and_sub<scalar_t>(frag_b1, frag_s[k % 2][j][0].y, frag_zp[j].y);
+      } else if constexpr (has_zp && is_zp_float && group_blocks != -1) {
+        if (is_new_zp) frag_zpf[k2][j] = __hmul2(frag_zpf[k2][j], *reinterpret_cast<scalar_t2*>(&frag_s[k2][j]));
+        scale_and_sub<scalar_t>(frag_b0, frag_s[k2][j].x, frag_zpf[k2][j].x);
+        scale_and_sub<scalar_t>(frag_b1, frag_s[k2][j].y, frag_zpf[k2][j].y);
+      } else if constexpr (group_blocks != -1) {
+        scale<scalar_t>(frag_b0, frag_s[k2][j], 0);
+        scale<scalar_t>(frag_b1, frag_s[k2][j], 1);
+      }
+
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        if constexpr (m_block_size_8) {
+          mma_trans<scalar_t>(frag_a[k2][i], frag_b0, frag_b1, frag_c[i][j][0]);
+        } else {
+          mma<scalar_t>(frag_a[k2][i], frag_b0, frag_c[i][j][0]);
+          mma<scalar_t>(frag_a[k2][i], frag_b1, frag_c[i][j][1]);
+        }
+      }
+    }
+  };
+
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location; which we have to reduce over in the end. We do in shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
+    if (red_off >= 1) {
+      int red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + (threadIdx.x % b_sh_stride_threads);
+
+      // Parallel logarithmic shared memory reduction. We make sure to avoid any
+      // unnecessary read or write iterations, e.g., for two warps we write only
+      // once by warp 1 and read only once by warp 0.
+
+#pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+#pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+#pragma unroll
+            for (int j = 0; j < 4 * 2; j += (m_block_size_8 ? 2 : 1)) {
+              int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float* c_rd = reinterpret_cast<float*>(&sh_red[red_sh_delta * j + red_sh_rd]);
+                float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
+#pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k];
+              }
+              sh_red[red_sh_wr] = reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+#pragma unroll
+          for (int i = 0; i < 4 * 2; i += (m_block_size_8 ? 2 : 1)) {
+            float* c_rd = reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
+#pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] += c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped
+  // partitioning minimizes the number of such reductions and our outputs are
+  // usually rather small, we perform this reduction serially in L2 cache.
+  auto global_reduce_fp16 = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    bool is_th_active = threadIdx.x < active_threads;
+    if (!is_th_active) {
+      return;
+    }
+
+    int c_gl_stride = prob_n / 8;
+    int c_gl_wr_delta_o = 8 * c_gl_stride;
+    int c_gl_wr_delta_i = 4 * (active_threads / 32);
+    int c_gl_wr;
+    if constexpr (m_block_size_8) {
+      c_gl_wr = c_gl_stride * ((threadIdx.x % 4) * 2) + 4 * (threadIdx.x / 32) + (threadIdx.x % 32) / 8;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    } else {
+      c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + 4 * (threadIdx.x / 32) + threadIdx.x % 4;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    }
+    constexpr int c_sh_wr_delta = active_threads;
+    int c_sh_wr = threadIdx.x;
+
+    if (!first) {
+
+#pragma unroll
+      for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) {
+        int c_idx;
+        if constexpr (m_block_size_8)
+          c_idx = c_gl_wr + i * c_gl_stride + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i;
+        else
+          c_idx = c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2);
+        if (c_idx / c_gl_stride < block_num_valid_tokens) {
+          int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride];
+          int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride;
+          sh_red[c_sh_wr + c_sh_wr_delta * i] = C[true_idx];
+        }
+      }
+    }
+
+#pragma unroll
+    for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) {
+      if (!first) {
+        int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
+#pragma unroll
+        for (int j = 0; j < 2 * 4; j++) {
+          int delta = 0;
+          if constexpr (m_block_size_8) {
+            delta = j % 2 == 1 ? -2 : 0;
+          }
+          reinterpret_cast<float*>(&frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta] +=
+              Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
+        }
+      }
+      if (!last) {
+        int4 c;
+#pragma unroll
+        for (int j = 0; j < 2 * 4; j++) {
+          int delta = 0;
+          if constexpr (m_block_size_8) {
+            delta = j % 2 == 1 ? -2 : 0;
+          }
+          reinterpret_cast<scalar_t*>(&c)[j] =
+              Dtype::float2num(reinterpret_cast<float*>(&frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta]);
+        }
+
+        int c_idx;
+        if constexpr (m_block_size_8)
+          c_idx = c_gl_wr + i * c_gl_stride + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i;
+        else
+          c_idx = c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2);
+        if (c_idx / c_gl_stride < block_num_valid_tokens) {
+          int64_t sorted_row = sh_block_sorted_ids[c_idx / c_gl_stride];
+          int64_t true_idx = sorted_row * c_gl_stride + c_idx % c_gl_stride;
+          C[true_idx] = c;
+        }
+      }
+    }
+  };
+
+  // Globally reduce over threadblocks that compute the same column block.
+  // We use a tmp C buffer to reduce in full fp32 precision.
+  auto global_reduce_fp32 = [&](bool first = false, bool last = false) {
+    constexpr int tb_m = thread_m_blocks * 16;
+    constexpr int tb_n = thread_n_blocks * 16;
+
+    constexpr int c_size = tb_m * tb_n * sizeof(float) / 16;
+
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    bool is_th_active = threadIdx.x < active_threads;
+
+    constexpr int num_floats = thread_m_blocks * 4 * 2 * 4;
+    constexpr int th_size = num_floats * sizeof(float) / 16;
+
+    int c_cur_offset = locks_off * c_size;
+
+    if (!is_th_active) {
+      return;
+    }
+
+    if (!first) {
+      float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
+#pragma unroll
+      for (int k = 0; k < th_size; k++) {
+        if constexpr (m_block_size_8) {
+          if (k % 2) continue;
+        } else {
+          if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) continue;
+        }
+
+        sh_red[threadIdx.x] = C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
+
+        float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
+#pragma unroll
+        for (int f = 0; f < 4; f++) {
+          frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
+        }
+      }
+    }
+
+    if (!last) {
+      int4* frag_c_ptr = reinterpret_cast<int4*>(&frag_c);
+#pragma unroll
+      for (int k = 0; k < th_size; k++) {
+        if constexpr (m_block_size_8) {
+          if (k % 2) continue;
+        } else {
+          if (k / 8 * 16 + (threadIdx.x % 32) / 4 >= block_num_valid_tokens) continue;
+        }
+
+        C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k];
+      }
+    }
+  };
+
+  // Write out the reduce final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step, the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+    constexpr int c_sh_rd_delta = c_sh_stride * (threads / (2 * thread_n_blocks));
+
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    int c_sh_wr;
+    if constexpr (m_block_size_8) {
+      c_sh_wr = (8 * c_sh_stride) * ((threadIdx.x % 32) % 4 * 2) + (threadIdx.x % 32) / 4;
+      c_sh_wr += 64 * (threadIdx.x / 32);
+    } else {
+      c_sh_wr = (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
+      c_sh_wr += 32 * (threadIdx.x / 32);
+    }
+
+    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + (threadIdx.x % (2 * thread_n_blocks));
+
+    // We first reorder in shared memory to guarantee the most efficient final
+    // global write patterns
+    auto write = [&](int idx, float c0, float c1, FragS& s) {
+      scalar_t2 res = Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));
+
+      // For per-column quantization we finally apply the scale here (only for
+      // 4-bit)
+      if constexpr (!has_act_order && group_blocks == -1 && w_type.size_bits() == 4 && !has_zp) {
+        res = __hmul2(res, s[0]);
+      }
+
+      if constexpr (m_block_size_8) {
+        ((scalar_t*)sh_red)[idx] = res.x;
+        ((scalar_t*)sh_red)[idx + 8 * c_sh_stride] = res.y;
+      } else {
+        ((scalar_t2*)sh_red)[idx] = res;
+      }
+    };
+
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+#pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+#pragma unroll
+        for (int j = 0; j < 4; j++) {
+          if constexpr (m_block_size_8) {
+            int wr = c_sh_wr + 16 * j;
+            write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+            write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 1]);
+          } else {
+            int wr = c_sh_wr + 8 * j;
+            write(
+                wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+            write(
+                wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+            write(
+                wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+            write(
+                wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+          }
+        }
+        c_sh_wr += 16 * (4 * c_sh_stride);
+      }
+    }
+    __syncthreads();
+
+#pragma unroll
+    for (int i = 0; i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) {
+      int row = c_gl_wr / c_gl_stride;
+      if (row < block_num_valid_tokens) {
+        int64_t sorted_row = sh_block_sorted_ids[row];
+        int64_t true_idx = sorted_row * c_gl_stride + c_gl_wr % c_gl_stride;
+        scalar_t2 topk_weight_score;
+        if (mul_topk_weights) topk_weight_score = sh_block_topk_weights[row];
+        if (use_atomic_add && slice_count > 1 || mul_topk_weights) {
+          scalar_t2* C_half2 = reinterpret_cast<scalar_t2*>(&C[true_idx]);
+          scalar_t2* sh_red_half2 = reinterpret_cast<scalar_t2*>(&sh_red[c_sh_rd]);
+#pragma unroll
+          for (int a = 0; a < 4; a++) {
+            scalar_t2 res = sh_red_half2[a];
+            if (mul_topk_weights) {
+              res = __hmul2(res, topk_weight_score);
+            }
+
+            if (use_atomic_add && slice_count > 1) {
+              atomicAdd(&C_half2[a], res);
+            } else {
+              C_half2[a] = res;
+            };
+          }
+        } else {
+          C[true_idx] = sh_red[c_sh_rd];
+        }
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+    __syncthreads();
+  };
+
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+
+#pragma unroll
+    for (int i = 0; i < stages - 1; i++) {
+      if (has_act_order && i == 0) {
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        fetch_act_order_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
+      }
+
+      if constexpr (has_zp && !is_zp_float && group_blocks == -1) {
+        if (i == 0) {
+          fetch_col_zp_to_shared();
+          fetch_col_scale_to_shared();
+        }
+      }
+      fetch_to_shared(i, i, i < slice_iters);
+    }
+
+    zero_accums();
+    wait_for_stage();
+    init_same_group(0);
+    fetch_to_registers(0, 0);
+    fetch_scales_to_registers(0, 0);
+    fetch_zp_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+    slice_k_start_shared_fetch += tb_k * (stages - 1);
+  };
+  if (slice_iters) {
+    start_pipes();
+  }
+
+  // Main loop.
+  while (slice_iters) {
+    // We unroll over both the global fetch and the register load pipeline to
+    // ensure all shared memory accesses are static. Note that both pipelines
+    // have even length meaning that the next iteration will always start at
+    // index 0.
+
+#pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+#pragma unroll
+      for (int k = 0; k < b_sh_wr_iters; k++) {
+        fetch_to_registers(k + 1, pipe % stages);
+        fetch_scales_to_registers(k + 1, pipe);
+        fetch_zp_to_registers(k + 1, pipe);
+        if (k == b_sh_wr_iters - 2) {
+          fetch_to_shared((pipe + stages - 1) % stages, pipe, slice_iters >= stages);
+          pipe++;
+          wait_for_stage();
+          init_same_group(pipe % stages);
+        }
+        matmul(k);
+      }
+      slice_iters--;
+      if (slice_iters == 0) {
+        break;
+      }
+    }
+    a_remaining_load_count_in_slice = 0;
+
+    a_gl_rd += a_gl_rd_delta_o * stages;
+    slice_k_start += tb_k * stages;
+    slice_k_start_shared_fetch += tb_k * stages;
+
+    if constexpr (has_act_order) {
+      int first_group_id = g_idx[slice_k_start];
+      int last_g_idx = slice_k_start + stages * tb_k * 2;
+      if (last_g_idx >= prob_k) {
+        last_g_idx = prob_k - 1;
+      }
+      int last_group_id = g_idx[last_g_idx];
+      if (last_group_id >= sh_first_group_id + sh_num_groups) {
+        fetch_act_order_scales_to_shared(false, first_group_id, last_group_id);
+        __syncthreads();
+      }
+    }
+
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to noticeably worse performance after compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
+      if constexpr (!has_act_order && group_blocks == -1 && !has_zp) {
+        if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
+          if (s_sh_wr_pred) {
+            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+          }
+          cp_async_fence();
+        }
+      }
+
+      thread_block_reduce();
+      if constexpr (!has_act_order && group_blocks == -1 && !has_zp) {
+        if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
+          cp_async_wait<0>();
+          __syncthreads();
+          if (threadIdx.x / 32 < thread_n_blocks / 4) {
+            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+            if constexpr (m_block_size_8) {
+              int idx = (threadIdx.x / 4) % 2;
+              scalar_t2* frag_s_half2 = reinterpret_cast<scalar_t2*>(frag_s);
+#pragma unroll
+              for (int i = 0; i < 8; i++) {
+                frag_s_half2[i] = Dtype::num2num2(reinterpret_cast<scalar_t*>(&frag_s_half2[i])[idx]);
+              }
+            }
+          }
+        }
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if constexpr (!has_act_order && group_blocks == -1 && w_type.size_bits() == 8 && !has_zp) {
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+#pragma unroll
+          for (int i = 0; i < thread_m_blocks; i++) {
+#pragma unroll
+            for (int j = 0; j < 4; j++) {
+              scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][0][0]), frag_s[j / 2][2 * (j % 2) + 0]);
+              scale_float<scalar_t>(
+                  reinterpret_cast<float*>(&frag_c[i][j][0][2]), frag_s[j / 2][2 * (j % 2) + (m_block_size_8 ? 1 : 0)]);
+
+              if constexpr (!m_block_size_8) {
+                scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][1][0]), frag_s[j / 2][2 * (j % 2) + 1]);
+                scale_float<scalar_t>(reinterpret_cast<float*>(&frag_c[i][j][1][2]), frag_s[j / 2][2 * (j % 2) + 1]);
+              }
+            }
+          }
+        }
+      }
+
+      if (slice_count > 1 && !use_atomic_add) {
+        // only globally reduce if there is more than one block in a slice
+        barrier_acquire(&locks[locks_off], slice_idx);
+        if (use_fp32_reduce) {
+          global_reduce_fp32(slice_idx == 0, last);
+        } else {
+          global_reduce_fp16(slice_idx == 0, last);
+        }
+        barrier_release(&locks[locks_off], last);
+      }
+      if (use_atomic_add && slice_count > 1 && slice_idx != 0) wait_negative_and_add(&locks[locks_off]);
+      if (last || use_atomic_add)
+        // only the last block in a slice actually writes the result
+        write_result();
+      if (slice_row) a_remaining_load_count_in_slice = stages;
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      is_first_matmul_in_slice = true;
+      init_slice();
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o);
+#pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+#pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++)
+            B_ptr[i] -= b_gl_stride;
+        }
+
+        // Update slice k/n for scales loading
+        if constexpr (has_act_order) {
+          slice_k_start = tb_k * slice_row;
+          slice_k_finish = slice_k_start + tb_k * slice_iters;
+          slice_k_start_shared_fetch = slice_k_start;
+          slice_n_offset = act_s_col_tb_stride * slice_col;
+
+        } else {
+          s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+          zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x;
+        }
+
+        start_pipes();
+      }
+    }
+  }
+}
+
+}  // namespace MARLIN_NAMESPACE_NAME
+
+#endif
diff --git a/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
new file mode 100644
index 000000000..b249f6415
--- /dev/null
+++ b/sgl-kernel/csrc/moe/marlin_moe_wna16/ops.cu
@@ -0,0 +1,1111 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
+#endif
+
+#include "kernel.h"
+#include "kernel_marlin.cuh"
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)                                        \
+  static_assert(                                                                         \
+      std::is_same<scalar_t, half>::value || std::is_same<scalar_t, nv_bfloat16>::value, \
+      "only float16 and bfloat16 is supported");
+
+namespace MARLIN_NAMESPACE_NAME {
+
+__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
+
+using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+template <int moe_block_size>
+__global__ void permute_cols_kernel(
+    int4 const* __restrict__ a_int4_ptr,
+    int const* __restrict__ perm_int_ptr,
+    int4* __restrict__ out_int4_ptr,
+    const int32_t* __restrict__ sorted_token_ids_ptr,
+    const int32_t* __restrict__ expert_ids_ptr,
+    const int32_t* __restrict__ num_tokens_past_padded_ptr,
+    int size_m,
+    int size_k,
+    int top_k) {};
+}
+
+torch::Tensor moe_wna16_marlin_gemm(
+    torch::Tensor& a,
+    std::optional<torch::Tensor> const& c_or_none,
+    torch::Tensor& b_q_weight,
+    torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none,
+    torch::Tensor& workspace,
+    torch::Tensor& sorted_token_ids,
+    torch::Tensor& expert_ids,
+    torch::Tensor& num_tokens_past_padded,
+    torch::Tensor& topk_weights,
+    int64_t moe_block_size,
+    int64_t top_k,
+    bool mul_topk_weights,
+    bool is_ep,
+    sglang::ScalarTypeId const& b_q_type_id,
+    int64_t size_m,
+    int64_t size_n,
+    int64_t size_k,
+    bool is_k_full,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float) {
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});
+}
+
+#else
+
+// For a given "a" of size [M,K] performs a permutation of the K columns based
+// on the given "perm" indices.
+template <int moe_block_size>
+__global__ void permute_cols_kernel(
+    int4 const* __restrict__ a_int4_ptr,
+    int const* __restrict__ perm_int_ptr,
+    int4* __restrict__ out_int4_ptr,
+    const int32_t* __restrict__ sorted_token_ids_ptr,
+    const int32_t* __restrict__ expert_ids_ptr,
+    const int32_t* __restrict__ num_tokens_past_padded_ptr,
+    int size_m,
+    int size_k,
+    int top_k) {
+  int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
+  int num_moe_blocks = div_ceil(num_tokens_past_padded, moe_block_size);
+  int32_t block_sorted_ids[moe_block_size];
+  int block_num_valid_tokens = 0;
+  int64_t old_expert_id = 0;
+  int64_t expert_id = 0;
+  int row_stride = size_k * sizeof(half) / 16;
+
+  auto read_moe_block_data = [&](int block_id) {
+    block_num_valid_tokens = moe_block_size;
+    int4* tmp_block_sorted_ids = reinterpret_cast<int4*>(block_sorted_ids);
+    for (int i = 0; i < moe_block_size / 4; i++) {
+      tmp_block_sorted_ids[i] = ((int4*)sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i];
+    }
+    for (int i = 0; i < moe_block_size; i++) {
+      if (block_sorted_ids[i] >= size_m * top_k) {
+        block_num_valid_tokens = i;
+        break;
+      };
+    }
+  };
+
+  auto permute_row = [&](int row) {
+    int iters = size_k / default_threads;
+    int rest = size_k % default_threads;
+
+    int in_offset = (row / top_k) * row_stride;
+    int out_offset = row * row_stride;
+
+    half const* a_row_half = reinterpret_cast<half const*>(a_int4_ptr + in_offset);
+    half* out_half = reinterpret_cast<half*>(out_int4_ptr + out_offset);
+
+    int base_k = 0;
+
+    for (int i = 0; i < iters; i++) {
+      int cur_k = base_k + threadIdx.x;
+      int src_pos = perm_int_ptr[cur_k];
+
+      out_half[cur_k] = a_row_half[src_pos];
+
+      base_k += default_threads;
+    }
+
+    if (rest) {
+      if (threadIdx.x < rest) {
+        int cur_k = base_k + threadIdx.x;
+        int src_pos = perm_int_ptr[cur_k];
+
+        out_half[cur_k] = a_row_half[src_pos];
+      }
+    }
+  };
+
+  for (int index = blockIdx.x; index < num_moe_blocks; index += gridDim.x) {
+    old_expert_id = expert_id;
+    int tmp_expert_id = expert_ids_ptr[index];
+    if (tmp_expert_id == -1) continue;
+    expert_id = tmp_expert_id;
+    perm_int_ptr += (expert_id - old_expert_id) * size_k;
+    read_moe_block_data(index);
+
+    for (int i = 0; i < block_num_valid_tokens; i++)
+      permute_row(block_sorted_ids[i]);
+  }
+}
+
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+
+thread_config_t small_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {128, 128, 256},
+    {64, 128, 128}};
+
+thread_config_t large_batch_thread_configs[] = {
+    // Ordered by priority
+
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},
+    {64, 128, 128}};
+
+typedef struct {
+  int blocks_per_sm;
+  thread_config_t tb_cfg;
+} exec_config_t;
+
+int get_scales_cache_size(
+    thread_config_t const& th_config,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full) {
+  bool cache_scales_chunk = has_act_order && !is_k_full;
+
+  int tb_n = th_config.thread_n;
+  int tb_k = th_config.thread_k;
+
+  // Get max scale groups per thread-block
+  int tb_groups;
+  if (group_size == -1) {
+    tb_groups = 1;
+  } else if (group_size == 0) {
+    tb_groups = div_ceil(tb_k, 32);  // Worst case is 32 group size
+  } else {
+    tb_groups = div_ceil(tb_k, group_size);
+  }
+
+  if (cache_scales_chunk) {
+    int load_groups = tb_groups * pipe_stages * 2;  // Chunk size is 2x pipeline over dim K
+    load_groups = max(load_groups, 32);             // We load at least 32 scale groups
+    return load_groups * tb_n * 2;
+
+  } else {
+    int tb_scales = tb_groups * tb_n * 2;
+
+    return tb_scales * pipe_stages;
+  }
+}
+
+int get_kernel_cache_size(
+    thread_config_t const& th_config,
+    int thread_m_blocks,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full,
+    int has_zp,
+    int is_zp_float) {
+  int pack_factor = 32 / num_bits;
+
+  // Get B size
+  int tb_k = th_config.thread_k;
+  int tb_n = th_config.thread_n;
+  int tb_m = thread_m_blocks * 16;
+
+  // shm size for block_sorted_ids/block_topk_weights
+  // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
+  int sh_block_meta_size = tb_m * 4 * 2;
+  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
+  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
+  int sh_s_size =
+      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full);
+  int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
+  int sh_zp_size = 0;
+  if (has_zp) {
+    if (is_zp_float)
+      sh_zp_size = sh_s_size;
+    else if (num_bits == 4)
+      sh_zp_size = sh_s_size / 4;
+    else if (num_bits == 8)
+      sh_zp_size = sh_s_size / 2;
+  }
+
+  int total_size = sh_a_size + sh_b_size + sh_s_size + sh_zp_size + sh_g_idx_size + sh_block_meta_size;
+
+  return total_size;
+}
+
+bool is_valid_config(
+    thread_config_t const& th_config,
+    int thread_m_blocks,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full,
+    int has_zp,
+    int is_zp_float,
+    int max_shared_mem) {
+  // Sanity
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 || th_config.num_threads == -1) {
+    return false;
+  }
+
+  // Verify K/N are divisible by thread K/N
+  if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
+    return false;
+  }
+
+  // Verify min for thread K/N
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+
+  // Check that pipeline fits into cache
+  int cache_size = get_kernel_cache_size(
+      th_config,
+      thread_m_blocks,
+      prob_m,
+      prob_n,
+      prob_k,
+      num_bits,
+      group_size,
+      has_act_order,
+      is_k_full,
+      has_zp,
+      is_zp_float);
+  return cache_size <= max_shared_mem;
+}
+
+#define __GET_IF(                                                                                                     \
+    W_TYPE,                                                                                                           \
+    THREAD_M_BLOCKS,                                                                                                  \
+    THREAD_N_BLOCKS,                                                                                                  \
+    THREAD_K_BLOCKS,                                                                                                  \
+    M_BLOCK_SIZE_8,                                                                                                   \
+    HAS_ACT_ORDER,                                                                                                    \
+    HAS_ZP,                                                                                                           \
+    GROUP_BLOCKS,                                                                                                     \
+    NUM_THREADS,                                                                                                      \
+    IS_ZP_FLOAT)                                                                                                      \
+  else if (                                                                                                           \
+      q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && thread_n_blocks == THREAD_N_BLOCKS &&                 \
+      thread_k_blocks == THREAD_K_BLOCKS && m_block_size_8 == M_BLOCK_SIZE_8 && has_act_order == HAS_ACT_ORDER &&     \
+      has_zp == HAS_ZP && group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && is_zp_float == IS_ZP_FLOAT) { \
+    kernel = Marlin<                                                                                                  \
+        scalar_t,                                                                                                     \
+        W_TYPE.id(),                                                                                                  \
+        NUM_THREADS,                                                                                                  \
+        THREAD_M_BLOCKS,                                                                                              \
+        THREAD_N_BLOCKS,                                                                                              \
+        THREAD_K_BLOCKS,                                                                                              \
+        M_BLOCK_SIZE_8,                                                                                               \
+        pipe_stages,                                                                                                  \
+        HAS_ACT_ORDER,                                                                                                \
+        HAS_ZP,                                                                                                       \
+        GROUP_BLOCKS,                                                                                                 \
+        IS_ZP_FLOAT>;                                                                                                 \
+  }
+
+#define GPTQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)                        \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, true, false, 0, NUM_THREADS, false)    \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, false, 0, NUM_THREADS, false)   \
+                                                                                       \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, -1, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 2, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 4, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 8, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 8, NUM_THREADS, false)
+
+#define GPTQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)                      \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, false, 0, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, false, 0, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, false, 0, NUM_THREADS, false)   \
+                                                                                       \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 8, NUM_THREADS, false)  \
+                                                                                       \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 8, NUM_THREADS, false)  \
+                                                                                       \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 8, NUM_THREADS, false)
+
+#define AWQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)                        \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, -1, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 2, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 8, NUM_THREADS, false)   \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 8, NUM_THREADS, false)
+
+#define AWQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)                      \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 8, NUM_THREADS, false)  \
+                                                                                      \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 8, NUM_THREADS, false)  \
+                                                                                      \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, -1, NUM_THREADS, false) \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 2, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, false)  \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 8, NUM_THREADS, false)
+
+// We currently have 4-bit models only with group_blocks == 4
+#define HQQ_GET_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)                         \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, true)  \
+  __GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, true) \
+  __GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, true) \
+  __GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, true) \
+  __GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, NUM_THREADS, true)
+
+template <typename scalar_t>
+MarlinFuncPtr get_marlin_kernel(
+    const sglang::ScalarType q_type,
+    int thread_m_blocks,
+    int thread_n_blocks,
+    int thread_k_blocks,
+    bool m_block_size_8,
+    bool has_act_order,
+    bool has_zp,
+    int group_blocks,
+    int num_threads,
+    bool is_zp_float) {
+  int num_bits = q_type.size_bits();
+  auto kernel = MarlinDefault;
+  if (false) {
+  }
+  GPTQ_GET_IF_M1(sglang::kU4B8, 8, 8, 256)
+  GPTQ_GET_IF_M1(sglang::kU4B8, 8, 4, 128)
+
+  GPTQ_GET_IF_M234(sglang::kU4B8, 16, 4, 256)
+  GPTQ_GET_IF_M234(sglang::kU4B8, 8, 4, 128)
+
+  GPTQ_GET_IF_M1(sglang::kU8B128, 8, 8, 256)
+  GPTQ_GET_IF_M1(sglang::kU8B128, 8, 4, 128)
+
+  GPTQ_GET_IF_M234(sglang::kU8B128, 16, 4, 256)
+  GPTQ_GET_IF_M234(sglang::kU8B128, 8, 4, 128)
+
+  AWQ_GET_IF_M1(sglang::kU4, 8, 8, 256)
+  AWQ_GET_IF_M1(sglang::kU4, 8, 4, 128)
+
+  AWQ_GET_IF_M234(sglang::kU4, 16, 4, 256)
+  AWQ_GET_IF_M234(sglang::kU4, 8, 4, 128)
+
+  return kernel;
+}
+
+template <typename scalar_t>
+exec_config_t determine_exec_config(
+    const sglang::ScalarType& q_type,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    int thread_m_blocks,
+    bool m_block_size_8,
+    int num_bits,
+    int group_size,
+    bool has_act_order,
+    bool is_k_full,
+    bool has_zp,
+    bool is_zp_float,
+    int max_shared_mem) {
+  exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
+  thread_config_t* thread_configs = thread_m_blocks > 1 ? large_batch_thread_configs : small_batch_thread_configs;
+  int thread_configs_size = thread_m_blocks > 1 ? sizeof(large_batch_thread_configs) / sizeof(thread_config_t)
+                                                : sizeof(small_batch_thread_configs) / sizeof(thread_config_t);
+
+  int count = 0;
+  constexpr int device_max_reg_size = 255 * 1024;
+  for (int i = 0; i < thread_configs_size; i++) {
+    thread_config_t th_config = thread_configs[i];
+
+    if (!is_valid_config(
+            th_config,
+            thread_m_blocks,
+            prob_m,
+            prob_n,
+            prob_k,
+            num_bits,
+            group_size,
+            has_act_order,
+            is_k_full,
+            has_zp,
+            is_zp_float,
+            max_shared_mem)) {
+      continue;
+    }
+
+    int cache_size = get_kernel_cache_size(
+        th_config,
+        thread_m_blocks,
+        prob_m,
+        prob_n,
+        prob_k,
+        num_bits,
+        group_size,
+        has_act_order,
+        is_k_full,
+        has_zp,
+        is_zp_float);
+
+    int group_blocks = 0;
+    if (!has_act_order) {
+      group_blocks = group_size == -1 ? -1 : group_size / 16;
+    }
+
+    auto kernel = get_marlin_kernel<scalar_t>(
+        q_type,
+        thread_m_blocks,
+        th_config.thread_n / 16,
+        th_config.thread_k / 16,
+        m_block_size_8,
+        has_act_order,
+        has_zp,
+        group_blocks,
+        th_config.num_threads,
+        is_zp_float);
+
+    if (kernel == MarlinDefault) continue;
+
+    if (thread_m_blocks > 1) {
+      exec_cfg = {1, th_config};
+      break;
+    } else {
+      cudaFuncAttributes attr;
+      cudaFuncGetAttributes(&attr, kernel);
+      int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
+      int allow_count = min(device_max_reg_size / reg_size, max_shared_mem / (cache_size + 1024));
+      allow_count = max(min(allow_count, 4), 1);
+      if (allow_count > count) {
+        count = allow_count;
+        exec_cfg = {count, th_config};
+      };
+    }
+  }
+
+  return exec_cfg;
+}
+
+template <typename scalar_t>
+void marlin_mm(
+    const void* A,
+    const void* B,
+    void* C,
+    void* C_tmp,
+    void* s,
+    void* zp,
+    void* g_idx,
+    void* perm,
+    void* a_tmp,
+    void* sorted_token_ids,
+    void* expert_ids,
+    void* num_tokens_past_padded,
+    void* topk_weights,
+    int moe_block_size,
+    int top_k,
+    bool mul_topk_weights,
+    bool is_ep,
+    int prob_m,
+    int prob_n,
+    int prob_k,
+    void* workspace,
+    sglang::ScalarType const& q_type,
+    bool has_act_order,
+    bool is_k_full,
+    bool has_zp,
+    int num_groups,
+    int group_size,
+    int dev,
+    cudaStream_t stream,
+    int thread_k,
+    int thread_n,
+    int sms,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float) {
+  int thread_m_blocks = div_ceil(moe_block_size, 16);
+  bool m_block_size_8 = moe_block_size == 8;
+
+  if (has_zp) {
+    TORCH_CHECK(
+        q_type == sglang::kU4 || q_type == sglang::kU8,
+        "q_type must be u4 or u8 when has_zp = True. Got = ",
+        q_type.str());
+  } else {
+    TORCH_CHECK(
+        q_type == sglang::kU4B8 || q_type == sglang::kU8B128,
+        "q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ",
+        q_type.str());
+  }
+
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]");
+
+  int group_blocks = 0;
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(group_size != -1);
+      group_blocks = group_size / 16;
+      TORCH_CHECK(
+          prob_k % group_blocks == 0, "prob_k = ", prob_k, " is not divisible by group_blocks = ", group_blocks);
+    } else {
+      TORCH_CHECK(group_size == 0);
+      group_blocks = 0;
+    }
+  } else {
+    if (group_size == -1) {
+      group_blocks = -1;
+    } else {
+      group_blocks = group_size / 16;
+      TORCH_CHECK(
+          prob_k % group_blocks == 0, "prob_k = ", prob_k, " is not divisible by group_blocks = ", group_blocks);
+    }
+  }
+
+  int num_bits = q_type.size_bits();
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  int4* C_ptr = (int4*)C;
+  int4* C_tmp_ptr = (int4*)C_tmp;
+  const int4* s_ptr = (const int4*)s;
+  const int4* zp_ptr = (const int4*)zp;
+  const int* g_idx_ptr = (const int*)g_idx;
+  const int* perm_ptr = (const int*)perm;
+  int4* a_tmp_ptr = (int4*)a_tmp;
+  const int32_t* sorted_token_ids_ptr = (const int32_t*)sorted_token_ids;
+  const int32_t* expert_ids_ptr = (const int32_t*)expert_ids;
+  const int32_t* num_tokens_past_padded_ptr = (const int32_t*)num_tokens_past_padded;
+  const float* topk_weights_ptr = (const float*)topk_weights;
+  int* locks = (int*)workspace;
+
+  if (has_act_order) {
+    // Permute A columns
+    auto kernel = permute_cols_kernel<8>;
+    if (moe_block_size == 8) {
+    } else if (moe_block_size == 16)
+      kernel = permute_cols_kernel<16>;
+    else if (moe_block_size == 32)
+      kernel = permute_cols_kernel<32>;
+    else if (moe_block_size == 48)
+      kernel = permute_cols_kernel<48>;
+    else if (moe_block_size == 64)
+      kernel = permute_cols_kernel<64>;
+    else
+      TORCH_CHECK(false, "unsupported moe_block_size ", moe_block_size);
+
+    // avoid ">>>" being formatted to "> > >"
+    // clang-format off
+    kernel<<<sms, default_threads, 0, stream>>>(
+        A_ptr, perm_ptr, a_tmp_ptr, sorted_token_ids_ptr, expert_ids_ptr,
+        num_tokens_past_padded_ptr, prob_m, prob_k, top_k);
+    // clang-format on
+    A_ptr = a_tmp_ptr;
+    prob_m = prob_m * top_k;
+    top_k = 1;
+
+    // If we have a full K, then we can run the non-act-order version of Marlin
+    // (since the weight rows are reordered by increasing group ids, and by
+    // having a full K, we have full original groups)
+    if (is_k_full) has_act_order = false;
+  }
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  // Set thread config
+  exec_config_t exec_cfg;
+  thread_config_t thread_tfg;
+  if (thread_k != -1 && thread_n != -1) {
+    thread_tfg = thread_config_t{thread_k, thread_n, default_threads};
+    exec_cfg = exec_config_t{1, thread_tfg};
+    TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, " is not divisible by thread_n = ", thread_n);
+    TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, " is not divisible by thread_k = ", thread_k);
+  } else {
+    // Auto config
+    exec_cfg = determine_exec_config<scalar_t>(
+        q_type,
+        prob_m,
+        prob_n,
+        prob_k,
+        thread_m_blocks,
+        m_block_size_8,
+        num_bits,
+        group_size,
+        has_act_order,
+        is_k_full,
+        has_zp,
+        is_zp_float,
+        max_shared_mem);
+    thread_tfg = exec_cfg.tb_cfg;
+  }
+
+  int num_threads = thread_tfg.num_threads;
+  thread_k = thread_tfg.thread_k;
+  thread_n = thread_tfg.thread_n;
+  int blocks = sms * exec_cfg.blocks_per_sm;
+  if (exec_cfg.blocks_per_sm > 1) max_shared_mem = max_shared_mem / exec_cfg.blocks_per_sm - 1024;
+
+  int thread_k_blocks = thread_k / 16;
+  int thread_n_blocks = thread_n / 16;
+
+  TORCH_CHECK(
+      is_valid_config(
+          thread_tfg,
+          thread_m_blocks,
+          prob_m,
+          prob_n,
+          prob_k,
+          num_bits,
+          group_size,
+          has_act_order,
+          is_k_full,
+          has_zp,
+          is_zp_float,
+          max_shared_mem),
+      "Invalid thread config: thread_m_blocks = ",
+      thread_m_blocks,
+      ", thread_k = ",
+      thread_tfg.thread_k,
+      ", thread_n = ",
+      thread_tfg.thread_n,
+      ", num_threads = ",
+      thread_tfg.num_threads,
+      " for MKN = [",
+      prob_m,
+      ", ",
+      prob_k,
+      ", ",
+      prob_n,
+      "] and num_bits = ",
+      num_bits,
+      ", group_size = ",
+      group_size,
+      ", has_act_order = ",
+      has_act_order,
+      ", is_k_full = ",
+      is_k_full,
+      ", has_zp = ",
+      has_zp,
+      ", is_zp_float = ",
+      is_zp_float,
+      ", max_shared_mem = ",
+      max_shared_mem);
+
+  auto kernel = get_marlin_kernel<scalar_t>(
+      q_type,
+      thread_m_blocks,
+      thread_n_blocks,
+      thread_k_blocks,
+      m_block_size_8,
+      has_act_order,
+      has_zp,
+      group_blocks,
+      num_threads,
+      is_zp_float);
+
+  if (kernel == MarlinDefault) {
+    TORCH_CHECK(
+        false,
+        "Unsupported shapes: MNK = [",
+        prob_m,
+        ", ",
+        prob_n,
+        ", ",
+        prob_k,
+        "]",
+        ", has_act_order = ",
+        has_act_order,
+        ", num_groups = ",
+        num_groups,
+        ", group_size = ",
+        group_size,
+        ", thread_m_blocks = ",
+        thread_m_blocks,
+        ", thread_n_blocks = ",
+        thread_n_blocks,
+        ", thread_k_blocks = ",
+        thread_k_blocks,
+        ", num_bits = ",
+        num_bits);
+  }
+
+  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);
+  // avoid ">>>" being formatted to "> > >"
+  // clang-format off
+  kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
+      A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,
+      sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
+      topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
+      prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce);
+  // clang-format on
+}
+
+}  // namespace MARLIN_NAMESPACE_NAME
+
+torch::Tensor moe_wna16_marlin_gemm(
+    torch::Tensor& a,
+    std::optional<torch::Tensor> const& c_or_none,
+    torch::Tensor& b_q_weight,
+    torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none,
+    torch::Tensor& workspace,
+    torch::Tensor& sorted_token_ids,
+    torch::Tensor& expert_ids,
+    torch::Tensor& num_tokens_past_padded,
+    torch::Tensor& topk_weights,
+    int64_t moe_block_size,
+    int64_t top_k,
+    bool mul_topk_weights,
+    bool is_ep,
+    sglang::ScalarTypeId const& b_q_type_id,
+    int64_t size_m,
+    int64_t size_n,
+    int64_t size_k,
+    bool is_k_full,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float) {
+  sglang::ScalarType const b_q_type = sglang::ScalarType::from_id(b_q_type_id);
+  int pack_factor = 32 / b_q_type.size_bits();
+
+  if (moe_block_size != 8) {
+    TORCH_CHECK(moe_block_size % 16 == 0, "unsupported moe_block_size=", moe_block_size);
+    TORCH_CHECK(moe_block_size >= 16 && moe_block_size <= 64, "unsupported moe_block_size=", moe_block_size);
+  }
+
+  // Verify A
+  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), ", size_m = ", size_m);
+  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1), ", size_k = ", size_k);
+
+  // Verify B
+  TORCH_CHECK(
+      size_k % MARLIN_NAMESPACE_NAME::tile_size == 0,
+      "size_k = ",
+      size_k,
+      " is not divisible by tile_size = ",
+      MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK(
+      (size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(1),
+      "Shape mismatch: b_q_weight.size(1) = ",
+      b_q_weight.size(1),
+      ", size_k = ",
+      size_k,
+      ", tile_size = ",
+      MARLIN_NAMESPACE_NAME::tile_size);
+  TORCH_CHECK(
+      b_q_weight.size(2) % MARLIN_NAMESPACE_NAME::tile_size == 0,
+      "b_q_weight.size(2) = ",
+      b_q_weight.size(2),
+      " is not divisible by tile_size = ",
+      MARLIN_NAMESPACE_NAME::tile_size);
+  int actual_size_n = (b_q_weight.size(2) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor;
+  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n, ", actual_size_n = ", actual_size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_k = -1;
+  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_n = -1;
+  // sms: number of SMs to use for the kernel
+  int sms = -1;
+  cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  torch::Tensor c;
+  if (c_or_none.has_value()) {
+    c = c_or_none.value();
+    TORCH_CHECK(c.device().is_cuda(), "c is not on GPU");
+    TORCH_CHECK(c.is_contiguous(), "c is not contiguous");
+    TORCH_CHECK(
+        c.size(0) == size_m * top_k, "Shape mismatch: c.size(0) = ", c.size(0), ", size_m * topk = ", size_m * top_k);
+    TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1), ", size_n = ", size_n);
+  } else {
+    c = torch::empty({size_m * top_k, size_n}, options);
+  }
+
+  // Alloc C tmp buffer that is going to be used for the global reduce
+  torch::Tensor c_tmp;
+  auto options_fp32 = torch::TensorOptions().dtype(at::kFloat).device(a.device());
+  if (use_fp32_reduce && !use_atomic_add) {
+    // max num of threadblocks is sms * 4
+    long max_c_tmp_size = min(
+        (long)size_n * sorted_token_ids.size(0), (long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n);
+    if (moe_block_size == 8) max_c_tmp_size *= 2;
+    c_tmp = torch::empty({max_c_tmp_size}, options_fp32);
+  } else {
+    c_tmp = torch::empty({0}, options_fp32);
+  }
+
+  // Detect groupsize and act_order
+  int num_groups = -1;
+  int group_size = -1;
+
+  int rank = b_scales.sizes().size();
+  TORCH_CHECK(rank == 3, "b_scales rank = ", rank, " is not 3");
+  TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2), " is not size_n = ", size_n);
+  num_groups = b_scales.size(1);
+
+  torch::Tensor g_idx, perm, a_tmp;
+  ;
+  if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
+    g_idx = g_idx_or_none.value();
+    perm = perm_or_none.value();
+
+    TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
+    TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
+    TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+    TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+
+    // Verify g_idx and perm
+    TORCH_CHECK(
+        (g_idx.size(-1) == 0 && perm.size(-1) == 0) || (g_idx.size(-1) == size_k && perm.size(-1) == size_k),
+        "Unexpected g_idx.size(-1) = ",
+        g_idx.size(-1),
+        " and perm.size(-1) = ",
+        perm.size(-1),
+        ", where size_k = ",
+        size_k);
+  } else {
+    g_idx = torch::empty({0}, options);
+    perm = torch::empty({0}, options);
+    a_tmp = torch::empty({0}, options);
+  }
+  bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0;
+
+  if (has_act_order) {
+    a_tmp = torch::empty({size_m * top_k, size_k}, options);
+    if (is_k_full) {
+      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
+      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k, ", is not divisible by num_groups = ", num_groups);
+      group_size = size_k / num_groups;
+    } else {
+      group_size = 0;
+    }
+
+  } else {
+    a_tmp = torch::empty({0}, options);
+    if (num_groups > 1) {
+      TORCH_CHECK(
+          size_k % num_groups == 0, "size_k = ", size_k, ", is not divisible by b_scales.size(1) = ", b_scales.size(1));
+      group_size = size_k / num_groups;
+    } else {
+      group_size = -1;
+    }
+  }
+
+  torch::Tensor b_zeros;
+  if (b_zeros_or_none.has_value()) {
+    b_zeros = b_zeros_or_none.value();
+    TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU");
+    TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous");
+  } else {
+    b_zeros = torch::empty({0}, options);
+  }
+  bool has_zp = b_zeros.size(-1) > 0;
+
+  if (has_zp) {
+    TORCH_CHECK(b_q_type == sglang::kU4, "b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str());
+  } else {
+    TORCH_CHECK(
+        b_q_type == sglang::kU4B8 || b_q_type == sglang::kU8B128,
+        "b_q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ",
+        b_q_type.str());
+  }
+
+  if (has_zp && is_zp_float) {
+    TORCH_CHECK(
+        a.scalar_type() == at::ScalarType::Half,
+        "Computation type must be float16 (half) when using float zero "
+        "points.");
+  }
+
+  // Verify b_zeros
+  if (has_zp) {
+    int rank = b_zeros.sizes().size();
+    TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3");
+    if (is_zp_float) {
+      TORCH_CHECK(b_zeros.size(2) == size_n, "b_zeros dim 2 = ", b_zeros.size(2), " is not size_n = ", size_n);
+      TORCH_CHECK(
+          num_groups == b_zeros.size(1), "b_zeros dim 1 = ", b_zeros.size(1), " is not num_groups = ", num_groups);
+      TORCH_CHECK(num_groups != -1, "num_groups must be != -1");
+    } else {
+      TORCH_CHECK(
+          b_zeros.size(1) == num_groups, "b_zeros dim 1 = ", b_zeros.size(1), " is not num_groups = ", num_groups);
+      TORCH_CHECK(
+          b_zeros.size(2) == size_n / pack_factor,
+          "b_zeros dim 2 = ",
+          b_zeros.size(2),
+          " is not size_n / pack_factor = ",
+          size_n / pack_factor);
+    }
+  }
+
+  // Verify workspace size
+  TORCH_CHECK(
+      size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0,
+      "size_n = ",
+      size_n,
+      ", is not divisible by min_thread_n = ",
+      MARLIN_NAMESPACE_NAME::min_thread_n);
+
+  int max_n_tiles = size_n / MARLIN_NAMESPACE_NAME::min_thread_n;
+  int min_workspace_size = min(max_n_tiles * (int)(sorted_token_ids.size(0) / moe_block_size), sms * 4);
+  TORCH_CHECK(
+      workspace.numel() >= min_workspace_size,
+      "workspace.numel = ",
+      workspace.numel(),
+      " is below min_workspace_size = ",
+      min_workspace_size);
+
+  int dev = a.get_device();
+  if (a.scalar_type() == at::ScalarType::Half) {
+    MARLIN_NAMESPACE_NAME::marlin_mm<half>(
+        a.data_ptr<at::Half>(),
+        b_q_weight.data_ptr(),
+        c.data_ptr<at::Half>(),
+        c_tmp.data_ptr<float>(),
+        b_scales.data_ptr<at::Half>(),
+        b_zeros.data_ptr(),
+        g_idx.data_ptr(),
+        perm.data_ptr(),
+        a_tmp.data_ptr<at::Half>(),
+        sorted_token_ids.data_ptr(),
+        expert_ids.data_ptr(),
+        num_tokens_past_padded.data_ptr(),
+        topk_weights.data_ptr(),
+        moe_block_size,
+        top_k,
+        mul_topk_weights,
+        is_ep,
+        size_m,
+        size_n,
+        size_k,
+        workspace.data_ptr(),
+        b_q_type,
+        has_act_order,
+        is_k_full,
+        has_zp,
+        num_groups,
+        group_size,
+        dev,
+        at::cuda::getCurrentCUDAStream(dev),
+        thread_k,
+        thread_n,
+        sms,
+        use_atomic_add,
+        use_fp32_reduce,
+        is_zp_float);
+  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
+    MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
+        a.data_ptr<at::BFloat16>(),
+        b_q_weight.data_ptr(),
+        c.data_ptr<at::BFloat16>(),
+        c_tmp.data_ptr<float>(),
+        b_scales.data_ptr<at::BFloat16>(),
+        b_zeros.data_ptr(),
+        g_idx.data_ptr(),
+        perm.data_ptr(),
+        a_tmp.data_ptr<at::BFloat16>(),
+        sorted_token_ids.data_ptr(),
+        expert_ids.data_ptr(),
+        num_tokens_past_padded.data_ptr(),
+        topk_weights.data_ptr(),
+        moe_block_size,
+        top_k,
+        mul_topk_weights,
+        is_ep,
+        size_m,
+        size_n,
+        size_k,
+        workspace.data_ptr(),
+        b_q_type,
+        has_act_order,
+        is_k_full,
+        has_zp,
+        num_groups,
+        group_size,
+        dev,
+        at::cuda::getCurrentCUDAStream(dev),
+        thread_k,
+        thread_n,
+        sms,
+        use_atomic_add,
+        use_fp32_reduce,
+        is_zp_float);
+  } else {
+    TORCH_CHECK(false, "moe_wna16_marlin_gemm only supports bfloat16 and float16");
+  }
+
+  return c;
+}
+
+#endif
diff --git a/sgl-kernel/csrc/moe/moe_align_kernel.cu b/sgl-kernel/csrc/moe/moe_align_kernel.cu
new file mode 100644
index 000000000..64429e1ac
--- /dev/null
+++ b/sgl-kernel/csrc/moe/moe_align_kernel.cu
@@ -0,0 +1,364 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <THC/THCAtomics.cuh>
+
+#include "utils.h"
+#define WARP_SIZE 64
+#define VEC_SIZE 4
+using Vec = int4;
+
+template <typename scalar_t>
+__global__ void count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ cumsum_buffer,
+    size_t numel) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i] + 1;
+    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
+    sorted_token_ids[rank_post_pad] = i;
+  }
+}
+
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ int warp_exclusive_scan(int v, unsigned mask = 0xffffffffu) {
+  int original = v;
+#pragma unroll
+  for (int offset = 1; offset < WARP_SIZE; offset <<= 1) {
+    int n = __shfl_up(v, offset);
+    if ((threadIdx.x & (WARP_SIZE - 1)) >= offset) v += n;
+  }
+  return v - original;
+}
+#endif
+
+template <typename scalar_t>
+__global__ void moe_align_block_size_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t num_experts,
+    int32_t block_size,
+    size_t numel,
+    int32_t* __restrict__ cumsum,
+    bool pad_sorted_token_ids,
+    const int32_t scan_size) {
+  extern __shared__ int32_t smem[];
+  int32_t* shared_counts = smem;                  // [num_experts]
+  int32_t* prefix = shared_counts + num_experts;  // [num_experts + 1]
+  int32_t* scan_buf = prefix + num_experts + 1;   // [scan_size]
+  __shared__ int32_t s_total_tokens_post_pad;
+
+  const size_t tid = threadIdx.x;
+  const size_t stride = blockDim.x;
+
+  if (tid < num_experts) {
+    shared_counts[tid] = 0;
+  }
+
+  __syncthreads();
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int expert_id = topk_ids[i] + 1;
+    atomicAdd(&shared_counts[expert_id], 1);
+  }
+
+  __syncthreads();
+
+  int32_t padded_count = 0;
+  if (tid < num_experts) {
+    int32_t count = shared_counts[tid];
+    padded_count = (count + block_size - 1) / block_size * block_size;
+    scan_buf[tid] = padded_count;
+  }
+
+#ifndef __CUDA_ARCH__  // HIP
+
+  if (tid >= num_experts && tid < scan_size) {
+    scan_buf[tid] = 0;
+  }
+
+  __syncthreads();
+
+  // Blelloch scan
+  int offset = 1;
+#pragma unroll
+  for (int d = scan_size >> 1; d > 0; d >>= 1) {
+    if (tid < d) {
+      int ai = offset * (2 * tid + 1) - 1;
+      int bi = offset * (2 * tid + 2) - 1;
+      scan_buf[bi] += scan_buf[ai];
+    }
+    offset <<= 1;
+    __syncthreads();
+  }
+
+  // down-sweep
+  if (tid == 0) {
+    prefix[num_experts] = scan_buf[scan_size - 1];
+    scan_buf[scan_size - 1] = 0;
+  }
+  __syncthreads();
+
+#pragma unroll
+  for (int d = 1; d < scan_size; d <<= 1) {
+    offset >>= 1;
+    if (tid < d) {
+      int ai = offset * (2 * tid + 1) - 1;
+      int bi = offset * (2 * tid + 2) - 1;
+      if (bi < scan_size) {
+        int temp = scan_buf[ai];
+        scan_buf[ai] = scan_buf[bi];
+        scan_buf[bi] += temp;
+      }
+    }
+    __syncthreads();
+  }
+
+  if (tid < num_experts) {
+    prefix[tid] = scan_buf[tid];
+  }
+
+  if (tid == 0) {
+    s_total_tokens_post_pad = prefix[num_experts];
+    *total_tokens_post_pad = s_total_tokens_post_pad;
+  }
+  __syncthreads();
+
+#else  // CUDA
+
+  // Intra warp prefix sum
+  int32_t* warp_sums = scan_buf + scan_size;  // [<= 32]
+  const int warp_id = tid / WARP_SIZE;
+  const int lane_id = tid & (WARP_SIZE - 1);
+  const int num_warps_for_scan = (scan_size + WARP_SIZE - 1) / WARP_SIZE;
+  const int warp_sum = warp_exclusive_scan(padded_count) + padded_count;
+  if (lane_id == WARP_SIZE - 1) warp_sums[warp_id] = warp_sum;
+  __syncthreads();
+
+  // warp0 accumulate all the block's prefix sum
+  if (tid < WARP_SIZE) {
+    int val = (tid < num_warps_for_scan) ? warp_sums[tid] : 0;
+    int incl = warp_exclusive_scan(val) + val;
+    warp_sums[tid] = incl;
+  }
+  __syncthreads();
+
+  // Every thread obtains the whole block's sum
+  if (tid == 0) {
+    prefix[num_experts] = warp_sums[num_warps_for_scan - 1];
+    s_total_tokens_post_pad = prefix[num_experts];
+    *total_tokens_post_pad = s_total_tokens_post_pad;
+  }
+  __syncthreads();
+
+  // Fill 0 to scan_buf extended area (tid >= num_expert)
+  if (tid >= num_experts && tid < scan_size) scan_buf[tid] = 0;
+  __syncthreads();
+
+  // Perform 2 level exclusive-prefix-sum to scan_buf
+  int v = (tid < scan_size) ? scan_buf[tid] : 0;
+  int pre = warp_exclusive_scan(v);
+  if (lane_id == WARP_SIZE - 1) warp_sums[warp_id] = pre + v;
+  __syncthreads();
+
+  if (warp_id == 0) {
+    int val = (lane_id < num_warps_for_scan) ? warp_sums[lane_id] : 0;
+    warp_sums[lane_id] = warp_exclusive_scan(val);
+  }
+  __syncthreads();
+
+  int offset = warp_sums[warp_id];
+  if (tid < scan_size) scan_buf[tid] = pre + offset;
+  __syncthreads();
+
+  // Write prefix[0..num_experts - 1] and cumsum
+  if (tid < num_experts) prefix[tid] = scan_buf[tid];
+#endif
+
+  if (tid <= num_experts) {
+    cumsum[tid] = prefix[tid];
+  }
+  // fill expert_ids
+  const int32_t num_blocks = s_total_tokens_post_pad / block_size;
+  for (int32_t i = tid; i < num_blocks; i += stride) {
+    int32_t block_start = i * block_size;
+    int left = 0, right = num_experts;
+    while (left < right) {
+      int mid = (left + right) >> 1;
+      if (prefix[mid] <= block_start) {
+        left = mid + 1;
+      } else {
+        right = mid;
+      }
+    }
+    expert_ids[i] = left - 2;
+  }
+
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (s_total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void moe_align_block_size_small_batch_expert_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t num_experts,
+    int32_t block_size,
+    size_t numel,
+    bool pad_sorted_token_ids) {
+  const size_t tid = threadIdx.x;
+  const size_t stride = blockDim.x;
+
+  extern __shared__ int32_t shared_mem[];
+  int32_t* cumsum = shared_mem;
+  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
+  }
+
+  for (size_t i = tid; i < numel; i += stride) {
+    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i] + 1];
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < num_experts) {
+    tokens_cnts[threadIdx.x] = 0;
+    for (int i = 1; i <= blockDim.x; ++i) {
+      tokens_cnts[i * num_experts + threadIdx.x] += tokens_cnts[(i - 1) * num_experts + threadIdx.x];
+    }
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] = cumsum[i - 1] + CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) * block_size;
+    }
+    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; i += block_size) {
+      expert_ids[i / block_size] = threadIdx.x - 1;
+    }
+  }
+
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (*total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
+  }
+
+  __syncthreads();
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i] + 1;
+    int32_t rank_post_pad = tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
+    sorted_token_ids[rank_post_pad] = i;
+    ++tokens_cnts[threadIdx.x * num_experts + expert_id];
+  }
+}
+
+void moe_align_block_size(
+    torch::Tensor topk_ids,
+    int64_t num_experts,
+    int64_t block_size,
+    torch::Tensor sorted_token_ids,
+    torch::Tensor experts_ids,
+    torch::Tensor num_tokens_post_pad,
+    torch::Tensor cumsum_buffer,
+    bool pad_sorted_token_ids) {
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int threads = 1024;
+
+  threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+
+  DISPATCH_INTEGRAL_TYPES(topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+    bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64);
+
+    if (small_batch_expert_mode) {
+      const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
+      const int32_t shared_mem_size = ((threads + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
+
+      auto small_batch_expert_kernel = moe_align_block_size_small_batch_expert_kernel<scalar_t>;
+      small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
+          topk_ids.data_ptr<scalar_t>(),
+          sorted_token_ids.data_ptr<int32_t>(),
+          experts_ids.data_ptr<int32_t>(),
+          num_tokens_post_pad.data_ptr<int32_t>(),
+          num_experts,
+          block_size,
+          topk_ids.numel(),
+          pad_sorted_token_ids);
+    } else {
+      auto align_kernel = moe_align_block_size_kernel<scalar_t>;
+
+      const size_t scan_size = next_pow2(num_experts);
+      const size_t shared_mem_size = (num_experts + (num_experts + 1) + scan_size + WARP_SIZE) * sizeof(int32_t);
+      align_kernel<<<1, threads, shared_mem_size, stream>>>(
+          topk_ids.data_ptr<scalar_t>(),
+          sorted_token_ids.data_ptr<int32_t>(),
+          experts_ids.data_ptr<int32_t>(),
+          num_tokens_post_pad.data_ptr<int32_t>(),
+          num_experts,
+          block_size,
+          topk_ids.numel(),
+          cumsum_buffer.data_ptr<int32_t>(),
+          pad_sorted_token_ids,
+          scan_size);
+
+      const int block_threads = std::min(256, (int)threads);
+      const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads;
+      const int max_blocks = 65535;
+      const int actual_blocks = std::min(num_blocks, max_blocks);
+
+      auto sort_kernel = count_and_sort_expert_tokens_kernel<scalar_t>;
+      sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
+          topk_ids.data_ptr<scalar_t>(),
+          sorted_token_ids.data_ptr<int32_t>(),
+          cumsum_buffer.data_ptr<int32_t>(),
+          topk_ids.numel());
+    }
+  });
+}
diff --git a/sgl-kernel/csrc/moe/moe_align_kernel.hip b/sgl-kernel/csrc/moe/moe_align_kernel.hip
new file mode 100644
index 000000000..c462ac6ce
--- /dev/null
+++ b/sgl-kernel/csrc/moe/moe_align_kernel.hip
@@ -0,0 +1,367 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/ATen.h>
+#include <ATen/hip/HIPContext.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+
+#include <THH/THHAtomics.cuh>
+
+#include "utils_hip.h"
+#define WARP_SIZE 64
+#define VEC_SIZE 4
+using Vec = int4;
+
+template <typename scalar_t>
+__global__ void count_and_sort_expert_tokens_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ cumsum_buffer,
+    size_t numel) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i] + 1;
+    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
+    sorted_token_ids[rank_post_pad] = i;
+  }
+}
+
+#ifdef __DTK_ARCH__
+__device__ __forceinline__ int warp_exclusive_scan(int v, unsigned mask = 0xffffffffu) {
+  int original = v;
+#pragma unroll
+  for (int offset = 1; offset < WARP_SIZE; offset <<= 1) {
+    int n = __shfl_up(v, offset);
+    if ((threadIdx.x & (WARP_SIZE - 1)) >= offset) v += n;
+  }
+  return v - original;
+}
+#endif
+
+template <typename scalar_t>
+__global__ void moe_align_block_size_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t num_experts,
+    int32_t block_size,
+    size_t numel,
+    int32_t* __restrict__ cumsum,
+    bool pad_sorted_token_ids,
+    const int32_t scan_size) {
+  extern __shared__ int32_t smem[];
+  int32_t* shared_counts = smem;                  // [num_experts]
+  int32_t* prefix = shared_counts + num_experts;  // [num_experts + 1]
+  int32_t* scan_buf = prefix + num_experts + 1;   // [scan_size]
+  __shared__ int32_t s_total_tokens_post_pad;
+
+  const size_t tid = threadIdx.x;
+  const size_t stride = blockDim.x;
+
+  if (tid < num_experts) {
+    shared_counts[tid] = 0;
+  }
+
+  __syncthreads();
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int expert_id = topk_ids[i] + 1;
+    atomicAdd(&shared_counts[expert_id], 1);
+  }
+
+  __syncthreads();
+
+  int32_t padded_count = 0;
+  if (tid < num_experts) {
+    int32_t count = shared_counts[tid];
+    padded_count = (count + block_size - 1) / block_size * block_size;
+    scan_buf[tid] = padded_count;
+  }
+
+#ifndef __DTK_ARCH__  // HIP
+
+  if (tid >= num_experts && tid < scan_size) {
+    scan_buf[tid] = 0;
+  }
+
+  __syncthreads();
+
+  // Blelloch scan
+  int offset = 1;
+#pragma unroll
+  for (int d = scan_size >> 1; d > 0; d >>= 1) {
+    if (tid < d) {
+      int ai = offset * (2 * tid + 1) - 1;
+      int bi = offset * (2 * tid + 2) - 1;
+      scan_buf[bi] += scan_buf[ai];
+    }
+    offset <<= 1;
+    __syncthreads();
+  }
+
+  // down-sweep
+  if (tid == 0) {
+    prefix[num_experts] = scan_buf[scan_size - 1];
+    scan_buf[scan_size - 1] = 0;
+  }
+  __syncthreads();
+
+#pragma unroll
+  for (int d = 1; d < scan_size; d <<= 1) {
+    offset >>= 1;
+    if (tid < d) {
+      int ai = offset * (2 * tid + 1) - 1;
+      int bi = offset * (2 * tid + 2) - 1;
+      if (bi < scan_size) {
+        int temp = scan_buf[ai];
+        scan_buf[ai] = scan_buf[bi];
+        scan_buf[bi] += temp;
+      }
+    }
+    __syncthreads();
+  }
+
+  if (tid < num_experts) {
+    prefix[tid] = scan_buf[tid];
+  }
+
+  if (tid == 0) {
+    s_total_tokens_post_pad = prefix[num_experts];
+    *total_tokens_post_pad = s_total_tokens_post_pad;
+  }
+  __syncthreads();
+
+#else  // CUDA
+
+  // Intra warp prefix sum
+  int32_t* warp_sums = scan_buf + scan_size;  // [<= 32]
+  const int warp_id = tid / WARP_SIZE;
+  const int lane_id = tid & (WARP_SIZE - 1);
+  const int num_warps_for_scan = (scan_size + WARP_SIZE - 1) / WARP_SIZE;
+  const int warp_sum = warp_exclusive_scan(padded_count) + padded_count;
+  if (lane_id == WARP_SIZE - 1) warp_sums[warp_id] = warp_sum;
+  __syncthreads();
+
+  // warp0 accumulate all the block's prefix sum
+  if (tid < WARP_SIZE) {
+    int val = (tid < num_warps_for_scan) ? warp_sums[tid] : 0;
+    int incl = warp_exclusive_scan(val) + val;
+    warp_sums[tid] = incl;
+  }
+  __syncthreads();
+
+  // Every thread obtains the whole block's sum
+  if (tid == 0) {
+    prefix[num_experts] = warp_sums[num_warps_for_scan - 1];
+    s_total_tokens_post_pad = prefix[num_experts];
+    *total_tokens_post_pad = s_total_tokens_post_pad;
+  }
+  __syncthreads();
+
+  // Fill 0 to scan_buf extended area (tid >= num_expert)
+  if (tid >= num_experts && tid < scan_size) scan_buf[tid] = 0;
+  __syncthreads();
+
+  // Perform 2 level exclusive-prefix-sum to scan_buf
+  int v = (tid < scan_size) ? scan_buf[tid] : 0;
+  int pre = warp_exclusive_scan(v);
+  if (lane_id == WARP_SIZE - 1) warp_sums[warp_id] = pre + v;
+  __syncthreads();
+
+  if (warp_id == 0) {
+    int val = (lane_id < num_warps_for_scan) ? warp_sums[lane_id] : 0;
+    warp_sums[lane_id] = warp_exclusive_scan(val);
+  }
+  __syncthreads();
+
+  int offset = warp_sums[warp_id];
+  if (tid < scan_size) scan_buf[tid] = pre + offset;
+  __syncthreads();
+
+  // Write prefix[0..num_experts - 1] and cumsum
+  if (tid < num_experts) prefix[tid] = scan_buf[tid];
+#endif
+
+  if (tid <= num_experts) {
+    cumsum[tid] = prefix[tid];
+  }
+  // fill expert_ids
+  const int32_t num_blocks = s_total_tokens_post_pad / block_size;
+  for (int32_t i = tid; i < num_blocks; i += stride) {
+    int32_t block_start = i * block_size;
+    int left = 0, right = num_experts;
+    while (left < right) {
+      int mid = (left + right) >> 1;
+      if (prefix[mid] <= block_start) {
+        left = mid + 1;
+      } else {
+        right = mid;
+      }
+    }
+    expert_ids[i] = left - 2;
+  }
+
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (s_total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void moe_align_block_size_small_batch_expert_kernel(
+    const scalar_t* __restrict__ topk_ids,
+    int32_t* __restrict__ sorted_token_ids,
+    int32_t* __restrict__ expert_ids,
+    int32_t* __restrict__ total_tokens_post_pad,
+    int32_t num_experts,
+    int32_t block_size,
+    size_t numel,
+    bool pad_sorted_token_ids) {
+  const size_t tid = threadIdx.x;
+  const size_t stride = blockDim.x;
+
+  extern __shared__ int32_t shared_mem[];
+  int32_t* cumsum = shared_mem;
+  int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
+
+  for (int i = 0; i < num_experts; ++i) {
+    tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
+  }
+
+  for (size_t i = tid; i < numel; i += stride) {
+    ++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i] + 1];
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < num_experts) {
+    tokens_cnts[threadIdx.x] = 0;
+    for (int i = 1; i <= blockDim.x; ++i) {
+      tokens_cnts[i * num_experts + threadIdx.x] += tokens_cnts[(i - 1) * num_experts + threadIdx.x];
+    }
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    cumsum[0] = 0;
+    for (int i = 1; i <= num_experts; ++i) {
+      cumsum[i] = cumsum[i - 1] + CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) * block_size;
+    }
+    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x < num_experts) {
+    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; i += block_size) {
+      expert_ids[i / block_size] = threadIdx.x - 1;
+    }
+  }
+
+  if (pad_sorted_token_ids) {
+    Vec fill_vec;
+    fill_vec.x = fill_vec.y = fill_vec.z = fill_vec.w = numel;
+    int32_t total_vecs = (*total_tokens_post_pad + VEC_SIZE - 1) / VEC_SIZE;
+    Vec* out_ptr = reinterpret_cast<Vec*>(sorted_token_ids);
+    for (int32_t i = tid; i < total_vecs; i += stride) {
+      out_ptr[i] = fill_vec;
+    }
+  }
+
+  __syncthreads();
+
+  for (size_t i = tid; i < numel; i += stride) {
+    int32_t expert_id = topk_ids[i] + 1;
+    int32_t rank_post_pad = tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
+    sorted_token_ids[rank_post_pad] = i;
+    ++tokens_cnts[threadIdx.x * num_experts + expert_id];
+  }
+}
+
+void moe_align_block_size(
+    torch::Tensor topk_ids,
+    int64_t num_experts,
+    int64_t block_size,
+    torch::Tensor sorted_token_ids,
+    torch::Tensor experts_ids,
+    torch::Tensor num_tokens_post_pad,
+    torch::Tensor cumsum_buffer,
+    bool pad_sorted_token_ids) {
+  const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+
+  int threads = 1024;
+
+  threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
+
+  DISPATCH_INTEGRAL_TYPES(topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+    bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64);
+
+    if (small_batch_expert_mode) {
+      const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
+      const int32_t shared_mem_size = ((threads + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
+
+      auto small_batch_expert_kernel = moe_align_block_size_small_batch_expert_kernel<scalar_t>;
+     hipLaunchKernelGGL(( small_batch_expert_kernel), dim3(1), dim3(threads), shared_mem_size, stream, 
+          topk_ids.data_ptr<scalar_t>(),
+          sorted_token_ids.data_ptr<int32_t>(),
+          experts_ids.data_ptr<int32_t>(),
+          num_tokens_post_pad.data_ptr<int32_t>(),
+          num_experts,
+          block_size,
+          topk_ids.numel(),
+          pad_sorted_token_ids);
+    } else {
+      auto align_kernel = moe_align_block_size_kernel<scalar_t>;
+
+      const size_t scan_size = next_pow2(num_experts);
+      const size_t shared_mem_size = (num_experts + (num_experts + 1) + scan_size + WARP_SIZE) * sizeof(int32_t);
+     hipLaunchKernelGGL(( align_kernel), dim3(1), dim3(threads), shared_mem_size, stream, 
+          topk_ids.data_ptr<scalar_t>(),
+          sorted_token_ids.data_ptr<int32_t>(),
+          experts_ids.data_ptr<int32_t>(),
+          num_tokens_post_pad.data_ptr<int32_t>(),
+          num_experts,
+          block_size,
+          topk_ids.numel(),
+          cumsum_buffer.data_ptr<int32_t>(),
+          pad_sorted_token_ids,
+          scan_size);
+
+      const int block_threads = ::min(256, (int)threads);
+      const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads;
+      const int max_blocks = 65535;
+      const int actual_blocks = ::min(num_blocks, max_blocks);
+
+      auto sort_kernel = count_and_sort_expert_tokens_kernel<scalar_t>;
+     hipLaunchKernelGGL(( sort_kernel), dim3(actual_blocks), dim3(block_threads), 0, stream, 
+          topk_ids.data_ptr<scalar_t>(),
+          sorted_token_ids.data_ptr<int32_t>(),
+          cumsum_buffer.data_ptr<int32_t>(),
+          topk_ids.numel());
+    }
+  });
+}
diff --git a/sgl-kernel/csrc/moe/moe_fused_gate.cu b/sgl-kernel/csrc/moe/moe_fused_gate.cu
new file mode 100644
index 000000000..782a884fb
--- /dev/null
+++ b/sgl-kernel/csrc/moe/moe_fused_gate.cu
@@ -0,0 +1,521 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_types.h>
+#include <stdio.h>
+#include <torch/all.h>
+
+#include <cfloat>
+#include <type_traits>
+template <typename T, int N>
+using AlignedArray = cutlass::AlignedArray<T, N>;
+using bfloat16_t = cutlass::bfloat16_t;
+using float16_t = cutlass::half_t;
+using float32_t = float;
+
+// QQ NOTE: to handle the case for at::Half, error: more than one operator ">" matches these operands: built-in operator
+// "arithmetic > arithmetic" function "operator>(const __half &, const __half &)"
+template <typename T>
+__device__ inline bool cmp_gt(const T& a, const T& b) {
+  if constexpr (std::is_same<T, at::Half>::value) {
+    // at::Half (or float16_t in our native case) causes ambiguity, so we cast to float.
+    return static_cast<float>(a) > static_cast<float>(b);
+  } else {
+    // For types like float, at::BFloat16, or cutlass::half_t / cutlass::bfloat16_t, assume operator> works as expected.
+    return a > b;
+  }
+}
+
+template <typename T>
+__device__ inline bool cmp_eq(const T& a, const T& b) {
+  if constexpr (std::is_same<T, at::Half>::value) {
+    return static_cast<float>(a) == static_cast<float>(b);
+  } else {
+    return a == b;
+  }
+}
+
+// Fixed constants common to both dynamic and static template versions:
+static constexpr int WARP_SIZE = 32;
+static constexpr int WARPS_PER_CTA = 6;
+static constexpr int MAX_VPT = 32;  // maximum VPT we support, > params.VPT = num_expert / num_expert_group
+
+// Create an alias for Array using AlignedArray
+template <typename T, int N>
+using Array = AlignedArray<T, N>;
+// QQ: NOTE expression must have a constant value, this has to be > params.VPT
+template <typename T>
+using AccessType = AlignedArray<T, MAX_VPT>;
+
+template <typename T, typename Params>
+__device__ void moe_fused_gate_impl(
+    void* input,
+    void* bias,
+    float* output_ptr,
+    int32_t* indices_ptr,
+    int64_t num_rows,
+    int64_t topk_group,
+    int64_t topk,
+    int64_t num_fused_shared_experts,
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output,
+    Params params) {
+  int tidx = threadIdx.x;
+  int64_t thread_row =
+      blockIdx.x * params.ROWS_PER_CTA + threadIdx.y * params.ROWS_PER_WARP + tidx / params.THREADS_PER_ROW;
+  if (thread_row >= num_rows) {
+    return;
+  }
+
+  // Calculate topk_excluding_share_expert_fusion from topk
+  int64_t topk_excluding_share_expert_fusion = topk - num_fused_shared_experts;
+
+  // Cast pointers to type T:
+  auto* input_ptr = reinterpret_cast<T*>(input);
+  auto* bias_ptr = reinterpret_cast<T*>(bias);
+  auto* thread_row_ptr = input_ptr + thread_row * params.NUM_EXPERTS;
+
+  int thread_group_idx = tidx % params.THREADS_PER_ROW;
+  int first_elt_read_by_thread = thread_group_idx * params.VPT;
+
+  // Create local arrays for the row chunk and bias chunk and then reinterpret the address of row_chunk as a pointer to
+  // AccessType.
+  T* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
+  Array<T, MAX_VPT> row_chunk;
+  AccessType<T> const* vec_thread_read_ptr = reinterpret_cast<AccessType<T> const*>(thread_read_ptr);
+
+  T* bias_thread_read_ptr = bias_ptr + first_elt_read_by_thread;
+  Array<T, MAX_VPT> bias_chunk;
+  AccessType<T> const* vec_bias_thread_read_ptr = reinterpret_cast<AccessType<T> const*>(bias_thread_read_ptr);
+
+// QQ NOTE: doing the follow will be slower than loop assign and more importantly
+// have misaligned address issue when params.VPT < 8 and mismatch with MAX_VPT
+// AccessType<T>* row_chunk_vec_ptr = reinterpret_cast<AccessType<T>*>(&row_chunk);
+// row_chunk_vec_ptr[0] = vec_thread_read_ptr[0];
+#pragma unroll
+  for (int ii = 0; ii < params.VPT; ++ii) {
+    row_chunk[ii] = vec_thread_read_ptr[0][ii];
+    bias_chunk[ii] = vec_bias_thread_read_ptr[0][ii];
+  }
+
+  __syncthreads();
+
+////////////////////// Sigmoid //////////////////////
+#pragma unroll
+  for (int ii = 0; ii < params.VPT; ++ii) {
+    row_chunk[ii] = static_cast<T>(1.0f / (1.0f + expf(-float(row_chunk[ii]))));
+  }
+  __syncthreads();
+
+////////////////////// Add Bias //////////////////////
+#pragma unroll
+  for (int ii = 0; ii < params.VPT; ++ii) {
+    bias_chunk[ii] = row_chunk[ii] + bias_chunk[ii];
+  }
+
+////////////////////// Exclude Groups //////////////////////
+#pragma unroll
+  for (int k_idx = 0; k_idx < params.THREADS_PER_ROW - topk_group;
+       ++k_idx) {  // QQ NOTE Here params.THREADS_PER_ROW = num_expert_group
+    int expert = first_elt_read_by_thread;
+    // local argmax
+    T max_val = static_cast<T>(-FLT_MAX);
+    T max_val_second = static_cast<T>(-FLT_MAX);
+#pragma unroll
+    for (int ii = 0; ii < params.VPT; ++ii) {
+      T val = bias_chunk[ii];
+
+      if (cmp_gt(val, max_val)) {
+        max_val_second = max_val;
+        max_val = val;
+      } else if (cmp_gt(val, max_val_second)) {
+        max_val_second = val;
+      }
+    }
+
+    // QQ NOTE: currently fixed to pick top2 sigmoid weight value in each expert group and sum them as the group weight
+    // to select expert groups
+    T max_sum = max_val + max_val_second;
+
+// argmin reduce
+#pragma unroll
+    for (int mask = params.THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+      T other_max_sum =
+          static_cast<T>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(max_sum), mask, params.THREADS_PER_ROW));
+      int other_expert = __shfl_xor_sync(0xFFFFFFFF, expert, mask, params.THREADS_PER_ROW);
+
+      // higher indices win
+      if (cmp_gt(max_sum, other_max_sum) || (cmp_eq(other_max_sum, max_sum) && other_expert > expert)) {
+        max_sum = other_max_sum;
+        expert = other_expert;
+      }
+    }
+
+    // clear the max value in the thread
+    if (k_idx < params.THREADS_PER_ROW - topk_group) {
+      int const thread_to_clear_in_group = expert / params.VPT;
+
+      if (thread_group_idx == thread_to_clear_in_group) {
+#pragma unroll
+        for (int ii = 0; ii < params.VPT; ++ii) {
+          bias_chunk[ii] = static_cast<T>(FLT_MAX);
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  ////////////////////// Topk //////////////////////
+  float output_sum = 0.0f;
+  for (int k_idx = 0; k_idx < topk_excluding_share_expert_fusion; ++k_idx) {
+    // local argmax
+    T max_val = bias_chunk[0];
+    int expert = first_elt_read_by_thread;
+
+    if (!cmp_eq(max_val, static_cast<T>(FLT_MAX))) {
+#pragma unroll
+      for (int ii = 1; ii < params.VPT; ++ii) {
+        T val = bias_chunk[ii];
+        if (cmp_gt(val, max_val)) {
+          max_val = val;
+          expert = first_elt_read_by_thread + ii;
+        }
+      }
+    } else {
+      max_val = static_cast<T>(-FLT_MAX);
+    }
+
+    // argmax reduce
+#pragma unroll
+    for (int mask = params.THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+      T other_max =
+          static_cast<T>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(max_val), mask, params.THREADS_PER_ROW));
+      int other_expert = __shfl_xor_sync(0xFFFFFFFF, expert, mask, params.THREADS_PER_ROW);
+
+      // lower indices to win
+      if (cmp_gt(other_max, max_val) || (cmp_eq(other_max, max_val) && other_expert < expert)) {
+        max_val = other_max;
+        expert = other_expert;
+      }
+    }
+
+    int thread_to_clear_in_group = expert / params.VPT;
+    int64_t idx = topk * thread_row + k_idx;
+
+    if (thread_group_idx == thread_to_clear_in_group) {
+      int expert_to_clear_in_thread = expert % params.VPT;
+
+      // clear the max value in the thread
+      bias_chunk[expert_to_clear_in_thread] = static_cast<T>(-FLT_MAX);
+
+      // store output
+      output_ptr[idx] = static_cast<float>(row_chunk[expert_to_clear_in_thread]);
+      indices_ptr[idx] = static_cast<int32_t>(expert);
+    }
+
+    // accumulate sum for all elements
+    if (thread_group_idx == 0) {
+      output_sum += output_ptr[idx];
+    }
+
+    __syncthreads();
+  }
+
+  if (thread_group_idx == 0 && num_fused_shared_experts > 0) {
+    int64_t last_idx = topk * thread_row + topk_excluding_share_expert_fusion;
+    int64_t expert_offset = 0;
+    indices_ptr[last_idx] = static_cast<int32_t>(params.NUM_EXPERTS + expert_offset);
+
+    // Set the weight to the sum of all weights divided by routed_scaling_factor
+    output_ptr[last_idx] = output_sum / routed_scaling_factor;
+
+    if (num_fused_shared_experts > 1) {
+      for (int i = 1; i < num_fused_shared_experts; ++i) {
+        ++last_idx;
+        ++expert_offset;
+        indices_ptr[last_idx] = static_cast<int32_t>(params.NUM_EXPERTS + expert_offset);
+        // Set the weight to the sum of all weights divided by routed_scaling_factor
+        output_ptr[last_idx] = output_sum / routed_scaling_factor;
+      }
+    }
+  }
+  __syncthreads();
+
+  ////////////////////// Rescale Output //////////////////////
+  if (thread_group_idx == 0) {
+#pragma unroll
+    for (int ii = 0; ii < topk; ++ii) {
+      int64_t const idx = topk * thread_row + ii;
+      output_ptr[idx] = output_ptr[idx] / output_sum;
+      if (apply_routed_scaling_factor_on_output) {
+        output_ptr[idx] *= routed_scaling_factor;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Templated Kernel Version (using compile-time constants)
+//------------------------------------------------------------------------------
+template <int VPT_, int NUM_EXPERTS_, int THREADS_PER_ROW_, int ROWS_PER_WARP_, int ROWS_PER_CTA_, int WARPS_PER_CTA_>
+struct KernelParams {
+  static constexpr int VPT = VPT_;
+  static constexpr int NUM_EXPERTS = NUM_EXPERTS_;
+  static constexpr int THREADS_PER_ROW = THREADS_PER_ROW_;
+  static constexpr int ROWS_PER_WARP = ROWS_PER_WARP_;
+  static constexpr int ROWS_PER_CTA = ROWS_PER_CTA_;
+  static constexpr int WARPS_PER_CTA = WARPS_PER_CTA_;
+};
+
+template <
+    typename T,
+    int VPT,
+    int NUM_EXPERTS,
+    int THREADS_PER_ROW,
+    int ROWS_PER_WARP,
+    int ROWS_PER_CTA,
+    int WARPS_PER_CTA>
+__global__ void moe_fused_gate_kernel(
+    void* input,
+    void* bias,
+    float* output_ptr,
+    int32_t* indices_ptr,
+    int64_t num_rows,
+    int64_t topk_group,
+    int64_t topk,
+    int64_t num_fused_shared_experts,
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
+  KernelParams<VPT, NUM_EXPERTS, THREADS_PER_ROW, ROWS_PER_WARP, ROWS_PER_CTA, WARPS_PER_CTA> params;
+  moe_fused_gate_impl<T>(
+      input,
+      bias,
+      output_ptr,
+      indices_ptr,
+      num_rows,
+      topk_group,
+      topk,
+      num_fused_shared_experts,
+      routed_scaling_factor,
+      apply_routed_scaling_factor_on_output,
+      params);
+}
+
+// Macro to compute compile-time constants and launch the kernel.
+#define LAUNCH_MOE_GATE_CONFIG(T, EXPERTS, EXPERT_GROUP)                                                 \
+  do {                                                                                                   \
+    constexpr int VPT = (EXPERTS) / (EXPERT_GROUP);                                                      \
+    /* If EXPERT_GROUP > WARP_SIZE, fall back to 1 row per warp */                                       \
+    constexpr int ROWS_PER_WARP = ((EXPERT_GROUP) <= WARP_SIZE) ? (WARP_SIZE / (EXPERT_GROUP)) : 1;      \
+    constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;                                          \
+    moe_fused_gate_kernel<T, VPT, (EXPERTS), (EXPERT_GROUP), ROWS_PER_WARP, ROWS_PER_CTA, WARPS_PER_CTA> \
+        <<<num_blocks, block_dim, 0, stream>>>(                                                          \
+            input.data_ptr(),                                                                            \
+            bias.data_ptr(),                                                                             \
+            output.data_ptr<float>(),                                                                    \
+            indices.data_ptr<int32_t>(),                                                                 \
+            num_rows,                                                                                    \
+            topk_group,                                                                                  \
+            topk,                                                                                        \
+            num_fused_shared_experts,                                                                    \
+            routed_scaling_factor,                                                                       \
+            apply_routed_scaling_factor_on_output);                                                      \
+    dispatched = true;                                                                                   \
+  } while (0)
+
+//------------------------------------------------------------------------------
+// Dynamic Kernel Version (parameters computed at runtime)
+//------------------------------------------------------------------------------
+struct KernelParamsDynamic {
+  int VPT;
+  int NUM_EXPERTS;
+  int THREADS_PER_ROW;
+  int ROWS_PER_WARP;
+  int ROWS_PER_CTA;
+  int WARPS_PER_CTA;
+};
+
+template <typename T>
+__global__ void moe_fused_gate_kernel_dynamic(
+    void* input,
+    void* bias,
+    float* output_ptr,
+    int32_t* indices_ptr,
+    int64_t num_rows,
+    int64_t num_experts,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t topk,
+    int64_t num_fused_shared_experts,
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
+  KernelParamsDynamic params;
+  params.NUM_EXPERTS = num_experts;             // e.g, for deepseek v3, this is 256
+  params.VPT = num_experts / num_expert_group;  // e.g., for deepseek v3, this is 256 / 8 = 32
+  params.THREADS_PER_ROW = num_expert_group;    // fixed as num_expert_group, e.g., for deepseek v3, this is 8
+  params.WARPS_PER_CTA = WARPS_PER_CTA;         // fixed as 6
+  params.ROWS_PER_WARP = std::max<int64_t>(1, WARP_SIZE / num_expert_group);  // WARP_SIZE is fixed as 32
+  params.ROWS_PER_CTA = params.WARPS_PER_CTA * params.ROWS_PER_WARP;
+
+  moe_fused_gate_impl<T>(
+      input,
+      bias,
+      output_ptr,
+      indices_ptr,
+      num_rows,
+      topk_group,
+      topk,
+      num_fused_shared_experts,
+      routed_scaling_factor,
+      apply_routed_scaling_factor_on_output,
+      params);
+}
+
+//------------------------------------------------------------------------------
+// Host Launcher Function
+//------------------------------------------------------------------------------
+std::vector<at::Tensor> moe_fused_gate(
+    at::Tensor& input,
+    at::Tensor& bias,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t topk,
+    int64_t num_fused_shared_experts,
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output) {
+  int64_t num_rows = input.size(0);
+  int32_t num_experts = input.size(1);
+  auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
+  auto output = torch::empty({num_rows, topk}, options);
+  auto indices = torch::empty({num_rows, topk}, options.dtype(torch::kInt32));
+
+  // Compute grid dimensions based on runtime value for num_expert_group.
+  int64_t rows_per_warp = std::max<int64_t>(1, WARP_SIZE / num_expert_group);
+  int64_t num_warps = (num_rows + rows_per_warp - 1) / rows_per_warp;
+  int64_t num_blocks = (num_warps + WARPS_PER_CTA - 1) / WARPS_PER_CTA;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  dim3 block_dim(WARP_SIZE, WARPS_PER_CTA);
+
+  // Check 1: Ensure that num_experts is a power of 2.
+  TORCH_CHECK((num_experts & (num_experts - 1)) == 0, "num_experts must be a power of 2, but got ", num_experts);
+
+  // Check 2: Ensure that num_experts is divisible by num_expert_group. (this also means num_expert_group is power of 2)
+  TORCH_CHECK(
+      num_experts % num_expert_group == 0,
+      "num_experts must be divisible by num_expert_group, but got ",
+      num_experts,
+      " / ",
+      num_expert_group);
+
+  int computed_vpt = num_experts / num_expert_group;
+  // Check 3: Ensure that num_experts/num_expert_group does not exceed MAX_VPT=32. Maximum VPT indicate max value per
+  // threads we can process.
+  TORCH_CHECK(
+      computed_vpt <= MAX_VPT,
+      "Per group experts: num_experts / num_expert_group = (",
+      computed_vpt,
+      ") exceeds the maximum supported (",
+      MAX_VPT,
+      ")");
+
+  // Dispatch to templated kernel for known compile-time configurations.
+  // We currently only support for:
+  //   Case 1: 256 experts, with 8 or 16 groups.
+  //   Case 2: 128 experts, with 4 or 8 groups.
+  //   Case 3: other cases, require 8 <= num_experts / num_expert_group <= 32
+  bool dispatched = false;
+  switch (num_experts) {
+    case 256:
+      if (num_expert_group == 8)
+        // This is deepseek v3 case. Here VPT = 256/8 = 32, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4 = 24.
+        if (input.scalar_type() == at::kBFloat16) {
+          LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 8);
+        } else if (input.scalar_type() == at::kHalf) {
+          LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 8);
+        } else if (input.scalar_type() == at::kFloat) {
+          LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 8);
+        } else if (num_expert_group == 16)
+          // Here VPT = 256/16 = 16, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
+          if (input.scalar_type() == at::kBFloat16) {
+            LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 16);
+          } else if (input.scalar_type() == at::kHalf) {
+            LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 16);
+          } else if (input.scalar_type() == at::kFloat) {
+            LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 16);
+          }
+      break;
+    case 128:
+      if (num_expert_group == 4)
+        // VPT = 128/4 = 32, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
+        if (input.scalar_type() == at::kBFloat16) {
+          LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 4);
+        } else if (input.scalar_type() == at::kHalf) {
+          LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 4);
+        } else if (input.scalar_type() == at::kFloat) {
+          LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 4);
+        } else if (num_expert_group == 8)
+          // VPT = 128/8 = 16, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4 = 24.
+          if (input.scalar_type() == at::kBFloat16) {
+            LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 8);
+          } else if (input.scalar_type() == at::kHalf) {
+            LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 8);
+          } else if (input.scalar_type() == at::kFloat) {
+            LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 8);
+          }
+      break;
+    default:
+      break;
+  }
+  if (!dispatched) {
+    // Fallback to the dynamic kernel if none of the supported combinations match.
+    // currently only support num_experts / num_expert_group <= 32 for dynamic kernels
+    if (input.scalar_type() == at::kBFloat16) {
+      moe_fused_gate_kernel_dynamic<bfloat16_t><<<num_blocks, block_dim, 0, stream>>>(
+          input.data_ptr(),
+          bias.data_ptr(),
+          output.data_ptr<float>(),
+          indices.data_ptr<int32_t>(),
+          num_rows,
+          num_experts,
+          num_expert_group,
+          topk_group,
+          topk,
+          num_fused_shared_experts,
+          routed_scaling_factor,
+          apply_routed_scaling_factor_on_output);
+    } else if (input.scalar_type() == at::kHalf) {
+      moe_fused_gate_kernel_dynamic<float16_t><<<num_blocks, block_dim, 0, stream>>>(
+          input.data_ptr(),
+          bias.data_ptr(),
+          output.data_ptr<float>(),
+          indices.data_ptr<int32_t>(),
+          num_rows,
+          num_experts,
+          num_expert_group,
+          topk_group,
+          topk,
+          num_fused_shared_experts,
+          routed_scaling_factor,
+          apply_routed_scaling_factor_on_output);
+    } else if (input.scalar_type() == at::kFloat) {
+      moe_fused_gate_kernel_dynamic<float32_t><<<num_blocks, block_dim, 0, stream>>>(
+          input.data_ptr(),
+          bias.data_ptr(),
+          output.data_ptr<float>(),
+          indices.data_ptr<int32_t>(),
+          num_rows,
+          num_experts,
+          num_expert_group,
+          topk_group,
+          topk,
+          num_fused_shared_experts,
+          routed_scaling_factor,
+          apply_routed_scaling_factor_on_output);
+    } else {
+      TORCH_CHECK(false, "Unsupported data type for moe_fused_gate");
+    }
+  }
+  return {output, indices};
+}
diff --git a/sgl-kernel/csrc/moe/moe_topk_softmax_kernels.cu b/sgl-kernel/csrc/moe/moe_topk_softmax_kernels.cu
new file mode 100644
index 000000000..6ca4a7846
--- /dev/null
+++ b/sgl-kernel/csrc/moe/moe_topk_softmax_kernels.cu
@@ -0,0 +1,591 @@
+// Adapt from https://github.com/vllm-project/vllm/blob/v0.7.3/csrc/moe/topk_softmax_kernels.cu
+// which is originally adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.7.1/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#ifndef USE_ROCM
+#include <cub/cub.cuh>
+#include <cub/util_type.cuh>
+#include <cuda/functional>
+#else
+#include <hipcub/hipcub.hpp>
+#include <hipcub/util_type.hpp>
+#endif
+
+#include "utils.h"
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+// Define reduction operators based on CUDA version
+// CUDA 13 (12.9+) deprecated cub::Max/Min in favor of cuda::maximum/minimum
+#if CUDA_VERSION >= 12090
+using MaxReduceOp = cuda::maximum<>;
+using MinReduceOp = cuda::minimum<>;
+#else
+using MaxReduceOp = cub::Max;
+using MinReduceOp = cub::Min;
+#endif
+
+/// Aligned array type
+template <
+    typename T,
+    /// Number of elements in the array
+    int N,
+    /// Alignment requirement in bytes
+    int Alignment = sizeof(T) * N>
+class alignas(Alignment) AlignedArray {
+  T data[N];
+};
+
+// ========================== Util functions to convert types ==========================
+template <typename T>
+__device__ float convert_to_float(T x) {
+  if constexpr (std::is_same_v<T, __half>) {
+    return __half2float(x);
+  } else if constexpr (std::is_same_v<T, __hip_bfloat16>) {
+    return __bfloat162float(x);
+  } else if constexpr (std::is_same_v<T, float>) {
+    return x;
+  } else {
+    return static_cast<float>(x);
+  }
+}
+
+// ====================== Softmax things ===============================
+// We have our own implementation of softmax here so we can support transposing the output
+// in the softmax kernel when we extend this module to support expert-choice routing.
+template <typename T, int TPB>
+__launch_bounds__(TPB) __global__
+    void moeSoftmax(const T* input, const bool* finished, float* output, const int num_cols) {
+  using BlockReduce = cub::BlockReduce<float, TPB>;
+  __shared__ typename BlockReduce::TempStorage tmpStorage;
+
+  __shared__ float normalizing_factor;
+  __shared__ float float_max;
+
+  const int thread_row_offset = blockIdx.x * num_cols;
+
+  float threadData(-FLT_MAX);
+
+  // Don't touch finished rows.
+  if ((finished != nullptr) && finished[blockIdx.x]) {
+    return;
+  }
+
+  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
+    const int idx = thread_row_offset + ii;
+    threadData = max(convert_to_float<T>(input[idx]), threadData);
+  }
+
+  const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, MaxReduceOp());
+
+  if (threadIdx.x == 0) {
+    float_max = maxElem;
+  }
+  __syncthreads();
+
+  threadData = 0;
+
+  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
+    const int idx = thread_row_offset + ii;
+    threadData += exp((convert_to_float<T>(input[idx]) - float_max));
+  }
+
+  const auto Z = BlockReduce(tmpStorage).Sum(threadData);
+
+  if (threadIdx.x == 0) {
+    normalizing_factor = 1.f / Z;
+  }
+  __syncthreads();
+
+  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
+    const int idx = thread_row_offset + ii;
+    const float val = exp((convert_to_float<T>(input[idx]) - float_max)) * normalizing_factor;
+    output[idx] = val;
+  }
+}
+
+template <int TPB>
+__launch_bounds__(TPB) __global__ void moeTopK(
+    const float* inputs_after_softmax,
+    const bool* finished,
+    float* output,
+    int* indices,
+    const int num_experts,
+    const int k,
+    const int start_expert,
+    const int end_expert,
+    const bool renormalize) {
+  using cub_kvp = cub::KeyValuePair<int, float>;
+  using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
+  __shared__ typename BlockReduce::TempStorage tmpStorage;
+
+  cub_kvp thread_kvp;
+  cub::ArgMax arg_max;
+
+  const int block_row = blockIdx.x;
+
+  const bool row_is_active = finished ? !finished[block_row] : true;
+  const int thread_read_offset = blockIdx.x * num_experts;
+  float row_sum_for_renormalize = 0;
+  for (int k_idx = 0; k_idx < k; ++k_idx) {
+    thread_kvp.key = 0;
+    thread_kvp.value = -1.f;  // This is OK because inputs are probabilities
+
+    cub_kvp inp_kvp;
+    for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
+      const int idx = thread_read_offset + expert;
+      inp_kvp.key = expert;
+      inp_kvp.value = inputs_after_softmax[idx];
+
+      for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
+        const int prior_winning_expert = indices[k * block_row + prior_k];
+
+        if (prior_winning_expert == expert) {
+          inp_kvp = thread_kvp;
+        }
+      }
+
+      thread_kvp = arg_max(inp_kvp, thread_kvp);
+    }
+
+    const cub_kvp result_kvp = BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
+    if (threadIdx.x == 0) {
+      // Ignore experts the node isn't responsible for with expert parallelism
+      const int expert = result_kvp.key;
+      const bool node_uses_expert = expert >= start_expert && expert < end_expert;
+      const bool should_process_row = row_is_active && node_uses_expert;
+
+      const int idx = k * block_row + k_idx;
+      output[idx] = result_kvp.value;
+      indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
+      assert(indices[idx] >= 0);
+      row_sum_for_renormalize += result_kvp.value;
+    }
+    __syncthreads();
+  }
+
+  if (renormalize && threadIdx.x == 0) {
+    float row_sum_for_renormalize_inv = 1.f / row_sum_for_renormalize;
+    for (int k_idx = 0; k_idx < k; ++k_idx) {
+      const int idx = k * block_row + k_idx;
+      output[idx] = output[idx] * row_sum_for_renormalize_inv;
+    }
+  }
+}
+
+// ====================== TopK softmax things ===============================
+
+/*
+  A Top-K gating softmax written to exploit when the number of experts in the MoE layers
+  are a small power of 2. This allows us to cleanly share the rows among the threads in
+  a single warp and eliminate communication between warps (so no need to use shared mem).
+
+  It fuses the softmax, max and argmax into a single kernel.
+
+  Limitations:
+  1) This implementation is intended for when the number of experts is a small power of 2.
+  2) This implementation assumes k is small, but will work for any k.
+*/
+
+template <typename T, int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
+__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ void topkGatingSoftmax(
+    const T* input,
+    const bool* finished,
+    float* output,
+    const int num_rows,
+    int* indices,
+    const int k,
+    const int start_expert,
+    const int end_expert,
+    const bool renormalize) {
+  // We begin by enforcing compile time assertions and setting up compile time constants.
+  static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
+  static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
+  static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
+  static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
+
+  // Number of bytes each thread pulls in per load
+  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
+  static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
+  static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
+  static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
+
+  // Restrictions based on previous section.
+  static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
+  static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
+  static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
+  static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");
+
+  // We have NUM_EXPERTS elements per row. We specialize for small #experts
+  static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
+  static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
+  static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
+
+  // Restrictions for previous section.
+  static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0, "The elts per row must cleanly divide the total elt per warp");
+
+  // ===================== From this point, we finally start computing run-time variables. ========================
+
+  // Compute CTA and warp rows. We pack multiple rows into a single warp, and a block contains WARPS_PER_CTA warps.
+  // This, each block processes a chunk of rows. We start by computing the start row for each block.
+  const int cta_base_row = blockIdx.x * ROWS_PER_CTA;
+
+  // Now, using the base row per thread block, we compute the base row per warp.
+  const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP;
+
+  // The threads in a warp are split into sub-groups that will work on a row.
+  // We compute row offset for each thread sub-group
+  const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
+  const int thread_row = warp_base_row + thread_row_in_warp;
+
+  // Threads with indices out of bounds should early exit here.
+  if (thread_row >= num_rows) {
+    return;
+  }
+  const bool row_is_active = finished ? !finished[thread_row] : true;
+
+  // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
+  // row it will read.
+  const T* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
+
+  // Now, we compute the group each thread belong to in order to determine the first column to start loads.
+  const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
+  const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
+  const T* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
+
+  // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
+  // this can support all powers of 2 up to 16.
+  // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
+  // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
+  using AccessType = AlignedArray<T, ELTS_PER_LDG>;
+
+  // Finally, we pull in the data from global mem
+  T row_chunk_temp[VPT];
+  AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk_temp);
+  const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
+#pragma unroll
+  for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+    row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+  }
+
+  float row_chunk[VPT];
+#pragma unroll
+  for (int ii = 0; ii < VPT; ++ii) {
+    row_chunk[ii] = convert_to_float<T>(row_chunk_temp[ii]);
+  }
+
+  // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
+  // convert to float afterwards for the exp + sum reduction.
+  float thread_max = row_chunk[0];
+#pragma unroll
+  for (int ii = 1; ii < VPT; ++ii) {
+    thread_max = max(thread_max, row_chunk[ii]);
+  }
+
+// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
+#pragma unroll
+  for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+    thread_max = max(thread_max, SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, thread_max, mask, THREADS_PER_ROW));
+  }
+
+  // From this point, thread max in all the threads have the max within the row.
+  // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
+  float row_sum = 0;
+#pragma unroll
+  for (int ii = 0; ii < VPT; ++ii) {
+    row_chunk[ii] = expf(row_chunk[ii] - thread_max);
+    row_sum += row_chunk[ii];
+  }
+
+// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern.
+#pragma unroll
+  for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+    row_sum += SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, row_sum, mask, THREADS_PER_ROW);
+  }
+
+  // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
+  // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
+  // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
+  // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
+  // argmax after computing the softmax.
+  const float reciprocal_row_sum = 1.f / row_sum;
+
+#pragma unroll
+  for (int ii = 0; ii < VPT; ++ii) {
+    row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
+  }
+
+  // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
+  // with the max index.
+  int start_col = first_elt_read_by_thread;
+  static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
+
+  float row_sum_for_renormalize = 0;
+
+  for (int k_idx = 0; k_idx < k; ++k_idx) {
+    // First, each thread does the local argmax
+    float max_val = row_chunk[0];
+    int expert = start_col;
+#pragma unroll
+    for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD; ++ldg, col += COLS_PER_GROUP_LDG) {
+#pragma unroll
+      for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
+        float val = row_chunk[ldg * ELTS_PER_LDG + ii];
+
+        // No check on the experts here since columns with the smallest index are processed first and only
+        // updated if > (not >=)
+        if (val > max_val) {
+          max_val = val;
+          expert = col + ii;
+        }
+      }
+    }
+
+// Now, we perform the argmax reduce. We use the butterfly pattern so threads reach consensus about the max.
+// This will be useful for K > 1 so that the threads can agree on "who" had the max value. That thread can
+// then blank out their max with -inf and the warp can run more iterations...
+#pragma unroll
+    for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+      float other_max = SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, max_val, mask, THREADS_PER_ROW);
+      int other_expert = SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, expert, mask, THREADS_PER_ROW);
+
+      // We want lower indices to "win" in every thread so we break ties this way
+      if (other_max > max_val || (other_max == max_val && other_expert < expert)) {
+        max_val = other_max;
+        expert = other_expert;
+      }
+    }
+
+    // Write the max for this k iteration to global memory.
+    if (thread_group_idx == 0) {
+      // Add a guard to ignore experts not included by this node
+      const bool node_uses_expert = expert >= start_expert && expert < end_expert;
+      const bool should_process_row = row_is_active && node_uses_expert;
+
+      // The lead thread from each sub-group will write out the final results to global memory. (This will be a
+      // single) thread per row of the input/output matrices.
+      const int idx = k * thread_row + k_idx;
+      output[idx] = max_val;
+      indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
+      row_sum_for_renormalize += max_val;
+    }
+
+    // Finally, we clear the value in the thread with the current max if there is another iteration to run.
+    if (k_idx + 1 < k) {
+      const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG;
+      const int thread_to_clear_in_group = (expert / ELTS_PER_LDG) % THREADS_PER_ROW;
+
+      // Only the thread in the group which produced the max will reset the "winning" value to -inf.
+      if (thread_group_idx == thread_to_clear_in_group) {
+        const int offset_for_expert = expert % ELTS_PER_LDG;
+        // Safe to set to any negative value since row_chunk values must be between 0 and 1.
+        row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
+      }
+    }
+  }
+
+  // Fuse renormalization of topk_weights into this kernel
+  if (renormalize && thread_group_idx == 0) {
+    float row_sum_for_renormalize_inv = 1.f / row_sum_for_renormalize;
+#pragma unroll
+    for (int k_idx = 0; k_idx < k; ++k_idx) {
+      const int idx = k * thread_row + k_idx;
+      output[idx] = output[idx] * row_sum_for_renormalize_inv;
+    }
+  }
+}
+
+namespace detail {
+// Constructs some constants needed to partition the work across threads at compile time.
+template <typename T, int EXPERTS, int BYTES_PER_LDG>
+struct TopkConstants {
+  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
+  static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
+  static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
+  static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
+  static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
+  static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
+};
+}  // namespace detail
+
+template <typename T, int EXPERTS, int WARPS_PER_TB>
+void topkGatingSoftmaxLauncherHelper(
+    const T* input,
+    const bool* finished,
+    float* output,
+    int* indices,
+    const int num_rows,
+    const int k,
+    const int start_expert,
+    const int end_expert,
+    const bool renormalize,
+    cudaStream_t stream) {
+  static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
+
+  static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(T) * EXPERTS);
+  using Constants = detail::TopkConstants<T, EXPERTS, BYTES_PER_LDG>;
+  static constexpr int VPT = Constants::VPT;
+  static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
+  const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+  const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
+
+  dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
+  topkGatingSoftmax<T, VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG><<<num_blocks, block_dim, 0, stream>>>(
+      input, finished, output, num_rows, indices, k, start_expert, end_expert, renormalize);
+}
+
+#define LAUNCH_SOFTMAX(TYPE, NUM_EXPERTS, WARPS_PER_TB)             \
+  topkGatingSoftmaxLauncherHelper<TYPE, NUM_EXPERTS, WARPS_PER_TB>( \
+      gating_output, nullptr, topk_weights, topk_indices, num_tokens, topk, 0, num_experts, renormalize, stream);
+
+template <typename T>
+void topkGatingSoftmaxKernelLauncher(
+    const T* gating_output,
+    float* topk_weights,
+    int* topk_indices,
+    float* softmax_workspace,
+    const int num_tokens,
+    const int num_experts,
+    const int topk,
+    const bool renormalize,
+    cudaStream_t stream) {
+  static constexpr int WARPS_PER_TB = 4;
+  switch (num_experts) {
+    case 1:
+      LAUNCH_SOFTMAX(T, 1, WARPS_PER_TB);
+      break;
+    case 2:
+      LAUNCH_SOFTMAX(T, 2, WARPS_PER_TB);
+      break;
+    case 4:
+      LAUNCH_SOFTMAX(T, 4, WARPS_PER_TB);
+      break;
+    case 8:
+      LAUNCH_SOFTMAX(T, 8, WARPS_PER_TB);
+      break;
+    case 16:
+      LAUNCH_SOFTMAX(T, 16, WARPS_PER_TB);
+      break;
+    case 32:
+      LAUNCH_SOFTMAX(T, 32, WARPS_PER_TB);
+      break;
+    case 64:
+      LAUNCH_SOFTMAX(T, 64, WARPS_PER_TB);
+      break;
+    case 128:
+      LAUNCH_SOFTMAX(T, 128, WARPS_PER_TB);
+      break;
+    case 256:
+      LAUNCH_SOFTMAX(T, 256, WARPS_PER_TB);
+      break;
+    default: {
+      TORCH_CHECK(
+          softmax_workspace != nullptr,
+          "softmax_workspace must be provided for num_experts that are not a power of 2.");
+      static constexpr int TPB = 256;
+      moeSoftmax<T, TPB><<<num_tokens, TPB, 0, stream>>>(gating_output, nullptr, softmax_workspace, num_experts);
+      moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
+          softmax_workspace, nullptr, topk_weights, topk_indices, num_experts, topk, 0, num_experts, renormalize);
+    }
+  }
+}
+
+void topk_softmax(
+    torch::Tensor& topk_weights,  // [num_tokens, topk]
+    torch::Tensor& topk_indices,  // [num_tokens, topk]
+    torch::Tensor& gating_output,
+    const bool renormalize)  // [num_tokens, num_experts]
+{
+  // Check data type
+  TORCH_CHECK(
+      gating_output.scalar_type() == at::ScalarType::Float || gating_output.scalar_type() == at::ScalarType::Half ||
+          gating_output.scalar_type() == at::ScalarType::BFloat16,
+      "gating_output must be float32, float16, or bfloat16");
+
+  // Check dimensions
+  TORCH_CHECK(gating_output.dim() == 2, "gating_output must be 2D tensor [num_tokens, num_experts]");
+  TORCH_CHECK(topk_weights.dim() == 2, "topk_weights must be 2D tensor [num_tokens, topk]");
+  TORCH_CHECK(topk_indices.dim() == 2, "topk_indices must be 2D tensor [num_tokens, topk]");
+
+  // Check shapes
+  TORCH_CHECK(
+      gating_output.size(0) == topk_weights.size(0),
+      "First dimension of topk_weights must match num_tokens in gating_output");
+  TORCH_CHECK(
+      gating_output.size(0) == topk_indices.size(0),
+      "First dimension of topk_indices must match num_tokens in gating_output");
+  TORCH_CHECK(
+      topk_weights.size(-1) == topk_indices.size(-1),
+      "Second dimension of topk_indices must match topk in topk_weights");
+  TORCH_CHECK(topk_weights.size(-1) <= gating_output.size(-1), "topk must be less than or equal to num_experts");
+
+  const int num_experts = static_cast<int>(gating_output.size(-1));
+  const int num_tokens = static_cast<int>(gating_output.size(0));
+  const int topk = static_cast<int>(topk_weights.size(-1));
+
+  const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
+  const bool needs_workspace = !is_pow_2 || num_experts > 256;
+  const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  torch::Tensor softmax_workspace =
+      torch::empty({workspace_size}, gating_output.options().dtype(at::ScalarType::Float));
+
+  const at::ScalarType dtype = gating_output.scalar_type();
+  if (dtype == at::ScalarType::Float) {
+    topkGatingSoftmaxKernelLauncher<float>(
+        gating_output.data_ptr<float>(),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens,
+        num_experts,
+        topk,
+        renormalize,
+        stream);
+  } else if (dtype == at::ScalarType::Half) {
+    topkGatingSoftmaxKernelLauncher<__half>(
+        reinterpret_cast<const __half*>(gating_output.data_ptr<at::Half>()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens,
+        num_experts,
+        topk,
+        renormalize,
+        stream);
+  } else if (dtype == at::ScalarType::BFloat16) {
+    topkGatingSoftmaxKernelLauncher<__nv_bfloat16>(
+        reinterpret_cast<const __nv_bfloat16*>(gating_output.data_ptr<at::BFloat16>()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens,
+        num_experts,
+        topk,
+        renormalize,
+        stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported gating_output dtype: ", dtype);
+  }
+}
diff --git a/sgl-kernel/csrc/moe/moe_topk_softmax_kernels.hip b/sgl-kernel/csrc/moe/moe_topk_softmax_kernels.hip
new file mode 100644
index 000000000..96329f22c
--- /dev/null
+++ b/sgl-kernel/csrc/moe/moe_topk_softmax_kernels.hip
@@ -0,0 +1,594 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+// Adapt from https://github.com/vllm-project/vllm/blob/v0.7.3/csrc/moe/topk_softmax_kernels.cu
+// which is originally adapted from
+// https://github.com/NVIDIA/TensorRT-LLM/blob/v0.7.1/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <ATen/hip/HIPContext.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+#include <torch/all.h>
+
+#ifndef USE_ROCM
+#include <hipcub/hipcub.hpp>
+#include <hipcub/hipcub.hpp>
+#include <cuda/functional>
+#else
+#include <hipcub/hipcub.hpp>
+#include <hipcub/util_type.hpp>
+#endif
+
+#include "utils_hip.h"
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+// Define reduction operators based on CUDA version
+// CUDA 13 (12.9+) deprecated hipcub::Max/Min in favor of cuda::maximum/minimum
+#if DTK_VERSION >= 12090
+using MaxReduceOp = cuda::maximum<>;
+using MinReduceOp = cuda::minimum<>;
+#else
+using MaxReduceOp = hipcub::Max;
+using MinReduceOp = hipcub::Min;
+#endif
+
+/// Aligned array type
+template <
+    typename T,
+    /// Number of elements in the array
+    int N,
+    /// Alignment requirement in bytes
+    int Alignment = sizeof(T) * N>
+class alignas(Alignment) AlignedArray {
+  T data[N];
+};
+
+// ========================== Util functions to convert types ==========================
+template <typename T>
+__device__ float convert_to_float(T x) {
+  if constexpr (std::is_same_v<T, __half>) {
+    return __half2float(x);
+  } else if constexpr (std::is_same_v<T, __hip_bfloat16>) {
+    return __bfloat162float(x);
+  } else if constexpr (std::is_same_v<T, float>) {
+    return x;
+  } else {
+    return static_cast<float>(x);
+  }
+}
+
+// ====================== Softmax things ===============================
+// We have our own implementation of softmax here so we can support transposing the output
+// in the softmax kernel when we extend this module to support expert-choice routing.
+template <typename T, int TPB>
+__launch_bounds__(TPB) __global__
+    void moeSoftmax(const T* input, const bool* finished, float* output, const int num_cols) {
+  using BlockReduce = hipcub::BlockReduce<float, TPB>;
+  __shared__ typename BlockReduce::TempStorage tmpStorage;
+
+  __shared__ float normalizing_factor;
+  __shared__ float float_max;
+
+  const int thread_row_offset = blockIdx.x * num_cols;
+
+  float threadData(-FLT_MAX);
+
+  // Don't touch finished rows.
+  if ((finished != nullptr) && finished[blockIdx.x]) {
+    return;
+  }
+
+  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
+    const int idx = thread_row_offset + ii;
+    threadData = max(convert_to_float<T>(input[idx]), threadData);
+  }
+
+  const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, MaxReduceOp());
+
+  if (threadIdx.x == 0) {
+    float_max = maxElem;
+  }
+  __syncthreads();
+
+  threadData = 0;
+
+  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
+    const int idx = thread_row_offset + ii;
+    threadData += exp((convert_to_float<T>(input[idx]) - float_max));
+  }
+
+  const auto Z = BlockReduce(tmpStorage).Sum(threadData);
+
+  if (threadIdx.x == 0) {
+    normalizing_factor = 1.f / Z;
+  }
+  __syncthreads();
+
+  for (int ii = threadIdx.x; ii < num_cols; ii += TPB) {
+    const int idx = thread_row_offset + ii;
+    const float val = exp((convert_to_float<T>(input[idx]) - float_max)) * normalizing_factor;
+    output[idx] = val;
+  }
+}
+
+template <int TPB>
+__launch_bounds__(TPB) __global__ void moeTopK(
+    const float* inputs_after_softmax,
+    const bool* finished,
+    float* output,
+    int* indices,
+    const int num_experts,
+    const int k,
+    const int start_expert,
+    const int end_expert,
+    const bool renormalize) {
+  using cub_kvp = hipcub::KeyValuePair<int, float>;
+  using BlockReduce = hipcub::BlockReduce<cub_kvp, TPB>;
+  __shared__ typename BlockReduce::TempStorage tmpStorage;
+
+  cub_kvp thread_kvp;
+  hipcub::ArgMax arg_max;
+
+  const int block_row = blockIdx.x;
+
+  const bool row_is_active = finished ? !finished[block_row] : true;
+  const int thread_read_offset = blockIdx.x * num_experts;
+  float row_sum_for_renormalize = 0;
+  for (int k_idx = 0; k_idx < k; ++k_idx) {
+    thread_kvp.key = 0;
+    thread_kvp.value = -1.f;  // This is OK because inputs are probabilities
+
+    cub_kvp inp_kvp;
+    for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
+      const int idx = thread_read_offset + expert;
+      inp_kvp.key = expert;
+      inp_kvp.value = inputs_after_softmax[idx];
+
+      for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
+        const int prior_winning_expert = indices[k * block_row + prior_k];
+
+        if (prior_winning_expert == expert) {
+          inp_kvp = thread_kvp;
+        }
+      }
+
+      thread_kvp = arg_max(inp_kvp, thread_kvp);
+    }
+
+    const cub_kvp result_kvp = BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
+    if (threadIdx.x == 0) {
+      // Ignore experts the node isn't responsible for with expert parallelism
+      const int expert = result_kvp.key;
+      const bool node_uses_expert = expert >= start_expert && expert < end_expert;
+      const bool should_process_row = row_is_active && node_uses_expert;
+
+      const int idx = k * block_row + k_idx;
+      output[idx] = result_kvp.value;
+      indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
+      assert(indices[idx] >= 0);
+      row_sum_for_renormalize += result_kvp.value;
+    }
+    __syncthreads();
+  }
+
+  if (renormalize && threadIdx.x == 0) {
+    float row_sum_for_renormalize_inv = 1.f / row_sum_for_renormalize;
+    for (int k_idx = 0; k_idx < k; ++k_idx) {
+      const int idx = k * block_row + k_idx;
+      output[idx] = output[idx] * row_sum_for_renormalize_inv;
+    }
+  }
+}
+
+// ====================== TopK softmax things ===============================
+
+/*
+  A Top-K gating softmax written to exploit when the number of experts in the MoE layers
+  are a small power of 2. This allows us to cleanly share the rows among the threads in
+  a single warp and eliminate communication between warps (so no need to use shared mem).
+
+  It fuses the softmax, max and argmax into a single kernel.
+
+  Limitations:
+  1) This implementation is intended for when the number of experts is a small power of 2.
+  2) This implementation assumes k is small, but will work for any k.
+*/
+
+template <typename T, int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
+__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ void topkGatingSoftmax(
+    const T* input,
+    const bool* finished,
+    float* output,
+    const int num_rows,
+    int* indices,
+    const int k,
+    const int start_expert,
+    const int end_expert,
+    const bool renormalize) {
+  // We begin by enforcing compile time assertions and setting up compile time constants.
+  static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
+  static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
+  static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
+  static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
+
+  // Number of bytes each thread pulls in per load
+  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
+  static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
+  static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
+  static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
+
+  // Restrictions based on previous section.
+  static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
+  static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
+  static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
+  static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");
+
+  // We have NUM_EXPERTS elements per row. We specialize for small #experts
+  static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
+  static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
+  static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
+
+  // Restrictions for previous section.
+  static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0, "The elts per row must cleanly divide the total elt per warp");
+
+  // ===================== From this point, we finally start computing run-time variables. ========================
+
+  // Compute CTA and warp rows. We pack multiple rows into a single warp, and a block contains WARPS_PER_CTA warps.
+  // This, each block processes a chunk of rows. We start by computing the start row for each block.
+  const int cta_base_row = blockIdx.x * ROWS_PER_CTA;
+
+  // Now, using the base row per thread block, we compute the base row per warp.
+  const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP;
+
+  // The threads in a warp are split into sub-groups that will work on a row.
+  // We compute row offset for each thread sub-group
+  const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
+  const int thread_row = warp_base_row + thread_row_in_warp;
+
+  // Threads with indices out of bounds should early exit here.
+  if (thread_row >= num_rows) {
+    return;
+  }
+  const bool row_is_active = finished ? !finished[thread_row] : true;
+
+  // We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
+  // row it will read.
+  const T* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
+
+  // Now, we compute the group each thread belong to in order to determine the first column to start loads.
+  const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
+  const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
+  const T* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
+
+  // Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
+  // this can support all powers of 2 up to 16.
+  // NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
+  // We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
+  using AccessType = AlignedArray<T, ELTS_PER_LDG>;
+
+  // Finally, we pull in the data from global mem
+  T row_chunk_temp[VPT];
+  AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk_temp);
+  const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
+#pragma unroll
+  for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
+    row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
+  }
+
+  float row_chunk[VPT];
+#pragma unroll
+  for (int ii = 0; ii < VPT; ++ii) {
+    row_chunk[ii] = convert_to_float<T>(row_chunk_temp[ii]);
+  }
+
+  // First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
+  // convert to float afterwards for the exp + sum reduction.
+  float thread_max = row_chunk[0];
+#pragma unroll
+  for (int ii = 1; ii < VPT; ++ii) {
+    thread_max = max(thread_max, row_chunk[ii]);
+  }
+
+// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
+#pragma unroll
+  for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+    thread_max = max(thread_max, SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, thread_max, mask, THREADS_PER_ROW));
+  }
+
+  // From this point, thread max in all the threads have the max within the row.
+  // Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
+  float row_sum = 0;
+#pragma unroll
+  for (int ii = 0; ii < VPT; ++ii) {
+    row_chunk[ii] = expf(row_chunk[ii] - thread_max);
+    row_sum += row_chunk[ii];
+  }
+
+// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern.
+#pragma unroll
+  for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+    row_sum += SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, row_sum, mask, THREADS_PER_ROW);
+  }
+
+  // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
+  // respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
+  // compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
+  // However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
+  // argmax after computing the softmax.
+  const float reciprocal_row_sum = 1.f / row_sum;
+
+#pragma unroll
+  for (int ii = 0; ii < VPT; ++ii) {
+    row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
+  }
+
+  // Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
+  // with the max index.
+  int start_col = first_elt_read_by_thread;
+  static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
+
+  float row_sum_for_renormalize = 0;
+
+  for (int k_idx = 0; k_idx < k; ++k_idx) {
+    // First, each thread does the local argmax
+    float max_val = row_chunk[0];
+    int expert = start_col;
+#pragma unroll
+    for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD; ++ldg, col += COLS_PER_GROUP_LDG) {
+#pragma unroll
+      for (int ii = 0; ii < ELTS_PER_LDG; ++ii) {
+        float val = row_chunk[ldg * ELTS_PER_LDG + ii];
+
+        // No check on the experts here since columns with the smallest index are processed first and only
+        // updated if > (not >=)
+        if (val > max_val) {
+          max_val = val;
+          expert = col + ii;
+        }
+      }
+    }
+
+// Now, we perform the argmax reduce. We use the butterfly pattern so threads reach consensus about the max.
+// This will be useful for K > 1 so that the threads can agree on "who" had the max value. That thread can
+// then blank out their max with -inf and the warp can run more iterations...
+#pragma unroll
+    for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2) {
+      float other_max = SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, max_val, mask, THREADS_PER_ROW);
+      int other_expert = SGLANG_SHFL_XOR_SYNC_WIDTH(0xffffffff, expert, mask, THREADS_PER_ROW);
+
+      // We want lower indices to "win" in every thread so we break ties this way
+      if (other_max > max_val || (other_max == max_val && other_expert < expert)) {
+        max_val = other_max;
+        expert = other_expert;
+      }
+    }
+
+    // Write the max for this k iteration to global memory.
+    if (thread_group_idx == 0) {
+      // Add a guard to ignore experts not included by this node
+      const bool node_uses_expert = expert >= start_expert && expert < end_expert;
+      const bool should_process_row = row_is_active && node_uses_expert;
+
+      // The lead thread from each sub-group will write out the final results to global memory. (This will be a
+      // single) thread per row of the input/output matrices.
+      const int idx = k * thread_row + k_idx;
+      output[idx] = max_val;
+      indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
+      row_sum_for_renormalize += max_val;
+    }
+
+    // Finally, we clear the value in the thread with the current max if there is another iteration to run.
+    if (k_idx + 1 < k) {
+      const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG;
+      const int thread_to_clear_in_group = (expert / ELTS_PER_LDG) % THREADS_PER_ROW;
+
+      // Only the thread in the group which produced the max will reset the "winning" value to -inf.
+      if (thread_group_idx == thread_to_clear_in_group) {
+        const int offset_for_expert = expert % ELTS_PER_LDG;
+        // Safe to set to any negative value since row_chunk values must be between 0 and 1.
+        row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
+      }
+    }
+  }
+
+  // Fuse renormalization of topk_weights into this kernel
+  if (renormalize && thread_group_idx == 0) {
+    float row_sum_for_renormalize_inv = 1.f / row_sum_for_renormalize;
+#pragma unroll
+    for (int k_idx = 0; k_idx < k; ++k_idx) {
+      const int idx = k * thread_row + k_idx;
+      output[idx] = output[idx] * row_sum_for_renormalize_inv;
+    }
+  }
+}
+
+namespace detail {
+// Constructs some constants needed to partition the work across threads at compile time.
+template <typename T, int EXPERTS, int BYTES_PER_LDG>
+struct TopkConstants {
+  static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(T);
+  static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
+  static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
+  static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
+  static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
+  static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
+};
+}  // namespace detail
+
+template <typename T, int EXPERTS, int WARPS_PER_TB>
+void topkGatingSoftmaxLauncherHelper(
+    const T* input,
+    const bool* finished,
+    float* output,
+    int* indices,
+    const int num_rows,
+    const int k,
+    const int start_expert,
+    const int end_expert,
+    const bool renormalize,
+    hipStream_t stream) {
+  static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
+
+  static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(T) * EXPERTS);
+  using Constants = detail::TopkConstants<T, EXPERTS, BYTES_PER_LDG>;
+  static constexpr int VPT = Constants::VPT;
+  static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
+  const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
+  const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
+
+  dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
+ hipLaunchKernelGGL(( topkGatingSoftmax<T, VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG>), dim3(num_blocks), dim3(block_dim), 0, stream, 
+      input, finished, output, num_rows, indices, k, start_expert, end_expert, renormalize);
+}
+
+#define LAUNCH_SOFTMAX(TYPE, NUM_EXPERTS, WARPS_PER_TB)             \
+  topkGatingSoftmaxLauncherHelper<TYPE, NUM_EXPERTS, WARPS_PER_TB>( \
+      gating_output, nullptr, topk_weights, topk_indices, num_tokens, topk, 0, num_experts, renormalize, stream);
+
+template <typename T>
+void topkGatingSoftmaxKernelLauncher(
+    const T* gating_output,
+    float* topk_weights,
+    int* topk_indices,
+    float* softmax_workspace,
+    const int num_tokens,
+    const int num_experts,
+    const int topk,
+    const bool renormalize,
+    hipStream_t stream) {
+  static constexpr int WARPS_PER_TB = 4;
+  switch (num_experts) {
+    case 1:
+      LAUNCH_SOFTMAX(T, 1, WARPS_PER_TB);
+      break;
+    case 2:
+      LAUNCH_SOFTMAX(T, 2, WARPS_PER_TB);
+      break;
+    case 4:
+      LAUNCH_SOFTMAX(T, 4, WARPS_PER_TB);
+      break;
+    case 8:
+      LAUNCH_SOFTMAX(T, 8, WARPS_PER_TB);
+      break;
+    case 16:
+      LAUNCH_SOFTMAX(T, 16, WARPS_PER_TB);
+      break;
+    case 32:
+      LAUNCH_SOFTMAX(T, 32, WARPS_PER_TB);
+      break;
+    case 64:
+      LAUNCH_SOFTMAX(T, 64, WARPS_PER_TB);
+      break;
+    case 128:
+      LAUNCH_SOFTMAX(T, 128, WARPS_PER_TB);
+      break;
+    case 256:
+      LAUNCH_SOFTMAX(T, 256, WARPS_PER_TB);
+      break;
+    default: {
+      TORCH_CHECK(
+          softmax_workspace != nullptr,
+          "softmax_workspace must be provided for num_experts that are not a power of 2.");
+      static constexpr int TPB = 256;
+     hipLaunchKernelGGL(( moeSoftmax<T, TPB>), dim3(num_tokens), dim3(TPB), 0, stream, gating_output, nullptr, softmax_workspace, num_experts);
+     hipLaunchKernelGGL(( moeTopK<TPB>), dim3(num_tokens), dim3(TPB), 0, stream, 
+          softmax_workspace, nullptr, topk_weights, topk_indices, num_experts, topk, 0, num_experts, renormalize);
+    }
+  }
+}
+
+void topk_softmax(
+    torch::Tensor& topk_weights,  // [num_tokens, topk]
+    torch::Tensor& topk_indices,  // [num_tokens, topk]
+    torch::Tensor& gating_output,
+    const bool renormalize)  // [num_tokens, num_experts]
+{
+  // Check data type
+  TORCH_CHECK(
+      gating_output.scalar_type() == at::ScalarType::Float || gating_output.scalar_type() == at::ScalarType::Half ||
+          gating_output.scalar_type() == at::ScalarType::BFloat16,
+      "gating_output must be float32, float16, or bfloat16");
+
+  // Check dimensions
+  TORCH_CHECK(gating_output.dim() == 2, "gating_output must be 2D tensor [num_tokens, num_experts]");
+  TORCH_CHECK(topk_weights.dim() == 2, "topk_weights must be 2D tensor [num_tokens, topk]");
+  TORCH_CHECK(topk_indices.dim() == 2, "topk_indices must be 2D tensor [num_tokens, topk]");
+
+  // Check shapes
+  TORCH_CHECK(
+      gating_output.size(0) == topk_weights.size(0),
+      "First dimension of topk_weights must match num_tokens in gating_output");
+  TORCH_CHECK(
+      gating_output.size(0) == topk_indices.size(0),
+      "First dimension of topk_indices must match num_tokens in gating_output");
+  TORCH_CHECK(
+      topk_weights.size(-1) == topk_indices.size(-1),
+      "Second dimension of topk_indices must match topk in topk_weights");
+  TORCH_CHECK(topk_weights.size(-1) <= gating_output.size(-1), "topk must be less than or equal to num_experts");
+
+  const int num_experts = static_cast<int>(gating_output.size(-1));
+  const int num_tokens = static_cast<int>(gating_output.size(0));
+  const int topk = static_cast<int>(topk_weights.size(-1));
+
+  const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
+  const bool needs_workspace = !is_pow_2 || num_experts > 256;
+  const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
+
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(gating_output));
+  const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+  torch::Tensor softmax_workspace =
+      torch::empty({workspace_size}, gating_output.options().dtype(at::ScalarType::Float));
+
+  const at::ScalarType dtype = gating_output.scalar_type();
+  if (dtype == at::ScalarType::Float) {
+    topkGatingSoftmaxKernelLauncher<float>(
+        gating_output.data_ptr<float>(),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens,
+        num_experts,
+        topk,
+        renormalize,
+        stream);
+  } else if (dtype == at::ScalarType::Half) {
+    topkGatingSoftmaxKernelLauncher<__half>(
+        reinterpret_cast<const __half*>(gating_output.data_ptr<at::Half>()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens,
+        num_experts,
+        topk,
+        renormalize,
+        stream);
+  } else if (dtype == at::ScalarType::BFloat16) {
+    topkGatingSoftmaxKernelLauncher<__hip_bfloat16>(
+        reinterpret_cast<const __hip_bfloat16*>(gating_output.data_ptr<at::BFloat16>()),
+        topk_weights.data_ptr<float>(),
+        topk_indices.data_ptr<int>(),
+        softmax_workspace.data_ptr<float>(),
+        num_tokens,
+        num_experts,
+        topk,
+        renormalize,
+        stream);
+  } else {
+    TORCH_CHECK(false, "Unsupported gating_output dtype: ", dtype);
+  }
+}
diff --git a/sgl-kernel/csrc/moe/nvfp4_blockwise_moe.cu b/sgl-kernel/csrc/moe/nvfp4_blockwise_moe.cu
new file mode 100644
index 000000000..c68ca552d
--- /dev/null
+++ b/sgl-kernel/csrc/moe/nvfp4_blockwise_moe.cu
@@ -0,0 +1,471 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include <cutlass/arch/arch.h>
+#include <torch/all.h>
+
+#include <cassert>
+
+#include "cute/tensor.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/group_array_problem_shape.hpp"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/reference/host/gett.hpp"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/tensor_view_io.h"
+
+using namespace cute;
+
+template <
+    typename ElementAB,
+    typename ElementC,
+    typename ElementSF,
+    typename ElementAccumulator,
+    typename LayoutSFA,
+    typename LayoutSFB,
+    typename ScaleConfig>
+__global__ void __get_group_gemm_starts(
+    ElementAB** a_offsets,
+    ElementAB** b_offsets,
+    ElementC** out_offsets,
+    ElementSF** a_scales_offsets,
+    ElementSF** b_scales_offsets,
+    ElementAccumulator** alpha_offsets,
+    LayoutSFA* layout_sfa_base_as_int,
+    LayoutSFB* layout_sfb_base_as_int,
+    ElementAB* a_base_as_int,
+    ElementAB* b_base_as_int,
+    ElementC* out_base_as_int,
+    ElementSF* a_scales_base_as_int,
+    ElementSF* b_scales_base_as_int,
+    ElementAccumulator* alphas_base_as_int,
+    const int32_t* expert_offsets,
+    const int32_t* sf_offsets,
+    const int32_t* problem_sizes_as_shapes,
+    const int K,
+    const int N) {
+  int64_t expert_id = threadIdx.x;
+  if (expert_id >= gridDim.x * blockDim.x) {
+    return;
+  }
+  // Originally int32_t but upcasting to int64_t to avoid overflow
+  // during offset calculations
+  int64_t expert_offset = static_cast<int64_t>(expert_offsets[expert_id]);
+  int64_t sf_offset = static_cast<int64_t>(sf_offsets[expert_id]);
+  // size for block in block scale.
+  int64_t group_size = 16;
+  int64_t m = static_cast<int64_t>(problem_sizes_as_shapes[expert_id * 3]);
+  int64_t n = static_cast<int64_t>(problem_sizes_as_shapes[expert_id * 3 + 1]);
+  int64_t k = static_cast<int64_t>(problem_sizes_as_shapes[expert_id * 3 + 2]);
+  assert((m >= 0 && n == N && k == K && k % 2 == 0) && "unexpected problem sizes");
+
+  int64_t half_k = static_cast<int64_t>(k / 2);
+  int64_t group_k = static_cast<int64_t>(k / group_size);
+  // Shape of A as uint8/byte = [M, K // 2]
+  // Shape of B as uint8/byte = [E, N, K // 2]
+  a_offsets[expert_id] = a_base_as_int + expert_offset * half_k;
+
+  b_offsets[expert_id] = b_base_as_int + expert_id * n * half_k;
+  // Shape of C = [M, N]
+  out_offsets[expert_id] = out_base_as_int + expert_offset * n;
+  // Shape of a_scale = [sum(sf_sizes), K // group_size]
+  a_scales_offsets[expert_id] = a_scales_base_as_int + sf_offset * group_k;
+
+  assert((reinterpret_cast<uintptr_t>(a_scales_offsets[expert_id]) % 128) == 0 && "TMA requires 128-byte alignment");
+
+  // Shape of B scale = [E, N, K // group_size]
+  b_scales_offsets[expert_id] = b_scales_base_as_int + expert_id * n * group_k;
+  assert((reinterpret_cast<uintptr_t>(b_scales_offsets[expert_id]) % 128) == 0 && "TMA requires 128-byte alignment");
+  // Shape of alpha = [E]
+  alpha_offsets[expert_id] = alphas_base_as_int + expert_id;
+
+  LayoutSFA* layout_sfa_ptr = layout_sfa_base_as_int + expert_id;
+  LayoutSFB* layout_sfb_ptr = layout_sfb_base_as_int + expert_id;
+
+  *layout_sfa_ptr = ScaleConfig::tile_atom_to_shape_SFA(
+      cute::make_shape(static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), 1));
+  *layout_sfb_ptr = ScaleConfig::tile_atom_to_shape_SFB(
+      cute::make_shape(static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), 1));
+}
+
+#define __CALL_GET_STARTS_KERNEL_BLOCKSCALE(                                                            \
+    ELEMENT_AB_TYPE, SF_TYPE, TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB, ScaleConfig)                 \
+  else if (out_tensors.dtype() == TENSOR_C_TYPE) {                                                      \
+    __get_group_gemm_starts<ELEMENT_AB_TYPE, C_TYPE, SF_TYPE, float, LayoutSFA, LayoutSFB, ScaleConfig> \
+        <<<1, num_experts, 0, stream>>>(                                                                \
+            static_cast<ELEMENT_AB_TYPE**>(a_starts.data_ptr()),                                        \
+            static_cast<ELEMENT_AB_TYPE**>(b_starts.data_ptr()),                                        \
+            static_cast<C_TYPE**>(out_starts.data_ptr()),                                               \
+            static_cast<SF_TYPE**>(a_scales_starts.data_ptr()),                                         \
+            static_cast<SF_TYPE**>(b_scales_starts.data_ptr()),                                         \
+            static_cast<float**>(alpha_starts.data_ptr()),                                              \
+            reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),                                        \
+            reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr()),                                        \
+            static_cast<ELEMENT_AB_TYPE*>(a_tensors.data_ptr()),                                        \
+            static_cast<ELEMENT_AB_TYPE*>(b_tensors.data_ptr()),                                        \
+            static_cast<C_TYPE*>(out_tensors.data_ptr()),                                               \
+            static_cast<SF_TYPE*>(a_scales.data_ptr()),                                                 \
+            static_cast<SF_TYPE*>(b_scales.data_ptr()),                                                 \
+            static_cast<float*>(alphas.data_ptr()),                                                     \
+            static_cast<int32_t*>(expert_offsets.data_ptr()),                                           \
+            static_cast<int32_t*>(sf_offsets.data_ptr()),                                               \
+            static_cast<int32_t*>(problem_sizes.data_ptr()),                                            \
+            K,                                                                                          \
+            N);                                                                                         \
+  }
+
+template <typename LayoutSFA, typename LayoutSFB, typename ScaleConfig>
+void run_get_group_gemm_starts(
+    const torch::Tensor& a_starts,
+    const torch::Tensor& b_starts,
+    const torch::Tensor& out_starts,
+    const torch::Tensor& a_scales_starts,
+    const torch::Tensor& b_scales_starts,
+    const torch::Tensor& alpha_starts,
+    const torch::Tensor& layout_sfa,
+    const torch::Tensor& layout_sfb,
+    /*these are used for their base addresses*/
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& out_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& alphas,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& sf_offsets,
+    torch::Tensor const& problem_sizes,
+    int M,
+    int N,
+    int K) {
+  int num_experts = (int)expert_offsets.size(0);
+  auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
+
+  TORCH_CHECK(out_tensors.size(1) == N, "Output tensor shape doesn't match expected shape");
+  TORCH_CHECK(
+      K / 2 == b_tensors.size(2),
+      "b_tensors(dim = 2) and a_tensors(dim = 1) trailing"
+      " dimension must match");
+  if (false) {
+  }
+  //(ELEMENT_AB_TYPE, BS_TYPE, TENSOR_C_TYPE, C_TYPE, LayoutSFA, LayoutSFB,
+  // ScaleConfig)
+  __CALL_GET_STARTS_KERNEL_BLOCKSCALE(
+      cutlass::float_e2m1_t,
+      cutlass::float_ue4m3_t,
+      torch::kBFloat16,
+      cutlass::bfloat16_t,
+      LayoutSFA,
+      LayoutSFB,
+      ScaleConfig)
+  __CALL_GET_STARTS_KERNEL_BLOCKSCALE(
+      cutlass::float_e2m1_t, cutlass::float_ue4m3_t, torch::kFloat16, half, LayoutSFA, LayoutSFB, ScaleConfig)
+  else {
+    TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)");
+  }
+}
+
+template <typename OutType>
+void run_fp4_blockwise_scaled_group_mm(
+    torch::Tensor& output,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& a_blockscale,
+    const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas,
+    const torch::Tensor& ab_strides,
+    const torch::Tensor& c_strides,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& sf_offsets,
+    int M,
+    int N,
+    int K) {
+  using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
+  using ElementType = cutlass::float_e2m1_t;
+  using ElementSFType = cutlass::float_ue4m3_t;
+  using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+  using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
+
+  using ElementC = OutType;
+  using ElementD = ElementC;
+  using ElementAccumulator = float;
+  // Layout definitions
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = LayoutC;
+
+  // Alignment constraints
+  static constexpr int AlignmentA = 32;
+  static constexpr int AlignmentB = 32;
+  static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
+  static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+
+  // Architecture definitions
+  using ArchTag = cutlass::arch::Sm100;
+  using EpilogueOperatorClass = cutlass::arch::OpClassTensorOp;             // Epilogue Operator class tag
+  using MainloopOperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;  // Mainloop Operator class tag
+  using StageCountType = cutlass::gemm::collective::StageCountAuto;         // Stage count maximized based
+                                                                            // on the tile size
+
+  using ClusterShape = Shape<_1, _1, _1>;
+  struct MMA1SMConfig {
+    using MmaTileShape = Shape<_128, _128, _128>;
+    using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmNvf4Sm100;  // Kernel to launch
+    using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm;           // Epilogue to launch
+  };
+
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      ArchTag,
+      EpilogueOperatorClass,
+      typename MMA1SMConfig::MmaTileShape,
+      ClusterShape,
+      Shape<_128, _64>,
+      ElementAccumulator,
+      ElementAccumulator,
+      ElementC,
+      LayoutC*,
+      AlignmentC,
+      ElementD,
+      LayoutC*,
+      AlignmentD,
+      typename MMA1SMConfig::EpilogueSchedule>::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      ArchTag,
+      MainloopOperatorClass,
+      ElementA,
+      LayoutA*,
+      AlignmentA,
+      ElementB,
+      LayoutB*,
+      AlignmentB,
+      ElementAccumulator,
+      typename MMA1SMConfig::MmaTileShape,
+      ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+          sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      typename MMA1SMConfig::KernelSchedule>::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
+
+  using Gemm1SM = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  using Gemm = Gemm1SM;
+  using StrideA = typename Gemm::GemmKernel::InternalStrideA;
+  using StrideB = typename Gemm::GemmKernel::InternalStrideB;
+  using StrideC = typename Gemm::GemmKernel::InternalStrideC;
+  using StrideD = typename Gemm::GemmKernel::InternalStrideD;
+
+  using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
+  using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
+  using ScaleConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
+
+  using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
+  int num_experts = static_cast<int>(expert_offsets.size(0));
+  auto options_int = torch::TensorOptions().dtype(torch::kInt64).device(a.device());
+
+  torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
+  torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
+  torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
+
+  run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
+      a_ptrs,
+      b_ptrs,
+      out_ptrs,
+      a_scales_ptrs,
+      b_scales_ptrs,
+      alpha_ptrs,
+      layout_sfa,
+      layout_sfb,
+      a,
+      b,
+      output,
+      a_blockscale,
+      b_blockscales,
+      alphas,
+      expert_offsets,
+      sf_offsets,
+      problem_sizes,
+      M,
+      N,
+      K);
+
+  // Create an instance of the GEMM
+  Gemm gemm_op;
+
+  // Initialize problem_sizes_as_shapes correctly
+  UnderlyingProblemShape* problem_sizes_as_shapes = static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
+
+  // Set the Scheduler info
+  cutlass::KernelHardwareInfo hw_info;
+  using RasterOrderOptions = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm100GroupParams<
+      typename ProblemShape::UnderlyingProblemShape>::RasterOrderOptions;
+  typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
+  scheduler.raster_order = RasterOrderOptions::AlongM;
+  hw_info.device_id = a.get_device();
+  static std::unordered_map<int, int> cached_sm_counts;
+  if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
+    cached_sm_counts[hw_info.device_id] =
+        cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+  }
+  hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
+
+  // Mainloop Arguments
+  typename GemmKernel::MainloopArguments mainloop_args{
+      static_cast<const ElementType**>(a_ptrs.data_ptr()),
+      static_cast<StrideA*>(ab_strides.data_ptr()),
+      static_cast<const ElementType**>(b_ptrs.data_ptr()),
+      static_cast<StrideB*>(ab_strides.data_ptr()),
+      static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
+      reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
+      static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
+      reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
+
+  // Epilogue Arguments
+  typename GemmKernel::EpilogueArguments epilogue_args{
+      {},  // epilogue.thread
+      nullptr,
+      static_cast<StrideC*>(c_strides.data_ptr()),
+      static_cast<ElementD**>(out_ptrs.data_ptr()),
+      static_cast<StrideC*>(c_strides.data_ptr())};
+  auto& fusion_args = epilogue_args.thread;
+  fusion_args.alpha_ptr_array = reinterpret_cast<float**>(alpha_ptrs.data_ptr());
+  fusion_args.dAlpha = {_0{}, _0{}, 1};
+
+  // Gemm Arguments
+  typename GemmKernel::Arguments args{
+      cutlass::gemm::GemmUniversalMode::kGrouped,
+      {num_experts, problem_sizes_as_shapes, nullptr},
+      mainloop_args,
+      epilogue_args,
+      hw_info,
+      scheduler};
+
+  size_t workspace_size = Gemm::get_workspace_size(args);
+  auto const workspace_options = torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
+  auto workspace = torch::empty(workspace_size, workspace_options);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
+
+  auto can_implement_status = gemm_op.can_implement(args);
+  TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess, "Failed to implement GEMM");
+
+  // Run the GEMM
+  auto status = gemm_op.initialize(args, workspace.data_ptr());
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
+
+  status = gemm_op.run(args, workspace.data_ptr(), stream);
+  TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
+}
+
+#define CHECK_TYPE(x, st, m) TORCH_CHECK(x.scalar_type() == st, ": Inconsistency of Tensor type:", m)
+#define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, ": must be a CUDA tensor.")
+#define CHECK_CONTIGUOUS(x, m) TORCH_CHECK(x.is_contiguous(), m, ": must be contiguous.")
+#define CHECK_INPUT(x, st, m) \
+  CHECK_TH_CUDA(x, m);        \
+  CHECK_CONTIGUOUS(x, m);     \
+  CHECK_TYPE(x, st, m)
+
+void cutlass_fp4_group_mm(
+    torch::Tensor& output,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& a_blockscale,
+    const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas,
+    const torch::Tensor& ab_strides,
+    const torch::Tensor& c_strides,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& sf_offsets) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+
+  constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
+  constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
+  // Input validation
+  CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
+  CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
+  CHECK_INPUT(a_blockscale, SF_DTYPE, "a_blockscale");
+  CHECK_INPUT(b_blockscales, SF_DTYPE, "b_blockscales");
+  CHECK_INPUT(alphas, at::ScalarType::Float, "alphas");
+
+  TORCH_CHECK(
+      a_blockscale.dim() == 2,
+      "expected a_blockscale to be of shape [num_experts, rounded_m,"
+      " k // group_size], observed rank: ",
+      a_blockscale.dim())
+  TORCH_CHECK(
+      b_blockscales.dim() == 3,
+      "expected b_blockscale to be of shape: "
+      " [num_experts, n, k // group_size], observed rank: ",
+      b_blockscales.dim())
+  TORCH_CHECK(problem_sizes.dim() == 2, "problem_sizes must be  a 2D tensor");
+  TORCH_CHECK(problem_sizes.size(1) == 3, "problem_sizes must have the shape (num_experts, 3)");
+  TORCH_CHECK(
+      problem_sizes.size(0) == expert_offsets.size(0), "Number of experts in problem_sizes must match expert_offsets");
+  TORCH_CHECK(problem_sizes.dtype() == torch::kInt32, "problem_sizes must be int32.");
+
+  int M = static_cast<int>(a.size(0));
+  int N = static_cast<int>(b.size(1));
+  int E = static_cast<int>(b.size(0));
+  int K = static_cast<int>(2 * b.size(2));
+
+  if (output.scalar_type() == torch::kBFloat16) {
+    run_fp4_blockwise_scaled_group_mm<cutlass::bfloat16_t>(
+        output,
+        a,
+        b,
+        a_blockscale,
+        b_blockscales,
+        alphas,
+        ab_strides,
+        c_strides,
+        problem_sizes,
+        expert_offsets,
+        sf_offsets,
+        M,
+        N,
+        K);
+  } else {
+    run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
+        output,
+        a,
+        b,
+        a_blockscale,
+        b_blockscales,
+        alphas,
+        ab_strides,
+        c_strides,
+        problem_sizes,
+        expert_offsets,
+        sf_offsets,
+        M,
+        N,
+        K);
+  }
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "No compiled cutlass_fp4_group_mm kernel, sgl-kernel must "
+      "be compiled with ENABLE_NVFP4 for SM100+ and CUDA "
+      "12.8 or above.");
+#endif
+}
diff --git a/sgl-kernel/csrc/moe/prepare_moe_input.cu b/sgl-kernel/csrc/moe/prepare_moe_input.cu
new file mode 100644
index 000000000..46fff0649
--- /dev/null
+++ b/sgl-kernel/csrc/moe/prepare_moe_input.cu
@@ -0,0 +1,392 @@
+#include <c10/cuda/CUDAGuard.h>
+#include <cudaTypedefs.h>
+#include <torch/all.h>
+
+#include <flashinfer/vec_dtypes.cuh>
+#include <iostream>
+
+#include "cutlass/array.h"
+#include "utils.h"
+
+constexpr uint64_t THREADS_PER_EXPERT = 512;
+
+__global__ void compute_problem_sizes(
+    const int* __restrict__ topk_ids,
+    int32_t* problem_sizes1,
+    int32_t* problem_sizes2,
+    int32_t* atomic_buffer,
+    const int64_t topk_length,
+    const int64_t n,
+    const int64_t k) {
+  int expert_id = blockIdx.x;
+
+  int occurrences = 0;
+  for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
+    occurrences += (topk_ids[i] == expert_id);
+  }
+  atomicAdd(&atomic_buffer[expert_id], occurrences);
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    int final_occurrences = atomic_buffer[expert_id];
+    problem_sizes1[expert_id * 3] = final_occurrences;
+    problem_sizes1[expert_id * 3 + 1] = static_cast<int32_t>(2 * n);
+    problem_sizes1[expert_id * 3 + 2] = static_cast<int32_t>(k);
+    problem_sizes2[expert_id * 3] = final_occurrences;
+    problem_sizes2[expert_id * 3 + 1] = static_cast<int32_t>(k);
+    problem_sizes2[expert_id * 3 + 2] = static_cast<int32_t>(n);
+  }
+}
+
+__global__ void compute_expert_offsets(
+    const int32_t* __restrict__ problem_sizes1,
+    int32_t* expert_offsets,
+    int32_t* atomic_buffer,
+    const int64_t num_experts) {
+  int32_t tot_offset = 0;
+  expert_offsets[0] = 0;
+  for (int i = 0; i < num_experts; ++i) {
+    atomic_buffer[i] = tot_offset;
+    tot_offset += problem_sizes1[i * 3];
+    expert_offsets[i + 1] = tot_offset;
+  }
+}
+
+__global__ void compute_expert_blockscale_offsets(
+    const int32_t* __restrict__ problem_sizes1,
+    int32_t* expert_offsets,
+    int32_t* blockscale_offsets,
+    int32_t* atomic_buffer,
+    const int64_t num_experts) {
+  int32_t tot_offset = 0;
+  int32_t tot_rounded_offset = 0;
+  expert_offsets[0] = 0;
+  blockscale_offsets[0] = 0;
+  for (int i = 0; i < num_experts; ++i) {
+    atomic_buffer[i] = tot_offset;
+    int num_tokens = problem_sizes1[i * 3];
+    int rounded_num_tokens = (num_tokens + (128 - 1)) / 128 * 128;
+    tot_offset += num_tokens;
+    tot_rounded_offset += rounded_num_tokens;
+    expert_offsets[i + 1] = tot_offset;
+    blockscale_offsets[i + 1] = tot_rounded_offset;
+  }
+}
+
+__global__ void compute_arg_sorts(
+    const int32_t* __restrict__ topk_ids,
+    int32_t* input_permutation,
+    int32_t* output_permutation,
+    int32_t* atomic_buffer,
+    const int64_t topk_length,
+    const int64_t topk) {
+  int expert_id = blockIdx.x;
+
+  for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
+    if (topk_ids[i] == expert_id) {
+      int start = atomicAdd(&atomic_buffer[expert_id], 1);
+      input_permutation[start] = i / topk;
+      output_permutation[i] = start;
+    }
+  }
+}
+
+void get_moe_prepare_input_caller(
+    const torch::Tensor& topk_ids,
+    torch::Tensor& expert_offsets,
+    const std::optional<torch::Tensor>& blockscale_offsets,
+    torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation,
+    torch::Tensor& output_permutation,
+    const int64_t num_experts,
+    const int64_t n,
+    const int64_t k) {
+  auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index());
+  auto options_int32 = torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device());
+  torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32);
+
+  uint32_t num_threads = static_cast<uint32_t>(min(THREADS_PER_EXPERT, topk_ids.numel()));
+  uint32_t num_blocks = static_cast<uint32_t>(num_experts);
+
+  compute_problem_sizes<<<num_blocks, num_threads, 0, stream>>>(
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<int32_t*>(problem_sizes1.data_ptr()),
+      static_cast<int32_t*>(problem_sizes2.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()),
+      topk_ids.numel(),
+      n,
+      k);
+  if (blockscale_offsets.has_value()) {
+    compute_expert_blockscale_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(blockscale_offsets.value().data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()),
+        num_experts);
+  } else {
+    compute_expert_offsets<<<1, 1, 0, stream>>>(
+        static_cast<const int32_t*>(problem_sizes1.data_ptr()),
+        static_cast<int32_t*>(expert_offsets.data_ptr()),
+        static_cast<int32_t*>(atomic_buffer.data_ptr()),
+        num_experts);
+  }
+  compute_arg_sorts<<<num_blocks, num_threads, 0, stream>>>(
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<int32_t*>(input_permutation.data_ptr()),
+      static_cast<int32_t*>(output_permutation.data_ptr()),
+      static_cast<int32_t*>(atomic_buffer.data_ptr()),
+      topk_ids.numel(),
+      topk_ids.size(1));
+}
+
+void prepare_moe_input(
+    const torch::Tensor& topk_ids,
+    torch::Tensor& expert_offsets,
+    const std::optional<torch::Tensor>& blockscale_offsets,
+    torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation,
+    torch::Tensor& output_permutation,
+    const int64_t num_experts,
+    const int64_t n,
+    const int64_t k) {
+  TORCH_CHECK(topk_ids.dtype() == torch::kInt32);
+  get_moe_prepare_input_caller(
+      topk_ids,
+      expert_offsets,
+      blockscale_offsets,
+      problem_sizes1,
+      problem_sizes2,
+      input_permutation,
+      output_permutation,
+      num_experts,
+      n,
+      k);
+  return;
+}
+
+template <typename T>
+__global__ void shuffleRowsKernel(
+    const T* input,
+    const int32_t* dst2src_map,
+    T* output,
+    int64_t num_src_rows,
+    int64_t num_dst_rows,
+    int64_t num_cols) {
+  int64_t dest_row_idx = blockIdx.x;
+  int64_t const source_row_idx = dst2src_map[dest_row_idx];
+
+  if (blockIdx.x < num_dst_rows) {
+    // Load 128-bits per thread
+    constexpr uint64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
+    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
+
+    // Duplicate and permute rows
+    auto const* source_row_ptr = reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
+    auto* dest_row_ptr = reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
+
+    auto const start_offset = threadIdx.x;
+    auto const stride = blockDim.x;
+    auto const num_elems_in_col = num_cols / ELEM_PER_THREAD;
+
+    for (auto elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride) {
+      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
+    }
+  }
+}
+
+#define DECLARE_SHUFFLE_ROWS(T)      \
+  __global__ void shuffleRowsKernel( \
+      const T* input,                \
+      const int32_t* dst2src_map,    \
+      T* output,                     \
+      int64_t num_src_rows,          \
+      int64_t num_dest_rows,         \
+      int64_t num_cols);
+
+DECLARE_SHUFFLE_ROWS(float);
+DECLARE_SHUFFLE_ROWS(half);
+DECLARE_SHUFFLE_ROWS(__nv_bfloat16);
+DECLARE_SHUFFLE_ROWS(__nv_fp8_e4m3);
+DECLARE_SHUFFLE_ROWS(uint8_t);
+
+#define SHUFFLE_ROWS(T)                                    \
+  shuffleRowsKernel<T><<<blocks, threads, 0, stream>>>(    \
+      reinterpret_cast<const T*>(input),                   \
+      static_cast<const int32_t*>(dst2src_map.data_ptr()), \
+      reinterpret_cast<T*>(output),                        \
+      num_src_rows,                                        \
+      num_dst_rows,                                        \
+      num_cols)
+
+#define DTYPE_DISPATCH_CASE(T, CUDA_T) \
+  case T:                              \
+    SHUFFLE_ROWS(CUDA_T);              \
+    break;
+
+void shuffle_rows_caller(
+    const torch::Tensor& input_tensor, const torch::Tensor& dst2src_map, torch::Tensor& output_tensor) {
+  TORCH_CHECK(
+      input_tensor.scalar_type() == output_tensor.scalar_type(),
+      "Input and output tensors must have the same data type");
+  auto stream = at::cuda::getCurrentCUDAStream().stream();
+  uint32_t blocks = static_cast<uint32_t>(output_tensor.size(0));
+  uint32_t threads = 256;
+  int64_t num_dst_rows = output_tensor.size(0);
+  int64_t num_src_rows = input_tensor.size(0);
+  int64_t num_cols = input_tensor.size(1);
+  const void* input = input_tensor.data_ptr();
+  void* output = output_tensor.data_ptr();
+  switch (input_tensor.scalar_type()) {
+    DTYPE_DISPATCH_CASE(torch::kFloat16, half);
+    DTYPE_DISPATCH_CASE(torch::kBFloat16, __nv_bfloat16);
+    DTYPE_DISPATCH_CASE(torch::kFloat32, float);
+    DTYPE_DISPATCH_CASE(torch::kFloat8_e4m3fn, __nv_fp8_e4m3);
+    DTYPE_DISPATCH_CASE(torch::kUInt8, uint8_t);
+    default:
+      TORCH_CHECK(false, "[moe replicate input] data type dispatch fail!");
+  }
+  return;
+}
+
+void shuffle_rows(const torch::Tensor& input_tensor, const torch::Tensor& dst2src_map, torch::Tensor& output_tensor) {
+  shuffle_rows_caller(input_tensor, dst2src_map, output_tensor);
+  return;
+}
+
+template <typename scalar_t>
+__global__ void apply_shuffle_mul_sum_kernel(
+    const scalar_t* __restrict__ input_tensor,  // [m * topk, k] (expert-major layout)
+    scalar_t* __restrict__ output_tensor,       // [m, k] (token-major layout)
+    const int32_t* __restrict__ permutation,    // [m * topk] (c_map: token-major-idx -> expert-major-idx)
+    int m,
+    int topk,
+    int row_stride,
+    const scalar_t* __restrict__ factors)  // [m * topk] (topk_weights, token-major layout)
+{
+  int i = blockIdx.x;
+  if (i >= m) {
+    return;
+  }
+
+  constexpr uint32_t vec_size = 16 / sizeof(scalar_t);
+  using t = float;
+  using vec_t = flashinfer::vec_t<t, vec_size>;
+  int thread_idx = threadIdx.x;
+  int stride = blockDim.x;
+
+  for (int d_vec_idx = thread_idx; d_vec_idx < row_stride / vec_size; d_vec_idx += stride) {
+    int d = d_vec_idx * vec_size;
+    vec_t sum_vec;
+    sum_vec.fill(0.0f);
+
+    for (int j = 0; j < topk; ++j) {
+      int token_major_idx = i * topk + j;
+      int src_row = permutation[token_major_idx];
+
+      vec_t val_vec;
+      val_vec.cast_load(input_tensor + src_row * row_stride + d);
+
+      t factor = 1.0;
+      if (factors != nullptr) {
+        factor = factors[token_major_idx];
+      }
+
+#pragma unroll
+      for (int k = 0; k < vec_size; ++k) {
+        sum_vec[k] += factor * val_vec[k];
+      }
+    }
+    sum_vec.cast_store(output_tensor + i * row_stride + d);
+  }
+
+  //  remainder part
+  int remainder_start = (row_stride / vec_size) * vec_size;
+  for (int d = remainder_start + thread_idx; d < row_stride; d += stride) {
+    t sum_val = 0.0;
+    for (int j = 0; j < topk; ++j) {
+      int token_major_idx = i * topk + j;
+      int src_row = permutation[token_major_idx];
+      t val = input_tensor[src_row * row_stride + d];
+
+      t factor = 1.0;
+      if (factors != nullptr) {
+        factor = factors[token_major_idx];
+      }
+      sum_val += factor * val;
+    }
+    output_tensor[i * row_stride + d] = sum_val;
+  }
+}
+
+void get_apply_shuffle_mul_sum_caller(
+    const torch::Tensor& input_tensor,                // [m * topk, row_stride], bf16/f16
+    torch::Tensor& output_tensor,                     // [m, row_stride], bf16/f16
+    const torch::Tensor& permutation,                 // [m * topk], int32
+    const std::optional<torch::Tensor>& factors_opt)  // optional [m * topk], bf16/f16
+{
+  TORCH_CHECK(input_tensor.dim() == 2, "input_tensor must be 2D [m * topk, row_stride]");
+  TORCH_CHECK(output_tensor.dim() == 2, "output_tensor must be 2D [m, row_stride]");
+  TORCH_CHECK(permutation.dim() == 1, "permutation must be 1D [m * topk]");
+
+  int m = output_tensor.size(0);
+  int topk = int(permutation.size(0) / m);
+  int row_stride = output_tensor.size(1);
+
+  TORCH_CHECK(permutation.size(0) == m * topk, "permutation size must match m * topk");
+
+  auto scalar_type = output_tensor.scalar_type();
+  uint32_t vec_size = 16 / sizeof(scalar_type);
+  auto blockDim = std::min(row_stride / vec_size, 1024U);
+  dim3 block(blockDim);
+
+  dim3 grid(m);  // blockIdx.x = j, blockIdx.y = i
+  auto stream = at::cuda::getCurrentCUDAStream(input_tensor.device().index());
+
+  const int32_t* perm_ptr = permutation.data_ptr<int32_t>();
+
+  void* factors_ptr = nullptr;
+  if (factors_opt.has_value()) {
+    TORCH_CHECK(factors_opt->dtype() == output_tensor.dtype(), "Factors must match output dtype");
+    TORCH_CHECK(factors_opt->numel() == m * topk, "Factors must have shape [m * topk]");
+    factors_ptr = factors_opt->data_ptr();
+  }
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(output_tensor.scalar_type(), scalar_t, [&] {
+    apply_shuffle_mul_sum_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        static_cast<const scalar_t*>(input_tensor.data_ptr()),
+        static_cast<scalar_t*>(output_tensor.data_ptr()),
+        perm_ptr,
+        m,
+        topk,
+        row_stride,
+        static_cast<const scalar_t*>(factors_ptr));
+    return true;
+  });
+}
+
+/**
+ * @brief Applies a permutation-based shuffle, element-wise multiplication, and reduction over the second dimension.
+ *
+ * This function performs the equivalent of the following PyTorch expression:
+ *
+ *     (c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)).sum(dim=1)
+ *
+ * Specifically:
+ * - `input` is shuffled using the `permutation` tensor.
+ * - The shuffled tensor is reshaped and multiplied element-wise with `factors` (e.g., top-k weights).
+ * - The result is summed along dimension 1 (the top-k dimension), and stored in `output`.
+ *
+ * @param input        Input tensor of shape (m * topk, k), representing c2.
+ * @param output       Output tensor of shape (m, k), where the final reduced results are stored.
+ * @param permutation  Index tensor (e.g., c_map) that maps positions in `input` to shuffled layout.
+ * @param factors      Optional scaling factors (e.g., top-k weights), shape (m * topk) or (m, topk).
+ */
+void apply_shuffle_mul_sum(
+    const torch::Tensor& input,
+    torch::Tensor& output,
+    const torch::Tensor& permutation,
+    const std::optional<torch::Tensor>& factors) {
+  get_apply_shuffle_mul_sum_caller(input, output, permutation, factors);
+}
diff --git a/sgl-kernel/csrc/spatial/cuda_utils.h b/sgl-kernel/csrc/spatial/cuda_utils.h
new file mode 100644
index 000000000..65132975f
--- /dev/null
+++ b/sgl-kernel/csrc/spatial/cuda_utils.h
@@ -0,0 +1,46 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+#define CUDA_RT(call)                                                                                        \
+  do {                                                                                                       \
+    cudaError_t _status = (call);                                                                            \
+    if (_status != cudaSuccess) {                                                                            \
+      std::cerr << "ERROR: CUDA RT call \"" << #call << "\" in line " << __LINE__ << " of file " << __FILE__ \
+                << " failed with " << cudaGetErrorString(_status) << std::endl;                              \
+      TORCH_CHECK(                                                                                           \
+          false,                                                                                             \
+          c10::str(                                                                                          \
+              "ERROR: CUDA RT call \"",                                                                      \
+              #call,                                                                                         \
+              "\" in line ",                                                                                 \
+              __LINE__,                                                                                      \
+              " of file ",                                                                                   \
+              __FILE__,                                                                                      \
+              " failed with ",                                                                               \
+              cudaGetErrorString(_status)));                                                                 \
+    }                                                                                                        \
+  } while (0)
+
+#define CUDA_DRV(call)                                                                                        \
+  do {                                                                                                        \
+    CUresult _status = (call);                                                                                \
+    if (_status != CUDA_SUCCESS) {                                                                            \
+      const char* err_str;                                                                                    \
+      cuGetErrorString(_status, &err_str);                                                                    \
+      std::cerr << "ERROR: CUDA DRV call \"" << #call << "\" in line " << __LINE__ << " of file " << __FILE__ \
+                << " failed with " << err_str << std::endl;                                                   \
+      TORCH_CHECK(                                                                                            \
+          false,                                                                                              \
+          c10::str(                                                                                           \
+              "ERROR: CUDA DRV call \"",                                                                      \
+              #call,                                                                                          \
+              "\" in line ",                                                                                  \
+              __LINE__,                                                                                       \
+              " of file ",                                                                                    \
+              __FILE__,                                                                                       \
+              " failed with ",                                                                                \
+              err_str));                                                                                      \
+    }                                                                                                         \
+  } while (0)
diff --git a/sgl-kernel/csrc/spatial/greenctx_stream.cu b/sgl-kernel/csrc/spatial/greenctx_stream.cu
new file mode 100644
index 000000000..0366565ef
--- /dev/null
+++ b/sgl-kernel/csrc/spatial/greenctx_stream.cu
@@ -0,0 +1,98 @@
+// Documentation: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
+#include <torch/all.h>
+
+#include <cstdlib>
+
+#include "cuda_utils.h"
+#include "greenctx_stream.h"
+
+static int CUDA_DRIVER_VERSION;
+
+using PFN_cuGreenCtxStreamCreate = CUresult(CUDAAPI*)(CUstream*, CUgreenCtx, unsigned int, int);
+
+auto probe_cuGreenCtxStreamCreate() -> PFN_cuGreenCtxStreamCreate {
+  static PFN_cuGreenCtxStreamCreate pfn = nullptr;
+  CUDA_DRV(cuGetProcAddress("cuGreenCtxStreamCreate", reinterpret_cast<void**>(&pfn), CUDA_DRIVER_VERSION, 0, nullptr));
+  return pfn;
+}
+
+static std::vector<int64_t> create_greenctx_stream_fallback(CUgreenCtx gctx[2]) {
+  CUstream streamA, streamB;
+  CUcontext ctx;
+
+  CUDA_DRV(cuCtxFromGreenCtx(&ctx, gctx[0]));
+  CUDA_DRV(cuCtxPushCurrent(ctx));
+  CUDA_DRV(cuStreamCreate(&streamA, CU_STREAM_NON_BLOCKING));
+  CUDA_DRV(cuCtxPopCurrent(nullptr));
+
+  CUDA_DRV(cuCtxFromGreenCtx(&ctx, gctx[1]));
+  CUDA_DRV(cuCtxPushCurrent(ctx));
+  CUDA_DRV(cuStreamCreate(&streamB, CU_STREAM_NON_BLOCKING));
+  CUDA_DRV(cuCtxPopCurrent(nullptr));
+
+  return {(int64_t)streamA, (int64_t)streamB};
+}
+
+inline void destroy_green_context(CUgreenCtx gctx) {
+  if (!gctx) return;
+  CUDA_DRV(cuGreenCtxDestroy(gctx));
+}
+
+static std::vector<int64_t> create_greenctx_stream_direct_dynamic(CUgreenCtx gctx[2]) {
+  // This symbol is introduced in CUDA 12.5
+  const static auto pfn = probe_cuGreenCtxStreamCreate();
+  if (!pfn) {
+    TORCH_WARN("cuGreenCtxStreamCreate(cuda>=12.5) is not available, using fallback");
+    return create_greenctx_stream_fallback(gctx);
+  }
+
+  CUstream streamA, streamB;
+  CUDA_DRV(pfn(&streamA, gctx[0], CU_STREAM_NON_BLOCKING, 0));
+  CUDA_DRV(pfn(&streamB, gctx[1], CU_STREAM_NON_BLOCKING, 0));
+
+  return {(int64_t)streamA, (int64_t)streamB};
+}
+
+std::vector<int64_t> create_greenctx_stream_by_value(int64_t smA, int64_t smB, int64_t device) {
+  CUDA_DRV(cuDriverGetVersion(&CUDA_DRIVER_VERSION));
+
+  CUgreenCtx gctx[3];
+  CUdevResourceDesc desc[3];
+  CUdevResource input;
+  CUdevResource resources[4];
+
+  TORCH_CHECK(smA > 0 && smB > 0, "SM counts must be positive");
+
+  CUDA_DRV(cuDeviceGetDevResource((CUdevice)device, &input, CU_DEV_RESOURCE_TYPE_SM));
+
+  const unsigned minCount = static_cast<unsigned>(smA + smB);
+  const unsigned minCountA = static_cast<unsigned>(smA);
+  TORCH_CHECK(minCount <= input.sm.smCount, "Not enough SMs available for the requested configuration");
+
+  unsigned nbGroups = 1;
+  CUDA_DRV(cuDevSmResourceSplitByCount(&resources[2], &nbGroups, &input, &resources[3], 0, minCount));
+  CUDA_DRV(cuDevResourceGenerateDesc(&desc[2], &resources[2], 1));
+  CUDA_DRV(cuGreenCtxCreate(&gctx[2], desc[2], (CUdevice)device, CU_GREEN_CTX_DEFAULT_STREAM));
+  CUDA_DRV(cuGreenCtxGetDevResource(gctx[2], &input, CU_DEV_RESOURCE_TYPE_SM));
+  nbGroups = 1;
+  CUDA_DRV(cuDevSmResourceSplitByCount(&resources[0], &nbGroups, &input, &resources[1], 0, minCountA));
+  CUDA_DRV(cuDevResourceGenerateDesc(&desc[0], &resources[0], 1));
+  CUDA_DRV(cuGreenCtxCreate(&gctx[0], desc[0], (CUdevice)device, CU_GREEN_CTX_DEFAULT_STREAM));
+  CUDA_DRV(cuDevResourceGenerateDesc(&desc[1], &resources[1], 1));
+  CUDA_DRV(cuGreenCtxCreate(&gctx[1], desc[1], (CUdevice)device, CU_GREEN_CTX_DEFAULT_STREAM));
+
+  const int smCountA = resources[0].sm.smCount;
+  const int smCountB = resources[1].sm.smCount;
+
+  std::vector<int64_t> streams = create_greenctx_stream_direct_dynamic(gctx);
+
+  destroy_green_context(gctx[2]);
+
+  std::vector<int64_t> vec = {
+      streams[0],  // streamA
+      streams[1],  // streamB
+      (int64_t)smCountA,
+      (int64_t)smCountB};
+
+  return vec;
+}
diff --git a/sgl-kernel/csrc/spatial/greenctx_stream.h b/sgl-kernel/csrc/spatial/greenctx_stream.h
new file mode 100644
index 000000000..2577e9f29
--- /dev/null
+++ b/sgl-kernel/csrc/spatial/greenctx_stream.h
@@ -0,0 +1,2 @@
+#include <vector>
+std::vector<int64_t> create_greenctx_stream_by_value(int64_t smA, int64_t smB, int64_t device);
diff --git a/sgl-kernel/csrc/spatial_extension.cc b/sgl-kernel/csrc/spatial_extension.cc
new file mode 100644
index 000000000..27833cd70
--- /dev/null
+++ b/sgl-kernel/csrc/spatial_extension.cc
@@ -0,0 +1,29 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <torch/all.h>
+#include <torch/library.h>
+
+#include "sgl_kernel_ops.h"
+
+TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
+  /*
+   * From csrc/spatial
+   */
+  m.def("create_greenctx_stream_by_value(int smA, int smB, int device) -> int[]");
+  m.impl("create_greenctx_stream_by_value", &create_greenctx_stream_by_value);
+}
+
+REGISTER_EXTENSION(spatial_ops)
diff --git a/sgl-kernel/csrc/speculative/eagle_utils.cu b/sgl-kernel/csrc/speculative/eagle_utils.cu
new file mode 100644
index 000000000..7bf5db274
--- /dev/null
+++ b/sgl-kernel/csrc/speculative/eagle_utils.cu
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2025 by SGLang team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#ifndef USE_ROCM
+#include "pytorch_extension_utils.h"
+#else
+#include "pytorch_extension_utils_rocm.h"
+#endif
+
+typedef enum { FULL_MASK = 0, QLEN_ONLY = 1, QLEN_ONLY_BITPACKING = 2 } TreeMaskMode;
+
+// parent_list [bs, topk * (depth - 1) + 1)]
+// selected_index [bs, draft_token_num - 1]
+// verified_seq_len [bs]
+// tree_mask [draft_token*(seq_len[0]+draft_token) | draft_token*(seq_len[1]+draft_token) | ..] =
+// [sum(verified_seq_len)*draft_token+bs*draft_token*draft_token] positions [bs * draft_token] retrive_index [b,
+// draft_token] retrive_next_token [b, draft_token] retrive_next_sibling [b, draft_token]
+__global__ void build_tree_efficient(
+    int64_t* parent_list,
+    int64_t* selected_index,
+    int64_t* verified_seq_len,
+    bool* tree_mask,
+    int64_t* positions,
+    int64_t* retrive_index,
+    int64_t* retrive_next_token,
+    int64_t* retrive_next_sibling,
+    int topk,
+    int depth,
+    int draft_token_num,
+    int tree_mask_mode) {
+  int bid = blockIdx.x;
+  int tid = threadIdx.x;
+
+  if (tid >= draft_token_num) {
+    return;
+  }
+  int seq_tree_idx = draft_token_num * draft_token_num * bid;
+  for (int i = 0; i < bid; i++) {
+    seq_tree_idx += verified_seq_len[i] * draft_token_num;
+  }
+  int seq_len = verified_seq_len[bid];
+  int token_tree_idx;
+  if (tree_mask_mode == FULL_MASK) {
+    token_tree_idx = seq_tree_idx + (seq_len + draft_token_num) * tid + seq_len + 1;
+  } else {
+    token_tree_idx = draft_token_num * draft_token_num * bid + draft_token_num * tid + 1;
+  }
+  tree_mask[token_tree_idx - 1] = true;
+  for (int i = 0; i < draft_token_num - 1; i++) {
+    tree_mask[token_tree_idx + i] = false;
+  }
+
+  int position = 0;
+  if (tid == 0) {
+    positions[bid * draft_token_num] = seq_len;
+
+    int retrive_index_offset = bid * draft_token_num;
+    for (int i = draft_token_num - 1; i > 0; --i) {
+      int current_token_idx = retrive_index_offset + i;
+      retrive_index[bid * draft_token_num + i] = current_token_idx;
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + i - 1] / topk;
+      int parent_position = 0;
+      if (parent_tb_idx > 0) {
+        int parent_token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+        for (; parent_position < draft_token_num; ++parent_position) {
+          if (selected_index[bid * (draft_token_num - 1) + parent_position] == parent_token_idx) {
+            ++parent_position;
+            break;
+          }
+        }
+      }
+      if (parent_position == draft_token_num) {
+        printf(
+            "WARNING: invalid eagle tree!!! Detected a token with no parent token selected. "
+            "Please check if the logprob has nan. The token will be ignored to keep proceeding.\n");
+        continue;
+      }
+
+      if (retrive_next_token[bid * draft_token_num + parent_position] == -1) {
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+      } else {
+        int origin_next_token = retrive_next_token[bid * draft_token_num + parent_position];
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+        retrive_next_sibling[bid * draft_token_num + i] = origin_next_token;
+      }
+    }
+    retrive_index[bid * draft_token_num] = bid * draft_token_num;
+  } else {
+    int cur_position = tid - 1;
+    while (true) {
+      position += 1;
+      tree_mask[token_tree_idx + cur_position] = true;
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + cur_position] / topk;
+      if (parent_tb_idx == 0) {
+        break;
+      }
+
+      int token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+      for (cur_position = 0; cur_position < draft_token_num; ++cur_position) {
+        if (selected_index[bid * (draft_token_num - 1) + cur_position] == token_idx) {
+          break;
+        }
+      }
+    }
+    positions[bid * draft_token_num + tid] = position + seq_len;
+  }
+}
+
+// parent_list [bs, topk * (depth - 1) + 1)]
+// selected_index [bs, draft_token_num - 1]
+// verified_seq_len [bs]
+// tree_mask: [draft_token*num_bytes_per_item | .. ] = [bs*draft_token*num_bytes_per_item]
+// positions [bs * draft_token]
+// retrive_index [bs, draft_token]
+// retrive_next_token [bs, draft_token]
+// retrive_next_sibling [bs, draft_token]
+__global__ void build_tree_efficient_partial_packed(
+    int64_t* parent_list,
+    int64_t* selected_index,
+    int64_t* verified_seq_len,
+    uint8_t* tree_mask,
+    int64_t* positions,
+    int64_t* retrive_index,
+    int64_t* retrive_next_token,
+    int64_t* retrive_next_sibling,
+    int topk,
+    int depth,
+    int draft_token_num,
+    size_t num_bytes_per_item) {
+  int bid = blockIdx.x;
+  int tid = threadIdx.x;
+
+  if (tid >= draft_token_num) {
+    return;
+  }
+  int seq_len = verified_seq_len[bid];
+  int token_tree_idx = (bid * draft_token_num + tid) * num_bytes_per_item;
+  tree_mask[token_tree_idx] = 1;  // little endian
+
+  int position = 0;
+  if (tid == 0) {
+    positions[bid * draft_token_num] = seq_len;
+
+    int retrive_index_offset = bid * draft_token_num;
+    for (int i = draft_token_num - 1; i > 0; --i) {
+      int current_token_idx = retrive_index_offset + i;
+      retrive_index[bid * draft_token_num + i] = current_token_idx;
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + i - 1] / topk;
+      int parent_position = 0;
+      if (parent_tb_idx > 0) {
+        int parent_token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+        for (; parent_position < draft_token_num; ++parent_position) {
+          if (selected_index[bid * (draft_token_num - 1) + parent_position] == parent_token_idx) {
+            ++parent_position;
+            break;
+          }
+        }
+      }
+      if (parent_position == draft_token_num) {
+        printf(
+            "WARNING: invalid eagle tree!!! Detected a token with no parent token selected. "
+            "Please check if the logprob has nan. The token will be ignored to keep proceeding.\n");
+        continue;
+      }
+
+      if (retrive_next_token[bid * draft_token_num + parent_position] == -1) {
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+      } else {
+        int origin_next_token = retrive_next_token[bid * draft_token_num + parent_position];
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+        retrive_next_sibling[bid * draft_token_num + i] = origin_next_token;
+      }
+    }
+    retrive_index[bid * draft_token_num] = bid * draft_token_num;
+  } else {
+    int cur_position = tid - 1;
+    while (true) {
+      position += 1;
+      int byte_idx = (cur_position + 1) / 8;
+      int bit_idx = (cur_position + 1) % 8;
+      tree_mask[token_tree_idx + byte_idx] |= (1 << bit_idx);
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + cur_position] / topk;
+      if (parent_tb_idx == 0) {
+        break;
+      }
+
+      int token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+      for (cur_position = 0; cur_position < draft_token_num; ++cur_position) {
+        if (selected_index[bid * (draft_token_num - 1) + cur_position] == token_idx) {
+          break;
+        }
+      }
+    }
+    positions[bid * draft_token_num + tid] = position + seq_len;
+  }
+}
+
+void build_tree_kernel_efficient(
+    at::Tensor parent_list,
+    at::Tensor selected_index,
+    at::Tensor verified_seq_len,
+    at::Tensor tree_mask,
+    at::Tensor positions,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    int64_t topk,
+    int64_t depth,
+    int64_t draft_token_num,
+    int64_t tree_mask_mode) {
+  // TODO (ying) check shape
+  // TODO (ying) check type
+  int bs = parent_list.size(0);
+  dim3 grid(bs);
+  dim3 block(draft_token_num);
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  if (tree_mask_mode == QLEN_ONLY_BITPACKING) {
+    size_t num_bytes_per_item = 1;
+    if (draft_token_num > 16) {
+      num_bytes_per_item = 4;
+    } else if (draft_token_num > 8) {
+      num_bytes_per_item = 2;
+    }
+    build_tree_efficient_partial_packed<<<grid, block, 0, stream>>>(
+        static_cast<int64_t*>(parent_list.data_ptr()),
+        static_cast<int64_t*>(selected_index.data_ptr()),
+        static_cast<int64_t*>(verified_seq_len.data_ptr()),
+        static_cast<uint8_t*>(tree_mask.data_ptr()),
+        static_cast<int64_t*>(positions.data_ptr()),
+        static_cast<int64_t*>(retrive_index.data_ptr()),
+        static_cast<int64_t*>(retrive_next_token.data_ptr()),
+        static_cast<int64_t*>(retrive_next_sibling.data_ptr()),
+        int32_t(topk),
+        int32_t(depth),
+        int32_t(draft_token_num),
+        num_bytes_per_item);
+  } else {
+    build_tree_efficient<<<grid, block, 0, stream>>>(
+        static_cast<int64_t*>(parent_list.data_ptr()),
+        static_cast<int64_t*>(selected_index.data_ptr()),
+        static_cast<int64_t*>(verified_seq_len.data_ptr()),
+        static_cast<bool*>(tree_mask.data_ptr()),
+        static_cast<int64_t*>(positions.data_ptr()),
+        static_cast<int64_t*>(retrive_index.data_ptr()),
+        static_cast<int64_t*>(retrive_next_token.data_ptr()),
+        static_cast<int64_t*>(retrive_next_sibling.data_ptr()),
+        int32_t(topk),
+        int32_t(depth),
+        int32_t(draft_token_num),
+        int32_t(tree_mask_mode));
+  }
+}
+
+template <typename IdType, typename IdType2>
+__global__ void VerifyTreeGreedy(
+    IdType* predicts,
+    IdType* accept_index,
+    IdType* accept_token_num,  // mutable
+    IdType2* candidates,
+    IdType2* retrive_index,
+    IdType2* retrive_next_token,
+    IdType2* retrive_next_sibling,
+    IdType2* target_predict,
+    uint32_t batch_size,
+    uint32_t num_speculative_tokens,
+    uint32_t num_draft_tokens) {
+  uint32_t bx = blockIdx.x;
+
+  IdType2 last_accepted_retrive_idx = retrive_index[bx * num_draft_tokens];
+  accept_index[bx * num_speculative_tokens] = last_accepted_retrive_idx;
+  uint32_t num_accepted_tokens = 0;
+  IdType2 cur_index = 0;
+
+  for (uint32_t j = 1; j < num_speculative_tokens; ++j) {
+    cur_index = retrive_next_token[bx * num_draft_tokens + cur_index];
+    while (cur_index != -1) {
+      IdType2 draft_index = retrive_index[bx * num_draft_tokens + cur_index];
+      IdType2 draft_token_id = candidates[bx * num_draft_tokens + cur_index];
+      IdType2 target_token_id = target_predict[last_accepted_retrive_idx];
+
+      if (draft_token_id == target_token_id) {
+        // accept token
+        predicts[last_accepted_retrive_idx] = target_token_id;
+        ++num_accepted_tokens;
+        accept_index[bx * num_speculative_tokens + num_accepted_tokens] = draft_index;
+        last_accepted_retrive_idx = draft_index;
+        break;
+      } else {
+        cur_index = retrive_next_sibling[bx * num_draft_tokens + cur_index];
+      }
+    }
+    if (cur_index == -1) break;
+  }
+  accept_token_num[bx] = num_accepted_tokens;
+  predicts[last_accepted_retrive_idx] = target_predict[last_accepted_retrive_idx];
+}
+
+// predicts: [tot_num_draft_tokens]
+// accept_index: [bs, num_spec_step]
+// accept_token_num: [bs]
+// candidates: [bs, num_draft_tokens]
+// retrive_index: [bs, num_draft_tokens]
+// retrive_next_token: [bs, num_draft_tokens]
+// retrive_next_sibling: [bs, num_draft_tokens]
+// target_predict: [bs, num_draft_tokens]
+void verify_tree_greedy(
+    at::Tensor predicts,
+    at::Tensor accept_index,
+    at::Tensor accept_token_num,  // mutable
+    at::Tensor candidates,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    at::Tensor target_predict,
+    int64_t cuda_stream = 0) {
+  CHECK_INPUT(candidates);
+  CHECK_INPUT(retrive_index);
+  CHECK_INPUT(retrive_next_token);
+  CHECK_INPUT(retrive_next_sibling);
+  CHECK_INPUT(target_predict);
+  auto device = target_predict.device();
+  CHECK_EQ(candidates.device(), device);
+  CHECK_EQ(retrive_index.device(), device);
+  CHECK_EQ(retrive_next_token.device(), device);
+  CHECK_EQ(retrive_next_sibling.device(), device);
+  CHECK_EQ(target_predict.device(), device);
+  CHECK_DIM(1, predicts);
+  CHECK_DIM(2, accept_index);
+  CHECK_DIM(1, accept_token_num);
+  CHECK_DIM(2, candidates);
+  CHECK_DIM(2, retrive_index);
+  CHECK_DIM(2, retrive_next_token);
+  CHECK_DIM(2, retrive_next_sibling);
+  CHECK_DIM(2, target_predict);
+  unsigned int batch_size = candidates.size(0);
+  unsigned int num_spec_step = accept_index.size(1);
+  unsigned int num_draft_tokens = candidates.size(1);
+  CHECK_EQ(batch_size, accept_index.size(0));
+  CHECK_EQ(batch_size, accept_token_num.size(0));
+  CHECK_EQ(batch_size, retrive_index.size(0));
+  CHECK_EQ(batch_size, retrive_next_token.size(0));
+  CHECK_EQ(batch_size, retrive_next_sibling.size(0));
+  CHECK_EQ(batch_size, target_predict.size(0));
+  CHECK_EQ(num_draft_tokens, retrive_index.size(1));
+  CHECK_EQ(num_draft_tokens, retrive_next_token.size(1));
+  CHECK_EQ(num_draft_tokens, retrive_next_sibling.size(1));
+  CHECK_EQ(num_draft_tokens, target_predict.size(1));
+  CHECK_EQ(batch_size, accept_index.size(0));
+  CHECK_EQ(batch_size, accept_token_num.size(0));
+  if (predicts.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'predicts' to be of type int (torch.int32).");
+  }
+  if (accept_index.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'accept_index' to be of type int (torch.int32).");
+  }
+  if (accept_token_num.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'accept_token_num' to be of type int (torch.int32).");
+  }
+  if (candidates.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'candidates' to be of type long (torch.int64).");
+  }
+  if (retrive_index.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_index' to be of type long (torch.int64).");
+  }
+  if (retrive_next_token.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_next_token' to be of type long (torch.int64).");
+  }
+  if (retrive_next_sibling.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_next_sibling' to be of type long (torch.int64).");
+  }
+  if (target_predict.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'target_predict' to be of type long (torch.int64).");
+  }
+
+  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
+  dim3 grid(batch_size);
+  dim3 block(1);
+
+  VerifyTreeGreedy<int32_t, int64_t><<<grid, block, 0, stream>>>(
+      static_cast<int32_t*>(predicts.data_ptr()),
+      static_cast<int32_t*>(accept_index.data_ptr()),
+      static_cast<int32_t*>(accept_token_num.data_ptr()),
+      static_cast<int64_t*>(candidates.data_ptr()),
+      static_cast<int64_t*>(retrive_index.data_ptr()),
+      static_cast<int64_t*>(retrive_next_token.data_ptr()),
+      static_cast<int64_t*>(retrive_next_sibling.data_ptr()),
+      static_cast<int64_t*>(target_predict.data_ptr()),
+      batch_size,
+      num_spec_step,
+      num_draft_tokens);
+}
diff --git a/sgl-kernel/csrc/speculative/eagle_utils.hip b/sgl-kernel/csrc/speculative/eagle_utils.hip
new file mode 100644
index 000000000..4903f78c4
--- /dev/null
+++ b/sgl-kernel/csrc/speculative/eagle_utils.hip
@@ -0,0 +1,411 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+/*
+ * Copyright (c) 2025 by SGLang team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/hip/HIPContext.h>
+
+#ifndef USE_ROCM
+#include "pytorch_extension_utils.h"
+#else
+#include "pytorch_extension_utils_rocm.h"
+#endif
+
+typedef enum { FULL_MASK = 0, QLEN_ONLY = 1, QLEN_ONLY_BITPACKING = 2 } TreeMaskMode;
+
+// parent_list [bs, topk * (depth - 1) + 1)]
+// selected_index [bs, draft_token_num - 1]
+// verified_seq_len [bs]
+// tree_mask [draft_token*(seq_len[0]+draft_token) | draft_token*(seq_len[1]+draft_token) | ..] =
+// [sum(verified_seq_len)*draft_token+bs*draft_token*draft_token] positions [bs * draft_token] retrive_index [b,
+// draft_token] retrive_next_token [b, draft_token] retrive_next_sibling [b, draft_token]
+__global__ void build_tree_efficient(
+    int64_t* parent_list,
+    int64_t* selected_index,
+    int64_t* verified_seq_len,
+    bool* tree_mask,
+    int64_t* positions,
+    int64_t* retrive_index,
+    int64_t* retrive_next_token,
+    int64_t* retrive_next_sibling,
+    int topk,
+    int depth,
+    int draft_token_num,
+    int tree_mask_mode) {
+  int bid = blockIdx.x;
+  int tid = threadIdx.x;
+
+  if (tid >= draft_token_num) {
+    return;
+  }
+  int seq_tree_idx = draft_token_num * draft_token_num * bid;
+  for (int i = 0; i < bid; i++) {
+    seq_tree_idx += verified_seq_len[i] * draft_token_num;
+  }
+  int seq_len = verified_seq_len[bid];
+  int token_tree_idx;
+  if (tree_mask_mode == FULL_MASK) {
+    token_tree_idx = seq_tree_idx + (seq_len + draft_token_num) * tid + seq_len + 1;
+  } else {
+    token_tree_idx = draft_token_num * draft_token_num * bid + draft_token_num * tid + 1;
+  }
+  tree_mask[token_tree_idx - 1] = true;
+  for (int i = 0; i < draft_token_num - 1; i++) {
+    tree_mask[token_tree_idx + i] = false;
+  }
+
+  int position = 0;
+  if (tid == 0) {
+    positions[bid * draft_token_num] = seq_len;
+
+    int retrive_index_offset = bid * draft_token_num;
+    for (int i = draft_token_num - 1; i > 0; --i) {
+      int current_token_idx = retrive_index_offset + i;
+      retrive_index[bid * draft_token_num + i] = current_token_idx;
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + i - 1] / topk;
+      int parent_position = 0;
+      if (parent_tb_idx > 0) {
+        int parent_token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+        for (; parent_position < draft_token_num; ++parent_position) {
+          if (selected_index[bid * (draft_token_num - 1) + parent_position] == parent_token_idx) {
+            ++parent_position;
+            break;
+          }
+        }
+      }
+      if (parent_position == draft_token_num) {
+        printf(
+            "WARNING: invalid eagle tree!!! Detected a token with no parent token selected. "
+            "Please check if the logprob has nan. The token will be ignored to keep proceeding.\n");
+        continue;
+      }
+
+      if (retrive_next_token[bid * draft_token_num + parent_position] == -1) {
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+      } else {
+        int origin_next_token = retrive_next_token[bid * draft_token_num + parent_position];
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+        retrive_next_sibling[bid * draft_token_num + i] = origin_next_token;
+      }
+    }
+    retrive_index[bid * draft_token_num] = bid * draft_token_num;
+  } else {
+    int cur_position = tid - 1;
+    while (true) {
+      position += 1;
+      tree_mask[token_tree_idx + cur_position] = true;
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + cur_position] / topk;
+      if (parent_tb_idx == 0) {
+        break;
+      }
+
+      int token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+      for (cur_position = 0; cur_position < draft_token_num; ++cur_position) {
+        if (selected_index[bid * (draft_token_num - 1) + cur_position] == token_idx) {
+          break;
+        }
+      }
+    }
+    positions[bid * draft_token_num + tid] = position + seq_len;
+  }
+}
+
+// parent_list [bs, topk * (depth - 1) + 1)]
+// selected_index [bs, draft_token_num - 1]
+// verified_seq_len [bs]
+// tree_mask: [draft_token*num_bytes_per_item | .. ] = [bs*draft_token*num_bytes_per_item]
+// positions [bs * draft_token]
+// retrive_index [bs, draft_token]
+// retrive_next_token [bs, draft_token]
+// retrive_next_sibling [bs, draft_token]
+__global__ void build_tree_efficient_partial_packed(
+    int64_t* parent_list,
+    int64_t* selected_index,
+    int64_t* verified_seq_len,
+    uint8_t* tree_mask,
+    int64_t* positions,
+    int64_t* retrive_index,
+    int64_t* retrive_next_token,
+    int64_t* retrive_next_sibling,
+    int topk,
+    int depth,
+    int draft_token_num,
+    size_t num_bytes_per_item) {
+  int bid = blockIdx.x;
+  int tid = threadIdx.x;
+
+  if (tid >= draft_token_num) {
+    return;
+  }
+  int seq_len = verified_seq_len[bid];
+  int token_tree_idx = (bid * draft_token_num + tid) * num_bytes_per_item;
+  tree_mask[token_tree_idx] = 1;  // little endian
+
+  int position = 0;
+  if (tid == 0) {
+    positions[bid * draft_token_num] = seq_len;
+
+    int retrive_index_offset = bid * draft_token_num;
+    for (int i = draft_token_num - 1; i > 0; --i) {
+      int current_token_idx = retrive_index_offset + i;
+      retrive_index[bid * draft_token_num + i] = current_token_idx;
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + i - 1] / topk;
+      int parent_position = 0;
+      if (parent_tb_idx > 0) {
+        int parent_token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+        for (; parent_position < draft_token_num; ++parent_position) {
+          if (selected_index[bid * (draft_token_num - 1) + parent_position] == parent_token_idx) {
+            ++parent_position;
+            break;
+          }
+        }
+      }
+      if (parent_position == draft_token_num) {
+        printf(
+            "WARNING: invalid eagle tree!!! Detected a token with no parent token selected. "
+            "Please check if the logprob has nan. The token will be ignored to keep proceeding.\n");
+        continue;
+      }
+
+      if (retrive_next_token[bid * draft_token_num + parent_position] == -1) {
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+      } else {
+        int origin_next_token = retrive_next_token[bid * draft_token_num + parent_position];
+        retrive_next_token[bid * draft_token_num + parent_position] = i;
+        retrive_next_sibling[bid * draft_token_num + i] = origin_next_token;
+      }
+    }
+    retrive_index[bid * draft_token_num] = bid * draft_token_num;
+  } else {
+    int cur_position = tid - 1;
+    while (true) {
+      position += 1;
+      int byte_idx = (cur_position + 1) / 8;
+      int bit_idx = (cur_position + 1) % 8;
+      tree_mask[token_tree_idx + byte_idx] |= (1 << bit_idx);
+      int parent_tb_idx = selected_index[bid * (draft_token_num - 1) + cur_position] / topk;
+      if (parent_tb_idx == 0) {
+        break;
+      }
+
+      int token_idx = parent_list[bid * (topk * (depth - 1) + 1) + parent_tb_idx];
+      for (cur_position = 0; cur_position < draft_token_num; ++cur_position) {
+        if (selected_index[bid * (draft_token_num - 1) + cur_position] == token_idx) {
+          break;
+        }
+      }
+    }
+    positions[bid * draft_token_num + tid] = position + seq_len;
+  }
+}
+
+void build_tree_kernel_efficient(
+    at::Tensor parent_list,
+    at::Tensor selected_index,
+    at::Tensor verified_seq_len,
+    at::Tensor tree_mask,
+    at::Tensor positions,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    int64_t topk,
+    int64_t depth,
+    int64_t draft_token_num,
+    int64_t tree_mask_mode) {
+  // TODO (ying) check shape
+  // TODO (ying) check type
+  int bs = parent_list.size(0);
+  dim3 grid(bs);
+  dim3 block(draft_token_num);
+  const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+
+  if (tree_mask_mode == QLEN_ONLY_BITPACKING) {
+    size_t num_bytes_per_item = 1;
+    if (draft_token_num > 16) {
+      num_bytes_per_item = 4;
+    } else if (draft_token_num > 8) {
+      num_bytes_per_item = 2;
+    }
+   hipLaunchKernelGGL(( build_tree_efficient_partial_packed), dim3(grid), dim3(block), 0, stream, 
+        static_cast<int64_t*>(parent_list.data_ptr()),
+        static_cast<int64_t*>(selected_index.data_ptr()),
+        static_cast<int64_t*>(verified_seq_len.data_ptr()),
+        static_cast<uint8_t*>(tree_mask.data_ptr()),
+        static_cast<int64_t*>(positions.data_ptr()),
+        static_cast<int64_t*>(retrive_index.data_ptr()),
+        static_cast<int64_t*>(retrive_next_token.data_ptr()),
+        static_cast<int64_t*>(retrive_next_sibling.data_ptr()),
+        int32_t(topk),
+        int32_t(depth),
+        int32_t(draft_token_num),
+        num_bytes_per_item);
+  } else {
+   hipLaunchKernelGGL(( build_tree_efficient), dim3(grid), dim3(block), 0, stream, 
+        static_cast<int64_t*>(parent_list.data_ptr()),
+        static_cast<int64_t*>(selected_index.data_ptr()),
+        static_cast<int64_t*>(verified_seq_len.data_ptr()),
+        static_cast<bool*>(tree_mask.data_ptr()),
+        static_cast<int64_t*>(positions.data_ptr()),
+        static_cast<int64_t*>(retrive_index.data_ptr()),
+        static_cast<int64_t*>(retrive_next_token.data_ptr()),
+        static_cast<int64_t*>(retrive_next_sibling.data_ptr()),
+        int32_t(topk),
+        int32_t(depth),
+        int32_t(draft_token_num),
+        int32_t(tree_mask_mode));
+  }
+}
+
+template <typename IdType, typename IdType2>
+__global__ void VerifyTreeGreedy(
+    IdType* predicts,
+    IdType* accept_index,
+    IdType* accept_token_num,  // mutable
+    IdType2* candidates,
+    IdType2* retrive_index,
+    IdType2* retrive_next_token,
+    IdType2* retrive_next_sibling,
+    IdType2* target_predict,
+    uint32_t batch_size,
+    uint32_t num_speculative_tokens,
+    uint32_t num_draft_tokens) {
+  uint32_t bx = blockIdx.x;
+
+  IdType2 last_accepted_retrive_idx = retrive_index[bx * num_draft_tokens];
+  accept_index[bx * num_speculative_tokens] = last_accepted_retrive_idx;
+  uint32_t num_accepted_tokens = 0;
+  IdType2 cur_index = 0;
+
+  for (uint32_t j = 1; j < num_speculative_tokens; ++j) {
+    cur_index = retrive_next_token[bx * num_draft_tokens + cur_index];
+    while (cur_index != -1) {
+      IdType2 draft_index = retrive_index[bx * num_draft_tokens + cur_index];
+      IdType2 draft_token_id = candidates[bx * num_draft_tokens + cur_index];
+      IdType2 target_token_id = target_predict[last_accepted_retrive_idx];
+
+      if (draft_token_id == target_token_id) {
+        // accept token
+        predicts[last_accepted_retrive_idx] = target_token_id;
+        ++num_accepted_tokens;
+        accept_index[bx * num_speculative_tokens + num_accepted_tokens] = draft_index;
+        last_accepted_retrive_idx = draft_index;
+        break;
+      } else {
+        cur_index = retrive_next_sibling[bx * num_draft_tokens + cur_index];
+      }
+    }
+    if (cur_index == -1) break;
+  }
+  accept_token_num[bx] = num_accepted_tokens;
+  predicts[last_accepted_retrive_idx] = target_predict[last_accepted_retrive_idx];
+}
+
+// predicts: [tot_num_draft_tokens]
+// accept_index: [bs, num_spec_step]
+// accept_token_num: [bs]
+// candidates: [bs, num_draft_tokens]
+// retrive_index: [bs, num_draft_tokens]
+// retrive_next_token: [bs, num_draft_tokens]
+// retrive_next_sibling: [bs, num_draft_tokens]
+// target_predict: [bs, num_draft_tokens]
+void verify_tree_greedy(
+    at::Tensor predicts,
+    at::Tensor accept_index,
+    at::Tensor accept_token_num,  // mutable
+    at::Tensor candidates,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    at::Tensor target_predict,
+    int64_t cuda_stream = 0) {
+  CHECK_INPUT(candidates);
+  CHECK_INPUT(retrive_index);
+  CHECK_INPUT(retrive_next_token);
+  CHECK_INPUT(retrive_next_sibling);
+  CHECK_INPUT(target_predict);
+  auto device = target_predict.device();
+  CHECK_EQ(candidates.device(), device);
+  CHECK_EQ(retrive_index.device(), device);
+  CHECK_EQ(retrive_next_token.device(), device);
+  CHECK_EQ(retrive_next_sibling.device(), device);
+  CHECK_EQ(target_predict.device(), device);
+  CHECK_DIM(1, predicts);
+  CHECK_DIM(2, accept_index);
+  CHECK_DIM(1, accept_token_num);
+  CHECK_DIM(2, candidates);
+  CHECK_DIM(2, retrive_index);
+  CHECK_DIM(2, retrive_next_token);
+  CHECK_DIM(2, retrive_next_sibling);
+  CHECK_DIM(2, target_predict);
+  unsigned int batch_size = candidates.size(0);
+  unsigned int num_spec_step = accept_index.size(1);
+  unsigned int num_draft_tokens = candidates.size(1);
+  CHECK_EQ(batch_size, accept_index.size(0));
+  CHECK_EQ(batch_size, accept_token_num.size(0));
+  CHECK_EQ(batch_size, retrive_index.size(0));
+  CHECK_EQ(batch_size, retrive_next_token.size(0));
+  CHECK_EQ(batch_size, retrive_next_sibling.size(0));
+  CHECK_EQ(batch_size, target_predict.size(0));
+  CHECK_EQ(num_draft_tokens, retrive_index.size(1));
+  CHECK_EQ(num_draft_tokens, retrive_next_token.size(1));
+  CHECK_EQ(num_draft_tokens, retrive_next_sibling.size(1));
+  CHECK_EQ(num_draft_tokens, target_predict.size(1));
+  CHECK_EQ(batch_size, accept_index.size(0));
+  CHECK_EQ(batch_size, accept_token_num.size(0));
+  if (predicts.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'predicts' to be of type int (torch.int32).");
+  }
+  if (accept_index.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'accept_index' to be of type int (torch.int32).");
+  }
+  if (accept_token_num.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'accept_token_num' to be of type int (torch.int32).");
+  }
+  if (candidates.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'candidates' to be of type long (torch.int64).");
+  }
+  if (retrive_index.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_index' to be of type long (torch.int64).");
+  }
+  if (retrive_next_token.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_next_token' to be of type long (torch.int64).");
+  }
+  if (retrive_next_sibling.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_next_sibling' to be of type long (torch.int64).");
+  }
+  if (target_predict.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'target_predict' to be of type long (torch.int64).");
+  }
+
+  hipStream_t stream = reinterpret_cast<hipStream_t>(cuda_stream);
+  dim3 grid(batch_size);
+  dim3 block(1);
+
+ hipLaunchKernelGGL(( VerifyTreeGreedy<int32_t, int64_t>), dim3(grid), dim3(block), 0, stream, 
+      static_cast<int32_t*>(predicts.data_ptr()),
+      static_cast<int32_t*>(accept_index.data_ptr()),
+      static_cast<int32_t*>(accept_token_num.data_ptr()),
+      static_cast<int64_t*>(candidates.data_ptr()),
+      static_cast<int64_t*>(retrive_index.data_ptr()),
+      static_cast<int64_t*>(retrive_next_token.data_ptr()),
+      static_cast<int64_t*>(retrive_next_sibling.data_ptr()),
+      static_cast<int64_t*>(target_predict.data_ptr()),
+      batch_size,
+      num_spec_step,
+      num_draft_tokens);
+}
diff --git a/sgl-kernel/csrc/speculative/packbit.cu b/sgl-kernel/csrc/speculative/packbit.cu
new file mode 100644
index 000000000..1decc3ded
--- /dev/null
+++ b/sgl-kernel/csrc/speculative/packbit.cu
@@ -0,0 +1,51 @@
+// This is only a plugin used for flashinfer 0.1.6. The new version does not need it.
+/*
+ * Copyright (c) 2025 by SGLang team.
+ * Copyright (c) 2025 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <flashinfer/quantization.cuh>
+
+#include "pytorch_extension_utils.h"
+
+using namespace flashinfer;
+
+// bitorder = "little"
+void segment_packbits(
+    at::Tensor x,
+    at::Tensor input_indptr,
+    at::Tensor output_indptr,
+    at::Tensor y,
+    int64_t batch_size,
+    int64_t cuda_stream) {
+  CHECK_INPUT(x);
+  CHECK_INPUT(input_indptr);
+  CHECK_INPUT(output_indptr);
+  auto device = x.device();
+  CHECK_EQ(input_indptr.device(), device);
+  CHECK_EQ(output_indptr.device(), device);
+  CHECK_EQ(y.device(), device);
+  CHECK_GE(output_indptr.size(0), batch_size + 1);
+
+  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
+  cudaError_t status = quantization::SegmentPackBits(
+      static_cast<bool*>(x.data_ptr()),
+      static_cast<uint8_t*>(y.data_ptr()),
+      static_cast<int32_t*>(input_indptr.data_ptr()),
+      static_cast<int32_t*>(output_indptr.data_ptr()),
+      batch_size,
+      quantization::BitOrder::kLittle,
+      stream);
+}
diff --git a/sgl-kernel/csrc/speculative/speculative_sampling.cu b/sgl-kernel/csrc/speculative/speculative_sampling.cu
new file mode 100644
index 000000000..ca545e99e
--- /dev/null
+++ b/sgl-kernel/csrc/speculative/speculative_sampling.cu
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2025 by SGLang team.
+ * Copyright (c) 2025 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "pytorch_extension_utils.h"
+#include "speculative_sampling.cuh"
+
+using namespace flashinfer;
+
+// predicts: [tot_num_draft_tokens]
+// accept_index: [bs, num_spec_step]
+// accept_token_num: [bs]
+// candidates: [bs, num_draft_tokens]
+// retrive_index: [bs, num_draft_tokens]
+// retrive_next_token: [bs, num_draft_tokens]
+// retrive_next_sibling: [bs, num_draft_tokens]
+// uniform_samples: [bs, num_draft_tokens]
+// target_probs: [bs, num_draft_tokens, vocab_size]
+void tree_speculative_sampling_target_only(
+    at::Tensor predicts,
+    at::Tensor accept_index,
+    at::Tensor accept_token_num,  // mutable
+    at::Tensor candidates,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    at::Tensor uniform_samples,
+    at::Tensor uniform_samples_for_final_sampling,
+    at::Tensor target_probs,
+    at::Tensor draft_probs,
+    double threshold_single,
+    double threshold_acc,
+    bool deterministic = true,
+    int64_t cuda_stream = 0) {
+  CHECK_INPUT(candidates);
+  CHECK_INPUT(retrive_index);
+  CHECK_INPUT(retrive_next_token);
+  CHECK_INPUT(retrive_next_sibling);
+  CHECK_INPUT(uniform_samples);
+  CHECK_INPUT(uniform_samples_for_final_sampling);
+  CHECK_INPUT(target_probs);
+  auto device = target_probs.device();
+  CHECK_EQ(candidates.device(), device);
+  CHECK_EQ(retrive_index.device(), device);
+  CHECK_EQ(retrive_next_token.device(), device);
+  CHECK_EQ(retrive_next_sibling.device(), device);
+  CHECK_EQ(uniform_samples.device(), device);
+  CHECK_EQ(uniform_samples_for_final_sampling.device(), device);
+  CHECK_EQ(target_probs.device(), device);
+  CHECK_DIM(1, predicts);
+  CHECK_DIM(2, accept_index);
+  CHECK_DIM(1, accept_token_num);
+  CHECK_DIM(2, candidates);
+  CHECK_DIM(2, retrive_index);
+  CHECK_DIM(2, retrive_next_token);
+  CHECK_DIM(2, retrive_next_sibling);
+  CHECK_DIM(2, uniform_samples);
+  CHECK_DIM(3, target_probs);
+  CHECK_DIM(3, draft_probs);
+  unsigned int batch_size = uniform_samples.size(0);
+  unsigned int num_spec_step = accept_index.size(1);
+  unsigned int num_draft_tokens = candidates.size(1);
+  unsigned int vocab_size = target_probs.size(2);
+  CHECK_EQ(batch_size, candidates.size(0));
+  CHECK_EQ(batch_size, retrive_index.size(0));
+  CHECK_EQ(batch_size, retrive_next_token.size(0));
+  CHECK_EQ(batch_size, retrive_next_sibling.size(0));
+  CHECK_EQ(batch_size, target_probs.size(0));
+  CHECK_EQ(num_draft_tokens, retrive_index.size(1));
+  CHECK_EQ(num_draft_tokens, retrive_next_token.size(1));
+  CHECK_EQ(num_draft_tokens, retrive_next_sibling.size(1));
+  CHECK_EQ(num_draft_tokens, uniform_samples.size(1));
+  CHECK_EQ(num_draft_tokens, target_probs.size(1));
+  CHECK_EQ(vocab_size, target_probs.size(2));
+  CHECK_EQ(batch_size, accept_index.size(0));
+  CHECK_EQ(batch_size, accept_token_num.size(0));
+  if (predicts.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'predicts' to be of type int (torch.int32).");
+  }
+  if (accept_index.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'accept_index' to be of type int (torch.int32).");
+  }
+  if (accept_token_num.scalar_type() != at::kInt) {
+    throw std::runtime_error("Expected 'accept_token_num' to be of type int (torch.int32).");
+  }
+  if (candidates.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'candidates' to be of type long (torch.int64).");
+  }
+  if (retrive_index.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_index' to be of type long (torch.int64).");
+  }
+  if (retrive_next_token.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_next_token' to be of type long (torch.int64).");
+  }
+  if (retrive_next_sibling.scalar_type() != at::kLong) {
+    throw std::runtime_error("Expected 'retrive_next_sibling' to be of type long (torch.int64).");
+  }
+  if (uniform_samples.scalar_type() != at::kFloat) {
+    throw std::runtime_error("Expected 'uniform_samples' to be of type float (torch.float32).");
+  }
+  if (uniform_samples_for_final_sampling.scalar_type() != at::kFloat) {
+    throw std::runtime_error("Expected 'uniform_samples_for_final_sampling' to be of type float (torch.float32).");
+  }
+  if (target_probs.scalar_type() != at::kFloat) {
+    throw std::runtime_error("Expected 'target_probs' to be of type float (torch.float32).");
+  }
+  if (draft_probs.scalar_type() != at::kFloat) {
+    throw std::runtime_error("Expected 'target_probs' to be of type float (torch.float32).");
+  }
+  CHECK_GE(threshold_single, 0);
+  CHECK_GE(1, threshold_single);
+  CHECK_GE(threshold_acc, 0);
+  CHECK_GE(1, threshold_acc);
+
+  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
+  cudaError_t status = sampling::TreeSpeculativeSamplingTargetOnly<float, int32_t, int64_t>(
+      static_cast<int32_t*>(predicts.data_ptr()),
+      static_cast<int32_t*>(accept_index.data_ptr()),
+      static_cast<int32_t*>(accept_token_num.data_ptr()),
+      static_cast<int64_t*>(candidates.data_ptr()),
+      static_cast<int64_t*>(retrive_index.data_ptr()),
+      static_cast<int64_t*>(retrive_next_token.data_ptr()),
+      static_cast<int64_t*>(retrive_next_sibling.data_ptr()),
+      static_cast<float*>(uniform_samples.data_ptr()),
+      static_cast<float*>(uniform_samples_for_final_sampling.data_ptr()),
+      static_cast<float*>(target_probs.data_ptr()),
+      static_cast<float*>(draft_probs.data_ptr()),
+      batch_size,
+      num_spec_step,
+      num_draft_tokens,
+      vocab_size,
+      static_cast<float>(threshold_single),
+      static_cast<float>(threshold_acc),
+      deterministic,
+      stream);
+
+  TORCH_CHECK(
+      status == cudaSuccess,
+      "TreeSpeculativeSamplingTargetOnly failed with error code " + std::string(cudaGetErrorString(status)));
+}
diff --git a/sgl-kernel/csrc/speculative/speculative_sampling.cuh b/sgl-kernel/csrc/speculative/speculative_sampling.cuh
new file mode 100644
index 000000000..59f18bc2f
--- /dev/null
+++ b/sgl-kernel/csrc/speculative/speculative_sampling.cuh
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2025 by SGLang team.
+ * Copyright (c) 2024-2025 by FlashInfer team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SPECULATIVE_SAMPLING_CUH_
+#define SPECULATIVE_SAMPLING_CUH_
+
+#include <assert.h>
+
+#include <flashinfer/sampling.cuh>
+
+namespace flashinfer {
+
+namespace sampling {
+
+using namespace cub;
+
+template <
+    uint32_t BLOCK_THREADS,
+    BlockScanAlgorithm SCAN_ALGORITHM,
+    BlockReduceAlgorithm REDUCE_ALGORITHM,
+    uint32_t VEC_SIZE,
+    bool DETERMINISTIC,
+    typename DType,
+    typename IdType,
+    typename IdType2>
+__global__ void TreeSpeculativeSamplingTargetOnly(
+    IdType* predicts,          // mutable
+    IdType* accept_index,      // mutable
+    IdType* accept_token_num,  // mutable
+    IdType2* candidates,
+    IdType2* retrive_index,
+    IdType2* retrive_next_token,
+    IdType2* retrive_next_sibling,
+    DType* uniform_samples,
+    DType* uniform_samples_for_final_sampling,
+    DType* target_probs,
+    DType* draft_probs,
+    uint32_t batch_size,
+    uint32_t num_speculative_tokens,
+    uint32_t num_draft_tokens,
+    uint32_t d,
+    DType threshold_single,
+    DType threshold_acc) {
+  const uint32_t bx = blockIdx.x, tx = threadIdx.x;
+
+  extern __shared__ __align__(alignof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>))
+      uint8_t smem_sampling[];
+  auto& temp_storage =
+      reinterpret_cast<SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>&>(smem_sampling);
+
+  DType prob_acc = 0.0;
+  uint32_t cur_prob_offset = bx * num_draft_tokens * d;
+  DType coin = uniform_samples[bx * num_draft_tokens];
+  IdType2 last_accepted_retrive_idx = retrive_index[bx * num_draft_tokens];
+  accept_index[bx * num_speculative_tokens] = last_accepted_retrive_idx;
+  uint32_t num_accepted_tokens = 0;
+  IdType2 cur_index = 0;
+
+  for (uint32_t j = 1; j < num_speculative_tokens; ++j) {
+    cur_index = retrive_next_token[bx * num_draft_tokens + cur_index];
+    while (cur_index != -1) {
+      IdType2 draft_index = retrive_index[bx * num_draft_tokens + cur_index];
+      IdType2 draft_token_id = candidates[bx * num_draft_tokens + cur_index];
+      DType target_prob_single = target_probs[cur_prob_offset + draft_token_id];
+      prob_acc += target_prob_single;
+
+      if (coin <= prob_acc / threshold_acc || target_prob_single >= threshold_single) {
+        // accept token
+        prob_acc = 0.;
+        cur_prob_offset = (bx * num_draft_tokens + cur_index) * d;
+        coin = uniform_samples[bx * num_draft_tokens + cur_index];
+        predicts[last_accepted_retrive_idx] = draft_token_id;
+        ++num_accepted_tokens;
+        accept_index[bx * num_speculative_tokens + num_accepted_tokens] = draft_index;
+        last_accepted_retrive_idx = draft_index;
+        break;
+      } else {
+        // FIXME: leverage draft probs
+        draft_probs[cur_prob_offset + draft_token_id] = target_probs[cur_prob_offset + draft_token_id];
+        cur_index = retrive_next_sibling[bx * num_draft_tokens + cur_index];
+      }
+    }
+    if (cur_index == -1) break;
+  }
+  accept_token_num[bx] = num_accepted_tokens;
+
+  // we need a different coin for the final sampling
+  coin = uniform_samples_for_final_sampling[bx];
+
+  // sample from relu(target_probs - draft_probs)
+  DType sum_relu_q_minus_p(0);
+  vec_t<DType, VEC_SIZE> q_vec, p_vec;
+  DType relu_q_minus_p[VEC_SIZE];
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    q_vec.fill(DType(0));
+    p_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      q_vec.load(target_probs + cur_prob_offset + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+      if (num_accepted_tokens != num_speculative_tokens - 1) {
+        // there is no draft_probs for the bonus token
+        p_vec.load(draft_probs + cur_prob_offset + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+      }
+    }
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      relu_q_minus_p[j] = max(q_vec[j] - p_vec[j], DType(0));
+    }
+    sum_relu_q_minus_p += BlockReduce<DType, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
+                              .Sum<VEC_SIZE>(relu_q_minus_p);
+    __syncthreads();
+  }
+  if (tx == 0) {
+    temp_storage.block_aggregate.value = sum_relu_q_minus_p;
+  }
+  // init the first rejected token to (d - 1)
+  temp_storage.sampled_id = d - 1;
+  __syncthreads();
+  sum_relu_q_minus_p = temp_storage.block_aggregate.value;
+  DType u = coin * sum_relu_q_minus_p;
+
+  DType aggregate_relu_q_minus_p(0);
+  for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
+    q_vec.fill(DType(0));
+    p_vec.fill(DType(0));
+    if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
+      q_vec.load(target_probs + cur_prob_offset + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+      if (num_accepted_tokens != num_speculative_tokens - 1) {
+        // there is no draft_probs for the bonus token
+        p_vec.load(draft_probs + cur_prob_offset + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+      }
+    }
+
+    vec_t<DType, VEC_SIZE> relu_q_minus_p_vec;
+#pragma unroll
+    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+      relu_q_minus_p_vec[j] = max(q_vec[j] - p_vec[j], DType(0));
+    }
+
+    DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM, DETERMINISTIC>(
+        i, d, [&](DType x) { return x > 0; }, u, relu_q_minus_p_vec, aggregate_relu_q_minus_p, &temp_storage);
+    if (aggregate_relu_q_minus_p > u) {
+      break;
+    }
+  }
+  __syncthreads();
+  // set the first rejected token
+  predicts[last_accepted_retrive_idx] = temp_storage.sampled_id;
+  // value at not used indices are undefined
+}
+
+template <typename DType, typename IdType, typename IdType2>
+cudaError_t TreeSpeculativeSamplingTargetOnly(
+    IdType* predicts,                   // mutable
+    IdType* output_token_ids,           // mutable
+    IdType* output_accepted_token_num,  // mutable
+    IdType2* candidates,
+    IdType2* retrive_index,
+    IdType2* retrive_next_token,
+    IdType2* retrive_next_sibling,
+    DType* uniform_samples,
+    DType* uniform_samples_for_final_sampling,
+    DType* target_probs,
+    DType* draft_probs,
+    uint32_t batch_size,
+    uint32_t num_speculative_tokens,
+    uint32_t num_draft_tokens,
+    uint32_t d,
+    DType threshold_single = 1,
+    DType threshold_acc = 1,
+    bool deterministic = true,
+    cudaStream_t stream = 0) {
+  constexpr uint32_t BLOCK_THREADS = 1024;
+  const uint32_t vec_size = std::gcd(16 / sizeof(DType), d);
+
+  const uint32_t smem_size = sizeof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+  dim3 nblks(batch_size);
+  dim3 nthrs(BLOCK_THREADS);
+  float capped_threshold_acc = fmaxf(threshold_acc, 1e-9f);
+  void* args[] = {
+      &predicts,
+      &output_token_ids,
+      &output_accepted_token_num,
+      &candidates,
+      &retrive_index,
+      &retrive_next_token,
+      &retrive_next_sibling,
+      &uniform_samples,
+      &uniform_samples_for_final_sampling,
+      &target_probs,
+      &draft_probs,
+      &batch_size,
+      &num_speculative_tokens,
+      &num_draft_tokens,
+      &d,
+      &threshold_single,
+      &capped_threshold_acc};
+  DISPATCH_ALIGNED_VEC_SIZE(
+      vec_size, VEC_SIZE, {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+        auto kernel = TreeSpeculativeSamplingTargetOnly<
+            BLOCK_THREADS,
+            SCAN_ALGO,
+            REDUCE_ALGO,
+            VEC_SIZE,
+            DETERMINISTIC,
+            DType,
+            IdType,
+            IdType2>;
+        FLASHINFER_CUDA_CALL(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+        FLASHINFER_CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+      })});
+  return cudaSuccess;
+}
+
+}  // namespace sampling
+
+}  // namespace flashinfer
+
+#endif  // SPECULATIVE_SAMPLING_CUH_
diff --git a/sgl-kernel/include/hip/hip_act_and_mul.cuh b/sgl-kernel/include/hip/hip_act_and_mul.cuh
new file mode 100644
index 000000000..ddb1b702d
--- /dev/null
+++ b/sgl-kernel/include/hip/hip_act_and_mul.cuh
@@ -0,0 +1,87 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include "utils.h"
+
+#define kBitsToLoad 128
+#define kBytesToLoad (kBitsToLoad / 8)
+
+// Adapted from
+// [flashinfer::activation::act_and_mul_kernel](https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/include/flashinfer/activation.cuh#L29)
+
+namespace sgl_hip {
+namespace activation {
+
+template <typename T, T (*Activation)(const T&)>
+__global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
+  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
+  const int64_t token_idx = blockIdx.x;
+  const int64_t thread_idx = threadIdx.x;
+  const int64_t stride = blockDim.x;
+  const int64_t offset = token_idx * 2 * d;
+
+#pragma unroll 1
+  for (uint32_t idx = thread_idx; idx < d / vec_size; idx += stride) {
+    sgl_hip::vec_t<T, vec_size> x_vec, y_vec, out_vec;
+    x_vec.cast_load(input + offset + idx * vec_size);
+    y_vec.cast_load(input + offset + d + idx * vec_size);
+#pragma unroll
+    for (uint32_t i = 0; i < vec_size; ++i) {
+      out_vec[i] = Activation(x_vec[i]) * y_vec[i];
+    }
+    out_vec.cast_store(out + token_idx * d + idx * vec_size);
+  }
+
+  const int64_t remaining_offset = d - d % (stride * vec_size);
+  // process the remaining elements
+#pragma unroll 1
+  for (int64_t idx = thread_idx; idx < d % (stride * vec_size); idx += stride) {
+    T x = input[offset + remaining_offset + idx], y = input[offset + remaining_offset + d + idx];
+    out[token_idx * d + remaining_offset + idx] = Activation(x) * y;
+  }
+}
+
+template <typename T, T (*Activation)(const T&)>
+__global__ void act_only_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
+  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
+  const int64_t token_idx = blockIdx.x;
+  const int64_t thread_idx = threadIdx.x;
+  const int64_t stride = blockDim.x;
+  const int64_t offset = token_idx * d;
+
+#pragma unroll 1
+  for (uint32_t idx = thread_idx; idx < d / vec_size; idx += stride) {
+    sgl_hip::vec_t<T, vec_size> x_vec, y_vec, out_vec;
+    x_vec.cast_load(input + offset + idx * vec_size);
+#pragma unroll
+    for (uint32_t i = 0; i < vec_size; ++i) {
+      out_vec[i] = Activation(x_vec[i]);
+    }
+    out_vec.cast_store(out + token_idx * d + idx * vec_size);
+  }
+
+  const int64_t remaining_offset = d - d % (stride * vec_size);
+  // process the remaining elements
+#pragma unroll 1
+  for (int64_t idx = thread_idx; idx < d % (stride * vec_size); idx += stride) {
+    T x = input[offset + remaining_offset + idx];
+    out[token_idx * d + remaining_offset + idx] = Activation(x);
+  }
+}
+
+}  // namespace activation
+}  // namespace sgl_hip
diff --git a/sgl-kernel/include/hip/hip_act_and_mul_hip.cuh b/sgl-kernel/include/hip/hip_act_and_mul_hip.cuh
new file mode 100644
index 000000000..d45409fa5
--- /dev/null
+++ b/sgl-kernel/include/hip/hip_act_and_mul_hip.cuh
@@ -0,0 +1,90 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include "utils_hip.h"
+
+#define kBitsToLoad 128
+#define kBytesToLoad (kBitsToLoad / 8)
+
+// Adapted from
+// [flashinfer::activation::act_and_mul_kernel](https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/include/flashinfer/activation.cuh#L29)
+
+namespace sgl_hip {
+namespace activation {
+
+template <typename T, T (*Activation)(const T&)>
+__global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
+  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
+  const int64_t token_idx = blockIdx.x;
+  const int64_t thread_idx = threadIdx.x;
+  const int64_t stride = blockDim.x;
+  const int64_t offset = token_idx * 2 * d;
+
+#pragma unroll 1
+  for (uint32_t idx = thread_idx; idx < d / vec_size; idx += stride) {
+    sgl_hip::vec_t<T, vec_size> x_vec, y_vec, out_vec;
+    x_vec.cast_load(input + offset + idx * vec_size);
+    y_vec.cast_load(input + offset + d + idx * vec_size);
+#pragma unroll
+    for (uint32_t i = 0; i < vec_size; ++i) {
+      out_vec[i] = Activation(x_vec[i]) * y_vec[i];
+    }
+    out_vec.cast_store(out + token_idx * d + idx * vec_size);
+  }
+
+  const int64_t remaining_offset = d - d % (stride * vec_size);
+  // process the remaining elements
+#pragma unroll 1
+  for (int64_t idx = thread_idx; idx < d % (stride * vec_size); idx += stride) {
+    T x = input[offset + remaining_offset + idx], y = input[offset + remaining_offset + d + idx];
+    out[token_idx * d + remaining_offset + idx] = Activation(x) * y;
+  }
+}
+
+template <typename T, T (*Activation)(const T&)>
+__global__ void act_only_kernel(T* __restrict__ out, const T* __restrict__ input, const int d) {
+  constexpr uint32_t vec_size = kBytesToLoad / sizeof(T);
+  const int64_t token_idx = blockIdx.x;
+  const int64_t thread_idx = threadIdx.x;
+  const int64_t stride = blockDim.x;
+  const int64_t offset = token_idx * d;
+
+#pragma unroll 1
+  for (uint32_t idx = thread_idx; idx < d / vec_size; idx += stride) {
+    sgl_hip::vec_t<T, vec_size> x_vec, y_vec, out_vec;
+    x_vec.cast_load(input + offset + idx * vec_size);
+#pragma unroll
+    for (uint32_t i = 0; i < vec_size; ++i) {
+      out_vec[i] = Activation(x_vec[i]);
+    }
+    out_vec.cast_store(out + token_idx * d + idx * vec_size);
+  }
+
+  const int64_t remaining_offset = d - d % (stride * vec_size);
+  // process the remaining elements
+#pragma unroll 1
+  for (int64_t idx = thread_idx; idx < d % (stride * vec_size); idx += stride) {
+    T x = input[offset + remaining_offset + idx];
+    out[token_idx * d + remaining_offset + idx] = Activation(x);
+  }
+}
+
+}  // namespace activation
+}  // namespace sgl_hip
diff --git a/sgl-kernel/include/hip/hip_math_def.h b/sgl-kernel/include/hip/hip_math_def.h
new file mode 100644
index 000000000..356ed953f
--- /dev/null
+++ b/sgl-kernel/include/hip/hip_math_def.h
@@ -0,0 +1,94 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#ifdef USE_ROCM
+
+#include <hip/hip_bf16.h>
+#include <hip/hip_common.h>
+#include <hip/hip_fp16.h>
+
+// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
+
+namespace amdgpu {
+
+template <typename T>
+__forceinline__ __device__ T shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize);
+
+template <typename srcDtype, typename destDtype>
+__forceinline__ __device__ destDtype cast(srcDtype val);
+
+// specialization
+template <>
+__forceinline__ __device__ float shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
+  return __shfl_xor(var, laneMask, width);
+}
+
+template <>
+__forceinline__ __device__ int shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
+  return __shfl_xor(var, laneMask, width);
+}
+
+template <>
+__forceinline__ __device__ float cast(float val) {
+  return val;
+}
+
+template <>
+__forceinline__ __device__ float cast(__half val) {
+  return __half2float(val);
+}
+
+template <>
+__forceinline__ __device__ float cast(__hip_bfloat16 val) {
+  return __bfloat162float(val);
+}
+
+template <>
+__forceinline__ __device__ __half cast(float fval) {
+  return __float2half(fval);
+}
+
+template <>
+__forceinline__ __device__ __hip_bfloat16 cast(float fval) {
+  return __float2bfloat16(fval);
+}
+
+}  // namespace amdgpu
+
+template <typename T>
+__forceinline__ __device__ T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize) {
+  return amdgpu::shfl_xor_sync(mask, var, laneMask, width);
+}
+
+template <typename srcDtype>
+__device__ __forceinline__ float castToFloat(srcDtype val) {
+  return amdgpu::cast<srcDtype, float>(val);
+}
+
+template <typename dstDtype>
+__device__ __forceinline__ dstDtype castFromFloat(float val) {
+  return amdgpu::cast<float, dstDtype>(val);
+}
+
+// operator overload to support flashinfer
+__host__ __device__ __forceinline__ __half operator*(const __half& x, const __half& y) {
+  __half h_x = x;
+  __half h_y = y;
+  return __hmul(h_x, h_y);
+}
+
+#endif
diff --git a/sgl-kernel/include/hip/hip_vec_dtypes.h b/sgl-kernel/include/hip/hip_vec_dtypes.h
new file mode 100644
index 000000000..a68a6986e
--- /dev/null
+++ b/sgl-kernel/include/hip/hip_vec_dtypes.h
@@ -0,0 +1,101 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#if USE_ROCM
+
+#include <hip/hip_bf16.h>
+#include <hip/hip_common.h>
+#include <hip/hip_fp16.h>
+
+// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)d
+
+#define SGL_HIP_INLINE inline __attribute__((always_inline)) __device__
+
+namespace sgl_hip {
+
+template <typename float_t, size_t vec_size>
+struct vec_t;
+
+template <typename srcDtype, typename dstDtype, size_t vec_size>
+SGL_HIP_INLINE void cast_load_impl(vec_t<dstDtype, vec_size>& dst, const srcDtype* src);
+
+template <typename srcDtype, typename dstDtype, size_t vec_size>
+SGL_HIP_INLINE void cast_store_impl(dstDtype* dst_ptr, const vec_t<srcDtype, vec_size>& src);
+
+template <typename float_t, size_t vec_size>
+struct vec_t {
+  SGL_HIP_INLINE float_t& operator[](size_t i);
+  SGL_HIP_INLINE const float_t& operator[](size_t i) const;
+  SGL_HIP_INLINE float_t* ptr();
+
+  SGL_HIP_INLINE void load(const float_t* ptr);
+  SGL_HIP_INLINE void store(float_t* ptr) const;
+
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src);
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr);
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const;
+};
+
+}  // namespace sgl_hip
+
+// **** impl *****
+
+namespace sgl_hip {
+
+template <typename srcDtype, typename dstDtype, size_t vec_size>
+SGL_HIP_INLINE void cast_load_impl(vec_t<dstDtype, vec_size>& dst, const srcDtype* src_ptr) {
+  if constexpr (std::is_same<srcDtype, dstDtype>::value) {
+    dst.load(src_ptr);
+  } else {
+    vec_t<srcDtype, vec_size> tmp;
+    tmp.load(src_ptr);
+    dst.cast_from(tmp);
+  }
+}
+
+template <typename srcDtype, typename dstDtype, size_t vec_size>
+SGL_HIP_INLINE void cast_store_impl(dstDtype* dst_ptr, const vec_t<srcDtype, vec_size>& src) {
+  if constexpr (std::is_same<srcDtype, dstDtype>::value) {
+    src.store(dst_ptr);
+  } else {
+    vec_t<dstDtype, vec_size> tmp;
+    tmp.cast_from(src);
+    tmp.store(dst_ptr);
+  }
+}
+
+template <typename float_t, size_t vec_size>
+template <typename T>
+SGL_HIP_INLINE void vec_t<float_t, vec_size>::cast_load(const T* ptr) {
+  cast_load_impl(*this, ptr);
+}
+
+template <typename float_t, size_t vec_size>
+template <typename T>
+SGL_HIP_INLINE void vec_t<float_t, vec_size>::cast_store(T* ptr) const {
+  cast_store_impl(ptr, *this);
+}
+
+}  // namespace sgl_hip
+
+#include "impl/hip_vec_bf16_impl.h"
+#include "impl/hip_vec_fp32_impl.h"
+#include "impl/hip_vec_half_impl.h"
+#endif
diff --git a/sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h b/sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
new file mode 100644
index 000000000..b783f3f43
--- /dev/null
+++ b/sgl-kernel/include/hip/impl/hip_vec_bf16_impl.h
@@ -0,0 +1,177 @@
+#pragma once
+
+#if USE_ROCM
+
+#include <hip/hip_bf16.h>
+#include <hip/hip_common.h>
+
+// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
+
+using nv_bfloat16 = __hip_bfloat16;
+using nv_bfloat162 = __hip_bfloat162;
+
+__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 make_bfloat162(const __hip_bfloat16 x, const __hip_bfloat16 y) {
+  __hip_bfloat162 t;
+  t.x = x;
+  t.y = y;
+  return t;
+}
+
+namespace sgl_hip {
+
+// nv_bfloat16 x 1
+template <>
+struct vec_t<nv_bfloat16, 1> {
+  nv_bfloat16 data;
+  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)(&data))[i];
+  }
+  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)(&data))[i];
+  }
+  SGL_HIP_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  SGL_HIP_INLINE void load(const nv_bfloat16* ptr);
+  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<nv_bfloat16, 1>::load(const nv_bfloat16* ptr) {
+  data = *ptr;
+}
+
+SGL_HIP_INLINE void vec_t<nv_bfloat16, 1>::store(nv_bfloat16* ptr) const {
+  *ptr = data;
+}
+
+// nv_bfloat16 x 2
+template <>
+struct vec_t<nv_bfloat16, 2> {
+  nv_bfloat162 data;
+
+  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)(&data))[i];
+  }
+  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)(&data))[i];
+  }
+  SGL_HIP_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  SGL_HIP_INLINE void load(const nv_bfloat16* ptr);
+  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<nv_bfloat16, 2>::load(const nv_bfloat16* ptr) {
+  data = *((nv_bfloat162*)ptr);
+}
+
+SGL_HIP_INLINE void vec_t<nv_bfloat16, 2>::store(nv_bfloat16* ptr) const {
+  *((nv_bfloat162*)ptr) = data;
+}
+
+template <>
+struct vec_t<nv_bfloat16, 4> {
+  uint2 data;
+
+  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)(&data))[i];
+  }
+  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)(&data))[i];
+  }
+  SGL_HIP_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  SGL_HIP_INLINE void load(const nv_bfloat16* ptr);
+  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 4>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<nv_bfloat16, 4>::load(const nv_bfloat16* ptr) {
+  data = *((uint2*)ptr);
+}
+
+SGL_HIP_INLINE void vec_t<nv_bfloat16, 4>::store(nv_bfloat16* ptr) const {
+  *((uint2*)ptr) = data;
+}
+
+// nv_bfloat16 x 8 or more
+
+template <size_t vec_size>
+struct vec_t<nv_bfloat16, vec_size> {
+  uint4 data[vec_size / 8];
+
+  SGL_HIP_INLINE nv_bfloat16& operator[](size_t i) {
+    return ((nv_bfloat16*)data)[i];
+  }
+  SGL_HIP_INLINE const nv_bfloat16& operator[](size_t i) const {
+    return ((const nv_bfloat16*)data)[i];
+  }
+  SGL_HIP_INLINE nv_bfloat16* ptr() {
+    return reinterpret_cast<nv_bfloat16*>(&data);
+  }
+  SGL_HIP_INLINE void load(const nv_bfloat16* ptr) {
+#pragma unoll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      data[i] = ((uint4*)ptr)[i];
+    }
+  }
+  SGL_HIP_INLINE void store(nv_bfloat16* ptr) const {
+#pragma unoll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      ((uint4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+}  // namespace sgl_hip
+
+#endif
diff --git a/sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h b/sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
new file mode 100644
index 000000000..97cba6320
--- /dev/null
+++ b/sgl-kernel/include/hip/impl/hip_vec_fp32_impl.h
@@ -0,0 +1,129 @@
+#pragma once
+
+#if USE_ROCM
+
+#include <hip/hip_common.h>
+
+// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
+
+namespace sgl_hip {
+
+template <>
+struct vec_t<float, 1> {
+  float data;
+
+  SGL_HIP_INLINE float& operator[](size_t i) {
+    return ((float*)(&data))[i];
+  }
+  SGL_HIP_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(&data))[i];
+  }
+  SGL_HIP_INLINE float* ptr() {
+    return reinterpret_cast<float*>(&data);
+  }
+  SGL_HIP_INLINE void load(const float* ptr);
+  SGL_HIP_INLINE void store(float* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<float, 1>::load(const float* ptr) {
+  data = *ptr;
+}
+
+SGL_HIP_INLINE void vec_t<float, 1>::store(float* ptr) const {
+  *ptr = data;
+}
+
+// float x 2
+
+template <>
+struct vec_t<float, 2> {
+  float2 data;
+
+  SGL_HIP_INLINE float& operator[](size_t i) {
+    return ((float*)(&data))[i];
+  }
+  SGL_HIP_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(&data))[i];
+  }
+  SGL_HIP_INLINE float* ptr() {
+    return reinterpret_cast<float*>(&data);
+  }
+  SGL_HIP_INLINE void load(const float* ptr);
+  SGL_HIP_INLINE void store(float* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<float, 2>::load(const float* ptr) {
+  data = *((float2*)ptr);
+}
+
+SGL_HIP_INLINE void vec_t<float, 2>::store(float* ptr) const {
+  *((float2*)ptr) = data;
+}
+
+// float x 4 or more
+template <size_t vec_size>
+struct vec_t<float, vec_size> {
+  float4 data[vec_size / 4];
+
+  SGL_HIP_INLINE float& operator[](size_t i) {
+    return ((float*)(data))[i];
+  }
+  SGL_HIP_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(data))[i];
+  }
+  SGL_HIP_INLINE float* ptr() {
+    return reinterpret_cast<float*>(&data);
+  }
+  SGL_HIP_INLINE void load(const float* ptr) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 4; ++i) {
+      data[i] = ((float4*)ptr)[i];
+    }
+  }
+  SGL_HIP_INLINE void store(float* ptr) const {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 4; ++i) {
+      ((float4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+}  // namespace sgl_hip
+
+#endif
diff --git a/sgl-kernel/include/hip/impl/hip_vec_half_impl.h b/sgl-kernel/include/hip/impl/hip_vec_half_impl.h
new file mode 100644
index 000000000..767b9c62f
--- /dev/null
+++ b/sgl-kernel/include/hip/impl/hip_vec_half_impl.h
@@ -0,0 +1,172 @@
+#pragma once
+
+#if USE_ROCM
+
+#include <hip/hip_common.h>
+#include <hip/hip_fp16.h>
+
+// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
+
+using half = __half;
+using half2 = __half2;
+
+namespace sgl_hip {
+
+// half x 1
+template <>
+struct vec_t<half, 1> {
+  half data;
+
+  SGL_HIP_INLINE half& operator[](size_t i) {
+    return ((half*)(&data))[i];
+  }
+  SGL_HIP_INLINE const half& operator[](size_t i) const {
+    return ((const half*)(&data))[i];
+  }
+  SGL_HIP_INLINE half* ptr() {
+    return reinterpret_cast<half*>(&data);
+  }
+  SGL_HIP_INLINE void load(const half* ptr);
+  SGL_HIP_INLINE void store(half* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 1>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<half, 1>::load(const half* ptr) {
+  data = *ptr;
+}
+
+SGL_HIP_INLINE void vec_t<half, 1>::store(half* ptr) const {
+  *ptr = data;
+}
+
+// half x 2
+template <>
+struct vec_t<half, 2> {
+  half2 data;
+
+  SGL_HIP_INLINE half& operator[](size_t i) {
+    return ((half*)(&data))[i];
+  }
+  SGL_HIP_INLINE const half& operator[](size_t i) const {
+    return ((const half*)(&data))[i];
+  }
+  SGL_HIP_INLINE half* ptr() {
+    return reinterpret_cast<half*>(&data);
+  }
+  SGL_HIP_INLINE void load(const half* ptr);
+  SGL_HIP_INLINE void store(half* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 2>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<half, 2>::load(const half* ptr) {
+  data = *((half2*)ptr);
+}
+
+SGL_HIP_INLINE void vec_t<half, 2>::store(half* ptr) const {
+  *((half2*)ptr) = data;
+}
+
+// half x 4
+
+template <>
+struct vec_t<half, 4> {
+  uint2 data;
+
+  SGL_HIP_INLINE half& operator[](size_t i) {
+    return ((half*)(&data))[i];
+  }
+  SGL_HIP_INLINE const half& operator[](size_t i) const {
+    return ((const half*)(&data))[i];
+  }
+  SGL_HIP_INLINE half* ptr() {
+    return reinterpret_cast<half*>(&data);
+  }
+  SGL_HIP_INLINE void load(const half* ptr);
+  SGL_HIP_INLINE void store(half* ptr) const;
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, 4>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+SGL_HIP_INLINE void vec_t<half, 4>::load(const half* ptr) {
+  data = *((uint2*)ptr);
+}
+
+SGL_HIP_INLINE void vec_t<half, 4>::store(half* ptr) const {
+  *((uint2*)ptr) = data;
+}
+
+// half x 8 or more
+
+template <size_t vec_size>
+struct vec_t<half, vec_size> {
+  uint4 data[vec_size / 8];
+
+  SGL_HIP_INLINE half& operator[](size_t i) {
+    return ((half*)data)[i];
+  }
+  SGL_HIP_INLINE const half& operator[](size_t i) const {
+    return ((const half*)data)[i];
+  }
+  SGL_HIP_INLINE half* ptr() {
+    return reinterpret_cast<half*>(&data);
+  }
+  SGL_HIP_INLINE void load(const half* ptr) {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      data[i] = ((uint4*)ptr)[i];
+    }
+  }
+  SGL_HIP_INLINE void store(half* ptr) const {
+#pragma unroll
+    for (size_t i = 0; i < vec_size / 8; ++i) {
+      ((uint4*)ptr)[i] = data[i];
+    }
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_from(const vec_t<T, vec_size>& src) {
+    cast_from_impl(*this, src);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_load(const T* ptr) {
+    cast_load_impl(*this, ptr);
+  }
+  template <typename T>
+  SGL_HIP_INLINE void cast_store(T* ptr) const {
+    cast_store_impl(ptr, *this);
+  }
+};
+
+}  // namespace sgl_hip
+#endif
diff --git a/sgl-kernel/include/pytorch_extension_utils_rocm.h b/sgl-kernel/include/pytorch_extension_utils_rocm.h
new file mode 100644
index 000000000..fa5fd129f
--- /dev/null
+++ b/sgl-kernel/include/pytorch_extension_utils_rocm.h
@@ -0,0 +1,20 @@
+#include <torch/library.h>
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_LAST_DIM_CONTIGUOUS(x) \
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
+
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \
+  CHECK_CUDA(x);                           \
+  CHECK_LAST_DIM_CONTIGUOUS(x)
+
+#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")
+
+#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)
+
+#define CHECK_GE(a, b) TORCH_CHECK((a) >= (b), "CHECK_GE(" #a ", " #b ") failed. ", a, " vs ", b)
diff --git a/sgl-kernel/include/scalar_type.hpp b/sgl-kernel/include/scalar_type.hpp
new file mode 100644
index 000000000..9161a38b7
--- /dev/null
+++ b/sgl-kernel/include/scalar_type.hpp
@@ -0,0 +1,330 @@
+#pragma once
+
+// For TORCH_CHECK
+#include <torch/library.h>
+
+namespace sglang {
+
+//
+//  ScalarType can represent a wide range of floating point and integer types,
+//  in particular it can be used to represent sub-byte data types (something
+//  that torch.dtype currently does not support).
+//
+//  The type definitions on the Python side can be found in: vllm/scalar_type.py
+//  these type definitions should be kept up to date with any Python API changes
+//  here.
+//
+class ScalarType {
+ public:
+  enum NanRepr : uint8_t {
+    NAN_NONE = 0,                // nans are not supported
+    NAN_IEEE_754 = 1,            // nans are: exp all 1s, mantissa not all 0s
+    NAN_EXTD_RANGE_MAX_MIN = 2,  // nans are: exp all 1s, mantissa all 1s
+
+    NAN_REPR_ID_MAX
+  };
+
+  constexpr ScalarType(
+      uint8_t exponent,
+      uint8_t mantissa,
+      bool signed_,
+      int32_t bias,
+      bool finite_values_only = false,
+      NanRepr nan_repr = NAN_IEEE_754)
+      : exponent(exponent),
+        mantissa(mantissa),
+        signed_(signed_),
+        bias(bias),
+        finite_values_only(finite_values_only),
+        nan_repr(nan_repr) {};
+
+  static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
+    return ScalarType(0, size_bits - 1, true, bias);
+  }
+
+  static constexpr ScalarType uint(uint8_t size_bits, int32_t bias = 0) {
+    return ScalarType(0, size_bits, false, bias);
+  }
+
+  // IEEE 754 compliant floating point type
+  static constexpr ScalarType float_IEEE754(uint8_t exponent, uint8_t mantissa) {
+    TORCH_CHECK(mantissa > 0 && exponent > 0);
+    return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754);
+  }
+
+  // IEEE 754 non-compliant floating point type
+  static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa, bool finite_values_only, NanRepr nan_repr) {
+    TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
+    TORCH_CHECK(mantissa > 0 && exponent > 0);
+    TORCH_CHECK(
+        nan_repr != NAN_IEEE_754,
+        "use `float_IEEE754` constructor for floating point types that "
+        "follow IEEE 754 conventions");
+    return ScalarType(exponent, mantissa, true, 0, finite_values_only, nan_repr);
+  }
+
+  uint8_t const exponent;  // size of the exponent field (0 for integer types)
+  uint8_t const mantissa;  // size of the mantissa field (size of the integer
+                           // excluding the sign bit for integer types)
+  bool const signed_;      // flag if the type supports negative numbers (i.e. has a
+                           // sign bit)
+  int32_t const bias;      // stored values equal value + bias,
+                           // used for quantized type
+
+  // Extra Floating point info
+  bool const finite_values_only;  // i.e. no +/-inf if true
+  NanRepr const nan_repr;         // how NaNs are represented
+                                  // (not applicable for integer types)
+
+  using Id = int64_t;
+
+ private:
+  // Field size in id
+  template <typename T_>
+  static constexpr size_t member_id_field_width() {
+    using T = std::decay_t<T_>;
+    return std::is_same_v<T, bool> ? 1 : sizeof(T) * 8;
+  }
+
+  template <typename Fn, typename Init, typename Member, typename... Rest>
+  static constexpr auto reduce_members_helper(Fn f, Init val, Member member, Rest... rest) {
+    auto new_val = f(val, member);
+    if constexpr (sizeof...(rest) > 0) {
+      return reduce_members_helper(f, new_val, rest...);
+    } else {
+      return new_val;
+    };
+  }
+
+  template <typename Fn, typename Init>
+  constexpr auto reduce_members(Fn f, Init init) const {
+    // Should be in constructor order for `from_id`
+    return reduce_members_helper(f, init, exponent, mantissa, signed_, bias, finite_values_only, nan_repr);
+  };
+
+  template <typename Fn, typename Init>
+  static constexpr auto reduce_member_types(Fn f, Init init) {
+    constexpr auto dummy_type = ScalarType(0, 0, false, 0, false, NAN_NONE);
+    return dummy_type.reduce_members(f, init);
+  };
+
+  static constexpr auto id_size_bits() {
+    return reduce_member_types(
+        [](int acc, auto member) -> int { return acc + member_id_field_width<decltype(member)>(); }, 0);
+  }
+
+ public:
+  // unique id for this scalar type that can be computed at compile time for
+  //  c++17 template specialization this is not needed once we migrate to
+  //  c++20 and can pass literal classes as template parameters
+  constexpr Id id() const {
+    static_assert(id_size_bits() <= sizeof(Id) * 8, "ScalarType id is too large to be stored");
+
+    auto or_and_advance = [](std::pair<Id, uint32_t> result, auto member) -> std::pair<Id, uint32_t> {
+      auto [id, bit_offset] = result;
+      auto constexpr bits = member_id_field_width<decltype(member)>();
+      return {id | (int64_t(member) & ((uint64_t(1) << bits) - 1)) << bit_offset, bit_offset + bits};
+    };
+    return reduce_members(or_and_advance, std::pair<Id, uint32_t>{}).first;
+  }
+
+  // create a ScalarType from an id, for c++17 template specialization,
+  //  this is not needed once we migrate to c++20 and can pass literal
+  //  classes as template parameters
+  static constexpr ScalarType from_id(Id id) {
+    auto extract_and_advance = [id](auto result, auto member) {
+      using T = decltype(member);
+      auto [tuple, bit_offset] = result;
+      auto constexpr bits = member_id_field_width<T>();
+      auto extracted_val = static_cast<T>((int64_t(id) >> bit_offset) & ((uint64_t(1) << bits) - 1));
+      auto new_tuple = std::tuple_cat(tuple, std::make_tuple(extracted_val));
+      return std::pair<decltype(new_tuple), int>{new_tuple, bit_offset + bits};
+    };
+
+    auto [tuple_args, _] = reduce_member_types(extract_and_advance, std::pair<std::tuple<>, int>{});
+    return std::apply([](auto... args) { return ScalarType(args...); }, tuple_args);
+  }
+
+  constexpr int64_t size_bits() const {
+    return mantissa + exponent + is_signed();
+  }
+  constexpr bool is_signed() const {
+    return signed_;
+  }
+  constexpr bool is_integer() const {
+    return exponent == 0;
+  }
+  constexpr bool is_floating_point() const {
+    return exponent > 0;
+  }
+  constexpr bool is_ieee_754() const {
+    return is_floating_point() && finite_values_only == false && nan_repr == NAN_IEEE_754;
+  }
+  constexpr bool has_nans() const {
+    return is_floating_point() && nan_repr != NAN_NONE;
+  }
+  constexpr bool has_infs() const {
+    return is_floating_point() && finite_values_only == false;
+  }
+  constexpr bool has_bias() const {
+    return bias != 0;
+  }
+
+ private:
+  double _floating_point_max() const {
+    TORCH_CHECK(mantissa <= 52 && exponent <= 11, "Cannot represent max/min as a double for type ", str());
+
+    uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1;
+    if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) {
+      max_mantissa -= 1;
+    }
+
+    uint64_t max_exponent = (uint64_t(1) << exponent) - 2;
+    if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) {
+      TORCH_CHECK(exponent < 11, "Cannot represent max/min as a double for type ", str());
+      max_exponent += 1;
+    }
+
+    // adjust the exponent to match that of a double
+    //  for now we assume the exponent bias is the standard 2^(e-1) -1, (where e
+    //  is the exponent bits), there is some precedent for non-standard biases,
+    //  example `float8_e4m3b11fnuz` here: https://github.com/jax-ml/ml_dtypes
+    //  but to avoid premature over complication we are just assuming the
+    //  standard exponent bias until there is a need to support non-standard
+    //  biases
+    uint64_t exponent_bias = (uint64_t(1) << (exponent - 1)) - 1;
+    uint64_t exponent_bias_double = (uint64_t(1) << 10) - 1;  // double e = 11
+
+    uint64_t max_exponent_double = max_exponent - exponent_bias + exponent_bias_double;
+
+    // shift the mantissa into the position for a double and
+    // the exponent
+    uint64_t double_raw = (max_mantissa << (52 - mantissa)) | (max_exponent_double << 52);
+
+    return *reinterpret_cast<double*>(&double_raw);
+  }
+
+  constexpr std::variant<int64_t, double> _raw_max() const {
+    if (is_floating_point()) {
+      return {_floating_point_max()};
+    } else {
+      TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(), "Cannot represent max as a int64_t");
+      return {(int64_t(1) << mantissa) - 1};
+    }
+  }
+
+  constexpr std::variant<int64_t, double> _raw_min() const {
+    if (is_floating_point()) {
+      TORCH_CHECK(is_signed(), "We currently assume all floating point types are signed");
+      constexpr uint64_t sign_bit_double = (uint64_t(1) << 63);
+
+      double max = _floating_point_max();
+      uint64_t max_raw = *reinterpret_cast<uint64_t*>(&max);
+      uint64_t min_raw = max_raw | sign_bit_double;
+      return {*reinterpret_cast<double*>(&min_raw)};
+    } else {
+      TORCH_CHECK(!is_signed() || size_bits() <= 64, "Cannot represent min as a int64_t");
+      if (is_signed()) {
+        // set the top bit to 1 (i.e. INT64_MIN) and the rest to 0
+        // then perform an arithmetic shift right to set all the bits above
+        // (size_bits() - 1) to 1
+        return {INT64_MIN >> (64 - size_bits())};
+      } else {
+        return {int64_t(0)};
+      }
+    }
+  }
+
+ public:
+  // Max representable value for this scalar type.
+  // (accounting for bias if there is one)
+  constexpr std::variant<int64_t, double> max() const {
+    return std::visit([this](auto x) -> std::variant<int64_t, double> { return {x - bias}; }, _raw_max());
+  }
+
+  // Min representable value for this scalar type.
+  // (accounting for bias if there is one)
+  constexpr std::variant<int64_t, double> min() const {
+    return std::visit([this](auto x) -> std::variant<int64_t, double> { return {x - bias}; }, _raw_min());
+  }
+
+  std::string str() const {
+    /* naming generally follows: https://github.com/jax-ml/ml_dtypes
+     * for floating point types (leading f) the scheme is:
+     *  `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
+     *  flags:
+     *  - no-flags: means it follows IEEE 754 conventions
+     *  - f: means finite values only (no infinities)
+     *  - n: means nans are supported (non-standard encoding)
+     * for integer types the scheme is:
+     *  `[u]int<size_bits>[b<bias>]`
+     *  - if bias is not present it means its zero
+     */
+    if (is_floating_point()) {
+      auto ret =
+          "float" + std::to_string(size_bits()) + "_e" + std::to_string(exponent) + "m" + std::to_string(mantissa);
+      if (!is_ieee_754()) {
+        if (finite_values_only) {
+          ret += "f";
+        }
+        if (nan_repr != NAN_NONE) {
+          ret += "n";
+        }
+      }
+      return ret;
+    } else {
+      auto ret = ((is_signed()) ? "int" : "uint") + std::to_string(size_bits());
+      if (has_bias()) {
+        ret += "b" + std::to_string(bias);
+      }
+      return ret;
+    }
+  }
+
+  constexpr bool operator==(ScalarType const& other) const {
+    return mantissa == other.mantissa && exponent == other.exponent && bias == other.bias && signed_ == other.signed_ &&
+           finite_values_only == other.finite_values_only && nan_repr == other.nan_repr;
+  }
+};
+
+using ScalarTypeId = ScalarType::Id;
+
+// "rust style" names generally following:
+//   https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70
+static inline constexpr auto kS4 = ScalarType::int_(4);
+static inline constexpr auto kU4 = ScalarType::uint(4);
+static inline constexpr auto kU4B8 = ScalarType::uint(4, 8);
+static inline constexpr auto kS8 = ScalarType::int_(8);
+static inline constexpr auto kU8 = ScalarType::uint(8);
+static inline constexpr auto kU8B128 = ScalarType::uint(8, 128);
+
+static inline constexpr auto kFE2M1f = ScalarType::float_(2, 1, true, ScalarType::NAN_NONE);
+static inline constexpr auto kFE3M2f = ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
+static inline constexpr auto kFE4M3fn = ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
+static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
+static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
+static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);
+
+// Fixed width style names, generally following:
+//  https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L47-L57
+static inline constexpr auto kInt4 = kS4;
+static inline constexpr auto kUint4 = kU4;
+static inline constexpr auto kUint4b8 = kU4B8;
+static inline constexpr auto kInt8 = kS8;
+static inline constexpr auto kUint8 = kU8;
+static inline constexpr auto kUint8b128 = kU8B128;
+
+static inline constexpr auto kFloat4_e2m1f = kFE2M1f;
+static inline constexpr auto kFloat6_e3m2f = kFE3M2f;
+static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn;
+static inline constexpr auto kFloat8_e5m2 = kFE5M2;
+static inline constexpr auto kFloat16_e8m7 = kFE8M7;
+static inline constexpr auto kFloat16_e5m10 = kFE5M10;
+
+// colloquial names
+static inline constexpr auto kHalf = kFE5M10;
+static inline constexpr auto kFloat16 = kHalf;
+static inline constexpr auto kBFloat16 = kFE8M7;
+
+static inline constexpr auto kFloat16Id = kFloat16.id();
+};  // namespace sglang
diff --git a/sgl-kernel/include/sgl_flash_kernel_ops.h b/sgl-kernel/include/sgl_flash_kernel_ops.h
new file mode 100644
index 000000000..383e207c3
--- /dev/null
+++ b/sgl-kernel/include/sgl_flash_kernel_ops.h
@@ -0,0 +1,86 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/Tensor.h>
+#include <Python.h>
+#include <torch/library.h>
+#include <torch/torch.h>
+
+#include <vector>
+
+#include "sgl_kernel_torch_shim.h"
+
+#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+
+#define _CONCAT(A, B) A##B
+#define CONCAT(A, B) _CONCAT(A, B)
+
+#define _STRINGIFY(A) #A
+#define STRINGIFY(A) _STRINGIFY(A)
+
+#define REGISTER_EXTENSION(NAME)                                                                      \
+  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                                            \
+    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, STRINGIFY(NAME), nullptr, 0, nullptr}; \
+    return PyModule_Create(&module);                                                                  \
+  }
+
+/*
+ * From flash-attention
+ */
+std::vector<at::Tensor> mha_fwd(
+    at::Tensor& q,        // (b, s_q, h, d) or (total_q, h, d) if there is cu_seqlens_q
+    const at::Tensor& k,  // (b_k, s_k, h_k, d) or (total_k, h_k, d) if there is cu_seqlens_k or (num_pages, page_size,
+                          // h_k, d) if there is page_table.
+    const at::Tensor& v,  // (b_k, s_k, h_k, dv) or (total_k, h_k, dv) if there is cu_seqlens_k or (num_pages,
+                          // page_size, h_k, dv) if there is page_table.
+    std::optional<const at::Tensor>&
+        k_new_,  // (b, s_k_new, h_k, d) or (total_k_new, h_k, d) if there is cu_seqlens_k_new
+    std::optional<const at::Tensor>&
+        v_new_,  // (b, s_k_new, h_k, dv) or (total_k_new, h_k, dv) if there is cu_seqlens_k_new
+    std::optional<const at::Tensor>& q_v_,           // (b, s_q, h, dv) or (total_q_new, h, dv) if there is cu_seqlens_q
+    std::optional<at::Tensor>& out_,                 // (b, s_q, h, dv) or (total_q, h, dv) if there is cu_seqlens_q
+    std::optional<const at::Tensor>& cu_seqlens_q_,  // b+1
+    std::optional<const at::Tensor>& cu_seqlens_k_,  // b+1
+    std::optional<const at::Tensor>& cu_seqlens_k_new_,  // b+1
+    std::optional<const at::Tensor>&
+        seqused_q_,  // b. If given, only this many elements of each batch element's queries and outputs are used.
+    std::optional<const at::Tensor>&
+        seqused_k_,  // b. If given, only this many elements of each batch element's keys are used.
+    std::optional<int> max_seqlen_q_,
+    // TODO: check if we need max_seqlen_k
+    std::optional<int> max_seqlen_k_,
+    std::optional<const at::Tensor>& page_table_,      // (b_k, max_num_pages_per_seq)
+    std::optional<const at::Tensor>& kv_batch_idx_,    // b. indices to index into the KV cache
+    std::optional<const at::Tensor>& leftpad_k_,       // b
+    std::optional<const at::Tensor>& rotary_cos_,      // seqlen_ro x (rotary_dim / 2)
+    std::optional<const at::Tensor>& rotary_sin_,      // seqlen_ro x (rotary_dim / 2)
+    std::optional<const at::Tensor>& seqlens_rotary_,  // b
+    std::optional<at::Tensor>& q_descale_,             // (b, h_k), not (b, h)
+    std::optional<at::Tensor>& k_descale_,             // (b, h_k)
+    std::optional<at::Tensor>& v_descale_,             // (b, h_k)
+    float const softmax_scale,
+    bool is_causal,
+    int window_size_left,
+    int window_size_right,
+    float const softcap,
+    bool const is_rotary_interleaved,  // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
+    std::optional<at::Tensor>& scheduler_metadata_,  // (b + 1)
+    int num_splits,
+    std::optional<bool> pack_gqa_,
+    int const sm_margin,
+    std::optional<const at::Tensor>& sinks_);
diff --git a/sgl-kernel/include/sgl_kernel_ops.h b/sgl-kernel/include/sgl_kernel_ops.h
new file mode 100644
index 000000000..a13af546a
--- /dev/null
+++ b/sgl-kernel/include/sgl_kernel_ops.h
@@ -0,0 +1,771 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/Tensor.h>
+#include <Python.h>
+#include <torch/all.h>
+#include <torch/library.h>
+#include <torch/torch.h>
+
+#include <tuple>
+#include <vector>
+
+#include "scalar_type.hpp"
+
+#define _CONCAT(A, B) A##B
+#define CONCAT(A, B) _CONCAT(A, B)
+
+#define _STRINGIFY(A) #A
+#define STRINGIFY(A) _STRINGIFY(A)
+
+#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+
+#define REGISTER_EXTENSION(NAME)                                                                      \
+  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                                            \
+    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, STRINGIFY(NAME), nullptr, 0, nullptr}; \
+    return PyModule_Create(&module);                                                                  \
+  }
+
+using fptr_t = int64_t;
+
+/*
+ * From csrc/allreduce
+ */
+#ifdef USE_ROCM
+// ROCM custom allreduce
+fptr_t init_custom_ar(
+    torch::Tensor& meta,
+    torch::Tensor& rank_data,
+    const std::vector<std::string>& handles,
+    const std::vector<int64_t>& offsets,
+    int64_t rank,
+    bool full_nvlink);
+void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
+void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, torch::Tensor& out);
+void dispose(fptr_t _fa);
+int64_t meta_size();
+void register_buffer(
+    fptr_t _fa, torch::Tensor& t, const std::vector<std::string>& handles, const std::vector<int64_t>& offsets);
+std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa);
+void register_graph_buffers(
+    fptr_t _fa, const std::vector<std::string>& handles, const std::vector<std::vector<int64_t>>& offsets);
+torch::Tensor allocate_meta_buffer(int64_t size);
+torch::Tensor get_meta_buffer_ipc_handle(torch::Tensor& inp);
+// quick allreduce
+fptr_t init_custom_qr(int64_t rank, int64_t world_size, std::optional<int64_t> qr_max_size = std::nullopt);
+void qr_destroy(fptr_t _fa);
+torch::Tensor qr_get_handle(fptr_t _fa);
+void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
+void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, int64_t quant_level, bool cast_bf2half = false);
+int64_t qr_max_size();
+#else
+// custom allreduce
+fptr_t
+init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs, torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
+void dispose(fptr_t _fa);
+int64_t meta_size();
+void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes);
+std::tuple<std::vector<int64_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(fptr_t _fa);
+void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs);
+void register_graph_buffers(
+    fptr_t _fa, const std::vector<std::vector<int64_t>>& handles, const std::vector<std::vector<int64_t>>& offsets);
+
+// mscclpp
+torch::Tensor mscclpp_generate_unique_id();
+fptr_t mscclpp_init_context(
+    const torch::Tensor& unique_id,
+    const int64_t rank,
+    const int64_t world_size,
+    torch::Tensor& scratch,
+    torch::Tensor& put_buffer,
+    const int64_t nranks_per_node,
+    const std::vector<int64_t>& rank_to_node,
+    const std::vector<int64_t>& rank_to_ib,
+    const int64_t context_selection);
+void mscclpp_allreduce(fptr_t _context, torch::Tensor& inp, torch::Tensor& out, int64_t nthreads, int64_t nblocks);
+#endif
+
+/*
+ * From csrc/attention
+ */
+void lightning_attention_decode(
+    const torch::Tensor& q,
+    const torch::Tensor& k,
+    const torch::Tensor& v,
+    const torch::Tensor& past_kv,
+    const torch::Tensor& slope,
+    torch::Tensor output,
+    torch::Tensor new_kv);
+void merge_state(
+    at::Tensor v_a, at::Tensor s_a, at::Tensor v_b, at::Tensor s_b, at::Tensor v_merged, at::Tensor s_merged);
+void merge_state_v2(
+    at::Tensor v_a, at::Tensor s_a, at::Tensor v_b, at::Tensor s_b, at::Tensor v_merged, at::Tensor s_merged);
+void cutlass_mla_decode(
+    torch::Tensor const& out,
+    torch::Tensor const& q_nope,
+    torch::Tensor const& q_pe,
+    torch::Tensor const& kv_c_and_k_pe_cache,
+    torch::Tensor const& seq_lens,
+    torch::Tensor const& page_table,
+    torch::Tensor const& workspace,
+    double sm_scale,
+    int64_t num_kv_splits = 1 /* Set to 1 to avoid cuda_graph issue by default. */);
+int64_t cutlass_mla_get_workspace_size(
+    int64_t max_seq_len,
+    int64_t num_batches,
+    int64_t sm_count = 0,
+    int64_t num_kv_splits = 1 /* Set to 1 to avoid cuda_graph issue by default. */);
+
+/*
+ * From csrc/elementwise
+ */
+void rmsnorm(at::Tensor& output, at::Tensor& input, at::Tensor& weight, double eps, bool enable_pdl);
+void sgl_fused_add_rmsnorm(
+    torch::Tensor input, torch::Tensor residual, torch::Tensor weight, double eps, bool enable_pdl);
+void gemma_rmsnorm(at::Tensor& output, at::Tensor& input, at::Tensor& weight, double eps, bool enable_pdl);
+void gemma_fused_add_rmsnorm(at::Tensor& input, at::Tensor& residual, at::Tensor& weight, double eps, bool enable_pdl);
+void silu_and_mul(at::Tensor& out, at::Tensor& input);
+void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input);
+void gelu_and_mul(at::Tensor& out, at::Tensor& input);
+
+void apply_rope_pos_ids_cos_sin_cache(
+    at::Tensor q,
+    at::Tensor k,
+    at::Tensor q_rope,
+    at::Tensor k_rope,
+    at::Tensor cos_sin_cache,
+    at::Tensor pos_ids,
+    bool interleave,
+    bool enable_pdl,
+    int64_t cuda_stream,
+    const std::optional<at::Tensor>& v,
+    const std::optional<at::Tensor>& k_buffer,
+    const std::optional<at::Tensor>& v_buffer,
+    const std::optional<at::Tensor>& kv_cache_loc);
+
+void downcast_fp8(
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& k_out,
+    at::Tensor& v_out,
+    at::Tensor& k_scale,
+    at::Tensor& v_scale,
+    at::Tensor& loc,
+    int64_t mult,
+    int64_t offset,
+    int64_t cuda_stream);
+
+#ifdef USE_ROCM
+void gelu_quick(at::Tensor& out, const at::Tensor& input);
+#endif
+
+/*
+ * From csrc/gemm
+ */
+torch::Tensor awq_dequantize(torch::Tensor qweight, torch::Tensor scales, torch::Tensor qzeros);
+void cutlass_scaled_fp4_mm(
+    torch::Tensor& D,
+    torch::Tensor const& A,
+    torch::Tensor const& B,
+    torch::Tensor const& A_sf,
+    torch::Tensor const& B_sf,
+    torch::Tensor const& alpha);
+torch::Tensor int8_scaled_mm(
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Dtype& out_dtype,
+    const c10::optional<torch::Tensor>& bias);
+torch::Tensor fp8_scaled_mm(
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Dtype& out_dtype,
+    const c10::optional<torch::Tensor>& bias);
+torch::Tensor fp8_blockwise_scaled_mm(
+    const torch::Tensor& mat_a,
+    const torch::Tensor& mat_b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Dtype& out_dtype);
+void scaled_fp4_quant(
+    torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_scale, torch::Tensor const& input_scale);
+void sgl_per_token_group_quant_fp8(
+    at::Tensor input,
+    at::Tensor output_q,
+    at::Tensor output_s,
+    int64_t group_size,
+    double eps,
+    double fp8_min,
+    double fp8_max,
+    bool scale_ue8m0);
+void sgl_per_token_group_quant_int8(
+    at::Tensor input,
+    at::Tensor output_q,
+    at::Tensor output_s,
+    int64_t group_size,
+    double eps,
+    double int8_min,
+    double int8_max);
+void sgl_per_tensor_quant_fp8(at::Tensor input, at::Tensor output_q, at::Tensor output_s, bool is_static);
+void sgl_per_token_quant_fp8(at::Tensor input, at::Tensor output_q, at::Tensor output_s);
+void bmm_fp8(
+    at::Tensor A,
+    at::Tensor B,
+    at::Tensor D,
+    at::Tensor A_scale,
+    at::Tensor B_scale,
+    at::Tensor workspace_buffer,
+    int64_t cublas_handle,
+    int64_t cuda_stream);
+void dsv3_router_gemm(torch::Tensor& output, const torch::Tensor& mat_a, const torch::Tensor& mat_b);
+void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a, torch::Tensor const& mat_b);
+
+torch::Tensor gptq_marlin_gemm(
+    torch::Tensor& a,
+    std::optional<torch::Tensor> c_or_none,
+    torch::Tensor& b_q_weight,
+    torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& global_scale_or_none,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none,
+    torch::Tensor& workspace,
+    sglang::ScalarTypeId const& b_q_type_id,
+    int64_t size_m,
+    int64_t size_n,
+    int64_t size_k,
+    bool is_k_full,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float);
+
+torch::Tensor gptq_gemm(
+    torch::Tensor a,
+    torch::Tensor b_q_weight,
+    torch::Tensor b_gptq_qzeros,
+    torch::Tensor b_gptq_scales,
+    torch::Tensor b_g_idx,
+    bool use_shuffle,
+    int64_t bit);
+
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
+
+torch::Tensor
+gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits);
+
+torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, int64_t size_n, int64_t num_bits);
+
+/*
+ * From csrc/moe
+ */
+void moe_align_block_size(
+    torch::Tensor topk_ids,
+    int64_t num_experts,
+    int64_t block_size,
+    torch::Tensor sorted_token_ids,
+    torch::Tensor experts_ids,
+    torch::Tensor num_tokens_post_pad,
+    torch::Tensor cumsum_buffer,
+    bool pad_sorted_token_ids);
+
+void topk_softmax(
+    torch::Tensor& topk_weights, torch::Tensor& topk_indices, torch::Tensor& gating_output, bool renormalize);
+
+std::vector<at::Tensor> moe_fused_gate(
+    at::Tensor& input,
+    at::Tensor& bias,
+    int64_t num_expert_group,
+    int64_t topk_group,
+    int64_t topk,
+    int64_t num_fused_shared_experts,
+    double routed_scaling_factor,
+    bool apply_routed_scaling_factor_on_output);
+
+void fp8_blockwise_scaled_grouped_mm(
+    torch::Tensor& output,
+    torch::Tensor& a_ptrs,
+    torch::Tensor& b_ptrs,
+    torch::Tensor& out_ptrs,
+    torch::Tensor& a_scales_ptrs,
+    torch::Tensor& b_scales_ptrs,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& scales_a,
+    const torch::Tensor& scales_b,
+    const torch::Tensor& stride_a,
+    const torch::Tensor& stride_b,
+    const torch::Tensor& stride_c,
+    const torch::Tensor& layout_sfa,
+    const torch::Tensor& layout_sfb,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& workspace);
+
+void prepare_moe_input(
+    const torch::Tensor& topk_ids,
+    torch::Tensor& expert_offsets,
+    const std::optional<torch::Tensor>& blockscale_offsets,
+    torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation,
+    torch::Tensor& output_permutation,
+    const int64_t num_experts,
+    const int64_t n,
+    const int64_t k);
+
+void shuffle_rows(const torch::Tensor& input_tensor, const torch::Tensor& dst2src_map, torch::Tensor& output_tensor);
+
+void apply_shuffle_mul_sum(
+    const torch::Tensor& input,
+    torch::Tensor& output,
+    const torch::Tensor& permutation,
+    const std::optional<torch::Tensor>& factors);
+
+void cutlass_fp4_group_mm(
+    torch::Tensor& output,
+    const torch::Tensor& a,
+    const torch::Tensor& b,
+    const torch::Tensor& a_blockscale,
+    const torch::Tensor& b_blockscales,
+    const torch::Tensor& alphas,
+    const torch::Tensor& ab_strides,
+    const torch::Tensor& c_strides,
+    const torch::Tensor& problem_sizes,
+    const torch::Tensor& expert_offsets,
+    const torch::Tensor& sf_offsets);
+
+void scaled_fp4_experts_quant(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts);
+
+void silu_and_mul_scaled_fp4_experts_quant(
+    torch::Tensor& output,
+    torch::Tensor& output_scale,
+    torch::Tensor const& input,
+    torch::Tensor const& input_global_scale,
+    torch::Tensor const& mask,
+    bool use_silu_and_mul);
+/*
+ * From csrc/moe/cutlass_moe/w4a8
+ */
+void get_cutlass_w4a8_moe_mm_data(
+    const torch::Tensor& topk_ids,
+    torch::Tensor& expert_offsets,
+    torch::Tensor& problem_sizes1,
+    torch::Tensor& problem_sizes2,
+    torch::Tensor& input_permutation,
+    torch::Tensor& output_permutation,
+    const int64_t num_experts,
+    const int64_t n,
+    const int64_t k);
+
+void cutlass_w4a8_moe_mm(
+    torch::Tensor& d_tensors,
+    torch::Tensor const& a_tensors,
+    torch::Tensor const& b_tensors,
+    torch::Tensor const& a_scales,
+    torch::Tensor const& b_scales,
+    torch::Tensor const& expert_offsets,
+    torch::Tensor const& problem_sizes,
+    torch::Tensor const& a_strides,
+    torch::Tensor const& b_strides,
+    torch::Tensor const& d_strides,
+    torch::Tensor const& s_strides,
+    int64_t chunk_size,
+    int64_t topk);
+
+torch::Tensor moe_wna16_marlin_gemm(
+    torch::Tensor& a,
+    std::optional<torch::Tensor> const& c_or_none,
+    torch::Tensor& b_q_weight,
+    torch::Tensor& b_scales,
+    std::optional<torch::Tensor> const& b_zeros_or_none,
+    std::optional<torch::Tensor> const& g_idx_or_none,
+    std::optional<torch::Tensor> const& perm_or_none,
+    torch::Tensor& workspace,
+    torch::Tensor& sorted_token_ids,
+    torch::Tensor& expert_ids,
+    torch::Tensor& num_tokens_past_padded,
+    torch::Tensor& topk_weights,
+    int64_t moe_block_size,
+    int64_t top_k,
+    bool mul_topk_weights,
+    bool is_ep,
+    sglang::ScalarTypeId const& b_q_type_id,
+    int64_t size_m,
+    int64_t size_n,
+    int64_t size_k,
+    bool is_k_full,
+    bool use_atomic_add,
+    bool use_fp32_reduce,
+    bool is_zp_float);
+
+/*
+ * From csrc/speculative
+ */
+void tree_speculative_sampling_target_only(
+    at::Tensor predicts,          // mutable
+    at::Tensor accept_index,      // mutable
+    at::Tensor accept_token_num,  // mutable
+    at::Tensor candidates,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    at::Tensor uniform_samples,
+    at::Tensor uniform_samples_for_final_sampling,
+    at::Tensor target_probs,
+    at::Tensor draft_probs,
+    double threshold_single = 1,
+    double threshold_acc = 1,
+    bool deterministic = true,
+    int64_t cuda_stream = 0);
+
+void verify_tree_greedy(
+    at::Tensor predicts,          // mutable
+    at::Tensor accept_index,      // mutable
+    at::Tensor accept_token_num,  // mutable
+    at::Tensor candidates,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    at::Tensor target_predict,
+    int64_t cuda_stream = 0);
+
+void build_tree_kernel_efficient(
+    at::Tensor parent_list,
+    at::Tensor selected_index,
+    at::Tensor verified_seq_len,
+    at::Tensor tree_mask,
+    at::Tensor positions,
+    at::Tensor retrive_index,
+    at::Tensor retrive_next_token,
+    at::Tensor retrive_next_sibling,
+    int64_t topk,
+    int64_t depth,
+    int64_t draft_token_num,
+    int64_t tree_mask_mode);
+
+void segment_packbits(
+    at::Tensor x,
+    at::Tensor input_indptr,
+    at::Tensor output_indptr,
+    at::Tensor y,
+    int64_t batch_size,
+    int64_t cuda_stream = 0);
+
+/*
+ * From csrc/kvcacheio
+ */
+void transfer_kv_per_layer(
+    const at::Tensor src_k,
+    at::Tensor dst_k,
+    const at::Tensor src_v,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_per_layer_pf_lf(
+    const at::Tensor src_k,
+    at::Tensor dst_k,
+    const at::Tensor src_v,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t layer_id,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_all_layer(
+    const at::Tensor src_k_layers,
+    const at::Tensor dst_k_layers,
+    const at::Tensor src_v_layers,
+    const at::Tensor dst_v_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_all_layer_lf_pf(
+    const at::Tensor src_k_layers,
+    at::Tensor dst_k,
+    const at::Tensor src_v_layers,
+    at::Tensor dst_v,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t dst_layout_dim,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_per_layer_mla(
+    const at::Tensor src,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_per_layer_mla_pf_lf(
+    const at::Tensor src,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t layer_id,
+    int64_t item_size,
+    int64_t src_layout_dim,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_all_layer_mla(
+    const at::Tensor src_layers,
+    const at::Tensor dst_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_all_layer_mla_lf_pf(
+    const at::Tensor src_layers,
+    at::Tensor dst,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t item_size,
+    int64_t dst_layout_dim,
+    int64_t num_layers,
+    int64_t block_quota,
+    int64_t num_warps_per_block);
+
+void transfer_kv_direct(
+    const std::vector<at::Tensor>& src_layers,
+    std::vector<at::Tensor> dst_layers,
+    const at::Tensor src_indices,
+    const at::Tensor dst_indices,
+    int64_t page_size);
+
+void transfer_kv_per_layer_direct_pf_lf(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t layer_id,
+    int64_t page_size);
+
+void transfer_kv_all_layer_direct_lf_pf(
+    const std::vector<at::Tensor>& src_ptrs,
+    std::vector<at::Tensor> dst_ptrs,
+    const at::Tensor& src_indices,
+    const at::Tensor& dst_indices,
+    int64_t page_size);
+
+/*
+ * From FlashInfer
+ */
+void min_p_sampling_from_probs(
+    at::Tensor probs,
+    at::Tensor output,
+    std::optional<at::Tensor> maybe_indices,
+    std::optional<at::Tensor> maybe_min_p_arr,
+    double min_p_val,
+    bool deterministic,
+    std::optional<at::Generator> gen);
+
+void top_k_renorm_probs(
+    at::Tensor probs, at::Tensor renorm_probs, std::optional<at::Tensor> maybe_top_k_arr, int64_t top_k_val);
+
+void top_p_renorm_probs(
+    at::Tensor probs, at::Tensor renorm_probs, std::optional<at::Tensor> maybe_top_p_arr, double top_p_val);
+
+void top_k_top_p_sampling_from_probs(
+    at::Tensor probs,
+    at::Tensor output,
+    std::optional<at::Tensor> maybe_indices,
+    std::optional<at::Tensor> maybe_top_k_arr,
+    double top_k_val,
+    std::optional<at::Tensor> maybe_top_p_arr,
+    double top_p_val,
+    bool deterministic,
+    std::optional<at::Generator> gen);
+
+void top_p_sampling_from_probs(
+    at::Tensor probs,
+    at::Tensor output,
+    std::optional<at::Tensor> maybe_indices,
+    std::optional<at::Tensor> maybe_top_p_arr,
+    double top_p_val,
+    bool deterministic,
+    std::optional<at::Generator> gen);
+
+void top_k_mask_logits(
+    at::Tensor logits, at::Tensor mask_logits, std::optional<at::Tensor> maybe_top_k_arr, int64_t top_k_val);
+
+namespace flash {
+/*
+ * From fa2 sparse
+ */
+std::vector<at::Tensor> mha_fwd_sparse(
+    at::Tensor& q,        // batch_size x seqlen_q x num_heads x head_size
+    const at::Tensor& k,  // batch_size x seqlen_k x num_heads_k x head_size
+    const at::Tensor& v,  // batch_size x seqlen_k x num_heads_k x head_size
+    const at::Tensor& block_count,
+    const at::Tensor& block_offset,
+    const at::Tensor& column_count,
+    const at::Tensor& column_index,
+    const std::optional<at::Tensor>& out_,           // batch_size x seqlen_q x num_heads x head_size
+    const std::optional<at::Tensor>& alibi_slopes_,  // num_heads or batch_size x num_heads
+    const double p_dropout,
+    const double softmax_scale,
+    bool is_causal,
+    const double softcap,
+    const bool return_softmax,
+    std::optional<at::Generator> gen_);
+
+std::vector<at::Tensor> mha_varlen_fwd_sparse(
+    at::Tensor& q,        // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+    const at::Tensor& k,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i.
+    const at::Tensor& v,  // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i.
+    const at::Tensor& block_count,
+    const at::Tensor& block_offset,
+    const at::Tensor& column_count,
+    const at::Tensor& column_index,
+    const c10::optional<at::Tensor>& out_,  // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+    const at::Tensor& cu_seqlens_q,         // b+1
+    const at::Tensor& cu_seqlens_k,         // b+1
+    const c10::optional<at::Tensor>&
+        seqused_k,  // b. If given, only this many elements of each batch element's keys are used.
+    const c10::optional<at::Tensor>& alibi_slopes_,  // num_heads or b x num_heads
+    int64_t max_seqlen_q,
+    const int64_t max_seqlen_k,
+    const double p_dropout,
+    const double softmax_scale,
+    const bool zero_tensors,
+    bool is_causal,
+    const double softcap,
+    const bool return_softmax,
+    c10::optional<at::Generator> gen_);
+}  // namespace flash
+
+void convert_vertical_slash_indexes(
+    torch::Tensor& block_count,      // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& block_offset,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
+    torch::Tensor& column_count,     // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& column_index,     // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
+    torch::Tensor q_seqlens,         // [BATCH, ]
+    torch::Tensor kv_seqlens,        // [BATCH, ]
+    torch::Tensor vertical_indexes,  // [BATCH, N_HEADS, NNZ_V]
+    torch::Tensor slash_indexes,     // [BATCH, N_HEADS, NNZ_S]
+    int64_t context_size,
+    int64_t block_size_M,
+    int64_t block_size_N,
+    bool causal);
+
+void convert_vertical_slash_indexes_mergehead(
+    torch::Tensor& block_count,            // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& block_offset,           // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
+    torch::Tensor& column_count,           // [BATCH, N_HEADS, NUM_ROWS]
+    torch::Tensor& column_index,           // [BATCH, N_HEADS, NUM_ROWS, NNZ_V]
+    torch::Tensor q_seqlens,               // [BATCH, ]
+    torch::Tensor kv_seqlens,              // [BATCH, ]
+    torch::Tensor vertical_indexes,        // [BATCH, N_HEADS, NNZ_V]
+    torch::Tensor slash_indexes,           // [BATCH, N_HEADS, NNZ_S]
+    torch::Tensor vertical_indices_count,  // [N_HEADS, ]
+    torch::Tensor slash_indices_count,
+    int64_t context_size,
+    int64_t block_size_M,
+    int64_t block_size_N,
+    bool causal);
+
+/*
+ * From XGrammar
+ */
+void ApplyTokenBitmaskInplace(at::Tensor logits, at::Tensor bitmask, at::optional<at::Tensor> indices = at::nullopt);
+
+/*
+ * From QServe
+ */
+void qserve_w4a8_per_chn_gemm(
+    const torch::Tensor& _in_feats,
+    const torch::Tensor& _kernel,
+    const torch::Tensor& _wscales,
+    const torch::Tensor& _ascales,
+    const torch::Tensor& _w_szs,
+    const torch::Tensor& _a_ssums,
+    torch::Tensor& _out_feats);
+
+void qserve_w4a8_per_group_gemm(
+    const torch::Tensor& _in_feats,
+    const torch::Tensor& _kernel,
+    const torch::Tensor& _zeros,
+    const torch::Tensor& _scales_i8,
+    const torch::Tensor& _wscales,
+    const torch::Tensor& _ascales,
+    torch::Tensor& _out_feats);
+
+/*
+ * From csrc/spatial
+ */
+std::vector<int64_t> create_greenctx_stream_by_value(int64_t smA, int64_t smB, int64_t device);
+
+/*
+ * From csrc/memory
+ */
+void store_kv_cache(at::Tensor k_cache, at::Tensor v_cache, at::Tensor out_loc, at::Tensor k, at::Tensor v);
+
+void copy_to_gpu_no_ce(const at::Tensor& input, at::Tensor& output);
+void concat_mla_k(torch::Tensor k, torch::Tensor k_nope, torch::Tensor k_rope);
+
+/*
+ * From csrc/mamba
+ */
+void causal_conv1d_update(
+    const at::Tensor& x,
+    const at::Tensor& conv_state,
+    const at::Tensor& weight,
+    const std::optional<at::Tensor>& bias_,
+    bool silu_activation,
+    const std::optional<at::Tensor>& cache_seqlens_,
+    const std::optional<at::Tensor>& conv_state_indices_,
+    int64_t pad_slot_id);
+
+void causal_conv1d_fwd(
+    const at::Tensor& x,
+    const at::Tensor& weight,
+    const std::optional<at::Tensor>& bias_,
+    const std::optional<at::Tensor>& conv_states,
+    const std::optional<at::Tensor>& query_start_loc,
+    const std::optional<at::Tensor>& cache_indices,
+    const std::optional<at::Tensor>& has_initial_state,
+    bool silu_activation,
+    int64_t pad_slot_id);
diff --git a/sgl-kernel/include/sgl_kernel_torch_shim.h b/sgl-kernel/include/sgl_kernel_torch_shim.h
new file mode 100644
index 000000000..5a26f9076
--- /dev/null
+++ b/sgl-kernel/include/sgl_kernel_torch_shim.h
@@ -0,0 +1,122 @@
+/*Adapt from:
+https://github.com/neuralmagic/vllm-flash-attention/blob/90eacc1af2a7c3de62ea249e929ed5faccf38954/csrc/common/pytorch_shim.h
+  Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <torch/library.h>
+
+/**
+ * Unfortunately, the type signatures of the flash_attn ops are not compatible
+ * with the PyTorch library bindings. To get around that we use
+ * `make_pytorch_shim` which creates a lambda that exposes the API using
+ * PyTorch compatible types to the types, then converts them to the types
+ * expected by the flash_attn ops. This shims allows us to make minimal changes
+ * to `flash_api.cpp` making it easier to synchronize with upstream changes.
+ *
+ * The `pytorch_library_compatible_type` struct is used to map from the
+ * flash_attn ops types to a PyTorch library compatible one. The main issues is
+ * that the following types are not support by PyTorch library bindings:
+ *  - `int`
+ *  - `float`
+ *  - `std::optional<T> &`
+ *  - `std::optional<const at::Tensor> &`
+ * So we convert them to (respectively):
+ *  - `int64_t`
+ *  - `double`
+ *  - `const std::optional<T>&`
+ *  - `const std::optional<at::Tensor>&`
+ */
+
+template <typename T>
+struct pytorch_library_compatible_type {
+  using type = T;
+  static T convert_from_type(T arg) {
+    return arg;
+  }
+};
+
+template <typename T>
+using pytorch_library_compatible_type_t = typename pytorch_library_compatible_type<T>::type;
+
+template <typename T>
+T convert_from_pytorch_compatible_type(pytorch_library_compatible_type_t<T> arg) {
+  return pytorch_library_compatible_type<T>::convert_from_type(arg);
+}
+
+// Map `c10::optional<T> &` -> `const c10::optional<T>&`
+//  (NOTE: this is bit unsafe but non of the ops in flash_attn mutate
+//   the optional container)
+template <typename T>
+struct pytorch_library_compatible_type<c10::optional<T>&> {
+  using type = const c10::optional<T>&;
+  static c10::optional<T>& convert_from_type(const c10::optional<T>& arg) {
+    return const_cast<c10::optional<T>&>(arg);
+  }
+};
+
+// Map `c10::optional<T>` ->
+//          `c10::optional<pytorch_library_compatible_type_t<T>>`
+//  (NOTE: tested for `c10::optional<int>` -> `c10::optional<int64_t>`)
+template <typename T>
+struct pytorch_library_compatible_type<c10::optional<T>> {
+  using type = c10::optional<pytorch_library_compatible_type_t<T>>;
+  static c10::optional<pytorch_library_compatible_type_t<T>> convert_from_type(c10::optional<T> arg) {
+    return arg;
+  }
+};
+
+// Map `c10::optional<const at::Tensor>&` -> `const c10::optional<at::Tensor>&`
+template <>
+struct pytorch_library_compatible_type<c10::optional<const at::Tensor>&> {
+  using type = const c10::optional<at::Tensor>&;
+  static c10::optional<const at::Tensor>& convert_from_type(const c10::optional<at::Tensor>& arg) {
+    return const_cast<c10::optional<const at::Tensor>&>(reinterpret_cast<const c10::optional<const at::Tensor>&>(arg));
+  }
+};
+
+// Map `int` -> `int64_t`
+template <>
+struct pytorch_library_compatible_type<int> {
+  using type = int64_t;
+  static int convert_from_type(int64_t arg) {
+    TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
+    TORCH_CHECK(arg >= std::numeric_limits<int>::min(), "int64_t value is too small to be converted to int");
+    return arg;
+  }
+};
+
+// Map `float` -> `double`
+template <>
+struct pytorch_library_compatible_type<float> {
+  using type = double;
+  static float convert_from_type(double arg) {
+    TORCH_CHECK(
+        std::abs(arg) <= std::numeric_limits<float>::max(), "double value is too large to be converted to float");
+    return arg;
+  }
+};
+
+//
+//  Shim Utils
+//
+
+template <typename Ret, typename... Args>
+auto make_pytorch_shim(Ret (*fun)(Args... args)) {
+  return [fun](pytorch_library_compatible_type_t<Args>... args) {
+    return fun(convert_from_pytorch_compatible_type<Args>(args)...);
+  };
+}
diff --git a/sgl-kernel/include/utils.h b/sgl-kernel/include/utils.h
new file mode 100644
index 000000000..7cfef8136
--- /dev/null
+++ b/sgl-kernel/include/utils.h
@@ -0,0 +1,449 @@
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <cuda_runtime.h>
+#include <torch/all.h>
+
+#ifdef USE_ROCM
+// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
+#define _DISPATCH_CASE_F16(c_type, ...) \
+  case at::ScalarType::Half: {          \
+    using c_type = __half;              \
+    return __VA_ARGS__();               \
+  }
+
+#define _DISPATCH_CASE_BF16(c_type, ...) \
+  case at::ScalarType::BFloat16: {       \
+    using c_type = __hip_bfloat16;       \
+    return __VA_ARGS__();                \
+  }
+#endif  // USE_ROCM
+
+#ifndef USE_ROCM
+// Adapt from FlashInfer
+#ifdef FLASHINFER_ENABLE_F16
+#define _DISPATCH_CASE_F16(c_type, ...) \
+  case at::ScalarType::Half: {          \
+    using c_type = nv_half;             \
+    return __VA_ARGS__();               \
+  }
+#else
+#define _DISPATCH_CASE_F16(c_type, ...)
+#endif  // FLASHINFER_ENABLE_F16
+
+#ifdef FLASHINFER_ENABLE_BF16
+#define _DISPATCH_CASE_BF16(c_type, ...) \
+  case at::ScalarType::BFloat16: {       \
+    using c_type = nv_bfloat16;          \
+    return __VA_ARGS__();                \
+  }
+#else
+#define _DISPATCH_CASE_BF16(c_type, ...)
+#endif  // FLASHINFER_ENABLE_BF16
+
+#ifdef FLASHINFER_ENABLE_FP8_E4M3
+#define _DISPATCH_CASE_FP8_E4M3(c_type, ...) \
+  case at::ScalarType::Float8_e4m3fn: {      \
+    using c_type = __nv_fp8_e4m3;            \
+    return __VA_ARGS__();                    \
+  }
+#else
+#define _DISPATCH_CASE_FP8_E4M3(c_type, ...)
+#endif  // FLASHINFER_ENABLE_FP8_E4M3
+
+#ifdef FLASHINFER_ENABLE_FP8_E5M2
+#define _DISPATCH_CASE_FP8_E5M2(c_type, ...) \
+  case at::ScalarType::Float8_e5m2: {        \
+    using c_type = __nv_fp8_e5m2;            \
+    return __VA_ARGS__();                    \
+  }
+#else
+#define _DISPATCH_CASE_FP8_E5M2(c_type, ...)
+#endif  // FLASHINFER_ENABLE_FP8_E5M2
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(pytorch_dtype, c_type, ...)                 \
+  [&]() -> bool {                                                                        \
+    switch (pytorch_dtype) {                                                             \
+      _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                            \
+      _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                           \
+      default:                                                                           \
+        std::ostringstream oss;                                                          \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                   \
+        return false;                                                                    \
+    }                                                                                    \
+  }()
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP8(pytorch_dtype, c_type, ...)                      \
+  [&]() -> bool {                                                                            \
+    switch (pytorch_dtype) {                                                                 \
+      _DISPATCH_CASE_FP8_E4M3(c_type, __VA_ARGS__)                                           \
+      _DISPATCH_CASE_FP8_E5M2(c_type, __VA_ARGS__)                                           \
+      default:                                                                               \
+        std::ostringstream oss;                                                              \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch fp8 data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                       \
+        return false;                                                                        \
+    }                                                                                        \
+  }()
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE(pytorch_dtype, c_type, ...)                      \
+  [&]() -> bool {                                                                        \
+    switch (pytorch_dtype) {                                                             \
+      _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                            \
+      _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                           \
+      _DISPATCH_CASE_FP8_E4M3(c_type, __VA_ARGS__)                                       \
+      _DISPATCH_CASE_FP8_E5M2(c_type, __VA_ARGS__)                                       \
+      default:                                                                           \
+        std::ostringstream oss;                                                          \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                   \
+        return false;                                                                    \
+    }                                                                                    \
+  }()
+
+#define _DISPATCH_SWITCH(var_name, cond, ...)                                           \
+  [&]() -> bool {                                                                       \
+    switch (cond) {                                                                     \
+      __VA_ARGS__                                                                       \
+      default:                                                                          \
+        std::ostringstream oss;                                                         \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch " var_name " " << int(cond); \
+        TORCH_CHECK(false, oss.str());                                                  \
+        return false;                                                                   \
+    }                                                                                   \
+  }()
+
+#define _DISPATCH_SWITCH_U16x2(var1_name, var2_name, cond1, cond2, ...)                                             \
+  [&]() -> bool {                                                                                                   \
+    switch (pack_u16(cond1, cond2)) {                                                                               \
+      __VA_ARGS__                                                                                                   \
+      default:                                                                                                      \
+        std::ostringstream oss;                                                                                     \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch (" var1_name ", " var2_name "): (" << int(cond1) << ", " \
+            << int(cond2) << ")";                                                                                   \
+        TORCH_CHECK(false, oss.str());                                                                              \
+        return false;                                                                                               \
+    }                                                                                                               \
+  }()
+
+#define _DISPATCH_CASE(case_expr, case_var, ...) \
+  case case_expr: {                              \
+    constexpr auto case_var = case_expr;         \
+    return __VA_ARGS__();                        \
+  }
+
+#define _DISPATCH_CASE_U16x2(case_expr1, case_expr2, case_var1, case_var2, ...) \
+  case pack_u16(case_expr1, case_expr2): {                                      \
+    constexpr auto case_var1 = case_expr1;                                      \
+    constexpr auto case_var2 = case_expr2;                                      \
+    return __VA_ARGS__();                                                       \
+  }
+
+#define DISPATCH_BOOL(expr, const_expr, ...) \
+  [&]() -> bool {                            \
+    if (expr) {                              \
+      constexpr bool const_expr = true;      \
+      return __VA_ARGS__();                  \
+    } else {                                 \
+      constexpr bool const_expr = false;     \
+      return __VA_ARGS__();                  \
+    }                                        \
+  }()
+
+inline void check_shape(const at::Tensor& a, const at::Tensor& b, const char* a_name, const char* b_name) {
+  TORCH_CHECK(a.dim() == b.dim(), a_name, ".dim() != ", b_name, ".dim(). ", a.dim(), " vs ", b.dim());
+  for (int i = 0; i < a.dim(); ++i) {
+    TORCH_CHECK(a.size(i) == b.size(i), a_name, ".size(", i, ") != ", b_name, ".size(", i, ")");
+  }
+}
+
+inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) {
+  return (uint32_t(a) << 16) | uint32_t(b);
+}
+
+#define CHECK_GQA_HEAD_DIVISIBLE(num_qo_heads, num_kv_heads) \
+  TORCH_CHECK(                                               \
+      num_qo_heads % num_kv_heads == 0,                      \
+      "num_qo_heads(",                                       \
+      num_qo_heads,                                          \
+      ") must be divisible by num_kv_heads(",                \
+      num_kv_heads,                                          \
+      ")")
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_LAST_DIM_CONTIGUOUS(x) \
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
+
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \
+  CHECK_CUDA(x);                           \
+  CHECK_LAST_DIM_CONTIGUOUS(x)
+
+#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")
+
+#define CHECK_SHAPE(a, b) check_shape(a, b, #a, #b)
+
+#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)
+
+#define CHECK_GE(a, b) TORCH_CHECK((a) >= (b), "CHECK_GE(" #a ", " #b ") failed. ", a, " vs ", b)
+
+inline bool is_float8_tensor(const at::Tensor& tensor) {
+  return tensor.scalar_type() == at::ScalarType::Float8_e4m3fn || tensor.scalar_type() == at::ScalarType::Float8_e5m2;
+}
+#endif  // USE_ROCM
+
+struct cuda_error : public std::runtime_error {
+  /**
+   * @brief Constructs a `cuda_error` object with the given `message`.
+   *
+   * @param message The error char array used to construct `cuda_error`
+   */
+  cuda_error(const char* message) : std::runtime_error(message) {}
+  /**
+   * @brief Constructs a `cuda_error` object with the given `message` string.
+   *
+   * @param message The `std::string` used to construct `cuda_error`
+   */
+  cuda_error(std::string const& message) : cuda_error{message.c_str()} {}
+};
+
+#define CHECK_CUDA_SUCCESS(cmd)                                         \
+  do {                                                                  \
+    cudaError_t e = cmd;                                                \
+    if (e != cudaSuccess) {                                             \
+      std::stringstream _message;                                       \
+      auto s = cudaGetErrorString(e);                                   \
+      _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__; \
+      throw cuda_error(_message.str());                                 \
+    }                                                                   \
+  } while (0)
+
+#define CHECK_IS_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_IS_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_CUDA_INPUT(x) \
+  CHECK_IS_CUDA(x);         \
+  CHECK_IS_CONTIGUOUS(x)
+
+inline int getSMVersion() {
+  int device{-1};
+  CHECK_CUDA_SUCCESS(cudaGetDevice(&device));
+  int sm_major = 0;
+  int sm_minor = 0;
+  CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
+  CHECK_CUDA_SUCCESS(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
+  return sm_major * 10 + sm_minor;
+}
+
+inline bool isDeviceType(const std::string& device_type) {
+  int deviceCount;
+  CHECK_CUDA_SUCCESS(cudaGetDeviceCount(&deviceCount));
+
+  int device_id = -1;
+  if (deviceCount >= 1) {
+    CHECK_CUDA_SUCCESS(cudaGetDevice(&device_id));
+  } else {
+    return false;
+  }
+
+  cudaDeviceProp prop;
+  CHECK_CUDA_SUCCESS(cudaGetDeviceProperties(&prop, device_id));
+  if (device_type == std::string(prop.name)) {
+    return true;
+  }
+  return false;
+}
+
+inline bool getBoolEnv(char const* name) {
+  char const* env = std::getenv(name);
+  return env && env[0] == '1' && env[1] == '\0';
+}
+
+inline bool getEnvEnablePDL() {
+  static std::once_flag flag;
+  static bool enablePDL = false;
+  std::call_once(flag, [&]() {
+    if (getSMVersion() >= 90) {
+      // PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1`
+      enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL");
+    }
+  });
+  return enablePDL;
+}
+
+// SGLANG_SHFL_XOR_* adapted from https://github.com/vllm-project/vllm/blob/v0.7.3/csrc/cuda_compat.h#L19-L28
+#ifndef USE_ROCM
+#define SGLANG_SHFL_XOR_SYNC(mask, var, lane_mask) __shfl_xor_sync((mask), (var), (lane_mask))
+#define SGLANG_SHFL_XOR_SYNC_WIDTH(mask, var, lane_mask, width) __shfl_xor_sync((mask), (var), (lane_mask), (width))
+#else
+#define SGLANG_SHFL_XOR_SYNC(mask, var, lane_mask) __shfl_xor((var), (lane_mask))
+#define SGLANG_SHFL_XOR_SYNC_WIDTH(mask, var, lane_mask, width) __shfl_xor((var), (lane_mask), (width))
+#endif
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(pytorch_dtype, c_type, ...)           \
+  [&]() -> bool {                                                                        \
+    switch (pytorch_dtype) {                                                             \
+      case at::ScalarType::Float: {                                                      \
+        using c_type = float;                                                            \
+        return __VA_ARGS__();                                                            \
+      }                                                                                  \
+        _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                          \
+        _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                         \
+      default:                                                                           \
+        std::ostringstream oss;                                                          \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                   \
+        return false;                                                                    \
+    }                                                                                    \
+  }()
+
+#define DISPATCH_CASE_INTEGRAL_TYPES(...)              \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
+
+#define DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
+
+#ifndef USE_ROCM
+#define WARP_SIZE 32
+#else
+#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
+#endif
+
+#ifdef USE_ROCM
+
+#include "hip/hip_math_def.h"
+#include "hip/hip_vec_dtypes.h"
+
+#else
+
+template <typename srcDtype>
+__device__ __forceinline__ float castToFloat(srcDtype val) {
+  return static_cast<srcDtype>(val);
+}
+
+template <typename dstDtype>
+__device__ __forceinline__ dstDtype castFromFloat(float val) {
+  return static_cast<dstDtype>(val);
+}
+
+#endif
+
+// add FP8 support
+// #ifndef USE_ROCM
+// #include <c10/util/Float8_e4m3fn.h>
+// using FP8_TYPE = c10::Float8_e4m3fn;
+// C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<FP8_TYPE>::max();
+// #else  // USE_ROCM
+// #if HIP_FP8_TYPE_FNUZ
+// #include <c10/util/Float8_e4m3fnuz.h>
+// using FP8_TYPE = c10::Float8_e4m3fnuz;
+// constexpr auto FP8_E4M3_MAX = 224.0f;
+// #else
+// #if HIP_FP8_TYPE_E4M3
+// #include <c10/util/Float8_e4m3fn.h>
+// using FP8_TYPE = c10::Float8_e4m3fn;
+// C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<FP8_TYPE>::max();
+// #else
+// #error "fp8 is not supported in this processor (arch < gfx942)."
+// #endif  // HIP_FP8_TYPE_E4M3
+// #endif  // HIP_FP8_TYPE_FNUZ
+// #endif  // USE_ROCM
+
+#define FULL_MASK 0xffffffff
+
+__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
+#ifndef USE_ROCM
+  float old;
+  old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value)))
+                     : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+  return old;
+#else
+  int* addr_as_i = (int*)addr;
+  int old = *addr_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(addr_as_i, assumed, __float_as_int(fmaxf(value, __int_as_float(assumed))));
+  } while (assumed != old);
+  return __int_as_float(old);
+#endif
+}
+
+__device__ __forceinline__ float warpReduceMax(float value) {
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 16));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 8));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 4));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 2));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 1));
+  return value;
+}
+
+__device__ __forceinline__ float blockReduceMax(float value) {
+  static __shared__ float warpLevelMaxs[WARP_SIZE];
+  const int laneId = threadIdx.x % WARP_SIZE;
+  const int warpId = threadIdx.x / WARP_SIZE;
+
+  value = warpReduceMax(value);
+
+  if (laneId == 0) warpLevelMaxs[warpId] = value;
+  __syncthreads();
+
+  value = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelMaxs[laneId] : 0;
+  if (warpId == 0) value = warpReduceMax(value);
+
+  return value;
+}
+
+// Pads to a multiple of `alignment` rows.
+inline torch::Tensor pad_tensor(const torch::Tensor& tensor, int64_t alignment = 4, bool is_column_major = false) {
+  int64_t rows = tensor.size(0);
+  int64_t cols = tensor.size(1);
+  int64_t pad_rows = (alignment - (rows % alignment)) % alignment;  // Compute padding size
+
+  if (pad_rows == 0) {
+    return tensor;  // Already aligned
+  }
+
+  torch::Tensor padding = torch::zeros({pad_rows, cols}, tensor.options());
+  torch::Tensor tensor_padded = torch::cat({tensor, padding}, 0);  // Pad along rows
+
+  // Ensure column-major layout
+  if (is_column_major) {
+    return tensor_padded.t().contiguous().t();
+  }
+  return tensor_padded;
+}
+
+// Get the next power of 2 of a number
+inline uint32_t next_pow2(uint32_t x) noexcept {
+  if (x <= 1) return 1;
+  return 1u << (32 - __builtin_clz(x - 1));
+}
diff --git a/sgl-kernel/include/utils_hip.h b/sgl-kernel/include/utils_hip.h
new file mode 100644
index 000000000..db1056b0f
--- /dev/null
+++ b/sgl-kernel/include/utils_hip.h
@@ -0,0 +1,451 @@
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+/* Copyright 2025 SGLang Team. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <hip/hip_runtime.h>
+#include <torch/all.h>
+
+#ifdef USE_ROCM
+// Adapted from flashinfer-rocm [PR#491](https://github.com/flashinfer-ai/flashinfer/pull/491)
+#define _DISPATCH_CASE_F16(c_type, ...) \
+  case at::ScalarType::Half: {          \
+    using c_type = __half;              \
+    return __VA_ARGS__();               \
+  }
+
+#define _DISPATCH_CASE_BF16(c_type, ...) \
+  case at::ScalarType::BFloat16: {       \
+    using c_type = __hip_bfloat16;       \
+    return __VA_ARGS__();                \
+  }
+#endif  // USE_ROCM
+
+#ifndef USE_ROCM
+// Adapt from FlashInfer
+#ifdef FLASHINFER_ENABLE_F16
+#define _DISPATCH_CASE_F16(c_type, ...) \
+  case at::ScalarType::Half: {          \
+    using c_type = nv_half;             \
+    return __VA_ARGS__();               \
+  }
+#else
+#define _DISPATCH_CASE_F16(c_type, ...)
+#endif  // FLASHINFER_ENABLE_F16
+
+#ifdef FLASHINFER_ENABLE_BF16
+#define _DISPATCH_CASE_BF16(c_type, ...) \
+  case at::ScalarType::BFloat16: {       \
+    using c_type = nv_bfloat16;          \
+    return __VA_ARGS__();                \
+  }
+#else
+#define _DISPATCH_CASE_BF16(c_type, ...)
+#endif  // FLASHINFER_ENABLE_BF16
+
+#ifdef FLASHINFER_ENABLE_FP8_E4M3
+#define _DISPATCH_CASE_FP8_E4M3(c_type, ...) \
+  case at::ScalarType::Float8_e4m3fn: {      \
+    using c_type = __nv_fp8_e4m3;            \
+    return __VA_ARGS__();                    \
+  }
+#else
+#define _DISPATCH_CASE_FP8_E4M3(c_type, ...)
+#endif  // FLASHINFER_ENABLE_FP8_E4M3
+
+#ifdef FLASHINFER_ENABLE_FP8_E5M2
+#define _DISPATCH_CASE_FP8_E5M2(c_type, ...) \
+  case at::ScalarType::Float8_e5m2: {        \
+    using c_type = __nv_fp8_e5m2;            \
+    return __VA_ARGS__();                    \
+  }
+#else
+#define _DISPATCH_CASE_FP8_E5M2(c_type, ...)
+#endif  // FLASHINFER_ENABLE_FP8_E5M2
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(pytorch_dtype, c_type, ...)                 \
+  [&]() -> bool {                                                                        \
+    switch (pytorch_dtype) {                                                             \
+      _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                            \
+      _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                           \
+      default:                                                                           \
+        std::ostringstream oss;                                                          \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                   \
+        return false;                                                                    \
+    }                                                                                    \
+  }()
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP8(pytorch_dtype, c_type, ...)                      \
+  [&]() -> bool {                                                                            \
+    switch (pytorch_dtype) {                                                                 \
+      _DISPATCH_CASE_FP8_E4M3(c_type, __VA_ARGS__)                                           \
+      _DISPATCH_CASE_FP8_E5M2(c_type, __VA_ARGS__)                                           \
+      default:                                                                               \
+        std::ostringstream oss;                                                              \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch fp8 data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                       \
+        return false;                                                                        \
+    }                                                                                        \
+  }()
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE(pytorch_dtype, c_type, ...)                      \
+  [&]() -> bool {                                                                        \
+    switch (pytorch_dtype) {                                                             \
+      _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                            \
+      _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                           \
+      _DISPATCH_CASE_FP8_E4M3(c_type, __VA_ARGS__)                                       \
+      _DISPATCH_CASE_FP8_E5M2(c_type, __VA_ARGS__)                                       \
+      default:                                                                           \
+        std::ostringstream oss;                                                          \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                   \
+        return false;                                                                    \
+    }                                                                                    \
+  }()
+
+#define _DISPATCH_SWITCH(var_name, cond, ...)                                           \
+  [&]() -> bool {                                                                       \
+    switch (cond) {                                                                     \
+      __VA_ARGS__                                                                       \
+      default:                                                                          \
+        std::ostringstream oss;                                                         \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch " var_name " " << int(cond); \
+        TORCH_CHECK(false, oss.str());                                                  \
+        return false;                                                                   \
+    }                                                                                   \
+  }()
+
+#define _DISPATCH_SWITCH_U16x2(var1_name, var2_name, cond1, cond2, ...)                                             \
+  [&]() -> bool {                                                                                                   \
+    switch (pack_u16(cond1, cond2)) {                                                                               \
+      __VA_ARGS__                                                                                                   \
+      default:                                                                                                      \
+        std::ostringstream oss;                                                                                     \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch (" var1_name ", " var2_name "): (" << int(cond1) << ", " \
+            << int(cond2) << ")";                                                                                   \
+        TORCH_CHECK(false, oss.str());                                                                              \
+        return false;                                                                                               \
+    }                                                                                                               \
+  }()
+
+#define _DISPATCH_CASE(case_expr, case_var, ...) \
+  case case_expr: {                              \
+    constexpr auto case_var = case_expr;         \
+    return __VA_ARGS__();                        \
+  }
+
+#define _DISPATCH_CASE_U16x2(case_expr1, case_expr2, case_var1, case_var2, ...) \
+  case pack_u16(case_expr1, case_expr2): {                                      \
+    constexpr auto case_var1 = case_expr1;                                      \
+    constexpr auto case_var2 = case_expr2;                                      \
+    return __VA_ARGS__();                                                       \
+  }
+
+#define DISPATCH_BOOL(expr, const_expr, ...) \
+  [&]() -> bool {                            \
+    if (expr) {                              \
+      constexpr bool const_expr = true;      \
+      return __VA_ARGS__();                  \
+    } else {                                 \
+      constexpr bool const_expr = false;     \
+      return __VA_ARGS__();                  \
+    }                                        \
+  }()
+
+inline void check_shape(const at::Tensor& a, const at::Tensor& b, const char* a_name, const char* b_name) {
+  TORCH_CHECK(a.dim() == b.dim(), a_name, ".dim() != ", b_name, ".dim(). ", a.dim(), " vs ", b.dim());
+  for (int i = 0; i < a.dim(); ++i) {
+    TORCH_CHECK(a.size(i) == b.size(i), a_name, ".size(", i, ") != ", b_name, ".size(", i, ")");
+  }
+}
+
+inline constexpr uint32_t pack_u16(uint16_t a, uint16_t b) {
+  return (uint32_t(a) << 16) | uint32_t(b);
+}
+
+#define CHECK_GQA_HEAD_DIVISIBLE(num_qo_heads, num_kv_heads) \
+  TORCH_CHECK(                                               \
+      num_qo_heads % num_kv_heads == 0,                      \
+      "num_qo_heads(",                                       \
+      num_qo_heads,                                          \
+      ") must be divisible by num_kv_heads(",                \
+      num_kv_heads,                                          \
+      ")")
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_LAST_DIM_CONTIGUOUS(x) \
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
+
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+#define CHECK_LAST_DIM_CONTIGUOUS_INPUT(x) \
+  CHECK_CUDA(x);                           \
+  CHECK_LAST_DIM_CONTIGUOUS(x)
+
+#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be a " #d "D tensor")
+
+#define CHECK_SHAPE(a, b) check_shape(a, b, #a, #b)
+
+#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), "CHECK_EQ(" #a ", " #b ") failed. ", a, " vs ", b)
+
+#define CHECK_GE(a, b) TORCH_CHECK((a) >= (b), "CHECK_GE(" #a ", " #b ") failed. ", a, " vs ", b)
+
+inline bool is_float8_tensor(const at::Tensor& tensor) {
+  return tensor.scalar_type() == at::ScalarType::Float8_e4m3fn || tensor.scalar_type() == at::ScalarType::Float8_e5m2;
+}
+#endif  // USE_ROCM
+
+struct cuda_error : public std::runtime_error {
+  /**
+   * @brief Constructs a `cuda_error` object with the given `message`.
+   *
+   * @param message The error char array used to construct `cuda_error`
+   */
+  cuda_error(const char* message) : std::runtime_error(message) {}
+  /**
+   * @brief Constructs a `cuda_error` object with the given `message` string.
+   *
+   * @param message The `std::string` used to construct `cuda_error`
+   */
+  cuda_error(std::string const& message) : cuda_error{message.c_str()} {}
+};
+
+#define CHECK_CUDA_SUCCESS(cmd)                                         \
+  do {                                                                  \
+    hipError_t e = cmd;                                                \
+    if (e != hipSuccess) {                                             \
+      std::stringstream _message;                                       \
+      auto s = hipGetErrorString(e);                                   \
+      _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__; \
+      throw cuda_error(_message.str());                                 \
+    }                                                                   \
+  } while (0)
+
+#define CHECK_IS_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_IS_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_CUDA_INPUT(x) \
+  CHECK_IS_CUDA(x);         \
+  CHECK_IS_CONTIGUOUS(x)
+
+inline int getSMVersion() {
+  int device{-1};
+  CHECK_CUDA_SUCCESS(hipGetDevice(&device));
+  int sm_major = 0;
+  int sm_minor = 0;
+  CHECK_CUDA_SUCCESS(hipDeviceGetAttribute(&sm_major, hipDeviceAttributeComputeCapabilityMajor, device));
+  CHECK_CUDA_SUCCESS(hipDeviceGetAttribute(&sm_minor, hipDeviceAttributeComputeCapabilityMinor, device));
+  return sm_major * 10 + sm_minor;
+}
+
+inline bool isDeviceType(const std::string& device_type) {
+  int deviceCount;
+  CHECK_CUDA_SUCCESS(hipGetDeviceCount(&deviceCount));
+
+  int device_id = -1;
+  if (deviceCount >= 1) {
+    CHECK_CUDA_SUCCESS(hipGetDevice(&device_id));
+  } else {
+    return false;
+  }
+
+  hipDeviceProp_t prop;
+  CHECK_CUDA_SUCCESS(hipGetDeviceProperties(&prop, device_id));
+  if (device_type == std::string(prop.name)) {
+    return true;
+  }
+  return false;
+}
+
+inline bool getBoolEnv(char const* name) {
+  char const* env = std::getenv(name);
+  return env && env[0] == '1' && env[1] == '\0';
+}
+
+inline bool getEnvEnablePDL() {
+  static std::once_flag flag;
+  static bool enablePDL = false;
+  std::call_once(flag, [&]() {
+    if (getSMVersion() >= 90) {
+      // PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1`
+      enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL");
+    }
+  });
+  return enablePDL;
+}
+
+// SGLANG_SHFL_XOR_* adapted from https://github.com/vllm-project/vllm/blob/v0.7.3/csrc/cuda_compat.h#L19-L28
+#ifndef USE_ROCM
+#define SGLANG_SHFL_XOR_SYNC(mask, var, lane_mask) __shfl_xor_sync((mask), (var), (lane_mask))
+#define SGLANG_SHFL_XOR_SYNC_WIDTH(mask, var, lane_mask, width) __shfl_xor_sync((mask), (var), (lane_mask), (width))
+#else
+#define SGLANG_SHFL_XOR_SYNC(mask, var, lane_mask) __shfl_xor((var), (lane_mask))
+#define SGLANG_SHFL_XOR_SYNC_WIDTH(mask, var, lane_mask, width) __shfl_xor((var), (lane_mask), (width))
+#endif
+
+#define DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(pytorch_dtype, c_type, ...)           \
+  [&]() -> bool {                                                                        \
+    switch (pytorch_dtype) {                                                             \
+      case at::ScalarType::Float: {                                                      \
+        using c_type = float;                                                            \
+        return __VA_ARGS__();                                                            \
+      }                                                                                  \
+        _DISPATCH_CASE_F16(c_type, __VA_ARGS__)                                          \
+        _DISPATCH_CASE_BF16(c_type, __VA_ARGS__)                                         \
+      default:                                                                           \
+        std::ostringstream oss;                                                          \
+        oss << __PRETTY_FUNCTION__ << " failed to dispatch data type " << pytorch_dtype; \
+        TORCH_CHECK(false, oss.str());                                                   \
+        return false;                                                                    \
+    }                                                                                    \
+  }()
+
+#define DISPATCH_CASE_INTEGRAL_TYPES(...)              \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
+
+#define DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
+
+#ifndef USE_ROCM
+#define WARP_SIZE 32
+#else
+#if defined(__GFX9__) || !defined(__HIP_DEVICE_COMPILE__)
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
+#endif
+
+#ifdef USE_ROCM
+
+#include "hip/hip_math_def.h"
+#include "hip/hip_vec_dtypes.h"
+
+#else
+
+template <typename srcDtype>
+__device__ __forceinline__ float castToFloat(srcDtype val) {
+  return static_cast<srcDtype>(val);
+}
+
+template <typename dstDtype>
+__device__ __forceinline__ dstDtype castFromFloat(float val) {
+  return static_cast<dstDtype>(val);
+}
+
+#endif
+
+// add FP8 support
+// #ifndef USE_ROCM
+// #include <c10/util/Float8_e4m3fn.h>
+// using FP8_TYPE = c10::Float8_e4m3fn;
+// C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<FP8_TYPE>::max();
+// #else  // USE_ROCM
+// #if HIP_FP8_TYPE_FNUZ
+// #include <c10/util/Float8_e4m3fnuz.h>
+// using FP8_TYPE = c10::Float8_e4m3fnuz;
+// constexpr auto FP8_E4M3_MAX = 224.0f;
+// #else
+// #if HIP_FP8_TYPE_E4M3
+// #include <c10/util/Float8_e4m3fn.h>
+// using FP8_TYPE = c10::Float8_e4m3fn;
+// C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<FP8_TYPE>::max();
+// #else
+// #error "fp8 is not supported in this processor (arch < gfx942)."
+// #endif  // HIP_FP8_TYPE_E4M3
+// #endif  // HIP_FP8_TYPE_FNUZ
+// #endif  // USE_ROCM
+
+#define FULL_MASK 0xffffffff
+
+__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
+#ifndef USE_ROCM
+  float old;
+  old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value)))
+                     : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+  return old;
+#else
+  int* addr_as_i = (int*)addr;
+  int old = *addr_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(addr_as_i, assumed, __float_as_int(fmaxf(value, __int_as_float(assumed))));
+  } while (assumed != old);
+  return __int_as_float(old);
+#endif
+}
+
+__device__ __forceinline__ float warpReduceMax(float value) {
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 16));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 8));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 4));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 2));
+  value = fmaxf(value, __shfl_xor_sync(FULL_MASK, value, 1));
+  return value;
+}
+
+__device__ __forceinline__ float blockReduceMax(float value) {
+  static __shared__ float warpLevelMaxs[WARP_SIZE];
+  const int laneId = threadIdx.x % WARP_SIZE;
+  const int warpId = threadIdx.x / WARP_SIZE;
+
+  value = warpReduceMax(value);
+
+  if (laneId == 0) warpLevelMaxs[warpId] = value;
+  __syncthreads();
+
+  value = (threadIdx.x < blockDim.x / WARP_SIZE) ? warpLevelMaxs[laneId] : 0;
+  if (warpId == 0) value = warpReduceMax(value);
+
+  return value;
+}
+
+// Pads to a multiple of `alignment` rows.
+inline torch::Tensor pad_tensor(const torch::Tensor& tensor, int64_t alignment = 4, bool is_column_major = false) {
+  int64_t rows = tensor.size(0);
+  int64_t cols = tensor.size(1);
+  int64_t pad_rows = (alignment - (rows % alignment)) % alignment;  // Compute padding size
+
+  if (pad_rows == 0) {
+    return tensor;  // Already aligned
+  }
+
+  torch::Tensor padding = torch::zeros({pad_rows, cols}, tensor.options());
+  torch::Tensor tensor_padded = torch::cat({tensor, padding}, 0);  // Pad along rows
+
+  // Ensure column-major layout
+  if (is_column_major) {
+    return tensor_padded.t().contiguous().t();
+  }
+  return tensor_padded;
+}
+
+// Get the next power of 2 of a number
+inline uint32_t next_pow2(uint32_t x) noexcept {
+  if (x <= 1) return 1;
+  return 1u << (32 - __builtin_clz(x - 1));
+}
diff --git a/sgl-kernel/pyproject.toml b/sgl-kernel/pyproject.toml
new file mode 100644
index 000000000..778001ecb
--- /dev/null
+++ b/sgl-kernel/pyproject.toml
@@ -0,0 +1,39 @@
+[build-system]
+requires = [
+  "scikit-build-core>=0.10",
+  "torch>=2.8.0",
+  "wheel",
+]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "sgl-kernel"
+version = "0.3.9.post2"
+description = "Kernel Library for SGLang"
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+classifiers = [
+  "Programming Language :: Python :: 3",
+  "License :: OSI Approved :: Apache Software License",
+  "Environment :: GPU :: NVIDIA CUDA"
+]
+dependencies = []
+
+[project.urls]
+"Homepage" = "https://github.com/sgl-project/sglang/tree/main/sgl-kernel"
+"Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
+
+[tool.wheel]
+exclude = [
+  "dist*",
+  "tests*",
+]
+
+[tool.scikit-build]
+cmake.build-type = "Release"
+minimum-version = "build-system.requires"
+
+wheel.py-api = "cp310"
+wheel.license-files = []
+wheel.packages = ["python/sgl_kernel"]
diff --git a/sgl-kernel/pyproject_cpu.toml b/sgl-kernel/pyproject_cpu.toml
new file mode 100644
index 000000000..79ae096e8
--- /dev/null
+++ b/sgl-kernel/pyproject_cpu.toml
@@ -0,0 +1,36 @@
+[build-system]
+requires = [
+  "scikit-build-core>=0.10",
+  "torch>=2.7.1",
+  "wheel",
+]
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "sgl-kernel"
+version = "0.3.9.post2"
+description = "Kernel Library for SGLang"
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+classifiers = [
+  "Programming Language :: Python :: 3",
+  "License :: OSI Approved :: Apache Software License",
+  "Environment :: CPU"
+]
+dependencies = []
+
+[project.urls]
+"Homepage" = "https://github.com/sgl-project/sglang/tree/main/sgl-kernel"
+"Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
+
+[tool.wheel]
+exclude = [
+  "dist*",
+  "tests*",
+]
+
+[tool.scikit-build]
+cmake.source-dir = "csrc/cpu"
+cmake.build-type = "Release"
+minimum-version = "build-system.requires"
diff --git a/sgl-kernel/pyproject_rocm.toml b/sgl-kernel/pyproject_rocm.toml
new file mode 100644
index 000000000..4c787f885
--- /dev/null
+++ b/sgl-kernel/pyproject_rocm.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = [
+  "setuptools>=75.0",
+  "scikit-build-core>=0.10",
+  "torch>=2.8.0",
+  "wheel",
+]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sgl-kernel"
+version = "0.3.9.post2"
+description = "Kernel Library for SGLang"
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+classifiers = [
+  "Programming Language :: Python :: 3",
+  "License :: OSI Approved :: Apache Software License",
+  "Environment :: GPU :: NVIDIA CUDA"
+]
+dependencies = []
+
+[project.urls]
+"Homepage" = "https://github.com/sgl-project/sglang/tree/main/sgl-kernel"
+"Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
+
+[tool.wheel]
+exclude = [
+  "dist*",
+  "tests*",
+]
diff --git a/sgl-kernel/python/sgl_kernel/__init__.py b/sgl-kernel/python/sgl_kernel/__init__.py
new file mode 100755
index 000000000..76c87d30b
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/__init__.py
@@ -0,0 +1,117 @@
+import ctypes
+import os
+import platform
+
+import torch
+
+SYSTEM_ARCH = platform.machine()
+
+cuda_path = f"/usr/local/cuda/targets/{SYSTEM_ARCH}-linux/lib/libcudart.so.12"
+if os.path.exists(cuda_path):
+    ctypes.CDLL(cuda_path, mode=ctypes.RTLD_GLOBAL)
+
+from sgl_kernel import common_ops
+from sgl_kernel.allreduce import *
+from sgl_kernel.attention import (
+    cutlass_mla_decode,
+    cutlass_mla_get_workspace_size,
+    lightning_attention_decode,
+    merge_state,
+    merge_state_v2,
+)
+from sgl_kernel.cutlass_moe import cutlass_w4a8_moe_mm, get_cutlass_w4a8_moe_mm_data
+from sgl_kernel.elementwise import (
+    FusedSetKVBufferArg,
+    apply_rope_with_cos_sin_cache_inplace,
+    concat_mla_k,
+    copy_to_gpu_no_ce,
+    downcast_fp8,
+    fused_add_rmsnorm,
+    gelu_and_mul,
+    gelu_tanh_and_mul,
+    gemma_fused_add_rmsnorm,
+    gemma_rmsnorm,
+    rmsnorm,
+    silu_and_mul,
+)
+from sgl_kernel.mamba import causal_conv1d_fwd, causal_conv1d_update
+
+if torch.version.hip is not None:
+    from sgl_kernel.elementwise import gelu_quick
+
+from sgl_kernel.fused_moe import fused_marlin_moe
+from sgl_kernel.gemm import (
+    awq_dequantize,
+    bmm_fp8,
+    cutlass_scaled_fp4_mm,
+    dsv3_fused_a_gemm,
+    dsv3_router_gemm,
+    fp8_blockwise_scaled_mm,
+    fp8_scaled_mm,
+    gptq_gemm,
+    gptq_marlin_gemm,
+    gptq_shuffle,
+    int8_scaled_mm,
+    qserve_w4a8_per_chn_gemm,
+    qserve_w4a8_per_group_gemm,
+    scaled_fp4_experts_quant,
+    scaled_fp4_grouped_quant,
+    scaled_fp4_quant,
+    sgl_per_tensor_quant_fp8,
+    sgl_per_token_group_quant_fp8,
+    sgl_per_token_group_quant_int8,
+    sgl_per_token_quant_fp8,
+    shuffle_rows,
+    silu_and_mul_scaled_fp4_grouped_quant,
+)
+from sgl_kernel.grammar import apply_token_bitmask_inplace_cuda
+from sgl_kernel.kvcacheio import (
+    transfer_kv_all_layer,
+    transfer_kv_all_layer_mla,
+    transfer_kv_per_layer,
+    transfer_kv_per_layer_mla,
+)
+from sgl_kernel.marlin import (
+    awq_marlin_moe_repack,
+    awq_marlin_repack,
+    gptq_marlin_repack,
+)
+from sgl_kernel.memory import set_kv_buffer_kernel
+from sgl_kernel.moe import (
+    apply_shuffle_mul_sum,
+    cutlass_fp4_group_mm,
+    fp8_blockwise_scaled_grouped_mm,
+    moe_align_block_size,
+    moe_fused_gate,
+    prepare_moe_input,
+    topk_softmax,
+)
+from sgl_kernel.sampling import (
+    min_p_sampling_from_probs,
+    top_k_mask_logits,
+    top_k_renorm_prob,
+    top_k_top_p_sampling_from_logits,
+    top_k_top_p_sampling_from_probs,
+    top_p_renorm_prob,
+    top_p_sampling_from_probs,
+)
+from sgl_kernel.speculative import (
+    build_tree_kernel_efficient,
+    segment_packbits,
+    tree_speculative_sampling_target_only,
+    verify_tree_greedy,
+)
+from sgl_kernel.top_k import fast_topk
+from sgl_kernel.version import __version__
+
+
+def create_greenctx_stream_by_value(*args, **kwargs):
+    from sgl_kernel.spatial import create_greenctx_stream_by_value as _impl
+
+    return _impl(*args, **kwargs)
+
+
+def get_sm_available(*args, **kwargs):
+    from sgl_kernel.spatial import get_sm_available as _impl
+
+    return _impl(*args, **kwargs)
diff --git a/sgl-kernel/python/sgl_kernel/_fa4_interface.py b/sgl-kernel/python/sgl_kernel/_fa4_interface.py
new file mode 100644
index 000000000..512b0aaef
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/_fa4_interface.py
@@ -0,0 +1,376 @@
+# Adapted from https://github.com/Dao-AILab/flash-attention/blob/203b9b3dba39d5d08dffb49c09aa622984dff07d/flash_attn/cute/interface.py
+
+# Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+# [2025-07-04] Version in Cute-DSL, for Hopper and Blackwell. You'd need to install nvidia-cutlass-dsl==4.1.0.
+
+
+import math
+from typing import Optional, Tuple
+
+import cuda.bindings.driver as cuda
+import cutlass
+import cutlass.cute as cute
+import torch
+from cutlass.cute.runtime import from_dlpack
+from flash_attn.cute.flash_fwd import FlashAttentionForwardSm90
+from flash_attn.cute.flash_fwd_sm100 import FlashAttentionForwardSm100
+
+
+def maybe_contiguous(x):
+    return x.contiguous() if x is not None and x.stride(-1) != 1 else x
+
+
+torch2cute_dtype_map = {
+    torch.float16: cutlass.Float16,
+    torch.bfloat16: cutlass.BFloat16,
+    torch.float32: cutlass.Float32,
+}
+
+
+def _flash_attn_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: Optional[torch.Tensor] = None,
+    cu_seqlens_k: Optional[torch.Tensor] = None,
+    seqused_q: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    page_table: Optional[torch.Tensor] = None,
+    softmax_scale: Optional[float] = None,
+    causal: bool = False,
+    softcap: Optional[float] = None,
+    window_size_left: Optional[int] = None,
+    window_size_right: Optional[int] = None,
+    learnable_sink: Optional[torch.Tensor] = None,
+    # m_block_size: int = 128,
+    # n_block_size: int = 64,
+    # num_threads: int = 128,
+    m_block_size: int = 128,
+    n_block_size: int = 128,
+    num_threads: int = 384,
+    pack_gqa: Optional[bool] = None,
+    _compute_capability: Optional[int] = None,
+    return_softmax_lse: Optional[bool] = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    q, k, v = [maybe_contiguous(t) for t in (q, k, v)]
+    num_head, head_dim = q.shape[-2:]
+    if cu_seqlens_q is None:
+        batch_size, seqlen_q = q.shape[:2]
+        total_q = batch_size * seqlen_q
+    else:
+        batch_size = cu_seqlens_q.shape[0] - 1
+        seqlen_q = None
+        total_q = q.shape[0]
+    if page_table is not None:
+        assert cu_seqlens_k is None, "page_table is not supported with cu_seqlens_k"
+        assert page_table.dtype == torch.int32, "page_table must be int32"
+        assert (
+            page_table.stride(-1) == 1
+        ), "page_table must be contiguous in the last dimension"
+        max_num_pages_per_seq = page_table.shape[1]
+        assert page_table.shape == (batch_size, max_num_pages_per_seq)
+        num_pages, page_size = k.shape[:2]
+        seqlen_k = num_pages * page_size
+    else:
+        num_pages, page_size = None, None
+        seqlen_k = k.shape[-3]
+    num_head_kv = k.shape[-2]
+    head_dim_v = v.shape[-1]
+    if cu_seqlens_k is None:
+        if page_table is None:
+            assert k.shape == (batch_size, seqlen_k, num_head_kv, head_dim)
+            assert v.shape == (batch_size, seqlen_k, num_head_kv, head_dim_v)
+        else:
+            assert k.shape == (num_pages, page_size, num_head_kv, head_dim)
+            assert v.shape == (num_pages, page_size, num_head_kv, head_dim_v)
+    else:
+        assert k.shape == (seqlen_k, num_head_kv, head_dim)
+        assert v.shape == (seqlen_k, num_head_kv, head_dim_v)
+        assert cu_seqlens_k.shape == (
+            batch_size + 1,
+        ), "cu_seqlens_k must have shape (batch_size + 1,)"
+    if cu_seqlens_q is not None:
+        assert cu_seqlens_q.shape == (
+            batch_size + 1,
+        ), "cu_seqlens_q must have shape (batch_size + 1,)"
+    assert seqused_q is None or seqused_q.shape == (
+        batch_size,
+    ), "seqused_q must have shape (batch_size,)"
+    assert seqused_k is None or seqused_k.shape == (
+        batch_size,
+    ), "seqused_k must have shape (batch_size,)"
+    assert q.dtype in [
+        torch.float16,
+        torch.bfloat16,
+    ], "inputs must be float16 or bfloat16"
+    assert q.dtype == k.dtype == v.dtype, "inputs must have the same dtype"
+    for t in [cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k]:
+        if t is not None:
+            assert (
+                t.dtype == torch.int32
+            ), "cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be int32"
+            assert (
+                t.stride(0) == 1
+            ), "cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k must be contiguous"
+    if learnable_sink is not None:
+        assert learnable_sink.shape == (num_head,)
+        assert learnable_sink.dtype == torch.bfloat16, "learnable_sink must be bfloat16"
+    assert all(
+        t is None or t.is_cuda
+        for t in (
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            seqused_q,
+            seqused_k,
+            page_table,
+            learnable_sink,
+        )
+    ), "inputs must be on CUDA device"
+    assert num_head % num_head_kv == 0, "num_head must be divisible by num_head_kv"
+    assert head_dim <= 256, "head_dim must be less than or equal to 256"
+    alignment = 16 // q.element_size()
+    assert head_dim % alignment == 0, f"head_dim must be divisible by {alignment}"
+    assert head_dim_v % alignment == 0, f"head_dim_v must be divisible by {alignment}"
+    if softmax_scale is None:
+        softmax_scale = 1.0 / math.sqrt(head_dim)
+    if softcap == 0.0:
+        softcap = None
+    qhead_per_kvhead = num_head // num_head_kv
+    if pack_gqa is None:
+        pack_gqa = qhead_per_kvhead > 1
+
+    out_torch_dtype = q.dtype
+    device = q.device
+    q_batch_seqlen_shape = (
+        (batch_size, seqlen_q) if cu_seqlens_q is None else (total_q,)
+    )
+    out = torch.empty(
+        *q_batch_seqlen_shape,
+        num_head,
+        head_dim_v,
+        dtype=out_torch_dtype,
+        device=device,
+    )
+    lse_shape = (
+        (batch_size, num_head, seqlen_q)
+        if cu_seqlens_q is None
+        else (num_head, total_q)
+    )
+    lse = (
+        torch.empty(lse_shape, dtype=torch.float32, device=device)
+        if return_softmax_lse
+        else None
+    )
+
+    dtype = torch2cute_dtype_map[q.dtype]
+    q_tensor, k_tensor, v_tensor, o_tensor = [
+        from_dlpack(t.detach(), assumed_align=16).mark_layout_dynamic(
+            leading_dim=t.ndim - 1
+        )
+        for t in (q, k, v, out)
+    ]
+    lse_tensor = (
+        from_dlpack(lse.detach(), assumed_align=4).mark_layout_dynamic(
+            leading_dim=lse.ndim - 1
+        )
+        if lse is not None
+        else None
+    )
+    (
+        cu_seqlens_q_tensor,
+        cu_seqlens_k_tensor,
+        seqused_q_tensor,
+        seqused_k_tensor,
+        learnable_sink_tensor,
+    ) = [
+        (
+            from_dlpack(t.detach(), assumed_align=4).mark_layout_dynamic(leading_dim=0)
+            if t is not None
+            else None
+        )
+        for t in (cu_seqlens_q, cu_seqlens_k, seqused_q, seqused_k, learnable_sink)
+    ]
+    page_table_tensor = (
+        from_dlpack(page_table.detach(), assumed_align=4).mark_layout_dynamic(
+            leading_dim=1
+        )
+        if page_table is not None
+        else None
+    )
+    if causal:
+        window_size_right = 0
+    local = window_size_left is not None or window_size_right is not None
+    if window_size_left is not None or window_size_right is not None:
+        if window_size_left is None and window_size_right == 0:
+            causal, local = True, False
+        else:
+            causal, local = False, True
+    compute_capability = (
+        torch.cuda.get_device_capability()[0]
+        if _compute_capability is None
+        else _compute_capability
+    )
+    assert compute_capability in [
+        9,
+        10,
+    ], "Unsupported compute capability. Supported: 9.x, 10.x"
+    current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+    if compute_capability == 9:  # TODO: tune block size according to hdim
+        if head_dim == head_dim_v == 128 and not causal and not local:
+            n_block_size = 192
+    if compute_capability == 10:
+        # TODO: fix the varlen case
+        if (
+            pack_gqa
+            and (128 % qhead_per_kvhead != 0)
+            or (cu_seqlens_q is not None or seqused_q is not None)
+        ):
+            pack_gqa = False
+
+    compile_key = (
+        dtype,
+        head_dim,
+        head_dim_v,
+        qhead_per_kvhead,
+        causal,
+        softcap is not None,
+        lse is None,
+        cu_seqlens_q is None,
+        cu_seqlens_k is None,
+        seqused_q is None,
+        seqused_k is None,
+        page_table is not None,
+        window_size_left is not None,
+        window_size_right is not None,
+        learnable_sink is not None,
+        m_block_size,
+        n_block_size,
+        num_threads,
+        pack_gqa,
+        compute_capability,
+    )
+    if compile_key not in _flash_attn_fwd.compile_cache:
+        if compute_capability == 9:
+            assert page_table is None, "paged KV not supported on SM 9.0"
+            # fa_fwd = FlashAttentionForwardSm80(
+            fa_fwd = FlashAttentionForwardSm90(
+                dtype,
+                head_dim,
+                head_dim_v,
+                qhead_per_kvhead,
+                is_causal=causal,
+                is_local=local,
+                pack_gqa=pack_gqa,
+                m_block_size=m_block_size,
+                n_block_size=n_block_size,
+                # num_stages=1,
+                num_stages=2,
+                num_threads=num_threads,
+                Q_in_regs=False,
+            )
+        elif compute_capability == 10:
+            assert page_size in [
+                None,
+                128,
+            ], "Only page_size=128 is supported for paged KV on SM 10.0"
+            fa_fwd = FlashAttentionForwardSm100(
+                head_dim,
+                head_dim_v,
+                qhead_per_kvhead=qhead_per_kvhead,
+                is_causal=causal,
+                is_local=local,
+                pack_gqa=pack_gqa,
+                is_persistent=not causal
+                and not local
+                and cu_seqlens_q is None
+                and seqused_q is None,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported compute capability: {compute_capability}. Supported: 9.x, 10.x"
+            )
+        # TODO: check @can_implement
+        _flash_attn_fwd.compile_cache[compile_key] = cute.compile(
+            fa_fwd,
+            q_tensor,
+            k_tensor,
+            v_tensor,
+            o_tensor,
+            lse_tensor,
+            softmax_scale,
+            current_stream,
+            cu_seqlens_q_tensor,
+            cu_seqlens_k_tensor,
+            seqused_q_tensor,
+            seqused_k_tensor,
+            page_table_tensor,
+            softcap,
+            window_size_left,
+            window_size_right,
+            learnable_sink_tensor,
+        )
+    _flash_attn_fwd.compile_cache[compile_key](
+        q_tensor,
+        k_tensor,
+        v_tensor,
+        o_tensor,
+        lse_tensor,
+        softmax_scale,
+        current_stream,
+        cu_seqlens_q_tensor,
+        cu_seqlens_k_tensor,
+        seqused_q_tensor,
+        seqused_k_tensor,
+        page_table_tensor,
+        softcap,
+        window_size_left,
+        window_size_right,
+        learnable_sink_tensor,
+    )
+    return out, lse
+
+
+_flash_attn_fwd.compile_cache = {}
+
+
+def flash_attn_varlen_func(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: Optional[torch.Tensor] = None,
+    cu_seqlens_k: Optional[torch.Tensor] = None,
+    seqused_q: Optional[torch.Tensor] = None,
+    seqused_k: Optional[torch.Tensor] = None,
+    page_table: Optional[torch.Tensor] = None,
+    softmax_scale: Optional[float] = None,
+    causal: bool = False,
+    window_size: Tuple[Optional[int], Optional[int]] = (None, None),
+    learnable_sink: Optional[torch.Tensor] = None,
+    softcap: float = 0.0,
+    pack_gqa: Optional[bool] = None,
+    return_softmax_lse: Optional[bool] = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    out, lse = _flash_attn_fwd(
+        q,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        seqused_q,
+        seqused_k,
+        page_table=page_table,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size_left=window_size[0],
+        window_size_right=window_size[1],
+        learnable_sink=learnable_sink,
+        softcap=softcap,
+        pack_gqa=pack_gqa,
+        return_softmax_lse=return_softmax_lse,
+    )
+
+    return (out, lse) if return_softmax_lse else out
diff --git a/sgl-kernel/python/sgl_kernel/allreduce.py b/sgl-kernel/python/sgl_kernel/allreduce.py
new file mode 100644
index 000000000..544fc1d77
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/allreduce.py
@@ -0,0 +1,173 @@
+from typing import List, Optional, Tuple
+
+import torch
+
+if torch.version.hip is not None:
+    # ROCM custom allreduce
+    def init_custom_ar(
+        meta: torch.Tensor,
+        rank_data: torch.Tensor,
+        handles: List[str],
+        offsets: List[int],
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return torch.ops.sgl_kernel.init_custom_ar.default(
+            meta, rank_data, handles, offsets, rank, full_nvlink
+        )
+
+    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        torch.ops.sgl_kernel.all_reduce_reg.default(fa, inp, out)
+
+    def all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        torch.ops.sgl_kernel.all_reduce_unreg.default(fa, inp, reg_buffer, out)
+
+    def dispose(fa: int) -> None:
+        torch.ops.sgl_kernel.dispose.default(fa)
+
+    def meta_size() -> int:
+        return torch.ops.sgl_kernel.meta_size.default()
+
+    def register_buffer(
+        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+    ) -> None:
+        return torch.ops.sgl_kernel.register_buffer.default(fa, t, handles, offsets)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+        return torch.ops.sgl_kernel.get_graph_buffer_ipc_meta.default(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[str], offsets: List[List[int]]
+    ) -> None:
+        torch.ops.sgl_kernel.register_graph_buffers.default(fa, handles, offsets)
+
+    def allocate_meta_buffer(size: int) -> torch.Tensor:
+        return torch.ops.sgl_kernel.allocate_meta_buffer.default(size)
+
+    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+        return torch.ops.sgl_kernel.get_meta_buffer_ipc_handle.default(inp)
+
+    # ROCM quick allreduce
+    def init_custom_qr(
+        rank: int, world_size: int, qr_max_size: Optional[int] = None
+    ) -> int:
+        return torch.ops.sgl_kernel.init_custom_qr.default(
+            world_size, rank, qr_max_size
+        )
+
+    def qr_get_handle(fa: int) -> torch.Tensor:
+        return torch.ops.sgl_kernel.qr_get_handle.default(fa)
+
+    def qr_open_handles(fa: int, handles: list[torch.Tensor]) -> None:
+        torch.ops.sgl_kernel.qr_open_handles.default(fa, handles)
+
+    def qr_all_reduce(
+        fa: int,
+        profile: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        cast_bf162half: bool,
+    ) -> None:
+        torch.ops.sgl_kernel.qr_all_reduce.default(
+            fa, profile, inp, out, cast_bf162half
+        )
+
+    def qr_destroy(fa: int) -> None:
+        torch.ops.sgl_kernel.qr_destroy.default(fa)
+
+    def qr_max_size() -> int:
+        return torch.ops.sgl_kernel.qr_max_size.default()
+
+    # mscclpp
+    def mscclpp_generate_unique_id() -> bytes:
+        raise NotImplementedError()
+
+    def mscclpp_init_context(
+        unique_id: bytes,
+        rank: int,
+        world_size: int,
+        scratch: torch.Tensor,
+        put_buffer: torch.Tensor,
+        nranks_per_node: int,
+        rank_to_node: List[int],
+        rank_to_ib: List[int],
+        context_selection: int,
+    ) -> int:
+        raise NotImplementedError()
+
+    def mscclpp_allreduce(
+        context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+    ) -> None:
+        raise NotImplementedError()
+
+else:
+
+    def init_custom_ar(
+        ipc_tensors: List[int], rank_data: torch.Tensor, rank: int, full_nvlink: bool
+    ) -> int:
+        return torch.ops.sgl_kernel.init_custom_ar.default(
+            ipc_tensors, rank_data, rank, full_nvlink
+        )
+
+    def dispose(fa: int) -> None:
+        torch.ops.sgl_kernel.dispose.default(fa)
+
+    def all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        reg_buffer: int,
+        reg_buffer_sz_bytes: int,
+    ) -> None:
+        torch.ops.sgl_kernel.all_reduce.default(
+            fa, inp, out, reg_buffer, reg_buffer_sz_bytes
+        )
+
+    def get_graph_buffer_ipc_meta(fa) -> Tuple[List[int], List[int]]:
+        return torch.ops.sgl_kernel.get_graph_buffer_ipc_meta.default(fa)
+
+    def register_buffer(fa: int, fake_ipc_ptrs: List[int]) -> None:
+        return torch.ops.sgl_kernel.register_buffer.default(fa, fake_ipc_ptrs)
+
+    def register_graph_buffers(
+        fa: int, handles: List[List[int]], offsets: List[List[int]]
+    ) -> None:
+        torch.ops.sgl_kernel.register_graph_buffers.default(fa, handles, offsets)
+
+    def meta_size() -> int:
+        return torch.ops.sgl_kernel.meta_size.default()
+
+    def mscclpp_generate_unique_id() -> torch.Tensor:
+        return torch.ops.sgl_kernel.mscclpp_generate_unique_id.default()
+
+    def mscclpp_init_context(
+        unique_id: torch.Tensor,
+        rank: int,
+        world_size: int,
+        scratch: torch.Tensor,
+        put_buffer: torch.Tensor,
+        nranks_per_node: int,
+        rank_to_node: List[int],
+        rank_to_ib: List[int],
+        context_selection: int,
+    ) -> int:
+        return torch.ops.sgl_kernel.mscclpp_init_context.default(
+            unique_id,
+            rank,
+            world_size,
+            scratch,
+            put_buffer,
+            nranks_per_node,
+            rank_to_node,
+            rank_to_ib,
+            context_selection,
+        )
+
+    def mscclpp_allreduce(
+        context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+    ) -> None:
+        torch.ops.sgl_kernel.mscclpp_allreduce.default(
+            context, inp, out, nthreads, nblocks
+        )
diff --git a/sgl-kernel/python/sgl_kernel/attention.py b/sgl-kernel/python/sgl_kernel/attention.py
new file mode 100644
index 000000000..f15b4fa24
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/attention.py
@@ -0,0 +1,138 @@
+from typing import Optional, Tuple
+
+import torch
+
+
+def lightning_attention_decode(q, k, v, past_kv, slope, output, new_kv):
+    torch.ops.sgl_kernel.lightning_attention_decode.default(
+        q, k, v, past_kv, slope, output, new_kv
+    )
+
+
+def merge_state(
+    v_a: torch.Tensor,
+    s_a: torch.Tensor,
+    v_b: torch.Tensor,
+    s_b: torch.Tensor,
+    v_merged: Optional[torch.Tensor] = None,
+    s_merged: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    s_a = s_a.to(torch.float32)
+    s_b = s_b.to(torch.float32)
+    # Avoid creating new tensors if they are already provided
+    if v_merged is None:
+        v_merged = torch.empty_like(v_a)
+    if s_merged is None:
+        s_merged = torch.empty_like(s_a)
+    torch.ops.sgl_kernel.merge_state.default(v_a, s_a, v_b, s_b, v_merged, s_merged)
+    return v_merged, s_merged
+
+
+def merge_state_v2(
+    v_a: torch.Tensor,
+    s_a: torch.Tensor,
+    v_b: torch.Tensor,
+    s_b: torch.Tensor,
+    v_merged: Optional[torch.Tensor] = None,
+    s_merged: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    s_a = s_a.to(torch.float32)
+    s_b = s_b.to(torch.float32)
+    # TODO(DefTruth): Currently, the custom merge_attn_states kernel
+    # does not support the FP8 data type and non - CUDA devices.
+    # It may be necessary to fall back to using the Triton kernel.
+
+    # Avoid creating new tensors if they are already provided
+    if v_merged is None:
+        v_merged = torch.empty_like(v_a)
+    if s_merged is None:
+        s_merged = torch.empty_like(s_a)
+    torch.ops.sgl_kernel.merge_state_v2.default(v_a, s_a, v_b, s_b, v_merged, s_merged)
+    return v_merged, s_merged
+
+
+def cutlass_mla_decode(
+    q_nope: torch.Tensor,
+    q_pe: torch.Tensor,
+    kv_c_and_k_pe_cache: torch.Tensor,
+    seq_lens: torch.Tensor,
+    page_table: torch.Tensor,
+    workspace: torch.Tensor,
+    sm_scale: float,
+    num_kv_splits: int = 1,  # Set to 1 to avoid cuda_graph issue by default.
+) -> torch.Tensor:
+    assert q_nope.ndim == 3, f"q_nope must be a 3D tensor, but got {q_nope.ndim}"
+    assert q_pe.ndim == 3, f"q_pe must be a 3D tensor, but got {q_pe.ndim}"
+    assert (
+        kv_c_and_k_pe_cache.ndim == 3
+    ), f"kv_c_and_k_pe_cache must be a 3D tensor, but got {kv_c_and_k_pe_cache.ndim}"
+
+    B_q, H, D_q_nope = q_nope.shape
+    B_q_2, H_2, D_q_pe = q_pe.shape
+    assert (B_q == B_q_2) and (H == H_2)
+
+    _, PAGE_SIZE, D_ckv = kv_c_and_k_pe_cache.shape
+
+    D_latent = 512
+    D_rope = 64
+    assert D_q_nope == D_latent
+    assert D_q_pe == D_rope
+    assert D_ckv == D_latent + D_rope
+
+    MAX_HEADS = 128
+    assert H <= MAX_HEADS, f"H must be <= {MAX_HEADS}, but got {H}"
+    if H < MAX_HEADS:
+        q_nope_padded = q_nope.new_empty((B_q, MAX_HEADS, D_q_nope))
+        q_nope_padded[:, :H] = q_nope
+        q_nope = q_nope_padded
+
+        q_pe_padded = q_pe.new_empty((B_q, MAX_HEADS, D_q_pe))
+        q_pe_padded[:, :H] = q_pe
+        q_pe = q_pe_padded
+
+    assert len(page_table.shape) == 2
+    B_block_table, block_num = page_table.shape
+    assert B_block_table == B_q
+    assert block_num > 0, f"block num must be greater than 0, got {block_num}"
+    assert block_num % (128 / PAGE_SIZE) == 0
+
+    # TODO(kaixih@nvidia): support fp8
+    assert q_nope.dtype in (
+        torch.float16,
+        torch.bfloat16,
+    ), f"q_nope.dtype needs to be fp16 or bf16 but got {q_nope.dtype}."
+    assert q_nope.dtype == q_pe.dtype == kv_c_and_k_pe_cache.dtype
+    assert (
+        seq_lens.dtype == torch.int32
+    ), f"seq_lens.dtype needs to be int32 but got {seq_lens.dtype}."
+    assert (
+        page_table.dtype == torch.int32
+    ), f"page_table.dtype needs to be int32 but got {page_table.dtype}."
+
+    out = q_nope.new_empty((B_q, MAX_HEADS, D_latent))
+
+    torch.ops.sgl_kernel.cutlass_mla_decode.default(
+        out,
+        q_nope,
+        q_pe,
+        kv_c_and_k_pe_cache,
+        seq_lens,
+        page_table,
+        workspace,
+        sm_scale,
+        num_kv_splits,
+    )
+    return out[:, :H].contiguous()
+
+
+def cutlass_mla_get_workspace_size(
+    max_seq_len: int,
+    num_batches: int,
+    sm_count: int = 0,
+    num_kv_splits: int = 1,  # Set to 1 to avoid cuda_graph issue by default.
+) -> int:
+    assert max_seq_len > 0, f"max_seq_len must be greater than 0, got {max_seq_len}"
+    assert num_batches > 0, f"num_batches must be greater than 0, got {num_batches}"
+    return torch.ops.sgl_kernel.cutlass_mla_get_workspace_size.default(
+        max_seq_len, num_batches, sm_count, num_kv_splits
+    )
diff --git a/sgl-kernel/python/sgl_kernel/cutlass_moe.py b/sgl-kernel/python/sgl_kernel/cutlass_moe.py
new file mode 100644
index 000000000..52256bda1
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/cutlass_moe.py
@@ -0,0 +1,112 @@
+import torch
+
+
+def get_cutlass_w4a8_moe_mm_data(
+    topk_ids: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    input_permutation: torch.Tensor,
+    output_permutation: torch.Tensor,
+    num_experts: int,
+    n: int,
+    k: int,
+):
+    """
+    Prepare data necessary to perform CUTLASS grouped matrix multiplications
+    used in CUTLASS-based fused MoE.
+
+    The function takes in topk_ids (token-expert mapping) and uses it to
+    compute:
+    - expert_offsets: Indices that mark at which token index each expert begins
+                      its computation after the input is sorted with
+                      input_permutation. The number of tokens computed with
+                      expert E is expert_offsets[E + 1] - expert_offsets[E]
+    - problem_sizes1, problem_sizes2: MxNxK sizes of each expert's
+                                      multiplication in two grouped MMs used in
+                                      the fused MoE operation.
+    - input_permutation: Permutation that must be used to shuffle the input
+                         before executing the MMs.
+    - output_permutation: Permutation that must be used to shuffle the output
+                          after executing the MMs.
+    """
+    torch.ops.sgl_kernel.get_cutlass_w4a8_moe_mm_data.default(
+        topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        input_permutation,
+        output_permutation,
+        num_experts,
+        n,
+        k,
+    )
+
+
+def cutlass_w4a8_moe_mm(
+    d: torch.Tensor,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_scales: torch.Tensor,
+    b_scales: torch.Tensor,
+    experts_offsets: torch.tensor,
+    problem_sizes: torch.tensor,
+    a_strides: torch.tensor,
+    b_strides: torch.tensor,
+    d_strides: torch.tensor,
+    s_strides: torch.tensor,
+    chunk_size: int = 128,
+    topk: int = 8,
+):
+    """
+    Perform grouped matrix multiplication between int4 weights and fp8 activations.
+
+    This function executes multiple GEMM operations in parallel, which is useful for
+    scenarios like Mixture of Experts (MoE) where different inputs go through different
+    experts. The implementation leverages NVIDIA Hopper architecture features for
+    optimal performance with quantized weights.
+
+    Args:
+        d: Output matrices of shape [total_m, total_n]
+        a: Activation matrices in FP8 (float_e4m3_t) format
+           Each tensor should be of shape [total_m, K] in row-major layout
+        b: Weight matrices in packed int4 format
+           Each tensor should be of shape [E, N, K//2] in column-major layout
+           where each byte contains two 4-bit integers
+        a_scales: Scale factors for the inputs
+        b_scales: Scale factors for the quantized weights
+           Each tensor should be of shape [E, K//512, N*8]
+        experts_offsets: Tensor containing expert offsets for determining group boundaries
+        problem_sizes: with shape [num_experts, 3] (M, N, K for each group) (int32)
+        a_strides: Strides information for A matrices
+        b_strides: Strides information for B matrices
+        d_strides: Strides information for D matrices
+        s_strides: Strides information for b_scales matrices
+        chunk_size: Number of elements each scale value applies to (K//512), default to 128
+
+    Requirements:
+        - All tensors must be on a CUDA device
+        - Requires an NVIDIA Hopper GPU (H100)
+        - A tensors must be in float8_e4m3fn format
+        - B tensors must contain packed int4 values (stored as int8)
+
+    Note:
+        The function computes: D = (A * (B * scales))
+        for each group of tensors in parallel
+    """
+
+    torch.ops.sgl_kernel.cutlass_w4a8_moe_mm.default(
+        d,
+        a,
+        b,
+        a_scales,
+        b_scales,
+        experts_offsets,
+        problem_sizes,
+        a_strides,
+        b_strides,
+        d_strides,
+        s_strides,
+        chunk_size,
+        topk,
+    )
diff --git a/sgl-kernel/python/sgl_kernel/elementwise.py b/sgl-kernel/python/sgl_kernel/elementwise.py
new file mode 100644
index 000000000..af3adfd4a
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/elementwise.py
@@ -0,0 +1,381 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+import torch
+from sgl_kernel.utils import get_cuda_stream, is_arch_support_pdl
+
+
+# These implementations extensively draw from and build upon the FlashInfer project https://github.com/flashinfer-ai/flashinfer
+# Kudos to @yzh119
+def rmsnorm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: Optional[torch.Tensor] = None,
+    enable_pdl: Optional[bool] = None,
+) -> torch.Tensor:
+    r"""Root mean square normalization.
+
+    ``out[i] = (input[i] / RMS(input)) * weight[i]``
+
+    Parameters
+    ----------
+    input: torch.Tensor
+        Input tensor, shape (batch_size, hidden_size).
+    weight: torch.Tensor
+        Weight tensor, shape (hidden_size,).
+    eps: float
+        Epsilon for numerical stability.
+    out: Optional[torch.Tensor]
+        The output tensor, if specified, the kernel will update this tensor inplace.
+    enable_pdl: Optional[bool]
+        Whether to enable `programmatic dependent launch
+        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
+        If None, will be automatically enabled on Hopper architecture.
+
+    Returns
+    -------
+    output: torch.Tensor
+        Normalized tensor, shape (batch_size, hidden_size).
+    """
+    if out is None:
+        out = torch.empty_like(input)
+    if enable_pdl is None:
+        enable_pdl = is_arch_support_pdl()
+    torch.ops.sgl_kernel.rmsnorm.default(out, input, weight, eps, enable_pdl)
+    return out
+
+
+def fused_add_rmsnorm(
+    input: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    enable_pdl: Optional[bool] = None,
+) -> None:
+    r"""Fused add root mean square normalization.
+
+    Step 1:
+    ``residual[i] += input[i]``
+
+    Step 2:
+    ``input[i] = (residual[i] / RMS(residual)) * weight[i]``
+
+    Parameters
+    ----------
+    input: torch.Tensor
+        Input tensor, shape (batch_size, hidden_size).
+    residual: torch.Tensor
+        Residual tensor, shape (batch_size, hidden_size).
+    weight: torch.Tensor
+        Weight tensor, shape (hidden_size,).
+    eps: float
+        Epsilon for numerical stability.
+    enable_pdl: Optional[bool]
+        Whether to enable `programmatic dependent launch
+        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
+        If None, will be automatically enabled on Hopper architecture.
+    """
+    if enable_pdl is None:
+        enable_pdl = is_arch_support_pdl()
+    torch.ops.sgl_kernel.fused_add_rmsnorm.default(
+        input, residual, weight, eps, enable_pdl
+    )
+
+
+def gemma_rmsnorm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    out: Optional[torch.Tensor] = None,
+    enable_pdl: Optional[bool] = None,
+) -> torch.Tensor:
+    r"""Gemma-style root mean square normalization.
+
+    ``out[i] = (input[i] / RMS(input)) * (weight[i] + 1)``
+
+    Parameters
+    ----------
+    input: torch.Tensor
+        Input tensor, shape (batch_size, hidden_size).
+    weight: torch.Tensor
+        Weight tensor, shape (hidden_size,).
+    eps: float
+        Epsilon for numerical stability.
+    out: Optional[torch.Tensor]
+        The output tensor, if specified, the kernel will update this tensor inplace.
+    enable_pdl: Optional[bool]
+        Whether to enable `programmatic dependent launch
+        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
+        If None, will be automatically enabled on Hopper architecture.
+
+    Returns
+    -------
+    output: torch.Tensor
+        Gemma Normalized tensor, shape (batch_size, hidden_size).
+    """
+    if out is None:
+        out = torch.empty_like(input)
+    if enable_pdl is None:
+        enable_pdl = is_arch_support_pdl()
+    torch.ops.sgl_kernel.gemma_rmsnorm.default(out, input, weight, eps, enable_pdl)
+    return out
+
+
+def gemma_fused_add_rmsnorm(
+    input: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    enable_pdl: Optional[bool] = None,
+) -> None:
+    r"""Gemma-style fused add root mean square normalization.
+
+    Step 1:
+    ``residual[i] += input[i]``
+
+    Step 2:
+    ``input[i] = (residual[i] / RMS(residual)) * (weight + 1)``
+
+    Parameters
+    ----------
+    input: torch.Tensor
+        Input tensor, shape (batch_size, hidden_size).
+    residual: torch.Tensor
+        Residual tensor, shape (batch_size, hidden_size).
+    weight: torch.Tensor
+        Weight tensor, shape (hidden_size,).
+    eps: float
+        Epsilon for numerical stability.
+    enable_pdl: Optional[bool]
+        Whether to enable `programmatic dependent launch
+        <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization>`_
+        If None, will be automatically enabled on Hopper architecture.
+    """
+    if enable_pdl is None:
+        enable_pdl = is_arch_support_pdl()
+    torch.ops.sgl_kernel.gemma_fused_add_rmsnorm.default(
+        input, residual, weight, eps, enable_pdl
+    )
+
+
+def _check_shape(input: torch.Tensor, output: torch.Tensor) -> None:
+    assert input.ndim == output.ndim, f"{input.ndim} != {output.ndim}"
+    assert (
+        input.shape[:-1] == output.shape[:-1]
+    ), f"{input.shape[:-1]} != {output.shape[:-1]}"
+    assert (
+        input.shape[-1] == 2 * output.shape[-1]
+    ), f"{input.shape[-1]} != {2 * output.shape[-1]}"
+
+
+def silu_and_mul(input: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
+    if input.shape[-1] * input.dtype.itemsize % 16 != 0:
+        raise ValueError("The pointers must be multiple of 16 bytes.")
+    if out is not None:
+        _check_shape(input, out)
+    else:
+        out = torch.empty(
+            input.shape[:-1] + (input.shape[-1] // 2,),
+            device=input.device,
+            dtype=input.dtype,
+        )
+    torch.ops.sgl_kernel.silu_and_mul.default(out, input)
+    return out
+
+
+def gelu_tanh_and_mul(input: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
+    if input.shape[-1] * input.dtype.itemsize % 16 != 0:
+        raise ValueError("The pointers must be multiple of 16 bytes.")
+    if out is not None:
+        _check_shape(input, out)
+    else:
+        out = torch.empty(
+            input.shape[:-1] + (input.shape[-1] // 2,),
+            device=input.device,
+            dtype=input.dtype,
+        )
+    torch.ops.sgl_kernel.gelu_tanh_and_mul.default(out, input)
+    return out
+
+
+def gelu_and_mul(input: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
+    if input.shape[-1] * input.dtype.itemsize % 16 != 0:
+        raise ValueError("The pointers must be multiple of 16 bytes.")
+    if out is not None:
+        _check_shape(input, out)
+    else:
+        out = torch.empty(
+            input.shape[:-1] + (input.shape[-1] // 2,),
+            device=input.device,
+            dtype=input.dtype,
+        )
+    torch.ops.sgl_kernel.gelu_and_mul.default(out, input)
+    return out
+
+
+if torch.version.hip is not None:
+
+    def gelu_quick(input: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
+        """
+        Quick-GELU:  y = x * sigmoid(1.702 * x)
+
+        The CUDA/HIP kernel uses 128-bit (16-byte) vector loads & stores,
+        so the last-dimension byte length must be a multiple of 16 bytes.
+        """
+        if input.shape[-1] * input.dtype.itemsize % 16 != 0:
+            raise ValueError(
+                f"The last dimension ({input.shape[-1]}) x itemsize "
+                f"({input.dtype.itemsize}) must be a multiple of 16 bytes."
+            )
+
+        if out is not None:
+            assert input.shape == out.shape, f"{input.shape} != {out.shape}"
+        else:
+            out = torch.empty_like(input)
+
+        torch.ops.sgl_kernel.gelu_quick(out, input)
+        return out
+
+
+@dataclass
+class FusedSetKVBufferArg:
+    """
+    value : Optional[torch.Tensor]
+        Value tensor, shape: ``(nnz, num_v_heads * head_size)``.
+    k_buffer : Optional[torch.Tensor]
+        Buffer for keys, shape: ``(nnz, num_k_heads * head_size)``.
+    v_buffer : Optional[torch.Tensor]
+        Buffer for values, shape: ``(nnz, num_v_heads * head_size)``.
+    k_scale : Optional[float]
+        Scale factor for keys.
+    v_scale : Optional[float]
+        Scale factor for values.
+    cache_loc : Optional[torch.Tensor]
+        Cache location tensor, used for indexing kv cache.
+    """
+
+    value: torch.Tensor
+    k_buffer: torch.Tensor
+    v_buffer: torch.Tensor
+    k_scale: Optional[float]
+    v_scale: Optional[float]
+    cache_loc: torch.Tensor
+
+
+def apply_rope_with_cos_sin_cache_inplace(
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    head_size: int,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool = True,
+    fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
+    enable_pdl: Optional[bool] = None,
+) -> None:
+    r"""
+    Apply rotary embedding to keys and queries with precomputed cos/sin values.
+    This is designed to be compatible with the SGL/vLLM implementation.
+    The result is inplace applied to the input tensors.
+
+    Parameters
+    ----------
+    positions : torch.Tensor
+        Position indices, shape: ``(nnz)``.
+    query : torch.Tensor
+        Query tensor, shape: ``(nnz, num_q_heads * head_size)``.
+    key : torch.Tensor
+        Key tensor, shape: ``(nnz, num_k_heads * head_size)``.
+    cos_sin_cache : torch.Tensor
+        Cosine and Sine cache tensor, shape: ``(max_seq_len, rotary_dim)``.
+        Cosine is the first half and Sine is the second half on rotary_dim.
+    is_neox : bool
+        Whether to use Neox style RoPE, default: ``True``.
+
+        * If ``True``, the last dimension of the query/key tensor is not interleaved, i.e.,
+          we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
+          dimensions ``([..., head_dim//2:])``.
+
+        * If ``False``, the last dimension of the query/key tensor is interleaved, i.e.,
+          we rotate the even dimensions ``([..., ::2])`` and odd dimensions ``([..., 1::2])``.
+    fused_set_kv_buffer_arg : FusedSetKVBufferArg
+        Fuse the set-kv-buffer operation into this kernel
+
+    Note
+    ----
+    The rotary dimension is determined by the cosine cache and sine cache.
+    """
+    if cos_sin_cache.dtype != torch.float32:
+        raise ValueError("cos_sin_cache should be float32")
+
+    if enable_pdl is None:
+        # the non-fused branch does not yet support PDL, but after we switch to our impl for that branch it will
+        enable_pdl = is_arch_support_pdl() and (fused_set_kv_buffer_arg is not None)
+
+    if (a := fused_set_kv_buffer_arg) is not None:
+        assert a.k_scale is None, "k_scale is not yet supported"
+        assert a.v_scale is None, "v_scale is not yet supported"
+        assert a.cache_loc.dtype == torch.int64, f"{a.cache_loc.dtype=}"
+
+    def _view_3d(x):
+        return x.view(x.shape[0], -1, head_size)
+
+    torch.ops.sgl_kernel.apply_rope_pos_ids_cos_sin_cache.default(
+        _view_3d(query),
+        _view_3d(key),
+        _view_3d(query),
+        _view_3d(key),
+        cos_sin_cache,
+        positions.long(),
+        (not is_neox),
+        enable_pdl,
+        get_cuda_stream(),
+        (
+            _view_3d(fused_set_kv_buffer_arg.value)
+            if fused_set_kv_buffer_arg is not None
+            else None
+        ),
+        (
+            _view_3d(fused_set_kv_buffer_arg.k_buffer)
+            if fused_set_kv_buffer_arg is not None
+            else None
+        ),
+        (
+            _view_3d(fused_set_kv_buffer_arg.v_buffer)
+            if fused_set_kv_buffer_arg is not None
+            else None
+        ),
+        (
+            fused_set_kv_buffer_arg.cache_loc
+            if fused_set_kv_buffer_arg is not None
+            else None
+        ),
+    )
+
+
+def downcast_fp8(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    k_out: torch.Tensor,
+    v_out: torch.Tensor,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+    loc: torch.Tensor,
+    mult: int = 1,
+    offset: int = 0,
+) -> None:
+    torch.ops.sgl_kernel.downcast_fp8(
+        k, v, k_out, v_out, k_scale, v_scale, loc, mult, offset, get_cuda_stream()
+    )
+
+
+def copy_to_gpu_no_ce(input: List[int], output: torch.Tensor):
+    torch.ops.sgl_kernel.copy_to_gpu_no_ce(input, output)
+
+
+def concat_mla_k(
+    k: torch.Tensor,
+    k_nope: torch.Tensor,
+    k_rope: torch.Tensor,
+):
+    torch.ops.sgl_kernel.concat_mla_k(k, k_nope, k_rope)
diff --git a/sgl-kernel/python/sgl_kernel/flash_attn.py b/sgl-kernel/python/sgl_kernel/flash_attn.py
new file mode 100644
index 000000000..ada7b6d07
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/flash_attn.py
@@ -0,0 +1,459 @@
+from functools import lru_cache
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+# try:
+#     from sgl_kernel import flash_ops
+# except:
+#     raise ImportError("Can not import sgl_kernel. Please check your installation.")
+
+try:
+    from ._fa4_interface import flash_attn_varlen_func as flash_attn_varlen_func_v4
+except ImportError:
+    flash_attn_varlen_func_v4 = None
+
+
+@lru_cache(maxsize=1)
+def is_fa3_supported(device=None) -> bool:
+    #  There some fa3 FYI
+    #  FA3 can fail without a enough shared memory for a some shapes, such as higher
+    #  hidden_dim or some special cases.
+    #  Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main different
+    #  Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
+    #  https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
+    #  And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
+    #  That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
+    return (torch.version.cuda >= "12.3") and (
+        torch.cuda.get_device_capability(device)[0] == 9
+        or torch.cuda.get_device_capability(device)[0] == 8
+    )
+
+
+def maybe_contiguous(x):
+    return x.contiguous() if x is not None and x.stride(-1) != 1 else x
+
+
+# def flash_attn_with_kvcache(
+#     q,
+#     k_cache,
+#     v_cache,
+#     k=None,
+#     v=None,
+#     qv=None,
+#     rotary_cos=None,
+#     rotary_sin=None,
+#     cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
+#     cache_batch_idx: Optional[torch.Tensor] = None,
+#     cache_leftpad: Optional[torch.Tensor] = None,
+#     page_table: Optional[torch.Tensor] = None,
+#     cu_seqlens_q: Optional[torch.Tensor] = None,
+#     cu_seqlens_k_new: Optional[torch.Tensor] = None,
+#     max_seqlen_q: Optional[int] = None,
+#     rotary_seqlens: Optional[torch.Tensor] = None,
+#     q_descale: Optional[torch.Tensor] = None,
+#     k_descale: Optional[torch.Tensor] = None,
+#     v_descale: Optional[torch.Tensor] = None,
+#     softmax_scale=None,
+#     causal=False,
+#     window_size=(-1, -1),  # -1 means infinite context window
+#     softcap=0.0,  # 0.0 means deactivated
+#     rotary_interleaved=True,
+#     scheduler_metadata=None,
+#     num_splits=0,  # Can be tuned for speed
+#     pack_gqa=None,  # Can be tuned for speed
+#     sm_margin=0,  # Can be tuned if some SMs are used for communication
+#     return_softmax_lse=False,
+#     sinks=None,
+#     ver=3,
+# ):
+#     """
+#     If k and v are not None, k_cache and v_cache will be updated *inplace* with the new values from
+#     k and v. This is useful for incremental decoding: you can pass in the cached keys/values from
+#     the previous step, and update them with the new keys/values from the current step, and do
+#     attention with the updated cache, all in 1 kernel.
+
+#     If you pass in k / v, you must make sure that the cache is large enough to hold the new values.
+#     For example, the KV cache could be pre-allocated with the max sequence length, and you can use
+#     cache_seqlens to keep track of the current sequence lengths of each sequence in the batch.
+
+#     Also apply rotary embedding if rotary_cos and rotary_sin are passed in. The key @k will be
+#     rotated by rotary_cos and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
+#     If causal or local (i.e., window_size != (-1, -1)), the query @q will be rotated by rotary_cos
+#     and rotary_sin at indices cache_seqlens, cache_seqlens + 1, etc.
+#     If not causal and not local, the query @q will be rotated by rotary_cos and rotary_sin at
+#     indices cache_seqlens only (i.e. we consider all tokens in @q to be at position cache_seqlens).
+
+#     See tests/test_flash_attn.py::test_flash_attn_kvcache for examples of how to use this function.
+
+#     Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads
+#     than Q. Note that the number of heads in Q must be divisible by the number of heads in KV.
+#     For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head
+#     0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V.
+
+#     If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix.
+#     For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is:
+#         1 1 1 1 0
+#         1 1 1 1 1
+#     If seqlen_q = 5 and seqlen_k = 2, the causal mask is:
+#         0 0
+#         0 0
+#         0 0
+#         1 0
+#         1 1
+#     If the row of the mask is all zero, the output will be zero.
+
+#     If window_size != (-1, -1), implements sliding window local attention. Query at position i
+#     will only attend to keys between
+#     [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive.
+
+#     Note: Does not support backward pass.
+
+#     Arguments:
+#         q: (batch_size, seqlen, nheads, headdim)
+#         k_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim) if there's no page_table,
+#             or (num_blocks, page_block_size, nheads_k, headdim) if there's a page_table (i.e. paged KV cache)
+#             page_block_size must be a multiple of 256.
+#         v_cache: (batch_size_cache, seqlen_cache, nheads_k, headdim_v) if there's no page_table,
+#             or (num_blocks, page_block_size, nheads_k, headdim_v) if there's a page_table (i.e. paged KV cache)
+#         k [optional]: (batch_size, seqlen_new, nheads_k, headdim). If not None, we concatenate
+#             k with k_cache, starting at the indices specified by cache_seqlens.
+#         v [optional]: (batch_size, seqlen_new, nheads_k, headdim_v). Similar to k.
+#         qv [optional]: (batch_size, seqlen, nheads, headdim_v)
+#         rotary_cos [optional]: (seqlen_ro, rotary_dim / 2). If not None, we apply rotary embedding
+#             to k and q. Only applicable if k and v are passed in. rotary_dim must be divisible by 16.
+#         rotary_sin [optional]: (seqlen_ro, rotary_dim / 2). Similar to rotary_cos.
+#         cache_seqlens: int, or (batch_size,), dtype torch.int32. The sequence lengths of the
+#             KV cache.
+#         cache_batch_idx: (batch_size,), dtype torch.int32. The indices used to index into the KV cache.
+#             If None, we assume that the batch indices are [0, 1, 2, ..., batch_size - 1].
+#             If the indices are not distinct, and k and v are provided, the values updated in the cache
+#                  might come from any of the duplicate indices.
+#         cache_leftpad: (batch_size,), dtype torch.int32. The index that the KV cache starts. If None, assume 0.
+#         page_table [optional]: (batch_size, max_num_blocks_per_seq), dtype torch.int32.
+#         softmax_scale: float. The scaling of QK^T before applying softmax.
+#             Default to 1 / sqrt(headdim).
+#         causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+#         window_size: (left, right). If not (-1, -1), implements sliding window local attention.
+#         softcap: float. Anything > 0 activates softcapping attention.
+#         rotary_interleaved: bool. Only applicable if rotary_cos and rotary_sin are passed in.
+#             If True, rotary embedding will combine dimensions 0 & 1, 2 & 3, etc. If False,
+#             rotary embedding will combine dimensions 0 & rotary_dim / 2, 1 & rotary_dim / 2 + 1
+#             (i.e. GPT-NeoX style).
+#         num_splits: int. If > 1, split the key/value into this many chunks along the sequence.
+#            If num_splits == 1, we don't split the key/value. If num_splits == 0, we use a heuristic
+#            to automatically determine the number of splits.
+#            Don't change this unless you know what you are doing.
+#         return_softmax_lse: bool. Whether to return the logsumexp of the attention scores.
+
+#     Return:
+#         out: (batch_size, seqlen, nheads, headdim).
+#         softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
+#             logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+#             normalization factor).
+#     """
+#     if ver == 4:
+#         raise NotImplementedError("haven't implemented flash_attn_with_kvcache for fa4")
+
+#     assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension"
+#     assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension"
+#     if softmax_scale is None:
+#         softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (
+#             -0.5
+#         )
+#     if cache_seqlens is not None and isinstance(cache_seqlens, int):
+#         cache_seqlens = torch.full(
+#             (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
+#         )
+#         cache_seqlens = maybe_contiguous(cache_seqlens)
+
+#     q, k_cache, k, v = [maybe_contiguous(x) for x in (q, k_cache, k, v)]
+#     v_cache = (
+#         v_cache.contiguous()
+#         if v_cache.stride(-1) != 1 and v_cache.stride(-3) != 1
+#         else v_cache
+#     )
+#     cu_seqlens_q, cu_seqlens_k_new = [
+#         maybe_contiguous(x) for x in (cu_seqlens_q, cu_seqlens_k_new)
+#     ]
+#     page_table, cache_batch_idx, cache_leftpad = [
+#         maybe_contiguous(x) for x in (page_table, cache_batch_idx, cache_leftpad)
+#     ]
+#     rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
+#     rotary_seqlens = maybe_contiguous(rotary_seqlens)
+
+#     if hasattr(torch.version, 'hip') and torch.version.hip is not None:
+#         # HIP环境回退
+#         from flash_attn import flash_attn_with_kvcache as fa_with_kv
+#         out, softmax_lse, *rest = fa_with_kv(
+#             q, k, v, k_cache, v_cache, cache_seqlens, cache_batch_idx,
+#             block_tables, softmax_scale, causal, alibi_slopes, out
+#     )
+#     else:
+#         out, softmax_lse, *rest = torch.ops.sgl_kernel.fwd.default(
+#             q,
+#             k_cache,
+#             v_cache,
+#             k,
+#             v,
+#             qv,
+#             None,  # out
+#             cu_seqlens_q,
+#             None,  # cu_seqlens_k
+#             cu_seqlens_k_new,
+#             None,  # seqused_q
+#             cache_seqlens,
+#             max_seqlen_q,
+#             None,  # max_seqlen_k
+#             page_table,
+#             cache_batch_idx,
+#             cache_leftpad,
+#             rotary_cos,
+#             rotary_sin,
+#             rotary_seqlens,
+#             q_descale,
+#             k_descale,
+#             v_descale,
+#             softmax_scale,
+#             causal,
+#             window_size[0],
+#             window_size[1],
+#             softcap,
+#             rotary_interleaved,
+#             scheduler_metadata,
+#             num_splits,
+#             pack_gqa,
+#             sm_margin,
+#             sinks,
+#     )
+#     return (out, softmax_lse) if return_softmax_lse else out
+def flash_attn_with_kvcache(
+    q,
+    k_cache,
+    v_cache,
+    k=None,
+    v=None,
+    qv=None,
+    rotary_cos=None,
+    rotary_sin=None,
+    cache_seqlens: Optional[Union[(int, torch.Tensor)]] = None,
+    cache_batch_idx: Optional[torch.Tensor] = None,
+    cache_leftpad: Optional[torch.Tensor] = None,
+    page_table: Optional[torch.Tensor] = None,
+    cu_seqlens_q: Optional[torch.Tensor] = None,
+    cu_seqlens_k_new: Optional[torch.Tensor] = None,
+    max_seqlen_q: Optional[int] = None,
+    rotary_seqlens: Optional[torch.Tensor] = None,
+    q_descale: Optional[torch.Tensor] = None,
+    k_descale: Optional[torch.Tensor] = None,
+    v_descale: Optional[torch.Tensor] = None,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite context window
+    softcap=0.0,  # 0.0 means deactivated
+    rotary_interleaved=True,
+    scheduler_metadata=None,
+    num_splits=0,  # Can be tuned for speed
+    pack_gqa=None,  # Can be tuned for speed
+    sm_margin=0,  # Can be tuned if some SMs are used for communication
+    return_softmax_lse=False,
+    sinks=None,
+    ver=3,
+):
+    if ver == 4:
+        raise NotImplementedError("haven't implemented flash_attn_with_kvcache for fa4")
+
+    # HIP环境检测和回退
+    # if hasattr(torch.version, 'hip') and torch.version.hip is not None:
+    #     # 简单PyTorch回退，处理实际的张量形状
+    #     # q: [1, 4, 256], k_cache: [411528, 1, 1, 256], v_cache: [411528, 1, 1, 256]
+        
+    #     if softmax_scale is None:
+    #         softmax_scale = (q.shape[-1]) ** (-0.5)
+        
+    #     # 重塑以匹配attention计算
+    #     q_reshaped = q.unsqueeze(1)  # [1, 1, 4, 256]
+    #     k_reshaped = k_cache.squeeze(1).squeeze(1)  # [411528, 256]  
+    #     v_reshaped = v_cache.squeeze(1).squeeze(1)  # [411528, 256]
+        
+    #     # 简单的点积attention
+    #     scores = torch.matmul(q, k_reshaped.T) * softmax_scale  # [1, 4, 411528]
+    #     attn_weights = torch.softmax(scores, dim=-1)
+    #     out = torch.matmul(attn_weights, v_reshaped)  # [1, 4, 256]
+        
+    #     if return_softmax_lse:
+    #         softmax_lse = torch.zeros(1, 4, 1, device=q.device)
+    #         return out, softmax_lse
+    #     return out
+
+    # 原始sgl_kernel实现
+    assert k_cache.stride(-1) == 1, "k_cache must have contiguous last dimension"
+    assert v_cache.stride(-1) == 1, "v_cache must have contiguous last dimension"
+    if softmax_scale is None:
+        softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (-0.5)
+    if cache_seqlens is not None and isinstance(cache_seqlens, int):
+        cache_seqlens = torch.full(
+            (k_cache.shape[0],), cache_seqlens, dtype=torch.int32, device=k_cache.device
+        )
+        cache_seqlens = maybe_contiguous(cache_seqlens)
+
+    q, k_cache, k, v = [maybe_contiguous(x) for x in (q, k_cache, k, v)]
+    v_cache = (
+        v_cache.contiguous()
+        if v_cache.stride(-1) != 1 and v_cache.stride(-3) != 1
+        else v_cache
+    )
+    cu_seqlens_q, cu_seqlens_k_new = [
+        maybe_contiguous(x) for x in (cu_seqlens_q, cu_seqlens_k_new)
+    ]
+    page_table, cache_batch_idx, cache_leftpad = [
+        maybe_contiguous(x) for x in (page_table, cache_batch_idx, cache_leftpad)
+    ]
+    rotary_cos, rotary_sin = [maybe_contiguous(x) for x in (rotary_cos, rotary_sin)]
+    rotary_seqlens = maybe_contiguous(rotary_seqlens)
+
+    out, softmax_lse, *rest = torch.ops.sgl_kernel.fwd.default(
+        q,
+        k_cache,
+        v_cache,
+        k,
+        v,
+        qv,
+        None,  # out
+        cu_seqlens_q,
+        None,  # cu_seqlens_k
+        cu_seqlens_k_new,
+        None,  # seqused_q
+        cache_seqlens,
+        max_seqlen_q,
+        None,  # max_seqlen_k
+        page_table,
+        cache_batch_idx,
+        cache_leftpad,
+        rotary_cos,
+        rotary_sin,
+        rotary_seqlens,
+        q_descale,
+        k_descale,
+        v_descale,
+        softmax_scale,
+        causal,
+        window_size[0],
+        window_size[1],
+        softcap,
+        rotary_interleaved,
+        scheduler_metadata,
+        num_splits,
+        pack_gqa,
+        sm_margin,
+        sinks,
+    )
+    return (out, softmax_lse) if return_softmax_lse else out
+
+
+def flash_attn_varlen_func(
+    q,
+    k,
+    v,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    seqused_q=None,
+    seqused_k=None,
+    softmax_scale=None,
+    causal=False,
+    qv=None,
+    q_descale=None,
+    k_descale=None,
+    v_descale=None,
+    window_size=(-1, -1),
+    softcap=0.0,
+    num_splits=1,
+    pack_gqa=None,
+    sm_margin=0,
+    return_softmax_lse=False,
+    sinks=None,
+    ver=3,
+):
+    if ver == 4:
+        assert (
+            flash_attn_varlen_func_v4 is not None
+        ), "FA4 is not available, please check your installation."
+        # Using `(-1, -1)` as no sliding window causes correctness issues for FA4.
+        if window_size == (-1, -1):
+            window_size = (None, None)
+        return flash_attn_varlen_func_v4(
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            # max_seqlen_q,
+            # max_seqlen_k,
+            seqused_q=seqused_q,
+            seqused_k=seqused_k,
+            softmax_scale=softmax_scale,
+            causal=causal,
+            # qv=qv,
+            # q_descale=q_descale,
+            # k_descale=k_descale,
+            # v_descale=v_descale,
+            window_size=window_size,
+            softcap=softcap,
+            # num_splits=num_splits,
+            pack_gqa=pack_gqa,
+            # sm_margin=sm_margin,
+            return_softmax_lse=return_softmax_lse,
+            learnable_sink=sinks,
+        )
+
+    if not is_fa3_supported():
+        raise NotImplementedError(
+            "flash_attn at sgl-kernel is only supported on sm90 and above"
+        )
+
+    if softmax_scale is None:
+        softmax_scale = (q.shape[-1] + (qv.shape[-1] if qv is not None else 0)) ** (
+            -0.5
+        )
+
+    out, softmax_lse, *rest = torch.ops.sgl_kernel.fwd.default(
+        q,
+        k,
+        v,
+        None,  # k_new
+        None,  # v_new
+        qv,  # qv
+        None,  # out
+        cu_seqlens_q,
+        cu_seqlens_k,
+        None,  # cu_seqlens_k_new
+        seqused_q,
+        seqused_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        None,  # page_table,
+        None,  # kv_batch_idx
+        None,  # leftpad_k
+        None,  # rotary cos
+        None,  # rotary sin
+        None,  # seqlens_rotary
+        q_descale,
+        k_descale,
+        v_descale,
+        softmax_scale,
+        causal,
+        window_size[0],
+        window_size[1],
+        softcap,
+        is_rotary_interleaved=False,
+        scheduler_metadata=None,
+        num_splits=num_splits,
+        pack_gqa=pack_gqa,
+        sm_margin=sm_margin,
+        sinks=sinks,
+    )
+
+    return (out, softmax_lse, *rest) if return_softmax_lse else out
diff --git a/sgl-kernel/python/sgl_kernel/fused_moe.py b/sgl-kernel/python/sgl_kernel/fused_moe.py
new file mode 100644
index 000000000..c9a11bfc0
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/fused_moe.py
@@ -0,0 +1,225 @@
+import functools
+from typing import Optional
+
+import torch
+from sgl_kernel import silu_and_mul
+
+
+def get_scalar_type(num_bits: int, has_zp: bool):
+    from sgl_kernel.scalar_type import scalar_types
+
+    if has_zp:
+        assert num_bits == 4
+        return scalar_types.uint4
+    else:
+        return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128
+
+
+def fused_marlin_moe(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    g_idx1: Optional[torch.Tensor] = None,
+    g_idx2: Optional[torch.Tensor] = None,
+    sort_indices1: Optional[torch.Tensor] = None,
+    sort_indices2: Optional[torch.Tensor] = None,
+    w1_zeros: Optional[torch.Tensor] = None,
+    w2_zeros: Optional[torch.Tensor] = None,
+    workspace: Optional[torch.Tensor] = None,
+    num_bits: int = 8,
+    is_k_full: bool = True,
+    inplace: bool = False,
+) -> torch.Tensor:
+    """
+    This function computes a Mixture of Experts (MoE) layer using two sets of
+    weights, w1 and w2, and top-k gating mechanism.
+
+    Parameters:
+    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
+    - w1 (torch.Tensor): The first set of expert weights.
+    - w2 (torch.Tensor): The second set of expert weights.
+    - w1_scale (torch.Tensor): Scale to be used for w1.
+    - w2_scale (torch.Tensor): Scale to be used for w2.
+    - gating_output (torch.Tensor): The output of the gating operation
+        (before softmax).
+    - g_idx1 (Optional[torch.Tensor]): The first set of act_order indices.
+    - g_idx2 (Optional[torch.Tensor]): The second set of act_order indices.
+    - sort_indices1 (Optional[torch.Tensor]): The first act_order input
+        permutation.
+    - sort_indices2 (Optional[torch.Tensor]): The second act_order input
+        permutation.
+    - topk_weights (torch.Tensor): Top-k weights.
+    - topk_ids (torch.Tensor): Indices of topk-k elements.
+    - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1.
+    - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2.
+    - num_bits (bool): The number of bits in expert weights quantization.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+    # Delay the import to avoid circular dependency
+    from sglang.srt.layers.moe.fused_moe_triton import (
+        moe_align_block_size,
+        try_get_optimal_moe_config,
+    )
+
+    # Check constraints.
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1"
+    assert hidden_states.shape[1] == w2.shape[2] // (
+        num_bits // 2
+    ), "Hidden size mismatch w2"
+    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
+    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
+    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
+    assert hidden_states.dtype in [torch.float16, torch.bfloat16]
+    assert num_bits in [4, 8]
+
+    M, K = hidden_states.shape
+    E = w1.shape[0]
+    N = w2.shape[1] * 16
+    topk = topk_ids.shape[1]
+
+    get_config_func = functools.partial(
+        try_get_optimal_moe_config,
+        w1.shape,
+        w2.shape,
+        topk_ids.shape[1],
+        None,
+        is_marlin=True,
+    )
+    config = get_config_func(M)
+
+    block_size_m = config["BLOCK_SIZE_M"]
+
+    if global_num_experts == -1:
+        global_num_experts = E
+    sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+        topk_ids, block_size_m, global_num_experts
+    )
+
+    if workspace is None:
+        max_workspace_size = (max(2 * N, K) // 64) * (
+            sorted_token_ids.size(0) // block_size_m
+        )
+        device = hidden_states.device
+        sms = torch.cuda.get_device_properties(device).multi_processor_count
+        max_workspace_size = min(max_workspace_size, sms * 4)
+        workspace = torch.zeros(
+            max_workspace_size, dtype=torch.int, device=device, requires_grad=False
+        )
+
+    scalar_type1 = get_scalar_type(num_bits, w1_zeros is not None)
+    scalar_type2 = get_scalar_type(num_bits, w2_zeros is not None)
+
+    intermediate_cache2 = torch.empty(
+        (M * topk_ids.shape[1], N),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache13 = torch.empty(
+        (M * topk_ids.shape[1] * max(2 * N, K),),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache1 = intermediate_cache13[: M * topk_ids.shape[1] * 2 * N]
+    intermediate_cache1 = intermediate_cache1.view(-1, 2 * N)
+    intermediate_cache3 = intermediate_cache13[: M * topk_ids.shape[1] * K]
+    intermediate_cache3 = intermediate_cache3.view(-1, K)
+
+    use_atomic_add = (
+        hidden_states.dtype == torch.half
+        or torch.cuda.get_device_capability(hidden_states.device)[0] >= 9
+    )
+
+    intermediate_cache1 = torch.ops.sgl_kernel.moe_wna16_marlin_gemm.default(
+        hidden_states,
+        intermediate_cache1,
+        w1,
+        w1_scale,
+        w1_zeros,
+        g_idx1,
+        sort_indices1,
+        workspace,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        topk_weights,
+        moe_block_size=block_size_m,
+        top_k=topk,
+        mul_topk_weights=False,
+        is_ep=expert_map is not None,
+        b_q_type_id=scalar_type1.id,
+        size_m=M,
+        size_n=2 * N,
+        size_k=K,
+        is_k_full=is_k_full,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=True,
+        is_zp_float=False,
+    )
+
+    silu_and_mul(intermediate_cache1.view(-1, 2 * N), intermediate_cache2)
+
+    if expert_map is not None:
+        intermediate_cache3.zero_()
+
+    intermediate_cache3 = torch.ops.sgl_kernel.moe_wna16_marlin_gemm.default(
+        intermediate_cache2,
+        intermediate_cache3,
+        w2,
+        w2_scale,
+        w2_zeros,
+        g_idx2,
+        sort_indices2,
+        workspace,
+        sorted_token_ids,
+        expert_ids,
+        num_tokens_post_padded,
+        topk_weights,
+        moe_block_size=block_size_m,
+        top_k=1,
+        mul_topk_weights=True,
+        is_ep=expert_map is not None,
+        b_q_type_id=scalar_type2.id,
+        size_m=M * topk,
+        size_n=K,
+        size_k=N,
+        is_k_full=is_k_full,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=True,
+        is_zp_float=False,
+    ).view(-1, topk, K)
+
+    output = hidden_states if inplace else torch.empty_like(hidden_states)
+    return torch.sum(
+        intermediate_cache3.view(*intermediate_cache3.shape), dim=1, out=output
+    )
+
+
+def fused_marlin_moe_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    g_idx1: Optional[torch.Tensor] = None,
+    g_idx2: Optional[torch.Tensor] = None,
+    sort_indices1: Optional[torch.Tensor] = None,
+    sort_indices2: Optional[torch.Tensor] = None,
+    w1_zeros: Optional[torch.Tensor] = None,
+    w2_zeros: Optional[torch.Tensor] = None,
+    num_bits: int = 8,
+    is_k_full: bool = True,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
diff --git a/sgl-kernel/python/sgl_kernel/gemm.py b/sgl-kernel/python/sgl_kernel/gemm.py
new file mode 100644
index 000000000..36672877d
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/gemm.py
@@ -0,0 +1,550 @@
+from typing import Optional, Tuple
+
+import torch
+from sgl_kernel.scalar_type import ScalarType
+from sgl_kernel.utils import _get_cache_buf, get_cuda_stream
+
+
+def awq_dequantize(
+    qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor
+) -> torch.ByteTensor:
+    return torch.ops.sgl_kernel.awq_dequantize.default(qweight, scales, qzeros)
+
+
+def int8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None):
+    return torch.ops.sgl_kernel.int8_scaled_mm.default(
+        mat_a,
+        mat_b,
+        scales_a,
+        scales_b,
+        out_dtype,
+        bias,
+    )
+
+
+def fp8_blockwise_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype):
+    return torch.ops.sgl_kernel.fp8_blockwise_scaled_mm.default(
+        mat_a,
+        mat_b,
+        scales_a,
+        scales_b,
+        out_dtype,
+    )
+
+
+def fp8_scaled_mm(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None):
+    return torch.ops.sgl_kernel.fp8_scaled_mm.default(
+        mat_a,
+        mat_b,
+        scales_a,
+        scales_b,
+        out_dtype,
+        bias,
+    )
+
+
+def _bmm_fp8_internal(
+    workspace_buffer: torch.Tensor,
+    A: torch.Tensor,
+    B: torch.Tensor,
+    D: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+) -> None:
+    cublas_handle = torch.cuda.current_blas_handle()
+    torch.ops.sgl_kernel.bmm_fp8.default(
+        A,
+        B,
+        D,
+        A_scale,
+        B_scale,
+        workspace_buffer,
+        cublas_handle,
+        get_cuda_stream(),
+    )
+
+
+def bmm_fp8(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    dtype: torch.dtype,
+    out: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if out is None:
+        out = torch.empty(
+            (A.shape[0], A.shape[1], B.shape[2]),
+            device=A.device,
+            dtype=dtype,
+        )
+    workspace_buffer = _get_cache_buf("bmm_fp8_workspace", 32 * 1024 * 1024, A.device)
+    _bmm_fp8_internal(workspace_buffer, A, B, out, A_scale, B_scale)
+    return out
+
+
+def dsv3_fused_a_gemm(
+    mat_a: torch.Tensor,
+    mat_b: torch.Tensor,
+    output: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if output is None:
+        output = torch.empty(
+            (mat_a.shape[0], mat_b.shape[1]),
+            device=mat_a.device,
+            dtype=mat_a.dtype,
+        )
+    torch.ops.sgl_kernel.dsv3_fused_a_gemm.default(output, mat_a, mat_b)
+    return output
+
+
+def sgl_per_token_group_quant_fp8(
+    input: torch.Tensor,
+    output_q: torch.Tensor,
+    output_s: torch.Tensor,
+    group_size: int,
+    eps: float,
+    fp8_min: float,
+    fp8_max: float,
+    scale_ue8m0: bool,
+) -> None:
+    torch.ops.sgl_kernel.sgl_per_token_group_quant_fp8.default(
+        input, output_q, output_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+    )
+
+
+def sgl_per_token_group_quant_int8(
+    input: torch.Tensor,
+    output_q: torch.Tensor,
+    output_s: torch.Tensor,
+    group_size: int,
+    eps: float,
+    int8_min: float,
+    int8_max: float,
+) -> None:
+    torch.ops.sgl_kernel.sgl_per_token_group_quant_int8.default(
+        input, output_q, output_s, group_size, eps, int8_min, int8_max
+    )
+
+
+def sgl_per_tensor_quant_fp8(
+    input: torch.Tensor,
+    output_q: torch.Tensor,
+    output_s: torch.Tensor,
+    is_static: bool,
+) -> None:
+    torch.ops.sgl_kernel.sgl_per_tensor_quant_fp8.default(
+        input, output_q, output_s, is_static
+    )
+
+
+def sgl_per_token_quant_fp8(
+    input: torch.Tensor,
+    output_q: torch.Tensor,
+    output_s: torch.Tensor,
+) -> None:
+    torch.ops.sgl_kernel.sgl_per_token_quant_fp8.default(input, output_q, output_s)
+
+
+def cutlass_scaled_fp4_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    block_scale_a: torch.Tensor,
+    block_scale_b: torch.Tensor,
+    alpha: torch.Tensor,
+    out_dtype: torch.dtype,
+) -> torch.Tensor:
+    assert a.ndim == 2 and b.ndim == 2
+    m, n = a.shape[0], b.shape[0]
+    out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+    torch.ops.sgl_kernel.cutlass_scaled_fp4_mm.default(
+        out, a, b, block_scale_a, block_scale_b, alpha
+    )
+    return out
+
+
+def scaled_fp4_quant(
+    input: torch.Tensor, input_global_scale: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize input tensor to FP4 and return quantized tensor and scale.
+
+    This function quantizes the last dimension of the given tensor `input`. For
+    every 16 consecutive elements, a single dynamically computed scaling factor
+    is shared. This scaling factor is quantized using the `input_global_scale`
+    and is stored in a swizzled layout (see
+    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).
+
+    Args:
+        input: The input tensor to be quantized to FP4
+        input_global_scale: A scalar scaling factor for the entire tensor.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 but every
+            two values are packed into a uint8 and float8_e4m3 scaling factors
+            in a sizzled layout.
+    """
+    assert input.ndim >= 1, f"input.ndim needs to be >= 1, but got {input.ndim}."
+    other_dims = 1 if input.ndim == 1 else -1
+    input = input.reshape(other_dims, input.shape[-1])
+    m, n = input.shape
+    block_size = 16
+    device = input.device
+
+    assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
+    assert input.dtype in (
+        torch.float16,
+        torch.bfloat16,
+    ), f"input.dtype needs to be fp16 or bf16 but got {input.dtype}."
+
+    # Two fp4 values will be packed into an uint8.
+    output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
+
+    # We use the rounded values to store the swizzled values. Then, the scaling
+    # factors in float8_e4m3fn are packed into an int32 for every 4 values.
+    rounded_m = ((m + 128 - 1) // 128) * 128
+    scale_n = n // block_size
+    rounded_n = ((scale_n + 4 - 1) // 4) * 4
+    # padded part should be zeroed out
+    if rounded_n > scale_n:
+        output_scale = torch.zeros(
+            (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
+        )
+    else:
+        output_scale = torch.empty(
+            (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
+        )
+
+    torch.ops.sgl_kernel.scaled_fp4_quant.default(
+        output, input, output_scale, input_global_scale
+    )
+    output_scale = output_scale.view(torch.float8_e4m3fn)
+    return output, output_scale
+
+
+def qserve_w4a8_per_chn_gemm(
+    in_feats: torch.Tensor,
+    kernel: torch.Tensor,
+    wscales: torch.Tensor,
+    ascales: torch.Tensor,
+    w_szs: torch.Tensor,
+    a_ssums: torch.Tensor,
+    out_feats: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if out_feats is None:
+        # NOTE(HandH1998): qserve_w4a8_per_chn_gemm only supports out dtype=torch.float16 now
+        out_feats = torch.empty(
+            (in_feats.shape[0], kernel.shape[0]),
+            device=in_feats.device,
+            dtype=torch.float16,
+        )
+    torch.ops.sgl_kernel.qserve_w4a8_per_chn_gemm.default(
+        in_feats, kernel, wscales, ascales, w_szs, a_ssums, out_feats
+    )
+    return out_feats
+
+
+def qserve_w4a8_per_group_gemm(
+    in_feats: torch.Tensor,
+    kernel: torch.Tensor,
+    zeros: torch.Tensor,
+    scales_i8: torch.Tensor,
+    wscales: torch.Tensor,
+    ascales: torch.Tensor,
+    out_feats: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if out_feats is None:
+        # NOTE(HandH1998): qserve_w4a8_per_group_gemm only supports out dtype=torch.float16 now
+        out_feats = torch.empty(
+            (in_feats.shape[0], kernel.shape[0]),
+            device=in_feats.device,
+            dtype=torch.float16,
+        )
+    torch.ops.sgl_kernel.qserve_w4a8_per_group_gemm.default(
+        in_feats, kernel, zeros, scales_i8, wscales, ascales, out_feats
+    )
+    return out_feats
+
+
+def dsv3_router_gemm(
+    hidden_states: torch.Tensor,
+    router_weights: torch.Tensor,
+    out_dtype: torch.dtype = torch.bfloat16,
+) -> torch.Tensor:
+    output = torch.empty(
+        hidden_states.shape[0],
+        router_weights.shape[0],
+        device=hidden_states.device,
+        dtype=out_dtype,
+    )
+    torch.ops.sgl_kernel.dsv3_router_gemm(
+        output,
+        hidden_states,
+        router_weights,
+    )
+    return output
+
+
+def shuffle_rows(input_tensor, dst2src_map, output_tensor_shape):
+    output_tensor = torch.empty(
+        output_tensor_shape,
+        device=input_tensor.device,
+        dtype=input_tensor.dtype,
+    )
+    torch.ops.sgl_kernel.shuffle_rows.default(input_tensor, dst2src_map, output_tensor)
+    return output_tensor
+
+
+def scaled_fp4_grouped_quant(
+    input_tensor: torch.Tensor,
+    input_global_scale: torch.Tensor,
+    mask: torch.Tensor,
+):
+    """
+    Quantize input tensor to FP4 and return quantized tensor and scale, for
+    grouped gemm inputs (e.g., grouped_gemm_nt_masked for flashinfer).
+    Args:
+        input: The input tensor to be quantized to FP4, with shape (l, m, k)
+            l is number of groups, m is number of tokens per group, k is number of features.
+        input_global_scale: A scalar scaling factor for the entire tensor, with
+            shape (l,).
+    Outputs:
+        output: The quantized tensor in FP4, with shape (m, k // 2, l) but the physical
+            layout is (l, m, k // 2). `// 2` is because two fp4 values are packed into
+            an uint8.
+        output_scales: The blockscale tensor in FP8-E4M3, with shape (32, 4, rm, 4, rk, l)
+            but the physical layout is (l, rm, rk, 32, 4, 4).
+    Note:
+        For the shape of output_scales, `32 * 4 * rm` is a padded m to nearest multiple of 128.
+        `4 * rk` is a padded `k // 16` to nearest multiple of 4. These layout constants are
+        required by the NVIDIA Blackwell MMA operations.
+    """
+    device = input_tensor.device
+    l, m, k = input_tensor.shape
+    sf_vec_size = 16
+    assert k % sf_vec_size == 0, f"k must be multiple of 16, but got {k}."
+
+    scale_k = k // sf_vec_size
+    padded_k = (scale_k + (4 - 1)) // 4 * 4
+    padded_k_int32 = padded_k // 4
+    padded_m = (m + (128 - 1)) // 128 * 128
+    output = torch.empty(l, m, k // 2, device=device, dtype=torch.uint8)
+    output_scales = torch.empty(
+        l, padded_m, padded_k_int32, device=device, dtype=torch.int32
+    )
+
+    torch.ops.sgl_kernel.silu_and_mul_scaled_fp4_experts_quant.default(
+        output.view(l * m, k // 2),
+        output_scales.view(l * padded_m, padded_k_int32),
+        input_tensor.view(l * m, k),
+        input_global_scale,
+        mask,
+        use_silu_and_mul=False,
+    )
+    # The physical layout of the output is (l, m, k // 2), but we want to return a
+    # logical layout (m, k // 2, l) required by the flashinfer masked group gemm.
+    output = output.permute(1, 2, 0)
+    # The physical layout of the output scales is already swizzled as (l, rm, rk, 32, 4, 4), a
+    # requirement for the flashinfer masked group gemm, where rm=m/128 and rk=k/4. The logic
+    # layout is (32, 4, rm, 4, rk, l).
+    output_scales = output_scales.view(torch.float8_e4m3fn).view(
+        l, padded_m // 128, padded_k // 4, 32, 4, 4
+    )
+    output_scales = output_scales.permute(3, 4, 1, 5, 2, 0)
+    return output, output_scales
+
+
+def silu_and_mul_scaled_fp4_grouped_quant(
+    input_tensor: torch.Tensor,
+    input_global_scale: torch.Tensor,
+    mask: torch.Tensor,
+):
+    """
+    Quantize input tensor to FP4 and return quantized tensor and scale, for
+    grouped gemm inputs (e.g., grouped_gemm_nt_masked for flashinfer).
+    Args:
+        input: The input tensor to be quantized to FP4, with shape (l, m, k * 2)
+            l is number of groups, m is number of tokens per group, k is number of features.
+        input_global_scale: A scalar scaling factor for the entire tensor, with
+            shape (l,).
+        mask: The mask tensor, with shape (l,)
+    Outputs:
+        output: The quantized tensor in FP4, with shape (m, k // 2, l) but the physical
+            layout is (l, m, k // 2). `// 2` is because two fp4 values are packed into
+            an uint8.
+        output_scales: The blockscale tensor in FP8-E4M3, with shape (32, 4, rm, 4, rk, l)
+            but the physical layout is (l, rm, rk, 32, 4, 4).
+    Note:
+        For the shape of output_scales, `32 * 4 * rm` is a padded m to nearest multiple of 128.
+        `4 * rk` is a padded `k // 16` to nearest multiple of 4. These layout constants are
+        required by the NVIDIA Blackwell MMA operations.
+    """
+    device = input_tensor.device
+    l, m, k_by_2 = input_tensor.shape
+    k = k_by_2 // 2
+    sf_vec_size = 16
+    assert k % sf_vec_size == 0, f"k must be multiple of 16, but got {k}."
+
+    scale_k = k // sf_vec_size
+    padded_k = (scale_k + (4 - 1)) // 4 * 4
+    padded_k_int32 = padded_k // 4
+    padded_m = (m + (128 - 1)) // 128 * 128
+    output = torch.empty(l, m, k // 2, device=device, dtype=torch.uint8)
+    output_scales = torch.empty(
+        l, padded_m, padded_k_int32, device=device, dtype=torch.int32
+    )
+
+    torch.ops.sgl_kernel.silu_and_mul_scaled_fp4_experts_quant.default(
+        output.view(l * m, k // 2),
+        output_scales.view(l * padded_m, padded_k_int32),
+        input_tensor.view(l * m, k_by_2),
+        input_global_scale,
+        mask,
+        use_silu_and_mul=True,
+    )
+    # The physical layout of the output is (l, m, k // 2), but we want to return a
+    # logical layout (m, k // 2, l) required by the flashinfer masked group gemm.
+    output = output.permute(1, 2, 0)
+    # The physical layout of the output scales is already swizzled as (l, rm, rk, 32, 4, 4), a
+    # requirement for the flashinfer masked group gemm, where rm=m/128 and rk=k/4. The logic
+    # layout is (32, 4, rm, 4, rk, l).
+    output_scales = output_scales.view(torch.float8_e4m3fn).view(
+        l, padded_m // 128, padded_k // 4, 32, 4, 4
+    )
+    output_scales = output_scales.permute(3, 4, 1, 5, 2, 0)
+    return output, output_scales
+
+
+def scaled_fp4_experts_quant(
+    input_tensor: torch.Tensor,
+    input_global_scale: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    blockscale_offsets: torch.Tensor,
+    topk: int,
+    expert_map: Optional[torch.Tensor] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize input tensor to FP4 and return quantized tensor and scale, for
+    packed MoE Inputs.
+    Args:
+        input: The input tensor to be quantized to FP4
+        expert_map: The expert map tensor
+        input_global_scale: A scalar scaling factor for the entire tensor.
+        expert_offsets: The expert offsets tensor
+        blockscale_offsets: The blockscale offsets tensor
+    Outputs:
+        output: The quantized tensor in FP4
+        output_scales: The blockscale tensor in FP8-E4M3
+    """
+    assert (
+        input_tensor.ndim == 2
+    ), f"input.ndim needs to be == 2, but got {input_tensor.ndim}."
+    if expert_map is not None:
+        (m, k) = input_tensor.shape
+        output_tensor_shape = (m * topk, k)
+        input_tensor = shuffle_rows(input_tensor, expert_map, output_tensor_shape)
+    m_numtopk, k = input_tensor.shape
+    # Control the maximum number of tokens per expert supported by the
+    # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
+    # from running out of memory. This value can also be increased to support
+    # larger models.
+    import os
+
+    MAX_TOKENS_PER_EXPERT = os.environ.get("MODELOPT_MAX_TOKENS_PER_EXPERT", 65536)
+    assert m_numtopk <= MAX_TOKENS_PER_EXPERT * topk, (
+        f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
+        f"{MAX_TOKENS_PER_EXPERT})"
+        f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. Use"
+        f" MODELOPT_MAX_TOKENS_PER_EXPERT to set this value."
+    )
+    scales_k = k // 16
+    padded_k = (scales_k + (4 - 1)) // 4
+
+    # output is uint8 and packed fp4 values
+    output = torch.empty(
+        m_numtopk, k // 2, device=input_tensor.device, dtype=torch.uint8
+    )
+    # padded part should be zeroed out
+    if padded_k > scales_k:
+        output_scales = torch.zeros(
+            MAX_TOKENS_PER_EXPERT * topk,
+            padded_k,
+            dtype=torch.int32,
+            device=input_tensor.device,
+        )
+    else:
+        output_scales = torch.empty(
+            MAX_TOKENS_PER_EXPERT * topk,
+            padded_k,
+            dtype=torch.int32,
+            device=input_tensor.device,
+        )
+    torch.ops.sgl_kernel.scaled_fp4_experts_quant.default(
+        output,
+        output_scales,
+        input_tensor,
+        input_global_scale,
+        expert_offsets,
+        blockscale_offsets,
+    )
+    output_scales = output_scales.view(torch.float8_e4m3fn)
+    return output, output_scales
+
+
+# GPTQ kernels
+def gptq_marlin_gemm(
+    a: torch.Tensor,
+    c: Optional[torch.Tensor],
+    b_q_weight: torch.Tensor,
+    b_scales: torch.Tensor,
+    global_scale: Optional[torch.Tensor],
+    b_zeros: Optional[torch.Tensor],
+    g_idx: Optional[torch.Tensor],
+    perm: Optional[torch.Tensor],
+    workspace: torch.Tensor,
+    b_q_type: ScalarType,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+    is_k_full: bool = True,
+    use_atomic_add: bool = False,
+    use_fp32_reduce: bool = False,
+    is_zp_float: bool = False,
+) -> torch.Tensor:
+    return torch.ops.sgl_kernel.gptq_marlin_gemm(
+        a,
+        c,
+        b_q_weight,
+        b_scales,
+        global_scale,
+        b_zeros,
+        g_idx,
+        perm,
+        workspace,
+        b_q_type.id,
+        size_m,
+        size_n,
+        size_k,
+        is_k_full,
+        use_atomic_add,
+        use_fp32_reduce,
+        is_zp_float,
+    )
+
+
+def gptq_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_gptq_qzeros: torch.Tensor,
+    b_gptq_scales: torch.Tensor,
+    b_g_idx: torch.Tensor,
+    use_shuffle: bool,
+    bit: int,
+) -> torch.Tensor:
+    return torch.ops.sgl_kernel.gptq_gemm(
+        a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_shuffle, bit
+    )
+
+
+def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, bit: int) -> None:
+    torch.torch.ops.sgl_kernel.gptq_shuffle(q_weight, q_perm, bit)
diff --git a/sgl-kernel/python/sgl_kernel/grammar.py b/sgl-kernel/python/sgl_kernel/grammar.py
new file mode 100644
index 000000000..971f94bc2
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/grammar.py
@@ -0,0 +1,15 @@
+from typing import List, Optional, Union
+
+import torch
+
+
+def apply_token_bitmask_inplace_cuda(
+    logits: torch.Tensor,
+    bitmask: torch.Tensor,
+    indices: Optional[Union[List[int], torch.Tensor]] = None,
+) -> None:
+    if isinstance(indices, list):
+        indices = torch.tensor(indices, dtype=torch.int32, device=logits.device)
+    if indices is not None:
+        indices = indices.to(logits.device)
+    torch.ops.sgl_kernel.apply_token_bitmask_inplace_cuda(logits, bitmask, indices)
diff --git a/sgl-kernel/python/sgl_kernel/kvcacheio.py b/sgl-kernel/python/sgl_kernel/kvcacheio.py
new file mode 100644
index 000000000..5714b6a0d
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/kvcacheio.py
@@ -0,0 +1,243 @@
+from typing import List
+
+import torch
+
+
+def is_hip() -> bool:
+    return torch.version.hip is not None
+
+
+_is_hip = is_hip()
+
+
+def transfer_kv_per_layer(
+    src_k: torch.Tensor,
+    dst_k: torch.Tensor,
+    src_v: torch.Tensor,
+    dst_v: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    item_size: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_per_layer(
+        src_k,
+        dst_k,
+        src_v,
+        dst_v,
+        src_indices,
+        dst_indices,
+        item_size,
+        block_quota,
+        num_warps_per_block,
+    )
+
+
+def transfer_kv_per_layer_pf_lf(
+    src_k: torch.Tensor,
+    dst_k: torch.Tensor,
+    src_v: torch.Tensor,
+    dst_v: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    layer_id: int,
+    item_size: int,
+    src_layout_dim: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_per_layer_pf_lf(
+        src_k,
+        dst_k,
+        src_v,
+        dst_v,
+        src_indices,
+        dst_indices,
+        layer_id,
+        item_size,
+        src_layout_dim,
+        block_quota,
+        num_warps_per_block,
+    )
+
+
+def transfer_kv_all_layer(
+    src_k_layers: torch.Tensor,
+    dst_k_layers: torch.Tensor,
+    src_v_layers: torch.Tensor,
+    dst_v_layers: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    item_size: int,
+    num_layers: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_all_layer(
+        src_k_layers,
+        dst_k_layers,
+        src_v_layers,
+        dst_v_layers,
+        src_indices,
+        dst_indices,
+        item_size,
+        num_layers,
+        block_quota,
+        num_warps_per_block,
+    )
+
+
+def transfer_kv_all_layer_lf_pf(
+    src_k_layers: torch.Tensor,
+    dst_k: torch.Tensor,
+    src_v_layers: torch.Tensor,
+    dst_v: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    item_size: int,
+    dst_layout_dim: int,
+    num_layers: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_all_layer_lf_pf(
+        src_k_layers,
+        dst_k,
+        src_v_layers,
+        dst_v,
+        src_indices,
+        dst_indices,
+        item_size,
+        dst_layout_dim,
+        num_layers,
+        block_quota,
+        num_warps_per_block,
+    )
+
+
+def transfer_kv_direct(
+    src_layers: List[torch.Tensor],
+    dst_layers: List[torch.Tensor],
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    page_size: int,
+):
+    torch.ops.sgl_kernel.transfer_kv_direct(
+        src_layers, dst_layers, src_indices, dst_indices, page_size
+    )
+
+
+def transfer_kv_per_layer_direct_pf_lf(
+    src_ptrs: List[torch.Tensor],
+    dst_ptrs: List[torch.Tensor],
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    layer_id: int,
+    page_size: int,
+):
+    torch.ops.sgl_kernel.transfer_kv_per_layer_direct_pf_lf(
+        src_ptrs, dst_ptrs, src_indices, dst_indices, layer_id, page_size
+    )
+
+
+def transfer_kv_all_layer_direct_lf_pf(
+    src_ptrs: List[torch.Tensor],
+    dst_ptrs: List[torch.Tensor],
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    page_size: int,
+):
+    torch.ops.sgl_kernel.transfer_kv_all_layer_direct_lf_pf(
+        src_ptrs, dst_ptrs, src_indices, dst_indices, page_size
+    )
+
+
+def transfer_kv_per_layer_mla(
+    src: torch.Tensor,
+    dst: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    item_size: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_per_layer_mla(
+        src,
+        dst,
+        src_indices,
+        dst_indices,
+        item_size,
+        block_quota,
+        num_warps_per_block,
+    )
+
+
+def transfer_kv_per_layer_mla_pf_lf(
+    src: torch.Tensor,
+    dst: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    layer_id: int,
+    item_size: int,
+    src_layout_dim: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_per_layer_mla_pf_lf(
+        src,
+        dst,
+        src_indices,
+        dst_indices,
+        layer_id,
+        item_size,
+        src_layout_dim,
+        block_quota,
+        num_warps_per_block,
+    )
+
+
+def transfer_kv_all_layer_mla(
+    src_layers: torch.Tensor,
+    dst_layers: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    item_size: int,
+    num_layers: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_all_layer_mla(
+        src_layers,
+        dst_layers,
+        src_indices,
+        dst_indices,
+        item_size,
+        num_layers,
+        block_quota,
+        num_warps_per_block,
+    )
+
+
+def transfer_kv_all_layer_mla_lf_pf(
+    src_layers: torch.Tensor,
+    dst: torch.Tensor,
+    src_indices: torch.Tensor,
+    dst_indices: torch.Tensor,
+    item_size: int,
+    dst_layout_dim: int,
+    num_layers: int,
+    block_quota: int = 2,
+    num_warps_per_block: int = 16 if _is_hip else 32,
+):
+    torch.ops.sgl_kernel.transfer_kv_all_layer_mla_lf_pf(
+        src_layers,
+        dst,
+        src_indices,
+        dst_indices,
+        item_size,
+        dst_layout_dim,
+        num_layers,
+        block_quota,
+        num_warps_per_block,
+    )
diff --git a/sgl-kernel/python/sgl_kernel/mamba.py b/sgl-kernel/python/sgl_kernel/mamba.py
new file mode 100644
index 000000000..85aa5b947
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/mamba.py
@@ -0,0 +1,50 @@
+from typing import Optional
+
+import torch
+
+
+# mamba
+def causal_conv1d_fwd(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias_: Optional[torch.Tensor],
+    conv_states: Optional[torch.Tensor],
+    query_start_loc: Optional[torch.Tensor],
+    cache_indices: Optional[torch.Tensor],
+    has_initial_state: Optional[torch.Tensor],
+    silu_activation: bool,
+    pad_slot_id: int,
+):
+    torch.ops.sgl_kernel.causal_conv1d_fwd(
+        x,
+        weight,
+        bias_,
+        conv_states,
+        query_start_loc,
+        cache_indices,
+        has_initial_state,
+        silu_activation,
+        pad_slot_id,
+    )
+
+
+def causal_conv1d_update(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias_: Optional[torch.Tensor],
+    silu_activation: bool,
+    cache_seqlens: Optional[torch.Tensor],
+    conv_state_indices: Optional[torch.Tensor],
+    pad_slot_id: int,
+):
+    torch.ops.sgl_kernel.causal_conv1d_update(
+        x,
+        conv_state,
+        weight,
+        bias_,
+        silu_activation,
+        cache_seqlens,
+        conv_state_indices,
+        pad_slot_id,
+    )
diff --git a/sgl-kernel/python/sgl_kernel/marlin.py b/sgl-kernel/python/sgl_kernel/marlin.py
new file mode 100644
index 000000000..823d3fd4b
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/marlin.py
@@ -0,0 +1,44 @@
+import torch
+
+
+def gptq_marlin_repack(
+    b_q_weight,
+    perm,
+    size_k,
+    size_n,
+    num_bits,
+) -> torch.Tensor:
+    return torch.ops.sgl_kernel.gptq_marlin_repack(
+        b_q_weight,
+        perm,
+        size_k,
+        size_n,
+        num_bits,
+    )
+
+
+def awq_marlin_repack(
+    b_q_weight: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
+    return torch.ops.sgl_kernel.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits)
+
+
+def awq_marlin_moe_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
+    num_experts = b_q_weight.shape[0]
+    assert size_k % 16 == 0
+    output = torch.empty(
+        (num_experts, size_k // 16, size_n * (num_bits // 2)),
+        device=b_q_weight.device,
+        dtype=b_q_weight.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = torch.ops.sgl_kernel.awq_marlin_repack(
+            b_q_weight[e], size_k, size_n, num_bits
+        )
+    return output
diff --git a/sgl-kernel/python/sgl_kernel/memory.py b/sgl-kernel/python/sgl_kernel/memory.py
new file mode 100644
index 000000000..eb997db0c
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/memory.py
@@ -0,0 +1,18 @@
+import torch
+
+
+def set_kv_buffer_kernel(
+    k_cache: torch.Tensor,
+    v_cache: torch.Tensor,
+    loc: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    fallback: bool = False,
+):
+    try:
+        if fallback:
+            raise RuntimeError("Fallback to torch implementation")
+        torch.ops.sgl_kernel.store_kv_cache(k_cache, v_cache, loc, k, v)
+    except RuntimeError:  # ok, fallback to torch implementation
+        k_cache[loc] = k
+        v_cache[loc] = v
diff --git a/sgl-kernel/python/sgl_kernel/moe.py b/sgl-kernel/python/sgl_kernel/moe.py
new file mode 100755
index 000000000..66fec9f2b
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/moe.py
@@ -0,0 +1,197 @@
+from typing import Any, Dict, Optional
+
+import torch
+
+
+def moe_align_block_size(
+    topk_ids,
+    num_experts,
+    block_size,
+    sorted_token_ids,
+    experts_ids,
+    num_tokens_post_pad,
+    cumsum_buffer,
+    pad_sorted_token_ids=False,
+):
+    torch.ops.sgl_kernel.moe_align_block_size.default(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_token_ids,
+        experts_ids,
+        num_tokens_post_pad,
+        cumsum_buffer,
+        pad_sorted_token_ids,
+    )
+
+
+def topk_softmax(
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    gating_output: float,
+    renormalize: bool = False,
+) -> None:
+    torch.ops.sgl_kernel.topk_softmax.default(
+        topk_weights, topk_ids, gating_output, renormalize
+    )
+
+
+def moe_fused_gate(
+    input_tensor,
+    bias,
+    num_expert_group,
+    topk_group,
+    topk,
+    num_fused_shared_experts=0,
+    routed_scaling_factor=0,
+    apply_routed_scaling_factor_on_output=False,
+):
+    # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
+    # it split group of expert into num_expert_group, and use top2 expert weight sum in each group
+    # as the group weight to select expert groups and then select topk experts within the selected groups
+    # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
+    # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limited for now.
+    # for non-supported case, we suggest to use the biased_grouped_topk func in sglang.srt.layers.moe.topk
+    # num_fused_shared_experts: if > 0, the last several experts will be
+    #   replaced with shared experts. the shared experts will be divided by the
+    #   routed_scaling_factor - this is intended to cancel out later when routed+shared
+    #   output is scaled so that shared experts are not scaled.
+    # routed_scaling_factor: if > 0, the experts will be scaled by this factor
+    # apply_routed_scaling_factor_on_output: if true, output will be
+    #   scaled by the routed_scaling_factor
+    return torch.ops.sgl_kernel.moe_fused_gate.default(
+        input_tensor,
+        bias,
+        num_expert_group,
+        topk_group,
+        topk,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        apply_routed_scaling_factor_on_output,
+    )
+
+
+def fp8_blockwise_scaled_grouped_mm(
+    output,
+    a_ptrs,
+    b_ptrs,
+    out_ptrs,
+    a_scales_ptrs,
+    b_scales_ptrs,
+    a,
+    b,
+    scales_a,
+    scales_b,
+    stride_a,
+    stride_b,
+    stride_c,
+    layout_sfa,
+    layout_sfb,
+    problem_sizes,
+    expert_offsets,
+    workspace,
+):
+    torch.ops.sgl_kernel.fp8_blockwise_scaled_grouped_mm.default(
+        output,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        a,
+        b,
+        scales_a,
+        scales_b,
+        stride_a,
+        stride_b,
+        stride_c,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        expert_offsets,
+        workspace,
+    )
+
+
+def prepare_moe_input(
+    topk_ids,
+    expert_offsets,
+    problem_sizes1,
+    problem_sizes2,
+    input_permutation,
+    output_permutation,
+    num_experts,
+    n,
+    k,
+    blockscale_offsets: Optional[torch.Tensor] = None,
+):
+    torch.ops.sgl_kernel.prepare_moe_input.default(
+        topk_ids,
+        expert_offsets,
+        blockscale_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        input_permutation,
+        output_permutation,
+        num_experts,
+        n,
+        k,
+    )
+
+
+def apply_shuffle_mul_sum(
+    input,
+    output,
+    permutation,
+    factors,
+):
+    torch.ops.sgl_kernel.apply_shuffle_mul_sum.default(
+        input, output, permutation, factors
+    )
+
+
+def cutlass_fp4_group_mm(
+    a_fp4,
+    b_fp4,
+    a_blockscale,
+    b_blockscale,
+    alphas,
+    out_dtype,
+    device,
+    params: Dict[str, Any],
+):
+    """
+    An FP4 Blockscaled Group Gemm that takes in  a_tensors, b_tensors and runs
+    the gemms for each combination based on the specified problem sizes.
+
+    This is used as the MoE gemm during NVFP4 Quantized FusedMoE forward.
+    - a/b_tensors: the NVFP4 a_ptrs and b_ptrs tensors which are quantized
+                     input and expert weights.
+    - a_/b_scales: The blockscales in FP8-E4M3 precision
+    - ab_strides/c_strides: Strides for the a/b tensors between rows.
+    - expert_offsets/sf_offsets: Indices that mark at which token index
+                    each expert begins its computation. The number of tokens
+                    computed with expert E is expert_offsets[E + 1] -
+                    expert_offsets[E] And the sf_size per expert is
+                    sf_offset[E+1] - sf_offset[E]
+    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
+                     MMs used in the fused MoE operation.
+    """
+    m_topk = a_fp4.shape[0]
+    n = b_fp4.shape[1]
+    c_shape = (m_topk, n)
+    c = torch.empty(c_shape, device=device, dtype=out_dtype)
+    torch.ops.sgl_kernel.cutlass_fp4_group_mm.default(
+        c,
+        a_fp4,
+        b_fp4,
+        a_blockscale,
+        b_blockscale,
+        alphas,
+        params["ab_strides"],
+        params["c_strides"],
+        params["problem_sizes"],
+        params["expert_offsets"],
+        params["blockscale_offsets"],
+    )
+    return c.to(dtype=out_dtype)
diff --git a/sgl-kernel/python/sgl_kernel/sampling.py b/sgl-kernel/python/sgl_kernel/sampling.py
new file mode 100644
index 000000000..4ee6f24d3
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/sampling.py
@@ -0,0 +1,543 @@
+from typing import Optional, Union
+
+import torch
+from sgl_kernel.utils import _to_tensor_scalar_tuple
+
+
+def _top_k_renorm_probs_internal(
+    probs: torch.Tensor,
+    maybe_top_k_arr: Optional[torch.Tensor],
+    top_k_val: int,
+) -> torch.Tensor:
+    probs = probs.float()
+    maybe_top_k_arr = maybe_top_k_arr.int() if maybe_top_k_arr is not None else None
+    renorm_probs = torch.empty_like(probs)
+    torch.ops.sgl_kernel.top_k_renorm_probs.default(
+        probs, renorm_probs, maybe_top_k_arr, top_k_val
+    )
+    return renorm_probs
+
+
+def top_k_renorm_probs(
+    probs: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+) -> torch.Tensor:
+    r"""Adapt from https://github.com/flashinfer-ai/flashinfer/flashinfer/sampling.py
+    Fused GPU kernel for renormalizing probabilities by top-k thresholding.
+
+    Parameters
+    ----------
+    probs: torch.Tensor
+        Probabilities, shape ``(batch_size, num_classes)``.
+    top_k: Union[torch.Tensor, int]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the top-k threshold for for
+        for re-normalizing probabilities, should be in ``(0, num_classes)``.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+        We keep the top-k probabilities, set the rest to zero, and renormalize the probabilities.
+
+    Returns
+    -------
+    renorm_probs: torch.Tensor
+        Renormalized probabilities, shape ``(batch_size, num_classes)``.
+
+    Note
+    ----
+    This combination of ``top_k_renorm_probs`` and ``sampling_from_probs`` should be equivalent to
+    ``top_k_sampling_from_probs``.
+    """
+    return _top_k_renorm_probs_internal(probs, *_to_tensor_scalar_tuple(top_k))
+
+
+top_k_renorm_prob = top_k_renorm_probs
+
+
+def _top_p_renorm_probs_internal(
+    probs: torch.Tensor,
+    maybe_top_p_arr: Optional[torch.Tensor],
+    top_p_val: float,
+) -> torch.Tensor:
+    probs = probs.float()
+    maybe_top_p_arr = maybe_top_p_arr.float() if maybe_top_p_arr is not None else None
+    renorm_probs = torch.empty_like(probs)
+    torch.ops.sgl_kernel.top_p_renorm_probs.default(
+        probs, renorm_probs, maybe_top_p_arr, top_p_val
+    )
+    return renorm_probs
+
+
+def top_p_renorm_probs(
+    probs: torch.Tensor,
+    top_p: Union[torch.Tensor, float],
+) -> torch.Tensor:
+    r"""Adapt from https://github.com/flashinfer-ai/flashinfer/flashinfer/sampling.py
+    Fused GPU kernel for renormalizing probabilities by top-p thresholding.
+
+    Parameters
+    ----------
+    probs: torch.Tensor
+        Probabilities, shape ``(batch_size, num_classes)``.
+    top_p: Union[torch.Tensor, float]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the top-p threshold for for
+        re-normalizing probabilities, should be in ``(0, 1)``.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+        We mask out the probabilities less than `threshold` where the cumulative sum
+        of ``probs[probs >= threshold]`` is `top_p`, and renormalize the probabilities.
+
+    Returns
+    -------
+    renorm_probs: torch.Tensor
+        Renormalized probabilities, shape ``(batch_size, num_classes)``.
+
+    Note
+    ----
+    This combination of ``top_p_renorm_probs`` and ``sampling_from_probs`` should be equivalent to
+    ``top_p_sampling_from_probs``.
+
+    """
+    return _top_p_renorm_probs_internal(probs, *_to_tensor_scalar_tuple(top_p))
+
+
+top_p_renorm_prob = top_p_renorm_probs
+
+
+def _top_p_sampling_from_probs_internal(
+    probs: torch.Tensor,
+    indices: Optional[torch.Tensor],
+    maybe_top_p_arr: Optional[torch.Tensor],
+    top_p_val: float,
+    deterministic: bool,
+    generator: Optional[torch.Generator],
+) -> torch.Tensor:
+    with probs.device as device:
+        probs = probs.float()
+        maybe_top_p_arr = (
+            maybe_top_p_arr.float() if maybe_top_p_arr is not None else None
+        )
+        samples = torch.empty(probs.size(0), dtype=torch.int32, device=device)
+        torch.ops.sgl_kernel.top_p_sampling_from_probs.default(
+            probs,
+            samples,
+            indices,
+            maybe_top_p_arr,
+            top_p_val,
+            deterministic,
+            generator,
+        )
+        return samples
+
+
+def top_p_sampling_from_probs(
+    probs: torch.Tensor,
+    top_p: Union[torch.Tensor, float],
+    indices: Optional[torch.Tensor] = None,
+    deterministic: bool = True,
+    generator: Optional[torch.Generator] = None,
+    check_nan: bool = False,
+) -> torch.Tensor:
+    r"""Adapt from https://github.com/flashinfer-ai/flashinfer/flashinfer/sampling.py
+    Fused GPU kernel for top-p sampling (nucleus sampling) from probabilities,
+    this operator implements GPU-based rejection sampling without explicit sorting.
+    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.
+
+    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
+    which is more efficient than the naive implementation that launches a series of kernels.
+
+    Parameters
+    ----------
+    probs: torch.Tensor
+        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
+        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
+        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
+        probability distributions.
+    top_p: Union[torch.Tensor, float]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-p sampling.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+    indices: Optional[torch.Tensor]
+        Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
+        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
+        This allows reusing the same probability distribution for multiple outputs.
+        If indices is not provided, the i-th output will be sampled from the i-th row of probs.
+    deterministic: bool
+        Whether to use deterministic kernel implementation, default is ``True``.
+    generator: Optional[torch.Generator]
+        A random number generator for the operation.
+    check_nan: bool
+        Whether to check nan in :attr:`probs`, default is ``False``.
+
+    Returns
+    -------
+    samples: torch.Tensor
+        Sampled categories, shape ``(batch_size,)``.
+
+    Note
+    ----
+    This function expects float32 inputs, and the output is int32.
+
+    """
+    if check_nan:
+        if torch.any(torch.isnan(probs)):
+            raise ValueError("Input probs contains NaN.")
+    return _top_p_sampling_from_probs_internal(
+        probs, indices, *_to_tensor_scalar_tuple(top_p), deterministic, generator
+    )
+
+
+def _top_k_top_p_sampling_from_probs_internal(
+    probs: torch.Tensor,
+    indices: Optional[torch.Tensor],
+    maybe_top_k_arr: Optional[torch.Tensor],
+    top_k_val: int,
+    maybe_top_p_arr: Optional[torch.Tensor],
+    top_p_val: float,
+    deterministic: bool,
+    generator: Optional[torch.Generator],
+) -> torch.Tensor:
+    with probs.device as device:
+        probs = probs.float()
+        maybe_top_k_arr = maybe_top_k_arr.int() if maybe_top_k_arr is not None else None
+        maybe_top_p_arr = (
+            maybe_top_p_arr.float() if maybe_top_p_arr is not None else None
+        )
+        samples = torch.empty(probs.size(0), dtype=torch.int32, device=device)
+        torch.ops.sgl_kernel.top_k_top_p_sampling_from_probs.default(
+            probs,
+            samples,
+            indices,
+            maybe_top_k_arr,
+            top_k_val,
+            maybe_top_p_arr,
+            top_p_val,
+            deterministic,
+            generator,
+        )
+        return samples
+
+
+def top_k_top_p_sampling_from_probs(
+    probs: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+    top_p: Union[torch.Tensor, float],
+    indices: Optional[torch.Tensor] = None,
+    filter_apply_order: str = "top_k_first",
+    deterministic: bool = True,
+    generator: Optional[torch.Generator] = None,
+    check_nan: bool = False,
+) -> torch.Tensor:
+    r"""Adapt from https://github.com/flashinfer-ai/flashinfer/flashinfer/sampling.py
+    Fused GPU kernel for top-k and top-p sampling from probabilities,
+
+    this operator implements GPU-based rejection sampling without explicit sorting.
+    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.
+
+    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
+    which is more efficient than the naive implementation that launches a series of kernels.
+
+    Parameters
+    ----------
+    probs: torch.Tensor
+        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
+        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
+        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
+        probability distributions.
+    top_k: Union[torch.Tensor, int]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-k sampling.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+    top_p: Union[torch.Tensor, float]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-p sampling.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+    indices: Optional[torch.Tensor]
+        Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
+        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
+        This allows reusing the same probability distribution for multiple outputs.
+        If indices is not provided, the i-th output will be sampled from the i-th row of probs.
+    filter_apply_order: str
+        The order of applying top-k and top-p sampling, should be either ``"top_k_first"`` or ``"joint"``.
+        If ``"top_k_first"``, we first apply top-k filter, then apply top-p sampling on the top-k results.
+        If ``"joint"``, we apply top-k and top-p filter simultaneously in each round. Default is ``"top_k_first"``.
+    deterministic: bool
+        Whether to use deterministic kernel implementation, default is ``True``.
+    generator: Optional[torch.Generator]
+        A random number generator for the operation.
+    check_nan: bool
+        Whether to check nan in :attr:`probs`, default is ``False``.
+
+    Returns
+    -------
+    samples: torch.Tensor
+        Sampled categories, shape ``(batch_size,)``.
+
+    Note
+    ----
+    This function expects float32 inputs, and the output is int32.
+
+    """
+    if filter_apply_order == "top_k_first":
+        renorm_probs = top_k_renorm_probs(probs, top_k)
+        return top_p_sampling_from_probs(
+            renorm_probs,
+            top_p,
+            indices,
+            deterministic,
+            check_nan=check_nan,
+            generator=generator,
+        )
+    elif filter_apply_order == "joint":
+        if check_nan:
+            if torch.any(torch.isnan(probs)):
+                raise ValueError("Input probs contains NaN.")
+        return _top_k_top_p_sampling_from_probs_internal(
+            probs,
+            indices,
+            *_to_tensor_scalar_tuple(top_k),
+            *_to_tensor_scalar_tuple(top_p),
+            deterministic,
+            generator,
+        )
+    else:
+        raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
+
+
+def _min_p_sampling_from_probs_internal(
+    probs: torch.Tensor,
+    indices: Optional[torch.Tensor],
+    maybe_min_p_arr: Optional[torch.Tensor],
+    min_p_val: float,
+    deterministic: bool,
+    generator: Optional[torch.Generator],
+) -> torch.Tensor:
+    with probs.device as device:
+        probs = probs.float()
+        maybe_min_p_arr = (
+            maybe_min_p_arr.float() if maybe_min_p_arr is not None else None
+        )
+        samples = torch.empty(probs.size(0), dtype=torch.int32, device=device)
+        torch.ops.sgl_kernel.min_p_sampling_from_probs.default(
+            probs,
+            samples,
+            indices,
+            maybe_min_p_arr,
+            min_p_val,
+            deterministic,
+            generator,
+        )
+        return samples
+
+
+def min_p_sampling_from_probs(
+    probs: torch.Tensor,
+    min_p: Union[torch.Tensor, float],
+    indices: Optional[torch.Tensor] = None,
+    deterministic: bool = True,
+    generator: Optional[torch.Generator] = None,
+    check_nan: bool = False,
+) -> torch.Tensor:
+    r"""Adapt from https://github.com/flashinfer-ai/flashinfer/flashinfer/sampling.py
+    Fused GPU kernel for `min_p sampling <https://arxiv.org/abs/2407.01082>`_ from probabilities,
+
+    this operator implements GPU-based rejection sampling without explicit sorting.
+    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.
+
+    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
+    which is more efficient than the naive implementation that launches a series of kernels.
+
+    Parameters
+    ----------
+    probs: torch.Tensor
+        Probabilities for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
+        and the i-th output will be sampled from the i-th row of probabilities. When indices is provided,
+        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
+        probability distributions.
+    min_p: Union[torch.Tensor, float]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for min-p sampling.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+    indices: Optional[torch.Tensor]
+        Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
+        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
+        This allows reusing the same probability distribution for multiple outputs.
+        If indices is not provided, the i-th output will be sampled from the i-th row of probs.
+    deterministic: bool
+        Whether to use deterministic kernel implementation, default is ``True``.
+    generator: Optional[torch.Generator]
+        A random number generator for the operation.
+    check_nan: bool
+        Whether to check nan in :attr:`probs`, default is ``False``.
+
+    Returns
+    -------
+    samples: torch.Tensor
+        Sampled categories, shape ``(batch_size,)``.
+
+    Note
+    ----
+    This function expects float32 inputs, and the output is int32.
+    """
+    if check_nan:
+        if torch.any(torch.isnan(probs)):
+            raise ValueError("Input probs contains NaN.")
+    return _min_p_sampling_from_probs_internal(
+        probs, indices, *_to_tensor_scalar_tuple(min_p), deterministic, generator
+    )
+
+
+def _top_k_mask_logits_internal(
+    logits: torch.Tensor,
+    maybe_top_k_arr: Optional[torch.Tensor],
+    top_k_val: int,
+) -> torch.Tensor:
+    logits = logits.float()
+    maybe_top_k_arr = maybe_top_k_arr.int() if maybe_top_k_arr is not None else None
+    mask_logits = torch.empty_like(logits)
+    torch.ops.sgl_kernel.top_k_mask_logits.default(
+        logits, mask_logits, maybe_top_k_arr, top_k_val
+    )
+    return mask_logits
+
+
+def top_k_mask_logits(
+    logits: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+) -> torch.Tensor:
+    r"""Adapt from https://github.com/flashinfer-ai/flashinfer/flashinfer/sampling.py
+    Fused GPU kernel for masking logits by top-k thresholding.
+
+    Parameters
+    ----------
+    logits: torch.Tensor
+        Logits before softmax, shape ``(batch_size, num_classes)``.
+    top_k: Union[torch.Tensor, int]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the top-k threshold for for
+        for masking logits, should be in ``(0, num_classes)``.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+        We keep the top-k logits, set the rest to negative infinity.
+
+    Returns
+    -------
+    masked_logits: torch.Tensor
+        Masked logits, shape ``(batch_size, num_classes)``.
+
+    Examples
+    --------
+
+    >>> import torch
+    >>> import flashinfer
+    >>> torch.manual_seed(42)
+    >>> batch_size = 4
+    >>> vocab_size = 5
+    >>> top_k = 3
+    >>> logits = torch.randn(batch_size, vocab_size).to(0)
+    >>> logits
+    tensor([[ 1.9269,  1.4873,  0.9007, -2.1055, -0.7581],
+            [ 1.0783,  0.8008,  1.6806,  0.3559, -0.6866],
+            [-0.4934,  0.2415, -0.2316,  0.0418, -0.2516],
+            [ 0.8599, -0.3097, -0.3957,  0.8034, -0.6216]], device='cuda:0')
+    >>> masked_logits = flashinfer.sampling.top_k_mask_logits(logits, top_k)
+    >>> masked_logits
+    tensor([[ 1.9269,  1.4873,  0.9007,    -inf,    -inf],
+            [ 1.0783,  0.8008,  1.6806,    -inf,    -inf],
+            [   -inf,  0.2415, -0.2316,  0.0418,    -inf],
+            [ 0.8599, -0.3097,    -inf,  0.8034,    -inf]], device='cuda:0')
+
+    Note
+    ----
+    The combination of ``top_k_mask_logits`` and ``softmax`` should be equivalent to ``top_k_renorm_probs``.
+
+    See Also
+    --------
+    top_k_renorm_probs
+    """
+    return _top_k_mask_logits_internal(logits, *_to_tensor_scalar_tuple(top_k))
+
+
+def top_k_top_p_sampling_from_logits(
+    logits: torch.Tensor,
+    top_k: Union[torch.Tensor, int],
+    top_p: Union[torch.Tensor, float],
+    indices: Optional[torch.Tensor] = None,
+    filter_apply_order: str = "top_k_first",
+    deterministic: bool = True,
+    generator: Optional[torch.Generator] = None,
+    check_nan: bool = False,
+) -> torch.Tensor:
+    r"""Adapt from https://github.com/flashinfer-ai/flashinfer/flashinfer/sampling.py
+    Fused GPU kernel for top-k and top-p sampling from probabilities,
+
+    this operator implements GPU-based rejection sampling without explicit sorting.
+    Check the `blog post <https://flashinfer.ai/2025/03/10/sampling.html>`_ for more details.
+
+    The multiple rounds of rejection sampling are implemented in a single CUDA kernel,
+    which is more efficient than the naive implementation that launches a series of kernels.
+
+    Parameters
+    ----------
+    logits: torch.Tensor
+        Pre-softmax logits for sampling. When indices is not provided, shape should be ``(batch_size, num_classes)``
+        and the i-th output will be sampled from the i-th row of logits. When indices is provided,
+        shape should be ``(unique_batch_size, num_classes)`` where unique_batch_size is the number of unique
+        probability distributions.
+    top_k: Union[torch.Tensor, int]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-k sampling.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+    top_p: Union[torch.Tensor, float]
+        Either a scalar or a tensor of shape ``(batch_size,)``, representing the threshold for top-p sampling.
+        If a scalar, the same threshold is used for all requests.
+        If a tensor, each request has its own threshold.
+    indices: Optional[torch.Tensor]
+        Optional indices tensor of shape ``(batch_size,)`` that maps each output to a row in probs.
+        For example, if indices[i] = j, then the i-th output will be sampled from probs[j].
+        This allows reusing the same probability distribution for multiple outputs.
+        If indices is not provided, the i-th output will be sampled from the i-th row of probs.
+    filter_apply_order: str
+        The order of applying top-k and top-p sampling, should be either ``"top_k_first"`` or ``"joint"``.
+        If ``"top_k_first"``, we first apply top-k filter, then apply top-p sampling on the top-k results.
+        If ``"joint"``, we apply top-k and top-p filter simultaneously in each round. Default is ``"top_k_first"``.
+    deterministic: bool
+        Whether to use deterministic kernel implementation, default is ``True``.
+    generator: Optional[torch.Generator]
+        A random number generator for the operation.
+    check_nan: bool
+        Whether to check nan in :attr:`probs`, default is ``False``.
+
+    Returns
+    -------
+    samples: torch.Tensor
+        Sampled categories, shape ``(batch_size,)``.
+
+    Note
+    ----
+    This function expects float32 inputs, and the output is int32.
+
+    """
+    if filter_apply_order == "top_k_first":
+        masked_logits = top_k_mask_logits(logits, top_k)
+        probs = torch.softmax(masked_logits, dim=-1)
+        return top_p_sampling_from_probs(
+            probs,
+            top_p,
+            indices,
+            deterministic,
+            check_nan=check_nan,
+            generator=generator,
+        )
+    elif filter_apply_order == "joint":
+        probs = torch.softmax(logits, dim=-1)
+        if check_nan:
+            if torch.any(torch.isnan(probs)):
+                raise ValueError("Input probs contains NaN.")
+        return _top_k_top_p_sampling_from_probs_internal(
+            probs,
+            indices,
+            *_to_tensor_scalar_tuple(top_k),
+            *_to_tensor_scalar_tuple(top_p),
+            deterministic,
+            generator,
+        )
+    else:
+        raise ValueError(f"Invalid filter_apply_order: {filter_apply_order}")
diff --git a/sgl-kernel/python/sgl_kernel/scalar_type.py b/sgl-kernel/python/sgl_kernel/scalar_type.py
new file mode 100644
index 000000000..571c1dca7
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/scalar_type.py
@@ -0,0 +1,352 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import functools
+import struct
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, Union
+
+_SCALAR_TYPES_ID_MAP = {}
+
+
+# Mirrors enum in `core/scalar_type.hpp`
+class NanRepr(Enum):
+    NONE = 0  # nans are not supported
+    IEEE_754 = 1  # nans are: Exp all 1s, mantissa not all 0s
+    EXTD_RANGE_MAX_MIN = 2  # nans are: Exp all 1s, mantissa all 1s
+
+
+# This ScalarType class is a parallel implementation of the C++ ScalarType
+# class found in csrc/core/scalar_type.hpp.  These two classes should be kept
+# in sync until the inductor fully supports custom C++ classes.
+@dataclass(frozen=True)
+class ScalarType:
+    """
+    ScalarType can represent a wide range of floating point and integer
+    types, in particular it can be used to represent sub-byte data types
+    (something that torch.dtype currently does not support). It is also
+    capable of  representing types with a bias, i.e.:
+      `stored_value = value + bias`,
+    this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias
+    of 8). The implementation for this class can be found in
+    csrc/core/scalar_type.hpp, these type signatures should be kept in sync
+    with that file.
+    """
+
+    exponent: int
+    """
+    Number of bits in the exponent if this is a floating point type
+    (zero if this an integer type)
+    """
+
+    mantissa: int
+    """
+    Number of bits in the mantissa if this is a floating point type,
+    or the number bits representing an integer excluding the sign bit if
+    this an integer type.
+    """
+
+    signed: bool
+    "If the type is signed (i.e. has a sign bit)"
+
+    bias: int
+    """
+    bias used to encode the values in this scalar type
+    (value = stored_value - bias, default 0) for example if we store the
+    type as an unsigned integer with a bias of 128 then the value 0 will be
+    stored as 128 and -1 will be stored as 127 and 1 will be stored as 129.
+    """
+
+    _finite_values_only: bool = False
+    """
+    Private: if infs are supported, used `has_infs()` instead.
+    """
+
+    nan_repr: NanRepr = NanRepr.IEEE_754
+    """
+    How NaNs are represent in this scalar type, returns NanRepr value.
+    (not applicable for integer types)
+    """
+
+    def _floating_point_max_int(self) -> int:
+        assert (
+            self.mantissa <= 52 and self.exponent <= 11
+        ), f"Cannot represent max/min as a double for type {self.__str__()}"
+
+        max_mantissa = (1 << self.mantissa) - 1
+        if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN:
+            max_mantissa = max_mantissa - 1
+
+        max_exponent = (1 << self.exponent) - 2
+        if self.nan_repr == NanRepr.EXTD_RANGE_MAX_MIN or self.nan_repr == NanRepr.NONE:
+            assert (
+                self.exponent < 11
+            ), f"Cannot represent max/min as a double for type {self.__str__()}"
+            max_exponent = max_exponent + 1
+
+        # adjust the exponent to match that of a double
+        # for now we assume the exponent bias is the standard 2^(e-1) -1, (where
+        # e is the exponent bits), there is some precedent for non-standard
+        # biases, example `float8_e4m3b11fnuz` here:
+        # https://github.com/jax-ml/ml_dtypes but to avoid premature over
+        # complication we are just assuming the standard exponent bias until
+        # there is a need to support non-standard biases
+        exponent_bias = (1 << (self.exponent - 1)) - 1
+        exponent_bias_double = (1 << 10) - 1  # double e = 11
+
+        max_exponent_double = max_exponent - exponent_bias + exponent_bias_double
+
+        # shift the mantissa and exponent into the proper positions for an
+        # IEEE double and bitwise-or them together.
+        return (max_mantissa << (52 - self.mantissa)) | (max_exponent_double << 52)
+
+    def _floating_point_max(self) -> float:
+        double_raw = self._floating_point_max_int()
+        return struct.unpack("!d", struct.pack("!Q", double_raw))[0]
+
+    def _raw_max(self) -> Union[int, float]:
+        if self.is_floating_point():
+            return self._floating_point_max()
+        else:
+            assert (
+                self.size_bits < 64 or self.size_bits == 64 and self.is_signed()
+            ), "Cannot represent max as an int"
+            return (1 << self.mantissa) - 1
+
+    def _raw_min(self) -> Union[int, float]:
+        if self.is_floating_point():
+            assert (
+                self.is_signed()
+            ), "We currently assume all floating point types are signed"
+            sign_bit_double = 1 << 63
+
+            max_raw = self._floating_point_max_int()
+            min_raw = max_raw | sign_bit_double
+            return struct.unpack("!d", struct.pack("!Q", min_raw))[0]
+        else:
+            assert (
+                not self.is_signed() or self.size_bits <= 64
+            ), "Cannot represent min as a int64_t"
+
+            if self.is_signed():
+                return -(1 << (self.size_bits - 1))
+            else:
+                return 0
+
+    @functools.cached_property
+    def id(self) -> int:
+        """
+        Convert the ScalarType to an int which can be passed to pytorch custom
+        ops. This layout of the int must be kept in sync with the C++
+        ScalarType's from_id method.
+        """
+        val = 0
+        offset = 0
+
+        def or_and_advance(member, bit_width):
+            nonlocal val
+            nonlocal offset
+            bit_mask = (1 << bit_width) - 1
+            val = val | (int(member) & bit_mask) << offset
+            offset = offset + bit_width
+
+        or_and_advance(self.exponent, 8)
+        or_and_advance(self.mantissa, 8)
+        or_and_advance(self.signed, 1)
+        or_and_advance(self.bias, 32)
+        or_and_advance(self._finite_values_only, 1)
+        or_and_advance(self.nan_repr.value, 8)
+
+        assert offset <= 64, f"ScalarType fields too big {offset} to fit into an int64"
+
+        _SCALAR_TYPES_ID_MAP[val] = self
+
+        return val
+
+    @property
+    def size_bits(self) -> int:
+        return self.exponent + self.mantissa + int(self.signed)
+
+    def min(self) -> Union[int, float]:
+        """
+        Min representable value for this scalar type.
+        (accounting for bias if there is one)
+        """
+        return self._raw_min() - self.bias
+
+    def max(self) -> Union[int, float]:
+        """
+        Max representable value for this scalar type.
+        (accounting for bias if there is one)
+        """
+        return self._raw_max() - self.bias
+
+    def is_signed(self) -> bool:
+        """
+        If the type is signed (i.e. has a sign bit), same as `signed`
+        added for consistency with:
+        https://pytorch.org/docs/stable/generated/torch.Tensor.is_signed.html
+        """
+        return self.signed
+
+    def is_floating_point(self) -> bool:
+        "If the type is a floating point type"
+        return self.exponent != 0
+
+    def is_integer(self) -> bool:
+        "If the type is an integer type"
+        return self.exponent == 0
+
+    def has_bias(self) -> bool:
+        "If the type has a non-zero bias"
+        return self.bias != 0
+
+    def has_infs(self) -> bool:
+        "If the type is floating point and supports infinity"
+        return not self._finite_values_only
+
+    def has_nans(self) -> bool:
+        return self.nan_repr != NanRepr.NONE
+
+    def is_ieee_754(self) -> bool:
+        """
+        If the type is a floating point type that follows IEEE 754
+        conventions
+        """
+        return self.nan_repr == NanRepr.IEEE_754 and not self._finite_values_only
+
+    def __str__(self) -> str:
+        """
+        naming generally follows: https://github.com/jax-ml/ml_dtypes
+        for floating point types (leading f) the scheme is:
+        `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
+        flags:
+          - no-flags: means it follows IEEE 754 conventions
+          - f: means finite values only (no infinities)
+          - n: means nans are supported (non-standard encoding)
+        for integer types the scheme is:
+          `[u]int<size_bits>[b<bias>]`
+          - if bias is not present it means its zero
+        """
+        if self.is_floating_point():
+            ret = (
+                "float"
+                + str(self.size_bits)
+                + "_e"
+                + str(self.exponent)
+                + "m"
+                + str(self.mantissa)
+            )
+
+            if not self.is_ieee_754():
+                if self._finite_values_only:
+                    ret = ret + "f"
+                if self.nan_repr != NanRepr.NONE:
+                    ret = ret + "n"
+
+            return ret
+        else:
+            ret = ("int" if self.is_signed() else "uint") + str(self.size_bits)
+            if self.has_bias():
+                ret = ret + "b" + str(self.bias)
+            return ret
+
+    def __repr__(self) -> str:
+        return "ScalarType." + self.__str__()
+
+    # __len__ needs to be defined (and has to throw TypeError) for pytorch's
+    # opcheck to work.
+    def __len__(self) -> int:
+        raise TypeError
+
+    #
+    # Convenience Constructors
+    #
+
+    @classmethod
+    def int_(cls, size_bits: int, bias: Optional[int]) -> "ScalarType":
+        "Create a signed integer scalar type (size_bits includes sign-bit)."
+        ret = cls(0, size_bits - 1, True, bias if bias else 0)
+        ret.id  # noqa B018: make sure the id is cached
+        return ret
+
+    @classmethod
+    def uint(cls, size_bits: int, bias: Optional[int]) -> "ScalarType":
+        """Create a unsigned integer scalar type."""
+        ret = cls(0, size_bits, False, bias if bias else 0)
+        ret.id  # noqa B018: make sure the id is cached
+        return ret
+
+    @classmethod
+    def float_IEEE754(cls, exponent: int, mantissa: int) -> "ScalarType":
+        """
+        Create a standard floating point type
+        (i.e. follows IEEE 754 conventions).
+        """
+        assert mantissa > 0 and exponent > 0
+        ret = cls(exponent, mantissa, True, 0)
+        ret.id  # noqa B018: make sure the id is cached
+        return ret
+
+    @classmethod
+    def float_(
+        cls, exponent: int, mantissa: int, finite_values_only: bool, nan_repr: NanRepr
+    ) -> "ScalarType":
+        """
+        Create a non-standard floating point type
+        (i.e. does not follow IEEE 754 conventions).
+        """
+        assert mantissa > 0 and exponent > 0
+        assert nan_repr != NanRepr.IEEE_754, (
+            "use `float_IEEE754` constructor for floating point types that "
+            "follow IEEE 754 conventions"
+        )
+        ret = cls(exponent, mantissa, True, 0, finite_values_only, nan_repr)
+        ret.id  # noqa B018: make sure the id is cached
+        return ret
+
+    @classmethod
+    def from_id(cls, scalar_type_id: int):
+        if scalar_type_id not in _SCALAR_TYPES_ID_MAP:
+            raise ValueError(f"scalar_type_id {scalar_type_id} doesn't exists.")
+        return _SCALAR_TYPES_ID_MAP[scalar_type_id]
+
+
+# naming generally follows: https://github.com/jax-ml/ml_dtypes
+# for floating point types (leading f) the scheme is:
+#  `float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
+#  flags:
+#  - no-flags: means it follows IEEE 754 conventions
+#  - f: means finite values only (no infinities)
+#  - n: means nans are supported (non-standard encoding)
+# for integer types the scheme is:
+#  `[u]int<size_bits>[b<bias>]`
+#  - if bias is not present it means its zero
+
+
+class scalar_types:
+    int4 = ScalarType.int_(4, None)
+    uint4 = ScalarType.uint(4, None)
+    int8 = ScalarType.int_(8, None)
+    uint8 = ScalarType.uint(8, None)
+    float8_e4m3fn = ScalarType.float_(4, 3, True, NanRepr.EXTD_RANGE_MAX_MIN)
+    float8_e5m2 = ScalarType.float_IEEE754(5, 2)
+    float16_e8m7 = ScalarType.float_IEEE754(8, 7)
+    float16_e5m10 = ScalarType.float_IEEE754(5, 10)
+
+    # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main
+    float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE)
+
+    # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    float4_e2m1f = ScalarType.float_(2, 1, True, NanRepr.NONE)
+
+    # "gptq" types
+    uint2b2 = ScalarType.uint(2, 2)
+    uint3b4 = ScalarType.uint(3, 4)
+    uint4b8 = ScalarType.uint(4, 8)
+    uint8b128 = ScalarType.uint(8, 128)
+
+    # colloquial names
+    bfloat16 = float16_e8m7
+    float16 = float16_e5m10
diff --git a/sgl-kernel/python/sgl_kernel/sparse_flash_attn.py b/sgl-kernel/python/sgl_kernel/sparse_flash_attn.py
new file mode 100644
index 000000000..29b2f0405
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/sparse_flash_attn.py
@@ -0,0 +1,293 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+
+def maybe_contiguous(x):
+    return x.contiguous() if x is not None and x.stride(-1) != 1 else x
+
+
+# Sparse attention utils
+def convert_vertical_slash_indexes(
+    q_seqlens: torch.Tensor,  # [BATCH, ]
+    kv_seqlens: torch.Tensor,  # [BATCH, ]
+    vertical_indexes: torch.Tensor,  # [BATCH, N_HEADS, NNZ_V]
+    slash_indexes: torch.Tensor,  # [BATCH, N_HEADS, NNZ_S]
+    context_size: int,
+    block_size_M: int,
+    block_size_N: int,
+    causal: bool = True,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    batch_size = slash_indexes.size(0)
+    num_heads = slash_indexes.size(1)
+    nnz_slash = slash_indexes.size(2)
+    nnz_vertical = vertical_indexes.size(2)
+    num_rows = (context_size + block_size_M - 1) // block_size_M
+
+    block_count = torch.zeros(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    block_offset = torch.zeros(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_slash,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
+    column_count = torch.zeros(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    column_index = torch.zeros(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_vertical,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
+
+    torch.ops.sgl_kernel.convert_vertical_slash_indexes.default(
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        q_seqlens,
+        kv_seqlens,
+        vertical_indexes,
+        slash_indexes,
+        context_size,
+        block_size_M,
+        block_size_N,
+        causal,
+    )
+    return block_count, block_offset, column_count, column_index
+
+
+def convert_vertical_slash_indexes_mergehead(
+    q_seqlens: torch.Tensor,  # [BATCH, ]
+    kv_seqlens: torch.Tensor,  # [BATCH, ]
+    vertical_indexes: torch.Tensor,  # [BATCH, N_HEADS, NNZ_V]
+    slash_indexes: torch.Tensor,  # [BATCH, N_HEADS, NNZ_S]
+    # [N_HEADS] : different head use different number of indices
+    vertical_indices_count: torch.Tensor,
+    slash_indices_count: torch.Tensor,
+    context_size: int,
+    block_size_M: int,
+    block_size_N: int,
+    causal: bool = True,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    batch_size = slash_indexes.size(0)
+    num_heads = slash_indexes.size(1)
+    nnz_slash = slash_indexes.size(2)
+    nnz_vertical = vertical_indexes.size(2)
+    num_rows = (context_size + block_size_M - 1) // block_size_M
+
+    block_count = torch.empty(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    block_offset = torch.empty(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_slash,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
+    column_count = torch.empty(
+        batch_size, num_heads, num_rows, dtype=q_seqlens.dtype, device=q_seqlens.device
+    )
+    column_index = torch.empty(
+        batch_size,
+        num_heads,
+        num_rows,
+        nnz_vertical,
+        dtype=q_seqlens.dtype,
+        device=q_seqlens.device,
+    )
+
+    torch.ops.sgl_kernel.convert_vertical_slash_indexes_mergehead.default(
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        q_seqlens,
+        kv_seqlens,
+        vertical_indexes,
+        slash_indexes,
+        vertical_indices_count,
+        slash_indices_count,
+        context_size,
+        block_size_M,
+        block_size_N,
+        causal,
+    )
+    return block_count, block_offset, column_count, column_index
+
+
+def sparse_attn_func(
+    q,
+    k,
+    v,
+    block_count,
+    block_offset,
+    column_count,
+    column_index,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    softcap=0.0,  # 0.0 means deactivated
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+    *,
+    return_softmax_lse=False,
+    out=None,
+):
+    """Compute attention with vertical and slash sparsity patterns.
+    Most Arguments are the same with the flash_attn_func interface, except for 4 extra args:
+    block_count and block_offset for slash sparsity patterns, and
+    column_count and column_index for vertical sparsity patterns.
+    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.
+
+    Arguments:
+        q: (batch_size, seqlen, nheads, headdim)
+        k: (batch_size, seqlen, nheads_k, headdim)
+        v: (batch_size, seqlen, nheads_k, headdim)
+        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
+        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
+        dropout_p: float. Dropout probability.
+        softmax_scale: float. The scaling of QK^T before applying softmax.
+            Default to 1 / sqrt(headdim).
+        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+            is added to the attention score of query i and key j.
+        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+            which is slightly slower and uses more memory. The forward pass is always deterministic.
+        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
+           testing only. The returned probabilities are not guaranteed to be correct
+           (they might not have the right scaling).
+    Return:
+        out: (batch_size, seqlen, nheads, headdim).
+        softmax_lse [optional, if return_softmax_lse=True]: (batch_size, nheads, seqlen). The
+            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+            normalization factor).
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    out, softmax_lse = torch.ops.sgl_kernel.fwd_sparse.default(
+        q,
+        k,
+        v,
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        out,
+        alibi_slopes,
+        dropout_p,
+        softmax_scale,
+        causal,
+        softcap,
+        return_attn_probs and dropout_p > 0,
+        None,
+    )
+    return (out, softmax_lse) if return_softmax_lse else out
+
+
+def sparse_attn_varlen_func(
+    q,
+    k,
+    v,
+    block_count,
+    block_offset,
+    column_count,
+    column_index,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    softcap=0.0,  # 0.0 means deactivated
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+    *,
+    return_softmax_lse=False,
+    out=None,
+):
+    """Compute attention with vertical and slash sparsity patterns.
+    Most Arguments are the same with the flash_attn_varlen_func interface, except for 4 extra args:
+    block_count and block_offset for slash sparsity patterns, and
+    column_count and column_index for vertical sparsity patterns.
+    For more details please refer to Appendix C.4.2 of paper https://arxiv.org/abs/2407.02490.
+
+    Arguments:
+        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
+        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.
+        block_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        block_offset: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_S)
+        column_count: (batch_size, nheads, cdiv(seqlen, BLOCK_M))
+        column_index: (batch_size, nheads, cdiv(seqlen, BLOCK_M), NNZ_V)
+        cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into q.
+        cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+           of the sequences in the batch, used to index into kv.
+        max_seqlen_q: int. Maximum query sequence length in the batch.
+        max_seqlen_k: int. Maximum key sequence length in the batch.
+        dropout_p: float. Dropout probability.
+        softmax_scale: float. The scaling of QK^T before applying softmax.
+            Default to 1 / sqrt(headdim).
+        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
+        softcap: float. Anything > 0 activates softcapping attention.
+        alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of
+            (-alibi_slope * |i + seqlen_k - seqlen_q - j|)
+            is added to the attention score of query i and key j.
+        deterministic: bool. Whether to use the deterministic implementation of the backward pass,
+            which is slightly slower and uses more memory. The forward pass is always deterministic.
+        return_attn_probs: bool. Whether to return the attention probabilities. This option is for
+           testing only. The returned probabilities are not guaranteed to be correct
+           (they might not have the right scaling).
+    Return:
+        out: (total, nheads, headdim).
+        softmax_lse [optional, if return_softmax_lse=True]: (nheads, total_q_seqlen). The
+            logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax
+            normalization factor).
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+
+    q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
+    out, softmax_lse = torch.ops.sgl_kernel.varlen_fwd_sparse.default(
+        q,
+        k,
+        v,
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        out,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        None,
+        alibi_slopes,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale,
+        False,
+        causal,
+        softcap,
+        return_attn_probs and dropout_p > 0,
+        None,
+    )
+    return (out, softmax_lse) if return_softmax_lse else out
diff --git a/sgl-kernel/python/sgl_kernel/spatial.py b/sgl-kernel/python/sgl_kernel/spatial.py
new file mode 100644
index 000000000..201ec7b75
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/spatial.py
@@ -0,0 +1,63 @@
+import torch
+from torch.cuda.streams import ExternalStream
+
+try:
+    from . import spatial_ops  # triggers TORCH extension registration
+except Exception as _e:
+    _spatial_import_error = _e
+else:
+    _spatial_import_error = None
+
+_IMPORT_ERROR = ImportError(
+    "Failed to load sgl_kernel.spatial_ops extension. Ensure CUDA Driver >= 12.4"
+)
+
+
+def create_greenctx_stream_by_value(
+    SM_a: int, SM_b: int, device_id: int = None
+) -> tuple[ExternalStream, ExternalStream]:
+    """
+    Create two streams for greenctx.
+    Args:
+        sm_A (int): The SM of stream A.
+        sm_B (int): The weight of stream B.
+        device_id (int): The device id.
+    Returns:
+        tuple[ExternalStream, ExternalStream]: The two streams.
+    """
+    if _spatial_import_error is not None:
+        raise _IMPORT_ERROR from _spatial_import_error
+    if device_id is None:
+        device_id = torch.cuda.current_device()
+
+    res = torch.ops.sgl_kernel.create_greenctx_stream_by_value(SM_a, SM_b, device_id)
+
+    stream_a = ExternalStream(
+        stream_ptr=res[0], device=torch.device(f"cuda:{device_id}")
+    )
+    stream_b = ExternalStream(
+        stream_ptr=res[1], device=torch.device(f"cuda:{device_id}")
+    )
+
+    return stream_a, stream_b
+
+
+def get_sm_available(device_id: int = None) -> int:
+    """
+    Get the SMs available on the device.
+    Args:
+        device_id (int): The device id.
+    Returns:
+        int: The SMs available.
+    """
+    if _spatial_import_error is not None:
+        raise _IMPORT_ERROR from _spatial_import_error
+    if device_id is None:
+        device_id = torch.cuda.current_device()
+
+    device_props = torch.cuda.get_device_properties(device_id)
+
+    # Get the number of Streaming Multiprocessors (SMs)
+    sm_count = device_props.multi_processor_count
+
+    return sm_count
diff --git a/sgl-kernel/python/sgl_kernel/speculative.py b/sgl-kernel/python/sgl_kernel/speculative.py
new file mode 100644
index 000000000..ea2e3ac8a
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/speculative.py
@@ -0,0 +1,107 @@
+import torch
+from sgl_kernel.utils import get_cuda_stream
+
+
+def tree_speculative_sampling_target_only(
+    predicts: torch.Tensor,  # mutable
+    accept_index: torch.Tensor,  # mutable
+    accept_token_num: torch.Tensor,  # mutable
+    candidates: torch.Tensor,
+    retrive_index: torch.Tensor,
+    retrive_next_token: torch.Tensor,
+    retrive_next_sibling: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    uniform_samples_for_final_sampling: torch.Tensor,
+    target_probs: torch.Tensor,
+    draft_probs: torch.Tensor,
+    threshold_single: float = 1.0,
+    threshold_acc: float = 1.0,
+    deterministic: bool = True,
+) -> None:
+    torch.ops.sgl_kernel.tree_speculative_sampling_target_only.default(
+        predicts,
+        accept_index,
+        accept_token_num,
+        candidates,
+        retrive_index,
+        retrive_next_token,
+        retrive_next_sibling,
+        uniform_samples,
+        uniform_samples_for_final_sampling,
+        target_probs,
+        draft_probs,
+        threshold_single,
+        threshold_acc,
+        deterministic,
+        get_cuda_stream(),
+    )
+
+
+def verify_tree_greedy(
+    predicts: torch.Tensor,  # mutable
+    accept_index: torch.Tensor,  # mutable
+    accept_token_num: torch.Tensor,  # mutable
+    candidates: torch.Tensor,
+    retrive_index: torch.Tensor,
+    retrive_next_token: torch.Tensor,
+    retrive_next_sibling: torch.Tensor,
+    target_predict: torch.Tensor,
+) -> None:
+    torch.ops.sgl_kernel.verify_tree_greedy.default(
+        predicts,
+        accept_index,
+        accept_token_num,
+        candidates,
+        retrive_index,
+        retrive_next_token,
+        retrive_next_sibling,
+        target_predict,
+        get_cuda_stream(),
+    )
+
+
+def build_tree_kernel_efficient(
+    parent_list: torch.Tensor,
+    selected_index: torch.Tensor,
+    verified_seq_len: torch.Tensor,
+    tree_mask: torch.Tensor,
+    positions: torch.Tensor,
+    retrive_index: torch.Tensor,
+    retrive_next_token: torch.Tensor,
+    retrive_next_sibling: torch.Tensor,
+    topk: int,
+    depth: int,
+    draft_token_num: int,
+    tree_mask_mode: int,
+) -> None:
+    torch.ops.sgl_kernel.build_tree_kernel_efficient.default(
+        parent_list,
+        selected_index,
+        verified_seq_len,
+        tree_mask,
+        positions,
+        retrive_index,
+        retrive_next_token,
+        retrive_next_sibling,
+        topk,
+        depth,
+        draft_token_num,
+        tree_mask_mode,
+    )
+
+
+def segment_packbits(
+    x: torch.Tensor,
+    input_indptr: torch.Tensor,
+    output_indptr: torch.Tensor,
+    y: torch.Tensor,
+    batch_size: int,
+) -> None:
+    torch.ops.sgl_kernel.segment_packbits.default(
+        x,
+        input_indptr,
+        output_indptr,
+        y,
+        batch_size,
+        torch.cuda.current_stream().cuda_stream,
+    )
diff --git a/sgl-kernel/python/sgl_kernel/testing/__init__.py b/sgl-kernel/python/sgl_kernel/testing/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/sgl-kernel/python/sgl_kernel/testing/rotary_embedding.py b/sgl-kernel/python/sgl_kernel/testing/rotary_embedding.py
new file mode 100644
index 000000000..e26208048
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/testing/rotary_embedding.py
@@ -0,0 +1,217 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pytest
+import torch
+from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
+
+
+# vLLM torch native
+def _apply_rotary_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    is_neox_style: bool,
+) -> torch.Tensor:
+    """
+    Args:
+        x: [num_tokens, num_heads, head_size]
+        cos: [num_tokens, head_size // 2]
+        sin: [num_tokens, head_size // 2]
+        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
+            positional embeddings.
+    """
+    cos = cos.unsqueeze(-2).to(x.dtype)
+    sin = sin.unsqueeze(-2).to(x.dtype)
+    if is_neox_style:
+        x1, x2 = torch.chunk(x, 2, dim=-1)
+    else:
+        x1 = x[..., ::2]
+        x2 = x[..., 1::2]
+    o1 = x1 * cos - x2 * sin
+    o2 = x2 * cos + x1 * sin
+    if is_neox_style:
+        return torch.cat((o1, o2), dim=-1)
+    else:
+        return torch.stack((o1, o2), dim=-1).flatten(-2)
+
+
+class RotaryEmbedding(torch.nn.Module):
+    # Reference: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.dtype = dtype
+
+        cache = self._compute_cos_sin_cache()
+        self.cos_sin_cache: torch.Tensor
+        self.register_buffer("cos_sin_cache", cache, persistent=False)
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        """Compute the cos and sin cache."""
+        inv_freq = self._compute_inv_freq(self.base)
+        t = torch.arange(self.max_position_embeddings, dtype=torch.float)
+
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """A PyTorch-native implementation of forward()."""
+        if offsets is not None:
+            positions = positions + offsets
+
+        positions = positions.flatten()
+        num_tokens = positions.shape[0]
+        cos_sin = self.cos_sin_cache.index_select(0, positions)
+
+        # Modification: float32 is required for the rotary embedding to work correctly
+        query = query.to(torch.float32)
+        key = key.to(torch.float32)
+
+        cos, sin = cos_sin.chunk(2, dim=-1)
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+
+        # Modification: convert to the correct dtype
+        query = query.to(self.dtype)
+        key = key.to(self.dtype)
+        return query, key
+
+
+class FlashInferRotaryEmbedding(RotaryEmbedding):
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        apply_rope_with_cos_sin_cache_inplace(
+            positions=positions,
+            query=query,
+            key=key,
+            fused_set_kv_buffer_arg=fused_set_kv_buffer_arg,
+            head_size=self.head_size,
+            cos_sin_cache=self.cos_sin_cache,
+            is_neox=self.is_neox_style,
+        )
+
+        return query, key
+
+
+class MHATokenToKVPool:
+    KV_POOL_SIZE = 16384
+
+    def __init__(
+        self,
+        head_num: int,
+        head_dim: int,
+    ):
+        self.head_num = head_num
+        self.head_dim = head_dim
+        self.size = MHATokenToKVPool.KV_POOL_SIZE
+        self.page_size = 1
+        self.store_dtype = torch.bfloat16
+        self.device = "cuda"
+        self.layer_num = 1
+        self.start_layer = 0
+        self._create_buffers()
+
+    def _create_buffers(self):
+        self.k_buffer = [
+            torch.zeros(
+                (self.size + self.page_size, self.head_num, self.head_dim),
+                dtype=self.store_dtype,
+                device=self.device,
+            )
+            for _ in range(self.layer_num)
+        ]
+        self.v_buffer = [
+            torch.zeros(
+                (self.size + self.page_size, self.head_num, self.head_dim),
+                dtype=self.store_dtype,
+                device=self.device,
+            )
+            for _ in range(self.layer_num)
+        ]
+
+    def set_kv_buffer(
+        self,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ):
+        layer_id = 0
+        self.k_buffer[layer_id - self.start_layer][loc] = cache_k
+        self.v_buffer[layer_id - self.start_layer][loc] = cache_v
+
+
+def create_inputs(
+    head_size: int,
+    batch_size: int,
+    seq_len: int,
+    device,
+    dtype: torch.dtype,
+    num_q_heads: int,
+    num_kv_heads: int,
+):
+    pos_ids = torch.arange(seq_len, device=device).repeat(batch_size)
+    query = torch.randn(
+        batch_size * seq_len, num_q_heads * head_size, dtype=dtype, device=device
+    )
+    key = torch.randn(
+        batch_size * seq_len, num_kv_heads * head_size, dtype=dtype, device=device
+    )
+    value = torch.randn(
+        batch_size * seq_len, num_kv_heads * head_size, dtype=dtype, device=device
+    )
+    out_cache_loc = torch.randperm(
+        MHATokenToKVPool.KV_POOL_SIZE, dtype=torch.int64, device=device
+    )[: batch_size * seq_len].clone()
+
+    return dict(
+        pos_ids=pos_ids, query=query, key=key, value=value, out_cache_loc=out_cache_loc
+    )
diff --git a/sgl-kernel/python/sgl_kernel/top_k.py b/sgl-kernel/python/sgl_kernel/top_k.py
new file mode 100644
index 000000000..fc29a6db8
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/top_k.py
@@ -0,0 +1,11 @@
+import torch
+
+
+def fast_topk(values, topk, dim):
+    if topk == 1:
+        # Use max along the specified dimension to get both value and index
+        return torch.max(values, dim=dim, keepdim=True)
+    else:
+        # Use topk for efficiency with larger k values
+        # TODO: implement faster cuda kernels for large vocab sizes
+        return torch.topk(values, topk, dim=dim)
diff --git a/sgl-kernel/python/sgl_kernel/utils.py b/sgl-kernel/python/sgl_kernel/utils.py
new file mode 100644
index 000000000..f2fa0b617
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/utils.py
@@ -0,0 +1,50 @@
+# Copyright 2025 SGLang Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import functools
+from typing import Dict, Tuple
+
+import torch
+
+
+def get_cuda_stream() -> int:
+    return torch.cuda.current_stream().cuda_stream
+
+
+_cache_buf: Dict[Tuple[str, torch.device], torch.Tensor] = {}
+
+
+def _get_cache_buf(name: str, bytes: int, device: torch.device) -> torch.Tensor:
+    key = (name, device)
+    buf = _cache_buf.get(key)
+    if buf is None:
+        buf = torch.empty(bytes, dtype=torch.uint8, device=device)
+        _cache_buf[key] = buf
+    return buf
+
+
+def _to_tensor_scalar_tuple(x):
+    if isinstance(x, torch.Tensor):
+        return (x, 0)
+    else:
+        return (None, x)
+
+
+@functools.lru_cache(maxsize=1)
+def is_arch_support_pdl() -> bool:
+    # Hopper arch's compute capability == 9.0
+    device = torch.cuda.current_device()
+    major, minor = torch.cuda.get_device_capability(device)
+    return major >= 9
diff --git a/sgl-kernel/python/sgl_kernel/version.py b/sgl-kernel/python/sgl_kernel/version.py
new file mode 100644
index 000000000..c2c2a2efd
--- /dev/null
+++ b/sgl-kernel/python/sgl_kernel/version.py
@@ -0,0 +1 @@
+__version__ = "0.3.9.post2"
diff --git a/sgl-kernel/rename_wheels.sh b/sgl-kernel/rename_wheels.sh
new file mode 100755
index 000000000..018eeb27b
--- /dev/null
+++ b/sgl-kernel/rename_wheels.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -ex
+
+WHEEL_DIR="dist"
+
+wheel_files=($WHEEL_DIR/*.whl)
+for wheel in "${wheel_files[@]}"; do
+    intermediate_wheel="${wheel/linux/manylinux2014}"
+
+    # Extract the current python version from the wheel name
+    if [[ $intermediate_wheel =~ -cp([0-9]+)- ]]; then
+        cp_version="${BASH_REMATCH[1]}"
+    else
+        echo "Could not extract Python version from wheel name: $intermediate_wheel"
+        continue
+    fi
+
+    # Detect CUDA version and add appropriate suffix
+    if ls /usr/local/ | grep -q "12.4"; then
+        new_wheel="${intermediate_wheel/-cp${cp_version}/+cu124-cp${cp_version}}"
+    elif ls /usr/local/ | grep -q "12.8"; then
+        new_wheel="${intermediate_wheel/-cp${cp_version}/+cu128-cp${cp_version}}"
+    else
+        new_wheel="$intermediate_wheel"
+    fi
+
+    if [[ "$wheel" != "$new_wheel" ]]; then
+        echo "Renaming $wheel to $new_wheel"
+        mv -- "$wheel" "$new_wheel"
+    fi
+done
+echo "Wheel renaming completed."
diff --git a/sgl-kernel/setup_hip.py b/sgl-kernel/setup_hip.py
new file mode 100644
index 000000000..d7691b53b
--- /dev/null
+++ b/sgl-kernel/setup_hip.py
@@ -0,0 +1,100 @@
+# Copyright 2025 SGLang Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import platform
+import sys
+from pathlib import Path
+
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+root = Path(__file__).parent.resolve()
+arch = platform.machine().lower()
+
+
+def _get_version():
+    with open(root / "pyproject.toml") as f:
+        for line in f:
+            if line.startswith("version"):
+                return line.split("=")[1].strip().strip('"')
+
+
+operator_namespace = "sgl_kernel"
+include_dirs = [
+    root / "include",
+    root / "csrc",
+]
+
+sources = [
+    "csrc/allreduce/custom_all_reduce.hip",
+    "csrc/allreduce/quick_all_reduce.cu",
+    "csrc/common_extension_rocm.cc",
+    "csrc/elementwise/activation.cu",
+    "csrc/grammar/apply_token_bitmask_inplace_cuda.cu",
+    "csrc/moe/moe_align_kernel.cu",
+    "csrc/moe/moe_topk_softmax_kernels.cu",
+    "csrc/speculative/eagle_utils.cu",
+    "csrc/kvcacheio/transfer.cu",
+]
+
+cxx_flags = [
+    "-O3",
+    "-Wno-switch-bool",
+    "-Wno-macro-redefined",
+    "-Wno-deprecated-declarations",
+    "-w",
+]
+libraries = ["c10", "torch", "torch_python"]
+extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib", f"-L/usr/lib/{arch}-linux-gnu"]
+
+hipcc_flags = [
+    "-fPIC",
+    "-O3",
+    "-std=c++17",
+    "-D__HIP_PLATFORM_HCC__=1",
+    "--offload-arch=gfx928",
+    "--offload-arch=gfx936",
+    "--gpu-max-threads-per-block=1024",
+    "-Wno-macro-redefined",
+    "-Wno-deprecated-declarations",
+    "-funroll-loops",
+    "-Rpass-analysis=unroll-loops",
+    "-w",
+]
+
+ext_modules = [
+    CUDAExtension(
+        name="sgl_kernel.common_ops",
+        sources=sources,
+        include_dirs=include_dirs,
+        extra_compile_args={
+            "nvcc": hipcc_flags,
+            "cxx": cxx_flags,
+        },
+        libraries=libraries,
+        extra_link_args=extra_link_args,
+        py_limited_api=False,
+    ),
+]
+
+setup(
+    name="sgl-kernel",
+    version=_get_version(),
+    packages=find_packages(where="python"),
+    package_dir={"": "python"},
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": BuildExtension.with_options(use_ninja=True)},
+    options={"bdist_wheel": {"py_limited_api": "cp39"}},
+)
\ No newline at end of file
diff --git a/sgl-kernel/setup_rocm.py b/sgl-kernel/setup_rocm.py
new file mode 100644
index 000000000..6e3466ec3
--- /dev/null
+++ b/sgl-kernel/setup_rocm.py
@@ -0,0 +1,116 @@
+# Copyright 2025 SGLang Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import platform
+import sys
+from pathlib import Path
+
+import torch
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+root = Path(__file__).parent.resolve()
+arch = platform.machine().lower()
+
+
+def _get_version():
+    with open(root / "pyproject.toml") as f:
+        for line in f:
+            if line.startswith("version"):
+                return line.split("=")[1].strip().strip('"')
+
+
+operator_namespace = "sgl_kernel"
+include_dirs = [
+    root / "include",
+    root / "include" / "impl",
+    root / "csrc",
+]
+
+sources = [
+    "csrc/allreduce/custom_all_reduce.hip",
+    "csrc/allreduce/quick_all_reduce.cu",
+    "csrc/common_extension_rocm.cc",
+    "csrc/elementwise/activation.cu",
+    "csrc/grammar/apply_token_bitmask_inplace_cuda.cu",
+    "csrc/moe/moe_align_kernel.cu",
+    "csrc/moe/moe_topk_softmax_kernels.cu",
+    "csrc/speculative/eagle_utils.cu",
+    "csrc/kvcacheio/transfer.cu",
+]
+
+cxx_flags = ["-O3"]
+libraries = ["hiprtc", "amdhip64", "c10", "torch", "torch_python"]
+extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib", f"-L/usr/lib/{arch}-linux-gnu"]
+
+default_target = "gfx942"
+amdgpu_target = os.environ.get("AMDGPU_TARGET", default_target)
+
+if torch.cuda.is_available():
+    try:
+        amdgpu_target = torch.cuda.get_device_properties(0).gcnArchName.split(":")[0]
+    except Exception as e:
+        print(f"Warning: Failed to detect GPU properties: {e}")
+else:
+    print(f"Warning: torch.cuda not available. Using default target: {amdgpu_target}")
+
+if amdgpu_target not in ["gfx942", "gfx950"]:
+    print(
+        f"Warning: Unsupported GPU architecture detected '{amdgpu_target}'. Expected 'gfx942' or 'gfx950'."
+    )
+    sys.exit(1)
+
+fp8_macro = (
+    "-DHIP_FP8_TYPE_FNUZ" if amdgpu_target == "gfx942" else "-DHIP_FP8_TYPE_E4M3"
+)
+
+hipcc_flags = [
+    "-DNDEBUG",
+    f"-DOPERATOR_NAMESPACE={operator_namespace}",
+    "-O3",
+    "-Xcompiler",
+    "-fPIC",
+    "-std=c++17",
+    f"--amdgpu-target={amdgpu_target}",
+    "-DENABLE_BF16",
+    "-DENABLE_FP8",
+    fp8_macro,
+]
+
+ext_modules = [
+    CUDAExtension(
+        name="sgl_kernel.common_ops",
+        sources=sources,
+        include_dirs=include_dirs,
+        extra_compile_args={
+            "nvcc": hipcc_flags,
+            "cxx": cxx_flags,
+        },
+        libraries=libraries,
+        extra_link_args=extra_link_args,
+        py_limited_api=False,
+    ),
+]
+
+setup(
+    name="sgl-kernel",
+    version=_get_version(),
+    packages=find_packages(where="python"),
+    package_dir={"": "python"},
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": BuildExtension.with_options(use_ninja=True)},
+    options={"bdist_wheel": {"py_limited_api": "cp39"}},
+)
diff --git a/sgl-kernel/tests/spatial/test_greenctx_stream.py b/sgl-kernel/tests/spatial/test_greenctx_stream.py
new file mode 100644
index 000000000..c57bc3360
--- /dev/null
+++ b/sgl-kernel/tests/spatial/test_greenctx_stream.py
@@ -0,0 +1,25 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from sgl_kernel import create_greenctx_stream_by_value, get_sm_available
+
+
+def test_green_ctx():
+    A = torch.randn(5120, 5120).cuda()
+    B = torch.randn(5120, 5120).cuda()
+    C = torch.matmul(A, B)
+    sm_counts = get_sm_available(0)
+    stream_group = create_greenctx_stream_by_value(sm_counts // 2, sm_counts // 2, 0)
+    with torch.cuda.stream(stream_group[0]):
+        for _ in range(100):
+            result_0 = torch.matmul(A, B)
+    with torch.cuda.stream(stream_group[1]):
+        for _ in range(100):
+            result_1 = torch.matmul(A, B)
+    torch.cuda.synchronize()
+    assert torch.allclose(result_0, C)
+    assert torch.allclose(result_1, C)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/speculative/test_eagle_utils.py b/sgl-kernel/tests/speculative/test_eagle_utils.py
new file mode 100644
index 000000000..503355387
--- /dev/null
+++ b/sgl-kernel/tests/speculative/test_eagle_utils.py
@@ -0,0 +1,87 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from sgl_kernel import verify_tree_greedy
+
+
+def test_verify_tree_greedy():
+    candidates = torch.tensor(
+        [
+            [0, 1, 2, 3, 4, 5],
+            [7, 8, 9, 10, 11, 12],
+        ],
+        dtype=torch.int64,
+        device="cuda",
+    )
+    retrive_index = torch.tensor(
+        [
+            [0, 1, 2, 3, 4, 5],
+            [6, 7, 8, 9, 10, 11],
+        ],
+        dtype=torch.int64,
+        device="cuda",
+    )
+    retrive_next_token = torch.tensor(
+        [
+            [1, 2, -1, 4, 5, -1],
+            [4, 2, 3, -1, 5, -1],
+        ],
+        dtype=torch.int64,
+        device="cuda",
+    )
+    retrive_next_sibling = torch.tensor(
+        [
+            [-1, 3, -1, -1, -1, -1],
+            [-1, -1, -1, -1, 1, -1],
+        ],
+        dtype=torch.int64,
+        device="cuda",
+    )
+
+    target_logits = torch.full((2, 6, 20), 1, dtype=torch.float32, device="cuda")
+    target_logits[0, 0, 3] = 10
+    target_logits[0, 3, 4] = 10
+    target_logits[0, 4, 5] = 10
+    target_logits[1, 0, 11] = 10
+    target_logits[1, 4, 12] = 10
+    for i in range(target_logits.shape[0]):
+        for j in range(target_logits.shape[1]):
+            if torch.max(target_logits[i][j]) < 10:
+                target_logits[i][j][18] = 10
+
+    target_predict = torch.argmax(target_logits, dim=-1)
+    predict_shape = (12,)
+
+    bs = candidates.shape[0]
+    num_spec_step = 4
+
+    predicts = torch.full(
+        predict_shape, -1, dtype=torch.int32, device="cuda"
+    )  # mutable
+    accept_index = torch.full(
+        (bs, num_spec_step), -1, dtype=torch.int32, device="cuda"
+    )  # mutable
+    accept_token_num = torch.full((bs,), 0, dtype=torch.int32, device="cuda")  # mutable
+
+    verify_tree_greedy(
+        predicts=predicts,
+        accept_index=accept_index,
+        accept_token_num=accept_token_num,
+        candidates=candidates,
+        retrive_index=retrive_index,
+        retrive_next_token=retrive_next_token,
+        retrive_next_sibling=retrive_next_sibling,
+        target_predict=target_predict,
+    )
+
+    # Check the expected output.
+    assert predicts.tolist() == [3, -1, -1, 4, 5, 18, 11, -1, -1, -1, 12, 18]
+    assert accept_index.tolist() == [
+        [0, 3, 4, 5],
+        [6, 10, 11, -1],
+    ]
+    assert accept_token_num.tolist() == [3, 2]
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/speculative/test_speculative_sampling.py b/sgl-kernel/tests/speculative/test_speculative_sampling.py
new file mode 100644
index 000000000..a9b59bb2e
--- /dev/null
+++ b/sgl-kernel/tests/speculative/test_speculative_sampling.py
@@ -0,0 +1,129 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from sgl_kernel import tree_speculative_sampling_target_only
+
+test_cases = [
+    (
+        1,
+        1,
+        [3, -1, -1, 4, 5, 18, 11, -1, -1, -1, 12, 18],
+        [[0, 3, 4, 5], [6, 10, 11, -1]],
+        [3, 2],
+    ),
+    (
+        0,  # threshold_single
+        0,  # threshold_acc
+        [1, 2, 18, -1, -1, -1, 11, -1, -1, -1, 12, 18],
+        [[0, 1, 2, -1], [6, 10, 11, -1]],
+        [2, 2],
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "threshold_single, threshold_acc, expected_predicts, expected_accept_index, expected_accept_token_num",
+    test_cases,
+)
+def test_tree_speculative_sampling_target_only(
+    threshold_single,
+    threshold_acc,
+    expected_predicts,
+    expected_accept_index,
+    expected_accept_token_num,
+):
+    """
+    Tests the tree_speculative_sampling_target_only function using Pytest parameterization.
+    """
+    device = "cuda"
+
+    candidates = torch.tensor(
+        [
+            [0, 1, 2, 3, 4, 5],
+            [7, 8, 9, 10, 11, 12],
+        ],
+        dtype=torch.int64,
+        device=device,
+    )
+    retrive_index = torch.tensor(
+        [
+            [0, 1, 2, 3, 4, 5],
+            [6, 7, 8, 9, 10, 11],
+        ],
+        dtype=torch.int64,
+        device=device,
+    )
+    retrive_next_token = torch.tensor(
+        [
+            [1, 2, -1, 4, 5, -1],
+            [4, 2, 3, -1, 5, -1],
+        ],
+        dtype=torch.int64,
+        device=device,
+    )
+    retrive_next_sibling = torch.tensor(
+        [
+            [-1, 3, -1, -1, -1, -1],
+            [-1, -1, -1, -1, 1, -1],
+        ],
+        dtype=torch.int64,
+        device=device,
+    )
+
+    target_logits = torch.full((2, 6, 20), 1, dtype=torch.float32, device=device)
+    target_logits[0, 0, 3] = 10
+    target_logits[0, 3, 4] = 10
+    target_logits[0, 4, 5] = 10
+    target_logits[1, 0, 11] = 10
+    target_logits[1, 4, 12] = 10
+
+    for i in range(target_logits.shape[0]):
+        for j in range(target_logits.shape[1]):
+            if torch.max(target_logits[i, j]) < 10:
+                target_logits[i, j, 18] = 10
+
+    temperatures = torch.tensor([0.01, 0.01], dtype=torch.float32, device=device)
+    bs, num_draft_tokens = candidates.shape
+    num_spec_step = len(expected_accept_index[0])
+    predict_shape = (len(expected_predicts),)
+
+    predicts = torch.full(predict_shape, -1, dtype=torch.int32, device=device)
+    accept_index = torch.full((bs, num_spec_step), -1, dtype=torch.int32, device=device)
+    accept_token_num = torch.full((bs,), 0, dtype=torch.int32, device=device)
+
+    expanded_temperature = temperatures.unsqueeze(1).unsqueeze(1)
+    target_probs = F.softmax(target_logits / expanded_temperature, dim=-1)
+    draft_probs = torch.full_like(target_probs, 0, dtype=torch.float32, device=device)
+    coins = torch.rand(bs, num_draft_tokens, device=device, dtype=torch.float32)
+    coins_for_final_sampling = torch.rand(bs, device=device).to(torch.float32)
+
+    tree_speculative_sampling_target_only(
+        predicts=predicts,
+        accept_index=accept_index,
+        accept_token_num=accept_token_num,
+        candidates=candidates,
+        retrive_index=retrive_index,
+        retrive_next_token=retrive_next_token,
+        retrive_next_sibling=retrive_next_sibling,
+        uniform_samples=coins,
+        uniform_samples_for_final_sampling=coins_for_final_sampling,
+        target_probs=target_probs,
+        draft_probs=draft_probs,
+        threshold_single=threshold_single,
+        threshold_acc=threshold_acc,
+        deterministic=True,
+    )
+
+    assert (
+        predicts.tolist() == expected_predicts
+    ), f"Predicts mismatch for thresholds ({threshold_single}, {threshold_acc})"
+    assert (
+        accept_index.tolist() == expected_accept_index
+    ), f"Accept index mismatch for thresholds ({threshold_single}, {threshold_acc})"
+    assert (
+        accept_token_num.tolist() == expected_accept_token_num
+    ), f"Accept token num mismatch for thresholds ({threshold_single}, {threshold_acc})"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_activation.py b/sgl-kernel/tests/test_activation.py
new file mode 100644
index 000000000..43593441e
--- /dev/null
+++ b/sgl-kernel/tests/test_activation.py
@@ -0,0 +1,39 @@
+# Adapted from https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/tests/test_activation.py
+
+import pytest
+import sgl_kernel
+import torch
+
+
+@pytest.mark.parametrize("dim", [128, 256, 512, 2048, 4096, 11008, 16384])
+@pytest.mark.parametrize("batch_size", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("seq_len", [1, 2, 4, 8, 16, 32, 64, 128, 512])
+def test_fused_silu_mul(dim, batch_size, seq_len):
+    x = torch.randn(batch_size, seq_len, 2 * dim).to(0).to(torch.float16)
+    y_ref = x[..., dim:] * torch.nn.functional.silu(x[..., :dim])
+    y = sgl_kernel.silu_and_mul(x)
+    torch.testing.assert_close(y_ref, y, rtol=1e-3, atol=1e-3)
+
+
+@pytest.mark.parametrize("dim", [128, 256, 512, 2048, 4096, 11008, 16384])
+@pytest.mark.parametrize("batch_size", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("seq_len", [1, 2, 4, 8, 16, 32, 64, 128, 512])
+def test_fused_gelu_tanh_mul(dim, batch_size, seq_len):
+    x = torch.randn(batch_size, seq_len, 2 * dim).to(0).to(torch.float16)
+    y_ref = x[..., dim:] * torch.nn.functional.gelu(x[..., :dim], approximate="tanh")
+    y = sgl_kernel.gelu_tanh_and_mul(x)
+    torch.testing.assert_close(y_ref, y, rtol=1e-3, atol=1e-3)
+
+
+@pytest.mark.parametrize("dim", [128, 256, 512, 2048, 4096, 11008, 16384])
+@pytest.mark.parametrize("batch_size", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("seq_len", [1, 2, 4, 8, 16, 32, 64, 128, 512])
+def test_fused_gelu_mul(dim, batch_size, seq_len):
+    x = torch.randn(batch_size, seq_len, 2 * dim).to(0).to(torch.float16)
+    y_ref = x[..., dim:] * torch.nn.functional.gelu(x[..., :dim], approximate="none")
+    y = sgl_kernel.gelu_and_mul(x)
+    torch.testing.assert_close(y_ref, y, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_apply_token_bitmask_inplace.py b/sgl-kernel/tests/test_apply_token_bitmask_inplace.py
new file mode 100644
index 000000000..480479134
--- /dev/null
+++ b/sgl-kernel/tests/test_apply_token_bitmask_inplace.py
@@ -0,0 +1,23 @@
+import pytest
+import torch
+from sgl_kernel import apply_token_bitmask_inplace_cuda
+
+
+def test_apply_token_bitmask_inplace_kernel():
+    neginf = float("-inf")
+    bool_mask = torch.tensor([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=torch.bool)
+    logits = torch.tensor(
+        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dtype=torch.float32
+    )
+    expected = torch.where(bool_mask, logits, neginf)
+
+    logits_gpu = logits.to("cuda")
+    bitmask = torch.tensor([0b1010101010], dtype=torch.int32).to("cuda")
+    apply_token_bitmask_inplace_cuda(logits_gpu, bitmask)
+    torch.cuda.synchronize()
+    torch.testing.assert_close(logits_gpu, expected.to("cuda"))
+
+
+if __name__ == "__main__":
+    test_apply_token_bitmask_inplace_kernel()
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_awq_dequant.py b/sgl-kernel/tests/test_awq_dequant.py
new file mode 100644
index 000000000..da68e88d0
--- /dev/null
+++ b/sgl-kernel/tests/test_awq_dequant.py
@@ -0,0 +1,115 @@
+import itertools
+from typing import Optional, Tuple
+
+import pytest
+import torch
+from sgl_kernel import awq_dequantize
+
+
+def reverse_awq_order(t: torch.Tensor):
+    bits = 4
+    AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
+    reverse_order_tensor = torch.arange(
+        t.shape[-1],
+        dtype=torch.int32,
+        device=t.device,
+    )
+    reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
+    reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
+    reverse_order_tensor = reverse_order_tensor.view(-1)
+
+    t = t[:, reverse_order_tensor] & 0xF
+    return t
+
+
+# qweights - [R     , C // 8], int32
+# scales   - [R // G, C     ], float16
+# zeros    - [R // G, C // 8], int32
+def awq_dequantize_torch(
+    qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, group_size: int
+) -> torch.Tensor:
+
+    if group_size == -1:
+        group_size = qweight.shape[0]
+
+    bits = 4
+    shifts = torch.arange(0, 32, bits, device=qzeros.device)
+
+    iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
+        torch.int8
+    )
+
+    iweights = iweights.view(iweights.shape[0], -1)
+
+    zeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to(
+        torch.int8
+    )
+    zeros = zeros.view(qzeros.shape[0], -1)
+    zeros = reverse_awq_order(zeros)
+
+    iweights = reverse_awq_order(iweights)
+
+    iweights = torch.bitwise_and(iweights, (2**bits) - 1)
+    zeros = torch.bitwise_and(zeros, (2**bits) - 1)
+
+    scales = scales.repeat_interleave(group_size, dim=0)
+    zeros = zeros.repeat_interleave(group_size, dim=0)
+    return (iweights - zeros) * scales
+
+
+def sglang_awq_dequantize(
+    qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor
+) -> torch.Tensor:
+    return awq_dequantize(qweight, scales, qzeros)
+
+
+@pytest.mark.parametrize(
+    "qweight_row,qweight_col,is_bf16_act",
+    list(
+        itertools.product(
+            [3584, 18944, 128, 256, 512, 1024, 1536],
+            [448, 576, 4736, 16, 32, 64, 128, 72],
+            [True, False],
+        )
+    ),
+)
+def test_awq_dequant_compare_implementations(
+    qweight_row: int, qweight_col: int, is_bf16_act: bool
+):
+    device = torch.device("cuda")
+    qweight = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (qweight_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+    group_size = qweight_row
+    scales_row = qweight_row // group_size
+    scales_col = qweight_col * 8
+
+    if is_bf16_act:
+        scales = torch.rand(scales_row, scales_col, dtype=torch.bfloat16, device=device)
+    else:
+        scales = torch.rand(scales_row, scales_col, dtype=torch.float16, device=device)
+
+    qzeros = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (scales_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+
+    # Run both implementations
+    torch_out = awq_dequantize_torch(qweight, scales, qzeros, group_size)
+    sglang_out = sglang_awq_dequantize(qweight, scales, qzeros)
+
+    # Compare results
+    torch.testing.assert_close(
+        torch_out.to(torch.float32), sglang_out.to(torch.float32), rtol=1e-3, atol=1e-5
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_bmm_fp8.py b/sgl-kernel/tests/test_bmm_fp8.py
new file mode 100644
index 000000000..e0be92896
--- /dev/null
+++ b/sgl-kernel/tests/test_bmm_fp8.py
@@ -0,0 +1,43 @@
+# Adapted from https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/tests/test_bmm_fp8.py
+
+import pytest
+import torch
+import torch.nn.functional as F
+from sgl_kernel import bmm_fp8
+
+
+def to_float8(x, dtype=torch.float8_e4m3fn):
+    finfo = torch.finfo(dtype)
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    scale = finfo.max / amax
+    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
+    return x_scl_sat.to(dtype), scale.float().reciprocal()
+
+
+@pytest.mark.parametrize("input_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+@pytest.mark.parametrize("mat2_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+@pytest.mark.parametrize("res_dtype", [torch.bfloat16, torch.float16])
+def test_bmm_fp8(input_dtype, mat2_dtype, res_dtype):
+    if input_dtype == torch.float8_e5m2 and mat2_dtype == torch.float8_e5m2:
+        pytest.skip("Invalid combination: both input and mat2 are e5m2")
+
+    input = torch.randn([16, 48, 64], device="cuda", dtype=torch.bfloat16)
+    input_fp8, input_inv_s = to_float8(input, dtype=input_dtype)
+
+    # mat2 row  major -> column major
+    mat2 = torch.randn([16, 80, 64], device="cuda", dtype=torch.bfloat16).transpose(
+        -2, -1
+    )
+    mat2_fp8, mat2_inv_s = to_float8(mat2, dtype=mat2_dtype)
+
+    res = torch.empty([16, 48, 80], device="cuda", dtype=res_dtype)
+    bmm_fp8(input_fp8, mat2_fp8, input_inv_s, mat2_inv_s, res_dtype, res)
+
+    reference = torch.bmm(input, mat2)
+    cos_sim = F.cosine_similarity(reference.reshape(-1), res.reshape(-1), dim=0)
+    assert cos_sim > 0.99
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_causal_conv1d.py b/sgl-kernel/tests/test_causal_conv1d.py
new file mode 100644
index 000000000..a10e1f45e
--- /dev/null
+++ b/sgl-kernel/tests/test_causal_conv1d.py
@@ -0,0 +1,489 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/tests/kernels/mamba/test_causal_conv1d.py
+from typing import Optional
+
+import torch
+from sgl_kernel import causal_conv1d_fwd
+from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
+PAD_SLOT_ID = -1
+
+
+def causal_conv1d_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    query_start_loc: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    conv_states: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+):
+    """
+    x: (batch, dim, seqlen) or (dim,cu_seq_len) for varlen
+        sequences are concatenated from left to right for varlen
+    weight: (dim, width)
+    bias: (dim,)
+    query_start_loc: (batch + 1) int32
+        The cumulative sequence lengths of the sequences in
+        the batch, used to index into sequence. prepended by 0.
+        for example: query_start_loc = torch.Tensor([0,10,16,17]),
+        x.shape=(dim,17)
+    cache_indices: (batch)  int32
+        indicates the corresponding state index,
+        like so: conv_state = conv_states[cache_indices[batch_id]]
+    has_initial_state: (batch) bool
+        indicates whether should the kernel take the current state as initial
+        state for the calculations
+    conv_states: (...,dim,width - 1) itype
+        updated inplace if provided
+    activation: either None or "silu" or "swish"
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+    out: (batch, dim, seqlen)
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    bias = bias.contiguous() if bias is not None else None
+
+    causal_conv1d_fwd(
+        x,
+        weight,
+        bias,
+        conv_states,
+        query_start_loc,
+        cache_indices,
+        has_initial_state,
+        activation in ["silu", "swish"],
+        pad_slot_id,
+    )
+    return x
+
+
+def causal_conv1d_update(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    activation: Optional[str] = None,
+    cache_seqlens: Optional[torch.Tensor] = None,
+    conv_state_indices: Optional[torch.Tensor] = None,
+    pad_slot_id: int = PAD_SLOT_ID,
+):
+    """
+    x: (batch, dim) or (batch, dim, seqlen)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
+    weight: (dim, width)
+    bias: (dim,)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state
+        starting at the index
+        @cache_seqlens % state_len.
+    conv_state_indices: (batch,), dtype int32
+        If not None, the conv_state is a larger tensor along the batch dim,
+        and we are selecting the batch coords specified by conv_state_indices.
+        Useful for a continuous batching scenario.
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError(
+            f"activation must be None, silu, or swish, actual: {activation}"
+        )
+    activation_val = activation in ["silu", "swish"]
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        x = x.unsqueeze(-1)
+    causal_conv1d_update_kernel(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation_val,
+        cache_seqlens,
+        conv_state_indices,
+        pad_slot_id,
+    )
+    if unsqueeze:
+        x = x.squeeze(-1)
+    return x
+
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+
+def causal_conv1d_ref(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    initial_states: Optional[torch.Tensor] = None,
+    return_final_states: bool = False,
+    final_states_out: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+):
+    """
+    x: (batch, dim, seqlen)
+    weight: (dim, width)
+    bias: (dim,)
+    initial_states: (batch, dim, width - 1)
+    final_states_out: (batch, dim, width - 1)
+
+    out: (batch, dim, seqlen)
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    dtype_in = x.dtype
+    x = x.to(weight.dtype)
+    seqlen = x.shape[-1]
+    dim, width = weight.shape
+    if initial_states is None:
+        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)
+    else:
+        x = torch.cat([initial_states, x], dim=-1)
+        out = F.conv1d(x, weight.unsqueeze(1), bias, padding=0, groups=dim)
+    out = out[..., :seqlen]
+    if return_final_states:
+        final_states = F.pad(x, (width - 1 - x.shape[-1], 0)).to(
+            dtype_in
+        )  # (batch, dim, width - 1)
+        if final_states_out is not None:
+            final_states_out.copy_(final_states)
+        else:
+            final_states_out = final_states
+    out = (out if activation is None else F.silu(out)).to(dtype=dtype_in)
+    return (out, None) if not return_final_states else (out, final_states_out)
+
+
+def causal_conv1d_update_ref(
+    x, conv_state, weight, bias=None, activation=None, cache_seqlens=None
+):
+    """
+    x: (batch, dim) or (batch, dim, seqlen)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
+    weight: (dim, width)
+    bias: (dim,)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the
+        conv_state starting at the index
+        @cache_seqlens % state_len before performing the convolution.
+
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    dtype_in = x.dtype
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        x = x.unsqueeze(-1)
+    batch, dim, seqlen = x.shape
+    width = weight.shape[1]
+    state_len = conv_state.shape[-1]
+    assert conv_state.shape == (batch, dim, state_len)
+    assert weight.shape == (dim, width)
+    if cache_seqlens is None:
+        x_new = torch.cat([conv_state, x], dim=-1).to(
+            weight.dtype
+        )  # (batch, dim, state_len + seqlen)
+        conv_state.copy_(x_new[:, :, -state_len:])
+    else:
+        width_idx = torch.arange(
+            -(width - 1), 0, dtype=torch.long, device=x.device
+        ).unsqueeze(0) + cache_seqlens.unsqueeze(1)
+        width_idx = (
+            torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
+        )
+        x_new = torch.cat([conv_state.gather(2, width_idx), x], dim=-1).to(weight.dtype)
+        copy_idx = torch.arange(seqlen, dtype=torch.long, device=x.device).unsqueeze(
+            0
+        ) + cache_seqlens.unsqueeze(1)
+        copy_idx = torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
+        conv_state.scatter_(2, copy_idx, x)
+    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, groups=dim)[
+        :, :, -seqlen:
+    ]
+    if unsqueeze:
+        out = out.squeeze(-1)
+    return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
+
+
+@pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
+@pytest.mark.parametrize("silu_activation", [True])
+@pytest.mark.parametrize("has_bias", [True])
+@pytest.mark.parametrize("has_initial_state", [True, False])
+@pytest.mark.parametrize("width", [4])
+@pytest.mark.parametrize(
+    "seqlen", [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096]
+)
+@pytest.mark.parametrize("dim", [64])
+@pytest.mark.parametrize("batch", [1])
+def test_causal_conv1d(
+    batch, dim, seqlen, width, has_bias, silu_activation, has_initial_state, itype
+):
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
+    if itype == torch.bfloat16:
+        rtol, atol = 1e-2, 5e-2
+    x = torch.randn(batch, dim, seqlen, device=device, dtype=itype).contiguous()
+
+    weight = torch.randn(dim, width, device=device, dtype=itype)
+    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
+    if has_initial_state:
+        initial_states = torch.randn(batch, dim, width - 1, device=device, dtype=itype)
+        has_initial_state_tensor = torch.ones(batch, dtype=torch.bool, device=x.device)
+    else:
+        initial_states = None
+        has_initial_state_tensor = None
+    x_ref = x.clone()
+    weight_ref = weight.clone()
+    bias_ref = bias.clone() if bias is not None else None
+    initial_states_ref = initial_states.clone() if initial_states is not None else None
+    activation = None if not silu_activation else "silu"
+    out = causal_conv1d_fn(
+        x,
+        weight,
+        bias,
+        activation=activation,
+        conv_states=initial_states,
+        has_initial_state=has_initial_state_tensor,
+    )
+    out_ref, final_states_ref = causal_conv1d_ref(
+        x_ref,
+        weight_ref,
+        bias_ref,
+        initial_states=initial_states_ref,
+        return_final_states=True,
+        activation=activation,
+    )
+    if has_initial_state:
+        assert initial_states is not None and final_states_ref is not None
+        assert torch.allclose(initial_states, final_states_ref, rtol=rtol, atol=atol)
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("itype", [torch.bfloat16])
+@pytest.mark.parametrize("silu_activation", [False, True])
+@pytest.mark.parametrize("has_bias", [False, True])
+@pytest.mark.parametrize("seqlen", [1])
+@pytest.mark.parametrize("width", [4])
+@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
+def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, itype):
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
+    if itype == torch.bfloat16:
+        rtol, atol = 1e-2, 5e-2
+
+    batch = 2
+    x = torch.randn(batch, dim, seqlen, device=device, dtype=itype)
+    x_ref = x.clone()
+    conv_state = torch.randn(batch, dim, width - 1, device=device, dtype=itype)
+
+    weight = torch.randn(dim, width, device=device, dtype=itype)
+    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
+    conv_state_ref = conv_state.detach().clone()
+    activation = None if not silu_activation else "silu"
+    out = causal_conv1d_update(x, conv_state, weight, bias, activation=activation)
+    out_ref = causal_conv1d_update_ref(
+        x_ref, conv_state_ref, weight, bias, activation=activation
+    )
+
+    assert torch.equal(conv_state, conv_state_ref)
+    assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("silu_activation", [False, True])
+@pytest.mark.parametrize("has_bias", [False, True])
+@pytest.mark.parametrize("seqlen", [1, 4, 5])
+@pytest.mark.parametrize("width", [2, 3, 4])
+@pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096])
+# tests correctness in case subset of the sequences are padded
+@pytest.mark.parametrize("with_padding", [True, False])
+def test_causal_conv1d_update_with_batch_gather(
+    with_padding, dim, width, seqlen, has_bias, silu_activation, itype
+):
+    device = "cuda"
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
+    if itype == torch.bfloat16:
+        rtol, atol = 1e-2, 5e-2
+
+    batch_size = 3
+    padding = 5 if with_padding else 0
+    padded_batch_size = batch_size + padding
+    total_entries = 10 * batch_size
+
+    x = torch.randn(padded_batch_size, dim, 1, device=device, dtype=itype)
+    x_ref = x.clone()
+
+    conv_state_indices = torch.randperm(total_entries)[:batch_size].to(
+        dtype=torch.int32, device=device
+    )
+    unused_states_bool = torch.ones(total_entries, dtype=torch.bool, device=device)
+    unused_states_bool[conv_state_indices] = False
+    padded_state_indices = torch.concat(
+        [
+            conv_state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=0,
+    )
+    conv_state = torch.randn(total_entries, dim, width - 1, device=device, dtype=itype)
+    conv_state_for_padding_test = conv_state.clone()
+
+    weight = torch.randn(dim, width, device=device, dtype=itype)
+    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
+    conv_state_ref = conv_state[conv_state_indices, :].detach().clone()
+    activation = None if not silu_activation else "silu"
+    out = causal_conv1d_update(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation=activation,
+        conv_state_indices=padded_state_indices,
+        pad_slot_id=PAD_SLOT_ID,
+    )
+    out_ref = causal_conv1d_update_ref(
+        x_ref[:batch_size], conv_state_ref, weight, bias, activation=activation
+    )
+
+    assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref)
+    assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol)
+    assert torch.equal(
+        conv_state[unused_states_bool], conv_state_for_padding_test[unused_states_bool]
+    )
+
+
+@pytest.mark.parametrize("itype", [torch.bfloat16])
+@pytest.mark.parametrize("silu_activation", [True])
+@pytest.mark.parametrize("has_bias", [True])
+@pytest.mark.parametrize("width", [4])
+@pytest.mark.parametrize(
+    "seqlen", [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 2049, 4096]
+)
+@pytest.mark.parametrize("dim", [64, 4096])
+# tests correctness in case subset of the sequences are padded
+@pytest.mark.parametrize("with_padding", [True, False])
+def test_causal_conv1d_varlen(
+    with_padding, dim, seqlen, width, has_bias, silu_activation, itype
+):
+    device = "cuda"
+    torch.cuda.empty_cache()
+    rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
+    if itype == torch.bfloat16:
+        rtol, atol = 1e-2, 5e-2
+
+    seqlens = []
+    batch_size = 4
+    if seqlen < 10:
+        batch_size = 1
+    padding = 3 if with_padding else 0
+    padded_batch_size = batch_size + padding
+    nsplits = padded_batch_size - 1
+
+    eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values
+    seqlens.append(
+        torch.diff(
+            torch.cat([torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])
+        ).tolist()
+    )
+    assert sum(seqlens[-1]) == seqlen
+    assert all(s > 0 for s in seqlens[-1])
+
+    total_entries = batch_size * 10
+    cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32)
+    cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0)
+    x = torch.randn(1, 4096 + dim + 64, seqlen, device=device, dtype=itype)[
+        :, 4096 : 4096 + dim, :
+    ]
+    weight = torch.randn(dim, width, device=device, dtype=itype)
+    bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
+    x_ref = x.clone()
+    weight_ref = weight.clone()
+    bias_ref = bias.clone() if bias is not None else None
+    activation = None if not silu_activation else "silu"
+    final_states = torch.randn(
+        total_entries, dim, width - 1, device=x.device, dtype=x.dtype
+    )
+    final_states_ref = final_states.clone()
+    has_initial_states = torch.randint(
+        0, 2, (cumsum.shape[0] - 1,), dtype=torch.bool, device=x.device
+    )
+    state_indices = torch.randperm(total_entries, dtype=torch.int32, device=x.device)[
+        :batch_size
+    ]
+    padded_state_indices = torch.concat(
+        [
+            state_indices,
+            torch.as_tensor([PAD_SLOT_ID] * padding, dtype=torch.int32, device=device),
+        ],
+        dim=-1,
+    )
+
+    out = causal_conv1d_fn(
+        x.squeeze(0),
+        weight,
+        bias,
+        cumsum.cuda(),
+        padded_state_indices,
+        has_initial_states,
+        final_states,
+        activation,
+        PAD_SLOT_ID,
+    )
+    out_ref = []
+    out_ref_b = []
+
+    splits = [torch.split(var, seqlens[0], dim=-1) for var in (x_ref)]
+    for i in range(len(seqlens[0])):
+        x_s = [v[i].unsqueeze(0) for v in splits][0]
+        if padded_state_indices[i] == PAD_SLOT_ID:
+            continue
+        out_ref_b.append(
+            causal_conv1d_ref(
+                x_s,
+                weight_ref,
+                bias_ref,
+                activation=activation,
+                return_final_states=True,
+                final_states_out=final_states_ref[padded_state_indices[i]].unsqueeze(0),
+                initial_states=(
+                    final_states_ref[padded_state_indices[i]].unsqueeze(0)
+                    if has_initial_states[i]
+                    else None
+                ),
+            )
+        )
+    out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2))
+    out_ref_tensor = torch.cat(out_ref, dim=0)
+
+    unpadded_out = out[:, : out_ref_tensor.shape[-1]]
+    assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
+    assert torch.allclose(
+        final_states[state_indices],
+        final_states_ref[state_indices],
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_custom_allreduce.py b/sgl-kernel/tests/test_custom_allreduce.py
new file mode 100644
index 000000000..d729392c3
--- /dev/null
+++ b/sgl-kernel/tests/test_custom_allreduce.py
@@ -0,0 +1,185 @@
+import ctypes
+import multiprocessing as mp
+import random
+import socket
+import unittest
+from typing import Any, List, Optional
+
+import sgl_kernel.allreduce as custom_ops
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+
+
+def _run_correctness_worker(world_size, rank, distributed_init_port, test_sizes):
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+    dist.init_process_group(
+        backend="nccl",
+        init_method=distributed_init_method,
+        rank=rank,
+        world_size=world_size,
+    )
+    group = dist.group.WORLD
+
+    try:
+        device = torch.device(f"cuda:{rank}")
+        max_size = 8192 * 1024
+        meta_ptrs = TestCustomAllReduce.create_shared_buffer(
+            custom_ops.meta_size() + max_size, group=group
+        )
+
+        rank_data = torch.empty(8 * 1024 * 1024, dtype=torch.uint8, device=device)
+        buffer_ptrs = TestCustomAllReduce.create_shared_buffer(max_size, group=group)
+
+        custom_ptr = custom_ops.init_custom_ar(meta_ptrs, rank_data, rank, True)
+        custom_ops.register_buffer(custom_ptr, buffer_ptrs)
+
+        test_loop = 10
+        for sz in test_sizes:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                for _ in range(test_loop):
+                    inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device)
+                    inp1_ref = inp1.clone()
+                    out1 = torch.empty_like(inp1)
+
+                    custom_ops.all_reduce(
+                        custom_ptr, inp1, out1, buffer_ptrs[rank], max_size
+                    )
+
+                    dist.all_reduce(inp1_ref, group=group)
+
+                    torch.testing.assert_close(out1, inp1_ref)
+
+    finally:
+        dist.barrier(group=group)
+        if custom_ptr is not None:
+            custom_ops.dispose(custom_ptr)
+        if buffer_ptrs:
+            TestCustomAllReduce.free_shared_buffer(buffer_ptrs, group)
+        if meta_ptrs:
+            TestCustomAllReduce.free_shared_buffer(meta_ptrs, group)
+
+        dist.destroy_process_group(group=group)
+
+
+def get_open_port() -> int:
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("127.0.0.1", 0))
+            return s.getsockname()[1]
+    except OSError:
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("::1", 0))
+            return s.getsockname()[1]
+
+
+def multi_process_parallel(
+    world_size: int, test_target: Any, target_args: tuple = ()
+) -> None:
+    mp.set_start_method("spawn", force=True)
+
+    procs = []
+    distributed_init_port = get_open_port()
+    for i in range(world_size):
+        proc_args = (world_size, i, distributed_init_port) + target_args
+        proc = mp.Process(target=test_target, args=proc_args, name=f"Worker-{i}")
+        proc.start()
+        procs.append(proc)
+
+    for i in range(world_size):
+        procs[i].join()
+        assert (
+            procs[i].exitcode == 0
+        ), f"Process {i} failed with exit code {procs[i].exitcode}"
+
+
+class TestCustomAllReduce(unittest.TestCase):
+    test_sizes = [
+        512,
+        2560,
+        4096,
+        5120,
+        7680,
+        32768,
+        262144,
+        524288,
+        1048576,
+        2097152,
+    ]
+    world_sizes = [2, 4, 8]
+
+    @staticmethod
+    def create_shared_buffer(
+        size_in_bytes: int, group: Optional[ProcessGroup] = None
+    ) -> List[int]:
+        lib = CudaRTLibrary()
+        pointer = lib.cudaMalloc(size_in_bytes)
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        if group is None:
+            group = dist.group.WORLD
+        world_size = dist.get_world_size(group=group)
+        rank = dist.get_rank(group=group)
+
+        handle_bytes = ctypes.string_at(ctypes.addressof(handle), ctypes.sizeof(handle))
+        input_tensor = torch.ByteTensor(list(handle_bytes)).to(f"cuda:{rank}")
+        gathered_tensors = [torch.empty_like(input_tensor) for _ in range(world_size)]
+        dist.all_gather(gathered_tensors, input_tensor, group=group)
+
+        handles = []
+        handle_type = type(handle)
+        for tensor in gathered_tensors:
+            bytes_list = tensor.cpu().tolist()
+            bytes_data = bytes(bytes_list)
+            handle_obj = handle_type()
+            ctypes.memmove(ctypes.addressof(handle_obj), bytes_data, len(bytes_data))
+            handles.append(handle_obj)
+
+        pointers: List[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer.value)
+            else:
+                try:
+                    opened_ptr = lib.cudaIpcOpenMemHandle(h)
+                    pointers.append(opened_ptr.value)
+                except Exception as e:
+                    print(f"Rank {rank}: Failed to open IPC handle from rank {i}: {e}")
+                    raise
+
+        dist.barrier(group=group)
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(
+        pointers: List[int], group: Optional[ProcessGroup] = None
+    ) -> None:
+        if group is None:
+            group = dist.group.WORLD
+        rank = dist.get_rank(group=group)
+        lib = CudaRTLibrary()
+        if pointers and len(pointers) > rank and pointers[rank] is not None:
+            lib.cudaFree(ctypes.c_void_p(pointers[rank]))
+        dist.barrier(group=group)
+
+    def test_correctness(self):
+        for world_size in self.world_sizes:
+            available_gpus = torch.cuda.device_count()
+            if world_size > available_gpus:
+                print(
+                    f"Skipping world_size={world_size}, requires {world_size} GPUs, found {available_gpus}"
+                )
+                continue
+
+            print(f"Running test for world_size={world_size}")
+            multi_process_parallel(
+                world_size, _run_correctness_worker, target_args=(self.test_sizes,)
+            )
+            print(f"custom allreduce tp = {world_size}: OK")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/sgl-kernel/tests/test_cutlass_mla.py b/sgl-kernel/tests/test_cutlass_mla.py
new file mode 100644
index 000000000..71de8327a
--- /dev/null
+++ b/sgl-kernel/tests/test_cutlass_mla.py
@@ -0,0 +1,104 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from sgl_kernel import cutlass_mla_decode, cutlass_mla_get_workspace_size
+from torch import Tensor
+
+# Disable tests on SM103 until the accuracy issues are fixed.
+if torch.cuda.get_device_capability() != (10, 0):
+    pytest.skip(
+        reason="Cutlass MLA Requires compute capability of 10.",
+        allow_module_level=True,
+    )
+
+
+def ref_mla(
+    out: Tensor,  # (bs, num_heads, v_head_dim)
+    query: Tensor,  # (bs, num_heads, head_dim)
+    kv_cache: Tensor,  # (num_blocks, block_size, head_dim)
+    scale: float,
+    block_tables: Tensor,  # (bs, max_num_blocks)
+    seq_lens: Tensor,  # (bs,)
+):
+    bs, num_heads, v_head_dim = out.shape
+    head_dim = query.shape[2]
+
+    for i in range(bs):
+        # gather and flatten KV-cache
+        kv = kv_cache[block_tables[i]]  # (max_num_blocks, block_size, head_dim)
+        kv = kv.view(1, -1, head_dim)[:, : seq_lens[i]]  # (1, seq_len, head_dim)
+        v = kv[:, :, :v_head_dim]
+
+        q = query[i].view(num_heads, 1, head_dim)
+        o = F.scaled_dot_product_attention(q, kv, v, scale=scale, enable_gqa=True)
+        out[i] = o.view(num_heads, v_head_dim)
+
+    return out
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("mean_seq_len", [128, 1024, 4096])
+@pytest.mark.parametrize("bs", [1, 2, 4])
+@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.parametrize("block_size", [1, 16, 64, 128])
+@pytest.mark.parametrize("num_heads", [16, 32, 64, 128])
+@pytest.mark.parametrize("num_kv_splits", [-1, 1])
+def test_cutlass_mla_decode(
+    dtype: torch.dtype,
+    mean_seq_len: int,
+    bs: int,
+    varlen: bool,
+    block_size: int,
+    num_heads: int,
+    num_kv_splits: int,
+):
+    torch.set_default_dtype(dtype)
+    torch.set_default_device("cuda")
+    torch.manual_seed(42)
+
+    d = 576
+    h_q = num_heads
+    dv = 512
+
+    q_nope_dim = 128
+    q_pe_dim = 64
+    scale = (q_nope_dim + q_pe_dim) ** (-0.5)
+    if varlen:
+        seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
+        seq_lens = seq_lens.clip(2).to(torch.int32)
+    else:
+        seq_lens = torch.full((bs,), mean_seq_len, dtype=torch.int32)
+    max_seq_len = seq_lens.max().item()
+    block_num = (max_seq_len + block_size - 1) // block_size
+
+    # Pad block_num so that small blocks can be packed into full 128-sized CUTLASS tiles.
+    # One 128-wide tile can hold (128 // block_size) small blocks.
+    pack_factor = 128 // block_size
+    block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
+
+    # Lager q values to detect split kv error
+    q = torch.randn(bs, h_q, d) * 100.0
+    block_table = torch.randint(0, bs * block_num, (bs, block_num), dtype=torch.int32)
+
+    kv_cache = torch.randn(block_table.numel(), block_size, d)
+
+    workspace_size = cutlass_mla_get_workspace_size(
+        block_num * block_size, bs, num_kv_splits=num_kv_splits
+    )
+    workspace = torch.empty(workspace_size, device="cuda", dtype=torch.uint8)
+
+    q_nope = torch.empty((h_q, bs, dv)).transpose(0, 1)
+    q_nope.copy_(q[:, :, :dv])
+    q_pe = q[:, :, dv:].clone()
+
+    out_ref = q.new_zeros(bs, h_q, dv)
+    ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)
+    out = cutlass_mla_decode(
+        q_nope, q_pe, kv_cache, seq_lens, block_table, workspace, scale, num_kv_splits
+    )
+
+    torch.testing.assert_close(out, out_ref, atol=1e-2, rtol=1e-2)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_cutlass_w4a8_moe_mm.py b/sgl-kernel/tests/test_cutlass_w4a8_moe_mm.py
new file mode 100644
index 000000000..3f9e60077
--- /dev/null
+++ b/sgl-kernel/tests/test_cutlass_w4a8_moe_mm.py
@@ -0,0 +1,283 @@
+import pytest
+import torch
+from sgl_kernel import cutlass_w4a8_moe_mm, sgl_per_tensor_quant_fp8
+from utils import is_hopper
+
+
+def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
+    if int4_values_interleaved.shape[-1] % 2 != 0:
+        raise ValueError(
+            "the last dim size of int4_values_interleaved tensor must be even."
+        )
+
+    input_tensor_int8 = int4_values_interleaved.to(torch.int8)
+
+    low_nibbles = input_tensor_int8[..., 0::2]
+    high_nibbles = input_tensor_int8[..., 1::2]
+
+    packed_tensor = (high_nibbles << 4) | (low_nibbles & 0x0F)
+
+    return packed_tensor.to(torch.int8)
+
+
+def pack_interleave(num_experts, ref_weight, ref_scale):
+    n, k = ref_weight.shape[1], ref_weight.shape[2]
+
+    weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
+    w_q = weight.view((num_experts, n, k // 2)).view(torch.int8)
+    w_q = w_q.contiguous()
+
+    alignment = 4 if k % 512 == 0 else 1
+    scale_interleaved = ref_scale.reshape(
+        ref_scale.shape[0],
+        ref_scale.shape[1],
+        (ref_scale.shape[2] // alignment),
+        alignment,
+    )  # [E, N, K/4, 4]
+    scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
+    scale_interleaved = scale_interleaved.reshape(
+        ref_scale.shape[0],
+        ref_scale.shape[2] // alignment,
+        ref_scale.shape[1] * alignment,
+    )  # [E, K/4, N*4]
+    w_scale = scale_interleaved.contiguous()
+
+    return w_q, w_scale
+
+
+@pytest.mark.skipif(
+    not is_hopper(),
+    reason="cutlass_w4a8_moe_mm is only supported on sm90",
+)
+@pytest.mark.parametrize("batch_size", [1, 2, 4, 8, 16])
+def test_int4_fp8_grouped_gemm_single_expert(batch_size):
+    # Test parameters
+    num_experts = 1
+    m = batch_size  # batch size
+    k = 512  # input dimension
+    n = 1024  # output dimension
+    torch.manual_seed(0)
+    dtype = torch.bfloat16
+    device = "cuda"
+    debug = False
+
+    print(f"\nTesting with batch_size={batch_size}")
+
+    # Create input tensors with ones
+    if debug:
+        a = torch.ones(m, k, dtype=torch.bfloat16, device=device)
+        ref_w = torch.ones(num_experts, n, k, dtype=torch.int8, device=device)
+        ref_w_scale = torch.ones(num_experts, n, k // 128, dtype=dtype, device=device)
+    else:
+        a = torch.randn(m, k, dtype=dtype, device=device)
+        ref_w = torch.randint(
+            -8, 8, (num_experts, n, k), dtype=torch.int8, device=device
+        )
+        affine_coeff = 0.005
+        ref_w_scale = (
+            torch.randn(num_experts, n, k // 128, dtype=dtype, device=device)
+            * affine_coeff
+        )
+
+    w, w_scale = pack_interleave(num_experts, ref_w, ref_w_scale)
+
+    # Create expert offsets and problem sizes
+    expert_offsets = torch.tensor([0, m], dtype=torch.int32, device=device)
+    problem_sizes = torch.tensor([[n, m, k]], dtype=torch.int32, device=device)
+
+    a_strides = torch.full((num_experts, 3), k, device=device, dtype=torch.int64)
+    c_strides = torch.full((num_experts, 3), n, device=device, dtype=torch.int64)
+    b_strides = a_strides
+    s_strides = c_strides
+
+    # Quantize input
+    a_q, a_scale = _per_tensor_quant_fp8(a)
+
+    # Create output tensor
+    c = torch.empty((m, n), dtype=torch.bfloat16, device=device)
+    cutlass_w4a8_moe_mm(
+        c,
+        a_q,
+        w,
+        a_scale,
+        w_scale,
+        expert_offsets[:-1],
+        problem_sizes,
+        a_strides,
+        b_strides,
+        c_strides,
+        s_strides,
+        128,
+        8,
+    )
+    c = c.to(dtype)
+
+    # Reference implementation
+    experts_selection_result = torch.full((m,), 0)
+    c_ref = ref_grouped_gemm(
+        c, a_q, a_scale, ref_w, ref_w_scale, num_experts, experts_selection_result
+    )
+
+    # Compare results
+    try:
+        torch.testing.assert_close(c, c_ref, rtol=1e-2, atol=0.1)
+    except AssertionError as e:
+        # torch.set_printoptions(threshold=10_000)
+        print(f"  FAILURE: tensors are NOT close.")
+        print(f"    Ref tensor: {c_ref.flatten()}")
+        print(f"    Cutlass tensor: {c.flatten()}")
+        print(
+            f"    Max absolute difference: {torch.max(torch.abs(c.to(c_ref.dtype) - c_ref))}"
+        )
+        print(
+            f"    Mean absolute difference: {torch.mean(torch.abs(c.to(c_ref.dtype) - c_ref))}"
+        )
+        print(f"    AssertionError: {e}")
+        raise
+
+
+def _per_tensor_quant_fp8(
+    x: torch.Tensor,
+    dtype: torch.dtype = torch.float8_e4m3fn,
+):
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        1,
+        device=x.device,
+        dtype=torch.float32,
+    )
+    sgl_per_tensor_quant_fp8(x, x_q, x_s, is_static=False)
+    return x_q, x_s
+
+
+@pytest.mark.skipif(
+    not is_hopper(),
+    reason="cutlass_w4a8_moe_mm is only supported on sm90",
+)
+@pytest.mark.parametrize("batch_size", [2, 4, 8, 16, 32])
+@pytest.mark.parametrize("k", [512, 1024, 2048, 4096, 7168])
+@pytest.mark.parametrize("n", [256, 512, 1024, 2048])
+@pytest.mark.parametrize("num_experts", [2, 4, 6, 8])
+def test_int4_fp8_grouped_gemm_multi_experts(batch_size, k, n, num_experts):
+    torch.manual_seed(0)
+    dtype = torch.bfloat16
+    device = "cuda"
+    debug = False
+
+    print(
+        f"\nTesting with batch_size={batch_size}, k={k}, n={n}, num_experts={num_experts}"
+    )
+
+    if debug:
+        a = torch.ones(batch_size, k, dtype=torch.bfloat16, device=device)
+        ref_w = torch.ones(num_experts, n, k, dtype=torch.int8, device=device)
+        ref_w_scale = torch.ones(num_experts, n, k // 128, dtype=dtype, device=device)
+    else:
+        a = torch.randn(batch_size, k, dtype=dtype, device=device)
+        ref_w = torch.randint(
+            -8, 8, (num_experts, n, k), dtype=torch.int8, device=device
+        )
+        affine_coeff = 0.005
+        ref_w_scale = (
+            torch.randn(num_experts, n, k // 128, dtype=dtype, device=device)
+            * affine_coeff
+        )
+
+    w, w_scale = pack_interleave(num_experts, ref_w, ref_w_scale)
+
+    # random select experts
+    experts_selection_result = torch.randint(
+        0, num_experts, (batch_size,), device=device
+    )
+    permutation = torch.argsort(experts_selection_result)
+    expert_token_counts = torch.bincount(
+        experts_selection_result, minlength=num_experts
+    )
+
+    # Create problem sizes and offsets for active experts
+    problem_sizes = []
+    for i in range(num_experts):
+        problem_sizes.append([n, expert_token_counts[i].item(), k])
+    problem_sizes = torch.tensor(problem_sizes, dtype=torch.int32, device=device)
+
+    expert_offsets = []
+    offset = 0
+    for i in range(num_experts):
+        expert_offsets.append(offset)
+        offset += problem_sizes[i][1].item()
+    expert_offsets = torch.tensor(expert_offsets, dtype=torch.int32, device=device)
+
+    # Permute input and quantize
+    a_q, a_scale = _per_tensor_quant_fp8(a)
+    a_q_perm = a_q[permutation]
+
+    # Create stride tensors
+    a_strides = torch.full((num_experts, 3), k, device=device, dtype=torch.int64)
+    c_strides = torch.full((num_experts, 3), n, device=device, dtype=torch.int64)
+    b_strides = a_strides
+    s_strides = c_strides
+
+    c_perm = torch.empty((batch_size, n), dtype=torch.bfloat16, device=device)
+    cutlass_w4a8_moe_mm(
+        c_perm,
+        a_q_perm,
+        w,
+        a_scale,
+        w_scale,
+        expert_offsets,
+        problem_sizes,
+        a_strides,
+        b_strides,
+        c_strides,
+        s_strides,
+        128,
+        8,
+    )
+
+    # Un-permute the result
+    c = torch.empty_like(c_perm)
+    c[permutation] = c_perm
+    c = c.to(dtype)
+
+    c_ref = ref_grouped_gemm(
+        c, a_q, a_scale, ref_w, ref_w_scale, num_experts, experts_selection_result
+    )
+
+    # Compare results
+    try:
+        torch.testing.assert_close(c, c_ref, rtol=1e-2, atol=0.1)
+    except AssertionError as e:
+        print(f"  FAILURE: tensors are NOT close.")
+        print(
+            f"    Max absolute difference: {torch.max(torch.abs(c.to(c_ref.dtype) - c_ref))}"
+        )
+        print(
+            f"    Mean absolute difference: {torch.mean(torch.abs(c.to(c_ref.dtype) - c_ref))}"
+        )
+        print(f"    AssertionError: {e}")
+        raise
+
+
+def ref_grouped_gemm(
+    c, a_q, a_scale, w, w_scale, num_experts, experts_selection_result
+):
+    dtype = torch.bfloat16
+    c_ref = torch.zeros_like(c)
+    for i in range(num_experts):
+        token_idx = torch.where(experts_selection_result == i)[0]
+        if len(token_idx) == 0:
+            continue
+        a = a_q[token_idx]
+
+        ref_w_scale_repeat = w_scale[i].repeat_interleave(128, dim=1).to(torch.float32)
+        ref_w = w[i].to(torch.float32) * ref_w_scale_repeat
+        c = torch.matmul(a.to(torch.float32), ref_w.t()) * a_scale
+        c_ref[token_idx] = c.to(dtype)
+
+    return c_ref
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_dsv3_fused_a_gemm.py b/sgl-kernel/tests/test_dsv3_fused_a_gemm.py
new file mode 100644
index 000000000..914af95e2
--- /dev/null
+++ b/sgl-kernel/tests/test_dsv3_fused_a_gemm.py
@@ -0,0 +1,32 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from sgl_kernel import dsv3_fused_a_gemm
+
+
+@pytest.mark.parametrize("num_tokens", [i + 1 for i in range(16)])
+def test_dsv3_fused_a_gemm(num_tokens):
+    kHdIn = 7168
+    kHdOut = 2112
+
+    mat_a = torch.randn(
+        (num_tokens, kHdIn), dtype=torch.bfloat16, device="cuda"
+    ).contiguous()
+    mat_b = torch.randn((kHdOut, kHdIn), dtype=torch.bfloat16, device="cuda").transpose(
+        0, 1
+    )
+    output = torch.empty(
+        (num_tokens, kHdOut), dtype=torch.bfloat16, device="cuda"
+    ).contiguous()
+
+    ref = F.linear(mat_a, mat_b.T)
+
+    output = dsv3_fused_a_gemm(mat_a, mat_b)
+
+    assert torch.allclose(
+        output, ref, rtol=1e-2, atol=1e-3
+    ), "Fused GEMM output mismatch with torch.nn.functional.linear reference"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_dsv3_router_gemm.py b/sgl-kernel/tests/test_dsv3_router_gemm.py
new file mode 100644
index 000000000..575769d6d
--- /dev/null
+++ b/sgl-kernel/tests/test_dsv3_router_gemm.py
@@ -0,0 +1,35 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from sgl_kernel import dsv3_router_gemm
+
+
+@pytest.mark.parametrize("num_tokens", [i + 1 for i in range(16)])
+@pytest.mark.parametrize("num_experts", [256, 384])
+def test_dsv3_router_gemm(num_tokens, num_experts):
+    hidden_dim = 7168
+
+    mat_a = torch.randn(
+        (num_tokens, hidden_dim), dtype=torch.bfloat16, device="cuda"
+    ).contiguous()
+    mat_b = torch.randn(
+        (num_experts, hidden_dim), dtype=torch.bfloat16, device="cuda"
+    ).contiguous()
+
+    bf16_ref = F.linear(mat_a, mat_b)
+    float_ref = bf16_ref.to(torch.float32)
+
+    bf16_output = dsv3_router_gemm(mat_a, mat_b, out_dtype=torch.bfloat16)
+    float_output = dsv3_router_gemm(mat_a, mat_b, out_dtype=torch.float32)
+
+    assert torch.allclose(
+        bf16_output, bf16_ref, rtol=1e-2, atol=1e-3
+    ), "Router GEMM output in bf16 dtype mismatch with torch.nn.functional.linear reference"
+
+    assert torch.allclose(
+        float_output, float_ref, rtol=1e-2, atol=1e-3
+    ), "Router GEMM output in float32 dtype mismatch with torch.nn.functional.linear reference"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_flash_attention.py b/sgl-kernel/tests/test_flash_attention.py
new file mode 100644
index 000000000..159390e54
--- /dev/null
+++ b/sgl-kernel/tests/test_flash_attention.py
@@ -0,0 +1,1368 @@
+# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/hopper/test_flash_attn.py
+import itertools
+import math
+from typing import Optional
+
+import pytest
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+
+apply_rotary_emb = None
+
+
+def is_hopper():
+    #  Only Hopper supports different V headdim
+    return torch.cuda.get_device_properties(0).major == 9
+
+
+def is_fa3_supported(device=None) -> bool:
+    #  There some fa3 FYI
+    #  FA3 can fail without a enough shared memory for a some shapes, such as higher
+    #  hidden_dim or some special cases.
+    #  Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main different
+    #  Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
+    #  https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
+    #  And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
+    #  That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
+    return (torch.version.cuda >= "12.3") and (
+        torch.cuda.get_device_capability(device)[0] == 9
+        or torch.cuda.get_device_capability(device)[0] == 8
+    )
+
+
+DISABLE_BACKWARD = True
+# For CI test, we close them to True.
+# DISABLE_SPLIT = os.getenv("FLASH_ATTENTION_DISABLE_SPLIT", "FALSE") == "TRUE"
+# DISABLE_PAGEDKV = os.getenv("FLASH_ATTENTION_DISABLE_PAGEDKV", "FALSE") == "TRUE"
+# DISABLE_APPENDKV = os.getenv("FLASH_ATTENTION_DISABLE_APPENDKV", "FALSE") == "TRUE"
+# DISABLE_LOCAL = os.getenv("FLASH_ATTENTION_DISABLE_LOCAL", "FALSE") == "TRUE"
+# DISABLE_SOFTCAP = os.getenv("FLASH_ATTENTION_DISABLE_SOFTCAP", "FALSE") == "TRUE"
+# DISABLE_PACKGQA = os.getenv("FLASH_ATTENTION_DISABLE_PACKGQA", "FALSE") == "TRUE"
+# DISABLE_FP16 = os.getenv("FLASH_ATTENTION_DISABLE_FP16", "FALSE") == "TRUE"
+# DISABLE_FP8 = (
+#     os.getenv("FLASH_ATTENTION_DISABLE_FP8", "FALSE") == "TRUE"
+#     or torch.cuda.get_device_capability("cuda")[0] < 9
+# )
+
+DISABLE_SPLIT = False
+DISABLE_PAGEDKV = True
+DISABLE_APPENDKV = False
+DISABLE_LOCAL = False
+DISABLE_SOFTCAP = True
+DISABLE_PACKGQA = False
+DISABLE_FP16 = True
+DISABLE_FP8 = True
+
+
+# Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/hopper/padding.py
+def unpad_input(hidden_states, attention_mask, unused_mask=None):
+    """
+    Arguments:
+        hidden_states: (batch, seqlen, ...)
+        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
+        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
+    Return:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
+        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
+        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
+        max_seqlen_in_batch: int
+        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
+    """
+    all_masks = (
+        (attention_mask + unused_mask) if unused_mask is not None else attention_mask
+    )
+    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
+    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
+    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
+    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
+    # index with integer indices.
+    return (
+        rearrange(hidden_states, "b s ... -> (b s) ...")[indices],
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+        used_seqlens_in_batch,
+    )
+
+
+def generate_random_padding_mask(
+    max_seqlen, batch_size, device, mode="random", zero_lengths=False
+):
+    assert mode in ["full", "random", "third"]
+    if mode == "full":
+        lengths = torch.full(
+            (batch_size, 1), max_seqlen, device=device, dtype=torch.int32
+        )
+    elif mode == "random":
+        lengths = torch.randint(
+            max(0 if zero_lengths else 1, max_seqlen - 20),
+            max_seqlen + 1,
+            (batch_size, 1),
+            device=device,
+        )
+    elif mode == "third":
+        lengths = torch.randint(
+            max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device
+        )
+
+    if zero_lengths:
+        # Generate zero-lengths every 5 batches and the last batch.
+        for i in range(batch_size):
+            if i % 5 == 0:
+                lengths[i] = 0
+        lengths[-1] = 0
+    padding_mask = (
+        repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size)
+        < lengths
+    )
+    return padding_mask
+
+
+def pad_input(hidden_states, indices, batch, seqlen):
+    """
+    Arguments:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
+        batch: int, batch size for the padded sequence.
+        seqlen: int, maximum sequence length for the padded sequence.
+    Return:
+        hidden_states: (batch, seqlen, ...)
+    """
+    dim = hidden_states.shape[1:]
+    output = torch.zeros(
+        (batch * seqlen), *dim, device=hidden_states.device, dtype=hidden_states.dtype
+    )
+    output[indices] = hidden_states
+    return rearrange(output, "(b s) ... -> b s ...", b=batch)
+
+
+def construct_local_mask(
+    seqlen_q,
+    seqlen_k,
+    window_size=(-1, -1),  # -1 means infinite window size
+    sink_token_length=0,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    key_leftpad=None,
+    device=None,
+):
+    row_idx = rearrange(
+        torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1"
+    )
+    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
+    if key_leftpad is not None:
+        key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1")
+        col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0])
+        col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32)
+    sk = (
+        seqlen_k
+        if key_padding_mask is None
+        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    sq = (
+        seqlen_q
+        if query_padding_mask is None
+        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    if window_size[0] < 0:
+        return col_idx > row_idx + sk - sq + window_size[1]
+    else:
+        sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
+        return torch.logical_or(
+            col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
+            torch.logical_and(
+                col_idx < row_idx + sk - sq - window_size[0],
+                col_idx >= sink_token_length,
+            ),
+        )
+
+
+def attention_ref(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    key_leftpad=None,
+    attn_bias=None,
+    dropout_p=0.0,
+    dropout_mask=None,
+    causal=False,
+    qv=None,
+    q_descale=None,
+    k_descale=None,
+    v_descale=None,
+    window_size=(-1, -1),  # -1 means infinite window size
+    sink_token_length=0,
+    sinks: Optional[torch.Tensor] = None,
+    softcap=0.0,
+    upcast=True,
+    reorder_ops=False,
+    intermediate_dtype=None,
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, head_dim)
+        k: (batch_size, seqlen_k, nheads, head_dim)
+        v: (batch_size, seqlen_k, nheads, head_dim_v)
+        qv: (batch_size, seqlen_q, nheads, head_dim_v)
+        query_padding_mask: (batch_size, seqlen_q)
+        key_padding_mask: (batch_size, seqlen_k)
+        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
+        dropout_p: float
+        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
+        causal: whether to apply causal masking
+        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
+            output back to fp16/bf16.
+        reorder_ops: whether to change the order of operations (scaling k instead of scaling k, etc.)
+            without changing the math. This is to estimate the numerical error from operation
+            reordering.
+    Output:
+        output: (batch_size, seqlen_q, nheads, head_dim_v)
+        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
+    """
+    if causal:
+        window_size = (window_size[0], 0)
+    dtype_og = q.dtype
+    if upcast:
+        q, k, v = q.float(), k.float(), v.float()
+        qv = qv.float() if qv is not None else None
+    if q_descale is not None:
+        q_descale = repeat(q_descale, "b h -> b 1 (h g) 1", g=q.shape[2] // k.shape[2])
+        q = (q.float() * q_descale).to(q.dtype)
+        qv = (qv.float() * q_descale).to(qv.dtype) if qv is not None else None
+    if k_descale is not None:
+        k = (k.float() * rearrange(k_descale, "b h -> b 1 h 1")).to(dtype=k.dtype)
+    if v_descale is not None:
+        v = (v.float() * rearrange(v_descale, "b h -> b 1 h 1")).to(dtype=v.dtype)
+    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
+    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
+    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
+    d = q.shape[-1]
+    dv = v.shape[-1]
+    softmax_scale = 1.0 / math.sqrt(d if qv is None else d + dv)
+    if not reorder_ops:
+        scores = torch.einsum("bthd,bshd->bhts", q * softmax_scale, k)
+    else:
+        scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
+    if qv is not None:
+        scores = scores + torch.einsum("bthd,bshd->bhts", qv * softmax_scale, v)
+    if softcap > 0:
+        scores = torch.tanh(scores / softcap) * softcap
+    if key_padding_mask is not None:
+        scores.masked_fill_(
+            rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")
+        )
+    if window_size[0] >= 0 or window_size[1] >= 0:
+        local_mask = construct_local_mask(
+            seqlen_q,
+            seqlen_k,
+            window_size,
+            sink_token_length,
+            query_padding_mask,
+            key_padding_mask,
+            key_leftpad=key_leftpad,
+            device=q.device,
+        )
+        scores.masked_fill_(local_mask, float("-inf"))
+    if attn_bias is not None:
+        scores = scores + attn_bias
+    if sinks is None:
+        attention = torch.softmax(scores, dim=-1).to(v.dtype)
+    else:
+        scores_fp32 = scores.to(torch.float32)
+        logits_max = torch.amax(scores_fp32, dim=-1, keepdim=True)
+        sinks = rearrange(sinks, "h -> h 1 1")
+        logits_or_sinks_max = torch.maximum(sinks, logits_max)
+        unnormalized_scores = torch.exp(scores_fp32 - logits_or_sinks_max)
+        normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + torch.exp(
+            sinks - logits_or_sinks_max
+        )
+        attention = (unnormalized_scores / normalizer).to(v.dtype)
+    # We want to mask here so that the attention matrix doesn't have any NaNs
+    # Otherwise we'll get NaN in dV
+    if query_padding_mask is not None:
+        attention = attention.masked_fill(
+            rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0
+        )
+    # Without this we might get NaN in dv
+    if key_padding_mask is not None:
+        attention = attention.masked_fill(
+            rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0
+        )
+    # Some rows might be completely masked out so we fill them with zero instead of NaN
+    if window_size[0] >= 0 or window_size[1] >= 0:
+        attention = attention.masked_fill(
+            torch.all(local_mask, dim=-1, keepdim=True), 0.0
+        )
+    dropout_scaling = 1.0 / (1 - dropout_p)
+    # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
+    # output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
+    if dropout_mask is not None:
+        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
+    else:
+        attention_drop = attention
+    if intermediate_dtype is not None:
+        attention_drop = attention_drop.to(intermediate_dtype).to(attention_drop.dtype)
+    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
+    if query_padding_mask is not None:
+        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
+    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
+
+
+def generate_qkv(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    kvpacked=False,
+    qkvpacked=False,
+    add_unused_qkv=False,
+    query_unused_mask=None,
+    key_unused_mask=None,
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, d)
+        k: (batch_size, seqlen_k, nheads_k, d)
+        v: (batch_size, seqlen_k, nheads_k, d)
+        query_padding_mask: (batch_size, seqlen), bool
+        key_padding_mask: (batch_size, seqlen), bool
+    """
+    assert not (kvpacked and qkvpacked)
+    batch_size, seqlen_q, nheads, d = q.shape
+    _, seqlen_k, nheads_k, _ = k.shape
+    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
+    assert v.shape == (batch_size, seqlen_k, nheads_k, d)
+    if query_unused_mask is not None or key_unused_mask is not None:
+        assert not kvpacked
+        assert not qkvpacked
+
+    if query_padding_mask is not None:
+        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q, seqused_q = unpad_input(
+            q,
+            query_padding_mask,
+            query_unused_mask,
+        )
+        output_pad_fn = lambda output_unpad: pad_input(
+            output_unpad, indices_q, batch_size, seqlen_q
+        )
+    else:
+        q_unpad = rearrange(q, "b s h d -> (b s) h d")
+        cu_seqlens_q = torch.arange(
+            0,
+            (batch_size + 1) * seqlen_q,
+            step=seqlen_q,
+            dtype=torch.int32,
+            device=q_unpad.device,
+        )
+        seqused_q = None
+        max_seqlen_q = seqlen_q
+        output_pad_fn = lambda output_unpad: rearrange(
+            output_unpad, "(b s) h d -> b s h d", b=batch_size
+        )
+
+    if key_padding_mask is not None:
+        k_unpad, indices_k, cu_seqlens_k, max_seqlen_k, seqused_k = unpad_input(
+            k, key_padding_mask, key_unused_mask
+        )
+        v_unpad, _, _, _, _ = unpad_input(v, key_padding_mask, key_unused_mask)
+    else:
+        k_unpad = rearrange(k, "b s h d -> (b s) h d")
+        v_unpad = rearrange(v, "b s h d -> (b s) h d")
+        cu_seqlens_k = torch.arange(
+            0,
+            (batch_size + 1) * seqlen_k,
+            step=seqlen_k,
+            dtype=torch.int32,
+            device=k_unpad.device,
+        )
+        seqused_k = None
+        max_seqlen_k = seqlen_k
+
+    if qkvpacked:
+        assert (query_padding_mask == key_padding_mask).all()
+        assert nheads == nheads_k
+        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
+        qkv = torch.stack([q, k, v], dim=2)
+        if query_padding_mask is not None:
+            dqkv_pad_fn = lambda dqkv_unpad: pad_input(
+                dqkv_unpad, indices_q, batch_size, seqlen_q
+            )
+        else:
+            dqkv_pad_fn = lambda dqkv_unpad: rearrange(
+                dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
+            )
+        return (
+            qkv_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            max_seqlen_q,
+            qkv.detach().requires_grad_(),
+            output_pad_fn,
+            dqkv_pad_fn,
+        )
+    elif kvpacked:
+        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
+        kv = torch.stack([k, v], dim=2)
+        dq_pad_fn = output_pad_fn
+        if key_padding_mask is not None:
+            dkv_pad_fn = lambda dkv_unpad: pad_input(
+                dkv_unpad, indices_k, batch_size, seqlen_k
+            )
+        else:
+            dkv_pad_fn = lambda dkv_unpad: rearrange(
+                dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
+            )
+        return (
+            q_unpad.detach().requires_grad_(),
+            kv_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q.detach().requires_grad_(),
+            kv.detach().requires_grad_(),
+            output_pad_fn,
+            dq_pad_fn,
+            dkv_pad_fn,
+        )
+    else:
+        dq_pad_fn = output_pad_fn
+        if key_padding_mask is not None:
+            dk_pad_fn = lambda dk_unpad: pad_input(
+                dk_unpad, indices_k, batch_size, seqlen_k
+            )
+        else:
+            dk_pad_fn = lambda dk_unpad: rearrange(
+                dk_unpad, "(b s) h d -> b s h d", b=batch_size
+            )
+        return (
+            q_unpad.detach().requires_grad_(),
+            k_unpad.detach().requires_grad_(),
+            v_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            cu_seqlens_k,
+            seqused_q,
+            seqused_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q.detach().requires_grad_(),
+            k.detach().requires_grad_(),
+            v.detach().requires_grad_(),
+            output_pad_fn,
+            dq_pad_fn,
+            dk_pad_fn,
+        )
+
+
+@pytest.mark.skipif(
+    not is_fa3_supported(),
+    reason="flash_attn at sgl-kernel is only supported on sm90 or sm80",
+)
+# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float8_e4m3fn])
+@pytest.mark.parametrize(
+    "dtype", [torch.bfloat16] + ([torch.float8_e4m3fn] if not DISABLE_FP8 else [])
+)
+# @pytest.mark.parametrize("dtype", [torch.bfloat16])
+# @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
+@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
+# @pytest.mark.parametrize("mha_type", ["mha"])
+@pytest.mark.parametrize("has_sink", [False, True])
+# @pytest.mark.parametrize("has_sink", [False])
+@pytest.mark.parametrize("new_kv", [False] + ([True] if not DISABLE_APPENDKV else []))
+# @pytest.mark.parametrize("new_kv", [True])
+# @pytest.mark.parametrize(
+#     "causal,local",
+#     [(False, False), (True, False)] + ([(False, True)] if not DISABLE_LOCAL else []),
+# )
+# @pytest.mark.parametrize("causal,local", [(False, False), (True, False)])
+@pytest.mark.parametrize("causal,local", [(False, False)])
+@pytest.mark.parametrize(
+    "seqlen_new_eq_seqlen_q", [True, False] if not DISABLE_APPENDKV else [True]
+)
+# @pytest.mark.parametrize("seqlen_new_eq_seqlen_q", [True])
+# @pytest.mark.parametrize("has_rotary_seqlens", [False, True])
+@pytest.mark.parametrize("has_rotary_seqlens", [False])
+@pytest.mark.parametrize(
+    "rotary_interleaved", [False, True] if not DISABLE_APPENDKV else [False]
+)
+# @pytest.mark.parametrize("rotary_interleaved", [True])
+@pytest.mark.parametrize(
+    "rotary_fraction",
+    (
+        [0.0, 0.5, 1.0]
+        if (not DISABLE_APPENDKV) and (apply_rotary_emb is not None)
+        else [0.0]
+    ),
+)
+# @pytest.mark.parametrize("rotary_fraction", [0.0])
+@pytest.mark.parametrize(
+    "page_size", [None] + ([1, 4, 128] if not DISABLE_PAGEDKV else [])
+)
+# @pytest.mark.parametrize("page_size", [None])
+# @pytest.mark.parametrize("has_leftpad", [False, True])
+@pytest.mark.parametrize("has_leftpad", [False])
+# @pytest.mark.parametrize("has_batch_idx", [False, True])
+@pytest.mark.parametrize("has_batch_idx", [False])
+# @pytest.mark.parametrize("varlen_q", [False, True])
+@pytest.mark.parametrize("varlen_q", [False])
+# @pytest.mark.parametrize("d", [32, 59, 64, 80, 128, 256])
+# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
+# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192])
+# @pytest.mark.parametrize('d', [56, 80])
+@pytest.mark.parametrize("d", [64])
+# @pytest.mark.parametrize("d", [192])
+@pytest.mark.parametrize(
+    "seqlen_q,seqlen_k",
+    [
+        (1, 128),
+        (1, 339),
+        (3, 1024),
+        (64, 800),
+        (64, 256),
+        (3, 799),
+        (64, 2048),
+        (16, 20000),
+        # (1, 128 * 1024),
+        # (16, 128 * 1024),
+        (128, 128),
+        (256, 512),  # To test appending KV with more than 1 block
+        (2048, 3577),  # Enough tile to test persistent scheduler
+    ],
+)
+# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)])
+def test_flash_attn_kvcache(
+    seqlen_q,
+    seqlen_k,
+    d,
+    varlen_q,
+    has_batch_idx,
+    has_leftpad,
+    page_size,
+    rotary_fraction,
+    rotary_interleaved,
+    has_rotary_seqlens,
+    seqlen_new_eq_seqlen_q,
+    causal,
+    local,
+    new_kv,
+    mha_type,
+    dtype,
+    has_sink,
+):
+    from sgl_kernel.flash_attn import flash_attn_with_kvcache
+
+    if page_size is not None and seqlen_k % page_size != 0:
+        pytest.skip()
+    if seqlen_q > seqlen_k and new_kv:
+        pytest.skip()
+    if not new_kv and rotary_fraction > 0.0:
+        pytest.skip()
+    if rotary_fraction == 0.0 and has_rotary_seqlens:
+        pytest.skip()
+    device = "cuda"
+    # set seed
+    torch.random.manual_seed(0)
+    batch_size = 5
+    # batch_size = 1
+    batch_size_cache = batch_size if not has_batch_idx else batch_size * 2
+    nheads = 6
+    # nheads = 1
+    # rotary_dim must be a multiple of 16, and must be <= d
+    rotary_dim = math.floor(int(rotary_fraction * d) / 16) * 16
+    nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3)
+    assert nheads % nheads_k == 0
+    dtype_ref = torch.bfloat16 if dtype == torch.float8_e4m3fn else dtype
+    dv_vals = [128, d] if d > 128 and d <= 192 else ([256, 512, d] if d <= 64 else [d])
+
+    if has_sink:
+        sinks = torch.randn(nheads, dtype=torch.bfloat16, device=device)
+    else:
+        sinks = None
+
+    if dtype == torch.float8_e4m3fn or not is_hopper():
+        # for fp8 and ampere arch, we not support v head dim != qk head dim
+        dv_vals = [d]
+    for dv in dv_vals:
+        has_qv = d == 64 and dv >= 256
+        q = (
+            torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_ref)
+            .to(dtype)
+            .to(dtype_ref)
+        )
+        if has_qv:
+            qv = (
+                torch.randn(
+                    batch_size, seqlen_q, nheads, dv, device=device, dtype=dtype_ref
+                )
+                .to(dtype)
+                .to(dtype_ref)
+            )
+        else:
+            qv = None
+        if varlen_q:
+            query_padding_mask = generate_random_padding_mask(
+                seqlen_q, batch_size, device, mode="random"
+            )
+            q_unpad, indices_q, cu_seqlens_q, max_seqlen_q, *rest = unpad_input(
+                q, query_padding_mask
+            )
+            output_pad_fn = lambda output_unpad: pad_input(
+                output_unpad, indices_q, batch_size, seqlen_q
+            )
+            qv_unpad = (
+                rearrange(qv, "b s ... -> (b s) ...")[indices_q] if has_qv else None
+            )
+        else:
+            query_padding_mask = None
+            q_unpad = q
+            qv_unpad = qv
+            cu_seqlens_q, max_seqlen_q = None, None
+        # Put window_size after QKV randn so that window_size changes from test to test
+        window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
+
+        seqlen_new = (
+            seqlen_q
+            if seqlen_new_eq_seqlen_q
+            else torch.randint(1, seqlen_q + 1, (1,)).item()
+        )
+        cu_seqlens_k_new = None
+        key_new_padding_mask = None
+        if new_kv:
+            k = (
+                torch.randn(
+                    batch_size, seqlen_new, nheads_k, d, device=device, dtype=dtype_ref
+                )
+                .to(dtype)
+                .to(dtype_ref)
+            )
+            v = (
+                torch.randn(
+                    batch_size, seqlen_new, nheads_k, dv, device=device, dtype=dtype_ref
+                )
+                .to(dtype)
+                .to(dtype_ref)
+            )
+            if varlen_q:  # k & v are also varlen
+                key_new_padding_mask = generate_random_padding_mask(
+                    seqlen_new, batch_size, device, mode="random"
+                )
+                k_unpad, indices_k, cu_seqlens_k_new, *rest = unpad_input(
+                    k, key_new_padding_mask
+                )
+                v_unpad, *rest = unpad_input(v, key_new_padding_mask)
+            else:
+                k_unpad, v_unpad = k, v
+        else:
+            k, v, k_unpad, v_unpad = None, None, None, None
+        if page_size is None:
+            k_cache = (
+                torch.randn(
+                    batch_size_cache,
+                    seqlen_k,
+                    nheads_k,
+                    d,
+                    device=device,
+                    dtype=dtype_ref,
+                )
+                .to(dtype)
+                .to(dtype_ref)
+            )
+            v_cache = (
+                torch.randn(
+                    batch_size_cache,
+                    seqlen_k,
+                    nheads_k,
+                    dv,
+                    device=device,
+                    dtype=dtype_ref,
+                )
+                .to(dtype)
+                .to(dtype_ref)
+            )
+            page_table = None
+        else:
+            (
+                k_cache,
+                v_cache,
+                page_table,
+                k_cache_paged,
+                v_cache_paged,
+                num_blocks,
+            ) = _generate_block_kvcache(
+                seqlen_k,
+                page_size,
+                batch_size_cache,
+                nheads_k,
+                d,
+                dv,
+                device,
+                dtype,
+                dtype_ref,
+            )
+        cache_seqlens = torch.randint(
+            0 if new_kv else 1,
+            # If we don't use seqlen_q in the case of causal and rotary, cos/sin won't be long enough
+            (
+                (
+                    seqlen_k
+                    - (seqlen_q if (causal or local) and rotary_dim > 1 else seqlen_new)
+                    + 1
+                )
+                if new_kv
+                else (seqlen_k + 1)
+            ),
+            (batch_size,),
+            dtype=torch.int32,
+            device=device,
+        )
+        if has_leftpad:
+            cache_leftpad = torch.cat(
+                [
+                    (
+                        torch.randint(
+                            0,
+                            cache_seqlens[i].item(),
+                            (1,),
+                            dtype=torch.int32,
+                            device=device,
+                        )
+                        if cache_seqlens[i].item() > 0
+                        else torch.zeros(1, dtype=torch.int32, device=device)
+                    )
+                    for i in range(batch_size)
+                ]
+            )
+        else:
+            cache_leftpad = None
+        if has_batch_idx:
+            cache_batch_idx = torch.randperm(
+                batch_size_cache, dtype=torch.int32, device=device
+            )[:batch_size]
+        else:
+            cache_batch_idx = None
+        arange = rearrange(torch.arange(seqlen_k, device=device), "s -> 1 s")
+        cache_seqlens_expanded = rearrange(cache_seqlens, "b -> b 1")
+        if not new_kv:
+            key_padding_mask = arange < cache_seqlens_expanded
+        else:
+            k_new_seqlens = (
+                key_new_padding_mask.sum(-1, keepdims=True) if varlen_q else seqlen_new
+            )
+            key_padding_mask = arange < cache_seqlens_expanded + k_new_seqlens
+        if has_leftpad:
+            key_padding_mask = torch.logical_and(
+                key_padding_mask,
+                arange >= cache_leftpad.unsqueeze(-1).expand(-1, seqlen_k),
+            )
+        # cache_seqlens = torch.tensor([64], dtype=torch.int32, device=device)
+        rotary_seqlens = cache_seqlens if not has_rotary_seqlens else cache_seqlens // 2
+        if rotary_dim > 0:
+            angle = (
+                torch.rand(
+                    seqlen_k if page_size is None else num_blocks * page_size,
+                    rotary_dim // 2,
+                    device=device,
+                )
+                * 2
+                * math.pi
+            )
+            cos = torch.cos(angle).to(dtype=dtype_ref).to(dtype).to(dtype_ref)
+            sin = torch.sin(angle).to(dtype=dtype_ref).to(dtype).to(dtype_ref)
+            if causal or local:
+                q_ro = apply_rotary_emb(
+                    q,
+                    cos,
+                    sin,
+                    seqlen_offsets=rotary_seqlens,
+                    interleaved=rotary_interleaved,
+                )
+            else:
+                q_ro = rearrange(
+                    apply_rotary_emb(
+                        rearrange(q, "b s h d -> b 1 (s h) d"),
+                        cos,
+                        sin,
+                        seqlen_offsets=rotary_seqlens,
+                        interleaved=rotary_interleaved,
+                    ),
+                    "b 1 (s h) d -> b s h d",
+                    s=seqlen_q,
+                )
+            # q_ro = q
+            k_ro = apply_rotary_emb(
+                k,
+                cos,
+                sin,
+                seqlen_offsets=rotary_seqlens,
+                interleaved=rotary_interleaved,
+            )
+        else:
+            cos, sin = None, None
+            q_ro, k_ro = q, k
+        # k_cache[:, 64:] = -1
+        k_cache_ref = (
+            k_cache if not has_batch_idx else k_cache[cache_batch_idx]
+        ).clone()
+        v_cache_ref = (
+            v_cache if not has_batch_idx else v_cache[cache_batch_idx]
+        ).clone()
+        if new_kv:
+            update_mask = torch.logical_and(
+                cache_seqlens_expanded <= arange,
+                arange < cache_seqlens_expanded + k_new_seqlens,
+            )
+            k_to_update = rearrange(k_ro, "b s ... -> (b s) ...")
+            v_to_update = rearrange(v, "b s ... -> (b s) ...")
+            if varlen_q:
+                k_to_update = k_to_update[indices_k]
+                v_to_update = v_to_update[indices_k]
+            k_cache_ref[update_mask] = k_to_update
+            v_cache_ref[update_mask] = v_to_update
+        k_cache_rep = repeat(
+            k_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k
+        )
+        v_cache_rep = repeat(
+            v_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k
+        )
+        out_ref, _ = attention_ref(
+            q_ro,
+            k_cache_rep,
+            v_cache_rep,
+            query_padding_mask,
+            key_padding_mask,
+            causal=causal,
+            qv=qv,
+            window_size=window_size,
+            key_leftpad=cache_leftpad,
+            sinks=sinks,
+        )
+        out_pt, _ = attention_ref(
+            q_ro,
+            k_cache_rep,
+            v_cache_rep,
+            query_padding_mask,
+            key_padding_mask,
+            causal=causal,
+            qv=qv,
+            window_size=window_size,
+            upcast=False,
+            reorder_ops=True,
+            key_leftpad=cache_leftpad,
+            intermediate_dtype=dtype if dtype == torch.float8_e4m3fn else None,
+            sinks=sinks,
+        )
+        q = q.to(dtype)
+        q_unpad = q_unpad.to(dtype) if varlen_q else None
+        k_cache = k_cache.to(dtype)
+        v_cache = v_cache.to(dtype)
+        k_cache_paged = k_cache_paged.to(dtype) if page_size is not None else None
+        v_cache_paged = v_cache_paged.to(dtype) if page_size is not None else None
+        k = k.to(dtype) if k is not None else None
+        v = v.to(dtype) if v is not None else None
+        k_unpad = k_unpad.to(dtype) if k_unpad is not None else None
+        v_unpad = v_unpad.to(dtype) if v_unpad is not None else None
+        qv = qv.to(dtype) if qv is not None else None
+        qv_unpad = qv_unpad.to(dtype) if (varlen_q and qv is not None) else None
+        cos = cos.to(dtype) if cos is not None else None
+        sin = sin.to(dtype) if sin is not None else None
+        k_cache_saved = k_cache.clone() if page_size is None else k_cache_paged.clone()
+        v_cache_saved = v_cache.clone() if page_size is None else v_cache_paged.clone()
+        num_splits_vals = [1, 0] if not DISABLE_SPLIT else [1]
+        precompute_metadata_vals = [False]
+        for num_splits, precompute_metadata in itertools.product(
+            num_splits_vals, precompute_metadata_vals
+        ):
+            scheduler_metadata = None
+            # Repeat to test metadata reuse
+            for _ in range(1 if not precompute_metadata else 2):
+                if page_size is None:
+                    k_cache.copy_(k_cache_saved)
+                    v_cache.copy_(v_cache_saved)
+                else:
+                    k_cache_paged.copy_(k_cache_saved)
+                    v_cache_paged.copy_(v_cache_saved)
+                out, lse, *rest = flash_attn_with_kvcache(
+                    q if not varlen_q else q_unpad,
+                    k_cache if page_size is None else k_cache_paged,
+                    v_cache if page_size is None else v_cache_paged,
+                    k if not new_kv or not varlen_q else k_unpad,
+                    v if not new_kv or not varlen_q else v_unpad,
+                    qv=qv if not varlen_q else qv_unpad,
+                    rotary_cos=cos,
+                    rotary_sin=sin,
+                    cache_seqlens=cache_seqlens,
+                    cache_batch_idx=cache_batch_idx,
+                    cache_leftpad=cache_leftpad,
+                    page_table=page_table,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k_new=cu_seqlens_k_new,
+                    max_seqlen_q=max_seqlen_q,
+                    rotary_seqlens=rotary_seqlens,
+                    causal=causal,
+                    window_size=window_size,
+                    rotary_interleaved=rotary_interleaved,
+                    scheduler_metadata=scheduler_metadata,
+                    num_splits=num_splits,
+                    return_softmax_lse=True,
+                    sinks=sinks,
+                )
+                if varlen_q:
+                    out = output_pad_fn(out)
+                # out = flash_attn_with_kvcache(
+                #     q, k_cache, v_cache, cache_seqlens=cache_seqlens, causal=causal, window_size=window_size
+                # )
+                # out = flash_attn_with_kvcache(q, k_cache, v_cache, causal=causal, window_size=window_size)
+                # qk = torch.einsum("bqhd,bkhd->bhqk", q, k_cache_ref)
+                # m = qk.amax(-1, keepdim=True)
+                # s_tmp = torch.exp((qk - m) / math.sqrt(d))
+                # o1 = torch.einsum('bhst,bthd->bshd', s_tmp, v_cache_ref)
+                # lse_ref = torch.logsumexp(qk / math.sqrt(d), -1)
+                # probs = torch.softmax(qk, dim=-1)
+                print(f"Output max diff: {(out - out_ref).abs().max().item()}")
+                print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
+                print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
+                print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
+                # breakpoint()
+
+                # Check that FlashAttention's numerical error is at most twice the numerical error
+                # of a Pytorch implementation.
+                if new_kv:
+                    if page_size is None:
+                        k_cache_select = (
+                            k_cache.to(dtype_ref)
+                            if not has_batch_idx
+                            else k_cache.to(dtype_ref)[cache_batch_idx]
+                        )
+                        v_cache_select = (
+                            v_cache.to(dtype_ref)
+                            if not has_batch_idx
+                            else v_cache.to(dtype_ref)[cache_batch_idx]
+                        )
+                    else:
+                        k_cache_select = rearrange(
+                            k_cache_paged.to(dtype_ref)[
+                                (
+                                    page_table
+                                    if not has_batch_idx
+                                    else page_table[cache_batch_idx]
+                                ).flatten()
+                            ],
+                            "(b nblocks) block_size ... -> b (nblocks block_size) ...",
+                            b=batch_size,
+                        )[:, :seqlen_k].to(dtype_ref)
+                        v_cache_select = rearrange(
+                            v_cache_paged.to(dtype_ref)[
+                                (
+                                    page_table
+                                    if not has_batch_idx
+                                    else page_table[cache_batch_idx]
+                                ).flatten()
+                            ],
+                            "(b nblocks) block_size ... -> b (nblocks block_size) ...",
+                            b=batch_size,
+                        )[:, :seqlen_k].to(dtype_ref)
+                    k_cache_ref = k_cache_ref.to(dtype).to(dtype_ref)
+                    v_cache_ref = v_cache_ref.to(dtype).to(dtype_ref)
+                    if dtype is not torch.float8_e4m3fn:
+                        assert torch.equal(v_cache_select, v_cache_ref)
+                    else:
+                        assert torch.allclose(
+                            v_cache_select, v_cache_ref, rtol=1e-3, atol=1e-3
+                        )
+                    # breakpoint()
+                    # if rotary_dim == 0 and dtype is not torch.float8_e4m3fn:
+                    if rotary_dim == 0:
+                        assert torch.equal(k_cache_select, k_cache_ref)
+                    else:
+                        # if not torch.allclose(k_cache_select, k_cache_ref, rtol=1e-3, atol=1e-3):
+                        #     breakpoint()
+                        if dtype is not torch.float8_e4m3fn:
+                            assert torch.allclose(
+                                k_cache_select, k_cache_ref, rtol=1e-3, atol=1e-3
+                            )
+                        else:
+                            assert torch.allclose(
+                                k_cache_select, k_cache_ref, rtol=1e-1, atol=1e-1
+                            )
+                mult = 4 if dtype == torch.float8_e4m3fn else 2
+                assert (out - out_ref).abs().max().item() <= mult * (
+                    out_pt - out_ref
+                ).abs().max().item() + 1e-5
+                mult_mean = 3 if dtype == torch.float8_e4m3fn else 1.5
+                assert (out - out_ref).abs().mean().item() <= mult_mean * (
+                    out_pt - out_ref
+                ).abs().mean().item()
+
+
+def _generate_block_kvcache(
+    seqlen_k, page_size, batch_size, nheads_k, d, dv, device, dtype, dtype_ref
+):
+    num_blocks = math.ceil(seqlen_k / page_size) * batch_size * 3
+    k_cache_paged = (
+        torch.randn(num_blocks, page_size, nheads_k, d, device=device, dtype=dtype_ref)
+        .to(dtype)
+        .to(dtype_ref)
+    )
+    v_cache_paged = (
+        torch.randn(num_blocks, page_size, nheads_k, dv, device=device, dtype=dtype_ref)
+        .to(dtype)
+        .to(dtype_ref)
+    )
+    page_table = rearrange(
+        torch.randperm(num_blocks, dtype=torch.int32, device=device),
+        "(b nblocks) -> b nblocks",
+        b=batch_size,
+    )
+    k_cache = rearrange(
+        k_cache_paged[page_table.flatten()],
+        "(b nblocks) block_size ... -> b (nblocks block_size) ...",
+        b=batch_size,
+    )[:, :seqlen_k]
+    v_cache = rearrange(
+        v_cache_paged[page_table.flatten()],
+        "(b nblocks) block_size ... -> b (nblocks block_size) ...",
+        b=batch_size,
+    )[:, :seqlen_k]
+    return k_cache, v_cache, page_table, k_cache_paged, v_cache_paged, num_blocks
+
+
+@pytest.mark.skipif(
+    not is_fa3_supported(),
+    reason="flash_attn at sgl-kernel is only supported on sm90 or sm80",
+)
+# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float8_e4m3fn])
+@pytest.mark.parametrize(
+    "dtype", [torch.bfloat16] + ([torch.float8_e4m3fn] if not DISABLE_FP8 else [])
+)
+# @pytest.mark.parametrize("dtype", [torch.bfloat16])
+# @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn])
+@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
+# @pytest.mark.parametrize("mha_type", ["mha"])
+@pytest.mark.parametrize("has_sink", [False, True])
+# @pytest.mark.parametrize("has_sink", [False])
+# @pytest.mark.parametrize("has_qv", [False, True])
+@pytest.mark.parametrize("has_qv", [False])
+# @pytest.mark.parametrize("deterministic", [False, True])
+@pytest.mark.parametrize("deterministic", [False])
+@pytest.mark.parametrize("softcap", [0.0] + ([15.0] if not DISABLE_SOFTCAP else []))
+# @pytest.mark.parametrize("softcap", [0.0])
+@pytest.mark.parametrize("local", [False])
+# @pytest.mark.parametrize("local", [False])
+@pytest.mark.parametrize("causal", [False, True])
+# @pytest.mark.parametrize("causal", [False])
+@pytest.mark.parametrize("add_unused_qkv", [False, True])
+# @pytest.mark.parametrize("add_unused_qkv", [True])
+# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
+# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192, 256])
+# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192])
+# @pytest.mark.parametrize('d', [56, 80])
+# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128])
+# @pytest.mark.parametrize("d", [64, 96, 128])
+# @pytest.mark.parametrize("d", COMPILED_HDIMS)
+@pytest.mark.parametrize("d", [128])
+@pytest.mark.parametrize(
+    "seqlen_q,seqlen_k",
+    [
+        (1, 1),
+        (1, 3),
+        (2, 1),
+        (511, 1),
+        (3, 513),
+        (64, 128),
+        (128, 128),
+        (256, 256),
+        (113, 203),
+        (128, 217),
+        (113, 211),
+        (108, 256),
+        (256, 512),
+        (307, 256),
+        (640, 128),
+        (512, 256),
+        (1024, 1024),
+        (1023, 1024),
+        (1024, 1023),
+        (2048, 2048),
+    ],
+)
+def test_flash_attn_varlen_output(
+    seqlen_q,
+    seqlen_k,
+    d,
+    add_unused_qkv,
+    causal,
+    local,
+    softcap,
+    deterministic,
+    has_qv,
+    mha_type,
+    dtype,
+    has_sink,
+):
+    from sgl_kernel.flash_attn import flash_attn_varlen_func
+
+    device = "cuda"
+    # set seed
+    torch.random.manual_seed(seqlen_q + seqlen_k + d + int(causal) * 2 + int(local))
+    # batch_size = 40
+    # nheads = 16
+    batch_size = 9 if seqlen_q <= 2048 else 2
+    nheads = 6
+    # batch_size = 2
+    # nheads = 1
+    nheads_kv = nheads if mha_type == "mha" else (2 if mha_type == "gqa" else 1)
+    dtype_ref = torch.bfloat16 if dtype == torch.float8_e4m3fn else dtype
+    dv_vals = [128, d] if d > 128 and d <= 192 else ([256, 512, d] if d <= 64 else [d])
+    if dtype == torch.float8_e4m3fn:
+        dv_vals = [d]
+    for dv in dv_vals:
+        q_ref = torch.randn(
+            batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_ref
+        )
+        if softcap > 0.0:
+            # Ensure the values of qk are at least within softcap range.
+            q_ref = (q_ref * softcap / 4).detach().requires_grad_()
+        q_ref = q_ref.to(dtype).to(dtype_ref).requires_grad_()
+        k_ref = (
+            torch.randn(
+                batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_ref
+            )
+            .to(dtype)
+            .to(dtype_ref)
+            .requires_grad_()
+        )
+        v_ref = (
+            torch.randn(
+                batch_size, seqlen_k, nheads_kv, dv, device=device, dtype=dtype_ref
+            )
+            .to(dtype)
+            .to(dtype_ref)
+            .requires_grad_()
+        )
+        if has_qv:
+            qv_ref = (
+                torch.randn(
+                    batch_size, seqlen_q, nheads, dv, device=device, dtype=dtype_ref
+                )
+                .to(dtype)
+                .to(dtype_ref)
+            )
+        else:
+            qv_ref = None
+        # Put window_size after QKV randn so that window_size changes from test to test
+        window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,))
+
+        if has_sink:
+            sinks = torch.randn(nheads, dtype=torch.bfloat16, device=device)
+        else:
+            sinks = None
+
+        if dtype == torch.float8_e4m3fn:
+            q_descale, k_descale, v_descale = [
+                torch.rand(batch_size, nheads_kv, device=device, dtype=torch.float32)
+                * 2
+                for _ in range(3)
+            ]
+        else:
+            q_descale, k_descale, v_descale = None, None, None
+        q, k, v = [x.detach().requires_grad_() for x in (q_ref, k_ref, v_ref)]
+        qv = qv_ref.detach() if has_qv else None
+        query_padding_mask = generate_random_padding_mask(
+            seqlen_q, batch_size, device, mode="random", zero_lengths=False
+        )
+        key_padding_mask = generate_random_padding_mask(
+            seqlen_k, batch_size, device, mode="random", zero_lengths=True
+        )
+
+        def _gen_unused_masks(padding_mask, add_unused, max_seq_len, bs, device):
+            if add_unused:
+                another_mask = generate_random_padding_mask(max_seq_len, bs, device)
+                attn_mask = torch.logical_and(padding_mask, another_mask)
+                unused_mask = torch.logical_xor(
+                    torch.logical_or(padding_mask, another_mask), attn_mask
+                )
+            else:
+                attn_mask = padding_mask
+                unused_mask = None
+            return attn_mask, unused_mask
+
+        query_padding_mask, query_unused_mask = _gen_unused_masks(
+            query_padding_mask, add_unused_qkv, seqlen_q, batch_size, q.device
+        )
+        key_padding_mask, key_unused_mask = _gen_unused_masks(
+            key_padding_mask, add_unused_qkv, seqlen_k, batch_size, k.device
+        )
+
+        (
+            q_unpad,
+            k_unpad,
+            v_unpad,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            seqused_q,
+            seqused_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q,
+            k,
+            v,
+            output_pad_fn,
+            dq_pad_fn,
+            dk_pad_fn,
+        ) = generate_qkv(
+            q,
+            k,
+            v,
+            query_padding_mask,
+            key_padding_mask,
+            kvpacked=False,
+            query_unused_mask=query_unused_mask,
+            key_unused_mask=key_unused_mask,
+        )
+        q_unpad, k_unpad, v_unpad = [
+            x.detach().to(dtype).requires_grad_() for x in (q_unpad, k_unpad, v_unpad)
+        ]
+        out_ref, attn_ref = attention_ref(
+            q_ref,
+            k_ref,
+            v_ref,
+            query_padding_mask,
+            key_padding_mask,
+            causal=causal,
+            qv=qv_ref,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+            window_size=window_size,
+            softcap=softcap,
+            sinks=sinks,
+        )
+        out_pt, attn_pt = attention_ref(
+            q_ref,
+            k_ref,
+            v_ref,
+            query_padding_mask,
+            key_padding_mask,
+            causal=causal,
+            qv=qv_ref,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+            window_size=window_size,
+            softcap=softcap,
+            upcast=False,
+            reorder_ops=True,
+            intermediate_dtype=dtype if dtype == torch.float8_e4m3fn else None,
+            sinks=sinks,
+        )
+
+        print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
+        print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
+
+        if query_unused_mask is not None:
+            q_zero_masking = rearrange(query_unused_mask, "b s -> b s 1 1")
+
+        # Numerical error if we just do any arithmetic on out_ref
+        fwd_atol = 2 * (out_ref + 0.3 - 0.3 - out_ref).abs().max().item()
+        rtol = 2 if softcap == 0.0 else 3
+
+        pack_gqa_vals = [False, True] if not DISABLE_PACKGQA else [False]
+        num_splits_vals = [1, 3] if not DISABLE_SPLIT else [1]
+        for pack_gqa, num_splits in itertools.product(pack_gqa_vals, num_splits_vals):
+            out_unpad, lse, *rest = flash_attn_varlen_func(
+                q_unpad,
+                k_unpad,
+                v_unpad,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                max_seqlen_q,
+                max_seqlen_k,
+                seqused_q=seqused_q,
+                seqused_k=seqused_k,
+                causal=causal,
+                q_descale=q_descale,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                window_size=window_size,
+                softcap=softcap,
+                return_softmax_lse=True,
+                sinks=sinks,
+            )
+            out = output_pad_fn(out_unpad)
+            if query_unused_mask is not None:
+                out.masked_fill_(q_zero_masking, 0.0)
+            print(f"Output max diff: {(out - out_ref).abs().max().item()}")
+            print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
+
+            # Check that FlashAttention's numerical error is at most 3x the numerical error
+            # of a Pytorch implementation.
+            assert (out - out_ref).abs().max().item() <= rtol * (
+                out_pt - out_ref
+            ).abs().max().item() + fwd_atol
+
+    if not DISABLE_BACKWARD and dtype != torch.float8_e4m3fn and not has_qv:
+        g_unpad = torch.randn_like(out_unpad)
+        do_o = ((g_unpad.float() * out_unpad.float()).sum(-1)).transpose(-1, -2)
+        dq_unpad, dk_unpad, dv_unpad = torch.autograd.grad(
+            out_unpad, (q_unpad, k_unpad, v_unpad), g_unpad
+        )
+        dq = dq_pad_fn(dq_unpad)
+        dk = dk_pad_fn(dk_unpad)
+        dv = dk_pad_fn(dv_unpad)
+        if key_unused_mask is not None:
+            k_zero_masking = rearrange(key_unused_mask, "b s -> b s 1 1")
+            dk.masked_fill_(k_zero_masking, 0.0)
+            dv.masked_fill_(k_zero_masking, 0.0)
+        if query_unused_mask is not None:
+            dq.masked_fill_(q_zero_masking, 0.0)
+        # print(f"dO_O max diff: {(softmax_d - do_o).abs().max().item()}")
+        # assert (softmax_d - do_o).abs().max().item() <= 1e-5
+        # assert dq_accum.abs().max().item() == 0.0
+        g = output_pad_fn(g_unpad)
+
+        # dq, dk, dv = torch.autograd.grad(out, (q, k, v), g)
+        dq_ref, dk_ref, dv_ref = torch.autograd.grad(out_ref, (q_ref, k_ref, v_ref), g)
+        dq_pt, dk_pt, dv_pt = torch.autograd.grad(out_pt, (q_ref, k_ref, v_ref), g)
+        print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
+        print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
+        print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
+        print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
+        print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
+        print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
+        print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
+        print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
+        print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
+        print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
+        print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
+        print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")
+
+    if not DISABLE_BACKWARD and dtype != torch.float8_e4m3fn and not has_qv:
+        dq_atol = 2 * (dq_ref + 0.3 - 0.3 - dq_ref).abs().max().item() + (
+            0 if softcap == 0 else 3e-4
+        )
+        assert (dq - dq_ref).abs().max().item() <= rtol * (
+            dq_pt - dq_ref
+        ).abs().max().item() + dq_atol
+        dk_atol = 2 * (dk_ref + 0.3 - 0.3 - dk_ref).abs().max().item() + (
+            0 if softcap == 0 else 3e-4
+        )
+        assert (dk - dk_ref).abs().max().item() <= rtol * (
+            dk_pt - dk_ref
+        ).abs().max().item() + dk_atol
+        dv_atol = 2 * (dv_ref + 0.3 - 0.3 - dv_ref).abs().max().item() + (
+            0 if softcap == 0 else 3e-4
+        )
+        assert (dv - dv_ref).abs().max().item() <= rtol * (
+            dv_pt - dv_ref
+        ).abs().max().item() + dv_atol
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_flash_attention_4.py b/sgl-kernel/tests/test_flash_attention_4.py
new file mode 100644
index 000000000..f50492923
--- /dev/null
+++ b/sgl-kernel/tests/test_flash_attention_4.py
@@ -0,0 +1,877 @@
+# Adapted from https://github.com/Dao-AILab/flash-attention/blob/b31ae1e4cd22cf5f820a2995b74b7cd3bd54355a/tests/cute/test_flash_attn.py
+
+# Copyright (c) 2025, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+
+import itertools
+import math
+from functools import partial
+
+import pytest
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from sgl_kernel.flash_attn import flash_attn_varlen_func
+from utils import is_hopper
+
+flash_attn_varlen_func = partial(flash_attn_varlen_func, ver=4)
+
+
+def unpad_input(hidden_states, attention_mask, unused_mask=None):
+    """
+    Arguments:
+        hidden_states: (batch, seqlen, ...)
+        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
+        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
+    Return:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
+        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
+        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
+        max_seqlen_in_batch: int
+        seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
+    """
+    all_masks = (
+        (attention_mask + unused_mask) if unused_mask is not None else attention_mask
+    )
+    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
+    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
+    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
+    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
+    # index with integer indices.
+    return (
+        rearrange(hidden_states, "b s ... -> (b s) ...")[indices],
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+        used_seqlens_in_batch,
+    )
+
+
+def pad_input(hidden_states, indices, batch, seqlen):
+    """
+    Arguments:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
+        batch: int, batch size for the padded sequence.
+        seqlen: int, maximum sequence length for the padded sequence.
+    Return:
+        hidden_states: (batch, seqlen, ...)
+    """
+    dim = hidden_states.shape[1:]
+    output = torch.zeros(
+        (batch * seqlen), *dim, device=hidden_states.device, dtype=hidden_states.dtype
+    )
+    output[indices] = hidden_states
+    return rearrange(output, "(b s) ... -> b s ...", b=batch)
+
+
+def generate_random_padding_mask(
+    max_seqlen, batch_size, device, mode="random", zero_lengths=False
+):
+    assert mode in ["full", "random", "third"]
+    if mode == "full":
+        lengths = torch.full(
+            (batch_size, 1), max_seqlen, device=device, dtype=torch.int32
+        )
+    elif mode == "random":
+        lengths = torch.randint(
+            max(0 if zero_lengths else 1, max_seqlen - 20),
+            max_seqlen + 1,
+            (batch_size, 1),
+            device=device,
+        )
+    elif mode == "third":
+        lengths = torch.randint(
+            max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device
+        )
+
+    if zero_lengths:
+        # Generate zero-lengths every 5 batches and the last batch.
+        for i in range(batch_size):
+            if i % 5 == 0:
+                lengths[i] = 0
+        lengths[-1] = 0
+    padding_mask = (
+        repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size)
+        < lengths
+    )
+    return padding_mask
+
+
+def generate_qkv(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    qv=None,
+    kvpacked=False,
+    qkvpacked=False,
+    query_unused_mask=None,
+    key_unused_mask=None,
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, d)
+        k: (batch_size, seqlen_k, nheads_k, d)
+        v: (batch_size, seqlen_k, nheads_k, d_v)
+        query_padding_mask: (batch_size, seqlen), bool
+        key_padding_mask: (batch_size, seqlen), bool
+    """
+    assert not (kvpacked and qkvpacked)
+    batch_size, seqlen_q, nheads, d = q.shape
+    d_v = v.shape[-1]
+    _, seqlen_k, nheads_k, _ = k.shape
+    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
+    assert v.shape == (batch_size, seqlen_k, nheads_k, d_v)
+    if query_unused_mask is not None or key_unused_mask is not None:
+        assert not kvpacked
+        assert not qkvpacked
+
+    if query_padding_mask is not None:
+        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q, seqused_q = unpad_input(
+            q, query_padding_mask, query_unused_mask
+        )
+        output_pad_fn = lambda output_unpad: pad_input(
+            output_unpad, indices_q, batch_size, seqlen_q
+        )
+        qv_unpad = (
+            rearrange(qv, "b s ... -> (b s) ...")[indices_q] if qv is not None else None
+        )
+    else:
+        q_unpad = rearrange(q, "b s h d -> (b s) h d")
+        cu_seqlens_q = torch.arange(
+            0,
+            (batch_size + 1) * seqlen_q,
+            step=seqlen_q,
+            dtype=torch.int32,
+            device=q_unpad.device,
+        )
+        seqused_q = None
+        max_seqlen_q = seqlen_q
+        output_pad_fn = lambda output_unpad: rearrange(
+            output_unpad, "(b s) h d -> b s h d", b=batch_size
+        )
+        qv_unpad = rearrange(qv, "b s ... -> (b s) ...") if qv is not None else None
+
+    if key_padding_mask is not None:
+        k_unpad, indices_k, cu_seqlens_k, max_seqlen_k, seqused_k = unpad_input(
+            k, key_padding_mask, key_unused_mask
+        )
+        v_unpad, *rest = unpad_input(v, key_padding_mask, key_unused_mask)
+    else:
+        k_unpad = rearrange(k, "b s h d -> (b s) h d")
+        v_unpad = rearrange(v, "b s h d -> (b s) h d")
+        cu_seqlens_k = torch.arange(
+            0,
+            (batch_size + 1) * seqlen_k,
+            step=seqlen_k,
+            dtype=torch.int32,
+            device=k_unpad.device,
+        )
+        seqused_k = None
+        max_seqlen_k = seqlen_k
+
+    if qkvpacked:
+        assert (query_padding_mask == key_padding_mask).all()
+        assert nheads == nheads_k
+        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
+        qkv = torch.stack([q, k, v], dim=2)
+        if query_padding_mask is not None:
+            dqkv_pad_fn = lambda dqkv_unpad: pad_input(
+                dqkv_unpad, indices_q, batch_size, seqlen_q
+            )
+        else:
+            dqkv_pad_fn = lambda dqkv_unpad: rearrange(
+                dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
+            )
+        return (
+            qkv_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            max_seqlen_q,
+            qkv.detach().requires_grad_(),
+            output_pad_fn,
+            dqkv_pad_fn,
+        )
+    elif kvpacked:
+        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
+        kv = torch.stack([k, v], dim=2)
+        dq_pad_fn = output_pad_fn
+        if key_padding_mask is not None:
+            dkv_pad_fn = lambda dkv_unpad: pad_input(
+                dkv_unpad, indices_k, batch_size, seqlen_k
+            )
+        else:
+            dkv_pad_fn = lambda dkv_unpad: rearrange(
+                dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size
+            )
+        return (
+            q_unpad.detach().requires_grad_(),
+            kv_unpad.detach().requires_grad_(),
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q.detach().requires_grad_(),
+            kv.detach().requires_grad_(),
+            output_pad_fn,
+            dq_pad_fn,
+            dkv_pad_fn,
+        )
+    else:
+        dq_pad_fn = output_pad_fn
+        if key_padding_mask is not None:
+            dk_pad_fn = lambda dk_unpad: pad_input(
+                dk_unpad, indices_k, batch_size, seqlen_k
+            )
+        else:
+            dk_pad_fn = lambda dk_unpad: rearrange(
+                dk_unpad, "(b s) h d -> b s h d", b=batch_size
+            )
+        return (
+            q_unpad.detach().requires_grad_(),
+            k_unpad.detach().requires_grad_(),
+            v_unpad.detach().requires_grad_(),
+            qv_unpad.detach() if qv is not None else None,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            seqused_q,
+            seqused_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q.detach().requires_grad_(),
+            k.detach().requires_grad_(),
+            v.detach().requires_grad_(),
+            qv.detach() if qv is not None else None,
+            output_pad_fn,
+            dq_pad_fn,
+            dk_pad_fn,
+        )
+
+
+def construct_local_mask(
+    seqlen_q,
+    seqlen_k,
+    window_size=(None, None),
+    sink_token_length=0,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    key_leftpad=None,
+    device=None,
+):
+    row_idx = rearrange(
+        torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1"
+    )
+    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
+    if key_leftpad is not None:
+        key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1")
+        col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0])
+        col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32)
+    sk = (
+        seqlen_k
+        if key_padding_mask is None
+        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    sq = (
+        seqlen_q
+        if query_padding_mask is None
+        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    if window_size[0] is None:
+        return col_idx > row_idx + sk - sq + window_size[1]
+    else:
+        sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
+        return torch.logical_or(
+            col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
+            torch.logical_and(
+                col_idx < row_idx + sk - sq - window_size[0],
+                col_idx >= sink_token_length,
+            ),
+        )
+
+
+def construct_chunk_mask(
+    seqlen_q,
+    seqlen_k,
+    attention_chunk,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    key_leftpad=None,
+    device=None,
+):
+    row_idx = rearrange(
+        torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1"
+    )
+    col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long)
+    if key_leftpad is not None:
+        key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1")
+        col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0])
+        col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32)
+    sk = (
+        seqlen_k
+        if key_padding_mask is None
+        else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    sq = (
+        seqlen_q
+        if query_padding_mask is None
+        else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1")
+    )
+    sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
+    # Subtract remainder instead of divide and then multiply to take care of negative values
+    col_limit_left_chunk = row_idx + sk - sq - (row_idx + sk - sq) % attention_chunk
+    return torch.logical_or(
+        col_idx < col_limit_left_chunk,
+        col_idx >= col_limit_left_chunk + attention_chunk,
+    )
+
+
+def attention_ref(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    key_leftpad=None,
+    attn_bias=None,
+    dropout_p=0.0,
+    dropout_mask=None,
+    causal=False,
+    qv=None,
+    q_descale=None,
+    k_descale=None,
+    v_descale=None,
+    window_size=(None, None),
+    attention_chunk=0,
+    sink_token_length=0,
+    learnable_sink=None,
+    softcap=0.0,
+    upcast=True,
+    reorder_ops=False,
+    intermediate_dtype=None,
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, head_dim)
+        k: (batch_size, seqlen_k, nheads, head_dim)
+        v: (batch_size, seqlen_k, nheads, head_dim_v)
+        qv: (batch_size, seqlen_q, nheads, head_dim_v)
+        query_padding_mask: (batch_size, seqlen_q)
+        key_padding_mask: (batch_size, seqlen_k)
+        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
+        dropout_p: float
+        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
+        causal: whether to apply causal masking
+        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
+            output back to fp16/bf16.
+        reorder_ops: whether to change the order of operations (scaling k instead of scaling k, etc.)
+            without changing the math. This is to estimate the numerical error from operation
+            reordering.
+    Output:
+        output: (batch_size, seqlen_q, nheads, head_dim_v)
+        attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout
+    """
+    if causal:
+        window_size = (window_size[0], 0)
+    dtype_og = q.dtype
+    if upcast:
+        q, k, v = q.float(), k.float(), v.float()
+        qv = qv.float() if qv is not None else None
+    if q_descale is not None:
+        q_descale = repeat(q_descale, "b h -> b 1 (h g) 1", g=q.shape[2] // k.shape[2])
+        q = (q.float() * q_descale).to(q.dtype)
+        qv = (qv.float() * q_descale).to(qv.dtype) if qv is not None else None
+    if k_descale is not None:
+        k = (k.float() * rearrange(k_descale, "b h -> b 1 h 1")).to(dtype=k.dtype)
+    if v_descale is not None:
+        v = (v.float() * rearrange(v_descale, "b h -> b 1 h 1")).to(dtype=v.dtype)
+    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
+    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
+    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
+    d = q.shape[-1]
+    dv = v.shape[-1]
+    softmax_scale = 1.0 / math.sqrt(d if qv is None else d + dv)
+    if not reorder_ops:
+        scores = torch.einsum("bthd,bshd->bhts", q * softmax_scale, k)
+    else:
+        scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
+    if qv is not None:
+        scores = scores + torch.einsum("bthd,bshd->bhts", qv * softmax_scale, v)
+    if softcap > 0:
+        scores = torch.tanh(scores / softcap) * softcap
+    if key_padding_mask is not None:
+        scores.masked_fill_(
+            rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")
+        )
+    local_mask = None
+    if window_size[0] is not None or window_size[1] is not None:
+        local_mask = construct_local_mask(
+            seqlen_q,
+            seqlen_k,
+            window_size,
+            sink_token_length,
+            query_padding_mask,
+            key_padding_mask,
+            key_leftpad=key_leftpad,
+            device=q.device,
+        )
+    if attention_chunk > 0:
+        chunk_mask = construct_chunk_mask(
+            seqlen_q,
+            seqlen_k,
+            attention_chunk,
+            query_padding_mask,
+            key_padding_mask,
+            key_leftpad=key_leftpad,
+            device=q.device,
+        )
+        local_mask = (
+            torch.logical_or(local_mask, chunk_mask)
+            if local_mask is not None
+            else chunk_mask
+        )
+    if local_mask is not None:
+        scores.masked_fill_(local_mask, float("-inf"))
+    if attn_bias is not None:
+        scores = scores + attn_bias
+    if learnable_sink is None:
+        attention = torch.softmax(scores, dim=-1).to(v.dtype)
+    else:
+        scores_fp32 = scores.to(torch.float32)
+        logits_max = torch.amax(scores_fp32, dim=-1, keepdim=True)
+        learnable_sink = rearrange(learnable_sink, "h -> h 1 1")
+        logits_or_sinks_max = torch.maximum(learnable_sink, logits_max)
+        unnormalized_scores = torch.exp(scores_fp32 - logits_or_sinks_max)
+        normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + torch.exp(
+            learnable_sink - logits_or_sinks_max
+        )
+        attention = (unnormalized_scores / normalizer).to(v.dtype)
+    # We want to mask here so that the attention matrix doesn't have any NaNs
+    # Otherwise we'll get NaN in dV
+    if query_padding_mask is not None:
+        attention = attention.masked_fill(
+            rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0
+        )
+    # Without this we might get NaN in dv
+    if key_padding_mask is not None:
+        attention = attention.masked_fill(
+            rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0
+        )
+    # Some rows might be completely masked out so we fill them with zero instead of NaN
+    if local_mask is not None:
+        attention = attention.masked_fill(
+            torch.all(local_mask, dim=-1, keepdim=True), 0.0
+        )
+    dropout_scaling = 1.0 / (1 - dropout_p)
+    # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
+    # output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
+    if dropout_mask is not None:
+        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
+    else:
+        attention_drop = attention
+    if intermediate_dtype is not None:
+        attention_drop = attention_drop.to(intermediate_dtype).to(attention_drop.dtype)
+    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
+    if query_padding_mask is not None:
+        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
+    return output.to(dtype=dtype_og), attention.to(dtype=dtype_og)
+
+
+@pytest.mark.skipif(
+    is_hopper(),
+    reason="skip on hopper",
+)
+# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float8_e4m3fn])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"])
+# @pytest.mark.parametrize("mha_type", ["mqa"])
+@pytest.mark.parametrize("has_learnable_sink", [False, True])
+# @pytest.mark.parametrize("has_learnable_sink", [False])
+# @pytest.mark.parametrize("has_qv", [False, True])
+@pytest.mark.parametrize("has_qv", [False])
+# @pytest.mark.parametrize("deterministic", [False, True])
+@pytest.mark.parametrize("deterministic", [False])
+# @pytest.mark.parametrize("softcap", [0.0, 15.0])
+@pytest.mark.parametrize("softcap", [0.0])
+@pytest.mark.parametrize("local", [False, True])
+# @pytest.mark.parametrize("local", [False])
+@pytest.mark.parametrize("causal", [False, True])
+# @pytest.mark.parametrize("causal", [False])
+# @pytest.mark.parametrize("add_unused_qkv", [False, True])
+@pytest.mark.parametrize("add_unused_qkv", [False])
+# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
+# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192, 256])
+# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192])
+# @pytest.mark.parametrize('d', [56, 80])
+# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128])
+# @pytest.mark.parametrize("d", [64, 96, 128])
+@pytest.mark.parametrize("d", [128, 192])
+# @pytest.mark.parametrize("d", [192])
+@pytest.mark.parametrize(
+    "seqlen_q,seqlen_k",
+    [
+        # (1, 1),
+        # (1, 3),
+        # (2, 1),
+        (511, 1),
+        (3, 513),
+        (64, 128),
+        (128, 128),
+        (256, 256),
+        (113, 203),
+        (128, 217),
+        (113, 211),
+        (108, 256),
+        (256, 512),
+        (307, 256),
+        (640, 128),
+        (512, 256),
+        (1024, 1024),
+        (1023, 1024),
+        (1024, 1023),
+        (2048, 2048),
+    ],
+)
+def test_flash_attn_varlen_output(
+    seqlen_q,
+    seqlen_k,
+    d,
+    add_unused_qkv,
+    causal,
+    local,
+    softcap,
+    deterministic,
+    has_qv,
+    has_learnable_sink,
+    mha_type,
+    dtype,
+):
+    if (
+        causal or local
+    ):  # Right now we only support causal attention with seqlen_k == seqlen_q
+        seqlen_k = seqlen_q
+    device = "cuda"
+    # set seed
+    torch.random.manual_seed(seqlen_q + seqlen_k + d + int(causal) * 2 + int(local))
+    batch_size = 49 if seqlen_q <= 1024 else 7
+    nheads = 6
+    # batch_size = 1
+    # nheads = 1
+    nheads_kv = nheads if mha_type == "mha" else (3 if mha_type == "gqa" else 1)
+    dtype_ref = torch.bfloat16 if dtype == torch.float8_e4m3fn else dtype
+    # dv_vals = [128, d] if d > 128 and d <= 192 else ([256, 512, d] if d <= 64 else [d])
+    dv_vals = [128] if d == 192 else ([d] if d != 128 else [64, d])
+    if dtype == torch.float8_e4m3fn:
+        dv_vals = [d]
+    # attention_chunk_vals = [torch.randint(1, seqlen_k * 2, (1,)).item(), 0] if seqlen_q <= seqlen_k else [0]
+    attention_chunk_vals = [0]
+    for dv, attention_chunk in itertools.product(dv_vals, attention_chunk_vals):
+        q_ref = torch.randn(
+            batch_size, seqlen_q, nheads, d, device=device, dtype=dtype_ref
+        )
+        if softcap > 0.0:
+            # Ensure the values of qk are at least within softcap range.
+            q_ref = (q_ref * softcap / 4).detach().requires_grad_()
+        q_ref = q_ref.to(dtype).to(dtype_ref).requires_grad_()
+        k_ref = (
+            torch.randn(
+                batch_size, seqlen_k, nheads_kv, d, device=device, dtype=dtype_ref
+            )
+            .to(dtype)
+            .to(dtype_ref)
+            .requires_grad_()
+        )
+        v_ref = (
+            torch.randn(
+                batch_size, seqlen_k, nheads_kv, dv, device=device, dtype=dtype_ref
+            )
+            .to(dtype)
+            .to(dtype_ref)
+            .requires_grad_()
+        )
+        if has_qv:
+            qv_ref = (
+                torch.randn(
+                    batch_size, seqlen_q, nheads, dv, device=device, dtype=dtype_ref
+                )
+                .to(dtype)
+                .to(dtype_ref)
+            )
+        else:
+            qv_ref = None
+        # Put window_size after QKV randn so that window_size changes from test to test
+        window_size = (
+            (None, None) if not local else torch.randint(0, seqlen_k, (2,)).tolist()
+        )
+        if has_learnable_sink:
+            learnable_sink = torch.randn(nheads, dtype=torch.bfloat16, device=device)
+        else:
+            learnable_sink = None
+        if dtype == torch.float8_e4m3fn:
+            q_descale, k_descale, v_descale = [
+                torch.rand(batch_size, nheads_kv, device=device, dtype=torch.float32)
+                * 2
+                for _ in range(3)
+            ]
+        else:
+            q_descale, k_descale, v_descale = None, None, None
+        q, k, v = [x.detach().requires_grad_() for x in (q_ref, k_ref, v_ref)]
+        qv = qv_ref.detach() if has_qv else None
+        query_padding_mask = generate_random_padding_mask(
+            seqlen_q, batch_size, device, mode="random", zero_lengths=False
+        )
+        # TODO: test zero_lengths
+        key_padding_mask = generate_random_padding_mask(
+            # seqlen_k, batch_size, device, mode="random", zero_lengths=True
+            seqlen_k,
+            batch_size,
+            device,
+            mode="random",
+            zero_lengths=False,
+        )
+
+        def _gen_unused_masks(padding_mask, add_unused, max_seq_len, bs, device):
+            if add_unused:
+                another_mask = generate_random_padding_mask(max_seq_len, bs, device)
+                attn_mask = torch.logical_and(padding_mask, another_mask)
+                unused_mask = torch.logical_xor(
+                    torch.logical_or(padding_mask, another_mask), attn_mask
+                )
+            else:
+                attn_mask = padding_mask
+                unused_mask = None
+            return attn_mask, unused_mask
+
+        query_padding_mask, query_unused_mask = _gen_unused_masks(
+            query_padding_mask, add_unused_qkv, seqlen_q, batch_size, q.device
+        )
+        # query_padding_mask[:] = True
+        # query_unused_mask = None
+        key_padding_mask, key_unused_mask = _gen_unused_masks(
+            key_padding_mask, add_unused_qkv, seqlen_k, batch_size, k.device
+        )
+
+        if causal or local:
+            key_padding_mask = query_padding_mask
+
+        (
+            q_unpad,
+            k_unpad,
+            v_unpad,
+            qv_unpad,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            seqused_q,
+            seqused_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            q,
+            k,
+            v,
+            qv,
+            output_pad_fn,
+            dq_pad_fn,
+            dk_pad_fn,
+        ) = generate_qkv(
+            q,
+            k,
+            v,
+            query_padding_mask,
+            key_padding_mask,
+            qv=qv,
+            kvpacked=False,
+            query_unused_mask=query_unused_mask,
+            key_unused_mask=key_unused_mask,
+        )
+        q_unpad, k_unpad, v_unpad = [
+            x.detach().to(dtype).requires_grad_() for x in (q_unpad, k_unpad, v_unpad)
+        ]
+        out_ref, attn_ref = attention_ref(
+            q_ref,
+            k_ref,
+            v_ref,
+            query_padding_mask,
+            key_padding_mask,
+            causal=causal,
+            qv=qv_ref,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+            window_size=window_size,
+            attention_chunk=attention_chunk,
+            learnable_sink=learnable_sink,
+            softcap=softcap,
+        )
+        out_pt, attn_pt = attention_ref(
+            q_ref,
+            k_ref,
+            v_ref,
+            query_padding_mask,
+            key_padding_mask,
+            causal=causal,
+            qv=qv_ref,
+            q_descale=q_descale,
+            k_descale=k_descale,
+            v_descale=v_descale,
+            window_size=window_size,
+            attention_chunk=attention_chunk,
+            learnable_sink=learnable_sink,
+            softcap=softcap,
+            upcast=False,
+            reorder_ops=True,
+            intermediate_dtype=dtype if dtype == torch.float8_e4m3fn else None,
+        )
+
+        print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}")
+        print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}")
+
+        if query_unused_mask is not None:
+            q_zero_masking = rearrange(query_unused_mask, "b s -> b s 1 1")
+
+        # Numerical error if we just do any arithmetic on out_ref
+        fwd_atol = 2 * (out_ref + 0.3 - 0.3 - out_ref).abs().max().item()
+        rtol = 2 if softcap == 0.0 else 3
+
+        pack_gqa_vals = [False, True, None]
+        # num_splits_vals = [1, 3]
+        num_splits_vals = [1]
+        for pack_gqa, num_splits in itertools.product(pack_gqa_vals, num_splits_vals):
+            out_unpad, lse = flash_attn_varlen_func(
+                q_unpad,
+                k_unpad,
+                v_unpad,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=None,
+                max_seqlen_k=None,
+                # seqused_q=seqused_q,
+                # seqused_k=seqused_k,
+                causal=causal,
+                # qv=qv_unpad,
+                # q_descale=q_descale,
+                # k_descale=k_descale, v_descale=v_descale,
+                window_size=window_size,
+                # attention_chunk=attention_chunk,
+                sinks=learnable_sink,
+                softcap=softcap,
+                pack_gqa=pack_gqa,
+                return_softmax_lse=True,
+            )
+            out = output_pad_fn(out_unpad)
+            if query_unused_mask is not None:
+                out.masked_fill_(q_zero_masking, 0.0)
+            print(f"Output max diff: {(out - out_ref).abs().max().item()}")
+            print(f"Output mean diff: {(out - out_ref).abs().mean().item()}")
+            # if not causal:
+            #     print(f"LSE max diff: {(lse - lse_ref).abs().max().item()}")
+            # breakpoint()
+
+            # Check that FlashAttention's numerical error is at most 3x the numerical error
+            # of a Pytorch implementation.
+            assert (out - out_ref).abs().max().item() <= rtol * (
+                out_pt - out_ref
+            ).abs().max().item() + fwd_atol
+
+        if (
+            dtype != torch.float8_e4m3fn
+            and not has_qv
+            and not dv > 256
+            and not attention_chunk != 0
+            and dv == d
+            and not has_learnable_sink
+            and False
+        ):
+            g_unpad = torch.randn_like(out_unpad)
+            do_o = ((g_unpad.float() * out_unpad.float()).sum(-1)).transpose(-1, -2)
+            # import flash_attn_3_cuda
+            # dq_unpad, dk_unpad, dv_unpad, softmax_d, dq_accum, lse_log2 = flash_attn_3_cuda.bwd_varlen(
+            #     g_unpad,
+            #     q_unpad,
+            #     k_unpad,
+            #     v_unpad,
+            #     out_unpad,
+            #     lse,
+            #     None,
+            #     None,
+            #     None,
+            #     cu_seqlens_q,
+            #     cu_seqlens_k,
+            #     None, None,
+            #     max_seqlen_q,
+            #     max_seqlen_k,
+            #     d ** (-0.5),
+            #     causal,
+            #     window_size[0], window_size[1],
+            #     softcap,
+            #     deterministic,
+            #     0,  # sm_margin
+            # )
+            dq_unpad, dk_unpad, dv_unpad = torch.autograd.grad(
+                out_unpad, (q_unpad, k_unpad, v_unpad), g_unpad
+            )
+            dq = dq_pad_fn(dq_unpad)
+            dk = dk_pad_fn(dk_unpad)
+            dv = dk_pad_fn(dv_unpad)
+            if key_unused_mask is not None:
+                k_zero_masking = rearrange(key_unused_mask, "b s -> b s 1 1")
+                dk.masked_fill_(k_zero_masking, 0.0)
+                dv.masked_fill_(k_zero_masking, 0.0)
+            if query_unused_mask is not None:
+                dq.masked_fill_(q_zero_masking, 0.0)
+            # print(f"dO_O max diff: {(softmax_d - do_o).abs().max().item()}")
+            # assert (softmax_d - do_o).abs().max().item() <= 1e-5
+            # assert dq_accum.abs().max().item() == 0.0
+            g = output_pad_fn(g_unpad)
+
+            # qk = torch.einsum('bthd,bshd->bhts', q / (d ** 0.5), k).float()
+            # qk = torch.masked_fill(qk, rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf"))
+            # dS = torch.einsum('bthd,bshd->bhts', g.float(), v.float())
+            # P = torch.softmax(qk, -1)
+            # dP = P * (dS - (g.float() * out.float()).sum(-1).transpose(1, 2).unsqueeze(-1))
+            # dQ = torch.einsum('bhts,bshd->bthd', dP, k.float())
+            # dV = torch.einsum('bhts,bthd->bshd', P, g.float())
+            # dK = torch.einsum('bhts,bthd->bshd', dP, q.float())
+
+            # dq, dk, dv = torch.autograd.grad(out, (q, k, v), g)
+            dq_ref, dk_ref, dv_ref = torch.autograd.grad(
+                out_ref, (q_ref, k_ref, v_ref), g
+            )
+            dq_pt, dk_pt, dv_pt = torch.autograd.grad(out_pt, (q_ref, k_ref, v_ref), g)
+            print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}")
+            print(f"dK max diff: {(dk - dk_ref).abs().max().item()}")
+            print(f"dV max diff: {(dv - dv_ref).abs().max().item()}")
+            print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}")
+            print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}")
+            print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}")
+            print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}")
+            print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}")
+            print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}")
+            print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}")
+            print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}")
+            print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}")
+            # breakpoint()
+            dq_atol = 2 * (dq_ref + 0.3 - 0.3 - dq_ref).abs().max().item() + (
+                0 if softcap == 0 else 3e-4
+            )
+            assert (dq - dq_ref).abs().max().item() <= rtol * (
+                dq_pt - dq_ref
+            ).abs().max().item() + dq_atol
+            dk_atol = 2 * (dk_ref + 0.3 - 0.3 - dk_ref).abs().max().item() + (
+                0 if softcap == 0 else 3e-4
+            )
+            assert (dk - dk_ref).abs().max().item() <= rtol * (
+                dk_pt - dk_ref
+            ).abs().max().item() + dk_atol
+            dv_atol = 2 * (dv_ref + 0.3 - 0.3 - dv_ref).abs().max().item() + (
+                0 if softcap == 0 else 3e-4
+            )
+            assert (dv - dv_ref).abs().max().item() <= rtol * (
+                dv_pt - dv_ref
+            ).abs().max().item() + dv_atol
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_fp4_gemm.py b/sgl-kernel/tests/test_fp4_gemm.py
new file mode 100644
index 000000000..47401618b
--- /dev/null
+++ b/sgl-kernel/tests/test_fp4_gemm.py
@@ -0,0 +1,154 @@
+import pytest
+import torch
+from sgl_kernel import cutlass_scaled_fp4_mm, scaled_fp4_quant
+
+skip_condition = torch.cuda.get_device_capability() < (10, 0)
+
+DTYPES = [torch.float16, torch.bfloat16]
+# m, n, k
+SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)]
+PAD_SHAPES = [(150, 128, 64), (128, 128, 96)]
+SHAPES.extend(PAD_SHAPES)
+
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+kE2M1ToFloatArray = [
+    0.0,
+    0.5,
+    1.0,
+    1.5,
+    2.0,
+    3.0,
+    4.0,
+    6.0,
+]
+
+
+def e2m1_to_fp32(int4_value):
+    signBit = int4_value & 0x8
+    int4_absValue = int4_value & 0x7
+    float_result = kE2M1ToFloatArray[int4_absValue]
+    if signBit:
+        float_result = -float_result
+    return float_result
+
+
+def break_fp4_bytes(a, dtype):
+    assert a.dtype == torch.uint8
+    m, n = a.shape
+    a = a.flatten()
+    # Get upper 4 bits
+    highHalfByte = (a & 0xF0) >> 4
+    # Get lower 4 bits
+    lowHalfByte = a & 0x0F
+    fH = torch.tensor([e2m1_to_fp32(x) for x in highHalfByte]).to(a.device)
+    fL = torch.tensor([e2m1_to_fp32(x) for x in lowHalfByte]).to(a.device)
+    # [0xAB, 0xCD] -> [0xB, 0xA, 0xD, 0xC]
+    out = torch.stack((fL, fH), dim=-1).reshape(m, n * 2)
+    return out
+
+
+def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size):
+    sf_m, sf_k = a_sf_swizzled.shape
+    m_tiles = (m + 128 - 1) // 128
+    f = block_size * 4
+    k_tiles = (k + f - 1) // f
+    tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4))
+    tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
+    out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size)
+    return out[0:m, 0:k]
+
+
+def dequantize_to_dtype(
+    tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16
+):
+    """Dequantize the fp4 tensor back to high precision."""
+    # Two fp4 values are packed into one uint8.
+    assert tensor_fp4.dtype == torch.uint8
+    m, packed_k = tensor_fp4.shape
+    k = packed_k * 2
+    tensor_f32 = break_fp4_bytes(tensor_fp4, dtype)
+    tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size)
+    tensor_sf = tensor_sf.view(torch.float8_e4m3fn)
+    tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size)
+    tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale
+
+    # scale the tensor
+    out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k)
+    return out
+
+
+def get_ref_results(
+    a_fp4,
+    b_fp4,
+    a_sf,
+    b_sf,
+    a_global_scale,
+    b_global_scale,
+    m,
+    n,
+    dtype,
+    block_size,
+    device,
+):
+    _, m_k = a_fp4.shape
+    _, n_k = b_fp4.shape
+    assert m_k == n_k
+    a_in_dtype = dequantize_to_dtype(
+        a_fp4, a_sf, a_global_scale, dtype=dtype, device=device, block_size=block_size
+    )
+    b_in_dtype = dequantize_to_dtype(
+        b_fp4, b_sf, b_global_scale, dtype=dtype, device=device, block_size=block_size
+    )
+    return torch.matmul(a_in_dtype, b_in_dtype.t())
+
+
+@pytest.mark.skipif(
+    skip_condition, reason="Nvfp4 Requires compute capability of 10 or above."
+)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@torch.inference_mode()
+def test_nvfp4_gemm(
+    dtype: torch.dtype,
+    shape: tuple[int, int],
+) -> None:
+    m, n, packed_k = shape
+    k = packed_k * 2
+    block_size = 16
+    a_dtype = torch.randn((m, k), dtype=dtype, device="cuda")
+    b_dtype = torch.randn((n, k), dtype=dtype, device="cuda")
+
+    a_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(a_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
+    b_global_scale = (
+        (FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) / torch.amax(b_dtype.flatten(), dim=-1)
+    ).to(torch.float32)
+    alpha = 1.0 / (a_global_scale * b_global_scale)
+    a_fp4, a_scale_interleaved = scaled_fp4_quant(a_dtype, a_global_scale)
+    b_fp4, b_scale_interleaved = scaled_fp4_quant(b_dtype, b_global_scale)
+
+    expected_out = get_ref_results(
+        a_fp4,
+        b_fp4,
+        a_scale_interleaved,
+        b_scale_interleaved,
+        a_global_scale,
+        b_global_scale,
+        m,
+        n,
+        dtype,
+        block_size,
+        "cuda",
+    )
+    out = cutlass_scaled_fp4_mm(
+        a_fp4, b_fp4, a_scale_interleaved, b_scale_interleaved, alpha, dtype
+    )
+
+    torch.testing.assert_close(out, expected_out.to(dtype=dtype), atol=1e-1, rtol=1e-1)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_fp4_quantize.py b/sgl-kernel/tests/test_fp4_quantize.py
new file mode 100644
index 000000000..3e83e47ac
--- /dev/null
+++ b/sgl-kernel/tests/test_fp4_quantize.py
@@ -0,0 +1,261 @@
+import pytest
+import torch
+from sgl_kernel import (
+    scaled_fp4_grouped_quant,
+    scaled_fp4_quant,
+    silu_and_mul,
+    silu_and_mul_scaled_fp4_grouped_quant,
+)
+
+skip_condition = torch.cuda.get_device_capability() < (10, 0)
+
+DTYPES = [torch.float16, torch.bfloat16]
+SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
+PAD_SHAPES = [
+    (90, 64),
+    (150, 64),
+    (128, 48),
+    (128, 80),
+    (150, 80),
+    (90, 48),
+    (90, 128),
+    (150, 128),
+    (150, 48),
+    (90, 80),
+]
+
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+# E2M1 to float
+# 0111 -> 6
+# 0110 -> 4
+# 0101 -> 3
+# 0100 -> 2
+# 0011 -> 1.5
+# 0010 -> 1
+# 0001 -> 0.5
+# 0000 -> 0
+E2M1_TO_FLOAT32 = [
+    0.0,
+    0.5,
+    1.0,
+    1.5,
+    2.0,
+    3.0,
+    4.0,
+    6.0,
+    0.0,
+    -0.5,
+    -1.0,
+    -1.5,
+    -2.0,
+    -3.0,
+    -4.0,
+    -6.0,
+]
+BLOCK_SIZE = 16
+
+
+def cast_from_fp4(x, m, n):
+    # The fp4 values are packed in uint8 as [v_1st | v_2nd]
+    v_2nd = x & 0xF
+    v_1st = (x >> 4) & 0xF
+    c = torch.stack((v_2nd, v_1st), dim=-1)
+    out = torch.tensor([E2M1_TO_FLOAT32[x] for x in c.flatten()])
+    out = out.reshape(m, n).to(torch.float32)
+    return out
+
+
+def cast_to_fp4(x):
+    sign = torch.sign(x)
+    x = torch.abs(x)
+    x[(x >= 0.0) & (x <= 0.25)] = 0.0
+    x[(x > 0.25) & (x < 0.75)] = 0.5
+    x[(x >= 0.75) & (x <= 1.25)] = 1.0
+    x[(x > 1.25) & (x < 1.75)] = 1.5
+    x[(x >= 1.75) & (x <= 2.5)] = 2.0
+    x[(x > 2.5) & (x < 3.5)] = 3.0
+    x[(x >= 3.5) & (x <= 5.0)] = 4.0
+    x[x > 5.0] = 6.0
+    return x * sign
+
+
+def get_reciprocal(x):
+    if isinstance(x, torch.Tensor):
+        return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x)
+    elif isinstance(x, (float, int)):
+        return 0.0 if x == 0 else 1.0 / x
+    else:
+        raise TypeError("Input must be a float, int, or a torch.Tensor.")
+
+
+def ref_nvfp4_quant(x, global_scale):
+    assert global_scale.dtype == torch.float32
+    assert x.ndim == 2
+    m, n = x.shape
+    x = torch.reshape(x, (m, n // BLOCK_SIZE, BLOCK_SIZE))
+    vec_max = torch.max(torch.abs(x), dim=-1, keepdim=True)[0].to(torch.float32)
+    scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX))
+    scale = scale.to(torch.float8_e4m3fn).to(torch.float32)
+    output_scale = get_reciprocal(scale * get_reciprocal(global_scale))
+
+    scaled_x = x.to(torch.float32) * output_scale
+    clipped_x = torch.clamp(scaled_x, -6.0, 6.0).reshape(m, n)
+    return cast_to_fp4(clipped_x), scale.squeeze(-1)
+
+
+def recover_swizzled_scales(scale, m, n):
+    rounded_m = ((m + 128 - 1) // 128) * 128
+    scale_n = n // BLOCK_SIZE
+    rounded_n = ((scale_n + 4 - 1) // 4) * 4
+    # Recover the swizzled scaling factor to linear layout
+    tmp = torch.reshape(scale, (1, rounded_m // 128, rounded_n // 4, 32, 4, 4))
+    tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
+    result = torch.reshape(tmp, (rounded_m, rounded_n)).to(torch.float32)
+    return result[:m, :scale_n]
+
+
+@pytest.mark.skipif(
+    skip_condition, reason="Nvfp4 Requires compute capability of 10 or above."
+)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@torch.inference_mode()
+def test_quantize_to_fp4(
+    dtype: torch.dtype,
+    shape: tuple[int, int],
+) -> None:
+    torch.manual_seed(42)
+    torch.set_default_device("cuda:0")
+
+    m, n = shape
+
+    x = torch.randn((m, n), dtype=dtype)
+    tensor_amax = torch.abs(x).max().to(torch.float32)
+    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+    out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
+
+    out, out_scale = scaled_fp4_quant(x, global_scale)
+    scale_ans = recover_swizzled_scales(out_scale, m, n)
+    out_ans = cast_from_fp4(out, m, n)
+
+    torch.testing.assert_close(out_ans, out_ref)
+    torch.testing.assert_close(scale_ans, scale_ref)
+
+
+@pytest.mark.skipif(
+    skip_condition, reason="Nvfp4 Requires compute capability of 10 or above."
+)
+@pytest.mark.parametrize("pad_shape", PAD_SHAPES)
+@torch.inference_mode()
+def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
+    torch.manual_seed(42)
+    dtype = torch.float16
+    torch.set_default_device("cuda:0")
+
+    m, n = pad_shape
+
+    x = torch.randn((m, n), dtype=dtype)
+
+    tensor_amax = torch.abs(x).max().to(torch.float32)
+    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+    out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
+
+    out, out_scale = scaled_fp4_quant(x, global_scale)
+
+    scale_ans = recover_swizzled_scales(out_scale, m, n)
+    out_ans = cast_from_fp4(out, m, n)
+
+    torch.testing.assert_close(out_ans, out_ref)
+    torch.testing.assert_close(scale_ans, scale_ref)
+
+
+@pytest.mark.skipif(
+    skip_condition, reason="Nvfp4 Requires compute capability of 10 or above."
+)
+@pytest.mark.parametrize("shape", [(2, 512, 2048), (2, 100, 128), (2, 128, 96)])
+def test_quantize_to_fp4_grouped(shape):
+    torch.manual_seed(42)
+    torch.set_default_device("cuda:0")
+
+    l, m, k = shape
+    x = torch.randn((l, m, k), dtype=torch.bfloat16)
+    max_m = m // 2
+    assert max_m <= m
+    mask = torch.randint(1, max_m, (l,), dtype=torch.int32)
+    tensor_amax = x.abs().amax(dim=(1, 2)).to(torch.float32)
+    x_sf_global = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+    output, output_scales = scaled_fp4_grouped_quant(
+        x,
+        x_sf_global,
+        mask,
+    )
+    # output in logical (m, k, l), but its physical layout is (l, m, k).
+    # So permute first to (l, m, k).
+    output = output.permute(2, 0, 1)
+    # output_scale in logical (32, 4, rm, 4, rk, l), but its physical layout is (l, rm, rk, 32, 4, 4).
+    # So permute first to (l, rm, rk, 32, 4, 4).
+    padded_m = ((m + 128 - 1) // 128) * 128
+    output_scales = output_scales.permute(5, 2, 4, 0, 1, 3).view(l, padded_m, -1)
+    for i in range(l):
+        a_fp4, a_scale_interleaved = scaled_fp4_quant(x[i], x_sf_global[i])
+        torch.testing.assert_close(a_fp4[: mask[i]], output[i][: mask[i]])
+        # Recover swizzled scales to linear layout and drop padded values, so
+        # no extra checks on padding are needed.
+        scale_ref = recover_swizzled_scales(a_scale_interleaved, m, k)
+        scale_ans = recover_swizzled_scales(output_scales[i], m, k)
+        torch.testing.assert_close(scale_ref[: mask[i]], scale_ans[: mask[i]])
+
+
+@pytest.mark.skipif(
+    skip_condition, reason="Nvfp4 Requires compute capability of 10 or above."
+)
+@pytest.mark.parametrize("shape", [(32, 100, 2048), (32, 512, 2048), (6, 6144, 2048)])
+def test_silu_and_mul_quantize_to_fp4_grouped(shape):
+    torch.manual_seed(42)
+    torch.set_default_device("cuda:0")
+
+    l, m, k = shape
+    x = torch.randn((l, m, k * 2), dtype=torch.bfloat16)
+    max_m = m // 2
+    assert max_m <= m
+    mask = torch.randint(1, max_m, (l,), dtype=torch.int32)
+
+    ref_y = silu_and_mul(x)
+    tensor_amax = ref_y.abs().amax(dim=(1, 2)).to(torch.float32)
+    y_sf_global = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+    ref_output, ref_output_scales = scaled_fp4_grouped_quant(
+        ref_y,
+        y_sf_global,
+        mask,
+    )
+    output, output_scales = silu_and_mul_scaled_fp4_grouped_quant(
+        x,
+        y_sf_global,
+        mask,
+    )
+
+    # output in logical (m, k, l), but its physical layout is (l, m, k).
+    # So permute first to (l, m, k).
+    output = output.permute(2, 0, 1)
+    ref_output = ref_output.permute(2, 0, 1)
+
+    # output_scale in logical (32, 4, rm, 4, rk, l), but its physical layout is (l, rm, rk, 32, 4, 4).
+    # So permute first to (l, rm, rk, 32, 4, 4).
+    padded_m = ((m + 128 - 1) // 128) * 128
+    output_scales = output_scales.permute(5, 2, 4, 0, 1, 3).view(l, padded_m, -1)
+    ref_output_scales = ref_output_scales.permute(5, 2, 4, 0, 1, 3).view(
+        l, padded_m, -1
+    )
+
+    for i in range(l):
+        torch.testing.assert_close(ref_output[i, : mask[i]], output[i, : mask[i]])
+        # We need to recover the swizzled scales to linear layout before applying mask slice.
+        scale_ref = recover_swizzled_scales(ref_output_scales[i], m, k)
+        scale_ans = recover_swizzled_scales(output_scales[i], m, k)
+        torch.testing.assert_close(scale_ref[: mask[i]], scale_ans[: mask[i]])
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_fp8_blockwise_gemm.py b/sgl-kernel/tests/test_fp8_blockwise_gemm.py
new file mode 100644
index 000000000..4c1dde336
--- /dev/null
+++ b/sgl-kernel/tests/test_fp8_blockwise_gemm.py
@@ -0,0 +1,93 @@
+import os
+import random
+from typing import Optional, Type
+
+import pytest
+import torch
+from sgl_kernel import fp8_blockwise_scaled_mm
+
+
+def cdiv(a: int, b: int) -> int:
+    return -(a // -b)
+
+
+def scale_shape(shape, group_shape):
+    assert len(shape) == len(group_shape)
+    return tuple(cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
+
+
+def baseline_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: Type[torch.dtype],
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    # We treat N-dimensional group scaling as extended numpy-style broadcasting
+    # in numpy simply stretches dimensions with an extent of 1 to match the
+    # the target shape by repeating the data along that dimension (broadcasting)
+    # , we extend these semantics to say if the extent of a dimension in the
+    # source shape is not 1 and does not match the target shape we repeat each
+    # element along that dimension src_shape[dim] // target_shape[dim] times
+    # example if we have:
+    #       a = [[1, 2], and target_shape = (2, 4)
+    #            [3, 4]]
+    # then we would expand a to:
+    #       a = [[1, 1, 2, 2],
+    #            [3, 3, 4, 4]]
+    # NOTE this function this function does not explicitly broadcast dimensions
+    # with an extent of 1, since this can be done implicitly by pytorch
+    def group_broadcast(t, shape):
+        for i, s in enumerate(shape):
+            if t.shape[i] != s and t.shape[i] != 1:
+                assert s % t.shape[i] == 0
+                t = (
+                    t.unsqueeze(i + 1)
+                    .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :])
+                    .flatten(i, i + 1)
+                )
+        return t
+
+    scale_a = group_broadcast(scale_a, a.shape)
+    scale_b = group_broadcast(scale_b, b.shape)
+    output = torch.mm(
+        (scale_a * a.to(dtype=torch.float32)), (scale_b * b.to(dtype=torch.float32))
+    ).to(out_dtype)
+    if bias is not None:
+        output = output + bias
+    return output
+
+
+def _test_accuracy_once(M, N, K, out_dtype, device):
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+    a_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    a_fp8 = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    b_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    b_fp8 = b_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn).t()
+    scale_a_group_shape = (1, 128)
+    scale_b_group_shape = (128, 128)
+    scale_a_shape = scale_shape(a_fp8.shape, scale_a_group_shape)
+    scale_b_shape = scale_shape(b_fp8.shape, scale_b_group_shape)
+    scale_a = torch.randn(scale_a_shape, device=device, dtype=torch.float32) * 0.001
+    scale_b = torch.randn(scale_b_shape, device=device, dtype=torch.float32) * 0.001
+    scale_a = scale_a.t().contiguous().t()
+    scale_b = scale_b.t().contiguous().t()
+    o = baseline_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, out_dtype)
+    o1 = fp8_blockwise_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, out_dtype)
+    rtol = 0.02
+    atol = 1
+    torch.testing.assert_close(o, o1, rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize("M", [1, 3, 5, 127, 128, 512, 1024, 4096])
+@pytest.mark.parametrize("N", [128, 512, 1024, 4096, 8192, 14080])
+@pytest.mark.parametrize("K", [512, 1024, 4096, 8192, 14080, 16384])
+@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
+def test_accuracy(M, N, K, out_dtype):
+    _test_accuracy_once(M, N, K, out_dtype, "cuda")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_fp8_blockwise_moe.py b/sgl-kernel/tests/test_fp8_blockwise_moe.py
new file mode 100755
index 000000000..6f2279393
--- /dev/null
+++ b/sgl-kernel/tests/test_fp8_blockwise_moe.py
@@ -0,0 +1,221 @@
+import random
+from typing import Tuple
+
+import pytest
+import torch
+from sgl_kernel import fp8_blockwise_scaled_grouped_mm
+
+
+def cdiv(a: int, b: int) -> int:
+    return -(a // -b)
+
+
+def scale_shape(shape, group_shape):
+    return tuple(cdiv(shape[i], group_shape[i]) for i in range(len(group_shape)))
+
+
+def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
+        dtype=torch.float8_e4m3fn
+    )
+
+
+# Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+def calc_diff(x, y):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
+
+
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    pad_size = (128 - (n % 128)) % 128
+    x = torch.nn.functional.pad(x, (0, pad_size), value=0) if pad_size > 0 else x
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
+    return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
+
+
+def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def baseline_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: type[torch.dtype],
+) -> torch.Tensor:
+
+    def group_broadcast(t, shape):
+        for i, s in enumerate(shape):
+            if t.shape[i] != s and t.shape[i] != 1:
+                assert s % t.shape[i] == 0
+                t = (
+                    t.unsqueeze(i + 1)
+                    .expand(*t.shape[: i + 1], s // t.shape[i], *t.shape[i + 1 :])
+                    .flatten(i, i + 1)
+                )
+        return t
+
+    scale_a = group_broadcast(scale_a, a.shape)
+    scale_b = group_broadcast(scale_b, b.shape)
+
+    return torch.mm(
+        (scale_a * a.to(dtype=torch.float32)), (scale_b * b.to(dtype=torch.float32))
+    ).to(out_dtype)
+
+
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
+
+
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
+
+
+@pytest.mark.skipif(
+    not (is_sm100_supported() or is_sm90_supported()),
+    reason="fp8_blockwise_scaled_grouped_mm at sgl-kernel is only supported on sm100 or sm90",
+)
+@pytest.mark.parametrize("num_experts", [8, 16, 32, 64, 128])
+@pytest.mark.parametrize("out_dtype", [torch.half, torch.bfloat16])
+def test_fp8_blockwise_scaled_grouped_mm(num_experts, out_dtype):
+    device = "cuda"
+    alignment = 128
+    n_g = random.randint(1, 64) * 128
+    k_g = random.randint(1, 64) * 128
+
+    expert_offsets = torch.zeros((num_experts + 1), device=device, dtype=torch.int32)
+    problem_sizes = torch.zeros((num_experts, 3), device=device, dtype=torch.int32)
+    layout_sfa = torch.zeros((num_experts, 5), device=device, dtype=torch.int32)
+    layout_sfb = torch.zeros((num_experts, 5), device=device, dtype=torch.int32)
+
+    a_tensors = []
+    b_tensors = []
+    a_scales_tensors = []
+    b_scales_tensors = []
+    baseline_tensors = []
+
+    for g in range(num_experts):
+        m_g = random.randint(1, 256)
+        expert_offsets[g + 1] = expert_offsets[g] + m_g
+        problem_sizes[g][:] = torch.tensor([m_g, n_g, k_g], device=device)
+
+        a = torch.randn((m_g, k_g), device=device, dtype=out_dtype)  # (M, K):(K, 1)
+        b = torch.randn((n_g, k_g), device=device, dtype=out_dtype).t()  # (K, N):(1, K)
+
+        a_g, a_scale = per_token_cast_to_fp8(
+            a
+        )  # ag -- (M, K):(K, 1), a_scale() -- (M, k):(k, 1)
+        b_g, b_scale = per_block_cast_to_fp8(
+            b
+        )  # bg -- (K, N):(N, 1), b_scale() -- (k, n):(n, 1)
+        a_tensors.append(a_g)
+        b_tensors.append(b_g)
+        a_scales_tensors.append(a_scale)
+        b_scales_tensors.append(b_scale)
+
+        baseline = torch.mm(a, b)
+        baseline_tensors.append(baseline)
+    a_stack = torch.empty(
+        (expert_offsets[-1], k_g), device=device, dtype=torch.float8_e4m3fn
+    )
+    b_stack = torch.empty(
+        (num_experts, n_g, k_g), device=device, dtype=torch.float8_e4m3fn
+    )
+    a_scale_stack = torch.empty(
+        (expert_offsets[-1], (k_g // 128)), device=device, dtype=torch.float32
+    )
+    b_scale_stack = torch.empty(
+        (num_experts, n_g // 128, k_g // 128), device=device, dtype=torch.float32
+    )
+
+    for g in range(num_experts):
+        # Matrix A is Row-Major.
+        a_stack[expert_offsets[g] : expert_offsets[g + 1], :] = a_tensors[
+            g
+        ]  # a_stack[expert_offsets[g] : expert_offsets[g + 1], :] -- (M, K):(K, 1)
+        b_stack[g] = b_tensors[g].t()  # b_stack[g] -- (N, K):(K, 1)
+
+        # We need K-Major scale factor
+        a_scale_stack[expert_offsets[g] : expert_offsets[g + 1], :] = a_scales_tensors[
+            g
+        ]
+        b_scale_stack[g] = b_scales_tensors[
+            g
+        ].t()  # b_scale_stack[g] -- (k, n):(n, 1), we need transpose & contiguous later
+    b_stack = b_stack.transpose(1, 2)  # Transpose Matrix B to Column-Major.
+    b_scale_stack = b_scale_stack.transpose(1, 2)
+
+    c_out = torch.empty((expert_offsets[-1], n_g), device=device, dtype=out_dtype)
+    a_strides = torch.full(
+        (num_experts,), a_stack.stride(0), device=device, dtype=torch.int64
+    )
+    c_strides = torch.full(
+        (num_experts,), c_out.stride(0), device=device, dtype=torch.int64
+    )
+    workspace = torch.empty((1024 * 1024 * 1024), device=device, dtype=torch.uint8)
+    a_ptrs = torch.empty((num_experts,), device=device, dtype=torch.int64)
+    b_ptrs = torch.empty((num_experts,), device=device, dtype=torch.int64)
+    out_ptrs = torch.empty((num_experts,), device=device, dtype=torch.int64)
+    a_scales_ptrs = torch.empty((num_experts,), device=device, dtype=torch.int64)
+    b_scales_ptrs = torch.empty((num_experts,), device=device, dtype=torch.int64)
+
+    fp8_blockwise_scaled_grouped_mm(
+        c_out,
+        a_ptrs,
+        b_ptrs,
+        out_ptrs,
+        a_scales_ptrs,
+        b_scales_ptrs,
+        a_stack,
+        b_stack,
+        a_scale_stack,
+        b_scale_stack,
+        a_strides,
+        a_strides,
+        c_strides,
+        layout_sfa,
+        layout_sfb,
+        problem_sizes,
+        expert_offsets[:-1],
+        workspace,
+    )
+
+    for g in range(num_experts):
+        baseline = baseline_tensors[g]
+        actual = c_out[expert_offsets[g] : expert_offsets[g + 1]]
+        diff = calc_diff(actual, baseline)
+        assert diff < 0.001
+        print(
+            f"m_g={baseline.shape[0]} n_g={n_g} k_g={k_g} num_experts={num_experts}, out_dtype={out_dtype}, diff={diff:.5f}: OK"
+        )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_fp8_gemm.py b/sgl-kernel/tests/test_fp8_gemm.py
new file mode 100644
index 000000000..e70e62af2
--- /dev/null
+++ b/sgl-kernel/tests/test_fp8_gemm.py
@@ -0,0 +1,49 @@
+import pytest
+import torch
+from sgl_kernel import fp8_scaled_mm
+
+
+def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
+    o = torch.matmul(a.to(torch.float32), b.to(torch.float32))
+    o = o.to(torch.float32)
+    temp1 = o * scale_a.view(-1, 1)
+    temp2 = temp1 * scale_b.view(1, -1)
+    final = temp2.to(out_dtype)
+    if bias is not None:
+        final = final + bias.view(1, -1)
+    return final
+
+
+def _test_accuracy_once(M, N, K, with_bias, out_dtype, device):
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+    a_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    a_fp8 = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    b_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    b_fp8 = b_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    scale_a = torch.randn((M,), device=device, dtype=torch.float32) * 0.001
+    scale_b = torch.randn((N,), device=device, dtype=torch.float32) * 0.001
+    if with_bias:
+        bias = torch.randn((N,), device=device, dtype=out_dtype)
+    else:
+        bias = None
+    b_fp8 = b_fp8.t()
+    o = torch_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, out_dtype, bias)
+    o1 = fp8_scaled_mm(a_fp8, b_fp8, scale_a, scale_b, out_dtype, bias)
+    rtol = 0.02
+    atol = 1
+    torch.testing.assert_close(o, o1, rtol=rtol, atol=atol)
+    print(f"M={M}, N={N}, K={K}, with_bias={with_bias}, out_dtype={out_dtype}: OK")
+
+
+@pytest.mark.parametrize("M", [1, 128, 512, 1024, 4096])
+@pytest.mark.parametrize("N", [16, 128, 512, 1024, 4096])
+@pytest.mark.parametrize("K", [512, 1024, 4096, 8192, 16384])
+@pytest.mark.parametrize("with_bias", [True, False])
+@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
+def test_accuracy(M, N, K, with_bias, out_dtype):
+    _test_accuracy_once(M, N, K, with_bias, out_dtype, "cuda")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_gptq_kernel.py b/sgl-kernel/tests/test_gptq_kernel.py
new file mode 100644
index 000000000..0ef4fad2c
--- /dev/null
+++ b/sgl-kernel/tests/test_gptq_kernel.py
@@ -0,0 +1,131 @@
+import pytest
+import torch
+from sgl_kernel import gptq_gemm
+
+from sglang.srt.layers.quantization.utils import pack_cols, pack_rows
+
+
+def torch_dequantize(q_weight, q_zeros, scales, g_idx, use_shuffle, bit, K, N):
+    assert bit == 4, "Reference dequantization only supports 4-bit"
+    group_size = K // scales.shape[0]
+    pack_factor = 32 // bit
+
+    # unpack q_weight: (K//pack_factor, N) -> (K, N)
+    unpacked_q_weight = torch.empty(
+        q_weight.shape[0] * pack_factor,
+        q_weight.shape[1],
+        dtype=torch.uint8,
+        device=q_weight.device,
+    )
+    for i in range(pack_factor):
+        unpacked_q_weight[i::pack_factor, :] = (q_weight >> (i * 4)) & 0x0F
+
+    # unpack q_zeros: (num_groups, N//pack_factor) -> (num_groups, N)
+    unpacked_q_zeros = torch.empty(
+        q_zeros.shape[0],
+        q_zeros.shape[1] * pack_factor,
+        dtype=torch.uint8,
+        device=q_zeros.device,
+    )
+    for i in range(pack_factor):
+        unpacked_q_zeros[:, i::pack_factor] = (q_zeros >> (i * 4)) & 0x0F
+
+    unpacked_q_zeros += 1
+    unpacked_q_zeros = unpacked_q_zeros.to(scales.dtype)
+
+    scale_zeros = unpacked_q_zeros * scales  # (num_groups, N)
+
+    current_g_idx = torch.tensor(
+        [i // group_size for i in range(K)], dtype=torch.int32, device=q_weight.device
+    )
+
+    scale_mat = scales[current_g_idx]  # (K, N)
+    scale_zeros_mat = scale_zeros[current_g_idx]  # (K, N)
+
+    # dequant: weight * scale - scale_zeros
+    dequantized_b = unpacked_q_weight.to(scales.dtype) * scale_mat - scale_zeros_mat
+
+    return dequantized_b.reshape(K, N)
+
+
+def torch_gptq_gemm(
+    a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_shuffle, bit
+):
+    K, N = a.shape[1], b_q_weight.shape[1]
+
+    b_dequant = torch_dequantize(
+        b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_shuffle, bit, K, N
+    )
+    c = torch.matmul(a, b_dequant)
+    return c
+
+
+def _test_gptq_gemm_once(M, N, K, bit, group_size, use_shuffle, dtype, device="cuda"):
+
+    b_fp = torch.randn(K, N, dtype=dtype, device=device)
+
+    assert K % group_size == 0, "K must be divisible by group_size"
+    num_groups = K // group_size
+
+    if use_shuffle:
+        return
+    else:
+        g_idx = torch.tensor(
+            [i // group_size for i in range(K)], dtype=torch.int32, device=device
+        )
+        b_shuffled = b_fp[g_idx]
+
+    b_grouped = b_shuffled.reshape(num_groups, group_size, N)
+
+    b_max = torch.max(b_grouped, dim=1, keepdim=True)[0]
+    b_min = torch.min(b_grouped, dim=1, keepdim=True)[0]
+
+    scales = (b_max - b_min) / (2**bit - 1)
+    scales = scales.clamp(min=1e-6)
+
+    zeros_float = (-b_min / scales).round()
+
+    q_b = (
+        (b_grouped / scales + zeros_float).round().clamp(0, 2**bit - 1).to(torch.uint8)
+    )
+
+    q_zeros_unpacked = zeros_float.to(torch.uint8) - 1
+
+    b_q_weight = pack_rows(q_b.reshape(K, N), bit, K, N)
+
+    q_zeros_unpacked = q_zeros_unpacked.reshape(num_groups, N)
+    b_gptq_qzeros = pack_cols(q_zeros_unpacked, bit, num_groups, N)
+    b_gptq_scales = scales.squeeze(1)
+
+    a = torch.randn(M, K, dtype=dtype, device=device)
+
+    c_ref = torch_gptq_gemm(
+        a, b_q_weight, b_gptq_qzeros, b_gptq_scales, g_idx, use_shuffle, bit
+    )
+    c_out = gptq_gemm(
+        a, b_q_weight, b_gptq_qzeros, b_gptq_scales, g_idx, use_shuffle, bit
+    )
+
+    rtol = 4e-2
+    atol = 4e-2
+    torch.testing.assert_close(c_ref, c_out, rtol=rtol, atol=atol)
+    print(
+        f"✅ Test passed: M={M}, N={N}, K={K}, bit={bit}, group_size={group_size}, use_shuffle={use_shuffle}, dtype={dtype}"
+    )
+
+
+@pytest.mark.parametrize("M", [1, 8, 128])
+@pytest.mark.parametrize("N", [2048, 4096])
+@pytest.mark.parametrize("K", [2048, 4096])
+@pytest.mark.parametrize("bit", [4])
+@pytest.mark.parametrize("group_size", [128])
+@pytest.mark.parametrize("use_shuffle", [False])
+@pytest.mark.parametrize("dtype", [torch.float16])
+def test_gptq_gemm(M, N, K, bit, group_size, use_shuffle, dtype):
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+    _test_gptq_gemm_once(M, N, K, bit, group_size, use_shuffle, dtype, "cuda")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/sgl-kernel/tests/test_int8_gemm.py b/sgl-kernel/tests/test_int8_gemm.py
new file mode 100644
index 000000000..80f32cd02
--- /dev/null
+++ b/sgl-kernel/tests/test_int8_gemm.py
@@ -0,0 +1,48 @@
+import pytest
+import torch
+from sgl_kernel import int8_scaled_mm
+from utils import is_sm10x
+
+
+def to_int8(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
+    o = torch.matmul(a.to(torch.float32), b.to(torch.float32))
+    if bias is not None:
+        o = o.to(torch.float32) * scale_a.view(-1, 1) * scale_b.view(1, -1) + bias
+    else:
+        o = o.to(torch.float32) * scale_a.view(-1, 1) * scale_b.view(1, -1)
+    return o.to(out_dtype)
+
+
+def _test_accuracy_once(M, N, K, with_bias, out_dtype, device):
+    a = to_int8(torch.randn((M, K), device=device) * 5)
+    b = to_int8(torch.randn((N, K), device=device).t() * 5)
+    scale_a = torch.randn((M,), device="cuda", dtype=torch.float32)
+    scale_b = torch.randn((N,), device="cuda", dtype=torch.float32)
+    if with_bias:
+        bias = torch.randn((N,), device="cuda", dtype=out_dtype) * 10
+    else:
+        bias = None
+    o = int8_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    o1 = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+    torch.testing.assert_close(o, o1)
+
+
+@pytest.mark.skipif(
+    is_sm10x(),
+    reason="int8_scaled_mm is only supported on sm90 and lower",
+)
+@pytest.mark.parametrize("M", [1, 16, 32, 64, 128, 512, 1024, 4096, 8192])
+@pytest.mark.parametrize("N", [16, 128, 512, 1024, 4096, 8192, 16384])
+@pytest.mark.parametrize("K", [512, 1024, 4096, 8192, 16384])
+@pytest.mark.parametrize("with_bias", [True, False])
+@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16])
+def test_accuracy(M, N, K, with_bias, out_dtype):
+    _test_accuracy_once(M, N, K, with_bias, out_dtype, "cuda")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_kvcacheio.py b/sgl-kernel/tests/test_kvcacheio.py
new file mode 100644
index 000000000..07fcc2413
--- /dev/null
+++ b/sgl-kernel/tests/test_kvcacheio.py
@@ -0,0 +1,485 @@
+import pytest
+import torch
+from sgl_kernel.kvcacheio import (
+    transfer_kv_all_layer,
+    transfer_kv_all_layer_direct_lf_pf,
+    transfer_kv_all_layer_mla,
+    transfer_kv_direct,
+    transfer_kv_per_layer,
+    transfer_kv_per_layer_direct_pf_lf,
+    transfer_kv_per_layer_mla,
+)
+
+
+def ref_copy_with_indices(src_pool, dst_pool, src_indices, dst_indices):
+    dst_pool[dst_indices] = src_pool[src_indices].to(dst_pool.device)
+
+
+def ref_copy_with_indices_pf_direct(
+    src_pool, dst_pool, src_indices, dst_indices, page_size, layer_id, lf_to_pf=False
+):
+    if lf_to_pf:
+        for i in range(0, len(src_indices), page_size):
+            dst_pool[dst_indices[i] // page_size][layer_id] = src_pool[layer_id][
+                src_indices[i : i + page_size]
+            ].to(dst_pool.device)
+    else:
+        for i in range(0, len(src_indices), page_size):
+            dst_pool[layer_id][dst_indices[i : i + page_size]] = src_pool[
+                src_indices[i] // page_size
+            ][layer_id].to(dst_pool.device)
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("num_items_to_transfer", [1, 128, 1024])
+@pytest.mark.parametrize("page_size", [1, 16, 64])
+@pytest.mark.parametrize("item_size", [256])
+@pytest.mark.parametrize("total_items_in_pool", [10240])
+@pytest.mark.parametrize("is_mla", [False, True])
+@pytest.mark.parametrize("all_layers", [False, True])
+def test_transfer_kv(
+    dtype: torch.dtype,
+    num_items_to_transfer: int,
+    item_size: int,
+    page_size: int,
+    total_items_in_pool: int,
+    is_mla: bool,
+    all_layers: bool,
+):
+    """
+    Tests the per-layer transfer functions, treating tensors as memory pools.
+    """
+
+    original_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    device = "cuda"
+    torch.cuda.manual_seed(42)
+
+    num_layers = 4  # A small number of layers for pool creation
+
+    total_pages_in_pool = total_items_in_pool // page_size
+    num_pages_to_transfer = num_items_to_transfer // page_size
+    if num_pages_to_transfer == 0:
+        torch.set_default_dtype(original_dtype)
+        return
+    page_indices = torch.randperm(total_pages_in_pool, dtype=torch.int64)
+    src_indices_host = torch.cat(
+        [
+            torch.arange(p * page_size, (p + 1) * page_size)
+            for p in page_indices[:num_pages_to_transfer]
+        ]
+    )
+    src_indices_device = src_indices_host.to(device)
+    dst_indices_host = torch.cat(
+        [
+            torch.arange(p * page_size, (p + 1) * page_size)
+            for p in page_indices[num_pages_to_transfer : 2 * num_pages_to_transfer]
+        ]
+    )
+    dst_indices_device = dst_indices_host.to(device)
+
+    # Prepare memory pools based on whether it's an MLA case.
+    if is_mla:
+        src_pool_host = torch.randn(
+            num_layers, total_items_in_pool, item_size
+        ).pin_memory()
+        dst_pool_ref = torch.zeros_like(src_pool_host).to(device)
+        dst_pool_kernel = torch.zeros_like(dst_pool_ref)
+        dst_pool_direct = torch.zeros_like(dst_pool_ref)
+    else:
+        src_k_pool = torch.randn(
+            num_layers, total_items_in_pool, item_size
+        ).pin_memory()
+        src_v_pool = torch.randn(
+            num_layers, total_items_in_pool, item_size
+        ).pin_memory()
+        dst_k_pool_ref = torch.zeros_like(src_k_pool).to(device)
+        dst_v_pool_ref = torch.zeros_like(src_v_pool).to(device)
+        dst_k_pool_kernel = torch.zeros_like(dst_k_pool_ref)
+        dst_v_pool_kernel = torch.zeros_like(dst_v_pool_ref)
+        dst_k_pool_direct = torch.zeros_like(dst_k_pool_ref)
+        dst_v_pool_direct = torch.zeros_like(dst_v_pool_ref)
+
+    torch.cuda.synchronize()
+
+    # We will test the per-layer function on the first layer (index 0) of the pool.
+    layer_idx_to_test = 0
+
+    if is_mla:
+        if not all_layers:
+            ref_copy_with_indices(
+                src_pool_host[layer_idx_to_test],
+                dst_pool_ref[layer_idx_to_test],
+                src_indices_host,
+                dst_indices_device,
+            )
+            transfer_kv_per_layer_mla(
+                src_pool_host[layer_idx_to_test],
+                dst_pool_kernel[layer_idx_to_test],
+                src_indices_device,
+                dst_indices_device,
+                item_size=item_size * dtype.itemsize,
+            )
+            transfer_kv_direct(
+                [src_pool_host[layer_idx_to_test]],
+                [dst_pool_direct[layer_idx_to_test]],
+                src_indices_host,
+                dst_indices_device,
+                page_size=page_size,
+            )
+        else:
+            for layer_id in range(num_layers):
+                ref_copy_with_indices(
+                    src_pool_host[layer_id],
+                    dst_pool_ref[layer_id],
+                    src_indices_host,
+                    dst_indices_device,
+                )
+            src_layers_device = torch.tensor(
+                [src_pool_host[layer_id].data_ptr() for layer_id in range(num_layers)],
+                dtype=torch.uint64,
+                device=device,
+            )
+            dst_layers_device = torch.tensor(
+                [
+                    dst_pool_kernel[layer_id].data_ptr()
+                    for layer_id in range(num_layers)
+                ],
+                dtype=torch.uint64,
+                device=device,
+            )
+            transfer_kv_all_layer_mla(
+                src_layers_device,
+                dst_layers_device,
+                src_indices_device,
+                dst_indices_device,
+                item_size=item_size * dtype.itemsize,
+                num_layers=num_layers,
+            )
+            transfer_kv_direct(
+                [src_pool_host[layer_id] for layer_id in range(num_layers)],
+                [dst_pool_direct[layer_id] for layer_id in range(num_layers)],
+                src_indices_host,
+                dst_indices_device,
+                page_size=page_size,
+            )
+        torch.cuda.synchronize()
+        torch.testing.assert_close(dst_pool_kernel, dst_pool_ref)
+        torch.testing.assert_close(dst_pool_direct, dst_pool_ref)
+    else:
+        if not all_layers:
+            ref_copy_with_indices(
+                src_k_pool[layer_idx_to_test],
+                dst_k_pool_ref[layer_idx_to_test],
+                src_indices_host,
+                dst_indices_device,
+            )
+            ref_copy_with_indices(
+                src_v_pool[layer_idx_to_test],
+                dst_v_pool_ref[layer_idx_to_test],
+                src_indices_host,
+                dst_indices_device,
+            )
+            transfer_kv_per_layer(
+                src_k_pool[layer_idx_to_test],
+                dst_k_pool_kernel[layer_idx_to_test],
+                src_v_pool[layer_idx_to_test],
+                dst_v_pool_kernel[layer_idx_to_test],
+                src_indices_device,
+                dst_indices_device,
+                item_size=item_size * dtype.itemsize,
+            )
+            transfer_kv_direct(
+                [src_k_pool[layer_idx_to_test], src_v_pool[layer_idx_to_test]],
+                [
+                    dst_k_pool_direct[layer_idx_to_test],
+                    dst_v_pool_direct[layer_idx_to_test],
+                ],
+                src_indices_host,
+                dst_indices_device,
+                page_size=page_size,
+            )
+        else:
+            for layer_id in range(num_layers):
+                ref_copy_with_indices(
+                    src_k_pool[layer_id],
+                    dst_k_pool_ref[layer_id],
+                    src_indices_host,
+                    dst_indices_device,
+                )
+                ref_copy_with_indices(
+                    src_v_pool[layer_id],
+                    dst_v_pool_ref[layer_id],
+                    src_indices_host,
+                    dst_indices_device,
+                )
+
+            src_k_layers_device = torch.tensor(
+                [src_k_pool[layer_id].data_ptr() for layer_id in range(num_layers)],
+                dtype=torch.uint64,
+                device=device,
+            )
+            src_v_layers_device = torch.tensor(
+                [src_v_pool[layer_id].data_ptr() for layer_id in range(num_layers)],
+                dtype=torch.uint64,
+                device=device,
+            )
+            dst_k_layers_device = torch.tensor(
+                [
+                    dst_k_pool_kernel[layer_id].data_ptr()
+                    for layer_id in range(num_layers)
+                ],
+                dtype=torch.uint64,
+                device=device,
+            )
+            dst_v_layers_device = torch.tensor(
+                [
+                    dst_v_pool_kernel[layer_id].data_ptr()
+                    for layer_id in range(num_layers)
+                ],
+                dtype=torch.uint64,
+                device=device,
+            )
+            transfer_kv_all_layer(
+                src_k_layers_device,
+                dst_k_layers_device,
+                src_v_layers_device,
+                dst_v_layers_device,
+                src_indices_device,
+                dst_indices_device,
+                item_size=item_size * dtype.itemsize,
+                num_layers=num_layers,
+            )
+            transfer_kv_direct(
+                [src_k_pool[layer_id] for layer_id in range(num_layers)]
+                + [src_v_pool[layer_id] for layer_id in range(num_layers)],
+                [dst_k_pool_direct[layer_id] for layer_id in range(num_layers)]
+                + [dst_v_pool_direct[layer_id] for layer_id in range(num_layers)],
+                src_indices_host,
+                dst_indices_device,
+                page_size=page_size,
+            )
+        torch.cuda.synchronize()
+        torch.testing.assert_close(dst_k_pool_kernel, dst_k_pool_ref)
+        torch.testing.assert_close(dst_v_pool_kernel, dst_v_pool_ref)
+        torch.testing.assert_close(dst_k_pool_direct, dst_k_pool_ref)
+        torch.testing.assert_close(dst_v_pool_direct, dst_v_pool_ref)
+
+    torch.set_default_dtype(original_dtype)
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("num_items_to_transfer", [128, 1024, 8192])
+@pytest.mark.parametrize("page_size", [16, 64, 128])
+@pytest.mark.parametrize("item_size", [256])
+@pytest.mark.parametrize("total_items_in_pool", [20480])
+@pytest.mark.parametrize("is_mla", [False, True])
+@pytest.mark.parametrize("lf_to_pf", [False, True])
+def test_transfer_kv_pf_direct(
+    dtype: torch.dtype,
+    num_items_to_transfer: int,
+    item_size: int,
+    page_size: int,
+    total_items_in_pool: int,
+    is_mla: bool,
+    lf_to_pf: bool,
+):
+    original_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    device = "cuda"
+    torch.cuda.manual_seed(42)
+
+    num_layers = 4
+
+    total_pages_in_pool = total_items_in_pool // page_size
+    num_pages_to_transfer = num_items_to_transfer // page_size
+    if num_pages_to_transfer == 0:
+        torch.set_default_dtype(original_dtype)
+        return
+    page_indices = torch.randperm(total_pages_in_pool, dtype=torch.int64)
+    src_indices_host = torch.cat(
+        [
+            torch.arange(p * page_size, (p + 1) * page_size)
+            for p in page_indices[:num_pages_to_transfer]
+        ]
+    )
+    src_indices_device = src_indices_host.to(device)
+    dst_indices_host = torch.cat(
+        [
+            torch.arange(p * page_size, (p + 1) * page_size)
+            for p in page_indices[num_pages_to_transfer : 2 * num_pages_to_transfer]
+        ]
+    )
+    dst_indices_device = dst_indices_host.to(device)
+
+    # We will test the per-layer function on the first layer (index 0) of the pool.
+    layer_idx_to_test = 0
+
+    if lf_to_pf:
+        if is_mla:
+            src_pool = torch.randn(num_layers, total_items_in_pool, item_size).to(
+                device
+            )
+            src_pool_ptrs = [src_pool[i] for i in range(num_layers)]
+            dst_pool_ref = torch.zeros(
+                total_pages_in_pool, num_layers, page_size, item_size
+            ).pin_memory()
+            dst_pool_direct = torch.zeros_like(dst_pool_ref)
+            torch.cuda.synchronize()
+
+            transfer_kv_all_layer_direct_lf_pf(
+                src_pool_ptrs,
+                [dst_pool_direct],
+                src_indices_host,
+                dst_indices_host,
+                page_size,
+            )
+            for i in range(num_layers):
+                ref_copy_with_indices_pf_direct(
+                    src_pool,
+                    dst_pool_ref,
+                    src_indices_device,
+                    dst_indices_host,
+                    page_size,
+                    i,
+                    lf_to_pf=True,
+                )
+            torch.cuda.synchronize()
+            torch.testing.assert_close(dst_pool_direct, dst_pool_ref)
+
+        else:
+            src_k_pool = torch.randn(num_layers, total_items_in_pool, item_size).to(
+                device
+            )
+            src_k_pool_ptrs = [src_k_pool[i] for i in range(num_layers)]
+            src_v_pool = torch.randn(num_layers, total_items_in_pool, item_size).to(
+                device
+            )
+            src_v_pool_ptrs = [src_v_pool[i] for i in range(num_layers)]
+            dst_k_pool_ref = torch.zeros(
+                total_pages_in_pool, num_layers, page_size, item_size
+            ).pin_memory()
+            dst_v_pool_ref = torch.zeros_like(dst_k_pool_ref)
+            dst_k_pool_direct = torch.zeros_like(dst_k_pool_ref)
+            dst_v_pool_direct = torch.zeros_like(dst_v_pool_ref)
+            torch.cuda.synchronize()
+
+            transfer_kv_all_layer_direct_lf_pf(
+                src_k_pool_ptrs + src_v_pool_ptrs,
+                [dst_k_pool_direct, dst_v_pool_direct],
+                src_indices_host,
+                dst_indices_host,
+                page_size,
+            )
+            for i in range(num_layers):
+                ref_copy_with_indices_pf_direct(
+                    src_k_pool,
+                    dst_k_pool_ref,
+                    src_indices_device,
+                    dst_indices_host,
+                    page_size,
+                    i,
+                    lf_to_pf=True,
+                )
+                ref_copy_with_indices_pf_direct(
+                    src_v_pool,
+                    dst_v_pool_ref,
+                    src_indices_device,
+                    dst_indices_host,
+                    page_size,
+                    i,
+                    lf_to_pf=True,
+                )
+            torch.cuda.synchronize()
+            torch.testing.assert_close(dst_k_pool_direct, dst_k_pool_ref)
+            torch.testing.assert_close(dst_v_pool_direct, dst_v_pool_ref)
+    else:
+        if is_mla:
+            src_pool = torch.randn(
+                total_pages_in_pool, num_layers, page_size, item_size
+            ).pin_memory()
+
+            dst_pool_ref = torch.zeros(num_layers, total_items_in_pool, item_size).to(
+                device
+            )
+            dst_pool_direct = torch.zeros_like(dst_pool_ref)
+            dst_pool_direct_ptrs = [dst_pool_direct[i] for i in range(num_layers)]
+            torch.cuda.synchronize()
+
+            transfer_kv_per_layer_direct_pf_lf(
+                [src_pool],
+                [dst_pool_direct_ptrs[layer_idx_to_test]],
+                src_indices_host,
+                dst_indices_host,
+                layer_idx_to_test,
+                page_size,
+            )
+            ref_copy_with_indices_pf_direct(
+                src_pool,
+                dst_pool_ref,
+                src_indices_host,
+                dst_indices_device,
+                page_size,
+                layer_idx_to_test,
+                lf_to_pf=False,
+            )
+            torch.cuda.synchronize()
+            torch.testing.assert_close(dst_pool_direct, dst_pool_ref)
+        else:
+            src_k_pool = torch.randn(
+                total_pages_in_pool, num_layers, page_size, item_size
+            ).pin_memory()
+            src_v_pool = torch.randn(
+                total_pages_in_pool, num_layers, page_size, item_size
+            ).pin_memory()
+
+            dst_k_pool_ref = torch.zeros(num_layers, total_items_in_pool, item_size).to(
+                device
+            )
+            dst_k_pool_direct = torch.zeros_like(dst_k_pool_ref)
+            dst_k_pool_direct_ptrs = [dst_k_pool_direct[i] for i in range(num_layers)]
+
+            dst_v_pool_ref = torch.zeros_like(dst_k_pool_ref)
+            dst_v_pool_direct = torch.zeros_like(dst_v_pool_ref)
+            dst_v_pool_direct_ptrs = [dst_v_pool_direct[i] for i in range(num_layers)]
+            torch.cuda.synchronize()
+
+            transfer_kv_per_layer_direct_pf_lf(
+                [src_k_pool, src_v_pool],
+                [
+                    dst_k_pool_direct_ptrs[layer_idx_to_test],
+                    dst_v_pool_direct_ptrs[layer_idx_to_test],
+                ],
+                src_indices_host,
+                dst_indices_host,
+                layer_idx_to_test,
+                page_size,
+            )
+
+            ref_copy_with_indices_pf_direct(
+                src_k_pool,
+                dst_k_pool_ref,
+                src_indices_host,
+                dst_indices_device,
+                page_size,
+                layer_idx_to_test,
+                lf_to_pf=False,
+            )
+            ref_copy_with_indices_pf_direct(
+                src_v_pool,
+                dst_v_pool_ref,
+                src_indices_host,
+                dst_indices_device,
+                page_size,
+                layer_idx_to_test,
+                lf_to_pf=False,
+            )
+
+            torch.cuda.synchronize()
+            torch.testing.assert_close(dst_k_pool_direct, dst_k_pool_ref)
+            torch.testing.assert_close(dst_v_pool_direct, dst_v_pool_ref)
+    torch.set_default_dtype(original_dtype)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_lightning_attention_decode.py b/sgl-kernel/tests/test_lightning_attention_decode.py
new file mode 100644
index 000000000..f2f0ba258
--- /dev/null
+++ b/sgl-kernel/tests/test_lightning_attention_decode.py
@@ -0,0 +1,84 @@
+import pytest
+import torch
+from sgl_kernel import lightning_attention_decode
+
+
+def naive_lightning_attention_decode(q, k, v, past_kv, slope):
+    """Naive implementation of lightning attention decode"""
+    original_dtype = q.dtype
+    ratio = torch.exp(-slope)  # [h, 1, 1]
+
+    kv = past_kv
+    b, h, n, d = q.shape
+
+    output = []
+    for i in range(n):
+        kv = ratio * kv.to(torch.float32) + torch.einsum(
+            "... n d, ... n e -> ... d e",
+            k[:, :, i : i + 1],
+            v[:, :, i : i + 1],
+        )
+        qkv = torch.einsum(
+            "... n e, ... e d -> ... n d",
+            q[:, :, i : i + 1].to(torch.float32),
+            kv.to(torch.float32),
+        )
+        output.append(qkv)
+    output = torch.cat(output, dim=-2)
+
+    return output.to(original_dtype), kv
+
+
+configs = [
+    # (batch_size, num_heads, dim, embed_dim)
+    (1, 8, 64, 64),
+    (2, 8, 64, 64),
+    (1, 32, 32, 64),
+    (2, 32, 32, 64),
+    (4, 32, 64, 64),
+    (4, 32, 64, 64),
+    (16, 64, 96, 96),
+    (64, 64, 96, 96),
+]
+
+dtypes = [torch.float32, torch.float16, torch.bfloat16]
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.parametrize("dtype", dtypes)
+@pytest.mark.parametrize("batch_size,num_heads,dim,embed_dim", configs)
+def test_lightning_attention_decode(dtype, batch_size, num_heads, dim, embed_dim):
+    device = torch.device("cuda")
+
+    q = torch.randn(batch_size, num_heads, 1, dim, device=device, dtype=dtype)
+    k = torch.randn(batch_size, num_heads, 1, dim, device=device, dtype=dtype)
+    v = torch.randn(batch_size, num_heads, 1, embed_dim, device=device, dtype=dtype)
+    past_kv = torch.randn(batch_size, num_heads, dim, embed_dim, device=device)
+    slope = torch.randn(num_heads, 1, 1, device=device)
+
+    ref_output, ref_new_kv = naive_lightning_attention_decode(q, k, v, past_kv, slope)
+
+    output = torch.empty_like(ref_output)
+    new_kv = torch.empty_like(ref_new_kv)
+    lightning_attention_decode(q, k, v, past_kv, slope, output, new_kv)
+
+    rtol = 1e-2
+    atol = 1e-2
+
+    torch.testing.assert_close(
+        output,
+        ref_output,
+        rtol=rtol,
+        atol=atol,
+    )
+
+    torch.testing.assert_close(
+        new_kv,
+        ref_new_kv,
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_marlin_gemm.py b/sgl-kernel/tests/test_marlin_gemm.py
new file mode 100644
index 000000000..9c4d48a39
--- /dev/null
+++ b/sgl-kernel/tests/test_marlin_gemm.py
@@ -0,0 +1,121 @@
+import pytest
+import torch
+from sgl_kernel import gptq_marlin_gemm
+from sgl_kernel.scalar_type import scalar_types
+
+from sglang.srt.layers.quantization.marlin_utils import marlin_make_workspace
+from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
+
+MNK_FACTORS = [
+    (1, 1, 1),
+    (1, 4, 8),
+    (1, 7, 5),
+    (13, 17, 67),
+    (26, 37, 13),
+    (67, 13, 11),
+    (257, 13, 11),
+    (658, 13, 11),
+]
+
+
+# uint4 for awq
+# uint4b8 for gptq
+@pytest.mark.parametrize("k_chunk", [128])
+@pytest.mark.parametrize("n_chunk", [64, 256])
+@pytest.mark.parametrize("quant_type", [scalar_types.uint4, scalar_types.uint4b8])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
+@pytest.mark.parametrize("act_order", [False, True])
+@pytest.mark.parametrize("is_k_full", [False, True])
+@pytest.mark.parametrize("use_atomic_add", [False, True])
+@pytest.mark.parametrize("use_fp32_reduce", [False, True])
+def test_gptq_marlin_gemm(
+    k_chunk,
+    n_chunk,
+    quant_type,
+    group_size,
+    mnk_factors,
+    act_order,
+    is_k_full,
+    use_atomic_add,
+    use_fp32_reduce,
+):
+    m_factor, n_factor, k_factor = mnk_factors
+    has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
+
+    size_m = m_factor
+    size_k = k_chunk * k_factor
+    size_n = n_chunk * n_factor
+
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size == size_k:
+            return
+        if has_zp:
+            return
+
+    if size_k % group_size != 0:
+        return
+
+    a_input = torch.randn((size_m, size_k), dtype=torch.float16, device="cuda")
+    b_weight = torch.randn((size_k, size_n), dtype=torch.float16, device="cuda")
+
+    if has_zp:
+        # AWQ style, unsigned + runtime zero-point
+        if group_size == 16:
+            return
+        w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
+            b_weight, quant_type, group_size
+        )
+        g_idx = None
+        sort_indices = None
+        marlin_s2 = None
+    else:
+        # GPTQ style, unsigned + symmetric bias
+        if group_size == 16:
+            return
+        w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
+            b_weight, quant_type, group_size, act_order
+        )
+        marlin_zp = None
+        marlin_s2 = None
+
+    workspace = marlin_make_workspace(w_ref.device)
+
+    # marlin gemm
+    output = gptq_marlin_gemm(
+        a_input,
+        None,
+        marlin_q_w,
+        marlin_s,
+        marlin_s2,
+        marlin_zp,
+        g_idx,
+        sort_indices,
+        workspace,
+        quant_type,
+        a_input.shape[0],
+        b_weight.shape[1],
+        a_input.shape[1],
+        is_k_full=is_k_full,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=False,
+    )
+    # ref gemm
+    output_ref = torch.matmul(a_input, w_ref)
+
+    torch.cuda.synchronize()
+
+    max_diff = torch.mean(torch.abs(output - output_ref)) / torch.mean(
+        torch.abs(output_ref)
+    )
+
+    assert max_diff < 0.04
+
+
+if __name__ == "__main__":
+    import subprocess
+
+    subprocess.call(["pytest", "--tb=short", str(__file__)])
diff --git a/sgl-kernel/tests/test_marlin_repack.py b/sgl-kernel/tests/test_marlin_repack.py
new file mode 100644
index 000000000..508099106
--- /dev/null
+++ b/sgl-kernel/tests/test_marlin_repack.py
@@ -0,0 +1,148 @@
+import numpy as np
+import pytest
+import torch
+from sgl_kernel import awq_marlin_repack, gptq_marlin_repack
+from sgl_kernel.scalar_type import scalar_types
+
+from sglang.srt.layers.quantization.utils import (
+    gptq_quantize_weights,
+    pack_cols,
+    pack_rows,
+    quantize_weights,
+    sort_weights,
+)
+from sglang.test.test_marlin_utils import get_weight_perm, marlin_weights
+
+GPTQ_MARLIN_TILE = 16
+MARLIN_K_CHUNKS = [128]
+MARLIN_N_CHUNKS = [64, 256]
+
+MNK_FACTORS = [
+    (1, 1, 1),
+    (1, 4, 8),
+    (1, 7, 5),
+    (13, 17, 67),
+    (26, 37, 13),
+    (67, 13, 11),
+    (257, 13, 11),
+    (658, 13, 11),
+]
+
+
+def awq_pack(
+    q_w: torch.Tensor,
+    num_bits: int,
+    size_k: int,
+    size_n: int,
+):
+    assert q_w.shape == (size_k, size_n)
+
+    # Interleave column dim (for the dequantize code) and pack it to int32
+    if num_bits == 4:
+        interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = np.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    q_w = q_w.reshape((-1, len(interleave)))[:, interleave].ravel()
+    q_w = q_w.reshape((-1, size_n)).contiguous()
+
+    return pack_cols(q_w, num_bits, size_k, size_n)
+
+
+@pytest.mark.parametrize("num_bits", [4, 8])
+@pytest.mark.parametrize("k_tiles,n_tiles", [(1, 1), (2, 2)])
+@pytest.mark.parametrize("group_size", [16, 32])
+def test_awq_marlin_repack_correct(num_bits, k_tiles, n_tiles, group_size):
+    tile_k, tile_n = 16, 64
+    size_k = k_tiles * tile_k
+    size_n = n_tiles * tile_n
+    pack_factor = 32 // num_bits
+
+    b_weight = torch.randn((size_k, size_n), dtype=torch.float16, device="cuda")
+
+    w_ref, q_w, s, zp = quantize_weights(
+        b_weight, scalar_types.uint4, group_size, zero_points=True
+    )
+
+    q_w_awq = awq_pack(q_w, num_bits, size_k, size_n)
+
+    weight_perm = get_weight_perm(num_bits)
+    q_w_marlin = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
+
+    out_gpu = awq_marlin_repack(q_w_awq, size_k, size_n, num_bits)
+    assert out_gpu.is_cuda and out_gpu.dtype == torch.int32
+
+    expected_cols = size_n * tile_k // pack_factor
+    assert list(out_gpu.shape) == [size_k // tile_k, expected_cols]
+
+    torch.cuda.synchronize()
+
+    torch.testing.assert_close(out_gpu, q_w_marlin)
+
+
+@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
+@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
+@pytest.mark.parametrize("quant_type", [scalar_types.uint4b8])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.parametrize("act_order", [False, True])
+@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
+def test_gptq_marlin_repack(
+    k_chunk, n_chunk, quant_type, group_size, act_order, mnk_factors
+):
+    m_factor, n_factor, k_factor = mnk_factors
+
+    size_k = k_chunk * k_factor
+    size_n = n_chunk * n_factor
+
+    # Filter act_order
+    if act_order:
+        if group_size == -1:
+            return
+        if group_size == size_k:
+            return
+
+    # Normalize group_size
+    if group_size == -1:
+        group_size = size_k
+    assert group_size <= size_k
+
+    if size_k % group_size != 0:
+        pytest.skip("size_k must be divisible by group_size")
+
+    # Create input
+    b_weight = torch.randn((size_k, size_n), dtype=torch.float16, device="cuda")
+
+    # Quantize (and apply act_order if provided)
+    w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
+        b_weight, quant_type, group_size, act_order
+    )
+
+    q_w_gptq = pack_rows(q_w, quant_type.size_bits, size_k, size_n)
+
+    # For act_order, sort the "weights" and "g_idx" so that group ids are
+    # increasing
+    sort_indices = torch.empty(0, dtype=torch.int, device=b_weight.device)
+    if act_order:
+        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
+
+    marlin_layout_perm = get_weight_perm(quant_type.size_bits)
+    q_w_marlin_ref = marlin_weights(
+        q_w, size_k, size_n, quant_type.size_bits, marlin_layout_perm
+    )
+
+    # Run Marlin repack GPU kernel
+    q_w_marlin = gptq_marlin_repack(
+        q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits
+    )
+
+    torch.cuda.synchronize()
+
+    torch.testing.assert_close(q_w_marlin, q_w_marlin_ref)
+
+
+if __name__ == "__main__":
+    import subprocess
+
+    subprocess.call(["pytest", "--tb=short", str(__file__)])
diff --git a/sgl-kernel/tests/test_merge_state.py b/sgl-kernel/tests/test_merge_state.py
new file mode 100644
index 000000000..70b9628d9
--- /dev/null
+++ b/sgl-kernel/tests/test_merge_state.py
@@ -0,0 +1,142 @@
+# Adapted from https://github.com/flashinfer-ai/flashinfer/blob/55576c626421b5ee7e7ebe74afd26465c8ae863f/flashinfer/triton/kernels/cascade.py
+
+from typing import List
+
+import pytest
+import torch
+import triton
+import triton.language as tl
+from sgl_kernel import merge_state
+
+
+def check_input(x: torch.Tensor):
+    assert x.is_cuda, f"{str(x)} must be a CUDA Tensor"
+    assert x.is_contiguous(), f"{str(x)} must be contiguous"
+
+
+def check_dim(d, x: torch.Tensor):
+    assert x.dim() == d, f"{str(x)} must be a {d}D tensor"
+
+
+def check_shape(a: torch.Tensor, b: torch.Tensor):
+    assert a.dim() == b.dim(), "tensors should have same dim"
+    for i in range(a.dim()):
+        assert a.size(i) == b.size(
+            i
+        ), f"tensors shape mismatch, {a.size()} and {b.size()}"
+
+
+def check_device(tensors: List[torch.Tensor]):
+    device = tensors[0].device
+    for t in tensors:
+        assert (
+            t.device == device
+        ), f"All tensors should be on the same device, but got {device} and {t.device}"
+
+
+@triton.jit
+def state_merge(o, m, d, other_o, other_m, other_d):
+    m_max = tl.maximum(m, other_m)
+    d = d * tl.exp2(m - m_max) + other_d * tl.exp2(other_m - m_max)
+    o = o * tl.exp2(m - m_max) + other_o * tl.exp2(other_m - m_max)
+    return o, m_max, d
+
+
+@triton.jit
+def state_normalize(o, m, d):
+    o = o / d
+    return o, m, d
+
+
+@triton.jit
+def state_get_lse(o, m, d):
+    return m + tl.log2(d)
+
+
+@triton.jit
+def merge_state_kernel(
+    v_a_ptr,
+    s_a_ptr,
+    v_b_ptr,
+    s_b_ptr,
+    v_merged_ptr,
+    s_merged_ptr,
+    num_heads,
+    head_dim,
+    bdx: tl.constexpr,
+    bdy: tl.constexpr,
+):
+    pos = tl.program_id(axis=0)
+    for tx in tl.range(bdx):
+        for head_idx in tl.range(bdy):
+            s_a_val = tl.load(s_a_ptr + pos * num_heads + head_idx)
+            s_b_val = tl.load(s_b_ptr + pos * num_heads + head_idx)
+
+            offsets = (pos * num_heads + head_idx) * head_dim + tx
+            v_a = tl.load(v_a_ptr + offsets)
+            v_b = tl.load(v_b_ptr + offsets)
+
+            v_merged, s_max, d = state_merge(
+                o=v_a, m=s_a_val, d=1, other_o=v_b, other_m=s_b_val, other_d=1
+            )
+            v_merged, s_max, d = state_normalize(v_merged, s_max, d)
+            v_merged_offset = (pos * num_heads + head_idx) * head_dim + tx
+            tl.store(v_merged_ptr + v_merged_offset, v_merged)
+
+            if s_merged_ptr:
+                tl.store(
+                    s_merged_ptr + pos * num_heads + head_idx,
+                    tl.log2(d) + s_max,
+                )
+
+
+def merge_state_triton(
+    v_a: torch.Tensor, s_a: torch.Tensor, v_b: torch.Tensor, s_b: torch.Tensor
+):
+    check_input(v_a)
+    check_input(s_a)
+    check_input(v_b)
+    check_input(s_b)
+    check_device([v_a, s_a, v_b, s_b])
+    check_dim(3, v_a)
+    check_dim(2, s_a)
+    check_dim(3, v_b)
+    check_dim(2, s_b)
+    check_shape(v_a, v_b)
+    check_shape(s_a, s_b)
+    assert v_a.size(0) == s_a.size(0)
+    assert v_a.size(1) == s_b.size(1)
+    s_a = s_a.to(torch.float32)
+    s_b = s_b.to(torch.float32)
+    seq_len = v_a.size(0)
+    num_heads = v_a.size(1)
+    head_dim = v_a.size(2)
+    v_merged = torch.empty_like(v_a).to(s_a.device)
+    s_merged = torch.empty((seq_len, num_heads)).to(s_a.device)
+    bdx = head_dim
+    bdy = num_heads
+
+    merge_state_kernel[lambda meta: (seq_len,)](
+        v_a, s_a, v_b, s_b, v_merged, s_merged, num_heads, head_dim, bdx=bdx, bdy=bdy
+    )
+
+    return v_merged, s_merged
+
+
+@pytest.mark.parametrize("seq_len", [2048])
+@pytest.mark.parametrize("num_heads", [32])
+@pytest.mark.parametrize("head_dim", [128])
+def test_merge_state(seq_len, num_heads, head_dim):
+    va = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
+    sa = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
+    vb = torch.randn(seq_len, num_heads, head_dim).half().to("cuda:0")
+    sb = torch.randn(seq_len, num_heads, dtype=torch.float32).to("cuda:0")
+    v_merged, s_merged = merge_state_triton(va, sa, vb, sb)
+    v_merged_std, s_merged_std = merge_state(va, sa, vb, sb)
+
+    assert torch.allclose(v_merged, v_merged_std, atol=1e-2)
+    assert torch.allclose(s_merged, s_merged_std, atol=1e-2)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_merge_state_v2.py b/sgl-kernel/tests/test_merge_state_v2.py
new file mode 100644
index 000000000..62326f75c
--- /dev/null
+++ b/sgl-kernel/tests/test_merge_state_v2.py
@@ -0,0 +1,400 @@
+from typing import Optional
+
+import pytest
+import torch
+import triton
+import triton.language as tl
+from sgl_kernel import merge_state, merge_state_v2
+
+
+@triton.jit
+def merge_state_kernel(
+    output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_merged
+    output_lse,  # [NUM_TOKENS, NUM_HEADS] s_merged
+    prefix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_a
+    prefix_lse,  # [NUM_TOKENS, NUM_HEADS] s_a
+    suffix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_b
+    suffix_lse,  # [NUM_TOKENS, NUM_HEADS] s_b
+    HEAD_SIZE: tl.constexpr,
+    PADDED_HEAD_SIZE: tl.constexpr,
+    OUTPUT_LSE: tl.constexpr,
+):
+    token_idx = tl.program_id(0)
+    num_tokens = tl.num_programs(0)
+    head_idx = tl.program_id(1)
+    num_heads = tl.num_programs(1)
+
+    p_lse = tl.load(prefix_lse + token_idx * num_heads + head_idx)
+    s_lse = tl.load(suffix_lse + token_idx * num_heads + head_idx)
+    p_lse = float("-inf") if p_lse == float("inf") else p_lse
+    s_lse = float("-inf") if s_lse == float("inf") else s_lse
+
+    max_lse = tl.maximum(p_lse, s_lse)
+    p_lse = p_lse - max_lse
+    s_lse = s_lse - max_lse
+    out_se = tl.exp(p_lse) + tl.exp(s_lse)
+
+    if OUTPUT_LSE:
+        out_lse = tl.log(out_se) + max_lse
+        tl.store(output_lse + token_idx * num_heads + head_idx, out_lse)
+
+    head_arange = tl.arange(0, PADDED_HEAD_SIZE)
+    head_mask = head_arange < HEAD_SIZE
+    p_out = tl.load(
+        prefix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+    s_out = tl.load(
+        suffix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+
+    p_scale = tl.exp(p_lse) / out_se
+    s_scale = tl.exp(s_lse) / out_se
+    out = p_out * p_scale + s_out * s_scale
+    tl.store(
+        output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange,
+        out,
+        mask=head_mask,
+    )
+
+
+def merge_state_triton(
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output: Optional[torch.Tensor] = None,
+    output_lse: Optional[torch.Tensor] = None,
+) -> None:
+    num_tokens = output.shape[0]
+    num_query_heads = output.shape[1]
+    head_size = output.shape[2]
+    padded_head_size = triton.next_power_of_2(head_size)
+    # Avoid creating new tensors if they are already provided
+    if output is None:
+        output = torch.empty_like(prefix_output)
+    if output_lse is None:
+        output_lse = torch.empty_like(prefix_lse)
+
+    merge_state_kernel[(num_tokens, num_query_heads)](
+        output,
+        output_lse,
+        prefix_output,
+        prefix_lse,
+        suffix_output,
+        suffix_lse,
+        head_size,
+        padded_head_size,
+        output_lse is not None,
+    )
+    return output, output_lse
+
+
+# Naive PyTorch Implements of Merge Attn States
+def merge_state_torch(
+    prefix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    prefix_lse: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS]
+    suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    suffix_lse: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS]
+    output: Optional[torch.Tensor] = None,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    output_lse: Optional[torch.Tensor] = None,  # [NUM_TOKENS, NUM_HEADS]
+):
+    # Avoid creating new tensors if they are already provided
+    if output is None:
+        output = torch.empty_like(prefix_output)
+    if output_lse is None:
+        output_lse = torch.empty_like(prefix_lse)
+    p_lse = prefix_lse
+    s_lse = suffix_lse
+    # inf -> -inf
+    p_lse[p_lse == torch.inf] = -torch.inf
+    s_lse[s_lse == torch.inf] = -torch.inf
+    # max_lse [NUM_HEADS, NUM_TOKENS]
+    max_lse = torch.maximum(p_lse, s_lse)
+    p_lse = p_lse - max_lse
+    s_lse = s_lse - max_lse
+    p_lse_exp = torch.exp(p_lse)
+    s_lse_exp = torch.exp(s_lse)
+    out_se = p_lse_exp + s_lse_exp
+    if output_lse is not None:
+        output_lse = torch.log(out_se) + max_lse
+    p_scale = p_lse_exp / out_se
+    s_scale = s_lse_exp / out_se
+    p_scale = p_scale.unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
+    s_scale = s_scale.unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
+    output = prefix_output * p_scale + suffix_output * s_scale
+    return output, output_lse
+
+
+NUM_BATCH_TOKENS = [256, 512, 613, 1024, 1536]
+NUM_QUERY_HEADS = [8, 16, 32]
+HEAD_SIZES = [32, 48, 64, 128, 256]
+DTYPES = [torch.half, torch.bfloat16]
+
+all_case_info: list[tuple] = []
+
+
+def generate_markdown_table():
+    global all_case_info
+    table_header = (
+        "| tokens | heads | headsize | dtype "
+        "| device | torch | triton | v1 | v2 | speedup(vs triton) | speedup(vs v1)|"
+    )
+    table_separator = (
+        "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |"
+    )
+
+    def shortly_dtype(dtype: torch.dtype) -> str:
+        return str(dtype).removeprefix("torch.")
+
+    def shortly_device(device: str) -> str:
+        return device.removeprefix("NVIDIA").strip()
+
+    print(table_header)
+    print(table_separator)
+    for info in all_case_info:
+        (
+            num_tokens,
+            num_heads,
+            head_size,
+            dtype,
+            device,
+            time_torch,
+            time_triton,
+            time_v1,
+            time_v2,
+        ) = info
+        dtype = shortly_dtype(dtype)
+        device = shortly_device(device)
+        improved_triton = time_triton / time_v2
+        improved_v1 = time_v1 / time_v2
+        print(
+            f"| {num_tokens} | {num_heads} | {head_size} "
+            f"| {dtype} | {device} | {time_torch:.4f}ms "
+            f"| {time_triton:.4f}ms "
+            f"| {time_v1:.4f}ms "
+            f"| {time_v2:.4f}ms "
+            f"| {improved_triton:.4f}x "
+            f"| {improved_v1:.4f}x |"
+        )
+
+
+@pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS)
+@pytest.mark.parametrize("num_query_heads", NUM_QUERY_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("output_dtype", DTYPES)
+@torch.inference_mode()
+def test_merge_attn_states(
+    num_tokens: int, num_query_heads: int, head_size: int, output_dtype: torch.dtype
+):
+    if not torch.cuda.is_available():
+        pytest.skip(
+            "Currently only support compare triton merge_attn_states "
+            "with custom cuda merge_attn_states kernel"
+        )
+
+    NUM_TOKENS = num_tokens
+    NUM_HEADS = num_query_heads
+    HEAD_SIZE = head_size
+
+    print(
+        f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, "
+        f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, "
+        f"Device: {torch.cuda.get_device_name()}"
+    )
+
+    # prefix_lse and suffix_lse contain inf and normal values
+    prefix_lse = torch.randn(NUM_TOKENS, NUM_HEADS, dtype=torch.float32, device="cuda")
+    suffix_lse = torch.randn(NUM_TOKENS, NUM_HEADS, dtype=torch.float32, device="cuda")
+
+    # Generate boolean masks
+    mask_prefix = torch.rand(NUM_TOKENS, NUM_HEADS) < 0.1
+    mask_suffix = torch.rand(NUM_TOKENS, NUM_HEADS) < 0.1
+    # Ensure that the same position is not True at the same time
+    combined_mask = torch.logical_and(mask_prefix, mask_suffix)
+    mask_prefix = torch.logical_and(mask_prefix, ~combined_mask)
+    mask_suffix = torch.logical_and(mask_suffix, ~combined_mask)
+
+    prefix_lse[mask_prefix] = float("inf")
+    suffix_lse[mask_suffix] = float("inf")
+
+    # Other input tensors (need to be initialized but
+    # no actual calculation needed)
+    output = torch.zeros(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+    output_lse = torch.zeros(
+        (NUM_TOKENS, NUM_HEADS), dtype=torch.float32, device="cuda"
+    )
+    prefix_output = torch.randn(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+    suffix_output = torch.randn(
+        (NUM_TOKENS, NUM_HEADS, HEAD_SIZE), dtype=output_dtype, device="cuda"
+    )
+
+    warmup_times = 2
+    repeat_times = 20
+
+    def perf_kernel_fn(
+        output_fn: torch.Tensor,
+        output_lse_fn: torch.Tensor,
+        kernel_fn: callable,
+        fn_type: str = "torch",
+    ):
+        # Avoid inplace inf -> -inf, we have to use prefix_lse
+        # and suffix_lse for other kernel.
+        if fn_type == "torch":
+            prefix_lse_ = prefix_lse.clone()
+            suffix_lse_ = suffix_lse.clone()
+        else:
+            prefix_lse_ = prefix_lse
+            suffix_lse_ = suffix_lse
+
+        if fn_type == "cuda_v1":
+            # merge_state v1 kernel not support float32
+            if output_dtype not in (torch.half, torch.bfloat16):
+                return 0, output_fn, output_lse_fn
+
+        total_time = 0
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+
+        try:
+            for _ in range(warmup_times):
+                output_fn, output_lse_fn = kernel_fn(
+                    prefix_output,
+                    prefix_lse_,
+                    suffix_output,
+                    suffix_lse_,
+                    output_fn,
+                    output_lse_fn,
+                )
+            torch.cuda.synchronize()
+
+            for _ in range(repeat_times):
+                start.record()
+                output_fn, output_lse_fn = kernel_fn(
+                    prefix_output,
+                    prefix_lse_,
+                    suffix_output,
+                    suffix_lse_,
+                    output_fn,
+                    output_lse_fn,
+                )
+                end.record()
+                torch.cuda.synchronize()
+                total_time += start.elapsed_time(end)
+
+            avg_time = total_time / repeat_times
+            return avg_time, output_fn, output_lse_fn
+        except Exception as e:
+            return 0, output_fn, output_lse_fn
+
+    # 0. Run the Torch kernel
+    output_torch = output.clone()
+    output_lse_torch = output_lse.clone()
+    time_torch, output_torch, output_lse_torch = perf_kernel_fn(
+        output_torch, output_lse_torch, merge_state_torch, fn_type="torch"
+    )
+
+    # 1. Run the Triton kernel
+    output_ref_triton = output.clone()
+    output_lse_ref_triton = output_lse.clone()
+    time_triton, output_ref_triton, output_lse_ref_triton = perf_kernel_fn(
+        output_ref_triton,
+        output_lse_ref_triton,
+        merge_state_triton,
+        fn_type="triton",
+    )
+
+    # 2. Run the merge_state V1 kernel
+    output_v1 = output.clone()
+    output_lse_v1 = output_lse.clone()
+    time_v1, output_v1, output_lse_v1 = perf_kernel_fn(
+        output_v1, output_lse_v1, merge_state, fn_type="cuda_v1"
+    )
+
+    # 3. Run the merge_state V2 kernel
+    output_v2 = output.clone()
+    output_lse_v2 = output_lse.clone()
+    time_v2, output_v2, output_lse_v2 = perf_kernel_fn(
+        output_v2, output_lse_v2, merge_state_v2, fn_type="cuda_v2"
+    )
+
+    # 4. Performance compare
+    improved = time_triton / time_v2
+    print(f"  Torch time: {time_torch:.6f}ms")
+    print(f" Triton time: {time_triton:.6f}ms")
+    print(f"CUDA v1 time: {time_v1:.6f}ms")
+    print(f"CUDA v2 time: {time_v2:.6f}ms, Performance: {improved:.5f}x")
+    print("-" * 100)
+
+    # 5. Correctness compare
+    # Liger Kernel: Efficient Triton Kernels for LLM Training
+    # https://arxiv.org/pdf/2410.10989, 3.3 Correctness
+    # use rtol = 1e-2 for bfloat16.
+    rtol = 1e-2 if output_dtype == torch.bfloat16 else 1e-3
+
+    def diff(a: torch.Tensor, b: torch.Tensor):
+        max_diff = torch.max(torch.abs(a.float() - b.float()))
+        return max_diff
+
+    # Use Triton output as reference because we want to replace
+    # the Triton kernel with custom CUDA kernel for merge attn
+    # states operation.
+    output_ref = output_ref_triton
+    output_lse_ref = output_lse_ref_triton
+    torch.testing.assert_close(
+        output_v2.float(), output_ref.float(), atol=1e-3, rtol=rtol
+    )
+    print("Output all match, max abs diff:")
+    print(f"(Triton  vs Torch) : {diff(output_torch, output_ref)}")
+    print(f"(CUDA v2 vs Torch) : {diff(output_torch, output_v2)}")
+    print(f"(CUDA v2 vs Triton): {diff(output_ref, output_v2)}")
+    print("-" * 100)
+
+    torch.testing.assert_close(
+        output_lse_v2.float(), output_lse_ref.float(), atol=1e-3, rtol=rtol
+    )
+    print("Output LSE all match, max abs diff:")
+    print(f"(Triton  vs Torch) : {diff(output_lse_torch, output_lse_ref)}")
+    print(f"(CUDA v2 vs Torch) : {diff(output_lse_torch, output_lse_v2)}")
+    print(f"(CUDA v2 vs Triton): {diff(output_lse_ref, output_lse_v2)}")
+    print("-" * 100)
+
+    print(
+        "All output values test passed! All inf values "
+        "are correctly replaced with -inf."
+    )
+    print("-" * 100)
+
+    device = torch.cuda.get_device_name()
+    all_case_info.append(
+        (
+            NUM_TOKENS,
+            NUM_HEADS,
+            HEAD_SIZE,
+            output_dtype,
+            device,
+            time_torch,
+            time_triton,
+            time_v1,
+            time_v2,
+        )
+    )
+    if len(all_case_info) == (
+        len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) * len(NUM_QUERY_HEADS) * len(DTYPES)
+    ):
+        generate_markdown_table()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_moe_align.py b/sgl-kernel/tests/test_moe_align.py
new file mode 100644
index 000000000..90f04ec95
--- /dev/null
+++ b/sgl-kernel/tests/test_moe_align.py
@@ -0,0 +1,250 @@
+import itertools
+
+import pytest
+import torch
+import triton
+import triton.language as tl
+from sgl_kernel import moe_align_block_size
+
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+@triton.jit
+def moe_align_block_size_stage1(
+    topk_ids_ptr,
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_idx = pid * tokens_per_thread
+    off_c = (pid + 1) * num_experts
+
+    for i in range(tokens_per_thread):
+        if start_idx + i < numel:
+            idx = tl.load(topk_ids_ptr + start_idx + i)
+            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
+            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
+
+
+@triton.jit
+def moe_align_block_size_stage2(
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    last_cnt = 0
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
+        last_cnt = last_cnt + token_cnt
+        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
+
+
+@triton.jit
+def moe_align_block_size_stage3(
+    total_tokens_post_pad_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+):
+    last_cumsum = 0
+    off_cnt = num_experts * num_experts
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
+        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
+        tl.store(cumsum_ptr + i, last_cumsum)
+    tl.store(total_tokens_post_pad_ptr, last_cumsum)
+
+
+@triton.jit
+def moe_align_block_size_stage4(
+    topk_ids_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_idx = tl.load(cumsum_ptr + pid)
+    end_idx = tl.load(cumsum_ptr + pid + 1)
+
+    for i in range(start_idx, end_idx, block_size):
+        tl.store(expert_ids_ptr + i // block_size, pid)
+
+    start_idx = pid * tokens_per_thread
+    off_t = pid * num_experts
+
+    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread, numel)):
+        expert_id = tl.load(topk_ids_ptr + i)
+        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
+        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
+        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
+        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
+
+
+def moe_align_block_size_triton(
+    topk_ids: torch.Tensor,
+    num_experts: int,
+    block_size: int,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+) -> None:
+    numel = topk_ids.numel()
+    grid = (num_experts,)
+    tokens_cnts = torch.zeros(
+        (num_experts + 1, num_experts), dtype=torch.int32, device=topk_ids.device
+    )
+    cumsum = torch.zeros((num_experts + 1,), dtype=torch.int32, device=topk_ids.device)
+    tokens_per_thread = ceil_div(numel, num_experts)
+
+    moe_align_block_size_stage1[grid](
+        topk_ids,
+        tokens_cnts,
+        num_experts,
+        numel,
+        tokens_per_thread,
+    )
+    moe_align_block_size_stage2[grid](
+        tokens_cnts,
+        num_experts,
+    )
+    moe_align_block_size_stage3[(1,)](
+        num_tokens_post_pad,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+    )
+    moe_align_block_size_stage4[grid](
+        topk_ids,
+        sorted_token_ids,
+        expert_ids,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+        numel,
+        tokens_per_thread,
+    )
+
+
+@pytest.mark.parametrize(
+    "block_size,num_tokens,topk,num_experts,pad_sorted_token_ids",
+    list(
+        itertools.product(
+            [32, 64, 128, 256],  # block_size
+            [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],  # num_tokens
+            [1, 2, 4, 8, 16, 32, 64],  # topk
+            [64, 160, 256, 257, 260, 264],  #  num_experts
+            [True, False],  # pad_sorted_token_ids
+        )
+    ),
+)
+def test_moe_align_block_size_compare_implementations(
+    block_size, num_tokens, topk, num_experts, pad_sorted_token_ids
+):
+
+    topk_ids = torch.argsort(torch.rand(num_tokens, num_experts, device="cuda"), dim=1)[
+        :, :topk
+    ]
+
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+
+    sorted_ids_cuda = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
+    if not pad_sorted_token_ids:
+        sorted_ids_cuda.fill_(topk_ids.numel())
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids_cuda = torch.zeros(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad_cuda = torch.empty(
+        (1), dtype=torch.int32, device=topk_ids.device
+    )
+    cumsum_buffer = torch.empty(
+        num_experts + 2, dtype=torch.int32, device=topk_ids.device
+    )
+
+    sorted_ids_triton = torch.empty_like(sorted_ids_cuda)
+    sorted_ids_triton.fill_(topk_ids.numel())
+    expert_ids_triton = torch.zeros_like(expert_ids_cuda)
+    num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda)
+
+    moe_align_block_size(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids_cuda,
+        expert_ids_cuda,
+        num_tokens_post_pad_cuda,
+        cumsum_buffer,
+        pad_sorted_token_ids,
+    )
+
+    moe_align_block_size_triton(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids_triton,
+        expert_ids_triton,
+        num_tokens_post_pad_triton,
+    )
+
+    assert torch.allclose(expert_ids_cuda, expert_ids_triton, atol=0, rtol=0), (
+        f"Expert IDs mismatch for block_size={block_size}, "
+        f"num_tokens={num_tokens}, topk={topk}\n"
+        f"CUDA expert_ids: {expert_ids_cuda}\n"
+        f"Triton expert_ids: {expert_ids_triton}"
+    )
+
+    assert torch.allclose(
+        num_tokens_post_pad_cuda, num_tokens_post_pad_triton, atol=0, rtol=0
+    ), (
+        f"Num tokens post pad mismatch for block_size={block_size}, "
+        f"num_tokens={num_tokens}, topk={topk}\n"
+        f"CUDA num_tokens_post_pad: {num_tokens_post_pad_cuda}\n"
+        f"Triton num_tokens_post_pad: {num_tokens_post_pad_triton}"
+    )
+
+    # Select an expert to check
+    expert_idx = expert_ids_cuda.max().item()
+
+    # Get the first and last block id where expert_ids_cuda == expert_idx
+    matching_indices = torch.where(expert_ids_cuda == expert_idx)[0]
+    block_sorted_start = matching_indices[0].item() * block_size
+    block_sorted_end = min(
+        (matching_indices[-1].item() + 1) * block_size, num_tokens_post_pad_cuda.item()
+    )
+
+    selected_sorted_ids_cuda = sorted_ids_cuda[
+        block_sorted_start:block_sorted_end
+    ].sort()[0]
+    selected_sorted_ids_triton = sorted_ids_triton[
+        block_sorted_start:block_sorted_end
+    ].sort()[0]
+
+    assert torch.allclose(
+        selected_sorted_ids_cuda,
+        selected_sorted_ids_triton,
+        atol=0,
+        rtol=0,
+    ), (
+        f"Sorted IDs mismatch for block_size={block_size}, "
+        f"num_tokens={num_tokens}, topk={topk}\n"
+        f"CUDA sorted_ids: {selected_sorted_ids_cuda}\n"
+        f"Triton sorted_ids: {selected_sorted_ids_triton}"
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_moe_fused_gate.py b/sgl-kernel/tests/test_moe_fused_gate.py
new file mode 100644
index 000000000..983895752
--- /dev/null
+++ b/sgl-kernel/tests/test_moe_fused_gate.py
@@ -0,0 +1,103 @@
+import pytest
+import torch
+from sgl_kernel import moe_fused_gate
+
+from sglang.srt.layers.moe.topk import biased_grouped_topk
+
+
+@pytest.mark.parametrize(
+    "seq_length",
+    list(range(1, 10))
+    + [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536],
+)
+@pytest.mark.parametrize(
+    "params",
+    [
+        (128, 4, 2, 4),
+        (256, 8, 4, 8),  # deepseek v3
+        (512, 16, 8, 16),
+    ],
+)
+@pytest.mark.parametrize("num_fused_shared_experts", [0, 1, 2])
+@pytest.mark.parametrize("apply_routed_scaling_factor_on_output", [False, True])
+def test_moe_fused_gate_combined(
+    seq_length, params, num_fused_shared_experts, apply_routed_scaling_factor_on_output
+):
+    num_experts, num_expert_group, topk_group, topk = params
+    dtype = torch.float32
+
+    torch.manual_seed(seq_length)
+    tensor = torch.rand((seq_length, num_experts), dtype=dtype, device="cuda")
+    scores = tensor.clone()
+    bias = torch.rand(num_experts, dtype=dtype, device="cuda")
+    topk = topk + num_fused_shared_experts
+
+    output, indices = moe_fused_gate(
+        tensor,
+        bias,
+        num_expert_group=num_expert_group,
+        topk_group=topk_group,
+        topk=topk,
+        num_fused_shared_experts=num_fused_shared_experts,
+        routed_scaling_factor=2.5,
+        apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+    )
+    ref_output, ref_indices = biased_grouped_topk(
+        scores,
+        scores,
+        bias,
+        topk=topk,
+        renormalize=True,
+        num_expert_group=num_expert_group,
+        topk_group=topk_group,
+        num_fused_shared_experts=num_fused_shared_experts,
+        routed_scaling_factor=2.5,
+        apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+    )
+
+    # When num_fused_shared_experts > 0, ignore the comparison of the last topk dimension
+    if num_fused_shared_experts > 0:
+        original_indices = indices.clone()
+        original_ref_indices = ref_indices.clone()
+
+        indices = indices[:, :-1]
+        ref_indices = ref_indices[:, :-1]
+
+        valid_min = num_experts
+        valid_max = num_experts + num_fused_shared_experts
+        shared_indices = original_indices[:, -1]
+        shared_ref_indices = original_ref_indices[:, -1]
+        if shared_indices is not None:
+            assert torch.all(
+                (shared_indices >= valid_min) & (shared_indices < valid_max)
+            ), f"Shared expert indices out of range: found values outside [{valid_min}, {valid_max})"
+        if shared_ref_indices is not None:
+            assert torch.all(
+                (shared_ref_indices >= valid_min) & (shared_ref_indices < valid_max)
+            ), f"Shared expert reference indices out of range: found values outside [{valid_min}, {valid_max})"
+
+    idx_check = torch.allclose(
+        ref_indices.sort()[0].to(torch.int32),
+        indices.sort()[0].to(torch.int32),
+        rtol=1e-04,
+        atol=1e-05,
+    )
+    output_check = torch.allclose(
+        ref_output.sort()[0].to(torch.float32),
+        output.sort()[0].to(torch.float32),
+        rtol=1e-02,
+        atol=1e-03,
+    )
+
+    assert idx_check, (
+        f"Indices mismatch at seq_length {seq_length}, dtype {dtype}, "
+        f"params {params}, num_fused_shared_experts {num_fused_shared_experts}"
+    )
+    assert output_check, (
+        f"Output mismatch at seq_length {seq_length}, dtype {dtype}, "
+        f"params {params}, num_fused_shared_experts {num_fused_shared_experts}"
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_moe_topk_softmax.py b/sgl-kernel/tests/test_moe_topk_softmax.py
new file mode 100644
index 000000000..b9a802c51
--- /dev/null
+++ b/sgl-kernel/tests/test_moe_topk_softmax.py
@@ -0,0 +1,139 @@
+import itertools
+
+import pytest
+import torch
+from sgl_kernel import topk_softmax
+
+
+@pytest.mark.parametrize(
+    "num_tokens, num_experts, topk",
+    list(
+        itertools.product(
+            [1, 16, 128, 512, 1024, 2048],  # num_tokens
+            [4, 8, 16, 32, 64, 128, 256],  # num_experts
+            [1, 2, 4],  # topk
+        )
+    ),
+)
+def test_topk_softmax(num_tokens, num_experts, topk):
+    gating_output = torch.randn(
+        (num_tokens, num_experts), dtype=torch.float32, device="cuda"
+    )
+
+    topk_weights = torch.empty((num_tokens, topk), dtype=torch.float32, device="cuda")
+    topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device="cuda")
+
+    topk_softmax(
+        topk_weights,
+        topk_indices,
+        gating_output,
+    )
+
+    # Native torch implementation
+    softmax_output = torch.softmax(gating_output, dim=-1)
+    topk_weights_ref, topk_indices_ref = torch.topk(softmax_output, topk, dim=-1)
+
+    # Verify the top-k weights and indices match the torch native ones
+    assert torch.allclose(
+        topk_weights_ref, topk_weights, atol=1e-3, rtol=1e-3
+    ), f"Weights mismatch: torch={topk_indices_ref} vs SGLang={topk_weights}"
+
+    assert torch.allclose(
+        topk_indices_ref.int(), topk_indices, atol=0, rtol=0
+    ), f"Indices mismatch: torch={topk_indices_ref}, SGLang={topk_indices}"
+
+
+@pytest.mark.parametrize(
+    "num_tokens, num_experts, topk, dtype",
+    list(
+        itertools.product(
+            [1, 16, 128, 512, 1024, 2048],  # num_tokens
+            [4, 8, 16, 32, 64, 128, 256],  # num_experts
+            [1, 2, 4],  # topk
+            [torch.float16, torch.bfloat16, torch.float32],  # dtype
+        )
+    ),
+)
+def test_topk_softmax_dtype_regression(num_tokens, num_experts, topk, dtype):
+    gating_output = torch.randn((num_tokens, num_experts), dtype=dtype, device="cuda")
+
+    topk_weights = torch.empty((num_tokens, topk), dtype=torch.float32, device="cuda")
+    topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device="cuda")
+
+    topk_softmax(
+        topk_weights,
+        topk_indices,
+        gating_output,
+    )
+
+    topk_weights_ref = torch.empty(
+        (num_tokens, topk), dtype=torch.float32, device="cuda"
+    )
+    topk_indices_ref = torch.empty((num_tokens, topk), dtype=torch.int32, device="cuda")
+
+    topk_softmax(
+        topk_weights_ref,
+        topk_indices_ref,
+        gating_output.float(),
+    )
+
+    assert torch.allclose(
+        topk_weights_ref, topk_weights, atol=1e-3, rtol=1e-3
+    ), f"Weights mismatch: SGLang old interface={topk_indices_ref} vs SGLang new interface={topk_weights}"
+
+    assert torch.allclose(
+        topk_indices_ref.int(), topk_indices, atol=0, rtol=0
+    ), f"Indices mismatch: SGLang old interface={topk_indices_ref}, SGLang new interface={topk_indices}"
+
+
+@pytest.mark.parametrize(
+    "num_tokens, num_experts, topk",
+    list(
+        itertools.product(
+            [1, 16, 128, 512, 1024, 2048],  # num_tokens
+            [4, 8, 16, 32, 64, 128, 256],  # num_experts
+            [1, 2, 4],  # topk
+        )
+    ),
+)
+def test_topk_softmax_renormalize(num_tokens, num_experts, topk):
+    gating_output = torch.randn(
+        (num_tokens, num_experts), dtype=torch.bfloat16, device="cuda"
+    )
+
+    topk_weights = torch.empty((num_tokens, topk), dtype=torch.float32, device="cuda")
+    topk_indices = torch.empty((num_tokens, topk), dtype=torch.int32, device="cuda")
+
+    topk_softmax(
+        topk_weights,
+        topk_indices,
+        gating_output,
+        renormalize=True,
+    )
+
+    topk_weights_ref = torch.empty(
+        (num_tokens, topk), dtype=torch.float32, device="cuda"
+    )
+    topk_indices_ref = torch.empty((num_tokens, topk), dtype=torch.int32, device="cuda")
+    token_expert_indices_ref = torch.empty(
+        (num_tokens, topk), dtype=torch.int32, device="cuda"
+    )
+
+    topk_softmax(
+        topk_weights_ref,
+        topk_indices_ref,
+        gating_output,
+    )
+    topk_weights_ref = topk_weights_ref / topk_weights_ref.sum(dim=-1, keepdim=True)
+
+    assert torch.allclose(
+        topk_weights_ref, topk_weights, atol=1e-3, rtol=1e-3
+    ), f"Weights mismatch: SGLang w/o fused renormalize={topk_indices_ref} vs SGLang w/ fused renormalize={topk_weights}"
+
+    assert torch.allclose(
+        topk_indices_ref.int(), topk_indices, atol=0, rtol=0
+    ), f"Indices mismatch: SGLang w/o fused renormalize={topk_indices_ref}, SGLang w/ fused renormalize={topk_indices}"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_mscclpp.py b/sgl-kernel/tests/test_mscclpp.py
new file mode 100644
index 000000000..0a4332bd3
--- /dev/null
+++ b/sgl-kernel/tests/test_mscclpp.py
@@ -0,0 +1,146 @@
+import multiprocessing as mp
+import os
+import socket
+import unittest
+from enum import IntEnum
+from typing import Any
+
+import sgl_kernel.allreduce as custom_ops
+import torch
+import torch.distributed as dist
+
+
+class MscclContextSelection(IntEnum):
+    MSCCL1SHOT1NODELL = 1
+    MSCCL1SHOT2NODELL = 2
+
+
+def _run_correctness_worker(world_size, rank, distributed_init_port, test_sizes):
+    device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
+    torch.cuda.set_device(device)
+    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+    dist.init_process_group(
+        backend="nccl",
+        init_method=distributed_init_method,
+        rank=rank,
+        world_size=world_size,
+    )
+    group = dist.group.WORLD
+    cpu_group = torch.distributed.new_group(list(range(world_size)), backend="gloo")
+    if rank == 0:
+        unique_id = [custom_ops.mscclpp_generate_unique_id()]
+    else:
+        unique_id = [None]
+    dist.broadcast_object_list(
+        unique_id, src=0, device=torch.device("cpu"), group=cpu_group
+    )
+    unique_id = unique_id[0]
+    rank_to_node, rank_to_ib = list(range(world_size)), list(range(world_size))
+    for r in range(world_size):
+        rank_to_node[r] = r // 8
+        rank_to_ib[r] = rank % 8
+    MAX_BYTES = 2**20
+    scratch = torch.empty(
+        MAX_BYTES * 8, dtype=torch.bfloat16, device=torch.cuda.current_device()
+    )
+    put_buffer = torch.empty(
+        MAX_BYTES, dtype=torch.bfloat16, device=torch.cuda.current_device()
+    )
+    print(f"[{rank}] start mscclpp_context init")
+    nranks_per_node = torch.cuda.device_count()
+    selection = int(MscclContextSelection.MSCCL1SHOT1NODELL)
+    mscclpp_context = custom_ops.mscclpp_init_context(
+        unique_id,
+        rank,
+        world_size,
+        scratch,
+        put_buffer,
+        nranks_per_node,
+        rank_to_node,
+        rank_to_ib,
+        selection,
+    )
+    try:
+        test_loop = 10
+        for sz in test_sizes:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                if sz * dtype.itemsize > MAX_BYTES:
+                    continue
+                if rank == 0:
+                    print(f"mscclpp allreduce test sz {sz}, dtype {dtype}")
+                for _ in range(test_loop):
+                    inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device)
+                    inp1_ref = inp1.clone()
+                    out1 = torch.empty_like(inp1)
+                    custom_ops.mscclpp_allreduce(
+                        mscclpp_context, inp1, out1, nthreads=512, nblocks=21
+                    )
+                    dist.all_reduce(inp1_ref, group=group)
+                    torch.testing.assert_close(out1, inp1_ref)
+    finally:
+        dist.barrier(group=group)
+        dist.destroy_process_group(group=group)
+
+
+def get_open_port() -> int:
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("127.0.0.1", 0))
+            return s.getsockname()[1]
+    except OSError:
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("::1", 0))
+            return s.getsockname()[1]
+
+
+def multi_process_parallel(
+    world_size: int, test_target: Any, target_args: tuple = ()
+) -> None:
+    mp.set_start_method("spawn", force=True)
+
+    procs = []
+    distributed_init_port = get_open_port()
+    for i in range(world_size):
+        proc_args = (world_size, i, distributed_init_port) + target_args
+        proc = mp.Process(target=test_target, args=proc_args, name=f"Worker-{i}")
+        proc.start()
+        procs.append(proc)
+
+    for i in range(world_size):
+        procs[i].join()
+        assert (
+            procs[i].exitcode == 0
+        ), f"Process {i} failed with exit code {procs[i].exitcode}"
+
+
+class TestMSCCLAllReduce(unittest.TestCase):
+    test_sizes = [
+        512,
+        2560,
+        4096,
+        5120,
+        7680,
+        32768,
+        262144,
+        524288,
+    ]
+    world_sizes = [8]
+
+    def test_correctness(self):
+        for world_size in self.world_sizes:
+            available_gpus = torch.cuda.device_count()
+            if world_size > available_gpus:
+                print(
+                    f"Skipping world_size={world_size}, found {available_gpus} and now ray is not supported here"
+                )
+                continue
+
+            print(f"Running test for world_size={world_size}")
+            multi_process_parallel(
+                world_size, _run_correctness_worker, target_args=(self.test_sizes,)
+            )
+            print(f"custom allreduce tp = {world_size}: OK")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/sgl-kernel/tests/test_norm.py b/sgl-kernel/tests/test_norm.py
new file mode 100644
index 000000000..d22da931f
--- /dev/null
+++ b/sgl-kernel/tests/test_norm.py
@@ -0,0 +1,133 @@
+# Adapted from https://github.com/flashinfer-ai/flashinfer/blob/4e8eb1879f9c3ba6d75511e5893183bf8f289a62/tests/test_norm.py
+
+import pytest
+import sgl_kernel
+import torch
+
+
+def llama_rms_norm(x, w, eps=1e-6):
+    orig_dtype = x.dtype
+    x = x.float()
+    variance = x.pow(2).mean(dim=-1, keepdim=True)
+    x = x * torch.rsqrt(variance + eps)
+    x = x * w.float()
+    x = x.to(orig_dtype)
+    return x
+
+
+def gemma_rms_norm(x, w, eps=1e-6):
+    orig_dtype = x.dtype
+    x = x.float()
+    variance = x.pow(2).mean(dim=-1, keepdim=True)
+    x = x * torch.rsqrt(variance + eps)
+    x = x * (1.0 + w.float())
+    x = x.to(orig_dtype)
+    return x
+
+
+def gemma_fused_add_rms_norm(x, residual, w, eps=1e-6):
+    orig_dtype = x.dtype
+    x = x + residual
+    residual = x
+    x = x.float()
+    variance = x.pow(2).mean(dim=-1, keepdim=True)
+    x = x * torch.rsqrt(variance + eps)
+    x = x * (1.0 + w.float())
+    x = x.to(orig_dtype)
+    return x, residual
+
+
+def fused_add_rms_norm(x, residual, weight, eps):
+    orig_dtype = x.dtype
+    x = x.to(torch.float32)
+    x = x + residual.to(torch.float32)
+    residual = x.to(orig_dtype)
+
+    variance = x.pow(2).mean(dim=-1, keepdim=True)
+    x = x * torch.rsqrt(variance + eps)
+    x = (x * weight.float()).to(orig_dtype)
+    return x, residual
+
+
+@pytest.mark.parametrize("batch_size", [1, 19, 99, 989])
+@pytest.mark.parametrize("hidden_size", [111, 500, 1024, 3072, 3584, 4096, 8192, 16384])
+@pytest.mark.parametrize("dtype", [torch.float16])
+@pytest.mark.parametrize("specify_out", [True, False])
+def test_norm(batch_size, hidden_size, dtype, specify_out):
+    x = torch.randn(batch_size, hidden_size).to(0).to(dtype)
+    w = torch.randn(hidden_size).to(0).to(dtype)
+
+    y_ref = llama_rms_norm(x, w)
+    if specify_out:
+        y = torch.empty_like(x)
+        sgl_kernel.rmsnorm(x, w, out=y)
+    else:
+        y = sgl_kernel.rmsnorm(x, w)
+
+    torch.testing.assert_close(y_ref, y, rtol=1e-3, atol=1e-3)
+
+
+@pytest.mark.parametrize("batch_size", [1, 19, 99, 989])
+@pytest.mark.parametrize("hidden_size", [111, 500, 1024, 3072, 3584, 4096, 8192, 16384])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.float32])
+def test_fused_add_rmsnorm(batch_size, hidden_size, dtype):
+    eps = 1e-6
+
+    x = torch.randn(batch_size, hidden_size, dtype=dtype, device="cuda")
+    residual = torch.randn_like(x)
+    weight = torch.randn(hidden_size, dtype=dtype, device="cuda")
+
+    x_native, residual_native = fused_add_rms_norm(
+        x.clone(), residual.clone(), weight, eps
+    )
+
+    x_fused = x.clone()
+    residual_fused = residual.clone()
+    sgl_kernel.fused_add_rmsnorm(x_fused, residual_fused, weight, eps)
+
+    torch.testing.assert_close(x_fused, x_native, rtol=1e-3, atol=1e-3)
+    torch.testing.assert_close(residual_fused, residual_native, rtol=1e-3, atol=1e-3)
+
+
+@pytest.mark.parametrize("batch_size", [1, 19, 99, 989])
+@pytest.mark.parametrize("hidden_size", [111, 500, 1024, 3072, 3584, 4096, 8192, 16384])
+@pytest.mark.parametrize("dtype", [torch.float16])
+@pytest.mark.parametrize("specify_out", [True, False])
+def test_gemma_norm(batch_size, hidden_size, dtype, specify_out):
+    x = torch.randn(batch_size, hidden_size).to(0).to(dtype)
+    w = torch.randn(hidden_size).to(0).to(dtype)
+
+    y_ref = gemma_rms_norm(x, w)
+    if specify_out:
+        y = torch.empty_like(x)
+        sgl_kernel.gemma_rmsnorm(x, w, out=y)
+    else:
+        y = sgl_kernel.gemma_rmsnorm(x, w)
+
+    torch.testing.assert_close(y_ref, y, rtol=1e-3, atol=1e-3)
+
+
+@pytest.mark.parametrize("batch_size", [1, 19, 99, 989])
+@pytest.mark.parametrize("hidden_size", [111, 500, 1024, 3072, 3584, 4096, 8192, 16384])
+@pytest.mark.parametrize("dtype", [torch.float16])
+def test_gemma_fused_add_rmsnorm(batch_size, hidden_size, dtype):
+    eps = 1e-6
+
+    x = torch.randn(batch_size, hidden_size, dtype=dtype, device="cuda")
+    residual = torch.randn_like(x)
+    weight = torch.randn(hidden_size, dtype=dtype, device="cuda")
+
+    x_native, residual_native = gemma_fused_add_rms_norm(
+        x.clone(), residual.clone(), weight, eps
+    )
+
+    x_fused = x.clone()
+    residual_fused = residual.clone()
+    sgl_kernel.gemma_fused_add_rmsnorm(x_fused, residual_fused, weight, eps)
+
+    torch.testing.assert_close(x_fused, x_native, rtol=1e-3, atol=1e-3)
+    torch.testing.assert_close(residual_fused, residual_native, rtol=1e-3, atol=1e-3)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_per_tensor_quant_fp8.py b/sgl-kernel/tests/test_per_tensor_quant_fp8.py
new file mode 100644
index 000000000..0840f298f
--- /dev/null
+++ b/sgl-kernel/tests/test_per_tensor_quant_fp8.py
@@ -0,0 +1,67 @@
+import itertools
+from typing import Optional, Tuple
+
+import pytest
+import torch
+from sgl_kernel import sgl_per_tensor_quant_fp8
+
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+
+
+def sglang_scaled_fp8_quant(
+    input: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    fp8_type_: torch.dtype = torch.float8_e4m3fn
+    output = torch.empty_like(input, device=input.device, dtype=fp8_type_)
+    is_static = True
+    if scale is None:
+        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+        is_static = False
+    sgl_per_tensor_quant_fp8(input, output, scale, is_static)
+
+    return output, scale
+
+
+def torch_scaled_fp8_quant(tensor, inv_scale):
+    # The reference implementation that fully aligns to
+    # the kernel being tested.
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    scale = inv_scale.reciprocal()
+    qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
+    qweight = qweight.to(torch.float8_e4m3fn)
+    return qweight
+
+
+@pytest.mark.parametrize(
+    "num_tokens,hidden_dim",
+    list(itertools.product([128, 256, 512], [512, 2048, 4096])),
+)
+def test_per_tensor_quant_compare_implementations(
+    num_tokens: int,
+    hidden_dim: int,
+):
+    device = torch.device("cuda")
+    x = torch.rand((num_tokens, hidden_dim), dtype=torch.float16, device=device)
+
+    sglang_out, sglang_scale = sglang_scaled_fp8_quant(x)
+    torch_out = torch_scaled_fp8_quant(x, sglang_scale)
+
+    torch.testing.assert_close(
+        sglang_out.float(), torch_out.float(), rtol=1e-3, atol=1e-3
+    )
+
+    scale = torch.rand(1, dtype=torch.float32, device=device)
+    sglang_out, sglang_scale = sglang_scaled_fp8_quant(x, scale)
+    torch_out = torch_scaled_fp8_quant(x, scale)
+
+    torch.testing.assert_close(
+        sglang_out.float(), torch_out.float(), rtol=1e-3, atol=1e-3
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_per_token_group_quant_8bit.py b/sgl-kernel/tests/test_per_token_group_quant_8bit.py
new file mode 100644
index 000000000..778d14d31
--- /dev/null
+++ b/sgl-kernel/tests/test_per_token_group_quant_8bit.py
@@ -0,0 +1,97 @@
+import itertools
+
+import pytest
+import torch
+
+from sglang.srt.layers.quantization import deep_gemm_wrapper
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_8bit as triton_per_token_group_quant_8bit,
+)
+from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_8bit
+from sglang.srt.layers.quantization.utils import assert_fp8_all_close
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+
+
+@pytest.mark.parametrize(
+    "num_tokens, hidden_dim, group_size, dst_dtype, flags",
+    list(
+        itertools.product(
+            [127, 128, 512, 1024, 4096, 8192],  # num_tokens
+            [256, 512, 1024, 2048, 4096],  # hidden_dim
+            [8, 16, 32, 64, 128],  # group_size
+            # TODO test int8
+            [fp8_type_],  # dtype
+            [
+                dict(
+                    column_major_scales=False,
+                    scale_tma_aligned=False,
+                    scale_ue8m0=False,
+                ),
+                dict(
+                    column_major_scales=True,
+                    scale_tma_aligned=False,
+                    scale_ue8m0=False,
+                ),
+                dict(
+                    column_major_scales=True,
+                    scale_tma_aligned=True,
+                    scale_ue8m0=False,
+                ),
+                dict(
+                    column_major_scales=True,
+                    scale_tma_aligned=True,
+                    scale_ue8m0=True,
+                ),
+            ],
+        )
+    ),
+)
+def test_per_token_group_quant_with_column_major(
+    num_tokens,
+    hidden_dim,
+    group_size,
+    dst_dtype,
+    flags,
+):
+    if flags["scale_ue8m0"] and ((group_size != 128) or (hidden_dim % 512 != 0)):
+        pytest.skip()
+        return
+    if flags["scale_ue8m0"] and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL:
+        pytest.skip("scale_ue8m0 only supported on Blackwell")
+        return
+
+    x = torch.randn(num_tokens, hidden_dim, device="cuda", dtype=torch.bfloat16)
+
+    execute_kwargs = dict(
+        x=x,
+        group_size=group_size,
+        eps=1e-10,
+        dst_dtype=dst_dtype,
+        **flags,
+    )
+
+    x_q_triton, x_s_triton = triton_per_token_group_quant_8bit(**execute_kwargs)
+    x_q_sglang, x_s_sglang = sglang_per_token_group_quant_8bit(**execute_kwargs)
+
+    # torch.set_printoptions(profile="full")
+    # print(f"{x_q_triton=}")
+    # print(f"{x_s_triton=}")
+    # print(f"{x_q_sglang=}")
+    # print(f"{x_s_sglang=}")
+    # torch.set_printoptions(profile="default")
+
+    assert_fp8_all_close(x_q_triton, x_q_sglang)
+    torch.testing.assert_close(
+        x_s_triton.contiguous(),
+        x_s_sglang.contiguous(),
+        rtol=1e-3,
+        atol=1e-5,
+        msg=lambda message: message + f" {x_s_triton=} {x_s_sglang=}",
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_per_token_quant_fp8.py b/sgl-kernel/tests/test_per_token_quant_fp8.py
new file mode 100644
index 000000000..40ec9d897
--- /dev/null
+++ b/sgl-kernel/tests/test_per_token_quant_fp8.py
@@ -0,0 +1,57 @@
+import itertools
+from typing import Optional, Tuple
+
+import pytest
+import torch
+from sgl_kernel import sgl_per_token_quant_fp8
+
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+
+
+def torch_per_token_quant_fp8(tensor, inv_scale):
+    # The reference implementation that fully aligns to
+    # the kernel being tested.
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    inv_scale = inv_scale.view(-1, 1)
+    scale = inv_scale.reciprocal()
+    qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
+    qweight = qweight.to(torch.float8_e4m3fn)
+    return qweight
+
+
+def sglang_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    scale = torch.zeros(input.size(0), device=input.device, dtype=torch.float32)
+    output = torch.empty_like(input, device=input.device, dtype=fp8_type_)
+
+    sgl_per_token_quant_fp8(input, output, scale)
+    scale = scale.reshape(-1, 1)
+
+    return output, scale
+
+
+@pytest.mark.parametrize(
+    "num_tokens,hidden_dim",
+    list(itertools.product([128, 256, 512], [512, 1368, 2048, 4096])),
+)
+def test_per_token_quant_compare_implementations(
+    num_tokens: int,
+    hidden_dim: int,
+):
+    device = torch.device("cuda")
+    x = torch.rand((num_tokens, hidden_dim), dtype=torch.float16, device=device)
+
+    sglang_out, sglang_scale = sglang_per_token_quant_fp8(x)
+    torch_out = torch_per_token_quant_fp8(x, sglang_scale)
+
+    torch.testing.assert_close(
+        sglang_out.float(), torch_out.float(), rtol=1e-3, atol=1e-3
+    )
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_qserve_w4a8_per_chn_gemm.py b/sgl-kernel/tests/test_qserve_w4a8_per_chn_gemm.py
new file mode 100644
index 000000000..9410710d7
--- /dev/null
+++ b/sgl-kernel/tests/test_qserve_w4a8_per_chn_gemm.py
@@ -0,0 +1,118 @@
+import pytest
+import torch
+from sgl_kernel import qserve_w4a8_per_chn_gemm
+
+
+# Adapted from https://github.com/mit-han-lab/omniserve/blob/main/omniserve/modeling/layers/quantized_linear/w4a8_linear.py
+def convert_to_qserve_format(qweight, scale, zero):
+    assert qweight.min() >= 0 and qweight.max() <= 15, "Quantized weight out of range"
+    in_features = qweight.shape[1]
+    out_features = qweight.shape[0]
+    assert in_features % 32 == 0, "Input features must be divisible by 32"
+    assert out_features % 32 == 0, "Output features must be divisible by 32"
+
+    # ---- Repack the weight ---- #
+    # pack to M // 32, K // 32, (8, 4), ([2], 2, 2, 4)
+    qweight_unpack_reorder = (
+        qweight.reshape(
+            out_features // 32,
+            2,
+            2,
+            8,
+            in_features // 32,
+            2,
+            4,
+            4,
+        )
+        .permute(0, 4, 3, 6, 1, 5, 2, 7)
+        .contiguous()
+    )
+    qweight_unpack_reorder = (
+        qweight_unpack_reorder.permute(0, 1, 2, 3, 5, 6, 7, 4)
+        .contiguous()
+        .to(torch.int8)
+    )
+    # B_fp16_reorder = B_fp16_reorder[:, :, :, :, :, :, [3, 2, 1, 0]].contiguous()
+    # [16, 0, 17, 1, ...]
+    qweight_unpack_repacked = (
+        qweight_unpack_reorder[..., 1] << 4
+    ) + qweight_unpack_reorder[..., 0]
+    qweight_unpack_repacked = qweight_unpack_repacked.reshape(
+        out_features // 32, in_features // 32, 32, 16
+    )
+    qweight_unpack_repacked = qweight_unpack_repacked.reshape(
+        out_features, in_features // 2
+    ).contiguous()
+
+    # ---- Pack the scales ---- #
+    scale = scale.reshape(out_features).to(torch.float16).contiguous()
+    szero = zero.reshape(out_features).to(torch.float16).contiguous() * scale
+
+    return qweight_unpack_repacked, scale, szero
+
+
+# INT4 Quantization
+def asym_quantize_tensor(tensor):
+    tensor_min = tensor.min(dim=-1, keepdim=True)[0]
+    tensor_max = tensor.max(dim=-1, keepdim=True)[0]
+    q_min = 0
+    q_max = 15
+    tensor_scale = (tensor_max - tensor_min) / (q_max - q_min)
+    tensor_zero = q_min - torch.round(tensor_min / tensor_scale)
+    tensor_q = torch.clamp(
+        torch.round(tensor / tensor_scale) + tensor_zero, q_min, q_max
+    ).to(torch.int8)
+    return tensor_q, tensor_scale.to(torch.float16), tensor_zero.to(torch.int8)
+
+
+# INT8 Quantization
+def sym_quantize_tensor(tensor):
+    tensor_scale = tensor.abs().max(dim=-1, keepdim=True)[0] / 127
+    tensor_q = torch.clamp(torch.round(tensor / tensor_scale), -128, 127).to(torch.int8)
+    return tensor_q, tensor_scale.to(torch.float16)
+
+
+def torch_w4a8_per_chn_gemm(a, b, a_scale, b_scale, b_zero, out_dtype):
+    print(a.shape)
+    print(b.shape)
+    print(b_zero.shape)
+    o = torch.matmul(
+        a.to(torch.float16), (b.to(torch.float16) - b_zero.to(torch.float16)).t()
+    )
+    o = o * a_scale.view(-1, 1) * b_scale.view(1, -1)
+    return o.to(out_dtype)
+
+
+def _test_accuracy_once(M, N, K, out_dtype, device):
+    # to avoid overflow, multiply 0.01
+    a = torch.randn((M, K), device=device, dtype=torch.float32) * 0.01
+    b = torch.randn((N, K), device=device, dtype=torch.float32) * 0.01
+
+    # symmetric quantize a
+    a_q, a_scale = sym_quantize_tensor(a)
+    # asymmetric quantize b
+    b_q, b_scale, b_zero = asym_quantize_tensor(b)
+    # convert to qserve format
+    b_q_format, b_scale_format, b_szero_format = convert_to_qserve_format(
+        b_q, b_scale, b_zero
+    )
+
+    # cal sum of every row of a
+    a_sum = a.sum(dim=-1, keepdim=True).to(torch.float16)
+    out = qserve_w4a8_per_chn_gemm(
+        a_q, b_q_format, b_scale_format, a_scale, b_szero_format, a_sum
+    )
+    ref_out = torch_w4a8_per_chn_gemm(a_q, b_q, a_scale, b_scale, b_zero, out_dtype)
+    torch.testing.assert_close(out, ref_out, rtol=1e-3, atol=1e-2)
+
+
+@pytest.mark.parametrize("M", [1, 16, 32, 64, 128, 512, 1024, 4096, 8192])
+@pytest.mark.parametrize("N", [128, 512, 1024, 4096, 8192, 16384])
+@pytest.mark.parametrize("K", [512, 1024, 4096, 8192, 16384])
+@pytest.mark.parametrize("out_dtype", [torch.float16])
+def test_accuracy(M, N, K, out_dtype):
+    _test_accuracy_once(M, N, K, out_dtype, "cuda")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_qserve_w4a8_per_group_gemm.py b/sgl-kernel/tests/test_qserve_w4a8_per_group_gemm.py
new file mode 100644
index 000000000..fc26a2e60
--- /dev/null
+++ b/sgl-kernel/tests/test_qserve_w4a8_per_group_gemm.py
@@ -0,0 +1,183 @@
+import pytest
+import torch
+from sgl_kernel import qserve_w4a8_per_group_gemm
+
+
+# Adapted from https://github.com/mit-han-lab/omniserve/blob/main/omniserve/modeling/layers/quantized_linear/w4a8_linear.py
+def convert_to_qserve_format(qweight, chn_scale, scale_i8, zero_i8, group_size):
+    assert qweight.min() >= 0 and qweight.max() <= 15, "Quantized weight out of range"
+    in_features = qweight.shape[1]
+    out_features = qweight.shape[0]
+    assert in_features % 32 == 0, "Input features must be divisible by 32"
+    assert out_features % 32 == 0, "Output features must be divisible by 32"
+    assert group_size == 128, "Group size must be 128"
+    assert (
+        in_features % group_size == 0
+    ), "Input features must be divisible by group_size"
+
+    # ---- Repack the weight ---- #
+    # pack to M // 32, K // 32, (8, 4), ([2], 2, 2, 4)
+    qweight_unpack_reorder = (
+        qweight.reshape(
+            out_features // 32,
+            2,
+            2,
+            8,
+            in_features // 32,
+            2,
+            4,
+            4,
+        )
+        .permute(0, 4, 3, 6, 1, 5, 2, 7)
+        .contiguous()
+    )
+    qweight_unpack_reorder = (
+        qweight_unpack_reorder.permute(0, 1, 2, 3, 5, 6, 7, 4)
+        .contiguous()
+        .to(torch.int8)
+    )
+    # B_fp16_reorder = B_fp16_reorder[:, :, :, :, :, :, [3, 2, 1, 0]].contiguous()
+    # [16, 0, 17, 1, ...]
+    qweigth_unpack_repacked = (
+        qweight_unpack_reorder[..., 1] << 4
+    ) + qweight_unpack_reorder[..., 0]
+    qweigth_unpack_repacked = qweigth_unpack_repacked.reshape(
+        out_features // 32, in_features // 32, 32, 16
+    )
+    qweigth_unpack_repacked = qweigth_unpack_repacked.reshape(
+        out_features, in_features // 2
+    )
+
+    # ---- Pack the scales ---- #
+    chn_scale = chn_scale.reshape(out_features)
+
+    scale_i8 = (
+        scale_i8.reshape(out_features, in_features // group_size)
+        .transpose(0, 1)
+        .contiguous()
+    )
+    scale_i8 = scale_i8.reshape(in_features // group_size, out_features // 32, 32)
+    scale_i8 = (
+        scale_i8.reshape(in_features // group_size, out_features // 32, 4, 8)
+        .transpose(-2, -1)
+        .contiguous()
+    )
+    scale_i8 = scale_i8.reshape(in_features // group_size, out_features).contiguous()
+
+    # ---- Pack the zeros ---- #
+    zero_i8 = -zero_i8
+    # zero_i8 = zero_i8.int()  # convert to 2-complement
+
+    zero_i8 = (
+        zero_i8.reshape(out_features, in_features // group_size)
+        .transpose(0, 1)
+        .contiguous()
+    )
+    zero_i8 = zero_i8.reshape(in_features // group_size, out_features // 32, 32)
+    # for the last dimension, organize as 0, 8, 16, 24, 1, 9, 17, 25, ... following the requirement of tensor core gemm
+    zero_i8 = (
+        zero_i8.reshape(in_features // group_size, out_features // 32, 4, 8)
+        .transpose(-2, -1)
+        .contiguous()
+    )
+    zero_i8 = (
+        zero_i8.reshape(in_features // group_size, out_features).contiguous() * scale_i8
+    )
+
+    return qweigth_unpack_repacked, chn_scale, scale_i8, zero_i8
+
+
+# Progressive Group INT4 Quantization
+def progressive_group_quantize_tensor(tensor, group_size):
+    assert group_size == 128, "Group size must be 128"
+    assert (
+        tensor.shape[-1] % group_size == 0
+    ), "Input features must be divisible by group_size"
+    # Channel scale
+    # NOTE(HandH1998): use protective quantization range
+    chn_scale = tensor.abs().max(dim=-1, keepdim=True)[0] / 119
+    tensor_i8 = torch.clamp(torch.round(tensor / chn_scale), -119, 119)
+
+    # Group scale
+    tensor_i8 = tensor_i8.reshape(-1, group_size)
+    tensor_i8_min = tensor_i8.min(dim=-1, keepdim=True)[0]
+    tensor_i8_max = tensor_i8.max(dim=-1, keepdim=True)[0]
+    q_min = 0
+    q_max = 15
+    scale_i8 = torch.round((tensor_i8_max - tensor_i8_min) / (q_max - q_min))
+    zero_i8 = q_min - torch.round(tensor_i8_min / scale_i8)
+    tensor_q = (
+        torch.clamp(torch.round(tensor_i8 / scale_i8) + zero_i8, q_min, q_max)
+        .reshape(tensor.shape[0], -1)
+        .to(torch.int8)
+    )
+    return (
+        tensor_q,
+        chn_scale.to(torch.float16),
+        scale_i8.reshape(tensor.shape[0], -1).to(torch.int8),
+        zero_i8.reshape(tensor.shape[0], -1).to(torch.int8),
+    )
+
+
+# INT8 Quantization
+def sym_quantize_tensor(tensor):
+    tensor_scale = tensor.abs().max(dim=-1, keepdim=True)[0] / 127
+    tensor_q = torch.clamp(torch.round(tensor / tensor_scale), -128, 127).to(torch.int8)
+    return tensor_q, tensor_scale.to(torch.float16)
+
+
+def torch_w4a8_per_group_gemm(
+    a, b, a_scale, b_chn_scale, b_scale_i8, b_zero_i8, group_size, out_dtype
+):
+    assert group_size == 128, "Group size must be 128"
+    b_dq = (
+        b.reshape(-1, group_size).to(torch.float32)
+        - b_zero_i8.reshape(-1, 1).to(torch.float32)
+    ) * b_scale_i8.reshape(-1, 1).to(torch.float32)
+    b_dq = b_dq.reshape(b.shape[0], b.shape[1])
+    o = torch.matmul(a.to(torch.float32), b_dq.t())
+    o = o * a_scale.view(-1, 1) * b_chn_scale.view(1, -1)
+    return o.to(out_dtype)
+
+
+def _test_accuracy_once(M, N, K, group_size, out_dtype, device):
+    # to avoid overflow, multiply 0.01
+    a = torch.randn((M, K), device=device, dtype=torch.float32) * 0.01
+    b = torch.randn((N, K), device=device, dtype=torch.float32) * 0.01
+
+    # symmetric quantize a
+    a_q, a_scale = sym_quantize_tensor(a)
+    # asymmetric quantize b
+    b_q, b_chn_scale, b_scale_i8, b_zero_i8 = progressive_group_quantize_tensor(
+        b, group_size
+    )
+    # convert to qserve format
+    b_q_format, b_chn_scale_format, b_scale_i8_format, b_zero_i8_format = (
+        convert_to_qserve_format(b_q, b_chn_scale, b_scale_i8, b_zero_i8, group_size)
+    )
+
+    out = qserve_w4a8_per_group_gemm(
+        a_q,
+        b_q_format,
+        b_zero_i8_format,
+        b_scale_i8_format,
+        b_chn_scale_format,
+        a_scale,
+    )
+    ref_out = torch_w4a8_per_group_gemm(
+        a_q, b_q, a_scale, b_chn_scale, b_scale_i8, b_zero_i8, group_size, out_dtype
+    )
+    torch.testing.assert_close(out, ref_out, rtol=1e-3, atol=1e-5)
+
+
+@pytest.mark.parametrize("M", [1, 16, 32, 64, 128, 512, 1024, 4096, 8192])
+@pytest.mark.parametrize("N", [128, 512, 1024, 4096, 8192, 16384])
+@pytest.mark.parametrize("K", [512, 1024, 4096, 8192, 16384])
+@pytest.mark.parametrize("group_size", [128])
+@pytest.mark.parametrize("out_dtype", [torch.float16])
+def test_accuracy(M, N, K, group_size, out_dtype):
+    _test_accuracy_once(M, N, K, group_size, out_dtype, "cuda")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_rotary_embedding.py b/sgl-kernel/tests/test_rotary_embedding.py
new file mode 100644
index 000000000..d9f9364b0
--- /dev/null
+++ b/sgl-kernel/tests/test_rotary_embedding.py
@@ -0,0 +1,138 @@
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import pytest
+import torch
+from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
+from sgl_kernel.testing.rotary_embedding import (
+    FlashInferRotaryEmbedding,
+    MHATokenToKVPool,
+    RotaryEmbedding,
+    create_inputs,
+)
+
+
+@pytest.mark.parametrize(
+    "head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype, device, batch_size, seq_len, num_q_heads, num_kv_heads, save_kv_cache",
+    [
+        # GPT-OSS cases
+        *[
+            (
+                64,
+                64,
+                4096,
+                8000,
+                True,
+                torch.bfloat16,
+                "cuda",
+                batch_size,
+                seq_len,
+                64,
+                8,
+                save_kv_cache,
+            )
+            for batch_size, seq_len in (
+                (1, 1),
+                (32, 1),
+                (128, 1),
+                (512, 1),
+                (2, 512),
+                (4, 4096),
+            )
+            for save_kv_cache in (False, True)
+        ],
+        # Other cases
+        (64, 64, 32, 8000, True, torch.bfloat16, "cuda", 32, 32, 1, 1, False),
+        (256, 128, 4096, 10000, True, torch.bfloat16, "cuda", 2, 512, 4, 2, False),
+        (512, 128, 311, 10000, True, torch.bfloat16, "cuda", 3, 39, 4, 2, False),
+        (128, 128, 2048, 10000, False, torch.bfloat16, "cuda", 2, 512, 32, 8, False),
+        (128, 128, 2048, 10000, False, torch.bfloat16, "cuda", 2, 512, 16, 4, False),
+        (512, 128, 311, 10000, False, torch.bfloat16, "cuda", 3, 39, 4, 2, False),
+    ],
+)
+def test_correctness(
+    head_size: int,
+    rotary_dim: int,
+    max_position_embeddings: int,
+    base: int,
+    is_neox_style: bool,
+    dtype: torch.dtype,
+    device: str,
+    batch_size: int,
+    seq_len: int,
+    num_q_heads: int,
+    num_kv_heads: int,
+    save_kv_cache: bool,
+):
+    config = dict(
+        head_size=head_size,
+        rotary_dim=rotary_dim,
+        max_position_embeddings=max_position_embeddings,
+        base=base,
+        is_neox_style=is_neox_style,
+        dtype=dtype,
+    )
+
+    rope_ref = RotaryEmbedding(**config).to(device)
+    rope_flashinfer = FlashInferRotaryEmbedding(**config).to(device)
+
+    inputs = create_inputs(
+        head_size=head_size,
+        batch_size=batch_size,
+        seq_len=seq_len,
+        device=device,
+        dtype=dtype,
+        num_q_heads=num_q_heads,
+        num_kv_heads=num_kv_heads,
+    )
+
+    if save_kv_cache:
+        pool_ref = MHATokenToKVPool(head_num=num_kv_heads, head_dim=head_size)
+        pool_flashinfer = MHATokenToKVPool(head_num=num_kv_heads, head_dim=head_size)
+
+    query_ref, key_ref = inputs["query"].clone(), inputs["key"].clone()
+    query_flashinfer, key_flashinfer = inputs["query"].clone(), inputs["key"].clone()
+
+    query_ref_out, key_ref_out = rope_ref.forward_native(
+        inputs["pos_ids"], query_ref, key_ref
+    )
+    if save_kv_cache:
+        pool_ref.set_kv_buffer(
+            loc=inputs["out_cache_loc"],
+            cache_k=key_ref_out.view(-1, num_kv_heads, head_size),
+            cache_v=inputs["value"].view(-1, num_kv_heads, head_size),
+        )
+
+    query_flashinfer_out, key_flashinfer_out = rope_flashinfer.forward_cuda(
+        inputs["pos_ids"],
+        query_flashinfer,
+        key_flashinfer,
+        fused_set_kv_buffer_arg=(
+            FusedSetKVBufferArg(
+                value=inputs["value"],
+                k_buffer=pool_flashinfer.k_buffer[0].view(-1, num_kv_heads * head_size),
+                v_buffer=pool_flashinfer.v_buffer[0].view(-1, num_kv_heads * head_size),
+                k_scale=None,
+                v_scale=None,
+                cache_loc=inputs["out_cache_loc"],
+            )
+            if save_kv_cache
+            else None
+        ),
+    )
+
+    torch.testing.assert_close(
+        query_ref_out, query_flashinfer_out, atol=1e-2, rtol=1e-2
+    )
+    torch.testing.assert_close(key_ref_out, key_flashinfer_out, atol=1e-2, rtol=1e-2)
+    if save_kv_cache:
+        for field in ["k_buffer", "v_buffer"]:
+            x_ref = getattr(pool_ref, field)[0]
+            x_flashinfer = getattr(pool_flashinfer, field)[0]
+            torch.testing.assert_close(x_ref, x_flashinfer, atol=1e-2, rtol=1e-2)
+            nonzero_ref = x_ref != 0
+            nonzero_flashinfer = x_ref != 0
+            assert torch.all(nonzero_ref == nonzero_flashinfer)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_sampling.py b/sgl-kernel/tests/test_sampling.py
new file mode 100644
index 000000000..dc5734cb7
--- /dev/null
+++ b/sgl-kernel/tests/test_sampling.py
@@ -0,0 +1,185 @@
+# Adapted from https://github.com/flashinfer-ai/flashinfer/blob/93e1a2634e22355b0856246b032b285ad1d1da6b/tests/test_sampling.py
+
+import pytest
+import sgl_kernel
+import torch
+
+
+@pytest.mark.parametrize("batch_size", [1, 99, 989])
+@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
+@pytest.mark.parametrize("k", [100])
+@pytest.mark.parametrize("p", [0.1, 0.5])
+def test_top_k_top_p_sampling_from_probs_logits_top_k_first_alignment(
+    batch_size, vocab_size, k, p
+):
+    torch.manual_seed(42)
+    logits = torch.randn(batch_size, vocab_size, device="cuda:0") * 5
+    generator_logits = torch.Generator("cuda:0")
+    generator_probs = generator_logits.clone_state()
+    samples = sgl_kernel.sampling.top_k_top_p_sampling_from_logits(
+        logits, k, p, filter_apply_order="top_k_first", generator=generator_logits
+    )
+    samples_ref = sgl_kernel.sampling.top_k_top_p_sampling_from_probs(
+        torch.softmax(logits, dim=-1),
+        k,
+        p,
+        filter_apply_order="top_k_first",
+        generator=generator_probs,
+    )
+    assert torch.all(samples == samples_ref)
+
+
+@pytest.mark.parametrize("batch_size", [1, 99, 989])
+@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
+@pytest.mark.parametrize("k", [100])
+@pytest.mark.parametrize("p", [0.1, 0.5])
+def test_top_k_top_p_sampling_from_probs_logits_joint_alignment(
+    batch_size, vocab_size, k, p
+):
+    torch.manual_seed(42)
+    logits = torch.randn(batch_size, vocab_size, device="cuda:0") * 5
+    generator_logits = torch.Generator("cuda:0")
+    generator_probs = generator_logits.clone_state()
+    samples = sgl_kernel.sampling.top_k_top_p_sampling_from_logits(
+        logits, k, p, filter_apply_order="joint", generator=generator_logits
+    )
+    samples_ref = sgl_kernel.sampling.top_k_top_p_sampling_from_probs(
+        torch.softmax(logits, dim=-1),
+        k,
+        p,
+        filter_apply_order="joint",
+        generator=generator_probs,
+    )
+    assert torch.all(samples == samples_ref)
+
+
+@pytest.mark.parametrize("batch_size", [1, 99, 989])
+@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
+@pytest.mark.parametrize("p", [0.1, 0.5])
+def test_top_k_top_p_joint_sampling_from_probs(batch_size, vocab_size, p):
+    torch.manual_seed(42)
+    if p == 0.1:
+        k = int(vocab_size * 0.5)
+    elif p == 0.5:
+        k = int(vocab_size * 0.1)
+    else:
+        raise ValueError("p not recognized")
+    eps = 1e-4
+    pre_norm_prob = torch.rand(batch_size, vocab_size, device="cuda:0")
+    normalized_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
+    # top-p mask
+    sorted_prob, indices = torch.sort(normalized_prob, descending=False)
+    cdf = torch.cumsum(sorted_prob, dim=-1)
+    mask_top_p = torch.zeros(batch_size, vocab_size, dtype=torch.int32, device="cuda:0")
+    mask_top_p.scatter_add_(1, indices, (cdf > (1 - p) - eps).int())
+    # top-k mask
+    sorted_prob, _ = torch.sort(normalized_prob, descending=True)
+    pivot = sorted_prob[:, k - 1]
+    mask_top_k = (normalized_prob >= pivot.unsqueeze(-1)).int()
+    # overall mask
+    mask = torch.minimum(mask_top_p, mask_top_k)
+    top_p_tensor = torch.full((batch_size,), p, device="cuda:0")
+    top_k_tensor = torch.full((batch_size,), k, device="cuda:0")
+
+    num_trails = 1000
+    for _ in range(num_trails):
+        samples = sgl_kernel.top_k_top_p_sampling_from_probs(
+            normalized_prob,
+            top_k_tensor,
+            top_p_tensor,
+            filter_apply_order="joint",
+        )
+        assert torch.all(samples < vocab_size) and torch.all(samples >= 0)
+        assert torch.all(mask[torch.arange(batch_size), samples] == 1), normalized_prob[
+            torch.arange(batch_size), samples
+        ]
+
+
+@pytest.mark.parametrize("batch_size", [1, 99, 989])
+@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
+@pytest.mark.parametrize("p", [0.1, 0.5, 0.9])
+def test_top_p_renorm_probs(batch_size, vocab_size, p):
+    torch.manual_seed(42)
+    pre_norm_prob = torch.rand(batch_size, vocab_size, device="cuda:0")
+    normalized_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
+    sorted_prob, indices = torch.sort(normalized_prob, descending=False)
+    cdf = torch.cumsum(sorted_prob, dim=-1)
+    mask = torch.zeros(batch_size, vocab_size, dtype=torch.int32, device="cuda:0")
+    mask.scatter_add_(1, indices, (cdf >= (1 - p)).int())
+    renorm_prob_ground_truth = normalized_prob.clone()
+    renorm_prob_ground_truth[mask == 0] = 0
+    renorm_prob_ground_truth = renorm_prob_ground_truth / renorm_prob_ground_truth.sum(
+        dim=-1, keepdim=True
+    )
+
+    renorm_prob = sgl_kernel.top_p_renorm_prob(normalized_prob, p)
+    torch.testing.assert_close(
+        renorm_prob_ground_truth,
+        renorm_prob,
+        rtol=1e-3,
+        atol=1e-3,
+    )
+
+
+@pytest.mark.parametrize("batch_size", [1, 99, 989])
+@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
+@pytest.mark.parametrize("k", [10, 100, 500])
+def test_top_k_renorm_probs(batch_size, vocab_size, k):
+    if k > vocab_size:
+        pytest.skip("k should be less than vocab_size")
+    torch.manual_seed(42)
+    pre_norm_prob = torch.rand(batch_size, vocab_size, device="cuda:0")
+    normalized_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
+    sorted_prob, _ = torch.sort(normalized_prob, descending=True)
+    pivot = sorted_prob[:, k - 1]
+    mask = (normalized_prob >= pivot.unsqueeze(-1)).int()
+    renorm_prob_ground_truth = normalized_prob.clone()
+    renorm_prob_ground_truth[mask == 0] = 0
+    renorm_prob_ground_truth = renorm_prob_ground_truth / renorm_prob_ground_truth.sum(
+        dim=-1, keepdim=True
+    )
+
+    renorm_prob = sgl_kernel.top_k_renorm_prob(normalized_prob, k)
+    for i in range(batch_size):
+        torch.testing.assert_close(
+            renorm_prob_ground_truth[i],
+            renorm_prob[i],
+            rtol=1e-3,
+            atol=1e-3,
+        )
+
+
+@pytest.mark.parametrize("batch_size", [1, 99, 989])
+@pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
+@pytest.mark.parametrize("p", [0.05, 0.1, 0.2, 0.7, 1])
+def test_min_p_sampling(batch_size, vocab_size, p):
+    torch.manual_seed(42)
+    pre_norm_prob = torch.rand(batch_size, vocab_size, device="cuda:0")
+    normalized_prob = pre_norm_prob / pre_norm_prob.sum(dim=-1, keepdim=True)
+    sorted_prob, indices = torch.sort(normalized_prob, descending=False)
+    # scale min-p
+    top_probs = sorted_prob[:, -1].unsqueeze(-1)
+    scaled_p = p * top_probs
+    # min-p mask
+    mask = torch.zeros(batch_size, vocab_size, dtype=torch.int32, device="cuda:0")
+    mask.scatter_add_(1, indices, (sorted_prob >= scaled_p).int())
+    min_p_tensor = torch.full((batch_size,), p, device="cuda:0")
+
+    num_trails = 1000
+    for _ in range(num_trails):
+        samples = sgl_kernel.min_p_sampling_from_probs(
+            normalized_prob,
+            min_p_tensor,
+        )
+
+        assert torch.all(mask[torch.arange(batch_size), samples] == 1), samples[
+            torch.nonzero(mask[torch.arange(batch_size), samples] == 0)
+        ]
+
+        assert torch.all(mask[torch.arange(batch_size), samples] == 1), samples[
+            torch.nonzero(mask[torch.arange(batch_size), samples] == 0)
+        ]
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/test_sparse_flash_attn.py b/sgl-kernel/tests/test_sparse_flash_attn.py
new file mode 100644
index 000000000..28c64cb61
--- /dev/null
+++ b/sgl-kernel/tests/test_sparse_flash_attn.py
@@ -0,0 +1,492 @@
+import math
+from typing import List, Optional, Tuple
+
+import pytest
+import torch
+from einops import rearrange, repeat
+from sgl_kernel.sparse_flash_attn import (
+    convert_vertical_slash_indexes,
+    convert_vertical_slash_indexes_mergehead,
+    sparse_attn_func,
+)
+from test_flash_attention import construct_local_mask, is_fa3_supported
+
+
+def ref_attn(
+    q,
+    k,
+    v,
+    query_padding_mask=None,
+    key_padding_mask=None,
+    attn_bias=None,
+    dropout_p=0.0,
+    dropout_mask=None,
+    causal=False,
+    window_size=(-1, -1),  # -1 means infinite window size
+    softcap=0.0,
+    upcast=True,
+    reorder_ops=False,
+    key_leftpad=None,
+):
+    """
+    Arguments:
+        q: (batch_size, seqlen_q, nheads, head_dim)
+        k: (batch_size, seqlen_k, nheads_k, head_dim)
+        v: (batch_size, seqlen_k, nheads_k, head_dim)
+        query_padding_mask: (batch_size, seqlen_q)
+        key_padding_mask: (batch_size, seqlen_k)
+        attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k)
+        dropout_p: float
+        dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k)
+        causal: whether to apply causal masking
+        window_size: (int, int), left and right window size
+        upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast
+            output back to fp16/bf16.
+        reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.)
+            without changing the math. This is to estimate the numerical error from operation
+            reordering.
+    Output:
+        output: (batch_size, seqlen_q, nheads, head_dim)
+        lse: (batch_size, nheads, seqlen_q)
+    """
+    if causal:
+        window_size = (window_size[0], 0)
+    dtype_og = q.dtype
+    if upcast:
+        q, k, v = q.float(), k.float(), v.float()
+    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
+    k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2])
+    v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2])
+    d = q.shape[-1]
+    if not reorder_ops:
+        scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k)
+    else:
+        scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d))
+
+    lse_ref = scores.logsumexp(dim=-1)
+
+    if softcap > 0:
+        scores = scores / softcap
+        scores = scores.tanh()
+        scores = scores * softcap
+    if key_padding_mask is not None:
+        scores.masked_fill_(
+            rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")
+        )
+    if window_size[0] >= 0 or window_size[1] >= 0:
+        local_mask = construct_local_mask(
+            seqlen_q,
+            seqlen_k,
+            window_size,
+            query_padding_mask,
+            key_padding_mask,
+            q.device,
+            key_leftpad=key_leftpad,
+        )
+        scores.masked_fill_(local_mask, float("-inf"))
+    if attn_bias is not None:
+        scores = scores + attn_bias
+    attention = torch.softmax(scores, dim=-1).to(v.dtype)
+    # Some rows might be completely masked out so we fill them with zero instead of NaN
+    if window_size[0] >= 0 or window_size[1] >= 0:
+        attention = attention.masked_fill(
+            torch.all(local_mask, dim=-1, keepdim=True), 0.0
+        )
+    # We want to mask here so that the attention matrix doesn't have any NaNs
+    # Otherwise we'll get NaN in dV
+    if query_padding_mask is not None:
+        attention = attention.masked_fill(
+            rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0
+        )
+    dropout_scaling = 1.0 / (1 - dropout_p)
+    # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling
+    # output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
+    if dropout_mask is not None:
+        attention_drop = attention.masked_fill(~dropout_mask, 0.0)
+    else:
+        attention_drop = attention
+    output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling)
+    if query_padding_mask is not None:
+        output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0)
+
+    return output.to(dtype=dtype_og), lse_ref
+
+
+def ref_paged_attn(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    query_lens: List[int],
+    kv_lens: List[int],
+    block_tables: torch.Tensor,
+    scale: float,
+    sliding_window: Optional[int] = None,
+    soft_cap: Optional[float] = None,
+) -> torch.Tensor:
+    num_seqs = len(query_lens)
+    block_tables = block_tables.cpu().numpy()
+    _, block_size, num_kv_heads, head_size = key_cache.shape
+
+    outputs: List[torch.Tensor] = []
+    start_idx = 0
+    for i in range(num_seqs):
+        query_len = query_lens[i]
+        kv_len = kv_lens[i]
+        # clone to avoid clobbering the query tensor
+        q = query[start_idx : start_idx + query_len].clone()
+        q *= scale
+
+        num_kv_blocks = (kv_len + block_size - 1) // block_size
+        block_indices = block_tables[i, :num_kv_blocks]
+
+        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
+        k = k[:kv_len]
+        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
+        v = v[:kv_len]
+
+        if q.shape[1] != k.shape[1]:
+            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
+            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
+        attn = torch.einsum("qhd,khd->hqk", q, k).float()
+        empty_mask = torch.ones(query_len, kv_len)
+        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
+        if sliding_window is not None:
+            sliding_window_mask = (
+                torch.triu(
+                    empty_mask, diagonal=kv_len - (query_len + sliding_window) + 1
+                )
+                .bool()
+                .logical_not()
+            )
+            mask |= sliding_window_mask
+        if soft_cap is not None:
+            attn = soft_cap * torch.tanh(attn / soft_cap)
+        attn.masked_fill_(mask, float("-inf"))
+        attn = torch.softmax(attn, dim=-1).to(v.dtype)
+        out = torch.einsum("hqk,khd->qhd", attn, v)
+
+        outputs.append(out)
+        start_idx += query_len
+
+    return torch.cat(outputs, dim=0)
+
+
+@pytest.mark.skipif(
+    not is_fa3_supported(),
+    reason="flash_attn at sgl-kernel is only supported on sm90 or sm80",
+)
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize(
+    "seq_lens",
+    [
+        (1, 1),
+        (1, 1024),
+        (1, 2048),
+        (1023, 2049),
+        (1023, 1023),
+        (32, 32),
+        (65, 65),
+        (129, 129),
+    ],
+)
+@pytest.mark.parametrize("num_heads", [1, 2, 4])
+@pytest.mark.parametrize("head_size", [128])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("NNZ_S", [0, 1, 2, 3, 7, 15, 32])
+@torch.inference_mode()
+def test_sparse_attention(
+    batch_size,
+    seq_lens,
+    num_heads,
+    head_size,
+    dtype,
+    NNZ_S,
+) -> None:
+    torch.set_default_device("cuda")
+    torch.cuda.manual_seed_all(0)
+    block_size_M = 64
+    block_size_N = 64
+    seqlen_q, seqlen_k = seq_lens
+    q = torch.randn(
+        batch_size, seqlen_q, num_heads, head_size, dtype=dtype, requires_grad=False
+    )
+    k = torch.randn(
+        batch_size, seqlen_k, num_heads, head_size, dtype=dtype, requires_grad=False
+    )
+    v = torch.randn(
+        batch_size, seqlen_k, num_heads, head_size, dtype=dtype, requires_grad=False
+    )
+    NUM_ROWS = (seqlen_q + block_size_M - 1) // block_size_M
+    if NNZ_S * block_size_N > seqlen_k:
+        return
+    NNZ_V = seqlen_k - NNZ_S * block_size_N
+    block_count = torch.tensor(
+        [NNZ_S] * batch_size * NUM_ROWS * num_heads, dtype=torch.int32
+    ).reshape(batch_size, num_heads, NUM_ROWS)
+    column_count = torch.tensor(
+        [NNZ_V] * batch_size * NUM_ROWS * num_heads, dtype=torch.int32
+    ).reshape(batch_size, num_heads, NUM_ROWS)
+    block_offset = torch.tensor(
+        [[i * block_size_N for i in range(NNZ_S)]] * batch_size * NUM_ROWS * num_heads,
+        dtype=torch.int32,
+    ).reshape(batch_size, num_heads, NUM_ROWS, NNZ_S)
+    column_index = torch.tensor(
+        [[NNZ_S * block_size_N + i for i in range(NNZ_V)]]
+        * batch_size
+        * NUM_ROWS
+        * num_heads,
+        dtype=torch.int32,
+    ).reshape(batch_size, num_heads, NUM_ROWS, NNZ_V)
+    out, lse = sparse_attn_func(
+        q,
+        k,
+        v,
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        return_softmax_lse=True,
+    )
+
+    ref_out, ref_lse = ref_attn(q, k, v)
+
+    torch.testing.assert_close(
+        out, ref_out, atol=2e-2, rtol=1e-2
+    ), f"{torch.max(torch.abs(out - ref_out))}"
+    torch.testing.assert_close(
+        lse, ref_lse, atol=2e-2, rtol=1e-2
+    ), f"{torch.max(torch.abs(lse - ref_lse))}"
+
+
+# sparse attention utils
+# origin
+@pytest.mark.skipif(
+    not is_fa3_supported(),
+    reason="flash_attn at sgl-kernel is only supported on sm90 or sm80",
+)
+@pytest.mark.parametrize("causal", [True, False])
+def test_convert_vertical_slash_indexes(causal):
+    # Prepare small, hand-checkable inputs
+    q_seqlens = torch.tensor([4], dtype=torch.int32, device="cuda")  # [BATCH]
+    kv_seqlens = torch.tensor([4], dtype=torch.int32, device="cuda")
+    vertical_indexes = torch.tensor(
+        [[[1, 3]]], dtype=torch.int32, device="cuda"
+    )  # [BATCH, N_HEADS, NNZ_V]
+    slash_indexes = torch.tensor(
+        [[[2]]], dtype=torch.int32, device="cuda"
+    )  # [BATCH, N_HEADS, NNZ_S]
+    context_size = 4
+    block_size_M = 2
+    block_size_N = 2
+
+    # Call your CUDA kernel wrapper
+    block_count, block_offset, column_count, column_index = (
+        convert_vertical_slash_indexes(
+            q_seqlens,
+            kv_seqlens,
+            vertical_indexes,
+            slash_indexes,
+            context_size,
+            block_size_M,
+            block_size_N,
+            causal=causal,
+        )
+    )
+
+    # Manually create expected outputs for this input
+    # There are 2 rows (blocks): row0 (tokens 0-1), row1 (tokens 2-3)
+    # Fill these expected tensors based on your CUDA kernel's logic
+    # For demonstration, we assume:
+    # - block_count: how many slash indices fall into each block
+    # - block_offset: the value of those indices
+    # - column_count: number of valid vertical indices per block
+    # - column_index: the actual vertical indices
+
+    expected_column_index = torch.tensor(
+        [[[[0, 0], [0, 0]]]], dtype=torch.int32, device="cuda"
+    )
+
+    # If causal=False, update these tensors according to expected behavior
+    if not causal:
+        # Update these values if your kernel produces different output in non-causal mode
+        expected_column_index = torch.tensor(
+            [[[[1, 0], [1, 3]]]], dtype=torch.int32, device="cuda"
+        )
+
+    # Assert that outputs match expectations
+    assert torch.equal(column_index, expected_column_index)
+
+
+# mergehead
+@pytest.mark.skipif(
+    not is_fa3_supported(),
+    reason="flash_attn at sgl-kernel is only supported on sm90 or sm80",
+)
+@pytest.mark.parametrize("causal", [True, False])
+def test_convert_vertical_slash_indexes_mergehead(causal):
+    # Prepare small, hand-checkable inputs for mergehead version
+    q_seqlens = torch.tensor([4], dtype=torch.int32, device="cuda")
+    kv_seqlens = torch.tensor([4], dtype=torch.int32, device="cuda")
+    vertical_indexes = torch.tensor(
+        [
+            [
+                [1, 3],  # head 0
+                [2, 0],  # head 1
+            ]
+        ],
+        dtype=torch.int32,
+        device="cuda",
+    )  # [BATCH, N_HEADS, NNZ_V]
+    slash_indexes = torch.tensor(
+        [
+            [
+                [2, 0],  # head 0
+                [1, 3],  # head 1
+            ]
+        ],
+        dtype=torch.int32,
+        device="cuda",
+    )  # [BATCH, N_HEADS, NNZ_S]
+    vertical_indices_count = torch.tensor([2, 1], dtype=torch.int32, device="cuda")
+    slash_indices_count = torch.tensor([1, 2], dtype=torch.int32, device="cuda")
+    context_size = 4
+    block_size_M = 2
+    block_size_N = 2
+
+    # Call your CUDA kernel wrapper
+    block_count, block_offset, column_count, column_index = (
+        convert_vertical_slash_indexes_mergehead(
+            q_seqlens,
+            kv_seqlens,
+            vertical_indexes,
+            slash_indexes,
+            vertical_indices_count,
+            slash_indices_count,
+            context_size,
+            block_size_M,
+            block_size_N,
+            causal=causal,
+        )
+    )
+
+    # Manually create expected outputs for this input
+    # For demonstration, assume:
+    # - batch=1, head=2, num_rows=2, nnz_v=2, nnz_s=2
+    # Fill these expected tensors according to your kernel's behavior
+
+    expected_column_index = torch.tensor(
+        [[[[1, 0], [1, 3]], [[-1079459945, -1077788999], [-1080050043, -1104625879]]]],
+        dtype=torch.int32,
+        device="cuda",
+    )
+
+    if not causal:
+        # If non-causal mode output is different, update these values
+        expected_column_index = torch.tensor(
+            [[[[1, 0], [1, 3]], [[2, -1077788999], [2, -1104625879]]]],
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+    # Assert that outputs match expectations
+    assert torch.equal(column_index, expected_column_index)
+
+
+# skip cause use fa2 for test
+# @pytest.mark.parametrize("seq_lens", [[(1024, 1328)],
+#                                     [(1024, 1328), (1, 2048)],
+#                                     [(1025, 1328), (2, 2048)],
+#                                     [(1025, 2049), (2, 1281)],
+#                                     ])
+# @pytest.mark.parametrize("head_size", [128])
+# @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+# @torch.inference_mode()
+# def test_sparse_attention_varlen(
+#         seq_lens,
+#         head_size,
+#         dtype,
+# ) -> None:
+#     torch.set_default_device("cuda")
+#     torch.cuda.manual_seed_all(0)
+#     block_size_M = 64
+#     block_size_N = 64
+#     num_seqs = len(seq_lens)
+#     query_lens = [x[0] for x in seq_lens]
+#     kv_lens = [x[1] for x in seq_lens]
+#     num_heads = 1
+#     query = torch.randn(sum(query_lens),
+#                         num_heads,
+#                         head_size,
+#                         dtype=dtype)
+#     key = torch.randn(sum(kv_lens),
+#                     num_heads,
+#                     head_size,
+#                     dtype=dtype)
+#     value = torch.randn_like(key)
+#     cu_query_lens = torch.tensor([0] + query_lens,
+#                                 dtype=torch.int32).cumsum(dim=0,
+#                                                         dtype=torch.int32)
+#     cu_kv_lens = torch.tensor([0] + kv_lens,
+#                                 dtype=torch.int32).cumsum(dim=0,
+#                                                         dtype=torch.int32)
+#     max_query_len = max(query_lens)
+#     max_kv_len = max(kv_lens)
+
+#     NUM_ROWS = (max_query_len + block_size_M - 1) // block_size_M
+#     NNZ_S = 20
+#     NNZ_V = 2048
+#     batch_size = len(query_lens)
+
+#     block_counts = []
+#     column_counts = []
+#     block_offsets = []
+#     column_indices = []
+#     for b in range(batch_size):
+#         block_counts.append(torch.tensor([NNZ_S] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(num_heads, NUM_ROWS))
+#         columns = kv_lens[b] - NNZ_S * block_size_N
+#         column_counts.append(torch.tensor([columns] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(num_heads, NUM_ROWS))
+#         block_offsets.append(torch.tensor([[i * block_size_N for i in range(NNZ_S)]] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(num_heads, NUM_ROWS, NNZ_S))
+#         column_indices.append(torch.tensor([[NNZ_S * block_size_N + i for i in range(NNZ_V)]] * NUM_ROWS * num_heads, dtype=torch.int32).reshape(num_heads, NUM_ROWS, NNZ_V))
+#     block_count = torch.concat(block_counts).reshape(batch_size, num_heads, NUM_ROWS)
+#     column_count = torch.concat(column_counts).reshape(batch_size, num_heads, NUM_ROWS)
+#     block_offset = torch.concat(block_offsets).reshape(batch_size, num_heads, NUM_ROWS, NNZ_S)
+#     column_index = torch.concat(column_indices).reshape(batch_size, num_heads, NUM_ROWS, NNZ_V)
+#     out, lse = sparse_attn_varlen_func(
+#         query,
+#         key,
+#         value,
+#         block_count,
+#         block_offset,
+#         column_count,
+#         column_index,
+#         cu_seqlens_q=cu_query_lens,
+#         cu_seqlens_k=cu_kv_lens,
+#         max_seqlen_q=max_query_len,
+#         max_seqlen_k=max_kv_len,
+#         return_softmax_lse=True,
+#     )
+
+#     max_num_blocks_per_seq = (max_kv_len + 2048 - 1) // 2048
+#     block_tables = torch.randint(0,
+#                                  2048,
+#                                  (len(query_lens), max_num_blocks_per_seq),
+#                                  dtype=torch.int32)
+#     scale = head_size**-0.5
+
+#     ref_out, ref_lse, _ = ref_paged_attn(
+#         query,
+#         key,
+#         value,
+#         query_lens=query_lens,
+#         kv_lens=kv_lens,
+#         block_tables=block_tables,
+#         scale=scale
+#     )
+
+#     torch.testing.assert_close(out, ref_out, atol=2e-2, rtol=1e-2), \
+#         f"{torch.max(torch.abs(out - ref_out))}"
+#     torch.testing.assert_close(lse, ref_lse, atol=2e-2, rtol=1e-2), \
+#         f"{torch.max(torch.abs(lse - ref_lse))}"
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sgl-kernel/tests/utils.py b/sgl-kernel/tests/utils.py
new file mode 100644
index 000000000..8fa9a2234
--- /dev/null
+++ b/sgl-kernel/tests/utils.py
@@ -0,0 +1,9 @@
+import torch
+
+
+def is_sm10x():
+    return torch.cuda.get_device_capability() >= (10, 0)
+
+
+def is_hopper():
+    return torch.cuda.get_device_capability() == (9, 0)
diff --git a/sgl-router/.cargo/config.toml b/sgl-router/.cargo/config.toml
new file mode 100644
index 000000000..305d57fa3
--- /dev/null
+++ b/sgl-router/.cargo/config.toml
@@ -0,0 +1,15 @@
+[build]
+rustflags = []
+incremental = true
+
+[target.aarch64-apple-darwin]
+rustflags = [
+    "-C", "link-arg=-undefined",
+    "-C", "link-arg=dynamic_lookup",
+]
+
+[target.x86_64-apple-darwin]
+rustflags = [
+    "-C", "link-arg=-undefined",
+    "-C", "link-arg=dynamic_lookup",
+]
diff --git a/sgl-router/.coveragerc b/sgl-router/.coveragerc
new file mode 100644
index 000000000..5bab1e8d2
--- /dev/null
+++ b/sgl-router/.coveragerc
@@ -0,0 +1,9 @@
+[run]
+source = py_src/sglang_router
+omit =
+    py_src/sglang_router/mini_lb.py
+
+[report]
+fail_under = 80
+omit =
+    py_src/sglang_router/mini_lb.py
diff --git a/sgl-router/Cargo.toml b/sgl-router/Cargo.toml
new file mode 100644
index 000000000..4ecfae55d
--- /dev/null
+++ b/sgl-router/Cargo.toml
@@ -0,0 +1,121 @@
+[package]
+name = "sglang_router_rs"
+version = "0.0.0"
+edition = "2021"
+
+[features]
+default = ["grpc-client"]
+grpc-client = []
+grpc-server = []
+
+[lib]
+name = "sglang_router_rs"
+# Pure Rust library: Just omit crate-type (defaults to rlib)
+# Python/C binding + Rust library: Use ["cdylib", "rlib"]
+crate-type = ["cdylib", "rlib"]
+
+[[bin]]
+name = "sglang-router"
+path = "src/main.rs"
+
+[dependencies]
+clap = { version = "4", features = ["derive"] }
+axum = { version = "0.8.4", features = ["macros", "ws", "tracing"] }
+tower = { version = "0.5", features = ["full"] }
+tower-http = { version = "0.6", features = ["trace", "compression-gzip", "cors", "timeout", "limit", "request-id", "util"] }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+bytes = "1.8.0"
+rand = "0.9.2"
+reqwest = { version = "0.12.8", features = ["stream", "blocking", "json"] }
+futures-util = "0.3"
+futures = "0.3"
+pyo3 = { version = "0.25.1", features = ["extension-module"] }
+dashmap = "6.1.0"
+http = "1.1.0"
+tokio = { version = "1.42.0", features = ["full"] }
+async-trait = "0.1"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "chrono"] }
+tracing-log = "0.2"
+tracing-appender = "0.2.3"
+chrono = "0.4"
+kube = { version = "1.1.0", features = ["runtime", "derive"] }
+k8s-openapi = { version = "0.25.0", features = ["v1_33"] }
+metrics = "0.24.2"
+metrics-exporter-prometheus = "0.17.0"
+uuid = { version = "1.10", features = ["v4", "serde"] }
+thiserror = "2.0.12"
+regex = "1.10"
+url = "2.5.4"
+tokio-stream = { version = "0.1", features = ["sync"] }
+anyhow = "1.0"
+tokenizers = { version = "0.22.0" }
+tiktoken-rs = { version = "0.7.0" }
+minijinja = { version = "2.0" }
+rustls = { version = "0.23", default-features = false, features = ["ring", "std"] }
+hf-hub = { version = "0.4.3", features = ["tokio"] }
+rmcp = { version = "0.6.3", features = ["client", "server",
+    "transport-child-process",
+    "transport-sse-client-reqwest",
+    "transport-streamable-http-client-reqwest",
+    "transport-streamable-http-server",
+    "transport-streamable-http-server-session",
+    "reqwest",
+    "auth"] }
+serde_yaml = "0.9"
+
+# gRPC and Protobuf dependencies
+tonic = { version = "0.12", features = ["tls", "gzip", "transport"] }
+prost = "0.13"
+prost-types = "0.13"
+deadpool = { version = "0.12", features = ["managed", "rt_tokio_1"] }
+backoff = { version = "0.4", features = ["tokio"] }
+strum = { version = "0.26", features = ["derive"] }
+once_cell = "1.21.3"
+
+[build-dependencies]
+tonic-build = "0.12"
+prost-build = "0.13"
+
+[dev-dependencies]
+criterion = { version = "0.5", features = ["html_reports"] }
+tower = { version = "0.5", features = ["util"] }
+http-body-util = "0.1"
+portpicker = "0.1"
+tempfile = "3.8"
+lazy_static = "1.4"
+
+[[bench]]
+name = "request_processing"
+harness = false
+path = "benches/request_processing.rs"
+
+[[bench]]
+name = "tokenizer_benchmark"
+harness = false
+path = "benches/tokenizer_benchmark.rs"
+
+[[bench]]
+name = "tool_parser_benchmark"
+harness = false
+path = "benches/tool_parser_benchmark.rs"
+
+[profile.release]
+lto = "thin"
+codegen-units = 1
+
+[profile.dev]
+opt-level = 0
+debug = true
+split-debuginfo = "unpacked"
+incremental = true
+
+
+[profile.dev.build-override]
+opt-level = 3
+codegen-units = 1
+
+[profile.dev-opt]
+inherits = "dev"
+opt-level = 1
diff --git a/sgl-router/MANIFEST.in b/sgl-router/MANIFEST.in
new file mode 100644
index 000000000..4baa6c84f
--- /dev/null
+++ b/sgl-router/MANIFEST.in
@@ -0,0 +1,5 @@
+# Must include:
+include Cargo.toml        # Rust project configuration
+include build.rs          # Build script for protobuf generation
+recursive-include src *.rs  # Rust source files
+recursive-include src/proto *.proto  # Protobuf definitions
diff --git a/sgl-router/Makefile b/sgl-router/Makefile
new file mode 100644
index 000000000..feda1ee63
--- /dev/null
+++ b/sgl-router/Makefile
@@ -0,0 +1,92 @@
+# SGLang Router Makefile
+# Provides convenient shortcuts for common development tasks
+
+.PHONY: help bench bench-quick bench-baseline bench-compare test build clean
+
+help: ## Show this help message
+	@echo "SGLang Router Development Commands"
+	@echo "=================================="
+	@echo ""
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2}'
+	@echo ""
+
+build: ## Build the project in release mode
+	@echo "Building SGLang Router..."
+	@cargo build --release
+
+test: ## Run all tests
+	@echo "Running tests..."
+	@cargo test
+
+bench: ## Run full benchmark suite
+	@echo "Running full benchmarks..."
+	@python3 scripts/run_benchmarks.py
+
+bench-quick: ## Run quick benchmarks only
+	@echo "Running quick benchmarks..."
+	@python3 scripts/run_benchmarks.py --quick
+
+bench-baseline: ## Save current performance as baseline
+	@echo "Saving performance baseline..."
+	@python3 scripts/run_benchmarks.py --save-baseline main
+
+bench-compare: ## Compare with saved baseline
+	@echo "Comparing with baseline..."
+	@python3 scripts/run_benchmarks.py --compare-baseline main
+
+bench-ci: ## Run benchmarks suitable for CI (quick mode)
+	@echo "Running CI benchmarks..."
+	@python3 scripts/run_benchmarks.py --quick
+
+clean: ## Clean build artifacts
+	@echo "Cleaning build artifacts..."
+	@cargo clean
+
+docs: ## Generate and open documentation
+	@echo "Generating documentation..."
+	@cargo doc --open
+
+check: ## Run cargo check and clippy
+	@echo "Running cargo check..."
+	@cargo check
+	@echo "Running clippy..."
+	@cargo clippy
+
+fmt: ## Format code with rustfmt
+	@echo "Formatting code..."
+	@cargo fmt
+
+# Development workflow shortcuts
+dev-setup: build test ## Set up development environment
+	@echo "Development environment ready!"
+
+pre-commit: fmt check test bench-quick ## Run pre-commit checks
+	@echo "Pre-commit checks passed!"
+
+# Benchmark analysis shortcuts
+bench-report: ## Open benchmark HTML report
+	@if [ -f "target/criterion/request_processing/report/index.html" ]; then \
+		echo "Opening benchmark report..."; \
+		if command -v xdg-open >/dev/null 2>&1; then \
+			xdg-open target/criterion/request_processing/report/index.html; \
+		elif command -v open >/dev/null 2>&1; then \
+			open target/criterion/request_processing/report/index.html; \
+		else \
+			echo "Please open target/criterion/request_processing/report/index.html in your browser"; \
+		fi \
+	else \
+		echo "No benchmark report found. Run 'make bench' first."; \
+	fi
+
+bench-clean: ## Clean benchmark results
+	@echo "Cleaning benchmark results..."
+	@rm -rf target/criterion
+
+# Performance monitoring
+perf-monitor: ## Run continuous performance monitoring
+	@echo "Starting performance monitoring..."
+	@if command -v watch >/dev/null 2>&1; then \
+		watch -n 300 'make bench-quick'; \
+	else \
+		echo "Warning: 'watch' command not found. Install it or run 'make bench-quick' manually."; \
+	fi
diff --git a/sgl-router/README.md b/sgl-router/README.md
new file mode 100644
index 000000000..af73536b3
--- /dev/null
+++ b/sgl-router/README.md
@@ -0,0 +1,402 @@
+# SGLang Router
+
+SGLang router is a standalone Rust module that enables data parallelism across SGLang instances, providing high-performance request routing and advanced load balancing. The router supports multiple load balancing algorithms including cache-aware, power of two, random, and round robin, and acts as a specialized load balancer for prefill-decode disaggregated serving architectures.
+
+## Documentation
+
+- **User Guide**: [docs.sglang.ai/advanced_features/router.html](https://docs.sglang.ai/advanced_features/router.html)
+
+## Quick Start
+
+### Prerequisites
+
+**Rust and Cargo:**
+```bash
+# Install rustup (Rust installer and version manager)
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+
+# Follow the installation prompts, then reload your shell
+source $HOME/.cargo/env
+
+# Verify installation
+rustc --version
+cargo --version
+```
+
+**Python with pip installed**
+
+### Installation
+
+#### Option A: Build and Install Wheel (Recommended)
+```bash
+# Install build dependencies
+pip install setuptools-rust wheel build
+
+# Build the wheel package
+python -m build
+
+# Install the generated wheel
+pip install dist/*.whl
+
+# One-liner for development (rebuild + install)
+python -m build && pip install --force-reinstall dist/*.whl
+```
+
+#### Option B: Development Mode
+
+```bash
+# Currently broken
+pip install -e .
+```
+
+⚠️ **Warning**: Editable installs may suffer performance degradation. Use wheel builds for performance testing.
+
+### Basic Usage
+
+```bash
+# Build Rust components
+cargo build
+```
+
+#### Using the Rust Binary Directly (Alternative to Python)
+```bash
+# Build the Rust binary
+cargo build --release
+
+# Launch router with worker URLs in regular mode
+./target/release/sglang-router \
+    --worker-urls http://worker1:8000 http://worker2:8000
+
+# Or use cargo run
+cargo run --release -- \
+    --worker-urls http://worker1:8000 http://worker2:8000
+```
+
+#### Launch Router with Python (Original Method)
+```bash
+# Launch router with worker URLs
+python -m sglang_router.launch_router \
+    --worker-urls http://worker1:8000 http://worker2:8000
+```
+
+#### Launch Router with Worker URLs in prefill-decode mode
+```bash
+# Note that the prefill and decode URLs must be provided in the following format:
+# http://<ip>:<port> for  decode nodes
+# http://<ip>:<port> bootstrap-port for  prefill nodes, where bootstrap-port is optional
+
+# Using Rust binary directly
+./target/release/sglang-router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --prefill http://127.0.0.1:30001 9001 \
+    --prefill http://127.0.0.2:30002 9002 \
+    --prefill http://127.0.0.3:30003 9003 \
+    --prefill http://127.0.0.4:30004 9004 \
+    --decode http://127.0.0.5:30005 \
+    --decode http://127.0.0.6:30006 \
+    --decode http://127.0.0.7:30007 \
+    --host 0.0.0.0 \
+    --port 8080
+
+# Or using Python launcher
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --prefill http://127.0.0.1:30001 9001 \
+    --prefill http://127.0.0.2:30002 9002 \
+    --prefill http://127.0.0.3:30003 9003 \
+    --prefill http://127.0.0.4:30004 9004 \
+    --decode http://127.0.0.5:30005 \
+    --decode http://127.0.0.6:30006 \
+    --decode http://127.0.0.7:30007 \
+    --host 0.0.0.0 \
+    --port 8080
+````
+
+## Configuration
+
+### Logging
+
+Enable structured logging with optional file output:
+
+```python
+from sglang_router import Router
+
+# Console logging (default)
+router = Router(worker_urls=["http://worker1:8000", "http://worker2:8000"])
+
+# File logging enabled
+router = Router(
+    worker_urls=["http://worker1:8000", "http://worker2:8000"],
+    log_dir="./logs"  # Daily log files created here
+)
+```
+
+Set log level with `--log-level` flag ([documentation](https://docs.sglang.ai/backend/server_arguments.html#logging)).
+
+### Metrics
+
+Prometheus metrics endpoint available at `127.0.0.1:29000` by default.
+
+```bash
+# Custom metrics configuration
+python -m sglang_router.launch_router \
+    --worker-urls http://localhost:8080 http://localhost:8081 \
+    --prometheus-host 0.0.0.0 \
+    --prometheus-port 9000
+```
+
+### Retries and Circuit Breakers
+
+- Retries (regular router) are enabled by default with exponential backoff and jitter. You can tune them via CLI:
+
+```bash
+python -m sglang_router.launch_router \
+  --worker-urls http://localhost:8080 http://localhost:8081 \
+  --retry-max-retries 3 \
+  --retry-initial-backoff-ms 100 \
+  --retry-max-backoff-ms 10000 \
+  --retry-backoff-multiplier 2.0 \
+  --retry-jitter-factor 0.1
+```
+
+- Circuit Breaker defaults protect workers and auto-recover. Tune thresholds/timeouts:
+
+```bash
+python -m sglang_router.launch_router \
+  --worker-urls http://localhost:8080 http://localhost:8081 \
+  --cb-failure-threshold 5 \
+  --cb-success-threshold 2 \
+  --cb-timeout-duration-secs 30 \
+  --cb-window-duration-secs 60
+```
+
+Behavior summary:
+- Closed → Open after N consecutive failures (failure-threshold)
+- Open → HalfOpen after timeout (timeout-duration-secs)
+- HalfOpen → Closed after M consecutive successes (success-threshold)
+- Any failure in HalfOpen reopens immediately
+
+Retry predicate (regular router): retry on 408/429/500/502/503/504, otherwise return immediately. Backoff/jitter observed between attempts.
+
+### Request ID Tracking
+
+Track requests across distributed systems with configurable headers:
+
+```bash
+# Use custom request ID headers
+python -m sglang_router.launch_router \
+    --worker-urls http://localhost:8080 \
+    --request-id-headers x-trace-id x-request-id
+```
+
+Default headers: `x-request-id`, `x-correlation-id`, `x-trace-id`, `request-id`
+
+## Advanced Features
+
+### Kubernetes Service Discovery
+
+Automatic worker discovery and management in Kubernetes environments.
+
+#### Basic Service Discovery
+
+```bash
+python -m sglang_router.launch_router \
+    --service-discovery \
+    --selector app=sglang-worker role=inference \
+    --service-discovery-namespace default
+```
+
+#### PD (Prefill-Decode) Mode
+
+For disaggregated prefill/decode routing:
+
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --service-discovery \
+    --prefill-selector app=sglang component=prefill \
+    --decode-selector app=sglang component=decode \
+    --service-discovery-namespace sglang-system
+
+# With separate routing policies:
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --prefill-policy cache_aware \
+    --decode-policy power_of_two \
+    --service-discovery \
+    --prefill-selector app=sglang component=prefill \
+    --decode-selector app=sglang component=decode \
+    --service-discovery-namespace sglang-system
+
+# in lws case, such as tp16(1 leader pod, 1 worker pod)
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --service-discovery \
+    --prefill-selector app=sglang component=prefill role=leader\
+    --decode-selector app=sglang component=decode role=leader\
+    --service-discovery-namespace sglang-system
+```
+
+#### Kubernetes Pod Configuration
+
+**Prefill Server Pod:**
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: sglang-prefill-1
+  labels:
+    app: sglang
+    component: prefill
+  annotations:
+    sglang.ai/bootstrap-port: "9001"  # Optional: Bootstrap port
+spec:
+  containers:
+  - name: sglang
+    image: lmsys/sglang:latest
+    ports:
+    - containerPort: 8000  # Main API port
+    - containerPort: 9001  # Optional: Bootstrap port
+```
+
+**Decode Server Pod:**
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: sglang-decode-1
+  labels:
+    app: sglang
+    component: decode
+spec:
+  containers:
+  - name: sglang
+    image: lmsys/sglang:latest
+    ports:
+    - containerPort: 8000
+```
+
+#### RBAC Configuration
+
+**Namespace-scoped (recommended):**
+```yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: sglang-router
+  namespace: sglang-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  namespace: sglang-system
+  name: sglang-router
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: sglang-router
+  namespace: sglang-system
+subjects:
+- kind: ServiceAccount
+  name: sglang-router
+  namespace: sglang-system
+roleRef:
+  kind: Role
+  name: sglang-router
+  apiGroup: rbac.authorization.k8s.io
+```
+
+#### Complete PD Example
+
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --service-discovery \
+    --prefill-selector app=sglang component=prefill environment=production \
+    --decode-selector app=sglang component=decode environment=production \
+    --service-discovery-namespace production \
+    --host 0.0.0.0 \
+    --port 8080 \
+    --prometheus-host 0.0.0.0 \
+    --prometheus-port 9090
+```
+
+### Command Line Arguments Reference
+
+#### Service Discovery
+- `--service-discovery`: Enable Kubernetes service discovery
+- `--service-discovery-port`: Port for worker URLs (default: 8000)
+- `--service-discovery-namespace`: Kubernetes namespace to watch
+- `--selector`: Label selectors for regular mode (format: `key1=value1 key2=value2`)
+
+#### PD Mode
+- `--pd-disaggregation`: Enable Prefill-Decode disaggregated mode
+- `--prefill`: Initial prefill server (format: `URL BOOTSTRAP_PORT`)
+- `--decode`: Initial decode server URL
+- `--prefill-selector`: Label selector for prefill pods
+- `--decode-selector`: Label selector for decode pods
+- `--policy`: Routing policy (`cache_aware`, `random`, `power_of_two`, `round_robin`)
+- `--prefill-policy`: Separate routing policy for prefill nodes (optional, overrides `--policy` for prefill)
+- `--decode-policy`: Separate routing policy for decode nodes (optional, overrides `--policy` for decode)
+
+## Development
+
+### Build Process
+
+```bash
+# Build Rust project
+cargo build
+
+# Build Python binding (see Installation section above)
+```
+
+**Note**: When modifying Rust code, you must rebuild the wheel for changes to take effect.
+
+### Troubleshooting
+
+**VSCode Rust Analyzer Issues:**
+Set `rust-analyzer.linkedProjects` to the absolute path of `Cargo.toml`:
+
+```json
+{
+  "rust-analyzer.linkedProjects": ["/workspaces/sglang/sgl-router/Cargo.toml"]
+}
+```
+
+### CI/CD Pipeline
+
+The continuous integration pipeline includes comprehensive testing, benchmarking, and publishing:
+
+#### Build & Test
+
+1. **Build Wheels**: Uses `cibuildwheel` for manylinux x86_64 packages
+2. **Build Source Distribution**: Creates source distribution for pip fallback
+3. **Rust HTTP Server Benchmarking**: Performance testing of router overhead
+4. **Basic Inference Testing**: End-to-end validation through the router
+5. **PD Disaggregation Testing**: Benchmark and sanity checks for prefill-decode load balancing
+
+#### Publishing
+- **PyPI Publishing**: Wheels and source distributions are published only when the version changes in `pyproject.toml`
+- **Container Images**: Docker images published using `/docker/Dockerfile.router`
+
+## Features
+- **High Performance**: Rust-based routing with connection pooling and optimized request handling
+- **Advanced Load Balancing**: Multiple algorithms including:
+  - **Cache-Aware**: Intelligent routing based on cache locality for optimal performance
+  - **Power of Two**: Chooses the less loaded of two randomly selected workers
+  - **Random**: Distributes requests randomly across available workers
+  - **Round Robin**: Sequential distribution across workers in rotation
+- **Prefill-Decode Disaggregation**: Specialized load balancing for separated prefill and decode servers
+- **Service Discovery**: Automatic Kubernetes worker discovery and health management
+- **Monitoring**: Comprehensive Prometheus metrics and structured logging
+- **Scalability**: Handles thousands of concurrent connections with efficient resource utilization
diff --git a/sgl-router/benches/request_processing.rs b/sgl-router/benches/request_processing.rs
new file mode 100644
index 000000000..eba5680aa
--- /dev/null
+++ b/sgl-router/benches/request_processing.rs
@@ -0,0 +1,692 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use serde_json::{from_str, to_string, to_value, to_vec};
+use std::time::Instant;
+
+use sglang_router_rs::core::{BasicWorker, Worker, WorkerType};
+use sglang_router_rs::protocols::spec::{
+    ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateParameters, GenerateRequest,
+    SamplingParams, StringOrArray, UserMessageContent,
+};
+use sglang_router_rs::routers::http::pd_types::{
+    generate_room_id, get_hostname, RequestWithBootstrap,
+};
+
+fn create_test_worker() -> BasicWorker {
+    BasicWorker::new(
+        "http://test-server:8000".to_string(),
+        WorkerType::Prefill {
+            bootstrap_port: Some(5678),
+        },
+    )
+}
+
+// Helper function to get bootstrap info from worker
+fn get_bootstrap_info(worker: &BasicWorker) -> (String, Option<u16>) {
+    let hostname = get_hostname(worker.url());
+    let bootstrap_port = match worker.worker_type() {
+        WorkerType::Prefill { bootstrap_port } => bootstrap_port,
+        _ => None,
+    };
+    (hostname, bootstrap_port)
+}
+
+/// Create a default GenerateRequest for benchmarks with minimal fields set
+fn default_generate_request() -> GenerateRequest {
+    GenerateRequest {
+        text: None,
+        prompt: None,
+        input_ids: None,
+        stream: false,
+        parameters: None,
+        sampling_params: None,
+        return_logprob: false,
+        // SGLang Extensions
+        lora_path: None,
+        session_params: None,
+        return_hidden_states: false,
+        rid: None,
+    }
+}
+
+/// Create a default ChatCompletionRequest for benchmarks with minimal fields set
+fn default_chat_completion_request() -> ChatCompletionRequest {
+    ChatCompletionRequest {
+        model: String::new(),
+        messages: vec![],
+        max_tokens: None,
+        max_completion_tokens: None,
+        temperature: None,
+        top_p: None,
+        n: None,
+        stream: false,
+        stream_options: None,
+        stop: None,
+        presence_penalty: None,
+        frequency_penalty: None,
+        logit_bias: None,
+        logprobs: false,
+        top_logprobs: None,
+        user: None,
+        response_format: None,
+        seed: None,
+        tools: None,
+        tool_choice: None,
+        parallel_tool_calls: None,
+        function_call: None,
+        functions: None,
+        // SGLang Extensions
+        top_k: None,
+        min_p: None,
+        min_tokens: None,
+        repetition_penalty: None,
+        regex: None,
+        ebnf: None,
+        stop_token_ids: None,
+        no_stop_trim: false,
+        ignore_eos: false,
+        continue_final_message: false,
+        skip_special_tokens: true,
+        // SGLang Extensions
+        lora_path: None,
+        session_params: None,
+        separate_reasoning: true,
+        stream_reasoning: true,
+        chat_template_kwargs: None,
+        return_hidden_states: false,
+    }
+}
+
+/// Create a default CompletionRequest for benchmarks with minimal fields set
+fn default_completion_request() -> CompletionRequest {
+    CompletionRequest {
+        model: String::new(),
+        prompt: StringOrArray::String(String::new()),
+        suffix: None,
+        max_tokens: None,
+        temperature: None,
+        top_p: None,
+        n: None,
+        stream: false,
+        stream_options: None,
+        logprobs: None,
+        echo: false,
+        stop: None,
+        presence_penalty: None,
+        frequency_penalty: None,
+        best_of: None,
+        logit_bias: None,
+        user: None,
+        seed: None,
+        // SGLang Extensions
+        top_k: None,
+        min_p: None,
+        min_tokens: None,
+        repetition_penalty: None,
+        regex: None,
+        ebnf: None,
+        json_schema: None,
+        stop_token_ids: None,
+        no_stop_trim: false,
+        ignore_eos: false,
+        skip_special_tokens: true,
+        // SGLang Extensions
+        lora_path: None,
+        session_params: None,
+        return_hidden_states: false,
+        other: serde_json::Map::new(),
+    }
+}
+
+// Sample request data for benchmarks
+fn create_sample_generate_request() -> GenerateRequest {
+    GenerateRequest {
+        text: Some("Write a story about artificial intelligence".to_string()),
+        parameters: Some(GenerateParameters {
+            max_new_tokens: Some(100),
+            temperature: Some(0.8),
+            top_p: Some(0.9),
+            top_k: Some(50),
+            repetition_penalty: Some(1.0),
+            ..Default::default()
+        }),
+        sampling_params: Some(SamplingParams {
+            temperature: Some(0.8),
+            top_p: Some(0.9),
+            top_k: Some(50),
+            frequency_penalty: Some(0.0),
+            presence_penalty: Some(0.0),
+            repetition_penalty: Some(1.0),
+            ..Default::default()
+        }),
+        ..default_generate_request()
+    }
+}
+
+fn create_sample_chat_completion_request() -> ChatCompletionRequest {
+    ChatCompletionRequest {
+        model: "gpt-3.5-turbo".to_string(),
+        messages: vec![
+            ChatMessage::System {
+                role: "system".to_string(),
+                content: "You are a helpful assistant".to_string(),
+                name: None,
+            },
+            ChatMessage::User {
+                role: "user".to_string(),
+                content: UserMessageContent::Text(
+                    "Explain quantum computing in simple terms".to_string(),
+                ),
+                name: None,
+            },
+        ],
+        max_tokens: Some(150),
+        max_completion_tokens: Some(150),
+        temperature: Some(0.7),
+        top_p: Some(1.0),
+        n: Some(1),
+        presence_penalty: Some(0.0),
+        frequency_penalty: Some(0.0),
+        parallel_tool_calls: Some(true),
+        ..default_chat_completion_request()
+    }
+}
+
+fn create_sample_completion_request() -> CompletionRequest {
+    CompletionRequest {
+        model: "text-davinci-003".to_string(),
+        prompt: StringOrArray::String("Complete this sentence: The future of AI is".to_string()),
+        max_tokens: Some(50),
+        temperature: Some(0.8),
+        top_p: Some(1.0),
+        n: Some(1),
+        presence_penalty: Some(0.0),
+        frequency_penalty: Some(0.0),
+        best_of: Some(1),
+        ..default_completion_request()
+    }
+}
+
+fn create_large_chat_completion_request() -> ChatCompletionRequest {
+    let mut messages = vec![ChatMessage::System {
+        role: "system".to_string(),
+        content: "You are a helpful assistant with extensive knowledge.".to_string(),
+        name: None,
+    }];
+
+    // Add many user/assistant pairs to simulate a long conversation
+    for i in 0..50 {
+        messages.push(ChatMessage::User {
+            role: "user".to_string(),
+            content: UserMessageContent::Text(format!("Question {}: What do you think about topic number {} which involves complex reasoning about multiple interconnected systems and their relationships?", i, i)),
+            name: None,
+        });
+        messages.push(ChatMessage::Assistant {
+            role: "assistant".to_string(),
+            content: Some(format!("Answer {}: This is a detailed response about topic {} that covers multiple aspects and provides comprehensive analysis of the interconnected systems you mentioned.", i, i)),
+            name: None,
+            tool_calls: None,
+            function_call: None,
+            reasoning_content: None,
+        });
+    }
+
+    ChatCompletionRequest {
+        model: "gpt-4".to_string(),
+        messages,
+        max_tokens: Some(1000),
+        max_completion_tokens: Some(1000),
+        temperature: Some(0.7),
+        top_p: Some(0.95),
+        n: Some(1),
+        presence_penalty: Some(0.1),
+        frequency_penalty: Some(0.1),
+        top_logprobs: Some(5),
+        user: Some("benchmark_user".to_string()),
+        seed: Some(42),
+        parallel_tool_calls: Some(true),
+        ..default_chat_completion_request()
+    }
+}
+
+// Benchmark JSON serialization
+fn bench_json_serialization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("json_serialization");
+
+    let generate_req = create_sample_generate_request();
+    let chat_req = create_sample_chat_completion_request();
+    let completion_req = create_sample_completion_request();
+    let large_chat_req = create_large_chat_completion_request();
+
+    group.bench_function("generate_request", |b| {
+        b.iter(|| {
+            let json = to_string(black_box(&generate_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("chat_completion_request", |b| {
+        b.iter(|| {
+            let json = to_string(black_box(&chat_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("completion_request", |b| {
+        b.iter(|| {
+            let json = to_string(black_box(&completion_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("large_chat_completion_request", |b| {
+        b.iter(|| {
+            let json = to_string(black_box(&large_chat_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("generate_request_to_bytes", |b| {
+        b.iter(|| {
+            let bytes = to_vec(black_box(&generate_req)).unwrap();
+            black_box(bytes);
+        });
+    });
+
+    group.finish();
+}
+
+// Benchmark JSON deserialization
+fn bench_json_deserialization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("json_deserialization");
+
+    let generate_json = to_string(&create_sample_generate_request()).unwrap();
+    let chat_json = to_string(&create_sample_chat_completion_request()).unwrap();
+    let completion_json = to_string(&create_sample_completion_request()).unwrap();
+    let large_chat_json = to_string(&create_large_chat_completion_request()).unwrap();
+
+    group.bench_function("generate_request", |b| {
+        b.iter(|| {
+            let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
+            black_box(req);
+        });
+    });
+
+    group.bench_function("chat_completion_request", |b| {
+        b.iter(|| {
+            let req: ChatCompletionRequest = from_str(black_box(&chat_json)).unwrap();
+            black_box(req);
+        });
+    });
+
+    group.bench_function("completion_request", |b| {
+        b.iter(|| {
+            let req: CompletionRequest = from_str(black_box(&completion_json)).unwrap();
+            black_box(req);
+        });
+    });
+
+    group.bench_function("large_chat_completion_request", |b| {
+        b.iter(|| {
+            let req: ChatCompletionRequest = from_str(black_box(&large_chat_json)).unwrap();
+            black_box(req);
+        });
+    });
+
+    group.finish();
+}
+
+// Benchmark bootstrap injection (replaces request adaptation)
+fn bench_bootstrap_injection(c: &mut Criterion) {
+    let mut group = c.benchmark_group("bootstrap_injection");
+
+    let generate_req = create_sample_generate_request();
+    let chat_req = create_sample_chat_completion_request();
+    let completion_req = create_sample_completion_request();
+    let large_chat_req = create_large_chat_completion_request();
+    let worker = create_test_worker();
+    let (hostname, bootstrap_port) = get_bootstrap_info(&worker);
+
+    group.bench_function("generate_bootstrap_injection", |b| {
+        b.iter(|| {
+            let request_with_bootstrap = RequestWithBootstrap {
+                original: &generate_req,
+                bootstrap_host: hostname.clone(),
+                bootstrap_port,
+                bootstrap_room: generate_room_id(),
+            };
+            let json = to_value(black_box(&request_with_bootstrap)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("chat_completion_bootstrap_injection", |b| {
+        b.iter(|| {
+            let request_with_bootstrap = RequestWithBootstrap {
+                original: &chat_req,
+                bootstrap_host: hostname.clone(),
+                bootstrap_port,
+                bootstrap_room: generate_room_id(),
+            };
+            let json = to_value(black_box(&request_with_bootstrap)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("completion_bootstrap_injection", |b| {
+        b.iter(|| {
+            let request_with_bootstrap = RequestWithBootstrap {
+                original: &completion_req,
+                bootstrap_host: hostname.clone(),
+                bootstrap_port,
+                bootstrap_room: generate_room_id(),
+            };
+            let json = to_value(black_box(&request_with_bootstrap)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("large_chat_completion_bootstrap_injection", |b| {
+        b.iter(|| {
+            let request_with_bootstrap = RequestWithBootstrap {
+                original: &large_chat_req,
+                bootstrap_host: hostname.clone(),
+                bootstrap_port,
+                bootstrap_room: generate_room_id(),
+            };
+            let json = to_value(black_box(&request_with_bootstrap)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.finish();
+}
+
+// Benchmark direct JSON routing (replaces regular routing)
+fn bench_direct_json_routing(c: &mut Criterion) {
+    let mut group = c.benchmark_group("direct_json_routing");
+
+    let generate_req = create_sample_generate_request();
+    let chat_req = create_sample_chat_completion_request();
+    let completion_req = create_sample_completion_request();
+
+    group.bench_function("generate_to_json", |b| {
+        b.iter(|| {
+            let json = to_value(black_box(&generate_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("generate_to_json_string", |b| {
+        b.iter(|| {
+            let json = to_string(black_box(&generate_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("generate_to_bytes", |b| {
+        b.iter(|| {
+            let bytes = to_vec(black_box(&generate_req)).unwrap();
+            black_box(bytes);
+        });
+    });
+
+    group.bench_function("chat_completion_to_json", |b| {
+        b.iter(|| {
+            let json = to_value(black_box(&chat_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("chat_completion_to_json_string", |b| {
+        b.iter(|| {
+            let json = to_string(black_box(&chat_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.bench_function("completion_to_json", |b| {
+        b.iter(|| {
+            let json = to_value(black_box(&completion_req)).unwrap();
+            black_box(json);
+        });
+    });
+
+    group.finish();
+}
+
+// Benchmark throughput with different request sizes
+fn bench_throughput_by_size(c: &mut Criterion) {
+    let mut group = c.benchmark_group("throughput_by_size");
+
+    // Create requests of different sizes
+    let small_generate = GenerateRequest {
+        text: Some("Hi".to_string()),
+        ..default_generate_request()
+    };
+
+    let medium_generate = GenerateRequest {
+        text: Some("Write a medium length story about AI".repeat(10)),
+        ..default_generate_request()
+    };
+
+    let large_generate = GenerateRequest {
+        text: Some("Write a very long and detailed story about artificial intelligence and its impact on society".repeat(100)),
+        ..default_generate_request()
+    };
+
+    let worker = create_test_worker();
+    let (hostname, bootstrap_port) = get_bootstrap_info(&worker);
+
+    for (name, req) in [
+        ("small", &small_generate),
+        ("medium", &medium_generate),
+        ("large", &large_generate),
+    ] {
+        let json = to_string(req).unwrap();
+        let size_bytes = json.len();
+        let hostname_clone = hostname.clone();
+
+        group.throughput(Throughput::Bytes(size_bytes as u64));
+        group.bench_with_input(BenchmarkId::new("serialize", name), &req, |b, req| {
+            b.iter(|| {
+                let json = to_string(black_box(req)).unwrap();
+                black_box(json);
+            });
+        });
+
+        group.bench_with_input(
+            BenchmarkId::new("deserialize", name),
+            &json,
+            |b, json_str| {
+                b.iter(|| {
+                    let req: GenerateRequest = black_box(from_str(json_str)).unwrap();
+                    black_box(req);
+                });
+            },
+        );
+
+        group.bench_with_input(
+            BenchmarkId::new("bootstrap_inject", name),
+            &req,
+            move |b, req| {
+                let hostname = hostname_clone.clone();
+                b.iter(|| {
+                    let request_with_bootstrap = RequestWithBootstrap {
+                        original: req,
+                        bootstrap_host: hostname.clone(),
+                        bootstrap_port,
+                        bootstrap_room: generate_room_id(),
+                    };
+                    let json = to_value(&request_with_bootstrap).unwrap();
+                    black_box(json);
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+// Benchmark full round-trip: deserialize -> inject bootstrap -> serialize
+fn bench_full_round_trip(c: &mut Criterion) {
+    let mut group = c.benchmark_group("full_round_trip");
+
+    let generate_json = to_string(&create_sample_generate_request()).unwrap();
+    let chat_json = to_string(&create_sample_chat_completion_request()).unwrap();
+    let completion_json = to_string(&create_sample_completion_request()).unwrap();
+    let worker = create_test_worker();
+    let (hostname, bootstrap_port) = get_bootstrap_info(&worker);
+
+    group.bench_function("generate_openai_to_pd_pipeline", |b| {
+        b.iter(|| {
+            // Deserialize OpenAI request
+            let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
+            // Create wrapper with bootstrap fields
+            let request_with_bootstrap = RequestWithBootstrap {
+                original: &req,
+                bootstrap_host: hostname.clone(),
+                bootstrap_port,
+                bootstrap_room: generate_room_id(),
+            };
+            // Serialize final request
+            let pd_json = to_string(&request_with_bootstrap).unwrap();
+            black_box(pd_json);
+        });
+    });
+
+    group.bench_function("chat_completion_openai_to_pd_pipeline", |b| {
+        b.iter(|| {
+            let req: ChatCompletionRequest = from_str(black_box(&chat_json)).unwrap();
+            let request_with_bootstrap = RequestWithBootstrap {
+                original: &req,
+                bootstrap_host: hostname.clone(),
+                bootstrap_port,
+                bootstrap_room: generate_room_id(),
+            };
+            let pd_json = to_string(&request_with_bootstrap).unwrap();
+            black_box(pd_json);
+        });
+    });
+
+    group.bench_function("completion_openai_to_pd_pipeline", |b| {
+        b.iter(|| {
+            let req: CompletionRequest = from_str(black_box(&completion_json)).unwrap();
+            let request_with_bootstrap = RequestWithBootstrap {
+                original: &req,
+                bootstrap_host: hostname.clone(),
+                bootstrap_port,
+                bootstrap_room: generate_room_id(),
+            };
+            let pd_json = to_string(&request_with_bootstrap).unwrap();
+            black_box(pd_json);
+        });
+    });
+
+    group.bench_function("generate_direct_json_pipeline", |b| {
+        b.iter(|| {
+            // Deserialize OpenAI request
+            let req: GenerateRequest = from_str(black_box(&generate_json)).unwrap();
+            // Convert to JSON for direct routing (no bootstrap injection)
+            let routing_json = to_value(&req).unwrap();
+            let json_string = to_string(&routing_json).unwrap();
+            black_box(json_string);
+        });
+    });
+
+    group.finish();
+}
+
+fn benchmark_summary(c: &mut Criterion) {
+    let group = c.benchmark_group("benchmark_summary");
+
+    println!("\nSGLang Router Performance Benchmark Suite");
+    println!("=============================================");
+
+    // Quick performance overview
+    let generate_req = create_sample_generate_request();
+    let worker = create_test_worker();
+
+    println!("\nQuick Performance Overview:");
+
+    // Measure serialization
+    let start = Instant::now();
+    for _ in 0..1000 {
+        let _ = black_box(to_string(&generate_req).unwrap());
+    }
+    let serialize_time = start.elapsed().as_nanos() / 1000;
+    println!("  * Serialization (avg):     {:>8} ns/req", serialize_time);
+
+    // Measure deserialization
+    let json = to_string(&generate_req).unwrap();
+    let start = Instant::now();
+    for _ in 0..1000 {
+        let _: GenerateRequest = black_box(from_str(&json).unwrap());
+    }
+    let deserialize_time = start.elapsed().as_nanos() / 1000;
+    println!(
+        "  * Deserialization (avg):   {:>8} ns/req",
+        deserialize_time
+    );
+
+    // Measure bootstrap injection (replaces adaptation)
+    let (hostname, bootstrap_port) = get_bootstrap_info(&worker);
+    let start = Instant::now();
+    for _ in 0..1000 {
+        let request_with_bootstrap = RequestWithBootstrap {
+            original: &generate_req,
+            bootstrap_host: hostname.clone(),
+            bootstrap_port,
+            bootstrap_room: generate_room_id(),
+        };
+        let _ = black_box(to_value(&request_with_bootstrap).unwrap());
+    }
+    let inject_time = start.elapsed().as_nanos() / 1000;
+    println!("  * Bootstrap Injection (avg): {:>6} ns/req", inject_time);
+
+    // Calculate ratios
+    let total_pipeline = serialize_time + deserialize_time + inject_time;
+    println!("  * Total Pipeline (avg):    {:>8} ns/req", total_pipeline);
+
+    println!("\nPerformance Insights:");
+    if deserialize_time > serialize_time * 2 {
+        println!("  • Deserialization is significantly faster than serialization");
+    }
+    if inject_time < serialize_time / 10 {
+        println!(
+            "  • Bootstrap injection overhead is negligible ({:.1}% of serialization)",
+            (inject_time as f64 / serialize_time as f64) * 100.0
+        );
+    }
+    if total_pipeline < 100_000 {
+        println!("  • Total pipeline latency is excellent (< 100μs)");
+    }
+
+    println!("\nSimplification Benefits:");
+    println!("  • Eliminated complex type conversion layer");
+    println!("  • Reduced memory allocations");
+    println!("  • Automatic field preservation (no manual mapping)");
+    println!("  • Direct JSON manipulation improves performance");
+
+    println!("\nRecommendations:");
+    if serialize_time > deserialize_time {
+        println!("  • Focus optimization efforts on serialization rather than deserialization");
+    }
+    println!("  • PD mode overhead is minimal - safe to use for latency-sensitive workloads");
+    println!("  • Consider batching small requests to improve overall throughput");
+
+    println!("\n{}", "=".repeat(50));
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    benchmark_summary,
+    bench_json_serialization,
+    bench_json_deserialization,
+    bench_bootstrap_injection,
+    bench_direct_json_routing,
+    bench_throughput_by_size,
+    bench_full_round_trip
+);
+criterion_main!(benches);
diff --git a/sgl-router/benches/tokenizer_benchmark.rs b/sgl-router/benches/tokenizer_benchmark.rs
new file mode 100644
index 000000000..a40abcc4e
--- /dev/null
+++ b/sgl-router/benches/tokenizer_benchmark.rs
@@ -0,0 +1,1400 @@
+//! Comprehensive tokenizer benchmark with clean summary output
+//! Each test adds a row to the final summary table
+
+use criterion::{black_box, criterion_group, BenchmarkId, Criterion, Throughput};
+use sglang_router_rs::tokenizer::{
+    huggingface::HuggingFaceTokenizer, sequence::Sequence, stop::*, stream::DecodeStream, traits::*,
+};
+use std::collections::BTreeMap;
+use std::path::PathBuf;
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::sync::{Arc, Mutex, OnceLock};
+use std::thread;
+use std::time::{Duration, Instant};
+
+// Include the common test utilities
+#[path = "../tests/common/mod.rs"]
+mod common;
+use common::ensure_tokenizer_cached;
+
+// Cache the tokenizer path for the entire benchmark run
+static TOKENIZER_PATH: OnceLock<PathBuf> = OnceLock::new();
+
+fn get_tokenizer_path() -> &'static PathBuf {
+    TOKENIZER_PATH.get_or_init(ensure_tokenizer_cached)
+}
+
+// Production target: 100k tokens per second
+const TARGET_TOKENS_PER_SECOND: u64 = 100_000;
+
+// Typical prompt sizes
+const SHORT_PROMPT: &str = "What is the capital of France?";
+const MEDIUM_PROMPT: &str = "Write a detailed explanation of quantum computing, including its principles, current applications, and future potential. Be sure to cover both the theoretical foundations and practical implementations.";
+const LONG_PROMPT: &str = "You are an expert software engineer. Review the following code and provide detailed feedback on performance optimizations, potential bugs, and architectural improvements. Consider scalability, maintainability, and best practices. The code implements a distributed caching system with the following requirements: 1) High availability across multiple regions, 2) Sub-millisecond latency for cache hits, 3) Automatic failover and recovery, 4) Support for both LRU and LFU eviction policies, 5) Real-time monitoring and alerting. Please analyze each component thoroughly and suggest concrete improvements with code examples where appropriate.";
+
+// System prompts can be quite large
+fn generate_system_prompt(size: usize) -> String {
+    let base = "You are a helpful AI assistant with expertise in ";
+    let domains = vec![
+        "mathematics",
+        "physics",
+        "chemistry",
+        "biology",
+        "computer science",
+        "engineering",
+        "medicine",
+        "law",
+        "economics",
+        "philosophy",
+    ];
+
+    let mut prompt = base.to_string();
+    while prompt.len() < size {
+        for domain in &domains {
+            prompt.push_str(domain);
+            prompt.push_str(", ");
+            if prompt.len() >= size {
+                break;
+            }
+        }
+    }
+    prompt
+}
+
+// Global results storage
+lazy_static::lazy_static! {
+    static ref BENCHMARK_RESULTS: Mutex<BTreeMap<String, String>> = Mutex::new(BTreeMap::new());
+}
+
+fn add_result(category: &str, result: String) {
+    let mut results = BENCHMARK_RESULTS.lock().unwrap();
+    let index = results.len();
+    results.insert(format!("{:03}_{}", index, category), result);
+}
+
+fn bench_encode_throughput(c: &mut Criterion) {
+    let tokenizer_path = get_tokenizer_path();
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    // Pre-generate system prompts
+    let system_1k = generate_system_prompt(1000);
+    let system_4k = generate_system_prompt(4000);
+    let system_16k = generate_system_prompt(16000);
+
+    let test_cases = vec![
+        ("short_30B", SHORT_PROMPT),
+        ("medium_230B", MEDIUM_PROMPT),
+        ("long_670B", LONG_PROMPT),
+        ("system_1KB", system_1k.as_str()),
+        ("system_4KB", system_4k.as_str()),
+        ("system_16KB", system_16k.as_str()),
+    ];
+
+    let mut group = c.benchmark_group("encode_throughput");
+
+    for (name, prompt) in test_cases {
+        let prompt_len = prompt.len();
+        let tokenizer_clone = tokenizer.clone();
+
+        // Get token count once
+        let encoding = tokenizer.encode(prompt).unwrap();
+        let token_count = encoding.token_ids().len();
+
+        // Track if metrics have been printed for this test case
+        let printed = Arc::new(AtomicBool::new(false));
+
+        group.throughput(Throughput::Bytes(prompt_len as u64));
+        group.bench_function(name, |b| {
+            let printed_clone = printed.clone();
+            let tokenizer = tokenizer_clone.clone();
+
+            b.iter_custom(|iters| {
+                let start = Instant::now();
+                for _ in 0..iters {
+                    black_box(tokenizer.encode(prompt).unwrap());
+                }
+                let duration = start.elapsed();
+
+                // Store result only once per test case
+                if !printed_clone.load(Ordering::Relaxed) {
+                    let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                    let chars_per_sec = (iters as f64 * prompt_len as f64) / duration.as_secs_f64();
+                    let tokens_per_sec =
+                        (iters as f64 * token_count as f64) / duration.as_secs_f64();
+
+                    let result = format!(
+                        "{:<15} | {:>8} | {:>8} | {:>12.0} | {:>12.0} | {:>10.0} | {:>10}",
+                        name,
+                        prompt_len,
+                        token_count,
+                        chars_per_sec,
+                        tokens_per_sec,
+                        ops_per_sec,
+                        1
+                    );
+                    add_result("encode", result);
+
+                    printed_clone.store(true, Ordering::Relaxed);
+                }
+
+                duration
+            });
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_batch_encode(c: &mut Criterion) {
+    let tokenizer_path = get_tokenizer_path();
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let batch_sizes = vec![1, 8, 16, 32, 64, 128];
+    let prompt = MEDIUM_PROMPT;
+    let prompt_len = prompt.len();
+    let encoding = tokenizer.encode(prompt).unwrap();
+    let token_count = encoding.token_ids().len();
+
+    let mut group = c.benchmark_group("batch_encode");
+
+    for batch_size in batch_sizes {
+        let prompts: Vec<&str> = vec![prompt; batch_size];
+        let printed = Arc::new(AtomicBool::new(false));
+        let tokenizer_clone = tokenizer.clone();
+
+        group.throughput(Throughput::Elements(batch_size as u64));
+        group.bench_with_input(
+            BenchmarkId::from_parameter(batch_size),
+            &batch_size,
+            |b, &size| {
+                let printed_clone = printed.clone();
+                let tokenizer = tokenizer_clone.clone();
+
+                b.iter_custom(|iters| {
+                    let start = Instant::now();
+                    for _ in 0..iters {
+                        black_box(tokenizer.encode_batch(&prompts).unwrap());
+                    }
+                    let duration = start.elapsed();
+
+                    if !printed_clone.load(Ordering::Relaxed) {
+                        let prompts_per_sec = (iters as f64 * size as f64) / duration.as_secs_f64();
+                        let tokens_per_sec = prompts_per_sec * token_count as f64;
+                        let chars_per_sec = prompts_per_sec * prompt_len as f64;
+
+                        let result = format!(
+                            "{:<15} | {:>8} | {:>8} | {:>12.0} | {:>12.0} | {:>10.0} | {:>10}",
+                            format!("batch_{}", size),
+                            prompt_len * size,
+                            token_count * size,
+                            prompts_per_sec,
+                            tokens_per_sec,
+                            chars_per_sec,
+                            1
+                        );
+                        add_result("batch", result);
+
+                        printed_clone.store(true, Ordering::Relaxed);
+                    }
+
+                    duration
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_concurrent_encode(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let client_counts = vec![1, 4, 8, 16, 32];
+
+    let mut group = c.benchmark_group("concurrent_encode");
+    group.measurement_time(Duration::from_secs(2));
+
+    for num_clients in client_counts {
+        let printed = Arc::new(AtomicBool::new(false));
+        let tokenizer_clone = tokenizer.clone();
+
+        group.bench_with_input(
+            BenchmarkId::from_parameter(num_clients),
+            &num_clients,
+            |b, &clients| {
+                let printed_clone = printed.clone();
+
+                b.iter_custom(|_iters| {
+                    let tokenizer = tokenizer_clone.clone();
+                    let total_operations = Arc::new(AtomicU64::new(0));
+                    let total_chars = Arc::new(AtomicU64::new(0));
+                    let start = Instant::now();
+
+                    let handles: Vec<_> = (0..clients)
+                        .map(|client_id| {
+                            let tokenizer = tokenizer.clone();
+                            let total_ops = total_operations.clone();
+                            let total_ch = total_chars.clone();
+
+                            thread::spawn(move || {
+                                let prompts = [SHORT_PROMPT, MEDIUM_PROMPT, LONG_PROMPT];
+                                let prompt = prompts[client_id % prompts.len()];
+                                let mut local_ops = 0u64;
+                                let mut local_chars = 0u64;
+
+                                while start.elapsed() < Duration::from_millis(500) {
+                                    let _ = tokenizer.encode(prompt).unwrap();
+                                    local_ops += 1;
+                                    local_chars += prompt.len() as u64;
+                                }
+
+                                total_ops.fetch_add(local_ops, Ordering::Relaxed);
+                                total_ch.fetch_add(local_chars, Ordering::Relaxed);
+                            })
+                        })
+                        .collect();
+
+                    for handle in handles {
+                        handle.join().unwrap();
+                    }
+
+                    let duration = start.elapsed();
+
+                    if !printed_clone.load(Ordering::Relaxed) {
+                        let total_ops = total_operations.load(Ordering::Relaxed);
+                        let total_ch = total_chars.load(Ordering::Relaxed);
+                        let ops_per_sec = total_ops as f64 / duration.as_secs_f64();
+                        let chars_per_sec = total_ch as f64 / duration.as_secs_f64();
+                        let per_client = ops_per_sec / clients as f64;
+
+                        let result = format!(
+                            "{:<15} | {:>10} | {:>12.0} | {:>12.0} | {:>15.0}",
+                            format!("{}_clients", clients),
+                            total_ops,
+                            ops_per_sec,
+                            chars_per_sec,
+                            per_client
+                        );
+                        add_result("concurrent", result);
+
+                        printed_clone.store(true, Ordering::Relaxed);
+                    }
+
+                    duration
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_decode_performance(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let test_text = "The quick brown fox jumps over the lazy dog. ".repeat(10);
+    let encoding = tokenizer.encode(&test_text).unwrap();
+    let tokens = encoding.token_ids();
+    let num_tokens = tokens.len();
+
+    let mut group = c.benchmark_group("decode_performance");
+
+    // Test direct decode
+    let printed_direct = Arc::new(AtomicBool::new(false));
+    group.bench_function("direct_decode", |b| {
+        let printed = printed_direct.clone();
+        let tokenizer = tokenizer.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                black_box(tokenizer.decode(tokens, false).unwrap());
+            }
+            let duration = start.elapsed();
+
+            if !printed.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let tokens_per_sec = ops_per_sec * num_tokens as f64;
+
+                let result = format!(
+                    "{:<20} | {:>10} | {:>12.0} | {:>12.0} | {:>10}",
+                    "Direct", num_tokens, tokens_per_sec, ops_per_sec, 1
+                );
+                add_result("decode", result);
+
+                printed.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    // Test DecodeStream
+    let printed_stream = Arc::new(AtomicBool::new(false));
+    group.bench_function("decode_stream", |b| {
+        let printed = printed_stream.clone();
+        let tokenizer = tokenizer.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                let mut decoder = DecodeStream::new(tokenizer.clone(), &[], false);
+                let mut output = String::new();
+                for token in tokens {
+                    if let Some(text) = decoder.step(*token).unwrap() {
+                        output.push_str(&text);
+                    }
+                }
+                black_box(output);
+            }
+            let duration = start.elapsed();
+
+            if !printed.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let tokens_per_sec = ops_per_sec * num_tokens as f64;
+
+                let result = format!(
+                    "{:<20} | {:>10} | {:>12.0} | {:>12.0} | {:>10}",
+                    "DecodeStream", num_tokens, tokens_per_sec, ops_per_sec, 1
+                );
+                add_result("decode", result);
+
+                printed.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    // Test Sequence
+    let printed_seq = Arc::new(AtomicBool::new(false));
+    group.bench_function("sequence_decode", |b| {
+        let printed = printed_seq.clone();
+        let tokenizer = tokenizer.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                let mut sequence = Sequence::new(tokenizer.clone());
+                let mut output = String::new();
+                for token in tokens {
+                    let text = sequence.append_token(*token).unwrap();
+                    output.push_str(&text);
+                }
+                black_box(output);
+            }
+            let duration = start.elapsed();
+
+            if !printed.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let tokens_per_sec = ops_per_sec * num_tokens as f64;
+
+                let result = format!(
+                    "{:<20} | {:>10} | {:>12.0} | {:>12.0} | {:>10}",
+                    "Sequence", num_tokens, tokens_per_sec, ops_per_sec, 1
+                );
+                add_result("decode", result);
+
+                printed.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_streaming_decode_100k(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let sample_text = "The quick brown fox jumps over the lazy dog. ".repeat(1000);
+    let encoding = tokenizer.encode(&sample_text).unwrap();
+    let all_tokens = encoding.token_ids();
+
+    let mut group = c.benchmark_group("streaming_100k");
+    group.measurement_time(Duration::from_secs(1));
+
+    // Test DecodeStream
+    let printed_stream = Arc::new(AtomicBool::new(false));
+    group.bench_function("decode_stream_100k", |b| {
+        let printed = printed_stream.clone();
+        let tokenizer = tokenizer.clone();
+
+        b.iter_custom(|_iters| {
+            let start = Instant::now();
+            let mut decoder = DecodeStream::new(tokenizer.clone(), &[], false);
+            let mut output = String::new();
+            let mut tokens_processed = 0u64;
+
+            for token in all_tokens.iter().cycle() {
+                if start.elapsed() >= Duration::from_millis(500) {
+                    break;
+                }
+
+                if let Some(text) = decoder.step(*token).unwrap() {
+                    output.push_str(&text);
+                }
+                tokens_processed += 1;
+            }
+
+            let duration = start.elapsed();
+
+            if !printed.load(Ordering::Relaxed) {
+                let tokens_per_sec = tokens_processed as f64 / duration.as_secs_f64();
+                let status = if tokens_per_sec >= TARGET_TOKENS_PER_SECOND as f64 {
+                    "PASS"
+                } else {
+                    "BELOW"
+                };
+
+                let result = format!(
+                    "{:<20} | {:>12} | {:>12.0} | {:>12} | {:>10} | {:>12}",
+                    "DecodeStream",
+                    tokens_processed,
+                    tokens_per_sec,
+                    TARGET_TOKENS_PER_SECOND,
+                    1,
+                    status
+                );
+                add_result("streaming_100k", result);
+
+                printed.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    // Test Sequence
+    let printed_seq = Arc::new(AtomicBool::new(false));
+    group.bench_function("sequence_100k", |b| {
+        let printed = printed_seq.clone();
+        let tokenizer = tokenizer.clone();
+
+        b.iter_custom(|_iters| {
+            let start = Instant::now();
+            let mut sequence = Sequence::new(tokenizer.clone());
+            let mut output = String::new();
+            let mut tokens_processed = 0u64;
+
+            for token in all_tokens.iter().cycle() {
+                if start.elapsed() >= Duration::from_millis(500) {
+                    break;
+                }
+
+                let text = sequence.append_token(*token).unwrap();
+                output.push_str(&text);
+                tokens_processed += 1;
+            }
+
+            let duration = start.elapsed();
+
+            if !printed.load(Ordering::Relaxed) {
+                let tokens_per_sec = tokens_processed as f64 / duration.as_secs_f64();
+                let status = if tokens_per_sec >= TARGET_TOKENS_PER_SECOND as f64 {
+                    "PASS"
+                } else {
+                    "BELOW"
+                };
+
+                let result = format!(
+                    "{:<20} | {:>12} | {:>12.0} | {:>12} | {:>10} | {:>12}",
+                    "Sequence",
+                    tokens_processed,
+                    tokens_per_sec,
+                    TARGET_TOKENS_PER_SECOND,
+                    1,
+                    status
+                );
+                add_result("streaming_100k", result);
+
+                printed.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_latency_distribution(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    // Test latency for individual token processing
+    let sample_tokens = vec![1, 450, 6635, 3290, 491, 278, 3474, 29892];
+
+    let mut group = c.benchmark_group("latency");
+
+    // Encode latency
+    let system_4k = generate_system_prompt(4000);
+    let test_cases = vec![
+        ("encode_short", SHORT_PROMPT),
+        ("encode_medium", MEDIUM_PROMPT),
+        ("encode_long", LONG_PROMPT),
+        ("encode_4KB", system_4k.as_str()),
+    ];
+
+    for (name, prompt) in test_cases {
+        let printed = Arc::new(AtomicBool::new(false));
+        group.bench_function(name, |b| {
+            let printed_clone = printed.clone();
+            let tokenizer = tokenizer.clone();
+
+            b.iter_custom(|iters| {
+                // Only collect detailed latency on first iteration
+                let total_duration = if !printed_clone.load(Ordering::Relaxed) {
+                    let mut latencies = Vec::new();
+
+                    // Warm up
+                    for _ in 0..100 {
+                        let _ = tokenizer.encode(prompt).unwrap();
+                    }
+
+                    // Measure for statistics
+                    for _ in 0..1000 {
+                        let start = Instant::now();
+                        let _ = tokenizer.encode(prompt).unwrap();
+                        let latency = start.elapsed();
+                        latencies.push(latency);
+                    }
+
+                    latencies.sort();
+                    let p50 = latencies[latencies.len() / 2];
+                    let p95 = latencies[latencies.len() * 95 / 100];
+                    let p99 = latencies[latencies.len() * 99 / 100];
+                    let max = latencies.last().unwrap();
+
+                    let result = format!(
+                        "{:<20} | {:>10.1} | {:>10.1} | {:>10.1} | {:>10.1} | {:>10}",
+                        name,
+                        p50.as_micros() as f64,
+                        p95.as_micros() as f64,
+                        p99.as_micros() as f64,
+                        max.as_micros() as f64,
+                        1000
+                    );
+                    add_result("latency", result);
+
+                    printed_clone.store(true, Ordering::Relaxed);
+
+                    // Return median for consistency
+                    p50 * iters as u32
+                } else {
+                    // Regular benchmark iterations
+                    let start = Instant::now();
+                    for _ in 0..iters {
+                        black_box(tokenizer.encode(prompt).unwrap());
+                    }
+                    start.elapsed()
+                };
+
+                total_duration
+            });
+        });
+    }
+
+    // Decode token latency
+    let printed_decode = Arc::new(AtomicBool::new(false));
+    group.bench_function("decode_token", |b| {
+        let printed_clone = printed_decode.clone();
+        let tokenizer = tokenizer.clone();
+        let tokens = sample_tokens.clone();
+
+        b.iter_custom(|iters| {
+            let total_duration = if !printed_clone.load(Ordering::Relaxed) {
+                let mut latencies = Vec::new();
+                let mut decoder = DecodeStream::new(tokenizer.clone(), &[], false);
+
+                for token in tokens.iter().cycle().take(1000) {
+                    let start = Instant::now();
+                    let _ = decoder.step(*token).unwrap();
+                    let latency = start.elapsed();
+                    latencies.push(latency);
+                }
+
+                latencies.sort();
+                let p50 = latencies[latencies.len() / 2];
+                let p95 = latencies[latencies.len() * 95 / 100];
+                let p99 = latencies[latencies.len() * 99 / 100];
+                let max = latencies.last().unwrap();
+
+                let result = format!(
+                    "{:<20} | {:>10.1} | {:>10.1} | {:>10.1} | {:>10.1} | {:>10}",
+                    "decode_token",
+                    p50.as_micros() as f64,
+                    p95.as_micros() as f64,
+                    p99.as_micros() as f64,
+                    max.as_micros() as f64,
+                    1000
+                );
+                add_result("latency", result);
+
+                // Check target latency
+                let target_latency = Duration::from_micros(10);
+                if p50 > target_latency {
+                    let warning = format!(
+                        "WARNING: P50 latency exceeds target of {:?} for 100k tokens/sec",
+                        target_latency
+                    );
+                    add_result("latency_warning", warning);
+                }
+
+                printed_clone.store(true, Ordering::Relaxed);
+
+                // Return approximate time for consistency
+                p50 * iters as u32
+            } else {
+                // Regular benchmark iterations
+                let start = Instant::now();
+                for _ in 0..iters {
+                    let mut decoder = DecodeStream::new(tokenizer.clone(), &[], false);
+                    for token in tokens.iter().take(10) {
+                        black_box(decoder.step(*token).unwrap());
+                    }
+                }
+                start.elapsed()
+            };
+
+            total_duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_concurrent_streaming(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let num_sequences = 16;
+    let tokens_per_sequence = 10_000;
+
+    let sample_text = "The quick brown fox jumps over the lazy dog. ".repeat(100);
+    let encoding = tokenizer.encode(&sample_text).unwrap();
+    let token_batch: Vec<u32> = encoding.token_ids().to_vec();
+
+    let mut group = c.benchmark_group("concurrent_streaming");
+    group.measurement_time(Duration::from_secs(2));
+
+    let printed = Arc::new(AtomicBool::new(false));
+    group.bench_function("concurrent_16_sequences", |b| {
+        let printed_clone = printed.clone();
+        let tokenizer = tokenizer.clone();
+        let tokens = token_batch.clone();
+
+        b.iter_custom(|_iters| {
+            let total_tokens = Arc::new(AtomicU64::new(0));
+            let start = Instant::now();
+
+            let handles: Vec<_> = (0..num_sequences)
+                .map(|_seq_id| {
+                    let tokenizer = tokenizer.clone();
+                    let tokens = tokens.clone();
+                    let total_tokens = total_tokens.clone();
+
+                    thread::spawn(move || {
+                        let mut decoder = DecodeStream::new(tokenizer, &[], false);
+                        let mut output = String::new();
+                        let mut local_count = 0u64;
+
+                        for token in tokens.iter().cycle().take(tokens_per_sequence) {
+                            if let Some(text) = decoder.step(*token).unwrap() {
+                                output.push_str(&text);
+                            }
+                            local_count += 1;
+                        }
+
+                        total_tokens.fetch_add(local_count, Ordering::Relaxed);
+                    })
+                })
+                .collect();
+
+            for handle in handles {
+                handle.join().unwrap();
+            }
+
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let total = total_tokens.load(Ordering::Relaxed);
+                let throughput = total as f64 / duration.as_secs_f64();
+                let per_seq = throughput / num_sequences as f64;
+
+                let result = format!(
+                    "{:<20} | {:>10} | {:>12.0} | {:>15.0} | {:>15}",
+                    format!("{}_sequences", num_sequences),
+                    total,
+                    throughput,
+                    per_seq,
+                    num_sequences
+                );
+                add_result("concurrent_streaming", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_stop_sequences(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let config = StopSequenceConfig::default()
+        .with_stop_sequence("</s>")
+        .with_stop_sequence("\n\n")
+        .with_stop_sequence("###")
+        .with_stop_token(2);
+
+    let sample_text = "Hello world! This is a test. ### Stop here. Continue after.".repeat(100);
+    let encoding = tokenizer.encode(&sample_text).unwrap();
+    let tokens = encoding.token_ids();
+
+    let mut group = c.benchmark_group("stop_sequences");
+
+    // No stops
+    let printed_no_stop = Arc::new(AtomicBool::new(false));
+    group.bench_function("no_stops", |b| {
+        let printed_clone = printed_no_stop.clone();
+        let tokenizer = tokenizer.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            let mut total_tokens = 0u64;
+
+            for _ in 0..iters {
+                let mut decoder = StopSequenceDecoder::new(
+                    tokenizer.clone(),
+                    StopSequenceConfig::default(),
+                    false,
+                );
+                for token in tokens {
+                    let _ = decoder.process_token(*token).unwrap();
+                    total_tokens += 1;
+                }
+            }
+
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let tokens_per_sec = total_tokens as f64 / duration.as_secs_f64();
+                let seq_per_sec = iters as f64 / duration.as_secs_f64();
+
+                let result = format!(
+                    "{:<20} | {:>10} | {:>12} | {:>12.0} | {:>10.0}",
+                    "No stops", iters, total_tokens, tokens_per_sec, seq_per_sec
+                );
+                add_result("stop_sequences", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    // With stops
+    let printed_with_stops = Arc::new(AtomicBool::new(false));
+    group.bench_function("with_stops", |b| {
+        let printed_clone = printed_with_stops.clone();
+        let tokenizer = tokenizer.clone();
+        let config = config.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            let mut total_tokens = 0u64;
+            let mut total_sequences = 0u64;
+
+            for _ in 0..iters {
+                let mut decoder =
+                    StopSequenceDecoder::new(tokenizer.clone(), config.clone(), false);
+                let mut sequence_tokens = 0u64;
+
+                for token in tokens {
+                    let result = decoder.process_token(*token).unwrap();
+                    sequence_tokens += 1;
+
+                    if matches!(
+                        result,
+                        SequenceDecoderOutput::Stopped | SequenceDecoderOutput::StoppedWithText(_)
+                    ) {
+                        break;
+                    }
+                }
+
+                total_tokens += sequence_tokens;
+                total_sequences += 1;
+            }
+
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let tokens_per_sec = total_tokens as f64 / duration.as_secs_f64();
+                let seq_per_sec = total_sequences as f64 / duration.as_secs_f64();
+
+                let result = format!(
+                    "{:<20} | {:>10} | {:>12} | {:>12.0} | {:>10.0}",
+                    "With stops", total_sequences, total_tokens, tokens_per_sec, seq_per_sec
+                );
+                add_result("stop_sequences", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_multithreaded_encode(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let thread_counts = vec![1, 2, 4, 8, 16];
+    let operations_per_thread = 1000;
+
+    // Test with medium-sized prompt for balanced workload
+    let test_prompt = MEDIUM_PROMPT;
+
+    let mut group = c.benchmark_group("multithreaded_encode");
+    group.measurement_time(Duration::from_secs(2));
+
+    let mut baseline_throughput = 0.0;
+
+    for num_threads in thread_counts {
+        let printed = Arc::new(AtomicBool::new(false));
+        let tokenizer_clone = tokenizer.clone();
+
+        group.bench_with_input(
+            BenchmarkId::from_parameter(num_threads),
+            &num_threads,
+            |b, &threads| {
+                let printed_clone = printed.clone();
+                let tokenizer = tokenizer_clone.clone();
+
+                b.iter_custom(|_iters| {
+                    let total_operations = Arc::new(AtomicU64::new(0));
+                    let total_tokens = Arc::new(AtomicU64::new(0));
+                    let start = Instant::now();
+
+                    let handles: Vec<_> = (0..threads)
+                        .map(|_| {
+                            let tokenizer = tokenizer.clone();
+                            let total_ops = total_operations.clone();
+                            let total_tok = total_tokens.clone();
+
+                            thread::spawn(move || {
+                                for _ in 0..operations_per_thread {
+                                    let encoding = tokenizer.encode(test_prompt).unwrap();
+                                    total_tok.fetch_add(
+                                        encoding.token_ids().len() as u64,
+                                        Ordering::Relaxed,
+                                    );
+                                }
+                                total_ops
+                                    .fetch_add(operations_per_thread as u64, Ordering::Relaxed);
+                            })
+                        })
+                        .collect();
+
+                    for handle in handles {
+                        handle.join().unwrap();
+                    }
+
+                    let duration = start.elapsed();
+
+                    if !printed_clone.load(Ordering::Relaxed) {
+                        let total_ops = total_operations.load(Ordering::Relaxed);
+                        let total_tok = total_tokens.load(Ordering::Relaxed);
+                        let ops_per_sec = total_ops as f64 / duration.as_secs_f64();
+                        let tokens_per_sec = total_tok as f64 / duration.as_secs_f64();
+
+                        if threads == 1 {
+                            baseline_throughput = tokens_per_sec;
+                        }
+
+                        let efficiency = if threads == 1 {
+                            100.0
+                        } else {
+                            (tokens_per_sec / (baseline_throughput * threads as f64)) * 100.0
+                        };
+
+                        let result = format!(
+                            "{:<20} | {:>10} | {:>12.0} | {:>12.0} | {:>10} | {:>11.1}%",
+                            format!("encode_{}_threads", threads),
+                            total_ops,
+                            ops_per_sec,
+                            tokens_per_sec,
+                            threads,
+                            efficiency
+                        );
+                        add_result("mt_encode", result);
+
+                        printed_clone.store(true, Ordering::Relaxed);
+                    }
+
+                    duration
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_multithreaded_decode(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let thread_counts = vec![1, 2, 4, 8, 16];
+    let tokens_per_thread = 5000;
+
+    // Generate tokens for decoding
+    let test_text = "The quick brown fox jumps over the lazy dog. ".repeat(100);
+    let encoding = tokenizer.encode(&test_text).unwrap();
+    let test_tokens: Vec<u32> = encoding.token_ids().to_vec();
+
+    let mut group = c.benchmark_group("multithreaded_decode");
+    group.measurement_time(Duration::from_secs(2));
+
+    let mut baseline_throughput = 0.0;
+
+    for num_threads in thread_counts {
+        let printed = Arc::new(AtomicBool::new(false));
+        let tokenizer_clone = tokenizer.clone();
+        let tokens = test_tokens.clone();
+
+        group.bench_with_input(
+            BenchmarkId::from_parameter(num_threads),
+            &num_threads,
+            |b, &threads| {
+                let printed_clone = printed.clone();
+                let tokenizer = tokenizer_clone.clone();
+                let tokens = tokens.clone();
+
+                b.iter_custom(|_iters| {
+                    let total_tokens = Arc::new(AtomicU64::new(0));
+                    let start = Instant::now();
+
+                    let handles: Vec<_> = (0..threads)
+                        .map(|_| {
+                            let tokenizer = tokenizer.clone();
+                            let tokens = tokens.clone();
+                            let total_tok = total_tokens.clone();
+
+                            thread::spawn(move || {
+                                let mut decoder = DecodeStream::new(tokenizer, &[], false);
+                                let mut output = String::new();
+                                let mut local_tokens = 0u64;
+
+                                for token in tokens.iter().cycle().take(tokens_per_thread) {
+                                    if let Some(text) = decoder.step(*token).unwrap() {
+                                        output.push_str(&text);
+                                    }
+                                    local_tokens += 1;
+                                }
+
+                                total_tok.fetch_add(local_tokens, Ordering::Relaxed);
+                            })
+                        })
+                        .collect();
+
+                    for handle in handles {
+                        handle.join().unwrap();
+                    }
+
+                    let duration = start.elapsed();
+
+                    if !printed_clone.load(Ordering::Relaxed) {
+                        let total = total_tokens.load(Ordering::Relaxed);
+                        let tokens_per_sec = total as f64 / duration.as_secs_f64();
+
+                        if threads == 1 {
+                            baseline_throughput = tokens_per_sec;
+                        }
+
+                        let efficiency = if threads == 1 {
+                            100.0
+                        } else {
+                            (tokens_per_sec / (baseline_throughput * threads as f64)) * 100.0
+                        };
+
+                        let result = format!(
+                            "{:<20} | {:>12} | {:>12.0} | {:>10} | {:>11.1}%",
+                            format!("decode_{}_threads", threads),
+                            total,
+                            tokens_per_sec,
+                            threads,
+                            efficiency
+                        );
+                        add_result("mt_decode", result);
+
+                        printed_clone.store(true, Ordering::Relaxed);
+                    }
+
+                    duration
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_memory_efficiency(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let large_text = "The quick brown fox jumps over the lazy dog. ".repeat(1000);
+    let encoding = tokenizer.encode(&large_text).unwrap();
+
+    let mut group = c.benchmark_group("memory");
+
+    // Track owned baseline time
+    let mut owned_time_ns = 0.0;
+
+    // Owned
+    let printed_owned = Arc::new(AtomicBool::new(false));
+    group.bench_function("token_ids_owned", |b| {
+        let printed_clone = printed_owned.clone();
+        let encoding = encoding.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                let _ = black_box(encoding.token_ids());
+            }
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let time_per_call = duration.as_nanos() as f64 / iters as f64;
+                owned_time_ns = time_per_call;
+
+                let result = format!(
+                    "{:<20} | {:>12.0} | {:>11.0}ns | {:>12}",
+                    "token_ids(owned)", ops_per_sec, time_per_call, "baseline"
+                );
+                add_result("memory", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    // Reference
+    let printed_ref = Arc::new(AtomicBool::new(false));
+
+    group.bench_function("token_ids_ref", |b| {
+        let printed_clone = printed_ref.clone();
+        let encoding = encoding.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                let _ = black_box(encoding.token_ids());
+            }
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let time_per_call = duration.as_nanos() as f64 / iters as f64;
+
+                // Calculate improvement
+                let improvement = if owned_time_ns > 0.0 {
+                    format!("{:.1}x faster", owned_time_ns / time_per_call)
+                } else {
+                    "N/A".to_string()
+                };
+
+                let result = format!(
+                    "{:<20} | {:>12.0} | {:>11.0}ns | {:>12}",
+                    "token_ids_ref(ref)", ops_per_sec, time_per_call, improvement
+                );
+                add_result("memory", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_scaling_characteristics(c: &mut Criterion) {
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(get_tokenizer_path().to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let thread_counts = vec![1, 2, 4, 8, 16];
+    let tokens_per_thread = 10000;
+
+    let mut group = c.benchmark_group("scaling");
+    group.measurement_time(Duration::from_secs(2));
+
+    let mut baseline_throughput = 0.0;
+
+    for num_threads in thread_counts {
+        let printed = Arc::new(AtomicBool::new(false));
+
+        group.bench_with_input(
+            BenchmarkId::from_parameter(num_threads),
+            &num_threads,
+            |b, &threads| {
+                let printed_clone = printed.clone();
+                let tokenizer = tokenizer.clone();
+
+                b.iter_custom(|_iters| {
+                    let total_tokens = Arc::new(AtomicU64::new(0));
+                    let start = Instant::now();
+
+                    let handles: Vec<_> = (0..threads)
+                        .map(|_| {
+                            let tokenizer = tokenizer.clone();
+                            let total_tokens = total_tokens.clone();
+
+                            thread::spawn(move || {
+                                let mut decoder = DecodeStream::new(tokenizer, &[], false);
+                                let sample_tokens = [1, 450, 6635, 3290, 491];
+
+                                for token in sample_tokens.iter().cycle().take(tokens_per_thread) {
+                                    let _ = decoder.step(*token).unwrap();
+                                }
+
+                                total_tokens.fetch_add(tokens_per_thread as u64, Ordering::Relaxed);
+                            })
+                        })
+                        .collect();
+
+                    for handle in handles {
+                        handle.join().unwrap();
+                    }
+
+                    let duration = start.elapsed();
+
+                    if !printed_clone.load(Ordering::Relaxed) {
+                        let total = total_tokens.load(Ordering::Relaxed);
+                        let throughput = total as f64 / duration.as_secs_f64();
+
+                        if threads == 1 {
+                            baseline_throughput = throughput;
+                        }
+
+                        let efficiency = if threads == 1 {
+                            100.0
+                        } else {
+                            (throughput / (baseline_throughput * threads as f64)) * 100.0
+                        };
+
+                        let result = format!(
+                            "{:<15} | {:>12} | {:>12.0} | {:>11.1}%",
+                            format!("{}_threads", threads),
+                            total,
+                            throughput,
+                            efficiency
+                        );
+                        add_result("scaling", result);
+
+                        printed_clone.store(true, Ordering::Relaxed);
+                    }
+
+                    duration
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+// Print final summary table
+fn print_summary() {
+    println!("\n{}", "=".repeat(120));
+    println!("TOKENIZER BENCHMARK SUMMARY");
+    println!("{}", "=".repeat(120));
+
+    let results = BENCHMARK_RESULTS.lock().unwrap();
+
+    let mut current_category = String::new();
+    for (key, value) in results.iter() {
+        let category = key.split('_').skip(1).collect::<Vec<_>>().join("_");
+
+        if category != current_category {
+            current_category = category.clone();
+
+            // Print section header based on category
+            println!("\n{}", "-".repeat(120));
+            match category.as_str() {
+                "encode" => {
+                    println!("ENCODING THROUGHPUT");
+                    println!(
+                        "{:<15} | {:>8} | {:>8} | {:>12} | {:>12} | {:>10} | {:>10}",
+                        "Test Case",
+                        "Size(B)",
+                        "Tokens",
+                        "Chars/sec",
+                        "Tokens/sec",
+                        "Ops/sec",
+                        "Thread"
+                    );
+                }
+                "batch" => {
+                    println!("BATCH ENCODING");
+                    println!(
+                        "{:<15} | {:>8} | {:>8} | {:>12} | {:>12} | {:>10} | {:>10}",
+                        "Batch Size",
+                        "Size(B)",
+                        "Tokens",
+                        "Prompts/sec",
+                        "Tokens/sec",
+                        "Chars/sec",
+                        "Thread"
+                    );
+                }
+                "concurrent" => {
+                    println!("CONCURRENT ENCODING");
+                    println!(
+                        "{:<15} | {:>10} | {:>12} | {:>12} | {:>15}",
+                        "Clients", "Total Ops", "Ops/sec", "Chars/sec", "Per-Client/sec"
+                    );
+                }
+                "mt_encode" => {
+                    println!("MULTI-THREADED ENCODING");
+                    println!(
+                        "{:<20} | {:>10} | {:>12} | {:>12} | {:>10} | {:>12}",
+                        "Configuration",
+                        "Total Ops",
+                        "Ops/sec",
+                        "Tokens/sec",
+                        "Threads",
+                        "Efficiency"
+                    );
+                }
+                "decode" => {
+                    println!("DECODE PERFORMANCE");
+                    println!(
+                        "{:<20} | {:>10} | {:>12} | {:>12} | {:>10}",
+                        "Method", "Tokens", "Tokens/sec", "Ops/sec", "Thread"
+                    );
+                }
+                "mt_decode" => {
+                    println!("MULTI-THREADED DECODING");
+                    println!(
+                        "{:<20} | {:>12} | {:>12} | {:>10} | {:>12}",
+                        "Configuration", "Total Tokens", "Tokens/sec", "Threads", "Efficiency"
+                    );
+                }
+                "streaming_100k" => {
+                    println!("STREAMING DECODE (100K Target)");
+                    println!(
+                        "{:<20} | {:>12} | {:>12} | {:>12} | {:>10} | {:>12}",
+                        "Method", "Tokens", "Tokens/sec", "Target", "Thread", "Status"
+                    );
+                }
+                "concurrent_streaming" => {
+                    println!("CONCURRENT STREAMING");
+                    println!(
+                        "{:<20} | {:>10} | {:>12} | {:>15} | {:>15}",
+                        "Sequences", "Total", "Aggregate/sec", "Per-Seq/sec", "Threads"
+                    );
+                }
+                "stop_sequences" => {
+                    println!("STOP SEQUENCE PERFORMANCE");
+                    println!(
+                        "{:<20} | {:>10} | {:>12} | {:>12} | {:>10}",
+                        "Config", "Sequences", "Tokens", "Tokens/sec", "Seq/sec"
+                    );
+                }
+                "latency" => {
+                    println!("LATENCY DISTRIBUTION");
+                    println!(
+                        "{:<20} | {:>10} | {:>10} | {:>10} | {:>10} | {:>10}",
+                        "Operation", "P50(µs)", "P95(µs)", "P99(µs)", "Max(µs)", "Samples"
+                    );
+                }
+                "scaling" => {
+                    println!("SCALING CHARACTERISTICS");
+                    println!(
+                        "{:<15} | {:>12} | {:>12} | {:>12}",
+                        "Threads", "Total Tokens", "Tokens/sec", "Efficiency"
+                    );
+                }
+                "memory" => {
+                    println!("MEMORY EFFICIENCY");
+                    println!(
+                        "{:<20} | {:>12} | {:>12} | {:>12}",
+                        "Operation", "Calls/sec", "Time/call", "Improvement"
+                    );
+                }
+                _ => {}
+            }
+            println!("{}", "-".repeat(120));
+        }
+
+        println!("{}", value);
+    }
+
+    println!("\n{}", "=".repeat(120));
+}
+
+fn run_benchmarks(c: &mut Criterion) {
+    bench_encode_throughput(c);
+    bench_batch_encode(c);
+    bench_concurrent_encode(c);
+    bench_multithreaded_encode(c);
+    bench_decode_performance(c);
+    bench_multithreaded_decode(c);
+    bench_streaming_decode_100k(c);
+    bench_concurrent_streaming(c);
+    bench_stop_sequences(c);
+    bench_latency_distribution(c);
+    bench_scaling_characteristics(c);
+    bench_memory_efficiency(c);
+
+    // Print summary at the end
+    print_summary();
+}
+
+criterion_group!(benches, run_benchmarks);
+criterion::criterion_main!(benches);
diff --git a/sgl-router/benches/tool_parser_benchmark.rs b/sgl-router/benches/tool_parser_benchmark.rs
new file mode 100644
index 000000000..35bd7f820
--- /dev/null
+++ b/sgl-router/benches/tool_parser_benchmark.rs
@@ -0,0 +1,848 @@
+//! Comprehensive tool parser benchmark for measuring performance under various scenarios
+//!
+//! This benchmark tests:
+//! - Single parser parsing performance
+//! - Registry creation overhead
+//! - Concurrent parsing with shared parsers
+//! - Streaming vs complete parsing
+//! - Different model formats (JSON, Mistral, Qwen, Pythonic, etc.)
+
+use criterion::{black_box, criterion_group, BenchmarkId, Criterion, Throughput};
+use sglang_router_rs::tool_parser::{
+    registry::ParserRegistry, state::ParseState, types::StreamResult,
+};
+use std::collections::BTreeMap;
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread;
+use std::time::{Duration, Instant};
+use tokio::runtime::Runtime;
+
+// Test data for different parser formats - realistic complex examples
+const JSON_SIMPLE: &str = r#"{"name": "code_interpreter", "arguments": "{\"language\": \"python\", \"code\": \"import numpy as np\\nimport matplotlib.pyplot as plt\\n\\n# Generate sample data\\nx = np.linspace(0, 10, 100)\\ny = np.sin(x) * np.exp(-x/10)\\n\\n# Create the plot\\nplt.figure(figsize=(10, 6))\\nplt.plot(x, y, 'b-', linewidth=2)\\nplt.grid(True)\\nplt.xlabel('Time (s)')\\nplt.ylabel('Amplitude')\\nplt.title('Damped Oscillation')\\nplt.show()\"}"}"#;
+
+const JSON_ARRAY: &str = r#"[{"name": "web_search", "arguments": "{\"query\": \"latest developments in quantum computing 2024\", \"num_results\": 10, \"search_type\": \"news\", \"date_range\": \"2024-01-01:2024-12-31\", \"exclude_domains\": [\"reddit.com\", \"facebook.com\"], \"language\": \"en\"}"}, {"name": "analyze_sentiment", "arguments": "{\"text\": \"The breakthrough in quantum error correction represents a significant milestone. Researchers are optimistic about practical applications within the next decade.\", \"granularity\": \"sentence\", \"aspects\": [\"technology\", \"timeline\", \"impact\"], \"confidence_threshold\": 0.85}"}, {"name": "create_summary", "arguments": "{\"content_ids\": [\"doc_1234\", \"doc_5678\", \"doc_9012\"], \"max_length\": 500, \"style\": \"technical\", \"include_citations\": true}"}]"#;
+
+const JSON_WITH_PARAMS: &str = r#"{"name": "database_query", "parameters": {"connection_string": "postgresql://user:pass@localhost:5432/analytics", "query": "SELECT customer_id, COUNT(*) as order_count, SUM(total_amount) as lifetime_value, AVG(order_amount) as avg_order_value FROM orders WHERE created_at >= '2024-01-01' GROUP BY customer_id HAVING COUNT(*) > 5 ORDER BY lifetime_value DESC LIMIT 100", "timeout_ms": 30000, "read_consistency": "strong", "partition_key": "customer_id"}}"#;
+
+const MISTRAL_FORMAT: &str = r#"I'll help you analyze the sales data and create visualizations. Let me start by querying the database and then create some charts.
+
+[TOOL_CALLS] [{"name": "sql_query", "arguments": {"database": "sales_analytics", "query": "WITH monthly_sales AS (SELECT DATE_TRUNC('month', order_date) as month, SUM(total_amount) as revenue, COUNT(DISTINCT customer_id) as unique_customers, COUNT(*) as total_orders FROM orders WHERE order_date >= CURRENT_DATE - INTERVAL '12 months' GROUP BY DATE_TRUNC('month', order_date)) SELECT month, revenue, unique_customers, total_orders, LAG(revenue) OVER (ORDER BY month) as prev_month_revenue, (revenue - LAG(revenue) OVER (ORDER BY month)) / LAG(revenue) OVER (ORDER BY month) * 100 as growth_rate FROM monthly_sales ORDER BY month DESC", "format": "json", "timeout": 60000}}]
+
+Based on the query results, I can see interesting trends in your sales data."#;
+
+const MISTRAL_MULTI: &str = r#"Let me help you with a comprehensive analysis of your application's performance.
+
+[TOOL_CALLS] [{"name": "get_metrics", "arguments": {"service": "api-gateway", "metrics": ["latency_p50", "latency_p95", "latency_p99", "error_rate", "requests_per_second"], "start_time": "2024-01-01T00:00:00Z", "end_time": "2024-01-01T23:59:59Z", "aggregation": "5m", "filters": {"environment": "production", "region": "us-east-1"}}}, {"name": "analyze_logs", "arguments": {"log_group": "/aws/lambda/process-orders", "query": "fields @timestamp, @message, @requestId, duration | filter @message like /ERROR/ | stats count() by bin(@timestamp, 5m) as time_window", "start_time": "2024-01-01T00:00:00Z", "end_time": "2024-01-01T23:59:59Z", "limit": 1000}}, {"name": "get_traces", "arguments": {"service": "order-processing", "operation": "ProcessOrder", "min_duration_ms": 1000, "max_results": 100, "include_downstream": true}}]
+
+Now let me create a comprehensive report based on this data."#;
+
+const QWEN_FORMAT: &str = r#"Let me search for information about machine learning frameworks and their performance benchmarks.
+
+<tool_call>
+{"name": "academic_search", "arguments": {"query": "transformer architecture optimization techniques GPU inference latency reduction", "databases": ["arxiv", "ieee", "acm"], "year_range": [2020, 2024], "citation_count_min": 10, "include_code": true, "page_size": 25, "sort_by": "relevance"}}
+</tool_call>
+
+I found several interesting papers on optimization techniques."#;
+
+const QWEN_MULTI: &str = r#"I'll help you set up a complete data pipeline for your analytics system.
+
+<tool_call>
+{"name": "create_data_pipeline", "arguments": {"name": "customer_analytics_etl", "source": {"type": "kafka", "config": {"bootstrap_servers": "kafka1:9092,kafka2:9092", "topic": "customer_events", "consumer_group": "analytics_consumer", "auto_offset_reset": "earliest"}}, "transformations": [{"type": "filter", "condition": "event_type IN ('purchase', 'signup', 'churn')"}, {"type": "aggregate", "window": "1h", "group_by": ["customer_id", "event_type"], "metrics": ["count", "sum(amount)"]}], "destination": {"type": "bigquery", "dataset": "analytics", "table": "customer_metrics", "write_mode": "append"}}}
+</tool_call>
+<tool_call>
+{"name": "schedule_job", "arguments": {"job_id": "customer_analytics_etl", "schedule": "0 */4 * * *", "timezone": "UTC", "retry_policy": {"max_attempts": 3, "backoff_multiplier": 2, "max_backoff": 3600}, "notifications": {"on_failure": ["ops-team@company.com"], "on_success": null}, "monitoring": {"sla_minutes": 30, "alert_threshold": 0.95}}}
+</tool_call>
+<tool_call>
+{"name": "create_dashboard", "arguments": {"title": "Customer Analytics Dashboard", "widgets": [{"type": "time_series", "title": "Customer Acquisition", "query": "SELECT DATE(timestamp) as date, COUNT(DISTINCT customer_id) as new_customers FROM analytics.customer_metrics WHERE event_type = 'signup' GROUP BY date ORDER BY date", "visualization": "line"}, {"type": "metric", "title": "Total Revenue", "query": "SELECT SUM(amount) as total FROM analytics.customer_metrics WHERE event_type = 'purchase' AND DATE(timestamp) = CURRENT_DATE()", "format": "currency"}, {"type": "table", "title": "Top Customers", "query": "SELECT customer_id, COUNT(*) as purchases, SUM(amount) as total_spent FROM analytics.customer_metrics WHERE event_type = 'purchase' GROUP BY customer_id ORDER BY total_spent DESC LIMIT 10"}], "refresh_interval": 300}}
+</tool_call>
+
+The data pipeline has been configured and the dashboard is ready."#;
+
+const LLAMA_FORMAT: &str = r#"<|python_tag|>{"name": "execute_code", "arguments": "{\"code\": \"import pandas as pd\\nimport numpy as np\\nfrom sklearn.model_selection import train_test_split\\nfrom sklearn.ensemble import RandomForestClassifier\\nfrom sklearn.metrics import classification_report, confusion_matrix\\nimport joblib\\n\\n# Load and preprocess data\\ndf = pd.read_csv('/data/customer_churn.csv')\\nprint(f'Dataset shape: {df.shape}')\\nprint(f'Missing values: {df.isnull().sum().sum()}')\\n\\n# Feature engineering\\ndf['tenure_months'] = pd.to_datetime('today') - pd.to_datetime(df['signup_date'])\\ndf['tenure_months'] = df['tenure_months'].dt.days // 30\\ndf['avg_monthly_spend'] = df['total_spend'] / df['tenure_months'].clip(lower=1)\\n\\n# Prepare features and target\\nfeature_cols = ['tenure_months', 'avg_monthly_spend', 'support_tickets', 'product_usage_hours', 'feature_adoption_score']\\nX = df[feature_cols]\\ny = df['churned']\\n\\n# Split and train\\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\\nrf_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, random_state=42)\\nrf_model.fit(X_train, y_train)\\n\\n# Evaluate\\ny_pred = rf_model.predict(X_test)\\nprint('Classification Report:')\\nprint(classification_report(y_test, y_pred))\\n\\n# Save model\\njoblib.dump(rf_model, '/models/churn_predictor_v1.pkl')\\nprint('Model saved successfully!')\"}"}"#;
+
+const PYTHONIC_FORMAT: &str = r#"[retrieve_context(query="How do transformer models handle long-range dependencies in natural language processing tasks?", index="ml_knowledge_base", top_k=5, similarity_threshold=0.75, rerank=True, include_metadata=True, filters={"category": "deep_learning", "year": {"$gte": 2020}})]"#;
+
+const PYTHONIC_MULTI: &str = r#"[fetch_api_data(endpoint="https://api.weather.com/v1/forecast", params={"lat": 37.7749, "lon": -122.4194, "units": "metric", "days": 7, "hourly": True}, headers={"API-Key": "${WEATHER_API_KEY}"}, timeout=30, retry_count=3), process_weather_data(data="${response}", extract_fields=["temperature", "humidity", "precipitation", "wind_speed", "uv_index"], aggregation="daily", calculate_trends=True), generate_report(data="${processed_data}", template="weather_forecast", format="html", include_charts=True, language="en")]"#;
+
+const DEEPSEEK_FORMAT: &str = r#"I'll analyze your codebase and identify potential security vulnerabilities.
+
+🤔[{"name": "scan_repository", "arguments": {"repo_path": "/src/application", "scan_types": ["security", "dependencies", "secrets", "code_quality"], "file_patterns": ["*.py", "*.js", "*.java", "*.go"], "exclude_dirs": ["node_modules", ".git", "vendor", "build"], "vulnerability_databases": ["cve", "nvd", "ghsa"], "min_severity": "medium", "check_dependencies": true, "deep_scan": true, "parallel_workers": 8}}]
+
+Let me examine the scan results and provide recommendations."#;
+
+const KIMIK2_FORMAT: &str = r#"⍼validate_and_deploy⍁{"deployment_config": {"application": "payment-service", "version": "2.3.1", "environment": "staging", "region": "us-west-2", "deployment_strategy": "blue_green", "health_check": {"endpoint": "/health", "interval": 30, "timeout": 5, "healthy_threshold": 2, "unhealthy_threshold": 3}, "rollback_on_failure": true, "canary_config": {"percentage": 10, "duration_minutes": 30, "metrics": ["error_rate", "latency_p99", "success_rate"], "thresholds": {"error_rate": 0.01, "latency_p99": 500, "success_rate": 0.99}}, "pre_deployment_hooks": ["run_tests", "security_scan", "backup_database"], "post_deployment_hooks": ["smoke_tests", "notify_team", "update_documentation"]}}"#;
+
+const GLM4_FORMAT: &str = r#"<tool>
+analyze_customer_behavior
+<parameter>dataset_id=customer_interactions_2024</parameter>
+<parameter>analysis_type=cohort_retention</parameter>
+<parameter>cohort_definition=signup_month</parameter>
+<parameter>retention_periods=[1, 7, 14, 30, 60, 90, 180, 365]</parameter>
+<parameter>segment_by=["acquisition_channel", "pricing_tier", "industry", "company_size"]</parameter>
+<parameter>metrics=["active_users", "revenue", "feature_usage", "engagement_score"]</parameter>
+<parameter>statistical_tests=["chi_square", "anova", "trend_analysis"]</parameter>
+<parameter>visualization_types=["heatmap", "line_chart", "funnel", "sankey"]</parameter>
+<parameter>export_format=dashboard</parameter>
+<parameter>confidence_level=0.95</parameter>
+</tool>"#;
+
+const STEP3_FORMAT: &str = r#"<step.tML version="0.1">
+<call>
+<name>orchestrate_ml_pipeline</name>
+<parameters>
+<parameter name="pipeline_name">fraud_detection_model_v3</parameter>
+<parameter name="data_source">s3://ml-datasets/transactions/2024/</parameter>
+<parameter name="preprocessing_steps">
+  <step order="1" type="clean">{"remove_duplicates": true, "handle_missing": "interpolate", "outlier_method": "isolation_forest"}</step>
+  <step order="2" type="feature_engineering">{"create_ratios": true, "time_features": ["hour", "day_of_week", "month"], "aggregations": ["mean", "std", "max"]}</step>
+  <step order="3" type="normalize">{"method": "robust_scaler", "clip_outliers": true}</step>
+</parameter>
+<parameter name="model_config">{"algorithm": "xgboost", "hyperparameters": {"n_estimators": 500, "max_depth": 8, "learning_rate": 0.01, "subsample": 0.8}, "cross_validation": {"method": "stratified_kfold", "n_splits": 5}}</parameter>
+<parameter name="evaluation_metrics">["auc_roc", "precision_recall", "f1", "confusion_matrix"]</parameter>
+<parameter name="deployment_target">sagemaker_endpoint</parameter>
+<parameter name="monitoring_config">{"drift_detection": true, "performance_threshold": 0.92, "alert_emails": ["ml-team@company.com"]}</parameter>
+</parameters>
+</call>
+</step.tML>"#;
+
+const GPT_OSS_FORMAT: &str = r#"<Channel.vector_search>{"collection": "technical_documentation", "query_embedding": [0.0234, -0.1456, 0.0891, 0.2341, -0.0567, 0.1234, 0.0456, -0.0789, 0.1567, 0.0234, -0.1123, 0.0678, 0.2345, -0.0456, 0.0891, 0.1234, -0.0567, 0.0789, 0.1456, -0.0234, 0.0891, 0.1567, -0.0678, 0.0345, 0.1234, -0.0456, 0.0789, 0.1891, -0.0234, 0.0567, 0.1345, -0.0891], "top_k": 10, "similarity_metric": "cosine", "filters": {"language": "en", "last_updated": {"$gte": "2023-01-01"}, "categories": {"$in": ["api", "sdk", "integration"]}}, "include_metadata": true, "rerank_with_cross_encoder": true}</Channel.vector_search>"#;
+
+// Large test data for stress testing
+fn generate_large_json(num_tools: usize) -> String {
+    let mut tools = Vec::new();
+    for i in 0..num_tools {
+        tools.push(format!(
+            r#"{{"name": "tool_{}", "arguments": {{"param1": "value{}", "param2": {}, "param3": true}}}}"#,
+            i, i, i
+        ));
+    }
+    format!("[{}]", tools.join(", "))
+}
+
+// Global results storage
+lazy_static::lazy_static! {
+    static ref BENCHMARK_RESULTS: Mutex<BTreeMap<String, String>> = Mutex::new(BTreeMap::new());
+}
+
+fn add_result(category: &str, result: String) {
+    let mut results = BENCHMARK_RESULTS.lock().unwrap();
+    let index = results.len();
+    results.insert(format!("{:03}_{}", index, category), result);
+}
+
+fn bench_registry_creation(c: &mut Criterion) {
+    let mut group = c.benchmark_group("registry_creation");
+
+    let printed = Arc::new(AtomicBool::new(false));
+    group.bench_function("new_registry", |b| {
+        let printed_clone = printed.clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                let registry = black_box(ParserRegistry::new());
+                // Force evaluation to prevent optimization
+                black_box(registry.list_parsers());
+            }
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let time_per_op = duration.as_micros() as f64 / iters as f64;
+
+                let result = format!(
+                    "{:<25} | {:>12.0} | {:>12.1}µs | {:>15}",
+                    "Registry Creation", ops_per_sec, time_per_op, "N/A"
+                );
+                add_result("registry", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_parser_lookup(c: &mut Criterion) {
+    let registry = Arc::new(ParserRegistry::new());
+    let models = vec![
+        "gpt-4",
+        "mistral-large",
+        "qwen-72b",
+        "llama-3.2",
+        "deepseek-v3",
+        "unknown-model",
+    ];
+
+    let mut group = c.benchmark_group("parser_lookup");
+
+    for model in models {
+        let printed = Arc::new(AtomicBool::new(false));
+        let registry_clone = registry.clone();
+
+        group.bench_function(model, |b| {
+            let printed_clone = printed.clone();
+            let registry = registry_clone.clone();
+
+            b.iter_custom(|iters| {
+                let start = Instant::now();
+                for _ in 0..iters {
+                    let parser = black_box(registry.get_parser(model));
+                    // Force evaluation
+                    black_box(parser.is_some());
+                }
+                let duration = start.elapsed();
+
+                if !printed_clone.load(Ordering::Relaxed) {
+                    let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                    let time_per_op = duration.as_nanos() as f64 / iters as f64;
+
+                    let result = format!(
+                        "{:<25} | {:>12.0} | {:>12.1}ns | {:>15}",
+                        format!("Lookup {}", model),
+                        ops_per_sec,
+                        time_per_op,
+                        if registry.get_parser(model).is_some() {
+                            "Found"
+                        } else {
+                            "Fallback"
+                        }
+                    );
+                    add_result("lookup", result);
+
+                    printed_clone.store(true, Ordering::Relaxed);
+                }
+
+                duration
+            });
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_complete_parsing(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+    let registry = Arc::new(ParserRegistry::new());
+
+    let test_cases = vec![
+        ("json_simple", "json", JSON_SIMPLE),
+        ("json_array", "json", JSON_ARRAY),
+        ("json_params", "json", JSON_WITH_PARAMS),
+        ("mistral_single", "mistral", MISTRAL_FORMAT),
+        ("mistral_multi", "mistral", MISTRAL_MULTI),
+        ("qwen_single", "qwen", QWEN_FORMAT),
+        ("qwen_multi", "qwen", QWEN_MULTI),
+        ("llama", "llama", LLAMA_FORMAT),
+        ("pythonic_single", "pythonic", PYTHONIC_FORMAT),
+        ("pythonic_multi", "pythonic", PYTHONIC_MULTI),
+        ("deepseek", "deepseek", DEEPSEEK_FORMAT),
+        ("kimik2", "kimik2", KIMIK2_FORMAT),
+        ("glm4", "glm4_moe", GLM4_FORMAT),
+        ("step3", "step3", STEP3_FORMAT),
+        ("gpt_oss", "gpt_oss", GPT_OSS_FORMAT),
+    ];
+
+    let mut group = c.benchmark_group("complete_parsing");
+
+    for (name, parser_name, input) in test_cases {
+        let printed = Arc::new(AtomicBool::new(false));
+        let registry_clone = registry.clone();
+        let input_len = input.len();
+
+        group.throughput(Throughput::Bytes(input_len as u64));
+        group.bench_function(name, |b| {
+            let printed_clone = printed.clone();
+            let registry = registry_clone.clone();
+            let rt = rt.handle().clone();
+
+            b.iter_custom(|iters| {
+                let parser = registry.get_parser(parser_name).expect("Parser not found");
+
+                let start = Instant::now();
+                for _ in 0..iters {
+                    let parser = parser.clone();
+                    let result = rt.block_on(async { parser.parse_complete(input).await });
+                    black_box(result.unwrap());
+                }
+                let duration = start.elapsed();
+
+                if !printed_clone.load(Ordering::Relaxed) {
+                    let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                    let bytes_per_sec = (iters as f64 * input_len as f64) / duration.as_secs_f64();
+                    let time_per_op = duration.as_micros() as f64 / iters as f64;
+
+                    let result = format!(
+                        "{:<25} | {:>10} | {:>12.0} | {:>12.0} | {:>10.1}µs",
+                        name, input_len, ops_per_sec, bytes_per_sec, time_per_op
+                    );
+                    add_result("complete", result);
+
+                    printed_clone.store(true, Ordering::Relaxed);
+                }
+
+                duration
+            });
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_streaming_parsing(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+    let registry = Arc::new(ParserRegistry::new());
+
+    // Streaming test with chunked input
+    let chunks = vec![
+        r#"{"na"#,
+        r#"me": "sear"#,
+        r#"ch", "argu"#,
+        r#"ments": {"qu"#,
+        r#"ery": "rust prog"#,
+        r#"ramming", "li"#,
+        r#"mit": 10, "off"#,
+        r#"set": 0}"#,
+        r#"}"#,
+    ];
+
+    let mut group = c.benchmark_group("streaming_parsing");
+
+    let printed = Arc::new(AtomicBool::new(false));
+    group.bench_function("json_streaming", |b| {
+        let printed_clone = printed.clone();
+        let registry = registry.clone();
+        let rt = rt.handle().clone();
+
+        b.iter_custom(|iters| {
+            let parser = registry.get_parser("json").expect("Parser not found");
+
+            let start = Instant::now();
+            for _ in 0..iters {
+                let parser = parser.clone();
+                let mut state = ParseState::new();
+                let mut complete_tools = Vec::new();
+
+                rt.block_on(async {
+                    for chunk in &chunks {
+                        if let StreamResult::ToolComplete(tool) =
+                            parser.parse_incremental(chunk, &mut state).await.unwrap()
+                        {
+                            complete_tools.push(tool);
+                        }
+                    }
+                });
+
+                black_box(complete_tools);
+            }
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let time_per_op = duration.as_micros() as f64 / iters as f64;
+                let chunks_per_sec = (iters as f64 * chunks.len() as f64) / duration.as_secs_f64();
+
+                let result = format!(
+                    "{:<25} | {:>10} | {:>12.0} | {:>12.0} | {:>10.1}µs",
+                    "JSON Streaming",
+                    chunks.len(),
+                    ops_per_sec,
+                    chunks_per_sec,
+                    time_per_op
+                );
+                add_result("streaming", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_concurrent_parsing(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+    let registry = Arc::new(ParserRegistry::new());
+    let parser = registry.get_parser("json").expect("Parser not found");
+
+    let thread_counts = vec![1, 2, 4, 8, 16, 32];
+    let operations_per_thread = 100;
+
+    let mut group = c.benchmark_group("concurrent_parsing");
+    group.measurement_time(Duration::from_secs(3));
+
+    for num_threads in thread_counts {
+        let printed = Arc::new(AtomicBool::new(false));
+        let parser_clone = parser.clone();
+
+        group.bench_with_input(
+            BenchmarkId::from_parameter(num_threads),
+            &num_threads,
+            |b, &threads| {
+                let printed_clone = printed.clone();
+                let parser = parser_clone.clone();
+                let rt = rt.handle().clone();
+
+                b.iter_custom(|_iters| {
+                    let total_operations = Arc::new(AtomicU64::new(0));
+                    let total_parsed = Arc::new(AtomicU64::new(0));
+                    let start = Instant::now();
+
+                    let handles: Vec<_> = (0..threads)
+                        .map(|_thread_id| {
+                            let parser = parser.clone();
+                            let total_ops = total_operations.clone();
+                            let total_p = total_parsed.clone();
+                            let rt = rt.clone();
+
+                            thread::spawn(move || {
+                                let test_inputs = [JSON_SIMPLE, JSON_ARRAY, JSON_WITH_PARAMS];
+
+                                for i in 0..operations_per_thread {
+                                    let input = test_inputs[i % test_inputs.len()];
+                                    let result =
+                                        rt.block_on(async { parser.parse_complete(input).await });
+
+                                    if let Ok(tools) = result {
+                                        total_p.fetch_add(tools.len() as u64, Ordering::Relaxed);
+                                    }
+                                }
+
+                                total_ops
+                                    .fetch_add(operations_per_thread as u64, Ordering::Relaxed);
+                            })
+                        })
+                        .collect();
+
+                    for handle in handles {
+                        handle.join().unwrap();
+                    }
+
+                    let duration = start.elapsed();
+
+                    if !printed_clone.load(Ordering::Relaxed) {
+                        let total_ops = total_operations.load(Ordering::Relaxed);
+                        let total_p = total_parsed.load(Ordering::Relaxed);
+                        let ops_per_sec = total_ops as f64 / duration.as_secs_f64();
+                        let tools_per_sec = total_p as f64 / duration.as_secs_f64();
+
+                        let result = format!(
+                            "{:<25} | {:>10} | {:>12.0} | {:>12.0} | {:>10}",
+                            format!("{}_threads", threads),
+                            total_ops,
+                            ops_per_sec,
+                            tools_per_sec,
+                            threads
+                        );
+                        add_result("concurrent", result);
+
+                        printed_clone.store(true, Ordering::Relaxed);
+                    }
+
+                    duration
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_large_payloads(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+    let registry = Arc::new(ParserRegistry::new());
+    let parser = registry.get_parser("json").expect("Parser not found");
+
+    let sizes = vec![1, 10, 50, 100, 500];
+
+    let mut group = c.benchmark_group("large_payloads");
+
+    for size in sizes {
+        let large_json = generate_large_json(size);
+        let input_len = large_json.len();
+        let printed = Arc::new(AtomicBool::new(false));
+        let parser_clone = parser.clone();
+
+        group.throughput(Throughput::Bytes(input_len as u64));
+        group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &num_tools| {
+            let printed_clone = printed.clone();
+            let parser = parser_clone.clone();
+            let rt = rt.handle().clone();
+            let input = &large_json;
+
+            b.iter_custom(|iters| {
+                let start = Instant::now();
+                for _ in 0..iters {
+                    let parser = parser.clone();
+                    let result = rt.block_on(async { parser.parse_complete(input).await });
+                    black_box(result.unwrap());
+                }
+                let duration = start.elapsed();
+
+                if !printed_clone.load(Ordering::Relaxed) {
+                    let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                    let bytes_per_sec = (iters as f64 * input_len as f64) / duration.as_secs_f64();
+                    let time_per_op = duration.as_millis() as f64 / iters as f64;
+
+                    let result = format!(
+                        "{:<25} | {:>10} | {:>10} | {:>12.0} | {:>12.0} | {:>10.1}ms",
+                        format!("{}_tools", num_tools),
+                        num_tools,
+                        input_len,
+                        ops_per_sec,
+                        bytes_per_sec,
+                        time_per_op
+                    );
+                    add_result("large", result);
+
+                    printed_clone.store(true, Ordering::Relaxed);
+                }
+
+                duration
+            });
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_parser_reuse(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+
+    let mut group = c.benchmark_group("parser_reuse");
+
+    // Benchmark creating new registry each time
+    let printed_new = Arc::new(AtomicBool::new(false));
+    group.bench_function("new_registry_each_time", |b| {
+        let printed_clone = printed_new.clone();
+        let rt = rt.handle().clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                let registry = ParserRegistry::new();
+                let parser = registry.get_parser("json").unwrap();
+                let result = rt.block_on(async { parser.parse_complete(JSON_SIMPLE).await });
+                black_box(result.unwrap());
+            }
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let time_per_op = duration.as_micros() as f64 / iters as f64;
+
+                let result = format!(
+                    "{:<25} | {:>12.0} | {:>12.1}µs | {:>15}",
+                    "New Registry Each Time", ops_per_sec, time_per_op, "Baseline"
+                );
+                add_result("reuse", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    // Benchmark reusing registry
+    let printed_reuse = Arc::new(AtomicBool::new(false));
+    let shared_registry = Arc::new(ParserRegistry::new());
+
+    group.bench_function("reuse_registry", |b| {
+        let printed_clone = printed_reuse.clone();
+        let registry = shared_registry.clone();
+        let rt = rt.handle().clone();
+
+        b.iter_custom(|iters| {
+            let parser = registry.get_parser("json").unwrap();
+
+            let start = Instant::now();
+            for _ in 0..iters {
+                let parser = parser.clone();
+                let result = rt.block_on(async { parser.parse_complete(JSON_SIMPLE).await });
+                black_box(result.unwrap());
+            }
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let time_per_op = duration.as_micros() as f64 / iters as f64;
+
+                let result = format!(
+                    "{:<25} | {:>12.0} | {:>12.1}µs | {:>15}",
+                    "Reuse Registry", ops_per_sec, time_per_op, "Optimized"
+                );
+                add_result("reuse", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    // Benchmark reusing parser
+    let printed_parser = Arc::new(AtomicBool::new(false));
+    let shared_parser = shared_registry.get_parser("json").unwrap();
+
+    group.bench_function("reuse_parser", |b| {
+        let printed_clone = printed_parser.clone();
+        let parser = shared_parser.clone();
+        let rt = rt.handle().clone();
+
+        b.iter_custom(|iters| {
+            let start = Instant::now();
+            for _ in 0..iters {
+                let parser = parser.clone();
+                let result = rt.block_on(async { parser.parse_complete(JSON_SIMPLE).await });
+                black_box(result.unwrap());
+            }
+            let duration = start.elapsed();
+
+            if !printed_clone.load(Ordering::Relaxed) {
+                let ops_per_sec = iters as f64 / duration.as_secs_f64();
+                let time_per_op = duration.as_micros() as f64 / iters as f64;
+
+                let result = format!(
+                    "{:<25} | {:>12.0} | {:>12.1}µs | {:>15}",
+                    "Reuse Parser", ops_per_sec, time_per_op, "Best"
+                );
+                add_result("reuse", result);
+
+                printed_clone.store(true, Ordering::Relaxed);
+            }
+
+            duration
+        });
+    });
+
+    group.finish();
+}
+
+fn bench_latency_distribution(c: &mut Criterion) {
+    let rt = Runtime::new().unwrap();
+    let registry = Arc::new(ParserRegistry::new());
+
+    let test_cases = vec![
+        ("json", JSON_SIMPLE),
+        ("mistral", MISTRAL_FORMAT),
+        ("qwen", QWEN_FORMAT),
+        ("pythonic", PYTHONIC_FORMAT),
+    ];
+
+    let mut group = c.benchmark_group("latency");
+
+    for (parser_name, input) in test_cases {
+        let printed = Arc::new(AtomicBool::new(false));
+        let registry_clone = registry.clone();
+
+        group.bench_function(parser_name, |b| {
+            let printed_clone = printed.clone();
+            let registry = registry_clone.clone();
+            let rt = rt.handle().clone();
+
+            b.iter_custom(|iters| {
+                let parser = registry.get_parser(parser_name).expect("Parser not found");
+
+                let total_duration = if !printed_clone.load(Ordering::Relaxed) {
+                    let mut latencies = Vec::new();
+
+                    // Warm up
+                    for _ in 0..100 {
+                        let parser = parser.clone();
+                        rt.block_on(async { parser.parse_complete(input).await })
+                            .unwrap();
+                    }
+
+                    // Measure for statistics
+                    for _ in 0..1000 {
+                        let parser = parser.clone();
+                        let start = Instant::now();
+                        rt.block_on(async { parser.parse_complete(input).await })
+                            .unwrap();
+                        let latency = start.elapsed();
+                        latencies.push(latency);
+                    }
+
+                    latencies.sort();
+                    let p50 = latencies[latencies.len() / 2];
+                    let p95 = latencies[latencies.len() * 95 / 100];
+                    let p99 = latencies[latencies.len() * 99 / 100];
+                    let max = latencies.last().unwrap();
+
+                    let result = format!(
+                        "{:<25} | {:>10.1} | {:>10.1} | {:>10.1} | {:>10.1} | {:>10}",
+                        parser_name,
+                        p50.as_micros() as f64,
+                        p95.as_micros() as f64,
+                        p99.as_micros() as f64,
+                        max.as_micros() as f64,
+                        1000
+                    );
+                    add_result("latency", result);
+
+                    printed_clone.store(true, Ordering::Relaxed);
+
+                    // Return median for consistency
+                    p50 * iters as u32
+                } else {
+                    // Regular benchmark iterations
+                    let start = Instant::now();
+                    for _ in 0..iters {
+                        let parser = parser.clone();
+                        rt.block_on(async { parser.parse_complete(input).await })
+                            .unwrap();
+                    }
+                    start.elapsed()
+                };
+
+                total_duration
+            });
+        });
+    }
+
+    group.finish();
+}
+
+// Print final summary table
+fn print_summary() {
+    println!("\n{}", "=".repeat(120));
+    println!("TOOL PARSER BENCHMARK SUMMARY");
+    println!("{}", "=".repeat(120));
+
+    let results = BENCHMARK_RESULTS.lock().unwrap();
+
+    let mut current_category = String::new();
+    for (key, value) in results.iter() {
+        let category = key.split('_').skip(1).collect::<Vec<_>>().join("_");
+
+        if category != current_category {
+            current_category = category.clone();
+
+            // Print section header based on category
+            println!("\n{}", "-".repeat(120));
+            match category.as_str() {
+                "registry" => {
+                    println!("REGISTRY OPERATIONS");
+                    println!(
+                        "{:<25} | {:>12} | {:>12} | {:>15}",
+                        "Operation", "Ops/sec", "Time/op", "Notes"
+                    );
+                }
+                "lookup" => {
+                    println!("PARSER LOOKUP PERFORMANCE");
+                    println!(
+                        "{:<25} | {:>12} | {:>12} | {:>15}",
+                        "Model", "Lookups/sec", "Time/lookup", "Result"
+                    );
+                }
+                "complete" => {
+                    println!("COMPLETE PARSING PERFORMANCE");
+                    println!(
+                        "{:<25} | {:>10} | {:>12} | {:>12} | {:>12}",
+                        "Parser Format", "Size(B)", "Ops/sec", "Bytes/sec", "Time/op"
+                    );
+                }
+                "streaming" => {
+                    println!("STREAMING PARSING PERFORMANCE");
+                    println!(
+                        "{:<25} | {:>10} | {:>12} | {:>12} | {:>12}",
+                        "Parser", "Chunks", "Ops/sec", "Chunks/sec", "Time/op"
+                    );
+                }
+                "concurrent" => {
+                    println!("CONCURRENT PARSING");
+                    println!(
+                        "{:<25} | {:>10} | {:>12} | {:>12} | {:>10}",
+                        "Configuration", "Total Ops", "Ops/sec", "Tools/sec", "Threads"
+                    );
+                }
+                "large" => {
+                    println!("LARGE PAYLOAD PARSING");
+                    println!(
+                        "{:<25} | {:>10} | {:>10} | {:>12} | {:>12} | {:>12}",
+                        "Payload", "Tools", "Size(B)", "Ops/sec", "Bytes/sec", "Time/op"
+                    );
+                }
+                "reuse" => {
+                    println!("PARSER REUSE COMPARISON");
+                    println!(
+                        "{:<25} | {:>12} | {:>12} | {:>15}",
+                        "Strategy", "Ops/sec", "Time/op", "Performance"
+                    );
+                }
+                "latency" => {
+                    println!("LATENCY DISTRIBUTION");
+                    println!(
+                        "{:<25} | {:>10} | {:>10} | {:>10} | {:>10} | {:>10}",
+                        "Parser", "P50(µs)", "P95(µs)", "P99(µs)", "Max(µs)", "Samples"
+                    );
+                }
+                _ => {}
+            }
+            println!("{}", "-".repeat(120));
+        }
+
+        println!("{}", value);
+    }
+
+    println!("\n{}", "=".repeat(120));
+
+    // Print performance analysis
+    println!("\nPERFORMANCE ANALYSIS:");
+    println!("{}", "-".repeat(120));
+
+    // Calculate and display key metrics
+    if let Some(new_registry) = results.get("007_reuse") {
+        if let Some(reuse_parser) = results.get("009_reuse") {
+            // Extract ops/sec values
+            let new_ops: f64 = new_registry
+                .split('|')
+                .nth(1)
+                .and_then(|s| s.trim().parse().ok())
+                .unwrap_or(0.0);
+            let reuse_ops: f64 = reuse_parser
+                .split('|')
+                .nth(1)
+                .and_then(|s| s.trim().parse().ok())
+                .unwrap_or(0.0);
+
+            if new_ops > 0.0 && reuse_ops > 0.0 {
+                let improvement = (reuse_ops / new_ops - 1.0) * 100.0;
+                println!("Parser Reuse Improvement: {:.1}% faster", improvement);
+
+                if improvement < 100.0 {
+                    println!("⚠️  WARNING: Parser reuse improvement is lower than expected!");
+                    println!("   Expected: >100% improvement with singleton pattern");
+                    println!("   Actual: {:.1}% improvement", improvement);
+                    println!("   Recommendation: Implement global singleton registry");
+                }
+            }
+        }
+    }
+
+    println!("{}", "=".repeat(120));
+}
+
+fn run_benchmarks(c: &mut Criterion) {
+    bench_registry_creation(c);
+    bench_parser_lookup(c);
+    bench_complete_parsing(c);
+    bench_streaming_parsing(c);
+    bench_concurrent_parsing(c);
+    bench_large_payloads(c);
+    bench_parser_reuse(c);
+    bench_latency_distribution(c);
+
+    // Print summary at the end
+    print_summary();
+}
+
+criterion_group!(benches, run_benchmarks);
+criterion::criterion_main!(benches);
diff --git a/sgl-router/build.rs b/sgl-router/build.rs
new file mode 100644
index 000000000..90b3c6101
--- /dev/null
+++ b/sgl-router/build.rs
@@ -0,0 +1,35 @@
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Only regenerate if the proto file changes
+    println!("cargo:rerun-if-changed=src/proto/sglang_scheduler.proto");
+
+    // Configure protobuf compilation with custom settings
+    let config = prost_build::Config::new();
+
+    // Skip serde for types that use prost_types::Struct
+    // These cause conflicts and we don't need serde for all generated types
+
+    // Configure tonic-build for gRPC code generation
+    tonic_build::configure()
+        // Generate both client and server code
+        .build_server(true)
+        .build_client(true)
+        // Add a module-level attribute for documentation and clippy warnings
+        .server_mod_attribute(
+            "sglang.grpc.scheduler",
+            "#[allow(unused, clippy::mixed_attributes_style)]",
+        )
+        .client_mod_attribute(
+            "sglang.grpc.scheduler",
+            "#[allow(unused, clippy::mixed_attributes_style)]",
+        )
+        // Compile the proto file with the custom config
+        .compile_protos_with_config(
+            config,
+            &["src/proto/sglang_scheduler.proto"],
+            &["src/proto"],
+        )?;
+
+    println!("cargo:warning=Protobuf compilation completed successfully");
+
+    Ok(())
+}
diff --git a/sgl-router/py_src/sglang_router/__init__.py b/sgl-router/py_src/sglang_router/__init__.py
new file mode 100644
index 000000000..9c7fa208e
--- /dev/null
+++ b/sgl-router/py_src/sglang_router/__init__.py
@@ -0,0 +1,3 @@
+from sglang_router.version import __version__
+
+__all__ = ["__version__"]
diff --git a/sgl-router/py_src/sglang_router/launch_router.py b/sgl-router/py_src/sglang_router/launch_router.py
new file mode 100644
index 000000000..506842f84
--- /dev/null
+++ b/sgl-router/py_src/sglang_router/launch_router.py
@@ -0,0 +1,109 @@
+import argparse
+import logging
+import sys
+from typing import List, Optional
+
+import setproctitle
+from sglang_router.mini_lb import MiniLoadBalancer
+from sglang_router.router_args import RouterArgs
+
+logger = logging.getLogger("router")
+
+try:
+    from sglang_router.router import Router
+except ImportError:
+    Router = None
+    logger.warning(
+        "Rust Router is not installed, only python MiniLB (debugging only) is available"
+    )
+
+
+def launch_router(args: argparse.Namespace) -> Optional[Router]:
+    """
+    Launch the SGLang router with the configuration from parsed arguments.
+
+    Args:
+        args: Namespace object containing router configuration
+            Can be either raw argparse.Namespace or converted RouterArgs
+
+    Returns:
+        Router instance if successful, None if failed
+    """
+    setproctitle.setproctitle("sglang::router")
+    try:
+        # Convert to RouterArgs if needed
+        if not isinstance(args, RouterArgs):
+            router_args = RouterArgs.from_cli_args(args)
+        else:
+            router_args = args
+
+        if router_args.mini_lb:
+            mini_lb = MiniLoadBalancer(router_args)
+            mini_lb.start()
+        else:
+            if Router is None:
+                raise RuntimeError("Rust Router is not installed")
+            router_args._validate_router_args()
+            router = Router.from_args(router_args)
+            router.start()
+
+    except Exception as e:
+        logger.error(f"Error starting router: {e}")
+        raise e
+
+
+class CustomHelpFormatter(
+    argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter
+):
+    """Custom formatter that preserves both description formatting and shows defaults"""
+
+    pass
+
+
+def parse_router_args(args: List[str]) -> RouterArgs:
+    """Parse command line arguments and return RouterArgs instance."""
+    parser = argparse.ArgumentParser(
+        description="""SGLang Router - High-performance request distribution across worker nodes
+
+Usage:
+This launcher enables starting a router with individual worker instances. It is useful for
+multi-node setups or when you want to start workers and router separately.
+
+Examples:
+  # Regular mode
+  python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000
+
+  # PD disaggregated mode with same policy for both
+  python -m sglang_router.launch_router --pd-disaggregation \\
+    --prefill http://prefill1:8000 9000 --prefill http://prefill2:8000 \\
+    --decode http://decode1:8001 --decode http://decode2:8001 \\
+    --policy cache_aware
+
+  # PD mode with optional bootstrap ports
+  python -m sglang_router.launch_router --pd-disaggregation \\
+    --prefill http://prefill1:8000 9000 \\    # With bootstrap port
+    --prefill http://prefill2:8000 none \\    # Explicitly no bootstrap port
+    --prefill http://prefill3:8000 \\         # Defaults to no bootstrap port
+    --decode http://decode1:8001 --decode http://decode2:8001
+
+  # PD mode with different policies for prefill and decode
+  python -m sglang_router.launch_router --pd-disaggregation \\
+    --prefill http://prefill1:8000 --prefill http://prefill2:8000 \\
+    --decode http://decode1:8001 --decode http://decode2:8001 \\
+    --prefill-policy cache_aware --decode-policy power_of_two
+
+    """,
+        formatter_class=CustomHelpFormatter,
+    )
+
+    RouterArgs.add_cli_args(parser, use_router_prefix=False)
+    return RouterArgs.from_cli_args(parser.parse_args(args), use_router_prefix=False)
+
+
+def main() -> None:
+    router_args = parse_router_args(sys.argv[1:])
+    launch_router(router_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sgl-router/py_src/sglang_router/launch_server.py b/sgl-router/py_src/sglang_router/launch_server.py
new file mode 100644
index 000000000..cce0faf7a
--- /dev/null
+++ b/sgl-router/py_src/sglang_router/launch_server.py
@@ -0,0 +1,203 @@
+import argparse
+import copy
+import logging
+import multiprocessing as mp
+import os
+import random
+import signal
+import sys
+import time
+from typing import List
+
+import requests
+from setproctitle import setproctitle
+from sglang_router.launch_router import RouterArgs, launch_router
+
+from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import is_port_available
+
+
+def setup_logger():
+    logger = logging.getLogger("router")
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter(
+        "[Router (Python)] %(asctime)s - %(levelname)s - %(message)s - %(filename)s:%(lineno)d",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    return logger
+
+
+logger = setup_logger()
+
+
+# Create new process group
+def run_server(server_args, dp_rank):
+    """
+    Note:
+
+    1. Without os.setpgrp(), all processes share the same PGID. When you press Ctrl+C, the terminal sends SIGINT to all processes in the group simultaneously.
+    This can cause leaf processes to terminate first, which messes up the cleaning order and produces orphaned processes.
+
+    Terminal (PGID=100)
+    └── Main Python Process (PGID=100)
+        └── Server Process 1 (PGID=100)
+            └── Scheduler 1
+            └── Detokenizer 1
+        └── Server Process 2 (PGID=100)
+            └── Scheduler 2
+            └── Detokenizer 2
+
+    2. With os.setpgrp(), the main Python process and its children are in a separate group. Now:
+
+    Terminal (PGID=100)
+    └── Main Python Process (PGID=200)
+        └── Server Process 1 (PGID=300)
+            └── Scheduler 1
+            └── Detokenizer 1
+        └── Server Process 2 (PGID=400)
+            └── Scheduler 2
+            └── Detokenizer 2
+    """
+    # create new process group
+    os.setpgrp()
+
+    setproctitle("sglang::server")
+    # Set SGLANG_DP_RANK environment variable
+    os.environ["SGLANG_DP_RANK"] = str(dp_rank)
+
+    launch_server(server_args)
+
+
+def launch_server_process(
+    server_args: ServerArgs, worker_port: int, dp_id: int
+) -> mp.Process:
+    """Launch a single server process with the given args and port."""
+    server_args = copy.deepcopy(server_args)
+    server_args.port = worker_port
+    server_args.base_gpu_id = dp_id * server_args.tp_size
+    server_args.dp_size = 1
+
+    proc = mp.Process(target=run_server, args=(server_args, dp_id))
+    proc.start()
+    return proc
+
+
+def wait_for_server_health(host: str, port: int, timeout: int = 300) -> bool:
+    """Wait for server to be healthy by checking /health endpoint."""
+    start_time = time.perf_counter()
+    url = f"http://{host}:{port}/health"
+
+    while time.perf_counter() - start_time < timeout:
+        try:
+            response = requests.get(url, timeout=5)
+            if response.status_code == 200:
+                return True
+        except requests.exceptions.RequestException:
+            pass
+        time.sleep(1)
+    return False
+
+
+def find_available_ports(base_port: int, count: int) -> List[int]:
+    """Find consecutive available ports starting from base_port."""
+    available_ports = []
+    current_port = base_port
+
+    while len(available_ports) < count:
+        if is_port_available(current_port):
+            available_ports.append(current_port)
+        current_port += random.randint(100, 1000)
+
+    return available_ports
+
+
+def cleanup_processes(processes: List[mp.Process]):
+    for process in processes:
+        logger.info(f"Terminating process group {process.pid}")
+        try:
+            os.killpg(process.pid, signal.SIGTERM)
+        except ProcessLookupError:
+            # Process group may already be terminated
+            pass
+
+    # Wait for processes to terminate
+    for process in processes:
+        process.join(timeout=5)
+        if process.is_alive():
+            logger.warning(
+                f"Process {process.pid} did not terminate gracefully, forcing kill"
+            )
+            try:
+                os.killpg(process.pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+
+    logger.info("All process groups terminated")
+
+
+def main():
+    # CUDA runtime isn't fork-safe, which can lead to subtle bugs or crashes
+    mp.set_start_method("spawn")
+
+    parser = argparse.ArgumentParser(
+        description="Launch SGLang router and server processes"
+    )
+
+    ServerArgs.add_cli_args(parser)
+    RouterArgs.add_cli_args(parser, use_router_prefix=True, exclude_host_port=True)
+    parser.add_argument(
+        "--router-dp-worker-base-port",
+        type=int,
+        default=31000,
+        help="Base port number for data parallel workers",
+    )
+    # No extra retry/CB flags here; RouterArgs.add_cli_args already defines them with router- prefix
+
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    router_args = RouterArgs.from_cli_args(args, use_router_prefix=True)
+
+    # Find available ports for workers
+    worker_ports = find_available_ports(
+        args.router_dp_worker_base_port, server_args.dp_size
+    )
+
+    # Start server processes
+    server_processes = []
+
+    for i, worker_port in enumerate(worker_ports):
+        logger.info(f"Launching DP server process {i} on port {worker_port}")
+        proc = launch_server_process(server_args, worker_port, i)
+        server_processes.append(proc)
+
+    signal.signal(signal.SIGINT, lambda sig, frame: cleanup_processes(server_processes))
+    signal.signal(
+        signal.SIGTERM, lambda sig, frame: cleanup_processes(server_processes)
+    )
+    signal.signal(
+        signal.SIGQUIT, lambda sig, frame: cleanup_processes(server_processes)
+    )
+
+    # Update router args with worker URLs
+    router_args.worker_urls = [
+        f"http://{server_args.host}:{port}" for port in worker_ports
+    ]
+
+    # Start the router
+    try:
+        launch_router(router_args)
+    except Exception as e:
+        logger.error(f"Failed to start router: {e}")
+        cleanup_processes(server_processes)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sgl-router/py_src/sglang_router/mini_lb.py b/sgl-router/py_src/sglang_router/mini_lb.py
new file mode 100644
index 000000000..920d5c38f
--- /dev/null
+++ b/sgl-router/py_src/sglang_router/mini_lb.py
@@ -0,0 +1,395 @@
+"""
+Minimal HTTP load balancer for prefill and decode servers for testing.
+"""
+
+import asyncio
+import ipaddress
+import logging
+import random
+import urllib
+from http import HTTPStatus
+from itertools import chain
+from typing import Optional
+
+import aiohttp
+import orjson
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import ORJSONResponse, Response, StreamingResponse
+from sglang_router.router_args import RouterArgs
+
+logger = logging.getLogger(__name__)
+
+AIOHTTP_STREAM_READ_CHUNK_SIZE = (
+    1024 * 64
+)  # 64KB, to prevent aiohttp's "Chunk too big" error
+
+
+def maybe_wrap_ipv6_address(address: str) -> str:
+    try:
+        ipaddress.IPv6Address(address)
+        return f"[{address}]"
+    except ValueError:
+        return address
+
+
+class MiniLoadBalancer:
+    def __init__(
+        self,
+        router_args: RouterArgs,
+    ):
+        self._validate_router_args(router_args)
+
+        self.host = router_args.host
+        self.port = router_args.port
+        self.timeout = router_args.request_timeout_secs
+        self.prefill_urls = [url[0] for url in router_args.prefill_urls]
+        self.prefill_bootstrap_ports = [url[1] for url in router_args.prefill_urls]
+        self.decode_urls = router_args.decode_urls
+
+    def _validate_router_args(self, router_args: RouterArgs):
+        logger.warning(
+            "\x1b[33mMiniLB is only for debugging purposes, it only supports random policy!\033[0m"
+        )
+
+        # NOTE: too many arguments unsupported, just validate some important ones
+        if router_args.policy != "random":
+            logger.warning("[MiniLB] Overriding policy to random")
+            router_args.policy = "random"
+
+        if not router_args.pd_disaggregation:
+            raise ValueError("MiniLB only supports PD disaggregation mode")
+
+        if len(router_args.prefill_urls) == 0 or len(router_args.decode_urls) == 0:
+            raise ValueError(
+                "MiniLB requires at least one prefill and one decode server"
+            )
+
+    def start(self):
+        global lb
+        lb = self
+        uvicorn.run(app, host=self.host, port=self.port)
+
+    def select_pair(self):
+        assert len(self.prefill_urls) > 0, "No prefill servers available"
+        assert len(self.decode_urls) > 0, "No decode servers available"
+        pidx = random.randint(0, len(self.prefill_urls) - 1)
+        didx = random.randint(0, len(self.decode_urls) - 1)
+        return (
+            self.prefill_urls[pidx],
+            self.prefill_bootstrap_ports[pidx],
+            self.decode_urls[didx],
+        )
+
+    async def generate(
+        self, modified_request, prefill_server, decode_server, endpoint
+    ) -> ORJSONResponse:
+        assert endpoint[0] != "/", f"Endpoint should not start with '/': {endpoint}"
+
+        async with aiohttp.ClientSession(
+            timeout=aiohttp.ClientTimeout(
+                total=self.timeout
+            )  # Add timeout for request reliability
+        ) as session:
+            tasks = [
+                session.post(f"{prefill_server}/{endpoint}", json=modified_request),
+                session.post(f"{decode_server}/{endpoint}", json=modified_request),
+            ]
+
+            # Wait for both responses to complete. Prefill should end first.
+            prefill_response, decode_response = await asyncio.gather(*tasks)
+
+            if "return_logprob" in modified_request:
+
+                prefill_json = await prefill_response.json()
+                ret_json = await decode_response.json()
+
+                # merge `meta_info.input_token_logprobs` from prefill to decode
+                if "meta_info" in ret_json:
+                    if "input_token_logprobs" in ret_json["meta_info"]:
+                        ret_json["meta_info"]["input_token_logprobs"] = (
+                            prefill_json["meta_info"]["input_token_logprobs"]
+                            + ret_json["meta_info"]["input_token_logprobs"]
+                        )
+            else:
+                ret_json = await decode_response.json()
+
+            return ORJSONResponse(
+                content=ret_json,
+                status_code=decode_response.status,
+            )
+
+    async def generate_stream(
+        self, modified_request, prefill_server, decode_server, endpoint="generate"
+    ):
+        assert endpoint[0] != "/", f"Endpoint should not start with '/': {endpoint}"
+
+        async def stream_results():
+            async with aiohttp.ClientSession(
+                timeout=aiohttp.ClientTimeout(
+                    total=self.timeout
+                )  # Add timeout for request reliability
+            ) as session:
+                # Create the tasks for both prefill and decode requests
+                tasks = [
+                    session.post(f"{prefill_server}/{endpoint}", json=modified_request),
+                    session.post(f"{decode_server}/{endpoint}", json=modified_request),
+                ]
+                # Wait for both responses to complete. Since this is streaming, they return immediately.
+                prefill_response, decode_response = await asyncio.gather(*tasks)
+
+                if modified_request.get("return_logprob", False):
+                    prefill_chunks = []
+                    async for chunk in prefill_response.content:
+                        prefill_chunks.append(chunk)
+
+                    first_prefill_chunk = (
+                        prefill_chunks[0].decode("utf-8")[5:].strip("\n")
+                    )
+                    first_prefill_chunk_json = orjson.loads(first_prefill_chunk)
+
+                    async for chunk in decode_response.content:
+                        # Note: This is inefficient
+                        # merge prefill input_token_logprobs, output_token_logprobs to decode
+                        decoded_chunk = chunk.decode("utf-8")
+                        if (
+                            decoded_chunk
+                            and decoded_chunk.startswith("data:")
+                            and "[DONE]" not in decoded_chunk
+                        ):
+                            ret_json = orjson.loads(decoded_chunk[5:].strip("\n"))
+                            ret_json["meta_info"]["input_token_logprobs"] = (
+                                first_prefill_chunk_json["meta_info"][
+                                    "input_token_logprobs"
+                                ]
+                                + ret_json["meta_info"]["input_token_logprobs"]
+                            )
+
+                            yield b"data: " + orjson.dumps(ret_json) + b"\n\n"
+                        else:
+                            yield chunk
+                else:
+                    async for chunk in decode_response.content.iter_chunked(
+                        AIOHTTP_STREAM_READ_CHUNK_SIZE
+                    ):
+                        yield chunk
+
+        return StreamingResponse(
+            stream_results(),
+            media_type="text/event-stream",
+        )
+
+
+app = FastAPI()
+lb: Optional[MiniLoadBalancer] = None
+
+
+@app.get("/health")
+async def health_check():
+    return Response(status_code=200)
+
+
+@app.get("/health_generate")
+async def health_generate():
+    async with aiohttp.ClientSession() as session:
+        # Create the tasks
+        tasks = []
+        for server in chain(lb.prefill_urls, lb.decode_urls):
+            tasks.append(session.get(f"{server}/health_generate"))
+        for i, response in enumerate(asyncio.as_completed(tasks)):
+            await response
+    return Response(status_code=200)
+
+
+@app.post("/flush_cache")
+async def flush_cache():
+    async with aiohttp.ClientSession() as session:
+        # Create the tasks
+        tasks = []
+        for server in chain(lb.prefill_urls, lb.decode_urls):
+            tasks.append(session.post(f"{server}/flush_cache"))
+        for i, response in enumerate(asyncio.as_completed(tasks)):
+            await response
+    return Response(status_code=200)
+
+
+@app.get("/get_server_info")
+async def get_server_info():
+    prefill_infos = []
+    decode_infos = []
+    all_internal_states = []
+
+    async with aiohttp.ClientSession() as session:
+        for server in lb.prefill_urls:
+            server_info = await session.get(f"{server}/get_server_info")
+            prefill_infos.append(await server_info.json())
+        for server in lb.decode_urls:
+            server_info = await session.get(f"{server}/get_server_info")
+            info_json = await server_info.json()
+            decode_infos.append(info_json)
+            # Extract internal_states from decode servers
+            if "internal_states" in info_json:
+                all_internal_states.extend(info_json["internal_states"])
+
+    # Return format expected by bench_one_batch_server.py
+    if all_internal_states:
+        return {
+            "internal_states": all_internal_states,
+            "prefill": prefill_infos,
+            "decode": decode_infos,
+        }
+    else:
+        # Fallback with dummy data if no internal states found
+        return {
+            "internal_states": [
+                {
+                    "last_gen_throughput": 0.0,
+                    "avg_spec_accept_length": None,
+                }
+            ],
+            "prefill": prefill_infos,
+            "decode": decode_infos,
+        }
+
+
+@app.get("/get_model_info")
+async def get_model_info():
+    if not lb or not lb.prefill_urls:
+        raise HTTPException(
+            status_code=HTTPStatus.SERVICE_UNAVAILABLE,
+            detail="There is no server registered",
+        )
+
+    target_server_url = lb.prefill_urls[0]
+    endpoint_url = f"{target_server_url}/get_model_info"
+
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(endpoint_url) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise HTTPException(
+                        status_code=HTTPStatus.BAD_GATEWAY,
+                        detail=(
+                            f"Failed to get model info from {target_server_url}"
+                            f"Status: {response.status}, Response: {error_text}"
+                        ),
+                    )
+
+                model_info_json = await response.json()
+                return ORJSONResponse(content=model_info_json)
+
+        except aiohttp.ClientError as e:
+            raise HTTPException(
+                status_code=HTTPStatus.SERVICE_UNAVAILABLE,
+                detail=f"Failed to get model info from backend",
+            )
+
+
+@app.post("/generate")
+async def handle_generate_request(request_data: dict):
+    prefill_server, bootstrap_port, decode_server = lb.select_pair()
+
+    # Parse and transform prefill_server for bootstrap data
+    parsed_url = urllib.parse.urlparse(prefill_server)
+    hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
+    modified_request = request_data.copy()
+
+    batch_size = _get_request_batch_size(modified_request)
+    if batch_size is not None:
+        modified_request.update(
+            {
+                "bootstrap_host": [hostname] * batch_size,
+                "bootstrap_port": [bootstrap_port] * batch_size,
+                "bootstrap_room": [
+                    _generate_bootstrap_room() for _ in range(batch_size)
+                ],
+            }
+        )
+    else:
+        modified_request.update(
+            {
+                "bootstrap_host": hostname,
+                "bootstrap_port": bootstrap_port,
+                "bootstrap_room": _generate_bootstrap_room(),
+            }
+        )
+
+    if request_data.get("stream", False):
+        return await lb.generate_stream(
+            modified_request, prefill_server, decode_server, "generate"
+        )
+    else:
+        return await lb.generate(
+            modified_request, prefill_server, decode_server, "generate"
+        )
+
+
+async def _forward_to_backend(request_data: dict, endpoint_name: str):
+    prefill_server, bootstrap_port, decode_server = lb.select_pair()
+
+    # Parse and transform prefill_server for bootstrap data
+    parsed_url = urllib.parse.urlparse(prefill_server)
+    hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
+    modified_request = request_data.copy()
+    modified_request.update(
+        {
+            "bootstrap_host": hostname,
+            "bootstrap_port": bootstrap_port,
+            "bootstrap_room": _generate_bootstrap_room(),
+        }
+    )
+
+    if request_data.get("stream", False):
+        return await lb.generate_stream(
+            modified_request,
+            prefill_server,
+            decode_server,
+            endpoint=endpoint_name,
+        )
+    else:
+        return await lb.generate(
+            modified_request,
+            prefill_server,
+            decode_server,
+            endpoint=endpoint_name,
+        )
+
+
+@app.post("/v1/chat/completions")
+async def handle_chat_completion_request(request_data: dict):
+    return await _forward_to_backend(request_data, "v1/chat/completions")
+
+
+@app.post("/v1/completions")
+async def handle_completion_request(request_data: dict):
+    return await _forward_to_backend(request_data, "v1/completions")
+
+
+def _generate_bootstrap_room():
+    return random.randint(0, 2**63 - 1)
+
+
+# We may utilize `GenerateReqInput`'s logic later
+def _get_request_batch_size(request):
+    if (text := request.get("text")) is not None:
+        return None if isinstance(text, str) else len(text)
+    if (input_ids := request.get("input_ids")) is not None:
+        return None if isinstance(input_ids[0], int) else len(input_ids)
+    return None
+
+
+@app.get("/v1/models")
+async def get_models():
+    prefill_server = lb.prefill_urls[0]  # Get the first prefill server
+    async with aiohttp.ClientSession() as session:
+        try:
+            response = await session.get(f"{prefill_server}/v1/models")
+            if response.status != 200:
+                raise HTTPException(
+                    status_code=response.status,
+                    detail=f"Prefill server error: Status {response.status}",
+                )
+            return ORJSONResponse(content=await response.json())
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e))
diff --git a/sgl-router/py_src/sglang_router/router.py b/sgl-router/py_src/sglang_router/router.py
new file mode 100644
index 000000000..72a99ffbb
--- /dev/null
+++ b/sgl-router/py_src/sglang_router/router.py
@@ -0,0 +1,129 @@
+from typing import Optional
+
+from sglang_router.router_args import RouterArgs
+from sglang_router_rs import PolicyType
+from sglang_router_rs import Router as _Router
+
+
+def policy_from_str(policy_str: Optional[str]) -> PolicyType:
+    """Convert policy string to PolicyType enum."""
+    if policy_str is None:
+        return None
+    policy_map = {
+        "random": PolicyType.Random,
+        "round_robin": PolicyType.RoundRobin,
+        "cache_aware": PolicyType.CacheAware,
+        "power_of_two": PolicyType.PowerOfTwo,
+    }
+    return policy_map[policy_str]
+
+
+class Router:
+    """
+    A high-performance router for distributing requests across worker nodes.
+
+    Args:
+        worker_urls: List of URLs for worker nodes that will handle requests. Each URL should include
+            the protocol, host, and port (e.g., ['http://worker1:8000', 'http://worker2:8000'])
+        policy: Load balancing policy to use. Options:
+            - PolicyType.Random: Randomly select workers
+            - PolicyType.RoundRobin: Distribute requests in round-robin fashion
+            - PolicyType.CacheAware: Distribute requests based on cache state and load balance
+            - PolicyType.PowerOfTwo: Select best of two random workers based on load (PD mode only)
+        host: Host address to bind the router server. Default: '127.0.0.1'
+        port: Port number to bind the router server. Default: 3001
+        worker_startup_timeout_secs: Timeout in seconds for worker startup. Default: 300
+        worker_startup_check_interval: Interval in seconds between checks for worker initialization. Default: 10
+        cache_threshold: Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker
+            if the match rate exceeds threshold, otherwise routes to the worker with the smallest
+            tree. Default: 0.5
+        balance_abs_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
+            AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32
+        balance_rel_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
+            AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001
+        eviction_interval_secs: Interval in seconds between cache eviction operations in cache-aware
+            routing. Default: 60
+        max_payload_size: Maximum payload size in bytes. Default: 256MB
+        max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
+        dp_aware: Enable data parallelism aware schedule. Default: False
+        api_key: The api key used for the authorization with the worker.
+            Useful when the dp aware scheduling strategy is enabled.
+            Default: None
+        log_dir: Directory to store log files. If None, logs are only output to console. Default: None
+        log_level: Logging level. Options: 'debug', 'info', 'warning', 'error', 'critical'.
+        service_discovery: Enable Kubernetes service discovery. When enabled, the router will
+            automatically discover worker pods based on the selector. Default: False
+        selector: Dictionary mapping of label keys to values for Kubernetes pod selection.
+            Example: {"app": "sglang-worker"}. Default: {}
+        service_discovery_port: Port to use for service discovery. The router will generate
+            worker URLs using this port. Default: 80
+        service_discovery_namespace: Kubernetes namespace to watch for pods. If not provided,
+            watches pods across all namespaces (requires cluster-wide permissions). Default: None
+        prefill_selector: Dictionary mapping of label keys to values for Kubernetes pod selection
+            for prefill servers (PD mode only). Default: {}
+        decode_selector: Dictionary mapping of label keys to values for Kubernetes pod selection
+            for decode servers (PD mode only). Default: {}
+        prometheus_port: Port to expose Prometheus metrics. Default: None
+        prometheus_host: Host address to bind the Prometheus metrics server. Default: None
+        pd_disaggregation: Enable PD (Prefill-Decode) disaggregated mode. Default: False
+        prefill_urls: List of (url, bootstrap_port) tuples for prefill servers (PD mode only)
+        decode_urls: List of URLs for decode servers (PD mode only)
+        prefill_policy: Specific load balancing policy for prefill nodes (PD mode only).
+            If not specified, uses the main policy. Default: None
+        decode_policy: Specific load balancing policy for decode nodes (PD mode only).
+            If not specified, uses the main policy. Default: None
+        request_id_headers: List of HTTP headers to check for request IDs. If not specified,
+            uses common defaults: ['x-request-id', 'x-correlation-id', 'x-trace-id', 'request-id'].
+            Example: ['x-my-request-id', 'x-custom-trace-id']. Default: None
+        bootstrap_port_annotation: Kubernetes annotation name for bootstrap port (PD mode).
+            Default: 'sglang.ai/bootstrap-port'
+        request_timeout_secs: Request timeout in seconds. Default: 600
+        max_concurrent_requests: Maximum number of concurrent requests allowed for rate limiting. Default: 256
+        queue_size: Queue size for pending requests when max concurrent limit reached (0 = no queue, return 429 immediately). Default: 100
+        queue_timeout_secs: Maximum time (in seconds) a request can wait in queue before timing out. Default: 60
+        rate_limit_tokens_per_second: Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests. Default: None
+        cors_allowed_origins: List of allowed origins for CORS. Empty list allows all origins. Default: []
+        health_failure_threshold: Number of consecutive health check failures before marking worker unhealthy. Default: 3
+        health_success_threshold: Number of consecutive health check successes before marking worker healthy. Default: 2
+        health_check_timeout_secs: Timeout in seconds for health check requests. Default: 5
+        health_check_interval_secs: Interval in seconds between runtime health checks. Default: 60
+        health_check_endpoint: Health check endpoint path. Default: '/health'
+        model_path: Model path for loading tokenizer (HuggingFace model ID or local path). Default: None
+        tokenizer_path: Explicit tokenizer path (overrides model_path tokenizer if provided). Default: None
+    """
+
+    def __init__(self, router: _Router):
+        self._router = router
+
+    @staticmethod
+    def from_args(args: RouterArgs) -> "Router":
+        """Create a router from a RouterArgs instance."""
+
+        args_dict = vars(args)
+        # Convert RouterArgs to _Router parameters
+        args_dict["worker_urls"] = (
+            []
+            if args_dict["service_discovery"] or args_dict["pd_disaggregation"]
+            else args_dict["worker_urls"]
+        )
+        args_dict["policy"] = policy_from_str(args_dict["policy"])
+        args_dict["prefill_urls"] = (
+            args_dict["prefill_urls"] if args_dict["pd_disaggregation"] else None
+        )
+        args_dict["decode_urls"] = (
+            args_dict["decode_urls"] if args_dict["pd_disaggregation"] else None
+        )
+        args_dict["prefill_policy"] = policy_from_str(args_dict["prefill_policy"])
+        args_dict["decode_policy"] = policy_from_str(args_dict["decode_policy"])
+
+        # remoge mini_lb parameter
+        args_dict.pop("mini_lb")
+
+        return Router(_Router(**args_dict))
+
+    def start(self) -> None:
+        """Start the router server.
+
+        This method blocks until the server is shut down.
+        """
+        self._router.start()
diff --git a/sgl-router/py_src/sglang_router/router_args.py b/sgl-router/py_src/sglang_router/router_args.py
new file mode 100644
index 000000000..ad0a2ac9f
--- /dev/null
+++ b/sgl-router/py_src/sglang_router/router_args.py
@@ -0,0 +1,577 @@
+import argparse
+import dataclasses
+import logging
+from typing import Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class RouterArgs:
+    # Worker configuration
+    worker_urls: List[str] = dataclasses.field(default_factory=list)
+    host: str = "127.0.0.1"
+    port: int = 30000
+
+    # PD-specific configuration
+    mini_lb: bool = False
+    pd_disaggregation: bool = False  # Enable PD disaggregated mode
+    prefill_urls: List[tuple] = dataclasses.field(
+        default_factory=list
+    )  # List of (url, bootstrap_port)
+    decode_urls: List[str] = dataclasses.field(default_factory=list)
+
+    # Routing policy
+    policy: str = "cache_aware"
+    prefill_policy: Optional[str] = None  # Specific policy for prefill nodes in PD mode
+    decode_policy: Optional[str] = None  # Specific policy for decode nodes in PD mode
+    worker_startup_timeout_secs: int = 600
+    worker_startup_check_interval: int = 30
+    cache_threshold: float = 0.3
+    balance_abs_threshold: int = 64
+    balance_rel_threshold: float = 1.5
+    eviction_interval_secs: int = 120
+    max_tree_size: int = 2**26
+    max_payload_size: int = 512 * 1024 * 1024  # 512MB default for large batches
+    dp_aware: bool = False
+    api_key: Optional[str] = None
+    log_dir: Optional[str] = None
+    log_level: Optional[str] = None
+    # Service discovery configuration
+    service_discovery: bool = False
+    selector: Dict[str, str] = dataclasses.field(default_factory=dict)
+    service_discovery_port: int = 80
+    service_discovery_namespace: Optional[str] = None
+    # PD service discovery configuration
+    prefill_selector: Dict[str, str] = dataclasses.field(default_factory=dict)
+    decode_selector: Dict[str, str] = dataclasses.field(default_factory=dict)
+    bootstrap_port_annotation: str = "sglang.ai/bootstrap-port"
+    # Prometheus configuration
+    prometheus_port: Optional[int] = None
+    prometheus_host: Optional[str] = None
+    # Request ID headers configuration
+    request_id_headers: Optional[List[str]] = None
+    # Request timeout in seconds
+    request_timeout_secs: int = 1800
+    # Max concurrent requests for rate limiting
+    max_concurrent_requests: int = 256
+    # Queue size for pending requests when max concurrent limit reached
+    queue_size: int = 100
+    # Maximum time (in seconds) a request can wait in queue before timing out
+    queue_timeout_secs: int = 60
+    # Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests
+    rate_limit_tokens_per_second: Optional[int] = None
+    # CORS allowed origins
+    cors_allowed_origins: List[str] = dataclasses.field(default_factory=list)
+    # Retry configuration
+    retry_max_retries: int = 5
+    retry_initial_backoff_ms: int = 50
+    retry_max_backoff_ms: int = 30_000
+    retry_backoff_multiplier: float = 1.5
+    retry_jitter_factor: float = 0.2
+    disable_retries: bool = False
+    # Health check configuration
+    health_failure_threshold: int = 3
+    health_success_threshold: int = 2
+    health_check_timeout_secs: int = 5
+    health_check_interval_secs: int = 60
+    health_check_endpoint: str = "/health"
+    # Circuit breaker configuration
+    cb_failure_threshold: int = 10
+    cb_success_threshold: int = 3
+    cb_timeout_duration_secs: int = 60
+    cb_window_duration_secs: int = 120
+    disable_circuit_breaker: bool = False
+    # Tokenizer configuration
+    model_path: Optional[str] = None
+    tokenizer_path: Optional[str] = None
+
+    @staticmethod
+    def add_cli_args(
+        parser: argparse.ArgumentParser,
+        use_router_prefix: bool = False,
+        exclude_host_port: bool = False,
+    ):
+        """
+        Add router-specific arguments to an argument parser.
+
+        Args:
+            parser: The argument parser to add arguments to
+            use_router_prefix: If True, prefix all arguments with 'router-' to avoid conflicts
+            exclude_host_port: If True, don't add host and port arguments (used when inheriting from server)
+        """
+        prefix = "router-" if use_router_prefix else ""
+
+        # Worker configuration
+        if not exclude_host_port:
+            parser.add_argument(
+                "--host",
+                type=str,
+                default=RouterArgs.host,
+                help="Host address to bind the router server",
+            )
+            parser.add_argument(
+                "--port",
+                type=int,
+                default=RouterArgs.port,
+                help="Port number to bind the router server",
+            )
+
+        parser.add_argument(
+            "--worker-urls",
+            type=str,
+            nargs="*",
+            default=[],
+            help="List of worker URLs (e.g., http://worker1:8000 http://worker2:8000)",
+        )
+
+        # Routing policy configuration
+        parser.add_argument(
+            f"--{prefix}policy",
+            type=str,
+            default=RouterArgs.policy,
+            choices=["random", "round_robin", "cache_aware", "power_of_two"],
+            help="Load balancing policy to use. In PD mode, this is used for both prefill and decode unless overridden",
+        )
+        parser.add_argument(
+            f"--{prefix}prefill-policy",
+            type=str,
+            default=None,
+            choices=["random", "round_robin", "cache_aware", "power_of_two"],
+            help="Specific policy for prefill nodes in PD mode. If not specified, uses the main policy",
+        )
+        parser.add_argument(
+            f"--{prefix}decode-policy",
+            type=str,
+            default=None,
+            choices=["random", "round_robin", "cache_aware", "power_of_two"],
+            help="Specific policy for decode nodes in PD mode. If not specified, uses the main policy",
+        )
+
+        # PD-specific arguments
+        parser.add_argument(
+            f"--{prefix}mini-lb",
+            action="store_true",
+            help="Enable MiniLB",
+        )
+        parser.add_argument(
+            f"--{prefix}pd-disaggregation",
+            action="store_true",
+            help="Enable PD (Prefill-Decode) disaggregated mode",
+        )
+        parser.add_argument(
+            f"--{prefix}prefill",
+            nargs="+",
+            action="append",
+            help="Prefill server URL and optional bootstrap port. Can be specified multiple times. "
+            "Format: --prefill URL [BOOTSTRAP_PORT]. "
+            "BOOTSTRAP_PORT can be a port number, 'none', or omitted (defaults to none).",
+        )
+        parser.add_argument(
+            f"--{prefix}decode",
+            nargs=1,
+            action="append",
+            metavar=("URL",),
+            help="Decode server URL. Can be specified multiple times.",
+        )
+        parser.add_argument(
+            f"--{prefix}worker-startup-timeout-secs",
+            type=int,
+            default=RouterArgs.worker_startup_timeout_secs,
+            help="Timeout in seconds for worker startup",
+        )
+        parser.add_argument(
+            f"--{prefix}worker-startup-check-interval",
+            type=int,
+            default=RouterArgs.worker_startup_check_interval,
+            help="Interval in seconds between checks for worker startup",
+        )
+        parser.add_argument(
+            f"--{prefix}cache-threshold",
+            type=float,
+            default=RouterArgs.cache_threshold,
+            help="Cache threshold (0.0-1.0) for cache-aware routing",
+        )
+        parser.add_argument(
+            f"--{prefix}balance-abs-threshold",
+            type=int,
+            default=RouterArgs.balance_abs_threshold,
+            help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
+        )
+        parser.add_argument(
+            f"--{prefix}balance-rel-threshold",
+            type=float,
+            default=RouterArgs.balance_rel_threshold,
+            help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
+        )
+        parser.add_argument(
+            f"--{prefix}eviction-interval-secs",
+            type=int,
+            default=RouterArgs.eviction_interval_secs,
+            help="Interval in seconds between cache eviction operations",
+        )
+        parser.add_argument(
+            f"--{prefix}max-tree-size",
+            type=int,
+            default=RouterArgs.max_tree_size,
+            help="Maximum size of the approximation tree for cache-aware routing",
+        )
+        parser.add_argument(
+            f"--{prefix}max-payload-size",
+            type=int,
+            default=RouterArgs.max_payload_size,
+            help="Maximum payload size in bytes",
+        )
+        parser.add_argument(
+            f"--{prefix}dp-aware",
+            action="store_true",
+            help="Enable data parallelism aware schedule",
+        )
+        parser.add_argument(
+            f"--{prefix}api-key",
+            type=str,
+            default=None,
+            help="The api key used for the authorization with the worker.  Useful when the dp aware scheduling strategy is enaled.",
+        )
+        parser.add_argument(
+            f"--{prefix}log-dir",
+            type=str,
+            default=None,
+            help="Directory to store log files. If not specified, logs are only output to console.",
+        )
+        parser.add_argument(
+            f"--{prefix}log-level",
+            type=str,
+            default="info",
+            choices=["debug", "info", "warning", "error", "critical"],
+            help="Set the logging level. If not specified, defaults to INFO.",
+        )
+        parser.add_argument(
+            f"--{prefix}service-discovery",
+            action="store_true",
+            help="Enable Kubernetes service discovery",
+        )
+        parser.add_argument(
+            f"--{prefix}selector",
+            type=str,
+            nargs="+",
+            default={},
+            help="Label selector for Kubernetes service discovery (format: key1=value1 key2=value2)",
+        )
+        parser.add_argument(
+            f"--{prefix}service-discovery-port",
+            type=int,
+            default=RouterArgs.service_discovery_port,
+            help="Port to use for discovered worker pods",
+        )
+        parser.add_argument(
+            f"--{prefix}service-discovery-namespace",
+            type=str,
+            help="Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)",
+        )
+        parser.add_argument(
+            f"--{prefix}prefill-selector",
+            type=str,
+            nargs="+",
+            default={},
+            help="Label selector for prefill server pods in PD mode (format: key1=value1 key2=value2)",
+        )
+        parser.add_argument(
+            f"--{prefix}decode-selector",
+            type=str,
+            nargs="+",
+            default={},
+            help="Label selector for decode server pods in PD mode (format: key1=value1 key2=value2)",
+        )
+        # Prometheus configuration
+        parser.add_argument(
+            f"--{prefix}prometheus-port",
+            type=int,
+            default=29000,
+            help="Port to expose Prometheus metrics. If not specified, Prometheus metrics are disabled",
+        )
+        parser.add_argument(
+            f"--{prefix}prometheus-host",
+            type=str,
+            default="127.0.0.1",
+            help="Host address to bind the Prometheus metrics server",
+        )
+        parser.add_argument(
+            f"--{prefix}request-id-headers",
+            type=str,
+            nargs="*",
+            help="Custom HTTP headers to check for request IDs (e.g., x-request-id x-trace-id). If not specified, uses common defaults.",
+        )
+        parser.add_argument(
+            f"--{prefix}request-timeout-secs",
+            type=int,
+            default=RouterArgs.request_timeout_secs,
+            help="Request timeout in seconds",
+        )
+        # Retry configuration
+        parser.add_argument(
+            f"--{prefix}retry-max-retries",
+            type=int,
+            default=RouterArgs.retry_max_retries,
+        )
+        parser.add_argument(
+            f"--{prefix}retry-initial-backoff-ms",
+            type=int,
+            default=RouterArgs.retry_initial_backoff_ms,
+        )
+        parser.add_argument(
+            f"--{prefix}retry-max-backoff-ms",
+            type=int,
+            default=RouterArgs.retry_max_backoff_ms,
+        )
+        parser.add_argument(
+            f"--{prefix}retry-backoff-multiplier",
+            type=float,
+            default=RouterArgs.retry_backoff_multiplier,
+        )
+        parser.add_argument(
+            f"--{prefix}retry-jitter-factor",
+            type=float,
+            default=RouterArgs.retry_jitter_factor,
+        )
+        parser.add_argument(
+            f"--{prefix}disable-retries",
+            action="store_true",
+            help="Disable retries (equivalent to setting retry_max_retries=1)",
+        )
+        # Circuit breaker configuration
+        parser.add_argument(
+            f"--{prefix}cb-failure-threshold",
+            type=int,
+            default=RouterArgs.cb_failure_threshold,
+        )
+        parser.add_argument(
+            f"--{prefix}cb-success-threshold",
+            type=int,
+            default=RouterArgs.cb_success_threshold,
+        )
+        parser.add_argument(
+            f"--{prefix}cb-timeout-duration-secs",
+            type=int,
+            default=RouterArgs.cb_timeout_duration_secs,
+        )
+        parser.add_argument(
+            f"--{prefix}cb-window-duration-secs",
+            type=int,
+            default=RouterArgs.cb_window_duration_secs,
+        )
+        parser.add_argument(
+            f"--{prefix}disable-circuit-breaker",
+            action="store_true",
+            help="Disable circuit breaker (equivalent to setting cb_failure_threshold to u32::MAX)",
+        )
+        # Health check configuration
+        parser.add_argument(
+            f"--{prefix}health-failure-threshold",
+            type=int,
+            default=RouterArgs.health_failure_threshold,
+            help="Number of consecutive health check failures before marking worker unhealthy",
+        )
+        parser.add_argument(
+            f"--{prefix}health-success-threshold",
+            type=int,
+            default=RouterArgs.health_success_threshold,
+            help="Number of consecutive health check successes before marking worker healthy",
+        )
+        parser.add_argument(
+            f"--{prefix}health-check-timeout-secs",
+            type=int,
+            default=RouterArgs.health_check_timeout_secs,
+            help="Timeout in seconds for health check requests",
+        )
+        parser.add_argument(
+            f"--{prefix}health-check-interval-secs",
+            type=int,
+            default=RouterArgs.health_check_interval_secs,
+            help="Interval in seconds between runtime health checks",
+        )
+        parser.add_argument(
+            f"--{prefix}health-check-endpoint",
+            type=str,
+            default=RouterArgs.health_check_endpoint,
+            help="Health check endpoint path",
+        )
+        parser.add_argument(
+            f"--{prefix}max-concurrent-requests",
+            type=int,
+            default=RouterArgs.max_concurrent_requests,
+            help="Maximum number of concurrent requests allowed (for rate limiting)",
+        )
+        parser.add_argument(
+            f"--{prefix}queue-size",
+            type=int,
+            default=RouterArgs.queue_size,
+            help="Queue size for pending requests when max concurrent limit reached (0 = no queue, return 429 immediately)",
+        )
+        parser.add_argument(
+            f"--{prefix}queue-timeout-secs",
+            type=int,
+            default=RouterArgs.queue_timeout_secs,
+            help="Maximum time (in seconds) a request can wait in queue before timing out",
+        )
+        parser.add_argument(
+            f"--{prefix}rate-limit-tokens-per-second",
+            type=int,
+            default=RouterArgs.rate_limit_tokens_per_second,
+            help="Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests",
+        )
+        parser.add_argument(
+            f"--{prefix}cors-allowed-origins",
+            type=str,
+            nargs="*",
+            default=[],
+            help="CORS allowed origins (e.g., http://localhost:3000 https://example.com)",
+        )
+        # Tokenizer configuration
+        parser.add_argument(
+            f"--{prefix}model-path",
+            type=str,
+            default=None,
+            help="Model path for loading tokenizer (HuggingFace model ID or local path)",
+        )
+        parser.add_argument(
+            f"--{prefix}tokenizer-path",
+            type=str,
+            default=None,
+            help="Explicit tokenizer path (overrides model_path tokenizer if provided)",
+        )
+
+    @classmethod
+    def from_cli_args(
+        cls, args: argparse.Namespace, use_router_prefix: bool = False
+    ) -> "RouterArgs":
+        """
+        Create RouterArgs instance from parsed command line arguments.
+
+        Args:
+            args: Parsed command line arguments
+            use_router_prefix: If True, look for arguments with 'router-' prefix
+        """
+        prefix = "router_" if use_router_prefix else ""
+        cli_args_dict = vars(args)
+        args_dict = {}
+
+        for attr in dataclasses.fields(cls):
+            # Auto strip prefix from args
+            if f"{prefix}{attr.name}" in cli_args_dict:
+                args_dict[attr.name] = cli_args_dict[f"{prefix}{attr.name}"]
+            elif attr.name in cli_args_dict:
+                args_dict[attr.name] = cli_args_dict[attr.name]
+
+        # parse special arguments and remove "--prefill" and "--decode" from cli_args_dict
+        args_dict["prefill_urls"] = cls._parse_prefill_urls(
+            cli_args_dict.get(f"{prefix}prefill", None)
+        )
+        args_dict["decode_urls"] = cls._parse_decode_urls(
+            cli_args_dict.get(f"{prefix}decode", None)
+        )
+        args_dict["selector"] = cls._parse_selector(
+            cli_args_dict.get(f"{prefix}selector", None)
+        )
+        args_dict["prefill_selector"] = cls._parse_selector(
+            cli_args_dict.get(f"{prefix}prefill_selector", None)
+        )
+        args_dict["decode_selector"] = cls._parse_selector(
+            cli_args_dict.get(f"{prefix}decode_selector", None)
+        )
+
+        # Mooncake-specific annotation
+        args_dict["bootstrap_port_annotation"] = "sglang.ai/bootstrap-port"
+
+        return cls(**args_dict)
+
+    def _validate_router_args(self):
+        # Validate configuration based on mode
+        if self.pd_disaggregation:
+            # Validate PD configuration - skip URL requirements if using service discovery
+            if not self.service_discovery:
+                if not self.prefill_urls:
+                    raise ValueError("PD disaggregation mode requires --prefill")
+                if not self.decode_urls:
+                    raise ValueError("PD disaggregation mode requires --decode")
+
+            # Warn about policy usage in PD mode
+            if self.prefill_policy and self.decode_policy and self.policy:
+                logger.warning(
+                    "Both --prefill-policy and --decode-policy are specified. "
+                    "The main --policy flag will be ignored for PD mode."
+                )
+            elif self.prefill_policy and not self.decode_policy and self.policy:
+                logger.info(
+                    f"Using --prefill-policy '{self.prefill_policy}' for prefill nodes "
+                    f"and --policy '{self.policy}' for decode nodes."
+                )
+            elif self.decode_policy and not self.prefill_policy and self.policy:
+                logger.info(
+                    f"Using --policy '{self.policy}' for prefill nodes "
+                    f"and --decode-policy '{self.decode_policy}' for decode nodes."
+                )
+
+    @staticmethod
+    def _parse_selector(selector_list):
+        if not selector_list:
+            return {}
+
+        selector = {}
+        for item in selector_list:
+            if "=" in item:
+                key, value = item.split("=", 1)
+                selector[key] = value
+        return selector
+
+    @staticmethod
+    def _parse_prefill_urls(prefill_list):
+        """Parse prefill URLs from --prefill arguments.
+
+        Format: --prefill URL [BOOTSTRAP_PORT]
+        Example:
+            --prefill http://prefill1:8080 9000  # With bootstrap port
+            --prefill http://prefill2:8080 none  # Explicitly no bootstrap port
+            --prefill http://prefill3:8080       # Defaults to no bootstrap port
+        """
+        if not prefill_list:
+            return []
+
+        prefill_urls = []
+        for prefill_args in prefill_list:
+
+            url = prefill_args[0]
+
+            # Handle optional bootstrap port
+            if len(prefill_args) >= 2:
+                bootstrap_port_str = prefill_args[1]
+                # Handle 'none' as None
+                if bootstrap_port_str.lower() == "none":
+                    bootstrap_port = None
+                else:
+                    try:
+                        bootstrap_port = int(bootstrap_port_str)
+                    except ValueError:
+                        raise ValueError(
+                            f"Invalid bootstrap port: {bootstrap_port_str}. Must be a number or 'none'"
+                        )
+            else:
+                # No bootstrap port specified, default to None
+                bootstrap_port = None
+
+            prefill_urls.append((url, bootstrap_port))
+
+        return prefill_urls
+
+    @staticmethod
+    def _parse_decode_urls(decode_list):
+        """Parse decode URLs from --decode arguments.
+
+        Format: --decode URL
+        Example: --decode http://decode1:8081 --decode http://decode2:8081
+        """
+        if not decode_list:
+            return []
+
+        # decode_list is a list of single-element lists due to nargs=1
+        return [url[0] for url in decode_list]
diff --git a/sgl-router/py_src/sglang_router/version.py b/sgl-router/py_src/sglang_router/version.py
new file mode 100644
index 000000000..c11f861af
--- /dev/null
+++ b/sgl-router/py_src/sglang_router/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.9"
diff --git a/sgl-router/py_test/__init__.py b/sgl-router/py_test/__init__.py
new file mode 100644
index 000000000..893097780
--- /dev/null
+++ b/sgl-router/py_test/__init__.py
@@ -0,0 +1 @@
+"""Test package root for router Python tests."""
diff --git a/sgl-router/py_test/conftest.py b/sgl-router/py_test/conftest.py
new file mode 100644
index 000000000..894e12bf5
--- /dev/null
+++ b/sgl-router/py_test/conftest.py
@@ -0,0 +1,8 @@
+import sys
+from pathlib import Path
+
+# Ensure local sources in py_src are importable ahead of any installed package
+_ROOT = Path(__file__).resolve().parents[1]
+_SRC = _ROOT / "py_src"
+if str(_SRC) not in sys.path:
+    sys.path.insert(0, str(_SRC))
diff --git a/sgl-router/py_test/e2e/conftest.py b/sgl-router/py_test/e2e/conftest.py
new file mode 100644
index 000000000..7987f328c
--- /dev/null
+++ b/sgl-router/py_test/e2e/conftest.py
@@ -0,0 +1,512 @@
+import json
+import logging
+import os
+import shutil
+import signal
+import socket
+import subprocess
+import time
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Callable, Optional
+from urllib.parse import urlparse
+
+import pytest
+import requests
+
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _find_available_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def _parse_url(base_url: str) -> tuple[str, str]:
+    """Parse a base URL and return (host, port) as strings.
+
+    This is more robust than simple string splitting and supports different schemes
+    and URL shapes like trailing paths.
+    """
+    parsed = urlparse(base_url)
+    return parsed.hostname or "127.0.0.1", (
+        str(parsed.port) if parsed.port is not None else ""
+    )
+
+
+def _wait_router_health(base_url: str, timeout: float) -> None:
+    start = time.perf_counter()
+    with requests.Session() as session:
+        while time.perf_counter() - start < timeout:
+            try:
+                r = session.get(f"{base_url}/health", timeout=5)
+                if r.status_code == 200:
+                    return
+            except requests.RequestException:
+                pass
+            time.sleep(2)
+    raise TimeoutError("Router failed to become healthy in time")
+
+
+def _popen_launch_router(
+    model: str,
+    base_url: str,
+    dp_size: int,
+    timeout: float,
+    policy: str = "cache_aware",
+) -> subprocess.Popen:
+    host, port = _parse_url(base_url)
+
+    prom_port = _find_available_port()
+
+    cmd = [
+        "python3",
+        "-m",
+        "sglang_router.launch_server",
+        "--model-path",
+        model,
+        "--host",
+        host,
+        "--port",
+        port,
+        "--dp",
+        str(dp_size),
+        "--router-policy",
+        policy,
+        "--allow-auto-truncate",
+        "--router-prometheus-port",
+        str(prom_port),
+        "--router-prometheus-host",
+        "127.0.0.1",
+    ]
+
+    proc = subprocess.Popen(cmd)
+    _wait_router_health(base_url, timeout)
+    return proc
+
+
+def _popen_launch_worker(
+    model: str,
+    base_url: str,
+    *,
+    dp_size: int | None = None,
+    api_key: str | None = None,
+    base_gpu_id: int | None = 0,
+) -> subprocess.Popen:
+    host, port = _parse_url(base_url)
+
+    cmd = [
+        "python3",
+        "-m",
+        "sglang.launch_server",
+        "--model-path",
+        model,
+        "--host",
+        host,
+        "--port",
+        port,
+        "--base-gpu-id",
+        str(base_gpu_id or 0),
+    ]
+    if dp_size is not None:
+        cmd += ["--dp-size", str(dp_size)]
+    if api_key is not None:
+        cmd += ["--api-key", api_key]
+    return subprocess.Popen(cmd)
+
+
+def _popen_launch_router_only(
+    base_url: str,
+    policy: str = "round_robin",
+    timeout: float = 120.0,
+    *,
+    dp_aware: bool = False,
+    api_key: str | None = None,
+) -> subprocess.Popen:
+    host, port = _parse_url(base_url)
+
+    prom_port = _find_available_port()
+    cmd = [
+        "python3",
+        "-m",
+        "sglang_router.launch_router",
+        "--host",
+        host,
+        "--port",
+        port,
+        "--policy",
+        policy,
+    ]
+    if dp_aware:
+        cmd += ["--dp-aware"]
+    if api_key is not None:
+        cmd += ["--api-key", api_key]
+    cmd += [
+        "--prometheus-port",
+        str(prom_port),
+        "--prometheus-host",
+        "127.0.0.1",
+    ]
+    proc = subprocess.Popen(cmd)
+    _wait_router_health(base_url, timeout)
+    return proc
+
+
+def _terminate(proc: subprocess.Popen, timeout: float = 120) -> None:
+    if proc is None:
+        return
+    proc.terminate()
+    start = time.perf_counter()
+    while proc.poll() is None:
+        if time.perf_counter() - start > timeout:
+            proc.kill()
+            break
+        time.sleep(1)
+
+
+def _which(cmd: str) -> Optional[str]:
+    try:
+        return shutil.which(cmd)
+    except Exception as e:
+        logger.warning("shutil.which(%r) failed: %s", cmd, e)
+        return None
+
+
+def _graceful_stop_popen(p: subprocess.Popen) -> None:
+    if p is None:
+        return
+    try:
+        if p.poll() is None:
+            p.terminate()
+            for _ in range(5):
+                if p.poll() is not None:
+                    break
+                time.sleep(1)
+            if p.poll() is None:
+                p.kill()
+    except Exception as e:
+        logger.warning("Exception during graceful stop of popen: %s", e)
+
+
+def _pid_alive(pid: int) -> bool:
+    try:
+        os.kill(pid, 0)
+        return True
+    except Exception:
+        return False
+
+
+def _graceful_stop_pid(pid: int) -> None:
+    try:
+        if _pid_alive(pid):
+            try:
+                os.kill(pid, signal.SIGTERM)
+            except Exception:
+                pass
+            for _ in range(5):
+                if not _pid_alive(pid):
+                    break
+                time.sleep(1)
+            if _pid_alive(pid):
+                try:
+                    os.kill(pid, signal.SIGKILL)
+                except Exception:
+                    pass
+    except Exception:
+        pass
+
+
+def _graceful_stop_any(obj) -> None:
+    try:
+        if isinstance(obj, subprocess.Popen):
+            _graceful_stop_popen(obj)
+            return
+        if isinstance(obj, int):
+            _graceful_stop_pid(obj)
+            return
+        proc_obj = getattr(obj, "proc", None)
+        if isinstance(proc_obj, subprocess.Popen):
+            _graceful_stop_popen(proc_obj)
+    except Exception:
+        pass
+
+
+@pytest.fixture(scope="session")
+def genai_bench_runner() -> Callable[..., None]:
+    """Provide a callable to run genai-bench and validate metrics.
+
+    Usage in tests:
+      def test(..., genai_bench_runner):
+          genai_bench_runner(router_url=..., model_path=..., experiment_folder=...)
+    """
+
+    def _run(
+        *,
+        router_url: str,
+        model_path: str,
+        experiment_folder: str,
+        timeout_sec: int | None = None,
+        thresholds: dict | None = None,
+        extra_env: dict | None = None,
+        num_concurrency: int = 32,
+        traffic_scenario: str = "D(4000,100)",
+        max_requests_per_run: int | None = None,
+        clean_experiment: bool = True,
+        kill_procs: list | None = None,
+        drain_delay_sec: int = 6,
+    ) -> None:
+        cli = _which("genai-bench")
+        if not cli:
+            pytest.fail(
+                "genai-bench CLI not found; please install it to run benchmarks"
+            )
+
+        # Clean previous experiment folder under current working directory
+        if clean_experiment:
+            exp_dir = Path.cwd() / experiment_folder
+            if exp_dir.exists():
+                shutil.rmtree(exp_dir, ignore_errors=True)
+
+        # Default requests per run if not provided
+        mrr = (
+            max_requests_per_run
+            if max_requests_per_run is not None
+            else num_concurrency * 3
+        )
+
+        cmd = [
+            cli,
+            "benchmark",
+            "--api-backend",
+            "openai",
+            "--api-base",
+            router_url,
+            "--api-key",
+            "dummy-token",
+            "--api-model-name",
+            model_path,
+            "--model-tokenizer",
+            model_path,
+            "--task",
+            "text-to-text",
+            "--num-concurrency",
+            str(num_concurrency),
+            "--traffic-scenario",
+            traffic_scenario,
+            "--max-requests-per-run",
+            str(mrr),
+            "--max-time-per-run",
+            "2",
+            "--experiment-folder-name",
+            experiment_folder,
+            "--experiment-base-dir",
+            str(Path.cwd()),
+        ]
+
+        env = os.environ.copy()
+        if extra_env:
+            env.update(extra_env)
+
+        to = timeout_sec or int(os.environ.get("GENAI_BENCH_TEST_TIMEOUT", "120"))
+        proc = subprocess.Popen(
+            cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+        )
+        stdout = stderr = ""
+        rc = None
+        try:
+            try:
+                stdout, stderr = proc.communicate(timeout=to)
+            except subprocess.TimeoutExpired:
+                # Simple: kill the CLI process if it doesn't exit in time
+                try:
+                    proc.kill()
+                except Exception:
+                    pass
+                stdout, stderr = proc.communicate()
+            rc = proc.returncode
+
+            # Prefer exact path under cwd; fallback to rglob search
+            base = Path.cwd()
+            direct = base / experiment_folder
+            candidates = [direct] if direct.is_dir() else []
+            if not candidates:
+                for p in base.rglob(experiment_folder):
+                    if p.is_dir() and p.name == experiment_folder:
+                        candidates = [p]
+                        break
+            if not candidates:
+                raise AssertionError(
+                    "Benchmark failed: experiment folder not found: "
+                    f"{experiment_folder}\nExit code: {rc}\nSTDOUT (tail):\n{stdout[-1000:]}\nSTDERR (tail):\n{stderr[-1000:]}"
+                )
+            actual_folder = candidates[0]
+
+            json_files = [
+                p
+                for p in actual_folder.rglob("*.json")
+                if "experiment_metadata" not in p.name
+            ]
+            if not json_files:
+                raise AssertionError(
+                    "Benchmark failed: no JSON results found\n"
+                    f"Exit code: {rc}\nSTDOUT (tail):\n{stdout[-1000:]}\nSTDERR (tail):\n{stderr[-1000:]}"
+                )
+
+            th = thresholds  # None means "log only", no validation
+
+            for jf in json_files:
+                with jf.open("r") as f:
+                    data = json.load(f)
+                stats = data.get("aggregated_metrics", {}).get("stats", {})
+            ttft_mean = float(stats.get("ttft", {}).get("mean", float("inf")))
+            e2e_latency_mean = float(
+                stats.get("e2e_latency", {}).get("mean", float("inf"))
+            )
+            input_tp_mean = float(stats.get("input_throughput", {}).get("mean", 0.0))
+            output_tp_mean = float(stats.get("output_throughput", {}).get("mean", 0.0))
+
+            logger.info(
+                "genai-bench[%s] %s ttft_mean=%.3fs e2e_latency_mean=%.3fs input_tp_mean=%.1f tok/s output_tp_mean=%.1f tok/s",
+                experiment_folder,
+                jf.name,
+                ttft_mean,
+                e2e_latency_mean,
+                input_tp_mean,
+                output_tp_mean,
+            )
+
+            if th is not None:
+                assert (
+                    ttft_mean <= th["ttft_mean_max"]
+                ), f"TTFT validation failed: {ttft_mean} > {th['ttft_mean_max']} (file={jf.name})"
+                assert (
+                    e2e_latency_mean <= th["e2e_latency_mean_max"]
+                ), f"E2E latency validation failed: {e2e_latency_mean} > {th['e2e_latency_mean_max']} (file={jf.name})"
+                assert (
+                    input_tp_mean >= th["input_throughput_mean_min"]
+                ), f"Input throughput validation failed: {input_tp_mean} < {th['input_throughput_mean_min']} (file={jf.name})"
+                assert (
+                    output_tp_mean >= th["output_throughput_mean_min"]
+                ), f"Output throughput validation failed: {output_tp_mean} < {th['output_throughput_mean_min']} (file={jf.name})"
+
+        finally:
+            # Always attempt to stop workers to avoid resource leakage
+            if kill_procs:
+                # Give router/workers a small grace period to finish any last drains
+                if drain_delay_sec > 0:
+                    try:
+                        time.sleep(drain_delay_sec)
+                    except Exception:
+                        pass
+                for p in kill_procs:
+                    _graceful_stop_any(p)
+                try:
+                    time.sleep(2)
+                except Exception:
+                    pass
+
+    return _run
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "e2e: mark as end-to-end test")
+
+
+@pytest.fixture(scope="session")
+def e2e_model() -> str:
+    # Always use the default test model
+    return DEFAULT_MODEL_NAME_FOR_TEST
+
+
+@pytest.fixture
+def e2e_router(e2e_model: str):
+    # Keep this available but tests below use router-only to avoid GPU contention
+    base_url = DEFAULT_URL_FOR_TEST
+    proc = _popen_launch_router(
+        e2e_model, base_url, dp_size=2, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+    )
+    try:
+        yield SimpleNamespace(proc=proc, url=base_url)
+    finally:
+        _terminate(proc)
+
+
+@pytest.fixture
+def e2e_router_only_rr():
+    port = _find_available_port()
+    base_url = f"http://127.0.0.1:{port}"
+    proc = _popen_launch_router_only(base_url, policy="round_robin")
+    try:
+        yield SimpleNamespace(proc=proc, url=base_url)
+    finally:
+        _terminate(proc)
+
+
+@pytest.fixture(scope="session")
+def e2e_primary_worker(e2e_model: str):
+    port = _find_available_port()
+    base_url = f"http://127.0.0.1:{port}"
+    proc = _popen_launch_worker(e2e_model, base_url)
+    # Router health gate will handle worker readiness
+    try:
+        yield SimpleNamespace(proc=proc, url=base_url)
+    finally:
+        _terminate(proc)
+
+
+@pytest.fixture
+def e2e_router_only_rr_dp_aware_api():
+    """Router-only with dp-aware enabled and an API key."""
+    port = _find_available_port()
+    base_url = f"http://127.0.0.1:{port}"
+    api_key = "secret"
+    proc = _popen_launch_router_only(
+        base_url, policy="round_robin", timeout=180.0, dp_aware=True, api_key=api_key
+    )
+    try:
+        yield SimpleNamespace(proc=proc, url=base_url, api_key=api_key)
+    finally:
+        _terminate(proc)
+
+
+@pytest.fixture
+def e2e_worker_dp2_api(e2e_model: str, e2e_router_only_rr_dp_aware_api):
+    """Worker with dp-size=2 and the same API key as the dp-aware router."""
+    port = _find_available_port()
+    base_url = f"http://127.0.0.1:{port}"
+    api_key = e2e_router_only_rr_dp_aware_api.api_key
+    proc = _popen_launch_worker(e2e_model, base_url, dp_size=2, api_key=api_key)
+    try:
+        yield SimpleNamespace(proc=proc, url=base_url)
+    finally:
+        _terminate(proc)
+
+
+@pytest.fixture(scope="session")
+def e2e_two_workers_dp2(e2e_model: str):
+    """Launch two workers, each with dp_size=2, mapped to GPUs [0,1] and [2,3]."""
+    workers = []
+    try:
+        # Worker A on GPUs 0-1
+        port_a = _find_available_port()
+        url_a = f"http://127.0.0.1:{port_a}"
+        proc_a = _popen_launch_worker(e2e_model, url_a, dp_size=2, base_gpu_id=0)
+        workers.append(SimpleNamespace(proc=proc_a, url=url_a))
+
+        # Worker B on GPUs 2-3
+        port_b = _find_available_port()
+        url_b = f"http://127.0.0.1:{port_b}"
+        proc_b = _popen_launch_worker(e2e_model, url_b, dp_size=2, base_gpu_id=2)
+        workers.append(SimpleNamespace(proc=proc_b, url=url_b))
+
+        yield workers
+    finally:
+        for w in workers:
+            _terminate(w.proc)
diff --git a/sgl-router/py_test/e2e/test_pd_router.py b/sgl-router/py_test/e2e/test_pd_router.py
new file mode 100644
index 000000000..f6a73cd01
--- /dev/null
+++ b/sgl-router/py_test/e2e/test_pd_router.py
@@ -0,0 +1,262 @@
+import logging
+import os
+import socket
+import subprocess
+import time
+from types import SimpleNamespace
+from typing import Optional
+
+import pytest
+import requests
+
+from sglang.test.run_eval import run_eval
+
+logger = logging.getLogger(__name__)
+
+
+def _find_available_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def _wait_health(url: str, timeout: float = 180.0) -> None:
+    start = time.perf_counter()
+    with requests.Session() as session:
+        while time.perf_counter() - start < timeout:
+            try:
+                r = session.get(f"{url}/health", timeout=5)
+                if r.status_code == 200:
+                    return
+            except requests.RequestException:
+                pass
+            time.sleep(1)
+    raise TimeoutError(f"Service at {url} failed to become healthy in time")
+
+
+def _detect_ib_device() -> Optional[str]:
+    """Return first active IB device name (e.g., mlx5_0) or None if unavailable."""
+    # Fast check that ibv_devinfo exists
+    try:
+        subprocess.run(
+            ["ibv_devinfo", "-l"],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            timeout=1,
+        )
+    except (FileNotFoundError, subprocess.TimeoutExpired):
+        return None
+
+    for i in range(12):
+        dev = f"mlx5_{i}"
+        try:
+            res = subprocess.run(
+                ["ibv_devinfo", dev],
+                capture_output=True,
+                text=True,
+                timeout=2,
+            )
+            if res.returncode == 0 and ("state:" in res.stdout):
+                for line in res.stdout.splitlines():
+                    if "state:" in line and "PORT_ACTIVE" in line:
+                        return dev
+        except Exception:
+            pass
+    return None
+
+
+def _popen_launch_prefill_worker(
+    model: str,
+    bootstrap_port: int,
+    ib_device: Optional[str] = None,
+    base_gpu_id: int = 0,
+) -> SimpleNamespace:
+    port = _find_available_port()
+    url = f"http://127.0.0.1:{port}"
+    cmd = [
+        "python3",
+        "-m",
+        "sglang.launch_server",
+        "--model-path",
+        model,
+        "--disaggregation-mode",
+        "prefill",
+        "--host",
+        "127.0.0.1",
+        "--port",
+        str(port),
+        "--disaggregation-bootstrap-port",
+        str(bootstrap_port),
+        "--base-gpu-id",
+        str(base_gpu_id),
+    ]
+    if ib_device:
+        cmd += ["--disaggregation-ib-device", ib_device]
+    proc = subprocess.Popen(cmd)
+    _wait_health(url, timeout=300.0)
+    return SimpleNamespace(proc=proc, url=url, bootstrap_port=bootstrap_port)
+
+
+def _popen_launch_decode_worker(
+    model: str, ib_device: Optional[str] = None, base_gpu_id: int = 0
+) -> SimpleNamespace:
+    port = _find_available_port()
+    url = f"http://127.0.0.1:{port}"
+    cmd = [
+        "python3",
+        "-m",
+        "sglang.launch_server",
+        "--model-path",
+        model,
+        "--disaggregation-mode",
+        "decode",
+        "--host",
+        "127.0.0.1",
+        "--port",
+        str(port),
+        "--base-gpu-id",
+        str(base_gpu_id),
+    ]
+    if ib_device:
+        cmd += ["--disaggregation-ib-device", ib_device]
+    proc = subprocess.Popen(cmd)
+    _wait_health(url, timeout=300.0)
+    return SimpleNamespace(proc=proc, url=url)
+
+
+def _terminate(proc: subprocess.Popen, timeout: float = 120) -> None:
+    if proc is None:
+        return
+    proc.terminate()
+    start = time.perf_counter()
+    while proc.poll() is None:
+        if time.perf_counter() - start > timeout:
+            proc.kill()
+            break
+        time.sleep(1)
+
+
+@pytest.fixture(scope="module")
+def pd_cluster(e2e_model: str):
+    """Start 2 prefill + 2 decode workers and one PD router, once per module."""
+    # Environment capability checks: require sgl_kernel and GPU backend
+    try:
+        import sgl_kernel  # noqa: F401
+    except Exception as e:  # pragma: no cover - environment dependent
+        pytest.fail(f"PD e2e requires sgl_kernel but it is not available: {e}")
+
+    try:
+        import torch  # noqa: F401
+    except Exception as e:  # pragma: no cover - environment dependent
+        pytest.fail(
+            f"PD e2e requires torch but it is not available or misconfigured: {e}"
+        )
+
+    if not torch.cuda.is_available():  # pragma: no cover - environment dependent
+        pytest.fail("PD e2e requires CUDA backend, but CUDA is not available")
+
+    workers: list[SimpleNamespace] = []
+    router_proc = None
+    try:
+        ib_device = _detect_ib_device()
+
+        # Launch 4 workers across 4 GPUs: prefill on 0,1 and decode on 2,3
+        pf1 = _popen_launch_prefill_worker(
+            e2e_model,
+            bootstrap_port=_find_available_port(),
+            ib_device=ib_device,
+            base_gpu_id=0,
+        )
+        pf2 = _popen_launch_prefill_worker(
+            e2e_model,
+            bootstrap_port=_find_available_port(),
+            ib_device=ib_device,
+            base_gpu_id=1,
+        )
+        dc1 = _popen_launch_decode_worker(e2e_model, ib_device=ib_device, base_gpu_id=2)
+        dc2 = _popen_launch_decode_worker(e2e_model, ib_device=ib_device, base_gpu_id=3)
+        prefills = [pf1, pf2]
+        decodes = [dc1, dc2]
+        workers.extend(prefills + decodes)
+
+        # PD router with two prefill and two decode endpoints
+        rport = _find_available_port()
+        router_url = f"http://127.0.0.1:{rport}"
+        pport = _find_available_port()
+
+        prefill = [(pf.url, pf.bootstrap_port) for pf in prefills]
+        decode = [dc.url for dc in decodes]
+
+        cmd = [
+            "python3",
+            "-m",
+            "sglang_router.launch_router",
+            "--host",
+            "127.0.0.1",
+            "--port",
+            str(rport),
+            "--policy",
+            "round_robin",
+            "--pd-disaggregation",
+        ]
+        for url, bport in prefill:
+            cmd += ["--prefill", url, str(bport)]
+        for url in decode:
+            cmd += ["--decode", url]
+        cmd += [
+            "--prometheus-port",
+            str(pport),
+            "--prometheus-host",
+            "127.0.0.1",
+        ]
+
+        router_proc = subprocess.Popen(cmd)
+        _wait_health(router_url, timeout=180.0)
+
+        yield SimpleNamespace(
+            router_url=router_url, workers=workers, router_proc=router_proc
+        )
+    finally:
+        if router_proc is not None:
+            _terminate(router_proc)
+        for w in workers:
+            _terminate(w.proc)
+
+
+@pytest.mark.e2e
+def test_pd_mmlu(e2e_model: str, pd_cluster):
+    """
+    Launch 4 workers, start a PD router (2 prefill + 2 decode), then run MMLU.
+    """
+    args = SimpleNamespace(
+        base_url=pd_cluster.router_url,
+        model=e2e_model,
+        eval_name="mmlu",
+        num_examples=64,
+        num_threads=32,
+        temperature=0.1,
+    )
+    metrics = run_eval(args)
+    assert metrics["score"] >= 0.65
+
+
+@pytest.mark.e2e
+def test_pd_genai_bench(e2e_model: str, pd_cluster, genai_bench_runner):
+    """
+    Launch 4 workers, start a PD router (2 prefill + 2 decode), then run a
+    short genai-bench benchmark and validate aggregate metrics.
+    """
+    # Run genai-bench against the shared router
+    policy_label = "benchmark_round_robin_pd"
+    genai_bench_runner(
+        router_url=pd_cluster.router_url,
+        model_path=e2e_model,
+        experiment_folder=policy_label,
+        thresholds={
+            "ttft_mean_max": 12,
+            "e2e_latency_mean_max": 15,
+            "input_throughput_mean_min": 400,
+            "output_throughput_mean_min": 20,
+        },
+        kill_procs=pd_cluster.workers,
+    )
diff --git a/sgl-router/py_test/e2e/test_regular_router.py b/sgl-router/py_test/e2e/test_regular_router.py
new file mode 100644
index 000000000..856ecda72
--- /dev/null
+++ b/sgl-router/py_test/e2e/test_regular_router.py
@@ -0,0 +1,169 @@
+import threading
+import time
+from types import SimpleNamespace
+
+import pytest
+import requests
+
+from sglang.test.run_eval import run_eval
+
+
+@pytest.mark.e2e
+def test_mmlu(e2e_router_only_rr, e2e_two_workers_dp2, e2e_model):
+    # Attach two dp=2 workers (total 4 GPUs) to a fresh router-only instance
+    base = e2e_router_only_rr.url
+    for w in e2e_two_workers_dp2:
+        r = requests.post(f"{base}/add_worker", params={"url": w.url}, timeout=180)
+        r.raise_for_status()
+
+    args = SimpleNamespace(
+        base_url=base,
+        model=e2e_model,
+        eval_name="mmlu",
+        num_examples=64,
+        num_threads=32,
+        temperature=0.1,
+    )
+    metrics = run_eval(args)
+    assert metrics["score"] >= 0.65
+
+
+@pytest.mark.e2e
+def test_genai_bench(
+    e2e_router_only_rr, e2e_two_workers_dp2, e2e_model, genai_bench_runner
+):
+    """Attach a worker to the regular router and run a short genai-bench."""
+    base = e2e_router_only_rr.url
+    for w in e2e_two_workers_dp2:
+        r = requests.post(f"{base}/add_worker", params={"url": w.url}, timeout=180)
+        r.raise_for_status()
+
+    genai_bench_runner(
+        router_url=base,
+        model_path=e2e_model,
+        experiment_folder="benchmark_round_robin_regular",
+        thresholds={
+            "ttft_mean_max": 6,
+            "e2e_latency_mean_max": 14,
+            "input_throughput_mean_min": 1000,
+            "output_throughput_mean_min": 12,
+        },
+        kill_procs=e2e_two_workers_dp2,
+    )
+
+
+@pytest.mark.e2e
+def test_add_and_remove_worker_live(e2e_router_only_rr, e2e_primary_worker, e2e_model):
+    base = e2e_router_only_rr.url
+    worker_url = e2e_primary_worker.url
+
+    r = requests.post(f"{base}/add_worker", params={"url": worker_url}, timeout=180)
+    r.raise_for_status()
+
+    with requests.Session() as s:
+        for i in range(8):
+            r = s.post(
+                f"{base}/v1/completions",
+                json={
+                    "model": e2e_model,
+                    "prompt": f"x{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+                timeout=120,
+            )
+            r.raise_for_status()
+
+    # Remove the worker
+    r = requests.post(f"{base}/remove_worker", params={"url": worker_url}, timeout=60)
+    r.raise_for_status()
+
+
+@pytest.mark.e2e
+def test_lazy_fault_tolerance_live(e2e_router_only_rr, e2e_primary_worker, e2e_model):
+    base = e2e_router_only_rr.url
+    worker = e2e_primary_worker
+
+    r = requests.post(f"{base}/add_worker", params={"url": worker.url}, timeout=180)
+    r.raise_for_status()
+
+    def killer():
+        time.sleep(10)
+        try:
+            worker.proc.terminate()
+        except Exception:
+            pass
+
+    t = threading.Thread(target=killer, daemon=True)
+    t.start()
+
+    args = SimpleNamespace(
+        base_url=base,
+        model=e2e_model,
+        eval_name="mmlu",
+        num_examples=32,
+        num_threads=16,
+        temperature=0.0,
+    )
+    metrics = run_eval(args)
+    assert 0.0 <= metrics["score"] <= 1.0
+
+
+@pytest.mark.e2e
+def test_dp_aware_worker_expansion_and_api_key(
+    e2e_model,
+    e2e_router_only_rr_dp_aware_api,
+    e2e_worker_dp2_api,
+):
+    """
+    Launch a router-only instance in dp_aware mode and a single worker with dp_size=2
+    and API key protection. Verify expansion, auth enforcement, and basic eval.
+    """
+    import os
+
+    router_url = e2e_router_only_rr_dp_aware_api.url
+    worker_url = e2e_worker_dp2_api.url
+    api_key = e2e_router_only_rr_dp_aware_api.api_key
+
+    # Attach worker; router should expand to dp_size logical workers
+    r = requests.post(
+        f"{router_url}/add_worker", params={"url": worker_url}, timeout=180
+    )
+    r.raise_for_status()
+
+    r = requests.get(f"{router_url}/list_workers", timeout=30)
+    r.raise_for_status()
+    urls = r.json().get("urls", [])
+    assert len(urls) == 2
+    assert set(urls) == {f"{worker_url}@0", f"{worker_url}@1"}
+
+    # Verify API key enforcement path-through
+    # 1) Without Authorization -> 401 from backend
+    r = requests.post(
+        f"{router_url}/v1/completions",
+        json={"model": e2e_model, "prompt": "hi", "max_tokens": 1},
+        timeout=60,
+    )
+    assert r.status_code == 401
+
+    # 2) With correct Authorization -> 200
+    r = requests.post(
+        f"{router_url}/v1/completions",
+        json={"model": e2e_model, "prompt": "hi", "max_tokens": 1},
+        headers={"Authorization": f"Bearer {api_key}"},
+        timeout=60,
+    )
+    assert r.status_code == 200
+
+    # Finally, run MMLU eval through the router with auth
+    os.environ["OPENAI_API_KEY"] = api_key
+    args = SimpleNamespace(
+        base_url=router_url,
+        model=e2e_model,
+        eval_name="mmlu",
+        num_examples=64,
+        num_threads=32,
+        temperature=0.1,
+    )
+    metrics = run_eval(args)
+    assert metrics["score"] >= 0.65
diff --git a/sgl-router/py_test/fixtures/__init__.py b/sgl-router/py_test/fixtures/__init__.py
new file mode 100644
index 000000000..4ac754df8
--- /dev/null
+++ b/sgl-router/py_test/fixtures/__init__.py
@@ -0,0 +1 @@
+"""Shared fixtures for router integration tests."""
diff --git a/sgl-router/py_test/fixtures/mock_worker.py b/sgl-router/py_test/fixtures/mock_worker.py
new file mode 100644
index 000000000..13b3c71a6
--- /dev/null
+++ b/sgl-router/py_test/fixtures/mock_worker.py
@@ -0,0 +1,252 @@
+"""
+Lightweight mock worker HTTP server for router integration tests.
+
+Implements minimal endpoints used by the router:
+- GET /health, /health_generate
+- POST /generate, /v1/completions, /v1/chat/completions
+- POST /flush_cache
+- GET /get_server_info, /get_model_info, /v1/models
+
+Behavior knobs are controlled via CLI flags to simulate failures, latency, and load.
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import random
+import signal
+import sys
+import time
+from contextlib import asynccontextmanager
+from typing import Optional
+
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, PlainTextResponse, StreamingResponse
+
+# Global state (per-process)
+_inflight = 0
+_failures_seen = 0
+
+
+def _parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser()
+    p.add_argument("--host", default="127.0.0.1")
+    p.add_argument("--port", type=int, required=True)
+    p.add_argument("--worker-id", default=None)
+    p.add_argument("--latency-ms", type=int, default=0)
+    p.add_argument("--timeout", action="store_true")
+    p.add_argument("--status-code", type=int, default=200)
+    p.add_argument("--fail-first-n", type=int, default=0)
+    p.add_argument("--random-fail-rate", type=float, default=0.0)
+    p.add_argument("--require-api-key", action="store_true")
+    p.add_argument("--api-key", default=None)
+    p.add_argument("--max-payload-bytes", type=int, default=10 * 1024 * 1024)
+    p.add_argument("--stream", action="store_true")
+    p.add_argument("--dp-size", type=int, default=1)
+    p.add_argument("--crash-on-request", action="store_true")
+    p.add_argument("--health-fail-after-ms", type=int, default=0)
+    return p.parse_args()
+
+
+def _extract_worker_id(args: argparse.Namespace) -> str:
+    if args.worker_id:
+        return str(args.worker_id)
+    # default to port (unique enough for tests)
+    return f"worker-{args.port}"
+
+
+def create_app(args: argparse.Namespace) -> FastAPI:
+    app = FastAPI()
+    worker_id = _extract_worker_id(args)
+    start_ts = time.time()
+    crashed = {"done": False}
+
+    async def maybe_delay():
+        if args.latency_ms > 0:
+            await asyncio.sleep(args.latency_ms / 1000.0)
+
+    def should_fail() -> Optional[int]:
+        global _failures_seen
+        # Fail first N requests (500)
+        if args.fail_first_n > 0 and _failures_seen < args.fail_first_n:
+            _failures_seen += 1
+            return 500
+        # Random failure probability (500)
+        if args.random_fail_rate > 0.0 and random.random() < args.random_fail_rate:
+            return 500
+        # Forced status code override (non-200) for all responses
+        if args.status_code != 200:
+            return int(args.status_code)
+        return None
+
+    def check_api_key(request: Request):
+        if not args.require_api_key:
+            return
+        auth = request.headers.get("Authorization")
+        if not auth or not auth.startswith("Bearer "):
+            raise HTTPException(status_code=401, detail="Unauthorized")
+        key = auth.split(" ", 1)[1]
+        if args.api_key and key != args.api_key:
+            raise HTTPException(status_code=401, detail="Unauthorized")
+
+    @asynccontextmanager
+    async def track_inflight():
+        global _inflight
+        _inflight += 1
+        try:
+            yield
+        finally:
+            _inflight -= 1
+
+    @app.get("/health")
+    async def health():
+        if (
+            args.health_fail_after_ms
+            and (time.time() - start_ts) * 1000.0 >= args.health_fail_after_ms
+        ):
+            return PlainTextResponse("bad", status_code=500)
+        return PlainTextResponse("ok", status_code=200)
+
+    @app.get("/health_generate")
+    async def health_generate():
+        return PlainTextResponse("ok", status_code=200)
+
+    @app.post("/flush_cache")
+    async def flush_cache():
+        return PlainTextResponse("ok", status_code=200)
+
+    @app.get("/get_model_info")
+    async def get_model_info():
+        return JSONResponse({"model": "mock", "vocab_size": 32000})
+
+    @app.get("/v1/models")
+    async def list_models():
+        return JSONResponse({"data": [{"id": "mock", "object": "model"}]})
+
+    @app.get("/get_server_info")
+    async def get_server_info(request: Request):
+        # Enforce API key on server info when required (used by dp_aware probing)
+        check_api_key(request)
+        return JSONResponse(
+            {
+                "worker_id": worker_id,
+                "load_in_flight": _inflight,
+                "cache": {"size": 0, "hit_rate": 0.0},
+                "dp_size": int(args.dp_size),
+            }
+        )
+
+    @app.get("/get_load")
+    async def get_load():
+        return JSONResponse({"load": _inflight})
+
+    def make_json_response(obj: dict, status_code: int = 200) -> JSONResponse:
+        resp = JSONResponse(obj, status_code=status_code)
+        resp.headers["X-Worker-Id"] = worker_id
+        return resp
+
+    async def handle_text_request(request: Request):
+        # Authorization
+        check_api_key(request)
+
+        # Payload limit
+        body = await request.body()
+        if len(body) > args.max_payload_bytes:
+            return make_json_response({"error": "payload too large"}, status_code=413)
+
+        # Simulate crash on first request
+        if args.crash_on_request and not crashed["done"]:
+            crashed["done"] = True
+            os._exit(1)
+
+        # Optional timeout (simulate hang)
+        if args.timeout:
+            await asyncio.sleep(3600)
+
+        # Optional latency
+        await maybe_delay()
+
+        # Optional failures
+        fail_code = should_fail()
+        if fail_code is not None and fail_code != 200:
+            return make_json_response(
+                {"error": f"mock failure {fail_code}"}, status_code=fail_code
+            )
+
+        # Build response echoing minimal shape
+        try:
+            data = await request.json()
+        except (json.JSONDecodeError, ValueError):
+            data = {}
+
+        now = time.time()
+        ret = {
+            "id": f"cmpl-{int(now*1000)}",
+            "object": "text_completion",
+            "created": int(now),
+            "model": "mock",
+            "choices": [
+                {
+                    "text": "ok",
+                    "index": 0,
+                    "finish_reason": "stop",
+                }
+            ],
+            "worker_id": worker_id,
+            "echo": data,
+        }
+        return make_json_response(ret, status_code=200)
+
+    async def handle_stream_request(request: Request):
+        check_api_key(request)
+
+        async def gen():
+            # minimal 2-chunk stream then [DONE]
+            for i in range(2):
+                await asyncio.sleep(0.01)
+                chunk = {
+                    "choices": [{"delta": {"content": "x"}}],
+                    "worker_id": worker_id,
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+
+        headers = {"X-Worker-Id": worker_id}
+        return StreamingResponse(gen(), media_type="text/event-stream", headers=headers)
+
+    @app.post("/generate")
+    async def generate(request: Request):
+        async with track_inflight():
+            if args.stream:
+                return await handle_stream_request(request)
+            return await handle_text_request(request)
+
+    @app.post("/v1/completions")
+    async def completions(request: Request):
+        async with track_inflight():
+            if args.stream:
+                return await handle_stream_request(request)
+            return await handle_text_request(request)
+
+    @app.post("/v1/chat/completions")
+    async def chat_completions(request: Request):
+        async with track_inflight():
+            if args.stream:
+                return await handle_stream_request(request)
+            return await handle_text_request(request)
+
+    return app
+
+
+def main() -> None:
+    args = _parse_args()
+    app = create_app(args)
+    # Handle SIGTERM gracefully for fast test teardown
+    signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
+    uvicorn.run(app, host=args.host, port=args.port, log_level="warning")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sgl-router/py_test/fixtures/ports.py b/sgl-router/py_test/fixtures/ports.py
new file mode 100644
index 000000000..d616cffa1
--- /dev/null
+++ b/sgl-router/py_test/fixtures/ports.py
@@ -0,0 +1,8 @@
+import socket
+
+
+def find_free_port() -> int:
+    """Return an available TCP port on localhost."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
diff --git a/sgl-router/py_test/fixtures/router_manager.py b/sgl-router/py_test/fixtures/router_manager.py
new file mode 100644
index 000000000..c536a0015
--- /dev/null
+++ b/sgl-router/py_test/fixtures/router_manager.py
@@ -0,0 +1,158 @@
+import subprocess
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+import requests
+
+from .ports import find_free_port
+
+
+@dataclass
+class ProcHandle:
+    process: subprocess.Popen
+    url: str
+
+
+class RouterManager:
+    """Helper to spawn a router process and interact with admin endpoints."""
+
+    def __init__(self):
+        self._children: List[subprocess.Popen] = []
+
+    def start_router(
+        self,
+        worker_urls: Optional[List[str]] = None,
+        policy: str = "round_robin",
+        port: Optional[int] = None,
+        extra: Optional[Dict] = None,
+        # PD options
+        pd_disaggregation: bool = False,
+        prefill_urls: Optional[List[tuple]] = None,
+        decode_urls: Optional[List[str]] = None,
+        prefill_policy: Optional[str] = None,
+        decode_policy: Optional[str] = None,
+    ) -> ProcHandle:
+        worker_urls = worker_urls or []
+        port = port or find_free_port()
+        cmd = [
+            "python3",
+            "-m",
+            "sglang_router.launch_router",
+            "--host",
+            "127.0.0.1",
+            "--port",
+            str(port),
+            "--policy",
+            policy,
+        ]
+        # Avoid Prometheus port collisions by assigning a free port per router
+        prom_port = find_free_port()
+        cmd.extend(
+            ["--prometheus-port", str(prom_port), "--prometheus-host", "127.0.0.1"]
+        )
+        if worker_urls:
+            cmd.extend(["--worker-urls", *worker_urls])
+
+        # PD routing configuration
+        if pd_disaggregation:
+            cmd.append("--pd-disaggregation")
+            if prefill_urls:
+                for url, bport in prefill_urls:
+                    if bport is None:
+                        cmd.extend(["--prefill", url, "none"])
+                    else:
+                        cmd.extend(["--prefill", url, str(bport)])
+            if decode_urls:
+                for url in decode_urls:
+                    cmd.extend(["--decode", url])
+            if prefill_policy:
+                cmd.extend(["--prefill-policy", prefill_policy])
+            if decode_policy:
+                cmd.extend(["--decode-policy", decode_policy])
+
+        # Map supported extras to CLI flags (subset for integration)
+        if extra:
+            flag_map = {
+                "max_payload_size": "--max-payload-size",
+                "dp_aware": "--dp-aware",
+                "api_key": "--api-key",
+                # Health/monitoring
+                "worker_startup_check_interval": "--worker-startup-check-interval",
+                # Cache-aware tuning
+                "cache_threshold": "--cache-threshold",
+                "balance_abs_threshold": "--balance-abs-threshold",
+                "balance_rel_threshold": "--balance-rel-threshold",
+                # Retry
+                "retry_max_retries": "--retry-max-retries",
+                "retry_initial_backoff_ms": "--retry-initial-backoff-ms",
+                "retry_max_backoff_ms": "--retry-max-backoff-ms",
+                "retry_backoff_multiplier": "--retry-backoff-multiplier",
+                "retry_jitter_factor": "--retry-jitter-factor",
+                "disable_retries": "--disable-retries",
+                # Circuit breaker
+                "cb_failure_threshold": "--cb-failure-threshold",
+                "cb_success_threshold": "--cb-success-threshold",
+                "cb_timeout_duration_secs": "--cb-timeout-duration-secs",
+                "cb_window_duration_secs": "--cb-window-duration-secs",
+                "disable_circuit_breaker": "--disable-circuit-breaker",
+                # Rate limiting
+                "max_concurrent_requests": "--max-concurrent-requests",
+                "queue_size": "--queue-size",
+                "queue_timeout_secs": "--queue-timeout-secs",
+                "rate_limit_tokens_per_second": "--rate-limit-tokens-per-second",
+            }
+            for k, v in extra.items():
+                if v is None:
+                    continue
+                flag = flag_map.get(k)
+                if not flag:
+                    continue
+                if isinstance(v, bool):
+                    if v:
+                        cmd.append(flag)
+                else:
+                    cmd.extend([flag, str(v)])
+
+        proc = subprocess.Popen(cmd)
+        self._children.append(proc)
+        url = f"http://127.0.0.1:{port}"
+        self._wait_health(url)
+        return ProcHandle(process=proc, url=url)
+
+    def _wait_health(self, base_url: str, timeout: float = 30.0):
+        start = time.time()
+        with requests.Session() as s:
+            while time.time() - start < timeout:
+                try:
+                    r = s.get(f"{base_url}/health", timeout=2)
+                    if r.status_code == 200:
+                        return
+                except requests.RequestException:
+                    pass
+                time.sleep(0.2)
+        raise TimeoutError(f"Router at {base_url} did not become healthy")
+
+    def add_worker(self, base_url: str, worker_url: str) -> None:
+        r = requests.post(f"{base_url}/add_worker", params={"url": worker_url})
+        assert r.status_code == 200, f"add_worker failed: {r.status_code} {r.text}"
+
+    def remove_worker(self, base_url: str, worker_url: str) -> None:
+        r = requests.post(f"{base_url}/remove_worker", params={"url": worker_url})
+        assert r.status_code == 200, f"remove_worker failed: {r.status_code} {r.text}"
+
+    def list_workers(self, base_url: str) -> list[str]:
+        r = requests.get(f"{base_url}/list_workers")
+        assert r.status_code == 200, f"list_workers failed: {r.status_code} {r.text}"
+        data = r.json()
+        return data.get("urls", [])
+
+    def stop_all(self):
+        for p in self._children:
+            if p.poll() is None:
+                p.terminate()
+                try:
+                    p.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    p.kill()
+        self._children.clear()
diff --git a/sgl-router/py_test/integration/__init__.py b/sgl-router/py_test/integration/__init__.py
new file mode 100644
index 000000000..1e342eca0
--- /dev/null
+++ b/sgl-router/py_test/integration/__init__.py
@@ -0,0 +1 @@
+"""Integration test package for the router."""
diff --git a/sgl-router/py_test/integration/conftest.py b/sgl-router/py_test/integration/conftest.py
new file mode 100644
index 000000000..21b9369d7
--- /dev/null
+++ b/sgl-router/py_test/integration/conftest.py
@@ -0,0 +1,109 @@
+import os
+import subprocess
+import time
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+import pytest
+import requests
+
+from ..fixtures.ports import find_free_port
+from ..fixtures.router_manager import RouterManager
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "integration: mark as router integration test")
+
+
+@pytest.fixture
+def router_manager() -> Iterable[RouterManager]:
+    mgr = RouterManager()
+    try:
+        yield mgr
+    finally:
+        mgr.stop_all()
+
+
+def _spawn_mock_worker(args: List[str]) -> Tuple[subprocess.Popen, str, str]:
+    repo_root = Path(__file__).resolve().parents[2]
+    script = repo_root / "py_test" / "fixtures" / "mock_worker.py"
+    port = find_free_port()
+    worker_id = f"worker-{port}"
+    base_cmd = [
+        "python3",
+        str(script),
+        "--port",
+        str(port),
+        "--worker-id",
+        worker_id,
+    ]
+    cmd = base_cmd + args
+    proc = subprocess.Popen(cmd)
+    url = f"http://127.0.0.1:{port}"
+    _wait_health(url)
+    return proc, url, worker_id
+
+
+def _wait_health(url: str, timeout: float = 10.0):
+    start = time.time()
+    with requests.Session() as s:
+        while time.time() - start < timeout:
+            try:
+                r = s.get(f"{url}/health", timeout=1)
+                if r.status_code == 200:
+                    return
+            except requests.RequestException:
+                pass
+            time.sleep(0.1)
+    raise TimeoutError(f"Mock worker at {url} did not become healthy")
+
+
+@pytest.fixture
+def mock_worker():
+    """Start a single healthy mock worker; yields (process, url, worker_id)."""
+    proc, url, worker_id = _spawn_mock_worker([])
+    try:
+        yield proc, url, worker_id
+    finally:
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+
+
+@pytest.fixture
+def mock_workers():
+    """Factory to start N workers with custom args.
+
+    Usage:
+        procs, urls, ids = mock_workers(n=3, args=["--latency-ms", "5"])  # same args for all
+        ...
+    """
+
+    procs: List[subprocess.Popen] = []
+
+    def _start(n: int, args: List[str] | None = None):
+        args = args or []
+        new_procs: List[subprocess.Popen] = []
+        urls: List[str] = []
+        ids: List[str] = []
+        for _ in range(n):
+            p, url, wid = _spawn_mock_worker(args)
+            procs.append(p)
+            new_procs.append(p)
+            urls.append(url)
+            ids.append(wid)
+        return new_procs, urls, ids
+
+    try:
+        yield _start
+    finally:
+        for p in procs:
+            if p.poll() is None:
+                p.terminate()
+                try:
+                    p.wait(timeout=3)
+                except subprocess.TimeoutExpired:
+                    p.kill()
diff --git a/sgl-router/py_test/integration/load_balancing/__init__.py b/sgl-router/py_test/integration/load_balancing/__init__.py
new file mode 100644
index 000000000..77b8c2460
--- /dev/null
+++ b/sgl-router/py_test/integration/load_balancing/__init__.py
@@ -0,0 +1 @@
+"""Load balancing integration tests."""
diff --git a/sgl-router/py_test/integration/load_balancing/test_cache_aware.py b/sgl-router/py_test/integration/load_balancing/test_cache_aware.py
new file mode 100644
index 000000000..acbbd3682
--- /dev/null
+++ b/sgl-router/py_test/integration/load_balancing/test_cache_aware.py
@@ -0,0 +1,73 @@
+import collections
+import concurrent.futures
+import uuid
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_cache_aware_affinity(mock_workers, router_manager):
+    # Two workers; same prompt should stick to one due to cache tree
+    _, urls, ids = mock_workers(n=2)
+    rh = router_manager.start_router(worker_urls=urls, policy="cache_aware")
+
+    counts = collections.Counter()
+    with requests.Session() as s:
+        for i in range(12):
+            r = s.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": "repeated prompt for cache",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+            )
+            assert r.status_code == 200
+            wid = r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+            counts[wid] += 1
+
+    # Expect strong skew toward one worker (tree match); majority > 80%
+    top = max(counts.values())
+    assert top >= 10, counts
+
+
+@pytest.mark.integration
+def test_cache_aware_diverse_prompts_balances(mock_workers, router_manager):
+    # Add latency so concurrent requests overlap and influence load-based selection
+    _, urls, ids = mock_workers(n=3, args=["--latency-ms", "30"])
+    rh = router_manager.start_router(
+        worker_urls=urls,
+        policy="cache_aware",
+        extra={
+            "cache_threshold": 0.99,
+            "balance_abs_threshold": 0,
+            "balance_rel_threshold": 1.0,
+        },
+    )
+
+    counts = collections.Counter()
+
+    def call(i):
+        # Use diverse, unrelated prompts to avoid prefix matches entirely
+        prompt = str(uuid.uuid4())
+        r = requests.post(
+            f"{rh.url}/v1/completions",
+            json={
+                "model": "test-model",
+                "prompt": prompt,
+                "max_tokens": 1,
+                "stream": False,
+            },
+            timeout=5,
+        )
+        assert r.status_code == 200
+        return r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as ex:
+        for wid in ex.map(call, range(40)):
+            counts[wid] += 1
+
+    # Expect participation of at least two workers
+    assert sum(1 for v in counts.values() if v > 0) >= 2, counts
diff --git a/sgl-router/py_test/integration/load_balancing/test_power_of_two.py b/sgl-router/py_test/integration/load_balancing/test_power_of_two.py
new file mode 100644
index 000000000..c56f4d38a
--- /dev/null
+++ b/sgl-router/py_test/integration/load_balancing/test_power_of_two.py
@@ -0,0 +1,89 @@
+import collections
+import concurrent.futures
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_power_of_two_prefers_less_loaded(mock_workers, router_manager):
+    # Start two workers: one slow (higher inflight), one fast
+    # Router monitors /get_load and Power-of-Two uses cached loads to choose
+    # Start one slow and one fast worker using the fixture factory
+    procs_slow, urls_slow, ids_slow = mock_workers(n=1, args=["--latency-ms", "200"])
+    procs_fast, urls_fast, ids_fast = mock_workers(n=1, args=["--latency-ms", "0"])
+    procs = procs_slow + procs_fast
+    urls = urls_slow + urls_fast
+    ids = ids_slow + ids_fast
+    slow_id = ids_slow[0]
+
+    rh = router_manager.start_router(
+        worker_urls=urls,
+        policy="power_of_two",
+        extra={"worker_startup_check_interval": 1},
+    )
+
+    # Prime: fire a burst to create measurable load on slow worker, then wait for monitor tick
+
+    def _prime_call(i):
+        try:
+            requests.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"warm-{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+                timeout=5,
+            )
+        except Exception:
+            pass
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
+        list(ex.map(_prime_call, range(128)))
+    time.sleep(2)
+
+    # Apply direct background load on the slow worker to amplify load diff
+    def _direct_load(i):
+        try:
+            requests.post(
+                f"{slow_url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"bg-{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+                timeout=5,
+            )
+        except Exception:
+            pass
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
+        list(ex.map(_direct_load, range(128)))
+    time.sleep(1)
+
+    def call(i):
+        r = requests.post(
+            f"{rh.url}/v1/completions",
+            json={
+                "model": "test-model",
+                "prompt": f"p{i}",
+                "max_tokens": 1,
+                "stream": False,
+            },
+            timeout=5,
+        )
+        assert r.status_code == 200
+        return r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+
+    counts = collections.Counter()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
+        for wid in ex.map(call, range(200)):
+            counts[wid] += 1
+
+    # Expect the slow worker (higher latency/inflight) to receive fewer requests
+    fast_worker_id = [i for i in ids if i != slow_id][0]
+    assert counts[slow_id] < counts[fast_worker_id], counts
diff --git a/sgl-router/py_test/integration/load_balancing/test_random.py b/sgl-router/py_test/integration/load_balancing/test_random.py
new file mode 100644
index 000000000..41a613e12
--- /dev/null
+++ b/sgl-router/py_test/integration/load_balancing/test_random.py
@@ -0,0 +1,33 @@
+import collections
+import math
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_random_distribution(mock_workers, router_manager):
+    procs, urls, ids = mock_workers(n=4)
+    rh = router_manager.start_router(worker_urls=urls, policy="random")
+
+    counts = collections.Counter()
+    N = 200
+    with requests.Session() as s:
+        for i in range(N):
+            r = s.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"p{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+            )
+            assert r.status_code == 200
+            wid = r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+            counts[wid] += 1
+
+    # simple statistical tolerance: each worker should be within ±50% of mean
+    mean = N / len(ids)
+    for wid in ids:
+        assert 0.5 * mean <= counts[wid] <= 1.5 * mean, counts
diff --git a/sgl-router/py_test/integration/load_balancing/test_round_robin.py b/sgl-router/py_test/integration/load_balancing/test_round_robin.py
new file mode 100644
index 000000000..966f3747a
--- /dev/null
+++ b/sgl-router/py_test/integration/load_balancing/test_round_robin.py
@@ -0,0 +1,34 @@
+import collections
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_round_robin_distribution(mock_workers, router_manager):
+    procs, urls, ids = mock_workers(n=3)
+
+    rh = router_manager.start_router(worker_urls=urls, policy="round_robin")
+
+    counts = collections.Counter()
+    with requests.Session() as s:
+        for i in range(30):
+            r = s.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"hello {i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+            )
+            assert r.status_code == 200
+            wid = r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+            assert wid in ids
+            counts[wid] += 1
+
+    # Expect near-even distribution across 3 workers
+    # 30 requests -> ideally 10 each; allow small tolerance ±3
+    for wid in ids:
+        assert 7 <= counts[wid] <= 13, counts
diff --git a/sgl-router/py_test/integration/test_api_auth.py b/sgl-router/py_test/integration/test_api_auth.py
new file mode 100644
index 000000000..b8ba5c670
--- /dev/null
+++ b/sgl-router/py_test/integration/test_api_auth.py
@@ -0,0 +1,38 @@
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_router_api_key_enforcement(router_manager, mock_workers):
+    # Start backend requiring API key; router should forward Authorization header transparently
+    _, urls, _ = mock_workers(
+        n=1, args=["--require-api-key", "--api-key", "correct_api_key"]
+    )
+    rh = router_manager.start_router(
+        worker_urls=urls,
+        policy="round_robin",
+        extra={},
+    )
+
+    # No auth -> 401
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={"model": "test-model", "prompt": "x", "max_tokens": 1, "stream": False},
+    )
+    assert r.status_code == 401
+
+    # Invalid auth -> 401
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={"model": "test-model", "prompt": "x", "max_tokens": 1, "stream": False},
+        headers={"Authorization": "Bearer wrong"},
+    )
+    assert r.status_code == 401
+
+    # Correct auth -> 200
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={"model": "test-model", "prompt": "x", "max_tokens": 1, "stream": False},
+        headers={"Authorization": "Bearer correct_api_key"},
+    )
+    assert r.status_code == 200
diff --git a/sgl-router/py_test/integration/test_circuit_breaker.py b/sgl-router/py_test/integration/test_circuit_breaker.py
new file mode 100644
index 000000000..7e7ba409b
--- /dev/null
+++ b/sgl-router/py_test/integration/test_circuit_breaker.py
@@ -0,0 +1,191 @@
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_circuit_breaker_opens_and_recovers(router_manager, mock_workers):
+    # A single worker that fails first 3 requests, then succeeds
+    _, [wurl], _ = mock_workers(n=1, args=["--fail-first-n", "3"])  # fails first 3
+    rh = router_manager.start_router(
+        worker_urls=[wurl],
+        policy="round_robin",
+        extra={
+            "cb_failure_threshold": 3,
+            "cb_success_threshold": 2,
+            "cb_timeout_duration_secs": 3,
+            "cb_window_duration_secs": 10,
+            "disable_retries": True,
+        },
+    )
+
+    def post_once():
+        return requests.post(
+            f"{rh.url}/v1/completions",
+            json={
+                "model": "test-model",
+                "prompt": "trigger",
+                "max_tokens": 1,
+                "stream": False,
+            },
+            timeout=3,
+        )
+
+    saw_503 = False
+    for _ in range(8):
+        r = post_once()
+        if r.status_code == 503:
+            saw_503 = True
+            break
+    assert saw_503, "circuit breaker did not open to return 503"
+
+    time.sleep(4)
+    r1 = post_once()
+    r2 = post_once()
+    assert r1.status_code == 200 and r2.status_code == 200
+
+
+@pytest.mark.integration
+def test_circuit_breaker_half_open_failure_reopens(router_manager, mock_workers):
+    _, [wurl], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
+    rh = router_manager.start_router(
+        worker_urls=[wurl],
+        policy="round_robin",
+        extra={
+            "cb_failure_threshold": 2,
+            "cb_success_threshold": 2,
+            "cb_timeout_duration_secs": 2,
+            "cb_window_duration_secs": 5,
+            "disable_retries": True,
+        },
+    )
+
+    def post_once():
+        return requests.post(
+            f"{rh.url}/v1/completions",
+            json={
+                "model": "test-model",
+                "prompt": "x",
+                "max_tokens": 1,
+                "stream": False,
+            },
+            timeout=3,
+        )
+
+    opened = False
+    for _ in range(8):
+        r = post_once()
+        if r.status_code == 503:
+            opened = True
+            break
+    assert opened, "circuit breaker did not open"
+
+    time.sleep(3)
+    r = post_once()
+    assert r.status_code == 500
+    r2 = post_once()
+    assert r2.status_code == 503
+
+
+@pytest.mark.integration
+def test_circuit_breaker_disable_flag(router_manager, mock_workers):
+    _, [wurl], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
+    rh = router_manager.start_router(
+        worker_urls=[wurl],
+        policy="round_robin",
+        extra={
+            "disable_circuit_breaker": True,
+            "disable_retries": True,
+        },
+    )
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={
+            "model": "test-model",
+            "prompt": "x",
+            "max_tokens": 1,
+            "stream": False,
+        },
+        timeout=3,
+    )
+    assert r.status_code == 500
+
+
+@pytest.mark.integration
+def test_circuit_breaker_per_worker_isolation(router_manager, mock_workers):
+    _, [fail_url], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
+    _, [ok_url], _ = mock_workers(n=1)
+    rh = router_manager.start_router(
+        worker_urls=[fail_url, ok_url],
+        policy="round_robin",
+        extra={
+            "cb_failure_threshold": 2,
+            "cb_success_threshold": 1,
+            "cb_timeout_duration_secs": 2,
+            "cb_window_duration_secs": 10,
+            "disable_retries": True,
+        },
+    )
+
+    def post_once():
+        return requests.post(
+            f"{rh.url}/v1/completions",
+            json={
+                "model": "test-model",
+                "prompt": "y",
+                "max_tokens": 1,
+                "stream": False,
+            },
+            timeout=3,
+        )
+
+    failures = 0
+    successes_after_open = 0
+    opened = False
+    for _ in range(30):
+        r = post_once()
+        if not opened:
+            if r.status_code == 500:
+                failures += 1
+            if failures >= 2:
+                _ = post_once()
+                _ = post_once()
+                opened = True
+        else:
+            if r.status_code == 200:
+                successes_after_open += 1
+            else:
+                assert False, f"Unexpected non-200 after CB open: {r.status_code}"
+    assert opened and successes_after_open >= 5
+
+
+@pytest.mark.integration
+def test_circuit_breaker_with_retries(router_manager, mock_workers):
+    _, [fail_url], _ = mock_workers(n=1, args=["--status-code", "500"])  # always fail
+    _, [ok_url], _ = mock_workers(n=1)
+    rh = router_manager.start_router(
+        worker_urls=[fail_url, ok_url],
+        policy="round_robin",
+        extra={
+            "retry_max_retries": 3,
+            "retry_initial_backoff_ms": 10,
+            "retry_max_backoff_ms": 50,
+            "cb_failure_threshold": 2,
+            "cb_success_threshold": 1,
+            "cb_timeout_duration_secs": 2,
+            "cb_window_duration_secs": 10,
+        },
+    )
+
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={
+            "model": "test-model",
+            "prompt": "z",
+            "max_tokens": 1,
+            "stream": False,
+        },
+        timeout=5,
+    )
+    assert r.status_code == 200
diff --git a/sgl-router/py_test/integration/test_fault_tolerance.py b/sgl-router/py_test/integration/test_fault_tolerance.py
new file mode 100644
index 000000000..78e5968ce
--- /dev/null
+++ b/sgl-router/py_test/integration/test_fault_tolerance.py
@@ -0,0 +1,36 @@
+import concurrent.futures
+import subprocess
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_worker_crash_reroute_with_retries(router_manager, mock_workers):
+    # Start one healthy and one that will crash on first request
+    _, [ok_url], _ = mock_workers(n=1)
+    _, [crash_url], _ = mock_workers(n=1, args=["--crash-on-request"])
+    rh = router_manager.start_router(
+        worker_urls=[crash_url, ok_url],
+        policy="round_robin",
+        extra={
+            "retry_max_retries": 3,
+            "retry_initial_backoff_ms": 10,
+            "retry_max_backoff_ms": 50,
+        },
+    )
+
+    # A single request should succeed via retry to the healthy worker
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={
+            "model": "test-model",
+            "prompt": "crash",
+            "max_tokens": 1,
+            "stream": False,
+        },
+        timeout=5,
+    )
+    assert r.status_code == 200
+    # mock_workers fixture handles cleanup
diff --git a/sgl-router/py_test/integration/test_payload_size.py b/sgl-router/py_test/integration/test_payload_size.py
new file mode 100644
index 000000000..b3583ab28
--- /dev/null
+++ b/sgl-router/py_test/integration/test_payload_size.py
@@ -0,0 +1,33 @@
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_payload_size_limit(router_manager, mock_workers):
+    # Start one backend and a router with a 1MB payload limit
+    _, urls, _ = mock_workers(n=1)
+    rh = router_manager.start_router(
+        worker_urls=urls,
+        policy="round_robin",
+        extra={"max_payload_size": 1 * 1024 * 1024},  # 1MB
+    )
+
+    # Payload just under 1MB should succeed
+    payload_small = {
+        "model": "test-model",
+        "prompt": "x" * int(0.5 * 1024 * 1024),  # ~0.5MB
+        "max_tokens": 1,
+        "stream": False,
+    }
+    r = requests.post(f"{rh.url}/v1/completions", json=payload_small)
+    assert r.status_code == 200
+
+    # Payload over 1MB should fail with 413
+    payload_large = {
+        "model": "test-model",
+        "prompt": "x" * int(1.2 * 1024 * 1024),  # ~1.2MB
+        "max_tokens": 1,
+        "stream": False,
+    }
+    r = requests.post(f"{rh.url}/v1/completions", json=payload_large)
+    assert r.status_code == 413
diff --git a/sgl-router/py_test/integration/test_pd_routing.py b/sgl-router/py_test/integration/test_pd_routing.py
new file mode 100644
index 000000000..d0ae7d552
--- /dev/null
+++ b/sgl-router/py_test/integration/test_pd_routing.py
@@ -0,0 +1,127 @@
+import collections
+import concurrent.futures
+import subprocess
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_pd_power_of_two_decode_attribution(router_manager, mock_workers):
+    # Start two prefill and three decode mock workers via fixture
+    _, prefill_urls_raw, prefill_ids = mock_workers(n=2)
+    _, decode_urls_raw, decode_ids_list = mock_workers(n=3)
+    prefill_urls = [(u, None) for u in prefill_urls_raw]
+    decode_urls = list(decode_urls_raw)
+    decode_ids = set(decode_ids_list)
+
+    rh = router_manager.start_router(
+        policy="power_of_two",
+        pd_disaggregation=True,
+        prefill_urls=prefill_urls,
+        decode_urls=decode_urls,
+        extra={"worker_startup_check_interval": 1},
+    )
+
+    counts = collections.Counter()
+    with requests.Session() as s:
+        for i in range(30):
+            r = s.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"p{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+            )
+            assert r.status_code == 200
+            wid = r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+            assert wid in decode_ids
+            counts[wid] += 1
+
+    assert sum(1 for v in counts.values() if v > 0) >= 2
+
+
+@pytest.mark.integration
+def test_pd_power_of_two_skews_to_faster_decode(router_manager, mock_workers):
+    # Start two prefill workers (fast)
+    _, prefill_urls_raw, _ = mock_workers(n=2)
+
+    # Start two decode workers: one slow, one fast
+    _, [decode_slow_url], [slow_id] = mock_workers(
+        n=1, args=["--latency-ms", "300"]
+    )  # slower decode
+    _, [decode_fast_url], [fast_id] = mock_workers(n=1)
+    decode_urls_raw = [decode_slow_url, decode_fast_url]
+
+    prefill_urls = [(u, None) for u in prefill_urls_raw]
+    decode_urls = list(decode_urls_raw)
+
+    rh = router_manager.start_router(
+        policy="power_of_two",
+        pd_disaggregation=True,
+        prefill_urls=prefill_urls,
+        decode_urls=decode_urls,
+        extra={"worker_startup_check_interval": 1},
+    )
+
+    def _prime_call(i):
+        try:
+            requests.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"warm-{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+                timeout=8,
+            )
+        except Exception:
+            pass
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
+        list(ex.map(_prime_call, range(128)))
+    time.sleep(2)
+
+    def _direct_decode_load(i):
+        try:
+            requests.post(
+                f"{decode_slow_url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"bg-{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+                timeout=8,
+            )
+        except Exception:
+            pass
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
+        list(ex.map(_direct_decode_load, range(128)))
+    time.sleep(1)
+
+    def call(i):
+        r = requests.post(
+            f"{rh.url}/v1/completions",
+            json={
+                "model": "test-model",
+                "prompt": f"p{i}",
+                "max_tokens": 1,
+                "stream": False,
+            },
+            timeout=8,
+        )
+        assert r.status_code == 200
+        return r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+
+    counts = collections.Counter()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as ex:
+        for wid in ex.map(call, range(200)):
+            counts[wid] += 1
+
+    assert counts[slow_id] < counts[fast_id], counts
diff --git a/sgl-router/py_test/integration/test_rate_limiting.py b/sgl-router/py_test/integration/test_rate_limiting.py
new file mode 100644
index 000000000..4297d77c9
--- /dev/null
+++ b/sgl-router/py_test/integration/test_rate_limiting.py
@@ -0,0 +1,91 @@
+import concurrent.futures
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_rate_limit_and_queue(router_manager, mock_workers):
+    # One fast backend
+    _, urls, _ = mock_workers(n=1)
+    rh = router_manager.start_router(
+        worker_urls=urls,
+        policy="round_robin",
+        extra={
+            "max_concurrent_requests": 2,
+            "queue_size": 0,  # no queue -> immediate 429 when limit exceeded
+        },
+    )
+
+    def call_once(i):
+        try:
+            r = requests.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"p{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+                timeout=3,
+            )
+            return r.status_code
+        except Exception:
+            return 599
+
+    # Fire a burst of concurrent requests
+    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as ex:
+        results = list(ex.map(call_once, range(16)))
+
+    # Expect some to succeed and some to be rate limited (429)
+    assert any(code == 200 for code in results)
+    assert any(code == 429 for code in results)
+
+
+@pytest.mark.integration
+def test_rate_limit_queue_and_timeout(router_manager, mock_workers):
+    # Slow backend: ~2s per request ensures queue wait > timeout
+    _, urls, _ = mock_workers(n=1, args=["--latency-ms", "2000"])  # 2.0s per request
+
+    # Allow 1 concurrent, queue up to 1, with 1s queue timeout
+    rh = router_manager.start_router(
+        worker_urls=urls,
+        policy="round_robin",
+        extra={
+            "max_concurrent_requests": 1,
+            "queue_size": 1,
+            "queue_timeout_secs": 1,
+        },
+    )
+
+    def call_once(i):
+        try:
+            r = requests.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"q{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+                timeout=5,
+            )
+            return r.status_code
+        except Exception:
+            return 599
+
+    # Fire 4 concurrent requests: 1 runs (~2s), 1 queued (times out at 1s -> 408), 2 overflow -> 429
+    import concurrent.futures
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
+        results = list(ex.map(call_once, range(4)))
+
+    # We expect:
+    # - Some 200s (processed)
+    # - At least one 408 (queued too long and timed out)
+    # - Remaining non-200s are either 429 (queue overflow) or additional 408s depending on scheduling
+    assert any(code == 200 for code in results)
+    assert any(code == 408 for code in results), results
+    non200 = [c for c in results if c != 200]
+    assert len(non200) >= 2 and all(c in (408, 429) for c in non200), results
diff --git a/sgl-router/py_test/integration/test_retries.py b/sgl-router/py_test/integration/test_retries.py
new file mode 100644
index 000000000..5f3d4ffee
--- /dev/null
+++ b/sgl-router/py_test/integration/test_retries.py
@@ -0,0 +1,65 @@
+import concurrent.futures
+import subprocess
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_retry_reroutes_to_healthy_worker(router_manager, mock_workers):
+    # Worker A always 500; Worker B healthy
+    # Worker A always 500; Worker B/C healthy
+    _, [url_a], [id_a] = mock_workers(n=1, args=["--status-code", "500"])  # fail
+    _, [url_b], [id_b] = mock_workers(n=1)
+    _, [url_c], [id_c] = mock_workers(n=1)
+    rh = router_manager.start_router(
+        worker_urls=[url_a, url_b, url_c],
+        policy="round_robin",
+        extra={
+            "retry_max_retries": 3,
+            "retry_initial_backoff_ms": 10,
+            "retry_max_backoff_ms": 50,
+        },
+    )
+
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={
+            "model": "test-model",
+            "prompt": "x",
+            "max_tokens": 1,
+            "stream": False,
+        },
+        timeout=5,
+    )
+    assert r.status_code == 200
+    wid = r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+    assert wid == id_b  # should have retried onto healthy worker
+    # mock_workers fixture handles cleanup
+
+
+@pytest.mark.integration
+def test_disable_retries_surfaces_failure(router_manager, mock_workers):
+    # Single failing worker, retries disabled -> should return 500
+    _, [url], [wid] = mock_workers(n=1, args=["--status-code", "500"])  # always fail
+    rh = router_manager.start_router(
+        worker_urls=[url],
+        policy="round_robin",
+        extra={
+            "disable_retries": True,
+        },
+    )
+
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={
+            "model": "test-model",
+            "prompt": "x",
+            "max_tokens": 1,
+            "stream": False,
+        },
+        timeout=5,
+    )
+    assert r.status_code == 500
+    # mock_workers fixture handles cleanup
diff --git a/sgl-router/py_test/integration/test_service_discovery_shim.py b/sgl-router/py_test/integration/test_service_discovery_shim.py
new file mode 100644
index 000000000..5cc1d6734
--- /dev/null
+++ b/sgl-router/py_test/integration/test_service_discovery_shim.py
@@ -0,0 +1,36 @@
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_discovery_shim_add_remove(router_manager, mock_workers):
+    # Start router without workers
+    rh = router_manager.start_router(worker_urls=[], policy="round_robin")
+
+    # Initially empty
+    urls = router_manager.list_workers(rh.url)
+    assert urls == []
+
+    # Add a worker (simulate discovery event)
+    _, [wurl], [wid] = mock_workers(n=1)
+    router_manager.add_worker(rh.url, wurl)
+    urls = router_manager.list_workers(rh.url)
+    assert wurl in urls
+
+    # Can serve a request
+    r = requests.post(
+        f"{rh.url}/v1/completions",
+        json={
+            "model": "test-model",
+            "prompt": "hi",
+            "max_tokens": 1,
+            "stream": False,
+        },
+    )
+    assert r.status_code == 200
+
+    # Remove worker (simulate pod deletion)
+    router_manager.remove_worker(rh.url, wurl)
+    urls = router_manager.list_workers(rh.url)
+    assert wurl not in urls
+    # mock_workers fixture handles cleanup
diff --git a/sgl-router/py_test/integration/test_worker_management.py b/sgl-router/py_test/integration/test_worker_management.py
new file mode 100644
index 000000000..8acb94114
--- /dev/null
+++ b/sgl-router/py_test/integration/test_worker_management.py
@@ -0,0 +1,61 @@
+import collections
+import subprocess
+import time
+
+import pytest
+import requests
+
+
+@pytest.mark.integration
+def test_add_and_remove_worker(mock_worker, router_manager, mock_workers):
+    # Start with a single worker
+    proc1, url1, id1 = mock_worker
+    rh = router_manager.start_router(worker_urls=[url1], policy="round_robin")
+
+    # Add a second worker
+
+    procs2, urls2, ids2 = mock_workers(n=1)
+    url2 = urls2[0]
+    id2 = ids2[0]
+    router_manager.add_worker(rh.url, url2)
+
+    # Send some requests and ensure both workers are seen
+    seen = set()
+    with requests.Session() as s:
+        for i in range(20):
+            r = s.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"x{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+            )
+            assert r.status_code == 200
+            wid = r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+            seen.add(wid)
+            if len(seen) == 2:
+                break
+
+    assert id1 in seen and id2 in seen
+
+    # Now remove the second worker
+    router_manager.remove_worker(rh.url, url2)
+
+    # After removal, subsequent requests should only come from first worker
+    with requests.Session() as s:
+        for i in range(10):
+            r = s.post(
+                f"{rh.url}/v1/completions",
+                json={
+                    "model": "test-model",
+                    "prompt": f"y{i}",
+                    "max_tokens": 1,
+                    "stream": False,
+                },
+            )
+            assert r.status_code == 200
+            wid = r.headers.get("X-Worker-Id") or r.json().get("worker_id")
+            assert wid == id1
+    # mock_workers fixture handles cleanup
diff --git a/sgl-router/py_test/unit/__init__.py b/sgl-router/py_test/unit/__init__.py
new file mode 100644
index 000000000..42cbd8bee
--- /dev/null
+++ b/sgl-router/py_test/unit/__init__.py
@@ -0,0 +1,7 @@
+"""
+Unit tests for sglang_router.
+
+This package contains fast, isolated unit tests for Python components
+of the SGLang router. These tests focus on testing individual functions
+and classes in isolation without starting actual router instances.
+"""
diff --git a/sgl-router/py_test/unit/test_arg_parser.py b/sgl-router/py_test/unit/test_arg_parser.py
new file mode 100644
index 000000000..04d8a112d
--- /dev/null
+++ b/sgl-router/py_test/unit/test_arg_parser.py
@@ -0,0 +1,628 @@
+"""
+Unit tests for argument parsing functionality in sglang_router.
+
+These tests focus on testing the argument parsing logic in isolation,
+without starting actual router instances.
+"""
+
+import argparse
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+from sglang_router.launch_router import RouterArgs, parse_router_args
+from sglang_router.router import policy_from_str
+
+
+class TestRouterArgs:
+    """Test RouterArgs dataclass and its methods."""
+
+    def test_default_values(self):
+        """Test that RouterArgs has correct default values."""
+        args = RouterArgs()
+
+        # Test basic defaults
+        assert args.host == "127.0.0.1"
+        assert args.port == 30000
+        assert args.policy == "cache_aware"
+        assert args.worker_urls == []
+        assert args.pd_disaggregation is False
+        assert args.prefill_urls == []
+        assert args.decode_urls == []
+
+        # Test PD-specific defaults
+        assert args.prefill_policy is None
+        assert args.decode_policy is None
+
+        # Test service discovery defaults
+        assert args.service_discovery is False
+        assert args.selector == {}
+        assert args.service_discovery_port == 80
+        assert args.service_discovery_namespace is None
+
+        # Test retry and circuit breaker defaults
+        assert args.retry_max_retries == 5
+        assert args.cb_failure_threshold == 10
+        assert args.disable_retries is False
+        assert args.disable_circuit_breaker is False
+
+    def test_parse_selector_valid(self):
+        """Test parsing valid selector arguments."""
+        # Test single key-value pair
+        result = RouterArgs._parse_selector(["app=worker"])
+        assert result == {"app": "worker"}
+
+        # Test multiple key-value pairs
+        result = RouterArgs._parse_selector(["app=worker", "env=prod", "version=v1"])
+        assert result == {"app": "worker", "env": "prod", "version": "v1"}
+
+        # Test empty list
+        result = RouterArgs._parse_selector([])
+        assert result == {}
+
+        # Test None
+        result = RouterArgs._parse_selector(None)
+        assert result == {}
+
+    def test_parse_selector_invalid(self):
+        """Test parsing invalid selector arguments."""
+        # Test malformed selector (no equals sign)
+        result = RouterArgs._parse_selector(["app"])
+        assert result == {}
+
+        # Test multiple equals signs (should use first one)
+        result = RouterArgs._parse_selector(["app=worker=extra"])
+        assert result == {"app": "worker=extra"}
+
+    def test_parse_prefill_urls_valid(self):
+        """Test parsing valid prefill URL arguments."""
+        # Test with bootstrap port
+        result = RouterArgs._parse_prefill_urls([["http://prefill1:8000", "9000"]])
+        assert result == [("http://prefill1:8000", 9000)]
+
+        # Test with 'none' bootstrap port
+        result = RouterArgs._parse_prefill_urls([["http://prefill1:8000", "none"]])
+        assert result == [("http://prefill1:8000", None)]
+
+        # Test without bootstrap port
+        result = RouterArgs._parse_prefill_urls([["http://prefill1:8000"]])
+        assert result == [("http://prefill1:8000", None)]
+
+        # Test multiple prefill URLs
+        result = RouterArgs._parse_prefill_urls(
+            [
+                ["http://prefill1:8000", "9000"],
+                ["http://prefill2:8000", "none"],
+                ["http://prefill3:8000"],
+            ]
+        )
+        expected = [
+            ("http://prefill1:8000", 9000),
+            ("http://prefill2:8000", None),
+            ("http://prefill3:8000", None),
+        ]
+        assert result == expected
+
+        # Test empty list
+        result = RouterArgs._parse_prefill_urls([])
+        assert result == []
+
+        # Test None
+        result = RouterArgs._parse_prefill_urls(None)
+        assert result == []
+
+    def test_parse_prefill_urls_invalid(self):
+        """Test parsing invalid prefill URL arguments."""
+        # Test invalid bootstrap port
+        with pytest.raises(ValueError, match="Invalid bootstrap port"):
+            RouterArgs._parse_prefill_urls([["http://prefill1:8000", "invalid"]])
+
+    def test_parse_decode_urls_valid(self):
+        """Test parsing valid decode URL arguments."""
+        # Test single decode URL
+        result = RouterArgs._parse_decode_urls([["http://decode1:8001"]])
+        assert result == ["http://decode1:8001"]
+
+        # Test multiple decode URLs
+        result = RouterArgs._parse_decode_urls(
+            [["http://decode1:8001"], ["http://decode2:8001"]]
+        )
+        assert result == ["http://decode1:8001", "http://decode2:8001"]
+
+        # Test empty list
+        result = RouterArgs._parse_decode_urls([])
+        assert result == []
+
+        # Test None
+        result = RouterArgs._parse_decode_urls(None)
+        assert result == []
+
+    def test_from_cli_args_basic(self):
+        """Test creating RouterArgs from basic CLI arguments."""
+        args = SimpleNamespace(
+            host="0.0.0.0",
+            port=30001,
+            worker_urls=["http://worker1:8000", "http://worker2:8000"],
+            policy="round_robin",
+            prefill=None,
+            decode=None,
+            router_policy="round_robin",
+            router_pd_disaggregation=False,
+            router_prefill_policy=None,
+            router_decode_policy=None,
+            router_worker_startup_timeout_secs=300,
+            router_worker_startup_check_interval=15,
+            router_cache_threshold=0.7,
+            router_balance_abs_threshold=128,
+            router_balance_rel_threshold=2.0,
+            router_eviction_interval=180,
+            router_max_tree_size=2**28,
+            router_max_payload_size=1024 * 1024 * 1024,  # 1GB
+            router_dp_aware=True,
+            router_api_key="test-key",
+            router_log_dir="/tmp/logs",
+            router_log_level="debug",
+            router_service_discovery=True,
+            router_selector=["app=worker", "env=test"],
+            router_service_discovery_port=8080,
+            router_service_discovery_namespace="default",
+            router_prefill_selector=["app=prefill"],
+            router_decode_selector=["app=decode"],
+            router_prometheus_port=29000,
+            router_prometheus_host="0.0.0.0",
+            router_request_id_headers=["x-request-id", "x-trace-id"],
+            router_request_timeout_secs=1200,
+            router_max_concurrent_requests=512,
+            router_queue_size=200,
+            router_queue_timeout_secs=120,
+            router_rate_limit_tokens_per_second=100,
+            router_cors_allowed_origins=["http://localhost:3000"],
+            router_retry_max_retries=3,
+            router_retry_initial_backoff_ms=100,
+            router_retry_max_backoff_ms=10000,
+            router_retry_backoff_multiplier=2.0,
+            router_retry_jitter_factor=0.1,
+            router_cb_failure_threshold=5,
+            router_cb_success_threshold=2,
+            router_cb_timeout_duration_secs=30,
+            router_cb_window_duration_secs=60,
+            router_disable_retries=False,
+            router_disable_circuit_breaker=False,
+            router_health_failure_threshold=2,
+            router_health_success_threshold=1,
+            router_health_check_timeout_secs=3,
+            router_health_check_interval_secs=30,
+            router_health_check_endpoint="/healthz",
+        )
+
+        router_args = RouterArgs.from_cli_args(args, use_router_prefix=True)
+
+        # Test basic configuration
+        assert router_args.host == "0.0.0.0"
+        assert router_args.port == 30001
+        assert router_args.worker_urls == ["http://worker1:8000", "http://worker2:8000"]
+        assert router_args.policy == "round_robin"
+
+        # Test PD configuration
+        assert router_args.pd_disaggregation is False
+        assert router_args.prefill_urls == []
+        assert router_args.decode_urls == []
+
+        # Test service discovery
+        assert router_args.service_discovery is True
+        assert router_args.selector == {"app": "worker", "env": "test"}
+        assert router_args.service_discovery_port == 8080
+        assert router_args.service_discovery_namespace == "default"
+        assert router_args.prefill_selector == {"app": "prefill"}
+        assert router_args.decode_selector == {"app": "decode"}
+
+        # Test other configurations
+        assert router_args.dp_aware is True
+        assert router_args.api_key == "test-key"
+        assert router_args.log_dir == "/tmp/logs"
+        assert router_args.log_level == "debug"
+        assert router_args.prometheus_port == 29000
+        assert router_args.prometheus_host == "0.0.0.0"
+        assert router_args.request_id_headers == ["x-request-id", "x-trace-id"]
+        assert router_args.request_timeout_secs == 1200
+        assert router_args.max_concurrent_requests == 512
+        assert router_args.queue_size == 200
+        assert router_args.queue_timeout_secs == 120
+        assert router_args.rate_limit_tokens_per_second == 100
+        assert router_args.cors_allowed_origins == ["http://localhost:3000"]
+
+        # Test retry configuration
+        assert router_args.retry_max_retries == 3
+        assert router_args.retry_initial_backoff_ms == 100
+        assert router_args.retry_max_backoff_ms == 10000
+        assert router_args.retry_backoff_multiplier == 2.0
+        assert router_args.retry_jitter_factor == 0.1
+
+        # Test circuit breaker configuration
+        assert router_args.cb_failure_threshold == 5
+        assert router_args.cb_success_threshold == 2
+        assert router_args.cb_timeout_duration_secs == 30
+        assert router_args.cb_window_duration_secs == 60
+        assert router_args.disable_retries is False
+        assert router_args.disable_circuit_breaker is False
+
+        # Test health check configuration
+        assert router_args.health_failure_threshold == 2
+        assert router_args.health_success_threshold == 1
+        assert router_args.health_check_timeout_secs == 3
+        assert router_args.health_check_interval_secs == 30
+        assert router_args.health_check_endpoint == "/healthz"
+
+        # Note: model_path and tokenizer_path are not available in current RouterArgs
+
+    def test_from_cli_args_pd_mode(self):
+        """Test creating RouterArgs from CLI arguments in PD mode."""
+        args = SimpleNamespace(
+            host="127.0.0.1",
+            port=30000,
+            worker_urls=[],
+            policy="cache_aware",
+            prefill=[
+                ["http://prefill1:8000", "9000"],
+                ["http://prefill2:8000", "none"],
+            ],
+            decode=[["http://decode1:8001"], ["http://decode2:8001"]],
+            router_prefill=[
+                ["http://prefill1:8000", "9000"],
+                ["http://prefill2:8000", "none"],
+            ],
+            router_decode=[["http://decode1:8001"], ["http://decode2:8001"]],
+            router_policy="cache_aware",
+            router_pd_disaggregation=True,
+            router_prefill_policy="power_of_two",
+            router_decode_policy="round_robin",
+            # Include all required fields with defaults
+            router_worker_startup_timeout_secs=600,
+            router_worker_startup_check_interval=30,
+            router_cache_threshold=0.3,
+            router_balance_abs_threshold=64,
+            router_balance_rel_threshold=1.5,
+            router_eviction_interval=120,
+            router_max_tree_size=2**26,
+            router_max_payload_size=512 * 1024 * 1024,
+            router_dp_aware=False,
+            router_api_key=None,
+            router_log_dir=None,
+            router_log_level=None,
+            router_service_discovery=False,
+            router_selector=None,
+            router_service_discovery_port=80,
+            router_service_discovery_namespace=None,
+            router_prefill_selector=None,
+            router_decode_selector=None,
+            router_prometheus_port=None,
+            router_prometheus_host=None,
+            router_request_id_headers=None,
+            router_request_timeout_secs=1800,
+            router_max_concurrent_requests=256,
+            router_queue_size=100,
+            router_queue_timeout_secs=60,
+            router_rate_limit_tokens_per_second=None,
+            router_cors_allowed_origins=[],
+            router_retry_max_retries=5,
+            router_retry_initial_backoff_ms=50,
+            router_retry_max_backoff_ms=30000,
+            router_retry_backoff_multiplier=1.5,
+            router_retry_jitter_factor=0.2,
+            router_cb_failure_threshold=10,
+            router_cb_success_threshold=3,
+            router_cb_timeout_duration_secs=60,
+            router_cb_window_duration_secs=120,
+            router_disable_retries=False,
+            router_disable_circuit_breaker=False,
+            router_health_failure_threshold=3,
+            router_health_success_threshold=2,
+            router_health_check_timeout_secs=5,
+            router_health_check_interval_secs=60,
+            router_health_check_endpoint="/health",
+        )
+
+        router_args = RouterArgs.from_cli_args(args, use_router_prefix=True)
+
+        # Test PD configuration
+        assert router_args.pd_disaggregation is True
+        assert router_args.prefill_urls == [
+            ("http://prefill1:8000", 9000),
+            ("http://prefill2:8000", None),
+        ]
+        assert router_args.decode_urls == ["http://decode1:8001", "http://decode2:8001"]
+        assert router_args.prefill_policy == "power_of_two"
+        assert router_args.decode_policy == "round_robin"
+        assert router_args.policy == "cache_aware"  # Main policy still set
+
+    def test_from_cli_args_without_prefix(self):
+        """Test creating RouterArgs from CLI arguments without router prefix."""
+        args = SimpleNamespace(
+            host="127.0.0.1",
+            port=30000,
+            worker_urls=["http://worker1:8000"],
+            policy="random",
+            prefill=None,
+            decode=None,
+            pd_disaggregation=False,
+            prefill_policy=None,
+            decode_policy=None,
+            worker_startup_timeout_secs=600,
+            worker_startup_check_interval=30,
+            cache_threshold=0.3,
+            balance_abs_threshold=64,
+            balance_rel_threshold=1.5,
+            eviction_interval=120,
+            max_tree_size=2**26,
+            max_payload_size=512 * 1024 * 1024,
+            dp_aware=False,
+            api_key=None,
+            log_dir=None,
+            log_level=None,
+            service_discovery=False,
+            selector=None,
+            service_discovery_port=80,
+            service_discovery_namespace=None,
+            prefill_selector=None,
+            decode_selector=None,
+            prometheus_port=None,
+            prometheus_host=None,
+            request_id_headers=None,
+            request_timeout_secs=1800,
+            max_concurrent_requests=256,
+            queue_size=100,
+            queue_timeout_secs=60,
+            rate_limit_tokens_per_second=None,
+            cors_allowed_origins=[],
+            retry_max_retries=5,
+            retry_initial_backoff_ms=50,
+            retry_max_backoff_ms=30000,
+            retry_backoff_multiplier=1.5,
+            retry_jitter_factor=0.2,
+            cb_failure_threshold=10,
+            cb_success_threshold=3,
+            cb_timeout_duration_secs=60,
+            cb_window_duration_secs=120,
+            disable_retries=False,
+            disable_circuit_breaker=False,
+            health_failure_threshold=3,
+            health_success_threshold=2,
+            health_check_timeout_secs=5,
+            health_check_interval_secs=60,
+            health_check_endpoint="/health",
+            model_path=None,
+            tokenizer_path=None,
+        )
+
+        router_args = RouterArgs.from_cli_args(args, use_router_prefix=False)
+
+        assert router_args.host == "127.0.0.1"
+        assert router_args.port == 30000
+        assert router_args.worker_urls == ["http://worker1:8000"]
+        assert router_args.policy == "random"
+        assert router_args.pd_disaggregation is False
+
+
+class TestPolicyFromStr:
+    """Test policy string to enum conversion."""
+
+    def test_valid_policies(self):
+        """Test conversion of valid policy strings."""
+        from sglang_router_rs import PolicyType
+
+        assert policy_from_str("random") == PolicyType.Random
+        assert policy_from_str("round_robin") == PolicyType.RoundRobin
+        assert policy_from_str("cache_aware") == PolicyType.CacheAware
+        assert policy_from_str("power_of_two") == PolicyType.PowerOfTwo
+
+    def test_invalid_policy(self):
+        """Test conversion of invalid policy string."""
+        with pytest.raises(KeyError):
+            policy_from_str("invalid_policy")
+
+
+class TestParseRouterArgs:
+    """Test the parse_router_args function."""
+
+    def test_parse_basic_args(self):
+        """Test parsing basic router arguments."""
+        args = [
+            "--host",
+            "0.0.0.0",
+            "--port",
+            "30001",
+            "--worker-urls",
+            "http://worker1:8000",
+            "http://worker2:8000",
+            "--policy",
+            "round_robin",
+        ]
+
+        router_args = parse_router_args(args)
+
+        assert router_args.host == "0.0.0.0"
+        assert router_args.port == 30001
+        assert router_args.worker_urls == ["http://worker1:8000", "http://worker2:8000"]
+        assert router_args.policy == "round_robin"
+
+    def test_parse_pd_args(self):
+        """Test parsing PD disaggregated mode arguments."""
+        args = [
+            "--pd-disaggregation",
+            "--prefill",
+            "http://prefill1:8000",
+            "9000",
+            "--prefill",
+            "http://prefill2:8000",
+            "none",
+            "--decode",
+            "http://decode1:8001",
+            "--decode",
+            "http://decode2:8001",
+            "--prefill-policy",
+            "power_of_two",
+            "--decode-policy",
+            "round_robin",
+        ]
+
+        router_args = parse_router_args(args)
+
+        assert router_args.pd_disaggregation is True
+        assert router_args.prefill_urls == [
+            ("http://prefill1:8000", 9000),
+            ("http://prefill2:8000", None),
+        ]
+        assert router_args.decode_urls == ["http://decode1:8001", "http://decode2:8001"]
+        assert router_args.prefill_policy == "power_of_two"
+        assert router_args.decode_policy == "round_robin"
+
+    def test_parse_service_discovery_args(self):
+        """Test parsing service discovery arguments."""
+        args = [
+            "--service-discovery",
+            "--selector",
+            "app=worker",
+            "env=prod",
+            "--service-discovery-port",
+            "8080",
+            "--service-discovery-namespace",
+            "default",
+        ]
+
+        router_args = parse_router_args(args)
+
+        assert router_args.service_discovery is True
+        assert router_args.selector == {"app": "worker", "env": "prod"}
+        assert router_args.service_discovery_port == 8080
+        assert router_args.service_discovery_namespace == "default"
+
+    def test_parse_retry_and_circuit_breaker_args(self):
+        """Test parsing retry and circuit breaker arguments."""
+        args = [
+            "--retry-max-retries",
+            "3",
+            "--retry-initial-backoff-ms",
+            "100",
+            "--retry-max-backoff-ms",
+            "10000",
+            "--retry-backoff-multiplier",
+            "2.0",
+            "--retry-jitter-factor",
+            "0.1",
+            "--disable-retries",
+            "--cb-failure-threshold",
+            "5",
+            "--cb-success-threshold",
+            "2",
+            "--cb-timeout-duration-secs",
+            "30",
+            "--cb-window-duration-secs",
+            "60",
+            "--disable-circuit-breaker",
+        ]
+
+        router_args = parse_router_args(args)
+
+        # Test retry configuration
+        assert router_args.retry_max_retries == 3
+        assert router_args.retry_initial_backoff_ms == 100
+        assert router_args.retry_max_backoff_ms == 10000
+        assert router_args.retry_backoff_multiplier == 2.0
+        assert router_args.retry_jitter_factor == 0.1
+        assert router_args.disable_retries is True
+
+        # Test circuit breaker configuration
+        assert router_args.cb_failure_threshold == 5
+        assert router_args.cb_success_threshold == 2
+        assert router_args.cb_timeout_duration_secs == 30
+        assert router_args.cb_window_duration_secs == 60
+        assert router_args.disable_circuit_breaker is True
+
+    def test_parse_rate_limiting_args(self):
+        """Test parsing rate limiting arguments."""
+        args = [
+            "--max-concurrent-requests",
+            "512",
+            "--queue-size",
+            "200",
+            "--queue-timeout-secs",
+            "120",
+            "--rate-limit-tokens-per-second",
+            "100",
+        ]
+
+        router_args = parse_router_args(args)
+
+        assert router_args.max_concurrent_requests == 512
+        assert router_args.queue_size == 200
+        assert router_args.queue_timeout_secs == 120
+        assert router_args.rate_limit_tokens_per_second == 100
+
+    def test_parse_health_check_args(self):
+        """Test parsing health check arguments."""
+        args = [
+            "--health-failure-threshold",
+            "2",
+            "--health-success-threshold",
+            "1",
+            "--health-check-timeout-secs",
+            "3",
+            "--health-check-interval-secs",
+            "30",
+            "--health-check-endpoint",
+            "/healthz",
+        ]
+
+        router_args = parse_router_args(args)
+
+        assert router_args.health_failure_threshold == 2
+        assert router_args.health_success_threshold == 1
+        assert router_args.health_check_timeout_secs == 3
+        assert router_args.health_check_interval_secs == 30
+        assert router_args.health_check_endpoint == "/healthz"
+
+    def test_parse_cors_args(self):
+        """Test parsing CORS arguments."""
+        args = [
+            "--cors-allowed-origins",
+            "http://localhost:3000",
+            "https://example.com",
+        ]
+
+        router_args = parse_router_args(args)
+
+        assert router_args.cors_allowed_origins == [
+            "http://localhost:3000",
+            "https://example.com",
+        ]
+
+    def test_parse_tokenizer_args(self):
+        """Test parsing tokenizer arguments."""
+        # Note: model-path and tokenizer-path arguments are not available in current implementation
+        # This test is skipped until those arguments are added
+        pytest.skip("Tokenizer arguments not available in current implementation")
+
+    def test_parse_invalid_args(self):
+        """Test parsing invalid arguments."""
+        # Test invalid policy
+        with pytest.raises(SystemExit):
+            parse_router_args(["--policy", "invalid_policy"])
+
+        # Test invalid bootstrap port
+        with pytest.raises(ValueError, match="Invalid bootstrap port"):
+            parse_router_args(
+                [
+                    "--pd-disaggregation",
+                    "--prefill",
+                    "http://prefill1:8000",
+                    "invalid_port",
+                ]
+            )
+
+    def test_help_output(self):
+        """Test that help output is generated correctly."""
+        with pytest.raises(SystemExit) as exc_info:
+            parse_router_args(["--help"])
+
+        # SystemExit with code 0 indicates help was displayed
+        assert exc_info.value.code == 0
diff --git a/sgl-router/py_test/unit/test_router_config.py b/sgl-router/py_test/unit/test_router_config.py
new file mode 100644
index 000000000..ed0d9db4b
--- /dev/null
+++ b/sgl-router/py_test/unit/test_router_config.py
@@ -0,0 +1,421 @@
+"""
+Unit tests for router configuration validation and setup.
+
+These tests focus on testing the router configuration logic in isolation,
+including validation of configuration parameters and their interactions.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+from sglang_router.launch_router import RouterArgs, launch_router
+from sglang_router.router import policy_from_str
+from sglang_router_rs import PolicyType
+
+
+class TestRouterConfigValidation:
+    """Test router configuration validation logic."""
+
+    def test_valid_basic_config(self):
+        """Test that a valid basic configuration passes validation."""
+        args = RouterArgs(
+            host="127.0.0.1",
+            port=30000,
+            worker_urls=["http://worker1:8000", "http://worker2:8000"],
+            policy="cache_aware",
+        )
+
+        # Should not raise any exceptions
+        assert args.host == "127.0.0.1"
+        assert args.port == 30000
+        assert args.worker_urls == ["http://worker1:8000", "http://worker2:8000"]
+        assert args.policy == "cache_aware"
+
+    def test_valid_pd_config(self):
+        """Test that a valid PD configuration passes validation."""
+        args = RouterArgs(
+            host="127.0.0.1",
+            port=30000,
+            pd_disaggregation=True,
+            prefill_urls=[
+                ("http://prefill1:8000", 9000),
+                ("http://prefill2:8000", None),
+            ],
+            decode_urls=["http://decode1:8001", "http://decode2:8001"],
+            policy="cache_aware",
+        )
+
+        assert args.pd_disaggregation is True
+        assert args.prefill_urls == [
+            ("http://prefill1:8000", 9000),
+            ("http://prefill2:8000", None),
+        ]
+        assert args.decode_urls == ["http://decode1:8001", "http://decode2:8001"]
+        assert args.policy == "cache_aware"
+
+    def test_pd_config_without_urls_raises_error(self):
+        """Test that PD mode without URLs raises validation error."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[],
+            decode_urls=[],
+            service_discovery=False,
+        )
+
+        # This should raise an error when trying to launch
+        with pytest.raises(
+            ValueError, match="PD disaggregation mode requires --prefill"
+        ):
+            launch_router(args)
+
+    def test_pd_config_with_service_discovery_allows_empty_urls(self):
+        """Test that PD mode with service discovery allows empty URLs."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[],
+            decode_urls=[],
+            service_discovery=True,
+        )
+
+        # Should not raise validation error when service discovery is enabled
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_regular_mode_without_workers_allows_empty_urls(self):
+        """Test that regular mode allows empty worker URLs."""
+        args = RouterArgs(worker_urls=[], service_discovery=False)
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_cache_threshold_validation(self):
+        """Test cache threshold validation."""
+        # Valid cache threshold
+        args = RouterArgs(cache_threshold=0.5)
+        assert args.cache_threshold == 0.5
+
+        # Edge cases
+        args = RouterArgs(cache_threshold=0.0)
+        assert args.cache_threshold == 0.0
+
+        args = RouterArgs(cache_threshold=1.0)
+        assert args.cache_threshold == 1.0
+
+    def test_balance_threshold_validation(self):
+        """Test load balancing threshold validation."""
+        # Valid thresholds
+        args = RouterArgs(balance_abs_threshold=64, balance_rel_threshold=1.5)
+        assert args.balance_abs_threshold == 64
+        assert args.balance_rel_threshold == 1.5
+
+        # Edge cases
+        args = RouterArgs(balance_abs_threshold=0, balance_rel_threshold=1.0)
+        assert args.balance_abs_threshold == 0
+        assert args.balance_rel_threshold == 1.0
+
+    def test_timeout_validation(self):
+        """Test timeout parameter validation."""
+        # Valid timeouts
+        args = RouterArgs(
+            worker_startup_timeout_secs=600,
+            worker_startup_check_interval=30,
+            request_timeout_secs=1800,
+            queue_timeout_secs=60,
+        )
+        assert args.worker_startup_timeout_secs == 600
+        assert args.worker_startup_check_interval == 30
+        assert args.request_timeout_secs == 1800
+        assert args.queue_timeout_secs == 60
+
+    def test_retry_config_validation(self):
+        """Test retry configuration validation."""
+        # Valid retry config
+        args = RouterArgs(
+            retry_max_retries=5,
+            retry_initial_backoff_ms=50,
+            retry_max_backoff_ms=30000,
+            retry_backoff_multiplier=1.5,
+            retry_jitter_factor=0.2,
+            disable_retries=False,
+        )
+        assert args.retry_max_retries == 5
+        assert args.retry_initial_backoff_ms == 50
+        assert args.retry_max_backoff_ms == 30000
+        assert args.retry_backoff_multiplier == 1.5
+        assert args.retry_jitter_factor == 0.2
+        assert args.disable_retries is False
+
+    def test_circuit_breaker_config_validation(self):
+        """Test circuit breaker configuration validation."""
+        # Valid circuit breaker config
+        args = RouterArgs(
+            cb_failure_threshold=10,
+            cb_success_threshold=3,
+            cb_timeout_duration_secs=60,
+            cb_window_duration_secs=120,
+            disable_circuit_breaker=False,
+        )
+        assert args.cb_failure_threshold == 10
+        assert args.cb_success_threshold == 3
+        assert args.cb_timeout_duration_secs == 60
+        assert args.cb_window_duration_secs == 120
+        assert args.disable_circuit_breaker is False
+
+    def test_health_check_config_validation(self):
+        """Test health check configuration validation."""
+        # Valid health check config
+        args = RouterArgs(
+            health_failure_threshold=3,
+            health_success_threshold=2,
+            health_check_timeout_secs=5,
+            health_check_interval_secs=60,
+            health_check_endpoint="/health",
+        )
+        assert args.health_failure_threshold == 3
+        assert args.health_success_threshold == 2
+        assert args.health_check_timeout_secs == 5
+        assert args.health_check_interval_secs == 60
+        assert args.health_check_endpoint == "/health"
+
+    def test_rate_limiting_config_validation(self):
+        """Test rate limiting configuration validation."""
+        # Valid rate limiting config
+        args = RouterArgs(
+            max_concurrent_requests=256,
+            queue_size=100,
+            queue_timeout_secs=60,
+            rate_limit_tokens_per_second=100,
+        )
+        assert args.max_concurrent_requests == 256
+        assert args.queue_size == 100
+        assert args.queue_timeout_secs == 60
+        assert args.rate_limit_tokens_per_second == 100
+
+    def test_service_discovery_config_validation(self):
+        """Test service discovery configuration validation."""
+        # Valid service discovery config
+        args = RouterArgs(
+            service_discovery=True,
+            selector={"app": "worker", "env": "prod"},
+            service_discovery_port=8080,
+            service_discovery_namespace="default",
+        )
+        assert args.service_discovery is True
+        assert args.selector == {"app": "worker", "env": "prod"}
+        assert args.service_discovery_port == 8080
+        assert args.service_discovery_namespace == "default"
+
+    def test_pd_service_discovery_config_validation(self):
+        """Test PD service discovery configuration validation."""
+        # Valid PD service discovery config
+        args = RouterArgs(
+            pd_disaggregation=True,
+            service_discovery=True,
+            prefill_selector={"app": "prefill"},
+            decode_selector={"app": "decode"},
+            bootstrap_port_annotation="sglang.ai/bootstrap-port",
+        )
+        assert args.pd_disaggregation is True
+        assert args.service_discovery is True
+        assert args.prefill_selector == {"app": "prefill"}
+        assert args.decode_selector == {"app": "decode"}
+        assert args.bootstrap_port_annotation == "sglang.ai/bootstrap-port"
+
+    def test_prometheus_config_validation(self):
+        """Test Prometheus configuration validation."""
+        # Valid Prometheus config
+        args = RouterArgs(prometheus_port=29000, prometheus_host="127.0.0.1")
+        assert args.prometheus_port == 29000
+        assert args.prometheus_host == "127.0.0.1"
+
+    def test_cors_config_validation(self):
+        """Test CORS configuration validation."""
+        # Valid CORS config
+        args = RouterArgs(
+            cors_allowed_origins=["http://localhost:3000", "https://example.com"]
+        )
+        assert args.cors_allowed_origins == [
+            "http://localhost:3000",
+            "https://example.com",
+        ]
+
+    def test_tokenizer_config_validation(self):
+        """Test tokenizer configuration validation."""
+        # Note: model_path and tokenizer_path are not available in current RouterArgs
+        pytest.skip("Tokenizer configuration not available in current implementation")
+
+    def test_dp_aware_config_validation(self):
+        """Test data parallelism aware configuration validation."""
+        # Valid DP aware config
+        args = RouterArgs(dp_aware=True, api_key="test-api-key")
+        assert args.dp_aware is True
+        assert args.api_key == "test-api-key"
+
+    def test_request_id_headers_validation(self):
+        """Test request ID headers configuration validation."""
+        # Valid request ID headers config
+        args = RouterArgs(
+            request_id_headers=["x-request-id", "x-trace-id", "x-correlation-id"]
+        )
+        assert args.request_id_headers == [
+            "x-request-id",
+            "x-trace-id",
+            "x-correlation-id",
+        ]
+
+    def test_policy_consistency_validation(self):
+        """Test policy consistency validation in PD mode."""
+        # Test with both prefill and decode policies specified
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", None)],
+            decode_urls=["http://decode1:8001"],
+            policy="cache_aware",
+            prefill_policy="power_of_two",
+            decode_policy="round_robin",
+        )
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_policy_fallback_validation(self):
+        """Test policy fallback validation in PD mode."""
+        # Test with only prefill policy specified
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", None)],
+            decode_urls=["http://decode1:8001"],
+            policy="cache_aware",
+            prefill_policy="power_of_two",
+            decode_policy=None,
+        )
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_policy_enum_conversion(self):
+        """Test policy string to enum conversion."""
+        # Test all valid policy conversions
+        assert policy_from_str("random") == PolicyType.Random
+        assert policy_from_str("round_robin") == PolicyType.RoundRobin
+        assert policy_from_str("cache_aware") == PolicyType.CacheAware
+        assert policy_from_str("power_of_two") == PolicyType.PowerOfTwo
+
+    def test_invalid_policy_enum_conversion(self):
+        """Test invalid policy string to enum conversion."""
+        with pytest.raises(KeyError):
+            policy_from_str("invalid_policy")
+
+    def test_config_immutability(self):
+        """Test that configuration objects are properly immutable."""
+        args = RouterArgs(
+            host="127.0.0.1", port=30000, worker_urls=["http://worker1:8000"]
+        )
+
+        # Test that we can't modify the configuration after creation
+        # (This is more of a design test - dataclasses are mutable by default)
+        original_host = args.host
+        args.host = "0.0.0.0"
+        assert args.host == "0.0.0.0"  # Dataclasses are mutable
+        assert args.host != original_host
+
+    def test_config_defaults_consistency(self):
+        """Test that configuration defaults are consistent."""
+        args1 = RouterArgs()
+        args2 = RouterArgs()
+
+        # Both instances should have the same defaults
+        assert args1.host == args2.host
+        assert args1.port == args2.port
+        assert args1.policy == args2.policy
+        assert args1.worker_urls == args2.worker_urls
+        assert args1.pd_disaggregation == args2.pd_disaggregation
+
+    def test_config_serialization(self):
+        """Test that configuration can be serialized/deserialized."""
+        args = RouterArgs(
+            host="127.0.0.1",
+            port=30000,
+            worker_urls=["http://worker1:8000"],
+            policy="cache_aware",
+            cache_threshold=0.5,
+        )
+
+        # Test that we can access all attributes
+        assert hasattr(args, "host")
+        assert hasattr(args, "port")
+        assert hasattr(args, "worker_urls")
+        assert hasattr(args, "policy")
+        assert hasattr(args, "cache_threshold")
+
+    def test_config_with_none_values(self):
+        """Test configuration with None values."""
+        args = RouterArgs(
+            api_key=None,
+            log_dir=None,
+            log_level=None,
+            prometheus_port=None,
+            prometheus_host=None,
+            request_id_headers=None,
+            rate_limit_tokens_per_second=None,
+            service_discovery_namespace=None,
+        )
+
+        # All None values should be preserved
+        assert args.api_key is None
+        assert args.log_dir is None
+        assert args.log_level is None
+        assert args.prometheus_port is None
+        assert args.prometheus_host is None
+        assert args.request_id_headers is None
+        assert args.rate_limit_tokens_per_second is None
+        assert args.service_discovery_namespace is None
+
+    def test_config_with_empty_lists(self):
+        """Test configuration with empty lists."""
+        args = RouterArgs(
+            worker_urls=[], prefill_urls=[], decode_urls=[], cors_allowed_origins=[]
+        )
+
+        # All empty lists should be preserved
+        assert args.worker_urls == []
+        assert args.prefill_urls == []
+        assert args.decode_urls == []
+        assert args.cors_allowed_origins == []
+
+    def test_config_with_empty_dicts(self):
+        """Test configuration with empty dictionaries."""
+        args = RouterArgs(selector={}, prefill_selector={}, decode_selector={})
+
+        # All empty dictionaries should be preserved
+        assert args.selector == {}
+        assert args.prefill_selector == {}
+        assert args.decode_selector == {}
diff --git a/sgl-router/py_test/unit/test_startup_sequence.py b/sgl-router/py_test/unit/test_startup_sequence.py
new file mode 100644
index 000000000..133c7eb16
--- /dev/null
+++ b/sgl-router/py_test/unit/test_startup_sequence.py
@@ -0,0 +1,1053 @@
+"""
+Unit tests for startup sequence logic in sglang_router.
+
+These tests focus on testing the startup sequence logic in isolation,
+including router initialization, configuration validation, and startup flow.
+"""
+
+import logging
+from types import SimpleNamespace
+from unittest.mock import MagicMock, call, patch
+
+import pytest
+from sglang_router.launch_router import RouterArgs, launch_router
+from sglang_router.router import policy_from_str
+
+
+# Local helper mirroring the router logger setup used in production
+def setup_logger():
+    logger = logging.getLogger("router")
+    logger.setLevel(logging.INFO)
+    if not logger.handlers:
+        formatter = logging.Formatter(
+            "[Router (Python)] %(asctime)s - %(levelname)s - %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+        handler = logging.StreamHandler()
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+    return logger
+
+
+from sglang_router_rs import PolicyType
+
+
+class TestSetupLogger:
+    """Test logger setup functionality."""
+
+    def test_setup_logger_returns_logger(self):
+        """Test that setup_logger returns a logger instance."""
+        logger = setup_logger()
+
+        assert isinstance(logger, logging.Logger)
+        assert logger.name == "router"
+        assert logger.level == logging.INFO
+
+    def test_setup_logger_has_handler(self):
+        """Test that setup_logger configures a handler."""
+        logger = setup_logger()
+
+        assert len(logger.handlers) > 0
+        handler = logger.handlers[0]
+        assert isinstance(handler, logging.StreamHandler)
+
+    def test_setup_logger_has_formatter(self):
+        """Test that setup_logger configures a formatter."""
+        logger = setup_logger()
+
+        handler = logger.handlers[0]
+        formatter = handler.formatter
+
+        assert formatter is not None
+        assert "[Router (Python)]" in formatter._fmt
+
+    def test_setup_logger_multiple_calls(self):
+        """Test that multiple calls to setup_logger work correctly."""
+        logger1 = setup_logger()
+        logger2 = setup_logger()
+
+        # Should return the same logger instance
+        assert logger1 is logger2
+
+
+class TestPolicyFromStr:
+    """Test policy string to enum conversion in startup context."""
+
+    def test_policy_conversion_in_startup(self):
+        """Test policy conversion during startup sequence."""
+        # Test all valid policies
+        policies = ["random", "round_robin", "cache_aware", "power_of_two"]
+        expected_enums = [
+            PolicyType.Random,
+            PolicyType.RoundRobin,
+            PolicyType.CacheAware,
+            PolicyType.PowerOfTwo,
+        ]
+
+        for policy_str, expected_enum in zip(policies, expected_enums):
+            result = policy_from_str(policy_str)
+            assert result == expected_enum
+
+    def test_invalid_policy_in_startup(self):
+        """Test handling of invalid policy during startup."""
+        with pytest.raises(KeyError):
+            policy_from_str("invalid_policy")
+
+
+class TestRouterInitialization:
+    """Test router initialization logic."""
+
+    def test_router_initialization_basic(self):
+        """Test basic router initialization."""
+        args = RouterArgs(
+            host="127.0.0.1",
+            port=30000,
+            worker_urls=["http://worker1:8000"],
+            policy="cache_aware",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                # capture needed fields from RouterArgs
+                captured_args.update(
+                    dict(
+                        host=router_args.host,
+                        port=router_args.port,
+                        worker_urls=router_args.worker_urls,
+                        policy=policy_from_str(router_args.policy),
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify Router.from_args was called and captured fields match
+            router_mod.from_args.assert_called_once()
+            assert captured_args["host"] == "127.0.0.1"
+            assert captured_args["port"] == 30000
+            assert captured_args["worker_urls"] == ["http://worker1:8000"]
+            assert captured_args["policy"] == PolicyType.CacheAware
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_pd_mode(self):
+        """Test router initialization in PD mode."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", 9000)],
+            decode_urls=["http://decode1:8001"],
+            policy="power_of_two",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        pd_disaggregation=router_args.pd_disaggregation,
+                        prefill_urls=router_args.prefill_urls,
+                        decode_urls=router_args.decode_urls,
+                        policy=policy_from_str(router_args.policy),
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify Router.from_args was called with PD parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["pd_disaggregation"] is True
+            assert captured_args["prefill_urls"] == [("http://prefill1:8000", 9000)]
+            assert captured_args["decode_urls"] == ["http://decode1:8001"]
+            assert captured_args["policy"] == PolicyType.PowerOfTwo
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_service_discovery(self):
+        """Test router initialization with service discovery."""
+        args = RouterArgs(
+            service_discovery=True,
+            selector={"app": "worker", "env": "prod"},
+            service_discovery_port=8080,
+            service_discovery_namespace="default",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        service_discovery=router_args.service_discovery,
+                        selector=router_args.selector,
+                        service_discovery_port=router_args.service_discovery_port,
+                        service_discovery_namespace=router_args.service_discovery_namespace,
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify Router.from_args was called with service discovery parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["service_discovery"] is True
+            assert captured_args["selector"] == {"app": "worker", "env": "prod"}
+            assert captured_args["service_discovery_port"] == 8080
+            assert captured_args["service_discovery_namespace"] == "default"
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_retry_config(self):
+        """Test router initialization with retry configuration."""
+        args = RouterArgs(
+            retry_max_retries=3,
+            retry_initial_backoff_ms=100,
+            retry_max_backoff_ms=10000,
+            retry_backoff_multiplier=2.0,
+            retry_jitter_factor=0.1,
+            disable_retries=False,
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        retry_max_retries=router_args.retry_max_retries,
+                        retry_initial_backoff_ms=router_args.retry_initial_backoff_ms,
+                        retry_max_backoff_ms=router_args.retry_max_backoff_ms,
+                        retry_backoff_multiplier=router_args.retry_backoff_multiplier,
+                        retry_jitter_factor=router_args.retry_jitter_factor,
+                        disable_retries=router_args.disable_retries,
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify router was created with retry parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["retry_max_retries"] == 3
+            assert captured_args["retry_initial_backoff_ms"] == 100
+            assert captured_args["retry_max_backoff_ms"] == 10000
+            assert captured_args["retry_backoff_multiplier"] == 2.0
+            assert captured_args["retry_jitter_factor"] == 0.1
+            assert captured_args["disable_retries"] is False
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_circuit_breaker_config(self):
+        """Test router initialization with circuit breaker configuration."""
+        args = RouterArgs(
+            cb_failure_threshold=5,
+            cb_success_threshold=2,
+            cb_timeout_duration_secs=30,
+            cb_window_duration_secs=60,
+            disable_circuit_breaker=False,
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        cb_failure_threshold=router_args.cb_failure_threshold,
+                        cb_success_threshold=router_args.cb_success_threshold,
+                        cb_timeout_duration_secs=router_args.cb_timeout_duration_secs,
+                        cb_window_duration_secs=router_args.cb_window_duration_secs,
+                        disable_circuit_breaker=router_args.disable_circuit_breaker,
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify router was created with circuit breaker parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["cb_failure_threshold"] == 5
+            assert captured_args["cb_success_threshold"] == 2
+            assert captured_args["cb_timeout_duration_secs"] == 30
+            assert captured_args["cb_window_duration_secs"] == 60
+            assert captured_args["disable_circuit_breaker"] is False
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_rate_limiting_config(self):
+        """Test router initialization with rate limiting configuration."""
+        args = RouterArgs(
+            max_concurrent_requests=512,
+            queue_size=200,
+            queue_timeout_secs=120,
+            rate_limit_tokens_per_second=100,
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        max_concurrent_requests=router_args.max_concurrent_requests,
+                        queue_size=router_args.queue_size,
+                        queue_timeout_secs=router_args.queue_timeout_secs,
+                        rate_limit_tokens_per_second=router_args.rate_limit_tokens_per_second,
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify router was created with rate limiting parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["max_concurrent_requests"] == 512
+            assert captured_args["queue_size"] == 200
+            assert captured_args["queue_timeout_secs"] == 120
+            assert captured_args["rate_limit_tokens_per_second"] == 100
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_health_check_config(self):
+        """Test router initialization with health check configuration."""
+        args = RouterArgs(
+            health_failure_threshold=2,
+            health_success_threshold=1,
+            health_check_timeout_secs=3,
+            health_check_interval_secs=30,
+            health_check_endpoint="/healthz",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        health_failure_threshold=router_args.health_failure_threshold,
+                        health_success_threshold=router_args.health_success_threshold,
+                        health_check_timeout_secs=router_args.health_check_timeout_secs,
+                        health_check_interval_secs=router_args.health_check_interval_secs,
+                        health_check_endpoint=router_args.health_check_endpoint,
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify router was created with health check parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["health_failure_threshold"] == 2
+            assert captured_args["health_success_threshold"] == 1
+            assert captured_args["health_check_timeout_secs"] == 3
+            assert captured_args["health_check_interval_secs"] == 30
+            assert captured_args["health_check_endpoint"] == "/healthz"
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_prometheus_config(self):
+        """Test router initialization with Prometheus configuration."""
+        args = RouterArgs(prometheus_port=29000, prometheus_host="127.0.0.1")
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        prometheus_port=router_args.prometheus_port,
+                        prometheus_host=router_args.prometheus_host,
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify router was created with Prometheus parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["prometheus_port"] == 29000
+            assert captured_args["prometheus_host"] == "127.0.0.1"
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_cors_config(self):
+        """Test router initialization with CORS configuration."""
+        args = RouterArgs(
+            cors_allowed_origins=["http://localhost:3000", "https://example.com"]
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(cors_allowed_origins=router_args.cors_allowed_origins)
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify router was created with CORS parameters
+            router_mod.from_args.assert_called_once()
+            assert captured_args["cors_allowed_origins"] == [
+                "http://localhost:3000",
+                "https://example.com",
+            ]
+
+            # Verify router.start() was called
+            mock_router_instance.start.assert_called_once()
+
+            # Function returns None; ensure start was invoked
+
+    def test_router_initialization_with_tokenizer_config(self):
+        """Test router initialization with tokenizer configuration."""
+        # Note: model_path and tokenizer_path are not available in current RouterArgs
+        pytest.skip("Tokenizer configuration not available in current implementation")
+
+
+class TestStartupValidation:
+    """Test startup validation logic."""
+
+    def test_pd_mode_validation_during_startup(self):
+        """Test PD mode validation during startup."""
+        # PD mode without URLs should fail
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[],
+            decode_urls=[],
+            service_discovery=False,
+        )
+
+        with pytest.raises(
+            ValueError, match="PD disaggregation mode requires --prefill"
+        ):
+            launch_router(args)
+
+    def test_pd_mode_with_service_discovery_validation(self):
+        """Test PD mode with service discovery validation during startup."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[],
+            decode_urls=[],
+            service_discovery=True,
+        )
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            result = launch_router(args)
+
+            # Should create router instance
+            router_mod.from_args.assert_called_once()
+
+    def test_policy_warning_during_startup(self):
+        """Test policy warning during startup in PD mode."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", None)],
+            decode_urls=["http://decode1:8001"],
+            policy="cache_aware",
+            prefill_policy="power_of_two",
+            decode_policy="round_robin",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            # The policy messages are emitted by router_args logger
+            with patch("sglang_router.router_args.logger") as mock_logger:
+                result = launch_router(args)
+
+                # Should log warning about policy usage
+                mock_logger.warning.assert_called_once()
+                warning_call = mock_logger.warning.call_args[0][0]
+                assert (
+                    "Both --prefill-policy and --decode-policy are specified"
+                    in warning_call
+                )
+
+                # Should create router instance
+                router_mod.from_args.assert_called_once()
+
+    def test_policy_info_during_startup(self):
+        """Test policy info logging during startup in PD mode."""
+        # Test with only prefill policy specified
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", None)],
+            decode_urls=["http://decode1:8001"],
+            policy="cache_aware",
+            prefill_policy="power_of_two",
+            decode_policy=None,
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            # The policy messages are emitted by router_args logger
+            with patch("sglang_router.router_args.logger") as mock_logger:
+                result = launch_router(args)
+
+                # Should log info about policy usage
+                mock_logger.info.assert_called_once()
+                info_call = mock_logger.info.call_args[0][0]
+                assert "Using --prefill-policy 'power_of_two'" in info_call
+                assert "and --policy 'cache_aware'" in info_call
+
+                # Should create router instance
+                router_mod.from_args.assert_called_once()
+
+    def test_policy_info_decode_only_during_startup(self):
+        """Test policy info logging during startup with only decode policy specified."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", None)],
+            decode_urls=["http://decode1:8001"],
+            policy="cache_aware",
+            prefill_policy=None,
+            decode_policy="round_robin",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            # The policy messages are emitted by router_args logger
+            with patch("sglang_router.router_args.logger") as mock_logger:
+                result = launch_router(args)
+
+                # Should log info about policy usage
+                mock_logger.info.assert_called_once()
+                info_call = mock_logger.info.call_args[0][0]
+                assert "Using --policy 'cache_aware'" in info_call
+                assert "and --decode-policy 'round_robin'" in info_call
+
+                # Should create router instance
+                router_mod.from_args.assert_called_once()
+
+
+class TestStartupErrorHandling:
+    """Test startup error handling logic."""
+
+    def test_router_creation_error_handling(self):
+        """Test error handling when router creation fails."""
+        args = RouterArgs(
+            host="127.0.0.1", port=30000, worker_urls=["http://worker1:8000"]
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            # Simulate router creation failure in from_args
+            router_mod.from_args = MagicMock(
+                side_effect=Exception("Router creation failed")
+            )
+
+            with patch("sglang_router.launch_router.logger") as mock_logger:
+                with pytest.raises(Exception, match="Router creation failed"):
+                    launch_router(args)
+
+                # Should log error
+                mock_logger.error.assert_called_once()
+                error_call = mock_logger.error.call_args[0][0]
+                assert "Error starting router: Router creation failed" in error_call
+
+    def test_router_start_error_handling(self):
+        """Test error handling when router start fails."""
+        args = RouterArgs(
+            host="127.0.0.1", port=30000, worker_urls=["http://worker1:8000"]
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            # Simulate router start failure
+            mock_router_instance.start.side_effect = Exception("Router start failed")
+
+            with patch("sglang_router.launch_router.logger") as mock_logger:
+                with pytest.raises(Exception, match="Router start failed"):
+                    launch_router(args)
+
+                # Should log error
+                mock_logger.error.assert_called_once()
+                error_call = mock_logger.error.call_args[0][0]
+                assert "Error starting router: Router start failed" in error_call
+
+
+# --- Added unit tests for Router wrapper and launch_server helpers ---
+
+
+def _install_sglang_stubs(monkeypatch):
+    """Install lightweight stubs for sglang.srt to avoid heavy deps during unit tests."""
+    import sys
+    import types
+
+    sglang_mod = types.ModuleType("sglang")
+    srt_mod = types.ModuleType("sglang.srt")
+    entry_mod = types.ModuleType("sglang.srt.entrypoints")
+    http_server_mod = types.ModuleType("sglang.srt.entrypoints.http_server")
+    server_args_mod = types.ModuleType("sglang.srt.server_args")
+    utils_mod = types.ModuleType("sglang.srt.utils")
+
+    def launch_server(_args):
+        return None
+
+    class ServerArgs:
+        # Minimal fields used by launch_server_process
+        def __init__(self):
+            self.port = 0
+            self.base_gpu_id = 0
+            self.dp_size = 1
+            self.tp_size = 1
+
+        @staticmethod
+        def add_cli_args(_parser):
+            return None
+
+        @staticmethod
+        def from_cli_args(_args):
+            sa = ServerArgs()
+            if hasattr(_args, "dp_size"):
+                sa.dp_size = _args.dp_size
+            if hasattr(_args, "tp_size"):
+                sa.tp_size = _args.tp_size
+            if hasattr(_args, "host"):
+                sa.host = _args.host
+            else:
+                sa.host = "127.0.0.1"
+            return sa
+
+    def is_port_available(_port: int) -> bool:
+        return True
+
+    http_server_mod.launch_server = launch_server
+    server_args_mod.ServerArgs = ServerArgs
+    utils_mod.is_port_available = is_port_available
+
+    # Also stub external deps imported at module top-level
+    def _dummy_get(*_a, **_k):
+        raise NotImplementedError
+
+    requests_stub = types.SimpleNamespace(
+        exceptions=types.SimpleNamespace(RequestException=Exception), get=_dummy_get
+    )
+    setproctitle_stub = types.SimpleNamespace(setproctitle=lambda *_a, **_k: None)
+
+    monkeypatch.setitem(sys.modules, "requests", requests_stub)
+    monkeypatch.setitem(sys.modules, "setproctitle", setproctitle_stub)
+
+    monkeypatch.setitem(sys.modules, "sglang", sglang_mod)
+    monkeypatch.setitem(sys.modules, "sglang.srt", srt_mod)
+    monkeypatch.setitem(sys.modules, "sglang.srt.entrypoints", entry_mod)
+    monkeypatch.setitem(
+        sys.modules, "sglang.srt.entrypoints.http_server", http_server_mod
+    )
+    monkeypatch.setitem(sys.modules, "sglang.srt.server_args", server_args_mod)
+    monkeypatch.setitem(sys.modules, "sglang.srt.utils", utils_mod)
+
+
+def test_router_defaults_and_start(monkeypatch):
+    """Router wrapper: defaults normalization and start() call.
+
+    Mocks the Rust-backed _Router to avoid native deps.
+    """
+    from sglang_router import router as router_mod
+
+    captured = {}
+
+    class FakeRouter:
+        def __init__(self, **kwargs):
+            captured.update(kwargs)
+
+        def start(self):
+            captured["started"] = True
+
+    monkeypatch.setattr(router_mod, "_Router", FakeRouter, raising=True)
+
+    from sglang_router.router_args import RouterArgs as _RouterArgs
+
+    Router = router_mod.Router
+    args = _RouterArgs(
+        worker_urls=["http://w1:8000"],
+        policy="round_robin",
+        selector=None,
+        prefill_selector=None,
+        decode_selector=None,
+        cors_allowed_origins=None,
+    )
+
+    r = Router.from_args(args)
+
+    # Defaults preserved/normalized by Router.from_args
+    assert captured["selector"] is None
+    assert captured["prefill_selector"] is None
+    assert captured["decode_selector"] is None
+    assert captured["cors_allowed_origins"] is None
+    assert captured["worker_urls"] == ["http://w1:8000"]
+    from sglang_router_rs import PolicyType
+
+    assert captured["policy"] == PolicyType.RoundRobin
+
+    r.start()
+    assert captured.get("started") is True
+
+
+def test_find_available_ports_and_wait_health(monkeypatch):
+    """launch_server helpers: port finding and health waiting with transient error."""
+    _install_sglang_stubs(monkeypatch)
+    import importlib
+
+    ls = importlib.import_module("sglang_router.launch_server")
+
+    # Deterministic increments
+    monkeypatch.setattr(ls.random, "randint", lambda a, b: 100)
+    ports = ls.find_available_ports(30000, 3)
+    assert ports == [30000, 30100, 30200]
+
+    calls = {"n": 0}
+
+    class Ok:
+        status_code = 200
+
+    def fake_get(_url, timeout=5):
+        calls["n"] += 1
+        if calls["n"] == 1:
+            raise ls.requests.exceptions.RequestException("boom")
+        return Ok()
+
+    monkeypatch.setattr(ls.requests, "get", fake_get)
+    monkeypatch.setattr(ls.time, "sleep", lambda _s: None)
+    base = {"t": 0.0}
+    monkeypatch.setattr(
+        ls.time,
+        "perf_counter",
+        lambda: (base.__setitem__("t", base["t"] + 0.1) or base["t"]),
+    )
+
+    assert ls.wait_for_server_health("127.0.0.1", 12345, timeout=1)
+
+
+def test_launch_server_process_and_cleanup(monkeypatch):
+    """launch_server: process creation args and cleanup SIGTERM/SIGKILL logic."""
+    _install_sglang_stubs(monkeypatch)
+    import importlib
+
+    ls = importlib.import_module("sglang_router.launch_server")
+
+    created = {}
+
+    class FakeProcess:
+        def __init__(self, target, args):
+            created["target"] = target
+            created["args"] = args
+            self.pid = 4242
+            self._alive = True
+
+        def start(self):
+            created["started"] = True
+
+        def join(self, timeout=None):
+            return None
+
+        def is_alive(self):
+            return self._alive
+
+    monkeypatch.setattr(ls.mp, "Process", FakeProcess)
+
+    import sys as _sys
+
+    SA = _sys.modules["sglang.srt.server_args"].ServerArgs
+    sa = SA()
+    sa.tp_size = 2
+
+    proc = ls.launch_server_process(sa, worker_port=31001, dp_id=3)
+    assert created.get("started") is True
+    targ, targ_args = created["target"], created["args"]
+    assert targ is ls.run_server
+    passed_sa = targ_args[0]
+    assert passed_sa.port == 31001
+    assert passed_sa.base_gpu_id == 3 * 2
+    assert passed_sa.dp_size == 1
+
+    # cleanup_processes
+    p1 = FakeProcess(target=None, args=())
+    p1._alive = False
+    p2 = FakeProcess(target=None, args=())
+    p2._alive = True
+
+    calls = []
+
+    def fake_killpg(pid, sig):
+        calls.append((pid, sig))
+
+    monkeypatch.setattr(ls.os, "killpg", fake_killpg)
+
+    ls.cleanup_processes([p1, p2])
+
+    import signal as _sig
+
+    assert (p1.pid, _sig.SIGTERM) in calls and (p2.pid, _sig.SIGTERM) in calls
+    assert (p2.pid, _sig.SIGKILL) in calls
+
+    def test_validation_error_handling(self):
+        """Test error handling when validation fails."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[],
+            decode_urls=[],
+            service_discovery=False,
+        )
+
+        with patch("sglang_router.launch_router.logger") as mock_logger:
+
+            with pytest.raises(
+                ValueError, match="PD disaggregation mode requires --prefill"
+            ):
+                launch_router(args)
+
+            # Should log error for validation failures
+            mock_logger.error.assert_called_once()
+
+
+class TestStartupFlow:
+    """Test complete startup flow."""
+
+    def test_complete_startup_flow_basic(self):
+        """Test complete startup flow for basic configuration."""
+        args = RouterArgs(
+            host="127.0.0.1",
+            port=30000,
+            worker_urls=["http://worker1:8000", "http://worker2:8000"],
+            policy="cache_aware",
+            cache_threshold=0.5,
+            balance_abs_threshold=32,
+            balance_rel_threshold=1.5,
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            result = launch_router(args)
+
+            # Verify complete flow
+            router_mod.from_args.assert_called_once()
+            mock_router_instance.start.assert_called_once()
+
+    def test_complete_startup_flow_pd_mode(self):
+        """Test complete startup flow for PD mode configuration."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[
+                ("http://prefill1:8000", 9000),
+                ("http://prefill2:8000", None),
+            ],
+            decode_urls=["http://decode1:8001", "http://decode2:8001"],
+            policy="power_of_two",
+            prefill_policy="cache_aware",
+            decode_policy="round_robin",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            with patch("sglang_router.router_args.logger") as mock_logger:
+                result = launch_router(args)
+
+                # Verify complete flow
+                router_mod.from_args.assert_called_once()
+                mock_router_instance.start.assert_called_once()
+
+                # Verify policy warning was logged
+                mock_logger.warning.assert_called_once()
+
+    def test_complete_startup_flow_with_all_features(self):
+        """Test complete startup flow with all features enabled."""
+        args = RouterArgs(
+            host="0.0.0.0",
+            port=30001,
+            worker_urls=["http://worker1:8000"],
+            policy="round_robin",
+            service_discovery=True,
+            selector={"app": "worker"},
+            service_discovery_port=8080,
+            service_discovery_namespace="default",
+            dp_aware=True,
+            api_key="test-key",
+            log_dir="/tmp/logs",
+            log_level="debug",
+            prometheus_port=29000,
+            prometheus_host="0.0.0.0",
+            request_id_headers=["x-request-id", "x-trace-id"],
+            request_timeout_secs=1200,
+            max_concurrent_requests=512,
+            queue_size=200,
+            queue_timeout_secs=120,
+            rate_limit_tokens_per_second=100,
+            cors_allowed_origins=["http://localhost:3000"],
+            retry_max_retries=3,
+            retry_initial_backoff_ms=100,
+            retry_max_backoff_ms=10000,
+            retry_backoff_multiplier=2.0,
+            retry_jitter_factor=0.1,
+            cb_failure_threshold=5,
+            cb_success_threshold=2,
+            cb_timeout_duration_secs=30,
+            cb_window_duration_secs=60,
+            health_failure_threshold=2,
+            health_success_threshold=1,
+            health_check_timeout_secs=3,
+            health_check_interval_secs=30,
+            health_check_endpoint="/healthz",
+        )
+
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            captured_args = {}
+            mock_router_instance = MagicMock()
+
+            def fake_from_args(router_args):
+                captured_args.update(
+                    dict(
+                        host=router_args.host,
+                        port=router_args.port,
+                        worker_urls=router_args.worker_urls,
+                        policy=policy_from_str(router_args.policy),
+                        service_discovery=router_args.service_discovery,
+                        selector=router_args.selector,
+                        service_discovery_port=router_args.service_discovery_port,
+                        service_discovery_namespace=router_args.service_discovery_namespace,
+                        dp_aware=router_args.dp_aware,
+                        api_key=router_args.api_key,
+                        log_dir=router_args.log_dir,
+                        log_level=router_args.log_level,
+                        prometheus_port=router_args.prometheus_port,
+                        prometheus_host=router_args.prometheus_host,
+                        request_id_headers=router_args.request_id_headers,
+                        request_timeout_secs=router_args.request_timeout_secs,
+                        max_concurrent_requests=router_args.max_concurrent_requests,
+                        queue_size=router_args.queue_size,
+                        queue_timeout_secs=router_args.queue_timeout_secs,
+                        rate_limit_tokens_per_second=router_args.rate_limit_tokens_per_second,
+                        cors_allowed_origins=router_args.cors_allowed_origins,
+                        retry_max_retries=router_args.retry_max_retries,
+                        retry_initial_backoff_ms=router_args.retry_initial_backoff_ms,
+                        retry_max_backoff_ms=router_args.retry_max_backoff_ms,
+                        retry_backoff_multiplier=router_args.retry_backoff_multiplier,
+                        retry_jitter_factor=router_args.retry_jitter_factor,
+                        cb_failure_threshold=router_args.cb_failure_threshold,
+                        cb_success_threshold=router_args.cb_success_threshold,
+                        cb_timeout_duration_secs=router_args.cb_timeout_duration_secs,
+                        cb_window_duration_secs=router_args.cb_window_duration_secs,
+                        health_failure_threshold=router_args.health_failure_threshold,
+                        health_success_threshold=router_args.health_success_threshold,
+                        health_check_timeout_secs=router_args.health_check_timeout_secs,
+                        health_check_interval_secs=router_args.health_check_interval_secs,
+                        health_check_endpoint=router_args.health_check_endpoint,
+                    )
+                )
+                return mock_router_instance
+
+            router_mod.from_args = MagicMock(side_effect=fake_from_args)
+
+            result = launch_router(args)
+
+            # Verify complete flow
+            router_mod.from_args.assert_called_once()
+            mock_router_instance.start.assert_called_once()
+
+            # Verify key parameters were propagated into RouterArgs
+            assert captured_args["host"] == "0.0.0.0"
+            assert captured_args["port"] == 30001
+            assert captured_args["worker_urls"] == ["http://worker1:8000"]
+            assert captured_args["policy"] == PolicyType.RoundRobin
+            assert captured_args["service_discovery"] is True
+            assert captured_args["selector"] == {"app": "worker"}
+            assert captured_args["service_discovery_port"] == 8080
+            assert captured_args["service_discovery_namespace"] == "default"
+            assert captured_args["dp_aware"] is True
+            assert captured_args["api_key"] == "test-key"
+            assert captured_args["log_dir"] == "/tmp/logs"
+            assert captured_args["log_level"] == "debug"
+            assert captured_args["prometheus_port"] == 29000
+            assert captured_args["prometheus_host"] == "0.0.0.0"
+            assert captured_args["request_id_headers"] == ["x-request-id", "x-trace-id"]
+            assert captured_args["request_timeout_secs"] == 1200
+            assert captured_args["max_concurrent_requests"] == 512
+            assert captured_args["queue_size"] == 200
+            assert captured_args["queue_timeout_secs"] == 120
+            assert captured_args["rate_limit_tokens_per_second"] == 100
+            assert captured_args["cors_allowed_origins"] == ["http://localhost:3000"]
+            assert captured_args["retry_max_retries"] == 3
+            assert captured_args["retry_initial_backoff_ms"] == 100
+            assert captured_args["retry_max_backoff_ms"] == 10000
+            assert captured_args["retry_backoff_multiplier"] == 2.0
+            assert captured_args["retry_jitter_factor"] == 0.1
+            assert captured_args["cb_failure_threshold"] == 5
+            assert captured_args["cb_success_threshold"] == 2
+            assert captured_args["cb_timeout_duration_secs"] == 30
+            assert captured_args["cb_window_duration_secs"] == 60
+            assert captured_args["health_failure_threshold"] == 2
+            assert captured_args["health_success_threshold"] == 1
+            assert captured_args["health_check_timeout_secs"] == 3
+            assert captured_args["health_check_interval_secs"] == 30
+            assert captured_args["health_check_endpoint"] == "/healthz"
diff --git a/sgl-router/py_test/unit/test_validation.py b/sgl-router/py_test/unit/test_validation.py
new file mode 100644
index 000000000..1a3e54612
--- /dev/null
+++ b/sgl-router/py_test/unit/test_validation.py
@@ -0,0 +1,506 @@
+"""
+Unit tests for validation logic in sglang_router.
+
+These tests focus on testing the validation logic in isolation,
+including parameter validation, URL validation, and configuration validation.
+"""
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+from sglang_router.launch_router import RouterArgs, launch_router
+
+
+class TestURLValidation:
+    """Test URL validation logic."""
+
+    def test_valid_worker_urls(self):
+        """Test validation of valid worker URLs."""
+        valid_urls = [
+            "http://worker1:8000",
+            "https://worker2:8000",
+            "http://localhost:8000",
+            "http://127.0.0.1:8000",
+            "http://192.168.1.100:8000",
+            "http://worker.example.com:8000",
+        ]
+
+        for url in valid_urls:
+            args = RouterArgs(worker_urls=[url])
+            # Should not raise any validation errors
+            assert url in args.worker_urls
+
+    def test_valid_prefill_urls(self):
+        """Test validation of valid prefill URLs."""
+        valid_prefill_urls = [
+            ("http://prefill1:8000", 9000),
+            ("https://prefill2:8000", None),
+            ("http://localhost:8000", 9000),
+            ("http://127.0.0.1:8000", None),
+        ]
+
+        for url, bootstrap_port in valid_prefill_urls:
+            args = RouterArgs(prefill_urls=[(url, bootstrap_port)])
+            # Should not raise any validation errors
+            assert (url, bootstrap_port) in args.prefill_urls
+
+    def test_valid_decode_urls(self):
+        """Test validation of valid decode URLs."""
+        valid_decode_urls = [
+            "http://decode1:8001",
+            "https://decode2:8001",
+            "http://localhost:8001",
+            "http://127.0.0.1:8001",
+        ]
+
+        for url in valid_decode_urls:
+            args = RouterArgs(decode_urls=[url])
+            # Should not raise any validation errors
+            assert url in args.decode_urls
+
+    def test_malformed_urls(self):
+        """Test handling of malformed URLs."""
+        # Note: The current implementation doesn't validate URL format
+        # This test documents the current behavior
+        malformed_urls = [
+            "not-a-url",
+            "ftp://worker1:8000",  # Wrong protocol
+            "http://",  # Missing host
+            ":8000",  # Missing protocol and host
+            "http://worker1",  # Missing port
+        ]
+
+        for url in malformed_urls:
+            args = RouterArgs(worker_urls=[url])
+            # Currently, malformed URLs are accepted
+            # This might be something to improve in the future
+            assert url in args.worker_urls
+
+
+class TestPortValidation:
+    """Test port validation logic."""
+
+    def test_valid_ports(self):
+        """Test validation of valid port numbers."""
+        valid_ports = [1, 80, 8000, 30000, 65535]
+
+        for port in valid_ports:
+            args = RouterArgs(port=port)
+            assert args.port == port
+
+    def test_invalid_ports(self):
+        """Test handling of invalid port numbers."""
+        # Note: The current implementation doesn't validate port ranges
+        # This test documents the current behavior
+        invalid_ports = [0, -1, 65536, 70000]
+
+        for port in invalid_ports:
+            args = RouterArgs(port=port)
+            # Currently, invalid ports are accepted
+            # This might be something to improve in the future
+            assert args.port == port
+
+    def test_bootstrap_port_validation(self):
+        """Test validation of bootstrap ports in PD mode."""
+        valid_bootstrap_ports = [1, 80, 9000, 30000, 65535, None]
+
+        for bootstrap_port in valid_bootstrap_ports:
+            args = RouterArgs(prefill_urls=[("http://prefill1:8000", bootstrap_port)])
+            assert args.prefill_urls[0][1] == bootstrap_port
+
+
+class TestParameterValidation:
+    """Test parameter validation logic."""
+
+    def test_cache_threshold_validation(self):
+        """Test cache threshold parameter validation."""
+        # Valid cache thresholds
+        valid_thresholds = [0.0, 0.1, 0.5, 0.9, 1.0]
+
+        for threshold in valid_thresholds:
+            args = RouterArgs(cache_threshold=threshold)
+            assert args.cache_threshold == threshold
+
+    def test_balance_threshold_validation(self):
+        """Test load balancing threshold parameter validation."""
+        # Valid absolute thresholds
+        valid_abs_thresholds = [0, 1, 32, 64, 128, 1000]
+        for threshold in valid_abs_thresholds:
+            args = RouterArgs(balance_abs_threshold=threshold)
+            assert args.balance_abs_threshold == threshold
+
+        # Valid relative thresholds
+        valid_rel_thresholds = [1.0, 1.1, 1.5, 2.0, 10.0]
+        for threshold in valid_rel_thresholds:
+            args = RouterArgs(balance_rel_threshold=threshold)
+            assert args.balance_rel_threshold == threshold
+
+    def test_timeout_validation(self):
+        """Test timeout parameter validation."""
+        # Valid timeouts
+        valid_timeouts = [1, 30, 60, 300, 600, 1800, 3600]
+
+        for timeout in valid_timeouts:
+            args = RouterArgs(
+                worker_startup_timeout_secs=timeout,
+                worker_startup_check_interval=timeout,
+                request_timeout_secs=timeout,
+                queue_timeout_secs=timeout,
+            )
+            assert args.worker_startup_timeout_secs == timeout
+            assert args.worker_startup_check_interval == timeout
+            assert args.request_timeout_secs == timeout
+            assert args.queue_timeout_secs == timeout
+
+    def test_retry_parameter_validation(self):
+        """Test retry parameter validation."""
+        # Valid retry parameters
+        valid_retry_counts = [0, 1, 3, 5, 10]
+        for count in valid_retry_counts:
+            args = RouterArgs(retry_max_retries=count)
+            assert args.retry_max_retries == count
+
+        # Valid backoff parameters
+        valid_backoff_ms = [1, 50, 100, 1000, 30000]
+        for backoff in valid_backoff_ms:
+            args = RouterArgs(
+                retry_initial_backoff_ms=backoff, retry_max_backoff_ms=backoff
+            )
+            assert args.retry_initial_backoff_ms == backoff
+            assert args.retry_max_backoff_ms == backoff
+
+        # Valid multiplier parameters
+        valid_multipliers = [1.0, 1.5, 2.0, 3.0]
+        for multiplier in valid_multipliers:
+            args = RouterArgs(retry_backoff_multiplier=multiplier)
+            assert args.retry_backoff_multiplier == multiplier
+
+        # Valid jitter parameters
+        valid_jitter = [0.0, 0.1, 0.2, 0.5]
+        for jitter in valid_jitter:
+            args = RouterArgs(retry_jitter_factor=jitter)
+            assert args.retry_jitter_factor == jitter
+
+    def test_circuit_breaker_parameter_validation(self):
+        """Test circuit breaker parameter validation."""
+        # Valid failure thresholds
+        valid_failure_thresholds = [1, 3, 5, 10, 20]
+        for threshold in valid_failure_thresholds:
+            args = RouterArgs(cb_failure_threshold=threshold)
+            assert args.cb_failure_threshold == threshold
+
+        # Valid success thresholds
+        valid_success_thresholds = [1, 2, 3, 5]
+        for threshold in valid_success_thresholds:
+            args = RouterArgs(cb_success_threshold=threshold)
+            assert args.cb_success_threshold == threshold
+
+        # Valid timeout durations
+        valid_timeouts = [10, 30, 60, 120, 300]
+        for timeout in valid_timeouts:
+            args = RouterArgs(
+                cb_timeout_duration_secs=timeout, cb_window_duration_secs=timeout
+            )
+            assert args.cb_timeout_duration_secs == timeout
+            assert args.cb_window_duration_secs == timeout
+
+    def test_health_check_parameter_validation(self):
+        """Test health check parameter validation."""
+        # Valid failure thresholds
+        valid_failure_thresholds = [1, 2, 3, 5, 10]
+        for threshold in valid_failure_thresholds:
+            args = RouterArgs(health_failure_threshold=threshold)
+            assert args.health_failure_threshold == threshold
+
+        # Valid success thresholds
+        valid_success_thresholds = [1, 2, 3, 5]
+        for threshold in valid_success_thresholds:
+            args = RouterArgs(health_success_threshold=threshold)
+            assert args.health_success_threshold == threshold
+
+        # Valid timeouts and intervals
+        valid_times = [1, 5, 10, 30, 60, 120]
+        for time_val in valid_times:
+            args = RouterArgs(
+                health_check_timeout_secs=time_val, health_check_interval_secs=time_val
+            )
+            assert args.health_check_timeout_secs == time_val
+            assert args.health_check_interval_secs == time_val
+
+    def test_rate_limiting_parameter_validation(self):
+        """Test rate limiting parameter validation."""
+        # Valid concurrent request limits
+        valid_limits = [1, 10, 64, 256, 512, 1000]
+        for limit in valid_limits:
+            args = RouterArgs(max_concurrent_requests=limit)
+            assert args.max_concurrent_requests == limit
+
+        # Valid queue sizes
+        valid_queue_sizes = [0, 10, 50, 100, 500, 1000]
+        for size in valid_queue_sizes:
+            args = RouterArgs(queue_size=size)
+            assert args.queue_size == size
+
+        # Valid token rates
+        valid_rates = [1, 10, 50, 100, 500, 1000]
+        for rate in valid_rates:
+            args = RouterArgs(rate_limit_tokens_per_second=rate)
+            assert args.rate_limit_tokens_per_second == rate
+
+    def test_tree_size_validation(self):
+        """Test tree size parameter validation."""
+        # Valid tree sizes (powers of 2)
+        valid_sizes = [2**10, 2**20, 2**24, 2**26, 2**28, 2**30]
+
+        for size in valid_sizes:
+            args = RouterArgs(max_tree_size=size)
+            assert args.max_tree_size == size
+
+    def test_payload_size_validation(self):
+        """Test payload size parameter validation."""
+        # Valid payload sizes
+        valid_sizes = [
+            1024,  # 1KB
+            1024 * 1024,  # 1MB
+            10 * 1024 * 1024,  # 10MB
+            100 * 1024 * 1024,  # 100MB
+            512 * 1024 * 1024,  # 512MB
+            1024 * 1024 * 1024,  # 1GB
+        ]
+
+        for size in valid_sizes:
+            args = RouterArgs(max_payload_size=size)
+            assert args.max_payload_size == size
+
+
+class TestConfigurationValidation:
+    """Test configuration validation logic."""
+
+    def test_pd_mode_validation(self):
+        """Test PD mode configuration validation."""
+        # Valid PD configuration
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", 9000)],
+            decode_urls=["http://decode1:8001"],
+        )
+
+        assert args.pd_disaggregation is True
+        assert len(args.prefill_urls) > 0
+        assert len(args.decode_urls) > 0
+
+    def test_service_discovery_validation(self):
+        """Test service discovery configuration validation."""
+        # Valid service discovery configuration
+        args = RouterArgs(
+            service_discovery=True,
+            selector={"app": "worker", "env": "prod"},
+            service_discovery_port=8080,
+            service_discovery_namespace="default",
+        )
+
+        assert args.service_discovery is True
+        assert args.selector == {"app": "worker", "env": "prod"}
+        assert args.service_discovery_port == 8080
+        assert args.service_discovery_namespace == "default"
+
+    def test_pd_service_discovery_validation(self):
+        """Test PD service discovery configuration validation."""
+        # Valid PD service discovery configuration
+        args = RouterArgs(
+            pd_disaggregation=True,
+            service_discovery=True,
+            prefill_selector={"app": "prefill"},
+            decode_selector={"app": "decode"},
+        )
+
+        assert args.pd_disaggregation is True
+        assert args.service_discovery is True
+        assert args.prefill_selector == {"app": "prefill"}
+        assert args.decode_selector == {"app": "decode"}
+
+    def test_policy_validation(self):
+        """Test policy configuration validation."""
+        # Valid policies
+        valid_policies = ["random", "round_robin", "cache_aware", "power_of_two"]
+
+        for policy in valid_policies:
+            args = RouterArgs(policy=policy)
+            assert args.policy == policy
+
+    def test_pd_policy_validation(self):
+        """Test PD policy configuration validation."""
+        # Valid PD policies
+        valid_policies = ["random", "round_robin", "cache_aware", "power_of_two"]
+
+        for prefill_policy in valid_policies:
+            for decode_policy in valid_policies:
+                args = RouterArgs(
+                    pd_disaggregation=True,
+                    prefill_urls=[("http://prefill1:8000", None)],
+                    decode_urls=["http://decode1:8001"],
+                    prefill_policy=prefill_policy,
+                    decode_policy=decode_policy,
+                )
+                assert args.prefill_policy == prefill_policy
+                assert args.decode_policy == decode_policy
+
+    def test_cors_validation(self):
+        """Test CORS configuration validation."""
+        # Valid CORS origins
+        valid_origins = [
+            [],
+            ["http://localhost:3000"],
+            ["https://example.com"],
+            ["http://localhost:3000", "https://example.com"],
+            ["*"],  # Wildcard (if supported)
+        ]
+
+        for origins in valid_origins:
+            args = RouterArgs(cors_allowed_origins=origins)
+            assert args.cors_allowed_origins == origins
+
+    def test_logging_validation(self):
+        """Test logging configuration validation."""
+        # Valid log levels
+        valid_log_levels = ["debug", "info", "warning", "error", "critical"]
+
+        for level in valid_log_levels:
+            args = RouterArgs(log_level=level)
+            assert args.log_level == level
+
+    def test_prometheus_validation(self):
+        """Test Prometheus configuration validation."""
+        # Valid Prometheus configuration
+        args = RouterArgs(prometheus_port=29000, prometheus_host="127.0.0.1")
+
+        assert args.prometheus_port == 29000
+        assert args.prometheus_host == "127.0.0.1"
+
+    def test_tokenizer_validation(self):
+        """Test tokenizer configuration validation."""
+        # Note: model_path and tokenizer_path are not available in current RouterArgs
+        pytest.skip("Tokenizer configuration not available in current implementation")
+
+    def test_request_id_headers_validation(self):
+        """Test request ID headers configuration validation."""
+        # Valid request ID headers
+        valid_headers = [
+            ["x-request-id"],
+            ["x-request-id", "x-trace-id"],
+            ["x-request-id", "x-trace-id", "x-correlation-id"],
+            ["custom-header"],
+        ]
+
+        for headers in valid_headers:
+            args = RouterArgs(request_id_headers=headers)
+            assert args.request_id_headers == headers
+
+
+class TestLaunchValidation:
+    """Test launch-time validation logic."""
+
+    def test_pd_mode_requires_urls(self):
+        """Test that PD mode requires prefill and decode URLs."""
+        # PD mode without URLs should fail
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[],
+            decode_urls=[],
+            service_discovery=False,
+        )
+
+        with pytest.raises(
+            ValueError, match="PD disaggregation mode requires --prefill"
+        ):
+            launch_router(args)
+
+    def test_pd_mode_with_service_discovery_allows_empty_urls(self):
+        """Test that PD mode with service discovery allows empty URLs."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[],
+            decode_urls=[],
+            service_discovery=True,
+        )
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_regular_mode_allows_empty_worker_urls(self):
+        """Test that regular mode allows empty worker URLs."""
+        args = RouterArgs(worker_urls=[], service_discovery=False)
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_launch_with_valid_config(self):
+        """Test launching with valid configuration."""
+        args = RouterArgs(
+            host="127.0.0.1",
+            port=30000,
+            worker_urls=["http://worker1:8000"],
+            policy="cache_aware",
+        )
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_launch_with_pd_config(self):
+        """Test launching with valid PD configuration."""
+        args = RouterArgs(
+            pd_disaggregation=True,
+            prefill_urls=[("http://prefill1:8000", 9000)],
+            decode_urls=["http://decode1:8001"],
+            policy="cache_aware",
+        )
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
+
+    def test_launch_with_service_discovery_config(self):
+        """Test launching with valid service discovery configuration."""
+        args = RouterArgs(
+            service_discovery=True,
+            selector={"app": "worker"},
+            service_discovery_port=8080,
+        )
+
+        # Should not raise validation error
+        with patch("sglang_router.launch_router.Router") as router_mod:
+            mock_router_instance = MagicMock()
+            router_mod.from_args = MagicMock(return_value=mock_router_instance)
+
+            launch_router(args)
+
+            # Should create router instance via from_args
+            router_mod.from_args.assert_called_once()
diff --git a/sgl-router/pyproject.toml b/sgl-router/pyproject.toml
new file mode 100644
index 000000000..9a7606f6a
--- /dev/null
+++ b/sgl-router/pyproject.toml
@@ -0,0 +1,31 @@
+[build-system]
+requires = ["setuptools>=45", "wheel", "setuptools-rust>=1.5.2"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sglang-router"
+version = "0.1.9"
+description = "High-performance Rust-based load balancer for SGLang with multiple routing algorithms and prefill-decode disaggregation support"
+authors = [{name = "Byron Hsu", email = "byronhsu1230@gmail.com"}]
+requires-python = ">=3.8"
+readme = "README.md"
+license = { text = "Apache-2.0" }
+classifiers = [
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Rust",
+    "Programming Language :: Python :: 3",
+]
+
+[project.optional-dependencies]
+dev = [
+    "requests>=2.25.0",
+]
+
+
+# https://github.com/PyO3/setuptools-rust?tab=readme-ov-file
+[tool.setuptools.packages]
+find = { where = ["py_src"] }
+
+# workaround for https://github.com/pypa/twine/issues/1216
+[tool.setuptools]
+license-files = []
diff --git a/sgl-router/pytest.ini b/sgl-router/pytest.ini
new file mode 100644
index 000000000..c9f400753
--- /dev/null
+++ b/sgl-router/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+testpaths = py_test
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
diff --git a/sgl-router/scripts/run_benchmarks.py b/sgl-router/scripts/run_benchmarks.py
new file mode 100755
index 000000000..a7ece9d9a
--- /dev/null
+++ b/sgl-router/scripts/run_benchmarks.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+SGLang Router Benchmark Runner
+
+A Python script to run Rust benchmarks with various options and modes.
+Replaces the shell script for better maintainability and integration.
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Dict, List, Optional
+
+
+class BenchmarkRunner:
+    """Handles running Rust benchmarks for the SGLang router."""
+
+    def __init__(self, project_root: str):
+        self.project_root = Path(project_root)
+        self.timestamp = time.strftime("%a %b %d %H:%M:%S UTC %Y", time.gmtime())
+
+    def run_command(
+        self, cmd: List[str], capture_output: bool = False
+    ) -> subprocess.CompletedProcess:
+        """Run a command and handle errors."""
+        try:
+            if capture_output:
+                result = subprocess.run(
+                    cmd, capture_output=True, text=True, cwd=self.project_root
+                )
+            else:
+                result = subprocess.run(cmd, cwd=self.project_root)
+            return result
+        except subprocess.CalledProcessError as e:
+            print(f"Error running command: {' '.join(cmd)}")
+            print(f"Exit code: {e.returncode}")
+            sys.exit(1)
+
+    def print_header(self):
+        """Print the benchmark runner header."""
+        print("SGLang Router Benchmark Runner")
+        print("=" * 30)
+        print(f"Project: {self.project_root.absolute()}")
+        print(f"Timestamp: {self.timestamp}")
+        print()
+
+    def build_release(self):
+        """Build the project in release mode."""
+        print("Building in release mode...")
+        result = self.run_command(["cargo", "build", "--release", "--quiet"])
+        if result.returncode != 0:
+            print("Failed to build in release mode")
+            sys.exit(1)
+
+    def run_benchmarks(
+        self,
+        quick_mode: bool = False,
+        save_baseline: Optional[str] = None,
+        compare_baseline: Optional[str] = None,
+    ) -> str:
+        """Run benchmarks with specified options."""
+        bench_args = ["cargo", "bench", "--bench", "request_processing"]
+
+        if quick_mode:
+            bench_args.append("benchmark_summary")
+            print("Running quick benchmarks...")
+        else:
+            print("Running full benchmark suite...")
+
+        # Note: Criterion baselines are handled via target directory structure
+        # For now, we'll implement baseline functionality via file copying
+        if save_baseline:
+            print(f"Will save results as baseline: {save_baseline}")
+
+        if compare_baseline:
+            print(f"Will compare with baseline: {compare_baseline}")
+
+        print(f"Executing: {' '.join(bench_args)}")
+        result = self.run_command(bench_args, capture_output=True)
+
+        if result.returncode != 0:
+            print("Benchmark execution failed!")
+            print("STDOUT:", result.stdout)
+            print("STDERR:", result.stderr)
+            sys.exit(1)
+
+        # Handle baseline saving after successful run
+        if save_baseline:
+            self._save_baseline(save_baseline, result.stdout)
+
+        return result.stdout
+
+    def _save_baseline(self, filename: str, output: str):
+        """Save benchmark results to a file as baseline."""
+        filepath = self.project_root / filename
+        with open(filepath, "w") as f:
+            f.write(output)
+        print(f"Baseline saved to: {filepath}")
+
+    def parse_benchmark_results(self, output: str) -> Dict[str, str]:
+        """Parse benchmark output to extract performance metrics."""
+        results = {}
+
+        # Look for performance overview section
+        lines = output.split("\n")
+        parsing_overview = False
+
+        for line in lines:
+            line = line.strip()
+
+            if "Quick Performance Overview:" in line:
+                parsing_overview = True
+                continue
+
+            if parsing_overview and line.startswith("* "):
+                # Parse lines like "* Serialization (avg):          481 ns/req"
+                if "Serialization (avg):" in line:
+                    results["serialization_time"] = self._extract_time(line)
+                elif "Deserialization (avg):" in line:
+                    results["deserialization_time"] = self._extract_time(line)
+                elif "Bootstrap Injection (avg):" in line:
+                    results["bootstrap_injection_time"] = self._extract_time(line)
+                elif "Total Pipeline (avg):" in line:
+                    results["total_time"] = self._extract_time(line)
+
+            # Stop parsing after the overview section
+            if parsing_overview and line.startswith("Performance Insights:"):
+                break
+
+        return results
+
+    def _extract_time(self, line: str) -> str:
+        """Extract time value from a benchmark line."""
+        # Extract number followed by ns/req
+        import re
+
+        match = re.search(r"(\d+)\s*ns/req", line)
+        return match.group(1) if match else "N/A"
+
+    def validate_thresholds(self, results: Dict[str, str]) -> bool:
+        """Validate benchmark results against performance thresholds."""
+        thresholds = {
+            "serialization_time": 2000,  # 2μs max
+            "deserialization_time": 2000,  # 2μs max
+            "bootstrap_injection_time": 5000,  # 5μs max
+            "total_time": 10000,  # 10μs max
+        }
+
+        all_passed = True
+        print("\nPerformance Threshold Validation:")
+        print("=" * 35)
+
+        for metric, threshold in thresholds.items():
+            if metric in results and results[metric] != "N/A":
+                try:
+                    value = int(results[metric])
+                    passed = value <= threshold
+                    status = "✓ PASS" if passed else "✗ FAIL"
+                    print(f"{metric:20}: {value:>6}ns <= {threshold:>6}ns {status}")
+                    if not passed:
+                        all_passed = False
+                except ValueError:
+                    print(f"{metric:20}: Invalid value: {results[metric]}")
+                    all_passed = False
+            else:
+                print(f"{metric:20}: No data available")
+                all_passed = False
+
+        print()
+        if all_passed:
+            print("All performance thresholds passed!")
+        else:
+            print("Some performance thresholds failed!")
+
+        return all_passed
+
+    def save_results_to_file(
+        self, results: Dict[str, str], filename: str = "benchmark_results.env"
+    ):
+        """Save benchmark results to a file for CI consumption."""
+        filepath = self.project_root / filename
+        with open(filepath, "w") as f:
+            for key, value in results.items():
+                f.write(f"{key}={value}\n")
+        print(f"Results saved to: {filepath}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run SGLang router benchmarks")
+    parser.add_argument(
+        "--quick", action="store_true", help="Run quick benchmarks (summary only)"
+    )
+    parser.add_argument(
+        "--save-baseline", type=str, help="Save benchmark results as baseline"
+    )
+    parser.add_argument(
+        "--compare-baseline", type=str, help="Compare with saved baseline"
+    )
+    parser.add_argument(
+        "--validate-thresholds",
+        action="store_true",
+        help="Validate results against performance thresholds",
+    )
+    parser.add_argument(
+        "--save-results", action="store_true", help="Save results to file for CI"
+    )
+
+    args = parser.parse_args()
+
+    # Determine project root (script is in scripts/ subdirectory)
+    script_dir = Path(__file__).parent
+    project_root = script_dir.parent
+
+    runner = BenchmarkRunner(str(project_root))
+    runner.print_header()
+
+    # Build in release mode
+    runner.build_release()
+
+    # Run benchmarks
+    output = runner.run_benchmarks(
+        quick_mode=args.quick,
+        save_baseline=args.save_baseline,
+        compare_baseline=args.compare_baseline,
+    )
+
+    # Print the raw output
+    print(output)
+
+    # Parse and validate results if requested
+    if args.validate_thresholds or args.save_results:
+        results = runner.parse_benchmark_results(output)
+
+        if args.save_results:
+            runner.save_results_to_file(results)
+
+        if args.validate_thresholds:
+            passed = runner.validate_thresholds(results)
+            if not passed:
+                print("Validation failed - performance regression detected!")
+                sys.exit(1)
+
+    print("\nBenchmark run completed successfully!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sgl-router/setup.py b/sgl-router/setup.py
new file mode 100644
index 000000000..730a91ceb
--- /dev/null
+++ b/sgl-router/setup.py
@@ -0,0 +1,21 @@
+import os
+
+from setuptools import setup
+from setuptools_rust import Binding, RustExtension
+
+no_rust = os.environ.get("SGLANG_ROUTER_BUILD_NO_RUST") == "1"
+
+rust_extensions = []
+if not no_rust:
+    rust_extensions.append(
+        RustExtension(
+            target="sglang_router_rs",
+            path="Cargo.toml",
+            binding=Binding.PyO3,
+        )
+    )
+
+setup(
+    rust_extensions=rust_extensions,
+    zip_safe=False,
+)
diff --git a/sgl-router/src/config/mod.rs b/sgl-router/src/config/mod.rs
new file mode 100644
index 000000000..4622ff781
--- /dev/null
+++ b/sgl-router/src/config/mod.rs
@@ -0,0 +1,28 @@
+pub mod types;
+pub mod validation;
+
+pub use types::*;
+pub use validation::*;
+
+/// Configuration errors
+#[derive(Debug, thiserror::Error)]
+pub enum ConfigError {
+    #[error("Validation failed: {reason}")]
+    ValidationFailed { reason: String },
+
+    #[error("Invalid value for field '{field}': {value} - {reason}")]
+    InvalidValue {
+        field: String,
+        value: String,
+        reason: String,
+    },
+
+    #[error("Incompatible configuration: {reason}")]
+    IncompatibleConfig { reason: String },
+
+    #[error("Missing required field: {field}")]
+    MissingRequired { field: String },
+}
+
+/// Result type for configuration operations
+pub type ConfigResult<T> = Result<T, ConfigError>;
diff --git a/sgl-router/src/config/types.rs b/sgl-router/src/config/types.rs
new file mode 100644
index 000000000..5f5b227af
--- /dev/null
+++ b/sgl-router/src/config/types.rs
@@ -0,0 +1,1241 @@
+use super::ConfigResult;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Main router configuration
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RouterConfig {
+    /// Routing mode configuration
+    pub mode: RoutingMode,
+    /// Worker connection mode
+    #[serde(default)]
+    pub connection_mode: ConnectionMode,
+    /// Policy configuration
+    pub policy: PolicyConfig,
+    /// Server host address
+    pub host: String,
+    /// Server port
+    pub port: u16,
+    /// Maximum payload size in bytes
+    pub max_payload_size: usize,
+    /// Request timeout in seconds
+    pub request_timeout_secs: u64,
+    /// Worker startup timeout in seconds
+    pub worker_startup_timeout_secs: u64,
+    /// Worker health check interval in seconds
+    pub worker_startup_check_interval_secs: u64,
+    /// Enable data parallelism aware schedule
+    pub dp_aware: bool,
+    /// The api key used for the authorization with the worker
+    pub api_key: Option<String>,
+    /// Service discovery configuration (optional)
+    pub discovery: Option<DiscoveryConfig>,
+    /// Metrics configuration (optional)
+    pub metrics: Option<MetricsConfig>,
+    /// Log directory (None = stdout only)
+    pub log_dir: Option<String>,
+    /// Log level (None = info)
+    pub log_level: Option<String>,
+    /// Custom request ID headers to check (defaults to common headers)
+    pub request_id_headers: Option<Vec<String>>,
+    /// Maximum concurrent requests allowed (for rate limiting)
+    pub max_concurrent_requests: usize,
+    /// Queue size for pending requests when max concurrent limit reached (0 = no queue, return 429 immediately)
+    pub queue_size: usize,
+    /// Maximum time (in seconds) a request can wait in queue before timing out
+    pub queue_timeout_secs: u64,
+    /// Token bucket refill rate (tokens per second). If not set, defaults to max_concurrent_requests
+    pub rate_limit_tokens_per_second: Option<usize>,
+    /// CORS allowed origins
+    pub cors_allowed_origins: Vec<String>,
+    /// Retry configuration
+    pub retry: RetryConfig,
+    /// Circuit breaker configuration
+    pub circuit_breaker: CircuitBreakerConfig,
+    /// Disable retries (overrides retry.max_retries to 1 when true)
+    #[serde(default)]
+    pub disable_retries: bool,
+    /// Disable circuit breaker (overrides circuit_breaker.failure_threshold to u32::MAX when true)
+    #[serde(default)]
+    pub disable_circuit_breaker: bool,
+    /// Health check configuration
+    pub health_check: HealthCheckConfig,
+    /// Enable Inference Gateway mode (false = proxy mode, true = IGW mode)
+    #[serde(default)]
+    pub enable_igw: bool,
+    /// Model path for loading tokenizer (can be a HuggingFace model ID or local path)
+    pub model_path: Option<String>,
+    /// Explicit tokenizer path (overrides model_path tokenizer if provided)
+    pub tokenizer_path: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
+#[serde(tag = "type")]
+pub enum ConnectionMode {
+    #[default]
+    #[serde(rename = "http")]
+    Http,
+    #[serde(rename = "grpc")]
+    Grpc,
+}
+
+/// Routing mode configuration
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type")]
+pub enum RoutingMode {
+    #[serde(rename = "regular")]
+    Regular {
+        /// List of worker URLs
+        worker_urls: Vec<String>,
+    },
+    #[serde(rename = "prefill_decode")]
+    PrefillDecode {
+        /// Prefill worker URLs with optional bootstrap ports
+        prefill_urls: Vec<(String, Option<u16>)>,
+        /// Decode worker URLs
+        decode_urls: Vec<String>,
+        /// Optional separate policy for prefill workers
+        #[serde(skip_serializing_if = "Option::is_none")]
+        prefill_policy: Option<PolicyConfig>,
+        /// Optional separate policy for decode workers
+        #[serde(skip_serializing_if = "Option::is_none")]
+        decode_policy: Option<PolicyConfig>,
+    },
+    #[serde(rename = "openai")]
+    OpenAI {
+        /// OpenAI-compatible API base(s), provided via worker URLs
+        worker_urls: Vec<String>,
+    },
+}
+
+impl RoutingMode {
+    pub fn is_pd_mode(&self) -> bool {
+        matches!(self, RoutingMode::PrefillDecode { .. })
+    }
+
+    pub fn worker_count(&self) -> usize {
+        match self {
+            RoutingMode::Regular { worker_urls } => worker_urls.len(),
+            RoutingMode::PrefillDecode {
+                prefill_urls,
+                decode_urls,
+                ..
+            } => prefill_urls.len() + decode_urls.len(),
+            // OpenAI mode represents a single upstream
+            RoutingMode::OpenAI { .. } => 1,
+        }
+    }
+
+    /// Get the effective prefill policy for PD mode
+    /// Falls back to the main policy if no specific prefill policy is set
+    pub fn get_prefill_policy<'a>(&'a self, main_policy: &'a PolicyConfig) -> &'a PolicyConfig {
+        match self {
+            RoutingMode::PrefillDecode { prefill_policy, .. } => {
+                prefill_policy.as_ref().unwrap_or(main_policy)
+            }
+            _ => main_policy,
+        }
+    }
+
+    /// Get the effective decode policy for PD mode
+    /// Falls back to the main policy if no specific decode policy is set
+    pub fn get_decode_policy<'a>(&'a self, main_policy: &'a PolicyConfig) -> &'a PolicyConfig {
+        match self {
+            RoutingMode::PrefillDecode { decode_policy, .. } => {
+                decode_policy.as_ref().unwrap_or(main_policy)
+            }
+            _ => main_policy,
+        }
+    }
+}
+
+/// Policy configuration for routing
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type")]
+pub enum PolicyConfig {
+    #[serde(rename = "random")]
+    Random,
+
+    #[serde(rename = "round_robin")]
+    RoundRobin,
+
+    #[serde(rename = "cache_aware")]
+    CacheAware {
+        /// Minimum prefix match ratio to use cache-based routing
+        cache_threshold: f32,
+        /// Absolute load difference threshold for load balancing
+        balance_abs_threshold: usize,
+        /// Relative load ratio threshold for load balancing
+        balance_rel_threshold: f32,
+        /// Interval between cache eviction cycles (seconds)
+        eviction_interval_secs: u64,
+        /// Maximum cache tree size per tenant
+        max_tree_size: usize,
+    },
+
+    #[serde(rename = "power_of_two")]
+    PowerOfTwo {
+        /// Interval for load monitoring (seconds)
+        load_check_interval_secs: u64,
+    },
+}
+
+impl PolicyConfig {
+    pub fn name(&self) -> &'static str {
+        match self {
+            PolicyConfig::Random => "random",
+            PolicyConfig::RoundRobin => "round_robin",
+            PolicyConfig::CacheAware { .. } => "cache_aware",
+            PolicyConfig::PowerOfTwo { .. } => "power_of_two",
+        }
+    }
+}
+
+/// Service discovery configuration
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DiscoveryConfig {
+    /// Enable service discovery
+    pub enabled: bool,
+    /// Kubernetes namespace (None = all namespaces)
+    pub namespace: Option<String>,
+    /// Service discovery port
+    pub port: u16,
+    /// Check interval for service discovery
+    pub check_interval_secs: u64,
+    /// Regular mode selector
+    pub selector: HashMap<String, String>,
+    /// PD mode prefill selector
+    pub prefill_selector: HashMap<String, String>,
+    /// PD mode decode selector
+    pub decode_selector: HashMap<String, String>,
+    /// Bootstrap port annotation key
+    pub bootstrap_port_annotation: String,
+}
+
+impl Default for DiscoveryConfig {
+    fn default() -> Self {
+        Self {
+            enabled: false,
+            namespace: None,
+            port: 8000,
+            check_interval_secs: 120,
+            selector: HashMap::new(),
+            prefill_selector: HashMap::new(),
+            decode_selector: HashMap::new(),
+            bootstrap_port_annotation: "sglang.ai/bootstrap-port".to_string(),
+        }
+    }
+}
+
+/// Retry configuration for request handling
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RetryConfig {
+    /// Maximum number of retry attempts
+    pub max_retries: u32,
+    /// Initial backoff delay in milliseconds
+    pub initial_backoff_ms: u64,
+    /// Maximum backoff delay in milliseconds
+    pub max_backoff_ms: u64,
+    /// Backoff multiplier for exponential backoff
+    pub backoff_multiplier: f32,
+    /// Jitter factor applied to backoff (0.0 - 1.0)
+    /// Effective delay D' = D * (1 + U[-j, +j])
+    #[serde(default = "default_retry_jitter_factor")]
+    pub jitter_factor: f32,
+}
+
+impl Default for RetryConfig {
+    fn default() -> Self {
+        Self {
+            max_retries: 5,
+            initial_backoff_ms: 50,
+            max_backoff_ms: 30000,
+            backoff_multiplier: 1.5,
+            jitter_factor: 0.2,
+        }
+    }
+}
+
+fn default_retry_jitter_factor() -> f32 {
+    0.2
+}
+
+/// Health check configuration for worker monitoring
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HealthCheckConfig {
+    /// Number of consecutive failures before marking unhealthy
+    pub failure_threshold: u32,
+    /// Number of consecutive successes before marking healthy
+    pub success_threshold: u32,
+    /// Timeout for health check requests in seconds
+    pub timeout_secs: u64,
+    /// Interval between health checks in seconds
+    pub check_interval_secs: u64,
+    /// Health check endpoint path
+    pub endpoint: String,
+}
+
+impl Default for HealthCheckConfig {
+    fn default() -> Self {
+        Self {
+            failure_threshold: 3,
+            success_threshold: 2,
+            timeout_secs: 5,
+            check_interval_secs: 60,
+            endpoint: "/health".to_string(),
+        }
+    }
+}
+
+/// Circuit breaker configuration for worker reliability
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CircuitBreakerConfig {
+    /// Number of consecutive failures before opening circuit
+    pub failure_threshold: u32,
+    /// Number of consecutive successes before closing circuit
+    pub success_threshold: u32,
+    /// Time before attempting to recover from open state (in seconds)
+    pub timeout_duration_secs: u64,
+    /// Window duration for failure tracking (in seconds)
+    pub window_duration_secs: u64,
+}
+
+impl Default for CircuitBreakerConfig {
+    fn default() -> Self {
+        Self {
+            failure_threshold: 10,
+            success_threshold: 3,
+            timeout_duration_secs: 60,
+            window_duration_secs: 120,
+        }
+    }
+}
+
+/// Metrics configuration
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MetricsConfig {
+    /// Prometheus metrics port
+    pub port: u16,
+    /// Prometheus metrics host
+    pub host: String,
+}
+
+impl Default for MetricsConfig {
+    fn default() -> Self {
+        Self {
+            port: 29000,
+            host: "127.0.0.1".to_string(),
+        }
+    }
+}
+
+impl Default for RouterConfig {
+    fn default() -> Self {
+        Self {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            policy: PolicyConfig::Random,
+            host: "127.0.0.1".to_string(),
+            port: 3001,
+            max_payload_size: 536_870_912, // 512MB
+            request_timeout_secs: 1800,    // 30 minutes
+            worker_startup_timeout_secs: 600,
+            worker_startup_check_interval_secs: 30,
+            dp_aware: false,
+            api_key: None,
+            discovery: None,
+            metrics: None,
+            log_dir: None,
+            log_level: None,
+            request_id_headers: None,
+            max_concurrent_requests: 256,
+            queue_size: 100,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: HealthCheckConfig::default(),
+            enable_igw: false,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        }
+    }
+}
+
+impl RouterConfig {
+    /// Create a new configuration with mode and policy
+    pub fn new(mode: RoutingMode, policy: PolicyConfig) -> Self {
+        Self {
+            mode,
+            policy,
+            ..Default::default()
+        }
+    }
+
+    /// Validate the configuration
+    pub fn validate(&self) -> ConfigResult<()> {
+        crate::config::validation::ConfigValidator::validate(self)
+    }
+
+    /// Get the routing mode type as a string
+    pub fn mode_type(&self) -> &'static str {
+        match self.mode {
+            RoutingMode::Regular { .. } => "regular",
+            RoutingMode::PrefillDecode { .. } => "prefill_decode",
+            RoutingMode::OpenAI { .. } => "openai",
+        }
+    }
+
+    /// Check if service discovery is enabled
+    pub fn has_service_discovery(&self) -> bool {
+        self.discovery.as_ref().is_some_and(|d| d.enabled)
+    }
+
+    /// Check if metrics are enabled
+    pub fn has_metrics(&self) -> bool {
+        self.metrics.is_some()
+    }
+
+    /// Compute the effective retry config considering disable flag
+    pub fn effective_retry_config(&self) -> RetryConfig {
+        let mut cfg = self.retry.clone();
+        if self.disable_retries {
+            cfg.max_retries = 1;
+        }
+        cfg
+    }
+
+    /// Compute the effective circuit breaker config considering disable flag
+    pub fn effective_circuit_breaker_config(&self) -> CircuitBreakerConfig {
+        let mut cfg = self.circuit_breaker.clone();
+        if self.disable_circuit_breaker {
+            cfg.failure_threshold = u32::MAX;
+        }
+        cfg
+    }
+
+    /// Check if running in IGW (Inference Gateway) mode
+    pub fn is_igw_mode(&self) -> bool {
+        self.enable_igw
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // ============= RouterConfig Tests =============
+
+    #[test]
+    fn test_router_config_default() {
+        let config = RouterConfig::default();
+
+        assert!(
+            matches!(config.mode, RoutingMode::Regular { worker_urls } if worker_urls.is_empty())
+        );
+        assert!(matches!(config.policy, PolicyConfig::Random));
+        assert_eq!(config.host, "127.0.0.1");
+        assert_eq!(config.port, 3001);
+        assert_eq!(config.max_payload_size, 536_870_912);
+        assert_eq!(config.request_timeout_secs, 1800);
+        assert_eq!(config.worker_startup_timeout_secs, 600);
+        assert_eq!(config.worker_startup_check_interval_secs, 30);
+        assert!(config.discovery.is_none());
+        assert!(config.metrics.is_none());
+        assert!(config.log_dir.is_none());
+        assert!(config.log_level.is_none());
+    }
+
+    #[test]
+    fn test_router_config_new() {
+        let mode = RoutingMode::Regular {
+            worker_urls: vec!["http://worker1".to_string(), "http://worker2".to_string()],
+        };
+        let policy = PolicyConfig::RoundRobin;
+
+        let config = RouterConfig::new(mode, policy);
+
+        match config.mode {
+            RoutingMode::Regular { worker_urls } => {
+                assert_eq!(worker_urls.len(), 2);
+                assert_eq!(worker_urls[0], "http://worker1");
+                assert_eq!(worker_urls[1], "http://worker2");
+            }
+            _ => panic!("Expected Regular mode"),
+        }
+
+        assert!(matches!(config.policy, PolicyConfig::RoundRobin));
+        // Other fields should be default
+        assert_eq!(config.host, "127.0.0.1");
+        assert_eq!(config.port, 3001);
+    }
+
+    #[test]
+    fn test_router_config_serialization() {
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec!["http://worker1".to_string()],
+            },
+            policy: PolicyConfig::Random,
+            host: "0.0.0.0".to_string(),
+            port: 8080,
+            max_payload_size: 1024,
+            request_timeout_secs: 30,
+            worker_startup_timeout_secs: 60,
+            worker_startup_check_interval_secs: 5,
+            dp_aware: false,
+            api_key: None,
+            discovery: Some(DiscoveryConfig::default()),
+            metrics: Some(MetricsConfig::default()),
+            log_dir: Some("/var/log".to_string()),
+            log_level: Some("debug".to_string()),
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: HealthCheckConfig::default(),
+            enable_igw: false,
+            queue_size: 100,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        let json = serde_json::to_string(&config).unwrap();
+        let deserialized: RouterConfig = serde_json::from_str(&json).unwrap();
+
+        assert_eq!(config.host, deserialized.host);
+        assert_eq!(config.port, deserialized.port);
+        assert_eq!(config.max_payload_size, deserialized.max_payload_size);
+        assert!(deserialized.discovery.is_some());
+        assert!(deserialized.metrics.is_some());
+    }
+
+    // ============= RoutingMode Tests =============
+
+    #[test]
+    fn test_routing_mode_is_pd_mode() {
+        let regular = RoutingMode::Regular {
+            worker_urls: vec!["http://worker1".to_string()],
+        };
+        assert!(!regular.is_pd_mode());
+
+        let pd = RoutingMode::PrefillDecode {
+            prefill_urls: vec![("http://prefill1".to_string(), Some(8001))],
+            decode_urls: vec!["http://decode1".to_string()],
+            prefill_policy: None,
+            decode_policy: None,
+        };
+        assert!(pd.is_pd_mode());
+    }
+
+    #[test]
+    fn test_routing_mode_worker_count() {
+        let regular = RoutingMode::Regular {
+            worker_urls: vec![
+                "http://worker1".to_string(),
+                "http://worker2".to_string(),
+                "http://worker3".to_string(),
+            ],
+        };
+        assert_eq!(regular.worker_count(), 3);
+
+        let pd = RoutingMode::PrefillDecode {
+            prefill_urls: vec![
+                ("http://prefill1".to_string(), Some(8001)),
+                ("http://prefill2".to_string(), None),
+            ],
+            decode_urls: vec![
+                "http://decode1".to_string(),
+                "http://decode2".to_string(),
+                "http://decode3".to_string(),
+            ],
+            prefill_policy: None,
+            decode_policy: None,
+        };
+        assert_eq!(pd.worker_count(), 5);
+
+        let empty_regular = RoutingMode::Regular {
+            worker_urls: vec![],
+        };
+        assert_eq!(empty_regular.worker_count(), 0);
+    }
+
+    #[test]
+    fn test_routing_mode_serialization() {
+        // Test Regular mode
+        let regular = RoutingMode::Regular {
+            worker_urls: vec!["http://worker1".to_string()],
+        };
+        let json = serde_json::to_string(&regular).unwrap();
+        assert!(json.contains("\"type\":\"regular\""));
+        assert!(json.contains("\"worker_urls\""));
+
+        // Test PrefillDecode mode
+        let pd = RoutingMode::PrefillDecode {
+            prefill_urls: vec![("http://prefill1".to_string(), Some(8001))],
+            decode_urls: vec!["http://decode1".to_string()],
+            prefill_policy: None,
+            decode_policy: None,
+        };
+        let json = serde_json::to_string(&pd).unwrap();
+        assert!(json.contains("\"type\":\"prefill_decode\""));
+        assert!(json.contains("\"prefill_urls\""));
+        assert!(json.contains("\"decode_urls\""));
+    }
+
+    // ============= PolicyConfig Tests =============
+
+    #[test]
+    fn test_policy_config_name() {
+        assert_eq!(PolicyConfig::Random.name(), "random");
+        assert_eq!(PolicyConfig::RoundRobin.name(), "round_robin");
+
+        let cache_aware = PolicyConfig::CacheAware {
+            cache_threshold: 0.8,
+            balance_abs_threshold: 10,
+            balance_rel_threshold: 1.5,
+            eviction_interval_secs: 300,
+            max_tree_size: 1000,
+        };
+        assert_eq!(cache_aware.name(), "cache_aware");
+
+        let power_of_two = PolicyConfig::PowerOfTwo {
+            load_check_interval_secs: 60,
+        };
+        assert_eq!(power_of_two.name(), "power_of_two");
+    }
+
+    #[test]
+    fn test_policy_config_serialization() {
+        // Test Random
+        let random = PolicyConfig::Random;
+        let json = serde_json::to_string(&random).unwrap();
+        assert_eq!(json, r#"{"type":"random"}"#);
+
+        // Test CacheAware with all parameters
+        let cache_aware = PolicyConfig::CacheAware {
+            cache_threshold: 0.8,
+            balance_abs_threshold: 10,
+            balance_rel_threshold: 1.5,
+            eviction_interval_secs: 300,
+            max_tree_size: 1000,
+        };
+        let json = serde_json::to_string(&cache_aware).unwrap();
+        assert!(json.contains("\"type\":\"cache_aware\""));
+        assert!(json.contains("\"cache_threshold\":0.8"));
+        assert!(json.contains("\"balance_abs_threshold\":10"));
+
+        // Test PowerOfTwo
+        let power_of_two = PolicyConfig::PowerOfTwo {
+            load_check_interval_secs: 60,
+        };
+        let json = serde_json::to_string(&power_of_two).unwrap();
+        assert!(json.contains("\"type\":\"power_of_two\""));
+        assert!(json.contains("\"load_check_interval_secs\":60"));
+    }
+
+    #[test]
+    fn test_cache_aware_parameters() {
+        let cache_aware = PolicyConfig::CacheAware {
+            cache_threshold: 0.75,
+            balance_abs_threshold: 20,
+            balance_rel_threshold: 2.0,
+            eviction_interval_secs: 600,
+            max_tree_size: 5000,
+        };
+
+        match cache_aware {
+            PolicyConfig::CacheAware {
+                cache_threshold,
+                balance_abs_threshold,
+                balance_rel_threshold,
+                eviction_interval_secs,
+                max_tree_size,
+            } => {
+                assert!((cache_threshold - 0.75).abs() < 0.0001);
+                assert_eq!(balance_abs_threshold, 20);
+                assert!((balance_rel_threshold - 2.0).abs() < 0.0001);
+                assert_eq!(eviction_interval_secs, 600);
+                assert_eq!(max_tree_size, 5000);
+            }
+            _ => panic!("Expected CacheAware"),
+        }
+    }
+
+    #[test]
+    fn test_power_of_two_parameters() {
+        let power_of_two = PolicyConfig::PowerOfTwo {
+            load_check_interval_secs: 120,
+        };
+
+        match power_of_two {
+            PolicyConfig::PowerOfTwo {
+                load_check_interval_secs,
+            } => {
+                assert_eq!(load_check_interval_secs, 120);
+            }
+            _ => panic!("Expected PowerOfTwo"),
+        }
+    }
+
+    // ============= DiscoveryConfig Tests =============
+
+    #[test]
+    fn test_discovery_config_default() {
+        let config = DiscoveryConfig::default();
+
+        assert!(!config.enabled);
+        assert!(config.namespace.is_none());
+        assert_eq!(config.port, 8000);
+        assert_eq!(config.check_interval_secs, 120);
+        assert!(config.selector.is_empty());
+        assert!(config.prefill_selector.is_empty());
+        assert!(config.decode_selector.is_empty());
+        assert_eq!(config.bootstrap_port_annotation, "sglang.ai/bootstrap-port");
+    }
+
+    #[test]
+    fn test_discovery_config_with_selectors() {
+        let mut selector = HashMap::new();
+        selector.insert("app".to_string(), "sglang".to_string());
+        selector.insert("role".to_string(), "worker".to_string());
+
+        let config = DiscoveryConfig {
+            enabled: true,
+            namespace: Some("default".to_string()),
+            port: 9000,
+            check_interval_secs: 30,
+            selector: selector.clone(),
+            prefill_selector: selector.clone(),
+            decode_selector: selector.clone(),
+            bootstrap_port_annotation: "custom.io/port".to_string(),
+        };
+
+        assert!(config.enabled);
+        assert_eq!(config.namespace, Some("default".to_string()));
+        assert_eq!(config.port, 9000);
+        assert_eq!(config.selector.len(), 2);
+        assert_eq!(config.selector.get("app"), Some(&"sglang".to_string()));
+    }
+
+    #[test]
+    fn test_discovery_config_namespace() {
+        // Test None namespace (all namespaces)
+        let config = DiscoveryConfig {
+            namespace: None,
+            ..Default::default()
+        };
+        assert!(config.namespace.is_none());
+
+        // Test specific namespace
+        let config = DiscoveryConfig {
+            namespace: Some("production".to_string()),
+            ..Default::default()
+        };
+        assert_eq!(config.namespace, Some("production".to_string()));
+    }
+
+    // ============= MetricsConfig Tests =============
+
+    #[test]
+    fn test_metrics_config_default() {
+        let config = MetricsConfig::default();
+
+        assert_eq!(config.port, 29000);
+        assert_eq!(config.host, "127.0.0.1");
+    }
+
+    #[test]
+    fn test_metrics_config_custom() {
+        let config = MetricsConfig {
+            port: 9090,
+            host: "0.0.0.0".to_string(),
+        };
+
+        assert_eq!(config.port, 9090);
+        assert_eq!(config.host, "0.0.0.0");
+    }
+
+    // ============= RouterConfig Utility Methods Tests =============
+
+    #[test]
+    fn test_mode_type() {
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            ..Default::default()
+        };
+        assert_eq!(config.mode_type(), "regular");
+
+        let config = RouterConfig {
+            mode: RoutingMode::PrefillDecode {
+                prefill_urls: vec![],
+                decode_urls: vec![],
+                prefill_policy: None,
+                decode_policy: None,
+            },
+            ..Default::default()
+        };
+        assert_eq!(config.mode_type(), "prefill_decode");
+    }
+
+    #[test]
+    fn test_has_service_discovery() {
+        let config = RouterConfig::default();
+        assert!(!config.has_service_discovery());
+
+        let config = RouterConfig {
+            discovery: Some(DiscoveryConfig {
+                enabled: false,
+                ..Default::default()
+            }),
+            ..Default::default()
+        };
+        assert!(!config.has_service_discovery());
+
+        let config = RouterConfig {
+            discovery: Some(DiscoveryConfig {
+                enabled: true,
+                ..Default::default()
+            }),
+            ..Default::default()
+        };
+        assert!(config.has_service_discovery());
+    }
+
+    #[test]
+    fn test_has_metrics() {
+        let config = RouterConfig::default();
+        assert!(!config.has_metrics());
+
+        let config = RouterConfig {
+            metrics: Some(MetricsConfig::default()),
+            ..Default::default()
+        };
+        assert!(config.has_metrics());
+    }
+
+    // ============= Edge Cases =============
+
+    #[test]
+    fn test_large_worker_lists() {
+        let large_urls: Vec<String> = (0..1000).map(|i| format!("http://worker{}", i)).collect();
+
+        let mode = RoutingMode::Regular {
+            worker_urls: large_urls.clone(),
+        };
+
+        assert_eq!(mode.worker_count(), 1000);
+
+        // Test serialization with large list
+        let config = RouterConfig {
+            mode,
+            ..Default::default()
+        };
+
+        let json = serde_json::to_string(&config).unwrap();
+        let deserialized: RouterConfig = serde_json::from_str(&json).unwrap();
+
+        match deserialized.mode {
+            RoutingMode::Regular { worker_urls } => {
+                assert_eq!(worker_urls.len(), 1000);
+            }
+            _ => panic!("Expected Regular mode"),
+        }
+    }
+
+    #[test]
+    fn test_unicode_in_config() {
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec!["http://работник1".to_string(), "http://工作者2".to_string()],
+            },
+            log_dir: Some("/日志/目录".to_string()),
+            ..Default::default()
+        };
+
+        let json = serde_json::to_string(&config).unwrap();
+        let deserialized: RouterConfig = serde_json::from_str(&json).unwrap();
+
+        match deserialized.mode {
+            RoutingMode::Regular { worker_urls } => {
+                assert_eq!(worker_urls[0], "http://работник1");
+                assert_eq!(worker_urls[1], "http://工作者2");
+            }
+            _ => panic!("Expected Regular mode"),
+        }
+
+        assert_eq!(deserialized.log_dir, Some("/日志/目录".to_string()));
+    }
+
+    #[test]
+    fn test_empty_string_fields() {
+        let config = RouterConfig {
+            host: "".to_string(),
+            log_dir: Some("".to_string()),
+            log_level: Some("".to_string()),
+            ..Default::default()
+        };
+
+        assert_eq!(config.host, "");
+        assert_eq!(config.log_dir, Some("".to_string()));
+        assert_eq!(config.log_level, Some("".to_string()));
+    }
+
+    // ============= Complex Configuration Tests =============
+
+    #[test]
+    fn test_full_pd_mode_config() {
+        let config = RouterConfig {
+            mode: RoutingMode::PrefillDecode {
+                prefill_urls: vec![
+                    ("http://prefill1:8000".to_string(), Some(8001)),
+                    ("http://prefill2:8000".to_string(), None),
+                ],
+                decode_urls: vec![
+                    "http://decode1:8000".to_string(),
+                    "http://decode2:8000".to_string(),
+                ],
+                prefill_policy: None,
+                decode_policy: None,
+            },
+            policy: PolicyConfig::PowerOfTwo {
+                load_check_interval_secs: 30,
+            },
+            host: "0.0.0.0".to_string(),
+            port: 3000,
+            max_payload_size: 1048576,
+            request_timeout_secs: 120,
+            worker_startup_timeout_secs: 60,
+            worker_startup_check_interval_secs: 5,
+            dp_aware: false,
+            api_key: None,
+            discovery: Some(DiscoveryConfig {
+                enabled: true,
+                namespace: Some("sglang".to_string()),
+                ..Default::default()
+            }),
+            metrics: Some(MetricsConfig {
+                port: 9090,
+                host: "0.0.0.0".to_string(),
+            }),
+            log_dir: Some("/var/log/sglang".to_string()),
+            log_level: Some("info".to_string()),
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: HealthCheckConfig::default(),
+            enable_igw: false,
+            queue_size: 100,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        assert!(config.mode.is_pd_mode());
+        assert_eq!(config.mode.worker_count(), 4);
+        assert_eq!(config.policy.name(), "power_of_two");
+        assert!(config.has_service_discovery());
+        assert!(config.has_metrics());
+    }
+
+    #[test]
+    fn test_full_regular_mode_config() {
+        let mut selector = HashMap::new();
+        selector.insert("app".to_string(), "sglang".to_string());
+
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![
+                    "http://worker1:8000".to_string(),
+                    "http://worker2:8000".to_string(),
+                    "http://worker3:8000".to_string(),
+                ],
+            },
+            policy: PolicyConfig::CacheAware {
+                cache_threshold: 0.9,
+                balance_abs_threshold: 5,
+                balance_rel_threshold: 1.2,
+                eviction_interval_secs: 600,
+                max_tree_size: 10000,
+            },
+            host: "0.0.0.0".to_string(),
+            port: 3001,
+            max_payload_size: 536870912,
+            request_timeout_secs: 300,
+            worker_startup_timeout_secs: 180,
+            worker_startup_check_interval_secs: 15,
+            dp_aware: false,
+            api_key: None,
+            discovery: Some(DiscoveryConfig {
+                enabled: true,
+                namespace: None,
+                port: 8080,
+                check_interval_secs: 45,
+                selector,
+                ..Default::default()
+            }),
+            metrics: Some(MetricsConfig::default()),
+            log_dir: None,
+            log_level: Some("debug".to_string()),
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: HealthCheckConfig::default(),
+            enable_igw: false,
+            queue_size: 100,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        assert!(!config.mode.is_pd_mode());
+        assert_eq!(config.mode.worker_count(), 3);
+        assert_eq!(config.policy.name(), "cache_aware");
+        assert!(config.has_service_discovery());
+        assert!(config.has_metrics());
+    }
+
+    #[test]
+    fn test_config_with_all_options() {
+        let mut selectors = HashMap::new();
+        selectors.insert("env".to_string(), "prod".to_string());
+        selectors.insert("version".to_string(), "v1".to_string());
+
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec!["http://worker1".to_string()],
+            },
+            policy: PolicyConfig::RoundRobin,
+            host: "::1".to_string(), // IPv6
+            port: 8888,
+            max_payload_size: 1024 * 1024 * 512, // 512MB
+            request_timeout_secs: 900,
+            worker_startup_timeout_secs: 600,
+            worker_startup_check_interval_secs: 20,
+            dp_aware: false,
+            api_key: None,
+            discovery: Some(DiscoveryConfig {
+                enabled: true,
+                namespace: Some("production".to_string()),
+                port: 8443,
+                check_interval_secs: 120,
+                selector: selectors.clone(),
+                prefill_selector: selectors.clone(),
+                decode_selector: selectors,
+                bootstrap_port_annotation: "mycompany.io/bootstrap".to_string(),
+            }),
+            metrics: Some(MetricsConfig {
+                port: 9999,
+                host: "::".to_string(), // IPv6 any
+            }),
+            log_dir: Some("/opt/logs/sglang".to_string()),
+            log_level: Some("trace".to_string()),
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: HealthCheckConfig::default(),
+            enable_igw: false,
+            queue_size: 100,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        assert!(config.has_service_discovery());
+        assert!(config.has_metrics());
+        assert_eq!(config.mode_type(), "regular");
+
+        // Test round-trip serialization
+        let json = serde_json::to_string_pretty(&config).unwrap();
+        let deserialized: RouterConfig = serde_json::from_str(&json).unwrap();
+
+        assert_eq!(deserialized.host, "::1");
+        assert_eq!(deserialized.port, 8888);
+        assert_eq!(
+            deserialized.discovery.unwrap().namespace,
+            Some("production".to_string())
+        );
+    }
+
+    // ============= Policy Fallback Tests =============
+
+    #[test]
+    fn test_pd_policy_fallback_both_specified() {
+        // When both prefill and decode policies are specified, they should be used
+        let pd = RoutingMode::PrefillDecode {
+            prefill_urls: vec![("http://prefill1".to_string(), None)],
+            decode_urls: vec!["http://decode1".to_string()],
+            prefill_policy: Some(PolicyConfig::CacheAware {
+                cache_threshold: 0.5,
+                balance_abs_threshold: 32,
+                balance_rel_threshold: 1.1,
+                eviction_interval_secs: 60,
+                max_tree_size: 1000,
+            }),
+            decode_policy: Some(PolicyConfig::PowerOfTwo {
+                load_check_interval_secs: 60,
+            }),
+        };
+
+        let main_policy = PolicyConfig::Random;
+
+        // Both specific policies should be used
+        match pd.get_prefill_policy(&main_policy) {
+            PolicyConfig::CacheAware { .. } => {} // Success
+            _ => panic!("Expected CacheAware for prefill"),
+        }
+
+        match pd.get_decode_policy(&main_policy) {
+            PolicyConfig::PowerOfTwo { .. } => {} // Success
+            _ => panic!("Expected PowerOfTwo for decode"),
+        }
+    }
+
+    #[test]
+    fn test_pd_policy_fallback_only_prefill() {
+        // When only prefill policy is specified, decode should use main policy
+        let pd = RoutingMode::PrefillDecode {
+            prefill_urls: vec![("http://prefill1".to_string(), None)],
+            decode_urls: vec!["http://decode1".to_string()],
+            prefill_policy: Some(PolicyConfig::CacheAware {
+                cache_threshold: 0.5,
+                balance_abs_threshold: 32,
+                balance_rel_threshold: 1.1,
+                eviction_interval_secs: 60,
+                max_tree_size: 1000,
+            }),
+            decode_policy: None,
+        };
+
+        let main_policy = PolicyConfig::RoundRobin;
+
+        // Prefill should use specific policy
+        match pd.get_prefill_policy(&main_policy) {
+            PolicyConfig::CacheAware { .. } => {} // Success
+            _ => panic!("Expected CacheAware for prefill"),
+        }
+
+        // Decode should fall back to main policy
+        match pd.get_decode_policy(&main_policy) {
+            PolicyConfig::RoundRobin => {} // Success
+            _ => panic!("Expected RoundRobin for decode"),
+        }
+    }
+
+    #[test]
+    fn test_pd_policy_fallback_only_decode() {
+        // When only decode policy is specified, prefill should use main policy
+        let pd = RoutingMode::PrefillDecode {
+            prefill_urls: vec![("http://prefill1".to_string(), None)],
+            decode_urls: vec!["http://decode1".to_string()],
+            prefill_policy: None,
+            decode_policy: Some(PolicyConfig::PowerOfTwo {
+                load_check_interval_secs: 60,
+            }),
+        };
+
+        let main_policy = PolicyConfig::Random;
+
+        // Prefill should fall back to main policy
+        match pd.get_prefill_policy(&main_policy) {
+            PolicyConfig::Random => {} // Success
+            _ => panic!("Expected Random for prefill"),
+        }
+
+        // Decode should use specific policy
+        match pd.get_decode_policy(&main_policy) {
+            PolicyConfig::PowerOfTwo { .. } => {} // Success
+            _ => panic!("Expected PowerOfTwo for decode"),
+        }
+    }
+
+    #[test]
+    fn test_pd_policy_fallback_none_specified() {
+        // When no specific policies are specified, both should use main policy
+        let pd = RoutingMode::PrefillDecode {
+            prefill_urls: vec![("http://prefill1".to_string(), None)],
+            decode_urls: vec!["http://decode1".to_string()],
+            prefill_policy: None,
+            decode_policy: None,
+        };
+
+        let main_policy = PolicyConfig::CacheAware {
+            cache_threshold: 0.7,
+            balance_abs_threshold: 20,
+            balance_rel_threshold: 1.5,
+            eviction_interval_secs: 300,
+            max_tree_size: 2000,
+        };
+
+        // Both should fall back to main policy
+        match pd.get_prefill_policy(&main_policy) {
+            PolicyConfig::CacheAware {
+                cache_threshold, ..
+            } => {
+                assert!((cache_threshold - 0.7).abs() < 0.0001);
+            }
+            _ => panic!("Expected CacheAware for prefill"),
+        }
+
+        match pd.get_decode_policy(&main_policy) {
+            PolicyConfig::CacheAware {
+                cache_threshold, ..
+            } => {
+                assert!((cache_threshold - 0.7).abs() < 0.0001);
+            }
+            _ => panic!("Expected CacheAware for decode"),
+        }
+    }
+
+    #[test]
+    fn test_regular_mode_policy_fallback() {
+        // For regular mode, the helper methods should just return the main policy
+        let regular = RoutingMode::Regular {
+            worker_urls: vec!["http://worker1".to_string()],
+        };
+
+        let main_policy = PolicyConfig::RoundRobin;
+
+        // Both methods should return main policy for regular mode
+        match regular.get_prefill_policy(&main_policy) {
+            PolicyConfig::RoundRobin => {} // Success
+            _ => panic!("Expected RoundRobin for regular mode"),
+        }
+
+        match regular.get_decode_policy(&main_policy) {
+            PolicyConfig::RoundRobin => {} // Success
+            _ => panic!("Expected RoundRobin for regular mode"),
+        }
+    }
+}
diff --git a/sgl-router/src/config/validation.rs b/sgl-router/src/config/validation.rs
new file mode 100644
index 000000000..710ad3fc8
--- /dev/null
+++ b/sgl-router/src/config/validation.rs
@@ -0,0 +1,776 @@
+use super::*;
+
+/// Configuration validator
+pub struct ConfigValidator;
+
+impl ConfigValidator {
+    /// Validate a complete router configuration
+    pub fn validate(config: &RouterConfig) -> ConfigResult<()> {
+        // Check if service discovery is enabled
+        let has_service_discovery = config.discovery.as_ref().is_some_and(|d| d.enabled);
+
+        Self::validate_mode(&config.mode, has_service_discovery)?;
+        Self::validate_policy(&config.policy)?;
+        Self::validate_server_settings(config)?;
+
+        if let Some(discovery) = &config.discovery {
+            Self::validate_discovery(discovery, &config.mode)?;
+        }
+
+        if let Some(metrics) = &config.metrics {
+            Self::validate_metrics(metrics)?;
+        }
+
+        Self::validate_compatibility(config)?;
+
+        // Validate effective retry/CB configs (respect disable flags)
+        let retry_cfg = config.effective_retry_config();
+        let cb_cfg = config.effective_circuit_breaker_config();
+        Self::validate_retry(&retry_cfg)?;
+        Self::validate_circuit_breaker(&cb_cfg)?;
+
+        Ok(())
+    }
+
+    /// Validate routing mode configuration
+    fn validate_mode(mode: &RoutingMode, has_service_discovery: bool) -> ConfigResult<()> {
+        match mode {
+            RoutingMode::Regular { worker_urls } => {
+                // Validate URLs if any are provided
+                if !worker_urls.is_empty() {
+                    Self::validate_urls(worker_urls)?;
+                }
+                // Note: We allow empty worker URLs even without service discovery
+                // to let the router start and fail at runtime when routing requests.
+                // This matches legacy behavior and test expectations.
+            }
+            RoutingMode::PrefillDecode {
+                prefill_urls,
+                decode_urls,
+                prefill_policy,
+                decode_policy,
+            } => {
+                // Only require URLs if service discovery is disabled
+                if !has_service_discovery {
+                    if prefill_urls.is_empty() {
+                        return Err(ConfigError::ValidationFailed {
+                            reason: "PD mode requires at least one prefill worker URL".to_string(),
+                        });
+                    }
+                    if decode_urls.is_empty() {
+                        return Err(ConfigError::ValidationFailed {
+                            reason: "PD mode requires at least one decode worker URL".to_string(),
+                        });
+                    }
+                }
+
+                // Validate URLs if any are provided
+                if !prefill_urls.is_empty() {
+                    let prefill_url_strings: Vec<String> =
+                        prefill_urls.iter().map(|(url, _)| url.clone()).collect();
+                    Self::validate_urls(&prefill_url_strings)?;
+                }
+                if !decode_urls.is_empty() {
+                    Self::validate_urls(decode_urls)?;
+                }
+
+                // Validate bootstrap ports
+                for (_url, port) in prefill_urls {
+                    if let Some(port) = port {
+                        if *port == 0 {
+                            return Err(ConfigError::InvalidValue {
+                                field: "bootstrap_port".to_string(),
+                                value: port.to_string(),
+                                reason: "Port must be between 1 and 65535".to_string(),
+                            });
+                        }
+                    }
+                }
+
+                // Validate optional prefill and decode policies
+                if let Some(p_policy) = prefill_policy {
+                    Self::validate_policy(p_policy)?;
+                }
+                if let Some(d_policy) = decode_policy {
+                    Self::validate_policy(d_policy)?;
+                }
+            }
+            RoutingMode::OpenAI { worker_urls } => {
+                // Require exactly one worker URL for OpenAI router
+                if worker_urls.len() != 1 {
+                    return Err(ConfigError::ValidationFailed {
+                        reason: "OpenAI mode requires exactly one --worker-urls entry".to_string(),
+                    });
+                }
+                // Validate URL format
+                if let Err(e) = url::Url::parse(&worker_urls[0]) {
+                    return Err(ConfigError::ValidationFailed {
+                        reason: format!("Invalid OpenAI worker URL '{}': {}", &worker_urls[0], e),
+                    });
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Validate policy configuration
+    fn validate_policy(policy: &PolicyConfig) -> ConfigResult<()> {
+        match policy {
+            PolicyConfig::Random | PolicyConfig::RoundRobin => {
+                // No specific validation needed
+            }
+            PolicyConfig::CacheAware {
+                cache_threshold,
+                balance_abs_threshold: _,
+                balance_rel_threshold,
+                eviction_interval_secs,
+                max_tree_size,
+            } => {
+                if !(0.0..=1.0).contains(cache_threshold) {
+                    return Err(ConfigError::InvalidValue {
+                        field: "cache_threshold".to_string(),
+                        value: cache_threshold.to_string(),
+                        reason: "Must be between 0.0 and 1.0".to_string(),
+                    });
+                }
+
+                if *balance_rel_threshold < 1.0 {
+                    return Err(ConfigError::InvalidValue {
+                        field: "balance_rel_threshold".to_string(),
+                        value: balance_rel_threshold.to_string(),
+                        reason: "Must be >= 1.0".to_string(),
+                    });
+                }
+
+                if *eviction_interval_secs == 0 {
+                    return Err(ConfigError::InvalidValue {
+                        field: "eviction_interval_secs".to_string(),
+                        value: eviction_interval_secs.to_string(),
+                        reason: "Must be > 0".to_string(),
+                    });
+                }
+
+                if *max_tree_size == 0 {
+                    return Err(ConfigError::InvalidValue {
+                        field: "max_tree_size".to_string(),
+                        value: max_tree_size.to_string(),
+                        reason: "Must be > 0".to_string(),
+                    });
+                }
+            }
+            PolicyConfig::PowerOfTwo {
+                load_check_interval_secs,
+            } => {
+                if *load_check_interval_secs == 0 {
+                    return Err(ConfigError::InvalidValue {
+                        field: "load_check_interval_secs".to_string(),
+                        value: load_check_interval_secs.to_string(),
+                        reason: "Must be > 0".to_string(),
+                    });
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Validate server configuration
+    fn validate_server_settings(config: &RouterConfig) -> ConfigResult<()> {
+        if config.port == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "port".to_string(),
+                value: config.port.to_string(),
+                reason: "Port must be > 0".to_string(),
+            });
+        }
+
+        if config.max_payload_size == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "max_payload_size".to_string(),
+                value: config.max_payload_size.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+
+        if config.request_timeout_secs == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "request_timeout_secs".to_string(),
+                value: config.request_timeout_secs.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+
+        if config.worker_startup_timeout_secs == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "worker_startup_timeout_secs".to_string(),
+                value: config.worker_startup_timeout_secs.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+
+        if config.worker_startup_check_interval_secs == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "worker_startup_check_interval_secs".to_string(),
+                value: config.worker_startup_check_interval_secs.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+
+        Ok(())
+    }
+
+    /// Validate service discovery configuration
+    fn validate_discovery(discovery: &DiscoveryConfig, mode: &RoutingMode) -> ConfigResult<()> {
+        if !discovery.enabled {
+            return Ok(()); // No validation needed if disabled
+        }
+
+        if discovery.port == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "discovery.port".to_string(),
+                value: discovery.port.to_string(),
+                reason: "Port must be > 0".to_string(),
+            });
+        }
+
+        if discovery.check_interval_secs == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "discovery.check_interval_secs".to_string(),
+                value: discovery.check_interval_secs.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+
+        // Validate selectors based on mode
+        match mode {
+            RoutingMode::Regular { .. } => {
+                if discovery.selector.is_empty() {
+                    return Err(ConfigError::ValidationFailed {
+                        reason: "Regular mode with service discovery requires a non-empty selector"
+                            .to_string(),
+                    });
+                }
+            }
+            RoutingMode::PrefillDecode { .. } => {
+                if discovery.prefill_selector.is_empty() && discovery.decode_selector.is_empty() {
+                    return Err(ConfigError::ValidationFailed {
+                        reason: "PD mode with service discovery requires at least one non-empty selector (prefill or decode)".to_string(),
+                    });
+                }
+            }
+            RoutingMode::OpenAI { .. } => {
+                // OpenAI mode doesn't use service discovery
+                return Err(ConfigError::ValidationFailed {
+                    reason: "OpenAI mode does not support service discovery".to_string(),
+                });
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Validate metrics configuration
+    fn validate_metrics(metrics: &MetricsConfig) -> ConfigResult<()> {
+        if metrics.port == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "metrics.port".to_string(),
+                value: metrics.port.to_string(),
+                reason: "Port must be > 0".to_string(),
+            });
+        }
+
+        if metrics.host.is_empty() {
+            return Err(ConfigError::InvalidValue {
+                field: "metrics.host".to_string(),
+                value: metrics.host.clone(),
+                reason: "Host cannot be empty".to_string(),
+            });
+        }
+
+        Ok(())
+    }
+
+    /// Validate retry configuration
+    fn validate_retry(retry: &RetryConfig) -> ConfigResult<()> {
+        if retry.max_retries < 1 {
+            return Err(ConfigError::InvalidValue {
+                field: "retry.max_retries".to_string(),
+                value: retry.max_retries.to_string(),
+                reason: "Must be >= 1 (set to 1 to effectively disable retries)".to_string(),
+            });
+        }
+        if retry.initial_backoff_ms == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "retry.initial_backoff_ms".to_string(),
+                value: retry.initial_backoff_ms.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+        if retry.max_backoff_ms < retry.initial_backoff_ms {
+            return Err(ConfigError::InvalidValue {
+                field: "retry.max_backoff_ms".to_string(),
+                value: retry.max_backoff_ms.to_string(),
+                reason: "Must be >= initial_backoff_ms".to_string(),
+            });
+        }
+        if retry.backoff_multiplier < 1.0 {
+            return Err(ConfigError::InvalidValue {
+                field: "retry.backoff_multiplier".to_string(),
+                value: retry.backoff_multiplier.to_string(),
+                reason: "Must be >= 1.0".to_string(),
+            });
+        }
+        if !(0.0..=1.0).contains(&retry.jitter_factor) {
+            return Err(ConfigError::InvalidValue {
+                field: "retry.jitter_factor".to_string(),
+                value: retry.jitter_factor.to_string(),
+                reason: "Must be between 0.0 and 1.0".to_string(),
+            });
+        }
+        Ok(())
+    }
+
+    /// Validate circuit breaker configuration
+    fn validate_circuit_breaker(cb: &CircuitBreakerConfig) -> ConfigResult<()> {
+        if cb.failure_threshold < 1 {
+            return Err(ConfigError::InvalidValue {
+                field: "circuit_breaker.failure_threshold".to_string(),
+                value: cb.failure_threshold.to_string(),
+                reason: "Must be >= 1 (set to u32::MAX to effectively disable CB)".to_string(),
+            });
+        }
+        if cb.success_threshold < 1 {
+            return Err(ConfigError::InvalidValue {
+                field: "circuit_breaker.success_threshold".to_string(),
+                value: cb.success_threshold.to_string(),
+                reason: "Must be >= 1".to_string(),
+            });
+        }
+        if cb.timeout_duration_secs == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "circuit_breaker.timeout_duration_secs".to_string(),
+                value: cb.timeout_duration_secs.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+        if cb.window_duration_secs == 0 {
+            return Err(ConfigError::InvalidValue {
+                field: "circuit_breaker.window_duration_secs".to_string(),
+                value: cb.window_duration_secs.to_string(),
+                reason: "Must be > 0".to_string(),
+            });
+        }
+        Ok(())
+    }
+
+    /// Validate compatibility between different configuration sections
+    fn validate_compatibility(config: &RouterConfig) -> ConfigResult<()> {
+        // IGW mode is independent - skip other compatibility checks when enabled
+        if config.enable_igw {
+            return Ok(());
+        }
+
+        // Validate gRPC connection mode requires tokenizer configuration
+        if config.connection_mode == ConnectionMode::Grpc
+            && config.tokenizer_path.is_none()
+            && config.model_path.is_none()
+        {
+            return Err(ConfigError::ValidationFailed {
+                reason: "gRPC connection mode requires either --tokenizer-path or --model-path to be specified".to_string(),
+            });
+        }
+
+        // All policies are now supported for both router types thanks to the unified trait design
+        // No mode/policy restrictions needed anymore
+
+        // Check if service discovery is enabled for worker count validation
+        let has_service_discovery = config.discovery.as_ref().is_some_and(|d| d.enabled);
+
+        // Only validate worker counts if service discovery is disabled
+        if !has_service_discovery {
+            // Check if power-of-two policy makes sense with insufficient workers
+            if let PolicyConfig::PowerOfTwo { .. } = &config.policy {
+                let worker_count = config.mode.worker_count();
+                if worker_count < 2 {
+                    return Err(ConfigError::IncompatibleConfig {
+                        reason: "Power-of-two policy requires at least 2 workers".to_string(),
+                    });
+                }
+            }
+
+            // For PD mode, validate that policies have sufficient workers
+            if let RoutingMode::PrefillDecode {
+                prefill_urls,
+                decode_urls,
+                prefill_policy,
+                decode_policy,
+            } = &config.mode
+            {
+                // Check power-of-two for prefill
+                if let Some(PolicyConfig::PowerOfTwo { .. }) = prefill_policy {
+                    if prefill_urls.len() < 2 {
+                        return Err(ConfigError::IncompatibleConfig {
+                            reason: "Power-of-two policy for prefill requires at least 2 prefill workers".to_string(),
+                        });
+                    }
+                }
+
+                // Check power-of-two for decode
+                if let Some(PolicyConfig::PowerOfTwo { .. }) = decode_policy {
+                    if decode_urls.len() < 2 {
+                        return Err(ConfigError::IncompatibleConfig {
+                            reason:
+                                "Power-of-two policy for decode requires at least 2 decode workers"
+                                    .to_string(),
+                        });
+                    }
+                }
+            }
+        }
+
+        // Service discovery is conflict with dp_aware routing for now
+        // since it's not fully supported yet
+        if has_service_discovery && config.dp_aware {
+            return Err(ConfigError::IncompatibleConfig {
+                reason: "DP-aware routing is not compatible with service discovery".to_string(),
+            });
+        }
+
+        Ok(())
+    }
+
+    /// Validate URL format
+    fn validate_urls(urls: &[String]) -> ConfigResult<()> {
+        for url in urls {
+            if url.is_empty() {
+                return Err(ConfigError::InvalidValue {
+                    field: "worker_url".to_string(),
+                    value: url.clone(),
+                    reason: "URL cannot be empty".to_string(),
+                });
+            }
+
+            if !url.starts_with("http://")
+                && !url.starts_with("https://")
+                && !url.starts_with("grpc://")
+            {
+                return Err(ConfigError::InvalidValue {
+                    field: "worker_url".to_string(),
+                    value: url.clone(),
+                    reason: "URL must start with http://, https://, or grpc://".to_string(),
+                });
+            }
+
+            // Basic URL validation
+            match ::url::Url::parse(url) {
+                Ok(parsed) => {
+                    // Additional validation
+                    if parsed.host_str().is_none() {
+                        return Err(ConfigError::InvalidValue {
+                            field: "worker_url".to_string(),
+                            value: url.clone(),
+                            reason: "URL must have a valid host".to_string(),
+                        });
+                    }
+                }
+                Err(e) => {
+                    return Err(ConfigError::InvalidValue {
+                        field: "worker_url".to_string(),
+                        value: url.clone(),
+                        reason: format!("Invalid URL format: {}", e),
+                    });
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_validate_regular_mode() {
+        let config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec!["http://worker:8000".to_string()],
+            },
+            PolicyConfig::Random,
+        );
+
+        assert!(ConfigValidator::validate(&config).is_ok());
+    }
+
+    #[test]
+    fn test_validate_empty_worker_urls() {
+        let config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            PolicyConfig::Random,
+        );
+
+        // Empty worker URLs are now allowed to match legacy behavior
+        assert!(ConfigValidator::validate(&config).is_ok());
+    }
+
+    #[test]
+    fn test_validate_empty_worker_urls_with_service_discovery() {
+        let mut config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            PolicyConfig::Random,
+        );
+
+        // Enable service discovery
+        config.discovery = Some(DiscoveryConfig {
+            enabled: true,
+            selector: vec![("app".to_string(), "test".to_string())]
+                .into_iter()
+                .collect(),
+            ..Default::default()
+        });
+
+        // Should pass validation since service discovery is enabled
+        assert!(ConfigValidator::validate(&config).is_ok());
+    }
+
+    #[test]
+    fn test_validate_invalid_urls() {
+        let config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec!["invalid-url".to_string()],
+            },
+            PolicyConfig::Random,
+        );
+
+        assert!(ConfigValidator::validate(&config).is_err());
+    }
+
+    #[test]
+    fn test_validate_cache_aware_thresholds() {
+        let config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec![
+                    "http://worker1:8000".to_string(),
+                    "http://worker2:8000".to_string(),
+                ],
+            },
+            PolicyConfig::CacheAware {
+                cache_threshold: 1.5, // Invalid: > 1.0
+                balance_abs_threshold: 32,
+                balance_rel_threshold: 1.1,
+                eviction_interval_secs: 60,
+                max_tree_size: 1000,
+            },
+        );
+
+        assert!(ConfigValidator::validate(&config).is_err());
+    }
+
+    #[test]
+    fn test_validate_cache_aware_single_worker() {
+        // Cache-aware with single worker should be allowed (even if not optimal)
+        let config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec!["http://worker1:8000".to_string()],
+            },
+            PolicyConfig::CacheAware {
+                cache_threshold: 0.5,
+                balance_abs_threshold: 32,
+                balance_rel_threshold: 1.1,
+                eviction_interval_secs: 60,
+                max_tree_size: 1000,
+            },
+        );
+
+        assert!(ConfigValidator::validate(&config).is_ok());
+    }
+
+    #[test]
+    fn test_validate_pd_mode() {
+        let config = RouterConfig::new(
+            RoutingMode::PrefillDecode {
+                prefill_urls: vec![("http://prefill:8000".to_string(), Some(8081))],
+                decode_urls: vec!["http://decode:8000".to_string()],
+                prefill_policy: None,
+                decode_policy: None,
+            },
+            PolicyConfig::Random,
+        );
+
+        assert!(ConfigValidator::validate(&config).is_ok());
+    }
+
+    #[test]
+    fn test_validate_roundrobin_with_pd_mode() {
+        // RoundRobin with PD mode is now supported
+        let config = RouterConfig::new(
+            RoutingMode::PrefillDecode {
+                prefill_urls: vec![("http://prefill:8000".to_string(), None)],
+                decode_urls: vec!["http://decode:8000".to_string()],
+                prefill_policy: None,
+                decode_policy: None,
+            },
+            PolicyConfig::RoundRobin,
+        );
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_cache_aware_with_pd_mode() {
+        // CacheAware with PD mode is now supported
+        let config = RouterConfig::new(
+            RoutingMode::PrefillDecode {
+                prefill_urls: vec![("http://prefill:8000".to_string(), None)],
+                decode_urls: vec!["http://decode:8000".to_string()],
+                prefill_policy: None,
+                decode_policy: None,
+            },
+            PolicyConfig::CacheAware {
+                cache_threshold: 0.5,
+                balance_abs_threshold: 32,
+                balance_rel_threshold: 1.1,
+                eviction_interval_secs: 60,
+                max_tree_size: 1000,
+            },
+        );
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_power_of_two_with_regular_mode() {
+        // PowerOfTwo with Regular mode is now supported
+        let config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec![
+                    "http://worker1:8000".to_string(),
+                    "http://worker2:8000".to_string(),
+                ],
+            },
+            PolicyConfig::PowerOfTwo {
+                load_check_interval_secs: 60,
+            },
+        );
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_pd_mode_with_separate_policies() {
+        // Test PD mode with different policies for prefill and decode
+        let config = RouterConfig::new(
+            RoutingMode::PrefillDecode {
+                prefill_urls: vec![
+                    ("http://prefill1:8000".to_string(), None),
+                    ("http://prefill2:8000".to_string(), None),
+                ],
+                decode_urls: vec![
+                    "http://decode1:8000".to_string(),
+                    "http://decode2:8000".to_string(),
+                ],
+                prefill_policy: Some(PolicyConfig::CacheAware {
+                    cache_threshold: 0.5,
+                    balance_abs_threshold: 32,
+                    balance_rel_threshold: 1.1,
+                    eviction_interval_secs: 60,
+                    max_tree_size: 1000,
+                }),
+                decode_policy: Some(PolicyConfig::PowerOfTwo {
+                    load_check_interval_secs: 60,
+                }),
+            },
+            PolicyConfig::Random, // Main policy as fallback
+        );
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_pd_mode_power_of_two_insufficient_workers() {
+        // Test that power-of-two policy requires at least 2 workers
+        let config = RouterConfig::new(
+            RoutingMode::PrefillDecode {
+                prefill_urls: vec![("http://prefill1:8000".to_string(), None)], // Only 1 prefill
+                decode_urls: vec![
+                    "http://decode1:8000".to_string(),
+                    "http://decode2:8000".to_string(),
+                ],
+                prefill_policy: Some(PolicyConfig::PowerOfTwo {
+                    load_check_interval_secs: 60,
+                }), // Requires 2+ workers
+                decode_policy: None,
+            },
+            PolicyConfig::Random,
+        );
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_err());
+        if let Err(e) = result {
+            assert!(e.to_string().contains("prefill requires at least 2"));
+        }
+    }
+
+    #[test]
+    fn test_validate_grpc_requires_tokenizer() {
+        // Test that gRPC connection mode requires tokenizer configuration
+        let mut config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec!["grpc://worker:50051".to_string()],
+            },
+            PolicyConfig::Random,
+        );
+
+        // Set connection mode to gRPC without tokenizer config
+        config.connection_mode = ConnectionMode::Grpc;
+        config.tokenizer_path = None;
+        config.model_path = None;
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_err());
+        if let Err(e) = result {
+            assert!(e.to_string().contains("gRPC connection mode requires"));
+        }
+    }
+
+    #[test]
+    fn test_validate_grpc_with_model_path() {
+        // Test that gRPC works with model_path
+        let mut config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec!["grpc://worker:50051".to_string()],
+            },
+            PolicyConfig::Random,
+        );
+
+        config.connection_mode = ConnectionMode::Grpc;
+        config.model_path = Some("meta-llama/Llama-3-8B".to_string());
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_grpc_with_tokenizer_path() {
+        // Test that gRPC works with tokenizer_path
+        let mut config = RouterConfig::new(
+            RoutingMode::Regular {
+                worker_urls: vec!["grpc://worker:50051".to_string()],
+            },
+            PolicyConfig::Random,
+        );
+
+        config.connection_mode = ConnectionMode::Grpc;
+        config.tokenizer_path = Some("/path/to/tokenizer.json".to_string());
+
+        let result = ConfigValidator::validate(&config);
+        assert!(result.is_ok());
+    }
+}
diff --git a/sgl-router/src/core/circuit_breaker.rs b/sgl-router/src/core/circuit_breaker.rs
new file mode 100644
index 000000000..5c374233e
--- /dev/null
+++ b/sgl-router/src/core/circuit_breaker.rs
@@ -0,0 +1,555 @@
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::sync::{Arc, RwLock};
+use std::time::{Duration, Instant};
+use tracing::info;
+
+/// Circuit breaker configuration
+#[derive(Debug, Clone)]
+pub struct CircuitBreakerConfig {
+    /// Number of consecutive failures to open the circuit
+    pub failure_threshold: u32,
+    /// Success threshold to close circuit from half-open
+    pub success_threshold: u32,
+    /// Duration to wait before attempting half-open
+    pub timeout_duration: Duration,
+    /// Time window for failure counting
+    pub window_duration: Duration,
+}
+
+impl Default for CircuitBreakerConfig {
+    fn default() -> Self {
+        Self {
+            failure_threshold: 5,
+            success_threshold: 2,
+            timeout_duration: Duration::from_secs(30),
+            window_duration: Duration::from_secs(60),
+        }
+    }
+}
+
+/// Circuit breaker state
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CircuitState {
+    /// Normal operation - requests are allowed
+    Closed,
+    /// Circuit is open - requests are rejected
+    Open,
+    /// Testing if service has recovered - limited requests allowed
+    HalfOpen,
+}
+
+impl std::fmt::Display for CircuitState {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            CircuitState::Closed => write!(f, "Closed"),
+            CircuitState::Open => write!(f, "Open"),
+            CircuitState::HalfOpen => write!(f, "HalfOpen"),
+        }
+    }
+}
+
+/// Circuit breaker implementation
+#[derive(Debug)]
+pub struct CircuitBreaker {
+    state: Arc<RwLock<CircuitState>>,
+    consecutive_failures: Arc<AtomicU32>,
+    consecutive_successes: Arc<AtomicU32>,
+    total_failures: Arc<AtomicU64>,
+    total_successes: Arc<AtomicU64>,
+    last_failure_time: Arc<RwLock<Option<Instant>>>,
+    last_state_change: Arc<RwLock<Instant>>,
+    config: CircuitBreakerConfig,
+}
+
+impl CircuitBreaker {
+    /// Create a new circuit breaker with default configuration
+    pub fn new() -> Self {
+        Self::with_config(CircuitBreakerConfig::default())
+    }
+
+    /// Create a new circuit breaker with custom configuration
+    pub fn with_config(config: CircuitBreakerConfig) -> Self {
+        Self {
+            state: Arc::new(RwLock::new(CircuitState::Closed)),
+            consecutive_failures: Arc::new(AtomicU32::new(0)),
+            consecutive_successes: Arc::new(AtomicU32::new(0)),
+            total_failures: Arc::new(AtomicU64::new(0)),
+            total_successes: Arc::new(AtomicU64::new(0)),
+            last_failure_time: Arc::new(RwLock::new(None)),
+            last_state_change: Arc::new(RwLock::new(Instant::now())),
+            config,
+        }
+    }
+
+    /// Check if a request can be executed
+    pub fn can_execute(&self) -> bool {
+        // First check if we need to transition from Open to HalfOpen
+        self.check_and_update_state();
+
+        let state = *self.state.read().unwrap();
+        match state {
+            CircuitState::Closed => true,
+            CircuitState::Open => false,
+            CircuitState::HalfOpen => true, // Allow limited requests in half-open state
+        }
+    }
+
+    /// Get the current state
+    pub fn state(&self) -> CircuitState {
+        self.check_and_update_state();
+        *self.state.read().unwrap()
+    }
+
+    /// Record the outcome of a request
+    pub fn record_outcome(&self, success: bool) {
+        if success {
+            self.record_success();
+        } else {
+            self.record_failure();
+        }
+    }
+
+    /// Record a successful request
+    pub fn record_success(&self) {
+        self.total_successes.fetch_add(1, Ordering::Relaxed);
+        self.consecutive_failures.store(0, Ordering::Release);
+        let successes = self.consecutive_successes.fetch_add(1, Ordering::AcqRel) + 1;
+        // Outcome-level metrics are recorded at the worker level where the worker label is known
+
+        let current_state = *self.state.read().unwrap();
+
+        match current_state {
+            CircuitState::HalfOpen => {
+                // Check if we've reached the success threshold to close the circuit
+                if successes >= self.config.success_threshold {
+                    self.transition_to(CircuitState::Closed);
+                }
+            }
+            CircuitState::Closed => {
+                // Already closed, nothing to do
+            }
+            CircuitState::Open => {
+                // Shouldn't happen, but if it does, stay open
+                tracing::warn!("Success recorded while circuit is open");
+            }
+        }
+    }
+
+    /// Record a failed request
+    pub fn record_failure(&self) {
+        self.total_failures.fetch_add(1, Ordering::Relaxed);
+        self.consecutive_successes.store(0, Ordering::Release);
+        let failures = self.consecutive_failures.fetch_add(1, Ordering::AcqRel) + 1;
+        // Outcome-level metrics are recorded at the worker level where the worker label is known
+
+        // Update last failure time
+        {
+            let mut last_failure = self.last_failure_time.write().unwrap();
+            *last_failure = Some(Instant::now());
+        }
+
+        let current_state = *self.state.read().unwrap();
+
+        match current_state {
+            CircuitState::Closed => {
+                // Check if we've reached the failure threshold to open the circuit
+                if failures >= self.config.failure_threshold {
+                    self.transition_to(CircuitState::Open);
+                }
+            }
+            CircuitState::HalfOpen => {
+                // Single failure in half-open state reopens the circuit
+                self.transition_to(CircuitState::Open);
+            }
+            CircuitState::Open => {
+                // Already open, nothing to do
+            }
+        }
+    }
+
+    /// Check and update state based on timeout
+    fn check_and_update_state(&self) {
+        let current_state = *self.state.read().unwrap();
+
+        if current_state == CircuitState::Open {
+            // Check if timeout has expired
+            let last_change = *self.last_state_change.read().unwrap();
+            if last_change.elapsed() >= self.config.timeout_duration {
+                self.transition_to(CircuitState::HalfOpen);
+            }
+        }
+    }
+
+    /// Transition to a new state
+    fn transition_to(&self, new_state: CircuitState) {
+        let mut state = self.state.write().unwrap();
+        let old_state = *state;
+
+        if old_state != new_state {
+            *state = new_state;
+
+            // Update last state change time
+            let mut last_change = self.last_state_change.write().unwrap();
+            *last_change = Instant::now();
+
+            // Reset counters based on transition
+            match new_state {
+                CircuitState::Closed => {
+                    self.consecutive_failures.store(0, Ordering::Release);
+                    self.consecutive_successes.store(0, Ordering::Release);
+                }
+                CircuitState::Open => {
+                    self.consecutive_successes.store(0, Ordering::Release);
+                }
+                CircuitState::HalfOpen => {
+                    self.consecutive_failures.store(0, Ordering::Release);
+                    self.consecutive_successes.store(0, Ordering::Release);
+                }
+            }
+
+            let from = match old_state {
+                CircuitState::Closed => "closed",
+                CircuitState::Open => "open",
+                CircuitState::HalfOpen => "half_open",
+            };
+            let to = match new_state {
+                CircuitState::Closed => "closed",
+                CircuitState::Open => "open",
+                CircuitState::HalfOpen => "half_open",
+            };
+            info!("Circuit breaker state transition: {} -> {}", from, to);
+            // Transition metrics are recorded at the worker level where the worker label is known
+        }
+    }
+
+    /// Get the number of consecutive failures
+    pub fn failure_count(&self) -> u32 {
+        self.consecutive_failures.load(Ordering::Acquire)
+    }
+
+    /// Get the number of consecutive successes
+    pub fn success_count(&self) -> u32 {
+        self.consecutive_successes.load(Ordering::Acquire)
+    }
+
+    /// Get total failures
+    pub fn total_failures(&self) -> u64 {
+        self.total_failures.load(Ordering::Relaxed)
+    }
+
+    /// Get total successes
+    pub fn total_successes(&self) -> u64 {
+        self.total_successes.load(Ordering::Relaxed)
+    }
+
+    /// Get time since last failure
+    pub fn time_since_last_failure(&self) -> Option<Duration> {
+        self.last_failure_time.read().unwrap().map(|t| t.elapsed())
+    }
+
+    /// Get time since last state change
+    pub fn time_since_last_state_change(&self) -> Duration {
+        self.last_state_change.read().unwrap().elapsed()
+    }
+
+    /// Check if the circuit is in a half-open state
+    pub fn is_half_open(&self) -> bool {
+        self.state() == CircuitState::HalfOpen
+    }
+
+    /// Record a test success (for health check probing)
+    pub fn record_test_success(&self) {
+        if self.is_half_open() {
+            self.record_success();
+        }
+    }
+
+    /// Record a test failure (for health check probing)
+    pub fn record_test_failure(&self) {
+        if self.is_half_open() {
+            self.record_failure();
+        }
+    }
+
+    /// Reset the circuit breaker to closed state
+    pub fn reset(&self) {
+        self.transition_to(CircuitState::Closed);
+        self.consecutive_failures.store(0, Ordering::Release);
+        self.consecutive_successes.store(0, Ordering::Release);
+    }
+
+    /// Force the circuit to open (for manual intervention)
+    pub fn force_open(&self) {
+        self.transition_to(CircuitState::Open);
+    }
+
+    /// Get circuit breaker statistics
+    pub fn stats(&self) -> CircuitBreakerStats {
+        CircuitBreakerStats {
+            state: self.state(),
+            consecutive_failures: self.failure_count(),
+            consecutive_successes: self.success_count(),
+            total_failures: self.total_failures(),
+            total_successes: self.total_successes(),
+            time_since_last_failure: self.time_since_last_failure(),
+            time_since_last_state_change: self.time_since_last_state_change(),
+        }
+    }
+}
+
+impl Clone for CircuitBreaker {
+    fn clone(&self) -> Self {
+        Self {
+            state: Arc::clone(&self.state),
+            consecutive_failures: Arc::clone(&self.consecutive_failures),
+            consecutive_successes: Arc::clone(&self.consecutive_successes),
+            total_failures: Arc::clone(&self.total_failures),
+            total_successes: Arc::clone(&self.total_successes),
+            last_failure_time: Arc::clone(&self.last_failure_time),
+            last_state_change: Arc::clone(&self.last_state_change),
+            config: self.config.clone(),
+        }
+    }
+}
+
+impl Default for CircuitBreaker {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Circuit breaker statistics
+#[derive(Debug, Clone)]
+pub struct CircuitBreakerStats {
+    pub state: CircuitState,
+    pub consecutive_failures: u32,
+    pub consecutive_successes: u32,
+    pub total_failures: u64,
+    pub total_successes: u64,
+    pub time_since_last_failure: Option<Duration>,
+    pub time_since_last_state_change: Duration,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::thread;
+
+    #[test]
+    fn test_circuit_breaker_initial_state() {
+        let cb = CircuitBreaker::new();
+        assert_eq!(cb.state(), CircuitState::Closed);
+        assert!(cb.can_execute());
+        assert_eq!(cb.failure_count(), 0);
+        assert_eq!(cb.success_count(), 0);
+    }
+
+    #[test]
+    fn test_circuit_opens_on_threshold() {
+        let config = CircuitBreakerConfig {
+            failure_threshold: 3,
+            ..Default::default()
+        };
+        let cb = CircuitBreaker::with_config(config);
+
+        // Record failures up to threshold
+        assert_eq!(cb.state(), CircuitState::Closed);
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed);
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed);
+        cb.record_failure();
+
+        // Circuit should now be open
+        assert_eq!(cb.state(), CircuitState::Open);
+        assert!(!cb.can_execute());
+        assert_eq!(cb.failure_count(), 3);
+    }
+
+    #[test]
+    fn test_circuit_half_open_after_timeout() {
+        let config = CircuitBreakerConfig {
+            failure_threshold: 1,
+            timeout_duration: Duration::from_millis(100),
+            ..Default::default()
+        };
+        let cb = CircuitBreaker::with_config(config);
+
+        // Open the circuit
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+
+        // Wait for timeout
+        thread::sleep(Duration::from_millis(150));
+
+        // Circuit should be half-open
+        assert_eq!(cb.state(), CircuitState::HalfOpen);
+        assert!(cb.can_execute());
+    }
+
+    #[test]
+    fn test_circuit_closes_on_success_threshold() {
+        let config = CircuitBreakerConfig {
+            failure_threshold: 1,
+            success_threshold: 2,
+            timeout_duration: Duration::from_millis(50),
+            ..Default::default()
+        };
+        let cb = CircuitBreaker::with_config(config);
+
+        // Open the circuit
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+
+        // Wait for timeout
+        thread::sleep(Duration::from_millis(100));
+        assert_eq!(cb.state(), CircuitState::HalfOpen);
+
+        // Record successes
+        cb.record_success();
+        assert_eq!(cb.state(), CircuitState::HalfOpen);
+        cb.record_success();
+
+        // Circuit should now be closed
+        assert_eq!(cb.state(), CircuitState::Closed);
+        assert!(cb.can_execute());
+    }
+
+    #[test]
+    fn test_circuit_reopens_on_half_open_failure() {
+        let config = CircuitBreakerConfig {
+            failure_threshold: 1,
+            timeout_duration: Duration::from_millis(50),
+            ..Default::default()
+        };
+        let cb = CircuitBreaker::with_config(config);
+
+        // Open the circuit
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+
+        // Wait for timeout
+        thread::sleep(Duration::from_millis(100));
+        assert_eq!(cb.state(), CircuitState::HalfOpen);
+
+        // Record a failure in half-open state
+        cb.record_failure();
+
+        // Circuit should reopen immediately
+        assert_eq!(cb.state(), CircuitState::Open);
+        assert!(!cb.can_execute());
+    }
+
+    #[test]
+    fn test_success_resets_failure_count() {
+        let config = CircuitBreakerConfig {
+            failure_threshold: 3,
+            ..Default::default()
+        };
+        let cb = CircuitBreaker::with_config(config);
+
+        // Record some failures
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.failure_count(), 2);
+
+        // Success should reset failure count
+        cb.record_success();
+        assert_eq!(cb.failure_count(), 0);
+        assert_eq!(cb.success_count(), 1);
+
+        // Can now record more failures without opening
+        cb.record_failure();
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Closed);
+    }
+
+    #[test]
+    fn test_manual_reset() {
+        let config = CircuitBreakerConfig {
+            failure_threshold: 1,
+            ..Default::default()
+        };
+        let cb = CircuitBreaker::with_config(config);
+
+        // Open the circuit
+        cb.record_failure();
+        assert_eq!(cb.state(), CircuitState::Open);
+
+        // Manual reset
+        cb.reset();
+        assert_eq!(cb.state(), CircuitState::Closed);
+        assert_eq!(cb.failure_count(), 0);
+        assert_eq!(cb.success_count(), 0);
+    }
+
+    #[test]
+    fn test_force_open() {
+        let cb = CircuitBreaker::new();
+        assert_eq!(cb.state(), CircuitState::Closed);
+
+        cb.force_open();
+        assert_eq!(cb.state(), CircuitState::Open);
+        assert!(!cb.can_execute());
+    }
+
+    #[test]
+    fn test_stats() {
+        let config = CircuitBreakerConfig {
+            failure_threshold: 2,
+            ..Default::default()
+        };
+        let cb = CircuitBreaker::with_config(config);
+
+        cb.record_success();
+        cb.record_failure();
+        cb.record_failure();
+
+        let stats = cb.stats();
+        assert_eq!(stats.state, CircuitState::Open);
+        assert_eq!(stats.consecutive_failures, 2);
+        assert_eq!(stats.consecutive_successes, 0);
+        assert_eq!(stats.total_failures, 2);
+        assert_eq!(stats.total_successes, 1);
+    }
+
+    #[test]
+    fn test_clone() {
+        let cb1 = CircuitBreaker::new();
+        cb1.record_failure();
+
+        let cb2 = cb1.clone();
+        assert_eq!(cb2.failure_count(), 1);
+
+        // Changes to cb1 affect cb2 (shared state)
+        cb1.record_failure();
+        assert_eq!(cb2.failure_count(), 2);
+    }
+
+    #[test]
+    fn test_thread_safety() {
+        use std::sync::Arc;
+
+        let cb = Arc::new(CircuitBreaker::new());
+        let mut handles = vec![];
+
+        // Spawn threads that record failures
+        for _ in 0..10 {
+            let cb_clone = Arc::clone(&cb);
+            let handle = thread::spawn(move || {
+                for _ in 0..100 {
+                    cb_clone.record_failure();
+                }
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all threads
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        // Should have recorded 1000 failures
+        assert_eq!(cb.total_failures(), 1000);
+    }
+}
diff --git a/sgl-router/src/core/error.rs b/sgl-router/src/core/error.rs
new file mode 100644
index 000000000..74e0a0d25
--- /dev/null
+++ b/sgl-router/src/core/error.rs
@@ -0,0 +1,240 @@
+//! Error types for the SGLang router core
+//!
+//! This module defines error types used throughout the router for worker operations.
+
+use std::fmt;
+
+/// Worker-related errors
+#[derive(Debug)]
+pub enum WorkerError {
+    /// Health check failed
+    HealthCheckFailed { url: String, reason: String },
+    /// Worker not found
+    WorkerNotFound { url: String },
+    /// Invalid worker configuration
+    InvalidConfiguration { message: String },
+    /// Network error
+    NetworkError { url: String, error: String },
+    /// Worker is at capacity
+    WorkerAtCapacity { url: String },
+    /// Invalid URL format
+    InvalidUrl { url: String },
+}
+
+impl fmt::Display for WorkerError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            WorkerError::HealthCheckFailed { url, reason } => {
+                write!(f, "Health check failed for worker {}: {}", url, reason)
+            }
+            WorkerError::WorkerNotFound { url } => {
+                write!(f, "Worker not found: {}", url)
+            }
+            WorkerError::InvalidConfiguration { message } => {
+                write!(f, "Invalid worker configuration: {}", message)
+            }
+            WorkerError::NetworkError { url, error } => {
+                write!(f, "Network error for worker {}: {}", url, error)
+            }
+            WorkerError::WorkerAtCapacity { url } => {
+                write!(f, "Worker at capacity: {}", url)
+            }
+            WorkerError::InvalidUrl { url } => {
+                write!(f, "Invalid URL format: {}", url)
+            }
+        }
+    }
+}
+
+impl std::error::Error for WorkerError {}
+
+/// Result type for worker operations
+pub type WorkerResult<T> = Result<T, WorkerError>;
+
+/// Convert from reqwest errors to worker errors
+impl From<reqwest::Error> for WorkerError {
+    fn from(err: reqwest::Error) -> Self {
+        WorkerError::NetworkError {
+            url: err.url().map(|u| u.to_string()).unwrap_or_default(),
+            error: err.to_string(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::error::Error;
+
+    #[test]
+    fn test_health_check_failed_display() {
+        let error = WorkerError::HealthCheckFailed {
+            url: "http://worker1:8080".to_string(),
+            reason: "Connection refused".to_string(),
+        };
+        assert_eq!(
+            error.to_string(),
+            "Health check failed for worker http://worker1:8080: Connection refused"
+        );
+    }
+
+    #[test]
+    fn test_worker_not_found_display() {
+        let error = WorkerError::WorkerNotFound {
+            url: "http://worker2:8080".to_string(),
+        };
+        assert_eq!(error.to_string(), "Worker not found: http://worker2:8080");
+    }
+
+    #[test]
+    fn test_invalid_configuration_display() {
+        let error = WorkerError::InvalidConfiguration {
+            message: "Missing port number".to_string(),
+        };
+        assert_eq!(
+            error.to_string(),
+            "Invalid worker configuration: Missing port number"
+        );
+    }
+
+    #[test]
+    fn test_network_error_display() {
+        let error = WorkerError::NetworkError {
+            url: "http://worker3:8080".to_string(),
+            error: "Timeout after 30s".to_string(),
+        };
+        assert_eq!(
+            error.to_string(),
+            "Network error for worker http://worker3:8080: Timeout after 30s"
+        );
+    }
+
+    #[test]
+    fn test_worker_at_capacity_display() {
+        let error = WorkerError::WorkerAtCapacity {
+            url: "http://worker4:8080".to_string(),
+        };
+        assert_eq!(error.to_string(), "Worker at capacity: http://worker4:8080");
+    }
+
+    #[test]
+    fn test_worker_error_implements_std_error() {
+        let error = WorkerError::WorkerNotFound {
+            url: "http://test".to_string(),
+        };
+        // Verify it implements Error trait
+        let _: &dyn Error = &error;
+        assert!(error.source().is_none());
+    }
+
+    #[test]
+    fn test_error_send_sync() {
+        fn assert_send_sync<T: Send + Sync>() {}
+        assert_send_sync::<WorkerError>();
+    }
+
+    #[test]
+    fn test_worker_result_type_alias() {
+        // Test Ok variant
+        let result: WorkerResult<i32> = Ok(42);
+        assert!(matches!(result, Ok(42)));
+
+        // Test Err variant
+        let error = WorkerError::WorkerNotFound {
+            url: "test".to_string(),
+        };
+        let result: WorkerResult<i32> = Err(error);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_empty_url_handling() {
+        // Test empty URLs in error variants
+        let error1 = WorkerError::HealthCheckFailed {
+            url: "".to_string(),
+            reason: "No connection".to_string(),
+        };
+        assert_eq!(
+            error1.to_string(),
+            "Health check failed for worker : No connection"
+        );
+
+        let error2 = WorkerError::NetworkError {
+            url: "".to_string(),
+            error: "DNS failure".to_string(),
+        };
+        assert_eq!(error2.to_string(), "Network error for worker : DNS failure");
+
+        let error3 = WorkerError::WorkerNotFound {
+            url: "".to_string(),
+        };
+        assert_eq!(error3.to_string(), "Worker not found: ");
+    }
+
+    #[test]
+    fn test_special_characters_in_messages() {
+        // Test with special characters
+        let error = WorkerError::InvalidConfiguration {
+            message: "Invalid JSON: {\"error\": \"test\"}".to_string(),
+        };
+        assert_eq!(
+            error.to_string(),
+            "Invalid worker configuration: Invalid JSON: {\"error\": \"test\"}"
+        );
+
+        // Test with unicode
+        let error2 = WorkerError::HealthCheckFailed {
+            url: "http://测试:8080".to_string(),
+            reason: "连接被拒绝".to_string(),
+        };
+        assert_eq!(
+            error2.to_string(),
+            "Health check failed for worker http://测试:8080: 连接被拒绝"
+        );
+    }
+
+    #[test]
+    fn test_very_long_error_messages() {
+        let long_message = "A".repeat(10000);
+        let error = WorkerError::InvalidConfiguration {
+            message: long_message.clone(),
+        };
+        let display = error.to_string();
+        assert!(display.contains(&long_message));
+        assert_eq!(
+            display.len(),
+            "Invalid worker configuration: ".len() + long_message.len()
+        );
+    }
+
+    // Mock reqwest error for testing conversion
+    #[test]
+    fn test_reqwest_error_conversion() {
+        // Test that NetworkError is the correct variant
+        let network_error = WorkerError::NetworkError {
+            url: "http://example.com".to_string(),
+            error: "connection timeout".to_string(),
+        };
+
+        match network_error {
+            WorkerError::NetworkError { url, error } => {
+                assert_eq!(url, "http://example.com");
+                assert_eq!(error, "connection timeout");
+            }
+            _ => panic!("Expected NetworkError variant"),
+        }
+    }
+
+    #[test]
+    fn test_error_equality() {
+        // WorkerError doesn't implement PartialEq, but we can test that
+        // the same error construction produces the same display output
+        let error1 = WorkerError::WorkerNotFound {
+            url: "http://test".to_string(),
+        };
+        let error2 = WorkerError::WorkerNotFound {
+            url: "http://test".to_string(),
+        };
+        assert_eq!(error1.to_string(), error2.to_string());
+    }
+}
diff --git a/sgl-router/src/core/mod.rs b/sgl-router/src/core/mod.rs
new file mode 100644
index 000000000..b46810b4c
--- /dev/null
+++ b/sgl-router/src/core/mod.rs
@@ -0,0 +1,24 @@
+//! Core abstractions for the SGLang router
+//!
+//! This module contains the fundamental types and traits used throughout the router:
+//! - Worker trait and implementations
+//! - Error types
+//! - Circuit breaker for reliability
+//! - Common utilities
+
+pub mod circuit_breaker;
+pub mod error;
+pub mod retry;
+pub mod token_bucket;
+pub mod worker;
+
+// Re-export commonly used types at the module level
+pub use circuit_breaker::{
+    CircuitBreaker, CircuitBreakerConfig, CircuitBreakerStats, CircuitState,
+};
+pub use error::{WorkerError, WorkerResult};
+pub use retry::{is_retryable_status, BackoffCalculator, RetryError, RetryExecutor};
+pub use worker::{
+    start_health_checker, BasicWorker, ConnectionMode, DPAwareWorker, HealthChecker, HealthConfig,
+    Worker, WorkerCollection, WorkerFactory, WorkerLoadGuard, WorkerType,
+};
diff --git a/sgl-router/src/core/retry.rs b/sgl-router/src/core/retry.rs
new file mode 100644
index 000000000..8a00424a5
--- /dev/null
+++ b/sgl-router/src/core/retry.rs
@@ -0,0 +1,409 @@
+use crate::config::types::RetryConfig;
+use axum::http::StatusCode;
+use axum::response::Response;
+use rand::Rng;
+use std::time::Duration;
+use tracing::debug;
+
+/// Check if an HTTP status code indicates a retryable error
+pub fn is_retryable_status(status: StatusCode) -> bool {
+    matches!(
+        status,
+        StatusCode::REQUEST_TIMEOUT
+            | StatusCode::TOO_MANY_REQUESTS
+            | StatusCode::INTERNAL_SERVER_ERROR
+            | StatusCode::BAD_GATEWAY
+            | StatusCode::SERVICE_UNAVAILABLE
+            | StatusCode::GATEWAY_TIMEOUT
+    )
+}
+
+/// Computes exponential backoff with optional jitter.
+#[derive(Debug, Clone)]
+pub struct BackoffCalculator;
+
+impl BackoffCalculator {
+    /// Calculate backoff delay for a given attempt index (0-based).
+    pub fn calculate_delay(config: &RetryConfig, attempt: u32) -> Duration {
+        // Base exponential backoff
+        let pow = config.backoff_multiplier.powi(attempt as i32);
+        let mut delay_ms = (config.initial_backoff_ms as f32 * pow) as u64;
+        if delay_ms > config.max_backoff_ms {
+            delay_ms = config.max_backoff_ms;
+        }
+
+        // Apply jitter in range [-j, +j]
+        let jitter = config.jitter_factor.clamp(0.0, 1.0);
+        if jitter > 0.0 {
+            let mut rng = rand::rng();
+            let jitter_scale: f32 = rng.random_range(-jitter..=jitter);
+            let jitter_ms = (delay_ms as f32 * jitter_scale)
+                .round()
+                .max(-(delay_ms as f32));
+            let adjusted = (delay_ms as i64 + jitter_ms as i64).max(0) as u64;
+            return Duration::from_millis(adjusted);
+        }
+
+        Duration::from_millis(delay_ms)
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum RetryError {
+    #[error("no available workers")]
+    NoAvailableWorkers,
+    #[error("maximum retry attempts exceeded")]
+    MaxRetriesExceeded,
+}
+
+/// A thin async retry executor for generic operations.
+#[derive(Debug, Clone, Default)]
+pub struct RetryExecutor;
+
+impl RetryExecutor {
+    /// Execute an async operation with retries and backoff.
+    /// The `operation` closure is invoked each attempt with the attempt index.
+    pub async fn execute_with_retry<F, Fut, T>(
+        config: &RetryConfig,
+        mut operation: F,
+    ) -> Result<T, RetryError>
+    where
+        F: FnMut(u32) -> Fut,
+        Fut: std::future::Future<Output = Result<T, ()>>,
+    {
+        let max = config.max_retries.max(1);
+        let mut attempt: u32 = 0;
+        loop {
+            match operation(attempt).await {
+                Ok(val) => return Ok(val),
+                Err(_) => {
+                    // Use the number of failures so far (0-indexed) to compute delay,
+                    // so the first retry uses `initial_backoff_ms`.
+                    let is_last = attempt + 1 >= max;
+                    if is_last {
+                        return Err(RetryError::MaxRetriesExceeded);
+                    }
+                    let delay = BackoffCalculator::calculate_delay(config, attempt);
+                    attempt += 1; // advance to the next attempt after computing delay
+                    tokio::time::sleep(delay).await;
+                }
+            }
+        }
+    }
+
+    /// Execute an operation that returns an HTTP Response with retries and backoff.
+    ///
+    /// Usage pattern:
+    /// - `operation(attempt)`: perform one attempt (0-based). Construct and send the request,
+    ///   then return the `Response`. Do any per-attempt bookkeeping (e.g., load tracking,
+    ///   circuit-breaker outcome recording) inside this closure.
+    /// - `should_retry(&response, attempt)`: decide if the given response should be retried
+    ///   (e.g., based on HTTP status). Returning false short-circuits and returns the response.
+    /// - `on_backoff(delay, next_attempt)`: called before sleeping between attempts.
+    ///   Use this to record metrics.
+    /// - `on_exhausted()`: called when the executor has exhausted all retry attempts.
+    ///
+    /// Example:
+    /// ```ignore
+    /// let resp = RetryExecutor::execute_response_with_retry(
+    ///     &retry_cfg,
+    ///     |attempt| async move {
+    ///         let worker = select_cb_aware_worker()?;
+    ///         let resp = send_request(worker).await;
+    ///         worker.record_outcome(resp.status().is_success());
+    ///         resp
+    ///     },
+    ///     |res, _| matches!(res.status(), StatusCode::REQUEST_TIMEOUT | StatusCode::TOO_MANY_REQUESTS | StatusCode::INTERNAL_SERVER_ERROR | StatusCode::BAD_GATEWAY | StatusCode::SERVICE_UNAVAILABLE | StatusCode::GATEWAY_TIMEOUT),
+    ///     |delay, attempt| RouterMetrics::record_retry_backoff_duration(delay, attempt),
+    ///     || RouterMetrics::record_retries_exhausted("/route"),
+    /// ).await;
+    /// ```
+    pub async fn execute_response_with_retry<Op, Fut, ShouldRetry, OnBackoff, OnExhausted>(
+        config: &RetryConfig,
+        mut operation: Op,
+        should_retry: ShouldRetry,
+        on_backoff: OnBackoff,
+        mut on_exhausted: OnExhausted,
+    ) -> Response
+    where
+        Op: FnMut(u32) -> Fut,
+        Fut: std::future::Future<Output = Response>,
+        ShouldRetry: Fn(&Response, u32) -> bool,
+        OnBackoff: Fn(Duration, u32),
+        OnExhausted: FnMut(),
+    {
+        let max = config.max_retries.max(1);
+
+        let mut attempt: u32 = 0;
+        loop {
+            let response = operation(attempt).await;
+            let is_last = attempt + 1 >= max;
+
+            if !should_retry(&response, attempt) {
+                return response;
+            }
+
+            if is_last {
+                // Exhausted retries
+                on_exhausted();
+                return response;
+            }
+
+            // Backoff before next attempt
+            let next_attempt = attempt + 1;
+            // Compute delay based on the number of failures so far (0-indexed)
+            let delay = BackoffCalculator::calculate_delay(config, attempt);
+            debug!(
+                attempt = attempt,
+                next_attempt = next_attempt,
+                delay_ms = delay.as_millis() as u64,
+                "Retry backoff"
+            );
+            on_backoff(delay, next_attempt);
+            tokio::time::sleep(delay).await;
+
+            attempt = next_attempt;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use axum::http::StatusCode;
+    use axum::response::IntoResponse;
+    use std::sync::atomic::{AtomicU32, Ordering};
+    use std::sync::Arc;
+
+    fn base_retry_config() -> RetryConfig {
+        RetryConfig {
+            max_retries: 3,
+            initial_backoff_ms: 1,
+            max_backoff_ms: 4,
+            backoff_multiplier: 2.0,
+            jitter_factor: 0.0,
+        }
+    }
+
+    #[test]
+    fn test_backoff_no_jitter_progression_and_cap() {
+        let cfg = RetryConfig {
+            max_retries: 10,
+            initial_backoff_ms: 100,
+            max_backoff_ms: 250,
+            backoff_multiplier: 2.0,
+            jitter_factor: 0.0,
+        };
+        // attempt=0 => 100ms
+        assert_eq!(
+            BackoffCalculator::calculate_delay(&cfg, 0),
+            Duration::from_millis(100)
+        );
+        // attempt=1 => 200ms
+        assert_eq!(
+            BackoffCalculator::calculate_delay(&cfg, 1),
+            Duration::from_millis(200)
+        );
+        // attempt=2 => 400ms -> capped to 250ms
+        assert_eq!(
+            BackoffCalculator::calculate_delay(&cfg, 2),
+            Duration::from_millis(250)
+        );
+        // large attempt still capped
+        assert_eq!(
+            BackoffCalculator::calculate_delay(&cfg, 10),
+            Duration::from_millis(250)
+        );
+    }
+
+    #[test]
+    fn test_backoff_with_jitter_within_bounds() {
+        let cfg = RetryConfig {
+            max_retries: 5,
+            initial_backoff_ms: 100,
+            max_backoff_ms: 10_000,
+            backoff_multiplier: 2.0,
+            jitter_factor: 0.5,
+        };
+        // attempt=2 => base 400ms, jitter in [0.5x, 1.5x]
+        let base = 400.0;
+        for _ in 0..50 {
+            let d = BackoffCalculator::calculate_delay(&cfg, 2).as_millis() as f32;
+            assert!(d >= base * 0.5 - 1.0 && d <= base * 1.5 + 1.0);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_execute_with_retry_success_after_failures() {
+        let cfg = base_retry_config();
+        let remaining = Arc::new(AtomicU32::new(2));
+        let calls = Arc::new(AtomicU32::new(0));
+
+        let res: Result<u32, RetryError> = RetryExecutor::execute_with_retry(&cfg, {
+            let remaining = remaining.clone();
+            let calls = calls.clone();
+            move |_attempt| {
+                calls.fetch_add(1, Ordering::Relaxed);
+                let remaining = remaining.clone();
+                async move {
+                    if remaining
+                        .fetch_update(Ordering::AcqRel, Ordering::Acquire, |v| v.checked_sub(1))
+                        .is_ok()
+                    {
+                        Err(())
+                    } else {
+                        Ok(42u32)
+                    }
+                }
+            }
+        })
+        .await;
+
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), 42);
+        assert_eq!(calls.load(Ordering::Relaxed), 3); // 2 fails + 1 success
+    }
+
+    #[tokio::test]
+    async fn test_execute_with_retry_exhausted() {
+        let cfg = base_retry_config();
+        let calls = Arc::new(AtomicU32::new(0));
+        let res: Result<u32, RetryError> = RetryExecutor::execute_with_retry(&cfg, {
+            let calls = calls.clone();
+            move |_attempt| {
+                calls.fetch_add(1, Ordering::Relaxed);
+                async move { Err(()) }
+            }
+        })
+        .await;
+
+        assert!(matches!(res, Err(RetryError::MaxRetriesExceeded)));
+        assert_eq!(calls.load(Ordering::Relaxed), cfg.max_retries);
+    }
+
+    #[tokio::test]
+    async fn test_execute_response_with_retry_success_path_and_hooks() {
+        let cfg = base_retry_config();
+        let remaining = Arc::new(AtomicU32::new(2));
+        let calls = Arc::new(AtomicU32::new(0));
+        let backoffs = Arc::new(AtomicU32::new(0));
+        let exhausted = Arc::new(AtomicU32::new(0));
+
+        let response = RetryExecutor::execute_response_with_retry(
+            &cfg,
+            {
+                let remaining = remaining.clone();
+                let calls = calls.clone();
+                move |_attempt| {
+                    calls.fetch_add(1, Ordering::Relaxed);
+                    let remaining = remaining.clone();
+                    async move {
+                        if remaining
+                            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |v| v.checked_sub(1))
+                            .is_ok()
+                        {
+                            (StatusCode::SERVICE_UNAVAILABLE, "fail").into_response()
+                        } else {
+                            (StatusCode::OK, "ok").into_response()
+                        }
+                    }
+                }
+            },
+            |res, _attempt| !res.status().is_success(), // retry until success
+            {
+                let backoffs = backoffs.clone();
+                move |_delay, _next_attempt| {
+                    backoffs.fetch_add(1, Ordering::Relaxed);
+                }
+            },
+            {
+                let exhausted = exhausted.clone();
+                move || {
+                    exhausted.fetch_add(1, Ordering::Relaxed);
+                }
+            },
+        )
+        .await;
+
+        assert_eq!(response.status(), StatusCode::OK);
+        assert_eq!(calls.load(Ordering::Relaxed), 3); // 2 fails + 1 success
+        assert_eq!(backoffs.load(Ordering::Relaxed), 2);
+        assert_eq!(exhausted.load(Ordering::Relaxed), 0);
+    }
+
+    #[tokio::test]
+    async fn test_execute_response_with_retry_non_retryable_short_circuit() {
+        let cfg = base_retry_config();
+        let calls = Arc::new(AtomicU32::new(0));
+        let backoffs = Arc::new(AtomicU32::new(0));
+        let exhausted = Arc::new(AtomicU32::new(0));
+
+        let response = RetryExecutor::execute_response_with_retry(
+            &cfg,
+            {
+                let calls = calls.clone();
+                move |_attempt| {
+                    calls.fetch_add(1, Ordering::Relaxed);
+                    async move { (StatusCode::BAD_REQUEST, "bad").into_response() }
+                }
+            },
+            |_res, _attempt| false, // never retry
+            {
+                let backoffs = backoffs.clone();
+                move |_delay, _next_attempt| {
+                    backoffs.fetch_add(1, Ordering::Relaxed);
+                }
+            },
+            {
+                let exhausted = exhausted.clone();
+                move || {
+                    exhausted.fetch_add(1, Ordering::Relaxed);
+                }
+            },
+        )
+        .await;
+
+        assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+        assert_eq!(calls.load(Ordering::Relaxed), 1);
+        assert_eq!(backoffs.load(Ordering::Relaxed), 0);
+        assert_eq!(exhausted.load(Ordering::Relaxed), 0);
+    }
+
+    #[tokio::test]
+    async fn test_execute_response_with_retry_exhausted_hooks() {
+        let cfg = base_retry_config();
+        let calls = Arc::new(AtomicU32::new(0));
+        let backoffs = Arc::new(AtomicU32::new(0));
+        let exhausted = Arc::new(AtomicU32::new(0));
+
+        let response = RetryExecutor::execute_response_with_retry(
+            &cfg,
+            {
+                let calls = calls.clone();
+                move |_attempt| {
+                    calls.fetch_add(1, Ordering::Relaxed);
+                    async move { (StatusCode::SERVICE_UNAVAILABLE, "fail").into_response() }
+                }
+            },
+            |_res, _attempt| true, // keep retrying
+            {
+                let backoffs = backoffs.clone();
+                move |_delay, _next_attempt| {
+                    backoffs.fetch_add(1, Ordering::Relaxed);
+                }
+            },
+            {
+                let exhausted = exhausted.clone();
+                move || {
+                    exhausted.fetch_add(1, Ordering::Relaxed);
+                }
+            },
+        )
+        .await;
+
+        assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE);
+        assert_eq!(calls.load(Ordering::Relaxed), cfg.max_retries);
+        assert_eq!(backoffs.load(Ordering::Relaxed), cfg.max_retries - 1);
+        assert_eq!(exhausted.load(Ordering::Relaxed), 1);
+    }
+}
diff --git a/sgl-router/src/core/token_bucket.rs b/sgl-router/src/core/token_bucket.rs
new file mode 100644
index 000000000..65117331a
--- /dev/null
+++ b/sgl-router/src/core/token_bucket.rs
@@ -0,0 +1,195 @@
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+use tokio::sync::{Mutex, Notify};
+use tracing::{debug, trace};
+
+/// Token bucket for rate limiting
+///
+/// This implementation provides:
+/// - Smooth rate limiting with configurable refill rate
+/// - Burst capacity handling
+/// - Fair queuing for waiting requests
+#[derive(Clone)]
+pub struct TokenBucket {
+    inner: Arc<Mutex<TokenBucketInner>>,
+    notify: Arc<Notify>,
+    capacity: f64,
+    refill_rate: f64, // tokens per second
+}
+
+struct TokenBucketInner {
+    tokens: f64,
+    last_refill: Instant,
+}
+
+impl TokenBucket {
+    /// Create a new token bucket
+    ///
+    /// # Arguments
+    /// * `capacity` - Maximum number of tokens (burst capacity)
+    /// * `refill_rate` - Tokens added per second
+    pub fn new(capacity: usize, refill_rate: usize) -> Self {
+        let capacity = capacity as f64;
+        let refill_rate = refill_rate as f64;
+
+        // Ensure refill_rate is not zero to prevent division by zero
+        let refill_rate = if refill_rate > 0.0 {
+            refill_rate
+        } else {
+            1.0 // Default to 1 token per second if zero
+        };
+
+        Self {
+            inner: Arc::new(Mutex::new(TokenBucketInner {
+                tokens: capacity, // Start full
+                last_refill: Instant::now(),
+            })),
+            notify: Arc::new(Notify::new()),
+            capacity,
+            refill_rate,
+        }
+    }
+
+    /// Try to acquire tokens immediately
+    pub async fn try_acquire(&self, tokens: f64) -> Result<(), ()> {
+        let mut inner = self.inner.lock().await;
+
+        // Refill tokens based on elapsed time
+        let now = Instant::now();
+        let elapsed = now.duration_since(inner.last_refill).as_secs_f64();
+        let refill_amount = elapsed * self.refill_rate;
+
+        inner.tokens = (inner.tokens + refill_amount).min(self.capacity);
+        inner.last_refill = now;
+
+        trace!(
+            "Token bucket: {} tokens available, requesting {}",
+            inner.tokens,
+            tokens
+        );
+
+        if inner.tokens >= tokens {
+            inner.tokens -= tokens;
+            debug!(
+                "Token bucket: acquired {} tokens, {} remaining",
+                tokens, inner.tokens
+            );
+            Ok(())
+        } else {
+            Err(())
+        }
+    }
+
+    /// Acquire tokens, waiting if necessary
+    pub async fn acquire(&self, tokens: f64) -> Result<(), tokio::time::error::Elapsed> {
+        // First try to acquire immediately
+        if self.try_acquire(tokens).await.is_ok() {
+            return Ok(());
+        }
+
+        // Calculate wait time
+        let wait_time = {
+            let inner = self.inner.lock().await;
+            let tokens_needed = tokens - inner.tokens;
+            let wait_secs = tokens_needed / self.refill_rate;
+            Duration::from_secs_f64(wait_secs)
+        };
+
+        debug!(
+            "Token bucket: waiting {:?} for {} tokens",
+            wait_time, tokens
+        );
+
+        // Wait for tokens to be available
+        tokio::time::timeout(wait_time, async {
+            loop {
+                // Check if we can acquire now
+                if self.try_acquire(tokens).await.is_ok() {
+                    return;
+                }
+
+                // Wait for notification or small interval
+                tokio::select! {
+                    _ = self.notify.notified() => {},
+                    _ = tokio::time::sleep(Duration::from_millis(10)) => {},
+                }
+            }
+        })
+        .await?;
+
+        Ok(())
+    }
+
+    /// Acquire tokens with custom timeout
+    pub async fn acquire_timeout(
+        &self,
+        tokens: f64,
+        timeout: Duration,
+    ) -> Result<(), tokio::time::error::Elapsed> {
+        tokio::time::timeout(timeout, self.acquire(tokens)).await?
+    }
+
+    /// Return tokens to the bucket (for cancelled requests)
+    pub async fn return_tokens(&self, tokens: f64) {
+        let mut inner = self.inner.lock().await;
+        inner.tokens = (inner.tokens + tokens).min(self.capacity);
+        self.notify.notify_waiters();
+        debug!(
+            "Token bucket: returned {} tokens, {} available",
+            tokens, inner.tokens
+        );
+    }
+
+    /// Get current available tokens (for monitoring)
+    pub async fn available_tokens(&self) -> f64 {
+        let mut inner = self.inner.lock().await;
+
+        // Refill before checking
+        let now = Instant::now();
+        let elapsed = now.duration_since(inner.last_refill).as_secs_f64();
+        let refill_amount = elapsed * self.refill_rate;
+
+        inner.tokens = (inner.tokens + refill_amount).min(self.capacity);
+        inner.last_refill = now;
+
+        inner.tokens
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_token_bucket_basic() {
+        let bucket = TokenBucket::new(10, 5); // 10 capacity, 5 per second
+
+        // Should succeed - bucket starts full
+        assert!(bucket.try_acquire(5.0).await.is_ok());
+        assert!(bucket.try_acquire(5.0).await.is_ok());
+
+        // Should fail - no tokens left
+        assert!(bucket.try_acquire(1.0).await.is_err());
+
+        // Wait for refill
+        tokio::time::sleep(Duration::from_millis(300)).await;
+
+        // Should have ~1.5 tokens now
+        assert!(bucket.try_acquire(1.0).await.is_ok());
+    }
+
+    #[tokio::test]
+    async fn test_token_bucket_refill() {
+        let bucket = TokenBucket::new(10, 10); // 10 capacity, 10 per second
+
+        // Use all tokens
+        assert!(bucket.try_acquire(10.0).await.is_ok());
+
+        // Wait for partial refill
+        tokio::time::sleep(Duration::from_millis(500)).await;
+
+        // Should have ~5 tokens
+        let available = bucket.available_tokens().await;
+        assert!((4.0..=6.0).contains(&available));
+    }
+}
diff --git a/sgl-router/src/core/worker.rs b/sgl-router/src/core/worker.rs
new file mode 100644
index 000000000..f25fc6eea
--- /dev/null
+++ b/sgl-router/src/core/worker.rs
@@ -0,0 +1,1980 @@
+use super::{CircuitBreaker, CircuitBreakerConfig, WorkerError, WorkerResult};
+use crate::grpc::SglangSchedulerClient;
+use crate::metrics::RouterMetrics;
+use async_trait::async_trait;
+use futures;
+use serde_json;
+use std::fmt;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::{Arc, LazyLock};
+use tokio::sync::Mutex;
+
+// Shared HTTP client for worker operations (health checks, server info, etc.)
+static WORKER_CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
+    reqwest::Client::builder()
+        .timeout(std::time::Duration::from_secs(30)) // Default timeout, overridden per request
+        .build()
+        .expect("Failed to create worker HTTP client")
+});
+
+/// Core worker abstraction that represents a backend service
+#[async_trait]
+pub trait Worker: Send + Sync + fmt::Debug {
+    /// Get the worker's URL
+    fn url(&self) -> &str;
+
+    /// Get the worker's type (Regular, Prefill, or Decode)
+    fn worker_type(&self) -> WorkerType;
+
+    /// Get the worker's connection mode (HTTP or gRPC)
+    fn connection_mode(&self) -> ConnectionMode;
+
+    /// Check if the worker is currently healthy
+    fn is_healthy(&self) -> bool;
+
+    /// Set the worker's health status
+    fn set_healthy(&self, healthy: bool);
+
+    /// Perform an async health check on the worker
+    async fn check_health_async(&self) -> WorkerResult<()>;
+
+    /// Synchronous health check wrapper (for compatibility)
+    fn check_health(&self) -> WorkerResult<()> {
+        // Use a small runtime for synchronous contexts
+        tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .map_err(|e| WorkerError::HealthCheckFailed {
+                url: self.url().to_string(),
+                reason: format!("Failed to create runtime: {}", e),
+            })?
+            .block_on(self.check_health_async())
+    }
+
+    /// Get the current load (number of active requests)
+    fn load(&self) -> usize;
+
+    /// Increment the load counter
+    fn increment_load(&self);
+
+    /// Decrement the load counter
+    fn decrement_load(&self);
+
+    /// Reset the load counter to 0 (for sync/recovery)
+    fn reset_load(&self) {
+        // Default implementation - does nothing
+        // Workers that track load should override this
+    }
+
+    /// Get the number of processed requests
+    fn processed_requests(&self) -> usize;
+
+    /// Increment the processed requests counter
+    fn increment_processed(&self);
+
+    /// Get worker-specific metadata
+    fn metadata(&self) -> &WorkerMetadata;
+
+    /// Clone the worker (for trait objects)
+    fn clone_worker(&self) -> Box<dyn Worker>;
+
+    /// Get the circuit breaker for this worker
+    fn circuit_breaker(&self) -> &CircuitBreaker;
+
+    /// Check if the worker is available (healthy + circuit closed/half-open)
+    fn is_available(&self) -> bool {
+        self.is_healthy() && self.circuit_breaker().can_execute()
+    }
+
+    /// Record the outcome of a request to this worker
+    fn record_outcome(&self, success: bool) {
+        // Record outcome-level metric with worker label
+        let outcome_str = if success { "success" } else { "failure" };
+        RouterMetrics::record_cb_outcome(self.url(), outcome_str);
+
+        // Record into circuit breaker and infer state change for metrics
+        let before = self.circuit_breaker().state();
+        self.circuit_breaker().record_outcome(success);
+        let after = self.circuit_breaker().state();
+
+        if before != after {
+            let from = match before {
+                crate::core::CircuitState::Closed => "closed",
+                crate::core::CircuitState::Open => "open",
+                crate::core::CircuitState::HalfOpen => "half_open",
+            };
+            let to = match after {
+                crate::core::CircuitState::Closed => "closed",
+                crate::core::CircuitState::Open => "open",
+                crate::core::CircuitState::HalfOpen => "half_open",
+            };
+            RouterMetrics::record_cb_state_transition(self.url(), from, to);
+        }
+
+        let state_code = match self.circuit_breaker().state() {
+            crate::core::CircuitState::Closed => 0u8,
+            crate::core::CircuitState::Open => 1u8,
+            crate::core::CircuitState::HalfOpen => 2u8,
+        };
+        RouterMetrics::set_cb_state(self.url(), state_code);
+    }
+
+    // === DP-aware methods ===
+
+    /// Check if this worker is DP-aware
+    fn is_dp_aware(&self) -> bool {
+        false
+    }
+
+    /// Get the base URL without any DP rank suffix
+    fn base_url(&self) -> &str {
+        self.url()
+    }
+
+    /// Get DP rank if this is a DP-aware worker
+    fn dp_rank(&self) -> Option<usize> {
+        None
+    }
+
+    /// Get DP size if this worker is part of a DP group
+    fn dp_size(&self) -> Option<usize> {
+        None
+    }
+
+    /// Transform a request for DP-aware routing
+    async fn prepare_request(&self, req: serde_json::Value) -> WorkerResult<serde_json::Value> {
+        Ok(req)
+    }
+
+    /// Get the actual endpoint URL for requests
+    fn endpoint_url(&self, route: &str) -> String {
+        format!("{}{}", self.base_url(), route)
+    }
+
+    /// Check if this worker can handle a specific request
+    fn can_handle(&self, _req: &serde_json::Value) -> bool {
+        true
+    }
+}
+
+/// Connection mode for worker communication
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum ConnectionMode {
+    /// HTTP/REST connection
+    Http,
+    /// gRPC connection
+    Grpc {
+        /// Optional port for gRPC endpoint (if different from URL)
+        port: Option<u16>,
+    },
+}
+
+impl fmt::Display for ConnectionMode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            ConnectionMode::Http => write!(f, "HTTP"),
+            ConnectionMode::Grpc { port } => match port {
+                Some(p) => write!(f, "gRPC(port:{})", p),
+                None => write!(f, "gRPC"),
+            },
+        }
+    }
+}
+
+/// Worker type classification
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum WorkerType {
+    /// Regular worker for standard routing
+    Regular,
+    /// Prefill worker for PD disaggregated mode
+    Prefill {
+        /// Bootstrap port for communication with decode workers
+        bootstrap_port: Option<u16>,
+    },
+    /// Decode worker for PD disaggregated mode
+    Decode,
+}
+
+impl fmt::Display for WorkerType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            WorkerType::Regular => write!(f, "Regular"),
+            WorkerType::Prefill { bootstrap_port } => match bootstrap_port {
+                Some(port) => write!(f, "Prefill(bootstrap:{})", port),
+                None => write!(f, "Prefill"),
+            },
+            WorkerType::Decode => write!(f, "Decode"),
+        }
+    }
+}
+
+/// Health check configuration
+#[derive(Debug, Clone)]
+pub struct HealthConfig {
+    /// Timeout for health checks in seconds
+    pub timeout_secs: u64,
+    /// Interval between health checks in seconds
+    pub check_interval_secs: u64,
+    /// Health check endpoint path
+    pub endpoint: String,
+    /// Number of consecutive failures before marking unhealthy
+    pub failure_threshold: u32,
+    /// Number of consecutive successes before marking healthy
+    pub success_threshold: u32,
+}
+
+impl Default for HealthConfig {
+    fn default() -> Self {
+        Self {
+            timeout_secs: 5,
+            check_interval_secs: 30,
+            endpoint: "/health".to_string(),
+            failure_threshold: 3,
+            success_threshold: 2,
+        }
+    }
+}
+
+/// Metadata associated with a worker
+#[derive(Debug, Clone)]
+pub struct WorkerMetadata {
+    /// Worker URL
+    pub url: String,
+    /// Worker type
+    pub worker_type: WorkerType,
+    /// Connection mode
+    pub connection_mode: ConnectionMode,
+    /// Additional labels/tags
+    pub labels: std::collections::HashMap<String, String>,
+    /// Health check configuration
+    pub health_config: HealthConfig,
+}
+
+/// Basic worker implementation
+#[derive(Clone)]
+pub struct BasicWorker {
+    metadata: WorkerMetadata,
+    load_counter: Arc<AtomicUsize>,
+    processed_counter: Arc<AtomicUsize>,
+    healthy: Arc<AtomicBool>,
+    consecutive_failures: Arc<AtomicUsize>,
+    consecutive_successes: Arc<AtomicUsize>,
+    circuit_breaker: CircuitBreaker,
+    /// Optional gRPC client for gRPC workers
+    grpc_client: Option<Arc<Mutex<SglangSchedulerClient>>>,
+}
+
+impl fmt::Debug for BasicWorker {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("BasicWorker")
+            .field("metadata", &self.metadata)
+            .field("healthy", &self.healthy.load(Ordering::Relaxed))
+            .field("circuit_breaker", &self.circuit_breaker)
+            .field("has_grpc_client", &self.grpc_client.is_some())
+            .finish()
+    }
+}
+
+impl BasicWorker {
+    pub fn new(url: String, worker_type: WorkerType) -> Self {
+        Self::with_connection_mode(url, worker_type, ConnectionMode::Http)
+    }
+
+    pub fn with_connection_mode(
+        url: String,
+        worker_type: WorkerType,
+        connection_mode: ConnectionMode,
+    ) -> Self {
+        let metadata = WorkerMetadata {
+            url: url.clone(),
+            worker_type,
+            connection_mode,
+            labels: std::collections::HashMap::new(),
+            health_config: HealthConfig::default(),
+        };
+
+        Self {
+            metadata,
+            load_counter: Arc::new(AtomicUsize::new(0)),
+            processed_counter: Arc::new(AtomicUsize::new(0)),
+            healthy: Arc::new(AtomicBool::new(true)),
+            consecutive_failures: Arc::new(AtomicUsize::new(0)),
+            consecutive_successes: Arc::new(AtomicUsize::new(0)),
+            circuit_breaker: CircuitBreaker::new(),
+            grpc_client: None,
+        }
+    }
+
+    pub fn with_labels(mut self, labels: std::collections::HashMap<String, String>) -> Self {
+        self.metadata.labels = labels;
+        self
+    }
+
+    pub fn with_health_config(mut self, config: HealthConfig) -> Self {
+        self.metadata.health_config = config;
+        self
+    }
+
+    pub fn with_circuit_breaker_config(mut self, config: CircuitBreakerConfig) -> Self {
+        self.circuit_breaker = CircuitBreaker::with_config(config);
+        self
+    }
+
+    /// Set the gRPC client for gRPC workers
+    pub fn with_grpc_client(mut self, client: SglangSchedulerClient) -> Self {
+        self.grpc_client = Some(Arc::new(Mutex::new(client)));
+        self
+    }
+
+    pub fn normalised_url(&self) -> WorkerResult<&str> {
+        if self.url().contains("@") {
+            // Need to extract the URL from "http://host:port@dp_rank"
+            let parts: Vec<&str> = self.url().split('@').collect();
+            if parts.len() != 2 {
+                return Err(WorkerError::InvalidUrl {
+                    url: self.url().to_string(),
+                });
+            }
+            // Ensure the second part (the dp_rank) can be parsed as an integer
+            match parts[1].parse::<usize>() {
+                Ok(_) => Ok(parts[0]),
+                Err(_) => Err(WorkerError::InvalidUrl {
+                    url: self.url().to_string(),
+                }),
+            }
+        } else {
+            Ok(self.url())
+        }
+    }
+}
+
+#[async_trait]
+impl Worker for BasicWorker {
+    fn url(&self) -> &str {
+        &self.metadata.url
+    }
+
+    fn worker_type(&self) -> WorkerType {
+        self.metadata.worker_type.clone()
+    }
+
+    fn connection_mode(&self) -> ConnectionMode {
+        self.metadata.connection_mode.clone()
+    }
+
+    fn is_healthy(&self) -> bool {
+        self.healthy.load(Ordering::Acquire)
+    }
+
+    fn set_healthy(&self, healthy: bool) {
+        self.healthy.store(healthy, Ordering::Release);
+        RouterMetrics::set_worker_health(self.url(), healthy);
+    }
+
+    async fn check_health_async(&self) -> WorkerResult<()> {
+        use std::time::Duration;
+
+        let health_result = match &self.metadata.connection_mode {
+            ConnectionMode::Http => {
+                // Perform HTTP health check
+                let url = self.normalised_url()?;
+                let health_url = format!("{}{}", url, self.metadata.health_config.endpoint);
+                let timeout = Duration::from_secs(self.metadata.health_config.timeout_secs);
+
+                // Use the shared client with a custom timeout for this request
+                match WORKER_CLIENT.get(&health_url).timeout(timeout).send().await {
+                    Ok(response) => response.status().is_success(),
+                    Err(_) => false,
+                }
+            }
+            ConnectionMode::Grpc { .. } => {
+                // Perform gRPC health check
+                if let Some(grpc_client) = &self.grpc_client {
+                    let mut client = grpc_client.lock().await;
+                    match client.health_check().await {
+                        Ok(response) => {
+                            tracing::debug!(
+                                "gRPC health check succeeded for {}: healthy={}",
+                                self.metadata.url,
+                                response.healthy
+                            );
+                            response.healthy
+                        }
+                        Err(e) => {
+                            tracing::warn!(
+                                "gRPC health check RPC failed for {}: {:?}",
+                                self.metadata.url,
+                                e
+                            );
+                            false
+                        }
+                    }
+                } else {
+                    tracing::error!("No gRPC client available for worker {}", self.metadata.url);
+                    false
+                }
+            }
+        };
+
+        if health_result {
+            // Health check succeeded
+            self.consecutive_failures.store(0, Ordering::Release);
+            let successes = self.consecutive_successes.fetch_add(1, Ordering::AcqRel) + 1;
+
+            // Mark healthy if we've reached the success threshold
+            if !self.is_healthy()
+                && successes >= self.metadata.health_config.success_threshold as usize
+            {
+                self.set_healthy(true);
+                self.consecutive_successes.store(0, Ordering::Release);
+            }
+            Ok(())
+        } else {
+            // Health check failed
+            self.consecutive_successes.store(0, Ordering::Release);
+            let failures = self.consecutive_failures.fetch_add(1, Ordering::AcqRel) + 1;
+
+            // Mark unhealthy if we've reached the failure threshold
+            if self.is_healthy()
+                && failures >= self.metadata.health_config.failure_threshold as usize
+            {
+                self.set_healthy(false);
+                self.consecutive_failures.store(0, Ordering::Release);
+            }
+
+            Err(WorkerError::HealthCheckFailed {
+                url: self.metadata.url.clone(),
+                reason: format!("Health check failed (consecutive failures: {})", failures),
+            })
+        }
+    }
+
+    fn load(&self) -> usize {
+        self.load_counter.load(Ordering::Relaxed)
+    }
+
+    fn increment_load(&self) {
+        self.load_counter.fetch_add(1, Ordering::Relaxed);
+    }
+
+    fn decrement_load(&self) {
+        self.load_counter
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
+                current.checked_sub(1)
+            })
+            .ok();
+    }
+
+    fn reset_load(&self) {
+        self.load_counter.store(0, Ordering::Relaxed);
+    }
+
+    fn processed_requests(&self) -> usize {
+        self.processed_counter.load(Ordering::Relaxed)
+    }
+
+    fn increment_processed(&self) {
+        self.processed_counter.fetch_add(1, Ordering::Relaxed);
+    }
+
+    fn metadata(&self) -> &WorkerMetadata {
+        &self.metadata
+    }
+
+    fn clone_worker(&self) -> Box<dyn Worker> {
+        Box::new(self.clone())
+    }
+
+    fn circuit_breaker(&self) -> &CircuitBreaker {
+        &self.circuit_breaker
+    }
+}
+
+/// A DP-aware worker that handles data-parallel routing
+#[derive(Debug, Clone)]
+pub struct DPAwareWorker {
+    /// The underlying basic worker
+    base_worker: BasicWorker,
+    /// DP rank for this worker
+    dp_rank: usize,
+    /// Total DP size
+    dp_size: usize,
+    /// Base URL without DP suffix
+    base_url: String,
+}
+
+impl DPAwareWorker {
+    /// Create a new DP-aware worker of any type
+    pub fn new(base_url: String, dp_rank: usize, dp_size: usize, worker_type: WorkerType) -> Self {
+        // Create URL with DP rank suffix for identification
+        let worker_url = format!("{}@{}", base_url, dp_rank);
+        let base_worker = BasicWorker::new(worker_url, worker_type);
+
+        Self {
+            base_worker,
+            dp_rank,
+            dp_size,
+            base_url,
+        }
+    }
+}
+
+#[async_trait]
+impl Worker for DPAwareWorker {
+    fn url(&self) -> &str {
+        self.base_worker.url()
+    }
+
+    fn worker_type(&self) -> WorkerType {
+        self.base_worker.worker_type()
+    }
+
+    fn connection_mode(&self) -> ConnectionMode {
+        self.base_worker.connection_mode()
+    }
+
+    fn is_healthy(&self) -> bool {
+        self.base_worker.is_healthy()
+    }
+
+    fn set_healthy(&self, healthy: bool) {
+        self.base_worker.set_healthy(healthy);
+    }
+
+    async fn check_health_async(&self) -> WorkerResult<()> {
+        // Delegate to the base worker's health check logic
+        self.base_worker.check_health_async().await
+    }
+
+    fn load(&self) -> usize {
+        self.base_worker.load()
+    }
+
+    fn increment_load(&self) {
+        self.base_worker.increment_load();
+    }
+
+    fn decrement_load(&self) {
+        self.base_worker.decrement_load();
+    }
+
+    fn reset_load(&self) {
+        self.base_worker.reset_load();
+    }
+
+    fn processed_requests(&self) -> usize {
+        self.base_worker.processed_requests()
+    }
+
+    fn increment_processed(&self) {
+        self.base_worker.increment_processed();
+    }
+
+    fn metadata(&self) -> &WorkerMetadata {
+        self.base_worker.metadata()
+    }
+
+    fn clone_worker(&self) -> Box<dyn Worker> {
+        Box::new(self.clone())
+    }
+
+    fn circuit_breaker(&self) -> &CircuitBreaker {
+        self.base_worker.circuit_breaker()
+    }
+
+    // DP-aware specific implementations
+
+    fn is_dp_aware(&self) -> bool {
+        true
+    }
+
+    fn base_url(&self) -> &str {
+        &self.base_url
+    }
+
+    fn dp_rank(&self) -> Option<usize> {
+        Some(self.dp_rank)
+    }
+
+    fn dp_size(&self) -> Option<usize> {
+        Some(self.dp_size)
+    }
+
+    async fn prepare_request(&self, mut req: serde_json::Value) -> WorkerResult<serde_json::Value> {
+        // Inject data_parallel_rank into the request
+        if let Some(map) = req.as_object_mut() {
+            map.insert(
+                "data_parallel_rank".to_string(),
+                serde_json::json!(self.dp_rank),
+            );
+            Ok(req)
+        } else {
+            Err(WorkerError::InvalidConfiguration {
+                message: "Request must be a JSON object for DP-aware routing".to_string(),
+            })
+        }
+    }
+
+    fn endpoint_url(&self, route: &str) -> String {
+        // Use base URL for actual requests
+        format!("{}{}", self.base_url, route)
+    }
+}
+
+/// Worker factory for creating workers of different types
+pub struct WorkerFactory;
+
+impl WorkerFactory {
+    /// Create a regular worker
+    pub fn create_regular(url: String) -> Box<dyn Worker> {
+        Box::new(BasicWorker::new(url, WorkerType::Regular))
+    }
+
+    /// Create a regular worker with custom circuit breaker configuration
+    pub fn create_regular_with_config(
+        url: String,
+        circuit_breaker_config: CircuitBreakerConfig,
+    ) -> Box<dyn Worker> {
+        Box::new(
+            BasicWorker::new(url, WorkerType::Regular)
+                .with_circuit_breaker_config(circuit_breaker_config),
+        )
+    }
+
+    /// Create a prefill worker with optional bootstrap port
+    pub fn create_prefill(url: String, bootstrap_port: Option<u16>) -> Box<dyn Worker> {
+        Box::new(BasicWorker::new(
+            url,
+            WorkerType::Prefill { bootstrap_port },
+        ))
+    }
+
+    /// Create a prefill worker with custom circuit breaker configuration
+    pub fn create_prefill_with_config(
+        url: String,
+        bootstrap_port: Option<u16>,
+        circuit_breaker_config: CircuitBreakerConfig,
+    ) -> Box<dyn Worker> {
+        Box::new(
+            BasicWorker::new(url, WorkerType::Prefill { bootstrap_port })
+                .with_circuit_breaker_config(circuit_breaker_config),
+        )
+    }
+
+    /// Create a decode worker
+    pub fn create_decode(url: String) -> Box<dyn Worker> {
+        Box::new(BasicWorker::new(url, WorkerType::Decode))
+    }
+
+    /// Create a decode worker with custom circuit breaker configuration
+    pub fn create_decode_with_config(
+        url: String,
+        circuit_breaker_config: CircuitBreakerConfig,
+    ) -> Box<dyn Worker> {
+        Box::new(
+            BasicWorker::new(url, WorkerType::Decode)
+                .with_circuit_breaker_config(circuit_breaker_config),
+        )
+    }
+
+    /// Create workers from URLs with automatic type detection
+    #[allow(clippy::type_complexity)]
+    pub fn create_from_urls(
+        regular_urls: Vec<String>,
+        prefill_urls: Vec<(String, Option<u16>)>,
+        decode_urls: Vec<String>,
+    ) -> (
+        Vec<Box<dyn Worker>>,
+        Vec<Box<dyn Worker>>,
+        Vec<Box<dyn Worker>>,
+    ) {
+        let regular_workers: Vec<Box<dyn Worker>> =
+            regular_urls.into_iter().map(Self::create_regular).collect();
+
+        let prefill_workers: Vec<Box<dyn Worker>> = prefill_urls
+            .into_iter()
+            .map(|(url, port)| Self::create_prefill(url, port))
+            .collect();
+
+        let decode_workers: Vec<Box<dyn Worker>> =
+            decode_urls.into_iter().map(Self::create_decode).collect();
+
+        (regular_workers, prefill_workers, decode_workers)
+    }
+
+    /// Create a gRPC worker
+    pub fn create_grpc(url: String, worker_type: WorkerType, port: Option<u16>) -> Box<dyn Worker> {
+        Box::new(BasicWorker::with_connection_mode(
+            url,
+            worker_type,
+            ConnectionMode::Grpc { port },
+        ))
+    }
+
+    /// Create a gRPC worker with custom circuit breaker configuration
+    pub fn create_grpc_with_config(
+        url: String,
+        worker_type: WorkerType,
+        port: Option<u16>,
+        circuit_breaker_config: CircuitBreakerConfig,
+    ) -> Box<dyn Worker> {
+        Box::new(
+            BasicWorker::with_connection_mode(url, worker_type, ConnectionMode::Grpc { port })
+                .with_circuit_breaker_config(circuit_breaker_config),
+        )
+    }
+
+    /// Create a DP-aware worker of specified type
+    pub fn create_dp_aware(
+        base_url: String,
+        dp_rank: usize,
+        dp_size: usize,
+        worker_type: WorkerType,
+    ) -> Box<dyn Worker> {
+        Box::new(DPAwareWorker::new(base_url, dp_rank, dp_size, worker_type))
+    }
+
+    /// Get DP size from a worker
+    async fn get_worker_dp_size(url: &str, api_key: &Option<String>) -> WorkerResult<usize> {
+        let mut req_builder = WORKER_CLIENT.get(format!("{}/get_server_info", url));
+
+        if let Some(key) = api_key {
+            req_builder = req_builder.bearer_auth(key);
+        }
+
+        let response = req_builder
+            .send()
+            .await
+            .map_err(|e| WorkerError::NetworkError {
+                url: url.to_string(),
+                error: e.to_string(),
+            })?;
+
+        if !response.status().is_success() {
+            return Err(WorkerError::NetworkError {
+                url: url.to_string(),
+                error: format!("Server returned: {}", response.status()),
+            });
+        }
+
+        let info: serde_json::Value =
+            response
+                .json()
+                .await
+                .map_err(|e| WorkerError::NetworkError {
+                    url: url.to_string(),
+                    error: format!("Failed to parse JSON: {}", e),
+                })?;
+
+        let dp_size = info
+            .get("dp_size")
+            .and_then(|v| v.as_u64())
+            .ok_or_else(|| WorkerError::InvalidConfiguration {
+                message: "dp_size not found in server info".to_string(),
+            })?;
+
+        if dp_size > usize::MAX as u64 {
+            return Err(WorkerError::InvalidConfiguration {
+                message: format!("dp_size is too large: {}", dp_size),
+            });
+        }
+
+        Ok(dp_size as usize)
+    }
+
+    /// Private helper to create DP-aware workers of any type
+    async fn create_dp_aware_workers_of_type(
+        url: &str,
+        api_key: &Option<String>,
+        worker_type: WorkerType,
+    ) -> WorkerResult<Vec<Box<dyn Worker>>> {
+        let dp_size = Self::get_worker_dp_size(url, api_key).await?;
+
+        let workers = (0..dp_size)
+            .map(|rank| Self::create_dp_aware(url.to_string(), rank, dp_size, worker_type.clone()))
+            .collect();
+
+        Ok(workers)
+    }
+
+    /// Create DP-aware regular workers from a single URL
+    pub async fn create_dp_aware_regular_workers(
+        url: &str,
+        api_key: &Option<String>,
+    ) -> WorkerResult<Vec<Box<dyn Worker>>> {
+        Self::create_dp_aware_workers_of_type(url, api_key, WorkerType::Regular).await
+    }
+
+    /// Create DP-aware prefill workers from a single URL
+    pub async fn create_dp_aware_prefill_workers(
+        url: &str,
+        bootstrap_port: Option<u16>,
+        api_key: &Option<String>,
+    ) -> WorkerResult<Vec<Box<dyn Worker>>> {
+        Self::create_dp_aware_workers_of_type(url, api_key, WorkerType::Prefill { bootstrap_port })
+            .await
+    }
+
+    /// Create DP-aware decode workers from a single URL
+    pub async fn create_dp_aware_decode_workers(
+        url: &str,
+        api_key: &Option<String>,
+    ) -> WorkerResult<Vec<Box<dyn Worker>>> {
+        Self::create_dp_aware_workers_of_type(url, api_key, WorkerType::Decode).await
+    }
+
+    /// Create workers based on configuration (for regular router)
+    pub async fn create_workers(
+        urls: Vec<String>,
+        dp_aware: bool,
+        api_key: &Option<String>,
+    ) -> WorkerResult<Vec<Box<dyn Worker>>> {
+        if dp_aware {
+            // Create futures for all worker creations
+            let worker_futs = urls
+                .iter()
+                .map(|url| Self::create_dp_aware_regular_workers(url, api_key));
+
+            // Execute all futures concurrently and flatten results
+            let all_workers = futures::future::try_join_all(worker_futs)
+                .await?
+                .into_iter()
+                .flatten()
+                .collect();
+
+            Ok(all_workers)
+        } else {
+            Ok(urls
+                .into_iter()
+                .map(|url| Self::create_regular(url))
+                .collect())
+        }
+    }
+}
+
+/// Helper trait for collections of workers
+pub trait WorkerCollection {
+    fn healthy_workers(&self) -> Vec<&dyn Worker>;
+    fn total_load(&self) -> usize;
+    fn find_worker(&self, url: &str) -> Option<&dyn Worker>;
+    fn find_worker_mut(&mut self, url: &str) -> Option<&mut Box<dyn Worker>>;
+}
+
+impl WorkerCollection for Vec<Box<dyn Worker>> {
+    fn healthy_workers(&self) -> Vec<&dyn Worker> {
+        self.iter()
+            .filter(|w| w.is_healthy())
+            .map(|w| w.as_ref())
+            .collect()
+    }
+
+    fn total_load(&self) -> usize {
+        self.iter().map(|w| w.load()).sum()
+    }
+
+    fn find_worker(&self, url: &str) -> Option<&dyn Worker> {
+        self.iter().find(|w| w.url() == url).map(|w| w.as_ref())
+    }
+
+    fn find_worker_mut(&mut self, url: &str) -> Option<&mut Box<dyn Worker>> {
+        self.iter_mut().find(|w| w.url() == url)
+    }
+}
+
+/// Convert a list of worker URLs to worker trait objects
+pub fn urls_to_workers(urls: Vec<String>) -> Vec<Box<dyn Worker>> {
+    urls.into_iter()
+        .map(WorkerFactory::create_regular)
+        .collect()
+}
+
+/// Convert worker trait objects back to URLs
+pub fn workers_to_urls(workers: &[Box<dyn Worker>]) -> Vec<String> {
+    workers.iter().map(|w| w.url().to_string()).collect()
+}
+
+/// RAII guard for worker load management
+pub struct WorkerLoadGuard<'a> {
+    workers: Vec<&'a dyn Worker>,
+}
+
+impl<'a> WorkerLoadGuard<'a> {
+    /// Create a new load guard for a single worker
+    pub fn new(worker: &'a dyn Worker) -> Self {
+        worker.increment_load();
+        Self {
+            workers: vec![worker],
+        }
+    }
+
+    /// Create a new load guard for multiple workers
+    pub fn new_multi(workers: Vec<&'a dyn Worker>) -> Self {
+        // Increment load counters for all workers
+        for worker in &workers {
+            worker.increment_load();
+        }
+        Self { workers }
+    }
+}
+
+impl<'a> Drop for WorkerLoadGuard<'a> {
+    fn drop(&mut self) {
+        // Decrement load counters for all workers
+        for worker in &self.workers {
+            worker.decrement_load();
+        }
+    }
+}
+
+/// Health checker handle with graceful shutdown
+pub struct HealthChecker {
+    handle: tokio::task::JoinHandle<()>,
+    shutdown: Arc<AtomicBool>,
+}
+
+impl fmt::Debug for HealthChecker {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("HealthChecker")
+            .field("shutdown", &self.shutdown.load(Ordering::Relaxed))
+            .finish()
+    }
+}
+
+impl HealthChecker {
+    /// Shutdown the health checker gracefully
+    pub async fn shutdown(self) {
+        self.shutdown.store(true, Ordering::Release);
+        let _ = self.handle.await;
+    }
+}
+
+/// Start an async background health checker for a collection of workers
+pub fn start_health_checker(
+    workers: std::sync::Arc<std::sync::RwLock<Vec<Box<dyn Worker>>>>,
+    check_interval_secs: u64,
+) -> HealthChecker {
+    let shutdown = Arc::new(AtomicBool::new(false));
+    let shutdown_clone = shutdown.clone();
+
+    let handle = tokio::spawn(async move {
+        let mut interval =
+            tokio::time::interval(tokio::time::Duration::from_secs(check_interval_secs));
+
+        // Counter for periodic load reset (every 10 health check cycles)
+        let mut check_count = 0u64;
+        const LOAD_RESET_INTERVAL: u64 = 10;
+
+        loop {
+            interval.tick().await;
+
+            // Check for shutdown signal
+            if shutdown_clone.load(Ordering::Acquire) {
+                tracing::debug!("Health checker shutting down");
+                break;
+            }
+
+            check_count += 1;
+
+            // Check health of all workers
+            let workers_to_check = match workers.read() {
+                Ok(guard) => guard.iter().map(|w| w.clone_worker()).collect::<Vec<_>>(),
+                Err(poisoned) => {
+                    tracing::error!("Worker lock poisoned: {}", poisoned);
+                    continue;
+                }
+            };
+
+            // Periodically reset load counters to prevent drift
+            // Only do this when we believe all workers should be idle
+            if check_count.is_multiple_of(LOAD_RESET_INTERVAL) {
+                let max_load = workers_to_check.iter().map(|w| w.load()).max().unwrap_or(0);
+                // Only reset if load appears to be very low (likely drift)
+                if max_load <= 2 {
+                    tracing::debug!(
+                        "Resetting load counters to prevent drift (max_load: {})",
+                        max_load
+                    );
+                    for worker in &workers_to_check {
+                        worker.reset_load();
+                    }
+                }
+            }
+
+            // Perform health checks concurrently
+            let health_checks = workers_to_check.iter().map(|worker| {
+                let worker_url = worker.url().to_string();
+                let was_healthy = worker.is_healthy();
+
+                async move {
+                    match worker.check_health_async().await {
+                        Ok(_) => {
+                            if !was_healthy {
+                                tracing::info!("Worker {} is now healthy", worker_url);
+                            }
+                        }
+                        Err(e) => {
+                            if was_healthy {
+                                tracing::warn!("Worker {} health check failed: {}", worker_url, e);
+                            } else {
+                                // Worker was already unhealthy, log at debug level
+                                tracing::debug!("Worker {} remains unhealthy: {}", worker_url, e);
+                            }
+                        }
+                    }
+                }
+            });
+
+            // Execute all health checks concurrently
+            futures::future::join_all(health_checks).await;
+        }
+    });
+
+    HealthChecker { handle, shutdown }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::RwLock;
+    use std::thread;
+    use std::time::Duration;
+    use tokio::time::timeout;
+
+    // Test WorkerType
+    #[test]
+    fn test_worker_type_display() {
+        assert_eq!(WorkerType::Regular.to_string(), "Regular");
+        assert_eq!(
+            WorkerType::Prefill {
+                bootstrap_port: Some(8080)
+            }
+            .to_string(),
+            "Prefill(bootstrap:8080)"
+        );
+        assert_eq!(
+            WorkerType::Prefill {
+                bootstrap_port: None
+            }
+            .to_string(),
+            "Prefill"
+        );
+        assert_eq!(WorkerType::Decode.to_string(), "Decode");
+    }
+
+    #[test]
+    fn test_worker_type_equality() {
+        assert_eq!(WorkerType::Regular, WorkerType::Regular);
+        assert_ne!(WorkerType::Regular, WorkerType::Decode);
+        assert_eq!(
+            WorkerType::Prefill {
+                bootstrap_port: Some(8080)
+            },
+            WorkerType::Prefill {
+                bootstrap_port: Some(8080)
+            }
+        );
+        assert_ne!(
+            WorkerType::Prefill {
+                bootstrap_port: Some(8080)
+            },
+            WorkerType::Prefill {
+                bootstrap_port: Some(8081)
+            }
+        );
+    }
+
+    #[test]
+    fn test_worker_type_clone() {
+        let original = WorkerType::Prefill {
+            bootstrap_port: Some(8080),
+        };
+        let cloned = original.clone();
+        assert_eq!(original, cloned);
+    }
+
+    // Test HealthConfig
+    #[test]
+    fn test_health_config_default() {
+        let config = HealthConfig::default();
+        assert_eq!(config.timeout_secs, 5);
+        assert_eq!(config.check_interval_secs, 30);
+        assert_eq!(config.endpoint, "/health");
+        assert_eq!(config.failure_threshold, 3);
+        assert_eq!(config.success_threshold, 2);
+    }
+
+    #[test]
+    fn test_health_config_custom() {
+        let config = HealthConfig {
+            timeout_secs: 10,
+            check_interval_secs: 60,
+            endpoint: "/healthz".to_string(),
+            failure_threshold: 5,
+            success_threshold: 3,
+        };
+        assert_eq!(config.timeout_secs, 10);
+        assert_eq!(config.check_interval_secs, 60);
+        assert_eq!(config.endpoint, "/healthz");
+        assert_eq!(config.failure_threshold, 5);
+        assert_eq!(config.success_threshold, 3);
+    }
+
+    // Test BasicWorker
+    #[test]
+    fn test_basic_worker_creation() {
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+        assert_eq!(worker.url(), "http://test:8080");
+        assert_eq!(worker.worker_type(), WorkerType::Regular);
+        assert!(worker.is_healthy());
+        assert_eq!(worker.load(), 0);
+        assert_eq!(worker.processed_requests(), 0);
+    }
+
+    #[test]
+    fn test_worker_with_labels() {
+        let mut labels = std::collections::HashMap::new();
+        labels.insert("env".to_string(), "prod".to_string());
+        labels.insert("zone".to_string(), "us-west".to_string());
+
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular)
+            .with_labels(labels.clone());
+
+        assert_eq!(worker.metadata().labels, labels);
+    }
+
+    #[test]
+    fn test_worker_with_health_config() {
+        let custom_config = HealthConfig {
+            timeout_secs: 15,
+            check_interval_secs: 45,
+            endpoint: "/custom-health".to_string(),
+            failure_threshold: 4,
+            success_threshold: 2,
+        };
+
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular)
+            .with_health_config(custom_config.clone());
+
+        assert_eq!(worker.metadata().health_config.timeout_secs, 15);
+        assert_eq!(worker.metadata().health_config.check_interval_secs, 45);
+        assert_eq!(worker.metadata().health_config.endpoint, "/custom-health");
+    }
+
+    // Test Worker trait implementation
+    #[test]
+    fn test_worker_url() {
+        let worker = BasicWorker::new("http://worker1:8080".to_string(), WorkerType::Regular);
+        assert_eq!(worker.url(), "http://worker1:8080");
+    }
+
+    #[test]
+    fn test_worker_type_getter() {
+        let regular = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+        assert_eq!(regular.worker_type(), WorkerType::Regular);
+
+        let prefill = BasicWorker::new(
+            "http://test:8080".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(9090),
+            },
+        );
+        assert_eq!(
+            prefill.worker_type(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(9090)
+            }
+        );
+
+        let decode = BasicWorker::new("http://test:8080".to_string(), WorkerType::Decode);
+        assert_eq!(decode.worker_type(), WorkerType::Decode);
+    }
+
+    #[test]
+    fn test_health_status() {
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+
+        // Initial state is healthy
+        assert!(worker.is_healthy());
+
+        // Set unhealthy
+        worker.set_healthy(false);
+        assert!(!worker.is_healthy());
+
+        // Set healthy again
+        worker.set_healthy(true);
+        assert!(worker.is_healthy());
+    }
+
+    #[test]
+    fn test_load_counter_operations() {
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+
+        // Initial load is 0
+        assert_eq!(worker.load(), 0);
+
+        // Increment once
+        worker.increment_load();
+        assert_eq!(worker.load(), 1);
+
+        // Increment twice more
+        worker.increment_load();
+        worker.increment_load();
+        assert_eq!(worker.load(), 3);
+
+        // Decrement once
+        worker.decrement_load();
+        assert_eq!(worker.load(), 2);
+
+        // Decrement to 0
+        worker.decrement_load();
+        worker.decrement_load();
+        assert_eq!(worker.load(), 0);
+
+        // Decrement below 0 should stay at 0
+        worker.decrement_load();
+        assert_eq!(worker.load(), 0);
+    }
+
+    #[test]
+    fn test_processed_counter() {
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+
+        // Initial count is 0
+        assert_eq!(worker.processed_requests(), 0);
+
+        // Increment multiple times
+        for i in 1..=100 {
+            worker.increment_processed();
+            assert_eq!(worker.processed_requests(), i);
+        }
+    }
+
+    #[test]
+    fn test_clone_worker() {
+        let original = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+        original.increment_load();
+        original.increment_processed();
+        original.set_healthy(false);
+
+        let cloned = original.clone_worker();
+
+        // Verify cloned worker has same URL and type
+        assert_eq!(cloned.url(), original.url());
+        assert_eq!(cloned.worker_type(), original.worker_type());
+
+        // Load counters should be independent (cloned shares the Arc)
+        assert_eq!(cloned.load(), original.load());
+
+        // Modify original and verify clone is affected (shared state)
+        original.increment_load();
+        assert_eq!(cloned.load(), original.load());
+    }
+
+    // Test concurrent operations
+    #[tokio::test]
+    async fn test_concurrent_load_increments() {
+        let worker = Arc::new(BasicWorker::new(
+            "http://test:8080".to_string(),
+            WorkerType::Regular,
+        ));
+
+        let mut handles = vec![];
+
+        // Spawn 100 tasks incrementing load
+        for _ in 0..100 {
+            let worker_clone = Arc::clone(&worker);
+            let handle = tokio::spawn(async move {
+                worker_clone.increment_load();
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all tasks
+        for handle in handles {
+            handle.await.unwrap();
+        }
+
+        // Final count should be 100
+        assert_eq!(worker.load(), 100);
+    }
+
+    #[tokio::test]
+    async fn test_concurrent_load_decrements() {
+        let worker = Arc::new(BasicWorker::new(
+            "http://test:8080".to_string(),
+            WorkerType::Regular,
+        ));
+
+        // Set initial load to 100
+        for _ in 0..100 {
+            worker.increment_load();
+        }
+        assert_eq!(worker.load(), 100);
+
+        let mut handles = vec![];
+
+        // Spawn 100 tasks decrementing load
+        for _ in 0..100 {
+            let worker_clone = Arc::clone(&worker);
+            let handle = tokio::spawn(async move {
+                worker_clone.decrement_load();
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all tasks
+        for handle in handles {
+            handle.await.unwrap();
+        }
+
+        // Final count should be 0
+        assert_eq!(worker.load(), 0);
+    }
+
+    #[tokio::test]
+    async fn test_concurrent_health_updates() {
+        let worker = Arc::new(BasicWorker::new(
+            "http://test:8080".to_string(),
+            WorkerType::Regular,
+        ));
+
+        let mut handles = vec![];
+
+        // Spawn threads randomly setting health status
+        for i in 0..100 {
+            let worker_clone = Arc::clone(&worker);
+            let handle = tokio::spawn(async move {
+                worker_clone.set_healthy(i % 2 == 0);
+                tokio::time::sleep(Duration::from_micros(10)).await;
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all tasks
+        for handle in handles {
+            handle.await.unwrap();
+        }
+    }
+
+    // Test WorkerFactory
+    #[test]
+    fn test_create_regular_worker() {
+        let worker = WorkerFactory::create_regular("http://regular:8080".to_string());
+        assert_eq!(worker.url(), "http://regular:8080");
+        assert_eq!(worker.worker_type(), WorkerType::Regular);
+    }
+
+    #[test]
+    fn test_create_prefill_worker() {
+        // With bootstrap port
+        let worker1 = WorkerFactory::create_prefill("http://prefill:8080".to_string(), Some(9090));
+        assert_eq!(worker1.url(), "http://prefill:8080");
+        assert_eq!(
+            worker1.worker_type(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(9090)
+            }
+        );
+
+        // Without bootstrap port
+        let worker2 = WorkerFactory::create_prefill("http://prefill:8080".to_string(), None);
+        assert_eq!(
+            worker2.worker_type(),
+            WorkerType::Prefill {
+                bootstrap_port: None
+            }
+        );
+    }
+
+    #[test]
+    fn test_create_decode_worker() {
+        let worker = WorkerFactory::create_decode("http://decode:8080".to_string());
+        assert_eq!(worker.url(), "http://decode:8080");
+        assert_eq!(worker.worker_type(), WorkerType::Decode);
+    }
+
+    #[test]
+    fn test_create_from_urls() {
+        let regular_urls = vec![
+            "http://regular1:8080".to_string(),
+            "http://regular2:8080".to_string(),
+        ];
+        let prefill_urls = vec![
+            ("http://prefill1:8080".to_string(), Some(9090)),
+            ("http://prefill2:8080".to_string(), None),
+        ];
+        let decode_urls = vec![
+            "http://decode1:8080".to_string(),
+            "http://decode2:8080".to_string(),
+        ];
+
+        let (regular, prefill, decode) =
+            WorkerFactory::create_from_urls(regular_urls, prefill_urls, decode_urls);
+
+        assert_eq!(regular.len(), 2);
+        assert_eq!(prefill.len(), 2);
+        assert_eq!(decode.len(), 2);
+
+        assert_eq!(regular[0].url(), "http://regular1:8080");
+        assert_eq!(prefill[0].url(), "http://prefill1:8080");
+        assert_eq!(decode[0].url(), "http://decode1:8080");
+    }
+
+    // Test WorkerCollection trait
+    #[test]
+    fn test_healthy_workers_filter() {
+        let workers: Vec<Box<dyn Worker>> = vec![
+            WorkerFactory::create_regular("http://w1:8080".to_string()),
+            WorkerFactory::create_regular("http://w2:8080".to_string()),
+            WorkerFactory::create_regular("http://w3:8080".to_string()),
+        ];
+
+        // Set some workers unhealthy
+        workers[0].set_healthy(false);
+        workers[2].set_healthy(false);
+
+        let healthy = workers.healthy_workers();
+        assert_eq!(healthy.len(), 1);
+        assert_eq!(healthy[0].url(), "http://w2:8080");
+    }
+
+    #[test]
+    fn test_total_load_calculation() {
+        let workers: Vec<Box<dyn Worker>> = vec![
+            WorkerFactory::create_regular("http://w1:8080".to_string()),
+            WorkerFactory::create_regular("http://w2:8080".to_string()),
+            WorkerFactory::create_regular("http://w3:8080".to_string()),
+        ];
+
+        // Set different loads
+        workers[0].increment_load();
+        workers[0].increment_load(); // load = 2
+
+        workers[1].increment_load();
+        workers[1].increment_load();
+        workers[1].increment_load(); // load = 3
+
+        workers[2].increment_load(); // load = 1
+
+        assert_eq!(workers.total_load(), 6);
+    }
+
+    #[test]
+    fn test_find_worker() {
+        let workers: Vec<Box<dyn Worker>> = vec![
+            WorkerFactory::create_regular("http://w1:8080".to_string()),
+            WorkerFactory::create_regular("http://w2:8080".to_string()),
+            WorkerFactory::create_regular("http://w3:8080".to_string()),
+        ];
+
+        // Found case
+        let found = workers.find_worker("http://w2:8080");
+        assert!(found.is_some());
+        assert_eq!(found.unwrap().url(), "http://w2:8080");
+
+        // Not found case
+        let not_found = workers.find_worker("http://w4:8080");
+        assert!(not_found.is_none());
+    }
+
+    #[test]
+    fn test_find_worker_mut() {
+        let mut workers: Vec<Box<dyn Worker>> = vec![
+            WorkerFactory::create_regular("http://w1:8080".to_string()),
+            WorkerFactory::create_regular("http://w2:8080".to_string()),
+        ];
+
+        // Find and modify
+        if let Some(worker) = workers.find_worker_mut("http://w1:8080") {
+            worker.set_healthy(false);
+        }
+
+        // Verify modification
+        assert!(!workers[0].is_healthy());
+        assert!(workers[1].is_healthy());
+    }
+
+    // Test WorkerLoadGuard
+    #[test]
+    fn test_load_guard_single_worker() {
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+        assert_eq!(worker.load(), 0);
+
+        {
+            let _guard = WorkerLoadGuard::new(&worker);
+            assert_eq!(worker.load(), 1);
+        }
+
+        // Guard dropped, load decremented
+        assert_eq!(worker.load(), 0);
+    }
+
+    #[test]
+    fn test_load_guard_multiple_workers() {
+        let workers: Vec<Box<dyn Worker>> = vec![
+            WorkerFactory::create_regular("http://w1:8080".to_string()),
+            WorkerFactory::create_regular("http://w2:8080".to_string()),
+            WorkerFactory::create_regular("http://w3:8080".to_string()),
+        ];
+
+        let worker_refs: Vec<&dyn Worker> = workers.iter().map(|w| w.as_ref()).collect();
+
+        {
+            let _guard = WorkerLoadGuard::new_multi(worker_refs);
+            // All loads incremented
+            assert_eq!(workers[0].load(), 1);
+            assert_eq!(workers[1].load(), 1);
+            assert_eq!(workers[2].load(), 1);
+        }
+
+        // All loads decremented
+        assert_eq!(workers[0].load(), 0);
+        assert_eq!(workers[1].load(), 0);
+        assert_eq!(workers[2].load(), 0);
+    }
+
+    #[test]
+    fn test_load_guard_panic_safety() {
+        let worker = Arc::new(BasicWorker::new(
+            "http://test:8080".to_string(),
+            WorkerType::Regular,
+        ));
+        assert_eq!(worker.load(), 0);
+
+        // Clone for use inside catch_unwind
+        let worker_clone = Arc::clone(&worker);
+
+        // Use AssertUnwindSafe wrapper for the test
+        // This is safe because we're only testing the load counter behavior,
+        // not the grpc_client which is None for HTTP workers
+        use std::panic::AssertUnwindSafe;
+
+        // This will panic, but the guard should still clean up
+        let result = std::panic::catch_unwind(AssertUnwindSafe(|| {
+            let _guard = WorkerLoadGuard::new(worker_clone.as_ref());
+            assert_eq!(worker_clone.load(), 1);
+            panic!("Test panic");
+        }));
+
+        // Verify panic occurred
+        assert!(result.is_err());
+
+        // Load should be decremented even after panic
+        assert_eq!(worker.load(), 0);
+    }
+
+    // Test helper functions
+    #[test]
+    fn test_urls_to_workers() {
+        let urls = vec!["http://w1:8080".to_string(), "http://w2:8080".to_string()];
+
+        let workers = urls_to_workers(urls);
+        assert_eq!(workers.len(), 2);
+        assert_eq!(workers[0].url(), "http://w1:8080");
+        assert_eq!(workers[1].url(), "http://w2:8080");
+        assert_eq!(workers[0].worker_type(), WorkerType::Regular);
+    }
+
+    #[test]
+    fn test_workers_to_urls() {
+        let workers: Vec<Box<dyn Worker>> = vec![
+            WorkerFactory::create_regular("http://w1:8080".to_string()),
+            WorkerFactory::create_regular("http://w2:8080".to_string()),
+        ];
+
+        let urls = workers_to_urls(&workers);
+        assert_eq!(urls, vec!["http://w1:8080", "http://w2:8080"]);
+    }
+
+    // Test synchronous health check wrapper
+    #[test]
+    fn test_check_health_sync_wrapper() {
+        // We can't easily test the actual HTTP call without mocking,
+        // but we can verify the sync wrapper works
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+
+        // This will fail because there's no server at this URL,
+        // but it tests that the sync wrapper doesn't panic
+        let result = worker.check_health();
+        assert!(result.is_err());
+    }
+
+    // Test HealthChecker background task
+    #[tokio::test]
+    async fn test_health_checker_startup() {
+        let workers = Arc::new(RwLock::new(vec![WorkerFactory::create_regular(
+            "http://w1:8080".to_string(),
+        )]));
+
+        let checker = start_health_checker(workers.clone(), 60);
+
+        // Verify it starts without panic
+        tokio::time::sleep(Duration::from_millis(100)).await;
+
+        // Shutdown
+        checker.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_health_checker_shutdown() {
+        let workers = Arc::new(RwLock::new(vec![WorkerFactory::create_regular(
+            "http://w1:8080".to_string(),
+        )]));
+
+        let checker = start_health_checker(workers.clone(), 60);
+
+        // Shutdown should complete quickly
+        let shutdown_result = timeout(Duration::from_secs(1), checker.shutdown()).await;
+        assert!(shutdown_result.is_ok());
+    }
+
+    // Performance test for load counter
+    #[test]
+    fn test_load_counter_performance() {
+        use std::time::Instant;
+
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+        let iterations = 1_000_000;
+
+        let start = Instant::now();
+        for _ in 0..iterations {
+            worker.increment_load();
+        }
+        let duration = start.elapsed();
+
+        let ops_per_sec = iterations as f64 / duration.as_secs_f64();
+        println!("Load counter operations per second: {:.0}", ops_per_sec);
+
+        // Should be well over 1M ops/sec
+        assert!(ops_per_sec > 1_000_000.0);
+    }
+
+    // ===== Tests for DPAwareWorker =====
+
+    #[test]
+    fn test_dp_aware_worker_creation() {
+        let dp_worker =
+            DPAwareWorker::new("http://worker1:8080".to_string(), 2, 4, WorkerType::Regular);
+
+        assert_eq!(dp_worker.url(), "http://worker1:8080@2");
+        assert_eq!(dp_worker.base_url(), "http://worker1:8080");
+        assert!(dp_worker.is_dp_aware());
+        assert_eq!(dp_worker.dp_rank(), Some(2));
+        assert_eq!(dp_worker.dp_size(), Some(4));
+        assert_eq!(dp_worker.worker_type(), WorkerType::Regular);
+    }
+
+    #[test]
+    fn test_dp_aware_worker_creation_prefill() {
+        let dp_worker = DPAwareWorker::new(
+            "http://worker1:8080".to_string(),
+            1,
+            2,
+            WorkerType::Prefill {
+                bootstrap_port: Some(9090),
+            },
+        );
+
+        assert_eq!(dp_worker.url(), "http://worker1:8080@1");
+        assert!(dp_worker.is_dp_aware());
+        assert_eq!(
+            dp_worker.worker_type(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(9090)
+            }
+        );
+    }
+
+    #[test]
+    fn test_dp_aware_worker_creation_decode() {
+        let dp_worker =
+            DPAwareWorker::new("http://worker1:8080".to_string(), 0, 4, WorkerType::Decode);
+
+        assert_eq!(dp_worker.url(), "http://worker1:8080@0");
+        assert!(dp_worker.is_dp_aware());
+        assert_eq!(dp_worker.worker_type(), WorkerType::Decode);
+    }
+
+    #[tokio::test]
+    async fn test_dp_aware_prepare_request() {
+        let dp_worker =
+            DPAwareWorker::new("http://worker1:8080".to_string(), 3, 8, WorkerType::Regular);
+
+        let original_req = serde_json::json!({
+            "prompt": "Hello",
+            "max_tokens": 100
+        });
+
+        let prepared_req = dp_worker.prepare_request(original_req).await.unwrap();
+
+        assert_eq!(prepared_req["prompt"], "Hello");
+        assert_eq!(prepared_req["max_tokens"], 100);
+        assert_eq!(prepared_req["data_parallel_rank"], 3);
+    }
+
+    #[tokio::test]
+    async fn test_dp_aware_prepare_request_invalid() {
+        let dp_worker =
+            DPAwareWorker::new("http://worker1:8080".to_string(), 0, 4, WorkerType::Regular);
+
+        // Non-object JSON should fail
+        let invalid_req = serde_json::json!("not an object");
+        let result = dp_worker.prepare_request(invalid_req).await;
+
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            WorkerError::InvalidConfiguration { message } => {
+                assert!(message.contains("JSON object"));
+            }
+            _ => panic!("Expected InvalidConfiguration error"),
+        }
+    }
+
+    #[test]
+    fn test_dp_aware_endpoint_url() {
+        let dp_worker =
+            DPAwareWorker::new("http://worker1:8080".to_string(), 1, 4, WorkerType::Regular);
+
+        assert_eq!(
+            dp_worker.endpoint_url("/generate"),
+            "http://worker1:8080/generate"
+        );
+        assert_eq!(
+            dp_worker.endpoint_url("/health"),
+            "http://worker1:8080/health"
+        );
+    }
+
+    #[test]
+    fn test_dp_aware_worker_delegated_methods() {
+        let dp_worker =
+            DPAwareWorker::new("http://worker1:8080".to_string(), 0, 2, WorkerType::Regular);
+
+        // Test health status
+        assert!(dp_worker.is_healthy());
+        dp_worker.set_healthy(false);
+        assert!(!dp_worker.is_healthy());
+
+        // Test load tracking
+        assert_eq!(dp_worker.load(), 0);
+        dp_worker.increment_load();
+        assert_eq!(dp_worker.load(), 1);
+        dp_worker.decrement_load();
+        assert_eq!(dp_worker.load(), 0);
+
+        // Test processed tracking
+        assert_eq!(dp_worker.processed_requests(), 0);
+        dp_worker.increment_processed();
+        assert_eq!(dp_worker.processed_requests(), 1);
+    }
+
+    // ===== Tests for WorkerFactory async methods =====
+
+    #[tokio::test]
+    async fn test_factory_create_dp_aware() {
+        let worker = WorkerFactory::create_dp_aware(
+            "http://worker1:8080".to_string(),
+            1,
+            4,
+            WorkerType::Regular,
+        );
+
+        assert_eq!(worker.url(), "http://worker1:8080@1");
+        assert!(worker.is_dp_aware());
+        assert_eq!(worker.dp_rank(), Some(1));
+        assert_eq!(worker.dp_size(), Some(4));
+        assert_eq!(worker.worker_type(), WorkerType::Regular);
+    }
+
+    #[tokio::test]
+    async fn test_factory_create_dp_aware_prefill() {
+        let worker = WorkerFactory::create_dp_aware(
+            "http://worker1:8080".to_string(),
+            0,
+            2,
+            WorkerType::Prefill {
+                bootstrap_port: Some(8090),
+            },
+        );
+
+        assert_eq!(worker.url(), "http://worker1:8080@0");
+        assert!(worker.is_dp_aware());
+        assert_eq!(
+            worker.worker_type(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(8090)
+            }
+        );
+    }
+
+    #[tokio::test]
+    async fn test_factory_create_workers_regular() {
+        let urls = vec!["http://w1:8080".to_string(), "http://w2:8080".to_string()];
+
+        let workers = WorkerFactory::create_workers(urls, false, &None)
+            .await
+            .unwrap();
+
+        assert_eq!(workers.len(), 2);
+        assert!(!workers[0].is_dp_aware());
+        assert!(!workers[1].is_dp_aware());
+        assert_eq!(workers[0].url(), "http://w1:8080");
+        assert_eq!(workers[1].url(), "http://w2:8080");
+    }
+
+    // ===== Circuit Breaker Integration Tests =====
+
+    #[test]
+    fn test_worker_circuit_breaker() {
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular);
+
+        // Initial state should be available
+        assert!(worker.is_available());
+        assert_eq!(
+            worker.circuit_breaker().state(),
+            crate::core::CircuitState::Closed
+        );
+
+        // Record some failures
+        worker.record_outcome(false);
+        worker.record_outcome(false);
+
+        // Still available (default threshold is 5)
+        assert!(worker.is_available());
+
+        // Record more failures to open circuit
+        worker.record_outcome(false);
+        worker.record_outcome(false);
+        worker.record_outcome(false);
+
+        // Circuit should be open, worker not available
+        assert!(!worker.is_available());
+        assert!(worker.is_healthy()); // Still healthy
+        assert!(!worker.circuit_breaker().can_execute()); // But circuit is open
+    }
+
+    #[test]
+    fn test_worker_with_circuit_breaker_config() {
+        let config = crate::core::CircuitBreakerConfig {
+            failure_threshold: 2,
+            success_threshold: 1,
+            timeout_duration: Duration::from_millis(100),
+            window_duration: Duration::from_secs(60),
+        };
+
+        let worker = BasicWorker::new("http://test:8080".to_string(), WorkerType::Regular)
+            .with_circuit_breaker_config(config);
+
+        // Should open after 2 failures
+        worker.record_outcome(false);
+        assert!(worker.is_available());
+        worker.record_outcome(false);
+        assert!(!worker.is_available());
+
+        // Wait for timeout
+        thread::sleep(Duration::from_millis(150));
+
+        // Should be half-open
+        assert!(worker.is_available());
+        assert_eq!(
+            worker.circuit_breaker().state(),
+            crate::core::CircuitState::HalfOpen
+        );
+
+        // Success should close it
+        worker.record_outcome(true);
+        assert_eq!(
+            worker.circuit_breaker().state(),
+            crate::core::CircuitState::Closed
+        );
+    }
+
+    #[test]
+    fn test_dp_aware_worker_circuit_breaker() {
+        let dp_worker =
+            DPAwareWorker::new("http://worker:8080".to_string(), 0, 2, WorkerType::Regular);
+
+        // Should have circuit breaker
+        assert!(dp_worker.is_available());
+
+        // Record failures
+        for _ in 0..5 {
+            dp_worker.record_outcome(false);
+        }
+
+        // Should not be available
+        assert!(!dp_worker.is_available());
+        assert_eq!(
+            dp_worker.circuit_breaker().state(),
+            crate::core::CircuitState::Open
+        );
+    }
+
+    // ===== Integration tests =====
+
+    #[tokio::test]
+    async fn test_mixed_worker_types() {
+        // Create a mix of worker types
+        let regular = WorkerFactory::create_regular("http://regular:8080".to_string());
+        let prefill = WorkerFactory::create_prefill("http://prefill:8080".to_string(), Some(9090));
+        let decode = WorkerFactory::create_decode("http://decode:8080".to_string());
+        let dp_aware_regular =
+            WorkerFactory::create_dp_aware("http://dp:8080".to_string(), 0, 2, WorkerType::Regular);
+        let dp_aware_prefill = WorkerFactory::create_dp_aware(
+            "http://dp-prefill:8080".to_string(),
+            1,
+            2,
+            WorkerType::Prefill {
+                bootstrap_port: None,
+            },
+        );
+        let dp_aware_decode = WorkerFactory::create_dp_aware(
+            "http://dp-decode:8080".to_string(),
+            0,
+            4,
+            WorkerType::Decode,
+        );
+
+        let workers: Vec<Box<dyn Worker>> = vec![
+            regular,
+            prefill,
+            decode,
+            dp_aware_regular,
+            dp_aware_prefill,
+            dp_aware_decode,
+        ];
+
+        // Test that they all implement Worker trait properly
+        for worker in &workers {
+            assert!(worker.is_healthy());
+            assert_eq!(worker.load(), 0);
+            assert_eq!(worker.processed_requests(), 0);
+        }
+
+        // Test specific behaviors
+        assert!(!workers[0].is_dp_aware()); // regular
+        assert!(!workers[1].is_dp_aware()); // prefill
+        assert!(!workers[2].is_dp_aware()); // decode
+        assert!(workers[3].is_dp_aware()); // dp_aware_regular
+        assert!(workers[4].is_dp_aware()); // dp_aware_prefill
+        assert!(workers[5].is_dp_aware()); // dp_aware_decode
+
+        // Test worker types
+        assert_eq!(workers[0].worker_type(), WorkerType::Regular);
+        assert_eq!(
+            workers[1].worker_type(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(9090)
+            }
+        );
+        assert_eq!(workers[2].worker_type(), WorkerType::Decode);
+        assert_eq!(workers[3].worker_type(), WorkerType::Regular);
+        assert_eq!(
+            workers[4].worker_type(),
+            WorkerType::Prefill {
+                bootstrap_port: None
+            }
+        );
+        assert_eq!(workers[5].worker_type(), WorkerType::Decode);
+    }
+}
diff --git a/sgl-router/src/grpc/client.rs b/sgl-router/src/grpc/client.rs
new file mode 100644
index 000000000..8561b79db
--- /dev/null
+++ b/sgl-router/src/grpc/client.rs
@@ -0,0 +1,327 @@
+use std::time::Duration;
+use tonic::{transport::Channel, Request};
+use tracing::debug;
+
+// Include the generated protobuf code
+pub mod proto {
+    tonic::include_proto!("sglang.grpc.scheduler");
+}
+
+// The generated module structure depends on the package name in the .proto file
+// package sglang.grpc.scheduler; generates a nested module structure
+
+/// gRPC client for SGLang scheduler
+pub struct SglangSchedulerClient {
+    client: proto::sglang_scheduler_client::SglangSchedulerClient<Channel>,
+}
+
+impl SglangSchedulerClient {
+    /// Create a new client and connect to the scheduler
+    pub async fn connect(endpoint: &str) -> Result<Self, Box<dyn std::error::Error>> {
+        debug!("Connecting to SGLang scheduler at {}", endpoint);
+
+        // Convert grpc:// to http:// for tonic
+        let http_endpoint = if endpoint.starts_with("grpc://") {
+            endpoint.replace("grpc://", "http://")
+        } else {
+            endpoint.to_string()
+        };
+
+        let channel = Channel::from_shared(http_endpoint)?
+            .timeout(Duration::from_secs(30))
+            .connect()
+            .await?;
+
+        let client = proto::sglang_scheduler_client::SglangSchedulerClient::new(channel);
+
+        Ok(Self { client })
+    }
+
+    /// Initialize the connection
+    pub async fn initialize(
+        &mut self,
+        client_id: String,
+    ) -> Result<proto::InitializeResponse, Box<dyn std::error::Error>> {
+        let request = Request::new(proto::InitializeRequest {
+            client_id,
+            client_version: "0.1.0".to_string(),
+            mode: proto::initialize_request::Mode::Regular as i32,
+        });
+
+        let response = self.client.initialize(request).await?;
+        Ok(response.into_inner())
+    }
+
+    /// Submit a generation request (returns streaming response)
+    pub async fn generate_stream(
+        &mut self,
+        req: proto::GenerateRequest,
+    ) -> Result<tonic::Streaming<proto::GenerateResponse>, Box<dyn std::error::Error>> {
+        let request = Request::new(req);
+        let response = self.client.generate(request).await?;
+        Ok(response.into_inner())
+    }
+
+    /// Perform health check
+    pub async fn health_check(
+        &mut self,
+    ) -> Result<proto::HealthCheckResponse, Box<dyn std::error::Error>> {
+        debug!("Sending health check request");
+        let request = Request::new(proto::HealthCheckRequest {
+            include_detailed_metrics: false,
+        });
+
+        let response = self.client.health_check(request).await?;
+        debug!("Health check response received");
+        Ok(response.into_inner())
+    }
+
+    /// Abort a request
+    pub async fn abort_request(
+        &mut self,
+        request_id: String,
+        reason: String,
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        let request = Request::new(proto::AbortRequest { request_id, reason });
+
+        self.client.abort(request).await?;
+        Ok(())
+    }
+
+    /// Flush cache
+    pub async fn flush_cache(
+        &mut self,
+        flush_all: bool,
+        session_ids: &[String],
+    ) -> Result<proto::FlushCacheResponse, Box<dyn std::error::Error>> {
+        let request = Request::new(proto::FlushCacheRequest {
+            flush_all,
+            session_ids: session_ids.to_vec(),
+        });
+
+        let response = self.client.flush_cache(request).await?;
+        Ok(response.into_inner())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_proto_types_compilation() {
+        // Test that protobuf types can be constructed
+        let init_req = proto::InitializeRequest {
+            client_id: "test-client".to_string(),
+            client_version: "0.1.0".to_string(),
+            mode: 0,
+        };
+        assert_eq!(init_req.client_id, "test-client");
+        assert_eq!(init_req.client_version, "0.1.0");
+        assert_eq!(init_req.mode, 0);
+    }
+
+    #[test]
+    fn test_generate_request_construction() {
+        let sampling_params = proto::SamplingParams {
+            temperature: 0.7,
+            max_new_tokens: 128,
+            top_p: 0.9,
+            top_k: 50,
+            stop: vec!["</s>".to_string()],
+            ..Default::default()
+        };
+
+        let gen_req = proto::GenerateRequest {
+            request_id: "test-req-123".to_string(),
+            input: Some(proto::generate_request::Input::Text(
+                "Hello world".to_string(),
+            )),
+            sampling_params: Some(sampling_params),
+            return_logprob: true,
+            logprob_start_len: 0,
+            top_logprobs_num: 5,
+            ..Default::default()
+        };
+
+        assert_eq!(gen_req.request_id, "test-req-123");
+        if let Some(proto::generate_request::Input::Text(text)) = &gen_req.input {
+            assert_eq!(text, "Hello world");
+        }
+        assert!(gen_req.return_logprob);
+        assert_eq!(gen_req.top_logprobs_num, 5);
+
+        let params = gen_req.sampling_params.unwrap();
+        assert_eq!(params.temperature, 0.7);
+        assert_eq!(params.max_new_tokens, 128);
+        assert_eq!(params.stop, vec!["</s>"]);
+    }
+
+    #[test]
+    fn test_health_check_request() {
+        let health_req = proto::HealthCheckRequest {
+            include_detailed_metrics: true,
+        };
+        assert!(health_req.include_detailed_metrics);
+    }
+
+    #[test]
+    fn test_abort_request_construction() {
+        let abort_req = proto::AbortRequest {
+            request_id: "req-456".to_string(),
+            reason: "User canceled".to_string(),
+        };
+        assert_eq!(abort_req.request_id, "req-456");
+        assert_eq!(abort_req.reason, "User canceled");
+    }
+
+    #[test]
+    fn test_flush_cache_request() {
+        let flush_req = proto::FlushCacheRequest {
+            flush_all: true,
+            session_ids: vec!["session1".to_string(), "session2".to_string()],
+        };
+        assert!(flush_req.flush_all);
+        assert_eq!(flush_req.session_ids.len(), 2);
+        assert_eq!(flush_req.session_ids[0], "session1");
+    }
+
+    #[test]
+    fn test_sampling_params_defaults() {
+        let params = proto::SamplingParams::default();
+        assert_eq!(params.temperature, 0.0);
+        assert_eq!(params.max_new_tokens, 0);
+        assert_eq!(params.top_p, 0.0);
+        assert_eq!(params.top_k, 0);
+        assert!(params.stop.is_empty());
+    }
+
+    #[test]
+    fn test_multimodal_inputs() {
+        let mm_inputs = proto::MultimodalInputs {
+            image_urls: vec!["http://example.com/image.jpg".to_string()],
+            video_urls: vec![],
+            audio_urls: vec![],
+            image_data: vec![],
+            video_data: vec![],
+            audio_data: vec![],
+            modalities: vec!["image".to_string()],
+            ..Default::default()
+        };
+
+        assert_eq!(mm_inputs.image_urls.len(), 1);
+        assert_eq!(mm_inputs.image_urls[0], "http://example.com/image.jpg");
+        assert_eq!(mm_inputs.modalities[0], "image");
+    }
+
+    #[test]
+    fn test_session_params() {
+        let session_params = proto::SessionParams {
+            session_id: "sess-789".to_string(),
+            request_id: "req-101".to_string(),
+            offset: 100,
+            replace: true,
+            drop_previous_output: false,
+        };
+
+        assert_eq!(session_params.session_id, "sess-789");
+        assert_eq!(session_params.request_id, "req-101");
+        assert_eq!(session_params.offset, 100);
+        assert!(session_params.replace);
+        assert!(!session_params.drop_previous_output);
+    }
+
+    #[test]
+    fn test_embed_request() {
+        let embed_req = proto::EmbedRequest {
+            request_id: "embed-req-202".to_string(),
+            input: Some(proto::embed_request::Input::Text(
+                "This is a test sentence for embedding".to_string(),
+            )),
+            log_metrics: true,
+            data_parallel_rank: 0,
+            ..Default::default()
+        };
+
+        assert_eq!(embed_req.request_id, "embed-req-202");
+        if let Some(proto::embed_request::Input::Text(text)) = &embed_req.input {
+            assert_eq!(text, "This is a test sentence for embedding");
+        }
+        assert!(embed_req.log_metrics);
+        assert_eq!(embed_req.data_parallel_rank, 0);
+    }
+
+    #[tokio::test]
+    async fn test_client_connect_invalid_endpoint() {
+        // Test connecting to an invalid endpoint should return error
+        let result = SglangSchedulerClient::connect("invalid://endpoint").await;
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_tokenized_input() {
+        let tokenized = proto::TokenizedInput {
+            original_text: "Hello world".to_string(),
+            input_ids: vec![1, 15043, 1917, 2],
+        };
+
+        assert_eq!(tokenized.original_text, "Hello world");
+        assert_eq!(tokenized.input_ids, vec![1, 15043, 1917, 2]);
+    }
+
+    // Test response type construction
+    #[test]
+    fn test_generate_stream_chunk() {
+        let chunk = proto::GenerateStreamChunk {
+            token_id: 1234,
+            text: " world".to_string(),
+            prompt_tokens: 5,
+            completion_tokens: 2,
+            cached_tokens: 3,
+            generation_time: 0.025,
+            queue_time: 10,
+            ..Default::default()
+        };
+
+        assert_eq!(chunk.token_id, 1234);
+        assert_eq!(chunk.text, " world");
+        assert_eq!(chunk.prompt_tokens, 5);
+        assert_eq!(chunk.completion_tokens, 2);
+        assert_eq!(chunk.cached_tokens, 3);
+        assert_eq!(chunk.generation_time, 0.025);
+        assert_eq!(chunk.queue_time, 10);
+    }
+
+    #[test]
+    fn test_model_info() {
+        let model_info = proto::ModelInfo {
+            model_name: "Meta-Llama-3-8B-Instruct".to_string(),
+            max_context_length: 8192,
+            vocab_size: 128256,
+            supports_tool_calling: true,
+            supports_vision: false,
+            special_tokens: vec![
+                "<|begin_of_text|>".to_string(),
+                "<|end_of_text|>".to_string(),
+            ],
+            model_type: "llama".to_string(),
+            num_layers: 32,
+            hidden_size: 4096,
+            num_attention_heads: 32,
+            num_key_value_heads: 8,
+            tokenizer_type: "llama".to_string(),
+            eos_token_ids: vec![128001, 128009],
+            pad_token_id: 128001,
+            bos_token_id: 128000,
+        };
+
+        assert_eq!(model_info.model_name, "Meta-Llama-3-8B-Instruct");
+        assert_eq!(model_info.max_context_length, 8192);
+        assert_eq!(model_info.vocab_size, 128256);
+        assert!(model_info.supports_tool_calling);
+        assert!(!model_info.supports_vision);
+        assert_eq!(model_info.special_tokens.len(), 2);
+        assert_eq!(model_info.num_layers, 32);
+        assert_eq!(model_info.eos_token_ids, vec![128001, 128009]);
+    }
+}
diff --git a/sgl-router/src/grpc/mod.rs b/sgl-router/src/grpc/mod.rs
new file mode 100644
index 000000000..331a6a538
--- /dev/null
+++ b/sgl-router/src/grpc/mod.rs
@@ -0,0 +1,8 @@
+//! gRPC client module for communicating with SGLang scheduler
+//!
+//! This module provides a gRPC client implementation for the SGLang router.
+
+pub mod client;
+
+// Re-export the client
+pub use client::{proto, SglangSchedulerClient};
diff --git a/sgl-router/src/lib.rs b/sgl-router/src/lib.rs
new file mode 100644
index 000000000..938a1ba0c
--- /dev/null
+++ b/sgl-router/src/lib.rs
@@ -0,0 +1,508 @@
+use pyo3::prelude::*;
+pub mod config;
+pub mod logging;
+use std::collections::HashMap;
+
+pub mod core;
+#[cfg(feature = "grpc-client")]
+pub mod grpc;
+pub mod mcp;
+pub mod metrics;
+pub mod middleware;
+pub mod policies;
+pub mod protocols;
+pub mod reasoning_parser;
+pub mod routers;
+pub mod server;
+pub mod service_discovery;
+pub mod tokenizer;
+pub mod tool_parser;
+pub mod tree;
+use crate::metrics::PrometheusConfig;
+
+#[pyclass(eq)]
+#[derive(Clone, PartialEq, Debug)]
+pub enum PolicyType {
+    Random,
+    RoundRobin,
+    CacheAware,
+    PowerOfTwo,
+}
+
+#[pyclass]
+#[derive(Debug, Clone, PartialEq)]
+struct Router {
+    host: String,
+    port: u16,
+    worker_urls: Vec<String>,
+    policy: PolicyType,
+    worker_startup_timeout_secs: u64,
+    worker_startup_check_interval: u64,
+    cache_threshold: f32,
+    balance_abs_threshold: usize,
+    balance_rel_threshold: f32,
+    eviction_interval_secs: u64,
+    max_tree_size: usize,
+    max_payload_size: usize,
+    dp_aware: bool,
+    api_key: Option<String>,
+    log_dir: Option<String>,
+    log_level: Option<String>,
+    service_discovery: bool,
+    selector: HashMap<String, String>,
+    service_discovery_port: u16,
+    service_discovery_namespace: Option<String>,
+    prefill_selector: HashMap<String, String>,
+    decode_selector: HashMap<String, String>,
+    bootstrap_port_annotation: String,
+    prometheus_port: Option<u16>,
+    prometheus_host: Option<String>,
+    request_timeout_secs: u64,
+    request_id_headers: Option<Vec<String>>,
+    pd_disaggregation: bool,
+    prefill_urls: Option<Vec<(String, Option<u16>)>>,
+    decode_urls: Option<Vec<String>>,
+    prefill_policy: Option<PolicyType>,
+    decode_policy: Option<PolicyType>,
+    max_concurrent_requests: usize,
+    cors_allowed_origins: Vec<String>,
+    // Retry configuration
+    retry_max_retries: u32,
+    retry_initial_backoff_ms: u64,
+    retry_max_backoff_ms: u64,
+    retry_backoff_multiplier: f32,
+    retry_jitter_factor: f32,
+    disable_retries: bool,
+    // Circuit breaker configuration
+    cb_failure_threshold: u32,
+    cb_success_threshold: u32,
+    cb_timeout_duration_secs: u64,
+    cb_window_duration_secs: u64,
+    disable_circuit_breaker: bool,
+    // Health check configuration
+    health_failure_threshold: u32,
+    health_success_threshold: u32,
+    health_check_timeout_secs: u64,
+    health_check_interval_secs: u64,
+    health_check_endpoint: String,
+    // IGW (Inference Gateway) configuration
+    enable_igw: bool,
+    queue_size: usize,
+    queue_timeout_secs: u64,
+    rate_limit_tokens_per_second: Option<usize>,
+    // Connection mode (determined from worker URLs)
+    connection_mode: config::ConnectionMode,
+    // Model path for tokenizer
+    model_path: Option<String>,
+    // Explicit tokenizer path
+    tokenizer_path: Option<String>,
+}
+
+impl Router {
+    /// Determine connection mode from worker URLs
+    fn determine_connection_mode(worker_urls: &[String]) -> config::ConnectionMode {
+        // Only consider it gRPC if explicitly specified with grpc:// or grpcs:// scheme
+        for url in worker_urls {
+            if url.starts_with("grpc://") || url.starts_with("grpcs://") {
+                return config::ConnectionMode::Grpc;
+            }
+        }
+        // Default to HTTP for all other cases (including http://, https://, or no scheme)
+        config::ConnectionMode::Http
+    }
+
+    /// Convert PyO3 Router to RouterConfig
+    pub fn to_router_config(&self) -> config::ConfigResult<config::RouterConfig> {
+        use config::{
+            DiscoveryConfig, MetricsConfig, PolicyConfig as ConfigPolicyConfig, RoutingMode,
+        };
+
+        // Convert policy helper function
+        let convert_policy = |policy: &PolicyType| -> ConfigPolicyConfig {
+            match policy {
+                PolicyType::Random => ConfigPolicyConfig::Random,
+                PolicyType::RoundRobin => ConfigPolicyConfig::RoundRobin,
+                PolicyType::CacheAware => ConfigPolicyConfig::CacheAware {
+                    cache_threshold: self.cache_threshold,
+                    balance_abs_threshold: self.balance_abs_threshold,
+                    balance_rel_threshold: self.balance_rel_threshold,
+                    eviction_interval_secs: self.eviction_interval_secs,
+                    max_tree_size: self.max_tree_size,
+                },
+                PolicyType::PowerOfTwo => ConfigPolicyConfig::PowerOfTwo {
+                    load_check_interval_secs: 5, // Default value
+                },
+            }
+        };
+
+        // Determine routing mode
+        let mode = if self.enable_igw {
+            // IGW mode - routing mode is not used in IGW, but we need to provide a placeholder
+            RoutingMode::Regular {
+                worker_urls: vec![],
+            }
+        } else if self.pd_disaggregation {
+            RoutingMode::PrefillDecode {
+                prefill_urls: self.prefill_urls.clone().unwrap_or_default(),
+                decode_urls: self.decode_urls.clone().unwrap_or_default(),
+                prefill_policy: self.prefill_policy.as_ref().map(convert_policy),
+                decode_policy: self.decode_policy.as_ref().map(convert_policy),
+            }
+        } else {
+            RoutingMode::Regular {
+                worker_urls: self.worker_urls.clone(),
+            }
+        };
+
+        // Convert main policy
+        let policy = convert_policy(&self.policy);
+
+        // Service discovery configuration
+        let discovery = if self.service_discovery {
+            Some(DiscoveryConfig {
+                enabled: true,
+                namespace: self.service_discovery_namespace.clone(),
+                port: self.service_discovery_port,
+                check_interval_secs: 60,
+                selector: self.selector.clone(),
+                prefill_selector: self.prefill_selector.clone(),
+                decode_selector: self.decode_selector.clone(),
+                bootstrap_port_annotation: self.bootstrap_port_annotation.clone(),
+            })
+        } else {
+            None
+        };
+
+        // Metrics configuration
+        let metrics = match (self.prometheus_port, self.prometheus_host.as_ref()) {
+            (Some(port), Some(host)) => Some(MetricsConfig {
+                port,
+                host: host.clone(),
+            }),
+            _ => None,
+        };
+
+        Ok(config::RouterConfig {
+            mode,
+            policy,
+            host: self.host.clone(),
+            port: self.port,
+            connection_mode: self.connection_mode.clone(),
+            max_payload_size: self.max_payload_size,
+            request_timeout_secs: self.request_timeout_secs,
+            worker_startup_timeout_secs: self.worker_startup_timeout_secs,
+            worker_startup_check_interval_secs: self.worker_startup_check_interval,
+            dp_aware: self.dp_aware,
+            api_key: self.api_key.clone(),
+            discovery,
+            metrics,
+            log_dir: self.log_dir.clone(),
+            log_level: self.log_level.clone(),
+            request_id_headers: self.request_id_headers.clone(),
+            max_concurrent_requests: self.max_concurrent_requests,
+            queue_size: self.queue_size,
+            queue_timeout_secs: self.queue_timeout_secs,
+            rate_limit_tokens_per_second: self.rate_limit_tokens_per_second,
+            cors_allowed_origins: self.cors_allowed_origins.clone(),
+            retry: config::RetryConfig {
+                max_retries: self.retry_max_retries,
+                initial_backoff_ms: self.retry_initial_backoff_ms,
+                max_backoff_ms: self.retry_max_backoff_ms,
+                backoff_multiplier: self.retry_backoff_multiplier,
+                jitter_factor: self.retry_jitter_factor,
+            },
+            circuit_breaker: config::CircuitBreakerConfig {
+                failure_threshold: self.cb_failure_threshold,
+                success_threshold: self.cb_success_threshold,
+                timeout_duration_secs: self.cb_timeout_duration_secs,
+                window_duration_secs: self.cb_window_duration_secs,
+            },
+            disable_retries: self.disable_retries,
+            disable_circuit_breaker: self.disable_circuit_breaker,
+            health_check: config::HealthCheckConfig {
+                failure_threshold: self.health_failure_threshold,
+                success_threshold: self.health_success_threshold,
+                timeout_secs: self.health_check_timeout_secs,
+                check_interval_secs: self.health_check_interval_secs,
+                endpoint: self.health_check_endpoint.clone(),
+            },
+            enable_igw: self.enable_igw,
+            model_path: self.model_path.clone(),
+            tokenizer_path: self.tokenizer_path.clone(),
+        })
+    }
+}
+
+#[pymethods]
+impl Router {
+    #[new]
+    #[pyo3(signature = (
+        worker_urls,
+        policy = PolicyType::RoundRobin,
+        host = String::from("127.0.0.1"),
+        port = 3001,
+        worker_startup_timeout_secs = 600,
+        worker_startup_check_interval = 30,
+        cache_threshold = 0.3,
+        balance_abs_threshold = 64,
+        balance_rel_threshold = 1.5,
+        eviction_interval_secs = 120,
+        max_tree_size = 2usize.pow(26),
+        max_payload_size = 512 * 1024 * 1024,  // 512MB default for large batches
+        dp_aware = false,
+        api_key = None,
+        log_dir = None,
+        log_level = None,
+        service_discovery = false,
+        selector = HashMap::new(),
+        service_discovery_port = 80,
+        service_discovery_namespace = None,
+        prefill_selector = HashMap::new(),
+        decode_selector = HashMap::new(),
+        bootstrap_port_annotation = String::from("sglang.ai/bootstrap-port"),
+        prometheus_port = None,
+        prometheus_host = None,
+        request_timeout_secs = 1800,  // Add configurable request timeout
+        request_id_headers = None,  // Custom request ID headers
+        pd_disaggregation = false,  // New flag for PD mode
+        prefill_urls = None,
+        decode_urls = None,
+        prefill_policy = None,
+        decode_policy = None,
+        max_concurrent_requests = 256,
+        cors_allowed_origins = vec![],
+        // Retry defaults
+        retry_max_retries = 5,
+        retry_initial_backoff_ms = 50,
+        retry_max_backoff_ms = 30_000,
+        retry_backoff_multiplier = 1.5,
+        retry_jitter_factor = 0.2,
+        disable_retries = false,
+        // Circuit breaker defaults
+        cb_failure_threshold = 10,
+        cb_success_threshold = 3,
+        cb_timeout_duration_secs = 60,
+        cb_window_duration_secs = 120,
+        disable_circuit_breaker = false,
+        // Health check defaults
+        health_failure_threshold = 3,
+        health_success_threshold = 2,
+        health_check_timeout_secs = 5,
+        health_check_interval_secs = 60,
+        health_check_endpoint = String::from("/health"),
+        // IGW defaults
+        enable_igw = false,
+        queue_size = 100,
+        queue_timeout_secs = 60,
+        rate_limit_tokens_per_second = None,
+        // Tokenizer defaults
+        model_path = None,
+        tokenizer_path = None,
+    ))]
+    #[allow(clippy::too_many_arguments)]
+    fn new(
+        worker_urls: Vec<String>,
+        policy: PolicyType,
+        host: String,
+        port: u16,
+        worker_startup_timeout_secs: u64,
+        worker_startup_check_interval: u64,
+        cache_threshold: f32,
+        balance_abs_threshold: usize,
+        balance_rel_threshold: f32,
+        eviction_interval_secs: u64,
+        max_tree_size: usize,
+        max_payload_size: usize,
+        dp_aware: bool,
+        api_key: Option<String>,
+        log_dir: Option<String>,
+        log_level: Option<String>,
+        service_discovery: bool,
+        selector: HashMap<String, String>,
+        service_discovery_port: u16,
+        service_discovery_namespace: Option<String>,
+        prefill_selector: HashMap<String, String>,
+        decode_selector: HashMap<String, String>,
+        bootstrap_port_annotation: String,
+        prometheus_port: Option<u16>,
+        prometheus_host: Option<String>,
+        request_timeout_secs: u64,
+        request_id_headers: Option<Vec<String>>,
+        pd_disaggregation: bool,
+        prefill_urls: Option<Vec<(String, Option<u16>)>>,
+        decode_urls: Option<Vec<String>>,
+        prefill_policy: Option<PolicyType>,
+        decode_policy: Option<PolicyType>,
+        max_concurrent_requests: usize,
+        cors_allowed_origins: Vec<String>,
+        retry_max_retries: u32,
+        retry_initial_backoff_ms: u64,
+        retry_max_backoff_ms: u64,
+        retry_backoff_multiplier: f32,
+        retry_jitter_factor: f32,
+        disable_retries: bool,
+        cb_failure_threshold: u32,
+        cb_success_threshold: u32,
+        cb_timeout_duration_secs: u64,
+        cb_window_duration_secs: u64,
+        disable_circuit_breaker: bool,
+        health_failure_threshold: u32,
+        health_success_threshold: u32,
+        health_check_timeout_secs: u64,
+        health_check_interval_secs: u64,
+        health_check_endpoint: String,
+        enable_igw: bool,
+        queue_size: usize,
+        queue_timeout_secs: u64,
+        rate_limit_tokens_per_second: Option<usize>,
+        model_path: Option<String>,
+        tokenizer_path: Option<String>,
+    ) -> PyResult<Self> {
+        // Determine connection mode from worker URLs
+        let mut all_urls = worker_urls.clone();
+
+        // Add prefill URLs if in PD mode
+        if let Some(ref prefill_urls) = prefill_urls {
+            for (url, _) in prefill_urls {
+                all_urls.push(url.clone());
+            }
+        }
+
+        // Add decode URLs if in PD mode
+        if let Some(ref decode_urls) = decode_urls {
+            all_urls.extend(decode_urls.clone());
+        }
+
+        let connection_mode = Self::determine_connection_mode(&all_urls);
+
+        Ok(Router {
+            host,
+            port,
+            worker_urls,
+            policy,
+            worker_startup_timeout_secs,
+            worker_startup_check_interval,
+            cache_threshold,
+            balance_abs_threshold,
+            balance_rel_threshold,
+            eviction_interval_secs,
+            max_tree_size,
+            max_payload_size,
+            dp_aware,
+            api_key,
+            log_dir,
+            log_level,
+            service_discovery,
+            selector,
+            service_discovery_port,
+            service_discovery_namespace,
+            prefill_selector,
+            decode_selector,
+            bootstrap_port_annotation,
+            prometheus_port,
+            prometheus_host,
+            request_timeout_secs,
+            request_id_headers,
+            pd_disaggregation,
+            prefill_urls,
+            decode_urls,
+            prefill_policy,
+            decode_policy,
+            max_concurrent_requests,
+            cors_allowed_origins,
+            retry_max_retries,
+            retry_initial_backoff_ms,
+            retry_max_backoff_ms,
+            retry_backoff_multiplier,
+            retry_jitter_factor,
+            disable_retries,
+            cb_failure_threshold,
+            cb_success_threshold,
+            cb_timeout_duration_secs,
+            cb_window_duration_secs,
+            disable_circuit_breaker,
+            health_failure_threshold,
+            health_success_threshold,
+            health_check_timeout_secs,
+            health_check_interval_secs,
+            health_check_endpoint,
+            enable_igw,
+            queue_size,
+            queue_timeout_secs,
+            rate_limit_tokens_per_second,
+            connection_mode,
+            model_path,
+            tokenizer_path,
+        })
+    }
+
+    fn start(&self) -> PyResult<()> {
+        // Convert to RouterConfig and validate
+        let router_config = self.to_router_config().map_err(|e| {
+            pyo3::exceptions::PyValueError::new_err(format!("Configuration error: {}", e))
+        })?;
+
+        // Validate the configuration
+        router_config.validate().map_err(|e| {
+            pyo3::exceptions::PyValueError::new_err(format!(
+                "Configuration validation failed: {}",
+                e
+            ))
+        })?;
+
+        // Create service discovery config if enabled
+        let service_discovery_config = if self.service_discovery {
+            Some(service_discovery::ServiceDiscoveryConfig {
+                enabled: true,
+                selector: self.selector.clone(),
+                check_interval: std::time::Duration::from_secs(60),
+                port: self.service_discovery_port,
+                namespace: self.service_discovery_namespace.clone(),
+                pd_mode: self.pd_disaggregation,
+                prefill_selector: self.prefill_selector.clone(),
+                decode_selector: self.decode_selector.clone(),
+                bootstrap_port_annotation: self.bootstrap_port_annotation.clone(),
+            })
+        } else {
+            None
+        };
+
+        // Create Prometheus config if enabled
+        let prometheus_config = Some(PrometheusConfig {
+            port: self.prometheus_port.unwrap_or(29000),
+            host: self
+                .prometheus_host
+                .clone()
+                .unwrap_or_else(|| "127.0.0.1".to_string()),
+        });
+
+        // Use tokio runtime instead of actix-web System for better compatibility
+        let runtime = tokio::runtime::Runtime::new()
+            .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?;
+
+        // Block on the async startup function
+        runtime.block_on(async move {
+            server::startup(server::ServerConfig {
+                host: self.host.clone(),
+                port: self.port,
+                router_config,
+                max_payload_size: self.max_payload_size,
+                log_dir: self.log_dir.clone(),
+                log_level: self.log_level.clone(),
+                service_discovery_config,
+                prometheus_config,
+                request_timeout_secs: self.request_timeout_secs,
+                request_id_headers: self.request_id_headers.clone(),
+            })
+            .await
+            .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))
+        })
+    }
+}
+
+#[pymodule]
+fn sglang_router_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PolicyType>()?;
+    m.add_class::<Router>()?;
+    Ok(())
+}
diff --git a/sgl-router/src/logging.rs b/sgl-router/src/logging.rs
new file mode 100644
index 000000000..5c5b63e0e
--- /dev/null
+++ b/sgl-router/src/logging.rs
@@ -0,0 +1,163 @@
+use std::path::PathBuf;
+use tracing::Level;
+use tracing_appender::non_blocking::WorkerGuard;
+use tracing_appender::rolling::{RollingFileAppender, Rotation};
+use tracing_log::LogTracer;
+use tracing_subscriber::fmt::time::ChronoUtc;
+use tracing_subscriber::layer::SubscriberExt;
+use tracing_subscriber::util::SubscriberInitExt;
+use tracing_subscriber::{EnvFilter, Layer};
+
+/// Configuration for the logging system
+#[derive(Debug, Clone)]
+pub struct LoggingConfig {
+    /// Log level for the application (default: INFO)
+    pub level: Level,
+    /// Whether to use json format for logs (default: false)
+    pub json_format: bool,
+    /// Path to store log files. If None, logs will only go to stdout/stderr
+    pub log_dir: Option<String>,
+    /// Whether to colorize logs when output is a terminal (default: true)
+    pub colorize: bool,
+    /// Log file name to use if log_dir is specified (default: "sgl-router")
+    pub log_file_name: String,
+    /// Custom log targets to filter (default: "sglang_router_rs")
+    pub log_targets: Option<Vec<String>>,
+}
+
+impl Default for LoggingConfig {
+    fn default() -> Self {
+        Self {
+            level: Level::INFO,
+            json_format: false,
+            log_dir: None,
+            colorize: true,
+            log_file_name: "sgl-router".to_string(),
+            log_targets: Some(vec!["sglang_router_rs".to_string()]),
+        }
+    }
+}
+
+/// Guard that keeps the file appender worker thread alive
+///
+/// This must be kept in scope for the duration of the program
+/// to ensure logs are properly written to files
+#[allow(dead_code)]
+pub struct LogGuard {
+    _file_guard: Option<WorkerGuard>,
+}
+
+/// Initialize the logging system with the given configuration
+///
+/// # Arguments
+/// * `config` - Configuration for the logging system
+///
+/// # Returns
+/// A LogGuard that must be kept alive for the duration of the program
+///
+/// # Panics
+/// Will not panic, as initialization errors are handled gracefully
+pub fn init_logging(config: LoggingConfig) -> LogGuard {
+    // Forward logs to tracing - ignore errors to allow for multiple initialization
+    let _ = LogTracer::init();
+
+    // Convert log level to filter string
+    let level_filter = match config.level {
+        Level::TRACE => "trace",
+        Level::DEBUG => "debug",
+        Level::INFO => "info",
+        Level::WARN => "warn",
+        Level::ERROR => "error",
+    };
+
+    // Create env filter
+    let env_filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| {
+        // Format: <target>=<level>,<target2>=<level2>,...
+        let filter_string = if let Some(targets) = &config.log_targets {
+            targets
+                .iter()
+                .enumerate()
+                .map(|(i, target)| {
+                    if i > 0 {
+                        format!(",{}={}", target, level_filter)
+                    } else {
+                        format!("{}={}", target, level_filter)
+                    }
+                })
+                .collect::<String>()
+        } else {
+            format!("sglang_router_rs={}", level_filter)
+        };
+
+        EnvFilter::new(filter_string)
+    });
+
+    // Setup stdout/stderr layer
+    let mut layers = Vec::new();
+
+    // Standard timestamp format: YYYY-MM-DD HH:MM:SS
+    let time_format = "%Y-%m-%d %H:%M:%S".to_string();
+
+    // Configure the console stdout layer
+    let stdout_layer = tracing_subscriber::fmt::layer()
+        .with_ansi(config.colorize)
+        .with_file(true)
+        .with_line_number(true)
+        .with_timer(ChronoUtc::new(time_format.clone()));
+
+    let stdout_layer = if config.json_format {
+        stdout_layer.json().flatten_event(true).boxed()
+    } else {
+        stdout_layer.boxed()
+    };
+
+    layers.push(stdout_layer);
+
+    // Create a file appender if log_dir is specified
+    let mut file_guard = None;
+
+    if let Some(log_dir) = &config.log_dir {
+        let file_name = config.log_file_name.clone();
+        let log_dir = PathBuf::from(log_dir);
+
+        // Create log directory if it doesn't exist
+        if !log_dir.exists() {
+            if let Err(e) = std::fs::create_dir_all(&log_dir) {
+                eprintln!("Failed to create log directory: {}", e);
+                return LogGuard { _file_guard: None };
+            }
+        }
+
+        let file_appender = RollingFileAppender::new(Rotation::DAILY, log_dir, file_name);
+
+        let (non_blocking, guard) = tracing_appender::non_blocking(file_appender);
+        file_guard = Some(guard);
+
+        let file_layer = tracing_subscriber::fmt::layer()
+            .with_ansi(false) // Never use ANSI colors in log files
+            .with_file(true)
+            .with_line_number(true)
+            .with_timer(ChronoUtc::new(time_format))
+            .with_writer(non_blocking);
+
+        let file_layer = if config.json_format {
+            file_layer.json().flatten_event(true).boxed()
+        } else {
+            file_layer.boxed()
+        };
+
+        layers.push(file_layer);
+    }
+
+    // Initialize the subscriber with all layers
+    // Use try_init to handle errors gracefully in case another subscriber is already set
+    let _ = tracing_subscriber::registry()
+        .with(env_filter)
+        .with(layers)
+        .try_init();
+
+    // Return the guard to keep the file appender worker thread alive
+    LogGuard {
+        _file_guard: file_guard,
+    }
+}
diff --git a/sgl-router/src/main.rs b/sgl-router/src/main.rs
new file mode 100644
index 000000000..8ec24a722
--- /dev/null
+++ b/sgl-router/src/main.rs
@@ -0,0 +1,636 @@
+use clap::{ArgAction, Parser, ValueEnum};
+use sglang_router_rs::config::{
+    CircuitBreakerConfig, ConfigError, ConfigResult, ConnectionMode, DiscoveryConfig,
+    HealthCheckConfig, MetricsConfig, PolicyConfig, RetryConfig, RouterConfig, RoutingMode,
+};
+use sglang_router_rs::metrics::PrometheusConfig;
+use sglang_router_rs::server::{self, ServerConfig};
+use sglang_router_rs::service_discovery::ServiceDiscoveryConfig;
+use std::collections::HashMap;
+
+// Helper function to parse prefill arguments from command line
+fn parse_prefill_args() -> Vec<(String, Option<u16>)> {
+    let args: Vec<String> = std::env::args().collect();
+    let mut prefill_entries = Vec::new();
+    let mut i = 0;
+
+    while i < args.len() {
+        if args[i] == "--prefill" && i + 1 < args.len() {
+            let url = args[i + 1].clone();
+            let bootstrap_port = if i + 2 < args.len() && !args[i + 2].starts_with("--") {
+                // Check if next arg is a port number
+                if let Ok(port) = args[i + 2].parse::<u16>() {
+                    i += 1; // Skip the port argument
+                    Some(port)
+                } else if args[i + 2].to_lowercase() == "none" {
+                    i += 1; // Skip the "none" argument
+                    None
+                } else {
+                    None
+                }
+            } else {
+                None
+            };
+            prefill_entries.push((url, bootstrap_port));
+            i += 2; // Skip --prefill and URL
+        } else {
+            i += 1;
+        }
+    }
+
+    prefill_entries
+}
+
+#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
+pub enum Backend {
+    #[value(name = "sglang")]
+    Sglang,
+    #[value(name = "vllm")]
+    Vllm,
+    #[value(name = "trtllm")]
+    Trtllm,
+    #[value(name = "openai")]
+    Openai,
+    #[value(name = "anthropic")]
+    Anthropic,
+}
+
+impl std::fmt::Display for Backend {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let s = match self {
+            Backend::Sglang => "sglang",
+            Backend::Vllm => "vllm",
+            Backend::Trtllm => "trtllm",
+            Backend::Openai => "openai",
+            Backend::Anthropic => "anthropic",
+        };
+        write!(f, "{}", s)
+    }
+}
+
+#[derive(Parser, Debug)]
+#[command(name = "sglang-router")]
+#[command(about = "SGLang Router - High-performance request distribution across worker nodes")]
+#[command(long_about = r#"
+SGLang Router - High-performance request distribution across worker nodes
+
+Usage:
+This launcher enables starting a router with individual worker instances. It is useful for
+multi-node setups or when you want to start workers and router separately.
+
+Examples:
+  # Regular mode
+  sglang-router --worker-urls http://worker1:8000 http://worker2:8000
+
+  # PD disaggregated mode with same policy for both
+  sglang-router --pd-disaggregation \
+    --prefill http://127.0.0.1:30001 9001 \
+    --prefill http://127.0.0.2:30002 9002 \
+    --decode http://127.0.0.3:30003 \
+    --decode http://127.0.0.4:30004 \
+    --policy cache_aware
+
+  # PD mode with different policies for prefill and decode
+  sglang-router --pd-disaggregation \
+    --prefill http://127.0.0.1:30001 9001 \
+    --prefill http://127.0.0.2:30002 \
+    --decode http://127.0.0.3:30003 \
+    --decode http://127.0.0.4:30004 \
+    --prefill-policy cache_aware --decode-policy power_of_two
+
+"#)]
+struct CliArgs {
+    /// Host address to bind the router server
+    #[arg(long, default_value = "127.0.0.1")]
+    host: String,
+
+    /// Port number to bind the router server
+    #[arg(long, default_value_t = 30000)]
+    port: u16,
+
+    /// List of worker URLs (e.g., http://worker1:8000 http://worker2:8000)
+    #[arg(long, num_args = 0..)]
+    worker_urls: Vec<String>,
+
+    /// Load balancing policy to use
+    #[arg(long, default_value = "cache_aware", value_parser = ["random", "round_robin", "cache_aware", "power_of_two"])]
+    policy: String,
+
+    /// Enable PD (Prefill-Decode) disaggregated mode
+    #[arg(long, default_value_t = false)]
+    pd_disaggregation: bool,
+
+    /// Decode server URL (can be specified multiple times)
+    #[arg(long, action = ArgAction::Append)]
+    decode: Vec<String>,
+
+    /// Specific policy for prefill nodes in PD mode
+    #[arg(long, value_parser = ["random", "round_robin", "cache_aware", "power_of_two"])]
+    prefill_policy: Option<String>,
+
+    /// Specific policy for decode nodes in PD mode
+    #[arg(long, value_parser = ["random", "round_robin", "cache_aware", "power_of_two"])]
+    decode_policy: Option<String>,
+
+    /// Timeout in seconds for worker startup
+    #[arg(long, default_value_t = 600)]
+    worker_startup_timeout_secs: u64,
+
+    /// Interval in seconds between checks for worker startup
+    #[arg(long, default_value_t = 30)]
+    worker_startup_check_interval: u64,
+
+    /// Cache threshold (0.0-1.0) for cache-aware routing
+    #[arg(long, default_value_t = 0.3)]
+    cache_threshold: f32,
+
+    /// Absolute threshold for load balancing
+    #[arg(long, default_value_t = 64)]
+    balance_abs_threshold: usize,
+
+    /// Relative threshold for load balancing
+    #[arg(long, default_value_t = 1.5)]
+    balance_rel_threshold: f32,
+
+    /// Interval in seconds between cache eviction operations
+    #[arg(long, default_value_t = 120)]
+    eviction_interval: u64,
+
+    /// Maximum size of the approximation tree for cache-aware routing
+    #[arg(long, default_value_t = 67108864)] // 2^26
+    max_tree_size: usize,
+
+    /// Maximum payload size in bytes
+    #[arg(long, default_value_t = 536870912)] // 512MB
+    max_payload_size: usize,
+
+    /// Enable data parallelism aware schedule
+    #[arg(long, default_value_t = false)]
+    dp_aware: bool,
+
+    /// API key for worker authorization
+    #[arg(long)]
+    api_key: Option<String>,
+
+    /// Backend to route requests to (sglang, vllm, trtllm, openai, anthropic)
+    #[arg(long, value_enum, default_value_t = Backend::Sglang, alias = "runtime")]
+    backend: Backend,
+
+    /// Directory to store log files
+    #[arg(long)]
+    log_dir: Option<String>,
+
+    /// Set the logging level
+    #[arg(long, default_value = "info", value_parser = ["debug", "info", "warn", "error"])]
+    log_level: String,
+
+    /// Enable Kubernetes service discovery
+    #[arg(long, default_value_t = false)]
+    service_discovery: bool,
+
+    /// Label selector for Kubernetes service discovery (format: key1=value1 key2=value2)
+    #[arg(long, num_args = 0..)]
+    selector: Vec<String>,
+
+    /// Port to use for discovered worker pods
+    #[arg(long, default_value_t = 80)]
+    service_discovery_port: u16,
+
+    /// Kubernetes namespace to watch for pods
+    #[arg(long)]
+    service_discovery_namespace: Option<String>,
+
+    /// Label selector for prefill server pods in PD mode
+    #[arg(long, num_args = 0..)]
+    prefill_selector: Vec<String>,
+
+    /// Label selector for decode server pods in PD mode
+    #[arg(long, num_args = 0..)]
+    decode_selector: Vec<String>,
+
+    /// Port to expose Prometheus metrics
+    #[arg(long, default_value_t = 29000)]
+    prometheus_port: u16,
+
+    /// Host address to bind the Prometheus metrics server
+    #[arg(long, default_value = "127.0.0.1")]
+    prometheus_host: String,
+
+    /// Custom HTTP headers to check for request IDs
+    #[arg(long, num_args = 0..)]
+    request_id_headers: Vec<String>,
+
+    /// Request timeout in seconds
+    #[arg(long, default_value_t = 1800)]
+    request_timeout_secs: u64,
+
+    /// Maximum number of concurrent requests allowed
+    #[arg(long, default_value_t = 256)]
+    max_concurrent_requests: usize,
+
+    /// CORS allowed origins
+    #[arg(long, num_args = 0..)]
+    cors_allowed_origins: Vec<String>,
+
+    // Retry configuration
+    /// Maximum number of retries
+    #[arg(long, default_value_t = 5)]
+    retry_max_retries: u32,
+
+    /// Initial backoff in milliseconds for retries
+    #[arg(long, default_value_t = 50)]
+    retry_initial_backoff_ms: u64,
+
+    /// Maximum backoff in milliseconds for retries
+    #[arg(long, default_value_t = 30000)]
+    retry_max_backoff_ms: u64,
+
+    /// Backoff multiplier for exponential backoff
+    #[arg(long, default_value_t = 1.5)]
+    retry_backoff_multiplier: f32,
+
+    /// Jitter factor for retry backoff
+    #[arg(long, default_value_t = 0.2)]
+    retry_jitter_factor: f32,
+
+    /// Disable retries
+    #[arg(long, default_value_t = false)]
+    disable_retries: bool,
+
+    // Circuit breaker configuration
+    /// Number of failures before circuit breaker opens
+    #[arg(long, default_value_t = 10)]
+    cb_failure_threshold: u32,
+
+    /// Number of successes before circuit breaker closes
+    #[arg(long, default_value_t = 3)]
+    cb_success_threshold: u32,
+
+    /// Timeout duration in seconds for circuit breaker
+    #[arg(long, default_value_t = 60)]
+    cb_timeout_duration_secs: u64,
+
+    /// Window duration in seconds for circuit breaker
+    #[arg(long, default_value_t = 120)]
+    cb_window_duration_secs: u64,
+
+    /// Disable circuit breaker
+    #[arg(long, default_value_t = false)]
+    disable_circuit_breaker: bool,
+
+    // Health check configuration
+    /// Number of consecutive health check failures before marking worker unhealthy
+    #[arg(long, default_value_t = 3)]
+    health_failure_threshold: u32,
+
+    /// Number of consecutive health check successes before marking worker healthy
+    #[arg(long, default_value_t = 2)]
+    health_success_threshold: u32,
+
+    /// Timeout in seconds for health check requests
+    #[arg(long, default_value_t = 5)]
+    health_check_timeout_secs: u64,
+
+    /// Interval in seconds between runtime health checks
+    #[arg(long, default_value_t = 60)]
+    health_check_interval_secs: u64,
+
+    /// Health check endpoint path
+    #[arg(long, default_value = "/health")]
+    health_check_endpoint: String,
+
+    // IGW (Inference Gateway) configuration
+    /// Enable Inference Gateway mode
+    #[arg(long, default_value_t = false)]
+    enable_igw: bool,
+
+    // Tokenizer configuration
+    /// Model path for loading tokenizer (HuggingFace model ID or local path)
+    #[arg(long)]
+    model_path: Option<String>,
+
+    /// Explicit tokenizer path (overrides model_path tokenizer if provided)
+    #[arg(long)]
+    tokenizer_path: Option<String>,
+}
+
+impl CliArgs {
+    /// Determine connection mode from worker URLs
+    fn determine_connection_mode(worker_urls: &[String]) -> ConnectionMode {
+        // Only consider it gRPC if explicitly specified with grpc:// or grpcs:// scheme
+        for url in worker_urls {
+            if url.starts_with("grpc://") || url.starts_with("grpcs://") {
+                return ConnectionMode::Grpc;
+            }
+        }
+        // Default to HTTP for all other cases (including http://, https://, or no scheme)
+        ConnectionMode::Http
+    }
+
+    /// Parse selector strings into HashMap
+    fn parse_selector(selector_list: &[String]) -> HashMap<String, String> {
+        let mut map = HashMap::new();
+        for item in selector_list {
+            if let Some(eq_pos) = item.find('=') {
+                let key = item[..eq_pos].to_string();
+                let value = item[eq_pos + 1..].to_string();
+                map.insert(key, value);
+            }
+        }
+        map
+    }
+
+    /// Convert policy string to PolicyConfig
+    fn parse_policy(&self, policy_str: &str) -> PolicyConfig {
+        match policy_str {
+            "random" => PolicyConfig::Random,
+            "round_robin" => PolicyConfig::RoundRobin,
+            "cache_aware" => PolicyConfig::CacheAware {
+                cache_threshold: self.cache_threshold,
+                balance_abs_threshold: self.balance_abs_threshold,
+                balance_rel_threshold: self.balance_rel_threshold,
+                eviction_interval_secs: self.eviction_interval,
+                max_tree_size: self.max_tree_size,
+            },
+            "power_of_two" => PolicyConfig::PowerOfTwo {
+                load_check_interval_secs: 5, // Default value
+            },
+            _ => PolicyConfig::RoundRobin, // Fallback
+        }
+    }
+
+    /// Convert CLI arguments to RouterConfig
+    fn to_router_config(
+        &self,
+        prefill_urls: Vec<(String, Option<u16>)>,
+    ) -> ConfigResult<RouterConfig> {
+        // Determine routing mode
+        let mode = if self.enable_igw {
+            // IGW mode - routing mode is not used in IGW, but we need to provide a placeholder
+            RoutingMode::Regular {
+                worker_urls: vec![],
+            }
+        } else if matches!(self.backend, Backend::Openai) {
+            // OpenAI backend mode - use worker_urls as base(s)
+            RoutingMode::OpenAI {
+                worker_urls: self.worker_urls.clone(),
+            }
+        } else if self.pd_disaggregation {
+            let decode_urls = self.decode.clone();
+
+            // Validate PD configuration if not using service discovery
+            if !self.service_discovery && (prefill_urls.is_empty() || decode_urls.is_empty()) {
+                return Err(ConfigError::ValidationFailed {
+                    reason: "PD disaggregation mode requires --prefill and --decode URLs when not using service discovery".to_string(),
+                });
+            }
+
+            RoutingMode::PrefillDecode {
+                prefill_urls,
+                decode_urls,
+                prefill_policy: self.prefill_policy.as_ref().map(|p| self.parse_policy(p)),
+                decode_policy: self.decode_policy.as_ref().map(|p| self.parse_policy(p)),
+            }
+        } else {
+            // Regular mode
+            if !self.service_discovery && self.worker_urls.is_empty() {
+                return Err(ConfigError::ValidationFailed {
+                    reason: "Regular mode requires --worker-urls when not using service discovery"
+                        .to_string(),
+                });
+            }
+            RoutingMode::Regular {
+                worker_urls: self.worker_urls.clone(),
+            }
+        };
+
+        // Main policy
+        let policy = self.parse_policy(&self.policy);
+
+        // Service discovery configuration
+        let discovery = if self.service_discovery {
+            Some(DiscoveryConfig {
+                enabled: true,
+                namespace: self.service_discovery_namespace.clone(),
+                port: self.service_discovery_port,
+                check_interval_secs: 60,
+                selector: Self::parse_selector(&self.selector),
+                prefill_selector: Self::parse_selector(&self.prefill_selector),
+                decode_selector: Self::parse_selector(&self.decode_selector),
+                bootstrap_port_annotation: "sglang.ai/bootstrap-port".to_string(),
+            })
+        } else {
+            None
+        };
+
+        // Metrics configuration
+        let metrics = Some(MetricsConfig {
+            port: self.prometheus_port,
+            host: self.prometheus_host.clone(),
+        });
+
+        // Determine connection mode from all worker URLs
+        let mut all_urls = Vec::new();
+        match &mode {
+            RoutingMode::Regular { worker_urls } => {
+                all_urls.extend(worker_urls.clone());
+            }
+            RoutingMode::PrefillDecode {
+                prefill_urls,
+                decode_urls,
+                ..
+            } => {
+                for (url, _) in prefill_urls {
+                    all_urls.push(url.clone());
+                }
+                all_urls.extend(decode_urls.clone());
+            }
+            RoutingMode::OpenAI { .. } => {
+                // For connection-mode detection, skip URLs; OpenAI forces HTTP below.
+            }
+        }
+        let connection_mode = match &mode {
+            RoutingMode::OpenAI { .. } => ConnectionMode::Http,
+            _ => Self::determine_connection_mode(&all_urls),
+        };
+
+        // Build RouterConfig
+        Ok(RouterConfig {
+            mode,
+            policy,
+            connection_mode,
+            host: self.host.clone(),
+            port: self.port,
+            max_payload_size: self.max_payload_size,
+            request_timeout_secs: self.request_timeout_secs,
+            worker_startup_timeout_secs: self.worker_startup_timeout_secs,
+            worker_startup_check_interval_secs: self.worker_startup_check_interval,
+            dp_aware: self.dp_aware,
+            api_key: self.api_key.clone(),
+            discovery,
+            metrics,
+            log_dir: self.log_dir.clone(),
+            log_level: Some(self.log_level.clone()),
+            request_id_headers: if self.request_id_headers.is_empty() {
+                None
+            } else {
+                Some(self.request_id_headers.clone())
+            },
+            max_concurrent_requests: self.max_concurrent_requests,
+            queue_size: 100,        // Default queue size
+            queue_timeout_secs: 60, // Default timeout
+            cors_allowed_origins: self.cors_allowed_origins.clone(),
+            retry: RetryConfig {
+                max_retries: self.retry_max_retries,
+                initial_backoff_ms: self.retry_initial_backoff_ms,
+                max_backoff_ms: self.retry_max_backoff_ms,
+                backoff_multiplier: self.retry_backoff_multiplier,
+                jitter_factor: self.retry_jitter_factor,
+            },
+            circuit_breaker: CircuitBreakerConfig {
+                failure_threshold: self.cb_failure_threshold,
+                success_threshold: self.cb_success_threshold,
+                timeout_duration_secs: self.cb_timeout_duration_secs,
+                window_duration_secs: self.cb_window_duration_secs,
+            },
+            disable_retries: self.disable_retries,
+            disable_circuit_breaker: self.disable_circuit_breaker,
+            health_check: HealthCheckConfig {
+                failure_threshold: self.health_failure_threshold,
+                success_threshold: self.health_success_threshold,
+                timeout_secs: self.health_check_timeout_secs,
+                check_interval_secs: self.health_check_interval_secs,
+                endpoint: self.health_check_endpoint.clone(),
+            },
+            enable_igw: self.enable_igw,
+            rate_limit_tokens_per_second: None,
+            model_path: self.model_path.clone(),
+            tokenizer_path: self.tokenizer_path.clone(),
+        })
+    }
+
+    /// Create ServerConfig from CLI args and RouterConfig
+    fn to_server_config(&self, router_config: RouterConfig) -> ServerConfig {
+        // Create service discovery config if enabled
+        let service_discovery_config = if self.service_discovery {
+            Some(ServiceDiscoveryConfig {
+                enabled: true,
+                selector: Self::parse_selector(&self.selector),
+                check_interval: std::time::Duration::from_secs(60),
+                port: self.service_discovery_port,
+                namespace: self.service_discovery_namespace.clone(),
+                pd_mode: self.pd_disaggregation,
+                prefill_selector: Self::parse_selector(&self.prefill_selector),
+                decode_selector: Self::parse_selector(&self.decode_selector),
+                bootstrap_port_annotation: "sglang.ai/bootstrap-port".to_string(),
+            })
+        } else {
+            None
+        };
+
+        // Create Prometheus config
+        let prometheus_config = Some(PrometheusConfig {
+            port: self.prometheus_port,
+            host: self.prometheus_host.clone(),
+        });
+
+        ServerConfig {
+            host: self.host.clone(),
+            port: self.port,
+            router_config,
+            max_payload_size: self.max_payload_size,
+            log_dir: self.log_dir.clone(),
+            log_level: Some(self.log_level.clone()),
+            service_discovery_config,
+            prometheus_config,
+            request_timeout_secs: self.request_timeout_secs,
+            request_id_headers: if self.request_id_headers.is_empty() {
+                None
+            } else {
+                Some(self.request_id_headers.clone())
+            },
+        }
+    }
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Parse prefill arguments manually before clap parsing
+    let prefill_urls = parse_prefill_args();
+
+    // Filter out prefill arguments and their values before passing to clap
+    let mut filtered_args: Vec<String> = Vec::new();
+    let raw_args: Vec<String> = std::env::args().collect();
+    let mut i = 0;
+
+    while i < raw_args.len() {
+        if raw_args[i] == "--prefill" && i + 1 < raw_args.len() {
+            // Skip --prefill and its URL
+            i += 2;
+            // Also skip bootstrap port if present
+            if i < raw_args.len()
+                && !raw_args[i].starts_with("--")
+                && (raw_args[i].parse::<u16>().is_ok() || raw_args[i].to_lowercase() == "none")
+            {
+                i += 1;
+            }
+        } else {
+            filtered_args.push(raw_args[i].clone());
+            i += 1;
+        }
+    }
+
+    // Parse CLI arguments with clap using filtered args
+    let cli_args = CliArgs::parse_from(filtered_args);
+
+    // Print startup info
+    println!("SGLang Router starting...");
+    println!("Host: {}:{}", cli_args.host, cli_args.port);
+    let mode_str = if cli_args.enable_igw {
+        "IGW (Inference Gateway)".to_string()
+    } else if matches!(cli_args.backend, Backend::Openai) {
+        "OpenAI Backend".to_string()
+    } else if cli_args.pd_disaggregation {
+        "PD Disaggregated".to_string()
+    } else {
+        format!("Regular ({})", cli_args.backend)
+    };
+    println!("Mode: {}", mode_str);
+
+    // Warn for runtimes that are parsed but not yet implemented
+    match cli_args.backend {
+        Backend::Vllm | Backend::Trtllm | Backend::Anthropic => {
+            println!(
+                "WARNING: runtime '{}' not implemented yet; falling back to regular routing. \
+Provide --worker-urls or PD flags as usual.",
+                cli_args.backend
+            );
+        }
+        Backend::Sglang | Backend::Openai => {}
+    }
+
+    if !cli_args.enable_igw {
+        println!("Policy: {}", cli_args.policy);
+
+        if cli_args.pd_disaggregation && !prefill_urls.is_empty() {
+            println!("Prefill nodes: {:?}", prefill_urls);
+            println!("Decode nodes: {:?}", cli_args.decode);
+        }
+    }
+
+    // Convert to RouterConfig
+    let router_config = cli_args.to_router_config(prefill_urls)?;
+
+    // Validate configuration
+    router_config.validate()?;
+
+    // Create ServerConfig
+    let server_config = cli_args.to_server_config(router_config);
+
+    // Create a new runtime for the server (like Python binding does)
+    let runtime = tokio::runtime::Runtime::new()?;
+
+    // Block on the async startup function
+    runtime.block_on(async move { server::startup(server_config).await })?;
+
+    Ok(())
+}
diff --git a/sgl-router/src/mcp/client_manager.rs b/sgl-router/src/mcp/client_manager.rs
new file mode 100644
index 000000000..a2a6d7a7e
--- /dev/null
+++ b/sgl-router/src/mcp/client_manager.rs
@@ -0,0 +1,535 @@
+use backoff::ExponentialBackoffBuilder;
+use dashmap::DashMap;
+use rmcp::{
+    model::{
+        CallToolRequestParam, GetPromptRequestParam, GetPromptResult, Prompt,
+        ReadResourceRequestParam, ReadResourceResult, Resource, Tool as McpTool,
+    },
+    service::RunningService,
+    transport::{
+        sse_client::SseClientConfig, streamable_http_client::StreamableHttpClientTransportConfig,
+        ConfigureCommandExt, SseClientTransport, StreamableHttpClientTransport, TokioChildProcess,
+    },
+    RoleClient, ServiceExt,
+};
+use serde::{Deserialize, Serialize};
+use std::{borrow::Cow, collections::HashMap, time::Duration};
+
+use crate::mcp::{
+    config::{McpConfig, McpServerConfig, McpTransport},
+    error::{McpError, McpResult},
+};
+
+/// Information about an available tool
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ToolInfo {
+    pub name: String,
+    pub description: String,
+    pub server: String,
+    pub parameters: Option<serde_json::Value>,
+}
+
+/// Information about an available prompt
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PromptInfo {
+    pub name: String,
+    pub description: Option<String>,
+    pub server: String,
+    pub arguments: Option<Vec<serde_json::Value>>,
+}
+
+/// Information about an available resource
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ResourceInfo {
+    pub uri: String,
+    pub name: String,
+    pub description: Option<String>,
+    pub mime_type: Option<String>,
+    pub server: String,
+}
+
+/// Manages MCP client connections and tool execution
+pub struct McpClientManager {
+    /// Map of server_name -> MCP client
+    clients: HashMap<String, RunningService<RoleClient, ()>>,
+    /// Map of tool_name -> (server_name, tool_definition)
+    tools: DashMap<String, (String, McpTool)>,
+    /// Map of prompt_name -> (server_name, prompt_definition)
+    prompts: DashMap<String, (String, Prompt)>,
+    /// Map of resource_uri -> (server_name, resource_definition)
+    resources: DashMap<String, (String, Resource)>,
+}
+
+impl McpClientManager {
+    /// Create a new manager and connect to all configured servers
+    pub async fn new(config: McpConfig) -> McpResult<Self> {
+        let mut mgr = Self {
+            clients: HashMap::new(),
+            tools: DashMap::new(),
+            prompts: DashMap::new(),
+            resources: DashMap::new(),
+        };
+
+        for server_config in config.servers {
+            match Self::connect_server(&server_config).await {
+                Ok(client) => {
+                    mgr.load_server_inventory(&server_config.name, &client)
+                        .await;
+                    mgr.clients.insert(server_config.name.clone(), client);
+                }
+                Err(e) => {
+                    tracing::error!(
+                        "Failed to connect to server '{}': {}",
+                        server_config.name,
+                        e
+                    );
+                }
+            }
+        }
+
+        if mgr.clients.is_empty() {
+            return Err(McpError::ConnectionFailed(
+                "Failed to connect to any MCP servers".to_string(),
+            ));
+        }
+        Ok(mgr)
+    }
+
+    /// Discover and cache tools/prompts/resources for a connected server
+    async fn load_server_inventory(
+        &self,
+        server_name: &str,
+        client: &RunningService<RoleClient, ()>,
+    ) {
+        // Tools
+        match client.peer().list_all_tools().await {
+            Ok(ts) => {
+                tracing::info!("Discovered {} tools from '{}'", ts.len(), server_name);
+                for t in ts {
+                    if self.tools.contains_key(t.name.as_ref()) {
+                        tracing::warn!(
+                            "Tool '{}' from server '{}' is overwriting an existing tool.",
+                            &t.name,
+                            server_name
+                        );
+                    }
+                    self.tools
+                        .insert(t.name.to_string(), (server_name.to_string(), t));
+                }
+            }
+            Err(e) => tracing::warn!("Failed to list tools from '{}': {}", server_name, e),
+        }
+
+        // Prompts
+        match client.peer().list_all_prompts().await {
+            Ok(ps) => {
+                tracing::info!("Discovered {} prompts from '{}'", ps.len(), server_name);
+                for p in ps {
+                    if self.prompts.contains_key(&p.name) {
+                        tracing::warn!(
+                            "Prompt '{}' from server '{}' is overwriting an existing prompt.",
+                            &p.name,
+                            server_name
+                        );
+                    }
+                    self.prompts
+                        .insert(p.name.clone(), (server_name.to_string(), p));
+                }
+            }
+            Err(e) => tracing::debug!("No prompts or failed to list on '{}': {}", server_name, e),
+        }
+
+        // Resources
+        match client.peer().list_all_resources().await {
+            Ok(rs) => {
+                tracing::info!("Discovered {} resources from '{}'", rs.len(), server_name);
+                for r in rs {
+                    if self.resources.contains_key(&r.uri) {
+                        tracing::warn!(
+                            "Resource '{}' from server '{}' is overwriting an existing resource.",
+                            &r.uri,
+                            server_name
+                        );
+                    }
+                    self.resources
+                        .insert(r.uri.clone(), (server_name.to_string(), r));
+                }
+            }
+            Err(e) => tracing::debug!("No resources or failed to list on '{}': {}", server_name, e),
+        }
+    }
+
+    /// Connect to a single MCP server with retry logic for remote transports
+    async fn connect_server(config: &McpServerConfig) -> McpResult<RunningService<RoleClient, ()>> {
+        let needs_retry = matches!(
+            &config.transport,
+            McpTransport::Sse { .. } | McpTransport::Streamable { .. }
+        );
+        if needs_retry {
+            Self::connect_server_with_retry(config).await
+        } else {
+            Self::connect_server_impl(config).await
+        }
+    }
+
+    /// Connect with exponential backoff retry for remote servers
+    async fn connect_server_with_retry(
+        config: &McpServerConfig,
+    ) -> McpResult<RunningService<RoleClient, ()>> {
+        let backoff = ExponentialBackoffBuilder::new()
+            .with_initial_interval(Duration::from_secs(1))
+            .with_max_interval(Duration::from_secs(30))
+            .with_max_elapsed_time(Some(Duration::from_secs(120)))
+            .build();
+
+        backoff::future::retry(backoff, || async {
+            match Self::connect_server_impl(config).await {
+                Ok(client) => Ok(client),
+                Err(e) => {
+                    tracing::warn!("Failed to connect to '{}', retrying: {}", config.name, e);
+                    Err(backoff::Error::transient(e))
+                }
+            }
+        })
+        .await
+    }
+
+    /// Internal implementation of server connection
+    async fn connect_server_impl(
+        config: &McpServerConfig,
+    ) -> McpResult<RunningService<RoleClient, ()>> {
+        tracing::info!(
+            "Connecting to MCP server '{}' via {:?}",
+            config.name,
+            config.transport
+        );
+
+        match &config.transport {
+            McpTransport::Stdio {
+                command,
+                args,
+                envs,
+            } => {
+                let transport = TokioChildProcess::new(
+                    tokio::process::Command::new(command).configure(|cmd| {
+                        cmd.args(args)
+                            .envs(envs.iter())
+                            .stderr(std::process::Stdio::inherit());
+                    }),
+                )
+                .map_err(|e| McpError::Transport(format!("create stdio transport: {}", e)))?;
+
+                let client = ().serve(transport).await.map_err(|e| {
+                    McpError::ConnectionFailed(format!("initialize stdio client: {}", e))
+                })?;
+
+                tracing::info!("Connected to stdio server '{}'", config.name);
+                Ok(client)
+            }
+
+            McpTransport::Sse { url, token } => {
+                let transport = if let Some(tok) = token {
+                    let client = reqwest::Client::builder()
+                        .default_headers({
+                            let mut headers = reqwest::header::HeaderMap::new();
+                            headers.insert(
+                                reqwest::header::AUTHORIZATION,
+                                format!("Bearer {}", tok).parse().map_err(|e| {
+                                    McpError::Transport(format!("auth token: {}", e))
+                                })?,
+                            );
+                            headers
+                        })
+                        .build()
+                        .map_err(|e| McpError::Transport(format!("build HTTP client: {}", e)))?;
+
+                    let cfg = SseClientConfig {
+                        sse_endpoint: url.clone().into(),
+                        ..Default::default()
+                    };
+
+                    SseClientTransport::start_with_client(client, cfg)
+                        .await
+                        .map_err(|e| McpError::Transport(format!("create SSE transport: {}", e)))?
+                } else {
+                    SseClientTransport::start(url.as_str())
+                        .await
+                        .map_err(|e| McpError::Transport(format!("create SSE transport: {}", e)))?
+                };
+
+                let client = ().serve(transport).await.map_err(|e| {
+                    McpError::ConnectionFailed(format!("initialize SSE client: {}", e))
+                })?;
+
+                tracing::info!("Connected to SSE server '{}' at {}", config.name, url);
+                Ok(client)
+            }
+
+            McpTransport::Streamable { url, token } => {
+                let transport = if let Some(tok) = token {
+                    let mut cfg = StreamableHttpClientTransportConfig::with_uri(url.as_str());
+                    cfg.auth_header = Some(format!("Bearer {}", tok));
+                    StreamableHttpClientTransport::from_config(cfg)
+                } else {
+                    StreamableHttpClientTransport::from_uri(url.as_str())
+                };
+
+                let client = ().serve(transport).await.map_err(|e| {
+                    McpError::ConnectionFailed(format!("initialize streamable client: {}", e))
+                })?;
+
+                tracing::info!(
+                    "Connected to streamable HTTP server '{}' at {}",
+                    config.name,
+                    url
+                );
+                Ok(client)
+            }
+        }
+    }
+
+    // ===== Helpers =====
+
+    fn client_for(&self, server_name: &str) -> McpResult<&RunningService<RoleClient, ()>> {
+        self.clients
+            .get(server_name)
+            .ok_or_else(|| McpError::ServerNotFound(server_name.to_string()))
+    }
+
+    fn tool_entry(&self, name: &str) -> McpResult<(String, McpTool)> {
+        self.tools
+            .get(name)
+            .map(|e| e.value().clone())
+            .ok_or_else(|| McpError::ToolNotFound(name.to_string()))
+    }
+
+    fn prompt_entry(&self, name: &str) -> McpResult<(String, Prompt)> {
+        self.prompts
+            .get(name)
+            .map(|e| e.value().clone())
+            .ok_or_else(|| McpError::PromptNotFound(name.to_string()))
+    }
+
+    fn resource_entry(&self, uri: &str) -> McpResult<(String, Resource)> {
+        self.resources
+            .get(uri)
+            .map(|e| e.value().clone())
+            .ok_or_else(|| McpError::ResourceNotFound(uri.to_string()))
+    }
+
+    // ===== Tool Methods =====
+
+    /// Call a tool by name
+    pub async fn call_tool(
+        &self,
+        tool_name: &str,
+        arguments: Option<serde_json::Map<String, serde_json::Value>>,
+    ) -> McpResult<rmcp::model::CallToolResult> {
+        let (server_name, _tool) = self.tool_entry(tool_name)?;
+        let client = self.client_for(&server_name)?;
+
+        tracing::debug!("Calling tool '{}' on '{}'", tool_name, server_name);
+
+        client
+            .peer()
+            .call_tool(CallToolRequestParam {
+                name: Cow::Owned(tool_name.to_string()),
+                arguments,
+            })
+            .await
+            .map_err(|e| McpError::ToolExecution(format!("Tool call failed: {}", e)))
+    }
+
+    /// Get all available tools
+    pub fn list_tools(&self) -> Vec<ToolInfo> {
+        self.tools
+            .iter()
+            .map(|entry| {
+                let tool_name = entry.key().clone();
+                let (server_name, tool) = entry.value();
+                ToolInfo {
+                    name: tool_name,
+                    description: tool.description.as_deref().unwrap_or_default().to_string(),
+                    server: server_name.clone(),
+                    parameters: Some(serde_json::Value::Object((*tool.input_schema).clone())),
+                }
+            })
+            .collect()
+    }
+
+    /// Get a specific tool by name
+    pub fn get_tool(&self, name: &str) -> Option<ToolInfo> {
+        self.tools.get(name).map(|entry| {
+            let (server_name, tool) = entry.value();
+            ToolInfo {
+                name: name.to_string(),
+                description: tool.description.as_deref().unwrap_or_default().to_string(),
+                server: server_name.clone(),
+                parameters: Some(serde_json::Value::Object((*tool.input_schema).clone())),
+            }
+        })
+    }
+
+    /// Check if a tool exists
+    pub fn has_tool(&self, name: &str) -> bool {
+        self.tools.contains_key(name)
+    }
+
+    /// Get list of connected servers
+    pub fn list_servers(&self) -> Vec<String> {
+        self.clients.keys().cloned().collect()
+    }
+
+    // ===== Prompt Methods =====
+
+    /// Get a prompt by name with arguments
+    pub async fn get_prompt(
+        &self,
+        prompt_name: &str,
+        arguments: Option<serde_json::Map<String, serde_json::Value>>,
+    ) -> McpResult<GetPromptResult> {
+        let (server_name, _prompt) = self.prompt_entry(prompt_name)?;
+        let client = self.client_for(&server_name)?;
+
+        tracing::debug!("Getting prompt '{}' from '{}'", prompt_name, server_name);
+
+        client
+            .peer()
+            .get_prompt(GetPromptRequestParam {
+                name: prompt_name.to_string(),
+                arguments,
+            })
+            .await
+            .map_err(|e| McpError::ToolExecution(format!("Failed to get prompt: {}", e)))
+    }
+
+    /// List all available prompts
+    pub fn list_prompts(&self) -> Vec<PromptInfo> {
+        self.prompts
+            .iter()
+            .map(|entry| {
+                let name = entry.key().clone();
+                let (server_name, prompt) = entry.value();
+                PromptInfo {
+                    name,
+                    description: prompt.description.clone(),
+                    server: server_name.clone(),
+                    arguments: prompt
+                        .arguments
+                        .clone()
+                        .map(|args| args.into_iter().map(|arg| serde_json::json!(arg)).collect()),
+                }
+            })
+            .collect()
+    }
+
+    /// Get a specific prompt info by name
+    pub fn get_prompt_info(&self, name: &str) -> Option<PromptInfo> {
+        self.prompts.get(name).map(|entry| {
+            let (server_name, prompt) = entry.value();
+            PromptInfo {
+                name: name.to_string(),
+                description: prompt.description.clone(),
+                server: server_name.clone(),
+                arguments: prompt
+                    .arguments
+                    .clone()
+                    .map(|args| args.into_iter().map(|arg| serde_json::json!(arg)).collect()),
+            }
+        })
+    }
+
+    // ===== Resource Methods =====
+
+    /// Read a resource by URI
+    pub async fn read_resource(&self, uri: &str) -> McpResult<ReadResourceResult> {
+        let (server_name, _resource) = self.resource_entry(uri)?;
+        let client = self.client_for(&server_name)?;
+
+        tracing::debug!("Reading resource '{}' from '{}'", uri, server_name);
+
+        client
+            .peer()
+            .read_resource(ReadResourceRequestParam {
+                uri: uri.to_string(),
+            })
+            .await
+            .map_err(|e| McpError::ToolExecution(format!("Failed to read resource: {}", e)))
+    }
+
+    /// List all available resources
+    pub fn list_resources(&self) -> Vec<ResourceInfo> {
+        self.resources
+            .iter()
+            .map(|entry| {
+                let uri = entry.key().clone();
+                let (server_name, resource) = entry.value();
+                ResourceInfo {
+                    uri,
+                    name: resource.name.clone(),
+                    description: resource.description.clone(),
+                    mime_type: resource.mime_type.clone(),
+                    server: server_name.clone(),
+                }
+            })
+            .collect()
+    }
+
+    /// Get a specific resource info by URI
+    pub fn get_resource_info(&self, uri: &str) -> Option<ResourceInfo> {
+        self.resources.get(uri).map(|entry| {
+            let (server_name, resource) = entry.value();
+            ResourceInfo {
+                uri: uri.to_string(),
+                name: resource.name.clone(),
+                description: resource.description.clone(),
+                mime_type: resource.mime_type.clone(),
+                server: server_name.clone(),
+            }
+        })
+    }
+
+    /// Subscribe to resource changes
+    pub async fn subscribe_resource(&self, uri: &str) -> McpResult<()> {
+        let (server_name, _resource) = self.resource_entry(uri)?;
+        let client = self.client_for(&server_name)?;
+
+        tracing::debug!("Subscribing to '{}' on '{}'", uri, server_name);
+
+        client
+            .peer()
+            .subscribe(rmcp::model::SubscribeRequestParam {
+                uri: uri.to_string(),
+            })
+            .await
+            .map_err(|e| McpError::ToolExecution(format!("Failed to subscribe: {}", e)))
+    }
+
+    /// Unsubscribe from resource changes
+    pub async fn unsubscribe_resource(&self, uri: &str) -> McpResult<()> {
+        let (server_name, _resource) = self.resource_entry(uri)?;
+        let client = self.client_for(&server_name)?;
+
+        tracing::debug!("Unsubscribing from '{}' on '{}'", uri, server_name);
+
+        client
+            .peer()
+            .unsubscribe(rmcp::model::UnsubscribeRequestParam {
+                uri: uri.to_string(),
+            })
+            .await
+            .map_err(|e| McpError::ToolExecution(format!("Failed to unsubscribe: {}", e)))
+    }
+
+    /// Disconnect from all servers (for cleanup)
+    pub async fn shutdown(&mut self) {
+        for (name, client) in self.clients.drain() {
+            if let Err(e) = client.cancel().await {
+                tracing::warn!("Error disconnecting from '{}': {}", name, e);
+            }
+        }
+        self.tools.clear();
+        self.prompts.clear();
+        self.resources.clear();
+    }
+}
diff --git a/sgl-router/src/mcp/config.rs b/sgl-router/src/mcp/config.rs
new file mode 100644
index 000000000..1adf6a7d7
--- /dev/null
+++ b/sgl-router/src/mcp/config.rs
@@ -0,0 +1,52 @@
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct McpConfig {
+    pub servers: Vec<McpServerConfig>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct McpServerConfig {
+    pub name: String,
+    #[serde(flatten)]
+    pub transport: McpTransport,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "protocol", rename_all = "lowercase")]
+pub enum McpTransport {
+    Stdio {
+        command: String,
+        #[serde(default)]
+        args: Vec<String>,
+        #[serde(default)]
+        envs: HashMap<String, String>,
+    },
+    Sse {
+        url: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        token: Option<String>,
+    },
+    Streamable {
+        url: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        token: Option<String>,
+    },
+}
+
+impl McpConfig {
+    /// Load configuration from a YAML file
+    pub async fn from_file(path: &str) -> Result<Self, Box<dyn std::error::Error>> {
+        let content = tokio::fs::read_to_string(path).await?;
+        let config: Self = serde_yaml::from_str(&content)?;
+        Ok(config)
+    }
+
+    /// Load configuration from environment variables (optional)
+    pub fn from_env() -> Option<Self> {
+        // This could be expanded to read from env vars
+        // For now, return None to indicate env config not implemented
+        None
+    }
+}
diff --git a/sgl-router/src/mcp/error.rs b/sgl-router/src/mcp/error.rs
new file mode 100644
index 000000000..03b8b4cd1
--- /dev/null
+++ b/sgl-router/src/mcp/error.rs
@@ -0,0 +1,42 @@
+use thiserror::Error;
+
+pub type McpResult<T> = Result<T, McpError>;
+
+#[derive(Debug, Error)]
+pub enum McpError {
+    #[error("Server not found: {0}")]
+    ServerNotFound(String),
+
+    #[error("Tool not found: {0}")]
+    ToolNotFound(String),
+
+    #[error("Transport error: {0}")]
+    Transport(String),
+
+    #[error("Tool execution failed: {0}")]
+    ToolExecution(String),
+
+    #[error("Connection failed: {0}")]
+    ConnectionFailed(String),
+
+    #[error("Configuration error: {0}")]
+    Config(String),
+
+    #[error("Authentication error: {0}")]
+    Auth(String),
+
+    #[error("Resource not found: {0}")]
+    ResourceNotFound(String),
+
+    #[error("Prompt not found: {0}")]
+    PromptNotFound(String),
+
+    #[error(transparent)]
+    Sdk(#[from] Box<rmcp::RmcpError>),
+
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+
+    #[error(transparent)]
+    Http(#[from] reqwest::Error),
+}
diff --git a/sgl-router/src/mcp/mod.rs b/sgl-router/src/mcp/mod.rs
new file mode 100644
index 000000000..6cebc4c7d
--- /dev/null
+++ b/sgl-router/src/mcp/mod.rs
@@ -0,0 +1,18 @@
+// MCP Client for SGLang Router
+//
+// This module provides a complete MCP (Model Context Protocol) client implementation
+// supporting multiple transport types (stdio, SSE, HTTP) and all MCP features:
+// - Tools: Discovery and execution
+// - Prompts: Reusable templates for LLM interactions
+// - Resources: File/data access with subscription support
+// - OAuth: Secure authentication for remote servers
+
+pub mod client_manager;
+pub mod config;
+pub mod error;
+pub mod oauth;
+
+// Re-export the main types for convenience
+pub use client_manager::{McpClientManager, PromptInfo, ResourceInfo, ToolInfo};
+pub use config::{McpConfig, McpServerConfig, McpTransport};
+pub use error::{McpError, McpResult};
diff --git a/sgl-router/src/mcp/oauth.rs b/sgl-router/src/mcp/oauth.rs
new file mode 100644
index 000000000..3d13ea2be
--- /dev/null
+++ b/sgl-router/src/mcp/oauth.rs
@@ -0,0 +1,191 @@
+// OAuth authentication support for MCP servers
+
+use axum::{
+    extract::{Query, State},
+    response::Html,
+    routing::get,
+    Router,
+};
+use rmcp::transport::auth::OAuthState;
+use serde::Deserialize;
+use std::{net::SocketAddr, sync::Arc};
+use tokio::sync::{oneshot, Mutex};
+
+use crate::mcp::error::{McpError, McpResult};
+
+/// OAuth callback parameters
+#[derive(Debug, Deserialize)]
+struct CallbackParams {
+    code: String,
+    #[allow(dead_code)]
+    state: Option<String>,
+}
+
+/// State for the callback server
+#[derive(Clone)]
+struct CallbackState {
+    code_receiver: Arc<Mutex<Option<oneshot::Sender<String>>>>,
+}
+
+/// HTML page returned after successful OAuth callback
+const CALLBACK_HTML: &str = r#"
+<!DOCTYPE html>
+<html>
+<head>
+    <title>OAuth Success</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            height: 100vh;
+            margin: 0;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        }
+        .container {
+            background: white;
+            padding: 40px;
+            border-radius: 10px;
+            box-shadow: 0 10px 30px rgba(0,0,0,0.2);
+            text-align: center;
+        }
+        h1 { color: #333; }
+        p { color: #666; margin: 20px 0; }
+        .success { color: #4CAF50; font-size: 48px; }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <div class="success">✓</div>
+        <h1>Authentication Successful!</h1>
+        <p>You can now close this window and return to your application.</p>
+    </div>
+</body>
+</html>
+"#;
+
+/// OAuth authentication helper for MCP servers
+pub struct OAuthHelper {
+    server_url: String,
+    redirect_uri: String,
+    callback_port: u16,
+}
+
+impl OAuthHelper {
+    /// Create a new OAuth helper
+    pub fn new(server_url: String, redirect_uri: String, callback_port: u16) -> Self {
+        Self {
+            server_url,
+            redirect_uri,
+            callback_port,
+        }
+    }
+
+    /// Perform OAuth authentication flow
+    pub async fn authenticate(
+        &self,
+        scopes: &[&str],
+    ) -> McpResult<rmcp::transport::auth::AuthorizationManager> {
+        // Initialize OAuth state machine
+        let mut oauth_state = OAuthState::new(&self.server_url, None)
+            .await
+            .map_err(|e| McpError::Auth(format!("Failed to initialize OAuth: {}", e)))?;
+
+        oauth_state
+            .start_authorization(scopes, &self.redirect_uri)
+            .await
+            .map_err(|e| McpError::Auth(format!("Failed to start authorization: {}", e)))?;
+
+        // Get authorization URL
+        let auth_url = oauth_state
+            .get_authorization_url()
+            .await
+            .map_err(|e| McpError::Auth(format!("Failed to get authorization URL: {}", e)))?;
+
+        tracing::info!("OAuth authorization URL: {}", auth_url);
+
+        // Start callback server and wait for code
+        let auth_code = self.start_callback_server().await?;
+
+        // Exchange code for token
+        oauth_state
+            .handle_callback(&auth_code)
+            .await
+            .map_err(|e| McpError::Auth(format!("Failed to handle OAuth callback: {}", e)))?;
+
+        // Get authorization manager
+        oauth_state
+            .into_authorization_manager()
+            .ok_or_else(|| McpError::Auth("Failed to get authorization manager".to_string()))
+    }
+
+    /// Start a local HTTP server to receive the OAuth callback
+    async fn start_callback_server(&self) -> McpResult<String> {
+        let (code_sender, code_receiver) = oneshot::channel::<String>();
+
+        let state = CallbackState {
+            code_receiver: Arc::new(Mutex::new(Some(code_sender))),
+        };
+
+        // Create router for callback
+        let app = Router::new()
+            .route("/callback", get(Self::callback_handler))
+            .with_state(state);
+
+        let addr = SocketAddr::from(([127, 0, 0, 1], self.callback_port));
+
+        // Start server in background
+        let listener = tokio::net::TcpListener::bind(addr).await.map_err(|e| {
+            McpError::Auth(format!(
+                "Failed to bind to callback port {}: {}",
+                self.callback_port, e
+            ))
+        })?;
+
+        tokio::spawn(async move {
+            let _ = axum::serve(listener, app).await;
+        });
+
+        tracing::info!(
+            "OAuth callback server started on port {}",
+            self.callback_port
+        );
+
+        // Wait for authorization code
+        code_receiver
+            .await
+            .map_err(|_| McpError::Auth("Failed to receive authorization code".to_string()))
+    }
+
+    /// Handle OAuth callback
+    async fn callback_handler(
+        Query(params): Query<CallbackParams>,
+        State(state): State<CallbackState>,
+    ) -> Html<String> {
+        tracing::debug!("Received OAuth callback with code");
+
+        // Send code to waiting task
+        if let Some(sender) = state.code_receiver.lock().await.take() {
+            let _ = sender.send(params.code);
+        }
+
+        Html(CALLBACK_HTML.to_string())
+    }
+}
+
+/// Create an OAuth-authenticated client
+pub async fn create_oauth_client(
+    server_url: String,
+    _sse_url: String,
+    redirect_uri: String,
+    callback_port: u16,
+    scopes: &[&str],
+) -> McpResult<rmcp::transport::auth::AuthClient<reqwest::Client>> {
+    let helper = OAuthHelper::new(server_url, redirect_uri, callback_port);
+    let auth_manager = helper.authenticate(scopes).await?;
+
+    let client = rmcp::transport::auth::AuthClient::new(reqwest::Client::default(), auth_manager);
+
+    Ok(client)
+}
diff --git a/sgl-router/src/metrics.rs b/sgl-router/src/metrics.rs
new file mode 100644
index 000000000..afcccb549
--- /dev/null
+++ b/sgl-router/src/metrics.rs
@@ -0,0 +1,1011 @@
+use metrics::{counter, describe_counter, describe_gauge, describe_histogram, gauge, histogram};
+use metrics_exporter_prometheus::{Matcher, PrometheusBuilder};
+use std::net::{IpAddr, Ipv4Addr, SocketAddr};
+use std::time::Duration;
+
+#[derive(Debug, Clone)]
+pub struct PrometheusConfig {
+    pub port: u16,
+    pub host: String,
+}
+
+impl Default for PrometheusConfig {
+    fn default() -> Self {
+        Self {
+            port: 29000,
+            host: "0.0.0.0".to_string(),
+        }
+    }
+}
+
+pub fn init_metrics() {
+    // Request metrics
+    describe_counter!(
+        "sgl_router_requests_total",
+        "Total number of requests by route and method"
+    );
+    describe_histogram!(
+        "sgl_router_request_duration_seconds",
+        "Request duration in seconds by route"
+    );
+    describe_counter!(
+        "sgl_router_request_errors_total",
+        "Total number of request errors by route and error type"
+    );
+    describe_counter!(
+        "sgl_router_retries_total",
+        "Total number of request retries by route"
+    );
+    describe_histogram!(
+        "sgl_router_retry_backoff_duration_seconds",
+        "Backoff duration in seconds by attempt index"
+    );
+    describe_counter!(
+        "sgl_router_retries_exhausted_total",
+        "Total number of requests that exhausted retries by route"
+    );
+
+    // Circuit breaker metrics
+    describe_gauge!(
+        "sgl_router_cb_state",
+        "Circuit breaker state per worker (0=closed, 1=open, 2=half_open)"
+    );
+    describe_counter!(
+        "sgl_router_cb_state_transitions_total",
+        "Total number of circuit breaker state transitions by worker"
+    );
+    describe_counter!(
+        "sgl_router_cb_outcomes_total",
+        "Total number of circuit breaker outcomes by worker and outcome type (success/failure)"
+    );
+
+    // Worker metrics
+    describe_gauge!(
+        "sgl_router_active_workers",
+        "Number of currently active workers"
+    );
+    describe_gauge!(
+        "sgl_router_worker_health",
+        "Worker health status (1=healthy, 0=unhealthy)"
+    );
+    describe_gauge!("sgl_router_worker_load", "Current load on each worker");
+    describe_counter!(
+        "sgl_router_processed_requests_total",
+        "Total requests processed by each worker"
+    );
+
+    // Policy metrics
+    describe_counter!(
+        "sgl_router_policy_decisions_total",
+        "Total routing policy decisions by policy and worker"
+    );
+    describe_counter!("sgl_router_cache_hits_total", "Total cache hits");
+    describe_counter!("sgl_router_cache_misses_total", "Total cache misses");
+    describe_gauge!(
+        "sgl_router_tree_size",
+        "Current tree size for cache-aware routing"
+    );
+    describe_counter!(
+        "sgl_router_load_balancing_events_total",
+        "Total load balancing trigger events"
+    );
+    describe_gauge!("sgl_router_max_load", "Maximum worker load");
+    describe_gauge!("sgl_router_min_load", "Minimum worker load");
+
+    // PD-specific metrics
+    describe_counter!("sgl_router_pd_requests_total", "Total PD requests by route");
+    describe_counter!(
+        "sgl_router_pd_prefill_requests_total",
+        "Total prefill requests per worker"
+    );
+    describe_counter!(
+        "sgl_router_pd_decode_requests_total",
+        "Total decode requests per worker"
+    );
+    describe_counter!(
+        "sgl_router_pd_errors_total",
+        "Total PD errors by error type"
+    );
+    describe_counter!(
+        "sgl_router_pd_prefill_errors_total",
+        "Total prefill server errors"
+    );
+    describe_counter!(
+        "sgl_router_pd_decode_errors_total",
+        "Total decode server errors"
+    );
+    describe_counter!(
+        "sgl_router_pd_stream_errors_total",
+        "Total streaming errors per worker"
+    );
+    describe_histogram!(
+        "sgl_router_pd_request_duration_seconds",
+        "PD request duration by route"
+    );
+
+    // Service discovery metrics
+    describe_counter!(
+        "sgl_router_discovery_updates_total",
+        "Total service discovery update events"
+    );
+    describe_gauge!(
+        "sgl_router_discovery_workers_added",
+        "Number of workers added in last discovery update"
+    );
+    describe_gauge!(
+        "sgl_router_discovery_workers_removed",
+        "Number of workers removed in last discovery update"
+    );
+
+    // Generate request specific metrics
+    describe_histogram!(
+        "sgl_router_generate_duration_seconds",
+        "Generate request duration"
+    );
+
+    // Running requests gauge for cache-aware policy
+    describe_gauge!(
+        "sgl_router_running_requests",
+        "Number of running requests per worker"
+    );
+
+    // Tokenizer metrics
+    describe_histogram!(
+        "sgl_tokenizer_encode_duration_seconds",
+        "Time to encode text to tokens"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_decode_duration_seconds",
+        "Time to decode tokens to text"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_encode_batch_duration_seconds",
+        "Time to encode a batch of texts"
+    );
+    describe_counter!(
+        "sgl_tokenizer_encode_requests_total",
+        "Total number of encode requests by tokenizer type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_decode_requests_total",
+        "Total number of decode requests by tokenizer type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_encode_errors_total",
+        "Total number of encode errors by error type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_decode_errors_total",
+        "Total number of decode errors by error type"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_tokens_per_encode",
+        "Number of tokens produced per encode operation"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_chars_per_encode",
+        "Number of characters in input text per encode"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_tokens_per_decode",
+        "Number of tokens decoded per operation"
+    );
+    describe_gauge!(
+        "sgl_tokenizer_vocab_size",
+        "Vocabulary size of the loaded tokenizer"
+    );
+
+    // Stop sequence detection metrics
+    describe_counter!(
+        "sgl_tokenizer_stop_sequences_detected_total",
+        "Total stop sequences detected by type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_partial_matches_total",
+        "Total partial stop sequence matches (jailed text)"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_stop_detection_duration_seconds",
+        "Time to check for stop sequences per token"
+    );
+
+    // Streaming decode metrics
+    describe_counter!(
+        "sgl_tokenizer_stream_tokens_total",
+        "Total tokens processed in streaming decode"
+    );
+    describe_counter!(
+        "sgl_tokenizer_stream_incomplete_utf8_total",
+        "Total incomplete UTF-8 sequences detected"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_stream_step_duration_seconds",
+        "Time per streaming decode step"
+    );
+
+    // Factory metrics
+    describe_counter!(
+        "sgl_tokenizer_factory_loads_total",
+        "Total tokenizer loads by file type"
+    );
+    describe_counter!(
+        "sgl_tokenizer_factory_errors_total",
+        "Total tokenizer loading errors by type"
+    );
+    describe_histogram!(
+        "sgl_tokenizer_factory_load_duration_seconds",
+        "Time to load and initialize tokenizer"
+    );
+}
+
+pub fn start_prometheus(config: PrometheusConfig) {
+    // Initialize metric descriptions
+    init_metrics();
+
+    let duration_matcher = Matcher::Suffix(String::from("duration_seconds"));
+    let duration_bucket = [
+        0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 15.0, 30.0, 45.0,
+        60.0, 90.0, 120.0, 180.0, 240.0,
+    ];
+
+    let ip_addr: IpAddr = config
+        .host
+        .parse()
+        .unwrap_or(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)));
+    let socket_addr = SocketAddr::new(ip_addr, config.port);
+
+    PrometheusBuilder::new()
+        .with_http_listener(socket_addr)
+        .upkeep_timeout(Duration::from_secs(5 * 60))
+        .set_buckets_for_metric(duration_matcher, &duration_bucket)
+        .expect("failed to set duration bucket")
+        .install()
+        .expect("failed to install Prometheus metrics exporter");
+}
+
+pub struct RouterMetrics;
+
+pub struct TokenizerMetrics;
+
+impl RouterMetrics {
+    // Request metrics
+    pub fn record_request(route: &str) {
+        counter!("sgl_router_requests_total",
+            "route" => route.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_request_duration(route: &str, duration: Duration) {
+        histogram!("sgl_router_request_duration_seconds",
+            "route" => route.to_string()
+        )
+        .record(duration.as_secs_f64());
+    }
+
+    pub fn record_request_error(route: &str, error_type: &str) {
+        counter!("sgl_router_request_errors_total",
+            "route" => route.to_string(),
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_retry(route: &str) {
+        counter!("sgl_router_retries_total",
+            "route" => route.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_retry_backoff_duration(duration: Duration, attempt: u32) {
+        histogram!("sgl_router_retry_backoff_duration_seconds",
+            "attempt" => attempt.to_string()
+        )
+        .record(duration.as_secs_f64());
+    }
+
+    pub fn record_retries_exhausted(route: &str) {
+        counter!("sgl_router_retries_exhausted_total",
+            "route" => route.to_string()
+        )
+        .increment(1);
+    }
+
+    // Worker metrics
+    pub fn set_active_workers(count: usize) {
+        gauge!("sgl_router_active_workers").set(count as f64);
+    }
+
+    pub fn set_worker_health(worker_url: &str, healthy: bool) {
+        gauge!("sgl_router_worker_health",
+            "worker" => worker_url.to_string()
+        )
+        .set(if healthy { 1.0 } else { 0.0 });
+    }
+
+    pub fn set_worker_load(worker_url: &str, load: usize) {
+        gauge!("sgl_router_worker_load",
+            "worker" => worker_url.to_string()
+        )
+        .set(load as f64);
+    }
+
+    pub fn record_processed_request(worker_url: &str) {
+        counter!("sgl_router_processed_requests_total",
+            "worker" => worker_url.to_string()
+        )
+        .increment(1);
+    }
+
+    // Policy metrics
+    pub fn record_policy_decision(policy: &str, worker: &str) {
+        counter!("sgl_router_policy_decisions_total",
+            "policy" => policy.to_string(),
+            "worker" => worker.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_cache_hit() {
+        counter!("sgl_router_cache_hits_total").increment(1);
+    }
+
+    pub fn record_cache_miss() {
+        counter!("sgl_router_cache_misses_total").increment(1);
+    }
+
+    pub fn set_tree_size(worker: &str, size: usize) {
+        gauge!("sgl_router_tree_size",
+            "worker" => worker.to_string()
+        )
+        .set(size as f64);
+    }
+
+    pub fn record_load_balancing_event() {
+        counter!("sgl_router_load_balancing_events_total").increment(1);
+    }
+
+    pub fn set_load_range(max_load: usize, min_load: usize) {
+        gauge!("sgl_router_max_load").set(max_load as f64);
+        gauge!("sgl_router_min_load").set(min_load as f64);
+    }
+
+    // PD-specific metrics
+    pub fn record_pd_request(route: &str) {
+        counter!("sgl_router_pd_requests_total",
+            "route" => route.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_pd_request_duration(route: &str, duration: Duration) {
+        histogram!("sgl_router_pd_request_duration_seconds",
+            "route" => route.to_string()
+        )
+        .record(duration.as_secs_f64());
+    }
+
+    pub fn record_pd_prefill_request(worker: &str) {
+        counter!("sgl_router_pd_prefill_requests_total",
+            "worker" => worker.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_pd_decode_request(worker: &str) {
+        counter!("sgl_router_pd_decode_requests_total",
+            "worker" => worker.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_pd_error(error_type: &str) {
+        counter!("sgl_router_pd_errors_total",
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_pd_prefill_error(worker: &str) {
+        counter!("sgl_router_pd_prefill_errors_total",
+            "worker" => worker.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_pd_decode_error(worker: &str) {
+        counter!("sgl_router_pd_decode_errors_total",
+            "worker" => worker.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_pd_stream_error(worker: &str) {
+        counter!("sgl_router_pd_stream_errors_total",
+            "worker" => worker.to_string()
+        )
+        .increment(1);
+    }
+
+    // Service discovery metrics
+    pub fn record_discovery_update(added: usize, removed: usize) {
+        counter!("sgl_router_discovery_updates_total").increment(1);
+        gauge!("sgl_router_discovery_workers_added").set(added as f64);
+        gauge!("sgl_router_discovery_workers_removed").set(removed as f64);
+    }
+
+    // Generate request metrics
+    pub fn record_generate_duration(duration: Duration) {
+        histogram!("sgl_router_generate_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    // Running requests for cache-aware policy
+    pub fn set_running_requests(worker: &str, count: usize) {
+        gauge!("sgl_router_running_requests",
+            "worker" => worker.to_string()
+        )
+        .set(count as f64);
+    }
+
+    // Circuit breaker metrics
+    pub fn set_cb_state(worker: &str, state_code: u8) {
+        gauge!("sgl_router_cb_state",
+            "worker" => worker.to_string()
+        )
+        .set(state_code as f64);
+    }
+
+    pub fn record_cb_state_transition(worker: &str, from: &str, to: &str) {
+        counter!("sgl_router_cb_state_transitions_total",
+            "worker" => worker.to_string(),
+            "from" => from.to_string(),
+            "to" => to.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_cb_outcome(worker: &str, outcome: &str) {
+        counter!("sgl_router_cb_outcomes_total",
+            "worker" => worker.to_string(),
+            "outcome" => outcome.to_string()
+        )
+        .increment(1);
+    }
+}
+
+impl TokenizerMetrics {
+    // Encoding metrics
+    pub fn record_encode_request(tokenizer_type: &str) {
+        counter!("sgl_tokenizer_encode_requests_total",
+            "tokenizer_type" => tokenizer_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_encode_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_encode_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    pub fn record_encode_error(error_type: &str) {
+        counter!("sgl_tokenizer_encode_errors_total",
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_tokens_per_encode(token_count: usize) {
+        histogram!("sgl_tokenizer_tokens_per_encode").record(token_count as f64);
+    }
+
+    pub fn record_chars_per_encode(char_count: usize) {
+        histogram!("sgl_tokenizer_chars_per_encode").record(char_count as f64);
+    }
+
+    // Decoding metrics
+    pub fn record_decode_request(tokenizer_type: &str) {
+        counter!("sgl_tokenizer_decode_requests_total",
+            "tokenizer_type" => tokenizer_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_decode_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_decode_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    pub fn record_decode_error(error_type: &str) {
+        counter!("sgl_tokenizer_decode_errors_total",
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_tokens_per_decode(token_count: usize) {
+        histogram!("sgl_tokenizer_tokens_per_decode").record(token_count as f64);
+    }
+
+    // Batch encoding metrics
+    pub fn record_encode_batch_duration(duration: Duration, batch_size: usize) {
+        histogram!("sgl_tokenizer_encode_batch_duration_seconds",
+            "batch_size" => batch_size.to_string()
+        )
+        .record(duration.as_secs_f64());
+    }
+
+    // Stop sequence detection metrics
+    pub fn record_stop_sequence_detected(stop_type: &str) {
+        counter!("sgl_tokenizer_stop_sequences_detected_total",
+            "type" => stop_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_partial_match() {
+        counter!("sgl_tokenizer_partial_matches_total").increment(1);
+    }
+
+    pub fn record_stop_detection_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_stop_detection_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    // Streaming decode metrics
+    pub fn record_stream_token() {
+        counter!("sgl_tokenizer_stream_tokens_total").increment(1);
+    }
+
+    pub fn record_incomplete_utf8() {
+        counter!("sgl_tokenizer_stream_incomplete_utf8_total").increment(1);
+    }
+
+    pub fn record_stream_step_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_stream_step_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    // Factory metrics
+    pub fn record_factory_load(file_type: &str) {
+        counter!("sgl_tokenizer_factory_loads_total",
+            "file_type" => file_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_factory_error(error_type: &str) {
+        counter!("sgl_tokenizer_factory_errors_total",
+            "error_type" => error_type.to_string()
+        )
+        .increment(1);
+    }
+
+    pub fn record_factory_load_duration(duration: Duration) {
+        histogram!("sgl_tokenizer_factory_load_duration_seconds").record(duration.as_secs_f64());
+    }
+
+    // Vocabulary metrics
+    pub fn set_vocab_size(tokenizer_type: &str, size: usize) {
+        gauge!("sgl_tokenizer_vocab_size",
+            "tokenizer_type" => tokenizer_type.to_string()
+        )
+        .set(size as f64);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::net::TcpListener;
+
+    // ============= PrometheusConfig Tests =============
+
+    #[test]
+    fn test_prometheus_config_default() {
+        let config = PrometheusConfig::default();
+        assert_eq!(config.port, 29000);
+        assert_eq!(config.host, "0.0.0.0");
+    }
+
+    #[test]
+    fn test_prometheus_config_custom() {
+        let config = PrometheusConfig {
+            port: 8080,
+            host: "127.0.0.1".to_string(),
+        };
+        assert_eq!(config.port, 8080);
+        assert_eq!(config.host, "127.0.0.1");
+    }
+
+    #[test]
+    fn test_prometheus_config_clone() {
+        let config = PrometheusConfig {
+            port: 9090,
+            host: "192.168.1.1".to_string(),
+        };
+        let cloned = config.clone();
+        assert_eq!(cloned.port, config.port);
+        assert_eq!(cloned.host, config.host);
+    }
+
+    // ============= IP Address Parsing Tests =============
+
+    #[test]
+    fn test_valid_ipv4_parsing() {
+        let test_cases = vec!["127.0.0.1", "192.168.1.1", "0.0.0.0"];
+
+        for ip_str in test_cases {
+            let config = PrometheusConfig {
+                port: 29000,
+                host: ip_str.to_string(),
+            };
+
+            let ip_addr: IpAddr = config.host.parse().unwrap();
+            assert!(matches!(ip_addr, IpAddr::V4(_)));
+        }
+    }
+
+    #[test]
+    fn test_valid_ipv6_parsing() {
+        let test_cases = vec!["::1", "2001:db8::1", "::"];
+
+        for ip_str in test_cases {
+            let config = PrometheusConfig {
+                port: 29000,
+                host: ip_str.to_string(),
+            };
+
+            let ip_addr: IpAddr = config.host.parse().unwrap();
+            assert!(matches!(ip_addr, IpAddr::V6(_)));
+        }
+    }
+
+    #[test]
+    fn test_invalid_ip_parsing() {
+        let test_cases = vec!["invalid", "256.256.256.256", "hostname"];
+
+        for ip_str in test_cases {
+            let config = PrometheusConfig {
+                port: 29000,
+                host: ip_str.to_string(),
+            };
+
+            let ip_addr: IpAddr = config
+                .host
+                .parse()
+                .unwrap_or(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)));
+
+            // Should fall back to 0.0.0.0
+            assert_eq!(ip_addr, IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)));
+        }
+    }
+
+    // ============= Socket Address Creation Tests =============
+
+    #[test]
+    fn test_socket_addr_creation() {
+        let test_cases = vec![("127.0.0.1", 8080), ("0.0.0.0", 29000), ("::1", 9090)];
+
+        for (host, port) in test_cases {
+            let config = PrometheusConfig {
+                port,
+                host: host.to_string(),
+            };
+
+            let ip_addr: IpAddr = config.host.parse().unwrap();
+            let socket_addr = SocketAddr::new(ip_addr, config.port);
+
+            assert_eq!(socket_addr.port(), port);
+            assert_eq!(socket_addr.ip().to_string(), host);
+        }
+    }
+
+    #[test]
+    fn test_socket_addr_with_different_ports() {
+        let ports = vec![0, 80, 8080, 65535];
+
+        for port in ports {
+            let config = PrometheusConfig {
+                port,
+                host: "127.0.0.1".to_string(),
+            };
+
+            let ip_addr: IpAddr = config.host.parse().unwrap();
+            let socket_addr = SocketAddr::new(ip_addr, config.port);
+
+            assert_eq!(socket_addr.port(), port);
+        }
+    }
+
+    // ============= Duration Bucket Tests =============
+
+    #[test]
+    fn test_duration_bucket_coverage() {
+        let test_cases: [(f64, &str); 7] = [
+            (0.0005, "sub-millisecond"),
+            (0.005, "5ms"),
+            (0.05, "50ms"),
+            (1.0, "1s"),
+            (10.0, "10s"),
+            (60.0, "1m"),
+            (240.0, "4m"),
+        ];
+
+        let buckets: [f64; 20] = [
+            0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 15.0, 30.0, 45.0,
+            60.0, 90.0, 120.0, 180.0, 240.0,
+        ];
+
+        for (duration, label) in test_cases {
+            let bucket_found = buckets
+                .iter()
+                .any(|&b| (b - duration).abs() < 0.0001 || b > duration);
+            assert!(bucket_found, "No bucket found for {} ({})", duration, label);
+        }
+    }
+
+    // ============= Matcher Configuration Tests =============
+
+    #[test]
+    fn test_duration_suffix_matcher() {
+        let matcher = Matcher::Suffix(String::from("duration_seconds"));
+
+        // Test matching behavior
+        let _matching_metrics = [
+            "request_duration_seconds",
+            "response_duration_seconds",
+            "sgl_router_request_duration_seconds",
+        ];
+
+        let _non_matching_metrics = ["duration_total", "duration_seconds_total", "other_metric"];
+
+        // Note: We can't directly test Matcher matching without the internals,
+        // but we can verify the matcher is created correctly
+        match matcher {
+            Matcher::Suffix(suffix) => assert_eq!(suffix, "duration_seconds"),
+            _ => panic!("Expected Suffix matcher"),
+        }
+    }
+
+    // ============= Builder Configuration Tests =============
+
+    #[test]
+    fn test_prometheus_builder_configuration() {
+        // This test verifies the builder configuration without actually starting Prometheus
+        let _config = PrometheusConfig::default();
+
+        let duration_matcher = Matcher::Suffix(String::from("duration_seconds"));
+        let duration_bucket = [
+            0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 15.0, 30.0, 45.0,
+            60.0, 90.0, 120.0, 180.0, 240.0,
+        ];
+
+        // Verify bucket configuration
+        assert_eq!(duration_bucket.len(), 20);
+
+        // Verify matcher is suffix type
+        match duration_matcher {
+            Matcher::Suffix(s) => assert_eq!(s, "duration_seconds"),
+            _ => panic!("Expected Suffix matcher"),
+        }
+    }
+
+    // ============= Upkeep Timeout Tests =============
+
+    #[test]
+    fn test_upkeep_timeout_duration() {
+        let timeout = Duration::from_secs(5 * 60);
+        assert_eq!(timeout.as_secs(), 300);
+    }
+
+    // ============= Custom Bucket Tests =============
+
+    #[test]
+    fn test_custom_buckets_for_different_metrics() {
+        // Test that we can create different bucket configurations
+        let request_buckets = [0.001, 0.01, 0.1, 1.0, 10.0];
+        let generate_buckets = [0.1, 0.5, 1.0, 5.0, 30.0, 60.0];
+
+        assert_eq!(request_buckets.len(), 5);
+        assert_eq!(generate_buckets.len(), 6);
+
+        // Verify each set is sorted
+        for i in 1..request_buckets.len() {
+            assert!(request_buckets[i] > request_buckets[i - 1]);
+        }
+
+        for i in 1..generate_buckets.len() {
+            assert!(generate_buckets[i] > generate_buckets[i - 1]);
+        }
+    }
+
+    // ============= RouterMetrics Tests =============
+
+    #[test]
+    fn test_metrics_static_methods() {
+        // Test that all static methods can be called without panic
+        RouterMetrics::record_request("/generate");
+        RouterMetrics::record_request_duration("/generate", Duration::from_millis(100));
+        RouterMetrics::record_request_error("/generate", "timeout");
+        RouterMetrics::record_retry("/generate");
+
+        RouterMetrics::set_active_workers(5);
+        RouterMetrics::set_worker_health("http://worker1", true);
+        RouterMetrics::set_worker_load("http://worker1", 10);
+        RouterMetrics::record_processed_request("http://worker1");
+
+        RouterMetrics::record_policy_decision("random", "http://worker1");
+        RouterMetrics::record_cache_hit();
+        RouterMetrics::record_cache_miss();
+        RouterMetrics::set_tree_size("http://worker1", 1000);
+        RouterMetrics::record_load_balancing_event();
+        RouterMetrics::set_load_range(20, 5);
+
+        RouterMetrics::record_pd_request("/v1/chat/completions");
+        RouterMetrics::record_pd_request_duration("/v1/chat/completions", Duration::from_secs(1));
+        RouterMetrics::record_pd_prefill_request("http://prefill1");
+        RouterMetrics::record_pd_decode_request("http://decode1");
+        RouterMetrics::record_pd_error("invalid_request");
+        RouterMetrics::record_pd_prefill_error("http://prefill1");
+        RouterMetrics::record_pd_decode_error("http://decode1");
+        RouterMetrics::record_pd_stream_error("http://decode1");
+
+        RouterMetrics::record_discovery_update(3, 1);
+        RouterMetrics::record_generate_duration(Duration::from_secs(2));
+        RouterMetrics::set_running_requests("http://worker1", 15);
+    }
+
+    #[test]
+    fn test_tokenizer_metrics_static_methods() {
+        // Test that all tokenizer metric methods can be called without panic
+
+        // Encoding metrics
+        TokenizerMetrics::record_encode_request("huggingface");
+        TokenizerMetrics::record_encode_duration(Duration::from_millis(10));
+        TokenizerMetrics::record_encode_error("invalid_input");
+        TokenizerMetrics::record_tokens_per_encode(100);
+        TokenizerMetrics::record_chars_per_encode(500);
+
+        // Decoding metrics
+        TokenizerMetrics::record_decode_request("huggingface");
+        TokenizerMetrics::record_decode_duration(Duration::from_millis(5));
+        TokenizerMetrics::record_decode_error("invalid_tokens");
+        TokenizerMetrics::record_tokens_per_decode(50);
+
+        // Batch encoding
+        TokenizerMetrics::record_encode_batch_duration(Duration::from_millis(100), 10);
+
+        // Stop sequence detection
+        TokenizerMetrics::record_stop_sequence_detected("token");
+        TokenizerMetrics::record_stop_sequence_detected("string");
+        TokenizerMetrics::record_partial_match();
+        TokenizerMetrics::record_stop_detection_duration(Duration::from_micros(100));
+
+        // Streaming decode
+        TokenizerMetrics::record_stream_token();
+        TokenizerMetrics::record_incomplete_utf8();
+        TokenizerMetrics::record_stream_step_duration(Duration::from_micros(50));
+
+        // Factory metrics
+        TokenizerMetrics::record_factory_load("json");
+        TokenizerMetrics::record_factory_error("unsupported_format");
+        TokenizerMetrics::record_factory_load_duration(Duration::from_millis(200));
+
+        // Vocabulary metrics
+        TokenizerMetrics::set_vocab_size("huggingface", 50000);
+    }
+
+    // ============= Port Availability Tests =============
+
+    #[test]
+    fn test_port_already_in_use() {
+        // Skip this test if we can't bind to the port
+        let port = 29123; // Use a different port to avoid conflicts
+
+        if let Ok(_listener) = TcpListener::bind(("127.0.0.1", port)) {
+            // Port is available, we can test
+            let config = PrometheusConfig {
+                port,
+                host: "127.0.0.1".to_string(),
+            };
+
+            // Just verify config is created correctly
+            assert_eq!(config.port, port);
+        }
+    }
+
+    // ============= Integration Test Helpers =============
+
+    #[test]
+    fn test_metrics_endpoint_accessibility() {
+        // This would be an integration test in practice
+        // Here we just verify the configuration
+        let config = PrometheusConfig {
+            port: 29000,
+            host: "127.0.0.1".to_string(),
+        };
+
+        let ip_addr: IpAddr = config.host.parse().unwrap();
+        let socket_addr = SocketAddr::new(ip_addr, config.port);
+
+        assert_eq!(socket_addr.to_string(), "127.0.0.1:29000");
+    }
+
+    #[test]
+    fn test_concurrent_metric_updates() {
+        // Test that metric updates can be called concurrently
+        use std::sync::atomic::{AtomicBool, Ordering};
+        use std::sync::Arc;
+        use std::thread;
+
+        let done = Arc::new(AtomicBool::new(false));
+        let mut handles = vec![];
+
+        for i in 0..3 {
+            let done_clone = done.clone();
+            let handle = thread::spawn(move || {
+                let worker = format!("http://worker{}", i);
+                while !done_clone.load(Ordering::Relaxed) {
+                    RouterMetrics::set_worker_load(&worker, i * 10);
+                    RouterMetrics::record_processed_request(&worker);
+                    thread::sleep(Duration::from_millis(1));
+                }
+            });
+            handles.push(handle);
+        }
+
+        // Let threads run briefly
+        thread::sleep(Duration::from_millis(10));
+        done.store(true, Ordering::Relaxed);
+
+        // Wait for all threads
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+
+    // ============= Edge Cases Tests =============
+
+    #[test]
+    fn test_empty_string_metrics() {
+        // Test that empty strings don't cause issues
+        RouterMetrics::record_request("");
+        RouterMetrics::set_worker_health("", true);
+        RouterMetrics::record_policy_decision("", "");
+    }
+
+    #[test]
+    fn test_very_long_metric_labels() {
+        let long_label = "a".repeat(1000);
+
+        RouterMetrics::record_request(&long_label);
+        RouterMetrics::set_worker_health(&long_label, false);
+    }
+
+    #[test]
+    fn test_special_characters_in_labels() {
+        let special_labels = [
+            "test/with/slashes",
+            "test-with-dashes",
+            "test_with_underscores",
+            "test.with.dots",
+            "test:with:colons",
+        ];
+
+        for label in special_labels {
+            RouterMetrics::record_request(label);
+            RouterMetrics::set_worker_health(label, true);
+        }
+    }
+
+    #[test]
+    fn test_extreme_metric_values() {
+        // Test extreme values
+        RouterMetrics::set_active_workers(0);
+        RouterMetrics::set_active_workers(usize::MAX);
+
+        RouterMetrics::set_worker_load("worker", 0);
+        RouterMetrics::set_worker_load("worker", usize::MAX);
+
+        RouterMetrics::record_request_duration("route", Duration::from_nanos(1));
+        // 24 hours
+        RouterMetrics::record_request_duration("route", Duration::from_secs(86400));
+    }
+}
diff --git a/sgl-router/src/middleware.rs b/sgl-router/src/middleware.rs
new file mode 100644
index 000000000..abe137572
--- /dev/null
+++ b/sgl-router/src/middleware.rs
@@ -0,0 +1,502 @@
+use axum::{
+    extract::Request, extract::State, http::HeaderValue, http::StatusCode, middleware::Next,
+    response::IntoResponse, response::Response,
+};
+use rand::Rng;
+use std::sync::Arc;
+use std::time::Duration;
+use std::time::Instant;
+use tokio::sync::{mpsc, oneshot};
+use tower::{Layer, Service};
+use tower_http::trace::{MakeSpan, OnRequest, OnResponse, TraceLayer};
+use tracing::{debug, error, field::Empty, info, info_span, warn, Span};
+
+pub use crate::core::token_bucket::TokenBucket;
+
+use crate::server::AppState;
+
+/// Generate OpenAI-compatible request ID based on endpoint
+fn generate_request_id(path: &str) -> String {
+    let prefix = if path.contains("/chat/completions") {
+        "chatcmpl-"
+    } else if path.contains("/completions") {
+        "cmpl-"
+    } else if path.contains("/generate") {
+        "gnt-"
+    } else {
+        "req-"
+    };
+
+    // Generate a random string similar to OpenAI's format
+    let chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+    let mut rng = rand::rng();
+    let random_part: String = (0..24)
+        .map(|_| {
+            let idx = rng.random_range(0..chars.len());
+            chars.chars().nth(idx).unwrap()
+        })
+        .collect();
+
+    format!("{}{}", prefix, random_part)
+}
+
+/// Extension type for storing request ID
+#[derive(Clone, Debug)]
+pub struct RequestId(pub String);
+
+/// Tower Layer for request ID middleware
+#[derive(Clone)]
+pub struct RequestIdLayer {
+    headers: Arc<Vec<String>>,
+}
+
+impl RequestIdLayer {
+    pub fn new(headers: Vec<String>) -> Self {
+        Self {
+            headers: Arc::new(headers),
+        }
+    }
+}
+
+impl<S> Layer<S> for RequestIdLayer {
+    type Service = RequestIdMiddleware<S>;
+
+    fn layer(&self, inner: S) -> Self::Service {
+        RequestIdMiddleware {
+            inner,
+            headers: self.headers.clone(),
+        }
+    }
+}
+
+/// Tower Service for request ID middleware
+#[derive(Clone)]
+pub struct RequestIdMiddleware<S> {
+    inner: S,
+    headers: Arc<Vec<String>>,
+}
+
+impl<S> Service<Request> for RequestIdMiddleware<S>
+where
+    S: Service<Request, Response = Response> + Send + 'static,
+    S::Future: Send + 'static,
+{
+    type Response = S::Response;
+    type Error = S::Error;
+    type Future = std::pin::Pin<
+        Box<dyn std::future::Future<Output = Result<Self::Response, Self::Error>> + Send>,
+    >;
+
+    fn poll_ready(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Result<(), Self::Error>> {
+        self.inner.poll_ready(cx)
+    }
+
+    fn call(&mut self, mut req: Request) -> Self::Future {
+        let headers = self.headers.clone();
+
+        // Extract request ID from headers or generate new one
+        let mut request_id = None;
+
+        for header_name in headers.iter() {
+            if let Some(header_value) = req.headers().get(header_name) {
+                if let Ok(value) = header_value.to_str() {
+                    request_id = Some(value.to_string());
+                    break;
+                }
+            }
+        }
+
+        let request_id = request_id.unwrap_or_else(|| generate_request_id(req.uri().path()));
+
+        // Insert request ID into request extensions
+        req.extensions_mut().insert(RequestId(request_id.clone()));
+
+        // Create a span with the request ID for this request
+        let span = tracing::info_span!(
+            "http_request",
+            method = %req.method(),
+            uri = %req.uri(),
+            version = ?req.version(),
+            request_id = %request_id
+        );
+
+        // Log within the span
+        let _enter = span.enter();
+        tracing::info!(
+            target: "sglang_router_rs::request",
+            "started processing request"
+        );
+        drop(_enter);
+
+        // Capture values we need in the async block
+        let method = req.method().clone();
+        let uri = req.uri().clone();
+        let version = req.version();
+
+        // Call the inner service
+        let future = self.inner.call(req);
+
+        Box::pin(async move {
+            let start_time = Instant::now();
+            let mut response = future.await?;
+            let latency = start_time.elapsed();
+
+            // Add request ID to response headers
+            response.headers_mut().insert(
+                "x-request-id",
+                HeaderValue::from_str(&request_id)
+                    .unwrap_or_else(|_| HeaderValue::from_static("invalid-request-id")),
+            );
+
+            // Log the response with proper request ID in span
+            let status = response.status();
+            let span = tracing::info_span!(
+                "http_request",
+                method = %method,
+                uri = %uri,
+                version = ?version,
+                request_id = %request_id,
+                status = %status,
+                latency = ?latency
+            );
+
+            let _enter = span.enter();
+            if status.is_server_error() {
+                tracing::error!(
+                    target: "sglang_router_rs::response",
+                    "request failed with server error"
+                );
+            } else if status.is_client_error() {
+                tracing::warn!(
+                    target: "sglang_router_rs::response",
+                    "request failed with client error"
+                );
+            } else {
+                tracing::info!(
+                    target: "sglang_router_rs::response",
+                    "finished processing request"
+                );
+            }
+
+            Ok(response)
+        })
+    }
+}
+
+// ============= Logging Middleware =============
+
+/// Custom span maker that includes request ID
+#[derive(Clone, Debug)]
+pub struct RequestSpan;
+
+impl<B> MakeSpan<B> for RequestSpan {
+    fn make_span(&mut self, request: &Request<B>) -> Span {
+        // Don't try to extract request ID here - it won't be available yet
+        // The RequestIdLayer runs after TraceLayer creates the span
+        info_span!(
+            "http_request",
+            method = %request.method(),
+            uri = %request.uri(),
+            version = ?request.version(),
+            request_id = Empty,  // Will be set later
+            status_code = Empty,
+            latency = Empty,
+            error = Empty,
+        )
+    }
+}
+
+/// Custom on_request handler
+#[derive(Clone, Debug)]
+pub struct RequestLogger;
+
+impl<B> OnRequest<B> for RequestLogger {
+    fn on_request(&mut self, request: &Request<B>, span: &Span) {
+        let _enter = span.enter();
+
+        // Try to get the request ID from extensions
+        // This will work if RequestIdLayer has already run
+        if let Some(request_id) = request.extensions().get::<RequestId>() {
+            span.record("request_id", request_id.0.as_str());
+        }
+
+        // Don't log here - we already log in RequestIdService with the proper request_id
+    }
+}
+
+/// Custom on_response handler
+#[derive(Clone, Debug)]
+pub struct ResponseLogger {
+    _start_time: Instant,
+}
+
+impl Default for ResponseLogger {
+    fn default() -> Self {
+        Self {
+            _start_time: Instant::now(),
+        }
+    }
+}
+
+impl<B> OnResponse<B> for ResponseLogger {
+    fn on_response(self, response: &Response<B>, latency: std::time::Duration, span: &Span) {
+        let status = response.status();
+
+        // Record these in the span for structured logging/observability tools
+        span.record("status_code", status.as_u16());
+        span.record("latency", format!("{:?}", latency));
+
+        // Don't log here - RequestIdService handles all logging with proper request IDs
+    }
+}
+
+/// Create a configured TraceLayer for HTTP logging
+/// Note: Actual request/response logging with request IDs is done in RequestIdService
+pub fn create_logging_layer() -> TraceLayer<
+    tower_http::classify::SharedClassifier<tower_http::classify::ServerErrorsAsFailures>,
+    RequestSpan,
+    RequestLogger,
+    ResponseLogger,
+> {
+    TraceLayer::new_for_http()
+        .make_span_with(RequestSpan)
+        .on_request(RequestLogger)
+        .on_response(ResponseLogger::default())
+}
+
+/// Structured logging data for requests
+#[derive(Debug, serde::Serialize)]
+pub struct RequestLogEntry {
+    pub timestamp: String,
+    pub request_id: String,
+    pub method: String,
+    pub uri: String,
+    pub status: u16,
+    pub latency_ms: u64,
+    pub user_agent: Option<String>,
+    pub remote_addr: Option<String>,
+    pub error: Option<String>,
+}
+
+/// Log a request with structured data
+pub fn log_request(entry: RequestLogEntry) {
+    if entry.status >= 500 {
+        tracing::error!(
+            target: "sglang_router_rs::http",
+            request_id = %entry.request_id,
+            method = %entry.method,
+            uri = %entry.uri,
+            status = entry.status,
+            latency_ms = entry.latency_ms,
+            user_agent = ?entry.user_agent,
+            remote_addr = ?entry.remote_addr,
+            error = ?entry.error,
+            "HTTP request failed"
+        );
+    } else if entry.status >= 400 {
+        tracing::warn!(
+            target: "sglang_router_rs::http",
+            request_id = %entry.request_id,
+            method = %entry.method,
+            uri = %entry.uri,
+            status = entry.status,
+            latency_ms = entry.latency_ms,
+            user_agent = ?entry.user_agent,
+            remote_addr = ?entry.remote_addr,
+            "HTTP request client error"
+        );
+    } else {
+        tracing::info!(
+            target: "sglang_router_rs::http",
+            request_id = %entry.request_id,
+            method = %entry.method,
+            uri = %entry.uri,
+            status = entry.status,
+            latency_ms = entry.latency_ms,
+            user_agent = ?entry.user_agent,
+            remote_addr = ?entry.remote_addr,
+            "HTTP request completed"
+        );
+    }
+}
+
+// ============ Concurrency Limiting with Queue Support ============
+
+/// Request queue entry
+pub struct QueuedRequest {
+    /// Time when the request was queued
+    queued_at: Instant,
+    /// Channel to send the permit back when acquired
+    permit_tx: oneshot::Sender<Result<(), StatusCode>>,
+}
+
+/// Queue metrics for monitoring
+#[derive(Debug, Default)]
+pub struct QueueMetrics {
+    pub total_queued: std::sync::atomic::AtomicU64,
+    pub current_queued: std::sync::atomic::AtomicU64,
+    pub total_timeout: std::sync::atomic::AtomicU64,
+    pub total_rejected: std::sync::atomic::AtomicU64,
+}
+
+/// Queue processor that handles queued requests
+pub struct QueueProcessor {
+    token_bucket: Arc<TokenBucket>,
+    queue_rx: mpsc::Receiver<QueuedRequest>,
+    queue_timeout: Duration,
+}
+
+impl QueueProcessor {
+    pub fn new(
+        token_bucket: Arc<TokenBucket>,
+        queue_rx: mpsc::Receiver<QueuedRequest>,
+        queue_timeout: Duration,
+    ) -> Self {
+        Self {
+            token_bucket,
+            queue_rx,
+            queue_timeout,
+        }
+    }
+
+    pub async fn run(mut self) {
+        info!("Starting concurrency queue processor");
+
+        // Process requests in a single task to reduce overhead
+        while let Some(queued) = self.queue_rx.recv().await {
+            // Check timeout immediately
+            let elapsed = queued.queued_at.elapsed();
+            if elapsed >= self.queue_timeout {
+                warn!("Request already timed out in queue");
+                let _ = queued.permit_tx.send(Err(StatusCode::REQUEST_TIMEOUT));
+                continue;
+            }
+
+            let remaining_timeout = self.queue_timeout - elapsed;
+
+            // Try to acquire token for this request
+            if self.token_bucket.try_acquire(1.0).await.is_ok() {
+                // Got token immediately
+                debug!("Queue: acquired token immediately for queued request");
+                let _ = queued.permit_tx.send(Ok(()));
+            } else {
+                // Need to wait for token
+                let token_bucket = self.token_bucket.clone();
+
+                // Spawn task only when we actually need to wait
+                tokio::spawn(async move {
+                    if token_bucket
+                        .acquire_timeout(1.0, remaining_timeout)
+                        .await
+                        .is_ok()
+                    {
+                        debug!("Queue: acquired token after waiting");
+                        let _ = queued.permit_tx.send(Ok(()));
+                    } else {
+                        warn!("Queue: request timed out waiting for token");
+                        let _ = queued.permit_tx.send(Err(StatusCode::REQUEST_TIMEOUT));
+                    }
+                });
+            }
+        }
+
+        warn!("Concurrency queue processor shutting down");
+    }
+}
+
+/// State for the concurrency limiter
+pub struct ConcurrencyLimiter {
+    pub queue_tx: Option<mpsc::Sender<QueuedRequest>>,
+}
+
+impl ConcurrencyLimiter {
+    /// Create new concurrency limiter with optional queue
+    pub fn new(
+        token_bucket: Arc<TokenBucket>,
+        queue_size: usize,
+        queue_timeout: Duration,
+    ) -> (Self, Option<QueueProcessor>) {
+        if queue_size > 0 {
+            let (queue_tx, queue_rx) = mpsc::channel(queue_size);
+            let processor = QueueProcessor::new(token_bucket, queue_rx, queue_timeout);
+
+            (
+                Self {
+                    queue_tx: Some(queue_tx),
+                },
+                Some(processor),
+            )
+        } else {
+            (Self { queue_tx: None }, None)
+        }
+    }
+}
+
+/// Middleware function for concurrency limiting with optional queuing
+pub async fn concurrency_limit_middleware(
+    State(app_state): State<Arc<AppState>>,
+    request: Request<axum::body::Body>,
+    next: Next,
+) -> Response {
+    let token_bucket = app_state.context.rate_limiter.clone();
+
+    // Try to acquire token immediately
+    if token_bucket.try_acquire(1.0).await.is_ok() {
+        debug!("Acquired token immediately");
+        let response = next.run(request).await;
+
+        // Return the token to the bucket
+        token_bucket.return_tokens(1.0).await;
+
+        response
+    } else {
+        // No tokens available, try to queue if enabled
+        if let Some(queue_tx) = &app_state.concurrency_queue_tx {
+            debug!("No tokens available, attempting to queue request");
+
+            // Create a channel for the token response
+            let (permit_tx, permit_rx) = oneshot::channel();
+
+            let queued = QueuedRequest {
+                queued_at: Instant::now(),
+                permit_tx,
+            };
+
+            // Try to send to queue
+            match queue_tx.try_send(queued) {
+                Ok(_) => {
+                    // Wait for token from queue processor
+                    match permit_rx.await {
+                        Ok(Ok(())) => {
+                            debug!("Acquired token from queue");
+                            let response = next.run(request).await;
+
+                            // Return the token to the bucket
+                            token_bucket.return_tokens(1.0).await;
+
+                            response
+                        }
+                        Ok(Err(status)) => {
+                            warn!("Queue returned error status: {}", status);
+                            status.into_response()
+                        }
+                        Err(_) => {
+                            error!("Queue response channel closed");
+                            StatusCode::INTERNAL_SERVER_ERROR.into_response()
+                        }
+                    }
+                }
+                Err(_) => {
+                    warn!("Request queue is full, returning 429");
+                    StatusCode::TOO_MANY_REQUESTS.into_response()
+                }
+            }
+        } else {
+            warn!("No tokens available and queuing is disabled, returning 429");
+            StatusCode::TOO_MANY_REQUESTS.into_response()
+        }
+    }
+}
diff --git a/sgl-router/src/policies/cache_aware.rs b/sgl-router/src/policies/cache_aware.rs
new file mode 100644
index 000000000..47d95c835
--- /dev/null
+++ b/sgl-router/src/policies/cache_aware.rs
@@ -0,0 +1,423 @@
+/*
+    Cache-Aware Load Balancing Router
+
+    This router combines two strategies to optimize both cache utilization and request distribution:
+
+    1. Cache-Aware Routing (Approximate Tree)
+    2. Load Balancing (Shortest Queue with Balance Thresholds)
+
+    The router dynamically switches between these strategies based on load conditions:
+    - Uses load balancing when the system is imbalanced
+    - Uses cache-aware routing when the system is balanced
+
+    A system is considered imbalanced if both conditions are met:
+    1. (max - min) > abs_threshold
+    2. max > rel_threshold * min
+
+    Strategy Details:
+
+    1. Cache-Aware Routing (Approximate Tree)
+    -------------------------------------------
+    This strategy maintains an approximate radix tree for each worker based on request history,
+    eliminating the need for direct cache state queries. The tree stores raw text characters
+    instead of token IDs to avoid tokenization overhead.
+
+    Process:
+    a. For each request, find the worker with the highest prefix match
+    b. If match rate > cache_threshold:
+    Route to the worker with highest match (likely has relevant data cached)
+    c. If match rate ≤ cache_threshold:
+    Route to the worker with smallest tree size (most available cache capacity)
+    d. Background maintenance:
+    Periodically evict least recently used leaf nodes to prevent memory overflow
+
+    2. Load Balancing (Shortest Queue)
+    -------------------------------------------
+    This strategy tracks pending request counts per worker and routes new requests
+    to the least busy worker when the system is detected to be imbalanced.
+
+    Configuration Parameters:
+    ------------------------
+    1. cache_threshold: (float, 0.0 to 1.0)
+    Minimum prefix match ratio to use highest-match routing.
+    Below this threshold, routes to worker with most available cache space.
+
+    2. balance_abs_threshold: (integer)
+    Absolute difference threshold for load imbalance detection.
+    System is potentially imbalanced if (max_load - min_load) > abs_threshold
+
+    3. balance_rel_threshold: (float)
+    Relative ratio threshold for load imbalance detection.
+    System is potentially imbalanced if max_load > min_load * rel_threshold
+    Used in conjunction with abs_threshold to determine final imbalance state.
+
+    4. eviction_interval_secs: (integer)
+    Interval between LRU eviction cycles for the approximate trees.
+
+    5. max_tree_size: (integer)
+    Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
+    during the next eviction cycle.
+*/
+
+use super::{get_healthy_worker_indices, CacheAwareConfig, LoadBalancingPolicy};
+use crate::core::Worker;
+use crate::metrics::RouterMetrics;
+use crate::tree::Tree;
+use std::sync::{Arc, Mutex};
+use std::thread;
+use std::time::Duration;
+use tracing::debug;
+
+/// Cache-aware routing policy
+///
+/// Routes requests based on cache affinity when load is balanced,
+/// switches to shortest-queue routing when load is imbalanced.
+#[derive(Debug)]
+pub struct CacheAwarePolicy {
+    config: CacheAwareConfig,
+    tree: Arc<Mutex<Tree>>,
+    eviction_handle: Option<thread::JoinHandle<()>>,
+}
+
+impl CacheAwarePolicy {
+    pub fn new() -> Self {
+        Self::with_config(CacheAwareConfig::default())
+    }
+
+    pub fn with_config(config: CacheAwareConfig) -> Self {
+        let tree = Arc::new(Mutex::new(Tree::new()));
+
+        // Start background eviction thread if configured
+        let eviction_handle = if config.eviction_interval_secs > 0 {
+            let tree_clone = Arc::clone(&tree);
+            let max_tree_size = config.max_tree_size;
+            let interval = config.eviction_interval_secs;
+
+            Some(thread::spawn(move || loop {
+                thread::sleep(Duration::from_secs(interval));
+
+                if let Ok(tree_guard) = tree_clone.lock() {
+                    tree_guard.evict_tenant_by_size(max_tree_size);
+                    debug!("Cache eviction completed, max_size: {}", max_tree_size);
+                }
+            }))
+        } else {
+            None
+        };
+
+        Self {
+            config,
+            tree,
+            eviction_handle,
+        }
+    }
+
+    /// Initialize the tree with worker URLs (used only during initial setup)
+    pub fn init_workers(&self, workers: &[Box<dyn Worker>]) {
+        if let Ok(tree) = self.tree.lock() {
+            for worker in workers {
+                tree.insert("", worker.url());
+            }
+        }
+    }
+
+    /// Add a single worker to the tree (incremental update)
+    pub fn add_worker(&self, url: &str) {
+        if let Ok(tree) = self.tree.lock() {
+            tree.insert("", url);
+        }
+    }
+
+    /// Remove a worker from the tree
+    pub fn remove_worker(&self, url: &str) {
+        if let Ok(tree) = self.tree.lock() {
+            tree.remove_tenant(url);
+        }
+    }
+
+    /// Run cache eviction to prevent unbounded growth
+    pub fn evict_cache(&self, max_size: usize) {
+        if let Ok(tree) = self.tree.lock() {
+            tree.evict_tenant_by_size(max_size);
+        }
+    }
+}
+
+impl LoadBalancingPolicy for CacheAwarePolicy {
+    fn select_worker(
+        &self,
+        workers: &[Box<dyn Worker>],
+        request_text: Option<&str>,
+    ) -> Option<usize> {
+        let healthy_indices = get_healthy_worker_indices(workers);
+
+        if healthy_indices.is_empty() {
+            return None;
+        }
+
+        // Get current load statistics
+        let loads: Vec<usize> = workers.iter().map(|w| w.load()).collect();
+        let max_load = *loads.iter().max().unwrap_or(&0);
+        let min_load = *loads.iter().min().unwrap_or(&0);
+
+        // Check if load is imbalanced
+        let is_imbalanced = max_load.saturating_sub(min_load) > self.config.balance_abs_threshold
+            && (max_load as f32) > (min_load as f32 * self.config.balance_rel_threshold);
+
+        if is_imbalanced {
+            // Log load balancing trigger
+            let worker_loads: Vec<(String, usize)> = workers
+                .iter()
+                .map(|w| (w.url().to_string(), w.load()))
+                .collect();
+
+            debug!(
+                "Load balancing triggered | max: {} | min: {} | workers: {:?}",
+                max_load, min_load, worker_loads
+            );
+
+            RouterMetrics::record_load_balancing_event();
+            RouterMetrics::set_load_range(max_load, min_load);
+
+            // Use shortest queue when imbalanced
+            let min_load_idx = healthy_indices
+                .iter()
+                .min_by_key(|&&idx| workers[idx].load())
+                .copied()?;
+
+            // Even in imbalanced mode, update the tree to maintain cache state
+            if let Some(text) = request_text {
+                if let Ok(tree) = self.tree.lock() {
+                    tree.insert(text, workers[min_load_idx].url());
+                }
+            }
+
+            // Increment processed counter
+            workers[min_load_idx].increment_processed();
+            RouterMetrics::record_processed_request(workers[min_load_idx].url());
+            RouterMetrics::record_policy_decision(self.name(), workers[min_load_idx].url());
+
+            return Some(min_load_idx);
+        }
+
+        // Use cache-aware routing when balanced
+        let text = request_text.unwrap_or("");
+
+        if let Ok(tree) = self.tree.lock() {
+            let (matched_text, matched_worker) = tree.prefix_match(text);
+            let match_rate = if text.is_empty() {
+                0.0
+            } else {
+                matched_text.chars().count() as f32 / text.chars().count() as f32
+            };
+
+            let selected_url = if match_rate > self.config.cache_threshold {
+                RouterMetrics::record_cache_hit();
+                matched_worker.to_string()
+            } else {
+                RouterMetrics::record_cache_miss();
+                tree.get_smallest_tenant()
+            };
+
+            // Find the index of the selected worker
+            if let Some(selected_idx) = workers.iter().position(|w| w.url() == selected_url) {
+                // Only proceed if the worker is healthy
+                if workers[selected_idx].is_healthy() {
+                    // Update the tree with this request
+                    tree.insert(text, &selected_url);
+
+                    // Increment processed counter
+                    workers[selected_idx].increment_processed();
+                    RouterMetrics::record_processed_request(&selected_url);
+
+                    return Some(selected_idx);
+                }
+            } else {
+                // Selected worker no longer exists, remove it from tree
+                tree.remove_tenant(&selected_url);
+                debug!("Removed stale worker {} from cache tree", selected_url);
+            }
+
+            // Fallback to first healthy worker
+            return healthy_indices.first().copied();
+        }
+
+        // Fallback to first healthy worker if tree operations fail
+        healthy_indices.first().copied()
+    }
+
+    fn name(&self) -> &'static str {
+        "cache_aware"
+    }
+
+    fn needs_request_text(&self) -> bool {
+        true // Cache-aware policy needs request text for cache affinity
+    }
+
+    fn on_request_complete(&self, worker_url: &str, success: bool) {
+        // Could track success rates per worker for more intelligent routing
+        if !success {
+            // Optionally reduce affinity for failed requests
+            tracing::debug!(
+                "Request to {} completed with success={}",
+                worker_url,
+                success
+            );
+        }
+    }
+
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn select_worker_pair(
+        &self,
+        prefill_workers: &[Box<dyn Worker>],
+        decode_workers: &[Box<dyn Worker>],
+        request_text: Option<&str>,
+    ) -> Option<(usize, usize)> {
+        // DEPRECATED: This method is no longer used when separate policies are configured.
+        // The PD router now uses separate policies for prefill and decode selection.
+        // This implementation remains for backward compatibility when a single policy is used.
+
+        // In PD mode with single policy:
+        // - Prefill: Use cache-aware routing for better cache utilization
+        // - Decode: Use least-load routing for better load distribution
+
+        // Select prefill worker using cache-aware logic
+        let prefill_idx = self.select_worker(prefill_workers, request_text)?;
+
+        // Select decode worker using least-load logic
+        let healthy_decode = get_healthy_worker_indices(decode_workers);
+        if healthy_decode.is_empty() {
+            return None;
+        }
+
+        let decode_idx = healthy_decode
+            .iter()
+            .min_by_key(|&&idx| decode_workers[idx].load())
+            .copied()?;
+
+        Some((prefill_idx, decode_idx))
+    }
+}
+
+impl Default for CacheAwarePolicy {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Drop for CacheAwarePolicy {
+    fn drop(&mut self) {
+        // Note: We can't properly stop the eviction thread since it's in an infinite loop
+        // In a production system, we'd use a channel or atomic flag to signal shutdown
+        if let Some(handle) = self.eviction_handle.take() {
+            // The thread will continue running until the program exits
+            // This is acceptable for now since the router typically runs for the lifetime of the program
+            drop(handle);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::{BasicWorker, WorkerType};
+
+    #[test]
+    fn test_cache_aware_with_balanced_load() {
+        // Create policy without eviction thread for testing
+        let config = CacheAwareConfig {
+            eviction_interval_secs: 0, // Disable eviction thread
+            ..Default::default()
+        };
+        let policy = CacheAwarePolicy::with_config(config);
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // Initialize the policy with workers
+        policy.init_workers(&workers);
+
+        // First request should be distributed
+        let idx1 = policy.select_worker(&workers, Some("hello world")).unwrap();
+
+        // Same request should go to same worker (cache hit)
+        let idx2 = policy.select_worker(&workers, Some("hello world")).unwrap();
+        assert_eq!(idx1, idx2);
+
+        // Similar request should also go to same worker
+        let idx3 = policy.select_worker(&workers, Some("hello")).unwrap();
+        assert_eq!(idx1, idx3);
+    }
+
+    #[test]
+    fn test_cache_aware_with_imbalanced_load() {
+        let policy = CacheAwarePolicy::with_config(CacheAwareConfig {
+            cache_threshold: 0.5,
+            balance_abs_threshold: 5,
+            balance_rel_threshold: 2.0,
+            eviction_interval_secs: 0, // Disable eviction thread
+            max_tree_size: 10000,
+        });
+
+        let worker1 = BasicWorker::new("http://w1:8000".to_string(), WorkerType::Regular);
+        let worker2 = BasicWorker::new("http://w2:8000".to_string(), WorkerType::Regular);
+
+        // Create significant load imbalance
+        for _ in 0..20 {
+            worker1.increment_load();
+        }
+        // worker2 has load 0
+
+        let workers: Vec<Box<dyn Worker>> = vec![Box::new(worker1), Box::new(worker2)];
+        policy.init_workers(&workers);
+
+        // Should select worker2 (lower load) despite cache affinity
+        for _ in 0..5 {
+            let idx = policy.select_worker(&workers, Some("test")).unwrap();
+            assert_eq!(idx, 1); // Should always pick worker2
+        }
+    }
+
+    #[test]
+    fn test_cache_aware_worker_removal() {
+        let config = CacheAwareConfig {
+            eviction_interval_secs: 0, // Disable eviction thread
+            ..Default::default()
+        };
+        let policy = CacheAwarePolicy::with_config(config);
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        policy.init_workers(&workers);
+
+        // Route some requests
+        policy.select_worker(&workers, Some("test1"));
+        policy.select_worker(&workers, Some("test2"));
+
+        // Remove a worker
+        policy.remove_worker("http://w1:8000");
+        workers[0].set_healthy(false);
+
+        // All requests should now go to worker2
+        let idx = policy.select_worker(&workers, Some("test1")).unwrap();
+        assert_eq!(idx, 1);
+    }
+}
diff --git a/sgl-router/src/policies/factory.rs b/sgl-router/src/policies/factory.rs
new file mode 100644
index 000000000..c65785d63
--- /dev/null
+++ b/sgl-router/src/policies/factory.rs
@@ -0,0 +1,94 @@
+//! Factory for creating load balancing policies
+
+use super::{
+    CacheAwareConfig, CacheAwarePolicy, LoadBalancingPolicy, PowerOfTwoPolicy, RandomPolicy,
+    RoundRobinPolicy,
+};
+use crate::config::PolicyConfig;
+use std::sync::Arc;
+
+/// Factory for creating policy instances
+pub struct PolicyFactory;
+
+impl PolicyFactory {
+    /// Create a policy from configuration
+    pub fn create_from_config(config: &PolicyConfig) -> Arc<dyn LoadBalancingPolicy> {
+        match config {
+            PolicyConfig::Random => Arc::new(RandomPolicy::new()),
+            PolicyConfig::RoundRobin => Arc::new(RoundRobinPolicy::new()),
+            PolicyConfig::PowerOfTwo { .. } => Arc::new(PowerOfTwoPolicy::new()),
+            PolicyConfig::CacheAware {
+                cache_threshold,
+                balance_abs_threshold,
+                balance_rel_threshold,
+                eviction_interval_secs,
+                max_tree_size,
+            } => {
+                let config = CacheAwareConfig {
+                    cache_threshold: *cache_threshold,
+                    balance_abs_threshold: *balance_abs_threshold,
+                    balance_rel_threshold: *balance_rel_threshold,
+                    eviction_interval_secs: *eviction_interval_secs,
+                    max_tree_size: *max_tree_size,
+                };
+                Arc::new(CacheAwarePolicy::with_config(config))
+            }
+        }
+    }
+
+    /// Create a policy by name (for dynamic loading)
+    pub fn create_by_name(name: &str) -> Option<Arc<dyn LoadBalancingPolicy>> {
+        match name.to_lowercase().as_str() {
+            "random" => Some(Arc::new(RandomPolicy::new())),
+            "round_robin" | "roundrobin" => Some(Arc::new(RoundRobinPolicy::new())),
+            "power_of_two" | "poweroftwo" => Some(Arc::new(PowerOfTwoPolicy::new())),
+            "cache_aware" | "cacheaware" => Some(Arc::new(CacheAwarePolicy::new())),
+            _ => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_create_from_config() {
+        // Test Random
+        let policy = PolicyFactory::create_from_config(&PolicyConfig::Random);
+        assert_eq!(policy.name(), "random");
+
+        // Test RoundRobin
+        let policy = PolicyFactory::create_from_config(&PolicyConfig::RoundRobin);
+        assert_eq!(policy.name(), "round_robin");
+
+        // Test PowerOfTwo
+        let policy = PolicyFactory::create_from_config(&PolicyConfig::PowerOfTwo {
+            load_check_interval_secs: 60,
+        });
+        assert_eq!(policy.name(), "power_of_two");
+
+        // Test CacheAware
+        let policy = PolicyFactory::create_from_config(&PolicyConfig::CacheAware {
+            cache_threshold: 0.7,
+            balance_abs_threshold: 10,
+            balance_rel_threshold: 1.5,
+            eviction_interval_secs: 30,
+            max_tree_size: 1000,
+        });
+        assert_eq!(policy.name(), "cache_aware");
+    }
+
+    #[test]
+    fn test_create_by_name() {
+        assert!(PolicyFactory::create_by_name("random").is_some());
+        assert!(PolicyFactory::create_by_name("RANDOM").is_some());
+        assert!(PolicyFactory::create_by_name("round_robin").is_some());
+        assert!(PolicyFactory::create_by_name("RoundRobin").is_some());
+        assert!(PolicyFactory::create_by_name("power_of_two").is_some());
+        assert!(PolicyFactory::create_by_name("PowerOfTwo").is_some());
+        assert!(PolicyFactory::create_by_name("cache_aware").is_some());
+        assert!(PolicyFactory::create_by_name("CacheAware").is_some());
+        assert!(PolicyFactory::create_by_name("unknown").is_none());
+    }
+}
diff --git a/sgl-router/src/policies/mod.rs b/sgl-router/src/policies/mod.rs
new file mode 100644
index 000000000..97ce9ca6f
--- /dev/null
+++ b/sgl-router/src/policies/mod.rs
@@ -0,0 +1,148 @@
+//! Load balancing policies for SGLang router
+//!
+//! This module provides a unified abstraction for routing policies that work
+//! across both regular and prefill-decode (PD) routing modes.
+
+use crate::core::Worker;
+use std::fmt::Debug;
+
+mod cache_aware;
+mod factory;
+mod power_of_two;
+mod random;
+mod round_robin;
+
+pub use cache_aware::CacheAwarePolicy;
+pub use factory::PolicyFactory;
+pub use power_of_two::PowerOfTwoPolicy;
+pub use random::RandomPolicy;
+pub use round_robin::RoundRobinPolicy;
+
+/// Core trait for load balancing policies
+///
+/// This trait provides a unified interface for implementing routing algorithms
+/// that can work with both regular single-worker selection and PD dual-worker selection.
+pub trait LoadBalancingPolicy: Send + Sync + Debug {
+    /// Select a single worker from the available workers
+    ///
+    /// This is used for regular routing mode where requests go to a single worker.
+    fn select_worker(
+        &self,
+        workers: &[Box<dyn Worker>],
+        request_text: Option<&str>,
+    ) -> Option<usize>;
+
+    /// Select a pair of workers (prefill and decode) for PD routing
+    ///
+    /// Returns indices of (prefill_worker, decode_worker) from their respective arrays.
+    /// Default implementation uses select_worker for each array independently.
+    fn select_worker_pair(
+        &self,
+        prefill_workers: &[Box<dyn Worker>],
+        decode_workers: &[Box<dyn Worker>],
+        request_text: Option<&str>,
+    ) -> Option<(usize, usize)> {
+        // Default implementation: independently select from each pool
+        let prefill_idx = self.select_worker(prefill_workers, request_text)?;
+        let decode_idx = self.select_worker(decode_workers, request_text)?;
+        Some((prefill_idx, decode_idx))
+    }
+
+    /// Update policy state after request completion
+    ///
+    /// This is called when a request completes (successfully or not) to allow
+    /// policies to update their internal state.
+    fn on_request_complete(&self, _worker_url: &str, _success: bool) {
+        // Default: no-op for stateless policies
+    }
+
+    /// Get policy name for metrics and debugging
+    fn name(&self) -> &'static str;
+
+    /// Check if this policy needs request text for routing decisions
+    fn needs_request_text(&self) -> bool {
+        false // Default: most policies don't need request text
+    }
+
+    /// Update worker load information
+    ///
+    /// This is called periodically with current load information for load-aware policies.
+    fn update_loads(&self, _loads: &std::collections::HashMap<String, isize>) {
+        // Default: no-op for policies that don't use load information
+    }
+
+    /// Reset any internal state
+    ///
+    /// This is useful for policies that maintain state (e.g., round-robin counters).
+    fn reset(&self) {
+        // Default: no-op for stateless policies
+    }
+
+    /// Get as Any for downcasting
+    fn as_any(&self) -> &dyn std::any::Any;
+}
+
+/// Configuration for cache-aware policy
+#[derive(Debug, Clone)]
+pub struct CacheAwareConfig {
+    pub cache_threshold: f32,
+    pub balance_abs_threshold: usize,
+    pub balance_rel_threshold: f32,
+    pub eviction_interval_secs: u64,
+    pub max_tree_size: usize,
+}
+
+impl Default for CacheAwareConfig {
+    fn default() -> Self {
+        Self {
+            cache_threshold: 0.5,
+            balance_abs_threshold: 32,
+            balance_rel_threshold: 1.1,
+            eviction_interval_secs: 30,
+            max_tree_size: 10000,
+        }
+    }
+}
+
+/// Helper function to filter healthy workers and return their indices
+pub(crate) fn get_healthy_worker_indices(workers: &[Box<dyn Worker>]) -> Vec<usize> {
+    workers
+        .iter()
+        .enumerate()
+        .filter(|(_, w)| w.is_healthy() && w.circuit_breaker().can_execute())
+        .map(|(idx, _)| idx)
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::{BasicWorker, WorkerType};
+
+    #[test]
+    fn test_get_healthy_worker_indices() {
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w3:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // All healthy initially
+        let indices = get_healthy_worker_indices(&workers);
+        assert_eq!(indices, vec![0, 1, 2]);
+
+        // Mark one unhealthy
+        workers[1].set_healthy(false);
+        let indices = get_healthy_worker_indices(&workers);
+        assert_eq!(indices, vec![0, 2]);
+    }
+}
diff --git a/sgl-router/src/policies/power_of_two.rs b/sgl-router/src/policies/power_of_two.rs
new file mode 100644
index 000000000..c10fc2949
--- /dev/null
+++ b/sgl-router/src/policies/power_of_two.rs
@@ -0,0 +1,201 @@
+//! Power-of-two choices load balancing policy
+
+use super::{get_healthy_worker_indices, LoadBalancingPolicy};
+use crate::core::Worker;
+use crate::metrics::RouterMetrics;
+use rand::Rng;
+use std::collections::HashMap;
+use std::sync::RwLock;
+use tracing::info;
+
+/// Power-of-two choices policy
+///
+/// Randomly selects two workers and routes to the one with lower load.
+/// This provides good load distribution with minimal coordination overhead.
+#[derive(Debug)]
+pub struct PowerOfTwoPolicy {
+    /// Cached load information from external monitoring
+    cached_loads: RwLock<HashMap<String, isize>>,
+}
+
+impl PowerOfTwoPolicy {
+    pub fn new() -> Self {
+        Self {
+            cached_loads: RwLock::new(HashMap::new()),
+        }
+    }
+
+    fn get_worker_load(&self, worker: &dyn Worker) -> isize {
+        // First check cached loads (from external monitoring)
+        if let Ok(loads) = self.cached_loads.read() {
+            if let Some(&load) = loads.get(worker.url()) {
+                return load;
+            }
+        }
+
+        // Fall back to local load counter
+        worker.load() as isize
+    }
+}
+
+impl LoadBalancingPolicy for PowerOfTwoPolicy {
+    fn select_worker(
+        &self,
+        workers: &[Box<dyn Worker>],
+        _request_text: Option<&str>,
+    ) -> Option<usize> {
+        let healthy_indices = get_healthy_worker_indices(workers);
+
+        if healthy_indices.is_empty() {
+            return None;
+        }
+
+        if healthy_indices.len() == 1 {
+            return Some(healthy_indices[0]);
+        }
+
+        // Select two random workers
+        let mut rng = rand::rng();
+        let idx1 = rng.random_range(0..healthy_indices.len());
+        let mut idx2 = rng.random_range(0..healthy_indices.len());
+
+        // Ensure we pick two different workers
+        while idx2 == idx1 {
+            idx2 = rng.random_range(0..healthy_indices.len());
+        }
+
+        let worker_idx1 = healthy_indices[idx1];
+        let worker_idx2 = healthy_indices[idx2];
+
+        // Compare loads and select the less loaded one
+        let load1 = self.get_worker_load(workers[worker_idx1].as_ref());
+        let load2 = self.get_worker_load(workers[worker_idx2].as_ref());
+
+        // Log selection for debugging
+        let selected_idx = if load1 <= load2 {
+            worker_idx1
+        } else {
+            worker_idx2
+        };
+
+        info!(
+            "Power-of-two selection: {}={} vs {}={} -> selected {}",
+            workers[worker_idx1].url(),
+            load1,
+            workers[worker_idx2].url(),
+            load2,
+            workers[selected_idx].url()
+        );
+
+        // Increment processed counter
+        workers[selected_idx].increment_processed();
+        RouterMetrics::record_processed_request(workers[selected_idx].url());
+        RouterMetrics::record_policy_decision(self.name(), workers[selected_idx].url());
+
+        Some(selected_idx)
+    }
+
+    fn name(&self) -> &'static str {
+        "power_of_two"
+    }
+
+    fn update_loads(&self, loads: &HashMap<String, isize>) {
+        if let Ok(mut cached) = self.cached_loads.write() {
+            *cached = loads.clone();
+        }
+    }
+
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+}
+
+impl Default for PowerOfTwoPolicy {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::{BasicWorker, WorkerType};
+
+    #[test]
+    fn test_power_of_two_selection() {
+        let policy = PowerOfTwoPolicy::new();
+        let worker1 = BasicWorker::new("http://w1:8000".to_string(), WorkerType::Regular);
+        let worker2 = BasicWorker::new("http://w2:8000".to_string(), WorkerType::Regular);
+        let worker3 = BasicWorker::new("http://w3:8000".to_string(), WorkerType::Regular);
+
+        // Set different loads
+        for _ in 0..10 {
+            worker1.increment_load();
+        }
+        for _ in 0..5 {
+            worker2.increment_load();
+        }
+        // worker3 has load 0
+
+        let workers: Vec<Box<dyn Worker>> =
+            vec![Box::new(worker1), Box::new(worker2), Box::new(worker3)];
+
+        // Run multiple selections
+        let mut selected_counts = [0; 3];
+        for _ in 0..100 {
+            if let Some(idx) = policy.select_worker(&workers, None) {
+                selected_counts[idx] += 1;
+            }
+        }
+
+        // Worker with lowest load (worker3) should be selected most often
+        assert!(selected_counts[2] > selected_counts[1]);
+        assert!(selected_counts[1] > selected_counts[0]);
+    }
+
+    #[test]
+    fn test_power_of_two_with_cached_loads() {
+        let policy = PowerOfTwoPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // Update cached loads
+        let mut loads = HashMap::new();
+        loads.insert("http://w1:8000".to_string(), 100);
+        loads.insert("http://w2:8000".to_string(), 10);
+        policy.update_loads(&loads);
+
+        // Should prefer worker2 with lower cached load
+        let mut w2_selected = 0;
+        for _ in 0..50 {
+            if let Some(idx) = policy.select_worker(&workers, None) {
+                if idx == 1 {
+                    w2_selected += 1;
+                }
+            }
+        }
+
+        // Worker2 should be selected significantly more often
+        assert!(w2_selected > 35); // Should win most of the time
+    }
+
+    #[test]
+    fn test_power_of_two_single_worker() {
+        let policy = PowerOfTwoPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![Box::new(BasicWorker::new(
+            "http://w1:8000".to_string(),
+            WorkerType::Regular,
+        ))];
+
+        // With single worker, should always select it
+        assert_eq!(policy.select_worker(&workers, None), Some(0));
+    }
+}
diff --git a/sgl-router/src/policies/random.rs b/sgl-router/src/policies/random.rs
new file mode 100644
index 000000000..4912d0dd2
--- /dev/null
+++ b/sgl-router/src/policies/random.rs
@@ -0,0 +1,121 @@
+//! Random load balancing policy
+
+use super::{get_healthy_worker_indices, LoadBalancingPolicy};
+use crate::core::Worker;
+use crate::metrics::RouterMetrics;
+use rand::Rng;
+
+/// Random selection policy
+///
+/// Selects workers randomly with uniform distribution among healthy workers.
+#[derive(Debug, Default)]
+pub struct RandomPolicy;
+
+impl RandomPolicy {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+impl LoadBalancingPolicy for RandomPolicy {
+    fn select_worker(
+        &self,
+        workers: &[Box<dyn Worker>],
+        _request_text: Option<&str>,
+    ) -> Option<usize> {
+        let healthy_indices = get_healthy_worker_indices(workers);
+
+        if healthy_indices.is_empty() {
+            return None;
+        }
+
+        let mut rng = rand::rng();
+        let random_idx = rng.random_range(0..healthy_indices.len());
+        let worker = workers[healthy_indices[random_idx]].url();
+
+        RouterMetrics::record_processed_request(worker);
+        RouterMetrics::record_policy_decision(self.name(), worker);
+        Some(healthy_indices[random_idx])
+    }
+
+    fn name(&self) -> &'static str {
+        "random"
+    }
+
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::{BasicWorker, WorkerType};
+    use std::collections::HashMap;
+
+    #[test]
+    fn test_random_selection() {
+        let policy = RandomPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w3:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // Test multiple selections to ensure randomness
+        let mut counts = HashMap::new();
+        for _ in 0..100 {
+            if let Some(idx) = policy.select_worker(&workers, None) {
+                *counts.entry(idx).or_insert(0) += 1;
+            }
+        }
+
+        // All workers should be selected at least once
+        assert_eq!(counts.len(), 3);
+        assert!(counts.values().all(|&count| count > 0));
+    }
+
+    #[test]
+    fn test_random_with_unhealthy_workers() {
+        let policy = RandomPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // Mark first worker as unhealthy
+        workers[0].set_healthy(false);
+
+        // Should always select the healthy worker (index 1)
+        for _ in 0..10 {
+            assert_eq!(policy.select_worker(&workers, None), Some(1));
+        }
+    }
+
+    #[test]
+    fn test_random_no_healthy_workers() {
+        let policy = RandomPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![Box::new(BasicWorker::new(
+            "http://w1:8000".to_string(),
+            WorkerType::Regular,
+        ))];
+
+        workers[0].set_healthy(false);
+        assert_eq!(policy.select_worker(&workers, None), None);
+    }
+}
diff --git a/sgl-router/src/policies/round_robin.rs b/sgl-router/src/policies/round_robin.rs
new file mode 100644
index 000000000..fcb60233f
--- /dev/null
+++ b/sgl-router/src/policies/round_robin.rs
@@ -0,0 +1,140 @@
+//! Round-robin load balancing policy
+
+use super::{get_healthy_worker_indices, LoadBalancingPolicy};
+use crate::core::Worker;
+use crate::metrics::RouterMetrics;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+/// Round-robin selection policy
+///
+/// Selects workers in sequential order, cycling through all healthy workers.
+#[derive(Debug, Default)]
+pub struct RoundRobinPolicy {
+    counter: AtomicUsize,
+}
+
+impl RoundRobinPolicy {
+    pub fn new() -> Self {
+        Self {
+            counter: AtomicUsize::new(0),
+        }
+    }
+}
+
+impl LoadBalancingPolicy for RoundRobinPolicy {
+    fn select_worker(
+        &self,
+        workers: &[Box<dyn Worker>],
+        _request_text: Option<&str>,
+    ) -> Option<usize> {
+        let healthy_indices = get_healthy_worker_indices(workers);
+
+        if healthy_indices.is_empty() {
+            return None;
+        }
+
+        // Get and increment counter atomically
+        let count = self.counter.fetch_add(1, Ordering::Relaxed);
+        let selected_idx = count % healthy_indices.len();
+        let worker = workers[healthy_indices[selected_idx]].url();
+
+        RouterMetrics::record_processed_request(worker);
+        RouterMetrics::record_policy_decision(self.name(), worker);
+        Some(healthy_indices[selected_idx])
+    }
+
+    fn name(&self) -> &'static str {
+        "round_robin"
+    }
+
+    fn reset(&self) {
+        self.counter.store(0, Ordering::Relaxed);
+    }
+
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::{BasicWorker, WorkerType};
+
+    #[test]
+    fn test_round_robin_selection() {
+        let policy = RoundRobinPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w3:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // Should select workers in order: 0, 1, 2, 0, 1, 2, ...
+        assert_eq!(policy.select_worker(&workers, None), Some(0));
+        assert_eq!(policy.select_worker(&workers, None), Some(1));
+        assert_eq!(policy.select_worker(&workers, None), Some(2));
+        assert_eq!(policy.select_worker(&workers, None), Some(0));
+        assert_eq!(policy.select_worker(&workers, None), Some(1));
+    }
+
+    #[test]
+    fn test_round_robin_with_unhealthy_workers() {
+        let policy = RoundRobinPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w3:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // Mark middle worker as unhealthy
+        workers[1].set_healthy(false);
+
+        // Should skip unhealthy worker: 0, 2, 0, 2, ...
+        assert_eq!(policy.select_worker(&workers, None), Some(0));
+        assert_eq!(policy.select_worker(&workers, None), Some(2));
+        assert_eq!(policy.select_worker(&workers, None), Some(0));
+        assert_eq!(policy.select_worker(&workers, None), Some(2));
+    }
+
+    #[test]
+    fn test_round_robin_reset() {
+        let policy = RoundRobinPolicy::new();
+        let workers: Vec<Box<dyn Worker>> = vec![
+            Box::new(BasicWorker::new(
+                "http://w1:8000".to_string(),
+                WorkerType::Regular,
+            )),
+            Box::new(BasicWorker::new(
+                "http://w2:8000".to_string(),
+                WorkerType::Regular,
+            )),
+        ];
+
+        // Advance the counter
+        assert_eq!(policy.select_worker(&workers, None), Some(0));
+        assert_eq!(policy.select_worker(&workers, None), Some(1));
+
+        // Reset should start from beginning
+        policy.reset();
+        assert_eq!(policy.select_worker(&workers, None), Some(0));
+    }
+}
diff --git a/sgl-router/src/proto/sglang_scheduler.proto b/sgl-router/src/proto/sglang_scheduler.proto
new file mode 100644
index 000000000..1ea2855a4
--- /dev/null
+++ b/sgl-router/src/proto/sglang_scheduler.proto
@@ -0,0 +1,541 @@
+syntax = "proto3";
+
+package sglang.grpc.scheduler;
+
+import "google/protobuf/timestamp.proto";
+import "google/protobuf/struct.proto";
+
+// Service definition for SGLang scheduler communication
+// This protocol bridges the Rust router and Python scheduler
+service SglangScheduler {
+  // Initialize connection and get model info
+  rpc Initialize(InitializeRequest) returns (InitializeResponse);
+
+  // Submit a generation request (supports streaming)
+  rpc Generate(GenerateRequest) returns (stream GenerateResponse);
+
+  // Submit an embedding request
+  rpc Embed(EmbedRequest) returns (EmbedResponse);
+
+  // Health check and metrics
+  rpc HealthCheck(HealthCheckRequest) returns (HealthCheckResponse);
+
+  // Abort a running request
+  rpc Abort(AbortRequest) returns (AbortResponse);
+
+  // Flush KV cache
+  rpc FlushCache(FlushCacheRequest) returns (FlushCacheResponse);
+}
+
+// =====================
+// Common Types
+// =====================
+
+// Sampling parameters matching SGLang's SamplingParams
+message SamplingParams {
+  float temperature = 1;
+  float top_p = 2;
+  int32 top_k = 3;
+  float min_p = 4;
+  float frequency_penalty = 5;
+  float presence_penalty = 6;
+  float repetition_penalty = 7;
+
+  int32 max_new_tokens = 8;
+  repeated string stop = 9;
+  repeated int32 stop_token_ids = 10;
+  bool skip_special_tokens = 11;
+  bool spaces_between_special_tokens = 12;
+
+  // Structured generation
+  oneof constraint {
+    string regex = 13;
+    string json_schema = 14;
+    string ebnf_grammar = 15;
+  }
+
+  // LoRA adapter
+  string lora_path = 16;
+
+  // Speculative decoding
+  int32 n = 17;  // Number of samples
+
+  // Token healing
+  bool token_healing = 18;
+
+  // Additional parameters
+  int32 min_new_tokens = 19;
+  bool ignore_eos = 20;
+  bool no_stop_trim = 21;
+  int32 stream_interval = 22;
+  map<string, float> logit_bias = 23;
+  string structural_tag = 24;
+
+  // Custom parameters for extensibility
+  google.protobuf.Struct custom_params = 25;
+}
+
+// Session parameters for continual prompting
+message SessionParams {
+  string session_id = 1;
+  string request_id = 2;
+  int32 offset = 3;
+  bool replace = 4;
+  bool drop_previous_output = 5;
+}
+
+// Disaggregated serving parameters
+message DisaggregatedParams {
+  string bootstrap_host = 1;
+  int32 bootstrap_port = 2;
+  int32 bootstrap_room = 3;
+}
+
+// =====================
+// Initialize
+// =====================
+
+message InitializeRequest {
+  string client_id = 1;
+  string client_version = 2;
+
+  // Operating mode
+  enum Mode {
+    REGULAR = 0;      // Normal mode with local scheduler
+    PREFILL = 1;      // Prefill-only mode for disaggregated serving
+    DECODE = 2;       // Decode-only mode for disaggregated serving
+  }
+  Mode mode = 3;
+}
+
+message InitializeResponse {
+  bool success = 1;
+  string scheduler_version = 2;
+
+  // Model information
+  ModelInfo model_info = 3;
+
+  // Server capabilities
+  ServerCapabilities capabilities = 4;
+
+  // Error message if success is false
+  string error_message = 5;
+}
+
+message ModelInfo {
+  string model_name = 1;
+  int32 max_context_length = 2;
+  int32 vocab_size = 3;
+  bool supports_tool_calling = 4;
+  bool supports_vision = 5;
+  repeated string special_tokens = 6;
+
+  // Additional model metadata
+  string model_type = 7;
+  int32 num_layers = 8;
+  int32 hidden_size = 9;
+  int32 num_attention_heads = 10;
+  int32 num_key_value_heads = 11;
+
+  // Tokenizer info
+  string tokenizer_type = 12;
+  repeated int32 eos_token_ids = 13;
+  int32 pad_token_id = 14;
+  int32 bos_token_id = 15;
+}
+
+message ServerCapabilities {
+  bool continuous_batching = 1;
+  bool disaggregated_serving = 2;
+  bool speculative_decoding = 3;
+  int32 max_batch_size = 4;
+  int32 max_num_batched_tokens = 5;
+  int32 max_prefill_tokens = 6;
+  string attention_backend = 7;  // "flashinfer", "triton", "torch"
+
+  // Additional capabilities
+  bool supports_lora = 8;
+  bool supports_grammar = 9;
+  bool supports_multimodal = 10;
+  repeated string supported_modalities = 11;  // ["image", "video", "audio"]
+  bool supports_custom_logit_processor = 12;
+  bool supports_session = 13;
+
+  // Hardware info
+  int32 num_gpus = 14;
+  string gpu_type = 15;
+  int64 total_gpu_memory = 16;
+
+  // Parallelism info
+  int32 tensor_parallel_size = 17;
+  int32 pipeline_parallel_size = 18;
+  int32 data_parallel_size = 19;
+}
+
+// =====================
+// Generate Request
+// =====================
+
+message GenerateRequest {
+  string request_id = 1;
+
+  // Input can be either text or tokenized
+  oneof input {
+    string text = 2;
+    TokenizedInput tokenized = 3;
+  }
+
+  // Multimodal inputs
+  MultimodalInputs mm_inputs = 4;
+
+  // Generation parameters
+  SamplingParams sampling_params = 5;
+
+  // Return options
+  bool return_logprob = 6;
+  int32 logprob_start_len = 7;
+  int32 top_logprobs_num = 8;
+  repeated int32 token_ids_logprob = 9;
+  bool return_hidden_states = 10;
+
+  // Session management
+  SessionParams session_params = 11;
+
+  // For disaggregated serving
+  DisaggregatedParams disaggregated_params = 12;
+
+  // Custom logit processor (serialized)
+  string custom_logit_processor = 13;
+
+  // Request metadata
+  google.protobuf.Timestamp timestamp = 14;
+  bool log_metrics = 15;
+
+  // Input embeddings (alternative to text/tokens)
+  repeated float input_embeds = 16;
+
+  // LoRA adapter ID (if pre-loaded)
+  string lora_id = 17;
+
+  // Data parallel routing
+  int32 data_parallel_rank = 18;
+
+  // For load balancing
+  int32 dp_balance_id = 19;
+}
+
+message TokenizedInput {
+  string original_text = 1;  // For reference
+  repeated int32 input_ids = 2;
+}
+
+message MultimodalInputs {
+  // Simplified multimodal handling - actual data processed by tokenizer
+  repeated string image_urls = 1;
+  repeated string video_urls = 2;
+  repeated string audio_urls = 3;
+
+  // Pre-processed multimodal features (if available)
+  google.protobuf.Struct processed_features = 4;
+
+  // Raw data for direct processing
+  repeated bytes image_data = 5;
+  repeated bytes video_data = 6;
+  repeated bytes audio_data = 7;
+
+  // Modality metadata
+  repeated string modalities = 8;
+}
+
+// =====================
+// Generate Response
+// =====================
+
+message GenerateResponse {
+  string request_id = 1;
+
+  // Response type
+  oneof response {
+    GenerateStreamChunk chunk = 2;
+    GenerateComplete complete = 3;
+    GenerateError error = 4;
+  }
+}
+
+message GenerateStreamChunk {
+  // Generated token
+  int32 token_id = 1;
+  string text = 2;
+
+  // Cumulative counts
+  int32 prompt_tokens = 3;
+  int32 completion_tokens = 4;
+  int32 cached_tokens = 5;
+
+  // Logprobs (if requested)
+  LogProbs logprobs = 6;
+
+  // Hidden states (if requested)
+  repeated float hidden_states = 7;
+
+  // Metadata
+  float generation_time = 8;  // Time to generate this token
+  int32 queue_time = 9;       // Time spent in queue
+}
+
+message GenerateComplete {
+  // Final output
+  repeated int32 output_ids = 1;
+  string output_text = 2;
+
+  // Finish reason
+  enum FinishReason {
+    // The model generated a stop sequence.
+    STOP = 0;
+    // The model reached the maximum generation length.
+    LENGTH = 1;
+    // The model generated an end-of-sequence (EOS) token.
+    EOS_TOKEN = 2;
+    // The model generated a user-provided stop string.
+    STOP_STR = 3;
+    // The request was aborted by the user or system.
+    ABORT = 4;
+  }
+  FinishReason finish_reason = 3;
+
+  // Final counts
+  int32 prompt_tokens = 4;
+  int32 completion_tokens = 5;
+  int32 cached_tokens = 6;
+
+  // Performance metrics
+  float total_generation_time = 7;
+  float time_to_first_token = 8;
+  float tokens_per_second = 9;
+
+  // Spec decode metrics
+  int32 spec_verify_count = 10;
+
+  // All logprobs if requested
+  repeated LogProbs all_logprobs = 11;
+
+  // All hidden states if requested
+  repeated HiddenStates all_hidden_states = 12;
+}
+
+message GenerateError {
+  string message = 1;
+  string http_status_code = 2;
+  string details = 3;
+}
+
+message LogProbs {
+  repeated float token_logprobs = 1;
+  repeated int32 token_ids = 2;
+
+  // Top logprobs at each position
+  repeated TopLogProbs top_logprobs = 3;
+
+  // Decoded text for tokens
+  repeated string token_texts = 4;
+}
+
+message TopLogProbs {
+  repeated float values = 1;
+  repeated int32 token_ids = 2;
+  repeated string token_texts = 3;
+}
+
+message HiddenStates {
+  repeated float values = 1;
+  int32 layer = 2;
+  int32 position = 3;
+}
+
+// =====================
+// Embedding Request
+// =====================
+
+message EmbedRequest {
+  string request_id = 1;
+
+  oneof input {
+    string text = 2;
+    TokenizedInput tokenized = 3;
+  }
+
+  // Multimodal inputs
+  MultimodalInputs mm_inputs = 4;
+
+  // Dummy sampling params for compatibility
+  // EmbedRequest doesn't use sampling_params
+  SamplingParams sampling_params = 5;
+
+  bool log_metrics = 6;
+
+  // Token type IDs for models that require them
+  repeated int32 token_type_ids = 7;
+
+  // Data parallel routing
+  int32 data_parallel_rank = 8;
+
+  // For cross-encoder requests
+  bool is_cross_encoder = 9;
+  repeated string texts = 10;  // For cross-encoder batch
+}
+
+message EmbedResponse {
+  string request_id = 1;
+
+  oneof response {
+    EmbedComplete complete = 2;
+    EmbedError error = 3;
+  }
+}
+
+message EmbedComplete {
+  repeated float embedding = 1;
+  int32 prompt_tokens = 2;
+  int32 cached_tokens = 3;
+
+  // Additional metadata
+  int32 embedding_dim = 4;
+  float generation_time = 5;
+
+  // For batch embeddings
+  repeated Embedding batch_embeddings = 6;
+}
+
+message Embedding {
+  repeated float values = 1;
+  int32 index = 2;
+}
+
+message EmbedError {
+  string message = 1;
+  string code = 2;
+  string details = 3;
+}
+
+// =====================
+// Management Operations
+// =====================
+
+message HealthCheckRequest {
+  bool include_detailed_metrics = 1;
+}
+
+message HealthCheckResponse {
+  bool healthy = 1;
+
+  // Current load metrics
+  int32 num_requests_running = 2;
+  int32 num_requests_waiting = 3;
+  float gpu_cache_usage = 4;
+  float gpu_memory_usage = 5;
+
+  // KV cache metrics
+  int32 kv_cache_total_blocks = 6;
+  int32 kv_cache_used_blocks = 7;
+  float kv_cache_hit_rate = 8;
+
+  // Additional metrics
+  int32 num_grammar_queue_requests = 9;
+  float generation_throughput = 10;  // tokens/sec
+  float average_queue_time = 11;     // seconds
+  float average_generation_time = 12; // seconds
+
+  // System metrics
+  float cpu_usage = 13;
+  int64 memory_usage = 14;
+
+  // Disaggregation metrics
+  int32 num_prefill_requests = 15;
+  int32 num_decode_requests = 16;
+
+  // Detailed metrics (optional)
+  google.protobuf.Struct detailed_metrics = 17;
+}
+
+message AbortRequest {
+  string request_id = 1;
+  string reason = 2;
+}
+
+message AbortResponse {
+  bool success = 1;
+  string message = 2;
+}
+
+message FlushCacheRequest {
+  bool flush_all = 1;
+  repeated string session_ids = 2;  // Flush specific sessions
+}
+
+message FlushCacheResponse {
+  bool success = 1;
+  int32 num_entries_flushed = 2;
+  int64 memory_freed = 3;  // bytes
+  string message = 4;
+}
+
+// =====================
+// Additional Operations (Future)
+// =====================
+
+// Load LoRA adapter
+message LoadLoRARequest {
+  string adapter_id = 1;
+  string adapter_path = 2;
+  int32 rank = 3;
+}
+
+message LoadLoRAResponse {
+  bool success = 1;
+  string adapter_id = 2;
+  string message = 3;
+}
+
+// Unload LoRA adapter
+message UnloadLoRARequest {
+  string adapter_id = 1;
+}
+
+message UnloadLoRAResponse {
+  bool success = 1;
+  string message = 2;
+}
+
+// Update weights
+message UpdateWeightsRequest {
+  oneof source {
+    string disk_path = 1;
+    bytes tensor_data = 2;
+    string remote_url = 3;
+  }
+  string weight_name = 4;
+}
+
+message UpdateWeightsResponse {
+  bool success = 1;
+  string message = 2;
+}
+
+// Get internal state for debugging
+message GetInternalStateRequest {
+  repeated string state_keys = 1;
+}
+
+message GetInternalStateResponse {
+  google.protobuf.Struct state = 1;
+}
+
+// Set internal state for testing
+message SetInternalStateRequest {
+  google.protobuf.Struct state = 1;
+}
+
+message SetInternalStateResponse {
+  bool success = 1;
+  string message = 2;
+}
diff --git a/sgl-router/src/protocols/mod.rs b/sgl-router/src/protocols/mod.rs
new file mode 100644
index 000000000..5243c645f
--- /dev/null
+++ b/sgl-router/src/protocols/mod.rs
@@ -0,0 +1,5 @@
+// Protocol definitions and validation for various LLM APIs
+// This module provides a structured approach to handling different API protocols
+
+pub mod spec;
+pub mod validation;
diff --git a/sgl-router/src/protocols/spec.rs b/sgl-router/src/protocols/spec.rs
new file mode 100644
index 000000000..43e60244c
--- /dev/null
+++ b/sgl-router/src/protocols/spec.rs
@@ -0,0 +1,2681 @@
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use std::collections::HashMap;
+
+// # Protocol Specifications
+//
+// This module contains all protocol definitions for OpenAI and SGLang APIs.
+//
+// ## Table of Contents
+//
+// 1. **OPENAI SPEC - Chat Completions API**
+//    - Message Types
+//    - Response Format Types
+//    - Tool/Function Types
+//    - Streaming Delta Types
+//    - Request/Response structures
+//
+// 2. **OPENAI SPEC - Completions API**
+//    - Request/Response structures
+//    - Streaming support
+//
+// 3. **OPENAI SPEC - Responses API**
+//    - Tool Definitions
+//    - Reasoning Configuration
+//    - Input/Output Items
+//    - Service Tier & Tool Choice
+//    - Request/Response structures
+//
+// 4. **OPENAI SPEC - Common**
+//    - Shared Request Components
+//    - Tool Choice Types
+//    - Usage Tracking
+//    - Logprobs Types
+//    - Error Response Types
+//
+// 5. **SGLANG SPEC - GENERATE API**
+//    - Generate Parameters
+//    - Sampling Parameters
+//    - Request/Response structures
+//
+// 6. **SGLANG SPEC - RERANK API**
+//    - Request/Response structures
+//
+// 7. **COMMON**
+//    - GenerationRequest trait
+//    - StringOrArray & LoRAPath types
+//    - Helper functions
+
+// ==================================================================
+// =            OPENAI SPEC - Chat Completions API                  =
+// ==================================================================
+
+// ============= Message Types =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum ChatMessage {
+    System {
+        role: String,
+        content: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        name: Option<String>,
+    },
+    User {
+        role: String, // "user"
+        content: UserMessageContent,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        name: Option<String>,
+    },
+    Assistant {
+        role: String, // "assistant"
+        #[serde(skip_serializing_if = "Option::is_none")]
+        content: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        name: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        tool_calls: Option<Vec<ToolCall>>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        function_call: Option<FunctionCallResponse>,
+        /// Reasoning content for O1-style models (SGLang extension)
+        #[serde(skip_serializing_if = "Option::is_none")]
+        reasoning_content: Option<String>,
+    },
+    Tool {
+        role: String, // "tool"
+        content: String,
+        tool_call_id: String,
+    },
+    Function {
+        role: String, // "function"
+        content: String,
+        name: String,
+    },
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum UserMessageContent {
+    Text(String),
+    Parts(Vec<ContentPart>),
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "type")]
+pub enum ContentPart {
+    #[serde(rename = "text")]
+    Text { text: String },
+    #[serde(rename = "image_url")]
+    ImageUrl { image_url: ImageUrl },
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ImageUrl {
+    pub url: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub detail: Option<String>, // "auto", "low", or "high"
+}
+
+// ============= Response Format Types =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "type")]
+pub enum ResponseFormat {
+    #[serde(rename = "text")]
+    Text,
+    #[serde(rename = "json_object")]
+    JsonObject,
+    #[serde(rename = "json_schema")]
+    JsonSchema { json_schema: JsonSchemaFormat },
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct JsonSchemaFormat {
+    pub name: String,
+    pub schema: Value,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub strict: Option<bool>,
+}
+
+// ============= Streaming Delta Types =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatMessageDelta {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub role: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_calls: Option<Vec<ToolCallDelta>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub function_call: Option<FunctionCallDelta>,
+    /// Reasoning content delta for O1-style models (SGLang extension)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub reasoning_content: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ToolCallDelta {
+    pub index: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub id: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(rename = "type")]
+    pub tool_type: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub function: Option<FunctionCallDelta>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct FunctionCallDelta {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub name: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub arguments: Option<String>,
+}
+
+// ============= Request =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatCompletionRequest {
+    /// ID of the model to use
+    pub model: String,
+
+    /// A list of messages comprising the conversation so far
+    pub messages: Vec<ChatMessage>,
+
+    /// What sampling temperature to use, between 0 and 2
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+
+    /// An alternative to sampling with temperature
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f32>,
+
+    /// How many chat completion choices to generate for each input message
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub n: Option<u32>,
+
+    /// If set, partial message deltas will be sent
+    #[serde(default)]
+    pub stream: bool,
+
+    /// Options for streaming response
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stream_options: Option<StreamOptions>,
+
+    /// Up to 4 sequences where the API will stop generating further tokens
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop: Option<StringOrArray>,
+
+    /// The maximum number of tokens to generate
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_tokens: Option<u32>,
+
+    /// An upper bound for the number of tokens that can be generated for a completion
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_completion_tokens: Option<u32>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub presence_penalty: Option<f32>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub frequency_penalty: Option<f32>,
+
+    /// Modify the likelihood of specified tokens appearing in the completion
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logit_bias: Option<HashMap<String, f32>>,
+
+    /// A unique identifier representing your end-user
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub user: Option<String>,
+
+    /// If specified, our system will make a best effort to sample deterministically
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub seed: Option<i64>,
+
+    /// Whether to return log probabilities of the output tokens
+    #[serde(default)]
+    pub logprobs: bool,
+
+    /// An integer between 0 and 20 specifying the number of most likely tokens to return
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_logprobs: Option<u32>,
+
+    /// An object specifying the format that the model must output
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub response_format: Option<ResponseFormat>,
+
+    /// A list of tools the model may call
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<Tool>>,
+
+    /// Controls which (if any) tool is called by the model
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_choice: Option<ToolChoice>,
+
+    /// Whether to enable parallel function calling during tool use
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub parallel_tool_calls: Option<bool>,
+
+    /// Deprecated: use tools instead
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub functions: Option<Vec<Function>>,
+
+    /// Deprecated: use tool_choice instead
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub function_call: Option<FunctionCall>,
+
+    // ============= SGLang Extensions =============
+    /// Top-k sampling parameter (-1 to disable)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_k: Option<i32>,
+
+    /// Min-p nucleus sampling parameter
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub min_p: Option<f32>,
+
+    /// Minimum number of tokens to generate
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub min_tokens: Option<u32>,
+
+    /// Repetition penalty for reducing repetitive text
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub repetition_penalty: Option<f32>,
+
+    /// Regex constraint for output generation
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub regex: Option<String>,
+
+    /// EBNF grammar constraint for structured output
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ebnf: Option<String>,
+
+    /// Specific token IDs to use as stop conditions
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop_token_ids: Option<Vec<i32>>,
+
+    /// Skip trimming stop tokens from output
+    #[serde(default)]
+    pub no_stop_trim: bool,
+
+    /// Ignore end-of-sequence tokens during generation
+    #[serde(default)]
+    pub ignore_eos: bool,
+
+    /// Continue generating from final assistant message
+    #[serde(default)]
+    pub continue_final_message: bool,
+
+    /// Skip special tokens during detokenization
+    #[serde(default = "default_true")]
+    pub skip_special_tokens: bool,
+
+    // ============= SGLang Extensions =============
+    /// Path to LoRA adapter(s) for model customization
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub lora_path: Option<LoRAPath>,
+
+    /// Session parameters for continual prompting
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub session_params: Option<HashMap<String, serde_json::Value>>,
+
+    /// Separate reasoning content from final answer (O1-style models)
+    #[serde(default = "default_true")]
+    pub separate_reasoning: bool,
+
+    /// Stream reasoning tokens during generation
+    #[serde(default = "default_true")]
+    pub stream_reasoning: bool,
+
+    /// Chat template kwargs
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub chat_template_kwargs: Option<HashMap<String, serde_json::Value>>,
+
+    /// Return model hidden states
+    #[serde(default)]
+    pub return_hidden_states: bool,
+}
+
+impl GenerationRequest for ChatCompletionRequest {
+    fn is_stream(&self) -> bool {
+        self.stream
+    }
+
+    fn get_model(&self) -> Option<&str> {
+        Some(&self.model)
+    }
+
+    fn extract_text_for_routing(&self) -> String {
+        // Extract text from messages for routing decisions
+        self.messages
+            .iter()
+            .filter_map(|msg| match msg {
+                ChatMessage::System { content, .. } => Some(content.clone()),
+                ChatMessage::User { content, .. } => match content {
+                    UserMessageContent::Text(text) => Some(text.clone()),
+                    UserMessageContent::Parts(parts) => {
+                        let texts: Vec<String> = parts
+                            .iter()
+                            .filter_map(|part| match part {
+                                ContentPart::Text { text } => Some(text.clone()),
+                                _ => None,
+                            })
+                            .collect();
+                        Some(texts.join(" "))
+                    }
+                },
+                ChatMessage::Assistant {
+                    content,
+                    reasoning_content,
+                    ..
+                } => {
+                    // Combine content and reasoning content for routing decisions
+                    let main_content = content.clone().unwrap_or_default();
+                    let reasoning = reasoning_content.clone().unwrap_or_default();
+                    if main_content.is_empty() && reasoning.is_empty() {
+                        None
+                    } else {
+                        Some(format!("{} {}", main_content, reasoning).trim().to_string())
+                    }
+                }
+                ChatMessage::Tool { content, .. } => Some(content.clone()),
+                ChatMessage::Function { content, .. } => Some(content.clone()),
+            })
+            .collect::<Vec<String>>()
+            .join(" ")
+    }
+}
+
+// ============= Regular Response =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatCompletionResponse {
+    pub id: String,
+    pub object: String, // "chat.completion"
+    pub created: u64,
+    pub model: String,
+    pub choices: Vec<ChatChoice>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub usage: Option<Usage>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub system_fingerprint: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatChoice {
+    pub index: u32,
+    pub message: ChatMessage,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logprobs: Option<ChatLogProbs>,
+    pub finish_reason: Option<String>, // "stop", "length", "tool_calls", "content_filter", "function_call"
+    /// Information about which stop condition was matched
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub matched_stop: Option<serde_json::Value>, // Can be string or integer
+    /// Hidden states from the model (SGLang extension)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub hidden_states: Option<Vec<f32>>,
+}
+
+// ============= Streaming Response =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatCompletionStreamResponse {
+    pub id: String,
+    pub object: String, // "chat.completion.chunk"
+    pub created: u64,
+    pub model: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub system_fingerprint: Option<String>,
+    pub choices: Vec<ChatStreamChoice>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub usage: Option<Usage>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatStreamChoice {
+    pub index: u32,
+    pub delta: ChatMessageDelta,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logprobs: Option<ChatLogProbs>,
+    pub finish_reason: Option<String>,
+}
+
+// ==================================================================
+// =            OPENAI SPEC - Completions API                       =
+// ==================================================================
+// Completions API request types (v1/completions) - DEPRECATED but still supported
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CompletionRequest {
+    /// ID of the model to use (required for OpenAI, optional for some implementations, such as SGLang)
+    pub model: String,
+
+    /// The prompt(s) to generate completions for
+    pub prompt: StringOrArray,
+
+    /// The suffix that comes after a completion of inserted text
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub suffix: Option<String>,
+
+    /// The maximum number of tokens to generate
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_tokens: Option<u32>,
+
+    /// What sampling temperature to use, between 0 and 2
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+
+    /// An alternative to sampling with temperature (nucleus sampling)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f32>,
+
+    /// How many completions to generate for each prompt
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub n: Option<u32>,
+
+    /// Whether to stream back partial progress
+    #[serde(default)]
+    pub stream: bool,
+
+    /// Options for streaming response
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stream_options: Option<StreamOptions>,
+
+    /// Include the log probabilities on the logprobs most likely tokens
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logprobs: Option<u32>,
+
+    /// Echo back the prompt in addition to the completion
+    #[serde(default)]
+    pub echo: bool,
+
+    /// Up to 4 sequences where the API will stop generating further tokens
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop: Option<StringOrArray>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub presence_penalty: Option<f32>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub frequency_penalty: Option<f32>,
+
+    /// Generates best_of completions server-side and returns the "best"
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub best_of: Option<u32>,
+
+    /// Modify the likelihood of specified tokens appearing in the completion
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logit_bias: Option<HashMap<String, f32>>,
+
+    /// A unique identifier representing your end-user
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub user: Option<String>,
+
+    /// If specified, our system will make a best effort to sample deterministically
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub seed: Option<i64>,
+
+    // ============= SGLang Extensions =============
+    /// Top-k sampling parameter (-1 to disable)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_k: Option<i32>,
+
+    /// Min-p nucleus sampling parameter
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub min_p: Option<f32>,
+
+    /// Minimum number of tokens to generate
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub min_tokens: Option<u32>,
+
+    /// Repetition penalty for reducing repetitive text
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub repetition_penalty: Option<f32>,
+
+    /// Regex constraint for output generation
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub regex: Option<String>,
+
+    /// EBNF grammar constraint for structured output
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ebnf: Option<String>,
+
+    /// JSON schema constraint for structured output
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub json_schema: Option<String>,
+
+    /// Specific token IDs to use as stop conditions
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop_token_ids: Option<Vec<i32>>,
+
+    /// Skip trimming stop tokens from output
+    #[serde(default)]
+    pub no_stop_trim: bool,
+
+    /// Ignore end-of-sequence tokens during generation
+    #[serde(default)]
+    pub ignore_eos: bool,
+
+    /// Skip special tokens during detokenization
+    #[serde(default = "default_true")]
+    pub skip_special_tokens: bool,
+
+    // ============= SGLang Extensions =============
+    /// Path to LoRA adapter(s) for model customization
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub lora_path: Option<LoRAPath>,
+
+    /// Session parameters for continual prompting
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub session_params: Option<HashMap<String, serde_json::Value>>,
+
+    /// Return model hidden states
+    #[serde(default)]
+    pub return_hidden_states: bool,
+
+    /// Additional fields including bootstrap info for PD routing
+    #[serde(flatten)]
+    pub other: serde_json::Map<String, serde_json::Value>,
+}
+
+impl GenerationRequest for CompletionRequest {
+    fn is_stream(&self) -> bool {
+        self.stream
+    }
+
+    fn get_model(&self) -> Option<&str> {
+        Some(&self.model)
+    }
+
+    fn extract_text_for_routing(&self) -> String {
+        match &self.prompt {
+            StringOrArray::String(s) => s.clone(),
+            StringOrArray::Array(v) => v.join(" "),
+        }
+    }
+}
+
+// ============= Regular Response =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CompletionResponse {
+    pub id: String,
+    pub object: String, // "text_completion"
+    pub created: u64,
+    pub model: String,
+    pub choices: Vec<CompletionChoice>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub usage: Option<Usage>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub system_fingerprint: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CompletionChoice {
+    pub text: String,
+    pub index: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logprobs: Option<LogProbs>,
+    pub finish_reason: Option<String>, // "stop", "length", "content_filter", etc.
+    /// Information about which stop condition was matched
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub matched_stop: Option<serde_json::Value>, // Can be string or integer
+    /// Hidden states from the model (SGLang extension)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub hidden_states: Option<Vec<f32>>,
+}
+
+// ============= Streaming Response =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CompletionStreamResponse {
+    pub id: String,
+    pub object: String, // "text_completion"
+    pub created: u64,
+    pub choices: Vec<CompletionStreamChoice>,
+    pub model: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub system_fingerprint: Option<String>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CompletionStreamChoice {
+    pub text: String,
+    pub index: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub logprobs: Option<LogProbs>,
+    pub finish_reason: Option<String>,
+}
+
+// ==================================================================
+// =            OPENAI SPEC - Responses API                         =
+// ==================================================================
+
+// ============= Tool Definitions =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ResponseTool {
+    #[serde(rename = "type")]
+    pub r#type: ResponseToolType,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ResponseToolType {
+    WebSearchPreview,
+    CodeInterpreter,
+}
+
+// ============= Reasoning Configuration =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ResponseReasoningParam {
+    #[serde(default = "default_reasoning_effort")]
+    pub effort: Option<ReasoningEffort>,
+}
+
+fn default_reasoning_effort() -> Option<ReasoningEffort> {
+    Some(ReasoningEffort::Medium)
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ReasoningEffort {
+    Low,
+    Medium,
+    High,
+}
+
+// ============= Input/Output Items =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "type")]
+#[serde(rename_all = "snake_case")]
+pub enum ResponseInputOutputItem {
+    #[serde(rename = "message")]
+    Message {
+        id: String,
+        role: String,
+        content: Vec<ResponseContentPart>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        status: Option<String>,
+    },
+    #[serde(rename = "reasoning")]
+    Reasoning {
+        id: String,
+        #[serde(skip_serializing_if = "Vec::is_empty")]
+        summary: Vec<String>,
+        content: Vec<ResponseReasoningContent>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        status: Option<String>,
+    },
+    #[serde(rename = "function_tool_call")]
+    FunctionToolCall {
+        id: String,
+        name: String,
+        arguments: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        output: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        status: Option<String>,
+    },
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "type")]
+#[serde(rename_all = "snake_case")]
+pub enum ResponseContentPart {
+    #[serde(rename = "output_text")]
+    OutputText {
+        text: String,
+        #[serde(skip_serializing_if = "Vec::is_empty")]
+        annotations: Vec<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        logprobs: Option<ChatLogProbs>,
+    },
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "type")]
+#[serde(rename_all = "snake_case")]
+pub enum ResponseReasoningContent {
+    #[serde(rename = "reasoning_text")]
+    ReasoningText { text: String },
+}
+
+// ============= Output Items for Response =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(tag = "type")]
+#[serde(rename_all = "snake_case")]
+pub enum ResponseOutputItem {
+    #[serde(rename = "message")]
+    Message {
+        id: String,
+        role: String,
+        content: Vec<ResponseContentPart>,
+        status: String,
+    },
+    #[serde(rename = "reasoning")]
+    Reasoning {
+        id: String,
+        #[serde(skip_serializing_if = "Vec::is_empty")]
+        summary: Vec<String>,
+        content: Vec<ResponseReasoningContent>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        status: Option<String>,
+    },
+    #[serde(rename = "function_tool_call")]
+    FunctionToolCall {
+        id: String,
+        name: String,
+        arguments: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        output: Option<String>,
+        status: String,
+    },
+}
+
+// ============= Service Tier =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ServiceTier {
+    Auto,
+    Default,
+    Flex,
+    Scale,
+    Priority,
+}
+
+impl Default for ServiceTier {
+    fn default() -> Self {
+        Self::Auto
+    }
+}
+
+// ============= Truncation =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum Truncation {
+    Auto,
+    Disabled,
+}
+
+impl Default for Truncation {
+    fn default() -> Self {
+        Self::Disabled
+    }
+}
+
+// ============= Response Status =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ResponseStatus {
+    Queued,
+    InProgress,
+    Completed,
+    Failed,
+    Cancelled,
+}
+
+// ============= Include Fields =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum IncludeField {
+    #[serde(rename = "code_interpreter_call.outputs")]
+    CodeInterpreterCallOutputs,
+    #[serde(rename = "computer_call_output.output.image_url")]
+    ComputerCallOutputImageUrl,
+    #[serde(rename = "file_search_call.results")]
+    FileSearchCallResults,
+    #[serde(rename = "message.input_image.image_url")]
+    MessageInputImageUrl,
+    #[serde(rename = "message.output_text.logprobs")]
+    MessageOutputTextLogprobs,
+    #[serde(rename = "reasoning.encrypted_content")]
+    ReasoningEncryptedContent,
+}
+
+// ============= Usage Info =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct UsageInfo {
+    pub prompt_tokens: u32,
+    pub completion_tokens: u32,
+    pub total_tokens: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub reasoning_tokens: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub prompt_tokens_details: Option<PromptTokenUsageInfo>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct PromptTokenUsageInfo {
+    pub cached_tokens: u32,
+}
+
+// ============= Response Usage Format =============
+
+/// OpenAI Responses API usage format (different from standard UsageInfo)
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ResponseUsage {
+    pub input_tokens: u32,
+    pub output_tokens: u32,
+    pub total_tokens: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input_tokens_details: Option<InputTokensDetails>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub output_tokens_details: Option<OutputTokensDetails>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct InputTokensDetails {
+    pub cached_tokens: u32,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct OutputTokensDetails {
+    pub reasoning_tokens: u32,
+}
+
+impl UsageInfo {
+    /// Convert to OpenAI Responses API format
+    pub fn to_response_usage(&self) -> ResponseUsage {
+        ResponseUsage {
+            input_tokens: self.prompt_tokens,
+            output_tokens: self.completion_tokens,
+            total_tokens: self.total_tokens,
+            input_tokens_details: self.prompt_tokens_details.as_ref().map(|details| {
+                InputTokensDetails {
+                    cached_tokens: details.cached_tokens,
+                }
+            }),
+            output_tokens_details: self.reasoning_tokens.map(|tokens| OutputTokensDetails {
+                reasoning_tokens: tokens,
+            }),
+        }
+    }
+}
+
+impl From<UsageInfo> for ResponseUsage {
+    fn from(usage: UsageInfo) -> Self {
+        usage.to_response_usage()
+    }
+}
+
+impl ResponseUsage {
+    /// Convert back to standard UsageInfo format
+    pub fn to_usage_info(&self) -> UsageInfo {
+        UsageInfo {
+            prompt_tokens: self.input_tokens,
+            completion_tokens: self.output_tokens,
+            total_tokens: self.total_tokens,
+            reasoning_tokens: self
+                .output_tokens_details
+                .as_ref()
+                .map(|details| details.reasoning_tokens),
+            prompt_tokens_details: self.input_tokens_details.as_ref().map(|details| {
+                PromptTokenUsageInfo {
+                    cached_tokens: details.cached_tokens,
+                }
+            }),
+        }
+    }
+}
+
+fn generate_request_id() -> String {
+    format!("resp_{}", uuid::Uuid::new_v4().simple())
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ResponsesRequest {
+    // ============= Core OpenAI API fields =============
+    /// Run the request in the background
+    #[serde(default)]
+    pub background: bool,
+
+    /// Fields to include in the response
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub include: Option<Vec<IncludeField>>,
+
+    /// Input content - can be string or structured items
+    pub input: ResponseInput,
+
+    /// System instructions for the model
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub instructions: Option<String>,
+
+    /// Maximum number of output tokens
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_output_tokens: Option<u32>,
+
+    /// Maximum number of tool calls
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_tool_calls: Option<u32>,
+
+    /// Additional metadata
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<HashMap<String, serde_json::Value>>,
+
+    /// Model to use (optional to match vLLM)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model: Option<String>,
+
+    /// Whether to enable parallel tool calls
+    #[serde(default = "default_true")]
+    pub parallel_tool_calls: bool,
+
+    /// ID of previous response to continue from
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub previous_response_id: Option<String>,
+
+    /// Reasoning configuration
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub reasoning: Option<ResponseReasoningParam>,
+
+    /// Service tier
+    #[serde(default)]
+    pub service_tier: ServiceTier,
+
+    /// Whether to store the response
+    #[serde(default = "default_true")]
+    pub store: bool,
+
+    /// Whether to stream the response
+    #[serde(default)]
+    pub stream: bool,
+
+    /// Temperature for sampling
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+
+    /// Tool choice behavior
+    #[serde(default)]
+    pub tool_choice: ToolChoice,
+
+    /// Available tools
+    #[serde(default)]
+    pub tools: Vec<ResponseTool>,
+
+    /// Number of top logprobs to return
+    #[serde(default)]
+    pub top_logprobs: u32,
+
+    /// Top-p sampling parameter
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f32>,
+
+    /// Truncation behavior
+    #[serde(default)]
+    pub truncation: Truncation,
+
+    /// User identifier
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub user: Option<String>,
+
+    // ============= SGLang Extensions =============
+    /// Request ID
+    #[serde(default = "generate_request_id")]
+    pub request_id: String,
+
+    /// Request priority
+    #[serde(default)]
+    pub priority: i32,
+
+    /// Frequency penalty
+    #[serde(default)]
+    pub frequency_penalty: f32,
+
+    /// Presence penalty
+    #[serde(default)]
+    pub presence_penalty: f32,
+
+    /// Stop sequences
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop: Option<StringOrArray>,
+
+    /// Top-k sampling parameter
+    #[serde(default = "default_top_k")]
+    pub top_k: i32,
+
+    /// Min-p sampling parameter
+    #[serde(default)]
+    pub min_p: f32,
+
+    /// Repetition penalty
+    #[serde(default = "default_repetition_penalty")]
+    pub repetition_penalty: f32,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum ResponseInput {
+    Text(String),
+    Items(Vec<ResponseInputOutputItem>),
+}
+
+fn default_top_k() -> i32 {
+    -1
+}
+
+fn default_repetition_penalty() -> f32 {
+    1.0
+}
+
+impl ResponsesRequest {
+    /// Default sampling parameters
+    const DEFAULT_TEMPERATURE: f32 = 0.7;
+    const DEFAULT_TOP_P: f32 = 1.0;
+
+    /// Convert to sampling parameters for generation
+    pub fn to_sampling_params(
+        &self,
+        default_max_tokens: u32,
+        default_params: Option<HashMap<String, serde_json::Value>>,
+    ) -> HashMap<String, serde_json::Value> {
+        let mut params = HashMap::new();
+
+        // Use max_output_tokens if available
+        let max_tokens = if let Some(max_output) = self.max_output_tokens {
+            std::cmp::min(max_output, default_max_tokens)
+        } else {
+            default_max_tokens
+        };
+
+        // Avoid exceeding context length by minus 1 token
+        let max_tokens = max_tokens.saturating_sub(1);
+
+        // Temperature
+        let temperature = self.temperature.unwrap_or_else(|| {
+            default_params
+                .as_ref()
+                .and_then(|p| p.get("temperature"))
+                .and_then(|v| v.as_f64())
+                .map(|v| v as f32)
+                .unwrap_or(Self::DEFAULT_TEMPERATURE)
+        });
+
+        // Top-p
+        let top_p = self.top_p.unwrap_or_else(|| {
+            default_params
+                .as_ref()
+                .and_then(|p| p.get("top_p"))
+                .and_then(|v| v.as_f64())
+                .map(|v| v as f32)
+                .unwrap_or(Self::DEFAULT_TOP_P)
+        });
+
+        params.insert(
+            "max_new_tokens".to_string(),
+            serde_json::Value::Number(serde_json::Number::from(max_tokens)),
+        );
+        params.insert(
+            "temperature".to_string(),
+            serde_json::Value::Number(serde_json::Number::from_f64(temperature as f64).unwrap()),
+        );
+        params.insert(
+            "top_p".to_string(),
+            serde_json::Value::Number(serde_json::Number::from_f64(top_p as f64).unwrap()),
+        );
+        params.insert(
+            "frequency_penalty".to_string(),
+            serde_json::Value::Number(
+                serde_json::Number::from_f64(self.frequency_penalty as f64).unwrap(),
+            ),
+        );
+        params.insert(
+            "presence_penalty".to_string(),
+            serde_json::Value::Number(
+                serde_json::Number::from_f64(self.presence_penalty as f64).unwrap(),
+            ),
+        );
+        params.insert(
+            "top_k".to_string(),
+            serde_json::Value::Number(serde_json::Number::from(self.top_k)),
+        );
+        params.insert(
+            "min_p".to_string(),
+            serde_json::Value::Number(serde_json::Number::from_f64(self.min_p as f64).unwrap()),
+        );
+        params.insert(
+            "repetition_penalty".to_string(),
+            serde_json::Value::Number(
+                serde_json::Number::from_f64(self.repetition_penalty as f64).unwrap(),
+            ),
+        );
+
+        if let Some(ref stop) = self.stop {
+            match serde_json::to_value(stop) {
+                Ok(value) => params.insert("stop".to_string(), value),
+                Err(_) => params.insert("stop".to_string(), serde_json::Value::Null),
+            };
+        }
+
+        // Apply any additional default parameters
+        if let Some(default_params) = default_params {
+            for (key, value) in default_params {
+                params.entry(key).or_insert(value);
+            }
+        }
+
+        params
+    }
+}
+
+impl GenerationRequest for ResponsesRequest {
+    fn is_stream(&self) -> bool {
+        self.stream
+    }
+
+    fn get_model(&self) -> Option<&str> {
+        self.model.as_deref()
+    }
+
+    fn extract_text_for_routing(&self) -> String {
+        match &self.input {
+            ResponseInput::Text(text) => text.clone(),
+            ResponseInput::Items(items) => items
+                .iter()
+                .filter_map(|item| match item {
+                    ResponseInputOutputItem::Message { content, .. } => {
+                        let texts: Vec<String> = content
+                            .iter()
+                            .map(|part| match part {
+                                ResponseContentPart::OutputText { text, .. } => text.clone(),
+                            })
+                            .collect();
+                        if texts.is_empty() {
+                            None
+                        } else {
+                            Some(texts.join(" "))
+                        }
+                    }
+                    ResponseInputOutputItem::Reasoning { content, .. } => {
+                        let texts: Vec<String> = content
+                            .iter()
+                            .map(|part| match part {
+                                ResponseReasoningContent::ReasoningText { text } => text.clone(),
+                            })
+                            .collect();
+                        if texts.is_empty() {
+                            None
+                        } else {
+                            Some(texts.join(" "))
+                        }
+                    }
+                    ResponseInputOutputItem::FunctionToolCall { arguments, .. } => {
+                        Some(arguments.clone())
+                    }
+                })
+                .collect::<Vec<String>>()
+                .join(" "),
+        }
+    }
+}
+
+fn generate_response_id() -> String {
+    format!("resp_{}", uuid::Uuid::new_v4().simple())
+}
+
+fn current_timestamp() -> i64 {
+    std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+        .as_secs() as i64
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ResponsesResponse {
+    /// Response ID
+    #[serde(default = "generate_response_id")]
+    pub id: String,
+
+    /// Object type
+    #[serde(default = "default_object_type")]
+    pub object: String,
+
+    /// Creation timestamp
+    #[serde(default = "current_timestamp")]
+    pub created_at: i64,
+
+    /// Model name
+    pub model: String,
+
+    /// Output items
+    #[serde(default)]
+    pub output: Vec<ResponseOutputItem>,
+
+    /// Response status
+    pub status: ResponseStatus,
+
+    /// Usage information
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub usage: Option<UsageInfo>,
+
+    /// Whether parallel tool calls are enabled
+    #[serde(default = "default_true")]
+    pub parallel_tool_calls: bool,
+
+    /// Tool choice setting
+    #[serde(default = "default_tool_choice")]
+    pub tool_choice: String,
+
+    /// Available tools
+    #[serde(default)]
+    pub tools: Vec<ResponseTool>,
+}
+
+fn default_object_type() -> String {
+    "response".to_string()
+}
+
+fn default_tool_choice() -> String {
+    "auto".to_string()
+}
+
+impl ResponsesResponse {
+    /// Create a response from a request
+    #[allow(clippy::too_many_arguments)]
+    pub fn from_request(
+        request: &ResponsesRequest,
+        _sampling_params: &HashMap<String, serde_json::Value>,
+        model_name: String,
+        created_time: i64,
+        output: Vec<ResponseOutputItem>,
+        status: ResponseStatus,
+        usage: Option<UsageInfo>,
+    ) -> Self {
+        Self {
+            id: request.request_id.clone(),
+            object: "response".to_string(),
+            created_at: created_time,
+            model: model_name,
+            output,
+            status,
+            usage,
+            parallel_tool_calls: request.parallel_tool_calls,
+            tool_choice: match &request.tool_choice {
+                ToolChoice::Value(ToolChoiceValue::Auto) => "auto".to_string(),
+                ToolChoice::Value(ToolChoiceValue::Required) => "required".to_string(),
+                ToolChoice::Value(ToolChoiceValue::None) => "none".to_string(),
+                ToolChoice::Function { .. } => "function".to_string(),
+            },
+            tools: request.tools.clone(),
+        }
+    }
+
+    /// Create a new response with default values
+    pub fn new(request_id: String, model: String, status: ResponseStatus) -> Self {
+        Self {
+            id: request_id,
+            object: "response".to_string(),
+            created_at: current_timestamp(),
+            model,
+            output: Vec::new(),
+            status,
+            usage: None,
+            parallel_tool_calls: true,
+            tool_choice: "auto".to_string(),
+            tools: Vec::new(),
+        }
+    }
+
+    /// Add an output item to the response
+    pub fn add_output(&mut self, item: ResponseOutputItem) {
+        self.output.push(item);
+    }
+
+    /// Set the usage information
+    pub fn set_usage(&mut self, usage: UsageInfo) {
+        self.usage = Some(usage);
+    }
+
+    /// Update the status
+    pub fn set_status(&mut self, status: ResponseStatus) {
+        self.status = status;
+    }
+
+    /// Check if the response is complete
+    pub fn is_complete(&self) -> bool {
+        matches!(self.status, ResponseStatus::Completed)
+    }
+
+    /// Check if the response is in progress
+    pub fn is_in_progress(&self) -> bool {
+        matches!(self.status, ResponseStatus::InProgress)
+    }
+
+    /// Check if the response failed
+    pub fn is_failed(&self) -> bool {
+        matches!(self.status, ResponseStatus::Failed)
+    }
+
+    /// Check if the response was cancelled
+    pub fn is_cancelled(&self) -> bool {
+        matches!(self.status, ResponseStatus::Cancelled)
+    }
+
+    /// Check if the response is queued
+    pub fn is_queued(&self) -> bool {
+        matches!(self.status, ResponseStatus::Queued)
+    }
+
+    /// Convert usage to OpenAI Responses API format
+    pub fn usage_in_response_format(&self) -> Option<ResponseUsage> {
+        self.usage.as_ref().map(|usage| usage.to_response_usage())
+    }
+
+    /// Get the response as a JSON value with usage in response format
+    pub fn to_response_format(&self) -> serde_json::Value {
+        let mut response = serde_json::to_value(self).unwrap_or(serde_json::Value::Null);
+
+        // Convert usage to response format if present
+        if let Some(usage) = &self.usage {
+            if let Ok(usage_value) = serde_json::to_value(usage.to_response_usage()) {
+                response["usage"] = usage_value;
+            }
+        }
+
+        response
+    }
+}
+
+// ============= Helper Functions =============
+
+impl ResponseOutputItem {
+    /// Create a new message output item
+    pub fn new_message(
+        id: String,
+        role: String,
+        content: Vec<ResponseContentPart>,
+        status: String,
+    ) -> Self {
+        Self::Message {
+            id,
+            role,
+            content,
+            status,
+        }
+    }
+
+    /// Create a new reasoning output item
+    pub fn new_reasoning(
+        id: String,
+        summary: Vec<String>,
+        content: Vec<ResponseReasoningContent>,
+        status: Option<String>,
+    ) -> Self {
+        Self::Reasoning {
+            id,
+            summary,
+            content,
+            status,
+        }
+    }
+
+    /// Create a new function tool call output item
+    pub fn new_function_tool_call(
+        id: String,
+        name: String,
+        arguments: String,
+        output: Option<String>,
+        status: String,
+    ) -> Self {
+        Self::FunctionToolCall {
+            id,
+            name,
+            arguments,
+            output,
+            status,
+        }
+    }
+}
+
+impl ResponseContentPart {
+    /// Create a new text content part
+    pub fn new_text(
+        text: String,
+        annotations: Vec<String>,
+        logprobs: Option<ChatLogProbs>,
+    ) -> Self {
+        Self::OutputText {
+            text,
+            annotations,
+            logprobs,
+        }
+    }
+}
+
+impl ResponseReasoningContent {
+    /// Create a new reasoning text content
+    pub fn new_reasoning_text(text: String) -> Self {
+        Self::ReasoningText { text }
+    }
+}
+
+impl UsageInfo {
+    /// Create a new usage info with token counts
+    pub fn new(prompt_tokens: u32, completion_tokens: u32, reasoning_tokens: Option<u32>) -> Self {
+        Self {
+            prompt_tokens,
+            completion_tokens,
+            total_tokens: prompt_tokens + completion_tokens,
+            reasoning_tokens,
+            prompt_tokens_details: None,
+        }
+    }
+
+    /// Create usage info with cached token details
+    pub fn new_with_cached(
+        prompt_tokens: u32,
+        completion_tokens: u32,
+        reasoning_tokens: Option<u32>,
+        cached_tokens: u32,
+    ) -> Self {
+        Self {
+            prompt_tokens,
+            completion_tokens,
+            total_tokens: prompt_tokens + completion_tokens,
+            reasoning_tokens,
+            prompt_tokens_details: Some(PromptTokenUsageInfo { cached_tokens }),
+        }
+    }
+}
+
+// ==================================================================
+// =            OPENAI SPEC - Common                                =
+// ==================================================================
+
+// ============= Shared Request Components =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct StreamOptions {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub include_usage: Option<bool>,
+}
+
+// ============= Tool Choice Types =============
+
+/// Tool choice value for simple string options
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ToolChoiceValue {
+    Auto,
+    Required,
+    None,
+}
+
+/// Tool choice for both Chat Completion and Responses APIs
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum ToolChoice {
+    Value(ToolChoiceValue),
+    Function {
+        #[serde(rename = "type")]
+        tool_type: String, // "function"
+        function: FunctionChoice,
+    },
+}
+
+impl Default for ToolChoice {
+    fn default() -> Self {
+        Self::Value(ToolChoiceValue::Auto)
+    }
+}
+
+/// Function choice specification for ToolChoice::Function
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct FunctionChoice {
+    pub name: String,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Tool {
+    #[serde(rename = "type")]
+    pub tool_type: String, // "function"
+    pub function: Function,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Function {
+    pub name: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub description: Option<String>,
+    pub parameters: Value, // JSON Schema
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ToolCall {
+    pub id: String,
+    #[serde(rename = "type")]
+    pub tool_type: String, // "function"
+    pub function: FunctionCallResponse,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum FunctionCall {
+    None,
+    Auto,
+    Function { name: String },
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct FunctionCallResponse {
+    pub name: String,
+    #[serde(default)]
+    pub arguments: Option<String>, // JSON string
+}
+
+// ============= Usage Tracking =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct Usage {
+    pub prompt_tokens: u32,
+    pub completion_tokens: u32,
+    pub total_tokens: u32,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub completion_tokens_details: Option<CompletionTokensDetails>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct CompletionTokensDetails {
+    pub reasoning_tokens: Option<u32>,
+}
+
+// ============= Logprobs Types =============
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct LogProbs {
+    pub tokens: Vec<String>,
+    pub token_logprobs: Vec<Option<f32>>,
+    pub top_logprobs: Vec<Option<HashMap<String, f32>>>,
+    pub text_offset: Vec<u32>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatLogProbs {
+    pub content: Option<Vec<ChatLogProbsContent>>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ChatLogProbsContent {
+    pub token: String,
+    pub logprob: f32,
+    pub bytes: Option<Vec<u8>>,
+    pub top_logprobs: Vec<TopLogProb>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TopLogProb {
+    pub token: String,
+    pub logprob: f32,
+    pub bytes: Option<Vec<u8>>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ErrorResponse {
+    pub error: ErrorDetail,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ErrorDetail {
+    pub message: String,
+    #[serde(rename = "type")]
+    pub error_type: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub param: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub code: Option<String>,
+}
+
+// ==================================================================
+// =            SGLANG SPEC - GENERATE API                          =
+// ==================================================================
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum InputIds {
+    Single(Vec<i32>),
+    Batch(Vec<Vec<i32>>),
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, Default)]
+pub struct GenerateParameters {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub best_of: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub decoder_input_details: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub details: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub do_sample: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_new_tokens: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub repetition_penalty: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub return_full_text: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub seed: Option<u64>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop: Option<Vec<String>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_k: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub truncate: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub typical_p: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub watermark: Option<bool>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, Default)]
+pub struct SamplingParams {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_new_tokens: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_k: Option<i32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub frequency_penalty: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub presence_penalty: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub repetition_penalty: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop: Option<StringOrArray>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ignore_eos: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub skip_special_tokens: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub json_schema: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub regex: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ebnf: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub min_p: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub min_tokens: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop_token_ids: Option<Vec<i32>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub no_stop_trim: Option<bool>,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct GenerateRequest {
+    /// The prompt to generate from (OpenAI style)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub prompt: Option<StringOrArray>,
+
+    /// Text input - SGLang native format
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub text: Option<String>,
+
+    /// Input IDs for tokenized input
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub input_ids: Option<InputIds>,
+
+    /// Generation parameters
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub parameters: Option<GenerateParameters>,
+
+    /// Sampling parameters (sglang style)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub sampling_params: Option<SamplingParams>,
+
+    /// Whether to stream the response
+    #[serde(default)]
+    pub stream: bool,
+
+    /// Whether to return logprobs
+    #[serde(default)]
+    pub return_logprob: bool,
+
+    // ============= SGLang Extensions =============
+    /// Path to LoRA adapter(s) for model customization
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub lora_path: Option<LoRAPath>,
+
+    /// Session parameters for continual prompting
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub session_params: Option<HashMap<String, serde_json::Value>>,
+
+    /// Return model hidden states
+    #[serde(default)]
+    pub return_hidden_states: bool,
+
+    /// Request ID for tracking
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub rid: Option<String>,
+}
+
+impl GenerationRequest for GenerateRequest {
+    fn is_stream(&self) -> bool {
+        self.stream
+    }
+
+    fn get_model(&self) -> Option<&str> {
+        // Generate requests typically don't have a model field
+        None
+    }
+
+    fn extract_text_for_routing(&self) -> String {
+        // Check fields in priority order: text, prompt, inputs
+        if let Some(ref text) = self.text {
+            return text.clone();
+        }
+
+        if let Some(ref prompt) = self.prompt {
+            return match prompt {
+                StringOrArray::String(s) => s.clone(),
+                StringOrArray::Array(v) => v.join(" "),
+            };
+        }
+
+        if let Some(ref input_ids) = self.input_ids {
+            return match input_ids {
+                InputIds::Single(ids) => ids
+                    .iter()
+                    .map(|&id| id.to_string())
+                    .collect::<Vec<String>>()
+                    .join(" "),
+                InputIds::Batch(batches) => batches
+                    .iter()
+                    .flat_map(|batch| batch.iter().map(|&id| id.to_string()))
+                    .collect::<Vec<String>>()
+                    .join(" "),
+            };
+        }
+
+        // No text input found
+        String::new()
+    }
+}
+
+// ==================================================================
+// =            SGLANG SPEC - RERANK API                            =
+// ==================================================================
+
+// Constants for rerank API
+pub const DEFAULT_MODEL_NAME: &str = "default";
+
+/// Rerank request for scoring documents against a query
+/// Used for RAG systems and document relevance scoring
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RerankRequest {
+    /// The query text to rank documents against
+    pub query: String,
+
+    /// List of documents to be ranked
+    pub documents: Vec<String>,
+
+    /// Model to use for reranking
+    #[serde(default = "default_model_name")]
+    pub model: String,
+
+    /// Maximum number of documents to return (optional)
+    pub top_k: Option<usize>,
+
+    /// Whether to return documents in addition to scores
+    #[serde(default = "default_return_documents")]
+    pub return_documents: bool,
+
+    // SGLang specific extensions
+    /// Request ID for tracking
+    pub rid: Option<StringOrArray>,
+
+    /// User identifier
+    pub user: Option<String>,
+}
+
+fn default_model_name() -> String {
+    DEFAULT_MODEL_NAME.to_string()
+}
+
+fn default_return_documents() -> bool {
+    true
+}
+
+/// Individual rerank result
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RerankResult {
+    /// Relevance score for the document
+    pub score: f32,
+
+    /// The document text (if return_documents was true)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub document: Option<String>,
+
+    /// Original index of the document in the request
+    pub index: usize,
+
+    /// Additional metadata about the ranking
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub meta_info: Option<HashMap<String, Value>>,
+}
+
+/// Rerank response containing sorted results
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RerankResponse {
+    /// Ranked results sorted by score (highest first)
+    pub results: Vec<RerankResult>,
+
+    /// Model used for reranking
+    pub model: String,
+
+    /// Usage information
+    pub usage: Option<UsageInfo>,
+
+    /// Response object type
+    #[serde(default = "default_rerank_object")]
+    pub object: String,
+
+    /// Response ID
+    pub id: String,
+
+    /// Creation timestamp
+    pub created: i64,
+}
+
+fn default_rerank_object() -> String {
+    "rerank".to_string()
+}
+
+/// V1 API compatibility format for rerank requests
+/// Matches Python's V1RerankReqInput
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct V1RerankReqInput {
+    pub query: String,
+    pub documents: Vec<String>,
+}
+
+/// Convert V1RerankReqInput to RerankRequest
+impl From<V1RerankReqInput> for RerankRequest {
+    fn from(v1: V1RerankReqInput) -> Self {
+        RerankRequest {
+            query: v1.query,
+            documents: v1.documents,
+            model: default_model_name(),
+            top_k: None,
+            return_documents: true,
+            rid: None,
+            user: None,
+        }
+    }
+}
+
+/// Implementation of GenerationRequest trait for RerankRequest
+impl GenerationRequest for RerankRequest {
+    fn get_model(&self) -> Option<&str> {
+        Some(&self.model)
+    }
+
+    fn is_stream(&self) -> bool {
+        false // Reranking doesn't support streaming
+    }
+
+    fn extract_text_for_routing(&self) -> String {
+        self.query.clone()
+    }
+}
+
+impl RerankRequest {
+    pub fn validate(&self) -> Result<(), String> {
+        // Validate query is not empty
+        if self.query.trim().is_empty() {
+            return Err("Query cannot be empty".to_string());
+        }
+
+        // Validate documents list
+        if self.documents.is_empty() {
+            return Err("Documents list cannot be empty".to_string());
+        }
+
+        // Validate top_k if specified
+        if let Some(k) = self.top_k {
+            if k == 0 {
+                return Err("top_k must be greater than 0".to_string());
+            }
+            if k > self.documents.len() {
+                // This is allowed but we log a warning
+                tracing::warn!(
+                    "top_k ({}) is greater than number of documents ({})",
+                    k,
+                    self.documents.len()
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Get the effective top_k value
+    pub fn effective_top_k(&self) -> usize {
+        self.top_k.unwrap_or(self.documents.len())
+    }
+}
+
+impl RerankResponse {
+    pub fn new(results: Vec<RerankResult>, model: String, request_id: String) -> Self {
+        RerankResponse {
+            results,
+            model,
+            usage: None,
+            object: default_rerank_object(),
+            id: request_id,
+            created: current_timestamp(),
+        }
+    }
+
+    /// Sort results by score in descending order
+    pub fn sort_by_score(&mut self) {
+        self.results.sort_by(|a, b| {
+            b.score
+                .partial_cmp(&a.score)
+                .unwrap_or(std::cmp::Ordering::Equal)
+        });
+    }
+
+    /// Apply top_k limit to results
+    pub fn apply_top_k(&mut self, k: usize) {
+        self.results.truncate(k);
+    }
+}
+
+// ==================================================================
+// =            COMMON                                              =
+// ==================================================================
+
+/// Helper function for serde default value
+pub fn default_true() -> bool {
+    true
+}
+
+/// Common trait for all generation requests across different APIs
+pub trait GenerationRequest: Send + Sync {
+    /// Check if the request is for streaming
+    fn is_stream(&self) -> bool;
+
+    /// Get the model name if specified
+    fn get_model(&self) -> Option<&str>;
+
+    /// Extract text content for routing decisions
+    fn extract_text_for_routing(&self) -> String;
+}
+
+/// Helper type for string or array of strings
+#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
+#[serde(untagged)]
+pub enum StringOrArray {
+    String(String),
+    Array(Vec<String>),
+}
+impl StringOrArray {
+    /// Get the number of items in the StringOrArray
+    pub fn len(&self) -> usize {
+        match self {
+            StringOrArray::String(_) => 1,
+            StringOrArray::Array(arr) => arr.len(),
+        }
+    }
+
+    /// Check if the StringOrArray is empty
+    pub fn is_empty(&self) -> bool {
+        match self {
+            StringOrArray::String(s) => s.is_empty(),
+            StringOrArray::Array(arr) => arr.is_empty(),
+        }
+    }
+
+    /// Convert to a vector of strings
+    pub fn to_vec(&self) -> Vec<String> {
+        match self {
+            StringOrArray::String(s) => vec![s.clone()],
+            StringOrArray::Array(arr) => arr.clone(),
+        }
+    }
+}
+
+/// LoRA adapter path - can be single path or batch of paths (SGLang extension)
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum LoRAPath {
+    Single(Option<String>),
+    Batch(Vec<Option<String>>),
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json;
+
+    // ==================================================================
+    // =            RERANK REQUEST TESTS                                =
+    // ==================================================================
+
+    #[test]
+    fn test_rerank_request_serialization() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string()],
+            model: "test-model".to_string(),
+            top_k: Some(5),
+            return_documents: true,
+            rid: Some(StringOrArray::String("req-123".to_string())),
+            user: Some("user-456".to_string()),
+        };
+
+        let serialized = serde_json::to_string(&request).unwrap();
+        let deserialized: RerankRequest = serde_json::from_str(&serialized).unwrap();
+
+        assert_eq!(deserialized.query, request.query);
+        assert_eq!(deserialized.documents, request.documents);
+        assert_eq!(deserialized.model, request.model);
+        assert_eq!(deserialized.top_k, request.top_k);
+        assert_eq!(deserialized.return_documents, request.return_documents);
+        assert_eq!(deserialized.rid, request.rid);
+        assert_eq!(deserialized.user, request.user);
+    }
+
+    #[test]
+    fn test_rerank_request_deserialization_with_defaults() {
+        let json = r#"{
+            "query": "test query",
+            "documents": ["doc1", "doc2"]
+        }"#;
+
+        let request: RerankRequest = serde_json::from_str(json).unwrap();
+
+        assert_eq!(request.query, "test query");
+        assert_eq!(request.documents, vec!["doc1", "doc2"]);
+        assert_eq!(request.model, default_model_name());
+        assert_eq!(request.top_k, None);
+        assert!(request.return_documents);
+        assert_eq!(request.rid, None);
+        assert_eq!(request.user, None);
+    }
+
+    #[test]
+    fn test_rerank_request_validation_success() {
+        let request = RerankRequest {
+            query: "valid query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string()],
+            model: "test-model".to_string(),
+            top_k: Some(2),
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        assert!(request.validate().is_ok());
+    }
+
+    #[test]
+    fn test_rerank_request_validation_empty_query() {
+        let request = RerankRequest {
+            query: "".to_string(),
+            documents: vec!["doc1".to_string()],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        let result = request.validate();
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "Query cannot be empty");
+    }
+
+    #[test]
+    fn test_rerank_request_validation_whitespace_query() {
+        let request = RerankRequest {
+            query: "   ".to_string(),
+            documents: vec!["doc1".to_string()],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        let result = request.validate();
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "Query cannot be empty");
+    }
+
+    #[test]
+    fn test_rerank_request_validation_empty_documents() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec![],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        let result = request.validate();
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "Documents list cannot be empty");
+    }
+
+    #[test]
+    fn test_rerank_request_validation_top_k_zero() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string()],
+            model: "test-model".to_string(),
+            top_k: Some(0),
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        let result = request.validate();
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "top_k must be greater than 0");
+    }
+
+    #[test]
+    fn test_rerank_request_validation_top_k_greater_than_docs() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string()],
+            model: "test-model".to_string(),
+            top_k: Some(5),
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        // This should pass but log a warning
+        assert!(request.validate().is_ok());
+    }
+
+    #[test]
+    fn test_rerank_request_effective_top_k() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string(), "doc3".to_string()],
+            model: "test-model".to_string(),
+            top_k: Some(2),
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        assert_eq!(request.effective_top_k(), 2);
+    }
+
+    #[test]
+    fn test_rerank_request_effective_top_k_none() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string(), "doc3".to_string()],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        assert_eq!(request.effective_top_k(), 3);
+    }
+
+    // ==================================================================
+    // =            RERANK RESPONSE TESTS                               =
+    // ==================================================================
+
+    #[test]
+    fn test_rerank_response_creation() {
+        let results = vec![
+            RerankResult {
+                score: 0.8,
+                document: Some("doc1".to_string()),
+                index: 0,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.6,
+                document: Some("doc2".to_string()),
+                index: 1,
+                meta_info: None,
+            },
+        ];
+
+        let response = RerankResponse::new(
+            results.clone(),
+            "test-model".to_string(),
+            "req-123".to_string(),
+        );
+
+        assert_eq!(response.results.len(), 2);
+        assert_eq!(response.model, "test-model");
+        assert_eq!(response.id, "req-123");
+        assert_eq!(response.object, "rerank");
+        assert!(response.created > 0);
+    }
+
+    #[test]
+    fn test_rerank_response_serialization() {
+        let results = vec![RerankResult {
+            score: 0.8,
+            document: Some("doc1".to_string()),
+            index: 0,
+            meta_info: None,
+        }];
+
+        let response =
+            RerankResponse::new(results, "test-model".to_string(), "req-123".to_string());
+
+        let serialized = serde_json::to_string(&response).unwrap();
+        let deserialized: RerankResponse = serde_json::from_str(&serialized).unwrap();
+
+        assert_eq!(deserialized.results.len(), response.results.len());
+        assert_eq!(deserialized.model, response.model);
+        assert_eq!(deserialized.id, response.id);
+        assert_eq!(deserialized.object, response.object);
+    }
+
+    #[test]
+    fn test_rerank_response_sort_by_score() {
+        let results = vec![
+            RerankResult {
+                score: 0.6,
+                document: Some("doc2".to_string()),
+                index: 1,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.8,
+                document: Some("doc1".to_string()),
+                index: 0,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.4,
+                document: Some("doc3".to_string()),
+                index: 2,
+                meta_info: None,
+            },
+        ];
+
+        let mut response =
+            RerankResponse::new(results, "test-model".to_string(), "req-123".to_string());
+
+        response.sort_by_score();
+
+        assert_eq!(response.results[0].score, 0.8);
+        assert_eq!(response.results[0].index, 0);
+        assert_eq!(response.results[1].score, 0.6);
+        assert_eq!(response.results[1].index, 1);
+        assert_eq!(response.results[2].score, 0.4);
+        assert_eq!(response.results[2].index, 2);
+    }
+
+    #[test]
+    fn test_rerank_response_apply_top_k() {
+        let results = vec![
+            RerankResult {
+                score: 0.8,
+                document: Some("doc1".to_string()),
+                index: 0,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.6,
+                document: Some("doc2".to_string()),
+                index: 1,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.4,
+                document: Some("doc3".to_string()),
+                index: 2,
+                meta_info: None,
+            },
+        ];
+
+        let mut response =
+            RerankResponse::new(results, "test-model".to_string(), "req-123".to_string());
+
+        response.apply_top_k(2);
+
+        assert_eq!(response.results.len(), 2);
+        assert_eq!(response.results[0].score, 0.8);
+        assert_eq!(response.results[1].score, 0.6);
+    }
+
+    #[test]
+    fn test_rerank_response_apply_top_k_larger_than_results() {
+        let results = vec![RerankResult {
+            score: 0.8,
+            document: Some("doc1".to_string()),
+            index: 0,
+            meta_info: None,
+        }];
+
+        let mut response =
+            RerankResponse::new(results, "test-model".to_string(), "req-123".to_string());
+
+        response.apply_top_k(5);
+
+        assert_eq!(response.results.len(), 1);
+    }
+
+    // ==================================================================
+    // =            RERANK RESULT TESTS                                 =
+    // ==================================================================
+
+    #[test]
+    fn test_rerank_result_serialization() {
+        let result = RerankResult {
+            score: 0.85,
+            document: Some("test document".to_string()),
+            index: 42,
+            meta_info: Some(HashMap::from([
+                ("confidence".to_string(), Value::String("high".to_string())),
+                (
+                    "processing_time".to_string(),
+                    Value::Number(serde_json::Number::from(150)),
+                ),
+            ])),
+        };
+
+        let serialized = serde_json::to_string(&result).unwrap();
+        let deserialized: RerankResult = serde_json::from_str(&serialized).unwrap();
+
+        assert_eq!(deserialized.score, result.score);
+        assert_eq!(deserialized.document, result.document);
+        assert_eq!(deserialized.index, result.index);
+        assert_eq!(deserialized.meta_info, result.meta_info);
+    }
+
+    #[test]
+    fn test_rerank_result_serialization_without_document() {
+        let result = RerankResult {
+            score: 0.85,
+            document: None,
+            index: 42,
+            meta_info: None,
+        };
+
+        let serialized = serde_json::to_string(&result).unwrap();
+        let deserialized: RerankResult = serde_json::from_str(&serialized).unwrap();
+
+        assert_eq!(deserialized.score, result.score);
+        assert_eq!(deserialized.document, result.document);
+        assert_eq!(deserialized.index, result.index);
+        assert_eq!(deserialized.meta_info, result.meta_info);
+    }
+
+    // ==================================================================
+    // =            V1 COMPATIBILITY TESTS                              =
+    // ==================================================================
+
+    #[test]
+    fn test_v1_rerank_req_input_serialization() {
+        let v1_input = V1RerankReqInput {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string()],
+        };
+
+        let serialized = serde_json::to_string(&v1_input).unwrap();
+        let deserialized: V1RerankReqInput = serde_json::from_str(&serialized).unwrap();
+
+        assert_eq!(deserialized.query, v1_input.query);
+        assert_eq!(deserialized.documents, v1_input.documents);
+    }
+
+    #[test]
+    fn test_v1_to_rerank_request_conversion() {
+        let v1_input = V1RerankReqInput {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string(), "doc2".to_string()],
+        };
+
+        let request: RerankRequest = v1_input.into();
+
+        assert_eq!(request.query, "test query");
+        assert_eq!(request.documents, vec!["doc1", "doc2"]);
+        assert_eq!(request.model, default_model_name());
+        assert_eq!(request.top_k, None);
+        assert!(request.return_documents);
+        assert_eq!(request.rid, None);
+        assert_eq!(request.user, None);
+    }
+
+    // ==================================================================
+    // =            GENERATION REQUEST TRAIT TESTS                      =
+    // ==================================================================
+
+    #[test]
+    fn test_rerank_request_generation_request_trait() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string()],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        assert_eq!(request.get_model(), Some("test-model"));
+        assert!(!request.is_stream());
+        assert_eq!(request.extract_text_for_routing(), "test query");
+    }
+
+    // ==================================================================
+    // =            EDGE CASES AND STRESS TESTS                         =
+    // ==================================================================
+
+    #[test]
+    fn test_rerank_request_very_long_query() {
+        let long_query = "a".repeat(100000);
+        let request = RerankRequest {
+            query: long_query,
+            documents: vec!["doc1".to_string()],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        assert!(request.validate().is_ok());
+    }
+
+    #[test]
+    fn test_rerank_request_many_documents() {
+        let documents: Vec<String> = (0..1000).map(|i| format!("doc{}", i)).collect();
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents,
+            model: "test-model".to_string(),
+            top_k: Some(100),
+            return_documents: true,
+            rid: None,
+            user: None,
+        };
+
+        assert!(request.validate().is_ok());
+        assert_eq!(request.effective_top_k(), 100);
+    }
+
+    #[test]
+    fn test_rerank_request_special_characters() {
+        let request = RerankRequest {
+            query: "query with émojis 🚀 and unicode: 测试".to_string(),
+            documents: vec![
+                "doc with émojis 🎉".to_string(),
+                "doc with unicode: 测试".to_string(),
+            ],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: Some(StringOrArray::String("req-🚀-123".to_string())),
+            user: Some("user-🎉-456".to_string()),
+        };
+
+        assert!(request.validate().is_ok());
+    }
+
+    #[test]
+    fn test_rerank_request_rid_array() {
+        let request = RerankRequest {
+            query: "test query".to_string(),
+            documents: vec!["doc1".to_string()],
+            model: "test-model".to_string(),
+            top_k: None,
+            return_documents: true,
+            rid: Some(StringOrArray::Array(vec![
+                "req1".to_string(),
+                "req2".to_string(),
+            ])),
+            user: None,
+        };
+
+        assert!(request.validate().is_ok());
+    }
+
+    #[test]
+    fn test_rerank_response_with_usage_info() {
+        let results = vec![RerankResult {
+            score: 0.8,
+            document: Some("doc1".to_string()),
+            index: 0,
+            meta_info: None,
+        }];
+
+        let mut response =
+            RerankResponse::new(results, "test-model".to_string(), "req-123".to_string());
+
+        response.usage = Some(UsageInfo {
+            prompt_tokens: 100,
+            completion_tokens: 50,
+            total_tokens: 150,
+            reasoning_tokens: None,
+            prompt_tokens_details: None,
+        });
+
+        let serialized = serde_json::to_string(&response).unwrap();
+        let deserialized: RerankResponse = serde_json::from_str(&serialized).unwrap();
+
+        assert!(deserialized.usage.is_some());
+        let usage = deserialized.usage.unwrap();
+        assert_eq!(usage.prompt_tokens, 100);
+        assert_eq!(usage.completion_tokens, 50);
+        assert_eq!(usage.total_tokens, 150);
+    }
+
+    // ==================================================================
+    // =            INTEGRATION TESTS                                   =
+    // ==================================================================
+
+    #[test]
+    fn test_full_rerank_workflow() {
+        // Create request
+        let request = RerankRequest {
+            query: "machine learning".to_string(),
+            documents: vec![
+                "Introduction to machine learning algorithms".to_string(),
+                "Deep learning for computer vision".to_string(),
+                "Natural language processing basics".to_string(),
+                "Statistics and probability theory".to_string(),
+            ],
+            model: "rerank-model".to_string(),
+            top_k: Some(2),
+            return_documents: true,
+            rid: Some(StringOrArray::String("req-123".to_string())),
+            user: Some("user-456".to_string()),
+        };
+
+        // Validate request
+        assert!(request.validate().is_ok());
+
+        // Simulate reranking results (in real scenario, this would come from the model)
+        let results = vec![
+            RerankResult {
+                score: 0.95,
+                document: Some("Introduction to machine learning algorithms".to_string()),
+                index: 0,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.87,
+                document: Some("Deep learning for computer vision".to_string()),
+                index: 1,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.72,
+                document: Some("Natural language processing basics".to_string()),
+                index: 2,
+                meta_info: None,
+            },
+            RerankResult {
+                score: 0.45,
+                document: Some("Statistics and probability theory".to_string()),
+                index: 3,
+                meta_info: None,
+            },
+        ];
+
+        // Create response
+        let mut response = RerankResponse::new(
+            results,
+            request.model.clone(),
+            request
+                .rid
+                .as_ref()
+                .and_then(|r| match r {
+                    StringOrArray::String(s) => Some(s.clone()),
+                    StringOrArray::Array(arr) => arr.first().cloned(),
+                })
+                .unwrap_or_else(|| "unknown".to_string()),
+        );
+
+        // Sort by score
+        response.sort_by_score();
+
+        // Apply top_k
+        response.apply_top_k(request.effective_top_k());
+
+        // Verify results
+        assert_eq!(response.results.len(), 2);
+        assert_eq!(response.results[0].score, 0.95);
+        assert_eq!(response.results[0].index, 0);
+        assert_eq!(response.results[1].score, 0.87);
+        assert_eq!(response.results[1].index, 1);
+        assert_eq!(response.model, "rerank-model");
+
+        // Serialize and deserialize
+        let serialized = serde_json::to_string(&response).unwrap();
+        let deserialized: RerankResponse = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(deserialized.results.len(), 2);
+        assert_eq!(deserialized.model, response.model);
+    }
+}
diff --git a/sgl-router/src/protocols/validation.rs b/sgl-router/src/protocols/validation.rs
new file mode 100644
index 000000000..460ce2148
--- /dev/null
+++ b/sgl-router/src/protocols/validation.rs
@@ -0,0 +1,1164 @@
+// Core validation infrastructure for API parameter validation
+
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+use std::fmt::Display;
+
+// Import types from spec module
+use crate::protocols::spec::{
+    ChatCompletionRequest, ChatMessage, ResponseFormat, StringOrArray, UserMessageContent,
+};
+
+/// Validation constants for OpenAI API parameters
+pub mod constants {
+    /// Temperature range: 0.0 to 2.0 (OpenAI spec)
+    pub const TEMPERATURE_RANGE: (f32, f32) = (0.0, 2.0);
+
+    /// Top-p range: 0.0 to 1.0 (exclusive of 0.0)
+    pub const TOP_P_RANGE: (f32, f32) = (0.0, 1.0);
+
+    /// Presence penalty range: -2.0 to 2.0 (OpenAI spec)
+    pub const PRESENCE_PENALTY_RANGE: (f32, f32) = (-2.0, 2.0);
+
+    /// Frequency penalty range: -2.0 to 2.0 (OpenAI spec)
+    pub const FREQUENCY_PENALTY_RANGE: (f32, f32) = (-2.0, 2.0);
+
+    /// Logprobs range for completions API: 0 to 5
+    pub const LOGPROBS_RANGE: (u32, u32) = (0, 5);
+
+    /// Top logprobs range for chat completions: 0 to 20
+    pub const TOP_LOGPROBS_RANGE: (u32, u32) = (0, 20);
+
+    /// Maximum number of stop sequences allowed
+    pub const MAX_STOP_SEQUENCES: usize = 4;
+
+    /// SGLang-specific validation constants
+    pub mod sglang {
+        /// Min-p range: 0.0 to 1.0 (SGLang extension)
+        pub const MIN_P_RANGE: (f32, f32) = (0.0, 1.0);
+
+        /// Top-k minimum value: -1 to disable, otherwise positive
+        pub const TOP_K_MIN: i32 = -1;
+
+        /// Repetition penalty range: 0.0 to 2.0 (SGLang extension)
+        /// 1.0 = no penalty, >1.0 = discourage repetition, <1.0 = encourage repetition
+        pub const REPETITION_PENALTY_RANGE: (f32, f32) = (0.0, 2.0);
+    }
+}
+
+/// Core validation error types
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum ValidationError {
+    /// Parameter value out of valid range
+    OutOfRange {
+        parameter: String,
+        value: String,
+        min: String,
+        max: String,
+    },
+    /// Invalid parameter value format or type
+    InvalidValue {
+        parameter: String,
+        value: String,
+        reason: String,
+    },
+    /// Cross-parameter validation failure
+    ConflictingParameters {
+        parameter1: String,
+        parameter2: String,
+        reason: String,
+    },
+    /// Required parameter missing
+    MissingRequired { parameter: String },
+    /// Too many items in array parameter
+    TooManyItems {
+        parameter: String,
+        count: usize,
+        max: usize,
+    },
+    /// Custom validation error
+    Custom(String),
+}
+
+impl Display for ValidationError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ValidationError::OutOfRange {
+                parameter,
+                value,
+                min,
+                max,
+            } => {
+                write!(
+                    f,
+                    "Parameter '{}' must be between {} and {}, got {}",
+                    parameter, min, max, value
+                )
+            }
+            ValidationError::InvalidValue {
+                parameter,
+                value,
+                reason,
+            } => {
+                write!(
+                    f,
+                    "Invalid value for parameter '{}': {} ({})",
+                    parameter, value, reason
+                )
+            }
+            ValidationError::ConflictingParameters {
+                parameter1,
+                parameter2,
+                reason,
+            } => {
+                write!(
+                    f,
+                    "Conflicting parameters '{}' and '{}': {}",
+                    parameter1, parameter2, reason
+                )
+            }
+            ValidationError::MissingRequired { parameter } => {
+                write!(f, "Required parameter '{}' is missing", parameter)
+            }
+            ValidationError::TooManyItems {
+                parameter,
+                count,
+                max,
+            } => {
+                write!(
+                    f,
+                    "Parameter '{}' has too many items: {} (maximum: {})",
+                    parameter, count, max
+                )
+            }
+            ValidationError::Custom(msg) => write!(f, "{}", msg),
+        }
+    }
+}
+
+impl std::error::Error for ValidationError {}
+
+/// Core validation utility functions
+pub mod utils {
+    use super::*;
+
+    /// Validate that a numeric value is within the specified range (inclusive)
+    pub fn validate_range<T>(
+        value: T,
+        range: &(T, T),
+        param_name: &str,
+    ) -> Result<T, ValidationError>
+    where
+        T: PartialOrd + Display + Copy,
+    {
+        if value >= range.0 && value <= range.1 {
+            Ok(value)
+        } else {
+            Err(ValidationError::OutOfRange {
+                parameter: param_name.to_string(),
+                value: value.to_string(),
+                min: range.0.to_string(),
+                max: range.1.to_string(),
+            })
+        }
+    }
+
+    /// Validate that a positive number is actually positive
+    pub fn validate_positive<T>(value: T, param_name: &str) -> Result<T, ValidationError>
+    where
+        T: PartialOrd + Display + Copy + Default,
+    {
+        if value > T::default() {
+            Ok(value)
+        } else {
+            Err(ValidationError::InvalidValue {
+                parameter: param_name.to_string(),
+                value: value.to_string(),
+                reason: "must be positive".to_string(),
+            })
+        }
+    }
+
+    /// Validate that an array doesn't exceed maximum length
+    pub fn validate_max_items<T>(
+        items: &[T],
+        max_count: usize,
+        param_name: &str,
+    ) -> Result<(), ValidationError> {
+        if items.len() <= max_count {
+            Ok(())
+        } else {
+            Err(ValidationError::TooManyItems {
+                parameter: param_name.to_string(),
+                count: items.len(),
+                max: max_count,
+            })
+        }
+    }
+
+    /// Validate that a required parameter is present
+    pub fn validate_required<'a, T>(
+        value: &'a Option<T>,
+        param_name: &str,
+    ) -> Result<&'a T, ValidationError> {
+        value
+            .as_ref()
+            .ok_or_else(|| ValidationError::MissingRequired {
+                parameter: param_name.to_string(),
+            })
+    }
+
+    /// Validate top_k parameter (SGLang extension)
+    pub fn validate_top_k(top_k: i32) -> Result<i32, ValidationError> {
+        if top_k == constants::sglang::TOP_K_MIN || top_k > 0 {
+            Ok(top_k)
+        } else {
+            Err(ValidationError::InvalidValue {
+                parameter: "top_k".to_string(),
+                value: top_k.to_string(),
+                reason: "must be -1 (disabled) or positive".to_string(),
+            })
+        }
+    }
+
+    /// Generic validation function for sampling options
+    pub fn validate_sampling_options<T: SamplingOptionsProvider + ?Sized>(
+        request: &T,
+    ) -> Result<(), ValidationError> {
+        // Validate temperature (0.0 to 2.0)
+        if let Some(temp) = request.get_temperature() {
+            validate_range(temp, &constants::TEMPERATURE_RANGE, "temperature")?;
+        }
+
+        // Validate top_p (0.0 to 1.0)
+        if let Some(top_p) = request.get_top_p() {
+            validate_range(top_p, &constants::TOP_P_RANGE, "top_p")?;
+        }
+
+        // Validate frequency_penalty (-2.0 to 2.0)
+        if let Some(freq_penalty) = request.get_frequency_penalty() {
+            validate_range(
+                freq_penalty,
+                &constants::FREQUENCY_PENALTY_RANGE,
+                "frequency_penalty",
+            )?;
+        }
+
+        // Validate presence_penalty (-2.0 to 2.0)
+        if let Some(pres_penalty) = request.get_presence_penalty() {
+            validate_range(
+                pres_penalty,
+                &constants::PRESENCE_PENALTY_RANGE,
+                "presence_penalty",
+            )?;
+        }
+
+        Ok(())
+    }
+
+    /// Generic validation function for stop conditions
+    pub fn validate_stop_conditions<T: StopConditionsProvider + ?Sized>(
+        request: &T,
+    ) -> Result<(), ValidationError> {
+        if let Some(stop) = request.get_stop_sequences() {
+            match stop {
+                StringOrArray::String(s) => {
+                    if s.is_empty() {
+                        return Err(ValidationError::InvalidValue {
+                            parameter: "stop".to_string(),
+                            value: "empty string".to_string(),
+                            reason: "stop sequences cannot be empty".to_string(),
+                        });
+                    }
+                }
+                StringOrArray::Array(arr) => {
+                    validate_max_items(arr, constants::MAX_STOP_SEQUENCES, "stop")?;
+                    for (i, s) in arr.iter().enumerate() {
+                        if s.is_empty() {
+                            return Err(ValidationError::InvalidValue {
+                                parameter: format!("stop[{}]", i),
+                                value: "empty string".to_string(),
+                                reason: "stop sequences cannot be empty".to_string(),
+                            });
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Generic validation function for token limits
+    pub fn validate_token_limits<T: TokenLimitsProvider + ?Sized>(
+        request: &T,
+    ) -> Result<(), ValidationError> {
+        // Validate max_tokens if provided
+        if let Some(max_tokens) = request.get_max_tokens() {
+            validate_positive(max_tokens, "max_tokens")?;
+        }
+
+        // Validate min_tokens if provided (SGLang extension)
+        if let Some(min_tokens) = request.get_min_tokens() {
+            validate_positive(min_tokens, "min_tokens")?;
+        }
+
+        Ok(())
+    }
+
+    /// Generic validation function for logprobs
+    pub fn validate_logprobs<T: LogProbsProvider + ?Sized>(
+        request: &T,
+    ) -> Result<(), ValidationError> {
+        // Validate logprobs (completions API - 0 to 5)
+        if let Some(logprobs) = request.get_logprobs() {
+            validate_range(logprobs, &constants::LOGPROBS_RANGE, "logprobs")?;
+        }
+
+        // Validate top_logprobs (chat API - 0 to 20)
+        if let Some(top_logprobs) = request.get_top_logprobs() {
+            validate_range(top_logprobs, &constants::TOP_LOGPROBS_RANGE, "top_logprobs")?;
+        }
+
+        Ok(())
+    }
+
+    /// Generic cross-parameter validation
+    pub fn validate_cross_parameters<T: TokenLimitsProvider + ?Sized>(
+        request: &T,
+    ) -> Result<(), ValidationError> {
+        // Check min_tokens <= max_tokens if both are specified
+        if let (Some(min_tokens), Some(max_tokens)) =
+            (request.get_min_tokens(), request.get_max_tokens())
+        {
+            if min_tokens > max_tokens {
+                return Err(ValidationError::ConflictingParameters {
+                    parameter1: "min_tokens".to_string(),
+                    parameter2: "max_tokens".to_string(),
+                    reason: format!(
+                        "min_tokens ({}) cannot be greater than max_tokens ({})",
+                        min_tokens, max_tokens
+                    ),
+                });
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Validate conflicting structured output constraints
+    pub fn validate_conflicting_parameters(
+        param1_name: &str,
+        param1_value: bool,
+        param2_name: &str,
+        param2_value: bool,
+        reason: &str,
+    ) -> Result<(), ValidationError> {
+        if param1_value && param2_value {
+            return Err(ValidationError::ConflictingParameters {
+                parameter1: param1_name.to_string(),
+                parameter2: param2_name.to_string(),
+                reason: reason.to_string(),
+            });
+        }
+        Ok(())
+    }
+
+    /// Validate that only one option from a set is active
+    pub fn validate_mutually_exclusive_options(
+        options: &[(&str, bool)],
+        error_msg: &str,
+    ) -> Result<(), ValidationError> {
+        let active_count = options.iter().filter(|(_, is_active)| *is_active).count();
+        if active_count > 1 {
+            return Err(ValidationError::Custom(error_msg.to_string()));
+        }
+        Ok(())
+    }
+
+    /// Generic validation for SGLang extensions
+    pub fn validate_sglang_extensions<T: SGLangExtensionsProvider + ?Sized>(
+        request: &T,
+    ) -> Result<(), ValidationError> {
+        // Validate top_k (-1 to disable, or positive)
+        if let Some(top_k) = request.get_top_k() {
+            validate_top_k(top_k)?;
+        }
+
+        // Validate min_p (0.0 to 1.0)
+        if let Some(min_p) = request.get_min_p() {
+            validate_range(min_p, &constants::sglang::MIN_P_RANGE, "min_p")?;
+        }
+
+        // Validate repetition_penalty (0.0 to 2.0)
+        if let Some(rep_penalty) = request.get_repetition_penalty() {
+            validate_range(
+                rep_penalty,
+                &constants::sglang::REPETITION_PENALTY_RANGE,
+                "repetition_penalty",
+            )?;
+        }
+
+        Ok(())
+    }
+
+    /// Generic validation for n parameter (number of completions)
+    pub fn validate_completion_count<T: CompletionCountProvider + ?Sized>(
+        request: &T,
+    ) -> Result<(), ValidationError> {
+        const N_RANGE: (u32, u32) = (1, 10);
+
+        if let Some(n) = request.get_n() {
+            validate_range(n, &N_RANGE, "n")?;
+        }
+
+        Ok(())
+    }
+
+    /// Validate that an array is not empty
+    pub fn validate_non_empty_array<T>(
+        items: &[T],
+        param_name: &str,
+    ) -> Result<(), ValidationError> {
+        if items.is_empty() {
+            return Err(ValidationError::MissingRequired {
+                parameter: param_name.to_string(),
+            });
+        }
+        Ok(())
+    }
+
+    /// Validate common request parameters that are shared across all API types
+    pub fn validate_common_request_params<T>(request: &T) -> Result<(), ValidationError>
+    where
+        T: SamplingOptionsProvider
+            + StopConditionsProvider
+            + TokenLimitsProvider
+            + LogProbsProvider
+            + SGLangExtensionsProvider
+            + CompletionCountProvider
+            + ?Sized,
+    {
+        // Validate all standard parameters
+        validate_sampling_options(request)?;
+        validate_stop_conditions(request)?;
+        validate_token_limits(request)?;
+        validate_logprobs(request)?;
+
+        // Validate SGLang extensions and completion count
+        validate_sglang_extensions(request)?;
+        validate_completion_count(request)?;
+
+        // Perform cross-parameter validation
+        validate_cross_parameters(request)?;
+
+        Ok(())
+    }
+}
+
+/// Core validation traits for different parameter categories
+pub trait SamplingOptionsProvider {
+    /// Get temperature parameter
+    fn get_temperature(&self) -> Option<f32>;
+
+    /// Get top_p parameter
+    fn get_top_p(&self) -> Option<f32>;
+
+    /// Get frequency penalty parameter
+    fn get_frequency_penalty(&self) -> Option<f32>;
+
+    /// Get presence penalty parameter
+    fn get_presence_penalty(&self) -> Option<f32>;
+}
+
+/// Trait for validating stop conditions
+pub trait StopConditionsProvider {
+    /// Get stop sequences
+    fn get_stop_sequences(&self) -> Option<&StringOrArray>;
+}
+
+/// Trait for validating token limits
+pub trait TokenLimitsProvider {
+    /// Get maximum tokens parameter
+    fn get_max_tokens(&self) -> Option<u32>;
+
+    /// Get minimum tokens parameter (SGLang extension)
+    fn get_min_tokens(&self) -> Option<u32>;
+}
+
+/// Trait for validating logprobs parameters
+pub trait LogProbsProvider {
+    /// Get logprobs parameter (completions API)
+    fn get_logprobs(&self) -> Option<u32>;
+
+    /// Get top_logprobs parameter (chat API)
+    fn get_top_logprobs(&self) -> Option<u32>;
+}
+
+/// Trait for SGLang-specific extensions
+pub trait SGLangExtensionsProvider {
+    /// Get top_k parameter
+    fn get_top_k(&self) -> Option<i32> {
+        None
+    }
+
+    /// Get min_p parameter
+    fn get_min_p(&self) -> Option<f32> {
+        None
+    }
+
+    /// Get repetition_penalty parameter
+    fn get_repetition_penalty(&self) -> Option<f32> {
+        None
+    }
+}
+
+/// Trait for n parameter (number of completions)
+pub trait CompletionCountProvider {
+    /// Get n parameter
+    fn get_n(&self) -> Option<u32> {
+        None
+    }
+}
+
+/// Comprehensive validation trait that combines all validation aspects
+pub trait ValidatableRequest:
+    SamplingOptionsProvider
+    + StopConditionsProvider
+    + TokenLimitsProvider
+    + LogProbsProvider
+    + SGLangExtensionsProvider
+    + CompletionCountProvider
+{
+    /// Perform comprehensive validation of the entire request
+    fn validate(&self) -> Result<(), ValidationError> {
+        // Use the common validation function
+        utils::validate_common_request_params(self)
+    }
+}
+
+// ==================================================================
+// =            OPENAI CHAT COMPLETION VALIDATION                   =
+// ==================================================================
+
+impl SamplingOptionsProvider for ChatCompletionRequest {
+    fn get_temperature(&self) -> Option<f32> {
+        self.temperature
+    }
+    fn get_top_p(&self) -> Option<f32> {
+        self.top_p
+    }
+    fn get_frequency_penalty(&self) -> Option<f32> {
+        self.frequency_penalty
+    }
+    fn get_presence_penalty(&self) -> Option<f32> {
+        self.presence_penalty
+    }
+}
+
+impl StopConditionsProvider for ChatCompletionRequest {
+    fn get_stop_sequences(&self) -> Option<&StringOrArray> {
+        self.stop.as_ref()
+    }
+}
+
+impl TokenLimitsProvider for ChatCompletionRequest {
+    fn get_max_tokens(&self) -> Option<u32> {
+        // Prefer max_completion_tokens over max_tokens if both are set
+        self.max_completion_tokens.or(self.max_tokens)
+    }
+
+    fn get_min_tokens(&self) -> Option<u32> {
+        self.min_tokens
+    }
+}
+
+impl LogProbsProvider for ChatCompletionRequest {
+    fn get_logprobs(&self) -> Option<u32> {
+        // For chat API, logprobs is a boolean, return 1 if true for validation purposes
+        if self.logprobs {
+            Some(1)
+        } else {
+            None
+        }
+    }
+
+    fn get_top_logprobs(&self) -> Option<u32> {
+        self.top_logprobs
+    }
+}
+
+impl SGLangExtensionsProvider for ChatCompletionRequest {
+    fn get_top_k(&self) -> Option<i32> {
+        self.top_k
+    }
+
+    fn get_min_p(&self) -> Option<f32> {
+        self.min_p
+    }
+
+    fn get_repetition_penalty(&self) -> Option<f32> {
+        self.repetition_penalty
+    }
+}
+
+impl CompletionCountProvider for ChatCompletionRequest {
+    fn get_n(&self) -> Option<u32> {
+        self.n
+    }
+}
+
+impl ChatCompletionRequest {
+    /// Validate message-specific requirements
+    pub fn validate_messages(&self) -> Result<(), ValidationError> {
+        // Ensure messages array is not empty
+        utils::validate_non_empty_array(&self.messages, "messages")?;
+
+        // Validate message content is not empty
+        for (i, msg) in self.messages.iter().enumerate() {
+            if let ChatMessage::User { content, .. } = msg {
+                match content {
+                    UserMessageContent::Text(text) if text.is_empty() => {
+                        return Err(ValidationError::InvalidValue {
+                            parameter: format!("messages[{}].content", i),
+                            value: "empty".to_string(),
+                            reason: "message content cannot be empty".to_string(),
+                        });
+                    }
+                    UserMessageContent::Parts(parts) if parts.is_empty() => {
+                        return Err(ValidationError::InvalidValue {
+                            parameter: format!("messages[{}].content", i),
+                            value: "empty array".to_string(),
+                            reason: "message content parts cannot be empty".to_string(),
+                        });
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Validate response format if specified
+    pub fn validate_response_format(&self) -> Result<(), ValidationError> {
+        if let Some(ResponseFormat::JsonSchema { json_schema }) = &self.response_format {
+            if json_schema.name.is_empty() {
+                return Err(ValidationError::InvalidValue {
+                    parameter: "response_format.json_schema.name".to_string(),
+                    value: "empty".to_string(),
+                    reason: "JSON schema name cannot be empty".to_string(),
+                });
+            }
+        }
+        Ok(())
+    }
+
+    /// Validate chat API specific logprobs requirements
+    pub fn validate_chat_logprobs(&self) -> Result<(), ValidationError> {
+        // In chat API, if logprobs=true, top_logprobs must be specified
+        if self.logprobs && self.top_logprobs.is_none() {
+            return Err(ValidationError::MissingRequired {
+                parameter: "top_logprobs".to_string(),
+            });
+        }
+
+        // If top_logprobs is specified, logprobs should be true
+        if self.top_logprobs.is_some() && !self.logprobs {
+            return Err(ValidationError::InvalidValue {
+                parameter: "logprobs".to_string(),
+                value: "false".to_string(),
+                reason: "must be true when top_logprobs is specified".to_string(),
+            });
+        }
+
+        Ok(())
+    }
+
+    /// Validate cross-parameter relationships specific to chat completions
+    pub fn validate_chat_cross_parameters(&self) -> Result<(), ValidationError> {
+        // Validate that both max_tokens and max_completion_tokens aren't set
+        utils::validate_conflicting_parameters(
+            "max_tokens",
+            self.max_tokens.is_some(),
+            "max_completion_tokens",
+            self.max_completion_tokens.is_some(),
+            "cannot specify both max_tokens and max_completion_tokens",
+        )?;
+
+        // Validate that tools and functions aren't both specified (deprecated)
+        utils::validate_conflicting_parameters(
+            "tools",
+            self.tools.is_some(),
+            "functions",
+            self.functions.is_some(),
+            "functions is deprecated, use tools instead",
+        )?;
+
+        // Validate structured output constraints don't conflict with JSON response format
+        let has_json_format = matches!(
+            self.response_format,
+            Some(ResponseFormat::JsonObject | ResponseFormat::JsonSchema { .. })
+        );
+
+        utils::validate_conflicting_parameters(
+            "response_format",
+            has_json_format,
+            "regex",
+            self.regex.is_some(),
+            "cannot use regex constraint with JSON response format",
+        )?;
+
+        utils::validate_conflicting_parameters(
+            "response_format",
+            has_json_format,
+            "ebnf",
+            self.ebnf.is_some(),
+            "cannot use EBNF constraint with JSON response format",
+        )?;
+
+        // Only one structured output constraint should be active
+        let structured_constraints = [
+            ("regex", self.regex.is_some()),
+            ("ebnf", self.ebnf.is_some()),
+            (
+                "json_schema",
+                matches!(
+                    self.response_format,
+                    Some(ResponseFormat::JsonSchema { .. })
+                ),
+            ),
+        ];
+
+        utils::validate_mutually_exclusive_options(
+            &structured_constraints,
+            "Only one structured output constraint (regex, ebnf, or json_schema) can be active at a time",
+        )?;
+
+        Ok(())
+    }
+}
+
+impl ValidatableRequest for ChatCompletionRequest {
+    fn validate(&self) -> Result<(), ValidationError> {
+        // Call the common validation function from the validation module
+        utils::validate_common_request_params(self)?;
+
+        // Then validate chat-specific parameters
+        self.validate_messages()?;
+        self.validate_response_format()?;
+        self.validate_chat_logprobs()?;
+        self.validate_chat_cross_parameters()?;
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::constants::*;
+    use super::utils::*;
+    use super::*;
+    use crate::protocols::spec::StringOrArray;
+
+    // Mock request type for testing validation traits
+    #[derive(Debug, Default)]
+    struct MockRequest {
+        temperature: Option<f32>,
+        stop: Option<StringOrArray>,
+        max_tokens: Option<u32>,
+        min_tokens: Option<u32>,
+    }
+
+    impl SamplingOptionsProvider for MockRequest {
+        fn get_temperature(&self) -> Option<f32> {
+            self.temperature
+        }
+        fn get_top_p(&self) -> Option<f32> {
+            None
+        }
+        fn get_frequency_penalty(&self) -> Option<f32> {
+            None
+        }
+        fn get_presence_penalty(&self) -> Option<f32> {
+            None
+        }
+    }
+
+    impl StopConditionsProvider for MockRequest {
+        fn get_stop_sequences(&self) -> Option<&StringOrArray> {
+            self.stop.as_ref()
+        }
+    }
+
+    impl TokenLimitsProvider for MockRequest {
+        fn get_max_tokens(&self) -> Option<u32> {
+            self.max_tokens
+        }
+        fn get_min_tokens(&self) -> Option<u32> {
+            self.min_tokens
+        }
+    }
+
+    impl LogProbsProvider for MockRequest {
+        fn get_logprobs(&self) -> Option<u32> {
+            None
+        }
+        fn get_top_logprobs(&self) -> Option<u32> {
+            None
+        }
+    }
+
+    impl SGLangExtensionsProvider for MockRequest {}
+    impl CompletionCountProvider for MockRequest {}
+    impl ValidatableRequest for MockRequest {}
+
+    #[test]
+    fn test_range_validation() {
+        // Valid range
+        assert!(validate_range(1.5f32, &TEMPERATURE_RANGE, "temperature").is_ok());
+        // Invalid range
+        assert!(validate_range(-0.1f32, &TEMPERATURE_RANGE, "temperature").is_err());
+        assert!(validate_range(3.0f32, &TEMPERATURE_RANGE, "temperature").is_err());
+    }
+
+    #[test]
+    fn test_sglang_top_k_validation() {
+        assert!(validate_top_k(-1).is_ok()); // Disabled
+        assert!(validate_top_k(50).is_ok()); // Valid positive
+        assert!(validate_top_k(0).is_err()); // Invalid
+        assert!(validate_top_k(-5).is_err()); // Invalid
+    }
+
+    #[test]
+    fn test_stop_sequences_limits() {
+        let request = MockRequest {
+            stop: Some(StringOrArray::Array(vec![
+                "stop1".to_string(),
+                "stop2".to_string(),
+                "stop3".to_string(),
+                "stop4".to_string(),
+                "stop5".to_string(), // Too many
+            ])),
+            ..Default::default()
+        };
+        assert!(request.validate().is_err());
+    }
+
+    #[test]
+    fn test_token_limits_conflict() {
+        let request = MockRequest {
+            min_tokens: Some(100),
+            max_tokens: Some(50), // min > max
+            ..Default::default()
+        };
+        assert!(request.validate().is_err());
+    }
+
+    #[test]
+    fn test_valid_request() {
+        let request = MockRequest {
+            temperature: Some(1.0),
+            stop: Some(StringOrArray::Array(vec!["stop".to_string()])),
+            max_tokens: Some(100),
+            min_tokens: Some(10),
+        };
+        assert!(request.validate().is_ok());
+    }
+
+    // Chat completion specific tests
+    #[cfg(test)]
+    mod chat_tests {
+        use super::*;
+
+        fn create_valid_chat_request() -> ChatCompletionRequest {
+            ChatCompletionRequest {
+                model: "gpt-4".to_string(),
+                messages: vec![ChatMessage::User {
+                    role: "user".to_string(),
+                    content: UserMessageContent::Text("Hello".to_string()),
+                    name: None,
+                }],
+                temperature: Some(1.0),
+                top_p: Some(0.9),
+                n: Some(1),
+                stream: false,
+                stream_options: None,
+                stop: None,
+                max_tokens: Some(100),
+                max_completion_tokens: None,
+                presence_penalty: Some(0.0),
+                frequency_penalty: Some(0.0),
+                logit_bias: None,
+                user: None,
+                seed: None,
+                logprobs: false,
+                top_logprobs: None,
+                response_format: None,
+                tools: None,
+                tool_choice: None,
+                parallel_tool_calls: None,
+                functions: None,
+                function_call: None,
+                // SGLang extensions
+                top_k: None,
+                min_p: None,
+                min_tokens: None,
+                repetition_penalty: None,
+                regex: None,
+                ebnf: None,
+                stop_token_ids: None,
+                no_stop_trim: false,
+                ignore_eos: false,
+                continue_final_message: false,
+                skip_special_tokens: true,
+                lora_path: None,
+                session_params: None,
+                separate_reasoning: true,
+                stream_reasoning: true,
+                chat_template_kwargs: None,
+                return_hidden_states: false,
+            }
+        }
+
+        #[test]
+        fn test_chat_validation_basics() {
+            // Valid request
+            assert!(create_valid_chat_request().validate().is_ok());
+
+            // Empty messages
+            let mut request = create_valid_chat_request();
+            request.messages = vec![];
+            assert!(request.validate().is_err());
+
+            // Invalid temperature
+            let mut request = create_valid_chat_request();
+            request.temperature = Some(3.0);
+            assert!(request.validate().is_err());
+        }
+
+        #[test]
+        fn test_chat_conflicts() {
+            let mut request = create_valid_chat_request();
+
+            // Conflicting max_tokens
+            request.max_tokens = Some(100);
+            request.max_completion_tokens = Some(200);
+            assert!(request.validate().is_err());
+
+            // Logprobs without top_logprobs
+            request.max_tokens = None;
+            request.logprobs = true;
+            request.top_logprobs = None;
+            assert!(request.validate().is_err());
+        }
+
+        #[test]
+        fn test_sglang_extensions() {
+            let mut request = create_valid_chat_request();
+
+            // Valid SGLang parameters
+            request.top_k = Some(-1);
+            request.min_p = Some(0.1);
+            request.repetition_penalty = Some(1.2);
+            assert!(request.validate().is_ok());
+
+            // Invalid parameters
+            request.top_k = Some(0); // Invalid
+            assert!(request.validate().is_err());
+        }
+
+        #[test]
+        fn test_parameter_ranges() {
+            let mut request = create_valid_chat_request();
+
+            // Test temperature range (0.0 to 2.0)
+            request.temperature = Some(1.5);
+            assert!(request.validate().is_ok());
+            request.temperature = Some(-0.1);
+            assert!(request.validate().is_err());
+            request.temperature = Some(3.0);
+            assert!(request.validate().is_err());
+
+            // Test top_p range (0.0 to 1.0)
+            request.temperature = Some(1.0); // Reset
+            request.top_p = Some(0.9);
+            assert!(request.validate().is_ok());
+            request.top_p = Some(-0.1);
+            assert!(request.validate().is_err());
+            request.top_p = Some(1.5);
+            assert!(request.validate().is_err());
+
+            // Test frequency_penalty range (-2.0 to 2.0)
+            request.top_p = Some(0.9); // Reset
+            request.frequency_penalty = Some(1.5);
+            assert!(request.validate().is_ok());
+            request.frequency_penalty = Some(-2.5);
+            assert!(request.validate().is_err());
+            request.frequency_penalty = Some(3.0);
+            assert!(request.validate().is_err());
+
+            // Test presence_penalty range (-2.0 to 2.0)
+            request.frequency_penalty = Some(0.0); // Reset
+            request.presence_penalty = Some(-1.5);
+            assert!(request.validate().is_ok());
+            request.presence_penalty = Some(-3.0);
+            assert!(request.validate().is_err());
+            request.presence_penalty = Some(2.5);
+            assert!(request.validate().is_err());
+
+            // Test repetition_penalty range (0.0 to 2.0)
+            request.presence_penalty = Some(0.0); // Reset
+            request.repetition_penalty = Some(1.2);
+            assert!(request.validate().is_ok());
+            request.repetition_penalty = Some(-0.1);
+            assert!(request.validate().is_err());
+            request.repetition_penalty = Some(2.1);
+            assert!(request.validate().is_err());
+
+            // Test min_p range (0.0 to 1.0)
+            request.repetition_penalty = Some(1.0); // Reset
+            request.min_p = Some(0.5);
+            assert!(request.validate().is_ok());
+            request.min_p = Some(-0.1);
+            assert!(request.validate().is_err());
+            request.min_p = Some(1.5);
+            assert!(request.validate().is_err());
+        }
+
+        #[test]
+        fn test_structured_output_conflicts() {
+            let mut request = create_valid_chat_request();
+
+            // JSON response format with regex should conflict
+            request.response_format = Some(ResponseFormat::JsonObject);
+            request.regex = Some(".*".to_string());
+            assert!(request.validate().is_err());
+
+            // JSON response format with EBNF should conflict
+            request.regex = None;
+            request.ebnf = Some("grammar".to_string());
+            assert!(request.validate().is_err());
+
+            // Multiple structured constraints should conflict
+            request.response_format = None;
+            request.regex = Some(".*".to_string());
+            request.ebnf = Some("grammar".to_string());
+            assert!(request.validate().is_err());
+
+            // Only one constraint should work
+            request.ebnf = None;
+            request.regex = Some(".*".to_string());
+            assert!(request.validate().is_ok());
+
+            request.regex = None;
+            request.ebnf = Some("grammar".to_string());
+            assert!(request.validate().is_ok());
+
+            request.ebnf = None;
+            request.response_format = Some(ResponseFormat::JsonObject);
+            assert!(request.validate().is_ok());
+        }
+
+        #[test]
+        fn test_stop_sequences_validation() {
+            let mut request = create_valid_chat_request();
+
+            // Valid stop sequences
+            request.stop = Some(StringOrArray::Array(vec![
+                "stop1".to_string(),
+                "stop2".to_string(),
+            ]));
+            assert!(request.validate().is_ok());
+
+            // Too many stop sequences (max 4)
+            request.stop = Some(StringOrArray::Array(vec![
+                "stop1".to_string(),
+                "stop2".to_string(),
+                "stop3".to_string(),
+                "stop4".to_string(),
+                "stop5".to_string(),
+            ]));
+            assert!(request.validate().is_err());
+
+            // Empty stop sequence should fail
+            request.stop = Some(StringOrArray::String("".to_string()));
+            assert!(request.validate().is_err());
+
+            // Empty string in array should fail
+            request.stop = Some(StringOrArray::Array(vec![
+                "stop1".to_string(),
+                "".to_string(),
+            ]));
+            assert!(request.validate().is_err());
+        }
+
+        #[test]
+        fn test_logprobs_validation() {
+            let mut request = create_valid_chat_request();
+
+            // Valid logprobs configuration
+            request.logprobs = true;
+            request.top_logprobs = Some(10);
+            assert!(request.validate().is_ok());
+
+            // logprobs=true without top_logprobs should fail
+            request.top_logprobs = None;
+            assert!(request.validate().is_err());
+
+            // top_logprobs without logprobs=true should fail
+            request.logprobs = false;
+            request.top_logprobs = Some(10);
+            assert!(request.validate().is_err());
+
+            // top_logprobs out of range (0-20)
+            request.logprobs = true;
+            request.top_logprobs = Some(25);
+            assert!(request.validate().is_err());
+        }
+
+        #[test]
+        fn test_n_parameter_validation() {
+            let mut request = create_valid_chat_request();
+
+            // Valid n values (1-10)
+            request.n = Some(1);
+            assert!(request.validate().is_ok());
+            request.n = Some(5);
+            assert!(request.validate().is_ok());
+            request.n = Some(10);
+            assert!(request.validate().is_ok());
+
+            // Invalid n values
+            request.n = Some(0);
+            assert!(request.validate().is_err());
+            request.n = Some(15);
+            assert!(request.validate().is_err());
+        }
+
+        #[test]
+        fn test_min_max_tokens_validation() {
+            let mut request = create_valid_chat_request();
+
+            // Valid token limits
+            request.min_tokens = Some(10);
+            request.max_tokens = Some(100);
+            assert!(request.validate().is_ok());
+
+            // min_tokens > max_tokens should fail
+            request.min_tokens = Some(150);
+            request.max_tokens = Some(100);
+            assert!(request.validate().is_err());
+
+            // Should work with max_completion_tokens instead
+            request.max_tokens = None;
+            request.max_completion_tokens = Some(200);
+            request.min_tokens = Some(50);
+            assert!(request.validate().is_ok());
+
+            // min_tokens > max_completion_tokens should fail
+            request.min_tokens = Some(250);
+            assert!(request.validate().is_err());
+        }
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/README.md b/sgl-router/src/reasoning_parser/README.md
new file mode 100644
index 000000000..92a6ffce7
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/README.md
@@ -0,0 +1,474 @@
+# Reasoning Parser Architecture
+
+## 1. Executive Summary
+
+### High-Level Overview
+
+The reasoning parser layer provides a unified interface for detecting and extracting reasoning content from Large Language Model (LLM) outputs, particularly from models that support Chain-of-Thought (CoT) reasoning with explicit thinking blocks. The architecture follows a trait-based design pattern enabling pluggable parser implementations while maintaining consistent APIs across different model families that use various reasoning token formats.
+
+**Key Components:**
+- **Factory Pattern**: Registry-based creation and pooling of model-specific parsers
+- **Trait System**: `ReasoningParser` trait for implementation flexibility
+- **Parser Pooling**: Efficient reuse of parser instances across concurrent requests
+- **Streaming Support**: Incremental parsing with partial token buffering
+- **Model Detection**: Pattern-based matching for automatic parser selection
+- **State Management**: Stateful parsing for streaming scenarios with buffer management
+- **Thread Safety**: Arc<Mutex> based sharing for high-concurrency environments
+- **Extensibility**: Easy addition of new model-specific parsers
+
+**Data Flow:**
+1. Request → Factory (model detection) → Pooled Parser Retrieval
+2. One-Shot: Text → Parser → ParserResult (normal + reasoning text)
+3. Streaming: Chunks → Parser (stateful) → Incremental ParserResult
+4. Buffer Management: Partial Tokens → Buffer → Complete Token Detection
+5. Reset: Parser State → Clear Buffers → Ready for Reuse
+
+### Architecture Highlights
+
+- **Model-Specific Parsers**: DeepSeek-R1, Qwen3, Kimi, GLM45, Step3 variants
+- **Parser Pooling**: Singleton instances per model type for memory efficiency
+- **High Concurrency**: Mutex-protected parsers handle 1000+ req/sec
+- **Buffer Overflow Protection**: Configurable max buffer size (default 64KB)
+- **Partial Token Detection**: Intelligent buffering for incomplete delimiters
+- **Passthrough Mode**: Graceful fallback for unknown models
+- **Zero-Copy Where Possible**: Efficient string handling in hot paths
+
+## 2. Mermaid Diagrams
+
+### Component Flow Diagram
+
+```mermaid
+graph TB
+    subgraph Input
+        R[Request] --> MID[Model ID]
+    end
+
+    subgraph Factory Layer
+        MID --> PF[ParserFactory]
+        PF --> REG[ParserRegistry]
+        REG --> PM[Pattern Matching]
+        PM --> PP[Parser Pool]
+    end
+
+    subgraph Parser Pool
+        PP --> DS[DeepSeek-R1]
+        PP --> QW[Qwen3]
+        PP --> QWT[Qwen3-Thinking]
+        PP --> KM[Kimi]
+        PP --> GL[GLM45]
+        PP --> S3[Step3]
+        PP --> PT[Passthrough]
+    end
+
+    subgraph Parser Instance
+        DS --> BP[BaseReasoningParser]
+        QW --> BP
+        KM --> BP
+        GL --> BP
+        S3 --> BP
+    end
+
+    subgraph Processing
+        BP --> DAP[detect_and_parse]
+        BP --> PSI[parse_streaming]
+        BP --> RST[reset]
+    end
+
+    subgraph State Management
+        BP --> BUF[Buffer]
+        BP --> IR[in_reasoning flag]
+        BP --> STS[stripped_think_start]
+    end
+
+    subgraph Output
+        DAP --> PR[ParserResult]
+        PSI --> PR
+        PR --> NT[normal_text]
+        PR --> RT[reasoning_text]
+    end
+```
+
+### Sequence Flow Diagram
+
+```mermaid
+sequenceDiagram
+    participant C as Client
+    participant F as ParserFactory
+    participant R as Registry
+    participant P as Parser Pool
+    participant BP as BaseParser
+    participant PR as ParserResult
+
+    C->>F: get_pooled("deepseek-r1-model")
+    F->>R: find_pooled_parser_for_model()
+    R->>R: pattern_match("deepseek-r1")
+    R->>P: get_pooled_parser("deepseek_r1")
+
+    alt Parser exists in pool
+        P-->>F: Arc<Mutex<Parser>>
+    else Create new parser
+        P->>BP: new DeepSeekR1Parser()
+        P->>P: insert into pool
+        P-->>F: Arc<Mutex<Parser>>
+    end
+
+    F-->>C: PooledParser
+
+    C->>BP: lock().parse_reasoning_streaming_incremental()
+    loop streaming chunks
+        C->>BP: parse_reasoning_streaming_incremental(chunk)
+        BP->>BP: buffer.push_str(chunk)
+        BP->>BP: check partial tokens
+
+        alt Complete token found
+            BP->>PR: create result
+            BP->>BP: clear buffer
+            BP-->>C: ParserResult
+        else Partial token
+            BP->>BP: keep buffering
+            BP-->>C: ParserResult::default()
+        end
+    end
+
+    C->>BP: reset()
+    BP->>BP: clear buffers & flags
+    C->>BP: unlock()
+```
+
+### Class/Type Diagram
+
+```mermaid
+classDiagram
+    class ReasoningParser {
+        <<trait>>
+        +detect_and_parse_reasoning(&mut self, text: &str) Result~ParserResult~
+        +parse_reasoning_streaming_incremental(&mut self, text: &str) Result~ParserResult~
+        +reset(&mut self)
+        +model_type(&self) &str
+    }
+
+    class ParserResult {
+        +normal_text: String
+        +reasoning_text: String
+        +new(normal: String, reasoning: String) Self
+        +normal(text: String) Self
+        +reasoning(text: String) Self
+        +is_empty() bool
+    }
+
+    class ParserConfig {
+        +think_start_token: String
+        +think_end_token: String
+        +stream_reasoning: bool
+        +max_buffer_size: usize
+        +initial_in_reasoning: bool
+        +default() Self
+    }
+
+    class BaseReasoningParser {
+        -config: ParserConfig
+        -in_reasoning: bool
+        -buffer: String
+        -stripped_think_start: bool
+        -model_type: String
+        +new(config: ParserConfig) Self
+        +with_model_type(model: String) Self
+        -is_partial_token(&self, text: &str) bool
+    }
+
+    class DeepSeekR1Parser {
+        -base: BaseReasoningParser
+        +new() Self
+    }
+
+    class Qwen3Parser {
+        -base: BaseReasoningParser
+        +new() Self
+    }
+
+    class QwenThinkingParser {
+        -base: BaseReasoningParser
+        +new() Self
+    }
+
+    class KimiParser {
+        -base: BaseReasoningParser
+        +new() Self
+    }
+
+    class Glm45Parser {
+        -base: BaseReasoningParser
+        +new() Self
+    }
+
+    class Step3Parser {
+        -base: BaseReasoningParser
+        +new() Self
+    }
+
+    class ParserFactory {
+        -registry: ParserRegistry
+        +new() Self
+        +get_pooled(model_id: &str) PooledParser
+        +create(model_id: &str) Result~Box~dyn ReasoningParser~~
+        +clear_pool()
+    }
+
+    class ParserRegistry {
+        -creators: Arc~RwLock~HashMap~~
+        -pool: Arc~RwLock~HashMap~~
+        -patterns: Arc~RwLock~Vec~~
+        +register_parser(name: &str, creator: F)
+        +register_pattern(pattern: &str, parser_name: &str)
+        +get_pooled_parser(name: &str) Option~PooledParser~
+        +find_pooled_parser_for_model(model: &str) Option~PooledParser~
+    }
+
+    ReasoningParser <|.. BaseReasoningParser
+    ReasoningParser <|.. DeepSeekR1Parser
+    ReasoningParser <|.. Qwen3Parser
+    ReasoningParser <|.. QwenThinkingParser
+    ReasoningParser <|.. KimiParser
+    ReasoningParser <|.. Glm45Parser
+    ReasoningParser <|.. Step3Parser
+
+    DeepSeekR1Parser o-- BaseReasoningParser
+    Qwen3Parser o-- BaseReasoningParser
+    QwenThinkingParser o-- BaseReasoningParser
+    KimiParser o-- BaseReasoningParser
+    Glm45Parser o-- BaseReasoningParser
+    Step3Parser o-- BaseReasoningParser
+
+    BaseReasoningParser o-- ParserConfig
+    ParserFactory o-- ParserRegistry
+    ParserRegistry o-- ReasoningParser
+```
+
+## 3. Module-by-Module Deep Dive
+
+### 3.1 mod.rs (Main Module)
+
+**Key Responsibilities:**
+- Module organization and public API surface
+- Re-exports for convenient access to core types
+- Separation of concerns across submodules
+
+**Module Structure:**
+- `factory`: Parser creation and pooling logic
+- `parsers`: Concrete parser implementations
+- `traits`: Core trait definitions and types
+
+### 3.2 traits.rs (Trait Definitions)
+
+**ParserResult Methods**:
+- `new()`: Create with both normal and reasoning text
+- `normal()`: Create with only normal text (convenience)
+- `reasoning()`: Create with only reasoning text (convenience)
+- `is_empty()`: Check if result contains any text
+
+**ReasoningParser Trait**:
+- **`detect_and_parse_reasoning`**: One-shot parsing for complete text
+- **`parse_reasoning_streaming_incremental`**: Stateful streaming parser
+- **`reset`**: Clear state for parser reuse
+- **`model_type`**: Identify parser variant for debugging
+
+**ParserConfig Defaults**:
+- Default tokens: `<think>` and `</think>`
+- Stream reasoning: true (immediate output)
+- Max buffer: 65536 bytes (64KB)
+- Initial state: false (explicit reasoning blocks)
+
+### 3.3 factory.rs (Parser Creation & Pooling)
+
+**ParserRegistry Methods**:
+
+1. **`register_parser`**:
+   - Register creator function for parser type
+   - Lazy instantiation when requested
+   - Thread-safe registration
+
+2. **`register_pattern`**:
+   - Map model ID patterns to parser names
+   - First-match-wins ordering
+   - Case-insensitive matching
+
+3. **`get_pooled_parser`**:
+   - Check pool for existing instance
+   - Create and pool if not present
+   - Return Arc<Mutex> for sharing
+
+4. **`find_pooled_parser_for_model`**:
+   - Pattern match against model ID
+   - Delegate to get_pooled_parser
+   - Case-insensitive comparison
+
+**ParserFactory Methods**:
+
+1. **`new()`**:
+   - Register all built-in parsers
+   - Setup model pattern mappings
+   - Initialize empty pool
+
+2. **`get_pooled`**:
+   - Primary API for getting parsers
+   - Automatic passthrough fallback
+   - Guaranteed non-null return
+
+3. **`create`**:
+   - Create fresh parser instance
+   - No pooling (for testing/isolation)
+   - Returns Result for error handling
+
+**Registered Parsers**:
+- `base`: Generic configurable parser
+- `deepseek_r1`: DeepSeek-R1 (initial_in_reasoning=true)
+- `qwen3`: Qwen3 base model (initial_in_reasoning=false)
+- `qwen3_thinking`: Qwen3 thinking variant (initial_in_reasoning=true)
+- `kimi`: Kimi with Unicode tokens
+- `glm45`: GLM-4.5 parser
+- `step3`: Step3 parser
+- `passthrough`: No-op fallback parser
+
+**Model Pattern Mappings**:
+```
+"deepseek-r1" → "deepseek_r1"
+"qwen3-thinking" → "qwen3_thinking"
+"qwen-thinking" → "qwen3_thinking"
+"qwen3" → "qwen3"
+"qwen" → "qwen3"
+"glm45" → "glm45"
+"kimi" → "kimi"
+"step3" → "step3"
+```
+
+### 3.4 parsers/base.rs (Base Implementation)
+
+**Key Methods:**
+
+**`detect_and_parse_reasoning`**:
+```
+Algorithm:
+1. Check buffer overflow protection
+2. Detect reasoning presence (in_reasoning OR contains start_token)
+3. If no reasoning → return as normal text
+4. Remove start token and trim
+5. If no end token → assume truncated reasoning
+6. Split on end token
+7. Extract reasoning and normal portions
+```
+
+**`parse_reasoning_streaming_incremental`**:
+```
+Algorithm:
+1. Check buffer capacity
+2. Append text to buffer
+3. Check if buffer is partial token prefix
+4. If partial → buffer and return empty
+5. Strip start token if present
+6. Find end token position
+7. Handle based on state:
+   - In reasoning + end found → split and return both
+   - In reasoning + streaming → return accumulated reasoning
+   - Not in reasoning → return as normal text
+   - In reasoning + no end → continue buffering
+```
+
+**Critical Features:**
+
+1. **Partial Token Detection**:
+   - Prevents premature token matching
+   - Buffers incomplete delimiters
+   - Essential for streaming correctness
+
+2. **Buffer Management**:
+   - Overflow protection
+   - Accumulation for partial content
+   - Clear on complete token detection
+
+3. **State Tracking**:
+   - `in_reasoning`: Current parsing state
+   - `stripped_think_start`: Prevent double processing
+   - `buffer`: Accumulated partial content
+
+
+## 4. Extensibility Guide
+
+### Adding a New Parser
+
+**Step 1: Create Parser Implementation**
+
+```rust
+// src/reasoning_parser/parsers/mymodel.rs
+use crate::reasoning_parser::parsers::BaseReasoningParser;
+use crate::reasoning_parser::traits::{ParserConfig, ReasoningParser};
+
+pub struct MyModelParser {
+    base: BaseReasoningParser,
+}
+
+impl MyModelParser {
+    pub fn new() -> Self {
+        let config = ParserConfig {
+            think_start_token: "<reasoning>".to_string(),
+            think_end_token: "</reasoning>".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: false, // or true for implicit
+        };
+
+        Self {
+            base: BaseReasoningParser::new(config)
+                .with_model_type("mymodel".to_string()),
+        }
+    }
+}
+
+impl ReasoningParser for MyModelParser {
+    // Delegate to base or implement custom logic
+    fn detect_and_parse_reasoning(&mut self, text: &str)
+        -> Result<ParserResult, ParseError> {
+        self.base.detect_and_parse_reasoning(text)
+    }
+
+    // ... other trait methods
+}
+```
+
+**Step 2: Register in Factory**
+
+```rust
+// In factory.rs ParserFactory::new()
+registry.register_parser("mymodel", || {
+    Box::new(MyModelParser::new())
+});
+
+// Register patterns
+registry.register_pattern("my-model", "mymodel");
+registry.register_pattern("mymodel", "mymodel");
+```
+
+**Step 3: Export from Module**
+
+```rust
+// In parsers/mod.rs
+pub use self::mymodel::MyModelParser;
+
+// In reasoning_parser/mod.rs
+pub use parsers::MyModelParser;
+```
+
+### Custom Parsing Logic
+
+For parsers requiring custom logic beyond configuration:
+
+```rust
+impl ReasoningParser for CustomParser {
+    fn parse_reasoning_streaming_incremental(&mut self, text: &str)
+        -> Result<ParserResult, ParseError> {
+        // Custom state machine
+        // Custom token detection
+        // Custom buffering strategy
+        // Return appropriate ParserResult
+    }
+}
+```
diff --git a/sgl-router/src/reasoning_parser/factory.rs b/sgl-router/src/reasoning_parser/factory.rs
new file mode 100644
index 000000000..771f0e856
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/factory.rs
@@ -0,0 +1,566 @@
+// Factory and registry for creating model-specific reasoning parsers.
+// Now with parser pooling support for efficient reuse across requests.
+
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex, RwLock};
+
+use crate::reasoning_parser::parsers::{
+    BaseReasoningParser, DeepSeekR1Parser, Glm45Parser, KimiParser, Qwen3Parser,
+    QwenThinkingParser, Step3Parser,
+};
+use crate::reasoning_parser::traits::{ParseError, ParserConfig, ReasoningParser};
+
+/// Type alias for pooled parser instances.
+pub type PooledParser = Arc<Mutex<Box<dyn ReasoningParser>>>;
+
+/// Type alias for parser creator functions.
+type ParserCreator = Arc<dyn Fn() -> Box<dyn ReasoningParser> + Send + Sync>;
+
+/// Registry for model-specific parsers with pooling support.
+#[derive(Clone)]
+pub struct ParserRegistry {
+    /// Creator functions for parsers (used when pool is empty)
+    creators: Arc<RwLock<HashMap<String, ParserCreator>>>,
+    /// Pooled parser instances for reuse
+    pool: Arc<RwLock<HashMap<String, PooledParser>>>,
+    /// Model pattern to parser name mappings
+    patterns: Arc<RwLock<Vec<(String, String)>>>, // (pattern, parser_name)
+}
+
+impl ParserRegistry {
+    /// Create a new empty registry.
+    pub fn new() -> Self {
+        Self {
+            creators: Arc::new(RwLock::new(HashMap::new())),
+            pool: Arc::new(RwLock::new(HashMap::new())),
+            patterns: Arc::new(RwLock::new(Vec::new())),
+        }
+    }
+
+    /// Register a parser creator for a given parser type.
+    pub fn register_parser<F>(&self, name: &str, creator: F)
+    where
+        F: Fn() -> Box<dyn ReasoningParser> + Send + Sync + 'static,
+    {
+        let mut creators = self.creators.write().unwrap();
+        creators.insert(name.to_string(), Arc::new(creator));
+    }
+
+    /// Register a model pattern to parser mapping.
+    /// Patterns are checked in order, first match wins.
+    pub fn register_pattern(&self, pattern: &str, parser_name: &str) {
+        let mut patterns = self.patterns.write().unwrap();
+        patterns.push((pattern.to_string(), parser_name.to_string()));
+    }
+
+    /// Get a pooled parser by exact name.
+    /// Returns a shared parser instance from the pool, creating one if needed.
+    pub fn get_pooled_parser(&self, name: &str) -> Option<PooledParser> {
+        // First check if we have a pooled instance
+        {
+            let pool = self.pool.read().unwrap();
+            if let Some(parser) = pool.get(name) {
+                return Some(Arc::clone(parser));
+            }
+        }
+
+        // If not in pool, create one and add to pool
+        let creators = self.creators.read().unwrap();
+        if let Some(creator) = creators.get(name) {
+            let parser = Arc::new(Mutex::new(creator()));
+
+            // Add to pool for future use
+            let mut pool = self.pool.write().unwrap();
+            pool.insert(name.to_string(), Arc::clone(&parser));
+
+            Some(parser)
+        } else {
+            None
+        }
+    }
+
+    /// Get a parser by exact name (creates new instance, not pooled).
+    /// Use this for compatibility or when you need a fresh instance.
+    pub fn get_parser(&self, name: &str) -> Option<Box<dyn ReasoningParser>> {
+        let creators = self.creators.read().unwrap();
+        creators.get(name).map(|creator| creator())
+    }
+
+    /// Find a pooled parser for a given model ID by pattern matching.
+    pub fn find_pooled_parser_for_model(&self, model_id: &str) -> Option<PooledParser> {
+        let patterns = self.patterns.read().unwrap();
+        let model_lower = model_id.to_lowercase();
+
+        for (pattern, parser_name) in patterns.iter() {
+            if model_lower.contains(&pattern.to_lowercase()) {
+                return self.get_pooled_parser(parser_name);
+            }
+        }
+        None
+    }
+
+    /// Find a parser for a given model ID by pattern matching (creates new instance).
+    pub fn find_parser_for_model(&self, model_id: &str) -> Option<Box<dyn ReasoningParser>> {
+        let patterns = self.patterns.read().unwrap();
+        let model_lower = model_id.to_lowercase();
+
+        for (pattern, parser_name) in patterns.iter() {
+            if model_lower.contains(&pattern.to_lowercase()) {
+                return self.get_parser(parser_name);
+            }
+        }
+        None
+    }
+
+    /// Clear the parser pool, forcing new instances to be created.
+    /// Useful for testing or when parsers need to be reset globally.
+    pub fn clear_pool(&self) {
+        let mut pool = self.pool.write().unwrap();
+        pool.clear();
+    }
+}
+
+impl Default for ParserRegistry {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Factory for creating reasoning parsers based on model type.
+#[derive(Clone)]
+pub struct ParserFactory {
+    registry: ParserRegistry,
+}
+
+impl ParserFactory {
+    /// Create a new factory with default parsers registered.
+    pub fn new() -> Self {
+        let registry = ParserRegistry::new();
+
+        // Register base parser
+        registry.register_parser("base", || {
+            Box::new(BaseReasoningParser::new(ParserConfig::default()))
+        });
+
+        // Register DeepSeek-R1 parser (starts with in_reasoning=true)
+        registry.register_parser("deepseek_r1", || Box::new(DeepSeekR1Parser::new()));
+
+        // Register Qwen3 parser (starts with in_reasoning=false)
+        registry.register_parser("qwen3", || Box::new(Qwen3Parser::new()));
+
+        // Register Qwen3-thinking parser (starts with in_reasoning=true)
+        registry.register_parser("qwen3_thinking", || Box::new(QwenThinkingParser::new()));
+
+        // Register Kimi parser with Unicode tokens (starts with in_reasoning=false)
+        registry.register_parser("kimi", || Box::new(KimiParser::new()));
+
+        // Register GLM45 parser (same format as Qwen3 but separate for debugging)
+        registry.register_parser("glm45", || Box::new(Glm45Parser::new()));
+
+        // Register Step3 parser (same format as DeepSeek-R1 but separate for debugging)
+        registry.register_parser("step3", || Box::new(Step3Parser::new()));
+
+        // Register model patterns
+        registry.register_pattern("deepseek-r1", "deepseek_r1");
+        registry.register_pattern("qwen3-thinking", "qwen3_thinking");
+        registry.register_pattern("qwen-thinking", "qwen3_thinking");
+        registry.register_pattern("qwen3", "qwen3");
+        registry.register_pattern("qwen", "qwen3");
+        registry.register_pattern("glm45", "glm45");
+        registry.register_pattern("kimi", "kimi");
+        registry.register_pattern("step3", "step3");
+
+        Self { registry }
+    }
+
+    /// Get a pooled parser for the given model ID.
+    /// Returns a shared instance that can be used concurrently.
+    /// Falls back to a passthrough parser if model is not recognized.
+    pub fn get_pooled(&self, model_id: &str) -> PooledParser {
+        // First try to find by pattern
+        if let Some(parser) = self.registry.find_pooled_parser_for_model(model_id) {
+            return parser;
+        }
+
+        // Fall back to no-op parser (get or create passthrough in pool)
+        self.registry
+            .get_pooled_parser("passthrough")
+            .unwrap_or_else(|| {
+                // Register passthrough if not already registered
+                self.registry.register_parser("passthrough", || {
+                    let config = ParserConfig {
+                        think_start_token: "".to_string(),
+                        think_end_token: "".to_string(),
+                        stream_reasoning: true,
+                        max_buffer_size: 65536,
+                        initial_in_reasoning: false,
+                    };
+                    Box::new(
+                        BaseReasoningParser::new(config).with_model_type("passthrough".to_string()),
+                    )
+                });
+                self.registry.get_pooled_parser("passthrough").unwrap()
+            })
+    }
+
+    /// Create a new parser instance for the given model ID.
+    /// Returns a fresh instance (not pooled).
+    /// Use this when you need an isolated parser instance.
+    pub fn create(&self, model_id: &str) -> Result<Box<dyn ReasoningParser>, ParseError> {
+        // First try to find by pattern
+        if let Some(parser) = self.registry.find_parser_for_model(model_id) {
+            return Ok(parser);
+        }
+
+        // Fall back to no-op parser (base parser without reasoning detection)
+        let config = ParserConfig {
+            think_start_token: "".to_string(),
+            think_end_token: "".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: false,
+        };
+        Ok(Box::new(
+            BaseReasoningParser::new(config).with_model_type("passthrough".to_string()),
+        ))
+    }
+
+    /// Get the internal registry for custom registration.
+    pub fn registry(&self) -> &ParserRegistry {
+        &self.registry
+    }
+
+    /// Clear the parser pool.
+    /// Useful for testing or when parsers need to be reset globally.
+    pub fn clear_pool(&self) {
+        self.registry.clear_pool();
+    }
+}
+
+impl Default for ParserFactory {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_factory_creates_deepseek_r1() {
+        let factory = ParserFactory::new();
+        let parser = factory.create("deepseek-r1-distill").unwrap();
+        assert_eq!(parser.model_type(), "deepseek_r1");
+    }
+
+    #[test]
+    fn test_factory_creates_qwen3() {
+        let factory = ParserFactory::new();
+        let parser = factory.create("qwen3-7b").unwrap();
+        assert_eq!(parser.model_type(), "qwen3");
+    }
+
+    #[test]
+    fn test_factory_creates_kimi() {
+        let factory = ParserFactory::new();
+        let parser = factory.create("kimi-chat").unwrap();
+        assert_eq!(parser.model_type(), "kimi");
+    }
+
+    #[test]
+    fn test_factory_fallback_to_passthrough() {
+        let factory = ParserFactory::new();
+        let parser = factory.create("unknown-model").unwrap();
+        assert_eq!(parser.model_type(), "passthrough");
+    }
+
+    #[test]
+    fn test_case_insensitive_matching() {
+        let factory = ParserFactory::new();
+        let parser1 = factory.create("DeepSeek-R1").unwrap();
+        let parser2 = factory.create("QWEN3").unwrap();
+        let parser3 = factory.create("Kimi").unwrap();
+
+        assert_eq!(parser1.model_type(), "deepseek_r1");
+        assert_eq!(parser2.model_type(), "qwen3");
+        assert_eq!(parser3.model_type(), "kimi");
+    }
+
+    #[test]
+    fn test_step3_model() {
+        let factory = ParserFactory::new();
+        let step3 = factory.create("step3-model").unwrap();
+        assert_eq!(step3.model_type(), "step3");
+    }
+
+    #[test]
+    fn test_glm45_model() {
+        let factory = ParserFactory::new();
+        let glm45 = factory.create("glm45-v2").unwrap();
+        assert_eq!(glm45.model_type(), "glm45");
+    }
+
+    #[test]
+    fn test_pooled_parser_reuse() {
+        let factory = ParserFactory::new();
+
+        // Get the same parser twice - should be the same instance
+        let parser1 = factory.get_pooled("deepseek-r1");
+        let parser2 = factory.get_pooled("deepseek-r1");
+
+        // Both should point to the same Arc
+        assert!(Arc::ptr_eq(&parser1, &parser2));
+
+        // Different models should get different parsers
+        let parser3 = factory.get_pooled("qwen3");
+        assert!(!Arc::ptr_eq(&parser1, &parser3));
+    }
+
+    #[test]
+    fn test_pooled_parser_concurrent_access() {
+        use std::thread;
+
+        let factory = ParserFactory::new();
+        let parser = factory.get_pooled("deepseek-r1");
+
+        // Spawn multiple threads that use the same parser
+        let mut handles = vec![];
+
+        for i in 0..3 {
+            let parser_clone = Arc::clone(&parser);
+            let handle = thread::spawn(move || {
+                let mut parser = parser_clone.lock().unwrap();
+                let input = format!("thread {} reasoning</think>answer", i);
+                let result = parser.detect_and_parse_reasoning(&input).unwrap();
+                assert_eq!(result.normal_text, "answer");
+                assert!(result.reasoning_text.contains("reasoning"));
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all threads to complete
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+
+    #[test]
+    fn test_pool_clearing() {
+        let factory = ParserFactory::new();
+
+        // Get a pooled parser
+        let parser1 = factory.get_pooled("deepseek-r1");
+
+        // Clear the pool
+        factory.clear_pool();
+
+        // Get another parser - should be a new instance
+        let parser2 = factory.get_pooled("deepseek-r1");
+
+        // They should be different instances (different Arc pointers)
+        assert!(!Arc::ptr_eq(&parser1, &parser2));
+    }
+
+    #[test]
+    fn test_passthrough_parser_pooling() {
+        let factory = ParserFactory::new();
+
+        // Unknown models should get passthrough parser
+        let parser1 = factory.get_pooled("unknown-model-1");
+        let parser2 = factory.get_pooled("unknown-model-2");
+
+        // Both should use the same passthrough parser instance
+        assert!(Arc::ptr_eq(&parser1, &parser2));
+
+        // Verify it's actually a passthrough parser
+        let parser = parser1.lock().unwrap();
+        assert_eq!(parser.model_type(), "passthrough");
+    }
+
+    #[test]
+    fn test_high_concurrency_parser_access() {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+        use std::thread;
+        use std::time::Instant;
+
+        let factory = ParserFactory::new();
+        let num_threads = 100;
+        let requests_per_thread = 50;
+        let models = vec!["deepseek-r1", "qwen3", "kimi", "qwen3-thinking"];
+
+        // Track successful operations
+        let success_count = Arc::new(AtomicUsize::new(0));
+        let error_count = Arc::new(AtomicUsize::new(0));
+
+        let start = Instant::now();
+        let mut handles = vec![];
+
+        for thread_id in 0..num_threads {
+            let factory = factory.clone();
+            let models = models.clone();
+            let success_count = Arc::clone(&success_count);
+            let error_count = Arc::clone(&error_count);
+
+            let handle = thread::spawn(move || {
+                for request_id in 0..requests_per_thread {
+                    // Rotate through different models
+                    let model = &models[(thread_id + request_id) % models.len()];
+                    let parser = factory.get_pooled(model);
+
+                    // Use blocking lock - this is the realistic scenario
+                    // In production, requests would wait for the parser to be available
+                    // Handle poisoned locks gracefully
+                    let mut p = match parser.lock() {
+                        Ok(guard) => guard,
+                        Err(_poisoned) => {
+                            // Lock was poisoned by a panicking thread
+                            // In production, we might want to recreate the parser
+                            // For testing, we'll just skip this iteration
+                            error_count.fetch_add(1, Ordering::Relaxed);
+                            continue;
+                        }
+                    };
+
+                    // Simulate realistic parsing work with substantial text
+                    // Typical reasoning can be 500-5000 tokens
+                    let reasoning_text = format!(
+                        "Thread {} is processing request {}. Let me think through this step by step. \
+                        First, I need to understand the problem. The problem involves analyzing data \
+                        and making calculations. Let me break this down: \n\
+                        1. Initial analysis shows that we have multiple variables to consider. \
+                        2. The data suggests a pattern that needs further investigation. \
+                        3. Computing the values: {} * {} = {}. \
+                        4. Cross-referencing with previous results indicates consistency. \
+                        5. The mathematical proof follows from the axioms... \
+                        6. Considering edge cases and boundary conditions... \
+                        7. Validating against known constraints... \
+                        8. The conclusion follows logically from premises A, B, and C. \
+                        This reasoning chain demonstrates the validity of our approach.",
+                        thread_id, request_id, thread_id, request_id, thread_id * request_id
+                    );
+
+                    let answer_text = format!(
+                        "Based on my analysis, the answer for thread {} request {} is: \
+                        The solution involves multiple steps as outlined in the reasoning. \
+                        The final result is {} with confidence level high. \
+                        This conclusion is supported by rigorous mathematical analysis \
+                        and has been validated against multiple test cases. \
+                        The implementation should handle edge cases appropriately.",
+                        thread_id,
+                        request_id,
+                        thread_id * request_id
+                    );
+
+                    let input = format!("<think>{}</think>{}", reasoning_text, answer_text);
+
+                    match p.detect_and_parse_reasoning(&input) {
+                        Ok(result) => {
+                            // Verify parsing worked correctly with substantial content
+                            // Note: Some parsers with stream_reasoning=true won't accumulate reasoning text
+                            assert!(result
+                                .normal_text
+                                .contains(&format!("thread {}", thread_id)));
+
+                            // For parsers that accumulate reasoning (stream_reasoning=false)
+                            // the reasoning_text should be populated
+                            if !result.reasoning_text.is_empty() {
+                                assert!(result
+                                    .reasoning_text
+                                    .contains(&format!("Thread {}", thread_id)));
+                                assert!(result.reasoning_text.len() > 500); // Ensure substantial reasoning
+                            }
+
+                            // Normal text should always be present
+                            assert!(result.normal_text.len() > 100); // Ensure substantial answer
+                            success_count.fetch_add(1, Ordering::Relaxed);
+                        }
+                        Err(e) => {
+                            eprintln!("Parse error: {:?}", e);
+                            error_count.fetch_add(1, Ordering::Relaxed);
+                        }
+                    }
+
+                    // Explicitly drop the lock to release it quickly
+                    drop(p);
+                }
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all threads
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        let duration = start.elapsed();
+        let total_requests = num_threads * requests_per_thread;
+        let successes = success_count.load(Ordering::Relaxed);
+        let errors = error_count.load(Ordering::Relaxed);
+
+        // Print stats for debugging
+        println!(
+            "High concurrency test: {} threads, {} requests each",
+            num_threads, requests_per_thread
+        );
+        println!(
+            "Completed in {:?}, {} successes, {} errors",
+            duration, successes, errors
+        );
+        println!(
+            "Throughput: {:.0} requests/sec",
+            (total_requests as f64) / duration.as_secs_f64()
+        );
+
+        // All requests should succeed
+        assert_eq!(successes, total_requests);
+        assert_eq!(errors, 0);
+
+        // Performance check: should handle at least 1000 req/sec
+        let throughput = (total_requests as f64) / duration.as_secs_f64();
+        assert!(
+            throughput > 1000.0,
+            "Throughput too low: {:.0} req/sec",
+            throughput
+        );
+    }
+
+    #[test]
+    fn test_concurrent_pool_modifications() {
+        use std::thread;
+
+        let factory = ParserFactory::new();
+        let mut handles = vec![];
+
+        // Thread 1: Continuously get parsers
+        let factory1 = factory.clone();
+        handles.push(thread::spawn(move || {
+            for _ in 0..100 {
+                let _parser = factory1.get_pooled("deepseek-r1");
+            }
+        }));
+
+        // Thread 2: Continuously clear pool
+        let factory2 = factory.clone();
+        handles.push(thread::spawn(move || {
+            for _ in 0..10 {
+                factory2.clear_pool();
+                thread::sleep(std::time::Duration::from_micros(100));
+            }
+        }));
+
+        // Thread 3: Get different parsers
+        let factory3 = factory.clone();
+        handles.push(thread::spawn(move || {
+            for i in 0..100 {
+                let models = ["qwen3", "kimi", "unknown"];
+                let _parser = factory3.get_pooled(models[i % 3]);
+            }
+        }));
+
+        // Wait for all threads - should not deadlock or panic
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/mod.rs b/sgl-router/src/reasoning_parser/mod.rs
new file mode 100644
index 000000000..95ffcbc4f
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/mod.rs
@@ -0,0 +1,10 @@
+pub mod factory;
+pub mod parsers;
+pub mod traits;
+
+pub use factory::{ParserFactory, ParserRegistry, PooledParser};
+pub use parsers::{
+    BaseReasoningParser, DeepSeekR1Parser, Glm45Parser, KimiParser, Qwen3Parser,
+    QwenThinkingParser, Step3Parser,
+};
+pub use traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
diff --git a/sgl-router/src/reasoning_parser/parsers/base.rs b/sgl-router/src/reasoning_parser/parsers/base.rs
new file mode 100644
index 000000000..0fd2818b9
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/parsers/base.rs
@@ -0,0 +1,386 @@
+// Base implementation of reasoning parser that handles common logic
+// for detecting and extracting reasoning blocks from text.
+
+use crate::reasoning_parser::traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
+use tracing as log;
+
+/// Base reasoning parser implementation.
+///
+/// This parser handles the common logic for detecting reasoning blocks
+/// delimited by start and end tokens (e.g., <think> and </think>).
+#[derive(Debug, Clone)]
+pub struct BaseReasoningParser {
+    config: ParserConfig,
+    in_reasoning: bool,
+    buffer: String,
+    stripped_think_start: bool,
+    model_type: String,
+}
+
+impl BaseReasoningParser {
+    /// Create a new BaseReasoningParser with the given configuration.
+    pub fn new(config: ParserConfig) -> Self {
+        let in_reasoning = config.initial_in_reasoning;
+        Self {
+            config,
+            in_reasoning,
+            buffer: String::new(),
+            stripped_think_start: false,
+            model_type: "base".to_string(),
+        }
+    }
+
+    /// Create with custom model type identifier.
+    pub fn with_model_type(mut self, model_type: String) -> Self {
+        self.model_type = model_type;
+        self
+    }
+
+    /// Check if the current buffer is a prefix of one of the tokens.
+    fn is_partial_token(&self, text: &str) -> bool {
+        (self.config.think_start_token.starts_with(text) && self.config.think_start_token != text)
+            || (self.config.think_end_token.starts_with(text)
+                && self.config.think_end_token != text)
+    }
+}
+
+impl ReasoningParser for BaseReasoningParser {
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
+        log::debug!("detect_and_parse_reasoning called with text: {:?}", text);
+
+        // Check input size against buffer limit
+        if text.len() > self.config.max_buffer_size {
+            return Err(ParseError::BufferOverflow(text.len()));
+        }
+
+        let in_reasoning = self.in_reasoning || text.contains(&self.config.think_start_token);
+        log::debug!("in_reasoning: {}", in_reasoning);
+
+        if !in_reasoning {
+            log::debug!("No reasoning detected, returning normal text.");
+            return Ok(ParserResult::normal(text.to_string()));
+        }
+
+        // The text is considered to be in a reasoning block.
+        let processed_text = text
+            .replace(&self.config.think_start_token, "")
+            .trim()
+            .to_string();
+        log::debug!(
+            "Processed text after removing think_start_token: {:?}",
+            processed_text
+        );
+
+        if !processed_text.contains(&self.config.think_end_token) {
+            log::debug!(
+                "Reasoning truncated, think_end_token not found. Returning reasoning text."
+            );
+            // Assume reasoning was truncated before end token
+            return Ok(ParserResult::reasoning(processed_text));
+        }
+
+        // Extract reasoning content
+        let splits: Vec<&str> = processed_text
+            .splitn(2, &self.config.think_end_token)
+            .collect();
+        let reasoning_text = splits.first().unwrap_or(&"").to_string();
+        let normal_text = splits
+            .get(1)
+            .map(|s| s.trim().to_string())
+            .unwrap_or_default();
+
+        log::debug!("Extracted reasoning_text: {:?}", reasoning_text);
+        log::debug!("Extracted normal_text: {:?}", normal_text);
+
+        Ok(ParserResult::new(normal_text, reasoning_text))
+    }
+
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError> {
+        // Check if adding this text would exceed buffer limit
+        if self.buffer.len() + text.len() > self.config.max_buffer_size {
+            return Err(ParseError::BufferOverflow(self.buffer.len() + text.len()));
+        }
+
+        // Incrementally parse the streaming text
+        self.buffer.push_str(text);
+        let mut current_text = self.buffer.clone();
+
+        log::debug!(
+            "parse_reasoning_streaming_incremental called with text: {:?}",
+            text
+        );
+        log::debug!("current buffer: {:?}", self.buffer);
+        log::debug!("current_text: {:?}", current_text);
+        log::debug!(
+            "in_reasoning: {}, stripped_think_start: {}, stream_reasoning: {}",
+            self.in_reasoning,
+            self.stripped_think_start,
+            self.config.stream_reasoning
+        );
+
+        // If the current text is a prefix of a token, keep buffering
+        if self.is_partial_token(&current_text) {
+            return Ok(ParserResult::default());
+        }
+
+        // Strip start token if present
+        if !self.stripped_think_start && current_text.contains(&self.config.think_start_token) {
+            current_text = current_text.replace(&self.config.think_start_token, "");
+            self.buffer = current_text.clone();
+            self.stripped_think_start = true;
+            self.in_reasoning = true;
+        }
+
+        // Handle end of reasoning block
+        let think_end_idx = if self.in_reasoning {
+            current_text
+                .find(&self.config.think_end_token)
+                .unwrap_or(current_text.len())
+        } else {
+            current_text.len()
+        };
+
+        if self.in_reasoning && think_end_idx < current_text.len() {
+            let reasoning_text = &current_text[..think_end_idx];
+            self.buffer.clear();
+            self.in_reasoning = false;
+            let start_idx = think_end_idx + self.config.think_end_token.len();
+            let normal_text = if start_idx < current_text.len() {
+                &current_text[start_idx..]
+            } else {
+                ""
+            };
+            return Ok(ParserResult::new(
+                normal_text.to_string(),
+                reasoning_text.trim().to_string(),
+            ));
+        }
+
+        // Continue with reasoning content
+        if self.in_reasoning && self.config.stream_reasoning {
+            // Stream the content immediately
+            let reasoning_text = current_text;
+            self.buffer.clear();
+            Ok(ParserResult::reasoning(reasoning_text))
+        } else if !self.in_reasoning {
+            // If we're not in a reasoning block, return as normal text
+            // CRITICAL FIX: Return current_text (with buffer) not just text
+            // This prevents buffer loss when partial tokens are followed by normal text
+            let normal_text = current_text;
+            self.buffer.clear();
+            Ok(ParserResult::normal(normal_text))
+        } else {
+            // If we are in a reasoning block but no end token is found, buffer it
+            Ok(ParserResult::default())
+        }
+    }
+
+    fn reset(&mut self) {
+        self.in_reasoning = self.config.initial_in_reasoning;
+        self.buffer.clear();
+        self.stripped_think_start = false;
+    }
+
+    fn model_type(&self) -> &str {
+        &self.model_type
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn create_test_parser(
+        initial_in_reasoning: bool,
+        stream_reasoning: bool,
+    ) -> BaseReasoningParser {
+        let config = ParserConfig {
+            think_start_token: "<think>".to_string(),
+            think_end_token: "</think>".to_string(),
+            stream_reasoning,
+            max_buffer_size: 65536,
+            initial_in_reasoning,
+        };
+        BaseReasoningParser::new(config)
+    }
+
+    #[test]
+    fn test_detect_and_parse_reasoning() {
+        let mut parser = create_test_parser(false, true);
+        let result = parser
+            .detect_and_parse_reasoning("<think>with reasoning</think> and more text.")
+            .unwrap();
+        assert_eq!(result.normal_text, "and more text.");
+        assert_eq!(result.reasoning_text, "with reasoning");
+    }
+
+    #[test]
+    fn test_detect_and_parse_no_reasoning() {
+        let mut parser = create_test_parser(false, true);
+        let result = parser
+            .detect_and_parse_reasoning("This is a test without reasoning.")
+            .unwrap();
+        assert_eq!(result.normal_text, "This is a test without reasoning.");
+        assert_eq!(result.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_detect_and_parse_truncated_reasoning() {
+        let mut parser = create_test_parser(false, true);
+        let result = parser
+            .detect_and_parse_reasoning("<think>with truncated reasoning")
+            .unwrap();
+        assert_eq!(result.normal_text, "");
+        assert_eq!(result.reasoning_text, "with truncated reasoning");
+    }
+
+    #[test]
+    fn test_parse_streaming_partial_token() {
+        let mut parser = create_test_parser(false, true);
+        let result = parser
+            .parse_reasoning_streaming_incremental("<thi")
+            .unwrap();
+        assert_eq!(result.normal_text, "");
+        assert_eq!(result.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_parse_streaming_complete() {
+        let mut parser = create_test_parser(false, true);
+        let result = parser
+            .parse_reasoning_streaming_incremental("<think>with reasoning</think> and more text.")
+            .unwrap();
+        assert_eq!(result.normal_text, " and more text.");
+        assert_eq!(result.reasoning_text, "with reasoning");
+    }
+
+    #[test]
+    fn test_parse_streaming_no_end_token() {
+        let mut parser = create_test_parser(true, true);
+        let result = parser
+            .parse_reasoning_streaming_incremental("<think>with reasoning")
+            .unwrap();
+        assert_eq!(result.normal_text, "");
+        assert_eq!(result.reasoning_text, "with reasoning");
+    }
+
+    #[test]
+    fn test_initial_in_reasoning_true() {
+        // Parser starts with in_reasoning=true (like DeepSeek-R1)
+        let mut parser = create_test_parser(true, true);
+        let result = parser
+            .detect_and_parse_reasoning("no think tags here")
+            .unwrap();
+        assert_eq!(result.normal_text, "");
+        assert_eq!(result.reasoning_text, "no think tags here");
+    }
+
+    #[test]
+    fn test_buffer_loss_bug_fix() {
+        // Critical test for buffer preservation
+        let mut parser = create_test_parser(false, true);
+
+        // Step 1: Send partial end tag when not in reasoning mode
+        let result1 = parser.parse_reasoning_streaming_incremental("</").unwrap();
+        assert_eq!(result1.normal_text, "");
+        assert_eq!(result1.reasoning_text, "");
+
+        // Step 2: Send normal text that doesn't complete the end tag
+        // Must return "</answer" not just "answer"
+        let result2 = parser
+            .parse_reasoning_streaming_incremental("answer")
+            .unwrap();
+        assert_eq!(result2.normal_text, "</answer");
+        assert_eq!(result2.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_streaming_with_stream_reasoning_enabled() {
+        let mut parser = create_test_parser(false, true);
+
+        // Start reasoning block
+        let result1 = parser
+            .parse_reasoning_streaming_incremental("<think>reasoning ")
+            .unwrap();
+        assert_eq!(result1.normal_text, "");
+        assert_eq!(result1.reasoning_text, "reasoning ");
+
+        // Continue streaming reasoning
+        let result2 = parser
+            .parse_reasoning_streaming_incremental("content ")
+            .unwrap();
+        assert_eq!(result2.normal_text, "");
+        assert_eq!(result2.reasoning_text, "content ");
+
+        // End reasoning block
+        let result3 = parser
+            .parse_reasoning_streaming_incremental("more</think> normal")
+            .unwrap();
+        assert_eq!(result3.normal_text, " normal");
+        assert_eq!(result3.reasoning_text, "more");
+    }
+
+    #[test]
+    fn test_reset_state() {
+        let mut parser = create_test_parser(false, true);
+
+        // Process some text
+        parser
+            .parse_reasoning_streaming_incremental("<think>reasoning</think> normal")
+            .unwrap();
+
+        // Reset and verify state
+        parser.reset();
+        assert!(!parser.in_reasoning);
+        assert!(parser.buffer.is_empty());
+        assert!(!parser.stripped_think_start);
+    }
+
+    #[test]
+    fn test_buffer_overflow_detect_and_parse() {
+        let config = ParserConfig {
+            max_buffer_size: 10, // Set a very small buffer
+            ..Default::default()
+        };
+        let mut parser = BaseReasoningParser::new(config);
+
+        let large_text = "a".repeat(20);
+        let result = parser.detect_and_parse_reasoning(&large_text);
+
+        assert!(result.is_err());
+        match result {
+            Err(ParseError::BufferOverflow(size)) => {
+                assert_eq!(size, 20);
+            }
+            _ => panic!("Expected BufferOverflow error"),
+        }
+    }
+
+    #[test]
+    fn test_buffer_overflow_streaming() {
+        let config = ParserConfig {
+            max_buffer_size: 10, // Set a very small buffer
+            ..Default::default()
+        };
+        let mut parser = BaseReasoningParser::new(config);
+
+        // Send a partial token that will be buffered
+        let result1 = parser.parse_reasoning_streaming_incremental("<thi");
+        assert!(result1.is_ok());
+        assert_eq!(result1.unwrap().normal_text, "");
+
+        // Second chunk would exceed buffer
+        // Buffer has "<thi" (4 chars) + "this_is_too_large" (17 chars) = 21 total
+        let result2 = parser.parse_reasoning_streaming_incremental("this_is_too_large");
+        assert!(result2.is_err());
+        match result2 {
+            Err(ParseError::BufferOverflow(size)) => {
+                assert_eq!(size, 21); // 4 + 17
+            }
+            _ => panic!("Expected BufferOverflow error"),
+        }
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/parsers/deepseek_r1.rs b/sgl-router/src/reasoning_parser/parsers/deepseek_r1.rs
new file mode 100644
index 000000000..62a7aadec
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/parsers/deepseek_r1.rs
@@ -0,0 +1,112 @@
+// DeepSeek-R1 specific reasoning parser.
+// This parser starts with in_reasoning=true, assuming all text is reasoning
+// until an end token is encountered.
+
+use crate::reasoning_parser::parsers::BaseReasoningParser;
+use crate::reasoning_parser::traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
+
+/// DeepSeek-R1 reasoning parser.
+///
+/// This parser assumes reasoning from the start of text (in_reasoning=true)
+/// and uses <think> and </think> tokens.
+pub struct DeepSeekR1Parser {
+    base: BaseReasoningParser,
+}
+
+impl DeepSeekR1Parser {
+    /// Create a new DeepSeek-R1 parser.
+    pub fn new() -> Self {
+        let config = ParserConfig {
+            think_start_token: "<think>".to_string(),
+            think_end_token: "</think>".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: true, // Always starts with reasoning
+        };
+
+        Self {
+            base: BaseReasoningParser::new(config).with_model_type("deepseek_r1".to_string()),
+        }
+    }
+}
+
+impl Default for DeepSeekR1Parser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReasoningParser for DeepSeekR1Parser {
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
+        self.base.detect_and_parse_reasoning(text)
+    }
+
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError> {
+        self.base.parse_reasoning_streaming_incremental(text)
+    }
+
+    fn reset(&mut self) {
+        self.base.reset()
+    }
+
+    fn model_type(&self) -> &str {
+        self.base.model_type()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_deepseek_r1_initial_state() {
+        let mut parser = DeepSeekR1Parser::new();
+
+        // Should treat text as reasoning even without start token
+        let result = parser
+            .detect_and_parse_reasoning("This is reasoning content")
+            .unwrap();
+        assert_eq!(result.normal_text, "");
+        assert_eq!(result.reasoning_text, "This is reasoning content");
+    }
+
+    #[test]
+    fn test_deepseek_r1_with_end_token() {
+        let mut parser = DeepSeekR1Parser::new();
+
+        // Should extract reasoning until end token
+        let result = parser
+            .detect_and_parse_reasoning("reasoning content</think>normal content")
+            .unwrap();
+        assert_eq!(result.normal_text, "normal content");
+        assert_eq!(result.reasoning_text, "reasoning content");
+    }
+
+    #[test]
+    fn test_deepseek_r1_streaming() {
+        let mut parser = DeepSeekR1Parser::new();
+
+        // First chunk - all reasoning
+        let result1 = parser
+            .parse_reasoning_streaming_incremental("thinking about")
+            .unwrap();
+        assert_eq!(result1.reasoning_text, "thinking about");
+        assert_eq!(result1.normal_text, "");
+
+        // Second chunk - ends reasoning
+        let result2 = parser
+            .parse_reasoning_streaming_incremental(" the problem</think>answer")
+            .unwrap();
+        assert_eq!(result2.reasoning_text, "the problem"); // Text is trimmed
+        assert_eq!(result2.normal_text, "answer");
+    }
+
+    #[test]
+    fn test_model_type() {
+        let parser = DeepSeekR1Parser::new();
+        assert_eq!(parser.model_type(), "deepseek_r1");
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/parsers/glm45.rs b/sgl-router/src/reasoning_parser/parsers/glm45.rs
new file mode 100644
index 000000000..e4e56723c
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/parsers/glm45.rs
@@ -0,0 +1,118 @@
+// GLM45 specific reasoning parser.
+// Uses the same format as Qwen3 but has its own implementation for debugging.
+
+use crate::reasoning_parser::parsers::BaseReasoningParser;
+use crate::reasoning_parser::traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
+
+/// GLM45 reasoning parser.
+///
+/// This parser uses the same format as Qwen3 (<think>...</think>) but has
+/// its own implementation for better debugging and potential future customization.
+pub struct Glm45Parser {
+    base: BaseReasoningParser,
+}
+
+impl Glm45Parser {
+    /// Create a new GLM45 parser.
+    pub fn new() -> Self {
+        let config = ParserConfig {
+            think_start_token: "<think>".to_string(),
+            think_end_token: "</think>".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: false, // Requires explicit start token like Qwen3
+        };
+
+        Self {
+            base: BaseReasoningParser::new(config).with_model_type("glm45".to_string()),
+        }
+    }
+}
+
+impl Default for Glm45Parser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReasoningParser for Glm45Parser {
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
+        self.base.detect_and_parse_reasoning(text)
+    }
+
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError> {
+        self.base.parse_reasoning_streaming_incremental(text)
+    }
+
+    fn reset(&mut self) {
+        self.base.reset()
+    }
+
+    fn model_type(&self) -> &str {
+        self.base.model_type()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_glm45_initial_state() {
+        let mut parser = Glm45Parser::new();
+
+        // Should NOT treat text as reasoning without start token
+        let result = parser
+            .detect_and_parse_reasoning("This is normal content")
+            .unwrap();
+        assert_eq!(result.normal_text, "This is normal content");
+        assert_eq!(result.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_glm45_with_tokens() {
+        let mut parser = Glm45Parser::new();
+
+        // Should extract reasoning with proper tokens
+        let result = parser
+            .detect_and_parse_reasoning("<think>reasoning content</think>answer")
+            .unwrap();
+        assert_eq!(result.normal_text, "answer");
+        assert_eq!(result.reasoning_text, "reasoning content");
+    }
+
+    #[test]
+    fn test_glm45_streaming() {
+        let mut parser = Glm45Parser::new();
+
+        // First chunk - normal text
+        let result1 = parser
+            .parse_reasoning_streaming_incremental("normal text ")
+            .unwrap();
+        assert_eq!(result1.normal_text, "normal text ");
+        assert_eq!(result1.reasoning_text, "");
+
+        // Second chunk - enters reasoning
+        let result2 = parser
+            .parse_reasoning_streaming_incremental("<think>reasoning")
+            .unwrap();
+        assert_eq!(result2.normal_text, "");
+        assert_eq!(result2.reasoning_text, "reasoning");
+
+        // Third chunk - exits reasoning
+        let result3 = parser
+            .parse_reasoning_streaming_incremental("</think>answer")
+            .unwrap();
+        assert_eq!(result3.normal_text, "answer");
+        assert_eq!(result3.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_model_type() {
+        let parser = Glm45Parser::new();
+        assert_eq!(parser.model_type(), "glm45");
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/parsers/kimi.rs b/sgl-router/src/reasoning_parser/parsers/kimi.rs
new file mode 100644
index 000000000..3e11a5711
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/parsers/kimi.rs
@@ -0,0 +1,137 @@
+// Kimi specific reasoning parser.
+// This parser uses Unicode tokens and starts with in_reasoning=false.
+
+use crate::reasoning_parser::parsers::BaseReasoningParser;
+use crate::reasoning_parser::traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
+
+/// Kimi reasoning parser.
+///
+/// This parser uses Unicode tokens (◁think▷ and ◁/think▷) and requires
+/// explicit start tokens to enter reasoning mode.
+pub struct KimiParser {
+    base: BaseReasoningParser,
+}
+
+impl KimiParser {
+    /// Create a new Kimi parser.
+    pub fn new() -> Self {
+        let config = ParserConfig {
+            think_start_token: "◁think▷".to_string(),
+            think_end_token: "◁/think▷".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: false, // Requires explicit start token
+        };
+
+        Self {
+            base: BaseReasoningParser::new(config).with_model_type("kimi".to_string()),
+        }
+    }
+}
+
+impl Default for KimiParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReasoningParser for KimiParser {
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
+        self.base.detect_and_parse_reasoning(text)
+    }
+
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError> {
+        self.base.parse_reasoning_streaming_incremental(text)
+    }
+
+    fn reset(&mut self) {
+        self.base.reset()
+    }
+
+    fn model_type(&self) -> &str {
+        self.base.model_type()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_kimi_initial_state() {
+        let mut parser = KimiParser::new();
+
+        // Should NOT treat text as reasoning without start token
+        let result = parser
+            .detect_and_parse_reasoning("This is normal content")
+            .unwrap();
+        assert_eq!(result.normal_text, "This is normal content");
+        assert_eq!(result.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_kimi_with_unicode_tokens() {
+        let mut parser = KimiParser::new();
+
+        // Should extract reasoning with Unicode tokens
+        let result = parser
+            .detect_and_parse_reasoning("◁think▷reasoning content◁/think▷answer")
+            .unwrap();
+        assert_eq!(result.normal_text, "answer");
+        assert_eq!(result.reasoning_text, "reasoning content");
+    }
+
+    #[test]
+    fn test_kimi_partial_unicode() {
+        let mut parser = KimiParser::new();
+
+        // Test partial Unicode token buffering
+        let result1 = parser
+            .parse_reasoning_streaming_incremental("◁thi")
+            .unwrap();
+        assert_eq!(result1.normal_text, "");
+        assert_eq!(result1.reasoning_text, "");
+
+        // Complete the token
+        let result2 = parser
+            .parse_reasoning_streaming_incremental("nk▷reasoning")
+            .unwrap();
+        assert_eq!(result2.normal_text, "");
+        assert_eq!(result2.reasoning_text, "reasoning");
+    }
+
+    #[test]
+    fn test_kimi_streaming() {
+        let mut parser = KimiParser::new();
+
+        // Normal text first
+        let result1 = parser
+            .parse_reasoning_streaming_incremental("normal ")
+            .unwrap();
+        assert_eq!(result1.normal_text, "normal ");
+        assert_eq!(result1.reasoning_text, "");
+
+        // Enter reasoning with Unicode token
+        let result2 = parser
+            .parse_reasoning_streaming_incremental("◁think▷thinking")
+            .unwrap();
+        assert_eq!(result2.normal_text, "");
+        assert_eq!(result2.reasoning_text, "thinking");
+
+        // Exit reasoning
+        let result3 = parser
+            .parse_reasoning_streaming_incremental("◁/think▷answer")
+            .unwrap();
+        assert_eq!(result3.normal_text, "answer");
+        assert_eq!(result3.reasoning_text, ""); // Already returned in stream mode
+    }
+
+    #[test]
+    fn test_model_type() {
+        let parser = KimiParser::new();
+        assert_eq!(parser.model_type(), "kimi");
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/parsers/mod.rs b/sgl-router/src/reasoning_parser/parsers/mod.rs
new file mode 100644
index 000000000..a940a055c
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/parsers/mod.rs
@@ -0,0 +1,13 @@
+pub mod base;
+pub mod deepseek_r1;
+pub mod glm45;
+pub mod kimi;
+pub mod qwen3;
+pub mod step3;
+
+pub use base::BaseReasoningParser;
+pub use deepseek_r1::DeepSeekR1Parser;
+pub use glm45::Glm45Parser;
+pub use kimi::KimiParser;
+pub use qwen3::{Qwen3Parser, QwenThinkingParser};
+pub use step3::Step3Parser;
diff --git a/sgl-router/src/reasoning_parser/parsers/qwen3.rs b/sgl-router/src/reasoning_parser/parsers/qwen3.rs
new file mode 100644
index 000000000..8c5ce9e8c
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/parsers/qwen3.rs
@@ -0,0 +1,178 @@
+// Qwen3 specific reasoning parser.
+// This parser starts with in_reasoning=false, requiring an explicit
+// start token to enter reasoning mode.
+
+use crate::reasoning_parser::parsers::BaseReasoningParser;
+use crate::reasoning_parser::traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
+
+/// Qwen3 reasoning parser.
+///
+/// This parser requires explicit <think> tokens to enter reasoning mode
+/// (in_reasoning=false initially).
+pub struct Qwen3Parser {
+    base: BaseReasoningParser,
+}
+
+impl Qwen3Parser {
+    /// Create a new Qwen3 parser.
+    pub fn new() -> Self {
+        let config = ParserConfig {
+            think_start_token: "<think>".to_string(),
+            think_end_token: "</think>".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: false, // Requires explicit start token
+        };
+
+        Self {
+            base: BaseReasoningParser::new(config).with_model_type("qwen3".to_string()),
+        }
+    }
+}
+
+impl Default for Qwen3Parser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReasoningParser for Qwen3Parser {
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
+        self.base.detect_and_parse_reasoning(text)
+    }
+
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError> {
+        self.base.parse_reasoning_streaming_incremental(text)
+    }
+
+    fn reset(&mut self) {
+        self.base.reset()
+    }
+
+    fn model_type(&self) -> &str {
+        self.base.model_type()
+    }
+}
+
+/// QwenThinking parser - variant that assumes reasoning from start.
+///
+/// This is for qwen*thinking models that behave like DeepSeek-R1.
+pub struct QwenThinkingParser {
+    base: BaseReasoningParser,
+}
+
+impl QwenThinkingParser {
+    /// Create a new QwenThinking parser.
+    pub fn new() -> Self {
+        let config = ParserConfig {
+            think_start_token: "<think>".to_string(),
+            think_end_token: "</think>".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: true, // Assumes reasoning from start
+        };
+
+        Self {
+            base: BaseReasoningParser::new(config).with_model_type("qwen_thinking".to_string()),
+        }
+    }
+}
+
+impl Default for QwenThinkingParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReasoningParser for QwenThinkingParser {
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
+        self.base.detect_and_parse_reasoning(text)
+    }
+
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError> {
+        self.base.parse_reasoning_streaming_incremental(text)
+    }
+
+    fn reset(&mut self) {
+        self.base.reset()
+    }
+
+    fn model_type(&self) -> &str {
+        self.base.model_type()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_qwen3_initial_state() {
+        let mut parser = Qwen3Parser::new();
+
+        // Should NOT treat text as reasoning without start token
+        let result = parser
+            .detect_and_parse_reasoning("This is normal content")
+            .unwrap();
+        assert_eq!(result.normal_text, "This is normal content");
+        assert_eq!(result.reasoning_text, "");
+    }
+
+    #[test]
+    fn test_qwen3_with_tokens() {
+        let mut parser = Qwen3Parser::new();
+
+        // Should extract reasoning with proper tokens
+        let result = parser
+            .detect_and_parse_reasoning("<think>reasoning</think>answer")
+            .unwrap();
+        assert_eq!(result.normal_text, "answer");
+        assert_eq!(result.reasoning_text, "reasoning");
+    }
+
+    #[test]
+    fn test_qwen_thinking_initial_state() {
+        let mut parser = QwenThinkingParser::new();
+
+        // Should treat text as reasoning even without start token
+        let result = parser
+            .detect_and_parse_reasoning("This is reasoning content")
+            .unwrap();
+        assert_eq!(result.normal_text, "");
+        assert_eq!(result.reasoning_text, "This is reasoning content");
+    }
+
+    #[test]
+    fn test_qwen3_streaming() {
+        let mut parser = Qwen3Parser::new();
+
+        // First chunk - normal text (no start token yet)
+        let result1 = parser
+            .parse_reasoning_streaming_incremental("normal text ")
+            .unwrap();
+        assert_eq!(result1.normal_text, "normal text ");
+        assert_eq!(result1.reasoning_text, "");
+
+        // Second chunk - enters reasoning
+        let result2 = parser
+            .parse_reasoning_streaming_incremental("<think>reasoning")
+            .unwrap();
+        assert_eq!(result2.normal_text, "");
+        assert_eq!(result2.reasoning_text, "reasoning");
+    }
+
+    #[test]
+    fn test_model_types() {
+        let qwen3 = Qwen3Parser::new();
+        assert_eq!(qwen3.model_type(), "qwen3");
+
+        let qwen_thinking = QwenThinkingParser::new();
+        assert_eq!(qwen_thinking.model_type(), "qwen_thinking");
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/parsers/step3.rs b/sgl-router/src/reasoning_parser/parsers/step3.rs
new file mode 100644
index 000000000..cec0bcd15
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/parsers/step3.rs
@@ -0,0 +1,123 @@
+// Step3 specific reasoning parser.
+// Uses the same format as DeepSeek-R1 but has its own implementation for debugging.
+
+use crate::reasoning_parser::parsers::BaseReasoningParser;
+use crate::reasoning_parser::traits::{ParseError, ParserConfig, ParserResult, ReasoningParser};
+
+/// Step3 reasoning parser.
+///
+/// This parser uses the same format as DeepSeek-R1 (<think>...</think>) but has
+/// its own implementation for better debugging and potential future customization.
+pub struct Step3Parser {
+    base: BaseReasoningParser,
+}
+
+impl Step3Parser {
+    /// Create a new Step3 parser.
+    pub fn new() -> Self {
+        let config = ParserConfig {
+            think_start_token: "<think>".to_string(),
+            think_end_token: "</think>".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,
+            initial_in_reasoning: true, // Assumes reasoning from start like DeepSeek-R1
+        };
+
+        Self {
+            base: BaseReasoningParser::new(config).with_model_type("step3".to_string()),
+        }
+    }
+}
+
+impl Default for Step3Parser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReasoningParser for Step3Parser {
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError> {
+        self.base.detect_and_parse_reasoning(text)
+    }
+
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError> {
+        self.base.parse_reasoning_streaming_incremental(text)
+    }
+
+    fn reset(&mut self) {
+        self.base.reset()
+    }
+
+    fn model_type(&self) -> &str {
+        self.base.model_type()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_step3_initial_state() {
+        let mut parser = Step3Parser::new();
+
+        // Should treat text as reasoning even without start token
+        let result = parser
+            .detect_and_parse_reasoning("This is reasoning content")
+            .unwrap();
+        assert_eq!(result.normal_text, "");
+        assert_eq!(result.reasoning_text, "This is reasoning content");
+    }
+
+    #[test]
+    fn test_step3_with_end_token() {
+        let mut parser = Step3Parser::new();
+
+        // Should handle text with end token
+        let result = parser
+            .detect_and_parse_reasoning("reasoning content</think>answer")
+            .unwrap();
+        assert_eq!(result.normal_text, "answer");
+        assert_eq!(result.reasoning_text, "reasoning content");
+    }
+
+    #[test]
+    fn test_step3_with_both_tokens() {
+        let mut parser = Step3Parser::new();
+
+        // Should handle both start and end tokens
+        let result = parser
+            .detect_and_parse_reasoning("<think>reasoning content</think>answer")
+            .unwrap();
+        assert_eq!(result.normal_text, "answer");
+        assert_eq!(result.reasoning_text, "reasoning content");
+    }
+
+    #[test]
+    fn test_step3_streaming() {
+        let mut parser = Step3Parser::new();
+
+        // First chunk - treated as reasoning (initial_in_reasoning=true)
+        let result1 = parser
+            .parse_reasoning_streaming_incremental("reasoning text ")
+            .unwrap();
+        assert_eq!(result1.normal_text, "");
+        assert_eq!(result1.reasoning_text, "reasoning text ");
+
+        // Second chunk - continues reasoning until end token
+        let result2 = parser
+            .parse_reasoning_streaming_incremental("more reasoning</think>answer")
+            .unwrap();
+        assert_eq!(result2.normal_text, "answer");
+        assert_eq!(result2.reasoning_text, "more reasoning");
+    }
+
+    #[test]
+    fn test_model_type() {
+        let parser = Step3Parser::new();
+        assert_eq!(parser.model_type(), "step3");
+    }
+}
diff --git a/sgl-router/src/reasoning_parser/traits.rs b/sgl-router/src/reasoning_parser/traits.rs
new file mode 100644
index 000000000..160fa51d9
--- /dev/null
+++ b/sgl-router/src/reasoning_parser/traits.rs
@@ -0,0 +1,130 @@
+use std::fmt;
+
+/// Result of parsing text for reasoning content.
+#[derive(Debug, Clone, Default, PartialEq)]
+pub struct ParserResult {
+    /// The normal text outside of reasoning blocks.
+    pub normal_text: String,
+
+    /// The extracted reasoning text from within reasoning blocks.
+    pub reasoning_text: String,
+}
+
+impl ParserResult {
+    /// Create a new ParserResult with the given normal and reasoning text.
+    pub fn new(normal_text: String, reasoning_text: String) -> Self {
+        Self {
+            normal_text,
+            reasoning_text,
+        }
+    }
+
+    /// Create a result with only normal text.
+    pub fn normal(text: String) -> Self {
+        Self {
+            normal_text: text,
+            reasoning_text: String::new(),
+        }
+    }
+
+    /// Create a result with only reasoning text.
+    pub fn reasoning(text: String) -> Self {
+        Self {
+            normal_text: String::new(),
+            reasoning_text: text,
+        }
+    }
+
+    /// Check if this result contains any text.
+    pub fn is_empty(&self) -> bool {
+        self.normal_text.is_empty() && self.reasoning_text.is_empty()
+    }
+}
+
+/// Trait for parsing reasoning content from LLM outputs.
+pub trait ReasoningParser: Send + Sync {
+    /// Detects and parses reasoning from the input text (one-time parsing).
+    ///
+    /// This method is used for non-streaming scenarios where the complete
+    /// text is available at once.
+    ///
+    /// Returns an error if the text exceeds buffer limits or contains invalid UTF-8.
+    fn detect_and_parse_reasoning(&mut self, text: &str) -> Result<ParserResult, ParseError>;
+
+    /// Parses reasoning incrementally from streaming input.
+    ///
+    /// This method maintains internal state across calls to handle partial
+    /// tokens and chunk boundaries correctly.
+    ///
+    /// Returns an error if the buffer exceeds max_buffer_size.
+    fn parse_reasoning_streaming_incremental(
+        &mut self,
+        text: &str,
+    ) -> Result<ParserResult, ParseError>;
+
+    /// Reset the parser state for reuse.
+    ///
+    /// This should clear any buffers and reset flags to initial state.
+    fn reset(&mut self);
+
+    /// Get the model type this parser is designed for.
+    fn model_type(&self) -> &str;
+}
+
+/// Error types for reasoning parsing operations.
+#[derive(Debug, thiserror::Error)]
+pub enum ParseError {
+    #[error("Invalid UTF-8 in stream: {0}")]
+    Utf8Error(#[from] std::str::Utf8Error),
+
+    #[error("Buffer overflow: {0} bytes exceeds maximum")]
+    BufferOverflow(usize),
+
+    #[error("Unknown model type: {0}")]
+    UnknownModel(String),
+
+    #[error("Parser configuration error: {0}")]
+    ConfigError(String),
+}
+
+/// Configuration for parser behavior.
+#[derive(Debug, Clone)]
+pub struct ParserConfig {
+    /// The token that marks the start of reasoning content.
+    pub think_start_token: String,
+
+    /// The token that marks the end of reasoning content.
+    pub think_end_token: String,
+
+    /// Whether to stream reasoning content as it arrives.
+    pub stream_reasoning: bool,
+
+    /// Maximum buffer size in bytes.
+    pub max_buffer_size: usize,
+
+    /// Initial state for in_reasoning flag (fixed per parser type).
+    pub initial_in_reasoning: bool,
+}
+
+impl Default for ParserConfig {
+    fn default() -> Self {
+        Self {
+            think_start_token: "<think>".to_string(),
+            think_end_token: "</think>".to_string(),
+            stream_reasoning: true,
+            max_buffer_size: 65536,      // 64KB default
+            initial_in_reasoning: false, // Default to false (explicit reasoning)
+        }
+    }
+}
+
+impl fmt::Display for ParserResult {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "ParserResult {{ normal: {} chars, reasoning: {} chars }}",
+            self.normal_text.len(),
+            self.reasoning_text.len()
+        )
+    }
+}
diff --git a/sgl-router/src/routers/factory.rs b/sgl-router/src/routers/factory.rs
new file mode 100644
index 000000000..d1bdc0fce
--- /dev/null
+++ b/sgl-router/src/routers/factory.rs
@@ -0,0 +1,195 @@
+//! Factory for creating router instances
+
+use super::{
+    http::{openai_router::OpenAIRouter, pd_router::PDRouter, router::Router},
+    RouterTrait,
+};
+use crate::config::{ConnectionMode, PolicyConfig, RoutingMode};
+use crate::policies::PolicyFactory;
+use crate::server::AppContext;
+use std::sync::Arc;
+
+/// Factory for creating router instances based on configuration
+pub struct RouterFactory;
+
+impl RouterFactory {
+    /// Create a router instance from application context
+    pub async fn create_router(ctx: &Arc<AppContext>) -> Result<Box<dyn RouterTrait>, String> {
+        // Check if IGW mode is enabled
+        if ctx.router_config.enable_igw {
+            return Self::create_igw_router(ctx).await;
+        }
+
+        // Check connection mode and route to appropriate implementation
+        match ctx.router_config.connection_mode {
+            ConnectionMode::Grpc => {
+                // Route to gRPC implementation based on routing mode
+                match &ctx.router_config.mode {
+                    RoutingMode::Regular { worker_urls } => {
+                        Self::create_grpc_router(worker_urls, &ctx.router_config.policy, ctx).await
+                    }
+                    RoutingMode::PrefillDecode {
+                        prefill_urls,
+                        decode_urls,
+                        prefill_policy,
+                        decode_policy,
+                    } => {
+                        Self::create_grpc_pd_router(
+                            prefill_urls,
+                            decode_urls,
+                            prefill_policy.as_ref(),
+                            decode_policy.as_ref(),
+                            &ctx.router_config.policy,
+                            ctx,
+                        )
+                        .await
+                    }
+                    RoutingMode::OpenAI { .. } => {
+                        Err("OpenAI mode requires HTTP connection_mode".to_string())
+                    }
+                }
+            }
+            ConnectionMode::Http => {
+                // Route to HTTP implementation based on routing mode
+                match &ctx.router_config.mode {
+                    RoutingMode::Regular { worker_urls } => {
+                        Self::create_regular_router(worker_urls, &ctx.router_config.policy, ctx)
+                            .await
+                    }
+                    RoutingMode::PrefillDecode {
+                        prefill_urls,
+                        decode_urls,
+                        prefill_policy,
+                        decode_policy,
+                    } => {
+                        Self::create_pd_router(
+                            prefill_urls,
+                            decode_urls,
+                            prefill_policy.as_ref(),
+                            decode_policy.as_ref(),
+                            &ctx.router_config.policy,
+                            ctx,
+                        )
+                        .await
+                    }
+                    RoutingMode::OpenAI { worker_urls, .. } => {
+                        Self::create_openai_router(worker_urls.clone(), ctx).await
+                    }
+                }
+            }
+        }
+    }
+
+    /// Create a regular router with injected policy
+    async fn create_regular_router(
+        worker_urls: &[String],
+        policy_config: &PolicyConfig,
+        ctx: &Arc<AppContext>,
+    ) -> Result<Box<dyn RouterTrait>, String> {
+        // Create policy
+        let policy = PolicyFactory::create_from_config(policy_config);
+
+        // Create regular router with injected policy and context
+        let router = Router::new(worker_urls.to_vec(), policy, ctx).await?;
+
+        Ok(Box::new(router))
+    }
+
+    /// Create a PD router with injected policy
+    async fn create_pd_router(
+        prefill_urls: &[(String, Option<u16>)],
+        decode_urls: &[String],
+        prefill_policy_config: Option<&PolicyConfig>,
+        decode_policy_config: Option<&PolicyConfig>,
+        main_policy_config: &PolicyConfig,
+        ctx: &Arc<AppContext>,
+    ) -> Result<Box<dyn RouterTrait>, String> {
+        // Create policies - use specific policies if provided, otherwise fall back to main policy
+        let prefill_policy =
+            PolicyFactory::create_from_config(prefill_policy_config.unwrap_or(main_policy_config));
+        let decode_policy =
+            PolicyFactory::create_from_config(decode_policy_config.unwrap_or(main_policy_config));
+
+        // Create PD router with separate policies and context
+        let router = PDRouter::new(
+            prefill_urls.to_vec(),
+            decode_urls.to_vec(),
+            prefill_policy,
+            decode_policy,
+            ctx,
+        )
+        .await?;
+
+        Ok(Box::new(router))
+    }
+
+    /// Create a gRPC router with injected policy
+    pub async fn create_grpc_router(
+        worker_urls: &[String],
+        policy_config: &PolicyConfig,
+        ctx: &Arc<AppContext>,
+    ) -> Result<Box<dyn RouterTrait>, String> {
+        use super::grpc::router::GrpcRouter;
+
+        // Create policy
+        let policy = PolicyFactory::create_from_config(policy_config);
+
+        // Create gRPC router with context
+        let router = GrpcRouter::new(worker_urls.to_vec(), policy, ctx).await?;
+
+        Ok(Box::new(router))
+    }
+
+    /// Create a gRPC PD router with tokenizer and worker configuration
+    pub async fn create_grpc_pd_router(
+        prefill_urls: &[(String, Option<u16>)],
+        decode_urls: &[String],
+        prefill_policy_config: Option<&PolicyConfig>,
+        decode_policy_config: Option<&PolicyConfig>,
+        main_policy_config: &PolicyConfig,
+        ctx: &Arc<AppContext>,
+    ) -> Result<Box<dyn RouterTrait>, String> {
+        use super::grpc::pd_router::GrpcPDRouter;
+
+        // Create policies - use specific policies if provided, otherwise fall back to main policy
+        let prefill_policy =
+            PolicyFactory::create_from_config(prefill_policy_config.unwrap_or(main_policy_config));
+        let decode_policy =
+            PolicyFactory::create_from_config(decode_policy_config.unwrap_or(main_policy_config));
+
+        // Create gRPC PD router with context
+        let router = GrpcPDRouter::new(
+            prefill_urls.to_vec(),
+            decode_urls.to_vec(),
+            prefill_policy,
+            decode_policy,
+            ctx,
+        )
+        .await?;
+
+        Ok(Box::new(router))
+    }
+
+    /// Create an OpenAI router
+    async fn create_openai_router(
+        worker_urls: Vec<String>,
+        ctx: &Arc<AppContext>,
+    ) -> Result<Box<dyn RouterTrait>, String> {
+        // Use the first worker URL as the OpenAI-compatible base
+        let base_url = worker_urls
+            .first()
+            .cloned()
+            .ok_or_else(|| "OpenAI mode requires at least one worker URL".to_string())?;
+
+        let router =
+            OpenAIRouter::new(base_url, Some(ctx.router_config.circuit_breaker.clone())).await?;
+
+        Ok(Box::new(router))
+    }
+
+    /// Create an IGW router (placeholder for future implementation)
+    async fn create_igw_router(_ctx: &Arc<AppContext>) -> Result<Box<dyn RouterTrait>, String> {
+        // For now, return an error indicating IGW is not yet implemented
+        Err("IGW mode is not yet implemented".to_string())
+    }
+}
diff --git a/sgl-router/src/routers/grpc/mod.rs b/sgl-router/src/routers/grpc/mod.rs
new file mode 100644
index 000000000..a6a5d8eec
--- /dev/null
+++ b/sgl-router/src/routers/grpc/mod.rs
@@ -0,0 +1,4 @@
+//! gRPC router implementations
+
+pub mod pd_router;
+pub mod router;
diff --git a/sgl-router/src/routers/grpc/pd_router.rs b/sgl-router/src/routers/grpc/pd_router.rs
new file mode 100644
index 000000000..d227b460d
--- /dev/null
+++ b/sgl-router/src/routers/grpc/pd_router.rs
@@ -0,0 +1,328 @@
+// PD (Prefill-Decode) gRPC Router Implementation
+
+use crate::config::types::RetryConfig;
+use crate::core::{
+    BasicWorker, CircuitBreakerConfig, HealthChecker, HealthConfig, Worker, WorkerType,
+};
+use crate::grpc::SglangSchedulerClient;
+use crate::metrics::RouterMetrics;
+use crate::policies::LoadBalancingPolicy;
+use crate::reasoning_parser::ParserFactory;
+use crate::routers::{RouterTrait, WorkerManagement};
+use crate::tokenizer::traits::Tokenizer;
+use crate::tool_parser::ParserRegistry;
+use async_trait::async_trait;
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{HeaderMap, StatusCode},
+    response::{IntoResponse, Response},
+};
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+use std::time::Duration;
+use tracing::{info, warn};
+
+/// gRPC PD (Prefill-Decode) router implementation for SGLang
+#[allow(dead_code)] // Fields will be used once implementation is complete
+pub struct GrpcPDRouter {
+    /// Prefill worker connections
+    prefill_workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
+    /// Decode worker connections
+    decode_workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
+    /// gRPC clients for prefill workers
+    prefill_grpc_clients: Arc<RwLock<HashMap<String, SglangSchedulerClient>>>,
+    /// gRPC clients for decode workers
+    decode_grpc_clients: Arc<RwLock<HashMap<String, SglangSchedulerClient>>>,
+    /// Load balancing policy for prefill
+    prefill_policy: Arc<dyn LoadBalancingPolicy>,
+    /// Load balancing policy for decode
+    decode_policy: Arc<dyn LoadBalancingPolicy>,
+    /// Tokenizer for handling text encoding/decoding
+    tokenizer: Arc<dyn Tokenizer>,
+    /// Reasoning parser factory for structured reasoning outputs
+    reasoning_parser_factory: ParserFactory,
+    /// Tool parser registry for function/tool calls
+    tool_parser_registry: &'static ParserRegistry,
+    /// Worker health checkers
+    _prefill_health_checker: Option<HealthChecker>,
+    _decode_health_checker: Option<HealthChecker>,
+    /// Configuration
+    timeout_secs: u64,
+    interval_secs: u64,
+    dp_aware: bool,
+    api_key: Option<String>,
+    retry_config: RetryConfig,
+    circuit_breaker_config: CircuitBreakerConfig,
+}
+
+impl GrpcPDRouter {
+    /// Create a new gRPC PD router
+    pub async fn new(
+        prefill_urls: Vec<(String, Option<u16>)>,
+        decode_urls: Vec<String>,
+        prefill_policy: Arc<dyn LoadBalancingPolicy>,
+        decode_policy: Arc<dyn LoadBalancingPolicy>,
+        ctx: &Arc<crate::server::AppContext>,
+    ) -> Result<Self, String> {
+        // Update metrics
+        RouterMetrics::set_active_workers(prefill_urls.len() + decode_urls.len());
+
+        // Extract necessary components from context
+        let tokenizer = ctx
+            .tokenizer
+            .as_ref()
+            .ok_or_else(|| "gRPC PD router requires tokenizer".to_string())?
+            .clone();
+        let reasoning_parser_factory = ctx
+            .reasoning_parser_factory
+            .as_ref()
+            .ok_or_else(|| "gRPC PD router requires reasoning parser factory".to_string())?
+            .clone();
+        let tool_parser_registry = ctx
+            .tool_parser_registry
+            .ok_or_else(|| "gRPC PD router requires tool parser registry".to_string())?;
+
+        // Convert config CircuitBreakerConfig to core CircuitBreakerConfig
+        let circuit_breaker_config = ctx.router_config.effective_circuit_breaker_config();
+        let core_cb_config = CircuitBreakerConfig {
+            failure_threshold: circuit_breaker_config.failure_threshold,
+            success_threshold: circuit_breaker_config.success_threshold,
+            timeout_duration: Duration::from_secs(circuit_breaker_config.timeout_duration_secs),
+            window_duration: Duration::from_secs(circuit_breaker_config.window_duration_secs),
+        };
+
+        // Create gRPC clients for prefill workers
+        let mut prefill_grpc_clients = HashMap::new();
+        for (url, _bootstrap_port) in &prefill_urls {
+            match SglangSchedulerClient::connect(url).await {
+                Ok(client) => {
+                    prefill_grpc_clients.insert(url.clone(), client);
+                    info!("Connected to gRPC prefill worker at {}", url);
+                }
+                Err(e) => {
+                    warn!("Failed to connect to gRPC prefill worker at {}: {}", url, e);
+                    // Continue with other workers
+                }
+            }
+        }
+
+        // Create gRPC clients for decode workers
+        let mut decode_grpc_clients = HashMap::new();
+        for url in &decode_urls {
+            match SglangSchedulerClient::connect(url).await {
+                Ok(client) => {
+                    decode_grpc_clients.insert(url.clone(), client);
+                    info!("Connected to gRPC decode worker at {}", url);
+                }
+                Err(e) => {
+                    warn!("Failed to connect to gRPC decode worker at {}: {}", url, e);
+                    // Continue with other workers
+                }
+            }
+        }
+
+        if prefill_grpc_clients.is_empty() && decode_grpc_clients.is_empty() {
+            return Err("Failed to connect to any gRPC workers".to_string());
+        }
+
+        // Create Prefill Worker trait objects with gRPC connection mode
+        let prefill_workers: Vec<Box<dyn Worker>> = prefill_urls
+            .iter()
+            .map(|(url, bootstrap_port)| {
+                let worker = BasicWorker::with_connection_mode(
+                    url.clone(),
+                    WorkerType::Prefill {
+                        bootstrap_port: *bootstrap_port,
+                    },
+                    crate::core::ConnectionMode::Grpc {
+                        port: *bootstrap_port,
+                    },
+                )
+                .with_circuit_breaker_config(core_cb_config.clone())
+                .with_health_config(HealthConfig {
+                    timeout_secs: ctx.router_config.health_check.timeout_secs,
+                    check_interval_secs: ctx.router_config.health_check.check_interval_secs,
+                    endpoint: ctx.router_config.health_check.endpoint.clone(),
+                    failure_threshold: ctx.router_config.health_check.failure_threshold,
+                    success_threshold: ctx.router_config.health_check.success_threshold,
+                });
+                Box::new(worker) as Box<dyn Worker>
+            })
+            .collect();
+
+        // Create Decode Worker trait objects with gRPC connection mode
+        let decode_workers: Vec<Box<dyn Worker>> = decode_urls
+            .iter()
+            .map(|url| {
+                let worker = BasicWorker::with_connection_mode(
+                    url.clone(),
+                    WorkerType::Decode,
+                    crate::core::ConnectionMode::Grpc { port: None },
+                )
+                .with_circuit_breaker_config(core_cb_config.clone())
+                .with_health_config(HealthConfig {
+                    timeout_secs: ctx.router_config.health_check.timeout_secs,
+                    check_interval_secs: ctx.router_config.health_check.check_interval_secs,
+                    endpoint: ctx.router_config.health_check.endpoint.clone(),
+                    failure_threshold: ctx.router_config.health_check.failure_threshold,
+                    success_threshold: ctx.router_config.health_check.success_threshold,
+                });
+                Box::new(worker) as Box<dyn Worker>
+            })
+            .collect();
+
+        // Initialize policies with workers if needed
+        if let Some(cache_aware) = prefill_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_aware.init_workers(&prefill_workers);
+        }
+
+        if let Some(cache_aware) = decode_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_aware.init_workers(&decode_workers);
+        }
+
+        let prefill_workers = Arc::new(RwLock::new(prefill_workers));
+        let decode_workers = Arc::new(RwLock::new(decode_workers));
+
+        let prefill_health_checker = crate::core::start_health_checker(
+            Arc::clone(&prefill_workers),
+            ctx.router_config.worker_startup_check_interval_secs,
+        );
+        let decode_health_checker = crate::core::start_health_checker(
+            Arc::clone(&decode_workers),
+            ctx.router_config.worker_startup_check_interval_secs,
+        );
+
+        Ok(GrpcPDRouter {
+            prefill_workers,
+            decode_workers,
+            prefill_grpc_clients: Arc::new(RwLock::new(prefill_grpc_clients)),
+            decode_grpc_clients: Arc::new(RwLock::new(decode_grpc_clients)),
+            prefill_policy,
+            decode_policy,
+            tokenizer,
+            reasoning_parser_factory,
+            tool_parser_registry,
+            _prefill_health_checker: Some(prefill_health_checker),
+            _decode_health_checker: Some(decode_health_checker),
+            timeout_secs: ctx.router_config.worker_startup_timeout_secs,
+            interval_secs: ctx.router_config.worker_startup_check_interval_secs,
+            dp_aware: ctx.router_config.dp_aware,
+            api_key: ctx.router_config.api_key.clone(),
+            retry_config: ctx.router_config.effective_retry_config(),
+            circuit_breaker_config: core_cb_config,
+        })
+    }
+}
+
+impl std::fmt::Debug for GrpcPDRouter {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("GrpcPDRouter")
+            .field(
+                "prefill_workers_count",
+                &self.prefill_workers.read().unwrap().len(),
+            )
+            .field(
+                "decode_workers_count",
+                &self.decode_workers.read().unwrap().len(),
+            )
+            .field("timeout_secs", &self.timeout_secs)
+            .field("interval_secs", &self.interval_secs)
+            .field("dp_aware", &self.dp_aware)
+            .finish()
+    }
+}
+
+#[async_trait]
+impl RouterTrait for GrpcPDRouter {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    async fn health(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn health_generate(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_server_info(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_models(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_model_info(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_generate(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &crate::protocols::spec::GenerateRequest,
+    ) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_chat(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &crate::protocols::spec::ChatCompletionRequest,
+    ) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_completion(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &crate::protocols::spec::CompletionRequest,
+    ) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_rerank(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn flush_cache(&self) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_worker_loads(&self) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    fn router_type(&self) -> &'static str {
+        "grpc_pd"
+    }
+
+    fn readiness(&self) -> Response {
+        (StatusCode::SERVICE_UNAVAILABLE).into_response()
+    }
+}
+
+#[async_trait]
+impl WorkerManagement for GrpcPDRouter {
+    async fn add_worker(&self, _worker_url: &str) -> Result<String, String> {
+        Err("Not implemented".to_string())
+    }
+
+    fn remove_worker(&self, _worker_url: &str) {}
+
+    fn get_worker_urls(&self) -> Vec<String> {
+        vec![]
+    }
+}
diff --git a/sgl-router/src/routers/grpc/router.rs b/sgl-router/src/routers/grpc/router.rs
new file mode 100644
index 000000000..5c499125f
--- /dev/null
+++ b/sgl-router/src/routers/grpc/router.rs
@@ -0,0 +1,266 @@
+// gRPC Router Implementation
+
+use crate::config::types::RetryConfig;
+use crate::core::{
+    BasicWorker, CircuitBreakerConfig, HealthChecker, HealthConfig, Worker, WorkerType,
+};
+use crate::grpc::SglangSchedulerClient;
+use crate::metrics::RouterMetrics;
+use crate::policies::LoadBalancingPolicy;
+use crate::reasoning_parser::ParserFactory;
+use crate::routers::{RouterTrait, WorkerManagement};
+use crate::tokenizer::traits::Tokenizer;
+use crate::tool_parser::ParserRegistry;
+use async_trait::async_trait;
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{HeaderMap, StatusCode},
+    response::{IntoResponse, Response},
+};
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+use std::time::Duration;
+use tracing::{info, warn};
+
+/// gRPC router implementation for SGLang
+#[allow(dead_code)] // Fields will be used once implementation is complete
+pub struct GrpcRouter {
+    /// Worker connections
+    workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
+    /// gRPC clients for each worker
+    grpc_clients: Arc<RwLock<HashMap<String, SglangSchedulerClient>>>,
+    /// Load balancing policy
+    policy: Arc<dyn LoadBalancingPolicy>,
+    /// Tokenizer for handling text encoding/decoding
+    tokenizer: Arc<dyn Tokenizer>,
+    /// Reasoning parser factory for structured reasoning outputs
+    reasoning_parser_factory: ParserFactory,
+    /// Tool parser registry for function/tool calls
+    tool_parser_registry: &'static ParserRegistry,
+    /// Worker health checker
+    _health_checker: Option<HealthChecker>,
+    /// Configuration
+    timeout_secs: u64,
+    interval_secs: u64,
+    dp_aware: bool,
+    api_key: Option<String>,
+    retry_config: RetryConfig,
+    circuit_breaker_config: CircuitBreakerConfig,
+}
+
+impl GrpcRouter {
+    /// Create a new gRPC router
+    pub async fn new(
+        worker_urls: Vec<String>,
+        policy: Arc<dyn LoadBalancingPolicy>,
+        ctx: &Arc<crate::server::AppContext>,
+    ) -> Result<Self, String> {
+        // Update metrics
+        RouterMetrics::set_active_workers(worker_urls.len());
+
+        // Extract necessary components from context
+        let tokenizer = ctx
+            .tokenizer
+            .as_ref()
+            .ok_or_else(|| "gRPC router requires tokenizer".to_string())?
+            .clone();
+        let reasoning_parser_factory = ctx
+            .reasoning_parser_factory
+            .as_ref()
+            .ok_or_else(|| "gRPC router requires reasoning parser factory".to_string())?
+            .clone();
+        let tool_parser_registry = ctx
+            .tool_parser_registry
+            .ok_or_else(|| "gRPC router requires tool parser registry".to_string())?;
+
+        // Convert config CircuitBreakerConfig to core CircuitBreakerConfig
+        let circuit_breaker_config = ctx.router_config.effective_circuit_breaker_config();
+        let core_cb_config = CircuitBreakerConfig {
+            failure_threshold: circuit_breaker_config.failure_threshold,
+            success_threshold: circuit_breaker_config.success_threshold,
+            timeout_duration: Duration::from_secs(circuit_breaker_config.timeout_duration_secs),
+            window_duration: Duration::from_secs(circuit_breaker_config.window_duration_secs),
+        };
+
+        // Create gRPC clients for each worker
+        let mut grpc_clients = HashMap::new();
+        for url in &worker_urls {
+            match SglangSchedulerClient::connect(url).await {
+                Ok(client) => {
+                    grpc_clients.insert(url.clone(), client);
+                    info!("Connected to gRPC worker at {}", url);
+                }
+                Err(e) => {
+                    warn!("Failed to connect to gRPC worker at {}: {}", url, e);
+                    // Continue with other workers
+                }
+            }
+        }
+
+        if grpc_clients.is_empty() {
+            return Err("Failed to connect to any gRPC workers".to_string());
+        }
+
+        // Create Worker trait objects with gRPC connection mode
+        let mut workers: Vec<Box<dyn Worker>> = Vec::new();
+
+        // Move clients from the HashMap to the workers
+        for url in &worker_urls {
+            if let Some(client) = grpc_clients.remove(url) {
+                let worker = BasicWorker::with_connection_mode(
+                    url.clone(),
+                    WorkerType::Regular,
+                    crate::core::ConnectionMode::Grpc { port: None },
+                )
+                .with_circuit_breaker_config(core_cb_config.clone())
+                .with_health_config(HealthConfig {
+                    timeout_secs: ctx.router_config.health_check.timeout_secs,
+                    check_interval_secs: ctx.router_config.health_check.check_interval_secs,
+                    endpoint: ctx.router_config.health_check.endpoint.clone(),
+                    failure_threshold: ctx.router_config.health_check.failure_threshold,
+                    success_threshold: ctx.router_config.health_check.success_threshold,
+                })
+                .with_grpc_client(client);
+
+                workers.push(Box::new(worker) as Box<dyn Worker>);
+            } else {
+                warn!("No gRPC client for worker {}, skipping", url);
+            }
+        }
+
+        // Initialize policy with workers if needed
+        if let Some(cache_aware) = policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_aware.init_workers(&workers);
+        }
+
+        let workers = Arc::new(RwLock::new(workers));
+        let health_checker = crate::core::start_health_checker(
+            Arc::clone(&workers),
+            ctx.router_config.worker_startup_check_interval_secs,
+        );
+
+        Ok(GrpcRouter {
+            workers,
+            grpc_clients: Arc::new(RwLock::new(grpc_clients)),
+            policy,
+            tokenizer,
+            reasoning_parser_factory,
+            tool_parser_registry,
+            _health_checker: Some(health_checker),
+            timeout_secs: ctx.router_config.worker_startup_timeout_secs,
+            interval_secs: ctx.router_config.worker_startup_check_interval_secs,
+            dp_aware: ctx.router_config.dp_aware,
+            api_key: ctx.router_config.api_key.clone(),
+            retry_config: ctx.router_config.effective_retry_config(),
+            circuit_breaker_config: core_cb_config,
+        })
+    }
+}
+
+impl std::fmt::Debug for GrpcRouter {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("GrpcRouter")
+            .field("workers_count", &self.workers.read().unwrap().len())
+            .field("timeout_secs", &self.timeout_secs)
+            .field("interval_secs", &self.interval_secs)
+            .field("dp_aware", &self.dp_aware)
+            .finish()
+    }
+}
+
+#[async_trait]
+impl RouterTrait for GrpcRouter {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    async fn health(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn health_generate(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_server_info(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_models(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_model_info(&self, _req: Request<Body>) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_generate(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &crate::protocols::spec::GenerateRequest,
+    ) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_chat(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &crate::protocols::spec::ChatCompletionRequest,
+    ) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_completion(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &crate::protocols::spec::CompletionRequest,
+    ) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn route_rerank(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn flush_cache(&self) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    async fn get_worker_loads(&self) -> Response {
+        (StatusCode::NOT_IMPLEMENTED).into_response()
+    }
+
+    fn router_type(&self) -> &'static str {
+        "grpc"
+    }
+
+    fn readiness(&self) -> Response {
+        (StatusCode::SERVICE_UNAVAILABLE).into_response()
+    }
+}
+
+#[async_trait]
+impl WorkerManagement for GrpcRouter {
+    async fn add_worker(&self, _worker_url: &str) -> Result<String, String> {
+        Err("Not implemented".to_string())
+    }
+
+    fn remove_worker(&self, _worker_url: &str) {}
+
+    fn get_worker_urls(&self) -> Vec<String> {
+        self.workers
+            .read()
+            .unwrap()
+            .iter()
+            .map(|w| w.url().to_string())
+            .collect()
+    }
+}
diff --git a/sgl-router/src/routers/header_utils.rs b/sgl-router/src/routers/header_utils.rs
new file mode 100644
index 000000000..0adab5bf0
--- /dev/null
+++ b/sgl-router/src/routers/header_utils.rs
@@ -0,0 +1,53 @@
+use axum::body::Body;
+use axum::extract::Request;
+use axum::http::HeaderMap;
+
+/// Copy request headers to a Vec of name-value string pairs
+/// Used for forwarding headers to backend workers
+pub fn copy_request_headers(req: &Request<Body>) -> Vec<(String, String)> {
+    req.headers()
+        .iter()
+        .filter_map(|(name, value)| {
+            // Convert header value to string, skipping non-UTF8 headers
+            value
+                .to_str()
+                .ok()
+                .map(|v| (name.to_string(), v.to_string()))
+        })
+        .collect()
+}
+
+/// Convert headers from reqwest Response to axum HeaderMap
+/// Filters out hop-by-hop headers that shouldn't be forwarded
+pub fn preserve_response_headers(reqwest_headers: &HeaderMap) -> HeaderMap {
+    let mut headers = HeaderMap::new();
+
+    for (name, value) in reqwest_headers.iter() {
+        // Skip hop-by-hop headers that shouldn't be forwarded
+        let name_str = name.as_str().to_lowercase();
+        if should_forward_header(&name_str) {
+            // The original name and value are already valid, so we can just clone them
+            headers.insert(name.clone(), value.clone());
+        }
+    }
+
+    headers
+}
+
+/// Determine if a header should be forwarded from backend to client
+fn should_forward_header(name: &str) -> bool {
+    // List of headers that should NOT be forwarded (hop-by-hop headers)
+    !matches!(
+        name,
+        "connection" |
+        "keep-alive" |
+        "proxy-authenticate" |
+        "proxy-authorization" |
+        "te" |
+        "trailers" |
+        "transfer-encoding" |
+        "upgrade" |
+        "content-encoding" | // Let axum/hyper handle encoding
+        "host" // Should not forward the backend's host header
+    )
+}
diff --git a/sgl-router/src/routers/http/mod.rs b/sgl-router/src/routers/http/mod.rs
new file mode 100644
index 000000000..9f955b651
--- /dev/null
+++ b/sgl-router/src/routers/http/mod.rs
@@ -0,0 +1,6 @@
+//! HTTP router implementations
+
+pub mod openai_router;
+pub mod pd_router;
+pub mod pd_types;
+pub mod router;
diff --git a/sgl-router/src/routers/http/openai_router.rs b/sgl-router/src/routers/http/openai_router.rs
new file mode 100644
index 000000000..551dd1aa3
--- /dev/null
+++ b/sgl-router/src/routers/http/openai_router.rs
@@ -0,0 +1,379 @@
+//! OpenAI router implementation (reqwest-based)
+
+use crate::config::CircuitBreakerConfig;
+use crate::core::{CircuitBreaker, CircuitBreakerConfig as CoreCircuitBreakerConfig};
+use crate::protocols::spec::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
+use async_trait::async_trait;
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{header::CONTENT_TYPE, HeaderMap, HeaderValue, StatusCode},
+    response::{IntoResponse, Response},
+};
+use futures_util::StreamExt;
+use std::{
+    any::Any,
+    sync::atomic::{AtomicBool, Ordering},
+};
+
+/// Router for OpenAI backend
+#[derive(Debug)]
+pub struct OpenAIRouter {
+    /// HTTP client for upstream OpenAI-compatible API
+    client: reqwest::Client,
+    /// Base URL for identification (no trailing slash)
+    base_url: String,
+    /// Circuit breaker
+    circuit_breaker: CircuitBreaker,
+    /// Health status
+    healthy: AtomicBool,
+}
+
+impl OpenAIRouter {
+    /// Create a new OpenAI router
+    pub async fn new(
+        base_url: String,
+        circuit_breaker_config: Option<CircuitBreakerConfig>,
+    ) -> Result<Self, String> {
+        let client = reqwest::Client::builder()
+            .timeout(std::time::Duration::from_secs(300))
+            .build()
+            .map_err(|e| format!("Failed to create HTTP client: {}", e))?;
+
+        let base_url = base_url.trim_end_matches('/').to_string();
+
+        // Convert circuit breaker config
+        let core_cb_config = circuit_breaker_config
+            .map(|cb| CoreCircuitBreakerConfig {
+                failure_threshold: cb.failure_threshold,
+                success_threshold: cb.success_threshold,
+                timeout_duration: std::time::Duration::from_secs(cb.timeout_duration_secs),
+                window_duration: std::time::Duration::from_secs(cb.window_duration_secs),
+            })
+            .unwrap_or_default();
+
+        let circuit_breaker = CircuitBreaker::with_config(core_cb_config);
+
+        Ok(Self {
+            client,
+            base_url,
+            circuit_breaker,
+            healthy: AtomicBool::new(true),
+        })
+    }
+}
+
+#[async_trait]
+impl super::super::WorkerManagement for OpenAIRouter {
+    async fn add_worker(&self, _worker_url: &str) -> Result<String, String> {
+        Err("Cannot add workers to OpenAI router".to_string())
+    }
+
+    fn remove_worker(&self, _worker_url: &str) {
+        // No-op for OpenAI router
+    }
+
+    fn get_worker_urls(&self) -> Vec<String> {
+        vec![self.base_url.clone()]
+    }
+}
+
+#[async_trait]
+impl super::super::RouterTrait for OpenAIRouter {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    async fn health(&self, _req: Request<Body>) -> Response {
+        // Simple upstream probe: GET {base}/v1/models without auth
+        let url = format!("{}/v1/models", self.base_url);
+        match self
+            .client
+            .get(&url)
+            .timeout(std::time::Duration::from_secs(2))
+            .send()
+            .await
+        {
+            Ok(resp) => {
+                let code = resp.status();
+                // Treat success and auth-required as healthy (endpoint reachable)
+                if code.is_success() || code.as_u16() == 401 || code.as_u16() == 403 {
+                    (StatusCode::OK, "OK").into_response()
+                } else {
+                    (
+                        StatusCode::SERVICE_UNAVAILABLE,
+                        format!("Upstream status: {}", code),
+                    )
+                        .into_response()
+                }
+            }
+            Err(e) => (
+                StatusCode::SERVICE_UNAVAILABLE,
+                format!("Upstream error: {}", e),
+            )
+                .into_response(),
+        }
+    }
+
+    async fn health_generate(&self, _req: Request<Body>) -> Response {
+        // For OpenAI, health_generate is the same as health
+        self.health(_req).await
+    }
+
+    async fn get_server_info(&self, _req: Request<Body>) -> Response {
+        let info = serde_json::json!({
+            "router_type": "openai",
+            "workers": 1,
+            "base_url": &self.base_url
+        });
+        (StatusCode::OK, info.to_string()).into_response()
+    }
+
+    async fn get_models(&self, req: Request<Body>) -> Response {
+        // Proxy to upstream /v1/models; forward Authorization header if provided
+        let headers = req.headers();
+
+        let mut upstream = self.client.get(format!("{}/v1/models", self.base_url));
+
+        if let Some(auth) = headers
+            .get("authorization")
+            .or_else(|| headers.get("Authorization"))
+        {
+            upstream = upstream.header("Authorization", auth);
+        }
+
+        match upstream.send().await {
+            Ok(res) => {
+                let status = StatusCode::from_u16(res.status().as_u16())
+                    .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+                let content_type = res.headers().get(CONTENT_TYPE).cloned();
+                match res.bytes().await {
+                    Ok(body) => {
+                        let mut response = Response::new(axum::body::Body::from(body));
+                        *response.status_mut() = status;
+                        if let Some(ct) = content_type {
+                            response.headers_mut().insert(CONTENT_TYPE, ct);
+                        }
+                        response
+                    }
+                    Err(e) => (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Failed to read upstream response: {}", e),
+                    )
+                        .into_response(),
+                }
+            }
+            Err(e) => (
+                StatusCode::BAD_GATEWAY,
+                format!("Failed to contact upstream: {}", e),
+            )
+                .into_response(),
+        }
+    }
+
+    async fn get_model_info(&self, _req: Request<Body>) -> Response {
+        // Not directly supported without model param; return 501
+        (
+            StatusCode::NOT_IMPLEMENTED,
+            "get_model_info not implemented for OpenAI router",
+        )
+            .into_response()
+    }
+
+    async fn route_generate(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &GenerateRequest,
+    ) -> Response {
+        // Generate endpoint is SGLang-specific, not supported for OpenAI backend
+        (
+            StatusCode::NOT_IMPLEMENTED,
+            "Generate endpoint not supported for OpenAI backend",
+        )
+            .into_response()
+    }
+
+    async fn route_chat(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+    ) -> Response {
+        if !self.circuit_breaker.can_execute() {
+            return (StatusCode::SERVICE_UNAVAILABLE, "Circuit breaker open").into_response();
+        }
+
+        // Serialize request body, removing SGLang-only fields
+        let mut payload = match serde_json::to_value(body) {
+            Ok(v) => v,
+            Err(e) => {
+                return (
+                    StatusCode::BAD_REQUEST,
+                    format!("Failed to serialize request: {}", e),
+                )
+                    .into_response();
+            }
+        };
+        if let Some(obj) = payload.as_object_mut() {
+            for key in [
+                "top_k",
+                "min_p",
+                "min_tokens",
+                "regex",
+                "ebnf",
+                "stop_token_ids",
+                "no_stop_trim",
+                "ignore_eos",
+                "continue_final_message",
+                "skip_special_tokens",
+                "lora_path",
+                "session_params",
+                "separate_reasoning",
+                "stream_reasoning",
+                "chat_template_kwargs",
+                "return_hidden_states",
+                "repetition_penalty",
+            ] {
+                obj.remove(key);
+            }
+        }
+
+        let url = format!("{}/v1/chat/completions", self.base_url);
+        let mut req = self.client.post(&url).json(&payload);
+
+        // Forward Authorization header if provided
+        if let Some(h) = headers {
+            if let Some(auth) = h.get("authorization").or_else(|| h.get("Authorization")) {
+                req = req.header("Authorization", auth);
+            }
+        }
+
+        // Accept SSE when stream=true
+        if body.stream {
+            req = req.header("Accept", "text/event-stream");
+        }
+
+        let resp = match req.send().await {
+            Ok(r) => r,
+            Err(e) => {
+                self.circuit_breaker.record_failure();
+                return (
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    format!("Failed to contact upstream: {}", e),
+                )
+                    .into_response();
+            }
+        };
+
+        let status = StatusCode::from_u16(resp.status().as_u16())
+            .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+
+        if !body.stream {
+            // Capture Content-Type before consuming response body
+            let content_type = resp.headers().get(CONTENT_TYPE).cloned();
+            match resp.bytes().await {
+                Ok(body) => {
+                    self.circuit_breaker.record_success();
+                    let mut response = Response::new(axum::body::Body::from(body));
+                    *response.status_mut() = status;
+                    if let Some(ct) = content_type {
+                        response.headers_mut().insert(CONTENT_TYPE, ct);
+                    }
+                    response
+                }
+                Err(e) => {
+                    self.circuit_breaker.record_failure();
+                    (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Failed to read response: {}", e),
+                    )
+                        .into_response()
+                }
+            }
+        } else {
+            // Stream SSE bytes to client
+            let stream = resp.bytes_stream();
+            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+            tokio::spawn(async move {
+                let mut s = stream;
+                while let Some(chunk) = s.next().await {
+                    match chunk {
+                        Ok(bytes) => {
+                            if tx.send(Ok(bytes)).is_err() {
+                                break;
+                            }
+                        }
+                        Err(e) => {
+                            let _ = tx.send(Err(format!("Stream error: {}", e)));
+                            break;
+                        }
+                    }
+                }
+            });
+            let mut response = Response::new(Body::from_stream(
+                tokio_stream::wrappers::UnboundedReceiverStream::new(rx),
+            ));
+            *response.status_mut() = status;
+            response
+                .headers_mut()
+                .insert(CONTENT_TYPE, HeaderValue::from_static("text/event-stream"));
+            response
+        }
+    }
+
+    async fn route_completion(
+        &self,
+        _headers: Option<&HeaderMap>,
+        _body: &CompletionRequest,
+    ) -> Response {
+        // Completion endpoint not implemented for OpenAI backend
+        (
+            StatusCode::NOT_IMPLEMENTED,
+            "Completion endpoint not implemented for OpenAI backend",
+        )
+            .into_response()
+    }
+
+    async fn flush_cache(&self) -> Response {
+        (
+            StatusCode::NOT_IMPLEMENTED,
+            "flush_cache not supported for OpenAI router",
+        )
+            .into_response()
+    }
+
+    async fn get_worker_loads(&self) -> Response {
+        (
+            StatusCode::NOT_IMPLEMENTED,
+            "get_worker_loads not supported for OpenAI router",
+        )
+            .into_response()
+    }
+
+    fn router_type(&self) -> &'static str {
+        "openai"
+    }
+
+    fn readiness(&self) -> Response {
+        if self.healthy.load(Ordering::Acquire) && self.circuit_breaker.can_execute() {
+            (StatusCode::OK, "Ready").into_response()
+        } else {
+            (StatusCode::SERVICE_UNAVAILABLE, "Not ready").into_response()
+        }
+    }
+
+    async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        (
+            StatusCode::NOT_IMPLEMENTED,
+            "Embeddings endpoint not implemented for OpenAI backend",
+        )
+            .into_response()
+    }
+
+    async fn route_rerank(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        (
+            StatusCode::NOT_IMPLEMENTED,
+            "Rerank endpoint not implemented for OpenAI backend",
+        )
+            .into_response()
+    }
+}
diff --git a/sgl-router/src/routers/http/pd_router.rs b/sgl-router/src/routers/http/pd_router.rs
new file mode 100644
index 000000000..528ead5f5
--- /dev/null
+++ b/sgl-router/src/routers/http/pd_router.rs
@@ -0,0 +1,2500 @@
+// PD (Prefill-Decode) Router Implementation
+// This module handles routing for disaggregated prefill-decode systems
+use super::pd_types::{api_path, PDRouterError};
+use crate::config::types::RetryConfig;
+use crate::core::{
+    is_retryable_status, BasicWorker, CircuitBreakerConfig, HealthChecker, HealthConfig,
+    RetryExecutor, Worker, WorkerFactory, WorkerLoadGuard, WorkerType,
+};
+use crate::metrics::RouterMetrics;
+use crate::policies::LoadBalancingPolicy;
+use crate::protocols::spec::{
+    ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateRequest, StringOrArray,
+    UserMessageContent,
+};
+use crate::routers::header_utils;
+use crate::routers::{RouterTrait, WorkerManagement};
+use async_trait::async_trait;
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{header::CONTENT_TYPE, HeaderMap, HeaderValue, StatusCode},
+    response::{IntoResponse, Response},
+    Json,
+};
+use futures_util::StreamExt;
+use reqwest::Client;
+use serde::Serialize;
+use serde_json::{json, Value};
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+use std::time::{Duration, Instant};
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{debug, error, info, warn};
+
+#[derive(Debug)]
+pub struct PDRouter {
+    pub prefill_workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
+    pub decode_workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
+    pub prefill_policy: Arc<dyn LoadBalancingPolicy>,
+    pub decode_policy: Arc<dyn LoadBalancingPolicy>,
+    pub worker_startup_timeout_secs: u64,
+    pub worker_startup_check_interval_secs: u64,
+    pub worker_loads: Arc<tokio::sync::watch::Receiver<HashMap<String, isize>>>,
+    pub load_monitor_handle: Option<Arc<tokio::task::JoinHandle<()>>>,
+    pub client: Client,
+    // Dedicated client for prefill fire-and-forget (non-logprob) requests
+    pub prefill_client: Client,
+    pub retry_config: RetryConfig,
+    pub circuit_breaker_config: CircuitBreakerConfig,
+    _prefill_health_checker: Option<HealthChecker>,
+    _decode_health_checker: Option<HealthChecker>,
+    // Channel for sending prefill responses to background workers for draining
+    prefill_drain_tx: mpsc::Sender<reqwest::Response>,
+}
+
+// Request context for PD router operations
+#[derive(Clone)]
+struct PDRequestContext {
+    route: &'static str,
+    batch_size: Option<usize>,
+    is_stream: bool,
+    return_logprob: bool,
+    request_text: Option<String>,
+}
+
+impl PDRouter {
+    // Dynamic worker management methods for service discovery
+
+    // Private helper method to perform health check on a new server
+    async fn wait_for_server_health(&self, url: &str) -> Result<(), PDRouterError> {
+        crate::routers::http::router::Router::wait_for_healthy_workers(
+            &[url.to_string()],
+            self.worker_startup_timeout_secs,
+            self.worker_startup_check_interval_secs,
+        )
+        .await
+        .map_err(|_| PDRouterError::HealthCheckFailed {
+            url: url.to_string(),
+        })
+    }
+
+    // Generic helper for processing all workers with an endpoint
+    async fn process_workers(
+        &self,
+        workers: &RwLock<Vec<Box<dyn Worker>>>,
+        worker_type: &str,
+        endpoint: &str,
+    ) -> (Vec<String>, Vec<String>) {
+        let mut results = Vec::new();
+        let mut errors = Vec::new();
+
+        // Get worker URLs first to avoid holding lock across await
+        let urls = match workers.read() {
+            Ok(workers) => workers
+                .iter()
+                .map(|w| w.url().to_string())
+                .collect::<Vec<_>>(),
+            Err(_) => {
+                errors.push(format!("Failed to access {} workers", worker_type));
+                Vec::new()
+            }
+        };
+
+        // Process each worker
+        for worker_url in urls {
+            let url = format!("{}/{}", worker_url, endpoint);
+            match self.client.post(&url).send().await {
+                Ok(res) if res.status().is_success() => {
+                    results.push(format!("{} {}: OK", worker_type, worker_url));
+                }
+                Ok(res) => {
+                    errors.push(format!(
+                        "{} {} returned status: {}",
+                        worker_type,
+                        worker_url,
+                        res.status()
+                    ));
+                }
+                Err(e) => {
+                    errors.push(format!("{} {} error: {}", worker_type, worker_url, e));
+                }
+            }
+        }
+
+        (results, errors)
+    }
+
+    // Helper to get worker URLs from a worker collection
+    fn get_worker_urls(
+        workers: &RwLock<Vec<Box<dyn Worker>>>,
+        worker_type: &str,
+    ) -> Result<Vec<String>, String> {
+        workers
+            .read()
+            .map(|workers| {
+                workers
+                    .iter()
+                    .map(|w| w.url().to_string())
+                    .collect::<Vec<_>>()
+            })
+            .map_err(|_| format!("Failed to access {} workers", worker_type))
+    }
+
+    // Generic helper for proxying requests to the first worker
+    async fn proxy_to_first_worker(
+        &self,
+        workers: &RwLock<Vec<Box<dyn Worker>>>,
+        endpoint: &str,
+        worker_type: &str,
+        headers: Option<Vec<(String, String)>>,
+    ) -> Response {
+        // Get first worker URL to avoid holding lock across await
+        let first_worker_url = match workers.read() {
+            Ok(workers) => workers.first().map(|w| w.url().to_string()),
+            Err(_) => {
+                return (
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    format!("Failed to access {} workers", worker_type),
+                )
+                    .into_response();
+            }
+        };
+
+        if let Some(worker_url) = first_worker_url {
+            let url = format!("{}/{}", worker_url, endpoint);
+            let mut request_builder = self.client.get(&url);
+
+            // Add headers if provided
+            if let Some(headers) = headers {
+                for (name, value) in headers {
+                    request_builder = request_builder.header(name, value);
+                }
+            }
+
+            match request_builder.send().await {
+                Ok(res) if res.status().is_success() => {
+                    let response_headers = header_utils::preserve_response_headers(res.headers());
+
+                    match res.bytes().await {
+                        Ok(body) => {
+                            let mut response = Response::new(axum::body::Body::from(body));
+                            *response.status_mut() = StatusCode::OK;
+                            *response.headers_mut() = response_headers;
+                            response
+                        }
+                        Err(e) => {
+                            error!("Failed to read response body: {}", e);
+                            (
+                                StatusCode::INTERNAL_SERVER_ERROR,
+                                format!("Failed to read response body: {}", e),
+                            )
+                                .into_response()
+                        }
+                    }
+                }
+                Ok(res) => {
+                    let status = StatusCode::from_u16(res.status().as_u16())
+                        .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+                    (
+                        status,
+                        format!("{} server returned status: {}", worker_type, res.status()),
+                    )
+                        .into_response()
+                }
+                Err(e) => {
+                    error!("Failed to proxy request to {} server: {}", worker_type, e);
+                    (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Failed to proxy request: {}", e),
+                    )
+                        .into_response()
+                }
+            }
+        } else {
+            (
+                StatusCode::SERVICE_UNAVAILABLE,
+                format!("No {} servers available", worker_type),
+            )
+                .into_response()
+        }
+    }
+
+    pub async fn add_prefill_server(
+        &self,
+        url: String,
+        bootstrap_port: Option<u16>,
+    ) -> Result<String, PDRouterError> {
+        // Wait for the new server to be healthy
+        self.wait_for_server_health(&url).await?;
+
+        // Create Worker for the new prefill server with circuit breaker configuration
+        let worker = WorkerFactory::create_prefill_with_config(
+            url.clone(),
+            bootstrap_port,
+            self.circuit_breaker_config.clone(),
+        );
+
+        // Add to prefill workers list
+        let mut workers = self
+            .prefill_workers
+            .write()
+            .map_err(|_| PDRouterError::LockError {
+                operation: "prefill_workers write".to_string(),
+            })?;
+
+        // Check if already exists
+        if workers.iter().any(|w| w.url() == url) {
+            return Err(PDRouterError::WorkerAlreadyExists { url: url.clone() });
+        }
+
+        workers.push(worker);
+
+        // Update cache-aware policy if applicable
+        drop(workers); // Release write lock
+        if let Some(cache_policy) = self
+            .prefill_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_policy.add_worker(&url);
+        }
+
+        info!("Added prefill server: {}", url);
+        Ok(format!("Successfully added prefill server: {}", url))
+    }
+
+    pub async fn add_decode_server(&self, url: String) -> Result<String, PDRouterError> {
+        // Wait for the new server to be healthy
+        self.wait_for_server_health(&url).await?;
+
+        // Create Worker for the new decode server with circuit breaker configuration
+        let worker = WorkerFactory::create_decode_with_config(
+            url.clone(),
+            self.circuit_breaker_config.clone(),
+        );
+
+        // Add to decode workers list
+        let mut workers = self
+            .decode_workers
+            .write()
+            .map_err(|_| PDRouterError::LockError {
+                operation: "decode_workers write".to_string(),
+            })?;
+
+        // Check if already exists
+        if workers.iter().any(|w| w.url() == url) {
+            return Err(PDRouterError::WorkerAlreadyExists { url: url.clone() });
+        }
+
+        workers.push(worker);
+
+        // Update cache-aware policy if applicable
+        drop(workers); // Release write lock
+        if let Some(cache_policy) = self
+            .decode_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_policy.add_worker(&url);
+        }
+
+        info!("Added decode server: {}", url);
+        Ok(format!("Successfully added decode server: {}", url))
+    }
+
+    pub async fn remove_prefill_server(&self, url: &str) -> Result<String, PDRouterError> {
+        let mut workers = self
+            .prefill_workers
+            .write()
+            .map_err(|_| PDRouterError::LockError {
+                operation: "prefill_workers write".to_string(),
+            })?;
+
+        // Find and remove the server
+        let initial_len = workers.len();
+        workers.retain(|w| w.url() != url);
+
+        if workers.len() == initial_len {
+            return Err(PDRouterError::WorkerNotFound {
+                url: url.to_string(),
+            });
+        }
+
+        // Remove from cache-aware policy if applicable
+        if let Some(cache_policy) = self
+            .prefill_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_policy.remove_worker(url);
+        }
+
+        info!("Removed prefill server: {}", url);
+        Ok(format!("Successfully removed prefill server: {}", url))
+    }
+
+    pub async fn remove_decode_server(&self, url: &str) -> Result<String, PDRouterError> {
+        let mut workers = self
+            .decode_workers
+            .write()
+            .map_err(|_| PDRouterError::LockError {
+                operation: "decode_workers write".to_string(),
+            })?;
+
+        // Find and remove the server
+        let initial_len = workers.len();
+        workers.retain(|w| w.url() != url);
+
+        if workers.len() == initial_len {
+            return Err(PDRouterError::WorkerNotFound {
+                url: url.to_string(),
+            });
+        }
+
+        // Remove from cache-aware policy if applicable
+        if let Some(cache_policy) = self
+            .decode_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_policy.remove_worker(url);
+        }
+
+        info!("Removed decode server: {}", url);
+        Ok(format!("Successfully removed decode server: {}", url))
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub async fn new(
+        prefill_urls: Vec<(String, Option<u16>)>,
+        decode_urls: Vec<String>,
+        prefill_policy: Arc<dyn LoadBalancingPolicy>,
+        decode_policy: Arc<dyn LoadBalancingPolicy>,
+        ctx: &Arc<crate::server::AppContext>,
+    ) -> Result<Self, String> {
+        // Convert config CircuitBreakerConfig to core CircuitBreakerConfig
+        let circuit_breaker_config = ctx.router_config.effective_circuit_breaker_config();
+        let core_cb_config = CircuitBreakerConfig {
+            failure_threshold: circuit_breaker_config.failure_threshold,
+            success_threshold: circuit_breaker_config.success_threshold,
+            timeout_duration: Duration::from_secs(circuit_breaker_config.timeout_duration_secs),
+            window_duration: Duration::from_secs(circuit_breaker_config.window_duration_secs),
+        };
+
+        // Convert URLs to Worker trait objects with health check config
+        let prefill_workers: Vec<Box<dyn Worker>> = prefill_urls
+            .into_iter()
+            .map(|(url, port)| {
+                let worker = BasicWorker::new(
+                    url,
+                    WorkerType::Prefill {
+                        bootstrap_port: port,
+                    },
+                )
+                .with_circuit_breaker_config(core_cb_config.clone())
+                .with_health_config(HealthConfig {
+                    timeout_secs: ctx.router_config.health_check.timeout_secs,
+                    check_interval_secs: ctx.router_config.health_check.check_interval_secs,
+                    endpoint: ctx.router_config.health_check.endpoint.clone(),
+                    failure_threshold: ctx.router_config.health_check.failure_threshold,
+                    success_threshold: ctx.router_config.health_check.success_threshold,
+                });
+                Box::new(worker) as Box<dyn Worker>
+            })
+            .collect();
+
+        let decode_workers: Vec<Box<dyn Worker>> = decode_urls
+            .into_iter()
+            .map(|url| {
+                let worker = BasicWorker::new(url, WorkerType::Decode)
+                    .with_circuit_breaker_config(core_cb_config.clone())
+                    .with_health_config(HealthConfig {
+                        timeout_secs: ctx.router_config.health_check.timeout_secs,
+                        check_interval_secs: ctx.router_config.health_check.check_interval_secs,
+                        endpoint: ctx.router_config.health_check.endpoint.clone(),
+                        failure_threshold: ctx.router_config.health_check.failure_threshold,
+                        success_threshold: ctx.router_config.health_check.success_threshold,
+                    });
+                Box::new(worker) as Box<dyn Worker>
+            })
+            .collect();
+
+        // Wait for PD workers to be healthy (skip if empty - for service discovery mode)
+        let all_urls: Vec<String> = prefill_workers
+            .iter()
+            .chain(decode_workers.iter())
+            .map(|worker| worker.url().to_string())
+            .collect();
+        if !all_urls.is_empty() {
+            crate::routers::http::router::Router::wait_for_healthy_workers(
+                &all_urls,
+                ctx.router_config.worker_startup_timeout_secs,
+                ctx.router_config.worker_startup_check_interval_secs,
+            )
+            .await?;
+        }
+
+        // Initialize cache-aware policies with workers
+        if let Some(cache_policy) = prefill_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_policy.init_workers(&prefill_workers);
+        }
+
+        if let Some(cache_policy) = decode_policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_policy.init_workers(&decode_workers);
+        }
+
+        // Set up background load monitoring for power-of-two selection
+        let (tx, rx) = tokio::sync::watch::channel(HashMap::new());
+        let worker_loads = Arc::new(rx);
+
+        let load_monitor_handle =
+            if prefill_policy.name() == "power_of_two" || decode_policy.name() == "power_of_two" {
+                let monitor_urls = all_urls.clone();
+                let monitor_interval = ctx.router_config.worker_startup_check_interval_secs;
+                let monitor_client = ctx.client.clone();
+                let prefill_policy_clone = Arc::clone(&prefill_policy);
+                let decode_policy_clone = Arc::clone(&decode_policy);
+
+                Some(Arc::new(tokio::spawn(async move {
+                    Self::monitor_worker_loads_with_client(
+                        monitor_urls,
+                        tx,
+                        monitor_interval,
+                        monitor_client,
+                        prefill_policy_clone,
+                        decode_policy_clone,
+                    )
+                    .await;
+                })))
+            } else {
+                None
+            };
+
+        let prefill_workers = Arc::new(RwLock::new(prefill_workers));
+        let decode_workers = Arc::new(RwLock::new(decode_workers));
+
+        // Start health checkers for both worker pools
+        let prefill_health_checker = crate::core::start_health_checker(
+            Arc::clone(&prefill_workers),
+            ctx.router_config.health_check.check_interval_secs,
+        );
+        let decode_health_checker = crate::core::start_health_checker(
+            Arc::clone(&decode_workers),
+            ctx.router_config.health_check.check_interval_secs,
+        );
+
+        // Build a dedicated prefill client for fire-and-forget semantics
+        let prefill_client = reqwest::Client::builder()
+            .pool_max_idle_per_host(0)
+            .http1_only()
+            .connect_timeout(Duration::from_millis(300))
+            .timeout(Duration::from_secs(ctx.router_config.request_timeout_secs))
+            .build()
+            .map_err(|e| format!("Failed to build prefill client: {}", e))?;
+
+        // Create bounded channel for prefill response draining
+        // Larger buffer for high concurrency scenarios
+        let (prefill_drain_tx, mut prefill_drain_rx) = mpsc::channel::<reqwest::Response>(2000);
+
+        // Spawn a coordinator with limited concurrent drain tasks
+        // This prevents unbounded task spawning under extreme load
+        tokio::spawn(async move {
+            info!("Prefill drain coordinator started");
+
+            // Use a semaphore to limit concurrent drain operations
+            let max_concurrent_drains = 100;
+            let semaphore = Arc::new(tokio::sync::Semaphore::new(max_concurrent_drains));
+
+            while let Some(response) = prefill_drain_rx.recv().await {
+                let permit = semaphore.clone().acquire_owned().await;
+
+                match permit {
+                    Ok(permit) => {
+                        // Spawn a task to drain this response
+                        tokio::spawn(async move {
+                            let url = response.url().to_string();
+                            let status = response.status();
+
+                            if !status.is_success() {
+                                error!("Prefill drain: error status={} url={}", status, url);
+                                RouterMetrics::record_pd_prefill_error(&url);
+                            }
+
+                            // Drain the response body efficiently
+                            // Use streaming to avoid loading entire body into memory
+                            let start = std::time::Instant::now();
+                            let mut stream = response.bytes_stream();
+                            let mut bytes_drained = 0;
+
+                            while let Some(chunk_result) = stream.next().await {
+                                match chunk_result {
+                                    Ok(chunk) => bytes_drained += chunk.len(),
+                                    Err(e) => {
+                                        debug!(
+                                            "Prefill drain: error streaming url={} error={}",
+                                            url, e
+                                        );
+                                        break;
+                                    }
+                                }
+                            }
+
+                            let elapsed = start.elapsed();
+                            if elapsed > Duration::from_millis(100) {
+                                // Only log slow drains
+                                debug!(
+                                    "Prefill drain: slow drain {} bytes from {} in {:?}",
+                                    bytes_drained, url, elapsed
+                                );
+                            }
+
+                            // Permit is automatically released when dropped
+                            drop(permit);
+                        });
+                    }
+                    Err(_) => {
+                        // Semaphore closed, shutting down
+                        break;
+                    }
+                }
+            }
+            info!("Prefill drain coordinator shutting down");
+        });
+
+        Ok(PDRouter {
+            prefill_workers,
+            decode_workers,
+            prefill_policy,
+            decode_policy,
+            worker_startup_timeout_secs: ctx.router_config.worker_startup_timeout_secs,
+            worker_startup_check_interval_secs: ctx
+                .router_config
+                .worker_startup_check_interval_secs,
+            worker_loads,
+            load_monitor_handle,
+            client: ctx.client.clone(),
+            prefill_client,
+            prefill_drain_tx,
+            retry_config: ctx.router_config.effective_retry_config(),
+            circuit_breaker_config: core_cb_config,
+            _prefill_health_checker: Some(prefill_health_checker),
+            _decode_health_checker: Some(decode_health_checker),
+        })
+    }
+
+    // Helper to handle server selection errors
+    fn handle_server_selection_error(error: String) -> Response {
+        error!("Failed to select PD pair error={}", error);
+        RouterMetrics::record_pd_error("server_selection");
+        (
+            StatusCode::SERVICE_UNAVAILABLE,
+            format!("No available servers: {}", error),
+        )
+            .into_response()
+    }
+
+    // Helper to handle serialization errors
+    fn handle_serialization_error(error: impl std::fmt::Display) -> Response {
+        error!("Failed to serialize request error={}", error);
+        (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            "Failed to serialize request",
+        )
+            .into_response()
+    }
+
+    // Helper to determine batch size from a GenerateRequest
+    fn get_generate_batch_size(req: &GenerateRequest) -> Option<usize> {
+        // Check prompt array
+        if let Some(StringOrArray::Array(arr)) = &req.prompt {
+            if !arr.is_empty() {
+                return Some(arr.len());
+            }
+        }
+        // Check text array
+        if let Some(text) = &req.text {
+            if text.contains("[") && text.contains("]") {
+                // This is a simplified check - in reality we'd need to parse JSON
+                return None; // For now, fall back to non-batch
+            }
+        }
+        None
+    }
+
+    // Helper to determine batch size from a ChatCompletionRequest
+    fn get_chat_batch_size(req: &ChatCompletionRequest) -> Option<usize> {
+        // Check 'n' parameter for multiple responses
+        if let Some(n) = req.n {
+            if n > 1 {
+                return Some(n as usize);
+            }
+        }
+        None
+    }
+
+    // Helper to determine batch size from a CompletionRequest
+    fn get_completion_batch_size(req: &CompletionRequest) -> Option<usize> {
+        // Check prompt array
+        if let StringOrArray::Array(arr) = &req.prompt {
+            if !arr.is_empty() {
+                return Some(arr.len());
+            }
+        }
+        None
+    }
+
+    // Helper to inject bootstrap fields into an existing JSON request value
+    fn inject_bootstrap_into_value(
+        mut original: Value,
+        prefill_worker: &dyn Worker,
+        batch_size: Option<usize>,
+    ) -> Result<Value, String> {
+        let bootstrap_port = match prefill_worker.worker_type() {
+            crate::core::WorkerType::Prefill { bootstrap_port } => bootstrap_port,
+            _ => None,
+        };
+        let hostname = super::pd_types::get_hostname(prefill_worker.url());
+
+        let obj = original
+            .as_object_mut()
+            .ok_or_else(|| "Request must be a JSON object".to_string())?;
+
+        if let Some(n) = batch_size {
+            let mut hosts = Vec::with_capacity(n);
+            let mut ports = Vec::with_capacity(n);
+            let mut rooms = Vec::with_capacity(n);
+            for _ in 0..n {
+                hosts.push(hostname.clone());
+                ports.push(bootstrap_port);
+                rooms.push(super::pd_types::generate_room_id());
+            }
+            obj.insert(
+                "bootstrap_host".to_string(),
+                Value::Array(hosts.into_iter().map(serde_json::Value::from).collect()),
+            );
+            obj.insert(
+                "bootstrap_port".to_string(),
+                Value::Array(
+                    ports
+                        .into_iter()
+                        .map(|p| match p {
+                            Some(v) => serde_json::Value::from(v),
+                            None => Value::Null,
+                        })
+                        .collect(),
+                ),
+            );
+            obj.insert(
+                "bootstrap_room".to_string(),
+                Value::Array(rooms.into_iter().map(serde_json::Value::from).collect()),
+            );
+        } else {
+            obj.insert(
+                "bootstrap_host".to_string(),
+                serde_json::Value::from(hostname),
+            );
+            obj.insert(
+                "bootstrap_port".to_string(),
+                match bootstrap_port {
+                    Some(v) => serde_json::Value::from(v),
+                    None => Value::Null,
+                },
+            );
+            obj.insert(
+                "bootstrap_room".to_string(),
+                serde_json::Value::from(super::pd_types::generate_room_id()),
+            );
+        }
+        Ok(original)
+    }
+
+    // Execute the dual dispatch to prefill and decode servers with retries and bootstrap injection
+    async fn execute_dual_dispatch<T: Serialize + Clone>(
+        &self,
+        headers: Option<&HeaderMap>,
+        original_request: &T,
+        context: PDRequestContext,
+    ) -> Response {
+        let start_time = Instant::now();
+
+        let route = context.route;
+        RetryExecutor::execute_response_with_retry(
+            &self.retry_config,
+            // Operation per attempt
+            {
+                let original_request = original_request.clone();
+                move |attempt: u32| {
+                    let original_request = original_request.clone();
+                    let context = context.clone();
+                    async move {
+                        // Select workers fresh for each attempt
+                        let (prefill, decode) =
+                            match self.select_pd_pair(context.request_text.as_deref()).await {
+                                Ok(pair) => pair,
+                                Err(e) => {
+                                    RouterMetrics::record_pd_error("server_selection");
+                                    return Self::handle_server_selection_error(e);
+                                }
+                            };
+
+                        debug!(
+                            "PD retry attempt {} using prefill={} decode={}",
+                            attempt,
+                            prefill.url(),
+                            decode.url()
+                        );
+
+                        // Serialize the original request
+                        let mut json_request = match serde_json::to_value(&original_request) {
+                            Ok(v) => v,
+                            Err(e) => return Self::handle_serialization_error(e),
+                        };
+
+                        // Inject bootstrap based on current prefill worker
+                        json_request = match Self::inject_bootstrap_into_value(
+                            json_request,
+                            prefill.as_ref(),
+                            context.batch_size,
+                        ) {
+                            Ok(v) => v,
+                            Err(e) => return Self::handle_serialization_error(e),
+                        };
+
+                        // Execute the actual dual dispatch
+                        let response = self
+                            .execute_dual_dispatch_internal(
+                                headers,
+                                json_request,
+                                context,
+                                prefill.as_ref(),
+                                decode.as_ref(),
+                                start_time,
+                            )
+                            .await;
+
+                        // Record outcomes for circuit breakers
+                        let _status = response.status();
+                        let not_error = _status.is_success() || _status.is_client_error();
+                        prefill.record_outcome(not_error);
+                        decode.record_outcome(not_error);
+
+                        response
+                    }
+                }
+            },
+            // Should retry predicate
+            |res, _attempt| is_retryable_status(res.status()),
+            // On backoff hook
+            |delay, attempt| {
+                RouterMetrics::record_retry(route);
+                RouterMetrics::record_retry_backoff_duration(delay, attempt);
+            },
+            // On exhausted hook
+            || RouterMetrics::record_retries_exhausted(route),
+        )
+        .await
+    }
+
+    async fn handle_decode_error_response(
+        &self,
+        res: reqwest::Response,
+        context: &PDRequestContext,
+        prefill: &dyn Worker,
+        decode: &dyn Worker,
+    ) -> Response {
+        let status = res.status();
+
+        if context.is_stream {
+            // Handle streaming error response
+            let response_headers = header_utils::preserve_response_headers(res.headers());
+            let error_payload = match res.bytes().await {
+                Ok(error_body) => {
+                    if let Ok(error_json) = serde_json::from_slice::<Value>(&error_body) {
+                        json!({ "message": error_json, "status": status.as_u16() })
+                    } else {
+                        json!({ "message": String::from_utf8_lossy(&error_body).to_string(), "status": status.as_u16() })
+                    }
+                }
+                Err(e) => {
+                    json!({ "message": format!("Decode server error: {}", e), "status": status.as_u16() })
+                }
+            };
+
+            let sse_data = format!(
+                "data: {{'error': {}}}",
+                serde_json::to_string(&error_payload).unwrap_or_default()
+            );
+            let error_stream = tokio_stream::once(Ok(axum::body::Bytes::from(sse_data)));
+
+            let decode_url = decode.url().to_string();
+            self.create_streaming_response(
+                error_stream,
+                status,
+                None,
+                context.return_logprob,
+                Some(decode_url),
+                Some(response_headers),
+                prefill,
+                decode,
+            )
+        } else {
+            // Handle non-streaming error response
+            match res.bytes().await {
+                Ok(error_body) => (status, error_body).into_response(),
+                Err(e) => (status, format!("Decode server error: {}", e)).into_response(),
+            }
+        }
+    }
+
+    // Internal method that performs the actual dual dispatch (without retry logic)
+    async fn execute_dual_dispatch_internal(
+        &self,
+        headers: Option<&HeaderMap>,
+        json_request: Value,
+        context: PDRequestContext,
+        prefill: &dyn Worker,
+        decode: &dyn Worker,
+        start_time: Instant,
+    ) -> Response {
+        // For non-streaming: use guard for automatic load management
+        // For streaming: load will be managed in create_streaming_response
+        let _guard = if !context.is_stream {
+            Some(WorkerLoadGuard::new_multi(vec![prefill, decode]))
+        } else {
+            None
+        };
+
+        // Build decode request with shared client
+        let decode_request = self.build_post_with_headers(
+            &self.client,
+            decode.url(),
+            context.route,
+            &json_request,
+            headers,
+            false,
+        );
+
+        // Send both requests concurrently
+        debug!(
+            "Sending concurrent requests to prefill={} decode={}",
+            prefill.url(),
+            decode.url()
+        );
+
+        if context.return_logprob {
+            // Build prefill request with shared client when we need response body
+            let prefill_request = self.build_post_with_headers(
+                &self.client,
+                prefill.url(),
+                context.route,
+                &json_request,
+                headers,
+                false,
+            );
+            // When we need logprobs, wait for both responses
+            let (prefill_result, decode_result) =
+                tokio::join!(prefill_request.send(), decode_request.send());
+            debug!("Received responses from both servers");
+
+            // Update metrics
+            let duration = start_time.elapsed();
+            RouterMetrics::record_pd_request_duration(context.route, duration);
+            RouterMetrics::record_pd_request(context.route);
+            RouterMetrics::record_pd_prefill_request(prefill.url());
+            RouterMetrics::record_pd_decode_request(decode.url());
+
+            // Process decode response with prefill for logprobs
+            debug!("Processing decode response with logprobs");
+            match decode_result {
+                Ok(res) => {
+                    let status = StatusCode::from_u16(res.status().as_u16())
+                        .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+                    debug!("Decode response status: {}", status);
+
+                    if !status.is_success() {
+                        RouterMetrics::record_pd_decode_error(decode.url());
+                        error!(
+                            "Decode server returned error status decode_url={} status={}",
+                            decode.url(),
+                            status
+                        );
+
+                        return self
+                            .handle_decode_error_response(res, &context, prefill, decode)
+                            .await;
+                    }
+
+                    // Process prefill response for logprobs
+                    let prefill_body = match self
+                        .process_prefill_response(
+                            prefill_result,
+                            prefill.url(),
+                            context.return_logprob,
+                        )
+                        .await
+                    {
+                        Ok((_, body)) => body,
+                        Err(error_response) => return error_response,
+                    };
+
+                    if context.is_stream {
+                        // Streaming response with logprobs
+                        let prefill_logprobs = prefill_body
+                            .as_ref()
+                            .and_then(|body| serde_json::from_slice::<Value>(body).ok())
+                            .and_then(|json| {
+                                json.pointer("/meta_info/input_token_logprobs").cloned()
+                            });
+
+                        let response_headers =
+                            header_utils::preserve_response_headers(res.headers());
+
+                        self.create_streaming_response(
+                            res.bytes_stream(),
+                            status,
+                            prefill_logprobs,
+                            context.return_logprob,
+                            None,
+                            Some(response_headers),
+                            prefill,
+                            decode,
+                        )
+                    } else {
+                        // Non-streaming response with logprobs
+                        self.process_non_streaming_response(
+                            res,
+                            status,
+                            context.return_logprob,
+                            prefill_body,
+                        )
+                        .await
+                    }
+                }
+                Err(e) => {
+                    error!(
+                        decode_url = %decode.url(),
+                        error = %e,
+                        "Decode request failed"
+                    );
+                    RouterMetrics::record_pd_decode_error(decode.url());
+                    (
+                        StatusCode::BAD_GATEWAY,
+                        format!("Decode server error: {}", e),
+                    )
+                        .into_response()
+                }
+            }
+        } else {
+            // When we don't need logprobs, only wait for decode response
+            // Send both requests concurrently but don't wait for prefill
+            // Use dedicated prefill client with Connection: close
+            let prefill_future = self
+                .build_post_with_headers(
+                    &self.prefill_client,
+                    prefill.url(),
+                    context.route,
+                    &json_request,
+                    headers,
+                    true,
+                )
+                .send();
+            let decode_future = decode_request.send();
+
+            // Send prefill response to background worker for draining
+            // This ensures HTTP compliance without blocking
+            let drain_tx = self.prefill_drain_tx.clone();
+            let prefill_url = prefill.url().to_string();
+            tokio::spawn(async move {
+                if let Ok(response) = prefill_future.await {
+                    // Try to send to drain worker
+                    // If channel is full (under extreme load), drain inline as fallback
+                    match drain_tx.try_send(response) {
+                        Ok(_) => {
+                            // Successfully queued for draining
+                            debug!("Prefill response queued for draining");
+                        }
+                        Err(mpsc::error::TrySendError::Full(response)) => {
+                            // Channel full - drain inline as fallback
+                            warn!("Prefill drain channel full (capacity exceeded), draining inline for {}", prefill_url);
+                            RouterMetrics::record_pd_prefill_error(&prefill_url);
+
+                            // Drain inline with timeout to prevent blocking too long
+                            let drain_future = async {
+                                let mut stream = response.bytes_stream();
+                                while stream.next().await.is_some() {
+                                    // Just drain
+                                }
+                            };
+
+                            match tokio::time::timeout(Duration::from_secs(1), drain_future).await {
+                                Ok(_) => debug!("Inline drain completed for {}", prefill_url),
+                                Err(_) => error!("Inline drain timeout for {}", prefill_url),
+                            }
+                        }
+                        Err(mpsc::error::TrySendError::Closed(_)) => {
+                            error!("Prefill drain channel closed!");
+                        }
+                    }
+                }
+            });
+
+            // Wait only for decode response
+            let decode_result = decode_future.await;
+            debug!("Received decode response");
+
+            // Update metrics
+            let duration = start_time.elapsed();
+            RouterMetrics::record_pd_request_duration(context.route, duration);
+            RouterMetrics::record_pd_request(context.route);
+            RouterMetrics::record_pd_prefill_request(prefill.url());
+            RouterMetrics::record_pd_decode_request(decode.url());
+
+            // Process decode response immediately
+            debug!("Processing decode response (no logprobs)");
+            match decode_result {
+                Ok(res) => {
+                    let status = StatusCode::from_u16(res.status().as_u16())
+                        .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+                    debug!("Decode response status: {}", status);
+
+                    if !status.is_success() {
+                        RouterMetrics::record_pd_decode_error(decode.url());
+                        error!(
+                            "Decode server returned error status decode_url={} status={}",
+                            decode.url(),
+                            status
+                        );
+
+                        self.handle_decode_error_response(res, &context, prefill, decode)
+                            .await
+                    } else if context.is_stream {
+                        // Streaming response without logprobs - direct passthrough
+                        let decode_url = decode.url().to_string();
+                        let response_headers =
+                            header_utils::preserve_response_headers(res.headers());
+
+                        self.create_streaming_response(
+                            res.bytes_stream(),
+                            status,
+                            None,
+                            false,
+                            Some(decode_url),
+                            Some(response_headers),
+                            prefill,
+                            decode,
+                        )
+                    } else {
+                        // Non-streaming response without logprobs - direct passthrough like fast version
+                        let response_headers =
+                            header_utils::preserve_response_headers(res.headers());
+
+                        match res.bytes().await {
+                            Ok(decode_body) => {
+                                let mut response =
+                                    Response::new(axum::body::Body::from(decode_body));
+                                *response.status_mut() = status;
+                                *response.headers_mut() = response_headers;
+                                response
+                            }
+                            Err(e) => {
+                                error!("Failed to read decode response: {}", e);
+                                (StatusCode::INTERNAL_SERVER_ERROR, "Failed to read response")
+                                    .into_response()
+                            }
+                        }
+                    }
+                }
+                Err(e) => {
+                    error!(
+                        decode_url = %decode.url(),
+                        error = %e,
+                        "Decode request failed"
+                    );
+                    RouterMetrics::record_pd_decode_error(decode.url());
+                    (
+                        StatusCode::BAD_GATEWAY,
+                        format!("Decode server error: {}", e),
+                    )
+                        .into_response()
+                }
+            }
+        }
+    }
+
+    // Check if either prefill or decode policy needs request text
+    fn policies_need_request_text(&self) -> bool {
+        self.prefill_policy.needs_request_text() || self.decode_policy.needs_request_text()
+    }
+
+    // Select a pair of prefill and decode servers considering circuit breaker state
+    async fn select_pd_pair(
+        &self,
+        request_text: Option<&str>,
+    ) -> Result<(Box<dyn Worker>, Box<dyn Worker>), String> {
+        // Get read locks for both worker lists
+        let prefill_workers = self
+            .prefill_workers
+            .read()
+            .map_err(|e| format!("Failed to acquire prefill workers lock: {}", e))?;
+        let decode_workers = self
+            .decode_workers
+            .read()
+            .map_err(|e| format!("Failed to acquire decode workers lock: {}", e))?;
+
+        // Select workers using helper function
+        let prefill = Self::pick_worker_by_policy(
+            &prefill_workers,
+            &*self.prefill_policy,
+            request_text,
+            "prefill",
+        )?;
+
+        let decode = Self::pick_worker_by_policy(
+            &decode_workers,
+            &*self.decode_policy,
+            request_text,
+            "decode",
+        )?;
+
+        Ok((prefill, decode))
+    }
+
+    // Helper function to select a worker using the policy
+    fn pick_worker_by_policy(
+        workers: &[Box<dyn Worker>],
+        policy: &dyn LoadBalancingPolicy,
+        request_text: Option<&str>,
+        worker_type: &str,
+    ) -> Result<Box<dyn Worker>, String> {
+        // Check if we have any workers
+        if workers.is_empty() {
+            return Err(format!(
+                "No {} workers available. Please check if {} servers are configured and healthy.",
+                worker_type, worker_type
+            ));
+        }
+
+        // Filter available workers (healthy + circuit breaker not open)
+        let available_workers: Vec<Box<dyn Worker>> = workers
+            .iter()
+            .filter(|w| w.is_available())
+            .map(|w| w.clone_worker())
+            .collect();
+
+        if available_workers.is_empty() {
+            return Err(format!(
+                "No available {} workers (all circuits open or unhealthy)",
+                worker_type
+            ));
+        }
+
+        // Let policy select from available workers only
+        match policy.select_worker(&available_workers, request_text) {
+            Some(idx) => Ok(available_workers[idx].clone_worker()),
+            None => Err(format!("Policy could not select a {} worker", worker_type)),
+        }
+    }
+
+    // Background task to monitor worker loads with shared client
+    async fn monitor_worker_loads_with_client(
+        worker_urls: Vec<String>,
+        tx: tokio::sync::watch::Sender<HashMap<String, isize>>,
+        interval_secs: u64,
+        client: Client,
+        prefill_policy: Arc<dyn LoadBalancingPolicy>,
+        decode_policy: Arc<dyn LoadBalancingPolicy>,
+    ) {
+        loop {
+            let mut loads = HashMap::new();
+
+            let futures: Vec<_> = worker_urls
+                .iter()
+                .map(|url| {
+                    let client = client.clone();
+                    let url = url.clone();
+                    async move {
+                        let load = get_worker_load(&client, &url).await.unwrap_or(0);
+                        (url, load)
+                    }
+                })
+                .collect();
+
+            let results = futures_util::future::join_all(futures).await;
+
+            for (url, load) in results {
+                loads.insert(url, load);
+            }
+
+            debug!("Worker loads updated: {:?}", loads);
+
+            // Update both policies with current loads
+            prefill_policy.update_loads(&loads);
+            decode_policy.update_loads(&loads);
+
+            // Check if receiver is still active
+            if tx.send(loads).is_err() {
+                info!("Load monitor receiver dropped, shutting down monitor task");
+                break;
+            }
+
+            tokio::time::sleep(Duration::from_secs(interval_secs)).await;
+        }
+    }
+
+    // Helper to create a streaming response
+    #[allow(clippy::too_many_arguments)]
+    fn create_streaming_response(
+        &self,
+        stream: impl futures_util::Stream<Item = Result<bytes::Bytes, reqwest::Error>> + Send + 'static,
+        status: StatusCode,
+        prefill_logprobs: Option<Value>,
+        return_logprob: bool,
+        decode_url: Option<String>,
+        headers: Option<HeaderMap>,
+        prefill: &dyn Worker,
+        decode: &dyn Worker,
+    ) -> Response {
+        // For streaming, increment load now - will be decremented when streaming completes
+        prefill.increment_load();
+        decode.increment_load();
+
+        // Store URLs to find workers later for decrementing
+        let prefill_url = prefill.url().to_string();
+        let decode_url_str = decode.url().to_string();
+
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+
+        // Clone the worker collections for the spawned task
+        let prefill_workers = self.prefill_workers.clone();
+        let decode_workers = self.decode_workers.clone();
+
+        tokio::spawn(async move {
+            // Use a flag to track whether stream completed successfully
+            let mut stream_completed = false;
+
+            futures_util::pin_mut!(stream);
+            while let Some(chunk_result) = stream.next().await {
+                match chunk_result {
+                    Ok(chunk) => {
+                        // Check for stream end marker to decrement load early
+                        let is_done = chunk
+                            .as_ref()
+                            .windows(12)
+                            .any(|window| window == b"data: [DONE]");
+
+                        let result = if return_logprob && prefill_logprobs.is_some() {
+                            // Try to merge logprobs
+                            Self::merge_streaming_logprobs(prefill_logprobs.clone(), &chunk)
+                                .unwrap_or(chunk)
+                        } else {
+                            chunk
+                        };
+
+                        if tx.send(Ok(result)).is_err() {
+                            break;
+                        }
+
+                        // If we see the done marker, decrement load immediately
+                        if is_done {
+                            stream_completed = true;
+                            break;
+                        }
+                    }
+                    Err(e) => {
+                        if let Some(ref url) = decode_url {
+                            error!("Stream error from decode server {}: {}", url, e);
+                            RouterMetrics::record_pd_stream_error(url);
+                        }
+                        let _ = tx.send(Err(format!("Stream error: {}", e)));
+                        break;
+                    }
+                }
+            }
+
+            // Always decrement load after streaming (either completes or errors)
+            // Find and decrement prefill worker
+            if let Ok(prefill_workers_guard) = prefill_workers.read() {
+                for worker in prefill_workers_guard.iter() {
+                    if worker.url() == prefill_url.as_str() {
+                        worker.decrement_load();
+                        debug!(
+                            "Decremented load for prefill worker: {} (stream_completed: {})",
+                            prefill_url, stream_completed
+                        );
+                        break;
+                    }
+                }
+            }
+
+            // Find and decrement decode worker
+            if let Ok(decode_workers_guard) = decode_workers.read() {
+                for worker in decode_workers_guard.iter() {
+                    if worker.url() == decode_url_str.as_str() {
+                        worker.decrement_load();
+                        debug!(
+                            "Decremented load for decode worker: {} (stream_completed: {})",
+                            decode_url_str, stream_completed
+                        );
+                        break;
+                    }
+                }
+            }
+        });
+
+        let stream = UnboundedReceiverStream::new(rx);
+        let body = Body::from_stream(stream);
+
+        let mut response = Response::new(body);
+        *response.status_mut() = status;
+
+        // Use provided headers or create new ones, then ensure content-type is set for streaming
+        let mut headers = headers.unwrap_or_default();
+        headers.insert(CONTENT_TYPE, HeaderValue::from_static("text/event-stream"));
+        *response.headers_mut() = headers;
+
+        response
+    }
+
+    // Helper to process non-streaming decode response with logprob merging
+    async fn process_non_streaming_response(
+        &self,
+        res: reqwest::Response,
+        status: StatusCode,
+        return_logprob: bool,
+        prefill_body: Option<bytes::Bytes>,
+    ) -> Response {
+        let response = res.bytes().await;
+        let decode_body = match response {
+            Ok(decode_body) => decode_body,
+            Err(e) => {
+                error!("Failed to read decode response: {}", e);
+                return (StatusCode::INTERNAL_SERVER_ERROR, "Failed to read response")
+                    .into_response();
+            }
+        };
+
+        if !return_logprob {
+            return (status, decode_body).into_response();
+        }
+
+        let Some(prefill_body) = prefill_body else {
+            return (status, decode_body).into_response();
+        };
+
+        // Merge logprobs from prefill and decode
+        let (Ok(prefill_json), Ok(mut decode_json)) = (
+            serde_json::from_slice::<Value>(&prefill_body),
+            serde_json::from_slice::<Value>(&decode_body),
+        ) else {
+            warn!("Failed to parse responses for logprob merging");
+            return (status, decode_body).into_response();
+        };
+
+        Self::merge_logprobs_in_json(&prefill_json, &mut decode_json);
+
+        // Return merged response
+        match serde_json::to_vec(&decode_json) {
+            Ok(body) => (status, body).into_response(),
+            Err(e) => {
+                error!("Failed to serialize merged response: {}", e);
+                (status, decode_body).into_response()
+            }
+        }
+    }
+
+    // Helper to process prefill response and extract body if needed for logprobs
+    async fn process_prefill_response(
+        &self,
+        prefill_result: Result<reqwest::Response, reqwest::Error>,
+        prefill_url: &str,
+        return_logprob: bool,
+    ) -> Result<(StatusCode, Option<bytes::Bytes>), Response> {
+        // Check prefill result first - it's critical for disaggregated mode
+        let prefill_response = match prefill_result {
+            Ok(response) => response,
+            Err(e) => {
+                RouterMetrics::record_pd_prefill_error(prefill_url);
+                error!(
+                    "Prefill server failed (CRITICAL) prefill_url={} error={}. Decode will timeout without prefill KV cache.",
+                    prefill_url,
+                    e
+                );
+
+                // Return error immediately - don't wait for decode to timeout
+                return Err((
+                    StatusCode::BAD_GATEWAY,
+                    format!(
+                        "Prefill server error: {}. This will cause decode timeout.",
+                        e
+                    ),
+                )
+                    .into_response());
+            }
+        };
+
+        let prefill_status = StatusCode::from_u16(prefill_response.status().as_u16())
+            .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+
+        // Check if prefill succeeded
+        if !prefill_status.is_success() {
+            RouterMetrics::record_pd_prefill_error(prefill_url);
+
+            // Get error body from prefill
+            let error_msg = prefill_response
+                .text()
+                .await
+                .unwrap_or_else(|_| "Unknown prefill error".to_string());
+
+            error!(
+                "Prefill server returned error status prefill_url={} status={} body={}",
+                prefill_url, prefill_status, error_msg
+            );
+
+            return Err((
+                prefill_status,
+                format!("Prefill server error ({}): {}", prefill_status, error_msg),
+            )
+                .into_response());
+        }
+
+        // Read prefill body if needed for logprob merging
+        let prefill_body = if return_logprob {
+            match prefill_response.bytes().await {
+                Ok(body) => Some(body),
+                Err(e) => {
+                    warn!("Failed to read prefill response body for logprobs: {}", e);
+                    None
+                }
+            }
+        } else {
+            // For non-logprob requests, just consume the response without storing
+            debug!("Consuming prefill response body (non-logprob request)");
+            match prefill_response.bytes().await {
+                Ok(_) => debug!("Prefill response consumed successfully"),
+                Err(e) => warn!("Error consuming prefill response: {}", e),
+            }
+            None
+        };
+
+        Ok((prefill_status, prefill_body))
+    }
+
+    fn build_post_with_headers(
+        &self,
+        client: &Client,
+        url: &str,
+        route: &str,
+        json_request: &Value,
+        headers: Option<&HeaderMap>,
+        connection_close: bool,
+    ) -> reqwest::RequestBuilder {
+        let mut request = client.post(api_path(url, route)).json(json_request);
+        if connection_close {
+            request = request.header("Connection", "close");
+        }
+        if let Some(headers) = headers {
+            for (name, value) in headers.iter() {
+                let name_lc = name.as_str().to_ascii_lowercase();
+                // Whitelist important end-to-end headers, skip hop-by-hop
+                let forward = matches!(
+                    name_lc.as_str(),
+                    "authorization" | "x-request-id" | "x-correlation-id"
+                ) || name_lc.starts_with("x-request-id-");
+                if forward {
+                    if let Ok(val) = value.to_str() {
+                        request = request.header(name, val);
+                    }
+                }
+            }
+        }
+        request
+    }
+
+    // Helper to merge logprobs from prefill and decode responses
+    fn merge_logprobs_in_json(prefill_json: &Value, decode_json: &mut Value) -> bool {
+        if let (Some(prefill_meta), Some(decode_meta)) = (
+            prefill_json.get("meta_info"),
+            decode_json.get_mut("meta_info"),
+        ) {
+            if let (Some(prefill_logprobs), Some(decode_logprobs)) = (
+                prefill_meta.get("input_token_logprobs"),
+                decode_meta.get_mut("input_token_logprobs"),
+            ) {
+                if let (Some(prefill_arr), Some(decode_arr)) =
+                    (prefill_logprobs.as_array(), decode_logprobs.as_array_mut())
+                {
+                    let mut merged = prefill_arr.clone();
+                    merged.extend(decode_arr.clone());
+                    decode_meta["input_token_logprobs"] = Value::Array(merged);
+                    return true;
+                }
+            }
+        }
+        false
+    }
+
+    // Simple helper to merge logprobs in streaming responses
+    fn merge_streaming_logprobs(
+        prefill_logprobs: Option<Value>,
+        decode_chunk: &[u8],
+    ) -> Result<bytes::Bytes, ()> {
+        // Skip non-data chunks
+        let chunk_str = std::str::from_utf8(decode_chunk).map_err(|_| ())?;
+        if !chunk_str.starts_with("data: ") || chunk_str.contains("[DONE]") {
+            return Err(());
+        }
+
+        // Parse JSON from chunk
+        let json_str = chunk_str.trim_start_matches("data: ").trim();
+        let mut decode_json: Value = serde_json::from_str(json_str).map_err(|_| ())?;
+
+        // Merge prefill logprobs if available
+        if let Some(ref p_logprobs) = prefill_logprobs {
+            if let Some(meta) = decode_json.get_mut("meta_info") {
+                if let Some(d_logprobs) = meta.get_mut("input_token_logprobs") {
+                    if let (Some(p_arr), Some(d_arr)) =
+                        (p_logprobs.as_array(), d_logprobs.as_array())
+                    {
+                        let mut merged = p_arr.clone();
+                        merged.extend(d_arr.clone());
+                        *d_logprobs = Value::Array(merged);
+                    }
+                }
+            }
+        }
+
+        // Re-serialize
+        let merged_str = format!(
+            "data: {}\n\n",
+            serde_json::to_string(&decode_json).unwrap_or_default()
+        );
+        Ok(bytes::Bytes::from(merged_str))
+    }
+}
+
+// Helper functions
+
+async fn get_worker_load(client: &Client, worker_url: &str) -> Option<isize> {
+    match client.get(format!("{}/get_load", worker_url)).send().await {
+        Ok(res) if res.status().is_success() => match res.bytes().await {
+            Ok(bytes) => match serde_json::from_slice::<Value>(&bytes) {
+                Ok(data) => data
+                    .get("load")
+                    .and_then(|v| v.as_i64())
+                    .map(|v| v as isize),
+                Err(e) => {
+                    debug!("Failed to parse load response from {}: {}", worker_url, e);
+                    None
+                }
+            },
+            Err(e) => {
+                debug!("Failed to read load response from {}: {}", worker_url, e);
+                None
+            }
+        },
+        Ok(res) => {
+            debug!(
+                "Worker {} returned non-success status: {}",
+                worker_url,
+                res.status()
+            );
+            None
+        }
+        Err(e) => {
+            debug!("Failed to get load from {}: {}", worker_url, e);
+            None
+        }
+    }
+}
+
+#[async_trait]
+impl WorkerManagement for PDRouter {
+    async fn add_worker(&self, _worker_url: &str) -> Result<String, String> {
+        // For PD router, we don't support adding workers via this generic method
+        Err(
+            "PD router requires specific add_prefill_server or add_decode_server methods"
+                .to_string(),
+        )
+    }
+
+    fn remove_worker(&self, worker_url: &str) {
+        // For PD router, we would need to know if it's a prefill or decode server
+        // For now, try both
+        if let Ok(mut workers) = self.prefill_workers.write() {
+            if let Some(index) = workers.iter().position(|w| w.url() == worker_url) {
+                workers.remove(index);
+                info!("Removed prefill worker: {}", worker_url);
+                return;
+            }
+        }
+
+        if let Ok(mut workers) = self.decode_workers.write() {
+            if let Some(index) = workers.iter().position(|w| w.url() == worker_url) {
+                workers.remove(index);
+                info!("Removed decode worker: {}", worker_url);
+            }
+        }
+    }
+
+    fn get_worker_urls(&self) -> Vec<String> {
+        let mut urls = Vec::new();
+
+        // Add prefill worker URLs
+        if let Ok(workers) = self.prefill_workers.read() {
+            for worker in workers.iter() {
+                urls.push(worker.url().to_string());
+            }
+        }
+
+        // Add decode worker URLs
+        if let Ok(workers) = self.decode_workers.read() {
+            for worker in workers.iter() {
+                urls.push(worker.url().to_string());
+            }
+        }
+
+        urls
+    }
+}
+
+#[async_trait]
+impl RouterTrait for PDRouter {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    async fn health(&self, _req: Request<Body>) -> Response {
+        // This is a server readiness check - checking if we have healthy workers
+        // Workers handle their own health checks in the background
+        let mut all_healthy = true;
+        let mut unhealthy_servers = Vec::new();
+
+        // Check prefill servers
+        for worker in self.prefill_workers.read().unwrap().iter() {
+            if !worker.is_healthy() {
+                all_healthy = false;
+                unhealthy_servers.push(format!("Prefill: {}", worker.url()));
+            }
+        }
+
+        // Check decode servers
+        for worker in self.decode_workers.read().unwrap().iter() {
+            if !worker.is_healthy() {
+                all_healthy = false;
+                unhealthy_servers.push(format!("Decode: {}", worker.url()));
+            }
+        }
+
+        if all_healthy {
+            (StatusCode::OK, "All servers healthy").into_response()
+        } else {
+            (
+                StatusCode::SERVICE_UNAVAILABLE,
+                format!("Unhealthy servers: {:?}", unhealthy_servers),
+            )
+                .into_response()
+        }
+    }
+
+    async fn health_generate(&self, _req: Request<Body>) -> Response {
+        // Test model generation capability by selecting a random pair and testing them
+        // Note: This endpoint actually causes the model to generate tokens, so we only test one pair
+
+        // Select a random worker pair using the policy
+        let (prefill, decode) = match self.select_pd_pair(None).await {
+            Ok(pair) => pair,
+            Err(e) => {
+                return (
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    format!("No healthy worker pair available: {}", e),
+                )
+                    .into_response();
+            }
+        };
+
+        // Test prefill server's health_generate
+        let prefill_url = format!("{}/health_generate", prefill.url());
+        let (prefill_result, decode_result) = tokio::join!(
+            self.client.get(&prefill_url).send(),
+            self.client
+                .get(format!("{}/health_generate", decode.url()))
+                .send()
+        );
+
+        // Check results
+        let mut errors = Vec::new();
+
+        match prefill_result {
+            Ok(res) if res.status().is_success() => {
+                debug!(
+                    "Health generate passed for prefill server: {}",
+                    prefill.url()
+                );
+            }
+            Ok(res) => {
+                errors.push(format!(
+                    "Prefill {} returned status {}",
+                    prefill.url(),
+                    res.status()
+                ));
+            }
+            Err(e) => {
+                errors.push(format!("Prefill {} error: {}", prefill.url(), e));
+            }
+        }
+
+        match decode_result {
+            Ok(res) if res.status().is_success() => {
+                debug!("Health generate passed for decode server: {}", decode.url());
+            }
+            Ok(res) => {
+                errors.push(format!(
+                    "Decode {} returned status {}",
+                    decode.url(),
+                    res.status()
+                ));
+            }
+            Err(e) => {
+                errors.push(format!("Decode {} error: {}", decode.url(), e));
+            }
+        }
+
+        if errors.is_empty() {
+            (
+                StatusCode::OK,
+                format!(
+                    "Health generate passed on selected pair: prefill={}, decode={}",
+                    prefill.url(),
+                    decode.url()
+                ),
+            )
+                .into_response()
+        } else {
+            (
+                StatusCode::SERVICE_UNAVAILABLE,
+                format!("Health generate failed: {:?}", errors),
+            )
+                .into_response()
+        }
+    }
+
+    async fn get_server_info(&self, _req: Request<Body>) -> Response {
+        // Get info from the first decode server to match sglang's server info format
+        // Note: We use decode workers for server info to match expected format
+        self.proxy_to_first_worker(&self.decode_workers, "get_server_info", "decode", None)
+            .await
+    }
+
+    async fn get_models(&self, req: Request<Body>) -> Response {
+        // Extract headers first to avoid Send issues
+        let headers = header_utils::copy_request_headers(&req);
+
+        // Proxy to first prefill worker
+        self.proxy_to_first_worker(&self.prefill_workers, "v1/models", "prefill", Some(headers))
+            .await
+    }
+
+    async fn get_model_info(&self, req: Request<Body>) -> Response {
+        // Extract headers first to avoid Send issues
+        let headers = header_utils::copy_request_headers(&req);
+
+        // Proxy to first prefill worker
+        self.proxy_to_first_worker(
+            &self.prefill_workers,
+            "get_model_info",
+            "prefill",
+            Some(headers),
+        )
+        .await
+    }
+
+    async fn route_generate(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &GenerateRequest,
+    ) -> Response {
+        // Extract parameters
+        let is_stream = body.stream;
+        let return_logprob = body.return_logprob;
+
+        // Extract text for cache-aware routing
+        let request_text = if self.policies_need_request_text() {
+            body.text
+                .as_deref()
+                .or_else(|| {
+                    body.prompt.as_ref().and_then(|p| match p {
+                        StringOrArray::String(s) => Some(s.as_str()),
+                        StringOrArray::Array(v) => v.first().map(|s| s.as_str()),
+                    })
+                })
+                .map(|s| s.to_string())
+        } else {
+            None
+        };
+
+        // Calculate batch size
+        let batch_size = Self::get_generate_batch_size(body);
+
+        // Create context
+        let context = PDRequestContext {
+            route: "/generate",
+            batch_size,
+            is_stream,
+            return_logprob,
+            request_text,
+        };
+
+        // Execute with retry and bootstrap injection
+        self.execute_dual_dispatch(headers, body, context).await
+    }
+
+    async fn route_chat(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+    ) -> Response {
+        // Extract parameters
+        let is_stream = body.stream;
+        let return_logprob = body.logprobs;
+
+        // Extract text for cache-aware routing
+        let request_text = if self.policies_need_request_text() {
+            body.messages.first().and_then(|msg| match msg {
+                ChatMessage::User { content, .. } => match content {
+                    UserMessageContent::Text(text) => Some(text.clone()),
+                    UserMessageContent::Parts(_) => None,
+                },
+                ChatMessage::System { content, .. } => Some(content.clone()),
+                _ => None,
+            })
+        } else {
+            None
+        };
+
+        // Calculate batch size
+        let batch_size = Self::get_chat_batch_size(body);
+
+        // Create context
+        let context = PDRequestContext {
+            route: "/v1/chat/completions",
+            batch_size,
+            is_stream,
+            return_logprob,
+            request_text,
+        };
+
+        // Execute with retry and bootstrap injection
+        self.execute_dual_dispatch(headers, body, context).await
+    }
+
+    async fn route_completion(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &CompletionRequest,
+    ) -> Response {
+        // Extract parameters
+        let is_stream = body.stream;
+        let return_logprob = body.logprobs.is_some();
+
+        // Extract text for cache-aware routing
+        let request_text = if self.policies_need_request_text() {
+            match &body.prompt {
+                StringOrArray::String(s) => Some(s.clone()),
+                StringOrArray::Array(v) => v.first().map(|s| s.to_string()),
+            }
+        } else {
+            None
+        };
+
+        // Calculate batch size
+        let batch_size = Self::get_completion_batch_size(body);
+
+        // Create context
+        let context = PDRequestContext {
+            route: "/v1/completions",
+            batch_size,
+            is_stream,
+            return_logprob,
+            request_text,
+        };
+
+        // Execute with retry and bootstrap injection
+        self.execute_dual_dispatch(headers, body, context).await
+    }
+
+    async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        todo!()
+    }
+
+    async fn route_rerank(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        todo!()
+    }
+
+    async fn flush_cache(&self) -> Response {
+        // Process both prefill and decode workers
+        let (prefill_results, prefill_errors) = self
+            .process_workers(&self.prefill_workers, "Prefill", "flush_cache")
+            .await;
+        let (decode_results, decode_errors) = self
+            .process_workers(&self.decode_workers, "Decode", "flush_cache")
+            .await;
+
+        // Combine results and errors
+        let mut results = prefill_results;
+        results.extend(decode_results);
+        let mut errors = prefill_errors;
+        errors.extend(decode_errors);
+
+        if errors.is_empty() {
+            (
+                StatusCode::OK,
+                format!("Cache flushed successfully: {:?}", results),
+            )
+                .into_response()
+        } else {
+            (
+                StatusCode::PARTIAL_CONTENT,
+                format!(
+                    "Partial success. Results: {:?}, Errors: {:?}",
+                    results, errors
+                ),
+            )
+                .into_response()
+        }
+    }
+
+    async fn get_worker_loads(&self) -> Response {
+        let mut loads = HashMap::new();
+        let mut errors = Vec::new();
+
+        // Process prefill workers
+        match Self::get_worker_urls(&self.prefill_workers, "prefill") {
+            Ok(urls) => {
+                for worker_url in urls {
+                    match get_worker_load(&self.client, &worker_url).await {
+                        Some(load) => {
+                            loads.insert(format!("prefill_{}", worker_url), load);
+                        }
+                        None => {
+                            errors.push(format!("Failed to get load from prefill {}", worker_url));
+                        }
+                    }
+                }
+            }
+            Err(e) => errors.push(e),
+        }
+
+        // Process decode workers
+        match Self::get_worker_urls(&self.decode_workers, "decode") {
+            Ok(urls) => {
+                for worker_url in urls {
+                    match get_worker_load(&self.client, &worker_url).await {
+                        Some(load) => {
+                            loads.insert(format!("decode_{}", worker_url), load);
+                        }
+                        None => {
+                            errors.push(format!("Failed to get load from decode {}", worker_url));
+                        }
+                    }
+                }
+            }
+            Err(e) => errors.push(e),
+        }
+
+        let response_data = serde_json::json!({
+            "loads": loads,
+            "errors": errors
+        });
+
+        (StatusCode::OK, Json(response_data)).into_response()
+    }
+
+    fn router_type(&self) -> &'static str {
+        "pd"
+    }
+
+    fn readiness(&self) -> Response {
+        // PD router is ready if it has at least one healthy prefill AND one healthy decode worker
+        let healthy_prefill_count = self
+            .prefill_workers
+            .read()
+            .unwrap()
+            .iter()
+            .filter(|w| w.is_healthy())
+            .count();
+
+        let healthy_decode_count = self
+            .decode_workers
+            .read()
+            .unwrap()
+            .iter()
+            .filter(|w| w.is_healthy())
+            .count();
+
+        let total_prefill = self.prefill_workers.read().unwrap().len();
+        let total_decode = self.decode_workers.read().unwrap().len();
+
+        if healthy_prefill_count > 0 && healthy_decode_count > 0 {
+            Json(json!({
+                "status": "ready",
+                "prefill": {
+                    "healthy": healthy_prefill_count,
+                    "total": total_prefill
+                },
+                "decode": {
+                    "healthy": healthy_decode_count,
+                    "total": total_decode
+                }
+            }))
+            .into_response()
+        } else {
+            let mut reasons = Vec::new();
+            if healthy_prefill_count == 0 {
+                reasons.push("no healthy prefill workers");
+            }
+            if healthy_decode_count == 0 {
+                reasons.push("no healthy decode workers");
+            }
+
+            (
+                StatusCode::SERVICE_UNAVAILABLE,
+                Json(serde_json::json!({
+                    "status": "not_ready",
+                    "reason": reasons.join(", "),
+                    "prefill": {
+                        "healthy": healthy_prefill_count,
+                        "total": total_prefill
+                    },
+                    "decode": {
+                        "healthy": healthy_decode_count,
+                        "total": total_decode
+                    }
+                })),
+            )
+                .into_response()
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::{BasicWorker, WorkerType};
+    use crate::policies::RandomPolicy;
+
+    fn create_test_pd_router() -> PDRouter {
+        let prefill_policy = Arc::new(RandomPolicy::new());
+        let decode_policy = Arc::new(RandomPolicy::new());
+
+        PDRouter {
+            prefill_workers: Arc::new(RwLock::new(vec![])),
+            decode_workers: Arc::new(RwLock::new(vec![])),
+            prefill_policy,
+            decode_policy,
+            worker_startup_timeout_secs: 5,
+            worker_startup_check_interval_secs: 1,
+            worker_loads: Arc::new(tokio::sync::watch::channel(HashMap::new()).1),
+            load_monitor_handle: None,
+            client: Client::new(),
+            prefill_client: Client::new(),
+            prefill_drain_tx: mpsc::channel(100).0,
+            retry_config: RetryConfig::default(),
+            circuit_breaker_config: CircuitBreakerConfig::default(),
+            _prefill_health_checker: None,
+            _decode_health_checker: None,
+        }
+    }
+
+    fn create_test_worker(url: String, worker_type: WorkerType, healthy: bool) -> Box<dyn Worker> {
+        let worker = BasicWorker::new(url, worker_type);
+        worker.set_healthy(healthy);
+        Box::new(worker)
+    }
+
+    // ============= Worker Management Tests =============
+
+    #[tokio::test]
+    async fn test_add_prefill_server_already_exists() {
+        let router = create_test_pd_router();
+
+        // Add a worker first
+        let worker = create_test_worker(
+            "http://localhost:8000".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(8080),
+            },
+            true,
+        );
+        router.prefill_workers.write().unwrap().push(worker);
+
+        // Try to add the same URL again - this would fail during health check in real scenario
+        // For unit test, we test the duplicate check logic
+        let workers = router.prefill_workers.read().unwrap();
+        let exists = workers.iter().any(|w| w.url() == "http://localhost:8000");
+        assert!(exists);
+    }
+
+    #[tokio::test]
+    async fn test_remove_prefill_server_success() {
+        let router = create_test_pd_router();
+
+        // Add servers first
+        let worker1 = create_test_worker(
+            "http://worker1".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: None,
+            },
+            true,
+        );
+        let worker2 = create_test_worker(
+            "http://worker2".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: Some(8080),
+            },
+            true,
+        );
+
+        router.prefill_workers.write().unwrap().push(worker1);
+        router.prefill_workers.write().unwrap().push(worker2);
+
+        // Remove one
+        let result = router.remove_prefill_server("http://worker1").await;
+
+        assert!(result.is_ok());
+        assert!(result.unwrap().contains("Successfully removed"));
+
+        let workers = router.prefill_workers.read().unwrap();
+        assert_eq!(workers.len(), 1);
+        assert_eq!(workers[0].url(), "http://worker2");
+    }
+
+    #[tokio::test]
+    async fn test_remove_prefill_server_not_found() {
+        let router = create_test_pd_router();
+
+        let result = router.remove_prefill_server("http://nonexistent").await;
+
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            PDRouterError::WorkerNotFound { url } => {
+                assert_eq!(url, "http://nonexistent");
+            }
+            _ => panic!("Expected WorkerNotFound error"),
+        }
+    }
+
+    #[tokio::test]
+    async fn test_remove_decode_server_success() {
+        let router = create_test_pd_router();
+
+        // Add server first
+        let worker = create_test_worker("http://decode1".to_string(), WorkerType::Decode, true);
+        router.decode_workers.write().unwrap().push(worker);
+
+        let result = router.remove_decode_server("http://decode1").await;
+
+        assert!(result.is_ok());
+        assert!(result.unwrap().contains("Successfully removed"));
+
+        let workers = router.decode_workers.read().unwrap();
+        assert_eq!(workers.len(), 0);
+    }
+
+    // ============= Lock Error Handling Tests =============
+
+    #[test]
+    fn test_lock_operations() {
+        let router = create_test_pd_router();
+
+        // Test read/write locks work correctly
+        {
+            let read_guard = router.prefill_workers.read().unwrap();
+            assert_eq!(read_guard.len(), 0);
+        }
+
+        {
+            let mut write_guard = router.prefill_workers.write().unwrap();
+            write_guard.push(create_test_worker(
+                "http://test".to_string(),
+                WorkerType::Prefill {
+                    bootstrap_port: None,
+                },
+                true,
+            ));
+        }
+
+        {
+            let read_guard = router.prefill_workers.read().unwrap();
+            assert_eq!(read_guard.len(), 1);
+        }
+    }
+
+    // ============= Bootstrap Injection Tests =============
+    // Note: These tests are commented out as we've moved to the optimized bootstrap injection
+    // approach that doesn't use the Bootstrap trait on GenerateReqInput anymore.
+
+    // TODO: Add new tests for the optimized bootstrap injection approach using
+    // RequestWithBootstrap and BatchRequestWithBootstrap wrappers
+
+    // ============= Worker Selection Tests =============
+
+    #[tokio::test]
+    async fn test_select_healthy_prefill_worker() {
+        let router = create_test_pd_router();
+
+        // Add mix of healthy and unhealthy workers
+        let healthy_worker = create_test_worker(
+            "http://healthy".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: None,
+            },
+            true,
+        );
+        let unhealthy_worker = create_test_worker(
+            "http://unhealthy".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: None,
+            },
+            false,
+        );
+        let decode_worker =
+            create_test_worker("http://decode".to_string(), WorkerType::Decode, true);
+
+        router
+            .prefill_workers
+            .write()
+            .unwrap()
+            .push(unhealthy_worker);
+        router.prefill_workers.write().unwrap().push(healthy_worker);
+        router.decode_workers.write().unwrap().push(decode_worker);
+
+        let result = router.select_pd_pair(None).await;
+
+        assert!(result.is_ok());
+        let (prefill, _decode) = result.unwrap();
+
+        // Should select the healthy worker
+        assert_eq!(prefill.url(), "http://healthy");
+        assert!(prefill.is_healthy());
+    }
+
+    #[tokio::test]
+    async fn test_empty_worker_lists() {
+        let router = create_test_pd_router();
+
+        let result = router.select_pd_pair(None).await;
+
+        assert!(result.is_err());
+        assert!(result.unwrap_err().contains("No prefill workers available"));
+    }
+
+    // ============= Health Endpoints Tests =============
+
+    #[tokio::test]
+    async fn test_health_endpoints() {
+        let router = create_test_pd_router();
+
+        // Add healthy workers
+        let prefill_worker = create_test_worker(
+            "http://localhost:8000".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: None,
+            },
+            true,
+        );
+        let decode_worker = create_test_worker(
+            "http://localhost:8001".to_string(),
+            WorkerType::Decode,
+            true,
+        );
+
+        router.prefill_workers.write().unwrap().push(prefill_worker);
+        router.decode_workers.write().unwrap().push(decode_worker);
+
+        // Test health endpoint
+        let http_req = axum::http::Request::builder()
+            .body(axum::body::Body::empty())
+            .unwrap();
+        let response = router.health(http_req).await;
+
+        assert_eq!(response.status(), 200);
+
+        // Test readiness endpoint
+        let response = router.readiness();
+        assert_eq!(response.status(), 200);
+    }
+
+    // ============= Load Monitoring Tests =============
+
+    #[tokio::test]
+    async fn test_load_monitor_updates() {
+        let power_of_two_policy = Arc::new(crate::policies::PowerOfTwoPolicy::new());
+        let mut router = create_test_pd_router();
+        router.prefill_policy = power_of_two_policy.clone();
+        router.decode_policy = power_of_two_policy;
+
+        // Create load channel
+        let (tx, rx) = tokio::sync::watch::channel(HashMap::new());
+        router.worker_loads = Arc::new(rx);
+
+        // Simulate load updates
+        let mut loads = HashMap::new();
+        loads.insert("http://worker1".to_string(), 10);
+        loads.insert("http://worker2".to_string(), 5);
+
+        let _ = tx.send(loads.clone());
+
+        // Router should receive updates
+        let received = router.worker_loads.borrow().clone();
+        assert_eq!(received.get("http://worker1"), Some(&10));
+        assert_eq!(received.get("http://worker2"), Some(&5));
+    }
+
+    // ============= Worker Load Tests =============
+
+    #[test]
+    fn test_worker_load_metrics() {
+        let prefill_worker = create_test_worker(
+            "http://prefill".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: None,
+            },
+            true,
+        );
+        let decode_worker =
+            create_test_worker("http://decode".to_string(), WorkerType::Decode, true);
+
+        // Create load guard for both workers
+        let _guard =
+            WorkerLoadGuard::new_multi(vec![prefill_worker.as_ref(), decode_worker.as_ref()]);
+
+        // Load should be incremented
+        assert_eq!(prefill_worker.load(), 1);
+        assert_eq!(decode_worker.load(), 1);
+
+        // Drop guard - load should decrement
+        drop(_guard);
+
+        assert_eq!(prefill_worker.load(), 0);
+        assert_eq!(decode_worker.load(), 0);
+    }
+
+    #[tokio::test]
+    async fn test_streaming_load_tracking() {
+        use futures_util::StreamExt;
+        use tokio::time::{sleep, Duration};
+
+        let router = create_test_pd_router();
+
+        // Add workers
+        let prefill_worker = create_test_worker(
+            "http://prefill".to_string(),
+            WorkerType::Prefill {
+                bootstrap_port: None,
+            },
+            true,
+        );
+        let decode_worker =
+            create_test_worker("http://decode".to_string(), WorkerType::Decode, true);
+
+        router.prefill_workers.write().unwrap().push(prefill_worker);
+        router.decode_workers.write().unwrap().push(decode_worker);
+
+        // Get references to the workers - clone to avoid holding lock across await
+        let (prefill_ref, decode_ref) = {
+            let workers = router.prefill_workers.read().unwrap();
+            let prefill = workers[0].clone_worker();
+            drop(workers);
+            let workers = router.decode_workers.read().unwrap();
+            let decode = workers[0].clone_worker();
+            (prefill, decode)
+        };
+
+        // Initially load should be 0
+        assert_eq!(prefill_ref.load(), 0);
+        assert_eq!(decode_ref.load(), 0);
+
+        // Create a mock streaming response
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+        let stream = tokio_stream::wrappers::UnboundedReceiverStream::new(rx);
+
+        // Call create_streaming_response which should increment load
+        let _response = router.create_streaming_response(
+            stream.map(Ok),
+            StatusCode::OK,
+            None,
+            false,
+            None,
+            None,
+            prefill_ref.as_ref(),
+            decode_ref.as_ref(),
+        );
+
+        // Load should be incremented immediately
+        assert_eq!(prefill_ref.load(), 1);
+        assert_eq!(decode_ref.load(), 1);
+
+        // Send some data through the stream
+        tx.send(bytes::Bytes::from("test data")).unwrap();
+
+        // Give time for the spawned task to process
+        sleep(Duration::from_millis(10)).await;
+
+        // Load should still be 1 (streaming in progress)
+        assert_eq!(prefill_ref.load(), 1);
+        assert_eq!(decode_ref.load(), 1);
+
+        // Close the stream
+        drop(tx);
+
+        // Give time for cleanup
+        sleep(Duration::from_millis(100)).await;
+
+        // Load should be decremented after streaming completes
+        assert_eq!(prefill_ref.load(), 0);
+        assert_eq!(decode_ref.load(), 0);
+    }
+
+    // ============= Concurrent Operations Tests =============
+
+    #[tokio::test]
+    async fn test_concurrent_worker_operations() {
+        let router = Arc::new(create_test_pd_router());
+
+        let mut handles = vec![];
+
+        // Spawn tasks to add workers
+        for i in 0..5 {
+            let router_clone = Arc::clone(&router);
+            let url = format!("http://worker{}", i);
+            let handle = tokio::spawn(async move {
+                let worker = create_test_worker(
+                    url,
+                    WorkerType::Prefill {
+                        bootstrap_port: None,
+                    },
+                    true,
+                );
+                router_clone.prefill_workers.write().unwrap().push(worker);
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all tasks
+        for handle in handles {
+            let _ = handle.await;
+        }
+
+        // Check final state
+        let workers = router.prefill_workers.read().unwrap();
+        assert_eq!(workers.len(), 5);
+    }
+}
diff --git a/sgl-router/src/routers/http/pd_types.rs b/sgl-router/src/routers/http/pd_types.rs
new file mode 100644
index 000000000..a2b28a57d
--- /dev/null
+++ b/sgl-router/src/routers/http/pd_types.rs
@@ -0,0 +1,81 @@
+// Custom error type for PD router operations
+#[derive(Debug, thiserror::Error)]
+pub enum PDRouterError {
+    #[error("Worker already exists: {url}")]
+    WorkerAlreadyExists { url: String },
+
+    #[error("Worker not found: {url}")]
+    WorkerNotFound { url: String },
+
+    #[error("Lock acquisition failed: {operation}")]
+    LockError { operation: String },
+
+    #[error("Health check failed for worker: {url}")]
+    HealthCheckFailed { url: String },
+
+    #[error("Invalid worker configuration: {reason}")]
+    InvalidConfiguration { reason: String },
+
+    #[error("Network error: {message}")]
+    NetworkError { message: String },
+
+    #[error("Timeout waiting for worker: {url}")]
+    Timeout { url: String },
+}
+
+// Helper functions for workers
+pub fn api_path(url: &str, api_path: &str) -> String {
+    if api_path.starts_with("/") {
+        format!("{}{}", url, api_path)
+    } else {
+        format!("{}/{}", url, api_path)
+    }
+}
+
+pub fn get_hostname(url: &str) -> String {
+    // Simple hostname extraction without external dependencies
+    let url = url
+        .trim_start_matches("http://")
+        .trim_start_matches("https://");
+    url.split(':').next().unwrap_or("localhost").to_string()
+}
+
+use serde::Serialize;
+
+// Optimized bootstrap wrapper for single requests
+#[derive(Serialize)]
+pub struct RequestWithBootstrap<'a, T: Serialize> {
+    #[serde(flatten)]
+    pub original: &'a T,
+    pub bootstrap_host: String,
+    pub bootstrap_port: Option<u16>,
+    pub bootstrap_room: u64,
+}
+
+// Optimized bootstrap wrapper for batch requests
+#[derive(Serialize)]
+pub struct BatchRequestWithBootstrap<'a, T: Serialize> {
+    #[serde(flatten)]
+    pub original: &'a T,
+    pub bootstrap_host: Vec<String>,
+    pub bootstrap_port: Vec<Option<u16>>,
+    pub bootstrap_room: Vec<u64>,
+}
+
+// Helper to generate bootstrap room ID
+pub fn generate_room_id() -> u64 {
+    // Generate a value in the range [0, 2^63 - 1] to match Python's random.randint(0, 2**63 - 1)
+    rand::random::<u64>() & (i64::MAX as u64)
+}
+
+// PD-specific routing policies
+#[derive(Debug, Clone, PartialEq)]
+pub enum PDSelectionPolicy {
+    Random,
+    PowerOfTwo,
+    CacheAware {
+        cache_threshold: f32,
+        balance_abs_threshold: usize,
+        balance_rel_threshold: f32,
+    },
+}
diff --git a/sgl-router/src/routers/http/router.rs b/sgl-router/src/routers/http/router.rs
new file mode 100644
index 000000000..176c38602
--- /dev/null
+++ b/sgl-router/src/routers/http/router.rs
@@ -0,0 +1,1387 @@
+use crate::config::types::RetryConfig;
+use crate::core::{
+    is_retryable_status, BasicWorker, CircuitBreakerConfig, HealthChecker, HealthConfig,
+    RetryExecutor, Worker, WorkerFactory, WorkerType,
+};
+use crate::metrics::RouterMetrics;
+use crate::policies::LoadBalancingPolicy;
+use crate::protocols::spec::{
+    ChatCompletionRequest, CompletionRequest, GenerateRequest, GenerationRequest,
+};
+use crate::routers::header_utils;
+use crate::routers::{RouterTrait, WorkerManagement};
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{header::CONTENT_LENGTH, header::CONTENT_TYPE, HeaderMap, HeaderValue, StatusCode},
+    response::{IntoResponse, Response},
+    Json,
+};
+use futures_util::StreamExt;
+use reqwest::Client;
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+use std::time::{Duration, Instant};
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{debug, error, info, warn};
+
+/// Regular router that uses injected load balancing policies
+#[derive(Debug)]
+pub struct Router {
+    workers: Arc<RwLock<Vec<Box<dyn Worker>>>>,
+    policy: Arc<dyn LoadBalancingPolicy>,
+    client: Client,
+    worker_startup_timeout_secs: u64,
+    worker_startup_check_interval_secs: u64,
+    dp_aware: bool,
+    api_key: Option<String>,
+    retry_config: RetryConfig,
+    circuit_breaker_config: CircuitBreakerConfig,
+    _worker_loads: Arc<tokio::sync::watch::Receiver<HashMap<String, isize>>>,
+    _load_monitor_handle: Option<Arc<tokio::task::JoinHandle<()>>>,
+    _health_checker: Option<HealthChecker>,
+}
+
+impl Router {
+    /// Create a new router with injected policy and client
+    #[allow(clippy::too_many_arguments)]
+    pub async fn new(
+        worker_urls: Vec<String>,
+        policy: Arc<dyn LoadBalancingPolicy>,
+        ctx: &Arc<crate::server::AppContext>,
+    ) -> Result<Self, String> {
+        // Update active workers gauge
+        RouterMetrics::set_active_workers(worker_urls.len());
+
+        // Wait for workers to be healthy (skip if empty - for service discovery mode)
+        if !worker_urls.is_empty() {
+            Self::wait_for_healthy_workers(
+                &worker_urls,
+                ctx.router_config.worker_startup_timeout_secs,
+                ctx.router_config.worker_startup_check_interval_secs,
+            )
+            .await?;
+        }
+
+        let worker_urls = if ctx.router_config.dp_aware {
+            // worker address now in the format of "http://host:port@dp_rank"
+            Self::get_dp_aware_workers(&worker_urls, &ctx.router_config.api_key)
+                .map_err(|e| format!("Failed to get dp-aware workers: {}", e))?
+        } else {
+            worker_urls
+        };
+
+        // Convert config CircuitBreakerConfig to core CircuitBreakerConfig
+        let circuit_breaker_config = ctx.router_config.effective_circuit_breaker_config();
+        let core_cb_config = CircuitBreakerConfig {
+            failure_threshold: circuit_breaker_config.failure_threshold,
+            success_threshold: circuit_breaker_config.success_threshold,
+            timeout_duration: Duration::from_secs(circuit_breaker_config.timeout_duration_secs),
+            window_duration: Duration::from_secs(circuit_breaker_config.window_duration_secs),
+        };
+
+        // Create Worker trait objects from URLs with health check config
+        let workers: Vec<Box<dyn Worker>> = worker_urls
+            .iter()
+            .map(|url| {
+                let worker = BasicWorker::new(url.clone(), WorkerType::Regular)
+                    .with_circuit_breaker_config(core_cb_config.clone())
+                    .with_health_config(HealthConfig {
+                        timeout_secs: ctx.router_config.health_check.timeout_secs,
+                        check_interval_secs: ctx.router_config.health_check.check_interval_secs,
+                        endpoint: ctx.router_config.health_check.endpoint.clone(),
+                        failure_threshold: ctx.router_config.health_check.failure_threshold,
+                        success_threshold: ctx.router_config.health_check.success_threshold,
+                    });
+                Box::new(worker) as Box<dyn Worker>
+            })
+            .collect();
+
+        // Initialize policy with workers if needed (e.g., for cache-aware)
+        if let Some(cache_aware) = policy
+            .as_any()
+            .downcast_ref::<crate::policies::CacheAwarePolicy>()
+        {
+            cache_aware.init_workers(&workers);
+        }
+
+        let workers = Arc::new(RwLock::new(workers));
+        let health_checker = crate::core::start_health_checker(
+            Arc::clone(&workers),
+            ctx.router_config.worker_startup_check_interval_secs,
+        );
+
+        // Setup load monitoring for PowerOfTwo policy
+        let (tx, rx) = tokio::sync::watch::channel(HashMap::new());
+        let worker_loads = Arc::new(rx);
+
+        let load_monitor_handle = if policy.name() == "power_of_two" {
+            let monitor_urls = worker_urls.clone();
+            let monitor_interval = ctx.router_config.worker_startup_check_interval_secs;
+            let policy_clone = Arc::clone(&policy);
+            let client_clone = ctx.client.clone();
+
+            Some(Arc::new(tokio::spawn(async move {
+                Self::monitor_worker_loads(
+                    monitor_urls,
+                    tx,
+                    monitor_interval,
+                    policy_clone,
+                    client_clone,
+                )
+                .await;
+            })))
+        } else {
+            None
+        };
+
+        Ok(Router {
+            workers,
+            policy,
+            client: ctx.client.clone(),
+            worker_startup_timeout_secs: ctx.router_config.worker_startup_timeout_secs,
+            worker_startup_check_interval_secs: ctx
+                .router_config
+                .worker_startup_check_interval_secs,
+            dp_aware: ctx.router_config.dp_aware,
+            api_key: ctx.router_config.api_key.clone(),
+            retry_config: ctx.router_config.effective_retry_config(),
+            circuit_breaker_config: core_cb_config,
+            _worker_loads: worker_loads,
+            _load_monitor_handle: load_monitor_handle,
+            _health_checker: Some(health_checker),
+        })
+    }
+
+    /// Get the current list of worker URLs
+    pub fn get_worker_urls(&self) -> Vec<String> {
+        self.workers
+            .read()
+            .unwrap()
+            .iter()
+            .map(|w| w.url().to_string())
+            .collect()
+    }
+
+    pub async fn wait_for_healthy_workers(
+        worker_urls: &[String],
+        worker_startup_timeout_secs: u64,
+        worker_startup_check_interval_secs: u64,
+    ) -> Result<(), String> {
+        if worker_urls.is_empty() {
+            return Err(
+                "Timeout waiting for workers to become healthy: no workers provided".to_string(),
+            );
+        }
+
+        // Perform health check asynchronously
+        Self::wait_for_healthy_workers_async(
+            worker_urls,
+            worker_startup_timeout_secs,
+            worker_startup_check_interval_secs,
+        )
+        .await
+    }
+
+    async fn wait_for_healthy_workers_async(
+        worker_urls: &[String],
+        worker_startup_timeout_secs: u64,
+        worker_startup_check_interval_secs: u64,
+    ) -> Result<(), String> {
+        info!(
+            "Waiting for {} workers to become healthy (timeout: {}s)",
+            worker_urls.len(),
+            worker_startup_timeout_secs
+        );
+
+        let start_time = std::time::Instant::now();
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(2))
+            .build()
+            .map_err(|e| format!("Failed to create HTTP client: {}", e))?;
+
+        loop {
+            if start_time.elapsed() > Duration::from_secs(worker_startup_timeout_secs) {
+                error!(
+                    "Timeout {}s waiting for workers {:?} to become healthy. Please set --router-worker-startup-timeout-secs (sglang_router.launch_server) or --worker-startup-timeout-secs (sglang_worker.router) to a larger value",
+                    worker_startup_timeout_secs, worker_urls
+                );
+                return Err(format!(
+                    "Timeout {}s waiting for workers {:?} to become healthy. Please set --router-worker-startup-timeout-secs (sglang_router.launch_server) or --worker-startup-timeout-secs (sglang_worker.router) to a larger value",
+                    worker_startup_timeout_secs, worker_urls
+                ));
+            }
+
+            // Perform all health checks concurrently
+            let mut health_checks = Vec::new();
+            for url in worker_urls {
+                let client_clone = client.clone();
+                let url_clone = url.clone();
+
+                let check_health = tokio::spawn(async move {
+                    let health_url = format!("{}/health", url_clone);
+                    match client_clone.get(&health_url).send().await {
+                        Ok(res) => {
+                            if res.status().is_success() {
+                                None
+                            } else {
+                                Some((url_clone, format!("status: {}", res.status())))
+                            }
+                        }
+                        Err(_) => Some((url_clone, "not ready".to_string())),
+                    }
+                });
+
+                health_checks.push(check_health);
+            }
+
+            // Wait for all health checks to complete
+            let results = futures::future::join_all(health_checks).await;
+
+            let mut all_healthy = true;
+            let mut unhealthy_workers = Vec::new();
+
+            for result in results {
+                match result {
+                    Ok(None) => {
+                        // Worker is healthy
+                    }
+                    Ok(Some((url, reason))) => {
+                        all_healthy = false;
+                        unhealthy_workers.push((url, reason));
+                    }
+                    Err(e) => {
+                        all_healthy = false;
+                        unhealthy_workers
+                            .push(("unknown".to_string(), format!("task error: {}", e)));
+                    }
+                }
+            }
+
+            if all_healthy {
+                info!("All {} workers are healthy", worker_urls.len());
+                return Ok(());
+            } else {
+                debug!(
+                    "Waiting for {} workers to become healthy ({} unhealthy: {:?})",
+                    worker_urls.len(),
+                    unhealthy_workers.len(),
+                    unhealthy_workers
+                );
+                tokio::time::sleep(Duration::from_secs(worker_startup_check_interval_secs)).await;
+            }
+        }
+    }
+
+    fn get_worker_dp_size(worker_url: &str, api_key: &Option<String>) -> Result<usize, String> {
+        let sync_client = reqwest::blocking::Client::new();
+        let mut req_builder = sync_client.get(format!("{}/get_server_info", worker_url));
+        if let Some(key) = api_key {
+            req_builder = req_builder.bearer_auth(key);
+        }
+
+        match req_builder.send() {
+            Ok(res) => {
+                if res.status().is_success() {
+                    let server_info = res
+                        .text()
+                        .map_err(|e| format!("failed to read text from response: {}", e))?;
+
+                    let server_info: serde_json::Value = serde_json::from_str(&server_info)
+                        .map_err(|e| format!("failed to decode JSON: {}", e))?;
+
+                    let dp_size = server_info
+                        .get("dp_size")
+                        .and_then(|v| v.as_u64())
+                        .ok_or_else(|| String::from("dp_size not found or not an u64"))?;
+
+                    Ok(if dp_size > usize::MAX as u64 {
+                        return Err(format!("dp_size is too large: {}", dp_size));
+                    } else {
+                        dp_size as usize
+                    })
+                } else {
+                    Err(format!("unexpected status code: {}", res.status()))
+                }
+            }
+            Err(e) => Err(format!("error response: {}", e)),
+        }
+    }
+
+    // Given a list of workers, return a list of workers with dp_rank as suffix
+    fn get_dp_aware_workers(
+        worker_urls: &[String],
+        api_key: &Option<String>,
+    ) -> Result<Vec<String>, String> {
+        let mut dp_aware_workers: Vec<String> = Vec::new();
+
+        for url in worker_urls {
+            match Self::get_worker_dp_size(url, api_key) {
+                Ok(dp_size) => {
+                    for i in 0..dp_size {
+                        dp_aware_workers.push(format!("{}@{}", url, i));
+                    }
+                }
+                Err(e) => return Err(format!("Failed to get DP size for {}: {}", url, e)),
+            }
+        }
+
+        Ok(dp_aware_workers)
+    }
+
+    fn select_first_worker(&self) -> Result<String, String> {
+        let workers_guard = self.workers.read().unwrap();
+        if workers_guard.is_empty() {
+            Err("No workers are available".to_string())
+        } else {
+            Ok(workers_guard[0].url().to_string())
+        }
+    }
+
+    pub async fn send_health_check(&self, worker_url: &str) -> Response {
+        let health_url = if self.dp_aware {
+            // Need to extract the URL from "http://host:port@dp_rank"
+            match Self::extract_dp_rank(worker_url) {
+                Ok((worker_url_prefix, _dp_rank)) => worker_url_prefix,
+                Err(e) => {
+                    error!("Failed to extract dp_rank for health check: {}", e);
+                    return (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Failed to extract dp_rank: {}", e),
+                    )
+                        .into_response();
+                }
+            }
+        } else {
+            worker_url
+        };
+
+        let request_builder = self.client.get(format!("{}/health", health_url));
+
+        let response = match request_builder.send().await {
+            Ok(res) => {
+                let status = StatusCode::from_u16(res.status().as_u16())
+                    .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+
+                match res.bytes().await {
+                    Ok(body) => (status, body).into_response(),
+                    Err(e) => {
+                        error!(
+                            worker_url = %health_url,
+                            error = %e,
+                            "Failed to read health response body"
+                        );
+                        (
+                            StatusCode::INTERNAL_SERVER_ERROR,
+                            format!("Failed to read response body: {}", e),
+                        )
+                            .into_response()
+                    }
+                }
+            }
+            Err(e) => {
+                error!(
+                    worker_url = %health_url,
+                    error = %e,
+                    "Failed to send health request to worker"
+                );
+                (
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    format!("Failed to send request to worker {}: {}", health_url, e),
+                )
+                    .into_response()
+            }
+        };
+
+        // Don't record metrics for health checks
+        response
+    }
+
+    // Helper method to proxy GET requests to the first available worker
+    async fn proxy_get_request(&self, req: Request<Body>, endpoint: &str) -> Response {
+        let headers = header_utils::copy_request_headers(&req);
+
+        match self.select_first_worker() {
+            Ok(worker_url) => {
+                let mut request_builder = self.client.get(format!("{}/{}", worker_url, endpoint));
+                for (name, value) in headers {
+                    let name_lc = name.to_lowercase();
+                    if name_lc != "content-type" && name_lc != "content-length" {
+                        request_builder = request_builder.header(name, value);
+                    }
+                }
+
+                match request_builder.send().await {
+                    Ok(res) => {
+                        let status = StatusCode::from_u16(res.status().as_u16())
+                            .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+
+                        // Preserve headers from backend
+                        let response_headers =
+                            header_utils::preserve_response_headers(res.headers());
+
+                        match res.bytes().await {
+                            Ok(body) => {
+                                let mut response = Response::new(axum::body::Body::from(body));
+                                *response.status_mut() = status;
+                                *response.headers_mut() = response_headers;
+                                response
+                            }
+                            Err(e) => (
+                                StatusCode::INTERNAL_SERVER_ERROR,
+                                format!("Failed to read response: {}", e),
+                            )
+                                .into_response(),
+                        }
+                    }
+                    Err(e) => (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Request failed: {}", e),
+                    )
+                        .into_response(),
+                }
+            }
+            Err(e) => (StatusCode::SERVICE_UNAVAILABLE, e).into_response(),
+        }
+    }
+
+    // New method to route typed requests directly
+    /// Select worker considering circuit breaker state
+    fn select_worker_with_circuit_breaker(&self, text: Option<&str>) -> Option<Box<dyn Worker>> {
+        let workers = self.workers.read().ok()?;
+        let available: Vec<Box<dyn Worker>> = workers
+            .iter()
+            .filter(|w| w.is_available())
+            .map(|w| w.clone_worker())
+            .collect();
+        if available.is_empty() {
+            return None;
+        }
+        let idx = self.policy.select_worker(&available, text)?;
+        Some(available[idx].clone_worker())
+    }
+
+    pub async fn route_typed_request<T: GenerationRequest + serde::Serialize + Clone>(
+        &self,
+        headers: Option<&HeaderMap>,
+        typed_req: &T,
+        route: &str,
+    ) -> Response {
+        let start = Instant::now();
+        let is_stream = typed_req.is_stream();
+        let text = typed_req.extract_text_for_routing();
+
+        let response = RetryExecutor::execute_response_with_retry(
+            &self.retry_config,
+            // operation per attempt
+            |_: u32| async {
+                let worker = match self.select_worker_with_circuit_breaker(Some(&text)) {
+                    Some(w) => w,
+                    None => {
+                        RouterMetrics::record_request_error(route, "no_available_workers");
+                        return (
+                            StatusCode::SERVICE_UNAVAILABLE,
+                            "No available workers (all circuits open or unhealthy)",
+                        )
+                            .into_response();
+                    }
+                };
+
+                // Optional load tracking for cache-aware policy
+                let load_incremented = if self.policy.name() == "cache_aware" {
+                    worker.increment_load();
+                    RouterMetrics::set_running_requests(worker.url(), worker.load());
+                    true
+                } else {
+                    false
+                };
+
+                // Keep a clone for potential cleanup on retry
+                let worker_for_cleanup = if load_incremented {
+                    Some(worker.clone_worker())
+                } else {
+                    None
+                };
+
+                let response = self
+                    .send_typed_request(
+                        headers,
+                        typed_req,
+                        route,
+                        worker.url(),
+                        is_stream,
+                        load_incremented,
+                    )
+                    .await;
+
+                worker.record_outcome(response.status().is_success());
+
+                // For retryable failures, we need to decrement load since send_typed_request
+                // won't have done it (it only decrements on success or non-retryable failures)
+                if is_retryable_status(response.status()) && load_incremented {
+                    if let Some(cleanup_worker) = worker_for_cleanup {
+                        cleanup_worker.decrement_load();
+                        RouterMetrics::set_running_requests(
+                            cleanup_worker.url(),
+                            cleanup_worker.load(),
+                        );
+                    }
+                }
+
+                response
+            },
+            // should_retry predicate
+            |res, _attempt| is_retryable_status(res.status()),
+            // on_backoff hook
+            |delay, attempt| {
+                RouterMetrics::record_retry(route);
+                RouterMetrics::record_retry_backoff_duration(delay, attempt);
+            },
+            // on_exhausted hook
+            || RouterMetrics::record_retries_exhausted(route),
+        )
+        .await;
+
+        if response.status().is_success() {
+            let duration = start.elapsed();
+            RouterMetrics::record_request(route);
+            RouterMetrics::record_generate_duration(duration);
+        } else if !is_retryable_status(response.status()) {
+            RouterMetrics::record_request_error(route, "non_retryable_error");
+        }
+
+        response
+    }
+
+    // TODO (rui): Better accommodate to the Worker abstraction
+    fn extract_dp_rank(worker_url: &str) -> Result<(&str, usize), String> {
+        let parts: Vec<&str> = worker_url.split('@').collect();
+        if parts.len() != 2 {
+            return Err(format!("invalid worker_url format: {}", worker_url));
+        }
+
+        // Parse the second part (dp_rank) into an integer
+        match parts[1].parse::<usize>() {
+            Ok(dp_rank) => Ok((parts[0], dp_rank)),
+            Err(_) => Err(format!(
+                "failed to parse dp_rank from worker_url: {}",
+                worker_url
+            )),
+        }
+    }
+
+    // Send typed request directly without conversion
+    async fn send_typed_request<T: serde::Serialize>(
+        &self,
+        headers: Option<&HeaderMap>,
+        typed_req: &T,
+        route: &str,
+        worker_url: &str,
+        is_stream: bool,
+        load_incremented: bool, // Whether load was incremented for this request
+    ) -> Response {
+        let mut request_builder = if self.dp_aware {
+            let (worker_url_prefix, dp_rank) = match Self::extract_dp_rank(worker_url) {
+                Ok(tup) => tup,
+                Err(e) => {
+                    error!("Failed to extract dp_rank: {}", e);
+                    return (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Failed to extract dp_rank: {}", e),
+                    )
+                        .into_response();
+                }
+            };
+
+            // Parse the request body
+            let mut json_val = match serde_json::to_value(typed_req) {
+                Ok(j) => j,
+                Err(e) => {
+                    return (
+                        StatusCode::BAD_REQUEST,
+                        format!("Convert into serde_json::Value failed: {}", e),
+                    )
+                        .into_response();
+                }
+            };
+
+            // Insert the data_parallel_rank field
+            if let Some(map) = json_val.as_object_mut() {
+                map.insert(
+                    String::from("data_parallel_rank"),
+                    serde_json::json!(dp_rank),
+                );
+                debug!(
+                    "Modified request body: {}",
+                    serde_json::to_string(&json_val).unwrap_or(String::from("ERR"))
+                );
+            } else {
+                return (
+                    StatusCode::BAD_REQUEST,
+                    "Failed to insert the data_parallel_rank field into the request body",
+                )
+                    .into_response();
+            }
+
+            self.client
+                .post(format!("{}{}", worker_url_prefix, route))
+                .json(&json_val)
+        } else {
+            self.client
+                .post(format!("{}{}", worker_url, route))
+                .json(typed_req) // Use json() directly with typed request
+        };
+
+        // Copy all headers from original request if provided
+        if let Some(headers) = headers {
+            for (name, value) in headers {
+                // Skip Content-Type and Content-Length as .json() sets them
+                if *name != CONTENT_TYPE && *name != CONTENT_LENGTH {
+                    request_builder = request_builder.header(name, value);
+                }
+            }
+        }
+
+        let res = match request_builder.send().await {
+            Ok(res) => res,
+            Err(e) => {
+                error!(
+                    "Failed to send typed request worker_url={} route={} error={}",
+                    worker_url, route, e
+                );
+
+                // Decrement load on error if it was incremented
+                if load_incremented {
+                    if let Ok(workers_guard) = self.workers.read() {
+                        if let Some(worker) = workers_guard.iter().find(|w| w.url() == worker_url) {
+                            worker.decrement_load();
+                            RouterMetrics::set_running_requests(worker_url, worker.load());
+                        }
+                    }
+                }
+
+                return (
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    format!("Request failed: {}", e),
+                )
+                    .into_response();
+            }
+        };
+
+        let status = StatusCode::from_u16(res.status().as_u16())
+            .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
+
+        if !is_stream {
+            // For non-streaming requests, preserve headers
+            let response_headers = header_utils::preserve_response_headers(res.headers());
+
+            let response = match res.bytes().await {
+                Ok(body) => {
+                    let mut response = Response::new(axum::body::Body::from(body));
+                    *response.status_mut() = status;
+                    *response.headers_mut() = response_headers;
+                    response
+                }
+                Err(e) => {
+                    // IMPORTANT: Decrement load on error before returning
+                    if load_incremented {
+                        if let Ok(workers_guard) = self.workers.read() {
+                            if let Some(worker) =
+                                workers_guard.iter().find(|w| w.url() == worker_url)
+                            {
+                                worker.decrement_load();
+                                RouterMetrics::set_running_requests(worker_url, worker.load());
+                            }
+                        }
+                    }
+
+                    let error_msg = format!("Failed to get response body: {}", e);
+                    (StatusCode::INTERNAL_SERVER_ERROR, error_msg).into_response()
+                }
+            };
+
+            // Decrement load counter for non-streaming requests if it was incremented
+            if load_incremented {
+                if let Ok(workers_guard) = self.workers.read() {
+                    if let Some(worker) = workers_guard.iter().find(|w| w.url() == worker_url) {
+                        worker.decrement_load();
+                        RouterMetrics::set_running_requests(worker_url, worker.load());
+                    }
+                }
+            }
+
+            response
+        } else if load_incremented {
+            // For streaming with load tracking, we need to manually decrement when done
+            let workers = Arc::clone(&self.workers);
+            let worker_url = worker_url.to_string();
+
+            // Preserve headers for streaming response
+            let mut response_headers = header_utils::preserve_response_headers(res.headers());
+            // Ensure we set the correct content-type for SSE
+            response_headers.insert(CONTENT_TYPE, HeaderValue::from_static("text/event-stream"));
+
+            let stream = res.bytes_stream();
+            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+
+            // Spawn task to forward stream and detect completion
+            tokio::spawn(async move {
+                let mut stream = stream;
+                let mut decremented = false;
+                while let Some(chunk) = stream.next().await {
+                    match chunk {
+                        Ok(bytes) => {
+                            // Check for stream end marker
+                            if bytes
+                                .as_ref()
+                                .windows(12)
+                                .any(|window| window == b"data: [DONE]")
+                            {
+                                if let Ok(workers_guard) = workers.read() {
+                                    if let Some(worker) =
+                                        workers_guard.iter().find(|w| w.url() == worker_url)
+                                    {
+                                        worker.decrement_load();
+                                        RouterMetrics::set_running_requests(
+                                            &worker_url,
+                                            worker.load(),
+                                        );
+                                        decremented = true;
+                                    }
+                                }
+                            }
+                            if tx.send(Ok(bytes)).is_err() {
+                                break;
+                            }
+                        }
+                        Err(e) => {
+                            let _ = tx.send(Err(format!("Stream error: {}", e)));
+                            break;
+                        }
+                    }
+                }
+                if !decremented {
+                    if let Ok(workers_guard) = workers.read() {
+                        if let Some(worker) = workers_guard.iter().find(|w| w.url() == worker_url) {
+                            worker.decrement_load();
+                            RouterMetrics::set_running_requests(&worker_url, worker.load());
+                        }
+                    }
+                }
+            });
+
+            let stream = UnboundedReceiverStream::new(rx);
+            let body = Body::from_stream(stream);
+
+            let mut response = Response::new(body);
+            *response.status_mut() = status;
+            *response.headers_mut() = response_headers;
+            response
+        } else {
+            // For requests without load tracking, just stream
+            // Preserve headers for streaming response
+            let mut response_headers = header_utils::preserve_response_headers(res.headers());
+            // Ensure we set the correct content-type for SSE
+            response_headers.insert(CONTENT_TYPE, HeaderValue::from_static("text/event-stream"));
+
+            let stream = res.bytes_stream();
+            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+
+            // Spawn task to forward stream
+            tokio::spawn(async move {
+                let mut stream = stream;
+                while let Some(chunk) = stream.next().await {
+                    match chunk {
+                        Ok(bytes) => {
+                            if tx.send(Ok(bytes)).is_err() {
+                                break;
+                            }
+                        }
+                        Err(e) => {
+                            let _ = tx.send(Err(format!("Stream error: {}", e)));
+                            break;
+                        }
+                    }
+                }
+            });
+
+            let stream = UnboundedReceiverStream::new(rx);
+            let body = Body::from_stream(stream);
+
+            let mut response = Response::new(body);
+            *response.status_mut() = status;
+            *response.headers_mut() = response_headers;
+            response
+        }
+    }
+
+    pub async fn add_worker(&self, worker_url: &str) -> Result<String, String> {
+        let start_time = std::time::Instant::now();
+        let client = reqwest::Client::builder()
+            .timeout(Duration::from_secs(self.worker_startup_timeout_secs))
+            .build()
+            .map_err(|e| format!("Failed to create HTTP client: {}", e))?;
+
+        loop {
+            if start_time.elapsed() > Duration::from_secs(self.worker_startup_timeout_secs) {
+                error!(
+                    "Timeout {}s waiting for worker {} to become healthy. Please set --router-worker-startup-timeout-secs (sglang_router.launch_server) or --worker-startup-timeout-secs (sglang_worker.router) to a larger value",
+                    self.worker_startup_timeout_secs, worker_url
+                );
+                return Err(format!(
+                    "Timeout {}s waiting for worker {} to become healthy. Please set --router-worker-startup-timeout-secs (sglang_router.launch_server) or --worker-startup-timeout-secs (sglang_worker.router) to a larger value",
+                    self.worker_startup_timeout_secs, worker_url
+                ));
+            }
+
+            match client.get(format!("{}/health", worker_url)).send().await {
+                Ok(res) => {
+                    if res.status().is_success() {
+                        let mut workers_guard = self.workers.write().unwrap();
+                        if self.dp_aware {
+                            // Need to contact the worker to extract the dp_size,
+                            // and add them as multiple workers
+                            let url_vec = vec![String::from(worker_url)];
+                            let dp_url_vec = Self::get_dp_aware_workers(&url_vec, &self.api_key)
+                                .map_err(|e| format!("Failed to get dp-aware workers: {}", e))?;
+                            let mut worker_added: bool = false;
+                            for dp_url in &dp_url_vec {
+                                if workers_guard.iter().any(|w| w.url() == dp_url) {
+                                    warn!("Worker {} already exists", dp_url);
+                                    continue;
+                                }
+                                info!("Added worker: {}", dp_url);
+                                let new_worker = WorkerFactory::create_regular_with_config(
+                                    dp_url.to_string(),
+                                    self.circuit_breaker_config.clone(),
+                                );
+                                workers_guard.push(new_worker);
+                                worker_added = true;
+                            }
+                            if !worker_added {
+                                return Err(format!("No worker added for {}", worker_url));
+                            }
+                        } else {
+                            if workers_guard.iter().any(|w| w.url() == worker_url) {
+                                return Err(format!("Worker {} already exists", worker_url));
+                            }
+                            info!("Added worker: {}", worker_url);
+                            let new_worker = WorkerFactory::create_regular_with_config(
+                                worker_url.to_string(),
+                                self.circuit_breaker_config.clone(),
+                            );
+                            workers_guard.push(new_worker);
+                        }
+
+                        RouterMetrics::set_active_workers(workers_guard.len());
+
+                        // If cache aware policy, initialize the worker in the tree
+                        if let Some(cache_aware) =
+                            self.policy
+                                .as_any()
+                                .downcast_ref::<crate::policies::CacheAwarePolicy>()
+                        {
+                            // Get updated workers after adding
+                            drop(workers_guard);
+                            let workers_guard = self.workers.read().unwrap();
+                            cache_aware.init_workers(&workers_guard);
+                        }
+
+                        return Ok(format!("Successfully added worker: {}", worker_url));
+                    } else {
+                        debug!(
+                            "Worker {} health check pending - status: {}",
+                            worker_url,
+                            res.status()
+                        );
+                        // if the url does not have http or https prefix, warn users
+                        if !worker_url.starts_with("http://") && !worker_url.starts_with("https://")
+                        {
+                            warn!("The worker url {} does not have http or https prefix. Please add the prefix to the url.", worker_url);
+                        }
+
+                        tokio::time::sleep(Duration::from_secs(
+                            self.worker_startup_check_interval_secs,
+                        ))
+                        .await;
+                        continue;
+                    }
+                }
+                Err(e) => {
+                    debug!("Worker {} health check pending - error: {}", worker_url, e);
+
+                    // if the url does not have http or https prefix, warn users
+                    if !worker_url.starts_with("http://") && !worker_url.starts_with("https://") {
+                        warn!("The worker url {} does not have http or https prefix. Please add the prefix to the url.", worker_url);
+                    }
+
+                    tokio::time::sleep(Duration::from_secs(
+                        self.worker_startup_check_interval_secs,
+                    ))
+                    .await;
+                    continue;
+                }
+            }
+        }
+    }
+
+    pub fn remove_worker(&self, worker_url: &str) {
+        if self.dp_aware {
+            // remove dp-aware workers in a prefix-matching fashion
+            // without contacting the remote worker
+            let mut candidate_workers: Vec<String> = Vec::new();
+            let mut removed_workers: Vec<String> = Vec::new();
+            let worker_url_prefix = format!("{}@", worker_url);
+
+            {
+                // find the candidate workers to be removed
+                let workers_guard = self.workers.read().unwrap();
+                for w in workers_guard.iter() {
+                    if w.url().starts_with(&worker_url_prefix) {
+                        candidate_workers.push(w.url().to_string());
+                    }
+                }
+            }
+
+            {
+                // do the removing on the worker_urls
+                let mut workers_guard = self.workers.write().unwrap();
+                for dp_url in candidate_workers.iter() {
+                    if let Some(index) = workers_guard.iter().position(|w| w.url() == dp_url) {
+                        workers_guard.remove(index);
+                        info!("Removed worker: {}", dp_url);
+                        removed_workers.push(dp_url.to_string());
+                    } else {
+                        warn!("Worker {} not found, skipping removal", dp_url);
+                        continue;
+                    }
+                }
+                RouterMetrics::set_active_workers(workers_guard.len());
+            }
+
+            // If cache aware policy, remove the workers from the tree
+            if let Some(cache_aware) = self
+                .policy
+                .as_any()
+                .downcast_ref::<crate::policies::CacheAwarePolicy>()
+            {
+                for dp_url in removed_workers.iter() {
+                    cache_aware.remove_worker(dp_url);
+                    info!("Removed worker from tree: {}", dp_url);
+                }
+            }
+        } else {
+            let mut workers_guard = self.workers.write().unwrap();
+            if let Some(index) = workers_guard.iter().position(|w| w.url() == worker_url) {
+                workers_guard.remove(index);
+                info!("Removed worker: {}", worker_url);
+                RouterMetrics::set_active_workers(workers_guard.len());
+            } else {
+                warn!("Worker {} not found, skipping removal", worker_url);
+                return;
+            }
+
+            // If cache aware policy, remove the workers from the tree
+            if let Some(cache_aware) = self
+                .policy
+                .as_any()
+                .downcast_ref::<crate::policies::CacheAwarePolicy>()
+            {
+                cache_aware.remove_worker(worker_url);
+                info!("Removed worker from tree: {}", worker_url);
+            }
+        }
+    }
+
+    async fn get_worker_load(&self, worker_url: &str) -> Option<isize> {
+        let worker_url = if self.dp_aware {
+            // Need to extract the URL from "http://host:port@dp_rank"
+            let (worker_url_prefix, _dp_rank) = match Self::extract_dp_rank(worker_url) {
+                Ok(tup) => tup,
+                Err(e) => {
+                    error!("Failed to extract dp_rank: {}", e);
+                    return None;
+                }
+            };
+            worker_url_prefix
+        } else {
+            worker_url
+        };
+
+        match self
+            .client
+            .get(format!("{}/get_load", worker_url))
+            .send()
+            .await
+        {
+            Ok(res) if res.status().is_success() => match res.bytes().await {
+                Ok(bytes) => match serde_json::from_slice::<serde_json::Value>(&bytes) {
+                    Ok(data) => data
+                        .get("load")
+                        .and_then(|v| v.as_i64())
+                        .map(|v| v as isize),
+                    Err(e) => {
+                        debug!("Failed to parse load response from {}: {}", worker_url, e);
+                        None
+                    }
+                },
+                Err(e) => {
+                    debug!("Failed to read load response from {}: {}", worker_url, e);
+                    None
+                }
+            },
+            Ok(res) => {
+                debug!(
+                    "Worker {} returned non-success status: {}",
+                    worker_url,
+                    res.status()
+                );
+                None
+            }
+            Err(e) => {
+                debug!("Failed to get load from {}: {}", worker_url, e);
+                None
+            }
+        }
+    }
+
+    // Background task to monitor worker loads
+    async fn monitor_worker_loads(
+        worker_urls: Vec<String>,
+        tx: tokio::sync::watch::Sender<HashMap<String, isize>>,
+        interval_secs: u64,
+        policy: Arc<dyn LoadBalancingPolicy>,
+        client: Client,
+    ) {
+        let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
+
+        loop {
+            interval.tick().await;
+
+            let mut loads = HashMap::new();
+            for url in &worker_urls {
+                if let Some(load) = Self::get_worker_load_static(&client, url).await {
+                    loads.insert(url.clone(), load);
+                }
+            }
+
+            if !loads.is_empty() {
+                // Update policy with new loads
+                policy.update_loads(&loads);
+
+                // Send to watchers
+                if let Err(e) = tx.send(loads) {
+                    error!("Failed to send load update: {}", e);
+                }
+            }
+        }
+    }
+
+    // Static version of get_worker_load for use in monitoring task
+    async fn get_worker_load_static(client: &reqwest::Client, worker_url: &str) -> Option<isize> {
+        let worker_url = if worker_url.contains("@") {
+            // Need to extract the URL from "http://host:port@dp_rank"
+            let (worker_url_prefix, _dp_rank) = match Self::extract_dp_rank(worker_url) {
+                Ok(tup) => tup,
+                Err(e) => {
+                    debug!("Failed to extract dp_rank: {}", e);
+                    return None;
+                }
+            };
+            worker_url_prefix
+        } else {
+            worker_url
+        };
+
+        match client.get(format!("{}/get_load", worker_url)).send().await {
+            Ok(res) if res.status().is_success() => match res.bytes().await {
+                Ok(bytes) => match serde_json::from_slice::<serde_json::Value>(&bytes) {
+                    Ok(data) => data
+                        .get("load")
+                        .and_then(|v| v.as_i64())
+                        .map(|v| v as isize),
+                    Err(e) => {
+                        debug!("Failed to parse load response from {}: {}", worker_url, e);
+                        None
+                    }
+                },
+                Err(e) => {
+                    debug!("Failed to read load response from {}: {}", worker_url, e);
+                    None
+                }
+            },
+            Ok(res) => {
+                debug!(
+                    "Worker {} returned non-success status: {}",
+                    worker_url,
+                    res.status()
+                );
+                None
+            }
+            Err(e) => {
+                debug!("Failed to get load from {}: {}", worker_url, e);
+                None
+            }
+        }
+    }
+}
+
+use async_trait::async_trait;
+
+#[async_trait]
+impl WorkerManagement for Router {
+    async fn add_worker(&self, worker_url: &str) -> Result<String, String> {
+        Router::add_worker(self, worker_url).await
+    }
+
+    fn remove_worker(&self, worker_url: &str) {
+        Router::remove_worker(self, worker_url)
+    }
+
+    fn get_worker_urls(&self) -> Vec<String> {
+        Router::get_worker_urls(self)
+    }
+}
+
+#[async_trait]
+impl RouterTrait for Router {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    async fn health(&self, _req: Request<Body>) -> Response {
+        let workers = self.workers.read().unwrap();
+        let unhealthy_servers: Vec<_> = workers
+            .iter()
+            .filter(|w| !w.is_healthy())
+            .map(|w| w.url().to_string())
+            .collect();
+
+        if unhealthy_servers.is_empty() {
+            (StatusCode::OK, "All servers healthy").into_response()
+        } else {
+            (
+                StatusCode::SERVICE_UNAVAILABLE,
+                format!("Unhealthy servers: {:?}", unhealthy_servers),
+            )
+                .into_response()
+        }
+    }
+
+    async fn health_generate(&self, req: Request<Body>) -> Response {
+        self.proxy_get_request(req, "health_generate").await
+    }
+
+    async fn get_server_info(&self, req: Request<Body>) -> Response {
+        self.proxy_get_request(req, "get_server_info").await
+    }
+
+    async fn get_models(&self, req: Request<Body>) -> Response {
+        self.proxy_get_request(req, "v1/models").await
+    }
+
+    async fn get_model_info(&self, req: Request<Body>) -> Response {
+        self.proxy_get_request(req, "get_model_info").await
+    }
+
+    async fn route_generate(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &GenerateRequest,
+    ) -> Response {
+        self.route_typed_request(headers, body, "/generate").await
+    }
+
+    async fn route_chat(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+    ) -> Response {
+        self.route_typed_request(headers, body, "/v1/chat/completions")
+            .await
+    }
+
+    async fn route_completion(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &CompletionRequest,
+    ) -> Response {
+        self.route_typed_request(headers, body, "/v1/completions")
+            .await
+    }
+
+    async fn route_embeddings(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        todo!()
+    }
+
+    async fn route_rerank(&self, _headers: Option<&HeaderMap>, _body: Body) -> Response {
+        todo!()
+    }
+
+    async fn flush_cache(&self) -> Response {
+        // Get all worker URLs
+        let worker_urls = self.get_worker_urls();
+
+        // Send requests to all workers concurrently without headers
+        let mut tasks = Vec::new();
+        for worker_url in &worker_urls {
+            let worker_url = if self.dp_aware {
+                // Need to extract the URL from "http://host:port@dp_rank"
+                let (worker_url_prefix, _dp_rank) = match Self::extract_dp_rank(worker_url) {
+                    Ok(tup) => tup,
+                    Err(e) => {
+                        error!("Failed to extract dp_rank: {}", e);
+                        return (
+                            StatusCode::INTERNAL_SERVER_ERROR,
+                            format!("Failed to extract dp_rank: {}", e),
+                        )
+                            .into_response();
+                    }
+                };
+                worker_url_prefix
+            } else {
+                worker_url
+            };
+            let request_builder = self.client.post(format!("{}/flush_cache", worker_url));
+            tasks.push(request_builder.send());
+        }
+
+        // Wait for all responses
+        let results = futures_util::future::join_all(tasks).await;
+
+        // Check if all succeeded
+        let all_success = results.iter().all(|r| {
+            r.as_ref()
+                .map(|res| res.status().is_success())
+                .unwrap_or(false)
+        });
+
+        if all_success {
+            (StatusCode::OK, "Cache flushed on all servers").into_response()
+        } else {
+            (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                "Cache flush failed on one or more servers",
+            )
+                .into_response()
+        }
+    }
+
+    async fn get_worker_loads(&self) -> Response {
+        let urls = self.get_worker_urls();
+        let mut loads = Vec::new();
+
+        // Get loads from all workers
+        for url in &urls {
+            let load = self.get_worker_load(url).await.unwrap_or(-1);
+            loads.push(serde_json::json!({
+                "worker": url,
+                "load": load
+            }));
+        }
+
+        Json(serde_json::json!({
+            "workers": loads
+        }))
+        .into_response()
+    }
+
+    fn router_type(&self) -> &'static str {
+        "regular"
+    }
+
+    fn readiness(&self) -> Response {
+        // Regular router is ready if it has at least one healthy worker
+        let healthy_count = self
+            .workers
+            .read()
+            .unwrap()
+            .iter()
+            .filter(|w| w.is_healthy())
+            .count();
+
+        if healthy_count > 0 {
+            Json(serde_json::json!({
+                "status": "ready",
+                "healthy_workers": healthy_count,
+                "total_workers": self.workers.read().unwrap().len()
+            }))
+            .into_response()
+        } else {
+            (
+                StatusCode::SERVICE_UNAVAILABLE,
+                Json(serde_json::json!({
+                    "status": "not_ready",
+                    "reason": "no healthy workers available",
+                    "total_workers": self.workers.read().unwrap().len()
+                })),
+            )
+                .into_response()
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::policies::RandomPolicy;
+    use std::collections::HashMap;
+
+    fn create_test_regular_router() -> Router {
+        let workers = vec![
+            WorkerFactory::create_regular("http://worker1:8080".to_string()),
+            WorkerFactory::create_regular("http://worker2:8080".to_string()),
+        ];
+        let (_, rx) = tokio::sync::watch::channel(HashMap::new());
+        Router {
+            workers: Arc::new(RwLock::new(workers)),
+            policy: Arc::new(RandomPolicy::new()),
+            worker_startup_timeout_secs: 5,
+            worker_startup_check_interval_secs: 1,
+            dp_aware: false,
+            api_key: None,
+            client: Client::new(),
+            retry_config: RetryConfig::default(),
+            circuit_breaker_config: CircuitBreakerConfig::default(),
+            _worker_loads: Arc::new(rx),
+            _load_monitor_handle: None,
+            _health_checker: None,
+        }
+    }
+
+    #[test]
+    fn test_router_get_worker_urls_regular() {
+        let router = create_test_regular_router();
+        let urls = router.get_worker_urls();
+
+        assert_eq!(urls.len(), 2);
+        assert!(urls.contains(&"http://worker1:8080".to_string()));
+        assert!(urls.contains(&"http://worker2:8080".to_string()));
+    }
+
+    #[test]
+    fn test_select_first_worker_regular() {
+        let router = create_test_regular_router();
+        let result = router.select_first_worker();
+
+        assert!(result.is_ok());
+        assert_eq!(result.unwrap(), "http://worker1:8080");
+    }
+
+    #[tokio::test]
+    async fn test_wait_for_healthy_workers_empty_list() {
+        // Empty list will return error immediately
+        let result = Router::wait_for_healthy_workers(&[], 1, 1).await;
+        assert!(result.is_err());
+        assert!(result.unwrap_err().contains("no workers provided"));
+    }
+
+    #[tokio::test]
+    async fn test_wait_for_healthy_workers_invalid_urls() {
+        // This test will timeout quickly since the URLs are invalid
+        let result =
+            Router::wait_for_healthy_workers(&["http://nonexistent:8080".to_string()], 1, 1).await;
+        assert!(result.is_err());
+        assert!(result.unwrap_err().contains("Timeout"));
+    }
+}
diff --git a/sgl-router/src/routers/mod.rs b/sgl-router/src/routers/mod.rs
new file mode 100644
index 000000000..e610fedb3
--- /dev/null
+++ b/sgl-router/src/routers/mod.rs
@@ -0,0 +1,107 @@
+//! Router implementations
+
+use async_trait::async_trait;
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{HeaderMap, StatusCode},
+    response::{IntoResponse, Response},
+};
+use std::fmt::Debug;
+
+use crate::protocols::spec::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
+
+pub mod factory;
+pub mod grpc;
+pub mod header_utils;
+pub mod http;
+
+pub use factory::RouterFactory;
+// Re-export HTTP routers for convenience (keeps routers::openai_router path working)
+pub use http::{openai_router, pd_router, pd_types, router};
+
+/// Worker management trait for administrative operations
+///
+/// This trait is separate from RouterTrait to allow Send futures
+/// for use in service discovery and other background tasks
+#[async_trait]
+pub trait WorkerManagement: Send + Sync {
+    /// Add a worker to the router
+    async fn add_worker(&self, worker_url: &str) -> Result<String, String>;
+
+    /// Remove a worker from the router
+    fn remove_worker(&self, worker_url: &str);
+
+    /// Get all worker URLs
+    fn get_worker_urls(&self) -> Vec<String>;
+}
+
+/// Core trait for all router implementations
+///
+/// This trait provides a unified interface for routing requests,
+/// regardless of whether it's a regular router or PD router.
+#[async_trait]
+pub trait RouterTrait: Send + Sync + Debug + WorkerManagement {
+    /// Get a reference to self as Any for downcasting
+    fn as_any(&self) -> &dyn std::any::Any;
+
+    /// Route a health check request
+    async fn health(&self, req: Request<Body>) -> Response;
+
+    /// Route a health generate request
+    async fn health_generate(&self, req: Request<Body>) -> Response;
+
+    /// Get server information
+    async fn get_server_info(&self, req: Request<Body>) -> Response;
+
+    /// Get available models
+    async fn get_models(&self, req: Request<Body>) -> Response;
+
+    /// Get model information
+    async fn get_model_info(&self, req: Request<Body>) -> Response;
+
+    /// Route a generate request
+    async fn route_generate(&self, headers: Option<&HeaderMap>, body: &GenerateRequest)
+        -> Response;
+
+    /// Route a chat completion request
+    async fn route_chat(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+    ) -> Response;
+
+    /// Route a completion request
+    async fn route_completion(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &CompletionRequest,
+    ) -> Response;
+
+    async fn route_embeddings(&self, headers: Option<&HeaderMap>, body: Body) -> Response;
+
+    async fn route_rerank(&self, headers: Option<&HeaderMap>, body: Body) -> Response;
+
+    /// Flush cache on all workers
+    async fn flush_cache(&self) -> Response;
+
+    /// Get worker loads (for monitoring)
+    async fn get_worker_loads(&self) -> Response;
+
+    /// Get router type name
+    fn router_type(&self) -> &'static str;
+
+    /// Check if this is a PD router
+    fn is_pd_mode(&self) -> bool {
+        self.router_type() == "pd"
+    }
+
+    /// Server liveness check - is the server process running
+    fn liveness(&self) -> Response {
+        // Simple liveness check - if we can respond, we're alive
+        (StatusCode::OK, "OK").into_response()
+    }
+
+    /// Server readiness check - is the server ready to handle requests
+    fn readiness(&self) -> Response;
+}
diff --git a/sgl-router/src/server.rs b/sgl-router/src/server.rs
new file mode 100644
index 000000000..2762f9765
--- /dev/null
+++ b/sgl-router/src/server.rs
@@ -0,0 +1,474 @@
+use crate::config::RouterConfig;
+use crate::logging::{self, LoggingConfig};
+use crate::metrics::{self, PrometheusConfig};
+use crate::middleware::TokenBucket;
+use crate::protocols::spec::{ChatCompletionRequest, CompletionRequest, GenerateRequest};
+use crate::reasoning_parser::ParserFactory;
+use crate::routers::{RouterFactory, RouterTrait};
+use crate::service_discovery::{start_service_discovery, ServiceDiscoveryConfig};
+use crate::tokenizer::{factory as tokenizer_factory, traits::Tokenizer};
+use crate::tool_parser::ParserRegistry;
+use axum::{
+    extract::{Query, Request, State},
+    http::StatusCode,
+    response::{IntoResponse, Response},
+    routing::{get, post},
+    Json, Router,
+};
+use reqwest::Client;
+use std::collections::HashMap;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::net::TcpListener;
+use tokio::signal;
+use tokio::spawn;
+use tracing::{error, info, warn, Level};
+
+#[derive(Clone)]
+pub struct AppContext {
+    pub client: Client,
+    pub router_config: RouterConfig,
+    pub rate_limiter: Arc<TokenBucket>,
+    pub tokenizer: Option<Arc<dyn Tokenizer>>,
+    pub reasoning_parser_factory: Option<ParserFactory>,
+    pub tool_parser_registry: Option<&'static ParserRegistry>,
+}
+
+impl AppContext {
+    pub fn new(
+        router_config: RouterConfig,
+        client: Client,
+        max_concurrent_requests: usize,
+        rate_limit_tokens_per_second: Option<usize>,
+    ) -> Result<Self, String> {
+        let rate_limit_tokens = rate_limit_tokens_per_second.unwrap_or(max_concurrent_requests);
+        let rate_limiter = Arc::new(TokenBucket::new(max_concurrent_requests, rate_limit_tokens));
+
+        // Initialize gRPC-specific components only when in gRPC mode
+        let (tokenizer, reasoning_parser_factory, tool_parser_registry) =
+            if router_config.connection_mode == crate::config::ConnectionMode::Grpc {
+                // Get tokenizer path (required for gRPC mode)
+                let tokenizer_path = router_config
+                    .tokenizer_path
+                    .clone()
+                    .or_else(|| router_config.model_path.clone())
+                    .ok_or_else(|| {
+                        "gRPC mode requires either --tokenizer-path or --model-path to be specified"
+                            .to_string()
+                    })?;
+
+                // Initialize all gRPC components
+                let tokenizer = Some(
+                    tokenizer_factory::create_tokenizer(&tokenizer_path)
+                        .map_err(|e| format!("Failed to create tokenizer: {}", e))?,
+                );
+                let reasoning_parser_factory = Some(ParserFactory::new());
+                let tool_parser_registry = Some(ParserRegistry::new());
+
+                (tokenizer, reasoning_parser_factory, tool_parser_registry)
+            } else {
+                // HTTP mode doesn't need these components
+                (None, None, None)
+            };
+
+        Ok(Self {
+            client,
+            router_config,
+            rate_limiter,
+            tokenizer,
+            reasoning_parser_factory,
+            tool_parser_registry,
+        })
+    }
+}
+
+#[derive(Clone)]
+pub struct AppState {
+    pub router: Arc<dyn RouterTrait>,
+    pub context: Arc<AppContext>,
+    pub concurrency_queue_tx: Option<tokio::sync::mpsc::Sender<crate::middleware::QueuedRequest>>,
+}
+
+// Fallback handler for unmatched routes
+async fn sink_handler() -> Response {
+    StatusCode::NOT_FOUND.into_response()
+}
+
+// Health check endpoints
+async fn liveness(State(state): State<Arc<AppState>>) -> Response {
+    state.router.liveness()
+}
+
+async fn readiness(State(state): State<Arc<AppState>>) -> Response {
+    state.router.readiness()
+}
+
+async fn health(State(state): State<Arc<AppState>>, req: Request) -> Response {
+    state.router.health(req).await
+}
+
+async fn health_generate(State(state): State<Arc<AppState>>, req: Request) -> Response {
+    state.router.health_generate(req).await
+}
+
+async fn get_server_info(State(state): State<Arc<AppState>>, req: Request) -> Response {
+    state.router.get_server_info(req).await
+}
+
+async fn v1_models(State(state): State<Arc<AppState>>, req: Request) -> Response {
+    state.router.get_models(req).await
+}
+
+async fn get_model_info(State(state): State<Arc<AppState>>, req: Request) -> Response {
+    state.router.get_model_info(req).await
+}
+
+// Generation endpoints
+// The RouterTrait now accepts optional headers and typed body directly
+async fn generate(
+    State(state): State<Arc<AppState>>,
+    headers: http::HeaderMap,
+    Json(body): Json<GenerateRequest>,
+) -> Response {
+    state.router.route_generate(Some(&headers), &body).await
+}
+
+async fn v1_chat_completions(
+    State(state): State<Arc<AppState>>,
+    headers: http::HeaderMap,
+    Json(body): Json<ChatCompletionRequest>,
+) -> Response {
+    state.router.route_chat(Some(&headers), &body).await
+}
+
+async fn v1_completions(
+    State(state): State<Arc<AppState>>,
+    headers: http::HeaderMap,
+    Json(body): Json<CompletionRequest>,
+) -> Response {
+    state.router.route_completion(Some(&headers), &body).await
+}
+
+// Worker management endpoints
+async fn add_worker(
+    State(state): State<Arc<AppState>>,
+    Query(params): Query<HashMap<String, String>>,
+) -> Response {
+    let worker_url = match params.get("url") {
+        Some(url) => url.to_string(),
+        None => {
+            return (
+                StatusCode::BAD_REQUEST,
+                "Worker URL required. Provide 'url' query parameter",
+            )
+                .into_response();
+        }
+    };
+
+    match state.router.add_worker(&worker_url).await {
+        Ok(message) => (StatusCode::OK, message).into_response(),
+        Err(error) => (StatusCode::BAD_REQUEST, error).into_response(),
+    }
+}
+
+async fn list_workers(State(state): State<Arc<AppState>>) -> Response {
+    let worker_list = state.router.get_worker_urls();
+    Json(serde_json::json!({ "urls": worker_list })).into_response()
+}
+
+async fn remove_worker(
+    State(state): State<Arc<AppState>>,
+    Query(params): Query<HashMap<String, String>>,
+) -> Response {
+    let worker_url = match params.get("url") {
+        Some(url) => url.to_string(),
+        None => return StatusCode::BAD_REQUEST.into_response(),
+    };
+
+    state.router.remove_worker(&worker_url);
+    (
+        StatusCode::OK,
+        format!("Successfully removed worker: {}", worker_url),
+    )
+        .into_response()
+}
+
+async fn flush_cache(State(state): State<Arc<AppState>>, _req: Request) -> Response {
+    state.router.flush_cache().await
+}
+
+async fn get_loads(State(state): State<Arc<AppState>>, _req: Request) -> Response {
+    state.router.get_worker_loads().await
+}
+
+pub struct ServerConfig {
+    pub host: String,
+    pub port: u16,
+    pub router_config: RouterConfig,
+    pub max_payload_size: usize,
+    pub log_dir: Option<String>,
+    pub log_level: Option<String>,
+    pub service_discovery_config: Option<ServiceDiscoveryConfig>,
+    pub prometheus_config: Option<PrometheusConfig>,
+    pub request_timeout_secs: u64,
+    pub request_id_headers: Option<Vec<String>>,
+}
+
+/// Build the Axum application with all routes and middleware
+pub fn build_app(
+    app_state: Arc<AppState>,
+    max_payload_size: usize,
+    request_id_headers: Vec<String>,
+    cors_allowed_origins: Vec<String>,
+) -> Router {
+    // Create routes
+    let protected_routes = Router::new()
+        .route("/generate", post(generate))
+        .route("/v1/chat/completions", post(v1_chat_completions))
+        .route("/v1/completions", post(v1_completions))
+        .route_layer(axum::middleware::from_fn_with_state(
+            app_state.clone(),
+            crate::middleware::concurrency_limit_middleware,
+        ));
+
+    let public_routes = Router::new()
+        .route("/liveness", get(liveness))
+        .route("/readiness", get(readiness))
+        .route("/health", get(health))
+        .route("/health_generate", get(health_generate))
+        .route("/v1/models", get(v1_models))
+        .route("/get_model_info", get(get_model_info))
+        .route("/get_server_info", get(get_server_info));
+
+    let admin_routes = Router::new()
+        .route("/add_worker", post(add_worker))
+        .route("/remove_worker", post(remove_worker))
+        .route("/list_workers", get(list_workers))
+        .route("/flush_cache", post(flush_cache))
+        .route("/get_loads", get(get_loads));
+
+    // Build app with all routes and middleware
+    Router::new()
+        .merge(protected_routes)
+        .merge(public_routes)
+        .merge(admin_routes)
+        // Request body size limiting
+        .layer(tower_http::limit::RequestBodyLimitLayer::new(
+            max_payload_size,
+        ))
+        // Request ID layer - must be added AFTER logging layer in the code
+        // so it executes BEFORE logging layer at runtime (layers execute bottom-up)
+        .layer(crate::middleware::RequestIdLayer::new(request_id_headers))
+        // Custom logging layer that can now see request IDs from extensions
+        .layer(crate::middleware::create_logging_layer())
+        // CORS (should be outermost)
+        .layer(create_cors_layer(cors_allowed_origins))
+        // Fallback
+        .fallback(sink_handler)
+        // State - apply last to get Router<Arc<AppState>>
+        .with_state(app_state)
+}
+
+pub async fn startup(config: ServerConfig) -> Result<(), Box<dyn std::error::Error>> {
+    // Only initialize logging if not already done (for Python bindings support)
+    static LOGGING_INITIALIZED: AtomicBool = AtomicBool::new(false);
+
+    let _log_guard = if !LOGGING_INITIALIZED.swap(true, Ordering::SeqCst) {
+        Some(logging::init_logging(LoggingConfig {
+            level: config
+                .log_level
+                .as_deref()
+                .and_then(|s| match s.to_uppercase().parse::<Level>() {
+                    Ok(l) => Some(l),
+                    Err(_) => {
+                        warn!("Invalid log level string: '{}'. Defaulting to INFO.", s);
+                        None
+                    }
+                })
+                .unwrap_or(Level::INFO),
+            json_format: false,
+            log_dir: config.log_dir.clone(),
+            colorize: true,
+            log_file_name: "sgl-router".to_string(),
+            log_targets: None,
+        }))
+    } else {
+        None
+    };
+
+    // Initialize prometheus metrics exporter
+    if let Some(prometheus_config) = config.prometheus_config {
+        metrics::start_prometheus(prometheus_config);
+    }
+
+    info!(
+        "Starting router on {}:{} | mode: {:?} | policy: {:?} | max_payload: {}MB",
+        config.host,
+        config.port,
+        config.router_config.mode,
+        config.router_config.policy,
+        config.max_payload_size / (1024 * 1024)
+    );
+
+    let client = Client::builder()
+        .pool_idle_timeout(Some(Duration::from_secs(50)))
+        .pool_max_idle_per_host(500) // Increase to 500 connections per host
+        .timeout(Duration::from_secs(config.request_timeout_secs))
+        .connect_timeout(Duration::from_secs(10)) // Separate connection timeout
+        .tcp_nodelay(true)
+        .tcp_keepalive(Some(Duration::from_secs(30))) // Keep connections alive
+        .build()
+        .expect("Failed to create HTTP client");
+
+    // Create the application context with all dependencies
+    let app_context = Arc::new(AppContext::new(
+        config.router_config.clone(),
+        client.clone(),
+        config.router_config.max_concurrent_requests,
+        config.router_config.rate_limit_tokens_per_second,
+    )?);
+
+    // Create router with the context
+    let router = RouterFactory::create_router(&app_context).await?;
+
+    // Set up concurrency limiter with queue if configured
+    let (limiter, processor) = crate::middleware::ConcurrencyLimiter::new(
+        app_context.rate_limiter.clone(),
+        config.router_config.queue_size,
+        Duration::from_secs(config.router_config.queue_timeout_secs),
+    );
+
+    // Start queue processor if enabled
+    if let Some(processor) = processor {
+        tokio::spawn(processor.run());
+        info!(
+            "Started request queue with size: {}, timeout: {}s",
+            config.router_config.queue_size, config.router_config.queue_timeout_secs
+        );
+    }
+
+    // Create app state with router and context
+    let app_state = Arc::new(AppState {
+        router: Arc::from(router),
+        context: app_context.clone(),
+        concurrency_queue_tx: limiter.queue_tx.clone(),
+    });
+    let router_arc = Arc::clone(&app_state.router);
+
+    // Start the service discovery if enabled
+    if let Some(service_discovery_config) = config.service_discovery_config {
+        if service_discovery_config.enabled {
+            match start_service_discovery(service_discovery_config, router_arc).await {
+                Ok(handle) => {
+                    info!("Service discovery started");
+                    // Spawn a task to handle the service discovery thread
+                    spawn(async move {
+                        if let Err(e) = handle.await {
+                            error!("Service discovery task failed: {:?}", e);
+                        }
+                    });
+                }
+                Err(e) => {
+                    error!("Failed to start service discovery: {}", e);
+                    warn!("Continuing without service discovery");
+                }
+            }
+        }
+    }
+
+    info!(
+        "Router ready | workers: {:?}",
+        app_state.router.get_worker_urls()
+    );
+
+    // Configure request ID headers
+    let request_id_headers = config.request_id_headers.clone().unwrap_or_else(|| {
+        vec![
+            "x-request-id".to_string(),
+            "x-correlation-id".to_string(),
+            "x-trace-id".to_string(),
+            "request-id".to_string(),
+        ]
+    });
+
+    // Build the application
+    let app = build_app(
+        app_state,
+        config.max_payload_size,
+        request_id_headers,
+        config.router_config.cors_allowed_origins.clone(),
+    );
+
+    // Create TCP listener - use the configured host
+    let addr = format!("{}:{}", config.host, config.port);
+    let listener = TcpListener::bind(&addr).await?;
+
+    // Start server with graceful shutdown
+    info!("Starting server on {}", addr);
+
+    // Serve the application with graceful shutdown
+    axum::serve(listener, app)
+        .with_graceful_shutdown(shutdown_signal())
+        .await
+        .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)?;
+
+    Ok(())
+}
+
+// Graceful shutdown handler
+async fn shutdown_signal() {
+    let ctrl_c = async {
+        signal::ctrl_c()
+            .await
+            .expect("failed to install Ctrl+C handler");
+    };
+
+    #[cfg(unix)]
+    let terminate = async {
+        signal::unix::signal(signal::unix::SignalKind::terminate())
+            .expect("failed to install signal handler")
+            .recv()
+            .await;
+    };
+
+    #[cfg(not(unix))]
+    let terminate = std::future::pending::<()>();
+
+    tokio::select! {
+        _ = ctrl_c => {
+            info!("Received Ctrl+C, starting graceful shutdown");
+        },
+        _ = terminate => {
+            info!("Received terminate signal, starting graceful shutdown");
+        },
+    }
+}
+
+// CORS Layer Creation
+fn create_cors_layer(allowed_origins: Vec<String>) -> tower_http::cors::CorsLayer {
+    use tower_http::cors::Any;
+
+    let cors = if allowed_origins.is_empty() {
+        // Allow all origins if none specified
+        tower_http::cors::CorsLayer::new()
+            .allow_origin(Any)
+            .allow_methods(Any)
+            .allow_headers(Any)
+            .expose_headers(Any)
+    } else {
+        // Restrict to specific origins
+        let origins: Vec<http::HeaderValue> = allowed_origins
+            .into_iter()
+            .filter_map(|origin| origin.parse().ok())
+            .collect();
+
+        tower_http::cors::CorsLayer::new()
+            .allow_origin(origins)
+            .allow_methods([http::Method::GET, http::Method::POST, http::Method::OPTIONS])
+            .allow_headers([http::header::CONTENT_TYPE, http::header::AUTHORIZATION])
+            .expose_headers([http::header::HeaderName::from_static("x-request-id")])
+    };
+
+    cors.max_age(Duration::from_secs(3600))
+}
diff --git a/sgl-router/src/service_discovery.rs b/sgl-router/src/service_discovery.rs
new file mode 100644
index 000000000..c27317f86
--- /dev/null
+++ b/sgl-router/src/service_discovery.rs
@@ -0,0 +1,1168 @@
+use crate::routers::RouterTrait;
+
+use futures::{StreamExt, TryStreamExt};
+use k8s_openapi::api::core::v1::Pod;
+use kube::{
+    api::Api,
+    runtime::watcher::{watcher, Config},
+    runtime::WatchStreamExt,
+    Client,
+};
+use std::collections::{HashMap, HashSet};
+
+use rustls;
+use std::sync::{Arc, Mutex};
+use std::time::Duration;
+use tokio::task;
+use tokio::time;
+use tracing::{debug, error, info, warn};
+
+/// Represents the service discovery configuration
+#[derive(Debug, Clone)]
+pub struct ServiceDiscoveryConfig {
+    pub enabled: bool,
+    pub selector: HashMap<String, String>,
+    pub check_interval: Duration,
+    pub port: u16,
+    pub namespace: Option<String>,
+    // PD mode specific configuration
+    pub pd_mode: bool,
+    pub prefill_selector: HashMap<String, String>,
+    pub decode_selector: HashMap<String, String>,
+    // Bootstrap port annotation specific to mooncake implementation
+    pub bootstrap_port_annotation: String,
+}
+
+impl Default for ServiceDiscoveryConfig {
+    fn default() -> Self {
+        ServiceDiscoveryConfig {
+            enabled: false,
+            selector: HashMap::new(),
+            check_interval: Duration::from_secs(60),
+            port: 8000,      // Standard port for modern services
+            namespace: None, // None means watch all namespaces
+            pd_mode: false,
+            prefill_selector: HashMap::new(),
+            decode_selector: HashMap::new(),
+            bootstrap_port_annotation: "sglang.ai/bootstrap-port".to_string(),
+        }
+    }
+}
+
+/// Pod type for PD mode service discovery
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum PodType {
+    Prefill,
+    Decode,
+    Regular,
+}
+
+/// Represents a Kubernetes pod's information used for worker management
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PodInfo {
+    pub name: String,
+    pub ip: String,
+    pub status: String,
+    pub is_ready: bool,
+    pub pod_type: Option<PodType>,
+    pub bootstrap_port: Option<u16>,
+}
+
+impl PodInfo {
+    /// Check if a pod matches any of the given selectors
+    fn matches_selector(pod: &Pod, selector: &HashMap<String, String>) -> bool {
+        if selector.is_empty() {
+            return false;
+        }
+
+        pod.metadata
+            .labels
+            .as_ref()
+            .is_some_and(|labels| selector.iter().all(|(k, v)| labels.get(k) == Some(v)))
+    }
+
+    /// Check if a pod should be included in service discovery
+    pub fn should_include(pod: &Pod, config: &ServiceDiscoveryConfig) -> bool {
+        if config.pd_mode {
+            // In PD mode, at least one selector must be non-empty
+            if config.prefill_selector.is_empty() && config.decode_selector.is_empty() {
+                warn!("PD mode enabled but both prefill_selector and decode_selector are empty");
+                return false;
+            }
+            // In PD mode, pod must match either prefill or decode selector
+            Self::matches_selector(pod, &config.prefill_selector)
+                || Self::matches_selector(pod, &config.decode_selector)
+        } else {
+            // In regular mode, pod must match the general selector
+            if config.selector.is_empty() {
+                warn!("Regular mode enabled but selector is empty");
+                return false;
+            }
+            Self::matches_selector(pod, &config.selector)
+        }
+    }
+
+    /// Unified PodInfo creation with optional PD configuration
+    pub fn from_pod(pod: &Pod, config: Option<&ServiceDiscoveryConfig>) -> Option<Self> {
+        let name = pod.metadata.name.clone()?;
+        let status = pod.status.clone()?;
+        let pod_ip = status.pod_ip?;
+
+        let is_ready = if let Some(conditions) = &status.conditions {
+            conditions
+                .iter()
+                .any(|condition| condition.type_ == "Ready" && condition.status == "True")
+        } else {
+            false
+        };
+
+        let pod_status = status.phase.unwrap_or_else(|| "Unknown".to_string());
+
+        // Determine pod type based on labels if config is provided and in PD mode
+        let pod_type = if let Some(config) = config {
+            if config.pd_mode {
+                // Use simplified helper methods for cleaner logic
+                if Self::matches_selector(pod, &config.prefill_selector) {
+                    Some(PodType::Prefill)
+                } else if Self::matches_selector(pod, &config.decode_selector) {
+                    Some(PodType::Decode)
+                } else {
+                    Some(PodType::Regular)
+                }
+            } else {
+                Some(PodType::Regular)
+            }
+        } else {
+            // No config provided, default to None (for backwards compatibility)
+            None
+        };
+
+        // Extract bootstrap port from annotations for prefill pods
+        let bootstrap_port = if matches!(pod_type, Some(PodType::Prefill)) {
+            if let Some(config) = config {
+                pod.metadata
+                    .annotations
+                    .as_ref()
+                    .and_then(|annotations| annotations.get(&config.bootstrap_port_annotation))
+                    .and_then(|port_str| port_str.parse::<u16>().ok())
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        Some(PodInfo {
+            name,
+            ip: pod_ip,
+            status: pod_status,
+            is_ready,
+            pod_type,
+            bootstrap_port,
+        })
+    }
+
+    /// Returns true if the pod is in a state where it can accept traffic
+    pub fn is_healthy(&self) -> bool {
+        self.is_ready && self.status == "Running"
+    }
+
+    /// Generates a worker URL for this pod
+    pub fn worker_url(&self, port: u16) -> String {
+        format!("http://{}:{}", self.ip, port)
+    }
+}
+
+pub async fn start_service_discovery(
+    config: ServiceDiscoveryConfig,
+    router: Arc<dyn RouterTrait>,
+) -> Result<task::JoinHandle<()>, kube::Error> {
+    // Don't initialize anything if service discovery is disabled
+    if !config.enabled {
+        // Return a generic error when service discovery is disabled
+        return Err(kube::Error::Api(kube::error::ErrorResponse {
+            status: "Disabled".to_string(),
+            message: "Service discovery is disabled".to_string(),
+            reason: "ConfigurationError".to_string(),
+            code: 400,
+        }));
+    }
+
+    let _ = rustls::crypto::ring::default_provider().install_default();
+
+    // Initialize Kubernetes client
+    let client = Client::try_default().await?;
+
+    // Log the appropriate selectors based on mode
+    if config.pd_mode {
+        let prefill_selector = config
+            .prefill_selector
+            .iter()
+            .map(|(k, v)| format!("{}={}", k, v))
+            .collect::<Vec<_>>()
+            .join(",");
+
+        let decode_selector = config
+            .decode_selector
+            .iter()
+            .map(|(k, v)| format!("{}={}", k, v))
+            .collect::<Vec<_>>()
+            .join(",");
+
+        info!(
+            "Starting K8s service discovery | PD mode | prefill: '{}' | decode: '{}'",
+            prefill_selector, decode_selector
+        );
+    } else {
+        let label_selector = config
+            .selector
+            .iter()
+            .map(|(k, v)| format!("{}={}", k, v))
+            .collect::<Vec<_>>()
+            .join(",");
+
+        info!(
+            "Starting K8s service discovery | selector: '{}'",
+            label_selector
+        );
+    }
+
+    // Create the task that will run in the background
+    let handle = task::spawn(async move {
+        // We'll track pods we've already added to avoid duplicates
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+
+        // Create a watcher for pods
+        let pods: Api<Pod> = if let Some(namespace) = &config.namespace {
+            Api::namespaced(client, namespace)
+        } else {
+            Api::all(client)
+        };
+
+        debug!("K8s service discovery initialized");
+
+        // Create Arcs for configuration data
+        let config_arc = Arc::new(config.clone());
+        let port = config.port;
+
+        let mut retry_delay = Duration::from_secs(1);
+        const MAX_RETRY_DELAY: Duration = Duration::from_secs(300); // 5 minutes max
+
+        loop {
+            // Create a watcher with the proper parameters according to the kube-rs API
+            let watcher_config = Config::default();
+            let watcher_stream = watcher(pods.clone(), watcher_config).applied_objects();
+
+            // Clone Arcs for the closures
+            let config_clone = Arc::clone(&config_arc);
+            let tracked_pods_clone = Arc::clone(&tracked_pods);
+
+            // Simplified label selector filter using helper method
+            let filtered_stream = watcher_stream.filter_map(move |obj_res| {
+                let config_inner = Arc::clone(&config_clone);
+
+                async move {
+                    match obj_res {
+                        Ok(pod) => {
+                            if PodInfo::should_include(&pod, &config_inner) {
+                                Some(Ok(pod))
+                            } else {
+                                None
+                            }
+                        }
+                        Err(e) => Some(Err(e)),
+                    }
+                }
+            });
+
+            // Clone again for the next closure
+            let tracked_pods_clone2 = Arc::clone(&tracked_pods_clone);
+            let router_clone = Arc::clone(&router);
+            let config_clone2 = Arc::clone(&config_arc);
+
+            match filtered_stream
+                .try_for_each(move |pod| {
+                    let tracked_pods_inner = Arc::clone(&tracked_pods_clone2);
+                    let router_inner = Arc::clone(&router_clone);
+                    let config_inner = Arc::clone(&config_clone2);
+
+                    async move {
+                        let pod_info = PodInfo::from_pod(&pod, Some(&config_inner));
+
+                        if let Some(pod_info) = pod_info {
+                            if pod.metadata.deletion_timestamp.is_some() {
+                                handle_pod_deletion(
+                                    &pod_info,
+                                    tracked_pods_inner,
+                                    router_inner,
+                                    port,
+                                    config_inner.pd_mode,
+                                )
+                                .await;
+                            } else {
+                                handle_pod_event(
+                                    &pod_info,
+                                    tracked_pods_inner,
+                                    router_inner,
+                                    port,
+                                    config_inner.pd_mode,
+                                )
+                                .await;
+                            }
+                        }
+                        Ok(())
+                    }
+                })
+                .await
+            {
+                Ok(_) => {
+                    // Reset retry delay on success
+                    retry_delay = Duration::from_secs(1);
+                }
+                Err(err) => {
+                    error!("Error in Kubernetes watcher: {}", err);
+                    warn!(
+                        "Retrying in {} seconds with exponential backoff",
+                        retry_delay.as_secs()
+                    );
+                    time::sleep(retry_delay).await;
+
+                    // Exponential backoff with jitter
+                    retry_delay = std::cmp::min(retry_delay * 2, MAX_RETRY_DELAY);
+                }
+            }
+
+            // If the watcher exits for some reason, wait a bit before restarting
+            warn!(
+                "Kubernetes watcher exited, restarting in {} seconds",
+                config_arc.check_interval.as_secs()
+            );
+            time::sleep(config_arc.check_interval).await;
+        }
+    });
+
+    Ok(handle)
+}
+
+async fn handle_pod_event(
+    pod_info: &PodInfo,
+    tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
+    router: Arc<dyn RouterTrait>,
+    port: u16,
+    pd_mode: bool,
+) {
+    let worker_url = pod_info.worker_url(port);
+
+    // If pod is healthy, try to add it (with atomic check-and-insert)
+    if pod_info.is_healthy() {
+        // Atomic check-and-insert to prevent race conditions
+        let should_add = {
+            let mut tracker = match tracked_pods.lock() {
+                Ok(tracker) => tracker,
+                Err(e) => {
+                    error!("Failed to acquire tracked_pods lock: {}", e);
+                    return;
+                }
+            };
+
+            if tracker.contains(pod_info) {
+                false // Already tracked
+            } else {
+                // Reserve the spot to prevent other threads from adding the same pod
+                tracker.insert(pod_info.clone());
+                true
+            }
+        };
+
+        if should_add {
+            info!(
+                "Adding pod: {} | type: {:?} | url: {}",
+                pod_info.name, pod_info.pod_type, worker_url
+            );
+
+            // Handle PD mode with specific pod types
+            let result = if pd_mode && pod_info.pod_type.is_some() {
+                // Need to import PDRouter type
+                use crate::routers::http::pd_router::PDRouter;
+
+                // Try to downcast to PDRouter
+                if let Some(pd_router) = router.as_any().downcast_ref::<PDRouter>() {
+                    match &pod_info.pod_type {
+                        Some(PodType::Prefill) => pd_router
+                            .add_prefill_server(worker_url.clone(), pod_info.bootstrap_port)
+                            .await
+                            .map_err(|e| e.to_string()),
+                        Some(PodType::Decode) => pd_router
+                            .add_decode_server(worker_url.clone())
+                            .await
+                            .map_err(|e| e.to_string()),
+                        Some(PodType::Regular) | None => {
+                            // Fall back to regular add_worker for regular pods
+                            router.add_worker(&worker_url).await
+                        }
+                    }
+                } else {
+                    Err("PD mode enabled but router is not a PDRouter".to_string())
+                }
+            } else {
+                // Regular mode or no pod type specified
+                router.add_worker(&worker_url).await
+            };
+
+            match result {
+                Ok(_) => {
+                    debug!("Worker added: {}", worker_url);
+                }
+                Err(e) => {
+                    error!("Failed to add worker {} to router: {}", worker_url, e);
+                    // Remove from tracking since addition failed
+                    if let Ok(mut tracker) = tracked_pods.lock() {
+                        tracker.remove(pod_info);
+                    }
+                }
+            }
+        }
+    }
+}
+
+async fn handle_pod_deletion(
+    pod_info: &PodInfo,
+    tracked_pods: Arc<Mutex<HashSet<PodInfo>>>,
+    router: Arc<dyn RouterTrait>,
+    port: u16,
+    pd_mode: bool,
+) {
+    let worker_url = pod_info.worker_url(port);
+
+    let was_tracked = {
+        let mut tracked = match tracked_pods.lock() {
+            Ok(tracked) => tracked,
+            Err(e) => {
+                error!("Failed to acquire tracked_pods lock during deletion: {}", e);
+                return;
+            }
+        };
+        tracked.remove(pod_info)
+    };
+
+    if was_tracked {
+        info!(
+            "Removing pod: {} | type: {:?} | url: {}",
+            pod_info.name, pod_info.pod_type, worker_url
+        );
+
+        // Handle PD mode removal
+        if pd_mode && pod_info.pod_type.is_some() {
+            use crate::routers::http::pd_router::PDRouter;
+
+            // Try to downcast to PDRouter for PD-specific removal
+            if let Some(pd_router) = router.as_any().downcast_ref::<PDRouter>() {
+                match &pod_info.pod_type {
+                    Some(PodType::Prefill) => {
+                        if let Err(e) = pd_router.remove_prefill_server(&worker_url).await {
+                            error!("Failed to remove prefill server {}: {}", worker_url, e);
+                        }
+                    }
+                    Some(PodType::Decode) => {
+                        if let Err(e) = pd_router.remove_decode_server(&worker_url).await {
+                            error!("Failed to remove decode server {}: {}", worker_url, e);
+                        }
+                    }
+                    Some(PodType::Regular) | None => {
+                        // Fall back to regular remove_worker
+                        router.remove_worker(&worker_url);
+                    }
+                }
+            } else {
+                // PD mode but not a PDRouter, use generic removal
+                router.remove_worker(&worker_url);
+            }
+        } else {
+            // Regular mode removal
+            router.remove_worker(&worker_url);
+        }
+    } else {
+        // This case might occur if a pod is deleted before it was ever marked healthy and added.
+        // Or if the event is duplicated. No action needed on the router if it wasn't tracked (and thus not added).
+        debug!(
+            "Pod deletion event for untracked/already removed pod: {} (type: {:?}). Worker URL: {}",
+            pod_info.name, pod_info.pod_type, worker_url
+        );
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use k8s_openapi::api::core::v1::{Pod, PodCondition, PodSpec, PodStatus};
+    use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta;
+    use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time;
+
+    // Helper function to create a Pod for testing PodInfo::from_pod
+    fn create_k8s_pod(
+        name: Option<&str>,
+        ip: Option<&str>,
+        phase: Option<&str>,
+        ready_status: Option<&str>,
+        deletion_timestamp: Option<Time>,
+    ) -> Pod {
+        let mut pod = Pod {
+            metadata: ObjectMeta {
+                name: name.map(String::from),
+                deletion_timestamp,
+                ..Default::default()
+            },
+            spec: Some(PodSpec::default()),
+            status: None,
+        };
+
+        if ip.is_some() || phase.is_some() || ready_status.is_some() {
+            let mut pod_status = PodStatus {
+                pod_ip: ip.map(String::from),
+                phase: phase.map(String::from),
+                conditions: None,
+                ..Default::default()
+            };
+
+            if let Some(status_str) = ready_status {
+                let condition = PodCondition {
+                    type_: "Ready".to_string(),
+                    status: status_str.to_string(),
+                    last_probe_time: None,
+                    last_transition_time: None,
+                    message: None,
+                    reason: None,
+                    observed_generation: None,
+                };
+                pod_status.conditions = Some(vec![condition]);
+            }
+            pod.status = Some(pod_status);
+        }
+        pod
+    }
+
+    // Helper function to create a Pod with PD-specific labels and annotations
+    fn create_pd_k8s_pod(name: &str, ip: &str, pod_type: &str, bootstrap_port: Option<u16>) -> Pod {
+        let mut labels = std::collections::BTreeMap::new();
+        labels.insert("app".to_string(), "sglang".to_string());
+        labels.insert("component".to_string(), pod_type.to_string());
+
+        let mut annotations = std::collections::BTreeMap::new();
+        if let Some(port) = bootstrap_port {
+            annotations.insert("sglang.ai/bootstrap-port".to_string(), port.to_string());
+        }
+
+        Pod {
+            metadata: ObjectMeta {
+                name: Some(name.to_string()),
+                labels: Some(labels),
+                annotations: Some(annotations),
+                ..Default::default()
+            },
+            spec: Some(PodSpec::default()),
+            status: Some(PodStatus {
+                pod_ip: Some(ip.to_string()),
+                phase: Some("Running".to_string()),
+                conditions: Some(vec![PodCondition {
+                    type_: "Ready".to_string(),
+                    status: "True".to_string(),
+                    last_probe_time: None,
+                    last_transition_time: None,
+                    message: None,
+                    reason: None,
+                    observed_generation: None,
+                }]),
+                ..Default::default()
+            }),
+        }
+    }
+
+    // Helper to create a Router instance for testing event handlers
+    async fn create_test_router() -> Arc<dyn RouterTrait> {
+        use crate::config::{PolicyConfig, RouterConfig};
+        use crate::middleware::TokenBucket;
+        use crate::policies::PolicyFactory;
+        use crate::routers::http::router::Router;
+        use crate::server::AppContext;
+
+        // Create a minimal RouterConfig for testing
+        let router_config = RouterConfig::default();
+
+        // Create AppContext with minimal components
+        let app_context = Arc::new(AppContext {
+            client: reqwest::Client::new(),
+            router_config,
+            rate_limiter: Arc::new(TokenBucket::new(1000, 1000)),
+            tokenizer: None,                // HTTP mode doesn't need tokenizer
+            reasoning_parser_factory: None, // HTTP mode doesn't need reasoning parser
+            tool_parser_registry: None,     // HTTP mode doesn't need tool parser
+        });
+
+        let policy = PolicyFactory::create_from_config(&PolicyConfig::Random);
+        let router = Router::new(vec![], policy, &app_context).await.unwrap();
+        Arc::new(router) as Arc<dyn RouterTrait>
+    }
+
+    // Helper to create a PD config for testing
+    fn create_pd_config() -> ServiceDiscoveryConfig {
+        let mut prefill_selector = HashMap::new();
+        prefill_selector.insert("app".to_string(), "sglang".to_string());
+        prefill_selector.insert("component".to_string(), "prefill".to_string());
+
+        let mut decode_selector = HashMap::new();
+        decode_selector.insert("app".to_string(), "sglang".to_string());
+        decode_selector.insert("component".to_string(), "decode".to_string());
+
+        ServiceDiscoveryConfig {
+            enabled: true,
+            selector: HashMap::new(),
+            check_interval: Duration::from_secs(60),
+            port: 8080,
+            namespace: None,
+            pd_mode: true,
+            prefill_selector,
+            decode_selector,
+            bootstrap_port_annotation: "sglang.ai/bootstrap-port".to_string(),
+        }
+    }
+
+    #[test]
+    fn test_pod_info_should_include() {
+        let config = create_pd_config();
+
+        // Test prefill pod should be included
+        let prefill_pod = create_pd_k8s_pod("prefill-pod", "10.0.0.1", "prefill", Some(8081));
+        assert!(PodInfo::should_include(&prefill_pod, &config));
+
+        // Test decode pod should be included
+        let decode_pod = create_pd_k8s_pod("decode-pod", "10.0.0.2", "decode", None);
+        assert!(PodInfo::should_include(&decode_pod, &config));
+
+        // Test unmatched pod should not be included
+        let unmatched_pod = create_pd_k8s_pod("other-pod", "10.0.0.3", "other", None);
+        assert!(!PodInfo::should_include(&unmatched_pod, &config));
+
+        // Test regular mode
+        let mut regular_config = ServiceDiscoveryConfig::default();
+        regular_config
+            .selector
+            .insert("app".to_string(), "sglang".to_string());
+        regular_config.pd_mode = false;
+
+        let regular_pod = create_pd_k8s_pod("worker-pod", "10.0.0.4", "worker", None);
+        assert!(PodInfo::should_include(&regular_pod, &regular_config));
+    }
+
+    #[test]
+    fn test_service_discovery_config_default() {
+        let config = ServiceDiscoveryConfig::default();
+        assert!(!config.enabled);
+        assert!(config.selector.is_empty());
+        assert_eq!(config.check_interval, Duration::from_secs(60));
+        assert_eq!(config.port, 8000);
+        assert!(config.namespace.is_none());
+        assert!(!config.pd_mode);
+        assert!(config.prefill_selector.is_empty());
+        assert!(config.decode_selector.is_empty());
+        assert_eq!(config.bootstrap_port_annotation, "sglang.ai/bootstrap-port");
+    }
+
+    #[test]
+    fn test_pod_type_enum() {
+        // Test that PodType enum has expected variants
+        let prefill = PodType::Prefill;
+        let decode = PodType::Decode;
+        let regular = PodType::Regular;
+
+        assert_eq!(format!("{:?}", prefill), "Prefill");
+        assert_eq!(format!("{:?}", decode), "Decode");
+        assert_eq!(format!("{:?}", regular), "Regular");
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_valid() {
+        let k8s_pod = create_k8s_pod(
+            Some("test-pod"),
+            Some("10.0.0.1"),
+            Some("Running"),
+            Some("True"),
+            None,
+        );
+        let pod_info = PodInfo::from_pod(&k8s_pod, None).unwrap();
+        assert_eq!(pod_info.name, "test-pod");
+        assert_eq!(pod_info.ip, "10.0.0.1");
+        assert_eq!(pod_info.status, "Running");
+        assert!(pod_info.is_ready);
+        assert!(pod_info.pod_type.is_none());
+        assert!(pod_info.bootstrap_port.is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_with_pd_config_prefill() {
+        let k8s_pod = create_pd_k8s_pod("prefill-pod", "10.0.0.1", "prefill", Some(8081));
+        let config = create_pd_config();
+
+        let pod_info = PodInfo::from_pod(&k8s_pod, Some(&config)).unwrap();
+        assert_eq!(pod_info.name, "prefill-pod");
+        assert_eq!(pod_info.ip, "10.0.0.1");
+        assert_eq!(pod_info.status, "Running");
+        assert!(pod_info.is_ready);
+        assert_eq!(pod_info.pod_type, Some(PodType::Prefill));
+        assert_eq!(pod_info.bootstrap_port, Some(8081));
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_with_pd_config_decode() {
+        let k8s_pod = create_pd_k8s_pod("decode-pod", "10.0.0.2", "decode", None);
+        let config = create_pd_config();
+
+        let pod_info = PodInfo::from_pod(&k8s_pod, Some(&config)).unwrap();
+        assert_eq!(pod_info.name, "decode-pod");
+        assert_eq!(pod_info.ip, "10.0.0.2");
+        assert_eq!(pod_info.status, "Running");
+        assert!(pod_info.is_ready);
+        assert_eq!(pod_info.pod_type, Some(PodType::Decode));
+        assert!(pod_info.bootstrap_port.is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_with_pd_config_regular_mode() {
+        let k8s_pod = create_pd_k8s_pod("regular-pod", "10.0.0.3", "worker", None);
+        let mut config = create_pd_config();
+        config.pd_mode = false; // Set to regular mode
+
+        let pod_info = PodInfo::from_pod(&k8s_pod, Some(&config)).unwrap();
+        assert_eq!(pod_info.name, "regular-pod");
+        assert_eq!(pod_info.ip, "10.0.0.3");
+        assert_eq!(pod_info.status, "Running");
+        assert!(pod_info.is_ready);
+        assert_eq!(pod_info.pod_type, Some(PodType::Regular));
+        assert!(pod_info.bootstrap_port.is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_with_pd_config_unmatched_labels() {
+        let k8s_pod = create_pd_k8s_pod("unknown-pod", "10.0.0.4", "unknown", None);
+        let config = create_pd_config();
+
+        let pod_info = PodInfo::from_pod(&k8s_pod, Some(&config)).unwrap();
+        assert_eq!(pod_info.name, "unknown-pod");
+        assert_eq!(pod_info.ip, "10.0.0.4");
+        assert_eq!(pod_info.status, "Running");
+        assert!(pod_info.is_ready);
+        assert_eq!(pod_info.pod_type, Some(PodType::Regular));
+        assert!(pod_info.bootstrap_port.is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_with_pd_config_invalid_bootstrap_port() {
+        let mut pod = create_pd_k8s_pod("prefill-pod", "10.0.0.1", "prefill", None);
+        // Add invalid bootstrap port annotation
+        pod.metadata.annotations.as_mut().unwrap().insert(
+            "sglang.ai/bootstrap-port".to_string(),
+            "invalid".to_string(),
+        );
+        let config = create_pd_config();
+
+        let pod_info = PodInfo::from_pod(&pod, Some(&config)).unwrap();
+        assert_eq!(pod_info.pod_type, Some(PodType::Prefill));
+        assert!(pod_info.bootstrap_port.is_none()); // Should be None for invalid port
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_not_ready() {
+        let k8s_pod = create_k8s_pod(
+            Some("test-pod"),
+            Some("10.0.0.1"),
+            Some("Running"),
+            Some("False"),
+            None,
+        );
+        let pod_info = PodInfo::from_pod(&k8s_pod, None).unwrap();
+        assert!(!pod_info.is_ready);
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_no_conditions() {
+        let k8s_pod = create_k8s_pod(
+            Some("test-pod"),
+            Some("10.0.0.1"),
+            Some("Running"),
+            None,
+            None,
+        );
+        let pod_info = PodInfo::from_pod(&k8s_pod, None).unwrap();
+        assert!(!pod_info.is_ready);
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_missing_name() {
+        let k8s_pod = create_k8s_pod(None, Some("10.0.0.1"), Some("Running"), Some("True"), None);
+        assert!(PodInfo::from_pod(&k8s_pod, None).is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_missing_ip() {
+        let k8s_pod = create_k8s_pod(Some("test-pod"), None, Some("Running"), Some("True"), None);
+        assert!(PodInfo::from_pod(&k8s_pod, None).is_none());
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_missing_status_phase() {
+        let k8s_pod = create_k8s_pod(Some("test-pod"), Some("10.0.0.1"), None, Some("True"), None);
+        let pod_info = PodInfo::from_pod(&k8s_pod, None).unwrap();
+        assert_eq!(pod_info.status, "Unknown");
+    }
+
+    #[test]
+    fn test_pod_info_from_pod_no_status_object() {
+        let mut k8s_pod = create_k8s_pod(Some("test-pod"), None, None, None, None);
+        k8s_pod.status = None;
+        assert!(PodInfo::from_pod(&k8s_pod, None).is_none());
+    }
+
+    #[test]
+    fn test_pod_info_is_healthy() {
+        let healthy_pod = PodInfo {
+            name: "p1".into(),
+            ip: "1.1.1.1".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: None,
+            bootstrap_port: None,
+        };
+        assert!(healthy_pod.is_healthy());
+
+        let not_ready_pod = PodInfo {
+            name: "p2".into(),
+            ip: "1.1.1.2".into(),
+            status: "Running".into(),
+            is_ready: false,
+            pod_type: None,
+            bootstrap_port: None,
+        };
+        assert!(!not_ready_pod.is_healthy());
+
+        let not_running_pod = PodInfo {
+            name: "p3".into(),
+            ip: "1.1.1.3".into(),
+            status: "Pending".into(),
+            is_ready: true,
+            pod_type: None,
+            bootstrap_port: None,
+        };
+        assert!(!not_running_pod.is_healthy());
+    }
+
+    #[test]
+    fn test_pod_info_worker_url() {
+        let pod_info = PodInfo {
+            name: "p1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: None,
+            bootstrap_port: None,
+        };
+        assert_eq!(pod_info.worker_url(8080), "http://1.2.3.4:8080");
+    }
+
+    #[test]
+    fn test_pod_info_equality_with_pod_type() {
+        let pod1 = PodInfo {
+            name: "pod1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Prefill),
+            bootstrap_port: Some(8081),
+        };
+
+        let pod2 = PodInfo {
+            name: "pod1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Prefill),
+            bootstrap_port: Some(8081),
+        };
+
+        let pod3 = PodInfo {
+            name: "pod1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Decode),
+            bootstrap_port: None,
+        };
+
+        assert_eq!(pod1, pod2);
+        assert_ne!(pod1, pod3);
+    }
+
+    #[tokio::test]
+    async fn test_handle_pod_event_add_unhealthy_pod() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "pod1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Pending".into(),
+            is_ready: false,
+            pod_type: None,
+            bootstrap_port: None,
+        };
+        let port = 8080u16;
+
+        handle_pod_event(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            false, // pd_mode = false
+        )
+        .await;
+
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+        assert!(!router
+            .get_worker_urls()
+            .contains(&pod_info.worker_url(port)));
+    }
+
+    #[tokio::test]
+    async fn test_handle_pod_deletion_non_existing_pod() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "pod1".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: None,
+            bootstrap_port: None,
+        };
+        let port = 8080u16;
+
+        handle_pod_deletion(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            false, // pd_mode = false
+        )
+        .await;
+
+        assert!(tracked_pods.lock().unwrap().is_empty());
+        assert!(router.get_worker_urls().is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_handle_pd_pod_event_prefill_pod() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "prefill-pod".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Prefill),
+            bootstrap_port: Some(8081),
+        };
+        let port = 8080u16;
+
+        // This test validates the structure but won't actually add workers since
+        // we're using a regular router instead of PD router
+        handle_pod_event(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            false, // pd_mode = false, so it should fallback to regular handling
+        )
+        .await;
+
+        // Pod should not be tracked since router.add_worker will fail for non-running server
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+    }
+
+    #[tokio::test]
+    async fn test_handle_pd_pod_event_decode_pod() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "decode-pod".into(),
+            ip: "1.2.3.5".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Decode),
+            bootstrap_port: None,
+        };
+        let port = 8080u16;
+
+        handle_pod_event(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            false, // pd_mode = false, so it should fallback to regular handling
+        )
+        .await;
+
+        // Pod should not be tracked since router.add_worker will fail for non-running server
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+    }
+
+    #[tokio::test]
+    async fn test_handle_pd_pod_deletion_tracked_pod() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "test-pod".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Prefill),
+            bootstrap_port: Some(8081),
+        };
+
+        // Add pod to tracked set first
+        {
+            let mut tracked = tracked_pods.lock().unwrap();
+            tracked.insert(pod_info.clone());
+        }
+
+        let port = 8080u16;
+
+        handle_pod_deletion(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            false, // pd_mode = false
+        )
+        .await;
+
+        // Pod should be removed from tracking
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+    }
+
+    #[tokio::test]
+    async fn test_handle_pd_pod_deletion_untracked_pod() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "untracked-pod".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Decode),
+            bootstrap_port: None,
+        };
+        let port = 8080u16;
+
+        // Don't add pod to tracked set
+
+        handle_pod_deletion(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            true, // pd_mode = true
+        )
+        .await;
+
+        // Tracked set should remain empty
+        assert!(tracked_pods.lock().unwrap().is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_unified_handler_regular_mode() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "regular-pod".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Regular),
+            bootstrap_port: None,
+        };
+        let port = 8080u16;
+
+        // Test that unified handler works for regular mode
+        handle_pod_event(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            false, // pd_mode = false
+        )
+        .await;
+
+        // Pod should not be tracked since router.add_worker will fail for non-running server
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+    }
+
+    #[tokio::test]
+    async fn test_unified_handler_pd_mode_with_prefill() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "prefill-pod".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Prefill),
+            bootstrap_port: Some(8081),
+        };
+        let port = 8080u16;
+
+        // Test that unified handler works for PD mode with prefill
+        handle_pod_event(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            true, // pd_mode = true
+        )
+        .await;
+
+        // Pod should not be tracked since router.add_pd_worker will fail for regular router
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+    }
+
+    #[tokio::test]
+    async fn test_unified_handler_deletion_with_pd_mode() {
+        let router = create_test_router().await;
+        let tracked_pods = Arc::new(Mutex::new(HashSet::new()));
+        let pod_info = PodInfo {
+            name: "decode-pod".into(),
+            ip: "1.2.3.4".into(),
+            status: "Running".into(),
+            is_ready: true,
+            pod_type: Some(PodType::Decode),
+            bootstrap_port: None,
+        };
+
+        // Add pod to tracked set first
+        {
+            let mut tracked = tracked_pods.lock().unwrap();
+            tracked.insert(pod_info.clone());
+        }
+
+        let port = 8080u16;
+
+        // Test that unified handler works for deletion in PD mode
+        handle_pod_deletion(
+            &pod_info,
+            Arc::clone(&tracked_pods),
+            Arc::clone(&router),
+            port,
+            true, // pd_mode = true
+        )
+        .await;
+
+        // Pod should be removed from tracking
+        assert!(!tracked_pods.lock().unwrap().contains(&pod_info));
+    }
+}
diff --git a/sgl-router/src/tokenizer/README.md b/sgl-router/src/tokenizer/README.md
new file mode 100644
index 000000000..67972ccbd
--- /dev/null
+++ b/sgl-router/src/tokenizer/README.md
@@ -0,0 +1,1021 @@
+# Tokenizer Architecture
+
+## 1. Executive Summary
+
+### High-Level Overview
+
+The SGL Router tokenizer layer provides a unified interface for text tokenization and detokenization, supporting multiple tokenizer backends (HuggingFace, Tiktoken, Mock) with sophisticated streaming capabilities and stop sequence detection. The architecture follows a trait-based design pattern enabling pluggable tokenizer implementations while maintaining consistent APIs across the router.
+
+**Key Components:**
+- **Factory Pattern**: Auto-detection and creation of appropriate tokenizer types from files or model names
+- **HuggingFace Hub Integration**: Automatic downloading of tokenizer files from HuggingFace Hub for model IDs
+- **Trait System**: `Encoder`, `Decoder`, and `Tokenizer` traits for implementation flexibility
+- **Streaming**: Incremental decoding with UTF-8 boundary handling and buffering
+- **Stop Sequences**: Complex pattern matching for stop tokens and sequences with "jail" buffering
+- **Sequence Management**: Stateful token sequence tracking with incremental text generation
+- **Chat Templates**: Jinja2-based conversation formatting with HuggingFace compatibility
+- **Metrics Integration**: Comprehensive performance and error tracking across all operations
+
+**Data Flow:**
+1. Request → Factory (type detection/HF download) → Concrete Tokenizer Creation
+2. Encode: Text → Tokenizer → Encoding (token IDs)
+3. Stream: Token IDs → DecodeStream → Incremental Text Chunks
+4. Stop Detection: Tokens → StopSequenceDecoder → Text/Held/Stopped
+5. Sequence: Tokens → Sequence → Incremental Decoding → Text Output
+
+### Architecture Highlights
+
+- **Extended Backend Support**: HuggingFace, Tiktoken (GPT models), and Mock for testing
+- **HuggingFace Hub Integration**: Automatic tokenizer downloads with caching
+- **Comprehensive Metrics**: Full TokenizerMetrics integration for observability
+- **Unified Dependencies**: All tokenizer backends included by default (no feature gates)
+- **Stop Sequence Detection**: Sophisticated partial matching with jail buffer
+- **Chat Template Support**: Full Jinja2 rendering with HuggingFace compatibility
+- **Thread Safety**: Arc-based sharing with Send + Sync guarantees
+
+## 2. Mermaid Diagrams
+
+### Component Flow Diagram
+
+```mermaid
+graph TB
+    subgraph Input
+        R[Request] --> F[Factory]
+    end
+
+    subgraph Factory Layer
+        F --> FD[File Detection]
+        F --> MD[Model Detection]
+        FD --> HF[HuggingFace]
+        FD --> TK[Tiktoken]
+        MD --> TK
+        FD --> MK[Mock]
+    end
+
+    subgraph Tokenizer Implementations
+        HF --> T[Tokenizer Wrapper]
+        TK --> T
+        MK --> T
+    end
+
+    subgraph Processing
+        T --> E[Encode]
+        T --> D[Decode]
+        T --> DS[DecodeStream]
+        T --> SQ[Sequence]
+        T --> SD[StopSequenceDecoder]
+    end
+
+    subgraph Output
+        E --> ENC[Encoding]
+        D --> TXT[Text]
+        DS --> STRM[Stream Chunks]
+        SQ --> ITXT[Incremental Text]
+        SD --> SO[Stop Output]
+    end
+
+    subgraph Metrics
+        M[TokenizerMetrics]
+        E -.-> M
+        D -.-> M
+        DS -.-> M
+        SD -.-> M
+    end
+```
+
+### Sequence Flow Diagram
+
+```mermaid
+sequenceDiagram
+    participant C as Client
+    participant F as Factory
+    participant T as Tokenizer
+    participant DS as DecodeStream
+    participant SD as StopDecoder
+    participant M as Metrics
+
+    C->>F: create_tokenizer(path_or_model_id)
+    F->>F: detect_type()
+    alt local file
+        F->>T: new HF/Tiktoken/Mock
+    else HuggingFace model ID
+        F->>F: download_tokenizer_from_hf()
+        F->>T: new from downloaded files
+    end
+    F->>M: record_factory_load()
+    F-->>C: Arc<dyn Tokenizer>
+
+    C->>T: encode(text)
+    T->>M: record_encode_request()
+    T->>T: tokenize
+    T->>M: record_tokens_per_encode()
+    T-->>C: Encoding
+
+    C->>DS: new(tokenizer, tokens)
+    loop streaming
+        C->>DS: step(token_id)
+        DS->>T: decode(partial)
+        DS->>DS: check UTF-8 boundary
+        alt complete char
+            DS->>M: record_stream_token()
+            DS-->>C: Some(text)
+        else incomplete
+            DS->>M: record_incomplete_utf8()
+            DS-->>C: None
+        end
+    end
+
+    C->>SD: process_token(id)
+    SD->>SD: check stop conditions
+    alt stop token
+        SD->>M: record_stop_detected()
+        SD-->>C: Stopped
+    else partial match
+        SD->>M: record_partial_match()
+        SD-->>C: Held
+    else no match
+        SD->>T: decode incremental
+        SD-->>C: Text(output)
+    end
+```
+
+### Class/Type Diagram
+
+```mermaid
+classDiagram
+    class Encoder {
+        <<trait>>
+        +encode(input: &str) Result~Encoding~
+        +encode_batch(inputs: &[&str]) Result~Vec~Encoding~~
+    }
+
+    class Decoder {
+        <<trait>>
+        +decode(token_ids: &[u32], skip_special: bool) Result~String~
+    }
+
+    class TokenizerTrait {
+        <<trait>>
+        +vocab_size() usize
+        +get_special_tokens() &SpecialTokens
+        +token_to_id(token: &str) Option~u32~
+        +id_to_token(id: u32) Option~String~
+    }
+
+    class Tokenizer {
+        -Arc~dyn TokenizerTrait~
+        +from_file(path: &str) Result~Tokenizer~
+        +from_arc(Arc~dyn TokenizerTrait~) Self
+        +decode_stream(&[u32], bool) DecodeStream
+        +encode(&str) Result~Encoding~
+        +decode(&[u32], bool) Result~String~
+    }
+
+    class Encoding {
+        <<enum>>
+        Hf(Box~HfEncoding~)
+        Sp(Vec~u32~)
+        Tiktoken(Vec~usize~)
+        +token_ids() Vec~u32~
+        +token_ids_ref() &[u32]
+    }
+
+    class HuggingFaceTokenizer {
+        -tokenizer: HfTokenizer
+        -special_tokens: SpecialTokens
+        -vocab: HashMap~String, u32~
+        -reverse_vocab: HashMap~u32, String~
+        +from_file(path: &str) Result~Self~
+        +apply_chat_template(&[ChatMessage]) Result~String~
+    }
+
+    class TiktokenTokenizer {
+        -tokenizer: CoreBPE
+        -model: TiktokenModel
+        -special_tokens: SpecialTokens
+        -vocab_size: usize
+        +new(model: TiktokenModel) Result~Self~
+        +from_model_name(name: &str) Result~Self~
+    }
+
+    class MockTokenizer {
+        -vocab: HashMap~String, u32~
+        -reverse_vocab: HashMap~u32, String~
+        -special_tokens: SpecialTokens
+        +new() Self
+    }
+
+    class DecodeStream {
+        -tokenizer: Arc~dyn TokenizerTrait~
+        -all_token_ids: Vec~u32~
+        -prefix_offset: usize
+        -read_offset: usize
+        -skip_special_tokens: bool
+        +new(tokenizer, &[u32], bool) Self
+        +step(u32) Result~Option~String~~
+        +flush() Result~Option~String~~
+    }
+
+    class Sequence {
+        -tokenizer: Arc~dyn TokenizerTrait~
+        -token_ids: Vec~u32~
+        -prefix_offset: usize
+        -read_offset: usize
+        +append_text(&str) Result~()~
+        +append_token(u32) Result~String~
+        +text() Result~String~
+    }
+
+    class StopSequenceDecoder {
+        -tokenizer: Arc~dyn TokenizerTrait~
+        -config: StopSequenceConfig
+        -jail_buffer: String
+        -token_buffer: Vec~u32~
+        -stopped: bool
+        +process_token(u32) Result~SequenceDecoderOutput~
+        +flush() SequenceDecoderOutput
+        +reset()
+    }
+
+    Encoder <|.. HuggingFaceTokenizer
+    Encoder <|.. TiktokenTokenizer
+    Encoder <|.. MockTokenizer
+    Decoder <|.. HuggingFaceTokenizer
+    Decoder <|.. TiktokenTokenizer
+    Decoder <|.. MockTokenizer
+    TokenizerTrait <|.. HuggingFaceTokenizer
+    TokenizerTrait <|.. TiktokenTokenizer
+    TokenizerTrait <|.. MockTokenizer
+    TokenizerTrait --|> Encoder
+    TokenizerTrait --|> Decoder
+
+    Tokenizer o-- TokenizerTrait
+    DecodeStream o-- TokenizerTrait
+    Sequence o-- TokenizerTrait
+    StopSequenceDecoder o-- TokenizerTrait
+```
+
+## 3. Module-by-Module Deep Dive
+
+### 3.1 mod.rs (Main Module)
+
+**Location**: `src/tokenizer/mod.rs`
+
+**Public API:**
+
+```rust
+pub struct Tokenizer(Arc<dyn traits::Tokenizer>);
+
+impl Tokenizer {
+    pub fn from_file(file_path: &str) -> Result<Tokenizer>
+    pub fn from_file_with_chat_template(
+        file_path: &str,
+        chat_template_path: Option<&str>
+    ) -> Result<Tokenizer>
+    pub fn from_arc(tokenizer: Arc<dyn traits::Tokenizer>) -> Self
+    pub fn decode_stream(&self, prompt_token_ids: &[u32], skip_special_tokens: bool) -> DecodeStream
+    pub fn encode(&self, input: &str) -> Result<Encoding>
+    pub fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>>
+    pub fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String>
+    pub fn vocab_size(&self) -> usize
+    pub fn get_special_tokens(&self) -> &SpecialTokens
+    pub fn token_to_id(&self, token: &str) -> Option<u32>
+    pub fn id_to_token(&self, id: u32) -> Option<String>
+}
+```
+
+**Key Responsibilities:**
+- Main wrapper providing unified interface (mod.rs:36-93)
+- Arc-based shared ownership for thread safety
+- Delegates to concrete implementations via trait object
+- Factory method integration for creation
+
+**State Management:**
+- Single field: `Arc<dyn traits::Tokenizer>` for polymorphic dispatch
+- Immutable after creation, Clone via Arc
+
+**Re-exports** (mod.rs:26-43):
+- Factory functions: `create_tokenizer`, `create_tokenizer_async`, `create_tokenizer_from_file`, `create_tokenizer_with_chat_template`
+- Types: `Sequence`, `StopSequenceConfig`, `DecodeStream`, `Encoding`, `TokenizerType`
+- Chat template: `ChatMessage`
+- Tokenizer implementations: `HuggingFaceTokenizer`, `TiktokenTokenizer`
+
+### 3.2 traits.rs (Trait Definitions)
+
+**Location**: `src/tokenizer/traits.rs`
+
+**Core Traits:**
+
+```rust
+pub trait Encoder: Send + Sync {
+    fn encode(&self, input: &str) -> Result<Encoding>;
+    fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>>;
+}
+
+pub trait Decoder: Send + Sync {
+    fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String>;
+}
+
+pub trait Tokenizer: Encoder + Decoder {
+    fn vocab_size(&self) -> usize;
+    fn get_special_tokens(&self) -> &SpecialTokens;
+    fn token_to_id(&self, token: &str) -> Option<u32>;
+    fn id_to_token(&self, id: u32) -> Option<String>;
+}
+```
+
+**Encoding Enum** (traits.rs:24-53):
+```rust
+pub enum Encoding {
+    Hf(Box<tokenizers::tokenizer::Encoding>),  // HuggingFace
+    Sp(Vec<u32>),                               // SentencePiece
+    Tiktoken(Vec<usize>),                        // GPT models
+}
+```
+
+**Key Design Decisions:**
+- Separation of Encoder/Decoder allows partial implementations
+- Send + Sync for thread safety
+- Encoding enum handles different backend representations
+- `token_ids()` returns `Vec<u32>` for compatibility (traits.rs:34-40)
+- `token_ids_ref()` has limitation for Tiktoken (returns empty slice)
+
+**SpecialTokens Struct** (traits.rs:55-65):
+- Standard tokens: bos, eos, unk, sep, pad, cls, mask
+- Additional tokens vector for custom special tokens
+
+### 3.3 factory.rs (Tokenizer Creation)
+
+**Location**: `src/tokenizer/factory.rs`
+
+**Public Functions:**
+
+```rust
+pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tokenizer>>
+pub fn create_tokenizer_with_chat_template(
+    file_path: &str,
+    chat_template_path: Option<&str>
+) -> Result<Arc<dyn traits::Tokenizer>>
+pub fn create_tokenizer(model_name_or_path: &str) -> Result<Arc<dyn traits::Tokenizer>>
+pub async fn create_tokenizer_async(model_name_or_path: &str) -> Result<Arc<dyn traits::Tokenizer>>
+pub fn get_tokenizer_info(file_path: &str) -> Result<TokenizerType>
+```
+
+**Auto-Detection Logic** (factory.rs:94-132):
+1. Read first 512 bytes of file
+2. Check for JSON format (HuggingFace)
+3. Check for GGUF magic bytes
+4. Check for SentencePiece patterns
+
+**File Type Detection** (factory.rs:135-161):
+- JSON detection: Skip BOM, find `{` or `[`
+- SentencePiece: Check for specific byte patterns
+- GGUF: Check magic number "GGUF"
+
+**Model Name Routing** (factory.rs:145-193):
+- GPT models → Tiktoken (gpt-4, gpt-3.5, davinci, curie, etc.)
+- File paths → file-based creation
+- HuggingFace model IDs → Automatic download from Hub
+
+**HuggingFace Hub Integration**:
+- Downloads tokenizer files (tokenizer.json, tokenizer_config.json, etc.)
+- Respects HF_TOKEN environment variable for private models
+- Caches downloaded files using hf-hub crate
+- Async and blocking versions available
+
+**Metrics Integration:**
+- Records factory load/error events (factory.rs:56-57, 82-83)
+- Tracks vocab size on successful load
+- Measures load duration
+
+### 3.4 huggingface.rs (HuggingFace Implementation)
+
+**Location**: `src/tokenizer/huggingface.rs`
+
+**Public API:**
+
+```rust
+pub struct HuggingFaceTokenizer {
+    tokenizer: HfTokenizer,
+    special_tokens: SpecialTokens,
+    vocab: HashMap<String, u32>,
+    reverse_vocab: HashMap<u32, String>,
+}
+
+impl HuggingFaceTokenizer {
+    pub fn from_file(file_path: &str) -> Result<Self>
+    pub fn from_tokenizer(tokenizer: HfTokenizer) -> Self
+    pub fn apply_chat_template(&self, messages: &[ChatMessage]) -> Result<String>
+}
+```
+
+**Special Token Extraction** (huggingface.rs:58-82):
+- Searches for common patterns: `<s>`, `</s>`, `<unk>`, `[CLS]`, etc.
+- Falls back to None if not found
+
+**Vocab Management:**
+- Builds forward and reverse mappings on creation (huggingface.rs:26-30)
+- Used for token↔ID conversions
+
+**Metrics** (huggingface.rs:97-111, 136-150):
+- Tracks encode/decode requests, durations
+- Records character/token counts
+- Reports errors with context
+
+**Chat Template Integration** (huggingface.rs:21-144):
+- Automatic loading from tokenizer_config.json
+- Custom template loading from .jinja files
+- Runtime template modification via `set_chat_template()`
+- Full Jinja2 rendering via minijinja
+- See section 3.10 for detailed chat template architecture
+
+### 3.5 sequence.rs (Sequence Management)
+
+**Location**: `src/tokenizer/sequence.rs`
+
+**Core Structure:**
+
+```rust
+pub struct Sequence {
+    tokenizer: Arc<dyn TokenizerTrait>,
+    token_ids: Vec<u32>,
+    prefix_offset: usize,  // Start of prefix window
+    read_offset: usize,     // End of processed tokens
+}
+```
+
+**Key Methods:**
+
+```rust
+impl Sequence {
+    pub fn new(tokenizer: Arc<dyn TokenizerTrait>) -> Self
+    pub fn with_tokens(tokenizer: Arc<dyn TokenizerTrait>, token_ids: Vec<u32>) -> Self
+    pub fn append_text(&mut self, input: &str) -> Result<()>
+    pub fn append_token(&mut self, token_id: u32) -> Result<String>
+    pub fn text(&self) -> Result<String>
+}
+```
+
+**Incremental Decoding Algorithm** (sequence.rs:93-142):
+1. Store old read_offset before adding token
+2. Push new token, update read_offset
+3. Decode prefix window (prefix_offset..old_read_offset)
+4. Decode full window (prefix_offset..current)
+5. Check for UTF-8 boundary issues
+6. Extract only new text portion
+7. Handle incomplete UTF-8 (�) by returning empty
+
+**State Variables:**
+- `token_ids`: Complete sequence of tokens
+- `prefix_offset`: Where last decode started
+- `read_offset`: Current position in sequence
+
+### 3.6 stop.rs (Stop Sequence Detection)
+
+**Location**: `src/tokenizer/stop.rs`
+
+**Core Components:**
+
+```rust
+pub enum SequenceDecoderOutput {
+    Text(String),           // Normal output
+    Held,                   // Partial match, holding text
+    Stopped,                // Stop matched (hidden)
+    StoppedWithText(String),// Stop matched (visible)
+}
+
+pub struct StopSequenceConfig {
+    pub stop_tokens: HashSet<u32>,
+    pub stop_sequences: Vec<String>,
+    pub visible_stop_tokens: HashSet<u32>,
+    pub visible_stop_sequences: Vec<String>,
+}
+
+pub struct StopSequenceDecoder {
+    tokenizer: Arc<dyn traits::Tokenizer>,
+    config: StopSequenceConfig,
+    jail_buffer: String,     // Held text for partial matches
+    token_buffer: Vec<u32>,  // All tokens
+    prefix_offset: usize,
+    read_offset: usize,
+    stopped: bool,
+    skip_special_tokens: bool,
+}
+```
+
+**Stop Detection Algorithm** (stop.rs:97-252):
+
+1. **Token-level checks** (stop.rs:104-132):
+   - Check stop_tokens → return Stopped
+   - Check visible_stop_tokens → return StoppedWithText
+
+2. **Incremental decode** (stop.rs:136-166):
+   - Decode previous context
+   - Decode including new token
+   - Check for incomplete UTF-8
+
+3. **String matching** (stop.rs:169-202):
+   - Combine jail_buffer + new_text
+   - Check for complete matches
+   - Check visible sequences
+
+4. **Partial match detection** (stop.rs:204-239):
+   - Check all suffixes as potential prefixes
+   - Split safe text vs potential match
+   - Jail potential match text
+
+**Critical Fix** (stop.rs:385-424):
+- Ensures no repeated/accumulated output
+- Only outputs NEW text, not full buffer
+
+### 3.7 stream.rs (Streaming Decode)
+
+**Location**: `src/tokenizer/stream.rs`
+
+**Structure:**
+
+```rust
+pub struct DecodeStream {
+    tokenizer: Arc<dyn traits::Tokenizer>,
+    skip_special_tokens: bool,
+    all_token_ids: Vec<u32>,
+    prefix_offset: usize,
+    read_offset: usize,
+}
+```
+
+**Constants:**
+- `INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET: usize = 5` (stream.rs:9)
+  - Matches HuggingFace TGI and vLLM standard
+
+**Key Methods:**
+
+```rust
+impl DecodeStream {
+    pub fn new(tokenizer, prompt_token_ids: &[u32], skip_special: bool) -> Self
+    pub fn step(&mut self, id: u32) -> Result<Option<String>>
+    pub fn step_batch(&mut self, token_ids: &[u32]) -> Result<Vec<String>>
+    pub fn flush(&mut self) -> Result<Option<String>>
+    pub fn tokens(&self) -> &[u32]
+}
+```
+
+**Streaming Algorithm** (stream.rs:47-82):
+1. Append token to buffer
+2. Decode prefix window for context
+3. Decode full window
+4. Check for incomplete UTF-8 (�)
+5. Extract new text if complete
+6. Update offsets for next iteration
+
+**Metrics:**
+- Records stream tokens, incomplete UTF-8, step duration
+
+### 3.8 tiktoken.rs (Tiktoken Implementation)
+
+**Location**: `src/tokenizer/tiktoken.rs`
+
+**Public API:**
+
+```rust
+pub struct TiktokenTokenizer {
+    tokenizer: CoreBPE,
+    model: TiktokenModel,
+    special_tokens: SpecialTokens,
+    vocab_size: usize,
+}
+
+pub enum TiktokenModel {
+    Cl100kBase,  // GPT-4, GPT-3.5-turbo
+    P50kBase,    // Codex, text-davinci-002/003
+    P50kEdit,    // Edit models
+    R50kBase,    // GPT-3 (davinci, curie, etc.)
+}
+```
+
+**Model Detection** (tiktoken.rs:67-81):
+- GPT-4, GPT-3.5, turbo → Cl100kBase
+- davinci-002/003, codex → P50kBase
+- edit models → P50kEdit
+- davinci, curie, babbage, ada → R50kBase
+
+**Vocab Sizes** (tiktoken.rs:46-50):
+- Cl100kBase: 100,256 tokens
+- P50k variants: 50,281 tokens
+- R50kBase: 50,257 tokens
+
+**Special Tokens** (tiktoken.rs:84-114):
+- All models use `<|endoftext|>` for BOS/EOS/PAD
+- Cl100k adds FIM tokens for code completion
+
+**Limitations:**
+- No token↔ID mapping (returns None) (tiktoken.rs:151-161)
+- Requires Vec<u32> → Vec<usize> conversion
+
+### 3.9 mock.rs (Testing Implementation)
+
+**Location**: `src/tokenizer/mock.rs`
+
+**Purpose:** Simple tokenizer for unit testing
+
+**Vocabulary:**
+- 8 predefined tokens: "Hello"→1, "world"→2, "test"→3, etc.
+- Special tokens: `<eos>`→999, `<bos>`→1000
+
+**Behavior:**
+- Encode: Split on whitespace, lookup tokens
+- Decode: Join tokens with spaces
+- Skips special tokens when requested
+
+### 3.10 hub.rs (HuggingFace Hub Download)
+
+**Location**: `src/tokenizer/hub.rs`
+
+**Purpose:** Download tokenizer files from HuggingFace Hub when given a model ID.
+
+**Key Functions:**
+
+```rust
+pub async fn download_tokenizer_from_hf(model_id: impl AsRef<Path>) -> Result<PathBuf>
+pub async fn from_hf(name: impl AsRef<Path>, ignore_weights: bool) -> Result<PathBuf>
+```
+
+**Features:**
+- Downloads only tokenizer-related files by default
+- Filters out model weights, images, and documentation
+- Uses HF_TOKEN environment variable for authentication
+- Returns cached directory path for subsequent use
+- Progress indication during download
+
+**File Detection:**
+- Tokenizer files: tokenizer.json, tokenizer_config.json, special_tokens_map.json
+- Vocabulary files: vocab.json, merges.txt
+- SentencePiece models: *.model files
+
+### 3.11 chat_template.rs (Chat Template Support)
+
+**Location**: `src/tokenizer/chat_template.rs`
+
+**Purpose:** Jinja2-based chat template rendering for conversation formatting, matching HuggingFace transformers' `apply_chat_template` functionality.
+
+**Core Components:**
+
+```rust
+pub struct ChatMessage {
+    pub role: String,
+    pub content: String,
+}
+
+pub struct ChatTemplateProcessor {
+    template: String,
+    bos_token: Option<String>,
+    eos_token: Option<String>,
+}
+```
+
+**Key Features:**
+
+1. **Jinja2 Template Rendering** (chat_template.rs:63-102):
+   - Uses minijinja crate for Jinja2 compatibility
+   - Supports full Jinja2 syntax (loops, conditionals, variables)
+   - Compatible with HuggingFace chat templates
+
+2. **Template Loading Sources:**
+   - **tokenizer_config.json** (automatic): Default behavior when creating tokenizer
+   - **.jinja files** (explicit): Custom templates that override built-in
+   - **Programmatic** (runtime): `set_chat_template()` method
+
+3. **Loading Priority:**
+   ```rust
+   // Priority order:
+   // 1. Explicit .jinja file (if provided) - OVERRIDES all
+   // 2. tokenizer_config.json (if exists)
+   // 3. Fallback to simple formatting
+   ```
+
+**Template Variables:**
+- `messages`: Array of chat messages with role and content
+- `add_generation_prompt`: Boolean for assistant prompt
+- `bos_token`: Beginning of sequence token
+- `eos_token`: End of sequence token
+
+**API Methods:**
+
+```rust
+// Factory level - create with custom template
+pub fn create_tokenizer_with_chat_template(
+    tokenizer_path: &str,
+    chat_template_path: Option<&str>
+) -> Result<Arc<dyn Tokenizer>>
+
+// HuggingFace tokenizer methods
+impl HuggingFaceTokenizer {
+    // Load with custom template (overrides built-in)
+    pub fn from_file_with_chat_template(
+        file_path: &str,
+        chat_template_path: Option<&str>
+    ) -> Result<Self>
+
+    // Set template after creation (mimics Python)
+    pub fn set_chat_template(&mut self, template: String)
+
+    // Apply template to messages
+    pub fn apply_chat_template(
+        &self,
+        messages: &[ChatMessage],
+        add_generation_prompt: bool
+    ) -> Result<String>
+}
+```
+
+**Template Examples:**
+
+1. **Llama-style Template:**
+   ```jinja
+   {%- for message in messages %}
+   {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
+   {{- message['content'] + '<|eot_id|>' }}
+   {%- endfor %}
+   {%- if add_generation_prompt %}
+   {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+   {%- endif %}
+   ```
+
+2. **ChatML Format:**
+   ```jinja
+   {%- for message in messages %}
+   {{- '<|im_start|>' + message['role'] + '\n' }}
+   {{- message['content'] + '<|im_end|>\n' }}
+   {%- endfor %}
+   {%- if add_generation_prompt %}
+   {{- '<|im_start|>assistant\n' }}
+   {%- endif %}
+   ```
+
+**Integration with HuggingFace Tokenizer:**
+
+1. **Automatic Loading** (huggingface.rs:108-124):
+   - Searches for tokenizer_config.json in same directory
+   - Extracts `chat_template` field if present
+   - Stores template for use in apply_chat_template
+
+2. **Override Mechanism** (huggingface.rs:28-50):
+   - If chat_template_path provided, loads from .jinja file
+   - Replaces any existing template from tokenizer_config.json
+   - Matches Python's behavior: custom templates always override
+
+3. **Runtime Modification** (huggingface.rs:140-144):
+   - `set_chat_template()` allows changing template after creation
+   - Equivalent to Python's `tokenizer.chat_template = template`
+
+**Testing Coverage:**
+- Template rendering with various formats (Llama, ChatML, custom)
+- Loading from .jinja files
+- Override behavior verification
+- Runtime template modification
+- Special token handling
+
+## 4. Traits & Contracts
+
+### Core Trait Hierarchy
+
+1. **Encoder** (traits.rs:4-7)
+   - Contract: Convert text to token IDs
+   - Requirements: Send + Sync for thread safety
+   - Error handling via Result
+
+2. **Decoder** (traits.rs:10-12)
+   - Contract: Convert token IDs to text
+   - `skip_special_tokens` parameter for filtering
+
+3. **Tokenizer** (traits.rs:15-20)
+   - Extends both Encoder and Decoder
+   - Adds vocab introspection
+   - Token↔ID bidirectional mapping
+
+### Encoding Contract
+
+The `Encoding` enum must:
+- Provide `token_ids()` returning Vec<u32>
+- Support multiple backend representations
+- Handle type conversions (usize→u32 for Tiktoken)
+
+### Special Token Guarantees
+
+- BOS/EOS tokens for sequence boundaries
+- UNK for out-of-vocabulary handling
+- Optional tokens may be None
+- Additional tokens for custom use cases
+
+## 5. Tokenizer Implementations
+
+### HuggingFace Adapter
+
+**Normalization/Pretokenization:**
+- Handled by underlying `tokenizers` crate
+- Configurable via JSON tokenizer files
+- BPE, WordPiece, Unigram models supported
+
+**API Mapping:**
+- `encode(input, add_special_tokens=false)` → Encoding::Hf
+- Batch encoding supported natively
+- Vocab extraction for lookups
+
+### Tiktoken Adapter
+
+**Model Families:**
+- cl100k_base: Modern GPT models (GPT-4, GPT-3.5)
+- p50k_base: Codex and davinci-002/003
+- p50k_edit: Edit-specific models
+- r50k_base: Classic GPT-3
+
+**Byte-Level Behavior:**
+- Direct byte-pair encoding without pretokenization
+- No subword regularization
+- Deterministic encoding
+
+### Sequence/Stop Modules
+
+**Algorithms:**
+
+1. **Substring Matching:**
+   - Exact match for stop sequences
+   - Prefix detection for partial matches
+
+2. **Streaming Matcher:**
+   - Incremental text accumulation
+   - Jail buffer for uncertain text
+   - Release on divergence
+
+3. **Overlap Handling:**
+   - Token boundaries respected
+   - UTF-8 boundary checking
+   - Multi-byte character safety
+
+**Window Sizes:**
+- Initial offset: 5 tokens (standard)
+- Prefix window: Variable based on decoding
+- Jail buffer: Unbounded (cleared on match/divergence)
+
+## 6. Streaming Integration
+
+### Pipeline Position
+
+1. **Tokenization Phase:**
+   - Runs during request preprocessing
+   - Caches prompt encodings
+
+2. **Decoding Phase:**
+   - Runs per-token during generation
+   - Maintains streaming state
+
+### Buffering Policy
+
+- **Token Buffer:** Complete sequence retained
+- **Prefix Window:** Sliding window for context
+- **Partial Detokenization:** Hold incomplete UTF-8
+- **Chunk Boundaries:** Char-aligned output
+
+### Emission Rules
+
+- **Intermediate:** Emit on complete characters
+- **Final:** Flush any remaining text
+- **Stop Conditions:** Immediate termination
+- **Errors:** Propagate with context
+
+## 7. Testing & Benchmarking
+
+### Test Coverage Summary
+
+**Unit Tests (38 tests across 7 modules):**
+- `factory.rs`: 4 tests - JSON detection, file types, model routing
+- `huggingface.rs`: 1 test - Chat template handling
+- `sequence.rs`: 5 tests - Append operations, incremental decode
+- `stop.rs`: 9 tests - Stop detection, partial matches, jail buffer
+- `tiktoken.rs`: 7 tests - Model detection, encode/decode roundtrip
+- `chat_template.rs`: 3 tests - Template rendering, loading
+- `tests.rs`: 9 tests - Cross-module integration
+
+**Integration Tests (10 tests in tokenizer_integration.rs):**
+- HuggingFace tokenizer hash verification
+- Encode/decode lifecycle testing
+- Sequence operations with real tokenizers
+- Decode streaming with prefill
+- Stop sequence detection scenarios
+- Factory creation patterns
+- Batch encoding verification
+- Special token handling
+- Thread safety validation
+
+### Benchmark Suite (tokenizer_benchmark.rs)
+
+**Performance Benchmarks (12 benchmark groups):**
+1. **Encode Throughput**: Single-threaded encoding performance
+2. **Batch Encode**: Batch vs individual encoding comparison
+3. **Concurrent Encode**: Multi-request concurrent encoding
+4. **Decode Performance**: Various decode scenarios
+5. **Streaming Decode**: 100K token streaming performance
+6. **Latency Distribution**: P50/P90/P99 latency metrics
+7. **Concurrent Streaming**: Multi-stream performance
+8. **Stop Sequences**: Stop detection overhead
+9. **Multithreaded Encode**: Thread scaling characteristics
+10. **Multithreaded Decode**: Decode thread scaling
+11. **Memory Efficiency**: Memory usage patterns
+12. **Scaling Characteristics**: Performance vs input size
+
+**Test Prompts:**
+- Short: 30 chars ("What is the capital of France?")
+- Medium: 201 chars (Quantum computing explanation)
+- Long: 638 chars (Software engineering review)
+
+## 8. Operational Concerns
+
+### Configuration
+
+**Environment Variables:**
+- `HF_TOKEN`: HuggingFace authentication token for private models
+
+**Dependencies:**
+- All tokenizer backends included by default
+- No feature flags required
+
+**Model Mapping:**
+- Hardcoded in factory.rs
+- TODO: Make configurable
+
+### Metrics
+
+**Metric Names (via TokenizerMetrics):**
+- `sgl_tokenizer_encode_duration_seconds`
+- `sgl_tokenizer_decode_duration_seconds`
+- `sgl_tokenizer_tokens_per_encode`
+- `sgl_tokenizer_chars_per_encode`
+- `sgl_tokenizer_factory_load_duration_seconds`
+- `sgl_tokenizer_stop_sequence_detected`
+- `sgl_tokenizer_stream_incomplete_utf8_total`
+
+**Labels:**
+- `tokenizer_type`: huggingface, tiktoken, mock
+- `operation`: encode, decode, factory_load
+- `error_type`: Various error conditions
+
+### Failure Modes
+
+1. **File Not Found:** Clear error with path
+2. **Unsupported Format:** Lists supported types
+3. **Feature Disabled:** Suggests enabling feature
+4. **Decode Errors:** Context in error message
+5. **Incomplete UTF-8:** Handled gracefully
+
+### Dynamic Batching Analysis
+
+**Note**: Dynamic batching implementation was explored but found to have significant overhead:
+- Channel communication adds ~3-4ms latency per request
+- Single requests are ~300x slower with dynamic batching
+- Even concurrent requests show 50-100% performance regression
+- The async/channel overhead outweighs tokenization benefits
+
+**Recommendation**: Use thread-local tokenizer pools or direct `encode_batch()` instead of dynamic batching for this use case.
+
+## 9. Glossary
+
+- **BPE (Byte-Pair Encoding):** Subword tokenization merging frequent pairs
+- **Tokenizer Family:** Related tokenizers sharing vocabulary (GPT, BERT, etc.)
+- **Stop Sequence:** Text pattern triggering generation termination
+- **Detokenization:** Converting token IDs back to text
+- **Jail Buffer:** Temporary hold for potentially matching stop sequences
+- **Prefix Offset:** Starting position for incremental decoding window
+- **Read Offset:** Current position in token sequence
+- **Special Tokens:** Reserved tokens (BOS, EOS, PAD, etc.)
+- **Vocab Size:** Total number of unique tokens
+- **Chat Template:** Format for converting messages to model input
+
+## 10. TODO
+
+1. **TODO:** Implement `Encoding::get_hash()` for caching support
+   - File: `src/tokenizer/traits.rs`
+   - Symbol: `impl Encoding`
+
+2. **TODO:** Add character offset tracking
+   - File: `src/tokenizer/traits.rs`
+   - Symbol: `pub type Offsets = (usize, usize)`
+
+3. **TODO:** Support SentencePiece models
+   - File: `src/tokenizer/factory.rs:69-72`
+   - Symbol: Extension match arm for "model"
+
+4. **TODO:** Support GGUF format
+   - File: `src/tokenizer/factory.rs:74-78`
+   - Symbol: Extension match arm for "gguf"
+
+5. **TODO:** Add token↔ID mapping for Tiktoken
+   - File: `src/tokenizer/tiktoken.rs:151-161`
+   - Symbol: `token_to_id()` and `id_to_token()` methods
+
+6. **TODO:** Fix `token_ids_ref()` for Tiktoken
+   - File: `src/tokenizer/traits.rs:46-50`
+   - Symbol: `Encoding::Tiktoken` match arm
+
+7. **TODO:** Make model→tokenizer mapping configurable
+   - File: `src/tokenizer/factory.rs:174-184`
+   - Symbol: GPT model detection logic
diff --git a/sgl-router/src/tokenizer/chat_template.rs b/sgl-router/src/tokenizer/chat_template.rs
new file mode 100644
index 000000000..8a9a0fe1d
--- /dev/null
+++ b/sgl-router/src/tokenizer/chat_template.rs
@@ -0,0 +1,182 @@
+//! Chat template support for tokenizers using Jinja2 templates
+//!
+//! This module provides functionality to apply chat templates to messages,
+//! similar to HuggingFace transformers' apply_chat_template method.
+
+use anyhow::{anyhow, Result};
+use minijinja::{context, Environment, Value};
+use serde::{Deserialize, Serialize};
+use serde_json;
+
+/// Represents a chat message with role and content
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChatMessage {
+    pub role: String,
+    pub content: String,
+}
+
+impl ChatMessage {
+    pub fn new(role: impl Into<String>, content: impl Into<String>) -> Self {
+        ChatMessage {
+            role: role.into(),
+            content: content.into(),
+        }
+    }
+
+    pub fn system(content: impl Into<String>) -> Self {
+        Self::new("system", content)
+    }
+
+    pub fn user(content: impl Into<String>) -> Self {
+        Self::new("user", content)
+    }
+
+    pub fn assistant(content: impl Into<String>) -> Self {
+        Self::new("assistant", content)
+    }
+}
+
+/// Chat template processor using Jinja2
+pub struct ChatTemplateProcessor {
+    template: String,
+    bos_token: Option<String>,
+    eos_token: Option<String>,
+}
+
+impl ChatTemplateProcessor {
+    /// Create a new chat template processor
+    pub fn new(template: String, bos_token: Option<String>, eos_token: Option<String>) -> Self {
+        ChatTemplateProcessor {
+            template,
+            bos_token,
+            eos_token,
+        }
+    }
+
+    /// Apply the chat template to a list of messages
+    ///
+    /// This mimics the behavior of HuggingFace's apply_chat_template method
+    /// but returns the formatted string instead of token IDs.
+    pub fn apply_chat_template(
+        &self,
+        messages: &[ChatMessage],
+        add_generation_prompt: bool,
+    ) -> Result<String> {
+        let mut env = Environment::new();
+
+        // Register the template
+        env.add_template("chat", &self.template)
+            .map_err(|e| anyhow!("Failed to add template: {}", e))?;
+
+        // Get the template
+        let tmpl = env
+            .get_template("chat")
+            .map_err(|e| anyhow!("Failed to get template: {}", e))?;
+
+        // Convert messages to a format Jinja can work with
+        let messages_value: Vec<Value> = messages
+            .iter()
+            .map(|msg| {
+                context! {
+                    role => msg.role.clone(),
+                    content => msg.content.clone()
+                }
+            })
+            .collect();
+
+        // Render the template
+        let rendered = tmpl
+            .render(context! {
+                messages => messages_value,
+                add_generation_prompt => add_generation_prompt,
+                bos_token => self.bos_token.clone().unwrap_or_default(),
+                eos_token => self.eos_token.clone().unwrap_or_default()
+            })
+            .map_err(|e| anyhow!("Failed to render template: {}", e))?;
+
+        Ok(rendered)
+    }
+}
+
+/// Load chat template from tokenizer config JSON
+pub fn load_chat_template_from_config(config_path: &str) -> Result<Option<String>> {
+    use std::fs;
+
+    let content = fs::read_to_string(config_path)?;
+    let config: serde_json::Value = serde_json::from_str(&content)?;
+
+    // Look for chat_template in the config
+    if let Some(template) = config.get("chat_template") {
+        if let Some(template_str) = template.as_str() {
+            return Ok(Some(template_str.to_string()));
+        }
+    }
+
+    Ok(None)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_chat_message_creation() {
+        let msg = ChatMessage::system("You are a helpful assistant");
+        assert_eq!(msg.role, "system");
+        assert_eq!(msg.content, "You are a helpful assistant");
+
+        let user_msg = ChatMessage::user("Hello!");
+        assert_eq!(user_msg.role, "user");
+
+        let assistant_msg = ChatMessage::assistant("Hi there!");
+        assert_eq!(assistant_msg.role, "assistant");
+    }
+
+    #[test]
+    fn test_simple_chat_template() {
+        // Simple template that formats messages
+        let template = r#"
+{%- for message in messages -%}
+{{ message.role }}: {{ message.content }}
+{% endfor -%}
+{%- if add_generation_prompt -%}
+assistant:
+{%- endif -%}
+"#;
+
+        let processor = ChatTemplateProcessor::new(template.to_string(), None, None);
+
+        let messages = vec![
+            ChatMessage::system("You are helpful"),
+            ChatMessage::user("Hello"),
+        ];
+
+        let result = processor.apply_chat_template(&messages, true).unwrap();
+        assert!(result.contains("system: You are helpful"));
+        assert!(result.contains("user: Hello"));
+        assert!(result.contains("assistant:"));
+    }
+
+    #[test]
+    fn test_chat_template_with_tokens() {
+        // Template that uses special tokens
+        let template = r#"
+{{ bos_token }}
+{%- for message in messages -%}
+{{ message.role }}: {{ message.content }}{{ eos_token }}
+{% endfor -%}
+"#;
+
+        let processor = ChatTemplateProcessor::new(
+            template.to_string(),
+            Some("<s>".to_string()),
+            Some("</s>".to_string()),
+        );
+
+        let messages = vec![ChatMessage::user("Test")];
+
+        let result = processor.apply_chat_template(&messages, false).unwrap();
+        assert!(result.contains("<s>"));
+        assert!(result.contains("</s>"));
+    }
+}
diff --git a/sgl-router/src/tokenizer/factory.rs b/sgl-router/src/tokenizer/factory.rs
new file mode 100644
index 000000000..8c80749e2
--- /dev/null
+++ b/sgl-router/src/tokenizer/factory.rs
@@ -0,0 +1,318 @@
+use super::traits;
+use anyhow::{Error, Result};
+use std::fs::File;
+use std::io::Read;
+use std::path::Path;
+use std::sync::Arc;
+
+use super::huggingface::HuggingFaceTokenizer;
+use super::tiktoken::TiktokenTokenizer;
+use crate::tokenizer::hub::download_tokenizer_from_hf;
+
+/// Represents the type of tokenizer being used
+#[derive(Debug, Clone)]
+pub enum TokenizerType {
+    HuggingFace(String),
+    Mock,
+    Tiktoken(String),
+    // Future: SentencePiece, GGUF
+}
+
+/// Create a tokenizer from a file path to a tokenizer file.
+/// The file extension is used to determine the tokenizer type.
+/// Supported file types are:
+/// - json: HuggingFace tokenizer
+/// - For testing: can return mock tokenizer
+pub fn create_tokenizer_from_file(file_path: &str) -> Result<Arc<dyn traits::Tokenizer>> {
+    create_tokenizer_with_chat_template(file_path, None)
+}
+
+/// Create a tokenizer from a file path with an optional chat template
+pub fn create_tokenizer_with_chat_template(
+    file_path: &str,
+    chat_template_path: Option<&str>,
+) -> Result<Arc<dyn traits::Tokenizer>> {
+    // Special case for testing
+    if file_path == "mock" || file_path == "test" {
+        return Ok(Arc::new(super::mock::MockTokenizer::new()));
+    }
+
+    let path = Path::new(file_path);
+
+    // Check if file exists
+    if !path.exists() {
+        return Err(Error::msg(format!("File not found: {}", file_path)));
+    }
+
+    // Try to determine tokenizer type from extension
+    let extension = path
+        .extension()
+        .and_then(std::ffi::OsStr::to_str)
+        .map(|s| s.to_lowercase());
+
+    let result = match extension.as_deref() {
+        Some("json") => {
+            let tokenizer =
+                HuggingFaceTokenizer::from_file_with_chat_template(file_path, chat_template_path)?;
+
+            Ok(Arc::new(tokenizer) as Arc<dyn traits::Tokenizer>)
+        }
+        Some("model") => {
+            // SentencePiece model file
+            Err(Error::msg("SentencePiece models not yet supported"))
+        }
+        Some("gguf") => {
+            // GGUF format
+            Err(Error::msg("GGUF format not yet supported"))
+        }
+        _ => {
+            // Try to auto-detect by reading file content
+            auto_detect_tokenizer(file_path)
+        }
+    };
+
+    result
+}
+
+/// Auto-detect tokenizer type by examining file content
+fn auto_detect_tokenizer(file_path: &str) -> Result<Arc<dyn traits::Tokenizer>> {
+    let mut file = File::open(file_path)?;
+    let mut buffer = vec![0u8; 512]; // Read first 512 bytes for detection
+    let bytes_read = file.read(&mut buffer)?;
+    buffer.truncate(bytes_read);
+
+    // Check for JSON (HuggingFace format)
+    if is_likely_json(&buffer) {
+        let tokenizer = HuggingFaceTokenizer::from_file(file_path)?;
+        return Ok(Arc::new(tokenizer));
+    }
+
+    // Check for GGUF magic number
+    if buffer.len() >= 4 && &buffer[0..4] == b"GGUF" {
+        return Err(Error::msg("GGUF format detected but not yet supported"));
+    }
+
+    // Check for SentencePiece model
+    if is_likely_sentencepiece(&buffer) {
+        return Err(Error::msg(
+            "SentencePiece model detected but not yet supported",
+        ));
+    }
+
+    Err(Error::msg(format!(
+        "Unable to determine tokenizer type for file: {}",
+        file_path
+    )))
+}
+
+/// Check if the buffer likely contains JSON data
+fn is_likely_json(buffer: &[u8]) -> bool {
+    // Skip UTF-8 BOM if present
+    let content = if buffer.len() >= 3 && buffer[0..3] == [0xEF, 0xBB, 0xBF] {
+        &buffer[3..]
+    } else {
+        buffer
+    };
+
+    // Find first non-whitespace character without allocation
+    if let Some(first_byte) = content.iter().find(|&&b| !b.is_ascii_whitespace()) {
+        *first_byte == b'{' || *first_byte == b'['
+    } else {
+        false
+    }
+}
+
+/// Check if the buffer likely contains a SentencePiece model
+fn is_likely_sentencepiece(buffer: &[u8]) -> bool {
+    // SentencePiece models often start with specific patterns
+    // This is a simplified check
+    buffer.len() >= 12
+        && (buffer.starts_with(b"\x0a\x09")
+            || buffer.starts_with(b"\x08\x00")
+            || buffer.windows(4).any(|w| w == b"<unk")
+            || buffer.windows(4).any(|w| w == b"<s>")
+            || buffer.windows(4).any(|w| w == b"</s>"))
+}
+
+/// Factory function to create tokenizer from a model name or path (async version)
+pub async fn create_tokenizer_async(
+    model_name_or_path: &str,
+) -> Result<Arc<dyn traits::Tokenizer>> {
+    // Check if it's a file path
+    let path = Path::new(model_name_or_path);
+    if path.exists() {
+        return create_tokenizer_from_file(model_name_or_path);
+    }
+
+    // Check if it's a GPT model name that should use Tiktoken
+    if model_name_or_path.contains("gpt-")
+        || model_name_or_path.contains("davinci")
+        || model_name_or_path.contains("curie")
+        || model_name_or_path.contains("babbage")
+        || model_name_or_path.contains("ada")
+    {
+        let tokenizer = TiktokenTokenizer::from_model_name(model_name_or_path)?;
+        return Ok(Arc::new(tokenizer));
+    }
+
+    // Try to download tokenizer files from HuggingFace
+    match download_tokenizer_from_hf(model_name_or_path).await {
+        Ok(cache_dir) => {
+            // Look for tokenizer.json in the cache directory
+            let tokenizer_path = cache_dir.join("tokenizer.json");
+            if tokenizer_path.exists() {
+                create_tokenizer_from_file(tokenizer_path.to_str().unwrap())
+            } else {
+                // Try other common tokenizer file names
+                let possible_files = ["tokenizer_config.json", "vocab.json"];
+                for file_name in &possible_files {
+                    let file_path = cache_dir.join(file_name);
+                    if file_path.exists() {
+                        return create_tokenizer_from_file(file_path.to_str().unwrap());
+                    }
+                }
+                Err(Error::msg(format!(
+                    "Downloaded model '{}' but couldn't find a suitable tokenizer file",
+                    model_name_or_path
+                )))
+            }
+        }
+        Err(e) => Err(Error::msg(format!(
+            "Failed to download tokenizer from HuggingFace: {}",
+            e
+        ))),
+    }
+}
+
+/// Factory function to create tokenizer from a model name or path (blocking version)
+pub fn create_tokenizer(model_name_or_path: &str) -> Result<Arc<dyn traits::Tokenizer>> {
+    // Check if it's a file path
+    let path = Path::new(model_name_or_path);
+    if path.exists() {
+        return create_tokenizer_from_file(model_name_or_path);
+    }
+
+    // Check if it's a GPT model name that should use Tiktoken
+    if model_name_or_path.contains("gpt-")
+        || model_name_or_path.contains("davinci")
+        || model_name_or_path.contains("curie")
+        || model_name_or_path.contains("babbage")
+        || model_name_or_path.contains("ada")
+    {
+        let tokenizer = TiktokenTokenizer::from_model_name(model_name_or_path)?;
+        return Ok(Arc::new(tokenizer));
+    }
+
+    // Only use tokio for HuggingFace downloads
+    // Check if we're already in a tokio runtime
+    if let Ok(handle) = tokio::runtime::Handle::try_current() {
+        // We're in a runtime, use block_in_place
+        tokio::task::block_in_place(|| handle.block_on(create_tokenizer_async(model_name_or_path)))
+    } else {
+        // No runtime, create a temporary one
+        let rt = tokio::runtime::Runtime::new()?;
+        rt.block_on(create_tokenizer_async(model_name_or_path))
+    }
+}
+
+/// Get information about a tokenizer file
+pub fn get_tokenizer_info(file_path: &str) -> Result<TokenizerType> {
+    let path = Path::new(file_path);
+
+    if !path.exists() {
+        return Err(Error::msg(format!("File not found: {}", file_path)));
+    }
+
+    let extension = path
+        .extension()
+        .and_then(std::ffi::OsStr::to_str)
+        .map(|s| s.to_lowercase());
+
+    match extension.as_deref() {
+        Some("json") => Ok(TokenizerType::HuggingFace(file_path.to_string())),
+        _ => {
+            // Try auto-detection
+            use std::fs::File;
+            use std::io::Read;
+
+            let mut file = File::open(file_path)?;
+            let mut buffer = vec![0u8; 512];
+            let bytes_read = file.read(&mut buffer)?;
+            buffer.truncate(bytes_read);
+
+            if is_likely_json(&buffer) {
+                Ok(TokenizerType::HuggingFace(file_path.to_string()))
+            } else {
+                Err(Error::msg("Unknown tokenizer type"))
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_json_detection() {
+        assert!(is_likely_json(b"{\"test\": \"value\"}"));
+        assert!(is_likely_json(b"  \n\t{\"test\": \"value\"}"));
+        assert!(is_likely_json(b"[1, 2, 3]"));
+        assert!(!is_likely_json(b"not json"));
+        assert!(!is_likely_json(b""));
+    }
+
+    #[test]
+    fn test_mock_tokenizer_creation() {
+        let tokenizer = create_tokenizer_from_file("mock").unwrap();
+        assert_eq!(tokenizer.vocab_size(), 8); // Mock tokenizer has 8 tokens
+    }
+
+    #[test]
+    fn test_file_not_found() {
+        let result = create_tokenizer_from_file("/nonexistent/file.json");
+        assert!(result.is_err());
+        if let Err(e) = result {
+            assert!(e.to_string().contains("File not found"));
+        }
+    }
+
+    #[test]
+    fn test_create_tiktoken_tokenizer() {
+        // Test creating tokenizer for GPT models
+        let tokenizer = create_tokenizer("gpt-4").unwrap();
+        assert!(tokenizer.vocab_size() > 0);
+
+        // Test encoding and decoding
+        let text = "Hello, world!";
+        let encoding = tokenizer.encode(text).unwrap();
+        let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
+        assert_eq!(decoded, text);
+    }
+
+    #[tokio::test]
+    async fn test_download_tokenizer_from_hf() {
+        // Test with a small model that should have tokenizer files
+        // Skip this test if HF_TOKEN is not set and we're in CI
+        if std::env::var("CI").is_ok() && std::env::var("HF_TOKEN").is_err() {
+            println!("Skipping HF download test in CI without HF_TOKEN");
+            return;
+        }
+
+        // Try to create tokenizer for a known small model
+        let result = create_tokenizer_async("bert-base-uncased").await;
+
+        // The test might fail due to network issues or rate limiting
+        // so we just check that the function executes without panic
+        match result {
+            Ok(tokenizer) => {
+                assert!(tokenizer.vocab_size() > 0);
+                println!("Successfully downloaded and created tokenizer");
+            }
+            Err(e) => {
+                println!("Download failed (this might be expected): {}", e);
+                // Don't fail the test - network issues shouldn't break CI
+            }
+        }
+    }
+}
diff --git a/sgl-router/src/tokenizer/hub.rs b/sgl-router/src/tokenizer/hub.rs
new file mode 100644
index 000000000..c9d2cd1a4
--- /dev/null
+++ b/sgl-router/src/tokenizer/hub.rs
@@ -0,0 +1,238 @@
+use hf_hub::api::tokio::ApiBuilder;
+use std::env;
+use std::path::{Path, PathBuf};
+
+const IGNORED: [&str; 5] = [
+    ".gitattributes",
+    "LICENSE",
+    "LICENSE.txt",
+    "README.md",
+    "USE_POLICY.md",
+];
+
+const HF_TOKEN_ENV_VAR: &str = "HF_TOKEN";
+
+/// Checks if a file is a model weight file
+fn is_weight_file(filename: &str) -> bool {
+    filename.ends_with(".bin")
+        || filename.ends_with(".safetensors")
+        || filename.ends_with(".h5")
+        || filename.ends_with(".msgpack")
+        || filename.ends_with(".ckpt.index")
+}
+
+/// Checks if a file is an image file
+fn is_image(filename: &str) -> bool {
+    filename.ends_with(".png")
+        || filename.ends_with("PNG")
+        || filename.ends_with(".jpg")
+        || filename.ends_with("JPG")
+        || filename.ends_with(".jpeg")
+        || filename.ends_with("JPEG")
+}
+
+/// Checks if a file is a tokenizer file
+fn is_tokenizer_file(filename: &str) -> bool {
+    filename.ends_with("tokenizer.json")
+        || filename.ends_with("tokenizer_config.json")
+        || filename.ends_with("special_tokens_map.json")
+        || filename.ends_with("vocab.json")
+        || filename.ends_with("merges.txt")
+        || filename.ends_with(".model")  // SentencePiece models
+        || filename.ends_with(".tiktoken")
+}
+
+/// Attempt to download tokenizer files from Hugging Face
+/// Returns the directory containing the downloaded tokenizer files
+pub async fn download_tokenizer_from_hf(model_id: impl AsRef<Path>) -> anyhow::Result<PathBuf> {
+    let model_id = model_id.as_ref();
+    let token = env::var(HF_TOKEN_ENV_VAR).ok();
+    let api = ApiBuilder::new()
+        .with_progress(true)
+        .with_token(token)
+        .build()?;
+    let model_name = model_id.display().to_string();
+
+    let repo = api.model(model_name.clone());
+
+    let info = match repo.info().await {
+        Ok(info) => info,
+        Err(e) => {
+            return Err(anyhow::anyhow!(
+                "Failed to fetch model '{}' from HuggingFace: {}. Is this a valid HuggingFace ID?",
+                model_name,
+                e
+            ));
+        }
+    };
+
+    if info.siblings.is_empty() {
+        return Err(anyhow::anyhow!(
+            "Model '{}' exists but contains no downloadable files.",
+            model_name
+        ));
+    }
+
+    let mut cache_dir = None;
+    let mut tokenizer_files_found = false;
+
+    // First, identify all tokenizer files to download
+    let tokenizer_files: Vec<_> = info
+        .siblings
+        .iter()
+        .filter(|sib| {
+            !IGNORED.contains(&sib.rfilename.as_str())
+                && !is_image(&sib.rfilename)
+                && !is_weight_file(&sib.rfilename)
+                && is_tokenizer_file(&sib.rfilename)
+        })
+        .collect();
+
+    if tokenizer_files.is_empty() {
+        return Err(anyhow::anyhow!(
+            "No tokenizer files found for model '{}'.",
+            model_name
+        ));
+    }
+
+    // Download all tokenizer files
+    for sib in tokenizer_files {
+        match repo.get(&sib.rfilename).await {
+            Ok(path) => {
+                if cache_dir.is_none() {
+                    cache_dir = path.parent().map(|p| p.to_path_buf());
+                }
+                tokenizer_files_found = true;
+            }
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "Failed to download tokenizer file '{}' from model '{}': {}",
+                    sib.rfilename,
+                    model_name,
+                    e
+                ));
+            }
+        }
+    }
+
+    if !tokenizer_files_found {
+        return Err(anyhow::anyhow!(
+            "No tokenizer files could be downloaded for model '{}'.",
+            model_name
+        ));
+    }
+
+    match cache_dir {
+        Some(dir) => Ok(dir),
+        None => Err(anyhow::anyhow!(
+            "Invalid HF cache path for model '{}'",
+            model_name
+        )),
+    }
+}
+
+/// Attempt to download a model from Hugging Face (including weights)
+/// Returns the directory it is in
+/// If ignore_weights is true, model weight files will be skipped
+pub async fn from_hf(name: impl AsRef<Path>, ignore_weights: bool) -> anyhow::Result<PathBuf> {
+    let name = name.as_ref();
+    let token = env::var(HF_TOKEN_ENV_VAR).ok();
+    let api = ApiBuilder::new()
+        .with_progress(true)
+        .with_token(token)
+        .build()?;
+    let model_name = name.display().to_string();
+
+    let repo = api.model(model_name.clone());
+
+    let info = match repo.info().await {
+        Ok(info) => info,
+        Err(e) => {
+            return Err(anyhow::anyhow!(
+                "Failed to fetch model '{}' from HuggingFace: {}. Is this a valid HuggingFace ID?",
+                model_name,
+                e
+            ));
+        }
+    };
+
+    if info.siblings.is_empty() {
+        return Err(anyhow::anyhow!(
+            "Model '{}' exists but contains no downloadable files.",
+            model_name
+        ));
+    }
+
+    let mut p = PathBuf::new();
+    let mut files_downloaded = false;
+
+    for sib in info.siblings {
+        if IGNORED.contains(&sib.rfilename.as_str()) || is_image(&sib.rfilename) {
+            continue;
+        }
+
+        // If ignore_weights is true, skip weight files
+        if ignore_weights && is_weight_file(&sib.rfilename) {
+            continue;
+        }
+
+        match repo.get(&sib.rfilename).await {
+            Ok(path) => {
+                p = path;
+                files_downloaded = true;
+            }
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "Failed to download file '{}' from model '{}': {}",
+                    sib.rfilename,
+                    model_name,
+                    e
+                ));
+            }
+        }
+    }
+
+    if !files_downloaded {
+        let file_type = if ignore_weights {
+            "non-weight"
+        } else {
+            "valid"
+        };
+        return Err(anyhow::anyhow!(
+            "No {} files found for model '{}'.",
+            file_type,
+            model_name
+        ));
+    }
+
+    match p.parent() {
+        Some(p) => Ok(p.to_path_buf()),
+        None => Err(anyhow::anyhow!("Invalid HF cache path: {}", p.display())),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_is_tokenizer_file() {
+        assert!(is_tokenizer_file("tokenizer.json"));
+        assert!(is_tokenizer_file("tokenizer_config.json"));
+        assert!(is_tokenizer_file("special_tokens_map.json"));
+        assert!(is_tokenizer_file("vocab.json"));
+        assert!(is_tokenizer_file("merges.txt"));
+        assert!(is_tokenizer_file("spiece.model"));
+        assert!(!is_tokenizer_file("model.bin"));
+        assert!(!is_tokenizer_file("README.md"));
+    }
+
+    #[test]
+    fn test_is_weight_file() {
+        assert!(is_weight_file("model.bin"));
+        assert!(is_weight_file("model.safetensors"));
+        assert!(is_weight_file("pytorch_model.bin"));
+        assert!(!is_weight_file("tokenizer.json"));
+        assert!(!is_weight_file("config.json"));
+    }
+}
diff --git a/sgl-router/src/tokenizer/huggingface.rs b/sgl-router/src/tokenizer/huggingface.rs
new file mode 100644
index 000000000..02dce5a0a
--- /dev/null
+++ b/sgl-router/src/tokenizer/huggingface.rs
@@ -0,0 +1,234 @@
+use super::traits::{
+    Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait,
+};
+use anyhow::{Error, Result};
+use std::collections::HashMap;
+use tokenizers::tokenizer::Tokenizer as HfTokenizer;
+
+use super::chat_template::{ChatMessage, ChatTemplateProcessor};
+
+/// HuggingFace tokenizer wrapper
+pub struct HuggingFaceTokenizer {
+    tokenizer: HfTokenizer,
+    special_tokens: SpecialTokens,
+    vocab: HashMap<String, TokenIdType>,
+    reverse_vocab: HashMap<TokenIdType, String>,
+    chat_template: Option<String>,
+}
+
+impl HuggingFaceTokenizer {
+    /// Create a tokenizer from a HuggingFace tokenizer JSON file
+    pub fn from_file(file_path: &str) -> Result<Self> {
+        Self::from_file_with_chat_template(file_path, None)
+    }
+
+    /// Create a tokenizer from a HuggingFace tokenizer JSON file with an optional chat template
+    pub fn from_file_with_chat_template(
+        file_path: &str,
+        chat_template_path: Option<&str>,
+    ) -> Result<Self> {
+        let tokenizer = HfTokenizer::from_file(file_path)
+            .map_err(|e| Error::msg(format!("Failed to load tokenizer: {}", e)))?;
+
+        // Extract special tokens
+        let special_tokens = Self::extract_special_tokens(&tokenizer);
+
+        // Build vocab mappings
+        let vocab = tokenizer.get_vocab(false);
+        let reverse_vocab: HashMap<TokenIdType, String> = vocab
+            .iter()
+            .map(|(token, &id)| (id, token.clone()))
+            .collect();
+
+        // Load chat template
+        let chat_template = if let Some(template_path) = chat_template_path {
+            // Load from specified .jinja file
+            Self::load_chat_template_from_file(template_path)?
+        } else {
+            // Try to load from tokenizer_config.json
+            Self::load_chat_template(file_path)
+        };
+
+        Ok(HuggingFaceTokenizer {
+            tokenizer,
+            special_tokens,
+            vocab,
+            reverse_vocab,
+            chat_template,
+        })
+    }
+
+    /// Create from an existing HuggingFace tokenizer
+    pub fn from_tokenizer(tokenizer: HfTokenizer) -> Self {
+        let special_tokens = Self::extract_special_tokens(&tokenizer);
+        let vocab = tokenizer.get_vocab(false);
+        let reverse_vocab: HashMap<TokenIdType, String> = vocab
+            .iter()
+            .map(|(token, &id)| (id, token.clone()))
+            .collect();
+
+        HuggingFaceTokenizer {
+            tokenizer,
+            special_tokens,
+            vocab,
+            reverse_vocab,
+            chat_template: None,
+        }
+    }
+
+    /// Extract special tokens from the tokenizer
+    fn extract_special_tokens(tokenizer: &HfTokenizer) -> SpecialTokens {
+        // Try to get special tokens from the tokenizer
+        // This is a simplified version - actual implementation would need to handle various formats
+        let vocab = tokenizer.get_vocab(true);
+
+        let find_token = |patterns: &[&str]| -> Option<String> {
+            for pattern in patterns {
+                if vocab.contains_key(*pattern) {
+                    return Some(pattern.to_string());
+                }
+            }
+            None
+        };
+
+        SpecialTokens {
+            bos_token: find_token(&["<s>", "<|startoftext|>", "<BOS>", "[CLS]"]),
+            eos_token: find_token(&["</s>", "<|endoftext|>", "<EOS>", "[SEP]"]),
+            unk_token: find_token(&["<unk>", "<UNK>", "[UNK]"]),
+            sep_token: find_token(&["[SEP]", "<sep>", "<SEP>"]),
+            pad_token: find_token(&["<pad>", "<PAD>", "[PAD]"]),
+            cls_token: find_token(&["[CLS]", "<cls>", "<CLS>"]),
+            mask_token: find_token(&["[MASK]", "<mask>", "<MASK>"]),
+            additional_special_tokens: vec![],
+        }
+    }
+
+    /// Try to load chat template from tokenizer_config.json
+    fn load_chat_template(tokenizer_path: &str) -> Option<String> {
+        // Try to find tokenizer_config.json in the same directory
+        let path = std::path::Path::new(tokenizer_path);
+        let dir = path.parent()?;
+        let config_path = dir.join("tokenizer_config.json");
+
+        if config_path.exists() {
+            if let Ok(template) =
+                super::chat_template::load_chat_template_from_config(config_path.to_str()?)
+            {
+                return template;
+            }
+        }
+        None
+    }
+
+    /// Load chat template from a .jinja file
+    fn load_chat_template_from_file(template_path: &str) -> Result<Option<String>> {
+        use std::fs;
+
+        let content = fs::read_to_string(template_path)
+            .map_err(|e| Error::msg(format!("Failed to read chat template file: {}", e)))?;
+
+        // Clean up the template (similar to Python implementation)
+        let template = content.trim().replace("\\n", "\n");
+
+        Ok(Some(template))
+    }
+
+    /// Set or override the chat template
+    pub fn set_chat_template(&mut self, template: String) {
+        self.chat_template = Some(template);
+    }
+
+    /// Apply chat template if available
+    pub fn apply_chat_template(
+        &self,
+        messages: &[ChatMessage],
+        add_generation_prompt: bool,
+    ) -> Result<String> {
+        if let Some(ref template) = self.chat_template {
+            let processor = ChatTemplateProcessor::new(
+                template.clone(),
+                self.special_tokens.bos_token.clone(),
+                self.special_tokens.eos_token.clone(),
+            );
+            processor.apply_chat_template(messages, add_generation_prompt)
+        } else {
+            // Fallback to simple formatting if no template is available
+            let mut result = String::new();
+            for msg in messages {
+                result.push_str(&format!("{}: {}\n", msg.role, msg.content));
+            }
+            if add_generation_prompt {
+                result.push_str("assistant: ");
+            }
+            Ok(result)
+        }
+    }
+}
+
+impl Encoder for HuggingFaceTokenizer {
+    fn encode(&self, input: &str) -> Result<Encoding> {
+        self.tokenizer
+            .encode(input, false)
+            .map_err(|e| Error::msg(format!("Encoding failed: {}", e)))
+            .map(|encoding| Encoding::Hf(Box::new(encoding)))
+    }
+
+    fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
+        let encodings = self
+            .tokenizer
+            .encode_batch(inputs.to_vec(), false)
+            .map_err(|e| Error::msg(format!("Batch encoding failed: {}", e)))?;
+
+        Ok(encodings
+            .into_iter()
+            .map(|e| Encoding::Hf(Box::new(e)))
+            .collect())
+    }
+}
+
+impl Decoder for HuggingFaceTokenizer {
+    fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String> {
+        self.tokenizer
+            .decode(token_ids, skip_special_tokens)
+            .map_err(|e| Error::msg(format!("Decoding failed: {}", e)))
+    }
+}
+
+impl TokenizerTrait for HuggingFaceTokenizer {
+    fn vocab_size(&self) -> usize {
+        self.tokenizer.get_vocab_size(false)
+    }
+
+    fn get_special_tokens(&self) -> &SpecialTokens {
+        &self.special_tokens
+    }
+
+    fn token_to_id(&self, token: &str) -> Option<TokenIdType> {
+        self.vocab.get(token).copied()
+    }
+
+    fn id_to_token(&self, id: TokenIdType) -> Option<String> {
+        self.reverse_vocab.get(&id).cloned()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::ChatMessage;
+
+    #[test]
+    fn test_chat_message_creation() {
+        let msg = ChatMessage::system("You are a helpful assistant");
+        assert_eq!(msg.role, "system");
+        assert_eq!(msg.content, "You are a helpful assistant");
+
+        let user_msg = ChatMessage::user("Hello!");
+        assert_eq!(user_msg.role, "user");
+
+        let assistant_msg = ChatMessage::assistant("Hi there!");
+        assert_eq!(assistant_msg.role, "assistant");
+    }
+
+    // Note: Actual tokenizer tests would require a real tokenizer file
+    // These would be integration tests rather than unit tests
+}
diff --git a/sgl-router/src/tokenizer/mock.rs b/sgl-router/src/tokenizer/mock.rs
new file mode 100644
index 000000000..afb91543c
--- /dev/null
+++ b/sgl-router/src/tokenizer/mock.rs
@@ -0,0 +1,112 @@
+//! Mock tokenizer implementation for testing
+
+use super::traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
+use anyhow::Result;
+use std::collections::HashMap;
+
+/// Mock tokenizer for testing purposes
+pub struct MockTokenizer {
+    vocab: HashMap<String, u32>,
+    reverse_vocab: HashMap<u32, String>,
+    special_tokens: SpecialTokens,
+}
+
+impl Default for MockTokenizer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl MockTokenizer {
+    pub fn new() -> Self {
+        let mut vocab = HashMap::new();
+        let mut reverse_vocab = HashMap::new();
+
+        // Add some basic tokens
+        let tokens = vec![
+            ("Hello", 1),
+            ("world", 2),
+            ("test", 3),
+            ("token", 4),
+            (" ", 5),
+            (".", 6),
+            ("<eos>", 999),
+            ("<bos>", 1000),
+        ];
+
+        for (token, id) in tokens {
+            vocab.insert(token.to_string(), id);
+            reverse_vocab.insert(id, token.to_string());
+        }
+
+        let special_tokens = SpecialTokens {
+            bos_token: Some("<bos>".to_string()),
+            eos_token: Some("<eos>".to_string()),
+            unk_token: Some("<unk>".to_string()),
+            sep_token: None,
+            pad_token: None,
+            cls_token: None,
+            mask_token: None,
+            additional_special_tokens: vec![],
+        };
+
+        Self {
+            vocab,
+            reverse_vocab,
+            special_tokens,
+        }
+    }
+}
+
+impl Encoder for MockTokenizer {
+    fn encode(&self, input: &str) -> Result<Encoding> {
+        // Simple word-based tokenization for testing
+        let tokens: Vec<u32> = input
+            .split_whitespace()
+            .filter_map(|word| self.vocab.get(word).copied())
+            .collect();
+
+        Ok(Encoding::Sp(tokens))
+    }
+
+    fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
+        inputs.iter().map(|input| self.encode(input)).collect()
+    }
+}
+
+impl Decoder for MockTokenizer {
+    fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String> {
+        let tokens: Vec<String> = token_ids
+            .iter()
+            .filter_map(|id| {
+                self.reverse_vocab.get(id).and_then(|token| {
+                    if skip_special_tokens && (token == "<eos>" || token == "<bos>") {
+                        None
+                    } else {
+                        Some(token.clone())
+                    }
+                })
+            })
+            .collect();
+
+        Ok(tokens.join(" "))
+    }
+}
+
+impl TokenizerTrait for MockTokenizer {
+    fn vocab_size(&self) -> usize {
+        self.vocab.len()
+    }
+
+    fn get_special_tokens(&self) -> &SpecialTokens {
+        &self.special_tokens
+    }
+
+    fn token_to_id(&self, token: &str) -> Option<u32> {
+        self.vocab.get(token).copied()
+    }
+
+    fn id_to_token(&self, id: u32) -> Option<String> {
+        self.reverse_vocab.get(&id).cloned()
+    }
+}
diff --git a/sgl-router/src/tokenizer/mod.rs b/sgl-router/src/tokenizer/mod.rs
new file mode 100644
index 000000000..98a23f761
--- /dev/null
+++ b/sgl-router/src/tokenizer/mod.rs
@@ -0,0 +1,123 @@
+use anyhow::Result;
+use std::ops::Deref;
+use std::sync::Arc;
+
+pub mod factory;
+pub mod hub;
+pub mod mock;
+pub mod sequence;
+pub mod stop;
+pub mod stream;
+pub mod traits;
+
+// Feature-gated modules
+
+pub mod chat_template;
+
+pub mod huggingface;
+
+pub mod tiktoken;
+
+#[cfg(test)]
+mod tests;
+
+// Re-exports
+pub use factory::{
+    create_tokenizer, create_tokenizer_async, create_tokenizer_from_file,
+    create_tokenizer_with_chat_template, TokenizerType,
+};
+pub use sequence::Sequence;
+pub use stop::{SequenceDecoderOutput, StopSequenceConfig, StopSequenceDecoder};
+pub use stream::DecodeStream;
+pub use traits::{Decoder, Encoder, Encoding, SpecialTokens, Tokenizer as TokenizerTrait};
+
+pub use huggingface::HuggingFaceTokenizer;
+
+pub use chat_template::ChatMessage;
+
+pub use tiktoken::{TiktokenModel, TiktokenTokenizer};
+
+/// Main tokenizer wrapper that provides a unified interface for different tokenizer implementations
+#[derive(Clone)]
+pub struct Tokenizer(Arc<dyn traits::Tokenizer>);
+
+impl Tokenizer {
+    /// Create a tokenizer from a file path
+    pub fn from_file(file_path: &str) -> Result<Tokenizer> {
+        Ok(Tokenizer(factory::create_tokenizer_from_file(file_path)?))
+    }
+
+    /// Create a tokenizer from a file path with an optional chat template
+    pub fn from_file_with_chat_template(
+        file_path: &str,
+        chat_template_path: Option<&str>,
+    ) -> Result<Tokenizer> {
+        Ok(Tokenizer(factory::create_tokenizer_with_chat_template(
+            file_path,
+            chat_template_path,
+        )?))
+    }
+
+    /// Create a tokenizer from an Arc<dyn Tokenizer>
+    pub fn from_arc(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
+        Tokenizer(tokenizer)
+    }
+
+    /// Create a stateful sequence object for decoding token_ids into text
+    pub fn decode_stream(
+        &self,
+        prompt_token_ids: &[u32],
+        skip_special_tokens: bool,
+    ) -> DecodeStream {
+        DecodeStream::new(self.0.clone(), prompt_token_ids, skip_special_tokens)
+    }
+
+    /// Direct encode method
+    pub fn encode(&self, input: &str) -> Result<Encoding> {
+        self.0.encode(input)
+    }
+
+    /// Direct batch encode method
+    pub fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
+        self.0.encode_batch(inputs)
+    }
+
+    /// Direct decode method
+    pub fn decode(&self, token_ids: &[u32], skip_special_tokens: bool) -> Result<String> {
+        self.0.decode(token_ids, skip_special_tokens)
+    }
+
+    /// Get vocabulary size
+    pub fn vocab_size(&self) -> usize {
+        self.0.vocab_size()
+    }
+
+    /// Get special tokens
+    pub fn get_special_tokens(&self) -> &SpecialTokens {
+        self.0.get_special_tokens()
+    }
+
+    /// Convert token string to ID
+    pub fn token_to_id(&self, token: &str) -> Option<u32> {
+        self.0.token_to_id(token)
+    }
+
+    /// Convert ID to token string
+    pub fn id_to_token(&self, id: u32) -> Option<String> {
+        self.0.id_to_token(id)
+    }
+}
+
+impl Deref for Tokenizer {
+    type Target = Arc<dyn traits::Tokenizer>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl From<Arc<dyn traits::Tokenizer>> for Tokenizer {
+    fn from(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
+        Tokenizer(tokenizer)
+    }
+}
diff --git a/sgl-router/src/tokenizer/sequence.rs b/sgl-router/src/tokenizer/sequence.rs
new file mode 100644
index 000000000..99801438d
--- /dev/null
+++ b/sgl-router/src/tokenizer/sequence.rs
@@ -0,0 +1,238 @@
+use super::traits::{TokenIdType, Tokenizer as TokenizerTrait};
+use anyhow::Result;
+use std::sync::Arc;
+
+/// Maintains state for an ongoing sequence of tokens and their decoded text
+/// This provides a cleaner abstraction for managing token sequences
+pub struct Sequence {
+    /// The tokenizer used for encoding/decoding
+    tokenizer: Arc<dyn TokenizerTrait>,
+
+    /// The current sequence of token ids
+    token_ids: Vec<TokenIdType>,
+
+    /// The position in the current sequence the last decoded token completed
+    prefix_offset: usize,
+
+    /// Current position in the sequence
+    read_offset: usize,
+}
+
+impl std::fmt::Debug for Sequence {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Sequence")
+            .field("tokenizer", &"Arc<dyn Tokenizer>")
+            .field(
+                "token_ids",
+                &format_args!("{}", {
+                    let token_ids = self.token_ids();
+                    if token_ids.len() <= 20 {
+                        format!("{:?}", token_ids)
+                    } else {
+                        let first_ten = &token_ids[..10];
+                        let last_ten = &token_ids[token_ids.len() - 10..];
+                        format!("{:?} ... {:?}", first_ten, last_ten)
+                    }
+                }),
+            )
+            .field("prefix_offset", &self.prefix_offset)
+            .field("read_offset", &self.read_offset)
+            .field("token count", &self.token_ids.len())
+            .finish()
+    }
+}
+
+impl Sequence {
+    /// Create a new empty sequence
+    pub fn new(tokenizer: Arc<dyn TokenizerTrait>) -> Self {
+        Self {
+            tokenizer,
+            token_ids: Vec::new(),
+            prefix_offset: 0,
+            read_offset: 0,
+        }
+    }
+
+    /// Create a sequence with initial tokens
+    pub fn with_tokens(tokenizer: Arc<dyn TokenizerTrait>, token_ids: Vec<TokenIdType>) -> Self {
+        let len = token_ids.len();
+        Self {
+            tokenizer,
+            token_ids,
+            prefix_offset: 0,
+            read_offset: len,
+        }
+    }
+
+    /// Check if the sequence is empty
+    pub fn is_empty(&self) -> bool {
+        self.token_ids.is_empty()
+    }
+
+    /// Get the length of the sequence
+    pub fn len(&self) -> usize {
+        self.token_ids.len()
+    }
+
+    /// Clear the sequence
+    pub fn clear(&mut self) {
+        self.token_ids.clear();
+        self.prefix_offset = 0;
+        self.read_offset = 0;
+    }
+
+    /// Append text to the sequence by encoding it
+    pub fn append_text(&mut self, input: &str) -> Result<()> {
+        let encoding = self.tokenizer.encode(input)?;
+        self.token_ids.extend(encoding.token_ids());
+        Ok(())
+    }
+
+    /// Append a single token to the sequence and return newly decoded text
+    /// Based on HuggingFace TGI incremental decoding
+    pub fn append_token(&mut self, token_id: TokenIdType) -> Result<String> {
+        // Store the old read offset before adding the new token
+        let old_read_offset = self.read_offset;
+
+        self.token_ids.push(token_id);
+        self.read_offset = self.token_ids.len();
+
+        // If this is the first token or we're at the beginning, decode everything
+        if self.prefix_offset == 0 && old_read_offset == 0 {
+            let text = self.tokenizer.decode(&self.token_ids, false)?;
+            if text.ends_with("�") {
+                // Incomplete UTF-8 sequence, wait for more tokens
+                return Ok(String::new());
+            }
+            self.prefix_offset = 0;
+            return Ok(text);
+        }
+
+        // Decode the text up to the previous position
+        let prefix_text = self
+            .tokenizer
+            .decode(&self.token_ids[self.prefix_offset..old_read_offset], false)?;
+
+        // Decode the text including the new token
+        let new_text = self
+            .tokenizer
+            .decode(&self.token_ids[self.prefix_offset..], false)?;
+
+        // Handle multi-byte character boundaries
+        let mut prefix_text_len = prefix_text.len();
+        while !new_text.is_char_boundary(prefix_text_len) && prefix_text_len > 0 {
+            prefix_text_len -= 1;
+        }
+
+        if new_text.len() > prefix_text.len() {
+            if new_text.ends_with("�") {
+                // Incomplete UTF-8 sequence, wait for more tokens
+                return Ok(String::new());
+            } else {
+                // Return the new text portion
+                let incremental_text = new_text[prefix_text_len..].to_string().replace("�", "");
+                self.prefix_offset = old_read_offset;
+                return Ok(incremental_text);
+            }
+        }
+
+        Ok(String::new())
+    }
+
+    /// Get a reference to the tokenizer
+    pub fn tokenizer(&self) -> &Arc<dyn TokenizerTrait> {
+        &self.tokenizer
+    }
+
+    /// Get the current token ids
+    pub fn token_ids(&self) -> &[TokenIdType] {
+        &self.token_ids
+    }
+
+    /// Decode the entire sequence to text
+    pub fn text(&self) -> Result<String> {
+        self.tokenizer.decode(&self.token_ids, false)
+    }
+
+    /// Get the prefix offset
+    pub fn prefix_offset(&self) -> usize {
+        self.prefix_offset
+    }
+
+    /// Get the read offset
+    pub fn read_offset(&self) -> usize {
+        self.read_offset
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tokenizer::mock::MockTokenizer;
+
+    #[test]
+    fn test_sequence_new() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let seq = Sequence::new(tokenizer);
+        assert!(seq.is_empty());
+        assert_eq!(seq.len(), 0);
+    }
+
+    #[test]
+    fn test_sequence_append_text() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let mut seq = Sequence::new(tokenizer);
+
+        seq.append_text("Hello").unwrap();
+        assert!(!seq.is_empty());
+        assert!(!seq.is_empty());
+
+        let text = seq.text().unwrap();
+        assert_eq!(text, "Hello");
+    }
+
+    #[test]
+    fn test_sequence_append_token() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let mut seq = Sequence::new(tokenizer.clone());
+
+        // Start with an empty sequence and append token 1 ("Hello")
+        let text1 = seq.append_token(1).unwrap();
+        assert_eq!(text1, "Hello");
+
+        // Now append token 2 ("world")
+        // The mock tokenizer will decode [1, 2] as "Hello world" (with a space)
+        let text2 = seq.append_token(2).unwrap();
+        // The incremental text should be " world" (with the space that the mock tokenizer adds)
+        assert_eq!(text2, " world");
+
+        // Verify the full text
+        assert_eq!(seq.text().unwrap(), "Hello world");
+    }
+
+    #[test]
+    fn test_sequence_clear() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let mut seq = Sequence::new(tokenizer);
+
+        seq.append_text("Hello world").unwrap();
+        assert!(!seq.is_empty());
+
+        seq.clear();
+        assert!(seq.is_empty());
+        assert_eq!(seq.len(), 0);
+        assert_eq!(seq.prefix_offset(), 0);
+        assert_eq!(seq.read_offset(), 0);
+    }
+
+    #[test]
+    fn test_sequence_debug() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let mut seq = Sequence::new(tokenizer);
+
+        seq.append_text("Test").unwrap();
+        let debug_str = format!("{:?}", seq);
+        assert!(debug_str.contains("Sequence"));
+        assert!(debug_str.contains("token count"));
+    }
+}
diff --git a/sgl-router/src/tokenizer/stop.rs b/sgl-router/src/tokenizer/stop.rs
new file mode 100644
index 000000000..1efda15b6
--- /dev/null
+++ b/sgl-router/src/tokenizer/stop.rs
@@ -0,0 +1,506 @@
+use super::traits::{self, TokenIdType};
+use anyhow::Result;
+use std::collections::HashSet;
+use std::sync::Arc;
+
+/// Output from the sequence decoder
+#[derive(Debug, Clone, PartialEq)]
+pub enum SequenceDecoderOutput {
+    /// Normal text output
+    Text(String),
+    /// Text is being held due to partial stop sequence match
+    Held,
+    /// Stop sequence matched (hidden - not included in output)
+    Stopped,
+    /// Stop sequence matched with text (visible - included in output)
+    StoppedWithText(String),
+}
+
+/// Configuration for stop sequences
+#[derive(Debug, Clone, Default)]
+pub struct StopSequenceConfig {
+    /// Token IDs that trigger a stop
+    pub stop_tokens: HashSet<TokenIdType>,
+    /// String sequences that trigger a stop
+    pub stop_sequences: Vec<String>,
+    /// Token IDs for visible stops (included in output)
+    pub visible_stop_tokens: HashSet<TokenIdType>,
+    /// String sequences for visible stops (included in output)
+    pub visible_stop_sequences: Vec<String>,
+}
+
+impl StopSequenceConfig {
+    /// Builder pattern - add a stop token
+    pub fn with_stop_token(mut self, token_id: TokenIdType) -> Self {
+        self.stop_tokens.insert(token_id);
+        self
+    }
+
+    /// Builder pattern - add a stop sequence
+    pub fn with_stop_sequence(mut self, sequence: impl Into<String>) -> Self {
+        self.stop_sequences.push(sequence.into());
+        self
+    }
+
+    /// Builder pattern - add a visible stop token
+    pub fn with_visible_stop_token(mut self, token_id: TokenIdType) -> Self {
+        self.visible_stop_tokens.insert(token_id);
+        self
+    }
+
+    /// Builder pattern - add a visible stop sequence
+    pub fn with_visible_stop_sequence(mut self, sequence: impl Into<String>) -> Self {
+        self.visible_stop_sequences.push(sequence.into());
+        self
+    }
+}
+
+/// Decoder that handles stop sequences
+pub struct StopSequenceDecoder {
+    tokenizer: Arc<dyn traits::Tokenizer>,
+    config: StopSequenceConfig,
+    /// Buffer for partial matches (the "jail")
+    jail_buffer: String,
+    /// Accumulated tokens
+    token_buffer: Vec<TokenIdType>,
+    /// Offset where the prefix text starts (for context)
+    prefix_offset: usize,
+    /// Offset marking the end of previously decoded text
+    read_offset: usize,
+    /// Whether we've stopped
+    stopped: bool,
+    skip_special_tokens: bool,
+}
+
+impl StopSequenceDecoder {
+    /// Create a new stop sequence decoder
+    pub fn new(
+        tokenizer: Arc<dyn traits::Tokenizer>,
+        config: StopSequenceConfig,
+        skip_special_tokens: bool,
+    ) -> Self {
+        StopSequenceDecoder {
+            tokenizer,
+            config,
+            jail_buffer: String::new(),
+            token_buffer: Vec::new(),
+            prefix_offset: 0,
+            read_offset: 0,
+            stopped: false,
+            skip_special_tokens,
+        }
+    }
+
+    /// Process a single token
+    pub fn process_token(&mut self, token_id: TokenIdType) -> Result<SequenceDecoderOutput> {
+        if self.stopped {
+            return Ok(SequenceDecoderOutput::Stopped);
+        }
+
+        // Check for token-level stops first
+        if self.config.stop_tokens.contains(&token_id) {
+            self.stopped = true;
+
+            // Flush any jailed text before stopping
+            if !self.jail_buffer.is_empty() {
+                let output = self.jail_buffer.clone();
+                self.jail_buffer.clear();
+                return Ok(SequenceDecoderOutput::StoppedWithText(output));
+            }
+            return Ok(SequenceDecoderOutput::Stopped);
+        }
+
+        if self.config.visible_stop_tokens.contains(&token_id) {
+            self.stopped = true;
+
+            // Include jailed text plus the stop token
+            let stop_text = self
+                .tokenizer
+                .decode(&[token_id], self.skip_special_tokens)?;
+            let output = format!("{}{}", self.jail_buffer, stop_text);
+            self.jail_buffer.clear();
+            return Ok(SequenceDecoderOutput::StoppedWithText(output));
+        }
+
+        // Add token to buffer
+        self.token_buffer.push(token_id);
+
+        // Use incremental decoding like DecodeStream
+        // First decode the previous context (what we've already output)
+        let prefix_text = if self.read_offset > self.prefix_offset {
+            self.tokenizer.decode(
+                &self.token_buffer[self.prefix_offset..self.read_offset],
+                self.skip_special_tokens,
+            )?
+        } else {
+            String::new()
+        };
+
+        // Now decode from prefix to current position
+        let new_full_text = self.tokenizer.decode(
+            &self.token_buffer[self.prefix_offset..],
+            self.skip_special_tokens,
+        )?;
+
+        // Check for incomplete UTF-8 sequence
+        if new_full_text.ends_with("�") {
+            // Wait for more tokens to complete the sequence
+            return Ok(SequenceDecoderOutput::Held);
+        }
+
+        // Calculate only the NEW text since last successful decode
+        let new_text = if new_full_text.len() > prefix_text.len() {
+            &new_full_text[prefix_text.len()..]
+        } else {
+            // No new text produced (can happen with special tokens)
+            return Ok(SequenceDecoderOutput::Held);
+        };
+
+        // Combine jail buffer with new text for checking
+        let check_text = format!("{}{}", self.jail_buffer, new_text);
+
+        // Check for complete stop sequences
+        for stop_seq in &self.config.stop_sequences {
+            if let Some(pos) = check_text.find(stop_seq) {
+                self.stopped = true;
+
+                // Output text before the stop sequence
+                let output = check_text[..pos].to_string();
+                self.jail_buffer.clear();
+                return Ok(if output.is_empty() {
+                    SequenceDecoderOutput::Stopped
+                } else {
+                    SequenceDecoderOutput::StoppedWithText(output)
+                });
+            }
+        }
+
+        // Check for visible stop sequences
+        for stop_seq in &self.config.visible_stop_sequences {
+            if let Some(pos) = check_text.find(stop_seq) {
+                self.stopped = true;
+
+                // Include the stop sequence in output
+                let end_pos = pos + stop_seq.len();
+                let output = check_text[..end_pos].to_string();
+                self.jail_buffer.clear();
+                return Ok(SequenceDecoderOutput::StoppedWithText(output));
+            }
+        }
+
+        // Check for partial matches at the end of check_text
+        let mut partial_match_len = 0;
+        for stop_seq in self
+            .config
+            .stop_sequences
+            .iter()
+            .chain(&self.config.visible_stop_sequences)
+        {
+            // Check all possible suffixes that could be a prefix of stop_seq
+            for i in 1..=check_text.len().min(stop_seq.len() - 1) {
+                let suffix = &check_text[check_text.len() - i..];
+                if stop_seq.starts_with(suffix) {
+                    partial_match_len = partial_match_len.max(i);
+                }
+            }
+        }
+
+        if partial_match_len > 0 {
+            // Split: output safe text, jail the potential match
+            let safe_end = check_text.len() - partial_match_len;
+            let safe_text = &check_text[..safe_end];
+            self.jail_buffer = check_text[safe_end..].to_string();
+
+            // Update offsets for next iteration
+            self.prefix_offset = self.read_offset;
+            self.read_offset = self.token_buffer.len();
+
+            if safe_text.is_empty() {
+                Ok(SequenceDecoderOutput::Held)
+            } else {
+                Ok(SequenceDecoderOutput::Text(safe_text.to_string()))
+            }
+        } else {
+            // No partial matches - output everything
+            self.jail_buffer.clear();
+
+            // Update offsets for next iteration
+            self.prefix_offset = self.read_offset;
+            self.read_offset = self.token_buffer.len();
+
+            Ok(SequenceDecoderOutput::Text(check_text))
+        }
+    }
+
+    /// Process multiple tokens
+    pub fn process_tokens(
+        &mut self,
+        token_ids: &[TokenIdType],
+    ) -> Result<Vec<SequenceDecoderOutput>> {
+        let mut outputs = Vec::new();
+        for &token_id in token_ids {
+            outputs.push(self.process_token(token_id)?);
+        }
+        Ok(outputs)
+    }
+
+    /// Flush any held text
+    pub fn flush(&mut self) -> SequenceDecoderOutput {
+        if !self.jail_buffer.is_empty() {
+            let output = self.jail_buffer.clone();
+            self.jail_buffer.clear();
+            SequenceDecoderOutput::Text(output)
+        } else {
+            SequenceDecoderOutput::Text(String::new())
+        }
+    }
+
+    /// Check if decoding has stopped
+    pub fn is_stopped(&self) -> bool {
+        self.stopped
+    }
+
+    /// Reset the decoder state
+    pub fn reset(&mut self) {
+        self.jail_buffer.clear();
+        self.token_buffer.clear();
+        self.prefix_offset = 0;
+        self.read_offset = 0;
+        self.stopped = false;
+    }
+}
+
+/// Builder for StopSequenceDecoder
+pub struct StopSequenceDecoderBuilder {
+    tokenizer: Arc<dyn traits::Tokenizer>,
+    config: StopSequenceConfig,
+    skip_special_tokens: bool,
+}
+
+impl StopSequenceDecoderBuilder {
+    pub fn new(tokenizer: Arc<dyn traits::Tokenizer>) -> Self {
+        StopSequenceDecoderBuilder {
+            tokenizer,
+            config: StopSequenceConfig::default(),
+            skip_special_tokens: true,
+        }
+    }
+
+    pub fn stop_token(mut self, token_id: TokenIdType) -> Self {
+        self.config.stop_tokens.insert(token_id);
+        self
+    }
+
+    pub fn stop_sequence(mut self, sequence: impl Into<String>) -> Self {
+        self.config.stop_sequences.push(sequence.into());
+        self
+    }
+
+    pub fn visible_stop_token(mut self, token_id: TokenIdType) -> Self {
+        self.config.visible_stop_tokens.insert(token_id);
+        self
+    }
+
+    pub fn visible_stop_sequence(mut self, sequence: impl Into<String>) -> Self {
+        self.config.visible_stop_sequences.push(sequence.into());
+        self
+    }
+
+    pub fn skip_special_tokens(mut self, skip: bool) -> Self {
+        self.skip_special_tokens = skip;
+        self
+    }
+
+    pub fn build(self) -> StopSequenceDecoder {
+        StopSequenceDecoder::new(self.tokenizer, self.config, self.skip_special_tokens)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::tokenizer::mock::MockTokenizer;
+
+    #[test]
+    fn test_stop_token_detection() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_token(999); // <eos> token
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens before stop
+        let result = decoder.process_token(1).unwrap(); // "Hello"
+        assert!(matches!(result, SequenceDecoderOutput::Text(_)));
+
+        // Process stop token
+        let result = decoder.process_token(999).unwrap(); // <eos>
+        assert_eq!(result, SequenceDecoderOutput::Stopped);
+
+        // Further tokens should also return Stopped
+        let result = decoder.process_token(2).unwrap();
+        assert_eq!(result, SequenceDecoderOutput::Stopped);
+    }
+
+    #[test]
+    fn test_visible_stop_token() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_visible_stop_token(999);
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        let result = decoder.process_token(999).unwrap();
+        assert!(matches!(result, SequenceDecoderOutput::StoppedWithText(_)));
+    }
+
+    #[test]
+    fn test_builder_pattern() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+
+        let decoder = StopSequenceDecoderBuilder::new(tokenizer)
+            .stop_token(999)
+            .stop_sequence("STOP")
+            .visible_stop_token(1000)
+            .skip_special_tokens(true)
+            .build();
+
+        assert!(!decoder.is_stopped());
+    }
+
+    #[test]
+    fn test_incremental_decoding_no_repetition() {
+        // This test verifies the critical fix: no repeated output
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default();
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process tokens one by one and collect outputs
+        let mut outputs = Vec::new();
+
+        // Token 1: "Hello"
+        let result = decoder.process_token(1).unwrap();
+        if let SequenceDecoderOutput::Text(text) = result {
+            outputs.push(text.clone());
+        }
+
+        // Token 2: "world"
+        let result = decoder.process_token(2).unwrap();
+        if let SequenceDecoderOutput::Text(text) = result {
+            outputs.push(text.clone());
+        }
+
+        // Token 3: "test"
+        let result = decoder.process_token(3).unwrap();
+        if let SequenceDecoderOutput::Text(text) = result {
+            outputs.push(text.clone());
+        }
+
+        // CRITICAL: Each output should be unique (no accumulation)
+        // The fix ensures we only output NEW text, not accumulated text
+        assert_eq!(outputs.len(), 3);
+
+        // Verify no text is repeated
+        for i in 0..outputs.len() {
+            for j in i + 1..outputs.len() {
+                // No output should contain another (no accumulation)
+                assert!(!outputs[j].contains(&outputs[i]));
+            }
+        }
+    }
+
+    #[test]
+    fn test_stop_sequence_detection() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("test");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process "Hello world"
+        decoder.process_token(1).unwrap(); // "Hello"
+        decoder.process_token(2).unwrap(); // "world"
+
+        // Process "test" which should trigger stop
+        let result = decoder.process_token(3).unwrap(); // "test"
+
+        // Should stop when we hit "test"
+        assert!(matches!(
+            result,
+            SequenceDecoderOutput::Stopped | SequenceDecoderOutput::StoppedWithText(_)
+        ));
+    }
+
+    #[test]
+    fn test_flush_after_partial() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_sequence("NEVER_MATCH");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process a token
+        decoder.process_token(1).unwrap(); // "Hello"
+
+        // Flush should return any remaining text in jail
+        let result = decoder.flush();
+
+        // After processing, flush should work
+        assert!(matches!(result, SequenceDecoderOutput::Text(_)));
+    }
+
+    #[test]
+    fn test_reset_functionality() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_stop_token(999);
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process and stop
+        decoder.process_token(1).unwrap();
+        decoder.process_token(999).unwrap();
+        assert!(decoder.is_stopped());
+
+        // Reset should clear everything
+        decoder.reset();
+        assert!(!decoder.is_stopped());
+
+        // Should be able to process again
+        let result = decoder.process_token(2).unwrap();
+        assert!(matches!(result, SequenceDecoderOutput::Text(_)));
+    }
+
+    #[test]
+    fn test_visible_stop_sequence() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default().with_visible_stop_sequence("world");
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process "Hello"
+        decoder.process_token(1).unwrap();
+
+        // Process "world" - should include it in output
+        let result = decoder.process_token(2).unwrap();
+
+        if let SequenceDecoderOutput::StoppedWithText(text) = result {
+            // Should include "world" in the output
+            assert!(text.contains("world"));
+        } else {
+            panic!("Expected StoppedWithText with visible stop sequence");
+        }
+    }
+
+    #[test]
+    fn test_multiple_tokens_processing() {
+        let tokenizer = Arc::new(MockTokenizer::new());
+        let config = StopSequenceConfig::default();
+        let mut decoder = StopSequenceDecoder::new(tokenizer, config, false);
+
+        // Process multiple tokens at once
+        let results = decoder.process_tokens(&[1, 2, 3]).unwrap();
+
+        // Should get results for each token
+        assert_eq!(results.len(), 3);
+
+        // Each result should be Text (no stops configured)
+        for result in results {
+            assert!(matches!(
+                result,
+                SequenceDecoderOutput::Text(_) | SequenceDecoderOutput::Held
+            ));
+        }
+    }
+}
diff --git a/sgl-router/src/tokenizer/stream.rs b/sgl-router/src/tokenizer/stream.rs
new file mode 100644
index 000000000..848be8a8c
--- /dev/null
+++ b/sgl-router/src/tokenizer/stream.rs
@@ -0,0 +1,105 @@
+// src/tokenizer/stream.rs
+
+use super::traits::{self, TokenIdType};
+use anyhow::Result;
+use std::sync::Arc;
+
+const INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET: usize = 5;
+
+/// DecodeStream will keep the state necessary to produce individual chunks of
+/// strings given an input stream of token_ids
+pub struct DecodeStream {
+    /// The tokenizer used to decode token_ids
+    tokenizer: Arc<dyn traits::Tokenizer>,
+
+    skip_special_tokens: bool,
+
+    /// A temporary buffer of the necessary token_ids needed
+    /// to produce valid string chunks
+    all_token_ids: Vec<TokenIdType>,
+
+    prefix_offset: usize,
+    read_offset: usize,
+}
+
+impl DecodeStream {
+    pub fn new(
+        tokenizer: Arc<dyn traits::Tokenizer>,
+        prompt_token_ids: &[TokenIdType],
+        skip_special_tokens: bool,
+    ) -> Self {
+        let num_input_tokens = prompt_token_ids.len();
+        let prompt_token_ids = prompt_token_ids.to_vec();
+        Self {
+            tokenizer,
+            skip_special_tokens,
+            all_token_ids: prompt_token_ids,
+            prefix_offset: num_input_tokens
+                .saturating_sub(INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET),
+            read_offset: num_input_tokens,
+        }
+    }
+
+    /// Step appends a token_id to the internal state and tries to produce a text chunk.
+    /// Returning `None` means the given id is not enough to produce a chunk.
+    pub fn step(&mut self, id: TokenIdType) -> Result<Option<String>> {
+        self.all_token_ids.push(id);
+
+        let prefix_text = self.tokenizer.decode(
+            &self.all_token_ids[self.prefix_offset..self.read_offset],
+            self.skip_special_tokens,
+        )?;
+
+        let new_text = self.tokenizer.decode(
+            &self.all_token_ids[self.prefix_offset..],
+            self.skip_special_tokens,
+        )?;
+
+        if new_text.len() > prefix_text.len() && !new_text.ends_with("�") {
+            let new_text = new_text[prefix_text.len()..].to_string();
+
+            self.prefix_offset = self.read_offset;
+            self.read_offset = self.all_token_ids.len();
+
+            Ok(Some(new_text))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Process multiple tokens at once
+    pub fn step_batch(&mut self, token_ids: &[u32]) -> Result<Vec<String>> {
+        let mut chunks = Vec::new();
+
+        for &token_id in token_ids {
+            if let Some(text) = self.step(token_id)? {
+                chunks.push(text);
+            }
+        }
+
+        Ok(chunks)
+    }
+
+    /// Force flush any remaining text
+    pub fn flush(&mut self) -> Result<Option<String>> {
+        if self.read_offset < self.all_token_ids.len() {
+            let remaining = self.tokenizer.decode(
+                &self.all_token_ids[self.read_offset..],
+                self.skip_special_tokens,
+            )?;
+
+            self.read_offset = self.all_token_ids.len();
+
+            if !remaining.is_empty() {
+                return Ok(Some(remaining));
+            }
+        }
+
+        Ok(None)
+    }
+
+    /// Get all tokens processed so far
+    pub fn tokens(&self) -> &[u32] {
+        &self.all_token_ids
+    }
+}
diff --git a/sgl-router/src/tokenizer/tests.rs b/sgl-router/src/tokenizer/tests.rs
new file mode 100644
index 000000000..2c4d4b108
--- /dev/null
+++ b/sgl-router/src/tokenizer/tests.rs
@@ -0,0 +1,143 @@
+#[cfg(test)]
+use super::*;
+#[cfg(test)]
+use std::sync::Arc;
+
+#[test]
+fn test_mock_tokenizer_encode() {
+    let tokenizer = mock::MockTokenizer::new();
+    let encoding = tokenizer.encode("Hello world").unwrap();
+    let token_ids = encoding.token_ids();
+    assert_eq!(token_ids, &[1, 2]); // "Hello" -> 1, "world" -> 2
+}
+
+#[test]
+fn test_mock_tokenizer_decode() {
+    let tokenizer = mock::MockTokenizer::new();
+    let text = tokenizer.decode(&[1, 2], false).unwrap();
+    assert_eq!(text, "Hello world");
+}
+
+#[test]
+fn test_mock_tokenizer_decode_skip_special() {
+    let tokenizer = mock::MockTokenizer::new();
+
+    // With special tokens
+    let text = tokenizer.decode(&[1000, 1, 2, 999], false).unwrap();
+    assert_eq!(text, "<bos> Hello world <eos>");
+
+    // Without special tokens
+    let text = tokenizer.decode(&[1000, 1, 2, 999], true).unwrap();
+    assert_eq!(text, "Hello world");
+}
+
+#[test]
+fn test_tokenizer_wrapper() {
+    let mock_tokenizer = Arc::new(mock::MockTokenizer::new());
+    let tokenizer = Tokenizer::from_arc(mock_tokenizer);
+
+    // Test encoding
+    let encoding = tokenizer.encode("Hello world").unwrap();
+    assert_eq!(encoding.token_ids(), &[1, 2]);
+
+    // Test decoding
+    let text = tokenizer.decode(&[1, 2], false).unwrap();
+    assert_eq!(text, "Hello world");
+
+    // Test vocab size
+    assert_eq!(tokenizer.vocab_size(), 8);
+
+    // Test token to ID
+    assert_eq!(tokenizer.token_to_id("Hello"), Some(1));
+    assert_eq!(tokenizer.token_to_id("unknown"), None);
+
+    // Test ID to token
+    assert_eq!(tokenizer.id_to_token(1), Some("Hello".to_string()));
+    assert_eq!(tokenizer.id_to_token(9999), None);
+}
+
+#[test]
+fn test_decode_stream_basic() {
+    let mock_tokenizer = Arc::new(mock::MockTokenizer::new());
+    let tokenizer = Tokenizer::from_arc(mock_tokenizer);
+
+    // Create a decode stream with initial tokens
+    let initial_tokens = vec![1, 2]; // "Hello world"
+    let mut stream = tokenizer.decode_stream(&initial_tokens, false);
+
+    // Add a new token
+    let result = stream.step(3).unwrap(); // "test"
+                                          // Since we're using a mock, the actual incremental behavior depends on implementation
+                                          // For now, we just verify it doesn't crash
+    assert!(result.is_some() || result.is_none());
+}
+
+#[test]
+fn test_decode_stream_flush() {
+    let mock_tokenizer = Arc::new(mock::MockTokenizer::new());
+    let tokenizer = Tokenizer::from_arc(mock_tokenizer);
+
+    let initial_tokens = vec![1];
+    let mut stream = tokenizer.decode_stream(&initial_tokens, false);
+
+    // Add tokens
+    stream.step(2).unwrap();
+    stream.step(3).unwrap();
+
+    // Flush remaining
+    let flushed = stream.flush().unwrap();
+    // The flush behavior depends on the implementation
+    assert!(flushed.is_some() || flushed.is_none());
+}
+
+#[test]
+fn test_special_tokens() {
+    let mock_tokenizer = Arc::new(mock::MockTokenizer::new());
+    let tokenizer = Tokenizer::from_arc(mock_tokenizer);
+
+    let special_tokens = tokenizer.get_special_tokens();
+    assert_eq!(special_tokens.bos_token, Some("<bos>".to_string()));
+    assert_eq!(special_tokens.eos_token, Some("<eos>".to_string()));
+    assert_eq!(special_tokens.unk_token, Some("<unk>".to_string()));
+    assert!(special_tokens.sep_token.is_none());
+    assert!(special_tokens.pad_token.is_none());
+}
+
+#[test]
+fn test_batch_encode() {
+    let tokenizer = mock::MockTokenizer::new();
+    let inputs = vec!["Hello", "world", "test"];
+    let encodings = tokenizer.encode_batch(&inputs).unwrap();
+
+    assert_eq!(encodings.len(), 3);
+    assert_eq!(encodings[0].token_ids(), &[1]); // "Hello" -> 1
+    assert_eq!(encodings[1].token_ids(), &[2]); // "world" -> 2
+    assert_eq!(encodings[2].token_ids(), &[3]); // "test" -> 3
+}
+
+#[test]
+fn test_thread_safety() {
+    use std::thread;
+
+    let mock_tokenizer = Arc::new(mock::MockTokenizer::new());
+    let tokenizer = Tokenizer::from_arc(mock_tokenizer);
+
+    // Spawn multiple threads that use the same tokenizer
+    let handles: Vec<_> = (0..10)
+        .map(|i| {
+            let tokenizer_clone = tokenizer.clone();
+            thread::spawn(move || {
+                let text = "Hello test".to_string();
+                let encoding = tokenizer_clone.encode(&text).unwrap();
+                let decoded = tokenizer_clone.decode(encoding.token_ids(), false).unwrap();
+                assert!(decoded.contains("Hello") || decoded.contains("test"));
+                i
+            })
+        })
+        .collect();
+
+    // Wait for all threads to complete
+    for handle in handles {
+        handle.join().unwrap();
+    }
+}
diff --git a/sgl-router/src/tokenizer/tiktoken.rs b/sgl-router/src/tokenizer/tiktoken.rs
new file mode 100644
index 000000000..0af5a9791
--- /dev/null
+++ b/sgl-router/src/tokenizer/tiktoken.rs
@@ -0,0 +1,276 @@
+use super::traits::{
+    Decoder, Encoder, Encoding, SpecialTokens, TokenIdType, Tokenizer as TokenizerTrait,
+};
+use anyhow::{Error, Result};
+use tiktoken_rs::{cl100k_base, p50k_base, p50k_edit, r50k_base, CoreBPE};
+
+/// Tiktoken tokenizer wrapper for OpenAI GPT models
+pub struct TiktokenTokenizer {
+    tokenizer: CoreBPE,
+    #[allow(dead_code)]
+    model: TiktokenModel,
+    special_tokens: SpecialTokens,
+    vocab_size: usize,
+}
+
+/// Supported Tiktoken models
+#[derive(Debug, Clone, Copy)]
+pub enum TiktokenModel {
+    /// GPT-4, GPT-3.5-turbo, text-embedding-ada-002
+    Cl100kBase,
+    /// Codex models, text-davinci-002, text-davinci-003
+    P50kBase,
+    /// Use for edit models like text-davinci-edit-001, code-davinci-edit-001
+    P50kEdit,
+    /// GPT-3 models like davinci
+    R50kBase,
+}
+
+impl TiktokenTokenizer {
+    /// Create a new Tiktoken tokenizer for the specified model
+    pub fn new(model: TiktokenModel) -> Result<Self> {
+        let tokenizer =
+            match model {
+                TiktokenModel::Cl100kBase => cl100k_base()
+                    .map_err(|e| Error::msg(format!("Failed to load cl100k_base: {}", e)))?,
+                TiktokenModel::P50kBase => p50k_base()
+                    .map_err(|e| Error::msg(format!("Failed to load p50k_base: {}", e)))?,
+                TiktokenModel::P50kEdit => p50k_edit()
+                    .map_err(|e| Error::msg(format!("Failed to load p50k_edit: {}", e)))?,
+                TiktokenModel::R50kBase => r50k_base()
+                    .map_err(|e| Error::msg(format!("Failed to load r50k_base: {}", e)))?,
+            };
+
+        // Extract special tokens (tiktoken-rs doesn't expose them directly)
+        // We'll use common ones for GPT models
+        let special_tokens = Self::get_special_tokens_for_model(model);
+
+        // Get vocabulary size (this is an approximation)
+        let vocab_size = match model {
+            TiktokenModel::Cl100kBase => 100256, // cl100k has ~100k tokens
+            TiktokenModel::P50kBase | TiktokenModel::P50kEdit => 50281, // p50k has ~50k tokens
+            TiktokenModel::R50kBase => 50257,    // r50k has ~50k tokens
+        };
+
+        Ok(TiktokenTokenizer {
+            tokenizer,
+            model,
+            special_tokens,
+            vocab_size,
+        })
+    }
+
+    /// Create a tokenizer from a model string (e.g., "gpt-4", "gpt-3.5-turbo")
+    pub fn from_model_name(model_name: &str) -> Result<Self> {
+        let model = Self::model_from_name(model_name)?;
+        Self::new(model)
+    }
+
+    /// Determine the appropriate model from a model name
+    fn model_from_name(model_name: &str) -> Result<TiktokenModel> {
+        // Based on OpenAI's model-to-encoding mapping
+        if model_name.contains("gpt-4")
+            || model_name.contains("gpt-3.5")
+            || model_name.contains("turbo")
+        {
+            Ok(TiktokenModel::Cl100kBase)
+        } else if model_name.contains("davinci-002")
+            || model_name.contains("davinci-003")
+            || model_name.contains("codex")
+        {
+            Ok(TiktokenModel::P50kBase)
+        } else if model_name.contains("edit") {
+            Ok(TiktokenModel::P50kEdit)
+        } else if model_name.contains("davinci")
+            || model_name.contains("curie")
+            || model_name.contains("babbage")
+            || model_name.contains("ada")
+        {
+            Ok(TiktokenModel::R50kBase)
+        } else {
+            // Return an error for unrecognized model names to prevent silent failures
+            Err(anyhow::anyhow!(
+                "Unrecognized OpenAI model name: '{}'. Expected GPT-3, GPT-3.5, GPT-4, or related model names",
+                model_name
+            ))
+        }
+    }
+
+    /// Get special tokens for a specific model
+    fn get_special_tokens_for_model(model: TiktokenModel) -> SpecialTokens {
+        // These are common special tokens for GPT models
+        // The actual token IDs might vary by model
+        match model {
+            TiktokenModel::Cl100kBase => SpecialTokens {
+                bos_token: Some("<|endoftext|>".to_string()),
+                eos_token: Some("<|endoftext|>".to_string()),
+                unk_token: None,
+                sep_token: None,
+                pad_token: Some("<|endoftext|>".to_string()),
+                cls_token: None,
+                mask_token: None,
+                additional_special_tokens: vec![
+                    "<|fim_prefix|>".to_string(),
+                    "<|fim_middle|>".to_string(),
+                    "<|fim_suffix|>".to_string(),
+                    "<|endofprompt|>".to_string(),
+                ],
+            },
+            _ => SpecialTokens {
+                bos_token: Some("<|endoftext|>".to_string()),
+                eos_token: Some("<|endoftext|>".to_string()),
+                unk_token: None,
+                sep_token: None,
+                pad_token: Some("<|endoftext|>".to_string()),
+                cls_token: None,
+                mask_token: None,
+                additional_special_tokens: vec![],
+            },
+        }
+    }
+}
+
+impl Encoder for TiktokenTokenizer {
+    fn encode(&self, input: &str) -> Result<Encoding> {
+        let tokens = self.tokenizer.encode_ordinary(input);
+        Ok(Encoding::Tiktoken(tokens))
+    }
+
+    fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>> {
+        inputs.iter().map(|input| self.encode(input)).collect()
+    }
+}
+
+impl Decoder for TiktokenTokenizer {
+    fn decode(&self, token_ids: &[TokenIdType], _skip_special_tokens: bool) -> Result<String> {
+        // tiktoken-rs 0.7.0 now uses u32 (Rank type)
+        self.tokenizer
+            .decode(token_ids.to_vec())
+            .map_err(|e| Error::msg(format!("Decoding failed: {}", e)))
+    }
+}
+
+impl TokenizerTrait for TiktokenTokenizer {
+    fn vocab_size(&self) -> usize {
+        self.vocab_size
+    }
+
+    fn get_special_tokens(&self) -> &SpecialTokens {
+        &self.special_tokens
+    }
+
+    fn token_to_id(&self, _token: &str) -> Option<TokenIdType> {
+        // Tiktoken doesn't provide direct token-to-id mapping
+        // We'd need to encode the token and check if it produces a single ID
+        None
+    }
+
+    fn id_to_token(&self, _id: TokenIdType) -> Option<String> {
+        // Tiktoken doesn't provide direct id-to-token mapping
+        // We can only decode IDs to text
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_tiktoken_creation() {
+        let tokenizer = TiktokenTokenizer::new(TiktokenModel::Cl100kBase).unwrap();
+        assert_eq!(tokenizer.vocab_size(), 100256);
+    }
+
+    #[test]
+    fn test_model_from_name() {
+        assert!(matches!(
+            TiktokenTokenizer::model_from_name("gpt-4").unwrap(),
+            TiktokenModel::Cl100kBase
+        ));
+        assert!(matches!(
+            TiktokenTokenizer::model_from_name("gpt-3.5-turbo").unwrap(),
+            TiktokenModel::Cl100kBase
+        ));
+        assert!(matches!(
+            TiktokenTokenizer::model_from_name("text-davinci-003").unwrap(),
+            TiktokenModel::P50kBase
+        ));
+        assert!(matches!(
+            TiktokenTokenizer::model_from_name("text-davinci-edit-001").unwrap(),
+            TiktokenModel::P50kEdit
+        ));
+        assert!(matches!(
+            TiktokenTokenizer::model_from_name("davinci").unwrap(),
+            TiktokenModel::R50kBase
+        ));
+    }
+
+    #[test]
+    fn test_encode_decode() {
+        let tokenizer = TiktokenTokenizer::new(TiktokenModel::Cl100kBase).unwrap();
+
+        let text = "Hello, world!";
+        let encoding = tokenizer.encode(text).unwrap();
+
+        let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
+        assert_eq!(decoded, text);
+    }
+
+    #[test]
+    fn test_batch_encode() {
+        let tokenizer = TiktokenTokenizer::new(TiktokenModel::Cl100kBase).unwrap();
+
+        let texts = vec!["Hello", "World", "Test"];
+        let encodings = tokenizer.encode_batch(&texts).unwrap();
+
+        assert_eq!(encodings.len(), 3);
+        for (i, encoding) in encodings.iter().enumerate() {
+            let decoded = tokenizer.decode(encoding.token_ids(), false).unwrap();
+            assert_eq!(decoded, texts[i]);
+        }
+    }
+
+    #[test]
+    fn test_special_tokens() {
+        let tokenizer = TiktokenTokenizer::new(TiktokenModel::Cl100kBase).unwrap();
+        let special_tokens = tokenizer.get_special_tokens();
+
+        assert!(special_tokens.eos_token.is_some());
+        assert_eq!(special_tokens.eos_token.as_ref().unwrap(), "<|endoftext|>");
+    }
+
+    #[test]
+    fn test_unrecognized_model_name_returns_error() {
+        // Test that unrecognized model names return an error
+        let result = TiktokenTokenizer::from_model_name("distilgpt-2");
+        assert!(result.is_err());
+        if let Err(e) = result {
+            assert!(e.to_string().contains("Unrecognized OpenAI model name"));
+        }
+
+        let result = TiktokenTokenizer::from_model_name("bert-base-uncased");
+        assert!(result.is_err());
+        if let Err(e) = result {
+            assert!(e.to_string().contains("Unrecognized OpenAI model name"));
+        }
+
+        let result = TiktokenTokenizer::from_model_name("llama-7b");
+        assert!(result.is_err());
+        if let Err(e) = result {
+            assert!(e.to_string().contains("Unrecognized OpenAI model name"));
+        }
+    }
+
+    #[test]
+    fn test_recognized_model_names() {
+        // Test that recognized model names work correctly
+        assert!(TiktokenTokenizer::from_model_name("gpt-4").is_ok());
+        assert!(TiktokenTokenizer::from_model_name("gpt-3.5-turbo").is_ok());
+        assert!(TiktokenTokenizer::from_model_name("text-davinci-003").is_ok());
+        assert!(TiktokenTokenizer::from_model_name("code-davinci-002").is_ok());
+        assert!(TiktokenTokenizer::from_model_name("text-curie-001").is_ok());
+        assert!(TiktokenTokenizer::from_model_name("text-babbage-001").is_ok());
+        assert!(TiktokenTokenizer::from_model_name("text-ada-001").is_ok());
+    }
+}
diff --git a/sgl-router/src/tokenizer/traits.rs b/sgl-router/src/tokenizer/traits.rs
new file mode 100644
index 000000000..275dd822f
--- /dev/null
+++ b/sgl-router/src/tokenizer/traits.rs
@@ -0,0 +1,83 @@
+use anyhow::Result;
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+
+/// Type alias for token IDs
+pub type TokenIdType = u32;
+
+/// Core encoding trait - separate from decoding for modularity
+pub trait Encoder: Send + Sync {
+    fn encode(&self, input: &str) -> Result<Encoding>;
+    fn encode_batch(&self, inputs: &[&str]) -> Result<Vec<Encoding>>;
+}
+
+/// Core decoding trait - can be implemented independently
+pub trait Decoder: Send + Sync {
+    fn decode(&self, token_ids: &[TokenIdType], skip_special_tokens: bool) -> Result<String>;
+}
+
+/// Combined tokenizer trait
+pub trait Tokenizer: Encoder + Decoder {
+    fn vocab_size(&self) -> usize;
+    fn get_special_tokens(&self) -> &SpecialTokens;
+    fn token_to_id(&self, token: &str) -> Option<TokenIdType>;
+    fn id_to_token(&self, id: TokenIdType) -> Option<String>;
+}
+
+/// Contains the results of tokenizing text: token IDs, string tokens, and their spans
+#[derive(Debug, Clone)]
+pub enum Encoding {
+    /// Hugging Face
+    Hf(Box<tokenizers::tokenizer::Encoding>),
+    /// Sentence Piece
+    Sp(Vec<TokenIdType>),
+    /// Tiktoken (for GPT models) - now uses u32 in tiktoken-rs 0.7.0
+    Tiktoken(Vec<TokenIdType>),
+}
+
+impl Encoding {
+    /// Returns a reference to token IDs - zero-copy operation
+    pub fn token_ids(&self) -> &[TokenIdType] {
+        match self {
+            Encoding::Hf(inner) => inner.get_ids(),
+            Encoding::Sp(inner) => inner,
+            Encoding::Tiktoken(inner) => inner,
+        }
+    }
+
+    /// Deprecated: Use token_ids() instead (kept for compatibility)
+    #[deprecated(since = "0.1.0", note = "Use token_ids() instead")]
+    pub fn token_ids_ref(&self) -> &[TokenIdType] {
+        self.token_ids()
+    }
+
+    /// Get a hash of the token IDs for caching purposes
+    pub fn get_hash(&self) -> u64 {
+        let mut hasher = DefaultHasher::new();
+        self.hash(&mut hasher);
+        hasher.finish()
+    }
+}
+
+/// Hash implementation for Encoding
+impl Hash for Encoding {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        match self {
+            Encoding::Hf(inner) => inner.get_ids().hash(state),
+            Encoding::Sp(inner) => inner.hash(state),
+            Encoding::Tiktoken(inner) => inner.hash(state),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct SpecialTokens {
+    pub bos_token: Option<String>,
+    pub eos_token: Option<String>,
+    pub unk_token: Option<String>,
+    pub sep_token: Option<String>,
+    pub pad_token: Option<String>,
+    pub cls_token: Option<String>,
+    pub mask_token: Option<String>,
+    pub additional_special_tokens: Vec<String>,
+}
diff --git a/sgl-router/src/tool_parser/errors.rs b/sgl-router/src/tool_parser/errors.rs
new file mode 100644
index 000000000..30129a596
--- /dev/null
+++ b/sgl-router/src/tool_parser/errors.rs
@@ -0,0 +1,32 @@
+use thiserror::Error;
+
+/// Result type for tool parser operations
+pub type ToolParserResult<T> = Result<T, ToolParserError>;
+
+/// Errors that can occur during tool parsing
+#[derive(Debug, Error)]
+pub enum ToolParserError {
+    #[error("Parsing failed: {0}")]
+    ParsingFailed(String),
+
+    #[error("Model not supported: {0}")]
+    ModelNotSupported(String),
+
+    #[error("Parse depth exceeded: max {0}")]
+    DepthExceeded(usize),
+
+    #[error("Invalid JSON: {0}")]
+    JsonError(#[from] serde_json::Error),
+
+    #[error("Regex error: {0}")]
+    RegexError(#[from] regex::Error),
+
+    #[error("Incomplete tool call")]
+    Incomplete,
+
+    #[error("Invalid tool name: {0}")]
+    InvalidToolName(String),
+
+    #[error("Token not found: {0}")]
+    TokenNotFound(String),
+}
diff --git a/sgl-router/src/tool_parser/mod.rs b/sgl-router/src/tool_parser/mod.rs
new file mode 100644
index 000000000..41b8fae2f
--- /dev/null
+++ b/sgl-router/src/tool_parser/mod.rs
@@ -0,0 +1,30 @@
+/// Tool parser module for handling function/tool calls in model outputs
+///
+/// This module provides infrastructure for parsing tool calls from various model formats.
+// Core modules
+pub mod errors;
+pub mod partial_json;
+pub mod python_literal_parser;
+pub mod registry;
+pub mod state;
+pub mod traits;
+pub mod types;
+
+// Parser implementations
+pub mod parsers;
+
+#[cfg(test)]
+mod tests;
+
+// Re-export commonly used types
+pub use errors::{ToolParserError, ToolParserResult};
+pub use registry::ParserRegistry;
+pub use state::{ParsePhase, ParseState};
+pub use traits::{PartialJsonParser, ToolParser};
+pub use types::{FunctionCall, PartialToolCall, StreamResult, TokenConfig, ToolCall};
+
+// Re-export parsers for convenience
+pub use parsers::{
+    DeepSeekParser, Glm4MoeParser, GptOssParser, JsonParser, KimiK2Parser, LlamaParser,
+    MistralParser, PythonicParser, QwenParser, Step3Parser,
+};
diff --git a/sgl-router/src/tool_parser/parsers/deepseek_parser.rs b/sgl-router/src/tool_parser/parsers/deepseek_parser.rs
new file mode 100644
index 000000000..5e467bf2b
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/deepseek_parser.rs
@@ -0,0 +1,277 @@
+use async_trait::async_trait;
+use regex::Regex;
+use serde_json::Value;
+
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    partial_json::PartialJson,
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// DeepSeek V3 format parser for tool calls
+///
+/// Handles the DeepSeek V3 specific format that uses Unicode tokens:
+/// `<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>{name}\n```json\n{args}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>`
+///
+/// Features:
+/// - Unicode token delimiters
+/// - JSON arguments in code blocks
+/// - Support for multiple sequential tool calls
+pub struct DeepSeekParser {
+    /// Parser for handling incomplete JSON during streaming
+    partial_json: PartialJson,
+    /// Regex for extracting complete tool calls
+    tool_call_extractor: Regex,
+    /// Regex for extracting function details
+    func_detail_extractor: Regex,
+}
+
+impl DeepSeekParser {
+    /// Create a new DeepSeek parser
+    pub fn new() -> Self {
+        // Use (?s) flag for DOTALL mode to handle newlines
+        let tool_call_pattern = r"(?s)<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>";
+        let tool_call_extractor = Regex::new(tool_call_pattern).expect("Valid regex pattern");
+
+        let func_detail_pattern = r"(?s)<｜tool▁call▁begin｜>(.*?)<｜tool▁sep｜>(.*?)\n```json\n(.*?)\n```<｜tool▁call▁end｜>";
+        let func_detail_extractor = Regex::new(func_detail_pattern).expect("Valid regex pattern");
+
+        Self {
+            partial_json: PartialJson::default(),
+            tool_call_extractor,
+            func_detail_extractor,
+        }
+    }
+
+    /// Check if text contains DeepSeek tool markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        text.contains("<｜tool▁calls▁begin｜>")
+    }
+
+    /// Extract all tool call blocks from text
+    fn extract_tool_calls<'a>(&self, text: &'a str) -> Vec<&'a str> {
+        self.tool_call_extractor
+            .find_iter(text)
+            .map(|m| m.as_str())
+            .collect()
+    }
+
+    /// Parse a single tool call block
+    fn parse_tool_call(&self, block: &str) -> ToolParserResult<Option<ToolCall>> {
+        if let Some(captures) = self.func_detail_extractor.captures(block) {
+            // Get function type (should be "function")
+            let func_type = captures.get(1).map_or("", |m| m.as_str());
+            if func_type != "function" {
+                return Ok(None);
+            }
+
+            // Get function name
+            let func_name = captures.get(2).map_or("", |m| m.as_str()).trim();
+
+            // Get JSON arguments
+            let json_args = captures.get(3).map_or("{}", |m| m.as_str()).trim();
+
+            // Parse JSON arguments
+            match serde_json::from_str::<Value>(json_args) {
+                Ok(value) => {
+                    // Create arguments object
+                    let args = if value.is_object() {
+                        value
+                    } else {
+                        // If not an object, wrap it
+                        serde_json::json!({ "value": value })
+                    };
+
+                    let arguments = serde_json::to_string(&args)
+                        .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;
+
+                    // Generate ID
+                    let id = format!("deepseek_call_{}", uuid::Uuid::new_v4());
+
+                    Ok(Some(ToolCall {
+                        id,
+                        r#type: "function".to_string(),
+                        function: FunctionCall {
+                            name: func_name.to_string(),
+                            arguments,
+                        },
+                    }))
+                }
+                Err(_) => Ok(None),
+            }
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+impl Default for DeepSeekParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for DeepSeekParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if text contains DeepSeek format
+        if !self.has_tool_markers(text) {
+            return Ok(vec![]);
+        }
+
+        // Extract all tool call blocks
+        let tool_blocks = self.extract_tool_calls(text);
+        let mut tools = Vec::new();
+
+        for block in tool_blocks {
+            if let Some(tool) = self.parse_tool_call(block)? {
+                tools.push(tool);
+            }
+        }
+
+        Ok(tools)
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check for tool markers
+        if !self.has_tool_markers(&state.buffer) {
+            // No markers found, return as incomplete
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Look for start of tool calls
+        if let Some(start_pos) = state.buffer.find("<｜tool▁calls▁begin｜>") {
+            // Look for individual tool call start
+            let search_from = start_pos + "<｜tool▁calls▁begin｜>".len();
+            if let Some(call_start) = state.buffer[search_from..].find("<｜tool▁call▁begin｜>")
+            {
+                let call_start_abs = search_from + call_start;
+
+                // Look for the end of this tool call
+                let search_end_from = call_start_abs + "<｜tool▁call▁begin｜>".len();
+                if let Some(call_end) = state.buffer[search_end_from..].find("<｜tool▁call▁end｜>")
+                {
+                    let call_end_abs = search_end_from + call_end + "<｜tool▁call▁end｜>".len();
+
+                    // Extract and parse the complete tool call
+                    let tool_call_text = &state.buffer[call_start_abs..call_end_abs];
+
+                    if let Some(tool) = self.parse_tool_call(tool_call_text)? {
+                        // Remove the processed part from buffer
+                        state.buffer.drain(..call_end_abs);
+
+                        return Ok(StreamResult::ToolComplete(tool));
+                    }
+                } else {
+                    // Tool call not complete yet, try to extract partial info
+                    let partial = &state.buffer[search_end_from..];
+
+                    // Try to extract function name
+                    if let Some(sep_pos) = partial.find("<｜tool▁sep｜>") {
+                        if let Some(_func_start) = partial[..sep_pos].rfind("function") {
+                            // We have the function type marker
+                            let after_sep = &partial[sep_pos + "<｜tool▁sep｜>".len()..];
+
+                            // Look for function name (ends at newline before ```json)
+                            if let Some(name_end) = after_sep.find("\n```json\n") {
+                                let func_name = after_sep[..name_end].trim();
+
+                                if !state.in_string {
+                                    state.in_string = true; // Mark name as sent
+                                    return Ok(StreamResult::ToolName {
+                                        index: 0,
+                                        name: func_name.to_string(),
+                                    });
+                                }
+
+                                // Try to extract partial arguments
+                                let args_start = name_end + "\n```json\n".len();
+                                let partial_args = &after_sep[args_start..];
+
+                                // Check if we can parse partial JSON
+                                if !partial_args.is_empty() {
+                                    match self.partial_json.parse_value(partial_args) {
+                                        Ok((value, _consumed)) => {
+                                            let args_str = serde_json::to_string(&value)
+                                                .unwrap_or_else(|_| "{}".to_string());
+
+                                            return Ok(StreamResult::ToolArguments {
+                                                index: 0,
+                                                arguments: args_str,
+                                            });
+                                        }
+                                        Err(_) => {
+                                            // Can't parse yet, keep buffering
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        self.has_tool_markers(text)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_deepseek_single_tool() {
+        let parser = DeepSeekParser::new();
+        let input = r#"Some text
+<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather
+```json
+{"location": "Tokyo", "units": "celsius"}
+```<｜tool▁call▁end｜><｜tool▁calls▁end｜>More text"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Tokyo"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_deepseek_multiple_tools() {
+        let parser = DeepSeekParser::new();
+        let input = r#"<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather
+```json
+{"location": "Tokyo"}
+```<｜tool▁call▁end｜>
+<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather
+```json
+{"location": "Paris"}
+```<｜tool▁call▁end｜><｜tool▁calls▁end｜>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert_eq!(result[1].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Tokyo"));
+        assert!(result[1].function.arguments.contains("Paris"));
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = DeepSeekParser::new();
+        assert!(parser.detect_format("<｜tool▁calls▁begin｜>"));
+        assert!(!parser.detect_format("plain text"));
+        assert!(!parser.detect_format("[TOOL_CALLS]"));
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/glm4_moe_parser.rs b/sgl-router/src/tool_parser/parsers/glm4_moe_parser.rs
new file mode 100644
index 000000000..017de1256
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/glm4_moe_parser.rs
@@ -0,0 +1,292 @@
+use async_trait::async_trait;
+use regex::Regex;
+use serde_json::Value;
+
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// GLM-4 MoE format parser for tool calls
+///
+/// Handles the GLM-4 MoE specific format:
+/// `<tool_call>{name}\n<arg_key>{key}</arg_key>\n<arg_value>{value}</arg_value>\n</tool_call>`
+///
+/// Features:
+/// - XML-style tags for tool calls
+/// - Key-value pairs for arguments
+/// - Support for multiple sequential tool calls
+pub struct Glm4MoeParser {
+    /// Regex for extracting complete tool calls
+    tool_call_extractor: Regex,
+    /// Regex for extracting function details
+    func_detail_extractor: Regex,
+    /// Regex for extracting argument key-value pairs
+    arg_extractor: Regex,
+}
+
+impl Glm4MoeParser {
+    /// Create a new GLM-4 MoE parser
+    pub fn new() -> Self {
+        // Use (?s) flag for DOTALL mode to handle newlines
+        let tool_call_pattern = r"(?s)<tool_call>.*?</tool_call>";
+        let tool_call_extractor = Regex::new(tool_call_pattern).expect("Valid regex pattern");
+
+        let func_detail_pattern = r"(?s)<tool_call>([^\n]*)\n(.*)</tool_call>";
+        let func_detail_extractor = Regex::new(func_detail_pattern).expect("Valid regex pattern");
+
+        let arg_pattern = r"(?s)<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>";
+        let arg_extractor = Regex::new(arg_pattern).expect("Valid regex pattern");
+
+        Self {
+            tool_call_extractor,
+            func_detail_extractor,
+            arg_extractor,
+        }
+    }
+
+    /// Check if text contains GLM-4 MoE tool markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        text.contains("<tool_call>")
+    }
+
+    /// Parse arguments from key-value pairs
+    fn parse_arguments(&self, args_text: &str) -> ToolParserResult<serde_json::Map<String, Value>> {
+        let mut arguments = serde_json::Map::new();
+
+        for capture in self.arg_extractor.captures_iter(args_text) {
+            let key = capture.get(1).map_or("", |m| m.as_str()).trim();
+            let value_str = capture.get(2).map_or("", |m| m.as_str()).trim();
+
+            // Try to parse the value as JSON first, fallback to string
+            let value = if let Ok(json_val) = serde_json::from_str::<Value>(value_str) {
+                json_val
+            } else {
+                // Try parsing as Python literal (similar to Python's ast.literal_eval)
+                if value_str == "true" || value_str == "True" {
+                    Value::Bool(true)
+                } else if value_str == "false" || value_str == "False" {
+                    Value::Bool(false)
+                } else if value_str == "null" || value_str == "None" {
+                    Value::Null
+                } else if let Ok(num) = value_str.parse::<i64>() {
+                    Value::Number(num.into())
+                } else if let Ok(num) = value_str.parse::<f64>() {
+                    if let Some(n) = serde_json::Number::from_f64(num) {
+                        Value::Number(n)
+                    } else {
+                        Value::String(value_str.to_string())
+                    }
+                } else {
+                    Value::String(value_str.to_string())
+                }
+            };
+
+            arguments.insert(key.to_string(), value);
+        }
+
+        Ok(arguments)
+    }
+
+    /// Parse a single tool call block
+    fn parse_tool_call(&self, block: &str) -> ToolParserResult<Option<ToolCall>> {
+        if let Some(captures) = self.func_detail_extractor.captures(block) {
+            // Get function name
+            let func_name = captures.get(1).map_or("", |m| m.as_str()).trim();
+
+            // Get arguments text
+            let args_text = captures.get(2).map_or("", |m| m.as_str());
+
+            // Parse arguments
+            let arguments = self.parse_arguments(args_text)?;
+
+            let arguments_str = serde_json::to_string(&arguments)
+                .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;
+
+            // Generate ID
+            let id = format!("glm4_call_{}", uuid::Uuid::new_v4());
+
+            Ok(Some(ToolCall {
+                id,
+                r#type: "function".to_string(),
+                function: FunctionCall {
+                    name: func_name.to_string(),
+                    arguments: arguments_str,
+                },
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+impl Default for Glm4MoeParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for Glm4MoeParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if text contains GLM-4 MoE format
+        if !self.has_tool_markers(text) {
+            return Ok(vec![]);
+        }
+
+        // Extract all tool call blocks
+        let mut tools = Vec::new();
+        for mat in self.tool_call_extractor.find_iter(text) {
+            if let Some(tool) = self.parse_tool_call(mat.as_str())? {
+                tools.push(tool);
+            }
+        }
+
+        Ok(tools)
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check for tool markers
+        if !self.has_tool_markers(&state.buffer) {
+            // No markers found, return as incomplete
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Look for start of tool call
+        if let Some(start_pos) = state.buffer.find("<tool_call>") {
+            // Look for the end of this tool call
+            let search_from = start_pos + "<tool_call>".len();
+            if let Some(end_pos) = state.buffer[search_from..].find("</tool_call>") {
+                let end_abs = search_from + end_pos + "</tool_call>".len();
+
+                // Extract and parse the complete tool call
+                let tool_call_text = &state.buffer[start_pos..end_abs];
+
+                if let Some(tool) = self.parse_tool_call(tool_call_text)? {
+                    // Remove the processed part from buffer
+                    state.buffer.drain(..end_abs);
+
+                    return Ok(StreamResult::ToolComplete(tool));
+                }
+            } else {
+                // Tool call not complete yet, try to extract partial info
+                let partial = &state.buffer[search_from..];
+
+                // Try to extract function name (first line after <tool_call>)
+                if let Some(name_end) = partial.find('\n') {
+                    let func_name = partial[..name_end].trim();
+
+                    if !func_name.is_empty() && !state.in_string {
+                        state.in_string = true; // Mark name as sent
+                        return Ok(StreamResult::ToolName {
+                            index: 0,
+                            name: func_name.to_string(),
+                        });
+                    }
+
+                    // Try to extract partial arguments
+                    let args_text = &partial[name_end + 1..];
+                    let partial_args = self.parse_arguments(args_text)?;
+
+                    if !partial_args.is_empty() {
+                        let args_str = serde_json::to_string(&partial_args)
+                            .unwrap_or_else(|_| "{}".to_string());
+
+                        return Ok(StreamResult::ToolArguments {
+                            index: 0,
+                            arguments: args_str,
+                        });
+                    }
+                }
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        self.has_tool_markers(text)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_glm4_single_tool() {
+        let parser = Glm4MoeParser::new();
+        let input = r#"Some text
+<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+<arg_key>date</arg_key>
+<arg_value>2024-06-27</arg_value>
+</tool_call>More text"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Beijing"));
+        assert!(result[0].function.arguments.contains("2024-06-27"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_glm4_multiple_tools() {
+        let parser = Glm4MoeParser::new();
+        let input = r#"<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+</tool_call>
+<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Shanghai</arg_value>
+</tool_call>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert_eq!(result[1].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Beijing"));
+        assert!(result[1].function.arguments.contains("Shanghai"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_glm4_mixed_types() {
+        let parser = Glm4MoeParser::new();
+        let input = r#"<tool_call>process_data
+<arg_key>count</arg_key>
+<arg_value>42</arg_value>
+<arg_key>active</arg_key>
+<arg_value>true</arg_value>
+<arg_key>name</arg_key>
+<arg_value>test</arg_value>
+</tool_call>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "process_data");
+
+        // Parse arguments to check types
+        let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+        assert_eq!(args["count"], 42);
+        assert_eq!(args["active"], true);
+        assert_eq!(args["name"], "test");
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = Glm4MoeParser::new();
+        assert!(parser.detect_format("<tool_call>"));
+        assert!(!parser.detect_format("plain text"));
+        assert!(!parser.detect_format("[TOOL_CALLS]"));
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/gpt_oss_parser.rs b/sgl-router/src/tool_parser/parsers/gpt_oss_parser.rs
new file mode 100644
index 000000000..646161a72
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/gpt_oss_parser.rs
@@ -0,0 +1,292 @@
+use async_trait::async_trait;
+use regex::Regex;
+use serde_json::Value;
+
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    partial_json::PartialJson,
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// GPT-OSS format parser for tool calls
+///
+/// Handles the GPT-OSS specific channel format:
+/// `<|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{json_args}<|call|>`
+///
+/// Features:
+/// - Channel-based format with commentary
+/// - Namespaced function calls
+/// - JSON arguments
+pub struct GptOssParser {
+    /// Parser for handling incomplete JSON during streaming
+    partial_json: PartialJson,
+    /// Regex for extracting complete function calls
+    function_call_extractor: Regex,
+    /// Regex for extracting streaming function calls
+    streaming_extractor: Regex,
+}
+
+impl GptOssParser {
+    /// Create a new GPT-OSS parser
+    pub fn new() -> Self {
+        // Pattern for complete function calls with to= parameter
+        // Handles optional <|start|>assistant prefix and whitespace after function name
+        let function_call_pattern = r"(?s)(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*<\|constrain\|>json<\|message\|>(.*?)<\|call\|>(?:commentary)?";
+        let function_call_extractor =
+            Regex::new(function_call_pattern).expect("Valid regex pattern");
+
+        // Pattern for streaming function calls (incomplete)
+        let streaming_pattern = r"(?s)(?:<\|start\|>assistant)?<\|channel\|>commentary to=([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)*)\s*<\|constrain\|>json<\|message\|>(.*)";
+        let streaming_extractor = Regex::new(streaming_pattern).expect("Valid regex pattern");
+
+        Self {
+            partial_json: PartialJson::default(),
+            function_call_extractor,
+            streaming_extractor,
+        }
+    }
+
+    /// Check if text contains GPT-OSS tool markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        text.contains("<|channel|>commentary to=")
+    }
+
+    /// Extract function name from full namespace (e.g., "functions.get_weather" -> "get_weather")
+    fn extract_function_name(&self, full_name: &str) -> String {
+        if let Some(dot_pos) = full_name.rfind('.') {
+            full_name[dot_pos + 1..].to_string()
+        } else {
+            full_name.to_string()
+        }
+    }
+}
+
+impl Default for GptOssParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for GptOssParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if text contains GPT-OSS format
+        if !self.has_tool_markers(text) {
+            return Ok(vec![]);
+        }
+
+        let mut tools = Vec::new();
+        let mut _tool_index = 0;
+
+        // Extract all function calls
+        for captures in self.function_call_extractor.captures_iter(text) {
+            if let (Some(name_match), Some(args_match)) = (captures.get(1), captures.get(2)) {
+                let full_function_name = name_match.as_str();
+                let args_content = args_match.as_str().trim();
+
+                // Extract actual function name
+                let function_name = self.extract_function_name(full_function_name);
+
+                // Parse JSON arguments
+                let arguments = if args_content.is_empty() {
+                    "{}".to_string()
+                } else {
+                    match serde_json::from_str::<Value>(args_content) {
+                        Ok(value) => serde_json::to_string(&value)
+                            .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?,
+                        Err(_) => {
+                            // Skip malformed JSON
+                            continue;
+                        }
+                    }
+                };
+
+                // Generate unique ID
+                let id = format!("gpt_oss_call_{}", uuid::Uuid::new_v4());
+
+                tools.push(ToolCall {
+                    id,
+                    r#type: "function".to_string(),
+                    function: FunctionCall {
+                        name: function_name,
+                        arguments,
+                    },
+                });
+
+                _tool_index += 1;
+            }
+        }
+
+        Ok(tools)
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check for tool markers
+        if !self.has_tool_markers(&state.buffer) {
+            // No markers found, clear buffer and return
+            state.buffer.clear();
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Try to match streaming pattern
+        if let Some(captures) = self.streaming_extractor.captures(&state.buffer) {
+            if let (Some(name_match), Some(args_match)) = (captures.get(1), captures.get(2)) {
+                let full_function_name = name_match.as_str();
+                let partial_args = args_match.as_str();
+
+                // Extract actual function name
+                let function_name = self.extract_function_name(full_function_name);
+
+                // Send function name if not sent yet
+                if !state.in_string {
+                    state.in_string = true; // Mark name as sent
+                    return Ok(StreamResult::ToolName {
+                        index: 0,
+                        name: function_name.clone(),
+                    });
+                }
+
+                // Check if we have a complete function call
+                if let Some(complete_match) = self.function_call_extractor.captures(&state.buffer) {
+                    if let Some(args_match) = complete_match.get(2) {
+                        let args_content = args_match.as_str().trim();
+
+                        // Parse JSON arguments
+                        let arguments = if args_content.is_empty() {
+                            "{}".to_string()
+                        } else {
+                            match serde_json::from_str::<Value>(args_content) {
+                                Ok(value) => serde_json::to_string(&value)
+                                    .unwrap_or_else(|_| "{}".to_string()),
+                                Err(_) => "{}".to_string(),
+                            }
+                        };
+
+                        // Generate unique ID
+                        let id = format!("gpt_oss_call_{}", uuid::Uuid::new_v4());
+
+                        let tool = ToolCall {
+                            id,
+                            r#type: "function".to_string(),
+                            function: FunctionCall {
+                                name: function_name,
+                                arguments,
+                            },
+                        };
+
+                        // Remove the processed part from buffer
+                        let complete_end = complete_match.get(0).unwrap().end();
+                        state.buffer.drain(..complete_end);
+
+                        // Reset state for next tool
+                        state.in_string = false;
+
+                        return Ok(StreamResult::ToolComplete(tool));
+                    }
+                } else {
+                    // Try to parse partial JSON for streaming arguments
+                    if !partial_args.is_empty() {
+                        // Look for the end of JSON (before <|call|>)
+                        let json_part = if let Some(call_pos) = partial_args.find("<|call|>") {
+                            &partial_args[..call_pos]
+                        } else {
+                            partial_args
+                        };
+
+                        match self.partial_json.parse_value(json_part) {
+                            Ok((value, _consumed)) => {
+                                let args_str = serde_json::to_string(&value)
+                                    .unwrap_or_else(|_| "{}".to_string());
+
+                                return Ok(StreamResult::ToolArguments {
+                                    index: 0,
+                                    arguments: args_str,
+                                });
+                            }
+                            Err(_) => {
+                                // Can't parse yet, keep buffering
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        self.has_tool_markers(text) || text.contains("<|channel|>commentary")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_gpt_oss_single_tool() {
+        let parser = GptOssParser::new();
+        let input = r#"Some text
+<|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location": "San Francisco"}<|call|>
+More text"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("San Francisco"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_gpt_oss_multiple_tools() {
+        let parser = GptOssParser::new();
+        let input = r#"<|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location": "Paris"}<|call|>commentary
+<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query": "Paris tourism"}<|call|>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert_eq!(result[1].function.name, "search");
+        assert!(result[0].function.arguments.contains("Paris"));
+        assert!(result[1].function.arguments.contains("Paris tourism"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_gpt_oss_with_prefix() {
+        let parser = GptOssParser::new();
+        let input = r#"<|start|>assistant<|channel|>commentary to=functions.test<|constrain|>json<|message|>{"key": "value"}<|call|>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test");
+    }
+
+    #[tokio::test]
+    async fn test_parse_gpt_oss_empty_args() {
+        let parser = GptOssParser::new();
+        let input =
+            r#"<|channel|>commentary to=functions.get_time<|constrain|>json<|message|>{}<|call|>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_time");
+        assert_eq!(result[0].function.arguments, "{}");
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = GptOssParser::new();
+        assert!(parser.detect_format("<|channel|>commentary to="));
+        assert!(parser.detect_format("<|channel|>commentary"));
+        assert!(!parser.detect_format("plain text"));
+        assert!(!parser.detect_format("[TOOL_CALLS]"));
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/json_parser.rs b/sgl-router/src/tool_parser/parsers/json_parser.rs
new file mode 100644
index 000000000..b8430dc9e
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/json_parser.rs
@@ -0,0 +1,619 @@
+use async_trait::async_trait;
+use regex::Regex;
+use serde_json::Value;
+
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    partial_json::PartialJson,
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, TokenConfig, ToolCall},
+};
+
+/// JSON format parser for tool calls
+///
+/// Handles various JSON formats for function calling:
+/// - Single tool call: {"name": "fn", "arguments": {...}}
+/// - Multiple tool calls: [{"name": "fn1", "arguments": {...}}, ...]
+/// - With parameters instead of arguments: {"name": "fn", "parameters": {...}}
+///
+/// Supports configurable token markers for different models
+pub struct JsonParser {
+    /// Token configuration for parsing
+    token_config: TokenConfig,
+    /// Parser for handling incomplete JSON during streaming
+    partial_json: PartialJson,
+    /// Regex patterns for extracting content between tokens
+    extractors: Vec<Regex>,
+}
+
+impl JsonParser {
+    /// Create a new JSON parser with default configuration
+    pub fn new() -> Self {
+        Self::with_config(TokenConfig {
+            start_tokens: vec![],
+            end_tokens: vec![],
+            separator: ", ".to_string(),
+        })
+    }
+
+    /// Create a parser with custom token configuration
+    pub fn with_config(config: TokenConfig) -> Self {
+        // Build extraction patterns for each token pair
+        let extractors: Vec<Regex> = config
+            .iter_pairs()
+            .filter_map(|(start, end)| {
+                if !start.is_empty() && !end.is_empty() {
+                    // Use (?s) flag to enable DOTALL mode so . matches newlines
+                    let pattern =
+                        format!(r"(?s){}(.*?){}", regex::escape(start), regex::escape(end));
+                    Regex::new(&pattern).ok()
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        Self {
+            token_config: config,
+            partial_json: PartialJson::default(),
+            extractors,
+        }
+    }
+
+    /// Extract JSON content from text, handling wrapper tokens if configured
+    fn extract_json_content<'a>(&self, text: &'a str) -> &'a str {
+        let mut content = text;
+
+        // Try each extractor pattern (for tokens with both start and end)
+        for extractor in &self.extractors {
+            if let Some(captures) = extractor.captures(content) {
+                if let Some(matched) = captures.get(1) {
+                    return matched.as_str().trim();
+                }
+            }
+        }
+
+        // Handle special case where there's a start token but no end token
+        for (start, end) in self.token_config.iter_pairs() {
+            if !start.is_empty() && end.is_empty() {
+                // Find the start token and extract everything after it
+                if let Some(pos) = content.find(start) {
+                    content = &content[pos + start.len()..];
+                    return content.trim();
+                }
+            }
+        }
+
+        content.trim()
+    }
+
+    /// Try to extract a JSON object or array from text that may contain other content
+    fn extract_json_from_text(&self, text: &str) -> Option<String> {
+        // Look for JSON object starting with {
+        if let Some(start) = text.find('{') {
+            let mut depth = 0;
+            let mut in_string = false;
+            let mut escape_next = false;
+
+            for (i, ch) in text[start..].char_indices() {
+                if escape_next {
+                    escape_next = false;
+                    continue;
+                }
+
+                match ch {
+                    '\\' if in_string => escape_next = true,
+                    '"' if !in_string => in_string = true,
+                    '"' if in_string => in_string = false,
+                    '{' if !in_string => depth += 1,
+                    '}' if !in_string => {
+                        depth -= 1;
+                        if depth == 0 {
+                            return Some(text[start..start + i + 1].to_string());
+                        }
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        // Look for JSON array starting with [
+        if let Some(start) = text.find('[') {
+            let mut depth = 0;
+            let mut in_string = false;
+            let mut escape_next = false;
+
+            for (i, ch) in text[start..].char_indices() {
+                if escape_next {
+                    escape_next = false;
+                    continue;
+                }
+
+                match ch {
+                    '\\' if in_string => escape_next = true,
+                    '"' if !in_string => in_string = true,
+                    '"' if in_string => in_string = false,
+                    '[' if !in_string => depth += 1,
+                    ']' if !in_string => {
+                        depth -= 1;
+                        if depth == 0 {
+                            return Some(text[start..start + i + 1].to_string());
+                        }
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        None
+    }
+
+    /// Parse a single JSON object into a ToolCall
+    fn parse_single_object(&self, obj: &Value) -> ToolParserResult<Option<ToolCall>> {
+        // Check if this looks like a tool call
+        let name = obj
+            .get("name")
+            .or_else(|| obj.get("function"))
+            .and_then(|v| v.as_str());
+
+        if let Some(name) = name {
+            // Get arguments - support both "arguments" and "parameters" keys
+            let empty_obj = Value::Object(serde_json::Map::new());
+            let args = obj
+                .get("arguments")
+                .or_else(|| obj.get("parameters"))
+                .unwrap_or(&empty_obj);
+
+            // Convert arguments to JSON string
+            let arguments = serde_json::to_string(args)
+                .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;
+
+            // Generate a unique ID if not provided
+            let id = obj
+                .get("id")
+                .and_then(|v| v.as_str())
+                .map(String::from)
+                .unwrap_or_else(|| format!("call_{}", uuid::Uuid::new_v4()));
+
+            Ok(Some(ToolCall {
+                id,
+                r#type: "function".to_string(),
+                function: FunctionCall {
+                    name: name.to_string(),
+                    arguments,
+                },
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Parse JSON value(s) into tool calls
+    fn parse_json_value(&self, value: &Value) -> ToolParserResult<Vec<ToolCall>> {
+        let mut tools = Vec::new();
+
+        match value {
+            Value::Array(arr) => {
+                // Parse each element in the array
+                for item in arr {
+                    if let Some(tool) = self.parse_single_object(item)? {
+                        tools.push(tool);
+                    }
+                }
+            }
+            Value::Object(_) => {
+                // Single tool call
+                if let Some(tool) = self.parse_single_object(value)? {
+                    tools.push(tool);
+                }
+            }
+            _ => {
+                // Not a valid tool call format
+                return Ok(vec![]);
+            }
+        }
+
+        Ok(tools)
+    }
+
+    /// Check if text contains potential tool call markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        // If no start tokens configured, check for JSON structure
+        if self.token_config.start_tokens.is_empty() {
+            // For JSON, we just need to see the start of an object or array
+            return text.contains('{') || text.contains('[');
+        }
+
+        // Check for any start token
+        self.token_config
+            .start_tokens
+            .iter()
+            .any(|token| text.contains(token))
+    }
+}
+
+impl Default for JsonParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for JsonParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if we have multiple start tokens (e.g., multiple <|python_tag|> markers)
+        if !self.token_config.start_tokens.is_empty() {
+            let start_token = &self.token_config.start_tokens[0];
+            if !start_token.is_empty() && text.matches(start_token).count() > 1 {
+                // We have multiple occurrences of the start token
+                let mut all_tools = Vec::new();
+                let mut remaining = text;
+
+                while let Some(start_pos) = remaining.find(start_token.as_str()) {
+                    // Extract content after this start token
+                    let after_token = &remaining[start_pos + start_token.len()..];
+
+                    // Find where this JSON ends (look for the next start token or end of string)
+                    let end_pos = if let Some(next_start) = after_token.find(start_token.as_str()) {
+                        next_start
+                    } else {
+                        after_token.len()
+                    };
+
+                    let json_content = &after_token[..end_pos];
+
+                    // Try to extract and parse JSON from this segment
+                    if let Some(extracted) = self.extract_json_from_text(json_content) {
+                        if let Ok(value) = serde_json::from_str::<Value>(&extracted) {
+                            if let Ok(tools) = self.parse_json_value(&value) {
+                                all_tools.extend(tools);
+                            }
+                        }
+                    }
+
+                    // Move to the next segment
+                    remaining = &remaining[start_pos + start_token.len() + end_pos..];
+                    if remaining.is_empty() {
+                        break;
+                    }
+                }
+
+                if !all_tools.is_empty() {
+                    return Ok(all_tools);
+                }
+            }
+        }
+
+        // Extract JSON content from wrapper tokens if present
+        let json_content = self.extract_json_content(text);
+
+        // Try to parse as JSON first
+        match serde_json::from_str::<Value>(json_content) {
+            Ok(value) => self.parse_json_value(&value),
+            Err(_) => {
+                // If parse failed, check if we have multiple JSON objects separated by the configured separator
+                // This handles cases like: {"name": "func1", ...};{"name": "func2", ...}
+                if !self.token_config.separator.is_empty()
+                    && json_content.contains(&self.token_config.separator)
+                {
+                    let mut all_tools = Vec::new();
+
+                    // Split by separator and try to parse each part
+                    let parts: Vec<&str> =
+                        json_content.split(&self.token_config.separator).collect();
+                    for part in parts {
+                        let trimmed = part.trim();
+                        if trimmed.is_empty() {
+                            continue;
+                        }
+
+                        // Try to parse this part as JSON
+                        if let Ok(value) = serde_json::from_str::<Value>(trimmed) {
+                            if let Ok(tools) = self.parse_json_value(&value) {
+                                all_tools.extend(tools);
+                            }
+                        } else if let Some(extracted) = self.extract_json_from_text(trimmed) {
+                            // Try extracting JSON from this part
+                            if let Ok(value) = serde_json::from_str::<Value>(&extracted) {
+                                if let Ok(tools) = self.parse_json_value(&value) {
+                                    all_tools.extend(tools);
+                                }
+                            }
+                        }
+                    }
+
+                    if !all_tools.is_empty() {
+                        return Ok(all_tools);
+                    }
+                }
+
+                // If no wrapper tokens configured and parse failed,
+                // try to extract JSON from mixed text
+                if self.token_config.start_tokens.is_empty() {
+                    if let Some(extracted) = self.extract_json_from_text(text) {
+                        if let Ok(value) = serde_json::from_str::<Value>(&extracted) {
+                            return self.parse_json_value(&value);
+                        }
+                    }
+                }
+                // Not valid JSON, return empty
+                Ok(vec![])
+            }
+        }
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check if we have potential tool calls
+        if !self.has_tool_markers(&state.buffer) {
+            // No tool markers, return as incomplete
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Extract JSON content first to check for separators
+        let extracted_json = self.extract_json_content(&state.buffer);
+
+        // Handle multiple JSON objects with separators
+        // Check if we have a separator and potentially multiple JSON objects
+        let separator = &self.token_config.separator;
+        if !separator.is_empty() && extracted_json.contains(separator.as_str()) {
+            // Try to find a complete JSON object before the separator
+            if let Some(separator_pos) = extracted_json.find(separator.as_str()) {
+                // Get JSON before separator
+                let before_separator = &extracted_json[..separator_pos];
+
+                // Try to parse the JSON before the separator
+                match serde_json::from_str::<Value>(before_separator) {
+                    Ok(value) => {
+                        // Parse tool calls from this JSON
+                        let tools = self.parse_json_value(&value)?;
+                        if !tools.is_empty() {
+                            // We need to figure out how much to remove from the original buffer
+                            // Find where the separator is in the original buffer and remove up to and including it
+                            if let Some(sep_in_original) = state.buffer.find(separator.as_str()) {
+                                let remaining =
+                                    state.buffer[sep_in_original + separator.len()..].to_string();
+                                state.buffer = remaining;
+                            }
+
+                            // Return the first tool as complete
+                            if let Some(tool) = tools.into_iter().next() {
+                                return Ok(StreamResult::ToolComplete(tool));
+                            }
+                        }
+                    }
+                    Err(_) => {
+                        // Failed to parse, continue to try other methods
+                    }
+                }
+            }
+        }
+
+        // Handle multiple start tokens (e.g., multiple <|python_tag|> markers)
+        if !self.token_config.start_tokens.is_empty() {
+            let start_token = &self.token_config.start_tokens[0];
+            if !start_token.is_empty() {
+                // Find all occurrences of start token
+                let occurrences: Vec<_> =
+                    state.buffer.match_indices(start_token.as_str()).collect();
+                if occurrences.len() > 1 {
+                    // We have multiple start tokens, try to process the first complete one
+                    let first_pos = occurrences[0].0;
+                    let second_pos = occurrences[1].0;
+
+                    // Extract content between first and second start token
+                    let first_json_section = &state.buffer[first_pos..second_pos];
+                    let json_content = self.extract_json_content(first_json_section);
+
+                    // Try to parse this as complete JSON
+                    if let Ok(value) = serde_json::from_str::<Value>(json_content) {
+                        // Parse tool calls from this JSON
+                        let tools = self.parse_json_value(&value)?;
+                        if !tools.is_empty() {
+                            // Remove the processed section from buffer
+                            let remaining = state.buffer[second_pos..].to_string();
+                            state.buffer = remaining;
+
+                            // Return the first tool as complete
+                            if let Some(tool) = tools.into_iter().next() {
+                                return Ok(StreamResult::ToolComplete(tool));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Regular single JSON parsing
+        // Extract JSON content
+        let json_content = self.extract_json_content(&state.buffer);
+
+        // Try to parse with partial JSON parser
+        match self.partial_json.parse_value(json_content) {
+            Ok((value, consumed)) => {
+                // Check if we have a complete JSON structure
+                if consumed == json_content.len() {
+                    // Check if this is truly complete or just has null from incomplete parsing
+                    // We need to ensure the JSON actually ends properly (not cut off mid-key)
+                    let trimmed = json_content.trim();
+                    let looks_complete = trimmed.ends_with('}') || trimmed.ends_with(']');
+
+                    if looks_complete {
+                        // Complete JSON, parse tool calls
+                        let tools = self.parse_json_value(&value)?;
+                        if !tools.is_empty() {
+                            // Clear buffer since we consumed everything
+                            state.buffer.clear();
+
+                            // Return the first tool as complete
+                            // TODO simplified version, address more complex version
+                            if let Some(tool) = tools.into_iter().next() {
+                                return Ok(StreamResult::ToolComplete(tool));
+                            }
+                        }
+                    }
+                } else {
+                    // Partial JSON, try to extract tool name
+                    if let Some(name) = value.get("name").and_then(|v| v.as_str()) {
+                        // TODO simplified version, address more complex version
+                        // Just return the tool name once we see it
+                        if !state.in_string {
+                            state.in_string = true; // Use as a flag for "name sent"
+                            return Ok(StreamResult::ToolName {
+                                index: 0,
+                                name: name.to_string(),
+                            });
+                        }
+
+                        // Check for complete arguments
+                        if let Some(args) =
+                            value.get("arguments").or_else(|| value.get("parameters"))
+                        {
+                            if let Ok(args_str) = serde_json::to_string(args) {
+                                // Return arguments as a single update
+                                return Ok(StreamResult::ToolArguments {
+                                    index: 0,
+                                    arguments: args_str,
+                                });
+                            }
+                        }
+                    }
+                }
+            }
+            Err(_) => {
+                // Failed to parse even as partial JSON
+                // Keep buffering
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        // Check if text contains JSON-like structure
+        if self.has_tool_markers(text) {
+            // Try to extract and parse
+            let json_content = self.extract_json_content(text);
+
+            // Check if it looks like valid JSON for tool calls
+            if let Ok(value) = serde_json::from_str::<Value>(json_content) {
+                match value {
+                    Value::Object(ref obj) => {
+                        // Check for tool call structure
+                        obj.contains_key("name") || obj.contains_key("function")
+                    }
+                    Value::Array(ref arr) => {
+                        // Check if array contains tool-like objects
+                        arr.iter().any(|v| {
+                            if let Some(obj) = v.as_object() {
+                                obj.contains_key("name") || obj.contains_key("function")
+                            } else {
+                                false
+                            }
+                        })
+                    }
+                    _ => false,
+                }
+            } else {
+                false
+            }
+        } else {
+            false
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_single_tool_call() {
+        let parser = JsonParser::new();
+        let input = r#"{"name": "get_weather", "arguments": {"location": "San Francisco"}}"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+    }
+
+    #[tokio::test]
+    async fn test_parse_multiple_tool_calls() {
+        let parser = JsonParser::new();
+        let input = r#"[
+            {"name": "get_weather", "arguments": {"location": "SF"}},
+            {"name": "search", "arguments": {"query": "news"}}
+        ]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert_eq!(result[1].function.name, "search");
+    }
+
+    #[tokio::test]
+    async fn test_parse_with_parameters_key() {
+        let parser = JsonParser::new();
+        let input = r#"{"name": "calculate", "parameters": {"x": 10, "y": 20}}"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "calculate");
+        assert!(result[0].function.arguments.contains("10"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_with_wrapper_tokens() {
+        let parser = JsonParser::with_config(TokenConfig {
+            start_tokens: vec!["<tool>".to_string()],
+            end_tokens: vec!["</tool>".to_string()],
+            separator: ", ".to_string(),
+        });
+
+        let input = r#"<tool>{"name": "test", "arguments": {}}</tool>"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test");
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = JsonParser::new();
+
+        assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#));
+        assert!(parser.detect_format(r#"[{"name": "test"}]"#));
+        assert!(!parser.detect_format("plain text"));
+        assert!(!parser.detect_format(r#"{"key": "value"}"#));
+    }
+
+    #[tokio::test]
+    async fn test_streaming_parse() {
+        // Just verify that streaming eventually produces a complete tool call
+        let parser = JsonParser::new();
+        let mut state = ParseState::new();
+
+        // Send complete JSON in one go
+        // TODO simplified version, address more complex version
+        let full_json = r#"{"name": "get_weather", "arguments": {"location": "SF"}}"#;
+
+        let result = parser
+            .parse_incremental(full_json, &mut state)
+            .await
+            .unwrap();
+
+        // Should get a complete tool immediately with complete JSON
+        match result {
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "get_weather");
+                assert!(tool.function.arguments.contains("SF"));
+            }
+            _ => panic!("Expected ToolComplete for complete JSON input"),
+        }
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/kimik2_parser.rs b/sgl-router/src/tool_parser/parsers/kimik2_parser.rs
new file mode 100644
index 000000000..52f92bd90
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/kimik2_parser.rs
@@ -0,0 +1,270 @@
+use async_trait::async_trait;
+use regex::Regex;
+
+use crate::tool_parser::{
+    errors::ToolParserResult,
+    partial_json::PartialJson,
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// Kimi K2 format parser for tool calls
+///
+/// Handles the Kimi K2 specific format:
+/// `<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|><|tool_calls_section_end|>`
+///
+/// Features:
+/// - Token-based delimiters
+/// - Function calls with explicit indexing
+/// - JSON arguments
+pub struct KimiK2Parser {
+    /// Parser for handling incomplete JSON during streaming
+    partial_json: PartialJson,
+    /// Regex for extracting complete tool calls
+    tool_call_extractor: Regex,
+    /// Regex for extracting partial tool calls (streaming)
+    stream_tool_call_extractor: Regex,
+}
+
+impl KimiK2Parser {
+    /// Create a new Kimi K2 parser
+    pub fn new() -> Self {
+        // Pattern for complete tool calls
+        let tool_call_pattern = r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*?\})\s*<\|tool_call_end\|>";
+        let tool_call_extractor = Regex::new(tool_call_pattern).expect("Valid regex pattern");
+
+        // Pattern for streaming (partial) tool calls
+        let stream_pattern = r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*)";
+        let stream_tool_call_extractor = Regex::new(stream_pattern).expect("Valid regex pattern");
+
+        Self {
+            partial_json: PartialJson::default(),
+            tool_call_extractor,
+            stream_tool_call_extractor,
+        }
+    }
+
+    /// Check if text contains Kimi K2 tool markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        text.contains("<|tool_calls_section_begin|>")
+    }
+
+    /// Parse function ID to extract name and index
+    fn parse_function_id(&self, id: &str) -> Option<(String, usize)> {
+        // Format: functions.{name}:{index} or namespace.functions.{name}:{index}
+        // Extract everything after the last dot before the colon as the function name
+        if let Some(colon_pos) = id.rfind(':') {
+            let before_colon = &id[..colon_pos];
+            let index_str = &id[colon_pos + 1..];
+
+            // Find the last dot to extract the function name
+            if let Some(dot_pos) = before_colon.rfind('.') {
+                let func_name = &before_colon[dot_pos + 1..];
+
+                if let Ok(index) = index_str.parse::<usize>() {
+                    return Some((func_name.to_string(), index));
+                }
+            }
+        }
+        None
+    }
+}
+
+impl Default for KimiK2Parser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for KimiK2Parser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if text contains Kimi K2 format
+        if !self.has_tool_markers(text) {
+            return Ok(vec![]);
+        }
+
+        let mut tools = Vec::new();
+
+        // Extract all tool calls
+        for captures in self.tool_call_extractor.captures_iter(text) {
+            if let (Some(id_match), Some(args_match)) = (
+                captures.name("tool_call_id"),
+                captures.name("function_arguments"),
+            ) {
+                let function_id = id_match.as_str();
+                let function_args = args_match.as_str();
+
+                // Parse function ID
+                if let Some((func_name, _index)) = self.parse_function_id(function_id) {
+                    // Validate JSON arguments
+                    if serde_json::from_str::<serde_json::Value>(function_args).is_ok() {
+                        // Generate unique ID
+                        let id = format!("kimi_call_{}", uuid::Uuid::new_v4());
+
+                        tools.push(ToolCall {
+                            id,
+                            r#type: "function".to_string(),
+                            function: FunctionCall {
+                                name: func_name,
+                                arguments: function_args.to_string(),
+                            },
+                        });
+                    }
+                }
+            }
+        }
+
+        Ok(tools)
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check for tool markers
+        let has_tool_call =
+            self.has_tool_markers(&state.buffer) || state.buffer.contains("<|tool_call_begin|>");
+
+        if !has_tool_call {
+            // No markers found, clear buffer and return
+            state.buffer.clear();
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Try to match streaming pattern
+        if let Some(captures) = self.stream_tool_call_extractor.captures(&state.buffer) {
+            if let (Some(id_match), Some(args_match)) = (
+                captures.name("tool_call_id"),
+                captures.name("function_arguments"),
+            ) {
+                let function_id = id_match.as_str();
+                let partial_args = args_match.as_str();
+
+                // Parse function ID
+                if let Some((func_name, _index)) = self.parse_function_id(function_id) {
+                    // Send function name if not sent yet
+                    if !state.in_string {
+                        state.in_string = true; // Mark name as sent
+                        return Ok(StreamResult::ToolName {
+                            index: 0,
+                            name: func_name.clone(),
+                        });
+                    }
+
+                    // Check if we have a complete tool call
+                    if let Some(end_pos) = partial_args.find("<|tool_call_end|>") {
+                        // Extract just the JSON part
+                        let json_args = &partial_args[..end_pos];
+
+                        // Validate and parse JSON
+                        if serde_json::from_str::<serde_json::Value>(json_args).is_ok() {
+                            // Generate unique ID
+                            let id = format!("kimi_call_{}", uuid::Uuid::new_v4());
+
+                            let tool = ToolCall {
+                                id,
+                                r#type: "function".to_string(),
+                                function: FunctionCall {
+                                    name: func_name,
+                                    arguments: json_args.to_string(),
+                                },
+                            };
+
+                            // Find where this tool call ends in the buffer
+                            if let Some(tool_end) = state.buffer.find("<|tool_call_end|>") {
+                                let end_pos = tool_end + "<|tool_call_end|>".len();
+                                state.buffer.drain(..end_pos);
+                            }
+
+                            // Reset state for next tool
+                            state.in_string = false;
+
+                            return Ok(StreamResult::ToolComplete(tool));
+                        }
+                    } else {
+                        // Try to parse partial JSON for streaming arguments
+                        match self.partial_json.parse_value(partial_args) {
+                            Ok((value, _consumed)) => {
+                                let args_str = serde_json::to_string(&value)
+                                    .unwrap_or_else(|_| "{}".to_string());
+
+                                return Ok(StreamResult::ToolArguments {
+                                    index: 0,
+                                    arguments: args_str,
+                                });
+                            }
+                            Err(_) => {
+                                // Can't parse yet, keep buffering
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        self.has_tool_markers(text) || text.contains("<|tool_call_begin|>")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_kimi_single_tool() {
+        let parser = KimiK2Parser::new();
+        let input = r#"Some text
+<|tool_calls_section_begin|>
+<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location": "Tokyo", "units": "celsius"}<|tool_call_end|>
+<|tool_calls_section_end|>More text"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Tokyo"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_kimi_multiple_tools() {
+        let parser = KimiK2Parser::new();
+        let input = r#"<|tool_calls_section_begin|>
+<|tool_call_begin|>functions.search:0<|tool_call_argument_begin|>{"query": "rust"}<|tool_call_end|>
+<|tool_call_begin|>functions.calculate:1<|tool_call_argument_begin|>{"expression": "2+2"}<|tool_call_end|>
+<|tool_calls_section_end|>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "search");
+        assert_eq!(result[1].function.name, "calculate");
+    }
+
+    #[tokio::test]
+    async fn test_parse_kimi_with_whitespace() {
+        let parser = KimiK2Parser::new();
+        let input = r#"<|tool_calls_section_begin|>
+<|tool_call_begin|> functions.test:0 <|tool_call_argument_begin|> {"key": "value"} <|tool_call_end|>
+<|tool_calls_section_end|>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test");
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = KimiK2Parser::new();
+        assert!(parser.detect_format("<|tool_calls_section_begin|>"));
+        assert!(parser.detect_format("<|tool_call_begin|>"));
+        assert!(!parser.detect_format("plain text"));
+        assert!(!parser.detect_format("[TOOL_CALLS]"));
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/llama_parser.rs b/sgl-router/src/tool_parser/parsers/llama_parser.rs
new file mode 100644
index 000000000..678c964c5
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/llama_parser.rs
@@ -0,0 +1,156 @@
+use async_trait::async_trait;
+
+use super::json_parser::JsonParser;
+use crate::tool_parser::{
+    errors::ToolParserResult,
+    state::ParseState,
+    traits::ToolParser,
+    types::{StreamResult, TokenConfig, ToolCall},
+};
+
+/// Llama 3.2 format parser for tool calls
+///
+/// Handles the Llama 3.2 specific format:
+/// `<|python_tag|>{"name": "func", "arguments": {...}}`
+///
+/// Also supports plain JSON without the python_tag prefix
+pub struct LlamaParser {
+    /// Underlying JSON parser with Llama-specific configuration
+    json_parser: JsonParser,
+}
+
+impl LlamaParser {
+    /// Create a new Llama parser
+    pub fn new() -> Self {
+        // Configure JSON parser with Llama's python_tag token
+        // Note: No end token for python_tag format
+        let json_parser = JsonParser::with_config(TokenConfig {
+            start_tokens: vec!["<|python_tag|>".to_string()],
+            end_tokens: vec!["".to_string()], // Empty end token
+            separator: ";".to_string(), // Llama uses semicolon for multiple calls (though not well supported)
+        });
+
+        Self { json_parser }
+    }
+}
+
+impl Default for LlamaParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for LlamaParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // First try with the configured python_tag parser
+        let result = self.json_parser.parse_complete(text).await?;
+
+        if !result.is_empty() {
+            return Ok(result);
+        }
+
+        // If no results and text starts with '{', try plain JSON
+        if text.trim_start().starts_with('{') {
+            // Create a temporary plain JSON parser
+            let plain_parser = JsonParser::new();
+            return plain_parser.parse_complete(text).await;
+        }
+
+        Ok(vec![])
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        // Try with the python_tag parser first
+        let result = self.json_parser.parse_incremental(chunk, state).await?;
+
+        // If we get Incomplete and buffer starts with '{', might be plain JSON
+        if matches!(result, StreamResult::Incomplete) && state.buffer.trim_start().starts_with('{')
+        {
+            // Check if we have python_tag in the buffer
+            if !state.buffer.contains("<|python_tag|>") {
+                // Likely plain JSON, create temporary parser
+                let plain_parser = JsonParser::new();
+                return plain_parser.parse_incremental("", state).await;
+            }
+        }
+
+        Ok(result)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        // Llama format if contains python_tag or starts with JSON object
+        text.contains("<|python_tag|>")
+            || (text.trim_start().starts_with('{')
+                && (text.contains(r#""name""#) || text.contains(r#""function""#)))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_with_python_tag() {
+        let parser = LlamaParser::new();
+        let input = r#"<|python_tag|>{"name": "search", "arguments": {"query": "weather"}}"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "search");
+        assert!(result[0].function.arguments.contains("weather"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_plain_json() {
+        let parser = LlamaParser::new();
+        let input = r#"{"name": "calculate", "arguments": {"x": 5, "y": 10}}"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "calculate");
+    }
+
+    #[tokio::test]
+    async fn test_parse_with_text_before() {
+        let parser = LlamaParser::new();
+        let input = r#"Let me help you with that. <|python_tag|>{"name": "get_time", "arguments": {"timezone": "UTC"}}"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_time");
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = LlamaParser::new();
+
+        assert!(parser.detect_format(r#"<|python_tag|>{"name": "test"}"#));
+        assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#));
+        assert!(!parser.detect_format("plain text"));
+        assert!(!parser.detect_format(r#"{"key": "value"}"#)); // No name field
+    }
+
+    #[tokio::test]
+    async fn test_single_call_with_semicolon() {
+        let parser = LlamaParser::new();
+        // Note: Llama 3.2 doesn't handle multiple calls well
+        // Test that we can at least parse a single call followed by semicolon
+        let input = r#"<|python_tag|>{"name": "func1", "arguments": {"x": 1}};"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+
+        // We expect this to either parse the first JSON object or fail gracefully
+        // Since the semicolon makes it invalid JSON, it will likely return empty
+        // This is acceptable as Llama 3.2 doesn't reliably support parallel calls
+
+        // If it parses anything, it should be func1
+        if !result.is_empty() {
+            assert_eq!(result[0].function.name, "func1");
+        }
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/mistral_parser.rs b/sgl-router/src/tool_parser/parsers/mistral_parser.rs
new file mode 100644
index 000000000..68a3568aa
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/mistral_parser.rs
@@ -0,0 +1,347 @@
+use async_trait::async_trait;
+use serde_json::Value;
+
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    partial_json::PartialJson,
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// Mistral format parser for tool calls
+///
+/// Handles the Mistral-specific format:
+/// `[TOOL_CALLS] [{"name": "func", "arguments": {...}}, ...]`
+///
+/// Features:
+/// - Bracket counting for proper JSON array extraction
+/// - Support for multiple tool calls in a single array
+/// - String-aware parsing to handle nested brackets in JSON
+pub struct MistralParser {
+    /// Parser for handling incomplete JSON during streaming
+    partial_json: PartialJson,
+}
+
+impl MistralParser {
+    /// Create a new Mistral parser
+    pub fn new() -> Self {
+        Self {
+            partial_json: PartialJson::default(),
+        }
+    }
+
+    /// Extract JSON array using bracket counting
+    ///
+    /// Handles nested brackets in JSON content by tracking:
+    /// - String boundaries (quotes)
+    /// - Escape sequences
+    /// - Bracket depth
+    fn extract_json_array<'a>(&self, text: &'a str) -> Option<&'a str> {
+        const BOT_TOKEN: &str = "[TOOL_CALLS] [";
+
+        // Find the start of the token
+        let start_idx = text.find(BOT_TOKEN)?;
+
+        // Start from the opening bracket after [TOOL_CALLS]
+        // The -1 is to include the opening bracket that's part of the token
+        let json_start = start_idx + BOT_TOKEN.len() - 1;
+
+        let mut bracket_count = 0;
+        let mut in_string = false;
+        let mut escape_next = false;
+
+        let bytes = text.as_bytes();
+
+        for i in json_start..text.len() {
+            let char = bytes[i];
+
+            if escape_next {
+                escape_next = false;
+                continue;
+            }
+
+            if char == b'\\' {
+                escape_next = true;
+                continue;
+            }
+
+            if char == b'"' && !escape_next {
+                in_string = !in_string;
+                continue;
+            }
+
+            if !in_string {
+                if char == b'[' {
+                    bracket_count += 1;
+                } else if char == b']' {
+                    bracket_count -= 1;
+                    if bracket_count == 0 {
+                        // Found the matching closing bracket
+                        return Some(&text[json_start..=i]);
+                    }
+                }
+            }
+        }
+
+        // Incomplete array (no matching closing bracket found)
+        None
+    }
+
+    /// Parse tool calls from a JSON array
+    fn parse_json_array(&self, json_str: &str) -> ToolParserResult<Vec<ToolCall>> {
+        let value: Value = serde_json::from_str(json_str)
+            .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;
+
+        let mut tools = Vec::new();
+
+        if let Value::Array(arr) = value {
+            for (index, item) in arr.iter().enumerate() {
+                if let Some(tool) = self.parse_single_object(item, index)? {
+                    tools.push(tool);
+                }
+            }
+        } else {
+            // Single object case (shouldn't happen with Mistral format, but handle it)
+            if let Some(tool) = self.parse_single_object(&value, 0)? {
+                tools.push(tool);
+            }
+        }
+
+        Ok(tools)
+    }
+
+    /// Parse a single JSON object into a ToolCall
+    fn parse_single_object(&self, obj: &Value, index: usize) -> ToolParserResult<Option<ToolCall>> {
+        let name = obj.get("name").and_then(|v| v.as_str());
+
+        if let Some(name) = name {
+            // Get arguments - Mistral uses "arguments" key
+            let empty_obj = Value::Object(serde_json::Map::new());
+            let args = obj.get("arguments").unwrap_or(&empty_obj);
+
+            // Convert arguments to JSON string
+            let arguments = serde_json::to_string(args)
+                .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;
+
+            // Generate ID with index for multiple tools
+            let id = format!("mistral_call_{}", index);
+
+            Ok(Some(ToolCall {
+                id,
+                r#type: "function".to_string(),
+                function: FunctionCall {
+                    name: name.to_string(),
+                    arguments,
+                },
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Check if text contains Mistral tool markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        text.contains("[TOOL_CALLS]")
+    }
+}
+
+impl Default for MistralParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for MistralParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if text contains Mistral format
+        if !self.has_tool_markers(text) {
+            return Ok(vec![]);
+        }
+
+        // Extract JSON array from Mistral format
+        if let Some(json_array) = self.extract_json_array(text) {
+            self.parse_json_array(json_array)
+        } else {
+            // Markers present but no complete array found
+            Ok(vec![])
+        }
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check if we have the start marker
+        if !self.has_tool_markers(&state.buffer) {
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Try to extract complete JSON array
+        if let Some(json_array) = self.extract_json_array(&state.buffer) {
+            // Parse with partial JSON to handle incomplete content
+            match self.partial_json.parse_value(json_array) {
+                Ok((value, consumed)) => {
+                    // Check if we have a complete JSON structure
+                    if consumed == json_array.len() {
+                        // Complete JSON, parse tool calls
+                        let tools = if let Value::Array(arr) = value {
+                            let mut result = Vec::new();
+                            for (index, item) in arr.iter().enumerate() {
+                                if let Some(tool) = self.parse_single_object(item, index)? {
+                                    result.push(tool);
+                                }
+                            }
+                            result
+                        } else {
+                            vec![]
+                        };
+
+                        if !tools.is_empty() {
+                            // Clear buffer since we consumed everything
+                            state.buffer.clear();
+
+                            // Return the first tool (simplified for Phase 3)
+                            // Full multi-tool streaming will be implemented later
+                            if let Some(tool) = tools.into_iter().next() {
+                                return Ok(StreamResult::ToolComplete(tool));
+                            }
+                        }
+                    } else {
+                        // Partial JSON - try to extract tool name for streaming
+                        if let Value::Array(arr) = value {
+                            if let Some(first_tool) = arr.first() {
+                                if let Some(name) = first_tool.get("name").and_then(|v| v.as_str())
+                                {
+                                    // Check if we've already sent the name
+                                    if !state.in_string {
+                                        state.in_string = true; // Use as flag for "name sent"
+                                        return Ok(StreamResult::ToolName {
+                                            index: 0,
+                                            name: name.to_string(),
+                                        });
+                                    }
+
+                                    // Check for arguments
+                                    if let Some(args) = first_tool.get("arguments") {
+                                        if let Ok(args_str) = serde_json::to_string(args) {
+                                            return Ok(StreamResult::ToolArguments {
+                                                index: 0,
+                                                arguments: args_str,
+                                            });
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                Err(_) => {
+                    // Failed to parse even as partial JSON
+                    // Keep buffering
+                }
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        // Check if text contains Mistral-specific markers
+        if self.has_tool_markers(text) {
+            // Try to extract and validate the array
+            if let Some(json_array) = self.extract_json_array(text) {
+                // Check if it's valid JSON
+                if let Ok(value) = serde_json::from_str::<Value>(json_array) {
+                    // Check if it contains tool-like structures
+                    match value {
+                        Value::Array(ref arr) => arr.iter().any(|v| {
+                            v.as_object().is_some_and(|o| {
+                                o.contains_key("name") && o.contains_key("arguments")
+                            })
+                        }),
+                        Value::Object(ref obj) => {
+                            obj.contains_key("name") && obj.contains_key("arguments")
+                        }
+                        _ => false,
+                    }
+                } else {
+                    false
+                }
+            } else {
+                // Has markers but no complete array - might be streaming
+                true
+            }
+        } else {
+            false
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_mistral_format() {
+        let parser = MistralParser::new();
+        let input = r#"[TOOL_CALLS] [{"name": "get_weather", "arguments": {"location": "Paris", "units": "celsius"}}]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Paris"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_multiple_tools() {
+        let parser = MistralParser::new();
+        let input = r#"[TOOL_CALLS] [
+            {"name": "search", "arguments": {"query": "rust programming"}},
+            {"name": "calculate", "arguments": {"expression": "2 + 2"}}
+        ]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "search");
+        assert_eq!(result[1].function.name, "calculate");
+    }
+
+    #[tokio::test]
+    async fn test_nested_brackets_in_json() {
+        let parser = MistralParser::new();
+        let input = r#"[TOOL_CALLS] [{"name": "process", "arguments": {"data": [1, 2, [3, 4]], "config": {"nested": [5, 6]}}}]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "process");
+        // JSON serialization removes spaces, so check for [3,4] without spaces
+        assert!(result[0].function.arguments.contains("[3,4]"));
+    }
+
+    #[tokio::test]
+    async fn test_escaped_quotes_in_strings() {
+        let parser = MistralParser::new();
+        let input = r#"[TOOL_CALLS] [{"name": "echo", "arguments": {"message": "He said \"Hello [World]\""}}]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "echo");
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = MistralParser::new();
+
+        assert!(parser.detect_format(r#"[TOOL_CALLS] [{"name": "test", "arguments": {}}]"#));
+        assert!(
+            parser.detect_format(r#"Some text [TOOL_CALLS] [{"name": "test", "arguments": {}}]"#)
+        );
+        assert!(!parser.detect_format(r#"{"name": "test", "arguments": {}}"#));
+        assert!(!parser.detect_format("plain text"));
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/mod.rs b/sgl-router/src/tool_parser/parsers/mod.rs
new file mode 100644
index 000000000..693aeedf4
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/mod.rs
@@ -0,0 +1,27 @@
+/// Parser implementations for different model formats
+///
+/// This module contains concrete parser implementations for various model-specific
+/// tool/function call formats.
+// Individual parser modules
+pub mod deepseek_parser;
+pub mod glm4_moe_parser;
+pub mod gpt_oss_parser;
+pub mod json_parser;
+pub mod kimik2_parser;
+pub mod llama_parser;
+pub mod mistral_parser;
+pub mod pythonic_parser;
+pub mod qwen_parser;
+pub mod step3_parser;
+
+// Re-export parser types for convenience
+pub use deepseek_parser::DeepSeekParser;
+pub use glm4_moe_parser::Glm4MoeParser;
+pub use gpt_oss_parser::GptOssParser;
+pub use json_parser::JsonParser;
+pub use kimik2_parser::KimiK2Parser;
+pub use llama_parser::LlamaParser;
+pub use mistral_parser::MistralParser;
+pub use pythonic_parser::PythonicParser;
+pub use qwen_parser::QwenParser;
+pub use step3_parser::Step3Parser;
diff --git a/sgl-router/src/tool_parser/parsers/pythonic_parser.rs b/sgl-router/src/tool_parser/parsers/pythonic_parser.rs
new file mode 100644
index 000000000..6e0a1ecff
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/pythonic_parser.rs
@@ -0,0 +1,434 @@
+/// Pythonic format parser for tool calls
+///
+/// Handles Python function call syntax within square brackets:
+/// ```text
+/// [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+/// ```
+///
+/// This format is used by Llama-4 models and uses Python literals
+/// rather than JSON for arguments.
+use async_trait::async_trait;
+use regex::Regex;
+use serde_json::{json, Value};
+
+use crate::tool_parser::{
+    errors::ToolParserResult,
+    python_literal_parser::parse_python_literal,
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// Parser for Pythonic tool call format
+pub struct PythonicParser {
+    /// Regex to detect tool calls in Pythonic format
+    tool_call_regex: Regex,
+    /// Regex to parse function calls - cached for reuse
+    call_regex: Regex,
+}
+
+impl PythonicParser {
+    /// Create a new Pythonic parser
+    pub fn new() -> Self {
+        // Simple regex to detect start of Pythonic tool calls
+        // We'll use manual parsing for the actual extraction
+        let pattern = r"\[[a-zA-Z_]\w*\(";
+        let tool_call_regex = Regex::new(pattern).expect("Valid regex pattern");
+
+        // Compile the function call regex once
+        let call_regex = Regex::new(r"(?s)^([a-zA-Z_]\w*)\((.*)\)$").expect("Valid regex pattern");
+
+        Self {
+            tool_call_regex,
+            call_regex,
+        }
+    }
+
+    /// Extract tool calls using bracket counting (similar to MistralParser)
+    fn extract_tool_calls(&self, text: &str) -> Option<String> {
+        // Find the start of a tool call list - look for [ followed by a function name
+        let chars: Vec<char> = text.chars().collect();
+
+        for start_idx in 0..chars.len() {
+            if chars[start_idx] != '[' {
+                continue;
+            }
+
+            // Check if this looks like a tool call
+            // Skip whitespace after [
+            let mut check_idx = start_idx + 1;
+            while check_idx < chars.len() && chars[check_idx].is_whitespace() {
+                check_idx += 1;
+            }
+
+            // Check if we have a function name (starts with letter or underscore)
+            if check_idx >= chars.len()
+                || (!chars[check_idx].is_alphabetic() && chars[check_idx] != '_')
+            {
+                continue;
+            }
+
+            // Now count brackets to find the matching ]
+            let mut bracket_count = 0;
+            let mut _paren_count = 0;
+            let mut _brace_count = 0;
+            let mut in_string = false;
+            let mut string_char = ' ';
+            let mut escape_next = false;
+
+            for i in start_idx..chars.len() {
+                let ch = chars[i];
+
+                if escape_next {
+                    escape_next = false;
+                    continue;
+                }
+
+                if ch == '\\' && in_string {
+                    escape_next = true;
+                    continue;
+                }
+
+                if !in_string && (ch == '"' || ch == '\'') {
+                    in_string = true;
+                    string_char = ch;
+                } else if in_string && ch == string_char && !escape_next {
+                    in_string = false;
+                } else if !in_string {
+                    match ch {
+                        '[' => bracket_count += 1,
+                        ']' => {
+                            bracket_count -= 1;
+                            if bracket_count == 0 {
+                                // Found the matching bracket
+                                let extracted: String = chars[start_idx..=i].iter().collect();
+                                // Verify this actually contains a function call
+                                if extracted.contains('(') && extracted.contains(')') {
+                                    return Some(extracted);
+                                }
+                            }
+                        }
+                        '(' => _paren_count += 1,
+                        ')' => _paren_count -= 1,
+                        '{' => _brace_count += 1,
+                        '}' => _brace_count -= 1,
+                        _ => {}
+                    }
+                }
+            }
+        }
+        None
+    }
+
+    /// Strip special tokens that Llama 4 might output
+    fn strip_special_tokens(text: &str) -> String {
+        text.replace("<|python_start|>", "")
+            .replace("<|python_end|>", "")
+    }
+
+    /// Parse a single function call from Python syntax
+    fn parse_function_call(&self, call_str: &str) -> ToolParserResult<Option<ToolCall>> {
+        // Use cached regex instead of creating new one
+        if let Some(captures) = self.call_regex.captures(call_str.trim()) {
+            let function_name = captures.get(1).unwrap().as_str();
+            let args_str = captures.get(2).unwrap().as_str();
+
+            // Parse arguments
+            let arguments = self.parse_arguments(args_str)?;
+
+            Ok(Some(ToolCall {
+                id: format!("call_{}", uuid::Uuid::new_v4()),
+                r#type: "function".to_string(),
+                function: FunctionCall {
+                    name: function_name.to_string(),
+                    arguments: serde_json::to_string(&arguments)?,
+                },
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Parse Python-style arguments into JSON
+    fn parse_arguments(&self, args_str: &str) -> ToolParserResult<Value> {
+        if args_str.trim().is_empty() {
+            return Ok(json!({}));
+        }
+
+        let mut result = serde_json::Map::new();
+        let mut current_key = String::new();
+        let mut current_value = String::new();
+        let mut in_key = true;
+        let mut depth = 0;
+        let mut in_string = false;
+        let mut string_char = ' ';
+        let mut escape_next = false;
+
+        let chars: Vec<char> = args_str.chars().collect();
+        let mut i = 0;
+
+        while i < chars.len() {
+            let ch = chars[i];
+
+            if escape_next {
+                if in_key {
+                    current_key.push(ch);
+                } else {
+                    current_value.push(ch);
+                }
+                escape_next = false;
+                i += 1;
+                continue;
+            }
+
+            if ch == '\\' && in_string {
+                escape_next = true;
+                current_value.push(ch);
+                i += 1;
+                continue;
+            }
+
+            // Handle string literals
+            if !in_string && (ch == '"' || ch == '\'') {
+                in_string = true;
+                string_char = ch;
+                if !in_key {
+                    current_value.push(ch);
+                }
+            } else if in_string && ch == string_char && !escape_next {
+                in_string = false;
+                if !in_key {
+                    current_value.push(ch);
+                }
+            } else if in_string {
+                if in_key {
+                    current_key.push(ch);
+                } else {
+                    current_value.push(ch);
+                }
+            } else {
+                // Not in string
+                match ch {
+                    '=' if in_key && depth == 0 => {
+                        in_key = false;
+                    }
+                    ',' if depth == 0 => {
+                        // End of current argument
+                        if !current_key.is_empty() {
+                            let value = parse_python_literal(current_value.trim())?;
+                            result.insert(current_key.trim().to_string(), value);
+                        }
+                        current_key.clear();
+                        current_value.clear();
+                        in_key = true;
+                    }
+                    '[' | '{' | '(' => {
+                        depth += 1;
+                        if !in_key {
+                            current_value.push(ch);
+                        }
+                    }
+                    ']' | '}' | ')' => {
+                        depth -= 1;
+                        if !in_key {
+                            current_value.push(ch);
+                        }
+                    }
+                    _ => {
+                        if in_key {
+                            if !ch.is_whitespace() || !current_key.is_empty() {
+                                current_key.push(ch);
+                            }
+                        } else {
+                            current_value.push(ch);
+                        }
+                    }
+                }
+            }
+
+            i += 1;
+        }
+
+        // Handle the last argument
+        if !current_key.is_empty() {
+            let value = parse_python_literal(current_value.trim())?;
+            result.insert(current_key.trim().to_string(), value);
+        }
+
+        Ok(Value::Object(result))
+    }
+}
+
+#[async_trait]
+impl ToolParser for PythonicParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        let cleaned = Self::strip_special_tokens(text);
+
+        // Extract tool calls using bracket counting
+        if let Some(tool_calls_text) = self.extract_tool_calls(&cleaned) {
+            // Remove the outer brackets
+            let tool_calls_str = &tool_calls_text[1..tool_calls_text.len() - 1];
+
+            // Split into individual function calls
+            let mut calls = Vec::new();
+            let mut current_call = String::new();
+            let mut paren_depth = 0;
+            let mut in_string = false;
+            let mut string_char = ' ';
+
+            for ch in tool_calls_str.chars() {
+                if !in_string && (ch == '"' || ch == '\'') {
+                    in_string = true;
+                    string_char = ch;
+                    current_call.push(ch);
+                } else if in_string && ch == string_char {
+                    in_string = false;
+                    current_call.push(ch);
+                } else if in_string {
+                    current_call.push(ch);
+                } else {
+                    match ch {
+                        '(' => {
+                            paren_depth += 1;
+                            current_call.push(ch);
+                        }
+                        ')' => {
+                            paren_depth -= 1;
+                            current_call.push(ch);
+                        }
+                        ',' if paren_depth == 0 => {
+                            // End of current function call
+                            if let Some(call) = self.parse_function_call(current_call.trim())? {
+                                calls.push(call);
+                            }
+                            current_call.clear();
+                        }
+                        _ => {
+                            if !ch.is_whitespace() || !current_call.is_empty() {
+                                current_call.push(ch);
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Handle the last call (important for single calls or the last call in a list)
+            if !current_call.trim().is_empty() {
+                if let Some(call) = self.parse_function_call(current_call.trim())? {
+                    calls.push(call);
+                }
+            }
+
+            Ok(calls)
+        } else {
+            Ok(vec![])
+        }
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        // For Pythonic format, we accumulate until we have a complete tool call
+        // This is a simplified implementation
+        state.buffer.push_str(chunk);
+
+        // Try to parse if we have a complete tool call
+        let cleaned = Self::strip_special_tokens(&state.buffer);
+        if self.extract_tool_calls(&cleaned).is_some() {
+            let result = self.parse_complete(&state.buffer).await?;
+            if !result.is_empty() {
+                state.buffer.clear();
+                return Ok(StreamResult::ToolComplete(
+                    result.into_iter().next().unwrap(),
+                ));
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        let cleaned = Self::strip_special_tokens(text);
+        self.tool_call_regex.is_match(&cleaned)
+    }
+}
+
+impl Default for PythonicParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_single_function_call() {
+        let parser = PythonicParser::new();
+        let input = r#"[search_web(query="Rust programming", max_results=5)]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "search_web");
+
+        let args: Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+        assert_eq!(args["query"], "Rust programming");
+        assert_eq!(args["max_results"], 5);
+    }
+
+    #[tokio::test]
+    async fn test_multiple_function_calls() {
+        let parser = PythonicParser::new();
+        let input = r#"[get_weather(city="Tokyo"), search(query="news")]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert_eq!(result[1].function.name, "search");
+    }
+
+    #[tokio::test]
+    async fn test_python_literals() {
+        let parser = PythonicParser::new();
+        let input = r#"[test(flag=True, disabled=False, optional=None)]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+
+        let args: Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+        assert_eq!(args["flag"], true);
+        assert_eq!(args["disabled"], false);
+        assert_eq!(args["optional"], Value::Null);
+    }
+
+    #[tokio::test]
+    async fn test_special_tokens() {
+        let parser = PythonicParser::new();
+        let input = r#"<|python_start|>[calculate(x=10, y=20)]<|python_end|>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "calculate");
+
+        let args: Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+        assert_eq!(args["x"], 10);
+        assert_eq!(args["y"], 20);
+    }
+
+    #[tokio::test]
+    async fn test_llama4_format() {
+        let parser = PythonicParser::new();
+        let input = r#"[get_weather(city="London", units="celsius")]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+
+        let args: Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+        assert_eq!(args["city"], "London");
+        assert_eq!(args["units"], "celsius");
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/qwen_parser.rs b/sgl-router/src/tool_parser/parsers/qwen_parser.rs
new file mode 100644
index 000000000..29ad2083c
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/qwen_parser.rs
@@ -0,0 +1,396 @@
+use async_trait::async_trait;
+use regex::Regex;
+use serde_json::Value;
+
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    partial_json::PartialJson,
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// Qwen format parser for tool calls
+///
+/// Handles the Qwen 2.5/3 specific format:
+/// `<tool_call>\n{"name": "func", "arguments": {...}}\n</tool_call>`
+///
+/// Features:
+/// - XML-style tags with JSON content
+/// - Support for multiple sequential tool calls
+/// - Newline-aware parsing
+pub struct QwenParser {
+    /// Parser for handling incomplete JSON during streaming
+    partial_json: PartialJson,
+    /// Regex for extracting tool calls
+    extractor: Regex,
+}
+
+impl QwenParser {
+    /// Create a new Qwen parser
+    pub fn new() -> Self {
+        // Use (?s) flag for DOTALL mode to handle newlines
+        let pattern = r"(?s)<tool_call>\n(.*?)\n</tool_call>";
+        let extractor = Regex::new(pattern).expect("Valid regex pattern");
+
+        Self {
+            partial_json: PartialJson::default(),
+            extractor,
+        }
+    }
+
+    /// Extract all tool call blocks from text
+    fn extract_tool_calls<'a>(&self, text: &'a str) -> Vec<&'a str> {
+        self.extractor
+            .captures_iter(text)
+            .filter_map(|cap| cap.get(1).map(|m| m.as_str()))
+            .collect()
+    }
+
+    /// Parse a single JSON object into a ToolCall
+    fn parse_single_object(&self, obj: &Value, index: usize) -> ToolParserResult<Option<ToolCall>> {
+        let name = obj.get("name").and_then(|v| v.as_str());
+
+        if let Some(name) = name {
+            // Get arguments - Qwen uses "arguments" key
+            let empty_obj = Value::Object(serde_json::Map::new());
+            let args = obj.get("arguments").unwrap_or(&empty_obj);
+
+            // Convert arguments to JSON string
+            let arguments = serde_json::to_string(args)
+                .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;
+
+            // Generate ID with index for multiple tools
+            let id = format!("qwen_call_{}", index);
+
+            Ok(Some(ToolCall {
+                id,
+                r#type: "function".to_string(),
+                function: FunctionCall {
+                    name: name.to_string(),
+                    arguments,
+                },
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Check if text contains Qwen tool markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        text.contains("<tool_call>")
+    }
+
+    /// Find the start position of a tool call
+    fn find_tool_start(&self, text: &str) -> Option<usize> {
+        text.find("<tool_call>\n")
+    }
+
+    /// Find the end position of a tool call
+    fn find_tool_end(&self, text: &str, start_pos: usize) -> Option<usize> {
+        let search_from = start_pos + "<tool_call>\n".len();
+        text[search_from..]
+            .find("\n</tool_call>")
+            .map(|pos| search_from + pos + "\n</tool_call>".len())
+    }
+
+    /// Check if buffer ends with a partial token
+    fn ends_with_partial_token(&self, buffer: &str) -> Option<usize> {
+        // Check for partial start token
+        let start_token = "<tool_call>\n";
+        // Use inclusive range to check if entire buffer could be a prefix
+        for i in 1..=start_token.len().min(buffer.len()) {
+            if start_token.starts_with(&buffer[buffer.len() - i..]) {
+                return Some(i);
+            }
+        }
+
+        // Check for partial end token
+        let end_token = "\n</tool_call>";
+        // Only check if buffer ends with a partial match (not the complete token without newline)
+        // If buffer ends with "</tool_call>", that's not a partial token - it's missing the newline
+        if buffer.ends_with("</tool_call>") {
+            // This is a complete end tag, just missing the leading newline
+            // Not a partial token situation
+            return None;
+        }
+        // Use inclusive range to check if entire buffer could be a prefix
+        (1..=end_token.len().min(buffer.len()))
+            .find(|&i| end_token.starts_with(&buffer[buffer.len() - i..]))
+    }
+}
+
+impl Default for QwenParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for QwenParser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if text contains Qwen format
+        if !self.has_tool_markers(text) {
+            return Ok(vec![]);
+        }
+
+        // Extract all tool call blocks
+        let tool_blocks = self.extract_tool_calls(text);
+        let mut tools = Vec::new();
+
+        for (index, json_str) in tool_blocks.iter().enumerate() {
+            // Parse each JSON block
+            match serde_json::from_str::<Value>(json_str.trim()) {
+                Ok(value) => {
+                    if let Some(tool) = self.parse_single_object(&value, index)? {
+                        tools.push(tool);
+                    }
+                }
+                Err(_) => {
+                    // Skip malformed JSON blocks
+                    continue;
+                }
+            }
+        }
+
+        Ok(tools)
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check for partial token at end of buffer
+        if let Some(_partial_len) = self.ends_with_partial_token(&state.buffer) {
+            // Hold back the partial token
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Check if we have the start marker
+        if !self.has_tool_markers(&state.buffer) {
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Find start and end positions
+        if let Some(start_pos) = self.find_tool_start(&state.buffer) {
+            // Check if we have the complete tool call
+            if let Some(end_pos) = self.find_tool_end(&state.buffer, start_pos) {
+                // Extract the JSON content
+                let json_start = start_pos + "<tool_call>\n".len();
+                let json_end = end_pos - "\n</tool_call>".len();
+                let json_str = &state.buffer[json_start..json_end];
+
+                // Parse the complete JSON
+                match serde_json::from_str::<Value>(json_str.trim()) {
+                    Ok(value) => {
+                        if let Some(tool) = self.parse_single_object(&value, 0)? {
+                            // Clear the consumed part from buffer using drain for efficiency
+                            state.buffer.drain(..end_pos);
+                            return Ok(StreamResult::ToolComplete(tool));
+                        }
+                    }
+                    Err(_) => {
+                        // JSON parsing failed, might be incomplete
+                    }
+                }
+            } else {
+                // We have start but no end yet - try partial parsing
+                let json_start = start_pos + "<tool_call>\n".len();
+                let partial_json = &state.buffer[json_start..];
+
+                // Remove trailing newline if present (might be start of end token)
+                let partial_json = partial_json.trim_end();
+
+                // Try to parse with partial JSON parser
+                match self.partial_json.parse_value(partial_json) {
+                    Ok((value, _consumed)) => {
+                        // Extract tool name if available
+                        if let Some(name) = value.get("name").and_then(|v| v.as_str()) {
+                            // Check if we've already sent the name
+                            if !state.in_string {
+                                state.in_string = true; // Use as flag for "name sent"
+                                return Ok(StreamResult::ToolName {
+                                    index: 0,
+                                    name: name.to_string(),
+                                });
+                            }
+
+                            // Check for arguments
+                            if let Some(args) = value.get("arguments") {
+                                if let Ok(args_str) = serde_json::to_string(args) {
+                                    return Ok(StreamResult::ToolArguments {
+                                        index: 0,
+                                        arguments: args_str,
+                                    });
+                                }
+                            }
+                        }
+                    }
+                    Err(_) => {
+                        // Failed to parse even as partial JSON
+                        // Keep buffering
+                    }
+                }
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        // Check if text contains Qwen-specific markers. If not, it's not this format.
+        if !self.has_tool_markers(text) {
+            return false;
+        }
+
+        // Try to extract tool calls to see if we have a complete, valid one.
+        let tool_blocks = self.extract_tool_calls(text);
+        for json_str in &tool_blocks {
+            if let Ok(value) = serde_json::from_str::<Value>(json_str.trim()) {
+                if let Some(obj) = value.as_object() {
+                    if obj.contains_key("name") && obj.contains_key("arguments") {
+                        // Found a valid, complete tool call.
+                        return true;
+                    }
+                }
+            }
+        }
+
+        // If we have the marker but no valid complete tool call,
+        // it could be a partial stream. We should detect this as the format.
+        true
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_qwen_format() {
+        let parser = QwenParser::new();
+        let input = r#"<tool_call>
+{"name": "get_weather", "arguments": {"location": "Beijing", "units": "celsius"}}
+</tool_call>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Beijing"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_multiple_tools() {
+        let parser = QwenParser::new();
+        let input = r#"<tool_call>
+{"name": "search", "arguments": {"query": "rust programming"}}
+</tool_call>
+<tool_call>
+{"name": "calculate", "arguments": {"expression": "2 + 2"}}
+</tool_call>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "search");
+        assert_eq!(result[1].function.name, "calculate");
+    }
+
+    #[tokio::test]
+    async fn test_with_normal_text() {
+        let parser = QwenParser::new();
+        let input = r#"Let me help you with that.
+<tool_call>
+{"name": "get_info", "arguments": {"topic": "Rust"}}
+</tool_call>
+Here are the results."#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_info");
+    }
+
+    #[tokio::test]
+    async fn test_nested_json_structures() {
+        let parser = QwenParser::new();
+        let input = r#"<tool_call>
+{
+    "name": "process_data",
+    "arguments": {
+        "data": {
+            "nested": {
+                "array": [1, 2, 3],
+                "object": {"key": "value"}
+            }
+        }
+    }
+}
+</tool_call>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "process_data");
+        assert!(result[0].function.arguments.contains("nested"));
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = QwenParser::new();
+
+        assert!(parser.detect_format(
+            r#"<tool_call>
+{"name": "test", "arguments": {}}
+</tool_call>"#
+        ));
+
+        assert!(parser.detect_format(
+            r#"Text before <tool_call>
+{"name": "test", "arguments": {}}
+</tool_call> text after"#
+        ));
+
+        assert!(!parser.detect_format(r#"{"name": "test", "arguments": {}}"#));
+        assert!(!parser.detect_format("plain text"));
+
+        // Partial format should still be detected
+        assert!(parser.detect_format("<tool_call>"));
+    }
+
+    #[tokio::test]
+    async fn test_streaming_partial() {
+        let parser = QwenParser::new();
+        let mut state = ParseState::new();
+
+        // Simulate streaming chunks
+        let chunks = vec![
+            "<tool_call>\n",
+            r#"{"name": "search","#,
+            r#" "arguments": {"query":"#,
+            r#" "rust"}}"#,
+            "\n</tool_call>",
+        ];
+
+        let mut found_name = false;
+        let mut found_complete = false;
+
+        for chunk in chunks {
+            let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+
+            match result {
+                StreamResult::ToolName { name, .. } => {
+                    assert_eq!(name, "search");
+                    found_name = true;
+                }
+                StreamResult::ToolComplete(tool) => {
+                    assert_eq!(tool.function.name, "search");
+                    found_complete = true;
+                }
+                _ => {}
+            }
+        }
+
+        assert!(found_name || found_complete); // At least one should be found
+    }
+}
diff --git a/sgl-router/src/tool_parser/parsers/step3_parser.rs b/sgl-router/src/tool_parser/parsers/step3_parser.rs
new file mode 100644
index 000000000..721d5c037
--- /dev/null
+++ b/sgl-router/src/tool_parser/parsers/step3_parser.rs
@@ -0,0 +1,348 @@
+use async_trait::async_trait;
+use regex::Regex;
+use serde_json::Value;
+
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    state::ParseState,
+    traits::ToolParser,
+    types::{FunctionCall, StreamResult, ToolCall},
+};
+
+/// Step3 format parser for tool calls
+///
+/// Handles the Step3 specific format with steptml XML:
+/// `<｜tool_calls_begin｜><｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="{name}"><steptml:parameter name="{k}">{v}</steptml:parameter></steptml:invoke><｜tool_call_end｜><｜tool_calls_end｜>`
+///
+/// Features:
+/// - Unicode token delimiters
+/// - StepTML XML format for invocations
+/// - Support for multiple sequential tool calls
+pub struct Step3Parser {
+    /// Regex for extracting tool call blocks
+    tool_call_extractor: Regex,
+    /// Regex for extracting steptml invocations
+    invoke_extractor: Regex,
+    /// Regex for extracting parameters
+    param_extractor: Regex,
+}
+
+impl Step3Parser {
+    /// Create a new Step3 parser
+    pub fn new() -> Self {
+        // Pattern for individual tool calls
+        let tool_call_pattern = r"(?s)<｜tool_call_begin｜>.*?<｜tool_call_end｜>";
+        let tool_call_extractor = Regex::new(tool_call_pattern).expect("Valid regex pattern");
+
+        // Pattern for steptml invocations
+        let invoke_pattern = r#"(?s)<steptml:invoke name="([^"]+)">(.+?)</steptml:invoke>"#;
+        let invoke_extractor = Regex::new(invoke_pattern).expect("Valid regex pattern");
+
+        // Pattern for steptml parameters - using non-greedy match for values to handle < characters
+        let param_pattern = r#"(?s)<steptml:parameter name="([^"]+)">(.+?)</steptml:parameter>"#;
+        let param_extractor = Regex::new(param_pattern).expect("Valid regex pattern");
+
+        Self {
+            tool_call_extractor,
+            invoke_extractor,
+            param_extractor,
+        }
+    }
+
+    /// Check if text contains Step3 tool markers
+    fn has_tool_markers(&self, text: &str) -> bool {
+        text.contains("<｜tool_calls_begin｜>")
+    }
+
+    /// Parse parameters from steptml format
+    fn parse_steptml_parameters(
+        &self,
+        params_text: &str,
+    ) -> ToolParserResult<serde_json::Map<String, Value>> {
+        let mut parameters = serde_json::Map::new();
+
+        for capture in self.param_extractor.captures_iter(params_text) {
+            let param_name = capture.get(1).map_or("", |m| m.as_str()).trim();
+            let param_value_str = capture.get(2).map_or("", |m| m.as_str()).trim();
+
+            // Try to parse the value as JSON first, fallback to string
+            let param_value = if let Ok(json_val) = serde_json::from_str::<Value>(param_value_str) {
+                json_val
+            } else {
+                // Try parsing as Python literal
+                if param_value_str == "true" || param_value_str == "True" {
+                    Value::Bool(true)
+                } else if param_value_str == "false" || param_value_str == "False" {
+                    Value::Bool(false)
+                } else if param_value_str == "null" || param_value_str == "None" {
+                    Value::Null
+                } else if let Ok(num) = param_value_str.parse::<i64>() {
+                    Value::Number(num.into())
+                } else if let Ok(num) = param_value_str.parse::<f64>() {
+                    if let Some(n) = serde_json::Number::from_f64(num) {
+                        Value::Number(n)
+                    } else {
+                        Value::String(param_value_str.to_string())
+                    }
+                } else {
+                    Value::String(param_value_str.to_string())
+                }
+            };
+
+            parameters.insert(param_name.to_string(), param_value);
+        }
+
+        Ok(parameters)
+    }
+
+    /// Parse a single tool call block
+    fn parse_tool_call(&self, block: &str) -> ToolParserResult<Option<ToolCall>> {
+        // Check if it contains function marker and tool separator
+        if !block.contains("function") || !block.contains("<｜tool_sep｜>") {
+            return Ok(None);
+        }
+
+        // Split by tool separator
+        let parts: Vec<&str> = block.split("<｜tool_sep｜>").collect();
+        if parts.len() != 2 {
+            return Ok(None);
+        }
+
+        // Check if it's a function type
+        if !parts[0].contains("function") {
+            return Ok(None);
+        }
+
+        let invoke_part = parts[1];
+
+        // Extract steptml invoke
+        if let Some(captures) = self.invoke_extractor.captures(invoke_part) {
+            let func_name = captures.get(1).map_or("", |m| m.as_str()).trim();
+
+            // Validate function name is not empty
+            if func_name.is_empty() {
+                return Ok(None);
+            }
+
+            let params_text = captures.get(2).map_or("", |m| m.as_str());
+
+            // Parse parameters
+            let parameters = self.parse_steptml_parameters(params_text)?;
+
+            let arguments_str = serde_json::to_string(&parameters)
+                .map_err(|e| ToolParserError::ParsingFailed(e.to_string()))?;
+
+            // Generate ID
+            let id = format!("step3_call_{}", uuid::Uuid::new_v4());
+
+            Ok(Some(ToolCall {
+                id,
+                r#type: "function".to_string(),
+                function: FunctionCall {
+                    name: func_name.to_string(),
+                    arguments: arguments_str,
+                },
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+impl Default for Step3Parser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl ToolParser for Step3Parser {
+    async fn parse_complete(&self, text: &str) -> ToolParserResult<Vec<ToolCall>> {
+        // Check if text contains Step3 format
+        if !self.has_tool_markers(text) {
+            return Ok(vec![]);
+        }
+
+        // Find the tool calls section
+        if let Some(start_pos) = text.find("<｜tool_calls_begin｜>") {
+            let search_from = start_pos + "<｜tool_calls_begin｜>".len();
+
+            // Find the end of tool calls section
+            if let Some(end_pos) = text[search_from..].find("<｜tool_calls_end｜>") {
+                let tool_section = &text[search_from..search_from + end_pos];
+
+                // Extract all tool call blocks
+                let mut tools = Vec::new();
+                for mat in self.tool_call_extractor.find_iter(tool_section) {
+                    if let Some(tool) = self.parse_tool_call(mat.as_str())? {
+                        tools.push(tool);
+                    }
+                }
+
+                return Ok(tools);
+            }
+        }
+
+        Ok(vec![])
+    }
+
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult> {
+        state.buffer.push_str(chunk);
+
+        // Check for tool markers
+        if !self.has_tool_markers(&state.buffer) {
+            // No markers found, return as incomplete
+            return Ok(StreamResult::Incomplete);
+        }
+
+        // Look for start of tool calls
+        if let Some(start_pos) = state.buffer.find("<｜tool_calls_begin｜>") {
+            let search_from = start_pos + "<｜tool_calls_begin｜>".len();
+
+            // Look for individual tool call start
+            if let Some(call_start) = state.buffer[search_from..].find("<｜tool_call_begin｜>") {
+                let call_start_abs = search_from + call_start;
+
+                // Look for the end of this tool call
+                let search_end_from = call_start_abs + "<｜tool_call_begin｜>".len();
+                if let Some(call_end) = state.buffer[search_end_from..].find("<｜tool_call_end｜>")
+                {
+                    let call_end_abs = search_end_from + call_end + "<｜tool_call_end｜>".len();
+
+                    // Extract and parse the complete tool call
+                    let tool_call_text = &state.buffer[call_start_abs..call_end_abs];
+
+                    if let Some(tool) = self.parse_tool_call(tool_call_text)? {
+                        // Remove the processed part from buffer
+                        state.buffer.drain(..call_end_abs);
+
+                        return Ok(StreamResult::ToolComplete(tool));
+                    }
+                } else {
+                    // Tool call not complete yet, try to extract partial info
+                    let partial = &state.buffer[search_end_from..];
+
+                    // Check for tool separator
+                    if let Some(sep_pos) = partial.find("<｜tool_sep｜>") {
+                        // Check if it's a function
+                        if partial[..sep_pos].contains("function") {
+                            let after_sep = &partial[sep_pos + "<｜tool_sep｜>".len()..];
+
+                            // Try to extract function name from steptml:invoke
+                            if let Some(name_match) = self.invoke_extractor.captures(after_sep) {
+                                let func_name = name_match.get(1).map_or("", |m| m.as_str()).trim();
+
+                                if !state.in_string && !func_name.is_empty() {
+                                    state.in_string = true; // Mark name as sent
+                                    return Ok(StreamResult::ToolName {
+                                        index: 0,
+                                        name: func_name.to_string(),
+                                    });
+                                }
+
+                                // Try to extract partial parameters
+                                if let Some(params_text) = name_match.get(2) {
+                                    let parameters =
+                                        self.parse_steptml_parameters(params_text.as_str())?;
+
+                                    if !parameters.is_empty() {
+                                        let args_str = serde_json::to_string(&parameters)
+                                            .unwrap_or_else(|_| "{}".to_string());
+
+                                        return Ok(StreamResult::ToolArguments {
+                                            index: 0,
+                                            arguments: args_str,
+                                        });
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(StreamResult::Incomplete)
+    }
+
+    fn detect_format(&self, text: &str) -> bool {
+        self.has_tool_markers(text)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_step3_single_tool() {
+        let parser = Step3Parser::new();
+        let input = r#"Some text
+<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="get_weather">
+<steptml:parameter name="location">Tokyo</steptml:parameter>
+<steptml:parameter name="units">celsius</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>More text"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "get_weather");
+        assert!(result[0].function.arguments.contains("Tokyo"));
+        assert!(result[0].function.arguments.contains("celsius"));
+    }
+
+    #[tokio::test]
+    async fn test_parse_step3_multiple_tools() {
+        let parser = Step3Parser::new();
+        let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="search">
+<steptml:parameter name="query">rust programming</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="calculate">
+<steptml:parameter name="expression">2 + 2</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2);
+        assert_eq!(result[0].function.name, "search");
+        assert_eq!(result[1].function.name, "calculate");
+    }
+
+    #[tokio::test]
+    async fn test_parse_step3_mixed_types() {
+        let parser = Step3Parser::new();
+        let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="process_data">
+<steptml:parameter name="count">42</steptml:parameter>
+<steptml:parameter name="active">true</steptml:parameter>
+<steptml:parameter name="rate">1.5</steptml:parameter>
+<steptml:parameter name="name">test</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "process_data");
+
+        // Parse arguments to check types
+        let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+        assert_eq!(args["count"], 42);
+        assert_eq!(args["active"], true);
+        assert_eq!(args["rate"], 1.5);
+        assert_eq!(args["name"], "test");
+    }
+
+    #[test]
+    fn test_detect_format() {
+        let parser = Step3Parser::new();
+        assert!(parser.detect_format("<｜tool_calls_begin｜>"));
+        assert!(!parser.detect_format("plain text"));
+        assert!(!parser.detect_format("[TOOL_CALLS]"));
+    }
+}
diff --git a/sgl-router/src/tool_parser/partial_json.rs b/sgl-router/src/tool_parser/partial_json.rs
new file mode 100644
index 000000000..4a4504fe0
--- /dev/null
+++ b/sgl-router/src/tool_parser/partial_json.rs
@@ -0,0 +1,527 @@
+use crate::tool_parser::{
+    errors::{ToolParserError, ToolParserResult},
+    traits::PartialJsonParser,
+};
+use serde_json::{Map, Value};
+
+/// Parser for incomplete JSON
+pub struct PartialJson {
+    /// Maximum depth for nested structures
+    max_depth: usize,
+    /// Whether to allow incomplete values
+    allow_incomplete: bool,
+}
+
+impl PartialJson {
+    /// Create a new partial JSON parser
+    pub fn new(max_depth: usize, allow_incomplete: bool) -> Self {
+        Self {
+            max_depth,
+            allow_incomplete,
+        }
+    }
+
+    /// Parse potentially incomplete JSON, returning parsed value and consumed bytes
+    pub fn parse_value(&self, input: &str) -> ToolParserResult<(Value, usize)> {
+        let mut parser = Parser::new(input, self.max_depth, self.allow_incomplete);
+        let value = parser.parse_value(0)?;
+        Ok((value, parser.position))
+    }
+}
+
+impl Default for PartialJson {
+    fn default() -> Self {
+        Self::new(32, true)
+    }
+}
+
+impl PartialJsonParser for PartialJson {
+    fn parse(&self, input: &str) -> ToolParserResult<(Value, usize)> {
+        self.parse_value(input)
+    }
+
+    fn is_complete(&self, input: &str) -> bool {
+        // Try to parse as complete JSON
+        serde_json::from_str::<Value>(input).is_ok()
+    }
+
+    fn max_depth(&self) -> usize {
+        self.max_depth
+    }
+}
+
+/// Internal parser state
+struct Parser<'a> {
+    chars: std::iter::Peekable<std::str::Chars<'a>>,
+    position: usize,
+    max_depth: usize,
+    allow_incomplete: bool,
+}
+
+impl<'a> Parser<'a> {
+    fn new(input: &'a str, max_depth: usize, allow_incomplete: bool) -> Self {
+        Self {
+            chars: input.chars().peekable(),
+            position: 0,
+            max_depth,
+            allow_incomplete,
+        }
+    }
+
+    fn peek(&mut self) -> Option<char> {
+        self.chars.peek().copied()
+    }
+
+    fn advance(&mut self) {
+        if self.chars.next().is_some() {
+            self.position += 1;
+        }
+    }
+
+    fn skip_whitespace(&mut self) {
+        while let Some(ch) = self.peek() {
+            if ch.is_whitespace() {
+                self.advance();
+            } else {
+                break;
+            }
+        }
+    }
+
+    fn parse_value(&mut self, depth: usize) -> ToolParserResult<Value> {
+        if depth > self.max_depth {
+            return Err(ToolParserError::DepthExceeded(self.max_depth));
+        }
+
+        self.skip_whitespace();
+
+        match self.peek() {
+            Some('{') => self.parse_object(depth + 1),
+            Some('[') => self.parse_array(depth + 1),
+            Some('"') => self.parse_string(),
+            Some('t') | Some('f') => self.parse_bool(),
+            Some('n') => self.parse_null(),
+            Some(c) if c == '-' || c.is_ascii_digit() => self.parse_number(),
+            _ => {
+                if self.allow_incomplete {
+                    Ok(Value::Null)
+                } else {
+                    Err(ToolParserError::ParsingFailed(
+                        "Unexpected character".into(),
+                    ))
+                }
+            }
+        }
+    }
+
+    fn parse_object(&mut self, depth: usize) -> ToolParserResult<Value> {
+        if depth > self.max_depth {
+            return Err(ToolParserError::DepthExceeded(self.max_depth));
+        }
+
+        let mut object = Map::new();
+
+        // Consume '{'
+        self.advance();
+        self.skip_whitespace();
+
+        // Check for empty object
+        if self.peek() == Some('}') {
+            self.advance();
+            return Ok(Value::Object(object));
+        }
+
+        loop {
+            // Parse key
+            let key = match self.parse_string() {
+                Ok(Value::String(s)) => s,
+                Err(_) if self.allow_incomplete => {
+                    // Incomplete object
+                    return Ok(Value::Object(object));
+                }
+                Err(e) => return Err(e),
+                _ => return Err(ToolParserError::ParsingFailed("Expected string key".into())),
+            };
+
+            self.skip_whitespace();
+
+            // Expect ':'
+            if self.peek() != Some(':') {
+                if self.allow_incomplete {
+                    // Add null value for incomplete pair
+                    object.insert(key, Value::Null);
+                    return Ok(Value::Object(object));
+                }
+                return Err(ToolParserError::ParsingFailed("Expected ':'".into()));
+            }
+            self.advance();
+            self.skip_whitespace();
+
+            // Parse value (keep same depth - we already incremented in parse_object)
+            let value = match self.parse_value(depth) {
+                Ok(v) => v,
+                Err(_) if self.allow_incomplete => {
+                    // Add null for incomplete value
+                    object.insert(key, Value::Null);
+                    return Ok(Value::Object(object));
+                }
+                Err(e) => return Err(e),
+            };
+
+            object.insert(key, value);
+            self.skip_whitespace();
+
+            match self.peek() {
+                Some(',') => {
+                    self.advance();
+                    self.skip_whitespace();
+                    // Check for trailing comma
+                    if self.peek() == Some('}') {
+                        self.advance();
+                        return Ok(Value::Object(object));
+                    }
+                }
+                Some('}') => {
+                    self.advance();
+                    return Ok(Value::Object(object));
+                }
+                None if self.allow_incomplete => {
+                    return Ok(Value::Object(object));
+                }
+                _ => {
+                    if self.allow_incomplete {
+                        return Ok(Value::Object(object));
+                    }
+                    return Err(ToolParserError::ParsingFailed("Expected ',' or '}'".into()));
+                }
+            }
+        }
+    }
+
+    fn parse_array(&mut self, depth: usize) -> ToolParserResult<Value> {
+        if depth > self.max_depth {
+            return Err(ToolParserError::DepthExceeded(self.max_depth));
+        }
+
+        let mut array = Vec::new();
+
+        // Consume '['
+        self.advance();
+        self.skip_whitespace();
+
+        // Check for empty array
+        if self.peek() == Some(']') {
+            self.advance();
+            return Ok(Value::Array(array));
+        }
+
+        loop {
+            // Parse value (keep same depth - we already incremented in parse_object)
+            let value = match self.parse_value(depth) {
+                Ok(v) => v,
+                Err(_) if self.allow_incomplete => {
+                    return Ok(Value::Array(array));
+                }
+                Err(e) => return Err(e),
+            };
+
+            array.push(value);
+            self.skip_whitespace();
+
+            match self.peek() {
+                Some(',') => {
+                    self.advance();
+                    self.skip_whitespace();
+                    // Check for trailing comma
+                    if self.peek() == Some(']') {
+                        self.advance();
+                        return Ok(Value::Array(array));
+                    }
+                }
+                Some(']') => {
+                    self.advance();
+                    return Ok(Value::Array(array));
+                }
+                None if self.allow_incomplete => {
+                    return Ok(Value::Array(array));
+                }
+                _ => {
+                    if self.allow_incomplete {
+                        return Ok(Value::Array(array));
+                    }
+                    return Err(ToolParserError::ParsingFailed("Expected ',' or ']'".into()));
+                }
+            }
+        }
+    }
+
+    fn parse_string(&mut self) -> ToolParserResult<Value> {
+        if self.peek() != Some('"') {
+            return Err(ToolParserError::ParsingFailed("Expected '\"'".into()));
+        }
+
+        // Consume opening quote
+        self.advance();
+
+        let mut string = String::new();
+        let mut escaped = false;
+
+        while let Some(ch) = self.peek() {
+            if escaped {
+                // Handle escape sequences
+                let escaped_char = match ch {
+                    '"' | '\\' | '/' => ch,
+                    'b' => '\u{0008}',
+                    'f' => '\u{000C}',
+                    'n' => '\n',
+                    'r' => '\r',
+                    't' => '\t',
+                    'u' => {
+                        // Unicode escape
+                        self.advance();
+                        let hex = self.parse_unicode_escape()?;
+                        string.push(hex);
+                        escaped = false;
+                        continue;
+                    }
+                    _ => ch, // Invalid escape, but be lenient
+                };
+                string.push(escaped_char);
+                escaped = false;
+            } else if ch == '\\' {
+                escaped = true;
+            } else if ch == '"' {
+                // End of string
+                self.advance();
+                return Ok(Value::String(string));
+            } else {
+                string.push(ch);
+            }
+            self.advance();
+        }
+
+        // Incomplete string
+        if self.allow_incomplete {
+            Ok(Value::String(string))
+        } else {
+            Err(ToolParserError::ParsingFailed("Unterminated string".into()))
+        }
+    }
+
+    fn parse_unicode_escape(&mut self) -> ToolParserResult<char> {
+        let mut hex = String::new();
+        for _ in 0..4 {
+            if let Some(ch) = self.peek() {
+                if ch.is_ascii_hexdigit() {
+                    hex.push(ch);
+                    self.advance();
+                } else {
+                    break;
+                }
+            } else {
+                break;
+            }
+        }
+
+        if hex.len() == 4 {
+            u32::from_str_radix(&hex, 16)
+                .ok()
+                .and_then(char::from_u32)
+                .ok_or_else(|| ToolParserError::ParsingFailed("Invalid unicode escape".into()))
+        } else if self.allow_incomplete {
+            Ok('\u{FFFD}') // Replacement character
+        } else {
+            Err(ToolParserError::ParsingFailed(
+                "Incomplete unicode escape".into(),
+            ))
+        }
+    }
+
+    fn parse_number(&mut self) -> ToolParserResult<Value> {
+        let mut number = String::new();
+
+        // Handle negative sign
+        if self.peek() == Some('-') {
+            number.push('-');
+            self.advance();
+        }
+
+        // Parse integer part
+        if self.peek() == Some('0') {
+            number.push('0');
+            self.advance();
+        } else {
+            while let Some(ch) = self.peek() {
+                if ch.is_ascii_digit() {
+                    number.push(ch);
+                    self.advance();
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Parse decimal part
+        if self.peek() == Some('.') {
+            number.push('.');
+            self.advance();
+
+            while let Some(ch) = self.peek() {
+                if ch.is_ascii_digit() {
+                    number.push(ch);
+                    self.advance();
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Parse exponent
+        if let Some(ch) = self.peek() {
+            if ch == 'e' || ch == 'E' {
+                number.push(ch);
+                self.advance();
+
+                if let Some(sign) = self.peek() {
+                    if sign == '+' || sign == '-' {
+                        number.push(sign);
+                        self.advance();
+                    }
+                }
+
+                while let Some(ch) = self.peek() {
+                    if ch.is_ascii_digit() {
+                        number.push(ch);
+                        self.advance();
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+
+        // Try to parse as integer first, then as float
+        if let Ok(n) = number.parse::<i64>() {
+            Ok(Value::Number(serde_json::Number::from(n)))
+        } else if let Ok(n) = number.parse::<f64>() {
+            Ok(Value::Number(
+                serde_json::Number::from_f64(n).unwrap_or_else(|| serde_json::Number::from(0)),
+            ))
+        } else if self.allow_incomplete {
+            Ok(Value::Number(serde_json::Number::from(0)))
+        } else {
+            Err(ToolParserError::ParsingFailed("Invalid number".into()))
+        }
+    }
+
+    fn parse_bool(&mut self) -> ToolParserResult<Value> {
+        let mut word = String::new();
+
+        // Peek at upcoming characters to validate it looks like a boolean
+        let mut temp_chars = self.chars.clone();
+        while let Some(&ch) = temp_chars.peek() {
+            if ch.is_alphabetic() && word.len() < 5 {
+                // "false" is 5 chars
+                word.push(ch);
+                temp_chars.next();
+            } else {
+                break;
+            }
+        }
+
+        // Check if it's a valid boolean prefix
+        let is_valid = word == "true"
+            || word == "false"
+            || (self.allow_incomplete && ("true".starts_with(&word) || "false".starts_with(&word)));
+
+        if !is_valid {
+            return Err(ToolParserError::ParsingFailed("Invalid boolean".into()));
+        }
+
+        // Now actually consume the characters
+        word.clear();
+        while let Some(ch) = self.peek() {
+            if ch.is_alphabetic() {
+                word.push(ch);
+                self.advance();
+            } else {
+                break;
+            }
+        }
+
+        match word.as_str() {
+            "true" => Ok(Value::Bool(true)),
+            "false" => Ok(Value::Bool(false)),
+            partial if self.allow_incomplete => {
+                if "true".starts_with(partial) {
+                    Ok(Value::Bool(true))
+                } else if "false".starts_with(partial) {
+                    Ok(Value::Bool(false))
+                } else {
+                    Err(ToolParserError::ParsingFailed("Invalid boolean".into()))
+                }
+            }
+            _ => Err(ToolParserError::ParsingFailed("Invalid boolean".into())),
+        }
+    }
+
+    fn parse_null(&mut self) -> ToolParserResult<Value> {
+        let mut word = String::new();
+
+        // Peek at upcoming characters to validate it looks like "null"
+        let mut temp_chars = self.chars.clone();
+        while let Some(&ch) = temp_chars.peek() {
+            if ch.is_alphabetic() && word.len() < 4 {
+                // "null" is 4 chars
+                word.push(ch);
+                temp_chars.next();
+            } else {
+                break;
+            }
+        }
+
+        // Check if it's a valid null prefix
+        let is_valid = word == "null" || (self.allow_incomplete && "null".starts_with(&word));
+
+        if !is_valid {
+            return Err(ToolParserError::ParsingFailed("Invalid null".into()));
+        }
+
+        // Now actually consume the characters
+        word.clear();
+        while let Some(ch) = self.peek() {
+            if ch.is_alphabetic() {
+                word.push(ch);
+                self.advance();
+            } else {
+                break;
+            }
+        }
+
+        if word == "null" || (self.allow_incomplete && "null".starts_with(&word)) {
+            Ok(Value::Null)
+        } else {
+            Err(ToolParserError::ParsingFailed("Invalid null".into()))
+        }
+    }
+}
+
+/// Utility function to check if a string contains complete JSON
+pub fn is_complete_json(input: &str) -> bool {
+    serde_json::from_str::<Value>(input).is_ok()
+}
+
+/// Utility function to find common prefix between two strings
+pub fn find_common_prefix(s1: &str, s2: &str) -> usize {
+    s1.chars()
+        .zip(s2.chars())
+        .take_while(|(a, b)| a == b)
+        .count()
+}
+
+/// Utility function to compute diff between old and new strings
+pub fn compute_diff(old: &str, new: &str) -> String {
+    let common_len = find_common_prefix(old, new);
+    // Convert character count to byte offset
+    new.chars().skip(common_len).collect()
+}
diff --git a/sgl-router/src/tool_parser/python_literal_parser.rs b/sgl-router/src/tool_parser/python_literal_parser.rs
new file mode 100644
index 000000000..4acc69d34
--- /dev/null
+++ b/sgl-router/src/tool_parser/python_literal_parser.rs
@@ -0,0 +1,442 @@
+/// Minimal Python literal parser for Pythonic tool call format
+///
+/// This module provides a recursive descent parser for Python literals
+/// (strings, numbers, booleans, None, lists, dicts) without requiring
+/// a full Python AST parser.
+use serde_json::{json, Value};
+use std::collections::HashMap;
+
+use crate::tool_parser::errors::{ToolParserError, ToolParserResult};
+
+/// Token types for Python literals
+#[derive(Debug, Clone, PartialEq)]
+enum Token {
+    // Literals
+    String(String),
+    Number(String),
+    True,
+    False,
+    None,
+
+    // Delimiters
+    LeftBracket,  // [
+    RightBracket, // ]
+    LeftBrace,    // {
+    RightBrace,   // }
+    LeftParen,    // (
+    RightParen,   // )
+    Comma,        // ,
+    Colon,        // :
+    Equals,       // =
+
+    // Identifier for function names
+    Identifier(String),
+
+    // End of input
+    Eof,
+}
+
+/// Lexer for Python literals
+struct Lexer {
+    input: Vec<char>,
+    position: usize,
+}
+
+impl Lexer {
+    fn new(input: &str) -> Self {
+        Self {
+            input: input.chars().collect(),
+            position: 0,
+        }
+    }
+
+    fn current_char(&self) -> Option<char> {
+        self.input.get(self.position).copied()
+    }
+
+    fn advance(&mut self) {
+        if self.position < self.input.len() {
+            self.position += 1;
+        }
+    }
+
+    fn skip_whitespace(&mut self) {
+        while let Some(ch) = self.current_char() {
+            if ch.is_whitespace() {
+                self.advance();
+            } else {
+                break;
+            }
+        }
+    }
+
+    fn read_string(&mut self, quote_char: char) -> Result<String, ToolParserError> {
+        let mut result = String::new();
+        self.advance(); // Skip opening quote
+
+        while let Some(ch) = self.current_char() {
+            if ch == '\\' {
+                self.advance();
+                if let Some(escaped) = self.current_char() {
+                    match escaped {
+                        'n' => result.push('\n'),
+                        't' => result.push('\t'),
+                        'r' => result.push('\r'),
+                        '\\' => result.push('\\'),
+                        '\'' => result.push('\''),
+                        '"' => result.push('"'),
+                        _ => {
+                            result.push('\\');
+                            result.push(escaped);
+                        }
+                    }
+                    self.advance();
+                }
+            } else if ch == quote_char {
+                self.advance(); // Skip closing quote
+                return Ok(result);
+            } else {
+                result.push(ch);
+                self.advance();
+            }
+        }
+
+        Err(ToolParserError::ParsingFailed("Unterminated string".into()))
+    }
+
+    fn read_number(&mut self) -> String {
+        let mut result = String::new();
+
+        // Handle negative numbers
+        if self.current_char() == Some('-') {
+            result.push('-');
+            self.advance();
+        }
+
+        // Read digits and decimal point
+        while let Some(ch) = self.current_char() {
+            if ch.is_ascii_digit() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-'
+            {
+                result.push(ch);
+                self.advance();
+            } else {
+                break;
+            }
+        }
+
+        result
+    }
+
+    fn read_identifier(&mut self) -> String {
+        let mut result = String::new();
+
+        while let Some(ch) = self.current_char() {
+            if ch.is_alphanumeric() || ch == '_' {
+                result.push(ch);
+                self.advance();
+            } else {
+                break;
+            }
+        }
+
+        result
+    }
+
+    fn next_token(&mut self) -> Result<Token, ToolParserError> {
+        self.skip_whitespace();
+
+        match self.current_char() {
+            None => Ok(Token::Eof),
+            Some('[') => {
+                self.advance();
+                Ok(Token::LeftBracket)
+            }
+            Some(']') => {
+                self.advance();
+                Ok(Token::RightBracket)
+            }
+            Some('{') => {
+                self.advance();
+                Ok(Token::LeftBrace)
+            }
+            Some('}') => {
+                self.advance();
+                Ok(Token::RightBrace)
+            }
+            Some('(') => {
+                self.advance();
+                Ok(Token::LeftParen)
+            }
+            Some(')') => {
+                self.advance();
+                Ok(Token::RightParen)
+            }
+            Some(',') => {
+                self.advance();
+                Ok(Token::Comma)
+            }
+            Some(':') => {
+                self.advance();
+                Ok(Token::Colon)
+            }
+            Some('=') => {
+                self.advance();
+                Ok(Token::Equals)
+            }
+            Some('"') => Ok(Token::String(self.read_string('"')?)),
+            Some('\'') => Ok(Token::String(self.read_string('\'')?)),
+            Some(ch) if ch == '-' || ch.is_ascii_digit() => Ok(Token::Number(self.read_number())),
+            Some(ch) if ch.is_alphabetic() || ch == '_' => {
+                let ident = self.read_identifier();
+                match ident.as_str() {
+                    "True" => Ok(Token::True),
+                    "False" => Ok(Token::False),
+                    "None" => Ok(Token::None),
+                    _ => Ok(Token::Identifier(ident)),
+                }
+            }
+            Some(ch) => Err(ToolParserError::ParsingFailed(format!(
+                "Unexpected character: {}",
+                ch
+            ))),
+        }
+    }
+}
+
+/// Parser for Python literals
+pub struct PythonLiteralParser {
+    lexer: Lexer,
+    current_token: Token,
+}
+
+impl PythonLiteralParser {
+    pub fn new(input: &str) -> Result<Self, ToolParserError> {
+        let mut lexer = Lexer::new(input);
+        let current_token = lexer.next_token()?;
+        Ok(Self {
+            lexer,
+            current_token,
+        })
+    }
+
+    fn advance(&mut self) -> Result<(), ToolParserError> {
+        self.current_token = self.lexer.next_token()?;
+        Ok(())
+    }
+
+    fn expect(&mut self, expected: Token) -> Result<(), ToolParserError> {
+        if self.current_token == expected {
+            self.advance()?;
+            Ok(())
+        } else {
+            Err(ToolParserError::ParsingFailed(format!(
+                "Expected {:?}, got {:?}",
+                expected, self.current_token
+            )))
+        }
+    }
+
+    /// Parse a Python literal value
+    pub fn parse_value(&mut self) -> Result<Value, ToolParserError> {
+        match &self.current_token.clone() {
+            Token::String(s) => {
+                let value = s.clone();
+                self.advance()?;
+                Ok(json!(value))
+            }
+            Token::Number(n) => {
+                let value = if let Ok(int_val) = n.parse::<i64>() {
+                    json!(int_val)
+                } else if let Ok(float_val) = n.parse::<f64>() {
+                    json!(float_val)
+                } else {
+                    return Err(ToolParserError::ParsingFailed(format!(
+                        "Invalid number: {}",
+                        n
+                    )));
+                };
+                self.advance()?;
+                Ok(value)
+            }
+            Token::True => {
+                self.advance()?;
+                Ok(json!(true))
+            }
+            Token::False => {
+                self.advance()?;
+                Ok(json!(false))
+            }
+            Token::None => {
+                self.advance()?;
+                Ok(Value::Null)
+            }
+            Token::LeftBracket => self.parse_list(),
+            Token::LeftBrace => self.parse_dict(),
+            _ => Err(ToolParserError::ParsingFailed(format!(
+                "Unexpected token: {:?}",
+                self.current_token
+            ))),
+        }
+    }
+
+    /// Parse a Python list: [item1, item2, ...]
+    fn parse_list(&mut self) -> Result<Value, ToolParserError> {
+        self.expect(Token::LeftBracket)?;
+        let mut items = Vec::new();
+
+        // Handle empty list
+        if self.current_token == Token::RightBracket {
+            self.advance()?;
+            return Ok(json!(items));
+        }
+
+        loop {
+            items.push(self.parse_value()?);
+
+            if self.current_token == Token::Comma {
+                self.advance()?;
+                // Handle trailing comma
+                if self.current_token == Token::RightBracket {
+                    break;
+                }
+            } else if self.current_token == Token::RightBracket {
+                break;
+            } else {
+                return Err(ToolParserError::ParsingFailed(format!(
+                    "Expected ',' or ']', got {:?}",
+                    self.current_token
+                )));
+            }
+        }
+
+        self.expect(Token::RightBracket)?;
+        Ok(json!(items))
+    }
+
+    /// Parse a Python dict: {key1: value1, key2: value2, ...}
+    fn parse_dict(&mut self) -> Result<Value, ToolParserError> {
+        self.expect(Token::LeftBrace)?;
+        let mut map = HashMap::new();
+
+        // Handle empty dict
+        if self.current_token == Token::RightBrace {
+            self.advance()?;
+            return Ok(json!(map));
+        }
+
+        loop {
+            // Parse key (must be a string)
+            let key = match &self.current_token {
+                Token::String(s) => {
+                    let k = s.clone();
+                    self.advance()?;
+                    k
+                }
+                _ => {
+                    return Err(ToolParserError::ParsingFailed(format!(
+                        "Expected string key, got {:?}",
+                        self.current_token
+                    )))
+                }
+            };
+
+            self.expect(Token::Colon)?;
+
+            // Parse value
+            let value = self.parse_value()?;
+            map.insert(key, value);
+
+            if self.current_token == Token::Comma {
+                self.advance()?;
+                // Handle trailing comma
+                if self.current_token == Token::RightBrace {
+                    break;
+                }
+            } else if self.current_token == Token::RightBrace {
+                break;
+            } else {
+                return Err(ToolParserError::ParsingFailed(format!(
+                    "Expected ',' or '}}', got {:?}",
+                    self.current_token
+                )));
+            }
+        }
+
+        self.expect(Token::RightBrace)?;
+        Ok(json!(map))
+    }
+}
+
+/// Parse a Python literal string into a JSON value
+pub fn parse_python_literal(input: &str) -> ToolParserResult<Value> {
+    let mut parser = PythonLiteralParser::new(input)?;
+    parser.parse_value()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_primitives() {
+        assert_eq!(parse_python_literal("True").unwrap(), json!(true));
+        assert_eq!(parse_python_literal("False").unwrap(), json!(false));
+        assert_eq!(parse_python_literal("None").unwrap(), Value::Null);
+        assert_eq!(parse_python_literal("42").unwrap(), json!(42));
+        assert_eq!(parse_python_literal("12.345").unwrap(), json!(12.345));
+        assert_eq!(parse_python_literal("-42").unwrap(), json!(-42));
+        assert_eq!(parse_python_literal("\"hello\"").unwrap(), json!("hello"));
+        assert_eq!(parse_python_literal("'world'").unwrap(), json!("world"));
+    }
+
+    #[test]
+    fn test_parse_list() {
+        assert_eq!(parse_python_literal("[]").unwrap(), json!([]));
+        assert_eq!(parse_python_literal("[1, 2, 3]").unwrap(), json!([1, 2, 3]));
+        assert_eq!(
+            parse_python_literal("[\"a\", \"b\", \"c\"]").unwrap(),
+            json!(["a", "b", "c"])
+        );
+        assert_eq!(
+            parse_python_literal("[True, False, None]").unwrap(),
+            json!([true, false, null])
+        );
+        // Nested list
+        assert_eq!(
+            parse_python_literal("[[1, 2], [3, 4]]").unwrap(),
+            json!([[1, 2], [3, 4]])
+        );
+    }
+
+    #[test]
+    fn test_parse_dict() {
+        assert_eq!(parse_python_literal("{}").unwrap(), json!({}));
+        assert_eq!(
+            parse_python_literal("{\"a\": 1, \"b\": 2}").unwrap(),
+            json!({"a": 1, "b": 2})
+        );
+        assert_eq!(
+            parse_python_literal("{'x': True, 'y': False}").unwrap(),
+            json!({"x": true, "y": false})
+        );
+        // Nested dict
+        assert_eq!(
+            parse_python_literal("{\"nested\": {\"value\": [1, 2, 3]}}").unwrap(),
+            json!({"nested": {"value": [1, 2, 3]}})
+        );
+    }
+
+    #[test]
+    fn test_complex_nested() {
+        let input = r#"{"config": {"nested": {"value": [1, 2, 3]}, "enabled": True}}"#;
+        let expected = json!({
+            "config": {
+                "nested": {
+                    "value": [1, 2, 3]
+                },
+                "enabled": true
+            }
+        });
+        assert_eq!(parse_python_literal(input).unwrap(), expected);
+    }
+}
diff --git a/sgl-router/src/tool_parser/registry.rs b/sgl-router/src/tool_parser/registry.rs
new file mode 100644
index 000000000..f694d680c
--- /dev/null
+++ b/sgl-router/src/tool_parser/registry.rs
@@ -0,0 +1,224 @@
+use crate::tool_parser::parsers::{
+    DeepSeekParser, Glm4MoeParser, GptOssParser, JsonParser, KimiK2Parser, LlamaParser,
+    MistralParser, PythonicParser, QwenParser, Step3Parser,
+};
+use crate::tool_parser::traits::ToolParser;
+use once_cell::sync::Lazy;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Global singleton registry instance - created once and reused
+pub static GLOBAL_REGISTRY: Lazy<ParserRegistry> = Lazy::new(ParserRegistry::new_internal);
+
+/// Registry for tool parsers and model mappings
+pub struct ParserRegistry {
+    /// Map of parser name to parser instance
+    parsers: HashMap<String, Arc<dyn ToolParser>>,
+    /// Map of model name/pattern to parser name
+    model_mapping: HashMap<String, String>,
+    /// Default parser to use when no match found
+    default_parser: String,
+}
+
+impl ParserRegistry {
+    /// Get the global singleton instance
+    pub fn new() -> &'static Self {
+        &GLOBAL_REGISTRY
+    }
+
+    /// Create a new instance for testing (not the singleton)
+    #[cfg(test)]
+    pub fn new_for_testing() -> Self {
+        Self::new_internal()
+    }
+
+    /// Internal constructor for creating the singleton instance
+    fn new_internal() -> Self {
+        let mut registry = Self {
+            parsers: HashMap::new(),
+            model_mapping: HashMap::new(),
+            default_parser: "json".to_string(),
+        };
+
+        // Register default parsers
+        registry.register_default_parsers();
+
+        // Register default model mappings
+        registry.register_default_mappings();
+
+        registry
+    }
+
+    /// Register a parser
+    pub fn register_parser(&mut self, name: impl Into<String>, parser: Arc<dyn ToolParser>) {
+        self.parsers.insert(name.into(), parser);
+    }
+
+    /// Map a model name/pattern to a parser
+    pub fn map_model(&mut self, model: impl Into<String>, parser: impl Into<String>) {
+        self.model_mapping.insert(model.into(), parser.into());
+    }
+
+    /// Get parser for a specific model
+    pub fn get_parser(&self, model: &str) -> Option<Arc<dyn ToolParser>> {
+        // Try exact match first
+        if let Some(parser_name) = self.model_mapping.get(model) {
+            if let Some(parser) = self.parsers.get(parser_name) {
+                return Some(parser.clone());
+            }
+        }
+
+        // Try prefix matching with more specific patterns first
+        // Collect all matching patterns and sort by specificity (longer = more specific)
+        let mut matches: Vec<(&String, &String)> = self
+            .model_mapping
+            .iter()
+            .filter(|(pattern, _)| {
+                if pattern.ends_with('*') {
+                    let prefix = &pattern[..pattern.len() - 1];
+                    model.starts_with(prefix)
+                } else {
+                    false
+                }
+            })
+            .collect();
+
+        // Sort by pattern length in descending order (longer patterns are more specific)
+        matches.sort_by_key(|(pattern, _)| std::cmp::Reverse(pattern.len()));
+
+        // Return the first matching parser
+        for (_, parser_name) in matches {
+            if let Some(parser) = self.parsers.get(parser_name) {
+                return Some(parser.clone());
+            }
+        }
+
+        // Fall back to default parser if it exists
+        self.parsers.get(&self.default_parser).cloned()
+    }
+
+    /// List all registered parsers
+    pub fn list_parsers(&self) -> Vec<&str> {
+        self.parsers.keys().map(|s| s.as_str()).collect()
+    }
+
+    /// List all model mappings
+    pub fn list_mappings(&self) -> Vec<(&str, &str)> {
+        self.model_mapping
+            .iter()
+            .map(|(k, v)| (k.as_str(), v.as_str()))
+            .collect()
+    }
+
+    /// Register default parsers
+    fn register_default_parsers(&mut self) {
+        // JSON parser - most common format
+        self.register_parser("json", Arc::new(JsonParser::new()));
+
+        // Mistral parser - [TOOL_CALLS] [...] format
+        self.register_parser("mistral", Arc::new(MistralParser::new()));
+
+        // Qwen parser - <tool_call>...</tool_call> format
+        self.register_parser("qwen", Arc::new(QwenParser::new()));
+
+        // Pythonic parser - [func(arg=val)] format
+        self.register_parser("pythonic", Arc::new(PythonicParser::new()));
+
+        // Llama parser - <|python_tag|>{...} or plain JSON format
+        self.register_parser("llama", Arc::new(LlamaParser::new()));
+
+        // DeepSeek V3 parser - Unicode tokens with JSON blocks
+        self.register_parser("deepseek", Arc::new(DeepSeekParser::new()));
+
+        // GLM-4 MoE parser - XML-style key-value format
+        self.register_parser("glm4_moe", Arc::new(Glm4MoeParser::new()));
+
+        // Step3 parser - StepTML XML format
+        self.register_parser("step3", Arc::new(Step3Parser::new()));
+
+        // Kimi K2 parser - Token-based with indexed functions
+        self.register_parser("kimik2", Arc::new(KimiK2Parser::new()));
+
+        // GPT-OSS parser - Channel format
+        self.register_parser("gpt_oss", Arc::new(GptOssParser::new()));
+    }
+
+    /// Register default model mappings
+    fn register_default_mappings(&mut self) {
+        // OpenAI models
+        self.map_model("gpt-4*", "json");
+        self.map_model("gpt-3.5*", "json");
+        self.map_model("gpt-4o*", "json");
+
+        // Anthropic models
+        self.map_model("claude-*", "json");
+
+        // Mistral models - use Mistral parser
+        self.map_model("mistral-*", "mistral");
+        self.map_model("mixtral-*", "mistral");
+
+        // Qwen models - use Qwen parser
+        self.map_model("qwen*", "qwen");
+        self.map_model("Qwen*", "qwen");
+
+        // Llama models
+        // Llama 4 uses pythonic format
+        self.map_model("llama-4*", "pythonic");
+        self.map_model("meta-llama-4*", "pythonic");
+        // Llama 3.2 uses python_tag format
+        self.map_model("llama-3.2*", "llama");
+        self.map_model("meta-llama-3.2*", "llama");
+        // Other Llama models use JSON
+        self.map_model("llama-*", "json");
+        self.map_model("meta-llama-*", "json");
+
+        // DeepSeek models
+        // DeepSeek V3 uses custom Unicode token format
+        self.map_model("deepseek-v3*", "deepseek");
+        self.map_model("deepseek-ai/DeepSeek-V3*", "deepseek");
+        // DeepSeek V2 uses pythonic format
+        self.map_model("deepseek-*", "pythonic");
+
+        // GLM models
+        // GLM-4 MoE uses XML-style format
+        self.map_model("glm-4-moe*", "glm4_moe");
+        self.map_model("THUDM/glm-4-moe*", "glm4_moe");
+        self.map_model("glm-4.5*", "glm4_moe");
+        // Other GLM models may use JSON
+        self.map_model("glm-*", "json");
+
+        // Step3 models
+        self.map_model("step3*", "step3");
+        self.map_model("Step-3*", "step3");
+
+        // Kimi models
+        self.map_model("kimi-k2*", "kimik2");
+        self.map_model("Kimi-K2*", "kimik2");
+        self.map_model("moonshot*/Kimi-K2*", "kimik2");
+
+        // GPT-OSS models (T4-style)
+        self.map_model("gpt-oss*", "gpt_oss");
+        self.map_model("t4-*", "gpt_oss");
+
+        // Other models default to JSON
+        self.map_model("gemini-*", "json");
+        self.map_model("palm-*", "json");
+        self.map_model("gemma-*", "json");
+    }
+
+    /// Set the default parser
+    pub fn set_default_parser(&mut self, name: impl Into<String>) {
+        self.default_parser = name.into();
+    }
+
+    /// Check if a parser is registered
+    pub fn has_parser(&self, name: &str) -> bool {
+        self.parsers.contains_key(name)
+    }
+}
+
+impl Default for &'static ParserRegistry {
+    fn default() -> Self {
+        ParserRegistry::new()
+    }
+}
diff --git a/sgl-router/src/tool_parser/state.rs b/sgl-router/src/tool_parser/state.rs
new file mode 100644
index 000000000..096a9352f
--- /dev/null
+++ b/sgl-router/src/tool_parser/state.rs
@@ -0,0 +1,181 @@
+use crate::tool_parser::types::{PartialToolCall, ToolCall};
+
+/// Current phase of parsing
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ParsePhase {
+    /// Looking for start of tool call
+    Searching,
+    /// Parsing function name
+    InName,
+    /// Parsing function arguments
+    InArguments,
+    /// Tool call complete
+    Complete,
+}
+
+/// State for streaming parser
+#[derive(Debug, Clone)]
+pub struct ParseState {
+    /// Buffer for accumulating input
+    pub buffer: String,
+    /// Position of last consumed character
+    pub consumed: usize,
+    /// Current partial tool being parsed
+    pub partial_tool: Option<PartialToolCall>,
+    /// Completed tool calls
+    pub completed_tools: Vec<ToolCall>,
+    /// Current parsing phase
+    pub phase: ParsePhase,
+    /// Bracket/brace depth for JSON parsing
+    pub bracket_depth: i32,
+    /// Whether currently inside a string literal
+    pub in_string: bool,
+    /// Whether next character should be escaped
+    pub escape_next: bool,
+    /// Current tool index (for streaming)
+    pub tool_index: usize,
+}
+
+impl ParseState {
+    /// Create a new parse state
+    pub fn new() -> Self {
+        Self {
+            buffer: String::new(),
+            consumed: 0,
+            partial_tool: None,
+            completed_tools: Vec::new(),
+            phase: ParsePhase::Searching,
+            bracket_depth: 0,
+            in_string: false,
+            escape_next: false,
+            tool_index: 0,
+        }
+    }
+
+    /// Reset state for parsing next tool
+    pub fn reset(&mut self) {
+        self.partial_tool = None;
+        self.phase = ParsePhase::Searching;
+        self.bracket_depth = 0;
+        self.in_string = false;
+        self.escape_next = false;
+    }
+
+    /// Process a single character for JSON parsing
+    pub fn process_char(&mut self, ch: char) {
+        // Handle escape sequences
+        if self.escape_next {
+            self.escape_next = false;
+            self.buffer.push(ch);
+            return;
+        }
+
+        if ch == '\\' && self.in_string {
+            self.escape_next = true;
+            self.buffer.push(ch);
+            return;
+        }
+
+        // Track string boundaries
+        if ch == '"' && !self.escape_next {
+            self.in_string = !self.in_string;
+        }
+
+        // Track bracket depth for JSON
+        if !self.in_string {
+            match ch {
+                '{' | '[' => {
+                    self.bracket_depth += 1;
+                }
+                '}' | ']' => {
+                    self.bracket_depth -= 1;
+                    if self.bracket_depth == 0 && self.partial_tool.is_some() {
+                        // Complete tool call found
+                        self.phase = ParsePhase::Complete;
+                    }
+                }
+                _ => {}
+            }
+        }
+
+        self.buffer.push(ch);
+    }
+
+    /// Check if we have a complete JSON object/array
+    pub fn has_complete_json(&self) -> bool {
+        self.bracket_depth == 0 && !self.in_string && !self.buffer.is_empty()
+    }
+
+    /// Extract content from buffer starting at position
+    pub fn extract_from(&self, start: usize) -> &str {
+        if start >= self.buffer.len() {
+            return "";
+        }
+
+        // Find the nearest character boundary at or after start
+        let mut safe_start = start;
+        while safe_start < self.buffer.len() && !self.buffer.is_char_boundary(safe_start) {
+            safe_start += 1;
+        }
+
+        if safe_start < self.buffer.len() {
+            &self.buffer[safe_start..]
+        } else {
+            ""
+        }
+    }
+
+    /// Mark content as consumed up to position
+    pub fn consume_to(&mut self, position: usize) {
+        if position > self.consumed {
+            self.consumed = position;
+        }
+    }
+
+    /// Get unconsumed content
+    pub fn unconsumed(&self) -> &str {
+        if self.consumed >= self.buffer.len() {
+            return "";
+        }
+
+        // Find the nearest character boundary at or after consumed
+        let mut safe_consumed = self.consumed;
+        while safe_consumed < self.buffer.len() && !self.buffer.is_char_boundary(safe_consumed) {
+            safe_consumed += 1;
+        }
+
+        if safe_consumed < self.buffer.len() {
+            &self.buffer[safe_consumed..]
+        } else {
+            ""
+        }
+    }
+
+    /// Clear consumed content from buffer
+    pub fn clear_consumed(&mut self) {
+        if self.consumed > 0 {
+            // Find the nearest character boundary at or before consumed
+            let mut safe_consumed = self.consumed;
+            while safe_consumed > 0 && !self.buffer.is_char_boundary(safe_consumed) {
+                safe_consumed -= 1;
+            }
+
+            if safe_consumed > 0 {
+                self.buffer.drain(..safe_consumed);
+                self.consumed = self.consumed.saturating_sub(safe_consumed);
+            }
+        }
+    }
+
+    /// Add completed tool
+    pub fn add_completed_tool(&mut self, tool: ToolCall) {
+        self.completed_tools.push(tool);
+        self.tool_index += 1;
+    }
+}
+
+impl Default for ParseState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/sgl-router/src/tool_parser/tests.rs b/sgl-router/src/tool_parser/tests.rs
new file mode 100644
index 000000000..27a258ad5
--- /dev/null
+++ b/sgl-router/src/tool_parser/tests.rs
@@ -0,0 +1,886 @@
+use super::*;
+use crate::tool_parser::parsers::JsonParser;
+use crate::tool_parser::partial_json::{
+    compute_diff, find_common_prefix, is_complete_json, PartialJson,
+};
+use crate::tool_parser::traits::ToolParser;
+use crate::tool_parser::types::TokenConfig;
+
+#[test]
+fn test_parse_state_new() {
+    let state = ParseState::new();
+    assert_eq!(state.phase, ParsePhase::Searching);
+    assert_eq!(state.buffer, "");
+    assert_eq!(state.consumed, 0);
+    assert_eq!(state.bracket_depth, 0);
+    assert!(!state.in_string);
+    assert!(!state.escape_next);
+}
+
+#[test]
+fn test_parse_state_process_char() {
+    let mut state = ParseState::new();
+
+    // Test bracket tracking
+    state.process_char('{');
+    assert_eq!(state.bracket_depth, 1);
+
+    state.process_char('}');
+    assert_eq!(state.bracket_depth, 0);
+
+    // Test string tracking
+    state.process_char('"');
+    assert!(state.in_string);
+
+    state.process_char('"');
+    assert!(!state.in_string);
+
+    // Test escape handling
+    state.process_char('"');
+    state.process_char('\\');
+    assert!(state.escape_next);
+
+    state.process_char('"');
+    assert!(!state.escape_next);
+    assert!(state.in_string); // Still in string because quote was escaped
+}
+
+#[test]
+fn test_token_config() {
+    let config = TokenConfig {
+        start_tokens: vec!["<start>".to_string(), "[".to_string()],
+        end_tokens: vec!["</end>".to_string(), "]".to_string()],
+        separator: ", ".to_string(),
+    };
+
+    let pairs: Vec<_> = config.iter_pairs().collect();
+    assert_eq!(pairs.len(), 2);
+    assert_eq!(pairs[0], ("<start>", "</end>"));
+    assert_eq!(pairs[1], ("[", "]"));
+}
+
+#[test]
+fn test_parser_registry() {
+    let registry = ParserRegistry::new();
+
+    // Test has default mappings
+    assert!(!registry.list_mappings().is_empty());
+
+    // Test model pattern matching
+    let mappings = registry.list_mappings();
+    let has_gpt = mappings.iter().any(|(m, _)| m.starts_with("gpt"));
+    assert!(has_gpt);
+}
+
+#[test]
+fn test_parser_registry_pattern_matching() {
+    let mut registry = ParserRegistry::new_for_testing();
+
+    // Test that model mappings work by checking the list
+    registry.map_model("test-model", "json");
+
+    // Verify through list_mappings
+    let mappings = registry.list_mappings();
+    let has_test = mappings
+        .iter()
+        .any(|(m, p)| *m == "test-model" && *p == "json");
+    assert!(has_test);
+}
+
+#[test]
+fn test_tool_call_serialization() {
+    let tool_call = ToolCall {
+        id: "call-123".to_string(),
+        r#type: "function".to_string(),
+        function: FunctionCall {
+            name: "search".to_string(),
+            arguments: r#"{"query": "rust programming"}"#.to_string(),
+        },
+    };
+
+    let json = serde_json::to_string(&tool_call).unwrap();
+    assert!(json.contains("call-123"));
+    assert!(json.contains("search"));
+    assert!(json.contains("rust programming"));
+
+    let parsed: ToolCall = serde_json::from_str(&json).unwrap();
+    assert_eq!(parsed.id, "call-123");
+    assert_eq!(parsed.function.name, "search");
+}
+
+#[test]
+fn test_partial_json_parser() {
+    let parser = PartialJson::default();
+
+    // Test complete JSON
+    let input = r#"{"name": "test", "value": 42}"#;
+    let (value, consumed) = parser.parse_value(input).unwrap();
+    assert_eq!(value["name"], "test");
+    assert_eq!(value["value"], 42);
+    assert_eq!(consumed, input.len());
+
+    // Test incomplete JSON object
+    let input = r#"{"name": "test", "value": "#;
+    let (value, _consumed) = parser.parse_value(input).unwrap();
+    assert_eq!(value["name"], "test");
+    assert!(value["value"].is_null());
+
+    // Test incomplete string
+    let input = r#"{"name": "tes"#;
+    let (value, _consumed) = parser.parse_value(input).unwrap();
+    assert_eq!(value["name"], "tes");
+
+    // Test incomplete array
+    let input = r#"[1, 2, "#;
+    let (value, _consumed) = parser.parse_value(input).unwrap();
+    assert!(value.is_array());
+    assert_eq!(value[0], 1);
+    assert_eq!(value[1], 2);
+}
+
+#[test]
+fn test_partial_json_depth_limit() {
+    // max_depth of 3 allows nesting up to 3 levels
+    // Set allow_incomplete to false to get errors instead of partial results
+    let parser = PartialJson::new(3, false);
+
+    // This should work (simple object)
+    let input = r#"{"a": 1}"#;
+    let result = parser.parse_value(input);
+    assert!(result.is_ok());
+
+    // This should work (nested to depth 3)
+    let input = r#"{"a": {"b": {"c": 1}}}"#;
+    let result = parser.parse_value(input);
+    assert!(result.is_ok());
+
+    // This should fail (nested to depth 4, exceeds limit)
+    let input = r#"{"a": {"b": {"c": {"d": 1}}}}"#;
+    let result = parser.parse_value(input);
+    assert!(result.is_err());
+}
+
+#[test]
+fn test_is_complete_json() {
+    assert!(is_complete_json(r#"{"name": "test"}"#));
+    assert!(is_complete_json(r#"[1, 2, 3]"#));
+    assert!(is_complete_json(r#""string""#));
+    assert!(is_complete_json("42"));
+    assert!(is_complete_json("true"));
+    assert!(is_complete_json("null"));
+
+    assert!(!is_complete_json(r#"{"name": "#));
+    assert!(!is_complete_json(r#"[1, 2, "#));
+    assert!(!is_complete_json(r#""unclosed"#));
+}
+
+#[test]
+fn test_find_common_prefix() {
+    assert_eq!(find_common_prefix("hello", "hello"), 5);
+    assert_eq!(find_common_prefix("hello", "help"), 3);
+    assert_eq!(find_common_prefix("hello", "world"), 0);
+    assert_eq!(find_common_prefix("", "hello"), 0);
+    assert_eq!(find_common_prefix("hello", ""), 0);
+}
+
+#[test]
+fn test_compute_diff() {
+    assert_eq!(compute_diff("hello", "hello world"), " world");
+    assert_eq!(compute_diff("", "hello"), "hello");
+    assert_eq!(compute_diff("hello", "hello"), "");
+    assert_eq!(compute_diff("test", "hello"), "hello");
+}
+
+#[test]
+fn test_stream_result_variants() {
+    // Test Incomplete
+    let result = StreamResult::Incomplete;
+    matches!(result, StreamResult::Incomplete);
+
+    // Test ToolName
+    let result = StreamResult::ToolName {
+        index: 0,
+        name: "test".to_string(),
+    };
+    if let StreamResult::ToolName { index, name } = result {
+        assert_eq!(index, 0);
+        assert_eq!(name, "test");
+    } else {
+        panic!("Expected ToolName variant");
+    }
+
+    // Test ToolComplete
+    let tool = ToolCall {
+        id: "123".to_string(),
+        r#type: "function".to_string(),
+        function: FunctionCall {
+            name: "test".to_string(),
+            arguments: "{}".to_string(),
+        },
+    };
+    let result = StreamResult::ToolComplete(tool.clone());
+    if let StreamResult::ToolComplete(t) = result {
+        assert_eq!(t.id, "123");
+    } else {
+        panic!("Expected ToolComplete variant");
+    }
+}
+
+#[test]
+fn test_partial_tool_call() {
+    let mut partial = PartialToolCall {
+        name: None,
+        arguments_buffer: String::new(),
+        start_position: 0,
+        name_sent: false,
+        streamed_args: String::new(),
+    };
+
+    // Set name
+    partial.name = Some("test_function".to_string());
+    assert_eq!(partial.name.as_ref().unwrap(), "test_function");
+
+    // Append arguments
+    partial.arguments_buffer.push_str(r#"{"key": "value"}"#);
+    assert_eq!(partial.arguments_buffer, r#"{"key": "value"}"#);
+
+    // Update streaming state
+    partial.name_sent = true;
+    partial.streamed_args = r#"{"key": "#.to_string();
+    assert!(partial.name_sent);
+    assert_eq!(partial.streamed_args, r#"{"key": "#);
+}
+
+#[tokio::test]
+async fn test_json_parser_complete_single() {
+    let parser = JsonParser::new();
+
+    // Test single tool call with arguments
+    let input = r#"{"name": "get_weather", "arguments": {"location": "San Francisco", "units": "celsius"}}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+    assert!(result[0].function.arguments.contains("San Francisco"));
+    assert!(result[0].function.arguments.contains("celsius"));
+}
+
+#[tokio::test]
+async fn test_json_parser_complete_array() {
+    let parser = JsonParser::new();
+
+    // Test array of tool calls
+    let input = r#"[
+        {"name": "get_weather", "arguments": {"location": "SF"}},
+        {"name": "get_news", "arguments": {"query": "technology"}}
+    ]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "get_weather");
+    assert_eq!(result[1].function.name, "get_news");
+}
+
+#[tokio::test]
+async fn test_json_parser_with_parameters() {
+    let parser = JsonParser::new();
+
+    // Test with "parameters" instead of "arguments"
+    let input = r#"{"name": "calculate", "parameters": {"x": 10, "y": 20, "operation": "add"}}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "calculate");
+    assert!(result[0].function.arguments.contains("10"));
+    assert!(result[0].function.arguments.contains("20"));
+    assert!(result[0].function.arguments.contains("add"));
+}
+
+#[tokio::test]
+async fn test_json_parser_with_tokens() {
+    // Test with custom wrapper tokens
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["[TOOL_CALLS] [".to_string()],
+        end_tokens: vec!["]".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    let input = r#"[TOOL_CALLS] [{"name": "search", "arguments": {"query": "rust programming"}}]"#;
+    let result = parser.parse_complete(input).await.unwrap();
+
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search");
+}
+
+#[tokio::test]
+async fn test_multiline_json_with_tokens() {
+    // Test that regex with (?s) flag properly handles multi-line JSON
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["<tool>".to_string()],
+        end_tokens: vec!["</tool>".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    // Pretty-printed multi-line JSON
+    let input = r#"<tool>{
+    "name": "get_weather",
+    "arguments": {
+        "location": "San Francisco",
+        "units": "celsius",
+        "include_forecast": true
+    }
+}</tool>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+    assert!(result[0].function.arguments.contains("San Francisco"));
+    assert!(result[0].function.arguments.contains("celsius"));
+    assert!(result[0].function.arguments.contains("true"));
+}
+
+#[tokio::test]
+async fn test_multiline_json_array() {
+    // Test multi-line JSON array without wrapper tokens
+    let parser = JsonParser::new();
+
+    let input = r#"[
+    {
+        "name": "function1",
+        "arguments": {
+            "param1": "value1",
+            "param2": 42
+        }
+    },
+    {
+        "name": "function2",
+        "parameters": {
+            "data": [1, 2, 3],
+            "flag": false
+        }
+    }
+]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "function1");
+    assert_eq!(result[1].function.name, "function2");
+    assert!(result[0].function.arguments.contains("value1"));
+    assert!(result[1].function.arguments.contains("[1,2,3]"));
+}
+
+#[test]
+fn test_json_parser_format_detection() {
+    let parser = JsonParser::new();
+
+    // Should detect valid tool call formats
+    assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#));
+    assert!(parser.detect_format(r#"{"name": "test", "parameters": {"x": 1}}"#));
+    assert!(parser.detect_format(r#"[{"name": "test"}]"#));
+
+    // Should not detect non-tool formats
+    assert!(!parser.detect_format("plain text"));
+    assert!(!parser.detect_format(r#"{"key": "value"}"#));
+    assert!(!parser.detect_format(r#"{"data": {"nested": true}}"#));
+}
+
+#[tokio::test]
+async fn test_json_parser_streaming() {
+    let parser = JsonParser::new();
+    let mut state = ParseState::new();
+
+    // Test with complete JSON
+    let full_json = r#"{"name": "get_weather", "arguments": {"location": "San Francisco"}}"#;
+
+    let result = parser
+        .parse_incremental(full_json, &mut state)
+        .await
+        .unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            assert!(tool.function.arguments.contains("San Francisco"));
+        }
+        _ => panic!("Expected ToolComplete for complete JSON"),
+    }
+}
+
+#[tokio::test]
+async fn test_registry_with_json_parser() {
+    let registry = ParserRegistry::new();
+
+    // JSON parser should be registered by default
+    assert!(registry.has_parser("json"));
+
+    // Should get JSON parser for OpenAI models
+    let parser = registry.get_parser("gpt-4-turbo").unwrap();
+
+    // Test that the parser works
+    let input = r#"{"name": "test", "arguments": {"x": 1}}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+}
+
+#[tokio::test]
+async fn test_json_parser_invalid_input() {
+    let parser = JsonParser::new();
+
+    // Invalid JSON should return empty results
+    assert_eq!(parser.parse_complete("not json").await.unwrap().len(), 0);
+    assert_eq!(parser.parse_complete("{invalid}").await.unwrap().len(), 0);
+    assert_eq!(parser.parse_complete("").await.unwrap().len(), 0);
+}
+
+#[tokio::test]
+async fn test_json_parser_empty_arguments() {
+    let parser = JsonParser::new();
+
+    // Tool call with no arguments
+    let input = r#"{"name": "get_time"}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_time");
+    assert_eq!(result[0].function.arguments, "{}");
+}
+
+#[cfg(test)]
+mod failure_cases {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_malformed_tool_missing_name() {
+        let parser = JsonParser::new();
+
+        // Missing name field
+        let input = r#"{"arguments": {"x": 1}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 0, "Should return empty for tool without name");
+
+        // Empty name
+        let input = r#"{"name": "", "arguments": {"x": 1}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1, "Should accept empty name string");
+        assert_eq!(result[0].function.name, "");
+    }
+
+    #[tokio::test]
+    async fn test_invalid_arguments_json() {
+        let parser = JsonParser::new();
+
+        // Arguments is a string instead of object
+        let input = r#"{"name": "test", "arguments": "not an object"}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        // Should serialize the string as JSON
+        assert!(result[0].function.arguments.contains("not an object"));
+
+        // Arguments is a number
+        let input = r#"{"name": "test", "arguments": 42}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.arguments, "42");
+
+        // Arguments is null
+        let input = r#"{"name": "test", "arguments": null}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.arguments, "null");
+    }
+
+    #[tokio::test]
+    async fn test_broken_wrapper_tokens() {
+        let parser = JsonParser::with_config(TokenConfig {
+            start_tokens: vec!["<tool>".to_string()],
+            end_tokens: vec!["</tool>".to_string()],
+            separator: ", ".to_string(),
+        });
+
+        // Missing end token
+        let input = r#"<tool>{"name": "test", "arguments": {}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(
+            result.len(),
+            0,
+            "Should fail to parse without complete wrapper"
+        );
+
+        // Missing start token - parser looks for complete wrapper, so this won't parse
+        let input = r#"{"name": "test", "arguments": {}}</tool>"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(
+            result.len(),
+            0,
+            "Should not parse JSON with incomplete wrapper"
+        );
+
+        // Mismatched tokens
+        let input = r#"<tool>{"name": "test", "arguments": {}}</wrong>"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 0, "Should fail with mismatched tokens");
+    }
+
+    #[tokio::test]
+    async fn test_invalid_json_structures() {
+        let parser = JsonParser::new();
+
+        // Trailing comma
+        let input = r#"{"name": "test", "arguments": {"x": 1,}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 0, "Should reject JSON with trailing comma");
+
+        // Missing quotes on keys
+        let input = r#"{name: "test", arguments: {}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 0, "Should reject invalid JSON syntax");
+
+        // Unclosed object
+        let input = r#"{"name": "test", "arguments": {"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 0, "Should reject incomplete JSON");
+    }
+}
+
+#[cfg(test)]
+mod edge_cases {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_unicode_in_names_and_arguments() {
+        let parser = JsonParser::new();
+
+        // Unicode in function name
+        let input = r#"{"name": "获取天气", "arguments": {"location": "北京"}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "获取天气");
+        assert!(result[0].function.arguments.contains("北京"));
+
+        // Emoji in arguments
+        let input = r#"{"name": "send_message", "arguments": {"text": "Hello 👋 World 🌍"}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("👋"));
+        assert!(result[0].function.arguments.contains("🌍"));
+    }
+
+    #[tokio::test]
+    async fn test_escaped_characters() {
+        let parser = JsonParser::new();
+
+        // Escaped quotes in arguments
+        let input = r#"{"name": "echo", "arguments": {"text": "He said \"hello\""}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains(r#"\"hello\""#));
+
+        // Escaped backslashes
+        let input = r#"{"name": "path", "arguments": {"dir": "C:\\Users\\test"}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("\\\\"));
+
+        // Newlines and tabs
+        let input = r#"{"name": "format", "arguments": {"text": "line1\nline2\ttabbed"}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("\\n"));
+        assert!(result[0].function.arguments.contains("\\t"));
+    }
+
+    #[tokio::test]
+    async fn test_very_large_payloads() {
+        let parser = JsonParser::new();
+
+        // Large arguments object
+        let mut large_args = r#"{"name": "process", "arguments": {"#.to_string();
+        for i in 0..1000 {
+            large_args.push_str(&format!(r#""field_{}": "value_{}","#, i, i));
+        }
+        large_args.push_str(r#""final": "value"}}"#);
+
+        let result = parser.parse_complete(&large_args).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "process");
+        assert!(result[0].function.arguments.contains("field_999"));
+
+        // Large array of tool calls
+        let mut large_array = "[".to_string();
+        for i in 0..100 {
+            if i > 0 {
+                large_array.push(',');
+            }
+            large_array.push_str(&format!(r#"{{"name": "func_{}", "arguments": {{}}}}"#, i));
+        }
+        large_array.push(']');
+
+        let result = parser.parse_complete(&large_array).await.unwrap();
+        assert_eq!(result.len(), 100);
+        assert_eq!(result[99].function.name, "func_99");
+    }
+
+    #[tokio::test]
+    async fn test_mixed_array_tools_and_non_tools() {
+        let parser = JsonParser::new();
+
+        // Array with both tool calls and non-tool objects
+        let input = r#"[
+            {"name": "tool1", "arguments": {}},
+            {"not_a_tool": "just_data"},
+            {"name": "tool2", "parameters": {"x": 1}},
+            {"key": "value", "another": "field"}
+        ]"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 2, "Should only parse valid tool calls");
+        assert_eq!(result[0].function.name, "tool1");
+        assert_eq!(result[1].function.name, "tool2");
+    }
+
+    #[tokio::test]
+    async fn test_duplicate_keys_in_json() {
+        let parser = JsonParser::new();
+
+        // JSON with duplicate keys (last one wins in most parsers)
+        let input = r#"{"name": "first", "name": "second", "arguments": {"x": 1, "x": 2}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(
+            result[0].function.name, "second",
+            "Last duplicate key should win"
+        );
+        assert!(
+            result[0].function.arguments.contains("2"),
+            "Last duplicate value should win"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_null_values_in_arguments() {
+        let parser = JsonParser::new();
+
+        // Null values in arguments
+        let input = r#"{"name": "test", "arguments": {"required": "value", "optional": null}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("null"));
+
+        // Array with null
+        let input = r#"{"name": "test", "arguments": {"items": [1, null, "three"]}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("null"));
+    }
+
+    #[tokio::test]
+    async fn test_multiple_token_pairs_with_conflicts() {
+        // Test with overlapping token patterns
+        let parser = JsonParser::with_config(TokenConfig {
+            start_tokens: vec!["<<".to_string(), "<tool>".to_string()],
+            end_tokens: vec![">>".to_string(), "</tool>".to_string()],
+            separator: ", ".to_string(),
+        });
+
+        // First pattern
+        let input = r#"<<{"name": "test1", "arguments": {}}>>"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test1");
+
+        // Second pattern
+        let input = r#"<tool>{"name": "test2", "arguments": {}}</tool>"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test2");
+
+        // Nested patterns (should use first match)
+        let input = r#"<<tool>{"name": "test3", "arguments": {}}</tool>>"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        // This is tricky - depends on regex behavior
+        // The parser should handle this gracefully
+        assert!(result.len() <= 1, "Should not parse multiple times");
+    }
+
+    #[tokio::test]
+    async fn test_streaming_with_partial_chunks() {
+        let parser = JsonParser::new();
+
+        // Test 1: Very incomplete JSON (just opening brace) should return Incomplete
+        let mut state1 = ParseState::new();
+        let partial = r#"{"#;
+        let result = parser
+            .parse_incremental(partial, &mut state1)
+            .await
+            .unwrap();
+        assert!(
+            matches!(result, StreamResult::Incomplete),
+            "Should return Incomplete for just opening brace"
+        );
+
+        // Test 2: Complete JSON should return ToolComplete
+        let mut state2 = ParseState::new();
+        let complete = r#"{"name": "get_weather", "arguments": {"location": "SF"}}"#;
+        let result = parser
+            .parse_incremental(complete, &mut state2)
+            .await
+            .unwrap();
+
+        match result {
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "get_weather");
+                let args: serde_json::Value =
+                    serde_json::from_str(&tool.function.arguments).unwrap();
+                assert_eq!(args["location"], "SF");
+            }
+            _ => panic!("Expected ToolComplete for complete JSON"),
+        }
+
+        // Test 3: Partial JSON with name
+        // The PartialJson parser can complete partial JSON by filling in missing values
+        let mut state3 = ParseState::new();
+        let partial_with_name = r#"{"name": "test", "argum"#;
+        let result = parser
+            .parse_incremental(partial_with_name, &mut state3)
+            .await
+            .unwrap();
+
+        match result {
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "test");
+                // Arguments will be empty object since "argum" is incomplete
+                assert_eq!(tool.function.arguments, "{}");
+            }
+            StreamResult::ToolName { name, .. } => {
+                assert_eq!(name, "test");
+            }
+            StreamResult::Incomplete => {
+                // Also acceptable if parser decides to wait
+            }
+            _ => panic!("Unexpected result for partial JSON with name"),
+        }
+    }
+
+    #[tokio::test]
+    async fn test_special_json_values() {
+        let parser = JsonParser::new();
+
+        // Boolean values
+        let input = r#"{"name": "toggle", "arguments": {"enabled": true, "disabled": false}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("true"));
+        assert!(result[0].function.arguments.contains("false"));
+
+        // Numbers (including float and negative)
+        let input = r#"{"name": "calc", "arguments": {"int": 42, "float": 3.14, "negative": -17}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("42"));
+        assert!(result[0].function.arguments.contains("3.14"));
+        assert!(result[0].function.arguments.contains("-17"));
+
+        // Empty arrays and objects
+        let input = r#"{"name": "test", "arguments": {"empty_arr": [], "empty_obj": {}}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("[]"));
+        assert!(result[0].function.arguments.contains("{}"));
+    }
+
+    #[tokio::test]
+    async fn test_function_field_alternative() {
+        let parser = JsonParser::new();
+
+        // Using "function" instead of "name"
+        let input = r#"{"function": "test_func", "arguments": {"x": 1}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test_func");
+
+        // Both "name" and "function" present (name should take precedence)
+        let input = r#"{"name": "primary", "function": "secondary", "arguments": {}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "primary");
+    }
+
+    #[tokio::test]
+    async fn test_whitespace_handling() {
+        let parser = JsonParser::new();
+
+        // Extra whitespace everywhere
+        let input = r#"  {
+            "name"   :   "test"  ,
+            "arguments"   :   {
+                "key"   :   "value"
+            }
+        }  "#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test");
+
+        // Minified JSON (no whitespace)
+        let input = r#"{"name":"compact","arguments":{"a":1,"b":2}}"#;
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "compact");
+    }
+}
+
+#[cfg(test)]
+mod stress_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_deeply_nested_arguments() {
+        let parser = JsonParser::new();
+
+        // Deeply nested structure
+        let input = r#"{
+            "name": "nested",
+            "arguments": {
+                "level1": {
+                    "level2": {
+                        "level3": {
+                            "level4": {
+                                "level5": {
+                                    "value": "deep"
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }"#;
+
+        let result = parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert!(result[0].function.arguments.contains("deep"));
+    }
+
+    #[tokio::test]
+    async fn test_concurrent_parser_usage() {
+        // Test that parser can be used concurrently
+        let parser = std::sync::Arc::new(JsonParser::new());
+
+        let mut handles = vec![];
+
+        for i in 0..10 {
+            let parser_clone = parser.clone();
+            let handle = tokio::spawn(async move {
+                let input = format!(r#"{{"name": "func_{}", "arguments": {{}}}}"#, i);
+                let result = parser_clone.parse_complete(&input).await.unwrap();
+                assert_eq!(result.len(), 1);
+                assert_eq!(result[0].function.name, format!("func_{}", i));
+            });
+            handles.push(handle);
+        }
+
+        for handle in handles {
+            handle.await.unwrap();
+        }
+    }
+}
diff --git a/sgl-router/src/tool_parser/traits.rs b/sgl-router/src/tool_parser/traits.rs
new file mode 100644
index 000000000..19263688d
--- /dev/null
+++ b/sgl-router/src/tool_parser/traits.rs
@@ -0,0 +1,35 @@
+use crate::tool_parser::{
+    errors::ToolParserResult,
+    state::ParseState,
+    types::{StreamResult, ToolCall},
+};
+use async_trait::async_trait;
+
+/// Core trait for all tool parsers
+#[async_trait]
+pub trait ToolParser: Send + Sync {
+    /// Parse complete tool calls from final output
+    async fn parse_complete(&self, output: &str) -> ToolParserResult<Vec<ToolCall>>;
+
+    /// Parse tool calls from model output (streaming)
+    async fn parse_incremental(
+        &self,
+        chunk: &str,
+        state: &mut ParseState,
+    ) -> ToolParserResult<StreamResult>;
+
+    /// Check if text contains tool calls in this parser's format
+    fn detect_format(&self, text: &str) -> bool;
+}
+
+/// Trait for partial JSON parsing
+pub trait PartialJsonParser: Send + Sync {
+    /// Parse potentially incomplete JSON
+    fn parse(&self, input: &str) -> ToolParserResult<(serde_json::Value, usize)>;
+
+    /// Check if JSON is complete
+    fn is_complete(&self, input: &str) -> bool;
+
+    /// Get the maximum parsing depth
+    fn max_depth(&self) -> usize;
+}
diff --git a/sgl-router/src/tool_parser/types.rs b/sgl-router/src/tool_parser/types.rs
new file mode 100644
index 000000000..0638d1c2a
--- /dev/null
+++ b/sgl-router/src/tool_parser/types.rs
@@ -0,0 +1,73 @@
+use serde::{Deserialize, Serialize};
+
+/// Parsed tool call from model output (OpenAI format)
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct ToolCall {
+    /// Unique identifier for the tool call
+    pub id: String,
+    /// Type of tool call (currently always "function")
+    #[serde(rename = "type")]
+    pub r#type: String,
+    /// Function call details
+    pub function: FunctionCall,
+}
+
+/// Function call within a tool call
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct FunctionCall {
+    /// Name of the function to call
+    pub name: String,
+    /// Arguments as JSON string
+    pub arguments: String,
+}
+
+/// Streaming parse result
+#[derive(Debug, Clone)]
+pub enum StreamResult {
+    /// Need more data to continue parsing
+    Incomplete,
+    /// Found a tool name (for streaming)
+    ToolName { index: usize, name: String },
+    /// Found incremental arguments (for streaming)
+    ToolArguments { index: usize, arguments: String },
+    /// Completed parsing a tool
+    ToolComplete(ToolCall),
+    /// Normal text (not part of tool call)
+    NormalText(String),
+}
+
+/// Token configuration for parsing
+#[derive(Debug, Clone)]
+pub struct TokenConfig {
+    /// Start tokens for tool calls
+    pub start_tokens: Vec<String>,
+    /// End tokens for tool calls
+    pub end_tokens: Vec<String>,
+    /// Separator between multiple tool calls
+    pub separator: String,
+}
+
+impl TokenConfig {
+    /// Iterate over start/end token pairs
+    pub fn iter_pairs(&self) -> impl Iterator<Item = (&str, &str)> {
+        self.start_tokens
+            .iter()
+            .zip(self.end_tokens.iter())
+            .map(|(s, e)| (s.as_str(), e.as_str()))
+    }
+}
+
+/// Simple partial tool call for streaming
+#[derive(Debug, Clone)]
+pub struct PartialToolCall {
+    /// Tool name (if parsed)
+    pub name: Option<String>,
+    /// Buffer for accumulating arguments
+    pub arguments_buffer: String,
+    /// Start position in the input buffer
+    pub start_position: usize,
+    /// Whether the name has been sent (for streaming)
+    pub name_sent: bool,
+    /// Arguments already streamed
+    pub streamed_args: String,
+}
diff --git a/sgl-router/src/tree.rs b/sgl-router/src/tree.rs
new file mode 100644
index 000000000..ea95a6946
--- /dev/null
+++ b/sgl-router/src/tree.rs
@@ -0,0 +1,1478 @@
+use dashmap::mapref::entry::Entry;
+use dashmap::DashMap;
+use tracing::info;
+
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
+use std::collections::HashMap;
+use std::collections::VecDeque;
+use std::sync::Arc;
+use std::sync::RwLock;
+
+use std::time::Duration;
+use std::time::{SystemTime, UNIX_EPOCH};
+
+type NodeRef = Arc<Node>;
+
+#[derive(Debug)]
+struct Node {
+    children: DashMap<char, NodeRef>,
+    text: RwLock<String>,
+    tenant_last_access_time: DashMap<String, u128>,
+    parent: RwLock<Option<NodeRef>>,
+}
+
+#[derive(Debug)]
+pub struct Tree {
+    root: NodeRef,
+    pub tenant_char_count: DashMap<String, usize>,
+}
+
+// For the heap
+
+struct EvictionEntry {
+    timestamp: u128,
+    tenant: String,
+    node: NodeRef,
+}
+
+impl Eq for EvictionEntry {}
+
+#[allow(clippy::non_canonical_partial_ord_impl)]
+impl PartialOrd for EvictionEntry {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.timestamp.cmp(&other.timestamp))
+    }
+}
+
+impl Ord for EvictionEntry {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.timestamp.cmp(&other.timestamp)
+    }
+}
+
+impl PartialEq for EvictionEntry {
+    fn eq(&self, other: &Self) -> bool {
+        self.timestamp == other.timestamp
+    }
+}
+
+// For char operations
+// Note that in rust, `.len()` or slice is operated on the "byte" level. It causes issues for UTF-8 characters because one character might use multiple bytes.
+// https://en.wikipedia.org/wiki/UTF-8
+
+fn shared_prefix_count(a: &str, b: &str) -> usize {
+    let mut i = 0;
+    let mut a_iter = a.chars();
+    let mut b_iter = b.chars();
+
+    loop {
+        match (a_iter.next(), b_iter.next()) {
+            (Some(a_char), Some(b_char)) if a_char == b_char => {
+                i += 1;
+            }
+            _ => break,
+        }
+    }
+
+    i
+}
+
+fn slice_by_chars(s: &str, start: usize, end: usize) -> String {
+    s.chars().skip(start).take(end - start).collect()
+}
+
+impl Default for Tree {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Tree {
+    /*
+    Thread-safe multi tenant radix tree
+
+    1. Storing data for multiple tenants (the overlap of multiple radix tree)
+    2. Node-level lock to enable concurrent acesss on nodes
+    3. Leaf LRU eviction based on tenant access time
+    */
+
+    pub fn new() -> Self {
+        Tree {
+            root: Arc::new(Node {
+                children: DashMap::new(),
+                text: RwLock::new("".to_string()),
+                tenant_last_access_time: DashMap::new(),
+                parent: RwLock::new(None),
+            }),
+            tenant_char_count: DashMap::new(),
+        }
+    }
+
+    pub fn insert(&self, text: &str, tenant: &str) {
+        // Insert text into tree with given tenant
+
+        let mut curr = Arc::clone(&self.root);
+        let mut curr_idx = 0;
+
+        let timestamp_ms = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        curr.tenant_last_access_time
+            .insert(tenant.to_string(), timestamp_ms);
+
+        self.tenant_char_count
+            .entry(tenant.to_string())
+            .or_insert(0);
+
+        let mut prev = Arc::clone(&self.root);
+
+        let text_count = text.chars().count();
+
+        while curr_idx < text_count {
+            let first_char = text.chars().nth(curr_idx).unwrap();
+
+            curr = prev;
+
+            // dashmap.entry locks the entry until the op is done
+            // if using contains_key + insert, there will be an issue that
+            // 1. "apple" and "app" entered at the same time
+            // 2. and get inserted to the dashmap concurrently, so only one is inserted
+
+            match curr.children.entry(first_char) {
+                Entry::Vacant(entry) => {
+                    /*
+                       no matched
+                       [curr]
+                       becomes
+                       [curr] => [new node]
+                    */
+
+                    let curr_text = slice_by_chars(text, curr_idx, text_count);
+                    let curr_text_count = curr_text.chars().count();
+                    let new_node = Arc::new(Node {
+                        children: DashMap::new(),
+                        text: RwLock::new(curr_text),
+                        tenant_last_access_time: DashMap::new(),
+                        parent: RwLock::new(Some(Arc::clone(&curr))),
+                    });
+
+                    // Attach tenant to the new node (map is empty here) and increment count once
+                    self.tenant_char_count
+                        .entry(tenant.to_string())
+                        .and_modify(|count| *count += curr_text_count)
+                        .or_insert(curr_text_count);
+                    new_node
+                        .tenant_last_access_time
+                        .insert(tenant.to_string(), timestamp_ms);
+
+                    entry.insert(Arc::clone(&new_node));
+
+                    prev = Arc::clone(&new_node);
+                    curr_idx = text_count;
+                }
+
+                Entry::Occupied(mut entry) => {
+                    // matched
+                    let matched_node = entry.get().clone();
+
+                    let matched_node_text = matched_node.text.read().unwrap().to_owned();
+                    let matched_node_text_count = matched_node_text.chars().count();
+
+                    let curr_text = slice_by_chars(text, curr_idx, text_count);
+                    let shared_count = shared_prefix_count(&matched_node_text, &curr_text);
+
+                    if shared_count < matched_node_text_count {
+                        /*
+                           split the matched node
+                           [curr] -> [matched_node] =>
+                           becomes
+                           [curr] -> [new_node] -> [contracted_matched_node]
+                        */
+
+                        let matched_text = slice_by_chars(&matched_node_text, 0, shared_count);
+                        let contracted_text = slice_by_chars(
+                            &matched_node_text,
+                            shared_count,
+                            matched_node_text_count,
+                        );
+                        let matched_text_count = matched_text.chars().count();
+
+                        let new_node = Arc::new(Node {
+                            text: RwLock::new(matched_text),
+                            children: DashMap::new(),
+                            parent: RwLock::new(Some(Arc::clone(&curr))),
+                            tenant_last_access_time: matched_node.tenant_last_access_time.clone(),
+                        });
+
+                        let first_new_char = contracted_text.chars().nth(0).unwrap();
+                        new_node
+                            .children
+                            .insert(first_new_char, Arc::clone(&matched_node));
+
+                        entry.insert(Arc::clone(&new_node));
+
+                        *matched_node.text.write().unwrap() = contracted_text;
+                        *matched_node.parent.write().unwrap() = Some(Arc::clone(&new_node));
+
+                        prev = Arc::clone(&new_node);
+
+                        // Atomically attach tenant to the new split node and increment count once
+                        match prev.tenant_last_access_time.entry(tenant.to_string()) {
+                            Entry::Vacant(v) => {
+                                self.tenant_char_count
+                                    .entry(tenant.to_string())
+                                    .and_modify(|count| *count += matched_text_count)
+                                    .or_insert(matched_text_count);
+                                v.insert(timestamp_ms);
+                            }
+                            Entry::Occupied(mut o) => {
+                                o.insert(timestamp_ms);
+                            }
+                        }
+
+                        curr_idx += shared_count;
+                    } else {
+                        // move to next node
+                        prev = Arc::clone(&matched_node);
+
+                        // Atomically attach tenant to existing node and increment count once
+                        match prev.tenant_last_access_time.entry(tenant.to_string()) {
+                            Entry::Vacant(v) => {
+                                self.tenant_char_count
+                                    .entry(tenant.to_string())
+                                    .and_modify(|count| *count += matched_node_text_count)
+                                    .or_insert(matched_node_text_count);
+                                v.insert(timestamp_ms);
+                            }
+                            Entry::Occupied(mut o) => {
+                                o.insert(timestamp_ms);
+                            }
+                        }
+                        curr_idx += shared_count;
+                    }
+                }
+            }
+        }
+    }
+
+    #[allow(unused_assignments)]
+    pub fn prefix_match(&self, text: &str) -> (String, String) {
+        let mut curr = Arc::clone(&self.root);
+        let mut curr_idx = 0;
+
+        let mut prev = Arc::clone(&self.root);
+        let text_count = text.chars().count();
+
+        while curr_idx < text_count {
+            let first_char = text.chars().nth(curr_idx).unwrap();
+            let curr_text = slice_by_chars(text, curr_idx, text_count);
+
+            curr = prev.clone();
+
+            if let Some(entry) = curr.children.get(&first_char) {
+                let matched_node = entry.value().clone();
+                let matched_text_guard = matched_node.text.read().unwrap();
+                let shared_count = shared_prefix_count(&matched_text_guard, &curr_text);
+                let matched_node_text_count = matched_text_guard.chars().count();
+                drop(matched_text_guard);
+
+                if shared_count == matched_node_text_count {
+                    // Full match with current node's text, continue to next node
+                    curr_idx += shared_count;
+                    prev = Arc::clone(&matched_node);
+                } else {
+                    // Partial match, stop here
+                    curr_idx += shared_count;
+                    prev = Arc::clone(&matched_node);
+                    break;
+                }
+            } else {
+                // No match found, stop here
+                break;
+            }
+        }
+
+        curr = prev.clone();
+
+        // Select the first tenant (key in the map)
+        let tenant = curr
+            .tenant_last_access_time
+            .iter()
+            .next()
+            .map(|kv| kv.key().to_owned())
+            .unwrap_or("empty".to_string());
+
+        // Traverse from the curr node to the root and update the timestamp
+
+        let timestamp_ms = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        if !tenant.eq("empty") {
+            let mut current_node = Some(curr);
+            while let Some(node) = current_node {
+                node.tenant_last_access_time
+                    .insert(tenant.clone(), timestamp_ms);
+                current_node = node.parent.read().unwrap().clone();
+            }
+        }
+
+        let ret_text = slice_by_chars(text, 0, curr_idx);
+        (ret_text, tenant)
+    }
+
+    #[allow(unused_assignments)]
+    pub fn prefix_match_tenant(&self, text: &str, tenant: &str) -> String {
+        let mut curr = Arc::clone(&self.root);
+        let mut curr_idx = 0;
+
+        let mut prev = Arc::clone(&self.root);
+        let text_count = text.chars().count();
+
+        while curr_idx < text_count {
+            let first_char = text.chars().nth(curr_idx).unwrap();
+            let curr_text = slice_by_chars(text, curr_idx, text_count);
+
+            curr = prev.clone();
+
+            if let Some(entry) = curr.children.get(&first_char) {
+                let matched_node = entry.value().clone();
+
+                // Only continue matching if this node belongs to the specified tenant
+                if !matched_node.tenant_last_access_time.contains_key(tenant) {
+                    break;
+                }
+
+                let matched_text_guard = matched_node.text.read().unwrap();
+                let shared_count = shared_prefix_count(&matched_text_guard, &curr_text);
+                let matched_node_text_count = matched_text_guard.chars().count();
+                drop(matched_text_guard);
+
+                if shared_count == matched_node_text_count {
+                    // Full match with current node's text, continue to next node
+                    curr_idx += shared_count;
+                    prev = Arc::clone(&matched_node);
+                } else {
+                    // Partial match, stop here
+                    curr_idx += shared_count;
+                    prev = Arc::clone(&matched_node);
+                    break;
+                }
+            } else {
+                // No match found, stop here
+                break;
+            }
+        }
+
+        curr = prev.clone();
+
+        // Only update timestamp if we found a match for the specified tenant
+        if curr.tenant_last_access_time.contains_key(tenant) {
+            let timestamp_ms = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_millis();
+
+            let mut current_node = Some(curr);
+            while let Some(node) = current_node {
+                node.tenant_last_access_time
+                    .insert(tenant.to_string(), timestamp_ms);
+                current_node = node.parent.read().unwrap().clone();
+            }
+        }
+
+        slice_by_chars(text, 0, curr_idx)
+    }
+
+    fn leaf_of(node: &NodeRef) -> Vec<String> {
+        /*
+        Return the list of tenants if it's a leaf for the tenant
+         */
+        let mut candidates: HashMap<String, bool> = node
+            .tenant_last_access_time
+            .iter()
+            .map(|entry| (entry.key().clone(), true))
+            .collect();
+
+        for child in node.children.iter() {
+            for tenant in child.value().tenant_last_access_time.iter() {
+                candidates.insert(tenant.key().clone(), false);
+            }
+        }
+
+        candidates
+            .into_iter()
+            .filter(|(_, is_leaf)| *is_leaf)
+            .map(|(tenant, _)| tenant)
+            .collect()
+    }
+
+    pub fn evict_tenant_by_size(&self, max_size: usize) {
+        // Calculate used size and collect leaves
+        let mut stack = vec![Arc::clone(&self.root)];
+        let mut pq = BinaryHeap::new();
+
+        while let Some(curr) = stack.pop() {
+            for child in curr.children.iter() {
+                stack.push(Arc::clone(child.value()));
+            }
+
+            // Add leaves to priority queue
+            for tenant in Tree::leaf_of(&curr) {
+                if let Some(timestamp) = curr.tenant_last_access_time.get(&tenant) {
+                    pq.push(Reverse(EvictionEntry {
+                        timestamp: *timestamp,
+                        tenant: tenant.clone(),
+                        node: Arc::clone(&curr),
+                    }));
+                }
+            }
+        }
+
+        info!("Before eviction - Used size per tenant:");
+        for entry in self.tenant_char_count.iter() {
+            info!("Tenant: {}, Size: {}", entry.key(), entry.value());
+        }
+
+        // Process eviction
+        while let Some(Reverse(entry)) = pq.pop() {
+            let EvictionEntry { tenant, node, .. } = entry;
+
+            if let Some(used_size) = self.tenant_char_count.get(&tenant) {
+                if *used_size <= max_size {
+                    continue;
+                }
+            }
+
+            // Decrement when removing tenant from node
+            if node.tenant_last_access_time.contains_key(&tenant) {
+                let node_len = node.text.read().unwrap().chars().count();
+                self.tenant_char_count
+                    .entry(tenant.clone())
+                    .and_modify(|count| {
+                        *count = count.saturating_sub(node_len);
+                    });
+            }
+
+            // Remove tenant from node
+            node.tenant_last_access_time.remove(&tenant);
+
+            // Remove empty nodes
+            if node.children.is_empty() && node.tenant_last_access_time.is_empty() {
+                if let Some(parent) = node.parent.read().unwrap().as_ref() {
+                    let text_guard = node.text.read().unwrap();
+                    if let Some(first_char) = text_guard.chars().next() {
+                        parent.children.remove(&first_char);
+                    }
+                }
+            }
+
+            // Add parent to queue if it becomes a leaf
+            if let Some(parent) = node.parent.read().unwrap().as_ref() {
+                if Tree::leaf_of(parent).contains(&tenant) {
+                    if let Some(timestamp) = parent.tenant_last_access_time.get(&tenant) {
+                        pq.push(Reverse(EvictionEntry {
+                            timestamp: *timestamp,
+                            tenant: tenant.clone(),
+                            node: Arc::clone(parent),
+                        }));
+                    }
+                }
+            };
+        }
+
+        info!("After eviction - Used size per tenant:");
+        for entry in self.tenant_char_count.iter() {
+            info!("Tenant: {}, Size: {}", entry.key(), entry.value());
+        }
+    }
+
+    pub fn remove_tenant(&self, tenant: &str) {
+        // 1. Find all the leaves for the tenant
+        let mut stack = vec![Arc::clone(&self.root)];
+        let mut queue = VecDeque::new();
+
+        while let Some(curr) = stack.pop() {
+            for child in curr.children.iter() {
+                stack.push(Arc::clone(child.value()));
+            }
+
+            if Tree::leaf_of(&curr).contains(&tenant.to_string()) {
+                queue.push_back(Arc::clone(&curr));
+            }
+        }
+
+        // 2. Start from the leaves and traverse up to the root, removing the tenant from each node
+        while let Some(curr) = queue.pop_front() {
+            // remove tenant from node
+            curr.tenant_last_access_time.remove(&tenant.to_string());
+
+            // remove empty nodes
+            if curr.children.is_empty() && curr.tenant_last_access_time.is_empty() {
+                if let Some(parent) = curr.parent.read().unwrap().as_ref() {
+                    let text_guard = curr.text.read().unwrap();
+                    if let Some(first_char) = text_guard.chars().next() {
+                        parent.children.remove(&first_char);
+                    }
+                }
+            }
+
+            // add parent to queue if it becomes a leaf
+            if let Some(parent) = curr.parent.read().unwrap().as_ref() {
+                if Tree::leaf_of(parent).contains(&tenant.to_string()) {
+                    queue.push_back(Arc::clone(parent));
+                }
+            }
+        }
+
+        // 3. Remove the tenant from the tenant_char_count map
+        self.tenant_char_count.remove(&tenant.to_string());
+    }
+
+    pub fn get_tenant_char_count(&self) -> HashMap<String, usize> {
+        self.tenant_char_count
+            .iter()
+            .map(|entry| (entry.key().clone(), *entry.value()))
+            .collect()
+    }
+
+    pub fn get_smallest_tenant(&self) -> String {
+        // Return a placeholder if there are no tenants
+        if self.tenant_char_count.is_empty() {
+            return "empty".to_string();
+        }
+
+        // Find the tenant with minimum char count
+        let mut min_tenant = None;
+        let mut min_count = usize::MAX;
+
+        for entry in self.tenant_char_count.iter() {
+            let tenant = entry.key();
+            let count = *entry.value();
+
+            if count < min_count {
+                min_count = count;
+                min_tenant = Some(tenant.clone());
+            }
+        }
+
+        // Return the found tenant or "empty" if somehow none was found
+        min_tenant.unwrap_or_else(|| "empty".to_string())
+    }
+
+    pub fn get_used_size_per_tenant(&self) -> HashMap<String, usize> {
+        // perform a DFS to traverse all nodes and calculate the total size used by each tenant
+
+        let mut used_size_per_tenant: HashMap<String, usize> = HashMap::new();
+        let mut stack = vec![Arc::clone(&self.root)];
+
+        while let Some(curr) = stack.pop() {
+            let text_count = curr.text.read().unwrap().chars().count();
+
+            for tenant in curr.tenant_last_access_time.iter() {
+                let size = used_size_per_tenant
+                    .entry(tenant.key().clone())
+                    .or_insert(0);
+                *size += text_count;
+            }
+
+            for child in curr.children.iter() {
+                stack.push(Arc::clone(child.value()));
+            }
+        }
+
+        used_size_per_tenant
+    }
+
+    fn node_to_string(node: &NodeRef, prefix: &str, is_last: bool) -> String {
+        let mut result = String::new();
+
+        // Add prefix and branch character
+        result.push_str(prefix);
+        result.push_str(if is_last { "└── " } else { "├── " });
+
+        // Add node text
+        let node_text = node.text.read().unwrap();
+        result.push_str(&format!("'{}' [", node_text));
+
+        // Add tenant information with timestamps
+        let mut tenant_info = Vec::new();
+        for entry in node.tenant_last_access_time.iter() {
+            let tenant_id = entry.key();
+            let timestamp_ms = entry.value();
+
+            // Convert milliseconds to seconds and remaining milliseconds
+            let seconds = (timestamp_ms / 1000) as u64;
+            let millis = (timestamp_ms % 1000) as u32;
+
+            // Create SystemTime from Unix timestamp
+            let system_time = UNIX_EPOCH + Duration::from_secs(seconds);
+
+            // Format time as HH:MM:SS.mmm
+            let datetime = system_time.duration_since(UNIX_EPOCH).unwrap();
+            let hours = (datetime.as_secs() % 86400) / 3600;
+            let minutes = (datetime.as_secs() % 3600) / 60;
+            let seconds = datetime.as_secs() % 60;
+
+            tenant_info.push(format!(
+                "{} | {:02}:{:02}:{:02}.{:03}",
+                tenant_id, hours, minutes, seconds, millis
+            ));
+        }
+
+        result.push_str(&tenant_info.join(", "));
+        result.push_str("]\n");
+
+        // Process children
+        let children: Vec<_> = node.children.iter().collect();
+        let child_count = children.len();
+
+        for (i, entry) in children.iter().enumerate() {
+            let is_last_child = i == child_count - 1;
+            let new_prefix = format!("{}{}", prefix, if is_last { "    " } else { "│   " });
+
+            result.push_str(&Tree::node_to_string(
+                entry.value(),
+                &new_prefix,
+                is_last_child,
+            ));
+        }
+
+        result
+    }
+
+    pub fn pretty_print(&self) {
+        if self.root.children.is_empty() {
+            return;
+        }
+
+        let mut result = String::new();
+        let children: Vec<_> = self.root.children.iter().collect();
+        let child_count = children.len();
+
+        for (i, entry) in children.iter().enumerate() {
+            let is_last = i == child_count - 1;
+            result.push_str(&Tree::node_to_string(entry.value(), "", is_last));
+        }
+
+        println!("{result}");
+    }
+}
+
+//  Unit tests
+#[cfg(test)]
+mod tests {
+    use rand::distr::Alphanumeric;
+    use rand::distr::SampleString;
+    use rand::rng as thread_rng;
+    use rand::Rng;
+    use std::thread;
+    use std::time::Instant;
+
+    use super::*;
+
+    #[test]
+    fn test_get_smallest_tenant() {
+        let tree = Tree::new();
+
+        // Test empty tree
+        assert_eq!(tree.get_smallest_tenant(), "empty");
+
+        // Insert data for tenant1 - "ap" + "icot" = 6 chars
+        tree.insert("ap", "tenant1");
+        tree.insert("icot", "tenant1");
+
+        // Insert data for tenant2 - "cat" = 3 chars
+        tree.insert("cat", "tenant2");
+
+        // Test - tenant2 should be smallest with 3 chars vs 6 chars
+        assert_eq!(
+            tree.get_smallest_tenant(),
+            "tenant2",
+            "Expected tenant2 to be smallest with 3 characters."
+        );
+
+        // Insert overlapping data for tenant3 and tenant4 to test equal counts
+        // tenant3: "do" = 2 chars
+        // tenant4: "hi" = 2 chars
+        tree.insert("do", "tenant3");
+        tree.insert("hi", "tenant4");
+
+        // Test - should return either tenant3 or tenant4 (both have 2 chars)
+        let smallest = tree.get_smallest_tenant();
+        assert!(
+            smallest == "tenant3" || smallest == "tenant4",
+            "Expected either tenant3 or tenant4 (both have 2 characters), got {}",
+            smallest
+        );
+
+        // Add more text to tenant4 to make it larger
+        tree.insert("hello", "tenant4"); // Now tenant4 has "hi" + "hello" = 6 chars
+
+        // Now tenant3 should be smallest (2 chars vs 6 chars for tenant4)
+        assert_eq!(
+            tree.get_smallest_tenant(),
+            "tenant3",
+            "Expected tenant3 to be smallest with 2 characters"
+        );
+
+        // Test eviction
+        tree.evict_tenant_by_size(3); // This should evict tenants with more than 3 chars
+
+        let post_eviction_smallest = tree.get_smallest_tenant();
+        println!("Smallest tenant after eviction: {}", post_eviction_smallest);
+    }
+
+    #[test]
+    fn test_tenant_char_count() {
+        let tree = Tree::new();
+
+        // Phase 1: Initial insertions
+        tree.insert("apple", "tenant1");
+        tree.insert("apricot", "tenant1");
+        tree.insert("banana", "tenant1");
+        tree.insert("amplify", "tenant2");
+        tree.insert("application", "tenant2");
+
+        let computed_sizes = tree.get_used_size_per_tenant();
+        let maintained_counts: HashMap<String, usize> = tree
+            .tenant_char_count
+            .iter()
+            .map(|entry| (entry.key().clone(), *entry.value()))
+            .collect();
+
+        println!("Phase 1 - Maintained vs Computed counts:");
+        println!(
+            "Maintained: {:?}\nComputed: {:?}",
+            maintained_counts, computed_sizes
+        );
+        assert_eq!(
+            maintained_counts, computed_sizes,
+            "Phase 1: Initial insertions"
+        );
+
+        // Phase 2: Additional insertions
+        tree.insert("apartment", "tenant1");
+        tree.insert("appetite", "tenant2");
+        tree.insert("ball", "tenant1");
+        tree.insert("box", "tenant2");
+
+        let computed_sizes = tree.get_used_size_per_tenant();
+        let maintained_counts: HashMap<String, usize> = tree
+            .tenant_char_count
+            .iter()
+            .map(|entry| (entry.key().clone(), *entry.value()))
+            .collect();
+
+        println!("Phase 2 - Maintained vs Computed counts:");
+        println!(
+            "Maintained: {:?}\nComputed: {:?}",
+            maintained_counts, computed_sizes
+        );
+        assert_eq!(
+            maintained_counts, computed_sizes,
+            "Phase 2: Additional insertions"
+        );
+
+        // Phase 3: Overlapping insertions
+        tree.insert("zebra", "tenant1");
+        tree.insert("zebra", "tenant2");
+        tree.insert("zero", "tenant1");
+        tree.insert("zero", "tenant2");
+
+        let computed_sizes = tree.get_used_size_per_tenant();
+        let maintained_counts: HashMap<String, usize> = tree
+            .tenant_char_count
+            .iter()
+            .map(|entry| (entry.key().clone(), *entry.value()))
+            .collect();
+
+        println!("Phase 3 - Maintained vs Computed counts:");
+        println!(
+            "Maintained: {:?}\nComputed: {:?}",
+            maintained_counts, computed_sizes
+        );
+        assert_eq!(
+            maintained_counts, computed_sizes,
+            "Phase 3: Overlapping insertions"
+        );
+
+        // Phase 4: Eviction test
+        tree.evict_tenant_by_size(10);
+
+        let computed_sizes = tree.get_used_size_per_tenant();
+        let maintained_counts: HashMap<String, usize> = tree
+            .tenant_char_count
+            .iter()
+            .map(|entry| (entry.key().clone(), *entry.value()))
+            .collect();
+
+        println!("Phase 4 - Maintained vs Computed counts:");
+        println!(
+            "Maintained: {:?}\nComputed: {:?}",
+            maintained_counts, computed_sizes
+        );
+        assert_eq!(maintained_counts, computed_sizes, "Phase 4: After eviction");
+    }
+
+    fn random_string(len: usize) -> String {
+        Alphanumeric.sample_string(&mut thread_rng(), len)
+    }
+
+    #[test]
+    fn test_cold_start() {
+        let tree = Tree::new();
+
+        let (matched_text, tenant) = tree.prefix_match("hello");
+
+        assert_eq!(matched_text, "");
+        assert_eq!(tenant, "empty");
+    }
+
+    #[test]
+    fn test_exact_match_seq() {
+        let tree = Tree::new();
+        tree.insert("hello", "tenant1");
+        tree.pretty_print();
+        tree.insert("apple", "tenant2");
+        tree.pretty_print();
+        tree.insert("banana", "tenant3");
+        tree.pretty_print();
+
+        let (matched_text, tenant) = tree.prefix_match("hello");
+        assert_eq!(matched_text, "hello");
+        assert_eq!(tenant, "tenant1");
+
+        let (matched_text, tenant) = tree.prefix_match("apple");
+        assert_eq!(matched_text, "apple");
+        assert_eq!(tenant, "tenant2");
+
+        let (matched_text, tenant) = tree.prefix_match("banana");
+        assert_eq!(matched_text, "banana");
+        assert_eq!(tenant, "tenant3");
+    }
+
+    #[test]
+    fn test_exact_match_concurrent() {
+        let tree = Arc::new(Tree::new());
+
+        // spawn 3 threads for insert
+        let tree_clone = Arc::clone(&tree);
+
+        let texts = ["hello", "apple", "banana"];
+        let tenants = ["tenant1", "tenant2", "tenant3"];
+
+        let mut handles = vec![];
+
+        for i in 0..3 {
+            let tree_clone = Arc::clone(&tree_clone);
+            let text = texts[i];
+            let tenant = tenants[i];
+
+            let handle = thread::spawn(move || {
+                tree_clone.insert(text, tenant);
+            });
+
+            handles.push(handle);
+        }
+
+        // wait
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        // spawn 3 threads for match
+        let mut handles = vec![];
+
+        let tree_clone = Arc::clone(&tree);
+
+        for i in 0..3 {
+            let tree_clone = Arc::clone(&tree_clone);
+            let text = texts[i];
+            let tenant = tenants[i];
+
+            let handle = thread::spawn(move || {
+                let (matched_text, matched_tenant) = tree_clone.prefix_match(text);
+                assert_eq!(matched_text, text);
+                assert_eq!(matched_tenant, tenant);
+            });
+
+            handles.push(handle);
+        }
+
+        // wait
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+
+    #[test]
+    fn test_partial_match_concurrent() {
+        let tree = Arc::new(Tree::new());
+
+        // spawn 3 threads for insert
+        let tree_clone = Arc::clone(&tree);
+
+        static TEXTS: [&str; 3] = ["apple", "apabc", "acbdeds"];
+
+        let mut handles = vec![];
+
+        for text in TEXTS.iter() {
+            let tree_clone = Arc::clone(&tree_clone);
+            let tenant = "tenant0";
+
+            let handle = thread::spawn(move || {
+                tree_clone.insert(text, tenant);
+            });
+
+            handles.push(handle);
+        }
+
+        // wait
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        // spawn 3 threads for match
+        let mut handles = vec![];
+
+        let tree_clone = Arc::clone(&tree);
+
+        for text in TEXTS.iter() {
+            let tree_clone = Arc::clone(&tree_clone);
+            let tenant = "tenant0";
+
+            let handle = thread::spawn(move || {
+                let (matched_text, matched_tenant) = tree_clone.prefix_match(text);
+                assert_eq!(matched_text, *text);
+                assert_eq!(matched_tenant, tenant);
+            });
+
+            handles.push(handle);
+        }
+
+        // wait
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+
+    #[test]
+    fn test_group_prefix_insert_match_concurrent() {
+        static PREFIXES: [&str; 4] = [
+            "Clock strikes midnight, I'm still wide awake",
+            "Got dreams bigger than these city lights",
+            "Time waits for no one, gotta make my move",
+            "Started from the bottom, that's no metaphor",
+        ];
+        let suffixes = [
+            "Got too much to prove, ain't got time to lose",
+            "History in the making, yeah, you can't erase this",
+        ];
+        let tree = Arc::new(Tree::new());
+
+        let mut handles = vec![];
+
+        for (i, prefix) in PREFIXES.iter().enumerate() {
+            for suffix in suffixes.iter() {
+                let tree_clone = Arc::clone(&tree);
+                let text = format!("{} {}", prefix, suffix);
+                let tenant = format!("tenant{}", i);
+
+                let handle = thread::spawn(move || {
+                    tree_clone.insert(&text, &tenant);
+                });
+
+                handles.push(handle);
+            }
+        }
+
+        // wait
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        tree.pretty_print();
+
+        // check matching using multi threads
+        let mut handles = vec![];
+
+        for (i, prefix) in PREFIXES.iter().enumerate() {
+            let tree_clone = Arc::clone(&tree);
+
+            let handle = thread::spawn(move || {
+                let (matched_text, matched_tenant) = tree_clone.prefix_match(prefix);
+                let tenant = format!("tenant{}", i);
+                assert_eq!(matched_text, *prefix);
+                assert_eq!(matched_tenant, tenant);
+            });
+
+            handles.push(handle);
+        }
+
+        // wait
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+
+    #[test]
+    fn test_mixed_concurrent_insert_match() {
+        // ensure it does not deadlock instead of doing correctness check
+
+        static PREFIXES: [&str; 4] = [
+            "Clock strikes midnight, I'm still wide awake",
+            "Got dreams bigger than these city lights",
+            "Time waits for no one, gotta make my move",
+            "Started from the bottom, that's no metaphor",
+        ];
+        let suffixes = [
+            "Got too much to prove, ain't got time to lose",
+            "History in the making, yeah, you can't erase this",
+        ];
+        let tree = Arc::new(Tree::new());
+
+        let mut handles = vec![];
+
+        for (i, prefix) in PREFIXES.iter().enumerate() {
+            for suffix in suffixes.iter() {
+                let tree_clone = Arc::clone(&tree);
+                let text = format!("{} {}", prefix, suffix);
+                let tenant = format!("tenant{}", i);
+
+                let handle = thread::spawn(move || {
+                    tree_clone.insert(&text, &tenant);
+                });
+
+                handles.push(handle);
+            }
+        }
+
+        // check matching using multi threads
+        for prefix in PREFIXES.iter() {
+            let tree_clone = Arc::clone(&tree);
+
+            let handle = thread::spawn(move || {
+                let (_matched_text, _matched_tenant) = tree_clone.prefix_match(prefix);
+            });
+
+            handles.push(handle);
+        }
+
+        // wait
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+
+    #[test]
+    fn test_utf8_split_seq() {
+        // The string should be indexed and split by a utf-8 value basis instead of byte basis
+        // use .chars() to get the iterator of the utf-8 value
+        let tree = Arc::new(Tree::new());
+
+        static TEST_PAIRS: [(&str, &str); 3] = [
+            ("你好嗎", "tenant1"),
+            ("你好喔", "tenant2"),
+            ("你心情好嗎", "tenant3"),
+        ];
+
+        // Insert sequentially
+        for (text, tenant) in TEST_PAIRS.iter() {
+            tree.insert(text, tenant);
+        }
+
+        tree.pretty_print();
+
+        // Test sequentially
+
+        for (text, tenant) in TEST_PAIRS.iter() {
+            let (matched_text, matched_tenant) = tree.prefix_match(text);
+            assert_eq!(matched_text, *text);
+            assert_eq!(matched_tenant, *tenant);
+        }
+    }
+
+    #[test]
+    fn test_utf8_split_concurrent() {
+        let tree = Arc::new(Tree::new());
+
+        static TEST_PAIRS: [(&str, &str); 3] = [
+            ("你好嗎", "tenant1"),
+            ("你好喔", "tenant2"),
+            ("你心情好嗎", "tenant3"),
+        ];
+
+        // Create multiple threads for insertion
+        let mut handles = vec![];
+
+        for (text, tenant) in TEST_PAIRS.iter() {
+            let tree_clone = Arc::clone(&tree);
+
+            let handle = thread::spawn(move || {
+                tree_clone.insert(text, tenant);
+            });
+
+            handles.push(handle);
+        }
+
+        // Wait for all insertions to complete
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        tree.pretty_print();
+
+        // Create multiple threads for matching
+        let mut handles = vec![];
+
+        for (text, tenant) in TEST_PAIRS.iter() {
+            let tree_clone = Arc::clone(&tree);
+
+            let handle = thread::spawn(move || {
+                let (matched_text, matched_tenant) = tree_clone.prefix_match(text);
+                assert_eq!(matched_text, *text);
+                assert_eq!(matched_tenant, *tenant);
+            });
+
+            handles.push(handle);
+        }
+
+        // Wait for all matches to complete
+        for handle in handles {
+            handle.join().unwrap();
+        }
+    }
+
+    #[test]
+    fn test_simple_eviction() {
+        let tree = Tree::new();
+        let max_size = 5;
+
+        // Insert strings for both tenants
+        tree.insert("hello", "tenant1"); // size 5
+
+        tree.insert("hello", "tenant2"); // size 5
+        thread::sleep(Duration::from_millis(10));
+        tree.insert("world", "tenant2"); // size 5, total for tenant2 = 10
+
+        tree.pretty_print();
+
+        // Verify initial sizes
+        let sizes_before = tree.get_used_size_per_tenant();
+        assert_eq!(sizes_before.get("tenant1").unwrap(), &5); // "hello" = 5
+        assert_eq!(sizes_before.get("tenant2").unwrap(), &10); // "hello" + "world" = 10
+
+        // Evict - should remove "hello" from tenant2 as it's the oldest
+        tree.evict_tenant_by_size(max_size);
+
+        tree.pretty_print();
+
+        // Verify sizes after eviction
+        let sizes_after = tree.get_used_size_per_tenant();
+        assert_eq!(sizes_after.get("tenant1").unwrap(), &5); // Should be unchanged
+        assert_eq!(sizes_after.get("tenant2").unwrap(), &5); // Only "world" remains
+
+        // Verify "world" remains for tenant2
+        let (matched, tenant) = tree.prefix_match("world");
+        assert_eq!(matched, "world");
+        assert_eq!(tenant, "tenant2");
+    }
+
+    #[test]
+    fn test_advanced_eviction() {
+        let tree = Tree::new();
+
+        // Set limits for each tenant
+        let max_size: usize = 100;
+
+        // Define prefixes
+        let prefixes = ["aqwefcisdf", "iajsdfkmade", "kjnzxcvewqe", "iejksduqasd"];
+
+        // Insert strings with shared prefixes
+        for _i in 0..100 {
+            for (j, prefix) in prefixes.iter().enumerate() {
+                let random_suffix = random_string(10);
+                let text = format!("{}{}", prefix, random_suffix);
+                let tenant = format!("tenant{}", j + 1);
+                tree.insert(&text, &tenant);
+            }
+        }
+
+        // Perform eviction
+        tree.evict_tenant_by_size(max_size);
+
+        // Check sizes after eviction
+        let sizes_after = tree.get_used_size_per_tenant();
+        // Verify all tenants are under their size limits
+        for (tenant, &size) in sizes_after.iter() {
+            assert!(
+                size <= max_size,
+                "Tenant {} exceeds size limit. Current size: {}, Limit: {}",
+                tenant,
+                size,
+                max_size
+            );
+        }
+    }
+
+    #[test]
+    fn test_concurrent_operations_with_eviction() {
+        // Ensure eviction works fine with concurrent insert and match operations for a given period
+
+        let tree = Arc::new(Tree::new());
+        let mut handles = vec![];
+        let test_duration = Duration::from_secs(10);
+        let start_time = Instant::now();
+        let max_size = 100; // Single max size for all tenants
+
+        // Spawn eviction thread
+        {
+            let tree = Arc::clone(&tree);
+            let handle = thread::spawn(move || {
+                while start_time.elapsed() < test_duration {
+                    // Run eviction
+                    tree.evict_tenant_by_size(max_size);
+
+                    // Sleep for 5 seconds
+                    thread::sleep(Duration::from_secs(5));
+                }
+            });
+            handles.push(handle);
+        }
+
+        // Spawn 4 worker threads
+        for thread_id in 0..4 {
+            let tree = Arc::clone(&tree);
+            let handle = thread::spawn(move || {
+                let mut rng = rand::rng();
+                let tenant = format!("tenant{}", thread_id + 1);
+                let prefix = format!("prefix{}", thread_id);
+
+                while start_time.elapsed() < test_duration {
+                    // Random decision: match or insert (70% match, 30% insert)
+                    if rng.random_bool(0.7) {
+                        // Perform match operation
+                        let random_len = rng.random_range(3..10);
+                        let search_str = format!("{}{}", prefix, random_string(random_len));
+                        let (_matched, _) = tree.prefix_match(&search_str);
+                    } else {
+                        // Perform insert operation
+                        let random_len = rng.random_range(5..15);
+                        let insert_str = format!("{}{}", prefix, random_string(random_len));
+                        tree.insert(&insert_str, &tenant);
+                        // println!("Thread {} inserted: {}", thread_id, insert_str);
+                    }
+
+                    // Small random sleep to vary timing
+                    thread::sleep(Duration::from_millis(rng.random_range(10..100)));
+                }
+            });
+            handles.push(handle);
+        }
+
+        // Wait for all threads to complete
+        for handle in handles {
+            handle.join().unwrap();
+        }
+
+        // final eviction
+        tree.evict_tenant_by_size(max_size);
+
+        // Final size check
+        let final_sizes = tree.get_used_size_per_tenant();
+        println!("Final sizes after test completion: {:?}", final_sizes);
+
+        // Verify all tenants are under limit
+        for (_, &size) in final_sizes.iter() {
+            assert!(
+                size <= max_size,
+                "Tenant exceeds size limit. Final size: {}, Limit: {}",
+                size,
+                max_size
+            );
+        }
+    }
+
+    #[test]
+    fn test_leaf_of() {
+        let tree = Tree::new();
+
+        // Single node
+        tree.insert("hello", "tenant1");
+        let leaves = Tree::leaf_of(&tree.root.children.get(&'h').unwrap());
+        assert_eq!(leaves, vec!["tenant1"]);
+
+        // Node with multiple tenants
+        tree.insert("hello", "tenant2");
+        let leaves = Tree::leaf_of(&tree.root.children.get(&'h').unwrap());
+        assert_eq!(leaves.len(), 2);
+        assert!(leaves.contains(&"tenant1".to_string()));
+        assert!(leaves.contains(&"tenant2".to_string()));
+
+        // Non-leaf node
+        tree.insert("hi", "tenant1");
+        let leaves = Tree::leaf_of(&tree.root.children.get(&'h').unwrap());
+        assert!(leaves.is_empty());
+    }
+
+    #[test]
+    fn test_get_used_size_per_tenant() {
+        let tree = Tree::new();
+
+        // Single tenant
+        tree.insert("hello", "tenant1");
+        tree.insert("world", "tenant1");
+        let sizes = tree.get_used_size_per_tenant();
+
+        tree.pretty_print();
+        println!("{:?}", sizes);
+        assert_eq!(sizes.get("tenant1").unwrap(), &10); // "hello" + "world"
+
+        // Multiple tenants sharing nodes
+        tree.insert("hello", "tenant2");
+        tree.insert("help", "tenant2");
+        let sizes = tree.get_used_size_per_tenant();
+
+        tree.pretty_print();
+        println!("{:?}", sizes);
+        assert_eq!(sizes.get("tenant1").unwrap(), &10);
+        assert_eq!(sizes.get("tenant2").unwrap(), &6); // "hello" + "p"
+
+        // UTF-8 characters
+        tree.insert("你好", "tenant3");
+        let sizes = tree.get_used_size_per_tenant();
+        tree.pretty_print();
+        println!("{:?}", sizes);
+        assert_eq!(sizes.get("tenant3").unwrap(), &2); // 2 Chinese characters
+
+        tree.pretty_print();
+    }
+
+    #[test]
+    fn test_prefix_match_tenant() {
+        let tree = Tree::new();
+
+        // Insert overlapping prefixes for different tenants
+        tree.insert("hello", "tenant1"); // tenant1: hello
+        tree.insert("hello", "tenant2"); // tenant2: hello
+        tree.insert("hello world", "tenant2"); // tenant2: hello -> world
+        tree.insert("help", "tenant1"); // tenant1: hel -> p
+        tree.insert("helicopter", "tenant2"); // tenant2: hel -> icopter
+
+        // Test tenant1's data
+        assert_eq!(tree.prefix_match_tenant("hello", "tenant1"), "hello"); // Full match for tenant1
+        assert_eq!(tree.prefix_match_tenant("help", "tenant1"), "help"); // Exclusive to tenant1
+        assert_eq!(tree.prefix_match_tenant("hel", "tenant1"), "hel"); // Shared prefix
+        assert_eq!(tree.prefix_match_tenant("hello world", "tenant1"), "hello"); // Should stop at tenant1's boundary
+        assert_eq!(tree.prefix_match_tenant("helicopter", "tenant1"), "hel"); // Should stop at tenant1's boundary
+
+        // Test tenant2's data
+        assert_eq!(tree.prefix_match_tenant("hello", "tenant2"), "hello"); // Full match for tenant2
+        assert_eq!(
+            tree.prefix_match_tenant("hello world", "tenant2"),
+            "hello world"
+        ); // Exclusive to tenant2
+        assert_eq!(
+            tree.prefix_match_tenant("helicopter", "tenant2"),
+            "helicopter"
+        ); // Exclusive to tenant2
+        assert_eq!(tree.prefix_match_tenant("hel", "tenant2"), "hel"); // Shared prefix
+        assert_eq!(tree.prefix_match_tenant("help", "tenant2"), "hel"); // Should stop at tenant2's boundary
+
+        // Test non-existent tenant
+        assert_eq!(tree.prefix_match_tenant("hello", "tenant3"), ""); // Non-existent tenant
+        assert_eq!(tree.prefix_match_tenant("help", "tenant3"), ""); // Non-existent tenant
+    }
+
+    #[test]
+    fn test_simple_tenant_eviction() {
+        let tree = Tree::new();
+
+        // Insert data for multiple tenants
+        tree.insert("hello", "tenant1");
+        tree.insert("world", "tenant1");
+        tree.insert("hello", "tenant2");
+        tree.insert("help", "tenant2");
+
+        // Verify initial state
+        let initial_sizes = tree.get_used_size_per_tenant();
+        assert_eq!(initial_sizes.get("tenant1").unwrap(), &10); // "hello" + "world"
+        assert_eq!(initial_sizes.get("tenant2").unwrap(), &6); // "hello" + "p"
+
+        // Evict tenant1
+        tree.remove_tenant("tenant1");
+
+        // Verify after eviction
+        let final_sizes = tree.get_used_size_per_tenant();
+        assert!(
+            !final_sizes.contains_key("tenant1"),
+            "tenant1 should be completely removed"
+        );
+        assert_eq!(
+            final_sizes.get("tenant2").unwrap(),
+            &6,
+            "tenant2 should be unaffected"
+        );
+
+        // Verify tenant1's data is inaccessible
+        assert_eq!(tree.prefix_match_tenant("hello", "tenant1"), "");
+        assert_eq!(tree.prefix_match_tenant("world", "tenant1"), "");
+
+        // Verify tenant2's data is still accessible
+        assert_eq!(tree.prefix_match_tenant("hello", "tenant2"), "hello");
+        assert_eq!(tree.prefix_match_tenant("help", "tenant2"), "help");
+    }
+
+    #[test]
+    fn test_complex_tenant_eviction() {
+        let tree = Tree::new();
+
+        // Create a more complex tree structure with shared prefixes
+        tree.insert("apple", "tenant1");
+        tree.insert("application", "tenant1");
+        tree.insert("apple", "tenant2");
+        tree.insert("appetite", "tenant2");
+        tree.insert("banana", "tenant1");
+        tree.insert("banana", "tenant2");
+        tree.insert("ball", "tenant2");
+
+        // Verify initial state
+        let initial_sizes = tree.get_used_size_per_tenant();
+        println!("Initial sizes: {:?}", initial_sizes);
+        tree.pretty_print();
+
+        // Evict tenant1
+        tree.remove_tenant("tenant1");
+
+        // Verify final state
+        let final_sizes = tree.get_used_size_per_tenant();
+        println!("Final sizes: {:?}", final_sizes);
+        tree.pretty_print();
+
+        // Verify tenant1 is completely removed
+        assert!(
+            !final_sizes.contains_key("tenant1"),
+            "tenant1 should be completely removed"
+        );
+
+        // Verify all tenant1's data is inaccessible
+        assert_eq!(tree.prefix_match_tenant("apple", "tenant1"), "");
+        assert_eq!(tree.prefix_match_tenant("application", "tenant1"), "");
+        assert_eq!(tree.prefix_match_tenant("banana", "tenant1"), "");
+
+        // Verify tenant2's data is intact
+        assert_eq!(tree.prefix_match_tenant("apple", "tenant2"), "apple");
+        assert_eq!(tree.prefix_match_tenant("appetite", "tenant2"), "appetite");
+        assert_eq!(tree.prefix_match_tenant("banana", "tenant2"), "banana");
+        assert_eq!(tree.prefix_match_tenant("ball", "tenant2"), "ball");
+
+        // Verify the tree structure is still valid for tenant2
+        let tenant2_size = final_sizes.get("tenant2").unwrap();
+        assert_eq!(tenant2_size, &(5 + 5 + 6 + 2)); // "apple" + "etite" + "banana" + "ll"
+    }
+}
diff --git a/sgl-router/tests/api_endpoints_test.rs b/sgl-router/tests/api_endpoints_test.rs
new file mode 100644
index 000000000..8b2e29714
--- /dev/null
+++ b/sgl-router/tests/api_endpoints_test.rs
@@ -0,0 +1,1669 @@
+mod common;
+
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{header::CONTENT_TYPE, StatusCode},
+};
+use common::mock_worker::{HealthStatus, MockWorker, MockWorkerConfig, WorkerType};
+use reqwest::Client;
+use serde_json::json;
+use sglang_router_rs::config::{
+    CircuitBreakerConfig, ConnectionMode, PolicyConfig, RetryConfig, RouterConfig, RoutingMode,
+};
+use sglang_router_rs::routers::{RouterFactory, RouterTrait};
+use std::sync::Arc;
+use tower::ServiceExt;
+
+/// Test context that manages mock workers
+struct TestContext {
+    workers: Vec<MockWorker>,
+    router: Arc<dyn RouterTrait>,
+    client: Client,
+    config: RouterConfig,
+}
+
+impl TestContext {
+    async fn new(worker_configs: Vec<MockWorkerConfig>) -> Self {
+        // Create default router config
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            policy: PolicyConfig::Random,
+            host: "127.0.0.1".to_string(),
+            port: 3002,
+            max_payload_size: 256 * 1024 * 1024,
+            request_timeout_secs: 600,
+            worker_startup_timeout_secs: 1,
+            worker_startup_check_interval_secs: 1,
+            discovery: None,
+            dp_aware: false,
+            api_key: None,
+            metrics: None,
+            log_dir: None,
+            log_level: None,
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            queue_size: 0,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: sglang_router_rs::config::HealthCheckConfig::default(),
+            enable_igw: false,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        Self::new_with_config(config, worker_configs).await
+    }
+
+    async fn new_with_config(
+        mut config: RouterConfig,
+        worker_configs: Vec<MockWorkerConfig>,
+    ) -> Self {
+        let mut workers = Vec::new();
+        let mut worker_urls = Vec::new();
+
+        // Start mock workers if any
+        for worker_config in worker_configs {
+            let mut worker = MockWorker::new(worker_config);
+            let url = worker.start().await.unwrap();
+            worker_urls.push(url);
+            workers.push(worker);
+        }
+
+        if !workers.is_empty() {
+            tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
+        }
+
+        // Update config with worker URLs if not already set
+        if let RoutingMode::Regular {
+            worker_urls: ref mut urls,
+        } = config.mode
+        {
+            if urls.is_empty() {
+                *urls = worker_urls.clone();
+            }
+        }
+
+        let client = Client::builder()
+            .timeout(std::time::Duration::from_secs(config.request_timeout_secs))
+            .build()
+            .unwrap();
+
+        // Create app context
+        let app_context = common::create_test_context(config.clone());
+
+        // Create router
+        let router = RouterFactory::create_router(&app_context).await.unwrap();
+        let router = Arc::from(router);
+
+        // Wait for router to discover workers
+        if !workers.is_empty() {
+            tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+        }
+
+        Self {
+            workers,
+            router,
+            client,
+            config,
+        }
+    }
+
+    async fn create_app(&self) -> axum::Router {
+        common::test_app::create_test_app(
+            Arc::clone(&self.router),
+            self.client.clone(),
+            &self.config,
+        )
+    }
+
+    async fn shutdown(mut self) {
+        for worker in &mut self.workers {
+            worker.stop().await;
+        }
+    }
+}
+
+#[cfg(test)]
+mod health_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_liveness_endpoint() {
+        let ctx = TestContext::new(vec![]).await;
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/liveness")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_readiness_with_healthy_workers() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18001,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/readiness")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_readiness_with_unhealthy_workers() {
+        let ctx = TestContext::new(vec![]).await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/readiness")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        // With no workers, readiness should return SERVICE_UNAVAILABLE
+        assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_health_endpoint_details() {
+        let ctx = TestContext::new(vec![
+            MockWorkerConfig {
+                port: 18003,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+            MockWorkerConfig {
+                port: 18004,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+        ])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/health")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        // The health endpoint returns plain text, not JSON
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_str = String::from_utf8_lossy(&body);
+        assert!(body_str.contains("All servers healthy"));
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_health_generate_endpoint() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18005,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/health_generate")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert!(body_json.is_object());
+
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod generation_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_generate_success() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18101,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let payload = json!({
+            "text": "Hello, world!",
+            "stream": false
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert!(body_json.get("text").is_some());
+        assert!(body_json.get("meta_info").is_some());
+        let meta_info = &body_json["meta_info"];
+        assert!(meta_info.get("finish_reason").is_some());
+        assert_eq!(meta_info["finish_reason"]["type"], "stop");
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_generate_streaming() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18102,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let payload = json!({
+            "text": "Stream test",
+            "stream": true
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        // For streaming responses, the router might use chunked encoding or other streaming mechanisms
+        // The exact content-type can vary based on the router implementation
+        // Just verify we got a successful response
+        // Note: In a real implementation, we'd check for text/event-stream or appropriate streaming headers
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_generate_with_worker_failure() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18103,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 1.0, // Always fail
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let payload = json!({
+            "text": "This should fail",
+            "stream": false
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_v1_chat_completions_success() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18104,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let payload = json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "Hello!"}
+            ],
+            "stream": false
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/v1/chat/completions")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert!(body_json.get("choices").is_some());
+
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod model_info_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_get_server_info() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18201,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/get_server_info")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert!(body_json.is_object());
+        // Check for actual sglang server fields
+        assert!(body_json.get("version").is_some());
+        assert!(body_json.get("model_path").is_some());
+        assert!(body_json.get("tokenizer_path").is_some());
+        assert!(body_json.get("port").is_some());
+        assert!(body_json.get("max_num_batched_tokens").is_some());
+        assert!(body_json.get("schedule_policy").is_some());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_get_model_info() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18202,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/get_model_info")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert!(body_json.is_object());
+        // Check for actual sglang model info fields
+        assert_eq!(
+            body_json.get("model_path").and_then(|v| v.as_str()),
+            Some("mock-model-path")
+        );
+        assert_eq!(
+            body_json.get("tokenizer_path").and_then(|v| v.as_str()),
+            Some("mock-tokenizer-path")
+        );
+        assert_eq!(
+            body_json.get("is_generation").and_then(|v| v.as_bool()),
+            Some(true)
+        );
+        assert!(body_json.get("preferred_sampling_params").is_some());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_v1_models() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18203,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/v1/models")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert!(body_json.get("object").is_some());
+        assert_eq!(
+            body_json.get("object").and_then(|v| v.as_str()),
+            Some("list")
+        );
+
+        let data = body_json.get("data").and_then(|v| v.as_array());
+        assert!(data.is_some());
+
+        let models = data.unwrap();
+        assert!(!models.is_empty());
+
+        let first_model = &models[0];
+        assert_eq!(
+            first_model.get("id").and_then(|v| v.as_str()),
+            Some("mock-model")
+        );
+        assert_eq!(
+            first_model.get("object").and_then(|v| v.as_str()),
+            Some("model")
+        );
+        assert!(first_model.get("created").is_some());
+        assert_eq!(
+            first_model.get("owned_by").and_then(|v| v.as_str()),
+            Some("organization-owner")
+        );
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_model_info_with_no_workers() {
+        let ctx = TestContext::new(vec![]).await;
+        let app = ctx.create_app().await;
+
+        // Test server info with no workers
+        let req = Request::builder()
+            .method("GET")
+            .uri("/get_server_info")
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.clone().oneshot(req).await.unwrap();
+        // Router may return various error codes when no workers
+        assert!(
+            resp.status() == StatusCode::OK
+                || resp.status() == StatusCode::SERVICE_UNAVAILABLE
+                || resp.status() == StatusCode::NOT_FOUND
+                || resp.status() == StatusCode::INTERNAL_SERVER_ERROR,
+            "Unexpected status code: {:?}",
+            resp.status()
+        );
+
+        // Test model info with no workers
+        let req = Request::builder()
+            .method("GET")
+            .uri("/get_model_info")
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.clone().oneshot(req).await.unwrap();
+        // Router may return various error codes when no workers
+        assert!(
+            resp.status() == StatusCode::OK
+                || resp.status() == StatusCode::SERVICE_UNAVAILABLE
+                || resp.status() == StatusCode::NOT_FOUND
+                || resp.status() == StatusCode::INTERNAL_SERVER_ERROR,
+            "Unexpected status code: {:?}",
+            resp.status()
+        );
+
+        // Test v1/models with no workers
+        let req = Request::builder()
+            .method("GET")
+            .uri("/v1/models")
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.oneshot(req).await.unwrap();
+        // Router may return various error codes when no workers
+        assert!(
+            resp.status() == StatusCode::OK
+                || resp.status() == StatusCode::SERVICE_UNAVAILABLE
+                || resp.status() == StatusCode::NOT_FOUND
+                || resp.status() == StatusCode::INTERNAL_SERVER_ERROR,
+            "Unexpected status code: {:?}",
+            resp.status()
+        );
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_model_info_with_multiple_workers() {
+        let ctx = TestContext::new(vec![
+            MockWorkerConfig {
+                port: 18204,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+            MockWorkerConfig {
+                port: 18205,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+        ])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // Test that model info is consistent across workers
+        for _ in 0..5 {
+            let req = Request::builder()
+                .method("GET")
+                .uri("/get_model_info")
+                .body(Body::empty())
+                .unwrap();
+
+            let resp = app.clone().oneshot(req).await.unwrap();
+            assert_eq!(resp.status(), StatusCode::OK);
+
+            let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+                .await
+                .unwrap();
+            let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+            assert_eq!(
+                body_json.get("model_path").and_then(|v| v.as_str()),
+                Some("mock-model-path")
+            );
+        }
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_model_info_with_unhealthy_worker() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18206,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 1.0, // Always fail
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/get_model_info")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        // Worker with fail_rate: 1.0 should always return an error status
+        assert!(
+            resp.status() == StatusCode::INTERNAL_SERVER_ERROR
+                || resp.status() == StatusCode::SERVICE_UNAVAILABLE,
+            "Expected error status for always-failing worker, got: {:?}",
+            resp.status()
+        );
+
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod worker_management_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_add_new_worker() {
+        let ctx = TestContext::new(vec![]).await;
+        let app = ctx.create_app().await;
+
+        // Start a mock worker
+        let mut worker = MockWorker::new(MockWorkerConfig {
+            port: 18301,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        });
+        let url = worker.start().await.unwrap();
+
+        // Add the worker
+        let req = Request::builder()
+            .method("POST")
+            .uri(format!("/add_worker?url={}", url))
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        // List workers to verify
+        let req = Request::builder()
+            .method("GET")
+            .uri("/list_workers")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let workers = body_json["urls"].as_array().unwrap();
+        assert!(workers.iter().any(|w| w.as_str().unwrap() == url));
+
+        worker.stop().await;
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_remove_existing_worker() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18302,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // Get the worker URL
+        let req = Request::builder()
+            .method("GET")
+            .uri("/list_workers")
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.clone().oneshot(req).await.unwrap();
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let workers = body_json["urls"].as_array().unwrap();
+        let worker_url = workers[0].as_str().unwrap();
+
+        // Remove the worker
+        let req = Request::builder()
+            .method("POST")
+            .uri(format!("/remove_worker?url={}", worker_url))
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        // Verify it's removed
+        let req = Request::builder()
+            .method("GET")
+            .uri("/list_workers")
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.oneshot(req).await.unwrap();
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let workers = body_json["urls"].as_array().unwrap();
+        assert!(workers.is_empty());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_add_worker_invalid_url() {
+        let ctx = TestContext::new(vec![]).await;
+        let app = ctx.create_app().await;
+
+        // Invalid URL format
+        let req = Request::builder()
+            .method("POST")
+            .uri("/add_worker?url=not-a-valid-url")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
+
+        // Missing URL parameter
+        let req = Request::builder()
+            .method("POST")
+            .uri("/add_worker")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
+
+        // Empty URL
+        let req = Request::builder()
+            .method("POST")
+            .uri("/add_worker?url=")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_add_duplicate_worker() {
+        // Start a mock worker
+        let mut worker = MockWorker::new(MockWorkerConfig {
+            port: 18303,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        });
+        let url = worker.start().await.unwrap();
+
+        let ctx = TestContext::new(vec![]).await;
+        let app = ctx.create_app().await;
+
+        // Add worker first time
+        let req = Request::builder()
+            .method("POST")
+            .uri(format!("/add_worker?url={}", url))
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+
+        // Try to add same worker again
+        let req = Request::builder()
+            .method("POST")
+            .uri(format!("/add_worker?url={}", url))
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.oneshot(req).await.unwrap();
+        // Should return error for duplicate
+        assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
+
+        worker.stop().await;
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_add_unhealthy_worker() {
+        // Start unhealthy worker
+        let mut worker = MockWorker::new(MockWorkerConfig {
+            port: 18304,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Unhealthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        });
+        let url = worker.start().await.unwrap();
+
+        let ctx = TestContext::new(vec![]).await;
+        let app = ctx.create_app().await;
+
+        // Try to add unhealthy worker
+        let req = Request::builder()
+            .method("POST")
+            .uri(format!("/add_worker?url={}", url))
+            .body(Body::empty())
+            .unwrap();
+        let resp = app.oneshot(req).await.unwrap();
+
+        // Router should reject unhealthy workers
+        assert!(
+            resp.status() == StatusCode::BAD_REQUEST
+                || resp.status() == StatusCode::SERVICE_UNAVAILABLE
+        );
+
+        worker.stop().await;
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod router_policy_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_random_policy() {
+        let ctx = TestContext::new(vec![
+            MockWorkerConfig {
+                port: 18801,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+            MockWorkerConfig {
+                port: 18802,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+        ])
+        .await;
+
+        // Send multiple requests and verify they succeed
+        let app = ctx.create_app().await;
+
+        for i in 0..10 {
+            let payload = json!({
+                "text": format!("Request {}", i),
+                "stream": false
+            });
+
+            let req = Request::builder()
+                .method("POST")
+                .uri("/generate")
+                .header(CONTENT_TYPE, "application/json")
+                .body(Body::from(serde_json::to_string(&payload).unwrap()))
+                .unwrap();
+
+            let resp = app.clone().oneshot(req).await.unwrap();
+            assert_eq!(resp.status(), StatusCode::OK);
+        }
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_worker_selection() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18203,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let _payload = json!({
+            "text": "Test selection",
+            "stream": false
+        });
+
+        // Check that router has the worker
+        let worker_urls = ctx.router.get_worker_urls();
+        assert_eq!(worker_urls.len(), 1);
+        assert!(worker_urls[0].contains("18203"));
+
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod error_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_404_not_found() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18401,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // Test unknown endpoint
+        let req = Request::builder()
+            .method("GET")
+            .uri("/unknown_endpoint")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::NOT_FOUND);
+
+        // Test POST to unknown endpoint
+        let req = Request::builder()
+            .method("POST")
+            .uri("/api/v2/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(
+                serde_json::to_string(&json!({"text": "test"})).unwrap(),
+            ))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::NOT_FOUND);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_method_not_allowed() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18402,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // GET request to POST-only endpoint
+        let req = Request::builder()
+            .method("GET")
+            .uri("/generate")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        // Note: Axum returns 405 for wrong methods on matched routes
+        assert_eq!(resp.status(), StatusCode::METHOD_NOT_ALLOWED);
+
+        // POST request to GET-only endpoint
+        let req = Request::builder()
+            .method("POST")
+            .uri("/health")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from("{}"))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::METHOD_NOT_ALLOWED);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_payload_too_large() {
+        // Create context with small payload limit
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            policy: PolicyConfig::Random,
+            host: "127.0.0.1".to_string(),
+            port: 3010,
+            max_payload_size: 1024, // 1KB limit
+            request_timeout_secs: 600,
+            worker_startup_timeout_secs: 1,
+            worker_startup_check_interval_secs: 1,
+            dp_aware: false,
+            api_key: None,
+            discovery: None,
+            metrics: None,
+            log_dir: None,
+            log_level: None,
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            queue_size: 0,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: sglang_router_rs::config::HealthCheckConfig::default(),
+            enable_igw: false,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        let ctx = TestContext::new_with_config(
+            config,
+            vec![MockWorkerConfig {
+                port: 18403,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            }],
+        )
+        .await;
+
+        // Note: The server would have payload size middleware configured
+        // but we cannot test it directly through the test app
+        // This test is kept for documentation purposes
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_invalid_json_payload() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18404,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // Send invalid JSON
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from("{invalid json}"))
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
+
+        // Send empty body
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_missing_required_fields() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18405,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // Missing messages in chat completion
+        let payload = json!({
+            "model": "test-model"
+            // missing "messages"
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/v1/chat/completions")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        // Axum validates JSON schema - returns 422 for validation errors
+        assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_invalid_model() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18406,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let payload = json!({
+            "model": "invalid-model-name-that-does-not-exist",
+            "messages": [{"role": "user", "content": "Hello"}],
+            "stream": false
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/v1/chat/completions")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        // Mock worker accepts any model, but real implementation might return 400
+        assert!(resp.status().is_success() || resp.status() == StatusCode::BAD_REQUEST);
+
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod cache_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_flush_cache() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18501,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/flush_cache")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        // The response might be empty or contain a message
+        let body_bytes = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        if !body_bytes.is_empty() {
+            if let Ok(body) = serde_json::from_slice::<serde_json::Value>(&body_bytes) {
+                // Check that we got a successful response with expected fields
+                assert!(body.is_object());
+                assert!(body.get("message").is_some() || body.get("status").is_some());
+            }
+        }
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_get_loads() {
+        let ctx = TestContext::new(vec![
+            MockWorkerConfig {
+                port: 18502,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+            MockWorkerConfig {
+                port: 18503,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+        ])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("GET")
+            .uri("/get_loads")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let body = axum::body::to_bytes(resp.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let body_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+
+        // Verify the response contains load information
+        assert!(body_json.is_object());
+        // The exact structure depends on the implementation
+        // but should contain worker load information
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_flush_cache_no_workers() {
+        let ctx = TestContext::new(vec![]).await;
+
+        let app = ctx.create_app().await;
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/flush_cache")
+            .body(Body::empty())
+            .unwrap();
+
+        let resp = app.oneshot(req).await.unwrap();
+        // Should either succeed (no-op) or return service unavailable
+        assert!(
+            resp.status() == StatusCode::OK || resp.status() == StatusCode::SERVICE_UNAVAILABLE
+        );
+
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod load_balancing_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_request_distribution() {
+        // Create multiple workers
+        let ctx = TestContext::new(vec![
+            MockWorkerConfig {
+                port: 18601,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+            MockWorkerConfig {
+                port: 18602,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            },
+        ])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // Send multiple requests and track distribution
+        let mut request_count = 0;
+        for i in 0..10 {
+            let payload = json!({
+                "text": format!("Request {}", i),
+                "stream": false
+            });
+
+            let req = Request::builder()
+                .method("POST")
+                .uri("/generate")
+                .header(CONTENT_TYPE, "application/json")
+                .body(Body::from(serde_json::to_string(&payload).unwrap()))
+                .unwrap();
+
+            let resp = app.clone().oneshot(req).await.unwrap();
+            if resp.status() == StatusCode::OK {
+                request_count += 1;
+            }
+        }
+
+        // With random policy, all requests should succeed
+        assert_eq!(request_count, 10);
+
+        ctx.shutdown().await;
+    }
+}
+
+#[cfg(test)]
+mod pd_mode_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_pd_mode_routing() {
+        // Create PD mode configuration with prefill and decode workers
+        let mut prefill_worker = MockWorker::new(MockWorkerConfig {
+            port: 18701,
+            worker_type: WorkerType::Prefill,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        });
+
+        let mut decode_worker = MockWorker::new(MockWorkerConfig {
+            port: 18702,
+            worker_type: WorkerType::Decode,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        });
+
+        let prefill_url = prefill_worker.start().await.unwrap();
+        let decode_url = decode_worker.start().await.unwrap();
+
+        tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
+
+        // Extract port from prefill URL
+        let prefill_port = prefill_url
+            .split(':')
+            .next_back()
+            .and_then(|p| p.trim_end_matches('/').parse::<u16>().ok())
+            .unwrap_or(9000);
+
+        let config = RouterConfig {
+            mode: RoutingMode::PrefillDecode {
+                prefill_urls: vec![(prefill_url, Some(prefill_port))],
+                decode_urls: vec![decode_url],
+                prefill_policy: None,
+                decode_policy: None,
+            },
+            policy: PolicyConfig::Random,
+            host: "127.0.0.1".to_string(),
+            port: 3011,
+            max_payload_size: 256 * 1024 * 1024,
+            request_timeout_secs: 600,
+            worker_startup_timeout_secs: 1,
+            worker_startup_check_interval_secs: 1,
+            discovery: None,
+            metrics: None,
+            log_dir: None,
+            dp_aware: false,
+            api_key: None,
+            log_level: None,
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            queue_size: 0,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: sglang_router_rs::config::HealthCheckConfig::default(),
+            enable_igw: false,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        // Create app context
+        let app_context = common::create_test_context(config);
+
+        // Create router - this might fail due to health check issues
+        let router_result = RouterFactory::create_router(&app_context).await;
+
+        // Clean up workers
+        prefill_worker.stop().await;
+        decode_worker.stop().await;
+
+        // For now, just verify the configuration was attempted
+        assert!(router_result.is_err() || router_result.is_ok());
+    }
+}
+
+#[cfg(test)]
+mod request_id_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_request_id_generation() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 18901,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let app = ctx.create_app().await;
+
+        // Test 1: Request without any request ID header should generate one
+        let payload = json!({
+            "text": "Test request",
+            "stream": false
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        // Check that response has x-request-id header
+        let request_id = resp.headers().get("x-request-id");
+        assert!(
+            request_id.is_some(),
+            "Response should have x-request-id header"
+        );
+
+        let id_value = request_id.unwrap().to_str().unwrap();
+        assert!(
+            id_value.starts_with("gnt-"),
+            "Generate endpoint should have gnt- prefix"
+        );
+        assert!(
+            id_value.len() > 4,
+            "Request ID should have content after prefix"
+        );
+
+        // Test 2: Request with custom x-request-id should preserve it
+        let custom_id = "custom-request-id-123";
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .header("x-request-id", custom_id)
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let response_id = resp.headers().get("x-request-id");
+        assert!(response_id.is_some());
+        assert_eq!(response_id.unwrap(), custom_id);
+
+        // Test 3: Different endpoints should have different prefixes
+        let chat_payload = json!({
+            "messages": [{"role": "user", "content": "Hello"}],
+            "model": "test-model"
+        });
+
+        let req = Request::builder()
+            .method("POST")
+            .uri("/v1/chat/completions")
+            .header(CONTENT_TYPE, "application/json")
+            .body(Body::from(serde_json::to_string(&chat_payload).unwrap()))
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let request_id = resp.headers().get("x-request-id");
+        assert!(request_id.is_some());
+        assert!(request_id
+            .unwrap()
+            .to_str()
+            .unwrap()
+            .starts_with("chatcmpl-"));
+
+        // Test 4: Alternative request ID headers should be recognized
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .header("x-correlation-id", "correlation-123")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let response_id = resp.headers().get("x-request-id");
+        assert!(response_id.is_some());
+        assert_eq!(response_id.unwrap(), "correlation-123");
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_request_id_with_custom_headers() {
+        // Create config with custom request ID headers
+        let config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            policy: PolicyConfig::Random,
+            host: "127.0.0.1".to_string(),
+            port: 3002,
+            max_payload_size: 256 * 1024 * 1024,
+            request_timeout_secs: 600,
+            worker_startup_timeout_secs: 1,
+            worker_startup_check_interval_secs: 1,
+            discovery: None,
+            metrics: None,
+            dp_aware: false,
+            api_key: None,
+            log_dir: None,
+            log_level: None,
+            request_id_headers: Some(vec!["custom-id".to_string(), "trace-id".to_string()]),
+            max_concurrent_requests: 64,
+            queue_size: 0,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: sglang_router_rs::config::HealthCheckConfig::default(),
+            enable_igw: false,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        let ctx = TestContext::new_with_config(
+            config,
+            vec![MockWorkerConfig {
+                port: 18902,
+                worker_type: WorkerType::Regular,
+                health_status: HealthStatus::Healthy,
+                response_delay_ms: 0,
+                fail_rate: 0.0,
+            }],
+        )
+        .await;
+
+        let app = ctx.create_app().await;
+
+        let payload = json!({
+            "text": "Test request",
+            "stream": false
+        });
+
+        // Test custom header is recognized
+        let req = Request::builder()
+            .method("POST")
+            .uri("/generate")
+            .header(CONTENT_TYPE, "application/json")
+            .header("custom-id", "my-custom-id")
+            .body(Body::from(serde_json::to_string(&payload).unwrap()))
+            .unwrap();
+
+        let resp = app.clone().oneshot(req).await.unwrap();
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let response_id = resp.headers().get("x-request-id");
+        assert!(response_id.is_some());
+        assert_eq!(response_id.unwrap(), "my-custom-id");
+
+        ctx.shutdown().await;
+    }
+}
diff --git a/sgl-router/tests/common/mock_mcp_server.rs b/sgl-router/tests/common/mock_mcp_server.rs
new file mode 100644
index 000000000..daeec8001
--- /dev/null
+++ b/sgl-router/tests/common/mock_mcp_server.rs
@@ -0,0 +1,177 @@
+// tests/common/mock_mcp_server.rs - Mock MCP server for testing
+use rmcp::{
+    handler::server::{router::tool::ToolRouter, wrapper::Parameters},
+    model::*,
+    service::RequestContext,
+    tool, tool_handler, tool_router,
+    transport::streamable_http_server::{
+        session::local::LocalSessionManager, StreamableHttpService,
+    },
+    ErrorData as McpError, RoleServer, ServerHandler,
+};
+use tokio::net::TcpListener;
+
+/// Mock MCP server that returns hardcoded responses for testing
+pub struct MockMCPServer {
+    pub port: u16,
+    pub server_handle: Option<tokio::task::JoinHandle<()>>,
+}
+
+/// Simple test server with mock search tools
+#[derive(Clone)]
+pub struct MockSearchServer {
+    tool_router: ToolRouter<MockSearchServer>,
+}
+
+#[tool_router]
+impl MockSearchServer {
+    pub fn new() -> Self {
+        Self {
+            tool_router: Self::tool_router(),
+        }
+    }
+
+    #[tool(description = "Mock web search tool")]
+    fn brave_web_search(
+        &self,
+        Parameters(params): Parameters<serde_json::Map<String, serde_json::Value>>,
+    ) -> Result<CallToolResult, McpError> {
+        let query = params
+            .get("query")
+            .and_then(|v| v.as_str())
+            .unwrap_or("test");
+        Ok(CallToolResult::success(vec![Content::text(format!(
+            "Mock search results for: {}",
+            query
+        ))]))
+    }
+
+    #[tool(description = "Mock local search tool")]
+    fn brave_local_search(
+        &self,
+        Parameters(_params): Parameters<serde_json::Map<String, serde_json::Value>>,
+    ) -> Result<CallToolResult, McpError> {
+        Ok(CallToolResult::success(vec![Content::text(
+            "Mock local search results",
+        )]))
+    }
+}
+
+#[tool_handler]
+impl ServerHandler for MockSearchServer {
+    fn get_info(&self) -> ServerInfo {
+        ServerInfo {
+            protocol_version: ProtocolVersion::V_2024_11_05,
+            capabilities: ServerCapabilities::builder().enable_tools().build(),
+            server_info: Implementation::from_build_env(),
+            instructions: Some("Mock server for testing".to_string()),
+        }
+    }
+
+    async fn initialize(
+        &self,
+        _request: InitializeRequestParam,
+        _context: RequestContext<RoleServer>,
+    ) -> Result<InitializeResult, McpError> {
+        Ok(self.get_info())
+    }
+}
+
+impl MockMCPServer {
+    /// Start a mock MCP server on an available port
+    pub async fn start() -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
+        // Find an available port
+        let listener = TcpListener::bind("127.0.0.1:0").await?;
+        let port = listener.local_addr()?.port();
+
+        // Create the MCP service using rmcp's StreamableHttpService
+        let service = StreamableHttpService::new(
+            || Ok(MockSearchServer::new()),
+            LocalSessionManager::default().into(),
+            Default::default(),
+        );
+
+        let app = axum::Router::new().nest_service("/mcp", service);
+
+        let server_handle = tokio::spawn(async move {
+            axum::serve(listener, app)
+                .await
+                .expect("Mock MCP server failed to start");
+        });
+
+        // Give the server a moment to start
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+        Ok(MockMCPServer {
+            port,
+            server_handle: Some(server_handle),
+        })
+    }
+
+    /// Get the full URL for this mock server
+    pub fn url(&self) -> String {
+        format!("http://127.0.0.1:{}/mcp", self.port)
+    }
+
+    /// Stop the mock server
+    pub async fn stop(&mut self) {
+        if let Some(handle) = self.server_handle.take() {
+            handle.abort();
+            // Wait a moment for cleanup
+            tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+        }
+    }
+}
+
+impl Drop for MockMCPServer {
+    fn drop(&mut self) {
+        if let Some(handle) = self.server_handle.take() {
+            handle.abort();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    #[allow(unused_imports)]
+    use super::MockMCPServer;
+
+    #[tokio::test]
+    async fn test_mock_server_startup() {
+        let mut server = MockMCPServer::start().await.unwrap();
+        assert!(server.port > 0);
+        assert!(server.url().contains(&server.port.to_string()));
+        server.stop().await;
+    }
+
+    #[tokio::test]
+    async fn test_mock_server_with_rmcp_client() {
+        let mut server = MockMCPServer::start().await.unwrap();
+
+        // Test that we can connect with rmcp client
+        use rmcp::transport::StreamableHttpClientTransport;
+        use rmcp::ServiceExt;
+
+        let transport = StreamableHttpClientTransport::from_uri(server.url().as_str());
+        let client = ().serve(transport).await;
+
+        assert!(client.is_ok(), "Should be able to connect to mock server");
+
+        if let Ok(client) = client {
+            // Test listing tools
+            let tools = client.peer().list_all_tools().await;
+            assert!(tools.is_ok(), "Should be able to list tools");
+
+            if let Ok(tools) = tools {
+                assert_eq!(tools.len(), 2, "Should have 2 tools");
+                assert!(tools.iter().any(|t| t.name == "brave_web_search"));
+                assert!(tools.iter().any(|t| t.name == "brave_local_search"));
+            }
+
+            // Shutdown by dropping the client
+            drop(client);
+        }
+
+        server.stop().await;
+    }
+}
diff --git a/sgl-router/tests/common/mock_openai_server.rs b/sgl-router/tests/common/mock_openai_server.rs
new file mode 100644
index 000000000..643fd5e98
--- /dev/null
+++ b/sgl-router/tests/common/mock_openai_server.rs
@@ -0,0 +1,238 @@
+//! Mock servers for testing
+
+#![allow(dead_code)]
+
+use axum::{
+    body::Body,
+    extract::{Request, State},
+    http::{HeaderValue, StatusCode},
+    response::sse::{Event, KeepAlive},
+    response::{IntoResponse, Response, Sse},
+    routing::post,
+    Json, Router,
+};
+use futures_util::stream::{self, StreamExt};
+use serde_json::json;
+use std::net::SocketAddr;
+use std::sync::Arc;
+use tokio::net::TcpListener;
+
+/// Mock OpenAI API server for testing
+pub struct MockOpenAIServer {
+    addr: SocketAddr,
+    _handle: tokio::task::JoinHandle<()>,
+}
+
+#[derive(Clone)]
+struct MockServerState {
+    require_auth: bool,
+    expected_auth: Option<String>,
+}
+
+impl MockOpenAIServer {
+    /// Create and start a new mock OpenAI server
+    pub async fn new() -> Self {
+        Self::new_with_auth(None).await
+    }
+
+    /// Create and start a new mock OpenAI server with optional auth requirement
+    pub async fn new_with_auth(expected_auth: Option<String>) -> Self {
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let addr = listener.local_addr().unwrap();
+
+        let state = Arc::new(MockServerState {
+            require_auth: expected_auth.is_some(),
+            expected_auth,
+        });
+
+        let app = Router::new()
+            .route("/v1/chat/completions", post(mock_chat_completions))
+            .route("/v1/completions", post(mock_completions))
+            .route("/v1/models", post(mock_models).get(mock_models))
+            .with_state(state);
+
+        let handle = tokio::spawn(async move {
+            axum::serve(listener, app).await.unwrap();
+        });
+
+        // Give the server a moment to start
+        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
+
+        Self {
+            addr,
+            _handle: handle,
+        }
+    }
+
+    /// Get the base URL for this mock server
+    pub fn base_url(&self) -> String {
+        format!("http://{}", self.addr)
+    }
+}
+
+/// Mock chat completions endpoint
+async fn mock_chat_completions(req: Request<Body>) -> Response {
+    let (_, body) = req.into_parts();
+    let body_bytes = match axum::body::to_bytes(body, usize::MAX).await {
+        Ok(bytes) => bytes,
+        Err(_) => return StatusCode::BAD_REQUEST.into_response(),
+    };
+
+    let request: serde_json::Value = match serde_json::from_slice(&body_bytes) {
+        Ok(req) => req,
+        Err(_) => return StatusCode::BAD_REQUEST.into_response(),
+    };
+
+    // Extract model from request or use default (owned String to satisfy 'static in stream)
+    let model: String = request
+        .get("model")
+        .and_then(|v| v.as_str())
+        .unwrap_or("gpt-3.5-turbo")
+        .to_string();
+
+    // If stream requested, return SSE
+    let is_stream = request
+        .get("stream")
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false);
+
+    if is_stream {
+        let created = 1677652288u64;
+        // Single chunk then [DONE]
+        let model_chunk = model.clone();
+        let event_stream = stream::once(async move {
+            let chunk = json!({
+                "id": "chatcmpl-123456789",
+                "object": "chat.completion.chunk",
+                "created": created,
+                "model": model_chunk,
+                "choices": [{
+                    "index": 0,
+                    "delta": {
+                        "content": "Hello!"
+                    },
+                    "finish_reason": null
+                }]
+            });
+            Ok::<_, std::convert::Infallible>(Event::default().data(chunk.to_string()))
+        })
+        .chain(stream::once(async { Ok(Event::default().data("[DONE]")) }));
+
+        Sse::new(event_stream)
+            .keep_alive(KeepAlive::default())
+            .into_response()
+    } else {
+        // Create a mock non-streaming response
+        let response = json!({
+            "id": "chatcmpl-123456789",
+            "object": "chat.completion",
+            "created": 1677652288,
+            "model": model,
+            "choices": [{
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": "Hello! I'm a mock OpenAI assistant. How can I help you today?"
+                },
+                "finish_reason": "stop"
+            }],
+            "usage": {
+                "prompt_tokens": 9,
+                "completion_tokens": 12,
+                "total_tokens": 21
+            }
+        });
+
+        Json(response).into_response()
+    }
+}
+
+/// Mock completions endpoint (legacy)
+async fn mock_completions(req: Request<Body>) -> Response {
+    let (_, body) = req.into_parts();
+    let body_bytes = match axum::body::to_bytes(body, usize::MAX).await {
+        Ok(bytes) => bytes,
+        Err(_) => return StatusCode::BAD_REQUEST.into_response(),
+    };
+
+    let request: serde_json::Value = match serde_json::from_slice(&body_bytes) {
+        Ok(req) => req,
+        Err(_) => return StatusCode::BAD_REQUEST.into_response(),
+    };
+
+    let model = request["model"].as_str().unwrap_or("text-davinci-003");
+
+    let response = json!({
+        "id": "cmpl-123456789",
+        "object": "text_completion",
+        "created": 1677652288,
+        "model": model,
+        "choices": [{
+            "text": " This is a mock completion response.",
+            "index": 0,
+            "logprobs": null,
+            "finish_reason": "stop"
+        }],
+        "usage": {
+            "prompt_tokens": 5,
+            "completion_tokens": 7,
+            "total_tokens": 12
+        }
+    });
+
+    Json(response).into_response()
+}
+
+/// Mock models endpoint
+async fn mock_models(State(state): State<Arc<MockServerState>>, req: Request<Body>) -> Response {
+    // Optionally enforce Authorization header
+    if state.require_auth {
+        let auth = req
+            .headers()
+            .get("authorization")
+            .or_else(|| req.headers().get("Authorization"))
+            .and_then(|v| v.to_str().ok())
+            .map(|s| s.to_string());
+        let auth_ok = match (&state.expected_auth, auth) {
+            (Some(expected), Some(got)) => &got == expected,
+            (None, Some(_)) => true,
+            _ => false,
+        };
+        if !auth_ok {
+            let mut response = Response::new(Body::from(
+                json!({
+                    "error": {
+                        "message": "Unauthorized",
+                        "type": "invalid_request_error"
+                    }
+                })
+                .to_string(),
+            ));
+            *response.status_mut() = StatusCode::UNAUTHORIZED;
+            response
+                .headers_mut()
+                .insert("WWW-Authenticate", HeaderValue::from_static("Bearer"));
+            return response;
+        }
+    }
+
+    let response = json!({
+        "object": "list",
+        "data": [
+            {
+                "id": "gpt-4",
+                "object": "model",
+                "created": 1677610602,
+                "owned_by": "openai"
+            },
+            {
+                "id": "gpt-3.5-turbo",
+                "object": "model",
+                "created": 1677610602,
+                "owned_by": "openai"
+            }
+        ]
+    });
+
+    Json(response).into_response()
+}
diff --git a/sgl-router/tests/common/mock_worker.rs b/sgl-router/tests/common/mock_worker.rs
new file mode 100755
index 000000000..16d721607
--- /dev/null
+++ b/sgl-router/tests/common/mock_worker.rs
@@ -0,0 +1,614 @@
+// Mock worker for testing - these functions are used by integration tests
+#![allow(dead_code)]
+
+use axum::{
+    extract::{Json, State},
+    http::StatusCode,
+    response::sse::{Event, KeepAlive},
+    response::{IntoResponse, Response, Sse},
+    routing::{get, post},
+    Router,
+};
+use futures_util::stream::{self, StreamExt};
+use serde_json::json;
+use std::convert::Infallible;
+use std::sync::Arc;
+use std::time::{SystemTime, UNIX_EPOCH};
+use tokio::sync::RwLock;
+use uuid::Uuid;
+
+/// Configuration for mock worker behavior
+#[derive(Clone)]
+pub struct MockWorkerConfig {
+    pub port: u16,
+    pub worker_type: WorkerType,
+    pub health_status: HealthStatus,
+    pub response_delay_ms: u64,
+    pub fail_rate: f32,
+}
+
+#[derive(Clone, Debug)]
+pub enum WorkerType {
+    Regular,
+    Prefill,
+    Decode,
+}
+
+#[derive(Clone, Debug)]
+pub enum HealthStatus {
+    Healthy,
+    Unhealthy,
+    Degraded,
+}
+
+/// Mock worker server for testing
+pub struct MockWorker {
+    config: Arc<RwLock<MockWorkerConfig>>,
+    shutdown_handle: Option<tokio::task::JoinHandle<()>>,
+    shutdown_tx: Option<tokio::sync::oneshot::Sender<()>>,
+}
+
+impl MockWorker {
+    pub fn new(config: MockWorkerConfig) -> Self {
+        Self {
+            config: Arc::new(RwLock::new(config)),
+            shutdown_handle: None,
+            shutdown_tx: None,
+        }
+    }
+
+    /// Start the mock worker server
+    pub async fn start(&mut self) -> Result<String, Box<dyn std::error::Error>> {
+        let config = self.config.clone();
+        let port = config.read().await.port;
+
+        // If port is 0, find an available port
+        let port = if port == 0 {
+            let listener = std::net::TcpListener::bind("127.0.0.1:0")?;
+            let port = listener.local_addr()?.port();
+            drop(listener);
+            config.write().await.port = port;
+            port
+        } else {
+            port
+        };
+
+        let app = Router::new()
+            .route("/health", get(health_handler))
+            .route("/health_generate", get(health_generate_handler))
+            .route("/get_server_info", get(server_info_handler))
+            .route("/get_model_info", get(model_info_handler))
+            .route("/generate", post(generate_handler))
+            .route("/v1/chat/completions", post(chat_completions_handler))
+            .route("/v1/completions", post(completions_handler))
+            .route("/flush_cache", post(flush_cache_handler))
+            .route("/v1/models", get(v1_models_handler))
+            .with_state(config);
+
+        let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
+        self.shutdown_tx = Some(shutdown_tx);
+
+        // Spawn the server in a separate task
+        let handle = tokio::spawn(async move {
+            let listener = match tokio::net::TcpListener::bind(("127.0.0.1", port)).await {
+                Ok(l) => l,
+                Err(e) => {
+                    eprintln!("Failed to bind to port {}: {}", port, e);
+                    return;
+                }
+            };
+
+            let server = axum::serve(listener, app).with_graceful_shutdown(async move {
+                let _ = shutdown_rx.await;
+            });
+
+            if let Err(e) = server.await {
+                eprintln!("Server error: {}", e);
+            }
+        });
+
+        self.shutdown_handle = Some(handle);
+
+        // Wait for the server to start
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+        let url = format!("http://127.0.0.1:{}", port);
+        Ok(url)
+    }
+
+    /// Stop the mock worker server
+    pub async fn stop(&mut self) {
+        if let Some(shutdown_tx) = self.shutdown_tx.take() {
+            let _ = shutdown_tx.send(());
+        }
+
+        if let Some(handle) = self.shutdown_handle.take() {
+            // Wait for the server to shut down
+            let _ = tokio::time::timeout(tokio::time::Duration::from_secs(5), handle).await;
+        }
+    }
+}
+
+impl Drop for MockWorker {
+    fn drop(&mut self) {
+        // Clean shutdown when dropped
+        if let Some(shutdown_tx) = self.shutdown_tx.take() {
+            let _ = shutdown_tx.send(());
+        }
+    }
+}
+
+// Handler implementations
+
+/// Check if request should fail based on configured fail_rate
+async fn should_fail(config: &MockWorkerConfig) -> bool {
+    rand::random::<f32>() < config.fail_rate
+}
+
+async fn health_handler(State(config): State<Arc<RwLock<MockWorkerConfig>>>) -> Response {
+    let config = config.read().await;
+
+    match config.health_status {
+        HealthStatus::Healthy => Json(json!({
+            "status": "healthy",
+            "timestamp": SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs(),
+            "worker_type": format!("{:?}", config.worker_type),
+        }))
+        .into_response(),
+        HealthStatus::Unhealthy => (
+            StatusCode::SERVICE_UNAVAILABLE,
+            Json(json!({
+                "status": "unhealthy",
+                "error": "Worker is not responding"
+            })),
+        )
+            .into_response(),
+        HealthStatus::Degraded => Json(json!({
+            "status": "degraded",
+            "warning": "High load detected"
+        }))
+        .into_response(),
+    }
+}
+
+async fn health_generate_handler(State(config): State<Arc<RwLock<MockWorkerConfig>>>) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": "Random failure for testing"
+            })),
+        )
+            .into_response();
+    }
+
+    if matches!(config.health_status, HealthStatus::Healthy) {
+        Json(json!({
+            "status": "ok",
+            "queue_length": 0,
+            "processing_time_ms": config.response_delay_ms
+        }))
+        .into_response()
+    } else {
+        (
+            StatusCode::SERVICE_UNAVAILABLE,
+            Json(json!({
+                "error": "Generation service unavailable"
+            })),
+        )
+            .into_response()
+    }
+}
+
+async fn server_info_handler(State(config): State<Arc<RwLock<MockWorkerConfig>>>) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": "Random failure for testing"
+            })),
+        )
+            .into_response();
+    }
+
+    Json(json!({
+        "model_path": "mock-model-path",
+        "tokenizer_path": "mock-tokenizer-path",
+        "port": config.port,
+        "host": "127.0.0.1",
+        "max_num_batched_tokens": 32768,
+        "max_prefill_tokens": 16384,
+        "mem_fraction_static": 0.88,
+        "tp_size": 1,
+        "dp_size": 1,
+        "stream_interval": 8,
+        "dtype": "float16",
+        "device": "cuda",
+        "enable_flashinfer": true,
+        "enable_p2p_check": true,
+        "context_length": 32768,
+        "chat_template": null,
+        "disable_radix_cache": false,
+        "enable_torch_compile": false,
+        "trust_remote_code": false,
+        "show_time_cost": false,
+        "waiting_queue_size": 0,
+        "running_queue_size": 0,
+        "req_to_token_ratio": 1.2,
+        "min_running_requests": 0,
+        "max_running_requests": 2048,
+        "max_req_num": 8192,
+        "max_batch_tokens": 32768,
+        "schedule_policy": "lpm",
+        "schedule_conservativeness": 1.0,
+        "version": "0.3.0",
+        "internal_states": [{
+            "waiting_queue_size": 0,
+            "running_queue_size": 0
+        }]
+    }))
+    .into_response()
+}
+
+async fn model_info_handler(State(config): State<Arc<RwLock<MockWorkerConfig>>>) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": "Random failure for testing"
+            })),
+        )
+            .into_response();
+    }
+
+    Json(json!({
+        "model_path": "mock-model-path",
+        "tokenizer_path": "mock-tokenizer-path",
+        "is_generation": true,
+        "preferred_sampling_params": {
+            "temperature": 0.7,
+            "top_p": 0.9,
+            "top_k": 40,
+            "max_tokens": 2048
+        }
+    }))
+    .into_response()
+}
+
+async fn generate_handler(
+    State(config): State<Arc<RwLock<MockWorkerConfig>>>,
+    Json(payload): Json<serde_json::Value>,
+) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": "Random failure for testing"
+            })),
+        )
+            .into_response();
+    }
+
+    if config.response_delay_ms > 0 {
+        tokio::time::sleep(tokio::time::Duration::from_millis(config.response_delay_ms)).await;
+    }
+
+    let is_stream = payload
+        .get("stream")
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false);
+
+    if is_stream {
+        let stream_delay = config.response_delay_ms;
+
+        // Check if it's a batch request
+        let is_batch = payload.get("text").and_then(|t| t.as_array()).is_some();
+
+        let batch_size = if is_batch {
+            payload
+                .get("text")
+                .and_then(|t| t.as_array())
+                .map(|arr| arr.len())
+                .unwrap_or(1)
+        } else {
+            1
+        };
+
+        let mut events = Vec::new();
+
+        // Generate events for each item in batch
+        for i in 0..batch_size {
+            let timestamp_start = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_secs_f64();
+
+            let data = json!({
+                "text": format!("Mock response {}", i + 1),
+                "meta_info": {
+                    "prompt_tokens": 10,
+                    "completion_tokens": 5,
+                    "completion_tokens_wo_jump_forward": 5,
+                    "input_token_logprobs": null,
+                    "output_token_logprobs": null,
+                    "first_token_latency": stream_delay as f64 / 1000.0,
+                    "time_to_first_token": stream_delay as f64 / 1000.0,
+                    "time_per_output_token": 0.01,
+                    "end_time": timestamp_start + (stream_delay as f64 / 1000.0),
+                    "start_time": timestamp_start,
+                    "finish_reason": {
+                        "type": "stop",
+                        "reason": "length"
+                    }
+                },
+                "stage": "mid"
+            });
+
+            events.push(Ok::<_, Infallible>(Event::default().data(data.to_string())));
+        }
+
+        // Add [DONE] event
+        events.push(Ok(Event::default().data("[DONE]")));
+
+        let stream = stream::iter(events);
+
+        Sse::new(stream)
+            .keep_alive(KeepAlive::default())
+            .into_response()
+    } else {
+        Json(json!({
+            "text": "This is a mock response.",
+            "meta_info": {
+                "prompt_tokens": 10,
+                "completion_tokens": 5,
+                "completion_tokens_wo_jump_forward": 5,
+                "input_token_logprobs": null,
+                "output_token_logprobs": null,
+                "first_token_latency": config.response_delay_ms as f64 / 1000.0,
+                "time_to_first_token": config.response_delay_ms as f64 / 1000.0,
+                "time_per_output_token": 0.01,
+                "finish_reason": {
+                    "type": "stop",
+                    "reason": "length"
+                }
+            }
+        }))
+        .into_response()
+    }
+}
+
+async fn chat_completions_handler(
+    State(config): State<Arc<RwLock<MockWorkerConfig>>>,
+    Json(payload): Json<serde_json::Value>,
+) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": {
+                    "message": "Random failure for testing",
+                    "type": "internal_error",
+                    "code": "internal_error"
+                }
+            })),
+        )
+            .into_response();
+    }
+
+    if config.response_delay_ms > 0 {
+        tokio::time::sleep(tokio::time::Duration::from_millis(config.response_delay_ms)).await;
+    }
+
+    let is_stream = payload
+        .get("stream")
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false);
+
+    let timestamp = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .unwrap()
+        .as_secs();
+
+    if is_stream {
+        let request_id = format!("chatcmpl-{}", Uuid::new_v4());
+
+        let stream = stream::once(async move {
+            let chunk = json!({
+                "id": request_id,
+                "object": "chat.completion.chunk",
+                "created": timestamp,
+                "model": "mock-model",
+                "choices": [{
+                    "index": 0,
+                    "delta": {
+                        "content": "This is a mock chat response."
+                    },
+                    "finish_reason": null
+                }]
+            });
+
+            Ok::<_, Infallible>(Event::default().data(chunk.to_string()))
+        })
+        .chain(stream::once(async { Ok(Event::default().data("[DONE]")) }));
+
+        Sse::new(stream)
+            .keep_alive(KeepAlive::default())
+            .into_response()
+    } else {
+        Json(json!({
+            "id": format!("chatcmpl-{}", Uuid::new_v4()),
+            "object": "chat.completion",
+            "created": timestamp,
+            "model": "mock-model",
+            "choices": [{
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": "This is a mock chat response."
+                },
+                "finish_reason": "stop"
+            }],
+            "usage": {
+                "prompt_tokens": 10,
+                "completion_tokens": 5,
+                "total_tokens": 15
+            }
+        }))
+        .into_response()
+    }
+}
+
+async fn completions_handler(
+    State(config): State<Arc<RwLock<MockWorkerConfig>>>,
+    Json(payload): Json<serde_json::Value>,
+) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": {
+                    "message": "Random failure for testing",
+                    "type": "internal_error",
+                    "code": "internal_error"
+                }
+            })),
+        )
+            .into_response();
+    }
+
+    if config.response_delay_ms > 0 {
+        tokio::time::sleep(tokio::time::Duration::from_millis(config.response_delay_ms)).await;
+    }
+
+    let is_stream = payload
+        .get("stream")
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false);
+
+    let timestamp = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .unwrap()
+        .as_secs();
+
+    if is_stream {
+        let request_id = format!("cmpl-{}", Uuid::new_v4());
+
+        let stream = stream::once(async move {
+            let chunk = json!({
+                "id": request_id,
+                "object": "text_completion",
+                "created": timestamp,
+                "model": "mock-model",
+                "choices": [{
+                    "text": "This is a mock completion.",
+                    "index": 0,
+                    "logprobs": null,
+                    "finish_reason": null
+                }]
+            });
+
+            Ok::<_, Infallible>(Event::default().data(chunk.to_string()))
+        })
+        .chain(stream::once(async { Ok(Event::default().data("[DONE]")) }));
+
+        Sse::new(stream)
+            .keep_alive(KeepAlive::default())
+            .into_response()
+    } else {
+        Json(json!({
+            "id": format!("cmpl-{}", Uuid::new_v4()),
+            "object": "text_completion",
+            "created": timestamp,
+            "model": "mock-model",
+            "choices": [{
+                "text": "This is a mock completion.",
+                "index": 0,
+                "logprobs": null,
+                "finish_reason": "stop"
+            }],
+            "usage": {
+                "prompt_tokens": 10,
+                "completion_tokens": 5,
+                "total_tokens": 15
+            }
+        }))
+        .into_response()
+    }
+}
+
+async fn flush_cache_handler(State(config): State<Arc<RwLock<MockWorkerConfig>>>) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": "Random failure for testing"
+            })),
+        )
+            .into_response();
+    }
+
+    Json(json!({
+        "message": "Cache flushed successfully"
+    }))
+    .into_response()
+}
+
+async fn v1_models_handler(State(config): State<Arc<RwLock<MockWorkerConfig>>>) -> Response {
+    let config = config.read().await;
+
+    if should_fail(&config).await {
+        return (
+            StatusCode::INTERNAL_SERVER_ERROR,
+            Json(json!({
+                "error": {
+                    "message": "Random failure for testing",
+                    "type": "internal_error",
+                    "code": "internal_error"
+                }
+            })),
+        )
+            .into_response();
+    }
+
+    let timestamp = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .unwrap()
+        .as_secs();
+
+    Json(json!({
+        "object": "list",
+        "data": [{
+            "id": "mock-model",
+            "object": "model",
+            "created": timestamp,
+            "owned_by": "organization-owner"
+        }]
+    }))
+    .into_response()
+}
+
+impl Default for MockWorkerConfig {
+    fn default() -> Self {
+        Self {
+            port: 0,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }
+    }
+}
diff --git a/sgl-router/tests/common/mod.rs b/sgl-router/tests/common/mod.rs
new file mode 100644
index 000000000..0170cd765
--- /dev/null
+++ b/sgl-router/tests/common/mod.rs
@@ -0,0 +1,103 @@
+// These modules are used by tests and benchmarks
+#![allow(dead_code)]
+
+pub mod mock_mcp_server;
+pub mod mock_openai_server;
+pub mod mock_worker;
+pub mod test_app;
+
+use sglang_router_rs::config::RouterConfig;
+use sglang_router_rs::server::AppContext;
+use std::fs;
+use std::path::PathBuf;
+use std::sync::{Arc, Mutex, OnceLock};
+
+/// Helper function to create AppContext for tests
+pub fn create_test_context(config: RouterConfig) -> Arc<AppContext> {
+    Arc::new(
+        AppContext::new(
+            config.clone(),
+            reqwest::Client::new(),
+            config.max_concurrent_requests,
+            config.rate_limit_tokens_per_second,
+        )
+        .expect("Failed to create AppContext in test"),
+    )
+}
+
+// Tokenizer download configuration
+const TINYLLAMA_TOKENIZER_URL: &str =
+    "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0/resolve/main/tokenizer.json";
+const CACHE_DIR: &str = ".tokenizer_cache";
+const TINYLLAMA_TOKENIZER_FILENAME: &str = "tinyllama_tokenizer.json";
+
+// Global mutex to prevent concurrent downloads
+static DOWNLOAD_MUTEX: OnceLock<Mutex<()>> = OnceLock::new();
+
+/// Downloads the TinyLlama tokenizer from HuggingFace if not already cached.
+/// Returns the path to the cached tokenizer file.
+///
+/// This function is thread-safe and will only download the tokenizer once
+/// even if called from multiple threads concurrently.
+pub fn ensure_tokenizer_cached() -> PathBuf {
+    // Get or initialize the mutex
+    let mutex = DOWNLOAD_MUTEX.get_or_init(|| Mutex::new(()));
+
+    // Lock to ensure only one thread downloads at a time
+    let _guard = mutex.lock().unwrap();
+
+    let cache_dir = PathBuf::from(CACHE_DIR);
+    let tokenizer_path = cache_dir.join(TINYLLAMA_TOKENIZER_FILENAME);
+
+    // Create cache directory if it doesn't exist
+    if !cache_dir.exists() {
+        fs::create_dir_all(&cache_dir).expect("Failed to create cache directory");
+    }
+
+    // Download tokenizer if not already cached
+    if !tokenizer_path.exists() {
+        println!("Downloading TinyLlama tokenizer from HuggingFace...");
+
+        // Use blocking reqwest client since we're in tests/benchmarks
+        let client = reqwest::blocking::Client::new();
+        let response = client
+            .get(TINYLLAMA_TOKENIZER_URL)
+            .send()
+            .expect("Failed to download tokenizer");
+
+        if !response.status().is_success() {
+            panic!("Failed to download tokenizer: HTTP {}", response.status());
+        }
+
+        let content = response.bytes().expect("Failed to read tokenizer content");
+
+        // Verify we got actual JSON content
+        if content.len() < 100 {
+            panic!("Downloaded content too small: {} bytes", content.len());
+        }
+
+        fs::write(&tokenizer_path, content).expect("Failed to write tokenizer to cache");
+        println!(
+            "Tokenizer downloaded and cached successfully ({} bytes)",
+            tokenizer_path.metadata().unwrap().len()
+        );
+    }
+
+    tokenizer_path
+}
+
+/// Common test prompts for consistency across tests
+pub const TEST_PROMPTS: [&str; 4] = [
+    "deep learning is",
+    "Deep learning is",
+    "has anyone seen nemo lately",
+    "another prompt",
+];
+
+/// Pre-computed hashes for verification
+pub const EXPECTED_HASHES: [u64; 4] = [
+    1209591529327510910,
+    4181375434596349981,
+    6245658446118930933,
+    5097285695902185237,
+];
diff --git a/sgl-router/tests/common/test_app.rs b/sgl-router/tests/common/test_app.rs
new file mode 100644
index 000000000..83d7d456a
--- /dev/null
+++ b/sgl-router/tests/common/test_app.rs
@@ -0,0 +1,52 @@
+use axum::Router;
+use reqwest::Client;
+use sglang_router_rs::{
+    config::RouterConfig,
+    routers::RouterTrait,
+    server::{build_app, AppContext, AppState},
+};
+use std::sync::Arc;
+
+/// Create a test Axum application using the actual server's build_app function
+#[allow(dead_code)]
+pub fn create_test_app(
+    router: Arc<dyn RouterTrait>,
+    client: Client,
+    router_config: &RouterConfig,
+) -> Router {
+    // Create AppContext
+    let app_context = Arc::new(
+        AppContext::new(
+            router_config.clone(),
+            client,
+            router_config.max_concurrent_requests,
+            router_config.rate_limit_tokens_per_second,
+        )
+        .expect("Failed to create AppContext in test"),
+    );
+
+    // Create AppState with the test router and context
+    let app_state = Arc::new(AppState {
+        router,
+        context: app_context,
+        concurrency_queue_tx: None, // No queue for tests
+    });
+
+    // Configure request ID headers (use defaults if not specified)
+    let request_id_headers = router_config.request_id_headers.clone().unwrap_or_else(|| {
+        vec![
+            "x-request-id".to_string(),
+            "x-correlation-id".to_string(),
+            "x-trace-id".to_string(),
+            "request-id".to_string(),
+        ]
+    });
+
+    // Use the actual server's build_app function
+    build_app(
+        app_state,
+        router_config.max_payload_size,
+        request_id_headers,
+        router_config.cors_allowed_origins.clone(),
+    )
+}
diff --git a/sgl-router/tests/mcp_test.rs b/sgl-router/tests/mcp_test.rs
new file mode 100644
index 000000000..9821bffa6
--- /dev/null
+++ b/sgl-router/tests/mcp_test.rs
@@ -0,0 +1,457 @@
+// This test suite validates the complete MCP implementation against the
+// functionality required for SGLang responses API integration.
+//
+// Test Coverage:
+// - Core MCP server functionality
+// - Tool session management (individual and multi-tool)
+// - Tool execution and error handling
+// - Schema adaptation and validation
+// - Mock server integration for reliable testing
+
+mod common;
+
+use common::mock_mcp_server::MockMCPServer;
+use serde_json::json;
+use sglang_router_rs::mcp::{McpClientManager, McpConfig, McpError, McpServerConfig, McpTransport};
+use std::collections::HashMap;
+
+/// Create a new mock server for testing (each test gets its own)
+async fn create_mock_server() -> MockMCPServer {
+    MockMCPServer::start()
+        .await
+        .expect("Failed to start mock MCP server")
+}
+
+// Core MCP Server Tests
+
+#[tokio::test]
+async fn test_mcp_server_initialization() {
+    // Test that we can create an empty configuration
+    let config = McpConfig { servers: vec![] };
+
+    // Should fail with no servers
+    let result = McpClientManager::new(config).await;
+    assert!(result.is_err(), "Should fail with no servers configured");
+}
+
+#[tokio::test]
+async fn test_server_connection_with_mock() {
+    let mock_server = create_mock_server().await;
+
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "mock_server".to_string(),
+            transport: McpTransport::Streamable {
+                url: mock_server.url(),
+                token: None,
+            },
+        }],
+    };
+
+    let result = McpClientManager::new(config).await;
+    assert!(result.is_ok(), "Should connect to mock server");
+
+    let mut manager = result.unwrap();
+
+    let servers = manager.list_servers();
+    assert_eq!(servers.len(), 1);
+    assert!(servers.contains(&"mock_server".to_string()));
+
+    let tools = manager.list_tools();
+    assert_eq!(tools.len(), 2, "Should have 2 tools from mock server");
+
+    assert!(manager.has_tool("brave_web_search"));
+    assert!(manager.has_tool("brave_local_search"));
+
+    manager.shutdown().await;
+}
+
+#[tokio::test]
+async fn test_tool_availability_checking() {
+    let mock_server = create_mock_server().await;
+
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "mock_server".to_string(),
+            transport: McpTransport::Streamable {
+                url: mock_server.url(),
+                token: None,
+            },
+        }],
+    };
+
+    let mut manager = McpClientManager::new(config).await.unwrap();
+
+    let test_tools = vec!["brave_web_search", "brave_local_search", "calculator"];
+    for tool in test_tools {
+        let available = manager.has_tool(tool);
+        match tool {
+            "brave_web_search" | "brave_local_search" => {
+                assert!(
+                    available,
+                    "Tool {} should be available from mock server",
+                    tool
+                );
+            }
+            "calculator" => {
+                assert!(
+                    !available,
+                    "Tool {} should not be available from mock server",
+                    tool
+                );
+            }
+            _ => {}
+        }
+    }
+
+    manager.shutdown().await;
+}
+
+#[tokio::test]
+async fn test_multi_server_connection() {
+    let mock_server1 = create_mock_server().await;
+    let mock_server2 = create_mock_server().await;
+
+    let config = McpConfig {
+        servers: vec![
+            McpServerConfig {
+                name: "mock_server_1".to_string(),
+                transport: McpTransport::Streamable {
+                    url: mock_server1.url(),
+                    token: None,
+                },
+            },
+            McpServerConfig {
+                name: "mock_server_2".to_string(),
+                transport: McpTransport::Streamable {
+                    url: mock_server2.url(),
+                    token: None,
+                },
+            },
+        ],
+    };
+
+    // Note: This will fail to connect to both servers in the current implementation
+    // since they return the same tools. The manager will connect to the first one.
+    let result = McpClientManager::new(config).await;
+
+    if let Ok(mut manager) = result {
+        let servers = manager.list_servers();
+        assert!(!servers.is_empty(), "Should have at least one server");
+
+        let tools = manager.list_tools();
+        assert!(tools.len() >= 2, "Should have tools from servers");
+
+        manager.shutdown().await;
+    }
+}
+
+#[tokio::test]
+async fn test_tool_execution_with_mock() {
+    let mock_server = create_mock_server().await;
+
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "mock_server".to_string(),
+            transport: McpTransport::Streamable {
+                url: mock_server.url(),
+                token: None,
+            },
+        }],
+    };
+
+    let mut manager = McpClientManager::new(config).await.unwrap();
+
+    let result = manager
+        .call_tool(
+            "brave_web_search",
+            Some(
+                json!({
+                    "query": "rust programming",
+                    "count": 1
+                })
+                .as_object()
+                .unwrap()
+                .clone(),
+            ),
+        )
+        .await;
+
+    assert!(
+        result.is_ok(),
+        "Tool execution should succeed with mock server"
+    );
+
+    let response = result.unwrap();
+    assert!(!response.content.is_empty(), "Should have content");
+
+    // Check the content
+    if let rmcp::model::RawContent::Text(text) = &response.content[0].raw {
+        assert!(text
+            .text
+            .contains("Mock search results for: rust programming"));
+    } else {
+        panic!("Expected text content");
+    }
+
+    manager.shutdown().await;
+}
+
+#[tokio::test]
+async fn test_concurrent_tool_execution() {
+    let mock_server = create_mock_server().await;
+
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "mock_server".to_string(),
+            transport: McpTransport::Streamable {
+                url: mock_server.url(),
+                token: None,
+            },
+        }],
+    };
+
+    let mut manager = McpClientManager::new(config).await.unwrap();
+
+    // Execute tools sequentially (true concurrent execution would require Arc<Mutex>)
+    let tool_calls = vec![
+        ("brave_web_search", json!({"query": "test1"})),
+        ("brave_local_search", json!({"query": "test2"})),
+    ];
+
+    for (tool_name, args) in tool_calls {
+        let result = manager
+            .call_tool(tool_name, Some(args.as_object().unwrap().clone()))
+            .await;
+
+        assert!(result.is_ok(), "Tool {} should succeed", tool_name);
+        let response = result.unwrap();
+        assert!(!response.content.is_empty(), "Should have content");
+    }
+
+    manager.shutdown().await;
+}
+
+// Error Handling Tests
+
+#[tokio::test]
+async fn test_tool_execution_errors() {
+    let mock_server = create_mock_server().await;
+
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "mock_server".to_string(),
+            transport: McpTransport::Streamable {
+                url: mock_server.url(),
+                token: None,
+            },
+        }],
+    };
+
+    let mut manager = McpClientManager::new(config).await.unwrap();
+
+    // Try to call unknown tool
+    let result = manager
+        .call_tool("unknown_tool", Some(serde_json::Map::new()))
+        .await;
+    assert!(result.is_err(), "Should fail for unknown tool");
+
+    match result.unwrap_err() {
+        McpError::ToolNotFound(name) => {
+            assert_eq!(name, "unknown_tool");
+        }
+        _ => panic!("Expected ToolNotFound error"),
+    }
+
+    manager.shutdown().await;
+}
+
+#[tokio::test]
+async fn test_connection_without_server() {
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "nonexistent".to_string(),
+            transport: McpTransport::Streamable {
+                url: "http://localhost:9999/mcp".to_string(),
+                token: None,
+            },
+        }],
+    };
+
+    let result = McpClientManager::new(config).await;
+    assert!(result.is_err(), "Should fail when no server is running");
+
+    if let Err(e) = result {
+        let error_msg = e.to_string();
+        assert!(
+            error_msg.contains("Failed to connect") || error_msg.contains("Connection"),
+            "Error should be connection-related: {}",
+            error_msg
+        );
+    }
+}
+
+// Schema Validation Tests
+
+#[tokio::test]
+async fn test_tool_info_structure() {
+    let mock_server = create_mock_server().await;
+
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "mock_server".to_string(),
+            transport: McpTransport::Streamable {
+                url: mock_server.url(),
+                token: None,
+            },
+        }],
+    };
+
+    let manager = McpClientManager::new(config).await.unwrap();
+
+    let tools = manager.list_tools();
+    let brave_search = tools
+        .iter()
+        .find(|t| t.name == "brave_web_search")
+        .expect("Should have brave_web_search tool");
+
+    assert_eq!(brave_search.name, "brave_web_search");
+    assert!(brave_search.description.contains("Mock web search"));
+    assert_eq!(brave_search.server, "mock_server");
+    assert!(brave_search.parameters.is_some());
+}
+
+// SSE Parsing Tests (simplified since we don't expose parse_sse_event)
+
+#[tokio::test]
+async fn test_sse_connection() {
+    let mock_server = create_mock_server().await;
+
+    // Test SSE transport configuration
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "sse_server".to_string(),
+            transport: McpTransport::Sse {
+                // Mock server doesn't support SSE, but we can test the config
+                url: format!("http://127.0.0.1:{}/sse", mock_server.port),
+                token: Some("test_token".to_string()),
+            },
+        }],
+    };
+
+    // This will fail to connect but tests the configuration
+    let result = McpClientManager::new(config).await;
+    assert!(result.is_err(), "Mock server doesn't support SSE");
+}
+
+// Connection Type Tests
+
+#[tokio::test]
+async fn test_transport_types() {
+    // Test different transport configurations
+
+    // HTTP/Streamable transport
+    let http_config = McpServerConfig {
+        name: "http_server".to_string(),
+        transport: McpTransport::Streamable {
+            url: "http://localhost:8080/mcp".to_string(),
+            token: Some("auth_token".to_string()),
+        },
+    };
+    assert_eq!(http_config.name, "http_server");
+
+    // SSE transport
+    let sse_config = McpServerConfig {
+        name: "sse_server".to_string(),
+        transport: McpTransport::Sse {
+            url: "http://localhost:8081/sse".to_string(),
+            token: None,
+        },
+    };
+    assert_eq!(sse_config.name, "sse_server");
+
+    // STDIO transport
+    let stdio_config = McpServerConfig {
+        name: "stdio_server".to_string(),
+        transport: McpTransport::Stdio {
+            command: "mcp-server".to_string(),
+            args: vec!["--port".to_string(), "8082".to_string()],
+            envs: HashMap::new(),
+        },
+    };
+    assert_eq!(stdio_config.name, "stdio_server");
+}
+
+// Integration Pattern Tests
+
+#[tokio::test]
+async fn test_complete_workflow() {
+    let mock_server = create_mock_server().await;
+
+    // 1. Initialize configuration
+    let config = McpConfig {
+        servers: vec![McpServerConfig {
+            name: "integration_test".to_string(),
+            transport: McpTransport::Streamable {
+                url: mock_server.url(),
+                token: None,
+            },
+        }],
+    };
+
+    // 2. Connect to server
+    let mut manager = McpClientManager::new(config)
+        .await
+        .expect("Should connect to mock server");
+
+    // 3. Verify server connection
+    let servers = manager.list_servers();
+    assert_eq!(servers.len(), 1);
+    assert_eq!(servers[0], "integration_test");
+
+    // 4. Check available tools
+    let tools = manager.list_tools();
+    assert_eq!(tools.len(), 2);
+
+    // 5. Verify specific tools exist
+    assert!(manager.has_tool("brave_web_search"));
+    assert!(manager.has_tool("brave_local_search"));
+    assert!(!manager.has_tool("nonexistent_tool"));
+
+    // 6. Execute a tool
+    let result = manager
+        .call_tool(
+            "brave_web_search",
+            Some(
+                json!({
+                    "query": "SGLang router MCP integration",
+                    "count": 1
+                })
+                .as_object()
+                .unwrap()
+                .clone(),
+            ),
+        )
+        .await;
+
+    assert!(result.is_ok(), "Tool execution should succeed");
+    let response = result.unwrap();
+    assert!(!response.content.is_empty(), "Should return content");
+
+    // 7. Clean shutdown
+    manager.shutdown().await;
+
+    // Verify all required capabilities for responses API integration
+    let capabilities = [
+        "MCP server initialization",
+        "Tool server connection and discovery",
+        "Tool availability checking",
+        "Tool execution",
+        "Error handling and robustness",
+        "Multi-server support",
+        "Schema adaptation",
+        "Mock server integration (no external dependencies)",
+    ];
+
+    assert_eq!(capabilities.len(), 8);
+}
diff --git a/sgl-router/tests/request_formats_test.rs b/sgl-router/tests/request_formats_test.rs
new file mode 100644
index 000000000..606ca0a41
--- /dev/null
+++ b/sgl-router/tests/request_formats_test.rs
@@ -0,0 +1,392 @@
+mod common;
+
+use common::mock_worker::{HealthStatus, MockWorker, MockWorkerConfig, WorkerType};
+use reqwest::Client;
+use serde_json::json;
+use sglang_router_rs::config::{
+    CircuitBreakerConfig, ConnectionMode, PolicyConfig, RetryConfig, RouterConfig, RoutingMode,
+};
+use sglang_router_rs::routers::{RouterFactory, RouterTrait};
+use std::sync::Arc;
+
+/// Test context that manages mock workers
+struct TestContext {
+    workers: Vec<MockWorker>,
+    router: Arc<dyn RouterTrait>,
+}
+
+impl TestContext {
+    async fn new(worker_configs: Vec<MockWorkerConfig>) -> Self {
+        let mut config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            policy: PolicyConfig::Random,
+            host: "127.0.0.1".to_string(),
+            port: 3003,
+            max_payload_size: 256 * 1024 * 1024,
+            request_timeout_secs: 600,
+            worker_startup_timeout_secs: 1,
+            worker_startup_check_interval_secs: 1,
+            dp_aware: false,
+            api_key: None,
+            discovery: None,
+            metrics: None,
+            log_dir: None,
+            log_level: None,
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            queue_size: 0,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: sglang_router_rs::config::HealthCheckConfig::default(),
+            enable_igw: false,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        let mut workers = Vec::new();
+        let mut worker_urls = Vec::new();
+
+        for worker_config in worker_configs {
+            let mut worker = MockWorker::new(worker_config);
+            let url = worker.start().await.unwrap();
+            worker_urls.push(url);
+            workers.push(worker);
+        }
+
+        if !workers.is_empty() {
+            tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
+        }
+
+        config.mode = RoutingMode::Regular { worker_urls };
+
+        let app_context = common::create_test_context(config);
+        let router = RouterFactory::create_router(&app_context).await.unwrap();
+        let router = Arc::from(router);
+
+        if !workers.is_empty() {
+            tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+        }
+
+        Self { workers, router }
+    }
+
+    async fn shutdown(mut self) {
+        // Small delay to ensure any pending operations complete
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+        for worker in &mut self.workers {
+            worker.stop().await;
+        }
+
+        // Another small delay to ensure cleanup completes
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+    }
+
+    async fn make_request(
+        &self,
+        endpoint: &str,
+        body: serde_json::Value,
+    ) -> Result<serde_json::Value, String> {
+        let client = Client::new();
+
+        // Get any worker URL for testing
+        let worker_urls = self.router.get_worker_urls();
+        if worker_urls.is_empty() {
+            return Err("No available workers".to_string());
+        }
+
+        let worker_url = &worker_urls[0];
+
+        let response = client
+            .post(format!("{}{}", worker_url, endpoint))
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| format!("Request failed: {}", e))?;
+
+        if !response.status().is_success() {
+            return Err(format!("Request failed with status: {}", response.status()));
+        }
+
+        response
+            .json::<serde_json::Value>()
+            .await
+            .map_err(|e| format!("Failed to parse response: {}", e))
+    }
+}
+
+#[cfg(test)]
+mod request_format_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_generate_request_formats() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 19001,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        // Test 1: Basic text request
+        let payload = json!({
+            "text": "Hello, world!",
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        // Test 2: Request with sampling parameters
+        let payload = json!({
+            "text": "Tell me a story",
+            "sampling_params": {
+                "temperature": 0.7,
+                "max_new_tokens": 100,
+                "top_p": 0.9
+            },
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        // Test 3: Request with input_ids
+        let payload = json!({
+            "input_ids": [1, 2, 3, 4, 5],
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": 50
+            },
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_v1_chat_completions_formats() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 19002,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        // Test 1: Basic chat completion
+        let payload = json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Hello!"}
+            ],
+            "stream": false
+        });
+
+        let result = ctx.make_request("/v1/chat/completions", payload).await;
+        assert!(result.is_ok());
+
+        let response = result.unwrap();
+        assert!(response.get("choices").is_some());
+        assert!(response.get("id").is_some());
+        assert_eq!(
+            response.get("object").and_then(|v| v.as_str()),
+            Some("chat.completion")
+        );
+
+        // Test 2: Chat completion with parameters
+        let payload = json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "Tell me a joke"}
+            ],
+            "temperature": 0.8,
+            "max_tokens": 150,
+            "top_p": 0.95,
+            "stream": false
+        });
+
+        let result = ctx.make_request("/v1/chat/completions", payload).await;
+        assert!(result.is_ok());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_v1_completions_formats() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 19003,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        // Test 1: Basic completion
+        let payload = json!({
+            "model": "test-model",
+            "prompt": "Once upon a time",
+            "max_tokens": 50,
+            "stream": false
+        });
+
+        let result = ctx.make_request("/v1/completions", payload).await;
+        assert!(result.is_ok());
+
+        let response = result.unwrap();
+        assert!(response.get("choices").is_some());
+        assert_eq!(
+            response.get("object").and_then(|v| v.as_str()),
+            Some("text_completion")
+        );
+
+        // Test 2: Completion with array prompt
+        let payload = json!({
+            "model": "test-model",
+            "prompt": ["First prompt", "Second prompt"],
+            "temperature": 0.5,
+            "stream": false
+        });
+
+        let result = ctx.make_request("/v1/completions", payload).await;
+        assert!(result.is_ok());
+
+        // Test 3: Completion with logprobs
+        let payload = json!({
+            "model": "test-model",
+            "prompt": "The capital of France is",
+            "max_tokens": 10,
+            "logprobs": 5,
+            "stream": false
+        });
+
+        let result = ctx.make_request("/v1/completions", payload).await;
+        assert!(result.is_ok());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_batch_requests() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 19004,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        // Test batch text generation
+        let payload = json!({
+            "text": ["First text", "Second text", "Third text"],
+            "sampling_params": {
+                "temperature": 0.7,
+                "max_new_tokens": 50
+            },
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        // Test batch with input_ids
+        let payload = json!({
+            "input_ids": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_special_parameters() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 19005,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        // Test with return_logprob
+        let payload = json!({
+            "text": "Test",
+            "return_logprob": true,
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        // Test with json_schema
+        let payload = json!({
+            "text": "Generate JSON",
+            "sampling_params": {
+                "temperature": 0.0,
+                "json_schema": "$$ANY$$"
+            },
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        // Test with ignore_eos
+        let payload = json!({
+            "text": "Continue forever",
+            "sampling_params": {
+                "temperature": 0.7,
+                "max_new_tokens": 100,
+                "ignore_eos": true
+            },
+            "stream": false
+        });
+
+        let result = ctx.make_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_error_handling() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 19006,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        // Test with empty body - should still work with mock worker
+        let payload = json!({});
+
+        let result = ctx.make_request("/generate", payload).await;
+        // Mock worker accepts empty body
+        assert!(result.is_ok());
+
+        ctx.shutdown().await;
+    }
+}
diff --git a/sgl-router/tests/responses_api_test.rs b/sgl-router/tests/responses_api_test.rs
new file mode 100644
index 000000000..dc2253799
--- /dev/null
+++ b/sgl-router/tests/responses_api_test.rs
@@ -0,0 +1,210 @@
+// Integration test for Responses API
+
+use sglang_router_rs::protocols::spec::{
+    GenerationRequest, ReasoningEffort, ResponseInput, ResponseReasoningParam, ResponseStatus,
+    ResponseTool, ResponseToolType, ResponsesRequest, ResponsesResponse, ServiceTier, ToolChoice,
+    ToolChoiceValue, Truncation, UsageInfo,
+};
+
+#[test]
+fn test_responses_request_creation() {
+    let request = ResponsesRequest {
+        background: false,
+        include: None,
+        input: ResponseInput::Text("Hello, world!".to_string()),
+        instructions: Some("Be helpful".to_string()),
+        max_output_tokens: Some(100),
+        max_tool_calls: None,
+        metadata: None,
+        model: Some("test-model".to_string()),
+        parallel_tool_calls: true,
+        previous_response_id: None,
+        reasoning: Some(ResponseReasoningParam {
+            effort: Some(ReasoningEffort::Medium),
+        }),
+        service_tier: ServiceTier::Auto,
+        store: true,
+        stream: false,
+        temperature: Some(0.7),
+        tool_choice: ToolChoice::Value(ToolChoiceValue::Auto),
+        tools: vec![ResponseTool {
+            r#type: ResponseToolType::WebSearchPreview,
+        }],
+        top_logprobs: 5,
+        top_p: Some(0.9),
+        truncation: Truncation::Disabled,
+        user: Some("test-user".to_string()),
+        request_id: "resp_test123".to_string(),
+        priority: 0,
+        frequency_penalty: 0.0,
+        presence_penalty: 0.0,
+        stop: None,
+        top_k: -1,
+        min_p: 0.0,
+        repetition_penalty: 1.0,
+    };
+
+    // Test GenerationRequest trait implementation
+    assert!(!request.is_stream());
+    assert_eq!(request.get_model(), Some("test-model"));
+    let routing_text = request.extract_text_for_routing();
+    assert_eq!(routing_text, "Hello, world!");
+}
+
+#[test]
+fn test_sampling_params_conversion() {
+    let request = ResponsesRequest {
+        background: false,
+        include: None,
+        input: ResponseInput::Text("Test".to_string()),
+        instructions: None,
+        max_output_tokens: Some(50),
+        max_tool_calls: None,
+        metadata: None,
+        model: Some("test-model".to_string()),
+        parallel_tool_calls: true, // Use default true
+        previous_response_id: None,
+        reasoning: None,
+        service_tier: ServiceTier::Auto,
+        store: true, // Use default true
+        stream: false,
+        temperature: Some(0.8),
+        tool_choice: ToolChoice::Value(ToolChoiceValue::Auto),
+        tools: vec![],
+        top_logprobs: 0, // Use default 0
+        top_p: Some(0.95),
+        truncation: Truncation::Auto,
+        user: None,
+        request_id: "resp_test456".to_string(),
+        priority: 0,
+        frequency_penalty: 0.1,
+        presence_penalty: 0.2,
+        stop: None,
+        top_k: 10,
+        min_p: 0.05,
+        repetition_penalty: 1.1,
+    };
+
+    let params = request.to_sampling_params(1000, None);
+
+    // Check that parameters are converted correctly
+    assert!(params.contains_key("temperature"));
+    assert!(params.contains_key("top_p"));
+    assert!(params.contains_key("frequency_penalty"));
+    assert!(params.contains_key("max_new_tokens"));
+}
+
+#[test]
+fn test_responses_response_creation() {
+    let response = ResponsesResponse::new(
+        "resp_test789".to_string(),
+        "test-model".to_string(),
+        ResponseStatus::Completed,
+    );
+
+    assert_eq!(response.id, "resp_test789");
+    assert_eq!(response.model, "test-model");
+    assert!(response.is_complete());
+    assert!(!response.is_in_progress());
+    assert!(!response.is_failed());
+}
+
+#[test]
+fn test_usage_conversion() {
+    let usage_info = UsageInfo::new_with_cached(15, 25, Some(8), 3);
+    let response_usage = usage_info.to_response_usage();
+
+    assert_eq!(response_usage.input_tokens, 15);
+    assert_eq!(response_usage.output_tokens, 25);
+    assert_eq!(response_usage.total_tokens, 40);
+
+    // Check details are converted correctly
+    assert!(response_usage.input_tokens_details.is_some());
+    assert_eq!(
+        response_usage
+            .input_tokens_details
+            .as_ref()
+            .unwrap()
+            .cached_tokens,
+        3
+    );
+
+    assert!(response_usage.output_tokens_details.is_some());
+    assert_eq!(
+        response_usage
+            .output_tokens_details
+            .as_ref()
+            .unwrap()
+            .reasoning_tokens,
+        8
+    );
+
+    // Test reverse conversion
+    let back_to_usage = response_usage.to_usage_info();
+    assert_eq!(back_to_usage.prompt_tokens, 15);
+    assert_eq!(back_to_usage.completion_tokens, 25);
+    assert_eq!(back_to_usage.reasoning_tokens, Some(8));
+}
+
+#[test]
+fn test_reasoning_param_default() {
+    let param = ResponseReasoningParam {
+        effort: Some(ReasoningEffort::Medium),
+    };
+
+    // Test JSON serialization/deserialization preserves default
+    let json = serde_json::to_string(&param).unwrap();
+    let parsed: ResponseReasoningParam = serde_json::from_str(&json).unwrap();
+
+    assert!(matches!(parsed.effort, Some(ReasoningEffort::Medium)));
+}
+
+#[test]
+fn test_json_serialization() {
+    let request = ResponsesRequest {
+        background: true,
+        include: None,
+        input: ResponseInput::Text("Test input".to_string()),
+        instructions: Some("Test instructions".to_string()),
+        max_output_tokens: Some(200),
+        max_tool_calls: Some(5),
+        metadata: None,
+        model: Some("gpt-4".to_string()),
+        parallel_tool_calls: false,
+        previous_response_id: None,
+        reasoning: Some(ResponseReasoningParam {
+            effort: Some(ReasoningEffort::High),
+        }),
+        service_tier: ServiceTier::Priority,
+        store: false,
+        stream: true,
+        temperature: Some(0.9),
+        tool_choice: ToolChoice::Value(ToolChoiceValue::Required),
+        tools: vec![ResponseTool {
+            r#type: ResponseToolType::CodeInterpreter,
+        }],
+        top_logprobs: 10,
+        top_p: Some(0.8),
+        truncation: Truncation::Auto,
+        user: Some("test_user".to_string()),
+        request_id: "resp_comprehensive_test".to_string(),
+        priority: 1,
+        frequency_penalty: 0.3,
+        presence_penalty: 0.4,
+        stop: None,
+        top_k: 50,
+        min_p: 0.1,
+        repetition_penalty: 1.2,
+    };
+
+    // Test that everything can be serialized to JSON and back
+    let json = serde_json::to_string(&request).expect("Serialization should work");
+    let parsed: ResponsesRequest =
+        serde_json::from_str(&json).expect("Deserialization should work");
+
+    assert_eq!(parsed.request_id, "resp_comprehensive_test");
+    assert_eq!(parsed.model, Some("gpt-4".to_string()));
+    assert!(parsed.background);
+    assert!(parsed.stream);
+    assert_eq!(parsed.tools.len(), 1);
+}
diff --git a/sgl-router/tests/streaming_tests.rs b/sgl-router/tests/streaming_tests.rs
new file mode 100644
index 000000000..29190a312
--- /dev/null
+++ b/sgl-router/tests/streaming_tests.rs
@@ -0,0 +1,370 @@
+mod common;
+
+use common::mock_worker::{HealthStatus, MockWorker, MockWorkerConfig, WorkerType};
+use futures_util::StreamExt;
+use reqwest::Client;
+use serde_json::json;
+use sglang_router_rs::config::{
+    CircuitBreakerConfig, ConnectionMode, PolicyConfig, RetryConfig, RouterConfig, RoutingMode,
+};
+use sglang_router_rs::routers::{RouterFactory, RouterTrait};
+use std::sync::Arc;
+
+/// Test context that manages mock workers
+struct TestContext {
+    workers: Vec<MockWorker>,
+    router: Arc<dyn RouterTrait>,
+}
+
+impl TestContext {
+    async fn new(worker_configs: Vec<MockWorkerConfig>) -> Self {
+        let mut config = RouterConfig {
+            mode: RoutingMode::Regular {
+                worker_urls: vec![],
+            },
+            policy: PolicyConfig::Random,
+            host: "127.0.0.1".to_string(),
+            port: 3004,
+            max_payload_size: 256 * 1024 * 1024,
+            request_timeout_secs: 600,
+            worker_startup_timeout_secs: 1,
+            worker_startup_check_interval_secs: 1,
+            dp_aware: false,
+            api_key: None,
+            discovery: None,
+            metrics: None,
+            log_dir: None,
+            log_level: None,
+            request_id_headers: None,
+            max_concurrent_requests: 64,
+            queue_size: 0,
+            queue_timeout_secs: 60,
+            rate_limit_tokens_per_second: None,
+            cors_allowed_origins: vec![],
+            retry: RetryConfig::default(),
+            circuit_breaker: CircuitBreakerConfig::default(),
+            disable_retries: false,
+            disable_circuit_breaker: false,
+            health_check: sglang_router_rs::config::HealthCheckConfig::default(),
+            enable_igw: false,
+            connection_mode: ConnectionMode::Http,
+            model_path: None,
+            tokenizer_path: None,
+        };
+
+        let mut workers = Vec::new();
+        let mut worker_urls = Vec::new();
+
+        for worker_config in worker_configs {
+            let mut worker = MockWorker::new(worker_config);
+            let url = worker.start().await.unwrap();
+            worker_urls.push(url);
+            workers.push(worker);
+        }
+
+        if !workers.is_empty() {
+            tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
+        }
+
+        config.mode = RoutingMode::Regular { worker_urls };
+
+        let app_context = common::create_test_context(config);
+        let router = RouterFactory::create_router(&app_context).await.unwrap();
+        let router = Arc::from(router);
+
+        if !workers.is_empty() {
+            tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+        }
+
+        Self { workers, router }
+    }
+
+    async fn shutdown(mut self) {
+        // Small delay to ensure any pending operations complete
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+
+        for worker in &mut self.workers {
+            worker.stop().await;
+        }
+
+        // Another small delay to ensure cleanup completes
+        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+    }
+
+    async fn make_streaming_request(
+        &self,
+        endpoint: &str,
+        body: serde_json::Value,
+    ) -> Result<Vec<String>, String> {
+        let client = Client::new();
+
+        // Get any worker URL for testing
+        let worker_urls = self.router.get_worker_urls();
+        if worker_urls.is_empty() {
+            return Err("No available workers".to_string());
+        }
+
+        let worker_url = &worker_urls[0];
+
+        let response = client
+            .post(format!("{}{}", worker_url, endpoint))
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| format!("Request failed: {}", e))?;
+
+        if !response.status().is_success() {
+            return Err(format!("Request failed with status: {}", response.status()));
+        }
+
+        // Check if it's a streaming response
+        let content_type = response
+            .headers()
+            .get("content-type")
+            .and_then(|v| v.to_str().ok())
+            .unwrap_or("");
+
+        if !content_type.contains("text/event-stream") {
+            return Err("Response is not a stream".to_string());
+        }
+
+        let mut stream = response.bytes_stream();
+        let mut events = Vec::new();
+
+        while let Some(chunk) = stream.next().await {
+            if let Ok(bytes) = chunk {
+                let text = String::from_utf8_lossy(&bytes);
+                for line in text.lines() {
+                    if let Some(stripped) = line.strip_prefix("data: ") {
+                        events.push(stripped.to_string());
+                    }
+                }
+            }
+        }
+
+        Ok(events)
+    }
+}
+
+#[cfg(test)]
+mod streaming_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_generate_streaming() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 20001,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 10,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let payload = json!({
+            "text": "Stream test",
+            "stream": true,
+            "sampling_params": {
+                "temperature": 0.7,
+                "max_new_tokens": 10
+            }
+        });
+
+        let result = ctx.make_streaming_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        let events = result.unwrap();
+        // Should have at least one data chunk and [DONE]
+        assert!(events.len() >= 2);
+        assert_eq!(events.last().unwrap(), "[DONE]");
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_v1_chat_completions_streaming() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 20002,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 10,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let payload = json!({
+            "model": "test-model",
+            "messages": [
+                {"role": "user", "content": "Count to 3"}
+            ],
+            "stream": true,
+            "max_tokens": 20
+        });
+
+        let result = ctx
+            .make_streaming_request("/v1/chat/completions", payload)
+            .await;
+        assert!(result.is_ok());
+
+        let events = result.unwrap();
+        assert!(events.len() >= 2); // At least one chunk + [DONE]
+
+        // Verify events are valid JSON (except [DONE])
+        for event in &events {
+            if event != "[DONE]" {
+                let parsed: Result<serde_json::Value, _> = serde_json::from_str(event);
+                assert!(parsed.is_ok(), "Invalid JSON in SSE event: {}", event);
+
+                let json = parsed.unwrap();
+                assert_eq!(
+                    json.get("object").and_then(|v| v.as_str()),
+                    Some("chat.completion.chunk")
+                );
+            }
+        }
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_v1_completions_streaming() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 20003,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 10,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let payload = json!({
+            "model": "test-model",
+            "prompt": "Once upon a time",
+            "stream": true,
+            "max_tokens": 15
+        });
+
+        let result = ctx.make_streaming_request("/v1/completions", payload).await;
+        assert!(result.is_ok());
+
+        let events = result.unwrap();
+        assert!(events.len() >= 2); // At least one chunk + [DONE]
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_streaming_with_error() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 20004,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 0,
+            fail_rate: 1.0, // Always fail
+        }])
+        .await;
+
+        let payload = json!({
+            "text": "This should fail",
+            "stream": true
+        });
+
+        let result = ctx.make_streaming_request("/generate", payload).await;
+        // With fail_rate: 1.0, the request should fail
+        assert!(result.is_err());
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_streaming_timeouts() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 20005,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 100, // Slow response
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        let payload = json!({
+            "text": "Slow stream",
+            "stream": true,
+            "sampling_params": {
+                "max_new_tokens": 5
+            }
+        });
+
+        let start = std::time::Instant::now();
+        let result = ctx.make_streaming_request("/generate", payload).await;
+        let elapsed = start.elapsed();
+
+        assert!(result.is_ok());
+        let events = result.unwrap();
+
+        // Should have received multiple chunks over time
+        assert!(!events.is_empty());
+        assert!(elapsed.as_millis() >= 100); // At least one delay
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_batch_streaming() {
+        let ctx = TestContext::new(vec![MockWorkerConfig {
+            port: 20006,
+            worker_type: WorkerType::Regular,
+            health_status: HealthStatus::Healthy,
+            response_delay_ms: 10,
+            fail_rate: 0.0,
+        }])
+        .await;
+
+        // Batch request with streaming
+        let payload = json!({
+            "text": ["First", "Second", "Third"],
+            "stream": true,
+            "sampling_params": {
+                "max_new_tokens": 5
+            }
+        });
+
+        let result = ctx.make_streaming_request("/generate", payload).await;
+        assert!(result.is_ok());
+
+        let events = result.unwrap();
+        // Should have multiple events for batch
+        assert!(events.len() >= 4); // At least 3 responses + [DONE]
+
+        ctx.shutdown().await;
+    }
+
+    #[tokio::test]
+    async fn test_sse_format_parsing() {
+        // Test SSE format parsing
+        let parse_sse_chunk = |chunk: &[u8]| -> Vec<String> {
+            let text = String::from_utf8_lossy(chunk);
+            text.lines()
+                .filter(|line| line.starts_with("data: "))
+                .map(|line| line[6..].to_string())
+                .collect()
+        };
+
+        let sse_data =
+            b"data: {\"text\":\"Hello\"}\n\ndata: {\"text\":\" world\"}\n\ndata: [DONE]\n\n";
+        let events = parse_sse_chunk(sse_data);
+
+        assert_eq!(events.len(), 3);
+        assert_eq!(events[0], "{\"text\":\"Hello\"}");
+        assert_eq!(events[1], "{\"text\":\" world\"}");
+        assert_eq!(events[2], "[DONE]");
+
+        // Test with mixed content
+        let mixed = b"event: message\ndata: {\"test\":true}\n\n: comment\ndata: [DONE]\n\n";
+        let events = parse_sse_chunk(mixed);
+
+        assert_eq!(events.len(), 2);
+        assert_eq!(events[0], "{\"test\":true}");
+        assert_eq!(events[1], "[DONE]");
+    }
+}
diff --git a/sgl-router/tests/test_chat_template.rs b/sgl-router/tests/test_chat_template.rs
new file mode 100644
index 000000000..4a0e73bd0
--- /dev/null
+++ b/sgl-router/tests/test_chat_template.rs
@@ -0,0 +1,150 @@
+#[cfg(test)]
+mod tests {
+    use sglang_router_rs::tokenizer::chat_template::{ChatMessage, ChatTemplateProcessor};
+
+    #[test]
+    fn test_chat_message_helpers() {
+        let system_msg = ChatMessage::system("You are a helpful assistant");
+        assert_eq!(system_msg.role, "system");
+        assert_eq!(system_msg.content, "You are a helpful assistant");
+
+        let user_msg = ChatMessage::user("Hello!");
+        assert_eq!(user_msg.role, "user");
+        assert_eq!(user_msg.content, "Hello!");
+
+        let assistant_msg = ChatMessage::assistant("Hi there!");
+        assert_eq!(assistant_msg.role, "assistant");
+        assert_eq!(assistant_msg.content, "Hi there!");
+    }
+
+    #[test]
+    fn test_llama_style_template() {
+        // Test a Llama-style chat template
+        let template = r#"
+{%- if messages[0]['role'] == 'system' -%}
+    {%- set system_message = messages[0]['content'] -%}
+    {%- set messages = messages[1:] -%}
+{%- else -%}
+    {%- set system_message = '' -%}
+{%- endif -%}
+
+{{- bos_token }}
+{%- if system_message %}
+{{- '<|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>' }}
+{%- endif %}
+
+{%- for message in messages %}
+    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}
+{%- endfor %}
+
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
+"#;
+
+        let processor = ChatTemplateProcessor::new(
+            template.to_string(),
+            Some("<|begin_of_text|>".to_string()),
+            Some("<|end_of_text|>".to_string()),
+        );
+
+        let messages = vec![
+            ChatMessage::system("You are a helpful assistant"),
+            ChatMessage::user("What is 2+2?"),
+        ];
+
+        let result = processor.apply_chat_template(&messages, true).unwrap();
+
+        // Check that the result contains expected markers
+        assert!(result.contains("<|begin_of_text|>"));
+        assert!(result.contains("<|start_header_id|>system<|end_header_id|>"));
+        assert!(result.contains("You are a helpful assistant"));
+        assert!(result.contains("<|start_header_id|>user<|end_header_id|>"));
+        assert!(result.contains("What is 2+2?"));
+        assert!(result.contains("<|start_header_id|>assistant<|end_header_id|>"));
+    }
+
+    #[test]
+    fn test_chatml_template() {
+        // Test a ChatML-style template
+        let template = r#"
+{%- for message in messages %}
+    {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}
+"#;
+
+        let processor = ChatTemplateProcessor::new(template.to_string(), None, None);
+
+        let messages = vec![
+            ChatMessage::user("Hello"),
+            ChatMessage::assistant("Hi there!"),
+            ChatMessage::user("How are you?"),
+        ];
+
+        let result = processor.apply_chat_template(&messages, true).unwrap();
+
+        // Check ChatML format
+        assert!(result.contains("<|im_start|>user\nHello<|im_end|>"));
+        assert!(result.contains("<|im_start|>assistant\nHi there!<|im_end|>"));
+        assert!(result.contains("<|im_start|>user\nHow are you?<|im_end|>"));
+        assert!(result.ends_with("<|im_start|>assistant\n"));
+    }
+
+    #[test]
+    fn test_template_without_generation_prompt() {
+        let template = r#"
+{%- for message in messages -%}
+{{ message.role }}: {{ message.content }}
+{% endfor -%}
+{%- if add_generation_prompt -%}
+assistant:
+{%- endif -%}
+"#;
+
+        let processor = ChatTemplateProcessor::new(template.to_string(), None, None);
+
+        let messages = vec![ChatMessage::user("Test")];
+
+        // Test without generation prompt
+        let result = processor.apply_chat_template(&messages, false).unwrap();
+        assert_eq!(result.trim(), "user: Test");
+
+        // Test with generation prompt
+        let result_with_prompt = processor.apply_chat_template(&messages, true).unwrap();
+        assert!(result_with_prompt.contains("assistant:"));
+    }
+
+    #[test]
+    fn test_template_with_special_tokens() {
+        let template = r#"{{ bos_token }}{% for msg in messages %}{{ msg.content }}{{ eos_token }}{% endfor %}"#;
+
+        let processor = ChatTemplateProcessor::new(
+            template.to_string(),
+            Some("<s>".to_string()),
+            Some("</s>".to_string()),
+        );
+
+        let messages = vec![ChatMessage::user("Hello")];
+
+        let result = processor.apply_chat_template(&messages, false).unwrap();
+        assert_eq!(result, "<s>Hello</s>");
+    }
+
+    #[test]
+    fn test_empty_messages() {
+        let template =
+            r#"{% for msg in messages %}{{ msg.role }}: {{ msg.content }}\n{% endfor %}"#;
+
+        let processor = ChatTemplateProcessor::new(template.to_string(), None, None);
+
+        let messages = vec![];
+        let result = processor.apply_chat_template(&messages, false).unwrap();
+        assert_eq!(result, "");
+    }
+
+    // Integration test with actual tokenizer file loading would go here
+    // but requires a real tokenizer_config.json file
+}
diff --git a/sgl-router/tests/test_chat_template_loading.rs b/sgl-router/tests/test_chat_template_loading.rs
new file mode 100644
index 000000000..ad1501233
--- /dev/null
+++ b/sgl-router/tests/test_chat_template_loading.rs
@@ -0,0 +1,183 @@
+#[cfg(test)]
+mod tests {
+    use std::fs;
+    use tempfile::TempDir;
+
+    #[test]
+    fn test_load_chat_template_from_file() {
+        use sglang_router_rs::tokenizer::chat_template::ChatMessage;
+        use sglang_router_rs::tokenizer::huggingface::HuggingFaceTokenizer;
+
+        // Create temporary directory
+        let temp_dir = TempDir::new().unwrap();
+        let template_path = temp_dir.path().join("template.jinja");
+
+        // Write a test template
+        let template_content = r#"
+{%- for message in messages %}
+    {{- '<|' + message['role'] + '|>' + message['content'] }}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|assistant|>' }}
+{%- endif %}
+"#;
+        fs::write(&template_path, template_content).unwrap();
+
+        // Create a mock tokenizer config
+        let tokenizer_config = r#"{
+            "version": "1.0",
+            "truncation": null,
+            "padding": null,
+            "added_tokens": [],
+            "normalizer": null,
+            "pre_tokenizer": {
+                "type": "Whitespace"
+            },
+            "post_processor": null,
+            "decoder": null,
+            "model": {
+                "type": "BPE",
+                "vocab": {
+                    "hello": 0,
+                    "world": 1,
+                    "<s>": 2,
+                    "</s>": 3
+                },
+                "merges": []
+            }
+        }"#;
+
+        let tokenizer_path = temp_dir.path().join("tokenizer.json");
+        fs::write(&tokenizer_path, tokenizer_config).unwrap();
+
+        // Load tokenizer with custom chat template
+        let tokenizer = HuggingFaceTokenizer::from_file_with_chat_template(
+            tokenizer_path.to_str().unwrap(),
+            Some(template_path.to_str().unwrap()),
+        )
+        .unwrap();
+
+        // Test that the custom template is used
+        let messages = vec![
+            ChatMessage::user("Hello"),
+            ChatMessage::assistant("Hi there"),
+        ];
+
+        let result = tokenizer.apply_chat_template(&messages, true).unwrap();
+
+        // Verify the custom template format
+        assert!(result.contains("<|user|>Hello"));
+        assert!(result.contains("<|assistant|>Hi there"));
+        assert!(result.ends_with("<|assistant|>"));
+    }
+
+    #[test]
+    fn test_override_existing_template() {
+        use sglang_router_rs::tokenizer::chat_template::ChatMessage;
+        use sglang_router_rs::tokenizer::huggingface::HuggingFaceTokenizer;
+
+        // Create temporary directory
+        let temp_dir = TempDir::new().unwrap();
+
+        // Create tokenizer config with a built-in template
+        let tokenizer_config_path = temp_dir.path().join("tokenizer_config.json");
+        let config_with_template = r#"{
+            "chat_template": "built-in: {% for msg in messages %}{{ msg.content }}{% endfor %}"
+        }"#;
+        fs::write(&tokenizer_config_path, config_with_template).unwrap();
+
+        // Create the actual tokenizer file
+        let tokenizer_json = r#"{
+            "version": "1.0",
+            "truncation": null,
+            "padding": null,
+            "added_tokens": [],
+            "normalizer": null,
+            "pre_tokenizer": {
+                "type": "Whitespace"
+            },
+            "post_processor": null,
+            "decoder": null,
+            "model": {
+                "type": "BPE",
+                "vocab": {
+                    "test": 0,
+                    "<s>": 1,
+                    "</s>": 2
+                },
+                "merges": []
+            }
+        }"#;
+        let tokenizer_path = temp_dir.path().join("tokenizer.json");
+        fs::write(&tokenizer_path, tokenizer_json).unwrap();
+
+        // Create custom template that should override
+        let custom_template_path = temp_dir.path().join("custom.jinja");
+        let custom_template =
+            r#"CUSTOM: {% for msg in messages %}[{{ msg.role }}]: {{ msg.content }}{% endfor %}"#;
+        fs::write(&custom_template_path, custom_template).unwrap();
+
+        // Load with custom template - should override the built-in one
+        let tokenizer = HuggingFaceTokenizer::from_file_with_chat_template(
+            tokenizer_path.to_str().unwrap(),
+            Some(custom_template_path.to_str().unwrap()),
+        )
+        .unwrap();
+
+        let messages = vec![ChatMessage::user("Test")];
+        let result = tokenizer.apply_chat_template(&messages, false).unwrap();
+
+        // Should use CUSTOM template, not built-in
+        assert!(result.starts_with("CUSTOM:"));
+        assert!(result.contains("[user]: Test"));
+        assert!(!result.contains("built-in:"));
+    }
+
+    #[test]
+    fn test_set_chat_template_after_creation() {
+        use sglang_router_rs::tokenizer::chat_template::ChatMessage;
+        use sglang_router_rs::tokenizer::huggingface::HuggingFaceTokenizer;
+
+        // Create temporary directory and tokenizer file
+        let temp_dir = TempDir::new().unwrap();
+        let tokenizer_json = r#"{
+            "version": "1.0",
+            "truncation": null,
+            "padding": null,
+            "added_tokens": [],
+            "normalizer": null,
+            "pre_tokenizer": {
+                "type": "Whitespace"
+            },
+            "post_processor": null,
+            "decoder": null,
+            "model": {
+                "type": "BPE",
+                "vocab": {
+                    "test": 0,
+                    "<s>": 1,
+                    "</s>": 2
+                },
+                "merges": []
+            }
+        }"#;
+        let tokenizer_path = temp_dir.path().join("tokenizer.json");
+        fs::write(&tokenizer_path, tokenizer_json).unwrap();
+
+        // Load tokenizer without custom template
+        let mut tokenizer =
+            HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap()).unwrap();
+
+        // Set a template after creation (mimics Python's behavior)
+        let new_template =
+            "NEW: {% for msg in messages %}{{ msg.role }}: {{ msg.content }}; {% endfor %}";
+        tokenizer.set_chat_template(new_template.to_string());
+
+        let messages = vec![ChatMessage::user("Hello"), ChatMessage::assistant("World")];
+        let result = tokenizer.apply_chat_template(&messages, false).unwrap();
+
+        assert!(result.starts_with("NEW:"));
+        assert!(result.contains("user: Hello;"));
+        assert!(result.contains("assistant: World;"));
+    }
+}
diff --git a/sgl-router/tests/test_openai_routing.rs b/sgl-router/tests/test_openai_routing.rs
new file mode 100644
index 000000000..ec38a6dd5
--- /dev/null
+++ b/sgl-router/tests/test_openai_routing.rs
@@ -0,0 +1,419 @@
+//! Comprehensive integration tests for OpenAI backend functionality
+
+use axum::{
+    body::Body,
+    extract::Request,
+    http::{Method, StatusCode},
+    routing::post,
+    Router,
+};
+use serde_json::json;
+use sglang_router_rs::{
+    config::{RouterConfig, RoutingMode},
+    protocols::spec::{
+        ChatCompletionRequest, ChatMessage, CompletionRequest, GenerateRequest, UserMessageContent,
+    },
+    routers::{openai_router::OpenAIRouter, RouterTrait},
+};
+use std::sync::Arc;
+use tower::ServiceExt;
+
+mod common;
+use common::mock_openai_server::MockOpenAIServer;
+
+/// Helper function to create a minimal chat completion request for testing
+fn create_minimal_chat_request() -> ChatCompletionRequest {
+    let val = json!({
+        "model": "gpt-3.5-turbo",
+        "messages": [
+            {"role": "user", "content": "Hello"}
+        ],
+        "max_tokens": 100
+    });
+    serde_json::from_value(val).unwrap()
+}
+
+/// Helper function to create a minimal completion request for testing
+fn create_minimal_completion_request() -> CompletionRequest {
+    CompletionRequest {
+        model: "gpt-3.5-turbo".to_string(),
+        prompt: sglang_router_rs::protocols::spec::StringOrArray::String("Hello".to_string()),
+        suffix: None,
+        max_tokens: Some(100),
+        temperature: None,
+        top_p: None,
+        n: None,
+        stream: false,
+        stream_options: None,
+        logprobs: None,
+        echo: false,
+        stop: None,
+        presence_penalty: None,
+        frequency_penalty: None,
+        best_of: None,
+        logit_bias: None,
+        user: None,
+        seed: None,
+        top_k: None,
+        min_p: None,
+        min_tokens: None,
+        repetition_penalty: None,
+        regex: None,
+        ebnf: None,
+        json_schema: None,
+        stop_token_ids: None,
+        no_stop_trim: false,
+        ignore_eos: false,
+        skip_special_tokens: true,
+        lora_path: None,
+        session_params: None,
+        return_hidden_states: false,
+        other: serde_json::Map::new(),
+    }
+}
+
+// ============= Basic Unit Tests =============
+
+/// Test basic OpenAI router creation and configuration
+#[tokio::test]
+async fn test_openai_router_creation() {
+    let router = OpenAIRouter::new("https://api.openai.com".to_string(), None).await;
+
+    assert!(router.is_ok(), "Router creation should succeed");
+
+    let router = router.unwrap();
+    assert_eq!(router.router_type(), "openai");
+    assert!(!router.is_pd_mode());
+}
+
+/// Test health endpoints
+#[tokio::test]
+async fn test_openai_router_health() {
+    let router = OpenAIRouter::new("https://api.openai.com".to_string(), None)
+        .await
+        .unwrap();
+
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/health")
+        .body(Body::empty())
+        .unwrap();
+
+    let response = router.health(req).await;
+    assert_eq!(response.status(), StatusCode::OK);
+}
+
+/// Test server info endpoint
+#[tokio::test]
+async fn test_openai_router_server_info() {
+    let router = OpenAIRouter::new("https://api.openai.com".to_string(), None)
+        .await
+        .unwrap();
+
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/info")
+        .body(Body::empty())
+        .unwrap();
+
+    let response = router.get_server_info(req).await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let (_, body) = response.into_parts();
+    let body_bytes = axum::body::to_bytes(body, usize::MAX).await.unwrap();
+    let body_str = String::from_utf8(body_bytes.to_vec()).unwrap();
+
+    assert!(body_str.contains("openai"));
+}
+
+/// Test models endpoint
+#[tokio::test]
+async fn test_openai_router_models() {
+    // Use mock server for deterministic models response
+    let mock_server = MockOpenAIServer::new().await;
+    let router = OpenAIRouter::new(mock_server.base_url(), None)
+        .await
+        .unwrap();
+
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/models")
+        .body(Body::empty())
+        .unwrap();
+
+    let response = router.get_models(req).await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let (_, body) = response.into_parts();
+    let body_bytes = axum::body::to_bytes(body, usize::MAX).await.unwrap();
+    let body_str = String::from_utf8(body_bytes.to_vec()).unwrap();
+    let models: serde_json::Value = serde_json::from_str(&body_str).unwrap();
+
+    assert_eq!(models["object"], "list");
+    assert!(models["data"].is_array());
+}
+
+/// Test router factory with OpenAI routing mode
+#[tokio::test]
+async fn test_router_factory_openai_mode() {
+    let routing_mode = RoutingMode::OpenAI {
+        worker_urls: vec!["https://api.openai.com".to_string()],
+    };
+
+    let router_config =
+        RouterConfig::new(routing_mode, sglang_router_rs::config::PolicyConfig::Random);
+
+    let app_context = common::create_test_context(router_config);
+
+    let router = sglang_router_rs::routers::RouterFactory::create_router(&app_context).await;
+    assert!(
+        router.is_ok(),
+        "Router factory should create OpenAI router successfully"
+    );
+
+    let router = router.unwrap();
+    assert_eq!(router.router_type(), "openai");
+}
+
+/// Test that unsupported endpoints return proper error codes
+#[tokio::test]
+async fn test_unsupported_endpoints() {
+    let router = OpenAIRouter::new("https://api.openai.com".to_string(), None)
+        .await
+        .unwrap();
+
+    // Test generate endpoint (SGLang-specific, should not be supported)
+    let generate_request = GenerateRequest {
+        prompt: None,
+        text: Some("Hello world".to_string()),
+        input_ids: None,
+        parameters: None,
+        sampling_params: None,
+        stream: false,
+        return_logprob: false,
+        lora_path: None,
+        session_params: None,
+        return_hidden_states: false,
+        rid: None,
+    };
+
+    let response = router.route_generate(None, &generate_request).await;
+    assert_eq!(response.status(), StatusCode::NOT_IMPLEMENTED);
+
+    // Test completion endpoint (should also not be supported)
+    let completion_request = create_minimal_completion_request();
+    let response = router.route_completion(None, &completion_request).await;
+    assert_eq!(response.status(), StatusCode::NOT_IMPLEMENTED);
+}
+
+// ============= Mock Server E2E Tests =============
+
+/// Test chat completion with mock OpenAI server
+#[tokio::test]
+async fn test_openai_router_chat_completion_with_mock() {
+    // Start a mock OpenAI server
+    let mock_server = MockOpenAIServer::new().await;
+    let base_url = mock_server.base_url();
+
+    // Create router pointing to mock server
+    let router = OpenAIRouter::new(base_url, None).await.unwrap();
+
+    // Create a minimal chat completion request
+    let mut chat_request = create_minimal_chat_request();
+    chat_request.messages = vec![ChatMessage::User {
+        role: "user".to_string(),
+        content: UserMessageContent::Text("Hello, how are you?".to_string()),
+        name: None,
+    }];
+    chat_request.temperature = Some(0.7);
+
+    // Route the request
+    let response = router.route_chat(None, &chat_request).await;
+
+    // Should get a successful response from mock server
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let (_, body) = response.into_parts();
+    let body_bytes = axum::body::to_bytes(body, usize::MAX).await.unwrap();
+    let body_str = String::from_utf8(body_bytes.to_vec()).unwrap();
+    let chat_response: serde_json::Value = serde_json::from_str(&body_str).unwrap();
+
+    // Verify it's a valid chat completion response
+    assert_eq!(chat_response["object"], "chat.completion");
+    assert_eq!(chat_response["model"], "gpt-3.5-turbo");
+    assert!(!chat_response["choices"].as_array().unwrap().is_empty());
+}
+
+/// Test full E2E flow with Axum server
+#[tokio::test]
+async fn test_openai_e2e_with_server() {
+    // Start mock OpenAI server
+    let mock_server = MockOpenAIServer::new().await;
+    let base_url = mock_server.base_url();
+
+    // Create router
+    let router = OpenAIRouter::new(base_url, None).await.unwrap();
+
+    // Create Axum app with chat completions endpoint
+    let app = Router::new().route(
+        "/v1/chat/completions",
+        post({
+            let router = Arc::new(router);
+            move |req: Request<Body>| {
+                let router = router.clone();
+                async move {
+                    let (parts, body) = req.into_parts();
+                    let body_bytes = axum::body::to_bytes(body, usize::MAX).await.unwrap();
+                    let body_str = String::from_utf8(body_bytes.to_vec()).unwrap();
+
+                    let chat_request: ChatCompletionRequest =
+                        serde_json::from_str(&body_str).unwrap();
+
+                    router.route_chat(Some(&parts.headers), &chat_request).await
+                }
+            }
+        }),
+    );
+
+    // Make a request to the server
+    let request = Request::builder()
+        .method(Method::POST)
+        .uri("/v1/chat/completions")
+        .header("content-type", "application/json")
+        .body(Body::from(
+            json!({
+                "model": "gpt-3.5-turbo",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": "Hello, world!"
+                    }
+                ],
+                "max_tokens": 100
+            })
+            .to_string(),
+        ))
+        .unwrap();
+
+    let response = app.oneshot(request).await.unwrap();
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX)
+        .await
+        .unwrap();
+    let response_json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+
+    // Verify the response structure
+    assert_eq!(response_json["object"], "chat.completion");
+    assert_eq!(response_json["model"], "gpt-3.5-turbo");
+    assert!(!response_json["choices"].as_array().unwrap().is_empty());
+}
+
+/// Test streaming chat completions pass-through with mock server
+#[tokio::test]
+async fn test_openai_router_chat_streaming_with_mock() {
+    let mock_server = MockOpenAIServer::new().await;
+    let base_url = mock_server.base_url();
+    let router = OpenAIRouter::new(base_url, None).await.unwrap();
+
+    // Build a streaming chat request
+    let val = json!({
+        "model": "gpt-3.5-turbo",
+        "messages": [
+            {"role": "user", "content": "Hello"}
+        ],
+        "max_tokens": 10,
+        "stream": true
+    });
+    let chat_request: ChatCompletionRequest = serde_json::from_value(val).unwrap();
+
+    let response = router.route_chat(None, &chat_request).await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    // Should be SSE
+    let headers = response.headers();
+    let ct = headers
+        .get("content-type")
+        .unwrap()
+        .to_str()
+        .unwrap()
+        .to_ascii_lowercase();
+    assert!(ct.contains("text/event-stream"));
+
+    // Read entire stream body and assert chunks + DONE
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX)
+        .await
+        .unwrap();
+    let text = String::from_utf8(body.to_vec()).unwrap();
+    assert!(text.contains("chat.completion.chunk"));
+    assert!(text.contains("[DONE]"));
+}
+
+/// Test circuit breaker functionality
+#[tokio::test]
+async fn test_openai_router_circuit_breaker() {
+    // Create router with circuit breaker config
+    let cb_config = sglang_router_rs::config::CircuitBreakerConfig {
+        failure_threshold: 2,
+        success_threshold: 1,
+        timeout_duration_secs: 1,
+        window_duration_secs: 10,
+    };
+
+    let router = OpenAIRouter::new(
+        "http://invalid-url-that-will-fail".to_string(),
+        Some(cb_config),
+    )
+    .await
+    .unwrap();
+
+    let chat_request = create_minimal_chat_request();
+
+    // First few requests should fail and record failures
+    for _ in 0..3 {
+        let response = router.route_chat(None, &chat_request).await;
+        // Should get either an error or circuit breaker response
+        assert!(
+            response.status() == StatusCode::INTERNAL_SERVER_ERROR
+                || response.status() == StatusCode::SERVICE_UNAVAILABLE
+        );
+    }
+}
+
+/// Test that Authorization header is forwarded in /v1/models
+#[tokio::test]
+async fn test_openai_router_models_auth_forwarding() {
+    // Start a mock server that requires Authorization
+    let expected_auth = "Bearer test-token".to_string();
+    let mock_server = MockOpenAIServer::new_with_auth(Some(expected_auth.clone())).await;
+    let router = OpenAIRouter::new(mock_server.base_url(), None)
+        .await
+        .unwrap();
+
+    // 1) Without auth header -> expect 401
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/models")
+        .body(Body::empty())
+        .unwrap();
+
+    let response = router.get_models(req).await;
+    assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
+
+    // 2) With auth header -> expect 200
+    let req = Request::builder()
+        .method(Method::GET)
+        .uri("/models")
+        .header("Authorization", expected_auth)
+        .body(Body::empty())
+        .unwrap();
+
+    let response = router.get_models(req).await;
+    assert_eq!(response.status(), StatusCode::OK);
+
+    let (_, body) = response.into_parts();
+    let body_bytes = axum::body::to_bytes(body, usize::MAX).await.unwrap();
+    let body_str = String::from_utf8(body_bytes.to_vec()).unwrap();
+    let models: serde_json::Value = serde_json::from_str(&body_str).unwrap();
+    assert_eq!(models["object"], "list");
+}
diff --git a/sgl-router/tests/test_pd_routing.rs b/sgl-router/tests/test_pd_routing.rs
new file mode 100644
index 000000000..7071106a4
--- /dev/null
+++ b/sgl-router/tests/test_pd_routing.rs
@@ -0,0 +1,918 @@
+#[cfg(test)]
+mod test_pd_routing {
+    use serde_json::json;
+    use sglang_router_rs::config::{
+        CircuitBreakerConfig, ConnectionMode, PolicyConfig, RetryConfig, RouterConfig, RoutingMode,
+    };
+    use sglang_router_rs::core::{WorkerFactory, WorkerType};
+    use sglang_router_rs::routers::http::pd_types::get_hostname;
+    use sglang_router_rs::routers::http::pd_types::PDSelectionPolicy;
+    use sglang_router_rs::routers::RouterFactory;
+
+    // Test-only struct to help validate PD request parsing
+    #[derive(Debug)]
+    struct PDRequest {
+        pub is_stream: bool,
+        pub batch_size: Option<usize>,
+    }
+
+    impl PDRequest {
+        // Extract PD-relevant info from JSON for testing
+        pub fn from_json(json: &serde_json::Value) -> Self {
+            let is_stream = json
+                .get("stream")
+                .and_then(|v| v.as_bool())
+                .unwrap_or(false);
+
+            // Detect batch size from text or input_ids
+            let batch_size = if let Some(text) = json.get("text") {
+                text.as_array().map(|arr| arr.len())
+            } else if let Some(input_ids) = json.get("input_ids") {
+                input_ids.as_array().map(|arr| arr.len())
+            } else {
+                None
+            };
+
+            PDRequest {
+                is_stream,
+                batch_size,
+            }
+        }
+    }
+
+    // ========================================================================
+    // Phase 1: Basic PD Components and Router Creation
+    // ========================================================================
+
+    #[test]
+    fn test_worker_types() {
+        use sglang_router_rs::core::{WorkerFactory, WorkerType};
+
+        // Test worker creation for prefill servers
+        let prefill_worker =
+            WorkerFactory::create_prefill("http://prefill:8080".to_string(), Some(9000));
+        assert_eq!(prefill_worker.url(), "http://prefill:8080");
+        match prefill_worker.worker_type() {
+            WorkerType::Prefill { bootstrap_port } => {
+                assert_eq!(bootstrap_port, Some(9000));
+            }
+            _ => panic!("Expected Prefill worker type"),
+        }
+
+        // Test worker creation for decode servers
+        let decode_worker = WorkerFactory::create_decode("http://decode:8080".to_string());
+        assert_eq!(decode_worker.url(), "http://decode:8080");
+        match decode_worker.worker_type() {
+            WorkerType::Decode => (),
+            _ => panic!("Expected Decode worker type"),
+        }
+
+        // Test regular worker creation
+        let regular_worker = WorkerFactory::create_regular("http://regular:8080".to_string());
+        assert_eq!(regular_worker.url(), "http://regular:8080");
+        match regular_worker.worker_type() {
+            WorkerType::Regular => (),
+            _ => panic!("Expected Regular worker type"),
+        }
+    }
+
+    #[test]
+    fn test_pd_selection_policies() {
+        // Test all PD selection policy variants
+        // Note: These policies are only used when pd_disaggregation=true
+        let policies = vec![
+            PDSelectionPolicy::Random,
+            PDSelectionPolicy::PowerOfTwo,
+            PDSelectionPolicy::CacheAware {
+                cache_threshold: 0.5,
+                balance_abs_threshold: 32,
+                balance_rel_threshold: 1.1,
+            },
+        ];
+
+        for policy in policies {
+            // Verify each policy can be created and matched
+            match &policy {
+                PDSelectionPolicy::Random => {
+                    assert!(matches!(policy, PDSelectionPolicy::Random));
+                }
+                PDSelectionPolicy::PowerOfTwo => {
+                    assert!(matches!(policy, PDSelectionPolicy::PowerOfTwo));
+                }
+                PDSelectionPolicy::CacheAware {
+                    cache_threshold, ..
+                } => {
+                    assert!(*cache_threshold >= 0.0 && *cache_threshold <= 1.0);
+                }
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_pd_router_configuration() {
+        // Test PD router configuration with various policies
+        // In the new structure, RoutingMode and PolicyConfig are separate
+        let test_cases = vec![
+            (
+                RoutingMode::PrefillDecode {
+                    prefill_urls: vec![
+                        ("http://prefill1:8080".to_string(), Some(9000)),
+                        ("http://prefill2:8080".to_string(), None),
+                    ],
+                    decode_urls: vec![
+                        "http://decode1:8080".to_string(),
+                        "http://decode2:8080".to_string(),
+                    ],
+                    prefill_policy: None,
+                    decode_policy: None,
+                },
+                PolicyConfig::Random,
+            ),
+            (
+                RoutingMode::PrefillDecode {
+                    prefill_urls: vec![("http://prefill:8080".to_string(), Some(9000))],
+                    decode_urls: vec!["http://decode:8080".to_string()],
+                    prefill_policy: None,
+                    decode_policy: None,
+                },
+                PolicyConfig::PowerOfTwo {
+                    load_check_interval_secs: 5,
+                },
+            ),
+            (
+                RoutingMode::PrefillDecode {
+                    prefill_urls: vec![
+                        ("http://p1:8080".to_string(), Some(9000)),
+                        ("http://p2:8080".to_string(), Some(9001)),
+                        ("http://p3:8080".to_string(), Some(9002)),
+                    ],
+                    decode_urls: vec!["http://d1:8080".to_string(), "http://d2:8080".to_string()],
+                    prefill_policy: None,
+                    decode_policy: None,
+                },
+                PolicyConfig::CacheAware {
+                    cache_threshold: 0.7,
+                    balance_abs_threshold: 20,
+                    balance_rel_threshold: 1.2,
+                    eviction_interval_secs: 60,
+                    max_tree_size: 1000000,
+                },
+            ),
+        ];
+
+        for (mode, policy) in test_cases {
+            let config = RouterConfig {
+                mode,
+                policy,
+                host: "127.0.0.1".to_string(),
+                port: 3001,
+                max_payload_size: 1024 * 1024,
+                request_timeout_secs: 60,
+                worker_startup_timeout_secs: 10,
+                worker_startup_check_interval_secs: 1,
+                dp_aware: false,
+                api_key: None,
+                discovery: None,
+                metrics: None,
+                log_dir: None,
+                log_level: None,
+                request_id_headers: None,
+                max_concurrent_requests: 64,
+                queue_size: 0,
+                queue_timeout_secs: 60,
+                cors_allowed_origins: vec![],
+                retry: RetryConfig::default(),
+                circuit_breaker: CircuitBreakerConfig::default(),
+                disable_retries: false,
+                disable_circuit_breaker: false,
+                health_check: sglang_router_rs::config::HealthCheckConfig::default(),
+                enable_igw: false,
+                rate_limit_tokens_per_second: None,
+                connection_mode: ConnectionMode::Http,
+                model_path: None,
+                tokenizer_path: None,
+            };
+
+            // Router creation will fail due to health checks, but config should be valid
+            let app_context =
+                sglang_router_rs::server::AppContext::new(config, reqwest::Client::new(), 64, None)
+                    .expect("Failed to create AppContext");
+            let app_context = std::sync::Arc::new(app_context);
+            let result = RouterFactory::create_router(&app_context).await;
+            assert!(result.is_err());
+            let error_msg = result.unwrap_err();
+            // Error should be about health/timeout, not configuration
+            assert!(
+                error_msg.contains("healthy") || error_msg.contains("timeout"),
+                "Unexpected error: {}",
+                error_msg
+            );
+        }
+    }
+
+    // ========================================================================
+    // Phase 2: Bootstrap Injection and Request Handling
+    // ========================================================================
+
+    #[test]
+    fn test_pd_request_from_json() {
+        // Test PDRequest parsing from single text request
+        let single_json = json!({
+            "text": "Hello world",
+            "stream": false,
+            "temperature": 0.7,
+            "max_tokens": 100
+        });
+
+        let pd_req = PDRequest::from_json(&single_json);
+        assert!(!pd_req.is_stream);
+        assert_eq!(pd_req.batch_size, None);
+
+        // Test PDRequest parsing from batch text request
+        let batch_json = json!({
+            "text": ["Hello", "World", "Test"],
+            "stream": true,
+            "temperature": 0.5
+        });
+
+        let pd_req = PDRequest::from_json(&batch_json);
+        assert!(pd_req.is_stream);
+        assert_eq!(pd_req.batch_size, Some(3));
+
+        // Test PDRequest parsing from input_ids request
+        let ids_json = json!({
+            "input_ids": [[1, 2, 3], [4, 5, 6]],
+            "stream": false
+        });
+
+        let pd_req = PDRequest::from_json(&ids_json);
+        assert!(!pd_req.is_stream);
+        assert_eq!(pd_req.batch_size, Some(2));
+
+        // Test PDRequest parsing from chat request
+        let chat_json = json!({
+            "messages": [
+                {"role": "system", "content": "You are a helpful assistant"},
+                {"role": "user", "content": "Hello"}
+            ],
+            "stream": true
+        });
+
+        let pd_req = PDRequest::from_json(&chat_json);
+        assert!(pd_req.is_stream);
+        assert_eq!(pd_req.batch_size, None);
+    }
+
+    #[test]
+    fn test_bootstrap_injection_simulation() {
+        // Since we can't test the actual inject_bootstrap_fields function here
+        // (it's private in the router module), we'll test the expected behavior
+
+        // Simulate bootstrap injection for single request
+        let mut single_json = json!({
+            "text": "Hello world",
+            "stream": false,
+            "temperature": 0.7
+        });
+
+        // Create a prefill worker to simulate injection
+        let prefill_worker =
+            WorkerFactory::create_prefill("http://prefill1:8080".to_string(), Some(9000));
+
+        // Extract bootstrap port from worker type
+        let bootstrap_port = match prefill_worker.worker_type() {
+            WorkerType::Prefill { bootstrap_port } => bootstrap_port,
+            _ => None,
+        };
+
+        // Simulate what inject_bootstrap_fields would do
+        single_json["bootstrap_host"] = json!(get_hostname(prefill_worker.url()));
+        single_json["bootstrap_port"] = json!(bootstrap_port);
+        single_json["bootstrap_room"] = json!(12345u64); // Random room ID
+
+        // Verify bootstrap fields are added correctly
+        assert_eq!(single_json["bootstrap_host"], "prefill1");
+        assert_eq!(single_json["bootstrap_port"], json!(Some(9000)));
+        assert!(single_json["bootstrap_room"].is_u64());
+        assert_eq!(single_json["temperature"], 0.7); // Original field preserved
+
+        // Simulate bootstrap injection for batch request
+        let mut batch_json = json!({
+            "text": ["Hello", "World", "Test"],
+            "stream": true
+        });
+
+        let batch_size = 3;
+        let hostname = get_hostname(prefill_worker.url());
+        batch_json["bootstrap_host"] = json!(vec![hostname; batch_size]);
+        batch_json["bootstrap_port"] = json!(vec![bootstrap_port; batch_size]);
+        batch_json["bootstrap_room"] = json!(vec![111u64, 222u64, 333u64]);
+
+        // Verify batch bootstrap fields
+        assert!(batch_json["bootstrap_host"].is_array());
+        assert_eq!(
+            batch_json["bootstrap_host"].as_array().unwrap().len(),
+            batch_size
+        );
+        assert!(batch_json["bootstrap_port"].is_array());
+        assert!(batch_json["bootstrap_room"].is_array());
+        assert_eq!(batch_json["stream"], true); // Original field preserved
+    }
+
+    #[test]
+    fn test_request_serialization() {
+        // Test that requests can be properly serialized and deserialized
+        let request = json!({
+            "text": "Test prompt",
+            "stream": false,
+            "temperature": 0.7,
+            "max_tokens": 100,
+            "top_p": 0.9,
+            "frequency_penalty": 0.5,
+            "bootstrap_host": "prefill1",
+            "bootstrap_port": 9000,
+            "bootstrap_room": 12345u64
+        });
+
+        // Convert to bytes (as would happen in the router)
+        let bytes = serde_json::to_vec(&request).unwrap();
+
+        // Parse back from bytes
+        let parsed: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
+
+        // Verify all fields are preserved
+        assert_eq!(parsed["text"], "Test prompt");
+        assert_eq!(parsed["stream"], false);
+        assert_eq!(parsed["temperature"], 0.7);
+        assert_eq!(parsed["max_tokens"], 100);
+        assert_eq!(parsed["bootstrap_host"], "prefill1");
+        assert_eq!(parsed["bootstrap_port"], 9000);
+        assert_eq!(parsed["bootstrap_room"], 12345);
+    }
+
+    #[test]
+    fn test_hostname_extraction() {
+        // Test various URL formats
+        let test_cases = vec![
+            ("http://localhost:8080", "localhost"),
+            ("http://10.0.0.1:8080", "10.0.0.1"),
+            ("https://api.example.com:443", "api.example.com"),
+            ("http://prefill-server", "prefill-server"),
+            ("http://[::1]:8080", "["),  // IPv6 edge case
+            ("prefill:8080", "prefill"), // No protocol
+        ];
+
+        for (url, expected_hostname) in test_cases {
+            assert_eq!(get_hostname(url), expected_hostname);
+        }
+    }
+
+    #[test]
+    fn test_pd_request_edge_cases() {
+        // Test empty request
+        let empty_json = json!({});
+        let pd_req = PDRequest::from_json(&empty_json);
+        assert!(!pd_req.is_stream);
+        assert_eq!(pd_req.batch_size, None);
+
+        // Test request with only stream field
+        let stream_only = json!({
+            "stream": true
+        });
+        let pd_req = PDRequest::from_json(&stream_only);
+        assert!(pd_req.is_stream);
+        assert_eq!(pd_req.batch_size, None);
+
+        // Test request with empty text array
+        let empty_batch = json!({
+            "text": []
+        });
+        let pd_req = PDRequest::from_json(&empty_batch);
+        assert_eq!(pd_req.batch_size, Some(0));
+
+        // Test request with non-array text (should be None)
+        let non_array_text = json!({
+            "text": "single string"
+        });
+        let pd_req = PDRequest::from_json(&non_array_text);
+        assert_eq!(pd_req.batch_size, None);
+    }
+
+    // ========================================================================
+    // Phase 2: Background Load Monitoring Tests
+    // ========================================================================
+
+    #[tokio::test]
+    async fn test_background_load_monitoring() {
+        use std::collections::HashMap;
+        use tokio::sync::watch;
+
+        // Create a watch channel for testing
+        let (tx, rx) = watch::channel(HashMap::new());
+
+        // Simulate load updates
+        let mut loads = HashMap::new();
+        loads.insert("http://prefill1:8080".to_string(), 10);
+        loads.insert("http://prefill2:8080".to_string(), 20);
+        loads.insert("http://decode1:8080".to_string(), 5);
+        loads.insert("http://decode2:8080".to_string(), 15);
+
+        // Send the loads
+        tx.send(loads.clone()).unwrap();
+
+        // Verify receiver gets the update
+        let received_loads = rx.borrow();
+        assert_eq!(received_loads.get("http://prefill1:8080"), Some(&10));
+        assert_eq!(received_loads.get("http://prefill2:8080"), Some(&20));
+        assert_eq!(received_loads.get("http://decode1:8080"), Some(&5));
+        assert_eq!(received_loads.get("http://decode2:8080"), Some(&15));
+    }
+
+    #[test]
+    fn test_load_monitoring_configuration() {
+        // Test that load monitoring is only enabled for PowerOfTwo policy
+        let policies = vec![
+            (PDSelectionPolicy::Random, false),
+            (PDSelectionPolicy::PowerOfTwo, true),
+            (
+                PDSelectionPolicy::CacheAware {
+                    cache_threshold: 0.5,
+                    balance_abs_threshold: 32,
+                    balance_rel_threshold: 1.1,
+                },
+                false,
+            ),
+        ];
+
+        for (policy, should_monitor) in policies {
+            match policy {
+                PDSelectionPolicy::PowerOfTwo => assert!(should_monitor),
+                _ => assert!(!should_monitor),
+            }
+        }
+    }
+
+    #[tokio::test]
+    async fn test_watch_channel_behavior() {
+        use std::collections::HashMap;
+        use tokio::sync::watch;
+
+        // Test watch channel's broadcast behavior
+        let (tx, rx1) = watch::channel(HashMap::new());
+        let rx2 = rx1.clone();
+
+        // Initial state - empty map
+        assert!(rx1.borrow().is_empty());
+        assert!(rx2.borrow().is_empty());
+
+        // Update 1
+        let mut loads = HashMap::new();
+        loads.insert("worker1".to_string(), 10);
+        tx.send(loads.clone()).unwrap();
+
+        // Both receivers see the update
+        assert_eq!(rx1.borrow().get("worker1"), Some(&10));
+        assert_eq!(rx2.borrow().get("worker1"), Some(&10));
+
+        // Update 2 - overwrites previous
+        loads.insert("worker1".to_string(), 20);
+        loads.insert("worker2".to_string(), 30);
+        tx.send(loads).unwrap();
+
+        // Both receivers see the latest state
+        assert_eq!(rx1.borrow().get("worker1"), Some(&20));
+        assert_eq!(rx2.borrow().get("worker2"), Some(&30));
+    }
+
+    // ========================================================================
+    // Tests based on bench_one_batch_server.py patterns
+    // ========================================================================
+
+    #[test]
+    fn test_generate_request_formats() {
+        // Based on bench_one_batch_server.py request patterns
+
+        // Test 1: Batch request with input_ids (most common in benchmarks)
+        let batch_request = json!({
+            "input_ids": [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": 16,
+                "ignore_eos": true,
+            },
+            "return_logprob": false,
+            "stream": true
+        });
+
+        let pd_req = PDRequest::from_json(&batch_request);
+        assert!(pd_req.is_stream);
+        assert_eq!(pd_req.batch_size, Some(3));
+
+        // Test 2: Request with return_logprob (critical for PD)
+        let logprob_request = json!({
+            "input_ids": [[1, 2, 3]],
+            "sampling_params": {
+                "temperature": 0.7,
+                "max_new_tokens": 8,
+            },
+            "return_logprob": true,
+            "stream": false
+        });
+
+        assert_eq!(logprob_request["return_logprob"], true);
+        assert_eq!(logprob_request["stream"], false);
+
+        // Test 3: Large batch sizes from benchmark
+        let batch_sizes = vec![1, 16, 64]; // From bench_one_batch_server.py
+        for bs in batch_sizes {
+            let request = json!({
+                "input_ids": vec![vec![1, 2, 3]; bs],
+                "sampling_params": {
+                    "temperature": 0.0,
+                    "max_new_tokens": 16,
+                },
+                "stream": true
+            });
+
+            let pd_req = PDRequest::from_json(&request);
+            assert_eq!(pd_req.batch_size, Some(bs));
+        }
+    }
+
+    #[test]
+    fn test_sampling_params_handling() {
+        // Test various sampling parameters from bench_one_batch_server.py
+        let sampling_params_variations = vec![
+            json!({
+                "temperature": 0.0,
+                "max_new_tokens": 8,
+                "ignore_eos": true
+            }),
+            json!({
+                "temperature": 0.7,
+                "max_new_tokens": 16,
+                "ignore_eos": false,
+                "top_p": 0.9,
+                "frequency_penalty": 0.5
+            }),
+            json!({
+                "temperature": 1.0,
+                "max_new_tokens": 64,
+                "json_schema": "$$ANY$$"  // Structured output
+            }),
+        ];
+
+        for params in sampling_params_variations {
+            let request = json!({
+                "input_ids": [[1, 2, 3]],
+                "sampling_params": params.clone(),
+                "stream": false
+            });
+
+            // Verify params are preserved
+            assert_eq!(request["sampling_params"], params);
+        }
+    }
+
+    #[test]
+    fn test_streaming_response_parsing() {
+        // Test SSE format parsing from streaming responses
+        let sse_chunks = ["data: {\"text\":\"Hello\",\"meta_info\":{\"completion_tokens\":1,\"finish_reason\":null}}",
+            "data: {\"text\":\" world\",\"meta_info\":{\"completion_tokens\":2,\"finish_reason\":null}}",
+            "data: {\"text\":\"!\",\"meta_info\":{\"completion_tokens\":3,\"finish_reason\":{\"type\":\"length\"}}}",
+            "data: [DONE]"];
+
+        for chunk in &sse_chunks[..3] {
+            assert!(chunk.starts_with("data: "));
+            let json_str = &chunk[6..]; // Skip "data: "
+            let parsed: serde_json::Value = serde_json::from_str(json_str).unwrap();
+            assert!(parsed["meta_info"]["completion_tokens"].is_u64());
+        }
+
+        // Test [DONE] detection
+        assert_eq!(sse_chunks[3], "data: [DONE]");
+    }
+
+    #[test]
+    fn test_ttft_calculation() {
+        // Test Time To First Token calculation pattern
+        let first_token_response = json!({
+            "text": "Hello",
+            "meta_info": {
+                "completion_tokens": 1,
+                "finish_reason": null
+            }
+        });
+
+        // TTFT is calculated when completion_tokens == 1
+        assert_eq!(first_token_response["meta_info"]["completion_tokens"], 1);
+        assert!(first_token_response["meta_info"]["finish_reason"].is_null());
+    }
+
+    #[test]
+    fn test_throughput_metrics() {
+        // Test throughput calculation patterns from bench_one_batch_server.py
+        let batch_size = 16;
+        let input_len = 1024;
+        let output_len = 16;
+        let ttft = 0.5; // seconds
+        let total_latency = 2.0; // seconds
+
+        // Input throughput = batch_size * input_len / ttft
+        let input_throughput = (batch_size as f64) * (input_len as f64) / ttft;
+        assert!((input_throughput - 32768.0).abs() < 0.01);
+
+        // Output throughput = batch_size * output_len / (latency - ttft)
+        let output_throughput = (batch_size as f64) * (output_len as f64) / (total_latency - ttft);
+        assert!((output_throughput - 170.67).abs() < 0.01);
+    }
+
+    #[test]
+    fn test_error_response_handling() {
+        // Test error response format from bench_one_batch_server.py
+        let error_response = json!({
+            "error": "Request has failed. Invalid input format."
+        });
+
+        assert!(error_response.get("error").is_some());
+        assert!(error_response["error"].as_str().unwrap().contains("failed"));
+    }
+
+    #[test]
+    fn test_structured_output_request() {
+        // Test structured output format (json_schema)
+        let structured_request = json!({
+            "text": "What is the capital of France? Answer in JSON.",
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": 64,
+                "json_schema": "$$ANY$$"
+            },
+            "stream": false
+        });
+
+        assert_eq!(
+            structured_request["sampling_params"]["json_schema"],
+            "$$ANY$$"
+        );
+    }
+
+    #[test]
+    fn test_bootstrap_injection_with_benchmark_requests() {
+        use sglang_router_rs::core::{WorkerFactory, WorkerType};
+
+        // Test bootstrap injection with actual benchmark request patterns
+        let mut benchmark_request = json!({
+            "input_ids": vec![vec![1, 2, 3, 4]; 16], // Batch size 16
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": 8,
+                "ignore_eos": true
+            },
+            "return_logprob": true,
+            "stream": true
+        });
+
+        // Create a prefill worker to simulate injection
+        let prefill_worker =
+            WorkerFactory::create_prefill("http://prefill:8080".to_string(), Some(9000));
+
+        // Extract bootstrap port from worker type
+        let bootstrap_port = match prefill_worker.worker_type() {
+            WorkerType::Prefill { bootstrap_port } => bootstrap_port,
+            _ => None,
+        };
+        let batch_size = 16;
+        let hostname = get_hostname(prefill_worker.url());
+
+        benchmark_request["bootstrap_host"] = json!(vec![hostname; batch_size]);
+        benchmark_request["bootstrap_port"] = json!(vec![bootstrap_port; batch_size]);
+        benchmark_request["bootstrap_room"] =
+            json!((0..batch_size).map(|_| 12345u64).collect::<Vec<_>>());
+
+        // Verify bootstrap fields match batch size
+        assert_eq!(
+            benchmark_request["bootstrap_host"]
+                .as_array()
+                .unwrap()
+                .len(),
+            batch_size
+        );
+        assert_eq!(
+            benchmark_request["bootstrap_port"]
+                .as_array()
+                .unwrap()
+                .len(),
+            batch_size
+        );
+        assert_eq!(
+            benchmark_request["bootstrap_room"]
+                .as_array()
+                .unwrap()
+                .len(),
+            batch_size
+        );
+
+        // Verify original fields are preserved
+        assert_eq!(benchmark_request["return_logprob"], true);
+        assert_eq!(benchmark_request["stream"], true);
+    }
+
+    #[test]
+    fn test_server_info_response_format() {
+        // Test server info format expected by bench_one_batch_server.py
+        let server_info = json!({
+            "internal_states": [{
+                "avg_spec_accept_length": 3.5,
+                "last_gen_throughput": 2048.5,
+                "load": 16
+            }],
+            "prefill": [
+                {"url": "http://prefill1:8080", "load": 10},
+                {"url": "http://prefill2:8080", "load": 20}
+            ],
+            "decode": [
+                {"url": "http://decode1:8080", "load": 5},
+                {"url": "http://decode2:8080", "load": 15}
+            ]
+        });
+
+        // Verify structure matches what benchmark expects
+        assert!(server_info["internal_states"][0]["avg_spec_accept_length"].is_f64());
+        assert!(server_info["internal_states"][0]["last_gen_throughput"].is_f64());
+        assert!(server_info["prefill"].is_array());
+        assert!(server_info["decode"].is_array());
+    }
+
+    // ========================================================================
+    // Comprehensive Endpoint Coverage Test
+    // ========================================================================
+
+    #[test]
+    fn test_pd_endpoints_coverage() {
+        // Document all endpoints from Python mini_lb.py and verify implementation status
+        let implemented_endpoints = vec![
+            ("/health", "GET", true),
+            ("/health_generate", "GET", true), // Note: Python uses POST, we use GET
+            ("/get_server_info", "GET", true),
+            ("/v1/models", "GET", true),
+            ("/get_model_info", "GET", true),
+            ("/generate", "POST", true),
+            ("/v1/chat/completions", "POST", true),
+            ("/v1/completions", "POST", true),
+            ("/flush_cache", "POST", true),
+            ("/get_loads", "GET", true),
+            ("/register", "POST", false), // NOT IMPLEMENTED - needs dynamic worker management
+        ];
+
+        let implemented_count = implemented_endpoints
+            .iter()
+            .filter(|(_, _, impl_status)| *impl_status)
+            .count();
+        let total_count = implemented_endpoints.len();
+
+        // We've implemented 10 out of 11 endpoints (register is not needed for Phase 1/2)
+        assert_eq!(implemented_count, 10);
+        assert_eq!(total_count, 11);
+
+        // Document the missing endpoint
+        let missing: Vec<_> = implemented_endpoints
+            .iter()
+            .filter(|(_, _, impl_status)| !impl_status)
+            .map(|(endpoint, method, _)| format!("{} {}", method, endpoint))
+            .collect();
+
+        assert_eq!(missing, vec!["POST /register"]);
+    }
+
+    #[test]
+    fn test_large_batch_bootstrap_injection() {
+        // Test bootstrap injection performance with very large batches
+        // This simulates the bench_one_batch_server.py scenario
+        let large_batch_sizes = vec![1024, 4096, 8192];
+
+        for batch_size in large_batch_sizes {
+            let start = std::time::Instant::now();
+
+            // Simulate a large batch request
+            let mut large_batch_request = json!({
+                "input_ids": vec![vec![1, 2, 3, 4]; batch_size],
+                "sampling_params": {
+                    "temperature": 0.0,
+                    "max_new_tokens": 16,
+                },
+                "stream": true
+            });
+
+            // Create a prefill worker to simulate injection
+            let prefill_worker =
+                WorkerFactory::create_prefill("http://prefill:8080".to_string(), Some(9000));
+
+            // Extract bootstrap port from worker type
+            let bootstrap_port = match prefill_worker.worker_type() {
+                WorkerType::Prefill { bootstrap_port } => bootstrap_port,
+                _ => None,
+            };
+            let hostname = get_hostname(prefill_worker.url());
+
+            large_batch_request["bootstrap_host"] = json!(vec![hostname; batch_size]);
+            large_batch_request["bootstrap_port"] = json!(vec![bootstrap_port; batch_size]);
+            large_batch_request["bootstrap_room"] = json!((0..batch_size)
+                .map(|_| rand::random::<u64>())
+                .collect::<Vec<_>>());
+
+            let elapsed = start.elapsed();
+
+            // Verify bootstrap fields are correctly sized
+            assert_eq!(
+                large_batch_request["bootstrap_host"]
+                    .as_array()
+                    .unwrap()
+                    .len(),
+                batch_size
+            );
+            assert_eq!(
+                large_batch_request["bootstrap_port"]
+                    .as_array()
+                    .unwrap()
+                    .len(),
+                batch_size
+            );
+            assert_eq!(
+                large_batch_request["bootstrap_room"]
+                    .as_array()
+                    .unwrap()
+                    .len(),
+                batch_size
+            );
+
+            // Bootstrap injection should be reasonably fast even for large batches
+            println!(
+                "Bootstrap injection for batch_size {} took {:?}",
+                batch_size, elapsed
+            );
+            assert!(
+                elapsed.as_millis() < 1000,
+                "Bootstrap injection took too long for batch size {}",
+                batch_size
+            );
+        }
+    }
+
+    #[test]
+    fn test_payload_size_calculation() {
+        // Test payload size estimation for bench_one_batch_server.py scenarios
+        let test_cases = vec![
+            (1, 1024, 16),   // Small batch
+            (16, 1024, 16),  // Medium batch
+            (64, 1024, 16),  // Large batch
+            (8192, 4096, 5), // Benchmark scenario
+        ];
+
+        for (batch_size, input_len, _output_len) in test_cases {
+            // Estimate payload size (rough calculation)
+            // Each token is ~4 bytes (i32), plus JSON overhead
+            let tokens_size = batch_size * input_len * 4; // 4 bytes per token
+            let json_overhead = batch_size * 100; // ~100 bytes overhead per request
+            let total_size = tokens_size + json_overhead;
+
+            println!(
+                "Batch size: {}, Input len: {}, Estimated payload: {} MB",
+                batch_size,
+                input_len,
+                total_size / (1024 * 1024)
+            );
+
+            // For the benchmark case (8192, 4096), this should be ~134 MB
+            if batch_size == 8192 && input_len == 4096 {
+                assert!(
+                    total_size > 100 * 1024 * 1024,
+                    "Benchmark payload should be > 100MB"
+                );
+                assert!(
+                    total_size < 200 * 1024 * 1024,
+                    "Benchmark payload should be < 200MB"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_policy_type_to_pd_selection_policy_mapping() {
+        // Test that PDSelectionPolicy doesn't include RoundRobin
+        let pd_policy_count = 3; // Random, PowerOfTwo, CacheAware
+        assert_eq!(
+            pd_policy_count, 3,
+            "PDSelectionPolicy should have exactly 3 variants"
+        );
+
+        // Verify that each PDSelectionPolicy variant can be created
+        let _random = PDSelectionPolicy::Random;
+        let _po2 = PDSelectionPolicy::PowerOfTwo;
+        let _cache_aware = PDSelectionPolicy::CacheAware {
+            cache_threshold: 0.5,
+            balance_abs_threshold: 32,
+            balance_rel_threshold: 1.1,
+        };
+    }
+}
diff --git a/sgl-router/tests/tokenizer_integration.rs b/sgl-router/tests/tokenizer_integration.rs
new file mode 100644
index 000000000..9f0597297
--- /dev/null
+++ b/sgl-router/tests/tokenizer_integration.rs
@@ -0,0 +1,320 @@
+//! Integration tests for tokenizers using real tokenizer data
+//!
+//! These tests download the TinyLlama tokenizer from HuggingFace to verify our tokenizer
+//! implementation works correctly with real-world tokenizer files.
+
+mod common;
+use common::{ensure_tokenizer_cached, EXPECTED_HASHES, TEST_PROMPTS};
+
+use sglang_router_rs::tokenizer::{
+    factory, huggingface::HuggingFaceTokenizer, sequence::Sequence, stop::*, stream::DecodeStream,
+    traits::*,
+};
+use std::sync::Arc;
+
+const LONG_TEST_PROMPTS: [(&str, &str); 6] = [
+    ("Tell me about the following text.", "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."),
+    ("Tell me about the following text.", "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."),
+    ("Tell me about the following text.", "Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt."),
+    ("Tell me about the following text.", "Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem."),
+    // Tennis-themed prompt for variety
+    ("Tell me about the following text.", "In the ancient realm of Tennisia, the very magic of the land is drawn from the sport itself. Forehands light the skies, backhands carve the earth, and serves rumble like thunder across kingdoms. At the center of this balance lie four sacred Grand Slam relics: the Sapphire Trophy of Melbourne, the Emerald Chalice of Paris, the Ruby Crown of London, and the Diamond Orb of New York. Together, they keep the game's spirit alive.
+    But the relics are scattered, guarded by champions of legendary skill. The first is the Fire King of Clay, ruler of the crimson courts, whose topspin arcs blaze high and heavy, scorching all who dare stand across from him. The second is the Tempest Trickster, master of the baseline fortress, whose footwork and precision can turn back any storm, and whose returns arrive as if pulled by invisible strings. The third is the Shadow-Dancer of the Highlands, a tactician who thrives in the long rallies of twilight, changing pace and spin until opponents lose their rhythm. The fourth and final guardian is a towering Diamond Titan, a net-charging colossus whose volleys shatter the air itself.
+    Into this arena of gods steps the Silver-Wristed Knight — a player of impossible grace, whose game is an art form. His quest: to claim each relic not for glory, but to restore harmony to the rankings of the realm.
+    He travels across the Kingdom of Clay, where the points stretch like marathons and the air tastes of iron; through the Grasslands of London, where the ball skids low and the margins are razor-thin; over the Hard Courts of the East, where rallies turn into duels of endurance; and finally to the Cathedral of Lights in New York, where night matches burn with fevered energy.
+    Each battle is played under enchanted floodlights, the lines patrolled by spectral line judges whose calls are final. The crowd's roar swells with every break point, and the Silver-Wristed Knight's racket glows brightest when the match teeters at deuce. There are moments when doubt grips him — when his serve falters or his touch deserts him — but each challenge teaches a new stroke, culminating in the legendary Forehand of Dawn.
+    When the last relic is claimed, he stands not as a conqueror but as a custodian of the game, knowing that rivalries forge the very magic he protects. The balance is restored — until the next season begins."),
+    // Emoji stress test
+    ("Tell me about the following text.", "😀😃😄😁😆🥹😅😂🤣🥲☺️😊😇🙂🙃😉🤩😎 🤪🥳🤓🙄🤪😵👻")
+];
+
+fn compute_hashes_for_tokenizer<E: Encoder>(tokenizer: &E, prompts: &[&str]) -> Vec<u64> {
+    prompts
+        .iter()
+        .map(|&prompt| {
+            tokenizer
+                .encode(prompt)
+                .expect("Failed to encode prompt")
+                .get_hash()
+        })
+        .collect()
+}
+
+#[test]
+fn test_huggingface_tokenizer_hashes() {
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+        .expect("Failed to load HuggingFace tokenizer");
+
+    let prompt_hashes = compute_hashes_for_tokenizer(&tokenizer, &TEST_PROMPTS);
+
+    println!(
+        "HF Tokenizer: {:?}\nComputed Hashes: {:?}\nExpected Hashes: {:?}",
+        tokenizer_path, prompt_hashes, EXPECTED_HASHES
+    );
+
+    assert_eq!(prompt_hashes, EXPECTED_HASHES);
+}
+
+#[test]
+fn test_tokenizer_encode_decode_lifecycle() {
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+        .expect("Failed to load HuggingFace tokenizer");
+
+    for prompt in TEST_PROMPTS.iter() {
+        let encoding = tokenizer.encode(prompt).expect("Failed to encode prompt");
+
+        let decoded = tokenizer
+            .decode(encoding.token_ids(), false)
+            .expect("Failed to decode token_ids");
+
+        assert_eq!(decoded, *prompt, "Encode-decode mismatch for: {}", prompt);
+    }
+}
+
+#[test]
+fn test_sequence_operations() {
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    for prompt in TEST_PROMPTS.iter() {
+        let encoding = tokenizer.encode(prompt).expect("Failed to encode prompt");
+
+        // Test Sequence with append_text
+        let mut sequence = Sequence::new(tokenizer.clone());
+        sequence.append_text(prompt).expect("Failed to append text");
+
+        assert_eq!(
+            sequence.len(),
+            encoding.token_ids().len(),
+            "Sequence length mismatch"
+        );
+        assert_eq!(sequence.text().unwrap(), *prompt, "Sequence text mismatch");
+
+        // Test incremental decoding with append_token
+        let mut decoder = Sequence::new(tokenizer.clone());
+        let mut output = String::new();
+
+        for token_id in encoding.token_ids() {
+            let text = decoder
+                .append_token(*token_id)
+                .expect("Failed to append token");
+            output.push_str(&text);
+        }
+
+        assert_eq!(decoder.len(), sequence.len(), "Decoder length mismatch");
+        assert_eq!(
+            decoder.token_ids(),
+            sequence.token_ids(),
+            "Token IDs mismatch"
+        );
+        assert_eq!(output, *prompt, "Incremental decode mismatch");
+    }
+}
+
+#[test]
+fn test_decode_stream() {
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    for prompt in TEST_PROMPTS.iter() {
+        let encoding = tokenizer.encode(prompt).expect("Failed to encode prompt");
+
+        let mut decoder = DecodeStream::new(tokenizer.clone(), &[], false);
+        let mut output = String::new();
+
+        for token_id in encoding.token_ids() {
+            if let Some(text) = decoder.step(*token_id).expect("Failed to decode token") {
+                output.push_str(&text);
+            }
+        }
+
+        assert_eq!(output, *prompt, "DecodeStream output mismatch");
+    }
+}
+
+#[test]
+fn test_long_sequence_incremental_decode_with_prefill() {
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    for (input_text, output_text) in LONG_TEST_PROMPTS.iter() {
+        let input_encoding = tokenizer
+            .encode(input_text)
+            .expect("Failed to encode input");
+
+        let output_encoding = tokenizer
+            .encode(output_text)
+            .expect("Failed to encode output");
+
+        let mut decoder = DecodeStream::new(tokenizer.clone(), input_encoding.token_ids(), false);
+
+        let mut output = String::new();
+        for token_id in output_encoding.token_ids() {
+            if let Some(text) = decoder.step(*token_id).expect("Failed to decode token") {
+                output.push_str(&text);
+            }
+        }
+
+        assert_eq!(output.trim(), *output_text, "Long sequence decode mismatch");
+    }
+}
+
+#[test]
+fn test_stop_sequence_decoder() {
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    // Test with various stop sequences
+    let test_cases = vec![
+        (
+            "Hello world! Stop here. Continue after.",
+            "Stop",
+            "Hello world! ",
+        ),
+        ("Testing stop sequences.", ".", "Testing stop sequences"),
+        ("No stop sequence here", "xyz", "No stop sequence here"),
+    ];
+
+    for (input, stop_seq, expected) in test_cases {
+        let config = StopSequenceConfig::default().with_stop_sequence(stop_seq);
+
+        let mut decoder = StopSequenceDecoder::new(tokenizer.clone(), config, false);
+
+        let encoding = tokenizer.encode(input).expect("Failed to encode");
+        let mut output = String::new();
+        let mut stopped = false;
+
+        for token_id in encoding.token_ids() {
+            match decoder.process_token(*token_id).unwrap() {
+                SequenceDecoderOutput::Text(text) => output.push_str(&text),
+                SequenceDecoderOutput::StoppedWithText(text) => {
+                    output.push_str(&text);
+                    stopped = true;
+                    break;
+                }
+                SequenceDecoderOutput::Stopped => {
+                    stopped = true;
+                    break;
+                }
+                SequenceDecoderOutput::Held => {}
+            }
+        }
+
+        if !stopped {
+            // Flush any remaining text
+            if let SequenceDecoderOutput::Text(text) = decoder.flush() {
+                output.push_str(&text);
+            }
+        }
+
+        println!(
+            "Input: '{}', Stop: '{}', Output: '{}', Expected: '{}'",
+            input, stop_seq, output, expected
+        );
+
+        // The test should check if output starts with expected
+        // since stop sequences might not be perfectly aligned with token boundaries
+        assert!(
+            output.starts_with(expected) || output == input,
+            "Stop sequence test failed"
+        );
+    }
+}
+
+#[test]
+fn test_factory_creation() {
+    // Test factory creation method
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = factory::create_tokenizer(tokenizer_path.to_str().unwrap())
+        .expect("Failed to create tokenizer via factory");
+
+    let encoding = tokenizer.encode(TEST_PROMPTS[0]).expect("Failed to encode");
+
+    let decoded = tokenizer
+        .decode(encoding.token_ids(), false)
+        .expect("Failed to decode");
+
+    assert_eq!(decoded, TEST_PROMPTS[0]);
+}
+
+#[test]
+fn test_batch_encoding() {
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+        .expect("Failed to load tokenizer");
+
+    let encodings = tokenizer
+        .encode_batch(&TEST_PROMPTS)
+        .expect("Failed to batch encode");
+
+    assert_eq!(encodings.len(), TEST_PROMPTS.len());
+
+    for (i, encoding) in encodings.iter().enumerate() {
+        let decoded = tokenizer
+            .decode(encoding.token_ids(), false)
+            .expect("Failed to decode");
+        assert_eq!(decoded, TEST_PROMPTS[i]);
+    }
+}
+
+#[test]
+fn test_special_tokens() {
+    use sglang_router_rs::tokenizer::traits::Tokenizer as TokenizerTrait;
+
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+        .expect("Failed to load tokenizer");
+
+    let special_tokens = tokenizer.get_special_tokens();
+
+    // TinyLlama should have at least BOS and EOS tokens
+    assert!(special_tokens.bos_token.is_some());
+    assert!(special_tokens.eos_token.is_some());
+
+    println!("Special tokens: {:?}", special_tokens);
+}
+
+#[test]
+fn test_thread_safety() {
+    use std::thread;
+
+    let tokenizer_path = ensure_tokenizer_cached();
+    let tokenizer = Arc::new(
+        HuggingFaceTokenizer::from_file(tokenizer_path.to_str().unwrap())
+            .expect("Failed to load tokenizer"),
+    );
+
+    let handles: Vec<_> = TEST_PROMPTS
+        .iter()
+        .map(|&prompt| {
+            let tokenizer_clone = tokenizer.clone();
+            thread::spawn(move || {
+                let encoding = tokenizer_clone
+                    .encode(prompt)
+                    .expect("Failed to encode in thread");
+                let decoded = tokenizer_clone
+                    .decode(encoding.token_ids(), false)
+                    .expect("Failed to decode in thread");
+                assert_eq!(decoded, prompt);
+            })
+        })
+        .collect();
+
+    for handle in handles {
+        handle.join().expect("Thread panicked");
+    }
+}
diff --git a/sgl-router/tests/tool_parser_deepseek.rs b/sgl-router/tests/tool_parser_deepseek.rs
new file mode 100644
index 000000000..45168c13e
--- /dev/null
+++ b/sgl-router/tests/tool_parser_deepseek.rs
@@ -0,0 +1,183 @@
+//! DeepSeek V3 Parser Integration Tests
+
+use sglang_router_rs::tool_parser::{DeepSeekParser, ParseState, StreamResult, ToolParser};
+
+#[tokio::test]
+async fn test_deepseek_complete_parsing() {
+    let parser = DeepSeekParser::new();
+
+    // Test single tool call
+    let input = r#"Let me help you with that.
+<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather
+```json
+{"location": "Tokyo", "units": "celsius"}
+```<｜tool▁call▁end｜><｜tool▁calls▁end｜>
+The weather in Tokyo is..."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    // Verify arguments
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["location"], "Tokyo");
+    assert_eq!(args["units"], "celsius");
+}
+
+#[tokio::test]
+async fn test_deepseek_multiple_tools() {
+    let parser = DeepSeekParser::new();
+
+    let input = r#"<｜tool▁calls▁begin｜>
+<｜tool▁call▁begin｜>function<｜tool▁sep｜>search
+```json
+{"query": "rust programming"}
+```<｜tool▁call▁end｜>
+<｜tool▁call▁begin｜>function<｜tool▁sep｜>translate
+```json
+{"text": "Hello World", "to": "ja"}
+```<｜tool▁call▁end｜>
+<｜tool▁calls▁end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "search");
+    assert_eq!(result[1].function.name, "translate");
+}
+
+#[tokio::test]
+async fn test_deepseek_streaming() {
+    let parser = DeepSeekParser::new();
+    let mut state = ParseState::new();
+
+    // Simulate streaming chunks
+    let chunks = vec![
+        "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>",
+        "function<｜tool▁sep｜>get_weather\n",
+        "```json\n",
+        r#"{"location": "#,
+        r#""Beijing", "#,
+        r#""units": "metric"}"#,
+        "\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>",
+    ];
+
+    let mut found_name = false;
+    let mut found_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+
+        match result {
+            StreamResult::ToolName { name, .. } => {
+                assert_eq!(name, "get_weather");
+                found_name = true;
+            }
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "get_weather");
+                found_complete = true;
+            }
+            _ => {}
+        }
+    }
+
+    assert!(found_name || found_complete);
+}
+
+#[tokio::test]
+async fn test_deepseek_nested_json() {
+    let parser = DeepSeekParser::new();
+
+    let input = r#"<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>process
+```json
+{
+    "data": {
+        "nested": {
+            "deep": [1, 2, 3]
+        }
+    }
+}
+```<｜tool▁call▁end｜><｜tool▁calls▁end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "process");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["data"]["nested"]["deep"].is_array());
+}
+
+#[test]
+fn test_deepseek_format_detection() {
+    let parser = DeepSeekParser::new();
+
+    // Should detect DeepSeek format
+    assert!(parser.detect_format("<｜tool▁calls▁begin｜>"));
+    assert!(parser.detect_format("text with <｜tool▁calls▁begin｜> marker"));
+
+    // Should not detect other formats
+    assert!(!parser.detect_format("[TOOL_CALLS]"));
+    assert!(!parser.detect_format("<tool_call>"));
+    assert!(!parser.detect_format("plain text"));
+}
+
+#[tokio::test]
+async fn test_deepseek_malformed_json_handling() {
+    let parser = DeepSeekParser::new();
+
+    // Malformed JSON should be skipped
+    let input = r#"<｜tool▁calls▁begin｜>
+<｜tool▁call▁begin｜>function<｜tool▁sep｜>broken
+```json
+{invalid json}
+```<｜tool▁call▁end｜>
+<｜tool▁call▁begin｜>function<｜tool▁sep｜>valid
+```json
+{"key": "value"}
+```<｜tool▁call▁end｜>
+<｜tool▁calls▁end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    // Only the valid tool call should be parsed
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "valid");
+}
+
+#[tokio::test]
+async fn test_normal_text_extraction() {
+    let parser = DeepSeekParser::new();
+
+    // Python extracts text before tool calls as normal_text
+    let input = r#"Let me help you with that.
+<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather
+```json
+{"location": "Tokyo"}
+```<｜tool▁call▁end｜><｜tool▁calls▁end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    // TODO: Verify normal text extraction when parser returns it
+    // In Python: normal_text = "Let me help you with that."
+}
+
+#[tokio::test]
+async fn test_multiple_tool_calls() {
+    let parser = DeepSeekParser::new();
+
+    let input = r#"<｜tool▁calls▁begin｜>
+<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather
+```json
+{"location": "Tokyo"}
+```<｜tool▁call▁end｜>
+<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather
+```json
+{"location": "Paris"}
+```<｜tool▁call▁end｜>
+<｜tool▁calls▁end｜><｜end▁of▁sentence｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "get_weather");
+    assert_eq!(result[1].function.name, "get_weather");
+}
diff --git a/sgl-router/tests/tool_parser_edge_cases.rs b/sgl-router/tests/tool_parser_edge_cases.rs
new file mode 100644
index 000000000..5738f650b
--- /dev/null
+++ b/sgl-router/tests/tool_parser_edge_cases.rs
@@ -0,0 +1,330 @@
+//! Edge Cases and Error Handling Tests
+//!
+//! Tests for malformed input, edge cases, and error recovery
+
+use sglang_router_rs::tool_parser::{
+    JsonParser, MistralParser, ParseState, ParserRegistry, PythonicParser, QwenParser,
+    StreamResult, ToolParser,
+};
+
+#[tokio::test]
+async fn test_empty_input() {
+    let registry = ParserRegistry::new();
+    let parsers = vec!["json", "mistral", "qwen", "pythonic", "llama"];
+
+    for parser_name in parsers {
+        let parser = registry
+            .get_parser(&format!("test-{}", parser_name))
+            .unwrap();
+        let result = parser.parse_complete("").await.unwrap();
+        assert_eq!(
+            result.len(),
+            0,
+            "Parser {} should return empty for empty input",
+            parser_name
+        );
+    }
+}
+
+#[tokio::test]
+async fn test_plain_text_no_tools() {
+    let plain_text = "This is just a regular response with no tool calls whatsoever.";
+
+    let json_parser = JsonParser::new();
+    assert_eq!(
+        json_parser.parse_complete(plain_text).await.unwrap().len(),
+        0
+    );
+
+    let mistral_parser = MistralParser::new();
+    assert_eq!(
+        mistral_parser
+            .parse_complete(plain_text)
+            .await
+            .unwrap()
+            .len(),
+        0
+    );
+
+    let qwen_parser = QwenParser::new();
+    assert_eq!(
+        qwen_parser.parse_complete(plain_text).await.unwrap().len(),
+        0
+    );
+
+    let pythonic_parser = PythonicParser::new();
+    assert_eq!(
+        pythonic_parser
+            .parse_complete(plain_text)
+            .await
+            .unwrap()
+            .len(),
+        0
+    );
+}
+
+#[tokio::test]
+async fn test_incomplete_json() {
+    let json_parser = JsonParser::new();
+
+    let incomplete_cases = vec![
+        r#"{"name": "test""#,                 // Missing closing brace
+        r#"{"name": "test", "arguments":"#,   // Incomplete arguments
+        r#"{"name": "test", "arguments": {"#, // Incomplete nested object
+    ];
+
+    for input in incomplete_cases {
+        let result = json_parser.parse_complete(input).await.unwrap();
+        assert_eq!(
+            result.len(),
+            0,
+            "Should not parse incomplete JSON: {}",
+            input
+        );
+    }
+
+    // This case might actually parse because [{"name": "test"}] is complete
+    // The trailing comma suggests more items but the first item is valid
+    let _result = json_parser
+        .parse_complete(r#"[{"name": "test"},"#)
+        .await
+        .unwrap();
+    // This could parse the first element or return empty - implementation dependent
+}
+
+#[tokio::test]
+async fn test_malformed_mistral() {
+    let parser = MistralParser::new();
+
+    let malformed_cases = vec![
+        "[TOOL_CALLS]",                // Missing array
+        "[TOOL_CALLS] {",              // Not an array
+        "[TOOL_CALLS] [",              // Incomplete array
+        "[TOOL_CALLS] [{]",            // Invalid JSON in array
+        "[TOOL_CALLS] [{\"name\": }]", // Invalid value
+    ];
+
+    for input in malformed_cases {
+        // Parser might return error or empty vec for malformed input
+        if let Ok(result) = parser.parse_complete(input).await {
+            assert_eq!(
+                result.len(),
+                0,
+                "Should not parse malformed Mistral: {}",
+                input
+            );
+        }
+        // Error is also acceptable for malformed input
+    }
+}
+
+#[tokio::test]
+async fn test_missing_required_fields() {
+    let json_parser = JsonParser::new();
+
+    // Missing name field
+    let input = r#"{"arguments": {"x": 1}}"#;
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0, "Should not parse without name field");
+
+    // Name is not a string
+    let input = r#"{"name": 123, "arguments": {}}"#;
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0, "Should not parse with non-string name");
+}
+
+#[tokio::test]
+async fn test_very_long_strings() {
+    let json_parser = JsonParser::new();
+
+    let long_string = "x".repeat(10000);
+    let input = format!(
+        r#"{{"name": "test", "arguments": {{"data": "{}"}}}}"#,
+        long_string
+    );
+
+    let result = json_parser.parse_complete(&input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["data"].as_str().unwrap().len(), 10000);
+}
+
+#[tokio::test]
+async fn test_unicode_edge_cases() {
+    let json_parser = JsonParser::new();
+
+    // Various Unicode characters including emojis, CJK, RTL text
+    let input = r#"{"name": "translate", "arguments": {"text": "Hello 世界 🌍 مرحبا עולם"}}"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Hello 世界 🌍 مرحبا עולם");
+}
+
+#[tokio::test]
+async fn test_nested_brackets_in_strings() {
+    // Test that parsers correctly handle brackets within string literals
+
+    let mistral_parser = MistralParser::new();
+    let input = r#"[TOOL_CALLS] [{"name": "echo", "arguments": {"text": "Array: [1, 2, 3]"}}]"#;
+    let result = mistral_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Array: [1, 2, 3]");
+
+    let pythonic_parser = PythonicParser::new();
+    let input = r#"[echo(text="List: [a, b, c]")]"#;
+    let result = pythonic_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "List: [a, b, c]");
+}
+
+#[tokio::test]
+async fn test_multiple_formats_in_text() {
+    // Test that parsers don't get confused by other formats in the text
+
+    let json_parser = JsonParser::new();
+    let input = r#"
+    Here's some text with [TOOL_CALLS] that shouldn't trigger.
+    {"name": "actual_tool", "arguments": {}}
+    And some more text with <tool_call> tags.
+    "#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "actual_tool");
+}
+
+#[tokio::test]
+async fn test_escaped_characters() {
+    let json_parser = JsonParser::new();
+
+    let input = r#"{"name": "write", "arguments": {"content": "Line 1\nLine 2\r\nLine 3\tTabbed\\Backslash\"Quote"}}"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    let content = args["content"].as_str().unwrap();
+    assert!(content.contains('\n'));
+    assert!(content.contains('\t'));
+    assert!(content.contains('\\'));
+    assert!(content.contains('"'));
+}
+
+#[tokio::test]
+async fn test_numeric_edge_cases() {
+    let json_parser = JsonParser::new();
+
+    let input = r#"{
+        "name": "calculate",
+        "arguments": {
+            "int": 42,
+            "float": 123.456,
+            "scientific": 1.23e-4,
+            "negative": -999,
+            "zero": 0,
+            "large": 9007199254740991
+        }
+    }"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["int"], 42);
+    assert_eq!(args["float"], 123.456);
+    assert_eq!(args["scientific"], 0.000123);
+    assert_eq!(args["negative"], -999);
+    assert_eq!(args["zero"], 0);
+    assert_eq!(args["large"], 9007199254740991i64);
+}
+
+#[tokio::test]
+async fn test_null_and_boolean_values() {
+    let json_parser = JsonParser::new();
+
+    let input = r#"{
+        "name": "configure",
+        "arguments": {
+            "enabled": true,
+            "disabled": false,
+            "optional": null
+        }
+    }"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["enabled"], true);
+    assert_eq!(args["disabled"], false);
+    assert_eq!(args["optional"], serde_json::Value::Null);
+}
+
+#[tokio::test]
+async fn test_partial_token_at_buffer_boundary() {
+    let parser = QwenParser::new();
+    let mut state = ParseState::new();
+
+    // Test case that would fail with the bug:
+    // Send exactly "<tool" which is a 5-character prefix of "<tool_call>\n"
+    let result = parser.parse_incremental("<tool", &mut state).await.unwrap();
+    assert!(matches!(result, StreamResult::Incomplete));
+    assert_eq!(state.buffer, "<tool");
+
+    // Complete the token
+    let result = parser
+        .parse_incremental(
+            "_call>\n{\"name\": \"test\", \"arguments\": {}}\n</tool_call>",
+            &mut state,
+        )
+        .await
+        .unwrap();
+
+    // Should successfully parse after completing
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "test");
+        }
+        _ => {
+            // In Phase 2 simplified streaming, might get Incomplete
+            // The important thing is it didn't fail to recognize the partial token
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_exact_prefix_lengths() {
+    let parser = QwenParser::new();
+
+    // Test various exact prefix lengths that would be missed by exclusive range
+    let test_cases = vec![
+        ("<", 1),            // 1-char prefix
+        ("<t", 2),           // 2-char prefix
+        ("<tool", 5),        // 5-char prefix (the main bug case)
+        ("<tool_call", 10),  // 10-char prefix
+        ("<tool_call>", 11), // 11-char prefix (full start without \n)
+    ];
+
+    for (prefix, expected_len) in test_cases {
+        let mut state = ParseState::new();
+        let result = parser.parse_incremental(prefix, &mut state).await.unwrap();
+        assert!(
+            matches!(result, StreamResult::Incomplete),
+            "Prefix '{}' (len {}) should be incomplete",
+            prefix,
+            expected_len
+        );
+        assert_eq!(
+            state.buffer, prefix,
+            "Buffer should contain the prefix '{}'",
+            prefix
+        );
+    }
+}
diff --git a/sgl-router/tests/tool_parser_glm4_moe.rs b/sgl-router/tests/tool_parser_glm4_moe.rs
new file mode 100644
index 000000000..bae8fe727
--- /dev/null
+++ b/sgl-router/tests/tool_parser_glm4_moe.rs
@@ -0,0 +1,194 @@
+//! GLM-4 MoE Parser Integration Tests
+
+use sglang_router_rs::tool_parser::{Glm4MoeParser, ParseState, StreamResult, ToolParser};
+
+#[tokio::test]
+async fn test_glm4_complete_parsing() {
+    let parser = Glm4MoeParser::new();
+
+    // Test single tool call
+    let input = r#"Let me search for that.
+<tool_call>get_weather
+<arg_key>city</arg_key>
+<arg_value>Beijing</arg_value>
+<arg_key>date</arg_key>
+<arg_value>2024-12-25</arg_value>
+</tool_call>
+The weather will be..."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    // Verify arguments
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["city"], "Beijing");
+    assert_eq!(args["date"], "2024-12-25");
+}
+
+#[tokio::test]
+async fn test_glm4_multiple_tools() {
+    let parser = Glm4MoeParser::new();
+
+    let input = r#"<tool_call>search
+<arg_key>query</arg_key>
+<arg_value>rust tutorials</arg_value>
+</tool_call>
+<tool_call>translate
+<arg_key>text</arg_key>
+<arg_value>Hello World</arg_value>
+<arg_key>target_lang</arg_key>
+<arg_value>zh</arg_value>
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "search");
+    assert_eq!(result[1].function.name, "translate");
+}
+
+#[tokio::test]
+async fn test_glm4_type_conversion() {
+    let parser = Glm4MoeParser::new();
+
+    // Test various value types
+    let input = r#"<tool_call>process
+<arg_key>count</arg_key>
+<arg_value>42</arg_value>
+<arg_key>rate</arg_key>
+<arg_value>1.5</arg_value>
+<arg_key>enabled</arg_key>
+<arg_value>true</arg_value>
+<arg_key>data</arg_key>
+<arg_value>null</arg_value>
+<arg_key>text</arg_key>
+<arg_value>string value</arg_value>
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["count"], 42);
+    assert_eq!(args["rate"], 1.5);
+    assert_eq!(args["enabled"], true);
+    assert_eq!(args["data"], serde_json::Value::Null);
+    assert_eq!(args["text"], "string value");
+}
+
+#[tokio::test]
+async fn test_glm4_streaming() {
+    let parser = Glm4MoeParser::new();
+    let mut state = ParseState::new();
+
+    // Simulate streaming chunks
+    let chunks = vec![
+        "<tool_call>",
+        "get_weather\n",
+        "<arg_key>city</arg_key>\n",
+        "<arg_value>Shanghai</arg_value>\n",
+        "<arg_key>units</arg_key>\n",
+        "<arg_value>celsius</arg_value>\n",
+        "</tool_call>",
+    ];
+
+    let mut found_name = false;
+    let mut found_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+
+        match result {
+            StreamResult::ToolName { name, .. } => {
+                assert_eq!(name, "get_weather");
+                found_name = true;
+            }
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "get_weather");
+                found_complete = true;
+            }
+            _ => {}
+        }
+    }
+
+    assert!(found_name || found_complete);
+}
+
+#[test]
+fn test_glm4_format_detection() {
+    let parser = Glm4MoeParser::new();
+
+    // Should detect GLM-4 format
+    assert!(parser.detect_format("<tool_call>"));
+    assert!(parser.detect_format("text with <tool_call> marker"));
+
+    // Should not detect other formats
+    assert!(!parser.detect_format("[TOOL_CALLS]"));
+    assert!(!parser.detect_format("<｜tool▁calls▁begin｜>"));
+    assert!(!parser.detect_format("plain text"));
+}
+
+#[tokio::test]
+async fn test_glm4_python_literal_values() {
+    let parser = Glm4MoeParser::new();
+
+    // Test Python-style boolean values
+    let input = r#"<tool_call>config
+<arg_key>debug</arg_key>
+<arg_value>True</arg_value>
+<arg_key>verbose</arg_key>
+<arg_value>False</arg_value>
+<arg_key>optional</arg_key>
+<arg_value>None</arg_value>
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["debug"], true);
+    assert_eq!(args["verbose"], false);
+    assert_eq!(args["optional"], serde_json::Value::Null);
+}
+
+#[tokio::test]
+async fn test_python_literals() {
+    let parser = Glm4MoeParser::new();
+
+    let input = r#"<tool_call>test_func
+<arg_key>bool_true</arg_key>
+<arg_value>True</arg_value>
+<arg_key>bool_false</arg_key>
+<arg_value>False</arg_value>
+<arg_key>none_val</arg_key>
+<arg_value>None</arg_value>
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test_func");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["bool_true"], true);
+    assert_eq!(args["bool_false"], false);
+    assert_eq!(args["none_val"], serde_json::Value::Null);
+}
+
+#[tokio::test]
+async fn test_nested_values() {
+    let parser = Glm4MoeParser::new();
+
+    let input = r#"<tool_call>process
+<arg_key>data</arg_key>
+<arg_value>{"nested": {"key": "value"}}</arg_value>
+<arg_key>list</arg_key>
+<arg_value>[1, 2, 3]</arg_value>
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["data"].is_object());
+    assert!(args["list"].is_array());
+}
diff --git a/sgl-router/tests/tool_parser_gpt_oss.rs b/sgl-router/tests/tool_parser_gpt_oss.rs
new file mode 100644
index 000000000..50dc0be15
--- /dev/null
+++ b/sgl-router/tests/tool_parser_gpt_oss.rs
@@ -0,0 +1,201 @@
+//! GPT-OSS Parser Integration Tests
+
+use sglang_router_rs::tool_parser::{GptOssParser, ParseState, StreamResult, ToolParser};
+
+#[tokio::test]
+async fn test_gpt_oss_complete_parsing() {
+    let parser = GptOssParser::new();
+
+    // Test single tool call
+    let input = r#"Let me search for that information.
+<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query": "rust programming", "limit": 10}<|call|>
+Here are the results..."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search");
+
+    // Verify arguments
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["query"], "rust programming");
+    assert_eq!(args["limit"], 10);
+}
+
+#[tokio::test]
+async fn test_gpt_oss_multiple_tools() {
+    let parser = GptOssParser::new();
+
+    let input = r#"<|channel|>commentary to=functions.get_weather<|constrain|>json<|message|>{"location": "Paris"}<|call|>commentary
+<|channel|>commentary to=functions.search<|constrain|>json<|message|>{"query": "Paris tourism"}<|call|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "get_weather");
+    assert_eq!(result[1].function.name, "search");
+}
+
+#[tokio::test]
+async fn test_gpt_oss_with_namespace() {
+    let parser = GptOssParser::new();
+
+    // Test with different namespace patterns
+    let input = r#"<|channel|>commentary to=api.users.create<|constrain|>json<|message|>{"name": "John", "email": "john@example.com"}<|call|>
+<|channel|>commentary to=tools.calculator.add<|constrain|>json<|message|>{"x": 10, "y": 20}<|call|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "create"); // Should extract last part
+    assert_eq!(result[1].function.name, "add");
+}
+
+#[tokio::test]
+async fn test_gpt_oss_with_assistant_prefix() {
+    let parser = GptOssParser::new();
+
+    // Test with <|start|>assistant prefix
+    let input = r#"<|start|>assistant<|channel|>commentary to=functions.test<|constrain|>json<|message|>{"key": "value"}<|call|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+}
+
+#[tokio::test]
+async fn test_gpt_oss_empty_args() {
+    let parser = GptOssParser::new();
+
+    // Test with empty arguments
+    let input =
+        r#"<|channel|>commentary to=functions.get_time<|constrain|>json<|message|>{}<|call|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_time");
+    assert_eq!(result[0].function.arguments, "{}");
+}
+
+#[tokio::test]
+async fn test_gpt_oss_streaming() {
+    let parser = GptOssParser::new();
+    let mut state = ParseState::new();
+
+    // Simulate streaming chunks
+    let chunks = vec![
+        "<|channel|>commentary to=",
+        "functions.calculate",
+        "<|constrain|>json<|message|>",
+        r#"{"x": 10"#,
+        r#", "y": 20}"#,
+        "<|call|>",
+    ];
+
+    let mut found_name = false;
+    let mut found_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+
+        match result {
+            StreamResult::ToolName { name, .. } => {
+                assert_eq!(name, "calculate");
+                found_name = true;
+            }
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "calculate");
+                found_complete = true;
+            }
+            _ => {}
+        }
+    }
+
+    assert!(found_name || found_complete);
+}
+
+#[test]
+fn test_gpt_oss_format_detection() {
+    let parser = GptOssParser::new();
+
+    // Should detect GPT-OSS format
+    assert!(parser.detect_format("<|channel|>commentary to="));
+    assert!(parser.detect_format("<|channel|>commentary"));
+    assert!(parser.detect_format("text with <|channel|>commentary to= marker"));
+
+    // Should not detect other formats
+    assert!(!parser.detect_format("[TOOL_CALLS]"));
+    assert!(!parser.detect_format("<tool_call>"));
+    assert!(!parser.detect_format("plain text"));
+}
+
+#[tokio::test]
+async fn test_gpt_oss_with_whitespace() {
+    let parser = GptOssParser::new();
+
+    // Test with whitespace after function name
+    let input = r#"<|channel|>commentary to=functions.test  <|constrain|>json<|message|>{"key": "value"}<|call|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+}
+
+#[tokio::test]
+async fn test_gpt_oss_complex_json() {
+    let parser = GptOssParser::new();
+
+    // Test with complex nested JSON
+    let input = r#"<|channel|>commentary to=functions.process<|constrain|>json<|message|>{
+    "nested": {
+        "data": [1, 2, 3],
+        "config": {
+            "enabled": true
+        }
+    }
+}<|call|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "process");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["nested"]["data"].is_array());
+    assert_eq!(args["nested"]["config"]["enabled"], true);
+}
+
+#[tokio::test]
+async fn test_commentary_without_function() {
+    let parser = GptOssParser::new();
+
+    // Python should extract commentary as normal text
+    let input = r#"<|channel|>commentary<|message|>**Action plan**: 1. Do X 2. Do Y<|end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0); // No tool calls
+                                 // TODO: Verify normal text = "**Action plan**: 1. Do X 2. Do Y"
+}
+
+#[tokio::test]
+async fn test_final_channel() {
+    let parser = GptOssParser::new();
+
+    let input = r#"<|channel|>commentary to=functions.test<|constrain|>json<|message|>{"x": 1}<|call|>
+<|channel|>final<|message|>The result is calculated.<|return|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+    // TODO: Verify normal text = "The result is calculated."
+}
+
+#[tokio::test]
+async fn test_mixed_commentary_and_calls() {
+    let parser = GptOssParser::new();
+
+    let input = r#"<|channel|>commentary<|message|>Let me think<|end|>
+<|channel|>commentary to=functions.calc<|constrain|>json<|message|>{"x": 5}<|call|>
+<|channel|>commentary<|message|>Processing...<|end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "calc");
+    // TODO: Verify normal text = "Let me think Processing..."
+}
diff --git a/sgl-router/tests/tool_parser_json.rs b/sgl-router/tests/tool_parser_json.rs
new file mode 100644
index 000000000..c8c42b70f
--- /dev/null
+++ b/sgl-router/tests/tool_parser_json.rs
@@ -0,0 +1,147 @@
+//! JSON Parser Integration Tests
+//!
+//! Tests for the JSON parser which handles OpenAI, Claude, and generic JSON formats
+
+use serde_json::json;
+use sglang_router_rs::tool_parser::{JsonParser, ToolParser};
+
+#[tokio::test]
+async fn test_simple_json_tool_call() {
+    let parser = JsonParser::new();
+    let input = r#"{"name": "get_weather", "arguments": {"location": "San Francisco"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["location"], "San Francisco");
+}
+
+#[tokio::test]
+async fn test_json_array_of_tools() {
+    let parser = JsonParser::new();
+    let input = r#"[
+        {"name": "get_weather", "arguments": {"location": "SF"}},
+        {"name": "search", "arguments": {"query": "news"}}
+    ]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "get_weather");
+    assert_eq!(result[1].function.name, "search");
+}
+
+#[tokio::test]
+async fn test_json_with_parameters_key() {
+    let parser = JsonParser::new();
+    let input = r#"{"name": "calculate", "parameters": {"x": 10, "y": 20}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "calculate");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["x"], 10);
+    assert_eq!(args["y"], 20);
+}
+
+#[tokio::test]
+async fn test_json_extraction_from_text() {
+    let parser = JsonParser::new();
+    let input = r#"I'll help you with that. {"name": "search", "arguments": {"query": "rust"}} Let me search for that."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search");
+}
+
+#[tokio::test]
+async fn test_json_with_nested_objects() {
+    let parser = JsonParser::new();
+    let input = r#"{
+        "name": "update_config",
+        "arguments": {
+            "settings": {
+                "theme": "dark",
+                "language": "en",
+                "notifications": {
+                    "email": true,
+                    "push": false
+                }
+            }
+        }
+    }"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "update_config");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["settings"]["theme"], "dark");
+    assert_eq!(args["settings"]["notifications"]["email"], true);
+}
+
+#[tokio::test]
+async fn test_json_with_special_characters() {
+    let parser = JsonParser::new();
+    let input = r#"{"name": "echo", "arguments": {"text": "Line 1\nLine 2\tTabbed", "path": "C:\\Users\\test"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Line 1\nLine 2\tTabbed");
+    assert_eq!(args["path"], "C:\\Users\\test");
+}
+
+#[tokio::test]
+async fn test_json_with_unicode() {
+    let parser = JsonParser::new();
+    let input = r#"{"name": "translate", "arguments": {"text": "Hello 世界 🌍", "emoji": "😊"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Hello 世界 🌍");
+    assert_eq!(args["emoji"], "😊");
+}
+
+#[tokio::test]
+async fn test_json_empty_arguments() {
+    let parser = JsonParser::new();
+    let input = r#"{"name": "ping", "arguments": {}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "ping");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args, json!({}));
+}
+
+#[tokio::test]
+async fn test_json_invalid_format() {
+    let parser = JsonParser::new();
+
+    // Missing closing brace
+    let input = r#"{"name": "test", "arguments": {"key": "value""#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0);
+
+    // Not JSON at all
+    let input = "This is just plain text";
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0);
+}
+
+#[tokio::test]
+async fn test_json_format_detection() {
+    let parser = JsonParser::new();
+
+    assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#));
+    assert!(parser.detect_format(r#"[{"name": "test"}]"#));
+    assert!(!parser.detect_format("plain text"));
+    assert!(!parser.detect_format(r#"{"key": "value"}"#)); // No name field
+}
diff --git a/sgl-router/tests/tool_parser_kimik2.rs b/sgl-router/tests/tool_parser_kimik2.rs
new file mode 100644
index 000000000..66be2e88f
--- /dev/null
+++ b/sgl-router/tests/tool_parser_kimik2.rs
@@ -0,0 +1,160 @@
+//! Kimi K2 Parser Integration Tests
+
+use sglang_router_rs::tool_parser::{KimiK2Parser, ParseState, StreamResult, ToolParser};
+
+#[tokio::test]
+async fn test_kimik2_complete_parsing() {
+    let parser = KimiK2Parser::new();
+
+    // Test single tool call
+    let input = r#"Let me help you with that.
+<|tool_calls_section_begin|>
+<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location": "Tokyo", "units": "celsius"}<|tool_call_end|>
+<|tool_calls_section_end|>
+The weather in Tokyo is..."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    // Verify arguments
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["location"], "Tokyo");
+    assert_eq!(args["units"], "celsius");
+}
+
+#[tokio::test]
+async fn test_kimik2_multiple_tools() {
+    let parser = KimiK2Parser::new();
+
+    let input = r#"<|tool_calls_section_begin|>
+<|tool_call_begin|>functions.search:0<|tool_call_argument_begin|>{"query": "rust tutorials"}<|tool_call_end|>
+<|tool_call_begin|>functions.translate:1<|tool_call_argument_begin|>{"text": "Hello", "to": "ja"}<|tool_call_end|>
+<|tool_calls_section_end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "search");
+    assert_eq!(result[1].function.name, "translate");
+}
+
+#[tokio::test]
+async fn test_kimik2_with_whitespace() {
+    let parser = KimiK2Parser::new();
+
+    // Test with extra whitespace
+    let input = r#"<|tool_calls_section_begin|>
+<|tool_call_begin|> functions.test:0 <|tool_call_argument_begin|> {"key": "value", "num": 42} <|tool_call_end|>
+<|tool_calls_section_end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["key"], "value");
+    assert_eq!(args["num"], 42);
+}
+
+#[tokio::test]
+async fn test_kimik2_streaming() {
+    let parser = KimiK2Parser::new();
+    let mut state = ParseState::new();
+
+    // Simulate streaming chunks
+    let chunks = vec![
+        "<|tool_calls_section_begin|>\n",
+        "<|tool_call_begin|>functions.",
+        "calculate:0",
+        "<|tool_call_argument_begin|>",
+        r#"{"x": 10, "#,
+        r#""y": 20}"#,
+        "<|tool_call_end|>\n",
+        "<|tool_calls_section_end|>",
+    ];
+
+    let mut found_name = false;
+    let mut found_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+
+        match result {
+            StreamResult::ToolName { name, .. } => {
+                assert_eq!(name, "calculate");
+                found_name = true;
+            }
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "calculate");
+                found_complete = true;
+            }
+            _ => {}
+        }
+    }
+
+    assert!(found_name || found_complete);
+}
+
+#[test]
+fn test_kimik2_format_detection() {
+    let parser = KimiK2Parser::new();
+
+    // Should detect Kimi K2 format
+    assert!(parser.detect_format("<|tool_calls_section_begin|>"));
+    assert!(parser.detect_format("<|tool_call_begin|>"));
+    assert!(parser.detect_format("text with <|tool_calls_section_begin|> marker"));
+
+    // Should not detect other formats
+    assert!(!parser.detect_format("[TOOL_CALLS]"));
+    assert!(!parser.detect_format("<tool_call>"));
+    assert!(!parser.detect_format("plain text"));
+}
+
+#[tokio::test]
+async fn test_kimik2_sequential_indices() {
+    let parser = KimiK2Parser::new();
+
+    // Test with proper sequential indexing
+    let input = r#"<|tool_calls_section_begin|>
+<|tool_call_begin|>functions.first:0<|tool_call_argument_begin|>{"param": "a"}<|tool_call_end|>
+<|tool_call_begin|>functions.second:1<|tool_call_argument_begin|>{"param": "b"}<|tool_call_end|>
+<|tool_call_begin|>functions.third:2<|tool_call_argument_begin|>{"param": "c"}<|tool_call_end|>
+<|tool_calls_section_end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 3);
+    assert_eq!(result[0].function.name, "first");
+    assert_eq!(result[1].function.name, "second");
+    assert_eq!(result[2].function.name, "third");
+}
+
+#[tokio::test]
+async fn test_function_index_extraction() {
+    let parser = KimiK2Parser::new();
+
+    let input = r#"Text before tool calls.
+<|tool_calls_section_begin|>
+<|tool_call_begin|>functions.search:0<|tool_call_argument_begin|>{"query": "rust"}<|tool_call_end|>
+<|tool_call_begin|>functions.calc:1<|tool_call_argument_begin|>{"x": 10}<|tool_call_end|>
+<|tool_calls_section_end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "search");
+    assert_eq!(result[1].function.name, "calc");
+    // TODO: Verify indices are preserved: 0 and 1
+    // TODO: Verify normal text = "Text before tool calls."
+}
+
+#[tokio::test]
+async fn test_namespace_extraction() {
+    let parser = KimiK2Parser::new();
+
+    let input = r#"<|tool_calls_section_begin|>
+<|tool_call_begin|>api.tools.search:0<|tool_call_argument_begin|>{"q": "test"}<|tool_call_end|>
+<|tool_calls_section_end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search"); // Should extract after last dot
+}
diff --git a/sgl-router/tests/tool_parser_llama.rs b/sgl-router/tests/tool_parser_llama.rs
new file mode 100644
index 000000000..4f86ef2b2
--- /dev/null
+++ b/sgl-router/tests/tool_parser_llama.rs
@@ -0,0 +1,424 @@
+//! Llama Parser Integration Tests
+//!
+//! Tests for the Llama parser which handles <|python_tag|> format and plain JSON
+
+use sglang_router_rs::tool_parser::{LlamaParser, ToolParser};
+
+#[tokio::test]
+async fn test_llama_python_tag_format() {
+    let parser = LlamaParser::new();
+    let input = r#"<|python_tag|>{"name": "search", "arguments": {"query": "weather"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["query"], "weather");
+}
+
+#[tokio::test]
+async fn test_llama_plain_json_fallback() {
+    let parser = LlamaParser::new();
+    let input = r#"{"name": "calculate", "arguments": {"x": 5, "y": 10}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "calculate");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["x"], 5);
+    assert_eq!(args["y"], 10);
+}
+
+#[tokio::test]
+async fn test_llama_with_text_before() {
+    let parser = LlamaParser::new();
+    let input = r#"Let me help you with that. <|python_tag|>{"name": "get_time", "arguments": {"timezone": "UTC"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_time");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["timezone"], "UTC");
+}
+
+#[tokio::test]
+async fn test_llama_with_nested_json() {
+    let parser = LlamaParser::new();
+    let input = r#"<|python_tag|>{
+        "name": "update_settings",
+        "arguments": {
+            "preferences": {
+                "theme": "dark",
+                "language": "en"
+            },
+            "notifications": true
+        }
+    }"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "update_settings");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["preferences"]["theme"], "dark");
+    assert_eq!(args["notifications"], true);
+}
+
+#[tokio::test]
+async fn test_llama_empty_arguments() {
+    let parser = LlamaParser::new();
+
+    // With python_tag
+    let input = r#"<|python_tag|>{"name": "ping", "arguments": {}}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "ping");
+
+    // Plain JSON
+    let input = r#"{"name": "ping", "arguments": {}}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "ping");
+}
+
+#[tokio::test]
+async fn test_llama_format_detection() {
+    let parser = LlamaParser::new();
+
+    assert!(parser.detect_format(r#"<|python_tag|>{"name": "test"}"#));
+    assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#));
+    assert!(!parser.detect_format("plain text"));
+    assert!(!parser.detect_format(r#"{"key": "value"}"#)); // No name field
+}
+
+#[tokio::test]
+async fn test_llama_invalid_json_after_tag() {
+    let parser = LlamaParser::new();
+
+    let input = r#"<|python_tag|>{"name": invalid}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0);
+}
+
+#[tokio::test]
+async fn test_llama_real_world_output() {
+    let parser = LlamaParser::new();
+
+    // Actual output from Llama 3.2 model - simplified for testing
+    let input = r#"I'll search for that information for you.
+
+<|python_tag|>{"name": "web_search", "arguments": {"query": "Llama 3.2 model capabilities", "num_results": 5, "search_type": "recent"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "web_search");
+
+    // Test with nicely formatted JSON
+    let formatted_input = r#"<|python_tag|>{
+    "name": "get_current_time",
+    "arguments": {
+        "timezone": "America/New_York",
+        "format": "ISO8601"
+    }
+}"#;
+
+    let result2 = parser.parse_complete(formatted_input).await.unwrap();
+    assert_eq!(result2.len(), 1);
+    assert_eq!(result2[0].function.name, "get_current_time");
+}
+
+#[tokio::test]
+async fn test_llama_json_array_format() {
+    let parser = LlamaParser::new();
+
+    // Plain JSON array (should work as fallback)
+    let input = r#"[{"name": "func1", "arguments": {}}, {"name": "func2", "arguments": {}}]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    // Current implementation might handle this through JSON fallback
+    assert!(!result.is_empty());
+}
+
+#[tokio::test]
+async fn test_single_json() {
+    // Test parsing plain JSON without python_tag
+    let parser = LlamaParser::new();
+    let text = r#"{"name": "get_weather", "arguments": {"city": "Paris"}}"#;
+
+    let result = parser.parse_complete(text).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["city"], "Paris");
+}
+
+#[tokio::test]
+async fn test_multiple_json_with_separator() {
+    // Test multiple JSON objects with semicolon separator
+    let parser = LlamaParser::new();
+    let text = r#"<|python_tag|>{"name": "get_weather", "arguments": {"city": "Paris"}};{"name": "get_tourist_attractions", "arguments": {"city": "Paris"}}"#;
+
+    let result = parser.parse_complete(text).await.unwrap();
+    // Note: Current implementation may only parse the first one due to semicolon handling
+    assert!(!result.is_empty());
+    assert_eq!(result[0].function.name, "get_weather");
+}
+
+#[tokio::test]
+async fn test_multiple_json_with_separator_customized() {
+    // Test multiple JSON objects with python_tag repeated
+    let parser = LlamaParser::new();
+    let text = r#"<|python_tag|>{"name": "get_weather", "arguments": {}}<|python_tag|>{"name": "get_tourist_attractions", "arguments": {}}"#;
+
+    let result = parser.parse_complete(text).await.unwrap();
+    // Current implementation may handle this differently
+    assert!(!result.is_empty());
+    assert_eq!(result[0].function.name, "get_weather");
+}
+
+#[tokio::test]
+async fn test_json_with_trailing_text() {
+    // Test JSON with trailing text after
+    let parser = LlamaParser::new();
+    let text = r#"{"name": "get_weather", "arguments": {}} Some follow-up text"#;
+
+    let result = parser.parse_complete(text).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+}
+
+#[tokio::test]
+async fn test_invalid_then_valid_json() {
+    // Test error recovery - invalid JSON followed by valid JSON
+    let parser = LlamaParser::new();
+    let text = r#"{"name": "get_weather", "arguments": {{"name": "get_weather", "arguments": {}}"#;
+
+    let result = parser.parse_complete(text).await.unwrap();
+    // Should parse at least one valid JSON
+    if !result.is_empty() {
+        assert_eq!(result[0].function.name, "get_weather");
+    }
+}
+
+#[tokio::test]
+async fn test_plain_text_only() {
+    // Test plain text with no tool calls
+    let parser = LlamaParser::new();
+    let text = "This is just plain explanation text.";
+
+    let result = parser.parse_complete(text).await.unwrap();
+    assert_eq!(result.len(), 0);
+}
+
+#[tokio::test]
+async fn test_with_python_tag_prefix() {
+    // Test text before python_tag
+    let parser = LlamaParser::new();
+    let text = r#"Some intro. <|python_tag|>{"name": "get_weather", "arguments": {}}"#;
+
+    let result = parser.parse_complete(text).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+}
+
+// ============================================================================
+// STREAMING TESTS
+// ============================================================================
+
+#[tokio::test]
+async fn test_llama_streaming_simple() {
+    let parser = LlamaParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    // Send complete JSON at once
+    let full_json = r#"<|python_tag|>{"name": "search", "arguments": {"query": "weather"}}"#;
+
+    let result = parser
+        .parse_incremental(full_json, &mut state)
+        .await
+        .unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "search");
+        }
+        _ => panic!("Expected ToolComplete for complete JSON input"),
+    }
+}
+
+#[tokio::test]
+async fn test_llama_streaming_partial() {
+    let parser = LlamaParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    // Stream in chunks
+    let chunks = vec![
+        r#"<|python"#,
+        r#"_tag|>{"name": "#,
+        r#""calculate", "#,
+        r#""arguments": {"x": 10}"#,
+        r#"}"#,
+    ];
+
+    let mut got_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        if let sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) = result {
+            assert_eq!(tool.function.name, "calculate");
+            got_complete = true;
+        }
+    }
+
+    assert!(got_complete, "Should have completed parsing");
+}
+
+#[tokio::test]
+async fn test_llama_streaming_plain_json() {
+    let parser = LlamaParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    // Stream plain JSON without python_tag
+    let chunks = vec![
+        r#"{"name": "#,
+        r#""search", "#,
+        r#""arguments": "#,
+        r#"{"query": "#,
+        r#""test"}}"#,
+    ];
+
+    let mut got_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        if let sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) = result {
+            assert_eq!(tool.function.name, "search");
+            got_complete = true;
+        }
+    }
+
+    assert!(got_complete, "Should have completed parsing");
+}
+
+#[tokio::test]
+async fn test_llama_streaming_with_text_before() {
+    let parser = LlamaParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let chunks = vec![
+        r#"Let me help you. "#,
+        r#"<|python_tag|>"#,
+        r#"{"name": "get_time","#,
+        r#" "arguments": {"#,
+        r#""timezone": "UTC"}}"#,
+    ];
+
+    let mut got_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        if let sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) = result {
+            assert_eq!(tool.function.name, "get_time");
+            got_complete = true;
+        }
+    }
+
+    assert!(got_complete, "Should have completed parsing");
+}
+
+#[tokio::test]
+async fn test_llama_streaming_multiple_tools() {
+    // Test streaming multiple tool calls with semicolon separator
+    let parser = LlamaParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text =
+        r#"<|python_tag|>{"name": "func1", "arguments": {}};{"name": "func2", "arguments": {}}"#;
+
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    // Should get first tool complete
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "func1");
+        }
+        _ => panic!("Expected first tool to be complete"),
+    }
+
+    // Process remaining buffer to get second tool
+    let result2 = parser.parse_incremental("", &mut state).await.unwrap();
+    match result2 {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "func2");
+        }
+        _ => panic!("Expected second tool to be complete"),
+    }
+}
+
+#[tokio::test]
+async fn test_llama_streaming_multiple_tools_chunked() {
+    // Test streaming multiple tool calls arriving in chunks
+    let parser = LlamaParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    // First chunk - incomplete first JSON
+    let chunk1 = r#"<|python_tag|>{"name": "get_weather", "arguments""#;
+    let result1 = parser.parse_incremental(chunk1, &mut state).await.unwrap();
+
+    // Should be incomplete or have tool name
+    match result1 {
+        sglang_router_rs::tool_parser::StreamResult::Incomplete
+        | sglang_router_rs::tool_parser::StreamResult::ToolName { .. }
+        | sglang_router_rs::tool_parser::StreamResult::ToolArguments { .. } => {
+            // Expected - could get tool name or be incomplete or even partial args
+        }
+        _ => panic!(
+            "Expected incomplete or tool name for partial JSON, got: {:?}",
+            result1
+        ),
+    }
+
+    // Second chunk - complete first JSON and separator
+    let chunk2 = r#": {"city": "Paris"}};{"name": "#;
+    let result2 = parser.parse_incremental(chunk2, &mut state).await.unwrap();
+
+    // Should get first tool complete
+    match result2 {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["city"], "Paris");
+        }
+        _ => panic!("Expected first tool to be complete after separator"),
+    }
+
+    // Third chunk - complete second JSON
+    let chunk3 = r#""get_time", "arguments": {"timezone": "UTC"}}"#;
+    let result3 = parser.parse_incremental(chunk3, &mut state).await.unwrap();
+
+    // Should get second tool complete
+    match result3 {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_time");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["timezone"], "UTC");
+        }
+        _ => {
+            // If not complete yet, try one more empty chunk
+            let result4 = parser.parse_incremental("", &mut state).await.unwrap();
+            match result4 {
+                sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+                    assert_eq!(tool.function.name, "get_time");
+                    let args: serde_json::Value =
+                        serde_json::from_str(&tool.function.arguments).unwrap();
+                    assert_eq!(args["timezone"], "UTC");
+                }
+                _ => panic!("Expected second tool to be complete"),
+            }
+        }
+    }
+}
diff --git a/sgl-router/tests/tool_parser_mistral.rs b/sgl-router/tests/tool_parser_mistral.rs
new file mode 100644
index 000000000..3801006f5
--- /dev/null
+++ b/sgl-router/tests/tool_parser_mistral.rs
@@ -0,0 +1,153 @@
+//! Mistral Parser Integration Tests
+//!
+//! Tests for the Mistral parser which handles [TOOL_CALLS] format
+
+use serde_json::json;
+use sglang_router_rs::tool_parser::{MistralParser, ToolParser};
+
+#[tokio::test]
+async fn test_mistral_single_tool() {
+    let parser = MistralParser::new();
+    let input = r#"Let me search for that.
+[TOOL_CALLS] [{"name": "search_web", "arguments": {"query": "latest news", "max_results": 5}}]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search_web");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["query"], "latest news");
+    assert_eq!(args["max_results"], 5);
+}
+
+#[tokio::test]
+async fn test_mistral_multiple_tools() {
+    let parser = MistralParser::new();
+    let input = r#"I'll help you with both tasks.
+[TOOL_CALLS] [
+    {"name": "get_weather", "arguments": {"city": "Tokyo", "units": "celsius"}},
+    {"name": "search_news", "arguments": {"query": "AI developments", "limit": 10}}
+]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+
+    assert_eq!(result[0].function.name, "get_weather");
+    let args0: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args0["city"], "Tokyo");
+
+    assert_eq!(result[1].function.name, "search_news");
+    let args1: serde_json::Value = serde_json::from_str(&result[1].function.arguments).unwrap();
+    assert_eq!(args1["query"], "AI developments");
+}
+
+#[tokio::test]
+async fn test_mistral_nested_json() {
+    let parser = MistralParser::new();
+    let input = r#"Processing complex data.
+[TOOL_CALLS] [{"name": "process_data", "arguments": {"config": {"nested": {"value": [1, 2, 3]}}, "enabled": true}}]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["config"]["nested"]["value"], json!([1, 2, 3]));
+    assert_eq!(args["enabled"], true);
+}
+
+#[tokio::test]
+async fn test_mistral_with_text_after() {
+    let parser = MistralParser::new();
+    let input = r#"[TOOL_CALLS] [{"name": "test", "arguments": {}}]
+
+And here's some text after the tool call that should be ignored."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+}
+
+#[tokio::test]
+async fn test_mistral_empty_arguments() {
+    let parser = MistralParser::new();
+    let input = r#"[TOOL_CALLS] [{"name": "ping", "arguments": {}}]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "ping");
+}
+
+#[tokio::test]
+async fn test_mistral_with_brackets_in_strings() {
+    let parser = MistralParser::new();
+    let input = r#"[TOOL_CALLS] [{"name": "echo", "arguments": {"text": "Array notation: arr[0] = value[1]"}}]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Array notation: arr[0] = value[1]");
+}
+
+#[tokio::test]
+async fn test_mistral_format_detection() {
+    let parser = MistralParser::new();
+
+    assert!(parser.detect_format("[TOOL_CALLS] ["));
+    assert!(parser.detect_format("Some text [TOOL_CALLS] ["));
+    assert!(!parser.detect_format("Just plain text"));
+    assert!(!parser.detect_format("[{\"name\": \"test\"}]")); // JSON array without TOOL_CALLS
+}
+
+#[tokio::test]
+async fn test_mistral_malformed_json() {
+    let parser = MistralParser::new();
+
+    // Missing closing bracket
+    let input = r#"[TOOL_CALLS] [{"name": "test", "arguments": {}"#;
+    if let Ok(result) = parser.parse_complete(input).await {
+        assert_eq!(result.len(), 0);
+    }
+    // Error is also acceptable for malformed input
+
+    // Invalid JSON inside
+    let input = r#"[TOOL_CALLS] [{"name": invalid}]"#;
+    if let Ok(result) = parser.parse_complete(input).await {
+        assert_eq!(result.len(), 0);
+    }
+    // Error is also acceptable for malformed input
+}
+
+#[tokio::test]
+async fn test_mistral_real_world_output() {
+    let parser = MistralParser::new();
+
+    // Actual output from Mistral model
+    let input = r#"I'll search for information about Rust programming and check the weather in San Francisco.
+
+[TOOL_CALLS] [
+    {
+        "name": "web_search",
+        "arguments": {
+            "query": "Rust programming language features 2024",
+            "max_results": 3,
+            "include_snippets": true
+        }
+    },
+    {
+        "name": "get_weather",
+        "arguments": {
+            "location": "San Francisco, CA",
+            "units": "fahrenheit",
+            "include_forecast": false
+        }
+    }
+]
+
+Let me execute these searches for you."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "web_search");
+    assert_eq!(result[1].function.name, "get_weather");
+}
diff --git a/sgl-router/tests/tool_parser_mixed_edge_cases.rs b/sgl-router/tests/tool_parser_mixed_edge_cases.rs
new file mode 100644
index 000000000..595fb74a0
--- /dev/null
+++ b/sgl-router/tests/tool_parser_mixed_edge_cases.rs
@@ -0,0 +1,301 @@
+//! Mixed Format and Additional Edge Case Tests
+//!
+//! Tests for edge cases across parsers and mixed format scenarios
+
+use serde_json::json;
+use sglang_router_rs::tool_parser::{
+    JsonParser, LlamaParser, MistralParser, ParseState, PythonicParser, QwenParser, StreamResult,
+    ToolParser,
+};
+
+#[tokio::test]
+async fn test_mixed_formats_in_text() {
+    // Test that parsers correctly ignore other formats' markers
+
+    let json_parser = JsonParser::new();
+    let input = r#"
+    Some text with [TOOL_CALLS] marker that shouldn't trigger.
+    Also has <tool_call> tags and [function()] syntax.
+    But here's the actual JSON: {"name": "test", "arguments": {}}
+    "#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+
+    // Mistral parser should ignore JSON and other formats
+    let mistral_parser = MistralParser::new();
+    let input = r#"
+    {"name": "fake"} [function()] <tool_call>
+    [TOOL_CALLS] [{"name": "real", "arguments": {}}]
+    "#;
+
+    let result = mistral_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "real");
+}
+
+#[tokio::test]
+async fn test_format_markers_in_string_content() {
+    // Test that format markers inside string content don't interfere
+
+    let pythonic_parser = PythonicParser::new();
+    let input = r#"[echo(text="Use [TOOL_CALLS] and <tool_call> in text")]"#;
+
+    let result = pythonic_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "Use [TOOL_CALLS] and <tool_call> in text");
+
+    let qwen_parser = QwenParser::new();
+    let input = r#"<tool_call>
+{"name": "log", "arguments": {"msg": "Found [function()] pattern"}}
+</tool_call>"#;
+
+    let result = qwen_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["msg"], "Found [function()] pattern");
+}
+
+#[tokio::test]
+async fn test_deeply_nested_json_structures() {
+    let json_parser = JsonParser::new();
+
+    let input = r#"{
+        "name": "deep_process",
+        "arguments": {
+            "level1": {
+                "level2": {
+                    "level3": {
+                        "level4": {
+                            "level5": {
+                                "data": [1, 2, [3, [4, 5]]]
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "deep_process");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["level1"]["level2"]["level3"]["level4"]["level5"]["data"].is_array());
+}
+
+#[tokio::test]
+async fn test_multiple_sequential_calls_different_formats() {
+    // Simulate a scenario where different parts of text have different formats
+    // (though each parser will only recognize its own format)
+
+    let llama_parser = LlamaParser::new();
+
+    // Llama parser currently only returns the first tool found
+    let input = r#"First call: <|python_tag|>{"name": "call1", "arguments": {}}"#;
+
+    let result = llama_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "call1");
+
+    // Test plain JSON separately
+    let input2 = r#"{"name": "call2", "arguments": {"x": 1}}"#;
+    let result2 = llama_parser.parse_complete(input2).await.unwrap();
+    assert_eq!(result2.len(), 1);
+    assert_eq!(result2[0].function.name, "call2");
+}
+
+#[tokio::test]
+async fn test_empty_and_whitespace_variations() {
+    let json_parser = JsonParser::new();
+
+    // Various whitespace scenarios
+    let cases = vec![
+        r#"  {"name":"compact","arguments":{}}  "#,
+        r#"
+
+        {"name": "spaced", "arguments": {}}
+
+        "#,
+        r#"	{"name": "tabbed", "arguments": {}}	"#, // tabs
+    ];
+
+    for input in cases {
+        let result = json_parser.parse_complete(input).await.unwrap();
+        assert_eq!(result.len(), 1, "Should parse regardless of whitespace");
+    }
+}
+
+#[tokio::test]
+async fn test_special_json_values() {
+    let json_parser = JsonParser::new();
+
+    // Test various special JSON values
+    let input = r#"{
+        "name": "test_special",
+        "arguments": {
+            "float_e": 1.23e10,
+            "float_neg_e": 1.23e-10,
+            "hex_like": "0x1234",
+            "very_long_num": 99999999999999999999,
+            "special_strings": ["", " ", "\u0000", "\u001f"],
+            "escaped": "\\n\\r\\t\\\"\\\\",
+            "unicode": "\u4e2d\u6587"
+        }
+    }"#;
+
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test_special");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["special_strings"].is_array());
+    assert!(args["escaped"].is_string());
+}
+
+#[tokio::test]
+async fn test_parser_recovery_after_invalid_input() {
+    let mut state = ParseState::new();
+    let parser = JsonParser::new();
+
+    // Send invalid JSON first
+    let _ = parser.parse_incremental(r#"{"broken": "#, &mut state).await;
+
+    // Clear state and try valid JSON
+    state.buffer.clear();
+    let result = parser
+        .parse_incremental(r#"{"name": "valid", "arguments": {}}"#, &mut state)
+        .await
+        .unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "valid");
+        }
+        _ => {
+            // Might be incomplete depending on implementation
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_boundary_cases_for_extraction() {
+    // Test edge cases in JSON extraction from text
+
+    let json_parser = JsonParser::new();
+
+    // JSON at the very beginning
+    let input = r#"{"name": "start", "arguments": {}} and then text"#;
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "start");
+
+    // JSON at the very end
+    let input = r#"Some text first {"name": "end", "arguments": {}}"#;
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "end");
+
+    // Multiple JSON objects in text (should find first valid one)
+    let input =
+        r#"Text {"name": "first", "arguments": {}} more {"name": "second", "arguments": {}}"#;
+    let result = json_parser.parse_complete(input).await.unwrap();
+    assert!(!result.is_empty());
+    assert_eq!(result[0].function.name, "first");
+}
+
+#[tokio::test]
+async fn test_pythonic_edge_cases() {
+    let parser = PythonicParser::new();
+
+    // Function name with underscores and numbers
+    let input = r#"[func_name_2(param_1="value")]"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "func_name_2");
+
+    // Empty string argument
+    let input = r#"[process(text="")]"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "");
+}
+
+#[tokio::test]
+async fn test_mistral_with_pretty_json() {
+    let parser = MistralParser::new();
+
+    // Pretty-printed JSON in Mistral format
+    let input = r#"[TOOL_CALLS] [
+        {
+            "name": "formatted",
+            "arguments": {
+                "nested": {
+                    "key": "value"
+                },
+                "array": [
+                    1,
+                    2,
+                    3
+                ]
+            }
+        }
+    ]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "formatted");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["nested"]["key"], "value");
+    assert_eq!(args["array"], json!([1, 2, 3]));
+}
+
+#[tokio::test]
+async fn test_qwen_with_cdata_like_content() {
+    let parser = QwenParser::new();
+
+    // Test with content that looks like CDATA but isn't
+    // Note: QwenParser expects exactly "<tool_call>\n" with the newline
+    let input = r#"<tool_call>
+{"name": "process", "arguments": {"xml": "<![CDATA[some data]]>"}}
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "process");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["xml"], "<![CDATA[some data]]>");
+}
+
+#[tokio::test]
+async fn test_extremely_long_function_names() {
+    let parser = PythonicParser::new();
+
+    let long_name = "very_long_function_name_that_might_appear_in_generated_code_somewhere";
+    let input = format!(r#"[{}(param="value")]"#, long_name);
+
+    let result = parser.parse_complete(&input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, long_name);
+}
+
+#[tokio::test]
+async fn test_json_with_duplicate_keys() {
+    let parser = JsonParser::new();
+
+    // JSON with duplicate keys (last one should win per JSON spec)
+    let input = r#"{"name": "test", "arguments": {"key": "first", "key": "second"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    // JSON parsers typically keep the last value for duplicate keys
+    assert_eq!(args["key"], "second");
+}
diff --git a/sgl-router/tests/tool_parser_pythonic.rs b/sgl-router/tests/tool_parser_pythonic.rs
new file mode 100644
index 000000000..4297fdbd3
--- /dev/null
+++ b/sgl-router/tests/tool_parser_pythonic.rs
@@ -0,0 +1,559 @@
+//! Pythonic Parser Integration Tests
+//!
+//! Tests for the Pythonic parser which handles Python function call syntax
+
+use serde_json::json;
+use sglang_router_rs::tool_parser::{PythonicParser, ToolParser};
+
+#[tokio::test]
+async fn test_pythonic_single_function() {
+    let parser = PythonicParser::new();
+    let input = r#"[get_weather(city="London", units="celsius")]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["city"], "London");
+    assert_eq!(args["units"], "celsius");
+}
+
+#[tokio::test]
+async fn test_pythonic_multiple_functions() {
+    let parser = PythonicParser::new();
+    let input =
+        r#"[search_web(query="Rust programming", max_results=5), get_time(timezone="UTC")]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "search_web");
+    assert_eq!(result[1].function.name, "get_time");
+
+    let args0: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args0["query"], "Rust programming");
+    assert_eq!(args0["max_results"], 5);
+}
+
+#[tokio::test]
+async fn test_pythonic_with_python_literals() {
+    let parser = PythonicParser::new();
+    let input = r#"[configure(enabled=True, disabled=False, optional=None)]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["enabled"], true);
+    assert_eq!(args["disabled"], false);
+    assert_eq!(args["optional"], json!(null));
+}
+
+#[tokio::test]
+async fn test_pythonic_with_lists_and_dicts() {
+    let parser = PythonicParser::new();
+    let input =
+        r#"[process_data(items=[1, 2, 3], config={"key": "value", "nested": {"deep": True}})]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["items"], json!([1, 2, 3]));
+    assert_eq!(args["config"]["key"], "value");
+    assert_eq!(args["config"]["nested"]["deep"], true);
+}
+
+#[tokio::test]
+async fn test_pythonic_with_special_tokens() {
+    let parser = PythonicParser::new();
+
+    // Llama 4 sometimes outputs these tokens
+    let input = r#"<|python_start|>[calculate(x=10, y=20)]<|python_end|>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "calculate");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["x"], 10);
+    assert_eq!(args["y"], 20);
+}
+
+#[tokio::test]
+async fn test_pythonic_with_nested_parentheses() {
+    let parser = PythonicParser::new();
+    let input = r#"[math_eval(expression="(2 + 3) * (4 - 1)", round_to=2)]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["expression"], "(2 + 3) * (4 - 1)");
+    assert_eq!(args["round_to"], 2);
+}
+
+#[tokio::test]
+async fn test_pythonic_with_escaped_quotes() {
+    let parser = PythonicParser::new();
+    let input = r#"[echo(text="She said \"Hello\" to him")]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["text"], "She said \"Hello\" to him");
+}
+
+#[tokio::test]
+async fn test_pythonic_empty_arguments() {
+    let parser = PythonicParser::new();
+    let input = r#"[ping()]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "ping");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args, json!({}));
+}
+
+#[tokio::test]
+async fn test_pythonic_format_detection() {
+    let parser = PythonicParser::new();
+
+    assert!(parser.detect_format("[function_name("));
+    assert!(parser.detect_format("[get_weather(city=\"NYC\")]"));
+    assert!(!parser.detect_format("Just plain text"));
+    assert!(!parser.detect_format("[1, 2, 3]")); // Plain list
+    assert!(!parser.detect_format("{\"name\": \"test\"}")); // JSON
+}
+
+#[tokio::test]
+async fn test_pythonic_invalid_syntax() {
+    let parser = PythonicParser::new();
+
+    // Missing closing bracket
+    let input = r#"[function(arg=value"#;
+    if let Ok(result) = parser.parse_complete(input).await {
+        assert_eq!(result.len(), 0);
+    }
+    // Error is also acceptable for invalid syntax
+
+    // Invalid Python syntax - empty parameter name
+    // Note: The parser currently accepts this invalid syntax and returns a result
+    // This is a known limitation of the current implementation
+    let input = r#"[function(=value)]"#;
+    if let Ok(result) = parser.parse_complete(input).await {
+        // The parser incorrectly accepts this, returning 1 result
+        // We'll accept this behavior for now but note it's not ideal
+        assert!(result.len() <= 1, "Should parse at most one function");
+    }
+    // Error would be the correct behavior
+}
+
+#[tokio::test]
+async fn test_pythonic_real_world_llama4() {
+    let parser = PythonicParser::new();
+
+    // Actual output from Llama 4 model
+    let input = r#"I'll help you with multiple tasks. Let me search for information and perform calculations.
+
+[web_search(query="latest Rust features", max_results=3, safe_search=True),
+ calculate(expression="42 * 3.14159", precision=2),
+ get_weather(city="San Francisco", units="fahrenheit", include_forecast=False)]
+
+These functions will provide the information you need."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 3);
+    assert_eq!(result[0].function.name, "web_search");
+    assert_eq!(result[1].function.name, "calculate");
+    assert_eq!(result[2].function.name, "get_weather");
+
+    let args0: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args0["query"], "latest Rust features");
+    assert_eq!(args0["safe_search"], true);
+}
+
+#[tokio::test]
+async fn test_pythonic_nested_brackets_in_lists() {
+    let parser = PythonicParser::new();
+
+    // Test nested brackets within list arguments
+    let input = r#"[process_matrix(data=[[1, 2], [3, 4]], labels=["row[0]", "row[1]"])]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "process_matrix");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["data"], json!([[1, 2], [3, 4]]));
+    assert_eq!(args["labels"], json!(["row[0]", "row[1]"]));
+}
+
+#[tokio::test]
+async fn test_pythonic_nested_brackets_in_dicts() {
+    let parser = PythonicParser::new();
+
+    // Test nested brackets within dictionary arguments
+    let input =
+        r#"[analyze(config={"patterns": ["[a-z]+", "[0-9]+"], "nested": {"list": [1, [2, 3]]}})]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "analyze");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["config"]["patterns"], json!(["[a-z]+", "[0-9]+"]));
+    assert_eq!(args["config"]["nested"]["list"], json!([1, [2, 3]]));
+}
+
+#[tokio::test]
+async fn test_pythonic_mixed_quotes() {
+    let parser = PythonicParser::new();
+
+    // Test mixed quote types in arguments
+    let input = r#"[format_text(single='Hello', double="World", mixed="It's \"quoted\"")]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "format_text");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["single"], "Hello");
+    assert_eq!(args["double"], "World");
+    assert_eq!(args["mixed"], "It's \"quoted\"");
+}
+
+#[tokio::test]
+async fn test_pythonic_complex_nesting() {
+    let parser = PythonicParser::new();
+
+    // Test complex nested structures
+    let input = r#"[transform(
+        matrix=[[1, [2, 3]], [4, [5, [6, 7]]]],
+        operations=[{"type": "scale", "factor": [2, 3]}, {"type": "rotate", "angle": 90}],
+        metadata={"tags": ["nested[0]", "nested[1]"], "config": {"depth": [1, 2, 3]}}
+    )]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "transform");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["matrix"].is_array());
+    assert!(args["operations"].is_array());
+    assert_eq!(args["operations"][0]["type"], "scale");
+    assert_eq!(args["metadata"]["config"]["depth"], json!([1, 2, 3]));
+}
+
+#[tokio::test]
+async fn test_parse_streaming_no_brackets() {
+    // Test parsing text with no brackets (no tool calls)
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = "This is just normal text without any tool calls.";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::Incomplete => {
+            // Expected - no tool calls found
+            assert_eq!(state.buffer, text);
+        }
+        _ => panic!("Should return Incomplete for text without tool calls"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_complete_tool_call() {
+    // Test parsing a complete tool call
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = "Here's a tool call: [get_weather(location='New York', unit='celsius')]";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["location"], "New York");
+            assert_eq!(args["unit"], "celsius");
+            assert_eq!(state.buffer, "");
+        }
+        _ => panic!("Should return ToolComplete for complete tool call"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_text_before_tool_call() {
+    // Test parsing text that appears before a tool call
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = "This is some text before [get_weather(location='London')]";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["location"], "London");
+        }
+        _ => panic!("Should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_partial_tool_call() {
+    // Test parsing a partial tool call that spans multiple chunks
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    // First chunk with opening bracket but no closing bracket
+    let text1 = "Let me check the weather: [get_weather(location=";
+    let result1 = parser.parse_incremental(text1, &mut state).await.unwrap();
+
+    match result1 {
+        sglang_router_rs::tool_parser::StreamResult::Incomplete => {
+            assert!(state.buffer.contains("[get_weather(location="));
+        }
+        _ => panic!("First chunk should return Incomplete"),
+    }
+
+    // Second chunk completing the tool call
+    let text2 = "'Paris')]";
+    let result2 = parser.parse_incremental(text2, &mut state).await.unwrap();
+
+    match result2 {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["location"], "Paris");
+            assert_eq!(state.buffer, "");
+        }
+        _ => panic!("Second chunk should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_bracket_without_text_before() {
+    // Test parsing a tool call that starts at the beginning of the text
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = "[search(query='python programming')]";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "search");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["query"], "python programming");
+        }
+        _ => panic!("Should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_text_after_tool_call() {
+    // Test parsing text that appears after a tool call
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    // First chunk with complete tool call and some text after
+    let text = "[get_weather(location='Tokyo')] Here's the forecast:";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            // Text after tool call should remain in buffer
+            // Note: Current implementation may clear buffer, this behavior needs verification
+        }
+        _ => panic!("Should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_multiple_tool_calls() {
+    // Test parsing multiple tool calls in sequence
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = "[get_weather(location='Berlin'), search(query='restaurants')]";
+
+    // Current implementation may handle this as a single parse
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    // The parser should handle multiple tools in one bracket pair
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(_) => {
+            // Expected behavior - parses first tool
+        }
+        _ => {
+            // Also acceptable if it returns Incomplete waiting for more
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_opening_bracket_only() {
+    // Test parsing text with only an opening bracket but no closing bracket
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = "Let's try this: [";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::Incomplete => {
+            assert!(state.buffer.ends_with("["));
+        }
+        _ => panic!("Should return Incomplete for partial bracket"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_nested_brackets() {
+    // Test parsing tool calls with nested brackets in arguments
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = "[get_weather(location='New York', unit='celsius', data=[1, 2, 3])]";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["location"], "New York");
+            assert_eq!(args["unit"], "celsius");
+            assert_eq!(args["data"], json!([1, 2, 3]));
+        }
+        _ => panic!("Should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_nested_brackets_dict() {
+    // Test parsing tool calls with nested dictionaries and lists
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text = r#"[search(query='test', config={'options': [1, 2], 'nested': {'key': 'value'}})]"#;
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "search");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["query"], "test");
+            assert_eq!(args["config"]["options"], json!([1, 2]));
+            assert_eq!(args["config"]["nested"]["key"], "value");
+        }
+        _ => panic!("Should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_multiple_tools_with_nested_brackets() {
+    // Test parsing multiple tool calls with nested brackets
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let text =
+        "[get_weather(location='Paris', data=[10, 20]), search(query='test', filters=['a', 'b'])]";
+    let result = parser.parse_incremental(text, &mut state).await.unwrap();
+
+    // Should parse both tools successfully
+    match result {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            // At least gets the first tool
+            assert_eq!(tool.function.name, "get_weather");
+        }
+        _ => panic!("Should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_partial_nested_brackets() {
+    // Test parsing partial tool calls with nested brackets across chunks
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    // First chunk with nested brackets but incomplete
+    let text1 = "Here's a call: [get_weather(location='Tokyo', data=[1, 2";
+    let result1 = parser.parse_incremental(text1, &mut state).await.unwrap();
+
+    match result1 {
+        sglang_router_rs::tool_parser::StreamResult::Incomplete => {
+            assert!(state
+                .buffer
+                .contains("[get_weather(location='Tokyo', data=[1, 2"));
+        }
+        _ => panic!("First chunk should return Incomplete"),
+    }
+
+    // Second chunk completing the nested brackets
+    let text2 = ", 3])]";
+    let result2 = parser.parse_incremental(text2, &mut state).await.unwrap();
+
+    match result2 {
+        sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["location"], "Tokyo");
+            assert_eq!(args["data"], json!([1, 2, 3]));
+        }
+        _ => panic!("Second chunk should return ToolComplete"),
+    }
+}
+
+#[tokio::test]
+async fn test_parse_streaming_with_python_start_and_end_token() {
+    // Test parsing a message that starts with <|python_start|> and <|python_end|> across chunks
+    let parser = PythonicParser::new();
+    let mut state = sglang_router_rs::tool_parser::ParseState::new();
+
+    let chunks = vec![
+        "Here's a call: ",
+        "<|python_",
+        "start|>[get_weather(location=",
+        "'Tokyo', data=[1, 2",
+        ", 3])]<|python_end|>",
+    ];
+
+    let mut got_tool = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        if let sglang_router_rs::tool_parser::StreamResult::ToolComplete(tool) = result {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["location"], "Tokyo");
+            assert_eq!(args["data"], json!([1, 2, 3]));
+            got_tool = true;
+        }
+    }
+
+    assert!(got_tool, "Should have parsed the tool call");
+}
+
+#[tokio::test]
+async fn test_detect_and_parse_with_python_start_and_end_token() {
+    // Test parsing a message that starts with <|python_start|> and contains a valid tool call
+    let parser = PythonicParser::new();
+
+    let text = "User wants to get the weather in Mars. <|python_start|>[get_weather(location='Mars', unit='celsius')]<|python_end|> In this way we will get the weather in Mars.";
+    let result = parser.parse_complete(text).await.unwrap();
+
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["location"], "Mars");
+    assert_eq!(args["unit"], "celsius");
+}
diff --git a/sgl-router/tests/tool_parser_qwen.rs b/sgl-router/tests/tool_parser_qwen.rs
new file mode 100644
index 000000000..979c105b0
--- /dev/null
+++ b/sgl-router/tests/tool_parser_qwen.rs
@@ -0,0 +1,259 @@
+//! Qwen Parser Integration Tests
+//!
+//! Tests for the Qwen parser which handles <tool_call>...</tool_call> format
+
+use serde_json::json;
+use sglang_router_rs::tool_parser::{ParseState, QwenParser, StreamResult, ToolParser};
+
+#[tokio::test]
+async fn test_qwen_single_tool() {
+    let parser = QwenParser::new();
+    let input = r#"<tool_call>
+{"name": "get_weather", "arguments": {"city": "Beijing", "units": "celsius"}}
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["city"], "Beijing");
+    assert_eq!(args["units"], "celsius");
+}
+
+#[tokio::test]
+async fn test_qwen_multiple_sequential_tools() {
+    let parser = QwenParser::new();
+    let input = r#"Let me help you with that.
+<tool_call>
+{"name": "search", "arguments": {"query": "Qwen model"}}
+</tool_call>
+<tool_call>
+{"name": "translate", "arguments": {"text": "Hello", "to": "zh"}}
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "search");
+    assert_eq!(result[1].function.name, "translate");
+}
+
+#[tokio::test]
+async fn test_qwen_pretty_printed_json() {
+    let parser = QwenParser::new();
+    let input = r#"<tool_call>
+{
+    "name": "create_document",
+    "arguments": {
+        "title": "Test Document",
+        "content": "This is a test",
+        "metadata": {
+            "author": "Qwen",
+            "tags": ["test", "example"]
+        }
+    }
+}
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "create_document");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["metadata"]["author"], "Qwen");
+    assert_eq!(args["metadata"]["tags"], json!(["test", "example"]));
+}
+
+#[tokio::test]
+async fn test_qwen_with_text_between() {
+    let parser = QwenParser::new();
+    let input = r#"First, let me search for information.
+<tool_call>
+{"name": "search", "arguments": {"query": "test"}}
+</tool_call>
+
+Now I'll translate something.
+
+<tool_call>
+{"name": "translate", "arguments": {"text": "world", "to": "es"}}
+</tool_call>
+Done!"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "search");
+    assert_eq!(result[1].function.name, "translate");
+}
+
+#[tokio::test]
+async fn test_qwen_empty_arguments() {
+    let parser = QwenParser::new();
+    let input = r#"<tool_call>
+{"name": "get_time", "arguments": {}}
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_time");
+}
+
+#[tokio::test]
+async fn test_qwen_with_newlines_in_strings() {
+    let parser = QwenParser::new();
+    let input = r#"<tool_call>
+{"name": "write_file", "arguments": {"content": "Line 1\nLine 2\nLine 3", "path": "/tmp/test.txt"}}
+</tool_call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["content"], "Line 1\nLine 2\nLine 3");
+}
+
+#[tokio::test]
+async fn test_qwen_format_detection() {
+    let parser = QwenParser::new();
+
+    assert!(parser.detect_format("<tool_call>"));
+    assert!(parser.detect_format("Some text <tool_call>\n{"));
+    assert!(!parser.detect_format("Just plain text"));
+    assert!(!parser.detect_format("{\"name\": \"test\"}")); // Plain JSON
+}
+
+#[tokio::test]
+async fn test_qwen_incomplete_tags() {
+    let parser = QwenParser::new();
+
+    // Missing closing tag
+    let input = r#"<tool_call>
+{"name": "test", "arguments": {}}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0);
+
+    // Missing opening tag
+    let input = r#"{"name": "test", "arguments": {}}
+</tool_call>"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0);
+}
+
+#[tokio::test]
+async fn test_qwen_real_world_output() {
+    let parser = QwenParser::new();
+
+    // Actual output from Qwen model
+    let input = r#"I'll help you search for information and perform calculations.
+
+<tool_call>
+{
+    "name": "web_search",
+    "arguments": {
+        "query": "quantum computing breakthroughs 2024",
+        "language": "en",
+        "region": "us",
+        "safe_search": true
+    }
+}
+</tool_call>
+
+Let me also calculate something for you:
+
+<tool_call>
+{
+    "name": "calculator",
+    "arguments": {
+        "expression": "sqrt(144) + 3^2",
+        "precision": 2
+    }
+}
+</tool_call>
+
+These tools will provide the information you need."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "web_search");
+    assert_eq!(result[1].function.name, "calculator");
+
+    let args0: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args0["query"], "quantum computing breakthroughs 2024");
+    assert_eq!(args0["safe_search"], true);
+}
+
+#[tokio::test]
+async fn test_buffer_drain_optimization() {
+    let parser = QwenParser::new();
+    let mut state = ParseState::new();
+
+    // First chunk - incomplete tool call
+    let chunk1 = "<tool_call>\n{\"name\": \"test1\", ";
+    let _result = parser.parse_incremental(chunk1, &mut state).await.unwrap();
+    // Phase 2 simplified streaming might not handle partial JSON correctly
+    // The important thing is buffer accumulation works
+    assert!(!state.buffer.is_empty());
+
+    // Complete first tool and start second
+    let chunk2 = "\"arguments\": {}}\n</tool_call><tool_call>\n{\"name\": \"test2\", ";
+    let result = parser.parse_incremental(chunk2, &mut state).await.unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "test1");
+            // After consuming the first tool, buffer should contain only the second tool start
+            assert!(state.buffer.starts_with("<tool_call>"));
+            assert!(state.buffer.contains("test2"));
+        }
+        _ => {
+            // Phase 2 simplified streaming might return Incomplete
+            // The important thing is the buffer is managed correctly
+        }
+    }
+
+    // Complete the second tool
+    let chunk3 = "\"arguments\": {\"x\": 1}}\n</tool_call>";
+    let result = parser.parse_incremental(chunk3, &mut state).await.unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "test2");
+            // Buffer should be empty after consuming all tools
+            assert!(state.buffer.is_empty() || !state.buffer.contains("</tool_call>"));
+        }
+        _ => {
+            // Phase 2 simplified streaming might handle this differently
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_buffer_efficiency_with_multiple_tools() {
+    let parser = QwenParser::new();
+    let mut state = ParseState::new();
+
+    // Send multiple complete tools at once
+    let input = r#"<tool_call>
+{"name": "tool1", "arguments": {"a": 1}}
+</tool_call><tool_call>
+{"name": "tool2", "arguments": {"b": 2}}
+</tool_call><tool_call>
+{"name": "tool3", "arguments": {"c": 3}}
+</tool_call>"#;
+
+    // This should efficiently process tools using drain() without creating new strings
+    let result = parser.parse_incremental(input, &mut state).await.unwrap();
+
+    // In Phase 2, this will likely parse only the first tool
+    // The important thing is that drain() doesn't cause any issues
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert!(["tool1", "tool2", "tool3"].contains(&tool.function.name.as_str()));
+        }
+        _ => {
+            // Simplified streaming might return Incomplete
+        }
+    }
+
+    // Verify no memory issues or panics occurred with drain()
+    // Test passes if we reach this point without panic
+}
diff --git a/sgl-router/tests/tool_parser_registry.rs b/sgl-router/tests/tool_parser_registry.rs
new file mode 100644
index 000000000..c98405eaf
--- /dev/null
+++ b/sgl-router/tests/tool_parser_registry.rs
@@ -0,0 +1,194 @@
+//! Parser Registry Integration Tests
+//!
+//! Tests for model-to-parser mappings and registry functionality
+
+use sglang_router_rs::tool_parser::ParserRegistry;
+
+#[tokio::test]
+async fn test_registry_has_all_parsers() {
+    let registry = ParserRegistry::new();
+    let parsers = registry.list_parsers();
+
+    assert!(parsers.contains(&"json"));
+    assert!(parsers.contains(&"mistral"));
+    assert!(parsers.contains(&"qwen"));
+    assert!(parsers.contains(&"pythonic"));
+    assert!(parsers.contains(&"llama"));
+}
+
+#[tokio::test]
+async fn test_openai_models_use_json() {
+    let registry = ParserRegistry::new();
+
+    let models = vec!["gpt-4", "gpt-4-turbo", "gpt-3.5-turbo", "gpt-4o"];
+    for model in models {
+        let parser = registry.get_parser(model).unwrap();
+        let test_input = r#"{"name": "test", "arguments": {}}"#;
+        let result = parser.parse_complete(test_input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test");
+    }
+}
+
+#[tokio::test]
+async fn test_anthropic_models_use_json() {
+    let registry = ParserRegistry::new();
+
+    let models = vec!["claude-3-opus", "claude-3-sonnet", "claude-2.1"];
+    for model in models {
+        let parser = registry.get_parser(model).unwrap();
+        let test_input = r#"{"name": "test", "arguments": {}}"#;
+        let result = parser.parse_complete(test_input).await.unwrap();
+        assert_eq!(result.len(), 1);
+    }
+}
+
+#[tokio::test]
+async fn test_mistral_models() {
+    let registry = ParserRegistry::new();
+
+    let models = vec!["mistral-large", "mistral-medium", "mixtral-8x7b"];
+    for model in models {
+        let parser = registry.get_parser(model).unwrap();
+        let test_input = r#"[TOOL_CALLS] [{"name": "test", "arguments": {}}]"#;
+        let result = parser.parse_complete(test_input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test");
+    }
+}
+
+#[tokio::test]
+async fn test_qwen_models() {
+    let registry = ParserRegistry::new();
+
+    let models = vec!["qwen2.5-72b", "Qwen2-7B", "qwen-max"];
+    for model in models {
+        let parser = registry.get_parser(model).unwrap();
+        let test_input = r#"<tool_call>
+{"name": "test", "arguments": {}}
+</tool_call>"#;
+        let result = parser.parse_complete(test_input).await.unwrap();
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].function.name, "test");
+    }
+}
+
+#[tokio::test]
+async fn test_llama_model_variants() {
+    let registry = ParserRegistry::new();
+
+    // Llama 4 uses pythonic
+    let parser = registry.get_parser("llama-4-70b").unwrap();
+    let test_input = r#"[get_weather(city="NYC")]"#;
+    let result = parser.parse_complete(test_input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "get_weather");
+
+    // Llama 3.2 uses python_tag
+    let parser = registry.get_parser("llama-3.2-8b").unwrap();
+    let test_input = r#"<|python_tag|>{"name": "test", "arguments": {}}"#;
+    let result = parser.parse_complete(test_input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+
+    // Other Llama models use JSON
+    let parser = registry.get_parser("llama-2-70b").unwrap();
+    let test_input = r#"{"name": "test", "arguments": {}}"#;
+    let result = parser.parse_complete(test_input).await.unwrap();
+    assert_eq!(result.len(), 1);
+}
+
+#[tokio::test]
+async fn test_deepseek_models() {
+    let registry = ParserRegistry::new();
+
+    // DeepSeek uses pythonic format (simplified, v3 would need custom parser)
+    let parser = registry.get_parser("deepseek-coder").unwrap();
+    let test_input = r#"[function(arg="value")]"#;
+    let result = parser.parse_complete(test_input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "function");
+}
+
+#[tokio::test]
+async fn test_unknown_model_fallback() {
+    let registry = ParserRegistry::new();
+
+    // Unknown models should fall back to JSON parser
+    let parser = registry.get_parser("unknown-model-xyz").unwrap();
+    let test_input = r#"{"name": "fallback", "arguments": {}}"#;
+    let result = parser.parse_complete(test_input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "fallback");
+}
+
+#[tokio::test]
+async fn test_pattern_specificity() {
+    let registry = ParserRegistry::new();
+
+    // Test that more specific patterns take precedence
+    // llama-4* should match before llama-*
+    let parser = registry.get_parser("llama-4-70b").unwrap();
+    assert!(parser.detect_format(r#"[test_function(x=1)]"#)); // Pythonic format
+
+    let parser = registry.get_parser("llama-3-70b").unwrap();
+    assert!(parser.detect_format(r#"{"name": "test", "arguments": {}}"#)); // JSON format
+}
+
+#[tokio::test]
+async fn test_real_world_model_outputs() {
+    let registry = ParserRegistry::new();
+
+    // Test with realistic outputs from different models
+    let test_cases = vec![
+        (
+            "gpt-4",
+            r#"I'll help you with that.
+
+{"name": "search_web", "arguments": {"query": "latest AI news", "max_results": 5}}
+
+Let me search for that information."#,
+            "search_web",
+        ),
+        (
+            "mistral-large",
+            r#"Let me search for information about Rust.
+
+[TOOL_CALLS] [
+    {"name": "search", "arguments": {"query": "Rust programming"}},
+    {"name": "get_weather", "arguments": {"city": "San Francisco"}}
+]
+
+I've initiated the search."#,
+            "search",
+        ),
+        (
+            "qwen2.5",
+            r#"I'll check the weather for you.
+
+<tool_call>
+{
+    "name": "get_weather",
+    "arguments": {
+        "location": "Tokyo",
+        "units": "celsius"
+    }
+}
+</tool_call>
+
+The weather information has been requested."#,
+            "get_weather",
+        ),
+    ];
+
+    for (model, output, expected_name) in test_cases {
+        let parser = registry.get_parser(model).unwrap();
+        let result = parser.parse_complete(output).await.unwrap();
+        assert!(!result.is_empty(), "No tools parsed for model {}", model);
+        assert_eq!(
+            result[0].function.name, expected_name,
+            "Wrong function name for model {}",
+            model
+        );
+    }
+}
diff --git a/sgl-router/tests/tool_parser_step3.rs b/sgl-router/tests/tool_parser_step3.rs
new file mode 100644
index 000000000..6c1808b31
--- /dev/null
+++ b/sgl-router/tests/tool_parser_step3.rs
@@ -0,0 +1,245 @@
+//! Step3 Parser Integration Tests
+
+use sglang_router_rs::tool_parser::{ParseState, Step3Parser, StreamResult, ToolParser};
+
+#[tokio::test]
+async fn test_step3_complete_parsing() {
+    let parser = Step3Parser::new();
+
+    // Test single tool call
+    let input = r#"Let me help you.
+<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="search">
+<steptml:parameter name="query">rust programming</steptml:parameter>
+<steptml:parameter name="limit">10</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>
+Here are the results..."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search");
+
+    // Verify arguments
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["query"], "rust programming");
+    assert_eq!(args["limit"], 10);
+}
+
+#[tokio::test]
+async fn test_step3_multiple_tools() {
+    let parser = Step3Parser::new();
+
+    let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="get_weather">
+<steptml:parameter name="location">Tokyo</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="get_news">
+<steptml:parameter name="category">tech</steptml:parameter>
+<steptml:parameter name="limit">5</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "get_weather");
+    assert_eq!(result[1].function.name, "get_news");
+}
+
+#[tokio::test]
+async fn test_step3_type_conversion() {
+    let parser = Step3Parser::new();
+
+    let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="process">
+<steptml:parameter name="count">100</steptml:parameter>
+<steptml:parameter name="rate">2.5</steptml:parameter>
+<steptml:parameter name="active">true</steptml:parameter>
+<steptml:parameter name="optional">null</steptml:parameter>
+<steptml:parameter name="text">hello world</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["count"], 100);
+    assert_eq!(args["rate"], 2.5);
+    assert_eq!(args["active"], true);
+    assert_eq!(args["optional"], serde_json::Value::Null);
+    assert_eq!(args["text"], "hello world");
+}
+
+#[tokio::test]
+async fn test_step3_streaming() {
+    let parser = Step3Parser::new();
+    let mut state = ParseState::new();
+
+    // Simulate streaming chunks
+    let chunks = vec![
+        "<｜tool_calls_begin｜>\n",
+        "<｜tool_call_begin｜>function",
+        "<｜tool_sep｜><steptml:invoke name=\"calc\">",
+        "\n<steptml:parameter name=\"x\">10</steptml:parameter>",
+        "\n<steptml:parameter name=\"y\">20</steptml:parameter>",
+        "\n</steptml:invoke><｜tool_call_end｜>",
+        "\n<｜tool_calls_end｜>",
+    ];
+
+    let mut found_name = false;
+    let mut found_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+
+        match result {
+            StreamResult::ToolName { name, .. } => {
+                assert_eq!(name, "calc");
+                found_name = true;
+            }
+            StreamResult::ToolComplete(tool) => {
+                assert_eq!(tool.function.name, "calc");
+                found_complete = true;
+            }
+            _ => {}
+        }
+    }
+
+    assert!(found_name || found_complete);
+}
+
+#[test]
+fn test_step3_format_detection() {
+    let parser = Step3Parser::new();
+
+    // Should detect Step3 format
+    assert!(parser.detect_format("<｜tool_calls_begin｜>"));
+    assert!(parser.detect_format("text with <｜tool_calls_begin｜> marker"));
+
+    // Should not detect other formats
+    assert!(!parser.detect_format("[TOOL_CALLS]"));
+    assert!(!parser.detect_format("<tool_call>"));
+    assert!(!parser.detect_format("plain text"));
+}
+
+#[tokio::test]
+async fn test_step3_nested_steptml() {
+    let parser = Step3Parser::new();
+
+    // Test with complex parameter values
+    let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="config">
+<steptml:parameter name="settings">{"nested": {"key": "value"}}</steptml:parameter>
+<steptml:parameter name="array">[1, 2, 3]</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "config");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["settings"].is_object());
+    assert!(args["array"].is_array());
+}
+
+#[tokio::test]
+async fn test_step3_python_literals() {
+    let parser = Step3Parser::new();
+
+    // Test Python-style literals
+    let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="test">
+<steptml:parameter name="bool_true">True</steptml:parameter>
+<steptml:parameter name="bool_false">False</steptml:parameter>
+<steptml:parameter name="none_value">None</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["bool_true"], true);
+    assert_eq!(args["bool_false"], false);
+    assert_eq!(args["none_value"], serde_json::Value::Null);
+}
+
+#[tokio::test]
+async fn test_steptml_format() {
+    let parser = Step3Parser::new();
+
+    let input = r#"Text before.
+<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="search">
+<steptml:parameter name="query">rust lang</steptml:parameter>
+<steptml:parameter name="limit">10</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>Text after."#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["query"], "rust lang");
+    assert_eq!(args["limit"], 10);
+    // TODO: Verify normal text extraction
+}
+
+#[tokio::test]
+async fn test_json_parameter_values() {
+    let parser = Step3Parser::new();
+
+    let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="config">
+<steptml:parameter name="settings">{"nested": {"value": true}}</steptml:parameter>
+<steptml:parameter name="items">[1, 2, 3]</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert!(args["settings"].is_object());
+    assert!(args["items"].is_array());
+}
+
+#[tokio::test]
+async fn test_step3_parameter_with_angle_brackets() {
+    let parser = Step3Parser::new();
+
+    // Test parameter value containing < character
+    let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="compare">
+<steptml:parameter name="expression">a < b && b > c</steptml:parameter>
+<steptml:parameter name="context">comparison test</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "compare");
+
+    // Verify the parameter value was parsed correctly
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["expression"], "a < b && b > c");
+    assert_eq!(args["context"], "comparison test");
+}
+
+#[tokio::test]
+async fn test_step3_empty_function_name() {
+    let parser = Step3Parser::new();
+
+    // Test empty function name
+    let input = r#"<｜tool_calls_begin｜>
+<｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="">
+<steptml:parameter name="param">value</steptml:parameter>
+</steptml:invoke><｜tool_call_end｜>
+<｜tool_calls_end｜>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0); // Should reject empty function name
+}
diff --git a/sgl-router/tests/tool_parser_streaming.rs b/sgl-router/tests/tool_parser_streaming.rs
new file mode 100644
index 000000000..f0e9ddedb
--- /dev/null
+++ b/sgl-router/tests/tool_parser_streaming.rs
@@ -0,0 +1,341 @@
+//! Streaming Parser Tests
+//!
+//! Tests for incremental/streaming parsing capabilities across all parsers
+
+use sglang_router_rs::tool_parser::{
+    JsonParser, LlamaParser, MistralParser, ParseState, PythonicParser, QwenParser, StreamResult,
+    ToolParser,
+};
+
+#[tokio::test]
+async fn test_json_streaming_simple() {
+    let parser = JsonParser::new();
+    let mut state = ParseState::new();
+
+    // Phase 2 note: This test sends the full JSON at once in the last chunk
+    // In real streaming, chunks would be smaller
+    let full_json = r#"{"name": "get_weather", "arguments": {"location": "San Francisco"}}"#;
+
+    let result = parser
+        .parse_incremental(full_json, &mut state)
+        .await
+        .unwrap();
+
+    // With complete JSON sent at once, we should get ToolComplete
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+        }
+        _ => {
+            panic!("Expected ToolComplete for complete JSON input");
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_json_streaming_array() {
+    let parser = JsonParser::new();
+    let mut state = ParseState::new();
+
+    // Stream a JSON array of tools
+    let chunks = vec![
+        r#"["#,
+        r#"{"name": "tool1", "#,
+        r#""arguments": {}}, "#,
+        r#"{"name": "tool2", "#,
+        r#""arguments": {"x": 1"#,
+        r#"}}]"#,
+    ];
+
+    let mut tool_count = 0;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        if let StreamResult::ToolComplete(_) = result {
+            tool_count += 1;
+        }
+    }
+
+    // Current implementation may handle this differently
+    // We're mainly testing that it doesn't crash
+    assert!(tool_count <= 2, "Should parse at most 2 tools");
+}
+
+#[tokio::test]
+async fn test_mistral_streaming() {
+    let parser = MistralParser::new();
+    let mut state = ParseState::new();
+
+    let chunks = vec![
+        r#"Here is the result: "#,
+        r#"[TOOL_CALLS] ["#,
+        r#"{"name": "#,
+        r#""search", "#,
+        r#""arguments": "#,
+        r#"{"query": "#,
+        r#""rust lang""#,
+        r#"}}]"#,
+    ];
+
+    let mut got_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        if let StreamResult::ToolComplete(tool) = result {
+            assert_eq!(tool.function.name, "search");
+            got_complete = true;
+        }
+    }
+
+    assert!(got_complete, "Should have completed parsing");
+}
+
+#[tokio::test]
+async fn test_pythonic_streaming() {
+    let parser = PythonicParser::new();
+    let mut state = ParseState::new();
+
+    // Send complete pythonic format at once
+    let full_input = r#"[get_weather(city="London", units="celsius")]"#;
+
+    let result = parser
+        .parse_incremental(full_input, &mut state)
+        .await
+        .unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "get_weather");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert_eq!(args["city"], "London");
+        }
+        _ => {
+            panic!("Expected ToolComplete for complete pythonic input");
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_llama_streaming_with_python_tag() {
+    let parser = LlamaParser::new();
+    let mut state = ParseState::new();
+
+    let chunks = vec![
+        r#"Let me help. "#,
+        r#"<|python"#,
+        r#"_tag|>"#,
+        r#"{"name": "#,
+        r#""calculate", "#,
+        r#""arguments": "#,
+        r#"{"x": 10}"#,
+        r#"}"#,
+    ];
+
+    let mut got_complete = false;
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        if let StreamResult::ToolComplete(tool) = result {
+            assert_eq!(tool.function.name, "calculate");
+            got_complete = true;
+        }
+    }
+
+    assert!(got_complete, "Should have completed parsing");
+}
+
+#[tokio::test]
+async fn test_qwen_streaming() {
+    let parser = QwenParser::new();
+    let mut state = ParseState::new();
+
+    // Send complete Qwen format at once (with exact format expected by parser)
+    // Note: Parser expects newline after both tags
+    let full_input = "<tool_call>\n{\"name\": \"translate\", \"arguments\": {\"text\": \"hello\", \"to\": \"zh\"}}\n</tool_call>";
+
+    let result = parser
+        .parse_incremental(full_input, &mut state)
+        .await
+        .unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "translate");
+        }
+        other => {
+            panic!(
+                "Expected ToolComplete for complete Qwen input, got: {:?}",
+                other
+            );
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_streaming_incomplete_stays_incomplete() {
+    let parser = JsonParser::new();
+    let mut state = ParseState::new();
+
+    // Send truly incomplete JSON that can't be auto-completed
+    let chunks = vec![r#"{"na"#, r#"me": "#];
+
+    for chunk in chunks {
+        let result = parser.parse_incremental(chunk, &mut state).await.unwrap();
+        // Should return Incomplete for partial JSON that can't be auto-completed
+        assert!(
+            matches!(result, StreamResult::Incomplete),
+            "Should return Incomplete for partial JSON, got: {:?}",
+            result
+        );
+    }
+
+    // Buffer should contain the accumulated incomplete JSON
+    assert!(!state.buffer.is_empty());
+}
+
+#[tokio::test]
+async fn test_streaming_with_text_before_tool() {
+    let parser = JsonParser::new();
+    let mut state = ParseState::new();
+
+    // For streaming, the parser expects clean JSON
+    // Mixed text extraction only works in parse_complete, not parse_incremental
+    let full_input = r#"{"name": "test", "arguments": {}}"#;
+
+    let result = parser
+        .parse_incremental(full_input, &mut state)
+        .await
+        .unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "test");
+        }
+        other => {
+            panic!("Expected ToolComplete, got: {:?}", other);
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_streaming_buffer_accumulation() {
+    let parser = JsonParser::new();
+
+    // Test: Complete JSON should clear buffer after parsing
+    let mut state = ParseState::new();
+
+    // Send partial JSON that can't be interpreted as complete
+    let result1 = parser
+        .parse_incremental(r#"{"na"#, &mut state)
+        .await
+        .unwrap();
+
+    assert!(matches!(result1, StreamResult::Incomplete));
+    assert!(
+        !state.buffer.is_empty(),
+        "Buffer should accumulate incomplete JSON"
+    );
+
+    // Send rest of JSON
+    let result2 = parser
+        .parse_incremental(r#"me": "test", "arguments": {}}"#, &mut state)
+        .await
+        .unwrap();
+
+    match result2 {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "test");
+            assert!(
+                state.buffer.is_empty(),
+                "Buffer should be cleared after complete parse"
+            );
+        }
+        _ => panic!(
+            "Expected ToolComplete for complete JSON, got: {:?}",
+            result2
+        ),
+    }
+}
+
+#[tokio::test]
+async fn test_streaming_multiple_tools_sequential() {
+    let parser = QwenParser::new();
+    let mut state = ParseState::new();
+
+    // Send complete Qwen format with newlines
+    let full_input = r#"<tool_call>
+{"name": "tool1", "arguments": {}}
+</tool_call>"#;
+
+    let result = parser
+        .parse_incremental(full_input, &mut state)
+        .await
+        .unwrap();
+
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "tool1");
+        }
+        _ => {
+            panic!("Expected ToolComplete for first tool");
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_streaming_reset_after_error() {
+    let parser = JsonParser::new();
+
+    // First attempt with invalid JSON
+    let mut state1 = ParseState::new();
+    let _ = parser
+        .parse_incremental(r#"{"name": invalid}"#, &mut state1)
+        .await;
+
+    // Second attempt with valid JSON should work with fresh state
+    let mut state2 = ParseState::new();
+    let result = parser
+        .parse_incremental(r#"{"name": "test", "arguments": {}}"#, &mut state2)
+        .await
+        .unwrap();
+
+    if let StreamResult::ToolComplete(tool) = result {
+        assert_eq!(tool.function.name, "test");
+    }
+}
+
+#[tokio::test]
+async fn test_streaming_with_unicode_chunks() {
+    let parser = JsonParser::new();
+    let mut state = ParseState::new();
+
+    // Send complete JSON with unicode
+    let full_input = r#"{"name": "translate", "arguments": {"text": "Hello 世界 🌍"}}"#;
+
+    let result = parser
+        .parse_incremental(full_input, &mut state)
+        .await
+        .unwrap();
+
+    // Phase 2 may return partial results even with complete JSON
+    // The important thing is that unicode is handled without crashes
+    match result {
+        StreamResult::ToolComplete(tool) => {
+            assert_eq!(tool.function.name, "translate");
+            let args: serde_json::Value = serde_json::from_str(&tool.function.arguments).unwrap();
+            assert!(args["text"].as_str().unwrap().contains("世界"));
+        }
+        StreamResult::ToolName { name, .. } => {
+            assert_eq!(name, "translate");
+            // Phase 2 partial streaming behavior - acceptable
+        }
+        StreamResult::ToolArguments { arguments, .. } => {
+            // Verify unicode was preserved
+            let args: serde_json::Value = serde_json::from_str(&arguments).unwrap();
+            assert!(args["text"].as_str().unwrap().contains("世界"));
+        }
+        other => {
+            panic!("Unexpected result: {:?}", other);
+        }
+    }
+}
diff --git a/sgl-router/tests/tool_parser_wrapper_tokens.rs b/sgl-router/tests/tool_parser_wrapper_tokens.rs
new file mode 100644
index 000000000..d2cc6b2f7
--- /dev/null
+++ b/sgl-router/tests/tool_parser_wrapper_tokens.rs
@@ -0,0 +1,247 @@
+//! Wrapper Token Tests
+//!
+//! Tests for JSON parser with custom wrapper tokens
+
+use sglang_router_rs::tool_parser::{JsonParser, TokenConfig, ToolParser};
+
+#[tokio::test]
+async fn test_json_with_xml_style_wrapper() {
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["<tool>".to_string()],
+        end_tokens: vec!["</tool>".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    let input =
+        r#"Some text before <tool>{"name": "test", "arguments": {"x": 1}}</tool> and after"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["x"], 1);
+}
+
+#[tokio::test]
+async fn test_json_with_multiple_wrapper_pairs() {
+    // Test with multiple start/end token pairs
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["<tool>".to_string(), "<<TOOL>>".to_string()],
+        end_tokens: vec!["</tool>".to_string(), "<</TOOL>>".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    // Test first pair
+    let input1 = r#"<tool>{"name": "tool1", "arguments": {}}</tool>"#;
+    let result1 = parser.parse_complete(input1).await.unwrap();
+    assert_eq!(result1.len(), 1);
+    assert_eq!(result1[0].function.name, "tool1");
+
+    // Test second pair
+    let input2 = r#"<<TOOL>>{"name": "tool2", "arguments": {}}<</TOOL>>"#;
+    let result2 = parser.parse_complete(input2).await.unwrap();
+    assert_eq!(result2.len(), 1);
+    assert_eq!(result2[0].function.name, "tool2");
+}
+
+#[tokio::test]
+async fn test_json_with_only_start_token() {
+    // Test when only start token is provided (no end token)
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec![">>>FUNCTION:".to_string()],
+        end_tokens: vec!["".to_string()], // Empty end token
+        separator: ", ".to_string(),
+    });
+
+    let input = r#"Some preamble >>>FUNCTION:{"name": "execute", "arguments": {"cmd": "ls"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "execute");
+}
+
+#[tokio::test]
+async fn test_json_with_custom_separator() {
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["[FUNC]".to_string()],
+        end_tokens: vec!["[/FUNC]".to_string()],
+        separator: " | ".to_string(), // Custom separator
+    });
+
+    // Though we're not testing multiple tools here, the separator is configured
+    let input = r#"[FUNC]{"name": "test", "arguments": {}}[/FUNC]"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+}
+
+#[tokio::test]
+async fn test_json_with_nested_wrapper_tokens_in_content() {
+    // Known limitation: When wrapper tokens appear inside JSON strings,
+    // the simple regex-based extraction may fail. This would require
+    // a more sophisticated parser that understands JSON string escaping.
+
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["<call>".to_string()],
+        end_tokens: vec!["</call>".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    let input =
+        r#"<call>{"name": "echo", "arguments": {"text": "Use <call> and </call> tags"}}</call>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+
+    // This is a known limitation - the parser may fail when end tokens appear in content
+    // For now, we accept this behavior
+    if result.is_empty() {
+        // Parser failed due to nested tokens - this is expected
+        assert_eq!(
+            result.len(),
+            0,
+            "Known limitation: nested wrapper tokens in content"
+        );
+    } else {
+        // If it does parse, verify it's correct
+        assert_eq!(result[0].function.name, "echo");
+        let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+        assert_eq!(args["text"], "Use <call> and </call> tags");
+    }
+}
+
+#[tokio::test]
+async fn test_json_extraction_without_wrapper_tokens() {
+    // Default parser without wrapper tokens should extract JSON from text
+    let parser = JsonParser::new();
+
+    let input = r#"
+    Here is some text before the JSON.
+    {"name": "search", "arguments": {"query": "test"}}
+    And here is some text after.
+    "#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "search");
+}
+
+#[tokio::test]
+async fn test_json_with_multiline_wrapper_content() {
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["```json\n".to_string()],
+        end_tokens: vec!["\n```".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    let input = r#"Here's the function call:
+```json
+{
+    "name": "format_code",
+    "arguments": {
+        "language": "rust",
+        "code": "fn main() {}"
+    }
+}
+```
+Done!"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "format_code");
+}
+
+#[tokio::test]
+async fn test_json_with_special_chars_in_tokens() {
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["{{FUNC[[".to_string()],
+        end_tokens: vec!["]]FUNC}}".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    let input = r#"{{FUNC[[{"name": "test", "arguments": {"special": "[]{}"}}]]FUNC}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+
+    let args: serde_json::Value = serde_json::from_str(&result[0].function.arguments).unwrap();
+    assert_eq!(args["special"], "[]{}");
+}
+
+#[tokio::test]
+async fn test_json_multiple_tools_with_wrapper() {
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["<fn>".to_string()],
+        end_tokens: vec!["</fn>".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    // Multiple wrapped JSON objects
+    let input = r#"
+    <fn>{"name": "tool1", "arguments": {}}</fn>
+    Some text between.
+    <fn>{"name": "tool2", "arguments": {"x": 1}}</fn>
+    "#;
+
+    // Current implementation might handle this as separate calls
+    // Let's test that at least the first one is parsed
+    let result = parser.parse_complete(input).await.unwrap();
+    assert!(!result.is_empty(), "Should parse at least one tool");
+    assert_eq!(result[0].function.name, "tool1");
+}
+
+#[tokio::test]
+async fn test_json_wrapper_with_array() {
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["<tools>".to_string()],
+        end_tokens: vec!["</tools>".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    let input = r#"<tools>[
+        {"name": "func1", "arguments": {}},
+        {"name": "func2", "arguments": {"param": "value"}}
+    ]</tools>"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 2);
+    assert_eq!(result[0].function.name, "func1");
+    assert_eq!(result[1].function.name, "func2");
+}
+
+#[tokio::test]
+async fn test_json_incomplete_wrapper_tokens() {
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec!["<tool>".to_string()],
+        end_tokens: vec!["</tool>".to_string()],
+        separator: ", ".to_string(),
+    });
+
+    // Missing end token
+    let input = r#"<tool>{"name": "test", "arguments": {}}"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0, "Should not parse without closing token");
+
+    // Missing start token
+    let input = r#"{"name": "test", "arguments": {}}</tool>"#;
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 0, "Should not parse without opening token");
+}
+
+#[tokio::test]
+async fn test_json_empty_wrapper_tokens() {
+    // Test with empty wrapper tokens (should behave like default)
+    let parser = JsonParser::with_config(TokenConfig {
+        start_tokens: vec![],
+        end_tokens: vec![],
+        separator: ", ".to_string(),
+    });
+
+    let input = r#"{"name": "test", "arguments": {"key": "value"}}"#;
+
+    let result = parser.parse_complete(input).await.unwrap();
+    assert_eq!(result.len(), 1);
+    assert_eq!(result[0].function.name, "test");
+}
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 000000000..1854ec955
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,41 @@
+# Run Unit Tests
+
+SGLang uses the built-in library [unittest](https://docs.python.org/3/library/unittest.html) as the testing framework.
+
+## Test Backend Runtime
+```bash
+cd sglang/test/srt
+
+# Run a single file
+python3 test_srt_endpoint.py
+
+# Run a single test
+python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_simple_decode
+
+# Run a suite with multiple files
+python3 run_suite.py --suite per-commit
+```
+
+## Test Frontend Language
+```bash
+cd sglang/test/lang
+
+# Run a single file
+python3 test_srt_backend.py
+```
+
+## Adding or Updating Tests in CI
+
+- Create new test files under `test/srt` or `test/lang` depending on the type of test.
+- Ensure they are referenced in the respective `run_suite.py` (e.g., `test/srt/run_suite.py`) so they’re picked up in CI. For most small test cases, they can be added to the `per-commit` suite. Sort the test cases alphabetically.
+- The CI will run the `per-commit` and `nightly` automatically. If you need special setup or custom test groups, you may modify the workflows in [`.github/workflows/`](https://github.com/sgl-project/sglang/tree/main/.github/workflows).
+
+
+## Writing Elegant Test Cases
+
+- Examine existing tests in [sglang/test](https://github.com/sgl-project/sglang/tree/main/test) for practical examples.
+- Keep each test function focused on a single scenario or piece of functionality.
+- Give tests descriptive names reflecting their purpose.
+- Use robust assertions (e.g., assert, unittest methods) to validate outcomes.
+- Clean up resources to avoid side effects and preserve test independence.
+- Reduce the test time by using smaller models and reusing the server for multiple test cases.
diff --git a/test/lang/run_suite.py b/test/lang/run_suite.py
new file mode 100644
index 000000000..04efba51f
--- /dev/null
+++ b/test/lang/run_suite.py
@@ -0,0 +1,38 @@
+import argparse
+import glob
+
+from sglang.test.test_utils import TestFile, run_unittest_files
+
+suites = {
+    "per-commit": [
+        TestFile("test_srt_backend.py"),
+        # Skip this due to some OPENAI_API_KEY issues
+        # "test_openai_backend.py",
+    ],
+}
+
+
+if __name__ == "__main__":
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument(
+        "--timeout-per-file",
+        type=int,
+        default=1000,
+        help="The time limit for running one file in seconds.",
+    )
+    arg_parser.add_argument(
+        "--suite",
+        type=str,
+        default=list(suites.keys())[0],
+        choices=list(suites.keys()) + ["all"],
+        help="The suite to run",
+    )
+    args = arg_parser.parse_args()
+
+    if args.suite == "all":
+        files = glob.glob("**/test_*.py", recursive=True)
+    else:
+        files = suites[args.suite]
+
+    exit_code = run_unittest_files(files, args.timeout_per_file)
+    exit(exit_code)
diff --git a/test/lang/test_anthropic_backend.py b/test/lang/test_anthropic_backend.py
new file mode 100644
index 000000000..dc8f0b17e
--- /dev/null
+++ b/test/lang/test_anthropic_backend.py
@@ -0,0 +1,25 @@
+import json
+import unittest
+
+from sglang import Anthropic, set_default_backend
+from sglang.test.test_programs import test_mt_bench, test_stream
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestAnthropicBackend(CustomTestCase):
+    backend = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.backend = Anthropic("claude-3-haiku-20240307")
+        set_default_backend(cls.backend)
+
+    def test_mt_bench(self):
+        test_mt_bench()
+
+    def test_stream(self):
+        test_stream()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_bind_cache.py b/test/lang/test_bind_cache.py
new file mode 100644
index 000000000..d5beefae6
--- /dev/null
+++ b/test/lang/test_bind_cache.py
@@ -0,0 +1,51 @@
+import unittest
+
+import sglang as sgl
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, CustomTestCase
+
+
+class TestBind(CustomTestCase):
+    backend = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
+        sgl.set_default_backend(cls.backend)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.backend.shutdown()
+
+    def test_bind(self):
+        @sgl.function
+        def few_shot_qa(s, prompt, question):
+            s += prompt
+            s += "Q: What is the capital of France?\n"
+            s += "A: Paris\n"
+            s += "Q: " + question + "\n"
+            s += "A:" + sgl.gen("answer", stop="\n")
+
+        few_shot_qa_2 = few_shot_qa.bind(
+            prompt="The following are questions with answers.\n\n"
+        )
+
+        tracer = few_shot_qa_2.trace()
+        print(tracer.last_node.print_graph_dfs() + "\n")
+
+    def test_cache(self):
+        @sgl.function
+        def few_shot_qa(s, prompt, question):
+            s += prompt
+            s += "Q: What is the capital of France?\n"
+            s += "A: Paris\n"
+            s += "Q: " + question + "\n"
+            s += "A:" + sgl.gen("answer", stop="\n")
+
+        few_shot_qa_2 = few_shot_qa.bind(
+            prompt="Answer the following questions as if you were a 5-year-old kid.\n\n"
+        )
+        few_shot_qa_2.cache(self.backend)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_choices.py b/test/lang/test_choices.py
new file mode 100644
index 000000000..89e7ca1c7
--- /dev/null
+++ b/test/lang/test_choices.py
@@ -0,0 +1,91 @@
+import unittest
+
+import numpy as np
+
+from sglang.lang.choices import (
+    greedy_token_selection,
+    token_length_normalized,
+    unconditional_likelihood_normalized,
+)
+from sglang.test.test_utils import CustomTestCase
+
+MOCK_CHOICES_INPUT_DATA = {
+    "choices": [
+        "organ",  # ["organ"]
+        "organism",  # ["organ", "ism"]
+        "antidisestablishmentarianism",  # ["ant", "id", "is", "est", "ablish", "ment", "arian", "ism"]
+    ],
+    "normalized_prompt_logprobs": [-0.1, -0.2, -0.05],
+    "input_token_logprobs": [
+        [[-0.1, 1, None]],
+        [[-0.1, 1, None], [-0.3, 2, None]],
+        [
+            [-0.4, 3, None],
+            [-0.25, 4, None],
+            [-0.1, 5, None],
+            [-0.01, 6, None],
+            [-0.01, 7, None],
+            [-0.01, 8, None],
+            [-0.01, 9, None],
+            [-0.01, 2, None],
+        ],
+    ],
+    "output_token_logprobs": [
+        [[-0.1, 10, None]],
+        [[-0.1, 10, None]],
+        [[-0.1, 10, None]],
+    ],
+    "unconditional_token_logprobs": [
+        [[None, 1, None]],
+        [[None, 1, None], [-1.4, 2, None]],
+        [
+            [None, 3, None],
+            [-0.25, 4, None],
+            [-0.1, 5, None],
+            [-0.01, 6, None],
+            [-0.01, 7, None],
+            [-0.01, 8, None],
+            [-0.01, 9, None],
+            [-0.01, 2, None],
+        ],
+    ],
+}
+
+
+class TestChoices(CustomTestCase):
+
+    def test_token_length_normalized(self):
+        """Confirm 'antidisestablishmentarianism' is selected due to high confidences for
+        its later tokens resulting in highest token length normalized prompt logprob."""
+        decision = token_length_normalized(**MOCK_CHOICES_INPUT_DATA)
+        assert decision.decision == "antidisestablishmentarianism"
+
+    def test_greedy_token_selection(self):
+        """Confirm 'organ' is selected due it having the joint highest initial token
+        logprob, and a higher average logprob than organism's second token."""
+        decision = greedy_token_selection(**MOCK_CHOICES_INPUT_DATA)
+        assert decision.decision == "organ"
+        assert np.allclose(
+            decision.meta_info["greedy_logprob_matrix"],
+            [
+                [-0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1],
+                [-0.1, -0.3, -0.2, -0.2, -0.2, -0.2, -0.2, -0.2],
+                [-0.4, -0.25, -0.1, -0.01, -0.01, -0.01, -0.01, -0.01],
+            ],
+            atol=0.01,
+        )
+
+    def test_unconditional_likelihood_normalized(self):
+        """Confirm 'organism' is selected due to it having the highest average token logprob
+        once normalized by the unconditional token logprobs."""
+        decision = unconditional_likelihood_normalized(**MOCK_CHOICES_INPUT_DATA)
+        assert decision.decision == "organism"
+        assert np.allclose(
+            decision.meta_info["normalized_unconditional_prompt_logprobs"],
+            [-0.1, 0.5, -0.05],
+            atol=0.01,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_litellm_backend.py b/test/lang/test_litellm_backend.py
new file mode 100644
index 000000000..74c3a187a
--- /dev/null
+++ b/test/lang/test_litellm_backend.py
@@ -0,0 +1,25 @@
+import json
+import unittest
+
+from sglang import LiteLLM, set_default_backend
+from sglang.test.test_programs import test_mt_bench, test_stream
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestAnthropicBackend(CustomTestCase):
+    chat_backend = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.chat_backend = LiteLLM("gpt-3.5-turbo")
+        set_default_backend(cls.chat_backend)
+
+    def test_mt_bench(self):
+        test_mt_bench()
+
+    def test_stream(self):
+        test_stream()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_openai_backend.py b/test/lang/test_openai_backend.py
new file mode 100644
index 000000000..dbef7d450
--- /dev/null
+++ b/test/lang/test_openai_backend.py
@@ -0,0 +1,92 @@
+import unittest
+
+from sglang import OpenAI, set_default_backend
+from sglang.test.test_programs import (
+    test_chat_completion_speculative,
+    test_completion_speculative,
+    test_decode_int,
+    test_decode_json,
+    test_expert_answer,
+    test_few_shot_qa,
+    test_image_qa,
+    test_mt_bench,
+    test_parallel_decoding,
+    test_parallel_encoding,
+    test_react,
+    test_select,
+    test_stream,
+    test_tool_use,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestOpenAIBackend(CustomTestCase):
+    instruct_backend = None
+    chat_backend = None
+    chat_vision_backend = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.instruct_backend = OpenAI("gpt-3.5-turbo-instruct")
+        cls.chat_backend = OpenAI("gpt-3.5-turbo")
+        cls.chat_vision_backend = OpenAI("gpt-4-turbo")
+
+    def test_few_shot_qa(self):
+        set_default_backend(self.instruct_backend)
+        test_few_shot_qa()
+
+    def test_mt_bench(self):
+        set_default_backend(self.chat_backend)
+        test_mt_bench()
+
+    def test_select(self):
+        set_default_backend(self.instruct_backend)
+        test_select(check_answer=True)
+
+    def test_decode_int(self):
+        set_default_backend(self.instruct_backend)
+        test_decode_int()
+
+    def test_decode_json(self):
+        set_default_backend(self.instruct_backend)
+        test_decode_json()
+
+    def test_expert_answer(self):
+        set_default_backend(self.instruct_backend)
+        test_expert_answer()
+
+    def test_tool_use(self):
+        set_default_backend(self.instruct_backend)
+        test_tool_use()
+
+    def test_react(self):
+        set_default_backend(self.instruct_backend)
+        test_react()
+
+    def test_parallel_decoding(self):
+        set_default_backend(self.instruct_backend)
+        test_parallel_decoding()
+
+    def test_parallel_encoding(self):
+        set_default_backend(self.instruct_backend)
+        test_parallel_encoding()
+
+    def test_image_qa(self):
+        set_default_backend(self.chat_vision_backend)
+        test_image_qa()
+
+    def test_stream(self):
+        set_default_backend(self.instruct_backend)
+        test_stream()
+
+    def test_completion_speculative(self):
+        set_default_backend(self.instruct_backend)
+        test_completion_speculative()
+
+    def test_chat_completion_speculative(self):
+        set_default_backend(self.chat_backend)
+        test_chat_completion_speculative()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_separate_reasoning.py b/test/lang/test_separate_reasoning.py
new file mode 100644
index 000000000..1709cecfd
--- /dev/null
+++ b/test/lang/test_separate_reasoning.py
@@ -0,0 +1,68 @@
+"""
+Tests for the separate_reasoning functionality in sglang.
+
+Usage:
+python3 -m unittest test/lang/test_separate_reasoning.py
+"""
+
+import unittest
+
+from sglang import assistant, gen, separate_reasoning, user
+from sglang.lang.ir import SglExprList, SglSeparateReasoning
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestSeparateReasoning(CustomTestCase):
+    def test_separate_reasoning_creation(self):
+        """Test that SglSeparateReasoning objects are created correctly."""
+        # Test with valid model type and gen expression
+        test_gen = gen("test")
+        expr = separate_reasoning(test_gen, model_type="deepseek-r1")
+        self.assertIsInstance(expr, SglExprList)
+        self.assertEqual(len(expr.expr_list), 2)
+        self.assertEqual(expr.expr_list[0], test_gen)
+        reasoning_expr = expr.expr_list[1]
+        self.assertIsInstance(reasoning_expr, SglSeparateReasoning)
+        self.assertEqual(reasoning_expr.model_type, "deepseek-r1")
+        self.assertEqual(reasoning_expr.name, "test_reasoning_content")
+
+        # Test with another valid model type
+        expr = separate_reasoning(test_gen, model_type="qwen3")
+        self.assertIsInstance(expr, SglExprList)
+        self.assertEqual(expr.expr_list[1].model_type, "qwen3")
+
+    def test_separate_reasoning_name_processing(self):
+        """Test that separate_reasoning correctly processes names."""
+        test_gen = gen("test_var")
+        expr = separate_reasoning(test_gen, model_type="deepseek-r1")
+        reasoning_expr = expr.expr_list[1]
+        self.assertEqual(reasoning_expr.name, "test_var_reasoning_content")
+
+        # Test the process_name_for_reasoning method
+        self.assertEqual(
+            reasoning_expr.process_name_for_reasoning("another_var"),
+            "another_var_reasoning_content",
+        )
+
+    def test_separate_reasoning_repr(self):
+        """Test the string representation of SglSeparateReasoning."""
+        test_gen = gen("test_var")
+        expr = separate_reasoning(test_gen, model_type="deepseek-r1")
+        reasoning_expr = expr.expr_list[1]
+        self.assertEqual(
+            repr(reasoning_expr),
+            "SeparateReasoning(model_type=deepseek-r1, name=test_var_reasoning_content)",
+        )
+
+    def test_separate_reasoning_with_invalid_model_type(self):
+        """Test that separate_reasoning accepts any model type during creation."""
+        # Create with invalid model type
+        test_gen = gen("test")
+        expr = separate_reasoning(test_gen, model_type="invalid-model")
+        self.assertIsInstance(expr, SglExprList)
+        self.assertEqual(expr.expr_list[1].model_type, "invalid-model")
+        # The actual validation happens in the ReasoningParser constructor
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_separate_reasoning_execution.py b/test/lang/test_separate_reasoning_execution.py
new file mode 100644
index 000000000..481488f6a
--- /dev/null
+++ b/test/lang/test_separate_reasoning_execution.py
@@ -0,0 +1,195 @@
+"""
+Tests for the execution of separate_reasoning functionality in sglang.
+
+Usage:
+python3 -m unittest test/lang/test_separate_reasoning_execution.py
+"""
+
+import threading
+import time
+import unittest
+from unittest.mock import MagicMock, patch
+
+from sglang import assistant, gen, separate_reasoning, user
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglGen, SglSeparateReasoning
+from sglang.test.test_utils import CustomTestCase
+
+
+# Helper function to create events that won't block program exit
+def create_daemon_event():
+    event = threading.Event()
+    return event
+
+
+class MockReasoningParser:
+    def __init__(self, model_type):
+        self.model_type = model_type
+        self.parse_non_stream_called = False
+        self.parse_stream_chunk_called = False
+
+    def parse_non_stream(self, full_text):
+        self.parse_non_stream_called = True
+        # Simulate parsing by adding a prefix to indicate reasoning
+        reasoning = f"[REASONING from {self.model_type}]: {full_text}"
+        normal_text = f"[NORMAL from {self.model_type}]: {full_text}"
+        return reasoning, normal_text
+
+    def parse_stream_chunk(self, chunk_text):
+        self.parse_stream_chunk_called = True
+        # Simulate parsing by adding a prefix to indicate reasoning
+        reasoning = f"[REASONING from {self.model_type}]: {chunk_text}"
+        normal_text = f"[NORMAL from {self.model_type}]: {chunk_text}"
+        return reasoning, normal_text
+
+
+class TestSeparateReasoningExecution(CustomTestCase):
+    def setUp(self):
+        """Set up for the test."""
+        super().setUp()
+        # Store any events created during the test
+        self.events = []
+
+    def tearDown(self):
+        """Clean up any threads that might have been created during the test."""
+        super().tearDown()
+
+        # Set all events to ensure any waiting threads are released
+        for event in self.events:
+            event.set()
+
+    def tearDown(self):
+        super().tearDown()
+        # wake up all threads
+        for ev in self.events:
+            ev.set()
+
+    @patch("sglang.srt.parser.reasoning_parser.ReasoningParser")
+    def test_execute_separate_reasoning(self, mock_parser_class):
+        """Test that _execute_separate_reasoning correctly calls the ReasoningParser."""
+        # Setup mock parser
+        mock_parser = MockReasoningParser("deepseek-r1")
+        mock_parser_class.return_value = mock_parser
+
+        # Create a mock backend to avoid AttributeError in __del__
+        mock_backend = MagicMock()
+
+        # Create a StreamExecutor with necessary setup
+        executor = StreamExecutor(
+            backend=mock_backend,
+            arguments={},
+            default_sampling_para={},
+            chat_template={
+                "role_map": {"user": "user", "assistant": "assistant"}
+            },  # Simple chat template
+            stream=False,
+            use_thread=False,
+        )
+
+        # Set up the executor with a variable and its value
+        var_name = "test_var"
+        reasoning_name = f"{var_name}_reasoning_content"
+        var_value = "Test content"
+        executor.variables = {var_name: var_value}
+
+        # Create events and track them for cleanup
+        var_event = create_daemon_event()
+        reasoning_event = create_daemon_event()
+        self.events.extend([var_event, reasoning_event])
+
+        executor.variable_event = {var_name: var_event, reasoning_name: reasoning_event}
+        executor.variable_event[var_name].set()  # Mark as ready
+
+        # Set up the current role
+        executor.cur_role = "assistant"
+        executor.cur_role_begin_pos = 0
+        executor.text_ = var_value
+
+        # Create a gen expression and a separate_reasoning expression
+        gen_expr = SglGen(var_name)
+        expr = SglSeparateReasoning("deepseek-r1", expr=gen_expr)
+
+        # Execute separate_reasoning
+        executor._execute_separate_reasoning(expr)
+
+        # Verify that the parser was created with the correct model type
+        mock_parser_class.assert_called_once_with("deepseek-r1")
+
+        # Verify that parse_non_stream was called
+        self.assertTrue(mock_parser.parse_non_stream_called)
+
+        # Verify that the variables were updated correctly
+        reasoning_name = f"{var_name}_reasoning_content"
+        self.assertIn(reasoning_name, executor.variables)
+        self.assertEqual(
+            executor.variables[reasoning_name],
+            f"[REASONING from deepseek-r1]: {var_value}",
+        )
+        self.assertEqual(
+            executor.variables[var_name], f"[NORMAL from deepseek-r1]: {var_value}"
+        )
+
+        # Verify that the variable event was set
+        self.assertIn(reasoning_name, executor.variable_event)
+        self.assertTrue(executor.variable_event[reasoning_name].is_set())
+
+        # Verify that the text was updated
+        self.assertEqual(executor.text_, f"[NORMAL from deepseek-r1]: {var_value}")
+
+    @patch("sglang.srt.parser.reasoning_parser.ReasoningParser")
+    def test_reasoning_parser_integration(self, mock_parser_class):
+        """Test the integration between separate_reasoning and ReasoningParser."""
+        # Setup mock parsers for different model types
+        deepseek_parser = MockReasoningParser("deepseek-r1")
+        qwen_parser = MockReasoningParser("qwen3")
+
+        # Configure the mock to return different parsers based on model type
+        def get_parser(model_type):
+            if model_type == "deepseek-r1":
+                return deepseek_parser
+            elif model_type == "qwen3":
+                return qwen_parser
+            else:
+                raise ValueError(f"Unsupported model type: {model_type}")
+
+        mock_parser_class.side_effect = get_parser
+
+        # Test with DeepSeek-R1 model
+        test_text = "This is a test"
+        reasoning, normal_text = deepseek_parser.parse_non_stream(test_text)
+
+        self.assertEqual(reasoning, f"[REASONING from deepseek-r1]: {test_text}")
+        self.assertEqual(normal_text, f"[NORMAL from deepseek-r1]: {test_text}")
+
+        # Test with Qwen3 model
+        reasoning, normal_text = qwen_parser.parse_non_stream(test_text)
+
+        self.assertEqual(reasoning, f"[REASONING from qwen3]: {test_text}")
+        self.assertEqual(normal_text, f"[NORMAL from qwen3]: {test_text}")
+
+    @patch("sglang.srt.parser.reasoning_parser.ReasoningParser")
+    def test_reasoning_parser_invalid_model(self, mock_parser_class):
+        """Test that ReasoningParser raises an error for invalid model types."""
+
+        # Configure the mock to raise an error for invalid model types
+        def get_parser(model_type):
+            if model_type in ["deepseek-r1", "qwen3"]:
+                return MockReasoningParser(model_type)
+            elif model_type is None:
+                raise ValueError("Model type must be specified")
+            else:
+                raise ValueError(f"Unsupported model type: {model_type}")
+
+        mock_parser_class.side_effect = get_parser
+
+        with self.assertRaises(ValueError) as context:
+            mock_parser_class("invalid-model")
+        self.assertIn("Unsupported model type", str(context.exception))
+
+        with self.assertRaises(ValueError) as context:
+            mock_parser_class(None)
+        self.assertIn("Model type must be specified", str(context.exception))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py
new file mode 100644
index 000000000..0e05eb906
--- /dev/null
+++ b/test/lang/test_srt_backend.py
@@ -0,0 +1,86 @@
+"""
+Usage:
+python3 -m unittest test_srt_backend.TestSRTBackend.test_gen_min_new_tokens
+python3 -m unittest test_srt_backend.TestSRTBackend.test_hellaswag_select
+"""
+
+import unittest
+
+import sglang as sgl
+from sglang.test.test_programs import (
+    test_decode_int,
+    test_decode_json_regex,
+    test_dtype_gen,
+    test_expert_answer,
+    test_few_shot_qa,
+    test_gen_min_new_tokens,
+    test_hellaswag_select,
+    test_mt_bench,
+    test_parallel_decoding,
+    test_regex,
+    test_select,
+    test_stream,
+    test_tool_use,
+)
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, CustomTestCase
+
+
+class TestSRTBackend(CustomTestCase):
+    backend = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.backend = sgl.Runtime(
+            model_path=DEFAULT_MODEL_NAME_FOR_TEST, cuda_graph_max_bs=4
+        )
+        sgl.set_default_backend(cls.backend)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.backend.shutdown()
+
+    def test_few_shot_qa(self):
+        test_few_shot_qa()
+
+    def test_mt_bench(self):
+        test_mt_bench()
+
+    def test_select(self):
+        test_select(check_answer=False)
+
+    def test_decode_int(self):
+        test_decode_int()
+
+    def test_decode_json_regex(self):
+        test_decode_json_regex()
+
+    def test_expert_answer(self):
+        test_expert_answer()
+
+    def test_tool_use(self):
+        test_tool_use()
+
+    def test_parallel_decoding(self):
+        test_parallel_decoding()
+
+    def test_stream(self):
+        test_stream()
+
+    def test_regex(self):
+        test_regex()
+
+    def test_dtype_gen(self):
+        test_dtype_gen()
+
+    def test_hellaswag_select(self):
+        # Run twice to capture more bugs
+        for _ in range(2):
+            accuracy, latency = test_hellaswag_select()
+            self.assertGreater(accuracy, 0.60)
+
+    def test_gen_min_new_tokens(self):
+        test_gen_min_new_tokens()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_tracing.py b/test/lang/test_tracing.py
new file mode 100644
index 000000000..3f02ac52b
--- /dev/null
+++ b/test/lang/test_tracing.py
@@ -0,0 +1,129 @@
+import unittest
+
+import sglang as sgl
+from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestTracing(CustomTestCase):
+    def test_few_shot_qa(self):
+        @sgl.function
+        def few_shot_qa(s, question):
+            s += "The following are questions with answers.\n\n"
+            s += "Q: What is the capital of France?\n"
+            s += "A: Paris\n"
+            s += "Q: " + question + "\n"
+            s += "A:" + sgl.gen("answer", stop="\n")
+
+        tracer = few_shot_qa.trace()
+        # print(tracer.last_node.print_graph_dfs() + "\n")
+
+    def test_select(self):
+        @sgl.function
+        def capital(s):
+            s += "The capital of France is"
+            s += sgl.select("capital", ["Paris. ", "London. "])
+            s += "It is a city" + sgl.gen("description", stop=".")
+
+        tracer = capital.trace()
+        # print(tracer.last_node.print_graph_dfs() + "\n")
+
+    def test_raise_warning(self):
+        @sgl.function
+        def wrong(s, question):
+            s += f"I want to ask {question}"
+
+        try:
+            tracer = wrong.trace()
+            raised = False
+        except TypeError:
+            raised = True
+
+        assert raised
+
+    def test_multi_function(self):
+        @sgl.function
+        def expand(s, tip):
+            s += (
+                "Please expand the following tip into a detailed paragraph:"
+                + tip
+                + "\n"
+            )
+            s += sgl.gen("detailed_tip")
+
+        @sgl.function
+        def tip_suggestion(s, topic):
+            s += "Here are 2 tips for " + topic + ".\n"
+
+            s += "1." + sgl.gen("tip_1", stop=["\n", ":", "."]) + "\n"
+            s += "2." + sgl.gen("tip_2", stop=["\n", ":", "."]) + "\n"
+
+            branch1 = expand(tip=s["tip_1"])
+            branch2 = expand(tip=s["tip_2"])
+
+            s += "Tip 1: " + branch1["detailed_tip"] + "\n"
+            s += "Tip 2: " + branch2["detailed_tip"] + "\n"
+            s += "In summary" + sgl.gen("summary")
+
+        compiled = tip_suggestion.compile()
+        # compiled.print_graph()
+
+        sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
+        state = compiled.run(topic="staying healthy")
+        # print(state.text() + "\n")
+
+        states = compiled.run_batch(
+            [
+                {"topic": "staying healthy"},
+                {"topic": "staying happy"},
+                {"topic": "earning money"},
+            ],
+            temperature=0,
+        )
+        # for s in states:
+        #     print(s.text() + "\n")
+
+    def test_role(self):
+        @sgl.function
+        def multi_turn_chat(s):
+            s += sgl.user("Who are you?")
+            s += sgl.assistant(sgl.gen("answer_1"))
+            s += sgl.user("Who created you?")
+            s += sgl.assistant(sgl.gen("answer_2"))
+
+        backend = BaseBackend()
+        backend.chat_template = get_chat_template("llama-2-chat")
+
+        compiled = multi_turn_chat.compile(backend=backend)
+        # compiled.print_graph()
+
+    def test_fork(self):
+        @sgl.function
+        def tip_suggestion(s):
+            s += (
+                "Here are three tips for staying healthy: "
+                "1. Balanced Diet; "
+                "2. Regular Exercise; "
+                "3. Adequate Sleep\n"
+            )
+
+            forks = s.fork(3)
+            for i in range(3):
+                forks[i] += f"Now, expand tip {i+1} into a paragraph:\n"
+                forks[i] += sgl.gen(f"detailed_tip")
+
+            s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
+            s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
+            s += "Tip 3:" + forks[2]["detailed_tip"] + "\n"
+            s += "In summary" + sgl.gen("summary")
+
+        tracer = tip_suggestion.trace()
+        # print(tracer.last_node.print_graph_dfs())
+
+        a = tip_suggestion.run(backend=sgl.OpenAI("gpt-3.5-turbo-instruct"))
+        # print(a.text())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/lang/test_vertexai_backend.py b/test/lang/test_vertexai_backend.py
new file mode 100644
index 000000000..83ce7fc0b
--- /dev/null
+++ b/test/lang/test_vertexai_backend.py
@@ -0,0 +1,53 @@
+import unittest
+
+from sglang import VertexAI, set_default_backend
+from sglang.test.test_programs import (
+    test_expert_answer,
+    test_few_shot_qa,
+    test_image_qa,
+    test_mt_bench,
+    test_parallel_decoding,
+    test_parallel_encoding,
+    test_stream,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestVertexAIBackend(CustomTestCase):
+    backend = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.backend = VertexAI("gemini-1.5-pro-001")
+
+    def test_few_shot_qa(self):
+        set_default_backend(self.backend)
+        test_few_shot_qa()
+
+    def test_mt_bench(self):
+        set_default_backend(self.backend)
+        test_mt_bench()
+
+    def test_expert_answer(self):
+        set_default_backend(self.backend)
+        test_expert_answer(check_answer=False)
+
+    def test_parallel_decoding(self):
+        set_default_backend(self.backend)
+        test_parallel_decoding()
+
+    def test_parallel_encoding(self):
+        set_default_backend(self.backend)
+        test_parallel_encoding()
+
+    def test_image_qa(self):
+        set_default_backend(self.backend)
+        test_image_qa()
+
+    def test_stream(self):
+        set_default_backend(self.backend)
+        test_stream()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/pytest.ini b/test/pytest.ini
new file mode 100644
index 000000000..2f4c80e30
--- /dev/null
+++ b/test/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+asyncio_mode = auto
diff --git a/test/srt/ascend/test_ascend_deepep.py b/test/srt/ascend/test_ascend_deepep.py
new file mode 100644
index 000000000..6ccd34d27
--- /dev/null
+++ b/test/srt/ascend/test_ascend_deepep.py
@@ -0,0 +1,121 @@
+import os
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-R1-0528-W8A8": {
+        "accuracy": 0.95,
+        "latency": 1000,
+        "output_throughput": 6,
+    },
+}
+
+
+class TestAscendDeepEP(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+
+        cls.common_args = [
+            "--trust-remote-code",
+            "--attention-backend",
+            "ascend",
+            "--quantization",
+            "w8a8_int8",
+            "--mem-fraction-static",
+            0.9,
+            "--max-running-requests",
+            32,
+            "--disable-radix-cache",
+            "--chunked-prefill-size",
+            32768,
+            "--disable-cuda-graph",
+            "--tp-size",
+            16,
+            "--dp-size",
+            1,
+            "--ep-size",
+            16,
+            "--moe-a2a-backend",
+            "deepep",
+            "--deepep-mode",
+            "auto",
+        ]
+
+        cls.extra_envs = {
+            "HCCL_BUFFSIZE": "500",
+            "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK": "32",
+        }
+        os.environ.update(cls.extra_envs)
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=1500,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_graph_tp1_bf16.py b/test/srt/ascend/test_ascend_graph_tp1_bf16.py
new file mode 100644
index 000000000..95c6b7bcf
--- /dev/null
+++ b/test/srt/ascend/test_ascend_graph_tp1_bf16.py
@@ -0,0 +1,95 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 150,
+        "output_throughput": 30,
+    },
+}
+
+
+class TestAscendGraphTp1Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_graph_tp2_bf16.py b/test/srt/ascend/test_ascend_graph_tp2_bf16.py
new file mode 100644
index 000000000..f7c3c6537
--- /dev/null
+++ b/test/srt/ascend/test_ascend_graph_tp2_bf16.py
@@ -0,0 +1,97 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 180,
+        "output_throughput": 20,
+    },
+}
+
+
+class TestAscendGraphTp2Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--tp-size",
+            2,
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py b/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py
new file mode 100644
index 000000000..6de97b04d
--- /dev/null
+++ b/test/srt/ascend/test_ascend_mla_fia_w8a8int8.py
@@ -0,0 +1,103 @@
+import os
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V2-Lite-W8A8": {
+        "accuracy": 0.34,
+        "latency": 1000,
+        "output_throughput": 6,
+    },
+}
+
+
+class TestAscendMlaW8A8Int8(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--quantization",
+            "w8a8_int8",
+            "--tp-size",
+            2,
+            "--disable-radix-cache",
+        ]
+
+    def test_a_gsm8k(self):
+        os.environ["ASCEND_USE_FIA"] = "true"
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_mla_w8a8int8.py b/test/srt/ascend/test_ascend_mla_w8a8int8.py
new file mode 100644
index 000000000..70f7edab4
--- /dev/null
+++ b/test/srt/ascend/test_ascend_mla_w8a8int8.py
@@ -0,0 +1,101 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V2-Lite-W8A8": {
+        "accuracy": 0.34,
+        "latency": 1000,
+        "output_throughput": 6,
+    },
+}
+
+
+class TestAscendMlaW8A8Int8(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--quantization",
+            "w8a8_int8",
+            "--tp-size",
+            4,
+            "--disable-radix-cache",
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_tp1_bf16.py b/test/srt/ascend/test_ascend_tp1_bf16.py
new file mode 100644
index 000000000..f854605ce
--- /dev/null
+++ b/test/srt/ascend/test_ascend_tp1_bf16.py
@@ -0,0 +1,96 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.84,
+        "latency": 150,
+        "output_throughput": 30,
+    },
+}
+
+
+class TestAscendTp1Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_tp2_bf16.py b/test/srt/ascend/test_ascend_tp2_bf16.py
new file mode 100644
index 000000000..d5e141c9f
--- /dev/null
+++ b/test/srt/ascend/test_ascend_tp2_bf16.py
@@ -0,0 +1,98 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 180,
+        "output_throughput": 20,
+    },
+}
+
+
+class TestAscendTp2Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--tp-size",
+            2,
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_tp2_fia_bf16.py b/test/srt/ascend/test_ascend_tp2_fia_bf16.py
new file mode 100644
index 000000000..bdd1c5733
--- /dev/null
+++ b/test/srt/ascend/test_ascend_tp2_fia_bf16.py
@@ -0,0 +1,101 @@
+import os
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 180,
+        "output_throughput": 20,
+    },
+}
+
+
+class TestAscendTp2Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--tp-size",
+            2,
+            "--disable-radix-cache",
+        ]
+
+    def test_a_gsm8k(self):
+        os.environ["ASCEND_USE_FIA"] = "true"
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_tp4_bf16.py b/test/srt/ascend/test_ascend_tp4_bf16.py
new file mode 100644
index 000000000..bb7d90e4f
--- /dev/null
+++ b/test/srt/ascend/test_ascend_tp4_bf16.py
@@ -0,0 +1,101 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen3-30B-A3B-Instruct-2507": {
+        "accuracy": 0.90,
+        "latency": 180,
+        "output_throughput": 20,
+    },
+}
+
+
+class TestAscendTp4Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--mem-fraction-static",
+            0.7,
+            "--max-running-requests",
+            32,
+            "--attention-backend",
+            "ascend",
+            "--cuda-graph-max-bs",
+            32,
+            "--tp-size",
+            4,
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=1800,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ascend/test_ascend_w8a8_quantization.py b/test/srt/ascend/test_ascend_w8a8_quantization.py
new file mode 100644
index 000000000..bf139f46a
--- /dev/null
+++ b/test/srt/ascend/test_ascend_w8a8_quantization.py
@@ -0,0 +1,104 @@
+"""
+Usage:
+python3 -m unittest test_ascend_w8a8_quantization.TestAscendW8A8.test_gsm8k
+"""
+
+import os
+import time
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
+    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1"
+DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+    7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
+)
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
+
+
+class TestAscendW8A8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--disable-cuda-graph",
+                "--device",
+                "npu",
+                "--attention-backend",
+                "ascend",
+                "--quantization",
+                "w8a8_int8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        base_url = DEFAULT_URL_FOR_TEST
+        url = urlparse(base_url)
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{url.hostname}",
+            port=int(url.port),
+        )
+        metrics = run_eval(args)
+        print(metrics)
+
+        self.assertGreaterEqual(metrics["accuracy"], 0.25)
+        self.assertGreaterEqual(metrics["output_throughput"], 1000)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(res["text"])
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+
+        if is_in_ci():
+            self.assertGreaterEqual(throughput, 25)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/configs/deepseek_v3.yaml b/test/srt/configs/deepseek_v3.yaml
new file mode 100644
index 000000000..82d059cba
--- /dev/null
+++ b/test/srt/configs/deepseek_v3.yaml
@@ -0,0 +1,28 @@
+tasks:
+  - name: sglang-8192-1024-concurrency1
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 1 --num-prompts 5 --output-file deepseek_v3_results.jsonl
+
+  - name: sglang-8192-1024-concurrency2
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 2 --num-prompts 10 --output-file deepseek_v3_results.jsonl
+
+  - name: sglang-8192-1024-concurrency4
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 4 --num-prompts 20 --output-file deepseek_v3_results.jsonl
+
+  - name: sglang-8192-1024-concurrency8
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 8 --num-prompts 32 --output-file deepseek_v3_results.jsonl
+
+  - name: sglang-8192-1024-concurrency16
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 16 --num-prompts 48 --output-file deepseek_v3_results.jsonl
+
+  - name: sglang-8192-1024-concurrency24
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 24 --num-prompts 72 --output-file deepseek_v3_results.jsonl
+
+  - name: sglang-8192-1024-concurrency32
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 32 --num-prompts 96 --output-file deepseek_v3_results.jsonl
diff --git a/test/srt/configs/deepseek_v3_long_context.yaml b/test/srt/configs/deepseek_v3_long_context.yaml
new file mode 100644
index 000000000..df416a429
--- /dev/null
+++ b/test/srt/configs/deepseek_v3_long_context.yaml
@@ -0,0 +1,28 @@
+tasks:
+  - name: sglang-32000-100-concurrency1
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 1 --num-prompts 5 --output-file deepseek_v3_long_context_results.jsonl
+
+  - name: sglang-32000-100-concurrency2
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 2 --num-prompts 10 --output-file deepseek_v3_long_context_results.jsonl
+
+  - name: sglang-32000-100-concurrency4
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 4 --num-prompts 20 --output-file deepseek_v3_long_context_results.jsonl
+
+  - name: sglang-32000-100-concurrency8
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 8 --num-prompts 32 --output-file deepseek_v3_long_context_results.jsonl
+
+  - name: sglang-32000-100-concurrency16
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 16 --num-prompts 48 --output-file deepseek_v3_long_context_results.jsonl
+
+  - name: sglang-32000-100-concurrency24
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 24 --num-prompts 72 --output-file deepseek_v3_long_context_results.jsonl
+
+  - name: sglang-32000-100-concurrency32
+    server_cmd: python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code --disable-radix-cache --max-prefill-tokens 32768
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 32000 --random-output-len 100 --max-concurrency 32 --num-prompts 96 --output-file deepseek_v3_long_context_results.jsonl
diff --git a/test/srt/configs/llama_405b.yaml b/test/srt/configs/llama_405b.yaml
new file mode 100644
index 000000000..db0c816fb
--- /dev/null
+++ b/test/srt/configs/llama_405b.yaml
@@ -0,0 +1,28 @@
+tasks:
+  - name: sglang-8192-1024-concurrency1
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 1 --num-prompts 5 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency2
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 2 --num-prompts 10 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency4
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 4 --num-prompts 20 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency8
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 8 --num-prompts 32 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency16
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 16 --num-prompts 48 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency24
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 24 --num-prompts 72 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency32
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --dataset-name random --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 32 --num-prompts 96 --output-file llama_405b_results.jsonl
diff --git a/test/srt/configs/random_config.yaml b/test/srt/configs/random_config.yaml
new file mode 100644
index 000000000..eae8c27f4
--- /dev/null
+++ b/test/srt/configs/random_config.yaml
@@ -0,0 +1,25 @@
+tasks:
+  - name: sglang-128-4
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440
+  - name: vllm-128-4
+    server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+    client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440
+  - name: sglang-2000-100
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120
+  - name: vllm-2000-100
+    server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+    client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120
+  - name: sglang-4000-200
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480
+  - name: vllm-4000-200
+    server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+    client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480
+  - name: sglang-32000-100
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60
+  - name: vllm-32000-100
+    server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+    client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60
diff --git a/test/srt/configs/random_flashinfer_vs_triton_config.yaml b/test/srt/configs/random_flashinfer_vs_triton_config.yaml
new file mode 100644
index 000000000..7f4a386dd
--- /dev/null
+++ b/test/srt/configs/random_flashinfer_vs_triton_config.yaml
@@ -0,0 +1,25 @@
+tasks:
+  - name: sglang-128-4
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440
+  - name: sglang-triton-128-4
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 128 --random-output 4 --request-rate 24 --num-prompt 1440
+  - name: sglang-2000-100
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120
+  - name: sglang-triton-2000-100
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 2000 --random-output 100 --request-rate 2 --num-prompt 120
+  - name: sglang-4000-200
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480
+  - name: sglang-triton-4000-200
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 4000 --random-output 200 --request-rate 8 --num-prompt 480
+  - name: sglang-32000-100
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60
+  - name: sglang-triton-32000-100
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache --attention-backend triton
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name random --random-input 32000 --random-output 100 --request-rate 1 --num-prompt 60
diff --git a/test/srt/configs/sharegpt_config.yaml b/test/srt/configs/sharegpt_config.yaml
new file mode 100644
index 000000000..a80b96c8e
--- /dev/null
+++ b/test/srt/configs/sharegpt_config.yaml
@@ -0,0 +1,7 @@
+tasks:
+  - name: sglang-benchmark
+    server_cmd: python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --disable-radix-cache
+    client_cmd: python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --request-rate 16
+  - name: vllm-benchmark
+    server_cmd: python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+    client_cmd: python3 -m sglang.bench_serving --backend vllm --dataset-name sharegpt --request-rate 16
diff --git a/test/srt/cpu/test_activation.py b/test/srt/cpu/test_activation.py
new file mode 100644
index 000000000..1234fc631
--- /dev/null
+++ b/test/srt/cpu/test_activation.py
@@ -0,0 +1,55 @@
+import itertools
+import unittest
+
+import sgl_kernel
+import torch
+import torch.nn.functional as F
+from utils import GeluAndMul, SiluAndMul, precision
+
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class TestActivation(CustomTestCase):
+    M = [128, 129, 257]
+    N = [22016, 22018]
+    dtype = [torch.float16, torch.bfloat16]
+
+    def _silu_and_mul_test(self, m, n, dtype):
+        x = torch.randn([m, n], dtype=dtype)
+
+        out = torch.ops.sgl_kernel.silu_and_mul_cpu(x)
+        ref_out = SiluAndMul(x)
+
+        atol = rtol = precision[ref_out.dtype]
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+    def _gelu_and_mul_test(self, m, n, dtype):
+        x = torch.randn([m, n], dtype=dtype)
+
+        out = torch.ops.sgl_kernel.gelu_and_mul_cpu(x)
+        ref_out = GeluAndMul(x, approximate="none")
+
+        atol = rtol = precision[ref_out.dtype]
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+    def _gelu_tanh_and_mul_test(self, m, n, dtype):
+        x = torch.randn([m, n], dtype=dtype)
+
+        out = torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu(x)
+        ref_out = GeluAndMul(x, approximate="tanh")
+
+        atol = rtol = precision[ref_out.dtype]
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+    def test_activation(self):
+        for params in itertools.product(self.M, self.N, self.dtype):
+            with self.subTest(m=params[0], n=params[1], dtype=params[2]):
+                self._silu_and_mul_test(*params)
+                self._gelu_and_mul_test(*params)
+                self._gelu_tanh_and_mul_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_binding.py b/test/srt/cpu/test_binding.py
new file mode 100644
index 000000000..d3cc329af
--- /dev/null
+++ b/test/srt/cpu/test_binding.py
@@ -0,0 +1,28 @@
+import re
+import unittest
+
+import sgl_kernel
+import torch
+
+kernel = torch.ops.sgl_kernel
+
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestGemm(CustomTestCase):
+    def test_binding(self):
+        start_id = 1
+        n_cpu = 6
+
+        expected_cores = list(map(str, range(start_id, start_id + n_cpu)))
+        cpu_ids = ",".join(expected_cores)
+        output = kernel.init_cpu_threads_env(cpu_ids)
+
+        bindings = re.findall(r"OMP tid: \d+, core (\d+)", output)
+        self.assertEqual(len(bindings), n_cpu)
+
+        self.assertEqual(bindings, expected_cores)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_decode.py b/test/srt/cpu/test_decode.py
new file mode 100644
index 000000000..c77378e1a
--- /dev/null
+++ b/test/srt/cpu/test_decode.py
@@ -0,0 +1,170 @@
+import unittest
+
+import sgl_kernel
+import torch
+from torch.nn.functional import scaled_dot_product_attention
+
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class TestDecodeAttention(CustomTestCase):
+    def _run_sdpa_forward_decode(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+            seq_len_q = 1
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            per_req_out = (
+                scaled_dot_product_attention(
+                    per_req_query.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    enable_gqa=enable_gqa,
+                    scale=scaling,
+                    is_causal=causal,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out
+            start_q, start_kv = end_q, end_kv
+
+        return output
+
+    def _test_grouped_decode_attention_once(self, B, H_Q, H_KV, D, D_V, device):
+        dtype = torch.bfloat16
+        # This represents the number of tokens already in the sequence
+        seq_len = 1024
+        total_tokens = B * seq_len
+        sm_scale = 1.0 / (D**0.5)
+        logit_cap = 0.0
+        num_kv_splits = 8
+        enable_gqa = H_Q != H_KV
+
+        # q represents the new token being generated, one per batch
+        q = torch.randn(B, H_Q, D, dtype=dtype, device=device)
+
+        # k_buffer and v_buffer represent all previous tokens
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+        v_buffer = torch.randn(total_tokens, H_KV, D_V, dtype=dtype, device=device)
+
+        key = torch.randn(B, H_KV, D, dtype=dtype)
+        value = torch.randn(B, H_KV, D_V, dtype=dtype)
+        loc = torch.randint(0, 10, (B,)).to(torch.int64)
+
+        # set kv cache
+        k_buffer[loc] = key
+        v_buffer[loc] = value
+
+        # o will have the same shape as q
+        o = torch.zeros(B, H_Q, D_V, dtype=dtype, device=device)
+        o_grouped = torch.zeros(B, H_Q, D_V, dtype=dtype, device=device)
+
+        req_to_token = (
+            torch.arange(total_tokens, device=device)
+            .reshape(B, seq_len)
+            .to(torch.int32)
+        )
+        b_req_idx = torch.arange(B, device=device).to(torch.int64)
+        b_seq_len = torch.full((B,), seq_len, device=device).to(torch.int64)
+
+        attn_logits = torch.empty(
+            (B, H_Q, num_kv_splits, D_V + 1),
+            dtype=torch.float32,
+            device=device,
+        )
+
+        # k_buffer, v_buffer, query, key and value supports non-contiguous tensors
+        k_buffer = k_buffer.transpose(0, 1).contiguous().transpose(0, 1)
+        v_buffer = v_buffer.transpose(0, 1).contiguous().transpose(0, 1)
+        q = q.transpose(0, 1).contiguous().transpose(0, 1)
+        key = key.transpose(0, 1).contiguous().transpose(0, 1)
+        value = value.transpose(0, 1).contiguous().transpose(0, 1)
+        torch.ops.sgl_kernel.decode_attention_cpu(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            key,
+            value,
+            loc,
+            attn_logits,
+            req_to_token,
+            b_req_idx,
+            b_seq_len,
+            sm_scale,
+            logit_cap,
+        )
+
+        self._run_sdpa_forward_decode(
+            q,
+            o_grouped,
+            k_buffer,
+            v_buffer,
+            req_to_token,
+            b_req_idx,
+            b_seq_len,
+            scaling=sm_scale,
+            enable_gqa=enable_gqa,
+        )
+
+        cos_sim = torch.nn.functional.cosine_similarity(
+            o.flatten(), o_grouped.flatten(), dim=0
+        )
+        self.assertGreater(cos_sim.item(), 0.99)
+        torch.testing.assert_close(o, o_grouped, atol=3e-2, rtol=1e-6)
+
+    def _test_grouped_decode_attention(self, device="cuda"):
+        configs = [
+            (2, 16, 16, 64, 64),
+            (2, 16, 1, 16, 16),
+            (2, 32, 8, 33, 55),
+            (2, 16, 1, 64, 64),
+            (2, 64, 1, 13, 13),
+            (2, 128, 1, 80, 80),
+            (2, 128, 2, 512, 512),
+            (1, 16, 1, 576, 512),
+            (1, 16, 16, 576, 512),
+            (1, 22, 1, 576, 512),
+            (1, 40, 8, 128, 128),
+        ]
+
+        for B, H_Q, H_KV, D, D_V in configs:
+            self._test_grouped_decode_attention_once(
+                B, H_Q, H_KV, D, D_V, device=device
+            )
+
+    def test_grouped_decode_attention(self):
+        self._test_grouped_decode_attention("cpu")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_extend.py b/test/srt/cpu/test_extend.py
new file mode 100644
index 000000000..9c6f5b394
--- /dev/null
+++ b/test/srt/cpu/test_extend.py
@@ -0,0 +1,190 @@
+import unittest
+
+import sgl_kernel
+import torch
+from torch.nn.functional import scaled_dot_product_attention
+
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class TestExtendAttention(CustomTestCase):
+
+    def _run_sdpa_forward_extend(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        extend_prefix_lens: torch.Tensor,
+        extend_seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+
+        assert seq_lens.shape[0] == extend_prefix_lens.shape[0]
+        assert seq_lens.shape[0] == extend_seq_lens.shape[0]
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+
+            extend_seq_len_q = extend_seq_lens[seq_idx]
+            prefill_seq_len_q = extend_prefix_lens[seq_idx]
+
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + extend_seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+            per_req_query_redudant = torch.empty(
+                (per_req_query.shape[0], seq_len_kv, per_req_query.shape[2]),
+                dtype=per_req_query.dtype,
+                device=per_req_query.device,
+            )
+
+            per_req_query_redudant[:, prefill_seq_len_q:, :] = per_req_query
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            per_req_out_redudant = (
+                scaled_dot_product_attention(
+                    per_req_query_redudant.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    enable_gqa=enable_gqa,
+                    scale=scaling,
+                    is_causal=causal,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out_redudant[prefill_seq_len_q:, :, :]
+            start_q, start_kv = end_q, end_kv
+        return output
+
+    def _test_extend_attention_once(self, B, N_CTX, H_Q, H_KV, D, DV, mla=False):
+        dtype = torch.bfloat16
+
+        b_seq_len_prefix = torch.randint(1, N_CTX // 2, (B,), dtype=torch.int32)
+        if mla:
+            b_seq_len_prefix.zero_()
+        b_seq_len_extend = torch.randint(1, N_CTX // 2, (B,), dtype=torch.int32)
+        b_seq_len = b_seq_len_prefix + b_seq_len_extend
+        max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
+
+        b_req_idx = torch.arange(B, dtype=torch.int32)
+        req_to_tokens = torch.empty((B, max_len_in_batch), dtype=torch.int32)
+        b_start_loc = torch.zeros((B,), dtype=torch.int32)
+        b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32)
+        b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+
+        for i in range(B):
+            req_to_tokens[i, : b_seq_len[i]] = torch.arange(
+                b_start_loc[i], b_start_loc[i] + b_seq_len[i]
+            )
+
+        total_token_num = torch.sum(b_seq_len).item()
+        extend_token_num = torch.sum(b_seq_len_extend).item()
+
+        H_BUF = 1 if mla else H_KV
+        k_buffer = torch.randn((total_token_num, H_BUF, D), dtype=dtype)
+        v_buffer = torch.randn((total_token_num, H_BUF, DV), dtype=dtype)
+
+        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype)
+        v_extend = torch.empty((extend_token_num, H_KV, DV), dtype=dtype)
+        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype)
+
+        for i in range(B):
+            extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+            extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+            extend_start = b_start_loc_extend[i]
+            extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
+            k_extend[extend_start:extend_end] = k_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            v_extend[extend_start:extend_end] = v_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            q_extend[extend_start:extend_end] = torch.randn(
+                (b_seq_len_extend[i], H_Q, D), dtype=dtype
+            )
+
+        # q_extend, k_extend, v_extend, k_buffer and v_buffer supports non-contiguous tensors
+        q_extend = q_extend.transpose(0, 1).contiguous().transpose(0, 1)
+        k_extend = k_extend.transpose(0, 1).contiguous().transpose(0, 1)
+        v_extend = v_extend.transpose(0, 1).contiguous().transpose(0, 1)
+        k_buffer = k_buffer.transpose(0, 1).contiguous().transpose(0, 1)
+        v_buffer = v_buffer.transpose(0, 1).contiguous().transpose(0, 1)
+
+        b_seq_len_extend = b_seq_len - b_seq_len_prefix
+        b_start_loc_extend = torch.zeros_like(b_seq_len)
+        b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+        max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+
+        sm_scale = 1.0 / (D**0.5)
+        logit_cap = 0.0
+
+        # handle index type
+        b_req_idx = b_req_idx.to(torch.int64)
+        b_seq_len = b_seq_len.to(torch.int64)
+
+        enable_gqa = H_Q != H_KV
+        o_ref = torch.empty((extend_token_num, H_Q, DV), dtype=dtype)
+        self._run_sdpa_forward_extend(
+            q_extend,
+            o_ref,
+            k_buffer,
+            v_buffer,
+            req_to_tokens,
+            b_req_idx,
+            b_seq_len,
+            b_seq_len_prefix,
+            b_seq_len_extend,
+            scaling=sm_scale,
+            enable_gqa=enable_gqa,
+            causal=True,
+        )
+
+        o_extend = torch.empty((extend_token_num, H_Q, DV), dtype=dtype)
+        torch.ops.sgl_kernel.extend_attention_cpu(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            req_to_tokens,
+            b_req_idx,
+            b_seq_len,
+            b_seq_len_extend,
+            b_start_loc_extend,
+            max_len_extend,
+            sm_scale,
+            logit_cap,
+        )
+
+        torch.testing.assert_close(o_ref, o_extend, atol=1e-2, rtol=1e-2)
+
+    def test_extend_attention(self):
+        for is_mla in [True, False]:
+            self._test_extend_attention_once(1, 123, 1, 1, 128, 96, is_mla)
+            self._test_extend_attention_once(1, 123, 16, 1, 128, 96, is_mla)
+            self._test_extend_attention_once(4, 1230, 16, 4, 128, 96, is_mla)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_gemm.py b/test/srt/cpu/test_gemm.py
new file mode 100644
index 000000000..a9fe5066a
--- /dev/null
+++ b/test/srt/cpu/test_gemm.py
@@ -0,0 +1,189 @@
+import itertools
+import unittest
+
+# TODO: use interface in cpu.py
+import sgl_kernel
+import torch
+import torch.nn as nn
+from utils import (
+    convert_weight,
+    native_w8a8_per_token_matmul,
+    per_token_quant_int8,
+    precision,
+)
+
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class Mod(nn.Module):
+    def __init__(self, input_channel, output_channel, has_bias):
+        super(Mod, self).__init__()
+        self.linear = torch.nn.Linear(input_channel, output_channel, has_bias)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
+class TestGemm(CustomTestCase):
+    M = [1, 101]
+    N = [16, 32 * 13]
+    K = [32 * 16]
+    has_bias = [False, True]
+
+    M_int8 = [2, 128]
+    N_int8 = [32 * 12]
+    K_int8 = [32 * 17]
+
+    M_fp8 = [1, 11]
+    N_fp8 = [128, 224]
+    K_fp8 = [512, 576]
+
+    def _bf16_gemm(self, M, N, K, has_bias):
+
+        mat1 = torch.randn(M, K, dtype=torch.bfloat16)
+        mat2 = torch.randn(N, K, dtype=torch.bfloat16)
+
+        ref = torch.matmul(mat1.float(), mat2.float().t())
+        if has_bias:
+            bias = torch.randn(N, dtype=torch.float32)
+            ref.add_(bias.bfloat16())
+
+        ref = ref.bfloat16()
+
+        out = torch.ops.sgl_kernel.weight_packed_linear(
+            mat1, mat2, bias if has_bias else None, False
+        )
+
+        packed_mat2 = torch.ops.sgl_kernel.convert_weight_packed(mat2)
+        out2 = torch.ops.sgl_kernel.weight_packed_linear(
+            mat1, packed_mat2, bias if has_bias else None, True
+        )
+
+        atol = rtol = precision[ref.dtype]
+        torch.testing.assert_close(ref, out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(ref, out2, atol=atol, rtol=rtol)
+
+    def test_bf16_gemm(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.has_bias,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                has_bias=params[3],
+            ):
+                self._bf16_gemm(*params)
+
+    def _int8_gemm(self, M, N, K, has_bias):
+        dtype = torch.bfloat16
+        A = torch.randn((M, K), dtype=dtype) / 10
+        Aq, As = per_token_quant_int8(A)
+
+        factor_for_scale = 1e-2
+        int8_max = 127
+        int8_min = -128
+
+        B = (torch.rand((N, K), dtype=torch.float32) - 0.5) * 2
+        Bq = (B * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8)
+        Bs = torch.rand(N) * factor_for_scale
+
+        bias = torch.randn(N) if has_bias else None
+        ref_out = native_w8a8_per_token_matmul(Aq, Bq, As, Bs, bias, dtype)
+
+        atol = rtol = precision[ref_out.dtype]
+
+        Aq2, As2 = torch.ops.sgl_kernel.per_token_quant_int8_cpu(A)
+        out = torch.ops.sgl_kernel.int8_scaled_mm_cpu(
+            Aq2, Bq, As2, Bs, bias if has_bias else None, torch.bfloat16, False
+        )
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+        # test the fused version
+        fused_out = torch.ops.sgl_kernel.int8_scaled_mm_with_quant(
+            A, Bq, Bs, bias if has_bias else None, torch.bfloat16, False
+        )
+        torch.testing.assert_close(ref_out, fused_out, atol=atol, rtol=rtol)
+
+    def test_int8_gemm(self):
+        for params in itertools.product(
+            self.M_int8,
+            self.N_int8,
+            self.K_int8,
+            self.has_bias,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                has_bias=params[3],
+            ):
+                self._int8_gemm(*params)
+
+    def _fp8_gemm(self, M, N, K, has_bias):
+        prepack = True
+        chunk = False
+        scale_block_size_N = 64
+        scale_block_size_K = 128
+        assert scale_block_size_N <= N
+        assert scale_block_size_K <= K
+        A_dtype = torch.bfloat16
+
+        model = Mod(K, N, has_bias).eval()
+        if chunk:
+            data = torch.randn(M, K + 6, dtype=A_dtype).narrow(1, 0, K)
+        else:
+            data = torch.randn(M, K, dtype=A_dtype)
+
+        weight = model.linear.weight  # (N, K)
+
+        if has_bias:
+            bias = model.linear.bias
+
+        fp8_weight, scales, dq_weight = convert_weight(
+            weight, [scale_block_size_N, scale_block_size_K], A_dtype
+        )
+
+        if has_bias:
+            ref = torch.matmul(data.to(A_dtype), dq_weight.T) + bias.to(A_dtype)
+        else:
+            ref = torch.matmul(data.to(A_dtype), dq_weight.T)
+
+        if prepack:
+            fp8_weight = torch.ops.sgl_kernel.convert_weight_packed(fp8_weight)
+
+        opt = torch.ops.sgl_kernel.fp8_scaled_mm_cpu(
+            data,
+            fp8_weight,
+            scales,
+            [scale_block_size_N, scale_block_size_K],
+            bias if has_bias else None,
+            data.dtype,
+            prepack,
+        )
+        atol = rtol = precision[ref.dtype]
+        torch.testing.assert_close(ref, opt, atol=atol, rtol=rtol)
+
+    def test_fp8_gemm(self):
+        for params in itertools.product(
+            self.M_fp8,
+            self.N_fp8,
+            self.K_fp8,
+            self.has_bias,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                has_bias=params[3],
+            ):
+                self._fp8_gemm(*params)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_mla.py b/test/srt/cpu/test_mla.py
new file mode 100644
index 000000000..1f0718d7a
--- /dev/null
+++ b/test/srt/cpu/test_mla.py
@@ -0,0 +1,157 @@
+import itertools
+import unittest
+
+import sgl_kernel
+import torch
+from torch.nn.functional import scaled_dot_product_attention
+from utils import precision
+
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class TestMLA(CustomTestCase):
+    def _run_sdpa_forward_decode(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        key: torch.Tensor,
+        loc: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        # set kv cache
+        k_cache[loc] = key
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+            seq_len_q = 1
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            per_req_out = (
+                scaled_dot_product_attention(
+                    per_req_query.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    enable_gqa=enable_gqa,
+                    scale=scaling,
+                    is_causal=causal,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out
+            start_q, start_kv = end_q, end_kv
+
+        return output
+
+    def _test_grouped_decode_attention_once(self, B, H_Q, H_KV, D, D_V, seq_len):
+        dtype = torch.bfloat16
+
+        total_tokens = B * seq_len
+        sm_scale = 1.0 / (D**0.5)
+        logit_cap = 0.0
+        num_kv_splits = 8
+        enable_gqa = H_Q != H_KV
+
+        # q represents the new token being generated, one per batch
+        q = torch.randn(B, H_Q, D, dtype=dtype)
+
+        # k_buffer and v_buffer represent all previous tokens
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype)
+        v_buffer = k_buffer.narrow(2, 0, D_V)
+
+        key = torch.randn(B, H_KV, D, dtype=dtype)
+        value = key.narrow(2, 0, D_V)
+        # make sure no duplicates in loc
+        loc = torch.randperm(total_tokens)[:B].to(torch.int64)
+
+        k_buffer2 = k_buffer.clone()
+        v_buffer2 = k_buffer2.narrow(2, 0, D_V)
+
+        # o will have the same shape as q
+        o = torch.zeros(B, H_Q, D_V, dtype=dtype)
+        o_grouped = torch.zeros(B, H_Q, D_V, dtype=dtype)
+
+        req_to_token = torch.arange(total_tokens).reshape(B, seq_len).to(torch.int32)
+        b_req_idx = torch.arange(B).to(torch.int64)
+        b_seq_len = torch.full((B,), seq_len).to(torch.int64)
+
+        attn_logits = torch.empty(
+            (B, H_Q, num_kv_splits, D_V + 1),
+            dtype=torch.float32,
+        )
+
+        torch.ops.sgl_kernel.decode_attention_cpu(
+            q,
+            k_buffer2,
+            v_buffer2,
+            o,
+            key,
+            value,
+            loc,
+            attn_logits,
+            req_to_token,
+            b_req_idx,
+            b_seq_len,
+            sm_scale,
+            logit_cap,
+        )
+
+        self._run_sdpa_forward_decode(
+            q,
+            o_grouped,
+            k_buffer,
+            v_buffer,
+            key,
+            loc,
+            req_to_token,
+            b_req_idx,
+            b_seq_len,
+            scaling=sm_scale,
+            enable_gqa=enable_gqa,
+        )
+
+        cos_sim = torch.nn.functional.cosine_similarity(
+            o.flatten(), o_grouped.flatten(), dim=0
+        )
+        atol = rtol = precision[q.dtype]
+        self.assertGreater(cos_sim.item(), 0.99)
+        torch.testing.assert_close(o, o_grouped, atol=atol, rtol=rtol)
+        torch.testing.assert_close(k_buffer, k_buffer2, atol=atol, rtol=rtol)
+        torch.testing.assert_close(v_buffer, v_buffer2, atol=atol, rtol=rtol)
+
+    def test_grouped_decode_attention(self):
+        configs = [
+            (1, 22, 1, 576, 512, 8 * 111),
+            (4, 22, 1, 576, 512, 8 * 128),
+            (40, 22, 1, 576, 512, 8 * 133),
+        ]
+
+        for B, H_Q, H_KV, D, D_V, seqlen in configs:
+            self._test_grouped_decode_attention_once(B, H_Q, H_KV, D, D_V, seqlen)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_moe.py b/test/srt/cpu/test_moe.py
new file mode 100644
index 000000000..96eb28020
--- /dev/null
+++ b/test/srt/cpu/test_moe.py
@@ -0,0 +1,265 @@
+import itertools
+import math
+import unittest
+
+# TODO: use interface in cpu.py
+import sgl_kernel
+import torch
+
+kernel = torch.ops.sgl_kernel
+
+torch.manual_seed(1234)
+
+from utils import (
+    BLOCK_K,
+    BLOCK_N,
+    factor_for_scale,
+    fp8_max,
+    fp8_min,
+    native_fp8_fused_moe,
+    precision,
+    scaled_weight,
+    torch_naive_fused_moe,
+    torch_w8a8_per_column_fused_moe,
+)
+
+from sglang.test.test_utils import CustomTestCase
+
+
+def fused_moe(a, w1, w2, score, topk, renormalize, prepack):
+
+    G = 1
+    topk_group = 1
+
+    B, D = a.shape
+    topk_weights = torch.empty(B, topk, dtype=torch.float32)
+    topk_ids = torch.empty(B, topk, dtype=torch.int32)
+    topk_weights, topk_ids = kernel.grouped_topk_cpu(
+        a, score, topk, renormalize, G, topk_group, 0, None, None
+    )
+
+    packed_w1 = kernel.convert_weight_packed(w1) if prepack else w1
+    packed_w2 = kernel.convert_weight_packed(w2) if prepack else w2
+
+    inplace = True
+    return kernel.fused_experts_cpu(
+        a,
+        packed_w1,
+        packed_w2,
+        topk_weights,
+        topk_ids,
+        inplace,
+        False,
+        False,
+        None,
+        None,
+        None,
+        None,
+        None,
+        prepack,
+    )
+
+
+class TestFusedExperts(CustomTestCase):
+    M = [2, 114]
+    N = [32]
+    K = [32]
+    E = [4]
+    topk = [2]
+    renormalize = [False, True]
+
+    M_int8 = [1, 39]
+    N_int8 = [128]
+    K_int8 = [256]
+    E_int8 = [8]
+    topk_int8 = [3]
+
+    M_fp8 = [2, 121]
+    N_fp8 = [352, 512]
+    K_fp8 = [256, 320]
+    E_fp8 = [8]
+    topk_fp8 = [4]
+
+    def _bf16_moe(self, m, n, k, e, topk, renormalize):
+        dtype = torch.bfloat16
+        prepack = True
+
+        a = torch.randn((m, k), device="cpu", dtype=dtype) / 10
+        w1 = torch.randn((e, 2 * n, k), device="cpu", dtype=dtype) / 10
+        w2 = torch.randn((e, k, n), device="cpu", dtype=dtype) / 10
+        score = torch.randn((m, e), device="cpu", dtype=dtype)
+
+        torch_output = torch_naive_fused_moe(a, w1, w2, score, topk, renormalize)
+        fused_output = fused_moe(a, w1, w2, score, topk, renormalize, prepack)
+
+        atol = rtol = precision[torch_output.dtype]
+        torch.testing.assert_close(torch_output, fused_output, atol=atol, rtol=rtol)
+
+    def test_bf16_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.topk,
+            self.renormalize,
+        ):
+            with self.subTest(
+                m=params[0],
+                n=params[1],
+                k=params[2],
+                e=params[3],
+                topk=params[4],
+                renormalize=params[5],
+            ):
+                self._bf16_moe(*params)
+
+    def _int8_moe(self, M, N, K, E, topk):
+        dtype = torch.bfloat16
+        prepack = True
+
+        # Initialize int8 quantization parameters
+        int8_factor_for_scale = 1e-2
+        int8_max = 127
+        int8_min = -128
+
+        # Input tensor
+        # M * K
+        a = torch.randn((M, K), dtype=dtype) / math.sqrt(K)
+
+        # Generate int8 weights
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2
+        w1 = (w1_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2
+        w2 = (w2_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+        # Generate scale for each column (per-column quantization)
+        w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * int8_factor_for_scale
+        w2_s = torch.rand(E, K, device=w2_fp32.device) * int8_factor_for_scale
+
+        # Calculate routing
+        score = torch.randn((M, E), dtype=dtype)
+        score = torch.softmax(score, dim=-1, dtype=torch.float32)
+        topk_weight, topk_ids = torch.topk(score, topk)
+
+        ref_out = torch_w8a8_per_column_fused_moe(
+            a, w1, w2, w1_s, w2_s, topk_weight, topk_ids, topk
+        )
+
+        inplace = True
+        packed_w1 = kernel.convert_weight_packed(w1) if prepack else w1
+        packed_w2 = kernel.convert_weight_packed(w2) if prepack else w2
+        out = kernel.fused_experts_cpu(
+            a,
+            packed_w1,
+            packed_w2,
+            topk_weight,
+            topk_ids.to(torch.int32),
+            inplace,
+            True,
+            False,
+            w1_s,
+            w2_s,
+            None,
+            None,
+            None,
+            prepack,
+        )
+
+        atol = rtol = precision[ref_out.dtype]
+        # Increase the tolerance for large input shapes
+        if M > 35:
+            atol = rtol = 0.02
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+    def test_int8_moe(self):
+        for params in itertools.product(
+            self.M_int8,
+            self.N_int8,
+            self.K_int8,
+            self.E_int8,
+            self.topk_int8,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+            ):
+                self._int8_moe(*params)
+
+    def _fp8_moe(self, M, N, K, E, topk):
+        dtype = torch.bfloat16
+
+        a = torch.randn(M, K, dtype=dtype) / math.sqrt(K)
+
+        w1_fp32 = torch.randn(E, 2 * N, K)
+        w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = torch.randn(E, K, N)
+        w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w1s = (
+            torch.randn(E, math.ceil(2 * N / BLOCK_N), math.ceil(K / BLOCK_K))
+            * factor_for_scale
+        )
+        w2s = (
+            torch.randn(E, math.ceil(K / BLOCK_N), math.ceil(N / BLOCK_K))
+            * factor_for_scale
+        )
+
+        w1_scaled = scaled_weight(w1, w1s)
+        w2_scaled = scaled_weight(w2, w2s)
+
+        score = torch.randn((M, E), dtype=dtype)
+        score = torch.softmax(score, dim=-1, dtype=torch.float32)
+        topk_weight, topk_ids = torch.topk(score, topk)
+
+        w1 = kernel.convert_weight_packed(w1)
+        w2 = kernel.convert_weight_packed(w2)
+
+        ref_out = native_fp8_fused_moe(
+            a, w1_scaled, w2_scaled, topk_weight, topk_ids, topk
+        )
+        out = kernel.fused_experts_cpu(
+            a,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids.to(torch.int32),
+            False,
+            False,
+            True,
+            w1s,
+            w2s,
+            [BLOCK_N, BLOCK_K],
+            None,
+            None,
+            True,
+        )
+
+        atol = rtol = precision[dtype]
+        torch.testing.assert_close(ref_out.bfloat16(), out, atol=atol, rtol=rtol)
+
+    def test_fp8_moe(self):
+        for params in itertools.product(
+            self.M_fp8,
+            self.N_fp8,
+            self.K_fp8,
+            self.E_fp8,
+            self.topk_fp8,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+            ):
+                self._fp8_moe(*params)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_norm.py b/test/srt/cpu/test_norm.py
new file mode 100644
index 000000000..75bacb198
--- /dev/null
+++ b/test/srt/cpu/test_norm.py
@@ -0,0 +1,90 @@
+import itertools
+import unittest
+from typing import Optional, Tuple, Union
+
+import sgl_kernel
+import torch
+from utils import make_non_contiguous, precision
+
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class TestNorm(CustomTestCase):
+    M = [4096, 1024]
+    N = [4096, 4096 + 13]
+    dtype = [torch.float16, torch.bfloat16]
+
+    def _forward_native(
+        self,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        variance_epsilon: float = 1e-6,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + variance_epsilon)
+        x = x.to(orig_dtype) * weight
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+    def _norm_test(self, m, n, dtype):
+
+        x = torch.randn([m, n], dtype=dtype)
+        x = make_non_contiguous(x)
+        hidden_size = x.size(-1)
+        weight = torch.randn(hidden_size, dtype=dtype)
+        variance_epsilon = 1e-6
+
+        out = torch.ops.sgl_kernel.rmsnorm_cpu(x, weight, variance_epsilon)
+        ref_out = self._forward_native(x, weight, variance_epsilon)
+
+        atol = rtol = precision[ref_out.dtype]
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+        ref_x = x.clone()
+        residual = torch.randn([m, hidden_size], dtype=dtype)
+        ref_residual = residual.clone()
+
+        torch.ops.sgl_kernel.fused_add_rmsnorm_cpu(
+            x, residual, weight, variance_epsilon
+        )
+
+        ref_x, ref_residual = self._forward_native(
+            ref_x, weight, variance_epsilon, ref_residual
+        )
+
+        torch.testing.assert_close(x, ref_x, atol=atol, rtol=rtol)
+        torch.testing.assert_close(residual, ref_residual, atol=atol, rtol=rtol)
+
+    def _l2norm_test(self, m, n, dtype):
+
+        x = torch.randn([m, n], dtype=dtype)
+        hidden_size = x.size(-1)
+        fake_ones_weight = torch.ones(hidden_size, dtype=dtype)
+        variance_epsilon = 1e-6
+
+        out = torch.ops.sgl_kernel.l2norm_cpu(x, variance_epsilon)
+        ref_out = self._forward_native(x, fake_ones_weight, variance_epsilon)
+
+        atol = rtol = precision[ref_out.dtype]
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+    def test_norm(self):
+        for params in itertools.product(self.M, self.N, self.dtype):
+            with self.subTest(m=params[0], n=params[1], dtype=params[2]):
+                self._norm_test(*params)
+                self._l2norm_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_qkv_proj_with_rope.py b/test/srt/cpu/test_qkv_proj_with_rope.py
new file mode 100644
index 000000000..dc90cc559
--- /dev/null
+++ b/test/srt/cpu/test_qkv_proj_with_rope.py
@@ -0,0 +1,432 @@
+import unittest
+
+import sgl_kernel
+import torch
+from utils import (
+    convert_weight,
+    native_w8a8_per_token_matmul,
+    per_token_quant_int8,
+    precision,
+)
+
+from sglang.srt.layers.rotary_embedding import _apply_rotary_emb
+from sglang.test.test_utils import CustomTestCase
+
+convert_weight_packed = torch.ops.sgl_kernel.convert_weight_packed
+qkv_proj_with_rope = torch.ops.sgl_kernel.qkv_proj_with_rope
+qkv_proj_with_rope_fused_weight = torch.ops.sgl_kernel.qkv_proj_with_rope_fused_weight
+torch.manual_seed(1234)
+# constants
+kv_lora_rank = 512
+qk_head_dim = 192
+qk_nope_head_dim = 128
+qk_rope_head_dim = 64
+rotary_dim = qk_rope_head_dim
+num_heads = 22
+q_lora_rank = 1536
+hidden_size = 7168
+B = 1
+eps = 1e-6
+
+
+def layernorm(x, weight, variance_epsilon=1e-6, residual=None):
+    orig_dtype = x.dtype
+    x = x.to(torch.float32)
+    variance = x.pow(2).mean(dim=-1, keepdim=True)
+    x = x * torch.rsqrt(variance + variance_epsilon)
+    return (x * weight).to(orig_dtype)
+
+
+def rotary_emb(q_pe, k_pe, pos, cos_sin_cache):
+    orig_dtype = q_pe.dtype
+    q_pe = q_pe.float()
+    k_pe = k_pe.float()
+    cos_sin_cache = cos_sin_cache.float()
+
+    query_rot = q_pe[..., :rotary_dim]
+    key_rot = k_pe[..., :rotary_dim]
+    cos_sin = cos_sin_cache[pos]
+    cos, sin = cos_sin.chunk(2, dim=-1)
+    query_rot = _apply_rotary_emb(query_rot, cos, sin, False)
+    key_rot = _apply_rotary_emb(key_rot, cos, sin, False)
+    return query_rot.to(orig_dtype), key_rot.to(orig_dtype)
+
+
+def native_torch(
+    q_input,
+    hidden_states,
+    q_a_proj_weight,
+    norm_weight1,
+    q_b_proj_weight,
+    w_kc,
+    kv_a_proj_weight,
+    norm_weight2,
+    pos,
+    cos_sin_cache,
+):
+
+    q = torch.matmul(hidden_states, q_a_proj_weight.t())
+    q = layernorm(q, norm_weight1)
+    q = torch.matmul(q, q_b_proj_weight.t()).view(-1, num_heads, qk_head_dim)
+
+    q_nope, q_pe = q.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
+    q_nope_out = torch.bmm(q_nope.transpose(0, 1), w_kc)
+
+    q_input[..., :kv_lora_rank] = q_nope_out.transpose(0, 1)
+    latent_cache = torch.matmul(hidden_states, kv_a_proj_weight.t())
+    v_input = latent_cache[..., :kv_lora_rank]
+    v_input = layernorm(v_input.contiguous(), norm_weight2).unsqueeze(1)
+    k_input = latent_cache.unsqueeze(1)
+    k_input[..., :kv_lora_rank] = v_input
+    k_pe = k_input[..., kv_lora_rank:]
+
+    q_pe, k_pe = rotary_emb(q_pe, k_pe, pos, cos_sin_cache)
+    q_input[..., kv_lora_rank:] = q_pe
+    k_input[..., kv_lora_rank:] = k_pe
+
+    return q_input, k_input, v_input
+
+
+def native_torch_int8(
+    q_input,
+    hidden_states,
+    w1_q,
+    w1_s,
+    norm_weight1,
+    w2_q,
+    w2_s,
+    w_kc,
+    w3_q,
+    w3_s,
+    norm_weight2,
+    pos,
+    cos_sin_cache,
+):
+
+    a_q, a_s = per_token_quant_int8(hidden_states)
+    q = native_w8a8_per_token_matmul(a_q, w1_q, a_s, w1_s, None, torch.bfloat16)
+    q = layernorm(q, norm_weight1)
+
+    a_q, a_s = per_token_quant_int8(q)
+    q = native_w8a8_per_token_matmul(a_q, w2_q, a_s, w2_s, None, torch.bfloat16).view(
+        -1, num_heads, qk_head_dim
+    )
+
+    q_nope, q_pe = q.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
+    q_nope_out = torch.bmm(q_nope.transpose(0, 1), w_kc)
+
+    q_input[..., :kv_lora_rank] = q_nope_out.transpose(0, 1)
+    a_q, a_s = per_token_quant_int8(hidden_states)
+    latent_cache = native_w8a8_per_token_matmul(
+        a_q, w3_q, a_s, w3_s, None, torch.bfloat16
+    )
+    v_input = latent_cache[..., :kv_lora_rank]
+    v_input = layernorm(v_input.contiguous(), norm_weight2).unsqueeze(1)
+    k_input = latent_cache.unsqueeze(1)
+    k_input[..., :kv_lora_rank] = v_input
+    k_pe = k_input[..., kv_lora_rank:]
+
+    q_pe, k_pe = rotary_emb(q_pe, k_pe, pos, cos_sin_cache)
+    q_input[..., kv_lora_rank:] = q_pe
+    k_input[..., kv_lora_rank:] = k_pe
+
+    return q_input, k_input, v_input
+
+
+class TestQKVProjWithROPE(CustomTestCase):
+    def test_bf16_qkv_proj_with_rope(self):
+        dtype = torch.bfloat16
+        hidden_states = torch.randn(B, hidden_size, dtype=dtype) / hidden_size
+        q_input = torch.empty(
+            B, num_heads, kv_lora_rank + qk_rope_head_dim, dtype=dtype
+        )
+        q_a_proj_weight = torch.randn(q_lora_rank, hidden_size, dtype=dtype) * 0.1
+        norm_weight1 = torch.randn(q_lora_rank, dtype=dtype)
+        q_b_proj_weight = (
+            torch.randn(num_heads * qk_head_dim, q_lora_rank, dtype=dtype) * 0.1
+        )
+        w_kc = torch.randn(num_heads, kv_lora_rank, qk_nope_head_dim, dtype=dtype) * 0.1
+        kv_a_proj_weight = (
+            torch.randn(kv_lora_rank + qk_rope_head_dim, hidden_size, dtype=dtype) * 0.1
+        )
+        fused_weight = torch.cat([q_a_proj_weight, kv_a_proj_weight], dim=0)
+        norm_weight2 = torch.randn(kv_lora_rank, dtype=dtype)
+        pos = torch.randint(10, 100, (B,))
+        cos_sin_cache = torch.randn(100, rotary_dim, dtype=dtype)
+        q_ref, k_ref, v_ref = native_torch(
+            q_input,
+            hidden_states,
+            q_a_proj_weight,
+            norm_weight1,
+            q_b_proj_weight,
+            w_kc.transpose(1, 2),
+            kv_a_proj_weight,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+        )
+        qa_packed = convert_weight_packed(q_a_proj_weight)
+        qb_packed = convert_weight_packed(q_b_proj_weight)
+        kva_packed = convert_weight_packed(kv_a_proj_weight)
+        wkc_packed = convert_weight_packed(w_kc)
+        fused_weight_packed = convert_weight_packed(fused_weight)
+
+        q_out, k_out, v_out = qkv_proj_with_rope(
+            hidden_states,
+            qa_packed,
+            qb_packed,
+            kva_packed,
+            wkc_packed,
+            norm_weight1,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+            eps,
+            False,
+            False,
+            None,
+            None,
+            None,
+            True,
+            None,
+        )
+        fused_q_out, fused_k_out, fused_v_out = qkv_proj_with_rope_fused_weight(
+            hidden_states,
+            fused_weight_packed,
+            qb_packed,
+            wkc_packed,
+            norm_weight1,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+            eps,
+            False,
+            False,
+            None,
+            None,
+            True,
+            None,
+            q_lora_rank,
+            kv_lora_rank,
+            qk_rope_head_dim,
+        )
+        atol = rtol = precision[q_ref.dtype]
+        torch.testing.assert_close(q_ref, q_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(k_ref, k_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(v_ref, v_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(fused_q_out, q_out)
+        torch.testing.assert_close(fused_k_out, k_out)
+        torch.testing.assert_close(fused_v_out, v_out)
+
+    def test_int8_qkv_proj_with_rope(self):
+        dtype = torch.bfloat16
+        hidden_states = torch.randn(B, hidden_size, dtype=dtype) / hidden_size
+        q_input = torch.empty(
+            B, num_heads, kv_lora_rank + qk_rope_head_dim, dtype=dtype
+        )
+        q_a_proj_weight = torch.randn(q_lora_rank, hidden_size, dtype=dtype) * 0.1
+        norm_weight1 = torch.randn(q_lora_rank, dtype=dtype)
+        q_b_proj_weight = (
+            torch.randn(num_heads * qk_head_dim, q_lora_rank, dtype=dtype) * 0.1
+        )
+        w_kc = torch.randn(num_heads, kv_lora_rank, qk_nope_head_dim, dtype=dtype) * 0.1
+        kv_a_proj_weight = (
+            torch.randn(kv_lora_rank + qk_rope_head_dim, hidden_size, dtype=dtype) * 0.1
+        )
+        norm_weight2 = torch.randn(kv_lora_rank, dtype=dtype)
+        pos = torch.randint(10, 100, (B,))
+        cos_sin_cache = torch.randn(100, rotary_dim, dtype=dtype)
+
+        w1_q, w1_s = per_token_quant_int8(q_a_proj_weight)
+        w2_q, w2_s = per_token_quant_int8(q_b_proj_weight)
+        w3_q, w3_s = per_token_quant_int8(kv_a_proj_weight)
+        q_ref, k_ref, v_ref = native_torch_int8(
+            q_input,
+            hidden_states,
+            w1_q,
+            w1_s,
+            norm_weight1,
+            w2_q,
+            w2_s,
+            w_kc.transpose(1, 2),
+            w3_q,
+            w3_s,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+        )
+        w1_q_packed = convert_weight_packed(w1_q)
+        w2_q_packed = convert_weight_packed(w2_q)
+        w3_q_packed = convert_weight_packed(w3_q)
+        wkc_packed = convert_weight_packed(w_kc)
+        q_out, k_out, v_out = qkv_proj_with_rope(
+            hidden_states,
+            w1_q_packed,
+            w2_q_packed,
+            w3_q_packed,
+            wkc_packed,
+            norm_weight1,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+            eps,
+            True,
+            False,
+            w1_s,
+            w2_s,
+            w3_s,
+            True,
+            None,
+        )
+        fused_weight = torch.cat([w1_q, w3_q], dim=0)
+        fused_weight_s = torch.cat([w1_s, w3_s], dim=0)
+        w_fused_q_packed = convert_weight_packed(fused_weight)
+        fused_q_out, fused_k_out, fused_v_out = qkv_proj_with_rope_fused_weight(
+            hidden_states,
+            w_fused_q_packed,
+            w2_q_packed,
+            wkc_packed,
+            norm_weight1,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+            eps,
+            True,
+            False,
+            fused_weight_s,
+            w2_s,
+            True,
+            None,
+            q_lora_rank,
+            kv_lora_rank,
+            qk_rope_head_dim,
+        )
+        atol = rtol = precision[q_ref.dtype]
+        torch.testing.assert_close(q_ref, q_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(k_ref, k_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(v_ref, v_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(fused_q_out, q_out)
+        torch.testing.assert_close(fused_k_out, k_out)
+        torch.testing.assert_close(fused_v_out, v_out)
+
+    def test_fp8_qkv_proj_with_rope(self):
+        dtype = torch.bfloat16
+        hidden_states = torch.randn(B, hidden_size, dtype=dtype) / hidden_size
+        q_input = torch.empty(
+            B, num_heads, kv_lora_rank + qk_rope_head_dim, dtype=dtype
+        )
+        q_a_proj_weight = torch.randn(q_lora_rank, hidden_size, dtype=dtype) * 0.1
+        norm_weight1 = torch.randn(q_lora_rank, dtype=dtype)
+        q_b_proj_weight = (
+            torch.randn(num_heads * qk_head_dim, q_lora_rank, dtype=dtype) * 0.1
+        )
+        w_kc = torch.randn(num_heads, kv_lora_rank, qk_nope_head_dim, dtype=dtype) * 0.1
+        kv_a_proj_weight = (
+            torch.randn(kv_lora_rank + qk_rope_head_dim, hidden_size, dtype=dtype) * 0.1
+        )
+        norm_weight2 = torch.randn(kv_lora_rank, dtype=dtype)
+        pos = torch.randint(10, 100, (B,))
+        cos_sin_cache = torch.randn(100, rotary_dim, dtype=dtype)
+
+        scale_block_size_N = 128
+        scale_block_size_K = 128
+        fp8_q_a_proj_weight, q_a_proj_weight_scale_inv, q_a_proj_weight_dq = (
+            convert_weight(
+                q_a_proj_weight,
+                [scale_block_size_N, scale_block_size_K],
+                torch.bfloat16,
+            )
+        )
+        fp8_q_b_proj_weight, q_b_proj_weight_scale_inv, q_b_proj_weight_dq = (
+            convert_weight(
+                q_b_proj_weight,
+                [scale_block_size_N, scale_block_size_K],
+                torch.bfloat16,
+            )
+        )
+        (
+            fp8_kv_a_proj_with_mqa_weight,
+            kv_a_proj_with_mqa_weight_scale_inv,
+            kv_a_proj_with_mqa_weight_dq,
+        ) = convert_weight(
+            kv_a_proj_weight, [scale_block_size_N, scale_block_size_K], torch.bfloat16
+        )
+        q_ref, k_ref, v_ref = native_torch(
+            q_input,
+            hidden_states,
+            q_a_proj_weight_dq,
+            norm_weight1,
+            q_b_proj_weight_dq,
+            w_kc.transpose(1, 2),
+            kv_a_proj_with_mqa_weight_dq,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+        )
+        fp8_q_a_proj_weight_packed = convert_weight_packed(fp8_q_a_proj_weight)
+        fp8_q_b_proj_weight_packed = convert_weight_packed(fp8_q_b_proj_weight)
+        fp8_kv_a_proj_with_mqa_weight_packed = convert_weight_packed(
+            fp8_kv_a_proj_with_mqa_weight
+        )
+        w_kc = convert_weight_packed(w_kc)
+        q_out, k_out, v_out = qkv_proj_with_rope(
+            hidden_states,
+            fp8_q_a_proj_weight_packed,
+            fp8_q_b_proj_weight_packed,
+            fp8_kv_a_proj_with_mqa_weight_packed,
+            w_kc,
+            norm_weight1,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+            eps,
+            False,
+            True,
+            q_a_proj_weight_scale_inv.float(),
+            q_b_proj_weight_scale_inv.float(),
+            kv_a_proj_with_mqa_weight_scale_inv.float(),
+            True,
+            [scale_block_size_N, scale_block_size_K],
+        )
+
+        fused_weight = torch.cat(
+            [fp8_q_a_proj_weight, fp8_kv_a_proj_with_mqa_weight], dim=0
+        )
+        fused_weight_s = torch.cat(
+            [q_a_proj_weight_scale_inv, kv_a_proj_with_mqa_weight_scale_inv], dim=0
+        )
+        fused_weight_packed = convert_weight_packed(fused_weight)
+        fused_q_out, fused_k_out, fused_v_out = qkv_proj_with_rope_fused_weight(
+            hidden_states,
+            fused_weight_packed,
+            fp8_q_b_proj_weight_packed,
+            w_kc,
+            norm_weight1,
+            norm_weight2,
+            pos,
+            cos_sin_cache,
+            eps,
+            False,
+            True,
+            fused_weight_s.float(),
+            q_b_proj_weight_scale_inv.float(),
+            True,
+            [scale_block_size_N, scale_block_size_K],
+            q_lora_rank,
+            kv_lora_rank,
+            qk_rope_head_dim,
+        )
+        atol = rtol = precision[q_ref.dtype]
+        # Due to the change in multiplication order, the error is amplified.
+        # In the model, with fewer layers, this doesn't cause issues, but in
+        # tests with more layers, we need to enlarge the tolerance to pass the tests.
+        torch.testing.assert_close(q_ref, q_out, atol=1e-1, rtol=1e-1)
+        torch.testing.assert_close(k_ref, k_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(v_ref, v_out, atol=atol, rtol=rtol)
+        torch.testing.assert_close(fused_q_out, q_out)
+        torch.testing.assert_close(fused_k_out, k_out)
+        torch.testing.assert_close(fused_v_out, v_out)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_rope.py b/test/srt/cpu/test_rope.py
new file mode 100644
index 000000000..38481e5e3
--- /dev/null
+++ b/test/srt/cpu/test_rope.py
@@ -0,0 +1,178 @@
+import unittest
+
+import sgl_kernel
+import torch
+from utils import precision
+
+from sglang.srt.layers.rotary_embedding import (
+    DeepseekScalingRotaryEmbedding,
+    RotaryEmbedding,
+)
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class TestROPE(CustomTestCase):
+    def test_deepseek_v2_rope(self):
+        num_head = 16
+        seq_len = 1024
+        q_head_dim = 192
+        qk_nope_head_dim = 128
+        qk_rope_head_dim = 64
+        max_pos = 256
+        k_dim = 576
+        rotary_dim = 64
+        is_neox_style = False
+
+        # Create cos_sin_cache
+        freqs = torch.rand(max_pos, qk_rope_head_dim // 2)
+        cos = freqs.cos() * 0.7
+        sin = freqs.sin() * 0.7
+        cos_sin_cache = torch.cat((cos, sin), dim=-1).to(torch.bfloat16)
+        positions = torch.randint(0, max_pos, (seq_len,))
+
+        rope = DeepseekScalingRotaryEmbedding(
+            qk_rope_head_dim,
+            rotary_dim,
+            max_pos,
+            16,  # not used since cos_sin_cache is provided
+            is_neox_style,
+            1.0,
+            torch.bfloat16,
+            device="cpu",
+        )
+        rope.register_buffer("cos_sin_cache", cos_sin_cache)
+
+        for dtype in [torch.bfloat16]:
+            enable_autocast = True
+
+            with torch.no_grad(), torch.amp.autocast("cpu", enabled=enable_autocast):
+                q = torch.randn(seq_len, num_head, q_head_dim, dtype=dtype)
+                q_clone = q.clone()
+                k = torch.randn(seq_len, 1, k_dim, dtype=dtype)
+                k_clone = k.clone()
+                _, q_pe = q.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1)
+                _, q_pe_clone = q_clone.split(
+                    [qk_nope_head_dim, qk_rope_head_dim], dim=-1
+                )
+                k_pe = k[:, :, k_dim - qk_rope_head_dim :]
+                k_pe_clone = k_clone[:, :, k_dim - qk_rope_head_dim :]
+
+                # ref kernel
+                q_pe, k_pe = rope.forward_native(
+                    query=q_pe,
+                    key=k_pe,
+                    positions=positions,
+                )
+
+                # fused rope kernel
+                q_pe_clone, k_pe_clone = torch.ops.sgl_kernel.rotary_embedding_cpu(
+                    positions,
+                    q_pe_clone,
+                    k_pe_clone,
+                    rope.head_size,
+                    cos_sin_cache,
+                    False,
+                )
+
+                atol = rtol = precision[q_pe.dtype]
+                torch.testing.assert_close(q_pe, q_pe_clone, atol=atol, rtol=rtol)
+                torch.testing.assert_close(k_pe, k_pe_clone, atol=atol, rtol=rtol)
+                torch.testing.assert_close(k_pe, k_pe_clone)
+
+    def test_origin_rope(self):
+        def single_test(
+            head_size: int,
+            rotary_dim: int,
+            max_position_embeddings: int,
+            base: int,
+            is_neox_style: bool,
+            dtype: torch.dtype,
+            device: str,
+            batch_size: int,
+            seq_len: int,
+            num_q_heads: int,
+            num_kv_heads: int,
+        ):
+            torch.manual_seed(100)
+            rope_ref = RotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position_embeddings,
+                base,
+                is_neox_style,
+                dtype,
+            ).to(device)
+            pos_ids = torch.arange(seq_len, device=device).repeat(batch_size)
+            query = torch.randn(
+                batch_size * seq_len,
+                num_q_heads * head_size,
+                dtype=dtype,
+                device=device,
+            )
+            key = torch.randn(
+                batch_size * seq_len,
+                num_kv_heads * head_size,
+                dtype=dtype,
+                device=device,
+            )
+
+            query_ref, key_ref = query.clone(), key.clone()
+            query_cpu, key_cpu = query.clone(), key.clone()
+
+            query_ref_out, key_ref_out = rope_ref.forward_native(
+                pos_ids, query_ref, key_ref
+            )
+            query_cpu_out, key_cpu_out = torch.ops.sgl_kernel.rotary_embedding_cpu(
+                pos_ids,
+                query_cpu,
+                key_cpu,
+                rope_ref.head_size,
+                rope_ref.cos_sin_cache.to(query.dtype),
+                rope_ref.is_neox_style,
+            )
+            torch.testing.assert_close(
+                query_ref_out, query_cpu_out, atol=1e-2, rtol=1e-2
+            )
+            torch.testing.assert_close(key_ref_out, key_cpu_out, atol=1e-2, rtol=1e-2)
+
+        test_config = [
+            (64, 64, 32, 8000, True, torch.bfloat16, "cpu", 32, 32, 1, 1),
+            (256, 128, 4096, 10000, True, torch.bfloat16, "cpu", 2, 512, 32, 8),
+            (512, 128, 311, 10000, True, torch.bfloat16, "cpu", 3, 39, 4, 2),
+            (128, 128, 2048, 10000, False, torch.bfloat16, "cpu", 2, 512, 32, 8),
+            (128, 128, 2048, 10000, False, torch.bfloat16, "cpu", 2, 512, 16, 4),
+            (512, 128, 311, 10000, False, torch.bfloat16, "cpu", 3, 39, 4, 2),
+        ]
+
+        for (
+            head_size,
+            rotary_dim,
+            max_position_embeddings,
+            base,
+            is_neox_style,
+            dtype,
+            device,
+            batch_size,
+            seq_len,
+            num_q_heads,
+            num_kv_heads,
+        ) in test_config:
+            single_test(
+                head_size,
+                rotary_dim,
+                max_position_embeddings,
+                base,
+                is_neox_style,
+                dtype,
+                device,
+                batch_size,
+                seq_len,
+                num_q_heads,
+                num_kv_heads,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_shared_expert.py b/test/srt/cpu/test_shared_expert.py
new file mode 100644
index 000000000..17818aebb
--- /dev/null
+++ b/test/srt/cpu/test_shared_expert.py
@@ -0,0 +1,223 @@
+import itertools
+import math
+import unittest
+
+# TODO: use interface in cpu.py
+import sgl_kernel
+import torch
+import torch.nn as nn
+from utils import (
+    BLOCK_K,
+    BLOCK_N,
+    SiluAndMul,
+    factor_for_scale,
+    fp8_max,
+    fp8_min,
+    per_token_quant_int8,
+    precision,
+    scaled_weight,
+    torch_naive_moe,
+    torch_w8a8_per_column_moe,
+)
+
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+class TestSharedExpert(CustomTestCase):
+    M = [2, 121]
+    N = [32, 32 * 4]
+    K = [32, 32 * 2]
+    routed_scaling_factor = [16]
+
+    M_fp8 = [2, 12]
+    N_fp8 = [512]
+    K_fp8 = [256]
+
+    def _bf16_shared_expert(self, m, n, k, routed_scaling_factor):
+        dtype = torch.bfloat16
+        prepack = True
+
+        hidden_states = torch.randn(m, k, dtype=dtype) / k
+        w1 = torch.randn(2 * n, k, dtype=dtype)
+        w2 = torch.randn(k, n, dtype=dtype)
+        fused_output = torch.randn(m, k, dtype=dtype) / k
+
+        # fused moe mutates content in hs
+        hidden_states2 = hidden_states.clone()
+
+        # bfloat16
+        ref = torch_naive_moe(
+            hidden_states.float(),
+            w1.float(),
+            w2.float(),
+            fused_output.float(),
+            routed_scaling_factor,
+        ).to(dtype=dtype)
+        res = torch.ops.sgl_kernel.shared_expert_cpu(
+            hidden_states,
+            w1,
+            w2,
+            fused_output,
+            routed_scaling_factor,
+            True,
+            False,
+            False,
+            None,
+            None,
+            None,
+            None,
+            None,
+            False,
+        )
+
+        atol = rtol = precision[ref.dtype]
+        torch.testing.assert_close(ref, res, atol=atol, rtol=rtol)
+
+    def test_bf16_shared_expert(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.routed_scaling_factor,
+        ):
+            with self.subTest(
+                m=params[0],
+                n=params[1],
+                k=params[2],
+                routed_scaling_factor=params[3],
+            ):
+                self._bf16_shared_expert(*params)
+
+    def _int8_shared_expert(self, m, n, k, routed_scaling_factor):
+        dtype = torch.bfloat16
+        prepack = True
+
+        hidden_states = torch.randn(m, k, dtype=dtype) / k
+        w1 = torch.randn(2 * n, k, dtype=dtype)
+        w2 = torch.randn(k, n, dtype=dtype)
+        fused_output = torch.randn(m, k, dtype=dtype) / k
+
+        # fused moe mutates content in hs
+        hidden_states2 = hidden_states.clone()
+
+        w1_q, w1_s = per_token_quant_int8(w1)
+        w2_q, w2_s = per_token_quant_int8(w2)
+        ref2 = torch_w8a8_per_column_moe(
+            hidden_states2.float(),
+            w1_q,
+            w2_q,
+            w1_s,
+            w2_s,
+            fused_output.float(),
+            routed_scaling_factor,
+        ).to(dtype=dtype)
+        res2 = torch.ops.sgl_kernel.shared_expert_cpu(
+            hidden_states2,
+            w1_q,
+            w2_q,
+            fused_output,
+            routed_scaling_factor,
+            True,
+            True,
+            False,
+            w1_s,
+            w2_s,
+            None,
+            None,
+            None,
+            False,
+        )
+
+        atol = rtol = precision[ref2.dtype]
+        torch.testing.assert_close(ref2, res2, atol=atol, rtol=rtol)
+
+    def test_int8_shared_expert(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.routed_scaling_factor,
+        ):
+            with self.subTest(
+                m=params[0],
+                n=params[1],
+                k=params[2],
+                routed_scaling_factor=params[3],
+            ):
+                self._int8_shared_expert(*params)
+
+    def _fp8_shared_expert(self, M, N, K, routed_scaling_factor):
+        dtype = torch.bfloat16
+        prepack = True
+
+        a = torch.randn(M, K, dtype=dtype) / math.sqrt(K)
+
+        w1_fp32 = torch.randn(1, 2 * N, K)
+        w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = torch.randn(1, K, N)
+        w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w1s = torch.randn(1, 2 * N // BLOCK_N, K // BLOCK_K) * factor_for_scale
+        w2s = torch.randn(1, K // BLOCK_N, N // BLOCK_K) * factor_for_scale
+
+        w1_scaled = scaled_weight(w1, w1s).view(2 * N, K)
+        w2_scaled = scaled_weight(w2, w2s).view(K, N)
+
+        # change back to 2D
+        w1, w2 = w1.squeeze(0), w2.squeeze(0)
+        w1s, w2s = w1s.squeeze(0), w2s.squeeze(0)
+        w1_scaled, w2_scaled = w1_scaled.squeeze(0), w2_scaled.squeeze(0)
+
+        fused_out = torch.randn(M, K, dtype=dtype) / math.sqrt(K)
+        a2 = a.clone()
+
+        # ref
+        ic0 = torch.matmul(a.float(), w1_scaled.transpose(0, 1))
+        ic1 = SiluAndMul(ic0)
+        shared_out = torch.matmul(ic1, w2_scaled.transpose(0, 1))
+        ref_out = shared_out + fused_out.float() * routed_scaling_factor
+        ref_out = ref_out.to(dtype=dtype)
+
+        w1 = torch.ops.sgl_kernel.convert_weight_packed(w1)  # [2N, K]
+        w2 = torch.ops.sgl_kernel.convert_weight_packed(w2)  # [K, N]
+        out = torch.ops.sgl_kernel.shared_expert_cpu(
+            a2,
+            w1,
+            w2,
+            fused_out,
+            routed_scaling_factor,
+            True,
+            False,
+            True,
+            w1s,
+            w2s,
+            [BLOCK_N, BLOCK_K],
+            None,
+            None,
+            True,
+        )
+
+        atol = rtol = precision[ref_out.dtype]
+        torch.testing.assert_close(ref_out, out, atol=atol, rtol=rtol)
+
+    def test_fp8_shared_expert(self):
+        for params in itertools.product(
+            self.M_fp8,
+            self.N_fp8,
+            self.K_fp8,
+            self.routed_scaling_factor,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                routed_scaling_factor=params[3],
+            ):
+                self._fp8_shared_expert(*params)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/test_topk.py b/test/srt/cpu/test_topk.py
new file mode 100644
index 000000000..4b4ce21ae
--- /dev/null
+++ b/test/srt/cpu/test_topk.py
@@ -0,0 +1,199 @@
+import itertools
+import unittest
+
+import sgl_kernel
+import torch
+from utils import precision
+
+from sglang.srt.layers.moe.topk import (
+    biased_grouped_topk_impl as native_biased_grouped_topk,
+)
+from sglang.srt.layers.moe.topk import fused_topk_torch_native as native_fused_topk
+from sglang.srt.layers.moe.topk import grouped_topk_gpu as native_grouped_topk
+from sglang.srt.models.llama4 import Llama4MoE
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(1234)
+
+
+# This is used by the Deepseek-V2 model
+class TestGroupedTopK(CustomTestCase):
+    def _run_single_test(self, M, E, G, topk, topk_group, renormalize, dtype):
+        torch.manual_seed(1234)
+
+        # expand gating_output by M, otherwise bfloat16 fall into same value aftering truncating
+        hidden_states = torch.randn(M, 100, dtype=dtype)
+        gating_output = torch.randn(M, E, dtype=dtype) * 2 * M
+
+        ref_topk_weights, ref_topk_ids = native_grouped_topk(
+            hidden_states.float(),
+            gating_output.float(),
+            topk,
+            renormalize,
+            G,
+            topk_group,
+        )
+
+        # fused version
+        topk_weights, topk_ids = torch.ops.sgl_kernel.grouped_topk_cpu(
+            hidden_states,
+            gating_output,
+            topk,
+            renormalize,
+            G,
+            topk_group,
+            0,
+            None,
+            None,
+        )
+
+        res = torch.zeros(M, E, dtype=torch.float)
+        ref = torch.zeros(M, E, dtype=torch.float)
+        res.scatter_(1, topk_ids.long(), topk_weights)
+        ref.scatter_(1, ref_topk_ids.long(), ref_topk_weights)
+        torch.testing.assert_close(res, ref)
+
+    def test_grouped_topk(self):
+        for renormalize in [True, False]:
+            self._run_single_test(123, 8, 2, 2, 1, renormalize, torch.bfloat16)
+            self._run_single_test(123, 16, 4, 3, 2, renormalize, torch.bfloat16)
+            self._run_single_test(123, 32, 4, 3, 2, renormalize, torch.bfloat16)
+            self._run_single_test(1123, 32, 4, 3, 2, renormalize, torch.bfloat16)
+            self._run_single_test(123, 64, 1, 6, 1, renormalize, torch.bfloat16)
+            self._run_single_test(123, 256, 8, 4, 8, renormalize, torch.bfloat16)
+            self._run_single_test(123, 160, 8, 6, 2, renormalize, torch.bfloat16)
+
+
+# DeepSeek V2/V3/R1 uses biased_grouped_top
+class TestBiasedGroupedTopK(CustomTestCase):
+    def _run_single_test(
+        self, M, E, G, topk, topk_group, renormalize, dtype, bias_dtype
+    ):
+        torch.manual_seed(1234)
+
+        # expand gating_output by M, otherwise bfloat16 fall into same value aftering truncating
+        hidden_states = torch.randn(M, 100, dtype=dtype)
+        gating_output = torch.randn(M, E, dtype=dtype) * 2 * M
+        correction_bias = torch.randn(E, dtype=bias_dtype)
+
+        ref_topk_weights, ref_topk_ids = native_biased_grouped_topk(
+            hidden_states.float(),
+            gating_output.float(),
+            correction_bias.float(),
+            topk,
+            renormalize,
+            G,
+            topk_group,
+        )
+
+        # fused version
+        topk_weights, topk_ids = torch.ops.sgl_kernel.biased_grouped_topk_cpu(
+            hidden_states,
+            gating_output,
+            correction_bias,
+            topk,
+            renormalize,
+            G,
+            topk_group,
+            0,
+            None,
+            None,
+        )
+
+        res = torch.zeros(M, E, dtype=torch.float)
+        ref = torch.zeros(M, E, dtype=torch.float)
+        res.scatter_(1, topk_ids.long(), topk_weights)
+        ref.scatter_(1, ref_topk_ids.long(), ref_topk_weights)
+        torch.testing.assert_close(res, ref)
+
+    def test_biased_grouped_topk(self):
+        for renormalize in [True, False]:
+            for bias_dtype in [torch.float32, torch.bfloat16]:
+                self._run_single_test(
+                    122, 256, 8, 8, 2, renormalize, torch.bfloat16, bias_dtype
+                )
+
+
+class TestTopK(CustomTestCase):
+    def _run_single_test(self, M, E, topk, renormalize, dtype):
+        torch.manual_seed(1998)
+
+        # expand gating_output by M, otherwise bfloat16 fall into same value aftering truncating
+        hidden_states = torch.randn(M, 100, dtype=dtype)
+        gating_output = torch.randn(M, E, dtype=dtype) * 2 * M
+
+        ref_topk_weights, ref_topk_ids = native_fused_topk(
+            hidden_states.float(),
+            gating_output.float(),
+            topk,
+            renormalize,
+        )
+
+        # fused version
+        topk_weights, topk_ids = torch.ops.sgl_kernel.topk_softmax_cpu(
+            hidden_states, gating_output, topk, renormalize
+        )
+
+        res = torch.zeros(M, E, dtype=torch.float)
+        ref = torch.zeros(M, E, dtype=torch.float)
+        res.scatter_(1, topk_ids.long(), topk_weights)
+        ref.scatter_(1, ref_topk_ids.long(), ref_topk_weights)
+        torch.testing.assert_close(res, ref)
+
+    def test_topk(self):
+        for renormalize in [True, False]:
+            self._run_single_test(123, 8, 2, renormalize, torch.bfloat16)
+            self._run_single_test(123, 16, 3, renormalize, torch.bfloat16)
+            self._run_single_test(123, 32, 3, renormalize, torch.bfloat16)
+            self._run_single_test(123, 32, 3, renormalize, torch.bfloat16)
+            self._run_single_test(123, 64, 6, renormalize, torch.bfloat16)
+            self._run_single_test(123, 256, 4, renormalize, torch.bfloat16)
+            self._run_single_test(123, 160, 6, renormalize, torch.bfloat16)
+
+
+class TestCustomTopK(CustomTestCase):
+    def _run_single_test(
+        self, M, E, topk, renormalize, dtype, native_custom_f, fused_custom_f
+    ):
+        torch.manual_seed(16)
+
+        # expand gating_output by M, otherwise bfloat16 fall into same value aftering truncating
+        hidden_states = torch.randn(M, 100, dtype=dtype)
+        gating_output = torch.randn(M, E, dtype=dtype) * 2 * M
+
+        ref_topk_weights, ref_topk_ids = native_custom_f(
+            hidden_states.float(),
+            gating_output.float(),
+            topk,
+            renormalize,
+        )
+
+        # fused version
+        topk_weights, topk_ids = fused_custom_f(
+            hidden_states, gating_output, topk, renormalize
+        )
+
+        res = torch.zeros(M, E, dtype=torch.float)
+        ref = torch.zeros(M, E, dtype=torch.float)
+        res.scatter_(1, topk_ids.long(), topk_weights)
+        ref.scatter_(1, ref_topk_ids.long(), ref_topk_weights)
+        torch.testing.assert_close(res, ref)
+
+    def test_custom_topk(self):
+        test_custom_functions = [
+            (Llama4MoE.custom_routing_function, torch.ops.sgl_kernel.topk_sigmoid_cpu)
+        ]
+        for native_custom_f, fused_custom_f in test_custom_functions:
+            self._run_single_test(
+                123, 8, 1, False, torch.bfloat16, native_custom_f, fused_custom_f
+            )
+            self._run_single_test(
+                123, 16, 1, False, torch.bfloat16, native_custom_f, fused_custom_f
+            )
+            self._run_single_test(
+                123, 32, 1, False, torch.bfloat16, native_custom_f, fused_custom_f
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/cpu/utils.py b/test/srt/cpu/utils.py
new file mode 100644
index 000000000..6435dad74
--- /dev/null
+++ b/test/srt/cpu/utils.py
@@ -0,0 +1,274 @@
+import math
+
+import torch
+import torch.nn.functional as F
+
+precision = {
+    torch.bfloat16: 1e-2,
+    torch.float16: 1e-3,
+    torch.float32: 1e-5,
+}
+
+
+BLOCK_N, BLOCK_K = 64, 128
+factor_for_scale = 1e-3
+fp8_max, fp8_min = 400, -400
+
+
+def SiluAndMul(x: torch.Tensor) -> torch.Tensor:
+    d = x.shape[-1] // 2
+    return F.silu(x[..., :d]) * x[..., d:]
+
+
+def GeluAndMul(x: torch.Tensor, approximate="tanh") -> torch.Tensor:
+    d = x.shape[-1] // 2
+    return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]
+
+
+def per_token_quant_int8(x):
+    x = x.float()
+    absmax = x.abs().max(dim=-1).values
+    absmax = absmax.clamp_min(1e-10).unsqueeze(-1)
+    scale_x = absmax / 127
+    x_q = x.mul(127 / absmax)
+    x_q = torch.round(x_q).to(torch.int8)
+
+    return x_q, scale_x
+
+
+def convert_weight(weight, scale_block_size, A_dtype):
+    N, K = weight.size()
+    fp8_max = 448.0
+    scale_block_size_N, scale_block_size_K = scale_block_size  # (128, 128)
+
+    pad_N = (scale_block_size_N - (N % scale_block_size_N)) % scale_block_size_N
+    pad_K = (scale_block_size_K - (K % scale_block_size_K)) % scale_block_size_K
+
+    if pad_N > 0 or pad_K > 0:
+        weight = torch.nn.functional.pad(weight, (0, pad_K, 0, pad_N))
+
+    weight_blocks = weight.view(
+        math.ceil(N / scale_block_size_N),
+        scale_block_size_N,
+        math.ceil(K / scale_block_size_K),
+        scale_block_size_K,
+    )  # (8, 128, 8, 128)
+    weight_blocks = weight_blocks.permute(0, 2, 1, 3).contiguous()  # (8, 8, 128, 128)
+
+    # Step 2: compute per-block max abs values → scale
+    abs_max = weight_blocks.abs().amax(dim=(-2, -1), keepdim=True)  # (8, 8, 1, 1)
+    scales = abs_max / fp8_max
+    scales = torch.where(
+        scales == 0, torch.ones_like(scales), scales
+    )  # avoid division by zero
+
+    q_fp8 = (weight_blocks / scales).to(torch.float8_e4m3fn)
+    q_fp8_reshape = q_fp8.permute(0, 2, 1, 3).contiguous()
+
+    if pad_N > 0 or pad_K > 0:
+        q_fp8_reshape = q_fp8_reshape.view(N + pad_N, K + pad_K)
+        q_fp8_reshape = q_fp8_reshape[:N, :K].contiguous()
+    else:
+        q_fp8_reshape = q_fp8_reshape.view(N, K)
+
+    dq_weight = q_fp8.float() * scales
+    dq_weight = dq_weight.permute(0, 2, 1, 3).contiguous()  # (8, 128, 8, 128)
+
+    if pad_N > 0 or pad_K > 0:
+        w_dq = dq_weight.view(N + pad_N, K + pad_K).to(A_dtype)
+        w_dq = w_dq[:N, :K].contiguous()
+    else:
+        w_dq = dq_weight.view(N, K).to(A_dtype)
+
+    scales = scales.view(
+        math.ceil(N / scale_block_size_N), math.ceil(K / scale_block_size_K)
+    )
+
+    return q_fp8_reshape, scales, w_dq
+
+
+def native_w8a8_per_token_matmul(A, B, As, Bs, bias, output_dtype=torch.bfloat16):
+    """Matrix multiplication function that supports per-token input quantization and per-column weight quantization"""
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+
+    assert A.shape[-1] == B.shape[-1], "Dimension mismatch"
+    assert B.ndim == 2 and B.is_contiguous(), "B must be a 2D contiguous tensor"
+
+    # Reshape input
+    M = A.numel() // A.shape[-1]
+    B = B.t()  # Transpose weight matrix
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (K,)
+    A = A.reshape(M, N)
+
+    # As is per-token [M, 1], Bs is per-column [1, K]
+    C = torch.matmul(A, B)  # [M, K]
+    C = As * C * Bs.view(1, -1)  # Broadcast per-column scale
+
+    if bias is not None:
+        C.add_(bias.view(1, -1))
+
+    return C.reshape(origin_C_shape).to(output_dtype)
+
+
+def torch_naive_moe(a, w1, w2, b, routed_scaling_factor):
+
+    ic1 = torch.matmul(a, w1.transpose(0, 1))
+    ic2 = SiluAndMul(ic1)
+    ic3 = torch.matmul(ic2, w2.transpose(0, 1))
+
+    return ic3 + b * routed_scaling_factor
+
+
+def torch_w8a8_per_column_moe(a, w1_q, w2_q, w1_s, w2_s, b, routed_scaling_factor):
+
+    # Perform per-token quantization
+    a_q, a_s = per_token_quant_int8(a)
+
+    ic1 = native_w8a8_per_token_matmul(
+        a_q, w1_q, a_s, w1_s, bias=None, output_dtype=torch.float32
+    )
+    ic2 = SiluAndMul(ic1)
+
+    a1_q, a1_s = per_token_quant_int8(ic2)
+    ic3 = native_w8a8_per_token_matmul(
+        a1_q, w2_q, a1_s, w2_s, bias=None, output_dtype=torch.float32
+    )
+
+    return ic3 + b * routed_scaling_factor
+
+
+def scaled_weight(weight, scales):
+    E, N, K = weight.shape
+    pad_N = (BLOCK_N - (N % BLOCK_N)) % BLOCK_N
+    pad_K = (BLOCK_K - (K % BLOCK_K)) % BLOCK_K
+
+    if pad_N > 0 or pad_K > 0:
+        weight = torch.nn.functional.pad(weight, (0, pad_K, 0, pad_N))
+
+    weight_block = (
+        weight.view(E, math.ceil(N / BLOCK_N), BLOCK_N, math.ceil(K / BLOCK_K), BLOCK_K)
+        .permute(0, 1, 3, 2, 4)
+        .float()
+        .contiguous()
+    )
+
+    weight_scaled = (
+        (
+            weight_block
+            * scales.view(E, math.ceil(N / BLOCK_N), math.ceil(K / BLOCK_K), 1, 1)
+        )
+        .permute(0, 1, 3, 2, 4)
+        .contiguous()
+    )
+    if pad_N > 0 or pad_K > 0:
+        weight_scaled = weight_scaled.view(E, N + pad_N, K + pad_K)
+        weight_scaled = weight_scaled[..., :N, :K].contiguous()
+    else:
+        weight_scaled = weight_scaled.view(E, N, K)
+    return weight_scaled
+
+
+def torch_naive_fused_moe(a, w1, w2, score, topk, renormalize):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+
+    if renormalize:
+        topk_weight = topk_weight / topk_weight.sum(dim=-1, keepdim=True)
+
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = SiluAndMul(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(
+                0, 1
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+def torch_w8a8_per_column_fused_moe(a, w1, w2, w1_s, w2_s, topk_weight, topk_ids, topk):
+    """This function performs fused moe with per-column int8 quantization using native torch."""
+
+    B, D = a.shape
+    # Perform per-token quantization
+    a_q, a_s = per_token_quant_int8(a)
+    # Repeat tokens to match topk
+    a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    # Also repeat the scale
+    a_s = a_s.view(B, -1, 1).repeat(1, topk, 1).reshape(-1, 1)  # [B*topk, 1]
+
+    out = torch.zeros(B * topk, w2.shape[1], dtype=torch.float32, device=a.device)
+
+    # Calculate routing
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    # Process each expert
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            # First MLP layer: note that a_s is now per-token
+            inter_out = native_w8a8_per_token_matmul(
+                a_q[mask],
+                w1[i],
+                a_s[mask],
+                w1_s[i],
+                bias=None,
+                output_dtype=torch.float32,
+            )
+            # Activation function
+            act_out = SiluAndMul(inter_out)
+            # Quantize activation output with per-token
+            act_out_q, act_out_s = per_token_quant_int8(act_out)
+            # Second MLP layer
+            out[mask] = native_w8a8_per_token_matmul(
+                act_out_q,
+                w2[i],
+                act_out_s,
+                w2_s[i],
+                bias=None,
+                output_dtype=torch.float32,
+            )
+    # Apply routing weights and sum
+    return (
+        (out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype))
+        .sum(dim=1)
+        .to(a.dtype)
+    )
+
+
+def native_fp8_fused_moe(a, w1, w2, topk_weight, topk_ids, topk):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D).float()
+    out = torch.zeros(B * topk, w2.shape[1], dtype=torch.float32, device=a.device)
+
+    # Calculate routing
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            ic0 = torch.matmul(a[mask], w1[i].transpose(0, 1))
+            ic1 = SiluAndMul(ic0)
+            out[mask] = torch.matmul(ic1, w2[i].transpose(0, 1))
+
+    return (
+        (out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype))
+        .sum(dim=1)
+        .to(a.dtype)
+    )
+
+
+def make_non_contiguous(x: torch.Tensor) -> torch.Tensor:
+    """
+    Make a tensor non-contiguous by slicing it via last dimension.
+    """
+    last_dim = x.shape[-1]
+    return x[..., : last_dim // 2] if x.is_contiguous() else x
diff --git a/test/srt/double-sparsity-config-Llama-3.1-8B-Instruct.json b/test/srt/double-sparsity-config-Llama-3.1-8B-Instruct.json
new file mode 100644
index 000000000..f1652ec96
--- /dev/null
+++ b/test/srt/double-sparsity-config-Llama-3.1-8B-Instruct.json
@@ -0,0 +1 @@
+{"model.layers.0.self_attn.q_proj": [[39, 106, 104, 102, 33, 95, 13, 44, 11, 29, 12, 10, 53, 126, 27, 114, 121, 8, 124, 113, 112, 15, 23, 89, 69, 111, 54, 80, 4, 1, 20, 24, 83, 63, 115, 122, 66, 42, 22, 110, 3, 73, 21, 61, 97, 19, 25, 88, 117, 119, 116, 85, 70, 5, 56, 118, 68, 123, 2, 86, 71, 127, 93, 49, 109, 50, 67, 52, 91, 40, 17, 108, 60, 55, 78, 62, 65, 47, 6, 87, 51, 84, 58, 82, 94, 79, 57, 103, 76, 48, 0, 18, 81, 96, 9, 31, 16, 92, 26, 43, 30, 37, 74, 100, 46, 38, 32, 14, 90, 36, 101, 75, 35, 125, 45, 7, 72, 41, 77, 105, 28, 59, 34, 99, 98, 107, 120, 64], [39, 104, 102, 106, 33, 44, 95, 53, 126, 111, 114, 110, 29, 14, 113, 54, 121, 112, 109, 80, 119, 12, 63, 25, 78, 123, 42, 10, 24, 127, 108, 115, 122, 60, 116, 97, 9, 93, 61, 17, 117, 56, 82, 91, 21, 124, 62, 0, 43, 19, 31, 26, 49, 52, 57, 59, 1, 50, 41, 15, 46, 86, 88, 81, 118, 125, 2, 3, 55, 105, 84, 75, 120, 70, 47, 85, 32, 51, 37, 16, 22, 23, 89, 87, 67, 30, 68, 69, 77, 48, 66, 27, 36, 107, 20, 76, 94, 79, 96, 83, 99, 38, 100, 18, 8, 35, 28, 101, 13, 72, 92, 7, 4, 98, 90, 103, 6, 34, 74, 11, 65, 40, 71, 45, 58, 64, 73, 5], [106, 104, 39, 102, 2, 44, 95, 53, 121, 3, 4, 33, 1, 0, 122, 111, 126, 115, 63, 71, 119, 112, 113, 70, 109, 29, 61, 52, 114, 54, 127, 21, 10, 72, 5, 110, 116, 49, 123, 11, 12, 55, 24, 45, 15, 19, 117, 73, 80, 50, 6, 23, 64, 124, 75, 69, 25, 83, 14, 77, 97, 42, 78, 13, 79, 85, 27, 56, 87, 86, 57, 17, 118, 91, 81, 74, 65, 76, 9, 41, 88, 18, 51, 22, 62, 93, 89, 67, 105, 84, 82, 38, 48, 120, 60, 47, 26, 20, 43, 68, 28, 66, 108, 59, 58, 8, 125, 98, 46, 31, 90, 36, 94, 92, 34, 107, 35, 7, 99, 100, 30, 32, 37, 101, 16, 96, 40, 103], [39, 104, 106, 102, 33, 44, 53, 111, 113, 95, 54, 109, 41, 116, 124, 63, 121, 122, 61, 56, 114, 110, 60, 108, 127, 112, 125, 43, 52, 126, 12, 80, 119, 59, 62, 82, 123, 97, 117, 25, 37, 0, 88, 29, 57, 120, 26, 91, 115, 78, 24, 21, 118, 49, 93, 1, 42, 14, 58, 107, 10, 84, 67, 7, 9, 86, 87, 3, 50, 51, 64, 98, 18, 20, 72, 66, 100, 17, 32, 81, 69, 79, 2, 101, 105, 85, 15, 77, 31, 22, 75, 70, 68, 90, 55, 19, 89, 11, 74, 76, 8, 38, 30, 4, 65, 5, 45, 92, 23, 46, 73, 13, 94, 47, 35, 36, 6, 48, 34, 28, 99, 27, 96, 103, 71, 83, 16, 40], [41, 103, 38, 33, 42, 93, 109, 95, 88, 44, 107, 17, 25, 20, 110, 21, 47, 40, 78, 116, 115, 126, 59, 61, 64, 57, 48, 83, 119, 12, 45, 125, 11, 18, 124, 118, 53, 56, 89, 70, 22, 77, 86, 50, 92, 4, 87, 24, 97, 65, 49, 73, 26, 62, 3, 16, 108, 66, 72, 104, 67, 80, 58, 71, 79, 23, 91, 122, 19, 43, 113, 39, 14, 102, 111, 27, 9, 121, 10, 117, 29, 60, 69, 90, 76, 120, 0, 84, 15, 114, 51, 74, 68, 98, 6, 81, 85, 2, 1, 82, 100, 112, 99, 13, 52, 123, 94, 8, 35, 28, 55, 96, 63, 54, 7, 105, 5, 36, 127, 37, 32, 75, 34, 106, 31, 30, 101, 46], [38, 103, 44, 41, 107, 33, 42, 88, 95, 40, 93, 87, 25, 78, 110, 47, 48, 26, 20, 18, 21, 17, 64, 59, 45, 22, 109, 89, 118, 62, 91, 115, 116, 53, 49, 61, 126, 90, 16, 11, 119, 92, 111, 65, 125, 124, 43, 85, 83, 60, 24, 4, 77, 73, 104, 117, 57, 120, 86, 3, 76, 97, 29, 23, 102, 39, 51, 67, 70, 12, 122, 71, 9, 15, 30, 113, 56, 66, 0, 106, 7, 50, 108, 100, 114, 28, 46, 121, 80, 94, 72, 10, 6, 79, 69, 55, 19, 58, 74, 68, 123, 81, 27, 13, 2, 82, 98, 35, 8, 34, 36, 1, 101, 5, 105, 54, 63, 96, 37, 84, 127, 52, 32, 31, 14, 75, 112, 99], [103, 41, 107, 33, 38, 93, 110, 116, 64, 42, 48, 20, 123, 61, 124, 17, 126, 115, 88, 49, 25, 53, 47, 78, 57, 122, 21, 59, 95, 18, 44, 89, 77, 97, 16, 11, 125, 66, 86, 120, 119, 109, 4, 65, 12, 118, 71, 62, 22, 24, 80, 14, 50, 74, 73, 51, 70, 3, 7, 121, 67, 83, 27, 111, 72, 5, 69, 76, 23, 45, 84, 29, 60, 9, 31, 10, 81, 26, 8, 87, 19, 13, 0, 40, 2, 56, 113, 85, 82, 127, 91, 43, 34, 108, 117, 114, 6, 15, 96, 106, 79, 75, 94, 102, 100, 101, 68, 98, 63, 58, 99, 105, 55, 30, 90, 36, 1, 35, 32, 46, 112, 28, 104, 39, 37, 92, 52, 54], [103, 41, 33, 38, 42, 93, 95, 107, 110, 25, 59, 17, 48, 116, 88, 109, 44, 57, 11, 83, 20, 125, 78, 21, 62, 86, 115, 70, 97, 77, 4, 118, 18, 47, 43, 64, 12, 24, 50, 126, 22, 124, 45, 3, 61, 72, 65, 39, 89, 84, 113, 53, 73, 29, 49, 120, 66, 80, 79, 27, 71, 122, 10, 117, 40, 9, 92, 16, 67, 81, 19, 23, 119, 123, 51, 105, 36, 8, 15, 55, 108, 111, 60, 90, 14, 26, 87, 104, 74, 56, 91, 75, 0, 28, 69, 68, 100, 2, 102, 99, 58, 52, 85, 6, 63, 34, 54, 94, 127, 96, 76, 32, 121, 30, 1, 31, 7, 114, 82, 46, 37, 13, 112, 35, 5, 101, 98, 106], [102, 41, 112, 101, 108, 103, 33, 104, 29, 27, 115, 30, 42, 38, 16, 45, 114, 20, 32, 100, 121, 89, 98, 111, 87, 62, 59, 126, 13, 63, 107, 11, 43, 88, 117, 75, 118, 25, 84, 34, 31, 50, 51, 55, 23, 95, 56, 58, 53, 78, 94, 97, 82, 92, 8, 18, 127, 113, 110, 19, 40, 21, 35, 39, 85, 119, 37, 125, 96, 54, 6, 72, 90, 91, 14, 26, 124, 105, 9, 44, 28, 17, 116, 120, 60, 48, 80, 57, 1, 123, 79, 77, 86, 24, 69, 73, 15, 122, 68, 99, 81, 74, 4, 52, 71, 67, 7, 65, 76, 66, 61, 109, 106, 46, 93, 2, 64, 12, 10, 49, 3, 47, 83, 70, 36, 22, 0, 5], [107, 104, 102, 33, 108, 45, 95, 92, 29, 42, 103, 21, 99, 43, 85, 101, 23, 50, 27, 28, 113, 98, 19, 15, 120, 83, 26, 46, 74, 97, 34, 12, 86, 123, 52, 127, 90, 79, 39, 7, 17, 41, 112, 49, 126, 77, 48, 38, 125, 89, 44, 61, 111, 18, 100, 115, 32, 25, 35, 119, 93, 62, 30, 60, 84, 96, 31, 80, 122, 87, 114, 53, 55, 47, 56, 110, 13, 57, 63, 37, 51, 54, 116, 117, 68, 124, 105, 78, 40, 81, 118, 88, 9, 91, 11, 109, 76, 24, 59, 71, 121, 82, 6, 20, 1, 72, 106, 16, 58, 73, 5, 94, 67, 10, 69, 14, 4, 22, 75, 2, 36, 70, 8, 66, 3, 65, 64, 0], [102, 103, 33, 29, 41, 42, 104, 45, 64, 1, 107, 108, 66, 27, 16, 112, 88, 115, 3, 50, 101, 20, 127, 67, 118, 75, 58, 116, 53, 72, 13, 84, 68, 19, 86, 25, 89, 70, 11, 30, 21, 77, 117, 4, 6, 14, 56, 59, 69, 82, 80, 100, 126, 95, 32, 92, 97, 39, 85, 124, 79, 90, 121, 52, 54, 26, 5, 113, 71, 7, 123, 73, 51, 8, 0, 63, 111, 38, 23, 57, 15, 119, 10, 43, 60, 78, 22, 83, 87, 9, 74, 37, 99, 98, 76, 65, 61, 55, 17, 81, 18, 31, 24, 94, 28, 2, 47, 44, 40, 110, 114, 96, 105, 46, 109, 120, 34, 48, 12, 35, 62, 93, 106, 49, 91, 122, 125, 36], [103, 45, 102, 33, 26, 42, 95, 87, 79, 123, 104, 19, 113, 112, 41, 29, 21, 76, 108, 120, 30, 71, 10, 97, 81, 27, 101, 74, 44, 32, 115, 122, 89, 17, 12, 23, 7, 92, 25, 53, 15, 100, 90, 117, 13, 39, 105, 99, 83, 51, 85, 84, 50, 4, 20, 109, 61, 88, 35, 16, 98, 54, 94, 110, 125, 55, 66, 127, 70, 47, 65, 124, 116, 46, 80, 67, 114, 49, 57, 56, 78, 38, 86, 18, 60, 82, 11, 5, 121, 119, 126, 62, 48, 58, 59, 28, 111, 93, 37, 63, 2, 73, 52, 6, 8, 24, 3, 96, 106, 43, 40, 107, 1, 34, 118, 22, 0, 68, 69, 77, 91, 64, 72, 36, 75, 31, 14, 9], [104, 103, 33, 43, 44, 111, 102, 90, 49, 89, 8, 41, 93, 37, 116, 113, 95, 122, 123, 23, 74, 42, 14, 17, 83, 68, 85, 106, 91, 92, 13, 78, 63, 127, 115, 81, 112, 87, 121, 3, 70, 48, 72, 120, 28, 124, 53, 71, 30, 19, 47, 86, 59, 51, 96, 56, 94, 114, 108, 118, 76, 79, 55, 38, 29, 110, 5, 82, 15, 21, 11, 20, 26, 66, 6, 84, 22, 50, 9, 27, 100, 65, 61, 64, 57, 25, 35, 7, 16, 101, 80, 45, 52, 36, 46, 109, 107, 24, 126, 2, 4, 12, 67, 125, 75, 10, 32, 117, 97, 1, 69, 60, 77, 88, 98, 99, 73, 105, 58, 18, 62, 54, 34, 31, 0, 119, 40, 39], [103, 104, 102, 33, 43, 44, 111, 90, 115, 106, 113, 89, 42, 74, 56, 93, 48, 95, 85, 127, 41, 72, 92, 37, 107, 71, 17, 87, 123, 112, 122, 50, 83, 22, 55, 57, 116, 14, 53, 124, 49, 114, 26, 47, 110, 23, 63, 70, 98, 100, 51, 108, 121, 59, 13, 35, 28, 81, 94, 79, 32, 120, 29, 76, 101, 105, 126, 38, 7, 82, 9, 6, 118, 24, 125, 91, 20, 88, 15, 99, 36, 109, 18, 62, 34, 46, 84, 19, 3, 61, 2, 30, 11, 27, 86, 52, 58, 75, 25, 68, 12, 31, 117, 69, 54, 96, 119, 65, 78, 80, 8, 45, 16, 77, 40, 1, 21, 73, 10, 4, 66, 64, 60, 67, 0, 39, 97, 5], [104, 44, 103, 43, 33, 49, 111, 102, 95, 41, 37, 123, 27, 91, 9, 84, 42, 89, 61, 122, 24, 76, 124, 116, 30, 85, 79, 96, 51, 48, 118, 28, 87, 92, 98, 127, 59, 23, 109, 121, 20, 15, 88, 29, 100, 106, 53, 34, 35, 101, 126, 16, 120, 54, 108, 115, 47, 93, 83, 17, 56, 18, 90, 113, 55, 38, 22, 63, 8, 81, 13, 94, 72, 117, 60, 50, 114, 36, 12, 71, 45, 57, 105, 32, 110, 119, 62, 74, 58, 97, 31, 112, 125, 52, 19, 46, 86, 5, 80, 68, 73, 3, 78, 99, 107, 6, 70, 75, 40, 14, 65, 21, 39, 7, 25, 77, 64, 2, 11, 82, 26, 10, 69, 4, 0, 66, 1, 67], [102, 33, 44, 104, 43, 103, 79, 28, 27, 87, 95, 9, 12, 124, 49, 21, 23, 99, 76, 90, 85, 35, 32, 31, 29, 80, 94, 61, 98, 114, 84, 91, 18, 15, 73, 89, 81, 106, 112, 62, 51, 41, 54, 56, 58, 59, 117, 45, 119, 88, 26, 110, 105, 52, 115, 42, 111, 24, 30, 92, 109, 38, 123, 37, 108, 113, 57, 118, 60, 125, 82, 25, 86, 75, 96, 19, 16, 14, 40, 46, 77, 20, 101, 93, 122, 126, 36, 53, 100, 55, 47, 71, 39, 11, 72, 34, 50, 63, 116, 48, 107, 70, 121, 13, 120, 17, 127, 78, 83, 3, 22, 7, 6, 5, 8, 66, 74, 97, 69, 10, 68, 67, 64, 4, 65, 0, 1, 2], [39, 41, 101, 97, 29, 110, 24, 43, 32, 49, 121, 84, 120, 63, 3, 56, 44, 8, 59, 21, 38, 98, 40, 6, 60, 127, 53, 71, 2, 25, 52, 116, 99, 13, 62, 4, 125, 72, 46, 18, 23, 7, 31, 68, 85, 15, 92, 42, 79, 76, 115, 123, 87, 75, 109, 57, 108, 95, 102, 12, 54, 0, 36, 66, 61, 19, 50, 124, 94, 65, 111, 82, 69, 126, 77, 80, 83, 106, 113, 51, 33, 100, 114, 91, 14, 20, 78, 10, 47, 27, 117, 86, 48, 118, 67, 89, 11, 112, 58, 16, 35, 88, 30, 45, 28, 5, 17, 70, 93, 73, 90, 34, 105, 22, 26, 55, 64, 119, 107, 9, 122, 104, 103, 96, 81, 1, 74, 37], [39, 41, 97, 32, 101, 43, 29, 26, 116, 31, 54, 80, 99, 55, 125, 18, 21, 59, 24, 78, 73, 111, 23, 76, 71, 16, 87, 82, 119, 3, 25, 28, 11, 121, 112, 12, 123, 63, 98, 120, 85, 83, 47, 91, 94, 56, 13, 103, 7, 33, 17, 88, 51, 127, 4, 95, 104, 19, 38, 108, 60, 86, 100, 79, 124, 77, 30, 27, 109, 81, 52, 50, 84, 61, 67, 37, 93, 49, 92, 5, 10, 118, 46, 58, 69, 74, 110, 14, 53, 45, 122, 126, 57, 113, 114, 15, 42, 107, 68, 22, 72, 9, 62, 90, 89, 70, 115, 48, 102, 8, 75, 36, 40, 6, 117, 105, 2, 0, 96, 20, 106, 44, 35, 65, 66, 1, 34, 64], [6, 39, 41, 70, 101, 72, 8, 43, 74, 60, 108, 53, 92, 110, 98, 113, 50, 32, 44, 47, 61, 87, 62, 95, 28, 111, 26, 38, 58, 57, 67, 124, 18, 52, 56, 51, 99, 97, 112, 123, 118, 125, 116, 59, 109, 33, 40, 89, 120, 106, 19, 121, 30, 46, 14, 31, 68, 126, 88, 11, 4, 104, 24, 45, 2, 63, 127, 10, 119, 105, 80, 90, 107, 83, 34, 114, 36, 78, 3, 27, 69, 42, 55, 9, 20, 73, 122, 5, 15, 86, 25, 49, 48, 29, 21, 94, 66, 100, 22, 0, 54, 65, 75, 84, 17, 96, 93, 91, 35, 77, 12, 71, 115, 102, 103, 79, 7, 23, 81, 117, 37, 85, 16, 13, 64, 76, 1, 82], [41, 39, 70, 101, 10, 32, 19, 43, 87, 97, 11, 83, 46, 14, 60, 4, 123, 15, 125, 38, 44, 108, 56, 86, 16, 9, 118, 53, 18, 5, 109, 2, 92, 76, 78, 62, 61, 23, 120, 127, 24, 68, 49, 59, 42, 66, 84, 75, 21, 110, 8, 27, 106, 72, 63, 111, 47, 52, 121, 50, 73, 126, 28, 40, 17, 99, 67, 107, 79, 122, 89, 82, 33, 124, 105, 77, 58, 26, 1, 85, 25, 114, 119, 51, 22, 57, 98, 48, 31, 71, 94, 103, 115, 116, 13, 7, 102, 74, 113, 3, 20, 96, 6, 0, 81, 90, 69, 45, 104, 29, 34, 100, 12, 36, 91, 88, 80, 64, 54, 95, 30, 55, 35, 65, 93, 37, 112, 117], [40, 102, 31, 107, 22, 39, 78, 98, 9, 6, 73, 88, 12, 81, 17, 67, 76, 106, 25, 70, 48, 85, 86, 110, 91, 92, 114, 74, 82, 2, 58, 15, 83, 59, 14, 27, 108, 117, 127, 66, 79, 111, 47, 30, 51, 50, 5, 21, 57, 52, 109, 36, 89, 23, 35, 49, 101, 63, 72, 90, 122, 54, 100, 26, 84, 56, 124, 33, 18, 55, 62, 126, 29, 123, 11, 95, 119, 20, 96, 28, 77, 19, 24, 4, 87, 32, 115, 37, 99, 41, 53, 97, 125, 112, 8, 104, 71, 43, 93, 3, 116, 65, 16, 10, 64, 75, 94, 113, 120, 105, 68, 60, 46, 103, 7, 61, 69, 34, 13, 44, 0, 118, 80, 45, 121, 1, 42, 38], [40, 73, 70, 14, 39, 17, 110, 102, 114, 85, 82, 91, 51, 106, 99, 49, 48, 41, 98, 83, 36, 75, 115, 46, 88, 119, 92, 30, 69, 57, 28, 32, 15, 109, 126, 63, 86, 31, 123, 84, 6, 37, 72, 107, 90, 76, 77, 53, 71, 60, 22, 23, 124, 55, 93, 100, 21, 33, 58, 111, 81, 8, 24, 18, 125, 19, 44, 112, 79, 113, 59, 2, 74, 65, 62, 97, 35, 0, 61, 45, 78, 89, 101, 5, 4, 66, 67, 50, 42, 122, 11, 117, 26, 16, 108, 56, 3, 10, 9, 121, 25, 116, 94, 1, 105, 118, 12, 47, 120, 20, 96, 43, 29, 87, 64, 52, 13, 104, 80, 127, 7, 38, 54, 95, 27, 34, 68, 103], [40, 99, 41, 107, 9, 66, 96, 37, 91, 36, 73, 24, 58, 111, 94, 31, 47, 86, 114, 50, 92, 89, 10, 52, 29, 59, 34, 6, 39, 18, 127, 44, 100, 22, 76, 21, 56, 122, 121, 57, 102, 79, 70, 12, 46, 84, 81, 85, 63, 116, 72, 88, 43, 110, 5, 83, 87, 45, 69, 27, 3, 2, 8, 78, 51, 115, 106, 67, 74, 7, 54, 118, 19, 42, 104, 124, 11, 97, 4, 75, 105, 35, 33, 62, 55, 28, 30, 0, 82, 77, 95, 93, 120, 98, 20, 25, 117, 32, 53, 17, 90, 26, 71, 108, 64, 14, 23, 101, 48, 49, 65, 15, 119, 125, 103, 80, 68, 112, 1, 126, 13, 16, 113, 123, 61, 109, 38, 60], [40, 41, 102, 37, 92, 32, 90, 88, 96, 107, 23, 31, 74, 39, 84, 97, 25, 75, 82, 35, 79, 10, 91, 15, 7, 4, 18, 116, 20, 98, 87, 99, 69, 33, 86, 111, 100, 122, 121, 127, 65, 58, 13, 8, 56, 47, 95, 9, 50, 114, 81, 110, 59, 118, 72, 48, 77, 17, 29, 24, 21, 52, 104, 22, 54, 28, 94, 83, 85, 51, 57, 14, 73, 36, 16, 19, 66, 76, 71, 11, 93, 78, 120, 27, 6, 108, 5, 117, 43, 49, 63, 3, 67, 2, 61, 26, 64, 70, 68, 0, 101, 105, 124, 62, 55, 30, 80, 89, 103, 34, 44, 113, 45, 60, 1, 38, 106, 42, 115, 109, 12, 125, 119, 126, 53, 112, 123, 46], [103, 105, 96, 47, 59, 45, 42, 30, 19, 93, 14, 112, 113, 32, 86, 0, 127, 104, 38, 21, 84, 57, 48, 89, 72, 124, 87, 81, 51, 80, 55, 44, 25, 63, 126, 94, 118, 123, 34, 52, 68, 26, 40, 7, 115, 27, 3, 33, 88, 11, 120, 8, 125, 1, 23, 66, 70, 110, 77, 5, 107, 39, 50, 73, 58, 46, 13, 119, 6, 91, 9, 69, 15, 17, 76, 74, 43, 54, 24, 79, 95, 75, 22, 82, 78, 90, 12, 121, 92, 16, 71, 31, 114, 4, 117, 116, 18, 20, 67, 61, 53, 97, 37, 28, 41, 49, 35, 60, 36, 122, 99, 10, 85, 2, 109, 100, 101, 108, 83, 102, 56, 111, 65, 62, 98, 29, 64, 106], [104, 42, 103, 96, 45, 34, 47, 93, 32, 89, 27, 30, 84, 94, 26, 21, 81, 112, 116, 87, 105, 19, 51, 102, 86, 25, 38, 88, 80, 15, 41, 23, 119, 33, 20, 118, 52, 111, 114, 113, 44, 110, 85, 57, 98, 125, 124, 13, 48, 54, 11, 107, 43, 73, 55, 37, 106, 12, 100, 59, 10, 76, 24, 127, 77, 82, 115, 60, 14, 72, 101, 68, 90, 121, 29, 91, 0, 63, 97, 58, 75, 9, 61, 46, 126, 108, 4, 39, 56, 18, 22, 28, 70, 117, 120, 53, 109, 66, 6, 123, 3, 1, 95, 92, 49, 31, 50, 122, 99, 40, 69, 5, 62, 67, 83, 79, 16, 36, 64, 78, 2, 8, 74, 7, 35, 65, 71, 17], [103, 104, 96, 42, 45, 105, 30, 19, 93, 59, 47, 89, 113, 14, 86, 112, 84, 21, 32, 124, 80, 87, 57, 81, 44, 0, 72, 127, 88, 115, 25, 26, 48, 63, 119, 123, 51, 68, 118, 34, 52, 15, 55, 98, 126, 61, 43, 73, 24, 110, 102, 46, 107, 125, 121, 23, 116, 114, 41, 56, 54, 66, 27, 8, 3, 70, 91, 76, 6, 11, 39, 13, 1, 33, 97, 74, 4, 94, 22, 77, 58, 7, 37, 95, 82, 50, 36, 17, 117, 120, 53, 122, 75, 16, 90, 79, 67, 111, 40, 5, 64, 9, 2, 78, 18, 38, 31, 71, 60, 65, 12, 100, 108, 92, 85, 69, 29, 49, 99, 101, 62, 20, 109, 28, 35, 10, 106, 83], [42, 104, 103, 96, 45, 30, 89, 112, 34, 27, 113, 32, 102, 47, 93, 26, 105, 84, 94, 21, 81, 86, 88, 87, 124, 25, 59, 80, 44, 15, 19, 38, 114, 20, 57, 85, 52, 51, 23, 118, 127, 125, 119, 43, 100, 29, 33, 41, 46, 116, 82, 13, 126, 48, 11, 107, 77, 73, 98, 72, 122, 106, 54, 37, 95, 55, 24, 76, 12, 115, 10, 14, 97, 39, 110, 68, 101, 123, 56, 63, 92, 4, 0, 6, 91, 53, 90, 121, 50, 40, 61, 111, 22, 75, 9, 18, 16, 70, 117, 31, 60, 3, 58, 99, 66, 49, 74, 67, 1, 108, 109, 79, 2, 120, 35, 62, 64, 36, 28, 7, 83, 8, 69, 65, 5, 78, 71, 17], [100, 39, 31, 50, 91, 25, 47, 97, 96, 42, 116, 40, 90, 60, 41, 46, 19, 70, 44, 89, 22, 109, 48, 33, 95, 77, 27, 107, 2, 43, 101, 0, 18, 17, 108, 80, 49, 13, 53, 7, 10, 103, 64, 45, 21, 94, 12, 74, 79, 76, 118, 115, 124, 5, 73, 120, 35, 56, 99, 28, 55, 81, 54, 110, 14, 114, 126, 6, 68, 125, 127, 123, 93, 112, 113, 38, 58, 57, 61, 122, 32, 121, 75, 62, 84, 88, 119, 111, 59, 106, 3, 63, 72, 52, 30, 36, 26, 51, 86, 34, 98, 102, 29, 117, 16, 9, 8, 67, 83, 105, 11, 71, 65, 15, 4, 69, 1, 24, 104, 87, 66, 20, 82, 37, 92, 78, 23, 85], [40, 25, 64, 101, 42, 41, 45, 110, 72, 113, 56, 112, 108, 68, 107, 111, 114, 39, 2, 61, 125, 55, 119, 51, 97, 63, 31, 121, 22, 57, 54, 126, 59, 74, 123, 80, 87, 96, 52, 62, 18, 120, 7, 48, 0, 84, 58, 28, 91, 70, 115, 93, 11, 5, 14, 49, 60, 17, 30, 76, 66, 44, 19, 24, 88, 77, 3, 100, 13, 50, 1, 79, 109, 69, 90, 95, 99, 102, 104, 33, 53, 71, 21, 73, 8, 26, 106, 65, 75, 116, 117, 46, 98, 105, 89, 92, 67, 47, 35, 34, 81, 12, 4, 6, 122, 36, 78, 85, 10, 94, 43, 82, 83, 9, 118, 86, 103, 15, 127, 37, 16, 38, 27, 124, 23, 20, 32, 29], [0, 39, 22, 42, 96, 41, 91, 18, 10, 107, 79, 40, 14, 21, 19, 50, 77, 71, 80, 88, 70, 84, 48, 5, 45, 85, 44, 17, 11, 81, 100, 46, 28, 60, 76, 83, 72, 110, 66, 63, 57, 36, 47, 109, 2, 108, 56, 125, 120, 35, 111, 93, 61, 73, 114, 24, 112, 49, 55, 113, 87, 90, 121, 30, 64, 4, 99, 126, 116, 15, 51, 103, 52, 54, 6, 82, 97, 43, 115, 86, 119, 123, 58, 7, 101, 117, 69, 53, 13, 33, 59, 9, 62, 32, 68, 75, 31, 8, 3, 1, 12, 92, 29, 20, 78, 65, 104, 37, 27, 106, 122, 74, 16, 23, 34, 95, 25, 118, 26, 89, 94, 67, 98, 38, 105, 102, 127, 124], [108, 54, 112, 119, 59, 62, 113, 110, 123, 111, 26, 51, 61, 107, 55, 125, 89, 115, 114, 45, 121, 98, 58, 57, 63, 48, 52, 49, 34, 109, 122, 41, 126, 43, 127, 38, 32, 95, 56, 94, 100, 33, 120, 53, 50, 24, 124, 29, 40, 101, 44, 90, 25, 85, 8, 102, 60, 97, 88, 86, 83, 27, 42, 118, 46, 99, 66, 105, 4, 12, 19, 10, 47, 116, 96, 13, 82, 22, 16, 92, 30, 64, 31, 37, 117, 72, 28, 91, 87, 106, 69, 2, 103, 17, 9, 36, 70, 5, 84, 65, 76, 104, 75, 11, 39, 0, 78, 73, 20, 77, 23, 3, 6, 81, 68, 15, 74, 79, 80, 21, 35, 7, 93, 18, 71, 14, 67, 1]], "model.layers.0.self_attn.k_proj": [[0, 97, 42, 47, 103, 86, 93, 1, 25, 38, 88, 49, 40, 108, 65, 91, 117, 48, 57, 66, 78, 46, 68, 9, 64, 50, 52, 19, 45, 31, 118, 2, 55, 51, 70, 67, 85, 61, 77, 127, 82, 17, 63, 120, 80, 126, 87, 6, 75, 124, 72, 123, 3, 54, 62, 122, 14, 121, 7, 69, 53, 5, 56, 12, 10, 84, 41, 119, 115, 81, 34, 4, 76, 101, 107, 105, 60, 26, 74, 23, 79, 89, 15, 73, 24, 83, 22, 21, 18, 58, 43, 28, 33, 116, 29, 35, 27, 11, 112, 32, 30, 92, 16, 37, 104, 98, 59, 99, 94, 39, 90, 114, 13, 96, 36, 113, 100, 109, 95, 125, 111, 20, 106, 71, 8, 110, 44, 102], [39, 97, 105, 102, 112, 46, 65, 43, 29, 113, 64, 51, 77, 68, 111, 25, 86, 125, 18, 67, 106, 70, 9, 80, 17, 4, 59, 11, 116, 31, 124, 24, 126, 12, 88, 3, 108, 78, 52, 71, 73, 16, 45, 47, 90, 66, 42, 22, 53, 57, 21, 61, 122, 118, 20, 1, 6, 121, 114, 93, 109, 54, 89, 62, 76, 120, 74, 48, 69, 123, 33, 87, 50, 13, 55, 7, 10, 127, 84, 95, 83, 115, 41, 23, 26, 119, 2, 117, 60, 103, 56, 5, 30, 37, 72, 81, 40, 100, 101, 91, 14, 104, 94, 15, 92, 19, 38, 85, 8, 0, 49, 79, 75, 35, 36, 28, 63, 98, 82, 32, 34, 99, 110, 44, 27, 58, 96, 107], [38, 39, 97, 109, 105, 89, 25, 49, 106, 78, 44, 40, 20, 48, 36, 126, 33, 64, 62, 31, 104, 108, 82, 27, 122, 29, 120, 94, 34, 96, 113, 117, 51, 42, 107, 103, 11, 8, 115, 16, 18, 84, 123, 14, 30, 37, 124, 91, 125, 50, 88, 56, 61, 41, 17, 87, 114, 66, 21, 47, 12, 53, 54, 19, 60, 26, 52, 73, 28, 24, 23, 118, 90, 32, 81, 83, 59, 102, 85, 15, 1, 74, 93, 57, 121, 43, 75, 58, 3, 112, 70, 92, 9, 69, 13, 127, 111, 79, 5, 6, 80, 119, 22, 63, 55, 76, 99, 7, 116, 35, 110, 77, 68, 4, 101, 72, 86, 46, 67, 10, 71, 45, 0, 2, 95, 98, 65, 100], [107, 39, 108, 64, 40, 47, 105, 97, 113, 29, 65, 30, 3, 66, 36, 83, 48, 89, 116, 122, 101, 90, 2, 95, 17, 13, 16, 33, 74, 38, 85, 23, 19, 63, 87, 27, 109, 5, 71, 114, 24, 81, 26, 84, 104, 53, 50, 92, 82, 93, 4, 34, 22, 121, 106, 76, 1, 14, 78, 68, 35, 79, 115, 103, 120, 123, 9, 42, 55, 41, 69, 7, 126, 118, 88, 18, 127, 28, 112, 51, 59, 80, 54, 124, 111, 117, 62, 91, 15, 43, 77, 96, 125, 110, 98, 6, 45, 49, 119, 20, 100, 94, 60, 25, 75, 58, 86, 56, 52, 44, 0, 57, 21, 61, 12, 10, 46, 31, 102, 11, 32, 70, 8, 72, 99, 67, 37, 73], [103, 105, 107, 33, 0, 37, 65, 83, 24, 28, 95, 87, 78, 19, 84, 23, 25, 77, 34, 96, 125, 54, 94, 14, 92, 35, 21, 89, 79, 5, 116, 123, 22, 59, 13, 18, 29, 55, 73, 81, 36, 12, 106, 52, 127, 111, 102, 56, 121, 15, 97, 42, 63, 46, 48, 120, 11, 45, 91, 51, 71, 76, 126, 93, 85, 66, 119, 122, 90, 8, 68, 60, 26, 98, 2, 118, 40, 39, 109, 16, 113, 57, 32, 58, 3, 27, 112, 62, 61, 44, 30, 7, 80, 115, 114, 124, 41, 74, 117, 50, 17, 110, 49, 31, 69, 53, 47, 108, 4, 64, 82, 9, 104, 67, 100, 75, 43, 1, 88, 72, 38, 101, 10, 20, 70, 86, 99, 6], [104, 38, 43, 103, 95, 42, 45, 46, 105, 112, 84, 113, 53, 50, 59, 125, 96, 90, 115, 88, 60, 23, 13, 62, 34, 61, 91, 82, 58, 124, 0, 57, 92, 63, 111, 21, 123, 120, 119, 20, 79, 19, 55, 54, 17, 15, 126, 22, 81, 87, 27, 127, 44, 72, 49, 85, 51, 86, 56, 52, 117, 80, 48, 75, 114, 31, 89, 18, 94, 116, 29, 121, 122, 1, 33, 12, 16, 118, 32, 7, 35, 101, 64, 40, 78, 47, 24, 109, 65, 69, 71, 14, 76, 73, 100, 11, 97, 110, 41, 8, 26, 4, 108, 67, 39, 93, 25, 36, 70, 28, 10, 66, 74, 6, 77, 37, 107, 99, 5, 68, 83, 9, 98, 3, 102, 106, 30, 2], [39, 111, 32, 1, 0, 109, 2, 6, 48, 4, 40, 41, 29, 108, 67, 84, 66, 15, 107, 11, 65, 81, 115, 70, 89, 73, 52, 21, 30, 57, 104, 119, 80, 42, 118, 87, 124, 123, 13, 106, 72, 3, 49, 86, 58, 98, 63, 27, 114, 110, 127, 113, 126, 76, 53, 26, 25, 14, 83, 59, 96, 55, 77, 88, 19, 94, 121, 93, 23, 125, 7, 78, 51, 120, 46, 82, 116, 71, 68, 105, 85, 24, 97, 8, 61, 74, 103, 10, 54, 47, 117, 122, 34, 91, 18, 92, 20, 62, 100, 12, 31, 75, 56, 5, 90, 9, 79, 28, 33, 101, 69, 99, 22, 36, 95, 35, 43, 50, 60, 16, 112, 37, 44, 102, 17, 45, 64, 38], [64, 66, 4, 106, 6, 67, 1, 71, 10, 69, 105, 46, 43, 104, 37, 44, 92, 47, 109, 103, 94, 15, 8, 83, 85, 7, 56, 53, 73, 86, 126, 116, 32, 78, 22, 82, 75, 24, 77, 49, 9, 120, 76, 79, 50, 91, 114, 52, 127, 48, 17, 16, 35, 19, 124, 18, 102, 12, 20, 68, 58, 57, 63, 60, 115, 13, 113, 121, 123, 14, 81, 117, 45, 62, 59, 55, 51, 111, 122, 125, 23, 61, 119, 118, 110, 21, 3, 100, 112, 33, 54, 38, 72, 34, 2, 11, 80, 65, 29, 70, 88, 27, 0, 36, 98, 84, 89, 97, 74, 93, 30, 5, 107, 90, 96, 26, 87, 95, 28, 39, 108, 41, 42, 99, 101, 31, 25, 40]], "model.layers.0.self_attn.qk_proj": [[104, 39, 33, 42, 103, 41, 97, 64, 0, 93, 108, 25, 107, 89, 105, 113, 126, 29, 102, 121, 111, 48, 57, 95, 59, 115, 47, 123, 70, 88, 53, 63, 96, 30, 24, 20, 43, 87, 116, 23, 38, 109, 78, 124, 49, 19, 61, 112, 52, 51, 50, 120, 27, 83, 119, 22, 32, 114, 45, 21, 106, 125, 122, 40, 44, 127, 86, 54, 17, 118, 56, 85, 55, 14, 84, 46, 65, 81, 68, 1, 91, 3, 62, 4, 66, 18, 82, 77, 9, 79, 117, 2, 90, 92, 110, 8, 80, 67, 13, 76, 15, 75, 60, 12, 26, 16, 31, 11, 73, 10, 58, 71, 74, 7, 5, 69, 28, 101, 6, 34, 98, 94, 72, 37, 100, 35, 36, 99], [39, 104, 33, 42, 103, 41, 64, 97, 108, 0, 93, 107, 105, 25, 113, 126, 29, 89, 95, 102, 111, 59, 47, 38, 123, 121, 124, 88, 115, 50, 53, 106, 116, 48, 85, 119, 30, 70, 43, 63, 40, 87, 96, 24, 21, 57, 49, 23, 122, 27, 20, 22, 44, 114, 109, 52, 19, 120, 54, 61, 45, 67, 127, 17, 78, 32, 81, 83, 84, 62, 56, 125, 86, 51, 46, 14, 1, 118, 92, 112, 55, 79, 26, 18, 12, 2, 82, 15, 76, 65, 90, 3, 77, 91, 66, 117, 110, 73, 74, 4, 31, 9, 16, 13, 68, 75, 10, 80, 60, 8, 7, 58, 11, 71, 28, 98, 94, 6, 5, 72, 69, 34, 101, 100, 35, 36, 37, 99], [104, 39, 33, 42, 103, 41, 97, 108, 93, 0, 64, 25, 89, 29, 105, 107, 113, 95, 126, 102, 59, 123, 111, 115, 121, 47, 57, 88, 63, 24, 48, 124, 96, 38, 53, 116, 30, 87, 19, 52, 119, 43, 23, 20, 21, 40, 61, 127, 27, 109, 51, 84, 50, 22, 112, 106, 49, 32, 78, 86, 125, 14, 85, 54, 56, 17, 45, 120, 114, 70, 122, 44, 3, 118, 81, 46, 83, 65, 62, 82, 1, 91, 92, 15, 55, 76, 2, 117, 110, 12, 90, 79, 80, 26, 67, 16, 18, 77, 68, 11, 66, 13, 73, 9, 4, 31, 75, 6, 58, 74, 60, 72, 71, 10, 8, 28, 69, 7, 34, 98, 5, 94, 101, 100, 37, 35, 36, 99], [39, 104, 42, 33, 103, 41, 97, 108, 64, 0, 105, 93, 107, 113, 126, 89, 25, 29, 59, 124, 123, 111, 119, 63, 102, 115, 121, 95, 53, 47, 48, 50, 57, 87, 106, 96, 116, 43, 88, 38, 49, 30, 61, 23, 24, 22, 54, 20, 51, 6, 122, 44, 114, 27, 40, 109, 52, 78, 85, 125, 21, 83, 45, 17, 19, 118, 86, 84, 112, 62, 65, 81, 56, 55, 14, 120, 79, 127, 32, 68, 66, 67, 1, 46, 3, 9, 91, 110, 18, 4, 12, 73, 82, 26, 2, 72, 70, 92, 77, 15, 117, 76, 13, 16, 90, 58, 80, 75, 10, 31, 60, 74, 11, 7, 71, 69, 28, 5, 98, 94, 101, 34, 8, 100, 35, 37, 36, 99], [39, 104, 33, 42, 103, 41, 97, 64, 108, 0, 93, 105, 25, 89, 107, 113, 126, 115, 102, 59, 29, 123, 47, 95, 119, 121, 63, 53, 111, 124, 57, 48, 116, 43, 6, 20, 30, 23, 88, 96, 24, 106, 109, 38, 87, 50, 52, 22, 21, 49, 27, 17, 40, 61, 19, 112, 122, 78, 3, 54, 14, 44, 45, 114, 51, 91, 84, 86, 46, 65, 85, 125, 1, 127, 118, 120, 32, 81, 56, 62, 82, 83, 26, 73, 72, 55, 15, 9, 2, 110, 4, 79, 66, 76, 92, 117, 12, 67, 18, 16, 77, 11, 13, 68, 31, 80, 90, 75, 60, 58, 10, 74, 71, 70, 7, 98, 34, 28, 69, 5, 94, 101, 8, 100, 37, 36, 35, 99], [39, 104, 42, 33, 103, 41, 97, 108, 0, 64, 25, 93, 89, 105, 107, 126, 95, 113, 29, 102, 115, 123, 59, 48, 121, 53, 124, 6, 30, 88, 111, 47, 87, 63, 96, 106, 116, 23, 119, 38, 27, 24, 43, 57, 20, 49, 40, 22, 61, 44, 50, 54, 52, 122, 21, 109, 1, 51, 86, 81, 85, 19, 78, 114, 3, 84, 45, 125, 112, 32, 127, 14, 83, 118, 120, 17, 2, 46, 56, 55, 76, 62, 82, 91, 92, 18, 72, 77, 65, 79, 9, 117, 15, 12, 66, 90, 110, 67, 68, 4, 26, 16, 31, 13, 80, 73, 75, 10, 74, 60, 7, 11, 71, 58, 98, 5, 94, 28, 34, 70, 69, 101, 8, 100, 35, 37, 99, 36], [39, 104, 33, 42, 103, 97, 41, 0, 64, 93, 108, 113, 105, 107, 25, 89, 126, 47, 29, 95, 102, 30, 121, 59, 123, 115, 48, 111, 53, 6, 124, 88, 24, 50, 119, 49, 116, 27, 38, 106, 63, 43, 57, 96, 87, 23, 21, 32, 20, 125, 52, 61, 22, 109, 112, 44, 85, 45, 51, 19, 83, 122, 114, 40, 14, 54, 55, 78, 17, 120, 2, 84, 86, 62, 1, 118, 127, 66, 91, 3, 56, 72, 110, 4, 81, 18, 82, 77, 46, 68, 26, 79, 65, 16, 13, 9, 15, 76, 67, 12, 80, 92, 75, 117, 31, 90, 73, 60, 74, 58, 10, 11, 71, 7, 28, 70, 5, 69, 98, 34, 94, 101, 8, 100, 37, 35, 36, 99], [39, 33, 104, 42, 103, 97, 41, 93, 64, 108, 0, 25, 105, 113, 89, 107, 126, 29, 95, 47, 59, 102, 48, 88, 53, 115, 116, 123, 124, 111, 24, 57, 30, 38, 96, 63, 87, 52, 121, 19, 43, 23, 27, 20, 50, 119, 51, 85, 32, 6, 40, 21, 106, 49, 109, 61, 127, 78, 125, 86, 122, 84, 22, 54, 81, 120, 45, 112, 14, 114, 91, 17, 83, 56, 46, 44, 118, 2, 67, 16, 90, 110, 66, 62, 1, 117, 4, 79, 65, 76, 92, 72, 68, 82, 15, 18, 75, 13, 26, 55, 12, 9, 3, 80, 31, 77, 11, 73, 10, 58, 74, 60, 70, 28, 7, 98, 71, 34, 94, 101, 69, 5, 100, 37, 8, 35, 36, 99], [39, 104, 33, 42, 103, 41, 97, 64, 0, 108, 93, 105, 107, 113, 25, 95, 89, 29, 126, 48, 59, 124, 102, 88, 47, 38, 115, 123, 111, 53, 50, 106, 21, 27, 44, 30, 96, 43, 87, 116, 121, 109, 52, 23, 24, 119, 40, 63, 49, 57, 85, 19, 22, 114, 86, 122, 20, 127, 78, 54, 83, 56, 51, 17, 32, 84, 61, 14, 81, 90, 125, 120, 112, 45, 46, 118, 66, 1, 70, 82, 15, 62, 12, 79, 91, 55, 2, 76, 26, 65, 67, 18, 92, 6, 110, 9, 117, 73, 3, 72, 77, 16, 31, 68, 10, 13, 80, 74, 75, 4, 60, 11, 28, 58, 7, 98, 94, 34, 71, 5, 69, 8, 101, 100, 37, 35, 36, 99], [33, 104, 39, 42, 103, 41, 97, 89, 25, 93, 108, 29, 64, 95, 0, 113, 102, 107, 105, 126, 38, 87, 59, 88, 96, 48, 27, 24, 53, 47, 32, 116, 30, 115, 124, 85, 123, 19, 21, 40, 106, 23, 57, 43, 109, 121, 111, 22, 44, 20, 83, 63, 122, 78, 52, 50, 84, 86, 81, 49, 127, 2, 56, 45, 17, 26, 119, 66, 14, 114, 54, 61, 51, 125, 70, 91, 82, 4, 120, 112, 76, 12, 46, 118, 92, 18, 1, 67, 15, 55, 117, 68, 79, 65, 62, 80, 16, 90, 13, 77, 73, 74, 72, 31, 75, 9, 11, 10, 3, 110, 71, 7, 58, 60, 6, 28, 8, 34, 94, 98, 101, 69, 5, 100, 37, 36, 35, 99], [39, 104, 33, 42, 103, 0, 41, 64, 97, 108, 25, 93, 113, 105, 107, 126, 89, 59, 29, 115, 111, 47, 57, 121, 102, 63, 123, 48, 95, 124, 70, 52, 88, 96, 116, 53, 119, 43, 87, 61, 49, 23, 24, 30, 38, 50, 19, 125, 106, 20, 51, 21, 32, 109, 56, 22, 54, 14, 78, 1, 112, 120, 83, 67, 45, 40, 114, 44, 122, 118, 84, 127, 27, 46, 86, 81, 68, 65, 3, 17, 62, 85, 55, 91, 110, 2, 79, 18, 66, 77, 82, 4, 76, 9, 13, 117, 92, 15, 90, 26, 12, 73, 60, 11, 16, 31, 74, 58, 75, 80, 10, 8, 71, 7, 72, 28, 5, 69, 6, 98, 101, 94, 34, 100, 37, 35, 99, 36], [39, 104, 33, 42, 103, 0, 41, 64, 108, 97, 107, 105, 113, 93, 126, 25, 89, 59, 121, 63, 111, 115, 70, 123, 47, 29, 95, 102, 119, 48, 124, 96, 53, 116, 49, 57, 51, 88, 52, 87, 50, 61, 27, 30, 65, 109, 43, 106, 22, 120, 24, 23, 21, 114, 38, 56, 19, 122, 40, 54, 14, 45, 125, 118, 83, 84, 112, 20, 46, 44, 127, 55, 32, 86, 81, 78, 62, 17, 67, 82, 85, 3, 91, 2, 90, 117, 79, 66, 77, 92, 68, 4, 110, 76, 15, 13, 60, 73, 26, 31, 1, 9, 12, 75, 18, 80, 8, 58, 74, 11, 16, 71, 10, 28, 7, 69, 5, 98, 72, 34, 94, 6, 101, 100, 37, 36, 35, 99], [39, 33, 104, 42, 103, 97, 41, 108, 89, 93, 95, 105, 0, 113, 25, 64, 29, 126, 107, 30, 102, 27, 124, 48, 38, 47, 115, 53, 106, 88, 24, 59, 23, 116, 123, 21, 96, 40, 32, 87, 86, 111, 50, 121, 44, 85, 43, 22, 122, 20, 109, 63, 49, 70, 57, 119, 52, 83, 81, 19, 51, 14, 91, 17, 114, 118, 78, 54, 127, 125, 45, 112, 61, 26, 92, 84, 2, 56, 76, 66, 18, 15, 82, 12, 65, 62, 79, 46, 1, 120, 117, 80, 55, 73, 31, 68, 90, 110, 16, 67, 77, 3, 8, 13, 75, 74, 4, 9, 6, 10, 11, 60, 34, 58, 71, 28, 98, 94, 7, 72, 69, 101, 5, 100, 36, 37, 35, 99], [33, 104, 39, 42, 103, 41, 97, 93, 108, 89, 25, 0, 64, 95, 105, 113, 29, 107, 126, 102, 121, 115, 53, 30, 88, 38, 48, 47, 96, 59, 124, 123, 57, 43, 87, 21, 111, 116, 22, 23, 27, 63, 32, 40, 51, 52, 50, 106, 20, 109, 86, 44, 83, 122, 24, 14, 49, 78, 119, 127, 19, 112, 84, 81, 61, 54, 17, 114, 85, 55, 120, 91, 45, 46, 70, 18, 82, 62, 79, 65, 12, 2, 56, 92, 3, 66, 118, 4, 125, 90, 73, 26, 117, 13, 1, 68, 8, 80, 15, 76, 6, 31, 67, 77, 110, 16, 11, 9, 75, 10, 60, 74, 28, 7, 71, 5, 58, 98, 34, 101, 94, 69, 100, 72, 37, 35, 36, 99], [39, 104, 33, 42, 103, 0, 97, 41, 108, 93, 64, 113, 25, 107, 126, 47, 105, 89, 59, 29, 115, 124, 48, 63, 121, 123, 53, 95, 57, 111, 116, 102, 52, 30, 96, 24, 49, 88, 119, 51, 87, 23, 38, 20, 106, 78, 32, 61, 83, 44, 43, 27, 21, 22, 50, 127, 109, 122, 86, 54, 85, 19, 114, 6, 17, 125, 14, 3, 40, 112, 4, 81, 118, 46, 45, 56, 65, 62, 84, 2, 55, 120, 82, 66, 8, 91, 18, 76, 68, 79, 26, 117, 1, 12, 9, 11, 67, 73, 90, 70, 13, 15, 16, 110, 92, 77, 31, 60, 75, 80, 71, 10, 7, 58, 74, 28, 5, 98, 34, 69, 94, 72, 101, 37, 100, 35, 36, 99], [39, 104, 33, 42, 103, 41, 64, 0, 97, 108, 93, 113, 25, 105, 107, 89, 126, 115, 29, 102, 59, 123, 95, 111, 47, 6, 48, 121, 53, 88, 63, 57, 119, 124, 43, 49, 116, 30, 52, 109, 24, 38, 106, 87, 50, 96, 61, 20, 23, 21, 19, 44, 27, 114, 14, 122, 125, 51, 54, 85, 22, 112, 56, 40, 120, 84, 83, 118, 127, 17, 65, 62, 78, 55, 45, 46, 32, 66, 81, 91, 3, 86, 1, 67, 8, 76, 18, 79, 117, 68, 2, 73, 110, 26, 90, 82, 77, 9, 4, 13, 92, 12, 16, 80, 74, 15, 60, 31, 11, 75, 10, 7, 58, 28, 71, 70, 94, 69, 5, 98, 34, 101, 72, 37, 100, 35, 99, 36], [39, 33, 104, 42, 103, 41, 97, 108, 0, 93, 64, 105, 107, 25, 89, 95, 113, 126, 29, 102, 59, 30, 111, 121, 47, 123, 88, 38, 124, 115, 87, 6, 24, 43, 106, 50, 53, 48, 116, 119, 40, 27, 21, 23, 96, 86, 109, 85, 63, 20, 44, 83, 32, 122, 22, 84, 49, 52, 57, 17, 14, 91, 114, 81, 51, 19, 61, 54, 127, 45, 56, 65, 120, 92, 125, 90, 79, 66, 2, 76, 78, 112, 82, 46, 118, 26, 15, 18, 62, 55, 80, 3, 13, 67, 8, 68, 9, 31, 12, 117, 1, 74, 73, 16, 110, 4, 77, 11, 75, 10, 60, 58, 71, 7, 28, 34, 94, 98, 5, 72, 69, 100, 70, 101, 35, 36, 37, 99], [39, 104, 33, 42, 103, 41, 97, 108, 93, 64, 0, 89, 29, 105, 113, 25, 126, 107, 95, 115, 47, 124, 111, 116, 88, 59, 48, 121, 123, 53, 57, 87, 102, 24, 38, 63, 30, 20, 23, 96, 43, 21, 6, 40, 127, 22, 106, 119, 52, 50, 27, 51, 109, 49, 61, 32, 54, 17, 78, 84, 44, 112, 114, 19, 83, 118, 85, 82, 56, 120, 45, 122, 14, 125, 81, 1, 46, 117, 86, 65, 91, 67, 62, 68, 26, 2, 92, 90, 55, 4, 79, 12, 77, 66, 15, 18, 13, 76, 31, 3, 9, 110, 16, 73, 75, 80, 11, 74, 10, 60, 58, 8, 7, 98, 71, 70, 5, 72, 69, 28, 34, 94, 101, 100, 37, 35, 99, 36], [39, 104, 33, 42, 103, 64, 97, 41, 0, 108, 93, 107, 113, 89, 105, 126, 25, 29, 59, 121, 115, 102, 95, 47, 111, 123, 124, 48, 63, 53, 57, 88, 119, 43, 30, 38, 24, 50, 116, 96, 49, 61, 51, 106, 87, 20, 52, 54, 19, 23, 21, 44, 27, 22, 81, 6, 14, 125, 122, 85, 32, 112, 114, 109, 127, 120, 55, 78, 62, 40, 84, 83, 56, 2, 45, 17, 86, 118, 46, 1, 67, 82, 91, 65, 3, 110, 68, 79, 4, 66, 12, 26, 77, 18, 73, 15, 90, 117, 92, 76, 9, 70, 13, 80, 11, 16, 10, 75, 31, 74, 58, 60, 7, 71, 69, 72, 8, 28, 5, 98, 94, 34, 101, 37, 100, 35, 99, 36], [39, 33, 104, 42, 103, 41, 108, 97, 0, 64, 113, 25, 93, 89, 107, 105, 29, 95, 126, 102, 115, 59, 48, 124, 123, 121, 111, 38, 88, 53, 63, 21, 24, 47, 106, 116, 30, 87, 23, 49, 22, 27, 96, 119, 85, 43, 20, 57, 122, 40, 52, 109, 44, 65, 32, 14, 83, 17, 81, 51, 19, 61, 50, 54, 114, 125, 78, 91, 127, 3, 84, 112, 70, 79, 56, 86, 67, 120, 46, 45, 12, 62, 18, 2, 118, 26, 76, 66, 73, 55, 82, 92, 4, 16, 13, 6, 90, 1, 15, 9, 117, 110, 31, 77, 80, 74, 10, 68, 75, 72, 7, 71, 11, 28, 58, 60, 94, 98, 34, 5, 8, 69, 101, 100, 35, 37, 36, 99], [39, 104, 33, 42, 103, 41, 64, 97, 108, 0, 93, 105, 113, 89, 107, 59, 25, 29, 126, 95, 123, 47, 121, 48, 111, 115, 102, 63, 124, 70, 57, 87, 119, 53, 116, 88, 30, 109, 49, 43, 50, 24, 56, 96, 114, 61, 38, 52, 65, 51, 106, 40, 127, 20, 44, 83, 85, 120, 118, 54, 22, 86, 23, 46, 21, 19, 122, 78, 112, 27, 125, 81, 45, 14, 32, 62, 17, 84, 67, 91, 82, 90, 117, 15, 1, 55, 18, 3, 110, 76, 12, 13, 26, 66, 68, 77, 60, 72, 31, 2, 4, 9, 16, 75, 79, 92, 58, 73, 11, 10, 80, 6, 74, 7, 98, 28, 71, 69, 5, 34, 94, 100, 8, 101, 37, 35, 36, 99], [39, 104, 33, 42, 103, 41, 97, 108, 107, 0, 93, 113, 64, 105, 126, 89, 25, 95, 29, 30, 47, 59, 123, 102, 88, 124, 121, 111, 70, 115, 38, 43, 116, 24, 48, 53, 119, 106, 50, 19, 96, 63, 87, 57, 44, 21, 49, 23, 109, 27, 85, 52, 22, 14, 40, 51, 122, 20, 32, 56, 54, 114, 61, 17, 84, 127, 125, 2, 120, 62, 4, 83, 81, 67, 86, 78, 55, 79, 66, 112, 82, 118, 46, 26, 91, 68, 65, 1, 45, 15, 76, 16, 72, 18, 92, 117, 73, 12, 3, 9, 90, 77, 11, 110, 74, 13, 31, 80, 10, 71, 60, 58, 75, 7, 28, 94, 98, 69, 8, 34, 5, 101, 100, 37, 6, 35, 36, 99], [39, 104, 33, 42, 103, 41, 0, 97, 108, 105, 93, 64, 25, 89, 113, 107, 95, 29, 126, 102, 115, 59, 111, 123, 70, 124, 48, 53, 87, 24, 88, 121, 106, 47, 38, 27, 63, 21, 116, 119, 43, 50, 96, 22, 30, 40, 20, 23, 19, 57, 52, 49, 122, 114, 109, 85, 44, 54, 17, 51, 84, 78, 81, 65, 61, 3, 45, 112, 86, 120, 125, 56, 127, 32, 14, 91, 62, 82, 118, 46, 15, 92, 12, 79, 1, 18, 110, 26, 9, 76, 72, 83, 73, 67, 55, 117, 2, 80, 16, 66, 31, 77, 13, 10, 75, 4, 68, 90, 11, 60, 58, 7, 74, 6, 28, 34, 98, 5, 69, 71, 94, 101, 100, 35, 8, 36, 37, 99], [39, 104, 33, 42, 103, 41, 97, 108, 0, 64, 25, 93, 107, 105, 89, 29, 113, 95, 126, 102, 115, 123, 48, 124, 88, 53, 59, 121, 30, 87, 47, 38, 96, 22, 63, 111, 106, 23, 43, 27, 49, 24, 65, 40, 116, 20, 119, 50, 44, 109, 122, 52, 114, 57, 51, 21, 85, 14, 84, 54, 70, 83, 81, 127, 61, 17, 86, 82, 19, 32, 79, 56, 112, 46, 78, 118, 45, 120, 62, 91, 55, 2, 117, 125, 3, 12, 18, 9, 72, 92, 13, 76, 26, 16, 66, 31, 1, 67, 90, 68, 110, 11, 77, 10, 73, 15, 80, 6, 4, 60, 75, 71, 74, 58, 98, 28, 34, 7, 94, 5, 69, 101, 8, 100, 35, 37, 36, 99], [39, 104, 33, 42, 103, 97, 41, 93, 64, 0, 108, 25, 107, 113, 89, 105, 102, 126, 95, 29, 47, 30, 123, 59, 111, 88, 115, 24, 48, 121, 38, 124, 53, 119, 63, 49, 43, 57, 96, 50, 27, 52, 23, 116, 21, 87, 19, 106, 51, 20, 61, 32, 84, 44, 22, 109, 40, 78, 85, 114, 127, 14, 118, 83, 125, 17, 86, 112, 66, 122, 45, 120, 2, 54, 81, 62, 56, 1, 3, 68, 65, 4, 70, 46, 26, 79, 55, 12, 82, 6, 91, 15, 92, 72, 18, 80, 76, 117, 13, 9, 77, 90, 16, 67, 110, 31, 11, 10, 73, 60, 58, 75, 74, 71, 7, 28, 5, 98, 69, 34, 94, 8, 101, 100, 35, 37, 99, 36], [39, 104, 33, 42, 103, 41, 97, 108, 64, 93, 105, 95, 29, 107, 0, 89, 25, 126, 88, 113, 102, 123, 124, 47, 121, 48, 53, 30, 115, 111, 59, 38, 43, 87, 106, 116, 40, 96, 22, 50, 109, 6, 24, 63, 23, 49, 119, 27, 21, 54, 44, 32, 52, 61, 83, 114, 122, 57, 20, 86, 56, 85, 81, 112, 45, 127, 19, 84, 51, 14, 125, 120, 1, 78, 118, 46, 62, 82, 17, 91, 15, 65, 18, 3, 117, 90, 67, 79, 26, 31, 92, 12, 66, 2, 77, 68, 55, 76, 16, 110, 13, 73, 58, 4, 80, 10, 9, 75, 74, 11, 72, 98, 60, 70, 7, 71, 34, 28, 94, 69, 8, 5, 101, 100, 36, 35, 37, 99], [39, 104, 33, 42, 103, 41, 97, 0, 64, 108, 93, 25, 113, 29, 89, 105, 59, 126, 107, 95, 115, 111, 102, 48, 123, 57, 47, 87, 124, 53, 121, 6, 63, 116, 38, 52, 88, 24, 96, 109, 30, 43, 119, 61, 106, 21, 49, 127, 44, 50, 54, 19, 23, 40, 51, 84, 20, 125, 27, 114, 56, 112, 14, 86, 85, 22, 83, 45, 122, 17, 65, 32, 120, 46, 81, 66, 78, 55, 1, 91, 3, 18, 79, 82, 62, 118, 26, 67, 12, 110, 90, 2, 76, 13, 77, 92, 31, 117, 10, 4, 15, 16, 9, 80, 75, 60, 73, 58, 68, 11, 74, 8, 71, 7, 72, 28, 70, 69, 98, 5, 101, 34, 94, 100, 35, 37, 36, 99], [39, 33, 104, 42, 103, 41, 97, 108, 0, 93, 25, 89, 107, 64, 113, 105, 126, 29, 95, 102, 59, 111, 123, 48, 124, 47, 30, 38, 53, 24, 43, 121, 6, 57, 88, 27, 115, 63, 96, 23, 44, 106, 116, 87, 21, 83, 49, 119, 109, 85, 52, 20, 32, 22, 50, 40, 14, 122, 19, 51, 17, 86, 78, 54, 114, 127, 81, 61, 45, 46, 84, 65, 62, 91, 56, 125, 112, 66, 120, 18, 2, 12, 15, 68, 67, 26, 82, 90, 3, 55, 118, 4, 79, 92, 13, 31, 1, 117, 76, 9, 8, 73, 80, 11, 77, 10, 16, 110, 75, 60, 74, 7, 71, 28, 58, 72, 34, 94, 98, 69, 5, 70, 101, 100, 37, 35, 36, 99], [104, 39, 33, 42, 103, 64, 0, 41, 108, 97, 93, 113, 107, 126, 25, 89, 59, 29, 105, 121, 111, 63, 115, 47, 102, 123, 95, 57, 88, 48, 124, 6, 53, 119, 21, 24, 61, 87, 116, 43, 96, 30, 106, 49, 51, 20, 83, 50, 114, 52, 67, 38, 109, 27, 65, 54, 125, 56, 122, 1, 112, 14, 4, 85, 44, 22, 120, 127, 23, 78, 19, 32, 81, 45, 40, 55, 86, 46, 118, 84, 17, 62, 2, 3, 12, 15, 82, 66, 79, 68, 26, 76, 18, 91, 9, 117, 110, 73, 8, 77, 92, 13, 16, 90, 7, 75, 10, 11, 80, 74, 71, 60, 31, 58, 70, 69, 28, 5, 98, 101, 94, 72, 34, 100, 37, 35, 99, 36], [39, 104, 33, 42, 103, 41, 108, 97, 93, 0, 105, 64, 25, 95, 29, 89, 115, 113, 126, 107, 102, 59, 124, 48, 123, 111, 53, 43, 88, 106, 121, 30, 47, 38, 119, 24, 87, 50, 96, 27, 22, 63, 21, 20, 57, 17, 49, 23, 52, 86, 116, 44, 40, 54, 122, 114, 6, 109, 51, 84, 65, 78, 19, 112, 127, 61, 125, 14, 83, 32, 85, 3, 81, 45, 18, 56, 46, 70, 79, 120, 1, 15, 62, 91, 82, 55, 76, 118, 9, 67, 2, 12, 8, 92, 90, 73, 110, 4, 31, 13, 68, 117, 16, 26, 10, 66, 80, 77, 75, 11, 74, 58, 60, 71, 28, 69, 7, 98, 94, 34, 5, 101, 100, 72, 36, 35, 37, 99], [39, 33, 104, 42, 103, 97, 41, 93, 0, 108, 25, 95, 107, 29, 113, 89, 64, 105, 102, 126, 115, 59, 88, 124, 47, 27, 38, 30, 24, 53, 121, 87, 48, 23, 123, 116, 96, 21, 106, 111, 32, 50, 44, 43, 85, 19, 109, 40, 63, 52, 83, 22, 49, 119, 54, 122, 56, 84, 81, 14, 86, 125, 20, 57, 66, 127, 114, 112, 17, 91, 61, 78, 2, 1, 70, 51, 45, 18, 12, 118, 82, 65, 15, 120, 46, 55, 90, 76, 77, 16, 92, 4, 3, 26, 8, 62, 67, 79, 117, 73, 80, 75, 13, 31, 68, 74, 9, 110, 10, 6, 71, 11, 28, 60, 58, 7, 34, 98, 94, 5, 69, 72, 100, 101, 35, 37, 36, 99], [39, 104, 33, 42, 103, 97, 0, 41, 93, 108, 64, 25, 105, 89, 113, 29, 95, 107, 126, 102, 59, 115, 48, 47, 123, 124, 53, 70, 88, 111, 116, 30, 43, 52, 24, 57, 23, 38, 121, 96, 87, 63, 22, 119, 106, 78, 51, 50, 40, 32, 20, 19, 44, 109, 49, 61, 54, 122, 127, 91, 83, 21, 112, 27, 125, 81, 45, 85, 84, 118, 1, 86, 56, 17, 62, 46, 79, 114, 55, 3, 14, 66, 120, 65, 2, 18, 68, 82, 110, 12, 8, 26, 117, 77, 90, 67, 75, 16, 92, 4, 31, 13, 76, 15, 80, 9, 73, 10, 28, 11, 74, 58, 60, 7, 71, 34, 98, 5, 94, 69, 6, 101, 72, 100, 37, 35, 36, 99]], "model.layers.1.self_attn.q_proj": [[103, 12, 107, 104, 46, 33, 42, 105, 49, 123, 50, 117, 62, 48, 57, 122, 19, 6, 59, 31, 29, 52, 54, 56, 5, 44, 115, 23, 92, 63, 8, 76, 70, 80, 66, 79, 72, 20, 116, 85, 120, 111, 84, 65, 78, 83, 25, 3, 22, 18, 127, 55, 90, 9, 27, 7, 126, 11, 87, 74, 37, 58, 82, 16, 77, 124, 4, 67, 2, 35, 13, 109, 15, 125, 88, 102, 71, 75, 17, 34, 94, 21, 100, 86, 14, 61, 24, 81, 114, 69, 45, 1, 98, 112, 73, 0, 51, 10, 39, 113, 101, 38, 32, 30, 36, 26, 121, 89, 41, 43, 97, 108, 106, 47, 40, 28, 93, 96, 64, 60, 91, 99, 118, 53, 119, 68, 110, 95], [103, 107, 104, 42, 105, 117, 46, 123, 50, 62, 24, 48, 33, 3, 67, 57, 122, 111, 59, 70, 56, 7, 127, 25, 65, 29, 49, 63, 115, 31, 79, 58, 13, 124, 16, 74, 44, 6, 66, 0, 4, 11, 116, 22, 9, 92, 82, 54, 5, 77, 101, 23, 125, 45, 84, 52, 1, 90, 78, 12, 98, 20, 94, 27, 126, 71, 2, 83, 120, 17, 73, 109, 38, 18, 87, 37, 19, 86, 32, 36, 15, 75, 64, 80, 55, 35, 81, 10, 14, 102, 100, 8, 113, 99, 76, 61, 121, 69, 26, 85, 53, 72, 97, 21, 30, 112, 89, 68, 118, 39, 43, 88, 41, 106, 114, 34, 110, 91, 95, 93, 60, 47, 108, 40, 51, 28, 96, 119], [100, 44, 102, 113, 37, 38, 59, 115, 35, 57, 122, 116, 34, 104, 114, 103, 109, 123, 46, 107, 50, 48, 62, 120, 117, 54, 42, 105, 52, 49, 125, 63, 112, 111, 98, 61, 56, 60, 53, 126, 121, 101, 33, 25, 99, 110, 127, 32, 84, 86, 58, 47, 82, 124, 88, 108, 21, 94, 40, 119, 41, 45, 36, 43, 89, 29, 118, 7, 51, 78, 77, 96, 17, 13, 30, 72, 106, 81, 95, 70, 92, 97, 93, 27, 11, 4, 79, 14, 9, 16, 55, 71, 74, 26, 83, 20, 67, 65, 75, 5, 0, 87, 66, 19, 12, 76, 85, 80, 91, 39, 3, 90, 24, 31, 28, 18, 6, 2, 22, 1, 64, 8, 15, 23, 68, 69, 73, 10], [103, 107, 104, 42, 33, 105, 46, 123, 117, 50, 62, 57, 122, 48, 49, 9, 29, 125, 111, 63, 31, 92, 90, 115, 10, 56, 4, 18, 54, 83, 69, 35, 7, 84, 44, 20, 23, 71, 22, 59, 100, 79, 75, 14, 5, 58, 127, 25, 113, 77, 8, 74, 86, 87, 27, 82, 70, 66, 65, 99, 120, 24, 13, 2, 73, 6, 12, 52, 78, 85, 55, 124, 114, 38, 1, 47, 108, 116, 72, 121, 11, 45, 88, 89, 81, 126, 112, 32, 3, 80, 93, 21, 15, 37, 16, 30, 97, 26, 94, 19, 53, 28, 101, 60, 118, 102, 17, 109, 91, 36, 39, 119, 98, 61, 110, 0, 34, 76, 95, 41, 51, 43, 40, 106, 96, 67, 68, 64], [104, 38, 97, 103, 42, 43, 48, 56, 90, 45, 87, 110, 47, 125, 50, 120, 53, 121, 49, 124, 61, 126, 58, 123, 30, 6, 31, 122, 57, 62, 68, 12, 119, 84, 52, 67, 81, 29, 16, 4, 105, 70, 112, 75, 117, 77, 78, 28, 63, 127, 7, 51, 82, 59, 13, 10, 41, 76, 20, 66, 85, 1, 18, 9, 8, 83, 15, 55, 64, 100, 60, 89, 113, 108, 23, 115, 116, 69, 72, 88, 98, 24, 73, 3, 34, 2, 91, 118, 54, 17, 79, 65, 5, 14, 109, 35, 11, 21, 80, 86, 107, 101, 37, 32, 19, 71, 74, 106, 99, 44, 25, 0, 92, 93, 36, 22, 26, 96, 94, 27, 111, 114, 33, 46, 40, 102, 95, 39], [104, 38, 103, 97, 42, 43, 87, 47, 45, 110, 49, 53, 50, 123, 126, 61, 124, 6, 29, 57, 90, 120, 121, 48, 12, 66, 31, 13, 117, 64, 28, 59, 81, 56, 10, 68, 58, 122, 62, 78, 7, 44, 75, 105, 52, 67, 20, 125, 51, 16, 1, 72, 119, 54, 15, 2, 9, 55, 18, 69, 115, 113, 82, 127, 100, 80, 11, 109, 91, 99, 4, 35, 96, 8, 85, 71, 63, 60, 5, 116, 70, 17, 118, 22, 101, 3, 36, 84, 37, 89, 107, 14, 23, 46, 73, 79, 106, 112, 74, 27, 77, 30, 25, 76, 65, 21, 41, 98, 108, 24, 19, 88, 0, 114, 26, 32, 34, 111, 86, 83, 94, 33, 92, 93, 40, 95, 39, 102], [104, 103, 97, 38, 42, 43, 13, 49, 110, 47, 53, 50, 121, 56, 126, 59, 45, 87, 123, 124, 61, 6, 122, 48, 31, 117, 120, 29, 37, 78, 81, 90, 127, 28, 118, 119, 67, 51, 12, 68, 16, 55, 105, 70, 113, 58, 52, 82, 10, 35, 19, 72, 125, 8, 69, 24, 71, 66, 79, 75, 1, 23, 115, 112, 57, 54, 11, 62, 91, 73, 116, 18, 7, 85, 60, 25, 80, 44, 101, 20, 98, 30, 114, 41, 84, 17, 15, 76, 108, 88, 32, 63, 89, 34, 77, 46, 96, 27, 4, 64, 74, 14, 36, 100, 22, 21, 5, 107, 94, 9, 86, 92, 99, 2, 3, 65, 106, 83, 111, 26, 93, 109, 33, 0, 95, 102, 40, 39], [120, 49, 97, 124, 57, 103, 61, 104, 59, 87, 38, 18, 105, 121, 42, 122, 43, 54, 53, 50, 23, 31, 47, 35, 45, 48, 110, 101, 92, 126, 82, 84, 123, 117, 113, 56, 118, 115, 127, 51, 62, 41, 55, 109, 90, 12, 27, 89, 28, 114, 94, 91, 108, 81, 19, 26, 102, 44, 58, 16, 77, 78, 60, 88, 37, 15, 116, 95, 80, 119, 100, 107, 75, 6, 10, 29, 52, 20, 36, 111, 72, 46, 13, 73, 79, 74, 25, 22, 85, 33, 63, 106, 9, 86, 68, 30, 32, 11, 17, 98, 4, 93, 67, 71, 7, 125, 66, 1, 24, 76, 21, 96, 112, 3, 99, 69, 83, 34, 14, 8, 64, 70, 5, 39, 65, 2, 40, 0], [107, 42, 41, 40, 103, 33, 46, 118, 95, 24, 126, 109, 116, 125, 18, 14, 82, 115, 114, 63, 53, 56, 93, 113, 119, 111, 112, 58, 84, 54, 108, 117, 85, 59, 55, 49, 83, 60, 121, 20, 79, 80, 61, 87, 13, 27, 75, 76, 25, 50, 21, 99, 110, 48, 51, 124, 90, 92, 26, 7, 38, 89, 86, 77, 71, 94, 123, 8, 81, 74, 29, 23, 72, 120, 10, 15, 127, 22, 37, 32, 91, 96, 98, 73, 30, 52, 16, 34, 12, 122, 28, 11, 106, 47, 19, 31, 17, 35, 45, 102, 78, 97, 43, 9, 101, 88, 70, 36, 69, 67, 44, 66, 57, 3, 4, 65, 100, 62, 68, 104, 5, 105, 39, 0, 6, 1, 2, 64], [101, 98, 120, 108, 112, 127, 114, 47, 113, 34, 61, 49, 115, 62, 59, 122, 32, 111, 30, 118, 58, 36, 121, 119, 46, 48, 60, 123, 42, 50, 55, 53, 63, 94, 124, 41, 40, 125, 116, 37, 35, 126, 107, 56, 109, 44, 117, 54, 100, 88, 103, 38, 89, 22, 91, 45, 21, 68, 84, 102, 33, 57, 39, 85, 51, 25, 1, 93, 92, 6, 11, 64, 4, 52, 77, 69, 67, 8, 0, 71, 86, 99, 12, 24, 83, 95, 96, 97, 10, 81, 78, 80, 28, 9, 18, 110, 16, 7, 26, 2, 66, 104, 13, 79, 72, 31, 15, 82, 20, 3, 5, 19, 14, 65, 23, 73, 27, 70, 75, 76, 90, 43, 87, 29, 74, 17, 106, 105], [40, 42, 107, 41, 33, 112, 95, 46, 103, 116, 93, 20, 115, 114, 18, 121, 109, 27, 122, 15, 118, 78, 60, 16, 19, 72, 58, 54, 92, 25, 126, 125, 53, 17, 81, 26, 23, 119, 108, 12, 111, 13, 62, 83, 22, 56, 10, 49, 117, 48, 77, 55, 124, 11, 68, 120, 79, 14, 34, 87, 71, 113, 50, 29, 86, 59, 21, 74, 3, 57, 75, 127, 67, 5, 76, 80, 8, 104, 70, 89, 66, 84, 6, 61, 36, 82, 91, 9, 51, 98, 73, 38, 37, 90, 106, 85, 97, 69, 99, 28, 24, 52, 105, 1, 39, 31, 110, 63, 32, 2, 45, 4, 123, 94, 101, 88, 96, 43, 7, 47, 44, 30, 102, 65, 100, 35, 0, 64], [107, 41, 42, 103, 40, 109, 33, 85, 112, 108, 46, 95, 126, 54, 24, 66, 49, 68, 111, 117, 113, 38, 4, 114, 62, 122, 119, 125, 56, 22, 27, 1, 93, 61, 7, 118, 32, 36, 53, 18, 0, 17, 121, 10, 71, 6, 23, 8, 94, 78, 67, 2, 120, 98, 115, 116, 127, 59, 25, 81, 92, 60, 11, 21, 110, 37, 9, 15, 55, 80, 30, 57, 69, 77, 48, 64, 75, 12, 100, 3, 20, 19, 88, 63, 82, 123, 13, 16, 73, 34, 52, 5, 96, 124, 87, 70, 51, 101, 26, 35, 45, 99, 14, 79, 74, 43, 65, 58, 102, 29, 50, 83, 76, 86, 72, 84, 47, 28, 90, 91, 106, 44, 89, 97, 31, 105, 39, 104], [52, 55, 102, 57, 117, 111, 56, 95, 109, 54, 51, 101, 114, 63, 97, 53, 50, 127, 59, 104, 48, 122, 24, 86, 61, 98, 103, 90, 100, 112, 88, 26, 93, 33, 115, 41, 80, 110, 29, 31, 42, 116, 96, 125, 108, 16, 113, 85, 120, 27, 124, 126, 121, 60, 46, 49, 44, 22, 123, 74, 99, 37, 119, 30, 39, 17, 62, 81, 118, 107, 47, 58, 38, 43, 32, 40, 94, 10, 25, 20, 28, 21, 36, 45, 23, 91, 35, 34, 105, 84, 19, 92, 12, 76, 87, 106, 14, 89, 11, 18, 82, 15, 69, 78, 7, 13, 3, 75, 77, 72, 9, 8, 83, 5, 67, 79, 73, 6, 71, 4, 66, 2, 70, 68, 65, 0, 64, 1], [103, 104, 42, 41, 0, 33, 90, 44, 114, 109, 111, 63, 56, 59, 122, 102, 29, 61, 86, 127, 53, 101, 125, 121, 54, 51, 3, 95, 66, 16, 52, 10, 4, 112, 107, 48, 46, 83, 65, 50, 120, 84, 77, 49, 7, 76, 15, 69, 55, 71, 19, 115, 74, 12, 57, 25, 28, 78, 17, 72, 6, 85, 14, 91, 5, 9, 100, 123, 118, 80, 1, 126, 64, 124, 47, 21, 70, 13, 18, 2, 22, 75, 117, 23, 81, 68, 82, 11, 106, 87, 73, 113, 88, 67, 94, 96, 79, 105, 116, 108, 35, 110, 8, 20, 34, 39, 40, 98, 26, 24, 38, 30, 60, 36, 62, 99, 58, 89, 27, 37, 43, 45, 119, 32, 93, 92, 31, 97], [46, 44, 33, 52, 24, 111, 104, 103, 109, 124, 41, 102, 123, 57, 125, 107, 120, 108, 51, 114, 122, 61, 127, 59, 54, 90, 56, 85, 48, 29, 42, 53, 63, 95, 86, 55, 116, 76, 101, 50, 27, 80, 12, 22, 58, 60, 37, 110, 113, 26, 88, 119, 47, 21, 97, 81, 74, 93, 126, 96, 112, 34, 19, 117, 83, 7, 23, 3, 20, 15, 25, 10, 49, 77, 18, 43, 31, 89, 4, 84, 100, 0, 115, 17, 69, 98, 99, 78, 121, 87, 16, 9, 38, 28, 66, 14, 75, 92, 30, 11, 32, 91, 6, 62, 94, 105, 65, 118, 45, 106, 13, 8, 36, 35, 70, 79, 72, 67, 82, 71, 2, 68, 64, 73, 5, 1, 40, 39], [103, 57, 44, 101, 104, 41, 42, 33, 52, 109, 90, 55, 114, 46, 12, 111, 29, 54, 63, 56, 59, 127, 122, 61, 53, 51, 125, 95, 85, 72, 112, 81, 76, 48, 17, 3, 50, 11, 4, 100, 7, 121, 73, 115, 15, 120, 47, 69, 82, 19, 66, 80, 65, 23, 86, 78, 28, 67, 123, 102, 107, 0, 20, 22, 13, 16, 84, 88, 70, 10, 91, 30, 9, 126, 94, 5, 18, 14, 71, 83, 110, 25, 34, 58, 21, 32, 49, 6, 89, 2, 26, 8, 124, 77, 93, 64, 68, 75, 62, 45, 79, 117, 113, 87, 96, 43, 92, 24, 60, 98, 38, 106, 99, 27, 105, 1, 74, 36, 119, 108, 118, 31, 116, 35, 37, 97, 40, 39], [104, 101, 105, 45, 110, 97, 44, 106, 43, 64, 114, 95, 89, 113, 56, 116, 57, 93, 48, 119, 47, 58, 50, 68, 1, 15, 6, 70, 9, 84, 2, 76, 81, 16, 13, 74, 3, 14, 69, 11, 80, 20, 55, 0, 65, 60, 28, 72, 78, 7, 63, 17, 115, 71, 36, 46, 66, 8, 19, 121, 5, 109, 52, 73, 4, 118, 25, 40, 75, 34, 108, 41, 126, 82, 112, 59, 107, 111, 122, 61, 51, 42, 124, 123, 35, 125, 99, 62, 98, 67, 79, 88, 54, 85, 117, 49, 103, 27, 102, 38, 53, 32, 96, 91, 100, 90, 10, 21, 24, 12, 30, 22, 26, 127, 29, 86, 87, 83, 120, 77, 94, 23, 39, 18, 37, 92, 33, 31], [114, 104, 101, 97, 105, 45, 69, 44, 89, 110, 106, 43, 63, 95, 72, 75, 3, 93, 113, 56, 4, 57, 14, 55, 76, 119, 116, 58, 50, 48, 84, 47, 74, 20, 70, 67, 1, 9, 81, 80, 13, 60, 28, 118, 123, 16, 115, 18, 73, 64, 17, 11, 15, 22, 126, 78, 12, 7, 121, 46, 68, 77, 112, 62, 109, 65, 85, 61, 2, 83, 24, 23, 52, 127, 125, 108, 122, 6, 40, 51, 41, 79, 107, 10, 54, 26, 96, 42, 66, 21, 49, 19, 86, 25, 103, 38, 120, 37, 36, 124, 111, 90, 8, 102, 35, 87, 92, 27, 88, 91, 5, 82, 34, 100, 39, 117, 98, 94, 0, 71, 53, 32, 59, 29, 30, 99, 31, 33], [104, 64, 45, 110, 101, 105, 44, 106, 97, 43, 89, 113, 56, 57, 1, 119, 116, 0, 48, 58, 50, 47, 95, 93, 114, 68, 3, 84, 70, 123, 13, 2, 5, 63, 76, 28, 55, 60, 127, 59, 118, 14, 74, 122, 9, 81, 80, 15, 20, 46, 66, 8, 51, 16, 126, 36, 109, 112, 26, 40, 52, 4, 67, 41, 75, 108, 12, 69, 7, 11, 65, 107, 42, 77, 17, 53, 18, 78, 117, 61, 88, 6, 39, 35, 115, 72, 98, 111, 124, 32, 82, 71, 121, 91, 99, 86, 22, 85, 38, 62, 34, 10, 125, 73, 90, 49, 120, 19, 87, 30, 25, 21, 102, 83, 96, 27, 23, 100, 94, 24, 79, 92, 54, 103, 29, 31, 37, 33], [55, 63, 104, 97, 101, 72, 105, 45, 44, 110, 106, 43, 114, 52, 89, 93, 95, 77, 113, 58, 57, 56, 119, 116, 78, 15, 48, 60, 47, 83, 13, 62, 70, 50, 84, 115, 126, 80, 81, 118, 75, 8, 90, 17, 79, 38, 28, 91, 73, 20, 18, 12, 121, 112, 22, 125, 6, 69, 21, 16, 25, 86, 85, 46, 19, 37, 124, 29, 23, 9, 109, 76, 74, 102, 14, 61, 3, 10, 4, 59, 127, 111, 123, 108, 68, 71, 53, 92, 94, 36, 120, 24, 103, 35, 82, 54, 99, 27, 107, 51, 100, 11, 122, 117, 5, 88, 26, 41, 34, 96, 42, 87, 40, 32, 33, 67, 31, 30, 1, 49, 39, 98, 2, 7, 64, 65, 66, 0], [104, 0, 45, 106, 43, 103, 112, 1, 97, 102, 127, 2, 64, 116, 60, 93, 117, 67, 113, 4, 88, 115, 65, 50, 70, 71, 66, 57, 77, 78, 49, 63, 59, 69, 16, 76, 52, 62, 20, 48, 15, 73, 122, 74, 109, 22, 51, 58, 114, 123, 11, 81, 82, 8, 3, 124, 125, 107, 53, 87, 120, 17, 56, 83, 42, 19, 61, 84, 118, 18, 7, 24, 55, 40, 72, 39, 121, 86, 119, 6, 14, 54, 13, 12, 75, 10, 5, 38, 85, 9, 26, 21, 79, 92, 80, 89, 126, 111, 110, 44, 23, 68, 47, 32, 27, 25, 105, 98, 95, 108, 90, 28, 41, 91, 30, 100, 29, 37, 36, 96, 31, 34, 46, 94, 101, 33, 99, 35], [104, 103, 106, 45, 43, 97, 112, 102, 5, 0, 116, 3, 93, 68, 69, 127, 88, 20, 66, 4, 67, 60, 114, 117, 11, 71, 57, 70, 113, 49, 73, 115, 83, 2, 76, 122, 16, 82, 77, 78, 63, 51, 59, 48, 124, 8, 6, 22, 1, 15, 74, 81, 50, 65, 19, 58, 62, 52, 118, 123, 84, 85, 56, 120, 87, 75, 125, 61, 121, 109, 53, 55, 7, 64, 24, 17, 86, 54, 79, 107, 110, 80, 18, 72, 13, 42, 119, 10, 12, 14, 23, 47, 9, 111, 92, 108, 126, 29, 40, 27, 89, 44, 38, 21, 90, 46, 39, 28, 95, 25, 33, 26, 91, 31, 36, 41, 100, 30, 105, 94, 99, 37, 32, 96, 34, 35, 98, 101], [103, 114, 104, 116, 122, 59, 51, 97, 106, 48, 49, 43, 20, 124, 58, 18, 62, 88, 45, 57, 118, 63, 19, 55, 22, 102, 86, 108, 92, 123, 61, 79, 52, 112, 47, 125, 14, 121, 80, 126, 110, 120, 29, 33, 113, 17, 56, 75, 127, 84, 53, 82, 117, 24, 13, 115, 12, 50, 90, 46, 23, 93, 81, 26, 60, 119, 83, 91, 44, 10, 87, 27, 72, 54, 9, 32, 89, 15, 41, 111, 95, 94, 25, 28, 30, 36, 39, 38, 98, 77, 31, 85, 7, 37, 100, 6, 16, 99, 96, 105, 101, 5, 109, 35, 76, 78, 74, 21, 34, 73, 40, 11, 68, 8, 3, 71, 70, 66, 69, 4, 0, 1, 65, 67, 107, 2, 42, 64], [114, 118, 51, 103, 104, 49, 55, 53, 110, 125, 20, 123, 106, 122, 120, 121, 102, 56, 61, 43, 63, 124, 108, 126, 111, 88, 45, 19, 62, 119, 47, 54, 17, 58, 97, 18, 92, 22, 112, 79, 14, 80, 57, 41, 36, 99, 101, 100, 86, 116, 35, 90, 30, 98, 37, 34, 75, 59, 31, 44, 95, 32, 105, 96, 12, 13, 117, 127, 89, 91, 10, 50, 81, 33, 60, 24, 93, 72, 83, 94, 38, 9, 115, 46, 27, 52, 26, 7, 82, 113, 23, 84, 6, 87, 48, 29, 15, 5, 85, 109, 21, 28, 68, 25, 16, 11, 107, 3, 78, 39, 8, 77, 69, 66, 76, 42, 70, 65, 71, 74, 73, 4, 67, 40, 2, 0, 1, 64], [104, 102, 97, 44, 4, 103, 24, 107, 106, 29, 50, 126, 120, 55, 21, 62, 116, 59, 125, 95, 48, 3, 71, 121, 78, 67, 119, 45, 76, 0, 112, 82, 15, 65, 91, 123, 8, 92, 9, 49, 13, 1, 66, 10, 11, 68, 52, 122, 53, 80, 117, 51, 18, 110, 83, 6, 7, 58, 73, 77, 127, 75, 57, 113, 47, 101, 85, 108, 54, 81, 5, 20, 70, 16, 12, 111, 19, 43, 115, 74, 37, 32, 2, 69, 26, 42, 30, 56, 124, 64, 114, 46, 61, 72, 88, 96, 17, 118, 99, 22, 87, 89, 14, 98, 34, 63, 40, 41, 84, 27, 25, 94, 109, 79, 100, 60, 86, 23, 105, 36, 90, 35, 28, 93, 39, 33, 31, 38], [97, 102, 104, 44, 103, 105, 107, 106, 29, 24, 53, 50, 48, 126, 120, 116, 55, 119, 95, 57, 21, 121, 117, 125, 45, 82, 16, 123, 61, 78, 63, 15, 51, 62, 111, 11, 127, 92, 52, 13, 56, 9, 115, 5, 91, 72, 68, 12, 7, 59, 3, 2, 83, 6, 76, 118, 23, 108, 0, 49, 18, 54, 65, 30, 60, 75, 17, 80, 79, 89, 90, 77, 43, 98, 10, 32, 112, 74, 42, 19, 86, 81, 70, 14, 34, 84, 26, 25, 94, 35, 100, 27, 22, 85, 96, 36, 66, 124, 99, 37, 40, 28, 87, 20, 4, 8, 122, 113, 110, 109, 101, 47, 58, 71, 39, 31, 69, 67, 33, 73, 46, 64, 114, 41, 88, 38, 1, 93], [104, 102, 97, 44, 103, 29, 107, 106, 24, 50, 21, 126, 55, 120, 68, 82, 53, 52, 95, 11, 117, 78, 5, 13, 0, 125, 9, 59, 12, 62, 2, 6, 48, 92, 65, 15, 7, 45, 58, 51, 72, 3, 70, 116, 105, 18, 14, 123, 91, 46, 8, 121, 75, 57, 119, 109, 113, 122, 66, 76, 54, 26, 49, 110, 85, 88, 73, 17, 67, 71, 112, 30, 108, 114, 47, 60, 74, 80, 63, 23, 101, 1, 64, 79, 61, 43, 10, 22, 77, 41, 4, 42, 19, 111, 56, 37, 35, 124, 16, 90, 118, 84, 87, 34, 20, 69, 127, 100, 96, 115, 89, 36, 40, 32, 33, 94, 98, 83, 99, 25, 86, 81, 27, 28, 93, 38, 31, 39], [103, 102, 104, 105, 48, 97, 59, 44, 106, 107, 24, 116, 29, 117, 78, 21, 11, 120, 50, 126, 52, 16, 62, 55, 82, 53, 57, 74, 13, 51, 9, 18, 123, 95, 110, 119, 121, 75, 2, 15, 54, 85, 68, 125, 19, 122, 7, 88, 6, 70, 14, 112, 72, 92, 8, 80, 3, 45, 113, 30, 109, 66, 0, 17, 26, 76, 71, 65, 49, 58, 5, 1, 115, 114, 91, 47, 60, 22, 4, 77, 108, 67, 64, 10, 111, 32, 63, 124, 12, 87, 25, 43, 93, 56, 84, 86, 79, 83, 96, 42, 23, 41, 118, 94, 20, 61, 27, 69, 100, 127, 73, 46, 101, 98, 37, 40, 34, 81, 89, 90, 38, 36, 35, 99, 39, 28, 31, 33], [114, 103, 98, 44, 43, 49, 57, 48, 97, 105, 122, 95, 40, 14, 127, 63, 121, 107, 124, 83, 19, 24, 21, 120, 42, 78, 119, 94, 54, 32, 90, 58, 117, 110, 111, 74, 51, 92, 53, 84, 25, 26, 62, 29, 36, 18, 86, 109, 38, 115, 50, 46, 56, 112, 116, 15, 59, 123, 113, 85, 81, 33, 87, 99, 125, 88, 10, 17, 22, 31, 60, 100, 104, 118, 126, 82, 70, 93, 91, 52, 61, 28, 37, 55, 101, 108, 30, 35, 79, 47, 45, 6, 8, 102, 23, 16, 39, 13, 76, 11, 89, 77, 96, 12, 20, 34, 80, 27, 7, 9, 72, 4, 106, 75, 3, 68, 66, 73, 2, 41, 71, 65, 5, 67, 1, 69, 64, 0], [44, 63, 98, 62, 60, 111, 51, 49, 103, 124, 43, 59, 48, 127, 42, 97, 86, 17, 105, 22, 24, 120, 95, 55, 114, 92, 29, 50, 40, 99, 113, 110, 45, 90, 102, 25, 18, 57, 37, 36, 54, 77, 61, 47, 85, 28, 88, 94, 33, 74, 21, 87, 84, 101, 93, 100, 31, 26, 27, 108, 123, 125, 14, 91, 118, 35, 116, 115, 38, 117, 10, 52, 112, 121, 20, 56, 15, 109, 30, 81, 89, 13, 82, 78, 122, 104, 70, 83, 39, 126, 119, 96, 58, 32, 8, 19, 53, 71, 66, 7, 16, 4, 79, 34, 23, 107, 2, 46, 72, 67, 11, 68, 6, 41, 106, 80, 12, 0, 75, 1, 69, 64, 3, 73, 65, 9, 5, 76], [103, 105, 97, 42, 44, 111, 40, 121, 89, 95, 9, 119, 22, 58, 49, 57, 4, 48, 120, 124, 94, 62, 122, 65, 50, 60, 67, 11, 109, 51, 13, 59, 56, 92, 43, 99, 18, 79, 53, 127, 55, 16, 0, 70, 84, 63, 54, 5, 1, 73, 36, 81, 24, 26, 114, 46, 20, 71, 6, 98, 2, 82, 90, 112, 23, 66, 80, 86, 3, 77, 74, 85, 113, 100, 126, 12, 72, 76, 10, 7, 32, 38, 21, 115, 91, 78, 17, 118, 83, 125, 123, 27, 19, 29, 69, 68, 25, 47, 45, 102, 88, 64, 75, 8, 61, 14, 93, 116, 28, 52, 110, 34, 31, 37, 117, 87, 30, 15, 35, 101, 33, 96, 41, 107, 39, 108, 104, 106], [40, 109, 43, 44, 49, 97, 42, 103, 48, 53, 124, 105, 37, 118, 98, 120, 127, 62, 57, 56, 36, 95, 29, 51, 60, 59, 74, 92, 114, 22, 90, 84, 121, 55, 78, 81, 63, 10, 89, 54, 122, 16, 77, 58, 108, 24, 15, 115, 70, 38, 126, 119, 85, 86, 111, 125, 13, 47, 61, 82, 21, 25, 123, 8, 101, 93, 94, 83, 87, 45, 11, 102, 76, 19, 79, 18, 99, 20, 28, 88, 46, 116, 68, 80, 117, 32, 52, 100, 106, 9, 14, 17, 23, 67, 1, 4, 75, 96, 110, 26, 41, 2, 112, 33, 91, 113, 27, 30, 35, 50, 71, 107, 34, 72, 6, 12, 0, 3, 7, 64, 73, 69, 66, 39, 5, 31, 104, 65]], "model.layers.1.self_attn.k_proj": [[0, 39, 40, 43, 110, 4, 97, 11, 106, 41, 65, 95, 78, 84, 79, 62, 117, 74, 123, 9, 66, 50, 72, 122, 93, 5, 57, 87, 91, 73, 26, 47, 13, 69, 28, 68, 86, 16, 70, 7, 2, 59, 77, 81, 53, 112, 56, 67, 127, 17, 21, 46, 48, 10, 82, 121, 3, 64, 116, 76, 55, 63, 118, 124, 115, 58, 6, 114, 30, 89, 71, 113, 18, 51, 111, 14, 8, 108, 96, 54, 12, 120, 85, 60, 61, 109, 126, 75, 119, 15, 80, 125, 23, 1, 88, 52, 29, 49, 103, 38, 83, 94, 19, 34, 32, 45, 99, 33, 90, 36, 104, 27, 101, 22, 20, 24, 92, 98, 44, 105, 35, 100, 31, 107, 102, 37, 25, 42], [64, 40, 106, 107, 39, 46, 69, 111, 1, 33, 10, 66, 93, 7, 65, 2, 5, 113, 109, 95, 67, 75, 72, 117, 126, 12, 3, 123, 16, 112, 114, 81, 9, 61, 50, 4, 78, 53, 6, 102, 60, 124, 85, 15, 23, 47, 116, 21, 26, 125, 0, 92, 63, 27, 74, 71, 110, 108, 86, 22, 58, 68, 62, 11, 56, 70, 121, 83, 49, 80, 24, 19, 82, 94, 118, 55, 77, 17, 8, 98, 127, 57, 59, 52, 34, 51, 73, 25, 122, 119, 96, 14, 32, 99, 101, 87, 18, 37, 36, 79, 88, 76, 115, 84, 41, 20, 48, 91, 44, 35, 54, 89, 100, 28, 30, 97, 43, 105, 90, 120, 29, 45, 13, 42, 38, 31, 104, 103], [39, 97, 43, 105, 104, 106, 110, 31, 90, 29, 87, 45, 48, 54, 117, 83, 91, 81, 12, 15, 44, 78, 51, 28, 77, 10, 20, 0, 11, 89, 6, 122, 69, 7, 50, 2, 16, 126, 17, 79, 56, 125, 86, 8, 18, 52, 116, 65, 84, 60, 62, 1, 121, 13, 3, 55, 9, 73, 57, 19, 124, 53, 4, 119, 35, 58, 115, 64, 47, 75, 118, 59, 67, 49, 74, 5, 102, 114, 21, 61, 127, 24, 112, 103, 120, 80, 111, 34, 113, 46, 98, 23, 72, 96, 40, 101, 41, 68, 71, 93, 27, 32, 70, 107, 100, 30, 99, 42, 85, 36, 63, 82, 66, 37, 123, 88, 38, 94, 76, 109, 22, 108, 14, 26, 92, 33, 25, 95], [45, 0, 106, 105, 40, 39, 1, 65, 97, 108, 50, 47, 92, 31, 9, 61, 15, 127, 77, 63, 7, 122, 3, 6, 19, 110, 53, 59, 48, 4, 51, 118, 2, 120, 54, 68, 56, 23, 64, 49, 66, 43, 27, 114, 35, 87, 93, 126, 69, 70, 5, 36, 89, 79, 11, 71, 82, 8, 20, 119, 94, 75, 18, 121, 98, 62, 125, 76, 81, 37, 12, 78, 38, 32, 83, 22, 74, 58, 14, 111, 34, 13, 67, 115, 57, 84, 60, 109, 112, 99, 26, 21, 30, 123, 17, 116, 46, 73, 96, 113, 85, 107, 103, 80, 117, 124, 25, 33, 10, 24, 86, 91, 29, 104, 42, 88, 41, 72, 55, 100, 90, 44, 102, 101, 28, 52, 16, 95], [64, 40, 109, 65, 46, 108, 41, 107, 42, 66, 50, 0, 37, 3, 119, 57, 56, 33, 113, 116, 67, 48, 1, 74, 5, 58, 31, 68, 6, 76, 4, 14, 9, 13, 25, 47, 29, 71, 49, 20, 111, 11, 81, 112, 8, 7, 92, 15, 16, 2, 80, 61, 51, 94, 110, 85, 10, 60, 24, 84, 21, 123, 70, 59, 118, 126, 63, 100, 96, 54, 82, 83, 23, 102, 30, 79, 45, 39, 121, 99, 86, 19, 104, 73, 98, 27, 77, 12, 103, 32, 75, 91, 88, 87, 117, 90, 69, 38, 105, 120, 22, 44, 35, 18, 34, 106, 125, 17, 62, 78, 53, 43, 115, 122, 127, 52, 114, 124, 26, 55, 72, 89, 36, 28, 93, 101, 97, 95], [40, 0, 109, 42, 107, 39, 33, 48, 65, 64, 66, 113, 50, 52, 3, 115, 29, 117, 127, 60, 71, 70, 4, 16, 24, 73, 69, 67, 77, 68, 116, 57, 74, 76, 8, 11, 58, 82, 28, 83, 78, 63, 59, 62, 15, 86, 2, 124, 112, 84, 23, 54, 121, 122, 46, 81, 119, 125, 123, 120, 61, 26, 44, 5, 6, 1, 7, 41, 101, 37, 100, 111, 94, 99, 56, 27, 36, 35, 38, 96, 31, 98, 32, 105, 34, 55, 126, 118, 47, 87, 25, 9, 95, 45, 108, 110, 51, 114, 49, 91, 53, 30, 20, 88, 85, 90, 21, 72, 10, 19, 89, 17, 102, 14, 43, 12, 92, 106, 18, 79, 104, 22, 13, 75, 103, 93, 80, 97], [0, 108, 40, 43, 42, 39, 65, 55, 120, 126, 114, 3, 33, 50, 72, 117, 6, 2, 52, 7, 9, 41, 31, 109, 68, 5, 48, 70, 15, 125, 49, 13, 121, 62, 18, 12, 112, 78, 74, 115, 46, 85, 11, 110, 76, 69, 27, 93, 59, 116, 123, 28, 67, 38, 53, 88, 1, 4, 16, 82, 17, 20, 118, 84, 86, 90, 80, 63, 61, 60, 19, 47, 37, 83, 111, 96, 81, 57, 35, 87, 54, 21, 99, 75, 89, 36, 22, 51, 10, 73, 77, 98, 100, 25, 24, 66, 94, 23, 79, 30, 113, 34, 32, 14, 127, 124, 8, 119, 26, 71, 58, 101, 122, 44, 102, 45, 29, 64, 56, 97, 92, 105, 95, 91, 107, 106, 104, 103], [39, 41, 106, 33, 108, 104, 31, 0, 69, 34, 57, 113, 120, 25, 16, 112, 30, 47, 45, 1, 67, 84, 119, 46, 87, 2, 122, 124, 107, 5, 8, 51, 11, 121, 58, 71, 62, 68, 53, 9, 56, 35, 59, 15, 18, 60, 90, 92, 13, 125, 96, 86, 70, 76, 12, 7, 61, 24, 36, 81, 49, 4, 27, 126, 55, 123, 50, 66, 63, 127, 73, 101, 52, 75, 117, 77, 89, 54, 38, 85, 21, 110, 48, 102, 100, 28, 29, 72, 93, 22, 116, 118, 80, 97, 115, 3, 82, 43, 64, 95, 74, 23, 65, 19, 111, 20, 88, 17, 91, 98, 26, 37, 83, 99, 109, 32, 114, 79, 103, 6, 14, 44, 10, 94, 78, 40, 42, 105]], "model.layers.1.self_attn.qk_proj": [[50, 48, 0, 57, 64, 117, 56, 116, 59, 126, 113, 120, 122, 62, 123, 53, 55, 49, 65, 127, 46, 110, 1, 63, 97, 61, 124, 47, 58, 109, 67, 119, 121, 29, 3, 112, 108, 54, 51, 40, 60, 107, 4, 45, 125, 114, 43, 104, 70, 68, 2, 76, 33, 5, 66, 16, 42, 106, 95, 7, 77, 111, 93, 14, 6, 52, 41, 9, 13, 20, 11, 115, 84, 12, 71, 82, 75, 74, 81, 44, 73, 79, 69, 103, 78, 39, 10, 17, 15, 18, 72, 80, 87, 105, 24, 118, 31, 23, 8, 22, 83, 19, 90, 86, 85, 92, 89, 21, 25, 88, 26, 91, 102, 28, 36, 38, 27, 37, 98, 94, 101, 35, 100, 96, 34, 99, 32, 30], [50, 48, 0, 64, 57, 117, 56, 120, 59, 126, 116, 62, 113, 122, 46, 53, 55, 123, 127, 63, 65, 1, 110, 58, 49, 97, 61, 109, 47, 108, 124, 29, 40, 112, 121, 51, 68, 119, 114, 60, 43, 107, 54, 4, 45, 125, 66, 104, 2, 42, 67, 6, 70, 52, 106, 93, 33, 7, 95, 69, 3, 115, 77, 14, 20, 12, 41, 5, 76, 13, 111, 9, 71, 16, 44, 103, 11, 39, 84, 82, 17, 73, 105, 10, 18, 118, 78, 74, 79, 72, 75, 31, 80, 15, 81, 24, 87, 23, 8, 89, 88, 22, 83, 85, 102, 26, 19, 25, 90, 92, 21, 28, 86, 27, 98, 91, 38, 101, 37, 35, 30, 94, 36, 32, 100, 34, 99, 96], [50, 48, 57, 64, 0, 117, 56, 116, 120, 59, 126, 113, 62, 122, 53, 46, 123, 63, 127, 65, 110, 55, 58, 108, 97, 1, 47, 109, 51, 49, 29, 112, 121, 61, 124, 119, 67, 40, 3, 54, 114, 107, 60, 43, 6, 45, 4, 104, 125, 70, 2, 42, 66, 52, 106, 95, 68, 13, 71, 93, 115, 33, 5, 14, 76, 84, 111, 69, 11, 41, 16, 77, 12, 20, 9, 7, 73, 75, 15, 10, 39, 79, 78, 82, 18, 17, 103, 44, 81, 74, 72, 80, 118, 105, 31, 24, 87, 8, 23, 83, 22, 19, 89, 86, 88, 21, 85, 28, 25, 92, 102, 90, 26, 91, 98, 27, 37, 101, 38, 30, 94, 32, 35, 100, 36, 99, 96, 34], [50, 48, 0, 64, 57, 117, 56, 116, 59, 120, 126, 113, 122, 62, 123, 46, 53, 65, 127, 55, 58, 110, 63, 124, 49, 1, 97, 3, 61, 47, 108, 121, 109, 51, 112, 29, 114, 67, 54, 40, 119, 60, 6, 45, 2, 107, 104, 43, 125, 66, 52, 42, 5, 95, 106, 68, 115, 70, 111, 71, 93, 69, 20, 4, 76, 33, 41, 11, 12, 13, 77, 9, 14, 7, 103, 16, 78, 10, 84, 44, 79, 81, 80, 39, 82, 18, 73, 17, 31, 74, 75, 105, 72, 24, 118, 15, 23, 22, 8, 87, 88, 83, 86, 21, 89, 19, 92, 27, 90, 25, 85, 102, 28, 26, 91, 37, 38, 101, 98, 100, 94, 34, 35, 36, 30, 96, 32, 99], [50, 48, 64, 0, 57, 117, 56, 120, 116, 126, 62, 59, 113, 122, 46, 123, 63, 53, 58, 110, 127, 55, 108, 49, 1, 109, 61, 124, 47, 40, 65, 29, 97, 51, 54, 107, 112, 43, 121, 45, 68, 2, 66, 119, 4, 6, 104, 60, 114, 106, 42, 52, 125, 67, 3, 5, 70, 93, 115, 41, 111, 33, 71, 95, 77, 13, 103, 16, 69, 7, 44, 39, 20, 80, 12, 75, 84, 9, 14, 74, 17, 11, 76, 78, 73, 10, 105, 81, 79, 15, 72, 82, 31, 18, 8, 118, 87, 23, 24, 22, 83, 19, 86, 102, 89, 25, 88, 27, 21, 92, 28, 90, 91, 85, 26, 38, 101, 98, 37, 94, 36, 34, 100, 99, 32, 30, 35, 96], [50, 48, 57, 0, 64, 117, 116, 56, 126, 59, 120, 122, 113, 62, 46, 123, 55, 53, 110, 58, 109, 63, 49, 127, 97, 124, 29, 1, 61, 40, 108, 47, 65, 3, 43, 112, 67, 114, 51, 107, 54, 119, 121, 4, 2, 125, 104, 68, 45, 60, 6, 42, 52, 106, 66, 33, 93, 20, 41, 95, 70, 77, 13, 80, 111, 12, 115, 11, 69, 71, 10, 5, 84, 7, 16, 39, 74, 103, 9, 82, 14, 17, 76, 73, 79, 44, 78, 15, 75, 72, 81, 8, 105, 31, 118, 23, 18, 87, 24, 88, 22, 102, 83, 85, 21, 19, 89, 92, 26, 25, 86, 90, 91, 27, 28, 38, 101, 37, 36, 98, 100, 35, 34, 94, 99, 96, 30, 32], [50, 48, 57, 117, 0, 64, 120, 56, 59, 116, 122, 126, 62, 113, 123, 55, 46, 53, 110, 127, 29, 58, 108, 63, 49, 97, 1, 124, 47, 61, 109, 65, 51, 112, 114, 40, 121, 67, 119, 54, 3, 43, 107, 125, 52, 60, 45, 104, 70, 2, 93, 6, 42, 66, 115, 106, 33, 13, 20, 16, 77, 80, 95, 69, 71, 12, 68, 11, 84, 111, 4, 75, 78, 41, 103, 79, 9, 15, 7, 76, 82, 18, 14, 81, 8, 5, 73, 39, 10, 74, 44, 17, 72, 105, 118, 23, 87, 31, 24, 85, 22, 19, 26, 92, 83, 25, 89, 86, 21, 90, 88, 102, 27, 28, 91, 37, 98, 36, 101, 94, 38, 35, 34, 30, 99, 96, 100, 32], [50, 48, 57, 0, 117, 64, 56, 116, 122, 59, 120, 113, 62, 126, 123, 53, 55, 46, 127, 110, 63, 97, 1, 49, 108, 65, 58, 112, 29, 109, 54, 61, 119, 47, 124, 114, 40, 121, 51, 125, 60, 107, 45, 43, 70, 104, 4, 52, 77, 106, 42, 95, 115, 33, 93, 6, 68, 66, 71, 13, 11, 2, 3, 15, 20, 14, 12, 75, 80, 67, 84, 41, 7, 16, 9, 78, 82, 44, 69, 111, 5, 39, 76, 103, 73, 17, 81, 79, 74, 18, 10, 8, 105, 31, 24, 118, 87, 23, 21, 19, 72, 25, 86, 22, 83, 85, 89, 90, 92, 26, 88, 27, 102, 28, 98, 91, 37, 101, 36, 94, 38, 30, 34, 32, 100, 35, 96, 99], [50, 48, 57, 64, 0, 117, 56, 120, 59, 116, 122, 126, 62, 113, 123, 53, 63, 46, 127, 1, 97, 110, 109, 61, 55, 108, 47, 124, 65, 40, 49, 112, 58, 29, 119, 54, 121, 3, 67, 51, 107, 43, 60, 125, 114, 68, 45, 4, 42, 104, 70, 106, 66, 95, 52, 2, 20, 115, 33, 11, 77, 13, 93, 6, 41, 71, 76, 15, 7, 14, 39, 16, 5, 84, 82, 78, 111, 80, 12, 103, 73, 9, 74, 81, 75, 69, 44, 18, 17, 10, 31, 79, 24, 105, 87, 8, 118, 23, 72, 85, 19, 83, 25, 90, 88, 89, 22, 86, 21, 92, 102, 28, 26, 27, 91, 98, 38, 101, 36, 37, 34, 35, 94, 96, 32, 100, 30, 99], [50, 48, 57, 64, 0, 117, 56, 59, 116, 126, 62, 122, 120, 113, 123, 46, 53, 97, 108, 127, 55, 65, 110, 63, 47, 1, 109, 49, 112, 61, 58, 124, 29, 119, 40, 54, 114, 51, 121, 107, 43, 67, 60, 45, 3, 104, 125, 70, 52, 42, 68, 66, 115, 106, 95, 33, 4, 15, 20, 77, 6, 2, 13, 93, 12, 14, 75, 79, 41, 69, 78, 5, 84, 7, 76, 80, 11, 82, 71, 39, 73, 103, 44, 18, 16, 81, 9, 10, 111, 87, 17, 8, 74, 118, 31, 24, 105, 23, 21, 22, 19, 25, 85, 88, 92, 90, 89, 83, 26, 72, 86, 28, 102, 27, 98, 37, 38, 91, 101, 34, 35, 96, 100, 30, 94, 32, 36, 99], [50, 48, 64, 57, 0, 117, 56, 116, 59, 126, 120, 113, 122, 123, 62, 65, 53, 46, 55, 1, 63, 58, 110, 127, 49, 97, 61, 124, 108, 109, 47, 54, 121, 51, 29, 40, 45, 112, 119, 114, 125, 66, 60, 43, 107, 70, 2, 67, 104, 3, 42, 52, 106, 33, 95, 115, 41, 71, 68, 20, 111, 13, 4, 93, 7, 5, 12, 77, 6, 10, 76, 84, 78, 14, 39, 69, 17, 81, 103, 16, 75, 73, 74, 82, 44, 9, 11, 80, 15, 31, 105, 79, 118, 18, 24, 8, 72, 87, 23, 89, 22, 19, 83, 25, 88, 92, 86, 28, 85, 21, 90, 102, 26, 101, 38, 91, 37, 27, 98, 35, 36, 94, 100, 34, 32, 96, 30, 99], [50, 48, 64, 0, 57, 117, 56, 126, 116, 59, 120, 62, 122, 113, 46, 123, 55, 53, 110, 58, 63, 127, 49, 65, 1, 67, 108, 3, 109, 61, 124, 40, 97, 51, 47, 29, 119, 121, 43, 112, 4, 60, 107, 54, 114, 45, 125, 68, 104, 2, 66, 42, 106, 70, 52, 71, 6, 111, 41, 33, 77, 95, 93, 115, 20, 69, 16, 13, 39, 5, 103, 10, 80, 7, 75, 11, 84, 73, 15, 44, 12, 74, 78, 76, 17, 9, 14, 82, 105, 81, 8, 18, 31, 79, 23, 118, 87, 24, 72, 19, 21, 88, 22, 83, 25, 90, 85, 89, 102, 26, 86, 92, 28, 91, 27, 101, 38, 98, 36, 37, 94, 35, 32, 30, 34, 99, 100, 96], [50, 48, 57, 64, 117, 0, 116, 120, 56, 122, 62, 126, 113, 59, 123, 46, 110, 53, 127, 55, 97, 109, 63, 108, 40, 49, 112, 61, 47, 1, 58, 124, 65, 29, 119, 51, 114, 54, 107, 121, 43, 3, 4, 60, 6, 42, 52, 104, 67, 125, 106, 45, 68, 2, 66, 115, 33, 95, 70, 15, 79, 16, 20, 93, 41, 13, 84, 11, 12, 77, 39, 71, 5, 44, 76, 14, 75, 73, 103, 78, 82, 7, 18, 9, 69, 10, 105, 80, 17, 74, 31, 111, 81, 87, 118, 24, 72, 8, 23, 88, 22, 26, 19, 25, 21, 92, 85, 86, 83, 90, 89, 28, 27, 102, 91, 98, 101, 38, 37, 34, 30, 36, 35, 94, 32, 99, 100, 96], [50, 48, 57, 64, 0, 117, 56, 116, 59, 113, 126, 122, 62, 120, 123, 53, 46, 97, 63, 127, 110, 55, 49, 1, 61, 109, 58, 124, 65, 108, 112, 47, 29, 121, 54, 40, 51, 114, 119, 60, 43, 107, 125, 6, 104, 52, 2, 67, 45, 42, 66, 33, 95, 106, 70, 3, 68, 4, 41, 115, 77, 76, 93, 5, 7, 16, 13, 78, 12, 20, 71, 84, 9, 39, 69, 79, 11, 18, 44, 10, 15, 80, 75, 103, 81, 74, 111, 14, 82, 73, 17, 72, 31, 24, 8, 105, 118, 87, 23, 19, 85, 88, 21, 86, 92, 25, 89, 22, 26, 83, 102, 90, 28, 27, 91, 38, 36, 98, 37, 101, 35, 94, 34, 100, 32, 99, 30, 96], [50, 48, 0, 57, 117, 64, 56, 120, 116, 59, 126, 122, 62, 113, 123, 53, 46, 108, 63, 55, 110, 127, 1, 49, 47, 61, 97, 65, 29, 58, 124, 3, 109, 67, 112, 119, 121, 40, 114, 51, 54, 60, 107, 6, 52, 43, 125, 2, 4, 45, 104, 66, 68, 106, 42, 33, 5, 95, 70, 93, 111, 71, 13, 77, 115, 41, 75, 7, 12, 79, 9, 39, 69, 20, 78, 84, 76, 73, 103, 82, 18, 14, 44, 80, 16, 10, 11, 74, 17, 81, 87, 72, 15, 105, 118, 24, 31, 8, 23, 90, 83, 22, 85, 88, 25, 19, 26, 86, 102, 92, 89, 21, 28, 27, 91, 101, 38, 37, 35, 98, 94, 96, 30, 36, 32, 100, 34, 99], [50, 48, 57, 0, 64, 117, 56, 116, 120, 126, 59, 122, 113, 62, 53, 63, 55, 46, 123, 58, 127, 110, 108, 65, 47, 61, 109, 49, 1, 40, 124, 97, 29, 121, 51, 119, 112, 114, 43, 54, 4, 60, 67, 107, 3, 68, 6, 45, 104, 125, 42, 2, 106, 66, 33, 52, 93, 115, 7, 41, 95, 20, 13, 111, 77, 70, 39, 9, 78, 71, 103, 75, 44, 84, 74, 12, 5, 73, 69, 80, 72, 76, 11, 79, 18, 81, 82, 10, 17, 16, 31, 14, 15, 105, 118, 8, 23, 87, 24, 83, 90, 102, 85, 88, 89, 25, 26, 22, 91, 19, 28, 98, 86, 21, 27, 92, 37, 38, 101, 94, 36, 35, 96, 30, 99, 100, 32, 34], [50, 48, 57, 117, 64, 0, 56, 116, 59, 62, 120, 126, 122, 113, 53, 46, 55, 123, 110, 127, 49, 58, 63, 109, 47, 97, 29, 108, 124, 54, 61, 65, 112, 114, 121, 40, 1, 51, 119, 60, 43, 107, 104, 66, 125, 52, 68, 2, 42, 45, 4, 67, 6, 33, 3, 70, 106, 115, 93, 95, 79, 13, 7, 5, 20, 16, 12, 11, 75, 78, 41, 77, 80, 9, 84, 82, 15, 14, 10, 103, 72, 44, 76, 69, 81, 73, 39, 71, 18, 111, 17, 118, 87, 105, 74, 31, 24, 23, 83, 8, 85, 21, 88, 26, 89, 86, 22, 92, 90, 25, 19, 102, 28, 27, 98, 91, 38, 101, 37, 94, 35, 30, 36, 34, 32, 100, 99, 96], [50, 48, 57, 117, 0, 64, 56, 59, 116, 126, 120, 122, 62, 113, 123, 53, 46, 127, 63, 110, 97, 55, 49, 58, 1, 61, 29, 124, 109, 47, 112, 108, 121, 67, 65, 51, 54, 119, 3, 114, 40, 107, 60, 66, 45, 125, 43, 2, 104, 70, 52, 6, 95, 13, 93, 106, 42, 4, 33, 5, 12, 115, 16, 68, 9, 77, 20, 14, 76, 84, 79, 7, 111, 75, 78, 71, 69, 82, 18, 15, 103, 41, 17, 39, 44, 81, 73, 11, 10, 80, 72, 31, 118, 74, 105, 24, 22, 87, 23, 85, 83, 25, 88, 21, 89, 86, 8, 19, 90, 92, 26, 28, 27, 102, 98, 91, 94, 38, 37, 101, 36, 30, 35, 96, 34, 100, 32, 99], [50, 48, 64, 57, 0, 117, 56, 59, 116, 120, 126, 113, 62, 122, 123, 55, 53, 46, 63, 127, 1, 58, 110, 97, 65, 61, 49, 121, 108, 109, 47, 119, 124, 29, 112, 40, 51, 114, 54, 60, 2, 107, 3, 125, 4, 68, 43, 70, 104, 45, 66, 67, 42, 52, 6, 106, 115, 95, 41, 33, 93, 5, 77, 13, 16, 20, 7, 79, 82, 12, 76, 71, 69, 9, 84, 111, 81, 103, 75, 73, 78, 11, 14, 10, 39, 44, 80, 17, 72, 118, 31, 18, 74, 15, 87, 23, 105, 8, 24, 22, 88, 19, 21, 83, 89, 85, 25, 102, 90, 86, 92, 28, 26, 27, 91, 98, 38, 94, 101, 37, 100, 36, 32, 35, 30, 96, 34, 99], [50, 48, 57, 0, 64, 117, 120, 56, 126, 116, 59, 62, 122, 113, 53, 123, 46, 63, 55, 127, 110, 58, 108, 1, 109, 61, 97, 65, 40, 49, 47, 112, 124, 29, 121, 51, 107, 68, 119, 114, 43, 60, 70, 4, 54, 2, 104, 42, 125, 66, 106, 52, 45, 3, 67, 33, 93, 95, 6, 115, 5, 69, 41, 71, 77, 79, 75, 84, 39, 13, 78, 12, 20, 44, 9, 103, 111, 7, 73, 82, 17, 76, 16, 10, 14, 18, 80, 105, 74, 72, 11, 31, 8, 15, 81, 118, 24, 23, 87, 83, 88, 22, 85, 89, 102, 19, 27, 92, 25, 21, 86, 91, 90, 26, 28, 38, 101, 98, 37, 30, 96, 94, 35, 34, 36, 100, 32, 99], [50, 48, 57, 64, 0, 117, 116, 120, 56, 126, 122, 59, 113, 62, 53, 123, 46, 63, 127, 55, 49, 108, 58, 110, 97, 124, 29, 67, 47, 109, 61, 112, 3, 51, 1, 121, 114, 40, 119, 54, 65, 107, 43, 60, 70, 33, 125, 104, 45, 2, 42, 106, 52, 95, 93, 66, 68, 71, 4, 13, 84, 77, 69, 6, 20, 111, 115, 17, 75, 76, 82, 41, 9, 80, 39, 7, 16, 103, 79, 5, 78, 74, 14, 44, 11, 12, 73, 18, 10, 15, 31, 81, 105, 87, 24, 118, 23, 72, 83, 8, 22, 88, 86, 19, 89, 90, 28, 25, 85, 21, 102, 92, 26, 91, 98, 27, 37, 36, 38, 101, 94, 32, 30, 99, 35, 100, 96, 34], [50, 48, 57, 64, 0, 117, 56, 120, 116, 59, 126, 62, 113, 122, 123, 55, 53, 1, 46, 63, 110, 58, 127, 65, 49, 61, 121, 97, 108, 112, 47, 29, 119, 124, 109, 51, 54, 114, 40, 60, 2, 66, 3, 125, 45, 107, 70, 104, 52, 43, 4, 6, 5, 115, 67, 95, 42, 93, 106, 68, 71, 69, 33, 77, 13, 76, 75, 9, 12, 78, 20, 80, 15, 111, 41, 18, 82, 7, 84, 103, 11, 73, 10, 39, 14, 17, 79, 8, 44, 118, 16, 81, 74, 31, 87, 105, 23, 72, 24, 25, 22, 21, 102, 19, 88, 28, 85, 89, 86, 83, 90, 92, 26, 27, 91, 98, 37, 38, 101, 30, 94, 96, 35, 34, 32, 100, 36, 99], [50, 48, 57, 0, 117, 64, 56, 116, 120, 126, 122, 59, 113, 62, 123, 46, 63, 53, 110, 127, 55, 58, 124, 97, 108, 49, 109, 47, 61, 40, 65, 112, 114, 29, 51, 119, 54, 107, 121, 1, 60, 4, 43, 125, 104, 68, 52, 42, 6, 3, 106, 33, 45, 70, 2, 95, 66, 93, 115, 67, 20, 77, 9, 5, 7, 71, 41, 13, 11, 16, 111, 103, 80, 14, 84, 15, 82, 12, 39, 69, 75, 10, 79, 17, 78, 44, 76, 73, 8, 74, 18, 105, 81, 118, 31, 24, 87, 72, 19, 23, 25, 22, 21, 27, 88, 26, 28, 83, 91, 86, 85, 92, 89, 90, 102, 38, 37, 98, 101, 30, 94, 34, 35, 96, 32, 36, 100, 99], [50, 48, 57, 0, 64, 117, 56, 120, 126, 116, 59, 122, 113, 62, 123, 46, 53, 110, 63, 55, 58, 49, 109, 127, 124, 97, 65, 112, 3, 108, 61, 47, 40, 1, 29, 67, 114, 43, 119, 121, 54, 51, 107, 60, 125, 104, 66, 42, 6, 68, 2, 33, 106, 4, 45, 52, 93, 95, 70, 13, 69, 41, 115, 12, 71, 17, 39, 44, 20, 79, 84, 9, 77, 16, 103, 76, 14, 8, 82, 74, 7, 5, 111, 80, 75, 78, 73, 11, 18, 15, 105, 10, 118, 31, 81, 87, 24, 23, 72, 88, 22, 19, 92, 26, 102, 89, 25, 28, 83, 21, 85, 86, 90, 27, 91, 38, 98, 101, 37, 36, 35, 94, 30, 100, 96, 99, 34, 32], [50, 48, 57, 64, 117, 0, 116, 56, 120, 126, 59, 122, 113, 62, 123, 55, 53, 110, 46, 63, 108, 58, 127, 1, 29, 109, 97, 49, 124, 114, 61, 40, 119, 51, 121, 47, 112, 54, 65, 125, 43, 107, 52, 60, 6, 45, 104, 2, 66, 67, 42, 106, 93, 70, 33, 3, 115, 95, 4, 69, 68, 80, 13, 11, 76, 41, 84, 71, 7, 77, 5, 74, 75, 103, 111, 14, 8, 79, 9, 20, 16, 17, 39, 73, 15, 82, 12, 78, 18, 44, 10, 31, 105, 118, 23, 87, 81, 72, 85, 24, 83, 25, 92, 22, 88, 86, 19, 89, 26, 102, 21, 28, 90, 91, 101, 38, 37, 27, 98, 94, 35, 30, 36, 34, 99, 96, 100, 32], [50, 48, 64, 57, 0, 117, 116, 120, 56, 126, 59, 113, 62, 122, 46, 123, 53, 63, 55, 110, 127, 58, 97, 49, 108, 109, 47, 1, 65, 112, 124, 61, 121, 29, 40, 114, 119, 51, 54, 60, 43, 107, 67, 125, 66, 3, 45, 6, 104, 42, 33, 2, 106, 52, 95, 68, 93, 115, 20, 4, 70, 77, 13, 41, 16, 71, 5, 7, 79, 111, 84, 69, 44, 11, 12, 39, 17, 76, 73, 75, 80, 82, 18, 14, 9, 103, 15, 10, 78, 31, 105, 74, 81, 24, 118, 8, 23, 72, 87, 19, 86, 92, 85, 88, 89, 25, 83, 28, 26, 22, 102, 21, 90, 37, 27, 98, 91, 101, 38, 30, 94, 35, 99, 34, 36, 32, 100, 96], [50, 48, 57, 0, 117, 64, 116, 56, 126, 113, 120, 59, 62, 122, 123, 53, 46, 63, 65, 127, 110, 49, 47, 97, 108, 61, 109, 124, 55, 112, 58, 29, 67, 1, 3, 51, 40, 54, 114, 119, 121, 43, 107, 68, 4, 45, 104, 60, 125, 6, 66, 2, 42, 33, 106, 95, 52, 93, 115, 20, 111, 77, 44, 11, 12, 13, 76, 84, 39, 7, 70, 71, 69, 41, 5, 73, 17, 78, 82, 103, 79, 9, 18, 10, 80, 16, 75, 15, 14, 74, 105, 81, 31, 24, 118, 8, 87, 23, 72, 83, 22, 86, 25, 88, 21, 92, 28, 85, 19, 26, 90, 102, 89, 27, 98, 37, 91, 38, 101, 100, 35, 94, 36, 32, 30, 99, 34, 96], [50, 48, 0, 57, 64, 117, 56, 120, 116, 59, 62, 113, 126, 122, 123, 46, 53, 63, 55, 110, 1, 58, 65, 108, 127, 109, 61, 97, 47, 49, 40, 121, 29, 124, 112, 51, 107, 119, 66, 114, 43, 54, 125, 45, 104, 60, 52, 42, 70, 106, 4, 2, 6, 68, 95, 3, 33, 67, 69, 93, 115, 77, 5, 41, 20, 13, 76, 12, 71, 7, 111, 11, 73, 103, 78, 80, 39, 9, 10, 79, 16, 14, 75, 84, 74, 18, 8, 44, 15, 82, 81, 31, 17, 105, 118, 87, 23, 72, 24, 19, 22, 86, 21, 88, 92, 102, 83, 85, 89, 25, 26, 90, 27, 28, 91, 38, 98, 37, 101, 94, 35, 34, 32, 36, 99, 30, 100, 96], [50, 48, 57, 0, 64, 117, 116, 120, 56, 59, 126, 113, 122, 62, 123, 55, 46, 58, 53, 63, 127, 49, 1, 108, 110, 109, 61, 97, 65, 47, 124, 112, 29, 119, 121, 40, 3, 51, 67, 54, 114, 107, 43, 104, 60, 2, 45, 70, 125, 42, 66, 106, 33, 13, 6, 95, 52, 4, 41, 93, 7, 68, 69, 111, 115, 71, 20, 5, 76, 78, 77, 74, 73, 11, 103, 14, 9, 12, 79, 84, 16, 39, 80, 75, 18, 81, 82, 44, 105, 17, 72, 8, 10, 15, 31, 118, 87, 24, 23, 25, 88, 22, 19, 83, 102, 86, 21, 89, 26, 85, 90, 28, 92, 91, 101, 27, 37, 38, 98, 94, 36, 35, 100, 30, 34, 96, 99, 32], [50, 48, 0, 57, 117, 64, 120, 116, 56, 59, 126, 113, 122, 62, 46, 53, 123, 127, 58, 110, 1, 55, 63, 109, 29, 97, 124, 49, 108, 65, 40, 3, 61, 47, 67, 51, 114, 112, 54, 121, 43, 60, 107, 119, 68, 70, 104, 2, 125, 42, 4, 45, 66, 33, 106, 52, 115, 95, 93, 6, 71, 77, 20, 41, 73, 5, 69, 13, 44, 111, 80, 16, 78, 7, 76, 12, 103, 74, 11, 9, 79, 39, 84, 81, 75, 10, 18, 14, 105, 72, 15, 31, 17, 8, 82, 118, 24, 87, 23, 22, 88, 86, 83, 19, 25, 26, 21, 92, 89, 85, 102, 28, 90, 91, 27, 38, 101, 98, 34, 37, 36, 94, 32, 100, 30, 99, 96, 35], [50, 48, 57, 117, 64, 0, 116, 59, 126, 56, 122, 120, 62, 113, 123, 46, 53, 49, 55, 127, 58, 110, 97, 108, 63, 109, 47, 124, 29, 61, 54, 112, 119, 114, 121, 40, 65, 1, 51, 107, 104, 70, 43, 125, 4, 60, 52, 42, 45, 33, 68, 93, 95, 66, 11, 13, 106, 20, 2, 6, 15, 79, 76, 77, 115, 9, 5, 18, 84, 74, 80, 44, 14, 78, 16, 103, 81, 67, 41, 75, 69, 72, 82, 12, 3, 73, 71, 7, 39, 10, 105, 111, 17, 118, 87, 31, 23, 24, 8, 85, 22, 88, 25, 19, 83, 26, 92, 21, 89, 86, 102, 90, 27, 28, 91, 38, 98, 37, 101, 94, 36, 34, 99, 30, 100, 35, 96, 32], [50, 48, 57, 64, 117, 0, 116, 120, 56, 126, 113, 62, 122, 59, 123, 53, 46, 55, 63, 127, 110, 97, 109, 108, 58, 124, 61, 49, 112, 47, 65, 29, 40, 114, 125, 119, 1, 121, 43, 54, 51, 60, 3, 107, 67, 45, 66, 70, 104, 33, 52, 42, 95, 106, 115, 2, 6, 93, 13, 5, 16, 20, 77, 14, 7, 76, 41, 84, 9, 11, 12, 44, 4, 71, 111, 81, 79, 103, 68, 18, 80, 39, 17, 75, 15, 82, 74, 10, 105, 69, 78, 73, 72, 23, 87, 31, 118, 24, 8, 19, 85, 22, 86, 88, 83, 92, 89, 25, 28, 26, 102, 90, 21, 27, 91, 38, 98, 37, 101, 94, 36, 35, 30, 34, 100, 99, 96, 32]], "model.layers.2.self_attn.q_proj": [[104, 127, 46, 34, 49, 88, 113, 82, 21, 79, 75, 76, 24, 14, 71, 7, 5, 9, 92, 40, 107, 3, 15, 19, 52, 125, 62, 11, 119, 27, 121, 12, 78, 120, 20, 116, 66, 8, 93, 53, 50, 77, 109, 57, 47, 10, 56, 69, 2, 96, 18, 65, 84, 86, 85, 73, 60, 30, 91, 115, 72, 63, 41, 83, 54, 1, 45, 95, 17, 51, 74, 99, 29, 81, 25, 59, 28, 118, 16, 108, 55, 103, 31, 101, 64, 106, 48, 32, 97, 102, 13, 90, 87, 61, 80, 35, 112, 124, 94, 39, 43, 122, 42, 23, 67, 26, 38, 117, 4, 58, 100, 33, 111, 126, 6, 105, 22, 44, 37, 70, 36, 110, 123, 114, 89, 98, 68, 0], [104, 49, 46, 34, 127, 113, 21, 108, 24, 82, 79, 105, 40, 57, 9, 88, 75, 107, 76, 121, 56, 80, 5, 27, 120, 51, 90, 14, 30, 71, 92, 62, 26, 87, 48, 16, 22, 83, 35, 28, 52, 43, 17, 50, 125, 63, 84, 106, 94, 4, 117, 23, 111, 109, 61, 89, 59, 29, 3, 100, 45, 114, 119, 126, 42, 118, 19, 36, 47, 54, 67, 38, 110, 70, 116, 115, 10, 123, 96, 72, 98, 68, 33, 122, 11, 41, 44, 124, 93, 99, 20, 112, 102, 39, 103, 55, 95, 101, 77, 53, 37, 60, 7, 31, 18, 66, 32, 81, 58, 85, 86, 91, 97, 69, 78, 73, 65, 15, 25, 1, 13, 6, 8, 64, 12, 74, 0, 2], [104, 34, 49, 127, 46, 21, 82, 71, 88, 79, 113, 76, 9, 75, 3, 5, 40, 14, 26, 121, 7, 53, 64, 1, 125, 66, 23, 119, 8, 52, 67, 10, 107, 24, 11, 81, 73, 4, 15, 17, 57, 120, 69, 77, 50, 19, 2, 20, 85, 22, 18, 32, 94, 70, 84, 6, 74, 72, 62, 92, 33, 12, 65, 16, 93, 89, 90, 63, 87, 56, 96, 45, 30, 126, 38, 31, 27, 51, 35, 13, 78, 42, 80, 108, 44, 43, 118, 103, 47, 54, 124, 48, 100, 86, 58, 95, 115, 91, 60, 28, 36, 83, 25, 37, 29, 117, 123, 102, 116, 39, 114, 122, 101, 111, 97, 112, 99, 109, 105, 41, 55, 68, 0, 59, 61, 106, 110, 98], [104, 46, 49, 34, 127, 113, 82, 21, 107, 88, 75, 125, 14, 52, 90, 53, 9, 22, 28, 79, 26, 24, 92, 5, 38, 121, 40, 19, 105, 94, 77, 76, 120, 119, 84, 27, 48, 51, 103, 50, 111, 39, 93, 29, 62, 16, 41, 60, 30, 98, 17, 124, 71, 91, 36, 56, 57, 126, 99, 118, 3, 81, 47, 106, 33, 123, 110, 109, 117, 115, 54, 44, 112, 63, 86, 37, 80, 59, 116, 100, 42, 35, 108, 20, 87, 55, 31, 122, 102, 83, 73, 58, 101, 70, 114, 96, 32, 97, 11, 15, 78, 23, 95, 43, 69, 72, 45, 61, 10, 89, 66, 6, 1, 7, 13, 18, 85, 65, 64, 67, 8, 25, 4, 68, 0, 74, 12, 2], [104, 107, 91, 35, 23, 84, 17, 27, 15, 13, 81, 47, 20, 94, 112, 113, 82, 114, 61, 43, 55, 99, 75, 77, 40, 56, 89, 53, 86, 111, 10, 96, 79, 92, 25, 117, 119, 123, 11, 28, 125, 51, 21, 115, 97, 118, 30, 124, 9, 73, 80, 93, 12, 16, 57, 120, 29, 48, 87, 122, 46, 76, 42, 70, 6, 102, 24, 26, 72, 18, 88, 116, 7, 110, 49, 108, 85, 38, 83, 14, 98, 105, 74, 22, 50, 68, 67, 60, 101, 31, 90, 41, 59, 71, 106, 8, 37, 32, 33, 78, 39, 44, 19, 63, 58, 127, 126, 103, 34, 45, 54, 69, 121, 36, 100, 95, 4, 62, 52, 109, 2, 3, 0, 65, 66, 5, 1, 64], [104, 107, 35, 91, 27, 23, 99, 94, 84, 114, 47, 43, 113, 89, 112, 61, 9, 55, 119, 123, 53, 15, 40, 73, 111, 92, 17, 12, 82, 81, 120, 28, 98, 77, 56, 46, 105, 117, 125, 97, 25, 67, 86, 116, 70, 48, 115, 122, 7, 124, 42, 13, 36, 102, 118, 51, 32, 14, 90, 29, 57, 22, 79, 11, 49, 37, 10, 85, 63, 16, 41, 109, 68, 126, 6, 103, 44, 110, 54, 121, 72, 76, 106, 58, 75, 108, 34, 93, 45, 96, 101, 52, 59, 69, 39, 38, 24, 87, 30, 95, 100, 80, 50, 18, 60, 31, 127, 74, 26, 33, 71, 62, 88, 21, 19, 83, 65, 20, 2, 78, 4, 0, 3, 8, 66, 64, 5, 1], [107, 104, 47, 35, 99, 124, 94, 89, 96, 27, 55, 91, 43, 61, 119, 112, 113, 46, 25, 123, 85, 63, 105, 48, 114, 120, 53, 116, 125, 111, 23, 54, 40, 84, 102, 52, 28, 56, 117, 97, 37, 51, 115, 82, 121, 126, 92, 42, 109, 62, 22, 41, 32, 106, 60, 29, 118, 44, 110, 95, 49, 57, 86, 122, 98, 38, 108, 100, 58, 127, 36, 34, 50, 12, 101, 33, 103, 17, 59, 67, 83, 26, 45, 31, 93, 39, 14, 78, 90, 30, 21, 15, 73, 81, 16, 24, 77, 6, 88, 18, 7, 19, 80, 79, 9, 10, 20, 68, 66, 11, 71, 70, 3, 76, 75, 13, 87, 2, 74, 72, 65, 4, 0, 8, 64, 5, 1, 69], [104, 107, 35, 15, 75, 8, 13, 17, 91, 5, 84, 4, 23, 99, 73, 40, 43, 3, 1, 71, 72, 87, 112, 114, 69, 61, 68, 6, 113, 55, 67, 20, 2, 47, 53, 123, 7, 66, 0, 56, 65, 11, 120, 70, 115, 111, 10, 119, 89, 9, 25, 110, 77, 30, 64, 125, 57, 21, 16, 46, 124, 12, 94, 18, 51, 117, 86, 79, 74, 122, 108, 81, 76, 58, 14, 92, 49, 78, 85, 83, 82, 96, 127, 116, 121, 19, 90, 28, 88, 26, 102, 118, 31, 29, 80, 97, 105, 59, 60, 48, 32, 95, 93, 24, 22, 39, 37, 62, 63, 34, 50, 106, 33, 44, 42, 38, 101, 36, 27, 98, 100, 103, 54, 41, 109, 52, 45, 126], [110, 38, 97, 112, 46, 16, 89, 78, 10, 19, 8, 4, 12, 70, 68, 48, 25, 2, 69, 7, 6, 86, 82, 56, 1, 31, 20, 64, 50, 5, 102, 71, 22, 75, 125, 119, 11, 113, 3, 111, 13, 83, 67, 0, 127, 77, 61, 66, 47, 109, 51, 80, 14, 85, 73, 52, 118, 74, 72, 84, 94, 9, 81, 116, 117, 41, 93, 40, 120, 123, 18, 76, 87, 124, 107, 63, 15, 23, 21, 115, 65, 30, 104, 79, 36, 37, 43, 28, 34, 55, 88, 53, 91, 62, 24, 59, 17, 98, 108, 27, 60, 126, 99, 96, 45, 103, 32, 44, 57, 100, 101, 121, 58, 42, 122, 106, 29, 39, 92, 54, 35, 49, 95, 114, 26, 90, 105, 33], [110, 38, 112, 97, 46, 19, 78, 16, 12, 10, 8, 89, 4, 68, 69, 48, 71, 25, 5, 9, 70, 6, 102, 13, 24, 50, 2, 1, 88, 51, 62, 111, 66, 77, 65, 56, 109, 31, 118, 64, 113, 21, 15, 75, 61, 22, 127, 85, 73, 72, 3, 7, 67, 55, 14, 30, 74, 17, 27, 0, 34, 76, 83, 79, 124, 86, 115, 125, 20, 11, 100, 43, 94, 52, 122, 104, 107, 116, 87, 93, 99, 23, 105, 80, 84, 117, 95, 119, 47, 126, 121, 35, 82, 18, 91, 98, 44, 58, 120, 81, 123, 26, 40, 54, 42, 49, 45, 37, 101, 32, 92, 53, 39, 59, 36, 41, 106, 96, 108, 60, 29, 114, 103, 90, 28, 63, 57, 33], [38, 97, 110, 94, 112, 46, 25, 19, 83, 89, 78, 48, 93, 118, 85, 16, 50, 11, 96, 62, 39, 56, 21, 111, 127, 36, 77, 61, 27, 109, 90, 106, 80, 101, 17, 10, 88, 30, 12, 125, 60, 87, 95, 44, 86, 40, 105, 35, 121, 34, 31, 24, 104, 59, 116, 108, 115, 120, 57, 43, 54, 26, 123, 58, 81, 103, 126, 117, 20, 99, 23, 119, 13, 45, 14, 52, 124, 51, 37, 122, 29, 73, 100, 55, 42, 91, 92, 47, 107, 63, 22, 75, 32, 41, 114, 98, 53, 113, 9, 28, 33, 49, 15, 84, 6, 8, 82, 7, 18, 79, 69, 71, 76, 74, 70, 72, 5, 68, 4, 102, 2, 66, 3, 67, 65, 1, 0, 64], [110, 38, 97, 112, 46, 89, 78, 16, 10, 77, 25, 17, 19, 12, 8, 48, 6, 69, 11, 71, 102, 94, 51, 15, 50, 4, 75, 68, 2, 86, 21, 83, 62, 76, 31, 70, 52, 5, 72, 111, 81, 9, 127, 1, 22, 108, 99, 29, 121, 118, 109, 84, 124, 14, 7, 116, 82, 92, 36, 45, 37, 117, 74, 123, 90, 23, 79, 88, 80, 104, 35, 64, 105, 95, 61, 56, 66, 34, 58, 65, 103, 73, 93, 42, 44, 33, 107, 122, 28, 126, 20, 18, 55, 106, 87, 27, 113, 57, 47, 125, 53, 98, 85, 115, 41, 24, 54, 114, 26, 120, 30, 39, 101, 59, 40, 13, 96, 119, 67, 60, 100, 91, 43, 32, 49, 63, 3, 0], [40, 98, 121, 125, 53, 51, 17, 20, 82, 80, 18, 115, 27, 19, 21, 88, 104, 77, 22, 42, 119, 12, 89, 26, 95, 86, 30, 75, 92, 28, 84, 78, 25, 117, 96, 94, 29, 32, 120, 23, 93, 90, 57, 102, 116, 52, 79, 38, 127, 41, 24, 97, 50, 87, 123, 105, 56, 122, 10, 109, 113, 16, 31, 63, 101, 110, 43, 44, 48, 107, 33, 61, 49, 111, 36, 58, 62, 76, 124, 100, 91, 39, 114, 59, 45, 81, 108, 83, 46, 37, 35, 9, 103, 99, 60, 112, 47, 55, 85, 8, 126, 118, 54, 74, 14, 34, 71, 73, 15, 106, 7, 13, 11, 6, 72, 5, 70, 2, 69, 65, 68, 3, 64, 4, 67, 66, 1, 0], [40, 125, 121, 53, 51, 98, 27, 20, 82, 16, 84, 81, 12, 21, 42, 19, 115, 10, 119, 88, 8, 78, 74, 29, 50, 85, 86, 35, 72, 91, 36, 95, 5, 6, 117, 14, 94, 69, 104, 17, 18, 34, 93, 24, 76, 22, 65, 67, 97, 26, 79, 68, 75, 120, 38, 116, 15, 23, 30, 122, 44, 57, 63, 66, 107, 89, 58, 80, 71, 83, 109, 87, 7, 106, 9, 99, 108, 96, 111, 41, 33, 118, 13, 105, 103, 101, 48, 127, 114, 56, 100, 37, 124, 52, 28, 45, 90, 3, 113, 49, 92, 43, 77, 39, 32, 123, 59, 102, 110, 46, 31, 60, 126, 54, 70, 25, 61, 112, 62, 64, 47, 0, 55, 11, 4, 73, 1, 2], [40, 121, 42, 53, 125, 51, 98, 91, 123, 50, 110, 52, 111, 113, 58, 63, 105, 126, 47, 33, 115, 27, 54, 60, 118, 93, 117, 107, 97, 59, 122, 127, 48, 108, 61, 56, 15, 44, 124, 109, 57, 112, 116, 43, 45, 114, 62, 46, 120, 106, 38, 119, 49, 55, 41, 95, 23, 103, 39, 36, 101, 31, 100, 24, 86, 37, 99, 32, 35, 102, 30, 21, 12, 29, 96, 18, 85, 34, 89, 104, 94, 77, 92, 17, 81, 84, 83, 78, 79, 20, 74, 87, 9, 28, 69, 72, 13, 26, 88, 19, 75, 90, 25, 82, 22, 7, 10, 80, 2, 65, 70, 4, 76, 8, 11, 64, 3, 16, 6, 68, 1, 73, 67, 14, 5, 0, 71, 66], [53, 125, 51, 121, 42, 40, 115, 98, 119, 52, 108, 57, 123, 116, 91, 107, 50, 114, 117, 122, 127, 124, 120, 58, 111, 59, 118, 113, 63, 15, 55, 41, 60, 110, 48, 49, 9, 109, 56, 61, 100, 62, 44, 43, 46, 7, 105, 99, 112, 45, 54, 77, 126, 47, 102, 72, 39, 38, 33, 74, 12, 103, 69, 106, 93, 18, 101, 35, 30, 17, 95, 75, 78, 2, 70, 31, 11, 37, 27, 32, 80, 36, 84, 3, 24, 104, 34, 23, 96, 97, 4, 86, 68, 94, 83, 79, 64, 8, 29, 92, 10, 89, 85, 6, 65, 13, 0, 73, 76, 71, 26, 20, 87, 28, 21, 19, 88, 25, 90, 5, 1, 66, 16, 81, 67, 14, 22, 82], [102, 49, 98, 56, 116, 121, 113, 89, 20, 11, 81, 9, 77, 22, 15, 124, 52, 25, 2, 6, 73, 71, 18, 96, 75, 86, 110, 72, 88, 79, 70, 101, 19, 51, 10, 74, 100, 4, 3, 29, 67, 91, 1, 31, 83, 87, 92, 78, 17, 64, 13, 60, 76, 16, 120, 24, 84, 59, 61, 8, 30, 41, 80, 14, 21, 7, 58, 93, 47, 37, 12, 26, 28, 117, 62, 115, 90, 44, 46, 82, 32, 94, 27, 112, 63, 68, 103, 40, 105, 109, 33, 39, 42, 85, 95, 114, 97, 111, 35, 122, 48, 107, 23, 118, 53, 55, 54, 45, 127, 43, 123, 50, 104, 106, 119, 34, 126, 57, 99, 36, 108, 0, 125, 5, 69, 66, 65, 38], [102, 49, 116, 56, 98, 121, 113, 89, 81, 124, 20, 11, 15, 77, 68, 9, 71, 52, 70, 25, 29, 2, 64, 91, 19, 85, 4, 0, 105, 88, 1, 65, 66, 8, 3, 73, 101, 41, 46, 75, 12, 127, 110, 62, 60, 51, 72, 80, 93, 21, 67, 7, 27, 74, 14, 86, 96, 76, 47, 117, 13, 126, 18, 84, 44, 87, 6, 111, 115, 120, 79, 17, 22, 82, 90, 112, 10, 95, 107, 5, 61, 39, 24, 23, 16, 103, 30, 122, 92, 57, 83, 94, 32, 97, 118, 45, 78, 33, 26, 59, 36, 69, 43, 123, 109, 53, 37, 40, 100, 54, 114, 28, 50, 125, 99, 42, 108, 106, 48, 55, 35, 104, 58, 31, 63, 119, 34, 38], [102, 49, 116, 56, 98, 113, 121, 89, 77, 11, 81, 20, 9, 71, 15, 52, 68, 124, 110, 29, 19, 91, 86, 10, 101, 25, 6, 70, 2, 41, 66, 74, 47, 7, 4, 14, 46, 59, 67, 117, 82, 75, 72, 80, 64, 18, 96, 92, 73, 0, 31, 3, 95, 79, 115, 126, 12, 8, 21, 16, 93, 1, 22, 13, 87, 17, 60, 39, 88, 36, 107, 127, 50, 97, 32, 35, 120, 33, 122, 69, 53, 105, 85, 30, 61, 65, 84, 111, 62, 104, 5, 26, 42, 58, 118, 76, 94, 114, 23, 83, 112, 51, 24, 109, 54, 27, 44, 28, 40, 90, 125, 119, 100, 99, 78, 43, 57, 55, 48, 106, 123, 108, 103, 63, 45, 37, 34, 38], [102, 121, 110, 49, 98, 56, 116, 113, 101, 89, 25, 62, 93, 29, 31, 52, 91, 41, 51, 46, 107, 27, 45, 126, 34, 40, 100, 38, 115, 58, 106, 60, 112, 20, 105, 63, 96, 85, 39, 48, 103, 22, 57, 19, 61, 119, 86, 50, 47, 108, 53, 18, 123, 109, 114, 55, 127, 54, 120, 84, 104, 28, 111, 78, 94, 44, 118, 43, 83, 122, 81, 21, 42, 125, 124, 59, 92, 80, 30, 24, 16, 117, 35, 95, 37, 33, 26, 32, 88, 99, 36, 15, 97, 23, 77, 76, 14, 87, 90, 82, 10, 12, 5, 17, 71, 79, 2, 11, 69, 66, 8, 13, 9, 74, 7, 72, 70, 68, 75, 67, 4, 6, 64, 73, 65, 0, 3, 1], [126, 61, 56, 60, 54, 51, 123, 63, 124, 119, 100, 58, 117, 32, 127, 108, 112, 49, 39, 97, 114, 47, 104, 42, 125, 44, 106, 46, 40, 41, 48, 118, 99, 62, 110, 53, 120, 34, 52, 105, 121, 57, 107, 115, 116, 102, 55, 103, 43, 122, 50, 45, 113, 59, 111, 109, 38, 36, 33, 101, 96, 37, 94, 98, 30, 85, 31, 35, 8, 28, 25, 67, 70, 7, 5, 91, 29, 73, 2, 74, 27, 68, 92, 75, 77, 95, 93, 83, 10, 22, 4, 21, 65, 11, 76, 26, 71, 13, 89, 64, 12, 0, 9, 90, 19, 1, 78, 86, 23, 72, 66, 80, 88, 87, 15, 24, 6, 79, 14, 20, 69, 82, 81, 84, 3, 16, 17, 18], [55, 119, 123, 127, 115, 124, 60, 58, 56, 54, 63, 51, 61, 117, 62, 113, 126, 59, 96, 57, 52, 109, 48, 53, 121, 116, 120, 107, 114, 45, 108, 43, 50, 122, 110, 112, 42, 47, 125, 49, 118, 111, 44, 94, 46, 106, 105, 41, 32, 93, 104, 40, 103, 95, 39, 30, 23, 102, 98, 91, 34, 38, 90, 24, 92, 37, 17, 16, 101, 88, 22, 18, 100, 79, 7, 31, 35, 10, 78, 99, 75, 89, 8, 29, 84, 13, 26, 70, 36, 73, 5, 76, 19, 81, 15, 21, 20, 97, 80, 33, 9, 4, 27, 65, 0, 2, 87, 67, 85, 68, 66, 77, 25, 83, 28, 72, 14, 3, 69, 82, 12, 86, 1, 71, 74, 6, 11, 64], [119, 123, 55, 100, 124, 127, 56, 60, 58, 54, 115, 51, 63, 61, 36, 117, 126, 62, 32, 98, 59, 113, 52, 57, 121, 48, 45, 109, 120, 116, 53, 43, 107, 50, 30, 122, 114, 47, 106, 108, 110, 111, 112, 125, 118, 42, 49, 44, 46, 34, 41, 105, 39, 38, 103, 101, 104, 102, 40, 35, 37, 31, 96, 33, 99, 93, 97, 95, 28, 22, 88, 91, 94, 29, 19, 24, 68, 77, 64, 85, 90, 92, 27, 12, 26, 86, 2, 5, 83, 14, 74, 1, 13, 6, 25, 11, 72, 71, 67, 7, 8, 21, 69, 3, 73, 79, 78, 10, 9, 16, 81, 76, 20, 70, 89, 66, 80, 75, 87, 15, 65, 82, 18, 0, 4, 23, 17, 84], [126, 61, 100, 26, 54, 60, 28, 20, 63, 119, 51, 58, 123, 56, 124, 87, 55, 96, 18, 93, 24, 127, 25, 115, 86, 117, 85, 92, 30, 27, 94, 17, 97, 90, 33, 91, 62, 31, 32, 49, 112, 108, 106, 125, 114, 34, 104, 42, 39, 47, 120, 46, 44, 48, 53, 110, 121, 57, 41, 118, 40, 52, 105, 59, 43, 116, 107, 103, 50, 113, 122, 45, 109, 36, 101, 80, 102, 111, 98, 23, 81, 38, 15, 16, 89, 21, 99, 37, 83, 88, 95, 29, 35, 82, 19, 22, 14, 79, 78, 84, 9, 13, 75, 10, 71, 74, 12, 76, 8, 7, 67, 5, 70, 66, 72, 4, 77, 69, 65, 73, 11, 2, 0, 64, 1, 3, 6, 68], [33, 40, 108, 117, 57, 54, 22, 19, 118, 122, 82, 24, 16, 121, 77, 89, 127, 25, 76, 14, 52, 7, 125, 49, 44, 12, 123, 92, 109, 93, 106, 107, 3, 13, 69, 73, 11, 115, 46, 29, 43, 84, 62, 21, 53, 104, 120, 32, 119, 39, 28, 20, 41, 94, 124, 98, 61, 67, 87, 86, 88, 30, 111, 83, 34, 112, 59, 2, 15, 74, 64, 70, 65, 95, 48, 85, 71, 47, 60, 91, 105, 63, 96, 6, 17, 38, 68, 27, 81, 58, 42, 113, 116, 126, 110, 8, 23, 45, 72, 4, 114, 97, 79, 50, 90, 78, 5, 18, 10, 37, 26, 31, 1, 51, 101, 35, 103, 99, 102, 100, 55, 56, 80, 36, 66, 0, 9, 75], [108, 40, 33, 57, 54, 86, 25, 44, 118, 18, 117, 82, 80, 52, 91, 29, 104, 62, 121, 94, 115, 90, 92, 122, 89, 31, 56, 30, 126, 27, 95, 93, 48, 28, 88, 63, 34, 51, 110, 99, 124, 35, 59, 101, 112, 123, 26, 98, 113, 36, 60, 37, 43, 116, 103, 109, 41, 32, 39, 47, 58, 46, 42, 105, 119, 102, 100, 96, 45, 111, 106, 120, 49, 125, 38, 50, 107, 14, 53, 127, 55, 61, 23, 84, 87, 16, 114, 24, 85, 20, 83, 77, 21, 22, 76, 19, 78, 97, 81, 79, 17, 75, 11, 13, 66, 70, 12, 15, 74, 5, 4, 6, 7, 3, 67, 0, 9, 68, 8, 72, 65, 71, 2, 1, 10, 64, 69, 73], [33, 40, 108, 57, 54, 16, 14, 89, 118, 44, 117, 78, 82, 22, 13, 104, 52, 24, 121, 27, 71, 19, 122, 12, 75, 25, 90, 87, 28, 62, 109, 124, 93, 99, 94, 48, 115, 30, 31, 29, 123, 110, 101, 26, 126, 56, 91, 127, 112, 95, 53, 63, 119, 96, 46, 59, 37, 73, 21, 120, 51, 86, 36, 98, 92, 77, 43, 116, 102, 105, 85, 41, 103, 42, 38, 34, 106, 35, 60, 32, 125, 88, 111, 100, 47, 113, 58, 23, 39, 45, 76, 49, 9, 50, 61, 107, 114, 55, 5, 17, 20, 83, 6, 18, 81, 84, 79, 68, 15, 80, 10, 74, 7, 3, 11, 8, 67, 72, 66, 70, 2, 97, 4, 1, 69, 65, 0, 64], [40, 108, 33, 57, 54, 11, 12, 89, 82, 117, 118, 76, 44, 16, 14, 22, 19, 73, 7, 9, 24, 104, 86, 77, 10, 71, 78, 75, 121, 27, 13, 127, 52, 122, 93, 70, 85, 74, 123, 48, 109, 23, 87, 66, 124, 43, 126, 106, 90, 21, 68, 101, 115, 53, 110, 28, 112, 30, 81, 26, 80, 31, 15, 119, 79, 99, 4, 63, 2, 64, 84, 25, 46, 94, 83, 62, 51, 3, 56, 96, 114, 95, 29, 98, 111, 120, 116, 36, 37, 59, 35, 32, 91, 92, 42, 102, 1, 41, 100, 47, 113, 60, 38, 125, 65, 88, 49, 20, 34, 103, 39, 17, 105, 6, 45, 50, 58, 107, 55, 69, 61, 72, 18, 67, 0, 5, 8, 97], [110, 103, 33, 31, 46, 21, 78, 76, 80, 10, 8, 19, 114, 120, 79, 13, 112, 70, 4, 82, 17, 116, 49, 12, 6, 123, 16, 61, 54, 57, 89, 124, 58, 121, 25, 73, 125, 75, 47, 39, 22, 107, 18, 67, 117, 1, 118, 94, 27, 83, 56, 43, 85, 108, 111, 115, 69, 77, 48, 2, 14, 101, 63, 24, 32, 50, 51, 29, 106, 86, 93, 44, 74, 113, 23, 62, 41, 15, 28, 119, 71, 40, 91, 68, 36, 104, 127, 55, 30, 84, 11, 20, 59, 9, 92, 38, 42, 102, 72, 88, 35, 122, 37, 34, 99, 60, 52, 81, 26, 90, 45, 87, 7, 96, 98, 105, 100, 53, 126, 109, 97, 5, 66, 0, 95, 3, 65, 64], [103, 110, 33, 31, 46, 21, 19, 114, 49, 78, 120, 116, 23, 76, 12, 24, 57, 112, 79, 80, 72, 58, 16, 61, 118, 14, 4, 8, 6, 123, 48, 54, 50, 18, 67, 111, 47, 83, 98, 10, 85, 95, 45, 89, 17, 15, 108, 117, 121, 127, 73, 91, 109, 41, 63, 20, 1, 43, 90, 104, 37, 88, 81, 55, 102, 13, 39, 32, 119, 28, 113, 94, 2, 115, 70, 84, 44, 60, 51, 126, 42, 38, 96, 53, 105, 125, 122, 35, 34, 124, 106, 59, 97, 27, 29, 100, 74, 82, 9, 36, 107, 99, 93, 75, 56, 52, 40, 101, 71, 69, 86, 92, 30, 65, 3, 87, 22, 26, 62, 5, 25, 68, 11, 77, 0, 66, 7, 64], [110, 103, 33, 31, 46, 80, 21, 10, 8, 76, 78, 4, 114, 70, 68, 57, 120, 5, 49, 123, 58, 116, 112, 16, 17, 69, 3, 61, 1, 39, 65, 66, 12, 117, 121, 67, 74, 118, 54, 124, 9, 73, 7, 71, 127, 72, 14, 50, 32, 15, 11, 75, 107, 47, 125, 0, 51, 84, 111, 64, 45, 48, 79, 25, 77, 2, 85, 6, 91, 89, 87, 13, 93, 82, 60, 24, 63, 19, 109, 23, 81, 104, 18, 88, 122, 20, 83, 56, 92, 29, 38, 37, 115, 105, 86, 55, 119, 113, 35, 102, 36, 106, 53, 28, 44, 41, 62, 94, 42, 22, 30, 108, 95, 52, 34, 26, 96, 99, 43, 40, 101, 59, 100, 98, 90, 97, 27, 126], [110, 103, 33, 31, 46, 21, 10, 80, 78, 79, 69, 8, 76, 120, 114, 70, 82, 2, 6, 67, 49, 57, 112, 73, 4, 123, 58, 12, 61, 39, 5, 66, 1, 116, 68, 23, 71, 3, 16, 13, 0, 19, 17, 127, 56, 117, 111, 72, 74, 118, 121, 7, 107, 125, 14, 51, 50, 64, 54, 87, 11, 9, 45, 75, 124, 65, 85, 24, 48, 88, 81, 43, 20, 77, 15, 89, 84, 126, 119, 94, 59, 42, 18, 32, 47, 92, 26, 37, 115, 63, 86, 83, 25, 109, 36, 62, 28, 104, 53, 44, 106, 91, 122, 60, 55, 40, 34, 27, 41, 108, 99, 30, 113, 105, 102, 22, 52, 97, 90, 35, 98, 96, 29, 100, 95, 38, 93, 101]], "model.layers.2.self_attn.k_proj": [[40, 49, 110, 127, 46, 98, 76, 9, 21, 82, 24, 79, 71, 14, 5, 75, 67, 66, 0, 64, 65, 70, 125, 88, 53, 2, 92, 81, 50, 69, 26, 93, 121, 4, 27, 116, 77, 30, 112, 52, 105, 74, 80, 119, 118, 97, 62, 13, 51, 56, 72, 57, 25, 68, 31, 109, 23, 106, 1, 126, 54, 120, 113, 83, 28, 84, 63, 48, 107, 29, 124, 44, 43, 59, 89, 20, 99, 45, 73, 47, 86, 60, 42, 108, 87, 61, 16, 95, 8, 115, 90, 122, 117, 32, 39, 17, 19, 36, 96, 22, 114, 100, 41, 33, 103, 91, 123, 35, 58, 55, 37, 111, 10, 94, 102, 38, 101, 6, 104, 15, 3, 78, 7, 12, 85, 11, 34, 18], [43, 40, 99, 56, 48, 50, 119, 23, 117, 61, 84, 13, 113, 15, 17, 75, 123, 5, 51, 47, 0, 8, 111, 4, 57, 91, 55, 53, 46, 3, 104, 94, 65, 49, 6, 124, 73, 114, 44, 110, 125, 2, 89, 80, 10, 71, 60, 120, 64, 107, 122, 108, 121, 1, 12, 58, 118, 14, 86, 42, 85, 66, 25, 82, 70, 59, 52, 62, 28, 112, 76, 109, 127, 106, 27, 92, 126, 63, 83, 88, 97, 54, 22, 41, 19, 69, 33, 18, 39, 29, 115, 101, 38, 116, 21, 74, 78, 26, 87, 45, 105, 34, 37, 103, 7, 32, 98, 36, 90, 96, 31, 93, 102, 67, 16, 30, 24, 68, 95, 100, 72, 79, 35, 20, 81, 9, 11, 77], [46, 112, 102, 33, 110, 0, 12, 48, 16, 8, 10, 78, 25, 70, 69, 65, 66, 19, 2, 4, 64, 5, 77, 89, 50, 85, 17, 67, 47, 1, 7, 68, 22, 83, 30, 20, 95, 93, 88, 82, 125, 62, 56, 124, 116, 54, 87, 51, 9, 43, 27, 127, 55, 49, 118, 23, 24, 73, 3, 11, 114, 15, 13, 84, 106, 40, 92, 79, 32, 121, 91, 103, 41, 36, 61, 99, 126, 98, 101, 113, 109, 81, 29, 31, 107, 123, 105, 45, 57, 44, 108, 94, 104, 39, 59, 52, 96, 28, 120, 100, 119, 26, 63, 60, 35, 21, 18, 71, 37, 115, 58, 6, 90, 111, 53, 42, 86, 122, 117, 34, 74, 76, 75, 14, 80, 72, 97, 38], [104, 121, 125, 53, 51, 20, 16, 34, 78, 81, 82, 119, 19, 75, 42, 43, 10, 122, 88, 50, 61, 63, 127, 93, 70, 21, 13, 109, 118, 106, 116, 44, 113, 30, 7, 120, 8, 48, 57, 124, 114, 111, 62, 47, 60, 56, 58, 107, 54, 123, 115, 31, 76, 126, 9, 49, 23, 55, 38, 105, 110, 97, 22, 112, 39, 46, 41, 100, 45, 59, 91, 52, 102, 108, 37, 117, 36, 4, 3, 99, 86, 26, 89, 35, 95, 101, 32, 92, 28, 27, 33, 103, 94, 96, 24, 25, 2, 69, 90, 29, 14, 18, 87, 73, 17, 85, 15, 65, 5, 68, 83, 64, 80, 1, 84, 98, 0, 12, 72, 79, 74, 71, 11, 77, 40, 66, 6, 67], [113, 38, 116, 56, 34, 121, 49, 77, 11, 9, 81, 15, 71, 25, 0, 93, 68, 70, 20, 46, 65, 86, 67, 18, 66, 91, 16, 30, 124, 112, 102, 115, 31, 83, 61, 22, 19, 87, 72, 5, 118, 88, 76, 27, 85, 29, 1, 45, 78, 8, 82, 89, 23, 7, 6, 111, 48, 117, 36, 44, 74, 41, 10, 58, 105, 59, 99, 47, 40, 108, 51, 106, 119, 42, 127, 101, 62, 43, 92, 50, 123, 104, 114, 103, 120, 57, 84, 37, 90, 95, 97, 125, 122, 24, 32, 55, 63, 69, 35, 109, 96, 53, 126, 60, 54, 3, 28, 33, 94, 39, 12, 107, 26, 100, 79, 13, 80, 52, 4, 73, 14, 110, 21, 75, 17, 2, 64, 98], [126, 36, 124, 60, 61, 54, 63, 58, 119, 56, 115, 51, 123, 55, 127, 32, 117, 125, 118, 49, 122, 110, 112, 114, 46, 30, 50, 53, 52, 120, 121, 116, 48, 59, 111, 62, 113, 47, 57, 34, 108, 44, 45, 109, 106, 95, 42, 93, 40, 41, 107, 104, 105, 43, 98, 20, 24, 23, 39, 103, 96, 33, 102, 97, 90, 94, 38, 82, 81, 92, 101, 26, 99, 27, 91, 89, 37, 35, 100, 22, 29, 14, 80, 87, 15, 31, 85, 16, 25, 86, 28, 79, 12, 9, 19, 13, 11, 83, 88, 7, 4, 74, 66, 72, 5, 70, 75, 10, 69, 0, 3, 21, 18, 1, 65, 6, 71, 76, 8, 77, 17, 67, 78, 64, 84, 73, 68, 2], [44, 104, 54, 57, 97, 117, 82, 16, 22, 14, 77, 73, 25, 11, 76, 9, 127, 53, 108, 89, 7, 29, 122, 40, 92, 123, 12, 27, 19, 4, 24, 52, 91, 112, 93, 95, 90, 26, 31, 30, 94, 32, 88, 115, 110, 120, 119, 124, 21, 87, 113, 96, 28, 118, 45, 62, 46, 69, 121, 126, 47, 10, 125, 116, 98, 39, 34, 58, 63, 67, 37, 111, 107, 49, 41, 43, 99, 35, 51, 36, 60, 42, 103, 105, 102, 106, 59, 70, 101, 5, 100, 23, 50, 1, 38, 109, 55, 61, 2, 20, 56, 85, 48, 17, 8, 15, 114, 75, 78, 0, 84, 72, 81, 80, 74, 71, 66, 79, 83, 18, 64, 6, 86, 13, 3, 33, 68, 65], [46, 39, 110, 97, 95, 64, 78, 76, 80, 10, 8, 21, 70, 68, 79, 66, 73, 3, 65, 19, 120, 58, 50, 57, 123, 9, 5, 48, 1, 2, 82, 7, 61, 23, 69, 113, 116, 118, 114, 67, 125, 51, 0, 124, 24, 121, 127, 77, 117, 112, 53, 49, 81, 47, 84, 52, 59, 56, 109, 72, 6, 17, 60, 87, 119, 4, 91, 75, 122, 107, 20, 22, 106, 45, 89, 96, 54, 40, 90, 29, 11, 126, 25, 63, 43, 42, 62, 92, 86, 30, 101, 98, 105, 44, 14, 115, 37, 104, 26, 27, 102, 108, 93, 36, 38, 55, 111, 12, 32, 41, 99, 100, 15, 18, 34, 83, 35, 85, 88, 28, 94, 74, 13, 71, 16, 33, 31, 103]], "model.layers.2.self_attn.qk_proj": [[46, 110, 49, 121, 112, 54, 40, 51, 127, 104, 57, 56, 125, 113, 53, 116, 43, 44, 126, 61, 124, 102, 60, 119, 123, 117, 108, 48, 89, 39, 33, 25, 16, 14, 80, 115, 78, 63, 99, 107, 12, 76, 55, 18, 50, 85, 47, 98, 120, 21, 114, 58, 75, 97, 88, 86, 74, 8, 118, 82, 13, 11, 81, 95, 83, 19, 34, 20, 27, 84, 79, 15, 91, 77, 122, 17, 9, 87, 42, 62, 10, 94, 73, 29, 24, 31, 22, 38, 7, 52, 23, 36, 45, 41, 59, 92, 71, 6, 72, 111, 4, 69, 5, 30, 68, 105, 101, 35, 93, 32, 109, 70, 96, 28, 66, 2, 0, 64, 103, 90, 106, 1, 100, 67, 3, 65, 37, 26], [46, 110, 49, 121, 112, 54, 40, 56, 104, 51, 57, 53, 125, 127, 113, 116, 43, 44, 126, 123, 61, 119, 124, 102, 60, 117, 48, 39, 108, 89, 25, 16, 107, 58, 120, 80, 33, 14, 115, 63, 12, 99, 47, 78, 76, 8, 118, 55, 98, 50, 85, 82, 27, 21, 114, 18, 20, 95, 75, 62, 83, 91, 15, 97, 19, 122, 77, 81, 74, 34, 13, 79, 17, 11, 94, 86, 84, 10, 9, 87, 42, 24, 31, 73, 70, 88, 23, 36, 111, 22, 4, 29, 38, 5, 7, 71, 68, 109, 52, 69, 0, 96, 28, 106, 64, 45, 103, 32, 1, 92, 105, 93, 66, 59, 30, 35, 67, 2, 72, 6, 101, 3, 65, 90, 26, 100, 41, 37], [46, 110, 49, 121, 112, 54, 56, 40, 104, 53, 51, 57, 127, 113, 125, 116, 43, 44, 61, 126, 124, 102, 119, 123, 117, 60, 108, 63, 33, 107, 55, 48, 89, 16, 25, 39, 120, 47, 58, 80, 76, 14, 8, 115, 12, 99, 118, 78, 97, 85, 50, 21, 70, 18, 98, 19, 82, 27, 114, 74, 11, 20, 77, 84, 15, 79, 94, 9, 17, 75, 42, 95, 91, 34, 13, 86, 52, 87, 10, 73, 81, 5, 38, 23, 24, 36, 7, 71, 68, 69, 122, 83, 4, 29, 62, 59, 88, 31, 111, 32, 109, 22, 66, 30, 106, 45, 64, 96, 105, 2, 1, 28, 92, 93, 100, 0, 103, 67, 41, 72, 3, 26, 65, 35, 101, 90, 6, 37], [46, 110, 49, 121, 112, 54, 127, 51, 57, 40, 56, 104, 113, 125, 53, 116, 43, 126, 44, 61, 124, 123, 102, 117, 60, 58, 119, 48, 33, 25, 108, 63, 16, 39, 89, 80, 55, 14, 8, 107, 99, 76, 118, 12, 78, 98, 21, 19, 120, 47, 18, 50, 91, 114, 115, 70, 97, 84, 11, 82, 27, 95, 52, 85, 77, 10, 74, 34, 23, 79, 20, 75, 42, 15, 86, 94, 83, 87, 62, 81, 31, 13, 17, 5, 9, 71, 73, 88, 36, 4, 69, 38, 24, 122, 68, 109, 106, 29, 30, 22, 7, 92, 41, 66, 111, 67, 59, 28, 93, 32, 2, 103, 65, 101, 0, 45, 1, 96, 105, 3, 64, 100, 26, 90, 35, 72, 6, 37], [46, 110, 49, 121, 112, 54, 40, 127, 51, 57, 56, 125, 104, 113, 53, 116, 43, 61, 44, 126, 119, 102, 124, 123, 117, 58, 108, 25, 63, 33, 55, 16, 76, 48, 120, 8, 107, 12, 99, 118, 14, 60, 89, 80, 78, 39, 47, 115, 21, 82, 114, 98, 50, 18, 91, 95, 84, 97, 11, 20, 70, 9, 79, 94, 27, 74, 85, 17, 52, 75, 73, 15, 19, 69, 13, 77, 31, 24, 10, 88, 7, 87, 81, 68, 86, 42, 23, 62, 36, 5, 34, 38, 83, 71, 59, 4, 111, 22, 2, 67, 3, 122, 66, 28, 64, 65, 41, 0, 92, 29, 109, 30, 32, 35, 96, 103, 100, 101, 106, 93, 1, 72, 6, 105, 45, 90, 26, 37], [46, 110, 49, 121, 112, 51, 54, 56, 40, 104, 57, 127, 113, 125, 53, 116, 43, 44, 61, 126, 102, 124, 123, 119, 48, 60, 58, 25, 115, 108, 117, 33, 89, 39, 16, 107, 55, 80, 14, 12, 120, 8, 78, 76, 47, 99, 98, 18, 21, 50, 114, 82, 63, 85, 75, 52, 20, 19, 95, 27, 9, 31, 15, 11, 118, 111, 74, 84, 79, 73, 97, 34, 91, 122, 70, 88, 17, 42, 86, 13, 62, 94, 87, 24, 77, 23, 10, 83, 7, 4, 22, 36, 81, 68, 69, 38, 5, 71, 28, 32, 106, 6, 29, 93, 103, 72, 67, 96, 105, 30, 66, 41, 92, 35, 1, 59, 0, 65, 45, 64, 109, 101, 2, 3, 100, 90, 26, 37], [46, 110, 49, 121, 112, 54, 40, 56, 104, 127, 57, 51, 113, 125, 116, 53, 43, 44, 61, 126, 119, 124, 123, 102, 89, 60, 48, 117, 108, 115, 16, 107, 39, 14, 58, 33, 25, 12, 47, 55, 76, 80, 78, 99, 8, 120, 63, 85, 11, 18, 50, 9, 21, 83, 82, 118, 79, 52, 84, 75, 98, 15, 42, 17, 20, 95, 77, 91, 19, 111, 27, 13, 114, 97, 73, 74, 81, 24, 94, 88, 31, 86, 36, 122, 10, 34, 38, 87, 71, 23, 109, 22, 4, 6, 7, 29, 68, 62, 69, 5, 32, 106, 45, 72, 105, 70, 96, 30, 35, 93, 59, 103, 92, 101, 41, 28, 66, 3, 0, 1, 64, 2, 26, 67, 65, 100, 37, 90], [46, 110, 49, 121, 112, 54, 40, 104, 56, 57, 51, 127, 113, 125, 53, 116, 43, 44, 61, 119, 126, 102, 124, 123, 60, 89, 108, 117, 25, 33, 63, 55, 48, 39, 78, 107, 80, 14, 16, 12, 85, 99, 76, 115, 47, 118, 58, 18, 21, 82, 120, 50, 11, 15, 97, 74, 9, 83, 91, 31, 84, 98, 52, 20, 75, 79, 27, 19, 24, 94, 114, 17, 77, 81, 6, 10, 86, 29, 13, 73, 87, 8, 38, 111, 95, 34, 122, 42, 62, 23, 88, 72, 36, 7, 68, 69, 22, 59, 71, 30, 4, 5, 32, 103, 105, 106, 45, 28, 96, 41, 0, 3, 2, 35, 93, 64, 109, 67, 26, 66, 92, 65, 70, 101, 100, 1, 90, 37], [46, 110, 49, 121, 112, 40, 51, 56, 54, 104, 57, 127, 53, 125, 113, 116, 43, 44, 126, 61, 102, 119, 117, 124, 60, 123, 25, 39, 108, 89, 48, 16, 115, 58, 63, 55, 21, 14, 80, 78, 33, 107, 99, 12, 76, 120, 85, 62, 98, 82, 47, 18, 15, 19, 31, 97, 27, 50, 42, 72, 20, 84, 95, 6, 11, 91, 75, 114, 94, 81, 23, 9, 79, 83, 77, 74, 86, 17, 29, 10, 52, 34, 13, 118, 111, 24, 88, 122, 73, 87, 36, 38, 45, 22, 68, 41, 4, 7, 93, 105, 71, 30, 96, 106, 69, 8, 5, 28, 66, 109, 32, 59, 100, 64, 90, 0, 103, 3, 92, 35, 67, 65, 26, 2, 1, 101, 70, 37], [46, 110, 49, 121, 112, 54, 40, 56, 104, 57, 127, 125, 53, 51, 113, 116, 43, 44, 126, 61, 119, 102, 124, 60, 117, 123, 25, 108, 16, 89, 107, 39, 115, 48, 58, 80, 78, 14, 120, 33, 63, 12, 99, 76, 21, 98, 82, 27, 55, 97, 19, 47, 42, 84, 18, 72, 91, 11, 94, 6, 85, 15, 20, 83, 9, 75, 13, 17, 77, 79, 52, 24, 114, 95, 62, 10, 81, 50, 34, 118, 74, 87, 31, 88, 86, 23, 73, 111, 22, 36, 7, 71, 109, 69, 4, 68, 29, 105, 38, 5, 122, 106, 59, 96, 30, 41, 92, 8, 103, 32, 66, 28, 35, 45, 93, 2, 90, 26, 3, 70, 100, 101, 0, 67, 64, 1, 65, 37], [46, 110, 49, 121, 112, 40, 51, 54, 57, 56, 127, 104, 53, 113, 125, 116, 43, 126, 44, 61, 124, 102, 117, 63, 60, 119, 123, 108, 16, 89, 25, 33, 48, 39, 58, 80, 72, 78, 55, 12, 14, 98, 99, 107, 120, 115, 76, 50, 114, 95, 85, 82, 118, 42, 91, 21, 97, 18, 27, 47, 20, 15, 11, 19, 84, 79, 87, 94, 77, 74, 75, 10, 24, 17, 6, 73, 66, 86, 13, 83, 52, 62, 31, 69, 5, 9, 23, 111, 38, 88, 81, 4, 36, 68, 34, 29, 22, 106, 7, 71, 122, 2, 30, 64, 32, 0, 92, 70, 1, 59, 67, 3, 105, 65, 93, 96, 103, 109, 41, 28, 35, 45, 100, 90, 101, 8, 26, 37], [46, 110, 49, 121, 112, 54, 40, 127, 51, 57, 56, 104, 125, 53, 113, 116, 43, 44, 61, 126, 123, 119, 124, 102, 117, 63, 108, 33, 48, 89, 16, 72, 107, 115, 99, 58, 25, 60, 98, 39, 14, 55, 47, 114, 12, 78, 80, 76, 50, 21, 18, 82, 120, 27, 85, 118, 97, 91, 15, 84, 20, 95, 10, 111, 19, 11, 52, 87, 73, 75, 24, 42, 86, 36, 94, 31, 17, 34, 79, 83, 23, 74, 88, 81, 62, 77, 69, 9, 4, 70, 59, 5, 7, 13, 68, 22, 38, 71, 32, 30, 122, 29, 1, 3, 0, 6, 66, 105, 96, 92, 93, 106, 28, 65, 2, 109, 41, 45, 67, 35, 101, 64, 100, 103, 26, 90, 8, 37], [46, 110, 49, 121, 112, 54, 40, 104, 56, 57, 51, 125, 127, 53, 113, 116, 43, 44, 119, 61, 126, 123, 102, 124, 117, 89, 25, 108, 107, 48, 58, 63, 60, 39, 33, 16, 115, 99, 78, 80, 14, 12, 47, 21, 72, 76, 120, 18, 82, 77, 20, 85, 114, 79, 97, 55, 94, 27, 15, 111, 75, 11, 52, 118, 91, 19, 42, 84, 81, 17, 31, 73, 34, 10, 50, 83, 86, 87, 98, 24, 88, 95, 9, 70, 74, 13, 23, 38, 36, 29, 62, 7, 68, 22, 30, 122, 96, 5, 59, 41, 93, 105, 109, 69, 4, 71, 92, 28, 32, 106, 35, 45, 101, 100, 67, 66, 6, 26, 3, 8, 103, 37, 64, 1, 2, 90, 0, 65], [46, 110, 49, 121, 112, 56, 54, 51, 127, 40, 57, 104, 125, 113, 53, 116, 43, 44, 126, 61, 124, 102, 119, 60, 48, 123, 25, 108, 89, 117, 16, 39, 47, 33, 58, 21, 120, 99, 80, 107, 55, 12, 72, 115, 78, 14, 63, 76, 18, 97, 19, 82, 27, 85, 50, 70, 15, 42, 114, 11, 20, 84, 98, 75, 95, 118, 79, 77, 34, 13, 91, 86, 122, 17, 94, 10, 74, 73, 81, 83, 31, 59, 9, 62, 24, 88, 87, 52, 22, 4, 38, 5, 41, 23, 111, 29, 69, 71, 36, 68, 7, 105, 45, 92, 93, 64, 30, 35, 8, 28, 96, 101, 1, 100, 103, 32, 66, 2, 109, 3, 106, 67, 65, 26, 0, 6, 37, 90], [46, 110, 49, 112, 121, 54, 40, 104, 56, 57, 51, 127, 113, 125, 53, 116, 43, 44, 61, 126, 124, 123, 102, 119, 117, 89, 33, 60, 48, 25, 39, 63, 16, 108, 120, 78, 80, 47, 118, 99, 55, 107, 12, 14, 21, 76, 70, 58, 72, 115, 82, 18, 114, 98, 95, 85, 50, 42, 97, 91, 11, 27, 94, 20, 73, 79, 52, 74, 75, 31, 17, 15, 84, 88, 19, 86, 13, 83, 34, 81, 24, 62, 77, 23, 10, 22, 9, 71, 5, 68, 38, 4, 69, 122, 36, 8, 87, 29, 7, 32, 109, 105, 111, 93, 30, 59, 41, 67, 2, 92, 96, 66, 65, 45, 106, 1, 103, 0, 28, 101, 3, 100, 90, 64, 35, 26, 37, 6], [46, 110, 49, 112, 121, 54, 51, 40, 104, 127, 125, 56, 57, 113, 53, 116, 43, 44, 126, 61, 124, 117, 102, 123, 119, 63, 108, 107, 33, 60, 25, 48, 55, 115, 16, 39, 89, 120, 80, 58, 12, 78, 114, 47, 99, 118, 21, 52, 14, 50, 76, 18, 98, 97, 27, 42, 82, 95, 84, 85, 24, 72, 74, 11, 91, 20, 19, 88, 94, 70, 75, 36, 87, 73, 15, 8, 79, 23, 17, 86, 38, 31, 10, 111, 34, 13, 22, 4, 9, 81, 83, 77, 5, 29, 68, 122, 71, 69, 62, 41, 105, 7, 109, 28, 92, 65, 30, 59, 2, 32, 106, 96, 0, 3, 103, 67, 45, 93, 66, 64, 6, 35, 1, 26, 101, 90, 100, 37], [46, 110, 49, 121, 112, 54, 40, 104, 56, 51, 57, 125, 127, 113, 53, 116, 43, 44, 61, 126, 119, 102, 124, 107, 123, 117, 60, 25, 108, 89, 48, 58, 33, 63, 16, 115, 80, 76, 39, 78, 12, 14, 118, 99, 47, 55, 21, 120, 18, 98, 82, 15, 75, 84, 11, 9, 85, 19, 79, 91, 42, 52, 114, 20, 27, 74, 94, 77, 8, 31, 97, 73, 81, 86, 13, 17, 24, 50, 88, 10, 87, 83, 111, 70, 95, 36, 38, 72, 62, 23, 4, 34, 29, 71, 22, 68, 7, 122, 5, 69, 41, 109, 45, 28, 106, 6, 35, 59, 32, 92, 105, 93, 30, 96, 101, 103, 67, 100, 64, 2, 66, 3, 90, 26, 1, 0, 65, 37], [46, 110, 49, 121, 112, 54, 40, 56, 104, 57, 51, 53, 127, 113, 125, 116, 43, 44, 126, 61, 124, 102, 60, 119, 123, 117, 25, 108, 39, 16, 33, 89, 107, 48, 58, 118, 55, 21, 80, 78, 76, 14, 12, 115, 8, 99, 63, 18, 97, 82, 120, 91, 85, 98, 79, 11, 47, 19, 81, 20, 84, 50, 86, 77, 114, 15, 42, 95, 10, 122, 52, 75, 27, 88, 17, 73, 13, 34, 94, 62, 6, 74, 9, 83, 22, 31, 87, 24, 111, 5, 23, 68, 38, 45, 36, 71, 29, 41, 93, 7, 59, 4, 105, 69, 32, 30, 28, 106, 103, 72, 70, 66, 92, 96, 35, 0, 2, 64, 100, 109, 65, 101, 1, 67, 90, 26, 3, 37], [46, 110, 49, 121, 112, 127, 54, 40, 56, 104, 51, 57, 125, 113, 116, 53, 43, 126, 44, 61, 102, 124, 119, 117, 123, 60, 48, 108, 33, 25, 80, 39, 63, 58, 89, 16, 14, 8, 21, 12, 107, 78, 76, 120, 55, 115, 118, 99, 18, 19, 91, 97, 82, 47, 114, 50, 85, 20, 6, 62, 98, 42, 27, 11, 84, 75, 86, 79, 10, 81, 95, 77, 15, 13, 88, 9, 34, 23, 52, 24, 74, 73, 94, 17, 87, 122, 83, 5, 31, 22, 36, 4, 109, 29, 105, 71, 68, 111, 38, 45, 69, 7, 32, 0, 59, 92, 30, 106, 28, 2, 41, 67, 93, 66, 65, 64, 96, 3, 103, 101, 35, 1, 90, 100, 72, 70, 26, 37], [46, 110, 49, 121, 112, 54, 40, 104, 51, 56, 127, 57, 113, 53, 116, 125, 43, 44, 126, 61, 124, 123, 119, 102, 25, 108, 63, 117, 60, 89, 48, 8, 80, 16, 39, 14, 115, 107, 78, 33, 55, 120, 12, 76, 58, 47, 6, 99, 98, 82, 18, 50, 21, 95, 20, 75, 97, 114, 85, 11, 9, 91, 13, 27, 84, 10, 15, 73, 19, 79, 118, 17, 74, 42, 86, 24, 62, 31, 94, 52, 5, 111, 81, 77, 68, 59, 87, 23, 36, 7, 83, 4, 88, 34, 69, 45, 71, 22, 106, 122, 38, 29, 96, 105, 92, 35, 30, 66, 3, 32, 100, 67, 109, 93, 28, 103, 41, 64, 26, 1, 101, 65, 2, 0, 90, 72, 70, 37], [46, 110, 49, 121, 112, 54, 40, 51, 56, 104, 57, 127, 113, 53, 116, 125, 43, 44, 124, 61, 126, 117, 102, 119, 48, 123, 33, 25, 58, 107, 108, 47, 16, 39, 60, 63, 80, 89, 55, 8, 12, 99, 14, 98, 78, 76, 115, 114, 97, 82, 120, 18, 21, 6, 50, 91, 95, 75, 79, 27, 85, 10, 20, 42, 94, 84, 74, 81, 11, 17, 83, 15, 19, 9, 31, 87, 52, 36, 73, 38, 34, 86, 88, 77, 4, 13, 71, 24, 22, 118, 111, 62, 23, 29, 5, 122, 69, 7, 105, 68, 106, 59, 30, 32, 28, 93, 66, 45, 96, 103, 92, 109, 64, 0, 100, 41, 2, 101, 35, 72, 3, 67, 90, 70, 1, 26, 65, 37], [46, 110, 49, 121, 112, 54, 56, 40, 57, 104, 127, 51, 113, 53, 116, 125, 43, 61, 44, 126, 123, 102, 119, 117, 124, 60, 39, 48, 25, 107, 108, 63, 33, 89, 80, 115, 114, 78, 16, 58, 55, 14, 99, 47, 12, 98, 8, 76, 118, 21, 50, 120, 18, 82, 97, 84, 27, 95, 85, 83, 75, 11, 79, 52, 9, 20, 34, 19, 74, 15, 81, 91, 13, 87, 6, 42, 73, 10, 36, 77, 94, 17, 69, 86, 31, 38, 29, 71, 4, 88, 111, 5, 24, 23, 59, 22, 68, 7, 122, 28, 64, 109, 30, 45, 41, 62, 66, 96, 106, 70, 35, 93, 100, 103, 72, 92, 67, 1, 105, 32, 2, 3, 0, 101, 65, 26, 90, 37], [46, 110, 49, 121, 112, 51, 54, 40, 104, 57, 127, 113, 56, 125, 53, 116, 43, 44, 126, 61, 119, 117, 124, 102, 123, 25, 33, 58, 55, 89, 63, 107, 16, 108, 47, 48, 39, 60, 115, 14, 80, 78, 12, 76, 99, 21, 85, 82, 8, 18, 97, 114, 98, 120, 75, 27, 20, 84, 17, 91, 50, 118, 13, 19, 79, 10, 94, 95, 15, 74, 31, 87, 11, 77, 81, 9, 24, 34, 88, 23, 83, 73, 86, 68, 22, 29, 62, 52, 72, 4, 38, 36, 70, 42, 71, 111, 5, 109, 69, 45, 59, 30, 7, 28, 122, 41, 106, 32, 96, 6, 93, 92, 105, 35, 67, 100, 2, 1, 103, 3, 64, 0, 101, 90, 66, 26, 65, 37], [46, 110, 49, 121, 112, 51, 40, 57, 56, 54, 104, 127, 113, 125, 53, 116, 43, 44, 126, 61, 124, 102, 119, 123, 60, 117, 25, 108, 58, 107, 89, 48, 33, 80, 39, 16, 63, 78, 115, 47, 12, 76, 99, 55, 14, 98, 18, 120, 27, 19, 85, 97, 94, 114, 79, 21, 20, 83, 95, 82, 42, 91, 15, 75, 8, 52, 11, 84, 17, 74, 70, 50, 118, 24, 77, 73, 13, 86, 10, 72, 87, 22, 9, 36, 81, 34, 88, 31, 23, 38, 111, 4, 62, 122, 69, 68, 29, 7, 30, 71, 5, 103, 96, 45, 28, 41, 92, 106, 93, 67, 2, 109, 32, 105, 66, 101, 35, 59, 100, 0, 64, 1, 6, 26, 65, 90, 3, 37], [46, 110, 49, 121, 112, 40, 54, 104, 56, 57, 127, 51, 125, 113, 53, 116, 43, 44, 61, 126, 119, 123, 124, 117, 102, 33, 108, 39, 48, 63, 60, 89, 107, 25, 58, 16, 78, 115, 55, 99, 80, 14, 47, 76, 12, 18, 70, 120, 97, 118, 114, 85, 98, 21, 27, 20, 75, 84, 72, 11, 52, 73, 9, 82, 17, 15, 91, 42, 79, 50, 94, 87, 13, 62, 81, 74, 10, 19, 95, 24, 34, 83, 88, 77, 86, 5, 111, 31, 36, 7, 8, 122, 71, 4, 68, 22, 69, 23, 38, 109, 30, 29, 106, 32, 105, 45, 35, 2, 103, 92, 67, 96, 28, 101, 3, 93, 64, 66, 59, 41, 100, 65, 26, 1, 0, 90, 6, 37], [46, 110, 49, 121, 112, 54, 40, 51, 127, 56, 104, 57, 125, 113, 53, 116, 43, 44, 126, 61, 124, 102, 123, 119, 117, 89, 39, 107, 60, 48, 25, 55, 33, 108, 58, 47, 16, 78, 99, 80, 115, 114, 76, 63, 12, 14, 118, 85, 120, 18, 72, 27, 50, 70, 97, 98, 79, 91, 21, 19, 20, 82, 95, 11, 84, 83, 75, 15, 87, 74, 13, 52, 34, 9, 17, 122, 77, 81, 94, 10, 31, 42, 73, 22, 38, 24, 62, 86, 111, 36, 7, 5, 69, 88, 29, 23, 68, 30, 64, 4, 71, 32, 103, 106, 35, 41, 45, 28, 105, 109, 59, 92, 8, 67, 93, 1, 96, 66, 2, 26, 100, 0, 101, 3, 65, 90, 37, 6], [46, 110, 49, 121, 112, 40, 57, 104, 54, 56, 51, 127, 125, 53, 113, 116, 43, 44, 126, 61, 124, 117, 102, 119, 123, 60, 39, 25, 89, 108, 48, 63, 107, 33, 80, 16, 99, 115, 58, 78, 12, 55, 14, 76, 21, 47, 72, 50, 97, 82, 91, 20, 85, 98, 27, 18, 94, 83, 114, 79, 120, 118, 38, 24, 73, 19, 11, 95, 75, 77, 34, 86, 13, 74, 52, 15, 42, 88, 17, 81, 84, 62, 70, 87, 31, 10, 36, 45, 9, 111, 23, 5, 122, 4, 22, 71, 29, 68, 7, 69, 92, 30, 106, 93, 41, 32, 96, 103, 28, 59, 2, 109, 6, 101, 35, 105, 8, 90, 3, 1, 26, 67, 66, 0, 100, 65, 64, 37], [46, 110, 49, 121, 112, 40, 54, 56, 104, 51, 57, 113, 53, 125, 127, 116, 43, 44, 61, 126, 119, 102, 60, 124, 39, 123, 117, 108, 89, 72, 58, 25, 16, 33, 48, 80, 107, 78, 63, 76, 120, 12, 99, 14, 98, 55, 115, 47, 50, 118, 11, 18, 79, 85, 82, 73, 42, 114, 95, 21, 97, 91, 9, 75, 84, 10, 27, 20, 77, 17, 19, 13, 15, 83, 34, 38, 74, 52, 122, 94, 69, 24, 4, 31, 81, 23, 68, 22, 70, 88, 7, 71, 62, 36, 29, 86, 87, 6, 5, 111, 45, 109, 32, 3, 28, 59, 2, 30, 0, 1, 90, 103, 105, 67, 65, 96, 66, 35, 93, 64, 92, 101, 106, 100, 41, 26, 8, 37], [46, 110, 49, 121, 112, 54, 40, 57, 56, 104, 127, 51, 113, 125, 53, 116, 43, 126, 44, 61, 123, 124, 102, 119, 60, 117, 108, 58, 115, 33, 89, 48, 25, 72, 16, 107, 39, 63, 80, 76, 78, 99, 14, 120, 47, 55, 118, 12, 98, 50, 114, 82, 27, 21, 18, 52, 85, 97, 20, 9, 83, 11, 91, 84, 74, 75, 79, 15, 10, 95, 86, 87, 19, 122, 42, 13, 34, 81, 94, 24, 17, 73, 6, 77, 36, 5, 23, 88, 38, 69, 7, 111, 22, 45, 31, 68, 4, 71, 109, 30, 62, 29, 59, 28, 106, 92, 105, 3, 103, 32, 2, 70, 96, 65, 1, 64, 100, 0, 66, 90, 41, 93, 67, 8, 35, 101, 37, 26], [46, 110, 49, 121, 112, 54, 40, 51, 56, 57, 104, 127, 113, 125, 116, 53, 43, 44, 61, 126, 124, 102, 123, 119, 117, 108, 39, 107, 60, 25, 58, 48, 16, 89, 72, 33, 63, 80, 12, 76, 55, 99, 78, 14, 115, 98, 47, 50, 18, 82, 21, 6, 85, 95, 9, 114, 120, 11, 27, 97, 20, 79, 91, 118, 75, 84, 83, 73, 10, 42, 15, 34, 36, 86, 13, 77, 19, 24, 74, 88, 31, 87, 62, 94, 52, 17, 7, 4, 81, 68, 5, 111, 38, 71, 69, 103, 45, 23, 29, 122, 22, 93, 106, 32, 105, 109, 59, 30, 41, 35, 8, 28, 3, 92, 1, 64, 2, 67, 66, 70, 0, 65, 96, 100, 90, 101, 26, 37], [46, 110, 49, 121, 112, 54, 40, 56, 104, 51, 127, 57, 125, 113, 116, 53, 43, 44, 61, 126, 119, 102, 124, 123, 117, 60, 108, 25, 89, 39, 107, 48, 80, 115, 16, 12, 58, 99, 76, 78, 14, 33, 47, 18, 55, 72, 19, 21, 27, 6, 63, 75, 120, 94, 20, 82, 9, 50, 98, 83, 91, 73, 11, 15, 34, 42, 84, 13, 88, 85, 79, 81, 52, 97, 77, 74, 10, 95, 86, 114, 118, 17, 68, 87, 24, 31, 111, 36, 71, 106, 22, 62, 23, 5, 7, 38, 29, 109, 93, 4, 30, 8, 103, 122, 35, 69, 45, 92, 41, 59, 105, 32, 66, 67, 100, 28, 64, 0, 65, 3, 96, 2, 101, 90, 26, 70, 1, 37], [46, 110, 49, 112, 121, 54, 51, 40, 57, 56, 104, 127, 125, 113, 53, 116, 43, 44, 126, 61, 124, 102, 60, 123, 119, 117, 89, 58, 108, 48, 55, 39, 63, 33, 25, 80, 107, 99, 50, 16, 14, 47, 78, 76, 12, 18, 114, 115, 120, 42, 21, 85, 118, 83, 19, 98, 95, 27, 34, 97, 52, 82, 79, 91, 20, 84, 75, 6, 77, 94, 15, 11, 73, 17, 10, 13, 31, 81, 38, 72, 9, 88, 86, 62, 87, 74, 24, 8, 22, 59, 106, 36, 23, 122, 29, 5, 68, 7, 71, 4, 111, 93, 41, 30, 92, 103, 69, 109, 45, 32, 96, 100, 66, 35, 65, 90, 1, 64, 0, 28, 105, 101, 3, 67, 26, 37, 70, 2]], "model.layers.3.self_attn.q_proj": [[40, 60, 98, 56, 52, 57, 119, 62, 32, 58, 49, 37, 113, 118, 53, 59, 95, 123, 48, 105, 89, 50, 24, 45, 122, 63, 120, 25, 101, 26, 30, 93, 39, 86, 51, 61, 85, 42, 34, 17, 38, 109, 111, 102, 35, 107, 36, 121, 114, 43, 116, 21, 126, 103, 54, 83, 84, 125, 29, 112, 108, 55, 46, 97, 96, 115, 92, 127, 110, 124, 31, 99, 47, 44, 33, 100, 41, 91, 88, 94, 15, 106, 20, 18, 117, 90, 27, 28, 82, 87, 23, 22, 81, 104, 80, 11, 78, 75, 76, 79, 74, 77, 10, 19, 8, 72, 14, 12, 68, 13, 65, 71, 5, 73, 16, 4, 2, 67, 69, 6, 9, 7, 70, 3, 1, 66, 0, 64], [119, 62, 40, 98, 121, 52, 123, 58, 53, 48, 50, 118, 38, 25, 49, 30, 29, 54, 39, 63, 60, 57, 84, 56, 122, 59, 42, 115, 86, 102, 112, 33, 27, 37, 93, 113, 116, 28, 32, 111, 107, 89, 83, 15, 46, 108, 114, 120, 103, 110, 125, 101, 80, 18, 88, 36, 45, 41, 34, 47, 21, 43, 127, 24, 55, 51, 117, 44, 23, 109, 14, 124, 61, 100, 105, 97, 92, 106, 126, 73, 35, 26, 104, 12, 72, 19, 99, 31, 94, 17, 22, 95, 85, 90, 91, 96, 74, 20, 67, 70, 76, 79, 87, 81, 11, 10, 75, 71, 2, 13, 9, 5, 4, 65, 82, 68, 78, 7, 8, 0, 77, 3, 1, 66, 6, 64, 16, 69], [40, 98, 56, 60, 62, 119, 86, 29, 52, 58, 83, 14, 80, 57, 12, 54, 59, 10, 48, 93, 17, 15, 50, 63, 24, 25, 125, 4, 123, 118, 121, 101, 9, 120, 74, 113, 72, 26, 5, 85, 114, 51, 106, 33, 99, 53, 70, 45, 122, 95, 73, 71, 78, 76, 11, 16, 30, 81, 19, 22, 89, 104, 32, 103, 108, 21, 20, 88, 66, 67, 90, 84, 7, 2, 41, 49, 18, 111, 13, 1, 39, 23, 38, 87, 110, 75, 43, 36, 27, 6, 0, 82, 77, 79, 69, 37, 47, 8, 3, 97, 100, 102, 94, 92, 126, 96, 127, 34, 91, 42, 61, 28, 109, 115, 117, 35, 31, 116, 64, 105, 112, 68, 46, 55, 107, 65, 44, 124], [58, 40, 98, 119, 62, 60, 52, 29, 57, 86, 56, 48, 12, 80, 83, 59, 14, 24, 50, 51, 10, 9, 54, 72, 95, 73, 4, 19, 11, 71, 114, 116, 93, 66, 100, 76, 69, 63, 123, 32, 2, 49, 70, 106, 0, 125, 113, 16, 118, 68, 91, 5, 26, 7, 67, 75, 82, 53, 39, 90, 127, 79, 28, 120, 99, 25, 23, 103, 109, 27, 94, 22, 55, 111, 81, 13, 77, 108, 89, 18, 61, 117, 47, 46, 85, 87, 8, 121, 78, 88, 38, 36, 21, 31, 107, 3, 17, 37, 20, 30, 42, 15, 35, 122, 124, 96, 112, 84, 92, 45, 102, 43, 105, 101, 97, 33, 44, 64, 104, 126, 74, 6, 41, 115, 1, 110, 34, 65], [38, 97, 111, 120, 123, 122, 47, 84, 81, 10, 14, 71, 12, 8, 2, 67, 69, 58, 1, 0, 87, 88, 76, 19, 65, 83, 93, 106, 17, 86, 57, 34, 53, 11, 42, 32, 25, 118, 26, 20, 85, 99, 96, 56, 80, 49, 66, 31, 15, 78, 3, 105, 95, 89, 112, 109, 94, 52, 124, 101, 98, 110, 30, 126, 90, 92, 37, 22, 64, 55, 115, 82, 91, 103, 79, 44, 48, 74, 73, 119, 113, 60, 27, 45, 36, 4, 39, 29, 54, 62, 41, 61, 100, 21, 121, 108, 35, 63, 28, 51, 72, 46, 7, 18, 114, 127, 107, 125, 75, 77, 117, 16, 59, 23, 50, 43, 116, 104, 40, 13, 70, 5, 9, 24, 6, 68, 33, 102], [38, 97, 111, 122, 123, 120, 47, 81, 23, 84, 14, 71, 12, 10, 8, 69, 67, 75, 19, 73, 25, 7, 3, 1, 56, 76, 21, 105, 13, 78, 58, 5, 65, 2, 49, 79, 64, 4, 94, 70, 66, 30, 113, 0, 31, 98, 6, 112, 72, 16, 126, 86, 61, 114, 87, 52, 15, 88, 43, 35, 74, 42, 124, 62, 119, 63, 68, 37, 11, 104, 121, 45, 118, 17, 99, 20, 110, 57, 93, 82, 24, 26, 90, 77, 80, 32, 28, 22, 103, 85, 117, 46, 36, 9, 51, 106, 125, 18, 108, 92, 59, 83, 55, 115, 89, 60, 53, 41, 29, 44, 107, 33, 100, 95, 39, 127, 48, 54, 27, 101, 116, 50, 40, 91, 109, 96, 102, 34], [38, 111, 97, 123, 120, 122, 47, 81, 14, 84, 23, 12, 10, 8, 69, 71, 59, 75, 58, 3, 1, 86, 67, 16, 11, 40, 56, 127, 88, 25, 24, 105, 50, 31, 32, 87, 51, 9, 90, 45, 116, 76, 43, 73, 110, 74, 5, 125, 22, 21, 106, 4, 2, 62, 82, 19, 91, 78, 13, 98, 72, 113, 26, 93, 27, 28, 48, 100, 20, 83, 99, 66, 94, 15, 53, 117, 39, 17, 95, 42, 80, 49, 104, 108, 63, 65, 0, 118, 96, 126, 55, 60, 33, 92, 61, 77, 37, 85, 114, 30, 107, 35, 52, 119, 46, 44, 79, 103, 6, 41, 34, 109, 7, 29, 57, 18, 36, 115, 112, 64, 54, 101, 121, 89, 70, 102, 124, 68], [38, 97, 111, 123, 122, 120, 47, 84, 81, 14, 23, 12, 10, 71, 24, 8, 69, 67, 3, 1, 55, 105, 93, 76, 44, 59, 78, 86, 7, 56, 58, 70, 73, 16, 61, 107, 25, 30, 2, 21, 11, 57, 60, 126, 6, 51, 68, 4, 89, 80, 88, 79, 17, 104, 72, 32, 35, 20, 19, 74, 66, 95, 46, 41, 42, 26, 96, 43, 94, 75, 0, 98, 116, 106, 40, 15, 22, 48, 5, 50, 54, 9, 29, 63, 87, 124, 109, 114, 118, 110, 82, 37, 53, 112, 113, 27, 125, 45, 92, 121, 91, 85, 77, 34, 117, 62, 39, 49, 101, 65, 103, 119, 31, 90, 83, 28, 100, 115, 99, 36, 64, 52, 127, 18, 13, 108, 33, 102], [41, 34, 18, 20, 52, 76, 89, 79, 22, 16, 14, 120, 114, 62, 8, 10, 105, 46, 47, 63, 81, 126, 116, 69, 13, 125, 117, 7, 9, 127, 3, 59, 123, 72, 71, 5, 80, 23, 28, 73, 53, 70, 112, 12, 74, 50, 119, 24, 78, 19, 113, 6, 49, 66, 58, 15, 2, 110, 25, 75, 82, 54, 17, 109, 107, 95, 85, 86, 67, 11, 83, 94, 111, 118, 104, 60, 29, 37, 55, 61, 32, 93, 122, 27, 88, 68, 1, 39, 56, 115, 84, 64, 38, 42, 31, 43, 33, 101, 102, 87, 21, 26, 44, 103, 100, 0, 97, 35, 96, 121, 57, 48, 91, 92, 77, 45, 108, 106, 124, 4, 99, 51, 30, 90, 40, 65, 36, 98], [41, 34, 89, 52, 20, 22, 120, 18, 47, 46, 63, 114, 126, 80, 116, 76, 62, 105, 72, 14, 28, 78, 125, 117, 11, 79, 123, 82, 127, 13, 85, 7, 59, 74, 77, 10, 12, 19, 84, 58, 21, 15, 49, 9, 107, 119, 53, 54, 90, 24, 86, 81, 113, 5, 109, 73, 25, 95, 112, 29, 8, 23, 92, 103, 27, 104, 30, 37, 50, 61, 102, 39, 16, 88, 69, 96, 35, 32, 94, 3, 31, 71, 26, 110, 111, 101, 55, 33, 60, 124, 51, 93, 36, 83, 17, 38, 66, 57, 118, 43, 87, 42, 115, 98, 91, 75, 40, 4, 56, 99, 1, 108, 44, 122, 97, 100, 121, 6, 67, 68, 65, 45, 48, 106, 70, 2, 64, 0], [41, 34, 52, 81, 22, 18, 114, 20, 126, 76, 79, 120, 14, 10, 62, 69, 47, 116, 8, 3, 105, 63, 5, 46, 117, 125, 127, 72, 7, 9, 89, 54, 66, 123, 24, 1, 70, 6, 59, 74, 53, 71, 16, 113, 49, 28, 2, 64, 58, 77, 86, 110, 32, 73, 0, 119, 27, 68, 85, 50, 107, 112, 78, 12, 67, 17, 87, 83, 15, 13, 80, 84, 26, 104, 109, 61, 25, 92, 90, 60, 96, 118, 56, 38, 65, 57, 75, 95, 19, 45, 43, 82, 11, 111, 91, 99, 88, 35, 93, 33, 101, 31, 97, 122, 21, 29, 30, 37, 102, 39, 40, 100, 44, 42, 23, 106, 94, 55, 36, 124, 121, 115, 48, 103, 51, 4, 108, 98], [41, 34, 20, 18, 22, 52, 10, 76, 14, 6, 67, 62, 46, 120, 79, 126, 125, 114, 3, 116, 47, 105, 7, 8, 63, 9, 69, 127, 54, 2, 28, 117, 59, 58, 25, 0, 13, 123, 70, 110, 72, 80, 77, 112, 12, 49, 24, 90, 65, 75, 44, 113, 11, 119, 53, 71, 74, 23, 73, 91, 104, 35, 32, 78, 85, 15, 64, 87, 86, 89, 84, 88, 45, 66, 61, 21, 26, 5, 17, 68, 95, 60, 1, 42, 121, 83, 51, 122, 92, 100, 107, 56, 103, 109, 36, 106, 111, 94, 81, 48, 82, 38, 50, 55, 43, 16, 37, 19, 31, 102, 27, 29, 108, 124, 57, 97, 39, 93, 101, 99, 40, 115, 33, 96, 118, 30, 4, 98], [103, 117, 125, 109, 46, 56, 122, 97, 58, 52, 63, 57, 47, 28, 126, 59, 115, 89, 39, 21, 124, 119, 51, 49, 116, 62, 110, 60, 113, 55, 114, 111, 54, 86, 50, 87, 112, 123, 127, 61, 121, 53, 81, 33, 82, 48, 118, 108, 45, 120, 24, 107, 20, 44, 41, 31, 42, 19, 105, 106, 79, 43, 13, 96, 37, 30, 40, 100, 102, 104, 101, 99, 38, 36, 94, 34, 35, 95, 98, 32, 73, 80, 92, 93, 90, 17, 22, 91, 29, 76, 12, 26, 78, 88, 25, 71, 70, 27, 16, 83, 7, 9, 10, 85, 77, 4, 23, 68, 18, 84, 74, 5, 6, 8, 15, 3, 14, 67, 66, 0, 11, 65, 75, 69, 1, 64, 72, 2], [103, 117, 52, 109, 125, 56, 97, 122, 46, 119, 57, 28, 63, 58, 89, 21, 82, 115, 9, 47, 12, 78, 87, 126, 80, 20, 124, 33, 49, 116, 13, 62, 19, 110, 31, 83, 114, 59, 86, 54, 17, 111, 92, 61, 123, 127, 50, 60, 51, 94, 30, 55, 25, 6, 112, 74, 107, 81, 79, 71, 118, 77, 90, 11, 108, 121, 7, 24, 113, 53, 39, 48, 16, 101, 15, 18, 44, 32, 88, 95, 73, 4, 67, 22, 36, 45, 29, 84, 76, 69, 43, 41, 8, 68, 105, 37, 23, 26, 91, 85, 70, 5, 104, 93, 99, 14, 34, 75, 10, 100, 96, 120, 42, 106, 35, 3, 65, 38, 102, 66, 0, 27, 40, 98, 1, 2, 72, 64], [103, 117, 52, 125, 109, 97, 119, 122, 21, 89, 56, 28, 46, 59, 60, 20, 78, 82, 79, 87, 58, 47, 80, 55, 9, 124, 113, 50, 126, 121, 57, 12, 114, 123, 111, 62, 7, 74, 11, 63, 25, 118, 51, 53, 61, 92, 15, 54, 4, 22, 90, 116, 31, 112, 19, 110, 30, 115, 127, 17, 107, 33, 83, 70, 49, 95, 8, 13, 91, 98, 93, 6, 69, 24, 81, 44, 94, 77, 32, 71, 48, 67, 88, 73, 45, 38, 23, 99, 43, 42, 86, 29, 26, 66, 104, 84, 37, 5, 76, 101, 96, 102, 120, 39, 105, 41, 106, 34, 108, 100, 40, 1, 27, 16, 36, 10, 85, 68, 35, 18, 14, 0, 75, 2, 65, 64, 72, 3], [103, 117, 52, 46, 97, 125, 109, 119, 56, 122, 21, 63, 82, 78, 80, 87, 58, 9, 83, 89, 20, 12, 28, 92, 57, 8, 11, 6, 126, 15, 59, 75, 25, 79, 67, 124, 54, 74, 16, 13, 81, 114, 49, 62, 76, 115, 71, 123, 31, 53, 110, 68, 69, 60, 85, 90, 18, 19, 77, 95, 30, 14, 23, 127, 66, 111, 50, 72, 48, 33, 24, 84, 113, 10, 64, 70, 73, 51, 5, 3, 47, 112, 61, 4, 26, 22, 86, 55, 17, 93, 1, 2, 88, 27, 29, 0, 94, 118, 107, 91, 32, 96, 44, 99, 34, 65, 121, 108, 116, 101, 7, 41, 98, 100, 36, 35, 102, 106, 104, 120, 37, 43, 42, 38, 40, 105, 45, 39], [104, 33, 108, 20, 18, 23, 16, 13, 10, 44, 72, 68, 70, 48, 56, 107, 63, 55, 40, 117, 54, 84, 116, 57, 90, 21, 50, 12, 66, 14, 110, 31, 115, 51, 11, 9, 58, 73, 78, 61, 109, 114, 87, 111, 96, 121, 123, 25, 52, 120, 79, 1, 5, 82, 26, 100, 86, 43, 74, 125, 127, 59, 94, 119, 83, 118, 3, 76, 15, 126, 2, 38, 92, 45, 77, 34, 6, 8, 22, 102, 41, 62, 24, 88, 95, 71, 112, 36, 7, 60, 105, 106, 93, 80, 89, 42, 99, 85, 113, 47, 67, 101, 27, 39, 17, 98, 81, 46, 29, 103, 69, 53, 28, 30, 19, 35, 4, 91, 49, 124, 75, 122, 37, 32, 64, 65, 97, 0], [104, 108, 33, 65, 10, 13, 68, 18, 70, 1, 16, 64, 7, 44, 23, 67, 66, 87, 40, 0, 72, 20, 4, 56, 110, 69, 111, 116, 120, 3, 55, 5, 50, 12, 115, 6, 57, 61, 11, 123, 71, 54, 58, 84, 113, 77, 112, 48, 8, 2, 19, 62, 125, 114, 117, 51, 100, 43, 124, 52, 118, 78, 38, 14, 26, 9, 99, 80, 97, 63, 31, 96, 45, 24, 83, 49, 79, 21, 59, 107, 127, 29, 119, 74, 30, 60, 88, 109, 39, 122, 121, 89, 15, 85, 92, 36, 91, 41, 86, 32, 101, 25, 106, 94, 37, 93, 126, 34, 42, 90, 75, 82, 105, 53, 17, 73, 98, 103, 76, 47, 22, 27, 46, 28, 35, 81, 102, 95], [104, 108, 33, 64, 0, 44, 70, 13, 68, 10, 7, 66, 67, 16, 20, 18, 87, 1, 40, 23, 65, 72, 111, 56, 50, 110, 57, 55, 5, 120, 48, 116, 4, 84, 113, 115, 11, 123, 12, 61, 71, 51, 114, 100, 3, 117, 58, 21, 6, 118, 43, 125, 2, 80, 62, 45, 77, 52, 96, 75, 54, 69, 109, 86, 8, 126, 41, 38, 124, 121, 39, 47, 76, 101, 74, 27, 59, 89, 25, 99, 31, 35, 28, 79, 24, 81, 107, 83, 82, 34, 78, 92, 90, 60, 95, 37, 19, 94, 14, 9, 98, 127, 49, 17, 91, 29, 53, 30, 88, 106, 63, 85, 26, 112, 42, 36, 102, 32, 105, 46, 122, 103, 22, 93, 15, 119, 73, 97], [104, 108, 33, 23, 16, 18, 10, 44, 20, 13, 7, 72, 12, 116, 55, 87, 40, 110, 61, 67, 68, 107, 115, 117, 56, 70, 48, 114, 69, 50, 57, 125, 109, 1, 66, 118, 123, 120, 60, 54, 124, 111, 21, 81, 83, 25, 58, 126, 78, 64, 11, 59, 29, 63, 9, 88, 28, 90, 43, 32, 52, 46, 42, 62, 2, 8, 85, 47, 101, 73, 27, 89, 15, 113, 84, 76, 49, 103, 91, 99, 31, 24, 14, 3, 119, 86, 22, 98, 105, 122, 121, 36, 75, 79, 41, 106, 45, 30, 94, 127, 92, 80, 82, 93, 19, 96, 112, 77, 53, 17, 38, 6, 26, 95, 102, 39, 34, 100, 37, 4, 51, 71, 35, 5, 74, 97, 65, 0], [41, 46, 53, 127, 40, 97, 60, 105, 91, 31, 119, 39, 122, 47, 27, 112, 38, 24, 116, 99, 36, 115, 118, 42, 93, 57, 117, 33, 43, 125, 51, 126, 35, 110, 85, 55, 18, 114, 90, 87, 102, 88, 92, 96, 111, 100, 29, 124, 20, 109, 59, 49, 52, 95, 123, 63, 50, 56, 81, 44, 23, 30, 106, 48, 104, 79, 120, 108, 98, 25, 80, 107, 62, 121, 34, 37, 22, 94, 77, 82, 19, 113, 14, 26, 9, 58, 61, 28, 45, 54, 101, 75, 21, 83, 103, 13, 16, 86, 32, 84, 89, 74, 11, 76, 17, 5, 72, 15, 6, 66, 4, 73, 71, 2, 69, 10, 67, 70, 1, 68, 78, 64, 12, 0, 7, 8, 65, 3], [40, 97, 53, 127, 46, 24, 85, 31, 110, 14, 81, 75, 88, 60, 19, 90, 93, 41, 76, 47, 115, 71, 80, 74, 9, 4, 122, 82, 83, 119, 21, 91, 43, 38, 6, 104, 96, 55, 105, 42, 112, 27, 3, 72, 73, 33, 68, 101, 100, 37, 22, 26, 63, 124, 108, 13, 29, 54, 62, 126, 69, 30, 25, 125, 98, 114, 84, 15, 66, 117, 23, 45, 10, 77, 70, 118, 51, 16, 35, 106, 79, 87, 113, 17, 92, 34, 116, 1, 102, 89, 52, 44, 103, 12, 28, 48, 99, 56, 2, 20, 64, 111, 78, 36, 18, 123, 7, 61, 11, 94, 39, 65, 32, 50, 95, 86, 49, 8, 5, 59, 109, 57, 67, 121, 0, 107, 58, 120], [41, 110, 40, 53, 97, 127, 60, 43, 46, 38, 31, 47, 112, 91, 92, 35, 116, 52, 19, 122, 113, 33, 124, 119, 117, 26, 93, 59, 39, 27, 24, 125, 108, 63, 86, 56, 123, 95, 118, 57, 115, 49, 29, 126, 48, 45, 58, 88, 28, 76, 100, 96, 55, 42, 114, 18, 80, 85, 121, 99, 13, 87, 103, 37, 36, 22, 106, 51, 50, 111, 61, 90, 98, 109, 94, 12, 62, 105, 54, 14, 25, 107, 21, 20, 30, 44, 83, 82, 77, 101, 104, 70, 89, 6, 34, 102, 120, 84, 32, 81, 79, 23, 74, 16, 75, 15, 72, 9, 67, 3, 10, 78, 5, 71, 2, 0, 66, 73, 4, 65, 11, 8, 1, 69, 7, 68, 17, 64], [40, 46, 127, 97, 53, 43, 31, 38, 67, 41, 60, 75, 5, 14, 85, 71, 81, 65, 110, 24, 0, 9, 39, 90, 1, 19, 104, 47, 115, 112, 35, 91, 77, 21, 27, 88, 26, 17, 72, 7, 59, 102, 121, 124, 80, 93, 23, 79, 10, 76, 125, 48, 44, 2, 70, 8, 105, 100, 118, 54, 3, 56, 52, 103, 92, 64, 69, 116, 4, 12, 82, 57, 96, 87, 78, 49, 99, 36, 119, 42, 68, 74, 98, 11, 15, 117, 114, 18, 25, 86, 126, 37, 63, 28, 106, 122, 66, 34, 108, 120, 83, 84, 32, 62, 50, 94, 73, 107, 95, 22, 111, 30, 51, 113, 55, 16, 58, 61, 123, 109, 101, 20, 13, 45, 33, 89, 6, 29], [39, 124, 34, 117, 119, 47, 24, 62, 109, 20, 15, 13, 48, 26, 17, 11, 53, 70, 9, 72, 55, 19, 1, 65, 73, 28, 10, 78, 69, 123, 5, 21, 118, 68, 86, 83, 106, 81, 71, 49, 95, 112, 122, 25, 87, 31, 79, 116, 12, 92, 85, 18, 88, 27, 35, 115, 30, 90, 107, 14, 75, 8, 89, 7, 77, 66, 74, 22, 23, 94, 84, 45, 80, 91, 33, 3, 61, 6, 67, 76, 110, 52, 16, 82, 121, 2, 64, 4, 50, 93, 54, 29, 32, 43, 0, 97, 99, 41, 51, 111, 100, 63, 46, 37, 101, 36, 38, 56, 96, 59, 44, 127, 60, 40, 120, 114, 102, 42, 98, 113, 57, 105, 58, 108, 126, 104, 103, 125], [109, 119, 39, 62, 117, 34, 48, 124, 45, 61, 20, 30, 26, 94, 86, 110, 92, 24, 115, 17, 112, 116, 28, 122, 103, 19, 118, 114, 98, 15, 57, 123, 78, 51, 58, 125, 42, 63, 27, 53, 91, 55, 22, 47, 49, 126, 87, 95, 83, 54, 43, 127, 11, 59, 9, 44, 113, 111, 33, 121, 14, 38, 60, 50, 56, 52, 46, 108, 16, 68, 76, 41, 29, 32, 99, 89, 40, 13, 107, 105, 71, 106, 104, 102, 73, 90, 25, 21, 96, 37, 23, 100, 120, 72, 93, 35, 36, 101, 97, 31, 82, 80, 10, 85, 18, 84, 4, 70, 88, 69, 81, 7, 67, 75, 64, 0, 77, 74, 79, 2, 12, 8, 1, 3, 66, 5, 65, 6], [48, 39, 62, 34, 119, 109, 124, 117, 24, 47, 20, 11, 15, 13, 17, 94, 70, 73, 85, 10, 9, 68, 7, 86, 53, 80, 115, 19, 61, 118, 123, 45, 28, 22, 72, 121, 25, 55, 1, 3, 110, 93, 54, 92, 29, 89, 106, 30, 31, 69, 75, 112, 87, 95, 88, 66, 50, 12, 79, 26, 27, 81, 77, 16, 71, 82, 14, 60, 91, 64, 4, 43, 122, 84, 21, 40, 114, 90, 8, 23, 96, 0, 67, 18, 116, 76, 107, 57, 46, 83, 74, 97, 2, 32, 33, 102, 6, 100, 52, 111, 108, 37, 99, 105, 63, 78, 35, 44, 56, 49, 41, 36, 38, 65, 126, 127, 101, 42, 120, 51, 104, 125, 113, 58, 59, 5, 98, 103], [48, 39, 62, 124, 119, 109, 117, 34, 47, 53, 26, 30, 49, 86, 116, 45, 42, 112, 123, 20, 115, 63, 61, 55, 50, 125, 24, 111, 92, 110, 58, 33, 43, 122, 107, 118, 80, 54, 29, 114, 44, 41, 94, 46, 93, 96, 57, 40, 106, 52, 25, 120, 56, 60, 37, 113, 127, 73, 121, 51, 108, 36, 126, 104, 35, 22, 28, 98, 100, 105, 87, 82, 17, 83, 32, 102, 38, 99, 31, 59, 101, 91, 97, 95, 19, 23, 15, 27, 103, 89, 90, 85, 16, 76, 13, 21, 71, 78, 84, 74, 81, 9, 6, 14, 79, 18, 75, 72, 88, 11, 12, 10, 5, 77, 67, 68, 4, 8, 7, 70, 3, 2, 1, 69, 0, 65, 66, 64], [40, 97, 119, 31, 81, 23, 19, 52, 76, 85, 79, 111, 112, 93, 53, 110, 7, 73, 59, 61, 57, 104, 113, 117, 5, 126, 60, 58, 27, 3, 44, 54, 107, 48, 86, 13, 43, 24, 91, 114, 1, 10, 55, 67, 28, 115, 127, 78, 123, 50, 30, 90, 71, 15, 116, 56, 109, 82, 121, 21, 83, 94, 88, 108, 101, 12, 80, 87, 74, 103, 75, 29, 72, 77, 96, 47, 42, 17, 35, 16, 39, 84, 25, 11, 37, 100, 45, 26, 33, 122, 6, 22, 118, 124, 0, 34, 92, 63, 46, 8, 70, 65, 125, 36, 68, 106, 120, 18, 98, 69, 9, 99, 4, 89, 62, 32, 102, 105, 51, 38, 64, 20, 49, 14, 41, 95, 66, 2], [40, 97, 119, 31, 23, 81, 52, 13, 111, 53, 19, 79, 58, 112, 85, 25, 110, 60, 61, 117, 55, 123, 54, 24, 76, 57, 51, 126, 84, 20, 93, 113, 59, 73, 114, 104, 121, 88, 27, 91, 33, 108, 29, 47, 82, 18, 43, 26, 7, 28, 41, 86, 90, 72, 99, 103, 44, 109, 63, 107, 45, 77, 94, 122, 75, 30, 87, 70, 34, 105, 5, 21, 100, 42, 78, 74, 22, 127, 120, 50, 35, 101, 116, 32, 69, 96, 83, 56, 48, 14, 1, 115, 8, 15, 89, 38, 46, 124, 125, 17, 80, 37, 68, 49, 98, 12, 92, 16, 11, 36, 6, 118, 39, 9, 62, 102, 106, 2, 67, 95, 71, 0, 3, 66, 10, 4, 64, 65], [40, 97, 119, 31, 81, 85, 52, 23, 19, 111, 24, 79, 76, 73, 126, 57, 112, 54, 5, 93, 58, 53, 44, 104, 110, 117, 13, 48, 7, 59, 108, 107, 60, 15, 70, 123, 61, 113, 120, 34, 55, 114, 43, 25, 29, 3, 89, 1, 50, 109, 12, 30, 28, 6, 88, 82, 83, 116, 22, 39, 78, 91, 42, 56, 87, 2, 80, 121, 74, 69, 75, 127, 26, 51, 27, 115, 21, 68, 37, 103, 32, 46, 101, 94, 45, 47, 118, 99, 66, 62, 38, 122, 100, 124, 35, 20, 102, 77, 86, 64, 14, 41, 11, 96, 90, 92, 17, 33, 106, 105, 98, 16, 9, 63, 36, 10, 65, 8, 84, 125, 0, 49, 4, 71, 72, 18, 95, 67], [40, 97, 119, 31, 26, 81, 52, 19, 85, 79, 23, 112, 126, 111, 73, 110, 13, 76, 53, 57, 25, 58, 93, 55, 104, 114, 54, 78, 5, 117, 96, 51, 60, 123, 59, 7, 113, 99, 61, 42, 20, 27, 88, 84, 82, 45, 77, 43, 1, 125, 91, 87, 48, 44, 116, 24, 107, 109, 3, 70, 80, 102, 16, 94, 15, 69, 106, 21, 2, 9, 90, 56, 127, 121, 115, 83, 75, 12, 89, 33, 47, 122, 28, 101, 120, 92, 36, 34, 22, 17, 29, 6, 74, 41, 103, 108, 49, 32, 30, 63, 62, 14, 11, 86, 100, 105, 64, 0, 10, 67, 46, 35, 68, 118, 50, 39, 98, 37, 124, 66, 72, 38, 71, 4, 65, 18, 8, 95]], "model.layers.3.self_attn.k_proj": [[104, 34, 93, 56, 62, 119, 60, 58, 80, 83, 24, 14, 86, 12, 10, 0, 26, 70, 5, 2, 67, 125, 72, 54, 116, 114, 77, 25, 85, 73, 22, 9, 6, 71, 118, 115, 18, 13, 42, 112, 96, 32, 37, 68, 95, 117, 8, 92, 29, 49, 41, 51, 65, 84, 59, 7, 113, 61, 111, 100, 43, 105, 17, 103, 45, 127, 35, 121, 108, 55, 99, 110, 57, 109, 124, 30, 89, 75, 106, 122, 48, 107, 33, 63, 88, 50, 91, 28, 21, 120, 31, 76, 87, 101, 46, 102, 44, 47, 38, 82, 1, 15, 20, 78, 53, 27, 126, 97, 79, 4, 64, 123, 39, 36, 23, 52, 69, 94, 11, 90, 74, 81, 66, 3, 16, 19, 98, 40], [47, 122, 102, 123, 120, 64, 65, 69, 33, 8, 10, 12, 71, 81, 84, 67, 14, 23, 68, 66, 0, 2, 1, 75, 5, 3, 70, 24, 86, 113, 25, 13, 6, 19, 73, 43, 72, 88, 124, 87, 9, 42, 21, 40, 118, 59, 119, 111, 41, 38, 61, 103, 105, 39, 83, 79, 117, 49, 100, 114, 27, 15, 18, 16, 116, 60, 104, 82, 125, 121, 45, 50, 110, 53, 11, 108, 52, 37, 85, 55, 4, 109, 106, 127, 48, 115, 92, 96, 98, 58, 63, 28, 30, 89, 26, 107, 90, 31, 54, 80, 36, 77, 46, 95, 99, 56, 62, 51, 57, 112, 91, 29, 34, 35, 126, 94, 44, 22, 101, 20, 93, 74, 7, 17, 78, 32, 76, 97], [105, 98, 52, 79, 18, 9, 0, 20, 22, 76, 7, 111, 69, 110, 14, 120, 10, 2, 62, 50, 63, 67, 1, 4, 126, 53, 6, 127, 41, 59, 114, 125, 24, 113, 65, 43, 11, 8, 46, 48, 89, 123, 19, 77, 117, 119, 54, 58, 81, 90, 70, 17, 47, 60, 68, 95, 28, 64, 3, 40, 122, 112, 72, 91, 71, 108, 75, 85, 61, 44, 83, 51, 121, 124, 73, 115, 32, 93, 116, 66, 30, 74, 97, 29, 45, 87, 57, 33, 94, 38, 16, 55, 99, 39, 23, 21, 100, 31, 26, 88, 109, 118, 96, 56, 27, 101, 42, 103, 49, 104, 102, 78, 106, 35, 92, 37, 107, 36, 80, 25, 86, 82, 12, 15, 5, 13, 84, 34], [39, 52, 33, 45, 125, 110, 28, 74, 119, 87, 117, 46, 78, 89, 4, 21, 12, 80, 0, 66, 11, 86, 8, 17, 71, 82, 20, 1, 7, 58, 62, 65, 79, 24, 6, 53, 67, 57, 95, 3, 90, 94, 64, 72, 127, 5, 56, 50, 54, 9, 116, 126, 63, 48, 108, 55, 123, 19, 75, 43, 109, 61, 92, 69, 68, 15, 118, 49, 73, 59, 13, 60, 106, 122, 51, 77, 88, 41, 120, 113, 124, 115, 101, 112, 107, 44, 84, 32, 42, 105, 37, 121, 96, 114, 83, 102, 104, 30, 14, 111, 10, 16, 100, 40, 38, 29, 99, 35, 34, 47, 36, 2, 98, 31, 91, 27, 81, 76, 93, 26, 18, 103, 22, 70, 97, 23, 85, 25], [44, 40, 64, 97, 108, 13, 1, 112, 16, 10, 56, 23, 46, 50, 67, 18, 70, 66, 47, 7, 68, 69, 123, 2, 104, 61, 72, 20, 52, 55, 57, 120, 51, 12, 3, 125, 116, 118, 58, 119, 117, 54, 49, 109, 4, 124, 14, 73, 11, 39, 53, 6, 43, 115, 114, 86, 75, 21, 5, 85, 45, 60, 126, 62, 78, 22, 65, 107, 41, 90, 83, 42, 81, 36, 15, 76, 27, 122, 38, 19, 127, 99, 71, 95, 25, 106, 91, 89, 111, 110, 88, 79, 121, 103, 92, 17, 63, 29, 98, 31, 24, 96, 105, 26, 93, 30, 28, 59, 113, 94, 32, 84, 102, 101, 37, 77, 35, 87, 48, 9, 34, 100, 74, 82, 0, 80, 8, 33], [104, 127, 53, 110, 95, 33, 90, 85, 122, 35, 83, 64, 88, 111, 81, 19, 115, 91, 14, 87, 102, 106, 75, 70, 76, 9, 29, 71, 66, 80, 93, 24, 18, 61, 99, 48, 45, 63, 20, 92, 126, 54, 107, 103, 125, 13, 4, 97, 105, 1, 60, 108, 72, 121, 77, 2, 59, 57, 123, 6, 3, 62, 117, 49, 114, 56, 67, 5, 44, 42, 36, 50, 43, 51, 109, 28, 58, 47, 113, 116, 37, 74, 98, 55, 112, 101, 27, 124, 89, 118, 34, 46, 120, 119, 79, 32, 52, 25, 94, 69, 78, 86, 96, 30, 10, 15, 39, 82, 100, 16, 23, 84, 0, 8, 22, 41, 26, 68, 65, 21, 73, 31, 38, 11, 17, 12, 40, 7], [103, 98, 124, 119, 62, 48, 11, 24, 17, 117, 20, 15, 13, 72, 112, 26, 0, 70, 68, 1, 30, 69, 53, 2, 3, 45, 9, 47, 111, 109, 78, 7, 54, 43, 10, 67, 19, 22, 76, 52, 86, 115, 64, 87, 49, 125, 57, 46, 126, 51, 56, 122, 66, 61, 118, 63, 127, 60, 71, 44, 120, 113, 80, 107, 55, 92, 121, 101, 106, 40, 105, 110, 38, 74, 108, 95, 73, 100, 42, 35, 58, 114, 32, 29, 104, 41, 50, 28, 116, 21, 37, 33, 123, 102, 4, 31, 93, 94, 14, 25, 82, 36, 97, 59, 96, 27, 23, 83, 85, 99, 39, 5, 8, 84, 89, 91, 34, 90, 18, 6, 16, 75, 88, 12, 77, 65, 81, 79], [104, 33, 119, 23, 19, 79, 95, 47, 85, 76, 112, 81, 55, 52, 53, 73, 113, 7, 65, 75, 59, 58, 126, 5, 46, 60, 57, 29, 61, 108, 13, 64, 49, 3, 123, 107, 54, 71, 14, 44, 117, 127, 10, 69, 80, 109, 43, 68, 114, 67, 37, 86, 25, 48, 77, 24, 28, 91, 2, 56, 11, 27, 18, 111, 4, 118, 124, 72, 30, 9, 115, 66, 40, 121, 62, 42, 16, 106, 103, 22, 38, 125, 70, 63, 74, 120, 93, 50, 39, 20, 6, 105, 35, 98, 90, 17, 101, 34, 32, 94, 96, 88, 92, 26, 110, 100, 99, 102, 89, 116, 87, 45, 51, 82, 41, 122, 36, 78, 21, 84, 0, 8, 1, 12, 15, 97, 83, 31]], "model.layers.3.self_attn.qk_proj": [[119, 104, 47, 62, 120, 52, 44, 123, 122, 53, 117, 127, 105, 40, 108, 125, 97, 110, 124, 48, 46, 56, 84, 20, 58, 60, 12, 87, 41, 109, 23, 74, 17, 76, 81, 78, 111, 112, 82, 10, 7, 14, 126, 64, 39, 15, 0, 79, 71, 103, 77, 57, 18, 13, 16, 33, 54, 114, 63, 24, 72, 88, 38, 83, 22, 86, 70, 55, 3, 19, 80, 21, 67, 9, 85, 29, 8, 45, 50, 59, 65, 5, 73, 69, 34, 89, 11, 92, 66, 90, 4, 116, 1, 25, 61, 2, 95, 113, 93, 68, 98, 75, 31, 43, 118, 26, 49, 6, 27, 115, 51, 121, 94, 28, 102, 35, 42, 30, 107, 32, 37, 106, 99, 91, 36, 96, 101, 100], [119, 104, 47, 52, 62, 120, 123, 44, 122, 53, 117, 127, 105, 40, 125, 108, 97, 110, 56, 124, 48, 46, 60, 58, 20, 84, 41, 76, 87, 109, 23, 17, 12, 74, 81, 10, 0, 14, 111, 112, 78, 64, 82, 15, 7, 103, 71, 70, 18, 86, 33, 39, 114, 63, 22, 19, 24, 80, 72, 77, 79, 57, 38, 83, 88, 13, 54, 3, 1, 65, 59, 16, 50, 8, 5, 21, 126, 69, 85, 29, 55, 92, 73, 9, 67, 2, 90, 66, 4, 68, 31, 45, 34, 11, 113, 98, 25, 116, 61, 95, 43, 89, 93, 75, 26, 51, 27, 118, 49, 6, 115, 102, 28, 94, 107, 121, 35, 30, 37, 42, 32, 96, 91, 99, 106, 101, 36, 100], [119, 104, 47, 62, 52, 120, 44, 123, 122, 53, 117, 127, 40, 105, 125, 108, 97, 56, 110, 124, 48, 58, 46, 20, 60, 84, 87, 23, 41, 12, 76, 109, 17, 74, 81, 64, 111, 82, 78, 10, 0, 14, 7, 112, 39, 18, 33, 71, 15, 38, 79, 86, 70, 67, 13, 3, 22, 24, 103, 57, 77, 88, 83, 5, 80, 16, 21, 19, 54, 126, 72, 69, 8, 1, 59, 65, 9, 55, 73, 114, 66, 4, 63, 50, 34, 29, 85, 2, 68, 92, 11, 45, 113, 93, 25, 89, 98, 90, 31, 116, 95, 61, 43, 75, 6, 26, 28, 49, 27, 51, 94, 115, 102, 118, 107, 121, 35, 42, 32, 30, 37, 91, 99, 106, 36, 96, 100, 101], [119, 104, 47, 62, 52, 120, 123, 122, 53, 44, 127, 117, 105, 40, 125, 108, 97, 48, 110, 124, 56, 46, 58, 60, 84, 20, 87, 41, 12, 23, 74, 112, 17, 109, 76, 81, 111, 82, 0, 78, 10, 14, 18, 64, 79, 86, 7, 38, 71, 57, 39, 67, 24, 15, 103, 33, 19, 54, 13, 5, 8, 22, 65, 80, 114, 70, 55, 63, 88, 3, 83, 69, 126, 72, 1, 16, 9, 29, 50, 77, 85, 59, 45, 4, 92, 66, 31, 73, 2, 93, 68, 116, 90, 21, 113, 98, 89, 25, 6, 95, 34, 61, 75, 11, 43, 26, 27, 115, 118, 51, 102, 49, 28, 94, 107, 30, 121, 42, 32, 106, 91, 37, 35, 99, 96, 101, 100, 36], [119, 104, 47, 52, 62, 120, 123, 53, 44, 122, 127, 117, 40, 105, 125, 108, 97, 48, 124, 110, 56, 60, 46, 58, 84, 12, 20, 87, 17, 76, 41, 112, 10, 23, 81, 78, 74, 109, 111, 0, 7, 18, 64, 71, 79, 14, 82, 24, 86, 33, 8, 77, 5, 19, 63, 22, 114, 39, 103, 57, 126, 67, 69, 72, 15, 38, 13, 9, 54, 59, 16, 88, 55, 80, 68, 3, 1, 65, 83, 6, 50, 29, 113, 21, 85, 73, 4, 45, 66, 92, 70, 90, 2, 93, 98, 89, 11, 95, 116, 75, 31, 61, 43, 34, 25, 26, 94, 115, 27, 51, 118, 49, 121, 28, 102, 107, 42, 32, 106, 30, 35, 91, 37, 96, 36, 100, 99, 101], [119, 104, 47, 52, 62, 120, 123, 44, 53, 122, 127, 117, 105, 40, 125, 108, 97, 124, 110, 56, 46, 48, 84, 87, 58, 20, 60, 12, 76, 41, 17, 74, 23, 78, 109, 81, 112, 10, 111, 7, 0, 64, 14, 71, 18, 79, 82, 24, 86, 19, 39, 69, 15, 13, 54, 8, 114, 77, 67, 3, 9, 57, 63, 83, 103, 88, 5, 6, 33, 126, 55, 16, 38, 72, 22, 80, 85, 59, 65, 29, 21, 50, 73, 2, 66, 113, 68, 1, 116, 45, 11, 4, 92, 95, 90, 61, 31, 75, 89, 98, 34, 93, 25, 70, 43, 115, 26, 118, 27, 51, 49, 42, 94, 102, 121, 28, 107, 30, 37, 91, 106, 32, 99, 35, 101, 96, 36, 100], [119, 104, 47, 52, 120, 62, 123, 122, 44, 53, 105, 117, 40, 127, 108, 125, 97, 110, 56, 124, 58, 87, 84, 46, 41, 20, 60, 48, 76, 12, 74, 23, 17, 81, 112, 109, 82, 71, 14, 10, 78, 103, 18, 79, 39, 7, 64, 0, 111, 15, 6, 22, 13, 16, 24, 54, 19, 77, 88, 86, 126, 8, 69, 57, 85, 59, 83, 33, 5, 63, 9, 3, 72, 38, 21, 80, 67, 29, 61, 113, 114, 1, 50, 73, 55, 66, 34, 11, 68, 65, 116, 75, 2, 25, 93, 95, 90, 4, 92, 89, 45, 98, 31, 43, 115, 27, 118, 26, 94, 51, 49, 70, 121, 42, 102, 30, 107, 35, 28, 99, 32, 37, 91, 106, 101, 36, 96, 100], [119, 104, 52, 47, 120, 62, 123, 44, 122, 53, 117, 105, 127, 40, 125, 108, 97, 110, 56, 124, 58, 48, 87, 84, 20, 46, 41, 76, 12, 60, 17, 23, 81, 78, 74, 10, 109, 14, 79, 18, 0, 6, 111, 112, 7, 82, 64, 13, 15, 19, 86, 71, 39, 16, 103, 22, 77, 80, 88, 33, 9, 24, 8, 38, 54, 83, 63, 5, 57, 85, 21, 126, 73, 72, 59, 65, 3, 69, 55, 114, 1, 2, 29, 66, 50, 61, 34, 45, 93, 75, 68, 67, 90, 89, 95, 4, 11, 92, 98, 113, 25, 116, 31, 26, 118, 49, 70, 51, 115, 28, 27, 43, 42, 121, 94, 35, 102, 107, 32, 37, 30, 101, 91, 106, 36, 99, 96, 100], [119, 104, 52, 47, 62, 120, 123, 44, 122, 53, 127, 117, 105, 40, 125, 108, 97, 124, 110, 56, 58, 48, 46, 20, 60, 84, 87, 41, 76, 12, 81, 23, 17, 109, 78, 111, 10, 74, 14, 112, 79, 64, 7, 39, 82, 18, 0, 71, 63, 86, 13, 16, 15, 103, 19, 33, 6, 8, 24, 80, 67, 114, 22, 83, 38, 77, 88, 21, 3, 5, 50, 126, 1, 57, 59, 54, 72, 55, 29, 9, 85, 69, 65, 45, 2, 4, 89, 66, 116, 31, 73, 98, 34, 90, 92, 68, 25, 11, 75, 113, 93, 95, 61, 26, 70, 43, 49, 28, 27, 115, 121, 51, 118, 94, 102, 107, 35, 42, 30, 37, 101, 99, 32, 91, 106, 96, 100, 36], [119, 104, 47, 52, 62, 120, 123, 44, 122, 53, 105, 117, 40, 127, 108, 125, 97, 124, 56, 110, 58, 46, 48, 84, 87, 20, 12, 41, 23, 60, 81, 17, 109, 74, 76, 18, 10, 14, 78, 82, 15, 33, 111, 112, 7, 79, 86, 64, 22, 0, 103, 39, 80, 19, 83, 24, 71, 38, 13, 16, 77, 88, 57, 21, 63, 59, 55, 85, 8, 54, 29, 5, 114, 45, 69, 6, 9, 72, 3, 73, 92, 126, 67, 50, 2, 113, 66, 98, 34, 93, 89, 116, 1, 65, 61, 90, 68, 43, 95, 4, 75, 25, 26, 31, 70, 11, 49, 27, 115, 118, 28, 30, 102, 51, 94, 35, 42, 121, 107, 32, 37, 99, 106, 96, 91, 101, 36, 100], [119, 104, 47, 62, 120, 52, 123, 122, 44, 53, 127, 40, 117, 105, 125, 108, 97, 48, 124, 56, 110, 46, 60, 20, 84, 41, 58, 12, 87, 76, 17, 111, 109, 10, 23, 112, 81, 0, 14, 18, 74, 78, 64, 7, 71, 39, 79, 15, 103, 82, 63, 33, 1, 8, 24, 19, 38, 86, 80, 69, 77, 13, 57, 55, 114, 50, 85, 22, 83, 3, 59, 88, 72, 16, 54, 5, 73, 29, 65, 126, 4, 21, 45, 67, 70, 116, 66, 31, 2, 9, 95, 90, 93, 68, 75, 61, 26, 92, 98, 113, 6, 34, 25, 89, 11, 118, 49, 94, 43, 115, 51, 27, 102, 28, 121, 107, 42, 91, 106, 30, 35, 37, 32, 99, 100, 101, 96, 36], [119, 104, 47, 52, 62, 120, 123, 53, 44, 122, 127, 40, 117, 125, 105, 108, 97, 124, 56, 110, 48, 46, 20, 60, 58, 84, 87, 41, 76, 112, 12, 81, 17, 14, 23, 10, 109, 111, 74, 7, 0, 3, 39, 78, 82, 64, 71, 24, 67, 103, 18, 79, 86, 114, 15, 88, 70, 33, 5, 57, 69, 8, 38, 72, 54, 22, 19, 63, 77, 80, 1, 13, 126, 55, 83, 16, 65, 85, 59, 21, 50, 29, 73, 9, 61, 68, 66, 2, 90, 92, 4, 116, 75, 95, 34, 45, 113, 11, 25, 98, 89, 93, 31, 26, 6, 27, 118, 51, 94, 115, 43, 49, 102, 28, 121, 42, 30, 107, 32, 35, 37, 106, 99, 91, 100, 36, 101, 96], [119, 104, 52, 47, 62, 120, 123, 44, 122, 117, 53, 105, 40, 127, 108, 125, 97, 56, 110, 124, 58, 20, 84, 87, 48, 41, 46, 76, 23, 60, 12, 17, 81, 10, 14, 74, 109, 78, 71, 82, 18, 15, 79, 112, 7, 80, 86, 33, 83, 70, 39, 103, 19, 24, 22, 64, 13, 77, 111, 38, 0, 88, 63, 5, 21, 16, 57, 69, 72, 85, 8, 9, 54, 55, 29, 67, 126, 92, 116, 93, 50, 59, 113, 3, 1, 2, 114, 25, 61, 73, 66, 75, 34, 89, 4, 65, 11, 45, 90, 95, 68, 98, 31, 27, 118, 26, 43, 49, 94, 28, 115, 51, 30, 107, 102, 6, 35, 42, 32, 121, 96, 37, 91, 99, 106, 101, 100, 36], [119, 104, 47, 52, 62, 120, 123, 44, 122, 53, 127, 117, 105, 40, 125, 108, 97, 110, 124, 56, 20, 48, 58, 46, 84, 60, 23, 41, 76, 17, 12, 109, 87, 74, 111, 81, 10, 14, 71, 112, 64, 82, 15, 18, 78, 103, 7, 70, 0, 39, 79, 86, 22, 13, 33, 19, 69, 24, 57, 83, 16, 38, 80, 114, 72, 54, 59, 8, 63, 77, 126, 88, 1, 73, 5, 85, 50, 55, 66, 29, 9, 21, 67, 2, 3, 45, 25, 65, 68, 90, 92, 113, 4, 34, 116, 11, 61, 93, 95, 98, 75, 31, 89, 43, 49, 26, 115, 51, 118, 6, 27, 28, 94, 121, 102, 30, 107, 42, 32, 99, 91, 37, 35, 106, 36, 101, 100, 96], [119, 104, 47, 62, 120, 52, 123, 44, 122, 53, 117, 127, 105, 40, 125, 108, 97, 110, 124, 58, 56, 48, 20, 84, 60, 87, 41, 46, 23, 12, 76, 81, 111, 14, 74, 17, 71, 10, 112, 0, 109, 64, 7, 79, 39, 18, 67, 72, 103, 78, 15, 69, 3, 86, 82, 126, 1, 33, 24, 13, 55, 5, 38, 70, 22, 80, 54, 19, 16, 57, 83, 114, 88, 9, 73, 85, 77, 68, 113, 8, 63, 65, 59, 2, 21, 29, 50, 66, 90, 31, 98, 116, 6, 4, 75, 92, 61, 93, 25, 45, 89, 34, 11, 95, 26, 49, 118, 51, 27, 115, 28, 102, 43, 42, 94, 121, 106, 35, 30, 99, 107, 32, 91, 101, 37, 96, 36, 100], [119, 104, 47, 62, 52, 120, 123, 122, 44, 53, 127, 117, 105, 40, 125, 108, 97, 48, 110, 56, 124, 46, 20, 58, 84, 41, 60, 87, 76, 12, 17, 81, 23, 111, 112, 109, 71, 10, 74, 14, 78, 39, 7, 15, 0, 82, 72, 18, 103, 86, 64, 63, 24, 33, 57, 13, 55, 38, 79, 69, 114, 67, 83, 126, 22, 88, 3, 21, 80, 16, 59, 50, 29, 54, 19, 77, 9, 5, 85, 65, 73, 8, 1, 116, 92, 90, 113, 70, 68, 4, 45, 6, 98, 95, 93, 2, 31, 61, 66, 26, 11, 34, 43, 75, 25, 89, 118, 51, 49, 115, 94, 102, 27, 121, 28, 42, 30, 106, 107, 35, 91, 32, 99, 37, 100, 101, 96, 36], [119, 104, 52, 47, 120, 62, 123, 44, 122, 53, 117, 105, 127, 40, 108, 125, 97, 110, 124, 56, 20, 48, 46, 58, 84, 87, 23, 41, 76, 12, 60, 10, 17, 81, 74, 78, 18, 109, 14, 7, 111, 71, 15, 112, 82, 79, 22, 80, 13, 86, 64, 69, 19, 83, 0, 57, 33, 39, 24, 77, 9, 38, 103, 88, 6, 5, 21, 72, 29, 63, 85, 54, 55, 16, 73, 8, 126, 50, 3, 114, 75, 92, 67, 59, 68, 65, 66, 2, 89, 1, 113, 116, 95, 4, 34, 93, 45, 11, 25, 90, 98, 61, 115, 31, 70, 26, 49, 27, 43, 51, 28, 118, 102, 94, 30, 42, 121, 107, 37, 91, 35, 32, 99, 106, 100, 96, 101, 36], [119, 104, 52, 47, 62, 120, 123, 44, 122, 53, 117, 105, 127, 40, 108, 125, 97, 56, 124, 110, 20, 58, 84, 48, 41, 23, 76, 60, 87, 46, 17, 12, 10, 109, 74, 81, 82, 14, 78, 112, 6, 64, 15, 0, 71, 7, 111, 18, 22, 79, 69, 39, 16, 57, 86, 33, 19, 24, 103, 3, 13, 83, 77, 80, 38, 72, 67, 1, 114, 59, 63, 54, 21, 55, 88, 8, 73, 85, 29, 9, 2, 126, 93, 66, 113, 45, 4, 116, 75, 65, 50, 5, 92, 95, 98, 90, 61, 25, 31, 68, 34, 89, 11, 26, 70, 28, 43, 51, 49, 27, 118, 115, 102, 30, 42, 94, 37, 121, 107, 32, 91, 35, 106, 99, 101, 96, 36, 100], [119, 104, 47, 52, 62, 120, 123, 122, 53, 44, 127, 117, 105, 40, 125, 97, 108, 110, 124, 56, 48, 58, 60, 46, 20, 84, 76, 87, 23, 41, 12, 81, 17, 10, 74, 14, 112, 78, 111, 109, 64, 82, 71, 7, 15, 79, 39, 18, 0, 6, 86, 33, 103, 57, 126, 38, 16, 114, 69, 24, 1, 77, 19, 8, 67, 22, 13, 83, 80, 88, 54, 85, 63, 72, 55, 73, 59, 3, 29, 5, 9, 116, 50, 2, 75, 93, 4, 21, 66, 61, 92, 65, 98, 34, 90, 45, 68, 31, 95, 113, 89, 25, 11, 26, 70, 49, 43, 51, 118, 115, 27, 94, 28, 102, 42, 107, 121, 32, 30, 35, 37, 91, 99, 106, 101, 36, 96, 100], [119, 104, 47, 62, 52, 120, 122, 123, 44, 53, 117, 127, 105, 40, 125, 108, 97, 110, 56, 124, 58, 60, 46, 48, 84, 87, 20, 41, 12, 76, 17, 74, 81, 10, 109, 111, 23, 14, 112, 71, 7, 78, 79, 18, 82, 86, 69, 64, 15, 19, 33, 0, 24, 57, 39, 6, 13, 83, 22, 103, 63, 77, 38, 73, 80, 114, 72, 1, 88, 8, 55, 16, 67, 9, 5, 126, 54, 29, 21, 2, 3, 50, 85, 116, 59, 113, 45, 92, 68, 75, 4, 90, 93, 98, 31, 34, 65, 95, 66, 61, 89, 43, 26, 70, 11, 25, 49, 51, 102, 28, 115, 118, 27, 94, 42, 30, 107, 121, 35, 37, 32, 91, 101, 99, 96, 106, 100, 36], [119, 104, 62, 52, 47, 120, 123, 53, 122, 44, 127, 117, 40, 105, 125, 108, 97, 110, 48, 124, 56, 46, 20, 84, 60, 58, 87, 41, 109, 17, 12, 23, 112, 76, 111, 81, 74, 10, 82, 78, 14, 71, 39, 7, 0, 79, 57, 18, 38, 64, 24, 3, 22, 33, 15, 16, 126, 67, 54, 19, 55, 80, 114, 77, 86, 63, 59, 72, 9, 88, 69, 103, 83, 13, 8, 29, 5, 50, 1, 85, 21, 116, 4, 95, 68, 89, 92, 113, 90, 6, 45, 73, 70, 65, 98, 31, 66, 93, 75, 2, 11, 61, 34, 25, 26, 51, 115, 49, 94, 118, 42, 43, 121, 28, 27, 107, 30, 102, 37, 35, 91, 99, 106, 100, 32, 101, 36, 96], [119, 104, 47, 52, 120, 62, 123, 122, 53, 44, 117, 127, 105, 40, 125, 108, 97, 110, 58, 124, 56, 84, 46, 87, 48, 20, 60, 41, 12, 17, 23, 76, 112, 111, 81, 109, 10, 0, 64, 74, 7, 14, 18, 39, 71, 57, 79, 33, 82, 15, 78, 86, 24, 54, 70, 22, 126, 38, 77, 103, 69, 63, 1, 80, 8, 16, 114, 5, 55, 67, 2, 19, 9, 72, 83, 29, 50, 68, 3, 88, 13, 66, 73, 85, 21, 116, 4, 34, 65, 59, 31, 113, 93, 45, 61, 95, 89, 90, 92, 98, 11, 26, 75, 6, 43, 25, 118, 115, 49, 27, 51, 102, 28, 42, 94, 107, 121, 30, 35, 32, 91, 99, 106, 101, 37, 96, 36, 100], [119, 104, 52, 47, 62, 120, 123, 122, 44, 53, 117, 127, 40, 105, 125, 97, 108, 110, 48, 124, 56, 46, 84, 58, 60, 20, 87, 23, 41, 12, 76, 17, 74, 81, 109, 14, 10, 111, 112, 18, 7, 78, 71, 82, 79, 22, 15, 33, 86, 70, 0, 64, 24, 39, 19, 8, 38, 83, 57, 16, 103, 29, 88, 13, 5, 63, 80, 85, 77, 9, 59, 67, 54, 69, 114, 72, 55, 21, 65, 126, 50, 68, 116, 73, 4, 1, 113, 61, 3, 45, 92, 89, 98, 2, 93, 34, 31, 90, 75, 11, 66, 95, 25, 26, 118, 6, 43, 27, 94, 28, 49, 51, 102, 115, 42, 107, 30, 121, 32, 91, 35, 106, 96, 37, 99, 101, 36, 100], [119, 104, 47, 52, 62, 120, 123, 53, 44, 122, 127, 117, 105, 40, 125, 97, 108, 110, 48, 124, 56, 46, 20, 84, 58, 60, 87, 41, 12, 76, 74, 23, 17, 112, 81, 111, 78, 109, 10, 82, 18, 14, 15, 7, 86, 39, 71, 103, 57, 0, 24, 8, 70, 16, 19, 126, 64, 38, 67, 33, 13, 22, 83, 79, 63, 77, 114, 88, 55, 80, 50, 3, 59, 9, 5, 92, 54, 116, 29, 85, 1, 73, 113, 69, 21, 34, 68, 72, 89, 45, 98, 61, 95, 4, 65, 2, 11, 90, 31, 75, 66, 93, 43, 25, 26, 51, 115, 118, 49, 27, 94, 6, 28, 102, 42, 107, 30, 121, 37, 106, 91, 32, 35, 99, 101, 36, 100, 96], [119, 104, 47, 52, 62, 120, 123, 122, 44, 53, 117, 127, 105, 40, 125, 108, 97, 110, 56, 124, 58, 48, 60, 20, 84, 46, 87, 41, 12, 76, 23, 17, 109, 81, 10, 78, 74, 112, 7, 111, 82, 39, 15, 14, 57, 18, 79, 126, 71, 64, 0, 103, 33, 86, 24, 88, 16, 54, 83, 22, 5, 8, 13, 77, 85, 80, 38, 69, 19, 70, 114, 59, 21, 72, 29, 63, 3, 1, 67, 61, 9, 55, 92, 50, 73, 95, 31, 93, 90, 4, 2, 66, 65, 25, 113, 34, 45, 75, 11, 89, 116, 98, 68, 118, 26, 6, 51, 49, 43, 27, 94, 115, 42, 102, 28, 30, 35, 121, 32, 107, 37, 106, 99, 91, 96, 36, 101, 100], [119, 104, 52, 47, 62, 120, 123, 44, 122, 53, 127, 117, 40, 105, 125, 108, 97, 56, 110, 48, 124, 20, 46, 58, 87, 60, 84, 41, 12, 76, 23, 81, 10, 112, 17, 74, 109, 111, 7, 39, 14, 18, 0, 78, 82, 64, 15, 71, 103, 79, 57, 33, 19, 22, 83, 16, 13, 86, 126, 77, 38, 69, 80, 8, 88, 3, 24, 5, 50, 67, 54, 116, 85, 70, 72, 9, 63, 1, 114, 21, 55, 59, 92, 73, 2, 29, 34, 65, 25, 68, 61, 113, 89, 45, 4, 66, 90, 95, 31, 6, 11, 98, 118, 93, 51, 75, 28, 26, 115, 27, 49, 43, 30, 107, 94, 42, 102, 121, 106, 37, 35, 91, 32, 36, 101, 100, 99, 96], [119, 104, 47, 52, 62, 120, 123, 44, 122, 53, 117, 127, 105, 40, 108, 125, 97, 56, 110, 124, 20, 58, 48, 84, 41, 46, 60, 23, 12, 87, 109, 76, 74, 17, 81, 111, 10, 112, 18, 78, 14, 7, 33, 71, 0, 15, 39, 64, 83, 82, 57, 79, 77, 86, 38, 22, 103, 88, 16, 19, 24, 3, 80, 126, 67, 59, 8, 72, 54, 13, 55, 21, 63, 65, 85, 50, 69, 45, 5, 29, 92, 68, 6, 73, 1, 25, 89, 9, 114, 34, 2, 95, 113, 4, 93, 98, 11, 90, 66, 116, 75, 49, 70, 31, 61, 26, 27, 121, 43, 115, 118, 102, 51, 28, 94, 35, 42, 107, 30, 32, 91, 37, 106, 100, 99, 96, 36, 101], [119, 104, 47, 52, 120, 62, 123, 44, 122, 53, 117, 105, 127, 40, 125, 108, 97, 110, 124, 56, 48, 58, 84, 46, 76, 20, 87, 60, 41, 12, 74, 81, 23, 10, 17, 112, 0, 7, 71, 109, 14, 64, 78, 111, 18, 15, 57, 82, 79, 77, 126, 6, 39, 72, 69, 103, 86, 33, 8, 114, 59, 88, 1, 38, 80, 5, 16, 83, 13, 73, 22, 24, 63, 54, 67, 19, 3, 55, 9, 2, 85, 50, 21, 29, 65, 68, 66, 113, 93, 92, 34, 45, 75, 4, 98, 116, 31, 90, 61, 11, 95, 49, 25, 26, 89, 115, 27, 43, 118, 51, 70, 102, 94, 42, 28, 37, 121, 30, 107, 35, 99, 91, 32, 100, 106, 96, 36, 101], [119, 104, 47, 120, 62, 52, 123, 53, 122, 44, 127, 117, 40, 105, 125, 108, 97, 110, 124, 56, 20, 48, 60, 58, 46, 87, 84, 41, 76, 12, 74, 17, 23, 112, 111, 109, 10, 81, 78, 14, 7, 39, 71, 18, 82, 0, 79, 33, 86, 64, 114, 15, 103, 24, 77, 6, 83, 38, 8, 1, 72, 19, 22, 13, 57, 3, 126, 16, 54, 88, 55, 59, 5, 9, 63, 80, 50, 73, 67, 21, 69, 29, 65, 92, 116, 85, 113, 31, 95, 93, 68, 98, 4, 90, 66, 2, 45, 11, 34, 61, 75, 43, 25, 89, 118, 26, 115, 49, 70, 102, 27, 51, 121, 94, 28, 107, 30, 42, 37, 32, 35, 91, 106, 99, 100, 36, 101, 96], [119, 104, 47, 62, 52, 120, 123, 44, 122, 53, 127, 40, 105, 117, 125, 97, 108, 124, 48, 56, 46, 110, 20, 84, 60, 12, 58, 41, 87, 76, 23, 17, 74, 111, 112, 81, 109, 7, 10, 14, 78, 0, 39, 82, 71, 18, 15, 64, 79, 67, 6, 77, 103, 83, 72, 5, 22, 57, 33, 54, 59, 38, 126, 24, 86, 88, 13, 69, 16, 3, 63, 19, 8, 114, 29, 80, 9, 73, 55, 85, 65, 50, 68, 21, 1, 116, 2, 92, 11, 113, 4, 45, 89, 95, 31, 66, 61, 90, 34, 98, 43, 93, 25, 70, 118, 27, 75, 26, 115, 121, 51, 49, 102, 30, 94, 42, 91, 107, 28, 106, 37, 32, 35, 99, 101, 96, 100, 36], [119, 104, 47, 52, 120, 62, 123, 44, 122, 53, 117, 105, 40, 127, 108, 125, 97, 124, 56, 110, 58, 20, 87, 84, 46, 41, 76, 60, 23, 48, 12, 74, 17, 10, 81, 111, 0, 78, 14, 109, 18, 64, 82, 79, 71, 86, 7, 112, 103, 15, 33, 39, 83, 5, 22, 57, 72, 77, 38, 19, 13, 59, 24, 54, 16, 80, 6, 88, 9, 55, 126, 21, 69, 8, 3, 63, 73, 65, 29, 2, 85, 34, 4, 50, 113, 67, 114, 92, 66, 1, 89, 45, 75, 70, 95, 25, 68, 11, 116, 61, 98, 93, 31, 90, 43, 26, 115, 118, 27, 49, 102, 28, 51, 30, 94, 121, 35, 32, 42, 99, 37, 107, 91, 106, 96, 101, 36, 100], [119, 104, 47, 52, 120, 62, 123, 44, 122, 53, 127, 40, 105, 117, 125, 108, 97, 56, 46, 110, 124, 48, 60, 20, 87, 41, 84, 58, 76, 12, 17, 109, 23, 81, 10, 74, 111, 112, 14, 78, 82, 103, 7, 71, 64, 15, 18, 39, 79, 72, 0, 13, 22, 80, 33, 63, 86, 83, 5, 38, 24, 126, 3, 57, 88, 114, 16, 55, 77, 67, 19, 59, 85, 116, 69, 9, 8, 54, 29, 65, 21, 2, 50, 73, 1, 34, 70, 113, 66, 4, 89, 92, 98, 6, 61, 25, 45, 95, 11, 31, 90, 75, 68, 93, 118, 115, 43, 26, 51, 27, 49, 28, 102, 121, 42, 30, 94, 107, 32, 37, 35, 106, 101, 36, 99, 91, 100, 96]], "model.layers.4.self_attn.q_proj": [[42, 122, 117, 56, 53, 123, 101, 124, 102, 121, 33, 61, 27, 107, 104, 119, 59, 48, 110, 39, 116, 83, 55, 109, 127, 115, 37, 114, 113, 126, 92, 57, 60, 43, 58, 17, 50, 26, 38, 120, 47, 125, 44, 45, 51, 54, 89, 62, 105, 111, 94, 21, 52, 46, 87, 41, 36, 108, 32, 106, 91, 49, 63, 112, 86, 118, 40, 23, 31, 30, 76, 79, 99, 22, 34, 103, 84, 96, 100, 72, 97, 98, 28, 78, 82, 25, 24, 95, 93, 16, 75, 0, 29, 35, 15, 13, 81, 10, 66, 6, 5, 73, 2, 11, 14, 3, 85, 19, 64, 8, 12, 74, 4, 68, 90, 69, 20, 65, 70, 88, 67, 7, 80, 71, 77, 9, 1, 18], [42, 101, 117, 114, 123, 56, 122, 102, 33, 53, 48, 124, 107, 59, 111, 39, 89, 94, 91, 27, 86, 121, 54, 57, 21, 37, 104, 28, 125, 112, 49, 79, 116, 40, 109, 36, 113, 43, 44, 41, 60, 118, 99, 26, 47, 38, 84, 82, 58, 46, 87, 25, 32, 17, 115, 52, 110, 61, 95, 92, 108, 62, 119, 24, 127, 45, 50, 120, 55, 51, 85, 29, 63, 90, 126, 30, 15, 93, 105, 106, 81, 34, 83, 22, 12, 96, 76, 16, 31, 98, 78, 75, 19, 35, 23, 74, 97, 13, 88, 8, 73, 20, 14, 100, 103, 77, 68, 72, 71, 6, 7, 69, 5, 11, 9, 66, 10, 4, 1, 65, 70, 3, 64, 67, 2, 0, 18, 80], [101, 42, 117, 33, 123, 122, 56, 53, 114, 87, 84, 82, 124, 91, 94, 39, 61, 79, 78, 75, 16, 73, 13, 6, 48, 86, 27, 72, 46, 111, 107, 2, 57, 89, 32, 95, 30, 21, 68, 22, 104, 20, 19, 74, 54, 71, 58, 102, 23, 109, 67, 44, 14, 85, 25, 8, 11, 88, 63, 55, 90, 10, 24, 106, 65, 41, 110, 36, 98, 64, 120, 35, 121, 92, 105, 7, 81, 99, 17, 49, 113, 28, 15, 60, 3, 77, 118, 108, 40, 116, 93, 76, 80, 18, 29, 52, 12, 38, 83, 26, 96, 4, 69, 119, 34, 70, 43, 31, 1, 9, 62, 47, 103, 115, 125, 51, 112, 127, 126, 59, 50, 97, 37, 5, 100, 0, 45, 66], [101, 117, 56, 122, 123, 33, 53, 42, 87, 82, 91, 84, 124, 54, 73, 15, 16, 103, 76, 78, 75, 24, 71, 27, 121, 13, 88, 68, 6, 37, 72, 114, 107, 86, 85, 104, 58, 59, 67, 66, 10, 3, 48, 65, 1, 126, 116, 25, 105, 95, 12, 109, 61, 118, 44, 89, 64, 45, 14, 92, 7, 120, 79, 115, 9, 127, 18, 29, 80, 43, 26, 98, 90, 77, 4, 94, 102, 70, 49, 74, 93, 0, 110, 11, 28, 23, 55, 19, 8, 81, 34, 99, 35, 20, 36, 100, 41, 125, 111, 52, 22, 32, 31, 17, 113, 46, 47, 83, 39, 51, 96, 21, 69, 5, 2, 112, 30, 62, 50, 63, 40, 60, 57, 38, 106, 119, 108, 97], [39, 52, 118, 33, 31, 79, 13, 87, 8, 11, 53, 70, 20, 117, 54, 82, 81, 9, 68, 63, 50, 66, 113, 26, 120, 121, 111, 125, 90, 10, 61, 114, 49, 119, 46, 23, 74, 3, 51, 112, 25, 47, 116, 84, 55, 21, 6, 29, 95, 38, 67, 69, 12, 73, 18, 5, 19, 60, 77, 17, 107, 0, 28, 93, 85, 97, 57, 44, 106, 62, 15, 59, 22, 100, 72, 2, 24, 83, 92, 91, 7, 94, 86, 80, 98, 36, 14, 115, 71, 102, 78, 75, 105, 34, 1, 16, 123, 65, 58, 110, 30, 42, 76, 89, 43, 32, 4, 27, 45, 124, 104, 40, 88, 48, 96, 56, 99, 41, 109, 127, 37, 101, 64, 108, 35, 122, 126, 103], [39, 118, 52, 33, 31, 102, 20, 87, 63, 25, 116, 11, 54, 47, 82, 81, 90, 117, 13, 85, 79, 49, 80, 123, 111, 121, 56, 86, 76, 104, 26, 50, 28, 53, 23, 91, 9, 7, 105, 120, 41, 112, 51, 71, 61, 44, 70, 83, 55, 114, 27, 57, 8, 60, 125, 107, 46, 110, 115, 106, 101, 124, 40, 97, 42, 10, 14, 59, 127, 29, 43, 78, 126, 113, 35, 45, 16, 69, 122, 74, 62, 119, 65, 22, 98, 108, 58, 36, 48, 3, 93, 17, 84, 37, 30, 12, 68, 99, 109, 88, 92, 64, 18, 67, 38, 100, 34, 66, 96, 19, 89, 21, 24, 32, 75, 72, 77, 94, 5, 95, 15, 73, 1, 6, 4, 103, 0, 2], [53, 39, 33, 52, 118, 31, 90, 50, 102, 81, 87, 113, 59, 63, 54, 125, 20, 114, 98, 61, 55, 97, 60, 117, 108, 47, 107, 26, 124, 121, 92, 49, 24, 44, 86, 112, 42, 79, 126, 110, 28, 116, 103, 57, 111, 82, 14, 43, 83, 120, 45, 119, 10, 127, 46, 29, 122, 85, 106, 19, 12, 51, 56, 41, 37, 32, 115, 96, 48, 62, 109, 22, 100, 105, 34, 88, 13, 40, 89, 58, 123, 101, 11, 94, 91, 25, 104, 93, 36, 5, 78, 27, 38, 99, 35, 17, 71, 21, 80, 23, 76, 74, 84, 30, 16, 95, 77, 75, 8, 18, 15, 72, 67, 6, 9, 7, 73, 69, 3, 4, 65, 1, 66, 70, 0, 68, 64, 2], [39, 118, 52, 33, 53, 31, 8, 20, 82, 87, 13, 79, 11, 54, 70, 63, 117, 102, 81, 74, 9, 50, 125, 68, 121, 46, 90, 86, 23, 61, 16, 49, 66, 10, 55, 59, 60, 15, 47, 42, 51, 69, 115, 67, 95, 12, 26, 111, 119, 97, 22, 72, 112, 114, 77, 120, 17, 80, 124, 21, 18, 94, 89, 116, 3, 25, 2, 45, 56, 92, 75, 84, 78, 85, 28, 0, 19, 14, 38, 29, 57, 27, 83, 30, 71, 24, 113, 96, 107, 76, 40, 7, 123, 99, 62, 122, 88, 44, 48, 36, 6, 100, 41, 5, 73, 98, 106, 43, 34, 127, 91, 64, 1, 93, 32, 105, 37, 126, 4, 108, 110, 101, 104, 35, 58, 109, 65, 103], [41, 39, 99, 116, 124, 118, 52, 95, 54, 28, 86, 92, 62, 75, 105, 81, 26, 31, 24, 21, 122, 121, 58, 17, 83, 55, 87, 127, 22, 61, 79, 56, 123, 117, 93, 50, 12, 53, 14, 115, 35, 88, 110, 106, 13, 82, 112, 113, 94, 47, 18, 34, 125, 57, 42, 120, 101, 108, 30, 48, 20, 126, 98, 32, 111, 46, 89, 43, 51, 78, 114, 96, 49, 104, 90, 15, 72, 40, 25, 109, 9, 84, 59, 38, 37, 107, 100, 63, 60, 44, 45, 33, 23, 16, 11, 119, 29, 85, 102, 36, 97, 27, 91, 74, 19, 8, 65, 68, 1, 69, 73, 76, 5, 4, 70, 77, 71, 6, 3, 10, 80, 7, 66, 64, 2, 0, 67, 103], [39, 99, 28, 116, 124, 118, 16, 52, 41, 83, 74, 13, 86, 95, 72, 54, 70, 49, 75, 71, 79, 92, 4, 26, 22, 3, 21, 35, 24, 12, 81, 1, 62, 2, 122, 23, 61, 31, 18, 55, 32, 85, 123, 87, 30, 50, 11, 73, 77, 127, 58, 89, 7, 112, 19, 48, 27, 117, 78, 126, 64, 111, 25, 88, 80, 42, 47, 66, 82, 53, 14, 121, 8, 59, 67, 100, 10, 101, 51, 5, 17, 6, 106, 94, 110, 113, 76, 0, 15, 90, 105, 34, 65, 107, 91, 104, 60, 63, 119, 69, 20, 46, 93, 120, 102, 125, 84, 43, 33, 97, 37, 96, 29, 36, 115, 40, 108, 68, 98, 103, 56, 45, 9, 57, 109, 38, 44, 114], [39, 116, 99, 124, 118, 52, 41, 95, 92, 86, 26, 31, 28, 54, 49, 83, 98, 87, 81, 112, 24, 62, 79, 75, 105, 22, 30, 35, 61, 84, 16, 34, 93, 13, 117, 88, 47, 121, 120, 58, 106, 21, 123, 114, 44, 110, 46, 104, 45, 60, 122, 14, 42, 108, 53, 96, 89, 125, 113, 25, 33, 27, 50, 127, 90, 48, 40, 109, 59, 20, 38, 111, 63, 37, 97, 126, 17, 55, 51, 91, 43, 107, 32, 119, 102, 101, 36, 100, 12, 57, 94, 72, 29, 56, 115, 74, 23, 78, 85, 103, 82, 11, 18, 15, 19, 4, 70, 1, 68, 76, 71, 73, 9, 5, 69, 8, 6, 65, 2, 64, 80, 77, 66, 7, 0, 10, 3, 67], [39, 99, 118, 124, 52, 116, 28, 95, 72, 13, 83, 4, 79, 86, 16, 22, 70, 75, 74, 66, 54, 105, 2, 68, 65, 64, 81, 35, 84, 87, 92, 61, 1, 21, 0, 26, 24, 3, 17, 115, 106, 77, 98, 55, 11, 89, 31, 30, 8, 14, 19, 71, 67, 88, 62, 110, 7, 123, 23, 73, 58, 82, 6, 94, 42, 33, 15, 117, 10, 41, 109, 12, 5, 103, 49, 122, 121, 76, 80, 127, 25, 57, 125, 56, 51, 40, 93, 48, 53, 34, 113, 78, 85, 18, 96, 69, 47, 59, 114, 46, 63, 27, 20, 90, 45, 44, 100, 29, 32, 60, 120, 111, 126, 43, 101, 91, 36, 119, 50, 37, 97, 108, 38, 112, 9, 102, 107, 104], [45, 102, 0, 97, 64, 31, 109, 25, 4, 66, 1, 65, 6, 76, 10, 124, 79, 69, 9, 68, 22, 17, 2, 11, 48, 38, 105, 126, 95, 125, 81, 51, 82, 59, 20, 56, 52, 5, 78, 104, 90, 18, 127, 84, 67, 118, 61, 7, 41, 63, 60, 122, 53, 71, 113, 74, 70, 73, 57, 75, 100, 15, 50, 120, 42, 14, 77, 13, 72, 28, 110, 12, 47, 114, 24, 49, 87, 107, 93, 19, 88, 121, 23, 33, 26, 16, 83, 36, 3, 46, 80, 112, 55, 39, 40, 85, 92, 29, 58, 115, 8, 30, 43, 101, 86, 96, 108, 106, 34, 111, 37, 91, 35, 99, 89, 21, 98, 32, 117, 119, 27, 123, 94, 103, 54, 62, 116, 44], [102, 45, 97, 31, 124, 79, 10, 17, 11, 5, 109, 25, 76, 89, 22, 70, 9, 4, 69, 6, 84, 48, 52, 51, 66, 82, 1, 78, 115, 75, 56, 65, 59, 104, 126, 53, 19, 90, 61, 72, 2, 105, 14, 87, 125, 20, 41, 77, 81, 42, 57, 71, 122, 38, 114, 15, 86, 28, 16, 120, 127, 118, 3, 60, 73, 63, 23, 74, 7, 83, 67, 0, 18, 13, 50, 123, 92, 12, 121, 95, 99, 24, 35, 68, 111, 88, 43, 39, 80, 113, 21, 34, 106, 30, 27, 29, 46, 93, 107, 117, 85, 101, 8, 110, 100, 94, 108, 26, 47, 96, 116, 98, 91, 36, 119, 58, 37, 62, 33, 112, 49, 32, 40, 44, 54, 55, 103, 64], [45, 102, 97, 31, 65, 25, 124, 109, 6, 11, 4, 1, 10, 76, 9, 79, 17, 0, 67, 69, 3, 48, 81, 66, 84, 51, 20, 105, 5, 52, 71, 126, 61, 90, 56, 15, 122, 125, 70, 72, 78, 118, 22, 95, 59, 120, 41, 23, 53, 57, 63, 38, 8, 7, 16, 82, 60, 113, 18, 73, 12, 64, 24, 30, 74, 127, 2, 14, 75, 47, 83, 80, 85, 68, 111, 86, 28, 19, 39, 21, 46, 87, 50, 49, 13, 29, 77, 104, 121, 26, 100, 88, 91, 93, 42, 27, 110, 58, 107, 117, 94, 101, 119, 114, 40, 108, 115, 96, 36, 103, 89, 116, 123, 37, 62, 92, 44, 34, 43, 55, 106, 112, 35, 98, 99, 54, 32, 33], [124, 102, 97, 45, 31, 90, 22, 52, 86, 120, 84, 109, 89, 78, 111, 104, 41, 53, 121, 13, 122, 18, 36, 33, 38, 115, 51, 114, 59, 113, 107, 63, 25, 34, 117, 57, 79, 127, 106, 46, 42, 26, 48, 44, 62, 17, 123, 126, 61, 100, 39, 54, 96, 56, 116, 21, 60, 49, 108, 103, 58, 105, 29, 43, 112, 110, 28, 99, 32, 101, 27, 24, 118, 35, 119, 50, 37, 91, 55, 88, 23, 76, 98, 47, 30, 40, 94, 125, 93, 82, 20, 11, 92, 14, 19, 83, 85, 77, 80, 16, 87, 10, 72, 9, 8, 12, 81, 15, 73, 74, 95, 5, 75, 7, 71, 70, 69, 67, 68, 1, 3, 6, 2, 65, 4, 66, 64, 0], [63, 59, 52, 104, 119, 125, 56, 122, 60, 113, 62, 123, 114, 124, 50, 61, 127, 53, 54, 57, 58, 46, 109, 118, 111, 49, 48, 126, 116, 51, 47, 55, 117, 82, 120, 44, 115, 108, 45, 121, 112, 43, 110, 79, 88, 107, 84, 106, 42, 39, 41, 80, 87, 102, 28, 90, 105, 91, 76, 85, 86, 96, 103, 19, 30, 93, 9, 38, 100, 12, 81, 14, 101, 16, 13, 37, 36, 7, 89, 73, 4, 5, 33, 40, 74, 32, 99, 83, 35, 21, 97, 8, 94, 11, 25, 10, 15, 27, 65, 77, 72, 70, 92, 69, 34, 22, 2, 78, 23, 75, 17, 98, 31, 95, 1, 67, 24, 18, 29, 71, 0, 26, 20, 3, 6, 66, 68, 64], [104, 34, 52, 63, 59, 77, 17, 90, 10, 7, 78, 87, 68, 75, 27, 20, 31, 21, 13, 118, 4, 119, 66, 79, 89, 93, 124, 71, 46, 70, 125, 14, 100, 26, 23, 82, 6, 83, 73, 84, 22, 19, 122, 86, 95, 64, 74, 85, 11, 1, 76, 113, 8, 81, 88, 15, 18, 80, 25, 111, 29, 56, 28, 60, 2, 65, 94, 67, 108, 123, 50, 72, 120, 36, 0, 69, 12, 5, 3, 91, 40, 102, 41, 30, 32, 61, 116, 39, 92, 127, 48, 9, 98, 16, 115, 62, 24, 126, 47, 99, 121, 38, 42, 110, 45, 109, 37, 49, 57, 101, 96, 97, 117, 33, 54, 103, 106, 105, 58, 35, 55, 51, 44, 43, 112, 107, 53, 114], [104, 34, 52, 63, 59, 7, 92, 71, 87, 17, 78, 119, 28, 68, 10, 91, 31, 75, 83, 125, 90, 77, 85, 94, 60, 1, 22, 3, 122, 100, 18, 4, 16, 8, 124, 26, 65, 113, 118, 23, 84, 46, 74, 56, 89, 93, 72, 123, 14, 25, 5, 13, 76, 79, 9, 81, 19, 12, 21, 6, 82, 62, 30, 116, 11, 88, 86, 70, 27, 66, 98, 102, 80, 24, 67, 120, 36, 111, 69, 40, 108, 20, 2, 64, 61, 48, 39, 73, 15, 96, 95, 117, 43, 101, 0, 38, 29, 37, 33, 50, 121, 32, 55, 35, 97, 109, 47, 103, 45, 99, 105, 106, 127, 54, 110, 57, 44, 41, 42, 49, 115, 51, 107, 112, 58, 126, 114, 53], [104, 52, 63, 59, 119, 125, 124, 60, 46, 62, 113, 118, 27, 123, 48, 39, 116, 50, 56, 40, 122, 88, 34, 32, 108, 57, 111, 109, 106, 61, 115, 47, 110, 30, 28, 51, 54, 120, 55, 43, 35, 117, 49, 114, 107, 121, 42, 41, 98, 58, 103, 105, 36, 102, 100, 79, 33, 45, 53, 37, 97, 76, 44, 38, 126, 80, 86, 101, 24, 82, 112, 127, 85, 26, 99, 94, 95, 9, 96, 84, 19, 73, 20, 91, 22, 31, 7, 25, 93, 81, 14, 13, 5, 74, 92, 4, 87, 90, 8, 89, 11, 16, 1, 12, 70, 17, 69, 23, 18, 72, 10, 2, 29, 21, 83, 65, 77, 15, 78, 75, 71, 67, 3, 0, 64, 6, 66, 68], [105, 112, 39, 34, 101, 48, 50, 111, 118, 85, 124, 53, 23, 59, 21, 125, 90, 31, 82, 119, 30, 108, 56, 54, 117, 25, 93, 87, 60, 18, 96, 63, 127, 41, 19, 22, 52, 61, 122, 28, 123, 92, 88, 51, 103, 94, 89, 47, 95, 26, 37, 86, 16, 80, 57, 110, 121, 81, 38, 102, 84, 55, 29, 49, 91, 17, 97, 32, 45, 33, 11, 106, 44, 40, 107, 27, 99, 109, 62, 58, 114, 20, 78, 42, 120, 79, 14, 100, 104, 13, 116, 126, 46, 15, 24, 115, 36, 83, 12, 76, 35, 43, 75, 113, 71, 77, 9, 6, 5, 74, 98, 10, 73, 72, 8, 69, 7, 70, 2, 66, 68, 3, 65, 67, 0, 1, 64, 4], [50, 48, 105, 124, 34, 111, 38, 112, 117, 108, 119, 90, 57, 54, 39, 106, 116, 94, 114, 120, 122, 125, 118, 109, 127, 42, 63, 110, 103, 126, 52, 51, 93, 49, 92, 45, 85, 35, 82, 23, 59, 36, 31, 60, 107, 113, 55, 46, 58, 29, 104, 87, 9, 101, 47, 61, 83, 62, 40, 121, 89, 56, 53, 99, 97, 123, 44, 80, 96, 43, 115, 28, 91, 76, 37, 100, 24, 19, 88, 102, 78, 71, 16, 14, 33, 32, 86, 77, 22, 30, 95, 26, 41, 27, 84, 25, 74, 98, 20, 79, 72, 17, 13, 21, 15, 18, 7, 68, 73, 75, 11, 6, 10, 2, 67, 8, 12, 4, 81, 69, 70, 66, 64, 3, 5, 0, 1, 65], [105, 101, 34, 112, 48, 50, 89, 31, 39, 85, 83, 124, 23, 111, 87, 80, 54, 14, 82, 76, 118, 123, 59, 25, 41, 93, 21, 119, 16, 29, 63, 125, 108, 53, 19, 95, 56, 9, 117, 103, 94, 49, 116, 84, 90, 22, 20, 60, 15, 75, 52, 122, 47, 71, 26, 24, 109, 46, 74, 127, 17, 28, 32, 35, 11, 33, 57, 12, 38, 96, 62, 102, 42, 120, 92, 91, 113, 30, 114, 77, 13, 79, 107, 10, 88, 73, 104, 70, 121, 43, 78, 97, 100, 61, 81, 18, 99, 44, 27, 72, 51, 6, 106, 58, 8, 110, 55, 5, 115, 126, 7, 86, 40, 36, 69, 98, 45, 37, 68, 67, 3, 66, 4, 2, 65, 1, 0, 64], [50, 101, 38, 105, 34, 112, 118, 59, 119, 83, 31, 85, 89, 37, 23, 111, 90, 78, 49, 61, 103, 124, 21, 20, 74, 71, 125, 52, 88, 113, 53, 57, 19, 122, 104, 92, 11, 44, 106, 82, 54, 14, 109, 48, 63, 29, 22, 123, 40, 51, 28, 96, 76, 79, 108, 8, 41, 17, 9, 87, 60, 18, 68, 102, 16, 127, 56, 107, 126, 25, 114, 42, 72, 121, 24, 84, 30, 110, 93, 13, 27, 10, 117, 100, 62, 46, 39, 116, 4, 47, 35, 15, 81, 58, 91, 2, 115, 67, 45, 97, 120, 32, 94, 69, 33, 43, 7, 86, 26, 99, 55, 80, 77, 6, 1, 64, 36, 95, 70, 66, 5, 65, 0, 3, 75, 98, 12, 73], [110, 39, 33, 31, 46, 85, 70, 16, 53, 10, 14, 72, 94, 82, 12, 75, 68, 3, 0, 2, 1, 9, 65, 66, 6, 50, 67, 8, 104, 21, 86, 13, 120, 124, 17, 22, 4, 79, 18, 83, 57, 54, 118, 63, 90, 11, 69, 58, 123, 73, 119, 51, 126, 103, 78, 7, 127, 41, 24, 20, 99, 38, 115, 87, 80, 88, 29, 121, 56, 49, 101, 71, 102, 114, 35, 76, 98, 28, 74, 81, 36, 23, 84, 108, 125, 89, 47, 19, 96, 77, 113, 107, 45, 111, 100, 64, 5, 40, 61, 48, 117, 95, 34, 92, 26, 25, 91, 42, 52, 55, 37, 122, 93, 116, 112, 27, 105, 15, 43, 62, 106, 60, 59, 30, 109, 44, 32, 97], [110, 39, 33, 31, 46, 12, 94, 10, 6, 16, 82, 4, 72, 14, 85, 50, 74, 2, 79, 53, 70, 17, 86, 67, 115, 124, 51, 69, 21, 23, 28, 24, 76, 80, 120, 22, 57, 78, 8, 63, 117, 75, 18, 123, 66, 73, 127, 65, 126, 104, 49, 84, 118, 103, 88, 56, 58, 96, 64, 48, 77, 90, 13, 54, 20, 32, 9, 68, 121, 0, 38, 122, 108, 41, 62, 7, 11, 101, 81, 83, 59, 113, 45, 43, 93, 87, 44, 55, 47, 15, 89, 112, 27, 109, 92, 119, 114, 107, 25, 116, 5, 98, 125, 19, 95, 105, 99, 60, 91, 40, 34, 36, 26, 61, 37, 1, 111, 29, 52, 71, 35, 100, 102, 106, 42, 3, 30, 97], [39, 33, 110, 53, 31, 23, 85, 46, 82, 79, 94, 16, 12, 14, 88, 19, 104, 10, 126, 57, 17, 112, 50, 70, 75, 51, 5, 63, 72, 124, 66, 115, 68, 120, 106, 64, 65, 71, 86, 81, 60, 48, 84, 55, 8, 123, 9, 125, 47, 11, 108, 56, 21, 40, 42, 3, 121, 41, 59, 24, 116, 95, 25, 49, 91, 119, 58, 38, 101, 96, 54, 22, 111, 117, 99, 122, 87, 113, 35, 18, 26, 105, 32, 34, 73, 118, 90, 127, 78, 29, 37, 100, 107, 114, 83, 27, 15, 44, 13, 45, 61, 102, 28, 109, 52, 89, 7, 80, 98, 67, 20, 93, 43, 36, 62, 74, 103, 92, 4, 2, 6, 76, 30, 0, 69, 77, 1, 97], [110, 33, 39, 31, 85, 46, 94, 24, 74, 82, 16, 14, 72, 10, 12, 86, 88, 83, 22, 53, 79, 23, 4, 13, 124, 51, 73, 90, 50, 80, 126, 5, 115, 123, 11, 120, 57, 71, 6, 78, 18, 63, 2, 70, 8, 48, 17, 75, 9, 103, 21, 67, 58, 119, 65, 118, 91, 106, 68, 56, 30, 104, 15, 69, 84, 92, 121, 87, 49, 19, 54, 40, 28, 41, 117, 77, 61, 59, 81, 125, 76, 36, 114, 20, 26, 27, 111, 99, 127, 7, 25, 62, 3, 32, 55, 60, 35, 42, 96, 112, 101, 89, 45, 43, 47, 113, 44, 93, 34, 38, 100, 102, 52, 105, 98, 108, 109, 66, 37, 116, 122, 1, 64, 95, 97, 29, 107, 0], [112, 40, 34, 53, 23, 27, 122, 29, 19, 81, 76, 21, 25, 87, 63, 31, 56, 79, 118, 119, 77, 59, 15, 93, 62, 57, 30, 16, 72, 90, 47, 9, 82, 18, 26, 11, 92, 97, 109, 125, 33, 95, 78, 101, 123, 32, 44, 96, 49, 83, 35, 105, 73, 120, 24, 51, 86, 80, 106, 121, 103, 88, 98, 55, 36, 14, 22, 52, 38, 10, 71, 46, 67, 12, 28, 89, 91, 110, 127, 111, 61, 6, 54, 37, 107, 13, 94, 60, 42, 85, 84, 17, 102, 58, 5, 41, 99, 43, 113, 126, 74, 20, 115, 124, 108, 48, 45, 100, 8, 69, 116, 7, 39, 114, 50, 70, 4, 104, 117, 68, 2, 3, 1, 75, 65, 66, 0, 64], [112, 53, 40, 38, 34, 56, 47, 119, 44, 43, 52, 63, 62, 49, 122, 117, 114, 125, 61, 116, 120, 111, 109, 45, 123, 55, 31, 23, 25, 95, 59, 60, 126, 51, 118, 29, 106, 42, 127, 57, 107, 50, 54, 121, 27, 92, 41, 58, 24, 115, 46, 16, 113, 105, 48, 30, 108, 39, 110, 124, 84, 21, 93, 82, 96, 103, 88, 101, 19, 35, 22, 100, 90, 89, 102, 37, 36, 33, 32, 18, 87, 26, 17, 97, 99, 81, 86, 28, 20, 94, 85, 80, 77, 11, 14, 98, 78, 83, 13, 15, 12, 91, 76, 3, 74, 68, 8, 9, 10, 65, 7, 6, 79, 71, 4, 72, 75, 66, 73, 70, 2, 104, 5, 0, 69, 1, 67, 64], [53, 40, 112, 34, 19, 23, 81, 27, 56, 125, 21, 77, 93, 47, 82, 57, 122, 88, 120, 58, 25, 76, 59, 11, 9, 28, 14, 118, 61, 95, 62, 67, 79, 70, 104, 30, 91, 114, 92, 31, 26, 36, 44, 80, 16, 73, 119, 107, 52, 15, 7, 72, 90, 121, 20, 29, 63, 83, 126, 6, 85, 99, 12, 116, 105, 100, 18, 86, 106, 48, 60, 38, 22, 43, 78, 110, 115, 127, 69, 24, 50, 97, 87, 46, 103, 1, 84, 35, 123, 13, 41, 17, 4, 5, 89, 94, 32, 55, 10, 54, 102, 37, 98, 96, 74, 109, 45, 42, 108, 49, 111, 124, 75, 33, 51, 39, 117, 68, 113, 101, 3, 65, 71, 8, 0, 2, 64, 66], [40, 53, 34, 112, 91, 19, 21, 29, 81, 122, 79, 73, 93, 76, 6, 56, 16, 11, 47, 125, 52, 8, 63, 70, 67, 59, 62, 65, 95, 72, 17, 119, 9, 118, 27, 126, 57, 1, 12, 38, 104, 10, 120, 74, 77, 71, 61, 7, 121, 58, 86, 85, 24, 44, 30, 75, 23, 88, 84, 4, 3, 31, 68, 15, 28, 13, 78, 32, 83, 114, 14, 80, 46, 89, 20, 82, 25, 90, 123, 92, 87, 0, 2, 94, 117, 69, 26, 66, 36, 96, 18, 22, 5, 100, 35, 99, 60, 45, 37, 54, 51, 124, 108, 97, 33, 55, 101, 105, 102, 42, 103, 41, 107, 43, 49, 127, 64, 39, 106, 109, 111, 115, 110, 50, 113, 116, 48, 98]], "model.layers.4.self_attn.k_proj": [[37, 53, 56, 123, 122, 97, 117, 106, 30, 84, 16, 27, 87, 82, 13, 26, 75, 100, 50, 6, 40, 77, 73, 83, 78, 31, 42, 72, 17, 28, 64, 86, 103, 65, 38, 108, 21, 94, 35, 120, 5, 68, 118, 60, 45, 25, 47, 3, 74, 61, 127, 101, 2, 0, 89, 126, 24, 88, 43, 29, 58, 79, 52, 96, 113, 110, 55, 51, 4, 46, 49, 125, 119, 54, 66, 33, 109, 62, 105, 104, 57, 59, 121, 115, 63, 116, 41, 91, 111, 8, 10, 12, 112, 85, 44, 107, 23, 98, 93, 71, 34, 114, 22, 48, 124, 67, 32, 99, 39, 70, 92, 76, 14, 1, 11, 7, 90, 36, 102, 18, 95, 69, 19, 81, 80, 15, 20, 9], [103, 118, 52, 87, 97, 95, 20, 0, 13, 82, 81, 79, 70, 11, 9, 10, 68, 90, 3, 8, 66, 1, 47, 28, 110, 42, 53, 14, 63, 5, 71, 121, 86, 43, 22, 127, 55, 26, 65, 113, 92, 123, 2, 120, 69, 58, 46, 74, 122, 44, 88, 109, 48, 102, 39, 56, 61, 107, 125, 115, 16, 36, 89, 67, 57, 117, 126, 54, 50, 62, 73, 64, 45, 59, 29, 80, 93, 124, 114, 106, 116, 7, 38, 78, 6, 104, 100, 25, 96, 40, 27, 60, 108, 51, 83, 77, 37, 15, 119, 112, 101, 19, 99, 85, 105, 32, 21, 91, 30, 12, 76, 72, 98, 111, 4, 94, 41, 34, 35, 18, 49, 33, 24, 23, 17, 84, 75, 31], [103, 52, 118, 124, 35, 86, 92, 83, 79, 13, 1, 74, 72, 75, 31, 0, 16, 4, 70, 81, 105, 71, 26, 3, 21, 69, 2, 117, 116, 24, 53, 28, 14, 94, 5, 39, 12, 61, 112, 123, 41, 46, 95, 121, 82, 22, 56, 113, 54, 47, 55, 106, 84, 58, 59, 99, 48, 122, 9, 89, 98, 49, 126, 115, 50, 30, 40, 96, 34, 114, 63, 45, 23, 119, 111, 60, 51, 108, 100, 125, 17, 101, 7, 42, 107, 127, 87, 36, 93, 104, 67, 62, 43, 88, 20, 102, 85, 120, 33, 109, 29, 57, 91, 37, 44, 76, 32, 97, 38, 110, 90, 27, 25, 18, 73, 77, 80, 66, 19, 78, 68, 6, 64, 15, 65, 10, 11, 8], [109, 38, 0, 95, 33, 45, 89, 124, 66, 65, 17, 6, 79, 4, 10, 76, 2, 112, 64, 9, 69, 11, 126, 60, 56, 84, 59, 115, 61, 3, 47, 50, 18, 125, 52, 7, 105, 63, 118, 68, 78, 39, 116, 111, 46, 67, 40, 113, 110, 122, 121, 57, 23, 8, 127, 58, 51, 22, 44, 86, 53, 83, 88, 13, 70, 16, 108, 104, 117, 41, 92, 5, 37, 43, 119, 99, 48, 103, 101, 42, 90, 96, 80, 54, 29, 30, 55, 71, 77, 91, 123, 120, 35, 85, 94, 32, 93, 98, 62, 100, 114, 107, 26, 75, 102, 27, 36, 106, 87, 24, 28, 73, 19, 49, 34, 82, 1, 21, 74, 72, 97, 12, 15, 14, 81, 20, 25, 31], [40, 52, 59, 63, 98, 75, 26, 95, 17, 119, 78, 116, 22, 10, 77, 20, 0, 2, 6, 4, 83, 7, 118, 29, 113, 72, 123, 124, 110, 67, 18, 60, 125, 70, 111, 3, 23, 28, 50, 122, 15, 115, 55, 56, 54, 46, 121, 57, 48, 53, 47, 49, 120, 87, 62, 51, 117, 126, 114, 58, 127, 16, 61, 89, 106, 24, 9, 109, 112, 12, 43, 42, 45, 44, 21, 5, 103, 91, 108, 65, 107, 105, 8, 64, 41, 36, 39, 11, 102, 37, 27, 92, 99, 101, 93, 84, 38, 104, 35, 33, 31, 94, 32, 14, 90, 30, 96, 86, 97, 100, 69, 85, 66, 1, 25, 19, 74, 88, 80, 82, 71, 13, 79, 34, 81, 73, 76, 68], [41, 48, 98, 50, 37, 95, 23, 89, 82, 93, 90, 118, 114, 76, 80, 85, 74, 65, 44, 83, 64, 14, 59, 6, 77, 47, 53, 19, 68, 124, 86, 16, 27, 81, 20, 8, 111, 102, 9, 78, 66, 121, 125, 17, 71, 7, 112, 67, 119, 84, 106, 39, 94, 12, 32, 54, 63, 116, 117, 88, 72, 113, 96, 26, 62, 42, 69, 110, 115, 127, 3, 87, 22, 43, 107, 104, 4, 56, 51, 122, 60, 99, 58, 52, 123, 30, 15, 49, 92, 46, 108, 11, 38, 36, 61, 79, 35, 97, 33, 120, 45, 126, 91, 18, 28, 13, 109, 103, 70, 29, 40, 5, 75, 21, 2, 100, 57, 55, 10, 24, 0, 31, 25, 73, 1, 105, 101, 34], [46, 103, 110, 97, 53, 14, 0, 85, 95, 82, 16, 75, 12, 72, 17, 10, 66, 70, 71, 68, 79, 3, 51, 1, 118, 5, 124, 63, 30, 126, 8, 114, 11, 23, 40, 9, 50, 123, 90, 120, 56, 65, 86, 117, 77, 119, 57, 81, 113, 125, 122, 58, 112, 88, 67, 18, 121, 83, 84, 19, 116, 87, 62, 47, 32, 20, 127, 29, 78, 91, 44, 89, 28, 25, 93, 6, 24, 54, 2, 69, 92, 107, 27, 37, 42, 43, 76, 105, 100, 101, 34, 15, 22, 35, 59, 45, 7, 109, 99, 102, 36, 60, 55, 98, 108, 61, 115, 26, 96, 48, 111, 49, 80, 106, 41, 33, 38, 73, 52, 13, 94, 104, 74, 64, 4, 21, 31, 39], [104, 53, 112, 98, 48, 21, 11, 81, 19, 16, 0, 27, 93, 5, 73, 77, 117, 76, 2, 79, 8, 6, 65, 59, 122, 111, 25, 62, 102, 68, 110, 69, 57, 67, 72, 30, 50, 71, 56, 3, 84, 116, 15, 52, 94, 92, 74, 125, 91, 45, 121, 119, 31, 39, 58, 63, 13, 108, 70, 44, 55, 54, 7, 46, 61, 107, 90, 51, 32, 105, 118, 126, 28, 88, 109, 66, 41, 22, 96, 10, 114, 82, 113, 106, 86, 100, 60, 115, 120, 26, 124, 99, 24, 64, 127, 97, 23, 33, 29, 38, 103, 36, 14, 78, 42, 1, 95, 47, 43, 49, 35, 9, 18, 37, 101, 123, 87, 89, 75, 12, 85, 20, 83, 17, 4, 80, 40, 34]], "model.layers.4.self_attn.qk_proj": [[52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 50, 56, 48, 122, 117, 123, 39, 95, 103, 33, 41, 104, 31, 87, 17, 23, 11, 75, 81, 15, 38, 79, 85, 21, 28, 116, 102, 119, 40, 74, 18, 82, 42, 13, 80, 22, 10, 72, 97, 25, 64, 77, 20, 16, 6, 19, 83, 0, 12, 84, 70, 89, 91, 26, 86, 90, 78, 14, 76, 68, 125, 4, 37, 99, 61, 8, 94, 73, 9, 101, 126, 47, 65, 1, 35, 2, 98, 111, 54, 113, 66, 60, 114, 105, 51, 120, 29, 121, 92, 71, 7, 57, 108, 62, 67, 115, 127, 30, 44, 93, 5, 88, 27, 69, 3, 34, 106, 49, 24, 55, 107, 58, 43, 96, 32, 36, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 59, 63, 50, 48, 56, 117, 122, 123, 39, 95, 103, 33, 41, 31, 87, 104, 11, 23, 17, 81, 15, 38, 116, 28, 0, 79, 40, 85, 75, 64, 21, 74, 18, 72, 10, 97, 102, 25, 89, 42, 22, 6, 70, 19, 82, 26, 20, 83, 91, 68, 77, 16, 80, 12, 13, 125, 78, 119, 1, 90, 14, 84, 37, 4, 76, 86, 98, 101, 54, 105, 94, 65, 66, 113, 2, 47, 9, 111, 121, 73, 126, 35, 8, 114, 61, 120, 108, 99, 57, 51, 62, 3, 115, 60, 29, 69, 71, 5, 67, 92, 106, 7, 43, 58, 34, 24, 93, 127, 27, 30, 49, 44, 55, 88, 107, 96, 32, 100, 36], [52, 118, 53, 124, 46, 109, 112, 45, 110, 63, 59, 50, 56, 122, 117, 48, 123, 39, 103, 95, 33, 41, 87, 104, 31, 23, 11, 15, 81, 0, 38, 72, 70, 75, 85, 17, 28, 64, 18, 116, 74, 82, 10, 79, 21, 13, 80, 40, 16, 22, 6, 97, 4, 83, 25, 42, 102, 19, 77, 20, 14, 12, 91, 26, 89, 84, 65, 68, 78, 76, 1, 90, 119, 125, 54, 86, 37, 66, 105, 2, 98, 94, 99, 121, 9, 8, 101, 126, 47, 114, 73, 3, 61, 120, 35, 111, 67, 5, 113, 7, 71, 69, 93, 57, 60, 108, 92, 51, 24, 29, 62, 30, 115, 44, 34, 88, 127, 58, 55, 27, 106, 49, 107, 43, 96, 32, 36, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 59, 63, 50, 56, 122, 48, 117, 39, 123, 95, 103, 33, 41, 31, 104, 87, 79, 23, 17, 11, 64, 81, 75, 72, 18, 38, 0, 70, 28, 85, 15, 21, 102, 74, 80, 97, 25, 116, 13, 40, 19, 91, 10, 83, 82, 16, 42, 6, 89, 4, 14, 22, 84, 77, 12, 86, 20, 76, 119, 78, 26, 94, 90, 98, 68, 2, 121, 65, 125, 66, 37, 101, 1, 47, 99, 105, 54, 9, 8, 114, 67, 113, 73, 3, 69, 60, 111, 120, 115, 35, 126, 61, 71, 58, 34, 29, 92, 93, 5, 88, 57, 108, 30, 44, 7, 62, 51, 24, 127, 43, 27, 106, 49, 107, 55, 96, 32, 36, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 56, 117, 50, 122, 48, 123, 95, 103, 39, 41, 33, 104, 87, 79, 64, 11, 75, 81, 17, 31, 70, 72, 23, 0, 38, 40, 85, 6, 15, 116, 28, 18, 21, 80, 82, 10, 74, 13, 97, 16, 22, 83, 84, 4, 42, 77, 12, 19, 25, 78, 76, 102, 26, 20, 91, 2, 89, 68, 86, 8, 65, 119, 14, 66, 120, 1, 90, 98, 9, 94, 61, 125, 113, 73, 37, 126, 47, 111, 101, 99, 121, 35, 114, 7, 67, 115, 105, 51, 57, 69, 29, 5, 60, 71, 3, 93, 92, 54, 30, 34, 108, 62, 44, 58, 106, 127, 88, 27, 24, 49, 107, 55, 43, 36, 96, 32, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 59, 63, 50, 48, 56, 122, 117, 123, 95, 39, 103, 33, 41, 104, 81, 31, 87, 79, 11, 17, 23, 75, 0, 38, 85, 64, 15, 28, 82, 21, 40, 70, 10, 6, 72, 4, 102, 80, 18, 116, 13, 74, 97, 25, 12, 16, 77, 78, 76, 83, 20, 22, 8, 91, 84, 86, 68, 19, 89, 42, 119, 14, 90, 26, 66, 125, 47, 126, 2, 1, 73, 101, 61, 94, 65, 98, 37, 54, 99, 105, 9, 113, 35, 5, 111, 29, 121, 114, 3, 71, 120, 7, 51, 115, 34, 49, 62, 60, 67, 57, 92, 108, 69, 58, 127, 93, 107, 24, 106, 27, 30, 44, 88, 43, 55, 96, 36, 32, 100], [52, 118, 53, 124, 109, 46, 45, 112, 110, 59, 63, 50, 56, 48, 122, 117, 123, 39, 103, 95, 33, 41, 104, 17, 31, 87, 79, 23, 81, 38, 11, 40, 85, 75, 28, 25, 82, 116, 18, 15, 21, 6, 74, 0, 42, 102, 13, 22, 84, 97, 76, 26, 78, 10, 20, 64, 14, 83, 16, 80, 70, 72, 19, 12, 89, 86, 119, 8, 68, 91, 4, 77, 90, 47, 61, 37, 101, 126, 1, 94, 125, 120, 9, 66, 2, 73, 98, 65, 121, 54, 113, 35, 29, 99, 111, 114, 57, 60, 71, 105, 7, 34, 93, 127, 51, 67, 62, 3, 58, 92, 108, 69, 115, 27, 5, 44, 30, 88, 24, 106, 55, 49, 43, 107, 96, 36, 100, 32], [52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 50, 56, 122, 48, 117, 123, 39, 95, 103, 33, 41, 104, 81, 87, 23, 17, 79, 75, 31, 85, 28, 38, 6, 15, 11, 21, 40, 64, 82, 116, 74, 10, 25, 18, 97, 76, 0, 42, 16, 77, 70, 22, 19, 84, 80, 78, 8, 119, 13, 83, 89, 20, 26, 91, 102, 14, 12, 86, 68, 4, 101, 37, 125, 66, 72, 1, 90, 65, 9, 54, 111, 98, 94, 60, 35, 126, 44, 105, 73, 120, 113, 57, 2, 93, 47, 121, 61, 7, 99, 29, 51, 24, 69, 58, 92, 67, 114, 3, 27, 71, 115, 108, 34, 5, 30, 127, 62, 106, 43, 88, 49, 55, 107, 96, 36, 32, 100], [52, 118, 53, 124, 46, 109, 112, 45, 110, 59, 63, 50, 56, 48, 122, 117, 123, 39, 95, 103, 33, 41, 87, 104, 23, 81, 31, 17, 11, 79, 85, 21, 75, 38, 82, 6, 15, 40, 97, 10, 64, 74, 83, 0, 28, 18, 80, 25, 8, 116, 22, 20, 16, 13, 42, 102, 70, 91, 76, 89, 78, 19, 84, 86, 77, 90, 12, 68, 54, 14, 119, 26, 125, 105, 4, 37, 98, 101, 66, 47, 72, 94, 111, 65, 1, 2, 61, 121, 99, 9, 126, 120, 73, 58, 113, 35, 34, 92, 93, 29, 57, 114, 51, 115, 7, 67, 108, 69, 62, 60, 3, 30, 71, 24, 44, 27, 106, 5, 43, 55, 88, 127, 49, 32, 107, 96, 36, 100], [52, 118, 53, 124, 109, 46, 45, 112, 110, 59, 63, 50, 48, 56, 122, 117, 123, 39, 103, 95, 33, 87, 41, 81, 23, 31, 104, 17, 75, 15, 21, 38, 11, 28, 85, 116, 79, 42, 18, 25, 82, 74, 40, 97, 83, 102, 0, 78, 8, 84, 16, 13, 19, 10, 90, 6, 22, 64, 12, 80, 89, 86, 26, 91, 20, 77, 70, 68, 14, 76, 119, 37, 4, 125, 101, 105, 94, 99, 54, 1, 111, 65, 47, 121, 98, 120, 72, 60, 73, 114, 126, 9, 92, 35, 66, 29, 2, 71, 61, 24, 34, 69, 7, 3, 115, 27, 113, 67, 51, 106, 57, 93, 44, 108, 5, 127, 49, 88, 58, 30, 62, 55, 107, 43, 96, 36, 32, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 56, 48, 50, 122, 117, 39, 123, 95, 103, 33, 41, 104, 31, 87, 75, 11, 64, 17, 79, 8, 81, 15, 23, 6, 82, 38, 28, 10, 40, 0, 21, 116, 85, 42, 70, 80, 18, 102, 97, 13, 74, 16, 83, 68, 22, 19, 77, 12, 119, 25, 20, 84, 91, 78, 26, 4, 14, 76, 86, 89, 47, 1, 66, 2, 65, 72, 101, 99, 90, 94, 9, 73, 98, 125, 37, 126, 121, 114, 105, 35, 67, 115, 127, 61, 111, 5, 60, 120, 69, 54, 113, 71, 92, 7, 3, 57, 108, 106, 29, 58, 51, 93, 30, 62, 24, 34, 107, 43, 49, 44, 27, 88, 55, 96, 36, 32, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 59, 63, 56, 50, 122, 48, 117, 123, 95, 39, 33, 103, 41, 104, 31, 15, 87, 23, 81, 75, 8, 17, 116, 64, 28, 70, 11, 40, 38, 85, 102, 18, 10, 6, 21, 0, 82, 16, 97, 79, 83, 42, 25, 77, 68, 13, 91, 19, 12, 20, 74, 22, 84, 80, 119, 89, 26, 125, 4, 14, 66, 76, 86, 78, 113, 2, 90, 101, 9, 61, 72, 65, 94, 47, 121, 51, 98, 73, 126, 111, 1, 37, 120, 105, 99, 54, 67, 35, 57, 7, 34, 69, 115, 60, 5, 93, 114, 92, 3, 62, 108, 55, 29, 71, 44, 30, 106, 127, 24, 58, 27, 88, 43, 49, 107, 32, 96, 36, 100], [52, 118, 53, 124, 109, 46, 45, 112, 110, 50, 59, 63, 48, 56, 122, 117, 123, 39, 103, 95, 33, 41, 104, 87, 17, 81, 31, 23, 38, 40, 28, 85, 75, 15, 11, 79, 25, 116, 21, 82, 83, 97, 16, 10, 22, 0, 8, 70, 18, 102, 42, 86, 26, 74, 6, 20, 89, 12, 84, 14, 80, 13, 76, 91, 19, 119, 90, 77, 78, 64, 37, 4, 101, 94, 68, 9, 72, 125, 98, 113, 1, 65, 29, 35, 105, 111, 61, 126, 2, 47, 99, 54, 60, 66, 51, 93, 121, 120, 7, 114, 73, 34, 57, 3, 27, 92, 58, 24, 71, 44, 108, 55, 5, 115, 88, 62, 49, 69, 127, 106, 67, 30, 43, 107, 36, 96, 32, 100], [52, 118, 53, 124, 109, 46, 45, 112, 110, 59, 63, 50, 56, 48, 117, 122, 123, 95, 39, 33, 103, 41, 87, 31, 104, 81, 17, 11, 23, 38, 75, 28, 79, 70, 15, 0, 85, 40, 64, 18, 10, 74, 8, 25, 97, 16, 22, 42, 102, 21, 82, 83, 116, 80, 86, 6, 84, 77, 78, 20, 12, 13, 14, 19, 91, 76, 90, 68, 72, 89, 119, 4, 9, 26, 1, 37, 125, 98, 47, 65, 2, 66, 101, 99, 105, 94, 51, 7, 61, 121, 54, 111, 35, 126, 73, 29, 120, 92, 113, 5, 60, 57, 108, 114, 67, 34, 115, 44, 127, 27, 71, 88, 3, 62, 93, 69, 58, 55, 24, 49, 30, 106, 43, 107, 36, 96, 32, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 59, 63, 50, 56, 48, 122, 117, 123, 39, 95, 33, 103, 41, 104, 87, 31, 75, 11, 23, 17, 38, 116, 70, 15, 74, 0, 81, 79, 21, 85, 42, 64, 28, 40, 119, 82, 6, 18, 16, 97, 102, 72, 19, 8, 25, 10, 13, 83, 22, 20, 80, 89, 84, 14, 4, 68, 78, 91, 77, 12, 76, 86, 26, 2, 90, 125, 1, 94, 9, 98, 37, 66, 65, 101, 120, 47, 73, 121, 126, 54, 3, 67, 7, 111, 105, 44, 99, 35, 92, 113, 57, 5, 115, 51, 61, 60, 114, 93, 27, 69, 29, 34, 71, 30, 49, 127, 108, 62, 88, 24, 55, 58, 106, 43, 107, 36, 96, 100, 32], [52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 50, 56, 48, 122, 117, 123, 95, 39, 103, 41, 33, 104, 87, 31, 11, 75, 23, 81, 38, 79, 17, 74, 28, 15, 70, 85, 64, 40, 18, 20, 116, 97, 102, 21, 16, 0, 72, 25, 6, 13, 10, 82, 91, 19, 80, 83, 84, 89, 22, 42, 86, 12, 76, 119, 125, 77, 4, 90, 68, 78, 8, 14, 94, 101, 66, 120, 26, 54, 73, 2, 98, 37, 111, 9, 1, 35, 126, 47, 105, 65, 34, 113, 99, 61, 51, 108, 29, 57, 121, 114, 3, 115, 60, 71, 7, 92, 67, 62, 93, 69, 5, 106, 30, 127, 58, 107, 44, 24, 43, 88, 49, 55, 27, 96, 100, 36, 32], [52, 118, 53, 124, 46, 109, 45, 112, 110, 63, 59, 50, 122, 56, 48, 117, 123, 39, 103, 95, 33, 41, 87, 23, 17, 31, 81, 15, 38, 11, 85, 79, 28, 75, 104, 72, 82, 18, 21, 74, 16, 25, 40, 14, 20, 22, 97, 6, 86, 89, 19, 10, 0, 102, 70, 84, 76, 77, 116, 91, 83, 64, 68, 13, 42, 12, 78, 80, 90, 4, 26, 125, 8, 47, 1, 9, 101, 94, 119, 105, 61, 37, 65, 98, 73, 99, 126, 113, 29, 2, 121, 66, 120, 114, 35, 111, 7, 71, 60, 34, 51, 54, 5, 58, 92, 3, 57, 24, 69, 93, 67, 88, 127, 108, 27, 115, 55, 62, 44, 30, 49, 106, 43, 96, 107, 36, 32, 100], [52, 118, 53, 124, 46, 109, 45, 112, 110, 63, 59, 50, 56, 122, 48, 39, 123, 117, 103, 95, 33, 41, 87, 31, 17, 104, 15, 75, 81, 21, 85, 23, 72, 82, 11, 28, 38, 79, 6, 18, 16, 64, 42, 10, 40, 74, 116, 89, 83, 0, 97, 70, 20, 25, 22, 12, 19, 14, 13, 102, 125, 84, 78, 80, 76, 77, 91, 86, 68, 90, 119, 26, 4, 105, 9, 126, 37, 99, 1, 98, 94, 66, 65, 8, 101, 73, 121, 2, 51, 111, 113, 3, 54, 47, 60, 7, 120, 67, 29, 127, 114, 61, 35, 71, 93, 5, 92, 57, 115, 34, 69, 44, 49, 108, 24, 62, 27, 88, 30, 43, 58, 55, 106, 107, 32, 96, 36, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 50, 56, 122, 48, 117, 123, 39, 95, 103, 33, 41, 31, 104, 17, 87, 75, 38, 23, 15, 0, 72, 6, 81, 79, 21, 11, 85, 116, 82, 10, 40, 97, 16, 28, 64, 89, 18, 70, 83, 19, 42, 20, 74, 102, 25, 13, 76, 14, 84, 80, 119, 22, 77, 90, 78, 68, 91, 26, 12, 4, 65, 37, 86, 101, 66, 98, 1, 8, 111, 9, 105, 73, 125, 94, 114, 2, 120, 3, 99, 35, 47, 67, 126, 121, 29, 60, 113, 51, 54, 61, 62, 34, 69, 93, 7, 30, 57, 92, 5, 71, 44, 115, 108, 88, 127, 24, 49, 27, 43, 58, 55, 106, 107, 96, 32, 36, 100], [52, 118, 53, 124, 46, 109, 45, 112, 110, 63, 59, 50, 56, 48, 122, 117, 123, 39, 95, 33, 103, 41, 17, 31, 87, 81, 23, 75, 11, 116, 104, 38, 15, 79, 28, 72, 21, 85, 74, 10, 0, 6, 40, 64, 70, 20, 18, 97, 16, 83, 19, 77, 82, 102, 90, 78, 84, 68, 25, 89, 13, 91, 76, 22, 4, 80, 42, 119, 125, 86, 12, 26, 14, 94, 65, 8, 37, 1, 98, 9, 101, 73, 66, 54, 105, 99, 120, 113, 47, 2, 121, 111, 61, 35, 71, 29, 3, 126, 7, 58, 62, 5, 69, 60, 92, 34, 51, 115, 114, 57, 93, 67, 106, 108, 24, 43, 127, 88, 44, 49, 27, 55, 30, 96, 36, 32, 107, 100], [52, 118, 53, 124, 46, 109, 112, 45, 110, 63, 59, 56, 50, 122, 117, 48, 39, 123, 95, 33, 103, 41, 31, 87, 104, 11, 81, 79, 23, 75, 17, 38, 18, 15, 85, 6, 72, 21, 28, 97, 40, 102, 10, 83, 0, 74, 64, 119, 82, 16, 116, 91, 25, 70, 77, 89, 76, 22, 80, 4, 19, 84, 42, 13, 68, 20, 12, 121, 26, 86, 78, 90, 14, 8, 47, 99, 105, 125, 94, 65, 66, 73, 120, 9, 37, 98, 114, 61, 1, 2, 101, 35, 34, 60, 126, 113, 54, 111, 29, 51, 5, 3, 69, 57, 24, 71, 108, 115, 92, 62, 7, 106, 30, 67, 93, 127, 58, 49, 43, 44, 88, 27, 55, 107, 96, 32, 36, 100], [52, 118, 53, 124, 46, 109, 112, 45, 110, 63, 59, 50, 56, 48, 122, 117, 123, 39, 95, 103, 33, 31, 41, 104, 87, 17, 38, 79, 75, 81, 85, 23, 15, 64, 116, 28, 11, 21, 0, 70, 18, 102, 83, 97, 89, 72, 40, 19, 82, 42, 26, 10, 80, 74, 91, 16, 6, 25, 77, 14, 84, 13, 20, 78, 4, 8, 22, 76, 119, 12, 68, 90, 1, 86, 37, 65, 66, 98, 47, 125, 73, 105, 94, 101, 9, 99, 60, 113, 3, 2, 126, 54, 111, 115, 57, 114, 61, 121, 35, 92, 7, 24, 69, 29, 120, 5, 67, 62, 51, 34, 49, 27, 108, 127, 93, 43, 71, 30, 58, 44, 106, 107, 88, 55, 32, 96, 100, 36], [52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 50, 56, 122, 48, 123, 117, 39, 95, 103, 33, 41, 87, 104, 31, 81, 11, 38, 23, 15, 17, 21, 28, 40, 85, 79, 0, 116, 75, 70, 74, 10, 16, 83, 97, 18, 22, 19, 64, 89, 82, 8, 84, 25, 77, 80, 6, 20, 42, 86, 119, 12, 91, 26, 72, 76, 13, 102, 4, 90, 78, 68, 37, 14, 66, 94, 65, 98, 1, 73, 125, 105, 101, 2, 47, 9, 35, 121, 111, 61, 120, 29, 113, 99, 114, 3, 54, 126, 51, 92, 60, 69, 34, 7, 24, 93, 30, 71, 58, 115, 27, 67, 57, 106, 5, 127, 62, 49, 108, 44, 43, 88, 107, 55, 96, 32, 100, 36], [52, 118, 53, 124, 109, 46, 45, 112, 110, 59, 63, 50, 48, 56, 122, 117, 123, 39, 95, 33, 103, 41, 31, 87, 104, 11, 81, 23, 17, 79, 38, 85, 75, 74, 40, 15, 116, 64, 28, 97, 18, 70, 8, 83, 25, 22, 21, 102, 42, 91, 80, 16, 6, 82, 90, 119, 68, 76, 13, 77, 89, 10, 19, 12, 86, 84, 26, 20, 78, 72, 14, 4, 0, 98, 37, 101, 9, 111, 65, 66, 105, 94, 54, 126, 125, 120, 99, 47, 2, 1, 73, 61, 113, 35, 121, 51, 115, 127, 29, 60, 71, 34, 57, 55, 93, 114, 69, 3, 7, 62, 92, 5, 67, 108, 106, 24, 30, 27, 88, 58, 49, 44, 107, 43, 96, 100, 36, 32], [52, 118, 53, 124, 109, 46, 45, 112, 110, 59, 63, 50, 56, 122, 48, 117, 123, 39, 103, 95, 33, 41, 104, 81, 87, 31, 17, 11, 23, 116, 15, 38, 75, 70, 85, 21, 18, 40, 79, 28, 82, 74, 83, 22, 84, 8, 77, 119, 42, 16, 19, 25, 20, 97, 80, 0, 6, 89, 10, 26, 76, 64, 12, 91, 102, 14, 78, 90, 86, 68, 47, 13, 4, 72, 65, 1, 125, 101, 37, 94, 113, 126, 73, 98, 54, 111, 9, 35, 2, 61, 114, 66, 71, 29, 60, 57, 120, 127, 121, 105, 93, 99, 34, 108, 51, 3, 92, 7, 24, 62, 69, 67, 55, 30, 106, 115, 88, 5, 44, 58, 27, 49, 43, 107, 96, 32, 100, 36], [52, 118, 53, 124, 46, 109, 45, 112, 110, 63, 50, 59, 56, 48, 123, 117, 122, 39, 95, 103, 33, 41, 87, 31, 81, 23, 104, 75, 15, 85, 11, 79, 17, 38, 28, 8, 40, 70, 74, 116, 82, 21, 97, 25, 22, 64, 18, 83, 42, 10, 89, 16, 119, 0, 102, 19, 20, 6, 14, 91, 13, 77, 80, 84, 76, 12, 90, 78, 26, 4, 68, 101, 86, 37, 111, 125, 1, 9, 98, 61, 66, 120, 72, 73, 105, 47, 65, 2, 94, 60, 57, 35, 113, 121, 51, 99, 126, 58, 92, 54, 7, 29, 106, 34, 115, 93, 62, 127, 67, 108, 114, 24, 5, 3, 49, 69, 27, 43, 55, 44, 30, 88, 71, 107, 96, 32, 36, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 59, 63, 50, 56, 48, 117, 122, 123, 39, 103, 95, 33, 41, 87, 104, 31, 11, 81, 85, 23, 15, 97, 0, 17, 28, 75, 79, 21, 8, 40, 38, 82, 102, 70, 18, 25, 119, 74, 22, 116, 16, 83, 19, 20, 80, 13, 77, 91, 10, 6, 64, 12, 14, 42, 89, 4, 26, 84, 76, 68, 90, 78, 47, 86, 99, 94, 125, 101, 37, 105, 66, 65, 72, 1, 126, 98, 73, 60, 9, 57, 35, 113, 2, 111, 61, 114, 29, 54, 121, 92, 120, 3, 67, 7, 51, 127, 69, 93, 30, 24, 71, 5, 106, 34, 27, 62, 115, 49, 44, 108, 58, 88, 107, 43, 55, 96, 36, 32, 100], [52, 118, 53, 124, 46, 109, 45, 112, 110, 63, 59, 50, 56, 48, 122, 117, 123, 39, 95, 103, 33, 41, 104, 31, 11, 17, 87, 75, 81, 0, 8, 15, 23, 79, 64, 116, 74, 40, 85, 28, 6, 38, 42, 21, 18, 20, 10, 82, 83, 97, 70, 4, 80, 76, 77, 22, 25, 102, 16, 91, 13, 14, 78, 86, 19, 12, 89, 68, 1, 119, 26, 66, 84, 113, 125, 65, 90, 47, 98, 72, 120, 94, 9, 2, 101, 37, 105, 60, 111, 73, 54, 99, 121, 35, 126, 61, 51, 67, 114, 7, 71, 34, 57, 3, 29, 69, 5, 115, 92, 127, 44, 62, 93, 58, 88, 108, 49, 55, 30, 24, 27, 106, 43, 107, 36, 96, 32, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 63, 59, 50, 56, 48, 122, 117, 123, 39, 95, 33, 103, 41, 11, 31, 104, 87, 75, 81, 116, 38, 79, 8, 23, 28, 17, 6, 15, 21, 74, 0, 85, 40, 82, 80, 64, 97, 25, 10, 42, 83, 102, 18, 19, 70, 89, 76, 91, 22, 77, 14, 119, 68, 13, 84, 16, 20, 4, 86, 12, 78, 72, 26, 90, 101, 121, 2, 125, 1, 37, 73, 9, 105, 65, 47, 66, 114, 94, 113, 98, 120, 111, 35, 99, 51, 108, 54, 34, 115, 61, 71, 3, 60, 67, 7, 5, 92, 29, 69, 126, 57, 62, 93, 58, 24, 30, 127, 106, 55, 49, 88, 44, 27, 43, 107, 96, 32, 36, 100], [52, 118, 53, 124, 109, 46, 112, 45, 110, 59, 63, 50, 122, 48, 56, 117, 123, 39, 95, 33, 103, 41, 104, 31, 15, 87, 11, 75, 23, 17, 81, 6, 79, 38, 0, 28, 8, 85, 10, 18, 64, 22, 21, 82, 40, 97, 116, 16, 83, 13, 74, 25, 102, 77, 80, 70, 19, 86, 42, 76, 4, 91, 68, 78, 20, 12, 84, 90, 14, 89, 72, 26, 47, 66, 60, 1, 119, 125, 101, 73, 94, 37, 105, 2, 65, 98, 9, 126, 35, 113, 114, 61, 111, 120, 99, 121, 69, 7, 51, 62, 3, 54, 57, 29, 71, 108, 67, 5, 34, 115, 58, 93, 92, 106, 127, 30, 49, 24, 27, 44, 107, 88, 43, 55, 96, 32, 36, 100], [52, 118, 53, 124, 109, 46, 45, 112, 110, 63, 50, 59, 56, 122, 48, 117, 123, 103, 39, 95, 33, 41, 87, 31, 38, 81, 17, 23, 104, 85, 75, 11, 15, 6, 28, 0, 18, 79, 40, 42, 97, 89, 21, 10, 82, 25, 64, 70, 80, 102, 86, 74, 22, 8, 14, 20, 19, 26, 90, 68, 76, 77, 83, 116, 78, 72, 16, 12, 4, 13, 91, 37, 101, 119, 65, 84, 9, 1, 125, 66, 98, 111, 113, 126, 105, 73, 94, 61, 35, 47, 120, 2, 54, 99, 121, 69, 67, 51, 7, 29, 34, 114, 115, 60, 71, 62, 57, 93, 49, 58, 92, 27, 5, 44, 3, 127, 108, 24, 30, 106, 88, 55, 43, 107, 96, 36, 32, 100], [52, 118, 53, 124, 109, 46, 45, 112, 110, 59, 63, 50, 56, 48, 122, 123, 117, 39, 95, 103, 41, 33, 104, 31, 81, 87, 23, 11, 75, 15, 17, 85, 40, 79, 38, 6, 10, 42, 82, 74, 97, 25, 72, 0, 28, 20, 89, 64, 21, 102, 70, 116, 77, 18, 16, 19, 91, 80, 119, 76, 14, 125, 101, 83, 8, 13, 78, 26, 68, 22, 12, 84, 86, 47, 4, 90, 37, 111, 98, 54, 105, 126, 9, 66, 2, 121, 1, 94, 61, 65, 73, 120, 114, 99, 60, 35, 115, 29, 71, 113, 57, 62, 51, 108, 58, 67, 3, 93, 5, 7, 34, 92, 24, 69, 44, 106, 127, 88, 30, 27, 49, 43, 96, 55, 107, 32, 36, 100]], "model.layers.5.self_attn.q_proj": [[39, 106, 98, 50, 20, 80, 42, 23, 127, 116, 14, 49, 18, 10, 124, 51, 27, 47, 74, 61, 120, 103, 54, 12, 24, 112, 87, 57, 84, 71, 25, 115, 68, 48, 63, 29, 111, 123, 16, 96, 15, 26, 76, 113, 85, 82, 83, 9, 79, 88, 125, 52, 118, 92, 114, 28, 121, 91, 122, 21, 78, 93, 41, 62, 94, 99, 8, 59, 75, 86, 58, 6, 89, 81, 126, 30, 44, 22, 90, 97, 35, 11, 95, 60, 31, 2, 45, 104, 109, 110, 119, 36, 108, 70, 117, 102, 107, 19, 33, 40, 105, 56, 38, 32, 46, 13, 53, 3, 17, 77, 37, 43, 100, 101, 55, 7, 69, 34, 73, 4, 72, 5, 1, 65, 0, 66, 67, 64], [39, 98, 106, 49, 47, 51, 127, 57, 54, 23, 48, 27, 120, 116, 44, 63, 124, 50, 111, 92, 59, 123, 126, 60, 28, 46, 121, 105, 61, 41, 42, 52, 55, 112, 115, 43, 107, 118, 58, 125, 108, 53, 36, 119, 22, 110, 62, 117, 18, 102, 45, 113, 32, 40, 82, 104, 109, 96, 100, 21, 38, 35, 114, 56, 101, 122, 99, 30, 37, 25, 80, 97, 93, 33, 95, 31, 85, 89, 29, 87, 83, 78, 94, 91, 90, 14, 26, 34, 10, 79, 84, 20, 24, 19, 88, 16, 17, 103, 86, 12, 76, 81, 4, 74, 75, 77, 73, 15, 7, 11, 67, 13, 64, 70, 69, 72, 8, 71, 9, 3, 66, 68, 5, 6, 0, 65, 1, 2], [39, 106, 98, 50, 18, 74, 20, 88, 80, 14, 12, 23, 42, 68, 103, 116, 49, 6, 47, 51, 10, 9, 54, 4, 2, 7, 91, 21, 44, 83, 61, 121, 75, 64, 124, 127, 57, 70, 120, 71, 115, 87, 72, 92, 125, 16, 73, 13, 15, 11, 48, 28, 3, 24, 76, 17, 52, 22, 78, 67, 84, 89, 111, 63, 85, 123, 82, 112, 77, 59, 126, 86, 5, 8, 58, 60, 30, 81, 79, 66, 27, 113, 25, 19, 0, 1, 26, 118, 69, 110, 33, 114, 38, 94, 105, 122, 65, 95, 32, 31, 29, 41, 97, 55, 36, 102, 109, 93, 96, 101, 119, 46, 90, 53, 99, 117, 100, 108, 43, 45, 35, 34, 37, 40, 104, 62, 107, 56], [106, 39, 50, 98, 18, 12, 80, 20, 6, 23, 14, 9, 2, 74, 42, 47, 49, 116, 103, 64, 70, 28, 127, 87, 54, 51, 57, 91, 7, 71, 124, 68, 120, 61, 75, 115, 60, 3, 10, 4, 76, 24, 78, 67, 121, 27, 48, 26, 69, 13, 1, 11, 118, 126, 63, 89, 125, 84, 19, 17, 44, 123, 82, 16, 21, 66, 111, 15, 59, 83, 112, 119, 73, 65, 22, 8, 55, 52, 5, 77, 58, 41, 0, 62, 72, 105, 46, 81, 88, 85, 110, 108, 53, 30, 107, 33, 113, 109, 40, 114, 79, 104, 101, 29, 38, 43, 35, 99, 45, 117, 96, 122, 25, 36, 100, 102, 97, 95, 93, 56, 31, 86, 94, 90, 92, 32, 37, 34], [38, 110, 126, 125, 115, 48, 72, 112, 7, 23, 5, 4, 74, 6, 76, 70, 67, 29, 20, 11, 16, 13, 46, 51, 90, 81, 75, 12, 93, 9, 78, 26, 83, 18, 73, 66, 14, 80, 68, 65, 10, 84, 2, 69, 62, 59, 117, 17, 21, 19, 79, 122, 123, 15, 121, 56, 113, 116, 120, 55, 50, 8, 105, 91, 31, 64, 63, 124, 85, 114, 95, 87, 118, 49, 61, 53, 47, 111, 32, 58, 104, 89, 109, 27, 54, 88, 119, 71, 40, 127, 41, 44, 77, 82, 96, 45, 52, 60, 35, 108, 106, 107, 43, 42, 37, 57, 94, 28, 101, 103, 24, 1, 100, 36, 22, 86, 98, 39, 34, 3, 99, 97, 33, 25, 92, 30, 102, 0], [110, 38, 126, 112, 48, 46, 115, 125, 51, 62, 17, 116, 122, 59, 123, 56, 16, 105, 50, 121, 117, 14, 124, 84, 120, 104, 85, 32, 113, 49, 26, 55, 114, 61, 13, 119, 63, 53, 102, 118, 47, 31, 111, 41, 58, 88, 109, 27, 107, 45, 29, 127, 54, 44, 60, 40, 15, 108, 93, 52, 11, 42, 74, 106, 100, 24, 43, 57, 37, 91, 96, 103, 87, 19, 36, 90, 99, 101, 39, 35, 76, 33, 98, 34, 82, 97, 28, 95, 81, 94, 9, 89, 22, 30, 21, 92, 20, 25, 23, 72, 18, 8, 86, 71, 83, 75, 80, 6, 77, 10, 79, 78, 7, 0, 68, 5, 69, 66, 3, 12, 1, 65, 73, 70, 64, 2, 4, 67], [38, 110, 48, 112, 125, 115, 126, 46, 93, 51, 78, 83, 26, 21, 11, 72, 88, 122, 32, 123, 10, 31, 7, 124, 62, 116, 79, 77, 81, 50, 95, 15, 74, 56, 5, 29, 41, 85, 75, 61, 84, 59, 121, 18, 55, 19, 86, 90, 73, 117, 17, 106, 53, 113, 27, 120, 63, 91, 40, 80, 70, 111, 127, 47, 58, 14, 6, 92, 96, 49, 109, 104, 60, 105, 114, 4, 25, 44, 23, 45, 119, 24, 118, 97, 9, 30, 107, 28, 99, 89, 37, 108, 13, 67, 35, 43, 94, 22, 39, 68, 52, 34, 100, 101, 20, 54, 87, 103, 33, 66, 98, 16, 36, 42, 76, 57, 65, 12, 82, 8, 102, 64, 71, 3, 69, 2, 0, 1], [38, 110, 112, 115, 48, 46, 125, 51, 26, 31, 27, 122, 62, 88, 50, 102, 93, 21, 116, 117, 59, 56, 123, 23, 124, 126, 120, 104, 105, 106, 32, 41, 83, 53, 40, 61, 58, 86, 101, 111, 17, 29, 100, 113, 121, 47, 60, 49, 63, 95, 78, 33, 45, 119, 35, 109, 114, 16, 22, 96, 91, 74, 54, 99, 55, 84, 36, 37, 85, 30, 52, 28, 89, 94, 43, 107, 11, 42, 108, 118, 87, 44, 34, 14, 103, 18, 72, 98, 90, 80, 127, 57, 39, 97, 92, 13, 20, 7, 6, 79, 24, 25, 9, 19, 5, 76, 75, 15, 66, 68, 10, 82, 65, 77, 12, 81, 3, 73, 1, 0, 71, 67, 70, 64, 8, 4, 2, 69], [113, 121, 86, 61, 125, 49, 57, 59, 85, 116, 52, 60, 56, 122, 22, 62, 55, 117, 53, 124, 114, 63, 127, 54, 115, 119, 51, 123, 18, 58, 118, 50, 112, 120, 110, 111, 126, 109, 48, 46, 45, 47, 44, 108, 107, 43, 106, 23, 42, 105, 41, 40, 90, 104, 39, 35, 36, 38, 94, 95, 102, 103, 87, 34, 37, 15, 96, 100, 82, 97, 98, 64, 99, 1, 30, 31, 101, 21, 91, 66, 33, 16, 28, 3, 84, 25, 93, 65, 88, 12, 92, 80, 0, 77, 14, 69, 26, 78, 13, 89, 74, 4, 75, 24, 73, 71, 29, 2, 67, 20, 83, 8, 6, 19, 17, 68, 27, 79, 9, 10, 32, 11, 5, 7, 70, 76, 81, 72], [121, 113, 61, 125, 49, 22, 57, 59, 116, 52, 56, 62, 122, 55, 60, 117, 63, 114, 53, 127, 124, 119, 115, 51, 58, 54, 123, 118, 50, 112, 120, 110, 111, 126, 25, 48, 109, 46, 90, 47, 45, 44, 64, 19, 108, 107, 106, 30, 43, 66, 3, 69, 65, 96, 105, 42, 95, 38, 41, 26, 92, 40, 39, 104, 34, 27, 4, 94, 102, 1, 100, 36, 103, 71, 35, 97, 23, 98, 9, 67, 29, 87, 99, 33, 70, 86, 32, 24, 6, 15, 2, 31, 37, 0, 101, 84, 28, 18, 93, 85, 68, 13, 8, 88, 10, 80, 91, 83, 5, 77, 76, 12, 89, 82, 11, 78, 17, 16, 79, 75, 81, 7, 20, 14, 72, 21, 73, 74], [113, 121, 61, 125, 49, 57, 86, 22, 59, 116, 52, 60, 56, 122, 117, 62, 55, 53, 63, 114, 124, 54, 119, 51, 127, 123, 58, 115, 118, 120, 21, 50, 110, 112, 111, 126, 109, 46, 48, 45, 47, 44, 83, 108, 107, 106, 87, 43, 92, 42, 105, 30, 25, 41, 23, 89, 96, 40, 39, 104, 38, 90, 20, 34, 88, 94, 36, 103, 102, 64, 35, 100, 99, 18, 24, 97, 66, 26, 19, 1, 27, 93, 95, 65, 3, 98, 69, 33, 31, 37, 28, 0, 101, 77, 4, 10, 32, 11, 29, 85, 71, 76, 67, 13, 9, 2, 81, 8, 12, 6, 68, 91, 14, 82, 80, 84, 72, 15, 70, 78, 79, 5, 75, 16, 17, 73, 74, 7], [37, 61, 125, 49, 121, 57, 113, 59, 22, 96, 62, 55, 122, 32, 52, 116, 20, 60, 127, 56, 91, 115, 88, 124, 83, 117, 30, 77, 28, 54, 119, 53, 123, 110, 51, 63, 29, 114, 58, 112, 45, 92, 94, 50, 118, 9, 46, 120, 44, 111, 47, 109, 48, 79, 126, 69, 11, 71, 89, 76, 18, 26, 108, 93, 105, 43, 66, 87, 107, 33, 41, 36, 106, 81, 64, 39, 8, 3, 82, 42, 6, 104, 34, 99, 40, 10, 65, 23, 90, 4, 27, 17, 78, 74, 24, 98, 97, 103, 102, 19, 101, 100, 84, 31, 1, 25, 38, 73, 14, 80, 35, 75, 95, 12, 68, 13, 16, 7, 15, 85, 70, 67, 21, 2, 72, 86, 0, 5], [38, 44, 115, 90, 51, 22, 84, 108, 17, 72, 79, 81, 74, 18, 29, 126, 8, 12, 127, 16, 10, 78, 33, 82, 28, 19, 89, 112, 123, 120, 36, 4, 116, 5, 124, 122, 13, 125, 71, 60, 76, 98, 103, 35, 39, 110, 49, 107, 50, 91, 62, 47, 52, 15, 67, 97, 111, 80, 42, 45, 121, 73, 118, 113, 119, 114, 105, 20, 25, 109, 63, 55, 59, 53, 96, 86, 58, 23, 41, 30, 54, 87, 61, 106, 6, 46, 40, 37, 43, 101, 117, 99, 48, 26, 31, 34, 70, 14, 100, 94, 57, 95, 32, 24, 104, 68, 21, 92, 9, 77, 65, 56, 85, 27, 88, 93, 2, 75, 11, 0, 102, 83, 7, 1, 3, 69, 66, 64], [38, 44, 115, 69, 71, 51, 79, 12, 17, 22, 18, 108, 84, 74, 81, 8, 78, 90, 2, 64, 3, 33, 5, 66, 102, 7, 68, 65, 120, 126, 1, 10, 49, 15, 67, 14, 57, 112, 70, 76, 95, 59, 125, 82, 20, 52, 23, 88, 9, 73, 19, 75, 26, 83, 56, 4, 16, 0, 13, 11, 87, 127, 119, 86, 92, 29, 98, 89, 116, 114, 124, 80, 72, 37, 6, 93, 58, 25, 61, 28, 91, 36, 110, 77, 104, 99, 41, 45, 27, 117, 35, 122, 106, 63, 21, 34, 96, 30, 85, 118, 121, 31, 62, 123, 100, 94, 24, 111, 48, 50, 60, 101, 32, 42, 39, 105, 97, 46, 40, 103, 109, 47, 55, 54, 53, 43, 113, 107], [38, 44, 115, 74, 18, 84, 79, 12, 51, 108, 69, 22, 71, 3, 8, 78, 2, 64, 70, 102, 33, 1, 17, 67, 90, 68, 76, 81, 6, 7, 15, 66, 120, 16, 28, 4, 112, 23, 82, 10, 52, 59, 126, 91, 65, 86, 88, 49, 98, 26, 73, 127, 125, 72, 92, 95, 5, 57, 56, 89, 41, 85, 83, 9, 24, 118, 114, 117, 35, 0, 13, 20, 14, 58, 46, 29, 93, 19, 25, 116, 45, 75, 123, 80, 105, 50, 21, 111, 62, 11, 31, 110, 37, 124, 40, 63, 30, 32, 42, 60, 107, 119, 104, 113, 36, 47, 94, 109, 101, 77, 48, 39, 99, 122, 61, 87, 103, 53, 121, 34, 106, 55, 100, 43, 96, 97, 54, 27], [38, 44, 115, 84, 51, 22, 18, 108, 12, 17, 8, 79, 78, 74, 16, 72, 33, 19, 70, 89, 67, 5, 68, 10, 4, 81, 24, 126, 120, 90, 71, 65, 127, 2, 92, 112, 76, 49, 82, 6, 73, 69, 125, 28, 7, 75, 13, 60, 14, 3, 80, 29, 116, 23, 25, 114, 86, 98, 35, 56, 9, 50, 57, 117, 64, 107, 20, 123, 0, 58, 52, 32, 36, 124, 103, 41, 87, 93, 88, 91, 21, 26, 15, 1, 102, 39, 96, 37, 118, 43, 11, 95, 53, 83, 30, 63, 111, 100, 110, 31, 61, 62, 101, 106, 85, 40, 104, 47, 94, 113, 55, 77, 27, 109, 45, 97, 121, 59, 46, 34, 119, 99, 48, 122, 105, 54, 42, 66], [38, 44, 34, 119, 118, 53, 56, 81, 30, 78, 23, 84, 126, 47, 11, 25, 71, 122, 5, 8, 12, 52, 79, 9, 82, 87, 93, 54, 27, 29, 61, 116, 20, 50, 19, 94, 90, 125, 86, 13, 14, 66, 55, 17, 67, 7, 75, 15, 18, 127, 28, 10, 106, 16, 51, 70, 89, 101, 4, 22, 62, 46, 77, 88, 21, 121, 26, 68, 85, 69, 102, 80, 48, 73, 3, 74, 2, 96, 49, 97, 63, 105, 1, 58, 98, 42, 0, 91, 114, 24, 83, 110, 35, 59, 72, 41, 92, 113, 76, 64, 32, 117, 33, 40, 65, 99, 36, 95, 39, 43, 123, 109, 37, 31, 103, 6, 100, 108, 120, 124, 104, 115, 60, 107, 57, 112, 111, 45], [44, 53, 38, 118, 47, 34, 27, 101, 119, 126, 116, 22, 122, 54, 58, 94, 102, 30, 79, 56, 82, 46, 114, 84, 108, 59, 41, 125, 23, 106, 57, 62, 55, 93, 103, 52, 110, 63, 26, 113, 90, 50, 109, 29, 123, 61, 115, 60, 42, 124, 51, 48, 117, 31, 40, 105, 49, 121, 43, 86, 104, 127, 112, 33, 45, 9, 120, 107, 100, 80, 98, 39, 18, 37, 24, 99, 36, 92, 111, 25, 91, 73, 96, 88, 83, 28, 3, 13, 32, 35, 76, 21, 95, 85, 97, 69, 81, 72, 71, 89, 11, 20, 14, 74, 16, 75, 19, 15, 70, 7, 12, 10, 87, 6, 77, 68, 67, 0, 65, 17, 8, 2, 4, 78, 66, 64, 5, 1], [44, 38, 119, 56, 118, 53, 34, 58, 27, 84, 30, 94, 42, 79, 101, 25, 122, 125, 21, 116, 114, 82, 41, 23, 26, 50, 9, 127, 18, 62, 117, 47, 24, 54, 93, 113, 59, 52, 92, 51, 46, 105, 99, 57, 108, 55, 86, 96, 126, 48, 61, 60, 81, 33, 29, 28, 123, 103, 39, 90, 124, 31, 110, 22, 89, 120, 88, 112, 87, 12, 106, 49, 15, 91, 102, 121, 63, 109, 13, 71, 45, 104, 40, 11, 35, 115, 83, 97, 19, 85, 100, 95, 77, 75, 36, 37, 20, 16, 111, 107, 43, 32, 80, 78, 73, 98, 3, 17, 7, 70, 10, 67, 74, 76, 0, 5, 6, 14, 2, 66, 4, 64, 69, 8, 68, 72, 65, 1], [38, 53, 118, 119, 56, 34, 12, 44, 88, 23, 25, 84, 30, 47, 27, 81, 108, 6, 8, 126, 94, 1, 18, 89, 9, 86, 87, 79, 19, 33, 67, 21, 112, 54, 15, 58, 20, 52, 105, 65, 82, 41, 42, 55, 125, 46, 16, 117, 114, 101, 102, 77, 123, 116, 11, 48, 92, 122, 31, 26, 72, 62, 80, 57, 120, 90, 39, 70, 76, 115, 83, 110, 63, 61, 50, 32, 78, 97, 37, 106, 75, 85, 29, 124, 113, 45, 95, 59, 51, 5, 103, 100, 35, 2, 104, 0, 127, 17, 43, 74, 14, 22, 28, 107, 121, 64, 60, 7, 96, 109, 40, 13, 69, 111, 49, 93, 99, 36, 68, 10, 4, 91, 24, 73, 71, 66, 3, 98], [39, 51, 50, 114, 97, 115, 113, 54, 117, 85, 124, 87, 120, 121, 58, 29, 63, 27, 24, 55, 61, 122, 126, 91, 111, 75, 92, 108, 25, 32, 112, 83, 57, 33, 26, 93, 53, 22, 56, 20, 59, 116, 79, 82, 73, 18, 17, 60, 88, 23, 49, 69, 21, 81, 14, 99, 48, 119, 77, 95, 34, 90, 80, 109, 16, 38, 40, 72, 41, 28, 30, 127, 107, 84, 7, 94, 78, 71, 52, 123, 15, 43, 76, 89, 100, 118, 110, 45, 125, 47, 44, 42, 86, 96, 104, 36, 98, 37, 106, 46, 102, 101, 31, 35, 105, 9, 12, 62, 11, 13, 74, 3, 5, 67, 70, 19, 8, 6, 66, 2, 4, 1, 0, 68, 10, 103, 64, 65], [51, 39, 114, 113, 50, 121, 97, 116, 61, 53, 59, 124, 60, 54, 122, 29, 120, 118, 87, 52, 105, 45, 125, 24, 56, 115, 112, 86, 63, 123, 103, 58, 26, 88, 27, 119, 109, 83, 25, 110, 107, 95, 62, 46, 85, 117, 40, 111, 92, 55, 33, 57, 44, 18, 82, 43, 127, 94, 126, 49, 98, 108, 47, 93, 106, 20, 42, 90, 41, 34, 35, 75, 23, 102, 101, 96, 38, 48, 15, 100, 89, 36, 14, 17, 37, 99, 104, 91, 19, 80, 73, 31, 32, 77, 28, 84, 12, 21, 22, 81, 30, 16, 72, 74, 70, 69, 78, 11, 0, 13, 68, 67, 64, 66, 79, 76, 1, 3, 71, 9, 6, 2, 5, 65, 8, 7, 10, 4], [39, 114, 51, 50, 97, 121, 122, 11, 9, 21, 87, 29, 36, 116, 83, 34, 54, 12, 30, 98, 14, 92, 6, 48, 19, 109, 110, 8, 119, 10, 62, 94, 53, 5, 113, 126, 35, 63, 15, 42, 13, 28, 55, 71, 47, 59, 67, 127, 57, 76, 111, 108, 58, 117, 70, 101, 43, 2, 16, 91, 79, 118, 93, 115, 68, 40, 125, 100, 4, 120, 52, 32, 60, 45, 106, 73, 102, 74, 112, 123, 56, 23, 105, 41, 46, 49, 95, 38, 89, 72, 96, 81, 44, 65, 90, 104, 99, 37, 22, 61, 107, 82, 0, 84, 18, 124, 86, 26, 85, 20, 75, 27, 78, 31, 17, 25, 88, 103, 33, 24, 80, 77, 1, 7, 69, 3, 64, 66], [39, 51, 114, 50, 97, 113, 54, 122, 121, 85, 24, 61, 87, 63, 92, 29, 48, 116, 124, 83, 95, 25, 117, 57, 27, 34, 26, 80, 126, 40, 82, 28, 111, 120, 20, 119, 33, 53, 110, 58, 91, 60, 77, 14, 17, 75, 108, 32, 15, 55, 47, 56, 73, 42, 21, 115, 89, 59, 23, 62, 88, 125, 107, 127, 19, 36, 98, 86, 74, 104, 49, 94, 81, 90, 52, 118, 18, 41, 38, 78, 101, 79, 105, 106, 22, 84, 72, 30, 43, 123, 35, 112, 13, 12, 44, 37, 76, 11, 102, 109, 93, 100, 70, 45, 99, 69, 16, 96, 31, 46, 67, 7, 68, 71, 9, 0, 10, 8, 1, 6, 2, 65, 3, 4, 5, 66, 64, 103], [108, 124, 102, 122, 62, 34, 107, 23, 30, 60, 50, 35, 119, 104, 25, 120, 45, 31, 123, 114, 121, 49, 117, 61, 103, 87, 40, 106, 47, 29, 55, 44, 110, 22, 20, 28, 32, 86, 118, 52, 38, 58, 59, 41, 33, 63, 53, 92, 115, 113, 48, 88, 43, 54, 126, 96, 105, 89, 46, 99, 56, 26, 57, 111, 51, 42, 91, 116, 112, 125, 127, 109, 101, 97, 94, 37, 79, 100, 77, 21, 39, 93, 84, 95, 27, 36, 90, 24, 11, 16, 17, 13, 83, 69, 81, 18, 72, 19, 75, 8, 15, 98, 78, 5, 10, 14, 74, 3, 65, 7, 64, 0, 67, 66, 68, 85, 1, 71, 4, 12, 80, 82, 2, 73, 9, 6, 76, 70], [102, 124, 34, 108, 62, 122, 107, 23, 17, 93, 25, 21, 11, 79, 104, 77, 18, 16, 120, 30, 69, 15, 114, 96, 29, 22, 32, 10, 83, 19, 85, 126, 89, 54, 5, 20, 117, 26, 49, 40, 31, 50, 92, 87, 127, 123, 110, 73, 2, 28, 90, 86, 119, 60, 1, 81, 51, 118, 103, 24, 91, 43, 109, 35, 41, 27, 47, 44, 84, 45, 38, 36, 116, 8, 57, 74, 78, 37, 125, 100, 13, 82, 112, 63, 94, 71, 68, 53, 67, 115, 97, 106, 0, 88, 75, 14, 61, 99, 55, 101, 105, 64, 111, 58, 42, 95, 33, 39, 46, 113, 121, 56, 72, 59, 52, 65, 4, 3, 7, 80, 48, 66, 12, 6, 9, 76, 98, 70], [102, 62, 122, 34, 124, 21, 93, 18, 108, 16, 40, 23, 12, 29, 83, 90, 77, 25, 73, 85, 11, 92, 61, 76, 126, 96, 100, 8, 24, 6, 26, 44, 27, 28, 116, 49, 82, 70, 107, 22, 119, 20, 32, 120, 84, 112, 78, 5, 42, 19, 57, 53, 58, 7, 46, 71, 17, 48, 36, 79, 106, 69, 80, 95, 123, 127, 67, 88, 4, 59, 114, 75, 109, 65, 103, 2, 43, 33, 63, 50, 0, 104, 13, 99, 9, 68, 86, 110, 14, 15, 45, 97, 66, 47, 41, 113, 117, 87, 105, 10, 89, 3, 55, 72, 37, 54, 98, 51, 91, 52, 111, 1, 64, 74, 30, 94, 81, 56, 31, 121, 115, 101, 118, 35, 60, 38, 39, 125], [62, 102, 122, 108, 34, 124, 26, 120, 23, 51, 35, 123, 30, 90, 40, 61, 21, 95, 24, 29, 96, 75, 110, 48, 93, 60, 16, 54, 5, 127, 37, 45, 114, 22, 20, 39, 50, 72, 31, 18, 94, 58, 79, 107, 49, 104, 8, 36, 32, 89, 118, 69, 44, 77, 117, 86, 59, 112, 125, 25, 103, 116, 11, 57, 106, 121, 111, 87, 56, 92, 52, 97, 55, 126, 53, 83, 76, 27, 119, 101, 91, 47, 46, 28, 41, 84, 113, 63, 109, 33, 100, 88, 74, 115, 82, 105, 3, 17, 15, 42, 98, 85, 13, 81, 73, 99, 10, 43, 38, 70, 14, 12, 71, 19, 80, 9, 0, 78, 2, 1, 68, 7, 4, 66, 67, 65, 64, 6], [38, 97, 53, 117, 21, 81, 80, 75, 14, 71, 4, 87, 76, 9, 6, 66, 13, 1, 83, 24, 123, 70, 74, 68, 118, 2, 85, 19, 12, 16, 90, 72, 0, 65, 10, 73, 94, 113, 11, 46, 18, 17, 88, 3, 92, 112, 79, 106, 64, 103, 124, 119, 86, 77, 84, 82, 25, 122, 78, 69, 7, 55, 93, 27, 111, 23, 5, 115, 42, 96, 91, 89, 15, 51, 28, 41, 62, 29, 35, 30, 120, 39, 109, 22, 56, 57, 98, 26, 54, 61, 99, 121, 50, 47, 67, 48, 36, 100, 31, 59, 52, 107, 45, 125, 20, 108, 40, 58, 104, 60, 110, 101, 44, 95, 116, 127, 114, 49, 105, 43, 32, 34, 37, 63, 126, 8, 33, 102], [38, 53, 97, 117, 87, 80, 21, 75, 76, 81, 6, 14, 70, 74, 4, 71, 9, 83, 68, 123, 24, 13, 118, 2, 1, 113, 19, 94, 66, 46, 72, 73, 55, 11, 103, 92, 119, 106, 17, 85, 16, 89, 93, 23, 12, 45, 0, 88, 96, 122, 79, 77, 112, 10, 78, 29, 20, 42, 51, 91, 111, 82, 69, 64, 84, 62, 52, 41, 86, 124, 35, 15, 28, 33, 8, 65, 7, 25, 40, 60, 18, 26, 3, 36, 105, 104, 50, 90, 120, 27, 48, 121, 107, 101, 109, 30, 116, 47, 56, 5, 108, 54, 110, 63, 99, 95, 57, 125, 100, 115, 39, 59, 98, 127, 58, 67, 114, 61, 34, 31, 22, 49, 44, 37, 43, 32, 126, 102], [38, 53, 97, 117, 21, 87, 81, 76, 75, 80, 123, 14, 4, 66, 70, 6, 118, 9, 74, 24, 19, 90, 12, 7, 69, 16, 46, 92, 71, 51, 65, 82, 112, 78, 64, 103, 56, 84, 30, 106, 23, 122, 25, 110, 15, 11, 98, 119, 121, 94, 17, 113, 47, 3, 85, 83, 1, 26, 31, 125, 35, 41, 39, 111, 88, 57, 91, 115, 73, 60, 49, 42, 93, 86, 52, 37, 18, 54, 27, 50, 29, 107, 34, 124, 58, 101, 22, 44, 59, 13, 10, 20, 126, 77, 61, 96, 36, 109, 120, 105, 79, 89, 40, 116, 63, 55, 99, 95, 28, 43, 127, 108, 104, 62, 48, 2, 45, 114, 100, 32, 67, 0, 5, 72, 8, 102, 68, 33], [38, 53, 97, 117, 87, 80, 6, 76, 21, 81, 14, 75, 74, 9, 1, 19, 3, 113, 68, 8, 71, 123, 7, 5, 82, 118, 2, 72, 85, 65, 46, 18, 70, 24, 23, 122, 91, 55, 41, 83, 35, 119, 78, 79, 10, 88, 67, 105, 12, 40, 112, 33, 106, 94, 111, 28, 15, 16, 66, 69, 84, 109, 27, 64, 45, 107, 36, 50, 86, 124, 61, 13, 90, 121, 103, 93, 26, 56, 98, 120, 77, 47, 30, 60, 115, 114, 116, 96, 25, 92, 59, 95, 42, 51, 73, 20, 104, 48, 58, 89, 0, 126, 29, 52, 17, 108, 57, 44, 110, 49, 125, 37, 22, 99, 34, 43, 62, 31, 54, 63, 127, 11, 101, 32, 39, 100, 102, 4]], "model.layers.5.self_attn.k_proj": [[42, 103, 50, 34, 12, 9, 80, 23, 20, 14, 18, 114, 7, 51, 66, 111, 6, 4, 74, 52, 64, 113, 79, 49, 126, 54, 8, 61, 127, 57, 67, 112, 116, 92, 27, 65, 108, 73, 0, 88, 121, 58, 1, 21, 124, 5, 77, 120, 59, 28, 125, 26, 86, 123, 69, 46, 62, 98, 106, 38, 81, 60, 94, 105, 91, 99, 55, 47, 44, 85, 93, 22, 45, 71, 24, 70, 41, 102, 122, 109, 37, 48, 117, 119, 90, 40, 63, 118, 11, 97, 53, 43, 25, 17, 95, 115, 104, 89, 56, 32, 3, 96, 110, 30, 101, 19, 100, 35, 107, 75, 16, 72, 29, 33, 83, 13, 31, 36, 10, 84, 15, 76, 82, 87, 78, 2, 68, 39], [46, 102, 115, 112, 125, 93, 126, 23, 26, 83, 9, 79, 80, 21, 76, 81, 20, 6, 68, 24, 3, 12, 122, 15, 13, 71, 124, 50, 77, 8, 41, 66, 116, 5, 106, 91, 123, 117, 113, 31, 16, 56, 55, 62, 58, 82, 69, 53, 78, 105, 63, 104, 95, 127, 74, 108, 111, 44, 45, 109, 118, 0, 47, 51, 96, 121, 107, 52, 40, 29, 59, 48, 60, 119, 114, 75, 33, 1, 120, 72, 49, 61, 43, 37, 19, 11, 54, 42, 36, 17, 10, 22, 18, 85, 39, 35, 110, 101, 34, 97, 99, 57, 32, 100, 88, 103, 98, 65, 84, 30, 94, 14, 25, 86, 27, 89, 28, 7, 92, 73, 87, 90, 2, 70, 38, 64, 67, 4], [61, 121, 125, 101, 57, 49, 113, 22, 32, 127, 62, 115, 55, 56, 122, 117, 17, 123, 52, 84, 51, 60, 58, 99, 59, 124, 63, 119, 54, 120, 116, 93, 114, 126, 50, 112, 88, 118, 53, 30, 48, 91, 110, 109, 47, 111, 46, 45, 43, 44, 92, 24, 95, 14, 12, 102, 35, 108, 79, 42, 82, 18, 107, 39, 10, 106, 83, 40, 41, 77, 100, 38, 37, 94, 21, 20, 97, 105, 16, 87, 104, 103, 11, 85, 36, 78, 72, 89, 80, 25, 98, 9, 33, 96, 29, 34, 31, 75, 86, 15, 90, 71, 13, 28, 74, 70, 19, 4, 23, 8, 26, 81, 66, 64, 27, 69, 3, 65, 73, 5, 6, 7, 1, 0, 76, 67, 68, 2], [108, 115, 102, 12, 22, 78, 84, 74, 18, 17, 79, 44, 69, 64, 71, 8, 68, 66, 90, 3, 75, 97, 120, 73, 2, 65, 48, 116, 1, 6, 16, 126, 24, 49, 70, 29, 67, 125, 113, 51, 93, 57, 59, 114, 19, 89, 88, 9, 21, 117, 11, 85, 124, 14, 23, 92, 58, 7, 91, 83, 0, 80, 25, 81, 62, 107, 39, 118, 30, 63, 77, 27, 55, 87, 95, 53, 10, 36, 127, 56, 99, 26, 32, 34, 31, 4, 119, 112, 41, 105, 28, 103, 43, 94, 60, 45, 52, 61, 13, 121, 104, 106, 40, 37, 100, 42, 54, 96, 123, 122, 82, 98, 110, 47, 35, 46, 101, 111, 109, 50, 33, 76, 86, 72, 38, 20, 15, 5], [102, 119, 118, 56, 53, 98, 94, 27, 108, 23, 84, 81, 25, 79, 64, 12, 9, 66, 117, 58, 5, 78, 57, 54, 37, 109, 125, 42, 67, 113, 8, 82, 114, 104, 55, 52, 105, 11, 39, 126, 24, 115, 85, 71, 62, 124, 123, 46, 110, 45, 50, 61, 19, 122, 48, 63, 121, 38, 1, 107, 6, 13, 22, 127, 112, 120, 41, 80, 4, 51, 43, 49, 60, 35, 59, 47, 74, 33, 111, 40, 18, 69, 65, 99, 90, 103, 31, 10, 116, 3, 29, 36, 97, 83, 21, 44, 68, 77, 100, 106, 96, 92, 70, 86, 32, 75, 93, 0, 30, 91, 7, 95, 88, 87, 16, 28, 14, 26, 89, 76, 72, 2, 101, 17, 20, 15, 34, 73], [103, 114, 51, 33, 31, 87, 85, 79, 49, 115, 83, 77, 27, 46, 17, 24, 7, 122, 76, 78, 25, 18, 108, 29, 4, 10, 84, 98, 121, 75, 57, 6, 3, 8, 74, 86, 30, 62, 82, 55, 48, 42, 65, 113, 69, 54, 119, 111, 5, 110, 99, 109, 127, 64, 101, 45, 89, 104, 63, 9, 112, 126, 56, 91, 123, 2, 38, 93, 58, 47, 92, 100, 52, 44, 13, 66, 125, 43, 118, 72, 94, 73, 41, 102, 40, 61, 35, 106, 23, 50, 96, 105, 116, 117, 26, 59, 12, 32, 81, 120, 36, 53, 67, 15, 22, 97, 60, 37, 107, 21, 95, 34, 88, 124, 28, 16, 80, 19, 14, 20, 90, 71, 70, 11, 39, 1, 0, 68], [38, 122, 62, 29, 98, 124, 44, 16, 18, 104, 78, 21, 22, 20, 25, 79, 73, 12, 85, 19, 11, 71, 23, 24, 6, 32, 108, 90, 77, 66, 14, 81, 82, 43, 64, 92, 17, 126, 26, 46, 68, 58, 30, 48, 40, 8, 27, 9, 117, 116, 49, 65, 57, 109, 114, 119, 102, 76, 34, 61, 60, 7, 74, 51, 103, 113, 41, 110, 91, 121, 55, 93, 101, 56, 39, 63, 10, 36, 33, 50, 59, 97, 28, 4, 105, 112, 45, 15, 99, 106, 13, 100, 125, 95, 87, 84, 75, 42, 52, 54, 96, 31, 86, 3, 53, 83, 118, 88, 120, 111, 5, 72, 115, 107, 94, 47, 123, 37, 127, 80, 35, 0, 67, 89, 1, 69, 2, 70], [117, 53, 102, 33, 14, 21, 80, 87, 81, 74, 76, 0, 75, 9, 68, 6, 8, 71, 2, 83, 65, 3, 5, 55, 24, 78, 7, 82, 1, 10, 73, 67, 42, 77, 38, 118, 123, 19, 113, 27, 84, 111, 66, 72, 110, 11, 69, 62, 91, 70, 105, 86, 94, 93, 90, 35, 26, 25, 89, 103, 112, 88, 79, 92, 15, 115, 46, 124, 57, 20, 119, 122, 48, 120, 54, 31, 59, 41, 106, 56, 17, 44, 60, 4, 30, 29, 116, 61, 37, 18, 43, 22, 114, 85, 49, 98, 127, 104, 126, 40, 100, 47, 125, 45, 51, 32, 109, 12, 28, 95, 52, 96, 58, 34, 99, 107, 36, 121, 16, 63, 108, 50, 13, 101, 39, 23, 64, 97]], "model.layers.5.self_attn.qk_proj": [[53, 115, 117, 108, 51, 125, 122, 124, 114, 62, 38, 61, 119, 112, 118, 44, 102, 50, 121, 42, 46, 56, 113, 126, 49, 23, 87, 57, 103, 76, 84, 85, 78, 82, 21, 17, 12, 81, 20, 18, 86, 80, 93, 14, 16, 110, 55, 91, 116, 10, 29, 22, 127, 74, 98, 97, 120, 15, 111, 79, 54, 26, 48, 106, 59, 63, 30, 52, 60, 9, 7, 73, 11, 47, 39, 34, 123, 8, 90, 71, 104, 75, 89, 45, 83, 24, 72, 27, 19, 5, 33, 28, 88, 109, 6, 58, 107, 70, 25, 13, 69, 77, 105, 68, 94, 0, 4, 32, 40, 35, 3, 43, 67, 37, 66, 31, 41, 64, 101, 2, 95, 1, 99, 92, 36, 65, 96, 100], [53, 115, 117, 108, 51, 38, 122, 125, 124, 62, 61, 114, 112, 118, 44, 102, 119, 46, 121, 50, 42, 113, 49, 56, 126, 23, 87, 103, 57, 82, 76, 48, 84, 86, 21, 85, 93, 81, 20, 12, 116, 78, 29, 18, 80, 17, 97, 127, 14, 91, 120, 16, 22, 55, 60, 15, 98, 110, 59, 106, 10, 30, 54, 47, 26, 63, 74, 34, 39, 58, 111, 79, 90, 123, 9, 8, 73, 6, 52, 75, 33, 11, 89, 7, 107, 71, 27, 109, 24, 28, 45, 83, 104, 5, 19, 88, 105, 25, 40, 64, 94, 72, 4, 3, 68, 69, 0, 37, 13, 35, 77, 95, 41, 2, 66, 32, 96, 70, 67, 1, 92, 43, 65, 99, 31, 101, 100, 36], [53, 115, 117, 108, 51, 124, 122, 125, 62, 38, 114, 61, 44, 112, 121, 102, 118, 46, 119, 42, 50, 56, 113, 49, 23, 103, 87, 126, 82, 57, 76, 21, 84, 12, 85, 93, 20, 86, 78, 81, 14, 116, 29, 18, 17, 16, 80, 22, 97, 98, 48, 8, 91, 120, 60, 15, 10, 110, 79, 74, 106, 26, 30, 39, 127, 55, 54, 6, 90, 9, 63, 71, 11, 7, 34, 47, 59, 75, 52, 109, 73, 123, 27, 89, 88, 19, 33, 28, 5, 58, 24, 107, 111, 104, 0, 83, 77, 64, 69, 40, 68, 25, 45, 67, 4, 13, 41, 94, 66, 105, 2, 3, 37, 35, 72, 32, 1, 43, 92, 95, 70, 65, 101, 31, 99, 96, 36, 100], [53, 115, 117, 108, 51, 114, 38, 122, 124, 125, 62, 61, 118, 102, 44, 46, 121, 112, 50, 119, 42, 113, 56, 49, 126, 103, 23, 87, 21, 85, 84, 86, 82, 17, 76, 57, 14, 20, 18, 93, 12, 80, 81, 78, 16, 22, 29, 98, 79, 91, 116, 97, 127, 55, 8, 30, 26, 54, 39, 48, 34, 106, 58, 63, 10, 59, 15, 110, 120, 90, 52, 123, 74, 47, 9, 6, 60, 75, 89, 71, 83, 73, 11, 7, 64, 24, 69, 28, 33, 88, 27, 104, 111, 19, 0, 77, 109, 25, 13, 107, 45, 5, 67, 4, 72, 3, 105, 94, 40, 66, 65, 37, 68, 1, 92, 2, 41, 96, 70, 32, 95, 31, 101, 35, 43, 99, 100, 36], [53, 115, 117, 108, 51, 122, 114, 124, 62, 125, 61, 38, 118, 121, 46, 102, 44, 112, 119, 50, 42, 113, 56, 49, 23, 103, 126, 87, 57, 82, 76, 12, 78, 85, 21, 20, 17, 86, 16, 84, 93, 81, 14, 18, 120, 98, 8, 80, 29, 22, 34, 52, 10, 110, 79, 106, 116, 15, 58, 39, 91, 123, 74, 63, 127, 6, 55, 48, 59, 7, 9, 71, 30, 54, 11, 75, 90, 111, 97, 64, 47, 109, 73, 33, 60, 26, 0, 5, 89, 83, 69, 104, 19, 27, 68, 88, 4, 3, 45, 77, 13, 67, 28, 24, 1, 40, 2, 107, 72, 25, 94, 41, 70, 66, 37, 35, 65, 32, 43, 31, 105, 99, 92, 95, 101, 36, 100, 96], [53, 115, 117, 51, 122, 124, 108, 125, 114, 61, 62, 38, 118, 119, 112, 121, 50, 102, 44, 46, 42, 56, 113, 49, 126, 87, 103, 57, 23, 76, 12, 18, 21, 78, 20, 84, 86, 85, 82, 17, 81, 93, 16, 14, 80, 55, 48, 110, 22, 120, 98, 29, 91, 74, 10, 15, 34, 47, 8, 54, 30, 59, 52, 127, 79, 123, 58, 97, 106, 116, 60, 63, 7, 9, 39, 11, 73, 71, 90, 75, 26, 111, 104, 6, 83, 33, 89, 5, 88, 68, 109, 24, 4, 45, 27, 19, 69, 72, 28, 94, 2, 70, 13, 25, 0, 64, 77, 40, 107, 3, 67, 105, 43, 41, 31, 1, 95, 32, 35, 37, 66, 101, 65, 96, 92, 100, 99, 36], [53, 115, 117, 51, 124, 125, 108, 122, 114, 38, 62, 119, 118, 61, 44, 112, 102, 121, 46, 50, 49, 42, 56, 113, 126, 87, 21, 23, 57, 103, 85, 76, 17, 82, 12, 78, 14, 110, 18, 20, 84, 81, 16, 93, 86, 80, 106, 54, 120, 48, 22, 29, 55, 60, 98, 79, 97, 127, 59, 11, 10, 111, 91, 63, 74, 39, 15, 30, 58, 34, 47, 116, 45, 8, 52, 9, 90, 123, 7, 26, 73, 88, 71, 75, 33, 19, 83, 89, 104, 27, 24, 70, 109, 40, 28, 94, 25, 41, 69, 43, 68, 6, 32, 5, 107, 72, 77, 4, 13, 3, 37, 35, 67, 105, 2, 95, 64, 31, 92, 0, 66, 101, 96, 65, 1, 36, 99, 100], [53, 115, 117, 51, 108, 125, 122, 114, 124, 62, 38, 61, 118, 119, 112, 102, 44, 46, 121, 42, 50, 49, 56, 113, 23, 87, 21, 126, 57, 103, 12, 20, 84, 85, 80, 86, 76, 17, 82, 14, 78, 18, 81, 93, 16, 54, 29, 127, 52, 79, 97, 106, 91, 15, 111, 120, 22, 110, 58, 116, 98, 55, 10, 48, 74, 39, 60, 70, 30, 34, 47, 9, 11, 63, 88, 26, 104, 45, 27, 33, 90, 75, 19, 7, 123, 8, 71, 73, 83, 24, 109, 89, 59, 28, 43, 25, 107, 5, 37, 41, 40, 72, 69, 77, 32, 13, 94, 66, 0, 35, 4, 67, 64, 3, 105, 68, 2, 92, 31, 65, 101, 1, 6, 99, 95, 36, 100, 96], [53, 115, 117, 108, 51, 122, 38, 125, 124, 62, 114, 61, 44, 118, 112, 102, 50, 119, 121, 46, 42, 56, 126, 113, 49, 23, 87, 103, 21, 57, 20, 17, 84, 85, 14, 18, 81, 76, 12, 86, 82, 93, 80, 16, 29, 22, 127, 78, 54, 120, 110, 98, 116, 55, 97, 106, 91, 48, 10, 34, 30, 111, 79, 15, 26, 70, 52, 47, 74, 39, 90, 123, 58, 59, 33, 27, 11, 75, 24, 104, 25, 88, 71, 7, 89, 72, 9, 83, 73, 8, 19, 60, 5, 45, 94, 109, 105, 63, 32, 28, 35, 13, 0, 69, 4, 64, 41, 40, 66, 77, 67, 68, 37, 107, 43, 95, 3, 31, 92, 101, 99, 2, 1, 96, 6, 65, 36, 100], [53, 115, 117, 108, 51, 122, 38, 114, 125, 62, 124, 61, 44, 118, 112, 119, 121, 102, 46, 42, 50, 113, 56, 49, 87, 23, 103, 85, 82, 17, 76, 12, 126, 21, 57, 86, 93, 18, 84, 20, 81, 14, 78, 80, 16, 29, 127, 22, 48, 15, 97, 55, 98, 79, 91, 110, 59, 52, 74, 54, 26, 30, 10, 106, 39, 60, 90, 120, 34, 47, 111, 109, 70, 9, 58, 11, 72, 89, 75, 71, 33, 116, 63, 7, 73, 83, 27, 88, 24, 104, 123, 19, 77, 94, 69, 45, 25, 105, 4, 68, 28, 40, 13, 8, 5, 43, 37, 3, 66, 0, 107, 32, 2, 35, 6, 92, 95, 67, 31, 96, 36, 64, 41, 1, 101, 100, 99, 65], [53, 115, 117, 108, 122, 125, 51, 114, 62, 38, 124, 61, 102, 118, 44, 121, 50, 112, 119, 42, 46, 56, 113, 49, 126, 23, 87, 103, 17, 18, 12, 76, 85, 78, 57, 93, 20, 21, 81, 84, 82, 14, 86, 74, 80, 110, 16, 98, 29, 22, 79, 15, 72, 91, 48, 97, 10, 54, 116, 58, 52, 30, 120, 34, 111, 47, 55, 106, 39, 127, 90, 109, 9, 11, 73, 26, 7, 75, 70, 33, 60, 63, 27, 71, 83, 59, 69, 88, 89, 5, 107, 104, 64, 4, 6, 68, 77, 0, 3, 13, 19, 123, 8, 28, 2, 65, 45, 24, 31, 94, 66, 105, 37, 25, 67, 1, 40, 95, 41, 32, 101, 43, 35, 96, 92, 100, 99, 36], [53, 115, 117, 51, 108, 122, 125, 114, 62, 124, 38, 61, 121, 102, 118, 50, 44, 112, 46, 119, 42, 56, 113, 49, 126, 87, 23, 103, 57, 76, 84, 21, 17, 85, 18, 20, 12, 14, 78, 86, 82, 93, 58, 48, 80, 81, 72, 16, 29, 98, 22, 91, 74, 34, 116, 79, 54, 30, 10, 55, 52, 120, 106, 63, 59, 15, 127, 97, 9, 111, 110, 11, 39, 109, 47, 90, 7, 89, 75, 104, 6, 71, 107, 26, 123, 68, 27, 73, 45, 70, 5, 0, 33, 19, 28, 88, 69, 60, 24, 83, 8, 64, 67, 4, 37, 77, 3, 13, 25, 66, 94, 105, 40, 1, 2, 43, 65, 35, 41, 32, 31, 95, 92, 101, 99, 100, 36, 96], [53, 115, 117, 108, 51, 122, 62, 125, 114, 38, 124, 61, 112, 44, 119, 118, 102, 46, 50, 121, 42, 49, 113, 56, 126, 87, 23, 103, 57, 84, 76, 21, 20, 85, 81, 18, 17, 86, 93, 82, 12, 16, 78, 14, 80, 58, 48, 29, 22, 59, 15, 52, 127, 54, 91, 63, 97, 106, 110, 98, 120, 111, 26, 34, 10, 39, 9, 55, 116, 79, 74, 30, 72, 47, 90, 7, 109, 6, 19, 11, 33, 75, 27, 71, 24, 89, 73, 88, 83, 104, 123, 28, 107, 25, 60, 40, 68, 5, 45, 94, 77, 4, 43, 37, 13, 32, 41, 69, 3, 35, 105, 67, 101, 66, 95, 70, 2, 0, 31, 64, 92, 36, 8, 65, 99, 100, 96, 1], [53, 115, 117, 108, 51, 122, 125, 38, 114, 62, 124, 61, 112, 118, 44, 119, 102, 121, 46, 42, 50, 49, 56, 113, 126, 23, 103, 87, 21, 76, 78, 20, 18, 12, 57, 85, 84, 17, 48, 82, 81, 80, 86, 55, 93, 110, 14, 22, 16, 54, 6, 127, 72, 79, 29, 91, 10, 15, 58, 26, 98, 74, 30, 75, 116, 9, 39, 106, 34, 123, 120, 7, 97, 90, 71, 59, 111, 52, 11, 89, 88, 63, 73, 19, 47, 60, 33, 24, 104, 83, 68, 27, 40, 28, 5, 25, 77, 69, 0, 4, 94, 105, 2, 64, 37, 66, 8, 43, 109, 45, 100, 107, 13, 32, 3, 92, 41, 101, 35, 67, 95, 36, 65, 1, 31, 96, 99, 70], [53, 115, 117, 108, 51, 122, 125, 62, 38, 124, 61, 114, 102, 118, 44, 121, 112, 119, 46, 42, 50, 56, 126, 49, 87, 23, 113, 103, 84, 57, 81, 48, 85, 18, 76, 20, 21, 12, 78, 93, 82, 17, 110, 80, 14, 22, 98, 16, 86, 10, 29, 91, 79, 6, 72, 34, 63, 15, 74, 120, 39, 55, 97, 106, 54, 30, 116, 90, 127, 71, 104, 26, 58, 7, 52, 60, 89, 27, 59, 109, 11, 75, 9, 73, 33, 123, 19, 64, 88, 83, 24, 47, 111, 37, 5, 8, 69, 68, 107, 28, 94, 25, 40, 13, 66, 77, 4, 45, 2, 0, 105, 3, 32, 43, 67, 1, 92, 41, 70, 65, 35, 96, 101, 100, 31, 95, 99, 36], [53, 115, 117, 108, 51, 125, 122, 62, 124, 114, 38, 61, 44, 118, 112, 102, 121, 119, 50, 46, 42, 113, 56, 126, 49, 87, 23, 103, 57, 20, 14, 82, 85, 21, 12, 84, 76, 17, 18, 81, 86, 63, 80, 48, 78, 93, 22, 91, 16, 98, 110, 58, 34, 15, 74, 10, 29, 106, 60, 116, 127, 79, 54, 120, 6, 55, 59, 52, 39, 30, 72, 97, 11, 104, 90, 8, 7, 9, 89, 33, 47, 73, 26, 19, 111, 109, 107, 37, 75, 71, 69, 5, 83, 28, 123, 24, 43, 88, 25, 27, 64, 67, 13, 0, 68, 4, 70, 3, 77, 40, 94, 35, 45, 105, 92, 41, 65, 31, 100, 32, 95, 2, 101, 1, 66, 36, 96, 99], [53, 115, 117, 108, 51, 125, 124, 114, 122, 38, 62, 61, 112, 119, 118, 44, 121, 46, 102, 42, 50, 49, 113, 56, 126, 23, 87, 103, 84, 85, 81, 20, 14, 21, 12, 76, 18, 82, 80, 93, 57, 86, 78, 17, 15, 22, 16, 97, 10, 116, 52, 110, 29, 98, 48, 59, 54, 55, 60, 127, 91, 106, 58, 79, 30, 9, 120, 74, 7, 90, 8, 63, 26, 33, 73, 11, 71, 34, 39, 19, 24, 111, 75, 83, 89, 109, 72, 27, 28, 88, 104, 107, 47, 105, 6, 68, 123, 5, 25, 13, 40, 77, 70, 69, 4, 37, 32, 94, 43, 45, 92, 0, 64, 2, 31, 66, 35, 101, 41, 3, 67, 95, 65, 96, 100, 99, 36, 1], [53, 115, 117, 108, 51, 122, 62, 125, 38, 124, 61, 114, 44, 118, 112, 121, 102, 119, 50, 42, 46, 23, 49, 56, 113, 126, 87, 103, 84, 21, 12, 20, 82, 81, 57, 76, 14, 78, 85, 93, 18, 86, 17, 80, 16, 48, 110, 29, 97, 22, 91, 15, 106, 98, 10, 26, 8, 79, 116, 120, 74, 59, 55, 39, 30, 90, 58, 127, 54, 34, 9, 60, 47, 71, 11, 75, 52, 70, 33, 89, 7, 27, 73, 19, 28, 123, 111, 88, 5, 63, 24, 105, 83, 25, 43, 45, 104, 77, 72, 94, 69, 107, 13, 4, 40, 64, 109, 68, 32, 6, 67, 92, 66, 37, 0, 2, 95, 96, 41, 3, 101, 31, 99, 65, 35, 36, 100, 1], [53, 115, 117, 108, 51, 125, 38, 122, 62, 114, 124, 61, 118, 44, 102, 112, 121, 119, 50, 42, 46, 113, 56, 49, 23, 126, 103, 87, 20, 21, 12, 84, 14, 76, 85, 93, 82, 18, 16, 78, 110, 80, 17, 81, 22, 57, 48, 86, 29, 98, 39, 8, 55, 58, 52, 74, 34, 79, 106, 10, 70, 63, 97, 54, 91, 15, 30, 26, 90, 127, 116, 120, 109, 9, 89, 59, 60, 11, 75, 47, 71, 104, 7, 73, 33, 111, 27, 28, 19, 83, 24, 69, 0, 88, 64, 5, 25, 68, 123, 40, 94, 67, 66, 3, 65, 32, 45, 77, 13, 37, 43, 4, 41, 31, 2, 92, 107, 105, 6, 72, 95, 100, 1, 36, 35, 99, 96, 101], [53, 115, 117, 51, 108, 125, 122, 114, 38, 124, 62, 61, 121, 118, 44, 119, 112, 102, 46, 50, 113, 42, 49, 56, 87, 23, 126, 21, 103, 84, 57, 12, 76, 18, 82, 16, 20, 81, 86, 14, 78, 17, 85, 80, 93, 22, 48, 8, 29, 91, 10, 79, 98, 74, 15, 70, 47, 39, 55, 54, 109, 110, 63, 97, 9, 11, 59, 30, 116, 58, 127, 106, 34, 90, 60, 7, 73, 71, 111, 120, 89, 75, 52, 26, 83, 107, 33, 27, 104, 24, 4, 5, 19, 69, 77, 28, 68, 88, 123, 25, 94, 45, 43, 40, 13, 0, 105, 64, 72, 2, 37, 32, 67, 66, 3, 35, 99, 41, 92, 31, 65, 1, 95, 6, 36, 100, 96, 101], [53, 115, 117, 51, 108, 114, 125, 62, 122, 38, 61, 124, 44, 102, 119, 121, 118, 50, 112, 46, 42, 113, 126, 56, 49, 87, 103, 23, 82, 84, 21, 76, 78, 12, 86, 57, 93, 85, 20, 17, 18, 14, 81, 16, 110, 80, 8, 91, 97, 120, 22, 29, 54, 116, 15, 98, 30, 79, 63, 60, 48, 74, 10, 106, 34, 39, 58, 55, 70, 90, 127, 47, 52, 9, 11, 71, 26, 73, 107, 7, 111, 28, 75, 33, 19, 24, 59, 89, 104, 69, 27, 88, 83, 123, 109, 25, 37, 5, 77, 105, 68, 43, 40, 94, 4, 13, 64, 3, 72, 0, 35, 45, 31, 32, 66, 65, 67, 92, 95, 101, 6, 41, 2, 99, 96, 36, 1, 100], [53, 115, 117, 51, 108, 114, 125, 62, 122, 38, 124, 61, 44, 118, 121, 102, 112, 119, 46, 42, 50, 113, 49, 126, 87, 56, 103, 57, 21, 23, 76, 82, 84, 93, 20, 12, 86, 80, 14, 78, 85, 18, 81, 17, 120, 110, 16, 22, 29, 60, 55, 79, 48, 98, 97, 91, 63, 8, 54, 106, 15, 127, 39, 59, 116, 30, 74, 34, 10, 71, 26, 90, 52, 33, 47, 11, 9, 58, 75, 104, 19, 7, 27, 111, 89, 123, 83, 24, 45, 25, 70, 107, 73, 69, 5, 88, 13, 77, 109, 28, 94, 64, 40, 0, 68, 37, 105, 72, 6, 3, 66, 43, 67, 4, 101, 92, 65, 1, 31, 2, 32, 41, 35, 99, 96, 95, 100, 36], [53, 115, 117, 108, 51, 114, 125, 62, 38, 124, 122, 61, 112, 121, 44, 118, 46, 102, 119, 42, 50, 113, 56, 49, 87, 126, 103, 23, 57, 21, 93, 76, 20, 18, 84, 82, 86, 14, 85, 81, 17, 12, 22, 16, 80, 78, 39, 106, 98, 97, 127, 29, 15, 120, 34, 79, 91, 48, 116, 10, 30, 55, 54, 111, 74, 110, 8, 52, 58, 59, 26, 9, 11, 71, 7, 90, 33, 60, 109, 104, 63, 6, 27, 73, 25, 75, 47, 83, 89, 19, 28, 5, 24, 88, 69, 77, 107, 13, 70, 45, 72, 123, 68, 94, 64, 37, 40, 0, 32, 43, 67, 4, 35, 3, 31, 65, 101, 1, 66, 99, 2, 105, 92, 41, 95, 36, 100, 96], [53, 115, 117, 51, 108, 122, 125, 62, 38, 114, 124, 61, 102, 118, 112, 44, 119, 50, 121, 46, 42, 56, 113, 87, 49, 126, 103, 23, 21, 76, 86, 82, 18, 85, 84, 20, 93, 17, 57, 12, 14, 59, 81, 78, 16, 48, 80, 29, 91, 22, 116, 97, 110, 98, 127, 15, 30, 10, 34, 26, 106, 111, 74, 55, 6, 60, 11, 39, 120, 52, 90, 54, 79, 63, 47, 75, 58, 8, 73, 72, 7, 89, 109, 9, 24, 33, 27, 19, 71, 83, 123, 104, 25, 68, 28, 88, 40, 13, 107, 69, 5, 94, 3, 37, 43, 64, 77, 105, 4, 101, 41, 95, 45, 0, 31, 70, 35, 67, 96, 32, 100, 66, 92, 36, 2, 99, 65, 1], [53, 115, 117, 51, 108, 122, 125, 114, 62, 124, 38, 118, 61, 44, 119, 112, 102, 121, 42, 46, 50, 113, 49, 56, 126, 87, 23, 103, 21, 57, 76, 20, 85, 18, 14, 12, 17, 86, 81, 84, 82, 78, 80, 93, 59, 54, 16, 110, 22, 116, 98, 91, 120, 6, 29, 15, 63, 10, 48, 97, 106, 39, 55, 60, 79, 74, 30, 75, 127, 111, 52, 71, 58, 9, 11, 90, 26, 34, 33, 19, 72, 73, 47, 109, 27, 7, 104, 89, 88, 28, 83, 8, 24, 37, 107, 69, 25, 68, 123, 5, 43, 13, 94, 105, 40, 77, 45, 32, 67, 41, 4, 70, 2, 3, 64, 0, 35, 31, 66, 101, 95, 65, 92, 1, 100, 99, 96, 36], [53, 115, 117, 51, 108, 122, 125, 38, 114, 124, 62, 61, 118, 44, 121, 119, 102, 112, 46, 42, 50, 113, 49, 126, 56, 87, 57, 103, 23, 21, 76, 20, 82, 81, 78, 84, 18, 85, 12, 86, 14, 80, 93, 17, 54, 16, 120, 97, 29, 48, 91, 59, 98, 22, 15, 116, 10, 110, 52, 79, 127, 106, 90, 26, 74, 30, 47, 72, 60, 11, 6, 39, 55, 34, 9, 63, 58, 75, 73, 33, 111, 104, 19, 24, 7, 27, 71, 88, 89, 5, 28, 37, 123, 13, 69, 25, 107, 83, 94, 41, 40, 45, 105, 0, 77, 109, 43, 4, 2, 8, 101, 70, 32, 68, 64, 31, 3, 35, 66, 96, 65, 67, 1, 95, 92, 99, 100, 36], [53, 115, 117, 108, 51, 122, 38, 125, 62, 124, 61, 114, 44, 118, 112, 102, 50, 121, 119, 42, 46, 126, 56, 113, 23, 49, 103, 87, 21, 57, 17, 85, 20, 18, 84, 12, 82, 93, 76, 81, 86, 80, 78, 14, 22, 110, 29, 97, 55, 48, 16, 98, 106, 10, 91, 72, 26, 127, 47, 79, 30, 116, 39, 15, 59, 34, 74, 58, 90, 120, 111, 54, 123, 73, 33, 60, 52, 7, 89, 75, 9, 11, 19, 63, 104, 6, 27, 25, 69, 83, 24, 88, 71, 94, 28, 109, 5, 45, 77, 4, 105, 64, 32, 68, 40, 35, 70, 13, 67, 8, 107, 31, 37, 0, 3, 66, 41, 101, 96, 95, 92, 2, 65, 43, 99, 1, 36, 100], [53, 115, 117, 108, 51, 122, 125, 124, 62, 38, 114, 61, 118, 121, 44, 119, 112, 102, 46, 42, 50, 126, 49, 56, 113, 87, 23, 103, 12, 21, 81, 14, 76, 18, 110, 84, 78, 57, 82, 17, 20, 86, 85, 93, 16, 22, 80, 72, 10, 98, 127, 48, 74, 58, 106, 29, 120, 116, 63, 91, 7, 15, 47, 9, 26, 30, 79, 75, 71, 55, 97, 34, 11, 39, 59, 73, 90, 54, 89, 52, 109, 19, 33, 70, 123, 45, 111, 24, 6, 69, 5, 60, 28, 27, 83, 68, 77, 4, 0, 64, 104, 40, 88, 3, 67, 13, 107, 66, 94, 2, 8, 25, 65, 37, 105, 35, 32, 43, 41, 1, 31, 92, 96, 95, 101, 99, 100, 36], [53, 115, 117, 51, 108, 122, 125, 114, 124, 62, 38, 61, 118, 44, 121, 119, 102, 112, 50, 42, 46, 56, 113, 126, 49, 87, 23, 103, 12, 57, 21, 18, 84, 78, 76, 20, 86, 82, 14, 110, 85, 93, 81, 48, 16, 80, 17, 22, 72, 58, 98, 60, 106, 91, 116, 54, 10, 29, 79, 55, 127, 74, 120, 97, 34, 30, 39, 15, 63, 59, 70, 52, 33, 47, 90, 73, 75, 26, 9, 11, 104, 111, 71, 7, 89, 109, 123, 5, 19, 37, 88, 69, 24, 28, 45, 27, 35, 68, 13, 83, 40, 4, 25, 0, 64, 105, 8, 6, 2, 77, 43, 94, 66, 107, 3, 67, 1, 95, 65, 32, 31, 100, 96, 41, 99, 36, 101, 92], [53, 115, 117, 51, 108, 125, 122, 114, 124, 62, 38, 61, 119, 118, 44, 112, 50, 121, 102, 46, 126, 42, 113, 49, 56, 87, 23, 57, 103, 85, 12, 21, 76, 20, 82, 18, 78, 14, 17, 86, 93, 84, 16, 59, 80, 81, 116, 54, 98, 22, 29, 55, 58, 47, 72, 70, 91, 120, 15, 110, 10, 74, 63, 79, 48, 106, 127, 34, 9, 7, 30, 97, 52, 39, 75, 11, 60, 71, 111, 123, 73, 90, 26, 33, 69, 89, 24, 28, 5, 109, 19, 4, 40, 67, 27, 83, 64, 88, 37, 104, 68, 45, 25, 94, 0, 3, 8, 105, 77, 13, 107, 66, 2, 43, 1, 32, 65, 35, 41, 6, 31, 100, 95, 101, 96, 92, 99, 36], [53, 115, 117, 108, 51, 125, 122, 124, 38, 62, 114, 119, 61, 44, 118, 112, 121, 46, 102, 50, 42, 49, 113, 126, 56, 87, 103, 23, 21, 57, 84, 85, 12, 18, 82, 76, 20, 93, 81, 78, 17, 14, 59, 86, 80, 22, 16, 98, 116, 106, 74, 79, 47, 54, 55, 127, 29, 110, 48, 120, 39, 15, 70, 63, 72, 60, 30, 26, 97, 11, 91, 10, 73, 9, 7, 34, 75, 52, 58, 19, 111, 71, 90, 33, 45, 69, 4, 109, 68, 123, 104, 27, 89, 28, 83, 24, 37, 8, 40, 13, 88, 25, 5, 66, 0, 107, 105, 67, 3, 64, 94, 77, 6, 2, 43, 41, 32, 31, 1, 35, 65, 95, 92, 101, 96, 36, 100, 99], [53, 115, 117, 108, 51, 122, 125, 124, 38, 114, 62, 61, 119, 44, 118, 50, 112, 121, 102, 46, 42, 49, 113, 56, 126, 87, 23, 103, 21, 76, 57, 54, 14, 85, 20, 81, 78, 18, 84, 82, 127, 86, 93, 12, 17, 80, 55, 60, 59, 16, 22, 34, 63, 116, 74, 110, 29, 10, 26, 120, 48, 91, 98, 70, 97, 79, 30, 15, 123, 73, 47, 11, 111, 39, 9, 75, 33, 52, 72, 106, 109, 7, 71, 24, 19, 89, 90, 88, 8, 105, 5, 104, 58, 27, 43, 69, 83, 28, 45, 37, 4, 25, 68, 77, 64, 6, 94, 40, 107, 13, 0, 32, 3, 66, 41, 101, 1, 92, 67, 2, 95, 31, 96, 100, 35, 65, 36, 99]], "model.layers.6.self_attn.q_proj": [[108, 55, 36, 96, 91, 23, 44, 84, 77, 81, 15, 86, 10, 16, 7, 114, 57, 75, 78, 32, 30, 28, 111, 89, 6, 25, 87, 29, 66, 70, 11, 69, 68, 22, 101, 13, 74, 72, 85, 19, 95, 17, 92, 26, 90, 51, 88, 79, 121, 14, 40, 18, 3, 9, 20, 61, 27, 76, 126, 4, 1, 54, 123, 80, 71, 107, 97, 103, 12, 64, 94, 118, 99, 102, 83, 33, 21, 39, 93, 73, 49, 58, 60, 24, 82, 125, 41, 31, 34, 98, 112, 35, 110, 48, 37, 104, 56, 5, 115, 109, 53, 38, 120, 50, 124, 2, 106, 59, 113, 47, 105, 127, 8, 46, 52, 122, 63, 42, 65, 43, 119, 116, 117, 62, 45, 67, 100, 0], [108, 55, 36, 23, 44, 84, 15, 75, 91, 81, 77, 10, 8, 96, 29, 114, 3, 72, 70, 74, 6, 64, 111, 11, 67, 86, 69, 30, 57, 2, 79, 89, 83, 27, 4, 26, 25, 65, 94, 28, 73, 18, 103, 0, 14, 5, 82, 17, 9, 87, 13, 31, 68, 85, 12, 93, 21, 1, 88, 20, 16, 80, 112, 78, 51, 7, 22, 95, 39, 32, 24, 19, 66, 97, 76, 90, 71, 33, 118, 121, 56, 123, 35, 92, 54, 98, 34, 107, 102, 48, 40, 52, 127, 41, 99, 37, 61, 59, 119, 120, 110, 49, 115, 125, 101, 104, 122, 60, 58, 100, 38, 126, 46, 109, 47, 43, 105, 124, 63, 53, 62, 106, 45, 113, 116, 42, 50, 117], [55, 108, 112, 39, 127, 36, 111, 61, 44, 54, 51, 124, 113, 118, 60, 121, 50, 59, 122, 63, 56, 58, 119, 115, 101, 116, 123, 120, 97, 62, 53, 42, 92, 26, 117, 19, 47, 43, 48, 40, 49, 94, 98, 52, 32, 90, 46, 109, 106, 110, 105, 96, 83, 45, 114, 104, 107, 41, 38, 35, 95, 88, 125, 102, 99, 33, 86, 30, 34, 23, 37, 103, 126, 29, 91, 100, 57, 78, 25, 89, 16, 93, 27, 28, 24, 31, 73, 21, 85, 81, 22, 18, 7, 14, 82, 15, 84, 75, 87, 65, 2, 77, 9, 12, 66, 11, 17, 10, 6, 4, 68, 67, 80, 76, 0, 64, 71, 5, 70, 8, 72, 79, 13, 1, 20, 69, 74, 3], [108, 55, 39, 44, 36, 94, 103, 126, 111, 90, 102, 114, 96, 57, 34, 121, 127, 29, 32, 83, 51, 54, 123, 91, 86, 60, 58, 63, 0, 19, 61, 59, 2, 110, 115, 49, 23, 125, 112, 52, 113, 98, 104, 56, 50, 38, 37, 107, 31, 42, 124, 47, 101, 106, 40, 78, 82, 117, 97, 45, 120, 109, 16, 89, 122, 92, 105, 119, 116, 33, 48, 53, 15, 10, 77, 118, 62, 46, 35, 84, 28, 43, 81, 70, 7, 41, 67, 88, 4, 68, 99, 65, 26, 95, 93, 100, 30, 5, 71, 64, 85, 25, 76, 21, 11, 75, 24, 72, 8, 12, 18, 66, 9, 73, 27, 80, 1, 22, 14, 6, 87, 69, 17, 79, 13, 3, 74, 20], [42, 102, 46, 32, 103, 89, 56, 20, 106, 76, 22, 17, 14, 19, 71, 127, 10, 110, 28, 93, 8, 120, 26, 51, 80, 60, 116, 29, 66, 50, 11, 27, 96, 38, 82, 25, 47, 123, 69, 36, 109, 63, 3, 59, 34, 118, 13, 113, 105, 79, 6, 88, 54, 33, 21, 41, 67, 52, 83, 30, 75, 81, 119, 15, 35, 77, 12, 7, 91, 86, 2, 114, 58, 112, 78, 61, 94, 84, 90, 5, 23, 16, 125, 85, 98, 101, 18, 62, 87, 70, 95, 97, 92, 74, 122, 49, 111, 4, 126, 24, 31, 9, 64, 99, 73, 72, 55, 43, 115, 57, 107, 108, 121, 44, 45, 37, 40, 124, 68, 0, 117, 53, 100, 104, 48, 39, 65, 1], [42, 41, 46, 103, 23, 32, 106, 102, 80, 92, 56, 8, 20, 26, 24, 89, 123, 29, 17, 110, 105, 96, 127, 19, 22, 116, 120, 25, 93, 28, 51, 52, 14, 109, 121, 60, 76, 27, 83, 54, 10, 69, 107, 47, 44, 49, 119, 86, 100, 61, 111, 35, 34, 58, 97, 5, 85, 91, 112, 122, 50, 63, 18, 104, 31, 39, 43, 59, 4, 9, 95, 66, 65, 117, 115, 15, 72, 124, 90, 68, 3, 126, 11, 62, 118, 48, 55, 101, 30, 98, 33, 21, 73, 108, 87, 114, 57, 53, 36, 45, 38, 94, 125, 99, 79, 37, 2, 40, 1, 77, 113, 82, 88, 71, 16, 78, 75, 81, 0, 70, 84, 6, 64, 13, 67, 12, 74, 7], [102, 46, 42, 56, 51, 116, 127, 110, 26, 123, 41, 47, 60, 50, 32, 59, 122, 120, 103, 63, 22, 111, 61, 90, 52, 19, 109, 33, 112, 27, 121, 115, 55, 49, 96, 34, 54, 118, 93, 119, 117, 43, 113, 53, 89, 57, 62, 48, 124, 16, 58, 39, 107, 126, 125, 114, 44, 29, 105, 31, 108, 101, 97, 45, 85, 79, 104, 36, 35, 98, 99, 38, 40, 24, 11, 25, 100, 37, 95, 94, 92, 106, 87, 28, 88, 20, 23, 91, 30, 83, 13, 80, 72, 86, 18, 15, 84, 17, 21, 14, 75, 73, 12, 8, 82, 69, 77, 9, 66, 5, 76, 78, 2, 81, 6, 70, 4, 74, 1, 68, 71, 10, 7, 65, 3, 0, 64, 67], [42, 46, 102, 32, 20, 17, 89, 76, 103, 22, 106, 10, 14, 71, 96, 29, 110, 69, 127, 56, 26, 51, 25, 66, 38, 11, 120, 93, 19, 5, 60, 24, 116, 79, 41, 61, 63, 119, 81, 78, 6, 109, 8, 118, 87, 82, 12, 58, 13, 75, 83, 2, 80, 23, 3, 86, 55, 15, 52, 30, 97, 54, 27, 113, 88, 59, 21, 16, 47, 28, 84, 77, 68, 85, 91, 90, 122, 35, 125, 0, 18, 72, 105, 111, 115, 50, 95, 74, 101, 70, 114, 7, 65, 112, 92, 73, 126, 99, 94, 67, 36, 43, 34, 49, 33, 98, 31, 107, 4, 37, 64, 1, 9, 121, 48, 100, 45, 123, 108, 104, 62, 44, 40, 117, 124, 39, 53, 57], [109, 103, 34, 45, 4, 20, 74, 71, 1, 18, 9, 14, 12, 0, 80, 88, 3, 66, 112, 70, 68, 79, 72, 39, 75, 67, 85, 10, 21, 7, 19, 5, 69, 13, 124, 77, 6, 16, 65, 23, 84, 11, 64, 76, 60, 8, 24, 73, 15, 2, 98, 87, 51, 22, 48, 82, 78, 83, 52, 81, 25, 86, 17, 108, 59, 57, 27, 63, 49, 55, 113, 89, 91, 121, 126, 123, 96, 110, 120, 92, 61, 114, 111, 30, 62, 90, 94, 119, 97, 28, 127, 54, 29, 41, 115, 95, 125, 93, 116, 44, 31, 58, 33, 38, 42, 26, 105, 106, 46, 107, 32, 40, 99, 122, 36, 35, 43, 56, 104, 50, 101, 100, 102, 117, 37, 118, 47, 53], [103, 109, 34, 113, 45, 88, 84, 24, 16, 76, 92, 11, 124, 8, 15, 82, 83, 73, 10, 60, 22, 127, 112, 7, 78, 18, 119, 63, 48, 57, 51, 52, 49, 62, 19, 123, 91, 114, 89, 56, 6, 115, 108, 53, 90, 120, 104, 94, 28, 9, 95, 110, 86, 26, 29, 111, 93, 97, 35, 126, 59, 122, 30, 54, 121, 117, 27, 50, 58, 96, 125, 118, 100, 31, 61, 101, 116, 32, 55, 20, 105, 33, 44, 46, 43, 42, 37, 38, 40, 99, 36, 41, 106, 25, 12, 107, 102, 47, 87, 77, 21, 13, 23, 69, 79, 75, 81, 17, 74, 72, 14, 85, 68, 98, 80, 71, 70, 66, 4, 5, 39, 67, 2, 65, 3, 1, 0, 64], [109, 103, 34, 45, 20, 74, 22, 18, 19, 4, 12, 14, 39, 67, 71, 79, 10, 72, 80, 68, 73, 1, 5, 3, 9, 13, 112, 88, 7, 66, 2, 77, 75, 0, 23, 85, 64, 70, 76, 21, 16, 15, 27, 69, 60, 92, 82, 8, 78, 65, 84, 11, 83, 81, 48, 6, 120, 123, 24, 124, 52, 63, 58, 91, 113, 51, 86, 55, 87, 17, 114, 41, 57, 25, 98, 89, 119, 90, 31, 49, 59, 126, 97, 127, 108, 94, 42, 29, 62, 28, 30, 56, 115, 96, 36, 61, 53, 105, 33, 106, 95, 32, 40, 46, 125, 93, 111, 122, 110, 47, 99, 118, 26, 107, 121, 102, 38, 100, 35, 104, 101, 54, 37, 50, 117, 44, 43, 116], [109, 103, 34, 45, 22, 80, 14, 20, 4, 18, 24, 74, 70, 75, 88, 19, 72, 112, 13, 12, 124, 89, 68, 10, 9, 60, 71, 63, 51, 73, 15, 66, 39, 76, 2, 8, 48, 52, 82, 78, 113, 77, 16, 83, 84, 49, 87, 6, 62, 127, 23, 57, 81, 11, 121, 5, 59, 79, 1, 17, 91, 0, 64, 25, 85, 92, 86, 94, 106, 7, 97, 21, 123, 27, 31, 90, 38, 28, 61, 110, 67, 69, 119, 36, 104, 126, 114, 99, 96, 3, 101, 111, 108, 95, 32, 41, 116, 40, 102, 55, 26, 120, 118, 98, 56, 50, 54, 100, 53, 30, 58, 35, 47, 93, 29, 115, 33, 125, 44, 107, 37, 117, 43, 46, 122, 65, 105, 42], [38, 124, 49, 30, 56, 86, 113, 15, 82, 84, 81, 26, 76, 77, 8, 5, 74, 71, 66, 122, 94, 67, 24, 11, 25, 1, 60, 22, 102, 72, 32, 12, 39, 97, 2, 120, 75, 21, 9, 17, 64, 10, 79, 14, 16, 89, 78, 91, 20, 6, 23, 80, 83, 87, 29, 4, 51, 73, 13, 68, 90, 27, 70, 112, 69, 33, 92, 93, 59, 85, 18, 96, 99, 117, 125, 31, 7, 114, 0, 19, 3, 107, 28, 105, 88, 118, 111, 110, 109, 44, 63, 104, 50, 108, 123, 95, 106, 52, 55, 36, 126, 101, 34, 65, 58, 116, 119, 47, 103, 62, 40, 45, 100, 42, 37, 121, 57, 54, 61, 48, 35, 41, 127, 115, 98, 43, 46, 53], [38, 49, 56, 124, 113, 30, 7, 84, 86, 82, 3, 81, 77, 15, 10, 13, 65, 89, 5, 76, 1, 67, 69, 8, 2, 64, 68, 94, 74, 73, 26, 72, 91, 122, 120, 66, 0, 27, 22, 97, 20, 79, 6, 18, 12, 9, 11, 14, 25, 87, 17, 23, 32, 24, 29, 51, 90, 75, 80, 60, 16, 99, 92, 78, 102, 83, 71, 59, 4, 88, 19, 39, 70, 33, 21, 31, 28, 103, 43, 114, 45, 125, 85, 112, 98, 93, 46, 111, 107, 118, 40, 95, 58, 37, 101, 126, 109, 104, 108, 44, 55, 105, 63, 41, 57, 61, 127, 96, 42, 115, 53, 116, 35, 54, 48, 100, 36, 47, 121, 62, 50, 117, 106, 119, 110, 34, 52, 123], [38, 49, 56, 124, 30, 86, 94, 89, 113, 81, 84, 91, 102, 122, 39, 15, 25, 82, 18, 51, 22, 26, 23, 76, 59, 97, 32, 120, 17, 33, 87, 105, 125, 29, 114, 99, 126, 72, 60, 103, 90, 62, 24, 31, 116, 77, 58, 20, 109, 118, 110, 52, 79, 45, 127, 112, 121, 63, 41, 117, 104, 96, 13, 55, 57, 50, 43, 44, 119, 111, 12, 115, 42, 48, 93, 95, 54, 101, 78, 40, 123, 75, 108, 37, 27, 53, 47, 46, 35, 100, 92, 61, 106, 36, 34, 107, 83, 28, 98, 21, 10, 19, 88, 9, 85, 80, 69, 14, 16, 73, 8, 7, 74, 11, 6, 68, 70, 3, 5, 71, 66, 4, 2, 65, 67, 1, 64, 0], [38, 49, 30, 124, 56, 113, 84, 86, 15, 77, 82, 74, 81, 5, 73, 76, 89, 8, 67, 71, 1, 20, 22, 72, 10, 122, 78, 66, 14, 64, 17, 9, 75, 79, 26, 94, 120, 16, 87, 11, 2, 7, 12, 6, 65, 99, 13, 91, 80, 69, 25, 27, 4, 3, 24, 39, 90, 97, 18, 32, 70, 93, 51, 85, 23, 92, 68, 29, 19, 21, 118, 31, 83, 60, 28, 88, 0, 114, 107, 33, 45, 46, 42, 43, 95, 103, 44, 117, 98, 125, 101, 100, 109, 63, 104, 62, 126, 54, 58, 111, 112, 40, 115, 57, 37, 59, 116, 50, 96, 55, 61, 52, 53, 34, 105, 41, 48, 35, 36, 119, 106, 121, 47, 110, 127, 123, 108, 102], [40, 98, 116, 32, 20, 78, 9, 81, 11, 89, 86, 7, 68, 69, 16, 47, 104, 4, 3, 53, 2, 49, 73, 51, 13, 117, 111, 126, 58, 121, 64, 8, 46, 56, 0, 67, 60, 94, 52, 88, 10, 127, 34, 42, 90, 71, 62, 72, 80, 74, 84, 113, 92, 50, 45, 79, 70, 63, 77, 75, 5, 27, 120, 6, 57, 44, 65, 1, 87, 14, 19, 106, 54, 17, 41, 101, 24, 125, 100, 21, 114, 99, 29, 122, 85, 105, 108, 33, 66, 15, 43, 124, 28, 119, 23, 115, 30, 12, 91, 18, 110, 25, 38, 39, 26, 22, 107, 118, 96, 61, 59, 103, 36, 83, 35, 97, 112, 123, 48, 37, 109, 102, 93, 95, 82, 31, 55, 76], [40, 98, 116, 32, 86, 20, 94, 88, 89, 28, 81, 62, 50, 121, 58, 78, 117, 52, 11, 47, 115, 45, 21, 104, 46, 126, 53, 111, 49, 87, 13, 30, 101, 41, 51, 96, 42, 16, 22, 92, 119, 90, 12, 122, 113, 24, 31, 91, 63, 26, 18, 29, 44, 37, 23, 109, 83, 93, 95, 107, 85, 38, 36, 125, 120, 110, 99, 103, 76, 79, 43, 114, 105, 102, 108, 8, 59, 25, 84, 124, 19, 9, 15, 7, 112, 54, 48, 60, 123, 82, 100, 127, 56, 27, 97, 106, 118, 35, 55, 80, 61, 57, 33, 17, 39, 14, 74, 34, 77, 75, 69, 10, 72, 2, 71, 3, 6, 68, 70, 73, 5, 67, 0, 66, 64, 4, 65, 1], [40, 98, 32, 86, 104, 116, 81, 20, 13, 47, 2, 66, 7, 8, 78, 49, 58, 51, 11, 9, 121, 60, 94, 5, 111, 45, 53, 52, 62, 68, 73, 46, 4, 16, 122, 92, 126, 70, 88, 89, 50, 0, 69, 56, 115, 34, 113, 10, 87, 64, 117, 125, 79, 3, 103, 82, 18, 28, 119, 77, 127, 63, 37, 57, 1, 108, 100, 27, 72, 25, 43, 114, 106, 44, 75, 120, 71, 22, 84, 65, 17, 30, 23, 42, 93, 14, 41, 19, 59, 54, 35, 48, 21, 31, 101, 76, 110, 12, 118, 29, 99, 90, 112, 80, 6, 85, 109, 15, 61, 124, 83, 36, 26, 91, 102, 55, 107, 24, 97, 39, 123, 95, 33, 38, 105, 74, 67, 96], [40, 98, 116, 86, 81, 32, 89, 78, 20, 88, 11, 7, 104, 47, 117, 111, 13, 49, 16, 28, 53, 74, 10, 51, 9, 121, 94, 58, 52, 126, 62, 46, 21, 45, 75, 60, 69, 25, 92, 96, 17, 8, 115, 79, 30, 50, 63, 77, 113, 90, 73, 71, 80, 56, 14, 67, 127, 95, 38, 2, 23, 42, 120, 122, 87, 72, 84, 99, 18, 44, 41, 29, 101, 105, 114, 125, 3, 22, 61, 19, 83, 91, 100, 6, 5, 54, 43, 97, 24, 27, 37, 119, 112, 108, 118, 15, 68, 85, 70, 102, 57, 107, 59, 34, 124, 55, 33, 12, 48, 110, 76, 66, 106, 26, 123, 103, 109, 31, 39, 93, 35, 65, 36, 82, 0, 1, 4, 64], [119, 38, 118, 42, 33, 25, 84, 31, 88, 52, 94, 103, 114, 99, 57, 55, 102, 28, 83, 78, 96, 54, 50, 62, 22, 106, 60, 45, 87, 93, 112, 29, 23, 53, 97, 27, 71, 61, 34, 74, 32, 85, 109, 111, 17, 24, 120, 46, 122, 110, 41, 80, 104, 21, 40, 18, 116, 125, 35, 121, 39, 105, 10, 123, 79, 56, 36, 92, 98, 47, 113, 107, 89, 127, 115, 126, 26, 20, 101, 16, 30, 43, 12, 49, 48, 81, 58, 91, 117, 37, 14, 13, 90, 63, 82, 100, 73, 11, 4, 7, 75, 44, 59, 51, 68, 124, 15, 108, 77, 95, 19, 72, 86, 1, 76, 8, 5, 6, 3, 0, 70, 69, 9, 65, 2, 67, 66, 64], [119, 118, 38, 33, 103, 42, 25, 94, 90, 54, 88, 109, 84, 31, 112, 120, 52, 102, 62, 111, 22, 45, 93, 53, 50, 57, 55, 83, 125, 47, 114, 87, 110, 106, 122, 74, 35, 18, 60, 101, 48, 58, 116, 85, 127, 61, 23, 78, 28, 59, 91, 100, 124, 121, 46, 63, 99, 49, 79, 126, 56, 123, 51, 113, 108, 30, 115, 105, 107, 117, 29, 43, 98, 26, 41, 39, 96, 0, 44, 24, 71, 34, 97, 40, 104, 27, 72, 89, 8, 11, 21, 75, 16, 68, 82, 12, 5, 37, 13, 69, 36, 64, 1, 4, 32, 95, 80, 67, 7, 2, 92, 65, 17, 14, 66, 73, 9, 3, 15, 70, 77, 10, 20, 81, 76, 6, 86, 19], [118, 38, 119, 42, 33, 25, 31, 109, 83, 55, 22, 94, 114, 84, 78, 90, 88, 106, 99, 52, 53, 54, 12, 74, 103, 120, 48, 111, 121, 102, 125, 17, 110, 50, 93, 62, 87, 107, 127, 34, 112, 60, 43, 97, 89, 115, 113, 122, 61, 57, 105, 58, 47, 91, 116, 73, 104, 35, 63, 46, 123, 30, 96, 28, 29, 72, 41, 59, 124, 126, 23, 44, 70, 37, 56, 117, 49, 101, 100, 27, 40, 85, 51, 79, 71, 95, 7, 45, 108, 80, 92, 68, 19, 39, 98, 81, 86, 26, 24, 75, 10, 36, 32, 67, 15, 76, 16, 21, 2, 18, 14, 11, 77, 82, 3, 66, 8, 1, 6, 20, 13, 0, 69, 4, 64, 5, 65, 9], [38, 118, 119, 33, 25, 83, 31, 79, 84, 89, 94, 12, 81, 78, 90, 88, 28, 85, 73, 22, 29, 54, 42, 24, 19, 112, 45, 26, 87, 93, 70, 17, 6, 102, 97, 48, 9, 55, 27, 116, 106, 35, 59, 127, 121, 114, 74, 110, 32, 125, 2, 103, 82, 20, 124, 52, 98, 96, 34, 21, 71, 50, 15, 80, 67, 18, 40, 16, 95, 109, 92, 0, 75, 91, 76, 62, 117, 72, 8, 39, 120, 69, 77, 65, 43, 30, 108, 36, 100, 99, 5, 58, 115, 57, 101, 14, 13, 23, 51, 60, 3, 61, 64, 107, 37, 86, 113, 104, 4, 66, 11, 126, 68, 41, 47, 122, 46, 111, 53, 1, 10, 123, 105, 49, 63, 7, 56, 44], [39, 57, 58, 52, 56, 116, 117, 55, 118, 51, 37, 59, 120, 86, 97, 31, 50, 126, 28, 98, 110, 99, 53, 54, 63, 112, 40, 122, 41, 20, 29, 47, 43, 92, 111, 109, 127, 124, 38, 125, 114, 95, 115, 108, 49, 123, 119, 105, 88, 89, 19, 61, 106, 62, 30, 60, 107, 113, 45, 34, 44, 104, 100, 121, 48, 36, 42, 46, 102, 33, 25, 32, 23, 6, 81, 101, 35, 80, 93, 10, 91, 24, 73, 96, 18, 87, 15, 78, 69, 27, 13, 14, 94, 22, 26, 90, 74, 66, 84, 67, 76, 71, 16, 12, 7, 68, 103, 64, 8, 21, 1, 2, 17, 72, 75, 5, 85, 83, 82, 11, 77, 0, 9, 3, 65, 79, 4, 70], [56, 39, 58, 52, 19, 59, 120, 34, 28, 90, 26, 31, 95, 96, 55, 37, 79, 53, 12, 21, 97, 88, 98, 86, 50, 116, 127, 118, 57, 24, 92, 8, 99, 25, 122, 49, 123, 111, 89, 14, 121, 18, 83, 27, 108, 29, 74, 63, 62, 51, 110, 44, 43, 87, 105, 109, 115, 91, 104, 38, 70, 126, 114, 1, 42, 10, 80, 41, 119, 68, 46, 33, 102, 72, 35, 60, 32, 45, 124, 125, 15, 93, 100, 106, 4, 48, 107, 85, 103, 112, 94, 47, 6, 78, 76, 23, 113, 81, 61, 75, 40, 101, 54, 11, 117, 0, 84, 16, 82, 36, 9, 2, 30, 13, 17, 22, 67, 71, 20, 64, 3, 69, 65, 73, 66, 77, 5, 7], [56, 39, 52, 117, 58, 57, 116, 55, 37, 118, 51, 53, 86, 54, 120, 10, 31, 73, 127, 126, 110, 111, 63, 97, 123, 122, 6, 119, 38, 76, 99, 121, 16, 29, 49, 43, 109, 13, 105, 112, 80, 42, 124, 125, 44, 78, 83, 106, 61, 7, 108, 93, 89, 28, 115, 50, 72, 48, 40, 62, 107, 45, 60, 46, 114, 77, 47, 113, 68, 88, 81, 11, 98, 41, 66, 104, 87, 82, 15, 67, 102, 69, 92, 36, 79, 30, 101, 34, 27, 59, 19, 96, 75, 22, 100, 33, 95, 20, 35, 1, 103, 71, 17, 32, 5, 84, 24, 94, 64, 26, 23, 85, 14, 4, 91, 8, 25, 0, 65, 9, 2, 12, 74, 21, 90, 70, 18, 3], [39, 52, 56, 34, 117, 57, 58, 51, 88, 116, 120, 50, 118, 95, 18, 98, 80, 20, 86, 55, 122, 21, 92, 30, 22, 90, 28, 53, 114, 97, 13, 74, 115, 19, 29, 32, 109, 60, 126, 62, 77, 61, 125, 99, 26, 89, 27, 25, 121, 37, 68, 11, 78, 49, 82, 14, 8, 54, 23, 48, 79, 24, 31, 71, 96, 112, 40, 105, 123, 127, 35, 46, 36, 94, 33, 124, 63, 12, 108, 119, 100, 110, 45, 10, 43, 44, 111, 47, 9, 107, 93, 85, 42, 16, 41, 38, 106, 15, 59, 113, 104, 102, 6, 101, 75, 91, 70, 81, 83, 2, 69, 84, 17, 1, 87, 76, 73, 0, 4, 3, 67, 5, 66, 72, 103, 7, 65, 64], [57, 48, 50, 38, 58, 54, 118, 56, 61, 100, 53, 63, 124, 115, 41, 116, 117, 119, 52, 102, 59, 84, 120, 46, 51, 121, 47, 126, 122, 125, 62, 113, 114, 111, 127, 97, 20, 44, 123, 49, 109, 60, 55, 45, 110, 112, 108, 43, 103, 106, 40, 107, 74, 42, 89, 105, 30, 27, 34, 31, 7, 33, 13, 101, 39, 24, 95, 78, 104, 32, 36, 98, 35, 21, 37, 88, 25, 99, 92, 80, 77, 87, 91, 96, 29, 23, 85, 93, 16, 71, 9, 69, 94, 28, 10, 66, 12, 90, 18, 73, 4, 26, 2, 5, 6, 65, 64, 3, 14, 19, 68, 72, 79, 83, 11, 70, 17, 0, 75, 82, 86, 1, 76, 15, 67, 22, 81, 8], [50, 38, 57, 48, 97, 112, 114, 7, 54, 115, 20, 88, 111, 77, 124, 90, 81, 100, 29, 95, 18, 79, 11, 28, 4, 27, 58, 24, 53, 68, 35, 23, 52, 59, 107, 94, 86, 119, 1, 101, 25, 63, 10, 56, 120, 99, 61, 42, 72, 104, 41, 121, 16, 103, 40, 125, 26, 43, 118, 117, 39, 108, 32, 123, 109, 110, 51, 37, 8, 46, 102, 116, 47, 105, 87, 106, 14, 93, 44, 6, 45, 85, 71, 49, 62, 126, 113, 55, 3, 76, 83, 65, 96, 75, 60, 34, 127, 89, 91, 30, 21, 122, 36, 31, 98, 78, 15, 84, 73, 12, 82, 17, 66, 5, 19, 80, 2, 74, 33, 0, 64, 92, 22, 9, 69, 13, 67, 70], [38, 57, 50, 48, 97, 23, 79, 72, 81, 76, 100, 18, 114, 28, 112, 6, 10, 3, 68, 14, 21, 65, 8, 70, 86, 77, 11, 82, 80, 87, 0, 67, 17, 54, 12, 75, 90, 88, 24, 56, 15, 2, 20, 95, 59, 73, 78, 5, 83, 9, 13, 85, 53, 84, 16, 29, 69, 58, 93, 102, 19, 92, 115, 91, 26, 25, 74, 27, 52, 66, 22, 117, 71, 94, 30, 121, 111, 61, 89, 32, 107, 7, 64, 124, 101, 36, 1, 55, 31, 35, 96, 63, 99, 4, 98, 119, 46, 34, 41, 60, 118, 125, 43, 110, 120, 106, 37, 126, 62, 123, 104, 40, 47, 42, 39, 103, 44, 127, 51, 116, 49, 105, 33, 108, 122, 109, 45, 113], [48, 50, 57, 38, 122, 115, 126, 51, 46, 54, 118, 56, 47, 62, 58, 109, 114, 124, 116, 49, 125, 55, 52, 112, 63, 113, 119, 61, 127, 60, 110, 59, 53, 45, 106, 43, 117, 44, 123, 121, 120, 108, 41, 107, 105, 40, 42, 103, 34, 102, 111, 101, 39, 89, 104, 20, 97, 37, 95, 32, 29, 90, 86, 100, 36, 31, 30, 35, 93, 99, 88, 77, 92, 96, 94, 98, 16, 22, 83, 33, 81, 27, 25, 91, 23, 80, 84, 14, 28, 24, 19, 21, 73, 26, 17, 82, 13, 18, 79, 78, 87, 85, 76, 11, 66, 10, 6, 12, 74, 9, 15, 0, 75, 2, 71, 69, 7, 1, 72, 8, 64, 68, 3, 4, 70, 5, 65, 67]], "model.layers.6.self_attn.k_proj": [[44, 55, 100, 86, 32, 84, 81, 57, 15, 77, 91, 108, 23, 12, 10, 64, 5, 75, 67, 8, 7, 65, 94, 114, 90, 16, 111, 29, 71, 76, 50, 82, 89, 103, 68, 72, 2, 83, 25, 39, 123, 78, 51, 66, 93, 121, 63, 120, 6, 58, 46, 48, 118, 125, 19, 124, 31, 126, 106, 61, 127, 104, 21, 70, 40, 1, 45, 18, 95, 88, 47, 0, 122, 69, 113, 53, 119, 34, 117, 62, 92, 54, 56, 42, 37, 109, 43, 73, 35, 41, 33, 11, 38, 116, 80, 30, 97, 49, 52, 98, 85, 28, 74, 107, 112, 59, 115, 110, 102, 4, 24, 60, 87, 99, 105, 101, 14, 27, 22, 26, 20, 79, 96, 13, 9, 17, 3, 36], [106, 110, 46, 96, 22, 93, 17, 89, 39, 10, 14, 19, 76, 20, 56, 71, 38, 120, 0, 11, 24, 45, 111, 42, 1, 123, 3, 52, 69, 66, 26, 28, 13, 103, 119, 58, 16, 5, 8, 49, 79, 37, 92, 18, 121, 116, 48, 107, 54, 43, 99, 114, 70, 105, 57, 127, 68, 122, 61, 85, 113, 60, 50, 90, 47, 73, 63, 55, 40, 36, 126, 117, 62, 109, 35, 53, 125, 124, 100, 112, 118, 30, 65, 102, 27, 115, 108, 4, 44, 104, 67, 23, 80, 82, 6, 101, 31, 51, 98, 34, 59, 97, 29, 9, 84, 64, 91, 33, 41, 94, 86, 7, 72, 21, 95, 74, 88, 2, 87, 77, 78, 12, 75, 15, 83, 25, 32, 81], [45, 39, 98, 109, 0, 20, 80, 18, 72, 12, 14, 24, 79, 75, 71, 66, 74, 5, 4, 70, 65, 9, 3, 112, 1, 67, 77, 69, 22, 92, 19, 64, 113, 123, 60, 120, 85, 91, 59, 86, 89, 23, 58, 48, 88, 17, 73, 6, 34, 78, 114, 25, 10, 90, 21, 27, 55, 63, 108, 13, 30, 122, 28, 47, 96, 126, 29, 119, 104, 33, 61, 105, 93, 83, 95, 35, 115, 31, 57, 11, 100, 51, 8, 54, 50, 38, 2, 121, 94, 111, 110, 49, 97, 26, 125, 81, 32, 87, 124, 7, 40, 46, 56, 52, 68, 101, 127, 42, 44, 117, 106, 116, 107, 118, 36, 43, 37, 41, 62, 102, 99, 53, 16, 76, 82, 84, 15, 103], [113, 124, 102, 56, 49, 94, 84, 82, 15, 76, 86, 81, 74, 77, 64, 5, 71, 8, 89, 73, 1, 66, 69, 39, 67, 122, 26, 91, 4, 75, 2, 51, 68, 70, 25, 65, 3, 72, 12, 24, 13, 38, 30, 6, 22, 23, 20, 59, 33, 79, 14, 83, 78, 90, 32, 80, 87, 7, 27, 9, 29, 88, 96, 10, 60, 99, 19, 16, 28, 21, 85, 18, 0, 11, 105, 97, 93, 17, 31, 95, 92, 103, 114, 35, 58, 125, 54, 47, 117, 43, 46, 104, 45, 40, 107, 118, 112, 36, 108, 109, 116, 115, 110, 126, 42, 62, 48, 111, 63, 98, 101, 55, 50, 53, 57, 44, 61, 127, 52, 106, 41, 34, 37, 121, 119, 120, 123, 100], [104, 116, 34, 111, 64, 20, 11, 53, 81, 86, 78, 51, 8, 16, 9, 7, 13, 121, 96, 3, 113, 2, 58, 62, 68, 69, 109, 60, 110, 122, 42, 50, 117, 56, 41, 89, 46, 127, 30, 49, 1, 126, 90, 108, 21, 98, 67, 39, 6, 105, 40, 63, 35, 37, 88, 83, 82, 32, 27, 124, 125, 65, 92, 44, 15, 57, 76, 107, 118, 61, 120, 54, 36, 74, 29, 0, 87, 70, 72, 28, 24, 31, 23, 25, 99, 93, 59, 106, 119, 47, 85, 91, 5, 43, 103, 77, 94, 33, 112, 97, 18, 55, 26, 45, 22, 38, 95, 115, 114, 19, 101, 123, 12, 52, 84, 100, 48, 71, 102, 66, 17, 79, 10, 75, 14, 4, 73, 80], [102, 119, 118, 22, 97, 25, 28, 83, 84, 78, 95, 12, 64, 74, 94, 71, 16, 18, 52, 79, 88, 30, 68, 125, 106, 111, 73, 38, 124, 46, 13, 91, 54, 45, 127, 66, 109, 6, 116, 17, 48, 104, 62, 43, 51, 58, 40, 53, 61, 57, 126, 90, 59, 123, 117, 67, 65, 1, 120, 121, 113, 44, 93, 3, 60, 50, 11, 56, 49, 69, 103, 108, 21, 47, 37, 112, 81, 19, 99, 42, 115, 122, 23, 63, 35, 82, 114, 41, 75, 2, 105, 20, 39, 26, 100, 70, 110, 55, 36, 87, 34, 32, 72, 15, 107, 101, 4, 80, 0, 98, 7, 5, 31, 77, 85, 96, 8, 27, 29, 92, 76, 86, 9, 89, 10, 24, 14, 33], [103, 57, 52, 56, 31, 58, 21, 88, 18, 90, 117, 59, 79, 98, 28, 14, 111, 72, 89, 75, 4, 120, 118, 25, 60, 12, 116, 110, 126, 74, 64, 85, 115, 55, 50, 62, 63, 124, 121, 66, 70, 46, 32, 112, 125, 49, 123, 65, 109, 39, 108, 105, 51, 53, 114, 48, 43, 73, 54, 34, 44, 92, 127, 27, 35, 113, 119, 81, 106, 19, 107, 102, 45, 47, 86, 84, 101, 20, 61, 122, 96, 7, 97, 104, 87, 77, 76, 91, 29, 41, 100, 13, 40, 82, 42, 71, 11, 94, 26, 16, 33, 6, 38, 95, 67, 3, 99, 36, 69, 37, 24, 80, 78, 30, 93, 1, 9, 68, 5, 8, 17, 15, 10, 23, 22, 83, 0, 2], [102, 50, 57, 86, 48, 33, 92, 18, 79, 76, 81, 23, 72, 6, 88, 83, 112, 14, 90, 11, 0, 93, 100, 30, 3, 10, 111, 31, 53, 58, 77, 124, 68, 73, 119, 61, 94, 117, 114, 2, 16, 63, 21, 118, 52, 36, 29, 54, 125, 38, 35, 99, 56, 115, 120, 9, 59, 65, 121, 108, 123, 105, 41, 98, 47, 113, 126, 127, 69, 55, 25, 45, 62, 51, 46, 24, 109, 60, 116, 20, 107, 110, 49, 85, 43, 32, 44, 66, 5, 96, 34, 122, 89, 106, 75, 42, 104, 1, 103, 91, 27, 39, 40, 101, 26, 37, 64, 4, 71, 95, 82, 84, 12, 78, 67, 80, 22, 15, 87, 19, 7, 8, 74, 28, 17, 13, 70, 97]], "model.layers.6.self_attn.qk_proj": [[57, 56, 118, 119, 45, 50, 55, 124, 113, 109, 48, 49, 38, 46, 44, 106, 22, 104, 52, 116, 102, 39, 108, 86, 20, 84, 58, 17, 110, 42, 30, 81, 25, 111, 34, 79, 15, 78, 76, 82, 98, 18, 88, 94, 89, 24, 10, 14, 32, 12, 112, 7, 13, 74, 0, 103, 40, 77, 117, 114, 51, 29, 72, 120, 8, 53, 75, 64, 16, 71, 11, 96, 90, 19, 28, 80, 67, 62, 31, 60, 87, 26, 121, 33, 3, 92, 63, 69, 70, 123, 127, 23, 83, 73, 97, 5, 66, 2, 9, 59, 126, 122, 68, 27, 61, 41, 47, 4, 93, 100, 21, 43, 95, 36, 115, 125, 54, 107, 1, 91, 65, 99, 85, 105, 35, 6, 101, 37], [57, 56, 119, 118, 45, 50, 113, 55, 124, 109, 49, 48, 38, 46, 44, 106, 104, 22, 116, 52, 102, 39, 84, 86, 108, 20, 58, 110, 17, 42, 111, 81, 30, 25, 103, 34, 89, 94, 79, 82, 78, 98, 32, 15, 18, 76, 112, 14, 117, 24, 88, 12, 51, 10, 40, 74, 29, 7, 53, 75, 77, 120, 16, 28, 71, 72, 60, 11, 8, 64, 13, 114, 90, 80, 0, 96, 19, 121, 26, 127, 59, 97, 83, 87, 66, 31, 69, 2, 23, 62, 92, 4, 5, 33, 9, 122, 73, 123, 126, 27, 68, 67, 63, 1, 70, 100, 54, 61, 93, 125, 95, 3, 47, 43, 65, 21, 41, 91, 107, 6, 115, 99, 85, 36, 105, 35, 37, 101], [57, 56, 119, 118, 45, 113, 50, 55, 109, 48, 124, 49, 38, 46, 44, 102, 22, 106, 104, 39, 52, 116, 86, 84, 108, 20, 58, 110, 42, 17, 30, 25, 81, 89, 111, 98, 79, 94, 15, 34, 18, 82, 88, 14, 103, 32, 78, 76, 74, 0, 120, 13, 77, 75, 40, 112, 10, 24, 51, 71, 117, 12, 8, 7, 53, 60, 64, 114, 96, 11, 28, 72, 16, 90, 5, 29, 67, 31, 80, 26, 19, 83, 92, 69, 66, 2, 62, 3, 123, 97, 23, 68, 87, 73, 121, 9, 59, 127, 4, 122, 27, 93, 100, 63, 33, 54, 126, 6, 61, 1, 115, 70, 95, 43, 107, 21, 85, 41, 65, 91, 47, 125, 99, 105, 35, 36, 37, 101], [57, 56, 119, 118, 45, 50, 113, 55, 124, 48, 109, 49, 38, 46, 44, 106, 104, 102, 52, 116, 39, 22, 86, 108, 84, 20, 58, 42, 110, 17, 30, 81, 89, 18, 79, 34, 103, 25, 98, 94, 15, 111, 14, 78, 32, 82, 64, 112, 13, 117, 88, 76, 10, 51, 40, 24, 12, 8, 74, 77, 0, 3, 71, 120, 11, 60, 7, 28, 114, 53, 67, 96, 127, 90, 66, 123, 75, 121, 80, 16, 29, 31, 5, 62, 83, 72, 26, 19, 2, 69, 92, 73, 97, 33, 122, 23, 63, 59, 87, 4, 6, 27, 126, 100, 9, 54, 68, 93, 115, 1, 61, 95, 65, 21, 41, 107, 43, 70, 47, 91, 85, 125, 37, 105, 36, 99, 35, 101], [57, 56, 118, 119, 45, 50, 55, 113, 48, 124, 109, 38, 49, 46, 44, 106, 104, 52, 116, 102, 39, 22, 108, 86, 20, 84, 42, 58, 17, 110, 81, 34, 78, 30, 89, 111, 79, 15, 14, 94, 76, 18, 32, 12, 98, 112, 74, 10, 25, 8, 71, 82, 103, 88, 64, 51, 7, 11, 40, 114, 77, 13, 120, 0, 72, 24, 96, 53, 16, 28, 75, 29, 62, 80, 69, 117, 83, 66, 19, 90, 3, 92, 127, 5, 4, 60, 2, 6, 123, 26, 63, 73, 9, 67, 68, 31, 121, 59, 100, 65, 33, 23, 97, 87, 122, 126, 93, 115, 27, 41, 54, 95, 1, 61, 105, 43, 91, 47, 36, 107, 125, 85, 37, 70, 35, 99, 21, 101], [57, 56, 119, 118, 45, 50, 55, 113, 124, 109, 48, 49, 46, 38, 44, 106, 104, 52, 116, 102, 39, 22, 86, 84, 108, 20, 58, 110, 17, 42, 81, 34, 79, 82, 111, 30, 25, 14, 98, 12, 78, 15, 76, 8, 74, 89, 10, 94, 103, 18, 13, 112, 88, 11, 7, 40, 51, 16, 32, 77, 117, 24, 71, 114, 0, 72, 64, 3, 53, 80, 120, 75, 29, 123, 60, 28, 127, 69, 5, 90, 96, 19, 26, 83, 97, 23, 31, 4, 68, 6, 73, 67, 66, 122, 59, 92, 62, 121, 9, 100, 87, 126, 63, 2, 33, 93, 27, 47, 41, 54, 107, 115, 95, 1, 61, 65, 43, 36, 21, 125, 85, 105, 99, 91, 70, 37, 101, 35], [57, 56, 119, 118, 45, 50, 113, 55, 124, 109, 48, 49, 38, 46, 44, 104, 106, 52, 102, 22, 116, 39, 86, 20, 84, 108, 17, 58, 81, 30, 110, 42, 82, 79, 89, 25, 40, 10, 78, 15, 18, 13, 12, 77, 98, 76, 34, 74, 51, 14, 94, 111, 8, 103, 16, 88, 11, 32, 112, 24, 72, 7, 96, 117, 75, 80, 120, 19, 90, 114, 71, 53, 29, 60, 59, 28, 83, 26, 127, 92, 73, 61, 63, 97, 62, 64, 123, 122, 121, 0, 33, 100, 9, 87, 31, 69, 23, 67, 5, 27, 126, 2, 4, 68, 93, 6, 3, 66, 47, 125, 107, 21, 41, 54, 91, 43, 85, 105, 95, 1, 99, 115, 65, 36, 37, 101, 70, 35], [57, 56, 119, 118, 45, 50, 124, 113, 55, 109, 48, 49, 46, 38, 106, 44, 104, 52, 39, 116, 102, 22, 86, 84, 20, 108, 17, 58, 81, 89, 42, 110, 25, 78, 30, 34, 82, 14, 79, 15, 12, 18, 111, 74, 98, 77, 88, 103, 76, 10, 40, 13, 51, 8, 11, 16, 80, 94, 75, 117, 32, 24, 71, 112, 120, 19, 72, 114, 7, 28, 83, 53, 64, 96, 92, 121, 59, 73, 69, 31, 29, 5, 0, 87, 62, 90, 60, 97, 66, 9, 26, 100, 126, 27, 23, 122, 2, 123, 33, 63, 68, 4, 47, 3, 67, 127, 54, 21, 6, 91, 61, 93, 85, 125, 43, 95, 65, 70, 107, 41, 35, 1, 115, 105, 36, 99, 37, 101], [57, 56, 119, 118, 45, 50, 55, 124, 113, 109, 49, 48, 38, 46, 44, 106, 104, 52, 116, 102, 39, 22, 86, 84, 20, 108, 58, 81, 17, 42, 25, 110, 89, 78, 34, 82, 30, 14, 111, 79, 18, 32, 103, 98, 15, 94, 40, 10, 12, 88, 77, 80, 13, 74, 76, 24, 8, 112, 117, 29, 11, 75, 83, 53, 19, 114, 28, 51, 96, 72, 120, 90, 7, 71, 121, 16, 62, 92, 31, 87, 64, 26, 59, 127, 23, 97, 100, 66, 9, 67, 122, 60, 5, 123, 61, 33, 27, 73, 63, 3, 4, 68, 0, 126, 69, 2, 54, 93, 21, 125, 91, 95, 6, 47, 85, 65, 70, 41, 99, 35, 1, 43, 37, 115, 105, 107, 36, 101], [57, 56, 118, 119, 50, 45, 55, 113, 109, 124, 48, 49, 38, 46, 44, 116, 106, 104, 102, 52, 22, 39, 84, 86, 20, 108, 58, 81, 42, 17, 110, 30, 25, 82, 79, 111, 14, 15, 78, 76, 18, 89, 98, 34, 94, 103, 51, 77, 40, 32, 88, 10, 127, 12, 29, 24, 7, 74, 11, 112, 80, 117, 28, 8, 90, 13, 96, 72, 75, 53, 87, 16, 83, 60, 71, 19, 114, 120, 26, 31, 23, 0, 5, 123, 66, 92, 59, 64, 97, 62, 69, 73, 67, 100, 121, 68, 9, 33, 61, 27, 4, 126, 122, 63, 2, 3, 70, 54, 93, 47, 41, 125, 43, 91, 21, 85, 95, 107, 1, 105, 65, 115, 6, 35, 99, 37, 101, 36], [57, 56, 119, 118, 45, 50, 113, 55, 124, 109, 48, 38, 49, 44, 106, 46, 52, 104, 102, 116, 39, 22, 86, 108, 84, 20, 58, 42, 110, 17, 81, 34, 111, 30, 15, 14, 78, 25, 76, 98, 79, 103, 82, 18, 89, 94, 10, 0, 32, 88, 12, 64, 112, 7, 72, 71, 11, 13, 24, 74, 40, 114, 51, 117, 96, 53, 77, 8, 19, 28, 2, 70, 75, 5, 80, 66, 69, 31, 120, 127, 16, 62, 26, 123, 3, 92, 63, 90, 29, 83, 67, 60, 73, 68, 23, 9, 97, 121, 41, 4, 87, 122, 59, 93, 100, 61, 65, 47, 54, 33, 126, 27, 1, 95, 115, 107, 85, 125, 43, 21, 91, 105, 6, 36, 35, 99, 37, 101], [57, 56, 119, 118, 45, 50, 113, 55, 124, 109, 48, 38, 49, 44, 46, 106, 104, 52, 39, 116, 102, 22, 84, 86, 108, 20, 58, 42, 110, 17, 81, 111, 98, 34, 82, 30, 79, 25, 78, 15, 14, 89, 94, 32, 18, 103, 13, 88, 12, 76, 51, 40, 74, 72, 10, 7, 71, 24, 112, 80, 77, 53, 67, 75, 64, 0, 123, 3, 11, 121, 8, 5, 16, 29, 19, 117, 62, 127, 28, 92, 68, 120, 96, 114, 69, 83, 9, 31, 90, 122, 4, 59, 66, 60, 2, 70, 26, 87, 63, 97, 73, 100, 41, 27, 33, 23, 1, 65, 95, 93, 54, 47, 126, 61, 91, 107, 125, 115, 21, 85, 36, 105, 6, 99, 37, 43, 35, 101], [57, 56, 118, 119, 50, 45, 55, 124, 113, 48, 109, 49, 46, 38, 44, 104, 106, 116, 52, 102, 22, 39, 84, 86, 20, 108, 17, 58, 81, 110, 30, 42, 25, 89, 82, 111, 15, 79, 18, 78, 34, 14, 94, 12, 74, 10, 98, 32, 76, 103, 13, 72, 40, 88, 77, 24, 11, 29, 16, 7, 51, 80, 117, 8, 83, 75, 112, 71, 114, 53, 96, 120, 92, 122, 28, 97, 121, 5, 100, 90, 59, 60, 19, 127, 26, 31, 9, 87, 123, 27, 126, 69, 73, 23, 66, 4, 41, 68, 62, 33, 2, 64, 3, 0, 93, 61, 54, 63, 47, 91, 125, 115, 43, 107, 85, 67, 21, 70, 95, 65, 6, 35, 37, 99, 1, 36, 105, 101], [57, 56, 119, 118, 45, 55, 50, 124, 113, 109, 48, 49, 46, 38, 44, 106, 52, 104, 116, 22, 102, 39, 86, 84, 108, 20, 58, 17, 110, 81, 42, 34, 82, 78, 25, 18, 12, 30, 72, 79, 15, 103, 111, 74, 98, 14, 10, 0, 32, 89, 76, 88, 24, 112, 71, 77, 7, 51, 8, 94, 64, 40, 13, 16, 75, 117, 11, 123, 80, 5, 29, 83, 53, 19, 114, 62, 69, 60, 26, 96, 127, 28, 31, 92, 9, 126, 90, 59, 120, 122, 2, 66, 73, 23, 121, 33, 54, 63, 68, 4, 87, 97, 3, 61, 67, 100, 70, 27, 21, 6, 47, 93, 115, 43, 95, 91, 41, 125, 1, 65, 85, 105, 107, 36, 35, 99, 37, 101], [57, 56, 118, 119, 45, 113, 50, 124, 55, 109, 48, 49, 38, 46, 44, 106, 104, 52, 116, 39, 102, 86, 22, 84, 108, 20, 42, 58, 17, 81, 110, 30, 18, 82, 79, 89, 25, 78, 34, 14, 98, 15, 94, 111, 12, 103, 72, 32, 10, 74, 76, 71, 40, 77, 112, 88, 7, 24, 11, 51, 16, 8, 96, 117, 26, 5, 28, 29, 62, 13, 3, 75, 53, 83, 64, 31, 114, 0, 90, 19, 69, 120, 80, 127, 73, 66, 60, 121, 59, 92, 67, 2, 123, 100, 9, 97, 23, 68, 126, 87, 6, 63, 93, 54, 4, 122, 27, 33, 41, 115, 43, 65, 21, 95, 47, 1, 85, 61, 36, 35, 107, 70, 91, 125, 37, 105, 99, 101], [57, 56, 119, 118, 45, 50, 55, 124, 113, 48, 109, 38, 49, 44, 46, 106, 52, 104, 116, 102, 39, 22, 86, 108, 84, 20, 58, 42, 17, 110, 81, 34, 30, 111, 78, 25, 98, 15, 94, 14, 79, 82, 10, 74, 89, 76, 18, 12, 103, 117, 24, 77, 72, 88, 32, 112, 40, 7, 71, 11, 16, 8, 120, 26, 96, 51, 28, 75, 80, 29, 13, 64, 83, 121, 19, 114, 0, 92, 127, 69, 59, 53, 90, 3, 123, 31, 97, 47, 67, 62, 60, 73, 5, 122, 93, 126, 4, 87, 23, 100, 9, 68, 2, 66, 6, 41, 33, 63, 27, 54, 115, 43, 21, 95, 61, 125, 65, 1, 107, 35, 99, 105, 91, 70, 85, 36, 37, 101], [57, 56, 118, 119, 50, 45, 113, 124, 55, 109, 48, 38, 46, 49, 44, 116, 22, 106, 104, 52, 102, 39, 86, 84, 108, 20, 17, 42, 58, 110, 81, 30, 25, 18, 89, 15, 111, 77, 98, 78, 79, 34, 74, 82, 14, 10, 12, 94, 76, 11, 88, 40, 24, 7, 13, 16, 72, 51, 120, 117, 32, 103, 8, 75, 80, 53, 28, 71, 114, 112, 122, 123, 19, 26, 96, 29, 60, 92, 31, 0, 87, 127, 83, 5, 69, 126, 97, 23, 90, 62, 73, 66, 63, 59, 121, 9, 4, 33, 61, 64, 100, 68, 21, 27, 6, 47, 43, 54, 115, 107, 3, 2, 67, 93, 95, 41, 105, 125, 99, 35, 91, 1, 65, 36, 85, 37, 70, 101], [57, 56, 118, 119, 45, 50, 113, 124, 55, 109, 48, 46, 49, 38, 44, 106, 52, 104, 116, 22, 39, 102, 86, 84, 20, 108, 58, 42, 110, 17, 81, 89, 30, 82, 25, 18, 14, 78, 15, 98, 34, 74, 12, 10, 79, 111, 94, 72, 88, 40, 117, 8, 32, 13, 75, 76, 103, 24, 77, 7, 53, 28, 51, 112, 80, 71, 11, 16, 60, 31, 83, 64, 96, 19, 121, 5, 90, 114, 120, 126, 87, 122, 69, 26, 62, 66, 29, 92, 9, 3, 73, 23, 97, 123, 0, 127, 100, 59, 63, 67, 68, 33, 2, 61, 6, 27, 4, 47, 115, 54, 43, 21, 93, 91, 95, 41, 1, 85, 107, 65, 125, 70, 105, 99, 35, 37, 36, 101], [57, 56, 119, 118, 45, 50, 113, 55, 124, 109, 48, 38, 49, 46, 44, 106, 102, 52, 104, 116, 39, 22, 86, 84, 108, 20, 58, 42, 17, 110, 30, 81, 89, 94, 15, 34, 18, 82, 98, 14, 111, 12, 76, 88, 78, 79, 25, 103, 40, 10, 74, 8, 0, 51, 24, 112, 71, 7, 77, 72, 11, 32, 64, 96, 117, 31, 13, 75, 28, 26, 100, 60, 90, 5, 16, 80, 114, 29, 69, 121, 122, 66, 120, 3, 92, 53, 83, 19, 2, 127, 97, 23, 62, 87, 123, 33, 9, 73, 59, 4, 68, 126, 27, 67, 63, 41, 61, 6, 93, 95, 105, 54, 115, 70, 65, 1, 47, 91, 21, 37, 85, 99, 35, 43, 107, 36, 125, 101], [57, 56, 118, 119, 50, 45, 55, 124, 113, 48, 109, 49, 38, 46, 44, 106, 52, 116, 102, 104, 22, 39, 86, 84, 108, 20, 58, 17, 110, 42, 30, 81, 14, 34, 89, 98, 82, 76, 78, 18, 10, 74, 25, 94, 15, 8, 117, 88, 12, 111, 103, 32, 79, 112, 51, 71, 40, 11, 75, 13, 24, 72, 120, 77, 19, 123, 7, 29, 26, 96, 16, 126, 80, 114, 83, 28, 90, 127, 0, 60, 53, 5, 69, 121, 66, 73, 31, 4, 64, 122, 61, 97, 92, 68, 59, 100, 3, 62, 87, 2, 23, 33, 9, 63, 47, 70, 27, 93, 67, 43, 95, 21, 54, 41, 65, 1, 115, 6, 107, 85, 105, 91, 35, 125, 36, 99, 37, 101], [57, 56, 118, 119, 45, 50, 124, 55, 113, 48, 109, 49, 44, 38, 46, 106, 52, 104, 116, 22, 102, 39, 86, 108, 84, 20, 58, 17, 42, 110, 81, 30, 14, 98, 89, 18, 111, 79, 25, 88, 34, 78, 8, 15, 32, 82, 12, 94, 76, 10, 74, 103, 77, 112, 24, 40, 71, 117, 13, 120, 67, 16, 75, 114, 64, 51, 80, 11, 7, 28, 62, 3, 96, 72, 123, 60, 19, 92, 90, 31, 83, 29, 87, 26, 53, 127, 126, 5, 0, 121, 69, 97, 73, 66, 61, 63, 4, 47, 70, 115, 122, 33, 41, 9, 59, 93, 100, 27, 68, 107, 2, 23, 95, 54, 91, 125, 43, 21, 36, 85, 1, 6, 105, 65, 99, 37, 35, 101], [57, 56, 118, 119, 45, 50, 124, 55, 113, 109, 48, 49, 38, 44, 46, 52, 106, 104, 102, 116, 39, 22, 86, 108, 84, 20, 58, 42, 17, 110, 81, 30, 34, 15, 98, 25, 94, 79, 18, 89, 14, 111, 78, 82, 12, 103, 88, 74, 40, 24, 8, 112, 10, 117, 76, 77, 51, 32, 120, 71, 7, 16, 75, 64, 13, 11, 96, 0, 28, 60, 80, 26, 66, 90, 114, 72, 69, 53, 59, 5, 83, 19, 29, 92, 3, 126, 87, 97, 121, 62, 2, 31, 122, 123, 127, 9, 23, 63, 27, 73, 33, 100, 70, 67, 4, 43, 68, 61, 47, 93, 41, 95, 115, 107, 105, 1, 54, 91, 125, 21, 85, 36, 37, 65, 6, 99, 35, 101], [57, 56, 118, 119, 45, 50, 55, 124, 113, 48, 109, 49, 38, 46, 44, 52, 106, 116, 104, 102, 22, 39, 20, 108, 86, 84, 58, 42, 17, 110, 30, 81, 89, 25, 34, 18, 88, 79, 32, 78, 94, 14, 15, 12, 82, 98, 74, 111, 8, 10, 112, 13, 51, 103, 76, 117, 77, 24, 71, 11, 114, 7, 120, 75, 80, 40, 16, 19, 72, 28, 87, 83, 59, 29, 26, 0, 126, 62, 96, 92, 97, 69, 90, 127, 31, 121, 5, 64, 60, 3, 122, 53, 9, 73, 66, 123, 33, 4, 68, 100, 23, 2, 27, 70, 93, 67, 61, 63, 47, 43, 41, 1, 115, 95, 54, 85, 91, 125, 35, 37, 107, 105, 21, 6, 36, 65, 99, 101], [57, 56, 118, 119, 45, 50, 124, 55, 113, 109, 48, 38, 49, 46, 44, 106, 52, 116, 104, 102, 39, 22, 84, 108, 86, 20, 110, 58, 42, 17, 30, 81, 111, 98, 89, 78, 34, 25, 94, 10, 79, 88, 103, 8, 18, 74, 76, 117, 14, 12, 82, 32, 15, 51, 75, 112, 77, 24, 40, 71, 28, 7, 114, 120, 72, 26, 80, 13, 29, 96, 16, 127, 11, 3, 123, 19, 53, 83, 97, 90, 60, 87, 92, 69, 126, 121, 31, 64, 122, 23, 9, 62, 68, 5, 73, 67, 100, 33, 4, 59, 63, 27, 0, 66, 41, 47, 115, 54, 2, 70, 93, 61, 107, 105, 125, 36, 65, 95, 43, 21, 85, 91, 35, 1, 6, 99, 37, 101], [57, 56, 118, 119, 45, 50, 124, 113, 55, 109, 48, 49, 38, 46, 44, 52, 104, 102, 106, 116, 22, 39, 108, 20, 84, 86, 58, 17, 42, 81, 110, 30, 10, 89, 34, 79, 76, 18, 111, 25, 15, 98, 14, 78, 94, 82, 11, 88, 12, 74, 40, 51, 7, 103, 13, 77, 117, 112, 8, 71, 24, 32, 120, 16, 96, 72, 114, 75, 29, 80, 60, 26, 28, 90, 31, 19, 83, 53, 122, 5, 69, 92, 73, 123, 64, 9, 0, 62, 126, 100, 97, 121, 23, 59, 127, 63, 33, 87, 68, 2, 4, 67, 3, 41, 47, 66, 93, 115, 27, 61, 6, 125, 95, 43, 107, 70, 105, 21, 65, 1, 85, 36, 91, 54, 99, 35, 37, 101], [57, 56, 118, 119, 50, 45, 113, 124, 55, 109, 48, 49, 38, 46, 44, 106, 104, 52, 116, 102, 22, 39, 84, 86, 20, 108, 58, 17, 81, 42, 110, 111, 25, 98, 30, 89, 34, 14, 82, 76, 74, 79, 10, 15, 78, 117, 94, 18, 103, 88, 24, 72, 40, 12, 71, 120, 8, 13, 7, 77, 32, 80, 11, 75, 28, 51, 16, 112, 92, 126, 114, 64, 0, 26, 19, 96, 83, 53, 5, 29, 123, 9, 62, 23, 60, 121, 66, 122, 4, 73, 127, 90, 67, 69, 97, 100, 87, 31, 93, 59, 33, 2, 47, 6, 63, 61, 3, 91, 27, 115, 125, 95, 41, 54, 68, 21, 85, 1, 107, 105, 65, 43, 36, 70, 99, 35, 101, 37], [57, 56, 119, 118, 45, 50, 124, 113, 55, 109, 48, 49, 38, 46, 44, 104, 106, 52, 116, 22, 102, 39, 86, 84, 20, 108, 58, 17, 30, 81, 25, 42, 110, 82, 34, 98, 89, 79, 32, 15, 111, 103, 88, 76, 94, 14, 78, 24, 12, 74, 10, 40, 18, 77, 51, 75, 72, 112, 13, 120, 7, 29, 19, 90, 71, 16, 92, 114, 117, 96, 80, 28, 64, 8, 0, 83, 11, 67, 3, 62, 97, 59, 53, 121, 87, 63, 73, 60, 26, 31, 126, 5, 122, 33, 66, 4, 6, 23, 2, 69, 47, 61, 68, 100, 123, 127, 41, 9, 93, 125, 27, 65, 91, 43, 95, 107, 85, 21, 54, 1, 105, 99, 115, 36, 70, 35, 37, 101], [57, 56, 119, 118, 45, 50, 124, 55, 113, 109, 48, 49, 46, 38, 44, 52, 106, 116, 104, 39, 22, 102, 84, 108, 20, 86, 58, 17, 42, 110, 81, 34, 30, 15, 111, 79, 89, 10, 25, 98, 82, 78, 12, 18, 14, 72, 76, 74, 103, 112, 8, 7, 51, 24, 94, 88, 13, 75, 120, 40, 117, 64, 71, 77, 11, 0, 80, 32, 16, 53, 60, 114, 5, 62, 2, 19, 28, 29, 90, 69, 123, 83, 73, 67, 121, 4, 26, 97, 68, 126, 92, 66, 96, 23, 122, 9, 31, 63, 127, 6, 33, 61, 47, 54, 3, 59, 93, 65, 87, 100, 1, 27, 95, 43, 41, 115, 125, 107, 91, 105, 85, 21, 70, 36, 37, 99, 101, 35], [57, 56, 119, 118, 45, 50, 113, 55, 124, 48, 109, 38, 49, 46, 44, 106, 104, 102, 52, 116, 22, 39, 108, 86, 20, 84, 42, 58, 110, 111, 17, 30, 81, 34, 94, 14, 98, 15, 112, 25, 72, 103, 79, 7, 18, 76, 89, 88, 10, 12, 32, 120, 78, 24, 74, 82, 114, 40, 51, 117, 123, 28, 71, 80, 11, 96, 75, 8, 77, 0, 64, 13, 26, 5, 53, 29, 16, 60, 69, 83, 2, 31, 67, 92, 90, 62, 126, 122, 121, 127, 9, 97, 47, 66, 19, 87, 33, 4, 73, 3, 63, 100, 59, 93, 41, 6, 23, 27, 54, 68, 107, 65, 115, 61, 36, 95, 105, 1, 21, 91, 70, 99, 125, 35, 85, 43, 101, 37], [57, 56, 119, 118, 45, 55, 124, 50, 113, 48, 109, 49, 38, 46, 44, 106, 104, 52, 116, 102, 39, 22, 108, 86, 84, 20, 110, 58, 42, 17, 81, 30, 111, 14, 72, 34, 98, 25, 76, 79, 15, 18, 10, 89, 94, 103, 112, 114, 78, 74, 88, 82, 12, 13, 7, 75, 32, 117, 24, 71, 80, 77, 40, 16, 3, 8, 53, 120, 11, 51, 96, 64, 0, 28, 63, 19, 29, 83, 5, 31, 123, 60, 4, 126, 69, 67, 73, 62, 26, 127, 66, 92, 122, 97, 90, 115, 9, 121, 23, 2, 59, 33, 107, 54, 47, 87, 27, 41, 68, 93, 100, 6, 70, 1, 95, 125, 65, 61, 105, 21, 85, 43, 99, 91, 36, 35, 37, 101], [57, 56, 119, 118, 45, 50, 113, 55, 124, 109, 48, 49, 38, 46, 44, 106, 104, 116, 52, 39, 102, 22, 84, 108, 86, 20, 110, 58, 81, 17, 42, 34, 30, 25, 15, 98, 111, 18, 82, 103, 72, 14, 79, 94, 78, 12, 76, 74, 51, 89, 88, 13, 71, 112, 10, 7, 24, 120, 40, 32, 11, 77, 80, 60, 0, 117, 114, 64, 75, 66, 16, 83, 123, 5, 8, 69, 96, 26, 68, 53, 63, 62, 29, 28, 87, 19, 90, 31, 127, 122, 9, 2, 4, 92, 97, 73, 67, 59, 47, 23, 33, 100, 61, 115, 70, 121, 3, 27, 41, 126, 93, 21, 107, 54, 6, 65, 125, 95, 91, 36, 1, 85, 43, 105, 37, 99, 35, 101], [57, 56, 118, 119, 45, 50, 113, 55, 124, 109, 49, 48, 38, 46, 44, 106, 104, 52, 116, 39, 102, 22, 20, 86, 108, 84, 58, 81, 17, 42, 110, 30, 25, 34, 111, 82, 14, 18, 79, 98, 12, 72, 76, 94, 10, 15, 89, 88, 117, 78, 51, 74, 32, 40, 103, 112, 75, 7, 8, 71, 80, 16, 24, 13, 114, 77, 60, 11, 120, 83, 28, 19, 53, 31, 96, 90, 69, 9, 0, 26, 5, 87, 62, 59, 73, 29, 97, 92, 23, 123, 66, 126, 121, 64, 3, 127, 70, 122, 68, 4, 27, 33, 2, 115, 47, 67, 61, 63, 100, 41, 93, 54, 65, 43, 125, 21, 91, 95, 1, 105, 6, 85, 107, 36, 99, 37, 101, 35]], "model.layers.7.self_attn.q_proj": [[38, 109, 43, 89, 45, 18, 93, 60, 107, 78, 12, 22, 10, 8, 17, 5, 19, 77, 80, 79, 56, 24, 91, 71, 68, 72, 3, 92, 26, 115, 25, 51, 20, 23, 57, 69, 15, 50, 81, 4, 58, 65, 102, 14, 127, 86, 97, 125, 27, 70, 34, 126, 28, 90, 88, 32, 67, 108, 30, 76, 9, 122, 29, 111, 123, 1, 6, 13, 124, 116, 74, 110, 82, 16, 63, 11, 96, 33, 75, 117, 55, 113, 52, 83, 87, 62, 35, 112, 94, 95, 7, 59, 100, 73, 119, 106, 40, 85, 54, 21, 0, 48, 44, 61, 114, 39, 105, 47, 31, 49, 120, 103, 53, 99, 36, 118, 84, 121, 41, 37, 66, 2, 98, 46, 101, 64, 104, 42], [38, 109, 43, 60, 10, 12, 19, 77, 5, 80, 93, 45, 18, 68, 78, 71, 107, 25, 22, 89, 8, 1, 73, 69, 79, 17, 3, 70, 29, 125, 65, 51, 88, 72, 87, 23, 0, 16, 14, 66, 122, 20, 4, 56, 74, 94, 26, 91, 84, 15, 32, 28, 86, 85, 7, 115, 21, 34, 24, 11, 76, 82, 6, 30, 90, 83, 92, 81, 58, 75, 116, 102, 97, 2, 13, 120, 123, 27, 95, 64, 126, 9, 100, 35, 33, 67, 98, 31, 96, 52, 46, 36, 111, 108, 47, 39, 127, 105, 99, 118, 62, 117, 112, 110, 59, 49, 55, 37, 106, 50, 54, 63, 104, 61, 103, 101, 57, 40, 114, 113, 119, 41, 48, 124, 53, 42, 44, 121], [38, 109, 43, 10, 67, 19, 93, 77, 45, 107, 80, 78, 12, 65, 6, 79, 23, 4, 5, 89, 7, 71, 3, 72, 0, 69, 25, 68, 8, 74, 1, 22, 60, 20, 14, 17, 18, 76, 16, 21, 51, 28, 75, 13, 115, 64, 88, 9, 11, 70, 73, 90, 84, 102, 29, 57, 126, 2, 82, 15, 86, 83, 87, 125, 24, 66, 34, 30, 91, 26, 35, 85, 120, 92, 94, 32, 81, 56, 55, 122, 58, 116, 98, 95, 52, 62, 31, 111, 97, 106, 50, 27, 124, 96, 118, 123, 48, 110, 40, 47, 36, 127, 99, 33, 108, 112, 113, 49, 54, 37, 117, 39, 63, 104, 46, 100, 105, 42, 103, 59, 121, 44, 53, 101, 61, 114, 41, 119], [109, 43, 60, 45, 107, 123, 34, 50, 58, 62, 115, 56, 35, 113, 110, 57, 122, 48, 63, 108, 98, 126, 47, 127, 112, 124, 119, 55, 111, 117, 49, 121, 59, 53, 114, 52, 118, 54, 39, 125, 61, 116, 44, 51, 37, 120, 106, 46, 92, 40, 42, 36, 105, 41, 103, 104, 33, 94, 38, 101, 100, 31, 97, 99, 86, 82, 95, 29, 87, 96, 16, 22, 30, 81, 78, 89, 32, 91, 84, 28, 27, 10, 93, 76, 77, 20, 85, 13, 8, 83, 24, 11, 80, 88, 23, 19, 21, 102, 25, 90, 79, 14, 17, 71, 74, 26, 18, 5, 68, 15, 9, 12, 70, 73, 66, 1, 2, 3, 72, 7, 4, 69, 6, 75, 0, 67, 65, 64], [105, 97, 118, 23, 90, 84, 79, 82, 117, 13, 93, 33, 47, 9, 54, 111, 29, 41, 10, 11, 71, 85, 20, 22, 25, 15, 17, 26, 18, 74, 56, 83, 89, 77, 75, 87, 121, 80, 21, 91, 19, 81, 86, 116, 7, 6, 92, 76, 32, 38, 30, 95, 88, 27, 3, 78, 5, 49, 48, 53, 16, 12, 69, 52, 94, 24, 31, 96, 112, 8, 108, 73, 61, 57, 46, 43, 102, 124, 4, 126, 58, 35, 100, 113, 99, 14, 28, 68, 42, 110, 107, 101, 123, 115, 55, 63, 37, 106, 119, 39, 62, 51, 103, 98, 109, 34, 67, 114, 120, 127, 60, 72, 40, 70, 125, 59, 122, 44, 45, 104, 50, 36, 1, 65, 66, 2, 0, 64], [105, 118, 64, 0, 2, 97, 1, 79, 65, 13, 66, 67, 41, 71, 4, 84, 29, 11, 26, 10, 9, 82, 69, 68, 23, 117, 70, 111, 6, 54, 87, 101, 7, 3, 76, 74, 78, 20, 8, 5, 73, 18, 12, 90, 121, 89, 75, 86, 77, 16, 93, 15, 14, 25, 38, 115, 47, 55, 17, 45, 80, 83, 37, 72, 58, 59, 48, 92, 22, 51, 39, 114, 85, 56, 49, 53, 34, 21, 81, 91, 88, 27, 120, 19, 24, 62, 46, 98, 100, 110, 124, 44, 28, 116, 96, 119, 106, 42, 94, 57, 102, 40, 61, 60, 126, 107, 127, 52, 31, 30, 103, 109, 113, 108, 122, 125, 63, 36, 43, 104, 95, 112, 123, 99, 35, 50, 32, 33], [105, 47, 118, 93, 117, 111, 90, 89, 97, 101, 116, 56, 85, 54, 23, 41, 25, 80, 33, 52, 121, 32, 96, 61, 37, 55, 24, 17, 49, 84, 31, 29, 53, 123, 21, 99, 91, 58, 98, 88, 95, 75, 35, 92, 60, 113, 34, 77, 126, 100, 12, 18, 127, 48, 30, 36, 45, 115, 108, 39, 73, 59, 112, 102, 15, 28, 43, 107, 26, 38, 27, 110, 81, 46, 114, 82, 16, 40, 103, 106, 87, 63, 94, 120, 104, 20, 62, 8, 44, 42, 122, 69, 109, 124, 7, 119, 74, 125, 51, 57, 50, 78, 11, 86, 19, 79, 83, 70, 22, 14, 67, 76, 72, 68, 13, 9, 10, 71, 5, 4, 6, 3, 66, 65, 2, 1, 0, 64], [105, 47, 118, 93, 117, 101, 90, 97, 116, 111, 54, 56, 52, 123, 89, 24, 85, 33, 55, 61, 23, 53, 58, 21, 49, 60, 121, 17, 37, 59, 75, 96, 31, 113, 115, 25, 112, 120, 106, 126, 12, 80, 114, 57, 127, 107, 48, 29, 41, 43, 77, 46, 110, 108, 45, 63, 62, 27, 124, 99, 73, 109, 40, 44, 51, 119, 42, 98, 38, 39, 87, 103, 102, 84, 50, 122, 125, 104, 32, 91, 34, 81, 26, 36, 20, 95, 16, 35, 18, 100, 30, 15, 69, 8, 7, 92, 86, 94, 28, 22, 88, 78, 74, 72, 82, 14, 70, 83, 79, 11, 10, 71, 67, 76, 19, 13, 9, 4, 68, 5, 6, 3, 66, 2, 65, 1, 64, 0], [103, 42, 34, 30, 23, 85, 106, 83, 81, 127, 70, 78, 50, 16, 39, 94, 53, 10, 41, 82, 12, 15, 4, 112, 13, 74, 20, 54, 25, 5, 8, 66, 38, 48, 19, 117, 63, 76, 93, 121, 72, 90, 88, 123, 111, 124, 24, 32, 107, 14, 118, 40, 17, 73, 55, 1, 87, 115, 75, 105, 80, 125, 21, 26, 67, 60, 29, 47, 52, 22, 101, 113, 58, 95, 36, 84, 126, 120, 11, 97, 6, 49, 100, 104, 62, 79, 59, 86, 46, 43, 45, 122, 7, 69, 102, 44, 109, 89, 33, 3, 99, 57, 77, 37, 108, 98, 27, 114, 28, 110, 18, 68, 96, 71, 119, 56, 116, 51, 35, 31, 61, 91, 65, 9, 64, 92, 0, 2], [103, 42, 34, 30, 85, 106, 23, 81, 83, 53, 50, 94, 78, 41, 127, 16, 39, 62, 15, 121, 54, 38, 44, 112, 125, 25, 12, 124, 74, 93, 63, 22, 47, 87, 13, 48, 82, 118, 111, 60, 11, 24, 5, 33, 113, 59, 109, 40, 117, 8, 55, 80, 70, 72, 107, 122, 115, 114, 123, 51, 49, 95, 32, 56, 4, 76, 110, 46, 21, 101, 45, 119, 28, 36, 58, 116, 90, 10, 57, 120, 126, 79, 29, 61, 104, 91, 108, 99, 26, 97, 92, 19, 86, 37, 20, 96, 84, 52, 102, 89, 100, 105, 43, 17, 1, 35, 75, 3, 31, 77, 14, 27, 88, 7, 98, 66, 18, 67, 6, 71, 73, 9, 64, 68, 69, 65, 0, 2], [103, 42, 34, 30, 106, 50, 85, 81, 54, 41, 23, 16, 83, 53, 112, 48, 121, 38, 44, 95, 4, 70, 94, 124, 13, 127, 20, 26, 63, 123, 125, 113, 93, 52, 60, 10, 62, 109, 59, 29, 115, 47, 78, 15, 122, 19, 76, 111, 31, 87, 8, 46, 28, 25, 45, 118, 40, 92, 82, 32, 90, 66, 58, 61, 55, 97, 110, 88, 114, 43, 105, 36, 117, 107, 74, 51, 100, 101, 119, 22, 49, 116, 37, 126, 33, 9, 80, 89, 86, 104, 99, 120, 73, 56, 57, 5, 24, 84, 21, 27, 35, 17, 96, 102, 69, 108, 67, 91, 12, 1, 79, 14, 11, 65, 75, 39, 68, 72, 18, 7, 77, 98, 71, 6, 2, 0, 3, 64], [103, 42, 34, 30, 83, 81, 85, 106, 12, 23, 50, 74, 72, 15, 78, 127, 16, 54, 69, 5, 13, 4, 113, 48, 60, 124, 8, 39, 76, 41, 1, 2, 24, 53, 80, 87, 11, 55, 17, 65, 21, 121, 94, 6, 63, 68, 3, 45, 19, 25, 62, 32, 38, 75, 126, 70, 52, 82, 93, 9, 84, 20, 125, 67, 123, 90, 14, 115, 26, 79, 44, 22, 10, 112, 104, 77, 66, 96, 95, 120, 91, 97, 117, 86, 100, 28, 98, 0, 73, 88, 71, 36, 59, 111, 89, 108, 64, 18, 29, 114, 105, 61, 7, 47, 101, 27, 99, 43, 107, 122, 31, 35, 92, 118, 51, 119, 33, 102, 58, 116, 49, 46, 109, 110, 40, 37, 57, 56], [104, 121, 100, 33, 17, 13, 21, 23, 79, 83, 90, 122, 72, 11, 8, 26, 28, 67, 112, 37, 30, 60, 59, 6, 61, 126, 45, 80, 53, 22, 115, 125, 81, 48, 109, 20, 4, 10, 51, 42, 18, 14, 46, 86, 19, 15, 40, 77, 96, 78, 34, 97, 75, 69, 99, 85, 27, 5, 1, 41, 29, 74, 39, 0, 16, 87, 94, 50, 25, 118, 111, 24, 68, 82, 12, 92, 84, 101, 73, 52, 103, 32, 117, 98, 66, 93, 95, 110, 113, 123, 47, 62, 31, 7, 107, 71, 76, 9, 70, 89, 2, 58, 108, 102, 124, 106, 55, 54, 63, 91, 88, 116, 3, 105, 64, 127, 120, 35, 49, 44, 119, 43, 57, 38, 65, 56, 114, 36], [104, 100, 121, 33, 28, 90, 21, 23, 17, 13, 109, 112, 42, 83, 115, 92, 48, 122, 30, 79, 26, 116, 18, 86, 118, 37, 61, 123, 45, 95, 117, 51, 44, 108, 39, 101, 58, 11, 120, 52, 34, 35, 107, 60, 110, 29, 124, 96, 41, 98, 25, 22, 32, 31, 82, 126, 80, 53, 16, 24, 62, 59, 27, 54, 103, 125, 43, 111, 57, 87, 119, 46, 6, 114, 36, 56, 105, 84, 106, 63, 102, 50, 85, 49, 19, 97, 55, 127, 20, 113, 99, 91, 14, 88, 93, 81, 38, 94, 10, 47, 77, 75, 72, 89, 74, 78, 12, 76, 70, 40, 67, 3, 8, 15, 69, 7, 9, 68, 5, 73, 71, 65, 2, 1, 66, 64, 4, 0], [104, 121, 33, 100, 93, 37, 79, 23, 83, 17, 21, 109, 112, 13, 8, 11, 90, 6, 67, 26, 35, 122, 59, 86, 40, 46, 3, 1, 125, 126, 71, 30, 82, 61, 45, 81, 48, 28, 42, 51, 85, 115, 60, 29, 9, 18, 4, 92, 43, 15, 52, 77, 74, 101, 25, 80, 22, 34, 65, 105, 75, 107, 19, 111, 50, 108, 10, 94, 69, 20, 78, 58, 16, 120, 39, 62, 47, 7, 119, 123, 88, 70, 96, 73, 99, 98, 31, 87, 57, 14, 32, 66, 106, 49, 102, 5, 103, 116, 110, 53, 38, 114, 76, 68, 63, 55, 124, 64, 56, 27, 113, 127, 12, 118, 84, 41, 117, 24, 91, 54, 95, 89, 44, 2, 72, 36, 97, 0], [121, 104, 100, 33, 28, 112, 122, 90, 21, 23, 83, 37, 93, 13, 116, 126, 30, 45, 92, 91, 125, 101, 79, 52, 17, 86, 115, 61, 60, 59, 46, 26, 51, 109, 35, 82, 85, 57, 48, 18, 123, 42, 118, 105, 19, 15, 97, 43, 39, 11, 54, 29, 107, 111, 103, 22, 32, 84, 31, 53, 14, 117, 95, 47, 96, 38, 120, 58, 119, 41, 108, 49, 106, 113, 110, 87, 27, 127, 99, 25, 70, 55, 34, 44, 124, 94, 6, 114, 89, 62, 36, 102, 24, 63, 75, 9, 50, 77, 98, 81, 72, 78, 20, 74, 56, 88, 10, 12, 16, 3, 80, 67, 71, 7, 76, 73, 2, 68, 8, 40, 69, 5, 66, 64, 1, 65, 4, 0], [37, 40, 105, 93, 55, 30, 124, 14, 18, 87, 63, 20, 94, 126, 48, 29, 88, 49, 104, 41, 60, 16, 117, 43, 58, 76, 89, 50, 62, 51, 57, 116, 53, 52, 39, 61, 111, 72, 119, 102, 38, 109, 100, 42, 44, 120, 59, 112, 56, 46, 122, 121, 110, 54, 114, 45, 113, 127, 123, 103, 115, 125, 27, 47, 108, 106, 118, 96, 107, 95, 86, 21, 98, 82, 26, 99, 36, 32, 33, 31, 35, 34, 23, 97, 24, 101, 69, 77, 81, 90, 25, 85, 10, 83, 92, 91, 78, 15, 28, 12, 84, 13, 74, 22, 17, 9, 80, 5, 6, 8, 79, 11, 75, 19, 1, 73, 3, 65, 71, 70, 66, 64, 2, 7, 67, 68, 4, 0], [105, 37, 40, 93, 87, 41, 18, 16, 76, 14, 20, 124, 29, 55, 69, 63, 72, 10, 57, 94, 88, 60, 49, 1, 61, 126, 15, 48, 23, 77, 117, 90, 114, 58, 27, 96, 74, 116, 89, 52, 78, 25, 12, 62, 122, 24, 81, 26, 13, 8, 30, 79, 82, 50, 102, 6, 84, 3, 68, 119, 112, 95, 86, 51, 100, 80, 104, 43, 83, 46, 17, 53, 21, 44, 91, 56, 59, 120, 71, 22, 39, 92, 19, 107, 97, 85, 45, 28, 11, 123, 4, 70, 101, 111, 38, 32, 75, 54, 5, 67, 108, 31, 121, 9, 34, 36, 65, 103, 33, 73, 35, 0, 2, 127, 7, 110, 113, 125, 98, 109, 42, 64, 115, 118, 99, 47, 66, 106], [105, 37, 40, 93, 104, 29, 55, 18, 87, 14, 16, 20, 63, 124, 76, 72, 49, 30, 60, 126, 27, 57, 94, 48, 117, 61, 88, 10, 114, 69, 77, 116, 89, 62, 50, 56, 100, 24, 90, 51, 96, 102, 119, 112, 86, 52, 15, 54, 58, 23, 127, 122, 12, 95, 8, 84, 53, 38, 79, 43, 21, 26, 92, 111, 82, 34, 108, 107, 113, 13, 32, 120, 80, 91, 25, 81, 98, 110, 28, 22, 103, 121, 44, 123, 19, 31, 39, 59, 45, 1, 115, 33, 101, 125, 47, 36, 35, 106, 46, 85, 99, 118, 42, 78, 17, 97, 109, 83, 41, 74, 6, 71, 70, 11, 3, 75, 68, 9, 5, 2, 4, 65, 66, 73, 7, 0, 64, 67], [40, 37, 105, 63, 93, 14, 16, 87, 10, 20, 18, 76, 55, 69, 29, 72, 57, 104, 30, 124, 3, 60, 41, 94, 74, 58, 61, 1, 26, 114, 88, 49, 15, 13, 23, 84, 24, 12, 126, 90, 17, 67, 95, 112, 70, 11, 71, 82, 78, 62, 116, 8, 117, 48, 52, 122, 81, 6, 22, 68, 0, 102, 7, 75, 83, 50, 77, 79, 46, 56, 98, 123, 80, 96, 4, 53, 27, 65, 89, 43, 21, 103, 86, 5, 39, 85, 119, 101, 100, 113, 51, 44, 47, 45, 25, 38, 28, 59, 92, 32, 19, 127, 111, 107, 91, 9, 120, 73, 110, 34, 64, 36, 115, 54, 108, 31, 109, 99, 35, 2, 121, 97, 118, 42, 66, 125, 33, 106], [105, 99, 89, 96, 48, 54, 83, 92, 84, 16, 14, 25, 11, 112, 22, 19, 75, 78, 72, 27, 77, 18, 80, 115, 41, 69, 81, 32, 24, 10, 36, 108, 30, 5, 44, 85, 93, 26, 13, 88, 104, 63, 119, 117, 29, 82, 15, 17, 73, 118, 20, 124, 94, 6, 9, 66, 62, 67, 49, 79, 12, 28, 91, 70, 126, 33, 23, 56, 90, 59, 21, 120, 58, 87, 76, 31, 71, 7, 74, 8, 65, 37, 4, 0, 1, 38, 98, 97, 43, 127, 46, 95, 2, 68, 60, 64, 86, 3, 51, 116, 107, 52, 47, 101, 110, 123, 50, 35, 100, 102, 109, 40, 53, 34, 122, 103, 125, 114, 121, 106, 61, 39, 42, 113, 45, 111, 57, 55], [44, 36, 104, 54, 92, 100, 95, 48, 105, 108, 28, 47, 96, 53, 32, 98, 37, 26, 27, 123, 40, 24, 97, 60, 127, 51, 112, 91, 58, 55, 85, 88, 52, 78, 93, 31, 33, 38, 21, 90, 113, 107, 49, 99, 124, 10, 110, 120, 115, 18, 89, 63, 30, 45, 34, 83, 62, 82, 56, 111, 94, 29, 122, 102, 84, 106, 118, 74, 103, 116, 117, 50, 46, 43, 22, 81, 101, 121, 119, 61, 114, 17, 39, 23, 57, 109, 59, 16, 86, 79, 126, 14, 87, 35, 70, 42, 41, 80, 5, 125, 9, 15, 75, 2, 20, 3, 66, 64, 12, 7, 73, 1, 77, 19, 76, 6, 4, 68, 0, 13, 25, 11, 67, 69, 72, 71, 8, 65], [48, 54, 37, 105, 108, 104, 36, 107, 47, 115, 63, 119, 95, 50, 44, 56, 112, 114, 118, 94, 127, 30, 117, 62, 110, 52, 55, 123, 116, 121, 53, 38, 58, 51, 124, 40, 59, 111, 113, 92, 34, 101, 120, 60, 125, 32, 46, 122, 126, 109, 57, 61, 100, 49, 24, 102, 33, 45, 98, 106, 43, 103, 28, 21, 99, 85, 42, 39, 96, 31, 41, 15, 97, 25, 29, 87, 19, 18, 81, 11, 23, 88, 12, 90, 73, 77, 17, 84, 26, 75, 91, 8, 93, 79, 22, 78, 7, 20, 5, 82, 80, 4, 35, 2, 0, 1, 64, 76, 6, 3, 14, 86, 74, 72, 70, 83, 71, 9, 89, 69, 27, 10, 16, 65, 68, 66, 67, 13], [54, 105, 48, 36, 123, 41, 25, 44, 127, 98, 99, 112, 34, 96, 115, 50, 47, 52, 60, 121, 118, 57, 117, 122, 64, 107, 63, 114, 108, 74, 119, 45, 120, 56, 22, 30, 113, 55, 8, 126, 111, 51, 58, 53, 62, 106, 110, 116, 125, 1, 124, 104, 59, 37, 61, 94, 80, 100, 68, 43, 75, 97, 109, 84, 67, 49, 81, 14, 38, 77, 6, 46, 42, 73, 103, 40, 5, 39, 79, 2, 24, 18, 4, 16, 3, 92, 89, 71, 21, 69, 12, 102, 28, 29, 33, 15, 20, 101, 88, 26, 85, 86, 7, 17, 31, 27, 95, 82, 91, 35, 32, 87, 13, 93, 76, 70, 19, 23, 90, 9, 0, 83, 11, 65, 78, 10, 66, 72], [37, 114, 49, 50, 54, 113, 94, 91, 101, 28, 118, 84, 61, 119, 25, 62, 45, 58, 11, 32, 112, 56, 87, 60, 29, 22, 124, 15, 31, 14, 90, 79, 116, 96, 81, 48, 33, 97, 82, 55, 24, 47, 110, 83, 36, 35, 117, 46, 95, 93, 109, 53, 98, 51, 99, 27, 18, 126, 127, 34, 30, 39, 115, 107, 102, 38, 40, 43, 63, 100, 52, 104, 44, 21, 105, 57, 125, 122, 123, 103, 92, 121, 41, 59, 120, 42, 8, 16, 108, 89, 106, 20, 88, 85, 26, 111, 76, 23, 86, 9, 75, 78, 5, 13, 19, 12, 6, 17, 80, 4, 10, 73, 7, 77, 2, 74, 3, 1, 71, 66, 72, 0, 70, 68, 69, 64, 65, 67], [114, 54, 49, 113, 50, 124, 119, 118, 109, 62, 123, 61, 56, 48, 47, 55, 112, 53, 58, 36, 127, 46, 37, 120, 63, 45, 126, 52, 60, 107, 125, 44, 115, 51, 111, 121, 117, 110, 122, 116, 59, 43, 57, 108, 17, 42, 105, 39, 106, 41, 101, 104, 20, 102, 38, 40, 82, 80, 30, 96, 19, 75, 79, 103, 86, 77, 78, 76, 95, 85, 24, 93, 100, 27, 97, 98, 99, 35, 34, 33, 74, 18, 32, 89, 72, 28, 71, 9, 23, 29, 14, 90, 15, 31, 92, 94, 87, 12, 88, 69, 70, 66, 13, 64, 65, 91, 84, 21, 81, 67, 26, 73, 83, 11, 25, 68, 22, 16, 6, 10, 4, 5, 1, 3, 8, 7, 0, 2], [37, 49, 114, 50, 54, 113, 101, 94, 62, 25, 91, 11, 84, 83, 119, 87, 81, 118, 32, 79, 8, 56, 5, 15, 76, 9, 28, 22, 21, 45, 6, 60, 4, 58, 61, 7, 124, 13, 110, 16, 33, 75, 112, 85, 14, 10, 18, 30, 116, 3, 82, 90, 46, 24, 1, 73, 53, 23, 96, 77, 89, 109, 78, 117, 2, 63, 74, 125, 47, 12, 80, 52, 31, 126, 20, 17, 71, 48, 93, 57, 127, 66, 19, 115, 98, 105, 55, 27, 36, 44, 86, 29, 88, 40, 35, 95, 26, 43, 97, 122, 92, 72, 111, 68, 59, 38, 123, 41, 0, 51, 107, 34, 103, 39, 121, 42, 120, 102, 99, 106, 108, 100, 104, 70, 69, 65, 64, 67], [49, 37, 114, 54, 50, 113, 5, 9, 76, 6, 8, 7, 94, 4, 77, 79, 75, 12, 87, 16, 3, 25, 30, 22, 73, 11, 74, 83, 78, 80, 82, 17, 124, 2, 81, 71, 72, 91, 10, 45, 19, 56, 60, 62, 119, 61, 112, 1, 32, 118, 13, 15, 58, 84, 85, 63, 53, 26, 101, 21, 20, 117, 110, 109, 55, 23, 90, 93, 92, 66, 14, 89, 116, 31, 88, 46, 29, 48, 18, 107, 28, 24, 27, 125, 102, 127, 115, 57, 0, 126, 36, 52, 51, 47, 86, 123, 44, 70, 96, 121, 68, 43, 33, 42, 59, 39, 122, 104, 111, 120, 98, 67, 40, 69, 41, 100, 105, 35, 34, 38, 108, 97, 95, 103, 106, 65, 99, 64], [41, 101, 25, 23, 85, 110, 18, 105, 15, 34, 13, 20, 11, 91, 124, 30, 113, 9, 14, 83, 43, 8, 109, 24, 126, 94, 82, 68, 6, 51, 84, 48, 107, 118, 92, 29, 27, 59, 112, 111, 57, 4, 45, 77, 114, 44, 87, 88, 122, 89, 36, 53, 62, 17, 26, 98, 79, 31, 32, 16, 76, 80, 58, 117, 78, 103, 90, 96, 115, 66, 50, 75, 72, 21, 67, 38, 56, 2, 99, 55, 10, 47, 95, 37, 121, 0, 61, 52, 69, 5, 102, 120, 86, 108, 19, 28, 63, 60, 46, 22, 116, 106, 39, 123, 35, 127, 54, 65, 104, 71, 70, 1, 7, 97, 73, 119, 93, 12, 3, 33, 74, 40, 49, 42, 81, 100, 125, 64], [41, 101, 25, 85, 18, 23, 20, 105, 15, 11, 34, 13, 91, 124, 14, 9, 6, 51, 113, 2, 118, 8, 82, 112, 114, 29, 27, 53, 83, 98, 84, 68, 81, 109, 75, 48, 78, 32, 43, 107, 89, 30, 66, 87, 92, 111, 94, 126, 70, 36, 90, 45, 79, 110, 17, 44, 88, 122, 62, 57, 59, 117, 21, 7, 99, 103, 60, 127, 47, 76, 19, 38, 26, 42, 31, 24, 120, 52, 58, 16, 46, 69, 108, 77, 0, 35, 119, 72, 61, 115, 56, 80, 37, 22, 123, 102, 55, 63, 121, 10, 86, 95, 28, 67, 96, 97, 125, 100, 106, 39, 71, 54, 116, 50, 73, 93, 5, 74, 12, 40, 33, 65, 3, 49, 4, 1, 104, 64], [41, 101, 109, 110, 91, 25, 20, 18, 34, 85, 105, 23, 14, 124, 11, 15, 9, 13, 43, 53, 24, 113, 82, 84, 29, 8, 107, 45, 27, 126, 83, 6, 111, 90, 127, 59, 68, 98, 112, 51, 28, 44, 120, 78, 56, 47, 121, 118, 117, 114, 116, 62, 50, 106, 122, 55, 57, 92, 58, 95, 93, 94, 81, 36, 52, 97, 16, 87, 32, 30, 60, 102, 108, 42, 99, 38, 66, 75, 26, 40, 96, 119, 88, 63, 67, 33, 31, 89, 35, 37, 48, 80, 123, 115, 70, 61, 49, 46, 39, 104, 71, 125, 2, 22, 103, 76, 77, 86, 12, 54, 17, 79, 0, 4, 19, 21, 72, 7, 10, 5, 69, 100, 73, 65, 74, 1, 3, 64], [41, 101, 25, 85, 91, 105, 23, 15, 18, 20, 34, 13, 98, 113, 124, 81, 110, 29, 30, 11, 48, 95, 126, 43, 92, 114, 27, 9, 53, 14, 109, 84, 36, 44, 94, 24, 118, 45, 35, 26, 83, 100, 111, 89, 51, 107, 46, 57, 82, 120, 76, 123, 112, 61, 119, 38, 40, 103, 122, 121, 31, 56, 87, 116, 8, 62, 60, 50, 28, 106, 99, 47, 127, 125, 39, 80, 97, 117, 78, 42, 33, 6, 32, 58, 54, 59, 86, 102, 16, 88, 73, 21, 55, 63, 77, 22, 17, 96, 115, 52, 49, 79, 108, 4, 93, 12, 90, 104, 19, 68, 37, 66, 75, 10, 70, 74, 72, 5, 67, 7, 71, 1, 2, 69, 64, 3, 0, 65]], "model.layers.7.self_attn.k_proj": [[45, 107, 102, 0, 109, 60, 1, 12, 3, 77, 70, 19, 71, 43, 10, 8, 78, 115, 80, 29, 68, 22, 25, 5, 18, 17, 11, 125, 2, 73, 57, 126, 56, 48, 122, 111, 124, 79, 55, 110, 113, 63, 20, 66, 58, 67, 52, 47, 127, 62, 117, 49, 59, 114, 112, 96, 121, 51, 44, 54, 23, 118, 6, 119, 61, 92, 123, 53, 27, 95, 46, 116, 98, 50, 120, 106, 69, 89, 100, 64, 65, 21, 4, 108, 75, 9, 7, 41, 42, 105, 40, 39, 87, 30, 35, 24, 104, 94, 103, 36, 97, 31, 26, 101, 84, 37, 32, 33, 99, 90, 28, 85, 88, 81, 72, 93, 34, 15, 74, 76, 91, 82, 14, 16, 83, 13, 86, 38], [41, 118, 64, 111, 117, 33, 79, 82, 84, 1, 13, 23, 90, 93, 56, 71, 2, 11, 9, 4, 89, 85, 10, 105, 17, 65, 5, 37, 47, 121, 6, 30, 76, 67, 14, 116, 32, 80, 66, 72, 101, 22, 24, 115, 0, 48, 19, 28, 59, 29, 127, 44, 126, 49, 95, 110, 3, 26, 55, 112, 88, 83, 16, 38, 43, 35, 94, 52, 98, 27, 18, 46, 81, 40, 120, 107, 12, 100, 21, 102, 62, 124, 119, 61, 106, 113, 99, 69, 34, 53, 91, 123, 97, 103, 96, 42, 63, 57, 39, 51, 109, 114, 60, 36, 31, 58, 73, 104, 108, 54, 8, 122, 125, 68, 50, 25, 45, 78, 75, 92, 20, 77, 87, 86, 15, 74, 7, 70], [106, 39, 94, 98, 85, 23, 83, 127, 74, 12, 81, 78, 72, 16, 15, 50, 112, 65, 68, 42, 6, 41, 53, 69, 118, 2, 114, 49, 13, 0, 124, 88, 77, 18, 52, 111, 55, 8, 79, 38, 126, 125, 115, 34, 29, 36, 102, 109, 108, 120, 54, 7, 64, 107, 31, 40, 63, 110, 9, 47, 26, 101, 75, 60, 48, 3, 71, 61, 35, 122, 67, 117, 104, 25, 22, 58, 100, 43, 119, 62, 80, 113, 66, 56, 5, 11, 121, 14, 73, 105, 90, 57, 33, 123, 44, 92, 103, 46, 89, 24, 86, 116, 93, 37, 82, 27, 59, 99, 96, 19, 91, 45, 32, 95, 84, 28, 20, 97, 30, 76, 10, 17, 4, 51, 1, 87, 70, 21], [40, 121, 97, 90, 23, 21, 83, 36, 79, 48, 17, 82, 11, 13, 86, 8, 122, 73, 0, 30, 78, 50, 51, 107, 6, 109, 116, 69, 60, 59, 10, 65, 115, 58, 110, 101, 45, 29, 9, 4, 16, 68, 53, 123, 28, 84, 93, 66, 67, 39, 100, 126, 61, 37, 27, 42, 12, 89, 55, 124, 112, 3, 56, 96, 32, 47, 57, 70, 120, 125, 117, 103, 119, 2, 114, 95, 94, 35, 98, 14, 74, 54, 92, 106, 46, 43, 75, 88, 99, 41, 111, 105, 62, 52, 118, 49, 108, 34, 63, 38, 127, 91, 113, 44, 5, 31, 102, 15, 19, 76, 1, 71, 7, 18, 22, 24, 64, 25, 20, 72, 80, 77, 85, 26, 81, 87, 104, 33], [104, 41, 101, 29, 87, 55, 20, 63, 18, 16, 76, 10, 14, 60, 37, 57, 7, 0, 65, 72, 77, 67, 52, 69, 112, 58, 124, 113, 114, 71, 61, 66, 5, 86, 30, 17, 9, 126, 24, 15, 27, 75, 8, 93, 123, 4, 12, 102, 89, 62, 43, 64, 94, 21, 53, 2, 90, 70, 83, 13, 85, 39, 79, 19, 96, 88, 56, 100, 117, 3, 107, 116, 108, 45, 127, 82, 6, 91, 46, 111, 119, 47, 11, 121, 81, 23, 51, 54, 25, 49, 120, 73, 95, 32, 68, 42, 59, 26, 78, 34, 36, 28, 122, 80, 22, 31, 98, 92, 110, 84, 109, 103, 97, 106, 118, 44, 35, 99, 115, 74, 38, 48, 50, 125, 33, 40, 1, 105], [41, 54, 48, 35, 32, 22, 93, 16, 44, 127, 40, 10, 112, 30, 90, 89, 78, 84, 124, 83, 3, 70, 77, 27, 115, 123, 113, 51, 18, 117, 25, 52, 55, 101, 120, 111, 24, 7, 59, 121, 76, 119, 110, 63, 5, 122, 60, 50, 62, 126, 114, 58, 57, 43, 11, 118, 108, 31, 116, 81, 125, 1, 61, 56, 53, 109, 47, 19, 45, 100, 72, 79, 91, 46, 103, 102, 42, 39, 106, 9, 49, 2, 23, 107, 4, 80, 64, 73, 0, 13, 74, 66, 20, 12, 8, 33, 28, 21, 86, 104, 65, 38, 14, 26, 71, 87, 98, 34, 92, 15, 17, 85, 95, 68, 97, 29, 88, 36, 105, 96, 37, 94, 82, 75, 6, 99, 69, 67], [50, 101, 113, 49, 54, 94, 114, 22, 91, 87, 25, 21, 83, 81, 74, 84, 13, 18, 16, 86, 14, 72, 56, 71, 80, 110, 69, 77, 15, 32, 112, 58, 70, 60, 67, 66, 53, 117, 124, 26, 88, 92, 12, 45, 119, 68, 52, 55, 78, 62, 63, 125, 109, 9, 127, 73, 61, 64, 57, 4, 37, 51, 43, 126, 107, 11, 118, 115, 48, 19, 100, 123, 98, 116, 31, 47, 65, 121, 59, 75, 111, 120, 42, 29, 104, 102, 95, 6, 105, 44, 41, 103, 89, 76, 46, 40, 108, 122, 39, 1, 23, 97, 17, 38, 106, 3, 27, 10, 34, 35, 36, 99, 33, 85, 24, 30, 28, 96, 93, 79, 90, 20, 5, 8, 82, 7, 2, 0], [105, 37, 23, 18, 25, 15, 20, 11, 14, 85, 101, 13, 124, 91, 113, 34, 9, 46, 49, 8, 48, 126, 6, 1, 41, 118, 43, 68, 45, 115, 2, 111, 74, 110, 94, 16, 0, 63, 119, 114, 51, 83, 24, 62, 98, 64, 70, 59, 35, 53, 109, 92, 117, 58, 81, 102, 12, 65, 90, 55, 31, 10, 30, 57, 67, 3, 44, 29, 36, 61, 120, 7, 4, 32, 112, 103, 122, 108, 27, 116, 99, 96, 77, 50, 26, 86, 72, 107, 106, 121, 19, 95, 104, 93, 88, 56, 75, 71, 42, 125, 40, 54, 22, 38, 100, 123, 52, 97, 39, 5, 17, 79, 33, 69, 127, 47, 80, 28, 73, 60, 66, 21, 76, 78, 89, 84, 87, 82]], "model.layers.7.self_attn.qk_proj": [[121, 41, 54, 118, 105, 50, 37, 45, 106, 48, 101, 49, 109, 114, 107, 104, 113, 93, 43, 23, 87, 60, 30, 42, 89, 77, 40, 85, 25, 82, 83, 13, 14, 18, 21, 26, 78, 117, 19, 16, 112, 10, 111, 124, 79, 15, 20, 34, 63, 55, 81, 84, 74, 80, 103, 72, 17, 56, 119, 115, 94, 76, 29, 12, 39, 100, 86, 33, 57, 110, 75, 122, 126, 27, 47, 127, 52, 44, 125, 0, 11, 22, 53, 102, 90, 8, 58, 123, 32, 97, 64, 61, 7, 67, 69, 5, 62, 9, 59, 71, 51, 65, 73, 70, 98, 3, 4, 91, 120, 116, 38, 96, 6, 35, 24, 36, 46, 1, 88, 68, 31, 92, 108, 28, 2, 95, 99, 66], [121, 41, 118, 105, 54, 50, 37, 45, 106, 48, 49, 101, 109, 107, 114, 104, 43, 113, 93, 23, 30, 87, 40, 60, 42, 25, 21, 18, 85, 77, 19, 13, 14, 82, 89, 83, 26, 112, 111, 84, 103, 79, 78, 16, 10, 63, 80, 117, 15, 34, 55, 124, 94, 17, 20, 74, 12, 81, 72, 115, 119, 100, 56, 86, 39, 76, 122, 53, 29, 11, 125, 33, 47, 110, 127, 27, 61, 57, 22, 102, 75, 0, 52, 90, 126, 8, 97, 62, 64, 123, 32, 51, 44, 58, 98, 69, 7, 9, 4, 5, 6, 1, 65, 71, 67, 36, 91, 120, 35, 116, 73, 59, 68, 108, 70, 38, 3, 24, 96, 88, 2, 99, 28, 66, 46, 92, 31, 95], [121, 41, 118, 105, 54, 50, 37, 45, 106, 101, 48, 114, 109, 49, 107, 104, 113, 93, 23, 43, 30, 87, 42, 60, 40, 25, 89, 85, 21, 77, 26, 18, 19, 82, 13, 117, 55, 103, 14, 94, 83, 112, 84, 34, 111, 78, 81, 124, 20, 79, 100, 16, 10, 74, 15, 29, 80, 33, 39, 86, 63, 17, 110, 76, 97, 47, 12, 122, 90, 56, 53, 115, 72, 57, 8, 27, 119, 22, 127, 102, 52, 32, 75, 0, 64, 11, 58, 62, 69, 126, 125, 6, 98, 7, 5, 1, 44, 65, 61, 68, 123, 51, 71, 91, 3, 108, 9, 96, 4, 59, 36, 35, 67, 46, 116, 73, 24, 99, 38, 31, 88, 120, 70, 2, 66, 28, 95, 92], [121, 118, 41, 105, 54, 50, 45, 37, 106, 49, 48, 101, 114, 109, 107, 104, 93, 113, 43, 87, 23, 30, 89, 40, 42, 60, 25, 82, 26, 85, 117, 77, 112, 21, 18, 103, 55, 78, 127, 83, 13, 19, 79, 119, 20, 94, 84, 34, 124, 15, 63, 14, 111, 53, 100, 81, 10, 16, 47, 80, 17, 74, 29, 39, 27, 86, 12, 33, 102, 110, 64, 125, 11, 44, 115, 76, 126, 32, 52, 56, 58, 90, 97, 122, 22, 8, 72, 0, 51, 57, 69, 71, 62, 75, 5, 6, 67, 65, 116, 3, 73, 61, 91, 59, 7, 98, 120, 68, 1, 36, 35, 9, 123, 96, 4, 2, 46, 31, 38, 70, 88, 28, 66, 108, 92, 24, 95, 99], [121, 41, 118, 105, 54, 50, 45, 37, 106, 48, 49, 101, 114, 109, 107, 104, 113, 43, 93, 87, 23, 42, 40, 89, 30, 60, 77, 26, 18, 85, 82, 21, 117, 25, 55, 13, 34, 111, 112, 14, 103, 83, 78, 10, 84, 94, 16, 63, 47, 74, 15, 19, 20, 79, 124, 76, 81, 8, 80, 100, 39, 29, 17, 12, 119, 102, 115, 11, 33, 57, 56, 86, 64, 53, 0, 126, 110, 75, 32, 65, 22, 127, 90, 97, 122, 68, 69, 72, 62, 27, 58, 61, 44, 52, 5, 98, 4, 125, 7, 73, 3, 6, 9, 51, 67, 38, 91, 71, 1, 116, 59, 66, 24, 70, 123, 108, 35, 2, 36, 88, 120, 96, 46, 92, 28, 99, 31, 95], [121, 41, 118, 54, 105, 50, 45, 37, 106, 101, 48, 49, 114, 107, 109, 104, 43, 113, 93, 87, 23, 60, 42, 77, 85, 82, 89, 40, 30, 25, 18, 21, 26, 14, 78, 13, 19, 112, 79, 83, 111, 16, 103, 84, 117, 74, 20, 15, 63, 34, 17, 10, 11, 8, 81, 80, 124, 12, 76, 100, 115, 94, 47, 55, 86, 126, 53, 39, 125, 29, 57, 127, 110, 72, 61, 52, 75, 0, 119, 33, 102, 58, 62, 27, 22, 44, 64, 122, 32, 56, 7, 9, 120, 97, 3, 90, 91, 123, 68, 6, 73, 5, 69, 51, 4, 71, 70, 65, 59, 1, 67, 98, 24, 35, 66, 116, 36, 38, 96, 108, 2, 92, 88, 28, 46, 31, 99, 95], [121, 41, 54, 105, 118, 50, 37, 45, 101, 106, 49, 48, 114, 109, 107, 23, 104, 113, 93, 87, 43, 30, 82, 42, 60, 77, 18, 21, 25, 14, 89, 85, 13, 40, 78, 26, 16, 19, 15, 84, 117, 111, 83, 79, 20, 103, 63, 17, 34, 74, 8, 10, 81, 112, 80, 12, 76, 47, 94, 124, 29, 33, 110, 27, 11, 115, 86, 39, 57, 55, 52, 100, 22, 125, 126, 75, 119, 122, 127, 56, 58, 44, 90, 61, 91, 32, 102, 72, 53, 0, 59, 62, 97, 73, 69, 123, 98, 7, 96, 5, 64, 9, 71, 51, 38, 68, 6, 3, 116, 92, 35, 36, 24, 1, 88, 67, 70, 4, 120, 65, 46, 2, 31, 28, 108, 66, 95, 99], [121, 41, 105, 118, 54, 50, 45, 106, 37, 49, 101, 48, 109, 114, 107, 104, 113, 93, 43, 23, 42, 87, 82, 40, 18, 21, 30, 13, 85, 19, 77, 60, 89, 26, 14, 78, 25, 79, 16, 84, 83, 15, 103, 80, 17, 20, 74, 12, 81, 8, 76, 34, 117, 10, 124, 94, 111, 29, 86, 11, 63, 112, 55, 39, 119, 27, 75, 100, 57, 127, 33, 115, 22, 61, 44, 102, 0, 47, 64, 90, 1, 125, 52, 5, 91, 110, 72, 69, 58, 32, 123, 122, 70, 56, 7, 24, 53, 97, 65, 62, 126, 59, 71, 98, 73, 4, 88, 67, 9, 96, 38, 51, 36, 116, 68, 92, 3, 2, 6, 46, 120, 31, 35, 95, 66, 108, 28, 99], [121, 41, 118, 54, 105, 50, 45, 37, 106, 48, 101, 49, 109, 107, 114, 104, 43, 93, 113, 23, 87, 40, 42, 30, 89, 21, 19, 18, 82, 25, 85, 60, 26, 84, 13, 14, 111, 77, 16, 83, 94, 63, 20, 79, 103, 117, 15, 17, 78, 112, 55, 34, 80, 12, 74, 124, 81, 10, 57, 8, 29, 53, 76, 33, 127, 47, 110, 52, 27, 39, 86, 119, 11, 126, 90, 125, 58, 100, 22, 102, 97, 44, 51, 32, 62, 91, 115, 75, 56, 0, 64, 98, 7, 61, 123, 70, 122, 4, 72, 5, 36, 65, 71, 88, 116, 59, 69, 73, 68, 67, 38, 9, 24, 1, 96, 120, 35, 92, 3, 6, 108, 28, 66, 31, 95, 99, 46, 2], [121, 41, 118, 105, 54, 50, 37, 45, 106, 101, 49, 48, 109, 107, 114, 104, 113, 93, 43, 87, 23, 30, 60, 42, 25, 82, 21, 85, 40, 89, 13, 18, 111, 19, 16, 14, 77, 117, 83, 26, 84, 15, 20, 103, 79, 78, 34, 112, 17, 55, 80, 10, 81, 94, 74, 76, 8, 12, 115, 124, 102, 119, 11, 29, 126, 63, 27, 39, 33, 100, 86, 125, 47, 127, 57, 44, 56, 53, 110, 90, 122, 61, 51, 32, 64, 58, 91, 22, 0, 97, 75, 69, 72, 70, 62, 52, 71, 123, 73, 3, 98, 5, 65, 9, 116, 36, 67, 1, 7, 4, 92, 88, 68, 96, 59, 6, 120, 28, 24, 35, 66, 38, 46, 31, 99, 108, 95, 2], [121, 41, 118, 54, 105, 50, 37, 45, 106, 48, 49, 101, 114, 109, 107, 104, 113, 43, 93, 23, 42, 87, 30, 40, 13, 82, 25, 85, 89, 60, 77, 21, 18, 111, 55, 83, 117, 15, 16, 26, 79, 14, 63, 20, 78, 34, 119, 112, 19, 10, 103, 84, 74, 115, 47, 94, 80, 53, 127, 81, 17, 12, 52, 8, 39, 29, 110, 76, 86, 11, 33, 100, 72, 64, 102, 126, 122, 57, 124, 56, 44, 69, 62, 61, 51, 75, 125, 0, 97, 32, 1, 7, 58, 90, 5, 22, 123, 27, 70, 71, 9, 65, 73, 91, 67, 6, 98, 4, 68, 3, 38, 35, 36, 120, 96, 59, 116, 2, 66, 28, 92, 24, 31, 46, 108, 88, 95, 99], [121, 41, 118, 54, 105, 50, 45, 37, 106, 48, 49, 114, 101, 107, 109, 104, 113, 93, 87, 43, 23, 42, 30, 82, 85, 25, 40, 21, 18, 89, 13, 77, 60, 83, 26, 14, 15, 79, 111, 19, 84, 103, 16, 34, 20, 112, 78, 117, 81, 80, 10, 74, 94, 63, 124, 55, 76, 17, 86, 47, 39, 127, 12, 8, 53, 119, 29, 72, 33, 115, 100, 52, 27, 11, 110, 64, 56, 69, 90, 102, 22, 126, 75, 62, 44, 123, 91, 73, 32, 122, 67, 71, 57, 97, 125, 5, 61, 0, 4, 70, 68, 3, 9, 51, 7, 58, 1, 6, 98, 120, 65, 96, 38, 36, 66, 24, 116, 2, 59, 108, 88, 46, 35, 92, 28, 31, 99, 95], [121, 41, 118, 54, 105, 50, 45, 37, 101, 106, 48, 49, 109, 114, 107, 104, 113, 23, 93, 43, 87, 60, 30, 18, 42, 40, 25, 89, 21, 82, 13, 85, 19, 16, 26, 14, 83, 77, 78, 124, 84, 79, 103, 117, 111, 94, 15, 34, 81, 80, 20, 72, 74, 17, 112, 63, 10, 12, 115, 76, 29, 55, 119, 86, 100, 33, 27, 47, 90, 11, 126, 125, 39, 127, 110, 53, 75, 44, 102, 8, 56, 57, 52, 62, 97, 91, 22, 32, 61, 58, 122, 59, 64, 6, 4, 123, 73, 69, 0, 71, 68, 7, 65, 9, 3, 24, 36, 5, 51, 96, 98, 116, 70, 67, 38, 46, 92, 88, 120, 35, 31, 108, 1, 2, 66, 28, 95, 99], [121, 41, 118, 105, 54, 50, 37, 45, 106, 101, 49, 48, 109, 114, 107, 104, 113, 93, 43, 23, 60, 13, 40, 42, 87, 30, 18, 85, 25, 82, 89, 103, 21, 83, 26, 72, 78, 16, 14, 17, 117, 79, 19, 20, 15, 34, 77, 63, 74, 84, 81, 111, 124, 55, 10, 112, 80, 76, 12, 119, 47, 126, 29, 11, 75, 86, 0, 64, 27, 94, 115, 100, 127, 8, 125, 44, 39, 90, 61, 102, 33, 110, 22, 53, 6, 57, 52, 56, 69, 5, 58, 7, 65, 51, 71, 1, 9, 32, 68, 62, 97, 73, 116, 122, 91, 4, 3, 59, 96, 67, 70, 98, 36, 123, 88, 35, 24, 38, 66, 92, 31, 120, 46, 2, 99, 28, 95, 108], [121, 41, 118, 105, 54, 50, 45, 37, 49, 106, 101, 48, 109, 114, 107, 104, 43, 113, 93, 23, 87, 30, 42, 60, 25, 40, 89, 18, 21, 82, 85, 26, 13, 83, 111, 15, 16, 78, 72, 79, 19, 20, 112, 117, 77, 84, 63, 14, 124, 74, 103, 17, 34, 10, 81, 80, 55, 29, 47, 86, 94, 115, 12, 119, 76, 39, 127, 27, 57, 90, 33, 11, 110, 100, 53, 75, 126, 52, 61, 32, 8, 102, 62, 22, 69, 58, 6, 56, 0, 125, 44, 91, 64, 71, 73, 5, 9, 7, 122, 123, 51, 65, 4, 3, 1, 67, 36, 88, 97, 70, 68, 96, 24, 38, 120, 46, 116, 59, 92, 28, 98, 35, 2, 31, 66, 108, 99, 95], [121, 41, 118, 54, 105, 50, 45, 37, 106, 49, 48, 101, 114, 107, 109, 104, 113, 93, 43, 23, 30, 60, 42, 25, 87, 85, 40, 89, 82, 18, 13, 26, 83, 21, 112, 77, 14, 111, 117, 16, 94, 78, 19, 15, 79, 34, 20, 84, 63, 74, 72, 29, 124, 55, 80, 115, 10, 17, 53, 57, 126, 47, 86, 76, 39, 103, 81, 33, 100, 119, 110, 62, 125, 127, 12, 56, 11, 90, 22, 27, 52, 6, 64, 75, 102, 61, 32, 8, 122, 44, 97, 91, 7, 51, 123, 58, 9, 73, 69, 0, 38, 68, 4, 71, 5, 3, 36, 59, 67, 46, 1, 120, 116, 98, 96, 88, 24, 70, 35, 65, 92, 108, 66, 28, 31, 99, 2, 95], [121, 41, 118, 54, 105, 50, 37, 45, 106, 101, 49, 114, 109, 48, 107, 104, 113, 23, 93, 87, 60, 43, 42, 30, 25, 89, 18, 82, 85, 13, 83, 40, 21, 26, 16, 117, 19, 78, 14, 77, 84, 81, 79, 34, 15, 74, 112, 72, 20, 124, 80, 10, 111, 94, 17, 100, 12, 115, 39, 55, 76, 119, 103, 63, 110, 33, 11, 29, 90, 57, 86, 47, 127, 123, 27, 53, 102, 58, 22, 52, 56, 8, 125, 44, 32, 75, 122, 0, 91, 126, 9, 7, 71, 97, 62, 61, 5, 69, 68, 51, 6, 98, 64, 120, 1, 70, 36, 59, 4, 108, 38, 116, 73, 35, 65, 88, 24, 46, 96, 92, 67, 2, 31, 3, 99, 95, 28, 66], [121, 41, 118, 54, 105, 50, 37, 45, 106, 101, 49, 109, 48, 114, 107, 104, 113, 23, 93, 43, 42, 87, 30, 60, 85, 83, 40, 25, 13, 89, 82, 18, 21, 26, 19, 111, 78, 77, 16, 117, 84, 14, 79, 15, 20, 103, 80, 72, 34, 124, 10, 112, 17, 81, 29, 74, 94, 55, 76, 119, 12, 33, 63, 57, 75, 90, 8, 127, 100, 47, 110, 126, 11, 39, 86, 22, 115, 61, 52, 32, 27, 53, 62, 102, 125, 56, 64, 58, 7, 51, 0, 91, 122, 44, 71, 5, 69, 70, 97, 9, 59, 1, 67, 116, 73, 98, 6, 123, 36, 38, 3, 4, 46, 65, 120, 96, 24, 68, 99, 28, 92, 31, 108, 35, 88, 95, 2, 66], [121, 41, 118, 105, 54, 50, 37, 45, 106, 49, 101, 109, 48, 114, 107, 104, 43, 113, 23, 30, 87, 93, 89, 42, 60, 40, 25, 18, 117, 85, 13, 63, 82, 21, 83, 112, 26, 94, 16, 111, 77, 78, 119, 103, 20, 34, 55, 15, 84, 19, 80, 79, 124, 14, 33, 57, 74, 81, 29, 102, 10, 17, 52, 39, 72, 127, 90, 8, 12, 115, 53, 100, 97, 47, 76, 86, 110, 126, 32, 27, 44, 11, 64, 58, 51, 0, 62, 91, 56, 75, 22, 71, 69, 61, 7, 122, 70, 98, 5, 1, 67, 4, 3, 68, 35, 9, 36, 65, 125, 123, 59, 120, 73, 92, 24, 95, 38, 31, 6, 116, 99, 46, 28, 108, 88, 96, 2, 66], [121, 41, 118, 54, 105, 50, 37, 45, 106, 101, 49, 48, 109, 107, 114, 113, 104, 93, 23, 43, 30, 25, 87, 60, 42, 18, 77, 40, 26, 21, 89, 13, 85, 83, 19, 82, 117, 80, 111, 112, 15, 84, 14, 124, 10, 78, 34, 79, 16, 103, 74, 63, 76, 94, 55, 100, 72, 81, 17, 8, 33, 115, 39, 29, 12, 20, 126, 86, 27, 102, 11, 110, 62, 53, 75, 57, 56, 44, 97, 61, 127, 22, 70, 119, 47, 52, 71, 51, 58, 90, 69, 0, 122, 64, 5, 125, 123, 91, 32, 73, 7, 4, 9, 65, 1, 36, 98, 35, 120, 68, 59, 24, 46, 96, 38, 3, 116, 6, 67, 31, 88, 92, 108, 99, 2, 28, 95, 66], [121, 41, 118, 54, 105, 50, 45, 37, 106, 49, 101, 48, 109, 114, 107, 104, 113, 93, 43, 42, 23, 30, 60, 87, 89, 40, 25, 18, 13, 77, 85, 82, 21, 26, 103, 117, 14, 111, 74, 83, 19, 63, 79, 112, 119, 16, 34, 15, 81, 78, 84, 80, 94, 29, 33, 55, 124, 20, 10, 57, 17, 8, 76, 62, 110, 86, 127, 39, 53, 47, 12, 11, 72, 75, 126, 100, 22, 90, 102, 58, 115, 56, 9, 71, 123, 70, 27, 44, 52, 69, 5, 97, 125, 32, 51, 64, 120, 91, 7, 122, 3, 0, 61, 4, 59, 38, 73, 67, 68, 98, 46, 96, 36, 1, 6, 88, 65, 99, 35, 31, 116, 108, 28, 2, 92, 66, 24, 95], [121, 41, 118, 105, 54, 50, 37, 45, 106, 49, 48, 101, 109, 114, 107, 104, 113, 87, 93, 23, 43, 42, 30, 60, 89, 40, 25, 82, 85, 26, 13, 18, 21, 77, 19, 117, 83, 14, 79, 103, 124, 34, 16, 112, 78, 111, 63, 15, 74, 84, 119, 29, 94, 33, 55, 20, 17, 110, 80, 81, 10, 76, 86, 8, 127, 12, 100, 39, 90, 115, 57, 75, 102, 47, 126, 52, 32, 53, 91, 62, 27, 56, 22, 44, 58, 11, 64, 97, 72, 61, 51, 123, 0, 71, 5, 59, 125, 69, 9, 70, 1, 98, 122, 116, 73, 38, 7, 88, 120, 46, 96, 67, 36, 4, 3, 35, 65, 68, 28, 92, 31, 99, 6, 24, 95, 2, 108, 66], [121, 41, 118, 54, 105, 50, 37, 45, 101, 49, 48, 106, 114, 109, 107, 113, 104, 43, 93, 23, 42, 30, 87, 60, 25, 40, 89, 18, 82, 13, 26, 85, 117, 77, 21, 78, 34, 94, 19, 124, 103, 83, 14, 55, 84, 15, 79, 16, 74, 111, 80, 20, 17, 8, 81, 29, 10, 112, 119, 76, 47, 115, 86, 90, 39, 33, 11, 12, 56, 63, 126, 100, 102, 75, 44, 127, 22, 110, 53, 57, 0, 32, 91, 61, 27, 58, 97, 52, 72, 64, 73, 4, 62, 51, 69, 5, 1, 122, 125, 9, 123, 68, 38, 71, 6, 59, 7, 35, 65, 24, 96, 98, 46, 70, 88, 36, 3, 67, 116, 31, 92, 66, 28, 120, 95, 99, 2, 108], [121, 41, 118, 105, 54, 50, 37, 45, 106, 101, 48, 49, 109, 114, 107, 104, 113, 43, 93, 60, 23, 42, 30, 25, 87, 40, 85, 13, 117, 21, 89, 82, 18, 26, 19, 77, 78, 103, 111, 74, 124, 55, 94, 16, 79, 8, 14, 63, 84, 83, 112, 34, 20, 127, 17, 10, 15, 119, 80, 29, 33, 126, 47, 12, 56, 53, 81, 100, 86, 39, 75, 115, 110, 76, 62, 102, 11, 51, 90, 27, 97, 125, 61, 22, 44, 58, 32, 72, 7, 57, 64, 52, 73, 6, 122, 69, 9, 4, 123, 5, 91, 68, 71, 0, 96, 3, 98, 35, 116, 38, 120, 36, 67, 59, 65, 46, 108, 1, 88, 70, 28, 31, 66, 99, 24, 95, 92, 2], [121, 41, 118, 105, 54, 50, 37, 45, 106, 49, 101, 48, 109, 114, 104, 107, 113, 43, 93, 23, 60, 30, 42, 87, 25, 13, 40, 82, 89, 21, 85, 18, 78, 19, 117, 26, 34, 77, 74, 15, 79, 124, 111, 83, 112, 14, 16, 20, 80, 84, 103, 10, 63, 17, 8, 39, 94, 115, 33, 100, 55, 29, 12, 76, 81, 110, 11, 86, 57, 56, 126, 22, 44, 90, 127, 53, 75, 119, 27, 52, 47, 58, 72, 102, 62, 123, 122, 32, 64, 97, 125, 73, 6, 71, 38, 5, 91, 69, 61, 0, 36, 9, 59, 120, 7, 51, 4, 67, 31, 1, 98, 68, 3, 88, 116, 70, 65, 46, 96, 35, 28, 108, 99, 92, 24, 2, 95, 66], [121, 41, 118, 105, 54, 50, 45, 37, 106, 101, 49, 48, 109, 114, 107, 104, 113, 43, 93, 23, 60, 42, 87, 40, 25, 82, 13, 85, 30, 89, 18, 21, 26, 77, 117, 14, 19, 74, 83, 34, 15, 78, 79, 63, 112, 80, 84, 17, 103, 8, 111, 94, 10, 12, 20, 16, 124, 76, 29, 110, 55, 39, 100, 81, 33, 86, 119, 127, 115, 52, 75, 47, 53, 56, 27, 57, 11, 126, 102, 22, 72, 90, 97, 125, 58, 0, 122, 6, 44, 64, 7, 123, 32, 71, 51, 62, 69, 91, 59, 73, 9, 5, 61, 98, 1, 120, 65, 96, 116, 88, 68, 4, 3, 38, 24, 92, 36, 67, 46, 35, 108, 28, 31, 70, 2, 99, 95, 66], [121, 41, 118, 105, 54, 50, 45, 37, 106, 101, 48, 49, 114, 109, 107, 104, 93, 113, 43, 23, 42, 30, 87, 60, 25, 40, 85, 82, 26, 89, 21, 77, 13, 83, 18, 103, 14, 19, 111, 20, 34, 17, 117, 55, 84, 124, 16, 78, 79, 94, 112, 29, 15, 63, 127, 86, 80, 76, 126, 119, 10, 74, 110, 33, 39, 53, 58, 123, 56, 27, 90, 8, 122, 12, 115, 81, 47, 57, 11, 52, 22, 102, 44, 72, 97, 0, 100, 32, 125, 62, 38, 61, 75, 9, 51, 64, 59, 7, 4, 116, 98, 5, 69, 6, 36, 68, 67, 3, 96, 65, 91, 24, 71, 1, 120, 73, 70, 92, 35, 88, 31, 28, 46, 95, 66, 99, 2, 108], [121, 41, 118, 105, 54, 50, 45, 37, 106, 48, 49, 101, 109, 107, 114, 104, 113, 43, 93, 42, 87, 23, 60, 30, 40, 13, 82, 77, 85, 21, 25, 18, 19, 89, 78, 14, 34, 15, 117, 83, 74, 26, 103, 111, 112, 10, 20, 72, 124, 16, 79, 80, 17, 8, 12, 81, 84, 76, 47, 110, 64, 75, 56, 127, 115, 100, 55, 86, 29, 63, 94, 119, 11, 126, 39, 0, 53, 22, 122, 52, 102, 7, 61, 5, 57, 33, 58, 90, 68, 69, 9, 1, 62, 27, 97, 44, 65, 32, 73, 123, 59, 125, 98, 3, 71, 91, 4, 6, 67, 70, 51, 116, 66, 38, 36, 120, 88, 96, 35, 2, 24, 46, 108, 31, 92, 28, 99, 95], [41, 121, 118, 105, 54, 50, 45, 37, 106, 49, 48, 101, 109, 107, 114, 104, 113, 43, 93, 42, 23, 30, 87, 60, 40, 89, 25, 85, 55, 19, 77, 13, 112, 18, 21, 34, 117, 26, 103, 82, 78, 83, 111, 124, 94, 63, 15, 20, 10, 84, 16, 14, 74, 115, 79, 80, 29, 56, 100, 72, 39, 76, 33, 17, 81, 127, 47, 110, 53, 8, 126, 12, 102, 52, 86, 75, 119, 0, 11, 90, 123, 22, 57, 7, 27, 32, 44, 58, 97, 5, 125, 122, 62, 51, 70, 64, 61, 9, 91, 71, 69, 68, 38, 120, 65, 1, 67, 4, 96, 73, 92, 6, 59, 88, 98, 36, 116, 35, 3, 28, 108, 24, 66, 2, 46, 31, 95, 99], [121, 41, 118, 54, 105, 50, 45, 37, 106, 48, 49, 101, 114, 107, 109, 104, 113, 43, 93, 23, 42, 60, 87, 40, 25, 30, 89, 18, 77, 85, 21, 13, 82, 14, 78, 19, 83, 63, 26, 111, 117, 15, 79, 34, 10, 112, 115, 74, 103, 20, 119, 80, 72, 124, 16, 84, 17, 81, 11, 39, 94, 55, 12, 86, 76, 56, 100, 47, 126, 33, 110, 29, 102, 127, 44, 8, 0, 57, 75, 27, 122, 52, 7, 62, 64, 22, 90, 123, 53, 61, 73, 125, 70, 3, 68, 5, 67, 97, 69, 65, 51, 71, 9, 58, 32, 1, 91, 120, 4, 59, 98, 96, 2, 6, 36, 108, 35, 38, 88, 46, 92, 28, 99, 24, 116, 66, 95, 31], [41, 121, 118, 105, 54, 50, 45, 106, 37, 101, 48, 49, 114, 109, 107, 104, 113, 43, 60, 23, 87, 93, 42, 40, 30, 89, 25, 18, 13, 85, 21, 77, 82, 78, 26, 34, 19, 83, 14, 10, 80, 111, 119, 63, 79, 72, 15, 112, 103, 117, 55, 74, 20, 124, 16, 100, 115, 17, 84, 81, 29, 94, 12, 47, 76, 11, 52, 39, 102, 110, 57, 53, 75, 33, 22, 27, 62, 44, 86, 126, 0, 123, 5, 127, 73, 90, 122, 8, 7, 125, 59, 97, 64, 70, 4, 32, 61, 38, 56, 116, 58, 51, 69, 68, 65, 3, 91, 9, 71, 1, 88, 67, 98, 96, 120, 35, 6, 36, 2, 92, 99, 46, 28, 31, 108, 95, 66, 24], [121, 41, 118, 105, 54, 50, 45, 106, 37, 49, 48, 101, 109, 107, 104, 114, 113, 43, 93, 23, 60, 42, 87, 40, 13, 30, 25, 89, 85, 18, 19, 77, 82, 21, 14, 26, 111, 78, 72, 80, 10, 83, 34, 79, 103, 15, 63, 16, 117, 20, 74, 119, 124, 112, 84, 81, 12, 47, 11, 17, 76, 55, 127, 94, 39, 27, 75, 100, 126, 29, 52, 115, 110, 33, 86, 44, 53, 56, 22, 102, 58, 61, 125, 90, 70, 57, 51, 8, 62, 123, 32, 7, 122, 0, 69, 9, 73, 64, 59, 5, 71, 91, 4, 1, 97, 67, 3, 65, 68, 120, 98, 24, 6, 96, 116, 38, 35, 92, 36, 88, 46, 2, 31, 99, 66, 28, 95, 108]], "model.layers.8.self_attn.q_proj": [[60, 102, 116, 51, 123, 117, 91, 34, 52, 33, 118, 124, 115, 62, 119, 18, 126, 63, 48, 97, 58, 57, 50, 79, 54, 55, 53, 56, 61, 85, 49, 121, 59, 125, 94, 111, 122, 35, 101, 127, 45, 112, 120, 77, 44, 105, 43, 113, 89, 114, 73, 47, 26, 110, 46, 42, 13, 90, 21, 19, 109, 24, 28, 108, 107, 22, 12, 82, 14, 83, 96, 70, 37, 20, 87, 106, 39, 16, 40, 81, 72, 74, 84, 41, 103, 80, 7, 99, 30, 27, 32, 88, 31, 11, 15, 25, 36, 17, 95, 29, 104, 93, 23, 86, 92, 10, 100, 98, 76, 1, 4, 5, 3, 9, 75, 78, 38, 69, 64, 71, 67, 68, 66, 8, 6, 2, 65, 0], [117, 60, 51, 123, 116, 52, 122, 63, 126, 53, 55, 49, 48, 125, 57, 121, 120, 56, 110, 113, 111, 61, 47, 112, 114, 127, 54, 118, 119, 45, 46, 124, 62, 115, 109, 50, 58, 59, 44, 108, 43, 105, 107, 36, 42, 106, 41, 104, 40, 101, 39, 103, 102, 37, 35, 100, 24, 86, 98, 38, 95, 34, 99, 33, 87, 22, 96, 90, 66, 32, 0, 94, 31, 81, 5, 30, 97, 8, 84, 65, 19, 27, 93, 23, 85, 69, 29, 82, 75, 72, 91, 28, 21, 17, 15, 20, 89, 67, 88, 78, 92, 83, 2, 76, 13, 14, 64, 18, 25, 79, 80, 6, 70, 9, 26, 73, 68, 12, 77, 11, 1, 4, 74, 3, 16, 7, 71, 10], [102, 116, 60, 51, 33, 123, 117, 91, 52, 115, 85, 84, 15, 124, 70, 118, 79, 18, 7, 62, 34, 73, 12, 38, 119, 27, 26, 74, 89, 88, 11, 87, 59, 24, 94, 90, 58, 50, 20, 21, 93, 69, 30, 22, 3, 57, 54, 72, 126, 77, 4, 14, 82, 121, 35, 9, 98, 81, 61, 42, 37, 1, 13, 63, 56, 48, 92, 53, 10, 32, 23, 76, 25, 2, 28, 16, 64, 68, 55, 80, 31, 41, 105, 5, 17, 44, 95, 36, 101, 75, 49, 39, 67, 71, 103, 108, 66, 43, 29, 99, 120, 106, 96, 19, 127, 107, 46, 45, 125, 83, 78, 114, 97, 113, 40, 111, 100, 47, 104, 109, 6, 110, 8, 112, 86, 122, 65, 0], [116, 102, 60, 51, 123, 117, 115, 52, 62, 124, 119, 118, 34, 38, 39, 19, 58, 79, 57, 126, 85, 54, 50, 37, 43, 61, 48, 89, 53, 56, 121, 55, 94, 44, 104, 46, 42, 31, 107, 73, 108, 110, 125, 41, 40, 70, 72, 33, 111, 95, 36, 106, 109, 87, 63, 45, 49, 101, 59, 127, 103, 23, 105, 113, 120, 47, 32, 35, 28, 90, 114, 91, 13, 122, 96, 21, 112, 88, 100, 25, 11, 99, 74, 30, 27, 5, 93, 98, 92, 77, 29, 26, 1, 12, 17, 7, 24, 20, 14, 66, 80, 64, 10, 4, 3, 15, 76, 82, 16, 67, 78, 97, 18, 68, 22, 75, 71, 83, 69, 81, 84, 65, 2, 9, 86, 8, 0, 6], [38, 34, 119, 60, 18, 92, 78, 77, 9, 24, 88, 72, 85, 8, 75, 124, 68, 3, 21, 55, 59, 70, 80, 123, 69, 115, 28, 48, 71, 19, 22, 29, 37, 44, 47, 26, 61, 58, 117, 76, 4, 35, 73, 10, 95, 83, 125, 64, 0, 118, 14, 16, 79, 66, 12, 2, 62, 13, 31, 87, 93, 82, 46, 65, 57, 104, 49, 89, 42, 98, 103, 27, 15, 81, 84, 56, 63, 107, 67, 111, 30, 94, 110, 36, 6, 100, 7, 50, 25, 116, 17, 106, 105, 32, 101, 39, 53, 23, 120, 74, 86, 97, 113, 112, 43, 40, 20, 99, 41, 96, 126, 90, 1, 52, 11, 33, 121, 54, 108, 127, 114, 91, 5, 122, 45, 109, 51, 102], [38, 34, 59, 60, 119, 29, 18, 92, 88, 24, 115, 47, 77, 79, 9, 78, 68, 85, 1, 80, 55, 71, 124, 75, 22, 83, 70, 61, 65, 110, 44, 72, 64, 117, 35, 37, 69, 19, 123, 76, 125, 98, 3, 12, 58, 5, 16, 8, 73, 52, 21, 28, 95, 13, 7, 10, 100, 103, 27, 67, 0, 6, 48, 66, 93, 86, 106, 118, 26, 17, 4, 14, 112, 30, 20, 90, 46, 81, 63, 91, 32, 82, 107, 15, 2, 104, 102, 23, 94, 42, 74, 36, 11, 25, 99, 89, 39, 45, 54, 62, 49, 116, 120, 97, 50, 41, 96, 126, 31, 87, 113, 33, 43, 56, 127, 114, 122, 53, 105, 40, 84, 121, 101, 111, 51, 109, 108, 57], [38, 34, 60, 119, 29, 18, 85, 77, 88, 59, 78, 24, 37, 80, 9, 92, 124, 75, 71, 55, 69, 44, 21, 47, 8, 125, 3, 61, 93, 72, 16, 68, 19, 117, 35, 118, 79, 123, 22, 81, 95, 106, 11, 76, 115, 13, 32, 48, 58, 31, 10, 1, 83, 120, 65, 27, 63, 98, 122, 7, 116, 52, 84, 97, 89, 14, 26, 82, 99, 112, 107, 36, 94, 66, 64, 50, 42, 25, 30, 53, 28, 33, 100, 70, 40, 103, 87, 126, 110, 62, 51, 96, 104, 12, 41, 101, 39, 20, 54, 91, 86, 43, 49, 109, 45, 127, 23, 46, 114, 111, 57, 56, 108, 74, 113, 6, 17, 90, 15, 67, 121, 4, 0, 105, 73, 5, 2, 102], [38, 60, 119, 34, 29, 28, 88, 115, 85, 102, 93, 59, 19, 125, 80, 95, 27, 78, 92, 47, 87, 52, 98, 55, 18, 33, 44, 37, 35, 123, 118, 117, 24, 26, 83, 110, 22, 62, 58, 54, 46, 12, 89, 25, 61, 42, 48, 104, 103, 84, 30, 113, 40, 45, 97, 100, 101, 106, 91, 109, 31, 112, 116, 10, 81, 94, 114, 63, 21, 99, 107, 51, 49, 50, 105, 126, 96, 16, 111, 108, 127, 121, 124, 32, 57, 122, 43, 36, 56, 17, 120, 53, 41, 20, 86, 79, 39, 76, 23, 90, 8, 14, 74, 77, 71, 15, 75, 72, 11, 13, 82, 9, 7, 66, 2, 67, 70, 5, 69, 3, 4, 6, 73, 0, 68, 1, 64, 65], [59, 60, 116, 80, 55, 50, 73, 72, 117, 12, 62, 77, 10, 84, 120, 26, 54, 125, 103, 69, 19, 13, 24, 127, 68, 78, 15, 75, 93, 7, 21, 82, 17, 51, 90, 124, 6, 57, 49, 113, 3, 98, 56, 122, 121, 1, 66, 114, 16, 18, 112, 46, 58, 76, 70, 119, 36, 126, 123, 52, 28, 34, 118, 53, 88, 110, 61, 63, 0, 47, 29, 71, 109, 43, 27, 115, 96, 32, 104, 44, 107, 45, 108, 100, 92, 48, 102, 38, 42, 97, 106, 94, 105, 79, 111, 41, 101, 87, 89, 40, 37, 31, 33, 99, 95, 11, 91, 85, 35, 30, 86, 74, 9, 20, 81, 65, 83, 39, 23, 14, 4, 25, 8, 2, 22, 64, 67, 5], [59, 116, 60, 103, 55, 73, 117, 15, 87, 62, 80, 120, 125, 82, 54, 84, 10, 90, 127, 72, 13, 19, 69, 68, 76, 7, 6, 3, 78, 77, 51, 50, 124, 57, 66, 0, 75, 17, 113, 1, 49, 12, 21, 37, 56, 70, 121, 122, 11, 114, 119, 46, 123, 52, 36, 94, 74, 93, 32, 38, 88, 110, 58, 61, 118, 39, 43, 31, 112, 126, 28, 96, 47, 26, 104, 99, 53, 71, 42, 89, 65, 106, 101, 107, 115, 40, 100, 109, 111, 86, 83, 97, 102, 63, 108, 44, 105, 41, 81, 24, 33, 35, 30, 45, 95, 48, 8, 22, 92, 4, 98, 27, 20, 5, 16, 18, 79, 25, 91, 14, 34, 2, 29, 64, 23, 9, 85, 67], [103, 59, 116, 60, 20, 26, 16, 96, 84, 98, 24, 29, 89, 114, 93, 14, 10, 55, 125, 94, 91, 99, 88, 62, 19, 117, 70, 1, 87, 33, 18, 50, 31, 28, 80, 90, 68, 17, 57, 92, 52, 127, 69, 12, 95, 3, 25, 54, 27, 49, 23, 124, 72, 97, 120, 123, 66, 30, 74, 110, 71, 35, 22, 100, 36, 38, 32, 34, 113, 21, 0, 67, 64, 104, 102, 53, 82, 39, 78, 37, 76, 101, 40, 42, 51, 6, 119, 7, 108, 4, 122, 105, 83, 43, 81, 58, 75, 41, 118, 106, 44, 109, 48, 107, 45, 121, 47, 46, 9, 112, 56, 63, 73, 61, 111, 126, 115, 15, 5, 79, 85, 11, 8, 13, 65, 86, 77, 2], [59, 50, 60, 116, 19, 21, 15, 82, 80, 89, 76, 84, 13, 73, 88, 78, 87, 120, 62, 117, 17, 125, 54, 55, 7, 51, 90, 72, 127, 77, 6, 10, 3, 124, 61, 56, 121, 122, 58, 0, 63, 123, 112, 126, 118, 119, 11, 69, 57, 113, 53, 115, 47, 68, 75, 49, 66, 48, 111, 74, 46, 92, 86, 52, 110, 45, 109, 114, 108, 44, 1, 107, 93, 43, 106, 42, 105, 8, 41, 23, 104, 40, 27, 29, 65, 12, 100, 81, 103, 4, 28, 30, 38, 37, 102, 36, 39, 5, 70, 99, 20, 101, 14, 2, 83, 32, 35, 98, 97, 96, 22, 33, 95, 31, 67, 64, 26, 94, 71, 34, 24, 25, 85, 91, 16, 79, 9, 18], [104, 118, 34, 95, 50, 25, 70, 73, 51, 56, 45, 23, 55, 114, 49, 62, 5, 84, 3, 7, 40, 109, 117, 18, 121, 52, 74, 63, 54, 110, 124, 82, 58, 68, 14, 86, 126, 76, 15, 60, 8, 119, 112, 65, 17, 31, 123, 53, 80, 108, 97, 59, 127, 16, 27, 71, 125, 20, 35, 115, 46, 107, 89, 47, 32, 93, 120, 77, 44, 39, 111, 42, 29, 101, 61, 48, 90, 38, 41, 2, 113, 122, 100, 88, 30, 106, 92, 12, 102, 22, 37, 33, 103, 36, 105, 19, 13, 57, 94, 43, 116, 96, 28, 9, 91, 78, 85, 87, 81, 75, 99, 79, 21, 24, 83, 98, 10, 11, 26, 67, 0, 72, 6, 69, 4, 1, 66, 64], [104, 118, 34, 95, 84, 25, 73, 12, 40, 23, 18, 51, 16, 9, 49, 5, 65, 68, 70, 56, 1, 69, 66, 54, 8, 89, 80, 67, 63, 6, 13, 117, 15, 64, 62, 87, 60, 78, 45, 11, 50, 55, 3, 27, 58, 74, 14, 7, 20, 124, 72, 79, 2, 110, 24, 10, 71, 109, 76, 77, 82, 81, 127, 0, 28, 52, 4, 123, 22, 114, 31, 93, 83, 125, 97, 42, 91, 75, 126, 85, 112, 94, 86, 90, 21, 35, 46, 88, 33, 47, 19, 59, 121, 100, 38, 48, 39, 61, 119, 122, 17, 41, 26, 92, 29, 44, 101, 96, 108, 107, 111, 53, 113, 106, 36, 103, 32, 98, 120, 37, 99, 30, 102, 105, 43, 116, 115, 57], [118, 104, 34, 95, 50, 25, 23, 45, 52, 62, 84, 56, 117, 58, 49, 51, 18, 54, 16, 63, 114, 109, 123, 60, 55, 124, 106, 112, 12, 59, 27, 125, 57, 91, 121, 15, 126, 61, 110, 42, 32, 41, 31, 127, 119, 48, 43, 53, 33, 22, 39, 47, 115, 108, 111, 37, 113, 122, 120, 35, 28, 46, 116, 88, 107, 44, 99, 105, 93, 73, 103, 36, 97, 102, 38, 101, 92, 30, 82, 100, 86, 5, 87, 79, 96, 90, 13, 94, 85, 24, 89, 17, 78, 29, 9, 8, 70, 83, 26, 14, 74, 98, 21, 76, 20, 81, 80, 19, 7, 75, 3, 10, 77, 40, 72, 11, 69, 68, 65, 2, 71, 67, 6, 64, 4, 66, 1, 0], [104, 118, 34, 95, 25, 16, 12, 56, 23, 84, 18, 45, 15, 55, 69, 51, 78, 50, 117, 72, 58, 9, 40, 63, 8, 5, 49, 109, 66, 114, 67, 124, 54, 79, 71, 6, 60, 76, 4, 13, 110, 10, 80, 87, 86, 73, 31, 81, 119, 27, 62, 91, 108, 89, 75, 14, 1, 20, 70, 85, 52, 53, 112, 64, 29, 123, 11, 7, 19, 82, 42, 44, 59, 93, 121, 83, 61, 127, 39, 126, 90, 96, 94, 97, 32, 125, 74, 77, 103, 2, 21, 98, 107, 24, 41, 88, 111, 17, 22, 35, 106, 26, 57, 47, 115, 101, 92, 48, 43, 33, 30, 37, 68, 100, 122, 46, 28, 3, 36, 116, 105, 38, 102, 113, 120, 0, 99, 65], [41, 54, 59, 99, 79, 88, 101, 33, 13, 22, 90, 91, 31, 27, 73, 82, 39, 11, 123, 21, 84, 26, 71, 0, 66, 16, 40, 109, 94, 6, 74, 108, 2, 83, 28, 12, 38, 107, 68, 29, 126, 105, 51, 62, 7, 18, 95, 60, 76, 87, 34, 52, 20, 1, 98, 23, 9, 15, 32, 80, 78, 122, 67, 96, 125, 8, 110, 42, 55, 65, 4, 75, 57, 70, 36, 30, 106, 85, 49, 89, 45, 50, 46, 3, 124, 103, 56, 64, 77, 19, 127, 43, 17, 44, 58, 92, 100, 47, 104, 121, 97, 113, 118, 86, 112, 14, 115, 93, 114, 119, 10, 25, 120, 63, 102, 61, 5, 24, 81, 53, 72, 48, 69, 116, 111, 117, 37, 35], [41, 54, 99, 52, 91, 33, 126, 123, 27, 105, 113, 88, 108, 59, 61, 43, 36, 40, 101, 13, 125, 109, 22, 84, 107, 49, 16, 21, 57, 44, 118, 56, 45, 34, 115, 51, 63, 62, 17, 106, 50, 98, 31, 112, 127, 42, 25, 47, 120, 90, 39, 55, 100, 46, 124, 38, 121, 110, 114, 116, 58, 103, 29, 111, 96, 122, 94, 60, 95, 48, 53, 77, 71, 119, 117, 23, 104, 32, 102, 28, 93, 37, 82, 19, 4, 92, 85, 30, 81, 75, 20, 87, 89, 35, 26, 83, 97, 76, 68, 74, 10, 80, 73, 70, 18, 11, 24, 14, 72, 69, 12, 79, 78, 67, 6, 0, 2, 7, 8, 86, 15, 64, 65, 9, 5, 66, 1, 3], [41, 54, 99, 126, 33, 52, 91, 105, 27, 40, 101, 88, 21, 61, 89, 56, 125, 13, 45, 110, 123, 44, 34, 113, 82, 115, 36, 109, 85, 38, 59, 43, 107, 84, 22, 31, 42, 112, 81, 124, 94, 108, 114, 111, 120, 39, 116, 127, 57, 103, 16, 96, 58, 50, 63, 18, 118, 62, 60, 74, 90, 53, 100, 122, 17, 55, 37, 106, 23, 98, 104, 48, 87, 32, 49, 46, 121, 93, 28, 77, 51, 117, 19, 47, 119, 102, 25, 26, 97, 80, 29, 95, 68, 30, 83, 92, 35, 71, 24, 79, 75, 20, 11, 76, 4, 10, 78, 14, 12, 72, 86, 15, 70, 73, 7, 6, 1, 8, 9, 65, 64, 69, 2, 67, 5, 66, 0, 3], [41, 99, 59, 54, 88, 82, 31, 123, 27, 22, 91, 105, 79, 84, 13, 52, 21, 108, 87, 101, 12, 46, 33, 8, 18, 74, 83, 26, 39, 19, 81, 9, 94, 95, 118, 126, 10, 89, 77, 38, 17, 6, 92, 127, 20, 25, 55, 28, 15, 56, 40, 14, 69, 90, 70, 115, 44, 85, 68, 24, 16, 30, 97, 5, 86, 100, 23, 61, 98, 43, 32, 109, 110, 96, 104, 120, 113, 124, 76, 111, 29, 34, 122, 42, 75, 125, 80, 93, 107, 119, 57, 102, 58, 106, 36, 103, 47, 1, 63, 112, 114, 45, 72, 121, 4, 78, 50, 60, 117, 11, 3, 116, 53, 65, 49, 48, 71, 73, 37, 51, 62, 35, 67, 66, 0, 7, 64, 2], [103, 33, 59, 31, 82, 20, 23, 120, 81, 90, 114, 11, 76, 13, 14, 87, 22, 78, 7, 18, 71, 10, 21, 106, 83, 112, 16, 12, 15, 73, 124, 19, 119, 52, 77, 67, 96, 123, 38, 91, 25, 125, 41, 48, 40, 43, 107, 37, 94, 104, 98, 97, 88, 118, 27, 75, 86, 42, 51, 121, 84, 53, 117, 54, 55, 35, 58, 32, 3, 24, 30, 72, 80, 69, 111, 28, 34, 109, 126, 101, 47, 56, 100, 46, 8, 44, 85, 29, 93, 105, 115, 26, 79, 92, 108, 9, 1, 99, 62, 60, 36, 122, 127, 45, 61, 68, 113, 89, 4, 17, 57, 2, 39, 74, 63, 49, 5, 65, 102, 66, 110, 6, 95, 116, 50, 70, 64, 0], [103, 33, 59, 20, 31, 23, 120, 90, 83, 114, 82, 29, 16, 78, 19, 13, 84, 27, 88, 37, 94, 24, 32, 118, 81, 11, 72, 30, 17, 97, 7, 126, 48, 38, 92, 127, 96, 117, 21, 8, 77, 124, 56, 36, 87, 106, 25, 76, 22, 40, 9, 51, 12, 91, 119, 62, 105, 57, 41, 115, 10, 43, 45, 113, 52, 26, 110, 123, 14, 102, 79, 107, 15, 98, 54, 28, 93, 55, 34, 89, 61, 109, 18, 66, 44, 74, 80, 100, 95, 42, 99, 125, 85, 101, 47, 121, 63, 49, 35, 60, 73, 104, 68, 111, 58, 39, 108, 71, 4, 122, 53, 86, 75, 46, 112, 65, 116, 70, 50, 67, 2, 3, 6, 69, 5, 64, 0, 1], [103, 33, 59, 120, 31, 25, 13, 20, 11, 82, 78, 52, 16, 87, 10, 73, 23, 81, 7, 3, 65, 2, 66, 70, 90, 0, 5, 68, 114, 76, 69, 6, 19, 83, 18, 80, 106, 107, 67, 71, 27, 8, 75, 119, 9, 72, 96, 51, 77, 117, 48, 39, 28, 74, 105, 1, 43, 126, 4, 118, 22, 24, 37, 41, 123, 100, 46, 17, 53, 36, 127, 94, 56, 109, 115, 101, 44, 85, 40, 26, 92, 30, 54, 32, 104, 21, 84, 113, 62, 49, 47, 15, 12, 34, 14, 45, 121, 60, 93, 98, 91, 42, 88, 79, 50, 124, 29, 57, 111, 58, 110, 122, 125, 61, 116, 63, 99, 108, 35, 86, 102, 38, 89, 55, 112, 95, 64, 97], [103, 33, 114, 59, 31, 81, 120, 25, 76, 20, 78, 64, 15, 23, 82, 7, 21, 90, 16, 9, 3, 1, 83, 11, 5, 48, 106, 13, 65, 24, 66, 67, 27, 75, 115, 37, 47, 126, 77, 19, 74, 52, 44, 84, 55, 4, 10, 28, 53, 107, 93, 87, 22, 97, 71, 92, 29, 98, 39, 12, 43, 0, 109, 40, 96, 124, 46, 127, 51, 119, 17, 121, 88, 32, 105, 38, 2, 26, 100, 118, 112, 99, 111, 69, 8, 79, 104, 41, 86, 116, 58, 113, 73, 62, 45, 42, 110, 91, 18, 56, 94, 70, 14, 125, 6, 89, 54, 35, 57, 80, 108, 72, 63, 101, 123, 68, 122, 30, 50, 60, 61, 49, 34, 117, 36, 95, 102, 85], [44, 102, 50, 63, 98, 93, 108, 40, 123, 88, 80, 107, 43, 13, 31, 120, 119, 90, 7, 53, 115, 125, 57, 22, 29, 10, 114, 126, 19, 51, 84, 26, 20, 116, 92, 112, 109, 39, 55, 28, 12, 124, 127, 49, 121, 21, 45, 48, 36, 17, 18, 8, 101, 61, 52, 14, 46, 56, 85, 54, 47, 78, 118, 122, 99, 16, 113, 94, 59, 110, 74, 83, 60, 6, 58, 117, 70, 3, 97, 111, 68, 105, 81, 62, 25, 23, 4, 104, 42, 87, 32, 41, 91, 103, 30, 76, 79, 35, 106, 33, 15, 37, 82, 11, 27, 9, 96, 24, 89, 75, 66, 0, 95, 71, 73, 100, 86, 72, 67, 77, 65, 1, 2, 64, 69, 5, 38, 34], [44, 63, 102, 50, 98, 93, 40, 123, 107, 26, 108, 80, 31, 57, 115, 120, 43, 84, 90, 10, 127, 125, 53, 13, 22, 88, 116, 21, 126, 45, 19, 109, 121, 23, 51, 119, 112, 61, 124, 114, 104, 29, 18, 55, 47, 68, 56, 101, 54, 49, 36, 14, 62, 46, 52, 48, 122, 92, 17, 99, 58, 7, 60, 72, 105, 118, 12, 28, 8, 39, 97, 30, 78, 91, 59, 87, 110, 111, 117, 20, 42, 113, 77, 15, 103, 27, 81, 41, 106, 32, 95, 25, 85, 89, 94, 79, 83, 16, 24, 9, 66, 96, 2, 35, 37, 4, 76, 100, 75, 82, 11, 33, 5, 3, 73, 70, 1, 0, 74, 86, 6, 69, 65, 38, 34, 71, 67, 64], [44, 63, 102, 50, 108, 98, 120, 107, 40, 57, 53, 119, 51, 123, 88, 93, 26, 109, 125, 112, 58, 31, 127, 114, 43, 126, 61, 121, 116, 101, 118, 45, 48, 49, 54, 115, 36, 85, 47, 59, 56, 124, 21, 111, 122, 113, 55, 23, 95, 117, 80, 60, 39, 84, 52, 25, 62, 78, 104, 8, 46, 87, 41, 42, 100, 14, 19, 13, 18, 105, 90, 29, 89, 106, 110, 92, 35, 28, 32, 94, 99, 22, 103, 97, 37, 12, 91, 30, 17, 96, 33, 34, 81, 38, 66, 72, 82, 68, 24, 71, 83, 27, 20, 77, 7, 10, 74, 79, 2, 65, 16, 75, 11, 70, 15, 3, 73, 76, 9, 86, 6, 67, 69, 0, 4, 64, 5, 1], [44, 63, 102, 50, 108, 98, 40, 43, 115, 123, 120, 88, 125, 116, 31, 57, 26, 107, 112, 93, 114, 58, 53, 28, 127, 121, 84, 51, 19, 119, 101, 95, 52, 126, 56, 55, 49, 54, 36, 47, 109, 61, 39, 45, 59, 21, 104, 124, 46, 97, 62, 111, 8, 118, 48, 29, 90, 122, 23, 78, 14, 60, 113, 30, 117, 42, 110, 80, 22, 103, 12, 87, 92, 105, 99, 20, 37, 94, 33, 100, 106, 18, 32, 13, 17, 41, 27, 35, 38, 91, 81, 89, 72, 96, 24, 83, 15, 2, 77, 85, 66, 79, 25, 76, 82, 34, 7, 71, 73, 10, 68, 75, 9, 65, 16, 11, 6, 4, 70, 67, 0, 74, 69, 86, 3, 5, 1, 64], [54, 102, 122, 127, 120, 33, 28, 20, 125, 87, 92, 25, 116, 15, 12, 30, 97, 114, 47, 89, 63, 17, 96, 21, 22, 38, 32, 82, 19, 49, 94, 99, 7, 52, 48, 93, 123, 50, 16, 80, 13, 62, 115, 27, 91, 95, 121, 83, 74, 75, 72, 14, 88, 84, 31, 60, 85, 81, 55, 26, 51, 29, 10, 11, 59, 108, 35, 34, 113, 100, 111, 77, 90, 36, 98, 78, 23, 45, 118, 79, 37, 103, 24, 126, 53, 101, 57, 109, 58, 110, 112, 39, 117, 18, 105, 124, 56, 107, 41, 104, 43, 69, 42, 119, 40, 44, 46, 106, 76, 9, 2, 73, 61, 8, 71, 4, 86, 67, 6, 0, 70, 5, 68, 3, 1, 65, 66, 64], [54, 102, 122, 125, 127, 116, 25, 20, 114, 38, 33, 63, 30, 79, 47, 87, 120, 62, 123, 50, 52, 74, 48, 49, 53, 60, 55, 118, 126, 115, 28, 22, 58, 15, 121, 59, 7, 31, 57, 46, 51, 17, 94, 117, 92, 124, 113, 84, 76, 111, 70, 41, 44, 103, 89, 110, 35, 104, 32, 43, 12, 36, 107, 112, 39, 93, 34, 61, 119, 98, 96, 109, 99, 67, 40, 37, 56, 88, 105, 108, 95, 45, 2, 91, 106, 29, 77, 101, 83, 42, 97, 13, 69, 27, 100, 81, 19, 80, 72, 11, 90, 86, 21, 85, 82, 26, 73, 78, 23, 14, 75, 24, 71, 68, 18, 4, 0, 8, 16, 65, 10, 1, 9, 6, 5, 66, 64, 3], [122, 102, 54, 127, 125, 116, 120, 114, 87, 25, 33, 38, 92, 30, 12, 63, 47, 28, 20, 48, 22, 123, 62, 53, 17, 50, 59, 55, 118, 52, 60, 80, 49, 72, 82, 126, 57, 89, 24, 124, 51, 115, 76, 15, 58, 121, 91, 29, 81, 67, 94, 113, 97, 70, 109, 7, 0, 111, 44, 46, 21, 112, 2, 77, 103, 61, 56, 74, 119, 117, 69, 88, 110, 93, 108, 43, 79, 39, 65, 84, 107, 8, 36, 14, 1, 45, 73, 78, 83, 68, 32, 23, 90, 31, 27, 105, 40, 95, 96, 104, 6, 9, 37, 106, 16, 42, 3, 86, 85, 18, 100, 26, 10, 11, 41, 35, 34, 98, 101, 71, 19, 75, 13, 64, 4, 66, 5, 99], [122, 54, 102, 127, 116, 125, 114, 120, 38, 70, 25, 55, 63, 74, 53, 47, 59, 48, 62, 123, 118, 49, 103, 50, 60, 20, 67, 44, 52, 51, 124, 115, 126, 39, 57, 56, 58, 121, 113, 2, 45, 43, 33, 117, 76, 111, 7, 112, 110, 13, 46, 87, 119, 107, 109, 61, 93, 42, 108, 106, 36, 0, 105, 40, 101, 28, 104, 79, 92, 22, 41, 89, 37, 30, 35, 100, 19, 94, 26, 96, 90, 81, 88, 32, 34, 31, 29, 97, 71, 78, 23, 77, 99, 80, 69, 98, 27, 95, 84, 72, 10, 65, 4, 24, 1, 8, 12, 91, 68, 15, 73, 86, 17, 83, 11, 85, 9, 6, 14, 18, 82, 64, 21, 16, 5, 66, 75, 3]], "model.layers.8.self_attn.k_proj": [[116, 60, 38, 22, 51, 97, 94, 123, 117, 28, 89, 90, 84, 124, 18, 62, 16, 119, 81, 91, 36, 77, 52, 118, 87, 58, 15, 74, 35, 7, 54, 86, 19, 59, 50, 121, 61, 57, 21, 80, 42, 56, 99, 78, 126, 53, 100, 10, 73, 13, 48, 70, 41, 46, 75, 108, 125, 44, 127, 76, 40, 14, 114, 107, 103, 115, 104, 47, 98, 37, 63, 55, 106, 109, 105, 110, 24, 111, 26, 120, 43, 101, 45, 112, 49, 113, 39, 31, 122, 9, 95, 68, 67, 32, 85, 96, 82, 29, 1, 88, 30, 34, 64, 17, 102, 4, 66, 5, 23, 6, 92, 27, 20, 93, 71, 12, 83, 72, 3, 33, 65, 25, 11, 0, 2, 79, 8, 69], [60, 119, 98, 102, 88, 18, 85, 9, 80, 28, 78, 77, 64, 75, 71, 65, 59, 99, 3, 19, 93, 108, 68, 12, 69, 23, 115, 125, 2, 70, 61, 95, 111, 43, 51, 104, 29, 11, 38, 91, 5, 124, 123, 22, 117, 47, 37, 52, 20, 58, 46, 42, 24, 30, 10, 31, 49, 33, 66, 103, 36, 118, 53, 67, 40, 90, 113, 4, 15, 17, 122, 86, 81, 7, 114, 109, 6, 107, 83, 41, 48, 8, 92, 25, 63, 62, 54, 127, 32, 96, 13, 110, 94, 34, 0, 89, 84, 44, 106, 105, 100, 112, 120, 126, 26, 79, 97, 76, 121, 57, 55, 39, 82, 35, 116, 87, 16, 50, 45, 73, 101, 72, 56, 27, 74, 21, 1, 14], [59, 22, 60, 116, 39, 34, 114, 120, 54, 86, 124, 127, 51, 50, 35, 62, 55, 117, 113, 94, 57, 121, 56, 125, 122, 123, 49, 119, 52, 46, 29, 99, 61, 118, 43, 126, 58, 53, 110, 115, 112, 63, 103, 111, 47, 106, 108, 109, 76, 37, 18, 48, 28, 45, 44, 107, 42, 40, 81, 104, 96, 41, 14, 102, 38, 101, 105, 11, 85, 91, 95, 98, 15, 97, 100, 13, 74, 27, 26, 36, 73, 20, 25, 16, 82, 7, 32, 24, 30, 23, 78, 33, 92, 72, 6, 19, 8, 79, 80, 31, 89, 69, 88, 93, 90, 83, 66, 3, 68, 87, 17, 75, 0, 4, 1, 9, 5, 10, 67, 21, 12, 65, 84, 77, 2, 70, 71, 64], [40, 118, 98, 31, 23, 18, 25, 84, 16, 12, 56, 51, 78, 72, 10, 15, 9, 117, 55, 69, 71, 49, 63, 0, 54, 67, 62, 6, 1, 109, 4, 110, 60, 58, 45, 50, 127, 27, 124, 13, 2, 64, 66, 113, 74, 46, 8, 17, 47, 106, 42, 83, 123, 112, 44, 34, 19, 53, 91, 75, 126, 26, 24, 125, 52, 114, 61, 122, 29, 97, 68, 77, 30, 48, 11, 94, 108, 111, 59, 38, 119, 93, 107, 99, 120, 86, 103, 88, 39, 28, 121, 7, 21, 101, 33, 90, 100, 43, 105, 115, 41, 32, 96, 35, 92, 36, 102, 14, 37, 116, 57, 65, 89, 82, 85, 20, 70, 87, 22, 76, 79, 3, 80, 95, 5, 73, 81, 104], [105, 54, 59, 35, 95, 22, 91, 88, 21, 64, 110, 123, 68, 18, 97, 13, 16, 92, 74, 1, 6, 79, 37, 45, 71, 62, 52, 98, 44, 82, 41, 55, 107, 66, 83, 108, 126, 77, 57, 11, 50, 99, 48, 120, 127, 104, 58, 112, 53, 8, 90, 117, 56, 94, 122, 5, 40, 125, 60, 63, 51, 81, 121, 67, 103, 61, 106, 124, 114, 34, 12, 111, 115, 65, 3, 26, 113, 102, 116, 46, 42, 119, 30, 96, 47, 93, 73, 100, 15, 25, 109, 118, 36, 78, 17, 87, 43, 20, 14, 39, 2, 75, 38, 32, 0, 101, 76, 89, 84, 49, 29, 23, 19, 69, 72, 9, 33, 80, 31, 28, 24, 70, 7, 27, 10, 86, 4, 85], [39, 59, 120, 97, 23, 95, 83, 16, 11, 114, 20, 0, 78, 13, 82, 7, 76, 8, 73, 2, 42, 65, 48, 15, 50, 126, 69, 68, 3, 104, 107, 52, 25, 32, 27, 43, 117, 105, 119, 45, 62, 51, 4, 115, 53, 112, 118, 40, 9, 89, 81, 90, 88, 86, 58, 44, 98, 55, 1, 85, 109, 21, 10, 113, 79, 127, 66, 5, 54, 28, 30, 37, 35, 63, 122, 75, 47, 124, 46, 24, 102, 29, 61, 34, 106, 38, 94, 108, 121, 67, 91, 70, 57, 125, 49, 26, 110, 101, 22, 36, 96, 123, 72, 56, 60, 14, 18, 93, 99, 111, 100, 41, 116, 6, 84, 92, 19, 64, 17, 74, 12, 87, 77, 31, 80, 71, 103, 33], [108, 38, 22, 63, 34, 95, 50, 29, 23, 26, 18, 88, 44, 92, 104, 17, 19, 120, 20, 66, 115, 12, 114, 57, 125, 80, 13, 49, 107, 37, 123, 69, 53, 98, 112, 111, 8, 127, 45, 47, 121, 126, 51, 78, 113, 10, 55, 61, 35, 56, 52, 14, 54, 62, 91, 59, 119, 117, 65, 79, 85, 116, 109, 122, 7, 60, 64, 103, 118, 83, 124, 89, 46, 110, 105, 27, 70, 93, 58, 15, 9, 39, 100, 106, 48, 11, 40, 5, 97, 72, 42, 3, 28, 33, 94, 25, 41, 99, 96, 36, 77, 32, 30, 87, 84, 67, 43, 82, 24, 75, 76, 0, 21, 81, 31, 73, 101, 6, 4, 86, 1, 16, 90, 71, 2, 74, 68, 102], [122, 54, 38, 22, 97, 127, 63, 30, 114, 120, 125, 53, 47, 92, 50, 62, 126, 116, 82, 123, 51, 118, 59, 52, 124, 25, 57, 115, 46, 87, 55, 48, 58, 113, 44, 49, 111, 61, 56, 60, 121, 119, 117, 109, 12, 108, 98, 64, 43, 110, 112, 71, 15, 100, 17, 45, 40, 107, 20, 103, 106, 80, 32, 104, 105, 101, 41, 75, 39, 78, 42, 66, 36, 19, 67, 10, 35, 37, 102, 34, 94, 68, 5, 91, 99, 31, 77, 14, 3, 83, 8, 96, 18, 81, 73, 93, 24, 21, 26, 65, 95, 9, 11, 79, 88, 69, 29, 90, 33, 85, 70, 4, 16, 23, 27, 28, 89, 13, 6, 86, 1, 74, 72, 0, 2, 7, 84, 76]], "model.layers.8.self_attn.qk_proj": [[59, 60, 54, 118, 116, 119, 122, 120, 51, 63, 114, 108, 50, 38, 44, 125, 105, 117, 123, 40, 41, 102, 82, 86, 127, 47, 95, 34, 53, 87, 98, 18, 55, 62, 58, 39, 57, 52, 49, 124, 24, 103, 126, 88, 23, 115, 80, 97, 16, 78, 61, 84, 27, 20, 22, 89, 29, 13, 14, 104, 110, 83, 77, 19, 107, 113, 12, 92, 85, 31, 99, 76, 93, 46, 21, 11, 43, 56, 48, 15, 75, 111, 112, 109, 25, 33, 42, 9, 90, 121, 73, 79, 28, 71, 7, 106, 8, 26, 35, 45, 30, 81, 94, 10, 74, 64, 17, 36, 91, 5, 37, 3, 4, 0, 70, 72, 32, 67, 68, 69, 101, 100, 66, 2, 6, 65, 96, 1], [59, 60, 118, 54, 116, 122, 119, 120, 51, 63, 114, 108, 50, 38, 117, 44, 40, 105, 125, 123, 53, 102, 86, 41, 18, 103, 98, 127, 62, 82, 115, 87, 52, 126, 39, 88, 95, 24, 124, 34, 57, 84, 58, 97, 80, 23, 104, 55, 47, 20, 16, 22, 107, 78, 27, 77, 89, 49, 56, 14, 92, 19, 25, 85, 31, 13, 61, 83, 93, 12, 76, 21, 79, 42, 29, 99, 110, 48, 113, 11, 43, 75, 28, 9, 106, 26, 109, 8, 46, 73, 71, 45, 112, 15, 35, 37, 90, 121, 33, 111, 7, 94, 30, 36, 91, 17, 64, 10, 5, 81, 74, 0, 65, 4, 32, 67, 2, 1, 3, 70, 66, 101, 68, 69, 6, 100, 96, 72], [59, 60, 118, 54, 116, 119, 122, 120, 51, 63, 108, 38, 50, 114, 123, 44, 40, 117, 105, 53, 125, 102, 86, 34, 98, 39, 41, 95, 18, 24, 97, 87, 62, 82, 57, 127, 88, 23, 52, 115, 55, 84, 124, 22, 103, 20, 126, 49, 27, 104, 58, 89, 31, 47, 113, 16, 80, 93, 78, 29, 92, 77, 25, 56, 61, 48, 99, 33, 109, 85, 13, 75, 26, 45, 19, 21, 112, 76, 107, 43, 110, 14, 90, 35, 11, 94, 79, 12, 37, 83, 28, 46, 91, 111, 30, 42, 106, 15, 9, 7, 121, 8, 71, 81, 17, 36, 73, 32, 74, 10, 69, 100, 101, 64, 65, 0, 3, 72, 4, 66, 2, 68, 5, 6, 1, 67, 70, 96], [59, 60, 118, 54, 116, 119, 122, 120, 63, 51, 114, 108, 50, 38, 123, 44, 40, 117, 125, 105, 127, 86, 95, 41, 98, 53, 87, 102, 24, 115, 18, 62, 39, 34, 126, 82, 49, 103, 124, 88, 97, 23, 58, 47, 57, 84, 113, 52, 20, 22, 93, 55, 29, 80, 31, 16, 27, 14, 89, 25, 104, 83, 99, 77, 92, 110, 19, 48, 13, 78, 43, 109, 107, 61, 45, 11, 56, 85, 33, 21, 28, 46, 42, 15, 76, 37, 90, 26, 12, 75, 35, 73, 30, 94, 7, 36, 106, 79, 121, 112, 8, 111, 64, 9, 32, 71, 17, 91, 101, 81, 74, 10, 67, 3, 69, 4, 1, 2, 66, 0, 5, 65, 72, 96, 68, 70, 100, 6], [59, 60, 118, 54, 119, 122, 116, 120, 51, 63, 114, 108, 38, 50, 44, 117, 123, 40, 105, 125, 102, 98, 127, 57, 53, 95, 39, 24, 18, 82, 62, 41, 124, 87, 103, 34, 86, 23, 49, 113, 126, 115, 55, 80, 104, 88, 27, 97, 78, 84, 20, 58, 47, 52, 22, 77, 29, 16, 89, 93, 19, 46, 31, 110, 13, 14, 112, 45, 11, 12, 76, 92, 56, 107, 85, 28, 99, 83, 61, 48, 79, 109, 43, 21, 25, 73, 90, 35, 42, 37, 71, 0, 33, 9, 75, 111, 15, 26, 121, 7, 64, 36, 30, 106, 94, 17, 74, 72, 1, 3, 5, 32, 8, 4, 68, 66, 67, 10, 2, 81, 69, 91, 65, 101, 6, 96, 70, 100], [59, 60, 54, 118, 116, 119, 122, 120, 63, 51, 114, 108, 50, 38, 44, 123, 125, 117, 40, 105, 86, 102, 53, 127, 41, 95, 98, 82, 18, 24, 62, 124, 39, 87, 88, 80, 103, 16, 115, 34, 84, 57, 55, 52, 20, 78, 22, 47, 58, 23, 14, 126, 77, 49, 31, 83, 97, 104, 29, 27, 13, 107, 113, 76, 19, 89, 85, 28, 11, 12, 110, 92, 93, 99, 48, 42, 56, 109, 79, 75, 15, 25, 43, 61, 45, 46, 21, 30, 73, 112, 33, 9, 71, 90, 35, 37, 7, 72, 111, 121, 26, 106, 17, 81, 5, 94, 8, 36, 74, 10, 32, 64, 2, 0, 68, 91, 3, 69, 6, 67, 4, 65, 66, 101, 100, 1, 96, 70], [59, 60, 118, 54, 116, 119, 122, 120, 63, 51, 108, 114, 50, 38, 123, 117, 44, 40, 125, 105, 127, 86, 53, 95, 18, 87, 98, 82, 41, 102, 115, 24, 52, 88, 62, 84, 34, 126, 124, 22, 47, 16, 58, 20, 23, 39, 80, 97, 55, 57, 103, 104, 78, 29, 83, 13, 14, 113, 89, 27, 112, 49, 92, 107, 110, 19, 77, 76, 99, 48, 56, 31, 93, 11, 111, 21, 85, 12, 75, 61, 15, 28, 109, 9, 73, 79, 121, 43, 90, 33, 42, 94, 25, 81, 30, 46, 17, 7, 35, 26, 72, 71, 45, 106, 37, 74, 10, 91, 101, 32, 5, 8, 36, 68, 6, 2, 64, 100, 3, 69, 67, 4, 70, 65, 66, 96, 0, 1], [59, 60, 118, 54, 116, 119, 122, 63, 120, 108, 51, 114, 38, 50, 44, 117, 40, 105, 123, 86, 125, 87, 82, 18, 102, 127, 115, 95, 98, 88, 124, 41, 24, 34, 39, 22, 52, 58, 16, 20, 53, 80, 84, 107, 57, 23, 14, 47, 103, 104, 13, 27, 55, 56, 97, 76, 78, 62, 12, 126, 89, 83, 48, 77, 92, 85, 121, 29, 49, 93, 21, 31, 19, 75, 110, 113, 79, 99, 28, 11, 106, 15, 112, 73, 46, 9, 111, 42, 25, 30, 94, 90, 61, 109, 37, 43, 91, 71, 35, 45, 72, 33, 81, 26, 7, 10, 74, 17, 0, 64, 66, 32, 69, 36, 101, 5, 4, 65, 68, 6, 2, 67, 100, 8, 3, 1, 70, 96], [59, 60, 118, 54, 116, 119, 122, 120, 51, 63, 108, 50, 38, 114, 117, 123, 44, 105, 40, 125, 102, 127, 18, 98, 95, 62, 82, 24, 87, 115, 34, 39, 41, 86, 84, 88, 52, 16, 20, 103, 57, 22, 107, 126, 58, 89, 124, 80, 55, 97, 104, 27, 47, 23, 48, 13, 29, 78, 14, 92, 53, 46, 49, 77, 28, 21, 93, 19, 31, 110, 99, 25, 76, 83, 43, 56, 11, 113, 37, 85, 111, 94, 9, 109, 75, 12, 42, 15, 79, 112, 45, 33, 91, 61, 106, 35, 72, 30, 71, 73, 26, 90, 121, 7, 17, 101, 74, 81, 32, 64, 67, 36, 10, 1, 0, 5, 4, 68, 3, 69, 2, 66, 70, 65, 6, 8, 96, 100], [59, 60, 118, 54, 116, 119, 122, 120, 63, 51, 108, 114, 50, 117, 38, 123, 125, 44, 105, 40, 18, 102, 53, 41, 95, 82, 98, 62, 86, 87, 115, 103, 24, 88, 127, 39, 16, 49, 124, 52, 34, 80, 23, 126, 22, 84, 20, 47, 78, 58, 55, 13, 31, 97, 92, 19, 27, 89, 29, 77, 113, 107, 104, 14, 21, 75, 76, 42, 83, 57, 93, 61, 25, 12, 85, 110, 43, 99, 46, 33, 109, 26, 28, 79, 56, 112, 48, 111, 9, 45, 11, 7, 71, 72, 73, 121, 94, 15, 90, 37, 30, 35, 106, 91, 17, 36, 74, 81, 32, 10, 69, 3, 101, 0, 1, 64, 67, 68, 66, 100, 70, 5, 4, 6, 2, 65, 96, 8], [59, 60, 54, 118, 116, 119, 122, 120, 51, 63, 108, 114, 50, 38, 44, 123, 125, 40, 117, 105, 102, 95, 127, 58, 55, 41, 98, 18, 47, 87, 39, 86, 82, 24, 57, 53, 52, 126, 124, 16, 103, 88, 115, 34, 49, 80, 23, 78, 22, 13, 20, 62, 84, 27, 14, 104, 107, 97, 113, 19, 61, 29, 77, 83, 31, 89, 93, 109, 28, 110, 33, 121, 75, 46, 42, 43, 11, 12, 25, 85, 99, 76, 56, 21, 92, 71, 48, 7, 15, 79, 111, 45, 73, 26, 9, 112, 35, 72, 30, 106, 37, 94, 90, 10, 0, 64, 5, 81, 101, 67, 74, 36, 17, 91, 4, 66, 32, 68, 65, 2, 69, 1, 70, 3, 96, 8, 100, 6], [59, 60, 54, 118, 116, 119, 122, 120, 51, 63, 108, 114, 50, 38, 117, 125, 44, 40, 123, 105, 127, 18, 55, 102, 86, 98, 87, 82, 95, 24, 53, 41, 39, 22, 16, 88, 78, 47, 62, 126, 124, 57, 80, 34, 84, 23, 52, 20, 29, 49, 103, 27, 58, 113, 13, 97, 115, 104, 46, 31, 83, 14, 121, 89, 110, 77, 28, 93, 43, 76, 107, 11, 75, 19, 12, 112, 92, 61, 21, 73, 99, 48, 15, 25, 90, 79, 85, 9, 109, 45, 42, 106, 33, 72, 7, 71, 56, 81, 111, 35, 30, 0, 26, 37, 36, 64, 69, 91, 10, 17, 67, 74, 3, 4, 94, 65, 8, 68, 5, 70, 66, 2, 1, 6, 101, 32, 100, 96], [59, 60, 118, 54, 116, 119, 122, 120, 63, 51, 114, 50, 108, 38, 117, 44, 125, 123, 105, 40, 18, 102, 41, 127, 98, 52, 87, 34, 82, 86, 95, 88, 115, 22, 57, 23, 55, 124, 62, 24, 126, 39, 80, 84, 20, 16, 97, 58, 53, 78, 103, 110, 104, 47, 27, 89, 107, 61, 92, 49, 113, 29, 43, 13, 31, 14, 19, 28, 93, 85, 83, 77, 99, 21, 76, 25, 75, 11, 12, 56, 15, 73, 112, 109, 42, 45, 121, 46, 33, 111, 90, 9, 48, 79, 35, 26, 37, 106, 91, 7, 94, 30, 71, 17, 36, 8, 72, 81, 74, 10, 32, 69, 101, 100, 66, 68, 3, 70, 5, 4, 0, 67, 65, 64, 1, 6, 2, 96], [59, 60, 54, 118, 116, 119, 122, 120, 63, 51, 114, 50, 108, 125, 38, 44, 117, 123, 105, 40, 53, 102, 41, 55, 18, 86, 95, 82, 34, 62, 127, 87, 98, 52, 124, 16, 88, 80, 103, 22, 115, 23, 24, 47, 84, 20, 58, 57, 39, 13, 104, 14, 78, 61, 49, 85, 107, 43, 110, 12, 27, 92, 31, 89, 97, 113, 19, 93, 126, 83, 21, 45, 76, 11, 77, 28, 75, 79, 29, 99, 121, 15, 112, 48, 56, 25, 42, 9, 73, 46, 7, 71, 90, 33, 8, 106, 30, 109, 26, 94, 0, 64, 111, 35, 10, 72, 91, 17, 81, 37, 74, 69, 65, 66, 4, 67, 32, 5, 68, 70, 2, 36, 101, 1, 6, 3, 100, 96], [59, 60, 54, 118, 116, 122, 119, 120, 63, 51, 108, 50, 114, 38, 40, 44, 117, 123, 105, 102, 125, 86, 95, 98, 18, 82, 53, 124, 41, 127, 87, 55, 84, 34, 88, 39, 62, 24, 52, 104, 57, 23, 58, 47, 49, 103, 97, 20, 16, 80, 115, 126, 14, 22, 110, 31, 78, 43, 56, 46, 113, 27, 61, 13, 29, 99, 89, 12, 107, 45, 106, 85, 77, 93, 19, 48, 83, 28, 92, 109, 75, 15, 112, 76, 25, 37, 79, 33, 21, 7, 94, 11, 73, 90, 35, 42, 71, 111, 8, 121, 91, 9, 30, 36, 26, 17, 81, 10, 0, 101, 72, 32, 67, 3, 69, 64, 68, 74, 5, 65, 100, 2, 4, 6, 66, 1, 70, 96], [59, 60, 54, 118, 116, 119, 122, 120, 51, 63, 108, 114, 50, 38, 125, 44, 40, 123, 105, 102, 117, 86, 52, 124, 18, 127, 98, 115, 82, 57, 41, 88, 95, 103, 39, 87, 78, 62, 24, 34, 61, 107, 16, 20, 126, 23, 47, 53, 110, 97, 104, 22, 55, 80, 84, 49, 29, 31, 27, 89, 109, 19, 83, 113, 45, 85, 99, 58, 46, 93, 48, 14, 13, 21, 25, 43, 77, 28, 76, 92, 121, 11, 56, 37, 12, 42, 112, 75, 8, 106, 9, 79, 33, 30, 15, 7, 35, 73, 36, 90, 26, 71, 111, 17, 91, 94, 32, 67, 10, 81, 74, 68, 5, 64, 0, 65, 4, 2, 3, 101, 69, 1, 70, 96, 72, 66, 100, 6], [59, 60, 54, 118, 116, 122, 119, 63, 120, 51, 108, 114, 50, 38, 117, 44, 40, 123, 105, 125, 86, 18, 102, 82, 41, 98, 52, 87, 95, 22, 34, 124, 88, 24, 115, 39, 127, 16, 23, 62, 20, 80, 55, 104, 49, 103, 84, 126, 110, 58, 48, 56, 97, 78, 57, 27, 53, 93, 89, 83, 92, 29, 14, 47, 19, 45, 31, 107, 13, 77, 61, 85, 28, 113, 21, 76, 11, 12, 33, 42, 9, 99, 112, 75, 25, 46, 79, 43, 121, 8, 15, 90, 37, 73, 35, 71, 7, 30, 109, 106, 94, 26, 17, 10, 111, 91, 5, 81, 74, 36, 101, 32, 4, 64, 6, 0, 72, 68, 100, 66, 69, 70, 65, 2, 96, 1, 67, 3], [59, 60, 54, 118, 116, 119, 122, 120, 63, 51, 108, 50, 38, 114, 44, 40, 105, 117, 86, 125, 123, 102, 127, 41, 82, 98, 87, 18, 124, 34, 22, 24, 95, 39, 88, 62, 97, 115, 23, 52, 20, 16, 55, 103, 80, 84, 126, 110, 27, 56, 49, 57, 93, 58, 104, 19, 29, 31, 78, 48, 89, 83, 92, 47, 13, 113, 85, 14, 99, 25, 28, 12, 107, 76, 53, 61, 21, 77, 112, 26, 75, 42, 45, 46, 15, 111, 7, 90, 11, 33, 109, 79, 43, 30, 9, 35, 94, 8, 73, 37, 106, 71, 121, 17, 91, 32, 36, 5, 81, 101, 6, 74, 10, 100, 0, 64, 67, 72, 4, 68, 96, 66, 3, 70, 2, 69, 65, 1], [59, 60, 54, 118, 116, 119, 122, 120, 51, 63, 108, 114, 50, 38, 117, 40, 44, 123, 105, 125, 98, 86, 102, 87, 62, 34, 95, 82, 97, 39, 127, 124, 57, 41, 18, 52, 115, 24, 49, 88, 22, 55, 103, 23, 58, 53, 80, 126, 104, 29, 84, 16, 110, 56, 45, 20, 89, 61, 99, 25, 27, 31, 14, 26, 48, 46, 47, 83, 113, 93, 109, 92, 19, 33, 107, 28, 43, 42, 13, 78, 112, 12, 85, 77, 37, 35, 75, 15, 21, 73, 121, 76, 8, 9, 106, 36, 30, 94, 90, 11, 7, 79, 32, 91, 17, 71, 111, 101, 69, 3, 81, 74, 64, 10, 67, 0, 65, 4, 96, 5, 72, 6, 68, 66, 2, 1, 100, 70], [59, 60, 54, 118, 116, 122, 119, 63, 120, 51, 50, 108, 114, 38, 117, 44, 125, 40, 105, 102, 123, 86, 127, 98, 95, 41, 52, 82, 34, 18, 103, 39, 87, 115, 24, 57, 88, 55, 124, 62, 22, 23, 104, 80, 20, 16, 107, 126, 97, 27, 89, 110, 84, 56, 61, 58, 14, 83, 47, 53, 49, 12, 77, 31, 113, 25, 78, 93, 19, 99, 13, 28, 109, 48, 92, 29, 21, 46, 45, 30, 75, 76, 85, 112, 33, 43, 106, 42, 11, 73, 37, 26, 71, 35, 79, 121, 15, 9, 90, 7, 36, 17, 8, 91, 94, 111, 101, 74, 32, 10, 72, 69, 64, 81, 67, 65, 0, 2, 68, 96, 4, 5, 6, 100, 70, 66, 3, 1], [59, 60, 118, 54, 116, 119, 122, 63, 120, 51, 108, 50, 114, 38, 105, 44, 40, 86, 117, 125, 123, 102, 127, 18, 41, 124, 87, 126, 39, 52, 82, 88, 98, 55, 95, 22, 103, 49, 24, 115, 57, 34, 16, 23, 80, 20, 78, 62, 84, 53, 47, 56, 83, 104, 27, 97, 93, 13, 77, 29, 89, 14, 19, 110, 111, 61, 107, 113, 85, 75, 58, 31, 12, 48, 99, 76, 28, 21, 25, 109, 9, 71, 45, 92, 106, 79, 43, 11, 90, 37, 33, 26, 46, 42, 30, 121, 112, 73, 35, 7, 15, 36, 81, 94, 8, 32, 72, 67, 74, 17, 91, 64, 10, 68, 69, 3, 2, 0, 5, 101, 6, 1, 70, 65, 4, 66, 96, 100], [59, 60, 118, 54, 119, 122, 116, 120, 63, 51, 108, 50, 38, 114, 117, 44, 123, 105, 40, 86, 125, 87, 18, 102, 41, 98, 82, 55, 62, 124, 115, 39, 52, 34, 24, 95, 22, 57, 88, 103, 80, 127, 126, 23, 97, 47, 16, 84, 29, 48, 20, 78, 53, 89, 110, 27, 49, 61, 93, 56, 43, 92, 19, 58, 31, 42, 25, 83, 107, 104, 77, 113, 14, 75, 13, 99, 76, 85, 28, 112, 121, 12, 21, 46, 90, 106, 109, 79, 26, 11, 33, 30, 9, 15, 35, 73, 94, 111, 37, 45, 36, 71, 72, 8, 17, 7, 91, 81, 74, 5, 0, 101, 32, 64, 10, 3, 100, 68, 2, 67, 65, 69, 70, 96, 4, 1, 66, 6], [59, 60, 118, 54, 119, 116, 122, 63, 120, 51, 108, 114, 38, 50, 117, 44, 123, 105, 125, 40, 124, 41, 86, 98, 34, 102, 18, 39, 87, 127, 95, 62, 115, 52, 82, 24, 22, 55, 23, 57, 103, 88, 47, 16, 84, 53, 49, 97, 126, 80, 61, 20, 48, 58, 27, 104, 78, 56, 89, 13, 45, 19, 93, 113, 14, 92, 77, 31, 76, 110, 29, 83, 99, 107, 11, 85, 42, 12, 112, 28, 25, 75, 21, 26, 37, 121, 73, 43, 9, 46, 106, 111, 109, 33, 90, 79, 30, 15, 94, 7, 35, 72, 71, 36, 91, 17, 101, 32, 74, 0, 81, 64, 69, 100, 10, 65, 3, 8, 5, 68, 2, 96, 66, 67, 6, 4, 70, 1], [59, 60, 118, 54, 119, 116, 122, 63, 120, 51, 108, 114, 50, 38, 117, 125, 44, 40, 105, 123, 86, 102, 98, 127, 62, 95, 124, 87, 52, 41, 88, 82, 39, 18, 103, 115, 23, 34, 24, 57, 53, 16, 22, 126, 49, 84, 92, 110, 55, 58, 80, 97, 27, 47, 25, 20, 107, 56, 89, 31, 78, 83, 99, 42, 14, 104, 19, 121, 93, 77, 29, 48, 21, 13, 113, 12, 43, 45, 76, 28, 75, 85, 46, 73, 11, 72, 61, 15, 33, 109, 106, 30, 35, 112, 90, 37, 79, 26, 7, 111, 36, 91, 71, 9, 32, 94, 68, 17, 81, 0, 74, 69, 65, 70, 64, 100, 5, 67, 101, 3, 2, 10, 1, 66, 4, 6, 8, 96], [59, 60, 54, 118, 116, 119, 122, 63, 120, 51, 108, 50, 114, 38, 44, 117, 40, 125, 123, 105, 86, 102, 127, 98, 52, 124, 18, 41, 82, 39, 95, 87, 55, 57, 24, 34, 62, 88, 126, 22, 16, 97, 115, 53, 84, 49, 110, 47, 103, 58, 23, 80, 20, 83, 14, 29, 99, 104, 46, 48, 43, 56, 77, 89, 13, 61, 113, 78, 27, 93, 31, 25, 28, 73, 45, 76, 92, 85, 21, 107, 111, 19, 106, 121, 12, 75, 11, 112, 109, 72, 33, 15, 9, 42, 90, 35, 37, 79, 26, 71, 36, 94, 7, 81, 91, 74, 32, 5, 17, 30, 101, 10, 0, 64, 100, 68, 4, 70, 67, 2, 3, 69, 1, 96, 6, 8, 65, 66], [59, 60, 118, 54, 116, 119, 122, 63, 120, 51, 108, 38, 50, 114, 44, 117, 40, 125, 105, 123, 102, 86, 52, 98, 18, 127, 124, 82, 87, 95, 22, 39, 41, 88, 126, 115, 34, 55, 62, 24, 103, 97, 16, 23, 57, 84, 80, 20, 110, 47, 83, 58, 89, 27, 14, 53, 45, 56, 78, 49, 12, 19, 92, 29, 93, 61, 13, 107, 77, 104, 46, 21, 42, 43, 121, 99, 85, 28, 31, 76, 75, 9, 48, 15, 11, 79, 26, 112, 35, 73, 25, 106, 71, 90, 33, 72, 109, 113, 30, 37, 17, 36, 74, 94, 7, 81, 91, 32, 111, 64, 0, 101, 10, 100, 69, 4, 5, 66, 65, 8, 70, 3, 96, 6, 67, 2, 1, 68], [59, 60, 118, 54, 116, 119, 122, 120, 63, 51, 50, 108, 114, 38, 40, 123, 105, 44, 117, 125, 62, 102, 124, 98, 87, 41, 86, 55, 57, 95, 82, 126, 34, 22, 52, 47, 127, 24, 39, 18, 88, 103, 97, 115, 84, 27, 89, 104, 23, 20, 53, 16, 49, 29, 93, 80, 78, 99, 48, 31, 61, 58, 83, 92, 42, 121, 21, 45, 110, 13, 107, 25, 77, 14, 85, 46, 19, 111, 12, 43, 90, 33, 11, 75, 26, 112, 28, 37, 9, 56, 15, 94, 30, 109, 76, 113, 35, 91, 7, 73, 106, 72, 79, 71, 17, 32, 101, 100, 67, 74, 69, 36, 0, 10, 4, 2, 68, 1, 66, 64, 65, 5, 81, 3, 70, 96, 6, 8], [59, 60, 118, 54, 116, 119, 122, 63, 120, 51, 108, 114, 50, 38, 44, 40, 105, 123, 117, 125, 86, 82, 87, 41, 127, 124, 102, 95, 98, 52, 39, 126, 18, 24, 34, 62, 88, 57, 22, 55, 23, 47, 16, 80, 115, 58, 20, 97, 84, 78, 83, 103, 104, 27, 56, 14, 93, 89, 29, 12, 92, 110, 19, 13, 107, 77, 76, 75, 49, 53, 21, 121, 45, 48, 61, 85, 11, 99, 46, 31, 43, 42, 109, 79, 15, 28, 9, 7, 73, 71, 111, 112, 113, 106, 25, 30, 0, 26, 94, 64, 72, 37, 33, 90, 35, 69, 17, 65, 74, 3, 8, 6, 4, 68, 66, 91, 1, 67, 10, 101, 2, 81, 36, 32, 5, 70, 96, 100], [59, 60, 118, 54, 116, 119, 122, 120, 51, 63, 114, 108, 50, 38, 40, 117, 105, 44, 123, 102, 98, 127, 87, 125, 57, 41, 124, 18, 82, 95, 39, 52, 126, 86, 34, 62, 88, 24, 22, 103, 55, 97, 115, 58, 23, 84, 49, 20, 27, 16, 80, 104, 53, 31, 56, 89, 99, 47, 93, 78, 113, 29, 107, 83, 13, 121, 14, 110, 12, 77, 85, 25, 48, 109, 61, 43, 92, 45, 19, 28, 26, 106, 46, 21, 33, 90, 11, 94, 37, 75, 30, 76, 9, 111, 42, 35, 71, 15, 73, 112, 79, 36, 72, 91, 7, 8, 32, 64, 69, 74, 10, 17, 0, 66, 67, 1, 68, 6, 65, 101, 81, 3, 4, 5, 2, 96, 70, 100], [59, 60, 118, 54, 122, 116, 119, 120, 51, 63, 114, 108, 50, 38, 44, 125, 40, 105, 123, 117, 102, 86, 98, 52, 87, 41, 18, 127, 34, 82, 95, 124, 126, 88, 53, 57, 39, 115, 58, 55, 24, 80, 20, 47, 16, 103, 78, 97, 84, 22, 14, 23, 19, 31, 56, 62, 93, 13, 113, 49, 77, 27, 76, 42, 48, 83, 75, 43, 121, 89, 110, 85, 21, 12, 107, 29, 99, 92, 61, 104, 45, 25, 11, 46, 73, 109, 35, 15, 79, 9, 28, 106, 26, 30, 8, 90, 71, 7, 33, 112, 37, 94, 111, 17, 64, 72, 0, 10, 74, 66, 69, 65, 4, 81, 100, 3, 91, 36, 5, 1, 6, 68, 32, 67, 70, 101, 2, 96], [59, 60, 118, 54, 116, 122, 119, 120, 51, 63, 114, 108, 50, 38, 44, 117, 40, 123, 125, 105, 55, 41, 102, 98, 52, 86, 95, 127, 126, 34, 124, 18, 53, 82, 115, 49, 87, 58, 39, 57, 84, 24, 23, 88, 80, 103, 62, 97, 47, 22, 20, 16, 113, 14, 31, 104, 89, 27, 110, 48, 99, 45, 19, 29, 78, 61, 107, 77, 85, 92, 12, 56, 121, 93, 21, 13, 76, 83, 11, 43, 109, 112, 42, 106, 73, 28, 75, 25, 33, 15, 8, 46, 9, 111, 26, 71, 79, 90, 35, 94, 37, 30, 7, 64, 69, 17, 74, 10, 81, 68, 32, 5, 1, 0, 2, 72, 4, 36, 91, 67, 101, 6, 70, 66, 3, 65, 100, 96], [59, 60, 54, 118, 116, 122, 119, 120, 63, 51, 108, 114, 50, 38, 123, 44, 125, 117, 40, 105, 86, 41, 52, 127, 98, 102, 55, 87, 82, 34, 95, 58, 115, 18, 126, 62, 22, 24, 53, 124, 23, 88, 80, 47, 39, 103, 16, 20, 57, 84, 97, 49, 27, 14, 12, 77, 104, 19, 92, 93, 89, 21, 13, 15, 48, 83, 110, 76, 31, 78, 61, 45, 85, 11, 107, 113, 43, 42, 29, 73, 33, 9, 99, 121, 28, 75, 109, 25, 56, 46, 26, 8, 79, 71, 90, 35, 7, 37, 106, 30, 94, 112, 111, 81, 74, 64, 17, 91, 5, 10, 0, 32, 68, 36, 1, 65, 67, 69, 72, 4, 6, 101, 70, 2, 100, 66, 3, 96]], "model.layers.9.self_attn.q_proj": [[110, 101, 46, 124, 63, 59, 28, 32, 113, 25, 89, 61, 19, 115, 27, 87, 16, 60, 57, 49, 122, 83, 85, 26, 78, 22, 47, 114, 55, 58, 56, 53, 96, 99, 37, 92, 108, 41, 23, 54, 48, 125, 30, 79, 51, 11, 119, 67, 84, 1, 24, 118, 69, 35, 62, 91, 120, 123, 116, 70, 43, 88, 97, 117, 121, 68, 106, 112, 90, 126, 109, 29, 40, 2, 77, 127, 45, 111, 8, 17, 86, 0, 74, 50, 82, 107, 15, 5, 42, 9, 66, 21, 18, 10, 52, 14, 6, 104, 105, 94, 36, 93, 100, 81, 65, 102, 44, 34, 33, 76, 20, 80, 31, 103, 3, 64, 7, 38, 4, 71, 95, 75, 73, 12, 13, 39, 98, 72], [110, 124, 101, 63, 46, 113, 25, 59, 80, 28, 24, 61, 89, 115, 57, 122, 60, 56, 83, 114, 32, 53, 58, 47, 125, 27, 22, 51, 41, 54, 119, 49, 78, 55, 40, 11, 112, 118, 62, 48, 123, 43, 96, 120, 106, 69, 23, 121, 37, 108, 52, 117, 127, 50, 42, 109, 19, 104, 45, 100, 111, 116, 35, 126, 105, 16, 107, 85, 91, 87, 102, 30, 38, 39, 31, 44, 33, 92, 34, 103, 84, 36, 86, 20, 17, 94, 88, 26, 97, 73, 98, 29, 99, 93, 8, 90, 95, 77, 72, 14, 18, 1, 21, 12, 67, 81, 5, 79, 76, 15, 82, 70, 4, 75, 2, 0, 10, 9, 7, 13, 6, 74, 66, 3, 68, 71, 64, 65], [110, 124, 46, 101, 63, 59, 89, 61, 11, 37, 113, 57, 25, 122, 60, 115, 69, 56, 53, 19, 54, 58, 47, 55, 32, 114, 120, 48, 16, 51, 49, 41, 117, 118, 116, 28, 112, 50, 62, 27, 109, 24, 119, 22, 121, 111, 125, 92, 127, 99, 126, 123, 43, 87, 45, 52, 82, 30, 107, 108, 39, 100, 72, 35, 103, 42, 105, 106, 1, 96, 104, 83, 40, 0, 4, 44, 23, 78, 2, 86, 77, 80, 26, 102, 15, 38, 29, 34, 85, 97, 90, 84, 67, 73, 36, 31, 33, 14, 88, 94, 5, 6, 98, 95, 18, 17, 9, 93, 91, 7, 21, 8, 20, 68, 79, 12, 81, 75, 66, 10, 13, 76, 71, 70, 74, 64, 3, 65], [124, 110, 101, 59, 46, 89, 63, 61, 19, 28, 115, 25, 83, 60, 113, 56, 27, 57, 23, 114, 122, 24, 118, 54, 78, 53, 55, 22, 125, 119, 47, 49, 58, 120, 48, 51, 37, 26, 107, 93, 32, 50, 81, 96, 117, 121, 99, 112, 11, 106, 43, 62, 123, 92, 127, 41, 35, 109, 108, 17, 116, 111, 103, 45, 42, 36, 104, 52, 44, 82, 126, 40, 100, 105, 30, 39, 38, 85, 88, 95, 14, 12, 34, 86, 87, 8, 98, 97, 31, 33, 102, 29, 16, 91, 94, 77, 69, 84, 18, 21, 73, 20, 15, 90, 80, 75, 76, 67, 13, 5, 79, 72, 9, 6, 74, 7, 1, 10, 70, 4, 3, 71, 2, 0, 65, 68, 66, 64], [117, 41, 59, 61, 97, 126, 101, 39, 31, 87, 55, 43, 42, 44, 28, 100, 124, 112, 110, 116, 35, 92, 102, 113, 96, 122, 57, 86, 36, 121, 127, 89, 54, 58, 114, 63, 38, 49, 52, 115, 93, 125, 47, 94, 51, 60, 53, 105, 99, 120, 16, 80, 48, 111, 32, 45, 50, 84, 46, 98, 29, 104, 19, 56, 118, 109, 123, 108, 40, 62, 27, 78, 33, 24, 107, 23, 74, 18, 26, 17, 25, 83, 37, 85, 75, 30, 91, 22, 20, 119, 21, 34, 14, 106, 12, 88, 9, 68, 81, 13, 103, 15, 77, 10, 3, 7, 76, 73, 95, 11, 71, 79, 4, 82, 66, 8, 90, 2, 70, 5, 6, 0, 72, 69, 64, 67, 65, 1], [41, 64, 107, 101, 117, 31, 67, 63, 1, 56, 69, 55, 26, 97, 42, 126, 112, 87, 105, 2, 72, 10, 39, 18, 84, 79, 76, 98, 71, 124, 65, 43, 106, 28, 78, 4, 35, 9, 111, 6, 44, 100, 53, 70, 29, 93, 23, 114, 80, 12, 121, 3, 74, 58, 86, 66, 20, 92, 94, 57, 5, 81, 11, 89, 59, 127, 33, 110, 102, 99, 83, 38, 75, 7, 48, 125, 109, 8, 61, 19, 15, 17, 77, 27, 21, 0, 46, 113, 88, 62, 118, 68, 50, 85, 45, 116, 32, 54, 104, 47, 24, 25, 120, 52, 122, 96, 40, 13, 51, 115, 103, 60, 90, 30, 34, 14, 73, 119, 123, 49, 82, 36, 16, 108, 91, 22, 95, 37], [41, 117, 26, 87, 31, 42, 101, 97, 56, 98, 18, 124, 35, 84, 105, 79, 15, 23, 12, 76, 55, 39, 57, 78, 43, 20, 80, 7, 59, 112, 93, 106, 118, 44, 74, 90, 61, 102, 10, 89, 100, 63, 28, 83, 114, 126, 107, 82, 58, 9, 53, 25, 104, 92, 69, 81, 14, 48, 8, 96, 36, 21, 127, 95, 86, 70, 24, 49, 121, 110, 99, 11, 29, 60, 111, 22, 46, 27, 17, 62, 85, 51, 122, 109, 16, 4, 52, 120, 125, 32, 5, 19, 77, 33, 13, 123, 40, 54, 45, 91, 2, 30, 67, 94, 71, 47, 3, 73, 115, 38, 113, 116, 72, 88, 64, 108, 50, 37, 103, 66, 65, 75, 68, 34, 0, 119, 6, 1], [117, 56, 41, 61, 107, 43, 97, 39, 31, 126, 35, 44, 118, 100, 42, 57, 110, 89, 121, 102, 36, 105, 46, 87, 63, 48, 101, 94, 52, 124, 92, 28, 59, 53, 113, 50, 86, 55, 38, 98, 54, 122, 49, 51, 29, 112, 111, 25, 45, 115, 108, 83, 62, 123, 125, 127, 80, 40, 47, 96, 104, 19, 99, 114, 27, 58, 116, 120, 60, 33, 109, 34, 88, 26, 18, 32, 93, 85, 9, 23, 106, 22, 64, 21, 24, 16, 119, 84, 30, 17, 103, 13, 91, 37, 77, 14, 90, 1, 2, 81, 69, 78, 67, 76, 95, 11, 79, 4, 82, 66, 73, 71, 68, 7, 70, 10, 75, 72, 20, 5, 12, 74, 6, 8, 65, 15, 3, 0], [105, 98, 86, 18, 84, 53, 79, 81, 88, 13, 111, 41, 76, 119, 74, 51, 72, 55, 29, 10, 117, 52, 7, 61, 2, 63, 123, 114, 68, 110, 107, 71, 6, 126, 31, 59, 70, 26, 125, 116, 5, 15, 4, 17, 48, 28, 120, 50, 118, 8, 127, 108, 77, 75, 90, 12, 27, 1, 3, 82, 24, 85, 92, 11, 109, 21, 122, 42, 124, 20, 38, 106, 9, 62, 40, 57, 93, 78, 25, 113, 33, 49, 22, 66, 80, 56, 16, 67, 58, 39, 30, 32, 60, 19, 23, 102, 91, 37, 54, 46, 87, 89, 64, 100, 96, 43, 115, 36, 83, 101, 14, 112, 47, 94, 95, 121, 103, 44, 35, 97, 99, 104, 0, 45, 34, 69, 73, 65], [105, 98, 84, 79, 88, 53, 18, 72, 41, 86, 111, 5, 81, 51, 2, 13, 63, 15, 76, 10, 29, 55, 52, 6, 68, 114, 119, 126, 74, 116, 110, 12, 123, 48, 92, 7, 117, 61, 28, 26, 107, 71, 9, 59, 27, 11, 118, 4, 87, 90, 31, 3, 93, 8, 85, 127, 125, 24, 122, 70, 109, 78, 1, 58, 20, 108, 50, 120, 124, 77, 91, 38, 66, 42, 49, 17, 22, 40, 39, 60, 30, 102, 69, 95, 67, 57, 54, 34, 106, 83, 23, 37, 62, 89, 32, 101, 16, 82, 21, 75, 96, 36, 19, 56, 33, 35, 121, 43, 112, 103, 113, 80, 46, 73, 94, 25, 44, 99, 14, 115, 104, 100, 0, 97, 45, 64, 47, 65], [105, 98, 84, 41, 72, 86, 53, 6, 18, 2, 79, 12, 4, 1, 76, 68, 71, 111, 74, 51, 13, 0, 3, 67, 81, 88, 66, 5, 114, 126, 63, 61, 123, 119, 8, 117, 26, 52, 116, 55, 48, 118, 125, 107, 127, 29, 65, 110, 59, 27, 10, 92, 93, 7, 85, 90, 20, 17, 109, 34, 120, 24, 9, 87, 50, 28, 122, 124, 22, 58, 62, 78, 70, 80, 49, 42, 32, 57, 108, 31, 30, 106, 83, 101, 35, 25, 56, 54, 21, 102, 77, 91, 14, 82, 113, 95, 64, 89, 15, 112, 38, 75, 97, 39, 36, 43, 115, 60, 40, 33, 121, 23, 19, 103, 37, 94, 11, 100, 46, 104, 45, 99, 69, 16, 96, 47, 44, 73], [105, 98, 74, 3, 70, 0, 86, 53, 111, 4, 1, 81, 18, 79, 76, 61, 5, 41, 67, 66, 119, 13, 117, 71, 8, 84, 2, 51, 110, 123, 68, 114, 55, 12, 52, 72, 126, 50, 125, 65, 120, 116, 107, 63, 17, 6, 9, 20, 59, 57, 64, 118, 88, 29, 109, 48, 10, 127, 49, 56, 38, 62, 73, 124, 34, 26, 7, 22, 40, 106, 78, 87, 23, 30, 15, 16, 28, 11, 24, 69, 93, 92, 19, 77, 89, 80, 83, 14, 60, 31, 94, 113, 75, 42, 39, 32, 82, 85, 108, 122, 54, 25, 43, 58, 121, 27, 90, 21, 96, 102, 115, 44, 101, 99, 112, 91, 36, 46, 95, 35, 33, 103, 104, 45, 37, 100, 47, 97], [125, 62, 104, 48, 63, 119, 122, 120, 59, 108, 55, 57, 118, 54, 121, 116, 58, 60, 27, 124, 113, 114, 115, 117, 84, 123, 111, 61, 97, 56, 53, 50, 43, 51, 47, 30, 44, 87, 52, 49, 126, 127, 107, 45, 109, 25, 110, 89, 12, 46, 21, 105, 28, 22, 72, 99, 112, 103, 41, 36, 38, 94, 106, 34, 5, 98, 90, 39, 102, 101, 20, 100, 1, 37, 92, 14, 9, 66, 96, 68, 42, 19, 16, 83, 35, 32, 79, 17, 29, 33, 95, 2, 85, 69, 31, 81, 0, 23, 40, 3, 91, 77, 86, 88, 18, 4, 93, 6, 15, 64, 82, 71, 76, 13, 11, 26, 24, 10, 73, 80, 78, 75, 74, 7, 8, 70, 65, 67], [62, 125, 48, 104, 63, 119, 122, 120, 59, 108, 97, 115, 55, 58, 54, 57, 60, 117, 114, 116, 121, 52, 124, 113, 50, 118, 47, 123, 61, 56, 111, 84, 36, 53, 127, 51, 89, 49, 109, 126, 44, 87, 5, 43, 72, 45, 14, 96, 69, 103, 46, 112, 107, 12, 30, 41, 110, 1, 106, 102, 105, 68, 2, 27, 32, 101, 66, 37, 28, 39, 38, 42, 24, 88, 98, 21, 25, 20, 22, 19, 35, 95, 99, 92, 100, 31, 64, 34, 23, 40, 83, 3, 93, 4, 15, 90, 91, 71, 76, 29, 85, 75, 79, 0, 6, 18, 94, 78, 9, 8, 17, 73, 77, 81, 13, 16, 7, 26, 10, 80, 33, 86, 11, 70, 74, 67, 82, 65], [104, 125, 62, 48, 63, 97, 119, 30, 21, 57, 122, 60, 59, 28, 127, 115, 118, 120, 116, 108, 55, 85, 84, 27, 54, 121, 113, 126, 123, 53, 114, 49, 56, 25, 111, 58, 117, 61, 22, 89, 124, 87, 52, 47, 9, 112, 109, 90, 44, 107, 51, 50, 45, 43, 40, 35, 5, 79, 46, 36, 105, 103, 94, 66, 1, 18, 68, 24, 41, 78, 92, 16, 12, 34, 15, 2, 98, 99, 102, 38, 95, 110, 72, 39, 19, 32, 3, 14, 96, 69, 11, 106, 101, 64, 31, 42, 37, 0, 6, 93, 91, 73, 100, 88, 83, 17, 23, 75, 20, 8, 29, 81, 4, 77, 26, 33, 13, 70, 71, 74, 7, 10, 80, 86, 76, 67, 65, 82], [104, 62, 125, 90, 30, 97, 63, 122, 84, 18, 19, 115, 23, 60, 80, 22, 16, 119, 27, 74, 79, 38, 57, 49, 26, 36, 78, 55, 20, 54, 59, 39, 82, 13, 124, 99, 120, 108, 94, 116, 123, 112, 45, 87, 21, 121, 111, 118, 114, 28, 83, 24, 113, 61, 126, 33, 88, 50, 58, 44, 127, 56, 17, 75, 48, 107, 10, 53, 102, 92, 34, 81, 100, 91, 109, 117, 96, 43, 51, 101, 47, 77, 103, 52, 14, 46, 25, 93, 15, 105, 37, 41, 31, 106, 6, 42, 110, 73, 98, 85, 70, 40, 5, 89, 12, 35, 32, 76, 95, 86, 68, 8, 29, 4, 7, 11, 71, 3, 9, 64, 1, 66, 65, 72, 67, 69, 0, 2], [103, 124, 55, 56, 21, 33, 15, 57, 29, 113, 81, 83, 91, 88, 27, 122, 123, 125, 93, 41, 10, 25, 35, 49, 98, 114, 24, 12, 78, 61, 105, 60, 48, 30, 109, 86, 115, 22, 52, 120, 112, 85, 53, 51, 110, 58, 31, 116, 43, 62, 45, 17, 94, 108, 44, 84, 100, 42, 107, 102, 70, 119, 111, 90, 126, 23, 18, 76, 118, 121, 117, 47, 36, 46, 4, 106, 127, 26, 89, 54, 63, 32, 50, 59, 20, 19, 11, 99, 8, 28, 40, 79, 101, 96, 13, 104, 37, 80, 38, 82, 95, 92, 16, 77, 34, 87, 72, 6, 65, 14, 2, 75, 97, 9, 5, 68, 39, 73, 74, 71, 7, 69, 0, 66, 67, 1, 64, 3], [103, 124, 55, 57, 88, 33, 56, 21, 105, 81, 113, 93, 123, 29, 15, 83, 23, 41, 122, 91, 125, 19, 49, 70, 78, 27, 114, 10, 12, 44, 86, 24, 126, 117, 45, 90, 98, 109, 2, 22, 58, 76, 52, 74, 4, 108, 120, 30, 60, 118, 107, 115, 48, 111, 106, 84, 110, 43, 65, 5, 85, 51, 35, 53, 119, 32, 18, 38, 39, 102, 17, 112, 99, 95, 34, 100, 104, 79, 47, 68, 54, 116, 92, 101, 42, 121, 20, 97, 61, 62, 31, 26, 46, 28, 96, 127, 87, 36, 11, 6, 50, 82, 94, 73, 37, 14, 89, 75, 59, 77, 8, 25, 16, 67, 40, 7, 63, 80, 0, 13, 9, 69, 72, 71, 1, 66, 64, 3], [103, 56, 124, 55, 57, 33, 88, 29, 113, 21, 15, 93, 105, 23, 86, 10, 77, 81, 114, 123, 91, 122, 64, 67, 125, 120, 78, 109, 0, 45, 85, 83, 60, 49, 61, 51, 65, 41, 52, 115, 53, 58, 119, 13, 110, 116, 112, 118, 27, 34, 2, 108, 48, 73, 24, 84, 71, 62, 42, 63, 89, 30, 50, 90, 14, 8, 6, 46, 126, 117, 107, 43, 70, 35, 26, 11, 17, 44, 121, 47, 111, 80, 3, 127, 20, 54, 59, 102, 7, 104, 100, 87, 37, 38, 82, 79, 106, 40, 22, 31, 9, 16, 97, 99, 36, 1, 18, 28, 101, 98, 66, 76, 12, 32, 25, 94, 68, 96, 4, 75, 92, 95, 5, 74, 19, 72, 69, 39], [124, 103, 55, 57, 41, 123, 88, 56, 125, 70, 114, 74, 122, 33, 19, 113, 21, 93, 110, 115, 23, 45, 49, 109, 120, 118, 52, 58, 116, 61, 15, 53, 108, 2, 112, 48, 83, 51, 60, 43, 35, 117, 81, 62, 29, 119, 126, 105, 73, 44, 50, 91, 121, 84, 127, 65, 111, 47, 46, 12, 107, 86, 42, 54, 63, 98, 30, 68, 106, 39, 14, 22, 59, 78, 100, 79, 27, 40, 102, 101, 10, 90, 5, 4, 38, 104, 67, 96, 99, 92, 24, 13, 36, 37, 34, 75, 26, 69, 20, 32, 8, 97, 85, 87, 17, 7, 28, 80, 25, 82, 18, 31, 76, 11, 94, 0, 71, 95, 16, 6, 77, 89, 9, 72, 3, 64, 1, 66], [57, 126, 39, 34, 61, 45, 95, 111, 120, 125, 60, 21, 51, 63, 58, 87, 52, 90, 54, 113, 114, 119, 117, 123, 109, 55, 127, 48, 50, 62, 118, 112, 122, 19, 106, 53, 56, 49, 46, 72, 59, 43, 115, 124, 110, 5, 82, 116, 108, 100, 41, 22, 36, 121, 47, 105, 17, 104, 28, 44, 26, 92, 107, 102, 88, 35, 73, 9, 42, 79, 14, 24, 16, 40, 38, 37, 77, 76, 78, 89, 99, 101, 27, 18, 30, 97, 75, 10, 32, 71, 93, 83, 98, 1, 33, 94, 13, 81, 66, 96, 29, 86, 25, 91, 103, 85, 8, 67, 11, 4, 2, 0, 20, 12, 74, 31, 70, 23, 15, 84, 69, 65, 3, 6, 7, 80, 68, 64], [39, 126, 57, 34, 95, 90, 19, 87, 60, 120, 21, 26, 61, 17, 92, 11, 111, 48, 79, 28, 31, 5, 71, 72, 22, 10, 45, 63, 117, 84, 27, 9, 58, 70, 102, 42, 4, 67, 100, 125, 127, 105, 40, 36, 108, 51, 83, 113, 99, 16, 101, 13, 38, 73, 41, 54, 96, 75, 46, 35, 53, 77, 124, 122, 52, 106, 66, 50, 44, 37, 49, 81, 59, 47, 112, 118, 62, 23, 104, 109, 119, 33, 43, 116, 85, 115, 114, 121, 110, 30, 32, 55, 123, 29, 97, 107, 25, 56, 0, 24, 20, 89, 65, 93, 15, 7, 80, 94, 18, 98, 91, 74, 1, 88, 76, 78, 6, 68, 12, 14, 82, 103, 69, 86, 8, 2, 3, 64], [39, 57, 126, 34, 95, 87, 60, 21, 19, 90, 61, 16, 120, 73, 28, 26, 22, 79, 75, 17, 77, 48, 111, 125, 5, 58, 51, 83, 13, 63, 9, 92, 45, 104, 74, 33, 82, 113, 118, 27, 112, 122, 109, 127, 18, 123, 7, 71, 78, 20, 10, 89, 117, 62, 23, 119, 31, 30, 49, 50, 36, 115, 53, 56, 81, 94, 102, 54, 24, 84, 76, 52, 70, 25, 67, 47, 96, 68, 55, 110, 114, 91, 44, 88, 85, 116, 108, 41, 40, 8, 15, 6, 2, 46, 38, 121, 99, 80, 1, 105, 100, 37, 106, 14, 29, 97, 124, 93, 42, 98, 35, 12, 32, 72, 101, 107, 59, 0, 3, 43, 86, 11, 69, 103, 64, 4, 65, 66], [39, 57, 126, 34, 120, 87, 95, 90, 19, 60, 26, 92, 11, 61, 21, 111, 36, 17, 72, 22, 5, 48, 117, 63, 125, 71, 9, 4, 127, 67, 58, 83, 16, 79, 75, 70, 66, 73, 52, 51, 27, 113, 13, 122, 91, 119, 53, 50, 115, 45, 46, 31, 10, 28, 124, 65, 123, 23, 55, 118, 109, 54, 56, 114, 40, 62, 20, 121, 15, 18, 32, 49, 0, 77, 7, 102, 81, 44, 47, 82, 37, 85, 112, 101, 38, 106, 43, 110, 100, 35, 24, 116, 80, 107, 105, 78, 99, 33, 76, 41, 74, 94, 59, 1, 14, 84, 108, 97, 30, 104, 86, 42, 88, 96, 68, 93, 89, 6, 25, 98, 8, 12, 3, 2, 69, 29, 64, 103], [59, 61, 53, 101, 27, 88, 76, 78, 18, 16, 25, 20, 22, 85, 29, 69, 7, 96, 28, 75, 9, 30, 119, 62, 100, 56, 87, 66, 39, 14, 126, 83, 82, 117, 90, 37, 2, 71, 43, 67, 3, 12, 91, 72, 21, 79, 80, 74, 77, 73, 84, 0, 33, 24, 26, 93, 81, 65, 15, 23, 19, 6, 17, 1, 31, 54, 94, 8, 92, 36, 50, 10, 89, 5, 97, 105, 111, 41, 95, 13, 35, 106, 108, 116, 107, 42, 70, 103, 68, 113, 86, 32, 124, 48, 49, 4, 11, 99, 44, 60, 98, 64, 34, 55, 102, 63, 115, 120, 46, 52, 125, 104, 121, 123, 122, 110, 45, 57, 58, 127, 40, 47, 109, 118, 38, 114, 112, 51], [53, 59, 61, 100, 119, 56, 107, 50, 124, 52, 126, 113, 123, 116, 60, 54, 62, 115, 117, 120, 122, 58, 23, 125, 63, 110, 121, 57, 55, 111, 77, 45, 108, 39, 49, 114, 37, 43, 127, 48, 109, 112, 47, 118, 90, 51, 42, 46, 103, 104, 44, 38, 92, 41, 40, 101, 106, 32, 98, 105, 36, 84, 33, 99, 28, 31, 102, 35, 34, 17, 73, 26, 97, 95, 3, 94, 15, 64, 2, 11, 30, 12, 5, 74, 14, 96, 82, 70, 13, 29, 93, 83, 24, 85, 27, 7, 1, 25, 69, 19, 68, 87, 72, 66, 21, 71, 91, 79, 80, 76, 9, 8, 4, 20, 75, 16, 81, 67, 89, 18, 78, 6, 88, 22, 86, 65, 0, 10], [59, 53, 61, 101, 88, 71, 75, 18, 78, 27, 16, 76, 2, 22, 69, 25, 20, 9, 68, 117, 56, 0, 119, 79, 5, 74, 81, 66, 85, 91, 93, 14, 7, 96, 72, 11, 83, 10, 87, 24, 82, 90, 28, 30, 15, 6, 13, 80, 39, 3, 4, 89, 43, 29, 21, 32, 84, 12, 116, 23, 111, 33, 26, 70, 19, 35, 64, 41, 92, 94, 1, 8, 31, 100, 77, 49, 73, 67, 62, 17, 54, 37, 65, 99, 95, 103, 55, 126, 86, 124, 108, 38, 98, 50, 113, 34, 36, 97, 115, 60, 122, 105, 42, 44, 104, 123, 110, 47, 107, 102, 48, 121, 63, 52, 120, 106, 46, 58, 40, 45, 57, 125, 109, 127, 51, 112, 118, 114], [61, 53, 119, 124, 113, 63, 54, 47, 52, 57, 115, 117, 114, 120, 56, 50, 58, 111, 55, 118, 127, 125, 59, 62, 123, 121, 51, 110, 45, 49, 122, 116, 48, 112, 109, 126, 44, 43, 60, 108, 46, 42, 106, 107, 105, 32, 103, 36, 40, 101, 41, 100, 37, 33, 39, 104, 99, 86, 22, 102, 35, 23, 38, 34, 29, 95, 92, 98, 28, 97, 31, 96, 90, 83, 17, 94, 26, 19, 81, 20, 93, 30, 27, 25, 82, 16, 88, 87, 21, 18, 24, 80, 89, 14, 84, 85, 15, 91, 13, 78, 77, 79, 76, 67, 72, 74, 75, 70, 12, 10, 9, 71, 11, 1, 68, 0, 73, 69, 8, 6, 3, 2, 4, 5, 66, 64, 7, 65], [127, 124, 102, 33, 62, 51, 94, 56, 22, 18, 24, 121, 60, 116, 59, 37, 119, 39, 113, 122, 125, 114, 27, 91, 53, 93, 99, 30, 110, 38, 55, 126, 32, 25, 61, 58, 57, 49, 104, 118, 95, 112, 54, 28, 123, 48, 117, 47, 63, 50, 97, 46, 2, 111, 115, 108, 7, 120, 84, 29, 52, 45, 109, 31, 44, 21, 101, 71, 75, 107, 34, 106, 89, 3, 0, 90, 82, 85, 70, 35, 42, 98, 92, 103, 40, 78, 6, 105, 36, 41, 67, 65, 73, 83, 15, 43, 23, 20, 26, 14, 16, 87, 69, 88, 80, 68, 72, 96, 19, 100, 1, 76, 4, 64, 12, 77, 8, 66, 17, 79, 11, 10, 81, 74, 13, 5, 86, 9], [124, 102, 127, 47, 24, 33, 62, 94, 18, 51, 91, 84, 22, 75, 25, 36, 28, 121, 114, 56, 15, 37, 26, 60, 40, 110, 112, 116, 32, 53, 59, 122, 126, 119, 38, 27, 58, 79, 89, 20, 109, 107, 13, 125, 113, 11, 57, 87, 30, 123, 111, 34, 63, 55, 103, 35, 117, 48, 100, 49, 93, 46, 29, 54, 41, 45, 108, 9, 105, 61, 7, 42, 118, 31, 39, 85, 101, 88, 115, 106, 50, 80, 98, 96, 92, 99, 52, 120, 82, 90, 23, 104, 19, 3, 67, 97, 95, 44, 83, 17, 69, 21, 0, 16, 10, 76, 5, 71, 12, 74, 81, 2, 43, 14, 78, 86, 72, 68, 77, 73, 8, 4, 65, 66, 64, 6, 1, 70], [102, 127, 124, 94, 24, 33, 84, 13, 30, 93, 19, 22, 9, 25, 62, 18, 28, 17, 15, 92, 82, 88, 97, 16, 83, 73, 91, 38, 11, 23, 56, 51, 20, 79, 42, 85, 47, 112, 36, 89, 77, 69, 27, 46, 81, 31, 87, 95, 90, 106, 29, 67, 110, 119, 34, 2, 32, 80, 35, 86, 96, 8, 5, 0, 76, 26, 14, 12, 39, 122, 7, 74, 21, 78, 37, 10, 72, 108, 99, 71, 120, 3, 60, 59, 100, 114, 70, 123, 65, 107, 75, 63, 98, 116, 40, 68, 6, 103, 104, 105, 126, 101, 111, 4, 52, 55, 1, 57, 58, 54, 41, 66, 109, 45, 118, 50, 125, 64, 48, 117, 121, 53, 113, 115, 44, 43, 49, 61], [124, 127, 102, 47, 51, 62, 121, 56, 85, 59, 116, 119, 114, 60, 110, 113, 125, 58, 126, 53, 91, 122, 55, 123, 61, 112, 54, 48, 49, 63, 57, 50, 117, 118, 18, 109, 115, 108, 26, 41, 120, 111, 107, 101, 46, 105, 40, 45, 52, 21, 33, 38, 44, 25, 104, 42, 43, 36, 106, 39, 86, 22, 73, 99, 35, 37, 82, 76, 24, 34, 100, 30, 103, 79, 97, 94, 92, 3, 15, 98, 32, 29, 95, 96, 31, 27, 77, 90, 28, 89, 23, 75, 93, 65, 69, 7, 68, 84, 88, 17, 83, 12, 20, 81, 87, 0, 1, 72, 10, 66, 67, 19, 16, 80, 11, 64, 70, 6, 9, 5, 78, 71, 13, 14, 4, 74, 2, 8]], "model.layers.9.self_attn.k_proj": [[46, 37, 110, 96, 22, 63, 124, 99, 59, 113, 28, 49, 56, 122, 55, 114, 115, 54, 58, 57, 51, 61, 60, 119, 123, 53, 120, 48, 62, 25, 47, 85, 50, 121, 125, 112, 118, 127, 19, 117, 126, 116, 78, 43, 52, 8, 42, 108, 111, 109, 105, 107, 27, 40, 45, 65, 44, 30, 41, 106, 77, 26, 104, 17, 16, 103, 23, 39, 94, 102, 36, 7, 38, 4, 5, 3, 97, 95, 31, 29, 90, 81, 33, 35, 32, 82, 67, 74, 100, 93, 2, 21, 64, 18, 98, 101, 34, 76, 84, 24, 20, 14, 1, 87, 9, 73, 11, 10, 92, 91, 66, 13, 88, 12, 79, 15, 0, 80, 69, 75, 72, 6, 68, 70, 86, 71, 83, 89], [105, 117, 95, 56, 37, 58, 43, 106, 112, 34, 92, 23, 33, 108, 63, 114, 55, 29, 110, 103, 38, 111, 99, 22, 89, 26, 9, 78, 124, 46, 107, 119, 84, 50, 122, 87, 18, 82, 127, 2, 126, 0, 60, 51, 83, 4, 20, 14, 121, 98, 54, 65, 88, 12, 57, 120, 59, 69, 67, 16, 90, 115, 52, 40, 44, 109, 118, 62, 31, 36, 10, 42, 48, 70, 79, 24, 47, 76, 80, 41, 8, 30, 25, 100, 113, 96, 32, 27, 53, 101, 6, 125, 45, 17, 123, 64, 11, 91, 102, 15, 21, 86, 49, 116, 85, 13, 7, 104, 94, 61, 3, 97, 81, 72, 71, 1, 75, 5, 74, 66, 93, 35, 77, 73, 68, 19, 28, 39], [41, 0, 34, 47, 18, 74, 13, 81, 86, 117, 76, 8, 70, 53, 65, 55, 66, 79, 105, 71, 116, 84, 3, 4, 50, 63, 51, 112, 126, 59, 119, 111, 123, 43, 61, 127, 69, 44, 88, 106, 114, 46, 125, 93, 118, 67, 49, 87, 48, 98, 92, 85, 45, 115, 26, 90, 68, 122, 104, 52, 2, 27, 54, 42, 6, 124, 7, 73, 121, 120, 60, 30, 58, 29, 1, 31, 38, 83, 57, 110, 11, 95, 24, 37, 62, 10, 25, 77, 64, 75, 32, 12, 103, 5, 14, 19, 107, 9, 36, 72, 35, 80, 102, 39, 15, 91, 16, 28, 33, 109, 40, 17, 101, 97, 23, 89, 100, 56, 78, 20, 94, 113, 108, 96, 99, 82, 21, 22], [40, 125, 62, 22, 33, 94, 48, 63, 100, 122, 90, 18, 119, 117, 92, 57, 127, 16, 114, 60, 29, 52, 49, 47, 86, 19, 54, 78, 59, 87, 45, 58, 115, 109, 110, 120, 10, 84, 56, 123, 46, 106, 113, 108, 112, 89, 70, 61, 43, 99, 24, 116, 80, 126, 53, 124, 50, 51, 96, 107, 111, 118, 28, 77, 4, 121, 39, 2, 21, 38, 79, 55, 8, 44, 105, 42, 35, 41, 0, 74, 37, 23, 101, 88, 17, 102, 32, 91, 103, 31, 98, 36, 97, 81, 82, 34, 27, 75, 93, 95, 1, 30, 7, 11, 26, 3, 15, 73, 20, 13, 83, 76, 69, 25, 12, 65, 14, 72, 71, 67, 6, 104, 85, 64, 9, 5, 68, 66], [124, 39, 55, 56, 97, 93, 113, 88, 21, 83, 57, 91, 15, 122, 81, 114, 123, 125, 78, 105, 44, 117, 6, 11, 52, 23, 18, 10, 66, 4, 45, 64, 9, 43, 109, 69, 126, 76, 106, 22, 49, 12, 110, 47, 116, 53, 112, 120, 51, 80, 107, 61, 46, 58, 118, 62, 42, 32, 54, 1, 14, 41, 82, 119, 3, 50, 121, 127, 115, 27, 60, 111, 34, 89, 108, 33, 48, 101, 100, 38, 20, 63, 72, 102, 36, 84, 99, 59, 90, 25, 98, 35, 13, 37, 26, 96, 92, 19, 104, 94, 30, 95, 86, 40, 16, 73, 28, 87, 77, 7, 29, 85, 31, 5, 8, 71, 67, 75, 24, 103, 65, 68, 79, 0, 17, 2, 70, 74], [126, 57, 103, 31, 98, 61, 87, 90, 22, 60, 16, 92, 21, 120, 19, 17, 111, 48, 77, 79, 75, 125, 51, 7, 63, 58, 113, 74, 118, 84, 122, 53, 3, 127, 73, 115, 117, 6, 45, 55, 123, 65, 64, 100, 56, 119, 82, 62, 52, 116, 66, 49, 112, 50, 114, 99, 110, 46, 69, 44, 121, 88, 54, 38, 47, 76, 105, 124, 40, 109, 107, 68, 4, 28, 59, 15, 13, 36, 32, 91, 104, 41, 97, 43, 106, 101, 35, 20, 29, 30, 1, 102, 18, 42, 86, 10, 27, 108, 33, 5, 93, 80, 14, 25, 24, 37, 96, 94, 85, 23, 70, 34, 67, 89, 12, 81, 78, 83, 8, 2, 71, 11, 72, 9, 26, 39, 95, 0], [59, 61, 53, 37, 22, 16, 27, 18, 78, 32, 20, 119, 75, 88, 29, 25, 56, 76, 71, 85, 69, 79, 60, 116, 10, 8, 34, 2, 9, 81, 126, 98, 50, 105, 19, 113, 117, 15, 1, 108, 36, 96, 123, 122, 107, 103, 124, 87, 26, 115, 74, 39, 62, 120, 3, 0, 55, 52, 43, 13, 90, 111, 63, 70, 28, 54, 35, 6, 21, 38, 125, 109, 46, 106, 112, 30, 121, 49, 4, 47, 57, 65, 41, 17, 94, 58, 23, 104, 33, 110, 127, 77, 45, 68, 118, 48, 51, 114, 83, 97, 44, 99, 42, 93, 102, 5, 86, 72, 92, 95, 40, 89, 31, 80, 100, 73, 24, 67, 12, 64, 14, 82, 84, 7, 66, 91, 11, 101], [127, 124, 38, 22, 97, 51, 62, 30, 121, 24, 15, 114, 59, 111, 110, 18, 56, 60, 116, 112, 9, 119, 63, 91, 13, 122, 125, 93, 64, 48, 113, 17, 84, 11, 53, 5, 57, 126, 47, 106, 55, 54, 58, 102, 118, 94, 49, 117, 61, 123, 50, 66, 44, 16, 100, 19, 45, 115, 43, 101, 109, 120, 76, 25, 99, 88, 104, 108, 46, 28, 29, 10, 52, 23, 42, 103, 40, 105, 98, 41, 107, 37, 68, 26, 83, 1, 35, 14, 89, 95, 6, 87, 8, 79, 36, 71, 21, 81, 7, 96, 65, 34, 78, 74, 31, 33, 12, 39, 80, 85, 90, 86, 72, 32, 20, 3, 67, 75, 77, 92, 4, 27, 82, 69, 73, 2, 70, 0]], "model.layers.9.self_attn.qk_proj": [[124, 61, 127, 53, 125, 126, 62, 41, 57, 59, 110, 105, 117, 46, 56, 55, 51, 119, 114, 86, 63, 22, 113, 48, 98, 102, 82, 111, 112, 122, 116, 60, 52, 18, 123, 49, 58, 118, 26, 84, 50, 43, 39, 121, 28, 20, 17, 88, 47, 42, 81, 24, 101, 15, 38, 23, 29, 31, 79, 76, 103, 37, 13, 87, 97, 77, 34, 93, 33, 115, 45, 120, 74, 21, 10, 40, 0, 94, 64, 12, 54, 27, 66, 108, 107, 109, 89, 106, 16, 2, 7, 71, 44, 67, 91, 30, 19, 70, 90, 83, 78, 3, 95, 80, 92, 25, 32, 104, 85, 96, 72, 35, 8, 68, 6, 14, 99, 69, 75, 11, 9, 1, 36, 4, 100, 73, 5, 65], [124, 61, 127, 53, 126, 62, 125, 41, 59, 57, 110, 105, 117, 46, 56, 55, 63, 51, 86, 119, 114, 22, 48, 113, 111, 60, 112, 122, 98, 116, 49, 43, 82, 88, 118, 28, 24, 18, 102, 84, 123, 50, 37, 103, 26, 47, 52, 58, 39, 42, 15, 101, 87, 23, 77, 17, 20, 76, 115, 34, 45, 81, 31, 38, 79, 10, 0, 107, 13, 121, 33, 40, 97, 29, 120, 27, 54, 21, 64, 74, 94, 93, 30, 44, 12, 109, 83, 70, 108, 7, 89, 16, 90, 91, 25, 68, 106, 92, 96, 104, 100, 71, 80, 78, 85, 35, 95, 36, 66, 19, 8, 32, 4, 14, 11, 65, 99, 2, 5, 9, 75, 72, 69, 1, 3, 6, 73, 67], [124, 61, 127, 53, 126, 62, 41, 125, 59, 57, 105, 117, 110, 46, 56, 55, 63, 51, 119, 114, 86, 22, 113, 111, 48, 102, 98, 116, 60, 103, 112, 37, 122, 123, 47, 88, 39, 43, 115, 82, 18, 84, 28, 121, 118, 52, 38, 26, 34, 24, 49, 50, 20, 45, 120, 101, 87, 27, 23, 40, 17, 97, 31, 76, 42, 93, 109, 15, 81, 58, 94, 108, 54, 33, 79, 29, 106, 64, 70, 91, 77, 32, 74, 12, 44, 95, 0, 89, 10, 90, 30, 66, 107, 21, 100, 13, 83, 16, 4, 96, 7, 78, 99, 71, 8, 80, 85, 25, 92, 36, 104, 2, 35, 19, 11, 3, 68, 6, 9, 72, 67, 1, 14, 5, 75, 69, 65, 73], [124, 127, 53, 61, 125, 126, 62, 41, 57, 59, 110, 105, 46, 117, 56, 55, 63, 51, 114, 48, 86, 119, 22, 122, 111, 112, 118, 98, 60, 113, 116, 50, 49, 121, 102, 45, 18, 82, 47, 52, 43, 28, 88, 123, 38, 37, 58, 84, 115, 39, 103, 26, 17, 34, 87, 101, 42, 24, 31, 20, 54, 29, 120, 81, 93, 107, 33, 97, 23, 40, 27, 15, 94, 79, 108, 76, 0, 70, 44, 77, 92, 10, 106, 89, 13, 109, 96, 64, 12, 71, 74, 25, 66, 30, 80, 91, 35, 32, 95, 85, 21, 7, 8, 90, 100, 14, 104, 2, 83, 16, 19, 67, 69, 99, 36, 3, 78, 68, 11, 65, 72, 73, 6, 1, 4, 75, 9, 5], [124, 127, 53, 61, 125, 126, 62, 41, 59, 57, 110, 105, 117, 56, 46, 55, 51, 63, 119, 114, 116, 86, 22, 48, 98, 113, 82, 60, 18, 112, 39, 45, 102, 111, 123, 50, 122, 49, 84, 42, 24, 47, 118, 26, 38, 28, 121, 101, 52, 37, 88, 120, 34, 20, 103, 17, 115, 43, 58, 33, 87, 81, 64, 31, 40, 77, 29, 76, 23, 15, 97, 54, 93, 107, 2, 79, 44, 10, 108, 0, 12, 89, 7, 66, 13, 74, 27, 92, 109, 21, 104, 91, 96, 90, 80, 70, 94, 71, 95, 30, 106, 83, 35, 8, 78, 4, 68, 16, 85, 19, 6, 25, 65, 14, 9, 5, 75, 99, 72, 32, 11, 67, 1, 69, 100, 3, 73, 36], [124, 127, 61, 53, 126, 125, 62, 41, 57, 59, 110, 105, 117, 56, 46, 55, 51, 63, 119, 86, 114, 48, 22, 111, 113, 82, 98, 116, 58, 112, 49, 18, 122, 60, 26, 52, 102, 123, 88, 118, 28, 84, 103, 50, 47, 24, 39, 77, 45, 101, 13, 42, 43, 120, 15, 17, 20, 38, 87, 81, 79, 33, 31, 121, 34, 23, 76, 37, 54, 10, 29, 40, 97, 74, 12, 44, 7, 115, 107, 0, 83, 104, 21, 27, 2, 94, 89, 80, 106, 108, 96, 8, 71, 109, 30, 92, 64, 85, 19, 14, 16, 91, 93, 25, 95, 6, 4, 11, 78, 90, 66, 3, 75, 35, 70, 67, 9, 68, 32, 72, 69, 99, 100, 5, 65, 36, 73, 1], [124, 127, 61, 53, 126, 125, 62, 41, 59, 57, 110, 105, 117, 56, 46, 55, 51, 86, 22, 63, 119, 114, 48, 112, 116, 82, 111, 18, 52, 26, 49, 122, 118, 60, 84, 98, 20, 102, 120, 123, 28, 58, 88, 50, 39, 24, 101, 79, 81, 29, 15, 76, 17, 103, 121, 43, 113, 38, 13, 47, 31, 42, 37, 12, 34, 45, 106, 77, 10, 87, 21, 33, 27, 115, 8, 107, 23, 44, 40, 54, 94, 19, 97, 80, 93, 83, 74, 89, 85, 91, 25, 14, 109, 108, 92, 96, 7, 3, 30, 90, 0, 67, 71, 95, 6, 72, 16, 99, 78, 64, 11, 66, 36, 104, 32, 35, 2, 69, 9, 100, 5, 68, 75, 70, 1, 4, 65, 73], [124, 127, 61, 53, 126, 125, 41, 62, 59, 57, 110, 105, 117, 56, 46, 55, 51, 63, 22, 86, 119, 114, 48, 98, 112, 18, 82, 88, 122, 116, 84, 28, 26, 24, 111, 20, 60, 113, 118, 79, 15, 103, 102, 58, 50, 123, 38, 52, 43, 49, 34, 101, 17, 37, 87, 31, 42, 76, 29, 45, 13, 83, 77, 81, 39, 10, 120, 12, 21, 121, 47, 23, 97, 40, 94, 27, 74, 33, 0, 104, 64, 19, 54, 107, 115, 108, 7, 30, 80, 93, 25, 106, 6, 96, 89, 85, 16, 14, 66, 90, 92, 44, 8, 91, 109, 78, 100, 11, 2, 99, 35, 95, 68, 75, 71, 65, 72, 32, 36, 70, 4, 1, 69, 9, 5, 3, 73, 67], [124, 127, 53, 61, 126, 125, 41, 62, 59, 57, 110, 117, 105, 56, 46, 51, 55, 63, 114, 119, 22, 86, 122, 48, 98, 111, 88, 18, 60, 112, 49, 26, 84, 118, 82, 113, 116, 50, 20, 24, 39, 28, 47, 37, 38, 58, 103, 43, 102, 123, 52, 29, 94, 15, 121, 17, 79, 87, 101, 34, 27, 77, 31, 33, 120, 81, 13, 107, 97, 45, 12, 76, 23, 42, 0, 115, 40, 85, 30, 21, 74, 54, 10, 6, 19, 96, 83, 93, 25, 89, 78, 64, 92, 108, 44, 90, 16, 104, 80, 91, 68, 7, 106, 67, 109, 2, 32, 66, 95, 3, 100, 71, 4, 36, 1, 72, 70, 99, 65, 14, 35, 11, 69, 75, 8, 5, 9, 73], [124, 127, 53, 61, 126, 41, 125, 59, 62, 57, 105, 110, 117, 56, 46, 55, 63, 51, 114, 22, 86, 119, 48, 112, 116, 122, 111, 18, 98, 118, 113, 88, 60, 102, 50, 58, 82, 84, 28, 52, 79, 24, 26, 49, 38, 103, 81, 47, 15, 87, 20, 39, 34, 17, 123, 43, 29, 37, 23, 120, 107, 121, 97, 74, 76, 101, 45, 42, 31, 33, 13, 77, 27, 12, 54, 115, 21, 93, 44, 94, 0, 10, 40, 78, 64, 6, 90, 92, 83, 104, 66, 96, 7, 19, 80, 85, 25, 109, 108, 30, 91, 106, 16, 68, 71, 89, 70, 2, 72, 4, 95, 67, 32, 14, 99, 100, 5, 35, 69, 8, 3, 65, 36, 11, 73, 75, 1, 9], [124, 53, 127, 61, 125, 126, 41, 57, 62, 59, 105, 110, 117, 46, 56, 51, 55, 63, 119, 114, 22, 86, 48, 113, 111, 116, 122, 112, 98, 102, 18, 60, 82, 118, 49, 52, 50, 39, 47, 58, 20, 123, 88, 42, 31, 26, 84, 87, 15, 24, 37, 28, 103, 79, 121, 64, 81, 17, 107, 43, 34, 120, 101, 54, 77, 38, 93, 29, 12, 76, 97, 13, 66, 33, 45, 23, 27, 7, 115, 10, 40, 0, 74, 71, 2, 78, 16, 44, 21, 94, 90, 25, 6, 92, 70, 30, 72, 95, 104, 109, 85, 108, 19, 106, 35, 96, 83, 5, 68, 89, 91, 32, 75, 69, 80, 65, 4, 9, 14, 11, 3, 67, 8, 100, 99, 73, 1, 36], [124, 127, 53, 61, 126, 125, 62, 41, 59, 57, 110, 105, 117, 56, 46, 51, 55, 63, 119, 114, 86, 22, 48, 122, 111, 82, 60, 18, 116, 52, 113, 98, 39, 112, 102, 49, 118, 26, 58, 50, 88, 47, 123, 20, 15, 84, 28, 24, 120, 101, 17, 13, 77, 81, 42, 121, 29, 37, 33, 12, 43, 79, 40, 87, 31, 103, 107, 54, 38, 16, 27, 76, 0, 64, 21, 44, 34, 74, 23, 19, 10, 45, 97, 78, 93, 115, 71, 94, 66, 30, 72, 89, 108, 7, 104, 92, 2, 5, 70, 96, 90, 3, 83, 25, 106, 91, 67, 85, 109, 68, 1, 95, 35, 32, 80, 69, 14, 8, 11, 4, 100, 6, 75, 9, 73, 99, 65, 36], [124, 127, 61, 53, 126, 125, 62, 41, 57, 59, 110, 105, 117, 56, 46, 55, 51, 63, 119, 22, 86, 114, 60, 112, 116, 18, 48, 113, 122, 88, 111, 49, 118, 98, 82, 24, 102, 50, 28, 52, 123, 20, 43, 39, 26, 84, 101, 58, 103, 15, 47, 38, 37, 29, 81, 13, 87, 120, 17, 33, 121, 97, 45, 79, 34, 107, 27, 31, 12, 42, 115, 21, 77, 74, 104, 93, 23, 40, 16, 76, 94, 10, 54, 91, 19, 44, 83, 89, 30, 25, 96, 90, 85, 108, 64, 70, 78, 72, 92, 100, 106, 32, 95, 71, 80, 109, 0, 75, 14, 36, 35, 7, 68, 66, 2, 99, 69, 4, 3, 67, 11, 5, 8, 6, 65, 9, 1, 73], [124, 127, 53, 61, 126, 125, 62, 41, 57, 59, 110, 117, 105, 56, 46, 55, 63, 51, 86, 119, 22, 114, 48, 112, 111, 98, 113, 60, 18, 58, 116, 122, 82, 88, 49, 28, 15, 118, 26, 47, 123, 13, 120, 24, 43, 102, 50, 84, 52, 103, 79, 20, 34, 64, 101, 87, 81, 39, 21, 42, 45, 29, 23, 94, 17, 38, 40, 121, 74, 77, 10, 83, 104, 70, 12, 0, 33, 31, 37, 93, 107, 54, 27, 97, 89, 19, 7, 71, 76, 16, 25, 14, 90, 108, 44, 2, 92, 80, 91, 72, 115, 78, 85, 95, 30, 106, 66, 75, 4, 73, 11, 32, 96, 5, 68, 35, 6, 69, 9, 8, 65, 36, 109, 1, 100, 99, 3, 67], [124, 127, 53, 61, 126, 125, 62, 41, 59, 57, 110, 105, 117, 56, 46, 55, 63, 51, 119, 86, 114, 48, 22, 60, 111, 112, 118, 113, 52, 123, 49, 82, 98, 47, 18, 101, 116, 102, 120, 122, 88, 39, 58, 26, 28, 24, 50, 103, 13, 20, 38, 84, 37, 34, 45, 43, 33, 79, 15, 87, 23, 42, 17, 121, 97, 31, 81, 40, 107, 27, 77, 29, 108, 7, 12, 21, 0, 54, 16, 19, 44, 74, 10, 64, 94, 70, 93, 104, 92, 76, 89, 106, 25, 71, 91, 72, 83, 90, 66, 115, 2, 80, 95, 14, 85, 78, 96, 32, 30, 3, 67, 5, 4, 9, 100, 99, 35, 109, 1, 68, 6, 11, 36, 75, 69, 73, 65, 8], [124, 127, 53, 61, 126, 125, 41, 57, 62, 59, 110, 105, 117, 56, 46, 55, 63, 51, 86, 119, 22, 114, 113, 48, 112, 60, 111, 18, 102, 98, 52, 122, 118, 49, 39, 116, 43, 82, 123, 84, 101, 28, 50, 47, 58, 120, 88, 103, 26, 20, 42, 24, 29, 33, 13, 17, 15, 81, 97, 38, 45, 34, 27, 37, 79, 12, 121, 31, 40, 23, 107, 87, 77, 94, 16, 108, 104, 10, 54, 64, 30, 21, 0, 91, 25, 7, 76, 115, 74, 90, 83, 89, 44, 35, 93, 92, 71, 70, 69, 96, 85, 19, 32, 66, 106, 67, 2, 78, 72, 95, 80, 100, 109, 99, 6, 68, 3, 75, 14, 4, 8, 5, 1, 36, 9, 11, 65, 73], [124, 127, 126, 53, 61, 125, 41, 57, 62, 59, 110, 105, 56, 46, 117, 55, 51, 63, 22, 86, 114, 48, 119, 112, 52, 18, 111, 116, 102, 123, 60, 113, 98, 28, 84, 49, 122, 82, 88, 118, 50, 101, 103, 20, 26, 39, 79, 24, 15, 47, 58, 17, 120, 37, 13, 38, 81, 107, 104, 31, 45, 33, 27, 121, 29, 43, 34, 10, 40, 21, 12, 74, 23, 94, 42, 19, 91, 87, 77, 16, 97, 32, 90, 115, 54, 25, 108, 76, 106, 93, 78, 35, 71, 83, 44, 64, 89, 85, 80, 96, 99, 0, 30, 92, 14, 6, 95, 8, 7, 68, 72, 109, 69, 4, 66, 70, 100, 75, 9, 5, 2, 65, 36, 1, 73, 67, 11, 3], [124, 127, 53, 126, 61, 125, 41, 59, 62, 57, 110, 105, 117, 56, 46, 55, 51, 22, 86, 63, 119, 114, 111, 48, 112, 122, 123, 82, 98, 52, 28, 18, 116, 49, 101, 102, 24, 84, 88, 50, 47, 26, 58, 38, 60, 20, 113, 118, 17, 103, 37, 81, 42, 15, 39, 79, 34, 31, 29, 43, 33, 21, 23, 13, 87, 97, 120, 40, 107, 45, 12, 121, 27, 74, 10, 89, 93, 77, 19, 90, 94, 83, 16, 104, 0, 25, 7, 91, 115, 54, 6, 78, 80, 92, 108, 76, 30, 44, 96, 106, 64, 85, 99, 2, 95, 71, 66, 14, 35, 75, 32, 8, 100, 67, 9, 3, 109, 36, 68, 70, 4, 73, 1, 72, 11, 69, 5, 65], [124, 127, 53, 61, 126, 41, 125, 59, 62, 57, 105, 110, 117, 46, 56, 55, 51, 86, 114, 63, 119, 22, 48, 118, 52, 111, 98, 122, 116, 60, 39, 112, 37, 123, 18, 47, 102, 49, 113, 103, 101, 82, 43, 58, 88, 84, 38, 115, 24, 34, 26, 81, 28, 31, 42, 50, 120, 107, 17, 121, 33, 20, 79, 13, 87, 40, 91, 15, 45, 54, 23, 106, 29, 108, 27, 97, 94, 0, 99, 64, 12, 93, 32, 89, 44, 10, 6, 21, 76, 19, 83, 95, 92, 85, 90, 74, 16, 100, 2, 35, 109, 96, 77, 30, 80, 25, 14, 66, 7, 104, 68, 36, 8, 4, 71, 67, 70, 1, 3, 75, 78, 9, 69, 5, 65, 73, 72, 11], [124, 127, 53, 61, 126, 125, 57, 41, 62, 59, 110, 105, 117, 46, 56, 55, 51, 86, 63, 119, 22, 114, 48, 112, 111, 113, 116, 60, 123, 49, 118, 98, 102, 103, 18, 39, 122, 88, 52, 47, 101, 58, 28, 26, 82, 50, 33, 84, 43, 24, 37, 121, 31, 38, 79, 15, 34, 87, 20, 97, 13, 42, 29, 17, 81, 21, 120, 107, 27, 45, 23, 115, 54, 74, 12, 94, 25, 10, 6, 93, 104, 40, 89, 19, 90, 92, 30, 77, 32, 91, 64, 76, 0, 108, 35, 83, 80, 7, 44, 95, 85, 96, 78, 106, 16, 8, 4, 109, 71, 68, 14, 2, 75, 66, 9, 70, 99, 100, 36, 5, 11, 72, 69, 1, 3, 65, 67, 73], [124, 61, 53, 127, 126, 125, 41, 57, 62, 59, 110, 105, 117, 56, 46, 55, 51, 63, 86, 119, 22, 114, 48, 112, 111, 116, 98, 18, 60, 102, 84, 52, 123, 122, 58, 118, 50, 113, 88, 82, 49, 101, 17, 42, 28, 26, 43, 13, 47, 121, 39, 20, 15, 33, 27, 81, 38, 79, 24, 29, 103, 120, 10, 87, 107, 31, 45, 77, 40, 23, 37, 115, 12, 54, 94, 34, 74, 95, 6, 30, 90, 25, 7, 93, 16, 64, 21, 104, 71, 66, 76, 78, 0, 2, 83, 89, 97, 91, 44, 5, 80, 85, 19, 92, 108, 106, 8, 3, 96, 75, 4, 9, 14, 69, 67, 70, 100, 109, 35, 32, 72, 36, 99, 68, 11, 73, 65, 1], [124, 127, 53, 61, 125, 126, 59, 41, 62, 57, 110, 117, 105, 56, 46, 55, 51, 63, 119, 86, 22, 114, 48, 112, 98, 111, 118, 58, 52, 18, 116, 102, 49, 50, 37, 82, 88, 101, 42, 122, 20, 60, 113, 115, 123, 38, 39, 84, 24, 28, 81, 47, 103, 29, 26, 43, 15, 17, 34, 54, 33, 120, 45, 87, 12, 121, 13, 31, 79, 64, 23, 40, 27, 93, 107, 106, 94, 97, 21, 44, 92, 10, 108, 77, 89, 16, 74, 30, 90, 76, 91, 83, 95, 25, 71, 109, 7, 19, 0, 96, 35, 78, 85, 8, 100, 66, 104, 80, 99, 70, 4, 2, 32, 14, 75, 6, 5, 36, 3, 11, 67, 69, 1, 72, 68, 9, 73, 65], [124, 127, 61, 53, 126, 125, 41, 62, 57, 59, 110, 105, 117, 56, 46, 63, 55, 51, 119, 86, 22, 114, 48, 98, 112, 111, 18, 37, 118, 38, 60, 24, 116, 84, 102, 82, 52, 58, 113, 20, 39, 123, 101, 88, 50, 115, 120, 28, 103, 45, 15, 81, 49, 26, 43, 42, 23, 31, 47, 122, 87, 29, 27, 17, 34, 33, 94, 40, 97, 12, 13, 10, 0, 54, 91, 107, 79, 77, 121, 44, 109, 93, 21, 104, 108, 74, 64, 83, 90, 76, 106, 95, 70, 19, 80, 71, 25, 96, 66, 30, 7, 92, 16, 78, 2, 85, 89, 32, 14, 8, 100, 68, 35, 99, 4, 75, 72, 73, 5, 69, 36, 3, 1, 11, 65, 9, 6, 67], [124, 127, 53, 61, 126, 125, 62, 41, 57, 59, 110, 105, 56, 117, 46, 63, 55, 51, 86, 119, 114, 22, 48, 58, 111, 98, 18, 116, 112, 60, 113, 118, 52, 82, 49, 88, 45, 50, 103, 123, 39, 84, 38, 102, 122, 37, 43, 42, 47, 28, 26, 20, 101, 24, 81, 27, 54, 79, 15, 87, 13, 120, 23, 31, 10, 40, 77, 33, 17, 107, 94, 29, 121, 34, 115, 97, 83, 85, 30, 76, 64, 12, 70, 71, 104, 91, 21, 44, 93, 108, 95, 90, 25, 74, 89, 7, 19, 92, 16, 80, 3, 66, 0, 106, 14, 35, 78, 2, 96, 68, 69, 8, 4, 32, 109, 9, 75, 100, 11, 67, 5, 99, 73, 36, 72, 1, 6, 65], [124, 127, 61, 53, 126, 125, 62, 57, 41, 59, 110, 117, 105, 46, 56, 55, 51, 63, 86, 119, 22, 114, 48, 60, 111, 113, 118, 98, 52, 82, 122, 47, 123, 49, 116, 18, 102, 112, 39, 58, 88, 101, 84, 26, 37, 50, 20, 28, 43, 17, 24, 45, 38, 121, 81, 31, 103, 27, 42, 23, 34, 79, 29, 97, 15, 33, 54, 87, 76, 115, 120, 12, 108, 13, 21, 40, 94, 77, 70, 91, 10, 107, 106, 74, 93, 0, 44, 30, 83, 25, 89, 2, 7, 96, 16, 19, 32, 85, 71, 92, 64, 90, 95, 80, 14, 109, 100, 73, 78, 8, 35, 99, 66, 72, 104, 4, 3, 75, 69, 36, 5, 67, 6, 11, 65, 1, 68, 9], [124, 61, 127, 53, 126, 125, 59, 57, 62, 41, 110, 105, 117, 56, 46, 55, 63, 86, 51, 119, 22, 114, 48, 60, 113, 112, 58, 49, 18, 111, 98, 122, 88, 116, 118, 52, 82, 50, 102, 101, 84, 26, 123, 39, 24, 121, 81, 38, 15, 37, 20, 34, 45, 103, 28, 17, 79, 115, 76, 42, 120, 29, 33, 47, 13, 31, 43, 107, 27, 87, 40, 77, 74, 21, 23, 10, 25, 12, 97, 93, 94, 54, 90, 30, 83, 89, 64, 91, 80, 44, 7, 70, 85, 96, 92, 19, 78, 104, 108, 100, 0, 99, 71, 106, 35, 109, 16, 95, 36, 32, 72, 8, 73, 14, 75, 6, 68, 66, 69, 11, 2, 4, 5, 9, 1, 67, 3, 65], [124, 127, 61, 53, 126, 125, 41, 59, 62, 57, 110, 105, 117, 56, 46, 55, 63, 51, 119, 86, 22, 114, 122, 48, 58, 60, 112, 98, 88, 82, 116, 52, 111, 49, 120, 26, 43, 34, 18, 50, 102, 113, 121, 24, 101, 45, 28, 27, 123, 37, 40, 84, 42, 118, 38, 103, 31, 20, 29, 15, 47, 115, 39, 17, 94, 81, 23, 87, 21, 33, 79, 97, 91, 77, 107, 12, 109, 13, 93, 96, 76, 74, 0, 54, 104, 95, 108, 44, 80, 89, 25, 10, 90, 106, 64, 85, 30, 19, 3, 99, 78, 100, 70, 83, 7, 92, 68, 71, 2, 36, 72, 66, 67, 16, 35, 5, 6, 32, 4, 8, 73, 14, 65, 69, 1, 11, 9, 75], [124, 127, 53, 61, 126, 125, 59, 57, 41, 62, 110, 117, 105, 56, 46, 55, 63, 51, 119, 86, 22, 114, 111, 48, 98, 58, 82, 112, 113, 116, 49, 60, 18, 122, 102, 26, 84, 52, 118, 28, 79, 88, 120, 50, 121, 103, 123, 47, 20, 43, 101, 34, 45, 33, 29, 24, 39, 76, 81, 0, 42, 15, 17, 38, 37, 77, 87, 31, 94, 64, 54, 27, 10, 40, 23, 74, 12, 2, 97, 93, 7, 13, 107, 83, 21, 71, 115, 66, 6, 19, 80, 85, 44, 4, 109, 72, 106, 95, 89, 104, 96, 91, 30, 1, 92, 78, 16, 99, 25, 5, 108, 90, 14, 35, 32, 67, 9, 75, 68, 11, 100, 36, 73, 65, 70, 69, 3, 8], [124, 127, 61, 53, 125, 126, 41, 57, 62, 59, 105, 110, 117, 46, 56, 63, 55, 51, 86, 119, 22, 114, 113, 48, 111, 98, 112, 52, 122, 102, 116, 18, 47, 37, 49, 82, 123, 60, 118, 58, 50, 20, 26, 39, 101, 103, 24, 88, 34, 81, 42, 87, 38, 45, 33, 31, 120, 29, 28, 43, 15, 0, 76, 40, 121, 84, 97, 23, 17, 27, 21, 77, 79, 93, 107, 94, 7, 44, 74, 64, 2, 10, 92, 13, 106, 6, 115, 19, 89, 91, 12, 96, 90, 54, 95, 109, 80, 30, 108, 100, 16, 71, 104, 72, 66, 25, 32, 5, 78, 14, 69, 35, 85, 36, 83, 11, 9, 99, 4, 67, 75, 1, 3, 65, 68, 8, 73, 70], [124, 127, 61, 53, 126, 125, 41, 62, 57, 59, 110, 105, 56, 117, 46, 55, 51, 63, 86, 119, 114, 113, 22, 111, 48, 52, 58, 112, 60, 116, 98, 18, 102, 47, 49, 82, 50, 103, 26, 84, 88, 123, 37, 24, 28, 15, 122, 118, 38, 20, 39, 87, 101, 31, 81, 43, 42, 34, 121, 77, 97, 33, 17, 45, 40, 13, 29, 76, 10, 27, 79, 0, 23, 54, 74, 115, 108, 94, 12, 107, 6, 21, 89, 64, 99, 83, 2, 104, 72, 80, 90, 30, 120, 93, 44, 25, 71, 16, 109, 67, 96, 7, 91, 66, 19, 3, 106, 14, 92, 85, 95, 4, 78, 35, 75, 32, 36, 11, 5, 65, 100, 1, 73, 69, 8, 68, 70, 9], [124, 127, 61, 53, 126, 125, 41, 59, 62, 57, 110, 105, 117, 46, 56, 55, 63, 51, 119, 114, 22, 86, 60, 113, 48, 98, 111, 112, 52, 116, 18, 58, 122, 102, 82, 84, 118, 24, 47, 50, 88, 28, 103, 123, 43, 15, 26, 121, 49, 42, 39, 101, 20, 38, 45, 31, 17, 87, 97, 120, 81, 76, 23, 34, 29, 77, 37, 13, 40, 54, 33, 0, 27, 21, 79, 7, 44, 115, 12, 94, 74, 10, 89, 107, 106, 96, 93, 6, 83, 108, 72, 25, 2, 30, 64, 95, 71, 80, 68, 92, 104, 16, 35, 91, 4, 14, 85, 90, 19, 66, 75, 36, 100, 11, 78, 99, 32, 109, 9, 73, 70, 5, 69, 65, 1, 8, 67, 3], [124, 127, 61, 126, 53, 125, 62, 57, 59, 41, 110, 105, 56, 117, 46, 55, 51, 63, 119, 114, 86, 22, 48, 113, 60, 98, 18, 111, 116, 112, 24, 49, 82, 103, 123, 84, 88, 118, 122, 52, 58, 28, 26, 102, 50, 15, 20, 17, 76, 81, 121, 79, 13, 45, 39, 38, 43, 34, 87, 101, 23, 77, 31, 94, 21, 115, 47, 97, 29, 74, 37, 12, 10, 33, 42, 120, 40, 27, 54, 107, 44, 7, 83, 30, 0, 93, 89, 64, 71, 25, 91, 78, 11, 92, 6, 16, 90, 104, 96, 108, 72, 80, 66, 19, 85, 14, 5, 95, 70, 2, 75, 67, 4, 32, 3, 69, 36, 8, 106, 109, 99, 35, 100, 65, 68, 73, 9, 1]], "model.layers.10.self_attn.q_proj": [[121, 122, 26, 65, 36, 83, 50, 79, 21, 24, 81, 63, 89, 29, 94, 9, 0, 64, 68, 16, 6, 19, 99, 1, 91, 78, 8, 17, 23, 2, 58, 77, 75, 111, 69, 124, 86, 12, 114, 71, 95, 82, 85, 44, 52, 14, 120, 76, 30, 37, 67, 51, 96, 5, 118, 57, 31, 54, 70, 11, 123, 74, 66, 93, 20, 84, 47, 27, 60, 127, 61, 40, 4, 13, 10, 3, 18, 125, 73, 80, 102, 87, 53, 49, 117, 115, 119, 62, 113, 116, 100, 72, 112, 55, 45, 7, 15, 126, 59, 90, 48, 105, 104, 110, 92, 41, 88, 33, 56, 46, 109, 35, 42, 25, 103, 22, 39, 107, 97, 38, 106, 108, 43, 34, 32, 28, 101, 98], [121, 122, 63, 87, 50, 36, 75, 52, 118, 28, 12, 124, 17, 120, 94, 123, 60, 61, 54, 125, 51, 33, 73, 29, 57, 62, 127, 119, 40, 117, 34, 44, 115, 53, 113, 102, 15, 49, 47, 58, 111, 19, 112, 55, 23, 70, 126, 110, 114, 59, 14, 109, 46, 105, 24, 56, 97, 116, 45, 104, 99, 38, 31, 32, 21, 71, 16, 86, 13, 39, 107, 48, 76, 106, 80, 37, 35, 85, 103, 25, 101, 20, 88, 91, 100, 92, 96, 98, 108, 93, 41, 5, 18, 84, 66, 95, 67, 26, 22, 89, 90, 42, 64, 27, 83, 30, 43, 79, 11, 3, 74, 82, 4, 1, 8, 77, 72, 81, 7, 65, 10, 9, 0, 6, 2, 69, 78, 68], [50, 121, 122, 114, 63, 56, 62, 115, 120, 60, 58, 51, 49, 116, 84, 87, 59, 119, 53, 11, 113, 52, 15, 127, 44, 17, 117, 109, 45, 55, 124, 61, 126, 54, 47, 57, 123, 48, 107, 14, 111, 46, 72, 118, 108, 125, 110, 43, 112, 7, 24, 2, 27, 78, 106, 25, 99, 101, 75, 41, 4, 85, 42, 16, 77, 80, 19, 82, 10, 105, 1, 81, 3, 74, 104, 18, 13, 12, 9, 28, 83, 94, 40, 37, 97, 6, 71, 90, 73, 102, 103, 21, 39, 22, 29, 38, 23, 76, 30, 89, 93, 33, 5, 36, 0, 35, 32, 96, 34, 98, 70, 91, 95, 69, 31, 26, 92, 100, 86, 8, 65, 88, 79, 20, 66, 68, 67, 64], [121, 122, 50, 63, 14, 118, 52, 84, 124, 11, 123, 87, 120, 61, 97, 125, 15, 114, 54, 57, 62, 101, 60, 82, 47, 16, 17, 53, 119, 85, 28, 111, 115, 51, 112, 127, 117, 102, 58, 110, 40, 104, 55, 44, 109, 45, 75, 32, 59, 105, 46, 77, 113, 107, 49, 27, 106, 39, 37, 103, 126, 25, 33, 38, 81, 48, 116, 12, 34, 19, 41, 74, 80, 90, 108, 71, 72, 83, 24, 9, 13, 99, 91, 4, 2, 7, 56, 89, 21, 42, 78, 35, 6, 94, 36, 10, 96, 3, 76, 0, 73, 43, 92, 79, 93, 5, 30, 100, 23, 31, 98, 1, 22, 95, 29, 86, 26, 70, 88, 20, 66, 69, 18, 8, 67, 68, 65, 64], [103, 124, 113, 24, 82, 49, 21, 91, 31, 15, 32, 11, 62, 13, 83, 19, 16, 28, 50, 80, 72, 99, 27, 81, 63, 73, 93, 4, 76, 17, 88, 7, 75, 9, 14, 18, 74, 8, 85, 79, 12, 78, 26, 98, 77, 87, 86, 84, 64, 89, 100, 68, 1, 20, 25, 29, 45, 23, 35, 53, 92, 46, 70, 66, 90, 22, 71, 94, 10, 69, 33, 30, 34, 104, 95, 40, 6, 123, 101, 5, 67, 0, 60, 61, 122, 3, 110, 96, 59, 39, 109, 51, 111, 126, 2, 65, 117, 97, 112, 127, 116, 43, 55, 58, 54, 120, 125, 115, 105, 102, 44, 36, 37, 57, 48, 38, 56, 108, 41, 114, 121, 106, 107, 118, 52, 42, 47, 119], [62, 103, 124, 113, 49, 122, 61, 115, 57, 45, 40, 87, 26, 50, 119, 112, 53, 32, 48, 58, 104, 109, 29, 54, 121, 120, 126, 63, 59, 117, 118, 60, 90, 95, 108, 110, 93, 55, 127, 46, 116, 52, 125, 51, 94, 123, 111, 56, 114, 99, 47, 96, 44, 105, 42, 107, 31, 106, 41, 27, 43, 14, 33, 19, 17, 91, 38, 97, 28, 23, 102, 98, 21, 30, 20, 101, 74, 100, 81, 35, 36, 37, 84, 34, 24, 39, 92, 86, 78, 25, 22, 89, 70, 6, 13, 66, 88, 68, 15, 69, 83, 7, 18, 75, 85, 67, 9, 16, 79, 64, 77, 10, 5, 8, 72, 3, 12, 4, 82, 11, 71, 80, 65, 2, 0, 1, 73, 76], [113, 124, 103, 49, 62, 50, 53, 46, 26, 122, 45, 61, 32, 40, 120, 51, 59, 123, 63, 87, 126, 112, 109, 5, 58, 117, 57, 30, 47, 100, 35, 127, 115, 111, 90, 8, 108, 17, 54, 104, 121, 105, 56, 114, 68, 125, 33, 95, 48, 60, 1, 99, 52, 116, 83, 118, 97, 106, 7, 94, 31, 107, 101, 41, 25, 64, 110, 119, 43, 34, 80, 91, 75, 79, 6, 29, 93, 38, 44, 3, 36, 9, 42, 77, 92, 55, 102, 12, 98, 28, 37, 27, 0, 66, 23, 14, 96, 76, 78, 84, 20, 21, 2, 82, 86, 88, 24, 15, 19, 39, 74, 73, 89, 10, 85, 81, 67, 11, 22, 70, 71, 16, 65, 69, 4, 72, 13, 18], [113, 124, 62, 40, 103, 50, 49, 46, 122, 83, 59, 112, 126, 87, 5, 53, 8, 127, 109, 45, 77, 120, 61, 117, 1, 78, 26, 123, 6, 68, 80, 55, 63, 54, 75, 35, 58, 125, 42, 51, 57, 7, 114, 9, 115, 3, 60, 17, 111, 118, 76, 52, 56, 47, 15, 116, 119, 108, 48, 110, 121, 44, 41, 82, 30, 43, 107, 104, 32, 106, 0, 73, 105, 12, 101, 64, 33, 100, 38, 66, 102, 36, 37, 79, 11, 89, 85, 34, 2, 23, 71, 90, 84, 31, 99, 94, 24, 92, 95, 67, 29, 81, 98, 96, 97, 20, 21, 28, 25, 93, 14, 10, 19, 39, 74, 86, 65, 88, 91, 27, 22, 70, 69, 13, 4, 16, 18, 72], [102, 56, 49, 113, 23, 89, 55, 91, 19, 58, 85, 47, 37, 31, 20, 52, 18, 33, 14, 127, 95, 92, 122, 80, 124, 115, 117, 29, 125, 59, 120, 42, 112, 107, 123, 75, 61, 57, 46, 108, 28, 116, 114, 82, 50, 45, 121, 73, 48, 84, 109, 44, 77, 88, 41, 63, 51, 99, 39, 40, 53, 38, 103, 126, 34, 43, 97, 118, 32, 16, 13, 106, 11, 87, 100, 54, 104, 98, 94, 15, 86, 27, 105, 83, 60, 119, 111, 26, 35, 110, 78, 79, 62, 8, 93, 21, 30, 101, 5, 22, 25, 36, 90, 96, 74, 72, 6, 81, 10, 24, 71, 76, 17, 4, 7, 9, 12, 67, 66, 69, 65, 70, 0, 3, 1, 68, 2, 64], [102, 56, 113, 49, 85, 23, 91, 19, 80, 77, 9, 31, 11, 89, 58, 4, 13, 55, 6, 70, 68, 72, 47, 2, 73, 95, 75, 16, 21, 83, 52, 65, 8, 82, 64, 87, 39, 48, 27, 42, 7, 53, 15, 78, 37, 25, 109, 110, 124, 22, 26, 69, 125, 90, 112, 46, 5, 33, 81, 84, 79, 71, 50, 99, 88, 32, 120, 63, 12, 106, 17, 119, 35, 44, 127, 10, 61, 92, 30, 96, 107, 123, 29, 14, 0, 101, 76, 24, 28, 66, 118, 54, 40, 116, 57, 94, 115, 59, 114, 98, 45, 126, 60, 122, 41, 86, 67, 121, 18, 100, 20, 105, 36, 104, 43, 117, 111, 34, 93, 74, 97, 51, 62, 103, 108, 1, 3, 38], [102, 47, 113, 56, 49, 89, 91, 23, 85, 19, 31, 55, 58, 9, 80, 82, 77, 11, 14, 0, 120, 46, 68, 121, 72, 59, 51, 33, 66, 95, 83, 127, 37, 124, 114, 70, 52, 53, 88, 110, 78, 87, 84, 3, 48, 99, 10, 15, 112, 81, 57, 126, 60, 73, 62, 16, 35, 42, 1, 21, 109, 34, 20, 125, 32, 116, 108, 27, 79, 118, 5, 61, 12, 100, 96, 29, 7, 97, 30, 2, 93, 25, 44, 106, 40, 92, 71, 36, 104, 17, 43, 24, 39, 122, 123, 63, 22, 119, 67, 103, 69, 18, 26, 38, 13, 115, 4, 65, 117, 90, 50, 8, 54, 111, 98, 45, 105, 94, 64, 86, 101, 41, 74, 76, 28, 107, 75, 6], [102, 113, 56, 49, 23, 80, 19, 89, 9, 85, 77, 11, 70, 68, 31, 2, 55, 64, 83, 14, 1, 16, 72, 21, 91, 37, 87, 6, 58, 75, 25, 67, 82, 3, 66, 125, 13, 120, 47, 33, 27, 48, 52, 127, 7, 73, 42, 112, 78, 12, 110, 65, 8, 29, 53, 59, 5, 17, 57, 46, 24, 32, 44, 88, 22, 10, 81, 76, 116, 51, 50, 92, 124, 30, 114, 121, 40, 122, 61, 95, 109, 28, 39, 4, 84, 74, 93, 123, 90, 119, 99, 26, 43, 35, 86, 115, 18, 63, 79, 15, 62, 45, 106, 100, 118, 69, 126, 105, 103, 20, 107, 41, 104, 60, 71, 117, 94, 98, 101, 97, 54, 36, 108, 34, 96, 111, 0, 38], [52, 62, 122, 58, 102, 125, 61, 53, 60, 127, 123, 112, 121, 119, 63, 115, 114, 50, 57, 48, 117, 126, 124, 118, 56, 49, 46, 54, 55, 113, 120, 47, 116, 44, 111, 36, 110, 51, 107, 45, 59, 109, 108, 35, 42, 30, 105, 43, 106, 104, 89, 25, 40, 41, 101, 103, 100, 23, 39, 33, 38, 37, 96, 32, 92, 99, 98, 34, 95, 97, 94, 31, 87, 28, 27, 91, 75, 29, 26, 15, 93, 24, 90, 17, 84, 21, 9, 20, 82, 85, 88, 70, 2, 6, 11, 19, 81, 80, 77, 12, 0, 79, 16, 14, 74, 86, 7, 65, 10, 67, 73, 69, 3, 66, 13, 5, 4, 64, 83, 8, 18, 76, 71, 1, 72, 68, 22, 78], [58, 62, 52, 122, 60, 121, 63, 118, 125, 55, 112, 127, 115, 51, 114, 117, 126, 123, 50, 110, 46, 59, 61, 113, 119, 111, 43, 120, 56, 42, 48, 124, 47, 116, 49, 57, 45, 109, 104, 96, 53, 40, 41, 105, 54, 108, 106, 107, 37, 101, 39, 102, 32, 103, 44, 38, 100, 29, 25, 23, 35, 98, 36, 34, 99, 84, 33, 27, 95, 30, 19, 97, 86, 24, 94, 20, 90, 22, 31, 92, 28, 89, 93, 81, 91, 80, 17, 16, 88, 83, 13, 79, 77, 26, 87, 85, 15, 21, 73, 78, 76, 11, 18, 82, 14, 70, 75, 9, 72, 12, 6, 10, 0, 74, 66, 68, 7, 2, 71, 8, 3, 67, 69, 1, 65, 5, 4, 64], [62, 58, 52, 101, 121, 122, 60, 127, 125, 112, 55, 115, 118, 117, 123, 63, 51, 53, 57, 119, 114, 50, 113, 61, 126, 102, 111, 120, 124, 48, 46, 59, 110, 36, 116, 56, 42, 54, 109, 47, 45, 44, 106, 49, 108, 93, 105, 43, 107, 40, 41, 100, 96, 104, 90, 30, 39, 103, 38, 99, 94, 35, 34, 23, 29, 25, 28, 27, 97, 80, 95, 86, 98, 33, 19, 16, 85, 92, 89, 31, 91, 21, 32, 73, 75, 24, 87, 37, 83, 26, 9, 88, 81, 11, 3, 77, 13, 22, 20, 82, 68, 70, 84, 15, 71, 5, 69, 10, 74, 1, 78, 14, 17, 7, 18, 76, 8, 79, 6, 67, 64, 66, 4, 12, 2, 65, 72, 0], [58, 52, 62, 101, 24, 90, 34, 82, 93, 14, 12, 85, 19, 80, 121, 32, 60, 21, 86, 30, 37, 15, 20, 76, 78, 96, 9, 122, 18, 79, 69, 83, 16, 125, 67, 74, 10, 88, 94, 72, 55, 5, 8, 127, 100, 84, 29, 77, 87, 7, 17, 71, 1, 81, 112, 102, 26, 13, 25, 23, 118, 75, 89, 91, 113, 27, 68, 117, 0, 2, 114, 64, 98, 70, 120, 124, 11, 4, 61, 66, 115, 126, 73, 104, 119, 110, 92, 51, 63, 42, 28, 107, 123, 111, 31, 3, 116, 46, 50, 59, 45, 65, 57, 56, 53, 48, 95, 6, 47, 106, 97, 41, 33, 43, 99, 103, 109, 38, 40, 35, 105, 22, 108, 39, 36, 54, 49, 44], [118, 127, 26, 101, 17, 10, 86, 15, 77, 70, 98, 88, 82, 20, 74, 93, 6, 29, 68, 89, 13, 104, 30, 22, 72, 87, 2, 76, 85, 79, 81, 27, 92, 84, 90, 54, 24, 67, 37, 28, 34, 71, 18, 11, 7, 52, 19, 78, 3, 9, 80, 1, 12, 40, 91, 109, 59, 65, 126, 75, 95, 73, 97, 0, 66, 16, 42, 63, 64, 23, 5, 21, 83, 4, 94, 105, 62, 31, 14, 47, 113, 69, 121, 32, 33, 96, 25, 39, 43, 35, 36, 8, 38, 107, 50, 110, 99, 44, 41, 53, 103, 108, 100, 48, 56, 115, 122, 112, 46, 116, 124, 58, 102, 117, 45, 111, 55, 114, 49, 120, 61, 119, 106, 123, 125, 60, 51, 57], [118, 127, 101, 98, 37, 126, 88, 86, 62, 39, 30, 29, 50, 20, 63, 91, 104, 52, 26, 113, 72, 48, 121, 54, 112, 56, 119, 93, 82, 38, 97, 116, 43, 125, 59, 122, 44, 110, 60, 92, 66, 83, 117, 0, 35, 51, 25, 40, 109, 58, 77, 123, 53, 46, 102, 45, 55, 15, 69, 115, 41, 120, 108, 28, 31, 57, 61, 49, 1, 47, 90, 114, 99, 24, 11, 84, 111, 89, 106, 75, 36, 124, 94, 33, 103, 8, 6, 4, 79, 23, 68, 76, 80, 96, 18, 107, 73, 105, 42, 74, 3, 64, 13, 17, 100, 22, 32, 81, 78, 2, 95, 85, 16, 87, 19, 9, 27, 12, 14, 34, 21, 5, 71, 7, 65, 70, 10, 67], [127, 118, 62, 101, 126, 56, 104, 113, 63, 54, 119, 48, 112, 52, 82, 40, 60, 121, 44, 50, 122, 116, 98, 43, 51, 123, 59, 117, 88, 58, 97, 125, 120, 91, 37, 106, 110, 55, 57, 47, 109, 46, 53, 45, 114, 115, 61, 39, 86, 42, 111, 29, 49, 107, 23, 41, 8, 124, 108, 105, 100, 35, 102, 93, 36, 38, 103, 83, 96, 72, 31, 92, 26, 99, 33, 66, 30, 87, 28, 4, 95, 89, 32, 34, 80, 90, 25, 94, 22, 27, 16, 18, 68, 24, 20, 15, 19, 0, 77, 85, 12, 84, 2, 69, 78, 13, 14, 73, 21, 75, 11, 74, 71, 5, 76, 79, 81, 7, 17, 1, 64, 6, 9, 65, 3, 70, 10, 67], [127, 118, 101, 126, 56, 97, 88, 82, 62, 29, 63, 54, 52, 40, 113, 48, 44, 28, 23, 119, 109, 60, 43, 98, 112, 36, 122, 117, 50, 41, 121, 106, 45, 116, 59, 8, 86, 51, 37, 123, 115, 104, 47, 125, 91, 32, 26, 114, 58, 110, 120, 105, 89, 46, 39, 61, 102, 111, 55, 107, 93, 30, 57, 53, 108, 72, 42, 100, 38, 16, 20, 4, 92, 49, 12, 99, 124, 103, 35, 18, 34, 31, 33, 94, 77, 83, 13, 27, 15, 71, 79, 96, 95, 87, 81, 22, 14, 25, 80, 24, 75, 84, 21, 11, 85, 76, 69, 74, 68, 0, 6, 66, 90, 73, 1, 17, 19, 7, 3, 78, 9, 5, 2, 70, 10, 64, 65, 67], [42, 96, 106, 120, 45, 125, 26, 23, 20, 118, 62, 109, 78, 52, 98, 80, 67, 63, 124, 90, 122, 32, 82, 71, 126, 7, 34, 101, 94, 58, 27, 56, 87, 14, 73, 55, 77, 116, 12, 86, 97, 123, 13, 81, 48, 85, 91, 115, 35, 57, 103, 104, 114, 43, 50, 16, 49, 18, 99, 110, 111, 39, 76, 33, 22, 64, 65, 117, 24, 119, 37, 10, 1, 53, 92, 59, 31, 47, 60, 28, 112, 84, 17, 19, 46, 121, 83, 44, 127, 21, 29, 38, 89, 113, 54, 30, 74, 5, 107, 70, 61, 93, 41, 51, 95, 105, 8, 4, 102, 88, 15, 108, 100, 40, 25, 36, 3, 6, 68, 79, 9, 75, 66, 69, 72, 11, 2, 0], [42, 96, 106, 125, 45, 23, 26, 62, 20, 109, 52, 98, 80, 124, 118, 126, 120, 34, 63, 78, 90, 82, 55, 27, 94, 122, 101, 77, 56, 43, 116, 46, 111, 58, 103, 81, 44, 35, 119, 114, 97, 104, 53, 7, 123, 87, 48, 127, 115, 50, 99, 49, 107, 60, 39, 38, 41, 91, 113, 117, 110, 59, 47, 30, 100, 54, 121, 102, 40, 85, 37, 57, 67, 61, 108, 105, 36, 112, 31, 95, 18, 93, 16, 51, 28, 32, 33, 29, 73, 13, 92, 21, 14, 86, 84, 71, 24, 25, 17, 19, 12, 5, 22, 89, 10, 88, 83, 74, 9, 11, 1, 65, 69, 79, 75, 15, 76, 64, 3, 8, 72, 4, 6, 68, 70, 66, 0, 2], [42, 96, 125, 106, 52, 45, 23, 20, 26, 82, 80, 78, 120, 98, 118, 109, 5, 62, 73, 7, 77, 12, 124, 90, 101, 86, 126, 103, 67, 63, 122, 32, 94, 35, 27, 87, 9, 56, 16, 1, 55, 58, 21, 69, 85, 91, 83, 64, 84, 76, 114, 123, 75, 3, 50, 34, 13, 18, 116, 115, 22, 11, 29, 70, 17, 40, 19, 43, 15, 48, 10, 39, 97, 81, 14, 74, 30, 110, 4, 79, 37, 31, 89, 44, 108, 104, 24, 49, 25, 71, 47, 66, 59, 92, 99, 60, 113, 28, 95, 112, 107, 88, 51, 8, 111, 100, 117, 72, 121, 57, 93, 6, 119, 105, 38, 41, 33, 36, 65, 54, 127, 68, 61, 0, 2, 102, 46, 53], [42, 96, 45, 106, 125, 23, 124, 26, 20, 52, 109, 98, 120, 62, 118, 126, 80, 122, 82, 115, 94, 56, 103, 116, 55, 78, 34, 90, 63, 58, 43, 48, 87, 114, 97, 101, 50, 60, 77, 110, 39, 41, 35, 104, 13, 123, 31, 47, 108, 53, 107, 99, 119, 113, 30, 19, 121, 27, 117, 33, 57, 49, 51, 40, 7, 61, 38, 102, 54, 92, 44, 85, 100, 67, 46, 32, 111, 93, 95, 36, 59, 37, 105, 81, 91, 112, 18, 28, 16, 29, 84, 127, 71, 86, 24, 14, 25, 73, 21, 89, 88, 79, 76, 12, 22, 74, 72, 69, 83, 17, 11, 15, 10, 9, 5, 68, 75, 65, 1, 8, 4, 66, 6, 64, 70, 0, 3, 2], [102, 62, 126, 98, 87, 26, 110, 85, 51, 41, 30, 19, 38, 45, 50, 22, 15, 79, 103, 94, 90, 84, 12, 93, 31, 17, 16, 113, 21, 109, 69, 105, 112, 20, 125, 127, 23, 8, 53, 65, 83, 25, 32, 117, 119, 14, 88, 63, 6, 55, 99, 58, 1, 116, 122, 75, 46, 57, 60, 96, 100, 108, 34, 121, 124, 52, 80, 120, 115, 74, 59, 106, 24, 89, 114, 49, 97, 111, 10, 76, 104, 118, 0, 2, 95, 64, 27, 86, 47, 40, 73, 123, 48, 81, 36, 92, 70, 54, 67, 82, 28, 42, 66, 5, 35, 29, 91, 78, 18, 77, 9, 13, 56, 44, 72, 39, 11, 101, 71, 68, 107, 61, 33, 37, 4, 43, 7, 3], [102, 126, 62, 98, 87, 17, 85, 110, 90, 26, 22, 19, 50, 8, 14, 75, 45, 103, 41, 15, 94, 21, 93, 89, 6, 51, 24, 113, 109, 2, 30, 84, 88, 83, 68, 92, 12, 38, 27, 58, 20, 31, 72, 79, 78, 112, 122, 23, 25, 74, 80, 32, 4, 71, 16, 117, 119, 53, 49, 127, 77, 29, 47, 7, 35, 81, 76, 60, 105, 10, 111, 28, 55, 120, 116, 52, 82, 0, 124, 46, 48, 5, 91, 13, 99, 125, 34, 67, 39, 36, 33, 121, 96, 97, 57, 95, 86, 63, 37, 118, 65, 115, 18, 44, 100, 66, 104, 11, 73, 108, 59, 42, 114, 43, 101, 9, 61, 69, 123, 54, 56, 3, 106, 64, 107, 70, 1, 40], [102, 126, 98, 62, 26, 41, 45, 87, 85, 19, 17, 93, 38, 103, 22, 15, 94, 51, 21, 50, 12, 113, 92, 110, 32, 75, 14, 30, 8, 100, 90, 31, 79, 83, 84, 88, 89, 20, 86, 35, 36, 54, 55, 25, 117, 127, 125, 119, 76, 13, 77, 81, 123, 46, 82, 23, 6, 2, 16, 53, 109, 39, 18, 27, 121, 59, 116, 70, 74, 97, 69, 80, 114, 47, 111, 52, 57, 105, 29, 118, 78, 58, 24, 49, 11, 63, 95, 99, 108, 122, 96, 115, 112, 10, 48, 91, 104, 124, 42, 66, 33, 72, 120, 40, 60, 101, 34, 37, 7, 43, 73, 65, 28, 1, 4, 107, 67, 5, 71, 44, 106, 61, 68, 9, 56, 3, 64, 0], [62, 102, 126, 98, 87, 41, 110, 26, 85, 38, 50, 45, 22, 105, 103, 109, 30, 113, 94, 63, 112, 93, 58, 125, 84, 121, 53, 127, 119, 15, 122, 31, 51, 116, 59, 57, 19, 52, 114, 55, 47, 97, 117, 118, 90, 44, 46, 43, 60, 49, 111, 108, 56, 42, 106, 54, 120, 124, 48, 115, 101, 36, 107, 32, 40, 123, 104, 100, 99, 61, 35, 37, 25, 39, 20, 96, 12, 17, 83, 95, 79, 92, 88, 27, 28, 33, 91, 29, 16, 8, 89, 23, 74, 21, 73, 34, 6, 75, 24, 10, 18, 82, 14, 69, 78, 86, 80, 71, 2, 77, 76, 5, 13, 81, 9, 7, 1, 65, 68, 11, 72, 70, 67, 0, 64, 66, 3, 4], [120, 122, 54, 37, 53, 127, 19, 101, 87, 17, 113, 35, 91, 58, 93, 60, 123, 124, 13, 115, 50, 117, 55, 56, 89, 63, 51, 118, 116, 32, 75, 45, 74, 114, 15, 119, 57, 14, 125, 112, 80, 49, 33, 36, 62, 52, 48, 100, 98, 12, 61, 97, 103, 47, 34, 110, 96, 90, 44, 73, 106, 59, 43, 126, 111, 41, 7, 26, 38, 18, 39, 121, 102, 85, 46, 108, 99, 104, 109, 107, 105, 29, 95, 42, 31, 27, 21, 20, 83, 81, 40, 72, 23, 94, 28, 92, 25, 30, 88, 24, 86, 5, 70, 68, 82, 67, 65, 2, 22, 0, 78, 77, 79, 84, 11, 9, 76, 16, 71, 8, 3, 10, 4, 69, 6, 1, 64, 66], [120, 122, 37, 13, 19, 17, 75, 14, 87, 15, 80, 74, 32, 85, 20, 53, 12, 18, 54, 73, 113, 127, 93, 58, 123, 26, 88, 115, 7, 30, 60, 50, 35, 124, 51, 55, 91, 45, 117, 72, 101, 116, 56, 63, 118, 94, 119, 57, 114, 90, 98, 62, 125, 112, 52, 25, 92, 49, 48, 86, 24, 61, 23, 36, 126, 33, 31, 96, 59, 47, 46, 34, 44, 82, 111, 110, 43, 70, 79, 41, 121, 21, 109, 99, 84, 29, 81, 108, 106, 100, 5, 42, 103, 27, 105, 104, 107, 16, 39, 97, 40, 95, 38, 102, 78, 77, 28, 22, 89, 68, 11, 83, 76, 67, 9, 65, 10, 2, 8, 71, 3, 0, 4, 6, 69, 64, 1, 66], [122, 120, 37, 84, 7, 16, 67, 74, 66, 69, 19, 75, 1, 68, 72, 24, 5, 70, 73, 12, 94, 78, 71, 11, 8, 53, 6, 64, 23, 13, 4, 3, 82, 88, 15, 54, 14, 80, 20, 90, 32, 30, 58, 92, 87, 93, 18, 113, 26, 115, 79, 77, 10, 123, 127, 50, 51, 60, 76, 124, 17, 117, 55, 56, 85, 21, 116, 96, 83, 63, 45, 118, 35, 9, 25, 65, 98, 0, 114, 119, 57, 28, 112, 101, 62, 49, 52, 29, 99, 48, 86, 125, 31, 61, 126, 59, 36, 81, 46, 91, 41, 44, 111, 47, 43, 22, 121, 110, 27, 95, 100, 109, 108, 2, 97, 33, 34, 106, 89, 105, 42, 102, 104, 39, 107, 103, 40, 38], [120, 122, 54, 87, 19, 53, 127, 123, 113, 60, 58, 124, 125, 115, 117, 55, 51, 116, 50, 17, 62, 86, 56, 27, 57, 119, 118, 45, 94, 49, 25, 52, 85, 114, 61, 96, 48, 28, 63, 23, 126, 112, 59, 44, 47, 111, 121, 24, 46, 109, 81, 43, 110, 90, 41, 106, 108, 84, 42, 107, 39, 83, 105, 82, 103, 35, 104, 40, 21, 37, 98, 38, 88, 36, 102, 89, 91, 79, 20, 65, 100, 101, 77, 16, 80, 97, 34, 99, 22, 78, 0, 33, 13, 10, 30, 29, 26, 76, 95, 31, 64, 93, 11, 18, 2, 9, 14, 15, 32, 3, 92, 71, 68, 4, 8, 7, 12, 75, 5, 6, 67, 69, 74, 1, 73, 70, 72, 66]], "model.layers.10.self_attn.k_proj": [[122, 121, 100, 118, 86, 63, 114, 123, 54, 61, 124, 119, 62, 52, 125, 115, 57, 108, 111, 117, 120, 60, 112, 31, 53, 126, 51, 127, 116, 59, 47, 58, 110, 55, 102, 46, 48, 83, 0, 45, 109, 49, 113, 56, 98, 29, 105, 104, 77, 9, 40, 44, 76, 103, 39, 38, 91, 106, 80, 81, 24, 6, 107, 35, 101, 34, 41, 37, 43, 21, 42, 79, 97, 26, 23, 1, 30, 22, 28, 36, 99, 32, 89, 88, 84, 3, 69, 50, 10, 95, 96, 92, 78, 19, 93, 33, 4, 20, 90, 82, 11, 85, 87, 7, 27, 25, 94, 14, 2, 18, 72, 8, 74, 5, 16, 71, 13, 66, 15, 67, 73, 65, 68, 70, 12, 75, 64, 17], [113, 124, 39, 50, 53, 96, 93, 21, 123, 120, 49, 15, 62, 126, 125, 63, 51, 112, 11, 87, 24, 82, 122, 58, 55, 114, 48, 117, 13, 91, 104, 116, 46, 61, 73, 30, 54, 115, 57, 71, 52, 95, 84, 26, 81, 118, 76, 121, 59, 127, 47, 80, 60, 111, 108, 19, 110, 86, 8, 99, 109, 56, 40, 45, 119, 100, 92, 89, 3, 10, 44, 35, 14, 1, 2, 98, 43, 68, 107, 79, 6, 106, 41, 27, 88, 0, 105, 42, 18, 38, 5, 102, 101, 36, 37, 33, 28, 23, 97, 77, 16, 17, 32, 94, 69, 34, 78, 66, 70, 7, 25, 31, 22, 72, 64, 4, 74, 29, 9, 83, 65, 90, 85, 20, 67, 12, 75, 103], [113, 56, 38, 80, 19, 77, 85, 23, 89, 11, 91, 9, 95, 58, 70, 72, 68, 55, 64, 14, 76, 1, 66, 20, 46, 123, 78, 111, 127, 44, 59, 106, 116, 112, 47, 17, 57, 53, 121, 69, 114, 117, 115, 71, 125, 51, 101, 109, 86, 118, 126, 42, 79, 124, 97, 52, 74, 48, 108, 45, 18, 96, 120, 24, 28, 82, 15, 37, 50, 33, 98, 39, 60, 122, 29, 119, 31, 93, 92, 32, 43, 3, 62, 103, 90, 61, 99, 34, 67, 5, 63, 12, 40, 49, 26, 88, 110, 35, 7, 30, 94, 100, 54, 104, 36, 81, 41, 105, 22, 8, 107, 10, 75, 25, 27, 87, 6, 84, 21, 0, 13, 83, 2, 73, 102, 16, 65, 4], [58, 62, 86, 37, 52, 96, 121, 29, 90, 14, 82, 17, 19, 12, 80, 60, 127, 77, 10, 72, 118, 125, 117, 24, 119, 55, 51, 20, 115, 63, 114, 126, 57, 120, 123, 79, 7, 100, 124, 50, 56, 44, 53, 48, 116, 98, 34, 59, 113, 46, 111, 110, 43, 45, 61, 25, 54, 93, 76, 112, 99, 109, 106, 70, 47, 9, 40, 49, 36, 15, 41, 122, 108, 105, 107, 11, 69, 104, 38, 42, 39, 102, 2, 23, 8, 97, 3, 85, 91, 21, 27, 35, 4, 95, 103, 31, 84, 83, 101, 6, 28, 33, 18, 74, 94, 30, 81, 65, 0, 92, 13, 22, 1, 16, 71, 87, 88, 68, 64, 32, 78, 5, 26, 89, 75, 73, 67, 66], [118, 127, 37, 126, 86, 63, 62, 10, 52, 34, 17, 48, 113, 30, 26, 119, 56, 121, 122, 15, 54, 70, 77, 112, 60, 43, 29, 50, 104, 59, 88, 51, 20, 120, 117, 116, 41, 110, 96, 115, 61, 123, 0, 3, 109, 57, 58, 55, 125, 47, 45, 53, 114, 108, 111, 49, 124, 106, 76, 103, 107, 46, 44, 82, 42, 71, 100, 2, 83, 38, 9, 65, 40, 68, 1, 72, 99, 32, 105, 92, 85, 39, 75, 35, 33, 95, 97, 36, 11, 91, 102, 14, 23, 31, 4, 80, 94, 67, 7, 64, 5, 73, 78, 69, 19, 25, 21, 27, 101, 93, 98, 81, 24, 90, 28, 89, 16, 12, 66, 79, 84, 87, 22, 8, 18, 74, 13, 6], [106, 32, 52, 26, 23, 20, 82, 125, 120, 42, 118, 45, 80, 78, 109, 126, 73, 58, 124, 77, 62, 112, 34, 12, 123, 114, 115, 50, 56, 55, 59, 54, 7, 5, 63, 85, 116, 47, 81, 122, 3, 51, 37, 119, 61, 39, 43, 103, 30, 104, 6, 1, 110, 127, 117, 101, 0, 107, 108, 11, 27, 48, 86, 65, 91, 46, 66, 13, 44, 57, 75, 74, 49, 35, 70, 111, 40, 60, 2, 22, 76, 64, 105, 98, 53, 79, 121, 102, 113, 33, 94, 29, 72, 71, 92, 36, 41, 83, 68, 8, 19, 25, 88, 14, 24, 100, 21, 4, 69, 38, 31, 97, 89, 10, 99, 95, 15, 28, 93, 96, 84, 90, 9, 17, 18, 87, 16, 67], [126, 62, 38, 34, 26, 94, 45, 85, 19, 41, 87, 22, 17, 110, 51, 84, 15, 50, 113, 12, 103, 75, 8, 14, 1, 127, 53, 93, 57, 112, 119, 117, 55, 116, 120, 39, 124, 125, 111, 122, 109, 59, 114, 118, 69, 63, 74, 52, 121, 102, 6, 29, 60, 115, 66, 28, 106, 5, 95, 73, 24, 108, 49, 32, 48, 58, 77, 123, 97, 46, 43, 33, 68, 105, 104, 54, 56, 16, 36, 47, 27, 35, 31, 40, 86, 61, 100, 44, 91, 64, 107, 4, 71, 67, 25, 78, 18, 42, 80, 9, 101, 99, 72, 23, 7, 37, 90, 3, 98, 82, 89, 92, 96, 70, 88, 0, 30, 21, 76, 13, 81, 20, 10, 83, 79, 11, 2, 65], [122, 120, 101, 24, 30, 82, 54, 127, 32, 79, 28, 84, 117, 26, 21, 113, 115, 124, 123, 55, 16, 27, 53, 56, 78, 60, 116, 50, 11, 77, 62, 114, 118, 9, 76, 51, 89, 63, 119, 57, 112, 52, 49, 125, 48, 93, 45, 72, 58, 126, 47, 59, 46, 44, 2, 70, 23, 61, 108, 81, 109, 10, 111, 110, 99, 121, 12, 65, 43, 5, 41, 106, 107, 29, 22, 35, 42, 39, 71, 104, 8, 105, 3, 103, 40, 14, 96, 97, 100, 4, 102, 38, 95, 98, 75, 83, 36, 80, 18, 69, 34, 68, 73, 33, 86, 6, 37, 31, 20, 92, 15, 7, 64, 87, 0, 91, 13, 90, 94, 25, 67, 88, 74, 17, 19, 85, 1, 66]], "model.layers.10.self_attn.qk_proj": [[122, 113, 62, 120, 118, 127, 126, 56, 121, 58, 52, 124, 106, 42, 54, 50, 90, 45, 110, 102, 55, 63, 117, 37, 125, 51, 116, 123, 53, 49, 60, 114, 87, 115, 85, 61, 38, 93, 57, 46, 23, 101, 59, 80, 83, 47, 32, 21, 22, 41, 112, 96, 34, 119, 103, 19, 16, 26, 25, 44, 13, 109, 20, 48, 104, 14, 77, 82, 86, 39, 43, 88, 27, 18, 84, 24, 100, 12, 11, 94, 98, 75, 81, 78, 30, 89, 108, 15, 29, 17, 111, 9, 91, 79, 73, 76, 31, 40, 107, 36, 35, 95, 97, 72, 6, 74, 105, 92, 10, 70, 99, 28, 71, 8, 68, 33, 64, 69, 65, 2, 1, 7, 3, 67, 5, 4, 0, 66], [122, 113, 62, 120, 118, 56, 121, 127, 126, 58, 52, 124, 106, 42, 54, 50, 117, 90, 116, 45, 37, 55, 110, 102, 60, 87, 114, 63, 123, 125, 85, 49, 51, 115, 23, 61, 38, 53, 101, 41, 93, 46, 103, 21, 80, 83, 96, 22, 34, 112, 104, 32, 19, 109, 59, 20, 26, 111, 27, 48, 43, 13, 119, 57, 25, 47, 98, 24, 84, 16, 86, 77, 88, 18, 14, 89, 39, 100, 30, 81, 108, 79, 12, 82, 31, 94, 78, 11, 75, 17, 29, 44, 9, 40, 91, 73, 95, 76, 15, 70, 92, 72, 97, 33, 74, 36, 105, 10, 35, 99, 6, 107, 69, 4, 8, 0, 68, 28, 65, 71, 64, 7, 5, 3, 67, 2, 66, 1], [122, 113, 62, 120, 118, 127, 121, 56, 126, 58, 52, 106, 124, 42, 54, 50, 116, 102, 55, 110, 37, 125, 51, 45, 90, 38, 60, 53, 114, 49, 63, 123, 87, 101, 117, 23, 93, 57, 115, 34, 96, 32, 119, 26, 85, 27, 43, 41, 103, 83, 22, 104, 19, 109, 21, 48, 112, 24, 25, 46, 61, 111, 16, 80, 59, 30, 94, 20, 84, 47, 86, 100, 88, 13, 77, 95, 18, 89, 39, 31, 98, 91, 29, 81, 14, 82, 35, 108, 78, 75, 12, 79, 92, 44, 11, 99, 15, 17, 40, 9, 70, 36, 73, 33, 76, 72, 107, 105, 28, 74, 97, 71, 4, 10, 1, 6, 7, 68, 64, 69, 8, 5, 65, 0, 66, 67, 3, 2], [122, 113, 62, 120, 118, 121, 127, 126, 56, 58, 52, 106, 124, 42, 54, 55, 117, 50, 90, 102, 45, 125, 60, 51, 114, 123, 37, 110, 116, 87, 63, 49, 53, 112, 23, 93, 115, 46, 119, 101, 38, 85, 96, 19, 57, 27, 47, 41, 61, 43, 109, 34, 22, 16, 104, 25, 83, 32, 59, 21, 80, 24, 44, 100, 26, 77, 13, 48, 111, 39, 88, 103, 30, 81, 18, 86, 94, 82, 84, 89, 98, 78, 20, 108, 17, 91, 12, 14, 40, 75, 95, 92, 11, 31, 35, 15, 105, 79, 9, 29, 107, 72, 70, 36, 73, 33, 74, 76, 28, 99, 97, 10, 71, 4, 6, 8, 0, 69, 7, 65, 68, 1, 5, 64, 67, 66, 3, 2], [122, 113, 62, 118, 120, 121, 126, 56, 127, 58, 52, 124, 106, 42, 55, 110, 54, 50, 90, 123, 45, 125, 102, 51, 37, 60, 116, 53, 49, 114, 117, 63, 87, 38, 115, 47, 61, 93, 46, 57, 119, 109, 96, 112, 23, 85, 59, 26, 101, 34, 103, 83, 21, 111, 80, 19, 16, 48, 22, 41, 77, 32, 13, 43, 100, 104, 18, 25, 39, 82, 14, 89, 81, 86, 24, 30, 27, 44, 94, 108, 20, 84, 88, 78, 11, 98, 36, 91, 17, 29, 9, 79, 75, 15, 40, 35, 73, 31, 12, 76, 95, 72, 70, 74, 105, 107, 97, 33, 4, 92, 10, 68, 8, 7, 28, 6, 71, 99, 65, 64, 0, 1, 67, 69, 2, 5, 66, 3], [122, 113, 62, 118, 120, 127, 121, 56, 126, 58, 52, 124, 106, 42, 54, 55, 90, 45, 110, 125, 116, 114, 60, 50, 117, 87, 63, 102, 123, 37, 53, 51, 46, 85, 93, 61, 49, 38, 96, 119, 23, 19, 115, 80, 109, 22, 103, 21, 41, 57, 34, 83, 77, 112, 101, 47, 16, 13, 111, 48, 20, 26, 32, 59, 25, 18, 104, 14, 100, 82, 98, 27, 88, 12, 15, 86, 11, 78, 39, 108, 24, 17, 81, 30, 75, 79, 89, 84, 43, 9, 31, 76, 105, 94, 73, 35, 29, 44, 91, 40, 107, 36, 10, 72, 92, 74, 70, 95, 33, 6, 97, 8, 28, 99, 68, 4, 69, 5, 71, 67, 64, 0, 1, 66, 7, 2, 65, 3], [122, 113, 62, 120, 118, 56, 127, 126, 121, 58, 52, 106, 124, 42, 90, 45, 125, 102, 55, 116, 37, 110, 54, 114, 87, 60, 50, 53, 85, 101, 51, 63, 23, 115, 61, 38, 46, 123, 49, 119, 93, 117, 22, 112, 80, 59, 19, 83, 16, 103, 21, 20, 48, 32, 96, 77, 34, 109, 41, 47, 57, 26, 111, 13, 100, 18, 25, 86, 82, 24, 27, 81, 78, 104, 84, 44, 88, 11, 75, 15, 14, 39, 12, 30, 98, 89, 94, 43, 9, 105, 108, 17, 31, 91, 76, 79, 29, 73, 40, 35, 36, 95, 107, 92, 74, 72, 99, 10, 6, 33, 70, 97, 8, 71, 28, 68, 7, 69, 66, 4, 5, 1, 65, 0, 3, 64, 2, 67], [113, 122, 62, 120, 118, 126, 56, 127, 121, 58, 106, 52, 124, 42, 90, 54, 45, 50, 37, 125, 117, 87, 102, 55, 23, 49, 46, 110, 60, 85, 123, 116, 83, 53, 61, 38, 119, 93, 114, 112, 21, 101, 22, 51, 115, 19, 80, 16, 77, 47, 63, 109, 111, 96, 86, 26, 32, 20, 25, 57, 82, 103, 41, 59, 13, 18, 84, 24, 78, 34, 27, 81, 98, 48, 39, 30, 100, 44, 17, 108, 15, 14, 79, 88, 75, 11, 104, 12, 9, 76, 89, 91, 105, 73, 107, 31, 43, 29, 40, 94, 92, 95, 36, 35, 6, 97, 72, 74, 99, 33, 10, 71, 70, 8, 68, 7, 28, 69, 65, 5, 0, 4, 64, 3, 1, 2, 66, 67], [122, 113, 62, 118, 120, 127, 56, 121, 126, 58, 52, 106, 124, 42, 90, 54, 45, 50, 37, 110, 123, 102, 55, 125, 87, 117, 116, 23, 60, 49, 38, 101, 53, 21, 93, 85, 83, 115, 114, 46, 47, 112, 63, 109, 57, 51, 59, 32, 16, 26, 22, 61, 19, 34, 96, 41, 111, 20, 25, 98, 119, 80, 77, 39, 27, 104, 24, 84, 86, 13, 82, 100, 78, 30, 108, 88, 43, 44, 18, 81, 48, 94, 91, 40, 17, 15, 29, 103, 11, 31, 75, 79, 105, 92, 89, 6, 14, 76, 95, 36, 9, 12, 73, 107, 97, 33, 35, 74, 99, 10, 28, 8, 72, 71, 68, 70, 1, 64, 4, 65, 0, 7, 69, 66, 2, 5, 67, 3], [122, 113, 62, 120, 118, 126, 127, 56, 121, 58, 52, 124, 106, 42, 117, 90, 116, 125, 123, 50, 55, 102, 45, 37, 87, 60, 23, 51, 63, 54, 110, 101, 115, 114, 38, 83, 53, 112, 85, 49, 109, 21, 93, 59, 47, 61, 80, 46, 111, 96, 16, 26, 77, 22, 82, 57, 119, 41, 19, 32, 103, 20, 13, 24, 34, 86, 27, 100, 108, 30, 25, 98, 104, 84, 14, 48, 18, 78, 81, 91, 39, 43, 88, 17, 79, 94, 12, 75, 44, 89, 73, 29, 11, 76, 15, 92, 107, 9, 40, 6, 36, 31, 97, 105, 95, 8, 33, 10, 99, 74, 35, 72, 68, 70, 28, 71, 69, 4, 7, 67, 0, 5, 65, 3, 1, 2, 64, 66], [122, 113, 62, 118, 126, 120, 121, 56, 127, 58, 52, 124, 106, 42, 123, 54, 125, 55, 110, 45, 102, 51, 90, 60, 50, 116, 117, 37, 63, 49, 87, 114, 53, 112, 119, 23, 115, 101, 85, 96, 93, 109, 57, 59, 38, 80, 111, 21, 83, 61, 19, 46, 47, 22, 77, 34, 16, 25, 41, 26, 20, 86, 13, 44, 18, 32, 82, 98, 48, 103, 100, 39, 30, 14, 81, 108, 43, 91, 24, 27, 15, 29, 78, 17, 11, 84, 12, 94, 89, 104, 73, 9, 8, 36, 79, 75, 76, 40, 10, 88, 6, 31, 74, 92, 107, 70, 105, 97, 95, 35, 68, 33, 28, 71, 99, 4, 65, 72, 1, 69, 0, 7, 64, 3, 66, 5, 2, 67], [122, 113, 62, 118, 120, 126, 127, 56, 121, 58, 52, 124, 106, 42, 123, 90, 51, 54, 55, 102, 110, 49, 45, 116, 125, 117, 50, 60, 87, 37, 114, 63, 112, 23, 53, 93, 38, 85, 101, 80, 109, 115, 83, 111, 59, 57, 96, 19, 46, 22, 47, 34, 20, 61, 21, 48, 25, 16, 119, 103, 77, 13, 41, 26, 32, 18, 39, 86, 14, 43, 98, 81, 82, 100, 11, 108, 75, 94, 24, 78, 27, 84, 17, 79, 15, 88, 76, 44, 9, 104, 30, 40, 12, 29, 36, 89, 73, 91, 10, 8, 107, 70, 74, 31, 6, 95, 35, 72, 97, 4, 92, 68, 7, 33, 99, 0, 69, 71, 1, 28, 65, 66, 105, 64, 5, 2, 3, 67], [122, 113, 62, 120, 118, 126, 127, 56, 121, 58, 106, 52, 124, 42, 54, 50, 90, 45, 102, 37, 110, 117, 55, 51, 116, 87, 23, 125, 85, 123, 38, 60, 57, 53, 101, 59, 63, 19, 46, 21, 119, 115, 93, 80, 26, 83, 32, 114, 22, 96, 47, 49, 109, 77, 86, 16, 103, 41, 20, 111, 112, 39, 98, 48, 61, 34, 25, 27, 24, 18, 13, 81, 82, 100, 91, 78, 17, 84, 15, 14, 89, 30, 88, 12, 43, 94, 76, 79, 11, 75, 104, 44, 29, 95, 9, 40, 108, 31, 36, 107, 8, 73, 92, 35, 33, 70, 97, 105, 74, 99, 72, 10, 71, 4, 28, 6, 7, 69, 68, 0, 65, 5, 3, 1, 66, 67, 64, 2], [122, 113, 62, 118, 126, 120, 127, 56, 121, 58, 52, 106, 124, 42, 54, 117, 125, 55, 90, 45, 50, 123, 87, 102, 110, 60, 116, 37, 63, 46, 114, 49, 51, 23, 85, 119, 57, 93, 38, 115, 83, 77, 19, 101, 112, 47, 53, 61, 21, 59, 109, 16, 96, 80, 22, 111, 86, 13, 20, 25, 34, 81, 26, 18, 41, 82, 27, 48, 103, 78, 14, 98, 39, 32, 76, 24, 100, 30, 17, 43, 75, 12, 15, 84, 11, 9, 73, 44, 79, 8, 104, 70, 91, 108, 29, 94, 40, 88, 31, 89, 72, 74, 92, 97, 35, 107, 36, 6, 95, 10, 7, 64, 71, 69, 33, 105, 4, 0, 65, 68, 99, 5, 66, 1, 2, 28, 67, 3], [122, 113, 62, 120, 118, 127, 126, 56, 58, 121, 52, 124, 106, 42, 54, 110, 55, 102, 125, 123, 50, 117, 45, 37, 90, 60, 51, 116, 87, 49, 46, 114, 61, 38, 23, 57, 63, 101, 112, 96, 109, 85, 115, 93, 59, 19, 41, 32, 83, 21, 26, 111, 22, 119, 47, 80, 53, 77, 34, 103, 20, 48, 25, 27, 98, 16, 86, 24, 100, 43, 39, 82, 13, 18, 44, 78, 81, 17, 94, 84, 15, 79, 89, 30, 107, 14, 40, 108, 29, 70, 75, 91, 104, 73, 9, 76, 11, 12, 88, 36, 31, 8, 72, 95, 35, 74, 105, 92, 97, 10, 33, 6, 28, 71, 99, 64, 1, 4, 69, 68, 0, 7, 65, 5, 67, 2, 66, 3], [122, 113, 62, 118, 120, 127, 126, 56, 121, 58, 52, 106, 124, 42, 55, 54, 45, 110, 90, 102, 125, 123, 117, 60, 37, 114, 51, 50, 87, 63, 46, 109, 61, 116, 85, 93, 101, 115, 49, 103, 53, 23, 38, 112, 47, 96, 57, 22, 83, 80, 59, 19, 111, 21, 26, 20, 32, 86, 77, 16, 41, 82, 34, 98, 13, 25, 100, 119, 39, 27, 24, 44, 78, 81, 18, 43, 48, 14, 30, 84, 89, 104, 75, 17, 15, 11, 76, 79, 29, 73, 12, 40, 108, 91, 94, 70, 9, 31, 88, 105, 74, 33, 10, 72, 36, 92, 107, 35, 95, 97, 99, 8, 71, 6, 68, 28, 4, 7, 0, 1, 66, 69, 64, 67, 5, 3, 2, 65], [122, 113, 62, 120, 118, 126, 127, 56, 121, 58, 52, 106, 124, 42, 117, 37, 102, 54, 90, 125, 87, 45, 116, 110, 50, 55, 123, 60, 23, 85, 46, 63, 112, 51, 114, 101, 93, 80, 38, 115, 49, 96, 59, 19, 47, 83, 21, 53, 103, 32, 111, 109, 27, 57, 119, 22, 26, 86, 16, 20, 77, 61, 24, 34, 25, 98, 81, 82, 48, 100, 18, 39, 41, 78, 44, 14, 13, 11, 79, 30, 17, 91, 84, 15, 88, 43, 9, 94, 40, 89, 12, 75, 104, 108, 29, 76, 31, 95, 73, 72, 36, 10, 35, 74, 92, 70, 28, 33, 107, 8, 6, 99, 7, 105, 97, 71, 68, 4, 69, 66, 5, 0, 64, 67, 1, 3, 65, 2], [122, 113, 62, 118, 120, 126, 56, 127, 121, 58, 106, 52, 124, 42, 117, 102, 37, 110, 90, 45, 87, 54, 125, 50, 60, 55, 112, 116, 85, 51, 23, 38, 46, 49, 123, 114, 101, 63, 53, 93, 115, 21, 47, 109, 26, 19, 83, 96, 80, 103, 59, 32, 61, 111, 57, 119, 16, 27, 22, 18, 41, 44, 24, 39, 34, 25, 86, 98, 77, 20, 84, 82, 100, 108, 17, 15, 43, 81, 88, 78, 30, 13, 91, 11, 79, 14, 104, 75, 40, 76, 89, 48, 31, 94, 9, 95, 72, 29, 73, 36, 107, 92, 12, 10, 35, 33, 6, 99, 74, 28, 97, 8, 70, 105, 7, 71, 5, 4, 68, 64, 0, 1, 65, 69, 67, 2, 3, 66], [122, 113, 62, 118, 120, 126, 127, 121, 56, 58, 52, 124, 106, 42, 117, 55, 102, 37, 125, 45, 60, 116, 110, 54, 90, 51, 123, 50, 87, 46, 38, 101, 114, 49, 115, 23, 53, 63, 32, 112, 93, 47, 96, 26, 59, 119, 85, 21, 22, 109, 44, 57, 61, 34, 19, 83, 103, 43, 41, 16, 100, 111, 88, 24, 27, 80, 20, 91, 39, 98, 86, 25, 94, 18, 108, 82, 77, 89, 78, 107, 84, 48, 104, 81, 30, 40, 14, 17, 13, 95, 29, 79, 31, 15, 11, 75, 6, 9, 36, 35, 99, 72, 28, 33, 76, 92, 12, 73, 105, 74, 97, 10, 70, 7, 69, 68, 71, 8, 0, 1, 3, 4, 5, 64, 65, 67, 66, 2], [122, 113, 62, 120, 118, 127, 126, 56, 121, 58, 52, 106, 124, 42, 117, 54, 60, 37, 55, 102, 45, 90, 51, 125, 87, 110, 50, 116, 123, 114, 101, 46, 61, 85, 63, 112, 23, 96, 93, 59, 38, 49, 22, 103, 47, 109, 111, 119, 53, 32, 34, 115, 41, 19, 21, 26, 27, 16, 83, 57, 20, 24, 77, 18, 25, 44, 86, 80, 39, 48, 98, 100, 30, 88, 43, 13, 89, 82, 14, 108, 78, 75, 81, 84, 31, 79, 94, 104, 29, 11, 6, 17, 73, 15, 95, 9, 35, 72, 105, 92, 76, 33, 91, 40, 12, 36, 107, 74, 99, 28, 8, 97, 10, 68, 7, 5, 71, 70, 4, 64, 66, 1, 65, 69, 0, 67, 3, 2], [122, 113, 62, 118, 120, 126, 127, 56, 121, 58, 106, 52, 124, 42, 117, 55, 60, 125, 110, 54, 90, 51, 102, 114, 50, 45, 123, 112, 87, 37, 115, 53, 85, 116, 93, 23, 119, 61, 109, 46, 63, 38, 22, 47, 49, 103, 101, 59, 83, 111, 19, 21, 96, 80, 57, 77, 26, 34, 86, 44, 27, 98, 16, 39, 32, 13, 82, 24, 14, 25, 20, 48, 100, 81, 18, 108, 17, 41, 84, 30, 88, 11, 9, 89, 75, 72, 79, 91, 43, 6, 78, 104, 29, 94, 15, 76, 73, 36, 40, 31, 107, 12, 92, 74, 10, 35, 8, 33, 99, 95, 70, 68, 28, 97, 105, 4, 5, 7, 71, 66, 0, 1, 64, 67, 69, 65, 3, 2], [122, 113, 62, 118, 120, 126, 127, 121, 56, 58, 52, 106, 124, 42, 117, 55, 110, 45, 102, 54, 90, 51, 125, 37, 46, 87, 50, 114, 115, 112, 60, 63, 61, 123, 38, 116, 53, 101, 49, 85, 23, 119, 47, 109, 22, 93, 83, 57, 26, 21, 59, 19, 96, 41, 111, 80, 44, 103, 34, 18, 32, 16, 24, 27, 77, 39, 43, 48, 100, 13, 25, 88, 20, 84, 17, 94, 98, 82, 30, 91, 86, 78, 89, 11, 14, 76, 104, 75, 29, 79, 81, 36, 107, 31, 15, 9, 95, 73, 99, 35, 40, 92, 108, 12, 6, 33, 72, 74, 10, 97, 28, 105, 68, 70, 8, 7, 64, 71, 66, 0, 67, 65, 4, 5, 69, 2, 1, 3], [122, 113, 62, 118, 126, 120, 56, 127, 121, 58, 52, 106, 124, 42, 54, 125, 117, 110, 37, 55, 50, 90, 102, 53, 60, 45, 123, 114, 51, 115, 87, 63, 38, 61, 112, 101, 49, 23, 109, 116, 57, 85, 59, 119, 111, 46, 26, 21, 96, 19, 93, 22, 34, 47, 83, 41, 48, 32, 103, 80, 16, 13, 98, 44, 27, 25, 77, 91, 24, 84, 86, 82, 30, 94, 29, 18, 20, 40, 39, 100, 107, 17, 81, 108, 78, 43, 31, 88, 11, 76, 14, 89, 75, 9, 79, 36, 35, 104, 95, 15, 73, 99, 12, 92, 10, 70, 8, 28, 97, 72, 74, 68, 33, 105, 7, 6, 64, 0, 71, 1, 65, 4, 5, 67, 69, 2, 66, 3], [122, 113, 62, 118, 120, 126, 56, 127, 121, 58, 106, 52, 124, 42, 117, 60, 54, 125, 90, 45, 87, 50, 37, 55, 110, 102, 51, 123, 114, 49, 61, 46, 63, 116, 38, 119, 93, 23, 115, 85, 22, 101, 96, 21, 59, 83, 109, 53, 112, 103, 26, 111, 80, 34, 57, 16, 25, 47, 19, 27, 32, 41, 13, 98, 77, 24, 18, 30, 20, 48, 108, 43, 44, 39, 104, 82, 17, 86, 88, 75, 100, 81, 14, 84, 29, 78, 31, 91, 11, 94, 12, 73, 15, 9, 107, 40, 70, 92, 79, 76, 89, 36, 35, 33, 95, 99, 74, 8, 72, 10, 105, 97, 68, 6, 28, 4, 71, 69, 7, 0, 5, 64, 1, 67, 65, 66, 2, 3], [122, 113, 62, 120, 118, 127, 126, 56, 121, 58, 52, 106, 124, 42, 55, 125, 117, 123, 102, 50, 116, 37, 60, 90, 45, 110, 87, 54, 51, 61, 63, 46, 53, 38, 115, 101, 23, 114, 112, 93, 57, 85, 119, 22, 96, 83, 49, 47, 26, 109, 103, 48, 21, 80, 59, 111, 32, 27, 19, 16, 25, 20, 44, 24, 18, 34, 88, 41, 86, 43, 13, 98, 82, 77, 91, 104, 100, 94, 17, 84, 39, 30, 11, 14, 81, 40, 9, 89, 75, 79, 15, 78, 70, 29, 12, 107, 76, 73, 35, 33, 31, 36, 95, 99, 74, 92, 97, 108, 10, 8, 72, 71, 28, 7, 4, 6, 5, 69, 68, 0, 105, 65, 64, 66, 3, 67, 1, 2], [122, 113, 62, 120, 118, 126, 127, 56, 121, 58, 106, 52, 124, 42, 117, 90, 50, 125, 102, 45, 60, 54, 55, 123, 87, 37, 23, 49, 63, 110, 46, 112, 116, 51, 47, 114, 38, 85, 109, 93, 101, 83, 119, 96, 115, 103, 59, 86, 22, 57, 16, 41, 32, 26, 111, 53, 21, 27, 20, 19, 61, 80, 34, 98, 13, 48, 82, 77, 100, 24, 25, 44, 14, 30, 75, 17, 18, 84, 11, 79, 9, 81, 43, 88, 12, 78, 39, 91, 94, 40, 89, 73, 31, 108, 15, 104, 70, 92, 8, 76, 36, 29, 10, 33, 107, 95, 35, 97, 74, 72, 99, 105, 6, 71, 7, 28, 68, 4, 1, 64, 5, 69, 2, 0, 65, 67, 66, 3], [122, 113, 62, 120, 118, 126, 127, 121, 56, 58, 52, 106, 124, 42, 125, 50, 54, 123, 90, 55, 60, 45, 110, 102, 37, 117, 114, 112, 87, 116, 53, 115, 38, 49, 23, 51, 101, 109, 57, 63, 93, 119, 19, 96, 85, 32, 47, 21, 59, 46, 41, 61, 22, 34, 83, 80, 26, 27, 94, 86, 111, 44, 103, 25, 24, 16, 43, 20, 91, 77, 39, 84, 13, 108, 18, 88, 40, 100, 30, 98, 31, 17, 82, 48, 104, 14, 29, 89, 81, 11, 95, 107, 15, 12, 36, 79, 73, 75, 78, 35, 9, 8, 76, 92, 99, 70, 10, 33, 28, 105, 97, 74, 7, 4, 6, 72, 71, 68, 65, 1, 0, 69, 5, 64, 67, 2, 66, 3], [122, 113, 62, 118, 120, 126, 127, 56, 121, 58, 52, 106, 124, 42, 54, 90, 60, 123, 117, 45, 37, 50, 125, 102, 55, 49, 114, 87, 110, 116, 112, 115, 93, 57, 23, 38, 53, 51, 109, 85, 83, 63, 119, 101, 46, 61, 21, 96, 19, 59, 80, 41, 16, 22, 20, 47, 98, 27, 34, 77, 82, 13, 32, 25, 26, 100, 103, 111, 44, 30, 24, 86, 81, 39, 18, 14, 48, 108, 43, 78, 73, 11, 94, 84, 75, 9, 17, 79, 91, 104, 12, 29, 15, 89, 88, 76, 8, 31, 36, 40, 92, 10, 74, 107, 70, 6, 71, 33, 35, 7, 0, 4, 97, 72, 68, 65, 64, 28, 105, 1, 99, 95, 2, 5, 69, 67, 3, 66], [122, 113, 62, 118, 120, 127, 126, 121, 56, 58, 52, 106, 124, 42, 55, 54, 102, 51, 123, 90, 37, 50, 125, 110, 45, 53, 87, 114, 117, 116, 63, 60, 38, 49, 61, 57, 115, 101, 85, 23, 112, 93, 103, 83, 96, 32, 47, 46, 109, 22, 111, 41, 34, 59, 26, 19, 24, 27, 77, 21, 80, 16, 20, 82, 98, 17, 94, 86, 39, 43, 88, 30, 48, 13, 44, 100, 18, 14, 119, 29, 25, 84, 91, 78, 89, 31, 75, 79, 11, 108, 9, 73, 104, 35, 81, 6, 12, 36, 40, 107, 95, 15, 92, 8, 76, 33, 97, 10, 99, 74, 70, 105, 71, 4, 68, 72, 28, 7, 1, 64, 2, 5, 69, 65, 0, 66, 67, 3], [122, 113, 62, 120, 118, 127, 56, 126, 121, 58, 52, 124, 106, 42, 54, 90, 102, 50, 37, 125, 55, 60, 117, 45, 110, 87, 51, 116, 123, 46, 114, 53, 49, 101, 59, 57, 38, 61, 85, 93, 112, 63, 23, 109, 83, 22, 96, 115, 47, 32, 103, 21, 80, 119, 26, 19, 77, 34, 13, 41, 44, 16, 25, 20, 14, 18, 98, 81, 27, 100, 24, 11, 30, 78, 48, 75, 86, 82, 111, 17, 108, 39, 43, 9, 79, 12, 91, 94, 84, 76, 6, 73, 8, 89, 15, 40, 104, 31, 29, 88, 36, 107, 10, 72, 33, 92, 99, 74, 35, 97, 95, 4, 71, 70, 28, 105, 68, 64, 7, 2, 65, 5, 1, 0, 3, 69, 66, 67], [122, 113, 62, 118, 120, 126, 127, 56, 121, 58, 52, 106, 124, 42, 54, 55, 90, 102, 45, 110, 37, 125, 117, 123, 87, 50, 51, 116, 57, 60, 63, 46, 53, 101, 49, 114, 23, 85, 38, 61, 93, 115, 83, 59, 112, 103, 109, 47, 32, 21, 22, 16, 96, 19, 20, 13, 41, 80, 24, 77, 34, 48, 25, 100, 82, 27, 111, 26, 86, 104, 88, 14, 17, 44, 43, 94, 119, 78, 18, 39, 11, 12, 30, 81, 98, 73, 84, 31, 108, 15, 89, 76, 75, 29, 79, 35, 6, 91, 40, 9, 92, 107, 36, 95, 8, 10, 99, 97, 68, 72, 74, 33, 71, 105, 7, 4, 5, 70, 0, 1, 64, 28, 65, 67, 66, 69, 3, 2], [122, 113, 62, 120, 118, 126, 56, 121, 127, 58, 52, 106, 124, 42, 54, 117, 45, 90, 50, 55, 125, 102, 63, 87, 60, 37, 116, 123, 51, 110, 114, 115, 85, 38, 53, 101, 93, 49, 23, 57, 61, 46, 83, 21, 59, 47, 109, 80, 119, 20, 13, 96, 22, 34, 77, 103, 111, 48, 19, 112, 25, 41, 82, 32, 26, 86, 27, 24, 16, 44, 43, 14, 30, 75, 100, 78, 89, 18, 11, 104, 98, 84, 88, 81, 17, 12, 9, 76, 73, 15, 39, 79, 94, 29, 6, 31, 40, 8, 91, 108, 105, 92, 107, 10, 72, 35, 71, 99, 95, 36, 74, 97, 70, 4, 33, 68, 69, 28, 7, 5, 64, 0, 1, 65, 66, 3, 2, 67]], "model.layers.11.self_attn.q_proj": [[45, 36, 96, 109, 92, 23, 86, 62, 48, 28, 14, 20, 10, 16, 61, 81, 71, 54, 56, 25, 89, 73, 18, 111, 22, 32, 70, 66, 4, 58, 47, 67, 125, 76, 17, 83, 88, 94, 42, 37, 38, 11, 12, 84, 29, 107, 30, 82, 78, 27, 80, 87, 95, 74, 21, 24, 19, 75, 13, 97, 64, 6, 1, 50, 9, 5, 102, 91, 15, 46, 77, 55, 85, 40, 116, 53, 41, 79, 43, 90, 49, 113, 115, 39, 122, 60, 51, 100, 3, 34, 98, 101, 69, 126, 112, 26, 123, 59, 93, 33, 114, 7, 108, 120, 105, 72, 119, 31, 63, 118, 103, 106, 99, 124, 35, 110, 44, 52, 8, 57, 127, 121, 2, 104, 117, 65, 0, 68], [45, 36, 109, 96, 62, 92, 28, 25, 54, 86, 48, 4, 89, 81, 7, 3, 47, 102, 23, 100, 40, 20, 32, 43, 14, 22, 1, 111, 8, 38, 58, 10, 101, 66, 46, 12, 116, 41, 50, 125, 107, 55, 64, 18, 114, 56, 126, 122, 51, 49, 74, 120, 42, 5, 57, 97, 112, 59, 118, 127, 123, 19, 39, 108, 113, 37, 115, 105, 99, 53, 63, 31, 104, 0, 52, 121, 35, 29, 61, 119, 124, 34, 106, 44, 110, 60, 33, 117, 30, 11, 98, 103, 95, 17, 9, 68, 91, 93, 90, 70, 84, 65, 75, 27, 24, 80, 94, 26, 15, 85, 72, 88, 78, 21, 82, 2, 6, 73, 16, 69, 79, 76, 87, 83, 71, 77, 67, 13], [45, 36, 109, 48, 100, 25, 86, 96, 92, 125, 111, 58, 54, 56, 22, 50, 107, 46, 43, 113, 55, 47, 41, 122, 23, 116, 115, 114, 120, 101, 32, 119, 112, 53, 118, 57, 28, 62, 60, 117, 38, 127, 121, 40, 59, 97, 49, 51, 20, 63, 42, 39, 110, 126, 102, 52, 124, 44, 123, 108, 10, 81, 103, 105, 14, 106, 4, 7, 61, 88, 104, 89, 15, 37, 24, 83, 31, 99, 98, 33, 35, 34, 91, 78, 30, 12, 19, 93, 29, 3, 95, 74, 84, 85, 21, 94, 90, 66, 17, 80, 18, 16, 87, 1, 8, 9, 27, 75, 26, 82, 11, 76, 70, 79, 65, 64, 0, 73, 68, 72, 5, 13, 77, 71, 6, 69, 2, 67], [45, 36, 96, 109, 25, 28, 62, 92, 86, 48, 23, 20, 54, 89, 14, 81, 111, 61, 56, 58, 10, 18, 97, 107, 47, 32, 41, 22, 46, 125, 12, 102, 80, 100, 43, 30, 16, 4, 50, 42, 113, 53, 40, 38, 39, 9, 115, 101, 37, 66, 1, 55, 64, 71, 31, 122, 3, 17, 44, 114, 126, 105, 91, 116, 21, 108, 5, 87, 70, 51, 93, 112, 94, 8, 34, 29, 60, 49, 57, 110, 90, 83, 95, 7, 85, 121, 19, 118, 98, 120, 84, 78, 127, 24, 52, 123, 99, 88, 106, 35, 15, 76, 124, 26, 27, 104, 119, 33, 59, 79, 103, 11, 117, 63, 82, 75, 72, 74, 73, 0, 77, 68, 65, 69, 6, 13, 2, 67], [58, 123, 63, 37, 117, 26, 87, 33, 88, 95, 86, 76, 17, 20, 78, 80, 83, 122, 81, 91, 126, 118, 30, 92, 0, 85, 12, 124, 67, 5, 9, 97, 82, 69, 79, 24, 90, 93, 1, 60, 70, 22, 18, 29, 14, 2, 23, 3, 25, 96, 27, 21, 61, 71, 16, 45, 120, 84, 50, 73, 75, 125, 77, 94, 72, 15, 32, 28, 109, 31, 19, 115, 89, 62, 11, 52, 56, 13, 54, 8, 113, 55, 127, 10, 59, 49, 112, 7, 57, 74, 47, 66, 100, 46, 68, 116, 111, 121, 6, 48, 103, 53, 64, 34, 106, 114, 38, 110, 40, 35, 98, 101, 65, 36, 119, 107, 105, 42, 102, 39, 51, 43, 104, 108, 41, 44, 99, 4], [123, 63, 58, 37, 117, 122, 126, 118, 60, 62, 33, 112, 115, 124, 61, 50, 52, 125, 54, 59, 30, 56, 45, 49, 101, 48, 47, 114, 119, 116, 57, 113, 55, 127, 39, 111, 99, 53, 110, 109, 40, 105, 121, 100, 106, 108, 43, 51, 46, 44, 104, 42, 120, 41, 10, 103, 102, 16, 88, 107, 34, 38, 86, 35, 21, 96, 36, 92, 32, 98, 97, 84, 95, 26, 18, 77, 93, 19, 28, 8, 29, 91, 94, 83, 14, 24, 22, 31, 65, 13, 68, 89, 15, 7, 27, 12, 17, 25, 23, 87, 82, 64, 70, 67, 90, 85, 75, 80, 79, 5, 4, 73, 78, 66, 71, 20, 72, 9, 69, 11, 81, 1, 76, 74, 3, 2, 0, 6], [63, 123, 58, 117, 60, 126, 118, 124, 122, 10, 37, 50, 62, 115, 54, 61, 68, 125, 52, 59, 65, 45, 112, 116, 56, 113, 8, 49, 127, 13, 16, 121, 48, 114, 55, 47, 7, 64, 57, 111, 53, 46, 40, 67, 21, 15, 14, 119, 17, 51, 120, 84, 12, 109, 75, 78, 110, 43, 99, 77, 100, 70, 73, 108, 5, 104, 44, 18, 105, 42, 106, 33, 39, 107, 102, 4, 41, 92, 103, 19, 101, 38, 1, 30, 66, 96, 24, 86, 9, 93, 95, 36, 88, 0, 28, 97, 34, 35, 87, 22, 80, 98, 83, 29, 79, 32, 31, 69, 91, 26, 25, 71, 76, 81, 27, 20, 23, 89, 72, 94, 74, 85, 90, 11, 82, 3, 2, 6], [58, 123, 63, 37, 122, 117, 126, 118, 60, 88, 33, 10, 112, 115, 61, 54, 62, 124, 50, 125, 52, 101, 30, 49, 59, 45, 16, 56, 55, 116, 48, 39, 111, 40, 113, 127, 114, 57, 93, 47, 8, 106, 53, 110, 100, 119, 120, 28, 92, 18, 51, 77, 19, 84, 86, 109, 108, 121, 46, 65, 21, 96, 26, 14, 68, 43, 104, 12, 44, 102, 105, 99, 42, 13, 91, 15, 41, 103, 24, 38, 95, 35, 32, 107, 97, 34, 83, 29, 36, 7, 5, 17, 98, 70, 31, 64, 75, 67, 25, 73, 22, 94, 89, 78, 66, 27, 82, 87, 90, 20, 80, 79, 71, 9, 4, 76, 23, 85, 1, 81, 72, 2, 11, 74, 0, 69, 3, 6], [102, 46, 110, 19, 91, 88, 125, 86, 114, 113, 95, 43, 111, 122, 27, 12, 10, 50, 63, 16, 18, 60, 52, 31, 107, 56, 17, 42, 54, 45, 123, 93, 71, 126, 119, 55, 69, 120, 118, 76, 83, 99, 44, 116, 41, 48, 121, 59, 51, 108, 58, 112, 47, 124, 105, 61, 57, 109, 36, 70, 53, 100, 39, 24, 104, 49, 34, 117, 4, 103, 127, 115, 67, 30, 106, 40, 62, 97, 23, 94, 101, 33, 22, 35, 9, 32, 80, 72, 82, 96, 37, 29, 89, 79, 98, 28, 14, 81, 20, 25, 87, 26, 90, 21, 11, 92, 74, 85, 65, 84, 78, 15, 38, 66, 13, 75, 77, 2, 73, 8, 0, 6, 5, 68, 7, 3, 1, 64], [102, 46, 110, 91, 86, 88, 125, 19, 16, 95, 113, 27, 69, 76, 122, 111, 10, 12, 71, 70, 72, 67, 43, 4, 85, 18, 93, 82, 114, 66, 9, 31, 24, 58, 90, 13, 17, 14, 117, 83, 23, 100, 22, 49, 121, 79, 65, 116, 74, 99, 63, 28, 124, 45, 29, 120, 80, 30, 41, 42, 54, 44, 81, 123, 56, 39, 78, 57, 60, 35, 126, 8, 52, 92, 2, 11, 112, 25, 53, 94, 103, 75, 47, 40, 59, 119, 61, 38, 36, 104, 107, 51, 127, 21, 55, 84, 115, 118, 32, 98, 0, 96, 101, 62, 108, 97, 87, 89, 33, 15, 26, 48, 50, 105, 37, 34, 106, 20, 109, 5, 6, 73, 77, 7, 68, 3, 64, 1], [102, 46, 110, 19, 27, 86, 88, 122, 95, 1, 78, 3, 7, 74, 76, 67, 91, 71, 24, 64, 65, 16, 5, 93, 4, 73, 125, 22, 18, 114, 9, 14, 68, 69, 111, 0, 82, 12, 11, 8, 31, 72, 113, 2, 70, 79, 99, 10, 66, 83, 43, 6, 75, 77, 45, 80, 17, 124, 121, 112, 40, 21, 20, 58, 116, 120, 42, 56, 63, 119, 50, 13, 123, 25, 57, 23, 47, 89, 15, 33, 81, 44, 104, 54, 90, 32, 85, 107, 103, 87, 34, 29, 118, 28, 105, 51, 36, 39, 97, 96, 38, 100, 101, 126, 94, 49, 84, 26, 92, 52, 117, 106, 62, 59, 108, 41, 30, 37, 35, 61, 115, 109, 60, 48, 98, 53, 127, 55], [102, 46, 110, 91, 88, 86, 19, 16, 95, 27, 18, 125, 122, 76, 10, 93, 31, 113, 114, 24, 111, 74, 71, 12, 69, 78, 70, 22, 72, 14, 83, 43, 4, 17, 82, 30, 67, 79, 80, 94, 60, 99, 29, 15, 58, 9, 75, 120, 23, 100, 39, 21, 66, 6, 13, 89, 90, 49, 38, 11, 116, 33, 42, 52, 126, 32, 81, 123, 36, 85, 107, 97, 65, 28, 59, 61, 41, 103, 92, 45, 73, 112, 54, 84, 50, 63, 87, 25, 53, 51, 56, 35, 44, 26, 101, 119, 48, 77, 37, 0, 121, 55, 2, 34, 106, 124, 127, 117, 109, 98, 57, 104, 115, 20, 40, 105, 62, 96, 47, 108, 118, 5, 8, 7, 68, 3, 64, 1], [63, 59, 106, 36, 14, 42, 114, 121, 60, 15, 126, 117, 47, 12, 97, 48, 52, 124, 123, 100, 120, 11, 82, 125, 122, 109, 113, 84, 115, 49, 56, 116, 58, 16, 53, 119, 62, 110, 33, 54, 44, 111, 51, 85, 61, 24, 57, 45, 55, 46, 118, 83, 127, 108, 112, 107, 43, 104, 50, 105, 72, 0, 41, 92, 17, 103, 40, 39, 65, 102, 37, 69, 38, 99, 81, 10, 89, 101, 67, 91, 34, 98, 6, 9, 7, 21, 35, 18, 73, 88, 93, 22, 68, 31, 86, 13, 29, 2, 96, 32, 94, 64, 95, 19, 27, 66, 25, 30, 28, 23, 1, 26, 4, 90, 8, 20, 75, 87, 76, 70, 77, 80, 78, 79, 5, 3, 71, 74], [59, 63, 100, 97, 42, 36, 60, 121, 93, 87, 79, 88, 73, 27, 76, 20, 117, 33, 47, 52, 82, 15, 106, 114, 92, 78, 75, 12, 124, 68, 86, 6, 116, 125, 14, 25, 85, 8, 67, 105, 126, 81, 18, 83, 104, 80, 11, 28, 48, 95, 29, 69, 56, 26, 31, 110, 7, 19, 65, 2, 90, 72, 91, 22, 10, 84, 24, 17, 123, 45, 120, 34, 115, 38, 98, 96, 89, 113, 37, 99, 16, 109, 111, 23, 53, 107, 103, 49, 102, 35, 46, 74, 21, 127, 30, 94, 62, 122, 108, 77, 39, 13, 32, 50, 101, 112, 119, 41, 58, 54, 44, 70, 40, 43, 64, 71, 51, 57, 9, 55, 5, 61, 118, 0, 3, 66, 4, 1], [59, 63, 100, 97, 14, 68, 36, 6, 76, 42, 60, 73, 7, 12, 10, 67, 69, 72, 8, 11, 87, 88, 121, 2, 93, 82, 27, 65, 25, 52, 33, 20, 13, 17, 117, 81, 16, 15, 47, 78, 106, 79, 70, 74, 29, 83, 21, 80, 86, 22, 95, 77, 116, 126, 85, 48, 71, 105, 114, 124, 123, 91, 89, 26, 9, 64, 5, 84, 125, 23, 115, 110, 19, 90, 56, 75, 104, 31, 92, 24, 49, 3, 113, 122, 120, 109, 18, 127, 111, 119, 45, 46, 107, 98, 53, 28, 58, 38, 35, 54, 37, 44, 51, 55, 62, 94, 103, 41, 50, 96, 34, 61, 39, 40, 57, 43, 108, 0, 112, 118, 102, 4, 32, 99, 30, 101, 66, 1], [59, 63, 100, 97, 36, 42, 121, 106, 60, 114, 93, 27, 73, 20, 117, 33, 78, 76, 52, 12, 125, 123, 88, 126, 86, 48, 124, 116, 120, 87, 6, 47, 113, 110, 122, 99, 67, 82, 58, 68, 111, 51, 115, 79, 25, 8, 109, 49, 53, 54, 62, 37, 119, 118, 55, 104, 56, 112, 46, 45, 69, 81, 98, 14, 92, 18, 57, 105, 44, 29, 61, 7, 127, 108, 10, 102, 2, 50, 15, 31, 107, 80, 22, 65, 43, 84, 35, 85, 39, 90, 41, 34, 83, 91, 75, 17, 103, 38, 95, 96, 94, 32, 28, 101, 24, 26, 40, 30, 89, 23, 11, 19, 5, 72, 74, 13, 9, 16, 64, 21, 0, 71, 77, 70, 66, 3, 4, 1], [101, 121, 116, 94, 57, 17, 71, 19, 2, 52, 78, 21, 3, 5, 75, 24, 64, 88, 9, 66, 76, 69, 124, 30, 1, 7, 80, 6, 118, 18, 115, 127, 117, 0, 67, 53, 11, 63, 70, 37, 120, 50, 125, 14, 33, 87, 72, 73, 29, 60, 83, 105, 97, 12, 82, 104, 54, 61, 79, 91, 59, 81, 16, 74, 122, 20, 4, 96, 43, 93, 46, 51, 113, 22, 106, 112, 77, 110, 65, 27, 40, 86, 15, 85, 111, 45, 13, 58, 126, 25, 26, 23, 55, 41, 84, 39, 48, 42, 89, 107, 102, 28, 31, 68, 10, 90, 62, 8, 92, 103, 123, 114, 49, 108, 99, 44, 56, 35, 100, 109, 34, 47, 38, 95, 32, 98, 119, 36], [101, 121, 116, 94, 57, 24, 21, 52, 19, 17, 78, 76, 80, 30, 88, 9, 75, 5, 37, 7, 15, 71, 11, 18, 69, 97, 29, 67, 33, 124, 63, 118, 66, 82, 117, 53, 127, 3, 73, 16, 115, 105, 22, 0, 120, 50, 60, 46, 86, 6, 81, 14, 2, 72, 1, 28, 26, 54, 83, 106, 20, 45, 51, 59, 43, 79, 122, 12, 85, 89, 91, 74, 77, 90, 39, 58, 104, 87, 125, 42, 13, 114, 41, 70, 110, 113, 112, 23, 61, 25, 96, 10, 109, 92, 93, 56, 95, 108, 31, 84, 49, 123, 32, 40, 102, 107, 62, 48, 126, 98, 4, 8, 99, 27, 103, 65, 111, 100, 35, 47, 38, 34, 55, 64, 119, 44, 36, 68], [101, 116, 121, 94, 52, 37, 30, 21, 57, 24, 19, 85, 80, 33, 117, 115, 105, 16, 50, 127, 124, 114, 62, 113, 26, 104, 59, 53, 125, 58, 118, 110, 60, 43, 51, 39, 49, 95, 63, 97, 107, 123, 54, 120, 55, 41, 45, 61, 91, 76, 17, 102, 46, 48, 42, 112, 56, 111, 28, 126, 122, 109, 38, 108, 103, 89, 119, 78, 36, 32, 73, 40, 22, 93, 99, 47, 44, 100, 106, 20, 35, 23, 27, 96, 15, 12, 18, 98, 87, 34, 92, 31, 86, 29, 88, 83, 79, 90, 25, 81, 82, 7, 9, 84, 14, 74, 5, 11, 66, 70, 72, 8, 75, 10, 13, 67, 77, 4, 6, 69, 2, 71, 3, 64, 68, 65, 0, 1], [101, 116, 121, 94, 57, 21, 24, 19, 76, 52, 17, 80, 78, 30, 6, 75, 9, 3, 33, 88, 71, 37, 124, 2, 127, 67, 29, 117, 97, 60, 115, 69, 83, 118, 46, 7, 18, 53, 64, 63, 50, 1, 23, 0, 16, 54, 90, 22, 106, 73, 70, 82, 14, 68, 12, 77, 26, 81, 122, 72, 112, 85, 61, 43, 13, 93, 58, 59, 104, 86, 98, 27, 79, 120, 42, 105, 20, 11, 74, 96, 84, 28, 39, 35, 125, 40, 65, 91, 25, 110, 123, 15, 107, 95, 41, 10, 92, 111, 32, 51, 99, 89, 87, 109, 103, 31, 48, 45, 62, 36, 126, 34, 113, 38, 44, 4, 55, 114, 49, 47, 56, 8, 100, 119, 102, 108, 5, 66], [40, 126, 97, 25, 89, 95, 87, 86, 28, 92, 121, 84, 15, 83, 115, 63, 82, 122, 55, 17, 49, 44, 53, 37, 119, 62, 29, 50, 38, 104, 45, 111, 91, 107, 61, 101, 46, 43, 41, 35, 9, 114, 123, 120, 102, 118, 23, 52, 117, 30, 54, 58, 109, 60, 12, 124, 48, 75, 57, 13, 42, 47, 116, 36, 103, 32, 98, 20, 34, 56, 105, 18, 85, 39, 51, 127, 110, 125, 90, 106, 16, 67, 112, 21, 59, 99, 1, 31, 96, 108, 113, 5, 80, 27, 74, 93, 79, 22, 100, 94, 71, 88, 11, 24, 0, 19, 26, 68, 77, 33, 73, 81, 6, 72, 70, 64, 14, 4, 66, 78, 7, 2, 69, 76, 65, 10, 8, 3], [40, 126, 97, 95, 89, 92, 86, 25, 120, 61, 15, 122, 28, 87, 82, 52, 83, 62, 17, 53, 84, 9, 60, 29, 121, 45, 63, 59, 12, 58, 118, 16, 107, 54, 21, 123, 114, 116, 85, 49, 47, 13, 77, 23, 117, 30, 8, 55, 33, 20, 115, 46, 127, 125, 50, 18, 79, 119, 36, 27, 56, 22, 57, 111, 80, 112, 90, 124, 44, 102, 110, 113, 106, 43, 93, 101, 91, 31, 41, 108, 109, 48, 38, 72, 24, 51, 35, 81, 68, 104, 4, 70, 19, 103, 71, 75, 100, 88, 96, 105, 37, 76, 6, 42, 2, 98, 39, 11, 78, 34, 26, 73, 14, 32, 99, 69, 66, 10, 94, 74, 1, 5, 0, 67, 64, 7, 3, 65], [126, 40, 97, 95, 86, 121, 101, 28, 82, 122, 62, 58, 120, 25, 53, 115, 45, 61, 102, 111, 84, 52, 54, 44, 114, 127, 60, 49, 92, 59, 118, 56, 63, 125, 50, 83, 46, 37, 112, 47, 113, 87, 89, 15, 43, 123, 57, 117, 51, 119, 55, 100, 29, 18, 12, 9, 116, 33, 108, 68, 30, 21, 109, 35, 48, 107, 110, 27, 73, 71, 91, 66, 38, 69, 80, 36, 93, 78, 124, 106, 19, 72, 10, 103, 99, 41, 42, 76, 14, 79, 65, 24, 90, 77, 23, 85, 16, 17, 31, 2, 13, 20, 105, 104, 5, 7, 34, 26, 3, 67, 64, 39, 98, 11, 1, 32, 94, 22, 75, 96, 88, 70, 74, 0, 8, 4, 81, 6], [126, 40, 97, 121, 61, 122, 71, 67, 49, 62, 89, 1, 54, 101, 60, 25, 53, 52, 59, 5, 45, 43, 95, 127, 63, 58, 114, 125, 120, 92, 28, 47, 56, 68, 50, 12, 46, 0, 86, 118, 84, 102, 111, 55, 57, 112, 82, 115, 117, 116, 123, 113, 66, 88, 108, 64, 81, 15, 73, 44, 51, 11, 109, 107, 77, 105, 119, 110, 106, 100, 30, 87, 24, 48, 42, 38, 19, 9, 41, 76, 14, 124, 17, 69, 2, 34, 36, 18, 27, 6, 4, 20, 78, 94, 35, 13, 72, 32, 39, 23, 103, 37, 99, 98, 16, 91, 85, 80, 3, 7, 74, 29, 83, 70, 104, 75, 21, 22, 93, 65, 10, 96, 31, 90, 26, 33, 79, 8], [104, 111, 30, 119, 26, 19, 127, 24, 94, 21, 79, 62, 9, 113, 61, 123, 12, 40, 81, 32, 86, 122, 47, 87, 118, 63, 78, 114, 54, 83, 68, 36, 49, 110, 89, 88, 126, 107, 90, 85, 100, 105, 115, 74, 56, 96, 71, 43, 4, 18, 70, 15, 57, 46, 17, 76, 125, 55, 121, 51, 120, 106, 42, 23, 37, 80, 117, 73, 3, 14, 97, 20, 112, 1, 103, 53, 92, 69, 45, 50, 5, 95, 101, 59, 66, 48, 38, 98, 39, 41, 22, 124, 102, 93, 28, 91, 108, 116, 10, 75, 52, 82, 58, 33, 99, 67, 77, 64, 35, 84, 8, 34, 31, 16, 60, 29, 44, 13, 2, 109, 27, 72, 11, 7, 25, 0, 65, 6], [104, 111, 119, 30, 62, 127, 24, 19, 21, 26, 94, 79, 12, 113, 47, 9, 123, 81, 122, 40, 32, 54, 70, 1, 61, 96, 86, 87, 49, 90, 118, 4, 91, 107, 67, 110, 15, 63, 20, 57, 120, 82, 105, 23, 64, 68, 43, 126, 85, 124, 103, 22, 56, 114, 45, 121, 36, 53, 46, 42, 55, 78, 58, 52, 112, 100, 109, 117, 18, 88, 89, 108, 71, 59, 98, 101, 2, 17, 115, 28, 83, 106, 74, 13, 41, 37, 34, 80, 27, 48, 66, 5, 50, 116, 125, 51, 75, 95, 60, 38, 84, 76, 72, 93, 8, 39, 99, 44, 73, 35, 102, 33, 3, 29, 16, 6, 10, 7, 0, 14, 31, 25, 97, 92, 11, 77, 69, 65], [104, 111, 119, 30, 127, 24, 62, 26, 94, 19, 21, 79, 113, 9, 61, 32, 122, 107, 12, 81, 86, 87, 54, 123, 40, 63, 47, 36, 105, 49, 70, 103, 117, 43, 114, 110, 56, 118, 100, 57, 126, 41, 83, 108, 85, 89, 96, 120, 90, 46, 37, 53, 28, 45, 95, 68, 88, 48, 42, 74, 115, 51, 112, 121, 39, 101, 38, 55, 1, 18, 125, 15, 4, 20, 78, 58, 76, 23, 17, 35, 91, 99, 59, 13, 98, 116, 52, 106, 44, 124, 93, 109, 50, 34, 102, 29, 97, 71, 67, 73, 64, 60, 82, 31, 33, 92, 72, 22, 14, 77, 80, 27, 66, 25, 3, 75, 84, 10, 5, 16, 2, 11, 8, 69, 0, 6, 7, 65], [104, 111, 30, 119, 24, 26, 19, 94, 79, 127, 81, 12, 61, 9, 21, 62, 32, 113, 78, 40, 118, 87, 90, 47, 96, 91, 54, 22, 123, 88, 100, 36, 4, 23, 85, 86, 18, 124, 15, 89, 107, 74, 110, 80, 63, 95, 72, 17, 53, 43, 122, 57, 83, 126, 98, 70, 120, 114, 45, 50, 49, 56, 20, 42, 37, 58, 125, 101, 34, 102, 105, 46, 112, 55, 14, 103, 35, 48, 16, 84, 7, 29, 76, 115, 97, 93, 25, 31, 99, 5, 82, 27, 51, 75, 41, 117, 109, 39, 59, 121, 38, 28, 44, 33, 92, 71, 2, 106, 52, 116, 108, 11, 60, 10, 6, 13, 68, 67, 1, 8, 77, 73, 64, 66, 69, 3, 0, 65], [59, 119, 46, 104, 51, 113, 61, 48, 56, 115, 98, 120, 116, 109, 57, 52, 49, 55, 53, 124, 60, 126, 63, 42, 112, 117, 122, 110, 50, 54, 118, 58, 111, 45, 47, 107, 121, 62, 106, 114, 123, 37, 125, 105, 41, 43, 127, 44, 39, 108, 36, 101, 38, 30, 24, 103, 33, 93, 86, 96, 89, 40, 35, 92, 102, 26, 88, 32, 95, 90, 97, 99, 100, 29, 27, 34, 80, 28, 91, 25, 20, 94, 31, 22, 87, 84, 16, 17, 18, 82, 85, 75, 13, 77, 23, 19, 21, 11, 68, 5, 83, 8, 76, 3, 14, 78, 9, 7, 81, 6, 79, 73, 2, 74, 0, 4, 1, 10, 66, 64, 15, 72, 12, 69, 65, 67, 70, 71], [46, 59, 104, 51, 53, 98, 117, 93, 91, 113, 48, 119, 49, 36, 37, 26, 114, 121, 55, 89, 61, 127, 110, 41, 30, 33, 120, 23, 57, 112, 109, 97, 54, 90, 115, 38, 107, 50, 63, 44, 83, 122, 56, 111, 94, 78, 84, 95, 106, 82, 52, 125, 118, 108, 45, 60, 99, 123, 116, 39, 102, 62, 43, 58, 27, 101, 100, 103, 105, 47, 32, 34, 35, 124, 25, 126, 96, 92, 21, 28, 18, 87, 42, 88, 80, 24, 75, 85, 22, 31, 40, 86, 81, 29, 15, 19, 13, 17, 20, 16, 77, 11, 14, 72, 79, 66, 9, 76, 74, 68, 8, 4, 73, 2, 64, 0, 70, 6, 3, 12, 5, 65, 1, 69, 71, 10, 67, 7], [46, 59, 48, 104, 51, 119, 61, 117, 113, 123, 98, 53, 55, 57, 56, 109, 126, 50, 114, 120, 37, 116, 63, 52, 93, 115, 60, 112, 49, 124, 122, 127, 42, 54, 110, 121, 118, 106, 44, 58, 45, 47, 41, 105, 111, 125, 43, 38, 62, 32, 29, 85, 89, 108, 36, 103, 107, 35, 31, 92, 39, 90, 95, 33, 84, 102, 91, 86, 97, 101, 96, 80, 26, 100, 99, 21, 28, 82, 30, 40, 25, 88, 34, 23, 20, 27, 24, 78, 68, 18, 22, 94, 75, 87, 16, 17, 11, 3, 77, 2, 83, 8, 14, 13, 0, 72, 5, 4, 65, 6, 9, 19, 73, 81, 74, 1, 79, 66, 70, 15, 10, 7, 76, 69, 67, 64, 71, 12], [104, 59, 98, 46, 83, 79, 23, 21, 81, 37, 76, 74, 48, 91, 8, 5, 69, 12, 7, 72, 78, 119, 71, 2, 3, 64, 66, 110, 67, 15, 84, 40, 19, 26, 25, 62, 10, 85, 14, 16, 27, 17, 18, 89, 87, 93, 13, 75, 31, 90, 20, 1, 51, 30, 94, 0, 73, 88, 77, 22, 86, 11, 29, 24, 70, 65, 92, 82, 9, 80, 4, 53, 6, 28, 32, 68, 96, 33, 95, 52, 61, 100, 108, 34, 99, 116, 115, 112, 39, 35, 102, 106, 56, 120, 54, 97, 36, 55, 101, 103, 113, 45, 111, 117, 50, 44, 38, 118, 124, 127, 122, 123, 109, 126, 58, 49, 121, 105, 60, 41, 42, 43, 47, 114, 57, 63, 107, 125]], "model.layers.11.self_attn.k_proj": [[109, 45, 32, 100, 54, 48, 92, 25, 56, 20, 23, 81, 47, 14, 62, 58, 107, 22, 16, 61, 112, 76, 85, 18, 69, 0, 68, 65, 120, 125, 86, 30, 55, 10, 73, 31, 122, 42, 59, 12, 7, 50, 71, 123, 72, 111, 118, 51, 2, 57, 43, 46, 126, 116, 38, 19, 3, 53, 113, 63, 40, 37, 13, 114, 102, 41, 124, 60, 108, 95, 119, 115, 93, 104, 127, 117, 44, 110, 39, 121, 49, 80, 52, 94, 11, 33, 101, 106, 97, 75, 99, 90, 91, 87, 96, 98, 17, 35, 105, 15, 28, 74, 77, 6, 103, 34, 29, 67, 27, 26, 24, 84, 5, 66, 21, 83, 88, 79, 36, 9, 82, 8, 64, 89, 78, 1, 70, 4], [58, 123, 63, 101, 126, 118, 117, 31, 61, 60, 50, 124, 86, 125, 52, 91, 62, 115, 20, 17, 121, 97, 55, 47, 56, 113, 49, 120, 111, 127, 112, 122, 59, 57, 48, 54, 53, 116, 45, 26, 119, 114, 78, 46, 76, 109, 51, 110, 108, 88, 73, 43, 75, 107, 44, 106, 32, 40, 103, 66, 42, 69, 37, 104, 41, 80, 105, 82, 39, 102, 99, 79, 36, 38, 89, 100, 70, 98, 93, 87, 96, 30, 34, 35, 33, 92, 19, 72, 27, 94, 25, 21, 74, 83, 29, 67, 13, 28, 64, 65, 5, 81, 95, 6, 90, 23, 71, 24, 11, 7, 16, 4, 9, 18, 85, 12, 8, 15, 10, 22, 84, 77, 1, 68, 0, 3, 14, 2], [110, 38, 46, 86, 91, 78, 88, 125, 18, 122, 19, 16, 31, 76, 111, 7, 114, 73, 3, 74, 1, 43, 5, 113, 64, 45, 121, 58, 49, 47, 120, 93, 56, 124, 6, 9, 42, 68, 17, 40, 15, 116, 119, 57, 63, 72, 62, 50, 13, 44, 123, 112, 99, 54, 103, 67, 35, 27, 118, 61, 59, 75, 11, 105, 41, 36, 2, 126, 108, 101, 107, 96, 90, 30, 70, 51, 55, 20, 106, 115, 82, 104, 8, 127, 21, 48, 60, 109, 100, 52, 117, 81, 87, 29, 34, 26, 94, 97, 37, 89, 79, 98, 53, 32, 66, 33, 24, 85, 39, 92, 95, 10, 28, 22, 80, 65, 25, 23, 84, 69, 14, 12, 71, 83, 0, 77, 4, 102], [59, 63, 36, 33, 121, 87, 117, 124, 93, 19, 106, 25, 60, 126, 95, 114, 81, 27, 26, 56, 125, 20, 116, 113, 47, 48, 42, 21, 53, 119, 52, 110, 86, 49, 80, 54, 123, 46, 16, 79, 58, 77, 111, 10, 98, 104, 100, 115, 122, 18, 45, 120, 55, 88, 78, 39, 51, 50, 7, 101, 40, 13, 118, 76, 41, 112, 108, 127, 105, 61, 35, 62, 109, 22, 102, 44, 34, 9, 43, 57, 38, 4, 107, 103, 69, 72, 28, 99, 37, 70, 14, 32, 74, 85, 92, 75, 3, 11, 0, 94, 96, 91, 30, 31, 73, 15, 97, 66, 17, 67, 89, 82, 71, 90, 24, 83, 6, 29, 65, 1, 23, 68, 12, 84, 2, 8, 64, 5], [121, 116, 37, 21, 24, 80, 78, 9, 75, 19, 17, 30, 57, 76, 64, 71, 69, 6, 2, 3, 97, 46, 53, 94, 124, 66, 127, 118, 65, 60, 74, 1, 59, 120, 115, 5, 63, 70, 43, 4, 122, 125, 79, 41, 93, 82, 73, 40, 50, 13, 91, 117, 42, 8, 104, 113, 88, 61, 7, 101, 0, 90, 67, 89, 33, 112, 15, 58, 92, 12, 28, 110, 107, 45, 87, 114, 34, 27, 86, 32, 108, 111, 22, 96, 99, 26, 109, 126, 55, 54, 23, 39, 31, 68, 51, 11, 10, 48, 105, 49, 84, 47, 123, 18, 72, 62, 106, 35, 29, 38, 25, 100, 20, 95, 102, 119, 44, 56, 98, 103, 14, 77, 36, 52, 16, 83, 85, 81], [126, 104, 33, 86, 28, 122, 121, 31, 62, 54, 118, 25, 60, 58, 17, 61, 53, 115, 87, 52, 127, 120, 50, 15, 63, 56, 114, 47, 49, 125, 108, 84, 113, 112, 3, 59, 100, 57, 64, 45, 46, 65, 37, 9, 82, 111, 42, 12, 109, 110, 117, 43, 75, 92, 123, 55, 116, 7, 70, 69, 107, 48, 51, 99, 106, 13, 29, 41, 83, 16, 124, 85, 105, 44, 0, 38, 102, 74, 119, 1, 39, 91, 94, 36, 8, 103, 66, 68, 19, 78, 89, 2, 4, 26, 34, 98, 101, 88, 32, 27, 96, 93, 11, 35, 81, 71, 95, 21, 90, 10, 24, 30, 6, 80, 14, 18, 22, 97, 72, 77, 23, 67, 79, 76, 20, 73, 5, 40], [40, 111, 47, 119, 94, 24, 127, 113, 26, 21, 62, 19, 110, 81, 122, 78, 114, 49, 123, 79, 118, 56, 63, 32, 12, 55, 120, 121, 105, 57, 61, 58, 126, 115, 43, 108, 54, 59, 42, 107, 87, 106, 9, 46, 18, 45, 125, 124, 104, 112, 48, 69, 116, 117, 51, 53, 37, 7, 39, 11, 41, 100, 60, 52, 109, 50, 38, 16, 36, 80, 44, 86, 73, 96, 28, 103, 99, 95, 102, 3, 101, 33, 91, 10, 30, 93, 89, 0, 97, 65, 71, 4, 35, 90, 13, 75, 76, 98, 23, 20, 27, 34, 82, 92, 31, 29, 15, 68, 22, 88, 25, 8, 84, 17, 74, 2, 14, 77, 66, 83, 85, 6, 72, 64, 70, 5, 1, 67], [40, 59, 46, 34, 110, 74, 76, 79, 7, 83, 81, 6, 21, 8, 5, 78, 2, 23, 93, 3, 0, 48, 1, 119, 9, 62, 27, 91, 117, 52, 115, 116, 77, 30, 45, 4, 89, 57, 49, 90, 64, 84, 56, 113, 126, 58, 122, 55, 120, 51, 47, 123, 118, 65, 108, 54, 109, 60, 42, 121, 124, 127, 112, 53, 61, 92, 63, 43, 125, 50, 44, 68, 37, 101, 38, 80, 111, 107, 15, 97, 72, 70, 88, 75, 114, 82, 87, 73, 105, 31, 41, 36, 102, 22, 18, 33, 95, 66, 86, 28, 103, 99, 106, 12, 39, 96, 104, 85, 11, 13, 94, 35, 67, 98, 32, 25, 29, 100, 24, 71, 20, 17, 19, 16, 26, 14, 10, 69]], "model.layers.11.self_attn.qk_proj": [[59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 45, 109, 54, 117, 104, 60, 57, 40, 94, 56, 37, 125, 122, 62, 61, 101, 113, 114, 36, 88, 48, 24, 118, 127, 119, 27, 47, 22, 124, 83, 21, 49, 30, 19, 78, 53, 115, 76, 17, 14, 55, 120, 100, 51, 89, 81, 12, 85, 33, 52, 102, 95, 80, 50, 86, 32, 43, 16, 28, 97, 108, 87, 91, 9, 90, 23, 112, 42, 7, 92, 71, 38, 29, 31, 44, 93, 25, 41, 73, 107, 15, 10, 82, 96, 75, 69, 74, 34, 20, 79, 3, 11, 106, 5, 18, 67, 2, 84, 105, 0, 26, 6, 66, 98, 64, 99, 39, 70, 68, 72, 103, 13, 77, 4, 1, 65, 35, 8], [59, 126, 46, 63, 121, 111, 116, 110, 58, 45, 123, 109, 54, 117, 60, 104, 57, 94, 40, 56, 113, 122, 101, 127, 37, 36, 119, 114, 88, 118, 125, 24, 61, 62, 22, 49, 27, 53, 30, 48, 76, 52, 21, 83, 115, 124, 47, 55, 78, 120, 100, 19, 81, 51, 95, 43, 14, 12, 42, 86, 108, 17, 85, 50, 33, 97, 80, 91, 90, 16, 89, 28, 9, 23, 112, 102, 92, 44, 38, 25, 107, 7, 31, 20, 79, 32, 87, 93, 71, 34, 73, 29, 105, 15, 96, 10, 106, 18, 69, 41, 74, 6, 82, 26, 5, 75, 84, 11, 0, 98, 2, 64, 66, 99, 67, 72, 39, 103, 3, 1, 70, 65, 68, 77, 4, 35, 8, 13], [59, 126, 121, 46, 63, 111, 110, 116, 58, 45, 123, 109, 54, 117, 104, 40, 94, 122, 61, 37, 101, 36, 56, 125, 57, 60, 113, 114, 24, 88, 62, 119, 118, 48, 127, 124, 27, 53, 30, 47, 21, 100, 52, 51, 22, 83, 19, 55, 115, 50, 49, 33, 102, 95, 43, 42, 76, 28, 78, 120, 108, 17, 86, 91, 89, 85, 38, 112, 106, 81, 32, 12, 23, 97, 14, 92, 93, 31, 90, 80, 87, 44, 107, 16, 29, 96, 25, 9, 73, 105, 34, 41, 71, 20, 7, 79, 84, 98, 10, 99, 26, 69, 11, 82, 18, 74, 15, 75, 6, 5, 66, 0, 103, 64, 2, 3, 67, 39, 70, 1, 65, 68, 35, 72, 4, 8, 13, 77], [59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 45, 109, 54, 117, 60, 104, 40, 57, 94, 36, 37, 122, 61, 56, 62, 114, 127, 101, 113, 24, 49, 119, 88, 118, 124, 48, 30, 27, 52, 47, 115, 125, 120, 95, 19, 22, 55, 21, 53, 50, 100, 83, 42, 28, 89, 14, 76, 108, 78, 102, 43, 81, 90, 17, 51, 12, 97, 92, 33, 85, 91, 32, 16, 31, 86, 80, 44, 38, 93, 87, 105, 107, 23, 64, 25, 73, 71, 96, 75, 29, 106, 112, 26, 9, 41, 7, 69, 2, 67, 74, 20, 79, 34, 0, 66, 11, 18, 98, 99, 82, 10, 3, 39, 5, 84, 6, 15, 103, 70, 68, 72, 35, 65, 4, 1, 8, 13, 77], [59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 45, 109, 117, 54, 104, 40, 122, 61, 57, 60, 56, 94, 127, 36, 37, 118, 62, 113, 101, 119, 24, 125, 114, 88, 48, 49, 52, 124, 83, 21, 30, 120, 19, 76, 22, 115, 47, 55, 27, 78, 53, 12, 95, 85, 80, 28, 100, 42, 43, 50, 17, 51, 16, 102, 81, 89, 108, 33, 14, 112, 23, 97, 91, 92, 90, 32, 107, 25, 86, 7, 31, 71, 105, 93, 29, 44, 73, 9, 87, 41, 96, 0, 38, 15, 75, 106, 64, 82, 79, 20, 74, 66, 2, 10, 98, 67, 18, 84, 69, 26, 99, 34, 70, 11, 65, 3, 5, 103, 6, 39, 8, 1, 72, 68, 13, 77, 4, 35], [59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 45, 109, 54, 117, 104, 60, 40, 57, 127, 122, 56, 113, 94, 114, 61, 36, 118, 37, 48, 101, 62, 88, 24, 119, 49, 22, 125, 83, 30, 52, 21, 27, 78, 19, 124, 12, 120, 76, 53, 81, 85, 95, 47, 14, 97, 17, 80, 115, 55, 43, 42, 16, 89, 28, 50, 51, 108, 100, 92, 102, 33, 86, 91, 25, 90, 107, 32, 73, 23, 9, 31, 96, 87, 79, 71, 7, 105, 44, 29, 20, 15, 93, 74, 82, 10, 112, 75, 69, 38, 99, 41, 26, 67, 18, 34, 11, 106, 70, 103, 84, 66, 5, 98, 3, 0, 64, 68, 6, 8, 2, 65, 39, 4, 1, 35, 77, 13, 72], [59, 126, 46, 63, 121, 116, 111, 110, 58, 123, 45, 109, 117, 54, 60, 104, 94, 40, 57, 36, 37, 113, 101, 24, 114, 127, 122, 88, 48, 21, 61, 118, 119, 56, 83, 19, 125, 27, 22, 62, 78, 30, 53, 49, 14, 17, 52, 95, 85, 76, 81, 12, 120, 124, 89, 86, 16, 100, 80, 55, 50, 28, 32, 115, 42, 43, 47, 112, 33, 92, 91, 87, 9, 102, 23, 97, 107, 51, 108, 38, 90, 71, 20, 96, 93, 31, 7, 11, 82, 75, 73, 25, 44, 29, 5, 41, 74, 18, 15, 34, 99, 10, 79, 105, 26, 67, 106, 98, 84, 69, 3, 39, 70, 64, 103, 2, 66, 8, 1, 6, 13, 0, 68, 4, 35, 72, 65, 77], [59, 126, 46, 121, 63, 116, 111, 110, 58, 45, 123, 109, 104, 40, 54, 94, 117, 60, 113, 24, 101, 37, 122, 36, 27, 88, 83, 57, 114, 127, 61, 22, 21, 47, 30, 52, 19, 118, 76, 56, 119, 125, 12, 62, 53, 48, 124, 78, 86, 81, 85, 49, 95, 17, 100, 14, 89, 115, 80, 91, 33, 55, 16, 28, 120, 50, 102, 92, 23, 42, 87, 43, 51, 7, 90, 32, 44, 9, 112, 108, 20, 38, 25, 82, 11, 97, 93, 75, 31, 29, 71, 15, 79, 73, 107, 106, 18, 10, 26, 5, 84, 64, 34, 41, 69, 74, 2, 70, 96, 0, 105, 3, 98, 8, 66, 99, 103, 67, 39, 1, 65, 4, 72, 6, 13, 68, 77, 35], [59, 126, 121, 46, 63, 111, 116, 110, 58, 45, 123, 109, 117, 104, 40, 94, 54, 60, 56, 36, 57, 122, 88, 101, 62, 37, 24, 114, 61, 119, 27, 49, 125, 118, 83, 113, 22, 19, 30, 21, 52, 124, 48, 100, 47, 127, 78, 55, 51, 89, 120, 85, 115, 17, 14, 76, 81, 12, 43, 86, 53, 80, 33, 95, 102, 38, 91, 28, 50, 42, 90, 108, 16, 23, 32, 107, 112, 92, 106, 93, 97, 44, 87, 31, 73, 9, 34, 7, 71, 15, 25, 18, 29, 41, 20, 10, 96, 11, 79, 105, 5, 82, 69, 98, 75, 26, 103, 84, 2, 99, 64, 0, 74, 66, 70, 67, 65, 3, 8, 6, 1, 4, 39, 72, 68, 77, 13, 35], [59, 126, 46, 121, 63, 111, 116, 110, 58, 123, 45, 109, 104, 54, 56, 60, 117, 40, 122, 57, 113, 94, 101, 37, 61, 62, 88, 36, 24, 114, 125, 127, 27, 119, 48, 83, 118, 19, 49, 47, 115, 21, 52, 22, 30, 120, 124, 76, 78, 85, 100, 51, 43, 86, 33, 12, 17, 14, 89, 80, 53, 55, 42, 81, 95, 108, 91, 92, 32, 16, 28, 97, 23, 50, 73, 31, 87, 38, 9, 90, 7, 102, 93, 15, 25, 105, 71, 112, 96, 79, 44, 107, 41, 69, 10, 18, 29, 67, 75, 84, 74, 11, 34, 3, 82, 20, 6, 106, 103, 26, 66, 64, 98, 0, 2, 5, 70, 8, 99, 1, 13, 68, 4, 39, 77, 72, 65, 35], [59, 126, 63, 121, 46, 111, 110, 116, 58, 123, 45, 109, 104, 117, 122, 54, 40, 113, 56, 61, 60, 125, 94, 62, 57, 37, 48, 114, 101, 24, 127, 88, 118, 36, 124, 119, 49, 21, 115, 47, 19, 52, 78, 83, 120, 30, 33, 22, 27, 76, 95, 12, 100, 50, 80, 28, 17, 85, 53, 14, 102, 43, 42, 16, 51, 81, 97, 86, 89, 73, 55, 91, 23, 25, 92, 108, 41, 32, 90, 7, 106, 38, 74, 9, 71, 29, 107, 66, 96, 79, 15, 10, 18, 6, 31, 87, 11, 34, 69, 93, 82, 20, 2, 0, 5, 75, 105, 64, 98, 26, 112, 84, 70, 67, 8, 3, 44, 13, 99, 4, 68, 1, 65, 72, 103, 77, 39, 35], [59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 109, 45, 117, 104, 54, 40, 122, 60, 118, 56, 61, 113, 88, 57, 62, 114, 94, 48, 36, 101, 127, 37, 124, 24, 49, 19, 125, 21, 78, 83, 119, 30, 12, 22, 52, 115, 27, 76, 80, 17, 14, 47, 81, 16, 95, 120, 85, 86, 28, 100, 91, 33, 89, 50, 43, 102, 55, 108, 97, 51, 71, 53, 73, 32, 23, 90, 42, 7, 92, 10, 9, 11, 15, 38, 25, 41, 107, 82, 112, 105, 20, 87, 31, 74, 29, 18, 79, 96, 106, 69, 75, 5, 6, 0, 93, 84, 34, 3, 99, 67, 64, 1, 44, 98, 2, 26, 66, 65, 8, 72, 4, 70, 103, 13, 77, 39, 68, 35], [59, 126, 46, 63, 121, 111, 116, 110, 58, 45, 123, 109, 54, 117, 104, 40, 60, 122, 94, 24, 57, 88, 113, 48, 56, 49, 36, 101, 37, 114, 62, 61, 27, 124, 119, 125, 127, 19, 118, 83, 22, 78, 21, 12, 30, 55, 17, 115, 14, 80, 47, 50, 81, 76, 85, 52, 86, 120, 100, 108, 89, 95, 28, 53, 43, 91, 16, 33, 51, 42, 92, 97, 23, 25, 31, 38, 79, 32, 90, 44, 105, 15, 73, 112, 82, 9, 102, 29, 71, 87, 93, 74, 41, 7, 10, 96, 84, 69, 18, 34, 11, 107, 20, 75, 106, 26, 103, 3, 6, 99, 98, 67, 39, 5, 72, 0, 8, 2, 66, 65, 1, 70, 64, 13, 77, 35, 68, 4], [59, 126, 46, 63, 121, 111, 116, 110, 58, 123, 45, 109, 104, 54, 60, 40, 117, 114, 94, 122, 57, 48, 127, 37, 36, 61, 113, 49, 47, 56, 62, 101, 50, 21, 24, 88, 12, 27, 119, 125, 124, 22, 118, 78, 19, 14, 76, 83, 30, 52, 80, 97, 53, 95, 17, 28, 81, 55, 16, 120, 86, 115, 89, 100, 51, 73, 85, 9, 108, 112, 92, 43, 7, 25, 90, 71, 33, 91, 102, 10, 15, 87, 44, 79, 23, 107, 32, 31, 69, 29, 82, 42, 38, 18, 20, 64, 96, 6, 74, 11, 0, 106, 34, 93, 84, 75, 26, 41, 5, 66, 99, 67, 105, 2, 70, 1, 3, 98, 72, 39, 103, 65, 68, 4, 35, 13, 8, 77], [59, 126, 46, 121, 63, 111, 116, 110, 58, 123, 45, 109, 117, 104, 54, 40, 60, 114, 113, 94, 122, 61, 56, 57, 36, 49, 37, 88, 24, 101, 48, 125, 127, 118, 62, 119, 27, 19, 21, 124, 30, 78, 22, 12, 83, 17, 47, 55, 50, 52, 14, 16, 115, 76, 89, 80, 100, 51, 95, 85, 86, 53, 81, 97, 91, 28, 120, 108, 102, 33, 92, 43, 23, 107, 31, 9, 38, 42, 25, 32, 7, 90, 44, 41, 71, 112, 73, 93, 29, 15, 11, 87, 105, 74, 82, 10, 106, 18, 20, 79, 84, 34, 96, 69, 6, 98, 5, 99, 75, 0, 3, 26, 72, 70, 66, 39, 67, 64, 103, 2, 35, 65, 68, 4, 8, 1, 13, 77], [59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 45, 109, 117, 54, 60, 104, 40, 127, 56, 57, 61, 122, 94, 48, 37, 36, 114, 113, 118, 88, 124, 101, 49, 24, 62, 83, 30, 27, 19, 78, 52, 115, 119, 21, 12, 47, 125, 22, 81, 55, 100, 120, 17, 16, 95, 97, 50, 85, 80, 89, 14, 86, 43, 53, 76, 51, 102, 108, 91, 33, 107, 112, 90, 42, 28, 41, 32, 73, 23, 7, 9, 71, 31, 25, 10, 74, 29, 92, 96, 105, 34, 15, 44, 11, 106, 87, 18, 75, 20, 82, 98, 79, 69, 99, 93, 38, 67, 0, 84, 64, 3, 26, 5, 6, 70, 103, 66, 72, 2, 68, 35, 4, 65, 1, 8, 13, 39, 77], [59, 126, 46, 63, 121, 116, 111, 110, 58, 45, 123, 109, 104, 117, 60, 40, 54, 122, 49, 56, 24, 101, 57, 114, 94, 37, 88, 61, 27, 125, 48, 36, 127, 83, 30, 19, 113, 22, 62, 47, 85, 78, 21, 119, 118, 120, 12, 55, 52, 81, 16, 100, 14, 124, 86, 43, 17, 95, 76, 80, 89, 115, 97, 50, 33, 42, 9, 108, 51, 91, 28, 53, 112, 90, 31, 32, 23, 92, 73, 25, 41, 71, 44, 15, 107, 87, 74, 102, 106, 82, 20, 79, 38, 75, 7, 29, 93, 105, 96, 10, 34, 98, 18, 11, 84, 26, 72, 69, 99, 70, 5, 0, 3, 103, 64, 35, 67, 2, 4, 68, 39, 65, 66, 1, 77, 13, 6, 8], [59, 126, 46, 63, 121, 111, 116, 110, 58, 45, 123, 109, 104, 117, 60, 54, 40, 94, 56, 57, 127, 114, 101, 24, 122, 113, 36, 37, 118, 88, 119, 49, 27, 19, 48, 62, 125, 83, 21, 22, 61, 78, 12, 30, 55, 124, 47, 52, 86, 14, 95, 50, 120, 100, 115, 17, 81, 80, 85, 16, 97, 42, 76, 89, 33, 28, 91, 43, 44, 23, 53, 112, 31, 102, 51, 108, 9, 90, 32, 38, 25, 92, 20, 41, 107, 73, 82, 71, 106, 87, 18, 93, 105, 7, 79, 15, 84, 29, 74, 11, 96, 34, 26, 10, 98, 64, 69, 5, 67, 75, 103, 70, 2, 0, 66, 72, 99, 3, 35, 77, 1, 39, 6, 8, 65, 4, 68, 13], [59, 126, 46, 63, 121, 111, 116, 110, 58, 45, 123, 109, 117, 49, 104, 56, 36, 122, 57, 94, 118, 61, 54, 119, 114, 40, 60, 101, 127, 88, 125, 113, 37, 48, 24, 83, 27, 52, 62, 124, 30, 120, 19, 85, 115, 43, 55, 47, 50, 22, 17, 21, 14, 102, 51, 53, 81, 95, 108, 12, 100, 78, 33, 76, 38, 80, 112, 16, 86, 42, 97, 89, 32, 41, 91, 28, 107, 31, 105, 93, 25, 92, 106, 23, 90, 87, 82, 29, 73, 7, 96, 71, 99, 18, 44, 69, 20, 34, 0, 84, 9, 79, 11, 26, 10, 103, 70, 67, 74, 66, 98, 5, 64, 3, 35, 15, 75, 2, 39, 1, 68, 6, 8, 65, 77, 72, 4, 13], [59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 45, 109, 117, 104, 54, 40, 60, 114, 56, 94, 57, 49, 127, 101, 37, 36, 122, 24, 118, 113, 61, 88, 27, 22, 48, 125, 119, 124, 115, 30, 19, 83, 62, 52, 21, 14, 95, 47, 120, 81, 100, 53, 89, 12, 51, 17, 97, 50, 78, 55, 108, 33, 76, 86, 28, 85, 102, 16, 43, 91, 112, 80, 90, 25, 42, 31, 73, 92, 23, 93, 107, 106, 105, 41, 38, 44, 32, 9, 87, 96, 29, 20, 7, 82, 18, 34, 79, 10, 11, 5, 71, 98, 69, 15, 74, 84, 26, 70, 99, 103, 75, 64, 2, 0, 66, 72, 67, 6, 35, 3, 39, 8, 77, 65, 68, 4, 1, 13], [59, 126, 46, 63, 121, 111, 116, 110, 58, 123, 45, 109, 117, 104, 60, 54, 40, 56, 113, 114, 48, 94, 122, 118, 57, 49, 125, 36, 127, 124, 24, 88, 37, 61, 101, 62, 19, 52, 22, 27, 83, 119, 47, 21, 30, 78, 55, 12, 95, 81, 115, 76, 16, 14, 120, 100, 17, 89, 86, 108, 80, 97, 85, 50, 33, 51, 28, 42, 91, 53, 102, 9, 112, 90, 43, 23, 105, 41, 29, 32, 20, 7, 107, 25, 73, 92, 31, 106, 71, 38, 93, 74, 87, 18, 79, 10, 5, 44, 96, 82, 69, 75, 11, 98, 70, 64, 3, 34, 84, 66, 15, 26, 6, 67, 0, 2, 68, 99, 103, 39, 72, 77, 8, 4, 13, 65, 1, 35], [59, 126, 46, 63, 121, 111, 116, 110, 58, 123, 45, 109, 117, 104, 113, 94, 60, 40, 54, 114, 56, 118, 48, 125, 122, 57, 88, 49, 101, 37, 36, 24, 119, 61, 52, 124, 127, 62, 21, 27, 83, 30, 19, 22, 100, 33, 47, 43, 115, 55, 78, 50, 81, 17, 12, 95, 108, 85, 80, 120, 89, 53, 14, 102, 16, 97, 86, 91, 76, 28, 106, 90, 32, 51, 42, 112, 38, 31, 92, 107, 87, 23, 105, 9, 93, 7, 71, 96, 25, 20, 74, 11, 82, 29, 18, 73, 79, 34, 10, 41, 15, 99, 75, 26, 67, 6, 103, 0, 2, 84, 98, 69, 64, 44, 3, 5, 66, 35, 8, 72, 39, 70, 1, 77, 65, 68, 13, 4], [59, 126, 46, 63, 121, 116, 110, 111, 58, 45, 123, 109, 104, 40, 54, 117, 60, 56, 94, 113, 125, 122, 61, 37, 118, 57, 48, 24, 101, 119, 36, 88, 62, 49, 114, 22, 30, 127, 124, 83, 19, 100, 52, 27, 85, 78, 21, 33, 47, 115, 12, 43, 17, 91, 95, 120, 108, 50, 16, 97, 14, 76, 80, 28, 89, 53, 55, 81, 102, 86, 51, 32, 42, 106, 112, 38, 92, 23, 31, 105, 87, 107, 9, 96, 93, 90, 41, 73, 25, 71, 7, 79, 20, 34, 26, 74, 18, 29, 44, 10, 82, 99, 84, 15, 11, 75, 6, 5, 64, 0, 103, 69, 98, 67, 2, 1, 39, 66, 8, 72, 4, 3, 70, 13, 65, 77, 35, 68], [59, 126, 63, 46, 121, 111, 116, 110, 58, 123, 45, 109, 104, 117, 56, 54, 60, 40, 113, 94, 36, 49, 57, 48, 122, 118, 125, 24, 114, 62, 37, 127, 101, 119, 88, 61, 22, 83, 124, 76, 47, 30, 27, 120, 52, 19, 97, 21, 95, 78, 100, 33, 85, 115, 80, 28, 14, 81, 55, 91, 16, 17, 50, 53, 86, 43, 12, 102, 108, 89, 73, 42, 23, 51, 107, 92, 31, 25, 90, 7, 32, 9, 112, 44, 93, 105, 41, 71, 79, 87, 38, 10, 29, 96, 82, 20, 74, 18, 26, 5, 6, 34, 84, 99, 15, 69, 67, 75, 103, 106, 11, 3, 66, 0, 98, 8, 2, 64, 70, 39, 68, 4, 65, 72, 1, 13, 35, 77], [59, 126, 46, 63, 121, 111, 116, 110, 58, 123, 45, 109, 104, 54, 117, 40, 94, 56, 62, 122, 57, 114, 60, 125, 101, 36, 48, 118, 24, 113, 37, 61, 88, 49, 127, 119, 21, 124, 83, 27, 19, 22, 86, 30, 52, 100, 120, 81, 115, 47, 78, 14, 17, 76, 85, 50, 91, 33, 55, 42, 32, 89, 102, 16, 53, 43, 95, 12, 51, 80, 97, 108, 41, 107, 92, 23, 28, 112, 90, 87, 25, 31, 73, 7, 38, 71, 29, 93, 20, 82, 74, 9, 106, 96, 105, 44, 75, 26, 18, 99, 34, 79, 10, 84, 15, 5, 69, 3, 98, 6, 11, 8, 64, 39, 70, 103, 66, 67, 0, 2, 35, 68, 1, 13, 65, 77, 4, 72], [59, 126, 46, 63, 121, 111, 116, 110, 58, 45, 123, 109, 104, 117, 40, 54, 94, 56, 57, 125, 60, 122, 113, 36, 24, 101, 62, 37, 88, 114, 49, 48, 119, 19, 118, 127, 22, 27, 83, 21, 86, 47, 100, 30, 124, 61, 95, 52, 115, 33, 17, 43, 78, 76, 55, 120, 85, 14, 81, 80, 12, 91, 97, 16, 108, 89, 42, 53, 102, 28, 51, 92, 41, 50, 90, 23, 20, 31, 9, 25, 32, 38, 87, 107, 7, 29, 105, 112, 82, 71, 11, 106, 79, 44, 73, 93, 96, 74, 18, 10, 69, 15, 84, 98, 5, 34, 75, 26, 2, 99, 0, 70, 64, 103, 6, 39, 67, 8, 66, 13, 3, 35, 72, 77, 68, 1, 65, 4], [59, 126, 46, 121, 63, 111, 116, 110, 58, 45, 123, 109, 104, 54, 40, 125, 117, 113, 122, 94, 56, 57, 62, 101, 37, 119, 60, 36, 88, 24, 61, 114, 118, 49, 48, 22, 127, 19, 21, 47, 30, 27, 124, 83, 52, 55, 115, 43, 17, 33, 50, 100, 86, 78, 12, 108, 95, 28, 76, 14, 80, 51, 85, 97, 89, 53, 91, 81, 120, 42, 16, 112, 102, 90, 23, 38, 31, 106, 92, 87, 32, 71, 93, 9, 41, 29, 7, 73, 79, 105, 25, 44, 74, 84, 18, 75, 96, 34, 20, 82, 5, 26, 15, 67, 10, 64, 2, 0, 66, 69, 107, 11, 103, 98, 3, 70, 99, 39, 6, 8, 35, 68, 4, 1, 65, 72, 13, 77], [59, 126, 46, 63, 121, 111, 116, 110, 58, 123, 45, 109, 104, 117, 54, 40, 57, 94, 122, 36, 118, 37, 113, 56, 60, 114, 61, 125, 62, 48, 101, 88, 24, 49, 47, 119, 124, 83, 30, 19, 27, 22, 50, 52, 21, 76, 17, 127, 78, 95, 120, 85, 12, 28, 55, 100, 81, 115, 89, 14, 43, 33, 86, 80, 53, 97, 16, 108, 9, 91, 51, 102, 73, 32, 23, 7, 71, 92, 90, 25, 87, 42, 75, 107, 41, 106, 93, 29, 64, 38, 5, 18, 74, 112, 96, 10, 15, 31, 70, 2, 105, 82, 66, 44, 11, 3, 79, 20, 0, 84, 69, 34, 67, 26, 99, 65, 6, 1, 39, 8, 98, 103, 72, 68, 77, 4, 13, 35], [59, 126, 46, 63, 121, 111, 116, 110, 58, 45, 123, 109, 117, 54, 104, 40, 61, 60, 57, 56, 113, 118, 122, 101, 37, 48, 36, 94, 62, 125, 114, 127, 88, 24, 47, 119, 21, 49, 83, 124, 22, 19, 30, 120, 27, 52, 76, 89, 53, 95, 78, 100, 115, 51, 33, 43, 55, 12, 50, 17, 102, 28, 81, 80, 108, 42, 86, 85, 31, 90, 97, 14, 32, 16, 23, 91, 87, 38, 93, 73, 112, 7, 92, 96, 41, 9, 71, 106, 25, 44, 20, 15, 107, 105, 79, 5, 26, 34, 18, 70, 99, 10, 29, 75, 84, 82, 0, 69, 3, 74, 64, 11, 66, 67, 2, 98, 1, 65, 103, 8, 39, 68, 6, 35, 72, 4, 77, 13], [59, 126, 46, 63, 121, 116, 111, 110, 58, 123, 45, 109, 117, 104, 40, 54, 60, 36, 56, 57, 113, 118, 61, 88, 94, 37, 101, 24, 122, 47, 62, 83, 125, 49, 114, 127, 27, 22, 48, 119, 21, 124, 30, 76, 19, 78, 95, 17, 52, 80, 43, 12, 85, 14, 115, 53, 108, 50, 55, 100, 16, 81, 33, 28, 120, 42, 89, 73, 51, 23, 32, 31, 9, 102, 86, 87, 90, 91, 97, 44, 71, 38, 92, 112, 25, 7, 106, 74, 93, 107, 34, 79, 29, 96, 41, 10, 67, 3, 64, 20, 5, 70, 18, 15, 11, 69, 75, 82, 84, 26, 103, 105, 0, 66, 65, 6, 99, 98, 72, 2, 4, 1, 39, 8, 13, 68, 77, 35], [59, 126, 46, 63, 121, 111, 116, 110, 58, 123, 45, 109, 117, 104, 54, 40, 60, 122, 57, 118, 113, 127, 94, 61, 56, 37, 101, 48, 36, 88, 125, 114, 24, 27, 119, 62, 124, 83, 21, 47, 49, 19, 22, 50, 76, 89, 30, 55, 14, 100, 78, 86, 120, 12, 81, 85, 42, 52, 115, 95, 53, 80, 33, 17, 16, 108, 51, 87, 112, 32, 102, 28, 97, 90, 43, 31, 23, 9, 107, 91, 92, 38, 71, 73, 41, 7, 20, 79, 18, 105, 15, 82, 74, 44, 25, 96, 11, 75, 10, 26, 34, 5, 29, 93, 99, 64, 106, 3, 98, 70, 69, 103, 84, 67, 2, 0, 66, 6, 8, 1, 72, 65, 39, 4, 13, 68, 35, 77], [59, 126, 46, 63, 121, 111, 116, 110, 58, 45, 123, 109, 117, 104, 54, 60, 40, 94, 122, 113, 127, 57, 88, 56, 36, 61, 37, 125, 48, 27, 24, 118, 101, 76, 62, 114, 22, 120, 83, 124, 119, 19, 49, 80, 30, 47, 85, 115, 14, 12, 50, 86, 55, 21, 52, 81, 17, 100, 78, 89, 95, 16, 28, 33, 108, 97, 91, 51, 42, 9, 73, 53, 90, 32, 43, 102, 87, 23, 112, 7, 71, 31, 92, 79, 107, 82, 74, 38, 44, 15, 41, 29, 10, 25, 20, 93, 69, 75, 18, 84, 3, 26, 11, 96, 105, 106, 5, 99, 34, 6, 70, 98, 0, 39, 67, 72, 64, 2, 66, 65, 1, 4, 103, 13, 8, 35, 68, 77]], "model.layers.12.self_attn.q_proj": [[102, 127, 49, 95, 21, 24, 16, 83, 80, 82, 88, 77, 28, 6, 51, 99, 44, 37, 38, 72, 2, 31, 74, 27, 106, 22, 75, 86, 13, 53, 42, 108, 26, 98, 112, 10, 93, 96, 18, 43, 62, 20, 94, 103, 91, 79, 4, 117, 90, 61, 85, 78, 29, 101, 104, 14, 40, 19, 25, 81, 11, 32, 92, 64, 89, 60, 17, 30, 36, 56, 12, 109, 125, 9, 113, 48, 111, 52, 116, 70, 87, 41, 115, 105, 84, 71, 76, 33, 15, 55, 114, 65, 8, 73, 118, 100, 45, 97, 54, 107, 34, 68, 39, 69, 0, 123, 35, 5, 66, 7, 110, 67, 58, 122, 126, 1, 47, 121, 59, 23, 120, 50, 57, 46, 119, 63, 124, 3], [127, 102, 49, 95, 51, 108, 24, 31, 26, 38, 98, 19, 83, 21, 96, 61, 106, 44, 42, 116, 97, 111, 46, 43, 109, 28, 125, 105, 29, 55, 115, 114, 93, 62, 112, 53, 40, 37, 52, 41, 126, 101, 22, 27, 54, 60, 121, 58, 45, 103, 107, 110, 117, 82, 123, 120, 104, 56, 113, 36, 124, 48, 119, 57, 50, 59, 32, 118, 94, 122, 20, 100, 88, 47, 92, 91, 13, 33, 99, 39, 84, 34, 86, 90, 85, 63, 15, 25, 77, 17, 30, 35, 79, 18, 80, 87, 89, 23, 81, 16, 14, 6, 78, 72, 12, 70, 76, 74, 64, 8, 4, 75, 11, 10, 73, 71, 9, 65, 2, 68, 66, 67, 1, 7, 0, 69, 3, 5], [102, 49, 127, 95, 21, 108, 82, 88, 77, 83, 24, 51, 96, 44, 86, 98, 99, 80, 113, 37, 28, 27, 6, 16, 31, 74, 117, 26, 42, 90, 72, 2, 62, 19, 46, 41, 53, 105, 116, 79, 8, 94, 22, 119, 25, 18, 121, 112, 85, 20, 61, 14, 38, 101, 109, 65, 29, 93, 68, 92, 12, 104, 36, 76, 43, 81, 115, 64, 10, 55, 125, 111, 56, 23, 87, 45, 91, 40, 126, 58, 107, 120, 13, 100, 59, 50, 52, 60, 97, 122, 39, 30, 4, 32, 110, 54, 71, 103, 78, 123, 34, 118, 17, 57, 33, 114, 106, 124, 11, 47, 35, 84, 63, 69, 89, 3, 48, 70, 15, 75, 7, 5, 73, 9, 0, 67, 1, 66], [102, 49, 127, 95, 21, 24, 83, 28, 82, 80, 77, 11, 99, 38, 88, 31, 96, 20, 74, 90, 42, 53, 51, 26, 113, 35, 86, 60, 105, 14, 70, 6, 68, 108, 44, 15, 22, 18, 16, 75, 98, 101, 97, 56, 79, 58, 27, 37, 62, 2, 114, 29, 85, 25, 103, 81, 19, 8, 43, 106, 110, 112, 87, 92, 9, 116, 61, 117, 5, 12, 67, 89, 10, 17, 34, 109, 30, 120, 111, 55, 13, 78, 73, 76, 36, 3, 94, 48, 46, 0, 121, 124, 119, 23, 115, 93, 39, 91, 107, 50, 122, 71, 100, 41, 123, 72, 33, 126, 65, 59, 1, 57, 125, 47, 52, 69, 104, 45, 118, 54, 32, 40, 7, 84, 64, 4, 63, 66], [45, 101, 109, 26, 28, 21, 24, 83, 88, 62, 31, 78, 95, 97, 72, 82, 124, 75, 87, 113, 4, 122, 30, 49, 44, 121, 53, 93, 94, 27, 98, 51, 16, 116, 85, 80, 110, 77, 66, 57, 111, 69, 102, 0, 18, 25, 86, 105, 106, 14, 63, 92, 68, 36, 115, 119, 100, 107, 118, 19, 114, 23, 76, 35, 34, 79, 61, 81, 15, 59, 41, 33, 47, 103, 9, 52, 91, 60, 112, 46, 120, 1, 8, 117, 39, 50, 13, 67, 104, 125, 48, 127, 10, 89, 40, 58, 32, 70, 71, 22, 56, 90, 20, 38, 11, 123, 54, 55, 42, 17, 29, 43, 108, 12, 37, 126, 99, 84, 96, 74, 6, 2, 73, 5, 7, 65, 3, 64], [45, 101, 109, 28, 62, 21, 26, 24, 31, 88, 83, 87, 97, 78, 122, 124, 75, 95, 16, 107, 53, 72, 113, 82, 98, 49, 116, 94, 17, 25, 91, 85, 47, 93, 121, 76, 102, 30, 77, 86, 34, 57, 106, 51, 59, 6, 103, 112, 22, 108, 44, 117, 36, 84, 29, 33, 10, 61, 52, 74, 63, 39, 118, 60, 27, 43, 12, 13, 18, 71, 123, 96, 48, 114, 23, 110, 105, 79, 41, 67, 92, 115, 42, 81, 111, 56, 66, 80, 69, 8, 58, 55, 50, 127, 14, 125, 0, 126, 120, 100, 35, 20, 119, 40, 11, 104, 54, 89, 68, 4, 46, 19, 32, 99, 38, 70, 37, 5, 7, 90, 73, 9, 1, 15, 2, 65, 3, 64], [45, 101, 109, 28, 26, 122, 21, 24, 83, 87, 88, 31, 97, 95, 78, 121, 62, 124, 113, 30, 75, 51, 53, 82, 116, 93, 86, 16, 110, 102, 98, 25, 36, 63, 106, 72, 59, 34, 79, 49, 112, 57, 35, 107, 47, 111, 38, 44, 52, 58, 105, 29, 39, 100, 27, 118, 61, 125, 123, 114, 115, 55, 41, 40, 48, 37, 119, 15, 92, 94, 50, 85, 6, 13, 46, 60, 103, 108, 127, 33, 99, 126, 74, 42, 12, 32, 104, 54, 20, 0, 84, 22, 14, 117, 56, 43, 120, 80, 18, 19, 90, 91, 23, 96, 8, 11, 68, 17, 89, 81, 69, 77, 76, 10, 67, 66, 9, 5, 70, 71, 73, 3, 1, 7, 4, 65, 2, 64], [45, 101, 109, 26, 28, 24, 88, 83, 21, 82, 93, 87, 121, 62, 95, 75, 78, 31, 124, 122, 16, 97, 98, 116, 30, 113, 34, 51, 53, 57, 77, 110, 49, 63, 107, 61, 125, 115, 18, 112, 41, 105, 22, 106, 94, 103, 71, 36, 111, 92, 86, 11, 118, 29, 37, 85, 40, 59, 9, 27, 67, 47, 50, 4, 127, 33, 100, 108, 44, 19, 117, 123, 32, 35, 126, 1, 60, 0, 48, 80, 69, 46, 23, 42, 84, 120, 55, 6, 79, 52, 91, 20, 81, 14, 72, 58, 68, 54, 5, 56, 8, 114, 13, 43, 66, 74, 102, 39, 90, 99, 15, 12, 119, 104, 76, 96, 17, 65, 25, 38, 89, 2, 73, 10, 70, 7, 3, 64], [43, 97, 107, 24, 91, 26, 31, 22, 83, 81, 100, 85, 103, 75, 33, 87, 30, 20, 50, 15, 88, 110, 13, 7, 114, 90, 49, 127, 95, 66, 93, 86, 84, 12, 94, 19, 102, 92, 98, 61, 113, 48, 70, 18, 35, 0, 115, 40, 118, 25, 38, 54, 1, 63, 36, 16, 80, 28, 117, 56, 58, 27, 101, 77, 42, 51, 99, 82, 32, 34, 120, 29, 55, 125, 71, 96, 4, 89, 21, 79, 17, 6, 39, 47, 112, 41, 111, 116, 8, 119, 121, 123, 23, 106, 45, 62, 52, 3, 122, 53, 46, 11, 57, 37, 105, 73, 67, 10, 9, 109, 74, 126, 108, 78, 44, 104, 14, 76, 60, 59, 124, 72, 69, 68, 5, 2, 64, 65], [43, 31, 97, 107, 22, 13, 24, 26, 15, 91, 83, 20, 85, 33, 9, 115, 100, 110, 75, 73, 18, 81, 86, 84, 114, 50, 90, 70, 68, 54, 36, 95, 72, 25, 17, 79, 27, 77, 88, 63, 121, 19, 71, 42, 111, 61, 12, 127, 76, 23, 82, 93, 74, 87, 16, 66, 49, 65, 8, 38, 122, 46, 48, 10, 7, 103, 112, 14, 101, 47, 78, 117, 69, 34, 113, 67, 55, 118, 80, 29, 21, 116, 126, 123, 92, 124, 56, 89, 28, 96, 106, 6, 1, 108, 120, 40, 102, 94, 30, 98, 11, 105, 45, 35, 125, 51, 39, 32, 5, 62, 58, 41, 4, 37, 57, 60, 64, 59, 52, 99, 44, 109, 119, 104, 0, 53, 3, 2], [43, 107, 31, 97, 26, 106, 42, 20, 113, 61, 115, 125, 50, 33, 24, 48, 127, 117, 11, 38, 114, 121, 36, 100, 49, 126, 95, 54, 63, 116, 110, 111, 56, 57, 118, 55, 123, 46, 47, 60, 58, 91, 103, 77, 108, 39, 119, 109, 101, 6, 51, 124, 45, 44, 59, 122, 7, 120, 62, 35, 85, 112, 40, 53, 41, 52, 30, 73, 90, 92, 105, 22, 37, 17, 81, 104, 16, 10, 99, 96, 3, 5, 84, 75, 1, 32, 21, 98, 79, 34, 19, 102, 93, 29, 87, 94, 4, 72, 28, 23, 25, 88, 89, 27, 0, 83, 82, 86, 18, 78, 80, 2, 71, 15, 9, 14, 12, 76, 13, 66, 8, 64, 70, 68, 65, 67, 69, 74], [43, 97, 31, 107, 24, 26, 91, 22, 54, 81, 42, 20, 115, 83, 75, 33, 13, 50, 100, 15, 18, 85, 48, 61, 84, 120, 66, 114, 110, 93, 90, 6, 57, 70, 127, 30, 113, 12, 49, 63, 103, 118, 95, 80, 19, 125, 121, 88, 86, 56, 9, 116, 77, 47, 0, 46, 1, 117, 71, 60, 36, 28, 41, 111, 45, 82, 112, 55, 123, 108, 106, 124, 89, 126, 92, 58, 14, 8, 38, 51, 29, 40, 76, 11, 109, 87, 23, 119, 122, 94, 34, 17, 73, 27, 79, 64, 25, 101, 99, 67, 44, 3, 39, 35, 68, 21, 32, 98, 52, 37, 96, 105, 53, 65, 7, 16, 69, 62, 78, 104, 74, 102, 59, 10, 72, 5, 4, 2], [38, 64, 113, 97, 93, 49, 57, 1, 6, 78, 67, 16, 70, 3, 82, 73, 65, 11, 4, 77, 23, 12, 90, 2, 31, 14, 72, 19, 8, 94, 121, 126, 66, 75, 89, 62, 71, 59, 20, 13, 84, 9, 122, 86, 18, 103, 22, 87, 37, 83, 125, 53, 69, 7, 21, 80, 118, 27, 42, 39, 54, 10, 101, 44, 76, 33, 5, 91, 58, 88, 74, 26, 17, 29, 40, 0, 30, 68, 32, 119, 79, 25, 50, 63, 96, 85, 45, 35, 15, 60, 99, 36, 95, 108, 123, 92, 81, 124, 48, 109, 100, 116, 120, 61, 111, 114, 51, 24, 115, 106, 46, 56, 117, 127, 52, 47, 102, 34, 104, 98, 28, 43, 55, 41, 105, 110, 112, 107], [38, 57, 113, 97, 49, 93, 23, 82, 78, 16, 73, 88, 11, 21, 12, 20, 25, 102, 6, 29, 5, 39, 94, 77, 72, 9, 26, 83, 37, 31, 32, 68, 27, 13, 125, 28, 90, 30, 92, 63, 71, 121, 106, 85, 19, 34, 101, 44, 22, 91, 62, 8, 65, 15, 80, 4, 74, 75, 81, 84, 59, 79, 87, 14, 17, 76, 18, 24, 122, 66, 7, 89, 33, 105, 10, 2, 95, 109, 119, 3, 108, 60, 36, 96, 42, 86, 45, 35, 98, 41, 70, 54, 100, 126, 52, 40, 99, 55, 50, 47, 51, 58, 56, 69, 53, 103, 118, 120, 61, 107, 115, 123, 112, 43, 46, 116, 104, 114, 48, 110, 127, 64, 111, 124, 67, 1, 117, 0], [113, 101, 57, 38, 62, 49, 97, 123, 126, 121, 119, 53, 52, 56, 102, 59, 60, 114, 127, 118, 125, 26, 116, 117, 51, 106, 58, 46, 100, 103, 124, 63, 44, 61, 31, 48, 111, 45, 50, 112, 120, 47, 90, 115, 109, 54, 122, 55, 110, 107, 108, 105, 40, 22, 41, 39, 32, 21, 43, 84, 104, 42, 28, 93, 94, 37, 99, 25, 36, 81, 95, 34, 30, 89, 86, 35, 85, 17, 98, 23, 19, 27, 88, 83, 96, 91, 29, 20, 76, 24, 33, 92, 79, 87, 4, 74, 75, 8, 14, 82, 9, 5, 72, 80, 13, 70, 15, 12, 18, 10, 71, 2, 73, 77, 66, 7, 16, 6, 67, 69, 65, 64, 68, 1, 11, 78, 3, 0], [38, 57, 97, 23, 93, 49, 113, 82, 29, 78, 16, 25, 11, 102, 59, 31, 62, 90, 85, 33, 87, 26, 21, 39, 83, 126, 77, 19, 27, 71, 88, 80, 18, 94, 13, 125, 101, 95, 89, 28, 15, 20, 22, 121, 79, 96, 30, 100, 75, 32, 92, 91, 119, 61, 37, 84, 68, 14, 86, 53, 98, 127, 81, 24, 35, 58, 47, 17, 44, 36, 8, 108, 34, 76, 63, 10, 99, 50, 43, 115, 40, 118, 48, 110, 12, 103, 74, 55, 106, 7, 72, 111, 114, 60, 109, 122, 123, 51, 46, 54, 42, 4, 41, 116, 56, 104, 66, 112, 117, 52, 105, 5, 2, 120, 124, 70, 69, 6, 45, 107, 67, 73, 9, 1, 0, 3, 65, 64], [39, 48, 62, 112, 29, 21, 14, 80, 61, 81, 76, 11, 67, 87, 73, 7, 56, 84, 5, 93, 24, 27, 69, 71, 75, 1, 109, 12, 0, 25, 41, 2, 96, 118, 6, 89, 116, 17, 9, 79, 117, 85, 66, 52, 4, 13, 16, 86, 54, 94, 32, 82, 101, 18, 78, 50, 95, 113, 119, 15, 19, 120, 34, 126, 83, 57, 23, 77, 26, 49, 106, 31, 45, 127, 55, 42, 107, 122, 36, 111, 28, 88, 44, 3, 22, 104, 40, 63, 97, 115, 121, 102, 35, 59, 33, 72, 108, 30, 114, 125, 60, 92, 124, 105, 8, 123, 58, 99, 110, 74, 53, 43, 51, 37, 100, 90, 38, 46, 98, 20, 47, 65, 91, 70, 10, 68, 103, 64], [39, 62, 48, 112, 87, 61, 29, 21, 81, 14, 73, 80, 11, 76, 69, 56, 24, 93, 41, 71, 77, 79, 118, 3, 86, 27, 111, 18, 9, 96, 32, 94, 109, 127, 125, 50, 101, 126, 2, 92, 37, 70, 95, 33, 6, 54, 117, 82, 85, 121, 5, 36, 28, 1, 66, 113, 22, 78, 0, 123, 74, 20, 17, 97, 91, 49, 116, 59, 120, 31, 122, 89, 99, 107, 72, 23, 119, 52, 16, 45, 84, 115, 108, 51, 43, 55, 38, 53, 40, 106, 60, 124, 12, 57, 13, 104, 19, 114, 83, 34, 102, 30, 110, 75, 35, 15, 88, 42, 58, 44, 67, 10, 100, 105, 46, 63, 26, 98, 25, 47, 90, 8, 65, 7, 68, 103, 64, 4], [39, 62, 48, 112, 87, 29, 61, 11, 25, 80, 81, 21, 14, 73, 69, 76, 5, 93, 66, 2, 71, 0, 77, 18, 67, 118, 27, 3, 1, 24, 94, 33, 117, 72, 109, 75, 125, 111, 86, 56, 92, 32, 96, 31, 41, 82, 74, 106, 50, 7, 116, 95, 37, 79, 113, 85, 28, 30, 19, 97, 70, 89, 83, 36, 122, 63, 78, 17, 123, 121, 126, 34, 120, 43, 88, 16, 38, 35, 54, 6, 12, 40, 59, 65, 26, 45, 91, 10, 9, 90, 23, 99, 127, 84, 51, 104, 52, 46, 114, 58, 98, 119, 49, 102, 20, 103, 100, 44, 124, 110, 108, 57, 101, 13, 115, 8, 4, 68, 55, 53, 15, 42, 105, 107, 60, 47, 22, 64], [39, 62, 48, 112, 71, 29, 14, 80, 73, 81, 21, 76, 11, 3, 69, 2, 61, 65, 1, 67, 56, 109, 118, 0, 72, 64, 7, 4, 13, 24, 12, 85, 8, 37, 79, 117, 17, 96, 111, 121, 87, 77, 66, 78, 93, 95, 15, 84, 5, 16, 54, 126, 75, 68, 50, 19, 83, 99, 33, 120, 53, 123, 94, 10, 116, 124, 88, 9, 103, 18, 101, 127, 106, 41, 92, 86, 28, 89, 82, 74, 38, 35, 40, 31, 57, 27, 30, 51, 20, 63, 26, 52, 70, 45, 115, 43, 36, 22, 97, 91, 107, 119, 32, 125, 34, 102, 6, 104, 105, 23, 25, 60, 110, 113, 46, 122, 100, 90, 44, 59, 98, 49, 42, 55, 108, 114, 47, 58], [105, 58, 34, 109, 26, 41, 86, 104, 88, 18, 84, 126, 24, 30, 55, 78, 92, 16, 73, 77, 36, 69, 19, 32, 49, 87, 71, 95, 91, 75, 119, 67, 121, 79, 111, 28, 45, 48, 1, 44, 2, 120, 51, 5, 94, 15, 29, 98, 114, 31, 12, 25, 40, 50, 57, 82, 112, 60, 33, 113, 64, 124, 107, 14, 125, 74, 10, 8, 123, 63, 108, 13, 37, 127, 101, 66, 62, 0, 22, 115, 81, 6, 39, 38, 118, 122, 65, 61, 72, 100, 3, 59, 47, 93, 7, 52, 90, 4, 23, 102, 46, 11, 56, 97, 42, 106, 21, 43, 76, 103, 70, 17, 99, 83, 68, 116, 9, 110, 20, 80, 54, 117, 27, 35, 53, 85, 89, 96], [105, 34, 26, 41, 58, 86, 36, 121, 84, 32, 104, 16, 126, 92, 62, 18, 51, 88, 118, 24, 78, 47, 57, 25, 52, 50, 49, 119, 116, 48, 109, 28, 102, 30, 90, 100, 114, 107, 95, 120, 23, 39, 46, 122, 55, 21, 61, 75, 113, 31, 38, 103, 127, 42, 37, 43, 83, 63, 11, 60, 80, 20, 76, 97, 112, 117, 14, 77, 108, 87, 125, 56, 45, 111, 7, 101, 22, 93, 96, 94, 19, 82, 123, 106, 115, 9, 59, 40, 33, 29, 91, 8, 124, 110, 98, 27, 44, 74, 72, 35, 85, 99, 79, 53, 81, 6, 89, 12, 54, 73, 71, 17, 15, 68, 13, 5, 10, 66, 4, 3, 70, 65, 67, 1, 64, 69, 2, 0], [105, 34, 58, 84, 26, 41, 86, 126, 77, 109, 16, 104, 73, 121, 32, 88, 18, 78, 75, 71, 30, 36, 49, 70, 108, 98, 48, 3, 29, 4, 51, 81, 24, 55, 120, 118, 11, 114, 50, 62, 111, 122, 100, 39, 2, 60, 94, 45, 102, 82, 80, 85, 52, 69, 115, 101, 37, 79, 116, 57, 65, 119, 44, 28, 125, 90, 0, 23, 19, 113, 92, 43, 38, 20, 112, 61, 83, 63, 13, 21, 124, 14, 22, 12, 95, 17, 68, 127, 117, 6, 123, 87, 46, 31, 103, 9, 99, 59, 47, 96, 76, 15, 27, 40, 67, 74, 106, 97, 25, 72, 42, 1, 35, 56, 93, 110, 53, 33, 54, 10, 89, 107, 91, 5, 8, 7, 64, 66], [105, 34, 92, 26, 41, 84, 18, 86, 109, 104, 16, 24, 126, 121, 78, 75, 77, 57, 71, 101, 58, 60, 73, 51, 36, 52, 120, 90, 118, 122, 107, 55, 62, 48, 49, 66, 45, 32, 95, 44, 30, 119, 108, 116, 14, 111, 102, 28, 29, 37, 63, 88, 50, 125, 46, 47, 7, 124, 114, 113, 106, 20, 127, 112, 40, 94, 5, 100, 87, 25, 123, 11, 31, 4, 42, 98, 82, 56, 38, 43, 17, 39, 19, 115, 103, 54, 21, 59, 91, 33, 8, 110, 35, 6, 97, 85, 83, 61, 80, 81, 117, 76, 27, 22, 12, 93, 53, 23, 96, 15, 79, 89, 99, 72, 2, 65, 64, 0, 68, 13, 69, 9, 3, 74, 1, 10, 70, 67], [104, 34, 20, 22, 124, 89, 92, 79, 95, 49, 18, 75, 119, 25, 77, 62, 48, 17, 114, 40, 120, 107, 6, 118, 46, 121, 117, 58, 54, 111, 70, 96, 93, 50, 45, 61, 53, 47, 9, 100, 84, 59, 43, 109, 68, 94, 15, 41, 110, 12, 82, 86, 19, 11, 90, 72, 55, 81, 66, 122, 98, 52, 1, 60, 123, 33, 99, 23, 126, 8, 76, 4, 56, 102, 64, 103, 3, 2, 112, 88, 37, 16, 113, 97, 73, 27, 80, 38, 78, 21, 32, 13, 24, 30, 28, 87, 91, 108, 29, 106, 105, 101, 44, 26, 35, 74, 0, 83, 85, 31, 51, 67, 10, 63, 57, 127, 42, 36, 39, 125, 115, 116, 7, 14, 5, 71, 69, 65], [104, 34, 89, 22, 49, 20, 124, 95, 18, 79, 48, 62, 119, 77, 25, 40, 118, 92, 75, 107, 31, 121, 46, 120, 114, 54, 9, 41, 6, 17, 111, 16, 82, 117, 58, 12, 15, 97, 84, 60, 38, 56, 93, 45, 115, 81, 53, 112, 101, 28, 76, 2, 86, 94, 68, 103, 110, 63, 98, 36, 55, 47, 70, 4, 96, 72, 61, 90, 85, 52, 88, 100, 127, 102, 73, 21, 113, 1, 123, 78, 50, 109, 59, 126, 24, 10, 0, 64, 91, 26, 23, 13, 87, 11, 19, 32, 99, 105, 5, 125, 108, 3, 66, 80, 30, 106, 57, 37, 67, 35, 33, 39, 43, 29, 69, 51, 83, 122, 8, 7, 27, 116, 74, 42, 14, 44, 71, 65], [104, 34, 22, 124, 89, 18, 92, 20, 48, 79, 62, 49, 119, 77, 95, 118, 25, 75, 40, 114, 17, 96, 46, 12, 9, 82, 121, 50, 111, 54, 58, 61, 117, 47, 84, 41, 120, 100, 4, 56, 107, 6, 110, 13, 59, 45, 98, 38, 15, 86, 81, 94, 52, 37, 55, 53, 108, 43, 35, 68, 63, 1, 122, 78, 112, 90, 101, 97, 126, 109, 33, 93, 87, 71, 70, 76, 19, 28, 60, 7, 72, 88, 123, 125, 39, 51, 23, 73, 31, 115, 127, 74, 26, 113, 105, 103, 2, 8, 21, 80, 91, 99, 14, 11, 3, 30, 85, 42, 116, 106, 16, 24, 10, 29, 102, 36, 44, 27, 32, 66, 57, 65, 67, 83, 64, 0, 5, 69], [104, 34, 119, 89, 22, 92, 18, 49, 20, 124, 95, 62, 48, 79, 77, 46, 25, 40, 114, 75, 121, 118, 120, 109, 41, 17, 54, 97, 12, 50, 111, 93, 9, 58, 16, 98, 82, 31, 112, 84, 4, 59, 47, 100, 117, 107, 80, 45, 61, 33, 53, 103, 108, 56, 38, 94, 21, 60, 32, 43, 24, 86, 90, 123, 55, 52, 127, 81, 126, 122, 113, 76, 96, 51, 110, 19, 35, 15, 63, 99, 105, 115, 28, 102, 85, 13, 87, 39, 44, 88, 125, 91, 37, 83, 27, 6, 101, 26, 106, 29, 78, 42, 23, 30, 1, 0, 2, 73, 72, 116, 36, 68, 69, 74, 57, 10, 70, 5, 7, 14, 71, 11, 8, 67, 65, 3, 66, 64], [41, 34, 53, 88, 93, 105, 20, 17, 79, 22, 35, 77, 72, 29, 75, 48, 25, 84, 115, 6, 66, 91, 18, 27, 82, 125, 24, 51, 55, 58, 62, 127, 126, 50, 8, 89, 63, 61, 108, 68, 2, 81, 15, 124, 13, 38, 0, 4, 16, 117, 80, 43, 114, 70, 52, 11, 19, 57, 103, 44, 60, 78, 100, 47, 90, 59, 76, 73, 98, 42, 109, 46, 26, 49, 83, 23, 86, 85, 40, 31, 112, 92, 33, 96, 28, 12, 10, 113, 14, 116, 87, 45, 21, 107, 39, 118, 101, 37, 122, 121, 64, 102, 97, 9, 36, 94, 104, 106, 119, 30, 123, 110, 7, 32, 74, 95, 99, 56, 69, 120, 71, 54, 5, 1, 111, 3, 65, 67], [41, 34, 53, 22, 105, 20, 88, 93, 48, 17, 29, 79, 50, 77, 44, 126, 62, 127, 27, 108, 35, 115, 58, 84, 80, 60, 96, 117, 43, 38, 125, 72, 112, 114, 75, 31, 49, 57, 124, 86, 24, 101, 21, 15, 118, 55, 61, 54, 123, 52, 63, 99, 26, 121, 90, 91, 8, 51, 37, 82, 103, 30, 39, 56, 28, 106, 122, 87, 6, 92, 18, 25, 47, 119, 81, 113, 40, 36, 120, 111, 70, 32, 102, 83, 110, 46, 16, 95, 33, 107, 85, 45, 23, 100, 89, 94, 97, 66, 104, 42, 71, 116, 12, 19, 73, 59, 109, 13, 78, 0, 14, 98, 10, 11, 7, 76, 9, 74, 67, 5, 68, 4, 2, 3, 69, 64, 65, 1], [41, 34, 53, 105, 22, 93, 88, 20, 27, 66, 91, 0, 79, 77, 17, 68, 127, 75, 19, 82, 72, 58, 112, 2, 67, 43, 124, 115, 48, 126, 44, 64, 35, 62, 70, 18, 65, 61, 29, 1, 60, 59, 6, 49, 38, 71, 51, 57, 84, 80, 50, 125, 3, 63, 52, 9, 96, 16, 69, 108, 109, 45, 55, 118, 8, 7, 31, 47, 15, 114, 110, 23, 5, 119, 101, 120, 99, 113, 117, 76, 46, 24, 73, 40, 12, 11, 37, 103, 4, 122, 25, 123, 74, 107, 10, 100, 116, 39, 32, 106, 87, 83, 26, 78, 13, 90, 85, 42, 56, 33, 102, 86, 81, 121, 54, 14, 97, 36, 28, 98, 104, 21, 111, 92, 89, 95, 94, 30], [41, 34, 53, 22, 88, 105, 17, 20, 58, 91, 93, 27, 29, 127, 115, 77, 126, 43, 75, 79, 61, 51, 48, 63, 35, 124, 108, 62, 112, 114, 55, 38, 57, 31, 81, 84, 44, 50, 113, 70, 123, 49, 82, 15, 25, 122, 59, 66, 52, 45, 40, 86, 39, 60, 118, 32, 106, 100, 119, 107, 116, 72, 80, 109, 101, 24, 0, 110, 125, 42, 117, 47, 37, 18, 96, 103, 46, 120, 11, 56, 19, 26, 102, 16, 54, 104, 121, 13, 99, 111, 23, 33, 83, 90, 98, 36, 92, 6, 78, 28, 21, 67, 68, 73, 95, 30, 97, 94, 87, 10, 76, 89, 12, 14, 85, 9, 71, 74, 4, 5, 8, 1, 69, 3, 2, 7, 65, 64]], "model.layers.12.self_attn.k_proj": [[38, 49, 127, 31, 113, 24, 21, 83, 80, 28, 82, 77, 35, 72, 108, 74, 62, 106, 105, 66, 96, 64, 1, 90, 112, 30, 40, 25, 102, 16, 107, 6, 51, 37, 34, 57, 116, 75, 86, 60, 125, 48, 45, 109, 76, 58, 91, 55, 44, 27, 117, 126, 29, 120, 119, 5, 46, 124, 59, 42, 67, 4, 87, 111, 85, 122, 47, 94, 23, 114, 26, 73, 84, 56, 43, 17, 63, 118, 115, 14, 79, 0, 104, 71, 98, 54, 41, 61, 69, 50, 78, 39, 12, 52, 110, 36, 92, 100, 53, 32, 123, 70, 103, 11, 121, 20, 15, 93, 97, 68, 89, 18, 101, 88, 99, 22, 81, 33, 19, 65, 10, 9, 7, 3, 8, 13, 2, 95], [109, 37, 45, 28, 24, 21, 31, 26, 82, 83, 78, 16, 62, 115, 75, 116, 124, 117, 49, 113, 121, 53, 110, 57, 30, 63, 8, 41, 105, 23, 64, 61, 127, 51, 60, 55, 126, 2, 103, 120, 33, 40, 90, 15, 93, 112, 42, 100, 118, 125, 107, 44, 111, 77, 108, 54, 34, 59, 17, 50, 18, 46, 43, 39, 56, 47, 22, 58, 5, 114, 85, 73, 65, 86, 48, 87, 38, 123, 97, 99, 95, 52, 27, 19, 102, 81, 104, 9, 106, 94, 68, 92, 13, 119, 96, 29, 98, 3, 80, 4, 20, 76, 32, 36, 12, 25, 122, 71, 35, 79, 89, 91, 84, 72, 10, 70, 66, 11, 74, 1, 7, 69, 14, 6, 67, 88, 0, 101], [107, 33, 43, 95, 22, 91, 24, 15, 26, 50, 115, 83, 13, 85, 118, 81, 20, 48, 61, 18, 49, 75, 63, 100, 117, 47, 126, 110, 9, 36, 46, 70, 54, 42, 112, 127, 124, 64, 39, 93, 116, 58, 108, 2, 121, 51, 101, 113, 111, 114, 68, 72, 123, 71, 57, 119, 120, 60, 12, 102, 125, 55, 65, 56, 99, 38, 44, 59, 40, 109, 19, 122, 45, 104, 52, 62, 41, 98, 103, 96, 53, 106, 14, 37, 105, 87, 11, 82, 92, 94, 3, 32, 30, 34, 29, 88, 74, 27, 35, 23, 76, 28, 21, 5, 78, 80, 25, 89, 16, 77, 79, 90, 31, 10, 97, 8, 86, 4, 1, 84, 17, 6, 67, 73, 7, 66, 69, 0], [113, 57, 102, 33, 82, 29, 23, 16, 11, 0, 3, 78, 65, 6, 77, 22, 26, 38, 59, 2, 71, 125, 108, 126, 73, 95, 83, 64, 63, 53, 84, 62, 93, 58, 72, 85, 12, 118, 114, 28, 79, 49, 89, 40, 56, 5, 60, 45, 61, 36, 124, 39, 68, 88, 123, 15, 96, 30, 119, 127, 117, 112, 106, 74, 116, 115, 52, 81, 94, 122, 111, 120, 27, 86, 121, 47, 100, 44, 31, 46, 4, 32, 34, 55, 109, 43, 91, 50, 103, 24, 51, 13, 37, 54, 35, 99, 41, 92, 19, 76, 48, 110, 105, 97, 98, 21, 17, 80, 20, 9, 42, 104, 107, 87, 1, 8, 69, 10, 25, 67, 14, 101, 75, 70, 90, 18, 7, 66], [103, 62, 48, 21, 81, 93, 14, 76, 11, 80, 61, 73, 71, 69, 3, 87, 0, 2, 65, 56, 45, 24, 8, 41, 70, 27, 64, 7, 66, 112, 6, 18, 117, 125, 32, 116, 54, 95, 119, 10, 94, 121, 13, 37, 33, 25, 47, 63, 74, 118, 1, 35, 51, 5, 72, 83, 4, 108, 97, 111, 99, 55, 67, 122, 50, 9, 105, 19, 75, 115, 68, 59, 15, 86, 120, 114, 31, 12, 113, 44, 43, 100, 17, 42, 107, 57, 126, 30, 28, 78, 104, 101, 22, 88, 92, 96, 84, 38, 26, 109, 16, 49, 106, 102, 60, 77, 20, 52, 79, 98, 85, 29, 123, 58, 53, 110, 127, 36, 89, 124, 23, 46, 40, 90, 34, 91, 82, 39], [41, 98, 26, 88, 18, 45, 84, 58, 16, 86, 112, 44, 111, 119, 109, 78, 40, 60, 75, 105, 77, 63, 50, 55, 96, 71, 121, 92, 73, 0, 104, 30, 2, 124, 122, 43, 4, 57, 56, 36, 49, 65, 39, 115, 125, 123, 29, 107, 101, 53, 31, 126, 110, 51, 62, 120, 85, 34, 113, 127, 69, 17, 74, 76, 47, 48, 68, 52, 42, 114, 79, 102, 100, 70, 61, 87, 38, 118, 59, 19, 89, 91, 28, 106, 94, 37, 6, 15, 103, 27, 117, 54, 9, 93, 35, 83, 108, 97, 32, 46, 11, 72, 20, 95, 116, 67, 33, 22, 23, 99, 25, 8, 81, 13, 90, 80, 10, 21, 5, 24, 12, 14, 64, 82, 66, 1, 3, 7], [40, 98, 89, 18, 22, 43, 119, 20, 79, 124, 48, 92, 49, 77, 112, 50, 17, 75, 31, 62, 61, 110, 9, 12, 6, 109, 60, 72, 117, 53, 113, 118, 54, 45, 68, 111, 107, 59, 41, 46, 84, 34, 58, 121, 64, 120, 14, 114, 47, 63, 35, 115, 56, 127, 65, 74, 44, 96, 83, 1, 100, 97, 66, 13, 70, 123, 28, 94, 104, 93, 55, 85, 33, 16, 36, 26, 2, 38, 3, 23, 67, 42, 102, 8, 103, 105, 11, 126, 29, 88, 24, 90, 81, 52, 39, 91, 122, 125, 101, 30, 51, 32, 57, 76, 116, 106, 7, 108, 37, 10, 27, 86, 99, 80, 69, 78, 73, 71, 15, 21, 0, 5, 4, 25, 19, 95, 87, 82], [105, 98, 53, 20, 88, 29, 17, 79, 22, 77, 75, 124, 61, 41, 64, 51, 126, 44, 6, 72, 119, 2, 80, 82, 59, 91, 112, 60, 118, 38, 63, 4, 108, 117, 127, 55, 58, 43, 45, 50, 62, 57, 35, 115, 125, 73, 101, 122, 12, 111, 110, 49, 46, 28, 123, 107, 42, 48, 32, 104, 19, 26, 47, 10, 40, 109, 66, 113, 34, 52, 78, 103, 114, 120, 39, 87, 56, 96, 90, 21, 86, 9, 102, 95, 116, 5, 31, 37, 36, 65, 97, 25, 106, 54, 121, 71, 67, 81, 30, 100, 33, 74, 99, 89, 8, 27, 92, 76, 1, 94, 69, 70, 85, 23, 11, 93, 7, 68, 24, 16, 15, 18, 3, 83, 14, 13, 84, 0]], "model.layers.12.self_attn.qk_proj": [[62, 49, 105, 41, 57, 113, 48, 45, 127, 109, 43, 107, 38, 102, 88, 40, 93, 53, 24, 18, 34, 82, 86, 90, 21, 104, 58, 124, 98, 75, 97, 22, 85, 80, 20, 16, 11, 84, 95, 92, 112, 29, 77, 31, 78, 61, 13, 14, 81, 17, 119, 103, 28, 87, 26, 25, 27, 19, 73, 15, 79, 126, 33, 83, 121, 37, 89, 12, 23, 9, 50, 60, 118, 76, 6, 116, 44, 55, 115, 96, 0, 51, 108, 56, 101, 111, 125, 7, 39, 110, 64, 91, 59, 63, 71, 35, 47, 122, 117, 72, 8, 3, 1, 2, 65, 69, 66, 52, 100, 114, 36, 70, 46, 120, 5, 123, 94, 30, 67, 32, 10, 54, 42, 99, 4, 68, 106, 74], [49, 62, 105, 41, 57, 113, 48, 45, 127, 109, 43, 107, 38, 40, 102, 34, 93, 88, 53, 24, 86, 124, 90, 21, 58, 104, 18, 82, 11, 75, 97, 112, 98, 29, 31, 92, 22, 20, 95, 80, 84, 77, 85, 16, 13, 78, 61, 119, 14, 28, 87, 26, 17, 25, 27, 103, 81, 73, 19, 15, 9, 23, 6, 60, 126, 111, 83, 89, 33, 79, 118, 50, 55, 108, 115, 125, 37, 121, 39, 12, 44, 0, 76, 101, 110, 72, 59, 51, 47, 63, 116, 7, 35, 117, 96, 64, 56, 71, 1, 123, 36, 122, 2, 91, 94, 65, 66, 32, 100, 3, 5, 30, 46, 4, 114, 8, 120, 106, 42, 10, 54, 69, 67, 70, 52, 99, 68, 74], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 102, 93, 40, 53, 88, 34, 24, 104, 58, 90, 86, 124, 112, 95, 21, 98, 20, 22, 18, 82, 31, 29, 97, 92, 61, 84, 85, 75, 80, 11, 119, 77, 16, 78, 103, 26, 28, 27, 13, 87, 126, 14, 33, 37, 17, 25, 23, 81, 115, 121, 44, 110, 118, 51, 50, 73, 59, 35, 60, 19, 125, 6, 116, 55, 15, 108, 117, 83, 89, 79, 9, 39, 101, 76, 12, 111, 30, 63, 0, 36, 47, 56, 96, 72, 7, 91, 65, 122, 114, 66, 100, 3, 46, 71, 70, 64, 120, 123, 52, 1, 94, 32, 5, 106, 4, 69, 2, 42, 54, 67, 10, 68, 99, 8, 74], [62, 49, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 40, 102, 53, 24, 93, 88, 34, 90, 58, 86, 124, 112, 82, 104, 18, 21, 20, 98, 80, 31, 95, 29, 22, 77, 97, 92, 75, 16, 85, 11, 84, 119, 14, 61, 103, 78, 25, 13, 28, 17, 27, 121, 79, 23, 87, 126, 44, 26, 81, 33, 50, 60, 51, 111, 118, 9, 83, 108, 15, 64, 89, 73, 101, 12, 115, 19, 39, 59, 63, 116, 37, 35, 55, 47, 70, 72, 125, 56, 110, 6, 91, 117, 76, 114, 0, 65, 30, 100, 7, 71, 123, 96, 2, 42, 66, 94, 120, 36, 67, 32, 3, 1, 122, 4, 54, 46, 52, 68, 69, 106, 5, 99, 8, 74, 10], [62, 49, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 40, 102, 53, 34, 88, 24, 58, 90, 93, 82, 124, 18, 104, 86, 80, 20, 75, 112, 98, 21, 31, 11, 16, 77, 84, 85, 61, 22, 13, 29, 92, 78, 97, 14, 95, 119, 103, 81, 26, 17, 25, 87, 28, 9, 23, 15, 79, 70, 83, 126, 121, 73, 27, 33, 50, 118, 111, 39, 19, 12, 0, 55, 89, 37, 7, 64, 72, 110, 76, 108, 59, 35, 2, 51, 44, 125, 116, 60, 47, 101, 91, 115, 6, 117, 1, 30, 56, 71, 120, 63, 66, 114, 123, 67, 122, 3, 42, 36, 65, 5, 96, 68, 69, 100, 52, 54, 46, 4, 94, 32, 106, 8, 99, 10, 74], [62, 49, 41, 105, 57, 113, 48, 127, 45, 109, 43, 107, 38, 40, 102, 53, 24, 34, 88, 90, 18, 86, 58, 75, 93, 21, 82, 20, 104, 124, 80, 77, 11, 98, 84, 31, 16, 85, 13, 78, 22, 14, 112, 92, 97, 61, 29, 119, 95, 17, 25, 81, 103, 28, 26, 87, 73, 15, 27, 9, 19, 121, 79, 83, 111, 12, 76, 70, 23, 89, 47, 50, 126, 37, 116, 55, 39, 72, 108, 118, 60, 56, 33, 115, 125, 117, 0, 44, 59, 35, 7, 64, 110, 51, 91, 65, 114, 94, 123, 63, 46, 66, 101, 2, 71, 96, 1, 6, 54, 52, 68, 120, 5, 122, 30, 99, 42, 36, 69, 67, 3, 100, 4, 106, 8, 74, 32, 10], [49, 62, 41, 105, 113, 57, 48, 127, 109, 45, 43, 107, 38, 102, 40, 88, 24, 53, 90, 18, 34, 86, 21, 82, 93, 20, 80, 104, 75, 58, 124, 84, 77, 22, 98, 16, 85, 92, 11, 13, 61, 29, 78, 31, 112, 14, 97, 17, 119, 81, 95, 23, 87, 26, 79, 28, 103, 25, 76, 15, 83, 27, 33, 121, 73, 9, 44, 50, 89, 12, 70, 19, 39, 126, 111, 37, 118, 55, 110, 116, 108, 60, 72, 51, 115, 47, 56, 117, 7, 59, 64, 122, 35, 101, 96, 63, 52, 94, 36, 91, 125, 67, 120, 0, 42, 1, 123, 71, 100, 66, 65, 2, 30, 46, 54, 3, 69, 114, 8, 5, 6, 4, 32, 99, 74, 106, 68, 10], [49, 62, 41, 105, 57, 113, 48, 127, 45, 109, 43, 107, 38, 88, 40, 102, 53, 34, 93, 24, 90, 18, 21, 86, 82, 104, 20, 85, 80, 22, 75, 84, 92, 77, 11, 16, 124, 98, 31, 58, 78, 97, 13, 29, 14, 61, 95, 119, 112, 17, 87, 81, 15, 28, 19, 23, 26, 79, 25, 33, 44, 70, 121, 103, 115, 83, 89, 12, 9, 27, 73, 118, 50, 111, 37, 126, 76, 116, 117, 110, 55, 7, 60, 0, 51, 108, 47, 59, 64, 63, 35, 8, 72, 125, 122, 39, 71, 91, 3, 120, 2, 65, 101, 36, 94, 100, 66, 114, 46, 54, 5, 96, 30, 56, 1, 123, 32, 6, 67, 52, 68, 42, 99, 69, 106, 10, 4, 74], [49, 62, 105, 41, 57, 113, 48, 127, 109, 45, 43, 107, 38, 40, 102, 93, 53, 88, 24, 34, 90, 18, 20, 86, 104, 85, 82, 21, 98, 22, 29, 92, 58, 80, 16, 31, 61, 124, 119, 112, 84, 75, 95, 78, 11, 77, 97, 14, 25, 23, 17, 26, 81, 28, 13, 87, 19, 121, 79, 44, 103, 15, 115, 111, 126, 37, 89, 33, 27, 9, 73, 110, 108, 50, 60, 39, 83, 47, 76, 96, 51, 55, 125, 70, 94, 12, 116, 63, 118, 64, 117, 59, 7, 35, 122, 30, 0, 91, 2, 36, 6, 56, 8, 42, 67, 66, 54, 1, 71, 114, 101, 65, 52, 46, 100, 120, 106, 3, 123, 72, 4, 32, 68, 99, 5, 69, 74, 10], [49, 62, 41, 57, 105, 113, 48, 127, 45, 109, 43, 107, 38, 53, 102, 40, 88, 24, 93, 34, 18, 90, 58, 104, 86, 85, 82, 22, 20, 75, 21, 16, 124, 92, 31, 77, 61, 80, 29, 78, 84, 112, 119, 11, 13, 95, 14, 97, 98, 81, 25, 23, 28, 26, 19, 15, 17, 87, 103, 27, 9, 121, 79, 126, 33, 83, 44, 89, 73, 50, 111, 60, 51, 47, 115, 108, 37, 8, 118, 6, 70, 76, 55, 12, 110, 125, 63, 7, 39, 71, 64, 0, 96, 67, 120, 36, 101, 30, 66, 59, 94, 4, 91, 114, 56, 117, 1, 116, 65, 123, 35, 32, 2, 3, 42, 54, 68, 5, 100, 122, 46, 106, 69, 74, 99, 72, 52, 10], [49, 62, 105, 41, 57, 113, 48, 45, 127, 43, 109, 107, 38, 102, 53, 40, 34, 24, 93, 88, 18, 82, 86, 58, 21, 90, 104, 75, 80, 16, 31, 77, 124, 61, 85, 20, 78, 22, 98, 112, 29, 92, 97, 11, 95, 84, 14, 119, 13, 17, 81, 25, 26, 6, 103, 9, 87, 28, 126, 23, 19, 27, 15, 79, 76, 83, 64, 8, 50, 73, 118, 89, 33, 115, 121, 111, 12, 37, 47, 7, 2, 116, 60, 71, 0, 51, 63, 120, 66, 91, 125, 1, 110, 44, 70, 114, 35, 67, 108, 55, 56, 5, 65, 101, 4, 30, 39, 59, 96, 123, 3, 122, 117, 36, 68, 69, 72, 42, 54, 94, 32, 52, 46, 74, 100, 99, 106, 10], [49, 62, 41, 105, 113, 57, 48, 109, 127, 45, 43, 107, 38, 40, 102, 53, 88, 34, 24, 93, 86, 18, 90, 58, 21, 82, 80, 75, 104, 77, 22, 16, 20, 98, 84, 92, 85, 78, 11, 31, 29, 112, 124, 61, 13, 14, 97, 95, 17, 81, 119, 26, 28, 103, 6, 15, 25, 87, 9, 73, 19, 76, 23, 12, 83, 89, 79, 126, 37, 121, 50, 111, 27, 8, 33, 64, 60, 44, 47, 0, 39, 71, 51, 115, 118, 2, 120, 55, 7, 66, 56, 108, 116, 125, 59, 35, 91, 3, 94, 65, 122, 63, 70, 123, 5, 117, 110, 72, 96, 1, 114, 36, 30, 101, 42, 69, 46, 100, 4, 67, 74, 52, 54, 68, 99, 32, 106, 10], [49, 62, 41, 105, 113, 57, 48, 127, 109, 45, 43, 107, 38, 102, 40, 53, 24, 93, 88, 104, 18, 34, 86, 21, 82, 90, 20, 22, 58, 85, 124, 80, 84, 92, 77, 31, 16, 98, 29, 11, 78, 75, 61, 95, 112, 97, 17, 119, 13, 14, 81, 26, 15, 25, 28, 121, 19, 89, 87, 23, 83, 37, 103, 79, 33, 126, 50, 27, 44, 9, 12, 6, 60, 111, 110, 108, 39, 73, 55, 115, 35, 118, 76, 51, 63, 8, 47, 91, 125, 101, 30, 116, 56, 123, 59, 117, 7, 122, 96, 94, 36, 114, 100, 64, 42, 2, 120, 0, 65, 54, 71, 52, 3, 1, 67, 66, 68, 70, 32, 99, 5, 4, 106, 72, 46, 69, 10, 74], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 53, 40, 34, 102, 24, 88, 104, 18, 82, 93, 21, 90, 75, 20, 77, 11, 86, 80, 58, 124, 98, 78, 22, 13, 84, 85, 31, 61, 112, 14, 16, 97, 92, 29, 95, 17, 119, 81, 25, 6, 15, 9, 19, 126, 83, 79, 87, 27, 89, 121, 26, 28, 73, 23, 103, 8, 50, 60, 118, 111, 0, 108, 33, 44, 7, 115, 64, 76, 12, 110, 125, 66, 51, 55, 47, 71, 2, 116, 70, 37, 56, 39, 91, 1, 117, 101, 30, 123, 63, 36, 65, 68, 35, 122, 96, 5, 114, 120, 59, 67, 4, 54, 72, 52, 94, 69, 42, 100, 3, 46, 32, 74, 99, 10, 106], [49, 62, 41, 105, 113, 57, 48, 127, 45, 109, 43, 107, 38, 53, 102, 40, 34, 93, 88, 24, 104, 18, 82, 90, 58, 86, 21, 20, 75, 31, 98, 124, 80, 77, 16, 11, 78, 84, 29, 92, 95, 112, 85, 61, 22, 97, 13, 14, 81, 17, 119, 9, 103, 28, 25, 121, 126, 26, 23, 15, 83, 33, 73, 27, 79, 44, 87, 89, 19, 37, 50, 110, 108, 60, 6, 118, 12, 111, 116, 51, 8, 115, 47, 0, 55, 39, 56, 70, 76, 35, 71, 30, 63, 7, 101, 91, 125, 96, 123, 66, 120, 59, 64, 117, 65, 54, 94, 114, 100, 72, 36, 32, 46, 122, 5, 52, 69, 106, 2, 1, 74, 67, 4, 3, 42, 99, 68, 10], [49, 62, 105, 41, 113, 57, 48, 109, 43, 127, 45, 107, 38, 102, 40, 53, 34, 93, 58, 24, 18, 88, 82, 104, 21, 86, 90, 20, 75, 98, 124, 61, 16, 80, 22, 85, 78, 84, 11, 31, 77, 92, 17, 95, 29, 112, 13, 97, 14, 81, 121, 119, 103, 15, 26, 87, 9, 126, 73, 27, 23, 44, 25, 79, 111, 83, 19, 12, 60, 76, 70, 28, 50, 115, 89, 33, 37, 59, 108, 64, 118, 110, 7, 0, 39, 116, 51, 123, 47, 125, 66, 55, 67, 35, 8, 101, 6, 120, 71, 117, 63, 2, 91, 56, 1, 30, 54, 96, 72, 65, 52, 114, 3, 36, 4, 122, 94, 42, 69, 5, 100, 32, 46, 68, 74, 10, 99, 106], [49, 62, 41, 105, 57, 113, 45, 48, 43, 127, 109, 107, 38, 40, 53, 102, 34, 88, 93, 18, 24, 90, 82, 104, 86, 22, 21, 20, 29, 75, 61, 80, 98, 85, 58, 16, 84, 95, 92, 77, 31, 11, 97, 112, 124, 78, 17, 14, 81, 13, 28, 119, 79, 15, 87, 23, 25, 26, 73, 103, 126, 83, 19, 50, 121, 108, 9, 33, 70, 89, 27, 111, 37, 51, 60, 118, 39, 76, 44, 12, 115, 117, 116, 91, 101, 110, 55, 30, 125, 63, 120, 47, 72, 7, 0, 114, 123, 54, 71, 56, 122, 59, 2, 1, 35, 42, 36, 5, 100, 65, 96, 64, 4, 32, 46, 94, 66, 8, 3, 68, 6, 52, 67, 99, 10, 69, 74, 106], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 40, 102, 53, 34, 93, 88, 24, 18, 82, 21, 104, 86, 90, 20, 98, 11, 58, 75, 22, 80, 16, 31, 95, 92, 97, 84, 124, 112, 29, 85, 77, 61, 13, 14, 81, 17, 119, 78, 26, 103, 87, 28, 79, 15, 121, 83, 126, 50, 25, 27, 70, 23, 33, 73, 44, 9, 89, 19, 108, 37, 60, 118, 39, 111, 110, 116, 76, 51, 47, 35, 30, 101, 71, 55, 12, 117, 64, 125, 72, 91, 115, 63, 0, 7, 59, 67, 100, 96, 114, 36, 65, 66, 56, 123, 6, 2, 122, 52, 46, 120, 32, 1, 54, 94, 106, 42, 8, 69, 68, 99, 5, 10, 4, 3, 74], [49, 41, 62, 105, 57, 113, 127, 48, 45, 109, 43, 107, 38, 40, 102, 93, 53, 88, 34, 104, 58, 24, 18, 82, 90, 20, 86, 112, 29, 98, 80, 85, 31, 97, 61, 21, 124, 84, 16, 119, 75, 95, 22, 92, 11, 77, 14, 103, 26, 81, 17, 126, 78, 87, 28, 121, 13, 25, 108, 39, 110, 118, 44, 9, 33, 27, 15, 50, 79, 89, 23, 37, 73, 70, 51, 111, 60, 63, 83, 19, 59, 125, 101, 116, 47, 55, 117, 56, 96, 94, 30, 36, 72, 35, 12, 115, 71, 91, 100, 64, 42, 76, 32, 0, 7, 123, 66, 120, 65, 122, 106, 46, 6, 67, 114, 54, 52, 1, 99, 2, 69, 3, 68, 4, 8, 10, 74, 5], [49, 62, 105, 41, 57, 113, 48, 127, 45, 109, 43, 107, 38, 102, 40, 53, 93, 34, 88, 24, 86, 90, 18, 21, 58, 104, 82, 98, 20, 31, 80, 75, 124, 11, 29, 61, 95, 97, 22, 85, 92, 112, 84, 77, 16, 78, 13, 17, 119, 14, 28, 25, 81, 103, 121, 87, 15, 33, 27, 9, 26, 50, 19, 126, 83, 37, 111, 89, 79, 101, 73, 60, 116, 47, 118, 23, 63, 39, 72, 59, 12, 108, 76, 55, 51, 44, 70, 117, 96, 115, 110, 35, 30, 125, 0, 120, 64, 71, 7, 123, 56, 91, 114, 1, 36, 42, 6, 2, 54, 4, 94, 106, 100, 32, 52, 67, 122, 46, 65, 68, 99, 5, 69, 66, 3, 74, 8, 10], [49, 62, 41, 105, 57, 113, 48, 45, 127, 43, 109, 107, 38, 40, 53, 34, 102, 93, 24, 104, 88, 86, 18, 58, 82, 90, 21, 98, 80, 20, 11, 75, 124, 97, 22, 77, 31, 92, 16, 85, 13, 29, 84, 95, 112, 78, 61, 14, 81, 17, 119, 103, 121, 73, 15, 25, 9, 27, 126, 79, 83, 87, 26, 19, 23, 110, 28, 50, 89, 76, 116, 72, 12, 111, 47, 44, 33, 118, 115, 60, 51, 37, 108, 55, 6, 39, 101, 59, 70, 71, 64, 117, 0, 123, 125, 63, 7, 35, 120, 30, 52, 36, 2, 46, 1, 54, 66, 96, 69, 91, 65, 56, 114, 3, 122, 42, 32, 67, 68, 4, 99, 10, 100, 94, 74, 8, 106, 5], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 40, 102, 53, 93, 34, 104, 88, 58, 24, 86, 90, 21, 18, 98, 82, 31, 97, 84, 20, 124, 95, 11, 85, 61, 22, 16, 80, 13, 29, 112, 92, 78, 75, 77, 14, 81, 28, 26, 17, 119, 121, 103, 15, 23, 126, 87, 27, 25, 110, 44, 19, 79, 9, 89, 73, 33, 111, 59, 115, 50, 83, 6, 37, 108, 12, 55, 76, 118, 47, 116, 91, 101, 35, 60, 96, 51, 63, 117, 39, 72, 64, 0, 71, 125, 123, 7, 52, 36, 42, 32, 114, 56, 122, 94, 120, 3, 30, 65, 46, 2, 70, 66, 1, 4, 100, 106, 68, 10, 8, 54, 99, 67, 69, 5, 74], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 53, 102, 40, 88, 93, 34, 104, 24, 18, 86, 21, 90, 82, 58, 98, 80, 61, 20, 31, 124, 22, 85, 11, 92, 84, 29, 112, 95, 75, 13, 97, 16, 77, 119, 78, 14, 26, 17, 81, 28, 126, 15, 103, 23, 87, 83, 9, 33, 110, 6, 27, 25, 121, 111, 51, 89, 44, 118, 50, 115, 73, 79, 60, 19, 12, 37, 39, 108, 55, 76, 47, 59, 63, 0, 116, 123, 35, 101, 91, 32, 64, 72, 36, 71, 96, 7, 120, 125, 65, 117, 100, 30, 114, 2, 56, 1, 122, 66, 42, 54, 3, 94, 8, 5, 46, 4, 67, 52, 99, 106, 68, 69, 70, 10, 74], [49, 62, 41, 105, 57, 113, 48, 45, 109, 127, 43, 107, 38, 53, 40, 102, 34, 24, 88, 93, 104, 58, 18, 82, 86, 98, 90, 11, 31, 80, 21, 20, 85, 124, 75, 92, 77, 112, 97, 22, 16, 84, 95, 13, 29, 61, 78, 81, 14, 28, 119, 103, 26, 17, 25, 9, 6, 126, 87, 73, 111, 79, 23, 121, 83, 15, 19, 33, 27, 89, 50, 115, 37, 118, 47, 110, 116, 55, 108, 60, 76, 123, 12, 44, 59, 39, 63, 117, 7, 72, 35, 101, 51, 56, 64, 125, 96, 91, 71, 114, 8, 36, 120, 0, 30, 65, 2, 68, 32, 46, 66, 3, 99, 54, 4, 52, 70, 100, 94, 122, 1, 69, 5, 42, 106, 67, 74, 10], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 40, 102, 34, 93, 53, 88, 24, 18, 86, 104, 21, 90, 58, 82, 22, 20, 98, 16, 124, 61, 29, 31, 11, 97, 80, 112, 84, 75, 85, 92, 95, 77, 13, 28, 78, 119, 14, 103, 81, 26, 17, 25, 121, 73, 87, 23, 89, 9, 83, 27, 79, 12, 33, 108, 6, 126, 15, 50, 44, 19, 39, 111, 37, 118, 60, 76, 51, 110, 116, 117, 35, 55, 59, 101, 47, 7, 56, 63, 0, 123, 125, 115, 8, 122, 120, 64, 1, 96, 30, 70, 72, 2, 114, 71, 52, 91, 54, 46, 3, 36, 100, 32, 66, 42, 5, 67, 94, 106, 65, 99, 68, 69, 74, 4, 10], [49, 62, 41, 105, 57, 113, 48, 109, 127, 45, 43, 107, 38, 40, 34, 102, 88, 53, 93, 104, 24, 90, 18, 86, 22, 82, 58, 20, 21, 11, 31, 98, 92, 85, 112, 97, 75, 80, 16, 84, 124, 29, 61, 77, 78, 95, 13, 119, 28, 81, 14, 17, 26, 15, 25, 103, 19, 73, 23, 27, 83, 9, 89, 87, 79, 33, 126, 111, 50, 121, 60, 12, 51, 118, 37, 125, 110, 44, 70, 76, 55, 108, 6, 8, 115, 7, 39, 116, 35, 0, 101, 47, 63, 64, 59, 123, 56, 114, 117, 122, 120, 42, 71, 1, 30, 91, 96, 65, 100, 54, 52, 69, 2, 32, 67, 36, 106, 5, 94, 66, 3, 72, 46, 68, 10, 74, 4, 99], [49, 62, 41, 105, 57, 113, 48, 109, 127, 45, 43, 107, 38, 53, 102, 40, 88, 104, 24, 34, 93, 90, 18, 86, 82, 58, 98, 22, 124, 20, 29, 112, 21, 61, 85, 80, 92, 75, 31, 11, 16, 84, 77, 95, 78, 81, 97, 17, 126, 119, 13, 14, 25, 103, 28, 26, 87, 23, 83, 33, 27, 9, 73, 110, 121, 115, 79, 125, 15, 51, 89, 44, 12, 50, 108, 37, 55, 39, 60, 8, 111, 70, 47, 76, 35, 36, 19, 118, 116, 59, 117, 7, 101, 63, 6, 91, 0, 114, 100, 56, 65, 64, 42, 122, 66, 2, 32, 71, 94, 96, 67, 120, 123, 30, 54, 68, 1, 52, 69, 3, 46, 5, 106, 4, 10, 99, 72, 74], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 53, 40, 102, 88, 24, 34, 18, 104, 93, 82, 90, 86, 75, 58, 11, 80, 21, 98, 124, 20, 85, 16, 61, 31, 22, 77, 92, 78, 97, 84, 13, 112, 29, 95, 81, 17, 14, 119, 70, 9, 103, 28, 126, 83, 25, 26, 73, 64, 87, 79, 121, 23, 115, 15, 27, 19, 33, 8, 89, 76, 12, 7, 118, 44, 111, 51, 55, 0, 116, 50, 2, 110, 66, 101, 60, 59, 125, 108, 117, 47, 68, 67, 65, 37, 56, 6, 39, 71, 123, 63, 3, 1, 91, 5, 52, 35, 96, 122, 36, 114, 100, 30, 46, 4, 69, 54, 120, 94, 72, 10, 106, 32, 42, 74, 99], [62, 49, 41, 105, 57, 113, 48, 127, 45, 109, 43, 107, 38, 102, 40, 53, 34, 93, 88, 58, 24, 104, 21, 124, 11, 86, 90, 18, 98, 75, 84, 82, 29, 61, 22, 31, 95, 92, 112, 80, 20, 85, 78, 97, 77, 16, 13, 26, 17, 14, 103, 119, 81, 23, 121, 25, 70, 15, 73, 28, 27, 9, 126, 8, 87, 89, 37, 19, 50, 12, 44, 47, 33, 51, 125, 83, 116, 115, 7, 110, 118, 60, 108, 79, 111, 59, 0, 39, 35, 55, 101, 76, 63, 64, 56, 91, 114, 2, 96, 1, 117, 30, 67, 100, 36, 71, 66, 52, 46, 122, 3, 123, 120, 4, 32, 69, 65, 54, 6, 5, 42, 94, 68, 99, 106, 72, 10, 74], [49, 62, 41, 105, 57, 113, 48, 45, 127, 109, 43, 107, 38, 102, 40, 24, 53, 88, 93, 34, 18, 90, 82, 86, 11, 58, 75, 21, 20, 104, 98, 22, 16, 80, 124, 85, 31, 92, 29, 77, 13, 84, 97, 112, 78, 61, 95, 14, 17, 81, 19, 119, 25, 103, 9, 121, 26, 87, 73, 15, 28, 70, 126, 23, 79, 89, 37, 50, 33, 44, 8, 27, 12, 116, 111, 83, 76, 101, 60, 115, 0, 47, 118, 125, 55, 7, 59, 56, 117, 64, 51, 108, 66, 96, 39, 110, 65, 63, 2, 123, 35, 71, 94, 5, 114, 1, 91, 67, 30, 6, 3, 54, 46, 120, 122, 68, 69, 74, 72, 4, 100, 42, 32, 36, 99, 52, 106, 10], [49, 62, 41, 105, 113, 57, 48, 45, 127, 109, 43, 107, 38, 40, 102, 53, 88, 93, 24, 58, 34, 82, 90, 18, 104, 124, 21, 20, 112, 11, 86, 85, 16, 22, 77, 97, 75, 98, 80, 92, 84, 95, 13, 61, 31, 14, 103, 29, 78, 17, 81, 119, 15, 28, 19, 23, 121, 115, 25, 9, 83, 126, 27, 73, 79, 70, 87, 26, 116, 37, 12, 44, 89, 33, 50, 59, 51, 39, 60, 108, 35, 111, 101, 118, 125, 76, 55, 7, 8, 110, 64, 6, 47, 117, 63, 114, 0, 56, 123, 91, 2, 96, 122, 46, 65, 69, 54, 71, 30, 72, 100, 1, 52, 66, 68, 36, 67, 3, 4, 42, 120, 94, 5, 32, 99, 106, 10, 74], [62, 49, 41, 105, 57, 113, 48, 127, 45, 109, 43, 107, 38, 102, 40, 53, 88, 34, 24, 93, 82, 18, 90, 86, 75, 104, 21, 77, 11, 20, 58, 22, 85, 124, 16, 98, 31, 84, 13, 97, 95, 92, 112, 80, 61, 29, 14, 78, 17, 81, 28, 26, 119, 79, 87, 9, 15, 19, 115, 103, 73, 83, 126, 25, 12, 37, 23, 27, 33, 121, 50, 108, 89, 70, 76, 7, 118, 111, 8, 6, 44, 51, 60, 63, 116, 59, 47, 72, 39, 110, 71, 96, 0, 101, 123, 91, 55, 117, 35, 64, 68, 56, 125, 114, 2, 1, 100, 66, 46, 42, 36, 5, 122, 65, 67, 94, 10, 120, 30, 54, 4, 74, 52, 32, 69, 99, 3, 106]], "model.layers.13.self_attn.q_proj": [[115, 103, 62, 124, 126, 50, 93, 54, 90, 20, 89, 13, 122, 74, 83, 23, 127, 33, 7, 82, 61, 39, 48, 112, 80, 56, 22, 25, 2, 91, 92, 29, 31, 0, 53, 52, 67, 117, 116, 119, 51, 120, 77, 4, 65, 121, 6, 5, 87, 69, 125, 21, 15, 12, 100, 114, 64, 118, 60, 59, 66, 49, 3, 16, 95, 34, 55, 37, 28, 19, 111, 17, 18, 68, 58, 99, 85, 72, 63, 32, 57, 81, 44, 9, 70, 30, 84, 36, 88, 101, 96, 24, 113, 45, 11, 109, 35, 94, 102, 98, 26, 123, 38, 27, 14, 76, 78, 73, 8, 47, 107, 86, 43, 46, 41, 40, 42, 105, 110, 108, 79, 71, 75, 104, 97, 1, 106, 10], [115, 62, 103, 124, 50, 54, 126, 23, 74, 67, 61, 90, 122, 127, 56, 112, 7, 53, 93, 82, 6, 39, 77, 116, 48, 2, 68, 117, 52, 0, 89, 33, 121, 114, 51, 120, 85, 15, 31, 25, 119, 49, 125, 60, 11, 111, 22, 118, 55, 63, 59, 37, 69, 58, 113, 57, 95, 44, 109, 45, 9, 110, 46, 107, 38, 35, 43, 83, 13, 65, 123, 108, 102, 47, 20, 41, 34, 42, 104, 105, 106, 100, 92, 64, 5, 98, 73, 28, 40, 96, 101, 66, 32, 99, 79, 36, 87, 18, 72, 14, 30, 29, 88, 24, 94, 21, 10, 80, 26, 16, 19, 86, 4, 70, 84, 12, 3, 91, 81, 97, 27, 76, 1, 78, 17, 75, 8, 71], [62, 115, 103, 39, 124, 127, 126, 50, 90, 61, 54, 112, 122, 23, 11, 7, 15, 20, 82, 52, 74, 53, 89, 116, 51, 119, 6, 56, 14, 48, 0, 117, 68, 67, 79, 120, 84, 121, 125, 114, 59, 80, 55, 111, 100, 118, 49, 95, 60, 72, 22, 66, 57, 33, 63, 102, 58, 113, 2, 107, 73, 43, 109, 110, 93, 34, 37, 40, 101, 31, 46, 92, 45, 35, 13, 18, 44, 28, 106, 25, 98, 81, 38, 108, 96, 123, 47, 32, 69, 41, 42, 105, 104, 99, 87, 88, 16, 24, 94, 85, 77, 83, 30, 29, 91, 26, 8, 27, 9, 86, 75, 36, 70, 12, 76, 78, 65, 97, 5, 19, 64, 10, 17, 21, 71, 4, 1, 3], [115, 62, 103, 50, 126, 124, 54, 90, 61, 122, 127, 74, 112, 67, 82, 48, 56, 7, 39, 117, 77, 52, 83, 116, 114, 93, 2, 53, 120, 125, 51, 119, 121, 18, 92, 13, 6, 111, 0, 89, 49, 59, 60, 88, 109, 55, 63, 58, 118, 33, 45, 57, 23, 11, 108, 113, 47, 21, 43, 25, 102, 107, 40, 46, 65, 44, 69, 123, 66, 64, 87, 110, 37, 15, 42, 20, 105, 35, 104, 41, 106, 5, 100, 19, 101, 68, 98, 17, 73, 38, 91, 3, 95, 96, 29, 34, 9, 85, 36, 99, 10, 31, 32, 70, 4, 28, 22, 30, 12, 16, 94, 72, 26, 27, 81, 24, 1, 97, 84, 76, 86, 80, 14, 78, 71, 79, 8, 75], [123, 127, 60, 62, 38, 121, 53, 56, 61, 122, 110, 120, 59, 57, 54, 52, 44, 114, 119, 51, 125, 118, 116, 91, 117, 124, 126, 55, 115, 48, 58, 47, 113, 49, 90, 50, 111, 63, 102, 82, 46, 108, 109, 32, 112, 14, 104, 45, 29, 100, 84, 34, 103, 99, 107, 37, 86, 43, 97, 105, 39, 22, 42, 106, 36, 24, 40, 33, 89, 98, 41, 94, 101, 27, 30, 23, 75, 20, 88, 35, 96, 18, 25, 26, 95, 93, 80, 31, 78, 19, 28, 8, 9, 21, 87, 11, 81, 83, 64, 68, 85, 92, 13, 66, 17, 1, 12, 76, 77, 16, 79, 70, 71, 5, 15, 72, 3, 7, 10, 65, 6, 67, 73, 4, 0, 69, 2, 74], [127, 123, 38, 121, 62, 60, 37, 53, 56, 61, 122, 57, 120, 54, 116, 51, 59, 114, 52, 58, 44, 32, 119, 125, 36, 118, 124, 117, 49, 126, 115, 90, 110, 55, 108, 47, 50, 111, 113, 63, 89, 107, 82, 109, 91, 112, 106, 100, 48, 87, 14, 46, 84, 102, 42, 96, 45, 103, 34, 27, 25, 20, 22, 104, 40, 78, 8, 97, 94, 39, 31, 43, 95, 98, 29, 105, 101, 41, 30, 33, 99, 86, 1, 35, 75, 9, 81, 18, 26, 19, 93, 92, 24, 28, 5, 12, 68, 3, 71, 76, 64, 0, 79, 13, 88, 70, 21, 72, 23, 17, 85, 10, 15, 66, 16, 67, 83, 73, 7, 6, 80, 11, 77, 4, 2, 69, 65, 74], [123, 127, 62, 60, 121, 1, 53, 61, 56, 122, 57, 115, 120, 54, 52, 63, 51, 58, 59, 32, 114, 116, 124, 113, 125, 118, 119, 117, 49, 110, 126, 107, 55, 87, 37, 0, 111, 47, 50, 48, 112, 20, 109, 106, 104, 91, 108, 64, 46, 38, 44, 45, 67, 75, 66, 41, 42, 39, 43, 100, 40, 105, 78, 82, 103, 3, 36, 26, 71, 101, 34, 22, 68, 99, 18, 95, 72, 14, 5, 98, 88, 13, 33, 28, 79, 30, 94, 35, 102, 27, 96, 16, 97, 92, 9, 8, 93, 10, 76, 21, 17, 11, 6, 83, 2, 70, 15, 29, 23, 85, 84, 86, 31, 12, 24, 90, 25, 19, 73, 65, 7, 81, 89, 4, 69, 80, 77, 74], [127, 123, 38, 24, 83, 17, 28, 79, 90, 26, 10, 97, 13, 60, 31, 86, 104, 121, 84, 76, 4, 42, 7, 29, 62, 74, 77, 85, 40, 88, 19, 2, 53, 46, 70, 92, 15, 49, 81, 89, 57, 25, 20, 91, 9, 6, 5, 71, 64, 58, 101, 21, 116, 12, 56, 33, 18, 112, 78, 80, 99, 27, 82, 16, 87, 54, 23, 1, 65, 0, 95, 102, 14, 75, 93, 32, 51, 109, 69, 67, 115, 73, 52, 61, 94, 11, 108, 96, 30, 36, 22, 68, 59, 125, 48, 122, 72, 8, 37, 34, 55, 126, 114, 124, 66, 63, 50, 120, 43, 45, 117, 119, 103, 100, 118, 44, 3, 110, 113, 47, 98, 106, 35, 41, 107, 111, 105, 39], [121, 60, 39, 123, 120, 89, 61, 54, 49, 119, 65, 51, 122, 116, 62, 78, 127, 71, 126, 59, 112, 58, 117, 92, 110, 48, 118, 11, 50, 57, 124, 80, 53, 4, 55, 18, 27, 56, 44, 47, 68, 115, 125, 23, 63, 43, 111, 96, 69, 105, 46, 90, 99, 103, 52, 113, 106, 45, 109, 9, 114, 70, 107, 0, 101, 28, 84, 104, 91, 3, 42, 108, 73, 66, 38, 40, 82, 41, 102, 22, 95, 2, 93, 94, 36, 100, 25, 37, 10, 30, 34, 97, 98, 86, 79, 16, 33, 64, 8, 5, 29, 87, 19, 14, 20, 74, 31, 7, 35, 83, 15, 12, 72, 26, 81, 88, 17, 21, 32, 75, 77, 85, 24, 1, 67, 13, 76, 6], [60, 121, 39, 89, 123, 120, 54, 61, 116, 62, 49, 119, 23, 96, 18, 126, 84, 51, 80, 122, 117, 16, 55, 27, 127, 57, 124, 48, 9, 25, 95, 86, 92, 59, 58, 118, 28, 22, 112, 53, 50, 56, 47, 15, 65, 13, 99, 78, 69, 110, 63, 115, 52, 106, 7, 34, 94, 30, 103, 19, 111, 75, 82, 125, 105, 11, 97, 32, 44, 43, 114, 10, 46, 45, 108, 113, 21, 104, 71, 20, 77, 109, 107, 42, 38, 40, 85, 35, 66, 41, 37, 73, 93, 87, 81, 0, 91, 100, 36, 90, 31, 98, 12, 102, 33, 26, 29, 101, 70, 2, 68, 74, 83, 24, 79, 3, 17, 88, 14, 4, 67, 76, 72, 64, 5, 8, 6, 1], [121, 39, 60, 123, 120, 89, 54, 116, 23, 61, 84, 27, 28, 11, 92, 96, 51, 49, 18, 80, 99, 16, 122, 48, 62, 75, 126, 119, 86, 124, 127, 118, 58, 59, 117, 50, 55, 112, 78, 57, 22, 56, 30, 25, 47, 110, 53, 94, 95, 82, 115, 106, 111, 29, 7, 9, 44, 114, 45, 91, 15, 109, 90, 103, 46, 63, 113, 52, 33, 105, 2, 107, 4, 20, 42, 65, 102, 81, 71, 40, 104, 101, 87, 70, 43, 19, 108, 93, 125, 77, 85, 36, 37, 98, 35, 41, 79, 3, 97, 21, 6, 32, 31, 100, 83, 0, 38, 34, 68, 69, 26, 17, 14, 73, 88, 12, 64, 24, 67, 76, 13, 10, 8, 74, 72, 5, 66, 1], [60, 121, 39, 123, 54, 84, 120, 116, 89, 61, 49, 51, 18, 16, 92, 117, 99, 28, 122, 27, 126, 82, 112, 119, 62, 127, 103, 23, 124, 48, 96, 118, 58, 59, 57, 94, 55, 53, 86, 50, 9, 88, 47, 95, 56, 52, 110, 11, 25, 115, 22, 65, 30, 71, 69, 15, 42, 114, 111, 75, 45, 46, 125, 33, 106, 102, 63, 113, 43, 78, 91, 107, 90, 101, 104, 34, 41, 44, 77, 109, 40, 38, 87, 105, 0, 13, 36, 29, 85, 31, 108, 26, 20, 7, 98, 68, 14, 37, 93, 97, 100, 24, 19, 66, 32, 35, 81, 21, 80, 17, 83, 70, 2, 79, 76, 12, 74, 73, 8, 64, 5, 72, 67, 3, 4, 10, 6, 1], [51, 118, 102, 48, 90, 24, 97, 53, 126, 61, 58, 117, 18, 121, 29, 62, 63, 54, 123, 86, 50, 19, 9, 80, 124, 26, 47, 127, 111, 93, 120, 112, 125, 38, 115, 23, 59, 116, 119, 27, 60, 91, 73, 17, 46, 55, 78, 108, 20, 113, 12, 104, 103, 114, 52, 101, 57, 71, 45, 56, 39, 89, 110, 82, 11, 87, 88, 30, 107, 36, 5, 77, 109, 66, 49, 42, 13, 44, 99, 16, 14, 33, 43, 41, 98, 83, 100, 28, 74, 122, 94, 105, 31, 34, 40, 25, 106, 37, 75, 32, 35, 96, 21, 81, 95, 85, 22, 84, 92, 15, 72, 3, 7, 68, 70, 79, 76, 64, 65, 69, 2, 1, 0, 8, 10, 4, 6, 67], [118, 51, 102, 48, 61, 126, 58, 123, 53, 124, 127, 120, 62, 117, 63, 90, 54, 60, 19, 125, 121, 119, 97, 116, 59, 41, 113, 50, 3, 56, 52, 55, 29, 112, 47, 73, 57, 0, 86, 115, 38, 93, 78, 24, 26, 111, 92, 81, 122, 46, 9, 5, 1, 109, 114, 43, 69, 87, 22, 89, 103, 94, 30, 45, 110, 108, 49, 7, 44, 66, 105, 39, 98, 107, 99, 104, 106, 91, 42, 83, 28, 2, 75, 14, 18, 4, 40, 25, 76, 37, 100, 6, 84, 65, 12, 32, 35, 96, 17, 16, 33, 82, 101, 67, 21, 13, 36, 34, 15, 8, 85, 77, 11, 64, 31, 20, 88, 80, 79, 95, 71, 23, 10, 74, 27, 70, 72, 68], [51, 102, 118, 48, 24, 97, 53, 90, 19, 58, 126, 47, 63, 61, 86, 81, 38, 18, 121, 117, 93, 123, 29, 62, 60, 124, 80, 9, 50, 99, 89, 59, 112, 54, 127, 120, 45, 94, 125, 108, 119, 13, 75, 100, 11, 39, 57, 111, 113, 114, 91, 104, 88, 77, 115, 26, 116, 3, 5, 109, 73, 36, 56, 37, 98, 49, 52, 110, 103, 78, 105, 107, 42, 55, 28, 66, 41, 122, 46, 30, 71, 44, 40, 83, 35, 43, 31, 34, 27, 95, 82, 22, 87, 17, 106, 85, 92, 20, 32, 101, 23, 21, 84, 69, 15, 96, 25, 16, 7, 74, 33, 14, 72, 0, 70, 67, 10, 1, 79, 64, 12, 8, 76, 65, 68, 6, 4, 2], [51, 102, 118, 48, 24, 97, 61, 86, 53, 29, 126, 19, 90, 63, 124, 115, 78, 75, 18, 58, 80, 81, 50, 123, 93, 66, 87, 62, 89, 30, 77, 39, 117, 16, 34, 99, 54, 9, 88, 32, 47, 116, 91, 45, 120, 37, 36, 125, 96, 83, 28, 111, 31, 44, 127, 3, 59, 100, 26, 121, 11, 95, 60, 103, 38, 52, 119, 23, 109, 71, 35, 112, 92, 17, 113, 5, 94, 20, 105, 27, 110, 57, 98, 101, 106, 56, 74, 114, 41, 25, 107, 70, 108, 104, 69, 46, 33, 0, 82, 55, 21, 1, 14, 72, 49, 40, 10, 85, 43, 64, 13, 73, 122, 42, 22, 7, 15, 84, 6, 68, 12, 79, 4, 76, 8, 65, 67, 2], [47, 125, 111, 56, 14, 124, 60, 24, 120, 127, 117, 54, 118, 49, 123, 5, 57, 91, 61, 75, 2, 21, 96, 59, 122, 1, 62, 94, 55, 7, 126, 63, 26, 58, 50, 121, 48, 103, 39, 53, 9, 3, 119, 64, 116, 104, 101, 98, 40, 113, 23, 4, 20, 100, 83, 42, 114, 52, 115, 112, 37, 36, 13, 82, 18, 97, 10, 17, 85, 72, 108, 44, 22, 110, 51, 89, 73, 46, 25, 27, 6, 43, 35, 19, 78, 29, 15, 107, 41, 71, 31, 45, 81, 32, 16, 79, 106, 105, 8, 28, 34, 99, 93, 109, 87, 74, 33, 88, 102, 70, 38, 92, 95, 76, 86, 80, 11, 77, 90, 30, 65, 67, 84, 0, 12, 69, 66, 68], [47, 111, 125, 56, 98, 26, 124, 2, 5, 60, 117, 103, 85, 64, 39, 36, 61, 54, 120, 127, 92, 37, 123, 49, 4, 87, 91, 97, 126, 122, 118, 1, 59, 57, 62, 20, 14, 6, 89, 48, 7, 3, 55, 24, 9, 50, 63, 96, 121, 113, 53, 65, 29, 58, 75, 13, 27, 114, 51, 101, 10, 41, 52, 81, 83, 84, 116, 94, 35, 42, 71, 72, 46, 119, 38, 112, 44, 8, 22, 115, 106, 28, 90, 100, 16, 31, 105, 99, 107, 43, 34, 109, 0, 23, 45, 95, 110, 11, 104, 108, 68, 76, 73, 86, 102, 25, 40, 32, 79, 15, 17, 66, 21, 33, 74, 80, 30, 19, 82, 93, 67, 70, 69, 77, 18, 88, 78, 12], [125, 47, 111, 56, 60, 124, 14, 39, 117, 26, 127, 118, 120, 54, 61, 49, 123, 5, 57, 103, 126, 62, 94, 59, 122, 81, 55, 63, 98, 50, 48, 23, 121, 19, 53, 20, 46, 58, 113, 116, 76, 114, 2, 8, 1, 119, 97, 112, 64, 4, 10, 75, 115, 107, 51, 52, 38, 44, 21, 72, 87, 65, 73, 16, 104, 100, 41, 3, 90, 42, 43, 45, 71, 96, 108, 110, 106, 105, 37, 99, 86, 6, 109, 17, 24, 36, 40, 22, 79, 80, 91, 28, 9, 7, 83, 101, 85, 13, 74, 34, 102, 33, 92, 18, 35, 82, 27, 95, 31, 29, 11, 78, 84, 88, 68, 30, 0, 25, 89, 77, 12, 32, 69, 93, 15, 70, 66, 67], [47, 125, 111, 36, 56, 24, 96, 124, 91, 75, 25, 120, 100, 7, 60, 49, 83, 29, 16, 99, 84, 21, 2, 54, 94, 1, 14, 127, 117, 5, 61, 57, 80, 26, 103, 40, 118, 123, 22, 97, 64, 33, 4, 46, 35, 44, 48, 59, 110, 85, 126, 62, 104, 13, 122, 9, 27, 28, 58, 89, 3, 20, 6, 31, 116, 90, 77, 53, 108, 15, 43, 93, 121, 101, 42, 39, 119, 87, 50, 55, 17, 79, 63, 88, 98, 67, 112, 95, 82, 106, 18, 92, 23, 113, 41, 34, 30, 52, 37, 38, 102, 114, 19, 71, 32, 45, 115, 105, 74, 109, 51, 107, 11, 70, 76, 78, 8, 10, 68, 12, 65, 72, 73, 81, 86, 0, 69, 66], [123, 63, 39, 114, 57, 115, 61, 127, 121, 124, 59, 62, 120, 54, 49, 93, 116, 125, 110, 113, 97, 50, 25, 53, 58, 112, 56, 55, 118, 51, 117, 92, 52, 80, 126, 60, 42, 122, 45, 48, 22, 111, 44, 28, 108, 119, 102, 46, 107, 43, 95, 41, 40, 47, 26, 103, 100, 88, 106, 19, 37, 104, 105, 34, 109, 35, 96, 17, 86, 31, 87, 36, 99, 32, 101, 94, 38, 24, 9, 29, 6, 91, 16, 3, 98, 76, 12, 72, 30, 90, 78, 33, 83, 73, 27, 68, 0, 67, 23, 18, 81, 20, 89, 85, 77, 15, 65, 2, 1, 5, 84, 79, 21, 69, 82, 11, 8, 14, 75, 70, 13, 4, 74, 66, 71, 10, 64, 7], [63, 123, 114, 39, 57, 115, 61, 59, 127, 124, 121, 62, 54, 125, 120, 116, 113, 49, 60, 117, 58, 53, 122, 88, 56, 50, 52, 48, 19, 55, 126, 51, 28, 118, 112, 119, 43, 42, 93, 45, 97, 38, 110, 46, 6, 0, 41, 85, 104, 95, 86, 102, 107, 111, 92, 47, 87, 105, 16, 106, 103, 109, 44, 76, 25, 37, 108, 81, 40, 34, 96, 35, 73, 80, 15, 101, 22, 99, 100, 36, 26, 67, 3, 83, 94, 32, 29, 90, 98, 23, 69, 24, 33, 17, 27, 30, 72, 31, 91, 5, 2, 12, 78, 65, 84, 70, 79, 9, 89, 75, 1, 20, 4, 68, 21, 14, 18, 77, 82, 74, 11, 8, 64, 10, 13, 66, 71, 7], [39, 63, 123, 25, 82, 97, 87, 13, 115, 22, 19, 114, 74, 80, 70, 57, 6, 77, 3, 66, 83, 7, 18, 64, 84, 91, 31, 4, 12, 94, 29, 27, 10, 9, 79, 93, 92, 16, 95, 24, 75, 30, 21, 90, 86, 5, 59, 85, 81, 76, 58, 104, 78, 23, 15, 20, 89, 17, 1, 127, 71, 120, 14, 73, 61, 54, 8, 88, 124, 62, 113, 116, 43, 72, 68, 109, 49, 96, 118, 99, 32, 2, 65, 28, 11, 55, 100, 34, 35, 98, 121, 48, 107, 37, 69, 45, 110, 40, 26, 41, 50, 51, 53, 122, 67, 102, 42, 126, 0, 60, 125, 46, 56, 105, 52, 36, 38, 119, 111, 112, 117, 33, 101, 108, 44, 103, 106, 47], [63, 123, 39, 115, 57, 28, 114, 80, 97, 19, 25, 22, 127, 61, 124, 87, 9, 62, 59, 96, 88, 120, 92, 58, 54, 113, 116, 104, 121, 49, 111, 3, 12, 125, 117, 53, 51, 24, 107, 95, 20, 60, 50, 31, 126, 52, 48, 15, 56, 72, 21, 77, 112, 55, 34, 122, 41, 46, 82, 118, 40, 119, 13, 42, 94, 102, 43, 37, 30, 45, 110, 99, 23, 16, 83, 93, 84, 70, 85, 29, 90, 100, 27, 91, 44, 105, 109, 78, 38, 108, 65, 68, 106, 98, 26, 35, 47, 36, 86, 76, 81, 17, 33, 8, 101, 5, 14, 79, 32, 74, 2, 89, 73, 64, 1, 7, 75, 18, 4, 103, 6, 10, 0, 69, 67, 11, 71, 66], [38, 47, 62, 111, 93, 83, 16, 26, 33, 76, 24, 9, 23, 86, 78, 92, 81, 118, 90, 68, 1, 70, 4, 34, 73, 74, 80, 29, 84, 20, 12, 87, 14, 19, 101, 2, 77, 75, 21, 71, 51, 64, 22, 85, 15, 63, 5, 10, 91, 66, 112, 95, 17, 11, 28, 82, 67, 88, 98, 36, 27, 13, 79, 7, 69, 65, 0, 53, 32, 30, 119, 18, 89, 56, 31, 52, 6, 3, 100, 8, 25, 72, 120, 104, 61, 122, 123, 114, 102, 96, 127, 110, 117, 49, 124, 55, 94, 57, 48, 115, 60, 109, 116, 50, 99, 35, 121, 106, 59, 58, 126, 46, 42, 40, 97, 125, 54, 105, 103, 44, 37, 39, 108, 113, 45, 41, 43, 107], [38, 47, 62, 111, 33, 24, 83, 93, 79, 81, 76, 100, 86, 118, 78, 29, 67, 26, 28, 16, 74, 6, 112, 32, 87, 90, 84, 119, 53, 92, 77, 34, 63, 18, 61, 120, 70, 85, 122, 56, 71, 114, 52, 13, 25, 98, 82, 23, 3, 88, 27, 19, 75, 51, 2, 20, 127, 95, 123, 22, 9, 60, 55, 121, 99, 117, 10, 17, 5, 49, 80, 115, 48, 94, 124, 0, 1, 50, 58, 113, 91, 4, 57, 89, 30, 8, 21, 15, 31, 12, 125, 68, 101, 116, 110, 14, 104, 106, 107, 59, 11, 96, 126, 73, 105, 72, 54, 37, 46, 35, 40, 7, 36, 108, 102, 45, 43, 109, 44, 66, 39, 41, 42, 103, 97, 65, 69, 64], [62, 111, 47, 38, 118, 120, 51, 63, 56, 53, 119, 100, 117, 58, 48, 122, 55, 124, 60, 123, 113, 52, 108, 116, 115, 127, 57, 112, 114, 121, 61, 49, 50, 126, 54, 18, 104, 59, 91, 125, 106, 109, 107, 110, 46, 27, 101, 28, 41, 84, 45, 44, 95, 35, 37, 105, 40, 43, 42, 103, 96, 92, 99, 36, 39, 24, 98, 33, 32, 87, 23, 21, 34, 31, 30, 93, 86, 94, 29, 89, 26, 97, 102, 82, 25, 20, 22, 90, 88, 81, 85, 79, 83, 15, 13, 77, 17, 8, 74, 67, 10, 72, 78, 64, 16, 3, 75, 19, 7, 76, 70, 69, 71, 11, 80, 14, 6, 5, 66, 0, 12, 4, 2, 65, 1, 9, 68, 73], [62, 111, 47, 118, 120, 100, 38, 56, 119, 52, 53, 112, 58, 63, 48, 57, 123, 113, 122, 55, 125, 117, 60, 115, 54, 51, 121, 114, 50, 124, 127, 61, 49, 126, 59, 116, 109, 110, 107, 36, 46, 104, 105, 106, 44, 108, 28, 45, 43, 27, 42, 41, 35, 103, 39, 91, 40, 82, 101, 84, 95, 33, 34, 92, 23, 37, 99, 102, 31, 18, 98, 93, 87, 32, 21, 30, 29, 96, 94, 86, 24, 97, 26, 22, 25, 89, 90, 67, 81, 20, 79, 6, 85, 10, 15, 12, 88, 8, 64, 72, 78, 70, 83, 19, 14, 17, 13, 66, 3, 7, 69, 77, 74, 11, 80, 16, 65, 75, 76, 0, 73, 71, 5, 2, 4, 68, 9, 1], [42, 43, 33, 72, 38, 1, 5, 91, 113, 106, 0, 21, 114, 12, 79, 24, 30, 67, 81, 19, 119, 61, 46, 124, 82, 49, 2, 107, 74, 53, 121, 122, 69, 66, 75, 71, 13, 25, 47, 112, 28, 126, 86, 77, 94, 87, 125, 45, 4, 117, 23, 3, 123, 70, 88, 50, 20, 80, 48, 65, 68, 89, 76, 63, 101, 100, 116, 8, 15, 103, 59, 27, 17, 110, 99, 7, 60, 14, 118, 11, 62, 104, 41, 64, 58, 31, 6, 115, 102, 57, 52, 92, 37, 127, 109, 51, 9, 78, 32, 22, 111, 90, 83, 56, 54, 16, 73, 26, 35, 39, 96, 55, 105, 95, 85, 84, 120, 10, 97, 98, 18, 44, 29, 36, 40, 108, 93, 34], [42, 38, 91, 33, 117, 43, 21, 82, 79, 74, 24, 107, 12, 20, 78, 10, 70, 14, 121, 89, 112, 30, 119, 81, 106, 27, 86, 19, 18, 84, 8, 88, 92, 28, 114, 6, 3, 113, 16, 94, 61, 67, 15, 87, 64, 90, 122, 80, 96, 5, 22, 23, 123, 57, 69, 83, 9, 76, 85, 25, 37, 102, 56, 40, 72, 75, 36, 93, 73, 51, 45, 13, 29, 116, 39, 32, 65, 63, 17, 77, 4, 46, 41, 100, 71, 26, 47, 35, 31, 95, 53, 124, 101, 11, 62, 105, 127, 125, 49, 7, 110, 34, 68, 99, 52, 98, 50, 126, 103, 66, 109, 59, 104, 60, 2, 48, 58, 118, 115, 44, 111, 54, 120, 55, 1, 108, 97, 0], [117, 43, 38, 42, 113, 33, 91, 107, 30, 114, 47, 123, 41, 46, 125, 62, 24, 26, 112, 39, 106, 59, 111, 60, 53, 116, 18, 115, 40, 50, 63, 87, 56, 118, 121, 119, 80, 108, 57, 89, 48, 124, 110, 51, 86, 61, 32, 109, 45, 127, 82, 52, 54, 126, 120, 55, 31, 58, 49, 28, 34, 44, 36, 81, 101, 104, 78, 95, 122, 94, 35, 96, 98, 88, 99, 19, 14, 21, 105, 37, 84, 93, 100, 29, 90, 103, 25, 20, 79, 11, 97, 23, 22, 16, 77, 27, 92, 13, 73, 83, 102, 85, 74, 17, 9, 70, 12, 76, 75, 2, 66, 8, 67, 6, 4, 71, 72, 7, 5, 0, 68, 15, 3, 10, 1, 64, 65, 69], [43, 38, 107, 33, 114, 91, 46, 47, 26, 24, 42, 117, 62, 30, 53, 56, 123, 119, 125, 39, 116, 111, 41, 59, 82, 115, 52, 90, 113, 48, 84, 118, 49, 55, 126, 50, 124, 51, 112, 122, 60, 57, 106, 121, 105, 63, 109, 120, 61, 110, 93, 44, 45, 127, 40, 54, 77, 108, 101, 34, 58, 35, 27, 88, 18, 100, 20, 37, 32, 104, 99, 98, 103, 29, 36, 31, 25, 81, 94, 86, 96, 102, 95, 28, 89, 87, 78, 22, 92, 14, 23, 76, 80, 17, 97, 75, 83, 19, 21, 16, 13, 12, 8, 79, 73, 11, 9, 6, 70, 4, 71, 67, 74, 5, 66, 72, 68, 15, 7, 0, 3, 64, 1, 69, 10, 2, 85, 65]], "model.layers.13.self_attn.k_proj": [[115, 62, 39, 124, 127, 50, 36, 61, 54, 22, 122, 112, 126, 97, 116, 52, 53, 56, 117, 51, 121, 48, 120, 119, 49, 114, 125, 55, 118, 111, 100, 57, 60, 59, 29, 58, 63, 89, 113, 91, 46, 107, 92, 109, 47, 44, 123, 45, 110, 42, 23, 105, 95, 43, 33, 108, 106, 41, 40, 37, 82, 104, 99, 101, 38, 30, 102, 20, 87, 19, 80, 78, 31, 103, 90, 88, 98, 34, 94, 35, 24, 96, 21, 1, 81, 79, 28, 32, 8, 75, 93, 26, 86, 73, 27, 76, 17, 13, 69, 10, 16, 71, 83, 14, 12, 68, 25, 84, 85, 11, 67, 15, 5, 18, 6, 66, 70, 74, 77, 4, 72, 9, 0, 7, 3, 2, 64, 65], [127, 123, 102, 86, 35, 60, 121, 58, 28, 63, 31, 51, 57, 83, 48, 99, 115, 110, 53, 24, 108, 46, 56, 17, 62, 79, 61, 125, 49, 52, 33, 116, 122, 50, 117, 114, 90, 112, 84, 54, 42, 55, 126, 40, 25, 59, 124, 29, 119, 107, 47, 44, 45, 120, 113, 109, 43, 16, 97, 111, 104, 118, 106, 34, 13, 41, 105, 76, 10, 39, 103, 38, 101, 37, 98, 14, 100, 91, 36, 32, 82, 95, 72, 96, 71, 68, 21, 81, 94, 22, 93, 30, 87, 19, 9, 20, 80, 77, 23, 92, 26, 88, 67, 11, 1, 66, 85, 89, 27, 15, 73, 12, 74, 18, 70, 75, 5, 69, 78, 6, 2, 0, 7, 8, 4, 64, 3, 65], [121, 60, 103, 123, 86, 54, 49, 35, 120, 61, 116, 32, 51, 119, 62, 122, 127, 55, 126, 118, 53, 28, 57, 59, 48, 124, 112, 50, 58, 115, 56, 110, 47, 99, 117, 63, 42, 44, 111, 114, 46, 125, 45, 52, 109, 106, 22, 113, 100, 105, 107, 30, 43, 23, 77, 108, 41, 40, 25, 84, 18, 101, 80, 96, 102, 104, 37, 91, 89, 98, 94, 90, 75, 38, 31, 36, 19, 97, 33, 15, 29, 93, 5, 16, 95, 34, 26, 88, 21, 81, 12, 17, 87, 3, 92, 13, 24, 76, 78, 6, 79, 27, 39, 85, 64, 66, 73, 8, 0, 9, 70, 83, 11, 82, 72, 74, 20, 71, 10, 14, 4, 67, 2, 7, 1, 65, 68, 69], [51, 118, 38, 86, 33, 48, 61, 53, 93, 123, 54, 120, 58, 124, 125, 117, 62, 126, 127, 59, 60, 115, 63, 47, 121, 119, 116, 55, 56, 50, 52, 57, 24, 113, 41, 112, 99, 26, 46, 91, 122, 81, 39, 104, 43, 103, 18, 111, 80, 45, 42, 11, 114, 109, 40, 19, 49, 110, 44, 105, 101, 107, 106, 20, 108, 79, 77, 36, 102, 89, 30, 90, 84, 97, 34, 21, 25, 35, 78, 27, 37, 15, 28, 6, 95, 4, 100, 98, 23, 67, 32, 88, 17, 96, 8, 9, 74, 16, 82, 72, 1, 22, 92, 31, 94, 75, 85, 87, 12, 10, 14, 65, 5, 76, 13, 64, 29, 69, 71, 70, 7, 2, 83, 66, 0, 73, 68, 3], [111, 125, 100, 47, 22, 120, 127, 118, 56, 54, 123, 60, 124, 59, 62, 117, 61, 57, 55, 122, 53, 63, 48, 126, 29, 121, 50, 49, 32, 108, 107, 119, 116, 42, 58, 113, 114, 36, 43, 115, 52, 105, 112, 51, 40, 39, 106, 41, 44, 46, 91, 110, 24, 103, 83, 45, 102, 104, 109, 25, 80, 26, 38, 79, 84, 99, 97, 92, 101, 35, 72, 37, 95, 76, 81, 18, 30, 34, 98, 33, 10, 31, 90, 4, 93, 21, 78, 77, 11, 89, 88, 86, 17, 20, 28, 71, 85, 94, 0, 82, 96, 16, 66, 13, 3, 12, 23, 27, 15, 73, 87, 65, 14, 70, 74, 6, 19, 2, 69, 75, 5, 9, 64, 7, 1, 8, 67, 68], [103, 123, 63, 22, 57, 115, 33, 114, 54, 13, 124, 127, 82, 25, 61, 59, 62, 28, 116, 120, 87, 121, 107, 125, 58, 27, 53, 60, 49, 55, 117, 80, 84, 56, 126, 113, 19, 74, 51, 118, 50, 48, 45, 109, 38, 99, 71, 119, 52, 7, 111, 97, 26, 122, 47, 12, 46, 37, 9, 30, 112, 93, 101, 106, 108, 31, 110, 85, 44, 104, 14, 42, 29, 105, 41, 100, 75, 98, 43, 102, 70, 89, 95, 86, 2, 17, 34, 15, 4, 79, 35, 32, 36, 20, 68, 1, 67, 40, 96, 0, 88, 18, 81, 94, 5, 73, 91, 11, 21, 72, 65, 10, 24, 3, 90, 23, 77, 64, 66, 78, 8, 76, 83, 39, 92, 6, 69, 16], [111, 62, 102, 86, 97, 118, 26, 29, 47, 83, 16, 81, 76, 78, 112, 9, 24, 119, 75, 53, 127, 122, 123, 114, 51, 55, 120, 96, 52, 63, 56, 50, 4, 71, 60, 124, 117, 121, 23, 61, 115, 116, 58, 49, 79, 74, 54, 36, 126, 57, 41, 59, 84, 42, 106, 46, 48, 35, 70, 110, 125, 37, 66, 105, 103, 109, 108, 45, 98, 40, 34, 43, 107, 65, 113, 77, 69, 1, 64, 44, 28, 99, 104, 93, 39, 101, 32, 3, 30, 21, 31, 38, 91, 80, 72, 11, 25, 100, 94, 7, 95, 89, 92, 5, 33, 0, 73, 27, 82, 15, 14, 13, 12, 20, 85, 17, 19, 88, 90, 22, 18, 2, 87, 67, 10, 8, 6, 68], [106, 107, 97, 102, 122, 86, 94, 121, 117, 49, 21, 50, 36, 48, 114, 42, 79, 103, 119, 27, 63, 113, 74, 92, 126, 116, 56, 61, 109, 91, 64, 123, 70, 62, 67, 127, 51, 24, 52, 105, 40, 19, 111, 57, 5, 12, 82, 112, 110, 59, 47, 45, 77, 44, 65, 18, 53, 115, 124, 81, 84, 60, 118, 125, 99, 98, 32, 88, 55, 54, 104, 71, 90, 66, 58, 37, 72, 78, 26, 8, 108, 120, 35, 101, 4, 46, 100, 34, 96, 1, 89, 11, 80, 93, 31, 95, 43, 23, 39, 25, 29, 41, 85, 87, 75, 9, 13, 0, 38, 15, 10, 14, 73, 28, 83, 30, 68, 22, 17, 33, 7, 76, 16, 20, 2, 6, 69, 3]], "model.layers.13.self_attn.qk_proj": [[123, 62, 111, 127, 121, 60, 51, 118, 115, 63, 47, 125, 53, 124, 61, 120, 106, 56, 57, 117, 59, 48, 116, 114, 119, 107, 122, 54, 50, 126, 42, 52, 102, 22, 38, 49, 58, 112, 86, 103, 113, 39, 55, 43, 110, 33, 91, 97, 100, 46, 88, 93, 92, 45, 108, 90, 44, 19, 24, 109, 27, 29, 36, 89, 83, 99, 82, 105, 18, 104, 26, 21, 16, 25, 40, 30, 32, 35, 80, 28, 87, 17, 41, 84, 12, 77, 101, 15, 20, 79, 23, 74, 85, 37, 76, 13, 94, 81, 98, 96, 78, 14, 95, 31, 75, 10, 9, 34, 73, 65, 70, 4, 11, 64, 3, 6, 7, 8, 68, 71, 5, 0, 72, 67, 69, 2, 66, 1], [123, 62, 111, 127, 121, 51, 118, 60, 63, 115, 47, 125, 53, 61, 56, 59, 54, 117, 57, 124, 106, 107, 120, 48, 114, 116, 119, 50, 102, 42, 122, 22, 126, 38, 58, 49, 52, 112, 103, 113, 86, 39, 43, 55, 110, 44, 93, 100, 33, 97, 90, 46, 92, 91, 45, 108, 88, 27, 36, 89, 40, 24, 29, 26, 105, 19, 99, 82, 35, 83, 25, 16, 109, 30, 32, 21, 80, 104, 37, 84, 28, 87, 18, 41, 20, 12, 85, 15, 76, 96, 17, 23, 31, 94, 81, 101, 77, 74, 78, 98, 10, 34, 75, 13, 70, 14, 95, 9, 79, 11, 71, 4, 73, 3, 7, 1, 0, 69, 65, 72, 64, 8, 5, 68, 2, 6, 67, 66], [123, 62, 111, 127, 121, 51, 118, 60, 63, 115, 47, 125, 61, 56, 117, 124, 53, 59, 106, 57, 120, 54, 48, 114, 42, 107, 50, 126, 116, 119, 122, 38, 102, 103, 112, 49, 113, 39, 58, 52, 22, 86, 55, 43, 33, 93, 97, 110, 91, 100, 46, 108, 45, 44, 92, 88, 90, 105, 27, 36, 29, 99, 40, 24, 109, 104, 32, 26, 89, 35, 83, 25, 41, 28, 37, 85, 30, 87, 101, 21, 16, 19, 18, 84, 98, 82, 23, 80, 94, 20, 79, 12, 34, 77, 31, 17, 96, 15, 95, 70, 74, 76, 10, 81, 14, 78, 75, 13, 9, 72, 64, 65, 1, 69, 0, 11, 71, 3, 67, 73, 66, 2, 4, 5, 68, 7, 8, 6], [123, 62, 111, 127, 121, 51, 118, 63, 60, 115, 47, 125, 61, 56, 53, 54, 124, 117, 106, 120, 126, 116, 57, 119, 59, 107, 58, 122, 38, 48, 49, 42, 50, 114, 102, 103, 112, 52, 22, 55, 43, 113, 39, 86, 110, 33, 44, 46, 92, 97, 93, 91, 88, 100, 45, 90, 40, 36, 29, 27, 108, 26, 89, 24, 32, 109, 83, 99, 105, 28, 35, 82, 19, 41, 87, 25, 37, 16, 18, 85, 23, 21, 17, 104, 79, 30, 31, 101, 12, 80, 34, 96, 20, 76, 77, 84, 98, 94, 70, 74, 81, 95, 14, 10, 75, 13, 15, 9, 78, 1, 64, 71, 3, 72, 68, 0, 11, 65, 67, 66, 73, 7, 69, 4, 5, 2, 6, 8], [123, 111, 62, 127, 121, 51, 115, 118, 63, 60, 47, 125, 61, 56, 53, 106, 124, 54, 57, 126, 117, 116, 114, 48, 119, 59, 50, 122, 42, 107, 120, 58, 102, 38, 52, 112, 49, 103, 55, 22, 113, 86, 39, 110, 44, 43, 46, 33, 91, 93, 92, 97, 88, 100, 90, 108, 24, 99, 36, 45, 27, 29, 40, 83, 89, 105, 19, 87, 26, 25, 28, 35, 32, 104, 109, 82, 41, 18, 15, 77, 16, 17, 12, 21, 76, 80, 23, 85, 81, 79, 94, 101, 20, 84, 10, 78, 74, 96, 30, 70, 98, 13, 95, 31, 37, 9, 75, 34, 14, 64, 7, 11, 68, 65, 72, 71, 73, 1, 66, 69, 3, 0, 4, 67, 6, 2, 5, 8], [123, 62, 111, 127, 121, 51, 60, 115, 63, 118, 47, 125, 53, 61, 56, 124, 106, 117, 54, 126, 116, 57, 119, 107, 59, 120, 122, 58, 42, 50, 55, 102, 38, 22, 48, 114, 112, 86, 103, 39, 52, 49, 113, 43, 44, 110, 46, 91, 88, 93, 92, 97, 33, 90, 100, 29, 83, 108, 45, 36, 104, 19, 27, 26, 24, 99, 82, 16, 105, 87, 89, 80, 17, 18, 109, 28, 41, 40, 25, 35, 76, 79, 21, 13, 77, 84, 23, 32, 81, 30, 15, 20, 12, 74, 10, 14, 34, 101, 78, 75, 96, 9, 98, 85, 31, 94, 37, 11, 7, 70, 95, 71, 72, 6, 73, 0, 65, 3, 67, 1, 66, 4, 69, 5, 68, 64, 8, 2], [123, 111, 62, 127, 121, 51, 115, 60, 63, 118, 47, 125, 61, 124, 53, 56, 126, 54, 117, 106, 122, 59, 107, 120, 119, 57, 116, 58, 114, 49, 42, 102, 48, 103, 38, 50, 55, 22, 112, 86, 39, 52, 43, 113, 88, 44, 46, 93, 110, 91, 92, 90, 97, 100, 45, 19, 33, 83, 24, 108, 27, 99, 40, 89, 36, 104, 29, 105, 82, 18, 26, 21, 80, 17, 87, 109, 35, 85, 76, 28, 25, 23, 16, 79, 77, 12, 32, 84, 37, 15, 30, 41, 94, 20, 13, 81, 78, 96, 10, 98, 14, 31, 34, 74, 9, 101, 75, 95, 72, 7, 73, 11, 6, 70, 71, 69, 67, 0, 5, 65, 1, 66, 64, 3, 8, 68, 4, 2], [123, 62, 111, 127, 121, 51, 115, 60, 63, 47, 118, 125, 61, 124, 53, 56, 106, 54, 117, 120, 48, 126, 114, 107, 122, 59, 57, 119, 103, 116, 42, 38, 102, 58, 22, 49, 86, 50, 39, 55, 112, 52, 113, 43, 44, 91, 93, 110, 90, 97, 92, 100, 33, 88, 46, 27, 24, 29, 83, 19, 89, 36, 108, 35, 40, 104, 45, 26, 87, 16, 28, 32, 23, 82, 105, 99, 109, 80, 21, 25, 18, 84, 85, 41, 37, 76, 17, 31, 30, 15, 13, 12, 96, 20, 101, 77, 81, 14, 78, 94, 74, 98, 79, 10, 9, 34, 95, 75, 6, 72, 11, 73, 1, 0, 65, 64, 7, 3, 67, 69, 5, 2, 70, 4, 68, 71, 8, 66], [123, 62, 111, 127, 51, 121, 63, 60, 115, 118, 47, 125, 61, 53, 54, 117, 124, 56, 106, 126, 119, 59, 107, 120, 48, 57, 114, 102, 50, 42, 122, 38, 116, 113, 103, 58, 22, 49, 39, 112, 55, 52, 86, 43, 93, 90, 91, 110, 44, 92, 33, 97, 100, 46, 26, 88, 45, 27, 108, 29, 89, 83, 36, 82, 24, 19, 35, 99, 104, 25, 109, 28, 32, 40, 80, 87, 85, 41, 105, 21, 23, 16, 37, 76, 18, 84, 30, 17, 98, 31, 81, 78, 96, 14, 12, 13, 94, 79, 15, 20, 101, 74, 77, 10, 34, 95, 73, 75, 6, 67, 11, 9, 3, 65, 1, 69, 8, 64, 7, 72, 0, 71, 70, 5, 66, 4, 2, 68], [62, 123, 111, 127, 121, 51, 63, 115, 60, 118, 47, 125, 53, 61, 124, 117, 59, 106, 56, 54, 48, 126, 114, 119, 57, 107, 116, 120, 38, 58, 50, 102, 42, 103, 52, 22, 112, 49, 122, 86, 39, 55, 43, 113, 44, 33, 91, 46, 93, 97, 100, 92, 110, 90, 88, 108, 24, 83, 27, 45, 89, 19, 109, 29, 36, 26, 40, 105, 104, 28, 18, 25, 99, 82, 16, 35, 41, 32, 80, 76, 37, 13, 87, 96, 30, 31, 79, 12, 23, 17, 77, 20, 85, 81, 78, 84, 98, 15, 74, 94, 101, 21, 14, 95, 10, 34, 75, 9, 6, 11, 73, 67, 8, 7, 71, 1, 68, 5, 70, 64, 65, 0, 2, 3, 4, 69, 72, 66], [123, 62, 111, 127, 121, 51, 60, 115, 63, 118, 47, 125, 124, 53, 61, 117, 54, 120, 106, 126, 114, 116, 56, 57, 59, 119, 48, 107, 50, 122, 42, 38, 102, 49, 22, 52, 58, 39, 55, 113, 86, 103, 43, 112, 91, 33, 46, 97, 93, 100, 92, 44, 88, 90, 110, 24, 29, 27, 40, 45, 99, 89, 19, 26, 83, 82, 36, 105, 108, 25, 109, 17, 37, 32, 18, 80, 16, 35, 87, 77, 12, 30, 94, 76, 84, 28, 104, 79, 15, 31, 74, 41, 21, 13, 96, 81, 10, 6, 20, 23, 101, 73, 98, 78, 14, 95, 75, 34, 85, 71, 8, 11, 7, 3, 1, 9, 67, 68, 70, 4, 65, 64, 0, 5, 66, 69, 72, 2], [123, 62, 111, 127, 121, 51, 60, 115, 63, 118, 47, 125, 61, 53, 54, 117, 106, 57, 126, 124, 119, 120, 116, 114, 59, 56, 122, 49, 42, 107, 50, 48, 52, 102, 22, 103, 55, 38, 58, 86, 113, 112, 39, 43, 46, 91, 97, 110, 93, 33, 90, 88, 44, 92, 100, 24, 19, 45, 27, 83, 99, 29, 36, 108, 21, 26, 89, 82, 87, 35, 109, 40, 32, 76, 80, 15, 16, 105, 77, 17, 10, 41, 18, 79, 104, 28, 13, 81, 12, 25, 85, 31, 94, 37, 96, 84, 30, 74, 20, 23, 34, 14, 78, 101, 98, 65, 11, 8, 9, 75, 73, 71, 95, 6, 70, 0, 68, 69, 67, 7, 3, 5, 64, 66, 4, 1, 72, 2], [123, 62, 111, 127, 121, 51, 115, 60, 63, 118, 47, 125, 61, 124, 53, 117, 126, 114, 54, 59, 56, 106, 120, 107, 57, 119, 50, 48, 38, 102, 116, 42, 49, 22, 103, 55, 122, 112, 52, 43, 86, 58, 113, 39, 46, 91, 110, 93, 97, 44, 92, 90, 100, 45, 33, 27, 89, 24, 29, 40, 108, 88, 104, 99, 19, 26, 25, 36, 82, 41, 109, 83, 35, 32, 28, 87, 105, 80, 18, 85, 16, 37, 76, 96, 23, 20, 17, 30, 31, 21, 84, 15, 94, 13, 79, 77, 12, 78, 81, 98, 10, 34, 95, 101, 14, 11, 75, 74, 73, 8, 70, 71, 0, 3, 65, 9, 5, 67, 66, 69, 4, 68, 6, 1, 2, 64, 7, 72], [123, 62, 111, 127, 121, 63, 60, 51, 118, 115, 47, 125, 61, 124, 53, 117, 59, 106, 119, 126, 57, 107, 56, 54, 120, 114, 50, 122, 38, 58, 55, 116, 49, 48, 22, 52, 102, 42, 86, 112, 43, 39, 103, 113, 46, 110, 44, 91, 92, 33, 97, 100, 93, 108, 88, 90, 109, 24, 45, 82, 19, 29, 83, 36, 105, 89, 80, 27, 35, 18, 104, 26, 25, 16, 13, 28, 40, 99, 17, 41, 30, 76, 87, 84, 78, 94, 21, 32, 79, 37, 23, 12, 14, 20, 96, 10, 85, 81, 98, 95, 31, 11, 74, 75, 101, 70, 77, 73, 15, 71, 9, 7, 8, 34, 67, 64, 0, 68, 65, 3, 4, 1, 6, 69, 66, 5, 72, 2], [123, 62, 111, 127, 121, 60, 51, 115, 118, 63, 47, 125, 124, 53, 61, 106, 56, 117, 119, 57, 54, 114, 120, 126, 59, 107, 50, 116, 122, 55, 42, 48, 38, 49, 52, 58, 102, 22, 86, 112, 103, 43, 39, 113, 110, 46, 91, 44, 93, 97, 100, 33, 92, 108, 24, 27, 45, 109, 88, 90, 29, 19, 82, 83, 36, 26, 40, 99, 32, 25, 104, 28, 80, 35, 18, 21, 16, 41, 89, 105, 17, 76, 20, 31, 23, 13, 37, 87, 94, 84, 12, 30, 81, 78, 96, 15, 98, 85, 74, 77, 10, 79, 14, 95, 75, 70, 34, 101, 73, 11, 71, 8, 68, 7, 9, 65, 64, 4, 3, 1, 67, 0, 5, 69, 72, 6, 2, 66], [123, 62, 111, 127, 121, 51, 60, 115, 118, 63, 47, 125, 124, 61, 53, 117, 106, 114, 54, 59, 120, 126, 57, 56, 119, 107, 116, 122, 48, 55, 42, 58, 49, 50, 102, 38, 22, 52, 112, 113, 86, 39, 103, 43, 46, 44, 93, 100, 110, 97, 33, 91, 45, 92, 27, 90, 24, 88, 108, 36, 109, 83, 82, 40, 26, 29, 21, 99, 19, 41, 104, 89, 18, 32, 15, 25, 12, 16, 28, 87, 10, 30, 79, 17, 31, 35, 13, 81, 76, 105, 80, 37, 96, 23, 84, 85, 98, 94, 74, 34, 77, 73, 20, 14, 78, 11, 101, 8, 75, 95, 70, 65, 69, 7, 71, 9, 64, 6, 1, 4, 0, 67, 68, 3, 72, 2, 5, 66], [123, 62, 111, 127, 121, 115, 51, 60, 63, 118, 47, 125, 61, 124, 53, 59, 57, 117, 126, 119, 54, 120, 106, 56, 107, 122, 38, 114, 52, 55, 58, 48, 49, 116, 42, 22, 103, 102, 86, 50, 39, 112, 113, 46, 43, 44, 91, 100, 33, 97, 93, 110, 92, 27, 90, 88, 45, 109, 108, 99, 24, 29, 83, 89, 26, 36, 104, 40, 19, 35, 32, 87, 80, 25, 105, 41, 82, 84, 28, 16, 18, 79, 85, 81, 13, 23, 37, 21, 30, 12, 17, 34, 101, 98, 94, 31, 76, 20, 15, 77, 96, 10, 95, 78, 73, 11, 75, 14, 74, 6, 7, 9, 71, 0, 64, 65, 8, 70, 3, 5, 1, 69, 72, 67, 66, 2, 4, 68], [123, 62, 111, 127, 121, 51, 115, 118, 63, 60, 47, 125, 53, 61, 124, 119, 126, 106, 117, 56, 57, 48, 107, 120, 54, 59, 38, 50, 122, 114, 55, 49, 42, 116, 22, 102, 58, 103, 86, 52, 113, 112, 39, 43, 46, 44, 97, 91, 33, 93, 100, 90, 110, 92, 45, 27, 108, 36, 88, 29, 109, 24, 40, 25, 105, 104, 19, 83, 87, 82, 35, 32, 26, 16, 89, 28, 99, 80, 18, 41, 13, 84, 34, 21, 37, 20, 17, 96, 30, 12, 81, 79, 23, 85, 14, 94, 76, 15, 78, 98, 101, 77, 31, 10, 74, 11, 95, 73, 9, 67, 75, 72, 6, 71, 70, 69, 68, 64, 7, 1, 0, 3, 65, 2, 4, 66, 5, 8], [123, 111, 62, 127, 121, 115, 51, 118, 63, 60, 47, 125, 53, 61, 117, 124, 56, 106, 120, 57, 119, 54, 58, 107, 59, 38, 126, 42, 116, 55, 102, 122, 114, 103, 50, 48, 113, 49, 39, 22, 112, 43, 86, 52, 46, 100, 97, 91, 33, 44, 93, 108, 92, 110, 45, 88, 104, 24, 36, 90, 29, 109, 40, 99, 27, 83, 25, 32, 19, 89, 82, 35, 105, 87, 26, 37, 18, 84, 21, 16, 101, 41, 30, 28, 98, 80, 96, 34, 23, 85, 94, 20, 31, 15, 81, 14, 17, 79, 13, 95, 12, 10, 77, 76, 6, 78, 9, 11, 67, 74, 75, 73, 4, 65, 0, 72, 71, 64, 7, 1, 5, 68, 3, 69, 66, 2, 8, 70], [123, 62, 111, 127, 121, 51, 63, 115, 60, 118, 47, 125, 53, 61, 124, 117, 106, 120, 59, 107, 56, 119, 57, 126, 54, 48, 116, 58, 114, 122, 38, 50, 22, 102, 42, 49, 39, 55, 86, 103, 112, 113, 52, 43, 44, 46, 97, 33, 100, 92, 93, 91, 90, 45, 27, 24, 110, 88, 29, 36, 82, 104, 19, 89, 35, 99, 26, 109, 108, 32, 28, 40, 41, 25, 83, 16, 18, 84, 87, 80, 21, 96, 23, 20, 105, 12, 30, 37, 98, 31, 13, 76, 94, 77, 81, 15, 101, 34, 17, 11, 79, 10, 14, 85, 74, 6, 95, 78, 9, 73, 75, 72, 71, 7, 3, 1, 4, 69, 64, 65, 70, 67, 5, 0, 66, 8, 68, 2], [123, 62, 111, 127, 121, 51, 115, 63, 60, 118, 47, 125, 53, 124, 61, 54, 117, 106, 119, 59, 126, 120, 122, 114, 107, 57, 50, 55, 116, 56, 49, 48, 22, 58, 38, 42, 86, 43, 102, 39, 52, 113, 103, 112, 46, 93, 91, 44, 33, 90, 92, 97, 27, 100, 45, 24, 89, 36, 88, 110, 29, 26, 108, 82, 109, 19, 40, 32, 35, 83, 28, 41, 104, 84, 18, 99, 21, 80, 16, 25, 23, 87, 17, 20, 12, 37, 13, 30, 96, 31, 98, 81, 79, 34, 76, 10, 15, 105, 94, 101, 77, 74, 85, 14, 78, 73, 7, 11, 75, 9, 3, 95, 1, 72, 6, 71, 0, 4, 69, 64, 68, 67, 65, 5, 70, 8, 2, 66], [123, 62, 111, 127, 121, 51, 118, 60, 63, 115, 47, 125, 53, 124, 61, 120, 106, 126, 59, 119, 117, 116, 54, 56, 122, 107, 58, 49, 38, 57, 114, 48, 55, 42, 103, 102, 39, 50, 113, 43, 22, 112, 52, 86, 46, 91, 93, 44, 90, 100, 97, 110, 33, 92, 24, 45, 108, 27, 36, 88, 29, 26, 32, 109, 89, 19, 104, 35, 99, 41, 83, 82, 40, 18, 28, 87, 25, 23, 16, 30, 85, 84, 96, 80, 31, 10, 21, 37, 15, 105, 17, 101, 13, 12, 94, 78, 20, 76, 98, 34, 77, 81, 79, 95, 14, 73, 75, 72, 74, 3, 11, 9, 71, 70, 64, 65, 1, 7, 0, 6, 68, 66, 67, 5, 69, 4, 8, 2], [123, 62, 111, 127, 121, 51, 60, 63, 115, 118, 47, 125, 53, 124, 117, 54, 61, 106, 59, 114, 120, 107, 126, 122, 56, 57, 119, 116, 58, 48, 39, 103, 102, 55, 38, 22, 42, 49, 50, 112, 52, 86, 113, 43, 93, 97, 100, 110, 46, 33, 44, 90, 91, 92, 45, 27, 36, 24, 108, 88, 89, 104, 40, 29, 26, 109, 99, 32, 83, 82, 25, 87, 35, 19, 28, 84, 30, 21, 16, 41, 80, 18, 96, 23, 105, 37, 101, 31, 78, 85, 15, 12, 20, 13, 94, 76, 10, 34, 95, 81, 14, 98, 79, 17, 77, 73, 11, 70, 74, 9, 4, 1, 65, 71, 0, 67, 75, 72, 3, 64, 69, 7, 2, 68, 66, 6, 5, 8], [123, 62, 111, 127, 121, 51, 60, 63, 118, 115, 47, 125, 53, 61, 124, 59, 106, 54, 117, 56, 114, 107, 126, 119, 58, 116, 57, 48, 120, 122, 49, 38, 22, 42, 55, 103, 50, 102, 52, 39, 86, 43, 113, 112, 46, 93, 92, 90, 33, 97, 91, 100, 27, 88, 44, 24, 45, 110, 36, 29, 26, 82, 89, 109, 25, 99, 83, 28, 18, 41, 80, 108, 19, 87, 32, 20, 104, 105, 40, 35, 96, 31, 16, 30, 79, 37, 84, 81, 21, 94, 12, 73, 14, 17, 76, 85, 13, 23, 77, 10, 34, 15, 74, 78, 101, 11, 98, 70, 7, 95, 75, 71, 9, 0, 3, 67, 72, 69, 4, 65, 5, 1, 66, 6, 68, 8, 64, 2], [123, 62, 111, 127, 121, 51, 118, 60, 115, 63, 47, 125, 61, 53, 124, 57, 117, 56, 106, 120, 122, 54, 126, 119, 59, 107, 48, 50, 116, 58, 22, 49, 114, 42, 38, 86, 103, 102, 55, 52, 113, 39, 112, 43, 46, 33, 110, 45, 97, 91, 100, 93, 90, 44, 92, 108, 27, 109, 88, 24, 83, 19, 99, 26, 29, 40, 89, 36, 18, 25, 82, 35, 41, 104, 21, 30, 16, 28, 87, 23, 32, 84, 12, 31, 80, 105, 94, 77, 101, 20, 13, 37, 79, 85, 17, 15, 81, 14, 96, 98, 10, 74, 78, 76, 73, 34, 70, 95, 7, 9, 11, 75, 64, 71, 1, 0, 72, 4, 3, 65, 69, 67, 66, 68, 5, 8, 2, 6], [123, 62, 111, 127, 121, 118, 51, 115, 60, 63, 47, 125, 61, 53, 124, 117, 54, 106, 57, 59, 56, 107, 120, 126, 58, 119, 122, 48, 50, 116, 49, 112, 114, 103, 38, 102, 22, 55, 42, 86, 39, 113, 52, 43, 46, 93, 44, 97, 45, 100, 92, 33, 110, 91, 90, 24, 27, 108, 88, 40, 19, 29, 109, 89, 26, 25, 99, 36, 83, 82, 28, 35, 87, 18, 21, 84, 41, 16, 105, 30, 31, 104, 32, 80, 85, 94, 17, 96, 15, 12, 98, 81, 20, 101, 23, 77, 37, 13, 76, 10, 79, 34, 95, 78, 74, 14, 11, 9, 70, 75, 73, 7, 8, 68, 71, 3, 1, 65, 64, 0, 6, 72, 67, 5, 69, 2, 4, 66], [123, 62, 111, 127, 121, 51, 118, 115, 60, 63, 47, 125, 61, 53, 124, 120, 117, 59, 106, 54, 56, 57, 114, 107, 122, 50, 48, 116, 126, 38, 58, 49, 119, 102, 22, 42, 55, 103, 113, 86, 52, 112, 39, 43, 46, 110, 97, 91, 93, 45, 44, 33, 108, 100, 92, 99, 24, 88, 90, 19, 36, 27, 40, 26, 89, 29, 109, 35, 28, 105, 104, 18, 25, 30, 83, 80, 31, 82, 101, 87, 16, 23, 21, 20, 32, 37, 85, 84, 41, 12, 94, 13, 79, 17, 78, 10, 76, 34, 77, 96, 98, 81, 15, 95, 74, 9, 11, 14, 73, 75, 7, 70, 8, 67, 6, 71, 3, 1, 64, 0, 69, 5, 4, 68, 65, 66, 2, 72], [123, 62, 111, 127, 121, 51, 60, 115, 118, 63, 47, 125, 61, 53, 124, 106, 54, 117, 120, 56, 50, 114, 107, 59, 57, 119, 48, 126, 22, 122, 58, 116, 42, 49, 38, 86, 113, 39, 102, 103, 52, 112, 55, 43, 97, 33, 92, 44, 91, 93, 110, 90, 45, 88, 46, 100, 108, 27, 82, 19, 29, 26, 36, 24, 89, 83, 109, 40, 18, 99, 16, 25, 28, 87, 80, 20, 32, 104, 23, 35, 105, 13, 17, 21, 30, 94, 84, 41, 37, 85, 31, 12, 10, 81, 78, 76, 79, 9, 96, 101, 77, 15, 34, 95, 74, 98, 6, 14, 11, 67, 75, 8, 7, 73, 0, 65, 71, 1, 3, 4, 70, 5, 64, 68, 66, 2, 69, 72], [123, 111, 62, 127, 51, 121, 60, 115, 118, 63, 47, 125, 61, 124, 53, 56, 106, 54, 117, 48, 116, 57, 114, 50, 126, 119, 107, 42, 59, 120, 122, 58, 113, 38, 102, 22, 49, 86, 112, 52, 103, 39, 55, 43, 110, 33, 97, 91, 46, 100, 44, 93, 92, 88, 90, 27, 24, 36, 45, 29, 99, 40, 89, 108, 109, 32, 26, 104, 18, 41, 19, 83, 25, 82, 28, 21, 35, 80, 94, 96, 12, 15, 105, 23, 30, 101, 87, 17, 37, 81, 16, 13, 85, 20, 31, 84, 98, 76, 74, 77, 10, 95, 78, 34, 9, 14, 6, 79, 64, 73, 11, 65, 7, 75, 3, 4, 1, 67, 70, 71, 68, 5, 8, 66, 69, 2, 0, 72], [123, 111, 62, 127, 121, 51, 115, 60, 118, 63, 47, 125, 53, 61, 124, 106, 56, 48, 57, 120, 126, 117, 50, 119, 59, 54, 107, 114, 42, 116, 22, 122, 58, 103, 86, 38, 102, 49, 55, 39, 112, 113, 52, 43, 44, 33, 110, 93, 46, 100, 97, 92, 88, 91, 27, 90, 108, 36, 24, 19, 45, 29, 40, 89, 26, 109, 32, 82, 99, 83, 104, 18, 28, 87, 25, 16, 35, 85, 76, 105, 12, 15, 84, 80, 21, 30, 96, 41, 37, 79, 81, 77, 17, 98, 23, 10, 74, 13, 14, 20, 94, 101, 95, 73, 78, 9, 6, 31, 11, 34, 75, 8, 71, 1, 67, 4, 66, 64, 7, 65, 3, 68, 5, 0, 70, 69, 72, 2], [123, 62, 111, 127, 121, 51, 60, 115, 63, 118, 47, 125, 53, 61, 124, 56, 106, 117, 48, 57, 126, 120, 119, 59, 114, 54, 50, 107, 116, 122, 49, 42, 38, 22, 58, 112, 86, 103, 102, 113, 55, 52, 43, 39, 33, 100, 46, 110, 91, 97, 93, 108, 90, 44, 92, 27, 88, 109, 45, 36, 29, 40, 83, 82, 26, 89, 105, 19, 32, 104, 99, 18, 16, 24, 80, 87, 41, 35, 25, 94, 30, 28, 77, 84, 37, 21, 101, 17, 15, 74, 98, 81, 79, 12, 6, 76, 96, 85, 14, 20, 31, 23, 10, 75, 13, 78, 9, 71, 11, 95, 73, 34, 7, 8, 0, 3, 68, 67, 1, 65, 4, 70, 64, 2, 69, 66, 72, 5], [123, 62, 111, 127, 121, 60, 51, 115, 118, 63, 47, 125, 61, 53, 124, 120, 106, 57, 117, 48, 59, 56, 54, 126, 116, 107, 114, 122, 119, 22, 42, 38, 103, 86, 102, 39, 58, 50, 49, 112, 52, 55, 113, 100, 43, 46, 110, 33, 97, 91, 93, 92, 44, 88, 90, 108, 27, 45, 89, 19, 24, 29, 26, 99, 36, 83, 16, 25, 105, 32, 104, 28, 109, 18, 30, 80, 87, 84, 82, 35, 40, 76, 77, 20, 12, 81, 85, 79, 21, 17, 15, 37, 23, 41, 94, 10, 14, 74, 13, 73, 101, 96, 75, 78, 31, 34, 98, 11, 6, 9, 7, 8, 71, 95, 5, 70, 67, 3, 4, 1, 72, 69, 64, 65, 0, 68, 66, 2]], "model.layers.14.self_attn.q_proj": [[60, 120, 62, 37, 51, 63, 33, 53, 90, 101, 30, 118, 117, 87, 19, 86, 57, 125, 58, 85, 74, 93, 123, 111, 109, 115, 119, 59, 54, 92, 21, 116, 121, 12, 26, 50, 52, 114, 42, 44, 29, 126, 61, 91, 127, 122, 45, 55, 56, 48, 39, 105, 124, 6, 110, 15, 49, 4, 81, 43, 97, 112, 46, 36, 113, 25, 108, 20, 104, 94, 38, 47, 17, 64, 106, 88, 107, 71, 41, 18, 13, 103, 78, 23, 99, 34, 102, 22, 16, 2, 98, 40, 32, 65, 28, 10, 89, 72, 100, 95, 31, 1, 27, 35, 14, 79, 96, 68, 83, 24, 84, 11, 82, 66, 76, 80, 70, 69, 77, 75, 7, 8, 3, 0, 73, 67, 5, 9], [60, 120, 62, 37, 33, 90, 30, 118, 19, 101, 87, 117, 51, 53, 86, 59, 115, 57, 119, 92, 58, 111, 121, 126, 85, 54, 12, 15, 63, 116, 123, 26, 127, 48, 114, 56, 122, 74, 125, 46, 105, 71, 52, 43, 81, 38, 97, 110, 61, 42, 124, 83, 41, 55, 44, 45, 93, 50, 94, 39, 88, 69, 108, 112, 102, 40, 109, 103, 106, 113, 47, 20, 6, 29, 13, 49, 107, 1, 79, 28, 2, 99, 22, 17, 25, 104, 31, 95, 34, 14, 91, 23, 8, 66, 36, 21, 24, 89, 7, 72, 64, 32, 82, 100, 76, 16, 84, 98, 35, 96, 80, 75, 4, 18, 78, 11, 10, 70, 27, 65, 67, 9, 77, 5, 73, 3, 68, 0], [60, 120, 37, 62, 90, 118, 51, 101, 33, 19, 53, 87, 117, 30, 86, 42, 74, 4, 12, 59, 93, 111, 21, 114, 58, 39, 116, 63, 46, 92, 113, 54, 57, 126, 50, 121, 115, 26, 52, 105, 2, 49, 125, 15, 123, 25, 119, 127, 45, 48, 43, 122, 94, 91, 88, 61, 17, 110, 83, 103, 112, 56, 71, 55, 124, 85, 109, 38, 41, 81, 98, 47, 106, 104, 40, 14, 108, 102, 44, 6, 8, 107, 78, 29, 34, 99, 79, 76, 28, 23, 64, 72, 31, 69, 22, 89, 18, 65, 35, 66, 84, 36, 16, 20, 97, 1, 32, 68, 100, 24, 27, 77, 96, 95, 7, 82, 73, 80, 13, 11, 10, 75, 5, 70, 0, 67, 9, 3], [120, 60, 37, 62, 118, 90, 33, 117, 30, 51, 101, 53, 111, 59, 19, 87, 63, 86, 58, 57, 85, 105, 121, 48, 115, 92, 54, 42, 12, 126, 116, 55, 122, 119, 123, 46, 125, 52, 56, 26, 93, 74, 61, 127, 110, 108, 50, 112, 124, 114, 88, 41, 69, 109, 49, 113, 47, 91, 15, 82, 4, 21, 79, 44, 107, 43, 106, 81, 45, 39, 2, 25, 97, 104, 103, 94, 28, 38, 64, 71, 22, 36, 83, 78, 29, 31, 17, 6, 13, 40, 1, 100, 66, 99, 76, 102, 35, 34, 95, 32, 24, 72, 98, 84, 23, 96, 73, 18, 16, 89, 75, 68, 10, 65, 8, 27, 20, 0, 14, 11, 80, 70, 5, 67, 9, 77, 7, 3], [52, 102, 51, 124, 125, 29, 33, 60, 116, 84, 61, 123, 26, 86, 87, 88, 56, 119, 62, 93, 55, 38, 107, 120, 118, 106, 114, 115, 81, 39, 57, 122, 113, 109, 127, 37, 45, 94, 50, 53, 110, 58, 121, 28, 20, 54, 42, 63, 101, 105, 111, 41, 103, 112, 99, 35, 49, 108, 36, 40, 104, 97, 48, 22, 44, 43, 117, 34, 90, 59, 46, 47, 85, 25, 15, 100, 126, 98, 95, 73, 14, 32, 24, 21, 96, 82, 30, 31, 23, 92, 17, 18, 91, 11, 27, 71, 89, 80, 83, 74, 19, 16, 10, 7, 13, 5, 67, 78, 79, 9, 77, 8, 76, 12, 72, 75, 69, 66, 68, 65, 6, 64, 1, 4, 2, 70, 3, 0], [52, 102, 124, 33, 29, 125, 87, 60, 61, 116, 88, 123, 56, 38, 55, 119, 51, 73, 84, 45, 101, 62, 107, 57, 86, 28, 112, 106, 115, 39, 113, 110, 50, 53, 105, 127, 63, 67, 26, 118, 120, 42, 36, 54, 121, 114, 49, 58, 108, 43, 117, 109, 64, 47, 46, 126, 48, 40, 94, 122, 111, 44, 41, 59, 15, 93, 32, 103, 104, 97, 24, 22, 83, 99, 11, 100, 91, 20, 98, 90, 37, 35, 65, 81, 95, 34, 82, 31, 3, 25, 66, 96, 69, 27, 30, 23, 92, 71, 21, 89, 13, 5, 7, 9, 17, 79, 77, 19, 1, 6, 76, 18, 85, 10, 80, 78, 75, 14, 72, 74, 4, 0, 16, 12, 70, 68, 2, 8], [52, 102, 33, 124, 86, 29, 116, 88, 125, 61, 84, 82, 60, 73, 26, 87, 15, 56, 38, 66, 24, 93, 123, 51, 81, 14, 119, 90, 105, 55, 20, 62, 28, 19, 50, 110, 9, 112, 57, 113, 18, 10, 95, 13, 120, 43, 91, 54, 5, 106, 21, 45, 79, 99, 2, 22, 53, 63, 121, 49, 77, 108, 115, 85, 31, 101, 30, 118, 127, 114, 83, 4, 107, 23, 34, 78, 39, 75, 48, 17, 11, 25, 40, 117, 64, 98, 58, 27, 122, 12, 109, 0, 71, 44, 7, 72, 37, 94, 42, 70, 92, 111, 126, 16, 59, 103, 97, 69, 35, 46, 47, 1, 76, 89, 36, 41, 104, 80, 96, 32, 3, 100, 8, 6, 74, 68, 67, 65], [52, 102, 33, 29, 125, 51, 61, 26, 86, 124, 56, 116, 88, 82, 60, 84, 81, 87, 93, 15, 71, 123, 20, 119, 85, 11, 120, 55, 38, 73, 110, 113, 97, 22, 32, 62, 17, 75, 28, 90, 103, 14, 57, 24, 13, 50, 19, 76, 5, 54, 106, 109, 115, 127, 10, 112, 67, 25, 18, 42, 63, 118, 77, 48, 122, 45, 49, 79, 121, 107, 27, 58, 23, 3, 111, 108, 16, 31, 37, 104, 46, 89, 114, 43, 39, 105, 101, 53, 40, 36, 98, 94, 30, 117, 64, 41, 7, 126, 35, 21, 47, 83, 6, 80, 9, 91, 92, 1, 44, 100, 65, 99, 59, 34, 74, 70, 96, 95, 78, 12, 72, 69, 68, 4, 8, 66, 2, 0], [103, 97, 53, 117, 23, 2, 80, 11, 9, 85, 65, 0, 69, 81, 83, 67, 51, 14, 4, 3, 12, 71, 116, 70, 1, 7, 124, 61, 127, 112, 64, 87, 118, 60, 106, 6, 122, 73, 50, 5, 89, 113, 126, 66, 74, 13, 55, 114, 25, 68, 121, 54, 120, 24, 88, 8, 39, 63, 30, 91, 10, 49, 76, 15, 78, 28, 75, 17, 59, 108, 29, 45, 119, 104, 82, 58, 42, 79, 109, 31, 43, 52, 105, 110, 47, 77, 44, 46, 20, 19, 27, 123, 90, 72, 56, 34, 40, 57, 22, 125, 26, 16, 21, 98, 32, 102, 100, 99, 86, 37, 35, 107, 101, 84, 41, 62, 95, 36, 93, 18, 38, 48, 92, 33, 111, 94, 115, 96], [103, 97, 53, 117, 106, 81, 80, 87, 14, 12, 85, 23, 74, 4, 2, 112, 51, 83, 7, 11, 127, 116, 9, 124, 118, 69, 0, 54, 60, 61, 3, 25, 15, 24, 113, 50, 55, 70, 65, 126, 88, 122, 64, 30, 45, 121, 120, 114, 63, 6, 1, 76, 18, 75, 29, 72, 73, 5, 71, 68, 28, 89, 91, 39, 49, 26, 78, 58, 16, 8, 10, 67, 59, 44, 40, 19, 109, 110, 21, 52, 105, 17, 82, 98, 92, 66, 108, 42, 86, 46, 32, 31, 79, 27, 41, 13, 77, 36, 119, 95, 90, 125, 43, 84, 47, 100, 48, 34, 57, 107, 62, 102, 38, 22, 104, 99, 37, 101, 20, 35, 115, 93, 56, 94, 96, 111, 123, 33], [103, 97, 117, 53, 85, 87, 80, 51, 58, 14, 83, 12, 106, 11, 27, 16, 10, 23, 24, 5, 127, 124, 49, 81, 113, 104, 76, 43, 116, 118, 112, 45, 54, 71, 57, 91, 25, 88, 120, 86, 108, 122, 17, 109, 60, 50, 66, 47, 78, 55, 121, 110, 92, 63, 114, 52, 6, 21, 61, 44, 99, 125, 119, 62, 40, 105, 42, 123, 38, 37, 111, 89, 126, 56, 41, 19, 101, 102, 75, 98, 72, 34, 59, 9, 29, 115, 67, 100, 46, 68, 20, 22, 4, 30, 48, 36, 107, 26, 32, 1, 94, 95, 28, 82, 35, 18, 93, 90, 77, 96, 64, 31, 39, 2, 15, 73, 69, 79, 8, 84, 13, 7, 70, 74, 33, 65, 3, 0], [103, 97, 53, 117, 81, 11, 85, 14, 87, 106, 23, 69, 80, 12, 9, 2, 88, 60, 127, 7, 5, 124, 3, 51, 74, 112, 116, 0, 65, 70, 61, 4, 28, 126, 24, 118, 16, 29, 121, 83, 15, 122, 22, 45, 54, 44, 30, 120, 6, 25, 105, 114, 21, 79, 49, 71, 78, 50, 75, 67, 10, 17, 89, 63, 76, 55, 73, 64, 113, 40, 8, 1, 19, 98, 66, 86, 77, 58, 46, 109, 27, 82, 42, 108, 72, 13, 92, 104, 91, 34, 110, 84, 59, 62, 20, 39, 31, 125, 52, 18, 123, 90, 68, 41, 26, 32, 93, 94, 96, 107, 95, 48, 57, 99, 43, 119, 101, 38, 111, 37, 36, 35, 100, 47, 102, 56, 115, 33], [56, 102, 127, 24, 17, 93, 44, 60, 78, 11, 61, 33, 86, 88, 29, 116, 113, 108, 90, 50, 35, 114, 115, 47, 19, 59, 6, 62, 92, 123, 31, 40, 26, 72, 63, 119, 54, 20, 46, 120, 118, 122, 112, 126, 49, 104, 51, 111, 53, 124, 99, 121, 117, 23, 125, 37, 48, 55, 85, 57, 110, 36, 109, 42, 106, 32, 34, 43, 94, 45, 58, 30, 103, 39, 107, 52, 38, 8, 16, 105, 41, 100, 83, 89, 87, 96, 91, 95, 81, 98, 79, 28, 101, 82, 70, 25, 14, 27, 74, 21, 4, 10, 22, 3, 84, 77, 97, 75, 71, 76, 13, 18, 12, 80, 2, 15, 66, 69, 64, 67, 9, 7, 5, 0, 73, 68, 1, 65], [56, 102, 127, 24, 61, 113, 93, 60, 116, 114, 50, 108, 115, 63, 62, 59, 47, 126, 122, 88, 123, 119, 90, 38, 120, 112, 49, 53, 125, 46, 51, 121, 55, 124, 111, 117, 54, 42, 33, 48, 44, 39, 110, 57, 52, 58, 118, 109, 17, 103, 45, 30, 105, 43, 32, 81, 104, 40, 36, 78, 29, 75, 107, 106, 37, 84, 35, 41, 96, 34, 22, 92, 86, 19, 66, 94, 99, 101, 26, 100, 87, 2, 8, 80, 83, 98, 20, 11, 95, 91, 31, 85, 28, 97, 27, 79, 14, 73, 89, 15, 16, 6, 72, 3, 23, 64, 70, 77, 25, 21, 76, 5, 68, 82, 4, 13, 74, 9, 0, 18, 1, 67, 10, 12, 7, 65, 71, 69], [56, 108, 102, 127, 93, 9, 33, 24, 17, 61, 90, 94, 113, 60, 103, 1, 116, 88, 86, 39, 115, 65, 2, 50, 30, 114, 47, 59, 120, 62, 34, 23, 85, 123, 122, 126, 119, 69, 63, 87, 44, 55, 46, 73, 111, 104, 3, 0, 121, 18, 20, 64, 112, 31, 38, 54, 53, 49, 96, 68, 91, 51, 109, 98, 124, 36, 79, 95, 5, 48, 11, 117, 26, 37, 13, 57, 32, 77, 28, 35, 125, 42, 58, 52, 21, 7, 110, 118, 8, 27, 97, 10, 74, 4, 106, 105, 100, 16, 83, 72, 99, 66, 107, 15, 25, 40, 29, 19, 92, 45, 89, 82, 70, 71, 101, 84, 78, 6, 41, 67, 80, 22, 43, 76, 12, 81, 75, 14], [56, 102, 33, 108, 127, 93, 17, 24, 88, 113, 61, 90, 11, 86, 60, 82, 116, 78, 35, 59, 29, 20, 6, 47, 114, 50, 26, 44, 38, 123, 62, 122, 115, 120, 16, 106, 37, 21, 34, 63, 95, 119, 30, 126, 112, 87, 76, 121, 55, 49, 94, 51, 27, 53, 54, 28, 36, 83, 85, 80, 9, 75, 72, 89, 117, 124, 25, 110, 48, 91, 111, 125, 81, 79, 46, 42, 22, 104, 45, 103, 118, 105, 43, 57, 58, 96, 109, 98, 39, 52, 40, 99, 18, 32, 12, 15, 97, 23, 107, 31, 92, 84, 5, 7, 19, 100, 101, 74, 41, 77, 10, 70, 13, 73, 64, 14, 4, 3, 0, 67, 71, 2, 66, 69, 8, 68, 1, 65], [123, 39, 113, 69, 3, 64, 112, 1, 77, 71, 49, 63, 29, 13, 48, 111, 91, 61, 73, 98, 122, 54, 60, 119, 120, 4, 11, 72, 124, 88, 82, 24, 53, 115, 101, 56, 78, 22, 66, 62, 95, 17, 121, 59, 57, 81, 125, 55, 65, 83, 40, 118, 43, 127, 114, 93, 51, 8, 94, 70, 68, 46, 92, 15, 116, 106, 96, 36, 89, 47, 50, 35, 76, 2, 99, 52, 110, 97, 109, 102, 108, 44, 80, 45, 117, 104, 107, 126, 38, 41, 28, 9, 105, 58, 74, 37, 32, 26, 23, 33, 10, 100, 21, 42, 0, 25, 67, 12, 87, 19, 6, 31, 30, 90, 16, 7, 18, 20, 27, 84, 103, 5, 34, 75, 85, 79, 14, 86], [113, 123, 39, 13, 73, 71, 17, 83, 115, 74, 92, 78, 76, 59, 60, 55, 126, 63, 114, 57, 120, 127, 47, 116, 52, 62, 95, 119, 118, 117, 53, 121, 54, 69, 112, 82, 51, 56, 99, 46, 122, 109, 50, 124, 3, 58, 111, 48, 6, 100, 16, 125, 49, 110, 61, 108, 45, 88, 11, 44, 75, 106, 37, 107, 42, 43, 31, 86, 28, 72, 21, 98, 94, 15, 84, 41, 105, 67, 104, 93, 5, 36, 68, 85, 65, 38, 102, 91, 101, 40, 35, 97, 30, 25, 1, 64, 90, 22, 103, 33, 96, 87, 20, 32, 23, 29, 34, 2, 80, 8, 27, 89, 18, 81, 24, 14, 4, 26, 0, 77, 12, 19, 79, 10, 70, 66, 7, 9], [123, 39, 113, 112, 49, 32, 54, 95, 60, 99, 63, 115, 120, 90, 83, 124, 122, 48, 43, 13, 121, 94, 87, 125, 119, 61, 98, 56, 53, 111, 57, 85, 40, 127, 17, 38, 59, 62, 118, 36, 55, 114, 102, 105, 37, 110, 97, 92, 101, 81, 26, 47, 104, 109, 28, 35, 51, 45, 46, 96, 108, 116, 107, 50, 30, 41, 100, 106, 74, 52, 117, 126, 31, 44, 23, 42, 58, 82, 93, 33, 73, 25, 91, 29, 22, 84, 6, 21, 78, 18, 79, 86, 89, 34, 88, 11, 27, 24, 14, 15, 19, 76, 16, 75, 103, 20, 69, 10, 71, 8, 80, 72, 66, 77, 7, 3, 4, 70, 68, 12, 67, 64, 2, 1, 65, 9, 0, 5], [113, 123, 39, 119, 52, 120, 63, 126, 121, 118, 53, 57, 115, 106, 114, 47, 56, 13, 59, 116, 55, 51, 60, 58, 122, 127, 117, 111, 46, 50, 54, 92, 48, 109, 110, 62, 45, 43, 124, 108, 107, 61, 44, 74, 42, 112, 105, 125, 73, 49, 69, 101, 82, 41, 76, 64, 83, 99, 21, 17, 102, 104, 78, 5, 38, 71, 91, 1, 6, 40, 3, 66, 70, 72, 37, 100, 90, 30, 97, 8, 25, 81, 36, 15, 4, 89, 93, 26, 75, 35, 16, 96, 80, 19, 2, 11, 88, 33, 68, 14, 9, 84, 79, 98, 27, 87, 86, 32, 31, 67, 24, 18, 103, 10, 95, 20, 94, 12, 28, 65, 29, 85, 34, 23, 22, 7, 77, 0], [62, 63, 39, 53, 127, 32, 57, 36, 54, 121, 107, 47, 116, 46, 115, 59, 52, 55, 50, 51, 41, 105, 58, 61, 123, 96, 60, 125, 91, 27, 113, 122, 120, 106, 48, 124, 44, 103, 117, 126, 114, 98, 49, 119, 56, 118, 110, 45, 85, 43, 35, 38, 89, 112, 109, 101, 104, 40, 111, 108, 81, 37, 25, 42, 99, 22, 100, 34, 102, 92, 94, 87, 19, 33, 31, 23, 95, 0, 93, 21, 86, 26, 20, 97, 78, 17, 10, 80, 12, 29, 28, 83, 90, 66, 74, 65, 77, 82, 15, 30, 64, 24, 71, 72, 18, 69, 9, 84, 67, 76, 6, 1, 3, 68, 13, 11, 5, 4, 8, 14, 16, 88, 2, 75, 79, 70, 7, 73], [39, 63, 62, 24, 84, 94, 33, 18, 53, 86, 26, 88, 92, 80, 32, 78, 90, 127, 14, 29, 75, 73, 23, 30, 16, 121, 67, 17, 57, 82, 70, 54, 9, 95, 107, 1, 98, 116, 58, 47, 115, 15, 50, 27, 59, 52, 21, 22, 74, 60, 20, 55, 46, 83, 11, 51, 93, 8, 13, 85, 61, 31, 34, 125, 96, 123, 124, 91, 28, 79, 117, 113, 120, 4, 64, 77, 41, 76, 81, 5, 89, 25, 6, 122, 42, 72, 48, 12, 2, 126, 36, 35, 65, 118, 49, 102, 38, 87, 110, 101, 103, 114, 56, 19, 71, 119, 106, 108, 45, 37, 7, 97, 104, 100, 109, 111, 44, 112, 43, 105, 99, 3, 10, 40, 69, 68, 66, 0], [63, 62, 39, 53, 127, 57, 32, 54, 116, 121, 52, 59, 55, 47, 50, 115, 125, 61, 46, 51, 58, 60, 96, 123, 120, 122, 90, 113, 91, 86, 117, 124, 41, 48, 126, 34, 119, 118, 56, 106, 114, 92, 27, 109, 107, 101, 49, 110, 26, 35, 33, 89, 45, 21, 111, 102, 105, 108, 100, 36, 85, 112, 93, 44, 42, 43, 104, 99, 40, 98, 22, 29, 81, 88, 37, 10, 94, 64, 38, 103, 95, 16, 66, 65, 31, 19, 0, 97, 14, 25, 30, 67, 83, 72, 17, 24, 76, 87, 20, 6, 23, 74, 28, 78, 68, 69, 9, 12, 8, 15, 11, 84, 1, 5, 7, 79, 77, 82, 80, 4, 71, 18, 70, 73, 2, 3, 13, 75], [39, 62, 63, 94, 84, 24, 88, 53, 86, 21, 32, 74, 14, 10, 33, 26, 83, 90, 18, 127, 8, 92, 20, 69, 30, 23, 36, 57, 16, 7, 76, 66, 68, 121, 67, 54, 96, 27, 116, 89, 73, 87, 13, 52, 47, 82, 55, 28, 72, 115, 59, 17, 50, 6, 85, 77, 60, 70, 58, 34, 22, 80, 46, 95, 5, 81, 65, 61, 107, 125, 29, 51, 98, 49, 123, 93, 91, 9, 117, 124, 101, 108, 19, 120, 79, 25, 11, 113, 78, 15, 31, 38, 35, 64, 12, 103, 105, 99, 118, 41, 110, 106, 48, 122, 56, 100, 45, 97, 126, 37, 119, 104, 114, 40, 71, 43, 109, 4, 44, 102, 42, 111, 112, 1, 75, 0, 3, 2], [113, 48, 101, 112, 26, 94, 87, 18, 123, 20, 49, 86, 122, 59, 120, 79, 57, 62, 124, 58, 55, 60, 54, 115, 76, 43, 125, 37, 53, 119, 6, 39, 45, 51, 117, 126, 52, 114, 63, 61, 56, 50, 118, 106, 30, 75, 98, 34, 88, 32, 104, 107, 111, 127, 47, 110, 41, 116, 121, 92, 82, 108, 33, 44, 29, 102, 40, 42, 15, 103, 89, 90, 96, 105, 109, 99, 46, 23, 36, 35, 24, 100, 17, 22, 13, 31, 19, 97, 28, 70, 25, 91, 16, 95, 85, 27, 84, 11, 78, 21, 38, 83, 93, 12, 81, 14, 77, 73, 67, 8, 74, 9, 72, 71, 80, 3, 10, 7, 5, 4, 65, 68, 69, 66, 1, 2, 0, 64], [48, 113, 101, 112, 26, 122, 18, 94, 59, 87, 20, 43, 123, 57, 76, 79, 30, 63, 92, 98, 49, 53, 61, 124, 51, 88, 120, 86, 58, 117, 119, 39, 60, 121, 55, 125, 62, 115, 6, 50, 52, 89, 54, 56, 126, 96, 107, 114, 45, 102, 110, 29, 118, 47, 82, 35, 127, 34, 40, 46, 106, 24, 111, 32, 42, 100, 116, 108, 105, 37, 109, 44, 75, 90, 103, 104, 93, 36, 97, 25, 41, 31, 13, 27, 22, 28, 16, 95, 99, 38, 19, 91, 83, 12, 23, 85, 81, 33, 15, 17, 84, 21, 14, 11, 9, 78, 70, 8, 72, 74, 73, 77, 80, 67, 3, 71, 10, 7, 5, 65, 68, 4, 69, 1, 66, 2, 0, 64], [48, 113, 74, 16, 101, 7, 13, 68, 64, 69, 4, 112, 2, 122, 0, 87, 5, 1, 66, 65, 94, 67, 86, 3, 20, 58, 71, 53, 54, 98, 125, 55, 107, 57, 70, 6, 60, 49, 56, 120, 119, 52, 114, 121, 32, 26, 18, 62, 117, 124, 50, 118, 10, 11, 75, 110, 127, 30, 63, 47, 89, 51, 45, 82, 105, 104, 115, 61, 80, 106, 8, 103, 28, 76, 44, 77, 73, 96, 72, 108, 12, 126, 111, 79, 59, 42, 9, 109, 123, 24, 90, 23, 46, 85, 84, 93, 81, 37, 15, 14, 22, 78, 29, 19, 116, 17, 40, 34, 102, 83, 27, 35, 43, 21, 91, 33, 88, 92, 31, 25, 97, 95, 41, 38, 39, 100, 99, 36], [48, 113, 101, 122, 94, 112, 87, 26, 18, 59, 20, 123, 86, 49, 43, 79, 98, 57, 19, 120, 124, 30, 89, 32, 8, 115, 51, 76, 107, 58, 39, 21, 97, 82, 61, 63, 60, 62, 88, 29, 13, 52, 42, 6, 125, 34, 53, 117, 56, 55, 37, 119, 111, 96, 54, 28, 106, 44, 100, 114, 16, 75, 108, 104, 102, 85, 118, 126, 50, 105, 17, 45, 36, 27, 103, 116, 127, 31, 40, 91, 110, 92, 25, 121, 35, 47, 41, 22, 23, 109, 38, 33, 83, 46, 90, 95, 99, 74, 84, 24, 93, 12, 14, 78, 81, 73, 15, 11, 9, 72, 70, 3, 67, 77, 7, 80, 71, 10, 69, 65, 4, 5, 68, 1, 66, 2, 0, 64], [56, 113, 63, 102, 117, 49, 124, 68, 53, 76, 59, 85, 55, 5, 114, 60, 116, 58, 94, 121, 61, 89, 16, 7, 2, 66, 72, 50, 62, 8, 9, 12, 111, 51, 80, 115, 123, 118, 6, 126, 122, 48, 77, 54, 120, 52, 101, 40, 125, 69, 87, 91, 46, 119, 21, 27, 1, 3, 4, 29, 30, 96, 127, 42, 112, 83, 105, 92, 43, 44, 57, 35, 39, 36, 75, 22, 45, 109, 17, 24, 107, 110, 14, 104, 108, 88, 103, 47, 18, 79, 25, 23, 78, 41, 64, 15, 28, 97, 37, 93, 106, 20, 99, 98, 95, 81, 19, 26, 33, 82, 90, 100, 10, 38, 70, 31, 11, 84, 34, 73, 32, 13, 67, 65, 74, 86, 71, 0], [113, 56, 63, 124, 53, 59, 117, 55, 9, 49, 40, 116, 94, 58, 81, 69, 121, 61, 77, 62, 50, 114, 48, 118, 54, 60, 64, 111, 125, 51, 115, 123, 122, 126, 2, 120, 1, 24, 43, 57, 127, 68, 8, 52, 12, 39, 46, 119, 21, 70, 101, 112, 3, 45, 105, 107, 104, 109, 110, 47, 106, 44, 102, 108, 96, 30, 79, 6, 42, 92, 85, 72, 41, 103, 26, 36, 33, 7, 37, 15, 98, 100, 66, 16, 35, 25, 65, 83, 74, 97, 34, 93, 73, 95, 99, 90, 4, 17, 31, 28, 88, 91, 75, 38, 89, 67, 19, 23, 10, 87, 0, 5, 29, 13, 20, 32, 71, 78, 84, 27, 18, 14, 11, 82, 80, 76, 86, 22], [113, 56, 63, 94, 124, 49, 102, 117, 24, 96, 53, 38, 59, 104, 19, 55, 92, 43, 40, 58, 26, 91, 89, 35, 9, 116, 48, 114, 118, 37, 121, 22, 62, 51, 61, 50, 15, 60, 30, 122, 97, 126, 87, 46, 111, 54, 69, 29, 123, 52, 83, 120, 1, 85, 33, 125, 127, 17, 36, 77, 98, 12, 64, 81, 68, 119, 115, 3, 73, 105, 57, 75, 90, 7, 112, 86, 45, 79, 39, 109, 2, 8, 108, 44, 99, 6, 110, 34, 95, 25, 28, 76, 100, 10, 106, 27, 16, 93, 101, 18, 82, 78, 14, 47, 42, 41, 103, 88, 70, 107, 23, 31, 13, 21, 72, 20, 4, 32, 84, 80, 11, 74, 66, 67, 71, 0, 65, 5], [113, 56, 63, 124, 49, 117, 53, 40, 59, 114, 55, 102, 58, 116, 48, 24, 50, 94, 26, 121, 51, 54, 96, 60, 62, 118, 61, 87, 89, 126, 35, 43, 120, 77, 122, 123, 81, 111, 125, 57, 109, 52, 105, 9, 46, 17, 115, 104, 29, 119, 127, 92, 97, 30, 112, 99, 44, 22, 70, 45, 91, 110, 1, 37, 42, 108, 107, 103, 100, 101, 47, 38, 36, 15, 39, 6, 41, 27, 85, 106, 3, 20, 31, 90, 98, 33, 34, 88, 14, 73, 28, 23, 95, 19, 72, 64, 68, 12, 93, 79, 18, 13, 16, 86, 32, 82, 21, 25, 8, 69, 83, 78, 74, 11, 84, 80, 76, 10, 2, 75, 7, 65, 66, 71, 5, 0, 4, 67]], "model.layers.14.self_attn.k_proj": [[60, 101, 97, 62, 86, 117, 120, 94, 51, 59, 53, 90, 118, 58, 126, 57, 48, 111, 123, 121, 115, 54, 116, 87, 61, 81, 114, 125, 119, 56, 52, 127, 124, 55, 107, 19, 110, 122, 105, 50, 63, 106, 112, 89, 37, 40, 46, 44, 109, 45, 47, 43, 49, 102, 88, 108, 113, 12, 103, 21, 92, 104, 78, 15, 25, 42, 41, 32, 38, 39, 16, 93, 1, 34, 74, 67, 100, 27, 36, 71, 70, 31, 95, 13, 91, 98, 35, 68, 0, 20, 83, 73, 24, 33, 99, 96, 14, 79, 84, 72, 22, 28, 29, 11, 85, 80, 26, 77, 30, 66, 18, 8, 23, 17, 76, 82, 10, 9, 75, 5, 69, 2, 7, 65, 64, 3, 6, 4], [52, 38, 97, 86, 60, 93, 124, 56, 119, 123, 125, 61, 50, 57, 26, 113, 110, 62, 121, 127, 55, 81, 58, 116, 63, 109, 112, 53, 45, 115, 117, 108, 42, 49, 15, 46, 54, 106, 48, 111, 122, 118, 120, 88, 47, 84, 33, 114, 126, 82, 107, 103, 43, 39, 59, 22, 40, 102, 41, 87, 37, 105, 85, 36, 44, 75, 83, 100, 96, 77, 35, 13, 101, 104, 9, 99, 0, 98, 31, 16, 27, 30, 92, 34, 25, 32, 95, 51, 80, 17, 89, 28, 94, 10, 76, 69, 65, 29, 19, 14, 21, 73, 70, 91, 2, 24, 78, 72, 12, 71, 7, 4, 18, 90, 67, 68, 23, 8, 20, 11, 79, 5, 3, 74, 1, 66, 6, 64], [53, 39, 33, 87, 117, 14, 81, 80, 7, 9, 12, 74, 0, 42, 11, 65, 85, 2, 3, 60, 4, 48, 69, 70, 114, 83, 116, 61, 51, 64, 127, 54, 66, 67, 122, 79, 126, 113, 55, 118, 63, 124, 52, 5, 72, 120, 68, 88, 91, 109, 89, 25, 115, 112, 121, 108, 6, 45, 103, 59, 40, 73, 71, 22, 49, 1, 41, 34, 105, 77, 104, 23, 26, 10, 21, 43, 106, 44, 47, 96, 13, 50, 30, 92, 16, 110, 119, 18, 93, 123, 28, 76, 20, 58, 125, 94, 75, 8, 78, 29, 102, 36, 24, 46, 101, 90, 111, 56, 97, 57, 107, 31, 95, 35, 99, 19, 37, 27, 98, 38, 84, 100, 62, 82, 17, 86, 32, 15], [56, 38, 127, 86, 97, 44, 60, 61, 119, 114, 50, 113, 122, 116, 123, 62, 110, 63, 29, 115, 126, 47, 112, 59, 120, 51, 90, 55, 121, 39, 53, 49, 57, 124, 24, 117, 58, 45, 30, 78, 48, 118, 54, 125, 6, 109, 111, 17, 11, 1, 103, 52, 46, 107, 101, 42, 0, 41, 102, 105, 106, 28, 23, 108, 89, 20, 40, 104, 16, 43, 33, 35, 100, 98, 76, 25, 96, 82, 68, 15, 84, 85, 99, 32, 77, 37, 18, 74, 95, 36, 2, 34, 91, 19, 7, 64, 21, 22, 12, 31, 80, 93, 27, 3, 92, 87, 10, 67, 13, 79, 72, 94, 9, 26, 71, 69, 5, 73, 8, 66, 81, 14, 75, 4, 65, 83, 70, 88], [123, 113, 103, 22, 34, 60, 122, 54, 124, 61, 53, 125, 111, 56, 57, 49, 62, 119, 121, 55, 18, 47, 63, 114, 127, 115, 51, 40, 59, 96, 112, 93, 117, 116, 52, 126, 120, 46, 58, 118, 50, 44, 48, 109, 45, 107, 42, 104, 27, 98, 110, 43, 75, 41, 108, 106, 14, 65, 38, 100, 7, 35, 105, 92, 68, 84, 102, 37, 101, 0, 26, 67, 80, 36, 15, 9, 32, 31, 5, 33, 24, 30, 8, 87, 97, 88, 77, 94, 99, 25, 95, 6, 79, 20, 10, 39, 28, 81, 72, 91, 23, 76, 83, 70, 2, 90, 89, 12, 29, 66, 78, 21, 82, 17, 16, 11, 85, 74, 64, 19, 71, 4, 69, 86, 3, 1, 73, 13], [63, 62, 103, 86, 127, 53, 57, 121, 54, 116, 55, 30, 59, 50, 52, 58, 115, 125, 123, 97, 120, 61, 92, 60, 46, 84, 18, 51, 113, 26, 117, 124, 96, 118, 47, 126, 48, 88, 49, 24, 110, 114, 122, 100, 56, 119, 80, 15, 108, 106, 91, 111, 45, 107, 42, 14, 112, 109, 101, 38, 41, 43, 102, 40, 33, 75, 99, 44, 9, 105, 37, 31, 77, 83, 104, 93, 35, 94, 21, 17, 16, 25, 13, 98, 85, 36, 72, 82, 23, 34, 11, 19, 79, 29, 6, 66, 78, 32, 68, 0, 12, 95, 69, 22, 27, 81, 89, 28, 10, 87, 71, 90, 20, 39, 74, 76, 65, 3, 7, 73, 70, 67, 8, 1, 2, 4, 5, 64], [112, 64, 37, 113, 48, 49, 1, 69, 16, 74, 30, 68, 86, 87, 7, 123, 13, 2, 53, 20, 122, 34, 43, 58, 57, 96, 121, 118, 59, 120, 66, 125, 62, 56, 119, 54, 60, 75, 3, 26, 18, 55, 4, 52, 6, 115, 50, 114, 124, 111, 117, 126, 63, 79, 61, 0, 51, 47, 76, 127, 67, 110, 103, 116, 41, 109, 46, 106, 108, 93, 45, 104, 89, 38, 44, 36, 105, 8, 29, 35, 39, 42, 100, 71, 81, 102, 28, 15, 33, 21, 31, 90, 40, 27, 107, 25, 94, 19, 78, 24, 84, 99, 92, 88, 70, 73, 97, 85, 22, 72, 80, 5, 9, 91, 98, 95, 14, 77, 17, 32, 23, 83, 65, 12, 10, 82, 101, 11], [56, 113, 63, 38, 99, 124, 117, 59, 53, 22, 58, 116, 114, 55, 62, 122, 60, 32, 61, 50, 126, 51, 123, 111, 48, 121, 54, 127, 115, 118, 52, 46, 120, 125, 119, 57, 108, 112, 49, 45, 110, 109, 27, 40, 47, 35, 43, 104, 44, 89, 107, 97, 41, 42, 101, 106, 103, 105, 39, 92, 30, 98, 93, 33, 36, 87, 100, 102, 37, 19, 96, 18, 31, 13, 95, 34, 24, 17, 26, 91, 86, 20, 15, 29, 94, 28, 88, 85, 65, 2, 23, 0, 70, 80, 90, 7, 83, 10, 78, 74, 82, 25, 84, 14, 73, 8, 11, 3, 67, 81, 79, 75, 68, 16, 69, 21, 5, 71, 76, 4, 9, 12, 77, 72, 66, 64, 1, 6]], "model.layers.14.self_attn.qk_proj": [[56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 116, 58, 120, 121, 124, 122, 57, 125, 127, 61, 103, 115, 55, 51, 59, 22, 97, 50, 119, 114, 38, 102, 49, 54, 39, 37, 118, 23, 126, 101, 47, 87, 111, 94, 86, 108, 16, 80, 33, 46, 0, 42, 110, 17, 26, 64, 109, 43, 90, 24, 106, 45, 88, 81, 93, 78, 14, 44, 107, 10, 69, 105, 29, 7, 5, 83, 74, 71, 12, 84, 11, 30, 40, 34, 19, 13, 4, 77, 2, 85, 9, 96, 73, 20, 21, 1, 66, 76, 41, 75, 67, 68, 28, 104, 99, 15, 65, 18, 82, 3, 36, 89, 35, 79, 92, 25, 98, 32, 70, 6, 100, 8, 31, 91, 95, 27, 72], [113, 56, 53, 52, 60, 123, 63, 62, 48, 112, 117, 116, 58, 124, 120, 127, 61, 125, 121, 122, 55, 103, 57, 115, 51, 59, 97, 102, 39, 22, 49, 38, 114, 54, 50, 23, 118, 37, 101, 119, 126, 94, 111, 47, 87, 109, 33, 86, 16, 64, 108, 43, 80, 0, 24, 17, 106, 90, 42, 46, 110, 45, 10, 26, 88, 14, 107, 78, 81, 93, 44, 29, 74, 4, 68, 71, 11, 85, 5, 105, 83, 69, 12, 9, 99, 75, 41, 34, 96, 19, 7, 21, 28, 77, 84, 40, 104, 65, 13, 20, 18, 1, 30, 76, 6, 92, 2, 73, 66, 67, 32, 36, 79, 25, 82, 70, 15, 98, 100, 35, 31, 89, 91, 8, 3, 27, 95, 72], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 116, 58, 127, 124, 61, 120, 122, 103, 125, 121, 59, 115, 57, 39, 54, 55, 97, 51, 49, 38, 50, 102, 114, 118, 47, 101, 119, 126, 37, 111, 46, 23, 22, 87, 94, 43, 108, 86, 0, 33, 64, 16, 110, 106, 109, 107, 80, 45, 88, 44, 90, 24, 42, 26, 30, 17, 93, 14, 29, 71, 10, 41, 105, 81, 78, 85, 69, 83, 34, 5, 99, 96, 12, 74, 7, 11, 4, 40, 65, 19, 68, 13, 66, 28, 104, 1, 9, 21, 75, 77, 76, 20, 73, 6, 84, 2, 35, 98, 32, 3, 25, 18, 67, 36, 82, 15, 92, 89, 70, 91, 31, 27, 79, 72, 100, 95, 8], [56, 113, 53, 52, 60, 123, 63, 62, 48, 112, 117, 116, 58, 125, 121, 122, 124, 61, 120, 127, 57, 49, 103, 114, 55, 39, 54, 97, 115, 59, 50, 51, 102, 111, 119, 101, 22, 38, 126, 47, 118, 23, 37, 94, 110, 46, 87, 108, 43, 0, 64, 16, 86, 80, 45, 109, 33, 90, 106, 24, 107, 17, 44, 93, 26, 42, 78, 88, 29, 81, 41, 69, 74, 2, 19, 7, 10, 104, 14, 105, 71, 30, 85, 96, 83, 13, 34, 12, 66, 76, 84, 77, 40, 65, 5, 11, 1, 9, 73, 4, 28, 3, 68, 20, 99, 75, 92, 36, 6, 32, 21, 98, 67, 18, 25, 100, 89, 15, 35, 82, 79, 70, 72, 31, 91, 95, 27, 8], [56, 113, 53, 52, 60, 123, 63, 62, 48, 112, 117, 122, 58, 121, 125, 116, 57, 61, 120, 55, 124, 103, 49, 127, 115, 114, 39, 59, 97, 118, 38, 51, 50, 22, 46, 102, 101, 47, 23, 54, 126, 94, 37, 108, 111, 110, 86, 0, 16, 119, 87, 80, 64, 33, 43, 109, 90, 106, 14, 81, 42, 44, 107, 24, 17, 26, 78, 71, 10, 30, 93, 2, 45, 66, 83, 11, 74, 69, 19, 88, 29, 105, 5, 4, 104, 68, 84, 13, 9, 12, 76, 41, 7, 40, 96, 73, 21, 1, 34, 75, 18, 85, 20, 77, 28, 65, 3, 79, 92, 82, 32, 89, 15, 99, 98, 6, 67, 35, 100, 70, 36, 25, 27, 91, 31, 72, 95, 8], [56, 113, 53, 52, 123, 60, 63, 62, 48, 112, 117, 116, 121, 55, 127, 125, 58, 124, 120, 122, 103, 57, 61, 59, 49, 115, 114, 97, 22, 118, 39, 102, 51, 38, 54, 50, 119, 23, 37, 101, 109, 94, 87, 108, 126, 110, 111, 86, 47, 16, 80, 64, 17, 33, 14, 43, 90, 26, 42, 81, 0, 44, 24, 78, 46, 45, 106, 29, 11, 4, 88, 93, 10, 9, 83, 12, 74, 76, 84, 30, 75, 41, 85, 19, 7, 71, 105, 96, 18, 68, 107, 69, 66, 20, 5, 13, 73, 77, 104, 21, 32, 82, 79, 65, 2, 15, 3, 40, 34, 28, 89, 98, 92, 1, 70, 99, 36, 67, 6, 31, 25, 35, 100, 72, 91, 27, 95, 8], [56, 113, 53, 123, 52, 60, 63, 62, 48, 112, 117, 116, 121, 120, 127, 55, 58, 115, 125, 61, 49, 122, 114, 59, 57, 103, 124, 22, 118, 97, 102, 23, 119, 38, 51, 39, 37, 101, 94, 50, 54, 87, 126, 86, 109, 111, 16, 47, 110, 80, 17, 26, 43, 108, 33, 90, 14, 46, 78, 81, 42, 24, 64, 45, 106, 85, 29, 0, 88, 44, 30, 11, 10, 105, 74, 104, 9, 75, 76, 93, 84, 71, 107, 20, 73, 7, 69, 83, 12, 15, 41, 5, 77, 96, 21, 19, 28, 40, 13, 82, 18, 32, 2, 66, 34, 67, 79, 4, 99, 36, 6, 98, 92, 68, 65, 89, 1, 70, 25, 3, 35, 31, 100, 27, 72, 91, 95, 8], [56, 113, 53, 52, 123, 60, 63, 62, 48, 112, 117, 120, 58, 121, 116, 59, 115, 55, 114, 124, 57, 122, 127, 103, 49, 97, 125, 38, 51, 22, 102, 61, 119, 23, 118, 54, 37, 101, 39, 126, 94, 87, 47, 50, 86, 46, 111, 109, 110, 16, 43, 108, 33, 26, 0, 64, 90, 80, 88, 78, 45, 14, 81, 17, 85, 107, 42, 29, 24, 30, 44, 84, 106, 75, 10, 96, 83, 11, 105, 93, 21, 74, 20, 5, 34, 71, 104, 65, 76, 9, 77, 7, 15, 41, 13, 18, 82, 4, 69, 1, 68, 12, 73, 2, 66, 36, 70, 19, 32, 79, 35, 28, 92, 98, 99, 89, 25, 3, 6, 91, 40, 67, 31, 100, 72, 27, 95, 8], [56, 113, 53, 52, 123, 60, 63, 62, 48, 112, 117, 116, 120, 125, 58, 127, 121, 122, 49, 124, 55, 115, 57, 103, 97, 38, 54, 61, 51, 102, 114, 50, 101, 59, 39, 118, 23, 22, 119, 37, 111, 94, 87, 47, 43, 108, 86, 109, 80, 64, 0, 126, 110, 16, 33, 46, 106, 88, 78, 90, 17, 14, 26, 85, 105, 44, 45, 81, 29, 24, 10, 42, 30, 93, 4, 96, 74, 83, 84, 19, 104, 1, 71, 107, 41, 5, 34, 7, 76, 99, 21, 69, 20, 65, 75, 68, 66, 13, 28, 73, 2, 77, 11, 12, 15, 9, 32, 98, 18, 70, 25, 92, 36, 40, 35, 82, 6, 3, 79, 67, 91, 89, 100, 31, 27, 72, 95, 8], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 58, 120, 116, 57, 55, 125, 124, 121, 49, 122, 114, 127, 59, 51, 97, 103, 38, 115, 61, 39, 50, 54, 119, 23, 22, 111, 118, 102, 37, 101, 87, 94, 110, 126, 33, 64, 47, 0, 16, 80, 108, 86, 46, 17, 107, 14, 78, 45, 90, 42, 109, 24, 44, 81, 43, 26, 93, 10, 29, 71, 74, 7, 11, 88, 85, 99, 106, 30, 68, 69, 13, 12, 83, 34, 28, 84, 105, 5, 76, 96, 9, 66, 73, 4, 21, 18, 104, 75, 77, 20, 1, 70, 67, 65, 3, 2, 41, 15, 19, 82, 79, 40, 32, 89, 98, 6, 92, 100, 91, 35, 25, 36, 8, 31, 95, 27, 72], [56, 113, 53, 52, 60, 123, 62, 63, 48, 112, 117, 116, 58, 120, 55, 121, 124, 57, 49, 122, 127, 125, 103, 115, 114, 59, 39, 61, 51, 97, 50, 22, 102, 54, 23, 126, 118, 38, 101, 37, 87, 119, 47, 110, 0, 86, 94, 64, 16, 108, 80, 111, 17, 109, 46, 33, 90, 10, 106, 14, 74, 42, 78, 107, 29, 93, 45, 81, 26, 24, 43, 30, 5, 44, 71, 69, 105, 88, 66, 7, 12, 76, 13, 19, 68, 73, 104, 83, 2, 11, 84, 15, 65, 96, 18, 85, 41, 9, 77, 70, 4, 20, 40, 75, 1, 21, 34, 79, 3, 82, 28, 99, 67, 6, 36, 32, 89, 98, 92, 35, 91, 100, 8, 25, 27, 31, 72, 95], [56, 113, 53, 52, 123, 60, 63, 62, 48, 112, 117, 116, 120, 58, 121, 55, 122, 127, 124, 103, 115, 114, 49, 57, 61, 125, 51, 22, 39, 118, 23, 59, 38, 102, 97, 119, 37, 54, 101, 50, 126, 94, 111, 110, 86, 87, 108, 16, 80, 109, 17, 64, 107, 0, 47, 14, 33, 42, 46, 81, 78, 26, 90, 10, 43, 74, 29, 44, 45, 106, 30, 24, 68, 105, 88, 93, 83, 69, 41, 84, 20, 4, 76, 5, 75, 85, 73, 7, 11, 19, 12, 13, 66, 2, 77, 9, 104, 28, 71, 1, 96, 32, 18, 82, 15, 21, 67, 34, 3, 70, 79, 6, 99, 40, 65, 35, 89, 25, 36, 98, 100, 8, 31, 91, 92, 27, 72, 95], [56, 113, 53, 52, 60, 123, 63, 62, 48, 112, 117, 120, 116, 58, 127, 115, 124, 103, 55, 57, 59, 121, 125, 114, 61, 38, 50, 49, 51, 122, 119, 39, 22, 102, 23, 97, 111, 54, 37, 118, 101, 94, 87, 108, 126, 47, 86, 110, 33, 17, 109, 46, 16, 80, 88, 14, 81, 26, 42, 64, 43, 0, 44, 106, 45, 90, 24, 107, 78, 29, 105, 93, 30, 10, 74, 104, 75, 41, 11, 21, 7, 83, 85, 68, 4, 69, 84, 99, 20, 96, 19, 28, 77, 34, 76, 5, 32, 82, 13, 18, 73, 92, 71, 66, 36, 98, 9, 1, 15, 12, 2, 40, 3, 65, 6, 31, 25, 100, 91, 89, 95, 79, 35, 70, 27, 67, 8, 72], [56, 113, 53, 52, 60, 123, 62, 63, 48, 112, 117, 120, 116, 127, 58, 59, 125, 115, 124, 49, 114, 55, 57, 51, 121, 103, 119, 39, 122, 61, 22, 50, 97, 111, 54, 118, 102, 37, 38, 87, 23, 64, 110, 94, 101, 86, 109, 108, 47, 126, 0, 17, 42, 46, 80, 14, 74, 33, 16, 44, 81, 45, 90, 43, 26, 78, 88, 24, 29, 10, 7, 71, 73, 105, 18, 75, 83, 93, 76, 30, 69, 11, 104, 77, 107, 19, 5, 13, 66, 41, 9, 6, 4, 96, 106, 1, 84, 65, 34, 20, 21, 99, 12, 85, 15, 28, 68, 40, 36, 2, 79, 32, 82, 70, 67, 3, 92, 25, 98, 35, 27, 8, 91, 89, 31, 100, 95, 72], [56, 113, 53, 52, 123, 60, 63, 62, 48, 112, 117, 120, 121, 116, 124, 55, 127, 125, 49, 58, 114, 57, 59, 122, 103, 39, 61, 115, 50, 119, 51, 97, 22, 38, 118, 102, 23, 54, 37, 101, 94, 111, 110, 108, 126, 86, 87, 109, 16, 0, 64, 46, 47, 43, 80, 33, 42, 14, 90, 17, 74, 44, 45, 78, 106, 26, 81, 88, 107, 10, 7, 93, 24, 29, 30, 105, 69, 11, 71, 83, 75, 84, 73, 76, 104, 66, 19, 21, 18, 13, 5, 20, 34, 40, 4, 68, 28, 1, 3, 67, 12, 85, 2, 9, 77, 6, 96, 41, 15, 32, 65, 99, 82, 70, 79, 36, 92, 89, 98, 100, 25, 27, 8, 35, 31, 91, 95, 72], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 116, 120, 114, 55, 121, 58, 127, 124, 122, 49, 103, 125, 115, 59, 61, 57, 39, 102, 119, 51, 118, 37, 22, 54, 97, 50, 23, 38, 126, 46, 111, 94, 108, 16, 86, 110, 87, 101, 17, 64, 45, 47, 80, 44, 109, 0, 33, 43, 106, 107, 74, 14, 90, 42, 10, 105, 81, 4, 26, 83, 78, 29, 2, 93, 11, 76, 88, 24, 68, 5, 69, 7, 75, 71, 66, 41, 19, 84, 28, 21, 13, 30, 96, 9, 20, 104, 65, 40, 12, 73, 77, 18, 6, 34, 85, 98, 70, 15, 67, 79, 92, 99, 36, 32, 3, 1, 8, 89, 31, 35, 82, 100, 25, 91, 27, 95, 72], [56, 113, 53, 52, 60, 123, 63, 62, 48, 112, 117, 120, 58, 116, 114, 121, 124, 127, 55, 115, 103, 59, 39, 125, 57, 49, 122, 50, 119, 61, 38, 118, 22, 51, 97, 54, 23, 102, 126, 37, 101, 94, 86, 111, 87, 110, 17, 46, 108, 44, 16, 45, 109, 80, 33, 0, 26, 47, 64, 107, 90, 106, 78, 43, 10, 42, 93, 29, 88, 81, 30, 14, 105, 24, 74, 11, 7, 28, 5, 75, 71, 41, 77, 21, 40, 34, 18, 83, 96, 68, 104, 4, 98, 9, 13, 85, 20, 12, 69, 84, 36, 66, 65, 76, 19, 99, 6, 32, 1, 2, 92, 73, 70, 79, 82, 35, 25, 15, 100, 3, 31, 91, 89, 8, 27, 67, 95, 72], [56, 113, 53, 52, 123, 60, 63, 62, 48, 112, 117, 120, 116, 58, 121, 61, 55, 124, 115, 103, 127, 125, 59, 49, 114, 122, 39, 38, 119, 57, 118, 22, 97, 23, 54, 50, 51, 101, 37, 102, 126, 87, 86, 94, 47, 111, 46, 64, 0, 108, 16, 109, 33, 110, 44, 45, 80, 106, 17, 42, 14, 90, 107, 88, 43, 26, 78, 81, 24, 30, 93, 105, 74, 75, 7, 28, 10, 66, 71, 96, 19, 29, 69, 40, 76, 83, 11, 104, 20, 65, 34, 99, 77, 21, 84, 85, 9, 5, 18, 4, 68, 1, 82, 70, 13, 41, 73, 12, 2, 36, 67, 15, 6, 3, 92, 79, 89, 31, 35, 25, 98, 91, 32, 27, 100, 8, 95, 72], [56, 113, 53, 123, 60, 52, 63, 62, 48, 112, 117, 58, 121, 116, 120, 103, 124, 114, 122, 125, 127, 38, 57, 55, 49, 59, 115, 51, 97, 61, 54, 39, 50, 119, 101, 118, 22, 102, 23, 94, 111, 110, 37, 87, 33, 109, 47, 46, 64, 126, 108, 0, 86, 16, 44, 80, 107, 106, 90, 45, 24, 26, 43, 78, 10, 17, 81, 30, 93, 40, 105, 42, 88, 14, 28, 34, 104, 96, 74, 65, 85, 29, 99, 7, 71, 69, 19, 20, 4, 13, 5, 83, 21, 76, 98, 66, 75, 12, 84, 41, 9, 11, 18, 68, 77, 2, 92, 1, 35, 25, 3, 73, 82, 70, 32, 36, 67, 100, 91, 89, 31, 15, 79, 6, 27, 95, 72, 8], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 116, 120, 58, 121, 124, 127, 59, 61, 55, 125, 49, 103, 114, 39, 57, 122, 115, 97, 54, 22, 102, 119, 51, 50, 38, 118, 23, 37, 126, 94, 101, 111, 87, 108, 44, 33, 109, 86, 107, 0, 16, 46, 80, 47, 17, 106, 43, 64, 45, 90, 26, 24, 110, 93, 78, 42, 88, 14, 74, 10, 81, 105, 30, 29, 75, 4, 76, 19, 96, 7, 83, 40, 12, 11, 71, 99, 34, 85, 84, 5, 68, 20, 77, 28, 65, 9, 104, 21, 82, 73, 98, 1, 69, 18, 36, 41, 13, 70, 2, 15, 79, 66, 32, 35, 6, 25, 92, 89, 100, 91, 67, 31, 3, 27, 95, 72, 8], [56, 113, 53, 123, 52, 60, 63, 62, 48, 112, 117, 120, 121, 61, 58, 55, 116, 59, 125, 127, 122, 124, 115, 49, 114, 103, 57, 119, 22, 54, 118, 97, 39, 38, 51, 23, 37, 50, 126, 102, 94, 101, 86, 108, 46, 111, 87, 107, 44, 80, 110, 16, 43, 47, 109, 90, 0, 45, 88, 33, 64, 26, 14, 10, 106, 17, 105, 78, 74, 42, 81, 93, 24, 7, 76, 12, 69, 30, 19, 83, 11, 29, 28, 71, 4, 96, 2, 104, 84, 9, 75, 5, 99, 20, 41, 77, 18, 40, 67, 85, 34, 13, 21, 73, 66, 65, 82, 98, 79, 68, 70, 36, 92, 1, 15, 3, 32, 6, 91, 89, 72, 35, 25, 31, 100, 27, 95, 8], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 58, 120, 121, 61, 59, 116, 125, 55, 127, 122, 49, 103, 115, 38, 114, 54, 57, 124, 39, 51, 119, 118, 23, 97, 126, 101, 22, 102, 37, 50, 87, 86, 47, 111, 94, 80, 64, 108, 46, 90, 16, 110, 0, 33, 109, 107, 44, 45, 43, 17, 26, 88, 106, 14, 10, 105, 93, 78, 42, 81, 24, 74, 29, 19, 7, 83, 5, 76, 30, 104, 28, 71, 65, 69, 34, 40, 20, 75, 9, 84, 85, 11, 18, 77, 96, 4, 68, 12, 13, 41, 21, 73, 2, 99, 1, 82, 70, 79, 3, 98, 36, 92, 91, 66, 15, 89, 31, 25, 35, 32, 6, 72, 67, 100, 27, 95, 8], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 120, 59, 58, 124, 121, 103, 116, 38, 125, 127, 122, 55, 57, 61, 115, 54, 114, 97, 118, 49, 51, 22, 39, 50, 101, 23, 119, 102, 126, 37, 94, 86, 87, 109, 111, 47, 16, 108, 46, 64, 33, 44, 80, 110, 0, 90, 45, 88, 42, 106, 26, 81, 93, 105, 14, 10, 24, 43, 17, 78, 74, 107, 83, 30, 11, 104, 5, 68, 29, 19, 34, 4, 7, 96, 77, 40, 20, 99, 69, 85, 84, 71, 76, 2, 28, 65, 21, 66, 92, 13, 73, 9, 75, 12, 41, 18, 36, 82, 6, 89, 1, 15, 3, 25, 98, 79, 35, 32, 67, 100, 91, 27, 70, 95, 72, 31, 8], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 125, 58, 120, 121, 124, 59, 55, 61, 116, 57, 127, 103, 114, 122, 38, 97, 115, 49, 22, 51, 39, 118, 54, 23, 37, 119, 50, 94, 101, 126, 87, 111, 102, 16, 86, 47, 33, 80, 90, 110, 44, 109, 64, 42, 26, 0, 17, 108, 46, 45, 78, 106, 10, 14, 29, 88, 81, 93, 74, 24, 43, 83, 30, 107, 11, 28, 105, 76, 104, 71, 75, 4, 73, 84, 68, 41, 12, 96, 6, 7, 99, 5, 85, 66, 69, 15, 40, 13, 34, 77, 9, 19, 2, 20, 21, 82, 3, 18, 36, 32, 67, 92, 1, 100, 65, 25, 70, 89, 98, 91, 79, 27, 35, 72, 31, 95, 8], [56, 113, 53, 60, 123, 52, 63, 62, 48, 112, 117, 120, 121, 58, 59, 125, 61, 55, 116, 127, 49, 103, 57, 114, 124, 22, 115, 122, 118, 39, 38, 54, 119, 23, 51, 37, 97, 50, 86, 102, 101, 87, 94, 47, 111, 80, 126, 108, 110, 109, 46, 16, 26, 43, 33, 17, 106, 45, 78, 90, 88, 0, 64, 44, 107, 81, 29, 11, 74, 42, 14, 10, 24, 105, 83, 5, 93, 85, 30, 76, 9, 71, 40, 7, 84, 104, 69, 28, 41, 20, 13, 2, 77, 82, 18, 12, 75, 15, 96, 73, 19, 34, 1, 21, 99, 65, 66, 4, 32, 98, 68, 36, 3, 6, 100, 67, 79, 25, 89, 35, 70, 91, 31, 72, 92, 27, 95, 8], [56, 113, 60, 52, 53, 123, 63, 62, 48, 112, 117, 120, 59, 58, 61, 121, 57, 127, 116, 55, 124, 125, 103, 122, 114, 115, 22, 49, 54, 38, 118, 39, 50, 119, 51, 97, 23, 102, 37, 101, 94, 126, 86, 87, 47, 109, 110, 106, 33, 80, 26, 90, 16, 42, 88, 43, 107, 81, 111, 64, 17, 44, 108, 78, 0, 10, 29, 14, 46, 93, 24, 45, 75, 40, 85, 30, 104, 83, 74, 28, 105, 84, 41, 11, 71, 21, 76, 19, 5, 20, 32, 96, 69, 12, 18, 7, 99, 9, 77, 68, 34, 4, 36, 13, 65, 6, 2, 15, 82, 25, 66, 98, 100, 79, 73, 70, 1, 92, 89, 35, 91, 31, 3, 67, 95, 72, 27, 8], [56, 113, 60, 53, 52, 123, 63, 62, 48, 112, 117, 120, 127, 124, 59, 58, 61, 57, 121, 122, 103, 115, 116, 125, 49, 50, 51, 55, 54, 114, 39, 22, 38, 97, 23, 119, 126, 101, 118, 47, 102, 86, 37, 46, 94, 87, 44, 17, 111, 80, 26, 0, 107, 109, 64, 108, 33, 16, 90, 43, 24, 42, 106, 45, 81, 88, 105, 110, 10, 14, 30, 93, 29, 74, 68, 28, 85, 76, 78, 84, 40, 71, 83, 5, 96, 11, 104, 2, 21, 69, 77, 13, 20, 4, 12, 41, 34, 19, 66, 7, 82, 9, 75, 99, 73, 65, 67, 25, 98, 1, 3, 18, 92, 6, 15, 35, 79, 32, 70, 36, 91, 89, 31, 27, 100, 95, 72, 8], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 124, 120, 121, 58, 116, 125, 122, 61, 127, 57, 55, 59, 114, 115, 103, 97, 22, 54, 39, 49, 38, 118, 51, 50, 101, 102, 23, 126, 37, 119, 94, 87, 86, 64, 0, 80, 108, 16, 47, 43, 90, 109, 111, 33, 26, 17, 81, 45, 46, 14, 110, 88, 106, 10, 93, 78, 44, 107, 42, 29, 71, 74, 11, 76, 105, 73, 24, 83, 75, 9, 84, 30, 12, 85, 69, 4, 7, 82, 28, 66, 65, 40, 77, 96, 104, 19, 1, 21, 5, 20, 34, 41, 13, 68, 18, 67, 2, 79, 99, 3, 70, 25, 6, 91, 32, 92, 89, 15, 98, 36, 31, 27, 35, 8, 100, 95, 72], [56, 113, 53, 52, 60, 123, 63, 62, 48, 112, 117, 120, 116, 125, 57, 121, 58, 122, 127, 124, 61, 55, 103, 97, 51, 115, 49, 54, 126, 39, 114, 118, 59, 22, 23, 38, 50, 102, 101, 37, 47, 119, 46, 94, 111, 0, 108, 109, 43, 16, 80, 87, 86, 110, 33, 90, 64, 45, 81, 17, 14, 26, 106, 42, 10, 93, 88, 74, 44, 105, 78, 83, 29, 30, 24, 71, 7, 13, 69, 107, 5, 85, 40, 34, 12, 76, 9, 2, 11, 19, 28, 65, 20, 104, 96, 98, 68, 73, 70, 4, 84, 99, 66, 82, 77, 75, 41, 92, 67, 1, 21, 79, 6, 36, 89, 18, 91, 32, 35, 100, 3, 31, 8, 15, 27, 25, 95, 72], [56, 113, 53, 52, 60, 123, 63, 62, 48, 112, 117, 58, 116, 121, 120, 125, 61, 55, 124, 122, 59, 127, 57, 103, 22, 97, 49, 115, 54, 51, 50, 119, 114, 39, 37, 38, 126, 102, 101, 23, 47, 118, 87, 94, 86, 45, 80, 43, 109, 16, 111, 46, 0, 26, 78, 17, 81, 10, 90, 108, 33, 64, 14, 110, 106, 29, 88, 2, 24, 93, 42, 44, 107, 74, 83, 85, 69, 12, 76, 105, 104, 73, 4, 96, 19, 40, 30, 5, 7, 11, 75, 20, 71, 84, 41, 9, 34, 68, 77, 99, 28, 21, 82, 70, 66, 13, 18, 3, 79, 65, 67, 1, 15, 32, 89, 36, 98, 6, 25, 35, 92, 8, 91, 100, 31, 95, 27, 72], [56, 113, 53, 60, 52, 123, 63, 62, 48, 112, 117, 116, 125, 58, 120, 61, 121, 122, 57, 124, 127, 59, 97, 115, 55, 103, 114, 22, 54, 39, 51, 50, 49, 119, 102, 37, 23, 126, 86, 38, 118, 111, 87, 94, 101, 47, 110, 16, 0, 17, 33, 64, 43, 45, 80, 78, 26, 108, 46, 109, 29, 42, 90, 44, 81, 10, 24, 88, 41, 83, 106, 107, 74, 93, 68, 71, 4, 30, 7, 5, 12, 14, 69, 75, 9, 99, 96, 105, 11, 19, 73, 76, 34, 18, 2, 13, 104, 70, 77, 66, 85, 40, 28, 21, 67, 65, 1, 84, 20, 79, 32, 89, 25, 98, 82, 15, 36, 35, 31, 6, 91, 92, 3, 8, 100, 27, 95, 72], [56, 113, 53, 52, 60, 123, 63, 62, 48, 112, 117, 120, 116, 121, 125, 58, 122, 61, 55, 59, 124, 103, 22, 127, 57, 97, 114, 51, 115, 54, 23, 38, 50, 49, 126, 102, 39, 119, 101, 37, 118, 86, 94, 87, 111, 47, 17, 78, 108, 109, 80, 26, 16, 33, 43, 42, 45, 88, 90, 46, 24, 10, 0, 11, 29, 110, 44, 107, 14, 64, 106, 74, 93, 81, 30, 83, 12, 5, 105, 75, 9, 71, 40, 13, 99, 76, 7, 73, 84, 19, 66, 85, 96, 4, 20, 18, 104, 34, 69, 70, 77, 41, 79, 15, 28, 21, 65, 25, 68, 82, 89, 32, 6, 3, 2, 92, 1, 67, 98, 36, 35, 91, 8, 100, 31, 27, 95, 72]], "model.layers.15.self_attn.q_proj": [[103, 62, 113, 33, 14, 21, 8, 11, 16, 18, 49, 89, 60, 67, 91, 19, 57, 69, 79, 26, 2, 22, 24, 59, 84, 46, 3, 95, 75, 70, 87, 65, 28, 66, 13, 6, 101, 27, 72, 80, 10, 25, 0, 73, 5, 78, 44, 88, 74, 83, 71, 56, 117, 77, 81, 76, 82, 85, 94, 20, 4, 64, 9, 92, 7, 17, 12, 119, 29, 15, 90, 23, 1, 43, 86, 116, 61, 68, 123, 110, 96, 104, 30, 40, 37, 98, 108, 112, 127, 125, 93, 106, 102, 51, 50, 122, 32, 45, 118, 120, 42, 124, 55, 111, 31, 34, 36, 54, 53, 100, 63, 99, 126, 121, 114, 58, 35, 38, 97, 47, 109, 41, 105, 48, 52, 107, 115, 39], [62, 113, 57, 60, 123, 59, 61, 58, 114, 121, 126, 56, 124, 127, 116, 120, 50, 63, 125, 122, 53, 54, 119, 55, 117, 51, 118, 48, 111, 46, 112, 52, 47, 109, 115, 49, 110, 45, 44, 42, 43, 108, 107, 35, 101, 41, 106, 105, 38, 37, 40, 104, 103, 102, 97, 36, 100, 88, 34, 99, 39, 98, 33, 96, 84, 23, 20, 30, 32, 95, 29, 31, 17, 19, 93, 90, 86, 22, 28, 94, 76, 92, 91, 26, 79, 81, 24, 21, 89, 12, 87, 14, 16, 77, 13, 83, 10, 73, 18, 27, 85, 25, 70, 82, 9, 68, 1, 4, 74, 7, 8, 2, 11, 15, 5, 67, 66, 64, 71, 80, 65, 78, 69, 6, 0, 3, 72, 75], [113, 62, 57, 60, 121, 59, 119, 126, 116, 61, 120, 123, 58, 127, 56, 114, 63, 54, 53, 125, 118, 55, 124, 122, 109, 112, 49, 51, 117, 52, 48, 50, 103, 111, 47, 115, 99, 38, 43, 106, 44, 108, 41, 110, 45, 107, 42, 46, 35, 105, 40, 104, 88, 100, 37, 101, 102, 20, 96, 36, 97, 29, 90, 39, 34, 98, 95, 17, 93, 33, 87, 32, 31, 23, 92, 24, 13, 30, 94, 28, 84, 26, 91, 22, 86, 81, 76, 77, 89, 74, 27, 79, 21, 83, 73, 18, 85, 19, 9, 10, 25, 68, 12, 15, 82, 69, 14, 16, 70, 8, 4, 5, 78, 1, 71, 6, 11, 80, 0, 64, 65, 7, 66, 2, 67, 72, 75, 3], [62, 103, 113, 57, 60, 59, 61, 58, 123, 56, 36, 126, 98, 121, 54, 116, 49, 120, 114, 117, 51, 55, 101, 125, 124, 119, 63, 53, 38, 108, 112, 122, 118, 46, 22, 48, 44, 52, 50, 32, 100, 45, 106, 34, 127, 33, 111, 99, 109, 42, 43, 47, 105, 107, 115, 35, 110, 104, 41, 89, 97, 40, 95, 93, 96, 102, 87, 31, 37, 94, 29, 17, 26, 28, 30, 86, 92, 19, 27, 23, 39, 91, 79, 24, 21, 83, 81, 88, 20, 25, 13, 18, 84, 90, 16, 15, 9, 80, 82, 85, 76, 77, 12, 10, 14, 73, 74, 5, 4, 75, 78, 11, 72, 2, 71, 6, 68, 70, 7, 8, 66, 1, 69, 65, 67, 3, 64, 0], [39, 112, 97, 45, 48, 125, 19, 21, 119, 115, 89, 14, 17, 95, 118, 93, 79, 25, 120, 12, 27, 8, 53, 22, 23, 111, 77, 40, 86, 57, 7, 127, 5, 28, 42, 55, 99, 124, 122, 10, 78, 49, 29, 85, 46, 47, 62, 9, 59, 91, 114, 116, 54, 90, 58, 108, 88, 63, 123, 24, 51, 110, 107, 117, 106, 94, 104, 113, 101, 32, 96, 82, 16, 52, 71, 100, 121, 37, 60, 84, 56, 73, 44, 38, 43, 30, 11, 34, 87, 67, 18, 126, 92, 35, 98, 41, 81, 26, 61, 102, 31, 69, 66, 50, 105, 80, 36, 20, 109, 3, 13, 83, 103, 72, 33, 76, 15, 64, 74, 1, 2, 0, 70, 6, 75, 65, 4, 68], [39, 112, 45, 97, 48, 115, 89, 19, 27, 14, 21, 118, 119, 125, 120, 53, 57, 46, 122, 99, 24, 95, 55, 108, 59, 17, 22, 28, 121, 123, 106, 58, 60, 116, 107, 124, 25, 126, 62, 63, 54, 101, 114, 88, 75, 111, 127, 34, 79, 37, 47, 113, 40, 11, 36, 30, 56, 49, 42, 50, 87, 100, 61, 71, 51, 104, 52, 109, 43, 8, 44, 110, 29, 117, 96, 85, 102, 41, 92, 35, 93, 105, 23, 91, 32, 26, 38, 98, 31, 73, 84, 12, 74, 94, 16, 78, 20, 90, 83, 80, 15, 82, 86, 81, 77, 33, 67, 103, 7, 18, 69, 5, 10, 13, 2, 76, 9, 6, 66, 72, 65, 64, 3, 70, 68, 0, 1, 4], [39, 45, 97, 48, 112, 118, 21, 19, 79, 89, 17, 9, 27, 14, 7, 12, 6, 95, 25, 115, 65, 23, 5, 122, 18, 124, 16, 127, 74, 119, 125, 8, 93, 66, 24, 3, 28, 4, 78, 22, 76, 55, 1, 15, 70, 64, 111, 51, 120, 29, 80, 10, 77, 46, 67, 117, 68, 114, 72, 109, 56, 86, 73, 30, 82, 13, 75, 57, 123, 20, 2, 121, 96, 69, 58, 116, 62, 81, 85, 71, 108, 84, 53, 43, 107, 61, 49, 42, 60, 92, 83, 100, 31, 88, 11, 40, 90, 99, 50, 98, 94, 36, 32, 44, 113, 103, 0, 38, 26, 126, 47, 41, 105, 59, 104, 37, 63, 33, 54, 35, 87, 34, 91, 52, 106, 101, 110, 102], [39, 97, 45, 112, 48, 12, 17, 19, 67, 21, 118, 25, 9, 3, 1, 7, 79, 0, 89, 95, 73, 69, 124, 5, 23, 22, 8, 122, 14, 71, 85, 66, 127, 120, 10, 28, 13, 74, 119, 114, 4, 103, 75, 93, 70, 76, 117, 51, 15, 81, 2, 24, 55, 58, 78, 29, 84, 111, 125, 72, 80, 83, 6, 115, 65, 116, 30, 18, 16, 77, 92, 82, 123, 57, 27, 11, 31, 49, 47, 108, 26, 91, 50, 99, 88, 94, 56, 41, 87, 121, 113, 53, 35, 96, 107, 46, 43, 86, 36, 90, 109, 59, 100, 62, 40, 64, 54, 37, 32, 106, 68, 63, 61, 20, 105, 102, 42, 34, 52, 60, 104, 38, 126, 101, 110, 98, 44, 33], [51, 61, 117, 36, 81, 22, 30, 27, 88, 97, 100, 10, 78, 18, 44, 96, 94, 19, 108, 2, 69, 28, 91, 80, 54, 42, 20, 68, 25, 5, 74, 104, 93, 33, 63, 11, 124, 64, 14, 115, 86, 77, 83, 71, 8, 21, 114, 12, 85, 16, 103, 118, 17, 53, 23, 119, 66, 62, 84, 120, 26, 109, 111, 24, 6, 31, 110, 76, 73, 15, 121, 70, 79, 127, 13, 34, 123, 59, 82, 38, 87, 102, 37, 126, 0, 105, 72, 50, 9, 112, 122, 1, 56, 52, 89, 107, 57, 7, 46, 43, 65, 45, 49, 29, 4, 106, 75, 55, 101, 67, 48, 39, 58, 98, 90, 95, 35, 92, 32, 113, 125, 116, 40, 47, 41, 60, 99, 3], [51, 61, 36, 117, 27, 88, 30, 96, 100, 22, 78, 19, 81, 25, 10, 94, 20, 97, 44, 91, 71, 32, 21, 62, 34, 54, 69, 53, 29, 89, 103, 80, 76, 85, 82, 33, 86, 42, 83, 11, 23, 124, 13, 119, 16, 90, 110, 5, 108, 14, 87, 98, 116, 66, 18, 93, 106, 92, 26, 28, 102, 118, 77, 111, 17, 84, 101, 45, 31, 43, 6, 40, 114, 38, 74, 99, 123, 50, 24, 35, 41, 2, 109, 95, 104, 121, 12, 122, 126, 73, 48, 39, 37, 127, 120, 79, 105, 115, 7, 9, 59, 64, 55, 63, 49, 107, 46, 113, 4, 56, 72, 57, 15, 67, 52, 75, 47, 0, 60, 70, 58, 112, 125, 8, 3, 1, 68, 65], [117, 51, 36, 100, 61, 97, 102, 91, 115, 88, 30, 18, 94, 96, 25, 44, 22, 80, 27, 93, 11, 62, 20, 33, 28, 54, 53, 78, 19, 124, 63, 81, 103, 108, 76, 119, 6, 120, 111, 42, 104, 21, 59, 3, 85, 10, 4, 32, 110, 16, 127, 82, 9, 114, 69, 15, 83, 71, 123, 48, 38, 112, 12, 65, 52, 73, 118, 26, 43, 107, 122, 35, 58, 50, 126, 56, 0, 46, 121, 40, 116, 109, 60, 14, 77, 37, 67, 57, 55, 125, 41, 47, 92, 66, 39, 98, 1, 84, 106, 45, 105, 113, 24, 75, 72, 90, 49, 86, 23, 87, 95, 79, 34, 13, 99, 89, 101, 29, 68, 2, 5, 8, 70, 31, 17, 74, 7, 64], [61, 117, 51, 36, 88, 100, 30, 108, 97, 44, 85, 115, 124, 102, 62, 27, 104, 34, 54, 120, 111, 63, 59, 94, 114, 119, 91, 127, 33, 46, 52, 123, 112, 103, 122, 126, 109, 121, 50, 78, 58, 42, 43, 56, 107, 110, 116, 80, 19, 23, 57, 55, 98, 31, 22, 25, 48, 60, 125, 113, 93, 53, 105, 118, 45, 35, 47, 21, 49, 101, 96, 106, 41, 81, 37, 40, 39, 38, 32, 99, 24, 26, 69, 10, 29, 28, 5, 90, 74, 95, 2, 83, 71, 17, 86, 92, 65, 89, 73, 7, 84, 14, 76, 3, 20, 82, 87, 72, 77, 16, 11, 0, 9, 75, 66, 64, 8, 18, 67, 15, 6, 79, 70, 1, 13, 68, 4, 12], [39, 47, 115, 34, 25, 71, 12, 10, 83, 17, 14, 4, 86, 117, 111, 79, 123, 87, 22, 18, 63, 92, 7, 89, 66, 64, 56, 9, 98, 2, 21, 5, 1, 78, 91, 24, 67, 94, 76, 62, 110, 30, 19, 68, 102, 65, 6, 15, 69, 13, 27, 51, 73, 85, 80, 57, 99, 8, 70, 88, 29, 74, 23, 84, 77, 101, 55, 96, 16, 38, 40, 116, 72, 11, 50, 26, 75, 81, 31, 93, 112, 60, 90, 3, 43, 0, 48, 82, 28, 42, 106, 95, 118, 108, 20, 37, 53, 45, 41, 36, 114, 97, 113, 124, 61, 52, 49, 103, 107, 126, 127, 109, 121, 32, 35, 46, 100, 33, 44, 105, 54, 120, 58, 125, 122, 59, 119, 104], [47, 39, 115, 34, 114, 27, 37, 43, 111, 116, 25, 117, 94, 112, 56, 83, 123, 63, 51, 79, 86, 124, 55, 30, 126, 91, 89, 16, 62, 110, 58, 60, 127, 17, 109, 15, 99, 54, 46, 118, 122, 40, 107, 57, 98, 53, 52, 59, 48, 113, 125, 29, 50, 104, 44, 10, 14, 24, 41, 108, 97, 102, 61, 23, 95, 38, 121, 88, 42, 101, 106, 120, 12, 96, 80, 21, 36, 49, 35, 119, 105, 28, 85, 22, 45, 90, 19, 33, 82, 100, 26, 93, 92, 9, 31, 32, 18, 20, 81, 8, 84, 87, 77, 75, 74, 78, 103, 13, 2, 4, 73, 11, 72, 68, 70, 76, 71, 6, 69, 67, 5, 66, 3, 0, 7, 64, 65, 1], [115, 47, 39, 43, 34, 27, 25, 60, 51, 117, 37, 112, 56, 83, 114, 116, 86, 123, 55, 63, 30, 94, 10, 127, 58, 24, 15, 16, 79, 110, 126, 89, 62, 111, 17, 118, 48, 61, 124, 57, 12, 14, 72, 122, 21, 98, 107, 54, 53, 52, 113, 125, 99, 109, 105, 120, 44, 102, 91, 41, 119, 121, 66, 38, 49, 101, 104, 46, 59, 95, 50, 4, 106, 36, 100, 68, 22, 108, 45, 96, 32, 13, 93, 19, 75, 28, 42, 40, 80, 31, 97, 26, 85, 84, 71, 18, 23, 29, 35, 33, 8, 82, 9, 103, 74, 5, 87, 92, 90, 67, 88, 78, 2, 81, 20, 70, 77, 76, 3, 6, 11, 73, 0, 64, 69, 65, 7, 1], [115, 39, 47, 114, 34, 27, 116, 55, 117, 25, 56, 123, 63, 24, 37, 112, 94, 41, 83, 58, 44, 126, 91, 89, 86, 124, 110, 62, 51, 30, 45, 60, 52, 53, 113, 15, 127, 54, 84, 118, 61, 43, 119, 111, 46, 121, 108, 125, 59, 101, 109, 107, 122, 106, 57, 42, 49, 102, 120, 104, 48, 50, 79, 29, 96, 22, 99, 36, 80, 10, 38, 105, 92, 85, 97, 20, 100, 21, 26, 17, 93, 40, 98, 16, 12, 14, 31, 33, 28, 19, 35, 103, 70, 8, 32, 95, 23, 75, 18, 88, 87, 81, 9, 82, 90, 68, 72, 13, 11, 78, 66, 74, 2, 4, 6, 77, 5, 3, 71, 67, 0, 76, 64, 69, 73, 7, 1, 65], [53, 120, 101, 127, 110, 38, 121, 25, 125, 56, 51, 100, 102, 28, 22, 40, 80, 30, 33, 97, 116, 29, 49, 104, 11, 114, 59, 113, 87, 118, 50, 32, 84, 86, 63, 35, 57, 55, 77, 61, 75, 60, 16, 119, 115, 24, 90, 79, 46, 39, 54, 124, 93, 112, 37, 98, 117, 7, 52, 96, 62, 99, 9, 48, 94, 106, 92, 108, 18, 23, 105, 45, 111, 27, 109, 34, 123, 126, 122, 103, 107, 89, 72, 58, 47, 69, 91, 83, 15, 88, 44, 26, 1, 78, 20, 42, 36, 71, 21, 31, 43, 13, 95, 41, 81, 5, 85, 19, 73, 66, 4, 14, 82, 10, 8, 17, 2, 12, 76, 67, 70, 74, 3, 68, 0, 6, 65, 64], [120, 127, 53, 51, 113, 110, 59, 104, 49, 50, 125, 56, 112, 124, 116, 114, 63, 121, 101, 44, 52, 62, 55, 61, 57, 117, 60, 118, 119, 126, 46, 115, 58, 111, 37, 123, 54, 45, 48, 122, 39, 43, 47, 27, 107, 105, 108, 109, 42, 40, 106, 38, 79, 41, 96, 103, 100, 29, 35, 33, 82, 19, 85, 87, 78, 98, 34, 99, 102, 36, 30, 88, 95, 97, 6, 31, 64, 86, 24, 66, 65, 77, 92, 91, 32, 10, 94, 9, 8, 83, 22, 75, 67, 84, 5, 20, 17, 93, 28, 4, 0, 71, 18, 90, 14, 80, 11, 25, 21, 12, 15, 68, 26, 74, 1, 81, 7, 2, 23, 69, 72, 70, 3, 76, 73, 13, 89, 16], [127, 120, 110, 53, 40, 101, 100, 121, 49, 38, 51, 25, 118, 97, 63, 56, 62, 28, 113, 117, 84, 61, 39, 125, 119, 30, 32, 124, 59, 29, 96, 52, 104, 115, 88, 105, 112, 18, 94, 50, 116, 19, 23, 99, 111, 0, 24, 87, 60, 33, 89, 37, 98, 70, 35, 108, 95, 55, 103, 48, 41, 31, 21, 122, 109, 114, 57, 126, 43, 54, 102, 14, 22, 76, 86, 44, 46, 27, 80, 47, 78, 90, 34, 10, 42, 45, 6, 93, 66, 107, 85, 17, 36, 13, 65, 12, 83, 123, 75, 26, 106, 58, 81, 3, 20, 82, 11, 74, 79, 68, 1, 91, 15, 67, 69, 72, 8, 77, 92, 9, 71, 64, 2, 5, 73, 16, 4, 7], [127, 101, 53, 120, 40, 59, 28, 84, 33, 97, 100, 30, 25, 37, 87, 51, 50, 113, 86, 63, 77, 46, 104, 112, 102, 88, 96, 29, 27, 20, 116, 35, 121, 38, 22, 83, 126, 91, 125, 56, 94, 11, 80, 98, 75, 45, 19, 34, 78, 17, 52, 103, 60, 55, 54, 62, 24, 122, 110, 61, 115, 32, 79, 81, 41, 114, 15, 93, 71, 107, 124, 48, 21, 58, 105, 99, 8, 31, 39, 7, 106, 12, 44, 10, 57, 118, 42, 76, 95, 123, 109, 111, 47, 82, 108, 119, 92, 43, 4, 26, 89, 2, 18, 49, 74, 85, 90, 36, 5, 68, 117, 72, 16, 1, 9, 14, 23, 69, 70, 66, 0, 65, 3, 6, 13, 67, 64, 73], [104, 117, 25, 95, 34, 92, 86, 52, 19, 54, 77, 97, 8, 53, 61, 51, 79, 22, 28, 17, 120, 80, 68, 59, 88, 85, 114, 45, 72, 12, 60, 121, 24, 124, 13, 40, 49, 116, 4, 122, 113, 56, 87, 55, 58, 57, 23, 73, 107, 91, 109, 46, 15, 110, 63, 127, 62, 89, 93, 65, 74, 118, 83, 48, 115, 47, 21, 31, 18, 69, 75, 90, 0, 33, 126, 100, 35, 39, 111, 125, 94, 2, 14, 43, 106, 82, 20, 112, 6, 42, 36, 3, 50, 32, 64, 44, 5, 101, 123, 105, 98, 119, 102, 76, 29, 99, 108, 84, 27, 26, 30, 9, 16, 96, 66, 103, 81, 11, 1, 38, 71, 78, 7, 70, 41, 37, 10, 67], [117, 104, 120, 25, 95, 54, 52, 34, 61, 92, 103, 86, 88, 97, 58, 51, 116, 12, 114, 124, 55, 122, 53, 60, 113, 102, 49, 121, 63, 47, 59, 85, 127, 46, 62, 106, 111, 109, 48, 56, 28, 18, 126, 118, 19, 44, 115, 8, 119, 69, 125, 45, 14, 112, 57, 91, 24, 123, 80, 43, 42, 110, 108, 50, 107, 17, 66, 20, 87, 98, 105, 22, 39, 36, 71, 33, 100, 41, 38, 32, 37, 79, 26, 35, 77, 99, 94, 93, 101, 75, 74, 40, 30, 84, 31, 82, 13, 29, 96, 16, 72, 3, 89, 78, 11, 27, 90, 23, 9, 4, 15, 65, 81, 7, 68, 21, 83, 5, 76, 2, 1, 0, 73, 6, 67, 64, 10, 70], [117, 104, 95, 25, 52, 54, 61, 34, 92, 56, 97, 86, 120, 60, 24, 12, 47, 17, 51, 116, 53, 113, 88, 20, 122, 49, 99, 124, 85, 103, 119, 80, 19, 59, 102, 77, 28, 48, 91, 63, 57, 31, 112, 106, 126, 26, 58, 18, 62, 109, 118, 111, 115, 114, 121, 127, 125, 107, 100, 8, 123, 110, 44, 45, 50, 69, 42, 55, 43, 22, 105, 36, 38, 46, 16, 37, 15, 108, 96, 78, 30, 87, 39, 75, 94, 79, 32, 33, 35, 41, 40, 82, 90, 14, 98, 84, 29, 27, 93, 83, 89, 101, 81, 21, 23, 9, 73, 76, 74, 72, 10, 11, 66, 3, 4, 7, 5, 2, 6, 13, 71, 1, 70, 0, 65, 68, 64, 67], [117, 104, 52, 25, 95, 61, 56, 86, 92, 97, 34, 54, 60, 51, 19, 53, 58, 77, 120, 17, 8, 80, 113, 114, 31, 122, 59, 66, 62, 124, 127, 116, 69, 22, 12, 49, 48, 46, 57, 88, 47, 121, 106, 55, 85, 112, 79, 115, 72, 28, 102, 14, 74, 109, 45, 63, 15, 10, 87, 110, 125, 36, 40, 111, 118, 24, 4, 43, 126, 35, 73, 94, 33, 119, 123, 27, 42, 93, 98, 30, 5, 103, 50, 100, 91, 21, 44, 3, 108, 82, 29, 32, 105, 37, 107, 26, 41, 99, 90, 78, 38, 101, 83, 20, 39, 6, 18, 96, 75, 89, 23, 11, 16, 76, 84, 71, 64, 13, 70, 81, 7, 2, 68, 9, 1, 65, 0, 67], [39, 121, 112, 33, 1, 114, 93, 23, 69, 82, 0, 21, 79, 20, 13, 14, 84, 67, 48, 2, 118, 10, 25, 90, 16, 91, 22, 5, 29, 81, 72, 17, 65, 89, 78, 124, 3, 75, 71, 66, 68, 61, 87, 80, 88, 7, 30, 56, 47, 52, 4, 6, 60, 19, 55, 120, 12, 11, 110, 119, 122, 125, 63, 116, 115, 57, 53, 42, 126, 73, 92, 49, 83, 76, 86, 9, 70, 34, 100, 64, 102, 107, 27, 59, 74, 108, 101, 98, 43, 127, 51, 111, 99, 106, 35, 85, 44, 28, 50, 8, 45, 113, 94, 24, 117, 18, 109, 58, 26, 40, 37, 95, 96, 54, 38, 41, 62, 77, 46, 36, 123, 104, 105, 32, 31, 15, 97, 103], [112, 39, 114, 93, 33, 121, 23, 20, 90, 84, 17, 110, 56, 10, 53, 61, 115, 49, 22, 91, 119, 101, 116, 108, 117, 113, 43, 50, 47, 29, 24, 55, 125, 51, 118, 16, 99, 40, 54, 82, 63, 81, 57, 59, 60, 100, 46, 42, 38, 52, 58, 124, 111, 79, 126, 45, 123, 120, 109, 6, 87, 48, 107, 13, 122, 105, 106, 62, 102, 88, 44, 3, 104, 41, 127, 95, 94, 34, 37, 98, 85, 36, 72, 83, 92, 97, 30, 27, 86, 80, 35, 32, 25, 26, 96, 11, 74, 28, 19, 103, 31, 89, 76, 21, 5, 70, 18, 15, 77, 12, 64, 71, 14, 73, 67, 78, 68, 9, 8, 7, 75, 0, 2, 4, 1, 65, 69, 66], [112, 39, 121, 114, 93, 33, 23, 61, 110, 48, 17, 120, 20, 84, 90, 47, 22, 56, 115, 91, 124, 117, 53, 45, 82, 60, 116, 49, 51, 118, 57, 119, 55, 10, 50, 29, 94, 38, 43, 24, 125, 111, 52, 30, 108, 63, 79, 126, 62, 99, 88, 59, 113, 54, 122, 109, 107, 102, 123, 44, 58, 46, 40, 104, 100, 85, 41, 6, 101, 42, 127, 106, 81, 103, 105, 13, 86, 98, 87, 25, 92, 97, 21, 74, 31, 35, 95, 76, 36, 37, 96, 26, 34, 32, 27, 15, 77, 28, 11, 8, 16, 18, 89, 3, 80, 67, 72, 83, 70, 19, 5, 12, 71, 75, 68, 78, 7, 64, 2, 73, 14, 0, 9, 1, 4, 69, 65, 66], [39, 112, 33, 93, 121, 114, 23, 120, 20, 10, 22, 17, 79, 84, 110, 96, 13, 61, 29, 72, 49, 87, 90, 21, 56, 82, 109, 6, 117, 60, 70, 47, 125, 16, 52, 26, 81, 50, 118, 92, 77, 119, 51, 57, 115, 42, 25, 53, 11, 30, 124, 34, 104, 107, 91, 116, 122, 3, 95, 98, 38, 99, 83, 100, 86, 43, 111, 41, 101, 126, 48, 74, 5, 37, 55, 88, 62, 94, 44, 27, 24, 63, 108, 113, 67, 40, 35, 59, 45, 54, 78, 123, 32, 89, 127, 14, 28, 58, 105, 46, 18, 15, 85, 106, 9, 36, 31, 12, 19, 8, 64, 80, 75, 73, 102, 4, 76, 7, 68, 97, 71, 2, 0, 1, 103, 69, 65, 66], [53, 60, 127, 124, 122, 119, 123, 120, 51, 116, 56, 55, 125, 57, 38, 126, 54, 115, 61, 63, 50, 52, 118, 49, 58, 36, 59, 114, 62, 48, 45, 121, 112, 111, 101, 39, 113, 105, 110, 47, 46, 117, 106, 42, 108, 109, 43, 104, 44, 34, 107, 40, 99, 103, 41, 91, 24, 102, 94, 95, 19, 27, 21, 97, 93, 35, 37, 31, 89, 98, 100, 29, 16, 96, 14, 23, 92, 33, 25, 90, 83, 13, 32, 85, 30, 87, 11, 88, 20, 82, 4, 80, 8, 78, 68, 72, 28, 26, 75, 77, 70, 86, 6, 17, 81, 67, 3, 79, 74, 84, 18, 15, 10, 73, 76, 9, 22, 65, 71, 1, 12, 5, 7, 64, 66, 2, 0, 69], [53, 60, 120, 124, 39, 62, 119, 36, 127, 123, 125, 56, 100, 122, 46, 104, 109, 31, 51, 37, 101, 108, 61, 110, 126, 121, 118, 117, 63, 33, 42, 115, 113, 57, 52, 54, 44, 116, 107, 106, 41, 58, 23, 43, 59, 45, 24, 50, 48, 112, 55, 47, 97, 114, 40, 95, 111, 49, 102, 105, 90, 38, 103, 92, 94, 21, 82, 32, 98, 29, 34, 86, 25, 35, 99, 85, 91, 30, 93, 14, 13, 96, 16, 20, 19, 89, 87, 88, 27, 26, 28, 22, 18, 11, 83, 78, 80, 72, 15, 84, 68, 17, 77, 81, 79, 4, 75, 8, 6, 70, 73, 10, 12, 74, 67, 9, 3, 76, 71, 7, 5, 1, 69, 2, 66, 65, 0, 64], [60, 53, 127, 56, 124, 119, 122, 51, 123, 125, 55, 52, 116, 61, 57, 114, 126, 58, 63, 54, 48, 115, 121, 59, 50, 113, 49, 118, 112, 106, 111, 62, 120, 47, 117, 38, 46, 110, 45, 109, 108, 44, 107, 43, 42, 41, 105, 40, 39, 36, 103, 100, 104, 102, 37, 97, 29, 22, 98, 101, 33, 34, 35, 99, 86, 95, 32, 90, 23, 31, 85, 96, 91, 83, 25, 87, 18, 80, 30, 94, 92, 84, 24, 75, 27, 93, 26, 89, 17, 82, 16, 21, 19, 13, 78, 81, 79, 14, 20, 68, 8, 28, 88, 76, 15, 72, 4, 11, 74, 9, 77, 6, 3, 7, 2, 70, 1, 10, 67, 12, 64, 69, 65, 0, 73, 5, 66, 71], [60, 53, 120, 20, 79, 76, 92, 36, 9, 124, 7, 82, 17, 24, 97, 2, 62, 86, 69, 71, 0, 66, 32, 125, 56, 5, 122, 100, 74, 14, 73, 12, 3, 52, 119, 34, 25, 33, 18, 11, 80, 38, 65, 94, 21, 89, 48, 28, 13, 67, 78, 15, 81, 123, 8, 84, 127, 88, 96, 19, 93, 10, 75, 99, 83, 31, 101, 16, 26, 95, 108, 23, 6, 90, 72, 77, 70, 1, 85, 87, 64, 68, 27, 43, 4, 41, 30, 110, 91, 55, 29, 44, 35, 22, 109, 59, 102, 117, 58, 112, 39, 126, 98, 104, 37, 107, 61, 103, 51, 49, 113, 47, 118, 42, 115, 105, 63, 106, 116, 45, 54, 114, 40, 121, 46, 57, 111, 50]], "model.layers.15.self_attn.k_proj": [[113, 39, 62, 22, 18, 11, 60, 14, 89, 16, 97, 8, 21, 19, 57, 59, 123, 56, 110, 92, 67, 58, 120, 70, 116, 63, 53, 127, 119, 108, 79, 10, 122, 126, 117, 61, 54, 55, 48, 109, 114, 121, 87, 51, 124, 91, 112, 107, 52, 111, 125, 101, 42, 50, 118, 30, 44, 106, 47, 36, 115, 13, 46, 76, 0, 45, 43, 41, 104, 105, 75, 37, 35, 49, 69, 38, 29, 26, 7, 9, 27, 102, 40, 71, 85, 83, 31, 100, 17, 68, 95, 96, 99, 72, 33, 2, 34, 88, 15, 73, 93, 32, 78, 80, 82, 20, 5, 90, 25, 98, 77, 23, 28, 84, 12, 65, 24, 81, 94, 66, 1, 74, 6, 86, 3, 64, 103, 4], [103, 109, 112, 33, 45, 89, 21, 118, 12, 19, 79, 17, 9, 122, 119, 14, 31, 51, 7, 125, 5, 0, 117, 3, 120, 116, 124, 55, 115, 53, 93, 27, 57, 127, 50, 123, 47, 60, 20, 114, 56, 1, 49, 40, 59, 94, 46, 61, 54, 43, 8, 58, 66, 62, 23, 63, 111, 18, 108, 25, 28, 24, 52, 35, 121, 110, 42, 48, 10, 32, 86, 30, 126, 68, 11, 70, 22, 4, 44, 87, 106, 91, 113, 105, 74, 96, 107, 104, 82, 41, 98, 100, 83, 38, 34, 99, 77, 92, 37, 102, 88, 6, 26, 81, 101, 95, 13, 36, 84, 29, 73, 65, 2, 16, 76, 78, 90, 69, 71, 39, 15, 80, 85, 75, 64, 72, 97, 67], [115, 51, 117, 61, 100, 22, 30, 33, 53, 27, 108, 19, 88, 78, 124, 81, 62, 10, 120, 54, 80, 59, 32, 114, 20, 119, 127, 118, 63, 111, 64, 44, 71, 106, 110, 4, 123, 116, 48, 112, 58, 66, 56, 125, 122, 52, 126, 55, 76, 60, 50, 109, 1, 25, 43, 39, 121, 107, 46, 17, 102, 113, 11, 38, 13, 104, 57, 47, 69, 85, 49, 24, 40, 45, 99, 15, 41, 84, 82, 36, 29, 77, 35, 92, 72, 90, 42, 105, 37, 73, 93, 28, 98, 67, 8, 86, 101, 103, 87, 89, 95, 96, 6, 26, 65, 34, 31, 94, 97, 75, 16, 18, 21, 3, 23, 70, 9, 79, 74, 91, 12, 83, 7, 0, 5, 68, 2, 14], [103, 115, 47, 111, 86, 98, 51, 25, 17, 12, 117, 83, 14, 27, 10, 30, 110, 71, 116, 56, 1, 79, 112, 126, 123, 4, 55, 63, 62, 6, 64, 9, 58, 114, 66, 127, 121, 52, 61, 113, 122, 69, 59, 43, 124, 48, 109, 54, 106, 101, 32, 125, 21, 45, 60, 41, 53, 75, 118, 34, 108, 5, 120, 2, 49, 50, 46, 13, 57, 88, 87, 105, 67, 119, 102, 29, 92, 85, 72, 40, 96, 36, 107, 44, 104, 31, 28, 37, 24, 8, 20, 82, 81, 76, 26, 100, 42, 95, 38, 99, 91, 70, 90, 18, 97, 23, 35, 15, 0, 33, 93, 78, 94, 16, 19, 84, 77, 73, 80, 3, 7, 68, 89, 11, 22, 65, 74, 39], [127, 37, 120, 53, 104, 22, 97, 102, 32, 56, 46, 110, 94, 61, 121, 125, 25, 51, 60, 28, 93, 35, 80, 89, 113, 92, 55, 116, 118, 54, 122, 119, 124, 63, 108, 117, 52, 20, 96, 57, 99, 126, 59, 112, 47, 45, 48, 115, 91, 58, 114, 41, 30, 111, 106, 50, 98, 77, 123, 33, 44, 9, 109, 49, 88, 87, 62, 107, 101, 42, 19, 39, 43, 18, 95, 4, 36, 103, 66, 86, 105, 84, 31, 65, 40, 71, 75, 17, 64, 34, 78, 12, 90, 8, 79, 10, 29, 15, 0, 85, 82, 21, 6, 16, 24, 27, 26, 13, 67, 3, 72, 69, 5, 11, 100, 23, 14, 83, 38, 81, 76, 7, 68, 73, 74, 1, 70, 2], [117, 40, 31, 98, 86, 28, 52, 61, 25, 62, 120, 60, 114, 48, 56, 122, 113, 116, 51, 124, 59, 49, 127, 115, 109, 57, 46, 125, 55, 47, 54, 58, 33, 19, 123, 121, 85, 110, 111, 12, 126, 112, 18, 119, 80, 45, 63, 118, 43, 106, 53, 24, 8, 79, 50, 108, 42, 35, 44, 39, 105, 15, 41, 36, 107, 2, 64, 87, 102, 38, 34, 101, 29, 81, 68, 100, 16, 92, 37, 77, 17, 23, 99, 74, 27, 5, 103, 104, 20, 32, 88, 96, 94, 75, 21, 91, 89, 65, 1, 30, 97, 13, 9, 67, 93, 10, 14, 11, 71, 90, 6, 84, 82, 26, 83, 78, 95, 70, 76, 73, 7, 22, 66, 72, 69, 0, 3, 4], [103, 121, 112, 97, 48, 29, 114, 22, 57, 120, 23, 84, 118, 50, 119, 61, 116, 17, 52, 56, 111, 115, 90, 49, 53, 79, 60, 10, 43, 42, 82, 64, 3, 59, 122, 47, 46, 55, 58, 72, 124, 125, 36, 123, 102, 62, 104, 117, 63, 126, 45, 13, 110, 51, 37, 106, 91, 113, 28, 127, 41, 44, 6, 38, 54, 109, 21, 68, 107, 24, 2, 25, 76, 93, 73, 98, 105, 32, 108, 89, 78, 86, 71, 101, 40, 99, 83, 35, 80, 33, 16, 26, 34, 15, 31, 11, 27, 85, 94, 77, 95, 19, 92, 96, 18, 100, 75, 14, 12, 30, 69, 39, 7, 65, 5, 88, 9, 1, 8, 20, 87, 66, 4, 0, 70, 81, 67, 74], [60, 53, 86, 100, 120, 94, 76, 125, 79, 7, 124, 20, 101, 9, 82, 92, 74, 69, 96, 17, 25, 122, 44, 62, 24, 61, 119, 52, 118, 2, 106, 56, 127, 123, 117, 57, 51, 63, 121, 19, 58, 112, 126, 48, 0, 109, 107, 54, 114, 55, 59, 16, 113, 49, 47, 46, 50, 14, 90, 1, 115, 116, 111, 104, 72, 110, 41, 34, 13, 45, 5, 6, 43, 108, 81, 97, 75, 39, 42, 3, 10, 27, 37, 40, 11, 105, 102, 103, 35, 98, 12, 15, 23, 99, 33, 21, 22, 31, 30, 71, 78, 38, 73, 26, 93, 29, 85, 28, 95, 32, 8, 84, 67, 91, 70, 77, 18, 87, 83, 65, 89, 4, 68, 66, 64, 88, 36, 80]], "model.layers.15.self_attn.qk_proj": [[117, 112, 53, 115, 60, 113, 62, 51, 120, 61, 127, 47, 121, 45, 114, 124, 48, 86, 39, 100, 57, 56, 125, 118, 25, 116, 103, 89, 122, 22, 110, 111, 91, 109, 46, 97, 40, 63, 59, 50, 81, 30, 54, 119, 33, 123, 104, 93, 19, 49, 108, 55, 52, 83, 17, 58, 21, 79, 85, 126, 14, 101, 94, 36, 92, 98, 15, 12, 44, 107, 76, 78, 88, 20, 28, 106, 24, 37, 43, 84, 34, 74, 42, 31, 102, 95, 87, 27, 10, 9, 29, 32, 23, 7, 71, 96, 41, 82, 80, 16, 18, 5, 73, 38, 35, 105, 72, 26, 11, 69, 77, 75, 67, 3, 64, 0, 13, 66, 99, 8, 90, 2, 68, 70, 4, 1, 6, 65], [117, 112, 53, 115, 60, 62, 113, 51, 120, 61, 127, 47, 121, 45, 114, 57, 48, 39, 86, 124, 56, 103, 100, 25, 118, 125, 22, 89, 46, 109, 116, 122, 54, 63, 111, 119, 91, 97, 110, 50, 40, 55, 49, 59, 52, 30, 108, 81, 104, 33, 123, 93, 126, 101, 83, 94, 19, 17, 15, 36, 58, 21, 85, 107, 98, 14, 12, 78, 92, 79, 76, 88, 34, 24, 28, 37, 10, 20, 106, 43, 44, 102, 74, 27, 84, 7, 31, 9, 42, 95, 23, 87, 71, 18, 82, 41, 96, 80, 16, 29, 32, 26, 35, 38, 99, 72, 0, 8, 69, 73, 5, 105, 2, 66, 75, 77, 11, 64, 67, 3, 13, 90, 4, 65, 6, 70, 68, 1], [117, 112, 53, 60, 115, 113, 62, 51, 120, 61, 127, 47, 121, 45, 114, 39, 56, 48, 124, 57, 118, 86, 100, 89, 103, 122, 111, 46, 25, 116, 109, 59, 125, 22, 91, 123, 97, 104, 110, 63, 40, 54, 119, 33, 30, 108, 52, 55, 93, 36, 81, 101, 50, 94, 58, 49, 19, 83, 126, 17, 21, 15, 28, 37, 85, 12, 24, 98, 92, 107, 44, 88, 76, 78, 34, 14, 102, 43, 79, 27, 95, 42, 106, 84, 9, 10, 74, 31, 20, 87, 23, 29, 7, 71, 18, 41, 32, 38, 64, 26, 82, 16, 96, 35, 2, 5, 0, 8, 80, 66, 99, 73, 69, 105, 67, 90, 11, 75, 13, 3, 72, 77, 6, 68, 65, 1, 4, 70], [117, 112, 53, 60, 115, 62, 113, 51, 120, 61, 47, 127, 121, 45, 124, 114, 48, 39, 57, 56, 118, 100, 125, 116, 89, 103, 86, 109, 111, 25, 22, 122, 97, 91, 110, 54, 63, 59, 40, 30, 55, 81, 119, 46, 104, 58, 123, 52, 50, 108, 33, 126, 93, 101, 49, 19, 94, 21, 17, 37, 107, 79, 98, 14, 85, 12, 83, 76, 28, 88, 36, 92, 24, 78, 34, 15, 102, 44, 42, 20, 95, 27, 43, 16, 106, 84, 23, 80, 96, 9, 29, 82, 74, 87, 31, 10, 38, 18, 7, 32, 0, 71, 5, 41, 69, 35, 26, 73, 99, 66, 8, 11, 67, 3, 64, 90, 77, 2, 105, 1, 75, 72, 65, 13, 6, 4, 68, 70], [117, 112, 53, 115, 60, 62, 113, 51, 120, 61, 127, 47, 121, 45, 124, 114, 118, 48, 57, 56, 39, 125, 86, 100, 103, 116, 122, 59, 63, 22, 89, 110, 25, 109, 55, 97, 111, 54, 40, 91, 46, 119, 104, 30, 108, 123, 81, 33, 52, 49, 58, 50, 17, 19, 83, 93, 101, 12, 85, 15, 14, 98, 37, 43, 78, 79, 126, 94, 76, 92, 21, 107, 24, 42, 28, 74, 106, 44, 88, 84, 34, 102, 20, 36, 87, 10, 38, 96, 27, 9, 8, 7, 95, 32, 71, 41, 69, 35, 80, 23, 82, 18, 16, 5, 31, 105, 64, 29, 73, 26, 3, 66, 2, 75, 67, 77, 11, 13, 0, 99, 72, 1, 65, 90, 4, 68, 6, 70], [117, 112, 53, 115, 60, 113, 62, 51, 120, 61, 127, 47, 121, 45, 114, 118, 124, 57, 56, 86, 39, 48, 89, 100, 25, 116, 22, 125, 63, 111, 110, 103, 59, 91, 97, 54, 30, 109, 52, 40, 122, 119, 50, 81, 55, 49, 108, 46, 33, 123, 104, 93, 17, 58, 85, 83, 14, 19, 126, 76, 21, 12, 101, 15, 79, 94, 107, 92, 78, 44, 28, 88, 98, 34, 36, 42, 84, 24, 43, 37, 20, 10, 95, 87, 106, 9, 23, 18, 82, 16, 31, 80, 74, 7, 102, 27, 96, 38, 71, 32, 41, 8, 29, 35, 13, 75, 105, 73, 5, 69, 77, 67, 11, 99, 26, 64, 2, 66, 90, 0, 3, 68, 72, 6, 4, 65, 1, 70], [117, 112, 53, 60, 115, 62, 113, 51, 120, 61, 127, 47, 121, 45, 118, 114, 57, 48, 124, 86, 100, 89, 39, 22, 56, 25, 59, 125, 103, 63, 111, 110, 91, 97, 122, 54, 116, 30, 81, 40, 123, 55, 119, 52, 109, 93, 46, 19, 126, 104, 50, 83, 108, 33, 101, 21, 49, 17, 92, 85, 76, 14, 15, 12, 58, 78, 79, 107, 94, 44, 28, 98, 84, 34, 88, 36, 24, 42, 106, 27, 37, 20, 87, 18, 10, 95, 23, 74, 96, 9, 38, 102, 16, 82, 105, 32, 31, 43, 80, 41, 71, 7, 29, 8, 13, 35, 73, 69, 11, 75, 26, 5, 77, 3, 0, 2, 99, 67, 66, 64, 72, 90, 68, 1, 6, 70, 4, 65], [117, 112, 53, 60, 115, 62, 113, 51, 120, 61, 127, 47, 121, 45, 114, 57, 118, 48, 39, 89, 56, 86, 103, 100, 25, 22, 124, 122, 125, 59, 97, 116, 91, 63, 111, 46, 30, 54, 110, 119, 109, 55, 40, 123, 104, 33, 83, 19, 108, 81, 49, 93, 50, 52, 101, 126, 17, 21, 92, 15, 36, 12, 14, 79, 28, 58, 85, 98, 78, 94, 76, 107, 24, 44, 42, 87, 88, 20, 84, 37, 106, 34, 43, 27, 96, 102, 23, 38, 31, 74, 18, 10, 41, 16, 9, 82, 95, 29, 32, 7, 105, 71, 80, 26, 35, 8, 0, 73, 11, 75, 5, 13, 66, 69, 99, 2, 64, 67, 3, 77, 72, 65, 70, 90, 1, 68, 4, 6], [117, 112, 53, 60, 115, 113, 51, 62, 120, 127, 61, 121, 47, 45, 57, 48, 118, 114, 56, 124, 39, 86, 89, 100, 125, 25, 103, 22, 122, 97, 63, 109, 30, 116, 111, 91, 55, 110, 123, 50, 54, 46, 59, 104, 119, 40, 101, 33, 49, 81, 19, 83, 93, 108, 98, 17, 58, 52, 21, 126, 36, 14, 107, 12, 79, 76, 94, 92, 85, 15, 24, 78, 42, 37, 88, 44, 28, 34, 20, 95, 106, 102, 27, 43, 16, 84, 31, 41, 87, 18, 32, 23, 10, 29, 74, 96, 80, 71, 9, 38, 82, 105, 35, 7, 73, 75, 99, 8, 13, 77, 11, 5, 0, 69, 26, 2, 72, 64, 67, 1, 66, 3, 90, 70, 65, 4, 68, 6], [117, 112, 53, 60, 115, 62, 113, 51, 61, 127, 120, 47, 121, 45, 114, 48, 124, 39, 57, 56, 118, 89, 125, 86, 100, 103, 22, 122, 25, 111, 123, 97, 55, 110, 109, 63, 91, 116, 54, 52, 59, 30, 119, 58, 46, 81, 83, 40, 104, 49, 50, 108, 126, 33, 93, 17, 19, 101, 14, 15, 94, 98, 85, 21, 12, 79, 107, 76, 44, 102, 28, 92, 36, 34, 78, 74, 24, 37, 42, 20, 43, 10, 84, 27, 88, 71, 7, 87, 23, 106, 41, 95, 18, 38, 9, 31, 16, 73, 32, 80, 96, 29, 5, 69, 82, 105, 0, 8, 35, 64, 3, 99, 66, 2, 67, 72, 13, 77, 26, 75, 11, 90, 70, 4, 68, 65, 1, 6], [117, 112, 53, 60, 115, 113, 62, 51, 120, 61, 127, 47, 121, 45, 114, 124, 39, 48, 118, 57, 56, 89, 86, 100, 122, 22, 25, 116, 125, 103, 110, 119, 97, 109, 46, 30, 59, 111, 54, 91, 123, 63, 40, 81, 55, 104, 52, 108, 33, 101, 93, 83, 50, 58, 49, 12, 19, 17, 14, 76, 79, 94, 21, 126, 15, 36, 78, 88, 85, 24, 28, 98, 107, 92, 10, 43, 44, 84, 34, 7, 102, 20, 87, 37, 71, 74, 18, 9, 106, 80, 5, 16, 23, 42, 29, 32, 38, 41, 95, 69, 73, 72, 75, 82, 64, 96, 27, 66, 0, 31, 35, 8, 2, 67, 105, 3, 13, 77, 11, 99, 1, 65, 90, 68, 26, 6, 4, 70], [117, 112, 53, 115, 60, 113, 62, 51, 61, 127, 120, 47, 121, 45, 114, 39, 124, 118, 48, 86, 89, 57, 25, 22, 110, 100, 116, 125, 103, 109, 56, 63, 122, 55, 54, 59, 97, 91, 46, 30, 104, 40, 119, 111, 123, 81, 50, 52, 19, 49, 93, 108, 17, 101, 21, 12, 33, 83, 76, 15, 14, 126, 79, 98, 58, 92, 44, 94, 43, 78, 107, 28, 85, 88, 20, 87, 36, 106, 34, 24, 84, 42, 10, 18, 102, 9, 37, 74, 95, 7, 23, 16, 82, 41, 71, 27, 73, 32, 75, 80, 31, 96, 29, 35, 72, 38, 5, 69, 77, 0, 13, 11, 3, 105, 8, 99, 64, 2, 1, 26, 66, 67, 70, 6, 4, 68, 90, 65], [117, 112, 53, 115, 60, 62, 51, 113, 120, 61, 127, 47, 121, 45, 114, 118, 57, 48, 39, 56, 86, 124, 89, 100, 22, 25, 103, 111, 125, 55, 97, 63, 109, 59, 91, 54, 116, 110, 119, 122, 30, 93, 46, 40, 104, 81, 52, 108, 123, 126, 33, 36, 101, 50, 83, 19, 49, 94, 107, 76, 98, 21, 79, 17, 92, 15, 58, 14, 85, 28, 44, 20, 88, 12, 37, 42, 43, 78, 34, 87, 24, 27, 41, 106, 95, 84, 74, 102, 31, 96, 18, 23, 82, 35, 29, 38, 10, 105, 80, 7, 32, 73, 16, 72, 71, 99, 26, 5, 75, 9, 13, 11, 67, 77, 90, 69, 3, 66, 1, 2, 8, 0, 68, 64, 6, 4, 70, 65], [117, 112, 53, 115, 60, 62, 113, 51, 127, 61, 47, 120, 121, 45, 48, 114, 39, 118, 57, 56, 124, 86, 125, 89, 116, 100, 22, 54, 103, 25, 122, 55, 109, 111, 91, 119, 123, 59, 63, 40, 97, 30, 46, 104, 110, 93, 81, 33, 50, 108, 52, 94, 49, 19, 58, 83, 76, 15, 107, 101, 79, 21, 14, 126, 78, 17, 98, 36, 85, 44, 12, 74, 24, 28, 92, 20, 10, 42, 88, 71, 43, 34, 73, 37, 7, 23, 29, 16, 84, 102, 87, 32, 18, 72, 95, 38, 64, 82, 31, 80, 35, 27, 41, 96, 9, 5, 69, 0, 106, 75, 2, 99, 105, 3, 26, 66, 13, 11, 67, 77, 8, 6, 1, 65, 68, 90, 4, 70], [117, 112, 53, 115, 60, 62, 113, 51, 127, 120, 61, 47, 121, 45, 114, 48, 57, 56, 118, 39, 125, 124, 86, 100, 89, 55, 103, 25, 63, 110, 22, 119, 116, 109, 111, 54, 122, 97, 59, 91, 30, 46, 40, 108, 104, 94, 50, 33, 123, 93, 19, 81, 52, 101, 43, 79, 76, 83, 36, 21, 49, 14, 126, 98, 44, 15, 42, 17, 12, 78, 85, 92, 58, 28, 10, 24, 88, 34, 84, 74, 107, 20, 95, 102, 23, 7, 73, 18, 87, 106, 27, 71, 37, 9, 41, 72, 16, 38, 32, 75, 5, 80, 31, 35, 82, 29, 96, 105, 69, 0, 64, 13, 2, 26, 66, 11, 8, 67, 77, 99, 3, 90, 68, 6, 1, 4, 70, 65], [117, 112, 53, 115, 60, 62, 113, 51, 120, 127, 61, 47, 121, 45, 114, 39, 57, 124, 118, 48, 63, 86, 100, 116, 56, 103, 22, 89, 25, 110, 119, 125, 55, 97, 122, 54, 109, 91, 111, 40, 30, 59, 52, 108, 104, 46, 50, 81, 123, 101, 126, 93, 94, 83, 79, 19, 12, 49, 76, 17, 33, 36, 21, 14, 98, 44, 15, 85, 92, 78, 74, 107, 28, 58, 88, 42, 20, 34, 43, 24, 95, 87, 73, 37, 23, 82, 18, 106, 80, 84, 102, 16, 38, 96, 27, 9, 71, 32, 7, 41, 29, 72, 10, 99, 31, 105, 69, 5, 77, 75, 35, 67, 11, 26, 0, 13, 66, 8, 3, 64, 90, 2, 1, 6, 68, 4, 65, 70], [117, 112, 53, 115, 60, 62, 113, 51, 61, 120, 127, 47, 121, 45, 114, 39, 57, 48, 89, 100, 116, 124, 86, 118, 125, 22, 25, 103, 56, 122, 54, 63, 97, 111, 109, 55, 110, 104, 91, 59, 52, 30, 119, 123, 46, 93, 49, 40, 19, 126, 50, 81, 83, 108, 101, 17, 33, 36, 21, 94, 107, 98, 85, 28, 44, 34, 15, 14, 76, 58, 79, 92, 12, 78, 24, 43, 102, 20, 87, 88, 96, 27, 74, 106, 84, 42, 37, 41, 18, 23, 29, 95, 38, 80, 10, 16, 31, 71, 32, 99, 105, 9, 82, 7, 73, 35, 26, 72, 5, 77, 69, 90, 64, 75, 66, 67, 13, 0, 11, 65, 8, 3, 2, 4, 70, 68, 1, 6], [117, 112, 53, 60, 115, 62, 113, 51, 127, 61, 47, 120, 121, 45, 114, 48, 56, 57, 39, 103, 124, 100, 86, 118, 116, 89, 25, 22, 125, 54, 59, 91, 122, 63, 109, 119, 123, 104, 97, 55, 30, 111, 40, 110, 46, 93, 49, 33, 52, 81, 126, 50, 108, 19, 101, 36, 17, 94, 83, 76, 14, 79, 15, 92, 98, 107, 28, 21, 44, 78, 85, 74, 58, 12, 37, 24, 34, 42, 88, 20, 102, 87, 106, 10, 43, 84, 27, 41, 38, 23, 32, 95, 71, 96, 7, 80, 82, 16, 18, 73, 9, 29, 105, 31, 5, 35, 0, 69, 8, 2, 26, 3, 72, 66, 99, 64, 75, 77, 13, 90, 67, 11, 1, 4, 68, 70, 65, 6], [117, 112, 53, 115, 60, 62, 113, 51, 120, 127, 61, 47, 121, 45, 114, 39, 124, 57, 56, 48, 118, 100, 103, 63, 86, 25, 125, 22, 116, 109, 89, 122, 104, 111, 97, 110, 54, 119, 59, 123, 91, 108, 40, 30, 55, 126, 49, 33, 46, 94, 93, 50, 83, 19, 101, 17, 81, 98, 52, 76, 106, 85, 79, 44, 42, 107, 58, 78, 34, 14, 36, 21, 102, 37, 28, 92, 24, 12, 15, 10, 88, 38, 43, 74, 96, 84, 27, 32, 41, 95, 23, 87, 7, 20, 16, 35, 18, 80, 31, 9, 71, 5, 29, 82, 73, 0, 99, 69, 64, 26, 13, 67, 66, 8, 105, 72, 2, 75, 11, 3, 90, 77, 1, 70, 4, 65, 68, 6], [117, 112, 53, 115, 60, 62, 113, 51, 120, 127, 61, 47, 121, 45, 114, 39, 124, 57, 48, 89, 118, 86, 56, 100, 116, 103, 22, 54, 122, 110, 109, 97, 25, 63, 125, 30, 91, 46, 126, 59, 111, 52, 119, 55, 50, 104, 40, 33, 108, 123, 93, 81, 36, 49, 101, 19, 83, 94, 58, 12, 79, 98, 28, 21, 44, 85, 15, 34, 76, 78, 14, 42, 92, 74, 107, 17, 102, 24, 106, 37, 87, 20, 88, 95, 84, 41, 27, 31, 23, 38, 18, 96, 43, 29, 16, 82, 7, 80, 9, 71, 10, 32, 35, 8, 99, 105, 73, 26, 75, 66, 69, 13, 5, 90, 0, 11, 64, 77, 2, 67, 72, 3, 68, 1, 70, 4, 65, 6], [117, 112, 115, 53, 60, 113, 62, 51, 61, 120, 127, 47, 121, 45, 114, 124, 48, 56, 57, 118, 86, 39, 89, 116, 100, 103, 22, 125, 63, 109, 122, 110, 54, 59, 97, 91, 30, 25, 46, 55, 111, 40, 104, 126, 52, 119, 50, 123, 93, 81, 108, 33, 17, 21, 19, 15, 83, 101, 58, 76, 14, 79, 92, 85, 44, 12, 28, 49, 24, 34, 36, 98, 88, 74, 94, 107, 43, 78, 42, 37, 87, 20, 106, 84, 23, 102, 9, 16, 31, 18, 71, 96, 29, 35, 7, 80, 27, 32, 38, 95, 41, 10, 82, 26, 105, 69, 8, 5, 77, 75, 73, 66, 11, 3, 13, 64, 0, 2, 90, 72, 99, 67, 4, 70, 65, 68, 1, 6], [117, 112, 53, 115, 60, 113, 62, 51, 120, 61, 127, 47, 121, 45, 114, 56, 124, 57, 48, 118, 89, 100, 103, 86, 39, 116, 125, 122, 22, 63, 54, 40, 97, 91, 109, 25, 111, 59, 119, 104, 126, 30, 110, 55, 123, 46, 50, 81, 17, 93, 33, 83, 52, 94, 19, 58, 108, 49, 15, 101, 21, 98, 36, 28, 79, 76, 12, 37, 92, 88, 24, 14, 85, 44, 34, 43, 78, 20, 87, 27, 74, 107, 42, 106, 84, 31, 102, 96, 18, 38, 23, 80, 95, 7, 32, 71, 29, 35, 41, 69, 16, 10, 9, 0, 82, 8, 73, 64, 105, 26, 66, 5, 67, 99, 90, 77, 11, 3, 75, 13, 2, 1, 4, 72, 65, 6, 70, 68], [117, 112, 53, 115, 60, 113, 62, 51, 120, 61, 127, 47, 121, 45, 114, 39, 48, 124, 118, 56, 100, 86, 125, 89, 103, 57, 116, 22, 25, 97, 122, 54, 104, 63, 91, 59, 111, 109, 55, 119, 46, 110, 30, 40, 123, 108, 52, 126, 33, 81, 19, 93, 58, 50, 83, 101, 36, 15, 94, 85, 92, 17, 79, 76, 98, 28, 21, 49, 12, 14, 78, 34, 37, 43, 24, 44, 88, 87, 42, 38, 84, 102, 107, 96, 27, 20, 106, 10, 23, 32, 95, 18, 74, 31, 7, 29, 82, 71, 41, 80, 8, 35, 9, 105, 16, 99, 11, 5, 75, 69, 26, 64, 73, 66, 77, 2, 0, 67, 90, 13, 3, 72, 4, 65, 1, 6, 68, 70], [117, 112, 53, 115, 60, 113, 62, 51, 61, 120, 127, 47, 121, 45, 114, 39, 48, 56, 57, 116, 124, 100, 89, 103, 86, 118, 125, 22, 59, 91, 97, 110, 25, 122, 52, 30, 54, 109, 63, 111, 104, 46, 119, 40, 55, 126, 33, 123, 81, 108, 93, 19, 50, 49, 58, 83, 79, 17, 94, 85, 15, 101, 14, 12, 28, 44, 21, 92, 34, 76, 98, 74, 107, 24, 78, 88, 36, 37, 43, 102, 96, 42, 20, 84, 87, 106, 8, 27, 23, 10, 7, 95, 32, 31, 29, 38, 71, 35, 16, 73, 41, 82, 80, 18, 105, 9, 26, 69, 75, 5, 67, 11, 13, 99, 2, 90, 77, 64, 3, 0, 66, 72, 6, 4, 68, 1, 70, 65], [117, 112, 53, 115, 60, 113, 62, 51, 120, 127, 61, 47, 121, 45, 114, 86, 57, 118, 124, 48, 56, 125, 39, 22, 100, 103, 116, 122, 89, 59, 63, 25, 97, 109, 110, 91, 54, 40, 119, 30, 123, 111, 104, 81, 108, 55, 46, 126, 52, 50, 33, 83, 17, 14, 93, 19, 49, 76, 101, 85, 58, 21, 36, 92, 15, 44, 94, 12, 79, 107, 78, 43, 74, 28, 88, 42, 34, 98, 84, 24, 87, 106, 20, 102, 71, 23, 37, 96, 16, 27, 10, 18, 7, 95, 38, 32, 105, 82, 69, 80, 29, 41, 9, 8, 31, 73, 35, 11, 26, 77, 75, 5, 66, 3, 67, 0, 2, 72, 64, 13, 99, 90, 68, 6, 65, 1, 4, 70], [117, 112, 53, 115, 60, 62, 113, 51, 61, 120, 127, 47, 121, 45, 114, 57, 56, 118, 124, 86, 48, 125, 103, 89, 22, 100, 122, 39, 25, 63, 54, 119, 59, 116, 97, 46, 91, 30, 110, 109, 126, 55, 111, 104, 40, 123, 52, 108, 49, 81, 33, 19, 93, 92, 21, 17, 50, 94, 85, 44, 83, 58, 14, 101, 36, 15, 12, 76, 79, 98, 43, 28, 37, 88, 107, 24, 34, 106, 78, 87, 84, 96, 10, 95, 102, 23, 20, 42, 74, 27, 82, 7, 32, 41, 31, 29, 38, 80, 18, 73, 16, 35, 105, 71, 9, 8, 26, 99, 69, 11, 75, 77, 5, 72, 90, 66, 13, 0, 2, 64, 3, 67, 6, 68, 65, 4, 70, 1], [117, 112, 53, 115, 60, 113, 62, 51, 61, 127, 120, 47, 121, 45, 114, 124, 56, 118, 57, 48, 86, 100, 39, 103, 89, 125, 22, 25, 122, 59, 116, 123, 40, 63, 119, 91, 109, 54, 111, 55, 30, 46, 104, 97, 110, 93, 81, 108, 52, 50, 19, 33, 101, 126, 58, 17, 92, 44, 49, 36, 83, 94, 79, 15, 21, 85, 12, 14, 28, 76, 98, 102, 24, 78, 43, 34, 88, 37, 106, 107, 96, 87, 42, 20, 16, 84, 32, 23, 82, 10, 29, 95, 38, 27, 7, 31, 71, 73, 80, 74, 18, 41, 35, 99, 9, 26, 69, 72, 105, 75, 11, 5, 64, 2, 8, 67, 0, 66, 77, 13, 65, 3, 6, 90, 68, 1, 70, 4], [117, 112, 53, 60, 115, 62, 113, 51, 61, 120, 127, 47, 121, 45, 114, 124, 48, 39, 57, 118, 56, 86, 103, 89, 100, 116, 125, 22, 25, 119, 110, 63, 97, 30, 54, 40, 91, 122, 46, 52, 59, 109, 111, 55, 104, 123, 81, 58, 50, 126, 33, 19, 17, 93, 83, 49, 108, 94, 21, 85, 15, 24, 101, 36, 76, 79, 92, 12, 14, 78, 44, 37, 43, 98, 28, 84, 74, 88, 34, 10, 107, 20, 42, 106, 87, 102, 7, 16, 96, 71, 31, 23, 82, 41, 29, 27, 72, 9, 32, 18, 73, 80, 95, 105, 35, 5, 38, 26, 0, 69, 2, 77, 67, 64, 11, 66, 3, 99, 75, 13, 8, 90, 1, 68, 4, 70, 65, 6], [117, 112, 53, 60, 115, 62, 113, 51, 120, 61, 127, 47, 121, 45, 114, 39, 124, 48, 57, 118, 86, 100, 125, 116, 110, 89, 56, 22, 103, 25, 63, 119, 46, 109, 59, 40, 97, 91, 104, 52, 123, 122, 54, 30, 111, 108, 58, 101, 55, 93, 33, 81, 36, 126, 21, 50, 17, 19, 83, 94, 92, 12, 15, 98, 107, 44, 14, 43, 78, 76, 79, 85, 34, 49, 106, 42, 28, 87, 24, 88, 102, 10, 37, 84, 7, 18, 74, 41, 71, 20, 82, 32, 96, 9, 95, 31, 5, 72, 16, 38, 27, 23, 29, 69, 26, 2, 64, 11, 35, 73, 80, 0, 13, 105, 90, 66, 77, 75, 3, 99, 67, 4, 70, 8, 1, 65, 68, 6], [117, 112, 53, 60, 115, 62, 113, 51, 120, 61, 127, 47, 121, 45, 114, 39, 57, 124, 56, 86, 118, 125, 116, 100, 103, 22, 48, 89, 110, 25, 59, 54, 63, 97, 91, 122, 30, 40, 52, 109, 111, 126, 46, 123, 104, 58, 55, 119, 50, 108, 33, 17, 81, 93, 19, 83, 107, 85, 12, 14, 49, 21, 101, 15, 94, 92, 98, 36, 44, 79, 76, 78, 106, 34, 24, 42, 88, 28, 37, 43, 10, 84, 96, 20, 74, 102, 87, 27, 7, 9, 16, 23, 31, 38, 72, 95, 71, 41, 18, 32, 80, 73, 29, 82, 26, 105, 5, 77, 11, 69, 35, 66, 99, 3, 13, 67, 75, 64, 2, 90, 0, 65, 8, 68, 70, 4, 6, 1], [117, 112, 53, 115, 60, 62, 113, 51, 120, 127, 61, 47, 121, 45, 124, 39, 114, 57, 125, 56, 118, 103, 48, 116, 86, 100, 89, 22, 122, 91, 46, 40, 110, 63, 109, 25, 59, 111, 55, 123, 54, 97, 52, 50, 58, 126, 108, 104, 119, 30, 33, 81, 93, 101, 83, 94, 49, 17, 21, 36, 107, 79, 12, 19, 76, 44, 14, 78, 15, 92, 42, 28, 85, 24, 34, 98, 106, 20, 10, 37, 43, 71, 88, 84, 74, 7, 96, 87, 72, 27, 102, 32, 31, 9, 73, 95, 38, 80, 23, 16, 5, 69, 29, 82, 18, 11, 41, 66, 0, 105, 26, 64, 35, 99, 77, 13, 67, 2, 3, 70, 4, 75, 90, 8, 68, 1, 6, 65], [117, 112, 53, 115, 60, 113, 62, 51, 120, 61, 127, 47, 121, 45, 114, 39, 56, 124, 57, 86, 100, 118, 125, 89, 116, 48, 22, 103, 91, 25, 59, 111, 122, 97, 40, 109, 110, 119, 54, 123, 63, 30, 55, 104, 81, 46, 93, 126, 33, 19, 50, 52, 17, 49, 79, 83, 85, 101, 14, 94, 15, 92, 12, 21, 78, 108, 76, 58, 44, 98, 107, 36, 10, 28, 88, 74, 20, 106, 24, 43, 84, 87, 34, 37, 102, 42, 31, 72, 96, 71, 27, 95, 9, 73, 23, 7, 38, 18, 16, 32, 82, 80, 29, 35, 26, 69, 99, 5, 13, 11, 41, 75, 77, 105, 90, 8, 2, 66, 3, 70, 67, 0, 64, 6, 65, 4, 68, 1]], "model.layers.16.self_attn.q_proj": [[61, 125, 115, 121, 36, 101, 43, 62, 127, 63, 55, 92, 100, 97, 57, 40, 124, 60, 119, 54, 109, 49, 123, 126, 53, 59, 58, 113, 47, 117, 114, 118, 112, 50, 103, 51, 116, 52, 122, 45, 120, 102, 56, 41, 48, 89, 46, 110, 111, 37, 42, 107, 44, 108, 104, 32, 33, 39, 38, 105, 106, 99, 84, 35, 34, 98, 76, 10, 13, 24, 29, 72, 70, 95, 5, 16, 88, 21, 96, 2, 25, 71, 9, 4, 94, 86, 65, 0, 26, 80, 31, 23, 11, 93, 15, 30, 82, 77, 20, 3, 28, 78, 22, 67, 73, 18, 27, 90, 91, 87, 85, 6, 14, 75, 83, 79, 69, 8, 74, 19, 12, 81, 17, 68, 66, 1, 7, 64], [125, 61, 121, 115, 62, 55, 124, 101, 57, 126, 49, 52, 119, 127, 60, 112, 63, 50, 123, 120, 59, 53, 47, 117, 54, 113, 114, 118, 43, 56, 116, 122, 51, 107, 58, 48, 109, 45, 46, 111, 110, 40, 108, 44, 106, 92, 42, 41, 35, 105, 104, 39, 36, 38, 103, 37, 100, 102, 98, 99, 22, 25, 34, 97, 33, 96, 19, 32, 93, 89, 95, 18, 31, 94, 86, 15, 30, 29, 76, 78, 4, 26, 24, 23, 11, 72, 28, 16, 70, 9, 13, 77, 27, 21, 87, 83, 81, 90, 85, 17, 91, 88, 10, 71, 20, 2, 73, 84, 0, 82, 65, 74, 3, 79, 5, 80, 14, 67, 6, 12, 8, 1, 75, 64, 68, 66, 69, 7], [61, 125, 27, 121, 36, 94, 87, 100, 25, 62, 115, 35, 21, 63, 101, 20, 43, 127, 122, 60, 55, 32, 57, 49, 102, 40, 96, 74, 84, 95, 54, 126, 123, 52, 79, 53, 37, 113, 124, 118, 47, 119, 93, 114, 50, 117, 44, 120, 10, 112, 56, 51, 109, 59, 16, 48, 116, 83, 58, 45, 99, 46, 42, 19, 41, 104, 30, 31, 90, 82, 110, 29, 111, 92, 6, 107, 86, 12, 89, 105, 23, 13, 80, 15, 97, 98, 39, 91, 18, 28, 34, 108, 38, 77, 78, 33, 106, 17, 4, 24, 26, 103, 88, 73, 85, 5, 66, 68, 1, 9, 2, 76, 0, 67, 14, 81, 8, 22, 71, 7, 69, 64, 70, 72, 3, 75, 11, 65], [61, 125, 100, 96, 26, 30, 88, 122, 27, 121, 87, 77, 92, 98, 115, 32, 81, 75, 20, 43, 17, 93, 8, 94, 36, 62, 33, 29, 90, 44, 86, 99, 23, 41, 63, 34, 127, 40, 85, 49, 68, 101, 83, 57, 73, 102, 69, 19, 55, 110, 53, 56, 118, 60, 113, 48, 116, 2, 103, 16, 84, 13, 24, 12, 11, 50, 52, 47, 54, 31, 112, 51, 91, 79, 14, 109, 18, 6, 117, 126, 95, 78, 123, 119, 25, 97, 89, 21, 120, 124, 35, 111, 1, 72, 46, 42, 58, 76, 15, 114, 104, 45, 64, 59, 80, 38, 7, 28, 105, 67, 39, 107, 108, 106, 74, 82, 70, 71, 37, 66, 10, 4, 22, 9, 5, 3, 0, 65], [42, 98, 30, 124, 24, 85, 116, 106, 15, 19, 12, 17, 8, 88, 102, 78, 95, 52, 27, 10, 68, 66, 70, 94, 81, 38, 7, 119, 60, 90, 125, 22, 47, 83, 64, 120, 63, 55, 11, 75, 21, 59, 37, 92, 41, 53, 1, 26, 86, 76, 9, 111, 109, 114, 28, 44, 49, 103, 126, 23, 80, 117, 93, 121, 57, 36, 91, 108, 34, 25, 62, 123, 29, 6, 14, 48, 87, 35, 58, 3, 105, 79, 32, 45, 20, 33, 99, 118, 67, 113, 100, 18, 51, 69, 43, 115, 110, 31, 61, 46, 127, 5, 13, 122, 40, 89, 39, 72, 104, 56, 73, 54, 65, 2, 97, 82, 101, 50, 77, 107, 74, 0, 112, 16, 84, 96, 71, 4], [42, 124, 98, 63, 116, 30, 106, 38, 27, 19, 85, 88, 24, 62, 15, 35, 90, 17, 28, 10, 87, 57, 102, 99, 53, 122, 114, 47, 59, 61, 78, 109, 48, 108, 56, 60, 94, 123, 120, 125, 117, 113, 40, 12, 119, 26, 110, 43, 8, 45, 50, 126, 44, 118, 115, 31, 58, 52, 95, 46, 111, 80, 121, 93, 127, 112, 51, 54, 105, 36, 107, 86, 70, 81, 20, 49, 100, 68, 55, 37, 103, 39, 22, 96, 104, 33, 11, 41, 32, 18, 83, 21, 97, 23, 5, 84, 29, 101, 91, 34, 0, 92, 3, 6, 66, 74, 25, 89, 1, 82, 77, 71, 9, 16, 4, 13, 79, 7, 14, 75, 64, 69, 67, 76, 73, 65, 2, 72], [42, 98, 30, 124, 106, 85, 19, 63, 24, 27, 17, 59, 116, 15, 88, 12, 10, 78, 68, 8, 53, 70, 60, 94, 81, 102, 95, 28, 66, 122, 117, 52, 58, 45, 93, 125, 96, 76, 90, 64, 72, 89, 119, 38, 55, 120, 41, 103, 62, 100, 49, 47, 80, 82, 126, 114, 108, 21, 127, 83, 123, 44, 87, 79, 111, 113, 32, 37, 36, 35, 40, 14, 26, 20, 50, 43, 110, 46, 91, 121, 105, 48, 75, 25, 109, 115, 56, 92, 104, 1, 4, 71, 51, 61, 65, 101, 16, 6, 57, 11, 74, 31, 54, 29, 67, 22, 7, 3, 84, 23, 99, 33, 118, 2, 97, 39, 107, 13, 112, 86, 18, 73, 9, 77, 34, 5, 0, 69], [42, 98, 124, 52, 106, 30, 85, 15, 24, 27, 19, 88, 63, 60, 17, 45, 10, 116, 53, 95, 12, 108, 117, 59, 100, 121, 8, 113, 102, 94, 114, 119, 122, 48, 93, 47, 44, 58, 40, 62, 36, 37, 123, 78, 105, 35, 111, 49, 28, 125, 81, 90, 38, 57, 43, 104, 33, 56, 109, 46, 70, 103, 50, 41, 118, 68, 115, 23, 61, 51, 32, 54, 55, 127, 20, 126, 87, 6, 112, 39, 110, 80, 101, 120, 67, 91, 99, 25, 97, 96, 83, 21, 86, 82, 66, 92, 26, 107, 89, 31, 1, 3, 13, 71, 84, 74, 29, 22, 75, 79, 18, 11, 77, 34, 16, 64, 14, 72, 76, 69, 73, 2, 5, 9, 7, 65, 4, 0], [120, 39, 51, 48, 119, 123, 25, 112, 98, 89, 82, 53, 13, 31, 54, 44, 52, 113, 127, 111, 122, 121, 59, 58, 124, 60, 126, 117, 57, 50, 118, 116, 62, 61, 63, 125, 91, 107, 47, 114, 46, 115, 41, 49, 109, 56, 55, 20, 108, 28, 106, 45, 110, 42, 104, 43, 92, 103, 102, 1, 4, 11, 87, 105, 96, 65, 35, 36, 86, 101, 37, 72, 38, 100, 40, 33, 18, 99, 32, 66, 93, 90, 29, 95, 23, 7, 79, 97, 30, 69, 94, 8, 34, 80, 21, 77, 16, 27, 84, 15, 26, 85, 75, 14, 0, 9, 68, 19, 76, 88, 64, 22, 3, 24, 83, 78, 17, 5, 81, 10, 71, 67, 12, 2, 73, 6, 74, 70], [51, 48, 39, 120, 25, 98, 82, 123, 119, 87, 127, 91, 50, 121, 124, 54, 45, 122, 89, 106, 13, 31, 118, 53, 111, 86, 28, 52, 58, 112, 61, 113, 49, 46, 92, 126, 47, 110, 60, 57, 115, 62, 40, 117, 63, 44, 56, 108, 43, 55, 15, 41, 79, 116, 104, 59, 125, 77, 73, 36, 84, 109, 4, 38, 114, 65, 20, 23, 42, 103, 96, 7, 107, 99, 105, 27, 18, 100, 11, 102, 3, 37, 35, 68, 95, 8, 93, 101, 72, 88, 76, 34, 66, 94, 64, 22, 24, 30, 69, 85, 83, 90, 32, 33, 80, 78, 26, 75, 97, 10, 81, 17, 16, 29, 19, 74, 1, 2, 12, 70, 21, 14, 67, 9, 5, 71, 6, 0], [120, 39, 51, 25, 48, 98, 31, 89, 54, 127, 53, 123, 82, 20, 91, 11, 13, 56, 58, 118, 86, 49, 41, 57, 113, 116, 119, 112, 124, 15, 43, 4, 111, 47, 121, 60, 122, 62, 115, 28, 108, 63, 52, 107, 23, 114, 92, 50, 117, 126, 61, 59, 42, 45, 125, 7, 72, 32, 46, 55, 36, 102, 79, 88, 1, 104, 8, 109, 44, 19, 101, 93, 96, 27, 103, 87, 37, 77, 40, 110, 76, 99, 9, 33, 94, 106, 95, 17, 26, 66, 105, 69, 35, 100, 81, 30, 29, 97, 90, 85, 38, 75, 0, 84, 22, 21, 16, 80, 83, 24, 65, 10, 18, 68, 14, 34, 78, 6, 64, 12, 73, 2, 74, 70, 71, 5, 67, 3], [39, 51, 56, 120, 48, 123, 98, 119, 25, 20, 31, 89, 82, 28, 13, 50, 52, 54, 122, 43, 60, 87, 86, 41, 59, 79, 53, 91, 116, 88, 115, 4, 113, 124, 9, 92, 118, 7, 42, 58, 127, 126, 73, 112, 11, 57, 77, 114, 76, 125, 101, 109, 121, 61, 63, 1, 117, 81, 17, 111, 62, 102, 49, 10, 55, 19, 90, 108, 46, 8, 104, 45, 21, 85, 47, 93, 44, 22, 27, 74, 6, 38, 97, 23, 70, 75, 107, 14, 95, 32, 69, 15, 78, 40, 105, 103, 106, 84, 99, 33, 18, 96, 26, 16, 94, 0, 66, 36, 35, 110, 72, 67, 30, 80, 29, 12, 37, 24, 3, 5, 65, 100, 64, 71, 83, 34, 68, 2], [42, 100, 106, 90, 79, 78, 18, 20, 31, 85, 94, 75, 77, 111, 16, 86, 70, 73, 26, 4, 76, 46, 72, 126, 125, 124, 58, 3, 64, 67, 66, 56, 103, 23, 55, 117, 40, 28, 82, 21, 52, 2, 45, 65, 114, 80, 0, 11, 33, 127, 24, 10, 12, 119, 44, 71, 83, 81, 95, 88, 62, 15, 7, 97, 84, 8, 74, 5, 14, 53, 25, 93, 39, 122, 22, 109, 30, 59, 50, 61, 121, 69, 29, 6, 57, 89, 123, 9, 115, 13, 32, 113, 104, 47, 17, 91, 36, 54, 27, 41, 120, 99, 37, 51, 110, 105, 112, 34, 92, 116, 19, 108, 87, 48, 107, 43, 98, 60, 96, 35, 102, 49, 38, 68, 101, 118, 63, 1], [42, 100, 20, 106, 18, 77, 85, 90, 88, 111, 79, 75, 95, 73, 31, 70, 58, 40, 62, 23, 94, 26, 125, 52, 46, 16, 127, 68, 82, 78, 2, 89, 124, 104, 126, 45, 11, 109, 117, 72, 80, 61, 84, 56, 57, 67, 64, 1, 9, 99, 47, 114, 60, 33, 48, 96, 122, 21, 86, 103, 55, 12, 36, 14, 13, 97, 119, 3, 54, 25, 91, 15, 50, 115, 112, 101, 76, 17, 81, 4, 29, 65, 69, 39, 44, 102, 51, 32, 27, 92, 38, 8, 93, 53, 116, 105, 37, 24, 107, 22, 0, 123, 30, 49, 63, 41, 87, 108, 7, 110, 28, 5, 71, 83, 113, 74, 43, 59, 10, 118, 6, 34, 121, 120, 35, 19, 98, 66], [42, 100, 106, 20, 78, 90, 18, 85, 77, 73, 88, 111, 75, 94, 16, 70, 46, 124, 58, 26, 79, 72, 31, 125, 4, 56, 2, 23, 40, 67, 82, 52, 64, 127, 62, 33, 126, 104, 9, 117, 119, 95, 3, 103, 24, 11, 13, 45, 68, 57, 65, 55, 0, 99, 30, 86, 80, 29, 114, 44, 21, 8, 47, 48, 50, 66, 112, 22, 25, 60, 12, 91, 84, 69, 115, 7, 89, 109, 1, 81, 39, 96, 83, 116, 105, 97, 51, 36, 122, 93, 15, 123, 113, 92, 54, 32, 59, 87, 14, 28, 38, 63, 120, 110, 17, 74, 49, 53, 107, 118, 102, 61, 27, 108, 71, 10, 35, 5, 37, 34, 43, 41, 19, 76, 101, 6, 121, 98], [42, 100, 94, 88, 106, 85, 31, 78, 18, 90, 16, 111, 20, 79, 95, 70, 75, 77, 46, 86, 11, 73, 56, 124, 127, 40, 80, 72, 10, 55, 58, 45, 67, 26, 71, 21, 39, 65, 125, 104, 13, 83, 23, 119, 62, 117, 33, 82, 47, 103, 126, 5, 114, 12, 57, 91, 109, 2, 69, 0, 19, 97, 6, 74, 24, 107, 60, 1, 52, 84, 17, 61, 76, 53, 30, 81, 15, 29, 7, 102, 44, 113, 28, 115, 98, 105, 110, 4, 59, 38, 89, 120, 122, 101, 116, 99, 3, 25, 50, 32, 64, 22, 49, 93, 36, 8, 54, 123, 118, 34, 37, 112, 63, 92, 27, 108, 51, 35, 96, 87, 121, 43, 48, 41, 68, 66, 14, 9], [40, 54, 63, 36, 98, 122, 90, 20, 88, 81, 22, 123, 29, 78, 15, 93, 24, 13, 82, 33, 57, 62, 115, 126, 26, 38, 91, 68, 83, 51, 27, 76, 8, 61, 125, 120, 92, 114, 17, 127, 113, 34, 45, 124, 121, 21, 74, 19, 10, 11, 59, 100, 116, 50, 117, 119, 56, 46, 55, 97, 118, 73, 70, 16, 94, 31, 84, 53, 44, 107, 52, 28, 23, 48, 111, 106, 75, 109, 87, 18, 49, 77, 72, 80, 96, 69, 30, 35, 43, 25, 79, 89, 108, 112, 85, 0, 71, 32, 95, 101, 99, 1, 47, 39, 9, 2, 65, 58, 6, 37, 42, 4, 110, 105, 41, 103, 102, 60, 86, 14, 67, 3, 5, 104, 66, 64, 12, 7], [40, 54, 98, 63, 29, 82, 20, 76, 16, 122, 22, 71, 88, 50, 36, 3, 78, 90, 26, 93, 8, 73, 7, 83, 13, 61, 123, 15, 81, 84, 87, 5, 80, 66, 17, 64, 127, 115, 21, 77, 89, 74, 113, 65, 62, 12, 0, 125, 120, 100, 119, 86, 27, 23, 104, 97, 52, 91, 116, 121, 24, 2, 59, 19, 55, 112, 10, 57, 18, 48, 14, 51, 79, 118, 126, 9, 11, 43, 69, 53, 49, 92, 124, 45, 114, 56, 106, 6, 4, 111, 25, 33, 31, 117, 95, 70, 34, 30, 94, 67, 32, 85, 72, 46, 110, 28, 75, 37, 58, 38, 35, 103, 96, 99, 108, 42, 102, 109, 105, 101, 47, 1, 68, 60, 39, 41, 107, 44], [40, 61, 63, 122, 98, 54, 26, 123, 29, 88, 20, 24, 113, 93, 57, 22, 81, 90, 127, 125, 82, 78, 45, 76, 16, 73, 91, 92, 44, 121, 99, 32, 50, 62, 96, 19, 85, 38, 107, 15, 106, 28, 126, 47, 118, 112, 41, 120, 31, 109, 105, 119, 46, 116, 102, 21, 51, 39, 23, 74, 111, 117, 14, 55, 37, 43, 33, 89, 86, 35, 94, 53, 42, 103, 52, 5, 69, 60, 114, 110, 97, 115, 49, 80, 58, 12, 27, 17, 95, 13, 124, 108, 56, 48, 71, 59, 100, 83, 36, 104, 25, 101, 84, 87, 11, 30, 3, 10, 79, 66, 0, 64, 75, 68, 2, 34, 9, 18, 65, 8, 70, 1, 77, 6, 67, 4, 72, 7], [63, 40, 54, 122, 100, 123, 125, 127, 62, 57, 98, 113, 121, 119, 118, 59, 116, 115, 61, 51, 55, 114, 120, 53, 29, 124, 50, 52, 117, 91, 48, 56, 126, 20, 111, 26, 45, 36, 93, 88, 49, 112, 46, 87, 47, 25, 110, 27, 43, 106, 22, 81, 103, 109, 23, 108, 58, 60, 107, 99, 39, 38, 42, 44, 105, 41, 102, 96, 35, 101, 34, 33, 97, 37, 89, 104, 94, 92, 83, 21, 82, 28, 32, 79, 86, 24, 18, 90, 95, 15, 31, 16, 30, 75, 84, 13, 17, 85, 11, 80, 0, 73, 19, 74, 10, 2, 1, 72, 76, 64, 3, 69, 66, 68, 5, 8, 6, 78, 14, 71, 9, 70, 67, 77, 4, 12, 65, 7], [48, 52, 63, 116, 127, 55, 123, 61, 53, 54, 124, 117, 51, 56, 119, 62, 60, 122, 115, 126, 120, 112, 118, 59, 47, 111, 121, 49, 114, 57, 125, 58, 113, 50, 45, 46, 109, 36, 110, 101, 107, 108, 42, 44, 106, 43, 105, 98, 32, 96, 41, 38, 40, 39, 103, 100, 104, 37, 92, 102, 34, 90, 26, 99, 33, 35, 89, 93, 95, 97, 91, 31, 15, 94, 28, 20, 30, 23, 27, 29, 86, 22, 84, 17, 85, 81, 79, 87, 21, 25, 82, 76, 88, 14, 18, 74, 12, 72, 8, 10, 16, 83, 78, 24, 66, 69, 67, 5, 71, 77, 7, 3, 19, 11, 64, 2, 68, 80, 6, 4, 70, 73, 13, 75, 0, 1, 65, 9], [116, 52, 48, 101, 57, 37, 100, 97, 120, 59, 123, 94, 95, 63, 112, 114, 58, 54, 126, 38, 34, 60, 92, 50, 110, 127, 15, 125, 46, 39, 47, 113, 43, 55, 106, 61, 118, 56, 124, 111, 121, 44, 45, 99, 32, 22, 49, 105, 107, 103, 108, 119, 115, 40, 62, 51, 53, 42, 36, 109, 88, 85, 117, 98, 41, 122, 102, 35, 76, 91, 31, 23, 104, 89, 27, 28, 96, 90, 86, 30, 12, 17, 18, 20, 87, 26, 33, 29, 82, 84, 93, 25, 79, 14, 72, 21, 81, 24, 74, 83, 19, 10, 8, 78, 16, 80, 71, 5, 67, 69, 7, 3, 77, 11, 13, 73, 75, 2, 70, 9, 4, 66, 68, 6, 1, 65, 0, 64], [116, 48, 97, 101, 52, 88, 83, 77, 11, 16, 73, 4, 71, 22, 14, 70, 126, 85, 18, 61, 0, 2, 68, 42, 30, 64, 26, 27, 105, 55, 75, 9, 28, 80, 13, 93, 17, 47, 120, 65, 82, 121, 6, 24, 7, 67, 19, 41, 91, 5, 78, 118, 66, 37, 74, 127, 20, 69, 34, 21, 86, 10, 89, 23, 108, 3, 29, 31, 54, 57, 81, 53, 84, 87, 79, 114, 15, 36, 103, 72, 123, 50, 12, 8, 99, 1, 76, 45, 35, 43, 51, 33, 25, 92, 96, 98, 115, 46, 110, 94, 95, 90, 49, 112, 32, 113, 109, 102, 38, 44, 63, 39, 100, 106, 104, 40, 58, 59, 125, 60, 56, 124, 119, 62, 111, 107, 117, 122], [52, 63, 116, 48, 60, 114, 55, 51, 54, 122, 124, 53, 117, 125, 59, 127, 126, 120, 119, 47, 62, 58, 115, 123, 118, 61, 121, 111, 113, 46, 49, 101, 57, 56, 50, 109, 110, 45, 36, 112, 42, 98, 107, 32, 105, 43, 44, 97, 92, 106, 15, 108, 41, 37, 94, 90, 96, 104, 40, 103, 38, 39, 99, 100, 102, 35, 85, 34, 23, 26, 95, 33, 84, 29, 82, 87, 28, 30, 20, 27, 76, 93, 31, 91, 79, 89, 25, 17, 22, 86, 18, 81, 12, 21, 74, 8, 10, 72, 88, 5, 69, 24, 83, 14, 16, 71, 67, 78, 77, 7, 2, 3, 66, 4, 73, 9, 19, 13, 68, 11, 80, 0, 65, 75, 70, 6, 64, 1], [124, 49, 37, 55, 126, 61, 87, 118, 26, 93, 96, 121, 80, 12, 86, 119, 57, 84, 101, 50, 54, 60, 78, 123, 25, 90, 32, 67, 122, 18, 58, 85, 115, 112, 72, 29, 51, 16, 102, 120, 56, 6, 127, 59, 53, 15, 83, 52, 116, 34, 63, 92, 38, 89, 114, 62, 47, 46, 76, 79, 111, 103, 110, 104, 117, 125, 48, 98, 9, 44, 109, 36, 17, 43, 39, 45, 97, 81, 33, 21, 27, 107, 30, 10, 20, 108, 23, 106, 113, 35, 28, 31, 74, 73, 105, 42, 70, 88, 41, 22, 95, 91, 94, 11, 99, 13, 3, 19, 40, 77, 100, 24, 64, 14, 82, 75, 66, 4, 1, 69, 7, 0, 8, 65, 71, 68, 2, 5], [49, 124, 37, 55, 93, 61, 96, 126, 50, 118, 54, 57, 119, 123, 102, 121, 87, 79, 127, 112, 115, 53, 60, 122, 58, 125, 51, 120, 110, 114, 59, 85, 56, 90, 101, 26, 63, 62, 43, 116, 38, 36, 5, 46, 108, 113, 42, 52, 48, 117, 111, 44, 86, 47, 107, 41, 32, 30, 45, 109, 12, 39, 40, 104, 18, 35, 80, 105, 21, 103, 106, 24, 99, 20, 69, 100, 29, 25, 84, 33, 34, 73, 8, 92, 31, 98, 71, 89, 95, 77, 2, 94, 6, 23, 97, 91, 27, 17, 67, 28, 0, 88, 19, 11, 22, 64, 72, 15, 13, 7, 83, 16, 78, 3, 65, 81, 76, 9, 4, 70, 75, 82, 10, 74, 66, 14, 1, 68], [49, 124, 37, 55, 61, 126, 101, 118, 93, 119, 26, 87, 96, 50, 57, 123, 60, 121, 54, 21, 44, 115, 122, 58, 29, 12, 84, 85, 51, 56, 63, 90, 53, 120, 125, 47, 112, 102, 25, 59, 110, 62, 48, 116, 46, 52, 113, 114, 105, 100, 104, 45, 108, 111, 127, 43, 39, 86, 117, 103, 19, 109, 99, 27, 107, 38, 42, 28, 34, 88, 81, 80, 106, 41, 36, 72, 33, 31, 6, 40, 97, 79, 32, 35, 24, 78, 95, 98, 75, 30, 92, 67, 20, 73, 94, 17, 18, 83, 23, 82, 77, 91, 22, 10, 15, 89, 76, 14, 3, 69, 11, 8, 64, 13, 16, 70, 9, 7, 71, 0, 65, 74, 4, 5, 66, 1, 68, 2], [49, 124, 37, 55, 87, 61, 126, 118, 96, 50, 57, 123, 26, 80, 93, 101, 54, 121, 90, 86, 119, 60, 12, 122, 120, 51, 127, 115, 58, 53, 78, 84, 6, 56, 63, 114, 67, 112, 29, 52, 59, 47, 104, 85, 116, 113, 46, 25, 48, 125, 62, 102, 111, 20, 83, 65, 44, 89, 70, 16, 8, 110, 108, 107, 15, 79, 23, 21, 45, 18, 33, 99, 117, 109, 64, 92, 43, 35, 94, 72, 39, 40, 106, 32, 77, 42, 22, 2, 17, 0, 97, 71, 38, 27, 36, 103, 105, 10, 28, 76, 100, 14, 81, 30, 95, 73, 88, 34, 41, 4, 19, 9, 24, 3, 11, 13, 91, 31, 98, 69, 7, 68, 74, 82, 75, 1, 5, 66], [103, 34, 45, 28, 109, 84, 24, 38, 79, 17, 85, 86, 30, 92, 31, 12, 11, 95, 9, 22, 46, 111, 49, 23, 122, 116, 88, 14, 19, 63, 27, 36, 20, 75, 57, 44, 33, 121, 53, 107, 119, 52, 83, 115, 101, 113, 104, 124, 15, 50, 32, 48, 61, 100, 37, 97, 29, 91, 25, 94, 125, 108, 26, 93, 54, 73, 126, 106, 16, 55, 47, 56, 114, 40, 43, 42, 35, 117, 99, 112, 51, 120, 60, 21, 89, 58, 41, 123, 81, 102, 82, 62, 59, 110, 72, 105, 96, 127, 13, 90, 118, 18, 10, 87, 39, 8, 5, 98, 7, 68, 76, 78, 77, 6, 4, 80, 65, 69, 74, 67, 3, 66, 70, 1, 71, 2, 64, 0], [103, 45, 34, 28, 14, 109, 17, 12, 7, 84, 71, 66, 79, 9, 3, 5, 24, 85, 69, 44, 64, 65, 67, 31, 86, 10, 20, 96, 127, 98, 105, 81, 2, 61, 25, 76, 35, 38, 39, 51, 0, 63, 8, 124, 52, 49, 55, 72, 125, 58, 90, 23, 30, 68, 78, 19, 4, 92, 73, 75, 122, 115, 118, 57, 16, 88, 95, 111, 100, 70, 11, 77, 6, 74, 21, 46, 119, 102, 60, 89, 18, 113, 94, 13, 59, 97, 120, 83, 93, 121, 107, 126, 36, 80, 43, 53, 1, 40, 15, 116, 29, 110, 112, 54, 91, 22, 56, 123, 37, 50, 82, 48, 117, 101, 47, 41, 87, 114, 42, 99, 27, 62, 104, 33, 106, 108, 26, 32], [103, 45, 34, 38, 109, 0, 28, 79, 7, 3, 24, 84, 12, 9, 14, 2, 17, 66, 5, 31, 86, 85, 1, 67, 10, 125, 69, 122, 92, 57, 100, 25, 23, 20, 112, 64, 98, 35, 61, 58, 44, 121, 51, 26, 55, 22, 68, 21, 72, 118, 59, 49, 96, 81, 65, 105, 102, 75, 48, 82, 32, 30, 123, 70, 63, 52, 74, 90, 54, 46, 93, 13, 76, 36, 60, 111, 116, 88, 56, 124, 42, 119, 104, 6, 126, 127, 8, 120, 50, 89, 40, 110, 73, 71, 115, 18, 80, 4, 114, 94, 95, 83, 19, 91, 43, 108, 16, 62, 117, 47, 39, 29, 78, 99, 77, 15, 113, 53, 27, 107, 87, 37, 106, 97, 101, 41, 11, 33], [103, 34, 45, 28, 109, 84, 79, 24, 31, 92, 17, 86, 22, 12, 108, 63, 9, 29, 88, 30, 102, 55, 25, 89, 98, 122, 112, 58, 14, 105, 104, 53, 100, 27, 49, 20, 61, 11, 50, 57, 114, 72, 113, 33, 42, 121, 127, 81, 95, 52, 116, 38, 23, 35, 90, 83, 36, 85, 111, 124, 41, 93, 125, 18, 126, 46, 77, 39, 74, 94, 115, 123, 80, 5, 101, 59, 26, 82, 62, 119, 16, 73, 19, 13, 43, 21, 47, 118, 96, 76, 56, 32, 15, 91, 48, 51, 87, 99, 110, 106, 78, 44, 117, 97, 75, 7, 37, 8, 10, 107, 70, 54, 60, 4, 6, 120, 68, 40, 69, 3, 71, 66, 67, 65, 1, 64, 0, 2]], "model.layers.16.self_attn.k_proj": [[125, 61, 86, 36, 96, 127, 17, 115, 121, 49, 30, 62, 63, 55, 57, 113, 60, 54, 26, 122, 53, 39, 51, 118, 48, 56, 126, 107, 116, 47, 112, 59, 52, 29, 58, 108, 124, 87, 40, 114, 120, 109, 44, 119, 123, 117, 79, 50, 27, 83, 11, 20, 110, 46, 78, 18, 43, 45, 111, 13, 25, 85, 98, 105, 106, 97, 41, 24, 42, 104, 72, 16, 82, 33, 71, 38, 101, 103, 9, 34, 28, 76, 10, 37, 91, 102, 100, 89, 31, 88, 35, 4, 99, 22, 70, 80, 95, 5, 94, 92, 21, 15, 14, 2, 32, 90, 19, 93, 77, 12, 65, 84, 23, 81, 0, 7, 3, 67, 73, 75, 6, 8, 74, 1, 69, 68, 66, 64], [106, 94, 34, 124, 116, 24, 63, 19, 52, 59, 85, 17, 15, 42, 55, 60, 121, 10, 70, 78, 12, 120, 27, 49, 8, 117, 111, 58, 47, 108, 114, 53, 109, 64, 68, 18, 1, 66, 4, 126, 123, 50, 122, 22, 44, 102, 45, 91, 51, 125, 46, 127, 90, 0, 119, 62, 112, 54, 56, 11, 84, 105, 104, 48, 61, 100, 28, 57, 41, 110, 40, 113, 115, 95, 118, 14, 80, 92, 103, 37, 77, 107, 101, 76, 73, 69, 2, 16, 26, 89, 32, 99, 43, 31, 13, 35, 9, 36, 23, 33, 97, 87, 20, 25, 7, 39, 93, 65, 96, 38, 75, 71, 67, 3, 86, 29, 74, 79, 5, 88, 82, 21, 83, 30, 72, 6, 98, 81], [103, 51, 120, 34, 86, 56, 48, 95, 123, 115, 112, 119, 53, 28, 54, 122, 58, 89, 60, 126, 52, 88, 40, 82, 61, 15, 124, 118, 111, 50, 121, 63, 46, 57, 113, 47, 62, 125, 117, 55, 107, 116, 20, 26, 114, 98, 91, 108, 110, 8, 59, 45, 49, 44, 81, 11, 43, 127, 109, 13, 42, 4, 38, 105, 93, 65, 41, 73, 106, 6, 64, 30, 16, 68, 37, 24, 27, 104, 72, 23, 102, 35, 33, 36, 85, 29, 80, 100, 76, 94, 101, 99, 12, 9, 87, 97, 14, 67, 1, 21, 79, 19, 66, 83, 71, 78, 17, 32, 90, 96, 10, 7, 2, 18, 92, 0, 70, 74, 77, 25, 5, 3, 84, 39, 69, 31, 75, 22], [106, 36, 90, 18, 85, 47, 20, 42, 79, 77, 110, 16, 58, 72, 75, 124, 126, 73, 78, 70, 56, 127, 109, 117, 119, 114, 30, 95, 120, 40, 86, 4, 125, 65, 108, 76, 112, 62, 45, 52, 43, 67, 60, 99, 88, 61, 96, 51, 39, 53, 111, 0, 66, 54, 93, 97, 69, 68, 8, 100, 19, 116, 25, 55, 38, 74, 13, 103, 32, 91, 29, 115, 48, 6, 23, 57, 89, 41, 107, 64, 123, 11, 2, 31, 7, 5, 63, 101, 81, 1, 44, 94, 50, 122, 104, 92, 87, 27, 28, 17, 49, 46, 37, 105, 34, 3, 118, 12, 113, 33, 71, 83, 35, 121, 22, 80, 59, 102, 14, 9, 26, 98, 15, 82, 24, 84, 10, 21], [104, 54, 63, 34, 22, 93, 122, 120, 90, 123, 62, 119, 125, 121, 126, 127, 61, 50, 88, 59, 113, 51, 118, 81, 52, 20, 16, 116, 115, 76, 55, 64, 46, 82, 57, 53, 48, 114, 117, 56, 71, 60, 73, 91, 124, 45, 69, 111, 78, 49, 39, 109, 13, 106, 110, 43, 112, 2, 58, 44, 15, 108, 1, 87, 98, 47, 74, 11, 102, 67, 42, 80, 79, 41, 29, 100, 103, 37, 105, 96, 107, 35, 14, 92, 97, 21, 28, 25, 26, 10, 85, 95, 83, 99, 32, 19, 38, 9, 101, 8, 18, 33, 30, 70, 94, 36, 12, 72, 3, 31, 65, 68, 23, 75, 24, 40, 77, 27, 6, 89, 4, 86, 66, 84, 17, 0, 7, 5], [52, 116, 37, 48, 83, 16, 77, 33, 22, 88, 11, 112, 73, 70, 4, 71, 14, 126, 61, 127, 63, 0, 2, 50, 123, 106, 1, 121, 29, 44, 51, 55, 58, 114, 60, 111, 120, 57, 125, 93, 115, 30, 118, 56, 108, 110, 91, 122, 119, 99, 53, 49, 113, 124, 59, 54, 85, 62, 105, 117, 46, 47, 109, 107, 41, 43, 40, 102, 104, 42, 90, 103, 35, 81, 74, 101, 45, 95, 38, 75, 18, 39, 98, 8, 92, 5, 36, 9, 67, 12, 25, 32, 96, 20, 97, 34, 100, 3, 65, 94, 31, 21, 78, 89, 13, 23, 69, 24, 82, 17, 84, 87, 19, 76, 15, 64, 7, 79, 10, 28, 6, 27, 80, 72, 68, 26, 66, 86], [124, 49, 101, 113, 86, 55, 32, 126, 57, 118, 29, 54, 61, 119, 121, 50, 123, 120, 26, 58, 122, 53, 115, 51, 114, 52, 63, 47, 60, 56, 125, 116, 62, 59, 111, 112, 48, 45, 108, 80, 46, 109, 44, 103, 99, 18, 127, 87, 43, 117, 78, 110, 107, 105, 37, 42, 106, 40, 102, 98, 72, 64, 104, 12, 39, 97, 73, 3, 20, 38, 84, 41, 21, 19, 36, 100, 17, 79, 94, 92, 33, 34, 25, 30, 23, 35, 85, 89, 27, 68, 81, 6, 31, 28, 24, 10, 77, 95, 14, 91, 4, 70, 93, 83, 88, 66, 1, 16, 96, 11, 5, 71, 90, 75, 0, 22, 7, 67, 13, 65, 74, 82, 15, 9, 8, 2, 76, 69], [109, 39, 98, 84, 92, 12, 17, 64, 7, 79, 14, 9, 5, 45, 3, 66, 61, 121, 41, 49, 122, 52, 86, 95, 127, 58, 21, 119, 40, 111, 124, 55, 65, 57, 118, 63, 51, 24, 23, 126, 48, 116, 25, 102, 28, 72, 30, 60, 34, 1, 22, 44, 125, 47, 0, 106, 59, 100, 18, 120, 90, 80, 82, 117, 112, 32, 97, 46, 89, 36, 29, 104, 6, 11, 19, 4, 70, 53, 26, 74, 123, 93, 77, 35, 108, 99, 85, 68, 62, 27, 50, 16, 56, 8, 107, 31, 115, 88, 114, 78, 83, 15, 91, 38, 20, 105, 94, 37, 42, 76, 13, 54, 43, 96, 110, 10, 87, 33, 73, 101, 113, 2, 71, 81, 67, 69, 75, 103]], "model.layers.16.self_attn.qk_proj": [[106, 124, 125, 116, 61, 52, 109, 42, 49, 63, 51, 54, 120, 48, 121, 45, 53, 123, 55, 119, 127, 56, 58, 98, 126, 28, 22, 39, 34, 122, 26, 84, 20, 88, 86, 79, 57, 15, 36, 59, 112, 101, 37, 40, 100, 62, 113, 118, 103, 50, 24, 30, 47, 117, 81, 85, 78, 111, 21, 115, 104, 12, 60, 29, 82, 14, 17, 76, 93, 90, 9, 92, 114, 80, 18, 16, 73, 94, 77, 110, 11, 31, 44, 83, 95, 27, 13, 46, 19, 72, 96, 108, 71, 75, 70, 25, 107, 102, 89, 64, 91, 32, 43, 97, 3, 7, 87, 23, 8, 68, 0, 41, 4, 67, 38, 6, 69, 5, 74, 105, 33, 2, 66, 10, 35, 99, 1, 65], [106, 124, 125, 61, 116, 52, 109, 42, 63, 49, 51, 54, 120, 48, 45, 121, 56, 123, 119, 127, 98, 58, 55, 53, 28, 126, 34, 39, 20, 86, 22, 118, 36, 26, 88, 84, 37, 79, 122, 47, 15, 59, 57, 111, 113, 24, 101, 112, 40, 115, 21, 100, 62, 30, 103, 85, 117, 60, 81, 29, 78, 76, 114, 14, 50, 104, 12, 92, 17, 90, 82, 93, 73, 9, 44, 77, 94, 16, 31, 18, 108, 80, 11, 72, 27, 46, 83, 13, 95, 96, 70, 107, 75, 110, 89, 19, 25, 7, 64, 102, 0, 87, 91, 38, 67, 8, 32, 71, 97, 43, 4, 68, 23, 5, 69, 41, 3, 105, 2, 74, 66, 35, 33, 1, 99, 6, 65, 10], [106, 125, 124, 116, 61, 52, 109, 42, 49, 63, 51, 54, 120, 48, 45, 121, 119, 123, 55, 98, 126, 56, 127, 58, 53, 28, 34, 39, 88, 36, 101, 122, 86, 84, 26, 100, 22, 20, 47, 113, 118, 57, 62, 79, 24, 59, 112, 85, 15, 40, 37, 103, 81, 50, 111, 104, 21, 60, 117, 29, 115, 30, 114, 93, 14, 90, 78, 12, 94, 17, 18, 92, 9, 27, 73, 44, 82, 76, 64, 31, 102, 110, 13, 95, 80, 83, 16, 96, 91, 11, 0, 77, 46, 25, 108, 72, 70, 19, 32, 107, 38, 75, 7, 71, 87, 4, 68, 3, 89, 8, 97, 41, 23, 1, 66, 69, 105, 67, 2, 65, 33, 35, 43, 5, 74, 99, 6, 10], [106, 125, 124, 116, 61, 52, 63, 109, 49, 42, 51, 54, 120, 48, 45, 121, 53, 123, 119, 56, 127, 58, 98, 55, 126, 28, 39, 59, 62, 57, 22, 20, 79, 101, 118, 34, 36, 26, 84, 100, 47, 113, 122, 86, 37, 24, 115, 112, 111, 40, 117, 15, 50, 30, 88, 103, 85, 104, 81, 29, 21, 14, 78, 60, 90, 92, 114, 93, 17, 94, 76, 27, 9, 73, 18, 0, 12, 44, 82, 80, 110, 16, 83, 13, 96, 77, 46, 108, 102, 70, 31, 19, 11, 107, 95, 72, 75, 7, 25, 97, 91, 38, 67, 87, 89, 71, 68, 8, 4, 41, 43, 64, 3, 2, 32, 66, 69, 23, 1, 5, 105, 33, 65, 35, 10, 99, 74, 6], [106, 124, 125, 116, 61, 52, 63, 49, 109, 42, 51, 54, 120, 48, 45, 121, 56, 123, 119, 53, 127, 126, 58, 55, 98, 57, 20, 22, 34, 39, 86, 47, 122, 59, 101, 118, 28, 112, 79, 62, 50, 84, 15, 40, 100, 88, 26, 36, 103, 111, 24, 117, 114, 81, 37, 30, 115, 85, 104, 14, 17, 113, 18, 78, 12, 93, 29, 60, 44, 76, 27, 73, 9, 90, 21, 94, 11, 80, 92, 13, 83, 68, 82, 107, 110, 77, 16, 46, 108, 64, 19, 31, 102, 72, 95, 0, 96, 25, 7, 8, 71, 4, 70, 67, 75, 41, 3, 91, 38, 69, 87, 23, 89, 2, 43, 32, 97, 1, 5, 33, 66, 6, 105, 35, 65, 99, 74, 10], [106, 124, 125, 116, 61, 52, 63, 109, 42, 49, 51, 54, 120, 48, 45, 121, 119, 123, 58, 127, 56, 126, 53, 98, 55, 20, 26, 22, 34, 47, 79, 57, 39, 84, 86, 62, 118, 59, 15, 103, 101, 88, 28, 36, 100, 111, 81, 24, 112, 122, 37, 50, 85, 30, 40, 60, 78, 117, 114, 14, 17, 29, 76, 104, 93, 27, 12, 21, 80, 82, 18, 73, 90, 16, 11, 94, 113, 13, 9, 92, 77, 44, 83, 115, 31, 19, 46, 110, 108, 107, 75, 96, 89, 25, 72, 8, 95, 7, 64, 23, 71, 6, 32, 91, 102, 0, 41, 97, 43, 69, 68, 87, 4, 67, 3, 70, 105, 5, 2, 35, 38, 66, 65, 33, 74, 99, 1, 10], [106, 125, 124, 116, 61, 52, 63, 109, 42, 49, 51, 120, 54, 48, 45, 119, 121, 127, 53, 56, 123, 126, 58, 98, 55, 20, 26, 57, 122, 84, 86, 22, 88, 34, 79, 118, 36, 47, 62, 28, 39, 81, 15, 24, 112, 101, 103, 37, 50, 59, 40, 30, 114, 100, 111, 85, 14, 78, 115, 90, 18, 117, 12, 17, 104, 29, 93, 27, 80, 21, 60, 113, 44, 76, 73, 94, 9, 82, 16, 83, 11, 92, 13, 77, 110, 107, 95, 19, 25, 31, 96, 108, 46, 41, 43, 75, 71, 89, 23, 102, 97, 6, 91, 32, 8, 7, 0, 72, 38, 3, 33, 67, 87, 64, 2, 69, 4, 35, 68, 74, 105, 5, 99, 66, 70, 10, 65, 1], [106, 124, 116, 125, 61, 52, 109, 42, 63, 49, 51, 54, 120, 48, 45, 119, 121, 127, 53, 56, 123, 55, 126, 98, 58, 22, 20, 26, 28, 84, 39, 86, 122, 101, 34, 57, 88, 36, 24, 100, 79, 15, 30, 81, 103, 37, 40, 112, 111, 62, 59, 85, 115, 47, 50, 118, 14, 113, 60, 117, 90, 17, 78, 93, 21, 114, 29, 104, 92, 80, 18, 12, 27, 16, 82, 73, 9, 13, 25, 76, 83, 95, 96, 44, 94, 77, 46, 107, 32, 11, 43, 91, 110, 102, 6, 19, 8, 31, 89, 0, 108, 75, 71, 64, 23, 41, 7, 97, 68, 105, 87, 33, 38, 72, 67, 3, 66, 4, 99, 35, 69, 2, 74, 5, 65, 1, 70, 10], [106, 124, 116, 125, 61, 52, 109, 42, 63, 51, 49, 54, 120, 48, 45, 127, 119, 121, 53, 98, 56, 55, 58, 34, 28, 123, 22, 20, 39, 126, 84, 101, 26, 88, 103, 86, 24, 100, 122, 36, 37, 79, 85, 112, 50, 111, 15, 30, 47, 57, 115, 113, 59, 60, 40, 62, 118, 17, 81, 78, 114, 14, 12, 21, 93, 90, 104, 117, 29, 18, 27, 76, 82, 80, 44, 108, 92, 16, 9, 73, 95, 13, 25, 77, 94, 83, 46, 96, 102, 19, 11, 8, 31, 107, 43, 75, 6, 91, 89, 0, 71, 110, 32, 23, 87, 64, 7, 97, 33, 41, 2, 38, 4, 72, 105, 67, 68, 66, 3, 35, 5, 69, 99, 1, 10, 74, 65, 70], [106, 124, 125, 61, 116, 52, 109, 42, 63, 49, 51, 54, 48, 120, 45, 121, 127, 119, 53, 56, 123, 55, 98, 58, 126, 34, 28, 101, 20, 57, 59, 84, 22, 26, 47, 115, 39, 103, 118, 36, 37, 122, 88, 50, 86, 100, 79, 111, 15, 24, 117, 113, 30, 40, 112, 17, 85, 62, 21, 12, 81, 78, 104, 60, 90, 44, 14, 9, 93, 29, 114, 18, 76, 92, 82, 16, 8, 94, 110, 27, 80, 73, 13, 83, 108, 46, 95, 96, 11, 75, 77, 6, 102, 7, 0, 64, 25, 19, 31, 32, 107, 71, 89, 3, 72, 38, 43, 4, 97, 23, 70, 91, 5, 33, 41, 68, 67, 2, 87, 66, 69, 99, 35, 74, 105, 1, 65, 10], [106, 124, 116, 125, 61, 52, 109, 42, 63, 49, 54, 51, 120, 48, 121, 45, 127, 53, 123, 119, 56, 58, 98, 55, 126, 122, 118, 57, 28, 34, 20, 15, 84, 39, 79, 22, 59, 86, 101, 26, 100, 50, 81, 88, 36, 113, 85, 37, 47, 111, 24, 40, 30, 115, 62, 60, 17, 112, 9, 12, 14, 78, 103, 117, 82, 21, 76, 73, 29, 104, 8, 18, 92, 90, 114, 93, 16, 27, 44, 94, 11, 0, 13, 80, 77, 83, 31, 64, 19, 68, 110, 7, 108, 75, 71, 70, 96, 107, 72, 25, 46, 4, 91, 67, 102, 95, 32, 89, 3, 87, 6, 2, 66, 1, 69, 5, 23, 65, 38, 97, 41, 33, 35, 105, 43, 74, 10, 99], [106, 124, 116, 125, 61, 52, 63, 109, 42, 49, 51, 54, 120, 48, 45, 121, 127, 53, 119, 58, 56, 126, 98, 123, 122, 86, 84, 22, 34, 20, 26, 57, 15, 55, 118, 28, 39, 79, 62, 47, 100, 101, 30, 50, 40, 88, 36, 37, 85, 103, 81, 112, 111, 59, 24, 17, 76, 115, 113, 14, 21, 60, 78, 117, 90, 114, 29, 82, 104, 12, 18, 9, 73, 16, 93, 27, 92, 94, 31, 80, 83, 44, 13, 77, 0, 110, 11, 95, 8, 19, 71, 46, 70, 102, 96, 108, 75, 91, 7, 72, 89, 68, 87, 4, 32, 64, 107, 25, 38, 67, 41, 66, 3, 43, 23, 97, 5, 69, 33, 2, 99, 6, 105, 10, 65, 35, 74, 1], [106, 124, 116, 125, 61, 52, 63, 109, 49, 42, 51, 120, 54, 48, 121, 45, 119, 53, 127, 56, 58, 123, 98, 126, 20, 26, 101, 39, 34, 22, 28, 118, 86, 37, 47, 57, 122, 88, 84, 114, 112, 79, 40, 55, 103, 62, 30, 59, 15, 36, 24, 100, 85, 50, 60, 113, 21, 117, 111, 115, 90, 81, 29, 93, 78, 104, 17, 82, 94, 92, 14, 18, 76, 95, 27, 44, 46, 12, 16, 9, 25, 108, 110, 31, 96, 19, 77, 13, 73, 83, 102, 70, 80, 43, 11, 107, 75, 91, 71, 87, 32, 89, 7, 41, 38, 72, 23, 97, 8, 99, 35, 0, 33, 67, 105, 5, 3, 68, 4, 64, 69, 66, 2, 74, 1, 10, 6, 65], [106, 124, 116, 125, 61, 52, 109, 63, 42, 49, 51, 54, 120, 48, 45, 121, 119, 53, 58, 123, 127, 126, 98, 56, 55, 20, 22, 28, 39, 57, 34, 101, 112, 118, 47, 122, 26, 79, 100, 37, 88, 59, 84, 86, 50, 36, 15, 62, 85, 30, 24, 115, 40, 114, 103, 81, 117, 104, 111, 60, 78, 113, 14, 12, 93, 76, 17, 82, 21, 29, 16, 73, 18, 90, 94, 9, 92, 11, 27, 70, 80, 44, 77, 64, 46, 83, 19, 72, 110, 71, 96, 31, 108, 75, 89, 0, 8, 13, 95, 25, 7, 107, 91, 97, 102, 2, 32, 43, 41, 67, 4, 87, 38, 68, 66, 23, 35, 3, 69, 65, 99, 5, 33, 10, 1, 105, 74, 6], [106, 124, 116, 125, 61, 52, 109, 63, 42, 51, 54, 49, 120, 48, 45, 121, 119, 53, 58, 127, 98, 55, 123, 126, 56, 20, 39, 22, 28, 34, 101, 15, 84, 118, 122, 47, 88, 86, 26, 79, 57, 37, 100, 30, 59, 36, 62, 24, 40, 112, 111, 115, 85, 81, 117, 50, 17, 103, 9, 78, 104, 29, 76, 93, 114, 14, 12, 113, 60, 90, 92, 94, 21, 82, 16, 73, 11, 18, 31, 80, 77, 27, 64, 46, 108, 75, 72, 19, 110, 83, 71, 44, 96, 25, 70, 102, 91, 8, 13, 41, 7, 107, 95, 4, 43, 87, 89, 67, 0, 68, 32, 3, 2, 97, 65, 38, 5, 33, 66, 69, 74, 6, 23, 35, 99, 10, 105, 1], [106, 125, 116, 124, 61, 52, 63, 109, 42, 49, 54, 51, 120, 48, 45, 121, 119, 53, 127, 56, 123, 55, 58, 98, 126, 39, 20, 118, 86, 122, 22, 28, 84, 101, 57, 26, 15, 37, 30, 79, 34, 112, 36, 47, 59, 62, 88, 50, 111, 100, 103, 81, 85, 60, 113, 24, 78, 117, 115, 40, 76, 114, 14, 12, 9, 29, 104, 93, 73, 90, 17, 18, 77, 82, 21, 94, 44, 108, 16, 31, 92, 96, 80, 110, 11, 83, 19, 46, 75, 13, 27, 72, 0, 107, 71, 7, 102, 8, 91, 70, 87, 43, 41, 64, 4, 25, 68, 23, 95, 3, 67, 97, 89, 32, 38, 6, 69, 2, 33, 5, 66, 74, 65, 105, 35, 1, 99, 10], [106, 124, 125, 116, 61, 52, 63, 109, 49, 42, 51, 54, 120, 48, 45, 121, 53, 119, 127, 98, 58, 126, 123, 56, 20, 26, 57, 122, 28, 101, 39, 22, 55, 36, 86, 84, 37, 34, 50, 112, 59, 15, 79, 100, 30, 114, 40, 88, 47, 115, 24, 62, 81, 118, 60, 21, 103, 85, 90, 14, 93, 104, 29, 111, 17, 117, 78, 94, 82, 16, 110, 76, 44, 113, 92, 108, 18, 9, 12, 27, 96, 77, 80, 19, 102, 25, 46, 83, 95, 73, 31, 75, 43, 13, 107, 91, 11, 72, 38, 41, 89, 32, 97, 6, 87, 64, 71, 7, 23, 35, 33, 0, 99, 66, 8, 5, 105, 67, 3, 68, 4, 69, 70, 74, 2, 65, 1, 10], [106, 124, 116, 125, 61, 52, 109, 63, 42, 51, 49, 54, 120, 48, 45, 121, 127, 119, 53, 98, 123, 58, 56, 126, 28, 39, 101, 55, 122, 26, 20, 22, 100, 86, 57, 36, 34, 59, 118, 84, 40, 15, 88, 37, 30, 117, 103, 79, 50, 24, 113, 111, 112, 62, 114, 14, 17, 60, 85, 29, 115, 81, 21, 47, 93, 44, 76, 104, 90, 95, 16, 82, 9, 27, 94, 78, 92, 73, 18, 12, 80, 108, 75, 77, 96, 31, 6, 110, 83, 72, 13, 0, 107, 46, 102, 11, 19, 91, 7, 43, 25, 87, 32, 64, 38, 97, 89, 41, 71, 23, 33, 8, 105, 2, 4, 67, 69, 3, 68, 35, 66, 74, 99, 65, 5, 1, 70, 10], [106, 124, 125, 116, 61, 52, 109, 63, 42, 49, 54, 51, 120, 48, 45, 53, 121, 119, 127, 56, 98, 58, 123, 101, 126, 34, 36, 55, 39, 118, 40, 59, 28, 20, 50, 103, 86, 115, 37, 111, 26, 100, 57, 22, 84, 24, 122, 15, 47, 88, 113, 30, 114, 62, 112, 79, 117, 104, 44, 60, 21, 81, 29, 14, 108, 78, 90, 82, 94, 76, 17, 85, 93, 92, 73, 16, 110, 18, 9, 12, 27, 102, 31, 46, 41, 77, 83, 80, 107, 75, 6, 19, 72, 95, 96, 89, 87, 8, 13, 43, 7, 11, 38, 97, 64, 0, 105, 25, 91, 23, 32, 71, 4, 33, 3, 35, 99, 69, 67, 68, 5, 66, 2, 74, 1, 65, 10, 70], [106, 124, 125, 116, 61, 52, 63, 109, 42, 49, 51, 54, 120, 48, 45, 53, 121, 119, 123, 127, 98, 58, 56, 55, 20, 39, 126, 22, 28, 57, 59, 37, 86, 34, 101, 84, 36, 118, 15, 122, 100, 30, 79, 26, 47, 103, 111, 40, 115, 24, 88, 117, 112, 113, 50, 90, 85, 81, 114, 29, 21, 14, 94, 92, 62, 93, 60, 12, 104, 78, 76, 82, 44, 27, 110, 17, 73, 80, 108, 9, 31, 77, 18, 75, 96, 83, 16, 13, 6, 95, 25, 72, 91, 11, 46, 19, 89, 102, 7, 107, 8, 0, 41, 23, 87, 32, 64, 71, 43, 105, 38, 35, 97, 33, 67, 69, 3, 68, 4, 5, 1, 99, 65, 74, 2, 66, 10, 70], [106, 124, 116, 125, 61, 52, 63, 109, 42, 49, 51, 54, 120, 48, 45, 121, 53, 119, 123, 127, 58, 56, 98, 57, 126, 22, 86, 59, 122, 34, 28, 20, 15, 100, 39, 84, 101, 26, 55, 103, 36, 40, 118, 88, 113, 37, 79, 47, 62, 114, 24, 78, 30, 117, 111, 115, 112, 50, 85, 81, 90, 76, 29, 93, 60, 17, 27, 14, 92, 9, 94, 104, 12, 44, 18, 73, 21, 82, 80, 108, 110, 83, 31, 16, 75, 13, 95, 46, 77, 96, 91, 19, 107, 89, 72, 11, 0, 25, 43, 102, 6, 7, 8, 41, 64, 87, 71, 32, 23, 97, 38, 3, 68, 33, 5, 67, 35, 4, 66, 105, 1, 74, 70, 69, 2, 65, 99, 10], [106, 124, 125, 116, 61, 52, 109, 63, 42, 49, 51, 54, 120, 48, 45, 121, 123, 53, 119, 127, 98, 58, 56, 126, 59, 28, 57, 39, 55, 34, 101, 26, 22, 20, 122, 84, 15, 36, 103, 86, 47, 88, 100, 115, 113, 118, 24, 79, 112, 62, 30, 37, 114, 111, 40, 117, 85, 21, 104, 90, 78, 76, 81, 29, 93, 14, 17, 110, 108, 50, 60, 18, 27, 94, 44, 92, 82, 9, 16, 73, 80, 83, 13, 12, 25, 0, 31, 77, 46, 102, 107, 64, 95, 19, 7, 8, 96, 75, 11, 91, 97, 38, 43, 72, 71, 87, 70, 4, 89, 67, 32, 3, 23, 6, 68, 2, 33, 66, 41, 5, 65, 1, 35, 69, 105, 99, 74, 10], [106, 124, 116, 125, 61, 52, 63, 49, 109, 42, 51, 54, 120, 48, 45, 53, 119, 121, 123, 58, 127, 98, 56, 126, 57, 122, 34, 86, 84, 101, 26, 22, 36, 28, 20, 39, 59, 55, 15, 100, 114, 103, 115, 88, 47, 37, 30, 62, 111, 113, 24, 85, 40, 79, 90, 112, 118, 29, 81, 50, 60, 117, 104, 76, 14, 78, 93, 17, 21, 94, 92, 82, 18, 44, 27, 80, 9, 110, 73, 77, 31, 16, 95, 108, 12, 11, 83, 13, 25, 46, 107, 19, 102, 64, 70, 96, 91, 8, 43, 87, 41, 7, 75, 4, 71, 32, 38, 97, 72, 68, 33, 89, 0, 35, 23, 3, 2, 1, 65, 105, 5, 69, 67, 66, 74, 99, 10, 6], [106, 124, 116, 125, 61, 52, 63, 109, 42, 49, 51, 54, 120, 48, 45, 53, 121, 119, 123, 58, 98, 127, 56, 126, 57, 59, 55, 28, 34, 26, 22, 103, 101, 86, 62, 84, 39, 118, 36, 122, 79, 88, 20, 30, 15, 117, 100, 37, 112, 40, 81, 111, 113, 114, 24, 47, 29, 115, 93, 50, 85, 90, 21, 76, 78, 14, 104, 94, 12, 92, 17, 27, 44, 82, 73, 60, 18, 9, 31, 80, 108, 70, 95, 46, 110, 16, 19, 75, 77, 8, 13, 11, 83, 96, 91, 25, 89, 102, 32, 72, 107, 7, 97, 87, 43, 38, 23, 35, 71, 41, 3, 0, 105, 33, 4, 68, 64, 74, 69, 67, 5, 66, 2, 10, 65, 1, 99, 6], [106, 125, 116, 124, 61, 52, 63, 109, 49, 42, 51, 54, 120, 48, 45, 121, 119, 53, 58, 98, 127, 56, 123, 126, 122, 26, 34, 59, 20, 55, 28, 84, 86, 22, 39, 100, 88, 15, 101, 62, 57, 118, 36, 103, 47, 79, 50, 37, 85, 40, 30, 112, 81, 117, 24, 114, 111, 14, 115, 113, 60, 76, 104, 110, 17, 93, 18, 90, 44, 78, 21, 80, 108, 82, 73, 94, 29, 12, 31, 9, 92, 27, 70, 46, 16, 77, 95, 83, 13, 19, 11, 107, 75, 8, 96, 25, 102, 71, 7, 89, 41, 23, 0, 91, 64, 87, 32, 38, 72, 43, 33, 67, 97, 68, 3, 66, 4, 5, 105, 69, 35, 74, 1, 2, 10, 65, 99, 6], [106, 116, 124, 125, 61, 52, 63, 109, 49, 42, 51, 54, 120, 48, 45, 121, 53, 98, 119, 56, 127, 123, 122, 58, 55, 59, 26, 20, 126, 28, 86, 22, 101, 100, 88, 84, 34, 39, 15, 115, 36, 57, 40, 79, 62, 37, 30, 85, 113, 118, 50, 44, 117, 24, 111, 112, 103, 60, 47, 81, 114, 93, 29, 108, 21, 94, 78, 104, 76, 17, 90, 14, 110, 12, 92, 9, 18, 46, 73, 82, 80, 77, 102, 31, 95, 27, 16, 83, 11, 25, 8, 96, 13, 19, 107, 75, 91, 70, 41, 105, 71, 89, 23, 0, 7, 43, 38, 72, 87, 67, 32, 97, 64, 35, 33, 68, 4, 99, 2, 74, 5, 66, 69, 3, 1, 6, 65, 10], [106, 124, 116, 125, 61, 52, 109, 63, 42, 49, 51, 54, 120, 48, 121, 45, 123, 127, 55, 53, 98, 56, 122, 119, 126, 40, 58, 26, 28, 34, 86, 60, 57, 39, 20, 88, 101, 115, 118, 36, 22, 37, 100, 59, 103, 84, 15, 47, 50, 79, 62, 117, 85, 30, 104, 111, 24, 90, 112, 114, 93, 81, 44, 78, 113, 29, 12, 76, 17, 9, 21, 18, 14, 82, 31, 102, 27, 95, 108, 16, 96, 110, 73, 92, 80, 46, 94, 77, 83, 8, 19, 75, 41, 11, 13, 25, 64, 89, 91, 23, 7, 71, 32, 38, 97, 0, 107, 43, 67, 68, 33, 72, 6, 3, 87, 105, 4, 70, 66, 2, 69, 35, 74, 65, 5, 99, 1, 10], [106, 124, 125, 116, 61, 52, 63, 109, 42, 49, 54, 51, 120, 48, 45, 121, 123, 55, 56, 53, 119, 58, 98, 127, 122, 28, 126, 34, 39, 20, 57, 101, 59, 84, 118, 79, 26, 22, 88, 100, 15, 103, 86, 36, 47, 117, 37, 111, 62, 115, 112, 40, 81, 85, 113, 30, 78, 24, 17, 50, 76, 60, 93, 104, 18, 29, 9, 12, 82, 21, 14, 73, 90, 114, 110, 27, 44, 92, 94, 16, 8, 77, 19, 75, 13, 80, 0, 108, 6, 83, 11, 46, 95, 31, 7, 64, 91, 102, 71, 96, 72, 3, 25, 23, 97, 41, 107, 89, 32, 5, 4, 87, 67, 66, 38, 35, 2, 105, 65, 69, 43, 68, 33, 74, 70, 99, 1, 10], [106, 116, 125, 124, 61, 52, 109, 63, 49, 42, 54, 51, 120, 48, 45, 121, 53, 119, 123, 55, 56, 58, 98, 127, 39, 34, 126, 57, 101, 28, 118, 59, 22, 26, 115, 84, 20, 100, 37, 86, 36, 122, 88, 15, 103, 111, 47, 85, 79, 40, 117, 113, 30, 50, 62, 60, 24, 78, 112, 104, 44, 93, 29, 90, 12, 81, 110, 92, 76, 9, 17, 14, 18, 73, 21, 27, 114, 82, 94, 8, 80, 31, 19, 46, 6, 13, 64, 77, 108, 95, 83, 11, 16, 7, 75, 96, 102, 25, 72, 0, 71, 32, 89, 4, 91, 38, 41, 23, 107, 87, 2, 105, 3, 69, 1, 5, 67, 66, 97, 33, 68, 43, 65, 74, 99, 35, 10, 70], [106, 116, 125, 124, 61, 52, 109, 63, 49, 42, 51, 54, 120, 48, 121, 45, 53, 119, 55, 58, 98, 56, 123, 126, 39, 59, 20, 26, 127, 22, 34, 84, 15, 86, 28, 57, 118, 88, 101, 36, 122, 37, 79, 100, 62, 115, 117, 111, 81, 103, 50, 78, 60, 30, 47, 85, 40, 113, 12, 24, 14, 17, 104, 76, 9, 93, 73, 29, 21, 112, 44, 18, 27, 90, 80, 82, 92, 16, 6, 94, 114, 13, 46, 83, 19, 75, 11, 77, 110, 108, 8, 96, 64, 31, 95, 7, 71, 3, 87, 25, 91, 32, 72, 107, 4, 68, 102, 67, 89, 41, 5, 66, 105, 97, 23, 43, 38, 69, 33, 10, 0, 2, 74, 35, 99, 1, 65, 70], [106, 125, 124, 116, 61, 52, 49, 109, 63, 42, 51, 54, 120, 48, 53, 121, 45, 123, 58, 127, 119, 56, 98, 126, 59, 122, 39, 55, 26, 118, 28, 34, 100, 62, 57, 15, 101, 22, 20, 37, 47, 36, 86, 84, 88, 40, 79, 60, 50, 112, 115, 117, 111, 103, 81, 30, 24, 85, 113, 104, 14, 73, 78, 29, 93, 21, 12, 76, 9, 80, 44, 90, 17, 18, 94, 82, 27, 92, 114, 11, 6, 96, 46, 13, 31, 16, 72, 110, 83, 0, 19, 77, 7, 75, 8, 64, 43, 71, 108, 95, 91, 68, 89, 25, 32, 67, 4, 38, 5, 102, 3, 97, 66, 87, 23, 33, 70, 107, 2, 69, 105, 99, 41, 1, 35, 65, 10, 74], [106, 124, 125, 116, 61, 52, 42, 109, 63, 49, 51, 120, 54, 48, 121, 45, 53, 123, 126, 98, 58, 119, 86, 59, 127, 56, 57, 26, 55, 22, 84, 79, 28, 39, 20, 15, 115, 34, 101, 40, 88, 36, 62, 37, 111, 122, 118, 117, 60, 112, 24, 30, 100, 103, 47, 81, 85, 50, 14, 12, 78, 113, 29, 104, 21, 76, 93, 90, 73, 9, 17, 18, 114, 82, 92, 11, 16, 13, 94, 80, 27, 95, 77, 46, 44, 31, 19, 72, 83, 75, 110, 108, 107, 96, 89, 7, 25, 6, 43, 71, 91, 102, 32, 8, 41, 97, 70, 23, 105, 35, 68, 87, 67, 38, 3, 5, 64, 4, 0, 69, 10, 74, 33, 99, 2, 66, 1, 65]], "model.layers.17.self_attn.q_proj": [[60, 106, 114, 100, 29, 89, 115, 123, 93, 86, 27, 83, 120, 47, 80, 42, 22, 112, 11, 57, 21, 113, 122, 53, 62, 127, 50, 16, 46, 61, 118, 59, 32, 111, 126, 45, 125, 56, 14, 99, 110, 63, 119, 91, 116, 81, 69, 55, 108, 96, 54, 26, 48, 117, 44, 51, 52, 121, 124, 58, 37, 97, 49, 109, 43, 105, 24, 76, 38, 41, 88, 71, 75, 95, 5, 107, 102, 103, 39, 104, 87, 31, 20, 40, 36, 15, 9, 101, 79, 35, 34, 98, 84, 17, 28, 10, 2, 94, 19, 90, 85, 8, 3, 23, 30, 33, 0, 7, 12, 1, 25, 4, 82, 6, 64, 18, 66, 77, 13, 78, 92, 74, 65, 67, 73, 72, 70, 68], [114, 106, 100, 60, 29, 89, 93, 50, 42, 86, 120, 59, 44, 61, 83, 22, 80, 115, 87, 99, 118, 27, 32, 105, 119, 81, 47, 31, 95, 46, 43, 21, 121, 97, 17, 25, 63, 91, 48, 54, 56, 116, 113, 51, 57, 14, 24, 11, 111, 103, 117, 96, 45, 53, 107, 125, 62, 110, 76, 58, 122, 41, 19, 38, 126, 112, 101, 37, 127, 40, 94, 55, 52, 16, 124, 88, 20, 102, 104, 98, 109, 33, 85, 108, 123, 71, 8, 23, 49, 35, 39, 90, 92, 30, 82, 79, 34, 9, 26, 15, 5, 10, 28, 1, 13, 36, 65, 84, 69, 77, 18, 12, 74, 75, 0, 3, 66, 6, 67, 72, 4, 70, 64, 7, 78, 73, 68, 2], [106, 60, 114, 100, 50, 29, 89, 86, 42, 57, 62, 107, 59, 93, 22, 80, 83, 56, 116, 37, 11, 14, 63, 47, 124, 99, 115, 91, 111, 46, 51, 71, 123, 52, 69, 48, 97, 110, 44, 125, 27, 55, 61, 49, 9, 16, 24, 113, 32, 76, 53, 58, 105, 81, 104, 120, 126, 119, 54, 31, 117, 127, 118, 122, 41, 38, 40, 45, 102, 121, 20, 108, 79, 103, 43, 109, 112, 19, 101, 94, 98, 39, 28, 35, 26, 33, 95, 96, 5, 88, 23, 25, 2, 30, 34, 12, 90, 82, 92, 21, 75, 85, 67, 1, 0, 4, 36, 17, 6, 18, 84, 74, 87, 13, 77, 8, 15, 10, 7, 72, 70, 3, 66, 78, 65, 64, 73, 68], [106, 60, 100, 115, 114, 89, 29, 83, 14, 86, 93, 42, 11, 71, 80, 27, 9, 50, 22, 25, 69, 110, 2, 67, 76, 1, 91, 4, 24, 62, 0, 16, 6, 53, 61, 116, 57, 3, 111, 37, 85, 21, 26, 97, 122, 113, 125, 7, 63, 118, 51, 127, 90, 77, 119, 19, 55, 35, 47, 17, 105, 123, 117, 75, 112, 52, 81, 95, 54, 64, 43, 108, 44, 120, 48, 78, 49, 18, 87, 46, 13, 88, 59, 98, 101, 96, 38, 92, 20, 45, 34, 40, 58, 121, 104, 68, 84, 124, 23, 103, 28, 94, 79, 102, 10, 107, 31, 73, 65, 70, 32, 41, 126, 56, 39, 33, 12, 99, 72, 109, 36, 74, 30, 15, 8, 82, 66, 5], [103, 126, 29, 82, 99, 85, 15, 76, 73, 71, 112, 90, 3, 5, 24, 105, 35, 93, 96, 97, 57, 23, 120, 87, 21, 61, 54, 18, 81, 119, 84, 17, 33, 26, 80, 49, 67, 115, 106, 64, 116, 83, 88, 50, 79, 16, 111, 78, 28, 101, 19, 12, 9, 121, 13, 127, 1, 69, 114, 25, 65, 34, 91, 7, 122, 89, 118, 74, 8, 32, 53, 110, 62, 20, 86, 14, 104, 11, 92, 4, 98, 117, 63, 40, 59, 51, 102, 10, 22, 27, 55, 31, 66, 41, 38, 77, 36, 43, 56, 123, 94, 58, 125, 100, 52, 70, 47, 107, 39, 42, 95, 46, 60, 6, 44, 75, 37, 30, 124, 109, 72, 68, 48, 113, 108, 45, 2, 0], [103, 126, 29, 99, 15, 85, 82, 76, 73, 71, 112, 3, 5, 90, 96, 24, 26, 1, 16, 27, 21, 9, 114, 119, 67, 57, 64, 84, 74, 11, 19, 93, 78, 79, 81, 23, 120, 88, 65, 14, 97, 101, 61, 91, 49, 33, 18, 98, 104, 105, 116, 6, 50, 43, 87, 8, 69, 92, 55, 32, 106, 7, 2, 30, 121, 94, 117, 13, 34, 125, 12, 54, 25, 56, 28, 80, 70, 75, 41, 68, 66, 127, 83, 36, 20, 115, 110, 10, 22, 17, 102, 100, 63, 122, 4, 51, 118, 39, 86, 58, 59, 113, 0, 108, 111, 35, 62, 77, 124, 60, 37, 72, 46, 44, 45, 89, 107, 31, 95, 53, 109, 48, 42, 123, 40, 38, 47, 52], [103, 126, 35, 49, 112, 29, 90, 114, 99, 56, 26, 85, 88, 24, 120, 93, 82, 116, 84, 119, 121, 96, 115, 46, 86, 63, 55, 117, 41, 57, 15, 60, 54, 110, 43, 32, 111, 122, 106, 62, 74, 59, 127, 101, 76, 16, 125, 52, 61, 104, 50, 118, 22, 124, 102, 123, 58, 107, 105, 108, 47, 51, 53, 97, 23, 73, 71, 40, 34, 113, 109, 77, 2, 48, 27, 42, 36, 11, 83, 0, 21, 37, 19, 44, 33, 18, 45, 98, 20, 38, 14, 100, 5, 8, 3, 95, 4, 78, 25, 66, 79, 91, 75, 39, 94, 68, 92, 80, 31, 28, 89, 72, 67, 87, 30, 1, 13, 81, 10, 64, 17, 69, 7, 65, 70, 12, 9, 6], [103, 126, 29, 85, 24, 82, 76, 90, 15, 112, 93, 99, 120, 26, 73, 71, 74, 35, 88, 127, 119, 41, 98, 21, 23, 3, 115, 5, 13, 111, 116, 121, 78, 57, 54, 96, 92, 16, 62, 49, 34, 63, 61, 91, 36, 14, 114, 55, 60, 104, 84, 117, 25, 30, 56, 32, 18, 33, 118, 106, 27, 50, 83, 28, 53, 51, 31, 79, 110, 86, 67, 52, 43, 87, 97, 17, 122, 19, 105, 101, 8, 47, 125, 12, 42, 102, 81, 46, 58, 20, 22, 77, 107, 108, 7, 123, 89, 48, 1, 95, 40, 94, 59, 38, 113, 124, 11, 100, 80, 44, 37, 45, 68, 72, 9, 109, 64, 10, 65, 66, 75, 39, 4, 6, 69, 70, 0, 2], [45, 109, 55, 58, 111, 59, 57, 63, 124, 52, 47, 116, 46, 125, 115, 119, 61, 53, 114, 48, 62, 51, 113, 38, 123, 60, 126, 50, 54, 117, 121, 112, 102, 122, 106, 49, 118, 120, 101, 127, 107, 36, 110, 42, 56, 43, 108, 33, 44, 28, 105, 29, 104, 41, 40, 103, 26, 39, 92, 99, 97, 37, 96, 98, 87, 34, 31, 100, 21, 35, 86, 32, 91, 30, 23, 95, 24, 18, 94, 89, 90, 27, 12, 84, 93, 25, 85, 78, 88, 22, 20, 82, 81, 17, 80, 7, 15, 11, 76, 9, 14, 79, 73, 16, 19, 83, 71, 75, 13, 69, 4, 68, 70, 74, 5, 2, 77, 65, 66, 8, 3, 10, 6, 1, 0, 64, 67, 72], [58, 45, 38, 47, 94, 55, 59, 89, 95, 60, 124, 112, 109, 111, 33, 57, 52, 115, 28, 35, 127, 87, 46, 113, 61, 92, 123, 51, 117, 122, 49, 118, 116, 63, 53, 54, 125, 101, 96, 50, 119, 121, 62, 105, 126, 110, 48, 41, 114, 120, 107, 86, 106, 43, 42, 44, 40, 108, 104, 56, 85, 36, 16, 37, 19, 98, 82, 103, 23, 20, 39, 12, 34, 84, 18, 91, 100, 15, 93, 81, 32, 22, 27, 99, 97, 102, 9, 11, 31, 29, 24, 78, 17, 25, 90, 7, 26, 88, 21, 71, 30, 79, 75, 14, 73, 76, 4, 77, 69, 5, 2, 13, 83, 68, 74, 1, 80, 66, 65, 10, 64, 0, 6, 8, 70, 67, 3, 72], [45, 109, 47, 52, 38, 101, 53, 60, 58, 94, 61, 115, 92, 59, 62, 46, 126, 23, 34, 116, 28, 111, 51, 119, 118, 108, 87, 33, 54, 57, 50, 112, 110, 63, 49, 85, 127, 123, 113, 117, 48, 30, 42, 44, 114, 89, 21, 41, 122, 103, 36, 40, 102, 55, 121, 120, 43, 35, 39, 107, 125, 37, 106, 104, 29, 105, 124, 99, 95, 100, 56, 93, 27, 96, 26, 97, 31, 78, 12, 32, 25, 24, 98, 16, 80, 18, 9, 88, 91, 86, 73, 82, 90, 81, 7, 20, 22, 84, 17, 11, 76, 83, 79, 19, 14, 75, 15, 69, 77, 71, 6, 74, 66, 10, 68, 4, 5, 2, 70, 67, 13, 0, 3, 72, 65, 64, 1, 8], [45, 38, 58, 19, 89, 94, 13, 59, 16, 74, 109, 35, 70, 87, 8, 3, 47, 6, 33, 67, 95, 30, 52, 111, 0, 55, 64, 72, 124, 1, 85, 77, 92, 84, 17, 65, 88, 69, 15, 10, 5, 81, 23, 21, 83, 82, 80, 28, 22, 20, 27, 78, 115, 25, 4, 31, 66, 76, 9, 11, 90, 32, 101, 2, 18, 75, 79, 37, 14, 96, 57, 126, 71, 63, 73, 12, 26, 68, 7, 29, 86, 98, 34, 24, 39, 91, 93, 108, 50, 125, 122, 61, 62, 105, 44, 104, 103, 116, 60, 100, 118, 121, 36, 97, 43, 99, 112, 120, 117, 51, 54, 40, 49, 56, 107, 123, 42, 106, 41, 113, 48, 53, 110, 119, 114, 127, 46, 102], [40, 50, 109, 98, 126, 29, 51, 84, 23, 80, 13, 18, 71, 73, 53, 75, 26, 56, 47, 90, 104, 2, 87, 66, 4, 111, 72, 93, 69, 68, 112, 39, 86, 94, 70, 1, 20, 0, 120, 11, 22, 19, 65, 74, 30, 103, 101, 64, 5, 25, 96, 59, 43, 119, 60, 16, 67, 89, 34, 76, 31, 77, 8, 36, 95, 38, 78, 117, 57, 113, 108, 100, 32, 6, 62, 121, 27, 45, 105, 124, 127, 7, 102, 21, 88, 33, 55, 97, 48, 15, 85, 99, 41, 118, 44, 28, 110, 83, 91, 35, 92, 58, 63, 10, 52, 12, 61, 46, 42, 17, 24, 115, 116, 82, 3, 123, 114, 81, 37, 107, 14, 9, 54, 122, 125, 49, 106, 79], [40, 126, 29, 98, 109, 87, 47, 39, 50, 23, 93, 84, 20, 75, 36, 90, 125, 58, 52, 124, 26, 59, 13, 121, 119, 56, 54, 115, 117, 53, 81, 18, 57, 108, 118, 34, 17, 103, 31, 122, 88, 120, 104, 63, 60, 127, 61, 107, 111, 105, 97, 110, 72, 44, 21, 51, 42, 46, 48, 24, 15, 32, 116, 123, 41, 45, 94, 70, 30, 43, 114, 49, 112, 99, 55, 106, 113, 80, 37, 38, 62, 85, 35, 100, 25, 92, 22, 101, 95, 79, 96, 102, 33, 14, 77, 27, 19, 83, 11, 4, 89, 28, 12, 91, 16, 86, 6, 82, 8, 76, 78, 10, 74, 73, 66, 9, 1, 68, 2, 65, 7, 3, 71, 5, 64, 69, 0, 67], [40, 126, 98, 109, 29, 60, 84, 90, 51, 87, 93, 124, 47, 23, 80, 50, 26, 18, 75, 59, 54, 34, 117, 36, 39, 88, 20, 119, 118, 53, 121, 13, 63, 114, 58, 103, 22, 123, 125, 127, 37, 55, 44, 122, 52, 45, 46, 111, 56, 115, 105, 21, 48, 104, 110, 107, 62, 120, 94, 92, 102, 61, 96, 108, 57, 112, 30, 38, 91, 49, 113, 43, 72, 70, 24, 12, 116, 35, 41, 42, 106, 97, 89, 86, 32, 31, 99, 100, 101, 16, 17, 85, 28, 79, 19, 33, 25, 73, 81, 82, 76, 15, 27, 95, 83, 14, 78, 4, 11, 10, 9, 6, 74, 8, 3, 69, 68, 67, 7, 77, 5, 71, 65, 1, 66, 2, 0, 64], [40, 51, 98, 29, 126, 13, 80, 18, 87, 84, 72, 23, 90, 4, 75, 93, 96, 26, 70, 60, 10, 73, 88, 124, 52, 19, 94, 121, 56, 125, 71, 54, 86, 30, 119, 81, 65, 117, 20, 109, 47, 59, 122, 77, 28, 50, 53, 111, 46, 16, 3, 61, 104, 105, 127, 112, 63, 74, 108, 78, 64, 15, 101, 68, 32, 39, 7, 91, 5, 89, 95, 97, 79, 67, 82, 11, 83, 58, 8, 118, 41, 22, 123, 106, 14, 2, 115, 120, 33, 76, 6, 62, 31, 34, 48, 69, 49, 113, 55, 116, 27, 12, 38, 9, 21, 100, 36, 44, 37, 110, 25, 57, 43, 92, 45, 17, 1, 103, 99, 24, 35, 66, 85, 102, 114, 42, 0, 107], [113, 115, 105, 57, 49, 63, 124, 53, 119, 34, 30, 39, 112, 48, 56, 127, 123, 61, 58, 45, 59, 117, 60, 116, 52, 120, 122, 62, 47, 27, 38, 118, 110, 107, 37, 108, 54, 121, 55, 94, 51, 125, 46, 111, 106, 85, 126, 44, 88, 114, 99, 104, 109, 87, 101, 90, 42, 43, 89, 36, 25, 24, 19, 103, 78, 18, 28, 40, 50, 92, 23, 41, 100, 35, 80, 83, 91, 22, 98, 33, 102, 31, 93, 21, 32, 14, 97, 26, 74, 95, 96, 6, 17, 16, 82, 65, 29, 12, 76, 20, 70, 81, 66, 64, 67, 72, 8, 10, 3, 0, 69, 86, 15, 11, 1, 2, 5, 13, 7, 68, 9, 75, 77, 4, 79, 73, 71, 84], [115, 61, 105, 113, 34, 30, 50, 124, 121, 120, 37, 38, 48, 63, 57, 126, 122, 53, 55, 112, 119, 27, 123, 49, 56, 59, 92, 60, 101, 62, 58, 127, 52, 46, 47, 45, 39, 118, 90, 110, 107, 44, 85, 111, 54, 125, 94, 116, 117, 108, 109, 103, 99, 87, 25, 114, 51, 40, 106, 41, 42, 33, 43, 89, 88, 104, 93, 96, 80, 32, 23, 91, 35, 36, 19, 31, 95, 102, 100, 98, 82, 78, 29, 28, 83, 21, 26, 18, 16, 97, 24, 22, 70, 17, 74, 6, 14, 64, 76, 0, 65, 20, 66, 1, 3, 69, 12, 86, 8, 81, 67, 2, 5, 10, 13, 72, 11, 15, 7, 9, 71, 4, 79, 68, 84, 73, 77, 75], [105, 113, 61, 57, 34, 115, 22, 20, 27, 30, 55, 41, 24, 17, 63, 53, 80, 13, 37, 59, 74, 38, 82, 18, 15, 11, 124, 123, 21, 12, 19, 7, 56, 51, 97, 121, 122, 70, 88, 26, 62, 47, 49, 103, 75, 99, 117, 9, 90, 116, 106, 31, 95, 36, 52, 127, 119, 60, 23, 58, 71, 10, 118, 39, 120, 89, 100, 46, 72, 112, 33, 45, 83, 111, 2, 6, 91, 50, 44, 3, 28, 48, 29, 14, 114, 126, 109, 101, 77, 93, 40, 85, 107, 54, 104, 84, 64, 110, 102, 16, 73, 81, 96, 76, 66, 42, 32, 25, 108, 87, 86, 92, 79, 68, 78, 94, 5, 69, 43, 1, 98, 4, 125, 35, 65, 8, 0, 67], [105, 57, 115, 113, 53, 34, 27, 22, 20, 15, 30, 41, 63, 9, 17, 24, 13, 49, 127, 74, 120, 4, 94, 55, 39, 116, 61, 37, 85, 1, 68, 3, 47, 36, 11, 87, 76, 73, 29, 72, 78, 77, 28, 31, 6, 46, 118, 123, 48, 79, 88, 95, 58, 119, 93, 103, 60, 26, 8, 18, 107, 84, 54, 38, 92, 104, 124, 35, 90, 112, 52, 43, 83, 110, 56, 40, 100, 121, 45, 109, 126, 23, 42, 59, 106, 122, 10, 96, 21, 50, 44, 62, 117, 125, 108, 81, 64, 111, 32, 114, 33, 86, 89, 12, 70, 25, 80, 51, 97, 99, 5, 16, 19, 14, 75, 66, 91, 98, 69, 65, 101, 67, 82, 102, 0, 7, 71, 2], [108, 37, 126, 88, 19, 90, 78, 32, 75, 81, 44, 8, 5, 36, 93, 86, 85, 67, 21, 1, 24, 34, 61, 22, 0, 16, 15, 83, 18, 12, 11, 72, 20, 69, 119, 58, 80, 65, 17, 25, 28, 14, 89, 13, 84, 87, 103, 23, 73, 10, 2, 98, 77, 74, 79, 35, 91, 27, 29, 30, 3, 76, 9, 94, 71, 82, 46, 26, 100, 118, 127, 68, 70, 95, 97, 125, 31, 53, 6, 7, 51, 92, 106, 64, 45, 52, 40, 66, 54, 4, 120, 122, 33, 113, 107, 117, 60, 115, 63, 99, 57, 123, 59, 105, 39, 42, 96, 111, 62, 104, 110, 43, 50, 114, 116, 102, 124, 109, 55, 47, 121, 101, 38, 48, 41, 49, 56, 112], [126, 108, 37, 44, 23, 58, 120, 57, 113, 51, 48, 61, 122, 63, 114, 118, 56, 62, 115, 116, 102, 104, 59, 32, 52, 119, 93, 124, 55, 53, 46, 54, 60, 125, 121, 101, 49, 100, 117, 110, 50, 103, 47, 45, 111, 123, 41, 109, 112, 40, 127, 106, 16, 107, 27, 42, 43, 90, 105, 39, 38, 91, 36, 35, 92, 84, 98, 99, 97, 95, 87, 33, 94, 34, 30, 25, 86, 88, 96, 29, 31, 85, 81, 28, 17, 20, 80, 89, 22, 74, 10, 2, 19, 24, 12, 26, 83, 82, 15, 0, 6, 65, 4, 21, 76, 66, 18, 70, 71, 13, 79, 78, 11, 67, 64, 7, 5, 9, 77, 69, 68, 3, 1, 73, 14, 75, 8, 72], [108, 61, 126, 44, 36, 119, 90, 85, 32, 122, 82, 93, 35, 24, 88, 37, 104, 58, 7, 19, 12, 15, 81, 13, 62, 91, 29, 101, 59, 95, 28, 63, 86, 46, 51, 20, 23, 79, 27, 120, 48, 78, 53, 57, 60, 96, 109, 87, 98, 116, 97, 115, 74, 89, 45, 71, 4, 18, 118, 114, 70, 66, 99, 52, 49, 33, 124, 64, 54, 125, 25, 127, 113, 56, 107, 117, 100, 34, 55, 9, 75, 41, 121, 26, 42, 92, 47, 31, 110, 30, 94, 123, 38, 16, 39, 112, 50, 76, 40, 102, 84, 6, 111, 106, 105, 65, 43, 103, 80, 21, 77, 2, 68, 22, 8, 83, 3, 11, 17, 0, 10, 5, 1, 73, 67, 69, 14, 72], [108, 44, 37, 126, 93, 23, 90, 58, 81, 24, 32, 61, 36, 85, 120, 19, 123, 41, 56, 86, 31, 29, 96, 118, 114, 110, 16, 78, 125, 112, 54, 105, 113, 88, 33, 63, 38, 103, 122, 48, 106, 87, 51, 111, 42, 109, 62, 115, 52, 57, 30, 47, 124, 119, 50, 102, 53, 127, 97, 43, 116, 27, 60, 34, 55, 121, 35, 45, 46, 99, 107, 98, 94, 74, 117, 49, 40, 84, 59, 104, 91, 7, 28, 25, 80, 39, 95, 100, 92, 26, 75, 82, 76, 89, 101, 13, 17, 9, 22, 10, 77, 15, 18, 0, 79, 71, 2, 83, 8, 67, 21, 64, 66, 5, 20, 12, 11, 70, 4, 68, 6, 73, 65, 14, 1, 69, 3, 72], [56, 126, 101, 62, 60, 122, 37, 125, 116, 123, 88, 53, 119, 58, 55, 51, 118, 120, 115, 117, 63, 48, 50, 89, 46, 127, 114, 84, 54, 124, 121, 33, 61, 59, 57, 103, 49, 14, 52, 11, 34, 113, 47, 92, 110, 39, 95, 107, 104, 44, 112, 98, 109, 108, 45, 111, 100, 106, 41, 40, 43, 93, 42, 35, 26, 16, 20, 38, 91, 105, 36, 15, 94, 99, 102, 18, 12, 22, 31, 82, 96, 90, 28, 25, 72, 86, 80, 32, 30, 68, 24, 87, 5, 23, 29, 17, 74, 97, 2, 7, 21, 70, 27, 79, 77, 19, 83, 85, 76, 81, 6, 73, 78, 64, 65, 9, 13, 75, 3, 0, 8, 66, 10, 67, 71, 69, 4, 1], [56, 126, 101, 116, 39, 62, 37, 20, 91, 53, 84, 26, 123, 127, 14, 125, 60, 119, 114, 33, 46, 100, 103, 120, 48, 54, 122, 118, 59, 51, 124, 55, 88, 58, 121, 45, 11, 104, 115, 40, 110, 50, 41, 36, 25, 106, 107, 63, 89, 61, 47, 117, 35, 34, 108, 44, 57, 99, 111, 52, 105, 42, 43, 112, 109, 95, 113, 29, 38, 16, 98, 49, 82, 31, 102, 94, 32, 12, 87, 93, 96, 83, 90, 92, 72, 15, 27, 30, 24, 28, 80, 97, 18, 77, 74, 86, 73, 22, 5, 6, 7, 68, 81, 70, 19, 21, 23, 65, 78, 79, 66, 75, 85, 9, 76, 17, 64, 2, 13, 8, 71, 67, 0, 10, 4, 3, 69, 1], [56, 126, 101, 60, 84, 116, 26, 40, 11, 21, 93, 62, 89, 28, 2, 125, 109, 68, 122, 127, 92, 33, 53, 55, 31, 118, 37, 79, 51, 29, 88, 24, 90, 114, 119, 124, 63, 58, 47, 123, 57, 61, 80, 48, 59, 110, 117, 50, 0, 103, 75, 120, 23, 121, 91, 52, 15, 16, 41, 54, 49, 108, 18, 106, 32, 34, 8, 115, 46, 100, 113, 112, 96, 67, 111, 35, 44, 17, 20, 30, 85, 70, 95, 105, 81, 19, 94, 77, 7, 45, 43, 22, 42, 82, 38, 9, 39, 107, 36, 99, 13, 104, 74, 76, 98, 3, 69, 102, 83, 5, 6, 73, 14, 87, 1, 65, 25, 27, 12, 72, 86, 10, 78, 4, 71, 97, 66, 64], [126, 56, 101, 62, 37, 88, 122, 53, 123, 15, 103, 125, 116, 24, 60, 91, 119, 120, 118, 23, 124, 92, 114, 48, 58, 127, 51, 117, 55, 63, 89, 54, 12, 33, 46, 121, 94, 50, 25, 61, 84, 59, 14, 6, 39, 49, 115, 57, 113, 52, 44, 47, 100, 11, 90, 45, 110, 18, 34, 112, 111, 20, 9, 99, 107, 17, 36, 105, 87, 7, 109, 40, 108, 95, 81, 98, 72, 22, 96, 104, 42, 102, 26, 35, 3, 28, 38, 106, 31, 32, 5, 86, 93, 68, 41, 79, 30, 82, 43, 80, 27, 16, 70, 97, 29, 74, 19, 64, 65, 76, 21, 66, 85, 1, 67, 8, 78, 2, 77, 73, 10, 4, 83, 75, 0, 71, 13, 69], [120, 109, 104, 124, 34, 123, 127, 59, 27, 89, 58, 60, 30, 52, 56, 125, 114, 86, 91, 54, 126, 118, 78, 121, 61, 113, 53, 62, 29, 45, 51, 50, 88, 119, 63, 40, 35, 110, 122, 84, 116, 94, 117, 19, 25, 49, 42, 55, 57, 112, 48, 43, 46, 107, 111, 47, 39, 115, 37, 106, 100, 108, 105, 41, 36, 10, 101, 103, 96, 44, 102, 99, 33, 21, 38, 18, 97, 26, 32, 31, 92, 28, 98, 87, 90, 95, 15, 22, 93, 17, 20, 16, 82, 14, 23, 85, 83, 9, 69, 80, 79, 24, 12, 76, 74, 13, 81, 73, 64, 5, 65, 77, 1, 6, 11, 7, 75, 4, 71, 68, 70, 72, 2, 67, 0, 8, 66, 3], [120, 109, 104, 123, 52, 58, 30, 127, 121, 59, 54, 124, 60, 122, 126, 61, 114, 112, 53, 56, 125, 117, 118, 45, 62, 51, 116, 113, 63, 110, 89, 107, 50, 49, 119, 57, 48, 78, 55, 115, 111, 46, 106, 47, 43, 42, 105, 34, 38, 108, 102, 25, 44, 18, 87, 41, 40, 101, 39, 100, 99, 103, 37, 94, 36, 98, 91, 35, 97, 27, 96, 88, 10, 33, 86, 32, 93, 28, 31, 95, 19, 85, 29, 82, 92, 90, 16, 84, 22, 26, 23, 74, 14, 21, 64, 20, 65, 83, 69, 1, 79, 15, 80, 76, 6, 17, 0, 5, 9, 24, 12, 81, 4, 3, 70, 66, 2, 67, 13, 11, 7, 71, 68, 73, 8, 75, 72, 77], [104, 120, 34, 88, 109, 17, 13, 75, 15, 7, 9, 86, 84, 92, 66, 71, 91, 2, 124, 11, 85, 19, 30, 69, 5, 4, 27, 40, 45, 22, 74, 0, 98, 68, 20, 64, 81, 82, 77, 24, 83, 79, 18, 80, 87, 29, 73, 26, 21, 76, 70, 107, 1, 14, 23, 94, 3, 72, 93, 90, 35, 8, 25, 123, 16, 47, 12, 6, 89, 56, 65, 116, 67, 78, 10, 59, 127, 46, 63, 31, 96, 54, 28, 95, 32, 97, 113, 106, 58, 52, 42, 99, 100, 101, 126, 55, 118, 43, 33, 37, 36, 112, 60, 105, 122, 125, 121, 114, 62, 117, 115, 49, 38, 110, 108, 61, 50, 39, 57, 53, 48, 102, 51, 103, 44, 119, 41, 111], [109, 104, 120, 34, 123, 78, 27, 59, 88, 116, 30, 112, 54, 86, 84, 52, 127, 125, 10, 121, 89, 40, 29, 18, 58, 126, 124, 60, 91, 114, 55, 56, 46, 113, 50, 61, 87, 122, 53, 62, 25, 110, 115, 26, 98, 45, 118, 63, 117, 119, 49, 19, 43, 48, 15, 51, 39, 42, 57, 100, 47, 36, 97, 90, 35, 106, 108, 38, 32, 28, 96, 107, 99, 101, 14, 31, 44, 102, 33, 111, 94, 17, 16, 9, 37, 105, 41, 92, 95, 23, 93, 103, 83, 82, 13, 22, 21, 76, 85, 20, 24, 80, 4, 6, 74, 79, 69, 12, 0, 2, 75, 73, 72, 11, 7, 81, 5, 71, 64, 1, 77, 70, 68, 65, 67, 8, 66, 3]], "model.layers.17.self_attn.k_proj": [[42, 36, 114, 60, 86, 93, 91, 80, 83, 89, 11, 63, 14, 19, 124, 119, 0, 53, 125, 46, 9, 69, 113, 111, 59, 25, 56, 54, 120, 126, 76, 50, 47, 44, 57, 115, 123, 118, 127, 116, 117, 32, 45, 43, 15, 2, 33, 71, 62, 51, 48, 122, 101, 121, 110, 49, 4, 55, 108, 41, 52, 24, 103, 104, 66, 106, 40, 105, 112, 58, 102, 90, 74, 98, 107, 1, 109, 39, 97, 18, 30, 77, 61, 5, 35, 88, 12, 34, 31, 100, 87, 95, 38, 81, 37, 20, 23, 17, 99, 28, 21, 6, 96, 72, 82, 70, 8, 92, 27, 26, 84, 94, 10, 79, 85, 65, 78, 7, 67, 13, 68, 29, 3, 64, 22, 16, 73, 75], [126, 39, 93, 85, 15, 82, 24, 1, 76, 5, 73, 71, 64, 48, 112, 74, 26, 50, 3, 120, 116, 96, 35, 121, 119, 117, 77, 118, 57, 115, 110, 105, 41, 16, 113, 114, 63, 60, 97, 54, 55, 42, 125, 46, 56, 52, 43, 49, 6, 29, 84, 34, 4, 40, 59, 122, 66, 61, 32, 27, 123, 62, 53, 90, 47, 75, 124, 99, 19, 51, 107, 127, 72, 87, 78, 38, 86, 109, 44, 111, 58, 37, 70, 100, 108, 23, 45, 104, 101, 25, 14, 83, 22, 11, 36, 106, 89, 13, 68, 95, 28, 69, 33, 81, 94, 2, 17, 102, 98, 31, 91, 92, 0, 30, 80, 20, 10, 8, 88, 9, 65, 67, 18, 21, 7, 103, 12, 79], [109, 58, 102, 30, 59, 8, 45, 13, 74, 19, 25, 86, 16, 70, 3, 97, 89, 65, 99, 28, 23, 0, 46, 32, 49, 47, 69, 36, 127, 121, 12, 126, 94, 115, 18, 15, 62, 91, 51, 84, 118, 53, 61, 120, 111, 107, 63, 116, 35, 77, 117, 48, 125, 11, 114, 83, 43, 101, 20, 87, 9, 110, 24, 27, 57, 106, 50, 52, 54, 68, 78, 4, 98, 123, 112, 113, 124, 122, 56, 29, 105, 17, 44, 37, 55, 119, 85, 31, 42, 60, 34, 7, 108, 104, 71, 64, 41, 39, 103, 40, 2, 100, 80, 90, 81, 6, 66, 26, 93, 82, 21, 5, 10, 96, 14, 88, 95, 75, 33, 79, 72, 92, 67, 73, 22, 76, 1, 38], [104, 126, 34, 50, 51, 115, 93, 23, 84, 26, 114, 13, 80, 111, 18, 45, 75, 56, 124, 59, 53, 70, 105, 72, 48, 127, 65, 119, 4, 58, 81, 73, 122, 64, 55, 57, 125, 103, 118, 94, 123, 3, 49, 60, 121, 99, 63, 2, 30, 117, 61, 10, 109, 110, 106, 32, 120, 22, 46, 14, 29, 24, 108, 113, 7, 38, 62, 36, 12, 107, 87, 116, 92, 112, 95, 42, 52, 102, 41, 28, 44, 54, 90, 25, 39, 101, 83, 19, 31, 71, 100, 47, 86, 43, 91, 37, 21, 96, 35, 15, 27, 89, 76, 33, 98, 5, 97, 85, 78, 88, 17, 79, 9, 67, 69, 82, 1, 6, 0, 74, 77, 16, 8, 20, 68, 11, 66, 40], [41, 98, 57, 115, 113, 94, 24, 51, 11, 20, 15, 91, 13, 22, 17, 61, 27, 55, 49, 9, 63, 68, 62, 59, 85, 26, 25, 72, 127, 56, 53, 123, 19, 116, 6, 114, 101, 35, 99, 107, 7, 18, 102, 48, 80, 108, 28, 122, 109, 65, 104, 54, 87, 125, 16, 60, 118, 8, 84, 117, 66, 74, 47, 103, 82, 97, 43, 121, 111, 93, 120, 78, 45, 33, 119, 42, 52, 58, 44, 105, 110, 124, 112, 96, 36, 50, 40, 46, 100, 76, 126, 3, 32, 73, 95, 5, 106, 29, 77, 31, 64, 39, 92, 90, 0, 67, 37, 79, 75, 89, 21, 23, 69, 70, 12, 83, 88, 71, 38, 81, 34, 2, 86, 10, 30, 14, 4, 1], [44, 126, 86, 101, 108, 100, 96, 61, 29, 58, 78, 90, 57, 19, 8, 51, 63, 119, 75, 120, 62, 81, 55, 125, 5, 124, 114, 47, 117, 116, 59, 122, 118, 123, 53, 16, 67, 60, 46, 74, 111, 56, 52, 112, 115, 110, 54, 0, 50, 121, 49, 88, 106, 127, 48, 109, 113, 23, 34, 65, 104, 42, 85, 99, 105, 39, 45, 98, 9, 107, 43, 41, 27, 66, 83, 40, 103, 82, 25, 38, 4, 70, 35, 13, 36, 102, 1, 94, 92, 28, 30, 73, 31, 26, 20, 93, 33, 95, 77, 97, 2, 68, 89, 15, 24, 7, 3, 17, 91, 12, 21, 79, 76, 84, 64, 18, 6, 72, 71, 32, 22, 80, 37, 11, 10, 87, 14, 69], [126, 37, 56, 22, 97, 62, 51, 125, 122, 123, 59, 119, 61, 60, 53, 116, 127, 58, 118, 50, 63, 48, 121, 120, 117, 124, 52, 49, 57, 112, 114, 55, 113, 54, 92, 47, 104, 115, 46, 94, 111, 27, 109, 110, 44, 42, 108, 45, 107, 103, 38, 93, 33, 43, 106, 39, 105, 41, 89, 82, 40, 35, 102, 101, 80, 84, 36, 98, 96, 99, 88, 86, 28, 34, 19, 77, 79, 100, 91, 32, 95, 30, 74, 90, 25, 3, 29, 9, 31, 7, 5, 85, 87, 26, 17, 23, 15, 81, 64, 16, 83, 21, 76, 24, 14, 65, 75, 72, 12, 6, 18, 78, 13, 2, 68, 4, 73, 71, 20, 11, 8, 70, 10, 66, 67, 1, 69, 0], [120, 40, 45, 86, 98, 123, 54, 27, 109, 60, 17, 59, 127, 113, 94, 58, 7, 88, 61, 13, 126, 52, 56, 117, 114, 62, 63, 9, 15, 53, 121, 118, 119, 122, 48, 75, 34, 116, 110, 66, 57, 55, 49, 124, 50, 51, 115, 112, 111, 46, 43, 47, 125, 84, 19, 103, 42, 78, 44, 26, 10, 107, 18, 5, 39, 106, 89, 102, 65, 108, 64, 36, 31, 105, 41, 93, 16, 101, 38, 87, 21, 95, 69, 8, 6, 99, 67, 4, 35, 30, 33, 37, 72, 90, 28, 97, 96, 24, 100, 76, 20, 32, 11, 12, 29, 104, 85, 92, 23, 68, 80, 83, 3, 71, 70, 81, 91, 0, 25, 82, 77, 79, 14, 1, 74, 73, 22, 2]], "model.layers.17.self_attn.qk_proj": [[126, 120, 109, 58, 44, 114, 56, 60, 115, 45, 113, 108, 104, 29, 57, 42, 61, 51, 50, 59, 41, 40, 106, 55, 119, 22, 118, 123, 49, 34, 105, 124, 125, 116, 93, 87, 112, 86, 127, 62, 117, 90, 111, 24, 94, 98, 53, 63, 27, 80, 37, 16, 52, 47, 30, 89, 23, 100, 25, 26, 54, 121, 39, 88, 18, 91, 83, 13, 85, 82, 75, 101, 122, 19, 48, 21, 103, 77, 110, 35, 84, 11, 20, 15, 36, 73, 79, 17, 99, 102, 32, 96, 43, 107, 9, 46, 78, 81, 14, 92, 76, 72, 38, 12, 97, 69, 7, 8, 5, 74, 71, 3, 28, 6, 64, 67, 33, 10, 65, 1, 70, 0, 4, 95, 68, 66, 31, 2], [126, 120, 109, 56, 44, 58, 114, 60, 115, 45, 113, 108, 29, 57, 104, 42, 61, 59, 50, 51, 40, 106, 123, 41, 22, 119, 55, 118, 49, 105, 124, 125, 34, 86, 93, 53, 112, 47, 98, 87, 116, 37, 127, 111, 63, 90, 24, 94, 54, 52, 27, 101, 39, 62, 30, 26, 16, 25, 121, 80, 48, 23, 91, 18, 83, 103, 110, 100, 82, 35, 89, 13, 117, 21, 88, 85, 79, 19, 122, 20, 77, 11, 75, 36, 32, 84, 46, 102, 96, 99, 73, 78, 15, 9, 107, 76, 17, 92, 81, 97, 14, 43, 12, 72, 69, 5, 38, 74, 65, 71, 7, 10, 67, 8, 28, 0, 70, 33, 64, 1, 68, 3, 6, 31, 66, 4, 95, 2], [126, 120, 109, 56, 58, 44, 114, 60, 115, 45, 113, 108, 29, 104, 61, 42, 57, 51, 40, 59, 50, 106, 41, 55, 22, 123, 105, 124, 118, 119, 34, 86, 93, 90, 49, 125, 112, 98, 116, 127, 62, 39, 52, 26, 87, 27, 16, 121, 63, 47, 24, 100, 37, 101, 25, 94, 117, 30, 111, 53, 80, 91, 18, 122, 103, 48, 89, 54, 88, 82, 23, 32, 19, 83, 99, 85, 36, 15, 102, 35, 20, 21, 13, 77, 84, 79, 11, 73, 75, 110, 43, 96, 92, 46, 65, 9, 1, 17, 64, 5, 3, 78, 107, 70, 76, 14, 81, 97, 10, 8, 67, 74, 0, 33, 7, 38, 69, 12, 72, 71, 28, 4, 2, 6, 31, 68, 66, 95], [126, 120, 109, 58, 56, 44, 114, 60, 113, 115, 45, 108, 104, 29, 61, 57, 42, 51, 50, 40, 59, 41, 119, 106, 55, 49, 123, 105, 125, 118, 22, 34, 116, 124, 62, 52, 90, 93, 63, 53, 98, 39, 122, 86, 127, 94, 47, 87, 112, 16, 117, 121, 100, 27, 30, 37, 111, 85, 101, 25, 48, 80, 24, 88, 26, 54, 91, 18, 23, 15, 103, 89, 36, 19, 21, 110, 82, 35, 46, 102, 77, 20, 96, 84, 75, 79, 83, 13, 32, 99, 73, 43, 11, 67, 92, 17, 10, 65, 76, 70, 5, 78, 81, 14, 74, 9, 3, 1, 12, 64, 107, 97, 8, 0, 69, 38, 33, 7, 72, 6, 68, 28, 71, 4, 66, 2, 31, 95], [126, 120, 109, 58, 56, 44, 114, 60, 115, 113, 45, 108, 61, 57, 104, 42, 50, 59, 51, 29, 41, 40, 106, 123, 119, 105, 55, 124, 116, 62, 22, 49, 118, 34, 90, 47, 125, 86, 93, 98, 127, 112, 111, 87, 122, 39, 94, 63, 27, 16, 80, 117, 24, 100, 53, 88, 37, 89, 18, 121, 91, 52, 23, 48, 19, 83, 26, 82, 30, 25, 15, 54, 85, 77, 75, 20, 13, 35, 103, 110, 101, 102, 46, 84, 79, 21, 11, 36, 43, 9, 73, 3, 96, 78, 14, 17, 76, 81, 70, 32, 99, 8, 1, 10, 67, 92, 107, 7, 5, 69, 12, 97, 74, 0, 65, 6, 72, 64, 71, 28, 33, 68, 4, 38, 66, 31, 2, 95], [126, 120, 109, 56, 58, 44, 114, 60, 115, 113, 45, 108, 104, 29, 61, 57, 50, 42, 51, 59, 41, 40, 55, 22, 106, 123, 105, 49, 119, 124, 34, 116, 125, 118, 62, 86, 93, 87, 127, 63, 16, 98, 90, 27, 94, 24, 37, 80, 117, 47, 18, 112, 39, 111, 53, 88, 91, 85, 30, 13, 89, 19, 82, 26, 48, 122, 83, 121, 23, 54, 25, 84, 35, 15, 100, 79, 110, 75, 103, 77, 20, 21, 52, 101, 11, 46, 17, 102, 99, 96, 36, 9, 81, 97, 14, 12, 78, 73, 92, 32, 8, 10, 74, 107, 76, 43, 70, 5, 69, 7, 72, 71, 33, 65, 67, 28, 38, 1, 3, 64, 0, 6, 31, 95, 4, 68, 2, 66], [126, 120, 109, 58, 56, 44, 60, 114, 115, 113, 45, 108, 29, 57, 42, 104, 51, 61, 50, 106, 40, 59, 41, 123, 22, 55, 119, 124, 49, 105, 118, 34, 127, 125, 86, 87, 93, 24, 27, 53, 94, 47, 116, 90, 80, 37, 88, 62, 89, 111, 16, 98, 25, 63, 83, 121, 52, 19, 18, 84, 112, 117, 91, 122, 15, 54, 30, 26, 77, 39, 100, 23, 48, 75, 21, 82, 85, 110, 101, 13, 46, 35, 20, 36, 103, 102, 11, 99, 79, 96, 17, 14, 81, 78, 9, 12, 32, 73, 92, 10, 107, 76, 97, 43, 8, 33, 7, 28, 38, 69, 74, 72, 5, 3, 71, 1, 0, 31, 70, 67, 65, 68, 6, 64, 4, 95, 66, 2], [126, 120, 109, 58, 44, 56, 60, 114, 115, 113, 108, 45, 57, 29, 42, 61, 51, 104, 59, 50, 22, 123, 106, 41, 40, 55, 118, 119, 125, 53, 49, 86, 124, 90, 105, 93, 34, 62, 87, 27, 127, 94, 24, 25, 89, 88, 98, 116, 37, 112, 80, 47, 63, 101, 16, 52, 39, 111, 18, 91, 26, 100, 83, 23, 30, 117, 121, 19, 54, 122, 99, 15, 103, 85, 21, 13, 82, 84, 35, 77, 102, 20, 46, 48, 75, 36, 32, 110, 97, 79, 96, 81, 92, 73, 17, 11, 33, 14, 78, 9, 43, 28, 12, 107, 8, 69, 5, 71, 6, 10, 76, 7, 65, 72, 31, 0, 3, 74, 64, 67, 1, 38, 4, 95, 68, 66, 70, 2], [126, 120, 109, 58, 56, 44, 60, 114, 113, 115, 45, 108, 57, 29, 104, 42, 61, 51, 50, 59, 40, 106, 123, 41, 55, 22, 119, 86, 49, 34, 93, 105, 118, 127, 37, 47, 124, 122, 90, 53, 116, 94, 62, 25, 89, 87, 26, 111, 27, 63, 125, 24, 30, 98, 52, 121, 16, 88, 80, 18, 101, 112, 83, 48, 54, 23, 35, 46, 100, 91, 82, 39, 85, 15, 117, 103, 84, 21, 13, 20, 110, 36, 19, 102, 32, 11, 99, 77, 43, 75, 79, 78, 96, 17, 81, 97, 73, 92, 107, 14, 74, 9, 12, 10, 33, 76, 28, 6, 69, 71, 8, 38, 7, 5, 72, 64, 65, 1, 31, 0, 68, 67, 3, 95, 4, 66, 2, 70], [126, 120, 109, 58, 56, 44, 114, 60, 113, 115, 45, 108, 57, 29, 61, 104, 42, 51, 50, 40, 123, 41, 106, 62, 59, 119, 49, 22, 55, 86, 93, 127, 34, 52, 118, 124, 105, 37, 125, 121, 47, 116, 53, 94, 90, 87, 63, 54, 98, 30, 122, 111, 89, 101, 16, 80, 25, 39, 85, 112, 24, 88, 27, 117, 83, 100, 23, 26, 46, 82, 91, 13, 18, 11, 35, 77, 36, 48, 75, 79, 110, 21, 103, 20, 19, 84, 102, 99, 76, 81, 15, 96, 9, 73, 17, 78, 32, 43, 92, 107, 5, 8, 7, 97, 71, 69, 6, 72, 12, 14, 74, 28, 3, 65, 1, 10, 38, 0, 33, 67, 64, 4, 70, 95, 68, 66, 2, 31], [126, 120, 109, 58, 56, 44, 114, 60, 115, 45, 113, 108, 104, 29, 57, 42, 61, 51, 123, 41, 40, 59, 50, 55, 106, 125, 105, 34, 49, 22, 119, 118, 62, 124, 86, 53, 93, 80, 116, 112, 98, 90, 16, 47, 52, 127, 94, 37, 122, 111, 87, 89, 27, 63, 13, 24, 30, 18, 117, 39, 85, 54, 23, 103, 25, 88, 26, 15, 100, 91, 84, 19, 11, 75, 77, 83, 82, 121, 79, 20, 48, 101, 73, 21, 96, 110, 36, 9, 35, 46, 17, 5, 67, 6, 14, 81, 72, 92, 78, 32, 43, 102, 99, 1, 69, 71, 76, 7, 10, 12, 97, 74, 65, 3, 8, 0, 64, 70, 68, 38, 33, 107, 4, 28, 2, 66, 95, 31], [126, 120, 109, 58, 56, 44, 114, 60, 115, 108, 113, 45, 104, 29, 42, 57, 61, 50, 51, 40, 59, 106, 41, 123, 22, 55, 49, 34, 86, 125, 105, 118, 124, 119, 47, 62, 127, 52, 98, 80, 90, 53, 24, 93, 27, 87, 16, 116, 122, 117, 37, 89, 94, 26, 39, 121, 112, 88, 91, 13, 63, 18, 30, 111, 83, 103, 48, 19, 82, 85, 54, 11, 23, 101, 84, 100, 25, 79, 75, 21, 20, 15, 77, 35, 46, 36, 9, 110, 78, 14, 73, 81, 99, 102, 96, 12, 32, 43, 17, 71, 97, 92, 72, 33, 107, 10, 69, 7, 5, 76, 6, 74, 3, 67, 1, 8, 0, 38, 65, 70, 64, 28, 31, 4, 68, 2, 66, 95], [126, 120, 109, 56, 114, 58, 60, 44, 115, 113, 108, 45, 29, 57, 61, 42, 123, 51, 104, 50, 40, 106, 41, 59, 124, 125, 55, 119, 118, 22, 86, 105, 49, 93, 62, 122, 52, 37, 34, 90, 53, 94, 116, 87, 47, 127, 27, 24, 88, 98, 101, 89, 111, 30, 121, 112, 103, 54, 25, 82, 91, 117, 16, 80, 48, 46, 26, 100, 83, 18, 35, 110, 21, 85, 39, 19, 63, 96, 102, 99, 23, 13, 20, 36, 84, 32, 79, 97, 11, 81, 15, 77, 43, 75, 78, 92, 107, 12, 17, 73, 14, 33, 9, 10, 71, 28, 72, 5, 38, 74, 76, 70, 69, 8, 7, 31, 68, 67, 6, 95, 1, 65, 3, 0, 4, 66, 64, 2], [126, 120, 109, 56, 114, 58, 44, 60, 115, 108, 113, 45, 57, 29, 104, 51, 42, 61, 123, 50, 106, 40, 59, 119, 41, 118, 55, 22, 124, 86, 105, 125, 49, 34, 62, 127, 90, 116, 37, 93, 47, 53, 63, 54, 98, 52, 87, 30, 16, 111, 24, 117, 27, 94, 122, 80, 101, 100, 26, 25, 89, 121, 91, 112, 103, 88, 83, 23, 35, 39, 84, 13, 85, 48, 46, 19, 21, 18, 82, 11, 75, 79, 81, 99, 107, 9, 36, 102, 20, 32, 15, 77, 110, 43, 97, 78, 96, 73, 14, 71, 17, 5, 76, 70, 92, 10, 72, 64, 33, 12, 7, 65, 69, 0, 1, 74, 8, 3, 38, 67, 28, 4, 95, 31, 66, 68, 2, 6], [126, 120, 109, 58, 56, 44, 114, 60, 115, 113, 108, 45, 57, 29, 104, 42, 61, 51, 50, 123, 59, 106, 40, 41, 22, 55, 119, 49, 93, 105, 34, 118, 124, 98, 86, 87, 62, 125, 37, 80, 90, 47, 16, 52, 89, 53, 94, 111, 127, 122, 27, 39, 116, 24, 30, 54, 101, 46, 117, 112, 18, 100, 26, 21, 19, 91, 13, 82, 83, 11, 85, 103, 25, 75, 23, 84, 15, 63, 36, 81, 77, 20, 88, 107, 110, 73, 79, 121, 9, 35, 102, 17, 48, 78, 96, 99, 14, 12, 76, 92, 7, 32, 70, 10, 5, 97, 72, 71, 67, 43, 69, 8, 3, 64, 74, 33, 1, 0, 65, 38, 28, 4, 6, 68, 31, 2, 95, 66], [126, 120, 109, 56, 58, 44, 60, 114, 115, 113, 45, 108, 29, 57, 42, 61, 104, 50, 51, 59, 40, 123, 41, 55, 49, 119, 106, 105, 22, 34, 90, 124, 47, 62, 98, 116, 118, 125, 86, 117, 53, 111, 122, 37, 52, 127, 103, 30, 112, 24, 93, 94, 87, 26, 46, 54, 80, 39, 101, 16, 27, 63, 82, 25, 85, 13, 88, 19, 110, 91, 83, 89, 100, 11, 121, 18, 15, 75, 73, 21, 84, 36, 35, 23, 48, 77, 20, 79, 9, 102, 99, 96, 107, 81, 92, 78, 10, 12, 32, 14, 72, 76, 70, 17, 33, 69, 97, 43, 7, 67, 3, 8, 71, 65, 74, 38, 0, 1, 5, 64, 6, 28, 68, 4, 66, 95, 31, 2], [126, 120, 109, 114, 56, 44, 58, 60, 115, 108, 113, 45, 29, 57, 61, 42, 104, 51, 50, 40, 106, 123, 41, 59, 55, 49, 119, 118, 22, 86, 34, 105, 124, 62, 94, 52, 93, 116, 98, 125, 111, 90, 37, 47, 30, 53, 27, 117, 87, 127, 101, 39, 24, 122, 54, 63, 80, 112, 88, 25, 26, 91, 100, 89, 19, 82, 46, 48, 16, 21, 13, 85, 83, 103, 84, 35, 23, 121, 18, 96, 79, 102, 15, 36, 11, 110, 107, 99, 92, 20, 32, 75, 17, 12, 77, 81, 33, 43, 73, 78, 97, 9, 14, 10, 64, 69, 28, 71, 38, 76, 72, 74, 8, 5, 7, 65, 70, 67, 1, 0, 3, 95, 31, 4, 6, 2, 68, 66], [126, 120, 109, 58, 114, 44, 56, 60, 115, 113, 108, 45, 29, 57, 104, 42, 61, 51, 50, 106, 40, 123, 41, 59, 119, 55, 22, 49, 34, 105, 118, 93, 86, 62, 124, 125, 47, 111, 98, 90, 94, 52, 122, 26, 37, 116, 117, 87, 24, 63, 27, 30, 39, 53, 80, 112, 88, 85, 16, 89, 101, 127, 25, 91, 121, 100, 82, 18, 54, 83, 13, 110, 103, 46, 19, 23, 36, 11, 20, 102, 21, 35, 15, 32, 48, 99, 96, 12, 75, 79, 84, 77, 9, 92, 73, 17, 107, 81, 97, 78, 14, 43, 71, 33, 7, 69, 5, 74, 8, 72, 28, 38, 64, 67, 10, 1, 3, 0, 76, 65, 6, 70, 31, 68, 4, 66, 2, 95], [126, 120, 109, 58, 44, 60, 56, 114, 115, 113, 45, 108, 57, 104, 29, 61, 42, 51, 50, 40, 59, 123, 41, 124, 106, 119, 62, 22, 105, 49, 34, 118, 55, 37, 93, 47, 116, 52, 94, 98, 111, 90, 125, 87, 53, 122, 39, 100, 86, 127, 30, 112, 54, 121, 27, 80, 25, 26, 101, 117, 24, 18, 103, 35, 63, 16, 88, 110, 36, 89, 46, 23, 21, 83, 82, 91, 102, 85, 96, 19, 15, 75, 11, 48, 84, 13, 20, 99, 107, 79, 81, 73, 77, 32, 38, 92, 17, 97, 9, 12, 43, 76, 14, 78, 8, 1, 74, 71, 67, 6, 10, 5, 64, 7, 3, 69, 72, 28, 33, 65, 0, 70, 95, 68, 66, 31, 4, 2], [126, 120, 109, 58, 56, 114, 44, 60, 115, 113, 45, 108, 29, 57, 61, 104, 42, 51, 50, 40, 59, 41, 123, 22, 119, 106, 124, 49, 34, 55, 105, 118, 62, 125, 93, 98, 116, 52, 86, 94, 90, 37, 111, 112, 63, 47, 53, 16, 101, 87, 121, 127, 30, 25, 24, 27, 54, 26, 91, 80, 100, 103, 23, 122, 82, 89, 110, 18, 39, 48, 11, 85, 88, 83, 21, 19, 96, 46, 13, 117, 35, 84, 102, 15, 75, 107, 77, 99, 20, 36, 79, 92, 73, 81, 32, 14, 17, 9, 78, 97, 74, 76, 33, 10, 12, 8, 6, 5, 71, 43, 1, 7, 72, 38, 67, 28, 69, 65, 64, 3, 0, 95, 4, 70, 68, 31, 66, 2], [126, 120, 109, 58, 56, 114, 44, 60, 115, 113, 45, 108, 29, 104, 61, 42, 57, 51, 40, 50, 59, 123, 106, 49, 41, 55, 22, 62, 119, 124, 34, 118, 86, 105, 52, 125, 90, 93, 94, 116, 37, 63, 98, 53, 16, 47, 24, 87, 80, 117, 112, 111, 39, 30, 54, 26, 27, 91, 89, 88, 101, 82, 25, 23, 121, 48, 103, 13, 21, 85, 83, 127, 18, 100, 84, 20, 19, 110, 75, 11, 77, 36, 96, 79, 122, 102, 107, 46, 32, 15, 99, 35, 9, 17, 81, 92, 73, 12, 33, 43, 78, 14, 76, 10, 6, 71, 8, 97, 74, 7, 5, 72, 3, 28, 69, 0, 65, 1, 67, 38, 64, 31, 95, 70, 4, 68, 66, 2], [126, 120, 109, 58, 56, 114, 44, 60, 113, 115, 45, 108, 104, 29, 51, 61, 57, 42, 40, 106, 59, 50, 41, 62, 123, 119, 105, 124, 55, 118, 34, 49, 125, 86, 52, 22, 37, 87, 98, 93, 116, 47, 90, 121, 53, 63, 112, 111, 94, 117, 89, 16, 39, 24, 27, 127, 26, 88, 30, 110, 54, 122, 18, 80, 91, 101, 23, 83, 85, 48, 102, 82, 25, 21, 100, 20, 96, 19, 35, 13, 77, 46, 11, 36, 84, 107, 99, 17, 15, 79, 75, 92, 103, 12, 32, 33, 73, 81, 43, 0, 9, 78, 14, 67, 65, 64, 1, 5, 8, 10, 69, 72, 97, 6, 74, 76, 38, 71, 7, 28, 3, 4, 70, 31, 2, 68, 66, 95], [126, 120, 109, 58, 56, 44, 114, 60, 115, 113, 45, 108, 61, 29, 104, 57, 51, 42, 40, 50, 59, 123, 106, 41, 118, 62, 55, 22, 124, 119, 105, 49, 116, 86, 34, 93, 37, 98, 111, 94, 87, 90, 125, 53, 30, 117, 47, 112, 16, 121, 110, 52, 39, 54, 127, 27, 100, 103, 122, 91, 88, 63, 26, 101, 18, 89, 80, 24, 48, 19, 83, 20, 25, 82, 23, 21, 35, 36, 85, 107, 46, 77, 13, 99, 84, 102, 11, 32, 79, 96, 92, 75, 15, 17, 9, 12, 73, 33, 78, 81, 14, 43, 69, 3, 8, 97, 76, 7, 74, 10, 28, 38, 1, 65, 5, 6, 64, 68, 70, 71, 72, 0, 67, 31, 66, 95, 4, 2], [126, 120, 109, 56, 58, 114, 44, 60, 115, 113, 108, 45, 29, 104, 61, 42, 57, 51, 123, 50, 40, 106, 59, 41, 62, 22, 49, 118, 119, 55, 105, 124, 34, 116, 93, 47, 125, 86, 87, 98, 37, 63, 90, 52, 94, 111, 53, 30, 80, 91, 117, 24, 121, 88, 39, 16, 26, 18, 122, 127, 112, 100, 48, 27, 89, 19, 54, 101, 46, 85, 25, 110, 82, 20, 11, 83, 103, 15, 75, 23, 21, 36, 102, 35, 84, 77, 107, 96, 13, 79, 32, 73, 17, 92, 14, 76, 9, 99, 7, 12, 97, 81, 78, 8, 69, 43, 10, 38, 70, 33, 74, 5, 28, 71, 72, 67, 1, 0, 3, 65, 64, 68, 6, 95, 4, 31, 66, 2], [126, 120, 109, 58, 56, 44, 114, 60, 115, 113, 45, 108, 57, 29, 104, 42, 61, 51, 40, 123, 50, 106, 59, 41, 22, 55, 119, 105, 49, 118, 62, 34, 124, 93, 125, 86, 87, 121, 52, 47, 98, 39, 90, 116, 94, 37, 26, 111, 24, 63, 89, 127, 112, 117, 16, 54, 27, 18, 80, 30, 91, 82, 53, 88, 83, 122, 101, 100, 48, 23, 19, 21, 85, 77, 15, 20, 110, 75, 11, 13, 25, 84, 103, 35, 96, 36, 73, 17, 46, 79, 102, 9, 81, 76, 107, 92, 78, 32, 99, 74, 14, 12, 70, 43, 71, 7, 5, 72, 8, 69, 97, 10, 3, 38, 28, 67, 65, 33, 1, 64, 0, 4, 6, 31, 68, 95, 2, 66], [126, 120, 109, 56, 58, 44, 114, 60, 115, 113, 45, 108, 29, 104, 57, 42, 61, 123, 51, 50, 59, 40, 106, 119, 41, 49, 22, 118, 124, 105, 55, 34, 63, 62, 125, 121, 86, 116, 52, 90, 98, 93, 53, 87, 47, 24, 94, 54, 37, 39, 117, 101, 27, 26, 30, 127, 89, 111, 122, 112, 18, 48, 80, 16, 88, 91, 100, 25, 83, 35, 84, 23, 82, 19, 96, 36, 46, 110, 13, 21, 75, 85, 9, 102, 107, 103, 15, 11, 99, 92, 20, 77, 79, 73, 43, 76, 32, 97, 14, 17, 33, 78, 81, 74, 28, 38, 7, 10, 71, 5, 72, 70, 69, 12, 8, 67, 1, 65, 64, 0, 31, 95, 3, 68, 6, 66, 4, 2], [126, 120, 109, 58, 44, 56, 114, 60, 115, 108, 45, 113, 29, 104, 42, 57, 61, 51, 123, 50, 59, 40, 106, 119, 41, 124, 55, 118, 62, 125, 34, 22, 105, 90, 86, 53, 49, 116, 93, 94, 127, 52, 122, 37, 98, 47, 63, 117, 39, 112, 87, 111, 27, 54, 103, 30, 121, 101, 100, 48, 25, 88, 89, 18, 23, 24, 16, 26, 80, 83, 110, 91, 36, 84, 35, 99, 75, 82, 20, 21, 19, 46, 107, 13, 96, 43, 77, 102, 15, 85, 9, 11, 32, 17, 7, 79, 97, 14, 92, 78, 81, 73, 76, 33, 38, 72, 67, 74, 12, 69, 5, 70, 28, 3, 71, 31, 8, 64, 65, 10, 0, 1, 6, 68, 4, 95, 2, 66], [126, 120, 109, 58, 56, 44, 114, 60, 115, 113, 45, 108, 29, 104, 57, 42, 61, 51, 50, 40, 59, 106, 123, 41, 62, 22, 124, 118, 119, 49, 105, 55, 116, 87, 86, 47, 93, 34, 125, 122, 94, 121, 111, 98, 117, 63, 52, 37, 112, 90, 127, 30, 39, 80, 18, 24, 53, 16, 27, 89, 54, 26, 25, 101, 91, 83, 100, 13, 75, 84, 23, 19, 88, 82, 21, 48, 96, 110, 11, 35, 15, 79, 20, 36, 85, 77, 102, 107, 73, 99, 32, 9, 17, 81, 103, 46, 12, 14, 76, 78, 74, 10, 7, 5, 92, 97, 72, 1, 43, 69, 67, 71, 8, 64, 33, 6, 3, 65, 38, 70, 0, 4, 28, 68, 66, 95, 31, 2], [126, 120, 109, 58, 56, 44, 114, 60, 115, 113, 45, 108, 29, 61, 104, 57, 42, 40, 50, 51, 59, 123, 41, 106, 22, 49, 124, 105, 62, 55, 119, 118, 34, 86, 125, 63, 98, 47, 93, 116, 94, 111, 87, 30, 37, 90, 122, 16, 127, 112, 39, 27, 52, 100, 24, 117, 54, 80, 96, 91, 82, 101, 25, 26, 121, 89, 88, 18, 53, 110, 75, 46, 103, 83, 13, 85, 79, 48, 23, 19, 21, 35, 84, 36, 77, 11, 15, 102, 20, 73, 9, 43, 107, 14, 99, 81, 72, 76, 5, 92, 17, 32, 1, 74, 78, 97, 64, 65, 12, 10, 8, 0, 7, 3, 69, 6, 71, 38, 67, 33, 70, 28, 4, 66, 68, 95, 31, 2], [126, 120, 109, 56, 58, 44, 114, 115, 60, 113, 45, 108, 29, 104, 61, 51, 42, 57, 59, 50, 123, 40, 41, 106, 22, 118, 105, 49, 124, 119, 55, 34, 125, 86, 116, 93, 98, 87, 94, 47, 63, 53, 90, 127, 52, 37, 30, 117, 80, 24, 111, 112, 39, 62, 16, 54, 27, 121, 26, 82, 88, 25, 75, 83, 91, 13, 21, 100, 23, 89, 101, 18, 35, 48, 79, 96, 85, 77, 110, 122, 84, 46, 20, 11, 15, 19, 73, 103, 99, 36, 92, 107, 76, 14, 9, 78, 81, 43, 102, 97, 12, 17, 32, 72, 5, 6, 7, 10, 33, 74, 28, 69, 71, 8, 67, 3, 38, 1, 64, 65, 0, 70, 31, 4, 68, 2, 95, 66], [126, 120, 109, 56, 114, 58, 44, 115, 60, 113, 45, 108, 104, 29, 61, 42, 50, 57, 51, 123, 59, 40, 41, 106, 49, 118, 119, 55, 124, 105, 116, 22, 34, 125, 86, 52, 63, 112, 98, 62, 47, 30, 94, 87, 111, 127, 93, 39, 90, 37, 54, 101, 121, 27, 24, 26, 16, 53, 80, 100, 117, 122, 103, 91, 82, 23, 89, 48, 18, 96, 88, 85, 25, 75, 35, 13, 84, 15, 110, 19, 21, 9, 36, 73, 77, 83, 20, 79, 7, 11, 107, 99, 46, 14, 5, 6, 76, 102, 81, 32, 92, 78, 43, 12, 69, 17, 10, 97, 8, 74, 67, 72, 64, 0, 33, 65, 1, 71, 38, 3, 28, 4, 95, 68, 70, 31, 66, 2], [126, 120, 109, 44, 56, 114, 58, 60, 115, 108, 113, 45, 29, 104, 57, 61, 42, 51, 50, 59, 40, 41, 106, 22, 123, 55, 118, 116, 119, 125, 124, 34, 86, 105, 49, 62, 87, 93, 47, 98, 94, 37, 127, 111, 90, 27, 39, 53, 30, 80, 100, 24, 16, 52, 63, 117, 121, 112, 122, 26, 25, 19, 101, 91, 88, 54, 13, 23, 82, 75, 18, 48, 35, 89, 83, 85, 21, 15, 110, 84, 77, 79, 96, 20, 11, 103, 9, 73, 14, 46, 76, 36, 102, 107, 97, 43, 32, 99, 81, 92, 78, 7, 17, 6, 69, 12, 72, 8, 74, 28, 33, 38, 71, 5, 10, 67, 31, 1, 95, 68, 3, 4, 65, 64, 0, 70, 66, 2]], "model.layers.18.self_attn.q_proj": [[39, 52, 112, 113, 32, 26, 87, 85, 121, 49, 29, 90, 74, 79, 116, 70, 77, 20, 53, 93, 82, 80, 46, 44, 23, 125, 119, 0, 75, 57, 98, 124, 18, 55, 61, 68, 111, 3, 48, 60, 54, 123, 115, 110, 62, 106, 65, 126, 19, 7, 117, 107, 84, 99, 118, 38, 56, 96, 120, 42, 47, 43, 9, 50, 109, 8, 2, 40, 51, 58, 11, 105, 21, 17, 25, 97, 41, 101, 59, 102, 122, 35, 24, 45, 6, 127, 63, 89, 104, 94, 33, 37, 30, 114, 100, 28, 108, 66, 22, 34, 14, 86, 36, 31, 88, 27, 92, 83, 103, 15, 69, 91, 10, 95, 81, 5, 16, 76, 71, 72, 1, 78, 12, 73, 67, 13, 4, 64], [39, 113, 52, 110, 112, 49, 29, 46, 87, 85, 32, 116, 90, 53, 26, 80, 93, 60, 83, 92, 77, 125, 115, 108, 23, 44, 123, 79, 21, 31, 121, 98, 119, 56, 111, 88, 58, 8, 48, 124, 107, 61, 105, 122, 42, 101, 18, 74, 118, 126, 62, 100, 25, 57, 55, 33, 63, 104, 106, 97, 45, 43, 24, 70, 117, 54, 99, 35, 38, 41, 82, 89, 47, 114, 120, 59, 50, 40, 109, 51, 127, 102, 36, 86, 68, 12, 11, 28, 17, 78, 91, 96, 94, 37, 15, 22, 34, 84, 9, 95, 30, 20, 7, 76, 19, 13, 27, 72, 75, 6, 16, 81, 73, 14, 2, 4, 64, 71, 65, 3, 69, 10, 0, 103, 1, 67, 66, 5], [39, 113, 52, 49, 29, 85, 32, 87, 116, 90, 26, 121, 79, 60, 125, 110, 46, 93, 112, 119, 53, 44, 21, 77, 124, 111, 122, 59, 61, 96, 109, 118, 123, 80, 98, 84, 56, 115, 54, 82, 47, 55, 57, 18, 23, 8, 74, 108, 83, 101, 120, 50, 92, 43, 22, 88, 89, 37, 31, 48, 97, 127, 42, 41, 51, 11, 105, 38, 35, 91, 106, 45, 62, 58, 75, 107, 36, 104, 78, 70, 117, 33, 63, 28, 114, 126, 94, 15, 19, 86, 95, 102, 40, 30, 27, 34, 9, 99, 17, 100, 24, 16, 76, 68, 25, 20, 71, 7, 81, 65, 3, 14, 12, 103, 2, 13, 1, 0, 72, 73, 4, 67, 64, 10, 6, 5, 69, 66], [39, 113, 52, 112, 110, 49, 29, 32, 85, 116, 77, 122, 87, 26, 31, 98, 46, 93, 80, 48, 8, 119, 50, 83, 70, 79, 53, 18, 123, 120, 74, 44, 125, 111, 114, 60, 42, 58, 124, 88, 57, 100, 92, 21, 117, 118, 90, 56, 115, 23, 121, 61, 35, 7, 108, 45, 27, 109, 63, 101, 25, 37, 62, 43, 0, 59, 82, 51, 54, 28, 6, 86, 68, 99, 102, 36, 104, 107, 41, 97, 38, 47, 55, 106, 105, 126, 127, 69, 40, 33, 34, 12, 3, 94, 95, 13, 91, 11, 30, 78, 19, 24, 84, 22, 15, 96, 81, 89, 20, 65, 67, 66, 9, 2, 73, 75, 4, 71, 76, 72, 16, 14, 17, 10, 1, 64, 5, 103], [124, 63, 56, 100, 115, 36, 39, 110, 102, 122, 116, 111, 59, 54, 105, 58, 103, 57, 121, 120, 60, 49, 62, 127, 50, 45, 25, 117, 125, 47, 112, 123, 55, 48, 114, 53, 109, 93, 126, 35, 52, 42, 40, 113, 51, 101, 41, 34, 119, 46, 87, 118, 107, 108, 61, 32, 44, 99, 104, 43, 38, 106, 37, 80, 90, 96, 94, 20, 30, 98, 23, 24, 97, 33, 84, 27, 31, 95, 29, 73, 85, 91, 89, 19, 28, 22, 21, 92, 78, 26, 88, 11, 75, 16, 86, 18, 69, 83, 77, 82, 13, 7, 9, 14, 1, 71, 5, 6, 66, 17, 3, 2, 79, 81, 67, 4, 15, 70, 12, 72, 74, 10, 65, 76, 64, 0, 8, 68], [63, 124, 30, 39, 115, 100, 19, 36, 25, 96, 22, 12, 18, 17, 15, 99, 27, 89, 32, 13, 72, 24, 5, 103, 35, 74, 92, 6, 123, 122, 56, 45, 2, 54, 4, 1, 90, 120, 126, 28, 33, 71, 21, 121, 42, 85, 14, 113, 26, 98, 83, 125, 73, 84, 75, 11, 94, 108, 86, 117, 23, 69, 52, 9, 10, 0, 81, 20, 97, 7, 67, 101, 38, 112, 87, 118, 93, 60, 43, 29, 16, 57, 34, 47, 37, 78, 91, 107, 80, 82, 88, 3, 58, 105, 79, 44, 65, 127, 119, 48, 70, 59, 62, 95, 53, 40, 109, 51, 66, 76, 31, 102, 111, 104, 49, 77, 106, 110, 114, 50, 68, 64, 61, 55, 116, 41, 46, 8], [124, 63, 100, 36, 120, 98, 103, 38, 25, 96, 32, 84, 19, 87, 97, 45, 94, 101, 126, 39, 22, 56, 51, 89, 117, 59, 121, 58, 113, 27, 21, 17, 62, 34, 54, 31, 23, 107, 40, 86, 52, 33, 13, 47, 111, 48, 46, 112, 102, 90, 122, 127, 99, 15, 60, 24, 57, 43, 91, 125, 83, 116, 29, 118, 44, 26, 28, 119, 110, 114, 30, 88, 109, 106, 49, 61, 93, 123, 105, 115, 35, 55, 53, 108, 12, 80, 85, 37, 78, 42, 20, 11, 104, 41, 81, 18, 95, 92, 50, 7, 14, 16, 82, 5, 77, 6, 74, 79, 75, 73, 72, 69, 76, 4, 2, 71, 9, 64, 68, 70, 8, 10, 66, 65, 1, 67, 3, 0], [124, 63, 24, 100, 21, 30, 18, 17, 12, 22, 15, 39, 115, 8, 74, 72, 27, 4, 90, 96, 79, 84, 13, 92, 76, 36, 1, 25, 94, 122, 68, 64, 75, 26, 77, 51, 121, 0, 6, 7, 3, 83, 2, 33, 85, 69, 81, 31, 10, 28, 70, 19, 86, 89, 46, 14, 5, 78, 67, 11, 82, 66, 20, 16, 71, 23, 80, 102, 88, 65, 73, 87, 93, 29, 43, 111, 103, 95, 32, 9, 127, 97, 35, 91, 98, 114, 126, 42, 125, 44, 37, 99, 56, 40, 61, 45, 120, 52, 58, 34, 104, 105, 119, 50, 101, 118, 38, 53, 59, 108, 107, 123, 55, 106, 112, 41, 49, 116, 60, 57, 109, 110, 113, 48, 54, 117, 47, 62], [55, 102, 118, 123, 46, 97, 61, 62, 53, 116, 106, 21, 120, 27, 57, 51, 115, 112, 126, 58, 122, 119, 48, 49, 63, 114, 91, 124, 50, 45, 31, 109, 52, 117, 24, 99, 110, 127, 60, 95, 41, 125, 121, 54, 113, 56, 103, 111, 59, 14, 47, 108, 104, 44, 38, 87, 88, 40, 18, 28, 19, 78, 70, 6, 42, 39, 26, 105, 107, 92, 86, 43, 101, 36, 37, 100, 93, 98, 96, 35, 34, 32, 65, 33, 82, 23, 94, 25, 30, 29, 0, 89, 16, 84, 90, 85, 22, 76, 80, 64, 17, 1, 3, 68, 20, 4, 83, 67, 11, 12, 8, 15, 66, 81, 2, 71, 72, 5, 9, 75, 7, 69, 74, 79, 77, 73, 13, 10], [118, 102, 55, 123, 48, 97, 53, 62, 61, 27, 46, 24, 57, 91, 51, 21, 116, 120, 31, 38, 122, 126, 115, 50, 49, 119, 124, 112, 58, 114, 63, 110, 40, 117, 95, 106, 109, 52, 54, 113, 99, 121, 18, 127, 56, 86, 88, 125, 45, 47, 111, 41, 60, 59, 104, 14, 44, 39, 43, 103, 108, 100, 107, 42, 92, 33, 105, 26, 101, 94, 98, 37, 36, 32, 78, 19, 35, 70, 34, 96, 29, 93, 30, 25, 28, 22, 17, 23, 84, 87, 82, 90, 6, 16, 85, 89, 80, 83, 20, 76, 12, 65, 15, 81, 3, 0, 1, 8, 75, 9, 71, 73, 11, 68, 79, 67, 64, 5, 4, 66, 77, 7, 2, 69, 74, 72, 10, 13], [102, 55, 118, 97, 123, 86, 24, 84, 46, 15, 27, 82, 17, 88, 77, 90, 74, 18, 48, 11, 72, 76, 30, 38, 120, 91, 54, 8, 95, 69, 92, 106, 93, 62, 71, 104, 22, 5, 16, 3, 20, 9, 81, 29, 14, 119, 75, 10, 31, 7, 116, 13, 19, 79, 21, 89, 45, 25, 85, 26, 126, 83, 28, 80, 23, 12, 67, 122, 101, 94, 40, 1, 87, 33, 108, 98, 70, 124, 49, 78, 109, 61, 96, 73, 36, 64, 65, 53, 68, 6, 114, 32, 103, 34, 0, 66, 57, 4, 37, 35, 60, 2, 107, 99, 100, 59, 110, 44, 125, 58, 41, 113, 105, 42, 47, 39, 112, 43, 50, 111, 51, 117, 56, 63, 52, 127, 121, 115], [102, 118, 55, 97, 123, 84, 24, 86, 17, 46, 27, 15, 31, 62, 91, 14, 76, 90, 18, 38, 88, 73, 95, 22, 79, 101, 82, 120, 81, 53, 61, 48, 11, 54, 12, 104, 75, 80, 19, 74, 49, 106, 28, 83, 20, 71, 29, 119, 122, 6, 116, 23, 85, 51, 45, 30, 77, 10, 25, 126, 69, 96, 87, 21, 9, 89, 112, 16, 26, 2, 108, 92, 93, 7, 32, 57, 5, 13, 94, 109, 124, 67, 65, 35, 58, 98, 115, 125, 37, 114, 78, 72, 103, 8, 34, 63, 50, 59, 52, 110, 44, 99, 100, 111, 40, 60, 113, 127, 121, 47, 3, 56, 39, 66, 117, 36, 0, 42, 68, 41, 105, 70, 107, 43, 64, 1, 33, 4], [43, 36, 54, 96, 63, 60, 89, 20, 107, 82, 126, 50, 87, 115, 80, 62, 25, 21, 9, 52, 16, 61, 32, 79, 13, 48, 101, 112, 47, 75, 53, 39, 57, 29, 85, 41, 69, 71, 103, 122, 119, 111, 19, 106, 105, 35, 104, 45, 18, 84, 55, 86, 23, 40, 127, 94, 46, 59, 125, 99, 88, 44, 93, 121, 28, 26, 90, 38, 100, 58, 92, 97, 51, 78, 6, 98, 110, 108, 120, 33, 10, 37, 114, 15, 74, 27, 123, 124, 7, 116, 56, 17, 102, 83, 113, 118, 24, 49, 67, 77, 95, 12, 8, 11, 109, 34, 31, 14, 22, 72, 30, 91, 117, 81, 76, 42, 66, 73, 4, 1, 5, 2, 70, 64, 3, 0, 65, 68], [43, 60, 96, 36, 63, 107, 89, 48, 123, 20, 126, 82, 21, 119, 46, 111, 25, 19, 85, 44, 80, 26, 42, 55, 112, 23, 13, 51, 122, 79, 9, 45, 61, 54, 18, 75, 41, 53, 59, 87, 16, 62, 50, 39, 105, 69, 52, 57, 103, 38, 40, 106, 116, 27, 121, 113, 101, 71, 114, 92, 109, 84, 49, 32, 58, 115, 35, 104, 95, 117, 124, 108, 125, 56, 86, 33, 11, 90, 127, 37, 81, 110, 88, 98, 29, 99, 83, 118, 34, 100, 30, 102, 24, 7, 120, 93, 97, 17, 91, 47, 94, 77, 1, 31, 22, 28, 8, 15, 0, 14, 72, 3, 73, 78, 67, 12, 10, 6, 2, 74, 5, 68, 76, 70, 66, 4, 64, 65], [43, 96, 54, 36, 63, 47, 20, 89, 107, 25, 60, 119, 82, 9, 80, 16, 87, 21, 69, 79, 18, 111, 28, 71, 13, 48, 55, 112, 75, 120, 53, 38, 32, 126, 19, 61, 123, 41, 62, 50, 115, 51, 84, 85, 57, 110, 104, 40, 103, 39, 45, 29, 67, 106, 122, 125, 1, 118, 59, 95, 26, 37, 116, 44, 64, 121, 56, 127, 76, 58, 100, 113, 108, 49, 7, 114, 102, 65, 105, 52, 83, 46, 94, 109, 124, 117, 91, 86, 90, 101, 99, 34, 92, 23, 30, 35, 22, 42, 73, 98, 27, 93, 4, 97, 33, 6, 14, 66, 0, 72, 81, 74, 78, 15, 88, 31, 11, 5, 24, 10, 3, 68, 2, 17, 8, 77, 12, 70], [43, 54, 96, 36, 60, 89, 107, 82, 20, 87, 75, 63, 25, 80, 79, 71, 111, 13, 9, 47, 29, 69, 85, 16, 21, 77, 126, 97, 40, 32, 41, 38, 92, 106, 18, 119, 115, 28, 34, 117, 53, 121, 67, 19, 44, 27, 46, 57, 109, 58, 50, 103, 14, 48, 101, 10, 84, 35, 55, 122, 45, 93, 65, 95, 1, 114, 91, 23, 118, 51, 12, 37, 2, 83, 112, 15, 104, 88, 94, 31, 78, 123, 62, 30, 66, 59, 116, 99, 90, 11, 100, 64, 26, 61, 42, 127, 22, 56, 113, 33, 49, 7, 120, 24, 74, 81, 76, 73, 39, 110, 125, 98, 8, 124, 52, 17, 108, 5, 86, 4, 102, 105, 6, 70, 72, 68, 0, 3], [120, 104, 59, 113, 109, 51, 62, 110, 22, 56, 88, 57, 119, 112, 90, 52, 83, 41, 21, 114, 36, 98, 55, 116, 107, 50, 103, 29, 115, 123, 93, 54, 95, 43, 38, 42, 124, 49, 60, 122, 99, 45, 117, 105, 126, 102, 61, 48, 106, 58, 33, 30, 127, 108, 39, 118, 47, 125, 44, 46, 111, 53, 27, 121, 63, 101, 37, 97, 35, 28, 100, 92, 31, 96, 80, 10, 32, 34, 79, 86, 40, 74, 17, 25, 91, 94, 81, 19, 76, 23, 85, 77, 84, 26, 13, 24, 89, 14, 15, 20, 71, 87, 72, 82, 16, 18, 7, 12, 11, 67, 69, 3, 9, 78, 5, 6, 75, 8, 1, 70, 65, 68, 73, 0, 66, 4, 2, 64], [104, 120, 59, 113, 98, 88, 18, 80, 65, 22, 6, 76, 93, 9, 110, 15, 14, 62, 68, 40, 112, 119, 72, 57, 83, 84, 97, 32, 51, 28, 36, 100, 26, 123, 29, 1, 67, 82, 74, 52, 11, 5, 8, 24, 70, 81, 99, 61, 4, 27, 86, 21, 91, 90, 46, 19, 107, 75, 66, 117, 34, 55, 116, 33, 58, 77, 16, 124, 54, 37, 49, 114, 118, 50, 31, 23, 25, 60, 13, 42, 109, 127, 115, 53, 78, 38, 3, 87, 106, 17, 126, 89, 122, 30, 69, 101, 12, 20, 92, 108, 64, 85, 105, 71, 41, 56, 45, 95, 96, 35, 0, 7, 102, 94, 43, 125, 48, 44, 103, 10, 111, 47, 2, 63, 79, 73, 121, 39], [113, 104, 120, 59, 88, 98, 22, 93, 52, 51, 83, 57, 29, 21, 112, 80, 90, 119, 62, 110, 124, 114, 76, 28, 116, 81, 55, 123, 50, 11, 95, 18, 122, 117, 118, 15, 108, 54, 49, 42, 96, 115, 58, 61, 41, 85, 100, 39, 14, 56, 53, 47, 84, 46, 109, 60, 127, 44, 125, 91, 27, 92, 31, 36, 86, 48, 43, 103, 35, 26, 101, 126, 105, 97, 45, 40, 121, 106, 38, 82, 6, 99, 72, 111, 19, 17, 102, 65, 63, 13, 30, 79, 25, 74, 24, 107, 23, 33, 37, 94, 87, 10, 77, 32, 89, 9, 34, 12, 20, 5, 75, 68, 3, 8, 71, 66, 16, 69, 7, 67, 78, 0, 2, 1, 4, 64, 70, 73], [59, 104, 120, 113, 112, 57, 62, 51, 83, 36, 43, 55, 105, 89, 119, 52, 50, 98, 95, 60, 110, 124, 49, 126, 115, 90, 35, 121, 117, 122, 88, 45, 97, 92, 56, 109, 123, 21, 61, 118, 102, 54, 116, 58, 42, 46, 103, 127, 17, 107, 114, 47, 48, 44, 15, 125, 23, 108, 41, 106, 63, 74, 34, 53, 100, 39, 101, 32, 99, 37, 111, 38, 19, 10, 26, 29, 30, 96, 13, 93, 28, 24, 33, 25, 31, 81, 94, 84, 79, 91, 5, 67, 40, 85, 87, 3, 71, 20, 0, 77, 22, 65, 69, 27, 14, 64, 18, 7, 76, 11, 75, 66, 86, 1, 68, 9, 72, 6, 82, 80, 2, 8, 4, 12, 78, 73, 70, 16], [127, 54, 122, 125, 100, 62, 126, 119, 120, 41, 40, 36, 42, 28, 117, 106, 118, 56, 83, 115, 92, 112, 58, 124, 59, 113, 39, 55, 49, 46, 57, 111, 53, 123, 63, 86, 52, 116, 99, 61, 114, 121, 48, 104, 47, 60, 51, 50, 105, 107, 37, 103, 108, 82, 110, 43, 45, 44, 38, 109, 35, 15, 74, 94, 101, 33, 25, 102, 98, 29, 34, 31, 80, 20, 96, 89, 90, 71, 95, 77, 24, 79, 97, 32, 88, 67, 26, 22, 75, 30, 91, 93, 65, 19, 27, 18, 23, 21, 6, 14, 85, 87, 10, 7, 68, 66, 84, 0, 13, 76, 73, 17, 78, 4, 1, 64, 8, 81, 16, 2, 12, 11, 70, 9, 72, 3, 69, 5], [122, 54, 127, 40, 100, 125, 24, 90, 33, 31, 62, 120, 105, 99, 19, 36, 86, 26, 126, 28, 106, 119, 53, 25, 43, 81, 102, 41, 98, 20, 88, 87, 49, 69, 35, 94, 115, 85, 59, 76, 66, 73, 61, 32, 7, 104, 56, 91, 27, 52, 34, 15, 110, 23, 8, 57, 38, 83, 72, 22, 75, 114, 29, 58, 63, 12, 78, 5, 101, 3, 109, 113, 92, 117, 48, 123, 42, 60, 111, 51, 17, 124, 16, 116, 37, 70, 103, 74, 112, 93, 55, 121, 96, 47, 82, 10, 21, 13, 50, 11, 9, 14, 30, 95, 46, 118, 39, 1, 89, 4, 80, 108, 107, 45, 77, 6, 44, 67, 18, 79, 84, 2, 71, 64, 65, 68, 0, 97], [54, 122, 127, 100, 41, 40, 92, 33, 106, 120, 125, 126, 36, 24, 103, 28, 83, 86, 119, 117, 94, 32, 26, 99, 115, 15, 42, 62, 49, 31, 59, 90, 29, 21, 105, 57, 96, 88, 53, 104, 82, 124, 113, 56, 39, 58, 95, 63, 74, 112, 35, 38, 34, 16, 123, 114, 55, 118, 48, 80, 61, 50, 98, 52, 37, 22, 51, 121, 46, 89, 101, 27, 102, 23, 47, 97, 85, 19, 77, 116, 111, 60, 107, 20, 43, 30, 110, 25, 87, 91, 13, 79, 109, 81, 44, 93, 75, 108, 76, 18, 12, 45, 71, 78, 14, 11, 17, 84, 8, 6, 10, 73, 9, 66, 4, 67, 72, 65, 7, 0, 69, 68, 70, 1, 64, 3, 5, 2], [127, 122, 54, 100, 41, 40, 126, 86, 26, 33, 28, 120, 15, 36, 42, 43, 117, 119, 96, 125, 83, 106, 49, 53, 57, 59, 21, 92, 118, 39, 124, 38, 115, 88, 103, 62, 107, 56, 24, 58, 35, 19, 114, 46, 108, 95, 112, 61, 37, 48, 105, 110, 81, 52, 123, 47, 63, 55, 113, 102, 17, 104, 50, 60, 111, 97, 116, 31, 99, 34, 98, 121, 51, 101, 44, 30, 22, 45, 13, 84, 109, 82, 32, 74, 77, 10, 94, 29, 11, 18, 93, 90, 89, 91, 87, 78, 27, 23, 14, 71, 20, 7, 85, 25, 79, 80, 75, 16, 67, 76, 72, 73, 8, 6, 4, 66, 1, 65, 12, 68, 9, 0, 64, 3, 70, 69, 2, 5], [105, 98, 117, 86, 96, 28, 53, 20, 52, 116, 25, 41, 94, 38, 80, 126, 109, 56, 63, 119, 123, 45, 111, 82, 19, 14, 44, 83, 62, 118, 71, 75, 120, 89, 77, 87, 115, 54, 78, 8, 49, 21, 112, 79, 23, 48, 51, 10, 50, 58, 57, 102, 61, 30, 46, 55, 59, 37, 39, 124, 27, 114, 125, 6, 107, 100, 127, 16, 43, 40, 93, 31, 35, 101, 104, 29, 60, 103, 121, 26, 42, 97, 110, 2, 85, 122, 4, 95, 113, 106, 17, 33, 36, 99, 47, 108, 9, 90, 34, 24, 22, 91, 88, 84, 18, 7, 32, 64, 81, 1, 92, 76, 12, 11, 15, 67, 68, 13, 73, 70, 0, 74, 66, 3, 65, 72, 5, 69], [105, 98, 96, 86, 20, 117, 82, 53, 75, 25, 41, 123, 79, 80, 52, 77, 63, 8, 116, 28, 94, 6, 126, 10, 16, 118, 89, 2, 119, 112, 4, 59, 62, 56, 45, 115, 44, 91, 111, 3, 109, 11, 57, 120, 13, 30, 88, 84, 26, 23, 54, 48, 97, 7, 51, 38, 17, 93, 50, 22, 74, 127, 67, 29, 114, 83, 18, 0, 58, 14, 31, 95, 102, 21, 61, 85, 78, 70, 39, 76, 104, 81, 19, 40, 107, 55, 92, 36, 15, 5, 24, 60, 64, 101, 125, 35, 121, 124, 106, 27, 113, 68, 87, 32, 1, 37, 90, 33, 72, 43, 49, 47, 110, 73, 34, 69, 99, 66, 65, 9, 103, 46, 122, 12, 42, 100, 108, 71], [105, 98, 86, 96, 117, 82, 8, 53, 79, 25, 63, 20, 75, 80, 4, 77, 41, 94, 2, 64, 123, 62, 6, 89, 48, 116, 118, 0, 115, 52, 72, 28, 66, 1, 29, 68, 13, 126, 16, 71, 69, 70, 87, 44, 85, 10, 112, 30, 65, 51, 109, 45, 11, 59, 119, 56, 57, 50, 88, 5, 18, 7, 24, 26, 93, 19, 111, 120, 35, 22, 78, 17, 15, 127, 21, 84, 3, 74, 76, 58, 39, 114, 12, 9, 100, 92, 60, 106, 23, 73, 14, 83, 102, 99, 67, 27, 40, 38, 121, 55, 31, 95, 91, 97, 61, 49, 101, 113, 122, 36, 124, 34, 81, 47, 33, 37, 90, 46, 54, 110, 32, 125, 107, 42, 103, 104, 43, 108], [105, 98, 20, 28, 96, 25, 86, 82, 41, 117, 53, 123, 80, 94, 63, 116, 77, 75, 52, 118, 79, 126, 115, 16, 59, 56, 44, 8, 89, 111, 62, 119, 10, 50, 57, 17, 120, 48, 109, 112, 51, 71, 45, 127, 84, 18, 12, 13, 30, 38, 22, 23, 6, 91, 95, 31, 55, 83, 78, 81, 76, 88, 54, 27, 61, 60, 19, 39, 74, 93, 46, 33, 124, 24, 104, 106, 7, 49, 34, 37, 26, 114, 58, 4, 102, 43, 107, 122, 11, 99, 15, 9, 2, 125, 87, 97, 14, 85, 92, 36, 90, 29, 21, 110, 101, 72, 100, 32, 35, 121, 40, 42, 68, 103, 70, 113, 69, 108, 47, 3, 1, 0, 64, 66, 5, 73, 65, 67], [112, 105, 113, 125, 110, 41, 114, 89, 126, 104, 117, 54, 61, 27, 84, 127, 57, 59, 58, 56, 107, 123, 30, 91, 99, 48, 52, 51, 46, 111, 49, 108, 47, 55, 120, 92, 42, 115, 101, 119, 53, 94, 122, 43, 109, 23, 118, 44, 116, 62, 86, 25, 121, 124, 106, 50, 63, 60, 34, 102, 36, 45, 39, 37, 100, 40, 98, 33, 38, 35, 103, 90, 21, 97, 32, 15, 96, 28, 82, 95, 88, 31, 79, 18, 80, 29, 93, 81, 26, 87, 85, 17, 20, 22, 24, 83, 13, 12, 19, 78, 3, 14, 76, 69, 74, 75, 65, 77, 16, 11, 10, 9, 8, 71, 72, 73, 6, 5, 70, 7, 67, 68, 66, 1, 2, 4, 64, 0], [105, 112, 86, 30, 41, 84, 91, 80, 82, 78, 27, 76, 98, 99, 89, 94, 126, 113, 9, 10, 7, 110, 61, 114, 74, 34, 25, 52, 71, 35, 127, 14, 58, 125, 20, 49, 23, 109, 88, 67, 54, 69, 57, 59, 3, 123, 56, 117, 19, 47, 13, 46, 62, 5, 22, 111, 18, 108, 81, 48, 51, 118, 55, 77, 116, 17, 21, 85, 16, 107, 43, 120, 115, 119, 37, 79, 40, 32, 73, 50, 122, 92, 11, 60, 93, 24, 102, 83, 53, 124, 65, 63, 28, 121, 15, 106, 8, 4, 104, 97, 36, 44, 87, 64, 42, 45, 90, 12, 95, 38, 39, 29, 26, 103, 33, 75, 31, 101, 96, 72, 100, 2, 70, 66, 0, 1, 6, 68], [112, 105, 86, 91, 41, 30, 84, 25, 27, 58, 126, 89, 15, 62, 80, 113, 106, 82, 33, 111, 114, 110, 17, 127, 116, 104, 52, 124, 78, 94, 83, 117, 76, 49, 61, 46, 48, 39, 125, 59, 92, 57, 11, 56, 107, 10, 85, 47, 98, 36, 99, 54, 13, 51, 35, 123, 45, 7, 87, 60, 101, 23, 32, 119, 63, 108, 18, 22, 29, 6, 121, 120, 115, 44, 12, 109, 71, 55, 93, 21, 97, 40, 43, 96, 118, 20, 95, 72, 34, 16, 37, 31, 53, 122, 28, 42, 88, 50, 26, 38, 103, 90, 69, 2, 9, 79, 24, 102, 81, 100, 64, 14, 19, 67, 68, 75, 74, 5, 77, 8, 73, 4, 66, 1, 70, 65, 0, 3], [105, 112, 30, 84, 86, 89, 41, 98, 126, 82, 48, 113, 118, 123, 110, 25, 80, 35, 85, 45, 31, 91, 99, 52, 54, 94, 88, 61, 125, 78, 27, 58, 115, 9, 59, 127, 92, 116, 109, 117, 39, 114, 33, 17, 57, 12, 60, 32, 49, 121, 95, 76, 108, 6, 40, 73, 56, 111, 47, 74, 120, 18, 103, 51, 62, 26, 97, 79, 55, 53, 122, 75, 20, 106, 63, 19, 22, 107, 104, 93, 90, 119, 13, 42, 28, 124, 102, 46, 21, 3, 100, 101, 43, 50, 44, 83, 23, 37, 4, 36, 10, 87, 96, 72, 38, 24, 15, 14, 29, 34, 11, 16, 81, 77, 68, 1, 71, 69, 70, 67, 5, 8, 7, 65, 0, 66, 2, 64]], "model.layers.18.self_attn.k_proj": [[113, 52, 103, 110, 96, 93, 90, 87, 85, 18, 80, 112, 77, 49, 55, 44, 124, 74, 48, 119, 54, 79, 118, 53, 20, 70, 121, 57, 46, 62, 105, 61, 86, 56, 125, 51, 43, 34, 111, 116, 8, 126, 127, 109, 120, 115, 64, 123, 50, 47, 108, 117, 41, 58, 0, 68, 65, 59, 107, 102, 63, 45, 122, 42, 114, 9, 60, 40, 106, 104, 38, 19, 7, 83, 99, 37, 101, 2, 3, 30, 28, 76, 36, 98, 95, 1, 4, 92, 78, 81, 88, 97, 35, 33, 75, 25, 5, 11, 94, 100, 12, 24, 91, 89, 27, 73, 31, 66, 17, 69, 14, 72, 84, 16, 22, 15, 29, 67, 13, 71, 23, 21, 26, 82, 32, 6, 39, 10], [63, 124, 36, 22, 120, 15, 30, 17, 74, 103, 12, 56, 72, 18, 13, 54, 4, 32, 27, 24, 117, 62, 60, 51, 122, 64, 107, 46, 121, 2, 44, 58, 126, 21, 47, 118, 59, 109, 28, 61, 69, 57, 114, 45, 10, 39, 38, 119, 113, 79, 50, 53, 49, 55, 43, 48, 116, 40, 19, 125, 115, 108, 42, 123, 127, 106, 105, 110, 104, 8, 112, 52, 111, 41, 102, 1, 25, 100, 37, 97, 67, 26, 101, 34, 7, 76, 29, 33, 35, 96, 95, 98, 90, 99, 71, 6, 85, 93, 91, 78, 94, 73, 31, 75, 82, 92, 23, 84, 88, 80, 3, 20, 87, 11, 5, 81, 14, 9, 83, 86, 16, 0, 89, 66, 77, 65, 68, 70], [55, 118, 38, 123, 33, 86, 91, 24, 61, 62, 48, 15, 116, 53, 18, 95, 119, 126, 120, 122, 17, 112, 102, 115, 109, 77, 49, 51, 57, 84, 124, 117, 63, 114, 110, 21, 11, 45, 52, 90, 31, 46, 50, 74, 104, 19, 44, 113, 54, 58, 121, 76, 125, 111, 127, 60, 37, 59, 56, 105, 108, 7, 3, 16, 103, 30, 5, 107, 42, 23, 78, 35, 47, 8, 106, 43, 41, 1, 39, 94, 27, 40, 9, 100, 99, 36, 32, 6, 14, 93, 98, 64, 20, 89, 101, 29, 72, 82, 34, 87, 12, 28, 96, 88, 25, 75, 26, 92, 85, 79, 66, 71, 83, 13, 80, 22, 10, 69, 4, 73, 67, 81, 68, 97, 2, 70, 65, 0], [107, 60, 100, 32, 89, 54, 47, 87, 20, 80, 63, 82, 75, 21, 9, 43, 53, 79, 126, 13, 69, 71, 1, 117, 111, 118, 29, 51, 64, 121, 61, 2, 57, 67, 52, 0, 50, 62, 114, 102, 115, 127, 124, 92, 48, 122, 125, 49, 56, 34, 103, 7, 41, 40, 77, 105, 119, 46, 108, 58, 45, 110, 120, 116, 94, 123, 42, 112, 37, 86, 113, 39, 44, 30, 3, 27, 109, 59, 38, 101, 81, 55, 104, 91, 19, 28, 31, 90, 106, 11, 22, 26, 78, 99, 85, 33, 35, 98, 72, 83, 95, 93, 4, 14, 70, 68, 8, 74, 97, 17, 6, 76, 15, 10, 66, 88, 25, 24, 12, 5, 18, 16, 23, 73, 65, 36, 96, 84], [40, 120, 59, 34, 22, 113, 29, 49, 90, 14, 80, 119, 31, 51, 9, 88, 56, 112, 57, 18, 76, 52, 6, 83, 118, 123, 110, 117, 68, 124, 109, 61, 116, 50, 58, 62, 43, 60, 98, 115, 54, 122, 55, 114, 102, 72, 53, 46, 127, 95, 125, 45, 65, 101, 0, 108, 126, 121, 63, 47, 28, 105, 111, 48, 99, 107, 82, 91, 81, 44, 16, 39, 106, 42, 79, 41, 103, 36, 89, 30, 2, 21, 66, 23, 38, 37, 32, 27, 11, 96, 75, 13, 17, 97, 35, 86, 100, 67, 71, 33, 92, 94, 74, 24, 85, 69, 3, 25, 73, 84, 5, 20, 78, 15, 104, 64, 87, 26, 10, 93, 77, 7, 70, 12, 1, 8, 4, 19], [122, 54, 127, 36, 97, 120, 125, 124, 86, 53, 62, 119, 126, 57, 118, 104, 121, 49, 58, 56, 117, 113, 48, 112, 50, 51, 52, 123, 59, 42, 26, 60, 114, 115, 55, 43, 116, 61, 63, 39, 47, 105, 106, 41, 111, 46, 94, 81, 110, 93, 28, 109, 38, 29, 107, 40, 19, 45, 100, 108, 24, 44, 30, 15, 83, 73, 103, 69, 27, 33, 67, 21, 82, 99, 102, 75, 87, 101, 76, 4, 25, 6, 8, 23, 88, 65, 34, 37, 91, 85, 98, 32, 78, 16, 64, 31, 35, 90, 12, 92, 20, 13, 0, 17, 95, 96, 10, 89, 66, 71, 84, 14, 22, 80, 79, 77, 18, 7, 2, 11, 72, 74, 1, 5, 70, 9, 68, 3], [41, 53, 34, 25, 63, 30, 52, 79, 86, 80, 117, 126, 20, 75, 32, 8, 48, 10, 77, 51, 82, 108, 116, 47, 45, 59, 123, 109, 118, 6, 4, 2, 56, 50, 64, 120, 119, 62, 1, 105, 113, 9, 40, 57, 71, 55, 111, 106, 61, 83, 28, 87, 18, 54, 14, 29, 114, 31, 67, 69, 70, 110, 127, 103, 23, 92, 44, 46, 58, 35, 36, 124, 74, 21, 0, 94, 122, 102, 37, 66, 115, 90, 121, 3, 33, 81, 107, 88, 24, 93, 39, 49, 112, 101, 97, 100, 125, 78, 60, 104, 91, 12, 38, 13, 5, 85, 42, 22, 43, 17, 73, 27, 26, 76, 19, 98, 15, 11, 84, 95, 96, 72, 99, 65, 7, 68, 16, 89], [41, 112, 48, 94, 86, 113, 35, 27, 82, 110, 80, 89, 126, 84, 78, 49, 105, 61, 52, 125, 58, 59, 76, 9, 114, 115, 111, 123, 54, 50, 34, 57, 127, 117, 51, 91, 109, 60, 108, 62, 98, 10, 45, 7, 116, 47, 118, 56, 55, 5, 63, 119, 120, 53, 96, 46, 121, 124, 107, 1, 81, 37, 43, 40, 36, 44, 88, 122, 102, 101, 16, 39, 106, 79, 12, 100, 30, 67, 97, 104, 21, 83, 42, 38, 31, 103, 32, 33, 13, 93, 29, 95, 8, 87, 99, 24, 26, 28, 17, 69, 23, 75, 15, 11, 14, 64, 90, 92, 74, 19, 20, 6, 18, 85, 71, 72, 25, 77, 68, 66, 22, 70, 73, 4, 2, 3, 65, 0]], "model.layers.18.self_attn.qk_proj": [[113, 112, 63, 124, 55, 118, 41, 54, 122, 52, 120, 59, 60, 105, 107, 127, 53, 43, 123, 36, 86, 48, 22, 102, 110, 117, 126, 25, 62, 49, 30, 84, 100, 89, 29, 51, 119, 82, 40, 18, 20, 16, 80, 96, 38, 57, 116, 15, 61, 115, 56, 46, 79, 109, 77, 34, 125, 94, 47, 23, 58, 50, 98, 27, 26, 114, 24, 45, 104, 39, 103, 121, 111, 85, 32, 21, 92, 88, 87, 93, 13, 42, 91, 8, 10, 75, 11, 90, 81, 72, 76, 74, 19, 31, 44, 97, 9, 17, 33, 95, 37, 14, 35, 108, 12, 83, 106, 28, 78, 73, 7, 71, 6, 101, 0, 99, 64, 69, 5, 2, 70, 65, 67, 68, 1, 3, 4, 66], [113, 112, 63, 124, 55, 118, 54, 41, 52, 120, 122, 59, 60, 105, 107, 127, 43, 53, 123, 86, 102, 36, 22, 117, 110, 48, 49, 25, 126, 100, 89, 30, 29, 84, 62, 18, 116, 82, 16, 15, 40, 119, 94, 46, 20, 57, 96, 56, 51, 79, 47, 61, 38, 80, 77, 104, 58, 125, 103, 50, 39, 34, 26, 27, 23, 24, 115, 87, 109, 98, 114, 85, 21, 88, 92, 32, 13, 42, 8, 10, 93, 45, 121, 90, 111, 81, 97, 74, 75, 11, 91, 35, 28, 83, 44, 9, 12, 76, 19, 95, 31, 106, 108, 72, 17, 6, 69, 37, 78, 14, 33, 99, 0, 101, 71, 64, 7, 67, 73, 65, 66, 68, 1, 4, 70, 2, 5, 3], [113, 112, 63, 124, 55, 118, 54, 41, 122, 52, 120, 59, 105, 60, 107, 127, 43, 53, 123, 22, 86, 36, 48, 102, 126, 110, 25, 117, 49, 30, 29, 100, 89, 84, 51, 94, 62, 56, 82, 40, 104, 116, 47, 18, 20, 119, 79, 109, 38, 96, 58, 39, 57, 16, 27, 125, 15, 80, 26, 61, 103, 114, 23, 98, 115, 46, 50, 34, 45, 24, 85, 87, 77, 13, 88, 121, 90, 91, 8, 32, 21, 42, 92, 111, 75, 93, 11, 10, 19, 6, 33, 81, 106, 44, 64, 28, 31, 97, 108, 83, 35, 12, 95, 76, 72, 9, 14, 74, 0, 37, 2, 101, 7, 69, 65, 17, 78, 67, 73, 3, 1, 66, 5, 4, 68, 71, 70, 99], [113, 112, 63, 124, 55, 54, 118, 41, 52, 120, 122, 59, 60, 105, 107, 127, 53, 43, 123, 36, 117, 48, 22, 102, 126, 110, 49, 86, 25, 62, 51, 116, 89, 47, 82, 58, 100, 56, 30, 125, 119, 96, 46, 29, 20, 61, 18, 15, 94, 79, 57, 84, 39, 16, 115, 40, 103, 80, 38, 114, 50, 104, 109, 34, 27, 26, 24, 13, 23, 98, 75, 111, 77, 45, 121, 87, 85, 21, 93, 32, 97, 106, 8, 91, 90, 42, 92, 10, 88, 44, 76, 74, 81, 35, 11, 0, 6, 108, 83, 37, 72, 95, 17, 64, 31, 28, 3, 12, 14, 66, 2, 78, 1, 73, 19, 33, 9, 7, 70, 71, 4, 69, 101, 5, 68, 65, 99, 67], [113, 112, 63, 124, 55, 54, 118, 41, 52, 122, 120, 60, 59, 105, 107, 127, 53, 43, 123, 48, 22, 110, 117, 36, 86, 126, 25, 62, 49, 116, 102, 125, 82, 51, 89, 30, 20, 56, 47, 57, 84, 16, 79, 100, 18, 119, 109, 29, 58, 61, 15, 40, 94, 115, 38, 96, 104, 98, 111, 46, 80, 26, 103, 50, 23, 114, 34, 13, 39, 77, 45, 121, 21, 8, 27, 24, 88, 85, 106, 10, 93, 75, 90, 32, 87, 44, 42, 81, 92, 12, 97, 91, 11, 108, 19, 28, 74, 76, 78, 72, 14, 33, 31, 17, 83, 95, 73, 35, 37, 6, 101, 70, 9, 0, 7, 66, 64, 3, 69, 65, 2, 4, 71, 68, 5, 1, 99, 67], [113, 63, 112, 124, 55, 118, 54, 41, 52, 122, 120, 60, 59, 105, 107, 127, 43, 53, 123, 22, 48, 86, 110, 117, 36, 126, 102, 25, 51, 49, 15, 89, 30, 82, 18, 62, 116, 79, 20, 100, 57, 96, 40, 80, 125, 16, 84, 109, 119, 47, 29, 58, 50, 39, 115, 56, 34, 94, 98, 103, 61, 104, 46, 23, 114, 77, 38, 13, 24, 21, 26, 27, 111, 85, 88, 8, 32, 87, 42, 93, 75, 10, 11, 97, 90, 45, 92, 81, 78, 12, 83, 74, 31, 17, 121, 28, 76, 106, 91, 44, 19, 72, 95, 14, 9, 35, 33, 108, 71, 37, 73, 101, 99, 70, 64, 0, 6, 69, 7, 2, 1, 66, 65, 5, 67, 4, 68, 3], [113, 63, 112, 124, 55, 118, 41, 54, 122, 52, 120, 60, 59, 105, 107, 43, 127, 53, 123, 22, 86, 48, 36, 126, 49, 25, 110, 117, 30, 102, 82, 18, 62, 89, 96, 84, 15, 29, 51, 47, 16, 100, 80, 20, 79, 40, 58, 56, 57, 119, 94, 77, 104, 23, 98, 34, 38, 115, 27, 61, 116, 125, 26, 32, 109, 24, 88, 50, 46, 114, 103, 13, 85, 91, 39, 21, 75, 93, 111, 87, 92, 121, 8, 10, 74, 97, 11, 90, 78, 31, 12, 81, 35, 33, 95, 76, 42, 44, 19, 17, 14, 101, 45, 106, 83, 9, 108, 73, 72, 28, 99, 70, 71, 37, 64, 7, 5, 65, 69, 2, 66, 0, 1, 68, 67, 6, 4, 3], [113, 63, 112, 124, 55, 54, 118, 41, 52, 120, 122, 59, 60, 105, 107, 127, 43, 22, 53, 123, 36, 86, 102, 126, 48, 117, 25, 30, 110, 49, 119, 62, 29, 84, 89, 82, 16, 20, 56, 18, 96, 100, 40, 47, 80, 51, 94, 57, 79, 15, 98, 116, 38, 125, 46, 23, 26, 61, 27, 104, 50, 115, 24, 58, 39, 85, 88, 34, 91, 13, 32, 114, 109, 77, 21, 87, 103, 121, 93, 90, 111, 92, 33, 75, 19, 31, 17, 97, 10, 42, 95, 11, 81, 8, 45, 44, 14, 108, 78, 9, 83, 76, 106, 12, 70, 35, 74, 72, 28, 99, 37, 73, 64, 101, 5, 71, 7, 1, 0, 68, 2, 69, 3, 4, 66, 65, 67, 6], [113, 63, 112, 124, 55, 118, 54, 41, 52, 120, 122, 60, 59, 105, 107, 127, 53, 43, 22, 123, 36, 102, 110, 86, 126, 48, 117, 25, 89, 49, 30, 84, 57, 119, 96, 29, 20, 18, 79, 82, 40, 38, 62, 94, 116, 16, 100, 56, 47, 46, 125, 80, 15, 51, 115, 27, 58, 61, 104, 24, 98, 39, 34, 88, 26, 32, 87, 13, 114, 21, 23, 103, 77, 85, 93, 50, 92, 90, 97, 11, 109, 111, 75, 28, 106, 108, 81, 91, 72, 10, 19, 121, 95, 17, 31, 12, 42, 35, 74, 33, 76, 45, 8, 44, 83, 78, 70, 14, 101, 37, 99, 73, 9, 7, 5, 71, 64, 68, 0, 69, 65, 3, 4, 67, 1, 2, 66, 6], [113, 112, 63, 124, 55, 54, 118, 41, 52, 120, 122, 60, 59, 105, 107, 127, 123, 53, 43, 48, 102, 117, 36, 22, 86, 126, 25, 110, 62, 49, 57, 100, 116, 125, 18, 51, 82, 30, 47, 89, 40, 119, 20, 56, 29, 58, 84, 79, 61, 80, 38, 16, 96, 15, 46, 27, 104, 26, 94, 34, 115, 39, 98, 50, 24, 85, 103, 77, 21, 32, 88, 72, 13, 109, 75, 23, 10, 93, 45, 87, 106, 111, 121, 92, 114, 97, 31, 74, 90, 76, 81, 108, 35, 95, 19, 91, 44, 28, 11, 8, 42, 83, 78, 12, 14, 17, 9, 70, 71, 37, 73, 7, 101, 64, 33, 5, 99, 67, 68, 3, 0, 65, 4, 69, 6, 2, 1, 66], [113, 112, 63, 124, 55, 54, 41, 118, 120, 122, 52, 60, 59, 105, 107, 127, 53, 43, 123, 117, 48, 22, 126, 36, 102, 86, 110, 116, 62, 25, 51, 18, 49, 56, 82, 89, 20, 100, 61, 79, 96, 47, 57, 30, 80, 125, 15, 84, 40, 119, 16, 46, 58, 38, 29, 104, 77, 50, 98, 115, 39, 34, 72, 94, 13, 23, 109, 24, 114, 103, 27, 26, 45, 75, 85, 87, 32, 111, 93, 121, 21, 10, 92, 11, 88, 90, 8, 28, 81, 106, 76, 12, 74, 17, 97, 14, 83, 78, 6, 35, 91, 42, 73, 44, 64, 71, 37, 31, 2, 108, 19, 9, 69, 0, 95, 7, 33, 101, 68, 70, 1, 65, 5, 4, 67, 66, 3, 99], [113, 63, 112, 124, 55, 41, 118, 54, 52, 120, 122, 60, 59, 105, 107, 127, 43, 53, 123, 22, 86, 48, 102, 36, 117, 126, 25, 89, 84, 18, 110, 82, 30, 100, 49, 96, 80, 15, 20, 79, 29, 62, 51, 40, 56, 16, 116, 98, 47, 46, 94, 57, 115, 119, 77, 85, 23, 38, 58, 125, 104, 61, 34, 13, 24, 26, 72, 27, 75, 87, 103, 39, 109, 21, 32, 50, 93, 11, 88, 111, 74, 10, 90, 114, 97, 92, 19, 81, 45, 121, 31, 91, 17, 78, 14, 95, 12, 106, 76, 6, 44, 33, 108, 8, 9, 35, 83, 42, 73, 28, 99, 101, 71, 7, 37, 64, 68, 5, 2, 70, 1, 66, 3, 69, 0, 4, 67, 65], [113, 63, 112, 124, 55, 118, 54, 41, 52, 122, 120, 60, 59, 105, 107, 127, 43, 123, 53, 36, 48, 102, 86, 22, 117, 25, 110, 126, 30, 100, 62, 29, 96, 49, 89, 38, 18, 82, 40, 51, 20, 94, 84, 47, 80, 79, 58, 61, 16, 15, 56, 125, 116, 115, 57, 119, 98, 46, 23, 24, 39, 27, 88, 26, 104, 21, 85, 34, 32, 103, 50, 87, 90, 97, 109, 121, 13, 93, 77, 111, 91, 31, 72, 92, 75, 19, 45, 114, 10, 95, 11, 106, 35, 17, 83, 44, 28, 42, 81, 33, 74, 78, 14, 76, 12, 37, 73, 108, 6, 101, 99, 8, 9, 7, 71, 69, 64, 66, 4, 1, 65, 5, 0, 67, 2, 68, 70, 3], [113, 63, 112, 124, 55, 118, 54, 41, 52, 122, 120, 59, 60, 107, 105, 127, 43, 53, 123, 22, 86, 48, 62, 117, 36, 102, 110, 126, 49, 25, 89, 100, 30, 82, 61, 51, 15, 18, 125, 96, 47, 80, 29, 20, 119, 58, 40, 115, 98, 50, 94, 79, 57, 38, 84, 116, 46, 56, 16, 39, 24, 34, 104, 121, 27, 21, 23, 85, 72, 13, 103, 77, 114, 88, 26, 75, 10, 32, 109, 111, 87, 93, 74, 97, 92, 11, 12, 35, 90, 106, 45, 33, 83, 91, 28, 31, 81, 19, 8, 95, 6, 42, 17, 0, 73, 76, 14, 7, 108, 44, 78, 65, 71, 101, 69, 64, 9, 67, 99, 37, 5, 4, 3, 66, 1, 68, 2, 70], [113, 63, 112, 124, 55, 41, 54, 118, 52, 120, 122, 59, 60, 105, 107, 43, 127, 53, 123, 22, 48, 117, 102, 86, 36, 110, 126, 25, 100, 62, 89, 51, 82, 15, 30, 40, 80, 20, 96, 18, 61, 79, 49, 115, 84, 125, 29, 47, 56, 57, 94, 46, 23, 16, 116, 98, 50, 34, 58, 38, 85, 72, 39, 13, 27, 119, 77, 26, 21, 104, 103, 45, 88, 111, 93, 32, 24, 10, 11, 75, 109, 74, 121, 87, 8, 114, 83, 12, 91, 81, 97, 90, 28, 17, 31, 106, 76, 44, 92, 33, 78, 95, 19, 35, 42, 6, 73, 14, 7, 101, 64, 69, 9, 108, 37, 4, 71, 0, 66, 3, 1, 68, 67, 65, 70, 2, 99, 5], [113, 112, 63, 124, 55, 118, 54, 41, 52, 120, 122, 59, 60, 105, 107, 127, 53, 43, 123, 22, 117, 48, 86, 36, 102, 110, 62, 126, 100, 25, 89, 82, 116, 96, 30, 15, 49, 20, 56, 51, 46, 84, 18, 40, 79, 80, 16, 47, 61, 57, 125, 94, 29, 115, 98, 77, 119, 23, 50, 58, 38, 39, 85, 34, 24, 13, 111, 72, 103, 121, 104, 114, 75, 27, 10, 26, 11, 88, 109, 32, 87, 74, 42, 45, 97, 106, 8, 21, 81, 93, 92, 12, 76, 90, 19, 95, 83, 91, 44, 108, 17, 31, 14, 9, 78, 6, 35, 33, 28, 101, 73, 7, 37, 4, 70, 64, 0, 5, 2, 71, 1, 65, 99, 67, 69, 68, 66, 3], [113, 63, 112, 124, 55, 118, 41, 52, 54, 120, 122, 59, 60, 105, 107, 127, 53, 43, 123, 48, 86, 36, 22, 126, 102, 117, 25, 30, 82, 110, 62, 49, 89, 18, 96, 80, 29, 20, 40, 47, 94, 84, 100, 16, 15, 104, 46, 57, 125, 61, 38, 79, 51, 98, 115, 39, 119, 56, 116, 23, 26, 58, 24, 27, 103, 34, 50, 21, 111, 85, 13, 109, 32, 88, 77, 97, 114, 87, 121, 91, 93, 75, 92, 42, 90, 74, 8, 19, 11, 31, 106, 10, 95, 76, 28, 108, 44, 17, 33, 72, 81, 35, 12, 45, 83, 78, 37, 14, 70, 101, 73, 69, 9, 64, 71, 65, 2, 99, 7, 6, 0, 4, 68, 1, 66, 67, 3, 5], [113, 63, 112, 124, 55, 54, 118, 41, 52, 120, 122, 60, 105, 59, 107, 43, 123, 127, 53, 48, 36, 86, 102, 22, 117, 126, 25, 89, 49, 110, 30, 96, 29, 94, 62, 40, 18, 82, 20, 84, 116, 56, 100, 47, 80, 38, 15, 79, 104, 16, 119, 46, 58, 57, 23, 115, 27, 77, 125, 98, 39, 34, 24, 61, 26, 51, 85, 21, 13, 88, 32, 111, 103, 50, 114, 11, 87, 74, 90, 92, 121, 109, 8, 97, 93, 91, 75, 10, 95, 19, 81, 106, 72, 12, 70, 31, 14, 42, 28, 35, 33, 108, 83, 76, 78, 44, 17, 71, 0, 45, 9, 1, 69, 3, 73, 64, 37, 68, 7, 66, 101, 65, 99, 6, 4, 2, 5, 67], [113, 112, 63, 124, 55, 54, 118, 41, 52, 120, 122, 59, 60, 105, 107, 53, 127, 43, 123, 36, 48, 117, 22, 102, 86, 126, 62, 116, 56, 110, 25, 57, 47, 100, 125, 30, 49, 46, 115, 89, 20, 18, 96, 40, 38, 82, 94, 61, 51, 84, 80, 29, 119, 34, 15, 104, 79, 16, 114, 58, 121, 39, 98, 32, 23, 27, 77, 50, 103, 111, 8, 88, 21, 24, 109, 74, 87, 26, 106, 85, 13, 45, 91, 11, 92, 90, 97, 95, 93, 10, 75, 35, 31, 42, 17, 76, 37, 19, 81, 72, 108, 28, 44, 83, 70, 12, 33, 14, 9, 99, 71, 0, 64, 101, 73, 69, 1, 78, 3, 4, 7, 65, 2, 66, 67, 68, 5, 6], [113, 112, 63, 124, 55, 54, 118, 52, 41, 120, 122, 59, 60, 105, 107, 127, 53, 43, 48, 123, 117, 102, 36, 22, 86, 126, 110, 56, 62, 100, 25, 30, 49, 57, 116, 47, 15, 96, 119, 51, 125, 89, 61, 18, 94, 29, 84, 46, 115, 39, 20, 40, 82, 79, 16, 38, 80, 23, 98, 34, 27, 13, 103, 21, 58, 104, 26, 50, 8, 24, 109, 121, 88, 87, 114, 32, 77, 45, 111, 97, 85, 93, 92, 74, 11, 10, 95, 90, 91, 31, 108, 28, 76, 75, 106, 33, 83, 42, 12, 35, 19, 17, 81, 78, 72, 44, 37, 14, 70, 9, 64, 73, 65, 71, 5, 101, 0, 4, 7, 99, 2, 69, 1, 67, 68, 3, 66, 6], [113, 112, 63, 124, 55, 54, 118, 52, 41, 120, 122, 59, 60, 105, 107, 127, 43, 53, 123, 117, 36, 48, 22, 102, 86, 126, 25, 62, 89, 96, 110, 51, 56, 57, 30, 119, 100, 116, 47, 49, 84, 94, 38, 20, 79, 61, 15, 29, 98, 18, 115, 80, 40, 82, 125, 46, 23, 34, 58, 16, 121, 26, 104, 111, 50, 77, 27, 24, 39, 88, 103, 13, 85, 32, 8, 97, 21, 114, 109, 75, 87, 93, 92, 91, 90, 19, 10, 106, 11, 95, 78, 81, 76, 31, 74, 17, 108, 12, 35, 83, 42, 28, 33, 72, 44, 45, 14, 37, 70, 71, 73, 9, 101, 99, 64, 65, 7, 0, 1, 3, 67, 4, 6, 5, 68, 66, 69, 2], [113, 112, 63, 124, 55, 54, 118, 52, 41, 120, 122, 59, 60, 105, 107, 127, 43, 53, 123, 117, 48, 36, 126, 86, 62, 22, 25, 102, 116, 110, 56, 89, 30, 61, 51, 57, 100, 125, 47, 49, 119, 94, 38, 84, 96, 20, 82, 40, 80, 79, 115, 15, 29, 18, 98, 46, 58, 16, 104, 34, 50, 103, 121, 27, 23, 26, 77, 111, 88, 32, 24, 85, 114, 39, 21, 13, 90, 91, 8, 93, 87, 11, 92, 109, 106, 75, 97, 74, 19, 95, 28, 10, 35, 31, 17, 83, 42, 108, 44, 12, 72, 76, 45, 0, 81, 14, 64, 33, 6, 78, 9, 71, 70, 37, 2, 7, 73, 99, 1, 101, 65, 69, 68, 67, 5, 3, 66, 4], [113, 63, 112, 124, 55, 54, 118, 52, 41, 120, 122, 59, 60, 105, 107, 127, 43, 123, 53, 48, 117, 36, 102, 22, 86, 25, 126, 110, 62, 57, 30, 116, 51, 119, 100, 40, 96, 47, 61, 84, 89, 38, 56, 29, 18, 94, 20, 82, 15, 16, 49, 46, 125, 79, 98, 23, 104, 50, 34, 80, 26, 24, 115, 58, 103, 111, 39, 27, 121, 87, 77, 32, 85, 88, 21, 109, 93, 114, 97, 35, 13, 31, 28, 45, 90, 106, 92, 91, 8, 42, 83, 75, 74, 108, 95, 19, 44, 11, 81, 76, 10, 6, 33, 12, 72, 17, 78, 9, 14, 101, 37, 0, 64, 7, 65, 71, 73, 1, 99, 5, 4, 69, 66, 2, 67, 3, 70, 68], [113, 63, 112, 124, 55, 118, 54, 52, 41, 120, 122, 59, 60, 105, 107, 127, 43, 53, 123, 48, 22, 36, 117, 86, 102, 126, 116, 25, 62, 110, 96, 51, 30, 100, 57, 18, 119, 16, 89, 61, 49, 47, 15, 94, 84, 82, 29, 20, 79, 40, 38, 125, 34, 98, 39, 46, 23, 104, 26, 50, 56, 80, 103, 77, 115, 58, 24, 27, 121, 13, 32, 21, 85, 109, 8, 74, 88, 92, 114, 90, 75, 111, 93, 97, 87, 72, 10, 91, 95, 81, 11, 19, 45, 106, 76, 83, 28, 42, 12, 6, 17, 31, 35, 14, 78, 108, 44, 7, 73, 37, 9, 33, 71, 99, 101, 0, 4, 5, 68, 67, 69, 1, 3, 64, 66, 65, 70, 2], [113, 112, 63, 124, 55, 118, 54, 41, 52, 122, 120, 59, 60, 105, 107, 127, 43, 123, 53, 22, 117, 86, 48, 36, 102, 25, 62, 126, 89, 116, 30, 110, 20, 16, 51, 84, 18, 115, 82, 61, 100, 96, 40, 119, 79, 57, 47, 15, 49, 38, 80, 29, 94, 23, 98, 46, 109, 58, 125, 34, 56, 50, 104, 26, 88, 103, 27, 114, 77, 13, 111, 85, 32, 75, 21, 121, 39, 24, 10, 87, 90, 42, 72, 11, 93, 31, 8, 92, 97, 74, 28, 91, 76, 45, 81, 19, 83, 95, 6, 106, 35, 17, 78, 12, 108, 33, 44, 14, 73, 101, 9, 64, 7, 99, 71, 4, 65, 0, 37, 69, 66, 2, 5, 3, 1, 68, 67, 70], [113, 63, 112, 124, 55, 118, 54, 41, 52, 120, 122, 59, 60, 105, 107, 127, 43, 123, 53, 48, 86, 36, 22, 117, 102, 62, 126, 25, 30, 110, 100, 89, 116, 29, 51, 96, 94, 20, 119, 40, 61, 56, 57, 18, 16, 82, 47, 84, 15, 38, 49, 125, 46, 109, 115, 23, 98, 58, 80, 50, 104, 24, 79, 77, 26, 34, 121, 85, 103, 27, 21, 13, 88, 92, 39, 87, 11, 93, 111, 32, 114, 72, 91, 74, 97, 42, 90, 75, 10, 31, 95, 81, 19, 45, 106, 33, 8, 12, 28, 76, 108, 17, 83, 44, 78, 35, 73, 14, 6, 9, 37, 0, 101, 7, 99, 65, 69, 2, 3, 71, 64, 67, 5, 68, 70, 66, 4, 1], [113, 112, 63, 124, 55, 118, 54, 41, 52, 120, 122, 60, 59, 105, 107, 127, 43, 123, 53, 36, 117, 48, 102, 62, 22, 110, 126, 86, 25, 61, 116, 89, 49, 30, 18, 100, 51, 56, 96, 40, 119, 57, 84, 47, 29, 125, 20, 121, 23, 58, 39, 16, 38, 98, 104, 94, 82, 46, 50, 115, 79, 34, 15, 80, 109, 114, 26, 111, 77, 103, 27, 24, 13, 88, 32, 85, 97, 21, 42, 72, 87, 93, 92, 90, 75, 74, 91, 106, 35, 11, 45, 108, 10, 44, 19, 17, 12, 31, 81, 83, 76, 28, 95, 37, 33, 101, 8, 78, 14, 73, 99, 6, 9, 0, 7, 70, 64, 69, 5, 71, 4, 68, 67, 66, 3, 2, 1, 65], [113, 112, 63, 124, 55, 54, 118, 41, 120, 52, 122, 60, 59, 105, 107, 127, 43, 53, 123, 22, 48, 36, 117, 126, 102, 86, 110, 62, 25, 61, 20, 89, 18, 47, 100, 82, 30, 51, 125, 96, 49, 56, 116, 79, 40, 80, 15, 84, 16, 119, 29, 46, 57, 23, 58, 38, 94, 50, 104, 121, 88, 77, 34, 115, 27, 26, 103, 72, 13, 39, 85, 111, 24, 114, 32, 21, 93, 92, 98, 11, 109, 74, 75, 91, 90, 45, 97, 87, 10, 19, 12, 28, 106, 17, 76, 81, 42, 8, 78, 31, 35, 95, 70, 83, 7, 9, 44, 0, 73, 14, 108, 69, 99, 3, 71, 65, 37, 64, 101, 4, 33, 5, 2, 1, 68, 6, 67, 66], [113, 63, 112, 124, 55, 118, 41, 54, 120, 52, 122, 60, 59, 105, 107, 127, 123, 53, 43, 48, 117, 36, 62, 22, 86, 110, 51, 102, 126, 119, 49, 30, 25, 125, 100, 96, 89, 47, 61, 40, 18, 115, 57, 56, 20, 15, 29, 84, 46, 116, 16, 79, 82, 94, 39, 34, 23, 104, 58, 38, 50, 103, 77, 121, 80, 26, 98, 72, 109, 45, 85, 32, 27, 92, 13, 93, 111, 21, 114, 106, 74, 10, 42, 90, 11, 24, 75, 88, 97, 87, 81, 108, 95, 35, 76, 70, 91, 28, 78, 31, 44, 12, 8, 19, 17, 33, 9, 5, 0, 64, 83, 14, 37, 67, 7, 65, 1, 73, 71, 101, 66, 69, 3, 4, 2, 99, 68, 6], [113, 112, 63, 124, 55, 118, 54, 41, 52, 120, 122, 60, 59, 105, 107, 127, 53, 43, 123, 48, 117, 22, 36, 86, 110, 102, 126, 62, 51, 119, 25, 49, 89, 18, 30, 100, 82, 79, 56, 46, 125, 20, 84, 116, 115, 15, 40, 29, 61, 96, 94, 16, 47, 80, 23, 109, 77, 98, 57, 39, 38, 34, 103, 24, 58, 13, 114, 50, 72, 121, 32, 42, 21, 88, 85, 104, 45, 111, 92, 27, 93, 87, 11, 76, 10, 97, 75, 74, 26, 106, 17, 90, 108, 91, 81, 14, 19, 8, 44, 95, 70, 83, 31, 28, 35, 12, 78, 9, 33, 7, 73, 71, 5, 101, 64, 67, 37, 69, 4, 99, 65, 3, 2, 0, 1, 68, 6, 66], [113, 112, 63, 124, 55, 118, 54, 41, 52, 120, 122, 59, 60, 105, 107, 127, 53, 43, 123, 48, 117, 102, 86, 36, 62, 22, 51, 110, 126, 25, 116, 30, 89, 100, 49, 125, 82, 96, 29, 61, 47, 16, 40, 20, 115, 119, 94, 39, 79, 18, 15, 84, 23, 58, 121, 56, 57, 109, 77, 50, 26, 34, 38, 104, 80, 98, 46, 114, 103, 27, 72, 85, 74, 88, 10, 24, 21, 87, 42, 32, 92, 111, 45, 97, 93, 75, 13, 76, 91, 8, 11, 90, 70, 14, 81, 31, 7, 19, 83, 12, 33, 5, 78, 95, 35, 17, 28, 106, 73, 108, 71, 44, 9, 0, 64, 69, 3, 4, 1, 65, 68, 67, 101, 66, 2, 37, 6, 99], [113, 112, 63, 124, 55, 118, 54, 41, 52, 122, 120, 60, 59, 105, 107, 127, 43, 53, 123, 86, 36, 102, 48, 22, 117, 25, 126, 62, 110, 100, 82, 29, 30, 51, 15, 49, 84, 89, 18, 16, 96, 61, 79, 116, 20, 47, 40, 119, 80, 94, 125, 38, 34, 56, 77, 57, 26, 98, 115, 23, 39, 27, 109, 46, 50, 88, 13, 24, 114, 58, 85, 21, 103, 92, 121, 32, 10, 72, 104, 74, 75, 42, 111, 45, 91, 97, 76, 87, 93, 11, 8, 95, 90, 106, 81, 83, 14, 19, 12, 78, 31, 33, 44, 28, 17, 73, 35, 108, 70, 9, 7, 71, 101, 37, 69, 99, 5, 67, 64, 6, 68, 4, 65, 66, 0, 1, 3, 2]], "model.layers.19.self_attn.q_proj": [[124, 110, 63, 115, 58, 119, 101, 120, 57, 46, 112, 52, 60, 123, 127, 109, 53, 59, 56, 116, 54, 88, 126, 50, 108, 61, 122, 55, 94, 111, 121, 114, 104, 51, 37, 47, 62, 97, 48, 40, 44, 49, 113, 117, 125, 45, 27, 19, 24, 118, 90, 34, 41, 105, 42, 81, 98, 106, 43, 107, 96, 31, 22, 39, 89, 102, 26, 16, 92, 9, 100, 38, 103, 36, 35, 78, 91, 99, 93, 25, 18, 30, 80, 29, 32, 73, 17, 20, 95, 23, 67, 85, 15, 28, 21, 69, 77, 84, 33, 12, 6, 14, 87, 83, 86, 10, 70, 79, 75, 13, 4, 1, 82, 68, 72, 0, 74, 5, 11, 64, 3, 65, 66, 8, 7, 76, 71, 2], [63, 57, 46, 101, 123, 112, 59, 110, 124, 120, 47, 97, 53, 121, 119, 81, 109, 114, 90, 24, 94, 111, 34, 105, 60, 19, 88, 87, 122, 126, 31, 85, 55, 104, 27, 91, 117, 58, 45, 52, 61, 25, 56, 106, 20, 39, 22, 116, 32, 28, 78, 42, 16, 38, 10, 107, 62, 44, 41, 54, 83, 127, 113, 50, 108, 14, 102, 100, 17, 80, 37, 49, 35, 51, 118, 40, 26, 12, 103, 36, 95, 96, 125, 43, 115, 89, 21, 48, 84, 98, 99, 93, 74, 30, 23, 29, 86, 92, 15, 11, 76, 77, 75, 6, 67, 7, 72, 18, 70, 13, 9, 73, 82, 68, 33, 79, 8, 69, 66, 1, 71, 4, 0, 2, 3, 64, 65, 5], [63, 124, 110, 57, 115, 101, 58, 46, 50, 122, 97, 88, 52, 60, 27, 120, 49, 55, 116, 81, 56, 59, 112, 123, 94, 119, 42, 121, 109, 111, 54, 126, 24, 125, 105, 127, 62, 117, 114, 44, 51, 93, 113, 37, 53, 84, 61, 90, 47, 31, 104, 20, 85, 108, 45, 6, 103, 118, 75, 73, 9, 41, 40, 67, 106, 48, 65, 43, 36, 38, 16, 4, 39, 72, 22, 14, 107, 0, 17, 100, 68, 12, 83, 80, 69, 30, 102, 87, 71, 91, 95, 7, 26, 15, 99, 19, 96, 11, 2, 35, 10, 74, 25, 3, 29, 34, 66, 77, 64, 32, 28, 92, 98, 13, 23, 21, 82, 70, 1, 8, 89, 18, 5, 79, 86, 76, 78, 33], [124, 63, 110, 101, 54, 46, 122, 119, 58, 59, 28, 112, 61, 55, 62, 120, 94, 56, 114, 111, 57, 90, 50, 53, 113, 116, 126, 123, 127, 121, 60, 125, 118, 97, 82, 22, 117, 48, 37, 47, 52, 115, 18, 49, 108, 109, 51, 41, 45, 44, 43, 39, 107, 32, 19, 106, 86, 102, 100, 42, 27, 92, 36, 30, 104, 40, 105, 25, 38, 103, 99, 34, 14, 35, 98, 96, 78, 20, 15, 33, 83, 88, 31, 77, 12, 26, 10, 29, 93, 24, 95, 89, 87, 23, 84, 74, 91, 13, 85, 76, 79, 21, 81, 17, 8, 75, 7, 72, 11, 80, 71, 9, 16, 0, 70, 68, 73, 67, 3, 2, 6, 66, 4, 5, 65, 69, 64, 1], [59, 118, 39, 46, 105, 127, 63, 34, 113, 112, 60, 119, 53, 54, 111, 50, 48, 126, 61, 122, 30, 125, 109, 49, 123, 89, 58, 97, 57, 121, 115, 56, 110, 43, 52, 25, 47, 104, 116, 55, 51, 107, 31, 108, 45, 62, 114, 36, 44, 41, 42, 120, 117, 102, 23, 33, 106, 28, 40, 82, 37, 38, 22, 94, 90, 84, 27, 98, 91, 18, 80, 101, 78, 87, 86, 99, 100, 26, 35, 32, 93, 103, 95, 96, 29, 14, 92, 21, 24, 19, 75, 20, 8, 124, 0, 76, 16, 12, 7, 88, 67, 66, 9, 2, 68, 70, 71, 73, 79, 6, 85, 65, 81, 3, 69, 4, 64, 83, 11, 1, 72, 17, 13, 5, 15, 10, 77, 74], [39, 59, 118, 34, 124, 90, 46, 84, 43, 79, 75, 54, 60, 122, 98, 30, 112, 49, 109, 127, 21, 19, 88, 111, 25, 94, 115, 76, 119, 23, 63, 89, 105, 22, 113, 9, 38, 87, 56, 8, 41, 106, 57, 31, 44, 81, 110, 33, 50, 70, 92, 48, 82, 116, 125, 100, 114, 53, 27, 62, 120, 99, 107, 42, 40, 73, 108, 58, 45, 123, 93, 104, 55, 95, 126, 121, 35, 91, 20, 18, 37, 52, 51, 61, 14, 72, 97, 28, 47, 6, 16, 86, 13, 24, 36, 96, 3, 102, 29, 11, 12, 101, 32, 26, 117, 71, 80, 66, 85, 78, 15, 7, 83, 2, 67, 68, 17, 10, 5, 69, 77, 4, 65, 103, 64, 0, 1, 74], [118, 39, 59, 34, 124, 127, 46, 105, 54, 63, 43, 55, 89, 112, 90, 84, 50, 120, 47, 57, 49, 113, 30, 115, 122, 53, 51, 125, 107, 60, 119, 80, 35, 75, 126, 45, 21, 19, 116, 123, 110, 25, 8, 111, 109, 22, 82, 88, 48, 52, 104, 27, 94, 33, 56, 121, 44, 61, 108, 62, 98, 92, 117, 58, 106, 37, 78, 79, 31, 38, 23, 40, 101, 103, 86, 99, 91, 42, 95, 24, 114, 97, 93, 12, 87, 102, 81, 32, 26, 100, 41, 20, 29, 18, 76, 14, 7, 36, 73, 28, 71, 96, 16, 13, 3, 67, 11, 70, 2, 69, 66, 68, 6, 10, 9, 0, 72, 4, 64, 77, 65, 85, 1, 83, 15, 5, 17, 74], [39, 59, 118, 34, 81, 90, 21, 13, 19, 79, 10, 46, 69, 54, 100, 65, 8, 68, 74, 115, 77, 33, 3, 11, 75, 95, 49, 55, 38, 120, 122, 71, 85, 30, 94, 0, 32, 88, 87, 18, 17, 107, 56, 112, 23, 44, 57, 63, 25, 126, 86, 67, 24, 83, 58, 119, 113, 5, 76, 72, 15, 41, 127, 89, 7, 111, 4, 12, 16, 26, 20, 43, 29, 80, 91, 14, 64, 110, 99, 22, 78, 84, 60, 123, 66, 92, 70, 37, 6, 105, 31, 1, 27, 36, 82, 96, 73, 98, 50, 35, 124, 93, 9, 47, 28, 45, 2, 53, 101, 125, 52, 114, 102, 121, 103, 109, 48, 62, 61, 104, 116, 97, 51, 40, 42, 117, 108, 106], [105, 112, 34, 84, 18, 12, 15, 27, 22, 5, 71, 48, 9, 41, 96, 2, 64, 89, 53, 82, 91, 58, 13, 4, 122, 30, 83, 1, 20, 25, 79, 63, 86, 118, 76, 87, 7, 74, 31, 78, 80, 50, 19, 85, 69, 119, 120, 70, 75, 94, 127, 16, 73, 11, 14, 81, 10, 92, 77, 46, 98, 17, 68, 6, 103, 60, 90, 28, 21, 24, 37, 72, 93, 23, 65, 107, 113, 61, 3, 26, 67, 59, 8, 56, 52, 114, 32, 115, 66, 57, 88, 110, 29, 95, 0, 123, 33, 62, 49, 55, 35, 125, 44, 36, 126, 101, 42, 99, 116, 51, 97, 47, 54, 104, 100, 102, 124, 109, 111, 43, 39, 117, 45, 38, 106, 40, 108, 121], [112, 105, 53, 25, 127, 58, 118, 89, 56, 34, 114, 54, 101, 83, 125, 96, 32, 49, 48, 120, 124, 116, 55, 52, 57, 93, 59, 33, 41, 113, 119, 62, 117, 115, 126, 61, 60, 107, 50, 106, 121, 122, 104, 45, 111, 110, 98, 51, 63, 44, 109, 47, 42, 46, 94, 123, 43, 108, 102, 37, 103, 100, 27, 38, 36, 39, 23, 99, 35, 40, 81, 87, 18, 95, 30, 22, 97, 77, 29, 31, 13, 6, 28, 8, 92, 10, 90, 68, 24, 91, 84, 16, 65, 26, 88, 86, 70, 15, 4, 19, 14, 85, 17, 80, 82, 11, 67, 21, 1, 74, 72, 75, 64, 3, 7, 78, 12, 0, 20, 66, 73, 71, 79, 5, 9, 2, 76, 69], [105, 112, 34, 15, 22, 12, 84, 18, 9, 27, 41, 89, 48, 71, 53, 5, 96, 4, 91, 58, 25, 13, 85, 2, 78, 81, 75, 64, 122, 19, 50, 67, 30, 1, 80, 31, 86, 20, 77, 82, 63, 79, 119, 72, 59, 83, 95, 88, 69, 73, 118, 21, 37, 76, 74, 7, 46, 11, 57, 3, 60, 26, 14, 6, 65, 23, 92, 28, 17, 127, 70, 62, 93, 87, 8, 24, 115, 98, 56, 61, 52, 68, 107, 125, 90, 33, 16, 120, 114, 110, 36, 10, 66, 47, 94, 35, 29, 126, 55, 116, 51, 113, 0, 97, 32, 42, 103, 123, 49, 100, 124, 102, 43, 54, 109, 117, 39, 101, 38, 99, 44, 111, 45, 104, 121, 106, 108, 40], [105, 53, 112, 34, 41, 48, 88, 25, 89, 94, 27, 33, 98, 13, 22, 84, 58, 56, 114, 118, 101, 96, 125, 120, 59, 19, 18, 78, 57, 80, 91, 124, 116, 117, 55, 62, 127, 107, 119, 49, 93, 68, 113, 6, 75, 115, 60, 50, 122, 70, 100, 15, 110, 52, 63, 54, 126, 83, 103, 47, 61, 51, 111, 4, 37, 29, 121, 92, 12, 104, 45, 97, 109, 30, 31, 42, 71, 38, 46, 123, 65, 36, 23, 44, 102, 43, 67, 74, 21, 99, 108, 10, 26, 106, 1, 24, 35, 0, 95, 39, 11, 3, 86, 81, 77, 16, 32, 40, 82, 14, 90, 9, 28, 72, 85, 17, 8, 87, 2, 20, 66, 64, 5, 7, 79, 73, 69, 76], [120, 121, 53, 57, 118, 126, 63, 113, 124, 51, 59, 60, 58, 123, 125, 115, 49, 116, 122, 119, 117, 62, 127, 56, 112, 54, 47, 55, 99, 50, 61, 114, 48, 111, 83, 44, 108, 110, 52, 13, 45, 103, 109, 46, 43, 41, 107, 40, 31, 32, 90, 106, 42, 105, 39, 38, 102, 100, 35, 9, 69, 34, 3, 97, 104, 101, 21, 82, 71, 96, 37, 28, 2, 36, 70, 98, 18, 76, 85, 68, 77, 23, 33, 30, 66, 11, 81, 92, 25, 26, 72, 0, 10, 93, 15, 29, 94, 84, 24, 86, 88, 22, 20, 16, 1, 89, 91, 75, 4, 79, 27, 78, 12, 5, 95, 65, 80, 17, 14, 19, 7, 64, 87, 73, 8, 74, 67, 6], [121, 120, 53, 57, 113, 63, 58, 118, 60, 125, 124, 126, 59, 56, 117, 119, 51, 116, 123, 55, 115, 48, 114, 99, 49, 62, 50, 61, 112, 122, 54, 52, 47, 43, 44, 110, 45, 108, 127, 111, 109, 46, 39, 41, 40, 42, 102, 106, 107, 105, 89, 90, 100, 83, 103, 35, 26, 104, 38, 32, 3, 101, 96, 37, 31, 2, 71, 72, 36, 82, 70, 34, 98, 33, 0, 13, 4, 69, 16, 76, 11, 91, 9, 92, 21, 27, 29, 22, 97, 95, 1, 94, 93, 77, 85, 8, 87, 30, 23, 15, 18, 86, 28, 88, 10, 68, 14, 25, 64, 19, 80, 12, 20, 78, 65, 84, 75, 17, 74, 79, 24, 66, 6, 7, 81, 67, 5, 73], [121, 120, 126, 40, 53, 99, 36, 63, 58, 124, 39, 51, 42, 106, 117, 88, 49, 113, 89, 59, 57, 102, 26, 122, 96, 31, 60, 123, 56, 28, 50, 46, 118, 119, 24, 105, 16, 116, 87, 112, 110, 55, 125, 32, 90, 98, 34, 82, 52, 103, 127, 35, 10, 115, 30, 54, 48, 95, 47, 100, 25, 45, 37, 38, 111, 104, 44, 41, 109, 61, 62, 33, 114, 22, 83, 43, 80, 21, 107, 27, 91, 97, 101, 29, 108, 69, 72, 92, 93, 19, 94, 15, 9, 70, 13, 71, 23, 11, 77, 84, 3, 86, 14, 85, 76, 20, 81, 79, 18, 17, 68, 74, 2, 0, 1, 12, 7, 65, 75, 78, 8, 4, 6, 73, 67, 64, 66, 5], [121, 120, 118, 39, 63, 117, 126, 124, 53, 116, 57, 51, 102, 125, 56, 60, 54, 119, 123, 58, 113, 49, 35, 99, 62, 83, 105, 112, 15, 40, 36, 44, 127, 114, 61, 48, 52, 59, 122, 42, 45, 111, 50, 47, 55, 38, 108, 107, 115, 101, 32, 106, 103, 110, 21, 46, 24, 13, 96, 88, 41, 109, 90, 26, 81, 16, 31, 43, 100, 37, 75, 34, 104, 91, 33, 25, 20, 28, 84, 69, 92, 97, 79, 98, 9, 27, 71, 89, 85, 74, 86, 87, 29, 94, 23, 22, 30, 77, 19, 93, 12, 76, 10, 11, 14, 17, 80, 5, 7, 3, 73, 95, 82, 18, 68, 78, 70, 8, 67, 4, 0, 72, 2, 65, 66, 6, 64, 1], [55, 105, 101, 120, 116, 49, 32, 112, 25, 87, 46, 57, 110, 81, 91, 109, 27, 48, 47, 119, 28, 60, 96, 94, 39, 113, 126, 44, 54, 59, 115, 63, 89, 107, 118, 84, 127, 76, 78, 85, 83, 11, 50, 106, 37, 20, 14, 122, 104, 31, 123, 58, 52, 111, 125, 61, 124, 108, 71, 88, 99, 51, 121, 19, 92, 114, 56, 45, 42, 103, 38, 43, 117, 26, 97, 62, 30, 35, 95, 34, 98, 21, 7, 86, 53, 22, 29, 73, 100, 18, 24, 102, 93, 90, 79, 15, 17, 33, 40, 5, 16, 12, 80, 70, 36, 82, 68, 41, 10, 75, 3, 23, 8, 9, 74, 2, 4, 77, 13, 72, 6, 69, 67, 65, 66, 1, 64, 0], [105, 55, 109, 116, 101, 120, 32, 112, 57, 46, 87, 25, 113, 28, 60, 110, 81, 119, 91, 104, 49, 35, 89, 84, 48, 121, 85, 39, 88, 111, 92, 42, 108, 37, 78, 27, 58, 114, 11, 107, 97, 117, 53, 31, 96, 95, 127, 125, 118, 56, 61, 94, 124, 126, 63, 62, 38, 44, 115, 99, 59, 93, 30, 54, 14, 45, 26, 50, 52, 29, 51, 98, 122, 76, 100, 106, 20, 41, 123, 33, 17, 73, 36, 79, 47, 103, 40, 12, 23, 43, 102, 86, 83, 16, 21, 24, 22, 34, 7, 19, 15, 71, 80, 75, 82, 18, 90, 10, 74, 5, 70, 6, 72, 77, 9, 3, 8, 4, 13, 2, 67, 68, 69, 65, 1, 0, 64, 66], [55, 105, 101, 120, 46, 112, 116, 57, 32, 87, 25, 39, 81, 28, 126, 47, 119, 48, 94, 84, 27, 89, 50, 56, 113, 34, 92, 118, 14, 122, 91, 60, 124, 125, 127, 38, 11, 111, 104, 78, 115, 45, 83, 110, 58, 36, 114, 108, 96, 54, 121, 109, 52, 117, 20, 31, 103, 106, 95, 42, 44, 49, 30, 107, 63, 123, 33, 37, 73, 29, 59, 79, 61, 70, 43, 51, 85, 62, 72, 99, 53, 19, 100, 102, 22, 97, 86, 21, 93, 35, 98, 17, 8, 41, 76, 40, 24, 12, 82, 90, 26, 88, 16, 75, 80, 18, 10, 15, 5, 7, 3, 6, 23, 4, 68, 9, 71, 77, 69, 66, 74, 2, 67, 13, 65, 64, 0, 1], [105, 57, 120, 116, 46, 112, 32, 55, 101, 119, 39, 113, 127, 87, 92, 94, 28, 53, 62, 104, 50, 115, 25, 126, 60, 114, 124, 38, 108, 58, 52, 27, 121, 56, 54, 111, 123, 84, 107, 98, 110, 61, 48, 122, 117, 44, 36, 106, 96, 59, 42, 51, 100, 118, 47, 63, 43, 125, 45, 109, 89, 91, 83, 79, 41, 49, 85, 33, 99, 88, 81, 35, 86, 29, 37, 95, 22, 20, 77, 30, 34, 97, 21, 18, 26, 31, 103, 40, 93, 102, 14, 78, 74, 82, 24, 90, 73, 23, 13, 75, 80, 70, 11, 76, 16, 5, 15, 64, 19, 17, 12, 7, 66, 0, 8, 2, 67, 3, 65, 72, 1, 10, 69, 68, 71, 4, 6, 9], [102, 62, 56, 55, 97, 22, 29, 19, 89, 35, 15, 88, 93, 38, 31, 76, 45, 84, 27, 60, 17, 90, 25, 33, 21, 12, 86, 46, 96, 30, 81, 95, 43, 14, 98, 78, 50, 100, 7, 26, 9, 24, 44, 61, 101, 91, 40, 87, 79, 94, 57, 122, 116, 23, 67, 71, 83, 59, 10, 34, 20, 39, 127, 85, 32, 92, 53, 28, 121, 54, 103, 73, 36, 99, 82, 18, 115, 105, 114, 117, 63, 113, 13, 104, 111, 11, 49, 124, 16, 112, 70, 41, 58, 80, 37, 125, 48, 109, 42, 8, 119, 106, 68, 51, 110, 2, 107, 123, 52, 108, 120, 75, 47, 1, 69, 126, 118, 74, 77, 0, 3, 5, 65, 72, 4, 64, 66, 6], [56, 102, 97, 55, 62, 29, 15, 45, 19, 89, 22, 60, 76, 88, 38, 93, 7, 69, 17, 82, 10, 74, 2, 59, 32, 49, 31, 81, 3, 54, 33, 40, 46, 86, 58, 25, 91, 9, 98, 68, 112, 21, 104, 43, 50, 122, 28, 27, 64, 26, 11, 13, 110, 120, 108, 90, 44, 78, 84, 73, 57, 14, 23, 117, 121, 53, 124, 61, 80, 100, 116, 101, 83, 1, 37, 115, 126, 51, 6, 12, 48, 20, 47, 114, 52, 96, 36, 87, 30, 107, 63, 119, 41, 35, 127, 118, 125, 92, 123, 75, 109, 85, 113, 67, 99, 39, 42, 106, 103, 4, 8, 94, 71, 16, 24, 77, 18, 111, 70, 105, 34, 79, 95, 72, 5, 0, 66, 65], [55, 102, 62, 56, 97, 29, 22, 38, 93, 19, 60, 15, 76, 74, 88, 12, 69, 27, 28, 13, 45, 17, 90, 81, 84, 25, 118, 32, 104, 10, 23, 82, 73, 89, 21, 31, 49, 37, 79, 42, 11, 35, 58, 16, 30, 53, 50, 101, 61, 92, 46, 109, 98, 94, 86, 7, 71, 107, 117, 26, 91, 96, 44, 83, 59, 57, 106, 114, 121, 68, 24, 43, 2, 100, 110, 124, 126, 87, 48, 47, 54, 20, 39, 36, 112, 52, 85, 125, 113, 63, 33, 95, 99, 108, 116, 119, 78, 123, 75, 9, 3, 105, 14, 127, 115, 41, 122, 51, 80, 111, 120, 34, 40, 5, 103, 77, 18, 67, 72, 64, 6, 4, 8, 70, 66, 0, 1, 65], [102, 62, 56, 97, 55, 50, 29, 43, 40, 22, 61, 45, 25, 38, 89, 19, 126, 27, 17, 42, 127, 116, 18, 88, 46, 91, 57, 15, 59, 24, 53, 13, 114, 123, 93, 8, 54, 39, 109, 60, 113, 76, 125, 81, 117, 121, 111, 104, 124, 52, 115, 119, 47, 58, 122, 120, 63, 107, 112, 110, 6, 10, 87, 48, 77, 51, 49, 106, 31, 100, 20, 44, 86, 108, 103, 26, 118, 101, 65, 84, 30, 33, 105, 78, 35, 94, 11, 95, 41, 9, 37, 23, 72, 5, 99, 36, 14, 98, 69, 28, 96, 7, 74, 73, 90, 3, 75, 83, 68, 34, 66, 82, 32, 92, 85, 80, 21, 70, 16, 2, 1, 4, 67, 12, 79, 64, 71, 0], [41, 120, 47, 34, 57, 105, 22, 93, 84, 80, 109, 60, 78, 76, 88, 117, 45, 26, 103, 61, 118, 127, 125, 74, 126, 110, 108, 8, 90, 52, 113, 86, 114, 59, 124, 58, 119, 75, 35, 49, 89, 48, 116, 99, 111, 20, 1, 123, 101, 53, 81, 51, 62, 54, 104, 24, 28, 3, 43, 29, 7, 56, 19, 31, 112, 87, 2, 69, 92, 70, 107, 121, 17, 4, 25, 95, 18, 63, 50, 16, 55, 73, 115, 122, 27, 44, 102, 13, 21, 42, 94, 32, 91, 100, 106, 96, 11, 40, 46, 33, 71, 85, 36, 79, 39, 14, 23, 37, 10, 83, 6, 97, 30, 82, 77, 15, 38, 12, 98, 5, 9, 72, 65, 66, 68, 67, 0, 64], [41, 120, 57, 34, 84, 22, 88, 109, 93, 60, 110, 116, 29, 108, 90, 80, 49, 95, 42, 105, 76, 52, 123, 24, 26, 35, 74, 39, 127, 126, 113, 104, 78, 81, 25, 44, 82, 61, 40, 53, 124, 101, 112, 19, 43, 54, 119, 111, 48, 45, 122, 125, 7, 59, 121, 86, 55, 107, 58, 98, 63, 97, 115, 32, 62, 38, 106, 56, 37, 51, 96, 27, 18, 28, 118, 103, 79, 85, 36, 117, 114, 92, 83, 20, 102, 47, 16, 91, 50, 17, 71, 94, 46, 15, 89, 21, 23, 31, 100, 30, 99, 70, 33, 14, 87, 77, 13, 75, 8, 4, 69, 66, 68, 11, 73, 1, 12, 9, 65, 10, 5, 2, 72, 3, 6, 67, 64, 0], [41, 120, 34, 47, 70, 57, 64, 8, 76, 78, 93, 67, 66, 80, 87, 105, 7, 22, 84, 74, 1, 124, 0, 60, 3, 19, 11, 88, 13, 65, 18, 86, 117, 101, 126, 61, 20, 83, 69, 59, 26, 5, 72, 110, 73, 90, 82, 68, 121, 108, 16, 89, 31, 37, 25, 9, 2, 109, 79, 10, 75, 17, 111, 14, 4, 122, 24, 12, 114, 92, 32, 27, 116, 112, 6, 23, 77, 58, 85, 71, 81, 15, 95, 127, 100, 91, 102, 21, 115, 94, 62, 113, 43, 119, 28, 30, 107, 125, 103, 53, 48, 104, 99, 55, 33, 97, 50, 45, 96, 35, 36, 51, 63, 49, 118, 38, 39, 44, 46, 106, 123, 56, 40, 42, 54, 52, 29, 98], [120, 41, 34, 109, 22, 88, 48, 84, 47, 60, 113, 93, 117, 26, 110, 49, 119, 51, 124, 43, 61, 111, 29, 58, 24, 125, 50, 55, 80, 115, 81, 62, 63, 127, 116, 57, 114, 123, 108, 112, 54, 42, 126, 56, 78, 105, 90, 118, 36, 53, 40, 121, 102, 74, 44, 46, 35, 52, 28, 59, 107, 37, 122, 76, 106, 7, 95, 86, 39, 19, 45, 101, 38, 87, 99, 103, 18, 104, 100, 97, 27, 32, 83, 20, 31, 25, 33, 30, 94, 92, 96, 91, 17, 89, 70, 98, 11, 79, 73, 68, 8, 21, 1, 82, 23, 75, 15, 16, 77, 66, 9, 85, 13, 14, 71, 10, 12, 67, 5, 69, 65, 4, 6, 2, 64, 72, 3, 0], [38, 48, 112, 24, 125, 84, 17, 109, 27, 22, 91, 97, 78, 47, 18, 10, 107, 35, 95, 81, 121, 43, 11, 98, 118, 77, 115, 46, 52, 49, 119, 86, 102, 20, 68, 13, 50, 26, 120, 90, 54, 59, 92, 40, 34, 29, 88, 53, 7, 14, 15, 56, 108, 89, 82, 55, 30, 9, 37, 96, 103, 42, 117, 51, 39, 60, 64, 106, 110, 28, 87, 94, 32, 69, 126, 61, 113, 83, 123, 105, 41, 45, 25, 63, 122, 111, 85, 19, 93, 44, 33, 116, 114, 127, 8, 80, 72, 124, 23, 104, 31, 58, 21, 100, 73, 57, 79, 16, 76, 36, 99, 6, 101, 2, 12, 75, 74, 62, 5, 71, 70, 67, 65, 4, 1, 3, 66, 0], [48, 38, 112, 24, 22, 93, 107, 84, 123, 17, 52, 50, 27, 91, 95, 118, 53, 30, 20, 98, 94, 115, 78, 125, 111, 99, 113, 49, 117, 55, 126, 45, 97, 81, 19, 119, 104, 77, 58, 110, 92, 41, 109, 28, 47, 102, 10, 36, 13, 116, 86, 105, 101, 32, 88, 35, 57, 18, 54, 80, 90, 122, 61, 121, 62, 120, 108, 34, 106, 39, 33, 40, 44, 56, 59, 43, 42, 31, 100, 124, 63, 60, 103, 127, 51, 37, 72, 96, 114, 8, 26, 46, 25, 29, 65, 23, 87, 11, 16, 68, 9, 71, 85, 89, 21, 14, 12, 75, 82, 83, 15, 7, 6, 76, 64, 79, 74, 5, 2, 73, 67, 69, 70, 4, 66, 3, 1, 0], [38, 48, 112, 24, 22, 125, 84, 107, 17, 93, 19, 80, 95, 10, 20, 78, 109, 50, 62, 13, 77, 33, 18, 27, 47, 98, 97, 68, 94, 111, 28, 92, 72, 81, 34, 35, 75, 56, 119, 6, 52, 88, 96, 8, 86, 90, 26, 9, 30, 91, 82, 49, 116, 14, 32, 31, 69, 99, 64, 43, 67, 12, 54, 53, 41, 117, 55, 29, 126, 89, 16, 15, 45, 120, 21, 36, 83, 61, 85, 114, 0, 79, 73, 39, 74, 1, 121, 57, 23, 7, 118, 59, 51, 87, 63, 25, 46, 100, 44, 110, 76, 40, 124, 58, 123, 2, 106, 5, 42, 66, 104, 37, 115, 101, 103, 65, 102, 113, 127, 11, 108, 71, 70, 105, 60, 122, 4, 3], [38, 48, 112, 24, 22, 107, 84, 93, 17, 62, 123, 97, 78, 10, 121, 30, 20, 110, 125, 18, 98, 49, 27, 96, 95, 115, 94, 50, 33, 35, 127, 116, 19, 34, 13, 45, 124, 91, 120, 72, 43, 77, 59, 86, 126, 101, 105, 51, 14, 53, 92, 111, 119, 102, 113, 41, 52, 28, 118, 80, 57, 7, 68, 109, 103, 106, 32, 55, 81, 88, 89, 40, 36, 47, 54, 117, 8, 104, 114, 42, 60, 5, 21, 99, 63, 58, 122, 26, 61, 44, 56, 39, 74, 12, 31, 100, 9, 108, 46, 25, 37, 29, 87, 82, 64, 3, 11, 65, 90, 75, 15, 16, 83, 2, 71, 66, 67, 79, 76, 85, 23, 70, 6, 69, 73, 4, 1, 0]], "model.layers.19.self_attn.k_proj": [[63, 124, 110, 37, 33, 22, 53, 55, 112, 113, 59, 122, 114, 121, 120, 125, 54, 116, 61, 126, 62, 127, 119, 50, 49, 117, 56, 111, 47, 57, 123, 118, 58, 60, 30, 48, 109, 92, 51, 105, 43, 52, 14, 108, 44, 90, 45, 15, 104, 36, 69, 35, 12, 81, 71, 101, 107, 115, 3, 9, 46, 85, 88, 99, 74, 106, 40, 39, 42, 96, 41, 102, 38, 86, 103, 91, 20, 1, 77, 82, 64, 34, 19, 72, 100, 24, 11, 25, 98, 78, 32, 75, 70, 29, 18, 87, 66, 31, 26, 93, 95, 68, 89, 94, 76, 23, 16, 28, 27, 21, 80, 83, 84, 97, 79, 0, 13, 4, 10, 2, 8, 7, 73, 5, 67, 17, 65, 6], [118, 59, 103, 98, 94, 10, 79, 81, 13, 21, 69, 19, 75, 110, 25, 8, 97, 90, 80, 78, 0, 122, 44, 43, 57, 49, 23, 48, 108, 84, 120, 87, 77, 22, 71, 82, 26, 115, 35, 74, 76, 42, 2, 106, 54, 107, 119, 95, 1, 123, 52, 102, 68, 91, 41, 113, 9, 7, 121, 112, 55, 53, 63, 50, 88, 127, 14, 27, 67, 24, 65, 101, 93, 58, 100, 62, 99, 56, 104, 60, 117, 126, 92, 105, 111, 51, 114, 38, 116, 125, 28, 61, 47, 109, 36, 17, 96, 5, 32, 124, 73, 40, 37, 4, 83, 18, 70, 11, 12, 31, 46, 33, 45, 86, 72, 66, 6, 89, 29, 85, 3, 16, 30, 39, 20, 15, 34, 64], [41, 112, 98, 22, 48, 9, 27, 64, 15, 12, 71, 84, 2, 63, 5, 25, 114, 18, 53, 62, 122, 125, 56, 113, 94, 117, 59, 124, 1, 60, 120, 58, 119, 57, 83, 118, 116, 13, 51, 50, 126, 55, 49, 4, 46, 47, 105, 115, 54, 45, 32, 111, 74, 97, 29, 123, 110, 108, 61, 70, 52, 43, 127, 121, 78, 75, 107, 80, 67, 42, 37, 85, 88, 44, 87, 109, 103, 106, 104, 102, 101, 100, 40, 19, 93, 28, 21, 39, 8, 66, 95, 35, 81, 38, 30, 36, 72, 6, 11, 99, 10, 14, 16, 90, 92, 31, 26, 33, 17, 96, 91, 0, 24, 73, 76, 65, 34, 69, 3, 89, 77, 23, 86, 68, 79, 7, 20, 82], [121, 120, 35, 22, 104, 126, 124, 95, 57, 49, 60, 117, 59, 46, 112, 63, 50, 38, 58, 51, 119, 92, 127, 106, 111, 48, 123, 118, 125, 56, 122, 113, 61, 115, 116, 91, 41, 52, 54, 103, 45, 36, 47, 28, 109, 110, 62, 18, 42, 108, 55, 32, 114, 102, 43, 12, 44, 20, 101, 53, 14, 40, 81, 39, 107, 105, 26, 94, 37, 34, 79, 29, 80, 88, 74, 97, 99, 100, 27, 96, 33, 70, 13, 23, 98, 9, 8, 71, 31, 24, 30, 21, 89, 93, 83, 87, 25, 90, 3, 4, 10, 11, 86, 84, 16, 15, 17, 1, 69, 75, 85, 2, 19, 73, 7, 72, 82, 78, 5, 76, 68, 64, 77, 6, 67, 0, 66, 65], [55, 37, 110, 116, 120, 96, 112, 41, 30, 91, 89, 28, 57, 81, 45, 124, 87, 23, 78, 103, 56, 109, 83, 102, 84, 52, 49, 36, 60, 11, 123, 31, 40, 25, 34, 76, 85, 94, 121, 117, 62, 99, 8, 71, 73, 111, 39, 97, 54, 126, 58, 14, 44, 12, 59, 125, 115, 114, 50, 122, 22, 17, 61, 24, 119, 35, 118, 51, 127, 107, 43, 93, 79, 33, 18, 48, 95, 9, 47, 63, 21, 108, 42, 106, 113, 68, 6, 5, 2, 32, 70, 27, 26, 88, 15, 90, 92, 98, 104, 3, 29, 80, 100, 86, 53, 19, 16, 7, 38, 46, 82, 67, 20, 74, 75, 13, 10, 72, 1, 65, 105, 64, 0, 77, 69, 4, 101, 66], [62, 56, 38, 55, 33, 22, 93, 15, 19, 89, 17, 76, 81, 59, 7, 109, 29, 120, 102, 78, 110, 57, 90, 53, 121, 88, 74, 27, 12, 60, 119, 10, 69, 117, 124, 103, 63, 2, 113, 125, 104, 54, 45, 9, 31, 61, 77, 58, 116, 47, 0, 49, 107, 126, 118, 114, 122, 21, 112, 52, 39, 44, 127, 115, 34, 111, 46, 48, 13, 87, 123, 84, 108, 41, 3, 37, 99, 51, 23, 18, 106, 80, 96, 105, 71, 73, 36, 24, 79, 30, 95, 94, 50, 4, 92, 16, 14, 91, 85, 83, 32, 40, 42, 28, 67, 65, 6, 26, 75, 100, 20, 68, 101, 11, 8, 64, 98, 82, 43, 35, 5, 25, 72, 86, 70, 97, 66, 1], [105, 120, 98, 47, 29, 22, 57, 64, 84, 8, 80, 76, 111, 78, 67, 88, 74, 56, 70, 48, 66, 69, 60, 50, 26, 19, 119, 46, 117, 126, 124, 61, 82, 114, 44, 115, 53, 116, 125, 63, 62, 122, 43, 45, 113, 95, 59, 7, 23, 118, 41, 99, 112, 51, 55, 65, 58, 127, 81, 107, 110, 52, 49, 18, 33, 54, 89, 71, 123, 106, 27, 109, 42, 38, 121, 4, 101, 3, 104, 87, 15, 37, 83, 39, 77, 79, 40, 13, 103, 108, 28, 102, 96, 25, 30, 31, 2, 1, 91, 94, 32, 34, 16, 35, 85, 92, 36, 100, 12, 97, 21, 73, 11, 75, 5, 9, 24, 90, 20, 68, 17, 6, 93, 72, 0, 10, 14, 86], [112, 48, 102, 22, 84, 24, 78, 107, 18, 17, 10, 13, 33, 68, 93, 72, 31, 92, 9, 109, 125, 94, 65, 19, 64, 12, 98, 6, 27, 30, 62, 80, 71, 2, 67, 77, 52, 91, 85, 8, 119, 5, 32, 35, 121, 113, 123, 11, 7, 105, 34, 36, 28, 118, 43, 61, 26, 111, 87, 106, 56, 75, 69, 120, 50, 115, 122, 83, 47, 89, 126, 23, 63, 79, 58, 49, 97, 40, 16, 90, 51, 53, 99, 45, 29, 46, 104, 76, 103, 117, 41, 25, 44, 101, 54, 100, 21, 60, 70, 124, 15, 57, 42, 20, 37, 55, 95, 127, 108, 96, 110, 116, 73, 74, 114, 59, 39, 38, 82, 14, 88, 3, 86, 81, 4, 66, 1, 0]], "model.layers.19.self_attn.qk_proj": [[112, 120, 55, 48, 62, 59, 56, 63, 118, 121, 124, 41, 110, 57, 105, 116, 86, 38, 102, 47, 98, 22, 53, 125, 50, 20, 111, 46, 58, 29, 60, 122, 93, 119, 84, 126, 113, 37, 25, 127, 17, 91, 51, 24, 89, 27, 109, 79, 61, 88, 12, 49, 76, 81, 83, 19, 90, 115, 97, 78, 54, 101, 14, 117, 114, 94, 123, 34, 82, 15, 52, 33, 35, 39, 32, 26, 45, 107, 87, 10, 30, 13, 18, 43, 103, 74, 92, 31, 44, 16, 77, 106, 72, 23, 108, 21, 95, 28, 96, 42, 9, 73, 11, 104, 71, 64, 80, 5, 40, 99, 75, 85, 36, 6, 7, 69, 0, 8, 100, 2, 70, 66, 67, 3, 4, 68, 65, 1], [112, 120, 55, 48, 59, 62, 56, 121, 63, 118, 124, 41, 110, 105, 57, 116, 86, 38, 102, 47, 53, 98, 58, 22, 125, 119, 50, 84, 91, 37, 111, 113, 20, 89, 126, 127, 93, 29, 46, 60, 25, 76, 24, 83, 12, 17, 49, 51, 109, 88, 79, 78, 114, 117, 81, 123, 34, 27, 61, 122, 90, 35, 19, 107, 33, 94, 97, 54, 115, 15, 18, 32, 39, 43, 103, 14, 92, 52, 26, 101, 30, 87, 10, 82, 74, 13, 45, 104, 96, 23, 44, 16, 77, 108, 31, 95, 40, 7, 9, 42, 21, 106, 28, 80, 73, 69, 99, 5, 0, 75, 71, 85, 11, 8, 64, 100, 6, 72, 70, 36, 2, 68, 66, 67, 4, 1, 65, 3], [112, 120, 48, 55, 59, 62, 56, 121, 63, 118, 41, 124, 105, 110, 57, 116, 86, 38, 102, 47, 22, 53, 58, 98, 29, 125, 93, 126, 20, 84, 46, 119, 49, 113, 127, 60, 122, 91, 89, 25, 50, 24, 81, 37, 111, 109, 88, 27, 19, 90, 34, 54, 94, 79, 51, 61, 12, 15, 76, 17, 35, 33, 97, 18, 117, 115, 107, 78, 26, 14, 83, 32, 39, 103, 101, 123, 52, 30, 114, 43, 23, 108, 45, 87, 8, 92, 13, 10, 82, 77, 96, 74, 95, 16, 42, 64, 31, 0, 11, 104, 44, 99, 9, 7, 21, 69, 5, 71, 80, 70, 28, 106, 36, 73, 100, 75, 66, 2, 3, 85, 40, 68, 4, 1, 67, 72, 6, 65], [112, 120, 48, 55, 62, 59, 63, 56, 118, 121, 124, 41, 110, 105, 57, 116, 38, 86, 102, 47, 126, 58, 53, 22, 29, 98, 84, 93, 89, 50, 46, 27, 91, 25, 20, 113, 123, 119, 49, 127, 51, 122, 125, 34, 81, 94, 15, 61, 37, 19, 76, 117, 24, 12, 115, 109, 78, 17, 88, 54, 79, 14, 60, 90, 107, 39, 83, 114, 97, 32, 111, 101, 35, 30, 33, 52, 18, 82, 45, 43, 23, 92, 87, 103, 8, 108, 104, 10, 26, 96, 95, 74, 13, 44, 106, 77, 0, 70, 69, 16, 64, 31, 71, 99, 5, 73, 42, 9, 80, 11, 7, 21, 75, 2, 40, 85, 66, 36, 3, 100, 68, 28, 6, 67, 4, 72, 65, 1], [112, 120, 48, 55, 62, 59, 121, 56, 63, 118, 124, 41, 110, 105, 57, 116, 38, 86, 47, 22, 102, 53, 98, 126, 20, 58, 29, 125, 84, 61, 25, 60, 51, 27, 46, 93, 122, 127, 119, 78, 17, 81, 19, 76, 49, 111, 37, 117, 12, 91, 89, 88, 50, 113, 79, 34, 24, 94, 15, 115, 90, 123, 14, 54, 97, 103, 18, 83, 101, 109, 30, 45, 33, 8, 43, 82, 106, 52, 32, 39, 13, 107, 74, 35, 23, 10, 114, 77, 73, 104, 96, 87, 108, 44, 31, 16, 11, 26, 92, 28, 71, 5, 64, 95, 7, 99, 9, 69, 70, 100, 85, 0, 42, 36, 21, 75, 40, 68, 66, 80, 2, 67, 3, 4, 72, 6, 65, 1], [112, 120, 55, 48, 62, 59, 56, 63, 121, 118, 124, 41, 110, 57, 105, 116, 86, 38, 102, 47, 22, 98, 84, 25, 29, 20, 126, 119, 53, 58, 27, 17, 113, 46, 49, 37, 81, 89, 76, 12, 88, 24, 78, 60, 93, 15, 83, 19, 122, 125, 127, 52, 50, 54, 14, 117, 123, 79, 111, 51, 91, 61, 90, 94, 115, 109, 97, 82, 34, 33, 13, 18, 39, 32, 43, 74, 101, 107, 10, 30, 8, 114, 103, 35, 96, 92, 44, 23, 45, 87, 16, 106, 104, 21, 108, 73, 11, 31, 28, 5, 80, 71, 77, 85, 26, 0, 42, 9, 75, 95, 69, 7, 99, 70, 64, 2, 66, 40, 6, 36, 100, 67, 3, 68, 72, 4, 1, 65], [112, 120, 55, 48, 62, 59, 121, 56, 63, 118, 41, 124, 57, 110, 105, 116, 86, 102, 38, 47, 22, 29, 20, 84, 98, 58, 17, 125, 53, 119, 24, 60, 25, 93, 111, 126, 27, 109, 12, 46, 19, 50, 49, 89, 76, 90, 14, 37, 81, 52, 91, 122, 78, 79, 88, 39, 117, 113, 83, 127, 15, 94, 61, 34, 97, 123, 82, 114, 32, 51, 18, 45, 33, 101, 54, 115, 92, 30, 8, 13, 96, 23, 10, 43, 103, 74, 104, 26, 35, 77, 87, 16, 31, 80, 107, 21, 85, 28, 9, 11, 44, 42, 108, 71, 106, 5, 7, 95, 99, 73, 100, 69, 6, 75, 64, 70, 40, 36, 0, 2, 68, 3, 67, 66, 1, 4, 72, 65], [112, 120, 48, 55, 62, 59, 56, 121, 63, 118, 41, 124, 105, 57, 110, 116, 86, 102, 38, 47, 53, 22, 29, 84, 98, 93, 24, 58, 126, 119, 20, 60, 111, 109, 27, 46, 37, 17, 89, 122, 25, 91, 50, 117, 81, 76, 88, 125, 12, 49, 54, 19, 90, 15, 107, 14, 83, 79, 94, 114, 78, 39, 113, 34, 123, 32, 115, 61, 97, 33, 101, 127, 51, 18, 35, 82, 92, 52, 26, 43, 103, 96, 23, 45, 77, 10, 30, 42, 104, 87, 8, 16, 74, 31, 44, 28, 13, 80, 21, 99, 7, 9, 71, 108, 5, 11, 36, 40, 85, 6, 106, 95, 64, 73, 0, 69, 75, 100, 2, 66, 72, 3, 4, 68, 70, 67, 1, 65], [112, 120, 55, 48, 62, 59, 56, 63, 118, 121, 41, 124, 105, 110, 57, 116, 86, 102, 47, 38, 53, 22, 29, 126, 84, 98, 119, 37, 93, 60, 122, 25, 58, 27, 91, 17, 20, 88, 24, 125, 109, 81, 114, 54, 94, 46, 50, 111, 113, 89, 12, 15, 49, 19, 127, 83, 33, 76, 123, 14, 117, 79, 107, 39, 61, 90, 97, 51, 34, 32, 35, 115, 52, 101, 18, 78, 103, 43, 13, 30, 82, 77, 92, 104, 26, 42, 87, 74, 23, 45, 10, 28, 31, 108, 44, 96, 16, 7, 99, 21, 6, 80, 36, 9, 8, 40, 73, 95, 11, 106, 5, 71, 85, 75, 64, 69, 100, 72, 0, 2, 70, 3, 67, 4, 66, 68, 1, 65], [112, 120, 48, 55, 62, 59, 56, 118, 63, 121, 41, 124, 57, 105, 110, 116, 102, 38, 86, 47, 22, 53, 98, 84, 58, 29, 125, 60, 126, 25, 93, 20, 37, 49, 12, 46, 119, 27, 88, 89, 122, 113, 50, 91, 81, 117, 61, 127, 109, 114, 19, 123, 76, 54, 94, 111, 24, 17, 97, 83, 79, 34, 107, 15, 115, 39, 32, 51, 14, 30, 10, 78, 103, 33, 101, 90, 92, 35, 104, 31, 82, 45, 74, 43, 77, 18, 13, 44, 42, 52, 23, 108, 26, 96, 9, 87, 106, 6, 16, 7, 99, 80, 95, 5, 0, 71, 72, 73, 69, 11, 28, 8, 64, 36, 40, 75, 21, 100, 85, 68, 67, 70, 66, 2, 4, 3, 1, 65], [112, 120, 55, 48, 62, 59, 118, 121, 56, 63, 124, 41, 110, 57, 105, 116, 38, 86, 102, 22, 53, 47, 29, 46, 20, 98, 25, 60, 84, 50, 126, 125, 122, 27, 58, 93, 119, 81, 117, 12, 61, 97, 89, 76, 88, 14, 101, 37, 78, 17, 113, 19, 15, 51, 91, 49, 83, 39, 24, 79, 127, 111, 10, 18, 114, 115, 54, 34, 94, 103, 30, 90, 109, 106, 123, 33, 107, 32, 72, 74, 82, 35, 43, 104, 77, 11, 5, 9, 92, 6, 96, 45, 13, 42, 31, 23, 52, 87, 16, 108, 26, 64, 44, 69, 73, 80, 7, 71, 0, 28, 2, 66, 21, 75, 95, 85, 36, 99, 8, 70, 68, 40, 67, 3, 4, 100, 65, 1], [112, 120, 48, 55, 62, 59, 56, 121, 63, 118, 124, 41, 105, 110, 57, 116, 86, 38, 47, 22, 102, 53, 29, 20, 84, 60, 98, 119, 27, 81, 93, 126, 50, 25, 17, 125, 46, 122, 58, 88, 37, 117, 12, 19, 89, 76, 24, 78, 34, 113, 109, 79, 83, 51, 97, 49, 15, 94, 61, 39, 14, 18, 111, 91, 107, 90, 32, 33, 115, 114, 74, 54, 101, 10, 103, 127, 106, 82, 52, 30, 123, 35, 87, 77, 104, 72, 26, 42, 31, 11, 13, 16, 9, 108, 96, 23, 80, 28, 45, 92, 43, 73, 44, 21, 99, 71, 95, 5, 85, 7, 0, 64, 69, 75, 6, 66, 36, 70, 3, 2, 100, 40, 68, 4, 8, 67, 65, 1], [112, 120, 48, 55, 62, 59, 121, 56, 63, 118, 41, 124, 110, 105, 57, 116, 86, 102, 47, 38, 53, 22, 119, 29, 84, 98, 58, 60, 20, 125, 93, 24, 27, 50, 126, 37, 122, 109, 89, 88, 117, 17, 49, 25, 91, 81, 113, 114, 76, 61, 19, 111, 12, 33, 51, 39, 46, 54, 15, 78, 34, 123, 83, 115, 90, 127, 107, 79, 52, 94, 32, 103, 35, 18, 92, 101, 26, 82, 74, 97, 14, 30, 96, 28, 72, 10, 77, 87, 44, 23, 13, 45, 106, 31, 80, 108, 104, 42, 85, 16, 21, 71, 43, 9, 99, 95, 100, 36, 11, 7, 69, 40, 5, 73, 70, 75, 0, 64, 6, 66, 67, 2, 68, 4, 65, 3, 8, 1], [112, 120, 55, 48, 59, 62, 56, 121, 63, 118, 124, 41, 105, 57, 110, 116, 86, 102, 38, 47, 22, 60, 53, 125, 58, 98, 84, 93, 119, 50, 29, 126, 27, 46, 25, 20, 61, 122, 113, 109, 12, 91, 49, 37, 117, 81, 17, 88, 89, 111, 115, 19, 76, 35, 24, 127, 34, 94, 101, 15, 54, 52, 97, 51, 79, 14, 114, 78, 33, 30, 107, 123, 72, 32, 18, 39, 26, 83, 90, 10, 82, 103, 74, 13, 104, 31, 23, 87, 96, 80, 77, 73, 92, 7, 16, 71, 106, 0, 42, 69, 45, 21, 95, 43, 108, 44, 70, 9, 99, 11, 85, 5, 28, 64, 40, 75, 100, 36, 66, 2, 4, 67, 6, 68, 3, 65, 1, 8], [112, 120, 55, 48, 62, 59, 56, 118, 121, 63, 41, 124, 110, 105, 57, 116, 86, 38, 102, 47, 98, 125, 84, 22, 29, 119, 53, 25, 20, 93, 60, 27, 12, 17, 58, 24, 126, 81, 50, 37, 117, 76, 113, 94, 46, 34, 49, 91, 78, 61, 19, 15, 88, 89, 14, 39, 122, 109, 111, 101, 97, 79, 35, 52, 83, 30, 18, 51, 72, 33, 54, 115, 10, 127, 123, 103, 82, 74, 90, 107, 13, 32, 26, 31, 23, 114, 73, 87, 106, 45, 92, 96, 9, 80, 71, 16, 77, 43, 69, 44, 64, 70, 104, 108, 11, 85, 5, 7, 0, 36, 99, 28, 95, 66, 40, 42, 21, 75, 6, 67, 68, 3, 2, 8, 100, 65, 4, 1], [112, 120, 48, 55, 62, 59, 56, 63, 118, 121, 41, 124, 110, 105, 57, 116, 86, 102, 38, 47, 22, 53, 29, 58, 98, 84, 20, 27, 125, 17, 117, 46, 76, 119, 126, 25, 60, 37, 81, 93, 113, 49, 50, 61, 24, 12, 34, 122, 89, 15, 91, 88, 19, 14, 94, 101, 78, 107, 79, 123, 51, 109, 52, 97, 115, 111, 83, 127, 35, 10, 54, 74, 39, 33, 18, 30, 82, 13, 90, 31, 32, 106, 72, 114, 103, 9, 26, 43, 77, 11, 73, 80, 44, 87, 45, 104, 92, 96, 42, 69, 71, 23, 95, 7, 108, 16, 70, 40, 85, 64, 5, 21, 75, 28, 99, 36, 0, 8, 2, 66, 67, 6, 100, 3, 68, 4, 1, 65], [112, 120, 48, 55, 62, 59, 56, 121, 118, 63, 41, 124, 105, 110, 57, 116, 86, 38, 102, 22, 53, 47, 98, 126, 29, 60, 58, 119, 20, 84, 125, 27, 46, 88, 50, 34, 49, 93, 37, 91, 89, 17, 81, 24, 25, 117, 122, 19, 113, 76, 109, 127, 12, 51, 33, 90, 83, 61, 32, 111, 94, 52, 79, 15, 97, 54, 78, 82, 39, 107, 92, 101, 18, 14, 114, 123, 30, 26, 74, 115, 43, 103, 35, 87, 96, 31, 10, 106, 13, 104, 77, 44, 23, 80, 16, 85, 28, 95, 42, 21, 45, 108, 40, 99, 36, 72, 8, 73, 5, 71, 9, 64, 70, 0, 11, 7, 75, 69, 100, 66, 6, 3, 2, 68, 4, 67, 1, 65], [112, 120, 48, 55, 62, 59, 56, 118, 121, 63, 41, 124, 105, 110, 57, 116, 38, 86, 102, 47, 22, 53, 98, 20, 29, 93, 113, 60, 46, 119, 84, 58, 125, 37, 89, 27, 91, 126, 25, 122, 17, 88, 50, 12, 49, 24, 76, 19, 117, 81, 109, 94, 15, 14, 79, 90, 127, 123, 83, 78, 61, 32, 34, 33, 30, 82, 74, 114, 107, 111, 39, 97, 115, 101, 103, 52, 54, 35, 51, 26, 10, 92, 18, 43, 13, 104, 87, 77, 31, 96, 23, 80, 0, 42, 73, 71, 16, 45, 28, 85, 106, 8, 21, 36, 7, 9, 5, 69, 108, 44, 95, 6, 64, 99, 75, 72, 11, 40, 70, 66, 2, 100, 67, 3, 4, 68, 65, 1], [112, 120, 48, 55, 62, 59, 63, 118, 56, 121, 41, 124, 110, 105, 57, 116, 38, 86, 47, 102, 53, 98, 22, 46, 125, 126, 29, 50, 93, 117, 27, 20, 25, 58, 84, 122, 89, 17, 127, 113, 37, 119, 114, 115, 91, 60, 81, 49, 19, 61, 12, 24, 94, 34, 33, 14, 109, 76, 52, 88, 123, 79, 83, 97, 15, 30, 101, 107, 51, 32, 90, 39, 54, 92, 104, 35, 82, 78, 74, 103, 8, 18, 31, 111, 43, 10, 87, 26, 45, 13, 96, 108, 77, 106, 44, 73, 23, 36, 6, 40, 16, 0, 80, 9, 71, 64, 69, 7, 42, 21, 28, 5, 11, 75, 95, 99, 85, 100, 3, 2, 70, 68, 66, 72, 4, 67, 65, 1], [112, 120, 48, 55, 62, 59, 63, 56, 118, 121, 41, 124, 110, 105, 57, 116, 86, 102, 38, 47, 53, 98, 22, 29, 125, 46, 84, 58, 25, 126, 27, 117, 89, 37, 20, 93, 119, 91, 17, 61, 24, 12, 50, 34, 60, 76, 49, 113, 109, 83, 14, 127, 111, 88, 81, 51, 115, 122, 33, 94, 114, 79, 32, 19, 123, 15, 78, 10, 54, 8, 97, 74, 107, 90, 104, 103, 82, 101, 39, 43, 30, 52, 92, 31, 77, 35, 106, 18, 13, 96, 26, 87, 80, 108, 44, 45, 28, 23, 6, 21, 73, 16, 9, 11, 95, 71, 69, 99, 75, 42, 5, 7, 64, 85, 0, 40, 36, 100, 66, 2, 70, 68, 72, 4, 1, 67, 3, 65], [112, 120, 48, 55, 62, 56, 59, 118, 63, 121, 124, 41, 110, 105, 57, 116, 86, 38, 102, 47, 53, 22, 98, 119, 25, 126, 125, 29, 20, 93, 27, 58, 60, 117, 61, 46, 84, 114, 89, 127, 113, 50, 91, 76, 37, 24, 122, 49, 109, 17, 19, 83, 81, 115, 78, 88, 123, 12, 94, 79, 14, 97, 30, 34, 52, 51, 15, 54, 107, 33, 18, 103, 32, 10, 43, 82, 111, 39, 101, 90, 8, 28, 106, 77, 13, 26, 35, 9, 104, 74, 44, 42, 92, 45, 87, 31, 80, 21, 23, 96, 16, 75, 95, 69, 6, 71, 73, 85, 11, 99, 7, 108, 5, 0, 36, 40, 64, 2, 70, 4, 67, 66, 3, 100, 68, 72, 65, 1], [112, 120, 48, 55, 62, 59, 56, 121, 63, 118, 41, 124, 110, 105, 57, 116, 86, 38, 102, 47, 58, 22, 98, 53, 126, 29, 119, 93, 20, 117, 49, 50, 125, 61, 25, 27, 51, 60, 46, 89, 84, 37, 115, 17, 91, 113, 123, 83, 114, 109, 88, 122, 76, 12, 127, 24, 81, 54, 52, 94, 14, 79, 19, 97, 18, 15, 103, 78, 32, 33, 34, 8, 30, 39, 43, 107, 92, 26, 82, 45, 111, 74, 90, 10, 13, 77, 87, 35, 31, 101, 96, 104, 21, 28, 80, 23, 44, 95, 9, 64, 0, 108, 69, 40, 106, 42, 7, 99, 5, 16, 71, 11, 73, 85, 75, 6, 36, 70, 100, 66, 4, 2, 67, 65, 3, 68, 1, 72], [112, 120, 48, 55, 59, 62, 56, 121, 118, 63, 41, 124, 105, 110, 57, 116, 38, 86, 102, 47, 22, 53, 58, 98, 119, 126, 29, 50, 60, 49, 20, 117, 113, 27, 91, 93, 61, 51, 25, 46, 89, 24, 17, 37, 125, 123, 84, 122, 76, 127, 114, 111, 88, 81, 12, 54, 15, 115, 94, 52, 19, 79, 33, 109, 83, 30, 78, 34, 107, 14, 90, 35, 97, 32, 103, 92, 26, 101, 43, 45, 77, 82, 18, 8, 10, 106, 104, 31, 39, 44, 74, 13, 87, 99, 21, 23, 108, 36, 80, 28, 16, 95, 96, 9, 7, 42, 0, 70, 73, 85, 71, 5, 11, 75, 40, 100, 69, 6, 64, 2, 66, 4, 67, 68, 72, 3, 1, 65], [112, 120, 48, 55, 62, 56, 59, 121, 118, 63, 124, 41, 110, 105, 57, 116, 86, 38, 102, 47, 22, 58, 29, 126, 98, 53, 119, 25, 20, 84, 27, 89, 125, 93, 113, 50, 24, 61, 91, 117, 60, 46, 15, 115, 81, 37, 88, 17, 94, 109, 123, 76, 122, 12, 127, 49, 51, 111, 97, 32, 34, 114, 78, 19, 14, 83, 33, 79, 30, 10, 107, 54, 39, 82, 18, 35, 74, 104, 92, 13, 101, 103, 90, 96, 8, 73, 52, 31, 77, 43, 87, 21, 23, 26, 106, 44, 45, 9, 16, 71, 28, 7, 42, 69, 75, 95, 80, 40, 70, 11, 5, 108, 85, 100, 0, 99, 2, 6, 64, 68, 36, 72, 67, 66, 4, 3, 65, 1], [112, 120, 48, 55, 62, 59, 121, 56, 118, 63, 41, 124, 110, 105, 57, 116, 86, 38, 22, 47, 102, 58, 98, 60, 20, 53, 29, 84, 25, 113, 126, 89, 119, 125, 17, 88, 81, 24, 50, 27, 61, 12, 93, 76, 49, 78, 97, 37, 19, 117, 123, 46, 91, 127, 109, 34, 122, 15, 111, 14, 83, 107, 54, 51, 103, 79, 39, 10, 33, 101, 115, 32, 94, 82, 114, 18, 90, 26, 30, 74, 13, 77, 52, 35, 43, 92, 31, 106, 87, 96, 28, 23, 9, 16, 45, 80, 104, 7, 21, 42, 44, 73, 70, 75, 69, 108, 8, 71, 5, 72, 11, 85, 0, 64, 40, 36, 100, 99, 95, 66, 2, 3, 68, 67, 6, 4, 65, 1], [112, 120, 48, 55, 62, 56, 59, 121, 118, 63, 124, 41, 105, 110, 57, 116, 86, 102, 38, 47, 53, 22, 125, 58, 98, 119, 29, 60, 84, 20, 113, 126, 49, 27, 91, 88, 46, 37, 89, 117, 122, 123, 93, 25, 61, 50, 76, 107, 12, 24, 34, 17, 127, 109, 111, 81, 32, 19, 78, 114, 51, 83, 54, 33, 115, 94, 97, 79, 103, 15, 30, 26, 35, 90, 14, 74, 39, 18, 82, 101, 43, 92, 87, 10, 96, 42, 44, 23, 31, 77, 28, 13, 45, 52, 21, 40, 16, 7, 95, 80, 104, 108, 99, 106, 72, 36, 70, 9, 85, 75, 69, 73, 71, 8, 5, 0, 100, 11, 64, 2, 66, 67, 6, 68, 1, 3, 4, 65], [112, 120, 48, 55, 56, 62, 59, 63, 121, 118, 124, 41, 105, 110, 57, 116, 47, 38, 86, 102, 53, 22, 98, 119, 126, 50, 58, 29, 20, 60, 125, 122, 113, 91, 93, 27, 61, 37, 25, 88, 111, 49, 84, 114, 46, 89, 107, 117, 51, 34, 24, 115, 109, 81, 123, 17, 76, 127, 15, 30, 12, 90, 39, 54, 83, 97, 33, 79, 14, 32, 101, 35, 103, 19, 94, 43, 92, 42, 104, 18, 78, 23, 10, 82, 52, 74, 44, 45, 26, 87, 77, 72, 96, 13, 16, 106, 99, 95, 108, 21, 28, 36, 75, 31, 80, 5, 71, 73, 9, 70, 69, 40, 100, 7, 64, 11, 0, 2, 85, 67, 66, 4, 8, 6, 3, 68, 65, 1], [112, 120, 48, 55, 56, 62, 59, 121, 63, 118, 124, 41, 110, 105, 57, 116, 86, 38, 102, 47, 22, 53, 98, 126, 119, 84, 58, 29, 125, 122, 50, 20, 46, 54, 60, 89, 93, 113, 17, 24, 27, 127, 25, 76, 12, 49, 51, 37, 78, 115, 81, 61, 91, 15, 117, 19, 109, 79, 123, 111, 83, 88, 97, 14, 114, 34, 39, 18, 94, 30, 32, 33, 10, 35, 90, 103, 101, 72, 107, 52, 74, 45, 13, 82, 77, 43, 92, 26, 87, 9, 16, 23, 71, 73, 64, 31, 44, 7, 5, 28, 96, 42, 108, 104, 75, 80, 21, 69, 0, 95, 99, 106, 40, 85, 70, 11, 36, 66, 6, 2, 8, 100, 3, 68, 67, 4, 65, 1], [112, 120, 55, 48, 62, 59, 56, 63, 121, 118, 124, 41, 110, 105, 57, 116, 86, 38, 102, 47, 98, 22, 58, 53, 60, 125, 119, 20, 122, 46, 113, 25, 126, 29, 27, 84, 51, 50, 89, 49, 76, 83, 93, 81, 127, 114, 78, 37, 12, 17, 91, 111, 97, 61, 79, 19, 24, 117, 15, 115, 88, 34, 109, 94, 54, 33, 101, 123, 45, 35, 14, 82, 43, 30, 92, 10, 90, 18, 52, 72, 107, 32, 44, 103, 104, 31, 77, 106, 9, 74, 23, 39, 13, 108, 87, 0, 69, 64, 21, 75, 5, 96, 71, 26, 73, 95, 6, 36, 16, 28, 7, 80, 85, 66, 42, 11, 99, 40, 67, 4, 100, 68, 2, 3, 1, 70, 8, 65], [112, 120, 55, 48, 62, 59, 56, 121, 118, 63, 124, 41, 110, 105, 57, 116, 86, 102, 38, 47, 22, 58, 98, 53, 20, 119, 46, 60, 29, 122, 125, 84, 49, 113, 76, 93, 25, 24, 123, 91, 27, 51, 37, 12, 81, 17, 126, 54, 89, 19, 88, 15, 14, 50, 83, 78, 115, 127, 79, 111, 34, 97, 114, 117, 101, 82, 109, 94, 61, 90, 72, 10, 33, 39, 43, 32, 30, 35, 18, 107, 87, 26, 103, 52, 92, 74, 44, 45, 106, 13, 96, 9, 23, 77, 28, 16, 6, 108, 5, 21, 104, 85, 95, 73, 71, 42, 80, 31, 7, 36, 75, 40, 69, 11, 64, 99, 2, 0, 100, 66, 3, 70, 67, 4, 8, 68, 1, 65], [112, 120, 55, 48, 59, 62, 121, 56, 118, 63, 124, 41, 110, 105, 57, 116, 38, 102, 86, 47, 98, 22, 58, 53, 60, 122, 29, 46, 20, 113, 119, 125, 126, 84, 89, 49, 24, 12, 93, 76, 51, 50, 81, 91, 127, 115, 27, 25, 123, 17, 88, 37, 97, 15, 61, 78, 14, 111, 54, 83, 101, 19, 30, 34, 72, 79, 103, 117, 94, 39, 33, 35, 109, 107, 92, 114, 32, 10, 45, 18, 82, 90, 52, 26, 74, 9, 108, 106, 6, 23, 71, 13, 44, 5, 16, 69, 95, 73, 87, 7, 96, 80, 21, 99, 31, 77, 64, 43, 85, 104, 28, 11, 75, 40, 0, 42, 2, 3, 100, 66, 36, 8, 4, 68, 70, 67, 1, 65], [112, 120, 55, 48, 62, 59, 56, 63, 121, 118, 124, 41, 110, 105, 57, 116, 86, 102, 38, 47, 58, 22, 53, 98, 50, 20, 93, 29, 60, 113, 24, 126, 84, 125, 89, 25, 49, 119, 37, 81, 12, 27, 46, 111, 91, 127, 17, 76, 61, 122, 51, 78, 15, 123, 88, 19, 79, 14, 97, 109, 94, 115, 32, 54, 90, 33, 39, 117, 34, 83, 35, 101, 52, 10, 45, 74, 92, 114, 72, 103, 107, 82, 18, 30, 26, 96, 87, 77, 44, 43, 106, 13, 108, 73, 23, 80, 16, 71, 11, 36, 6, 21, 95, 5, 9, 28, 31, 7, 75, 104, 99, 40, 42, 69, 85, 64, 100, 4, 3, 8, 70, 0, 67, 2, 68, 66, 65, 1]], "model.layers.20.self_attn.q_proj": [[114, 126, 101, 110, 46, 98, 19, 86, 29, 92, 25, 120, 24, 67, 124, 82, 12, 89, 9, 87, 28, 37, 119, 7, 49, 90, 16, 60, 23, 51, 13, 74, 115, 79, 31, 122, 58, 65, 127, 62, 77, 112, 121, 100, 116, 54, 4, 0, 97, 57, 88, 43, 118, 53, 125, 39, 61, 111, 108, 96, 34, 55, 50, 15, 14, 63, 32, 113, 47, 48, 40, 94, 30, 5, 93, 6, 56, 59, 64, 20, 109, 123, 42, 44, 117, 66, 1, 35, 72, 107, 68, 95, 45, 52, 2, 17, 27, 36, 83, 41, 26, 3, 70, 102, 38, 80, 104, 33, 105, 84, 106, 91, 81, 103, 18, 21, 99, 69, 8, 78, 73, 10, 85, 75, 71, 22, 76, 11], [46, 126, 114, 110, 101, 120, 98, 49, 124, 92, 55, 115, 58, 122, 19, 60, 119, 28, 37, 40, 54, 127, 86, 121, 116, 52, 53, 125, 118, 112, 57, 50, 51, 47, 61, 48, 62, 56, 108, 111, 90, 31, 42, 63, 113, 59, 24, 123, 43, 89, 109, 117, 105, 91, 45, 103, 44, 29, 106, 88, 15, 104, 82, 102, 39, 107, 41, 25, 95, 36, 16, 94, 35, 100, 23, 99, 85, 7, 38, 22, 33, 97, 13, 93, 96, 87, 34, 14, 79, 9, 32, 26, 21, 30, 27, 83, 73, 12, 17, 20, 84, 80, 8, 77, 81, 72, 11, 18, 78, 75, 66, 74, 10, 68, 6, 5, 4, 71, 76, 3, 1, 69, 64, 70, 2, 65, 67, 0], [126, 101, 110, 114, 46, 98, 28, 92, 120, 19, 86, 115, 124, 14, 24, 60, 58, 121, 82, 123, 23, 29, 89, 119, 116, 125, 37, 42, 39, 122, 36, 49, 31, 55, 118, 112, 103, 54, 109, 48, 25, 105, 127, 57, 62, 99, 47, 53, 51, 16, 111, 50, 59, 61, 56, 117, 26, 100, 91, 63, 33, 104, 95, 38, 32, 45, 30, 113, 83, 93, 52, 35, 94, 40, 41, 44, 85, 97, 87, 90, 96, 12, 107, 27, 108, 43, 84, 7, 102, 74, 34, 106, 21, 13, 81, 22, 15, 17, 88, 79, 80, 5, 8, 20, 18, 76, 78, 75, 71, 77, 3, 72, 9, 11, 73, 10, 66, 70, 69, 6, 1, 68, 67, 4, 2, 65, 0, 64], [114, 126, 46, 110, 101, 92, 98, 120, 119, 12, 124, 28, 19, 82, 60, 5, 51, 25, 86, 37, 127, 49, 115, 0, 62, 58, 16, 122, 50, 57, 74, 112, 117, 121, 7, 123, 61, 53, 38, 55, 113, 54, 68, 125, 118, 14, 31, 116, 47, 44, 48, 56, 109, 111, 63, 89, 87, 104, 43, 108, 2, 100, 36, 91, 45, 107, 33, 105, 94, 29, 52, 23, 59, 8, 41, 39, 6, 85, 77, 40, 70, 79, 3, 102, 42, 15, 35, 66, 18, 95, 32, 88, 69, 83, 99, 106, 96, 73, 75, 30, 9, 1, 71, 93, 64, 17, 27, 13, 103, 97, 24, 4, 20, 65, 78, 11, 67, 76, 84, 81, 21, 80, 22, 26, 72, 90, 10, 34], [43, 98, 93, 103, 107, 116, 46, 117, 78, 58, 21, 24, 81, 19, 127, 12, 63, 8, 51, 95, 121, 62, 92, 90, 120, 29, 37, 70, 26, 109, 41, 59, 101, 104, 119, 105, 100, 52, 48, 50, 36, 57, 87, 74, 86, 16, 27, 44, 7, 110, 73, 2, 72, 42, 54, 33, 67, 13, 32, 39, 56, 126, 115, 75, 53, 79, 114, 68, 83, 84, 3, 10, 125, 23, 64, 118, 9, 22, 1, 112, 61, 113, 80, 40, 25, 17, 47, 108, 85, 66, 106, 45, 30, 14, 97, 99, 38, 76, 123, 94, 35, 122, 96, 11, 82, 18, 60, 28, 49, 111, 31, 6, 15, 0, 55, 91, 71, 20, 77, 69, 89, 34, 102, 88, 124, 5, 4, 65], [43, 121, 107, 98, 93, 46, 63, 62, 52, 58, 118, 109, 38, 24, 116, 39, 127, 103, 54, 119, 37, 104, 19, 53, 51, 55, 26, 29, 44, 49, 56, 105, 21, 57, 110, 115, 48, 117, 41, 95, 50, 90, 61, 33, 47, 59, 32, 125, 112, 92, 120, 81, 123, 42, 106, 114, 108, 60, 23, 78, 111, 25, 126, 124, 122, 96, 86, 45, 113, 87, 34, 75, 12, 40, 100, 36, 101, 70, 9, 91, 83, 97, 73, 102, 31, 35, 84, 28, 22, 30, 80, 88, 99, 18, 77, 10, 82, 68, 94, 11, 27, 20, 13, 79, 17, 89, 4, 15, 8, 16, 67, 1, 85, 7, 64, 6, 71, 74, 76, 3, 65, 69, 2, 5, 14, 66, 0, 72], [43, 113, 121, 116, 93, 98, 107, 59, 52, 24, 103, 63, 53, 125, 49, 29, 127, 105, 58, 51, 110, 95, 26, 48, 117, 54, 119, 112, 46, 124, 37, 92, 120, 47, 44, 106, 61, 21, 123, 118, 109, 114, 56, 62, 45, 111, 104, 60, 90, 115, 91, 41, 122, 126, 39, 108, 57, 100, 81, 42, 97, 50, 40, 32, 33, 87, 38, 23, 101, 55, 102, 36, 22, 19, 35, 96, 88, 34, 83, 86, 84, 99, 94, 31, 18, 30, 80, 27, 77, 28, 25, 82, 89, 75, 85, 20, 16, 15, 78, 12, 17, 9, 76, 13, 79, 73, 11, 74, 70, 14, 10, 6, 72, 7, 5, 4, 66, 68, 8, 0, 69, 65, 71, 67, 2, 3, 1, 64], [43, 98, 93, 107, 103, 46, 116, 121, 24, 81, 78, 51, 21, 12, 109, 62, 8, 63, 19, 120, 26, 7, 117, 52, 29, 69, 74, 127, 68, 58, 38, 70, 92, 119, 73, 65, 90, 3, 2, 105, 37, 16, 66, 125, 44, 95, 50, 86, 48, 10, 39, 75, 106, 54, 0, 33, 64, 47, 59, 87, 56, 4, 96, 5, 97, 49, 82, 53, 71, 41, 101, 45, 84, 67, 18, 113, 114, 123, 110, 115, 104, 83, 13, 77, 79, 118, 6, 126, 11, 111, 36, 60, 17, 55, 124, 61, 1, 108, 76, 57, 23, 28, 25, 42, 35, 20, 112, 27, 80, 30, 72, 22, 40, 99, 85, 100, 122, 31, 9, 91, 15, 32, 14, 89, 102, 94, 34, 88], [39, 109, 46, 97, 93, 110, 78, 17, 10, 71, 19, 45, 5, 85, 76, 86, 16, 65, 26, 3, 22, 116, 83, 9, 89, 66, 2, 121, 35, 90, 87, 88, 107, 81, 24, 64, 79, 21, 91, 80, 119, 103, 75, 11, 28, 12, 74, 29, 15, 82, 54, 8, 92, 72, 127, 7, 18, 111, 59, 13, 23, 77, 14, 94, 1, 84, 4, 123, 48, 69, 68, 67, 20, 32, 73, 6, 70, 0, 27, 118, 30, 25, 115, 34, 99, 33, 114, 96, 102, 113, 31, 100, 38, 43, 95, 101, 58, 44, 41, 51, 125, 120, 122, 57, 98, 53, 50, 55, 126, 40, 60, 36, 104, 52, 47, 106, 62, 37, 42, 61, 105, 63, 117, 124, 112, 49, 56, 108], [109, 39, 45, 93, 97, 85, 107, 123, 110, 16, 99, 116, 54, 26, 76, 59, 53, 119, 57, 90, 23, 121, 56, 60, 118, 58, 115, 24, 55, 125, 51, 112, 40, 126, 63, 111, 43, 114, 29, 48, 102, 19, 122, 113, 127, 52, 86, 44, 124, 9, 105, 47, 62, 49, 117, 92, 46, 108, 120, 37, 61, 50, 41, 30, 106, 21, 42, 83, 6, 31, 104, 36, 94, 98, 100, 17, 64, 28, 5, 103, 35, 101, 34, 88, 96, 38, 4, 32, 25, 95, 0, 18, 87, 12, 27, 80, 10, 89, 22, 78, 1, 33, 3, 65, 68, 66, 84, 70, 73, 20, 91, 71, 82, 67, 81, 79, 77, 11, 72, 15, 75, 13, 2, 69, 8, 7, 14, 74], [109, 39, 45, 97, 93, 107, 123, 85, 110, 46, 26, 16, 86, 76, 23, 19, 121, 24, 83, 90, 29, 119, 59, 116, 51, 102, 111, 127, 57, 118, 99, 54, 17, 22, 10, 125, 53, 78, 48, 122, 58, 114, 55, 112, 60, 113, 63, 21, 124, 56, 126, 115, 27, 30, 104, 120, 50, 49, 62, 52, 12, 92, 44, 105, 61, 108, 47, 96, 35, 103, 42, 41, 89, 40, 117, 43, 15, 36, 88, 101, 37, 5, 79, 100, 32, 38, 71, 64, 33, 106, 94, 80, 95, 34, 9, 11, 6, 98, 18, 68, 82, 28, 87, 3, 77, 31, 81, 91, 25, 84, 75, 13, 20, 66, 70, 1, 0, 65, 67, 14, 4, 8, 74, 72, 73, 69, 7, 2], [39, 109, 110, 46, 97, 45, 93, 102, 86, 26, 85, 118, 117, 96, 19, 24, 123, 17, 111, 92, 57, 35, 30, 15, 60, 125, 49, 59, 116, 53, 58, 29, 121, 119, 113, 54, 99, 127, 78, 44, 63, 48, 27, 89, 122, 52, 107, 55, 51, 124, 47, 11, 112, 36, 56, 98, 126, 95, 41, 37, 62, 61, 83, 115, 40, 108, 42, 114, 106, 101, 50, 43, 105, 100, 10, 120, 32, 87, 31, 34, 28, 104, 88, 22, 76, 75, 84, 90, 16, 9, 91, 23, 25, 94, 18, 5, 79, 38, 20, 0, 21, 4, 82, 6, 13, 80, 33, 77, 70, 12, 1, 2, 8, 71, 103, 81, 65, 14, 64, 69, 72, 67, 68, 73, 3, 74, 7, 66], [51, 101, 120, 119, 97, 127, 25, 115, 82, 37, 113, 110, 118, 46, 104, 22, 28, 62, 89, 31, 93, 78, 53, 109, 21, 94, 116, 42, 122, 121, 98, 75, 30, 27, 123, 117, 126, 23, 87, 20, 59, 52, 96, 54, 47, 69, 44, 114, 48, 60, 9, 124, 81, 61, 63, 57, 56, 83, 58, 111, 55, 16, 50, 80, 125, 86, 91, 49, 95, 77, 112, 88, 24, 29, 41, 10, 90, 92, 14, 105, 45, 107, 108, 73, 3, 19, 84, 99, 71, 85, 39, 34, 43, 103, 106, 102, 100, 35, 70, 18, 38, 64, 32, 26, 5, 40, 15, 17, 8, 36, 12, 68, 67, 65, 66, 7, 74, 72, 33, 76, 13, 2, 1, 4, 11, 79, 0, 6], [127, 120, 51, 119, 101, 109, 97, 56, 113, 107, 82, 118, 102, 54, 106, 62, 59, 117, 25, 93, 124, 123, 116, 52, 126, 122, 57, 37, 38, 121, 114, 104, 55, 105, 48, 50, 125, 49, 98, 115, 112, 58, 47, 91, 44, 60, 89, 63, 35, 45, 61, 111, 110, 40, 17, 30, 46, 92, 28, 53, 23, 85, 90, 94, 42, 108, 18, 34, 36, 43, 31, 39, 96, 84, 27, 103, 41, 100, 99, 24, 88, 95, 79, 29, 26, 15, 12, 19, 32, 81, 73, 33, 83, 87, 21, 11, 78, 20, 22, 16, 77, 75, 13, 86, 1, 66, 76, 7, 0, 80, 14, 72, 69, 3, 70, 6, 9, 4, 68, 74, 10, 5, 2, 8, 67, 64, 65, 71], [120, 51, 119, 101, 25, 89, 97, 37, 47, 118, 127, 115, 62, 117, 113, 123, 84, 122, 54, 121, 52, 60, 126, 53, 55, 116, 125, 46, 56, 16, 57, 49, 50, 59, 124, 61, 63, 109, 114, 58, 48, 110, 93, 111, 28, 44, 30, 22, 42, 108, 112, 45, 12, 29, 107, 40, 92, 41, 43, 80, 103, 39, 38, 104, 105, 86, 106, 87, 73, 20, 27, 31, 95, 102, 18, 35, 98, 33, 69, 7, 36, 3, 66, 100, 34, 14, 99, 88, 77, 72, 94, 82, 76, 17, 32, 10, 1, 96, 13, 91, 70, 85, 15, 8, 74, 75, 4, 19, 83, 24, 78, 26, 11, 0, 5, 68, 21, 9, 23, 71, 79, 81, 90, 6, 67, 64, 2, 65], [119, 101, 51, 127, 120, 84, 97, 113, 115, 54, 59, 123, 62, 118, 82, 92, 30, 50, 37, 46, 27, 124, 126, 89, 25, 28, 43, 47, 110, 125, 61, 52, 45, 121, 40, 109, 53, 100, 122, 42, 102, 18, 107, 116, 58, 60, 49, 104, 48, 35, 15, 57, 114, 44, 63, 117, 95, 56, 105, 91, 96, 36, 22, 111, 108, 77, 106, 112, 73, 55, 90, 34, 41, 32, 99, 20, 38, 33, 39, 103, 98, 12, 79, 19, 26, 94, 86, 17, 93, 31, 81, 88, 29, 24, 87, 11, 13, 16, 21, 23, 74, 83, 76, 80, 10, 85, 14, 75, 72, 9, 71, 69, 6, 78, 70, 7, 68, 8, 0, 5, 67, 2, 4, 1, 3, 66, 65, 64], [110, 50, 37, 46, 26, 38, 88, 114, 41, 58, 98, 30, 36, 82, 53, 94, 90, 85, 97, 28, 34, 84, 117, 109, 100, 122, 20, 96, 123, 86, 57, 87, 104, 39, 51, 21, 25, 111, 81, 126, 120, 102, 99, 62, 22, 61, 113, 52, 45, 76, 112, 48, 116, 115, 27, 95, 29, 43, 78, 44, 79, 42, 103, 59, 47, 101, 108, 17, 105, 107, 83, 18, 121, 127, 106, 23, 31, 56, 60, 119, 54, 40, 125, 49, 124, 63, 10, 33, 91, 35, 32, 89, 55, 24, 92, 16, 4, 93, 118, 64, 75, 6, 66, 13, 72, 65, 15, 80, 77, 7, 74, 11, 73, 14, 19, 5, 12, 3, 67, 70, 68, 71, 8, 1, 9, 69, 2, 0], [110, 50, 46, 37, 58, 26, 97, 88, 41, 84, 96, 62, 114, 90, 106, 109, 30, 78, 126, 20, 92, 85, 86, 81, 34, 28, 87, 53, 123, 94, 99, 83, 104, 91, 23, 122, 38, 117, 57, 31, 93, 39, 10, 124, 63, 98, 112, 11, 21, 42, 51, 56, 48, 127, 61, 125, 17, 108, 111, 107, 116, 29, 16, 105, 115, 3, 24, 52, 118, 49, 121, 60, 7, 55, 6, 113, 70, 43, 54, 103, 47, 45, 36, 68, 102, 59, 80, 4, 32, 33, 120, 25, 95, 119, 100, 64, 44, 76, 40, 89, 77, 27, 19, 14, 82, 15, 18, 73, 35, 0, 75, 5, 79, 66, 74, 71, 72, 101, 22, 8, 9, 12, 65, 67, 13, 1, 69, 2], [110, 50, 37, 58, 46, 26, 41, 98, 97, 30, 109, 87, 122, 90, 84, 88, 94, 31, 85, 62, 60, 81, 28, 125, 78, 29, 63, 35, 20, 91, 36, 43, 126, 48, 17, 86, 121, 124, 61, 83, 51, 32, 102, 100, 54, 15, 118, 117, 114, 123, 52, 93, 92, 47, 45, 127, 56, 11, 112, 111, 113, 40, 107, 120, 116, 4, 103, 22, 27, 108, 106, 55, 104, 53, 24, 42, 101, 76, 49, 99, 89, 21, 115, 34, 96, 16, 38, 57, 33, 44, 39, 66, 6, 18, 59, 75, 119, 7, 95, 19, 79, 25, 105, 23, 8, 80, 72, 77, 10, 73, 14, 64, 74, 82, 70, 12, 5, 3, 65, 13, 67, 69, 71, 9, 2, 68, 1, 0], [110, 50, 37, 46, 58, 26, 97, 84, 30, 41, 109, 88, 28, 78, 90, 96, 117, 94, 126, 17, 47, 20, 91, 87, 85, 53, 106, 108, 81, 114, 29, 122, 105, 93, 86, 107, 11, 31, 111, 48, 34, 23, 40, 95, 45, 57, 43, 124, 16, 92, 119, 123, 49, 59, 83, 51, 15, 21, 118, 62, 63, 112, 104, 55, 98, 36, 24, 56, 116, 4, 61, 10, 127, 113, 44, 65, 54, 60, 121, 18, 103, 5, 120, 125, 39, 77, 52, 38, 100, 72, 115, 35, 89, 102, 76, 6, 99, 33, 42, 32, 74, 79, 75, 7, 70, 3, 27, 80, 73, 66, 14, 22, 25, 82, 12, 13, 101, 71, 0, 19, 8, 68, 9, 64, 67, 2, 1, 69], [125, 48, 112, 37, 53, 124, 46, 126, 61, 117, 100, 113, 121, 54, 51, 116, 57, 56, 59, 58, 114, 34, 120, 127, 118, 52, 60, 122, 123, 50, 49, 63, 62, 55, 110, 115, 97, 111, 119, 47, 109, 45, 108, 42, 44, 102, 43, 105, 103, 107, 41, 99, 104, 26, 39, 40, 106, 32, 35, 84, 38, 101, 86, 90, 96, 94, 95, 82, 88, 36, 28, 33, 30, 98, 31, 29, 87, 22, 93, 73, 14, 76, 20, 23, 16, 92, 18, 24, 78, 25, 80, 27, 7, 89, 5, 12, 91, 3, 65, 69, 9, 85, 1, 71, 83, 0, 67, 17, 21, 64, 2, 4, 15, 66, 19, 70, 10, 11, 13, 75, 77, 68, 6, 8, 74, 81, 72, 79], [48, 100, 46, 112, 25, 85, 125, 17, 34, 87, 14, 15, 83, 27, 86, 80, 96, 12, 10, 40, 29, 84, 97, 98, 103, 13, 94, 95, 37, 93, 16, 9, 22, 75, 24, 99, 8, 78, 42, 124, 73, 28, 32, 110, 69, 6, 3, 36, 77, 118, 20, 117, 11, 81, 68, 26, 21, 5, 92, 71, 74, 76, 18, 30, 82, 90, 126, 88, 50, 121, 89, 23, 79, 47, 38, 19, 0, 91, 39, 105, 1, 31, 70, 7, 115, 35, 2, 41, 116, 67, 45, 54, 114, 65, 101, 120, 53, 4, 102, 63, 104, 61, 109, 43, 64, 72, 58, 113, 33, 57, 127, 59, 56, 119, 62, 123, 52, 49, 66, 108, 106, 111, 122, 51, 55, 107, 44, 60], [48, 112, 27, 100, 46, 85, 83, 34, 25, 97, 17, 13, 15, 125, 75, 8, 28, 87, 86, 32, 37, 30, 29, 4, 12, 10, 18, 72, 124, 68, 66, 6, 96, 16, 2, 24, 77, 40, 47, 91, 33, 20, 76, 93, 26, 70, 71, 7, 31, 84, 95, 14, 90, 22, 88, 9, 36, 0, 23, 82, 39, 121, 89, 126, 81, 21, 11, 79, 38, 92, 94, 74, 118, 73, 1, 99, 54, 98, 43, 103, 19, 78, 69, 110, 114, 117, 35, 80, 115, 5, 65, 102, 67, 53, 113, 3, 45, 59, 57, 101, 56, 64, 116, 50, 105, 44, 42, 123, 111, 41, 58, 61, 52, 49, 63, 62, 104, 120, 107, 60, 119, 51, 109, 108, 127, 55, 122, 106], [46, 48, 112, 125, 121, 37, 124, 54, 126, 57, 56, 117, 62, 53, 113, 59, 120, 58, 116, 61, 114, 127, 122, 49, 123, 51, 52, 63, 118, 115, 60, 110, 55, 50, 111, 119, 45, 47, 109, 43, 44, 100, 108, 102, 104, 106, 103, 107, 41, 39, 42, 105, 40, 38, 96, 34, 101, 98, 99, 97, 33, 32, 36, 95, 86, 35, 90, 88, 28, 22, 94, 84, 82, 26, 31, 30, 92, 29, 93, 25, 89, 65, 91, 87, 20, 78, 18, 7, 16, 24, 80, 27, 76, 14, 23, 0, 3, 12, 85, 67, 73, 2, 69, 71, 5, 83, 9, 1, 21, 17, 70, 4, 81, 19, 10, 64, 13, 15, 66, 8, 75, 79, 68, 74, 72, 77, 11, 6], [116, 122, 103, 57, 126, 55, 119, 118, 40, 127, 54, 51, 59, 100, 120, 114, 125, 124, 121, 61, 115, 112, 111, 50, 123, 85, 63, 53, 62, 113, 108, 56, 47, 117, 60, 30, 48, 58, 52, 46, 104, 49, 109, 42, 33, 110, 44, 102, 45, 106, 41, 43, 107, 29, 105, 38, 36, 24, 37, 27, 34, 99, 101, 35, 25, 32, 96, 98, 19, 31, 94, 95, 97, 39, 93, 23, 78, 28, 91, 87, 83, 21, 14, 22, 92, 89, 17, 86, 90, 26, 10, 1, 72, 0, 88, 8, 5, 3, 4, 16, 81, 80, 74, 75, 68, 13, 71, 64, 67, 66, 6, 18, 84, 69, 12, 2, 70, 7, 65, 77, 82, 9, 76, 79, 15, 11, 73, 20], [122, 116, 103, 55, 54, 120, 50, 40, 114, 118, 33, 53, 124, 24, 126, 100, 115, 59, 127, 19, 113, 117, 31, 125, 63, 121, 61, 57, 119, 112, 111, 60, 123, 56, 62, 58, 46, 110, 52, 51, 30, 102, 47, 42, 49, 48, 108, 27, 85, 74, 106, 109, 104, 22, 94, 80, 25, 44, 35, 107, 36, 45, 93, 43, 91, 41, 105, 32, 29, 38, 37, 84, 101, 28, 34, 26, 20, 98, 89, 82, 86, 23, 99, 97, 87, 96, 90, 21, 95, 83, 15, 72, 92, 88, 39, 5, 12, 0, 14, 18, 16, 78, 77, 3, 17, 73, 71, 6, 79, 76, 81, 68, 13, 10, 70, 9, 4, 2, 8, 66, 69, 64, 67, 75, 1, 65, 11, 7], [116, 122, 103, 55, 118, 54, 126, 50, 127, 61, 114, 57, 120, 112, 119, 121, 125, 115, 124, 62, 117, 40, 123, 33, 52, 53, 59, 60, 56, 113, 58, 104, 63, 111, 51, 100, 34, 91, 47, 49, 26, 36, 48, 93, 22, 110, 19, 46, 42, 44, 45, 25, 109, 102, 108, 24, 27, 107, 21, 106, 41, 31, 43, 80, 101, 94, 37, 35, 99, 38, 105, 29, 85, 28, 98, 86, 32, 39, 97, 96, 95, 89, 30, 92, 23, 78, 90, 74, 16, 14, 83, 82, 17, 88, 84, 87, 3, 18, 77, 12, 15, 10, 11, 1, 9, 68, 4, 81, 66, 0, 8, 20, 65, 13, 6, 79, 67, 70, 64, 71, 75, 69, 5, 2, 72, 73, 76, 7], [103, 116, 122, 93, 100, 84, 82, 55, 81, 33, 88, 15, 22, 75, 91, 87, 77, 52, 24, 11, 118, 79, 17, 71, 50, 26, 31, 119, 102, 126, 27, 40, 120, 13, 57, 32, 54, 89, 29, 23, 20, 34, 25, 90, 12, 21, 28, 72, 83, 125, 115, 121, 14, 18, 86, 59, 68, 94, 114, 61, 112, 85, 117, 124, 113, 127, 53, 96, 92, 123, 63, 8, 30, 111, 80, 58, 76, 62, 16, 110, 46, 47, 51, 7, 56, 104, 78, 69, 9, 10, 19, 60, 73, 97, 67, 95, 5, 37, 44, 49, 4, 66, 3, 108, 48, 99, 6, 74, 43, 109, 42, 70, 45, 98, 38, 36, 0, 107, 35, 106, 1, 64, 2, 105, 41, 101, 65, 39], [102, 113, 56, 97, 23, 49, 79, 29, 77, 81, 20, 75, 10, 72, 6, 3, 22, 16, 18, 32, 67, 38, 28, 15, 34, 5, 9, 88, 48, 17, 112, 65, 115, 92, 25, 70, 26, 74, 53, 78, 0, 39, 69, 86, 84, 63, 87, 64, 11, 31, 12, 105, 83, 19, 89, 66, 58, 117, 57, 80, 30, 85, 21, 13, 91, 93, 73, 8, 55, 82, 90, 52, 100, 41, 27, 1, 44, 7, 95, 96, 103, 36, 76, 94, 59, 99, 24, 119, 121, 14, 51, 50, 116, 35, 118, 33, 47, 60, 106, 101, 110, 46, 122, 2, 98, 123, 45, 62, 40, 109, 4, 42, 71, 37, 126, 104, 54, 124, 68, 61, 43, 114, 120, 125, 107, 108, 127, 111], [102, 56, 113, 97, 29, 49, 23, 20, 79, 77, 75, 81, 10, 3, 6, 72, 22, 69, 0, 30, 18, 65, 88, 25, 53, 84, 74, 13, 70, 17, 87, 86, 59, 26, 93, 71, 1, 89, 64, 66, 15, 76, 52, 80, 92, 11, 119, 73, 24, 4, 95, 96, 8, 31, 34, 90, 91, 14, 33, 19, 32, 28, 38, 62, 58, 82, 9, 41, 36, 112, 94, 110, 12, 83, 78, 16, 5, 121, 27, 57, 21, 103, 7, 48, 115, 50, 63, 117, 85, 35, 99, 2, 101, 43, 98, 39, 100, 67, 125, 60, 68, 45, 123, 122, 105, 51, 126, 44, 54, 127, 47, 61, 37, 124, 114, 106, 55, 108, 111, 40, 120, 46, 42, 107, 109, 104, 118, 116], [113, 102, 56, 121, 48, 49, 97, 39, 29, 22, 44, 126, 16, 115, 52, 50, 60, 59, 124, 118, 51, 117, 123, 38, 57, 119, 63, 54, 47, 112, 28, 114, 127, 58, 46, 122, 90, 62, 120, 110, 53, 125, 43, 55, 40, 109, 104, 61, 116, 108, 24, 41, 37, 111, 105, 103, 107, 32, 101, 106, 42, 45, 91, 23, 20, 26, 36, 100, 86, 25, 34, 98, 81, 99, 88, 80, 96, 93, 31, 94, 35, 85, 14, 30, 27, 95, 92, 83, 76, 33, 18, 78, 82, 9, 89, 19, 21, 12, 73, 77, 75, 69, 84, 10, 71, 79, 7, 4, 2, 68, 87, 17, 66, 5, 72, 64, 65, 0, 3, 1, 6, 15, 11, 13, 67, 8, 70, 74], [56, 102, 113, 48, 97, 49, 29, 23, 41, 22, 117, 52, 28, 115, 16, 46, 36, 20, 114, 103, 81, 86, 121, 44, 31, 50, 53, 63, 105, 58, 25, 77, 39, 119, 51, 106, 126, 91, 54, 90, 76, 78, 27, 60, 89, 55, 59, 123, 62, 92, 79, 118, 10, 38, 107, 104, 42, 120, 122, 109, 108, 124, 72, 69, 112, 14, 12, 127, 47, 116, 57, 110, 33, 43, 19, 111, 93, 32, 83, 24, 40, 101, 45, 125, 30, 61, 80, 75, 37, 35, 99, 82, 84, 26, 98, 88, 18, 34, 85, 96, 100, 9, 94, 21, 3, 64, 95, 66, 68, 71, 5, 7, 74, 1, 0, 17, 73, 6, 4, 65, 2, 87, 13, 70, 15, 67, 11, 8]], "model.layers.20.self_attn.k_proj": [[126, 114, 46, 37, 86, 34, 50, 64, 124, 28, 12, 16, 25, 51, 19, 66, 119, 57, 120, 7, 60, 58, 115, 82, 53, 127, 123, 62, 116, 61, 74, 122, 14, 113, 112, 9, 117, 63, 56, 125, 54, 121, 55, 47, 118, 48, 44, 111, 43, 79, 59, 49, 13, 45, 109, 29, 39, 101, 95, 6, 52, 107, 24, 3, 41, 36, 42, 108, 102, 104, 87, 105, 106, 40, 94, 103, 21, 38, 98, 4, 33, 90, 27, 32, 23, 8, 35, 72, 96, 5, 1, 99, 15, 100, 91, 80, 30, 110, 77, 97, 89, 84, 20, 93, 70, 88, 17, 2, 31, 11, 18, 65, 68, 26, 81, 75, 92, 69, 76, 85, 78, 73, 67, 0, 10, 83, 71, 22], [107, 34, 43, 29, 116, 51, 90, 78, 81, 8, 127, 19, 31, 70, 12, 56, 0, 110, 7, 120, 87, 32, 68, 61, 63, 3, 74, 62, 75, 54, 2, 39, 121, 88, 21, 66, 100, 24, 92, 73, 85, 58, 46, 95, 28, 86, 109, 45, 52, 59, 69, 77, 126, 117, 37, 1, 48, 113, 57, 36, 33, 47, 42, 114, 53, 27, 64, 4, 11, 84, 99, 18, 15, 5, 50, 119, 108, 97, 112, 65, 82, 80, 124, 44, 125, 16, 30, 71, 102, 22, 25, 41, 122, 40, 94, 60, 118, 105, 55, 101, 104, 10, 79, 13, 106, 38, 89, 115, 20, 26, 49, 9, 123, 111, 23, 35, 91, 72, 76, 96, 17, 14, 67, 93, 83, 6, 103, 98], [45, 103, 46, 33, 29, 109, 86, 19, 85, 123, 90, 119, 5, 10, 16, 64, 78, 57, 116, 76, 71, 52, 17, 112, 54, 126, 127, 121, 65, 3, 53, 115, 125, 51, 122, 107, 55, 48, 113, 58, 56, 114, 118, 59, 120, 60, 24, 124, 9, 2, 47, 44, 61, 63, 34, 111, 89, 62, 108, 106, 67, 13, 41, 11, 49, 32, 117, 100, 43, 99, 50, 72, 35, 38, 101, 104, 6, 31, 69, 105, 23, 42, 37, 15, 18, 88, 30, 95, 82, 87, 36, 91, 94, 102, 25, 84, 92, 75, 0, 40, 110, 27, 93, 98, 14, 1, 77, 96, 28, 20, 8, 4, 73, 79, 66, 81, 26, 83, 68, 22, 7, 70, 74, 39, 12, 97, 80, 21], [51, 37, 119, 120, 127, 33, 22, 46, 122, 123, 121, 50, 118, 56, 45, 124, 113, 52, 126, 61, 116, 53, 62, 117, 48, 125, 106, 115, 54, 55, 60, 57, 30, 59, 58, 63, 40, 89, 91, 49, 108, 114, 101, 47, 110, 104, 111, 112, 41, 109, 80, 44, 102, 103, 43, 42, 107, 38, 105, 39, 16, 92, 93, 95, 23, 36, 32, 82, 35, 34, 31, 21, 84, 29, 25, 86, 99, 96, 100, 28, 77, 76, 26, 98, 87, 88, 19, 72, 12, 27, 90, 94, 24, 83, 70, 78, 15, 85, 10, 75, 97, 7, 9, 74, 81, 14, 17, 13, 68, 3, 79, 11, 5, 2, 20, 18, 73, 65, 6, 4, 71, 64, 67, 8, 69, 0, 1, 66], [46, 50, 110, 101, 26, 30, 58, 20, 114, 85, 78, 33, 17, 4, 66, 88, 11, 76, 79, 41, 62, 7, 72, 126, 64, 75, 5, 53, 122, 70, 6, 3, 19, 34, 87, 73, 18, 23, 91, 86, 15, 68, 83, 16, 32, 77, 60, 84, 81, 49, 43, 10, 65, 117, 100, 118, 105, 127, 24, 112, 109, 31, 51, 92, 45, 67, 61, 120, 48, 35, 57, 63, 56, 29, 40, 107, 96, 25, 89, 22, 27, 113, 119, 125, 82, 52, 80, 124, 55, 116, 21, 93, 0, 1, 115, 121, 103, 108, 44, 54, 106, 99, 28, 74, 38, 123, 95, 13, 104, 39, 47, 59, 98, 36, 102, 111, 42, 8, 97, 14, 9, 37, 12, 2, 90, 94, 71, 69], [112, 48, 36, 86, 15, 17, 125, 110, 83, 75, 25, 10, 27, 33, 13, 29, 85, 70, 32, 30, 8, 124, 117, 46, 56, 126, 121, 54, 84, 82, 115, 61, 28, 57, 53, 2, 116, 98, 0, 47, 4, 62, 90, 120, 59, 51, 80, 63, 102, 60, 114, 87, 123, 35, 127, 44, 113, 50, 109, 52, 122, 118, 49, 58, 92, 105, 111, 55, 41, 119, 77, 45, 11, 104, 26, 107, 42, 73, 43, 38, 6, 103, 101, 106, 37, 31, 14, 108, 40, 74, 79, 65, 99, 7, 88, 16, 34, 23, 72, 39, 12, 76, 96, 95, 67, 21, 93, 78, 3, 24, 20, 89, 91, 9, 69, 94, 97, 100, 19, 18, 68, 71, 81, 5, 66, 22, 1, 64], [122, 116, 39, 22, 55, 97, 54, 61, 126, 118, 52, 117, 112, 124, 57, 121, 50, 123, 120, 115, 62, 113, 127, 27, 58, 56, 114, 53, 59, 63, 51, 95, 119, 98, 60, 26, 34, 49, 125, 111, 47, 110, 104, 45, 42, 44, 46, 82, 109, 48, 24, 107, 108, 80, 32, 38, 35, 41, 43, 84, 102, 37, 106, 16, 105, 101, 12, 36, 96, 40, 14, 99, 85, 92, 23, 15, 29, 9, 33, 94, 90, 17, 77, 19, 30, 31, 93, 100, 28, 10, 71, 75, 103, 81, 87, 25, 91, 89, 20, 79, 78, 76, 70, 67, 83, 18, 11, 8, 72, 1, 88, 13, 69, 86, 74, 5, 3, 21, 73, 0, 6, 66, 7, 4, 64, 2, 68, 65], [113, 56, 38, 33, 22, 93, 81, 23, 79, 72, 77, 20, 6, 10, 75, 112, 0, 3, 120, 69, 117, 27, 57, 65, 16, 49, 52, 115, 103, 54, 59, 110, 50, 126, 46, 122, 51, 28, 53, 60, 124, 26, 58, 55, 61, 119, 125, 102, 118, 105, 62, 123, 5, 121, 48, 95, 116, 2, 94, 47, 63, 45, 127, 83, 108, 30, 114, 99, 109, 106, 9, 107, 66, 100, 89, 67, 32, 104, 44, 43, 35, 21, 14, 76, 40, 42, 18, 96, 78, 85, 88, 24, 101, 39, 111, 34, 73, 68, 41, 25, 4, 36, 31, 37, 29, 90, 98, 12, 92, 1, 80, 19, 7, 71, 74, 82, 64, 91, 87, 84, 8, 17, 70, 11, 13, 15, 86, 97]], "model.layers.20.self_attn.qk_proj": [[46, 113, 56, 116, 122, 112, 126, 48, 110, 51, 114, 45, 107, 43, 119, 120, 50, 109, 127, 37, 93, 125, 121, 54, 49, 22, 52, 123, 86, 101, 58, 39, 102, 118, 55, 29, 115, 53, 57, 90, 63, 124, 60, 62, 103, 26, 33, 92, 61, 97, 59, 17, 81, 34, 23, 83, 38, 19, 84, 89, 47, 117, 91, 87, 85, 24, 14, 111, 44, 32, 108, 41, 78, 98, 21, 16, 74, 30, 80, 88, 25, 20, 105, 36, 79, 100, 75, 77, 10, 15, 106, 95, 11, 12, 13, 28, 104, 27, 76, 31, 94, 72, 6, 82, 40, 18, 8, 96, 35, 7, 42, 67, 71, 3, 66, 9, 64, 0, 99, 73, 5, 2, 70, 4, 65, 1, 69, 68], [46, 113, 56, 116, 126, 112, 122, 48, 110, 45, 51, 114, 120, 107, 119, 43, 50, 109, 127, 93, 37, 125, 121, 101, 49, 52, 58, 123, 86, 22, 54, 102, 39, 29, 53, 103, 115, 55, 60, 97, 61, 90, 118, 117, 63, 34, 57, 59, 33, 124, 26, 62, 38, 81, 92, 23, 85, 87, 83, 24, 47, 89, 19, 17, 30, 98, 91, 84, 14, 41, 78, 16, 88, 32, 111, 21, 80, 36, 75, 95, 20, 76, 100, 44, 25, 108, 10, 77, 15, 12, 11, 74, 104, 79, 105, 106, 13, 42, 28, 94, 96, 6, 27, 72, 82, 31, 40, 9, 8, 64, 18, 35, 3, 71, 66, 2, 7, 0, 99, 70, 5, 68, 67, 1, 73, 69, 65, 4], [46, 113, 116, 56, 126, 112, 48, 122, 110, 51, 114, 45, 119, 107, 43, 120, 50, 109, 127, 93, 125, 37, 121, 22, 86, 101, 49, 54, 52, 58, 55, 39, 29, 63, 102, 123, 103, 115, 33, 118, 60, 90, 53, 124, 61, 97, 62, 92, 26, 34, 57, 59, 30, 81, 17, 23, 24, 83, 38, 117, 47, 14, 85, 87, 44, 88, 19, 21, 41, 84, 111, 108, 98, 91, 32, 95, 78, 100, 89, 80, 25, 105, 16, 28, 36, 75, 104, 74, 79, 31, 20, 15, 77, 106, 12, 27, 76, 13, 94, 10, 82, 72, 18, 11, 64, 35, 96, 67, 0, 42, 8, 6, 9, 40, 70, 99, 3, 66, 7, 69, 2, 5, 71, 68, 73, 4, 65, 1], [46, 113, 116, 56, 126, 122, 112, 48, 110, 51, 114, 45, 107, 120, 119, 43, 109, 50, 127, 125, 93, 37, 49, 22, 101, 121, 54, 58, 55, 53, 39, 102, 115, 86, 63, 29, 60, 103, 52, 61, 97, 90, 57, 118, 62, 34, 124, 33, 117, 26, 59, 92, 47, 17, 81, 123, 19, 14, 87, 30, 83, 38, 75, 23, 24, 78, 88, 16, 80, 85, 95, 20, 98, 106, 89, 91, 84, 21, 44, 111, 31, 79, 15, 105, 25, 12, 100, 32, 28, 74, 42, 41, 104, 13, 27, 76, 36, 94, 10, 11, 77, 82, 96, 108, 67, 72, 40, 18, 3, 8, 2, 70, 7, 66, 64, 0, 5, 71, 6, 35, 73, 69, 9, 99, 68, 4, 65, 1], [46, 113, 116, 56, 122, 126, 48, 112, 110, 51, 114, 45, 120, 107, 119, 43, 109, 50, 127, 125, 93, 49, 37, 54, 121, 22, 86, 39, 115, 58, 101, 63, 123, 55, 29, 61, 103, 53, 102, 57, 52, 60, 59, 62, 26, 124, 97, 81, 90, 118, 34, 33, 47, 117, 19, 17, 98, 38, 23, 87, 78, 83, 84, 24, 92, 30, 85, 20, 80, 21, 75, 16, 111, 89, 79, 91, 25, 95, 14, 88, 105, 74, 44, 15, 12, 104, 40, 36, 10, 108, 31, 28, 41, 77, 100, 106, 76, 32, 27, 13, 11, 82, 42, 72, 70, 73, 18, 94, 71, 8, 7, 35, 0, 3, 96, 66, 64, 99, 67, 9, 69, 2, 6, 5, 68, 65, 4, 1], [46, 113, 116, 56, 122, 126, 112, 48, 110, 51, 45, 114, 119, 120, 107, 43, 50, 109, 127, 93, 125, 49, 37, 22, 86, 121, 58, 54, 101, 39, 60, 115, 29, 123, 102, 52, 97, 90, 63, 124, 53, 117, 26, 57, 81, 103, 118, 34, 19, 59, 55, 61, 84, 16, 87, 80, 85, 62, 33, 17, 23, 47, 79, 78, 98, 38, 92, 24, 89, 14, 20, 111, 30, 12, 83, 44, 75, 31, 88, 95, 104, 91, 15, 25, 21, 10, 74, 76, 32, 28, 11, 77, 100, 36, 105, 82, 13, 108, 106, 41, 27, 72, 96, 8, 70, 40, 42, 94, 18, 73, 9, 3, 67, 7, 0, 64, 71, 35, 99, 66, 6, 5, 2, 69, 4, 65, 1, 68], [46, 113, 56, 116, 48, 126, 112, 110, 122, 114, 45, 51, 120, 119, 109, 107, 43, 50, 127, 125, 93, 37, 22, 86, 49, 39, 102, 101, 121, 58, 29, 123, 115, 57, 54, 60, 103, 90, 34, 118, 26, 117, 55, 52, 63, 53, 33, 61, 124, 87, 19, 97, 17, 23, 16, 84, 24, 47, 81, 62, 85, 59, 38, 92, 83, 91, 88, 30, 98, 80, 20, 79, 78, 14, 44, 21, 89, 25, 15, 76, 104, 32, 36, 75, 100, 108, 40, 10, 12, 28, 31, 95, 27, 41, 74, 77, 13, 111, 11, 42, 105, 106, 82, 18, 94, 35, 9, 96, 8, 72, 70, 99, 7, 73, 64, 71, 67, 3, 0, 6, 66, 2, 69, 5, 4, 68, 1, 65], [46, 113, 56, 116, 126, 48, 112, 110, 122, 51, 45, 114, 120, 119, 107, 43, 109, 50, 127, 125, 93, 22, 37, 86, 101, 121, 123, 102, 49, 58, 39, 29, 53, 57, 117, 54, 55, 60, 90, 62, 115, 118, 103, 124, 52, 33, 26, 87, 34, 63, 91, 92, 59, 24, 81, 23, 61, 19, 97, 83, 85, 38, 30, 17, 47, 98, 89, 21, 111, 80, 108, 41, 88, 32, 36, 84, 25, 15, 14, 20, 79, 104, 27, 16, 10, 44, 31, 78, 28, 40, 100, 95, 77, 94, 75, 12, 42, 13, 76, 18, 74, 11, 105, 106, 82, 99, 35, 70, 8, 96, 7, 72, 64, 3, 9, 0, 71, 66, 73, 69, 2, 6, 67, 5, 68, 1, 4, 65], [46, 113, 56, 126, 116, 48, 112, 122, 110, 51, 45, 114, 120, 107, 119, 50, 43, 109, 127, 93, 125, 49, 37, 121, 101, 22, 123, 58, 102, 29, 86, 55, 117, 60, 39, 26, 53, 63, 54, 61, 57, 62, 103, 90, 52, 118, 115, 34, 33, 124, 92, 17, 81, 38, 97, 47, 85, 87, 83, 23, 78, 59, 24, 30, 21, 91, 36, 88, 32, 84, 19, 89, 14, 16, 20, 104, 41, 111, 15, 98, 25, 44, 80, 79, 31, 100, 95, 27, 28, 11, 10, 105, 77, 94, 76, 12, 40, 75, 108, 13, 106, 74, 18, 96, 42, 8, 35, 7, 82, 99, 72, 3, 70, 0, 2, 73, 6, 67, 64, 9, 71, 69, 66, 68, 1, 4, 5, 65], [46, 113, 116, 56, 126, 48, 112, 122, 110, 51, 45, 114, 120, 107, 43, 119, 50, 109, 127, 93, 37, 125, 121, 49, 58, 22, 54, 52, 123, 101, 117, 86, 102, 53, 55, 124, 29, 57, 115, 39, 63, 118, 61, 62, 103, 90, 34, 97, 60, 38, 26, 17, 33, 92, 81, 47, 78, 87, 85, 59, 23, 19, 83, 88, 24, 89, 25, 30, 32, 98, 10, 16, 80, 21, 41, 84, 14, 111, 11, 95, 100, 15, 36, 44, 91, 77, 79, 104, 12, 75, 28, 40, 31, 20, 27, 8, 74, 76, 106, 96, 13, 105, 42, 108, 94, 7, 6, 71, 0, 72, 3, 18, 35, 2, 64, 9, 82, 70, 73, 69, 67, 66, 99, 68, 5, 4, 1, 65], [46, 113, 56, 116, 122, 126, 48, 110, 112, 51, 114, 45, 120, 107, 43, 119, 50, 109, 127, 125, 93, 37, 121, 49, 115, 22, 58, 101, 54, 86, 118, 123, 39, 63, 52, 124, 53, 62, 102, 97, 29, 103, 55, 26, 57, 61, 60, 90, 81, 17, 19, 117, 33, 111, 87, 84, 92, 34, 38, 83, 85, 78, 16, 15, 47, 25, 79, 24, 14, 80, 95, 30, 11, 59, 23, 21, 44, 75, 20, 88, 10, 91, 98, 12, 104, 77, 32, 28, 106, 6, 76, 8, 74, 36, 100, 82, 72, 89, 105, 27, 31, 41, 108, 13, 18, 40, 94, 42, 7, 64, 96, 2, 73, 71, 0, 3, 9, 35, 67, 69, 66, 99, 5, 70, 4, 1, 65, 68], [46, 113, 56, 116, 48, 126, 110, 122, 112, 51, 45, 114, 120, 107, 119, 43, 50, 109, 125, 127, 93, 37, 22, 86, 49, 101, 58, 102, 121, 39, 29, 115, 53, 54, 123, 63, 117, 34, 55, 52, 90, 62, 26, 103, 61, 17, 19, 124, 60, 97, 33, 118, 81, 57, 83, 38, 85, 87, 59, 84, 92, 23, 78, 47, 15, 98, 16, 80, 24, 14, 10, 30, 88, 91, 20, 100, 79, 12, 21, 44, 111, 36, 25, 32, 11, 95, 31, 89, 104, 75, 28, 41, 77, 76, 27, 74, 106, 82, 8, 108, 94, 96, 40, 18, 105, 6, 13, 72, 71, 73, 9, 42, 67, 35, 66, 3, 0, 64, 7, 99, 69, 2, 70, 5, 1, 65, 4, 68], [46, 113, 56, 116, 126, 48, 112, 110, 122, 51, 45, 120, 114, 107, 50, 109, 43, 119, 93, 127, 125, 37, 49, 22, 102, 86, 58, 123, 39, 121, 54, 101, 29, 55, 117, 115, 124, 62, 63, 52, 33, 53, 97, 60, 103, 34, 118, 90, 61, 47, 38, 57, 26, 92, 30, 19, 85, 59, 87, 81, 83, 89, 88, 98, 17, 32, 91, 23, 84, 36, 25, 44, 24, 16, 111, 80, 78, 21, 100, 20, 108, 15, 95, 11, 31, 105, 77, 28, 10, 79, 27, 12, 14, 41, 74, 82, 76, 94, 40, 13, 104, 75, 106, 96, 35, 71, 8, 42, 18, 6, 72, 0, 66, 64, 73, 7, 9, 3, 99, 2, 67, 70, 5, 69, 1, 4, 68, 65], [46, 113, 56, 122, 126, 116, 48, 112, 110, 51, 45, 114, 120, 107, 119, 50, 43, 109, 127, 125, 93, 37, 22, 49, 58, 54, 101, 121, 86, 39, 60, 29, 102, 123, 57, 52, 53, 62, 103, 115, 55, 63, 117, 90, 61, 124, 118, 34, 26, 59, 17, 81, 47, 33, 92, 97, 83, 111, 38, 87, 19, 32, 85, 78, 98, 23, 30, 80, 21, 91, 24, 89, 84, 11, 36, 20, 14, 95, 15, 44, 10, 28, 104, 25, 105, 108, 16, 79, 41, 106, 31, 88, 76, 96, 74, 12, 77, 100, 82, 75, 27, 72, 94, 42, 8, 18, 64, 0, 13, 71, 73, 6, 35, 9, 70, 7, 99, 66, 67, 40, 5, 1, 2, 3, 69, 4, 65, 68], [46, 113, 116, 56, 48, 126, 112, 122, 110, 51, 114, 45, 107, 120, 43, 119, 50, 109, 127, 93, 125, 37, 22, 86, 121, 49, 102, 101, 58, 52, 123, 39, 55, 29, 118, 54, 117, 103, 63, 61, 57, 33, 62, 90, 26, 17, 53, 81, 19, 60, 124, 59, 34, 115, 87, 97, 24, 92, 78, 85, 83, 74, 11, 91, 79, 16, 21, 10, 89, 104, 23, 80, 32, 38, 98, 20, 84, 36, 111, 95, 47, 12, 27, 30, 14, 28, 88, 76, 15, 25, 100, 75, 41, 108, 106, 77, 96, 94, 44, 18, 31, 73, 105, 40, 8, 82, 70, 35, 13, 71, 72, 0, 64, 67, 7, 2, 3, 9, 6, 66, 42, 69, 5, 68, 1, 65, 99, 4], [46, 113, 56, 116, 126, 48, 122, 110, 112, 51, 114, 45, 120, 107, 43, 119, 109, 50, 125, 127, 93, 37, 49, 22, 101, 121, 86, 54, 58, 102, 57, 123, 52, 63, 60, 117, 115, 29, 53, 55, 62, 118, 103, 39, 81, 59, 26, 124, 97, 61, 90, 83, 87, 33, 17, 34, 38, 19, 74, 47, 23, 78, 85, 32, 89, 11, 91, 24, 84, 95, 92, 98, 20, 15, 79, 104, 111, 36, 16, 30, 77, 14, 25, 10, 88, 75, 27, 12, 21, 80, 28, 40, 44, 41, 106, 105, 13, 70, 31, 100, 108, 76, 94, 8, 72, 18, 42, 96, 82, 71, 73, 67, 7, 35, 9, 3, 0, 66, 2, 64, 99, 6, 69, 5, 65, 1, 4, 68], [46, 113, 56, 116, 126, 122, 48, 112, 110, 51, 45, 120, 114, 107, 119, 43, 109, 50, 127, 125, 93, 37, 121, 49, 22, 86, 101, 102, 58, 29, 54, 103, 123, 60, 52, 39, 115, 118, 124, 117, 57, 55, 26, 34, 53, 62, 97, 90, 63, 33, 47, 92, 59, 83, 61, 81, 19, 17, 23, 24, 111, 38, 87, 30, 20, 91, 89, 85, 84, 32, 98, 16, 21, 88, 36, 78, 80, 25, 100, 40, 104, 44, 79, 74, 108, 95, 28, 14, 27, 11, 31, 15, 12, 77, 94, 41, 76, 10, 96, 82, 13, 72, 18, 70, 75, 35, 105, 106, 42, 64, 71, 9, 0, 7, 67, 2, 8, 5, 73, 99, 66, 3, 6, 68, 69, 65, 1, 4], [46, 113, 56, 116, 126, 48, 110, 112, 122, 45, 51, 114, 107, 120, 43, 119, 109, 50, 127, 93, 125, 37, 121, 22, 86, 49, 101, 117, 58, 55, 29, 102, 53, 39, 118, 54, 103, 63, 124, 33, 115, 90, 123, 92, 60, 26, 34, 57, 62, 52, 97, 17, 61, 19, 81, 24, 111, 14, 83, 91, 85, 98, 23, 30, 16, 38, 87, 36, 89, 44, 11, 21, 59, 20, 47, 88, 74, 84, 28, 94, 95, 32, 15, 76, 79, 41, 80, 108, 10, 40, 78, 27, 31, 25, 13, 12, 100, 104, 75, 77, 42, 70, 72, 18, 96, 82, 106, 0, 105, 9, 35, 64, 67, 71, 99, 5, 3, 2, 8, 69, 7, 66, 73, 6, 65, 4, 68, 1], [46, 113, 116, 56, 126, 112, 48, 122, 110, 51, 45, 114, 120, 107, 43, 119, 109, 50, 127, 93, 125, 37, 49, 121, 117, 58, 63, 54, 22, 102, 39, 52, 101, 86, 62, 61, 29, 55, 53, 118, 103, 123, 115, 97, 57, 124, 33, 60, 90, 34, 26, 59, 23, 38, 83, 111, 24, 92, 17, 81, 14, 36, 30, 47, 21, 95, 87, 88, 85, 19, 20, 89, 78, 40, 16, 80, 98, 91, 44, 104, 84, 32, 28, 41, 11, 106, 76, 100, 25, 10, 74, 75, 15, 27, 31, 94, 105, 77, 42, 96, 79, 72, 108, 12, 82, 13, 70, 0, 66, 71, 35, 8, 67, 18, 6, 64, 7, 9, 4, 3, 5, 2, 99, 73, 1, 68, 69, 65], [46, 113, 116, 56, 126, 48, 110, 122, 112, 51, 45, 114, 107, 120, 43, 109, 50, 119, 127, 125, 93, 37, 121, 49, 22, 58, 101, 86, 117, 102, 52, 54, 97, 118, 63, 123, 29, 115, 39, 62, 55, 60, 124, 61, 103, 53, 34, 26, 38, 90, 57, 33, 87, 19, 95, 59, 81, 85, 92, 47, 24, 23, 83, 32, 98, 80, 11, 111, 84, 17, 16, 30, 89, 20, 78, 88, 91, 36, 14, 25, 21, 74, 104, 106, 15, 79, 100, 108, 40, 76, 28, 94, 12, 31, 10, 13, 72, 96, 44, 41, 27, 75, 77, 42, 8, 35, 6, 18, 71, 9, 0, 7, 82, 66, 73, 70, 105, 67, 64, 99, 3, 2, 4, 68, 69, 5, 1, 65], [46, 113, 116, 56, 126, 48, 110, 122, 112, 51, 114, 45, 107, 43, 120, 119, 109, 50, 127, 125, 93, 49, 37, 121, 22, 58, 86, 117, 54, 53, 123, 115, 63, 101, 124, 52, 118, 29, 39, 102, 55, 60, 26, 57, 103, 34, 90, 62, 97, 33, 81, 92, 78, 61, 87, 16, 38, 59, 83, 19, 111, 47, 17, 95, 32, 30, 91, 98, 11, 20, 84, 21, 23, 89, 88, 14, 36, 24, 25, 15, 44, 74, 28, 85, 108, 79, 76, 106, 12, 31, 94, 105, 13, 80, 104, 42, 75, 27, 41, 100, 82, 40, 77, 6, 10, 72, 8, 67, 18, 7, 35, 9, 96, 73, 99, 5, 0, 71, 3, 70, 66, 69, 2, 1, 68, 64, 4, 65], [46, 113, 116, 56, 48, 126, 112, 110, 122, 51, 114, 45, 120, 107, 43, 119, 50, 109, 127, 125, 93, 49, 37, 121, 54, 58, 117, 22, 53, 118, 39, 63, 86, 124, 29, 115, 55, 52, 101, 102, 123, 97, 103, 62, 61, 34, 57, 90, 26, 60, 33, 59, 92, 38, 17, 95, 81, 91, 83, 19, 87, 24, 47, 84, 16, 21, 88, 85, 111, 78, 108, 23, 89, 14, 30, 11, 32, 25, 44, 104, 36, 100, 28, 98, 74, 20, 80, 42, 79, 94, 41, 15, 106, 31, 76, 40, 75, 27, 82, 96, 77, 8, 64, 10, 13, 6, 105, 12, 71, 0, 18, 7, 3, 72, 9, 73, 67, 35, 2, 70, 99, 66, 5, 69, 68, 65, 1, 4], [46, 113, 116, 56, 126, 112, 122, 48, 110, 51, 114, 45, 120, 107, 119, 43, 109, 50, 125, 127, 93, 49, 121, 37, 123, 54, 22, 86, 117, 101, 58, 115, 52, 39, 102, 124, 118, 53, 29, 57, 60, 63, 26, 103, 61, 55, 34, 97, 62, 38, 90, 33, 92, 83, 47, 59, 17, 111, 88, 87, 81, 44, 19, 21, 91, 89, 24, 23, 32, 80, 36, 85, 78, 84, 98, 95, 30, 108, 14, 20, 74, 16, 94, 25, 27, 41, 31, 28, 106, 100, 79, 104, 12, 40, 15, 96, 10, 76, 11, 75, 77, 105, 18, 82, 13, 6, 35, 8, 73, 42, 7, 99, 64, 0, 72, 3, 71, 67, 66, 2, 9, 5, 69, 70, 4, 68, 65, 1], [46, 113, 116, 122, 56, 126, 48, 112, 110, 51, 114, 45, 107, 120, 119, 43, 109, 50, 127, 125, 93, 37, 49, 54, 22, 58, 86, 121, 101, 115, 117, 123, 29, 118, 39, 57, 52, 53, 124, 102, 97, 63, 26, 55, 90, 103, 60, 111, 61, 34, 59, 33, 92, 87, 47, 81, 62, 17, 19, 83, 85, 84, 23, 16, 98, 32, 36, 30, 91, 80, 78, 89, 38, 24, 95, 75, 14, 20, 88, 21, 31, 25, 44, 76, 74, 28, 79, 11, 12, 106, 15, 27, 13, 41, 10, 100, 104, 72, 96, 77, 105, 94, 108, 8, 82, 18, 42, 6, 40, 71, 9, 35, 73, 67, 7, 3, 70, 0, 64, 66, 5, 2, 69, 99, 1, 4, 68, 65], [46, 113, 116, 56, 122, 112, 48, 126, 110, 114, 51, 45, 120, 107, 43, 119, 109, 50, 125, 127, 93, 37, 22, 49, 86, 121, 115, 58, 54, 124, 63, 29, 102, 118, 53, 117, 39, 52, 123, 57, 101, 90, 55, 61, 26, 81, 34, 83, 19, 103, 97, 87, 60, 33, 59, 62, 23, 111, 24, 92, 80, 47, 84, 38, 17, 85, 89, 78, 21, 20, 88, 44, 91, 30, 16, 75, 12, 15, 25, 36, 95, 14, 74, 98, 10, 32, 31, 28, 76, 100, 104, 27, 13, 79, 41, 11, 94, 8, 77, 42, 40, 106, 96, 108, 18, 7, 9, 99, 72, 105, 35, 0, 82, 6, 71, 73, 70, 67, 5, 2, 66, 64, 3, 69, 1, 4, 68, 65], [46, 113, 116, 56, 112, 126, 48, 122, 110, 51, 45, 114, 120, 107, 119, 43, 109, 50, 127, 125, 93, 37, 124, 121, 22, 49, 58, 86, 115, 54, 118, 53, 101, 117, 102, 123, 29, 60, 39, 63, 55, 52, 61, 90, 34, 97, 62, 57, 103, 26, 33, 92, 38, 24, 83, 36, 59, 47, 111, 98, 19, 21, 32, 30, 88, 91, 17, 85, 95, 84, 87, 20, 81, 89, 23, 74, 14, 16, 104, 25, 100, 78, 75, 27, 80, 108, 44, 28, 41, 94, 79, 15, 31, 12, 96, 40, 105, 10, 11, 77, 35, 76, 8, 82, 13, 42, 72, 73, 106, 18, 70, 99, 64, 67, 71, 9, 66, 6, 7, 3, 69, 0, 5, 2, 1, 4, 65, 68], [46, 113, 122, 56, 116, 126, 112, 48, 110, 114, 45, 51, 107, 120, 119, 109, 43, 50, 127, 125, 93, 121, 123, 115, 49, 37, 101, 54, 58, 102, 118, 22, 124, 117, 86, 55, 53, 39, 61, 57, 103, 29, 60, 63, 52, 97, 90, 34, 62, 26, 92, 47, 38, 111, 33, 19, 81, 83, 17, 23, 44, 98, 59, 36, 87, 30, 95, 84, 32, 21, 85, 24, 91, 14, 108, 100, 78, 88, 89, 25, 75, 28, 94, 41, 79, 20, 80, 104, 10, 31, 16, 76, 74, 105, 15, 77, 13, 96, 40, 12, 27, 18, 106, 70, 82, 11, 42, 99, 8, 35, 2, 3, 9, 64, 72, 71, 69, 7, 73, 67, 5, 0, 66, 6, 65, 4, 68, 1], [46, 113, 116, 56, 126, 122, 112, 48, 110, 51, 114, 45, 107, 50, 120, 43, 119, 109, 127, 93, 125, 37, 49, 121, 58, 22, 55, 54, 118, 101, 115, 86, 124, 102, 39, 123, 29, 63, 53, 117, 57, 97, 90, 61, 52, 103, 62, 33, 59, 26, 60, 92, 19, 81, 17, 87, 34, 83, 47, 23, 85, 14, 75, 106, 16, 20, 38, 78, 88, 21, 111, 25, 84, 30, 24, 80, 95, 32, 91, 89, 44, 74, 10, 15, 8, 100, 31, 76, 12, 79, 11, 70, 41, 13, 98, 77, 94, 28, 104, 40, 96, 36, 27, 105, 108, 7, 42, 82, 72, 35, 9, 67, 0, 18, 64, 71, 69, 66, 73, 6, 99, 2, 3, 68, 5, 1, 4, 65], [46, 113, 116, 56, 48, 126, 122, 112, 110, 51, 114, 45, 119, 107, 43, 120, 109, 50, 127, 37, 93, 125, 49, 121, 54, 22, 123, 86, 118, 102, 57, 39, 58, 52, 55, 101, 62, 115, 53, 29, 124, 103, 34, 117, 63, 60, 26, 97, 33, 90, 61, 47, 59, 81, 78, 19, 17, 87, 95, 38, 85, 83, 84, 100, 75, 111, 21, 92, 89, 91, 16, 23, 14, 98, 32, 74, 44, 80, 104, 24, 30, 10, 41, 12, 79, 20, 88, 15, 36, 105, 31, 76, 40, 108, 25, 27, 28, 77, 64, 8, 11, 70, 3, 72, 96, 82, 35, 106, 18, 94, 67, 13, 2, 9, 0, 7, 71, 42, 5, 66, 73, 69, 6, 4, 1, 68, 99, 65], [46, 113, 116, 56, 122, 126, 48, 112, 110, 51, 114, 45, 119, 120, 107, 43, 50, 109, 127, 125, 93, 37, 121, 54, 49, 22, 118, 86, 101, 123, 63, 57, 102, 58, 117, 29, 53, 124, 52, 59, 103, 26, 115, 39, 97, 61, 62, 34, 60, 90, 33, 47, 55, 81, 92, 111, 87, 19, 23, 16, 17, 83, 38, 14, 80, 30, 84, 78, 24, 21, 44, 91, 98, 75, 15, 20, 88, 89, 12, 10, 85, 104, 25, 95, 79, 36, 32, 100, 31, 74, 40, 105, 13, 28, 76, 41, 77, 11, 27, 8, 106, 96, 108, 72, 18, 9, 94, 70, 82, 6, 71, 42, 3, 7, 67, 35, 66, 73, 64, 5, 0, 99, 69, 2, 68, 65, 1, 4], [46, 113, 116, 56, 122, 48, 112, 126, 110, 51, 45, 114, 107, 119, 120, 43, 50, 109, 127, 125, 93, 54, 37, 49, 121, 86, 22, 58, 101, 52, 102, 124, 118, 123, 63, 39, 57, 115, 53, 60, 29, 117, 103, 55, 90, 62, 97, 61, 34, 47, 33, 59, 26, 92, 83, 111, 17, 81, 91, 87, 44, 23, 38, 19, 98, 14, 32, 75, 85, 78, 88, 80, 24, 104, 21, 30, 89, 10, 100, 36, 16, 25, 79, 72, 74, 12, 20, 76, 95, 84, 28, 41, 15, 31, 40, 13, 108, 11, 105, 77, 35, 96, 71, 27, 8, 18, 94, 73, 6, 64, 82, 106, 66, 7, 70, 67, 0, 42, 9, 69, 99, 5, 3, 2, 68, 65, 1, 4], [46, 113, 116, 56, 122, 126, 48, 112, 110, 51, 45, 114, 120, 119, 107, 50, 43, 109, 93, 127, 125, 37, 22, 121, 54, 86, 101, 49, 52, 102, 58, 124, 39, 118, 123, 53, 29, 60, 103, 57, 90, 55, 61, 97, 34, 115, 63, 62, 117, 33, 92, 47, 26, 81, 83, 38, 59, 85, 111, 24, 19, 91, 98, 23, 87, 17, 89, 88, 14, 30, 78, 32, 21, 12, 84, 75, 44, 20, 95, 36, 16, 100, 80, 10, 15, 79, 25, 41, 76, 104, 27, 74, 77, 105, 13, 31, 72, 94, 40, 28, 18, 106, 108, 11, 96, 42, 6, 8, 82, 70, 9, 7, 35, 73, 71, 3, 64, 5, 99, 0, 69, 2, 66, 65, 67, 4, 68, 1]], "model.layers.21.self_attn.q_proj": [[103, 49, 112, 48, 97, 82, 15, 84, 29, 86, 12, 113, 99, 62, 8, 26, 54, 78, 13, 57, 4, 70, 96, 52, 90, 33, 17, 72, 23, 37, 80, 56, 89, 79, 77, 88, 73, 120, 98, 93, 24, 21, 20, 92, 18, 9, 27, 22, 59, 87, 83, 16, 5, 76, 94, 39, 19, 107, 85, 123, 14, 1, 124, 91, 46, 50, 28, 67, 31, 81, 108, 66, 25, 10, 6, 40, 51, 74, 95, 30, 122, 106, 102, 2, 116, 127, 44, 68, 115, 38, 60, 7, 125, 100, 75, 119, 109, 55, 63, 58, 34, 101, 53, 118, 61, 36, 121, 69, 32, 64, 117, 114, 47, 104, 35, 11, 126, 41, 110, 43, 45, 42, 71, 65, 111, 105, 3, 0], [49, 103, 112, 48, 97, 29, 113, 62, 56, 84, 87, 35, 38, 26, 123, 52, 82, 63, 95, 115, 120, 59, 57, 51, 124, 61, 116, 104, 86, 89, 53, 106, 119, 78, 45, 12, 122, 125, 60, 118, 126, 47, 40, 54, 117, 127, 50, 46, 88, 110, 108, 111, 55, 96, 30, 58, 114, 43, 37, 105, 34, 107, 44, 121, 102, 42, 15, 11, 19, 41, 109, 8, 100, 16, 99, 36, 92, 101, 31, 94, 32, 98, 39, 67, 23, 14, 25, 33, 85, 91, 3, 28, 17, 73, 70, 10, 4, 13, 18, 7, 22, 27, 90, 93, 24, 21, 68, 80, 71, 66, 5, 20, 83, 75, 76, 77, 9, 79, 69, 2, 65, 72, 81, 64, 6, 1, 74, 0], [112, 62, 49, 103, 48, 56, 52, 123, 122, 59, 120, 113, 50, 97, 115, 121, 51, 57, 54, 109, 46, 119, 63, 125, 35, 114, 118, 60, 105, 104, 117, 58, 108, 124, 29, 61, 127, 126, 42, 41, 110, 116, 45, 47, 88, 55, 53, 99, 87, 106, 43, 111, 107, 74, 3, 71, 37, 44, 82, 40, 38, 2, 102, 68, 6, 100, 95, 0, 84, 86, 101, 36, 30, 85, 34, 5, 26, 31, 98, 32, 14, 33, 27, 72, 94, 92, 96, 65, 89, 81, 9, 21, 90, 76, 18, 39, 91, 7, 28, 1, 19, 23, 83, 22, 11, 12, 25, 75, 93, 24, 67, 15, 17, 80, 16, 4, 73, 70, 77, 13, 20, 79, 10, 69, 78, 64, 8, 66], [112, 103, 49, 48, 97, 113, 29, 0, 65, 50, 26, 11, 95, 84, 78, 17, 54, 82, 31, 86, 62, 5, 56, 57, 123, 2, 66, 122, 88, 114, 70, 52, 42, 46, 118, 120, 59, 121, 4, 64, 8, 104, 115, 67, 125, 63, 116, 60, 108, 1, 106, 44, 35, 10, 127, 38, 119, 15, 75, 51, 12, 109, 13, 58, 117, 61, 93, 7, 3, 25, 124, 69, 126, 90, 110, 111, 45, 53, 47, 81, 99, 43, 34, 55, 96, 9, 94, 107, 37, 41, 6, 105, 100, 83, 72, 98, 74, 71, 102, 77, 85, 73, 22, 33, 40, 24, 92, 19, 87, 101, 32, 30, 89, 27, 68, 91, 14, 28, 23, 16, 20, 21, 36, 80, 39, 18, 76, 79], [102, 120, 118, 54, 53, 84, 56, 11, 15, 9, 18, 1, 67, 76, 6, 5, 87, 31, 71, 64, 66, 0, 93, 68, 77, 78, 89, 72, 13, 80, 25, 65, 86, 74, 46, 21, 109, 63, 48, 92, 69, 114, 52, 22, 32, 8, 127, 40, 126, 75, 16, 7, 10, 117, 107, 70, 115, 82, 4, 73, 57, 12, 116, 121, 113, 3, 94, 60, 38, 14, 24, 23, 79, 41, 45, 44, 110, 20, 91, 90, 98, 2, 85, 125, 103, 36, 124, 95, 17, 59, 30, 100, 47, 119, 50, 101, 62, 105, 29, 43, 34, 19, 42, 61, 99, 35, 33, 112, 108, 123, 97, 27, 51, 28, 39, 49, 96, 122, 106, 26, 37, 81, 104, 55, 83, 111, 88, 58], [102, 54, 118, 120, 84, 15, 18, 56, 72, 76, 53, 11, 25, 6, 93, 9, 78, 5, 80, 71, 31, 67, 89, 66, 74, 16, 22, 64, 1, 69, 3, 77, 46, 87, 48, 29, 92, 24, 85, 8, 117, 10, 13, 90, 65, 124, 40, 110, 75, 20, 107, 23, 109, 57, 12, 70, 83, 115, 125, 114, 79, 41, 103, 0, 42, 113, 99, 82, 63, 45, 86, 36, 14, 127, 73, 68, 32, 19, 44, 37, 39, 47, 116, 26, 61, 55, 88, 121, 51, 105, 7, 30, 81, 96, 112, 59, 123, 33, 95, 2, 38, 60, 49, 119, 4, 52, 126, 91, 17, 100, 34, 122, 97, 35, 50, 27, 104, 21, 111, 62, 98, 101, 43, 108, 28, 94, 106, 58], [102, 120, 54, 118, 56, 80, 18, 84, 76, 25, 9, 78, 93, 11, 89, 87, 31, 53, 15, 6, 71, 72, 5, 67, 77, 66, 13, 1, 64, 0, 22, 86, 74, 113, 46, 68, 65, 116, 115, 90, 32, 63, 26, 4, 81, 16, 7, 107, 8, 52, 82, 69, 40, 17, 125, 117, 73, 48, 92, 99, 36, 23, 70, 75, 12, 98, 43, 88, 51, 14, 126, 124, 19, 29, 55, 79, 37, 39, 20, 61, 114, 85, 3, 10, 122, 112, 49, 57, 109, 38, 127, 59, 30, 2, 27, 119, 24, 21, 33, 121, 35, 104, 83, 60, 100, 103, 105, 45, 34, 91, 42, 96, 94, 110, 28, 108, 62, 106, 41, 111, 101, 123, 50, 58, 47, 95, 97, 44], [102, 120, 54, 118, 56, 78, 53, 84, 72, 31, 18, 5, 15, 1, 93, 76, 71, 67, 66, 11, 9, 89, 25, 87, 13, 64, 65, 0, 117, 6, 77, 46, 68, 80, 73, 70, 107, 2, 26, 75, 22, 88, 8, 3, 39, 113, 32, 48, 69, 12, 4, 63, 82, 85, 60, 19, 109, 127, 74, 16, 116, 90, 125, 27, 28, 52, 79, 14, 110, 86, 29, 41, 126, 98, 114, 81, 124, 7, 34, 55, 92, 62, 105, 91, 96, 44, 101, 51, 115, 57, 40, 17, 42, 20, 104, 111, 36, 61, 119, 24, 94, 83, 43, 45, 50, 58, 121, 33, 38, 122, 59, 108, 100, 23, 10, 99, 106, 112, 103, 97, 95, 123, 30, 47, 37, 21, 35, 49], [59, 106, 60, 36, 99, 62, 28, 119, 42, 87, 123, 55, 61, 85, 121, 18, 53, 127, 31, 114, 32, 98, 89, 122, 124, 86, 47, 56, 92, 54, 117, 104, 112, 111, 126, 116, 125, 58, 115, 13, 118, 30, 41, 57, 113, 25, 90, 107, 100, 49, 110, 97, 109, 120, 46, 43, 50, 39, 38, 51, 52, 105, 45, 44, 101, 103, 48, 91, 63, 108, 35, 93, 102, 21, 80, 94, 29, 37, 34, 33, 40, 14, 75, 19, 79, 20, 96, 27, 4, 26, 15, 65, 22, 74, 23, 95, 82, 7, 17, 71, 83, 24, 6, 88, 77, 76, 73, 9, 68, 12, 8, 1, 84, 64, 81, 2, 69, 67, 3, 66, 16, 0, 5, 11, 10, 78, 70, 72], [106, 99, 59, 80, 28, 13, 20, 74, 8, 14, 86, 6, 60, 42, 75, 4, 18, 65, 58, 55, 30, 32, 85, 89, 64, 2, 67, 114, 31, 111, 27, 24, 91, 10, 119, 25, 62, 92, 21, 95, 22, 90, 19, 35, 107, 84, 81, 72, 82, 36, 96, 123, 16, 7, 70, 102, 112, 97, 78, 125, 17, 101, 57, 77, 79, 11, 88, 29, 87, 56, 12, 23, 69, 9, 5, 68, 73, 109, 46, 53, 113, 34, 71, 93, 0, 83, 61, 15, 127, 122, 94, 51, 45, 54, 37, 26, 66, 41, 116, 76, 117, 3, 1, 108, 52, 121, 104, 98, 115, 33, 43, 100, 120, 39, 110, 118, 105, 38, 103, 40, 124, 49, 47, 48, 50, 63, 44, 126], [106, 99, 59, 20, 14, 80, 86, 75, 74, 32, 6, 67, 8, 28, 64, 92, 60, 42, 18, 111, 25, 58, 55, 4, 2, 3, 91, 114, 69, 29, 87, 97, 65, 123, 102, 66, 127, 119, 57, 30, 1, 62, 107, 90, 36, 95, 31, 7, 88, 113, 112, 35, 12, 22, 70, 10, 122, 96, 21, 68, 89, 24, 73, 82, 11, 109, 17, 56, 125, 13, 45, 16, 72, 100, 61, 77, 78, 53, 79, 27, 15, 5, 116, 81, 19, 9, 71, 34, 84, 104, 0, 26, 108, 51, 23, 101, 85, 94, 54, 76, 98, 83, 121, 43, 48, 37, 52, 39, 115, 93, 41, 33, 46, 105, 103, 110, 38, 117, 120, 47, 118, 40, 50, 44, 49, 124, 126, 63], [106, 60, 36, 62, 42, 119, 58, 123, 55, 112, 61, 114, 121, 59, 87, 124, 111, 53, 125, 41, 115, 47, 110, 118, 63, 103, 122, 101, 127, 113, 85, 99, 18, 126, 54, 104, 56, 50, 120, 49, 107, 25, 51, 46, 108, 52, 57, 48, 28, 116, 44, 109, 43, 45, 100, 117, 105, 40, 32, 92, 89, 39, 35, 38, 102, 90, 27, 31, 37, 15, 34, 98, 33, 91, 88, 19, 30, 97, 86, 26, 93, 96, 95, 21, 94, 29, 13, 83, 22, 24, 17, 23, 76, 79, 82, 20, 81, 77, 71, 14, 80, 73, 12, 7, 9, 84, 65, 75, 4, 74, 68, 78, 1, 16, 6, 69, 11, 67, 2, 5, 8, 64, 10, 70, 3, 0, 66, 72], [102, 57, 58, 59, 114, 90, 87, 127, 26, 96, 60, 29, 38, 93, 85, 74, 82, 15, 61, 18, 110, 21, 46, 35, 48, 71, 111, 121, 62, 12, 44, 51, 84, 113, 32, 22, 49, 117, 79, 16, 50, 112, 13, 41, 119, 109, 86, 63, 43, 124, 72, 100, 4, 23, 123, 53, 52, 105, 115, 56, 118, 122, 107, 40, 106, 54, 120, 116, 98, 45, 126, 125, 55, 2, 39, 108, 33, 104, 101, 17, 94, 27, 37, 30, 103, 47, 31, 95, 20, 42, 34, 5, 91, 25, 88, 8, 28, 97, 0, 92, 89, 36, 99, 11, 83, 81, 3, 80, 19, 24, 78, 68, 1, 75, 9, 77, 14, 70, 66, 73, 65, 76, 7, 6, 64, 10, 69, 67], [102, 58, 57, 59, 114, 90, 87, 127, 121, 96, 26, 29, 38, 15, 93, 85, 49, 74, 41, 110, 82, 32, 60, 84, 63, 43, 109, 113, 50, 116, 117, 79, 62, 21, 124, 35, 52, 54, 105, 22, 48, 53, 46, 56, 13, 55, 18, 20, 125, 44, 71, 51, 12, 86, 126, 123, 61, 119, 111, 112, 108, 118, 122, 115, 120, 103, 45, 95, 23, 104, 106, 36, 47, 101, 42, 39, 31, 16, 100, 107, 33, 40, 89, 80, 28, 98, 37, 78, 2, 97, 88, 94, 34, 30, 4, 91, 99, 27, 25, 0, 72, 92, 77, 24, 83, 11, 19, 17, 73, 5, 81, 14, 68, 3, 75, 64, 70, 1, 8, 7, 66, 6, 9, 76, 10, 65, 67, 69], [102, 59, 58, 114, 57, 90, 87, 26, 15, 29, 85, 96, 74, 93, 121, 71, 18, 82, 21, 38, 109, 13, 79, 47, 44, 84, 48, 2, 116, 117, 12, 115, 49, 32, 86, 60, 54, 0, 4, 119, 55, 127, 105, 124, 35, 45, 112, 5, 43, 46, 113, 110, 17, 118, 63, 53, 52, 123, 41, 111, 37, 72, 122, 65, 61, 23, 120, 51, 50, 66, 25, 108, 1, 126, 62, 81, 56, 106, 89, 3, 22, 20, 104, 125, 16, 100, 24, 101, 36, 107, 83, 98, 8, 91, 28, 40, 94, 67, 19, 103, 31, 39, 42, 7, 68, 77, 10, 99, 95, 64, 27, 33, 75, 6, 88, 92, 73, 78, 30, 9, 34, 70, 80, 97, 11, 76, 69, 14], [102, 59, 58, 57, 127, 114, 90, 60, 26, 87, 96, 15, 85, 71, 38, 29, 74, 121, 18, 93, 105, 35, 12, 84, 49, 108, 82, 79, 41, 47, 4, 120, 110, 32, 22, 54, 21, 52, 119, 43, 118, 46, 122, 117, 124, 62, 20, 123, 116, 109, 16, 111, 104, 44, 55, 99, 112, 113, 13, 115, 61, 86, 63, 89, 40, 5, 2, 48, 107, 50, 45, 30, 103, 23, 98, 51, 0, 39, 106, 94, 125, 36, 126, 37, 33, 56, 34, 53, 42, 101, 100, 24, 27, 97, 8, 83, 17, 66, 28, 73, 31, 25, 95, 68, 88, 92, 19, 91, 1, 78, 3, 77, 72, 81, 7, 65, 64, 11, 70, 80, 9, 76, 14, 75, 10, 69, 6, 67], [126, 104, 99, 87, 28, 32, 83, 92, 85, 81, 127, 110, 79, 47, 42, 76, 115, 111, 119, 72, 11, 15, 30, 74, 51, 113, 31, 108, 21, 95, 116, 78, 37, 50, 24, 57, 122, 45, 6, 48, 25, 12, 90, 96, 5, 121, 56, 68, 26, 53, 23, 107, 55, 106, 120, 62, 59, 101, 70, 123, 89, 17, 49, 43, 46, 109, 105, 41, 97, 13, 19, 2, 33, 112, 93, 39, 52, 124, 82, 40, 65, 114, 117, 34, 0, 125, 102, 61, 29, 118, 3, 44, 80, 60, 20, 84, 58, 54, 100, 88, 27, 22, 94, 36, 98, 103, 64, 18, 86, 91, 75, 63, 38, 66, 16, 10, 67, 14, 9, 77, 1, 73, 4, 71, 35, 7, 69, 8], [126, 104, 99, 87, 28, 83, 85, 32, 127, 92, 47, 115, 81, 79, 119, 51, 106, 76, 50, 118, 72, 48, 74, 111, 113, 42, 108, 15, 116, 45, 120, 31, 122, 121, 6, 58, 5, 61, 110, 57, 11, 41, 12, 70, 68, 37, 53, 91, 54, 21, 75, 43, 9, 97, 89, 49, 23, 38, 46, 30, 105, 13, 55, 59, 56, 40, 35, 65, 0, 62, 123, 17, 117, 109, 107, 34, 78, 33, 114, 60, 95, 103, 98, 112, 25, 67, 52, 39, 3, 93, 101, 36, 82, 19, 63, 102, 125, 18, 44, 26, 90, 29, 94, 2, 24, 66, 124, 20, 96, 86, 100, 84, 27, 73, 88, 80, 1, 69, 14, 22, 10, 16, 7, 71, 77, 4, 8, 64], [126, 104, 99, 87, 28, 32, 85, 81, 92, 83, 115, 47, 127, 79, 110, 42, 76, 45, 121, 116, 58, 95, 108, 68, 31, 15, 78, 72, 41, 113, 60, 86, 106, 5, 54, 122, 117, 11, 119, 96, 50, 9, 24, 26, 6, 7, 52, 91, 22, 0, 101, 21, 57, 8, 12, 90, 17, 111, 66, 13, 44, 18, 23, 93, 94, 30, 55, 105, 114, 70, 118, 37, 49, 51, 25, 123, 48, 97, 88, 46, 74, 63, 38, 56, 39, 43, 59, 27, 120, 82, 34, 53, 62, 89, 102, 33, 103, 19, 80, 36, 61, 98, 124, 125, 14, 112, 109, 29, 100, 16, 107, 73, 84, 2, 20, 10, 35, 4, 77, 40, 71, 3, 64, 67, 1, 75, 65, 69], [104, 126, 99, 28, 87, 85, 32, 83, 81, 79, 92, 127, 116, 89, 111, 76, 47, 48, 25, 50, 72, 122, 15, 74, 54, 6, 11, 106, 51, 12, 119, 2, 52, 68, 105, 43, 58, 57, 42, 118, 3, 45, 108, 1, 13, 14, 31, 70, 23, 94, 5, 0, 115, 17, 29, 110, 41, 107, 113, 64, 93, 21, 46, 63, 120, 30, 123, 26, 61, 82, 39, 8, 125, 62, 102, 59, 112, 75, 90, 124, 53, 37, 60, 9, 19, 86, 65, 101, 84, 78, 73, 117, 69, 44, 40, 109, 33, 114, 77, 18, 24, 121, 34, 56, 38, 103, 66, 10, 95, 36, 100, 49, 97, 55, 98, 27, 7, 22, 67, 4, 35, 91, 88, 96, 20, 16, 80, 71], [62, 126, 48, 39, 116, 53, 30, 27, 122, 88, 20, 35, 117, 76, 121, 15, 91, 106, 112, 83, 55, 57, 79, 81, 94, 58, 86, 32, 123, 124, 52, 46, 49, 72, 19, 84, 33, 60, 98, 63, 89, 115, 100, 127, 50, 29, 24, 34, 56, 107, 114, 5, 41, 105, 108, 109, 51, 61, 44, 113, 118, 110, 26, 125, 47, 42, 23, 119, 87, 103, 18, 28, 43, 120, 36, 111, 17, 54, 22, 37, 59, 45, 31, 75, 95, 104, 93, 10, 97, 13, 102, 12, 85, 99, 40, 25, 38, 21, 74, 101, 90, 92, 78, 2, 14, 8, 82, 80, 96, 9, 16, 77, 3, 1, 11, 71, 73, 69, 6, 7, 70, 67, 66, 68, 4, 0, 65, 64], [116, 126, 39, 62, 48, 55, 57, 117, 123, 46, 53, 125, 111, 49, 121, 30, 50, 60, 41, 20, 122, 52, 127, 51, 120, 63, 114, 124, 89, 56, 115, 35, 59, 58, 113, 118, 43, 109, 32, 86, 54, 112, 47, 61, 88, 42, 106, 119, 108, 44, 45, 110, 107, 37, 34, 91, 40, 103, 105, 27, 104, 81, 23, 92, 102, 80, 38, 98, 29, 94, 101, 33, 28, 99, 36, 22, 87, 93, 9, 79, 100, 90, 73, 95, 96, 13, 97, 25, 31, 76, 26, 14, 82, 85, 84, 68, 83, 17, 1, 77, 21, 15, 75, 7, 78, 18, 2, 24, 16, 3, 71, 70, 11, 72, 4, 67, 0, 74, 12, 10, 5, 66, 65, 6, 19, 64, 8, 69], [126, 62, 116, 39, 48, 117, 55, 57, 53, 111, 123, 112, 49, 122, 114, 58, 120, 118, 124, 50, 20, 35, 127, 121, 42, 51, 30, 109, 88, 46, 56, 32, 52, 60, 28, 41, 115, 119, 43, 113, 125, 59, 47, 63, 106, 108, 103, 94, 44, 61, 110, 54, 101, 45, 89, 104, 107, 40, 86, 34, 38, 102, 100, 105, 81, 99, 37, 95, 87, 29, 93, 23, 92, 14, 36, 98, 80, 33, 97, 79, 21, 26, 31, 90, 27, 85, 91, 18, 22, 96, 25, 13, 83, 15, 77, 17, 84, 24, 76, 9, 82, 12, 69, 73, 75, 70, 78, 2, 19, 11, 16, 72, 10, 64, 67, 66, 3, 4, 0, 6, 71, 65, 68, 74, 1, 8, 5, 7], [126, 116, 39, 48, 55, 57, 27, 123, 30, 117, 83, 86, 122, 52, 37, 20, 53, 26, 62, 118, 110, 89, 112, 91, 72, 17, 19, 35, 32, 121, 16, 41, 94, 10, 79, 4, 74, 76, 88, 15, 125, 124, 114, 60, 25, 120, 115, 49, 29, 107, 56, 63, 22, 13, 106, 97, 50, 58, 51, 93, 42, 46, 34, 113, 127, 80, 111, 109, 81, 61, 108, 28, 5, 105, 66, 95, 100, 59, 31, 47, 75, 119, 38, 24, 23, 87, 33, 92, 44, 98, 9, 54, 21, 78, 45, 14, 36, 18, 77, 102, 84, 67, 7, 104, 43, 101, 68, 103, 8, 73, 6, 12, 90, 40, 11, 99, 82, 69, 85, 0, 71, 65, 64, 96, 70, 2, 3, 1], [55, 103, 62, 52, 117, 98, 25, 57, 89, 92, 115, 28, 119, 54, 20, 83, 22, 78, 120, 126, 48, 121, 73, 19, 58, 50, 76, 27, 51, 82, 45, 125, 60, 111, 49, 32, 109, 12, 123, 112, 107, 94, 81, 124, 53, 40, 33, 113, 110, 38, 61, 118, 114, 43, 42, 116, 86, 17, 100, 84, 59, 14, 63, 47, 46, 108, 127, 97, 79, 95, 101, 18, 56, 96, 122, 106, 88, 44, 5, 104, 105, 41, 6, 37, 77, 99, 36, 15, 35, 8, 75, 39, 70, 93, 80, 30, 102, 91, 26, 72, 31, 16, 90, 3, 23, 85, 29, 66, 24, 64, 71, 34, 68, 69, 21, 1, 11, 9, 87, 0, 74, 67, 13, 65, 7, 10, 2, 4], [55, 103, 98, 20, 25, 89, 120, 92, 112, 51, 117, 28, 126, 78, 54, 84, 48, 62, 57, 73, 58, 52, 50, 121, 32, 118, 125, 99, 95, 127, 22, 108, 56, 119, 122, 59, 124, 110, 116, 83, 104, 109, 47, 61, 15, 111, 106, 113, 46, 53, 115, 49, 101, 82, 43, 79, 63, 114, 100, 107, 40, 44, 37, 67, 123, 60, 24, 97, 45, 35, 102, 27, 38, 42, 31, 11, 76, 74, 88, 105, 87, 36, 68, 91, 81, 23, 71, 94, 29, 19, 18, 26, 93, 33, 85, 30, 41, 90, 21, 39, 17, 9, 96, 86, 12, 10, 77, 3, 14, 80, 69, 34, 1, 8, 16, 2, 70, 13, 65, 6, 75, 72, 4, 7, 5, 66, 64, 0], [55, 103, 57, 62, 25, 98, 89, 83, 92, 78, 117, 20, 52, 54, 28, 120, 112, 22, 126, 125, 115, 48, 32, 76, 84, 61, 73, 51, 118, 50, 58, 19, 119, 41, 116, 60, 53, 64, 8, 108, 5, 56, 113, 105, 109, 27, 59, 49, 46, 127, 121, 95, 111, 114, 47, 12, 82, 124, 63, 123, 110, 66, 43, 65, 106, 44, 107, 30, 17, 122, 6, 38, 104, 3, 39, 79, 99, 45, 16, 72, 77, 81, 102, 86, 40, 13, 37, 94, 100, 36, 91, 14, 42, 75, 93, 2, 35, 23, 69, 101, 29, 97, 33, 90, 85, 34, 26, 70, 68, 31, 88, 24, 74, 10, 80, 96, 1, 15, 87, 18, 0, 21, 9, 67, 11, 7, 4, 71], [55, 103, 62, 52, 98, 20, 82, 92, 89, 22, 25, 117, 63, 51, 108, 112, 84, 57, 58, 54, 73, 56, 125, 50, 28, 44, 71, 119, 124, 48, 32, 79, 11, 15, 109, 87, 120, 74, 46, 53, 49, 43, 18, 97, 42, 126, 121, 110, 116, 77, 59, 113, 101, 45, 122, 61, 115, 21, 39, 107, 114, 106, 37, 30, 118, 26, 88, 123, 75, 111, 2, 104, 100, 31, 41, 127, 68, 14, 38, 60, 93, 99, 81, 102, 33, 94, 29, 17, 40, 105, 95, 85, 1, 47, 83, 23, 35, 7, 10, 27, 12, 8, 24, 90, 91, 0, 80, 19, 86, 4, 70, 96, 13, 36, 78, 16, 67, 69, 34, 6, 3, 76, 9, 5, 66, 72, 65, 64], [121, 127, 56, 55, 39, 118, 63, 98, 31, 51, 122, 89, 25, 57, 124, 120, 119, 84, 112, 125, 18, 105, 92, 53, 62, 59, 58, 52, 115, 15, 50, 49, 110, 28, 23, 47, 76, 54, 108, 60, 117, 116, 61, 114, 111, 113, 106, 126, 123, 10, 46, 30, 16, 107, 40, 24, 104, 48, 32, 96, 102, 43, 101, 6, 44, 90, 45, 41, 80, 87, 109, 99, 88, 42, 94, 79, 22, 36, 100, 14, 83, 95, 38, 37, 85, 69, 82, 93, 19, 75, 20, 86, 27, 77, 9, 81, 35, 97, 8, 34, 74, 71, 64, 13, 78, 2, 26, 103, 33, 12, 67, 72, 29, 65, 21, 11, 5, 91, 70, 68, 7, 17, 3, 1, 4, 0, 73, 66], [55, 127, 56, 39, 121, 118, 63, 122, 51, 89, 31, 84, 57, 120, 125, 18, 124, 119, 105, 98, 116, 62, 112, 117, 52, 53, 25, 115, 59, 50, 92, 95, 28, 76, 60, 15, 61, 49, 106, 47, 96, 111, 58, 54, 48, 123, 114, 126, 40, 113, 23, 108, 90, 85, 110, 46, 45, 107, 43, 44, 30, 109, 36, 102, 16, 101, 41, 38, 42, 79, 104, 24, 10, 103, 99, 20, 100, 22, 37, 35, 14, 87, 32, 33, 34, 97, 93, 88, 81, 80, 83, 9, 86, 82, 77, 27, 94, 29, 91, 78, 19, 75, 8, 74, 13, 21, 72, 71, 4, 26, 12, 17, 70, 73, 2, 6, 11, 5, 68, 65, 0, 69, 1, 64, 7, 3, 67, 66], [127, 39, 56, 63, 118, 55, 31, 98, 89, 51, 122, 57, 84, 92, 53, 25, 120, 125, 62, 105, 47, 115, 54, 96, 30, 126, 119, 52, 124, 18, 112, 114, 87, 59, 110, 116, 40, 44, 50, 58, 49, 90, 48, 113, 103, 85, 117, 60, 107, 23, 45, 16, 123, 111, 61, 121, 95, 106, 43, 76, 108, 102, 21, 35, 46, 104, 42, 79, 109, 26, 22, 34, 99, 41, 101, 83, 93, 15, 36, 32, 28, 38, 91, 37, 100, 24, 20, 27, 33, 19, 82, 97, 81, 94, 10, 80, 86, 74, 88, 29, 14, 77, 72, 17, 75, 8, 12, 13, 78, 9, 5, 69, 71, 6, 4, 11, 2, 70, 66, 68, 64, 73, 65, 67, 3, 7, 1, 0], [56, 39, 127, 55, 121, 118, 51, 31, 63, 122, 57, 84, 92, 25, 105, 120, 124, 62, 59, 115, 119, 125, 53, 50, 112, 52, 98, 47, 58, 116, 60, 49, 95, 89, 61, 113, 126, 110, 123, 114, 117, 54, 18, 44, 111, 41, 79, 48, 43, 108, 45, 90, 23, 76, 109, 101, 106, 104, 107, 42, 46, 96, 40, 102, 88, 100, 38, 10, 28, 15, 37, 36, 99, 35, 22, 29, 94, 97, 32, 103, 30, 82, 33, 83, 16, 86, 93, 77, 34, 8, 21, 27, 75, 87, 20, 85, 24, 80, 9, 72, 5, 91, 26, 81, 12, 13, 74, 17, 19, 71, 78, 14, 2, 4, 70, 73, 6, 69, 64, 3, 11, 67, 66, 0, 68, 7, 1, 65]], "model.layers.21.self_attn.k_proj": [[112, 49, 39, 86, 93, 57, 84, 15, 33, 26, 54, 123, 35, 50, 82, 122, 12, 8, 121, 52, 59, 64, 119, 60, 113, 116, 127, 118, 13, 125, 109, 58, 51, 73, 126, 117, 70, 111, 114, 53, 120, 55, 108, 29, 63, 48, 115, 78, 66, 10, 110, 46, 32, 47, 56, 43, 61, 44, 45, 40, 107, 88, 124, 62, 41, 68, 87, 42, 101, 106, 105, 16, 102, 38, 36, 65, 17, 95, 104, 100, 34, 94, 98, 89, 67, 27, 5, 92, 24, 4, 91, 37, 9, 30, 6, 7, 83, 96, 19, 80, 85, 28, 81, 69, 31, 21, 99, 22, 3, 90, 75, 71, 25, 2, 97, 11, 79, 74, 23, 18, 1, 77, 20, 14, 76, 72, 0, 103], [120, 54, 64, 118, 9, 84, 78, 76, 15, 5, 18, 53, 71, 11, 72, 65, 56, 38, 6, 67, 25, 2, 3, 0, 66, 13, 87, 29, 80, 4, 95, 69, 1, 89, 93, 22, 85, 102, 92, 31, 10, 70, 7, 116, 68, 43, 86, 73, 40, 24, 88, 17, 90, 19, 8, 50, 110, 32, 39, 114, 26, 112, 55, 63, 57, 115, 46, 27, 83, 21, 124, 49, 48, 99, 36, 52, 45, 113, 51, 16, 111, 33, 74, 109, 91, 47, 28, 103, 125, 96, 81, 30, 59, 41, 105, 23, 98, 60, 108, 97, 121, 119, 100, 117, 77, 75, 34, 62, 94, 12, 127, 58, 61, 82, 35, 126, 122, 123, 44, 42, 104, 14, 37, 107, 20, 101, 106, 79], [42, 59, 35, 86, 55, 64, 8, 119, 58, 47, 80, 6, 14, 20, 92, 74, 114, 75, 25, 2, 62, 60, 18, 53, 127, 106, 13, 116, 112, 123, 57, 61, 56, 122, 4, 50, 125, 118, 95, 54, 87, 65, 67, 124, 96, 117, 51, 111, 126, 109, 48, 99, 110, 121, 49, 113, 103, 52, 69, 45, 120, 108, 30, 115, 85, 46, 36, 38, 90, 44, 76, 63, 40, 28, 32, 26, 91, 5, 100, 43, 83, 107, 94, 0, 98, 37, 3, 41, 101, 33, 105, 34, 15, 71, 97, 17, 102, 93, 79, 1, 104, 39, 27, 68, 73, 9, 88, 19, 31, 23, 29, 66, 21, 7, 24, 89, 72, 81, 12, 84, 16, 22, 77, 11, 78, 82, 10, 70], [38, 57, 59, 58, 93, 32, 26, 85, 87, 84, 15, 18, 74, 71, 0, 13, 123, 45, 52, 12, 111, 121, 22, 126, 63, 116, 110, 1, 55, 117, 42, 127, 16, 107, 125, 108, 62, 51, 114, 54, 61, 109, 56, 53, 60, 124, 99, 122, 25, 106, 3, 112, 46, 115, 48, 118, 41, 119, 49, 5, 43, 50, 89, 44, 105, 102, 4, 120, 2, 24, 104, 75, 47, 113, 68, 35, 101, 95, 14, 103, 39, 17, 40, 66, 19, 69, 100, 83, 78, 36, 70, 27, 81, 9, 34, 98, 88, 37, 33, 92, 30, 28, 97, 72, 82, 31, 94, 91, 11, 73, 8, 80, 7, 20, 29, 86, 6, 77, 65, 23, 21, 67, 76, 90, 64, 79, 96, 10], [40, 126, 83, 81, 92, 35, 115, 85, 87, 15, 74, 111, 96, 89, 72, 76, 46, 42, 79, 52, 122, 12, 68, 58, 0, 28, 70, 106, 127, 114, 48, 6, 66, 5, 14, 54, 44, 57, 116, 11, 119, 31, 2, 56, 41, 1, 3, 43, 123, 110, 65, 125, 32, 118, 60, 75, 107, 9, 109, 29, 50, 82, 112, 120, 78, 49, 101, 93, 7, 13, 25, 121, 84, 59, 77, 88, 90, 86, 24, 34, 64, 8, 67, 108, 94, 30, 53, 20, 61, 47, 27, 39, 55, 117, 113, 69, 100, 26, 36, 22, 33, 105, 97, 103, 99, 102, 91, 51, 124, 95, 73, 98, 38, 23, 37, 16, 62, 45, 4, 63, 80, 10, 18, 71, 21, 19, 17, 104], [126, 103, 116, 86, 99, 55, 96, 52, 94, 62, 48, 26, 53, 112, 57, 109, 42, 91, 88, 63, 28, 58, 113, 56, 123, 20, 29, 110, 121, 102, 117, 50, 114, 47, 92, 124, 83, 35, 13, 115, 60, 81, 119, 127, 120, 59, 105, 125, 51, 37, 97, 43, 98, 49, 122, 61, 54, 111, 107, 95, 89, 46, 30, 33, 44, 79, 45, 118, 106, 10, 70, 40, 101, 108, 15, 36, 31, 104, 34, 7, 76, 27, 41, 85, 65, 38, 100, 72, 73, 87, 23, 66, 21, 14, 93, 0, 18, 5, 3, 90, 32, 17, 80, 25, 82, 6, 39, 71, 68, 16, 4, 8, 74, 24, 22, 84, 75, 77, 78, 12, 19, 9, 2, 11, 67, 69, 1, 64], [55, 39, 34, 22, 28, 89, 62, 125, 78, 117, 20, 53, 58, 50, 51, 54, 83, 109, 96, 112, 61, 59, 44, 113, 49, 114, 48, 121, 118, 119, 111, 60, 116, 63, 124, 126, 46, 127, 8, 47, 52, 64, 5, 56, 123, 82, 73, 43, 120, 91, 42, 122, 115, 12, 106, 1, 108, 110, 45, 30, 40, 18, 104, 79, 105, 107, 95, 57, 37, 93, 100, 81, 98, 23, 76, 41, 70, 77, 36, 3, 38, 66, 102, 27, 26, 90, 101, 35, 33, 31, 2, 32, 4, 11, 85, 99, 92, 10, 75, 24, 88, 97, 0, 80, 6, 65, 15, 17, 29, 21, 71, 94, 69, 103, 16, 25, 74, 87, 84, 13, 19, 7, 67, 68, 14, 9, 72, 86], [103, 127, 56, 34, 22, 55, 121, 95, 28, 118, 63, 122, 89, 116, 51, 125, 115, 62, 119, 57, 59, 58, 60, 53, 43, 52, 38, 124, 49, 48, 47, 117, 114, 61, 84, 92, 120, 123, 46, 106, 111, 44, 112, 113, 54, 126, 107, 50, 108, 71, 45, 23, 105, 29, 80, 109, 42, 75, 110, 40, 104, 35, 16, 41, 78, 101, 99, 37, 27, 98, 96, 102, 100, 18, 36, 10, 12, 64, 25, 33, 81, 86, 97, 30, 94, 87, 68, 79, 91, 39, 93, 88, 13, 90, 32, 66, 85, 83, 82, 17, 24, 26, 69, 21, 31, 1, 9, 20, 6, 8, 73, 14, 77, 19, 4, 5, 7, 76, 11, 65, 15, 74, 67, 2, 3, 72, 0, 70]], "model.layers.21.self_attn.qk_proj": [[126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 116, 42, 102, 118, 62, 53, 106, 121, 39, 28, 48, 103, 84, 20, 79, 119, 114, 25, 38, 15, 89, 29, 113, 86, 82, 122, 92, 40, 60, 22, 52, 23, 87, 99, 90, 35, 115, 18, 32, 12, 14, 111, 46, 78, 76, 6, 85, 72, 50, 117, 93, 51, 64, 75, 61, 21, 96, 125, 19, 10, 43, 26, 63, 123, 30, 108, 31, 34, 0, 83, 110, 124, 9, 45, 17, 44, 47, 8, 16, 11, 74, 80, 104, 73, 105, 5, 7, 69, 66, 95, 109, 67, 81, 107, 2, 97, 65, 13, 1, 71, 98, 68, 27, 77, 37, 33, 3, 41, 70, 36, 24, 88, 101, 94, 91, 4, 100], [126, 55, 120, 54, 59, 112, 49, 127, 57, 58, 56, 116, 42, 102, 118, 62, 53, 106, 121, 39, 28, 103, 48, 79, 20, 114, 84, 25, 29, 38, 89, 113, 119, 92, 22, 23, 60, 15, 99, 90, 52, 122, 87, 82, 12, 115, 86, 32, 35, 18, 78, 14, 51, 111, 40, 6, 76, 72, 117, 123, 93, 85, 75, 50, 125, 46, 0, 43, 44, 63, 108, 19, 26, 17, 21, 61, 64, 47, 30, 69, 31, 66, 124, 8, 74, 73, 11, 9, 96, 10, 109, 105, 104, 34, 110, 45, 2, 5, 83, 7, 95, 71, 13, 67, 27, 80, 16, 3, 88, 81, 33, 98, 1, 65, 107, 36, 97, 77, 41, 37, 68, 70, 4, 101, 91, 94, 24, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 56, 58, 116, 42, 102, 62, 106, 118, 53, 121, 28, 39, 103, 84, 48, 29, 20, 25, 15, 99, 22, 92, 23, 122, 119, 38, 79, 114, 86, 89, 115, 90, 113, 60, 87, 35, 117, 52, 78, 40, 82, 18, 76, 14, 12, 111, 32, 6, 51, 64, 85, 72, 46, 0, 96, 124, 44, 125, 75, 63, 93, 31, 21, 69, 43, 30, 34, 108, 2, 50, 19, 123, 10, 110, 5, 74, 8, 17, 65, 66, 11, 47, 109, 83, 9, 104, 61, 26, 98, 73, 70, 16, 1, 95, 7, 80, 67, 13, 3, 81, 77, 41, 105, 45, 36, 88, 27, 71, 107, 4, 68, 33, 24, 97, 37, 91, 101, 94, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 56, 58, 116, 42, 102, 118, 62, 121, 106, 53, 39, 28, 48, 103, 84, 20, 89, 79, 15, 29, 92, 119, 22, 25, 99, 113, 23, 60, 38, 52, 115, 40, 14, 82, 122, 90, 35, 18, 114, 12, 86, 78, 111, 76, 32, 87, 117, 46, 85, 6, 0, 17, 51, 72, 125, 96, 50, 30, 5, 11, 75, 93, 63, 123, 43, 110, 69, 47, 44, 34, 70, 19, 64, 2, 8, 10, 104, 83, 108, 124, 31, 21, 26, 61, 16, 66, 109, 65, 73, 74, 1, 9, 77, 98, 27, 81, 95, 45, 105, 67, 80, 7, 107, 71, 3, 88, 41, 101, 33, 36, 4, 13, 68, 91, 24, 97, 37, 94, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 56, 58, 116, 42, 118, 102, 62, 106, 121, 53, 39, 103, 113, 28, 20, 84, 48, 60, 79, 38, 15, 23, 89, 119, 114, 29, 115, 82, 25, 92, 22, 86, 52, 90, 76, 122, 99, 40, 18, 78, 12, 32, 117, 35, 111, 14, 51, 50, 75, 46, 87, 85, 72, 96, 63, 125, 8, 70, 64, 0, 21, 93, 26, 43, 109, 44, 17, 6, 31, 69, 10, 30, 47, 2, 19, 5, 110, 61, 11, 7, 74, 16, 83, 34, 73, 123, 104, 66, 81, 98, 9, 108, 95, 65, 1, 105, 124, 27, 80, 77, 4, 67, 45, 71, 33, 88, 3, 107, 97, 13, 37, 36, 68, 101, 41, 91, 94, 24, 100], [126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 116, 42, 102, 118, 62, 106, 121, 53, 103, 28, 39, 20, 84, 79, 29, 48, 114, 89, 86, 15, 115, 82, 119, 23, 113, 60, 76, 22, 25, 40, 38, 92, 35, 99, 78, 122, 18, 32, 14, 90, 12, 87, 52, 111, 85, 50, 75, 63, 72, 70, 43, 51, 46, 125, 117, 8, 17, 31, 64, 21, 96, 93, 34, 0, 124, 26, 16, 83, 30, 44, 69, 19, 11, 123, 10, 108, 109, 9, 81, 47, 104, 74, 95, 66, 5, 2, 61, 73, 97, 7, 27, 110, 13, 65, 80, 45, 67, 1, 105, 3, 98, 107, 6, 4, 71, 77, 33, 88, 24, 41, 68, 37, 94, 91, 36, 101, 100], [126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 42, 116, 118, 102, 62, 106, 121, 53, 39, 28, 20, 84, 79, 23, 25, 29, 103, 38, 60, 15, 89, 86, 90, 22, 99, 82, 119, 18, 76, 40, 122, 12, 92, 113, 48, 35, 114, 87, 32, 78, 115, 111, 70, 85, 14, 51, 75, 46, 52, 117, 50, 93, 64, 19, 8, 13, 125, 72, 16, 17, 11, 10, 30, 21, 63, 9, 0, 77, 26, 31, 83, 44, 96, 123, 43, 34, 1, 66, 47, 74, 104, 80, 81, 45, 124, 105, 69, 110, 61, 7, 73, 2, 71, 65, 108, 109, 95, 97, 5, 41, 27, 4, 3, 107, 98, 36, 37, 88, 6, 67, 33, 24, 91, 101, 68, 94, 100], [126, 55, 120, 54, 59, 112, 49, 57, 127, 58, 56, 116, 42, 102, 118, 106, 62, 121, 53, 28, 39, 20, 25, 122, 38, 103, 23, 84, 22, 29, 90, 48, 60, 119, 15, 79, 32, 86, 111, 89, 82, 99, 52, 113, 18, 92, 12, 40, 70, 117, 85, 115, 87, 35, 78, 14, 76, 114, 51, 46, 125, 0, 93, 21, 75, 123, 30, 64, 19, 96, 11, 63, 8, 31, 83, 43, 26, 50, 104, 34, 44, 16, 47, 17, 110, 61, 13, 10, 72, 108, 74, 69, 124, 9, 80, 73, 2, 65, 45, 95, 77, 81, 66, 109, 67, 7, 41, 5, 3, 36, 68, 1, 98, 6, 105, 33, 71, 107, 37, 4, 27, 88, 24, 97, 91, 94, 101, 100], [126, 55, 120, 54, 59, 112, 57, 127, 49, 58, 56, 42, 116, 102, 118, 62, 106, 121, 53, 28, 39, 48, 84, 38, 20, 29, 15, 103, 89, 122, 25, 119, 111, 23, 79, 60, 22, 92, 90, 87, 51, 52, 99, 32, 86, 78, 82, 35, 18, 76, 115, 40, 114, 12, 113, 14, 85, 117, 46, 70, 93, 123, 125, 47, 17, 0, 21, 8, 96, 44, 26, 31, 19, 109, 110, 50, 30, 63, 83, 11, 75, 34, 64, 43, 45, 61, 72, 9, 10, 81, 104, 13, 73, 5, 74, 108, 16, 2, 65, 66, 6, 69, 77, 67, 124, 95, 80, 41, 105, 7, 68, 27, 3, 98, 71, 107, 33, 1, 88, 97, 101, 4, 36, 24, 37, 94, 91, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 58, 56, 116, 42, 102, 118, 62, 121, 53, 106, 39, 103, 28, 48, 84, 79, 38, 20, 119, 114, 92, 89, 15, 29, 52, 25, 60, 86, 51, 23, 99, 90, 115, 122, 22, 113, 35, 40, 78, 76, 87, 111, 46, 50, 32, 12, 18, 14, 125, 82, 123, 8, 64, 85, 43, 117, 108, 110, 0, 63, 96, 21, 6, 74, 109, 75, 31, 70, 83, 11, 17, 124, 5, 30, 10, 72, 44, 93, 66, 47, 34, 45, 26, 2, 19, 73, 104, 9, 105, 81, 61, 69, 95, 98, 3, 67, 16, 7, 1, 27, 41, 71, 65, 80, 33, 77, 107, 13, 97, 36, 37, 88, 4, 91, 24, 68, 101, 94, 100], [126, 55, 120, 54, 59, 112, 127, 57, 49, 56, 58, 116, 42, 102, 118, 62, 53, 121, 106, 39, 103, 28, 20, 48, 79, 84, 113, 119, 89, 15, 38, 115, 99, 86, 25, 92, 23, 22, 29, 78, 40, 51, 35, 52, 60, 76, 82, 114, 8, 122, 90, 18, 14, 12, 0, 87, 32, 85, 6, 5, 50, 46, 64, 111, 125, 96, 66, 123, 72, 47, 117, 30, 34, 75, 74, 83, 43, 110, 93, 17, 11, 70, 124, 19, 9, 31, 73, 81, 44, 21, 65, 98, 61, 108, 2, 45, 67, 63, 69, 80, 3, 26, 71, 109, 1, 10, 105, 27, 7, 95, 16, 77, 104, 68, 33, 97, 13, 107, 88, 41, 4, 91, 94, 24, 101, 100, 36, 37], [126, 55, 120, 54, 59, 112, 57, 127, 49, 58, 56, 116, 42, 102, 118, 62, 106, 121, 53, 28, 39, 89, 20, 25, 84, 48, 86, 23, 22, 60, 15, 79, 103, 29, 38, 115, 99, 76, 40, 114, 35, 12, 90, 122, 78, 32, 113, 51, 92, 6, 18, 82, 119, 85, 111, 87, 8, 14, 96, 52, 93, 75, 19, 47, 17, 74, 11, 72, 0, 46, 50, 64, 83, 13, 34, 31, 43, 21, 123, 9, 73, 117, 30, 110, 125, 44, 81, 10, 16, 80, 26, 63, 5, 65, 77, 69, 109, 71, 108, 66, 104, 3, 1, 97, 67, 98, 45, 95, 61, 2, 105, 124, 27, 4, 7, 107, 70, 88, 41, 68, 36, 101, 91, 33, 94, 24, 37, 100], [126, 55, 54, 120, 112, 59, 49, 57, 127, 58, 56, 116, 42, 102, 118, 62, 121, 106, 53, 103, 39, 28, 25, 48, 20, 84, 60, 29, 23, 38, 15, 122, 90, 86, 89, 115, 52, 79, 92, 119, 32, 22, 111, 113, 51, 87, 35, 114, 18, 40, 82, 12, 99, 6, 78, 85, 76, 123, 50, 96, 43, 14, 46, 93, 125, 21, 47, 117, 108, 31, 72, 34, 64, 83, 124, 19, 63, 110, 11, 61, 104, 74, 73, 75, 17, 8, 26, 109, 10, 105, 77, 41, 30, 45, 44, 95, 0, 16, 5, 81, 2, 97, 9, 13, 69, 67, 80, 36, 66, 27, 98, 71, 3, 33, 1, 65, 7, 88, 107, 68, 37, 91, 101, 94, 4, 70, 24, 100], [126, 55, 120, 54, 112, 59, 57, 49, 127, 58, 56, 116, 42, 102, 118, 121, 62, 106, 53, 28, 39, 103, 119, 48, 84, 79, 22, 20, 86, 29, 15, 99, 89, 25, 60, 38, 23, 113, 114, 122, 51, 111, 87, 52, 12, 115, 40, 50, 35, 18, 92, 90, 76, 32, 14, 82, 6, 64, 78, 125, 0, 85, 72, 46, 123, 117, 8, 96, 63, 17, 75, 43, 45, 11, 21, 110, 61, 104, 10, 19, 1, 73, 83, 30, 31, 26, 74, 47, 2, 34, 5, 124, 93, 80, 66, 81, 9, 70, 108, 3, 69, 109, 13, 65, 105, 71, 41, 44, 16, 95, 67, 7, 27, 77, 107, 98, 97, 4, 33, 68, 101, 88, 36, 24, 37, 94, 91, 100], [126, 55, 120, 54, 112, 59, 127, 57, 49, 58, 56, 116, 42, 102, 118, 62, 106, 121, 53, 39, 28, 103, 22, 79, 84, 20, 29, 38, 23, 89, 48, 15, 25, 86, 40, 60, 113, 119, 92, 122, 114, 99, 12, 18, 76, 90, 82, 52, 32, 35, 115, 51, 78, 111, 0, 14, 46, 85, 72, 87, 8, 6, 123, 64, 96, 75, 21, 50, 5, 125, 117, 17, 83, 30, 74, 10, 26, 34, 70, 31, 11, 47, 45, 73, 81, 9, 93, 105, 66, 98, 44, 7, 43, 110, 2, 19, 61, 108, 63, 69, 1, 67, 16, 71, 80, 124, 95, 109, 3, 65, 27, 13, 41, 4, 77, 68, 91, 97, 33, 104, 37, 24, 88, 36, 107, 101, 94, 100], [126, 55, 120, 54, 112, 59, 127, 57, 49, 58, 56, 116, 42, 102, 118, 121, 62, 106, 53, 28, 103, 39, 22, 84, 79, 48, 89, 119, 29, 20, 115, 23, 99, 60, 25, 122, 15, 32, 86, 114, 12, 38, 113, 92, 40, 76, 18, 78, 90, 35, 52, 46, 111, 82, 85, 72, 87, 14, 51, 125, 8, 93, 0, 123, 75, 43, 50, 11, 96, 70, 30, 6, 10, 5, 21, 61, 64, 110, 17, 63, 34, 26, 2, 19, 83, 74, 73, 69, 105, 31, 47, 95, 7, 44, 109, 117, 108, 81, 9, 65, 98, 66, 13, 16, 124, 1, 80, 68, 45, 104, 71, 27, 4, 41, 97, 67, 77, 3, 107, 24, 91, 88, 33, 36, 37, 101, 100, 94], [126, 55, 120, 54, 59, 112, 127, 49, 57, 58, 56, 42, 116, 102, 118, 62, 121, 106, 28, 39, 53, 103, 25, 84, 89, 29, 20, 86, 38, 48, 60, 23, 119, 114, 79, 15, 22, 92, 90, 87, 111, 76, 115, 18, 40, 113, 52, 35, 82, 99, 32, 85, 122, 12, 51, 46, 125, 78, 70, 50, 14, 93, 123, 31, 117, 72, 21, 96, 19, 61, 10, 11, 63, 124, 44, 26, 30, 0, 64, 47, 9, 75, 34, 13, 83, 108, 43, 5, 16, 81, 17, 110, 8, 45, 80, 95, 41, 66, 1, 77, 105, 73, 104, 69, 65, 2, 74, 36, 67, 27, 97, 98, 109, 71, 7, 107, 37, 3, 6, 4, 24, 88, 101, 33, 91, 94, 68, 100], [126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 116, 42, 102, 118, 62, 106, 121, 28, 53, 39, 48, 84, 20, 103, 29, 89, 79, 38, 22, 60, 86, 15, 99, 114, 23, 113, 25, 12, 90, 92, 87, 76, 119, 122, 46, 111, 70, 82, 18, 14, 40, 35, 78, 32, 85, 115, 117, 0, 52, 123, 72, 64, 51, 44, 31, 26, 93, 63, 47, 96, 11, 30, 43, 75, 21, 125, 17, 10, 50, 83, 19, 69, 74, 104, 61, 66, 9, 5, 1, 34, 8, 110, 73, 80, 16, 2, 124, 108, 3, 95, 65, 13, 81, 27, 71, 77, 7, 45, 4, 67, 41, 36, 68, 109, 107, 6, 98, 97, 101, 88, 105, 37, 33, 91, 24, 94, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 58, 56, 116, 42, 118, 102, 62, 121, 53, 106, 39, 48, 28, 103, 38, 20, 84, 119, 79, 89, 46, 122, 15, 29, 60, 111, 114, 92, 23, 113, 52, 86, 25, 99, 12, 40, 90, 22, 76, 82, 32, 35, 18, 123, 70, 87, 115, 14, 125, 78, 117, 51, 72, 93, 44, 63, 47, 11, 85, 64, 31, 96, 0, 75, 30, 110, 43, 10, 34, 61, 17, 26, 2, 5, 21, 83, 50, 109, 9, 8, 104, 45, 108, 74, 19, 81, 73, 105, 124, 98, 16, 1, 7, 97, 66, 107, 95, 6, 69, 80, 41, 71, 27, 67, 13, 3, 68, 33, 88, 37, 101, 36, 65, 77, 24, 91, 4, 94, 100], [126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 116, 42, 102, 118, 62, 121, 106, 53, 28, 39, 103, 48, 20, 119, 84, 29, 25, 38, 79, 60, 113, 99, 52, 90, 40, 114, 89, 23, 12, 15, 86, 92, 35, 115, 32, 22, 123, 76, 14, 46, 125, 122, 78, 87, 18, 82, 85, 51, 111, 70, 72, 117, 96, 93, 43, 0, 11, 30, 50, 108, 47, 31, 10, 34, 44, 124, 61, 63, 75, 66, 5, 64, 19, 8, 109, 74, 26, 9, 95, 6, 17, 104, 73, 21, 83, 110, 2, 81, 105, 98, 69, 7, 71, 13, 45, 107, 1, 3, 16, 33, 67, 80, 68, 37, 97, 77, 88, 27, 65, 41, 4, 36, 91, 101, 100, 94, 24], [126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 116, 42, 102, 118, 121, 62, 106, 53, 28, 48, 103, 39, 119, 20, 113, 29, 84, 79, 60, 38, 22, 86, 23, 89, 25, 15, 35, 40, 122, 92, 90, 46, 99, 114, 12, 115, 52, 76, 32, 87, 18, 78, 111, 82, 125, 123, 51, 14, 117, 43, 11, 85, 72, 96, 50, 34, 6, 47, 31, 26, 104, 30, 0, 17, 93, 44, 19, 63, 75, 64, 10, 61, 45, 108, 21, 83, 9, 8, 110, 70, 74, 80, 95, 66, 2, 5, 81, 124, 109, 73, 71, 67, 16, 69, 1, 13, 98, 3, 7, 27, 107, 88, 36, 91, 97, 33, 77, 65, 68, 24, 41, 101, 105, 37, 4, 94, 100], [126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 116, 42, 118, 102, 121, 62, 106, 53, 28, 39, 20, 48, 84, 29, 60, 38, 103, 15, 25, 22, 113, 79, 122, 92, 119, 99, 23, 89, 111, 46, 86, 114, 52, 35, 90, 12, 125, 82, 14, 40, 78, 76, 115, 32, 64, 6, 87, 123, 0, 18, 51, 43, 85, 117, 47, 93, 72, 50, 8, 63, 21, 61, 2, 30, 31, 10, 17, 11, 5, 75, 66, 19, 44, 110, 124, 96, 26, 34, 108, 104, 80, 1, 83, 95, 71, 74, 69, 9, 73, 45, 65, 81, 109, 70, 107, 67, 16, 77, 7, 41, 97, 13, 27, 98, 36, 3, 105, 4, 33, 68, 88, 91, 101, 37, 24, 94, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 58, 56, 116, 42, 118, 102, 62, 121, 106, 53, 28, 39, 103, 20, 25, 60, 38, 114, 84, 48, 92, 119, 29, 122, 22, 86, 90, 23, 117, 111, 113, 46, 89, 79, 15, 32, 125, 40, 52, 35, 115, 87, 14, 99, 51, 82, 18, 12, 61, 6, 78, 76, 64, 43, 96, 30, 123, 44, 108, 124, 21, 31, 11, 10, 50, 93, 47, 85, 8, 34, 17, 19, 72, 0, 63, 110, 104, 83, 5, 75, 109, 26, 66, 74, 9, 45, 95, 107, 3, 73, 2, 69, 71, 81, 16, 80, 1, 97, 7, 98, 36, 13, 67, 88, 33, 77, 41, 27, 70, 65, 91, 37, 24, 68, 105, 4, 101, 94, 100], [126, 55, 120, 54, 112, 59, 57, 49, 127, 58, 56, 116, 42, 102, 118, 62, 106, 121, 53, 28, 39, 48, 103, 114, 29, 20, 84, 38, 79, 113, 89, 25, 119, 22, 15, 60, 40, 23, 92, 52, 115, 86, 82, 90, 12, 14, 99, 35, 122, 76, 87, 46, 18, 32, 111, 6, 78, 8, 123, 96, 43, 85, 0, 104, 51, 72, 10, 50, 125, 63, 11, 21, 30, 61, 26, 34, 83, 44, 17, 31, 9, 93, 19, 108, 117, 110, 47, 2, 5, 74, 75, 64, 71, 7, 45, 69, 73, 95, 81, 13, 109, 27, 1, 105, 67, 66, 16, 98, 80, 70, 65, 124, 3, 97, 107, 33, 4, 77, 88, 91, 101, 41, 24, 37, 68, 36, 94, 100], [126, 55, 120, 54, 59, 112, 57, 49, 127, 58, 56, 116, 42, 102, 118, 62, 121, 106, 39, 28, 53, 20, 84, 25, 23, 38, 103, 60, 89, 79, 15, 119, 86, 48, 22, 29, 113, 90, 12, 114, 92, 40, 18, 122, 99, 115, 82, 14, 46, 52, 32, 35, 51, 76, 78, 123, 111, 125, 87, 6, 85, 8, 21, 11, 83, 0, 61, 10, 108, 64, 34, 72, 96, 75, 93, 19, 17, 45, 26, 63, 74, 47, 43, 50, 71, 9, 104, 30, 124, 16, 31, 109, 73, 80, 117, 13, 81, 70, 5, 66, 77, 44, 2, 95, 110, 65, 69, 1, 67, 97, 7, 105, 98, 27, 107, 4, 91, 3, 33, 37, 68, 88, 36, 24, 101, 94, 41, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 58, 56, 116, 42, 102, 118, 62, 121, 28, 53, 106, 39, 48, 103, 25, 89, 23, 38, 20, 119, 22, 60, 29, 84, 86, 79, 122, 113, 90, 40, 123, 15, 32, 35, 12, 92, 82, 46, 52, 99, 61, 114, 14, 18, 51, 111, 115, 85, 76, 87, 78, 93, 43, 21, 8, 125, 50, 30, 108, 0, 96, 10, 31, 11, 117, 34, 44, 26, 70, 47, 64, 19, 83, 75, 6, 110, 9, 17, 72, 80, 63, 104, 124, 77, 66, 95, 109, 69, 73, 45, 74, 5, 98, 81, 16, 13, 97, 41, 71, 7, 105, 2, 65, 3, 27, 107, 1, 67, 33, 36, 68, 94, 37, 88, 101, 4, 24, 91, 100], [126, 55, 120, 54, 59, 112, 49, 127, 57, 58, 56, 42, 116, 102, 118, 121, 53, 62, 106, 28, 39, 103, 119, 48, 25, 20, 23, 84, 38, 89, 60, 29, 15, 79, 22, 115, 46, 52, 86, 114, 99, 35, 122, 51, 113, 123, 90, 82, 111, 12, 92, 32, 76, 40, 87, 14, 117, 61, 18, 125, 70, 78, 8, 11, 85, 50, 44, 63, 96, 34, 110, 45, 64, 83, 108, 0, 30, 93, 21, 104, 109, 47, 31, 72, 19, 73, 17, 75, 43, 5, 124, 69, 3, 10, 13, 105, 74, 80, 26, 98, 9, 65, 2, 81, 16, 1, 41, 66, 97, 7, 77, 6, 68, 95, 107, 67, 27, 36, 71, 33, 4, 24, 101, 94, 88, 91, 37, 100], [126, 55, 120, 54, 59, 112, 127, 57, 49, 58, 56, 116, 42, 102, 118, 62, 121, 106, 53, 39, 28, 20, 84, 79, 103, 89, 48, 23, 15, 38, 25, 52, 119, 29, 12, 82, 113, 22, 60, 114, 90, 76, 14, 92, 122, 32, 78, 18, 99, 40, 86, 70, 35, 46, 50, 123, 8, 115, 87, 85, 117, 125, 111, 64, 21, 72, 83, 96, 43, 5, 11, 19, 0, 93, 51, 74, 47, 73, 9, 66, 61, 31, 63, 75, 26, 81, 10, 30, 2, 17, 104, 44, 34, 108, 45, 69, 110, 80, 1, 7, 124, 67, 95, 109, 77, 105, 6, 98, 71, 16, 13, 68, 41, 3, 65, 27, 4, 97, 33, 24, 107, 36, 101, 88, 37, 94, 91, 100], [126, 55, 120, 54, 59, 112, 127, 49, 57, 58, 56, 116, 42, 102, 118, 53, 62, 121, 106, 103, 28, 114, 39, 84, 38, 20, 48, 52, 79, 25, 89, 15, 22, 29, 92, 40, 23, 113, 86, 119, 60, 32, 12, 99, 50, 35, 76, 70, 90, 18, 115, 82, 78, 14, 122, 64, 8, 111, 46, 31, 85, 51, 0, 123, 125, 63, 87, 43, 45, 5, 61, 11, 72, 34, 108, 75, 96, 66, 69, 74, 117, 10, 93, 124, 17, 2, 30, 19, 21, 83, 1, 110, 73, 104, 44, 109, 7, 47, 95, 65, 81, 71, 9, 26, 16, 33, 98, 3, 107, 27, 77, 67, 105, 13, 97, 4, 80, 6, 37, 101, 36, 24, 88, 41, 68, 100, 94, 91], [126, 55, 120, 54, 112, 59, 127, 49, 57, 58, 56, 116, 42, 102, 118, 121, 62, 53, 106, 28, 39, 103, 84, 20, 38, 89, 15, 29, 60, 79, 52, 25, 48, 86, 119, 22, 113, 23, 32, 92, 114, 82, 18, 99, 76, 14, 40, 12, 35, 78, 115, 90, 46, 70, 87, 50, 51, 122, 125, 111, 8, 63, 123, 93, 117, 96, 19, 21, 85, 72, 31, 11, 43, 75, 124, 17, 73, 30, 83, 10, 74, 34, 108, 69, 61, 26, 110, 0, 45, 104, 80, 44, 13, 7, 64, 9, 109, 5, 47, 81, 27, 67, 66, 95, 16, 1, 2, 65, 105, 77, 71, 98, 3, 6, 68, 107, 97, 37, 33, 91, 4, 41, 101, 24, 88, 36, 94, 100], [126, 55, 120, 54, 112, 59, 127, 49, 57, 58, 56, 116, 42, 102, 118, 62, 106, 121, 53, 28, 103, 39, 114, 84, 20, 25, 113, 48, 15, 115, 119, 29, 23, 52, 79, 22, 89, 90, 122, 60, 40, 86, 38, 12, 32, 99, 92, 14, 51, 76, 78, 18, 46, 111, 35, 125, 87, 82, 50, 63, 85, 0, 72, 123, 124, 70, 43, 74, 8, 96, 64, 31, 117, 6, 19, 75, 110, 93, 104, 34, 9, 73, 7, 30, 5, 11, 2, 47, 10, 83, 69, 21, 17, 66, 44, 13, 108, 45, 65, 61, 16, 1, 26, 81, 80, 98, 95, 109, 97, 67, 68, 77, 71, 3, 41, 27, 105, 4, 107, 37, 36, 101, 33, 24, 88, 91, 100, 94], [126, 55, 120, 54, 59, 112, 49, 57, 127, 58, 56, 42, 116, 102, 118, 62, 121, 106, 103, 39, 53, 28, 48, 20, 84, 25, 23, 15, 29, 79, 38, 89, 114, 86, 119, 60, 22, 90, 92, 115, 18, 99, 40, 35, 12, 32, 14, 52, 122, 76, 113, 111, 51, 82, 125, 123, 46, 63, 87, 50, 85, 78, 75, 72, 43, 61, 93, 8, 74, 104, 96, 6, 11, 117, 108, 19, 110, 21, 31, 17, 30, 124, 73, 70, 44, 9, 34, 10, 83, 45, 64, 109, 47, 5, 26, 80, 105, 13, 81, 7, 0, 66, 41, 98, 77, 27, 95, 69, 71, 65, 16, 2, 107, 33, 67, 3, 88, 36, 68, 97, 1, 37, 4, 94, 24, 91, 101, 100]], "model.layers.22.self_attn.q_proj": [[57, 40, 50, 121, 54, 118, 51, 125, 52, 119, 123, 124, 63, 60, 117, 58, 61, 116, 33, 47, 62, 45, 114, 127, 59, 48, 56, 111, 126, 112, 49, 38, 55, 44, 53, 113, 122, 115, 120, 36, 102, 92, 108, 42, 109, 46, 107, 28, 17, 43, 110, 106, 94, 77, 41, 26, 25, 85, 105, 103, 83, 86, 89, 99, 39, 32, 87, 95, 19, 35, 100, 30, 37, 104, 27, 101, 31, 11, 22, 97, 21, 96, 34, 24, 98, 81, 13, 90, 29, 93, 69, 15, 2, 88, 84, 71, 0, 91, 79, 20, 16, 3, 75, 82, 1, 5, 9, 23, 67, 78, 65, 64, 73, 18, 66, 7, 80, 4, 14, 10, 74, 68, 72, 12, 8, 6, 70, 76], [50, 40, 121, 57, 51, 118, 54, 33, 114, 125, 52, 119, 123, 45, 44, 63, 58, 60, 116, 117, 127, 124, 112, 62, 49, 47, 59, 126, 61, 36, 113, 56, 48, 53, 94, 120, 122, 111, 55, 46, 115, 28, 17, 107, 26, 109, 110, 92, 25, 41, 108, 38, 43, 77, 99, 30, 42, 89, 102, 106, 85, 105, 86, 83, 19, 95, 87, 100, 24, 103, 21, 27, 31, 35, 39, 104, 101, 22, 37, 32, 98, 97, 11, 90, 84, 34, 2, 81, 96, 16, 13, 15, 69, 64, 93, 88, 82, 78, 0, 29, 79, 5, 66, 73, 23, 1, 65, 91, 67, 3, 75, 20, 7, 72, 9, 12, 18, 4, 14, 71, 68, 74, 10, 80, 6, 8, 70, 76], [121, 40, 50, 114, 125, 51, 54, 118, 57, 33, 63, 45, 52, 123, 119, 60, 117, 127, 58, 44, 116, 124, 56, 112, 59, 126, 62, 108, 61, 48, 113, 28, 46, 53, 120, 49, 122, 47, 55, 111, 115, 110, 17, 102, 26, 92, 42, 107, 109, 36, 94, 25, 38, 99, 106, 85, 43, 41, 105, 89, 95, 100, 30, 77, 87, 86, 19, 83, 103, 21, 35, 39, 104, 24, 37, 101, 96, 32, 22, 27, 97, 98, 34, 31, 81, 11, 13, 84, 2, 82, 88, 90, 64, 73, 29, 93, 16, 0, 15, 79, 9, 78, 1, 91, 69, 65, 23, 14, 5, 75, 66, 3, 4, 7, 67, 20, 71, 12, 18, 72, 10, 74, 68, 6, 70, 8, 80, 76], [40, 121, 50, 57, 82, 114, 78, 12, 33, 27, 16, 25, 99, 118, 23, 38, 29, 74, 19, 86, 6, 10, 93, 18, 36, 89, 72, 94, 51, 14, 70, 76, 104, 8, 28, 20, 84, 63, 102, 88, 90, 30, 95, 68, 54, 21, 80, 58, 119, 4, 55, 87, 91, 22, 125, 9, 15, 85, 49, 31, 1, 81, 96, 26, 62, 47, 42, 73, 24, 7, 127, 66, 116, 83, 92, 3, 52, 123, 39, 11, 32, 13, 115, 2, 124, 60, 64, 44, 98, 34, 117, 17, 105, 61, 79, 35, 126, 53, 101, 112, 48, 56, 5, 122, 45, 75, 77, 109, 71, 107, 67, 120, 113, 106, 37, 108, 43, 110, 59, 41, 46, 111, 100, 103, 69, 97, 0, 65], [42, 54, 32, 35, 123, 99, 106, 50, 89, 85, 27, 87, 16, 23, 58, 29, 96, 18, 78, 36, 57, 40, 98, 83, 103, 116, 107, 117, 7, 84, 62, 31, 17, 108, 61, 48, 60, 82, 44, 12, 47, 37, 105, 95, 100, 77, 26, 21, 125, 74, 76, 118, 46, 55, 45, 41, 90, 9, 114, 10, 20, 115, 94, 113, 101, 119, 92, 127, 110, 19, 124, 112, 81, 97, 122, 109, 126, 102, 49, 28, 24, 121, 59, 75, 104, 56, 22, 70, 88, 79, 43, 120, 34, 68, 33, 51, 25, 53, 63, 93, 15, 8, 91, 30, 52, 86, 39, 111, 80, 38, 4, 11, 14, 0, 72, 66, 13, 3, 69, 71, 5, 73, 64, 67, 6, 65, 1, 2], [54, 42, 32, 89, 117, 99, 123, 85, 87, 23, 29, 58, 35, 106, 96, 28, 50, 121, 55, 44, 36, 60, 15, 111, 33, 101, 57, 40, 46, 82, 122, 52, 126, 51, 112, 53, 31, 47, 10, 16, 100, 78, 103, 104, 41, 18, 125, 119, 45, 110, 109, 38, 83, 114, 95, 81, 17, 105, 115, 39, 118, 21, 12, 25, 113, 56, 76, 127, 19, 124, 48, 116, 34, 107, 37, 14, 27, 8, 98, 90, 108, 91, 49, 77, 63, 62, 79, 59, 102, 43, 120, 97, 30, 61, 69, 93, 84, 80, 92, 24, 72, 26, 94, 86, 74, 22, 88, 6, 67, 20, 75, 66, 71, 13, 11, 7, 5, 3, 70, 65, 73, 64, 68, 2, 4, 9, 1, 0], [54, 42, 32, 99, 35, 29, 89, 87, 96, 115, 23, 85, 36, 106, 95, 27, 110, 62, 105, 101, 18, 61, 123, 119, 94, 44, 45, 50, 83, 82, 117, 118, 63, 59, 48, 41, 86, 113, 28, 40, 121, 112, 55, 58, 31, 93, 120, 47, 114, 92, 103, 109, 125, 107, 37, 127, 122, 104, 57, 53, 108, 34, 60, 100, 126, 38, 52, 49, 16, 51, 20, 43, 116, 15, 124, 102, 90, 46, 33, 56, 22, 39, 26, 97, 12, 77, 98, 78, 84, 10, 25, 111, 30, 91, 81, 17, 24, 13, 75, 88, 14, 19, 76, 21, 4, 6, 8, 72, 74, 67, 7, 69, 79, 73, 1, 68, 5, 80, 70, 9, 11, 0, 66, 71, 64, 3, 65, 2], [42, 54, 32, 85, 35, 123, 106, 99, 89, 23, 16, 87, 76, 78, 18, 12, 50, 48, 27, 84, 29, 125, 36, 11, 14, 6, 72, 117, 10, 81, 4, 115, 31, 21, 80, 51, 25, 17, 8, 116, 28, 26, 101, 111, 79, 104, 83, 77, 3, 74, 109, 55, 92, 82, 2, 19, 9, 73, 60, 88, 57, 62, 1, 98, 69, 86, 64, 124, 97, 103, 119, 33, 66, 37, 44, 90, 93, 49, 102, 40, 105, 108, 30, 95, 114, 7, 110, 96, 67, 100, 41, 94, 75, 46, 56, 15, 127, 24, 126, 34, 38, 107, 53, 70, 63, 20, 118, 91, 43, 5, 112, 22, 39, 47, 52, 13, 45, 58, 71, 120, 122, 61, 59, 113, 121, 68, 65, 0], [117, 38, 120, 97, 25, 58, 63, 93, 89, 83, 116, 52, 53, 121, 55, 49, 102, 39, 86, 112, 115, 75, 122, 126, 127, 113, 119, 57, 125, 123, 50, 109, 47, 21, 48, 60, 114, 118, 46, 62, 20, 84, 41, 88, 105, 61, 59, 111, 110, 51, 44, 73, 56, 29, 124, 17, 100, 54, 104, 45, 80, 106, 42, 107, 108, 103, 40, 43, 22, 87, 16, 101, 37, 24, 26, 98, 99, 34, 36, 35, 15, 95, 18, 96, 31, 77, 94, 19, 23, 71, 32, 30, 33, 92, 4, 28, 79, 91, 14, 81, 90, 13, 11, 85, 27, 68, 1, 7, 82, 72, 3, 74, 70, 65, 64, 10, 76, 69, 66, 78, 9, 5, 0, 2, 8, 12, 67, 6], [38, 120, 117, 63, 97, 93, 17, 83, 25, 89, 86, 20, 29, 88, 102, 14, 58, 76, 74, 121, 65, 72, 15, 53, 116, 77, 71, 73, 49, 64, 41, 126, 122, 60, 100, 57, 4, 91, 46, 55, 59, 22, 11, 119, 123, 98, 94, 50, 81, 127, 115, 34, 10, 26, 39, 66, 104, 7, 47, 111, 109, 125, 16, 113, 52, 42, 118, 19, 56, 48, 51, 112, 90, 54, 114, 61, 44, 45, 84, 79, 62, 96, 24, 28, 35, 37, 78, 82, 2, 103, 87, 70, 108, 31, 75, 101, 124, 99, 106, 67, 40, 105, 32, 95, 21, 30, 36, 80, 5, 43, 27, 85, 110, 92, 12, 9, 13, 69, 1, 23, 33, 68, 107, 8, 3, 18, 6, 0], [38, 120, 117, 97, 58, 25, 83, 86, 93, 63, 73, 89, 17, 102, 29, 121, 15, 53, 88, 3, 14, 49, 75, 22, 126, 57, 20, 71, 19, 109, 41, 76, 77, 113, 116, 55, 127, 84, 46, 111, 79, 72, 52, 119, 11, 4, 47, 122, 65, 118, 112, 39, 125, 87, 115, 51, 60, 62, 50, 59, 123, 74, 70, 45, 21, 80, 56, 81, 9, 44, 114, 61, 16, 48, 42, 104, 10, 82, 100, 24, 108, 67, 124, 78, 34, 105, 103, 2, 85, 110, 33, 90, 23, 54, 7, 107, 40, 8, 37, 98, 35, 91, 28, 101, 43, 36, 0, 66, 99, 13, 69, 26, 12, 94, 18, 96, 1, 106, 68, 92, 31, 95, 6, 27, 30, 32, 5, 64], [38, 117, 120, 97, 63, 83, 70, 93, 25, 15, 73, 89, 3, 86, 58, 53, 41, 14, 17, 88, 80, 74, 29, 109, 78, 77, 75, 121, 57, 84, 65, 22, 55, 111, 115, 46, 49, 19, 79, 105, 0, 102, 118, 98, 122, 76, 42, 126, 106, 116, 119, 52, 125, 72, 4, 20, 107, 113, 127, 50, 56, 21, 54, 104, 62, 61, 47, 60, 9, 6, 123, 45, 59, 110, 11, 10, 44, 39, 51, 26, 114, 48, 12, 103, 2, 112, 81, 124, 82, 85, 8, 66, 108, 67, 43, 87, 71, 24, 40, 16, 36, 69, 34, 101, 94, 1, 23, 31, 7, 37, 95, 30, 90, 99, 28, 91, 100, 5, 32, 18, 35, 27, 68, 92, 64, 96, 13, 33], [122, 58, 53, 38, 125, 124, 121, 56, 87, 90, 92, 55, 59, 34, 111, 127, 76, 117, 63, 126, 30, 81, 123, 113, 62, 50, 120, 54, 52, 110, 49, 57, 61, 60, 114, 119, 48, 118, 51, 116, 115, 47, 100, 102, 19, 112, 8, 40, 107, 109, 91, 3, 22, 44, 89, 46, 83, 45, 21, 108, 41, 23, 2, 32, 80, 31, 103, 84, 105, 42, 43, 104, 106, 72, 36, 94, 39, 5, 37, 29, 35, 85, 28, 96, 33, 16, 24, 93, 25, 101, 70, 65, 95, 17, 14, 27, 26, 99, 10, 86, 0, 97, 13, 6, 78, 64, 12, 73, 9, 88, 98, 77, 1, 18, 15, 82, 4, 11, 75, 79, 66, 71, 20, 7, 68, 69, 67, 74], [53, 38, 125, 122, 58, 87, 121, 56, 30, 127, 34, 92, 90, 60, 50, 113, 63, 124, 55, 59, 123, 48, 120, 126, 32, 51, 41, 57, 114, 61, 62, 52, 118, 54, 119, 116, 47, 111, 115, 49, 21, 117, 76, 102, 110, 19, 108, 46, 85, 112, 40, 45, 107, 83, 106, 22, 109, 81, 44, 94, 37, 43, 80, 42, 100, 27, 28, 26, 84, 104, 105, 39, 103, 33, 91, 101, 23, 29, 36, 15, 31, 72, 97, 99, 16, 89, 77, 35, 86, 13, 78, 96, 98, 17, 93, 3, 0, 95, 65, 8, 5, 14, 18, 88, 2, 25, 69, 20, 73, 10, 75, 74, 24, 11, 9, 82, 12, 79, 71, 7, 6, 68, 70, 66, 4, 67, 64, 1], [122, 53, 38, 90, 121, 58, 56, 59, 127, 34, 87, 60, 55, 124, 92, 126, 123, 63, 117, 52, 30, 120, 48, 57, 111, 113, 61, 49, 110, 114, 125, 115, 62, 118, 119, 54, 50, 51, 47, 116, 102, 83, 22, 112, 46, 81, 19, 94, 32, 89, 109, 85, 23, 104, 108, 41, 14, 43, 84, 103, 107, 76, 31, 44, 105, 17, 106, 45, 36, 40, 42, 101, 25, 86, 28, 78, 100, 13, 82, 80, 37, 27, 39, 35, 98, 33, 26, 21, 29, 15, 97, 99, 88, 95, 8, 24, 91, 96, 16, 65, 93, 72, 18, 10, 12, 79, 69, 20, 3, 11, 6, 9, 77, 75, 7, 68, 74, 73, 71, 70, 67, 5, 0, 2, 66, 64, 4, 1], [125, 122, 53, 38, 58, 90, 121, 56, 51, 59, 127, 113, 120, 110, 81, 87, 55, 50, 63, 30, 61, 48, 126, 124, 123, 102, 49, 108, 57, 52, 117, 60, 114, 119, 34, 118, 62, 47, 111, 46, 109, 54, 115, 116, 19, 44, 106, 112, 92, 39, 42, 107, 43, 76, 103, 83, 22, 101, 32, 100, 94, 37, 105, 41, 21, 40, 45, 104, 36, 99, 14, 31, 35, 33, 8, 85, 91, 17, 26, 93, 24, 96, 29, 23, 97, 27, 84, 78, 95, 88, 28, 25, 98, 86, 80, 72, 10, 89, 74, 12, 15, 79, 13, 16, 18, 82, 11, 77, 3, 9, 73, 20, 70, 69, 75, 6, 5, 68, 7, 4, 71, 66, 0, 64, 65, 1, 67, 2], [102, 110, 97, 126, 46, 73, 21, 77, 82, 68, 11, 15, 26, 66, 90, 70, 53, 0, 7, 25, 87, 71, 85, 121, 31, 18, 1, 23, 29, 79, 100, 93, 4, 54, 118, 81, 88, 124, 10, 117, 55, 9, 57, 106, 78, 28, 64, 13, 80, 22, 20, 127, 2, 75, 32, 5, 108, 76, 83, 65, 69, 17, 12, 58, 6, 105, 92, 3, 89, 63, 16, 24, 116, 115, 86, 125, 84, 27, 72, 39, 40, 104, 123, 98, 91, 48, 59, 30, 62, 96, 49, 74, 52, 122, 37, 101, 67, 60, 14, 95, 112, 34, 19, 8, 51, 45, 99, 35, 94, 103, 109, 50, 61, 120, 56, 47, 36, 43, 107, 42, 44, 119, 111, 113, 41, 114, 33, 38], [102, 110, 126, 97, 21, 26, 82, 46, 15, 77, 73, 11, 87, 90, 93, 68, 7, 23, 12, 25, 45, 115, 40, 53, 78, 70, 8, 18, 44, 76, 85, 118, 84, 55, 66, 120, 54, 116, 109, 127, 0, 96, 101, 36, 95, 31, 37, 91, 13, 112, 79, 72, 125, 24, 83, 1, 98, 121, 124, 27, 41, 48, 19, 71, 32, 114, 17, 103, 52, 56, 75, 122, 58, 69, 14, 9, 123, 39, 89, 49, 42, 106, 59, 88, 63, 22, 81, 105, 6, 20, 51, 107, 62, 94, 80, 119, 117, 28, 34, 29, 3, 86, 60, 38, 47, 99, 61, 57, 108, 43, 92, 104, 30, 111, 10, 16, 100, 50, 35, 113, 74, 4, 33, 2, 5, 67, 65, 64], [102, 110, 126, 97, 118, 46, 26, 21, 15, 82, 87, 11, 31, 63, 95, 90, 93, 68, 49, 73, 77, 44, 41, 120, 53, 116, 88, 127, 106, 122, 123, 25, 54, 112, 22, 7, 101, 38, 84, 42, 56, 37, 12, 58, 52, 17, 51, 80, 29, 59, 109, 70, 23, 43, 92, 48, 119, 55, 57, 124, 66, 5, 113, 94, 45, 125, 111, 115, 0, 65, 104, 121, 117, 83, 10, 50, 108, 60, 105, 1, 62, 85, 100, 47, 96, 91, 69, 61, 36, 107, 27, 78, 98, 99, 28, 114, 40, 30, 39, 19, 8, 86, 76, 3, 32, 103, 79, 74, 16, 34, 71, 64, 67, 24, 72, 35, 89, 20, 18, 4, 81, 75, 33, 14, 2, 6, 13, 9], [102, 110, 97, 126, 21, 46, 26, 77, 82, 11, 15, 90, 73, 116, 87, 70, 92, 91, 68, 7, 25, 66, 37, 54, 32, 45, 85, 99, 62, 72, 49, 30, 83, 108, 41, 51, 19, 124, 18, 44, 111, 52, 71, 0, 113, 53, 122, 112, 127, 105, 104, 88, 24, 118, 48, 58, 100, 89, 40, 106, 50, 17, 123, 8, 75, 101, 16, 95, 36, 121, 96, 78, 63, 42, 93, 31, 94, 22, 38, 98, 3, 76, 115, 79, 28, 60, 55, 5, 107, 64, 33, 34, 114, 39, 35, 125, 109, 61, 119, 20, 57, 10, 47, 84, 6, 120, 29, 103, 59, 13, 43, 1, 86, 56, 81, 2, 117, 14, 9, 23, 27, 74, 80, 12, 67, 69, 4, 65], [111, 55, 101, 47, 25, 87, 113, 95, 21, 28, 48, 109, 58, 119, 82, 56, 59, 31, 62, 97, 91, 118, 94, 44, 54, 45, 122, 89, 33, 127, 126, 42, 116, 51, 115, 50, 120, 107, 61, 102, 80, 106, 23, 15, 76, 85, 123, 38, 53, 74, 121, 112, 32, 39, 92, 18, 57, 40, 110, 52, 105, 19, 104, 125, 117, 72, 124, 22, 41, 77, 63, 37, 49, 46, 30, 108, 16, 98, 29, 103, 99, 35, 60, 34, 93, 114, 26, 100, 14, 90, 84, 71, 96, 17, 36, 83, 69, 43, 86, 88, 27, 24, 75, 81, 20, 11, 13, 73, 79, 65, 9, 7, 78, 70, 3, 12, 67, 1, 5, 4, 8, 66, 10, 2, 68, 6, 0, 64], [111, 55, 101, 47, 25, 95, 21, 87, 28, 82, 33, 94, 91, 15, 74, 53, 76, 59, 62, 97, 22, 89, 72, 29, 114, 60, 56, 121, 49, 116, 107, 37, 92, 45, 93, 123, 38, 32, 11, 70, 43, 18, 110, 109, 67, 122, 98, 54, 113, 48, 103, 57, 23, 99, 115, 78, 102, 4, 69, 31, 80, 44, 51, 40, 83, 85, 105, 124, 36, 119, 50, 34, 86, 46, 66, 41, 13, 117, 77, 104, 120, 58, 26, 52, 108, 118, 42, 27, 71, 17, 61, 127, 63, 112, 96, 39, 125, 106, 84, 19, 90, 126, 79, 10, 81, 100, 30, 8, 7, 24, 12, 35, 73, 88, 20, 9, 75, 16, 64, 68, 5, 3, 1, 14, 65, 2, 6, 0], [111, 101, 55, 47, 25, 87, 21, 95, 15, 74, 82, 70, 4, 89, 76, 94, 64, 97, 67, 77, 28, 80, 113, 2, 114, 72, 84, 50, 73, 18, 0, 91, 58, 31, 126, 29, 79, 66, 11, 85, 6, 122, 23, 92, 40, 83, 16, 10, 71, 45, 3, 116, 107, 75, 41, 65, 5, 7, 86, 35, 27, 88, 13, 98, 93, 9, 14, 12, 121, 33, 8, 78, 19, 123, 17, 68, 96, 81, 69, 20, 24, 90, 26, 105, 34, 22, 117, 44, 30, 32, 51, 115, 46, 38, 99, 54, 37, 1, 62, 124, 60, 109, 127, 103, 100, 59, 61, 48, 39, 110, 108, 102, 53, 120, 125, 42, 36, 112, 52, 104, 56, 63, 106, 57, 119, 49, 118, 43], [111, 55, 101, 47, 25, 120, 87, 21, 121, 28, 95, 56, 116, 31, 82, 110, 53, 33, 123, 37, 126, 38, 46, 58, 50, 124, 104, 122, 115, 89, 60, 44, 62, 51, 97, 112, 59, 29, 118, 61, 57, 127, 76, 23, 63, 48, 94, 52, 45, 39, 107, 103, 113, 91, 49, 117, 114, 85, 80, 42, 109, 125, 105, 119, 32, 40, 108, 41, 98, 102, 99, 106, 18, 15, 90, 54, 30, 26, 84, 43, 34, 72, 77, 92, 35, 96, 86, 100, 93, 27, 36, 16, 74, 78, 20, 67, 17, 11, 83, 22, 79, 88, 24, 81, 13, 14, 69, 19, 71, 70, 9, 12, 65, 75, 7, 73, 8, 4, 64, 3, 66, 5, 10, 2, 0, 1, 68, 6], [115, 100, 51, 23, 91, 32, 96, 84, 86, 9, 77, 15, 81, 39, 71, 5, 11, 27, 20, 108, 49, 10, 94, 36, 24, 83, 22, 44, 68, 21, 67, 118, 17, 0, 64, 65, 57, 82, 126, 127, 121, 117, 114, 16, 54, 45, 110, 2, 87, 79, 28, 35, 95, 107, 33, 13, 12, 80, 73, 25, 101, 74, 3, 69, 19, 76, 40, 85, 116, 37, 18, 42, 75, 30, 112, 8, 90, 89, 104, 92, 72, 124, 70, 88, 122, 14, 41, 78, 48, 120, 52, 34, 99, 50, 62, 4, 93, 97, 102, 123, 47, 105, 29, 98, 63, 119, 125, 109, 60, 53, 113, 31, 66, 38, 6, 46, 103, 56, 26, 7, 59, 1, 43, 106, 55, 111, 61, 58], [115, 100, 51, 121, 36, 91, 32, 23, 86, 84, 39, 82, 77, 81, 127, 48, 96, 113, 15, 118, 33, 20, 29, 44, 63, 104, 55, 71, 116, 120, 45, 119, 123, 49, 37, 47, 108, 22, 102, 114, 41, 126, 107, 50, 42, 106, 46, 54, 35, 110, 124, 53, 125, 105, 27, 10, 43, 38, 52, 111, 112, 16, 59, 56, 60, 58, 57, 30, 61, 122, 40, 95, 11, 19, 18, 62, 99, 109, 97, 117, 66, 34, 101, 98, 64, 31, 103, 83, 17, 26, 92, 24, 94, 5, 68, 85, 87, 90, 12, 21, 93, 89, 28, 76, 73, 6, 88, 8, 25, 13, 79, 78, 2, 75, 9, 80, 14, 67, 74, 70, 72, 1, 69, 4, 0, 65, 7, 3], [115, 100, 51, 121, 36, 91, 23, 32, 84, 86, 103, 63, 82, 20, 114, 104, 44, 22, 77, 15, 48, 39, 117, 46, 107, 99, 55, 43, 127, 49, 116, 29, 34, 45, 81, 110, 31, 30, 40, 119, 61, 118, 37, 33, 57, 108, 38, 41, 112, 42, 120, 113, 54, 125, 101, 35, 111, 60, 87, 106, 96, 58, 109, 53, 59, 56, 52, 95, 123, 47, 98, 126, 92, 105, 62, 24, 90, 97, 102, 27, 50, 94, 124, 122, 28, 9, 93, 71, 26, 17, 16, 85, 74, 25, 88, 76, 18, 19, 14, 89, 79, 13, 21, 11, 83, 80, 78, 10, 6, 12, 66, 64, 7, 73, 1, 4, 0, 75, 5, 72, 8, 68, 70, 69, 3, 2, 67, 65], [115, 100, 51, 36, 23, 91, 121, 96, 84, 86, 81, 11, 32, 15, 118, 94, 48, 117, 39, 9, 50, 108, 116, 107, 102, 5, 20, 2, 119, 25, 105, 45, 77, 44, 0, 37, 113, 27, 63, 82, 58, 62, 68, 19, 57, 46, 47, 110, 120, 49, 114, 126, 122, 43, 112, 123, 99, 125, 127, 53, 59, 22, 54, 41, 61, 55, 24, 124, 109, 52, 104, 42, 95, 17, 60, 65, 40, 56, 33, 38, 106, 35, 111, 70, 101, 71, 103, 87, 88, 93, 64, 16, 26, 98, 30, 79, 34, 29, 73, 83, 10, 31, 97, 90, 72, 75, 69, 8, 28, 14, 89, 12, 67, 6, 92, 13, 21, 18, 66, 78, 85, 4, 1, 80, 76, 3, 74, 7], [115, 40, 122, 53, 93, 124, 49, 55, 58, 51, 52, 48, 120, 33, 59, 60, 56, 62, 127, 111, 125, 26, 44, 61, 54, 47, 50, 123, 63, 113, 43, 126, 114, 118, 121, 116, 107, 57, 119, 45, 46, 112, 117, 22, 19, 109, 42, 110, 90, 29, 108, 88, 84, 41, 101, 77, 99, 38, 104, 106, 105, 32, 94, 92, 95, 103, 39, 83, 36, 102, 37, 34, 81, 100, 89, 97, 35, 86, 16, 98, 96, 25, 82, 28, 87, 71, 27, 10, 31, 13, 21, 23, 91, 79, 30, 80, 17, 14, 7, 3, 24, 74, 15, 12, 78, 85, 2, 20, 64, 66, 0, 18, 9, 67, 5, 72, 11, 69, 65, 6, 1, 75, 73, 4, 68, 76, 8, 70], [53, 115, 40, 122, 124, 93, 49, 52, 55, 59, 51, 33, 60, 56, 62, 48, 111, 126, 61, 58, 113, 117, 120, 125, 43, 114, 50, 127, 54, 26, 121, 63, 47, 123, 109, 119, 57, 116, 118, 46, 45, 112, 90, 22, 108, 81, 107, 44, 110, 35, 29, 88, 37, 42, 41, 39, 105, 101, 106, 103, 92, 77, 99, 36, 104, 32, 34, 102, 95, 84, 100, 83, 38, 94, 19, 89, 97, 86, 98, 17, 25, 96, 31, 27, 16, 82, 13, 28, 20, 30, 15, 79, 24, 10, 91, 85, 14, 87, 21, 71, 80, 74, 78, 23, 72, 5, 7, 12, 66, 64, 11, 8, 0, 2, 1, 3, 75, 4, 67, 18, 69, 65, 68, 9, 73, 6, 70, 76], [40, 115, 53, 122, 88, 14, 33, 19, 82, 12, 22, 90, 80, 9, 92, 49, 28, 111, 91, 26, 30, 1, 55, 73, 93, 6, 31, 50, 124, 29, 25, 75, 20, 72, 27, 4, 95, 23, 17, 89, 69, 86, 78, 15, 96, 85, 32, 58, 10, 38, 117, 59, 76, 18, 51, 24, 34, 45, 98, 83, 70, 112, 21, 100, 104, 35, 127, 52, 16, 99, 105, 46, 56, 11, 123, 94, 79, 84, 42, 81, 106, 108, 61, 113, 121, 13, 8, 68, 109, 62, 114, 3, 120, 54, 110, 87, 116, 103, 101, 126, 37, 63, 125, 119, 39, 102, 71, 77, 48, 107, 36, 47, 43, 60, 118, 41, 57, 67, 7, 74, 44, 97, 5, 0, 66, 65, 2, 64], [122, 40, 53, 115, 33, 124, 93, 55, 52, 60, 49, 59, 26, 48, 62, 111, 61, 56, 120, 58, 125, 113, 101, 47, 127, 51, 54, 116, 126, 121, 119, 114, 50, 117, 123, 57, 63, 118, 109, 45, 84, 95, 39, 88, 112, 90, 43, 29, 46, 108, 22, 41, 107, 110, 44, 77, 34, 32, 19, 25, 42, 89, 94, 92, 106, 99, 100, 105, 83, 103, 37, 21, 81, 97, 102, 104, 35, 38, 24, 36, 98, 96, 86, 31, 28, 23, 17, 87, 27, 16, 91, 30, 2, 80, 3, 82, 69, 64, 13, 74, 1, 65, 14, 5, 0, 79, 66, 10, 7, 71, 67, 85, 15, 20, 78, 12, 11, 68, 9, 6, 72, 75, 4, 73, 8, 18, 70, 76]], "model.layers.22.self_attn.k_proj": [[104, 121, 50, 86, 57, 97, 92, 100, 54, 94, 51, 118, 114, 119, 25, 31, 124, 63, 125, 58, 17, 60, 117, 123, 102, 62, 126, 116, 52, 127, 48, 85, 55, 83, 47, 53, 49, 56, 112, 122, 44, 120, 59, 113, 111, 115, 27, 105, 61, 46, 107, 108, 35, 110, 106, 42, 19, 109, 43, 77, 39, 41, 45, 16, 80, 103, 12, 15, 37, 78, 93, 33, 99, 34, 38, 87, 26, 75, 79, 98, 101, 91, 32, 76, 84, 11, 23, 72, 96, 82, 90, 7, 95, 36, 5, 28, 10, 20, 24, 13, 30, 29, 40, 74, 6, 88, 73, 71, 18, 89, 21, 9, 4, 3, 22, 8, 14, 81, 70, 1, 68, 0, 67, 69, 2, 64, 66, 65], [106, 54, 89, 96, 123, 50, 23, 85, 16, 17, 99, 42, 78, 82, 76, 48, 124, 27, 114, 30, 29, 21, 57, 72, 19, 34, 112, 74, 12, 93, 14, 108, 10, 18, 70, 55, 77, 100, 60, 47, 68, 127, 125, 3, 35, 118, 7, 11, 53, 115, 46, 8, 79, 104, 110, 95, 59, 20, 49, 15, 111, 109, 119, 41, 98, 38, 28, 105, 44, 65, 13, 83, 101, 107, 5, 126, 9, 81, 113, 33, 84, 37, 117, 120, 43, 116, 52, 103, 92, 56, 66, 31, 61, 63, 90, 122, 102, 40, 45, 62, 39, 121, 86, 97, 24, 51, 94, 87, 36, 75, 64, 91, 58, 88, 26, 22, 25, 6, 69, 80, 32, 71, 73, 67, 4, 2, 1, 0], [120, 117, 102, 33, 86, 29, 25, 58, 17, 83, 49, 121, 0, 50, 88, 73, 118, 14, 126, 65, 119, 53, 3, 122, 15, 116, 57, 46, 47, 125, 103, 55, 123, 52, 115, 113, 111, 38, 4, 62, 75, 112, 44, 48, 51, 114, 127, 61, 105, 45, 7, 59, 56, 70, 107, 36, 106, 41, 124, 60, 77, 74, 108, 109, 84, 63, 80, 110, 42, 21, 104, 94, 16, 40, 20, 18, 72, 54, 43, 98, 39, 90, 13, 5, 89, 37, 87, 34, 76, 91, 100, 26, 99, 2, 68, 31, 35, 92, 28, 101, 95, 64, 71, 1, 10, 67, 23, 96, 69, 32, 8, 82, 27, 85, 30, 12, 93, 79, 11, 66, 78, 24, 81, 19, 6, 9, 97, 22], [122, 53, 102, 22, 98, 57, 121, 117, 62, 56, 55, 110, 50, 116, 28, 59, 123, 115, 63, 124, 120, 126, 52, 48, 60, 58, 118, 114, 113, 127, 119, 54, 51, 61, 112, 49, 111, 33, 47, 109, 44, 94, 104, 108, 96, 46, 107, 90, 45, 40, 105, 34, 43, 125, 41, 80, 37, 42, 106, 78, 103, 36, 92, 101, 39, 16, 82, 100, 85, 13, 95, 19, 86, 83, 97, 99, 35, 17, 73, 87, 24, 25, 93, 31, 11, 29, 27, 81, 76, 32, 38, 30, 68, 84, 23, 91, 21, 15, 26, 88, 10, 71, 5, 14, 18, 8, 79, 75, 7, 67, 89, 72, 66, 20, 77, 70, 9, 69, 74, 1, 12, 0, 2, 65, 6, 3, 64, 4], [46, 126, 38, 33, 110, 82, 21, 26, 15, 77, 73, 0, 11, 70, 66, 7, 68, 64, 87, 25, 53, 3, 93, 65, 121, 19, 17, 95, 54, 2, 84, 125, 58, 10, 80, 12, 118, 88, 59, 115, 63, 48, 22, 1, 124, 45, 94, 55, 14, 105, 71, 16, 49, 112, 74, 117, 40, 8, 60, 104, 44, 92, 101, 116, 123, 51, 52, 67, 62, 6, 113, 42, 120, 96, 127, 69, 41, 43, 56, 39, 5, 119, 109, 34, 83, 114, 23, 103, 111, 122, 107, 108, 57, 35, 37, 32, 50, 36, 72, 99, 28, 61, 47, 102, 29, 106, 24, 100, 81, 89, 20, 86, 98, 30, 27, 78, 31, 91, 4, 9, 76, 79, 97, 75, 13, 85, 90, 18], [47, 111, 37, 55, 87, 21, 72, 82, 15, 25, 76, 31, 74, 66, 70, 107, 77, 69, 64, 4, 94, 80, 65, 119, 97, 102, 28, 116, 78, 115, 56, 91, 40, 44, 58, 123, 118, 53, 38, 43, 110, 121, 39, 120, 127, 62, 7, 92, 83, 122, 42, 60, 113, 52, 1, 105, 114, 45, 124, 126, 117, 104, 108, 112, 59, 106, 125, 22, 34, 63, 51, 48, 73, 17, 61, 93, 46, 54, 57, 50, 90, 98, 49, 103, 32, 41, 109, 36, 71, 20, 33, 86, 35, 29, 99, 19, 81, 100, 11, 30, 26, 13, 84, 96, 85, 24, 2, 14, 27, 68, 101, 88, 16, 23, 75, 10, 3, 9, 5, 12, 67, 8, 6, 89, 79, 0, 95, 18], [51, 115, 36, 86, 91, 96, 84, 23, 81, 77, 15, 71, 64, 121, 94, 82, 11, 117, 1, 49, 44, 113, 119, 68, 57, 9, 48, 66, 112, 39, 47, 46, 106, 45, 108, 67, 120, 122, 127, 50, 53, 125, 5, 101, 63, 24, 104, 58, 59, 41, 114, 109, 61, 107, 126, 118, 62, 56, 55, 3, 60, 103, 123, 70, 105, 54, 10, 116, 43, 124, 88, 52, 26, 0, 110, 2, 19, 97, 38, 76, 42, 99, 111, 100, 31, 25, 37, 74, 80, 40, 85, 89, 78, 33, 93, 16, 65, 30, 98, 6, 69, 34, 12, 102, 72, 92, 14, 21, 29, 95, 35, 8, 90, 28, 83, 4, 18, 73, 32, 7, 27, 87, 13, 79, 75, 22, 20, 17], [104, 115, 53, 22, 122, 97, 51, 29, 124, 111, 59, 56, 52, 26, 28, 55, 49, 62, 126, 50, 54, 118, 117, 113, 61, 60, 127, 116, 121, 119, 47, 58, 82, 63, 12, 19, 83, 120, 114, 48, 125, 110, 123, 35, 91, 112, 16, 109, 81, 88, 44, 57, 45, 6, 107, 105, 77, 43, 15, 93, 46, 32, 89, 94, 108, 17, 42, 41, 84, 31, 36, 14, 80, 37, 33, 78, 101, 103, 96, 24, 102, 79, 30, 106, 39, 27, 95, 23, 38, 9, 99, 98, 34, 90, 13, 87, 100, 69, 75, 40, 74, 25, 85, 21, 11, 18, 92, 72, 66, 20, 7, 68, 71, 73, 70, 0, 64, 76, 10, 65, 86, 4, 67, 1, 8, 2, 3, 5]], "model.layers.22.self_attn.qk_proj": [[115, 53, 122, 51, 117, 120, 47, 111, 126, 121, 54, 50, 46, 110, 57, 106, 55, 42, 102, 38, 104, 89, 58, 22, 123, 118, 125, 25, 87, 97, 85, 49, 40, 27, 56, 36, 21, 90, 86, 23, 93, 18, 62, 63, 127, 79, 60, 101, 59, 96, 15, 33, 82, 77, 114, 124, 99, 13, 116, 81, 48, 17, 26, 112, 52, 61, 19, 100, 29, 44, 83, 119, 11, 37, 113, 28, 84, 32, 95, 92, 45, 9, 30, 94, 73, 91, 31, 75, 7, 108, 20, 16, 109, 41, 14, 80, 78, 76, 70, 12, 39, 35, 4, 0, 71, 34, 74, 10, 105, 107, 8, 98, 64, 43, 68, 88, 103, 67, 3, 2, 66, 6, 24, 65, 1, 72, 5, 69], [115, 53, 122, 117, 51, 120, 47, 111, 126, 121, 54, 50, 46, 110, 57, 55, 106, 42, 102, 38, 123, 58, 89, 22, 104, 125, 87, 97, 118, 25, 36, 62, 85, 56, 49, 23, 93, 27, 40, 86, 21, 90, 114, 101, 79, 59, 116, 82, 99, 113, 52, 18, 13, 63, 100, 96, 127, 77, 15, 33, 29, 60, 19, 81, 124, 17, 112, 73, 61, 26, 119, 44, 48, 92, 28, 91, 11, 109, 32, 94, 107, 45, 20, 105, 70, 37, 83, 41, 75, 9, 84, 76, 108, 0, 16, 14, 12, 78, 95, 4, 7, 30, 39, 31, 35, 98, 80, 74, 43, 103, 10, 71, 64, 34, 2, 68, 24, 88, 8, 1, 67, 3, 66, 72, 65, 6, 5, 69], [115, 53, 122, 117, 51, 120, 47, 111, 121, 126, 54, 50, 46, 57, 110, 55, 106, 42, 102, 38, 58, 104, 22, 25, 118, 123, 89, 97, 93, 27, 56, 23, 87, 59, 36, 21, 40, 62, 85, 86, 125, 49, 63, 116, 15, 90, 114, 33, 60, 18, 82, 101, 77, 79, 100, 99, 96, 13, 52, 48, 32, 127, 124, 113, 81, 17, 119, 45, 92, 112, 29, 44, 107, 26, 73, 19, 28, 37, 11, 20, 0, 108, 94, 41, 91, 30, 61, 83, 84, 70, 109, 71, 9, 75, 95, 34, 7, 98, 35, 80, 105, 76, 12, 78, 31, 4, 39, 16, 14, 43, 64, 10, 103, 74, 68, 88, 66, 72, 2, 67, 3, 65, 1, 24, 69, 6, 8, 5], [115, 53, 122, 117, 51, 120, 47, 111, 126, 121, 50, 54, 46, 57, 110, 106, 55, 42, 38, 102, 58, 123, 104, 25, 22, 89, 125, 56, 62, 21, 87, 63, 59, 27, 97, 93, 86, 118, 85, 23, 15, 114, 49, 82, 18, 77, 40, 90, 36, 100, 124, 101, 127, 13, 52, 33, 81, 79, 45, 116, 48, 99, 96, 119, 113, 112, 32, 60, 17, 107, 11, 37, 29, 84, 61, 83, 70, 26, 73, 12, 44, 19, 9, 92, 95, 28, 64, 41, 105, 109, 91, 7, 71, 98, 14, 94, 75, 78, 43, 16, 80, 30, 76, 31, 4, 108, 20, 0, 74, 68, 34, 66, 35, 72, 10, 103, 67, 39, 3, 24, 65, 88, 2, 6, 1, 5, 69, 8], [115, 122, 53, 117, 51, 120, 47, 111, 126, 121, 54, 50, 46, 110, 57, 106, 55, 42, 102, 38, 58, 123, 22, 89, 97, 104, 118, 49, 63, 25, 21, 86, 87, 85, 62, 27, 125, 23, 93, 40, 59, 82, 36, 114, 56, 15, 79, 60, 18, 124, 48, 77, 33, 81, 45, 90, 116, 100, 99, 13, 119, 52, 127, 96, 44, 101, 19, 26, 112, 61, 17, 92, 11, 29, 73, 91, 9, 113, 75, 37, 71, 20, 32, 31, 95, 83, 28, 64, 16, 94, 14, 78, 7, 80, 41, 12, 84, 76, 0, 109, 98, 108, 39, 10, 43, 70, 34, 105, 107, 72, 74, 4, 68, 35, 30, 6, 2, 67, 24, 66, 103, 1, 3, 88, 5, 69, 65, 8], [115, 53, 122, 117, 51, 120, 47, 111, 121, 126, 50, 54, 46, 57, 110, 106, 55, 42, 102, 38, 123, 22, 58, 86, 27, 89, 85, 125, 97, 21, 25, 87, 104, 62, 93, 40, 23, 56, 49, 63, 82, 15, 77, 33, 59, 36, 60, 118, 18, 81, 13, 119, 90, 79, 114, 124, 99, 96, 100, 127, 116, 11, 48, 17, 37, 92, 84, 113, 32, 29, 19, 83, 26, 52, 41, 101, 16, 112, 31, 73, 28, 75, 94, 91, 9, 44, 20, 61, 95, 14, 12, 98, 45, 80, 76, 7, 109, 108, 78, 74, 0, 64, 10, 105, 71, 68, 107, 30, 72, 39, 6, 35, 43, 4, 34, 24, 2, 103, 65, 70, 3, 67, 88, 1, 66, 5, 69, 8], [115, 53, 122, 117, 51, 120, 47, 111, 126, 54, 121, 50, 46, 57, 110, 106, 55, 42, 102, 38, 22, 123, 87, 89, 86, 25, 85, 104, 23, 27, 21, 58, 40, 97, 93, 60, 63, 36, 15, 62, 90, 49, 82, 96, 79, 125, 33, 18, 118, 77, 114, 13, 99, 116, 32, 17, 59, 48, 81, 113, 119, 52, 44, 100, 19, 101, 56, 84, 127, 29, 112, 94, 11, 124, 26, 92, 91, 45, 83, 28, 107, 37, 76, 73, 109, 20, 12, 61, 95, 16, 9, 78, 80, 75, 14, 41, 31, 43, 39, 105, 34, 72, 108, 7, 35, 71, 30, 10, 6, 64, 74, 98, 0, 24, 88, 103, 68, 4, 1, 66, 3, 2, 67, 70, 69, 65, 5, 8], [115, 53, 122, 117, 51, 120, 47, 126, 111, 54, 121, 50, 46, 57, 110, 55, 106, 42, 38, 102, 22, 89, 87, 97, 25, 63, 85, 21, 58, 36, 60, 23, 27, 104, 118, 93, 90, 86, 40, 123, 18, 49, 62, 59, 96, 33, 15, 125, 82, 112, 116, 119, 48, 79, 77, 100, 113, 101, 17, 99, 52, 114, 61, 56, 32, 13, 124, 81, 44, 26, 91, 29, 20, 28, 127, 19, 37, 84, 108, 80, 35, 12, 73, 83, 11, 109, 92, 94, 107, 39, 34, 95, 43, 45, 14, 6, 9, 16, 88, 76, 78, 75, 41, 105, 30, 7, 31, 64, 72, 4, 74, 71, 10, 98, 0, 24, 103, 68, 66, 65, 2, 3, 67, 69, 1, 5, 70, 8], [115, 53, 122, 117, 51, 120, 47, 126, 111, 121, 54, 50, 46, 110, 57, 55, 106, 42, 102, 38, 89, 58, 22, 125, 25, 62, 36, 27, 93, 87, 21, 23, 97, 123, 118, 90, 104, 85, 63, 49, 59, 114, 82, 86, 112, 127, 40, 15, 77, 96, 116, 56, 33, 79, 101, 124, 60, 61, 52, 18, 48, 32, 100, 99, 119, 19, 107, 13, 28, 113, 81, 9, 17, 109, 45, 29, 26, 37, 44, 11, 20, 41, 83, 91, 92, 84, 35, 94, 108, 12, 78, 105, 39, 80, 14, 34, 31, 76, 73, 95, 30, 6, 98, 75, 16, 7, 103, 43, 74, 72, 68, 64, 0, 88, 71, 4, 10, 24, 66, 65, 2, 1, 3, 67, 5, 70, 69, 8], [115, 53, 122, 51, 117, 120, 47, 126, 111, 54, 121, 50, 46, 110, 57, 55, 106, 42, 102, 38, 104, 58, 123, 89, 22, 125, 62, 25, 87, 118, 27, 63, 97, 23, 119, 85, 36, 93, 86, 49, 21, 90, 114, 52, 56, 59, 40, 61, 96, 99, 124, 101, 79, 116, 127, 18, 60, 77, 82, 13, 48, 15, 100, 33, 45, 107, 11, 32, 91, 109, 41, 9, 94, 17, 28, 81, 112, 19, 37, 113, 92, 44, 26, 29, 20, 83, 108, 84, 73, 35, 75, 105, 31, 14, 78, 7, 12, 39, 43, 16, 6, 34, 30, 64, 95, 0, 76, 71, 98, 80, 103, 74, 68, 4, 10, 70, 2, 72, 3, 66, 1, 8, 67, 24, 88, 65, 69, 5], [115, 53, 122, 51, 117, 120, 47, 111, 121, 126, 54, 50, 46, 110, 106, 57, 55, 42, 38, 102, 58, 104, 125, 89, 22, 25, 63, 97, 27, 123, 86, 87, 62, 21, 49, 118, 85, 23, 119, 100, 93, 82, 59, 114, 101, 77, 18, 52, 90, 15, 79, 116, 40, 33, 99, 56, 36, 127, 61, 13, 96, 48, 113, 17, 124, 83, 112, 81, 29, 9, 60, 41, 94, 28, 37, 32, 26, 109, 92, 16, 64, 44, 91, 20, 11, 12, 84, 7, 45, 75, 107, 70, 19, 74, 73, 95, 76, 31, 71, 78, 80, 14, 39, 98, 68, 10, 4, 35, 0, 108, 103, 105, 43, 8, 34, 6, 66, 30, 2, 3, 24, 72, 67, 1, 88, 65, 69, 5], [115, 53, 122, 51, 117, 120, 47, 111, 126, 121, 54, 50, 46, 110, 57, 106, 55, 42, 102, 38, 22, 89, 123, 86, 25, 21, 104, 58, 40, 87, 27, 97, 36, 85, 118, 23, 90, 82, 93, 79, 62, 125, 49, 124, 63, 77, 18, 116, 96, 59, 15, 33, 119, 61, 114, 100, 99, 127, 17, 13, 112, 32, 113, 81, 101, 19, 26, 83, 60, 94, 28, 44, 84, 92, 20, 48, 11, 37, 29, 9, 73, 56, 52, 12, 75, 91, 41, 95, 14, 70, 31, 76, 80, 78, 10, 16, 7, 45, 107, 64, 98, 0, 74, 35, 34, 105, 71, 4, 109, 108, 68, 39, 43, 30, 88, 24, 8, 103, 66, 3, 1, 6, 2, 67, 72, 69, 65, 5], [115, 53, 122, 117, 51, 120, 47, 126, 111, 54, 121, 50, 46, 110, 57, 55, 106, 102, 42, 38, 89, 22, 123, 87, 25, 118, 36, 23, 93, 97, 21, 86, 27, 58, 40, 85, 104, 90, 82, 125, 96, 116, 63, 49, 33, 59, 100, 99, 56, 114, 112, 124, 18, 61, 119, 60, 15, 113, 29, 101, 13, 62, 32, 79, 77, 26, 81, 28, 94, 19, 44, 17, 91, 92, 20, 83, 127, 48, 37, 107, 41, 109, 84, 9, 75, 105, 95, 39, 11, 31, 52, 14, 16, 73, 35, 108, 45, 12, 70, 30, 78, 80, 34, 64, 76, 10, 43, 71, 98, 7, 8, 68, 24, 88, 0, 103, 4, 74, 65, 66, 2, 3, 1, 67, 69, 5, 72, 6], [115, 122, 53, 117, 51, 120, 47, 126, 111, 54, 121, 50, 46, 110, 57, 106, 55, 42, 38, 102, 58, 123, 22, 104, 25, 86, 125, 89, 21, 118, 59, 23, 27, 85, 119, 40, 87, 90, 93, 36, 82, 124, 116, 77, 60, 63, 49, 112, 56, 114, 62, 97, 100, 79, 127, 33, 96, 15, 113, 52, 61, 18, 81, 9, 37, 101, 99, 32, 29, 17, 11, 13, 44, 48, 70, 75, 31, 109, 73, 94, 0, 41, 20, 19, 84, 45, 91, 83, 95, 92, 26, 28, 30, 107, 98, 14, 12, 71, 7, 35, 108, 80, 16, 64, 105, 78, 8, 76, 39, 68, 43, 10, 4, 74, 103, 34, 2, 1, 24, 66, 67, 65, 88, 3, 5, 72, 6, 69], [115, 122, 53, 117, 51, 120, 47, 111, 121, 126, 54, 50, 46, 110, 106, 57, 55, 42, 38, 102, 58, 22, 104, 97, 86, 25, 89, 87, 27, 123, 21, 125, 40, 36, 62, 23, 118, 82, 79, 93, 119, 63, 85, 77, 90, 49, 112, 60, 114, 99, 15, 56, 18, 81, 127, 33, 124, 96, 13, 100, 61, 59, 101, 113, 17, 75, 83, 32, 48, 116, 109, 9, 91, 45, 29, 52, 37, 26, 92, 19, 28, 71, 76, 84, 14, 11, 31, 44, 95, 73, 12, 107, 20, 39, 80, 7, 70, 41, 78, 94, 35, 64, 30, 0, 43, 105, 8, 74, 108, 34, 10, 16, 98, 4, 103, 68, 2, 24, 6, 3, 67, 88, 66, 1, 65, 72, 5, 69], [115, 122, 53, 117, 51, 120, 47, 121, 111, 126, 54, 50, 46, 110, 106, 55, 57, 42, 38, 102, 123, 22, 104, 86, 21, 25, 40, 125, 58, 89, 85, 87, 97, 27, 23, 63, 93, 36, 82, 90, 77, 79, 49, 18, 56, 100, 60, 118, 59, 15, 116, 33, 81, 62, 61, 127, 114, 113, 52, 119, 48, 96, 13, 101, 112, 124, 99, 29, 41, 17, 26, 12, 32, 45, 75, 73, 109, 83, 84, 76, 94, 19, 91, 92, 44, 9, 64, 28, 31, 7, 16, 37, 20, 78, 11, 34, 14, 10, 8, 95, 35, 39, 68, 70, 74, 108, 103, 105, 107, 30, 43, 71, 98, 80, 0, 24, 4, 6, 66, 88, 65, 67, 1, 2, 3, 5, 69, 72], [115, 53, 122, 51, 117, 120, 47, 126, 111, 121, 54, 50, 46, 110, 55, 57, 106, 42, 102, 38, 104, 89, 22, 58, 86, 123, 87, 40, 25, 85, 23, 90, 36, 114, 93, 125, 97, 21, 27, 63, 96, 118, 82, 79, 100, 33, 59, 49, 60, 56, 18, 77, 116, 15, 81, 113, 29, 124, 61, 94, 99, 17, 84, 19, 26, 44, 101, 127, 48, 28, 119, 62, 112, 32, 83, 13, 41, 31, 52, 75, 20, 91, 73, 37, 92, 30, 11, 45, 16, 35, 39, 78, 76, 109, 98, 12, 80, 108, 107, 9, 95, 6, 34, 43, 103, 71, 0, 10, 105, 7, 14, 24, 8, 88, 68, 64, 4, 74, 1, 2, 66, 70, 65, 3, 67, 5, 72, 69], [115, 53, 122, 51, 117, 120, 47, 126, 111, 54, 121, 50, 46, 110, 57, 55, 106, 42, 38, 102, 25, 58, 22, 86, 104, 87, 89, 40, 123, 93, 27, 114, 36, 90, 21, 97, 125, 118, 85, 79, 63, 23, 59, 96, 60, 18, 99, 82, 77, 62, 112, 15, 49, 33, 116, 100, 26, 124, 81, 48, 56, 101, 29, 13, 61, 17, 127, 52, 32, 75, 19, 37, 113, 44, 107, 109, 45, 73, 92, 20, 91, 119, 9, 28, 84, 94, 6, 83, 78, 16, 12, 95, 41, 30, 31, 108, 11, 39, 0, 105, 7, 35, 34, 80, 71, 76, 14, 98, 10, 64, 68, 4, 103, 43, 74, 88, 66, 67, 8, 2, 65, 24, 3, 1, 72, 69, 70, 5], [115, 53, 122, 117, 51, 120, 47, 126, 111, 121, 54, 50, 46, 110, 57, 55, 106, 42, 102, 38, 63, 104, 89, 125, 22, 25, 114, 97, 123, 58, 21, 40, 86, 36, 87, 62, 48, 27, 82, 23, 85, 90, 93, 127, 112, 99, 49, 118, 101, 79, 124, 60, 96, 77, 59, 52, 116, 100, 13, 61, 15, 18, 33, 56, 26, 29, 17, 41, 44, 32, 75, 28, 37, 84, 45, 109, 119, 91, 73, 81, 113, 94, 83, 107, 19, 6, 105, 9, 95, 12, 11, 39, 92, 78, 7, 108, 34, 31, 30, 35, 98, 43, 76, 20, 0, 64, 14, 103, 10, 71, 16, 4, 80, 74, 68, 67, 24, 88, 2, 72, 65, 8, 66, 1, 3, 70, 5, 69], [115, 122, 53, 117, 51, 120, 47, 126, 111, 121, 50, 54, 46, 110, 57, 106, 55, 42, 102, 38, 104, 89, 22, 123, 97, 58, 125, 93, 85, 87, 40, 21, 25, 86, 63, 118, 36, 23, 27, 62, 60, 56, 90, 114, 116, 124, 49, 96, 77, 101, 100, 18, 15, 33, 13, 79, 82, 99, 127, 113, 52, 59, 92, 81, 48, 26, 41, 112, 119, 29, 94, 44, 75, 28, 83, 17, 9, 45, 61, 91, 73, 32, 11, 84, 37, 107, 20, 19, 109, 30, 31, 78, 12, 6, 35, 95, 105, 76, 64, 39, 14, 0, 10, 80, 71, 16, 7, 34, 43, 98, 103, 68, 108, 66, 74, 72, 4, 67, 88, 1, 24, 2, 3, 65, 8, 70, 69, 5], [115, 53, 122, 117, 51, 120, 47, 111, 126, 121, 50, 54, 46, 110, 57, 55, 106, 42, 102, 38, 58, 89, 22, 104, 118, 25, 40, 123, 97, 63, 86, 87, 85, 27, 93, 23, 116, 21, 60, 36, 49, 90, 125, 56, 79, 82, 59, 77, 124, 96, 33, 62, 18, 99, 112, 113, 48, 114, 100, 15, 119, 52, 13, 127, 101, 81, 92, 44, 31, 26, 32, 29, 17, 19, 107, 61, 94, 75, 41, 73, 109, 28, 37, 20, 83, 11, 91, 78, 7, 39, 30, 95, 45, 84, 12, 10, 108, 9, 71, 64, 76, 98, 14, 34, 68, 6, 16, 80, 103, 0, 74, 35, 72, 105, 4, 88, 43, 70, 67, 24, 1, 2, 66, 3, 5, 65, 69, 8], [115, 53, 122, 117, 51, 120, 47, 111, 126, 121, 54, 50, 46, 110, 57, 55, 106, 42, 38, 102, 118, 89, 58, 123, 125, 104, 22, 25, 87, 63, 85, 62, 97, 56, 124, 93, 23, 86, 90, 114, 27, 52, 48, 21, 40, 59, 36, 49, 96, 79, 82, 119, 116, 60, 18, 127, 15, 112, 113, 13, 29, 33, 101, 77, 99, 100, 26, 44, 91, 61, 32, 109, 81, 17, 92, 19, 45, 107, 31, 20, 41, 75, 37, 28, 64, 83, 73, 39, 11, 0, 7, 71, 94, 9, 105, 10, 95, 78, 98, 68, 80, 103, 72, 12, 84, 16, 35, 70, 108, 76, 30, 14, 43, 34, 4, 1, 88, 6, 74, 67, 66, 65, 24, 3, 2, 5, 69, 8], [115, 122, 53, 117, 51, 120, 47, 111, 121, 126, 54, 50, 46, 110, 57, 55, 106, 42, 102, 38, 118, 89, 123, 22, 58, 104, 97, 27, 25, 52, 85, 59, 87, 93, 125, 60, 23, 36, 86, 49, 63, 116, 40, 124, 21, 96, 90, 61, 119, 48, 82, 79, 18, 33, 56, 112, 113, 13, 101, 114, 62, 44, 100, 26, 99, 77, 29, 15, 92, 109, 17, 127, 28, 107, 81, 94, 32, 20, 83, 91, 19, 37, 45, 31, 30, 41, 84, 75, 73, 7, 70, 11, 39, 9, 64, 12, 72, 95, 0, 14, 76, 105, 35, 78, 43, 71, 34, 80, 98, 10, 16, 108, 68, 74, 88, 4, 2, 67, 103, 1, 24, 66, 6, 65, 3, 5, 69, 8], [115, 122, 53, 117, 51, 120, 47, 111, 126, 121, 54, 50, 46, 110, 57, 55, 106, 42, 102, 38, 118, 22, 89, 86, 104, 25, 27, 97, 58, 49, 87, 85, 123, 125, 93, 40, 36, 90, 21, 124, 56, 23, 18, 63, 62, 96, 15, 79, 119, 59, 82, 116, 114, 77, 127, 52, 60, 13, 81, 61, 33, 99, 113, 48, 100, 112, 92, 17, 26, 101, 37, 29, 83, 9, 28, 75, 11, 19, 44, 70, 31, 84, 107, 41, 73, 76, 94, 32, 20, 91, 95, 109, 10, 108, 105, 7, 45, 12, 30, 98, 14, 72, 78, 16, 80, 71, 0, 103, 68, 64, 43, 74, 4, 35, 39, 34, 88, 65, 24, 67, 3, 66, 2, 1, 69, 5, 8, 6], [115, 53, 122, 117, 51, 120, 47, 111, 126, 121, 50, 54, 46, 110, 57, 55, 106, 42, 102, 38, 22, 89, 104, 25, 123, 87, 86, 118, 85, 58, 23, 93, 21, 40, 97, 27, 36, 125, 90, 52, 49, 62, 79, 59, 18, 56, 63, 48, 77, 119, 33, 124, 82, 96, 127, 15, 114, 116, 61, 99, 13, 112, 100, 17, 81, 101, 32, 60, 19, 92, 29, 84, 26, 44, 11, 83, 9, 113, 20, 28, 75, 91, 37, 73, 109, 94, 41, 70, 7, 16, 31, 76, 12, 45, 95, 10, 35, 107, 105, 64, 71, 78, 39, 14, 108, 30, 34, 80, 98, 0, 43, 74, 68, 72, 24, 88, 4, 103, 66, 1, 67, 65, 3, 2, 69, 5, 6, 8], [115, 53, 122, 117, 51, 120, 47, 121, 126, 111, 50, 54, 46, 110, 57, 55, 106, 42, 102, 38, 58, 123, 22, 104, 25, 89, 118, 36, 87, 125, 23, 86, 85, 90, 124, 40, 27, 93, 97, 21, 96, 63, 52, 59, 49, 82, 116, 33, 60, 18, 79, 119, 114, 99, 127, 56, 62, 13, 100, 77, 26, 48, 101, 29, 44, 15, 41, 61, 112, 109, 19, 113, 17, 94, 81, 92, 32, 28, 37, 91, 84, 73, 83, 20, 9, 39, 30, 11, 107, 76, 35, 7, 75, 70, 105, 45, 0, 78, 98, 80, 10, 43, 31, 12, 108, 16, 103, 95, 34, 14, 71, 24, 68, 74, 64, 72, 88, 4, 66, 1, 3, 6, 2, 8, 67, 65, 5, 69], [115, 53, 122, 51, 117, 120, 47, 121, 126, 111, 50, 54, 46, 110, 57, 55, 106, 42, 102, 38, 58, 118, 125, 89, 104, 123, 22, 87, 25, 27, 63, 21, 59, 36, 62, 86, 116, 49, 124, 40, 85, 96, 52, 93, 56, 23, 90, 97, 60, 114, 119, 99, 127, 82, 100, 112, 33, 48, 18, 61, 101, 13, 79, 15, 107, 77, 81, 113, 32, 44, 26, 37, 19, 92, 109, 29, 17, 45, 83, 94, 28, 11, 20, 9, 30, 12, 73, 91, 34, 105, 98, 108, 41, 76, 14, 95, 31, 84, 71, 43, 35, 16, 103, 0, 80, 39, 78, 75, 70, 10, 74, 7, 68, 6, 4, 24, 64, 88, 8, 66, 2, 1, 72, 67, 3, 69, 5, 65], [115, 53, 122, 51, 117, 120, 47, 121, 111, 126, 54, 50, 46, 110, 57, 106, 55, 42, 102, 38, 25, 58, 123, 118, 104, 89, 62, 125, 22, 87, 27, 97, 85, 21, 90, 93, 86, 36, 23, 56, 59, 18, 124, 13, 79, 40, 63, 48, 82, 49, 60, 114, 96, 52, 116, 33, 15, 113, 77, 99, 101, 100, 29, 112, 119, 44, 127, 17, 11, 81, 107, 28, 32, 26, 92, 19, 73, 61, 45, 84, 75, 9, 83, 41, 94, 37, 6, 71, 78, 16, 76, 64, 91, 105, 109, 95, 20, 30, 31, 14, 98, 43, 0, 12, 108, 7, 10, 39, 80, 74, 35, 68, 4, 34, 66, 103, 8, 65, 3, 88, 67, 70, 2, 24, 1, 72, 5, 69], [115, 53, 122, 51, 117, 120, 47, 111, 121, 126, 54, 50, 46, 110, 57, 106, 55, 42, 102, 38, 58, 123, 104, 89, 118, 97, 22, 25, 86, 27, 21, 125, 87, 40, 62, 56, 60, 93, 36, 85, 63, 59, 23, 116, 79, 49, 124, 33, 52, 82, 13, 18, 99, 90, 127, 44, 48, 96, 77, 100, 119, 114, 113, 61, 15, 101, 112, 81, 32, 26, 11, 73, 92, 28, 6, 31, 29, 17, 0, 20, 45, 83, 41, 91, 84, 94, 37, 19, 71, 9, 39, 109, 105, 34, 108, 95, 98, 76, 75, 107, 78, 12, 14, 64, 30, 8, 80, 4, 7, 43, 68, 74, 35, 16, 10, 103, 67, 88, 1, 66, 3, 2, 65, 24, 69, 5, 72, 70], [115, 53, 122, 117, 51, 120, 47, 121, 111, 126, 54, 50, 46, 110, 57, 106, 55, 42, 38, 102, 123, 22, 58, 118, 89, 87, 125, 104, 21, 25, 86, 97, 27, 85, 23, 124, 63, 93, 60, 36, 127, 90, 62, 116, 59, 82, 79, 40, 18, 100, 13, 56, 49, 99, 15, 119, 96, 81, 77, 52, 33, 48, 114, 61, 113, 19, 101, 11, 29, 17, 32, 26, 28, 83, 9, 92, 84, 73, 91, 94, 109, 6, 41, 44, 31, 20, 37, 112, 12, 80, 76, 108, 78, 98, 75, 43, 74, 95, 107, 8, 71, 14, 105, 68, 7, 45, 39, 0, 30, 35, 34, 16, 103, 10, 4, 64, 88, 24, 66, 67, 1, 69, 3, 2, 65, 70, 5, 72], [115, 53, 122, 51, 117, 120, 47, 111, 121, 126, 54, 50, 46, 110, 57, 106, 55, 42, 38, 102, 58, 89, 104, 123, 25, 22, 118, 97, 87, 56, 27, 93, 86, 23, 40, 21, 36, 85, 90, 59, 124, 100, 49, 63, 125, 33, 127, 52, 60, 82, 79, 116, 18, 13, 96, 77, 99, 15, 101, 29, 62, 114, 113, 61, 119, 11, 81, 17, 48, 19, 44, 84, 32, 94, 9, 31, 28, 7, 73, 6, 30, 75, 92, 83, 112, 45, 91, 71, 26, 78, 74, 109, 37, 64, 0, 20, 41, 12, 95, 8, 16, 76, 4, 35, 98, 108, 80, 34, 14, 68, 107, 10, 105, 43, 39, 66, 103, 2, 67, 24, 65, 3, 70, 88, 1, 5, 69, 72], [115, 53, 122, 51, 117, 120, 47, 111, 126, 54, 121, 50, 46, 110, 55, 106, 57, 42, 102, 38, 22, 123, 58, 104, 89, 25, 87, 86, 27, 97, 85, 118, 56, 21, 125, 63, 40, 93, 36, 23, 90, 18, 82, 49, 79, 124, 96, 116, 60, 77, 15, 62, 119, 99, 101, 13, 114, 33, 59, 100, 61, 11, 48, 29, 81, 17, 9, 94, 26, 112, 127, 44, 19, 52, 113, 83, 32, 84, 28, 37, 92, 75, 73, 91, 76, 41, 30, 107, 8, 80, 35, 16, 78, 12, 109, 39, 31, 20, 45, 7, 14, 95, 108, 71, 10, 105, 6, 103, 74, 64, 98, 43, 4, 34, 68, 24, 0, 70, 88, 66, 3, 2, 65, 67, 1, 5, 69, 72]], "model.layers.23.self_attn.q_proj": [[111, 100, 107, 15, 88, 90, 21, 93, 81, 47, 20, 75, 11, 31, 52, 82, 73, 39, 19, 83, 85, 22, 41, 104, 12, 110, 54, 28, 13, 32, 95, 120, 14, 77, 27, 72, 76, 45, 119, 37, 87, 36, 40, 106, 71, 94, 24, 50, 6, 118, 26, 92, 101, 51, 33, 44, 38, 25, 16, 122, 114, 59, 10, 80, 112, 34, 58, 124, 5, 56, 108, 46, 123, 116, 109, 84, 79, 62, 68, 23, 70, 125, 96, 43, 103, 63, 127, 17, 57, 61, 105, 113, 78, 97, 55, 53, 126, 86, 89, 117, 74, 60, 98, 30, 18, 9, 8, 48, 49, 91, 42, 115, 29, 69, 99, 35, 121, 7, 102, 2, 67, 4, 0, 3, 66, 1, 65, 64], [111, 100, 47, 107, 32, 88, 90, 27, 94, 21, 83, 85, 24, 78, 80, 84, 81, 40, 95, 109, 30, 22, 75, 86, 17, 26, 44, 15, 10, 89, 7, 96, 77, 110, 28, 118, 119, 36, 31, 123, 50, 51, 39, 68, 0, 23, 46, 2, 63, 43, 116, 126, 127, 60, 120, 5, 73, 56, 121, 93, 41, 105, 124, 48, 42, 49, 52, 37, 33, 106, 57, 72, 101, 45, 114, 125, 98, 91, 122, 99, 103, 29, 61, 35, 20, 55, 115, 113, 112, 53, 11, 92, 54, 71, 59, 62, 34, 102, 38, 58, 108, 104, 97, 117, 6, 74, 87, 64, 25, 79, 82, 66, 19, 3, 16, 67, 18, 12, 4, 69, 14, 76, 8, 13, 65, 9, 1, 70], [111, 100, 107, 47, 90, 83, 86, 27, 88, 31, 93, 18, 80, 24, 78, 79, 91, 41, 17, 95, 85, 23, 32, 74, 72, 101, 39, 22, 96, 110, 15, 36, 94, 29, 76, 77, 50, 21, 120, 92, 34, 109, 81, 62, 13, 89, 40, 9, 118, 123, 82, 122, 119, 7, 30, 11, 84, 75, 121, 12, 127, 16, 106, 10, 56, 59, 114, 60, 20, 117, 116, 61, 5, 57, 104, 87, 58, 124, 43, 33, 52, 25, 115, 26, 46, 68, 37, 35, 28, 19, 54, 108, 97, 51, 126, 8, 102, 14, 112, 48, 73, 53, 38, 98, 105, 42, 103, 2, 0, 45, 63, 55, 71, 125, 44, 99, 49, 113, 4, 3, 6, 69, 70, 64, 1, 66, 67, 65], [111, 100, 107, 47, 32, 88, 121, 27, 85, 24, 63, 90, 51, 119, 21, 95, 81, 49, 36, 94, 40, 80, 91, 52, 50, 109, 84, 83, 93, 45, 42, 96, 61, 78, 38, 106, 34, 58, 124, 108, 110, 10, 118, 56, 46, 44, 122, 117, 125, 30, 57, 15, 115, 62, 28, 97, 104, 55, 39, 120, 59, 60, 54, 126, 112, 114, 103, 53, 23, 22, 92, 105, 75, 99, 116, 31, 41, 98, 113, 48, 35, 86, 43, 127, 123, 102, 37, 33, 77, 29, 101, 26, 5, 20, 89, 68, 17, 0, 16, 18, 72, 7, 2, 71, 25, 87, 82, 11, 73, 67, 19, 79, 14, 76, 12, 6, 74, 13, 3, 9, 8, 66, 64, 70, 69, 1, 4, 65], [56, 51, 103, 46, 110, 19, 13, 97, 15, 90, 28, 17, 99, 23, 116, 60, 94, 21, 10, 115, 47, 81, 87, 24, 57, 79, 100, 75, 72, 124, 91, 9, 53, 93, 86, 109, 80, 31, 61, 84, 89, 54, 14, 85, 74, 58, 92, 126, 3, 7, 77, 76, 6, 52, 18, 48, 113, 25, 11, 5, 123, 114, 41, 30, 127, 59, 119, 122, 27, 55, 45, 112, 12, 83, 63, 16, 71, 68, 49, 121, 29, 62, 22, 67, 43, 26, 78, 70, 88, 8, 98, 102, 82, 95, 20, 32, 108, 125, 0, 34, 36, 118, 117, 104, 105, 1, 69, 120, 101, 96, 38, 37, 106, 111, 35, 50, 107, 40, 44, 42, 64, 73, 4, 2, 33, 65, 66, 39], [46, 56, 103, 110, 53, 57, 60, 115, 61, 116, 47, 122, 124, 20, 52, 89, 127, 97, 126, 112, 51, 59, 55, 108, 114, 121, 58, 94, 54, 118, 63, 49, 62, 123, 48, 113, 109, 119, 50, 125, 111, 84, 88, 30, 106, 28, 99, 117, 25, 86, 44, 45, 80, 105, 43, 120, 107, 37, 36, 41, 24, 92, 42, 104, 27, 78, 14, 40, 38, 39, 22, 101, 102, 98, 82, 16, 33, 100, 35, 91, 19, 87, 95, 34, 31, 96, 32, 11, 9, 4, 17, 71, 64, 66, 69, 93, 0, 26, 90, 68, 73, 2, 29, 18, 1, 13, 75, 10, 21, 65, 85, 5, 83, 12, 15, 7, 67, 72, 3, 81, 23, 79, 6, 70, 76, 77, 8, 74], [56, 103, 46, 53, 57, 110, 60, 61, 124, 116, 47, 126, 51, 122, 112, 52, 97, 127, 115, 20, 89, 55, 59, 54, 58, 94, 123, 118, 28, 121, 114, 49, 113, 125, 62, 37, 119, 109, 108, 63, 99, 48, 120, 111, 25, 117, 45, 50, 80, 44, 30, 36, 105, 86, 84, 43, 92, 106, 41, 107, 88, 24, 104, 14, 42, 40, 19, 23, 39, 82, 102, 101, 38, 22, 98, 100, 27, 34, 33, 35, 78, 16, 91, 13, 96, 31, 21, 71, 87, 95, 9, 81, 12, 90, 15, 69, 32, 18, 68, 11, 72, 0, 93, 3, 74, 5, 29, 26, 85, 10, 64, 66, 1, 2, 4, 83, 17, 75, 73, 67, 6, 65, 76, 7, 79, 8, 77, 70], [46, 51, 103, 56, 19, 110, 87, 17, 15, 97, 28, 13, 92, 116, 10, 81, 61, 12, 47, 90, 53, 94, 23, 86, 112, 60, 57, 126, 72, 58, 52, 59, 96, 121, 124, 43, 89, 122, 115, 25, 114, 127, 106, 49, 55, 62, 21, 48, 63, 54, 108, 45, 4, 123, 109, 125, 119, 120, 42, 111, 118, 113, 24, 117, 50, 83, 30, 44, 105, 79, 85, 93, 107, 75, 67, 99, 9, 82, 11, 31, 20, 71, 8, 98, 104, 70, 6, 76, 26, 101, 84, 77, 40, 32, 41, 38, 7, 34, 37, 29, 22, 36, 102, 100, 95, 5, 3, 91, 80, 18, 14, 68, 27, 73, 69, 0, 1, 65, 35, 88, 74, 66, 16, 64, 2, 78, 39, 33], [102, 110, 33, 49, 46, 92, 113, 111, 86, 28, 122, 19, 57, 81, 24, 55, 78, 54, 70, 74, 85, 13, 108, 90, 61, 8, 117, 94, 30, 23, 66, 105, 80, 109, 114, 26, 76, 87, 44, 14, 60, 75, 9, 68, 79, 127, 67, 17, 21, 47, 77, 107, 31, 119, 65, 51, 0, 106, 18, 63, 116, 56, 62, 15, 7, 120, 58, 53, 126, 83, 52, 112, 115, 40, 11, 50, 39, 103, 43, 121, 104, 36, 124, 2, 123, 12, 20, 99, 125, 3, 118, 45, 48, 41, 59, 37, 22, 32, 29, 98, 71, 42, 89, 72, 97, 25, 35, 101, 69, 88, 16, 100, 27, 93, 95, 34, 91, 82, 96, 5, 4, 10, 1, 84, 38, 64, 73, 6], [110, 102, 33, 46, 49, 57, 92, 113, 111, 86, 28, 19, 24, 81, 85, 61, 116, 119, 56, 122, 115, 87, 26, 79, 108, 90, 107, 78, 94, 63, 109, 114, 9, 13, 52, 23, 121, 127, 30, 74, 47, 104, 123, 70, 21, 17, 99, 126, 59, 48, 62, 51, 31, 14, 60, 98, 124, 58, 125, 80, 55, 106, 54, 43, 112, 53, 120, 50, 100, 44, 118, 29, 42, 76, 41, 105, 117, 103, 75, 77, 8, 45, 37, 36, 83, 15, 101, 25, 20, 66, 68, 35, 22, 95, 34, 40, 12, 18, 39, 93, 11, 32, 96, 0, 97, 38, 27, 89, 88, 69, 91, 3, 65, 84, 1, 72, 4, 10, 71, 73, 7, 16, 82, 6, 64, 5, 67, 2], [110, 102, 46, 113, 49, 33, 92, 111, 86, 19, 122, 28, 24, 81, 79, 55, 60, 85, 78, 39, 26, 127, 87, 74, 115, 68, 90, 93, 12, 118, 119, 114, 77, 70, 9, 125, 56, 8, 13, 50, 84, 108, 61, 29, 107, 98, 14, 59, 37, 105, 51, 100, 22, 112, 73, 71, 35, 25, 47, 94, 58, 88, 44, 117, 124, 53, 120, 42, 106, 21, 95, 57, 40, 23, 20, 17, 83, 82, 121, 2, 96, 62, 54, 52, 32, 30, 41, 65, 31, 116, 43, 109, 76, 104, 126, 75, 36, 64, 63, 38, 99, 45, 48, 91, 101, 34, 11, 5, 80, 103, 123, 66, 15, 89, 27, 18, 10, 97, 16, 4, 6, 72, 0, 7, 3, 69, 67, 1], [110, 102, 46, 49, 33, 113, 92, 111, 86, 122, 19, 28, 54, 81, 85, 119, 24, 90, 26, 109, 94, 126, 79, 70, 13, 120, 78, 9, 51, 57, 112, 74, 125, 21, 87, 108, 123, 61, 50, 59, 105, 107, 121, 52, 14, 58, 8, 53, 47, 56, 68, 17, 60, 43, 45, 100, 93, 127, 114, 63, 12, 118, 106, 80, 44, 116, 55, 20, 48, 29, 15, 83, 30, 124, 23, 117, 103, 77, 31, 39, 65, 115, 42, 62, 75, 22, 104, 91, 40, 76, 99, 36, 71, 66, 34, 97, 101, 41, 98, 37, 32, 95, 88, 35, 82, 10, 27, 89, 18, 96, 25, 73, 84, 5, 11, 64, 16, 38, 72, 3, 0, 2, 4, 69, 7, 6, 1, 67], [48, 39, 119, 117, 56, 33, 60, 112, 121, 127, 58, 120, 47, 116, 125, 63, 122, 49, 90, 29, 114, 118, 55, 51, 123, 115, 124, 52, 59, 126, 113, 61, 62, 53, 107, 24, 44, 111, 54, 50, 110, 45, 91, 108, 93, 46, 106, 57, 88, 95, 109, 26, 86, 42, 20, 40, 43, 41, 105, 102, 17, 101, 104, 97, 36, 103, 38, 27, 85, 87, 37, 92, 100, 34, 22, 96, 94, 99, 98, 28, 89, 35, 21, 81, 16, 32, 31, 30, 84, 83, 78, 76, 14, 25, 12, 15, 82, 80, 10, 23, 18, 74, 19, 72, 65, 1, 67, 3, 0, 13, 64, 8, 69, 5, 68, 66, 2, 79, 4, 11, 7, 6, 77, 71, 9, 75, 73, 70], [119, 39, 48, 117, 60, 33, 56, 121, 47, 127, 120, 55, 63, 125, 58, 116, 51, 122, 49, 123, 114, 118, 115, 61, 52, 112, 90, 53, 59, 113, 106, 44, 62, 126, 124, 110, 29, 57, 54, 24, 107, 50, 111, 45, 108, 46, 41, 91, 93, 26, 95, 88, 105, 20, 86, 109, 43, 42, 102, 40, 104, 17, 103, 92, 101, 100, 38, 27, 97, 21, 34, 36, 94, 85, 28, 37, 87, 35, 99, 96, 89, 31, 22, 98, 16, 32, 76, 30, 81, 84, 25, 83, 15, 80, 78, 23, 14, 12, 82, 18, 10, 74, 1, 0, 65, 67, 64, 8, 19, 69, 72, 5, 4, 3, 13, 79, 11, 68, 66, 2, 7, 6, 71, 75, 70, 9, 77, 73], [117, 39, 119, 48, 56, 90, 60, 33, 55, 62, 124, 52, 121, 120, 95, 127, 47, 46, 105, 93, 58, 29, 116, 106, 61, 115, 122, 114, 63, 83, 26, 125, 51, 10, 118, 102, 126, 113, 49, 24, 57, 107, 50, 53, 123, 89, 59, 45, 104, 109, 108, 96, 54, 16, 111, 86, 44, 110, 112, 20, 42, 27, 43, 18, 41, 81, 85, 78, 38, 30, 40, 32, 94, 31, 19, 36, 25, 17, 34, 101, 82, 92, 100, 98, 14, 88, 91, 23, 75, 35, 97, 84, 87, 28, 71, 37, 13, 99, 22, 103, 12, 8, 21, 76, 3, 15, 11, 80, 4, 9, 68, 2, 72, 7, 74, 79, 5, 0, 6, 69, 73, 70, 67, 77, 1, 66, 65, 64], [39, 117, 119, 48, 83, 33, 15, 13, 11, 9, 29, 86, 6, 89, 66, 7, 74, 55, 88, 27, 87, 73, 81, 68, 80, 53, 90, 72, 22, 84, 91, 75, 82, 85, 8, 65, 30, 23, 18, 62, 26, 25, 17, 56, 16, 34, 0, 77, 79, 12, 21, 67, 76, 19, 92, 14, 31, 78, 70, 71, 28, 94, 4, 20, 5, 10, 32, 58, 50, 96, 35, 3, 64, 43, 106, 45, 24, 95, 40, 2, 69, 47, 102, 104, 93, 112, 38, 105, 99, 101, 54, 121, 108, 61, 124, 120, 57, 122, 1, 103, 110, 41, 98, 127, 49, 100, 115, 123, 36, 60, 111, 44, 63, 125, 118, 42, 52, 51, 107, 37, 116, 59, 97, 114, 113, 126, 109, 46], [44, 37, 108, 76, 18, 21, 14, 97, 24, 57, 80, 28, 71, 0, 5, 73, 125, 90, 69, 67, 7, 2, 51, 121, 58, 3, 11, 72, 75, 113, 8, 10, 19, 52, 23, 12, 89, 118, 55, 33, 91, 68, 16, 64, 65, 9, 88, 101, 34, 92, 70, 96, 77, 85, 13, 1, 84, 119, 99, 124, 20, 109, 25, 123, 66, 86, 26, 6, 82, 46, 93, 83, 78, 127, 114, 104, 79, 74, 15, 81, 35, 4, 100, 95, 47, 111, 29, 30, 27, 50, 56, 120, 98, 60, 126, 17, 117, 22, 31, 32, 61, 63, 110, 94, 36, 87, 106, 62, 103, 45, 41, 39, 102, 38, 105, 112, 43, 40, 107, 54, 116, 115, 122, 48, 42, 49, 53, 59], [44, 37, 108, 28, 24, 21, 97, 80, 18, 14, 19, 76, 73, 57, 87, 51, 90, 58, 71, 125, 15, 121, 17, 113, 117, 47, 5, 55, 11, 7, 106, 95, 69, 23, 52, 50, 98, 2, 93, 30, 114, 99, 85, 83, 45, 86, 22, 127, 27, 92, 10, 66, 35, 109, 31, 84, 20, 46, 3, 81, 82, 100, 29, 6, 75, 91, 79, 78, 124, 70, 16, 34, 96, 118, 105, 12, 33, 123, 61, 13, 9, 48, 89, 94, 88, 26, 36, 8, 111, 104, 25, 112, 32, 40, 103, 77, 38, 42, 53, 120, 126, 102, 41, 119, 107, 49, 43, 74, 110, 72, 68, 115, 0, 67, 122, 116, 39, 56, 54, 63, 59, 60, 101, 62, 4, 64, 65, 1], [57, 44, 37, 125, 108, 90, 97, 19, 28, 58, 21, 114, 24, 126, 113, 109, 117, 17, 86, 116, 121, 61, 51, 40, 110, 50, 49, 122, 55, 118, 41, 46, 59, 47, 11, 48, 43, 111, 106, 56, 124, 102, 63, 53, 123, 52, 62, 115, 112, 120, 119, 91, 54, 60, 127, 42, 107, 80, 104, 45, 105, 18, 87, 93, 39, 95, 14, 72, 35, 38, 84, 73, 77, 103, 23, 13, 101, 75, 27, 36, 20, 74, 66, 34, 89, 83, 25, 15, 26, 6, 30, 98, 8, 33, 100, 4, 29, 22, 81, 99, 94, 5, 79, 31, 68, 96, 32, 70, 76, 69, 2, 10, 1, 65, 12, 85, 92, 0, 64, 88, 78, 7, 9, 82, 16, 3, 71, 67], [44, 37, 125, 24, 108, 21, 28, 18, 97, 14, 90, 19, 58, 80, 57, 51, 11, 121, 114, 73, 109, 17, 13, 95, 118, 117, 52, 87, 75, 113, 69, 35, 50, 93, 61, 81, 46, 123, 76, 9, 124, 34, 119, 102, 71, 111, 115, 59, 36, 104, 100, 32, 96, 4, 55, 103, 20, 84, 99, 30, 85, 45, 105, 60, 106, 62, 29, 86, 83, 39, 25, 43, 40, 48, 66, 94, 38, 42, 120, 107, 47, 91, 110, 23, 112, 82, 15, 127, 78, 116, 98, 92, 16, 31, 122, 54, 89, 126, 49, 33, 72, 88, 41, 56, 22, 53, 101, 26, 10, 63, 74, 79, 67, 6, 1, 27, 8, 77, 5, 64, 12, 2, 3, 70, 68, 65, 7, 0], [54, 37, 127, 62, 63, 24, 116, 33, 101, 119, 123, 18, 55, 59, 51, 120, 58, 112, 46, 53, 60, 125, 114, 117, 113, 39, 126, 44, 15, 57, 61, 56, 111, 121, 122, 118, 47, 52, 49, 27, 110, 50, 48, 91, 115, 45, 124, 94, 109, 85, 107, 108, 105, 106, 43, 92, 86, 20, 21, 104, 42, 73, 103, 41, 102, 88, 100, 30, 40, 38, 25, 29, 82, 23, 95, 36, 13, 28, 97, 35, 31, 22, 9, 77, 79, 99, 84, 90, 98, 34, 16, 93, 87, 26, 17, 89, 12, 14, 1, 80, 75, 71, 65, 32, 4, 19, 6, 96, 70, 2, 66, 10, 7, 11, 68, 67, 72, 78, 3, 76, 5, 69, 0, 83, 8, 81, 64, 74], [127, 54, 37, 25, 62, 17, 94, 101, 33, 86, 15, 123, 20, 6, 63, 24, 14, 12, 27, 77, 19, 10, 90, 30, 109, 23, 116, 84, 88, 13, 61, 93, 72, 113, 9, 51, 107, 95, 91, 59, 120, 38, 119, 92, 125, 117, 112, 22, 7, 41, 124, 70, 32, 58, 85, 35, 67, 36, 75, 118, 48, 42, 78, 31, 18, 126, 122, 21, 79, 28, 102, 55, 34, 80, 29, 115, 114, 68, 82, 5, 11, 53, 50, 16, 87, 57, 83, 103, 46, 98, 106, 81, 26, 56, 60, 44, 4, 111, 39, 74, 89, 8, 108, 76, 52, 110, 121, 1, 45, 49, 73, 40, 96, 100, 97, 66, 105, 104, 99, 43, 47, 0, 71, 3, 2, 69, 65, 64], [127, 63, 37, 62, 54, 24, 33, 51, 112, 123, 116, 101, 117, 55, 120, 59, 58, 119, 125, 53, 19, 114, 46, 111, 44, 60, 113, 122, 61, 47, 126, 57, 121, 91, 118, 52, 56, 49, 48, 27, 50, 45, 94, 110, 115, 85, 124, 39, 108, 109, 18, 102, 86, 106, 88, 30, 107, 29, 105, 43, 41, 104, 103, 42, 96, 21, 14, 40, 38, 95, 20, 97, 32, 26, 100, 22, 16, 31, 75, 83, 36, 25, 98, 12, 92, 35, 99, 78, 34, 15, 90, 82, 13, 93, 80, 17, 6, 11, 73, 77, 9, 65, 1, 84, 89, 28, 87, 23, 68, 4, 66, 2, 7, 70, 10, 71, 72, 5, 8, 76, 79, 0, 81, 67, 64, 3, 69, 74], [127, 54, 37, 33, 17, 62, 86, 10, 15, 24, 14, 123, 12, 5, 72, 90, 27, 65, 7, 116, 25, 0, 101, 63, 77, 91, 64, 20, 44, 23, 88, 73, 4, 19, 67, 87, 89, 113, 22, 71, 112, 2, 93, 8, 107, 83, 59, 81, 111, 80, 76, 68, 79, 74, 82, 118, 21, 13, 6, 120, 104, 46, 114, 75, 126, 28, 119, 70, 122, 18, 3, 97, 11, 30, 102, 1, 16, 84, 69, 56, 31, 106, 58, 26, 9, 66, 78, 32, 85, 48, 95, 40, 117, 61, 94, 96, 45, 125, 99, 35, 29, 98, 57, 92, 60, 39, 53, 124, 34, 36, 100, 51, 38, 52, 103, 41, 108, 55, 109, 110, 105, 47, 43, 121, 49, 50, 115, 42], [43, 121, 45, 107, 100, 97, 109, 27, 91, 102, 123, 24, 103, 15, 22, 54, 81, 59, 88, 127, 120, 118, 114, 117, 110, 20, 44, 124, 85, 61, 111, 31, 79, 119, 51, 26, 75, 57, 7, 41, 11, 33, 58, 98, 49, 92, 46, 23, 112, 39, 52, 99, 42, 13, 113, 55, 21, 62, 29, 17, 122, 28, 47, 84, 63, 125, 25, 48, 18, 93, 50, 108, 126, 56, 82, 115, 116, 86, 101, 104, 37, 105, 53, 60, 32, 30, 96, 38, 40, 35, 34, 89, 80, 106, 90, 19, 94, 77, 83, 95, 16, 73, 87, 71, 74, 70, 76, 14, 78, 36, 3, 6, 9, 12, 5, 69, 8, 10, 72, 66, 67, 65, 2, 1, 68, 64, 0, 4], [43, 107, 123, 117, 100, 27, 97, 91, 88, 51, 103, 21, 49, 124, 80, 59, 54, 24, 109, 55, 48, 85, 115, 127, 57, 14, 50, 121, 61, 114, 113, 126, 58, 118, 22, 45, 119, 60, 62, 52, 41, 31, 63, 125, 112, 47, 111, 74, 122, 116, 53, 29, 19, 76, 110, 102, 46, 44, 56, 108, 39, 93, 37, 120, 5, 20, 42, 89, 8, 30, 90, 6, 2, 99, 105, 3, 18, 40, 106, 68, 35, 101, 98, 87, 104, 84, 38, 78, 32, 33, 16, 1, 86, 12, 17, 96, 0, 28, 34, 25, 94, 26, 81, 10, 72, 11, 95, 23, 15, 36, 92, 4, 69, 83, 82, 66, 73, 65, 64, 70, 13, 77, 67, 71, 7, 75, 9, 79], [43, 121, 45, 107, 100, 27, 97, 91, 116, 63, 24, 22, 61, 106, 125, 54, 114, 37, 15, 49, 88, 118, 58, 124, 29, 33, 85, 76, 120, 57, 31, 17, 19, 32, 81, 21, 41, 127, 47, 109, 123, 80, 50, 18, 52, 60, 51, 55, 98, 122, 74, 86, 53, 101, 113, 34, 105, 103, 12, 28, 119, 14, 111, 62, 94, 93, 59, 8, 115, 46, 38, 44, 112, 42, 7, 110, 90, 40, 126, 68, 48, 20, 117, 26, 104, 102, 36, 99, 108, 35, 2, 39, 96, 30, 92, 84, 13, 11, 56, 79, 82, 25, 95, 89, 75, 73, 83, 77, 1, 78, 5, 3, 10, 87, 71, 69, 0, 72, 9, 23, 67, 6, 4, 66, 65, 64, 70, 16], [121, 43, 45, 100, 91, 97, 27, 107, 88, 85, 21, 123, 80, 51, 54, 24, 115, 49, 61, 117, 127, 126, 48, 59, 57, 114, 109, 46, 52, 116, 118, 63, 55, 74, 53, 50, 22, 14, 76, 60, 105, 31, 62, 119, 124, 120, 111, 19, 122, 125, 113, 58, 106, 47, 44, 56, 102, 8, 112, 90, 103, 110, 12, 108, 39, 93, 37, 20, 32, 29, 42, 41, 99, 89, 81, 33, 5, 87, 2, 104, 17, 40, 38, 6, 34, 16, 82, 26, 101, 78, 28, 86, 96, 92, 30, 3, 35, 1, 98, 36, 68, 94, 72, 23, 95, 18, 10, 84, 0, 69, 25, 83, 15, 70, 4, 73, 65, 67, 11, 13, 66, 64, 77, 79, 71, 9, 7, 75], [103, 115, 51, 85, 83, 80, 62, 10, 90, 13, 124, 118, 26, 63, 72, 48, 56, 70, 60, 98, 68, 52, 55, 88, 106, 47, 59, 41, 53, 50, 93, 110, 30, 91, 100, 107, 76, 71, 86, 58, 111, 19, 45, 84, 15, 66, 82, 122, 61, 42, 123, 57, 125, 38, 44, 96, 89, 77, 119, 65, 81, 40, 21, 28, 16, 14, 87, 11, 3, 23, 25, 94, 101, 104, 43, 33, 102, 17, 74, 27, 105, 121, 49, 95, 112, 31, 92, 117, 116, 24, 108, 20, 120, 36, 7, 9, 109, 5, 114, 46, 97, 34, 69, 126, 127, 73, 35, 113, 79, 8, 99, 22, 29, 18, 32, 78, 54, 12, 37, 64, 2, 39, 6, 1, 75, 4, 0, 67], [103, 51, 115, 85, 83, 10, 80, 13, 72, 70, 66, 68, 26, 62, 63, 90, 118, 60, 81, 124, 107, 64, 2, 91, 53, 59, 55, 44, 52, 119, 89, 56, 65, 93, 106, 47, 111, 50, 0, 123, 98, 19, 15, 100, 40, 76, 1, 110, 61, 5, 48, 114, 69, 77, 16, 41, 25, 6, 99, 74, 8, 4, 104, 3, 32, 126, 79, 11, 94, 117, 105, 22, 121, 87, 20, 88, 29, 58, 116, 46, 43, 73, 112, 14, 75, 21, 27, 23, 127, 24, 120, 35, 71, 38, 86, 109, 57, 49, 82, 92, 125, 45, 113, 122, 67, 54, 102, 78, 96, 84, 9, 108, 18, 95, 12, 101, 42, 37, 30, 33, 17, 28, 34, 39, 7, 31, 36, 97], [103, 51, 115, 85, 80, 13, 10, 68, 83, 70, 72, 26, 66, 0, 62, 124, 55, 65, 90, 118, 63, 53, 60, 64, 47, 76, 100, 98, 89, 107, 46, 59, 88, 50, 56, 48, 61, 2, 91, 93, 82, 1, 58, 23, 111, 4, 3, 19, 81, 73, 99, 121, 25, 44, 5, 106, 123, 6, 69, 75, 74, 11, 41, 87, 119, 15, 20, 127, 71, 79, 21, 95, 77, 110, 8, 52, 40, 126, 16, 114, 67, 27, 17, 112, 9, 122, 78, 94, 57, 22, 117, 86, 49, 12, 14, 32, 39, 113, 30, 37, 38, 92, 120, 18, 29, 105, 84, 96, 31, 36, 7, 33, 108, 42, 125, 35, 116, 97, 101, 24, 109, 45, 104, 54, 43, 102, 28, 34], [103, 115, 51, 80, 85, 10, 13, 83, 72, 70, 68, 62, 26, 124, 118, 60, 59, 55, 66, 107, 90, 82, 64, 93, 119, 47, 52, 25, 15, 63, 48, 53, 98, 40, 88, 56, 111, 87, 73, 5, 1, 89, 76, 65, 69, 23, 29, 2, 61, 71, 0, 46, 44, 81, 100, 50, 117, 77, 58, 19, 22, 96, 84, 32, 112, 110, 24, 74, 8, 7, 3, 16, 114, 12, 17, 122, 126, 121, 125, 91, 27, 11, 123, 35, 38, 28, 99, 4, 104, 106, 116, 20, 101, 45, 6, 9, 79, 31, 78, 95, 41, 18, 30, 21, 86, 67, 92, 49, 108, 113, 43, 94, 37, 33, 57, 42, 127, 36, 105, 109, 54, 34, 102, 14, 120, 97, 75, 39]], "model.layers.23.self_attn.k_proj": [[47, 111, 36, 107, 88, 96, 30, 90, 80, 21, 0, 78, 81, 2, 27, 104, 68, 84, 83, 7, 15, 75, 74, 93, 95, 5, 103, 106, 10, 77, 11, 86, 124, 110, 22, 6, 120, 91, 59, 71, 73, 85, 43, 122, 46, 127, 13, 51, 67, 94, 119, 114, 48, 32, 50, 105, 55, 92, 101, 23, 66, 118, 42, 52, 33, 97, 56, 63, 60, 102, 125, 35, 39, 123, 72, 29, 31, 99, 116, 64, 19, 14, 45, 41, 53, 61, 112, 28, 12, 117, 49, 58, 113, 16, 40, 34, 57, 17, 25, 38, 108, 62, 3, 79, 98, 126, 37, 54, 109, 121, 18, 4, 44, 115, 24, 20, 26, 82, 89, 87, 65, 8, 76, 9, 100, 1, 70, 69], [39, 56, 46, 51, 86, 33, 110, 116, 61, 53, 30, 20, 60, 47, 57, 124, 55, 59, 120, 80, 89, 58, 114, 119, 52, 122, 127, 112, 49, 91, 126, 123, 63, 54, 118, 48, 115, 121, 44, 113, 62, 34, 125, 45, 106, 109, 117, 92, 50, 111, 101, 96, 100, 93, 14, 94, 108, 105, 107, 88, 98, 43, 42, 35, 40, 24, 37, 78, 104, 38, 41, 36, 11, 102, 97, 95, 27, 16, 90, 82, 29, 65, 72, 13, 31, 19, 84, 71, 99, 64, 5, 73, 26, 87, 85, 3, 32, 25, 10, 28, 12, 17, 21, 15, 69, 9, 18, 7, 23, 70, 6, 0, 76, 1, 79, 77, 103, 75, 83, 4, 67, 22, 68, 66, 81, 8, 2, 74], [46, 113, 38, 110, 97, 28, 19, 86, 79, 26, 94, 24, 81, 21, 87, 77, 122, 14, 70, 74, 47, 17, 78, 60, 68, 9, 59, 52, 126, 65, 57, 112, 111, 119, 62, 44, 8, 55, 114, 20, 105, 115, 56, 127, 108, 117, 106, 85, 58, 76, 104, 0, 125, 50, 11, 42, 64, 93, 118, 45, 41, 109, 13, 2, 43, 103, 80, 124, 92, 48, 123, 63, 51, 116, 39, 16, 121, 120, 36, 61, 66, 100, 53, 99, 75, 54, 40, 18, 89, 101, 49, 37, 107, 90, 15, 73, 95, 5, 29, 72, 32, 71, 23, 3, 35, 25, 31, 34, 96, 84, 98, 12, 91, 27, 30, 22, 69, 82, 10, 33, 88, 7, 6, 83, 4, 67, 1, 102], [117, 103, 119, 86, 48, 97, 112, 13, 93, 11, 83, 15, 56, 26, 60, 120, 124, 47, 121, 6, 89, 113, 53, 122, 116, 63, 58, 127, 55, 125, 115, 24, 9, 114, 123, 7, 52, 49, 118, 51, 61, 62, 91, 59, 54, 126, 18, 107, 81, 111, 45, 42, 50, 46, 44, 57, 109, 16, 110, 20, 43, 108, 74, 41, 73, 98, 104, 29, 92, 30, 23, 105, 40, 25, 106, 37, 88, 35, 38, 21, 68, 17, 64, 100, 101, 78, 102, 31, 14, 27, 36, 99, 2, 95, 28, 8, 32, 96, 94, 34, 69, 85, 10, 82, 4, 19, 87, 84, 66, 90, 80, 12, 76, 79, 75, 65, 33, 70, 71, 5, 77, 72, 67, 0, 39, 1, 3, 22], [108, 101, 44, 57, 21, 80, 28, 14, 24, 11, 71, 33, 18, 76, 73, 90, 58, 125, 19, 67, 50, 5, 64, 17, 1, 52, 55, 15, 118, 117, 121, 51, 115, 122, 53, 119, 4, 47, 48, 69, 75, 124, 123, 120, 95, 45, 35, 31, 111, 2, 59, 13, 87, 126, 12, 104, 61, 60, 56, 72, 110, 42, 127, 41, 114, 62, 43, 112, 8, 84, 7, 63, 109, 107, 68, 116, 54, 23, 40, 98, 106, 88, 91, 39, 6, 46, 49, 27, 105, 10, 70, 89, 113, 26, 30, 86, 36, 29, 34, 103, 32, 100, 92, 38, 81, 102, 94, 22, 78, 93, 74, 3, 66, 16, 0, 97, 77, 99, 25, 79, 9, 96, 82, 20, 85, 83, 37, 65], [127, 54, 101, 97, 86, 24, 27, 123, 118, 18, 94, 77, 113, 63, 116, 120, 48, 17, 10, 64, 58, 85, 62, 59, 117, 57, 15, 38, 114, 12, 111, 67, 42, 121, 122, 14, 108, 29, 55, 61, 7, 125, 53, 126, 60, 49, 103, 1, 51, 119, 19, 56, 16, 52, 112, 124, 43, 109, 50, 5, 105, 45, 110, 46, 115, 47, 31, 72, 75, 71, 37, 90, 6, 25, 66, 107, 40, 44, 20, 39, 89, 87, 95, 104, 106, 83, 36, 100, 41, 34, 102, 93, 98, 68, 96, 11, 99, 0, 2, 35, 3, 21, 26, 92, 9, 79, 23, 30, 28, 80, 32, 78, 91, 81, 84, 76, 8, 88, 73, 69, 74, 13, 82, 33, 70, 22, 4, 65], [107, 33, 121, 36, 22, 91, 43, 124, 88, 45, 61, 80, 21, 51, 54, 60, 57, 49, 74, 55, 119, 123, 58, 115, 117, 29, 116, 62, 63, 122, 127, 50, 26, 125, 53, 52, 111, 42, 118, 59, 112, 113, 109, 120, 126, 46, 110, 56, 48, 38, 47, 76, 95, 108, 101, 19, 114, 39, 14, 65, 17, 106, 66, 44, 69, 4, 78, 20, 104, 12, 41, 0, 105, 98, 8, 35, 37, 15, 30, 67, 103, 70, 64, 83, 102, 6, 23, 40, 99, 31, 32, 34, 94, 92, 5, 81, 10, 87, 73, 96, 28, 25, 100, 82, 9, 97, 18, 11, 93, 90, 89, 79, 71, 27, 72, 86, 85, 77, 84, 13, 1, 7, 3, 75, 68, 2, 24, 16], [115, 39, 13, 72, 51, 80, 64, 83, 10, 70, 85, 68, 66, 90, 62, 2, 124, 0, 65, 76, 43, 111, 26, 60, 91, 112, 118, 75, 53, 55, 61, 56, 67, 5, 81, 88, 63, 4, 119, 89, 46, 42, 40, 34, 59, 52, 50, 108, 73, 29, 123, 1, 93, 82, 114, 15, 121, 105, 23, 35, 36, 71, 126, 30, 31, 22, 96, 7, 122, 125, 84, 69, 41, 110, 58, 3, 102, 79, 25, 109, 87, 86, 24, 120, 94, 57, 127, 38, 14, 98, 28, 100, 104, 99, 6, 106, 117, 44, 33, 107, 54, 45, 8, 48, 101, 95, 78, 97, 49, 116, 17, 12, 47, 18, 92, 27, 32, 20, 9, 113, 37, 11, 74, 21, 19, 77, 16, 103]], "model.layers.23.self_attn.qk_proj": [[46, 115, 111, 127, 54, 56, 47, 110, 108, 119, 117, 107, 51, 44, 121, 48, 43, 113, 22, 24, 26, 21, 55, 83, 85, 57, 62, 19, 92, 27, 124, 101, 80, 88, 118, 90, 63, 39, 37, 16, 59, 86, 45, 60, 123, 103, 61, 33, 13, 116, 58, 125, 49, 122, 81, 77, 74, 97, 14, 10, 120, 78, 50, 72, 79, 53, 52, 17, 36, 28, 93, 15, 91, 94, 29, 70, 106, 112, 11, 126, 75, 68, 109, 8, 4, 64, 82, 114, 0, 89, 71, 18, 100, 2, 25, 9, 76, 66, 102, 38, 12, 30, 105, 23, 73, 7, 42, 5, 96, 32, 20, 84, 104, 40, 95, 87, 69, 6, 41, 98, 1, 31, 34, 67, 99, 35, 65, 3], [46, 115, 127, 111, 54, 56, 47, 110, 108, 119, 107, 117, 51, 44, 121, 48, 43, 113, 57, 21, 22, 26, 83, 124, 27, 118, 39, 24, 92, 62, 85, 63, 55, 101, 19, 88, 16, 60, 86, 90, 80, 58, 37, 59, 123, 116, 33, 125, 74, 61, 103, 49, 77, 45, 78, 13, 10, 81, 120, 112, 52, 70, 91, 122, 53, 14, 72, 97, 50, 17, 15, 93, 114, 94, 28, 4, 0, 11, 29, 36, 106, 126, 2, 66, 79, 109, 75, 8, 82, 68, 102, 30, 38, 25, 105, 100, 64, 76, 73, 12, 104, 23, 18, 42, 89, 7, 20, 9, 71, 5, 96, 31, 69, 32, 98, 84, 95, 6, 40, 41, 87, 3, 34, 35, 65, 99, 1, 67], [46, 115, 111, 127, 56, 54, 110, 47, 108, 119, 107, 117, 51, 44, 48, 121, 43, 113, 57, 22, 124, 26, 62, 85, 21, 24, 118, 83, 27, 55, 39, 92, 37, 63, 19, 101, 60, 90, 80, 88, 33, 58, 16, 86, 103, 59, 97, 61, 123, 125, 74, 53, 77, 122, 116, 45, 14, 13, 78, 91, 120, 10, 52, 72, 81, 49, 112, 70, 36, 66, 4, 28, 50, 15, 17, 109, 93, 64, 79, 11, 29, 126, 2, 114, 38, 94, 0, 68, 106, 25, 12, 82, 75, 104, 100, 23, 76, 89, 18, 42, 71, 7, 30, 84, 8, 96, 105, 41, 20, 5, 9, 32, 98, 73, 102, 40, 69, 87, 35, 31, 95, 6, 34, 99, 65, 67, 1, 3], [46, 115, 127, 111, 54, 56, 47, 110, 108, 107, 119, 117, 51, 44, 121, 48, 43, 113, 57, 22, 21, 83, 63, 124, 26, 62, 118, 60, 19, 24, 85, 92, 39, 88, 58, 27, 55, 80, 101, 37, 16, 122, 53, 123, 86, 59, 90, 77, 61, 33, 125, 10, 116, 17, 81, 13, 14, 78, 120, 103, 74, 45, 72, 112, 49, 97, 15, 70, 52, 4, 50, 91, 11, 66, 114, 106, 126, 2, 109, 36, 100, 68, 29, 28, 93, 94, 0, 18, 75, 76, 79, 12, 71, 38, 89, 8, 102, 7, 69, 64, 42, 73, 82, 25, 9, 104, 30, 84, 20, 5, 96, 105, 23, 6, 31, 41, 32, 98, 87, 95, 35, 65, 67, 40, 99, 34, 3, 1], [46, 115, 127, 111, 54, 47, 56, 110, 108, 119, 117, 107, 51, 44, 48, 121, 43, 113, 57, 62, 83, 124, 63, 22, 21, 19, 26, 85, 55, 24, 60, 27, 101, 39, 118, 86, 16, 37, 92, 58, 90, 122, 123, 88, 80, 61, 33, 103, 97, 72, 59, 14, 13, 10, 77, 45, 74, 49, 125, 52, 4, 81, 78, 68, 112, 17, 50, 116, 75, 53, 120, 91, 66, 79, 15, 28, 11, 109, 0, 126, 106, 93, 114, 29, 64, 2, 36, 12, 94, 70, 6, 7, 18, 20, 76, 25, 100, 82, 71, 38, 102, 9, 23, 73, 69, 89, 8, 42, 30, 5, 105, 84, 40, 104, 41, 32, 98, 96, 35, 34, 87, 31, 99, 95, 67, 3, 1, 65], [46, 115, 111, 127, 54, 47, 56, 110, 108, 119, 107, 117, 51, 44, 121, 48, 43, 113, 22, 57, 21, 26, 19, 27, 55, 24, 83, 62, 85, 101, 63, 39, 16, 80, 124, 88, 86, 58, 92, 60, 118, 37, 122, 72, 90, 123, 10, 33, 77, 45, 74, 13, 61, 81, 14, 97, 59, 125, 103, 17, 112, 49, 15, 78, 116, 75, 120, 53, 52, 0, 50, 91, 11, 68, 126, 28, 79, 64, 94, 66, 29, 82, 106, 114, 20, 36, 100, 6, 7, 4, 12, 76, 93, 25, 18, 23, 109, 38, 71, 2, 69, 102, 89, 30, 8, 105, 9, 70, 84, 73, 96, 42, 87, 32, 34, 95, 41, 5, 104, 65, 98, 40, 1, 35, 67, 31, 99, 3], [46, 115, 111, 127, 54, 56, 110, 47, 108, 119, 107, 51, 121, 44, 48, 117, 43, 113, 26, 22, 57, 21, 27, 85, 24, 83, 19, 88, 124, 92, 62, 80, 16, 101, 90, 86, 63, 39, 58, 60, 55, 77, 37, 122, 123, 33, 14, 103, 125, 74, 118, 10, 13, 97, 72, 81, 120, 45, 61, 59, 17, 78, 112, 116, 28, 49, 91, 106, 93, 50, 15, 79, 36, 53, 6, 126, 64, 114, 75, 82, 11, 52, 94, 109, 29, 68, 12, 89, 20, 38, 18, 25, 100, 4, 8, 66, 76, 23, 84, 102, 7, 0, 9, 71, 2, 73, 87, 42, 96, 105, 30, 32, 98, 31, 69, 70, 34, 5, 1, 41, 95, 104, 40, 35, 65, 99, 67, 3], [46, 115, 111, 127, 56, 54, 110, 47, 108, 119, 107, 51, 117, 44, 121, 48, 43, 113, 27, 21, 22, 24, 83, 57, 85, 26, 60, 92, 62, 19, 124, 55, 16, 80, 88, 90, 39, 86, 101, 123, 63, 33, 37, 61, 118, 58, 125, 59, 49, 77, 122, 116, 13, 97, 74, 103, 14, 45, 10, 81, 78, 112, 79, 17, 120, 28, 114, 36, 6, 91, 50, 52, 126, 72, 53, 94, 93, 11, 82, 106, 75, 4, 100, 109, 68, 29, 15, 18, 8, 0, 38, 23, 64, 89, 2, 25, 12, 105, 102, 66, 32, 73, 76, 30, 71, 20, 42, 96, 84, 9, 104, 87, 7, 70, 31, 98, 95, 5, 69, 65, 40, 41, 35, 34, 99, 1, 67, 3], [46, 115, 111, 127, 56, 54, 110, 47, 108, 107, 117, 119, 51, 121, 44, 48, 43, 113, 57, 83, 85, 24, 26, 21, 124, 22, 92, 27, 19, 88, 60, 55, 16, 62, 90, 123, 39, 101, 63, 86, 118, 58, 80, 33, 13, 116, 61, 37, 125, 97, 77, 74, 49, 45, 120, 103, 112, 81, 59, 17, 28, 122, 53, 10, 14, 52, 91, 78, 93, 79, 8, 36, 6, 29, 126, 11, 106, 75, 50, 100, 72, 114, 15, 12, 82, 109, 68, 94, 102, 23, 38, 4, 18, 76, 42, 73, 0, 2, 66, 105, 32, 104, 7, 89, 30, 41, 96, 31, 9, 64, 20, 25, 87, 98, 84, 5, 71, 95, 70, 34, 35, 40, 99, 69, 65, 3, 1, 67], [46, 115, 111, 127, 56, 54, 47, 110, 108, 117, 107, 51, 119, 44, 121, 48, 43, 113, 57, 62, 21, 26, 123, 85, 19, 124, 27, 83, 24, 22, 92, 118, 55, 39, 60, 16, 88, 86, 37, 58, 90, 63, 101, 33, 10, 80, 116, 61, 77, 45, 125, 8, 13, 103, 122, 59, 112, 74, 49, 97, 120, 17, 78, 52, 53, 14, 114, 2, 0, 81, 91, 28, 68, 4, 94, 36, 15, 93, 75, 29, 79, 50, 64, 109, 72, 6, 12, 106, 66, 126, 38, 71, 11, 76, 42, 89, 70, 82, 100, 18, 7, 73, 96, 25, 5, 9, 20, 105, 102, 23, 87, 104, 32, 30, 31, 69, 98, 41, 40, 84, 99, 1, 67, 34, 65, 95, 35, 3], [46, 115, 111, 127, 54, 47, 56, 110, 108, 117, 51, 119, 107, 44, 121, 43, 48, 113, 57, 124, 62, 22, 21, 85, 39, 19, 26, 80, 83, 123, 86, 92, 24, 16, 118, 60, 63, 88, 27, 58, 77, 37, 101, 55, 125, 61, 10, 13, 116, 90, 45, 103, 8, 120, 74, 97, 33, 59, 28, 52, 4, 14, 81, 75, 78, 112, 49, 17, 53, 50, 66, 68, 15, 91, 122, 11, 94, 79, 114, 29, 70, 64, 36, 71, 82, 2, 109, 106, 126, 0, 6, 100, 7, 12, 76, 42, 93, 20, 38, 25, 32, 72, 23, 9, 84, 73, 5, 18, 105, 89, 102, 69, 30, 104, 87, 96, 31, 95, 34, 98, 40, 99, 67, 3, 41, 1, 65, 35], [46, 115, 111, 127, 54, 56, 47, 110, 108, 107, 119, 51, 117, 44, 121, 48, 43, 113, 22, 21, 57, 85, 26, 83, 19, 24, 92, 124, 80, 88, 27, 86, 101, 39, 62, 63, 16, 90, 60, 58, 55, 33, 13, 77, 10, 8, 125, 118, 14, 37, 123, 74, 61, 97, 116, 45, 81, 52, 28, 49, 103, 91, 78, 59, 53, 120, 17, 79, 15, 70, 122, 75, 4, 112, 68, 11, 50, 36, 64, 94, 29, 93, 109, 12, 23, 126, 100, 106, 18, 114, 2, 38, 82, 25, 0, 9, 7, 76, 66, 71, 30, 84, 73, 20, 89, 102, 105, 32, 72, 6, 5, 96, 34, 98, 95, 69, 104, 87, 31, 40, 41, 42, 99, 1, 67, 35, 65, 3], [46, 115, 111, 127, 56, 110, 54, 47, 108, 107, 119, 117, 51, 44, 121, 48, 43, 113, 21, 57, 27, 24, 22, 85, 26, 60, 92, 19, 88, 80, 90, 55, 101, 83, 61, 16, 39, 86, 62, 124, 33, 58, 59, 63, 123, 49, 125, 122, 116, 91, 13, 103, 37, 74, 8, 77, 118, 97, 10, 52, 120, 81, 14, 45, 29, 53, 100, 17, 94, 28, 112, 70, 50, 36, 78, 15, 126, 93, 106, 79, 114, 64, 109, 38, 25, 82, 4, 18, 75, 23, 2, 102, 89, 12, 68, 105, 30, 0, 20, 11, 76, 71, 9, 72, 87, 66, 42, 96, 31, 7, 104, 98, 73, 41, 99, 65, 34, 84, 5, 32, 69, 95, 6, 35, 40, 1, 3, 67], [46, 115, 111, 127, 54, 110, 56, 47, 108, 117, 119, 51, 107, 44, 121, 48, 43, 113, 57, 22, 60, 21, 85, 19, 26, 55, 63, 80, 62, 27, 24, 123, 124, 92, 88, 101, 58, 86, 83, 39, 118, 61, 16, 59, 13, 10, 122, 37, 90, 45, 8, 74, 120, 81, 33, 49, 125, 52, 53, 77, 70, 112, 17, 116, 103, 14, 91, 28, 97, 78, 126, 64, 114, 93, 2, 68, 15, 94, 79, 11, 100, 50, 36, 4, 29, 0, 109, 106, 66, 75, 89, 12, 38, 76, 82, 18, 9, 42, 20, 30, 72, 23, 104, 7, 71, 105, 32, 102, 96, 69, 95, 41, 73, 25, 5, 84, 87, 6, 31, 98, 34, 1, 99, 67, 40, 35, 3, 65], [46, 115, 111, 127, 54, 56, 47, 110, 108, 51, 117, 107, 119, 44, 48, 121, 43, 113, 22, 57, 21, 19, 85, 26, 83, 24, 124, 62, 123, 55, 80, 27, 63, 92, 39, 88, 37, 16, 90, 60, 101, 61, 13, 58, 97, 86, 74, 8, 125, 103, 33, 118, 45, 116, 10, 77, 120, 122, 81, 52, 49, 78, 70, 94, 14, 91, 15, 59, 93, 36, 50, 68, 112, 17, 28, 66, 79, 0, 64, 53, 75, 106, 4, 2, 29, 11, 72, 102, 100, 109, 114, 71, 12, 126, 30, 7, 6, 76, 20, 82, 18, 105, 73, 9, 32, 25, 96, 42, 89, 38, 104, 69, 23, 41, 87, 5, 40, 35, 84, 98, 99, 34, 31, 95, 65, 67, 1, 3], [46, 115, 127, 111, 54, 56, 110, 47, 108, 107, 117, 51, 119, 44, 121, 48, 43, 113, 22, 57, 21, 19, 26, 124, 85, 62, 80, 123, 39, 83, 55, 88, 24, 16, 92, 60, 61, 86, 27, 13, 37, 90, 33, 77, 63, 101, 125, 118, 103, 10, 58, 74, 45, 49, 8, 59, 14, 78, 97, 122, 116, 81, 91, 17, 50, 120, 75, 28, 53, 93, 112, 52, 79, 11, 29, 68, 15, 94, 36, 70, 72, 100, 109, 4, 102, 64, 114, 126, 18, 76, 106, 6, 2, 73, 82, 71, 7, 12, 66, 9, 20, 32, 23, 89, 30, 0, 105, 25, 84, 38, 87, 69, 96, 41, 104, 42, 95, 40, 5, 98, 31, 3, 1, 34, 99, 35, 67, 65], [46, 115, 127, 111, 54, 56, 110, 47, 108, 107, 119, 51, 117, 44, 48, 121, 43, 113, 22, 21, 85, 57, 88, 26, 27, 19, 80, 24, 83, 92, 62, 86, 90, 124, 39, 55, 101, 61, 60, 33, 123, 16, 125, 103, 63, 116, 13, 97, 91, 58, 122, 37, 118, 10, 45, 77, 17, 59, 49, 74, 81, 14, 78, 94, 36, 112, 28, 93, 120, 52, 114, 29, 6, 72, 79, 53, 15, 102, 0, 8, 106, 100, 68, 18, 25, 126, 64, 82, 38, 50, 11, 12, 2, 109, 30, 75, 4, 23, 89, 66, 105, 71, 76, 41, 7, 32, 87, 96, 20, 84, 70, 73, 9, 98, 69, 5, 40, 95, 42, 65, 31, 104, 1, 99, 34, 35, 67, 3], [46, 115, 127, 111, 56, 54, 110, 47, 108, 107, 119, 51, 117, 44, 48, 121, 43, 113, 57, 22, 85, 83, 21, 19, 26, 124, 92, 86, 27, 62, 24, 60, 80, 63, 39, 123, 88, 101, 61, 103, 125, 13, 55, 90, 16, 37, 58, 10, 118, 77, 74, 120, 14, 91, 59, 33, 122, 97, 116, 53, 72, 94, 6, 112, 17, 45, 81, 36, 52, 49, 28, 114, 78, 15, 29, 79, 126, 4, 109, 75, 100, 50, 66, 2, 0, 11, 93, 82, 64, 106, 68, 12, 38, 102, 89, 18, 76, 8, 25, 84, 71, 30, 7, 87, 73, 104, 105, 32, 20, 9, 42, 96, 23, 41, 40, 98, 69, 31, 5, 65, 70, 34, 99, 95, 35, 1, 3, 67], [46, 115, 127, 111, 56, 54, 47, 110, 108, 119, 107, 51, 117, 44, 48, 121, 43, 113, 57, 22, 62, 85, 83, 26, 21, 123, 124, 19, 24, 63, 80, 92, 101, 39, 90, 58, 103, 60, 86, 88, 55, 45, 120, 118, 61, 125, 37, 74, 33, 27, 97, 16, 10, 53, 122, 116, 13, 77, 50, 72, 59, 91, 49, 52, 28, 78, 14, 114, 126, 112, 2, 6, 81, 93, 94, 79, 29, 11, 4, 17, 15, 0, 68, 106, 75, 36, 64, 102, 12, 18, 32, 71, 109, 66, 38, 82, 89, 105, 76, 8, 41, 73, 100, 7, 9, 25, 42, 31, 104, 20, 84, 98, 23, 69, 40, 30, 96, 5, 99, 87, 95, 70, 35, 65, 34, 3, 67, 1], [46, 115, 127, 111, 56, 54, 47, 110, 108, 119, 107, 117, 44, 51, 48, 121, 43, 113, 57, 62, 21, 22, 19, 39, 124, 24, 26, 101, 63, 83, 85, 90, 92, 123, 88, 27, 58, 16, 55, 86, 61, 80, 33, 103, 60, 10, 37, 13, 125, 120, 118, 116, 45, 97, 59, 77, 49, 112, 72, 74, 78, 122, 53, 52, 91, 29, 126, 81, 6, 75, 28, 94, 68, 15, 106, 114, 14, 93, 79, 0, 4, 36, 17, 109, 11, 64, 66, 102, 82, 2, 100, 50, 12, 32, 84, 30, 7, 76, 38, 89, 71, 9, 41, 18, 105, 25, 23, 8, 73, 42, 96, 5, 69, 87, 20, 104, 40, 70, 34, 98, 95, 31, 35, 67, 65, 99, 1, 3], [46, 115, 111, 127, 47, 54, 56, 110, 108, 119, 107, 117, 51, 44, 121, 48, 43, 113, 22, 57, 62, 124, 63, 85, 26, 19, 21, 83, 24, 39, 92, 27, 55, 61, 37, 88, 101, 123, 125, 49, 80, 58, 86, 16, 118, 60, 90, 59, 45, 72, 116, 53, 33, 52, 13, 77, 97, 120, 10, 74, 103, 91, 112, 78, 81, 14, 68, 122, 94, 2, 15, 28, 79, 93, 36, 75, 29, 126, 17, 6, 100, 64, 114, 109, 11, 82, 106, 25, 18, 0, 4, 50, 89, 32, 76, 73, 30, 7, 38, 66, 20, 71, 70, 12, 41, 84, 102, 40, 105, 9, 23, 87, 104, 5, 69, 42, 34, 98, 8, 96, 95, 1, 31, 67, 35, 65, 99, 3], [46, 115, 127, 111, 56, 54, 47, 110, 119, 108, 107, 51, 117, 48, 44, 121, 43, 113, 57, 62, 63, 85, 124, 22, 24, 60, 19, 21, 123, 83, 26, 92, 90, 27, 39, 53, 101, 80, 88, 61, 49, 86, 59, 37, 16, 58, 116, 74, 118, 120, 103, 125, 72, 13, 122, 10, 55, 45, 91, 112, 33, 52, 77, 14, 78, 97, 64, 114, 81, 17, 68, 29, 126, 93, 79, 70, 28, 36, 11, 15, 89, 100, 2, 106, 94, 50, 66, 0, 4, 75, 18, 38, 20, 109, 76, 82, 32, 7, 42, 30, 25, 12, 6, 84, 41, 71, 105, 8, 9, 73, 69, 31, 87, 96, 102, 104, 5, 95, 23, 65, 35, 99, 98, 34, 3, 1, 40, 67], [46, 115, 127, 111, 47, 56, 54, 110, 108, 119, 107, 117, 51, 44, 48, 121, 43, 113, 57, 62, 124, 22, 85, 21, 19, 27, 24, 92, 101, 90, 59, 63, 26, 60, 123, 83, 61, 39, 86, 55, 80, 125, 33, 118, 16, 88, 37, 49, 58, 116, 103, 120, 13, 77, 45, 74, 52, 97, 122, 10, 53, 112, 91, 14, 70, 72, 81, 50, 29, 17, 68, 15, 79, 78, 114, 28, 36, 94, 64, 100, 109, 93, 4, 126, 18, 38, 106, 75, 76, 11, 8, 66, 2, 89, 42, 0, 82, 105, 25, 7, 9, 12, 20, 73, 96, 30, 102, 71, 23, 31, 32, 84, 87, 41, 69, 5, 104, 99, 6, 98, 95, 40, 34, 35, 67, 3, 65, 1], [46, 115, 127, 111, 54, 47, 56, 110, 108, 119, 107, 117, 51, 44, 48, 121, 43, 113, 22, 57, 21, 19, 24, 85, 62, 124, 27, 101, 83, 26, 92, 39, 88, 60, 63, 16, 86, 55, 123, 90, 80, 13, 37, 33, 58, 74, 45, 61, 77, 103, 118, 125, 97, 59, 120, 14, 49, 10, 81, 70, 72, 52, 91, 122, 114, 17, 116, 53, 36, 29, 78, 94, 15, 11, 8, 93, 75, 50, 28, 112, 109, 79, 106, 126, 68, 12, 0, 76, 82, 38, 4, 30, 32, 18, 66, 100, 25, 89, 42, 7, 71, 64, 20, 23, 73, 105, 9, 2, 102, 84, 41, 40, 96, 95, 87, 5, 98, 104, 6, 69, 31, 34, 1, 3, 99, 67, 65, 35], [46, 115, 111, 127, 54, 56, 47, 110, 108, 119, 107, 51, 117, 44, 48, 121, 43, 113, 22, 85, 26, 21, 124, 24, 19, 83, 57, 62, 88, 55, 63, 27, 39, 90, 60, 16, 92, 101, 103, 86, 80, 123, 59, 74, 37, 58, 33, 125, 45, 10, 77, 118, 97, 13, 49, 91, 120, 14, 61, 116, 53, 52, 17, 112, 81, 70, 78, 122, 93, 79, 94, 114, 15, 11, 8, 72, 28, 64, 126, 36, 0, 75, 18, 29, 106, 109, 82, 68, 50, 2, 12, 4, 25, 7, 66, 89, 20, 71, 100, 38, 76, 32, 84, 30, 105, 9, 23, 5, 42, 73, 102, 96, 6, 69, 98, 87, 104, 41, 40, 31, 95, 34, 1, 65, 35, 3, 99, 67], [46, 115, 127, 111, 54, 56, 47, 110, 108, 119, 107, 117, 51, 44, 48, 121, 43, 113, 57, 21, 22, 85, 124, 27, 60, 26, 19, 83, 92, 62, 24, 123, 63, 88, 86, 61, 90, 59, 55, 101, 39, 80, 16, 33, 125, 118, 49, 58, 53, 103, 97, 120, 74, 91, 37, 116, 13, 112, 77, 52, 45, 122, 10, 94, 36, 78, 93, 8, 81, 106, 14, 79, 50, 29, 70, 126, 114, 100, 28, 17, 4, 109, 75, 66, 64, 12, 15, 18, 68, 11, 82, 25, 30, 38, 89, 0, 41, 42, 105, 76, 72, 20, 102, 23, 2, 7, 104, 71, 32, 9, 95, 40, 96, 73, 87, 6, 84, 99, 5, 34, 69, 31, 98, 65, 67, 1, 3, 35], [46, 115, 127, 111, 54, 47, 56, 110, 108, 107, 117, 119, 51, 44, 48, 121, 43, 113, 57, 21, 85, 60, 124, 26, 125, 83, 22, 19, 62, 92, 55, 27, 61, 39, 88, 123, 24, 63, 90, 118, 80, 101, 59, 120, 13, 86, 37, 58, 16, 33, 8, 45, 52, 53, 97, 77, 103, 49, 74, 114, 81, 112, 10, 126, 122, 28, 116, 14, 50, 109, 91, 17, 15, 78, 29, 79, 36, 94, 75, 11, 82, 4, 106, 68, 100, 93, 6, 2, 38, 104, 76, 66, 12, 64, 42, 0, 32, 30, 25, 41, 70, 20, 102, 89, 71, 105, 18, 9, 96, 87, 7, 23, 73, 84, 40, 95, 98, 31, 34, 69, 99, 72, 35, 5, 1, 67, 3, 65], [46, 115, 111, 127, 56, 47, 54, 110, 108, 107, 117, 119, 51, 44, 121, 48, 43, 113, 57, 21, 62, 22, 124, 19, 24, 85, 26, 83, 63, 88, 92, 80, 27, 118, 55, 58, 39, 123, 120, 60, 101, 90, 16, 53, 13, 86, 8, 74, 103, 61, 45, 125, 37, 49, 33, 77, 59, 10, 52, 122, 97, 78, 14, 112, 6, 116, 81, 91, 79, 94, 0, 114, 17, 15, 68, 11, 29, 28, 66, 75, 126, 4, 109, 100, 50, 12, 2, 36, 106, 38, 93, 76, 89, 82, 102, 7, 18, 71, 64, 42, 32, 5, 84, 73, 20, 25, 23, 70, 30, 105, 72, 9, 87, 69, 96, 104, 98, 95, 67, 31, 41, 40, 34, 65, 1, 3, 35, 99], [46, 115, 111, 127, 54, 47, 56, 110, 108, 119, 51, 107, 117, 44, 121, 48, 43, 113, 57, 62, 22, 26, 55, 21, 83, 124, 24, 19, 85, 39, 45, 118, 27, 63, 92, 88, 37, 60, 101, 90, 123, 61, 8, 80, 86, 59, 74, 97, 33, 16, 125, 120, 49, 13, 103, 52, 53, 58, 77, 122, 114, 6, 78, 10, 68, 14, 50, 0, 81, 11, 17, 29, 91, 116, 66, 106, 112, 75, 15, 4, 79, 94, 28, 64, 126, 93, 109, 100, 71, 36, 76, 102, 18, 7, 2, 73, 25, 32, 12, 84, 38, 89, 82, 105, 9, 98, 5, 72, 30, 69, 96, 41, 35, 42, 23, 20, 34, 95, 87, 70, 40, 99, 104, 65, 31, 67, 3, 1], [46, 115, 111, 127, 56, 54, 47, 110, 108, 119, 107, 117, 51, 44, 48, 121, 43, 113, 57, 22, 26, 19, 62, 55, 85, 21, 24, 39, 27, 124, 83, 92, 86, 88, 60, 101, 16, 45, 80, 123, 58, 118, 90, 59, 37, 61, 63, 13, 33, 8, 103, 125, 77, 74, 97, 81, 122, 120, 53, 10, 14, 49, 78, 6, 112, 116, 17, 68, 52, 11, 91, 114, 29, 50, 94, 15, 36, 75, 28, 79, 93, 18, 4, 109, 100, 66, 106, 126, 25, 82, 0, 102, 76, 12, 38, 71, 105, 20, 72, 30, 7, 9, 2, 89, 73, 32, 64, 23, 5, 104, 87, 95, 84, 69, 70, 98, 40, 42, 96, 41, 34, 31, 35, 67, 1, 3, 65, 99], [46, 115, 127, 111, 56, 54, 47, 110, 108, 107, 119, 117, 51, 44, 48, 121, 43, 113, 26, 22, 57, 21, 62, 27, 124, 88, 19, 24, 55, 85, 63, 60, 39, 92, 59, 83, 16, 86, 118, 80, 101, 58, 45, 37, 103, 125, 90, 53, 123, 74, 61, 97, 8, 33, 10, 13, 116, 120, 14, 49, 122, 77, 91, 78, 112, 81, 6, 52, 0, 15, 68, 36, 93, 17, 114, 94, 102, 29, 50, 11, 2, 75, 4, 28, 126, 109, 64, 7, 79, 76, 18, 71, 100, 66, 12, 89, 25, 82, 70, 72, 38, 106, 9, 20, 5, 30, 69, 32, 73, 98, 105, 84, 104, 23, 87, 96, 1, 3, 95, 65, 42, 41, 40, 34, 31, 35, 99, 67], [46, 115, 127, 111, 56, 54, 47, 110, 108, 107, 119, 117, 44, 51, 121, 48, 43, 113, 57, 26, 22, 62, 19, 27, 80, 21, 24, 92, 63, 39, 124, 55, 85, 88, 83, 86, 60, 90, 16, 123, 101, 118, 37, 58, 61, 77, 59, 33, 97, 45, 120, 10, 125, 116, 49, 74, 13, 53, 103, 112, 122, 8, 81, 78, 14, 17, 93, 91, 36, 50, 29, 79, 28, 11, 52, 15, 94, 114, 75, 106, 76, 72, 126, 18, 68, 25, 12, 100, 82, 6, 109, 102, 70, 64, 7, 38, 66, 84, 73, 9, 71, 89, 4, 42, 105, 20, 30, 96, 23, 2, 32, 0, 69, 41, 98, 104, 87, 95, 31, 5, 34, 99, 40, 65, 35, 1, 3, 67]], "model.layers.24.self_attn.q_proj": [[112, 37, 48, 54, 90, 41, 94, 86, 30, 101, 56, 25, 59, 81, 62, 19, 20, 106, 58, 44, 63, 121, 123, 89, 105, 114, 12, 84, 9, 22, 95, 35, 74, 113, 46, 77, 52, 97, 104, 70, 2, 16, 24, 107, 117, 124, 60, 39, 18, 43, 119, 31, 88, 14, 125, 79, 122, 26, 10, 126, 109, 49, 4, 108, 68, 67, 80, 50, 45, 92, 8, 47, 120, 111, 29, 57, 0, 32, 115, 98, 51, 61, 118, 53, 99, 27, 71, 28, 116, 42, 110, 23, 103, 93, 102, 36, 127, 33, 91, 38, 40, 55, 100, 3, 21, 87, 17, 15, 78, 34, 85, 75, 1, 5, 96, 82, 13, 83, 65, 7, 69, 73, 76, 11, 66, 64, 72, 6], [112, 37, 48, 54, 122, 90, 111, 94, 30, 86, 125, 25, 101, 117, 41, 81, 24, 58, 19, 89, 88, 22, 35, 121, 59, 84, 45, 57, 126, 114, 109, 53, 20, 63, 106, 42, 105, 77, 12, 31, 119, 102, 120, 55, 104, 46, 10, 82, 60, 124, 3, 113, 14, 50, 18, 123, 70, 62, 100, 97, 49, 56, 47, 8, 52, 110, 44, 61, 74, 21, 26, 16, 107, 118, 87, 36, 9, 43, 103, 29, 40, 116, 65, 115, 95, 64, 93, 4, 27, 79, 32, 34, 51, 108, 28, 23, 17, 92, 91, 85, 127, 15, 96, 33, 2, 38, 39, 99, 80, 78, 98, 72, 73, 5, 83, 71, 11, 6, 75, 13, 7, 66, 68, 67, 0, 69, 76, 1], [112, 37, 48, 89, 25, 90, 54, 86, 94, 30, 56, 24, 18, 19, 35, 41, 22, 45, 105, 43, 20, 59, 50, 101, 116, 127, 97, 42, 81, 117, 125, 63, 16, 77, 60, 123, 88, 49, 95, 85, 38, 47, 9, 121, 106, 122, 109, 12, 114, 84, 115, 119, 100, 67, 0, 108, 55, 40, 26, 31, 110, 58, 70, 87, 51, 103, 104, 36, 102, 27, 92, 124, 111, 79, 2, 62, 46, 61, 13, 107, 91, 113, 53, 23, 126, 57, 74, 32, 34, 71, 39, 52, 33, 120, 29, 14, 82, 93, 99, 98, 44, 83, 96, 21, 15, 118, 69, 66, 8, 28, 78, 17, 10, 80, 7, 1, 75, 68, 76, 3, 11, 4, 72, 73, 6, 65, 64, 5], [112, 37, 48, 54, 90, 41, 94, 84, 81, 86, 117, 56, 62, 88, 30, 14, 20, 105, 19, 111, 24, 25, 51, 125, 101, 109, 49, 114, 10, 120, 63, 45, 58, 47, 31, 8, 60, 70, 126, 59, 43, 64, 77, 55, 121, 22, 52, 92, 102, 42, 119, 18, 57, 118, 53, 66, 3, 124, 4, 122, 113, 127, 65, 97, 106, 108, 89, 35, 123, 44, 107, 50, 16, 34, 39, 78, 103, 74, 110, 61, 26, 46, 32, 98, 104, 99, 38, 116, 40, 28, 12, 69, 17, 72, 100, 80, 21, 115, 93, 87, 1, 95, 33, 83, 29, 36, 85, 73, 96, 67, 5, 79, 91, 11, 2, 27, 9, 23, 71, 68, 75, 15, 0, 82, 6, 13, 76, 7], [108, 36, 44, 123, 51, 90, 26, 114, 85, 92, 96, 52, 121, 120, 18, 82, 31, 77, 28, 111, 94, 24, 15, 125, 56, 53, 95, 127, 60, 126, 20, 62, 107, 63, 113, 122, 23, 76, 46, 73, 110, 55, 32, 124, 39, 105, 102, 101, 45, 84, 87, 58, 50, 43, 74, 30, 37, 118, 41, 109, 116, 16, 6, 8, 97, 29, 27, 49, 21, 54, 33, 61, 115, 38, 9, 48, 7, 57, 112, 106, 40, 98, 14, 103, 117, 91, 89, 119, 47, 22, 99, 34, 25, 59, 104, 100, 35, 3, 42, 86, 78, 64, 88, 93, 4, 19, 80, 2, 83, 81, 11, 12, 17, 65, 10, 79, 67, 71, 5, 13, 69, 75, 66, 0, 68, 70, 1, 72], [108, 36, 44, 123, 51, 63, 127, 52, 90, 28, 96, 56, 114, 120, 92, 23, 105, 95, 119, 118, 46, 58, 31, 85, 87, 111, 30, 102, 106, 112, 57, 54, 124, 40, 103, 122, 55, 42, 77, 26, 107, 125, 60, 113, 99, 104, 116, 110, 15, 43, 109, 39, 32, 94, 61, 41, 100, 45, 62, 47, 38, 49, 11, 121, 82, 18, 115, 117, 126, 48, 53, 17, 37, 50, 10, 20, 59, 101, 93, 84, 8, 35, 97, 98, 33, 76, 34, 27, 6, 91, 88, 24, 16, 9, 89, 29, 83, 25, 19, 80, 73, 5, 21, 86, 22, 74, 3, 79, 81, 13, 7, 68, 12, 72, 75, 78, 14, 69, 66, 70, 64, 1, 4, 71, 65, 67, 2, 0], [108, 36, 44, 51, 52, 92, 96, 56, 85, 121, 90, 126, 26, 28, 31, 60, 114, 23, 120, 45, 117, 87, 18, 82, 15, 105, 49, 94, 77, 124, 102, 24, 112, 32, 48, 88, 109, 123, 55, 104, 110, 46, 103, 93, 41, 27, 73, 76, 100, 107, 125, 115, 34, 116, 61, 29, 63, 118, 113, 122, 43, 50, 53, 84, 119, 89, 111, 57, 127, 95, 101, 16, 22, 38, 47, 42, 39, 33, 106, 54, 25, 62, 4, 2, 40, 8, 37, 59, 98, 21, 97, 58, 64, 6, 3, 74, 99, 86, 30, 91, 9, 35, 80, 20, 65, 83, 78, 19, 7, 69, 71, 79, 81, 14, 5, 12, 17, 70, 10, 0, 13, 11, 75, 68, 72, 67, 66, 1], [108, 36, 123, 52, 90, 94, 44, 92, 28, 121, 124, 111, 63, 62, 93, 59, 109, 86, 105, 82, 87, 23, 116, 114, 50, 60, 26, 20, 100, 49, 96, 42, 91, 46, 31, 127, 119, 17, 103, 51, 78, 19, 41, 84, 57, 110, 15, 27, 53, 98, 48, 102, 37, 122, 117, 112, 58, 55, 33, 101, 99, 30, 104, 47, 45, 40, 25, 56, 120, 43, 61, 35, 54, 11, 106, 38, 118, 39, 113, 115, 97, 126, 107, 125, 22, 29, 18, 34, 8, 14, 85, 89, 80, 95, 88, 32, 21, 73, 77, 76, 24, 9, 83, 6, 16, 10, 81, 13, 79, 3, 7, 74, 12, 71, 75, 64, 2, 70, 72, 4, 5, 69, 65, 68, 66, 1, 67, 0], [45, 52, 109, 32, 89, 85, 18, 123, 12, 29, 87, 10, 9, 79, 69, 81, 96, 83, 71, 91, 101, 58, 4, 30, 117, 0, 68, 2, 82, 8, 19, 1, 16, 105, 59, 84, 35, 108, 37, 7, 64, 23, 6, 50, 54, 77, 76, 78, 127, 42, 25, 124, 97, 33, 17, 46, 102, 24, 31, 120, 114, 21, 28, 72, 15, 20, 94, 80, 73, 26, 88, 3, 126, 62, 104, 92, 70, 74, 110, 75, 66, 14, 103, 13, 122, 61, 44, 116, 27, 95, 67, 36, 60, 112, 39, 40, 86, 100, 22, 90, 34, 5, 119, 38, 113, 43, 98, 49, 93, 125, 65, 115, 56, 107, 11, 118, 48, 51, 53, 111, 99, 121, 63, 106, 57, 41, 47, 55], [45, 52, 109, 87, 123, 91, 79, 85, 69, 89, 12, 10, 32, 18, 96, 101, 4, 71, 1, 29, 9, 0, 81, 66, 83, 92, 30, 2, 36, 35, 56, 82, 65, 122, 42, 120, 117, 60, 8, 105, 22, 70, 58, 39, 110, 97, 64, 104, 88, 127, 106, 7, 84, 72, 53, 90, 14, 61, 95, 63, 74, 3, 126, 5, 21, 77, 16, 48, 54, 73, 111, 80, 27, 23, 68, 24, 76, 67, 94, 119, 17, 25, 13, 50, 33, 75, 62, 26, 107, 15, 116, 51, 44, 20, 49, 93, 31, 19, 46, 11, 6, 102, 98, 37, 108, 114, 103, 78, 28, 41, 38, 59, 121, 86, 125, 34, 118, 113, 99, 40, 57, 112, 100, 55, 124, 115, 43, 47], [45, 52, 109, 91, 32, 123, 85, 89, 87, 29, 18, 12, 9, 79, 46, 81, 112, 10, 4, 114, 69, 71, 83, 126, 96, 50, 54, 94, 61, 58, 47, 119, 108, 60, 82, 127, 92, 25, 63, 72, 22, 70, 62, 77, 30, 84, 107, 59, 110, 101, 35, 105, 64, 15, 57, 106, 16, 13, 111, 40, 49, 27, 93, 68, 116, 117, 42, 21, 1, 56, 55, 41, 39, 2, 115, 17, 33, 20, 23, 118, 24, 80, 0, 75, 19, 37, 113, 120, 103, 99, 95, 36, 26, 7, 122, 88, 124, 31, 34, 98, 90, 125, 28, 104, 100, 51, 8, 121, 102, 44, 73, 76, 14, 97, 5, 6, 38, 11, 3, 86, 43, 74, 53, 48, 78, 67, 65, 66], [45, 52, 109, 89, 32, 87, 91, 29, 85, 123, 79, 96, 18, 12, 10, 71, 83, 69, 50, 114, 9, 60, 35, 119, 49, 92, 84, 81, 106, 4, 77, 101, 95, 57, 107, 93, 63, 16, 1, 30, 36, 46, 82, 108, 100, 61, 117, 94, 39, 22, 112, 120, 118, 23, 58, 127, 53, 26, 21, 113, 110, 116, 43, 40, 44, 59, 28, 105, 25, 103, 88, 99, 42, 122, 55, 31, 121, 111, 124, 24, 126, 13, 54, 98, 62, 125, 37, 56, 27, 38, 115, 97, 15, 80, 51, 19, 20, 48, 102, 86, 68, 14, 90, 6, 74, 34, 33, 75, 78, 47, 17, 64, 8, 41, 76, 72, 104, 7, 73, 11, 5, 0, 70, 65, 67, 2, 3, 66], [104, 127, 98, 92, 87, 84, 81, 109, 31, 15, 17, 22, 54, 41, 28, 58, 82, 51, 37, 76, 124, 14, 21, 63, 122, 116, 95, 121, 89, 30, 20, 43, 60, 38, 57, 90, 53, 13, 56, 25, 73, 18, 94, 85, 102, 29, 24, 11, 88, 126, 111, 117, 125, 70, 27, 46, 55, 107, 23, 48, 62, 12, 42, 59, 78, 67, 61, 4, 40, 45, 93, 110, 80, 105, 118, 106, 32, 9, 114, 83, 79, 19, 35, 97, 99, 120, 96, 36, 47, 6, 39, 101, 33, 68, 86, 52, 75, 10, 50, 49, 77, 100, 119, 74, 113, 123, 115, 44, 103, 26, 5, 108, 91, 16, 72, 69, 8, 112, 34, 71, 7, 66, 3, 1, 65, 64, 0, 2], [104, 127, 92, 98, 84, 87, 14, 81, 109, 31, 73, 76, 37, 6, 63, 125, 122, 82, 57, 105, 75, 89, 124, 18, 41, 4, 107, 94, 99, 78, 25, 17, 97, 116, 95, 83, 61, 10, 64, 58, 28, 30, 68, 59, 108, 66, 43, 54, 20, 65, 60, 51, 47, 115, 72, 15, 100, 106, 3, 50, 46, 53, 45, 56, 49, 85, 48, 23, 13, 16, 22, 114, 96, 24, 9, 62, 77, 12, 74, 35, 26, 80, 70, 119, 19, 36, 33, 38, 91, 79, 90, 40, 117, 44, 55, 29, 111, 121, 101, 39, 88, 102, 103, 2, 21, 34, 93, 8, 123, 69, 120, 126, 7, 32, 113, 112, 5, 27, 11, 42, 110, 52, 118, 71, 0, 86, 1, 67], [104, 127, 98, 101, 22, 43, 92, 18, 105, 53, 31, 59, 125, 84, 124, 87, 63, 61, 89, 54, 81, 41, 39, 122, 57, 106, 50, 95, 45, 19, 46, 28, 99, 35, 121, 109, 71, 79, 118, 38, 26, 62, 25, 76, 58, 37, 116, 44, 8, 96, 15, 56, 0, 30, 102, 27, 111, 117, 14, 115, 13, 120, 126, 107, 108, 60, 94, 93, 49, 73, 74, 51, 55, 110, 52, 11, 85, 123, 113, 103, 82, 119, 32, 4, 97, 42, 75, 36, 77, 112, 100, 90, 48, 33, 47, 86, 114, 68, 65, 6, 5, 29, 24, 34, 10, 80, 88, 23, 91, 83, 40, 7, 67, 17, 20, 72, 69, 2, 21, 1, 16, 70, 66, 3, 64, 9, 12, 78], [104, 127, 98, 73, 87, 31, 66, 14, 92, 81, 124, 84, 76, 4, 6, 116, 37, 25, 64, 89, 2, 63, 71, 13, 5, 60, 41, 80, 65, 75, 19, 0, 18, 106, 61, 114, 22, 79, 67, 46, 10, 1, 105, 53, 47, 58, 57, 125, 8, 122, 95, 40, 85, 54, 82, 101, 12, 59, 102, 51, 77, 74, 117, 121, 62, 70, 3, 111, 11, 90, 55, 24, 69, 72, 29, 109, 50, 23, 49, 15, 9, 100, 16, 86, 21, 91, 42, 33, 56, 88, 35, 99, 26, 7, 119, 123, 107, 17, 112, 43, 68, 36, 32, 38, 44, 118, 48, 94, 103, 96, 27, 126, 113, 52, 97, 108, 83, 28, 115, 34, 110, 39, 120, 78, 45, 30, 93, 20], [61, 102, 121, 114, 58, 116, 119, 62, 115, 54, 50, 33, 111, 126, 60, 125, 59, 122, 120, 124, 123, 117, 112, 110, 63, 52, 38, 24, 56, 53, 49, 45, 113, 42, 57, 29, 127, 55, 48, 26, 51, 118, 46, 107, 15, 27, 108, 47, 109, 91, 88, 105, 41, 85, 21, 84, 106, 44, 43, 86, 93, 37, 90, 18, 101, 28, 103, 100, 40, 104, 39, 22, 30, 97, 36, 32, 83, 17, 79, 34, 98, 35, 82, 96, 12, 19, 73, 95, 99, 31, 92, 81, 9, 94, 23, 80, 77, 64, 7, 20, 76, 25, 71, 66, 78, 89, 67, 87, 0, 2, 1, 65, 3, 5, 4, 69, 16, 13, 68, 11, 6, 14, 75, 10, 8, 72, 74, 70], [121, 102, 114, 61, 116, 58, 119, 62, 115, 54, 33, 60, 126, 59, 122, 111, 50, 123, 117, 120, 124, 110, 125, 52, 112, 53, 57, 24, 29, 63, 56, 45, 38, 49, 48, 26, 127, 55, 42, 113, 46, 118, 51, 88, 15, 47, 109, 108, 27, 91, 105, 41, 107, 106, 21, 85, 43, 84, 44, 86, 93, 28, 37, 90, 18, 103, 100, 101, 104, 39, 30, 97, 22, 40, 83, 17, 36, 79, 32, 12, 99, 95, 35, 34, 96, 98, 73, 19, 20, 31, 81, 87, 92, 82, 94, 80, 76, 23, 9, 25, 71, 67, 7, 77, 64, 66, 0, 3, 65, 1, 78, 89, 4, 2, 69, 5, 68, 13, 16, 11, 14, 75, 6, 74, 10, 72, 8, 70], [114, 121, 102, 61, 58, 116, 119, 115, 62, 122, 60, 124, 126, 50, 59, 123, 54, 33, 111, 112, 125, 120, 110, 117, 24, 52, 63, 56, 53, 42, 106, 57, 127, 48, 49, 107, 55, 26, 118, 113, 38, 15, 46, 51, 45, 29, 47, 108, 27, 105, 84, 109, 91, 85, 21, 44, 41, 88, 93, 28, 43, 86, 37, 90, 18, 103, 40, 104, 32, 101, 36, 100, 39, 30, 17, 22, 82, 79, 97, 98, 35, 9, 34, 96, 12, 99, 83, 92, 31, 95, 73, 81, 94, 7, 19, 76, 67, 69, 20, 77, 25, 64, 1, 0, 78, 71, 66, 65, 3, 4, 2, 80, 23, 5, 89, 68, 87, 14, 13, 11, 16, 6, 74, 75, 8, 72, 70, 10], [61, 121, 114, 102, 23, 80, 83, 50, 58, 74, 77, 30, 25, 45, 11, 8, 33, 37, 29, 90, 91, 78, 32, 92, 20, 70, 81, 119, 100, 69, 116, 88, 86, 62, 75, 68, 72, 120, 89, 59, 42, 105, 26, 21, 14, 125, 18, 6, 87, 16, 10, 66, 28, 54, 24, 76, 19, 27, 127, 85, 96, 126, 103, 56, 107, 1, 111, 106, 93, 63, 64, 51, 12, 67, 22, 13, 48, 84, 34, 4, 82, 17, 60, 31, 47, 110, 43, 115, 122, 98, 124, 52, 101, 7, 112, 117, 94, 35, 123, 57, 79, 108, 49, 0, 104, 41, 95, 2, 46, 71, 55, 5, 9, 36, 118, 3, 113, 99, 44, 53, 109, 15, 38, 39, 40, 65, 73, 97], [53, 42, 58, 120, 122, 100, 125, 116, 61, 45, 63, 123, 127, 48, 50, 52, 121, 49, 30, 54, 113, 89, 126, 51, 23, 108, 56, 59, 32, 44, 112, 111, 115, 46, 60, 106, 114, 62, 18, 47, 117, 57, 55, 21, 124, 119, 110, 109, 43, 118, 90, 91, 107, 40, 86, 92, 35, 15, 25, 104, 103, 28, 94, 24, 85, 41, 105, 12, 31, 37, 27, 38, 39, 83, 99, 101, 96, 36, 22, 102, 82, 34, 80, 98, 88, 93, 81, 95, 87, 78, 33, 1, 20, 76, 79, 97, 17, 5, 14, 29, 11, 26, 7, 77, 74, 16, 67, 68, 65, 0, 3, 72, 84, 19, 73, 64, 2, 70, 4, 71, 66, 69, 10, 9, 8, 13, 75, 6], [53, 42, 58, 120, 83, 80, 100, 89, 32, 11, 77, 73, 86, 27, 106, 45, 122, 70, 28, 24, 1, 92, 99, 26, 85, 81, 17, 67, 25, 74, 79, 72, 14, 5, 103, 20, 75, 111, 116, 23, 40, 113, 76, 125, 63, 123, 57, 48, 84, 119, 96, 61, 66, 6, 37, 93, 78, 82, 19, 50, 8, 91, 127, 34, 88, 15, 30, 126, 43, 94, 31, 9, 87, 69, 10, 49, 22, 105, 29, 4, 21, 0, 16, 95, 112, 71, 13, 39, 90, 56, 33, 54, 52, 124, 121, 101, 18, 51, 104, 118, 59, 109, 108, 7, 68, 114, 62, 12, 3, 35, 97, 117, 60, 115, 46, 44, 55, 98, 38, 2, 65, 102, 107, 64, 47, 110, 41, 36], [58, 42, 53, 120, 122, 125, 116, 45, 100, 61, 30, 123, 63, 48, 121, 50, 127, 49, 111, 54, 52, 89, 56, 59, 51, 112, 126, 23, 106, 108, 32, 46, 113, 115, 57, 114, 44, 62, 47, 60, 117, 55, 119, 21, 124, 110, 18, 109, 24, 118, 90, 94, 43, 107, 104, 91, 40, 25, 41, 105, 103, 92, 31, 35, 85, 86, 39, 37, 99, 12, 101, 38, 28, 27, 15, 96, 102, 36, 22, 82, 95, 93, 34, 98, 17, 81, 88, 97, 79, 33, 83, 76, 20, 80, 14, 87, 29, 26, 7, 16, 78, 2, 74, 8, 19, 64, 65, 72, 66, 77, 68, 0, 5, 71, 1, 67, 3, 84, 10, 4, 11, 69, 6, 70, 13, 75, 9, 73], [120, 42, 58, 53, 122, 125, 45, 61, 116, 123, 106, 50, 89, 52, 23, 49, 100, 48, 59, 56, 63, 112, 111, 44, 121, 54, 127, 117, 126, 57, 62, 110, 113, 51, 115, 30, 55, 60, 46, 109, 118, 114, 47, 90, 119, 124, 18, 108, 107, 28, 43, 21, 105, 24, 104, 91, 32, 83, 95, 29, 40, 103, 38, 39, 35, 101, 34, 41, 99, 102, 94, 86, 37, 97, 85, 92, 27, 31, 33, 25, 15, 98, 96, 77, 36, 26, 12, 82, 93, 20, 13, 19, 22, 75, 17, 10, 14, 87, 7, 11, 76, 84, 74, 79, 88, 16, 80, 71, 67, 6, 68, 66, 72, 1, 8, 2, 5, 81, 70, 3, 64, 0, 4, 69, 65, 78, 9, 73], [51, 118, 55, 39, 63, 54, 121, 25, 23, 122, 124, 120, 126, 59, 53, 61, 113, 123, 62, 58, 117, 95, 50, 56, 57, 89, 52, 125, 60, 82, 49, 116, 115, 119, 34, 47, 48, 114, 127, 106, 111, 112, 42, 46, 44, 109, 104, 110, 45, 108, 29, 87, 80, 14, 27, 43, 107, 84, 36, 76, 35, 37, 99, 41, 96, 40, 102, 103, 20, 101, 100, 105, 28, 97, 38, 31, 18, 12, 85, 91, 26, 24, 16, 74, 32, 93, 92, 98, 78, 86, 30, 71, 69, 33, 94, 8, 21, 88, 7, 2, 22, 1, 68, 90, 83, 10, 70, 72, 13, 81, 3, 73, 6, 19, 11, 0, 5, 9, 79, 65, 4, 64, 66, 75, 67, 17, 77, 15], [55, 39, 118, 122, 25, 23, 120, 121, 63, 89, 95, 115, 124, 51, 123, 113, 126, 61, 58, 60, 34, 82, 59, 112, 48, 53, 119, 106, 52, 57, 62, 117, 111, 42, 49, 50, 56, 47, 114, 125, 44, 127, 116, 43, 54, 80, 14, 29, 108, 45, 110, 46, 109, 84, 76, 41, 104, 107, 40, 27, 87, 20, 31, 105, 18, 102, 100, 16, 28, 96, 78, 37, 36, 101, 38, 103, 12, 93, 35, 8, 97, 32, 24, 94, 99, 83, 88, 74, 33, 86, 10, 71, 92, 85, 72, 7, 98, 90, 70, 73, 69, 68, 13, 30, 6, 21, 3, 2, 5, 26, 9, 22, 0, 65, 66, 19, 1, 4, 67, 64, 81, 11, 91, 79, 17, 77, 75, 15], [51, 55, 39, 120, 121, 118, 54, 122, 63, 23, 89, 25, 59, 123, 46, 113, 62, 115, 124, 60, 61, 126, 44, 112, 125, 58, 50, 116, 33, 53, 52, 49, 82, 95, 57, 34, 119, 29, 108, 56, 45, 110, 114, 111, 117, 87, 47, 42, 127, 48, 43, 76, 14, 109, 100, 84, 68, 71, 31, 36, 99, 98, 3, 106, 26, 72, 11, 80, 38, 107, 37, 40, 88, 32, 18, 102, 105, 27, 0, 83, 104, 20, 41, 28, 96, 97, 1, 12, 101, 81, 30, 92, 93, 90, 94, 78, 103, 5, 35, 2, 64, 74, 24, 69, 65, 70, 9, 4, 85, 19, 21, 8, 91, 13, 16, 6, 86, 7, 75, 67, 10, 73, 79, 66, 17, 22, 77, 15], [51, 55, 120, 39, 63, 121, 27, 124, 126, 59, 53, 61, 123, 116, 122, 57, 58, 60, 62, 56, 45, 118, 119, 54, 25, 117, 125, 114, 113, 50, 47, 49, 127, 111, 34, 115, 108, 95, 52, 48, 46, 84, 107, 112, 36, 86, 110, 82, 43, 20, 109, 42, 76, 89, 44, 29, 40, 23, 14, 92, 41, 106, 24, 94, 80, 102, 101, 105, 91, 93, 22, 79, 99, 31, 103, 28, 17, 81, 90, 35, 104, 38, 15, 74, 98, 100, 12, 69, 85, 7, 37, 8, 88, 32, 13, 16, 9, 96, 21, 70, 71, 97, 26, 18, 33, 87, 68, 1, 77, 83, 6, 30, 2, 11, 78, 73, 10, 19, 0, 67, 72, 5, 75, 66, 4, 65, 3, 64], [38, 115, 114, 50, 51, 113, 89, 75, 19, 82, 16, 23, 77, 7, 73, 78, 69, 10, 39, 87, 25, 6, 94, 67, 8, 85, 57, 104, 13, 74, 125, 52, 27, 3, 26, 80, 118, 21, 14, 68, 65, 5, 106, 92, 124, 12, 105, 11, 123, 30, 64, 79, 98, 122, 37, 95, 81, 43, 44, 63, 62, 56, 41, 18, 93, 70, 102, 54, 101, 2, 112, 121, 107, 83, 42, 76, 100, 59, 34, 48, 111, 49, 45, 40, 119, 117, 33, 28, 126, 60, 84, 47, 108, 120, 36, 96, 109, 103, 4, 61, 88, 86, 29, 32, 17, 99, 127, 35, 58, 72, 46, 9, 116, 97, 31, 55, 22, 15, 90, 110, 53, 91, 71, 24, 20, 1, 66, 0], [38, 114, 115, 50, 51, 89, 23, 82, 19, 77, 16, 69, 75, 7, 73, 113, 64, 2, 85, 14, 93, 17, 67, 66, 78, 1, 21, 72, 3, 87, 79, 12, 94, 8, 13, 32, 71, 126, 106, 25, 15, 68, 95, 102, 10, 22, 41, 57, 74, 6, 127, 24, 44, 34, 112, 107, 56, 39, 76, 27, 92, 26, 90, 98, 0, 54, 18, 101, 80, 65, 40, 121, 105, 49, 81, 30, 122, 84, 47, 9, 104, 108, 83, 109, 117, 4, 125, 11, 20, 100, 37, 36, 48, 42, 116, 43, 86, 103, 97, 59, 63, 120, 28, 29, 61, 46, 45, 111, 99, 55, 110, 119, 53, 35, 70, 58, 60, 91, 33, 124, 118, 31, 88, 96, 62, 52, 123, 5], [38, 114, 115, 50, 51, 89, 7, 19, 73, 16, 75, 113, 82, 2, 77, 64, 69, 23, 3, 67, 14, 25, 85, 87, 93, 6, 13, 1, 17, 48, 66, 107, 8, 125, 9, 124, 41, 56, 71, 104, 0, 12, 79, 20, 52, 122, 45, 126, 5, 94, 81, 18, 22, 65, 74, 95, 127, 91, 57, 49, 59, 83, 15, 11, 80, 46, 24, 60, 92, 39, 118, 21, 86, 90, 105, 68, 10, 117, 121, 4, 44, 110, 29, 78, 72, 62, 106, 111, 88, 32, 108, 96, 58, 27, 40, 31, 123, 34, 109, 116, 120, 76, 119, 97, 101, 43, 63, 37, 100, 112, 98, 36, 28, 30, 99, 33, 61, 54, 103, 53, 26, 42, 55, 47, 84, 35, 102, 70], [38, 50, 114, 115, 51, 113, 23, 16, 82, 89, 39, 19, 77, 85, 126, 75, 73, 78, 7, 30, 24, 37, 22, 57, 116, 14, 52, 12, 35, 92, 34, 27, 125, 112, 36, 79, 83, 69, 13, 81, 26, 107, 49, 56, 88, 63, 80, 104, 17, 106, 67, 21, 94, 25, 20, 120, 44, 90, 46, 87, 60, 121, 45, 117, 42, 40, 9, 108, 47, 41, 48, 8, 109, 93, 76, 6, 122, 32, 72, 84, 97, 102, 53, 62, 10, 59, 119, 61, 99, 103, 98, 2, 118, 31, 111, 11, 54, 105, 74, 28, 18, 86, 127, 124, 33, 110, 15, 95, 29, 123, 43, 100, 96, 101, 58, 55, 91, 68, 3, 70, 5, 64, 71, 65, 1, 4, 66, 0]], "model.layers.24.self_attn.k_proj": [[48, 112, 101, 30, 86, 90, 19, 88, 89, 25, 81, 84, 12, 70, 14, 107, 77, 63, 8, 54, 47, 120, 60, 17, 52, 125, 41, 0, 49, 117, 115, 119, 93, 114, 2, 124, 74, 10, 33, 79, 64, 113, 43, 46, 102, 1, 57, 62, 80, 59, 42, 18, 20, 100, 118, 110, 35, 9, 109, 56, 29, 123, 44, 55, 58, 51, 68, 106, 116, 45, 28, 38, 103, 122, 108, 61, 50, 111, 4, 3, 40, 104, 87, 53, 127, 126, 39, 121, 67, 98, 99, 16, 97, 96, 15, 23, 32, 36, 31, 27, 85, 91, 5, 105, 92, 83, 34, 71, 94, 95, 73, 24, 82, 75, 21, 78, 7, 26, 11, 13, 72, 69, 76, 65, 22, 6, 66, 37], [44, 108, 52, 121, 82, 77, 15, 85, 100, 26, 32, 87, 60, 123, 6, 8, 76, 73, 55, 28, 3, 51, 74, 127, 120, 24, 48, 92, 61, 0, 9, 64, 65, 29, 124, 4, 109, 2, 89, 16, 5, 111, 94, 20, 110, 63, 18, 114, 105, 116, 78, 56, 35, 118, 7, 83, 47, 126, 59, 22, 88, 91, 50, 95, 62, 33, 122, 46, 84, 119, 115, 25, 31, 80, 54, 23, 27, 113, 86, 97, 98, 45, 81, 99, 103, 39, 102, 69, 40, 12, 70, 101, 107, 37, 57, 43, 117, 34, 112, 49, 11, 93, 53, 41, 38, 36, 42, 106, 96, 125, 104, 58, 19, 90, 30, 21, 14, 17, 71, 10, 13, 75, 79, 68, 72, 66, 1, 67], [109, 52, 45, 0, 123, 85, 32, 12, 69, 10, 87, 79, 116, 89, 18, 9, 4, 91, 81, 68, 2, 29, 83, 60, 71, 84, 8, 77, 78, 3, 1, 70, 16, 119, 120, 7, 93, 126, 58, 117, 94, 41, 30, 64, 66, 115, 124, 72, 118, 112, 114, 113, 61, 46, 54, 47, 44, 37, 92, 49, 111, 127, 65, 50, 53, 125, 95, 105, 106, 51, 110, 63, 35, 40, 43, 107, 48, 56, 102, 55, 39, 34, 19, 108, 104, 22, 121, 57, 97, 6, 59, 42, 62, 75, 103, 36, 100, 122, 82, 38, 67, 25, 28, 101, 99, 98, 13, 33, 27, 14, 20, 11, 88, 31, 21, 23, 90, 96, 86, 26, 73, 15, 80, 24, 5, 76, 17, 74], [127, 40, 34, 84, 87, 76, 95, 81, 73, 14, 17, 89, 6, 5, 28, 66, 125, 124, 64, 4, 116, 60, 122, 110, 0, 111, 13, 79, 58, 74, 19, 75, 92, 117, 51, 18, 80, 50, 24, 57, 105, 43, 63, 22, 45, 41, 101, 107, 26, 106, 54, 16, 121, 70, 10, 86, 71, 53, 20, 39, 78, 59, 8, 65, 52, 56, 30, 37, 62, 49, 94, 3, 42, 25, 123, 114, 61, 27, 83, 90, 7, 85, 96, 15, 109, 119, 103, 67, 21, 35, 44, 118, 33, 47, 99, 113, 55, 77, 108, 120, 126, 32, 1, 102, 69, 112, 115, 93, 68, 31, 98, 38, 36, 88, 46, 29, 48, 97, 82, 91, 12, 72, 100, 23, 9, 11, 2, 104], [38, 61, 114, 121, 86, 97, 119, 93, 18, 54, 30, 27, 50, 116, 15, 59, 41, 62, 56, 120, 126, 58, 115, 125, 124, 122, 55, 63, 90, 60, 127, 57, 49, 117, 48, 85, 123, 47, 112, 113, 118, 108, 52, 110, 101, 24, 51, 45, 36, 109, 46, 96, 53, 43, 94, 29, 106, 111, 95, 17, 104, 44, 25, 42, 107, 39, 99, 91, 105, 79, 33, 98, 12, 20, 28, 81, 80, 103, 92, 13, 40, 73, 77, 88, 100, 83, 7, 14, 16, 34, 10, 21, 31, 37, 84, 102, 23, 35, 74, 19, 26, 32, 78, 11, 89, 87, 8, 82, 70, 22, 75, 6, 68, 3, 9, 1, 76, 72, 2, 69, 66, 65, 5, 71, 0, 4, 67, 64], [106, 36, 120, 86, 53, 58, 96, 116, 94, 28, 49, 61, 89, 63, 27, 121, 123, 42, 122, 50, 45, 125, 48, 56, 62, 111, 54, 124, 59, 52, 18, 117, 55, 80, 127, 57, 119, 115, 112, 35, 46, 114, 51, 83, 60, 108, 34, 126, 107, 118, 47, 113, 109, 21, 98, 23, 12, 41, 43, 73, 15, 81, 110, 9, 105, 11, 104, 77, 29, 44, 40, 102, 14, 16, 39, 38, 7, 37, 103, 101, 17, 90, 0, 95, 87, 97, 33, 92, 24, 91, 20, 31, 67, 26, 100, 13, 99, 32, 71, 93, 72, 5, 88, 30, 68, 84, 19, 66, 78, 70, 74, 82, 85, 25, 2, 65, 64, 76, 69, 10, 75, 6, 79, 8, 22, 3, 4, 1], [103, 51, 55, 86, 120, 98, 123, 63, 93, 126, 124, 61, 31, 114, 53, 45, 59, 58, 49, 57, 121, 89, 122, 52, 125, 62, 112, 56, 47, 117, 116, 50, 60, 46, 119, 127, 115, 48, 110, 113, 108, 111, 107, 38, 109, 40, 42, 105, 44, 54, 43, 36, 80, 41, 106, 22, 18, 33, 118, 34, 79, 82, 16, 104, 99, 88, 90, 81, 84, 26, 96, 97, 102, 92, 91, 11, 100, 13, 37, 27, 21, 101, 32, 35, 83, 12, 23, 25, 29, 94, 72, 78, 85, 20, 10, 6, 30, 28, 17, 39, 24, 14, 66, 87, 19, 95, 9, 5, 73, 69, 4, 15, 8, 77, 75, 67, 76, 65, 71, 68, 0, 74, 7, 3, 1, 70, 2, 64], [50, 115, 102, 77, 89, 16, 19, 82, 75, 73, 114, 7, 23, 64, 69, 113, 2, 3, 67, 66, 1, 85, 10, 93, 17, 8, 70, 14, 65, 78, 21, 57, 30, 38, 112, 49, 94, 42, 43, 126, 34, 76, 71, 39, 45, 108, 92, 25, 90, 48, 6, 32, 44, 117, 123, 122, 13, 121, 87, 56, 106, 103, 125, 22, 79, 28, 74, 20, 27, 120, 63, 124, 116, 68, 33, 127, 61, 54, 15, 60, 104, 4, 41, 52, 119, 105, 100, 95, 12, 91, 72, 9, 88, 26, 53, 36, 31, 101, 29, 98, 62, 35, 118, 47, 59, 84, 46, 109, 51, 40, 96, 99, 0, 110, 55, 107, 111, 24, 37, 83, 58, 80, 97, 11, 5, 81, 86, 18]], "model.layers.24.self_attn.qk_proj": [[127, 50, 48, 115, 112, 114, 109, 51, 45, 108, 55, 61, 121, 44, 53, 120, 58, 52, 38, 25, 23, 124, 123, 63, 87, 18, 42, 89, 92, 60, 22, 113, 106, 40, 125, 82, 126, 116, 29, 122, 83, 19, 59, 17, 77, 32, 101, 54, 20, 26, 13, 94, 84, 73, 16, 9, 30, 81, 12, 56, 102, 85, 80, 90, 21, 57, 117, 49, 37, 76, 46, 86, 79, 111, 75, 98, 118, 110, 36, 7, 62, 15, 78, 14, 41, 11, 47, 119, 71, 27, 91, 103, 64, 43, 95, 34, 39, 5, 69, 74, 10, 105, 0, 96, 24, 31, 100, 104, 6, 88, 93, 28, 3, 68, 107, 67, 2, 66, 33, 70, 8, 4, 97, 99, 72, 35, 1, 65], [127, 50, 115, 114, 48, 109, 112, 51, 55, 108, 61, 45, 121, 44, 53, 120, 58, 52, 38, 25, 123, 92, 23, 87, 60, 89, 113, 18, 42, 124, 22, 106, 116, 40, 122, 63, 54, 125, 82, 29, 126, 73, 59, 83, 77, 101, 56, 32, 13, 20, 85, 19, 94, 16, 17, 26, 30, 81, 102, 9, 12, 49, 117, 86, 76, 84, 79, 21, 119, 111, 118, 57, 71, 36, 98, 80, 110, 41, 27, 103, 39, 75, 37, 90, 62, 47, 104, 78, 14, 46, 91, 11, 15, 43, 74, 34, 64, 105, 5, 69, 24, 100, 31, 95, 107, 7, 0, 96, 4, 93, 10, 28, 6, 88, 2, 68, 8, 66, 3, 67, 33, 97, 70, 72, 99, 35, 65, 1], [127, 50, 48, 115, 114, 112, 109, 51, 45, 108, 55, 61, 121, 120, 44, 53, 58, 52, 25, 38, 123, 113, 23, 87, 106, 92, 42, 18, 89, 60, 40, 22, 126, 125, 122, 82, 63, 32, 56, 116, 124, 54, 29, 59, 17, 26, 101, 13, 19, 73, 30, 102, 49, 83, 85, 16, 119, 20, 77, 94, 81, 76, 86, 12, 90, 9, 117, 21, 47, 57, 111, 118, 98, 110, 84, 104, 41, 80, 103, 78, 34, 79, 36, 31, 39, 7, 37, 62, 71, 75, 64, 15, 11, 14, 27, 5, 105, 69, 96, 91, 46, 95, 74, 100, 43, 10, 107, 0, 28, 93, 24, 66, 6, 33, 67, 2, 88, 3, 8, 4, 68, 97, 70, 99, 35, 65, 1, 72], [127, 50, 48, 115, 114, 112, 109, 51, 61, 45, 108, 55, 121, 44, 53, 120, 58, 52, 123, 38, 25, 106, 113, 23, 42, 87, 89, 18, 126, 82, 92, 59, 22, 124, 63, 125, 56, 116, 119, 29, 60, 94, 40, 76, 13, 122, 17, 32, 30, 73, 86, 19, 77, 80, 9, 101, 85, 16, 54, 49, 83, 81, 26, 79, 102, 12, 84, 111, 20, 98, 118, 110, 75, 47, 117, 5, 15, 41, 21, 104, 62, 71, 39, 36, 37, 69, 57, 14, 90, 7, 27, 34, 78, 46, 31, 0, 43, 11, 74, 103, 10, 95, 100, 96, 2, 28, 24, 88, 66, 67, 8, 70, 64, 93, 91, 4, 105, 6, 3, 107, 33, 35, 68, 97, 99, 65, 1, 72], [127, 48, 50, 115, 114, 112, 109, 51, 108, 61, 55, 45, 44, 121, 120, 53, 58, 52, 25, 123, 38, 23, 42, 113, 60, 63, 82, 18, 89, 87, 106, 59, 92, 56, 126, 124, 125, 22, 122, 40, 54, 13, 17, 73, 19, 86, 49, 29, 116, 9, 94, 80, 77, 30, 57, 32, 84, 83, 81, 26, 20, 76, 12, 101, 16, 75, 85, 90, 110, 118, 119, 14, 79, 111, 102, 21, 117, 37, 15, 104, 62, 98, 36, 5, 103, 47, 41, 34, 71, 78, 27, 95, 7, 11, 10, 46, 69, 91, 64, 70, 74, 96, 105, 31, 100, 39, 0, 24, 43, 107, 33, 28, 88, 3, 66, 8, 2, 67, 4, 93, 97, 68, 6, 35, 1, 99, 72, 65], [127, 50, 48, 115, 114, 112, 109, 61, 51, 108, 55, 45, 44, 121, 53, 120, 58, 52, 25, 38, 42, 23, 22, 123, 89, 113, 87, 18, 82, 106, 92, 60, 126, 40, 124, 77, 84, 122, 73, 63, 59, 76, 83, 81, 17, 94, 80, 19, 116, 125, 86, 29, 12, 13, 101, 85, 54, 30, 16, 32, 9, 26, 102, 37, 79, 56, 15, 98, 21, 41, 110, 57, 90, 36, 34, 118, 49, 20, 47, 14, 103, 5, 11, 95, 75, 27, 111, 78, 71, 46, 70, 10, 7, 104, 0, 117, 64, 43, 105, 62, 119, 39, 91, 8, 74, 24, 69, 100, 96, 93, 107, 88, 31, 2, 33, 66, 97, 28, 3, 67, 68, 6, 35, 99, 4, 65, 1, 72], [127, 50, 48, 115, 114, 112, 109, 55, 108, 51, 45, 61, 44, 121, 53, 120, 58, 52, 25, 23, 42, 87, 38, 82, 89, 123, 92, 18, 106, 124, 22, 60, 113, 125, 122, 40, 63, 19, 17, 54, 126, 32, 94, 84, 86, 26, 73, 30, 80, 13, 77, 29, 83, 101, 81, 56, 116, 12, 16, 59, 76, 85, 20, 9, 57, 21, 119, 98, 79, 102, 15, 90, 49, 111, 78, 41, 75, 118, 47, 104, 11, 110, 27, 14, 46, 71, 117, 5, 36, 37, 91, 7, 103, 95, 34, 64, 10, 62, 105, 100, 39, 74, 0, 24, 43, 31, 69, 70, 107, 96, 88, 8, 28, 93, 33, 35, 2, 4, 3, 97, 68, 67, 99, 66, 6, 72, 1, 65], [127, 50, 48, 115, 114, 109, 112, 55, 51, 45, 108, 121, 44, 61, 120, 53, 58, 52, 25, 23, 123, 38, 124, 89, 92, 18, 87, 42, 60, 82, 22, 113, 63, 125, 106, 40, 122, 116, 83, 59, 29, 126, 119, 32, 30, 56, 16, 94, 54, 73, 86, 26, 101, 20, 80, 19, 49, 13, 84, 17, 21, 90, 81, 102, 12, 9, 85, 77, 57, 76, 37, 15, 111, 41, 79, 78, 62, 11, 46, 71, 36, 110, 39, 117, 118, 34, 91, 27, 5, 105, 104, 43, 28, 75, 98, 31, 0, 47, 24, 103, 64, 10, 14, 95, 7, 69, 100, 70, 88, 74, 93, 96, 68, 107, 66, 2, 35, 3, 33, 4, 8, 97, 67, 72, 6, 99, 1, 65], [127, 50, 48, 115, 112, 109, 114, 55, 45, 51, 108, 121, 61, 44, 53, 120, 58, 52, 25, 38, 23, 92, 89, 123, 124, 87, 42, 125, 113, 63, 82, 106, 60, 59, 18, 126, 116, 122, 83, 22, 32, 77, 40, 56, 101, 80, 94, 81, 26, 84, 29, 54, 86, 30, 19, 117, 13, 119, 73, 102, 20, 85, 17, 49, 90, 76, 16, 9, 21, 111, 12, 37, 98, 118, 43, 15, 41, 47, 79, 57, 14, 34, 91, 110, 103, 27, 96, 75, 105, 11, 71, 36, 46, 69, 39, 100, 62, 104, 31, 78, 28, 7, 5, 95, 24, 74, 107, 10, 88, 93, 64, 0, 70, 33, 2, 4, 35, 68, 66, 6, 97, 3, 72, 99, 67, 8, 1, 65], [127, 50, 48, 114, 115, 109, 112, 45, 51, 108, 55, 121, 61, 44, 53, 120, 58, 52, 25, 38, 123, 124, 23, 87, 89, 42, 125, 113, 63, 106, 92, 82, 59, 18, 60, 116, 40, 126, 22, 29, 122, 30, 101, 83, 9, 73, 13, 94, 32, 54, 17, 119, 19, 77, 76, 56, 86, 84, 26, 85, 117, 16, 81, 80, 102, 57, 36, 111, 49, 12, 21, 37, 41, 118, 79, 103, 69, 20, 14, 110, 7, 11, 34, 90, 75, 104, 71, 98, 15, 5, 27, 47, 62, 78, 91, 95, 0, 10, 105, 6, 64, 31, 96, 24, 43, 39, 74, 28, 46, 100, 4, 66, 3, 88, 72, 2, 68, 93, 33, 70, 107, 67, 35, 97, 99, 8, 1, 65], [127, 50, 48, 115, 109, 112, 114, 51, 108, 45, 44, 55, 61, 121, 53, 120, 58, 52, 25, 42, 38, 123, 124, 87, 23, 106, 92, 113, 18, 82, 89, 22, 63, 122, 126, 125, 40, 73, 13, 29, 60, 116, 83, 9, 59, 56, 54, 77, 80, 86, 16, 84, 85, 19, 32, 81, 94, 17, 30, 26, 12, 76, 119, 102, 101, 20, 118, 36, 15, 62, 79, 71, 11, 117, 5, 49, 111, 90, 0, 75, 21, 69, 37, 96, 34, 27, 98, 104, 103, 7, 6, 57, 14, 39, 95, 78, 10, 64, 43, 74, 110, 46, 105, 47, 91, 2, 31, 41, 88, 24, 72, 67, 100, 28, 66, 3, 68, 93, 107, 4, 70, 97, 33, 8, 65, 35, 99, 1], [127, 50, 48, 115, 114, 112, 109, 51, 45, 108, 61, 44, 55, 121, 53, 120, 58, 52, 25, 42, 23, 87, 92, 38, 123, 22, 18, 106, 124, 89, 113, 82, 63, 40, 77, 13, 76, 125, 19, 126, 86, 30, 80, 73, 29, 17, 60, 85, 20, 84, 83, 9, 54, 94, 116, 81, 122, 101, 32, 16, 49, 26, 12, 56, 118, 90, 98, 15, 21, 57, 59, 111, 79, 11, 37, 102, 36, 7, 34, 95, 75, 71, 0, 117, 104, 27, 62, 47, 14, 100, 10, 91, 6, 119, 39, 96, 41, 78, 5, 69, 103, 74, 43, 24, 105, 46, 64, 88, 107, 28, 93, 31, 110, 72, 33, 97, 67, 66, 68, 2, 4, 3, 99, 70, 35, 8, 1, 65], [127, 50, 48, 115, 112, 109, 114, 51, 55, 45, 108, 61, 44, 121, 53, 120, 58, 52, 25, 38, 92, 124, 123, 23, 89, 42, 22, 87, 125, 113, 82, 63, 126, 18, 106, 40, 60, 32, 116, 94, 101, 30, 59, 54, 122, 29, 84, 26, 85, 77, 17, 13, 9, 86, 49, 117, 83, 56, 81, 19, 57, 20, 80, 73, 16, 111, 21, 119, 90, 37, 76, 12, 98, 41, 79, 36, 103, 118, 91, 15, 34, 27, 95, 11, 105, 100, 102, 24, 110, 75, 14, 71, 39, 7, 62, 47, 78, 46, 0, 5, 104, 28, 69, 31, 74, 10, 88, 43, 96, 72, 6, 68, 64, 93, 33, 97, 107, 2, 99, 66, 67, 4, 35, 70, 3, 65, 1, 8], [127, 50, 48, 114, 115, 112, 109, 51, 108, 45, 55, 61, 44, 120, 121, 53, 58, 52, 25, 124, 42, 38, 123, 23, 126, 89, 113, 106, 87, 22, 125, 59, 92, 40, 63, 18, 82, 122, 32, 56, 102, 77, 60, 101, 73, 116, 83, 84, 9, 119, 13, 30, 94, 54, 19, 81, 118, 16, 17, 86, 49, 26, 57, 111, 85, 29, 80, 117, 12, 37, 21, 76, 79, 90, 36, 20, 7, 41, 11, 98, 62, 27, 47, 15, 78, 103, 39, 69, 34, 75, 105, 71, 43, 0, 100, 64, 104, 10, 88, 74, 31, 14, 5, 95, 110, 96, 24, 28, 72, 6, 66, 91, 46, 33, 107, 93, 4, 2, 3, 67, 70, 97, 35, 68, 99, 65, 8, 1], [127, 114, 48, 50, 115, 109, 112, 51, 108, 45, 44, 55, 61, 121, 120, 53, 58, 52, 42, 25, 38, 23, 106, 89, 123, 22, 124, 113, 87, 63, 82, 92, 40, 18, 125, 126, 116, 59, 29, 77, 32, 9, 81, 60, 73, 83, 122, 16, 19, 30, 94, 54, 84, 13, 85, 26, 12, 101, 111, 80, 17, 20, 118, 49, 21, 76, 36, 119, 102, 56, 57, 37, 11, 98, 86, 79, 7, 15, 117, 41, 14, 62, 5, 103, 43, 90, 34, 64, 78, 110, 69, 39, 95, 10, 71, 24, 47, 31, 105, 91, 100, 27, 75, 0, 104, 74, 46, 96, 72, 33, 70, 66, 28, 97, 107, 88, 2, 67, 6, 68, 3, 4, 93, 35, 99, 8, 65, 1], [127, 50, 48, 115, 114, 109, 112, 51, 45, 55, 108, 61, 121, 44, 120, 53, 58, 52, 38, 25, 42, 23, 123, 92, 82, 87, 106, 89, 124, 22, 63, 113, 18, 59, 125, 126, 77, 122, 116, 84, 40, 73, 83, 54, 13, 80, 81, 94, 30, 9, 16, 19, 32, 60, 12, 29, 86, 17, 76, 56, 20, 26, 85, 21, 111, 37, 101, 15, 118, 102, 90, 119, 62, 27, 57, 98, 110, 11, 71, 7, 75, 79, 39, 36, 10, 43, 14, 5, 46, 34, 31, 49, 117, 95, 78, 24, 41, 0, 69, 103, 104, 74, 105, 70, 47, 96, 91, 88, 107, 68, 100, 64, 72, 28, 66, 2, 67, 93, 97, 33, 6, 99, 4, 3, 8, 35, 65, 1], [127, 50, 48, 115, 114, 112, 109, 51, 55, 45, 61, 108, 121, 44, 120, 53, 52, 58, 25, 38, 123, 42, 92, 23, 87, 124, 106, 89, 126, 18, 82, 22, 63, 125, 40, 59, 113, 30, 32, 116, 19, 29, 86, 101, 77, 60, 84, 26, 94, 81, 122, 16, 13, 56, 20, 17, 21, 54, 73, 9, 83, 111, 57, 76, 85, 102, 27, 110, 80, 118, 12, 90, 79, 7, 98, 103, 47, 41, 117, 36, 75, 119, 49, 24, 37, 95, 91, 15, 11, 62, 14, 104, 31, 10, 34, 28, 43, 105, 46, 96, 5, 78, 0, 39, 71, 107, 74, 64, 100, 88, 93, 70, 69, 68, 66, 4, 33, 97, 8, 67, 2, 72, 35, 99, 3, 6, 65, 1], [127, 50, 48, 115, 114, 109, 112, 55, 45, 51, 108, 61, 120, 121, 44, 53, 58, 52, 25, 38, 92, 123, 89, 87, 23, 126, 42, 106, 125, 113, 124, 82, 18, 116, 63, 22, 59, 86, 29, 40, 54, 32, 17, 19, 77, 94, 60, 101, 83, 26, 56, 16, 122, 9, 73, 13, 30, 57, 102, 119, 84, 81, 80, 76, 20, 85, 49, 12, 90, 39, 98, 111, 118, 21, 36, 37, 79, 7, 15, 41, 5, 27, 78, 47, 11, 69, 110, 91, 75, 64, 31, 104, 96, 103, 71, 14, 24, 117, 34, 95, 28, 105, 0, 62, 10, 43, 74, 70, 46, 100, 88, 2, 93, 107, 68, 4, 33, 8, 67, 66, 3, 35, 97, 6, 65, 72, 99, 1], [127, 50, 48, 114, 115, 109, 112, 51, 45, 108, 61, 55, 44, 121, 53, 120, 58, 52, 38, 123, 124, 89, 42, 25, 23, 113, 126, 125, 106, 18, 92, 63, 87, 59, 82, 116, 22, 40, 77, 73, 29, 32, 30, 94, 101, 9, 76, 60, 84, 119, 81, 19, 54, 17, 86, 16, 26, 13, 21, 122, 83, 103, 57, 56, 111, 85, 49, 98, 37, 41, 36, 80, 117, 20, 11, 12, 102, 75, 27, 47, 118, 79, 110, 5, 90, 7, 69, 62, 34, 64, 104, 39, 105, 31, 15, 78, 43, 46, 74, 0, 14, 24, 71, 91, 95, 107, 8, 100, 88, 66, 96, 28, 70, 68, 10, 4, 67, 93, 3, 33, 2, 97, 6, 35, 99, 65, 72, 1], [127, 48, 50, 114, 115, 112, 109, 108, 51, 55, 45, 61, 44, 121, 53, 120, 58, 52, 38, 25, 123, 113, 124, 42, 23, 89, 18, 106, 125, 126, 22, 92, 87, 82, 40, 59, 63, 116, 29, 60, 32, 77, 30, 81, 73, 13, 9, 101, 83, 94, 122, 86, 76, 80, 84, 17, 26, 85, 37, 110, 57, 98, 111, 12, 21, 56, 19, 54, 36, 20, 102, 16, 104, 119, 117, 75, 15, 7, 41, 24, 103, 10, 27, 79, 34, 69, 11, 91, 118, 105, 64, 5, 49, 78, 62, 39, 95, 71, 90, 43, 88, 74, 8, 46, 107, 0, 31, 96, 28, 47, 14, 100, 93, 97, 70, 2, 6, 68, 4, 33, 66, 67, 3, 99, 35, 65, 72, 1], [127, 48, 50, 114, 112, 115, 109, 51, 108, 55, 45, 120, 44, 61, 121, 53, 58, 52, 25, 124, 123, 23, 38, 92, 42, 113, 125, 126, 106, 89, 116, 59, 22, 87, 60, 18, 82, 40, 122, 32, 63, 13, 77, 83, 80, 73, 86, 26, 29, 19, 101, 119, 94, 17, 9, 81, 12, 56, 84, 57, 76, 85, 16, 30, 102, 104, 54, 20, 21, 110, 37, 49, 90, 111, 98, 117, 75, 15, 27, 11, 79, 69, 118, 41, 34, 5, 31, 36, 46, 7, 14, 71, 96, 47, 24, 10, 91, 39, 78, 95, 74, 62, 6, 43, 100, 103, 107, 0, 28, 105, 2, 93, 64, 8, 88, 4, 66, 68, 70, 67, 33, 3, 97, 35, 99, 65, 1, 72], [127, 50, 48, 114, 115, 112, 109, 45, 55, 51, 108, 120, 121, 61, 44, 53, 58, 52, 38, 25, 123, 113, 125, 59, 126, 89, 42, 23, 106, 92, 124, 18, 116, 63, 82, 87, 22, 32, 40, 122, 54, 60, 56, 101, 119, 94, 30, 29, 77, 9, 86, 81, 73, 57, 13, 12, 17, 83, 110, 76, 26, 85, 20, 80, 21, 16, 19, 102, 84, 111, 71, 36, 46, 79, 11, 117, 90, 5, 34, 7, 75, 103, 69, 15, 98, 37, 62, 104, 0, 27, 31, 41, 118, 49, 39, 47, 78, 24, 6, 91, 96, 14, 64, 10, 4, 95, 88, 100, 74, 43, 67, 107, 93, 66, 68, 105, 2, 8, 28, 3, 33, 70, 97, 99, 65, 35, 1, 72], [127, 48, 50, 115, 112, 114, 109, 55, 51, 45, 108, 121, 44, 61, 120, 53, 58, 52, 38, 25, 113, 125, 23, 124, 89, 123, 92, 42, 59, 18, 87, 106, 126, 82, 63, 40, 122, 22, 30, 60, 77, 54, 116, 29, 86, 32, 73, 20, 56, 94, 57, 83, 17, 26, 101, 19, 9, 13, 111, 85, 81, 102, 119, 80, 12, 16, 37, 21, 84, 62, 98, 110, 76, 117, 15, 5, 75, 118, 47, 90, 104, 79, 91, 14, 49, 7, 36, 103, 11, 71, 34, 27, 46, 69, 24, 6, 100, 41, 95, 31, 39, 28, 0, 78, 88, 43, 74, 107, 93, 68, 10, 96, 105, 4, 8, 2, 66, 64, 97, 67, 33, 3, 70, 99, 35, 72, 65, 1], [127, 48, 50, 115, 114, 112, 109, 51, 55, 108, 61, 45, 121, 44, 53, 120, 58, 52, 25, 38, 42, 124, 23, 89, 123, 87, 18, 106, 113, 92, 63, 22, 82, 125, 122, 60, 126, 40, 77, 54, 59, 9, 29, 86, 73, 116, 32, 17, 101, 13, 94, 83, 30, 12, 19, 56, 26, 84, 20, 80, 16, 76, 57, 21, 85, 102, 119, 81, 98, 37, 104, 36, 27, 90, 7, 15, 111, 79, 75, 110, 103, 41, 78, 10, 34, 69, 62, 47, 11, 118, 95, 74, 5, 71, 31, 14, 91, 46, 28, 117, 88, 43, 49, 0, 96, 39, 100, 107, 6, 64, 8, 24, 105, 2, 67, 66, 33, 70, 68, 4, 97, 3, 93, 99, 72, 35, 65, 1], [127, 50, 48, 114, 115, 112, 109, 51, 45, 108, 55, 61, 44, 121, 53, 120, 58, 52, 25, 38, 123, 23, 22, 124, 82, 42, 87, 18, 106, 113, 89, 63, 92, 126, 122, 125, 84, 40, 60, 13, 54, 116, 83, 12, 32, 73, 59, 101, 94, 29, 86, 17, 9, 85, 57, 30, 16, 77, 21, 19, 26, 111, 56, 81, 20, 110, 80, 98, 76, 75, 117, 90, 79, 102, 91, 37, 71, 15, 119, 36, 103, 34, 78, 5, 46, 27, 95, 39, 11, 62, 7, 104, 14, 31, 47, 41, 24, 69, 64, 74, 105, 49, 10, 118, 0, 96, 43, 28, 88, 93, 6, 100, 107, 3, 2, 67, 4, 70, 68, 33, 66, 72, 8, 97, 99, 1, 35, 65], [127, 50, 48, 114, 115, 109, 112, 51, 45, 55, 61, 108, 121, 53, 44, 120, 58, 52, 25, 38, 123, 23, 124, 92, 126, 113, 42, 63, 87, 89, 106, 116, 22, 122, 82, 125, 18, 40, 32, 59, 54, 30, 86, 101, 29, 94, 84, 77, 73, 17, 57, 9, 83, 21, 85, 13, 60, 19, 119, 12, 118, 26, 110, 98, 20, 16, 111, 81, 41, 56, 103, 79, 90, 80, 37, 76, 102, 117, 15, 75, 27, 49, 34, 71, 36, 43, 78, 47, 62, 46, 91, 74, 107, 104, 69, 11, 100, 96, 24, 14, 28, 7, 64, 31, 39, 0, 95, 5, 105, 10, 72, 93, 68, 88, 97, 2, 4, 33, 70, 3, 6, 35, 99, 66, 67, 65, 8, 1], [127, 50, 48, 115, 109, 114, 112, 55, 51, 45, 61, 108, 121, 44, 120, 53, 52, 58, 25, 124, 23, 38, 123, 92, 63, 87, 106, 89, 42, 59, 113, 60, 122, 54, 125, 18, 116, 82, 126, 32, 40, 22, 118, 83, 111, 119, 19, 57, 77, 56, 9, 29, 20, 26, 101, 73, 102, 12, 94, 86, 30, 85, 81, 84, 13, 17, 49, 117, 80, 16, 36, 76, 98, 75, 21, 27, 103, 41, 110, 47, 104, 90, 71, 37, 62, 78, 79, 91, 95, 15, 34, 105, 46, 43, 11, 31, 39, 69, 14, 74, 5, 7, 88, 64, 96, 100, 107, 10, 28, 70, 93, 24, 33, 0, 2, 68, 4, 66, 97, 72, 3, 99, 6, 35, 67, 65, 1, 8], [127, 50, 48, 114, 115, 112, 109, 51, 55, 108, 61, 45, 121, 44, 53, 120, 58, 52, 123, 38, 25, 124, 23, 89, 87, 116, 113, 125, 106, 18, 60, 82, 63, 92, 42, 126, 22, 40, 122, 32, 9, 73, 59, 101, 77, 13, 54, 12, 83, 30, 84, 94, 81, 110, 85, 19, 29, 57, 17, 119, 86, 26, 80, 111, 16, 76, 36, 56, 21, 49, 20, 117, 41, 15, 102, 79, 75, 71, 98, 46, 37, 34, 103, 69, 74, 27, 39, 91, 5, 0, 7, 11, 90, 78, 104, 118, 14, 96, 62, 24, 10, 47, 95, 31, 70, 105, 28, 64, 100, 72, 2, 43, 4, 66, 67, 107, 88, 93, 97, 3, 33, 68, 35, 6, 99, 65, 1, 8], [127, 50, 114, 115, 48, 112, 109, 51, 108, 45, 61, 44, 55, 121, 53, 120, 58, 52, 124, 38, 23, 25, 63, 123, 113, 89, 42, 92, 18, 22, 122, 87, 82, 60, 116, 106, 40, 125, 59, 126, 73, 77, 32, 54, 12, 13, 83, 94, 9, 101, 85, 81, 110, 29, 30, 17, 16, 19, 111, 86, 20, 76, 80, 119, 75, 26, 37, 56, 36, 117, 62, 49, 98, 102, 41, 84, 90, 57, 71, 118, 27, 79, 104, 103, 34, 69, 21, 15, 70, 11, 46, 64, 14, 7, 5, 10, 47, 105, 95, 74, 78, 72, 43, 39, 2, 91, 88, 96, 31, 28, 100, 0, 66, 107, 24, 4, 33, 3, 93, 67, 97, 99, 68, 35, 6, 65, 1, 8], [127, 50, 48, 115, 114, 112, 109, 51, 55, 108, 61, 45, 44, 53, 121, 120, 58, 52, 25, 124, 38, 123, 23, 42, 89, 113, 87, 92, 22, 63, 82, 18, 106, 60, 122, 126, 116, 59, 40, 125, 77, 17, 32, 13, 73, 56, 94, 9, 86, 83, 81, 29, 12, 19, 80, 101, 54, 16, 30, 84, 20, 85, 21, 102, 26, 76, 111, 57, 75, 117, 98, 36, 62, 79, 47, 118, 41, 110, 15, 71, 90, 46, 7, 103, 78, 34, 14, 49, 27, 74, 37, 119, 11, 105, 39, 69, 91, 104, 70, 31, 5, 64, 100, 10, 95, 96, 88, 43, 72, 24, 33, 107, 68, 0, 2, 93, 28, 4, 66, 6, 67, 3, 97, 99, 35, 8, 65, 1], [127, 50, 48, 115, 114, 51, 109, 112, 45, 108, 61, 55, 121, 44, 53, 120, 58, 52, 25, 123, 23, 124, 38, 42, 87, 106, 89, 116, 60, 113, 126, 18, 125, 92, 122, 82, 40, 63, 22, 77, 94, 29, 30, 73, 59, 9, 32, 101, 17, 86, 13, 12, 83, 81, 80, 20, 56, 54, 85, 26, 76, 111, 118, 84, 57, 21, 16, 98, 19, 41, 102, 62, 110, 117, 36, 71, 7, 37, 34, 49, 75, 47, 79, 15, 69, 27, 119, 90, 78, 10, 104, 64, 0, 24, 91, 11, 39, 14, 95, 96, 74, 5, 93, 46, 31, 103, 107, 43, 6, 33, 105, 28, 88, 67, 72, 100, 2, 66, 3, 68, 70, 4, 35, 8, 97, 99, 1, 65], [127, 50, 48, 115, 114, 109, 112, 51, 55, 108, 45, 61, 121, 44, 53, 120, 52, 58, 25, 38, 123, 23, 124, 42, 87, 89, 92, 113, 22, 82, 60, 106, 18, 63, 40, 116, 125, 13, 126, 29, 59, 122, 32, 9, 54, 83, 57, 12, 94, 73, 81, 77, 26, 101, 20, 16, 86, 19, 30, 17, 85, 111, 56, 75, 84, 80, 21, 102, 15, 76, 36, 110, 117, 41, 98, 90, 78, 14, 62, 118, 79, 37, 10, 11, 49, 7, 91, 119, 47, 71, 27, 24, 103, 31, 43, 46, 74, 96, 34, 95, 100, 69, 5, 105, 104, 39, 6, 64, 28, 88, 93, 107, 0, 67, 72, 3, 4, 68, 33, 2, 70, 97, 66, 8, 35, 99, 1, 65]], "model.layers.25.self_attn.q_proj": [[62, 102, 124, 121, 56, 60, 97, 47, 50, 63, 38, 93, 114, 24, 127, 117, 95, 86, 105, 53, 89, 41, 35, 29, 43, 106, 115, 51, 52, 22, 37, 49, 103, 122, 20, 25, 123, 59, 87, 55, 33, 83, 91, 116, 57, 58, 100, 36, 99, 79, 27, 26, 46, 113, 111, 92, 108, 112, 109, 94, 61, 48, 125, 82, 19, 110, 54, 34, 42, 17, 39, 44, 12, 119, 126, 18, 118, 80, 120, 101, 104, 40, 45, 77, 98, 32, 16, 96, 28, 30, 107, 31, 90, 88, 21, 23, 81, 85, 70, 75, 84, 74, 72, 9, 4, 78, 15, 2, 14, 8, 76, 13, 67, 11, 0, 5, 66, 71, 10, 7, 73, 6, 68, 65, 3, 1, 64, 69], [102, 62, 93, 121, 124, 24, 97, 83, 56, 35, 60, 127, 85, 81, 13, 21, 17, 63, 86, 20, 22, 98, 36, 25, 123, 114, 92, 19, 120, 77, 84, 46, 74, 78, 48, 116, 111, 76, 52, 49, 11, 95, 33, 30, 32, 79, 115, 1, 38, 9, 71, 28, 122, 88, 34, 69, 31, 12, 104, 75, 101, 7, 66, 80, 108, 0, 43, 99, 117, 67, 29, 4, 72, 107, 8, 90, 113, 70, 58, 96, 23, 119, 103, 27, 26, 37, 18, 59, 54, 87, 14, 15, 126, 89, 16, 91, 94, 105, 53, 125, 10, 40, 106, 2, 51, 6, 118, 39, 82, 68, 42, 100, 73, 57, 50, 44, 3, 61, 65, 47, 41, 110, 55, 64, 45, 109, 112, 5], [62, 124, 102, 60, 97, 122, 123, 116, 93, 38, 44, 117, 43, 127, 59, 121, 87, 48, 114, 53, 118, 24, 89, 54, 39, 29, 58, 46, 94, 56, 40, 25, 82, 119, 57, 95, 106, 50, 125, 108, 99, 51, 49, 20, 109, 55, 47, 86, 126, 52, 111, 112, 63, 113, 45, 41, 91, 98, 104, 42, 110, 22, 79, 61, 115, 120, 103, 100, 31, 27, 92, 107, 17, 23, 105, 37, 35, 76, 85, 36, 34, 90, 101, 80, 32, 33, 96, 26, 28, 84, 12, 30, 13, 21, 18, 83, 19, 77, 15, 81, 16, 8, 9, 88, 74, 78, 7, 6, 73, 11, 75, 14, 70, 71, 10, 4, 5, 72, 68, 69, 2, 3, 67, 1, 66, 65, 64, 0], [62, 121, 102, 117, 124, 114, 60, 49, 50, 53, 116, 97, 123, 43, 38, 108, 122, 93, 45, 63, 127, 94, 59, 113, 104, 46, 58, 25, 44, 87, 125, 106, 61, 52, 54, 109, 17, 31, 118, 111, 56, 48, 47, 103, 33, 42, 119, 51, 120, 112, 34, 126, 29, 110, 107, 115, 57, 41, 40, 55, 89, 28, 105, 39, 22, 99, 37, 27, 98, 95, 82, 85, 90, 86, 100, 35, 20, 30, 26, 91, 36, 101, 24, 79, 96, 76, 80, 32, 19, 18, 23, 12, 92, 21, 8, 11, 84, 75, 77, 64, 1, 78, 10, 6, 16, 14, 0, 83, 4, 66, 74, 68, 69, 81, 67, 72, 7, 2, 70, 5, 71, 9, 65, 13, 73, 88, 3, 15], [58, 126, 117, 39, 111, 124, 59, 48, 51, 112, 25, 125, 54, 127, 47, 118, 123, 35, 62, 21, 55, 57, 89, 113, 96, 49, 63, 50, 60, 53, 61, 119, 116, 56, 29, 91, 45, 120, 110, 87, 52, 23, 114, 82, 42, 121, 122, 115, 46, 85, 109, 44, 108, 80, 27, 76, 107, 20, 105, 14, 93, 106, 83, 99, 41, 40, 43, 104, 38, 102, 9, 103, 28, 18, 31, 32, 16, 101, 36, 22, 95, 100, 79, 24, 8, 88, 81, 12, 13, 4, 84, 78, 92, 33, 71, 90, 37, 86, 97, 10, 69, 6, 94, 34, 3, 11, 30, 74, 75, 98, 73, 19, 77, 26, 66, 7, 70, 72, 17, 15, 2, 5, 0, 68, 64, 67, 1, 65], [111, 117, 58, 126, 39, 124, 59, 51, 48, 118, 25, 21, 50, 57, 54, 127, 55, 91, 125, 62, 112, 49, 113, 47, 63, 53, 110, 96, 61, 52, 119, 123, 116, 35, 56, 60, 120, 29, 87, 89, 23, 42, 121, 115, 45, 114, 46, 85, 82, 44, 108, 122, 109, 80, 76, 27, 107, 14, 106, 93, 41, 37, 105, 9, 102, 38, 43, 83, 104, 88, 40, 8, 20, 99, 100, 18, 24, 12, 28, 31, 10, 13, 81, 79, 32, 71, 101, 92, 103, 36, 69, 97, 30, 78, 94, 4, 33, 6, 16, 70, 22, 3, 90, 98, 95, 34, 86, 84, 26, 73, 19, 75, 74, 7, 11, 66, 5, 72, 77, 17, 65, 64, 15, 2, 67, 1, 68, 0], [117, 126, 39, 58, 111, 124, 59, 51, 48, 112, 21, 25, 57, 54, 127, 118, 62, 29, 55, 110, 50, 125, 49, 53, 35, 96, 91, 123, 63, 52, 61, 119, 113, 56, 116, 89, 120, 60, 45, 47, 87, 115, 121, 114, 82, 23, 80, 46, 44, 108, 85, 122, 42, 109, 107, 27, 76, 83, 14, 105, 102, 40, 38, 43, 9, 103, 28, 41, 106, 104, 20, 93, 8, 99, 32, 10, 13, 31, 37, 24, 36, 18, 81, 79, 12, 88, 70, 100, 16, 71, 92, 78, 101, 34, 22, 84, 33, 90, 94, 3, 7, 6, 97, 86, 69, 95, 73, 11, 30, 4, 75, 98, 19, 74, 72, 77, 66, 26, 15, 17, 2, 5, 64, 1, 0, 68, 65, 67], [126, 117, 39, 58, 111, 124, 51, 59, 48, 21, 25, 112, 54, 57, 62, 55, 127, 89, 125, 96, 63, 118, 29, 123, 50, 53, 113, 35, 110, 49, 61, 52, 60, 119, 120, 56, 116, 87, 91, 47, 23, 42, 45, 121, 44, 114, 80, 115, 122, 85, 46, 82, 108, 109, 83, 14, 76, 107, 27, 20, 40, 105, 102, 9, 8, 103, 106, 43, 38, 104, 41, 99, 93, 32, 18, 31, 4, 28, 79, 37, 95, 36, 24, 16, 84, 10, 100, 13, 22, 101, 81, 12, 92, 33, 90, 71, 78, 97, 30, 69, 70, 86, 88, 34, 94, 6, 11, 19, 98, 73, 3, 75, 74, 26, 7, 72, 77, 15, 17, 0, 66, 5, 64, 68, 65, 1, 2, 67], [40, 59, 34, 90, 83, 12, 53, 91, 63, 88, 17, 95, 45, 112, 85, 31, 79, 9, 124, 123, 13, 6, 108, 119, 113, 5, 109, 74, 126, 72, 99, 93, 96, 15, 78, 114, 62, 58, 47, 28, 29, 61, 55, 8, 73, 107, 18, 97, 52, 35, 41, 20, 86, 48, 14, 33, 87, 76, 94, 127, 122, 92, 16, 50, 121, 11, 98, 82, 116, 24, 101, 57, 110, 77, 120, 103, 10, 51, 71, 111, 49, 38, 60, 125, 54, 115, 42, 56, 105, 118, 106, 80, 23, 44, 46, 69, 100, 26, 117, 32, 39, 30, 102, 43, 36, 81, 89, 7, 37, 27, 22, 21, 84, 19, 25, 75, 70, 3, 2, 67, 4, 68, 64, 1, 66, 104, 0, 65], [40, 59, 34, 90, 124, 88, 83, 13, 63, 126, 79, 17, 85, 6, 95, 50, 121, 118, 53, 48, 46, 127, 12, 45, 10, 91, 106, 103, 22, 29, 4, 72, 116, 68, 74, 96, 60, 98, 55, 99, 112, 11, 58, 9, 125, 35, 69, 41, 24, 115, 120, 43, 31, 37, 61, 122, 62, 0, 39, 33, 113, 123, 105, 110, 78, 21, 77, 84, 107, 51, 67, 20, 7, 87, 54, 119, 28, 38, 47, 14, 73, 114, 82, 93, 26, 75, 117, 32, 94, 66, 18, 42, 56, 108, 111, 52, 44, 36, 1, 109, 2, 8, 102, 15, 30, 70, 23, 19, 71, 57, 86, 92, 49, 76, 81, 100, 101, 80, 27, 89, 97, 5, 16, 25, 104, 3, 65, 64], [40, 59, 34, 90, 85, 83, 127, 124, 63, 53, 17, 48, 88, 110, 116, 118, 79, 95, 13, 113, 29, 58, 96, 91, 31, 42, 111, 121, 12, 24, 11, 50, 94, 115, 98, 38, 45, 60, 10, 112, 47, 14, 114, 103, 125, 22, 55, 108, 25, 35, 44, 119, 109, 100, 36, 9, 57, 56, 86, 117, 102, 49, 26, 39, 72, 28, 6, 71, 61, 101, 23, 89, 52, 43, 27, 122, 41, 21, 51, 33, 123, 62, 30, 126, 54, 106, 105, 87, 37, 99, 46, 97, 92, 107, 84, 93, 120, 20, 0, 18, 104, 81, 16, 4, 82, 76, 78, 32, 15, 19, 80, 1, 7, 68, 74, 69, 75, 2, 8, 77, 73, 70, 66, 3, 5, 67, 65, 64], [40, 59, 34, 90, 88, 83, 124, 79, 17, 13, 95, 53, 12, 118, 9, 58, 126, 72, 85, 24, 63, 81, 4, 48, 114, 86, 2, 71, 11, 6, 45, 44, 112, 121, 75, 103, 116, 1, 50, 10, 105, 14, 0, 110, 3, 31, 69, 60, 78, 42, 46, 106, 39, 120, 74, 67, 115, 54, 32, 29, 43, 26, 98, 62, 8, 51, 113, 68, 52, 7, 20, 22, 21, 80, 91, 65, 92, 107, 57, 35, 37, 47, 56, 127, 84, 18, 89, 125, 16, 77, 15, 19, 94, 41, 96, 82, 108, 70, 55, 25, 109, 104, 76, 28, 49, 27, 61, 64, 93, 97, 23, 36, 100, 33, 99, 122, 111, 101, 66, 30, 119, 38, 73, 123, 102, 117, 87, 5], [58, 61, 127, 56, 38, 120, 125, 53, 89, 52, 115, 112, 107, 50, 83, 122, 57, 106, 124, 59, 116, 85, 60, 119, 23, 114, 118, 121, 123, 117, 55, 126, 87, 48, 99, 46, 96, 102, 49, 63, 110, 54, 62, 51, 111, 47, 76, 109, 113, 29, 45, 94, 41, 18, 44, 91, 42, 16, 28, 43, 36, 25, 104, 108, 103, 92, 78, 95, 40, 90, 82, 105, 80, 100, 35, 84, 39, 14, 17, 26, 34, 73, 70, 69, 12, 71, 32, 21, 13, 22, 37, 3, 74, 7, 27, 101, 20, 97, 98, 24, 31, 11, 33, 86, 79, 9, 30, 93, 8, 15, 88, 68, 72, 75, 5, 19, 10, 77, 81, 66, 4, 2, 6, 1, 64, 67, 65, 0], [56, 127, 61, 58, 38, 120, 89, 125, 53, 52, 50, 115, 59, 57, 124, 83, 114, 107, 119, 116, 112, 85, 106, 122, 118, 60, 55, 121, 123, 117, 126, 102, 23, 99, 63, 51, 54, 49, 110, 48, 46, 62, 96, 45, 29, 111, 47, 87, 94, 109, 113, 18, 25, 43, 76, 16, 41, 95, 44, 42, 90, 36, 91, 108, 104, 103, 92, 28, 40, 35, 80, 105, 73, 78, 84, 100, 39, 21, 82, 17, 70, 32, 71, 37, 13, 31, 98, 14, 8, 74, 26, 97, 22, 12, 101, 79, 24, 34, 27, 86, 11, 3, 15, 7, 20, 30, 33, 69, 77, 93, 68, 9, 72, 10, 19, 88, 81, 5, 0, 2, 6, 75, 4, 67, 1, 64, 65, 66], [61, 127, 58, 38, 56, 125, 120, 53, 89, 52, 115, 107, 50, 124, 83, 57, 59, 119, 116, 114, 85, 112, 121, 118, 126, 60, 122, 55, 123, 46, 23, 117, 48, 49, 106, 110, 62, 99, 63, 51, 102, 96, 54, 94, 87, 111, 47, 42, 29, 45, 113, 109, 16, 18, 25, 76, 43, 28, 41, 44, 108, 103, 40, 78, 95, 36, 90, 105, 104, 91, 80, 92, 39, 84, 82, 100, 73, 34, 35, 17, 21, 37, 14, 32, 71, 13, 12, 26, 22, 33, 70, 101, 97, 20, 74, 86, 11, 27, 98, 79, 68, 8, 15, 10, 24, 30, 93, 31, 9, 72, 77, 3, 69, 7, 88, 19, 75, 6, 81, 5, 4, 0, 65, 2, 64, 66, 1, 67], [127, 61, 56, 38, 58, 120, 89, 125, 53, 52, 57, 50, 115, 119, 59, 124, 114, 83, 116, 55, 107, 112, 23, 85, 123, 122, 121, 63, 118, 126, 99, 60, 62, 110, 117, 29, 48, 49, 106, 96, 94, 102, 87, 51, 54, 46, 45, 111, 47, 113, 109, 16, 18, 25, 43, 42, 44, 76, 90, 41, 91, 28, 108, 36, 92, 103, 104, 84, 40, 82, 14, 105, 26, 37, 35, 80, 100, 34, 95, 32, 39, 17, 12, 22, 69, 21, 3, 78, 13, 97, 70, 71, 101, 73, 74, 24, 98, 11, 20, 8, 86, 27, 33, 31, 30, 79, 93, 68, 72, 7, 10, 9, 15, 19, 81, 88, 77, 5, 75, 2, 4, 1, 6, 65, 66, 0, 64, 67], [42, 98, 106, 112, 26, 36, 21, 49, 118, 16, 94, 18, 54, 22, 48, 31, 60, 14, 13, 95, 101, 77, 93, 9, 123, 116, 88, 90, 114, 125, 58, 38, 113, 127, 53, 92, 15, 19, 8, 63, 47, 20, 52, 17, 99, 61, 124, 46, 12, 55, 111, 23, 73, 29, 27, 120, 115, 117, 84, 28, 122, 87, 10, 105, 51, 85, 33, 72, 24, 109, 104, 44, 34, 97, 74, 126, 45, 102, 43, 40, 30, 75, 59, 62, 107, 50, 4, 39, 100, 81, 108, 119, 121, 57, 41, 76, 2, 83, 35, 110, 71, 89, 64, 103, 37, 56, 67, 80, 7, 32, 91, 11, 82, 79, 25, 86, 96, 6, 0, 3, 78, 5, 1, 69, 65, 66, 70, 68], [42, 118, 98, 36, 106, 26, 18, 21, 124, 116, 63, 49, 111, 127, 54, 112, 123, 52, 88, 94, 24, 122, 126, 48, 31, 22, 99, 117, 53, 110, 55, 121, 58, 16, 38, 101, 20, 92, 47, 62, 33, 59, 61, 90, 95, 12, 51, 119, 60, 108, 46, 109, 8, 57, 29, 30, 120, 114, 28, 50, 44, 93, 11, 14, 113, 86, 125, 115, 107, 100, 39, 43, 77, 105, 56, 4, 34, 79, 97, 103, 40, 41, 27, 104, 45, 15, 37, 82, 102, 35, 23, 2, 10, 13, 96, 85, 74, 32, 75, 91, 25, 89, 0, 7, 87, 83, 84, 17, 19, 1, 73, 80, 71, 9, 81, 76, 6, 64, 67, 68, 5, 65, 78, 72, 66, 69, 70, 3], [42, 98, 118, 126, 106, 36, 26, 18, 112, 127, 24, 21, 48, 88, 46, 31, 41, 94, 63, 116, 20, 113, 125, 62, 53, 11, 54, 7, 8, 16, 52, 38, 49, 12, 124, 121, 123, 99, 111, 122, 39, 60, 4, 2, 50, 58, 109, 114, 120, 86, 117, 95, 90, 47, 30, 119, 57, 61, 1, 55, 33, 59, 79, 14, 56, 77, 37, 44, 51, 115, 103, 101, 92, 91, 107, 43, 75, 29, 6, 64, 102, 40, 110, 97, 105, 22, 108, 9, 80, 23, 45, 89, 19, 96, 93, 27, 82, 13, 67, 0, 104, 35, 83, 34, 17, 69, 85, 100, 32, 10, 74, 73, 25, 5, 76, 28, 84, 15, 81, 66, 87, 72, 65, 68, 71, 78, 3, 70], [42, 98, 106, 26, 31, 20, 16, 21, 18, 36, 112, 54, 75, 14, 77, 113, 9, 73, 49, 94, 79, 76, 15, 93, 116, 123, 118, 90, 101, 70, 10, 7, 19, 8, 1, 60, 53, 48, 6, 69, 58, 84, 74, 3, 34, 127, 78, 107, 114, 71, 99, 88, 0, 17, 2, 41, 105, 125, 67, 87, 109, 81, 25, 68, 119, 11, 28, 23, 22, 13, 27, 80, 40, 12, 64, 91, 82, 37, 72, 52, 4, 61, 47, 83, 62, 85, 38, 92, 24, 33, 32, 122, 44, 66, 115, 111, 46, 39, 86, 65, 104, 110, 102, 108, 120, 45, 63, 50, 96, 89, 5, 29, 55, 56, 95, 126, 57, 117, 59, 97, 100, 35, 121, 51, 124, 43, 30, 103], [39, 63, 64, 0, 66, 1, 12, 121, 3, 4, 2, 69, 79, 77, 9, 23, 87, 74, 70, 122, 43, 49, 7, 108, 127, 42, 83, 85, 53, 46, 67, 118, 48, 61, 18, 115, 31, 58, 93, 103, 113, 68, 45, 59, 105, 16, 99, 40, 47, 124, 52, 60, 75, 17, 20, 78, 89, 97, 19, 71, 114, 100, 112, 111, 33, 65, 55, 11, 84, 126, 25, 50, 57, 5, 88, 51, 116, 81, 6, 90, 102, 120, 98, 117, 14, 86, 21, 30, 109, 34, 32, 41, 92, 94, 80, 13, 10, 27, 28, 106, 73, 38, 56, 26, 123, 76, 96, 82, 101, 35, 29, 54, 15, 119, 36, 107, 22, 37, 8, 110, 91, 72, 104, 95, 125, 44, 24, 62], [39, 63, 87, 122, 79, 77, 121, 74, 12, 118, 9, 31, 46, 69, 4, 66, 83, 127, 23, 115, 108, 3, 17, 70, 64, 1, 93, 49, 100, 18, 43, 75, 20, 41, 52, 15, 72, 90, 85, 58, 42, 89, 7, 82, 16, 48, 78, 59, 13, 60, 113, 27, 124, 109, 50, 10, 94, 8, 112, 80, 92, 95, 125, 99, 106, 35, 67, 96, 117, 53, 114, 97, 81, 29, 19, 24, 91, 123, 55, 86, 11, 104, 56, 126, 111, 36, 21, 105, 101, 57, 71, 6, 65, 116, 62, 119, 54, 61, 45, 51, 26, 22, 33, 84, 47, 40, 120, 110, 88, 37, 38, 28, 14, 73, 34, 107, 25, 102, 44, 30, 98, 76, 68, 0, 5, 32, 2, 103], [39, 63, 64, 0, 1, 121, 12, 74, 4, 3, 23, 9, 69, 79, 77, 66, 87, 70, 2, 122, 43, 108, 49, 42, 127, 83, 46, 53, 93, 7, 68, 99, 48, 52, 61, 67, 5, 118, 40, 103, 31, 18, 85, 59, 113, 45, 58, 112, 114, 6, 65, 81, 60, 55, 120, 105, 51, 100, 116, 88, 71, 115, 25, 90, 126, 80, 34, 91, 22, 86, 14, 33, 11, 124, 17, 16, 78, 47, 72, 73, 20, 101, 94, 56, 123, 57, 37, 19, 89, 75, 8, 111, 35, 106, 109, 82, 29, 10, 36, 26, 97, 62, 27, 98, 96, 54, 28, 24, 92, 84, 119, 125, 117, 38, 104, 30, 76, 32, 95, 41, 102, 50, 107, 44, 13, 15, 110, 21], [39, 63, 74, 79, 9, 87, 77, 12, 4, 121, 3, 66, 64, 23, 1, 70, 69, 43, 108, 18, 72, 127, 93, 49, 42, 2, 122, 46, 67, 20, 31, 115, 40, 83, 5, 113, 48, 7, 53, 118, 61, 27, 68, 60, 45, 6, 19, 125, 100, 97, 52, 24, 114, 16, 124, 111, 71, 112, 58, 81, 47, 89, 13, 85, 17, 34, 10, 99, 86, 8, 14, 78, 76, 73, 59, 26, 82, 90, 51, 92, 33, 84, 120, 11, 75, 0, 65, 119, 29, 22, 126, 57, 62, 32, 80, 28, 95, 30, 117, 56, 88, 25, 116, 15, 94, 37, 105, 36, 104, 91, 109, 21, 102, 98, 41, 35, 50, 123, 55, 106, 101, 107, 44, 96, 110, 38, 54, 103], [106, 36, 85, 42, 63, 83, 77, 113, 80, 15, 6, 73, 122, 124, 91, 74, 119, 3, 71, 48, 27, 115, 68, 58, 127, 0, 2, 76, 25, 98, 50, 112, 95, 66, 32, 57, 118, 67, 93, 69, 94, 75, 12, 61, 9, 87, 65, 56, 5, 14, 1, 10, 19, 29, 22, 121, 79, 126, 4, 16, 11, 81, 13, 84, 125, 8, 54, 37, 64, 109, 7, 110, 88, 21, 116, 114, 70, 39, 46, 30, 120, 105, 33, 41, 18, 99, 55, 20, 23, 123, 86, 78, 59, 72, 102, 90, 52, 51, 34, 49, 82, 104, 117, 107, 26, 89, 96, 24, 53, 111, 45, 28, 40, 31, 17, 97, 44, 60, 35, 108, 62, 103, 43, 101, 47, 92, 38, 100], [106, 36, 63, 42, 122, 85, 91, 58, 83, 124, 48, 80, 113, 77, 50, 27, 61, 112, 119, 15, 93, 116, 29, 25, 87, 98, 115, 120, 95, 73, 59, 32, 74, 114, 46, 123, 51, 57, 35, 71, 45, 109, 117, 126, 44, 2, 121, 54, 94, 30, 96, 118, 127, 52, 47, 111, 65, 56, 125, 108, 60, 68, 105, 49, 110, 6, 41, 62, 90, 55, 26, 21, 39, 99, 86, 17, 53, 10, 81, 8, 38, 107, 11, 101, 43, 19, 78, 104, 40, 103, 75, 33, 82, 66, 102, 18, 20, 37, 88, 5, 14, 0, 23, 79, 97, 92, 24, 31, 16, 76, 28, 34, 89, 22, 3, 12, 84, 64, 72, 7, 13, 100, 69, 1, 9, 4, 70, 67], [106, 36, 63, 42, 122, 113, 85, 91, 50, 83, 27, 80, 117, 61, 48, 124, 25, 112, 29, 118, 98, 87, 93, 52, 123, 77, 95, 60, 15, 119, 32, 59, 58, 127, 115, 57, 96, 90, 53, 126, 51, 120, 17, 73, 54, 35, 49, 40, 94, 114, 41, 109, 45, 46, 44, 121, 55, 111, 116, 21, 110, 62, 47, 105, 74, 71, 56, 30, 125, 108, 99, 39, 107, 38, 88, 103, 24, 43, 6, 104, 11, 86, 23, 101, 78, 102, 37, 34, 20, 19, 97, 5, 79, 31, 75, 28, 8, 33, 14, 81, 22, 92, 68, 89, 100, 84, 10, 16, 66, 26, 12, 2, 82, 0, 13, 76, 69, 72, 18, 65, 3, 7, 4, 9, 67, 70, 1, 64], [106, 36, 122, 42, 85, 124, 113, 48, 91, 50, 83, 127, 93, 58, 63, 59, 27, 98, 80, 118, 32, 126, 77, 47, 15, 87, 29, 25, 73, 95, 35, 115, 57, 109, 119, 96, 39, 90, 54, 114, 17, 112, 55, 123, 99, 56, 94, 21, 71, 41, 117, 30, 74, 45, 11, 51, 61, 38, 6, 75, 52, 121, 49, 103, 5, 68, 46, 62, 120, 34, 111, 2, 116, 40, 60, 125, 110, 53, 108, 105, 86, 18, 24, 33, 44, 43, 107, 97, 81, 102, 104, 26, 19, 12, 88, 37, 101, 79, 10, 72, 89, 78, 8, 20, 31, 82, 92, 16, 28, 14, 23, 84, 0, 76, 3, 22, 65, 66, 13, 69, 100, 7, 9, 4, 70, 64, 67, 1], [108, 111, 93, 44, 24, 14, 125, 116, 15, 72, 85, 74, 82, 19, 121, 98, 76, 6, 47, 124, 81, 36, 63, 20, 35, 37, 119, 95, 1, 25, 11, 87, 3, 33, 67, 68, 126, 38, 97, 57, 64, 27, 34, 100, 29, 55, 77, 60, 96, 13, 51, 65, 40, 4, 62, 49, 70, 105, 123, 75, 52, 5, 30, 0, 79, 43, 73, 86, 41, 92, 71, 9, 56, 46, 113, 88, 32, 117, 112, 80, 31, 50, 7, 109, 103, 127, 84, 53, 107, 23, 10, 48, 104, 26, 106, 114, 45, 12, 115, 122, 83, 120, 90, 78, 2, 61, 89, 21, 17, 18, 99, 42, 28, 101, 110, 102, 59, 118, 54, 8, 94, 66, 22, 16, 58, 69, 39, 91], [108, 44, 111, 93, 74, 14, 24, 116, 72, 124, 125, 82, 76, 85, 19, 68, 15, 6, 36, 1, 98, 64, 13, 47, 87, 81, 37, 63, 4, 20, 126, 121, 100, 65, 29, 53, 3, 67, 35, 119, 46, 7, 51, 60, 43, 49, 11, 34, 123, 57, 95, 2, 50, 80, 41, 113, 0, 55, 62, 86, 101, 56, 127, 69, 45, 88, 70, 40, 73, 99, 75, 104, 97, 38, 96, 90, 92, 117, 105, 9, 5, 31, 27, 58, 10, 114, 61, 18, 23, 21, 12, 89, 25, 71, 33, 42, 83, 66, 102, 103, 77, 120, 107, 30, 112, 79, 54, 8, 122, 59, 78, 118, 28, 115, 48, 84, 16, 110, 106, 91, 26, 22, 32, 39, 52, 94, 17, 109], [108, 111, 44, 125, 93, 24, 82, 15, 14, 76, 13, 74, 50, 68, 36, 116, 124, 72, 19, 98, 1, 6, 121, 113, 0, 87, 20, 86, 89, 67, 9, 64, 47, 37, 126, 95, 34, 53, 60, 35, 11, 71, 63, 4, 33, 32, 119, 92, 29, 100, 127, 41, 22, 3, 62, 84, 73, 83, 80, 55, 59, 49, 46, 123, 85, 70, 16, 57, 94, 23, 43, 7, 56, 88, 27, 65, 52, 115, 51, 8, 120, 90, 96, 105, 30, 26, 31, 102, 103, 69, 110, 40, 97, 61, 81, 91, 42, 17, 28, 114, 38, 18, 106, 104, 10, 117, 45, 21, 58, 118, 12, 107, 78, 101, 112, 109, 99, 75, 39, 122, 77, 25, 48, 5, 79, 66, 54, 2], [108, 111, 93, 44, 82, 24, 125, 14, 68, 36, 76, 6, 72, 116, 22, 13, 74, 64, 19, 67, 50, 1, 37, 0, 124, 87, 15, 35, 98, 75, 119, 85, 103, 113, 47, 121, 18, 57, 51, 11, 55, 49, 29, 70, 62, 63, 34, 3, 53, 95, 88, 9, 127, 112, 86, 25, 65, 10, 60, 46, 126, 59, 20, 100, 89, 5, 97, 52, 2, 81, 40, 90, 109, 115, 120, 38, 92, 61, 117, 94, 104, 83, 30, 12, 96, 4, 123, 31, 84, 78, 56, 26, 43, 66, 33, 110, 48, 23, 41, 122, 54, 21, 28, 42, 106, 102, 58, 91, 39, 99, 107, 32, 7, 118, 73, 16, 105, 45, 114, 101, 27, 69, 80, 17, 8, 79, 71, 77]], "model.layers.25.self_attn.k_proj": [[62, 38, 121, 33, 124, 56, 127, 29, 20, 60, 72, 13, 26, 79, 83, 63, 74, 17, 91, 16, 78, 50, 24, 86, 80, 4, 77, 52, 71, 49, 22, 32, 25, 19, 102, 100, 123, 53, 47, 6, 9, 28, 120, 5, 18, 93, 117, 113, 12, 75, 99, 115, 3, 46, 110, 2, 54, 126, 119, 36, 111, 82, 44, 42, 95, 70, 116, 40, 34, 23, 57, 30, 87, 61, 118, 59, 43, 81, 8, 48, 92, 125, 58, 45, 106, 94, 11, 51, 41, 122, 88, 108, 0, 112, 107, 105, 55, 103, 65, 109, 37, 114, 21, 101, 31, 27, 89, 67, 10, 96, 104, 97, 85, 98, 35, 39, 68, 90, 1, 15, 84, 73, 14, 64, 7, 76, 69, 66], [103, 22, 99, 117, 126, 58, 32, 111, 59, 124, 51, 119, 60, 48, 62, 54, 93, 92, 114, 127, 61, 57, 123, 116, 56, 49, 120, 52, 55, 121, 63, 110, 46, 112, 50, 125, 122, 118, 113, 30, 53, 89, 43, 109, 44, 86, 115, 45, 41, 108, 42, 100, 104, 47, 40, 37, 27, 106, 107, 38, 35, 91, 105, 82, 34, 84, 97, 101, 80, 81, 14, 29, 79, 102, 21, 16, 36, 33, 90, 26, 94, 83, 78, 20, 18, 87, 76, 98, 31, 28, 19, 74, 24, 11, 85, 12, 95, 23, 25, 6, 7, 39, 77, 17, 96, 5, 8, 13, 75, 72, 15, 88, 68, 10, 67, 73, 2, 0, 1, 4, 9, 71, 69, 3, 66, 70, 65, 64], [59, 104, 98, 90, 83, 17, 13, 79, 72, 112, 63, 31, 12, 88, 24, 75, 118, 85, 0, 53, 50, 9, 124, 1, 4, 71, 121, 126, 14, 6, 86, 68, 81, 10, 67, 2, 125, 116, 111, 66, 123, 42, 55, 109, 41, 45, 11, 25, 57, 47, 52, 107, 46, 60, 7, 26, 28, 115, 20, 114, 49, 51, 95, 69, 48, 108, 58, 56, 18, 103, 84, 29, 74, 44, 91, 62, 94, 106, 22, 33, 54, 92, 122, 39, 21, 30, 119, 110, 82, 113, 117, 99, 16, 100, 105, 93, 89, 38, 8, 87, 23, 27, 43, 96, 80, 78, 102, 35, 120, 76, 97, 32, 3, 77, 101, 36, 37, 127, 70, 65, 61, 19, 15, 73, 5, 34, 64, 40], [102, 22, 61, 127, 35, 56, 58, 32, 125, 50, 52, 120, 60, 53, 30, 59, 98, 116, 111, 86, 49, 117, 55, 126, 115, 119, 124, 118, 110, 123, 54, 92, 121, 33, 63, 112, 122, 57, 48, 113, 16, 109, 62, 114, 51, 28, 43, 26, 46, 44, 47, 108, 42, 45, 100, 106, 36, 107, 105, 14, 18, 85, 27, 40, 99, 41, 37, 83, 104, 103, 101, 25, 11, 39, 89, 24, 34, 20, 77, 19, 29, 38, 23, 90, 91, 87, 82, 12, 17, 97, 94, 31, 93, 84, 74, 95, 72, 79, 96, 13, 5, 88, 73, 78, 15, 70, 8, 4, 10, 81, 67, 9, 80, 1, 6, 21, 76, 66, 7, 64, 75, 71, 2, 68, 0, 69, 3, 65], [106, 34, 30, 26, 49, 48, 118, 14, 64, 21, 88, 16, 42, 116, 18, 54, 123, 8, 6, 119, 20, 121, 60, 117, 53, 127, 114, 124, 55, 120, 31, 63, 4, 126, 105, 104, 87, 58, 67, 107, 10, 12, 2, 3, 28, 122, 99, 110, 74, 56, 47, 57, 125, 83, 111, 59, 1, 46, 109, 108, 62, 9, 52, 86, 43, 61, 51, 115, 112, 40, 44, 45, 98, 37, 113, 50, 79, 23, 17, 84, 65, 13, 101, 102, 39, 35, 103, 89, 22, 25, 41, 38, 77, 27, 73, 95, 100, 29, 75, 33, 11, 93, 19, 96, 68, 70, 32, 7, 36, 81, 91, 66, 97, 71, 15, 78, 69, 82, 94, 92, 5, 76, 0, 85, 72, 80, 24, 90], [63, 103, 64, 121, 9, 65, 1, 79, 12, 66, 4, 77, 69, 3, 87, 74, 70, 122, 44, 113, 107, 118, 110, 106, 67, 23, 127, 48, 52, 115, 18, 58, 68, 7, 20, 5, 85, 59, 81, 6, 40, 105, 50, 83, 31, 93, 61, 78, 53, 117, 47, 89, 39, 2, 33, 60, 27, 19, 124, 46, 43, 125, 75, 95, 109, 16, 17, 80, 84, 35, 10, 36, 29, 49, 126, 25, 72, 42, 90, 112, 55, 116, 11, 54, 92, 99, 98, 71, 56, 97, 73, 0, 21, 86, 111, 45, 15, 41, 37, 38, 82, 30, 51, 101, 8, 96, 62, 114, 24, 104, 91, 26, 14, 88, 119, 28, 120, 123, 100, 57, 94, 108, 34, 22, 32, 102, 76, 13], [42, 63, 122, 83, 85, 100, 112, 0, 119, 80, 15, 77, 91, 73, 124, 29, 115, 50, 57, 127, 66, 106, 25, 113, 96, 71, 6, 61, 87, 109, 34, 126, 121, 74, 65, 68, 3, 58, 75, 1, 49, 45, 11, 46, 2, 111, 56, 90, 51, 30, 44, 125, 120, 95, 8, 17, 81, 5, 14, 60, 4, 98, 31, 24, 108, 55, 23, 93, 41, 89, 59, 116, 27, 36, 110, 52, 12, 118, 102, 62, 86, 35, 10, 47, 94, 114, 104, 54, 48, 101, 105, 82, 43, 33, 18, 103, 78, 107, 40, 20, 38, 22, 99, 117, 53, 123, 28, 92, 7, 39, 84, 37, 26, 97, 76, 88, 32, 69, 13, 72, 64, 67, 16, 19, 70, 21, 79, 9], [44, 111, 108, 6, 0, 93, 47, 24, 14, 72, 76, 68, 65, 74, 64, 82, 2, 15, 53, 66, 125, 124, 50, 67, 116, 9, 85, 19, 87, 20, 81, 121, 119, 52, 13, 3, 57, 70, 75, 34, 101, 7, 11, 113, 73, 54, 86, 100, 62, 51, 127, 55, 80, 126, 120, 112, 10, 90, 16, 123, 104, 63, 60, 110, 49, 41, 61, 97, 45, 32, 107, 4, 96, 105, 89, 115, 95, 58, 59, 114, 22, 83, 88, 122, 79, 37, 36, 31, 5, 1, 109, 33, 56, 43, 117, 98, 46, 103, 118, 99, 94, 27, 39, 28, 23, 21, 42, 91, 35, 102, 29, 25, 40, 69, 38, 48, 106, 84, 30, 71, 26, 92, 18, 77, 12, 17, 78, 8]], "model.layers.25.self_attn.qk_proj": [[63, 59, 42, 62, 106, 44, 111, 58, 127, 124, 61, 108, 121, 126, 117, 56, 122, 15, 93, 64, 50, 0, 79, 102, 125, 60, 77, 19, 118, 53, 13, 85, 90, 104, 10, 12, 23, 83, 115, 21, 36, 38, 76, 24, 113, 87, 98, 26, 103, 123, 73, 112, 119, 9, 74, 88, 116, 49, 52, 27, 68, 48, 47, 120, 6, 82, 34, 65, 46, 18, 4, 2, 39, 55, 57, 114, 67, 80, 51, 16, 1, 78, 17, 66, 54, 70, 29, 14, 99, 3, 40, 8, 31, 81, 86, 72, 110, 95, 97, 89, 33, 20, 84, 43, 22, 35, 109, 41, 25, 107, 32, 94, 69, 91, 100, 96, 7, 5, 71, 11, 105, 37, 28, 30, 75, 45, 101, 92], [63, 59, 42, 62, 106, 44, 111, 127, 58, 61, 124, 121, 117, 108, 126, 56, 122, 0, 93, 102, 15, 50, 79, 64, 125, 77, 60, 19, 23, 118, 10, 21, 12, 112, 53, 103, 38, 76, 83, 13, 6, 90, 98, 115, 74, 119, 88, 113, 4, 24, 36, 9, 87, 116, 85, 104, 26, 49, 39, 48, 68, 73, 123, 120, 52, 47, 34, 65, 40, 2, 29, 82, 27, 57, 46, 1, 16, 66, 18, 55, 54, 72, 78, 14, 80, 17, 51, 3, 70, 99, 110, 114, 81, 89, 31, 22, 67, 97, 20, 109, 94, 8, 95, 43, 33, 25, 86, 107, 84, 41, 100, 5, 11, 75, 71, 96, 45, 28, 91, 69, 35, 105, 7, 30, 37, 92, 32, 101], [63, 59, 42, 62, 106, 44, 111, 127, 58, 61, 126, 108, 121, 124, 117, 56, 64, 93, 122, 125, 50, 79, 60, 0, 77, 118, 15, 83, 13, 112, 19, 119, 23, 90, 102, 36, 76, 21, 38, 98, 10, 74, 116, 115, 87, 24, 12, 120, 85, 73, 103, 9, 104, 113, 53, 49, 88, 4, 34, 48, 39, 6, 68, 82, 29, 26, 52, 66, 47, 27, 2, 1, 65, 72, 123, 3, 16, 18, 57, 70, 67, 80, 54, 51, 55, 46, 40, 78, 114, 14, 99, 17, 31, 89, 81, 110, 86, 35, 94, 22, 20, 109, 33, 84, 100, 25, 95, 107, 91, 96, 43, 69, 71, 105, 5, 8, 97, 92, 41, 28, 75, 32, 7, 11, 45, 30, 101, 37], [63, 59, 42, 106, 62, 44, 58, 111, 127, 61, 121, 108, 126, 117, 56, 124, 122, 125, 0, 93, 64, 50, 79, 53, 102, 60, 77, 15, 83, 115, 13, 112, 113, 19, 21, 10, 85, 118, 38, 12, 119, 74, 76, 87, 73, 23, 26, 98, 104, 34, 9, 36, 24, 88, 39, 6, 120, 52, 49, 116, 55, 4, 103, 68, 65, 66, 67, 72, 40, 90, 80, 48, 3, 47, 16, 46, 123, 114, 18, 54, 1, 78, 82, 27, 110, 57, 29, 70, 14, 99, 2, 17, 81, 51, 31, 89, 86, 94, 33, 20, 43, 22, 109, 45, 97, 69, 35, 84, 25, 100, 41, 5, 71, 8, 7, 95, 91, 96, 105, 107, 28, 75, 32, 92, 11, 30, 101, 37], [63, 59, 62, 42, 106, 44, 58, 111, 127, 121, 126, 61, 108, 56, 117, 124, 125, 122, 50, 93, 79, 60, 15, 0, 102, 64, 13, 77, 118, 119, 23, 83, 53, 113, 98, 38, 74, 85, 116, 104, 112, 76, 73, 19, 115, 12, 36, 90, 10, 21, 103, 123, 34, 4, 9, 49, 88, 55, 24, 52, 87, 26, 72, 46, 68, 120, 70, 48, 27, 39, 65, 18, 2, 57, 82, 1, 47, 16, 14, 99, 66, 114, 51, 81, 80, 6, 17, 54, 29, 40, 67, 78, 31, 3, 22, 110, 86, 25, 33, 94, 35, 95, 20, 107, 89, 84, 43, 109, 41, 97, 75, 69, 105, 45, 7, 101, 71, 5, 92, 96, 28, 30, 100, 91, 8, 32, 11, 37], [63, 59, 106, 62, 42, 44, 58, 111, 127, 121, 61, 124, 126, 108, 56, 117, 122, 93, 125, 79, 50, 64, 0, 15, 13, 102, 77, 83, 118, 115, 119, 36, 23, 85, 90, 21, 98, 87, 52, 12, 88, 19, 112, 38, 60, 113, 34, 10, 49, 104, 76, 73, 53, 103, 26, 4, 9, 116, 74, 24, 27, 72, 68, 70, 39, 17, 1, 14, 51, 16, 46, 81, 66, 57, 55, 47, 48, 6, 82, 2, 3, 40, 123, 18, 120, 80, 65, 99, 54, 114, 67, 22, 78, 86, 110, 97, 29, 84, 35, 31, 89, 94, 25, 41, 20, 32, 33, 109, 69, 107, 43, 100, 5, 75, 95, 7, 8, 105, 71, 96, 28, 11, 30, 45, 92, 91, 37, 101], [63, 59, 42, 106, 62, 44, 111, 127, 58, 121, 61, 108, 124, 126, 117, 56, 122, 79, 93, 102, 15, 0, 77, 64, 125, 19, 13, 85, 26, 118, 113, 60, 98, 38, 119, 21, 112, 83, 76, 116, 88, 74, 90, 104, 50, 12, 23, 103, 36, 1, 9, 49, 10, 53, 24, 52, 73, 57, 48, 39, 70, 34, 87, 18, 47, 46, 27, 16, 82, 115, 72, 2, 51, 81, 123, 80, 65, 68, 99, 14, 120, 67, 66, 17, 78, 55, 4, 54, 6, 29, 40, 3, 25, 22, 86, 31, 35, 114, 94, 41, 33, 110, 109, 89, 8, 100, 107, 84, 5, 20, 75, 97, 32, 95, 11, 28, 91, 7, 45, 30, 69, 71, 43, 105, 101, 37, 96, 92], [63, 59, 62, 42, 106, 44, 58, 111, 127, 121, 124, 108, 61, 117, 126, 56, 122, 93, 125, 0, 64, 79, 60, 15, 50, 118, 13, 77, 116, 112, 38, 21, 90, 102, 52, 83, 19, 23, 103, 26, 85, 119, 10, 48, 12, 36, 98, 113, 24, 53, 88, 76, 87, 74, 70, 57, 39, 27, 73, 65, 9, 49, 47, 104, 1, 51, 29, 4, 68, 34, 46, 18, 123, 115, 2, 55, 66, 82, 78, 16, 120, 54, 72, 81, 99, 80, 6, 110, 22, 95, 86, 14, 17, 40, 3, 67, 31, 33, 114, 89, 25, 94, 35, 8, 84, 97, 100, 107, 41, 43, 28, 20, 109, 45, 91, 69, 5, 96, 7, 30, 75, 32, 37, 11, 105, 71, 92, 101], [63, 59, 42, 106, 62, 44, 111, 58, 127, 124, 121, 61, 108, 117, 126, 56, 122, 93, 125, 0, 64, 15, 50, 79, 85, 13, 102, 77, 38, 90, 19, 118, 112, 83, 116, 21, 23, 26, 103, 60, 119, 113, 24, 88, 53, 76, 12, 10, 48, 52, 74, 68, 87, 73, 34, 46, 27, 39, 36, 9, 98, 47, 115, 4, 104, 51, 29, 49, 70, 1, 120, 65, 82, 123, 14, 80, 114, 55, 6, 17, 66, 16, 3, 54, 57, 67, 78, 81, 2, 18, 99, 110, 31, 22, 8, 86, 40, 25, 72, 89, 35, 100, 97, 95, 94, 20, 43, 33, 109, 91, 84, 107, 41, 69, 5, 45, 30, 96, 75, 28, 71, 32, 11, 7, 37, 92, 105, 101], [63, 59, 106, 42, 62, 44, 111, 58, 127, 121, 124, 108, 61, 117, 126, 56, 122, 0, 64, 93, 125, 119, 50, 15, 60, 79, 102, 53, 83, 116, 118, 77, 74, 13, 38, 90, 12, 120, 23, 112, 98, 76, 85, 10, 73, 103, 21, 19, 36, 113, 9, 88, 104, 115, 49, 68, 24, 46, 48, 6, 4, 87, 26, 70, 34, 47, 39, 1, 52, 65, 57, 66, 18, 54, 2, 8, 51, 3, 123, 80, 27, 29, 67, 55, 14, 17, 40, 81, 114, 82, 86, 16, 110, 72, 78, 97, 99, 31, 25, 22, 94, 33, 84, 43, 89, 20, 35, 7, 95, 69, 75, 100, 11, 28, 30, 32, 5, 71, 96, 107, 105, 45, 109, 41, 91, 37, 101, 92], [63, 59, 62, 42, 106, 44, 58, 111, 127, 124, 108, 121, 61, 117, 126, 56, 122, 93, 0, 64, 79, 15, 13, 125, 119, 50, 77, 102, 38, 116, 118, 83, 53, 60, 21, 10, 74, 85, 112, 12, 19, 23, 76, 98, 73, 49, 113, 104, 34, 36, 6, 90, 9, 115, 87, 88, 68, 103, 4, 48, 24, 120, 2, 39, 26, 8, 55, 65, 80, 1, 47, 52, 46, 40, 66, 54, 51, 123, 14, 70, 16, 99, 81, 67, 3, 82, 86, 29, 27, 78, 18, 17, 114, 110, 94, 22, 57, 35, 84, 31, 33, 25, 20, 97, 72, 89, 69, 96, 95, 107, 43, 7, 75, 71, 5, 105, 11, 28, 91, 100, 30, 92, 32, 41, 45, 101, 37, 109], [63, 59, 42, 106, 62, 44, 111, 127, 58, 121, 108, 126, 124, 61, 56, 117, 122, 0, 93, 15, 79, 64, 77, 13, 85, 19, 83, 50, 102, 118, 21, 116, 9, 23, 90, 53, 38, 98, 104, 26, 10, 76, 125, 112, 60, 88, 74, 36, 119, 24, 12, 103, 48, 113, 73, 68, 6, 87, 4, 49, 40, 82, 65, 115, 47, 39, 46, 27, 34, 55, 52, 8, 123, 120, 14, 80, 17, 57, 16, 54, 2, 99, 1, 110, 29, 67, 114, 81, 18, 66, 22, 78, 3, 86, 31, 70, 97, 35, 94, 69, 51, 84, 25, 107, 33, 72, 89, 30, 28, 20, 75, 41, 5, 100, 45, 43, 95, 96, 91, 7, 11, 37, 71, 105, 32, 109, 101, 92], [63, 59, 42, 106, 62, 44, 111, 58, 127, 124, 121, 61, 108, 126, 117, 56, 122, 93, 64, 116, 118, 102, 50, 0, 15, 79, 83, 112, 90, 125, 77, 13, 19, 48, 113, 60, 23, 119, 85, 24, 26, 103, 104, 38, 36, 53, 21, 12, 87, 10, 9, 76, 88, 98, 74, 73, 54, 39, 6, 52, 34, 8, 68, 49, 4, 115, 1, 65, 27, 123, 47, 18, 46, 82, 51, 120, 66, 14, 2, 55, 29, 57, 99, 16, 22, 17, 80, 40, 3, 81, 70, 89, 110, 25, 86, 114, 107, 94, 78, 67, 31, 33, 97, 20, 35, 84, 91, 28, 43, 95, 41, 32, 30, 105, 11, 109, 69, 100, 75, 5, 72, 7, 96, 45, 92, 71, 37, 101], [63, 59, 42, 106, 62, 44, 111, 127, 58, 61, 124, 108, 121, 117, 126, 56, 64, 122, 93, 50, 0, 60, 125, 77, 79, 102, 90, 15, 118, 116, 13, 119, 48, 83, 85, 112, 120, 36, 19, 53, 38, 52, 9, 76, 113, 10, 74, 87, 21, 27, 103, 23, 73, 6, 104, 12, 26, 24, 8, 34, 88, 98, 1, 51, 49, 39, 65, 115, 68, 47, 66, 4, 80, 54, 81, 67, 123, 16, 82, 18, 57, 110, 2, 29, 17, 70, 55, 99, 14, 31, 114, 86, 46, 3, 78, 22, 89, 40, 11, 43, 95, 20, 97, 100, 33, 41, 35, 72, 32, 94, 5, 69, 84, 45, 25, 37, 107, 7, 75, 109, 91, 28, 105, 30, 92, 71, 96, 101], [63, 59, 42, 106, 62, 44, 58, 111, 127, 61, 108, 121, 124, 56, 117, 126, 122, 0, 93, 64, 79, 102, 77, 15, 60, 50, 83, 125, 85, 13, 116, 76, 118, 74, 112, 38, 104, 36, 53, 19, 98, 119, 26, 10, 34, 9, 23, 48, 12, 24, 21, 113, 52, 90, 103, 73, 68, 27, 87, 115, 88, 39, 8, 55, 1, 6, 70, 49, 4, 67, 2, 120, 66, 80, 82, 65, 29, 18, 51, 3, 47, 16, 81, 123, 14, 46, 54, 99, 86, 31, 57, 22, 78, 17, 97, 110, 114, 40, 43, 100, 72, 20, 33, 89, 94, 35, 41, 84, 25, 107, 11, 7, 96, 95, 69, 5, 30, 75, 105, 28, 32, 45, 71, 91, 101, 92, 37, 109], [63, 59, 62, 42, 106, 44, 127, 111, 58, 124, 121, 126, 108, 61, 56, 117, 122, 93, 79, 15, 64, 77, 50, 85, 0, 125, 116, 102, 83, 112, 26, 76, 118, 13, 38, 23, 73, 90, 24, 60, 19, 113, 87, 74, 98, 10, 12, 9, 103, 119, 21, 88, 53, 4, 55, 34, 27, 47, 36, 104, 48, 39, 68, 120, 52, 115, 80, 1, 40, 123, 16, 49, 14, 8, 70, 78, 29, 46, 114, 17, 18, 81, 57, 2, 51, 82, 3, 66, 65, 6, 110, 54, 67, 99, 33, 86, 97, 31, 22, 20, 72, 89, 69, 95, 84, 94, 41, 35, 5, 107, 25, 43, 11, 100, 105, 7, 28, 75, 91, 71, 45, 109, 30, 96, 101, 32, 37, 92], [63, 59, 42, 106, 62, 44, 58, 111, 127, 121, 124, 61, 126, 108, 117, 56, 122, 93, 64, 0, 15, 102, 50, 79, 23, 118, 125, 112, 90, 19, 116, 13, 119, 83, 77, 103, 113, 85, 88, 87, 38, 21, 53, 39, 26, 104, 120, 98, 12, 36, 76, 60, 24, 74, 27, 73, 48, 10, 34, 9, 29, 70, 47, 52, 65, 66, 123, 82, 49, 68, 1, 4, 80, 54, 46, 40, 16, 51, 114, 110, 18, 2, 57, 3, 78, 115, 99, 14, 81, 72, 95, 33, 17, 8, 67, 89, 86, 97, 6, 22, 31, 84, 25, 41, 94, 55, 20, 43, 35, 5, 107, 11, 28, 45, 75, 96, 100, 30, 69, 32, 91, 105, 71, 109, 7, 92, 37, 101], [63, 59, 42, 62, 106, 44, 58, 127, 111, 121, 61, 117, 124, 108, 126, 56, 93, 122, 0, 64, 50, 118, 79, 116, 15, 77, 112, 13, 19, 102, 90, 125, 23, 76, 60, 38, 85, 87, 48, 21, 119, 83, 24, 74, 53, 98, 103, 88, 39, 70, 12, 104, 26, 1, 120, 113, 36, 73, 54, 10, 9, 115, 29, 34, 27, 4, 47, 52, 49, 66, 3, 114, 18, 82, 16, 65, 123, 110, 68, 67, 72, 81, 57, 80, 78, 2, 14, 51, 6, 99, 40, 55, 86, 89, 31, 8, 46, 22, 43, 17, 94, 95, 20, 33, 69, 25, 84, 28, 109, 107, 41, 35, 11, 100, 97, 91, 5, 96, 32, 7, 75, 71, 30, 45, 105, 92, 37, 101], [63, 59, 42, 62, 106, 44, 58, 111, 127, 121, 61, 108, 124, 117, 126, 56, 122, 64, 50, 102, 93, 0, 60, 79, 15, 13, 125, 116, 103, 77, 118, 98, 83, 53, 112, 104, 76, 23, 113, 38, 12, 119, 34, 19, 74, 36, 24, 10, 85, 70, 87, 21, 52, 9, 4, 90, 73, 88, 48, 123, 1, 120, 29, 47, 72, 46, 68, 26, 49, 39, 82, 114, 54, 115, 16, 66, 51, 67, 18, 55, 27, 40, 2, 80, 3, 110, 6, 99, 81, 57, 65, 78, 43, 14, 86, 33, 31, 94, 17, 20, 89, 35, 109, 25, 97, 71, 107, 105, 69, 22, 100, 96, 45, 32, 7, 91, 84, 5, 8, 11, 30, 95, 41, 101, 28, 75, 92, 37], [63, 59, 42, 62, 106, 44, 111, 127, 58, 121, 108, 61, 117, 124, 126, 56, 122, 64, 93, 0, 102, 79, 15, 125, 50, 116, 23, 13, 60, 112, 77, 83, 118, 103, 19, 76, 12, 38, 119, 36, 87, 85, 74, 98, 120, 104, 34, 21, 26, 115, 90, 113, 48, 88, 73, 4, 53, 1, 10, 68, 72, 70, 24, 9, 52, 47, 49, 54, 123, 39, 18, 46, 2, 29, 40, 55, 6, 65, 27, 114, 16, 67, 82, 66, 99, 80, 81, 78, 57, 3, 51, 110, 17, 97, 14, 22, 94, 89, 86, 25, 107, 96, 105, 33, 41, 95, 100, 20, 71, 31, 35, 11, 69, 109, 84, 7, 43, 75, 8, 5, 91, 37, 32, 28, 45, 30, 101, 92], [63, 59, 106, 42, 62, 44, 127, 111, 58, 124, 121, 117, 61, 108, 126, 56, 122, 93, 0, 50, 15, 13, 125, 64, 116, 112, 79, 77, 104, 23, 118, 113, 60, 98, 74, 85, 87, 53, 83, 76, 120, 36, 12, 119, 38, 90, 24, 48, 49, 54, 102, 103, 9, 73, 19, 21, 10, 47, 72, 34, 88, 26, 70, 115, 46, 4, 27, 67, 114, 1, 55, 39, 66, 52, 29, 17, 40, 65, 6, 123, 51, 16, 18, 80, 2, 68, 82, 86, 78, 14, 57, 3, 110, 99, 89, 43, 81, 31, 94, 20, 25, 95, 33, 22, 5, 35, 97, 84, 107, 91, 100, 109, 71, 7, 96, 41, 32, 75, 8, 69, 37, 11, 105, 30, 28, 92, 45, 101], [63, 59, 42, 106, 62, 44, 58, 111, 127, 61, 121, 108, 117, 126, 124, 56, 64, 122, 0, 93, 125, 15, 50, 79, 112, 77, 60, 13, 118, 53, 116, 23, 102, 119, 74, 24, 83, 12, 113, 120, 38, 85, 87, 10, 36, 115, 76, 104, 98, 19, 73, 103, 21, 1, 54, 9, 48, 26, 6, 72, 88, 90, 47, 49, 34, 66, 68, 51, 4, 82, 52, 123, 39, 114, 3, 46, 70, 65, 55, 18, 2, 29, 27, 40, 57, 67, 99, 16, 14, 110, 80, 31, 78, 17, 43, 33, 86, 35, 89, 94, 109, 84, 95, 81, 8, 22, 25, 20, 97, 5, 45, 11, 107, 7, 69, 91, 100, 28, 32, 71, 105, 30, 75, 92, 41, 96, 101, 37], [63, 59, 42, 106, 62, 44, 58, 111, 127, 61, 121, 108, 124, 117, 126, 56, 125, 93, 122, 0, 79, 23, 102, 118, 77, 50, 64, 116, 15, 60, 123, 76, 103, 13, 53, 90, 120, 83, 104, 115, 9, 24, 38, 21, 113, 74, 36, 98, 85, 87, 26, 12, 112, 48, 6, 19, 51, 73, 4, 10, 88, 34, 49, 68, 119, 52, 29, 1, 39, 66, 46, 55, 82, 47, 72, 114, 27, 18, 2, 54, 3, 16, 57, 80, 40, 95, 67, 99, 14, 33, 65, 81, 94, 31, 70, 78, 22, 110, 25, 35, 89, 86, 17, 20, 8, 97, 43, 91, 84, 105, 96, 71, 28, 45, 5, 11, 109, 32, 100, 69, 7, 30, 41, 101, 75, 107, 37, 92], [63, 59, 42, 62, 106, 44, 58, 111, 127, 61, 124, 121, 108, 126, 117, 56, 125, 122, 93, 15, 0, 64, 77, 23, 102, 79, 118, 83, 50, 13, 112, 116, 19, 104, 21, 38, 113, 53, 98, 49, 115, 60, 103, 12, 85, 74, 6, 76, 119, 9, 10, 26, 48, 90, 34, 120, 36, 87, 88, 24, 73, 4, 68, 123, 54, 52, 46, 72, 16, 51, 39, 65, 40, 47, 27, 82, 18, 55, 3, 66, 2, 80, 29, 114, 22, 110, 1, 81, 78, 94, 99, 14, 17, 67, 70, 33, 57, 45, 84, 8, 97, 31, 20, 25, 86, 89, 5, 32, 41, 35, 75, 71, 95, 96, 100, 43, 28, 11, 107, 69, 109, 7, 91, 105, 92, 37, 30, 101], [63, 59, 42, 106, 62, 44, 111, 58, 127, 121, 61, 108, 124, 126, 117, 56, 122, 93, 64, 15, 0, 102, 79, 125, 13, 50, 118, 83, 77, 90, 53, 98, 12, 85, 23, 21, 112, 19, 103, 119, 60, 113, 10, 24, 104, 76, 87, 36, 49, 9, 74, 116, 48, 26, 120, 73, 88, 38, 6, 115, 47, 123, 18, 39, 34, 82, 27, 52, 16, 65, 1, 2, 54, 40, 70, 14, 68, 55, 4, 80, 29, 66, 72, 86, 46, 57, 81, 110, 17, 67, 8, 51, 99, 31, 89, 114, 3, 25, 78, 20, 33, 22, 94, 97, 95, 84, 43, 107, 5, 32, 11, 71, 75, 7, 105, 100, 109, 35, 91, 96, 41, 28, 92, 30, 45, 69, 101, 37], [63, 59, 62, 42, 106, 44, 58, 111, 127, 121, 124, 61, 108, 126, 117, 56, 93, 122, 64, 50, 0, 15, 118, 125, 79, 116, 102, 13, 24, 90, 87, 48, 26, 23, 112, 120, 77, 60, 38, 119, 19, 12, 21, 103, 113, 85, 76, 49, 104, 83, 10, 73, 36, 54, 74, 98, 53, 47, 52, 123, 88, 39, 9, 27, 29, 34, 4, 6, 68, 51, 82, 115, 1, 70, 18, 2, 40, 16, 46, 65, 114, 80, 66, 67, 8, 99, 14, 3, 57, 89, 110, 31, 81, 22, 55, 33, 78, 25, 17, 43, 95, 86, 94, 97, 20, 72, 35, 84, 5, 91, 109, 7, 30, 41, 69, 11, 100, 71, 107, 45, 105, 28, 32, 75, 96, 92, 101, 37], [63, 59, 62, 42, 106, 44, 111, 127, 58, 124, 61, 121, 117, 108, 126, 56, 122, 93, 125, 50, 0, 64, 102, 118, 15, 112, 79, 48, 23, 90, 19, 13, 83, 116, 120, 103, 38, 77, 115, 39, 12, 76, 47, 113, 87, 49, 85, 53, 88, 74, 36, 73, 119, 60, 98, 24, 26, 21, 51, 52, 10, 123, 9, 4, 104, 68, 70, 40, 34, 55, 1, 110, 54, 57, 29, 27, 16, 46, 81, 78, 65, 8, 82, 67, 80, 2, 18, 66, 14, 17, 99, 6, 43, 94, 114, 22, 31, 35, 84, 3, 89, 95, 97, 33, 20, 91, 86, 25, 69, 5, 100, 41, 45, 28, 105, 96, 107, 72, 109, 11, 71, 37, 32, 7, 101, 92, 75, 30], [63, 59, 42, 106, 62, 44, 111, 127, 58, 121, 61, 124, 108, 117, 126, 56, 122, 64, 93, 79, 0, 15, 125, 50, 102, 13, 118, 60, 77, 116, 53, 12, 112, 23, 21, 38, 83, 74, 115, 10, 36, 119, 73, 85, 76, 103, 19, 70, 24, 87, 52, 98, 49, 26, 88, 114, 9, 113, 68, 8, 104, 51, 90, 34, 54, 47, 1, 120, 123, 4, 39, 80, 18, 65, 48, 2, 81, 16, 27, 29, 82, 66, 55, 57, 110, 78, 46, 14, 40, 94, 99, 3, 17, 6, 67, 43, 97, 22, 31, 89, 33, 20, 86, 25, 69, 84, 100, 45, 72, 75, 95, 35, 96, 11, 7, 107, 71, 109, 5, 32, 91, 37, 92, 30, 28, 105, 41, 101], [63, 59, 42, 62, 106, 44, 111, 58, 127, 124, 121, 108, 61, 117, 56, 126, 64, 122, 125, 15, 0, 93, 79, 102, 53, 115, 118, 23, 13, 77, 50, 10, 119, 104, 12, 73, 60, 98, 83, 19, 113, 103, 112, 85, 9, 24, 38, 70, 21, 90, 52, 36, 116, 8, 74, 26, 87, 49, 76, 120, 1, 48, 88, 39, 54, 46, 68, 51, 34, 123, 27, 47, 2, 65, 18, 57, 4, 66, 114, 16, 17, 67, 55, 81, 82, 80, 99, 31, 110, 3, 78, 29, 40, 97, 6, 14, 22, 86, 84, 89, 94, 33, 20, 25, 35, 69, 43, 100, 71, 7, 45, 109, 107, 72, 75, 32, 105, 5, 96, 95, 101, 30, 91, 41, 11, 28, 37, 92], [63, 59, 62, 42, 106, 44, 111, 127, 58, 124, 61, 117, 108, 121, 56, 126, 122, 93, 125, 15, 79, 50, 0, 118, 102, 13, 85, 12, 23, 116, 64, 77, 112, 113, 119, 19, 38, 83, 60, 76, 90, 26, 21, 52, 24, 49, 104, 87, 10, 73, 120, 4, 115, 88, 9, 98, 53, 103, 39, 70, 48, 123, 74, 34, 68, 8, 47, 27, 36, 81, 55, 54, 57, 40, 80, 51, 46, 1, 3, 18, 78, 16, 2, 67, 110, 114, 22, 65, 82, 29, 17, 66, 99, 14, 6, 20, 94, 31, 97, 25, 89, 33, 86, 84, 69, 43, 100, 5, 35, 95, 107, 75, 72, 41, 109, 32, 105, 71, 11, 28, 7, 30, 45, 92, 96, 91, 101, 37], [63, 59, 42, 62, 106, 44, 111, 127, 58, 124, 61, 121, 108, 117, 126, 56, 0, 122, 64, 93, 125, 50, 15, 118, 79, 102, 19, 13, 60, 98, 77, 112, 119, 36, 85, 53, 23, 116, 12, 113, 10, 120, 76, 26, 74, 83, 104, 73, 90, 103, 49, 9, 88, 24, 38, 52, 4, 21, 87, 115, 47, 48, 34, 123, 68, 70, 65, 8, 39, 1, 54, 27, 66, 18, 80, 29, 2, 51, 6, 40, 46, 55, 3, 82, 110, 81, 78, 114, 14, 57, 99, 31, 16, 89, 94, 67, 33, 17, 43, 97, 25, 22, 72, 86, 20, 35, 95, 100, 5, 84, 109, 32, 69, 28, 11, 75, 105, 45, 91, 71, 107, 92, 41, 7, 30, 96, 37, 101], [63, 59, 62, 106, 42, 44, 111, 58, 127, 124, 61, 121, 108, 117, 126, 56, 15, 122, 93, 50, 125, 79, 64, 13, 102, 19, 60, 112, 118, 12, 0, 77, 23, 74, 52, 116, 76, 98, 83, 87, 104, 10, 21, 53, 90, 103, 115, 119, 73, 85, 38, 9, 24, 26, 36, 113, 48, 123, 49, 120, 34, 54, 88, 39, 47, 8, 6, 29, 18, 27, 55, 70, 4, 82, 46, 68, 16, 65, 78, 1, 81, 3, 66, 80, 51, 57, 14, 110, 17, 22, 31, 94, 2, 99, 67, 40, 114, 89, 20, 25, 97, 72, 35, 95, 91, 84, 33, 86, 11, 32, 69, 28, 100, 43, 109, 107, 5, 7, 71, 75, 45, 105, 96, 30, 41, 92, 101, 37]], "model.layers.26.self_attn.q_proj": [[107, 43, 51, 99, 36, 100, 127, 53, 57, 52, 105, 116, 54, 58, 126, 60, 114, 48, 122, 62, 123, 118, 119, 32, 115, 55, 112, 124, 46, 120, 49, 125, 28, 44, 117, 106, 40, 113, 59, 56, 30, 35, 110, 63, 111, 42, 109, 103, 61, 39, 24, 47, 37, 108, 50, 41, 95, 121, 102, 104, 45, 96, 38, 93, 98, 23, 81, 92, 33, 76, 34, 29, 101, 84, 97, 85, 86, 88, 21, 31, 67, 91, 82, 25, 19, 22, 5, 94, 27, 89, 20, 26, 9, 90, 73, 77, 66, 83, 70, 18, 72, 87, 80, 79, 8, 78, 12, 17, 14, 15, 16, 3, 74, 75, 11, 6, 13, 0, 4, 69, 65, 7, 71, 1, 68, 10, 2, 64], [107, 43, 35, 65, 13, 74, 80, 7, 68, 28, 22, 32, 18, 126, 99, 127, 69, 3, 2, 57, 0, 119, 100, 52, 64, 118, 12, 122, 86, 72, 11, 114, 124, 54, 115, 53, 46, 9, 56, 62, 112, 67, 58, 1, 123, 117, 51, 108, 113, 34, 63, 49, 121, 116, 20, 6, 111, 47, 120, 84, 125, 110, 92, 61, 50, 10, 4, 83, 55, 78, 59, 70, 60, 71, 21, 16, 109, 5, 44, 31, 19, 66, 30, 17, 90, 45, 77, 76, 87, 48, 8, 85, 105, 79, 15, 41, 93, 73, 82, 81, 25, 23, 91, 75, 14, 26, 89, 101, 95, 42, 38, 104, 103, 27, 102, 24, 40, 106, 33, 37, 36, 97, 88, 29, 98, 94, 96, 39], [107, 43, 105, 32, 122, 100, 116, 37, 53, 103, 28, 124, 35, 92, 127, 57, 56, 21, 51, 39, 30, 52, 126, 49, 115, 117, 22, 58, 118, 112, 80, 62, 104, 108, 24, 55, 123, 41, 44, 109, 29, 91, 20, 54, 45, 18, 89, 23, 61, 17, 119, 75, 99, 46, 111, 34, 113, 114, 120, 27, 125, 63, 110, 121, 47, 42, 60, 19, 93, 48, 59, 72, 101, 50, 106, 38, 26, 25, 98, 95, 96, 40, 31, 84, 78, 83, 94, 33, 12, 90, 87, 79, 97, 85, 16, 13, 36, 11, 76, 82, 9, 70, 102, 15, 88, 77, 66, 81, 14, 73, 71, 86, 69, 8, 10, 4, 0, 6, 67, 7, 5, 74, 68, 3, 65, 1, 2, 64], [107, 126, 43, 127, 114, 52, 57, 37, 119, 103, 110, 99, 39, 122, 123, 46, 105, 54, 111, 118, 62, 28, 115, 55, 113, 124, 58, 84, 35, 44, 117, 120, 49, 19, 100, 112, 53, 63, 47, 9, 125, 116, 60, 48, 56, 51, 109, 61, 108, 59, 32, 76, 121, 104, 33, 50, 41, 45, 101, 36, 23, 80, 30, 93, 20, 89, 38, 22, 83, 42, 106, 40, 21, 91, 102, 67, 34, 87, 17, 97, 98, 29, 95, 11, 26, 78, 96, 25, 94, 92, 18, 85, 2, 15, 88, 86, 0, 13, 31, 65, 7, 4, 24, 6, 14, 10, 12, 90, 16, 79, 27, 74, 70, 77, 81, 5, 73, 82, 72, 8, 66, 1, 69, 68, 75, 71, 3, 64], [110, 55, 36, 46, 115, 95, 121, 26, 60, 50, 123, 61, 54, 53, 124, 100, 22, 119, 45, 58, 88, 31, 30, 116, 42, 127, 51, 40, 114, 106, 59, 92, 24, 47, 113, 44, 34, 117, 112, 105, 56, 28, 37, 41, 109, 63, 107, 99, 19, 122, 48, 52, 125, 126, 111, 39, 118, 49, 57, 43, 38, 33, 85, 93, 104, 120, 32, 17, 27, 108, 35, 25, 62, 87, 103, 101, 86, 21, 18, 102, 91, 29, 97, 98, 96, 16, 15, 94, 20, 23, 77, 90, 89, 14, 84, 75, 81, 76, 12, 80, 13, 83, 82, 74, 79, 9, 71, 78, 10, 5, 72, 7, 67, 6, 70, 73, 3, 68, 4, 2, 1, 11, 66, 8, 65, 64, 0, 69], [110, 46, 36, 26, 95, 22, 55, 19, 115, 4, 54, 30, 72, 31, 14, 100, 17, 58, 41, 62, 119, 2, 77, 74, 65, 121, 5, 63, 106, 37, 28, 125, 61, 97, 111, 91, 60, 10, 90, 107, 44, 94, 18, 73, 86, 52, 93, 32, 88, 104, 21, 84, 24, 118, 7, 33, 13, 34, 71, 45, 92, 39, 123, 11, 98, 75, 23, 6, 29, 101, 112, 113, 27, 116, 35, 117, 103, 127, 59, 42, 96, 25, 51, 89, 1, 15, 50, 81, 108, 85, 105, 79, 82, 80, 49, 69, 126, 76, 114, 12, 102, 20, 120, 87, 53, 99, 48, 9, 64, 56, 38, 57, 124, 47, 122, 67, 40, 68, 43, 109, 83, 3, 78, 70, 66, 8, 16, 0], [110, 36, 115, 62, 46, 26, 95, 30, 55, 88, 119, 22, 58, 100, 60, 31, 54, 51, 19, 124, 50, 96, 63, 125, 117, 34, 40, 86, 42, 123, 97, 41, 24, 122, 45, 44, 48, 87, 38, 126, 17, 116, 121, 108, 21, 16, 14, 59, 28, 61, 83, 49, 57, 120, 103, 52, 118, 33, 53, 127, 27, 106, 35, 111, 43, 114, 113, 107, 81, 56, 39, 89, 12, 93, 90, 91, 102, 47, 105, 94, 104, 109, 29, 112, 15, 98, 99, 32, 92, 101, 18, 79, 37, 80, 10, 82, 74, 25, 85, 75, 77, 20, 23, 7, 76, 8, 84, 72, 11, 78, 73, 13, 4, 9, 70, 5, 6, 69, 2, 67, 71, 66, 1, 0, 64, 65, 3, 68], [110, 36, 46, 54, 58, 121, 123, 50, 95, 26, 108, 62, 60, 119, 55, 30, 88, 31, 48, 100, 22, 49, 47, 24, 59, 33, 57, 114, 115, 109, 125, 113, 63, 56, 28, 17, 19, 112, 52, 106, 41, 117, 92, 122, 45, 51, 18, 21, 116, 15, 37, 42, 104, 40, 44, 61, 103, 111, 127, 126, 53, 107, 120, 86, 77, 118, 105, 14, 32, 124, 85, 39, 93, 99, 25, 101, 102, 43, 98, 38, 75, 13, 35, 20, 29, 16, 97, 90, 12, 34, 27, 83, 96, 89, 72, 94, 91, 87, 74, 84, 81, 23, 80, 7, 6, 73, 76, 10, 67, 5, 79, 1, 9, 4, 82, 2, 71, 64, 78, 8, 11, 65, 69, 66, 0, 70, 68, 3], [123, 57, 101, 33, 50, 37, 121, 88, 51, 44, 63, 92, 118, 24, 115, 62, 28, 60, 108, 23, 21, 54, 85, 116, 103, 117, 127, 97, 31, 105, 41, 61, 52, 56, 26, 114, 91, 109, 47, 82, 45, 110, 124, 49, 104, 126, 58, 125, 59, 34, 119, 112, 90, 55, 93, 48, 107, 111, 113, 18, 96, 122, 46, 106, 43, 87, 53, 42, 38, 79, 32, 40, 98, 20, 95, 83, 19, 75, 36, 100, 39, 15, 102, 30, 77, 120, 35, 89, 27, 94, 29, 99, 76, 16, 10, 17, 71, 86, 65, 25, 22, 84, 80, 67, 70, 14, 73, 69, 78, 81, 5, 68, 72, 8, 13, 12, 66, 0, 4, 1, 74, 2, 3, 9, 6, 64, 7, 11], [57, 101, 123, 33, 50, 37, 121, 24, 51, 124, 85, 103, 15, 88, 97, 76, 90, 20, 115, 82, 54, 105, 60, 10, 118, 29, 21, 34, 93, 104, 19, 96, 18, 91, 119, 95, 26, 87, 44, 117, 46, 67, 89, 49, 77, 59, 25, 79, 65, 28, 40, 62, 112, 73, 31, 99, 75, 30, 72, 12, 92, 74, 13, 9, 107, 43, 8, 7, 5, 83, 35, 102, 63, 127, 38, 94, 108, 114, 69, 4, 32, 2, 27, 125, 1, 52, 70, 22, 81, 71, 61, 109, 106, 53, 17, 68, 47, 126, 3, 6, 36, 23, 111, 39, 98, 116, 48, 16, 64, 100, 84, 120, 14, 110, 45, 41, 42, 58, 55, 122, 0, 113, 66, 78, 56, 80, 11, 86], [101, 123, 57, 50, 124, 33, 37, 121, 88, 28, 119, 21, 92, 61, 51, 90, 24, 44, 54, 85, 118, 126, 19, 58, 56, 105, 62, 114, 31, 117, 18, 16, 60, 115, 49, 103, 26, 82, 46, 52, 106, 97, 45, 116, 75, 38, 59, 14, 77, 42, 73, 47, 122, 86, 112, 91, 48, 109, 35, 107, 43, 63, 53, 76, 55, 39, 125, 94, 30, 110, 41, 113, 111, 89, 36, 79, 120, 80, 83, 127, 34, 93, 22, 95, 104, 99, 102, 17, 96, 29, 98, 108, 40, 71, 10, 100, 32, 20, 84, 27, 72, 81, 78, 5, 87, 23, 67, 25, 70, 65, 68, 13, 66, 15, 64, 4, 69, 11, 74, 9, 12, 2, 8, 7, 3, 1, 6, 0], [101, 123, 57, 50, 33, 121, 37, 51, 88, 24, 19, 115, 61, 54, 92, 28, 85, 21, 124, 75, 118, 105, 73, 117, 119, 77, 114, 60, 59, 44, 113, 18, 100, 46, 97, 62, 14, 83, 31, 90, 17, 98, 93, 58, 16, 95, 79, 43, 107, 78, 126, 104, 52, 45, 111, 48, 38, 127, 110, 125, 82, 41, 109, 47, 35, 116, 122, 103, 71, 29, 84, 112, 86, 89, 55, 30, 56, 42, 53, 76, 26, 36, 34, 120, 106, 5, 49, 87, 99, 63, 96, 40, 108, 102, 94, 32, 20, 4, 22, 39, 27, 11, 23, 91, 15, 80, 72, 10, 70, 66, 25, 67, 12, 13, 81, 68, 65, 6, 74, 7, 3, 1, 0, 9, 8, 69, 64, 2], [61, 59, 101, 63, 44, 125, 116, 51, 21, 62, 33, 58, 93, 118, 113, 55, 85, 88, 119, 127, 114, 110, 49, 56, 42, 54, 30, 123, 126, 124, 115, 52, 45, 50, 53, 40, 120, 112, 117, 57, 27, 48, 103, 78, 121, 122, 109, 60, 83, 24, 92, 105, 111, 37, 47, 80, 43, 90, 107, 91, 46, 26, 106, 89, 108, 95, 104, 41, 22, 82, 81, 25, 39, 74, 20, 11, 99, 75, 7, 13, 29, 12, 84, 77, 102, 32, 38, 9, 36, 6, 19, 10, 73, 17, 96, 100, 98, 34, 94, 8, 79, 23, 14, 69, 3, 15, 35, 16, 70, 28, 86, 31, 87, 18, 68, 72, 5, 76, 97, 67, 65, 71, 4, 0, 66, 1, 2, 64], [125, 63, 61, 101, 59, 116, 44, 51, 55, 58, 21, 50, 49, 127, 85, 62, 119, 114, 56, 110, 33, 113, 53, 126, 118, 52, 54, 124, 123, 57, 115, 48, 117, 42, 88, 45, 40, 120, 93, 37, 121, 109, 60, 83, 24, 112, 122, 30, 111, 27, 91, 47, 90, 46, 43, 92, 78, 105, 26, 107, 106, 41, 108, 103, 104, 89, 80, 39, 29, 22, 34, 38, 99, 96, 25, 102, 100, 82, 95, 17, 77, 81, 94, 7, 98, 36, 73, 32, 20, 75, 35, 28, 87, 31, 97, 11, 10, 13, 19, 79, 86, 6, 8, 74, 84, 12, 9, 23, 70, 14, 16, 67, 69, 72, 0, 4, 68, 18, 76, 15, 66, 3, 71, 5, 64, 1, 65, 2], [63, 101, 59, 61, 44, 21, 51, 33, 115, 85, 118, 127, 62, 113, 58, 93, 114, 119, 88, 49, 55, 126, 56, 116, 30, 54, 123, 110, 45, 117, 42, 53, 124, 50, 125, 103, 52, 40, 92, 120, 37, 57, 91, 48, 27, 112, 121, 83, 80, 109, 60, 78, 122, 111, 105, 47, 24, 43, 108, 46, 90, 26, 106, 107, 41, 104, 29, 7, 74, 82, 22, 95, 39, 89, 12, 20, 77, 25, 99, 81, 75, 19, 32, 13, 84, 14, 102, 17, 11, 38, 10, 73, 36, 15, 79, 16, 8, 100, 23, 34, 9, 94, 96, 98, 4, 18, 86, 35, 28, 69, 76, 72, 6, 87, 68, 5, 31, 97, 71, 70, 0, 1, 66, 67, 3, 64, 2, 65], [125, 59, 61, 63, 101, 62, 115, 54, 55, 116, 56, 33, 58, 44, 120, 126, 119, 114, 50, 110, 51, 124, 127, 118, 88, 53, 123, 37, 48, 49, 57, 117, 113, 112, 40, 52, 121, 45, 111, 60, 122, 90, 109, 93, 83, 42, 47, 103, 46, 43, 24, 108, 106, 81, 107, 41, 104, 78, 85, 105, 22, 7, 32, 39, 25, 89, 19, 80, 30, 74, 73, 21, 34, 6, 17, 27, 29, 91, 11, 100, 102, 26, 69, 36, 38, 77, 79, 75, 15, 13, 95, 82, 86, 12, 16, 14, 28, 97, 35, 98, 10, 9, 99, 87, 84, 8, 96, 92, 31, 94, 20, 4, 76, 68, 70, 5, 2, 23, 1, 0, 18, 67, 71, 66, 72, 65, 3, 64], [41, 117, 55, 34, 52, 31, 23, 115, 119, 21, 62, 39, 80, 58, 83, 45, 27, 105, 60, 99, 59, 44, 63, 13, 113, 30, 61, 72, 103, 57, 11, 109, 42, 29, 74, 32, 93, 94, 126, 111, 33, 88, 89, 8, 108, 112, 104, 121, 92, 116, 56, 118, 81, 90, 100, 54, 75, 26, 43, 17, 47, 37, 107, 25, 87, 20, 85, 24, 101, 51, 18, 91, 48, 15, 36, 22, 95, 124, 76, 127, 106, 110, 123, 71, 82, 50, 97, 49, 35, 38, 122, 114, 86, 84, 12, 19, 79, 40, 69, 96, 14, 46, 102, 28, 120, 53, 125, 78, 16, 67, 5, 10, 77, 64, 9, 66, 6, 7, 68, 98, 73, 4, 3, 70, 2, 65, 0, 1], [41, 55, 34, 52, 31, 13, 21, 23, 83, 80, 117, 27, 11, 66, 115, 105, 6, 96, 109, 28, 74, 19, 8, 49, 108, 44, 60, 73, 26, 64, 63, 67, 37, 30, 72, 119, 62, 4, 91, 45, 33, 85, 18, 22, 15, 7, 89, 43, 25, 17, 56, 87, 94, 9, 78, 84, 113, 122, 99, 77, 59, 93, 5, 61, 88, 57, 118, 81, 32, 48, 97, 70, 102, 10, 46, 42, 16, 29, 120, 92, 35, 71, 24, 82, 36, 103, 65, 69, 116, 2, 101, 20, 106, 40, 12, 14, 76, 90, 39, 68, 54, 127, 1, 53, 124, 123, 79, 75, 100, 110, 104, 111, 125, 86, 121, 47, 58, 3, 112, 38, 126, 50, 95, 0, 51, 114, 107, 98], [55, 41, 59, 34, 117, 45, 31, 23, 112, 54, 105, 119, 120, 110, 44, 111, 56, 46, 61, 92, 47, 116, 115, 58, 108, 42, 28, 118, 48, 124, 63, 121, 122, 123, 51, 114, 39, 127, 126, 109, 60, 25, 43, 125, 53, 57, 107, 94, 113, 21, 62, 50, 36, 106, 40, 20, 30, 49, 83, 97, 26, 103, 88, 102, 104, 100, 38, 33, 79, 52, 37, 99, 93, 29, 101, 80, 15, 35, 11, 91, 95, 86, 66, 96, 32, 71, 24, 27, 98, 69, 10, 7, 81, 87, 3, 74, 85, 17, 84, 14, 12, 89, 90, 22, 5, 1, 76, 70, 65, 82, 13, 64, 67, 0, 9, 8, 78, 19, 18, 68, 2, 4, 6, 73, 72, 75, 16, 77], [41, 55, 34, 52, 23, 31, 21, 13, 83, 80, 82, 74, 40, 117, 119, 125, 4, 103, 49, 28, 27, 105, 59, 60, 17, 115, 64, 61, 43, 44, 6, 101, 113, 66, 26, 96, 47, 118, 57, 56, 45, 94, 42, 62, 109, 3, 25, 116, 39, 70, 100, 65, 111, 112, 11, 53, 97, 72, 54, 108, 36, 126, 0, 85, 33, 104, 99, 29, 63, 123, 110, 18, 91, 16, 50, 38, 127, 32, 124, 68, 93, 87, 88, 92, 106, 122, 48, 102, 46, 22, 37, 84, 95, 58, 9, 89, 107, 69, 114, 35, 81, 78, 121, 30, 51, 10, 120, 79, 73, 19, 14, 15, 24, 7, 8, 90, 1, 86, 71, 98, 5, 12, 75, 20, 77, 76, 2, 67], [45, 103, 109, 38, 29, 54, 126, 87, 119, 57, 107, 83, 84, 114, 115, 117, 93, 118, 111, 47, 80, 122, 50, 95, 31, 46, 22, 121, 113, 127, 78, 61, 13, 108, 63, 85, 44, 10, 124, 53, 26, 52, 112, 98, 17, 59, 62, 49, 56, 125, 41, 116, 123, 51, 32, 71, 55, 58, 120, 110, 104, 30, 60, 96, 86, 36, 21, 42, 12, 48, 102, 37, 24, 34, 40, 100, 28, 106, 2, 88, 33, 35, 43, 89, 94, 70, 72, 101, 9, 14, 91, 92, 97, 90, 27, 69, 23, 76, 81, 105, 3, 6, 25, 75, 99, 19, 8, 0, 11, 67, 82, 68, 18, 4, 79, 20, 15, 73, 66, 7, 39, 65, 5, 1, 16, 64, 77, 74], [103, 45, 29, 109, 87, 80, 84, 13, 54, 10, 71, 69, 113, 32, 23, 115, 65, 47, 67, 126, 93, 64, 118, 3, 12, 127, 60, 77, 125, 38, 94, 17, 30, 62, 31, 72, 95, 57, 49, 59, 88, 20, 39, 66, 89, 68, 14, 119, 16, 83, 27, 24, 120, 74, 21, 102, 101, 79, 111, 34, 22, 18, 96, 7, 82, 15, 98, 85, 58, 90, 100, 121, 107, 51, 81, 76, 36, 5, 37, 42, 78, 25, 11, 48, 56, 63, 97, 46, 117, 91, 33, 112, 116, 4, 55, 104, 19, 99, 92, 73, 86, 41, 28, 50, 53, 6, 110, 105, 108, 52, 124, 35, 40, 61, 44, 122, 8, 70, 43, 123, 9, 106, 26, 114, 2, 0, 75, 1], [103, 45, 29, 109, 84, 87, 38, 13, 80, 54, 10, 69, 93, 71, 12, 83, 113, 3, 126, 2, 47, 32, 78, 127, 82, 79, 118, 65, 1, 115, 23, 22, 119, 88, 59, 64, 6, 95, 0, 46, 60, 89, 77, 123, 111, 31, 37, 49, 8, 62, 30, 66, 57, 122, 24, 5, 70, 97, 72, 33, 81, 125, 42, 15, 16, 58, 68, 43, 52, 7, 25, 124, 14, 74, 67, 96, 50, 112, 121, 20, 106, 85, 98, 107, 41, 92, 9, 34, 117, 17, 53, 51, 61, 120, 63, 48, 26, 114, 40, 91, 18, 110, 36, 55, 11, 101, 76, 19, 104, 99, 116, 108, 4, 35, 27, 94, 56, 100, 105, 86, 90, 44, 73, 21, 102, 28, 39, 75], [103, 45, 29, 109, 87, 84, 54, 80, 13, 10, 38, 126, 69, 71, 113, 93, 115, 32, 65, 47, 82, 67, 23, 118, 127, 30, 64, 95, 88, 40, 77, 56, 60, 125, 21, 102, 24, 94, 72, 79, 31, 46, 41, 108, 49, 83, 58, 20, 11, 101, 121, 78, 57, 25, 76, 66, 22, 26, 53, 52, 75, 117, 50, 17, 3, 119, 114, 44, 74, 42, 0, 16, 34, 9, 91, 97, 100, 48, 14, 4, 63, 62, 18, 59, 111, 124, 7, 15, 19, 5, 61, 96, 89, 122, 36, 37, 43, 112, 106, 8, 28, 55, 33, 90, 85, 104, 51, 105, 99, 81, 35, 107, 68, 110, 98, 86, 123, 12, 116, 27, 2, 39, 70, 73, 92, 120, 6, 1], [104, 59, 99, 92, 20, 114, 119, 54, 62, 82, 117, 55, 95, 98, 51, 107, 34, 63, 47, 120, 25, 125, 124, 49, 113, 115, 60, 86, 45, 13, 30, 112, 16, 123, 28, 103, 33, 58, 108, 121, 50, 48, 42, 56, 18, 44, 89, 61, 31, 39, 110, 87, 127, 52, 106, 116, 46, 122, 126, 88, 53, 57, 111, 105, 37, 118, 109, 41, 40, 43, 78, 29, 101, 38, 97, 93, 11, 94, 17, 22, 24, 76, 10, 80, 67, 15, 84, 36, 100, 71, 90, 21, 102, 79, 77, 26, 73, 96, 68, 85, 75, 23, 8, 91, 27, 19, 1, 81, 32, 35, 74, 14, 70, 7, 5, 83, 9, 66, 12, 0, 4, 65, 6, 69, 64, 72, 3, 2], [104, 59, 99, 92, 114, 82, 20, 51, 98, 62, 115, 95, 63, 105, 54, 47, 13, 16, 61, 117, 30, 34, 25, 119, 58, 122, 124, 45, 60, 109, 116, 86, 67, 55, 89, 87, 102, 66, 28, 11, 56, 80, 127, 43, 50, 88, 113, 15, 65, 123, 46, 57, 10, 18, 64, 53, 70, 42, 101, 7, 49, 48, 9, 31, 17, 97, 103, 8, 106, 44, 112, 78, 73, 40, 111, 68, 69, 22, 12, 94, 0, 5, 37, 121, 120, 76, 41, 125, 74, 33, 39, 84, 93, 29, 79, 38, 21, 118, 126, 24, 108, 75, 91, 32, 110, 107, 26, 85, 52, 71, 19, 90, 36, 83, 6, 23, 77, 81, 1, 100, 27, 14, 35, 96, 72, 4, 3, 2], [104, 59, 99, 92, 114, 20, 82, 16, 51, 116, 13, 25, 95, 87, 89, 54, 10, 115, 124, 34, 17, 8, 56, 50, 45, 53, 21, 100, 78, 76, 94, 86, 31, 63, 68, 62, 61, 18, 66, 70, 67, 65, 71, 123, 73, 80, 125, 98, 74, 85, 41, 52, 44, 12, 101, 97, 47, 39, 28, 40, 88, 91, 55, 29, 9, 127, 103, 15, 107, 118, 119, 108, 96, 11, 69, 27, 122, 37, 49, 90, 33, 84, 19, 7, 30, 5, 24, 106, 42, 121, 126, 32, 22, 58, 113, 110, 48, 105, 77, 112, 36, 6, 111, 79, 93, 23, 120, 43, 64, 109, 38, 75, 102, 26, 0, 60, 117, 46, 57, 14, 1, 83, 81, 35, 2, 4, 72, 3], [104, 59, 122, 45, 92, 20, 95, 114, 25, 99, 86, 115, 54, 62, 82, 63, 61, 34, 116, 51, 98, 49, 11, 56, 43, 0, 127, 124, 123, 33, 4, 119, 53, 1, 28, 110, 2, 46, 41, 58, 113, 31, 125, 55, 89, 120, 52, 70, 48, 111, 126, 76, 73, 50, 108, 118, 39, 121, 71, 112, 88, 78, 5, 13, 42, 106, 117, 109, 60, 83, 57, 44, 107, 105, 30, 22, 19, 87, 90, 47, 26, 102, 103, 84, 101, 16, 14, 38, 36, 21, 93, 3, 24, 100, 32, 37, 94, 6, 91, 81, 97, 35, 96, 15, 18, 8, 17, 29, 27, 65, 40, 12, 85, 10, 23, 69, 7, 66, 74, 75, 80, 64, 79, 9, 67, 68, 77, 72], [107, 56, 126, 43, 99, 87, 82, 110, 91, 78, 20, 25, 81, 40, 75, 104, 79, 27, 12, 112, 7, 72, 52, 31, 86, 32, 89, 117, 118, 49, 116, 92, 10, 125, 62, 48, 55, 16, 88, 127, 109, 95, 120, 26, 28, 85, 121, 69, 17, 93, 23, 44, 113, 74, 37, 77, 6, 57, 14, 53, 122, 84, 97, 58, 9, 80, 115, 61, 54, 11, 50, 101, 90, 102, 29, 76, 15, 108, 30, 51, 46, 68, 124, 18, 21, 83, 60, 114, 119, 105, 19, 96, 47, 33, 45, 24, 123, 38, 98, 59, 41, 100, 5, 103, 111, 94, 36, 63, 22, 2, 34, 106, 67, 8, 73, 71, 42, 39, 35, 70, 13, 4, 64, 3, 1, 65, 66, 0], [107, 43, 112, 56, 53, 127, 48, 115, 50, 116, 55, 125, 117, 25, 99, 19, 52, 120, 126, 61, 37, 118, 46, 121, 57, 62, 93, 113, 63, 58, 124, 29, 51, 114, 109, 122, 59, 54, 60, 49, 110, 91, 45, 119, 89, 31, 123, 36, 105, 47, 104, 111, 33, 44, 40, 86, 108, 24, 106, 42, 27, 28, 41, 95, 88, 38, 80, 39, 35, 101, 100, 83, 34, 102, 22, 20, 16, 103, 13, 98, 21, 77, 87, 97, 92, 90, 30, 17, 26, 79, 82, 32, 23, 70, 14, 96, 94, 81, 4, 10, 65, 85, 7, 11, 68, 6, 3, 1, 67, 84, 71, 15, 78, 18, 74, 5, 69, 75, 2, 72, 64, 66, 0, 8, 12, 9, 76, 73], [107, 126, 56, 99, 43, 117, 109, 25, 79, 86, 82, 31, 93, 91, 84, 89, 118, 116, 18, 33, 4, 110, 0, 73, 20, 77, 53, 28, 66, 78, 9, 55, 96, 41, 105, 1, 75, 98, 39, 112, 37, 81, 120, 121, 125, 72, 127, 36, 32, 17, 26, 30, 52, 95, 113, 54, 64, 80, 29, 3, 97, 92, 42, 6, 27, 76, 60, 50, 101, 58, 61, 94, 48, 40, 22, 114, 49, 63, 62, 57, 51, 44, 46, 122, 19, 23, 5, 14, 104, 38, 24, 15, 87, 90, 123, 69, 16, 67, 2, 34, 100, 59, 45, 70, 12, 10, 103, 21, 88, 102, 13, 7, 74, 119, 108, 124, 47, 83, 85, 111, 11, 68, 8, 35, 71, 65, 106, 115], [56, 107, 112, 99, 125, 19, 50, 126, 116, 43, 127, 25, 52, 117, 62, 121, 120, 48, 53, 45, 31, 124, 93, 57, 113, 40, 61, 109, 55, 46, 60, 51, 122, 49, 118, 63, 114, 54, 59, 29, 58, 123, 115, 119, 105, 47, 111, 36, 104, 89, 108, 110, 33, 35, 44, 91, 24, 86, 106, 27, 41, 42, 39, 38, 98, 88, 101, 37, 100, 83, 95, 103, 102, 28, 21, 80, 22, 92, 34, 90, 13, 74, 77, 17, 23, 30, 97, 87, 32, 26, 94, 81, 16, 96, 75, 10, 7, 71, 14, 20, 78, 65, 82, 70, 6, 1, 3, 79, 69, 4, 67, 72, 85, 18, 68, 11, 2, 5, 8, 66, 84, 15, 0, 64, 12, 76, 9, 73]], "model.layers.26.self_attn.k_proj": [[43, 22, 107, 99, 92, 126, 127, 96, 64, 47, 46, 56, 48, 80, 119, 124, 68, 7, 122, 13, 116, 57, 49, 109, 54, 113, 18, 2, 117, 112, 63, 118, 44, 62, 115, 55, 52, 58, 123, 50, 125, 53, 120, 51, 59, 121, 60, 110, 108, 114, 61, 101, 74, 98, 111, 11, 106, 42, 45, 76, 41, 102, 3, 37, 78, 36, 6, 40, 105, 103, 34, 104, 1, 97, 69, 39, 38, 10, 70, 8, 21, 89, 72, 15, 5, 94, 31, 90, 84, 33, 95, 87, 30, 9, 17, 100, 23, 24, 82, 28, 29, 26, 88, 91, 32, 81, 12, 25, 93, 27, 73, 85, 75, 67, 19, 20, 79, 71, 35, 83, 14, 16, 66, 77, 4, 65, 0, 86], [46, 110, 100, 31, 26, 22, 19, 86, 17, 14, 94, 12, 74, 28, 58, 121, 60, 24, 77, 15, 72, 62, 104, 123, 54, 29, 118, 43, 10, 106, 122, 52, 112, 32, 37, 61, 108, 68, 83, 49, 67, 7, 16, 40, 63, 125, 5, 44, 119, 127, 116, 97, 98, 33, 38, 113, 50, 27, 59, 89, 57, 41, 42, 111, 115, 109, 102, 126, 93, 91, 81, 56, 34, 101, 53, 69, 55, 70, 124, 103, 105, 84, 99, 48, 18, 117, 107, 71, 39, 51, 1, 95, 35, 23, 114, 47, 76, 120, 96, 87, 36, 65, 45, 0, 21, 64, 30, 88, 92, 25, 20, 85, 9, 13, 82, 8, 79, 75, 90, 6, 66, 80, 4, 3, 11, 73, 78, 2], [123, 57, 97, 37, 24, 95, 92, 85, 82, 90, 80, 108, 115, 124, 73, 75, 19, 50, 114, 104, 76, 40, 78, 77, 83, 101, 53, 59, 121, 39, 35, 70, 119, 10, 0, 88, 105, 14, 55, 111, 71, 5, 48, 45, 126, 61, 4, 103, 30, 93, 109, 52, 21, 43, 1, 113, 116, 3, 112, 49, 42, 72, 54, 62, 96, 44, 117, 107, 41, 51, 58, 106, 60, 81, 118, 66, 46, 56, 127, 47, 110, 15, 63, 122, 120, 17, 20, 125, 36, 86, 89, 102, 100, 91, 32, 99, 34, 33, 84, 98, 27, 22, 94, 38, 29, 87, 2, 13, 25, 23, 79, 18, 16, 67, 31, 8, 7, 74, 26, 28, 64, 11, 6, 12, 69, 65, 68, 9], [125, 37, 63, 61, 22, 97, 59, 114, 126, 119, 51, 55, 124, 117, 112, 58, 57, 127, 53, 60, 121, 110, 123, 108, 122, 56, 52, 49, 111, 107, 44, 118, 113, 45, 31, 50, 62, 47, 29, 115, 120, 48, 109, 104, 54, 46, 36, 106, 43, 101, 90, 105, 79, 42, 34, 27, 17, 102, 116, 41, 40, 83, 38, 35, 39, 98, 85, 88, 92, 86, 30, 81, 89, 103, 87, 91, 99, 20, 24, 25, 93, 100, 13, 15, 78, 95, 82, 28, 18, 32, 96, 73, 21, 75, 94, 84, 26, 19, 33, 12, 69, 23, 11, 6, 72, 4, 8, 80, 10, 77, 76, 7, 14, 16, 67, 2, 74, 65, 71, 64, 9, 70, 1, 68, 66, 5, 0, 3], [105, 55, 98, 52, 95, 23, 21, 80, 83, 64, 13, 61, 11, 116, 74, 6, 28, 117, 51, 60, 27, 46, 69, 8, 49, 109, 2, 90, 71, 59, 123, 65, 42, 57, 12, 111, 4, 115, 127, 67, 113, 72, 103, 53, 126, 50, 79, 54, 58, 93, 82, 124, 43, 118, 26, 100, 62, 119, 108, 56, 110, 25, 122, 44, 41, 112, 63, 78, 47, 34, 97, 45, 114, 99, 1, 104, 106, 29, 125, 37, 121, 107, 35, 30, 101, 32, 38, 39, 3, 120, 96, 18, 40, 102, 81, 33, 24, 48, 94, 15, 36, 20, 88, 17, 86, 91, 76, 9, 14, 77, 22, 89, 92, 75, 66, 5, 84, 31, 73, 10, 7, 87, 70, 19, 0, 85, 16, 68], [109, 39, 80, 84, 10, 93, 13, 87, 45, 71, 54, 69, 64, 3, 111, 118, 57, 12, 119, 29, 126, 117, 65, 121, 49, 78, 68, 106, 88, 31, 1, 9, 122, 127, 55, 123, 22, 51, 115, 113, 17, 104, 102, 96, 53, 82, 46, 76, 58, 125, 41, 14, 61, 63, 83, 2, 52, 59, 62, 112, 72, 15, 44, 60, 124, 101, 100, 32, 86, 36, 26, 50, 94, 95, 43, 30, 48, 105, 6, 47, 114, 108, 85, 107, 110, 42, 34, 27, 56, 98, 120, 116, 66, 21, 37, 75, 0, 40, 25, 91, 35, 28, 90, 79, 4, 67, 24, 89, 33, 99, 92, 38, 70, 18, 5, 19, 8, 23, 97, 73, 103, 7, 11, 81, 74, 77, 20, 16], [59, 40, 31, 20, 28, 86, 114, 98, 13, 113, 89, 88, 54, 115, 82, 56, 116, 111, 50, 99, 124, 21, 66, 0, 67, 35, 127, 41, 87, 62, 16, 53, 49, 17, 121, 27, 15, 68, 80, 126, 18, 11, 78, 44, 48, 55, 1, 109, 8, 70, 123, 47, 92, 10, 105, 58, 57, 112, 25, 60, 46, 118, 33, 122, 106, 63, 23, 51, 76, 29, 43, 125, 83, 30, 34, 36, 7, 61, 117, 119, 45, 52, 120, 73, 93, 65, 103, 39, 108, 26, 107, 42, 69, 9, 100, 96, 74, 97, 5, 101, 110, 102, 38, 85, 94, 79, 24, 81, 71, 64, 37, 90, 14, 32, 19, 6, 12, 72, 91, 4, 2, 77, 22, 75, 95, 3, 84, 104], [43, 35, 86, 56, 107, 29, 25, 19, 118, 126, 117, 95, 127, 116, 121, 57, 79, 109, 119, 55, 101, 92, 112, 125, 58, 63, 113, 41, 97, 52, 59, 51, 53, 54, 122, 114, 115, 61, 60, 62, 49, 124, 123, 45, 50, 111, 105, 82, 120, 48, 47, 64, 42, 44, 110, 20, 27, 34, 94, 46, 108, 80, 103, 38, 77, 37, 106, 104, 102, 75, 70, 100, 9, 65, 98, 39, 4, 40, 2, 36, 85, 84, 24, 7, 32, 67, 99, 90, 93, 69, 28, 16, 71, 26, 73, 96, 33, 11, 30, 87, 78, 91, 31, 76, 17, 81, 14, 21, 12, 18, 68, 23, 74, 8, 88, 10, 72, 13, 5, 15, 0, 83, 89, 1, 6, 3, 22, 66]], "model.layers.26.self_attn.qk_proj": [[59, 43, 57, 123, 109, 110, 107, 46, 63, 61, 55, 125, 56, 45, 105, 37, 52, 54, 126, 29, 41, 50, 117, 99, 40, 101, 28, 115, 114, 23, 62, 119, 22, 58, 95, 92, 127, 121, 80, 31, 124, 87, 16, 53, 116, 13, 88, 86, 103, 60, 39, 77, 51, 20, 112, 111, 44, 21, 19, 104, 84, 97, 90, 48, 24, 47, 49, 85, 82, 93, 113, 118, 83, 122, 120, 10, 34, 18, 26, 74, 33, 106, 100, 98, 42, 108, 7, 35, 25, 89, 36, 91, 71, 64, 30, 72, 75, 11, 102, 32, 96, 81, 79, 15, 14, 0, 78, 17, 76, 38, 67, 12, 3, 69, 27, 94, 5, 66, 68, 1, 73, 9, 70, 65, 8, 2, 6, 4], [43, 59, 57, 123, 109, 107, 110, 46, 61, 63, 125, 55, 56, 45, 105, 52, 37, 54, 126, 29, 117, 101, 41, 115, 50, 114, 62, 40, 99, 127, 58, 28, 31, 121, 23, 92, 103, 51, 22, 124, 119, 112, 116, 77, 80, 95, 104, 39, 16, 87, 86, 24, 60, 49, 53, 13, 88, 85, 48, 21, 111, 93, 97, 44, 47, 120, 122, 20, 113, 19, 118, 83, 90, 34, 10, 84, 108, 82, 74, 100, 18, 33, 26, 35, 25, 89, 98, 42, 91, 71, 106, 15, 30, 36, 0, 75, 64, 72, 96, 11, 32, 7, 79, 67, 12, 14, 102, 5, 76, 81, 78, 3, 38, 94, 69, 27, 17, 9, 66, 73, 2, 6, 4, 1, 68, 70, 8, 65], [59, 43, 57, 123, 109, 107, 110, 46, 61, 63, 55, 125, 56, 45, 105, 54, 52, 37, 126, 29, 117, 41, 101, 114, 62, 99, 50, 40, 115, 124, 28, 104, 127, 121, 119, 22, 51, 112, 103, 86, 53, 80, 95, 92, 116, 31, 23, 87, 48, 77, 44, 16, 49, 88, 39, 58, 113, 97, 13, 93, 34, 47, 24, 21, 60, 20, 122, 118, 85, 83, 111, 90, 74, 19, 84, 120, 98, 108, 82, 10, 18, 33, 36, 26, 100, 35, 64, 106, 42, 91, 71, 89, 25, 0, 7, 15, 78, 38, 81, 17, 30, 11, 76, 79, 75, 14, 67, 96, 102, 32, 69, 72, 94, 3, 27, 5, 12, 1, 6, 68, 2, 4, 73, 65, 8, 66, 9, 70], [43, 59, 57, 123, 109, 110, 107, 46, 63, 61, 55, 56, 125, 45, 105, 52, 37, 126, 54, 117, 29, 114, 101, 50, 49, 41, 121, 40, 115, 62, 124, 99, 53, 44, 112, 104, 80, 119, 116, 23, 28, 22, 16, 92, 95, 86, 51, 77, 60, 113, 31, 13, 103, 58, 127, 111, 88, 87, 93, 120, 19, 48, 20, 39, 21, 118, 24, 85, 10, 90, 84, 97, 122, 108, 47, 74, 34, 83, 82, 33, 98, 26, 18, 0, 100, 106, 71, 7, 36, 89, 35, 81, 79, 42, 15, 102, 64, 91, 25, 17, 75, 78, 30, 14, 76, 5, 67, 11, 3, 38, 69, 72, 8, 96, 27, 6, 4, 12, 94, 32, 65, 68, 2, 1, 66, 73, 9, 70], [43, 59, 57, 123, 109, 110, 107, 46, 63, 61, 125, 56, 55, 45, 105, 52, 126, 37, 117, 54, 114, 50, 41, 29, 40, 121, 99, 119, 44, 101, 49, 124, 23, 22, 62, 127, 28, 51, 104, 53, 80, 16, 13, 116, 88, 115, 60, 86, 95, 118, 112, 92, 87, 77, 120, 31, 47, 58, 20, 84, 93, 103, 111, 74, 39, 34, 19, 10, 21, 97, 122, 83, 24, 48, 85, 113, 90, 82, 108, 18, 26, 98, 33, 35, 7, 100, 106, 64, 89, 71, 36, 75, 96, 0, 15, 81, 91, 25, 79, 5, 67, 76, 30, 102, 8, 78, 3, 11, 17, 32, 69, 14, 42, 4, 94, 12, 1, 6, 2, 38, 65, 27, 72, 68, 9, 66, 73, 70], [43, 59, 57, 123, 107, 110, 109, 46, 63, 61, 125, 55, 56, 45, 105, 52, 37, 54, 117, 29, 50, 126, 41, 99, 40, 23, 80, 115, 22, 114, 101, 86, 44, 116, 124, 16, 95, 28, 121, 104, 62, 13, 60, 92, 88, 49, 119, 118, 19, 103, 93, 51, 127, 77, 31, 39, 58, 85, 87, 112, 20, 21, 53, 111, 120, 84, 47, 97, 10, 74, 24, 48, 122, 34, 113, 83, 26, 82, 33, 108, 90, 100, 106, 36, 98, 15, 18, 35, 25, 7, 64, 0, 91, 11, 89, 81, 71, 30, 14, 12, 75, 102, 79, 32, 96, 42, 17, 5, 78, 76, 38, 8, 3, 67, 94, 72, 69, 66, 70, 4, 6, 27, 9, 73, 65, 68, 1, 2], [59, 43, 57, 123, 109, 110, 107, 46, 63, 61, 55, 125, 56, 45, 105, 52, 37, 54, 29, 126, 41, 50, 117, 28, 121, 114, 124, 40, 99, 101, 22, 80, 119, 95, 87, 23, 86, 49, 115, 16, 92, 31, 44, 48, 88, 77, 13, 51, 116, 104, 62, 20, 122, 60, 58, 19, 84, 97, 127, 85, 118, 39, 103, 47, 34, 21, 120, 93, 90, 24, 112, 83, 74, 100, 111, 10, 108, 18, 26, 53, 82, 113, 98, 35, 33, 25, 30, 36, 106, 91, 89, 15, 7, 42, 11, 78, 71, 102, 14, 79, 75, 17, 64, 32, 81, 0, 96, 5, 38, 12, 8, 76, 94, 69, 3, 67, 27, 6, 4, 73, 9, 68, 1, 2, 66, 65, 70, 72], [59, 43, 57, 123, 107, 109, 110, 46, 61, 63, 55, 125, 56, 45, 105, 52, 37, 29, 126, 54, 117, 41, 101, 115, 114, 28, 121, 99, 50, 22, 119, 23, 116, 31, 86, 40, 124, 95, 92, 87, 51, 62, 16, 49, 103, 80, 88, 112, 39, 127, 104, 53, 113, 13, 44, 48, 60, 58, 97, 20, 19, 122, 90, 77, 21, 93, 85, 24, 100, 84, 118, 120, 34, 83, 82, 42, 74, 108, 91, 47, 18, 33, 26, 10, 36, 25, 106, 35, 111, 30, 89, 98, 7, 14, 78, 79, 17, 0, 32, 75, 81, 64, 15, 12, 71, 8, 27, 94, 102, 5, 38, 11, 3, 96, 67, 69, 66, 76, 65, 2, 4, 73, 9, 1, 68, 70, 6, 72], [59, 43, 57, 123, 107, 109, 110, 46, 63, 61, 55, 56, 125, 45, 105, 37, 52, 29, 54, 41, 117, 126, 101, 50, 115, 58, 99, 124, 114, 28, 40, 121, 44, 23, 119, 92, 22, 62, 116, 95, 31, 16, 127, 49, 13, 104, 97, 87, 122, 60, 86, 53, 112, 77, 103, 88, 80, 113, 85, 51, 48, 93, 21, 118, 34, 20, 24, 39, 84, 111, 90, 47, 19, 82, 100, 120, 74, 36, 108, 83, 33, 18, 91, 10, 26, 25, 106, 35, 98, 89, 30, 42, 38, 15, 71, 14, 32, 11, 78, 79, 96, 7, 27, 8, 17, 75, 94, 64, 102, 12, 0, 76, 81, 67, 5, 3, 69, 70, 9, 4, 1, 66, 68, 73, 2, 65, 72, 6], [59, 43, 57, 123, 109, 46, 110, 107, 63, 61, 55, 125, 56, 45, 105, 37, 54, 52, 126, 29, 117, 115, 50, 41, 40, 101, 62, 28, 114, 112, 99, 121, 119, 58, 124, 53, 116, 44, 23, 95, 103, 51, 86, 92, 22, 127, 104, 60, 31, 13, 16, 48, 80, 87, 49, 77, 122, 88, 21, 39, 118, 108, 113, 120, 111, 93, 20, 97, 24, 84, 19, 85, 90, 18, 34, 74, 47, 10, 100, 83, 33, 36, 82, 7, 71, 35, 98, 0, 91, 42, 25, 75, 11, 26, 8, 64, 79, 30, 96, 89, 14, 15, 106, 17, 5, 67, 76, 3, 78, 81, 38, 70, 102, 69, 32, 12, 94, 1, 9, 65, 2, 4, 73, 27, 66, 68, 72, 6], [59, 43, 57, 123, 109, 110, 46, 107, 61, 63, 56, 55, 125, 45, 105, 52, 37, 126, 54, 29, 115, 41, 121, 117, 101, 99, 50, 114, 119, 80, 28, 53, 40, 127, 16, 51, 60, 22, 13, 62, 23, 124, 58, 44, 87, 116, 86, 95, 112, 77, 104, 49, 31, 88, 103, 20, 111, 92, 113, 39, 74, 19, 84, 21, 97, 118, 122, 10, 90, 24, 85, 120, 83, 93, 26, 33, 108, 100, 7, 18, 47, 34, 48, 98, 82, 91, 35, 42, 71, 36, 25, 0, 11, 64, 81, 79, 15, 67, 75, 14, 12, 106, 3, 38, 89, 76, 78, 96, 8, 17, 30, 69, 102, 5, 32, 72, 73, 4, 1, 70, 2, 94, 27, 65, 68, 66, 6, 9], [43, 59, 57, 123, 109, 107, 46, 110, 63, 61, 55, 125, 56, 45, 105, 52, 37, 126, 29, 54, 41, 114, 101, 117, 22, 121, 40, 115, 99, 28, 86, 80, 23, 50, 119, 13, 104, 95, 31, 16, 124, 127, 51, 92, 87, 44, 58, 116, 77, 88, 97, 39, 60, 20, 62, 93, 53, 103, 84, 118, 74, 112, 113, 21, 85, 111, 48, 120, 90, 19, 49, 10, 33, 82, 83, 122, 34, 24, 18, 25, 100, 108, 26, 35, 91, 47, 89, 71, 98, 79, 7, 15, 30, 96, 36, 106, 11, 12, 78, 0, 102, 14, 64, 75, 17, 81, 42, 32, 69, 67, 27, 73, 38, 76, 3, 8, 65, 94, 5, 2, 4, 1, 72, 6, 9, 70, 68, 66], [59, 43, 57, 123, 109, 107, 110, 46, 63, 61, 55, 125, 56, 45, 105, 37, 52, 54, 29, 126, 121, 114, 50, 101, 41, 117, 115, 99, 22, 95, 119, 40, 28, 86, 87, 116, 80, 58, 118, 92, 124, 62, 103, 104, 31, 88, 23, 97, 127, 93, 122, 51, 16, 44, 39, 112, 49, 113, 77, 60, 33, 84, 48, 34, 85, 19, 53, 13, 108, 111, 120, 90, 24, 21, 10, 20, 83, 26, 82, 18, 36, 30, 74, 89, 98, 91, 100, 25, 35, 47, 42, 106, 96, 7, 64, 79, 15, 32, 75, 78, 71, 12, 81, 38, 94, 11, 69, 14, 102, 76, 17, 67, 5, 27, 72, 8, 0, 1, 6, 3, 2, 73, 4, 65, 66, 9, 68, 70], [59, 43, 57, 123, 107, 109, 110, 46, 63, 61, 55, 125, 45, 56, 105, 37, 52, 126, 29, 54, 101, 41, 116, 50, 117, 114, 121, 124, 115, 119, 23, 40, 99, 62, 120, 51, 86, 13, 28, 103, 122, 95, 104, 87, 31, 16, 22, 80, 127, 92, 118, 44, 53, 49, 113, 39, 111, 58, 88, 19, 60, 112, 77, 24, 97, 84, 10, 85, 21, 20, 90, 48, 93, 34, 108, 106, 100, 47, 18, 83, 82, 74, 26, 33, 98, 42, 0, 30, 35, 36, 7, 89, 71, 91, 64, 25, 72, 15, 81, 79, 75, 38, 69, 12, 67, 3, 11, 32, 14, 94, 96, 5, 78, 17, 76, 102, 65, 8, 4, 6, 1, 2, 73, 27, 9, 70, 66, 68], [59, 43, 57, 123, 109, 107, 110, 46, 63, 61, 55, 125, 56, 45, 105, 37, 52, 126, 54, 29, 41, 114, 101, 117, 40, 50, 99, 115, 23, 124, 31, 60, 44, 86, 121, 58, 22, 119, 95, 62, 16, 13, 28, 104, 116, 87, 80, 39, 88, 112, 103, 53, 113, 127, 77, 92, 51, 49, 48, 118, 93, 85, 120, 111, 84, 74, 97, 20, 19, 24, 21, 10, 90, 122, 82, 100, 47, 34, 83, 18, 33, 106, 108, 91, 26, 71, 0, 35, 42, 98, 7, 64, 75, 25, 89, 79, 36, 30, 38, 81, 15, 5, 78, 72, 3, 11, 102, 67, 76, 96, 17, 14, 32, 94, 73, 69, 6, 4, 68, 1, 66, 8, 65, 12, 2, 27, 9, 70], [59, 43, 57, 123, 107, 109, 110, 46, 63, 61, 55, 56, 125, 45, 105, 52, 37, 126, 54, 101, 29, 117, 50, 41, 115, 114, 62, 44, 80, 99, 23, 28, 95, 22, 86, 121, 40, 124, 116, 16, 31, 92, 60, 13, 119, 87, 127, 112, 51, 104, 113, 58, 88, 103, 111, 118, 97, 77, 84, 53, 39, 48, 85, 19, 20, 10, 74, 49, 120, 122, 21, 108, 90, 93, 100, 24, 26, 34, 47, 18, 83, 33, 82, 106, 89, 25, 98, 7, 35, 71, 30, 42, 79, 0, 96, 91, 36, 15, 75, 78, 17, 12, 102, 76, 11, 64, 14, 27, 81, 72, 67, 5, 38, 3, 94, 1, 32, 69, 65, 73, 66, 9, 68, 70, 6, 2, 4, 8], [59, 43, 57, 123, 107, 109, 110, 46, 61, 63, 125, 55, 56, 45, 105, 52, 37, 54, 126, 115, 101, 29, 40, 114, 50, 28, 117, 121, 41, 62, 95, 116, 31, 99, 92, 22, 127, 80, 23, 124, 86, 39, 87, 16, 58, 60, 13, 49, 44, 51, 88, 119, 122, 112, 53, 97, 84, 118, 19, 120, 85, 104, 93, 103, 24, 20, 21, 48, 108, 34, 90, 33, 82, 77, 111, 100, 26, 74, 83, 113, 35, 98, 18, 36, 89, 47, 25, 10, 42, 91, 30, 106, 71, 96, 7, 79, 64, 0, 81, 78, 15, 32, 11, 75, 102, 72, 12, 38, 76, 17, 27, 3, 14, 94, 67, 69, 5, 73, 65, 6, 70, 1, 9, 68, 4, 66, 8, 2], [59, 43, 57, 123, 109, 107, 110, 46, 61, 63, 55, 56, 125, 45, 105, 37, 52, 126, 41, 29, 54, 50, 115, 101, 117, 40, 114, 80, 22, 28, 49, 121, 23, 104, 31, 99, 95, 86, 92, 103, 119, 60, 124, 116, 58, 13, 87, 62, 16, 51, 39, 44, 127, 77, 88, 93, 97, 113, 24, 120, 122, 21, 118, 20, 19, 112, 84, 48, 74, 53, 34, 85, 111, 18, 108, 26, 90, 100, 82, 47, 33, 10, 83, 42, 89, 35, 98, 25, 71, 91, 79, 0, 30, 106, 81, 36, 72, 7, 17, 96, 64, 11, 15, 78, 5, 67, 14, 69, 94, 32, 102, 75, 3, 12, 76, 65, 38, 66, 68, 27, 73, 70, 9, 4, 1, 6, 2, 8], [59, 43, 57, 123, 109, 110, 46, 107, 63, 61, 55, 56, 125, 45, 105, 37, 126, 52, 54, 117, 50, 29, 114, 41, 40, 44, 101, 115, 121, 62, 124, 104, 58, 28, 60, 99, 80, 119, 116, 103, 113, 23, 53, 95, 118, 22, 51, 31, 122, 92, 13, 86, 120, 49, 39, 112, 16, 87, 88, 77, 111, 97, 100, 24, 74, 93, 108, 84, 21, 20, 127, 48, 85, 47, 34, 19, 90, 82, 10, 18, 106, 33, 83, 35, 98, 42, 26, 71, 36, 7, 30, 0, 91, 89, 102, 64, 79, 11, 81, 38, 25, 75, 3, 72, 32, 94, 96, 78, 76, 15, 12, 17, 69, 67, 27, 5, 14, 70, 73, 65, 66, 9, 8, 4, 1, 68, 2, 6], [43, 59, 57, 123, 109, 107, 110, 46, 63, 61, 56, 55, 125, 45, 105, 37, 52, 126, 54, 29, 117, 50, 41, 121, 101, 62, 40, 114, 115, 44, 23, 99, 95, 104, 28, 16, 58, 116, 124, 31, 60, 103, 13, 51, 49, 92, 22, 80, 39, 122, 87, 127, 86, 119, 88, 93, 118, 77, 120, 112, 113, 97, 85, 111, 108, 47, 24, 10, 53, 20, 84, 74, 21, 34, 100, 90, 19, 33, 18, 83, 98, 48, 89, 26, 82, 91, 42, 35, 30, 11, 71, 7, 64, 36, 75, 106, 32, 3, 25, 12, 79, 69, 15, 0, 102, 96, 94, 17, 67, 76, 78, 38, 72, 14, 8, 5, 81, 27, 70, 65, 68, 66, 9, 73, 1, 2, 4, 6], [59, 43, 57, 123, 109, 107, 110, 46, 61, 63, 55, 56, 125, 45, 105, 126, 54, 37, 52, 29, 41, 121, 40, 101, 50, 117, 115, 99, 116, 114, 95, 127, 28, 44, 119, 62, 104, 49, 86, 122, 22, 124, 39, 80, 23, 53, 118, 16, 31, 113, 120, 13, 103, 87, 60, 92, 112, 51, 93, 88, 58, 111, 77, 84, 48, 34, 97, 20, 74, 85, 33, 21, 19, 47, 98, 90, 10, 24, 108, 83, 18, 26, 100, 35, 91, 25, 89, 71, 82, 36, 7, 42, 15, 11, 79, 30, 75, 17, 106, 96, 76, 81, 102, 78, 0, 72, 14, 64, 32, 69, 67, 3, 94, 38, 12, 8, 5, 70, 1, 68, 73, 4, 65, 66, 2, 27, 9, 6], [59, 43, 57, 123, 109, 107, 110, 46, 61, 63, 56, 125, 55, 45, 105, 126, 37, 52, 54, 29, 117, 41, 50, 101, 99, 40, 121, 115, 114, 116, 49, 104, 28, 44, 113, 119, 122, 124, 62, 23, 80, 103, 92, 22, 58, 16, 95, 87, 60, 51, 86, 39, 112, 118, 31, 88, 53, 48, 127, 120, 97, 13, 24, 77, 34, 20, 84, 111, 93, 21, 74, 85, 108, 90, 47, 19, 33, 83, 100, 26, 82, 64, 10, 98, 18, 71, 36, 106, 35, 0, 30, 42, 91, 89, 25, 7, 15, 11, 67, 17, 75, 8, 81, 79, 78, 3, 14, 38, 5, 102, 69, 32, 12, 94, 76, 72, 65, 1, 96, 70, 68, 27, 4, 66, 6, 2, 73, 9], [59, 43, 57, 123, 107, 109, 110, 46, 63, 61, 55, 56, 125, 45, 105, 37, 126, 54, 52, 117, 29, 50, 101, 41, 115, 40, 99, 114, 28, 119, 116, 121, 104, 124, 23, 62, 53, 95, 22, 92, 127, 51, 58, 49, 86, 103, 80, 122, 112, 31, 118, 16, 44, 39, 97, 113, 13, 77, 93, 20, 87, 88, 21, 90, 111, 24, 60, 120, 19, 85, 48, 34, 83, 84, 47, 91, 30, 108, 74, 33, 100, 26, 18, 35, 10, 82, 42, 98, 25, 89, 36, 7, 15, 71, 102, 11, 106, 32, 8, 0, 75, 79, 81, 94, 27, 14, 12, 17, 67, 64, 3, 69, 96, 38, 78, 5, 66, 65, 76, 68, 73, 72, 1, 70, 6, 9, 2, 4], [59, 43, 57, 123, 107, 109, 110, 46, 63, 61, 125, 56, 55, 45, 105, 37, 52, 29, 54, 126, 41, 114, 121, 116, 40, 50, 117, 28, 62, 101, 115, 95, 104, 99, 44, 23, 119, 80, 22, 103, 16, 60, 92, 118, 13, 124, 58, 39, 86, 88, 77, 111, 31, 87, 112, 127, 20, 93, 51, 53, 49, 21, 84, 122, 85, 113, 83, 10, 97, 74, 90, 19, 24, 33, 120, 82, 25, 98, 26, 18, 47, 34, 108, 48, 7, 100, 35, 36, 89, 91, 71, 79, 8, 75, 11, 106, 30, 12, 76, 32, 42, 15, 0, 78, 102, 14, 69, 38, 17, 96, 81, 3, 73, 67, 64, 5, 94, 6, 9, 65, 27, 70, 66, 4, 72, 68, 2, 1], [59, 43, 57, 123, 109, 107, 110, 46, 61, 63, 125, 55, 56, 45, 105, 37, 52, 126, 29, 41, 54, 114, 117, 40, 50, 99, 28, 127, 101, 121, 23, 22, 95, 86, 115, 16, 80, 31, 60, 111, 62, 116, 44, 104, 58, 77, 53, 92, 88, 49, 113, 13, 103, 124, 119, 87, 39, 21, 51, 20, 118, 19, 97, 122, 84, 112, 93, 18, 47, 10, 74, 90, 108, 24, 34, 48, 85, 83, 26, 120, 82, 100, 35, 98, 25, 33, 7, 30, 8, 89, 36, 71, 11, 106, 14, 42, 91, 79, 15, 102, 75, 76, 0, 78, 32, 96, 17, 81, 64, 12, 94, 69, 67, 3, 6, 38, 5, 73, 65, 68, 9, 70, 1, 4, 27, 66, 2, 72], [59, 43, 57, 123, 107, 109, 46, 110, 61, 63, 55, 125, 56, 45, 105, 37, 54, 126, 52, 50, 29, 117, 114, 101, 41, 121, 115, 28, 99, 40, 31, 112, 49, 116, 124, 62, 92, 22, 23, 95, 127, 113, 58, 86, 44, 88, 97, 60, 39, 87, 103, 104, 118, 122, 80, 16, 111, 21, 13, 53, 77, 24, 84, 119, 51, 85, 34, 20, 120, 100, 19, 48, 90, 93, 26, 47, 74, 83, 42, 33, 106, 108, 36, 18, 30, 89, 10, 98, 82, 35, 7, 25, 91, 11, 0, 64, 96, 15, 75, 32, 102, 79, 78, 8, 14, 71, 94, 81, 27, 76, 69, 12, 38, 3, 17, 67, 1, 5, 68, 73, 65, 6, 66, 2, 4, 9, 70, 72], [59, 43, 57, 123, 109, 107, 46, 110, 61, 63, 55, 125, 56, 45, 105, 37, 52, 126, 54, 29, 101, 50, 114, 41, 121, 40, 115, 117, 99, 62, 124, 28, 53, 127, 23, 95, 116, 58, 119, 31, 92, 80, 49, 44, 104, 118, 22, 86, 39, 16, 13, 112, 51, 111, 122, 88, 103, 60, 87, 97, 48, 77, 120, 20, 19, 34, 21, 113, 24, 85, 93, 84, 26, 47, 100, 74, 90, 10, 108, 33, 98, 83, 35, 106, 18, 82, 36, 42, 91, 7, 25, 30, 89, 38, 102, 79, 96, 8, 71, 81, 12, 14, 11, 78, 15, 94, 17, 75, 5, 76, 64, 32, 27, 0, 3, 69, 67, 4, 6, 72, 73, 65, 9, 68, 66, 2, 1, 70], [59, 43, 57, 123, 109, 107, 110, 46, 63, 61, 125, 55, 56, 45, 105, 37, 52, 126, 29, 54, 114, 121, 41, 50, 101, 117, 99, 40, 116, 28, 115, 124, 119, 80, 118, 104, 44, 95, 86, 49, 16, 23, 103, 13, 127, 62, 92, 22, 77, 58, 31, 51, 87, 112, 60, 39, 21, 88, 53, 113, 24, 111, 85, 93, 48, 84, 19, 10, 20, 122, 34, 74, 83, 120, 108, 97, 90, 18, 82, 26, 7, 106, 47, 33, 36, 100, 98, 42, 35, 71, 25, 0, 89, 64, 11, 79, 30, 69, 75, 3, 91, 78, 102, 14, 17, 12, 8, 67, 32, 15, 76, 96, 81, 5, 38, 94, 65, 6, 70, 72, 27, 68, 73, 4, 9, 66, 1, 2], [59, 43, 57, 123, 109, 110, 107, 46, 61, 63, 56, 125, 55, 45, 105, 52, 37, 54, 126, 29, 41, 50, 117, 53, 114, 101, 121, 40, 99, 60, 44, 104, 28, 95, 116, 58, 127, 124, 115, 80, 119, 86, 23, 111, 22, 92, 16, 112, 118, 103, 49, 77, 31, 88, 87, 39, 13, 62, 113, 20, 21, 10, 48, 93, 74, 122, 24, 51, 84, 97, 108, 33, 34, 19, 64, 106, 90, 83, 35, 26, 7, 85, 82, 120, 47, 36, 18, 98, 71, 100, 91, 11, 79, 0, 75, 42, 96, 102, 30, 15, 76, 78, 69, 17, 25, 89, 3, 67, 12, 32, 38, 14, 72, 65, 66, 81, 8, 5, 70, 4, 68, 94, 9, 1, 6, 73, 2, 27], [43, 59, 57, 123, 107, 109, 110, 46, 61, 63, 55, 56, 125, 45, 105, 52, 37, 54, 126, 29, 50, 41, 101, 114, 99, 116, 117, 115, 40, 112, 127, 28, 22, 23, 95, 60, 119, 16, 124, 104, 80, 86, 13, 118, 88, 92, 58, 44, 121, 103, 77, 62, 53, 31, 87, 49, 97, 113, 21, 85, 39, 93, 111, 20, 84, 10, 83, 19, 51, 122, 34, 90, 108, 24, 26, 48, 47, 120, 74, 33, 100, 106, 82, 18, 36, 98, 7, 91, 75, 25, 71, 35, 30, 15, 78, 64, 32, 17, 89, 12, 42, 14, 96, 81, 79, 11, 0, 102, 38, 3, 72, 5, 69, 76, 70, 67, 9, 65, 4, 8, 68, 2, 94, 66, 6, 73, 1, 27], [59, 43, 57, 123, 109, 110, 107, 46, 61, 63, 55, 125, 56, 45, 105, 52, 37, 126, 54, 29, 101, 50, 41, 28, 117, 115, 40, 116, 121, 99, 114, 112, 92, 104, 119, 124, 62, 127, 23, 16, 95, 58, 103, 80, 22, 31, 86, 39, 13, 88, 60, 53, 44, 113, 87, 118, 97, 111, 77, 34, 47, 85, 49, 21, 24, 122, 84, 108, 120, 20, 51, 48, 19, 93, 74, 90, 10, 33, 83, 18, 82, 100, 26, 0, 98, 7, 106, 35, 91, 71, 36, 64, 89, 11, 25, 42, 79, 67, 75, 102, 72, 96, 30, 15, 69, 14, 32, 12, 3, 5, 76, 68, 38, 81, 17, 70, 78, 94, 9, 2, 73, 4, 65, 8, 6, 27, 1, 66], [59, 43, 57, 123, 109, 107, 110, 46, 61, 63, 55, 56, 125, 45, 105, 52, 37, 126, 54, 29, 114, 41, 101, 50, 40, 115, 116, 28, 99, 117, 112, 95, 121, 127, 22, 23, 80, 86, 119, 92, 62, 103, 16, 87, 31, 39, 53, 13, 124, 60, 118, 44, 58, 88, 104, 113, 77, 51, 49, 21, 97, 19, 48, 93, 84, 111, 24, 10, 34, 33, 122, 85, 47, 20, 120, 74, 90, 82, 108, 83, 18, 26, 35, 36, 91, 100, 7, 75, 30, 25, 98, 14, 72, 106, 71, 79, 42, 89, 11, 76, 102, 96, 32, 17, 12, 64, 78, 15, 94, 5, 81, 27, 38, 73, 69, 0, 67, 9, 3, 70, 2, 68, 8, 66, 4, 1, 65, 6]], "model.layers.27.self_attn.q_proj": [[109, 45, 94, 90, 33, 23, 83, 81, 21, 60, 54, 117, 79, 125, 76, 28, 78, 123, 111, 112, 39, 58, 73, 62, 56, 61, 51, 5, 32, 59, 38, 114, 57, 115, 6, 52, 48, 105, 11, 35, 97, 43, 55, 100, 9, 122, 113, 106, 46, 7, 37, 53, 121, 10, 116, 98, 118, 63, 24, 120, 3, 126, 19, 0, 110, 49, 87, 17, 44, 22, 85, 50, 119, 75, 104, 124, 47, 103, 42, 127, 95, 101, 31, 29, 88, 108, 92, 25, 13, 4, 26, 14, 36, 30, 40, 80, 18, 107, 41, 20, 84, 86, 27, 93, 102, 99, 96, 15, 77, 89, 91, 82, 34, 74, 16, 66, 70, 72, 65, 8, 12, 64, 1, 2, 68, 69, 71, 67], [109, 45, 33, 94, 90, 21, 83, 23, 81, 79, 76, 105, 54, 28, 112, 73, 97, 123, 99, 7, 117, 75, 121, 6, 5, 127, 60, 39, 4, 38, 11, 9, 47, 125, 78, 106, 3, 0, 37, 48, 32, 56, 115, 111, 62, 63, 26, 71, 14, 74, 2, 46, 120, 18, 58, 119, 35, 113, 114, 17, 51, 110, 87, 16, 19, 49, 85, 1, 66, 107, 36, 70, 77, 59, 98, 126, 53, 89, 57, 44, 24, 50, 31, 124, 43, 22, 102, 122, 88, 55, 52, 42, 103, 108, 10, 92, 118, 82, 86, 104, 96, 80, 34, 30, 65, 116, 20, 84, 95, 41, 13, 29, 8, 61, 101, 25, 91, 72, 40, 100, 67, 69, 93, 27, 64, 15, 12, 68], [109, 45, 94, 90, 33, 83, 23, 21, 76, 79, 81, 123, 125, 28, 54, 39, 105, 99, 6, 73, 112, 111, 106, 5, 127, 60, 62, 58, 56, 9, 78, 38, 70, 32, 50, 3, 115, 44, 42, 72, 75, 66, 37, 40, 17, 48, 7, 97, 11, 63, 124, 52, 18, 84, 89, 51, 120, 10, 113, 101, 26, 87, 25, 98, 47, 41, 88, 80, 57, 59, 14, 49, 85, 0, 108, 19, 117, 126, 24, 31, 30, 95, 77, 116, 104, 103, 119, 107, 12, 74, 121, 110, 22, 1, 35, 102, 13, 4, 2, 55, 46, 34, 53, 118, 92, 96, 43, 91, 61, 114, 29, 122, 8, 100, 86, 16, 36, 82, 20, 93, 65, 15, 27, 71, 69, 67, 68, 64], [109, 45, 94, 90, 33, 125, 83, 23, 117, 81, 21, 76, 115, 32, 124, 57, 119, 113, 112, 79, 54, 121, 114, 9, 28, 123, 58, 39, 47, 51, 52, 60, 91, 118, 122, 53, 59, 29, 48, 49, 106, 61, 24, 14, 30, 101, 13, 110, 37, 38, 43, 100, 126, 116, 44, 17, 85, 80, 87, 50, 127, 5, 108, 95, 92, 88, 102, 103, 25, 63, 111, 99, 73, 96, 97, 55, 19, 107, 56, 40, 120, 89, 41, 75, 46, 6, 36, 84, 78, 22, 62, 42, 82, 20, 18, 35, 31, 16, 27, 26, 104, 98, 105, 93, 34, 86, 15, 12, 77, 65, 10, 8, 7, 74, 72, 11, 71, 0, 1, 68, 3, 69, 70, 4, 67, 66, 2, 64], [118, 53, 120, 63, 101, 60, 127, 121, 84, 57, 126, 119, 88, 56, 50, 112, 61, 124, 92, 52, 17, 54, 125, 117, 123, 24, 115, 113, 33, 55, 116, 49, 58, 59, 62, 51, 94, 110, 122, 93, 48, 47, 111, 45, 114, 43, 30, 37, 108, 39, 46, 44, 7, 14, 107, 90, 41, 109, 11, 104, 26, 9, 25, 74, 78, 20, 91, 77, 42, 103, 106, 5, 105, 29, 31, 75, 40, 22, 32, 80, 38, 36, 100, 21, 18, 6, 13, 72, 95, 99, 96, 73, 79, 4, 34, 16, 86, 35, 98, 83, 12, 3, 19, 81, 66, 10, 102, 85, 28, 97, 69, 23, 27, 87, 2, 89, 64, 1, 65, 70, 68, 71, 82, 0, 15, 67, 8, 76], [53, 120, 118, 101, 63, 60, 127, 50, 56, 57, 61, 124, 126, 84, 123, 121, 112, 119, 54, 115, 62, 52, 49, 117, 88, 93, 113, 116, 58, 110, 125, 55, 51, 122, 59, 108, 39, 111, 48, 47, 92, 114, 33, 45, 17, 46, 107, 37, 44, 14, 24, 77, 30, 90, 7, 109, 41, 94, 104, 26, 43, 105, 20, 103, 74, 106, 91, 9, 42, 83, 4, 19, 100, 25, 96, 3, 5, 11, 22, 29, 23, 40, 75, 38, 34, 80, 12, 6, 16, 73, 21, 32, 18, 36, 78, 98, 102, 95, 31, 89, 99, 86, 35, 27, 85, 72, 97, 79, 28, 81, 10, 87, 64, 65, 13, 70, 66, 1, 2, 71, 15, 82, 69, 76, 68, 8, 0, 67], [120, 118, 53, 101, 63, 60, 126, 121, 127, 56, 57, 84, 50, 119, 61, 124, 54, 112, 52, 62, 123, 115, 117, 125, 58, 116, 55, 110, 88, 93, 39, 113, 122, 49, 59, 47, 17, 51, 48, 111, 92, 108, 114, 33, 46, 14, 45, 43, 94, 109, 37, 90, 30, 11, 26, 41, 7, 25, 44, 107, 104, 24, 106, 9, 103, 42, 77, 22, 20, 100, 105, 74, 29, 91, 40, 78, 6, 34, 80, 5, 38, 31, 102, 18, 98, 95, 36, 32, 96, 73, 3, 85, 83, 16, 4, 86, 12, 79, 10, 28, 19, 75, 99, 27, 23, 21, 35, 13, 97, 89, 72, 81, 64, 66, 65, 1, 87, 70, 15, 71, 82, 67, 69, 0, 76, 68, 2, 8], [63, 53, 120, 118, 101, 60, 57, 50, 84, 127, 62, 61, 54, 112, 56, 124, 121, 88, 123, 115, 119, 117, 52, 49, 55, 126, 116, 39, 113, 58, 110, 122, 30, 59, 51, 125, 93, 111, 44, 48, 108, 114, 47, 14, 24, 45, 90, 43, 17, 46, 94, 77, 37, 33, 109, 92, 103, 74, 107, 11, 5, 7, 104, 73, 106, 98, 26, 9, 91, 41, 42, 105, 100, 36, 6, 83, 3, 18, 4, 40, 80, 32, 22, 38, 102, 85, 28, 25, 96, 20, 27, 75, 35, 34, 31, 95, 21, 23, 16, 78, 19, 99, 79, 72, 97, 86, 89, 10, 66, 29, 81, 12, 87, 13, 82, 65, 68, 15, 0, 64, 71, 2, 70, 1, 76, 69, 67, 8], [40, 98, 63, 23, 31, 85, 80, 26, 121, 60, 13, 19, 125, 54, 82, 122, 79, 8, 49, 74, 6, 1, 117, 55, 113, 105, 59, 57, 9, 12, 106, 127, 111, 52, 46, 66, 50, 119, 56, 58, 53, 112, 43, 103, 11, 27, 120, 108, 39, 126, 4, 124, 37, 24, 62, 123, 109, 28, 115, 38, 67, 100, 107, 64, 0, 90, 51, 15, 73, 75, 47, 61, 76, 42, 116, 97, 78, 21, 99, 35, 18, 30, 25, 104, 36, 118, 41, 87, 48, 88, 44, 102, 32, 93, 45, 94, 84, 101, 114, 3, 95, 33, 68, 5, 83, 110, 34, 96, 77, 29, 20, 89, 16, 91, 86, 69, 92, 7, 14, 81, 17, 72, 70, 22, 65, 71, 10, 2], [40, 63, 98, 31, 80, 23, 60, 85, 13, 74, 6, 4, 19, 8, 22, 64, 52, 66, 121, 46, 108, 28, 124, 56, 122, 106, 0, 79, 125, 104, 54, 65, 11, 55, 107, 113, 75, 59, 18, 90, 48, 82, 7, 1, 58, 73, 126, 119, 12, 68, 117, 84, 127, 83, 105, 34, 57, 49, 21, 120, 35, 77, 109, 20, 101, 111, 94, 47, 14, 15, 39, 81, 30, 97, 24, 100, 96, 2, 3, 16, 62, 114, 10, 53, 33, 32, 70, 9, 25, 112, 95, 17, 123, 116, 38, 89, 5, 91, 87, 88, 72, 86, 76, 51, 71, 78, 29, 67, 69, 43, 27, 36, 103, 44, 115, 50, 93, 41, 110, 102, 99, 26, 92, 42, 45, 37, 61, 118], [40, 63, 98, 31, 60, 8, 80, 85, 19, 13, 23, 6, 74, 1, 66, 121, 64, 122, 117, 124, 4, 26, 46, 106, 15, 52, 28, 54, 57, 109, 55, 125, 0, 89, 104, 87, 12, 24, 120, 56, 79, 119, 105, 68, 69, 108, 72, 27, 7, 113, 11, 71, 59, 126, 42, 21, 48, 18, 76, 83, 33, 78, 84, 103, 96, 75, 32, 16, 82, 65, 115, 20, 41, 90, 81, 30, 77, 50, 2, 62, 3, 51, 25, 116, 92, 123, 5, 114, 127, 67, 37, 93, 88, 61, 10, 47, 29, 118, 58, 36, 38, 107, 39, 94, 17, 70, 111, 97, 14, 49, 86, 110, 45, 91, 73, 100, 101, 35, 44, 53, 112, 99, 102, 22, 43, 9, 95, 34], [40, 63, 98, 23, 85, 60, 26, 80, 31, 13, 8, 74, 19, 66, 82, 6, 121, 59, 52, 106, 4, 58, 41, 122, 113, 46, 119, 28, 49, 48, 95, 57, 84, 5, 120, 111, 105, 125, 65, 38, 67, 79, 54, 55, 108, 109, 117, 2, 73, 102, 90, 7, 123, 47, 61, 76, 64, 0, 18, 107, 50, 83, 45, 69, 56, 53, 97, 32, 115, 77, 81, 87, 71, 124, 15, 42, 89, 103, 91, 11, 30, 44, 114, 126, 21, 118, 78, 100, 43, 24, 39, 96, 36, 62, 17, 68, 35, 127, 93, 20, 29, 51, 116, 16, 3, 22, 94, 27, 70, 75, 9, 112, 88, 33, 110, 92, 37, 10, 101, 25, 14, 12, 72, 86, 1, 99, 104, 34], [104, 120, 98, 46, 95, 126, 115, 52, 44, 91, 108, 59, 56, 60, 54, 27, 84, 58, 88, 96, 118, 82, 24, 122, 51, 36, 50, 21, 45, 42, 48, 112, 101, 57, 62, 124, 114, 55, 63, 113, 92, 119, 117, 116, 61, 76, 125, 86, 85, 47, 127, 23, 53, 123, 111, 121, 49, 43, 38, 13, 107, 110, 109, 22, 41, 106, 74, 99, 103, 105, 80, 89, 33, 97, 31, 29, 32, 19, 6, 78, 37, 102, 25, 39, 69, 94, 28, 100, 35, 64, 90, 18, 93, 8, 30, 15, 26, 83, 65, 87, 34, 14, 12, 0, 40, 2, 79, 20, 66, 81, 72, 7, 1, 17, 3, 73, 16, 67, 11, 4, 75, 10, 5, 68, 77, 71, 9, 70], [120, 104, 98, 95, 44, 108, 126, 46, 113, 125, 54, 27, 88, 91, 55, 58, 127, 122, 61, 59, 41, 60, 115, 50, 110, 106, 56, 84, 36, 118, 116, 109, 123, 22, 38, 103, 62, 124, 111, 57, 96, 63, 114, 49, 112, 43, 28, 121, 40, 117, 53, 48, 35, 42, 24, 52, 45, 51, 101, 107, 119, 37, 47, 39, 105, 82, 94, 19, 85, 86, 83, 97, 102, 21, 12, 79, 93, 73, 26, 25, 81, 71, 100, 99, 31, 29, 92, 9, 30, 4, 33, 32, 89, 34, 23, 76, 74, 78, 90, 80, 15, 20, 18, 87, 8, 14, 10, 17, 70, 69, 72, 66, 7, 6, 2, 68, 13, 75, 3, 67, 5, 16, 77, 11, 65, 0, 64, 1], [104, 120, 98, 95, 126, 91, 80, 84, 13, 82, 22, 86, 115, 27, 74, 44, 31, 93, 116, 36, 16, 97, 69, 88, 20, 77, 60, 100, 52, 59, 58, 90, 56, 6, 108, 125, 72, 94, 45, 2, 19, 122, 32, 25, 3, 62, 63, 18, 54, 114, 92, 96, 42, 64, 41, 43, 75, 15, 29, 113, 68, 55, 87, 30, 46, 106, 11, 118, 102, 33, 79, 48, 81, 76, 24, 99, 78, 103, 117, 70, 89, 83, 57, 85, 50, 65, 37, 8, 23, 21, 35, 26, 67, 17, 61, 73, 47, 107, 9, 4, 7, 1, 111, 38, 49, 71, 10, 12, 28, 14, 110, 53, 5, 127, 0, 34, 39, 66, 105, 101, 124, 121, 119, 112, 109, 123, 51, 40], [104, 120, 98, 46, 95, 82, 84, 96, 91, 80, 115, 14, 126, 44, 66, 48, 125, 27, 13, 67, 60, 29, 42, 116, 68, 74, 93, 8, 28, 85, 63, 102, 113, 36, 0, 23, 55, 117, 101, 105, 38, 6, 76, 65, 1, 21, 35, 24, 111, 54, 39, 45, 56, 99, 88, 26, 32, 119, 103, 86, 37, 50, 17, 108, 107, 64, 9, 97, 109, 52, 79, 40, 90, 16, 22, 31, 100, 92, 124, 41, 94, 122, 25, 58, 61, 62, 59, 47, 121, 72, 57, 112, 34, 123, 81, 15, 43, 110, 5, 89, 70, 20, 19, 106, 30, 114, 127, 83, 51, 118, 49, 53, 12, 87, 33, 11, 73, 78, 18, 69, 75, 10, 77, 3, 4, 7, 71, 2], [111, 100, 47, 56, 24, 53, 95, 121, 120, 58, 31, 54, 77, 36, 82, 103, 86, 57, 1, 124, 59, 119, 60, 84, 127, 16, 52, 83, 104, 97, 94, 10, 63, 45, 105, 49, 64, 116, 110, 28, 122, 62, 51, 126, 108, 92, 42, 112, 67, 113, 39, 27, 55, 125, 43, 102, 61, 46, 19, 118, 26, 40, 50, 8, 68, 123, 4, 38, 44, 114, 85, 71, 2, 109, 115, 22, 73, 6, 15, 41, 117, 99, 48, 66, 69, 106, 98, 37, 107, 101, 20, 35, 90, 0, 96, 81, 70, 30, 32, 89, 33, 17, 80, 78, 9, 88, 65, 23, 29, 91, 34, 87, 93, 5, 25, 11, 12, 18, 79, 13, 72, 21, 3, 75, 7, 74, 14, 76], [111, 47, 58, 100, 24, 31, 122, 95, 61, 106, 51, 125, 82, 54, 84, 28, 113, 26, 85, 56, 53, 20, 126, 110, 117, 112, 30, 77, 127, 50, 13, 46, 118, 98, 83, 36, 104, 22, 55, 6, 12, 79, 120, 39, 45, 37, 49, 15, 123, 86, 116, 41, 94, 52, 119, 78, 59, 124, 14, 121, 17, 57, 114, 92, 43, 108, 105, 60, 109, 101, 63, 25, 107, 99, 88, 23, 103, 115, 44, 68, 96, 40, 27, 90, 80, 35, 62, 0, 29, 97, 32, 73, 48, 74, 33, 89, 87, 38, 102, 3, 42, 21, 81, 10, 34, 72, 66, 76, 18, 16, 93, 19, 91, 11, 65, 7, 8, 69, 75, 9, 4, 2, 5, 71, 67, 70, 64, 1], [111, 47, 100, 58, 56, 24, 95, 31, 114, 115, 52, 94, 106, 83, 121, 36, 82, 119, 53, 86, 54, 120, 112, 110, 28, 85, 113, 55, 103, 51, 49, 92, 104, 59, 109, 118, 17, 84, 126, 63, 127, 20, 48, 50, 60, 62, 45, 117, 30, 122, 116, 105, 38, 57, 124, 61, 77, 96, 46, 125, 16, 107, 43, 41, 123, 39, 108, 88, 42, 102, 22, 80, 90, 26, 40, 79, 6, 78, 44, 98, 101, 10, 89, 37, 97, 35, 12, 81, 68, 99, 34, 15, 11, 33, 0, 19, 23, 13, 29, 73, 9, 21, 32, 27, 91, 93, 71, 72, 25, 18, 74, 87, 66, 3, 14, 8, 76, 7, 4, 75, 64, 67, 5, 69, 65, 70, 1, 2], [111, 47, 58, 100, 56, 24, 95, 31, 127, 121, 122, 106, 115, 36, 94, 120, 44, 62, 82, 28, 83, 108, 116, 126, 59, 53, 52, 112, 85, 77, 54, 92, 118, 84, 105, 51, 113, 104, 60, 49, 17, 107, 110, 124, 45, 109, 20, 63, 57, 61, 103, 86, 55, 43, 114, 46, 48, 123, 125, 78, 6, 80, 119, 102, 96, 42, 30, 117, 38, 26, 39, 37, 33, 16, 50, 22, 41, 90, 34, 19, 12, 10, 40, 79, 32, 15, 88, 101, 74, 98, 91, 27, 29, 93, 81, 89, 35, 99, 97, 66, 11, 23, 68, 13, 65, 21, 25, 87, 71, 72, 8, 7, 3, 18, 1, 73, 76, 14, 0, 9, 4, 75, 5, 70, 69, 2, 67, 64], [48, 41, 51, 62, 125, 55, 121, 112, 24, 88, 30, 57, 52, 54, 60, 53, 123, 114, 49, 119, 115, 118, 117, 113, 56, 124, 110, 126, 61, 97, 50, 127, 94, 116, 47, 44, 27, 120, 122, 106, 58, 90, 103, 82, 111, 91, 63, 59, 36, 109, 107, 108, 89, 45, 37, 43, 46, 79, 21, 42, 19, 22, 104, 28, 39, 101, 102, 105, 93, 15, 83, 38, 35, 80, 33, 40, 98, 32, 99, 34, 18, 73, 86, 100, 95, 76, 96, 71, 84, 13, 16, 29, 92, 77, 85, 87, 25, 9, 31, 20, 26, 65, 17, 23, 81, 7, 3, 66, 68, 5, 12, 6, 1, 64, 0, 2, 67, 4, 70, 14, 11, 69, 78, 74, 10, 75, 8, 72], [41, 48, 51, 119, 112, 62, 20, 89, 105, 14, 81, 115, 80, 11, 97, 10, 12, 30, 8, 125, 54, 27, 60, 5, 70, 78, 25, 121, 22, 49, 52, 84, 56, 117, 0, 28, 118, 72, 57, 87, 126, 61, 29, 127, 104, 17, 37, 110, 96, 124, 55, 93, 50, 94, 67, 3, 111, 88, 123, 114, 58, 16, 122, 23, 90, 53, 36, 45, 120, 2, 74, 100, 44, 92, 113, 76, 32, 46, 31, 116, 83, 24, 102, 43, 47, 19, 26, 75, 59, 108, 103, 107, 21, 106, 63, 109, 6, 71, 101, 95, 91, 98, 34, 42, 39, 77, 18, 35, 33, 99, 7, 68, 38, 40, 79, 86, 65, 69, 4, 15, 82, 85, 13, 9, 66, 1, 64, 73], [51, 48, 41, 119, 125, 24, 30, 121, 52, 62, 57, 124, 115, 113, 54, 88, 55, 60, 53, 123, 49, 117, 110, 27, 118, 127, 56, 126, 97, 61, 112, 50, 114, 103, 120, 94, 122, 58, 36, 91, 116, 44, 111, 47, 106, 101, 59, 82, 90, 107, 109, 46, 63, 108, 43, 45, 42, 37, 39, 22, 104, 102, 15, 35, 93, 77, 40, 79, 89, 21, 18, 33, 38, 19, 28, 105, 9, 83, 26, 32, 84, 17, 100, 99, 98, 86, 13, 76, 5, 95, 92, 71, 73, 12, 65, 87, 34, 31, 29, 16, 96, 70, 7, 3, 66, 80, 23, 4, 11, 85, 1, 25, 81, 68, 6, 0, 2, 20, 14, 10, 64, 67, 75, 69, 74, 78, 8, 72], [119, 48, 41, 51, 125, 24, 121, 57, 62, 52, 55, 60, 30, 97, 54, 88, 113, 123, 53, 115, 56, 124, 49, 110, 117, 126, 127, 118, 114, 50, 61, 112, 116, 94, 120, 103, 58, 91, 122, 27, 36, 111, 44, 82, 47, 63, 109, 101, 59, 90, 107, 106, 46, 108, 37, 45, 89, 104, 43, 22, 39, 19, 79, 42, 102, 99, 98, 73, 28, 105, 38, 80, 35, 21, 34, 18, 93, 83, 40, 96, 100, 32, 15, 33, 86, 71, 77, 95, 65, 85, 13, 31, 26, 92, 29, 76, 16, 3, 84, 9, 23, 17, 87, 68, 66, 5, 81, 6, 7, 4, 75, 64, 0, 1, 12, 70, 2, 67, 25, 20, 11, 69, 10, 74, 14, 78, 8, 72], [106, 35, 42, 110, 91, 85, 50, 89, 52, 96, 83, 54, 113, 31, 51, 56, 122, 123, 114, 55, 14, 17, 32, 126, 57, 46, 117, 59, 53, 120, 62, 60, 27, 107, 75, 22, 9, 58, 29, 87, 119, 49, 70, 125, 15, 115, 16, 112, 109, 21, 61, 13, 48, 44, 63, 127, 39, 28, 68, 86, 23, 84, 41, 69, 19, 71, 45, 33, 92, 97, 25, 116, 111, 5, 20, 26, 34, 81, 121, 37, 79, 36, 38, 40, 124, 47, 94, 90, 11, 76, 82, 80, 18, 93, 105, 88, 43, 100, 104, 30, 108, 118, 103, 77, 24, 98, 102, 99, 95, 101, 78, 73, 8, 12, 72, 74, 10, 7, 65, 3, 6, 4, 66, 2, 0, 67, 1, 64], [106, 35, 42, 85, 31, 17, 83, 96, 89, 50, 52, 55, 51, 14, 9, 27, 113, 56, 69, 75, 62, 57, 110, 53, 93, 91, 122, 6, 117, 3, 66, 1, 61, 65, 127, 64, 87, 123, 32, 126, 7, 15, 59, 68, 13, 70, 114, 25, 23, 76, 84, 120, 0, 21, 63, 5, 22, 74, 10, 109, 54, 119, 71, 29, 11, 19, 16, 82, 72, 20, 8, 18, 4, 67, 78, 81, 77, 118, 41, 37, 73, 79, 88, 90, 38, 30, 121, 86, 115, 12, 48, 94, 98, 45, 43, 44, 26, 24, 33, 80, 39, 108, 100, 105, 99, 47, 36, 92, 28, 125, 107, 111, 102, 34, 49, 124, 116, 46, 2, 97, 60, 101, 112, 103, 104, 95, 58, 40], [106, 35, 42, 110, 91, 85, 50, 122, 83, 126, 56, 57, 17, 89, 117, 96, 113, 14, 54, 55, 31, 32, 59, 52, 114, 62, 127, 63, 61, 53, 22, 9, 116, 112, 75, 51, 94, 29, 27, 123, 48, 37, 65, 45, 99, 47, 25, 76, 7, 23, 58, 3, 21, 107, 69, 124, 46, 33, 66, 19, 115, 64, 93, 109, 41, 71, 120, 39, 118, 36, 84, 70, 119, 79, 97, 13, 105, 60, 92, 100, 87, 28, 81, 11, 5, 98, 12, 88, 111, 6, 44, 125, 86, 104, 102, 43, 108, 68, 49, 121, 26, 18, 103, 80, 20, 24, 90, 38, 82, 16, 30, 4, 95, 15, 40, 78, 101, 8, 73, 34, 74, 10, 77, 72, 67, 1, 0, 2], [106, 35, 42, 110, 56, 91, 85, 52, 53, 83, 122, 96, 57, 17, 114, 50, 113, 51, 32, 62, 31, 120, 89, 14, 55, 59, 117, 29, 112, 27, 22, 54, 46, 126, 49, 119, 75, 45, 99, 109, 94, 87, 123, 124, 115, 116, 61, 9, 118, 21, 92, 11, 43, 86, 60, 25, 37, 47, 39, 125, 127, 28, 58, 38, 34, 19, 23, 121, 108, 44, 33, 84, 107, 103, 41, 97, 111, 63, 88, 48, 40, 100, 30, 16, 102, 93, 104, 26, 101, 105, 81, 24, 90, 82, 98, 36, 15, 78, 20, 76, 18, 79, 95, 71, 4, 12, 13, 80, 74, 77, 68, 70, 73, 7, 3, 10, 8, 65, 69, 6, 72, 66, 67, 64, 0, 5, 1, 2], [61, 122, 118, 102, 49, 54, 109, 45, 59, 50, 116, 90, 103, 119, 63, 125, 101, 112, 108, 94, 55, 127, 56, 57, 113, 62, 111, 37, 124, 60, 38, 22, 110, 44, 121, 114, 123, 115, 47, 40, 117, 41, 42, 51, 48, 46, 96, 43, 120, 58, 53, 104, 93, 52, 107, 105, 106, 126, 36, 100, 39, 92, 31, 23, 11, 34, 86, 99, 97, 98, 32, 79, 30, 35, 33, 85, 81, 4, 83, 20, 88, 14, 18, 2, 26, 66, 95, 25, 76, 16, 15, 91, 73, 28, 29, 13, 3, 5, 27, 72, 10, 6, 82, 21, 24, 68, 89, 78, 80, 0, 70, 8, 71, 19, 84, 65, 75, 17, 74, 12, 87, 69, 67, 7, 1, 77, 9, 64], [102, 118, 54, 61, 116, 77, 9, 29, 7, 122, 23, 1, 64, 81, 69, 20, 4, 68, 49, 3, 15, 67, 82, 65, 26, 74, 45, 33, 66, 70, 90, 11, 113, 53, 0, 22, 75, 86, 19, 5, 112, 83, 73, 124, 71, 25, 87, 10, 96, 72, 18, 107, 119, 14, 36, 85, 24, 30, 78, 89, 6, 79, 91, 80, 13, 108, 59, 12, 94, 31, 17, 88, 27, 125, 93, 8, 16, 21, 120, 2, 84, 99, 63, 32, 76, 95, 110, 28, 55, 92, 56, 127, 98, 100, 101, 35, 57, 109, 46, 50, 39, 97, 47, 37, 34, 126, 42, 40, 104, 103, 38, 43, 111, 44, 60, 117, 123, 106, 58, 41, 114, 115, 48, 105, 121, 62, 52, 51], [102, 118, 54, 116, 122, 61, 45, 90, 49, 23, 112, 53, 33, 93, 113, 127, 15, 81, 38, 43, 107, 114, 57, 31, 44, 96, 125, 101, 20, 50, 119, 30, 24, 106, 83, 28, 26, 56, 59, 29, 124, 48, 55, 21, 39, 77, 11, 100, 60, 117, 6, 76, 22, 42, 67, 14, 51, 52, 126, 82, 37, 94, 111, 66, 92, 97, 47, 99, 108, 64, 58, 40, 5, 41, 19, 74, 46, 110, 63, 34, 98, 103, 4, 123, 62, 75, 36, 71, 105, 7, 86, 109, 32, 78, 120, 85, 89, 35, 121, 9, 12, 69, 80, 8, 84, 115, 1, 70, 18, 72, 95, 91, 17, 104, 25, 79, 88, 2, 27, 16, 3, 10, 73, 87, 68, 0, 13, 65], [102, 118, 116, 61, 122, 54, 49, 90, 23, 20, 45, 9, 92, 81, 74, 77, 29, 15, 22, 93, 33, 113, 107, 7, 124, 53, 11, 64, 26, 38, 112, 55, 6, 94, 82, 31, 21, 70, 79, 66, 100, 50, 28, 119, 72, 57, 4, 83, 44, 37, 41, 127, 86, 59, 101, 36, 89, 5, 3, 85, 58, 110, 121, 67, 126, 125, 106, 35, 43, 115, 75, 1, 103, 63, 108, 60, 117, 123, 68, 56, 114, 98, 39, 109, 69, 47, 19, 34, 104, 42, 51, 62, 48, 24, 97, 111, 96, 27, 120, 65, 105, 95, 30, 52, 84, 14, 2, 32, 88, 46, 99, 10, 16, 80, 78, 76, 87, 18, 12, 0, 8, 25, 40, 91, 17, 71, 73, 13]], "model.layers.27.self_attn.k_proj": [[45, 109, 83, 23, 21, 90, 94, 81, 76, 79, 33, 74, 30, 54, 97, 28, 125, 60, 6, 123, 48, 127, 52, 22, 51, 8, 0, 113, 49, 112, 32, 101, 47, 126, 73, 63, 106, 31, 122, 46, 118, 42, 103, 117, 124, 91, 24, 121, 96, 44, 18, 11, 35, 102, 50, 7, 37, 95, 108, 114, 13, 39, 4, 119, 43, 65, 105, 116, 120, 115, 16, 36, 57, 86, 89, 61, 104, 111, 53, 40, 56, 110, 34, 82, 55, 15, 29, 38, 107, 25, 26, 59, 9, 5, 62, 41, 92, 14, 58, 88, 80, 75, 99, 100, 27, 98, 78, 69, 93, 10, 84, 3, 20, 77, 87, 85, 17, 71, 72, 19, 66, 70, 68, 12, 1, 67, 2, 64], [53, 37, 22, 120, 63, 118, 97, 86, 58, 61, 60, 57, 116, 121, 125, 112, 113, 56, 55, 114, 26, 51, 119, 127, 62, 124, 52, 122, 50, 108, 110, 59, 117, 123, 48, 126, 95, 45, 115, 54, 49, 44, 93, 111, 109, 35, 47, 46, 42, 107, 96, 43, 41, 83, 28, 105, 40, 106, 29, 104, 15, 89, 38, 103, 102, 79, 100, 23, 101, 81, 32, 99, 82, 36, 39, 31, 12, 13, 34, 33, 18, 98, 72, 17, 91, 88, 92, 30, 78, 87, 27, 11, 25, 85, 94, 16, 20, 10, 9, 90, 6, 77, 7, 24, 84, 14, 68, 80, 21, 75, 5, 76, 19, 2, 69, 74, 3, 67, 71, 65, 73, 70, 0, 1, 4, 8, 64, 66], [104, 63, 34, 80, 13, 74, 23, 8, 85, 60, 19, 4, 64, 6, 95, 26, 66, 52, 121, 114, 106, 122, 105, 117, 49, 79, 54, 119, 65, 57, 110, 28, 7, 44, 127, 82, 124, 120, 43, 55, 56, 125, 1, 112, 107, 59, 45, 11, 98, 3, 116, 103, 46, 70, 41, 2, 9, 58, 126, 69, 39, 53, 5, 47, 84, 51, 18, 24, 71, 115, 0, 123, 42, 50, 108, 67, 48, 31, 14, 96, 12, 20, 62, 61, 100, 35, 27, 75, 68, 97, 22, 118, 30, 99, 102, 36, 29, 109, 111, 10, 25, 83, 37, 113, 76, 78, 17, 32, 101, 94, 72, 89, 38, 33, 88, 93, 90, 15, 92, 81, 86, 91, 73, 87, 77, 21, 16, 40], [40, 120, 34, 27, 31, 110, 126, 56, 84, 82, 88, 74, 46, 80, 52, 13, 125, 64, 48, 93, 67, 108, 6, 16, 58, 85, 49, 76, 65, 72, 114, 23, 44, 30, 106, 77, 55, 100, 45, 115, 66, 117, 68, 111, 60, 86, 107, 5, 20, 22, 69, 79, 8, 78, 75, 94, 54, 59, 53, 21, 91, 61, 24, 57, 63, 109, 102, 124, 96, 73, 62, 112, 42, 87, 90, 15, 81, 83, 28, 2, 116, 101, 122, 123, 105, 103, 12, 121, 113, 127, 70, 38, 7, 41, 92, 118, 3, 119, 29, 33, 43, 39, 97, 51, 50, 36, 26, 89, 32, 11, 25, 99, 17, 37, 18, 47, 35, 0, 14, 98, 95, 4, 19, 71, 10, 9, 1, 104], [47, 111, 58, 36, 31, 86, 24, 92, 56, 82, 0, 26, 17, 15, 6, 16, 121, 3, 83, 12, 54, 11, 105, 91, 113, 84, 116, 20, 66, 85, 45, 55, 19, 77, 63, 46, 51, 126, 57, 78, 43, 124, 25, 61, 119, 68, 49, 42, 38, 117, 48, 44, 104, 59, 118, 60, 89, 50, 114, 52, 112, 53, 103, 127, 123, 7, 115, 97, 122, 120, 65, 110, 62, 99, 9, 41, 1, 8, 5, 39, 40, 34, 29, 109, 107, 37, 10, 108, 94, 93, 125, 35, 106, 71, 32, 74, 101, 21, 102, 72, 76, 4, 33, 67, 90, 22, 87, 64, 79, 98, 81, 100, 96, 30, 23, 69, 27, 18, 80, 2, 13, 75, 28, 73, 88, 70, 14, 95], [105, 48, 22, 119, 33, 51, 62, 112, 54, 99, 49, 61, 94, 118, 125, 60, 55, 122, 50, 121, 127, 35, 45, 56, 124, 106, 117, 123, 126, 108, 40, 52, 120, 114, 53, 82, 59, 100, 111, 47, 63, 98, 58, 39, 24, 110, 57, 44, 28, 32, 113, 92, 46, 115, 43, 79, 109, 91, 107, 41, 38, 34, 101, 42, 26, 116, 19, 18, 29, 102, 103, 36, 9, 15, 104, 73, 85, 88, 96, 37, 97, 21, 80, 77, 13, 90, 23, 31, 89, 30, 93, 95, 87, 4, 81, 27, 25, 83, 20, 14, 11, 71, 76, 1, 86, 12, 16, 74, 66, 17, 7, 8, 78, 10, 5, 75, 68, 6, 72, 84, 70, 64, 67, 3, 69, 2, 65, 0], [42, 85, 83, 89, 17, 114, 126, 120, 55, 35, 46, 14, 9, 56, 61, 127, 32, 117, 99, 123, 109, 57, 49, 62, 3, 75, 106, 69, 95, 7, 45, 91, 122, 48, 115, 15, 119, 53, 50, 51, 31, 93, 110, 59, 52, 0, 1, 87, 116, 13, 64, 58, 6, 54, 29, 44, 60, 23, 84, 111, 43, 66, 121, 41, 40, 38, 22, 113, 112, 74, 97, 107, 5, 10, 125, 68, 30, 108, 8, 118, 124, 20, 104, 80, 71, 98, 101, 76, 100, 102, 47, 33, 63, 11, 25, 39, 88, 26, 92, 103, 82, 12, 36, 105, 34, 94, 90, 37, 96, 16, 28, 86, 27, 72, 24, 18, 77, 79, 70, 21, 65, 4, 78, 81, 19, 73, 2, 67], [118, 38, 61, 54, 116, 113, 64, 122, 65, 0, 109, 69, 7, 23, 77, 74, 90, 93, 3, 15, 94, 108, 81, 97, 9, 59, 2, 48, 6, 68, 52, 124, 53, 117, 112, 43, 82, 75, 57, 119, 103, 20, 30, 62, 50, 19, 111, 123, 21, 83, 106, 115, 110, 127, 56, 67, 107, 120, 60, 125, 11, 63, 47, 46, 105, 96, 102, 66, 49, 51, 121, 55, 104, 22, 114, 126, 100, 44, 41, 40, 58, 4, 84, 39, 36, 45, 35, 98, 76, 33, 89, 34, 1, 8, 42, 14, 37, 92, 101, 95, 27, 88, 5, 80, 85, 16, 99, 72, 28, 70, 31, 79, 12, 24, 25, 91, 32, 86, 10, 18, 71, 78, 87, 13, 73, 29, 17, 26]], "model.layers.27.self_attn.qk_proj": [[120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 61, 106, 119, 51, 54, 58, 104, 60, 122, 56, 126, 40, 116, 31, 85, 94, 62, 52, 35, 55, 26, 83, 21, 91, 19, 125, 87, 121, 102, 110, 127, 95, 57, 117, 90, 23, 105, 46, 113, 49, 112, 108, 33, 80, 41, 77, 6, 16, 81, 34, 13, 98, 84, 88, 27, 50, 17, 10, 115, 86, 44, 38, 79, 59, 114, 24, 15, 20, 22, 37, 124, 36, 74, 123, 30, 64, 97, 100, 28, 0, 25, 18, 107, 93, 12, 76, 82, 101, 72, 43, 39, 11, 96, 92, 29, 89, 32, 9, 103, 69, 8, 99, 4, 65, 70, 73, 67, 14, 68, 3, 75, 66, 78, 2, 5, 71, 1, 7], [120, 63, 118, 45, 109, 47, 111, 42, 53, 48, 51, 119, 106, 61, 54, 122, 58, 104, 60, 126, 125, 56, 31, 116, 62, 40, 121, 35, 55, 52, 85, 102, 19, 94, 95, 21, 46, 26, 23, 127, 105, 110, 57, 113, 112, 90, 33, 6, 117, 98, 49, 91, 87, 77, 108, 41, 13, 124, 50, 83, 80, 44, 59, 17, 114, 86, 115, 22, 123, 24, 38, 27, 16, 36, 88, 84, 15, 10, 74, 81, 79, 37, 34, 64, 93, 28, 20, 100, 43, 30, 0, 82, 101, 97, 107, 8, 96, 12, 11, 92, 68, 76, 25, 18, 9, 32, 39, 103, 1, 72, 69, 65, 99, 29, 3, 73, 66, 2, 75, 78, 4, 89, 5, 67, 7, 71, 14, 70], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 51, 61, 54, 106, 119, 58, 60, 56, 122, 104, 40, 116, 126, 62, 121, 125, 85, 31, 19, 105, 127, 94, 21, 46, 26, 83, 49, 110, 112, 52, 55, 35, 6, 117, 113, 57, 91, 102, 87, 95, 98, 23, 90, 80, 33, 17, 16, 108, 41, 123, 13, 27, 77, 43, 88, 24, 107, 115, 22, 38, 79, 37, 36, 34, 10, 59, 50, 44, 81, 84, 86, 64, 114, 93, 97, 30, 15, 124, 100, 0, 20, 8, 18, 74, 28, 92, 39, 101, 12, 82, 96, 11, 9, 68, 32, 4, 29, 76, 78, 99, 25, 65, 103, 1, 67, 73, 69, 14, 75, 89, 3, 66, 5, 72, 7, 2, 71, 70], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 51, 61, 106, 119, 54, 58, 60, 104, 122, 62, 116, 56, 40, 126, 125, 127, 85, 121, 31, 55, 46, 19, 26, 110, 113, 52, 112, 35, 87, 105, 102, 83, 91, 21, 49, 6, 94, 57, 16, 95, 124, 41, 117, 90, 77, 59, 98, 123, 43, 86, 17, 23, 115, 108, 79, 33, 27, 13, 10, 37, 88, 22, 44, 80, 107, 24, 81, 84, 8, 34, 0, 114, 30, 93, 64, 74, 15, 97, 50, 36, 38, 101, 100, 18, 28, 69, 68, 82, 76, 92, 20, 12, 73, 65, 11, 67, 29, 9, 1, 103, 4, 3, 99, 2, 96, 39, 71, 32, 70, 5, 25, 78, 66, 14, 89, 75, 72, 7], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 51, 61, 54, 106, 119, 58, 104, 122, 60, 40, 126, 56, 116, 85, 125, 52, 112, 21, 35, 62, 46, 87, 83, 121, 127, 31, 105, 113, 90, 26, 110, 91, 49, 55, 57, 117, 19, 108, 98, 27, 16, 102, 13, 94, 23, 41, 95, 123, 6, 59, 17, 77, 86, 34, 33, 50, 124, 80, 74, 8, 79, 88, 44, 64, 22, 115, 84, 81, 15, 37, 93, 114, 107, 70, 20, 36, 24, 0, 10, 97, 18, 28, 82, 38, 43, 30, 73, 92, 12, 69, 11, 29, 25, 101, 100, 76, 68, 103, 96, 2, 39, 99, 9, 65, 67, 3, 1, 78, 5, 14, 72, 89, 4, 71, 7, 75, 32, 66], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 106, 51, 119, 54, 61, 58, 104, 60, 122, 116, 56, 126, 40, 127, 21, 87, 125, 85, 62, 52, 83, 110, 113, 35, 90, 112, 94, 46, 49, 105, 33, 13, 102, 41, 31, 26, 91, 77, 117, 57, 98, 23, 108, 17, 19, 16, 124, 15, 121, 86, 59, 80, 34, 95, 79, 22, 55, 8, 44, 27, 70, 81, 115, 20, 88, 10, 84, 74, 64, 43, 37, 24, 0, 38, 123, 18, 114, 50, 30, 107, 6, 39, 28, 12, 97, 100, 36, 93, 82, 92, 96, 73, 3, 75, 4, 25, 9, 101, 5, 99, 76, 65, 29, 11, 66, 69, 103, 78, 68, 2, 1, 67, 71, 7, 14, 32, 72, 89], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 106, 51, 61, 119, 54, 58, 104, 122, 116, 56, 40, 94, 126, 60, 90, 87, 46, 83, 35, 52, 21, 125, 127, 85, 113, 110, 23, 112, 105, 62, 31, 57, 49, 102, 19, 55, 91, 121, 98, 117, 41, 27, 80, 77, 22, 33, 16, 95, 26, 124, 13, 70, 88, 59, 15, 24, 34, 84, 115, 108, 17, 86, 81, 8, 10, 79, 20, 18, 114, 44, 50, 38, 74, 30, 36, 123, 107, 37, 82, 28, 97, 64, 25, 93, 0, 43, 100, 39, 12, 96, 11, 76, 101, 99, 92, 66, 73, 14, 103, 69, 9, 32, 4, 5, 75, 71, 29, 89, 78, 3, 1, 68, 72, 65, 7, 6, 2, 67], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 61, 51, 106, 54, 119, 58, 122, 56, 60, 104, 126, 94, 40, 116, 35, 125, 52, 21, 105, 121, 85, 87, 31, 112, 110, 55, 83, 19, 113, 46, 102, 90, 62, 33, 95, 23, 98, 117, 57, 127, 49, 91, 27, 26, 70, 108, 80, 41, 22, 88, 115, 77, 13, 38, 16, 84, 36, 81, 79, 10, 24, 86, 59, 34, 114, 124, 28, 15, 17, 107, 74, 97, 44, 82, 20, 123, 30, 25, 0, 100, 50, 64, 43, 37, 93, 101, 8, 32, 12, 18, 99, 92, 76, 103, 96, 29, 72, 39, 66, 11, 75, 89, 9, 5, 4, 68, 14, 78, 73, 65, 3, 1, 67, 69, 7, 71, 2, 6], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 61, 51, 106, 119, 54, 58, 122, 104, 60, 56, 126, 116, 21, 94, 52, 121, 125, 40, 31, 117, 19, 85, 105, 110, 95, 35, 57, 62, 90, 87, 26, 113, 127, 112, 33, 23, 102, 46, 91, 83, 49, 16, 27, 59, 13, 55, 98, 80, 44, 114, 41, 77, 70, 108, 115, 22, 88, 50, 124, 17, 38, 24, 123, 34, 37, 10, 79, 84, 92, 86, 81, 43, 15, 36, 18, 97, 20, 74, 107, 30, 100, 28, 82, 64, 103, 93, 96, 0, 32, 101, 9, 76, 25, 89, 39, 12, 72, 99, 8, 75, 4, 73, 14, 29, 11, 68, 78, 5, 3, 2, 67, 7, 6, 69, 1, 65, 71, 66], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 61, 51, 54, 106, 119, 58, 60, 122, 104, 125, 116, 56, 55, 40, 52, 46, 31, 121, 126, 85, 62, 35, 94, 90, 112, 83, 113, 127, 21, 110, 33, 95, 105, 117, 87, 26, 102, 49, 59, 124, 70, 19, 115, 23, 98, 41, 50, 91, 77, 80, 57, 108, 27, 0, 10, 74, 16, 114, 86, 123, 13, 43, 15, 34, 22, 17, 38, 30, 81, 44, 24, 64, 84, 100, 28, 36, 79, 20, 37, 107, 88, 72, 82, 92, 93, 18, 9, 97, 6, 32, 8, 65, 25, 39, 68, 76, 73, 5, 1, 67, 75, 96, 101, 69, 4, 2, 29, 103, 3, 12, 7, 66, 99, 11, 14, 78, 89, 71], [120, 63, 118, 45, 109, 47, 42, 111, 53, 48, 61, 51, 119, 106, 54, 58, 104, 60, 122, 126, 40, 56, 116, 85, 125, 31, 87, 113, 117, 21, 94, 62, 55, 46, 105, 57, 127, 83, 26, 91, 35, 52, 90, 23, 112, 33, 80, 27, 95, 124, 19, 115, 110, 102, 13, 81, 121, 59, 77, 16, 74, 86, 49, 34, 6, 72, 79, 15, 98, 108, 114, 0, 22, 17, 41, 38, 24, 50, 10, 20, 43, 70, 44, 64, 88, 84, 30, 37, 18, 82, 36, 93, 28, 107, 123, 75, 100, 73, 12, 97, 68, 76, 92, 9, 3, 1, 101, 5, 29, 14, 99, 4, 67, 78, 8, 66, 11, 96, 32, 69, 2, 39, 89, 7, 103, 65, 71, 25], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 106, 61, 119, 51, 54, 104, 58, 56, 60, 122, 116, 126, 40, 31, 87, 21, 105, 85, 83, 113, 49, 19, 127, 35, 46, 23, 94, 55, 102, 117, 90, 52, 121, 125, 110, 6, 33, 91, 115, 112, 95, 77, 41, 26, 62, 80, 57, 34, 86, 124, 13, 98, 16, 79, 27, 72, 17, 59, 88, 50, 44, 108, 15, 114, 81, 22, 84, 74, 24, 123, 38, 10, 20, 0, 28, 43, 97, 93, 64, 30, 18, 75, 37, 76, 107, 99, 25, 73, 82, 92, 14, 100, 36, 12, 67, 68, 96, 9, 29, 101, 39, 65, 11, 78, 103, 2, 32, 3, 7, 70, 8, 89, 69, 66, 4, 5, 71, 1], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 61, 106, 51, 119, 54, 58, 56, 104, 60, 122, 116, 40, 105, 126, 121, 35, 52, 49, 125, 94, 112, 102, 110, 87, 23, 21, 57, 85, 90, 83, 113, 46, 31, 33, 19, 62, 41, 117, 26, 91, 95, 98, 115, 80, 55, 108, 27, 86, 127, 6, 124, 24, 15, 77, 22, 34, 79, 59, 38, 13, 50, 74, 88, 72, 123, 44, 114, 84, 16, 30, 17, 10, 100, 37, 81, 18, 20, 36, 28, 93, 43, 92, 64, 82, 97, 25, 0, 39, 75, 101, 107, 89, 12, 32, 96, 9, 14, 29, 76, 103, 99, 73, 4, 5, 1, 2, 68, 69, 66, 3, 78, 11, 8, 7, 67, 71, 65, 70], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 119, 106, 61, 54, 51, 58, 56, 60, 104, 122, 40, 126, 116, 62, 94, 125, 52, 46, 49, 31, 121, 117, 127, 21, 105, 114, 57, 85, 110, 87, 83, 113, 90, 6, 91, 98, 35, 102, 112, 23, 13, 19, 27, 26, 55, 95, 124, 33, 41, 80, 86, 59, 44, 16, 72, 24, 115, 74, 50, 84, 34, 123, 17, 108, 10, 77, 38, 36, 79, 0, 22, 43, 15, 88, 64, 81, 107, 37, 93, 82, 20, 30, 100, 18, 28, 4, 75, 92, 97, 76, 101, 32, 9, 25, 96, 2, 12, 29, 78, 5, 3, 39, 1, 68, 73, 14, 69, 67, 99, 65, 89, 103, 7, 11, 71, 66, 8, 70], [120, 63, 118, 45, 109, 47, 42, 111, 53, 48, 106, 61, 51, 54, 119, 58, 104, 60, 122, 116, 56, 126, 40, 94, 87, 52, 125, 21, 46, 31, 35, 62, 85, 83, 19, 105, 112, 121, 90, 113, 102, 127, 117, 26, 95, 57, 91, 55, 110, 27, 33, 124, 80, 49, 59, 114, 13, 81, 6, 41, 86, 15, 10, 108, 23, 50, 72, 34, 17, 64, 88, 16, 84, 98, 115, 44, 43, 36, 74, 77, 79, 123, 20, 100, 30, 37, 0, 38, 24, 73, 97, 75, 93, 12, 18, 22, 82, 32, 107, 28, 76, 39, 101, 92, 89, 69, 65, 68, 3, 5, 70, 4, 25, 9, 67, 1, 96, 2, 103, 29, 66, 78, 11, 14, 71, 99, 8, 7], [120, 63, 118, 45, 109, 47, 42, 111, 53, 48, 51, 106, 61, 119, 54, 58, 104, 56, 60, 122, 116, 126, 85, 87, 83, 62, 105, 40, 35, 21, 52, 102, 31, 125, 46, 94, 95, 113, 91, 19, 33, 55, 110, 57, 13, 23, 26, 121, 59, 112, 90, 127, 41, 80, 117, 108, 115, 34, 81, 124, 49, 16, 77, 27, 38, 79, 17, 15, 44, 123, 10, 86, 37, 22, 114, 50, 6, 43, 88, 84, 74, 98, 24, 72, 0, 36, 93, 20, 12, 75, 76, 92, 101, 82, 100, 28, 70, 30, 73, 18, 64, 9, 4, 97, 99, 107, 8, 1, 5, 96, 14, 68, 11, 39, 25, 3, 2, 32, 89, 69, 29, 78, 71, 66, 7, 103, 67, 65], [120, 118, 63, 45, 109, 47, 53, 42, 111, 48, 106, 51, 61, 54, 119, 58, 104, 122, 56, 60, 52, 116, 121, 46, 94, 126, 35, 21, 40, 95, 83, 102, 31, 49, 85, 23, 90, 110, 125, 62, 113, 41, 33, 55, 112, 57, 87, 19, 105, 26, 91, 98, 13, 80, 127, 86, 108, 115, 59, 124, 34, 27, 15, 16, 88, 117, 123, 24, 92, 44, 28, 10, 22, 17, 84, 74, 114, 70, 50, 36, 77, 81, 79, 30, 97, 20, 38, 0, 100, 93, 37, 82, 18, 64, 43, 101, 107, 72, 8, 12, 103, 9, 25, 99, 96, 32, 39, 29, 76, 69, 1, 4, 89, 66, 73, 6, 68, 78, 75, 14, 67, 11, 3, 7, 65, 2, 71, 5], [120, 63, 118, 45, 109, 47, 42, 111, 53, 48, 51, 61, 119, 106, 54, 58, 122, 104, 60, 56, 116, 94, 121, 40, 52, 62, 126, 85, 31, 35, 90, 21, 105, 19, 125, 49, 87, 110, 95, 102, 46, 83, 55, 23, 41, 112, 91, 59, 117, 26, 108, 113, 98, 57, 33, 27, 70, 24, 13, 127, 81, 114, 17, 44, 74, 77, 22, 16, 86, 15, 36, 79, 80, 43, 34, 124, 88, 50, 37, 38, 115, 100, 10, 84, 92, 123, 97, 28, 30, 0, 64, 20, 8, 12, 32, 101, 18, 107, 82, 96, 72, 9, 75, 1, 103, 25, 69, 76, 29, 93, 4, 78, 89, 99, 68, 67, 39, 11, 66, 7, 73, 14, 2, 5, 3, 6, 71, 65], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 106, 119, 51, 61, 54, 58, 104, 60, 122, 116, 40, 121, 52, 56, 125, 62, 126, 46, 35, 85, 19, 49, 110, 21, 105, 102, 90, 94, 31, 108, 95, 70, 87, 55, 117, 127, 124, 112, 59, 26, 113, 91, 33, 23, 57, 83, 41, 37, 123, 114, 98, 13, 115, 79, 50, 44, 38, 80, 16, 8, 81, 27, 10, 36, 17, 15, 34, 24, 88, 43, 84, 64, 0, 77, 22, 30, 86, 107, 74, 1, 20, 97, 100, 82, 92, 101, 75, 93, 28, 9, 99, 18, 32, 69, 12, 103, 96, 25, 2, 73, 4, 78, 76, 5, 39, 72, 89, 29, 67, 68, 11, 3, 65, 66, 7, 71, 6, 14], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 51, 106, 61, 119, 54, 58, 104, 60, 122, 116, 40, 52, 62, 35, 56, 113, 126, 85, 55, 46, 125, 94, 110, 102, 90, 21, 26, 121, 105, 31, 23, 70, 57, 83, 127, 41, 117, 33, 19, 95, 87, 59, 108, 112, 49, 124, 13, 91, 86, 16, 27, 123, 50, 8, 98, 15, 37, 77, 114, 80, 74, 44, 10, 17, 38, 24, 34, 115, 81, 88, 43, 36, 79, 64, 22, 30, 107, 84, 100, 97, 82, 0, 93, 73, 28, 20, 92, 32, 101, 12, 1, 68, 18, 66, 96, 4, 75, 76, 69, 103, 39, 3, 5, 9, 25, 67, 99, 14, 78, 29, 65, 71, 2, 72, 89, 7, 11, 6], [120, 63, 118, 45, 109, 47, 111, 42, 53, 48, 106, 61, 51, 54, 58, 119, 104, 60, 116, 122, 40, 56, 126, 85, 55, 125, 105, 52, 113, 94, 62, 46, 121, 21, 35, 83, 110, 90, 87, 127, 117, 31, 19, 57, 102, 95, 41, 49, 115, 26, 124, 16, 91, 23, 112, 77, 33, 108, 98, 13, 34, 27, 59, 81, 86, 15, 70, 123, 38, 74, 44, 8, 22, 80, 50, 43, 88, 10, 79, 17, 36, 84, 30, 114, 20, 92, 24, 100, 28, 37, 0, 75, 107, 99, 82, 25, 97, 39, 18, 101, 93, 12, 6, 64, 73, 32, 9, 76, 14, 68, 4, 5, 67, 1, 78, 66, 103, 3, 69, 96, 72, 11, 65, 71, 7, 89, 29, 2], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 51, 61, 106, 54, 119, 58, 122, 60, 104, 116, 121, 40, 126, 125, 56, 62, 110, 52, 46, 85, 113, 55, 21, 94, 127, 35, 105, 102, 112, 19, 117, 31, 124, 59, 87, 83, 26, 49, 90, 57, 95, 41, 43, 108, 44, 114, 23, 98, 91, 13, 38, 33, 123, 0, 27, 88, 77, 16, 80, 6, 34, 8, 10, 64, 86, 37, 15, 115, 36, 24, 74, 81, 100, 79, 50, 17, 107, 22, 28, 30, 93, 97, 1, 84, 32, 82, 20, 92, 101, 5, 70, 39, 18, 73, 68, 4, 66, 12, 9, 69, 76, 67, 2, 65, 103, 75, 11, 14, 96, 29, 99, 25, 71, 3, 89, 78, 7, 72], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 61, 106, 54, 51, 119, 58, 122, 56, 104, 60, 116, 40, 52, 117, 85, 126, 125, 105, 46, 35, 62, 121, 31, 94, 95, 21, 83, 87, 55, 102, 59, 49, 127, 57, 90, 113, 26, 110, 124, 112, 19, 6, 41, 114, 91, 27, 23, 33, 108, 98, 34, 44, 123, 16, 80, 77, 115, 88, 13, 15, 38, 10, 22, 36, 24, 28, 81, 84, 79, 8, 74, 50, 100, 37, 97, 86, 107, 30, 43, 0, 17, 93, 18, 92, 39, 82, 20, 101, 25, 64, 9, 4, 68, 103, 96, 76, 75, 1, 89, 99, 32, 12, 11, 69, 73, 78, 2, 3, 5, 67, 29, 71, 72, 66, 7, 65, 14, 70], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 51, 106, 61, 119, 54, 58, 104, 122, 60, 56, 116, 40, 126, 52, 21, 105, 94, 31, 85, 125, 113, 121, 35, 46, 87, 90, 83, 95, 110, 49, 57, 6, 55, 59, 62, 26, 102, 127, 41, 112, 33, 117, 124, 19, 23, 44, 108, 91, 114, 27, 86, 80, 77, 98, 22, 34, 16, 13, 88, 10, 81, 107, 15, 123, 17, 115, 74, 43, 8, 79, 37, 38, 84, 50, 30, 36, 20, 97, 18, 92, 100, 0, 93, 24, 82, 28, 64, 4, 76, 101, 12, 9, 73, 75, 96, 25, 5, 39, 11, 78, 68, 3, 14, 2, 103, 69, 66, 29, 32, 67, 72, 1, 65, 99, 7, 89, 71, 70], [120, 118, 63, 45, 109, 111, 47, 42, 53, 48, 51, 106, 54, 61, 119, 122, 104, 58, 60, 116, 40, 126, 56, 125, 52, 21, 83, 87, 90, 85, 94, 62, 105, 121, 35, 46, 124, 55, 6, 117, 27, 33, 113, 31, 49, 26, 102, 127, 95, 91, 110, 115, 19, 77, 112, 23, 80, 13, 86, 108, 41, 17, 59, 123, 57, 98, 16, 22, 88, 34, 79, 10, 15, 38, 84, 24, 44, 43, 74, 30, 81, 114, 50, 20, 8, 36, 64, 107, 18, 37, 28, 82, 93, 97, 0, 100, 25, 9, 92, 72, 101, 12, 99, 73, 75, 103, 78, 96, 32, 1, 76, 5, 66, 68, 69, 4, 11, 29, 3, 7, 14, 89, 65, 2, 39, 67, 71, 70], [120, 118, 63, 45, 109, 47, 111, 53, 42, 48, 51, 61, 106, 54, 119, 60, 122, 58, 104, 56, 116, 125, 21, 35, 40, 121, 52, 126, 87, 31, 62, 83, 94, 95, 46, 105, 55, 57, 85, 33, 113, 102, 117, 127, 26, 115, 49, 91, 23, 59, 110, 19, 90, 108, 123, 41, 124, 77, 114, 27, 13, 98, 22, 50, 44, 36, 112, 80, 38, 34, 88, 86, 24, 6, 16, 10, 15, 81, 84, 43, 30, 79, 37, 28, 17, 93, 0, 92, 74, 97, 100, 107, 18, 72, 32, 82, 101, 64, 20, 103, 76, 96, 25, 1, 29, 73, 8, 12, 9, 39, 68, 78, 75, 89, 11, 99, 14, 3, 69, 66, 4, 5, 67, 7, 70, 65, 71, 2], [120, 118, 63, 45, 47, 109, 42, 111, 53, 48, 54, 106, 61, 119, 51, 104, 122, 56, 58, 60, 126, 116, 31, 40, 94, 125, 35, 62, 117, 52, 21, 87, 121, 105, 127, 55, 83, 46, 85, 95, 90, 91, 115, 33, 102, 19, 113, 23, 41, 112, 57, 27, 26, 114, 110, 108, 13, 44, 77, 49, 124, 22, 59, 98, 80, 34, 16, 36, 81, 88, 123, 17, 72, 24, 38, 74, 37, 15, 86, 107, 100, 50, 10, 18, 79, 84, 43, 97, 92, 20, 25, 82, 30, 28, 101, 93, 64, 70, 6, 103, 96, 39, 12, 68, 0, 73, 29, 32, 4, 11, 99, 76, 69, 9, 1, 75, 3, 14, 67, 66, 78, 8, 89, 65, 7, 2, 71, 5], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 51, 61, 54, 106, 119, 58, 104, 60, 122, 116, 126, 56, 40, 52, 62, 21, 94, 125, 35, 46, 112, 127, 31, 85, 90, 87, 121, 102, 19, 105, 95, 108, 110, 55, 41, 83, 26, 23, 49, 117, 113, 33, 59, 70, 91, 123, 27, 13, 57, 72, 16, 98, 77, 124, 115, 44, 43, 79, 17, 34, 88, 80, 37, 50, 15, 86, 36, 22, 114, 74, 81, 10, 64, 84, 100, 0, 38, 30, 24, 20, 18, 39, 107, 92, 82, 28, 4, 93, 5, 66, 101, 97, 73, 76, 9, 1, 11, 32, 12, 25, 6, 69, 89, 65, 68, 75, 29, 67, 3, 7, 2, 78, 96, 14, 103, 99, 8, 71], [120, 118, 63, 45, 109, 47, 111, 42, 53, 48, 106, 61, 51, 54, 119, 58, 122, 104, 60, 56, 116, 125, 40, 126, 46, 85, 55, 52, 62, 121, 35, 70, 49, 90, 31, 87, 117, 19, 105, 26, 83, 21, 57, 94, 112, 113, 110, 102, 95, 124, 91, 27, 108, 41, 17, 115, 50, 77, 33, 13, 64, 72, 34, 127, 23, 59, 16, 81, 44, 80, 86, 10, 43, 123, 38, 36, 84, 98, 0, 79, 15, 100, 74, 88, 22, 1, 37, 20, 28, 24, 107, 92, 9, 30, 93, 114, 82, 76, 101, 97, 11, 2, 5, 73, 25, 12, 18, 29, 96, 3, 67, 66, 39, 4, 103, 75, 68, 99, 69, 78, 32, 89, 14, 7, 65, 71, 8, 6], [120, 63, 118, 45, 109, 47, 42, 111, 53, 48, 51, 106, 61, 54, 119, 58, 60, 122, 104, 56, 125, 116, 126, 52, 94, 40, 85, 62, 90, 87, 21, 55, 46, 35, 117, 83, 31, 105, 49, 102, 70, 110, 113, 23, 57, 26, 123, 19, 121, 127, 91, 59, 77, 95, 33, 114, 112, 13, 41, 108, 27, 16, 34, 80, 124, 22, 81, 86, 44, 15, 10, 72, 98, 17, 84, 43, 74, 115, 38, 37, 36, 79, 88, 24, 50, 20, 100, 18, 107, 64, 97, 92, 82, 28, 101, 93, 76, 96, 0, 73, 30, 11, 25, 9, 4, 68, 12, 103, 39, 32, 5, 69, 99, 14, 89, 65, 3, 1, 78, 75, 67, 66, 29, 7, 71, 8, 2, 6], [120, 63, 118, 45, 109, 47, 42, 53, 111, 48, 51, 61, 106, 54, 119, 58, 60, 104, 56, 122, 125, 116, 126, 40, 94, 52, 62, 35, 121, 57, 85, 87, 105, 83, 46, 21, 110, 19, 26, 90, 31, 55, 113, 98, 49, 70, 102, 117, 91, 127, 95, 41, 112, 23, 33, 59, 124, 115, 123, 77, 86, 16, 72, 34, 27, 88, 79, 15, 44, 10, 81, 80, 74, 13, 108, 64, 114, 84, 17, 43, 22, 24, 0, 28, 38, 50, 36, 30, 20, 97, 92, 100, 82, 37, 93, 68, 107, 101, 4, 25, 18, 5, 76, 9, 39, 69, 73, 66, 67, 11, 78, 1, 32, 65, 96, 3, 14, 12, 71, 2, 8, 103, 75, 29, 99, 7, 89, 6], [120, 118, 63, 45, 109, 47, 42, 111, 53, 48, 106, 61, 51, 54, 119, 58, 122, 60, 104, 56, 126, 40, 125, 116, 52, 85, 31, 35, 94, 121, 87, 83, 19, 90, 21, 117, 55, 62, 105, 57, 108, 26, 127, 95, 46, 110, 77, 102, 33, 112, 86, 113, 49, 23, 41, 91, 98, 27, 124, 13, 80, 16, 79, 38, 22, 44, 114, 115, 34, 15, 17, 74, 50, 10, 88, 59, 70, 81, 72, 36, 100, 24, 30, 25, 84, 20, 82, 123, 76, 97, 37, 92, 43, 93, 28, 18, 64, 11, 0, 12, 73, 107, 99, 96, 6, 101, 39, 32, 78, 4, 14, 9, 67, 69, 103, 66, 29, 89, 68, 8, 5, 65, 1, 75, 3, 71, 7, 2]], "model.layers.28.self_attn.q_proj": [[40, 120, 48, 34, 27, 51, 23, 85, 116, 126, 89, 53, 54, 16, 17, 112, 121, 3, 62, 31, 125, 124, 61, 93, 123, 50, 52, 109, 58, 114, 30, 49, 122, 19, 45, 110, 42, 7, 78, 29, 57, 82, 119, 60, 8, 115, 113, 59, 47, 55, 111, 63, 107, 117, 5, 118, 12, 56, 108, 46, 43, 79, 127, 74, 14, 44, 105, 88, 71, 100, 21, 20, 41, 65, 37, 64, 106, 11, 39, 1, 101, 38, 36, 87, 35, 91, 102, 0, 103, 22, 97, 66, 95, 6, 96, 10, 99, 98, 104, 84, 26, 32, 94, 2, 4, 90, 68, 73, 24, 77, 15, 13, 33, 9, 25, 28, 69, 18, 67, 83, 92, 80, 86, 72, 81, 75, 70, 76], [40, 120, 34, 48, 23, 27, 85, 17, 16, 31, 51, 11, 25, 13, 97, 61, 29, 62, 100, 55, 4, 78, 126, 19, 87, 56, 58, 73, 70, 59, 93, 28, 36, 35, 21, 113, 79, 46, 68, 22, 53, 18, 32, 96, 66, 109, 92, 10, 123, 95, 84, 8, 60, 63, 127, 0, 106, 49, 104, 37, 12, 112, 114, 116, 94, 26, 119, 99, 124, 7, 101, 30, 117, 83, 76, 103, 2, 118, 20, 44, 89, 52, 54, 125, 108, 81, 38, 33, 77, 110, 107, 15, 39, 102, 121, 111, 43, 122, 80, 86, 91, 47, 14, 57, 24, 90, 41, 82, 105, 50, 98, 75, 45, 74, 115, 5, 6, 88, 42, 67, 9, 72, 69, 71, 1, 65, 3, 64], [120, 40, 34, 48, 62, 23, 51, 27, 126, 85, 31, 93, 117, 109, 112, 58, 56, 113, 107, 16, 61, 50, 55, 44, 89, 123, 17, 19, 49, 53, 47, 8, 0, 30, 116, 106, 21, 95, 42, 52, 46, 25, 79, 43, 29, 108, 100, 121, 111, 119, 68, 110, 11, 57, 73, 38, 98, 54, 122, 59, 96, 36, 124, 118, 99, 63, 5, 103, 12, 105, 41, 78, 127, 125, 115, 94, 114, 97, 45, 35, 87, 37, 33, 60, 32, 39, 102, 22, 66, 101, 14, 24, 2, 70, 13, 10, 26, 86, 28, 91, 88, 92, 65, 18, 82, 67, 90, 6, 83, 84, 77, 4, 20, 64, 71, 75, 81, 104, 7, 1, 80, 69, 3, 15, 76, 9, 72, 74], [40, 120, 48, 34, 23, 63, 51, 31, 27, 62, 85, 49, 13, 123, 61, 108, 56, 79, 1, 110, 53, 55, 67, 93, 10, 64, 107, 16, 74, 113, 30, 19, 2, 17, 117, 70, 124, 18, 100, 59, 115, 127, 45, 68, 38, 26, 103, 102, 105, 111, 58, 46, 122, 119, 109, 52, 37, 116, 96, 36, 54, 24, 78, 106, 121, 112, 60, 50, 6, 84, 11, 125, 29, 114, 21, 118, 126, 0, 101, 57, 94, 43, 12, 73, 89, 88, 97, 47, 77, 39, 22, 35, 98, 95, 41, 5, 99, 33, 65, 44, 28, 32, 42, 86, 87, 92, 90, 8, 14, 72, 83, 25, 9, 82, 20, 69, 91, 3, 76, 7, 75, 80, 81, 15, 4, 71, 104, 66], [40, 125, 87, 17, 34, 21, 82, 70, 12, 74, 14, 4, 44, 0, 8, 124, 113, 111, 66, 54, 79, 58, 127, 68, 52, 109, 73, 110, 53, 19, 28, 75, 16, 33, 35, 123, 22, 59, 104, 27, 1, 95, 115, 119, 69, 86, 25, 114, 42, 50, 89, 18, 93, 108, 11, 26, 71, 96, 13, 90, 77, 56, 116, 67, 84, 63, 55, 5, 64, 62, 81, 78, 6, 92, 85, 106, 51, 7, 45, 39, 101, 23, 57, 103, 122, 98, 49, 94, 91, 88, 99, 32, 107, 48, 9, 2, 100, 29, 83, 105, 112, 65, 60, 24, 15, 76, 97, 3, 61, 47, 72, 38, 80, 36, 37, 121, 126, 46, 30, 20, 10, 31, 102, 118, 120, 41, 117, 43], [125, 40, 3, 87, 34, 12, 17, 14, 70, 0, 74, 1, 67, 66, 82, 21, 58, 64, 65, 8, 4, 69, 124, 54, 113, 25, 44, 111, 106, 52, 71, 49, 127, 2, 28, 75, 104, 109, 110, 50, 11, 42, 114, 22, 5, 79, 43, 26, 23, 121, 37, 68, 123, 101, 53, 86, 96, 31, 98, 112, 90, 27, 95, 77, 78, 20, 59, 13, 35, 122, 102, 81, 16, 29, 119, 6, 19, 91, 45, 39, 80, 47, 24, 32, 73, 117, 9, 76, 118, 103, 7, 88, 93, 84, 46, 72, 18, 33, 116, 55, 41, 30, 126, 105, 36, 115, 89, 62, 15, 120, 60, 63, 10, 57, 85, 48, 100, 38, 92, 107, 108, 94, 56, 83, 51, 97, 61, 99], [125, 40, 87, 34, 82, 21, 124, 12, 17, 14, 74, 109, 103, 50, 19, 4, 44, 8, 71, 95, 89, 92, 54, 70, 123, 127, 90, 9, 111, 68, 93, 108, 5, 39, 28, 115, 43, 41, 25, 77, 29, 110, 42, 105, 27, 61, 59, 100, 119, 58, 116, 64, 56, 66, 22, 79, 114, 88, 113, 47, 73, 121, 45, 91, 20, 60, 62, 106, 18, 75, 37, 49, 15, 10, 57, 52, 122, 31, 63, 51, 23, 48, 38, 33, 26, 96, 94, 6, 53, 32, 117, 86, 120, 112, 102, 36, 80, 97, 101, 72, 118, 83, 35, 69, 126, 99, 107, 46, 30, 85, 24, 55, 81, 13, 84, 78, 2, 98, 16, 7, 76, 11, 0, 1, 3, 67, 65, 104], [40, 125, 87, 0, 17, 34, 21, 12, 82, 1, 14, 66, 124, 4, 70, 50, 74, 44, 75, 58, 26, 127, 54, 69, 110, 113, 28, 111, 8, 95, 52, 13, 88, 33, 45, 79, 93, 42, 114, 64, 22, 103, 32, 98, 109, 67, 20, 123, 73, 119, 35, 11, 92, 19, 49, 104, 15, 39, 30, 27, 18, 90, 25, 59, 68, 96, 71, 102, 107, 53, 72, 101, 112, 62, 2, 65, 91, 60, 16, 77, 61, 80, 106, 99, 121, 115, 43, 7, 23, 105, 81, 84, 78, 55, 97, 3, 118, 76, 51, 122, 94, 83, 29, 89, 48, 117, 41, 86, 63, 9, 56, 5, 46, 108, 120, 31, 126, 24, 6, 116, 57, 85, 47, 100, 38, 10, 36, 37], [42, 36, 30, 43, 56, 106, 20, 118, 87, 77, 35, 82, 91, 123, 94, 46, 52, 80, 117, 49, 53, 89, 104, 120, 27, 99, 112, 17, 39, 47, 23, 122, 51, 78, 85, 54, 15, 11, 60, 19, 114, 103, 124, 71, 105, 125, 34, 58, 57, 95, 107, 108, 126, 110, 61, 62, 22, 74, 50, 32, 84, 109, 9, 121, 44, 88, 29, 13, 116, 101, 18, 55, 31, 59, 92, 115, 86, 45, 113, 98, 90, 111, 102, 41, 63, 48, 26, 12, 21, 25, 97, 119, 8, 24, 38, 127, 16, 37, 81, 83, 75, 14, 100, 33, 7, 96, 40, 28, 93, 72, 70, 69, 5, 76, 73, 79, 3, 67, 66, 2, 10, 68, 6, 1, 4, 65, 0, 64], [42, 36, 114, 118, 30, 15, 74, 35, 20, 17, 68, 87, 106, 77, 82, 70, 4, 91, 56, 76, 71, 0, 66, 123, 95, 64, 50, 31, 94, 43, 52, 21, 27, 23, 62, 7, 53, 11, 10, 122, 98, 9, 12, 29, 5, 67, 102, 84, 19, 18, 28, 111, 13, 65, 88, 1, 37, 85, 6, 8, 120, 2, 107, 32, 49, 112, 57, 103, 81, 83, 93, 14, 79, 69, 59, 96, 124, 46, 24, 44, 97, 26, 54, 126, 47, 109, 25, 16, 45, 73, 3, 60, 38, 72, 104, 86, 80, 78, 34, 108, 115, 41, 101, 58, 39, 22, 75, 61, 105, 92, 89, 48, 90, 51, 117, 33, 121, 119, 125, 110, 116, 63, 100, 113, 55, 127, 40, 99], [42, 36, 114, 118, 30, 87, 15, 123, 17, 35, 89, 74, 106, 20, 82, 94, 70, 56, 77, 50, 19, 66, 104, 52, 27, 68, 91, 8, 43, 6, 54, 72, 29, 44, 120, 58, 95, 75, 11, 47, 1, 38, 23, 51, 59, 122, 112, 113, 57, 49, 124, 61, 32, 86, 53, 62, 125, 99, 93, 111, 3, 116, 92, 102, 98, 12, 24, 103, 34, 71, 48, 28, 78, 64, 63, 76, 40, 127, 107, 108, 97, 46, 83, 119, 126, 96, 60, 110, 84, 10, 115, 81, 45, 25, 117, 37, 101, 0, 105, 21, 121, 2, 33, 31, 14, 109, 80, 55, 18, 26, 41, 9, 39, 85, 13, 16, 79, 5, 22, 90, 67, 88, 69, 100, 73, 7, 4, 65], [42, 114, 36, 35, 30, 106, 91, 103, 20, 87, 50, 27, 15, 17, 43, 74, 82, 94, 89, 68, 71, 123, 56, 104, 77, 95, 70, 120, 113, 85, 105, 65, 29, 62, 46, 3, 57, 127, 118, 61, 80, 119, 2, 125, 76, 122, 48, 117, 90, 1, 126, 47, 45, 111, 52, 44, 11, 51, 55, 109, 12, 49, 54, 121, 38, 75, 124, 63, 8, 102, 112, 67, 78, 107, 58, 66, 31, 59, 110, 115, 108, 116, 7, 72, 69, 81, 101, 41, 24, 86, 53, 33, 0, 73, 5, 26, 39, 28, 37, 19, 60, 13, 23, 21, 98, 84, 34, 93, 99, 14, 64, 92, 16, 9, 83, 96, 40, 25, 97, 88, 32, 22, 10, 4, 18, 79, 6, 100], [126, 41, 98, 56, 115, 89, 84, 13, 111, 80, 18, 112, 28, 45, 86, 75, 74, 31, 50, 21, 96, 82, 54, 58, 79, 5, 73, 8, 29, 81, 55, 11, 14, 70, 63, 120, 68, 4, 3, 25, 106, 119, 127, 117, 77, 40, 60, 71, 122, 92, 48, 10, 95, 88, 66, 46, 57, 59, 6, 76, 78, 83, 108, 34, 2, 93, 43, 51, 72, 16, 22, 0, 19, 20, 35, 124, 23, 44, 102, 121, 125, 62, 87, 107, 1, 113, 100, 61, 52, 24, 27, 85, 12, 90, 26, 17, 123, 49, 47, 105, 9, 114, 118, 32, 116, 103, 36, 7, 53, 99, 91, 110, 67, 109, 104, 33, 38, 94, 30, 97, 37, 39, 101, 65, 42, 64, 15, 69], [56, 41, 58, 98, 87, 63, 31, 32, 126, 28, 54, 122, 115, 44, 57, 17, 53, 60, 121, 46, 112, 43, 103, 55, 124, 116, 108, 127, 50, 120, 51, 26, 47, 117, 111, 86, 48, 113, 62, 52, 110, 61, 118, 119, 49, 45, 92, 114, 123, 109, 59, 125, 89, 101, 105, 40, 95, 107, 42, 23, 88, 22, 39, 91, 24, 100, 106, 84, 25, 94, 34, 30, 104, 37, 82, 15, 102, 38, 97, 99, 36, 35, 90, 21, 93, 80, 96, 33, 83, 29, 18, 20, 81, 79, 12, 27, 78, 19, 10, 85, 77, 1, 76, 14, 13, 7, 72, 66, 0, 5, 67, 3, 65, 16, 2, 64, 69, 11, 75, 71, 4, 8, 73, 70, 9, 74, 6, 68], [56, 41, 126, 98, 89, 79, 84, 18, 115, 80, 15, 31, 21, 28, 72, 13, 58, 10, 11, 63, 86, 82, 46, 4, 57, 40, 14, 73, 112, 70, 92, 77, 111, 100, 25, 23, 32, 122, 3, 121, 113, 60, 43, 127, 119, 75, 81, 16, 45, 87, 125, 95, 53, 66, 29, 62, 55, 109, 96, 51, 19, 124, 69, 5, 50, 52, 120, 9, 49, 71, 123, 1, 0, 48, 59, 22, 20, 47, 33, 76, 116, 8, 85, 6, 44, 27, 117, 103, 93, 17, 30, 61, 78, 37, 110, 64, 114, 88, 118, 24, 35, 2, 12, 91, 94, 83, 39, 90, 104, 54, 105, 99, 26, 102, 108, 68, 101, 7, 65, 97, 106, 36, 107, 42, 38, 74, 34, 67], [41, 126, 56, 63, 105, 60, 31, 127, 122, 87, 51, 52, 115, 98, 47, 49, 62, 121, 57, 123, 112, 124, 58, 46, 28, 117, 103, 114, 59, 116, 118, 45, 109, 86, 119, 54, 61, 113, 48, 55, 111, 120, 92, 50, 32, 53, 125, 44, 110, 108, 89, 84, 40, 43, 42, 107, 18, 39, 106, 104, 26, 100, 101, 37, 38, 102, 23, 7, 36, 35, 94, 96, 34, 91, 95, 99, 97, 17, 33, 80, 30, 25, 21, 85, 82, 88, 20, 22, 93, 24, 29, 27, 90, 11, 19, 83, 78, 77, 5, 73, 14, 81, 12, 3, 70, 0, 8, 71, 76, 79, 1, 16, 65, 10, 13, 67, 66, 64, 2, 74, 69, 9, 4, 72, 68, 75, 6, 15], [123, 103, 55, 47, 34, 125, 27, 95, 88, 32, 110, 122, 21, 15, 76, 19, 127, 24, 53, 97, 91, 94, 18, 36, 71, 51, 59, 43, 104, 38, 40, 101, 44, 118, 74, 106, 42, 26, 58, 102, 57, 52, 90, 117, 9, 92, 115, 111, 62, 2, 124, 96, 10, 87, 29, 48, 116, 33, 16, 126, 22, 78, 120, 112, 105, 85, 60, 63, 46, 107, 100, 114, 41, 113, 25, 1, 45, 54, 121, 37, 99, 23, 14, 31, 98, 49, 108, 56, 93, 39, 89, 50, 61, 109, 5, 79, 70, 119, 68, 13, 35, 28, 84, 82, 83, 86, 81, 30, 20, 80, 66, 64, 17, 11, 77, 12, 8, 73, 75, 3, 69, 4, 72, 7, 6, 65, 0, 67], [123, 103, 55, 34, 116, 27, 76, 88, 70, 0, 3, 18, 15, 1, 19, 68, 122, 66, 21, 95, 99, 71, 94, 91, 22, 59, 51, 9, 62, 125, 74, 56, 30, 73, 77, 67, 33, 126, 10, 47, 31, 37, 96, 109, 36, 57, 6, 79, 8, 107, 48, 80, 110, 29, 49, 35, 45, 118, 115, 58, 50, 63, 44, 121, 65, 86, 24, 26, 60, 17, 82, 85, 92, 23, 119, 40, 20, 52, 104, 13, 98, 112, 5, 120, 28, 113, 117, 114, 69, 72, 11, 102, 124, 81, 78, 14, 38, 53, 46, 43, 93, 61, 39, 87, 16, 108, 105, 75, 111, 42, 54, 106, 41, 25, 84, 32, 90, 97, 83, 101, 127, 4, 89, 100, 7, 12, 2, 64], [123, 103, 58, 122, 27, 88, 21, 91, 47, 18, 15, 59, 38, 34, 62, 9, 5, 95, 48, 120, 55, 49, 125, 60, 110, 44, 53, 121, 50, 56, 117, 119, 30, 105, 57, 52, 63, 24, 124, 67, 40, 113, 126, 94, 109, 112, 96, 51, 54, 118, 97, 46, 45, 116, 107, 111, 76, 108, 114, 85, 42, 115, 100, 43, 106, 104, 61, 36, 127, 101, 99, 41, 75, 29, 0, 92, 71, 86, 33, 102, 2, 64, 4, 37, 14, 39, 35, 17, 81, 3, 98, 73, 93, 82, 69, 77, 7, 11, 32, 87, 79, 68, 26, 84, 72, 89, 31, 8, 25, 65, 20, 83, 80, 90, 23, 22, 13, 28, 6, 78, 16, 19, 74, 1, 10, 12, 70, 66], [123, 103, 47, 34, 71, 76, 62, 122, 27, 55, 21, 22, 59, 19, 88, 125, 68, 15, 18, 101, 91, 109, 97, 45, 58, 24, 2, 56, 48, 64, 95, 117, 63, 36, 32, 38, 85, 111, 94, 66, 92, 77, 107, 10, 84, 9, 110, 30, 44, 119, 127, 5, 115, 50, 87, 79, 82, 73, 20, 99, 40, 124, 83, 49, 51, 42, 75, 61, 116, 57, 41, 46, 89, 60, 126, 120, 52, 54, 70, 118, 104, 113, 29, 112, 114, 53, 35, 121, 26, 105, 1, 93, 0, 100, 37, 108, 106, 80, 33, 65, 98, 12, 17, 43, 16, 72, 86, 74, 90, 23, 25, 96, 14, 31, 11, 28, 81, 4, 102, 78, 7, 13, 8, 6, 3, 69, 67, 39], [103, 61, 95, 60, 23, 104, 54, 91, 18, 116, 114, 27, 127, 80, 20, 108, 51, 40, 98, 62, 7, 46, 56, 78, 123, 107, 121, 122, 55, 113, 57, 12, 97, 53, 117, 109, 59, 58, 42, 75, 125, 124, 112, 86, 74, 126, 47, 41, 45, 63, 22, 115, 25, 65, 120, 49, 52, 69, 119, 50, 101, 28, 31, 44, 43, 118, 48, 99, 110, 105, 81, 39, 87, 38, 32, 24, 111, 102, 106, 93, 37, 85, 36, 79, 88, 100, 19, 34, 35, 21, 33, 16, 29, 84, 9, 83, 30, 90, 13, 82, 5, 96, 92, 8, 66, 94, 15, 89, 6, 70, 2, 72, 67, 26, 77, 76, 3, 17, 73, 1, 68, 71, 14, 4, 11, 0, 10, 64], [103, 61, 95, 113, 23, 104, 27, 91, 49, 42, 108, 60, 78, 114, 80, 18, 12, 74, 109, 7, 54, 127, 57, 21, 20, 34, 52, 58, 53, 45, 105, 112, 101, 2, 6, 56, 47, 86, 59, 69, 121, 117, 1, 70, 51, 39, 107, 126, 46, 43, 3, 64, 31, 119, 85, 116, 19, 62, 125, 32, 88, 72, 122, 67, 97, 13, 40, 92, 102, 65, 123, 8, 41, 120, 111, 48, 106, 124, 55, 118, 63, 44, 4, 115, 99, 25, 96, 66, 0, 22, 28, 9, 11, 38, 83, 50, 17, 110, 30, 100, 84, 35, 89, 68, 73, 90, 36, 81, 98, 33, 77, 15, 71, 37, 93, 24, 94, 29, 26, 79, 87, 75, 5, 14, 16, 10, 82, 76], [103, 61, 95, 74, 18, 86, 69, 27, 12, 78, 23, 48, 60, 64, 7, 3, 20, 54, 66, 113, 65, 22, 80, 123, 0, 127, 6, 21, 98, 33, 47, 72, 114, 38, 82, 68, 121, 73, 85, 35, 49, 75, 29, 81, 43, 94, 62, 16, 31, 88, 84, 19, 104, 53, 106, 34, 26, 32, 55, 56, 90, 119, 87, 44, 124, 14, 118, 101, 83, 24, 15, 46, 13, 2, 97, 17, 10, 93, 28, 25, 77, 92, 108, 51, 96, 100, 58, 111, 5, 99, 50, 89, 30, 11, 36, 116, 110, 40, 115, 57, 102, 67, 79, 42, 105, 45, 52, 107, 76, 126, 70, 122, 63, 120, 1, 59, 117, 8, 125, 71, 112, 41, 37, 109, 91, 9, 4, 39], [103, 61, 60, 55, 23, 95, 54, 91, 27, 18, 56, 78, 20, 113, 7, 53, 51, 114, 98, 123, 121, 12, 116, 107, 104, 69, 41, 58, 46, 63, 108, 59, 40, 109, 126, 62, 80, 49, 74, 65, 97, 124, 22, 117, 52, 45, 112, 86, 57, 75, 127, 115, 47, 105, 42, 122, 38, 25, 125, 44, 101, 37, 110, 119, 34, 118, 43, 50, 106, 6, 66, 32, 102, 48, 39, 87, 111, 31, 71, 93, 83, 120, 33, 35, 28, 29, 4, 79, 36, 21, 17, 96, 100, 68, 72, 99, 30, 16, 67, 84, 19, 15, 76, 92, 88, 89, 3, 13, 24, 14, 90, 5, 85, 82, 73, 77, 81, 9, 0, 1, 94, 8, 26, 70, 10, 11, 2, 64], [121, 114, 39, 56, 120, 33, 93, 116, 19, 101, 29, 57, 113, 126, 63, 77, 49, 103, 109, 51, 110, 127, 54, 122, 111, 53, 119, 58, 50, 66, 105, 23, 59, 125, 117, 115, 43, 55, 124, 52, 46, 48, 61, 118, 41, 112, 123, 22, 108, 62, 45, 60, 107, 8, 65, 47, 25, 89, 42, 106, 24, 74, 44, 1, 104, 98, 37, 15, 69, 102, 40, 38, 32, 72, 26, 99, 3, 35, 90, 97, 67, 28, 96, 100, 91, 36, 88, 34, 79, 94, 7, 17, 2, 16, 83, 92, 18, 86, 95, 81, 10, 30, 31, 6, 84, 13, 85, 71, 87, 27, 20, 5, 21, 76, 9, 11, 68, 78, 64, 70, 80, 82, 0, 75, 4, 73, 14, 12], [56, 39, 114, 33, 88, 85, 16, 78, 18, 26, 120, 76, 71, 29, 91, 50, 11, 31, 20, 121, 116, 100, 115, 94, 119, 58, 84, 80, 97, 6, 9, 54, 24, 57, 37, 19, 51, 22, 15, 124, 110, 113, 75, 81, 79, 126, 99, 95, 83, 87, 111, 108, 13, 90, 92, 25, 27, 61, 34, 44, 7, 122, 63, 125, 52, 28, 96, 89, 127, 53, 30, 72, 60, 77, 82, 93, 41, 74, 21, 48, 86, 123, 55, 117, 59, 49, 10, 40, 98, 14, 17, 43, 105, 101, 23, 62, 68, 36, 32, 12, 109, 102, 35, 47, 107, 118, 45, 42, 106, 112, 104, 38, 46, 69, 1, 73, 70, 8, 65, 3, 66, 67, 103, 5, 4, 2, 64, 0], [39, 56, 114, 0, 64, 9, 121, 120, 68, 6, 11, 67, 78, 16, 33, 65, 88, 66, 71, 4, 19, 29, 3, 85, 1, 76, 77, 2, 18, 22, 73, 12, 70, 86, 75, 20, 5, 87, 83, 8, 81, 7, 10, 72, 90, 99, 110, 31, 27, 116, 13, 14, 91, 69, 108, 17, 21, 41, 23, 80, 98, 58, 26, 25, 113, 79, 96, 74, 61, 35, 84, 52, 54, 95, 119, 82, 94, 107, 89, 103, 125, 15, 28, 102, 30, 57, 92, 24, 118, 34, 59, 32, 117, 49, 100, 44, 53, 115, 93, 63, 36, 42, 127, 97, 105, 123, 101, 104, 62, 40, 38, 50, 124, 37, 51, 43, 47, 106, 60, 48, 46, 109, 55, 111, 122, 45, 126, 112], [114, 39, 56, 120, 116, 29, 57, 117, 126, 93, 54, 119, 19, 113, 51, 111, 122, 49, 110, 58, 63, 50, 127, 101, 53, 23, 125, 109, 33, 55, 60, 59, 62, 118, 123, 61, 115, 52, 121, 46, 48, 47, 124, 43, 112, 45, 103, 107, 108, 22, 44, 42, 105, 41, 106, 37, 104, 27, 97, 38, 40, 86, 98, 83, 91, 26, 102, 89, 31, 88, 99, 36, 35, 25, 77, 15, 100, 34, 20, 87, 95, 84, 24, 32, 96, 90, 17, 30, 85, 18, 81, 94, 79, 28, 13, 78, 92, 16, 71, 6, 69, 11, 21, 10, 80, 74, 7, 82, 5, 65, 72, 9, 14, 1, 76, 8, 12, 75, 68, 67, 66, 70, 73, 2, 3, 4, 64, 0], [46, 102, 49, 110, 26, 84, 16, 53, 29, 123, 60, 126, 98, 10, 119, 90, 25, 91, 44, 56, 124, 113, 62, 41, 112, 86, 63, 127, 42, 92, 59, 57, 33, 47, 118, 107, 31, 54, 125, 6, 87, 115, 80, 114, 66, 104, 95, 105, 20, 99, 111, 120, 55, 61, 43, 52, 24, 58, 116, 36, 108, 117, 45, 18, 7, 122, 93, 39, 51, 48, 109, 35, 97, 121, 37, 50, 30, 32, 106, 100, 64, 103, 27, 2, 13, 74, 40, 1, 22, 101, 81, 88, 72, 28, 94, 21, 78, 70, 96, 34, 69, 0, 83, 38, 85, 15, 17, 89, 11, 12, 3, 23, 68, 73, 19, 75, 67, 76, 5, 14, 71, 79, 65, 4, 9, 8, 82, 77], [46, 102, 110, 29, 26, 60, 84, 16, 86, 51, 69, 18, 100, 123, 109, 67, 121, 36, 112, 49, 113, 1, 63, 97, 90, 11, 13, 50, 31, 53, 33, 108, 7, 61, 115, 30, 47, 107, 28, 12, 105, 119, 37, 59, 106, 114, 42, 120, 99, 45, 52, 98, 87, 124, 104, 89, 125, 92, 41, 116, 55, 111, 39, 103, 10, 57, 58, 127, 117, 62, 56, 35, 44, 54, 122, 40, 126, 64, 6, 38, 101, 85, 76, 93, 27, 81, 14, 32, 24, 25, 4, 43, 95, 48, 34, 73, 118, 96, 70, 3, 66, 91, 9, 17, 19, 83, 72, 88, 94, 78, 22, 79, 20, 80, 2, 74, 68, 21, 71, 15, 23, 5, 75, 82, 8, 65, 77, 0], [46, 102, 110, 26, 84, 18, 29, 13, 72, 16, 67, 116, 10, 109, 68, 107, 121, 113, 59, 0, 47, 6, 90, 112, 60, 98, 115, 48, 24, 87, 22, 123, 120, 8, 12, 11, 49, 127, 30, 73, 23, 106, 126, 41, 119, 34, 61, 40, 37, 35, 66, 97, 124, 108, 100, 114, 42, 93, 25, 85, 103, 53, 33, 81, 80, 89, 63, 55, 31, 104, 105, 74, 50, 44, 117, 95, 45, 3, 111, 21, 57, 36, 43, 62, 92, 58, 125, 88, 86, 122, 20, 54, 83, 96, 70, 39, 7, 75, 91, 28, 52, 118, 14, 27, 101, 99, 51, 56, 32, 19, 9, 2, 69, 17, 94, 82, 64, 79, 5, 77, 65, 1, 76, 4, 71, 15, 78, 38], [46, 102, 18, 110, 13, 29, 84, 26, 16, 10, 72, 6, 68, 93, 25, 87, 1, 73, 90, 60, 0, 2, 22, 91, 78, 7, 14, 49, 9, 88, 15, 82, 79, 36, 59, 33, 112, 20, 17, 30, 123, 109, 27, 31, 19, 116, 67, 65, 8, 71, 92, 96, 85, 3, 80, 61, 76, 81, 24, 11, 77, 98, 113, 83, 50, 5, 107, 28, 106, 23, 118, 54, 101, 86, 37, 75, 34, 12, 66, 89, 63, 74, 41, 56, 48, 4, 108, 32, 35, 115, 99, 94, 21, 100, 119, 62, 57, 114, 70, 95, 52, 58, 124, 69, 39, 117, 120, 44, 121, 127, 105, 43, 55, 64, 126, 104, 40, 53, 103, 47, 42, 122, 125, 97, 51, 45, 111, 38]], "model.layers.28.self_attn.k_proj": [[120, 104, 98, 23, 85, 0, 29, 11, 17, 16, 27, 61, 58, 30, 19, 95, 110, 73, 78, 125, 66, 114, 91, 108, 42, 50, 115, 51, 62, 52, 68, 121, 127, 57, 47, 124, 60, 45, 59, 53, 122, 55, 49, 70, 43, 67, 112, 119, 109, 65, 117, 54, 123, 116, 48, 102, 113, 56, 44, 8, 118, 111, 63, 38, 4, 46, 79, 105, 107, 89, 106, 5, 12, 126, 41, 25, 77, 2, 39, 36, 86, 88, 13, 7, 82, 31, 92, 35, 103, 101, 100, 37, 10, 90, 33, 28, 71, 34, 84, 26, 24, 1, 32, 22, 94, 20, 99, 96, 9, 72, 6, 15, 97, 83, 18, 81, 3, 93, 80, 14, 21, 74, 75, 69, 76, 64, 87, 40], [125, 104, 64, 87, 98, 14, 70, 74, 17, 12, 82, 0, 21, 65, 8, 68, 66, 124, 108, 58, 69, 3, 1, 90, 4, 114, 47, 79, 28, 49, 2, 54, 19, 127, 73, 9, 75, 46, 50, 123, 61, 93, 52, 25, 16, 77, 103, 111, 31, 67, 41, 113, 53, 106, 92, 20, 109, 89, 45, 122, 105, 27, 97, 107, 71, 94, 60, 33, 13, 55, 59, 121, 120, 86, 26, 11, 63, 99, 29, 95, 91, 51, 101, 119, 5, 62, 118, 88, 83, 112, 7, 102, 24, 80, 22, 115, 56, 38, 42, 44, 48, 37, 32, 116, 110, 57, 15, 36, 84, 100, 23, 117, 43, 39, 76, 30, 35, 126, 85, 96, 6, 72, 34, 18, 10, 78, 81, 40], [106, 114, 100, 94, 118, 87, 68, 0, 99, 71, 77, 74, 20, 82, 15, 17, 50, 70, 52, 119, 42, 27, 47, 44, 115, 62, 75, 89, 56, 66, 123, 1, 122, 109, 102, 120, 76, 2, 110, 53, 57, 45, 48, 9, 65, 41, 43, 107, 121, 39, 35, 127, 95, 73, 29, 54, 59, 91, 31, 58, 124, 61, 72, 69, 51, 22, 113, 49, 28, 12, 111, 93, 85, 63, 97, 60, 3, 125, 46, 40, 116, 96, 30, 98, 126, 108, 117, 33, 55, 34, 32, 37, 5, 112, 101, 11, 92, 64, 19, 80, 104, 78, 103, 90, 14, 7, 83, 21, 86, 24, 88, 8, 16, 67, 105, 38, 26, 4, 81, 25, 79, 18, 6, 10, 84, 36, 23, 13], [105, 126, 56, 34, 95, 86, 92, 89, 115, 84, 82, 79, 57, 119, 51, 62, 110, 63, 87, 54, 59, 80, 120, 112, 58, 121, 113, 127, 117, 55, 124, 116, 118, 109, 48, 60, 123, 61, 122, 46, 49, 114, 52, 53, 47, 125, 111, 45, 32, 104, 44, 107, 99, 50, 108, 21, 17, 91, 39, 101, 43, 26, 106, 42, 103, 96, 11, 36, 77, 27, 28, 102, 30, 13, 38, 40, 97, 33, 37, 73, 29, 8, 22, 41, 76, 35, 10, 19, 93, 75, 100, 98, 14, 94, 70, 85, 18, 88, 24, 12, 20, 31, 16, 23, 90, 25, 83, 4, 5, 3, 78, 81, 0, 72, 9, 65, 74, 7, 71, 68, 1, 6, 66, 15, 69, 67, 64, 2], [123, 39, 88, 21, 27, 64, 76, 15, 18, 30, 1, 68, 98, 71, 122, 95, 70, 125, 9, 47, 63, 121, 66, 46, 19, 57, 60, 86, 116, 110, 2, 49, 119, 120, 118, 33, 113, 3, 45, 52, 117, 44, 112, 48, 51, 124, 40, 10, 50, 54, 59, 114, 20, 69, 111, 31, 102, 37, 126, 8, 108, 61, 56, 106, 109, 115, 55, 34, 43, 87, 53, 62, 13, 42, 14, 41, 127, 58, 104, 29, 107, 105, 32, 101, 100, 11, 36, 75, 93, 92, 23, 90, 28, 35, 99, 22, 26, 74, 89, 97, 25, 96, 84, 65, 77, 17, 80, 81, 0, 16, 94, 38, 82, 78, 79, 103, 72, 4, 5, 6, 7, 83, 67, 73, 91, 24, 12, 85], [61, 39, 31, 23, 91, 18, 64, 74, 80, 78, 86, 62, 69, 49, 56, 12, 54, 3, 51, 112, 66, 7, 55, 113, 110, 124, 121, 109, 50, 20, 52, 117, 114, 58, 0, 34, 122, 59, 60, 123, 115, 42, 111, 126, 118, 107, 63, 53, 127, 116, 44, 41, 108, 45, 79, 47, 57, 35, 25, 46, 98, 65, 106, 6, 48, 88, 119, 75, 1, 43, 40, 72, 120, 33, 125, 21, 32, 105, 67, 68, 19, 37, 100, 36, 38, 92, 70, 73, 17, 2, 102, 96, 97, 81, 99, 27, 90, 94, 101, 103, 4, 11, 84, 104, 77, 93, 29, 9, 28, 15, 24, 13, 22, 89, 26, 83, 30, 85, 16, 5, 8, 10, 76, 14, 71, 95, 82, 87], [56, 103, 64, 114, 93, 121, 97, 1, 50, 19, 9, 68, 6, 22, 120, 67, 116, 11, 23, 113, 78, 66, 77, 16, 46, 88, 57, 58, 110, 54, 3, 69, 76, 71, 51, 122, 105, 118, 126, 43, 95, 8, 62, 61, 117, 24, 55, 59, 47, 49, 53, 123, 125, 42, 119, 63, 60, 44, 48, 115, 85, 124, 12, 26, 37, 52, 109, 18, 112, 2, 45, 74, 127, 111, 108, 41, 89, 106, 21, 107, 40, 81, 38, 65, 101, 91, 14, 15, 35, 36, 84, 104, 0, 99, 102, 92, 4, 34, 100, 94, 30, 28, 96, 25, 32, 17, 20, 31, 98, 72, 80, 10, 75, 90, 70, 82, 27, 79, 13, 5, 87, 73, 7, 86, 29, 39, 33, 83], [110, 38, 46, 29, 84, 26, 13, 16, 18, 6, 2, 0, 112, 10, 25, 121, 123, 68, 11, 113, 73, 86, 1, 47, 7, 97, 72, 63, 60, 109, 115, 34, 119, 53, 114, 30, 59, 55, 61, 124, 32, 41, 103, 107, 101, 88, 49, 50, 116, 42, 87, 51, 104, 62, 37, 57, 111, 108, 58, 127, 120, 45, 56, 122, 118, 40, 44, 105, 39, 125, 117, 54, 52, 64, 92, 43, 106, 95, 15, 76, 48, 14, 69, 36, 96, 91, 4, 35, 99, 31, 100, 126, 98, 33, 75, 22, 67, 23, 5, 94, 81, 19, 24, 85, 17, 78, 12, 27, 83, 8, 28, 3, 89, 21, 65, 9, 79, 71, 93, 82, 77, 74, 80, 66, 70, 90, 20, 102]], "model.layers.28.self_attn.qk_proj": [[123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 104, 23, 87, 121, 118, 103, 82, 18, 91, 64, 0, 29, 95, 58, 39, 40, 50, 85, 41, 27, 10, 70, 20, 76, 74, 105, 84, 30, 12, 55, 62, 98, 93, 21, 81, 51, 68, 115, 80, 79, 13, 77, 122, 124, 44, 90, 6, 60, 15, 4, 48, 112, 59, 113, 54, 31, 116, 78, 49, 57, 47, 53, 109, 66, 65, 17, 127, 16, 2, 22, 14, 63, 28, 102, 24, 11, 34, 1, 83, 9, 86, 43, 73, 52, 19, 38, 25, 7, 88, 108, 36, 8, 119, 94, 89, 26, 33, 107, 3, 71, 117, 75, 5, 45, 67, 111, 99, 97, 35, 101, 100, 72, 69, 32, 37, 92, 96], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 104, 121, 23, 87, 103, 118, 91, 0, 82, 64, 18, 40, 70, 95, 50, 58, 29, 30, 47, 10, 62, 39, 27, 20, 85, 105, 4, 41, 76, 12, 93, 51, 98, 81, 55, 74, 21, 60, 84, 13, 49, 77, 44, 115, 63, 53, 80, 65, 124, 66, 116, 34, 112, 113, 22, 54, 79, 102, 17, 16, 2, 6, 68, 11, 14, 78, 90, 15, 122, 28, 1, 9, 109, 59, 127, 48, 8, 108, 52, 31, 38, 57, 43, 71, 24, 86, 88, 83, 73, 33, 117, 7, 107, 111, 119, 19, 94, 45, 5, 36, 100, 3, 89, 75, 69, 26, 67, 35, 25, 37, 97, 99, 72, 101, 92, 32, 96], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 121, 104, 23, 87, 0, 118, 64, 18, 103, 82, 91, 70, 39, 95, 27, 58, 41, 40, 98, 105, 29, 12, 21, 84, 76, 10, 85, 112, 30, 50, 74, 93, 55, 20, 115, 44, 53, 4, 62, 63, 80, 34, 81, 77, 124, 60, 15, 68, 79, 113, 22, 13, 66, 2, 90, 57, 51, 49, 78, 47, 17, 54, 102, 65, 122, 16, 14, 1, 116, 8, 48, 59, 28, 31, 119, 127, 86, 109, 19, 6, 43, 71, 36, 88, 9, 107, 38, 108, 52, 7, 25, 11, 117, 111, 24, 94, 67, 89, 26, 75, 3, 73, 83, 33, 5, 69, 99, 101, 100, 97, 37, 45, 35, 92, 96, 32, 72], [123, 125, 56, 61, 120, 114, 110, 46, 126, 106, 42, 104, 121, 23, 103, 87, 118, 64, 0, 18, 91, 82, 70, 40, 39, 41, 74, 76, 58, 95, 53, 105, 12, 27, 50, 29, 10, 98, 21, 84, 112, 122, 62, 4, 63, 81, 30, 124, 55, 2, 77, 14, 78, 85, 34, 93, 80, 1, 20, 44, 68, 66, 113, 16, 17, 115, 60, 22, 13, 15, 54, 6, 79, 8, 90, 49, 57, 51, 59, 65, 47, 25, 116, 31, 7, 86, 28, 11, 48, 36, 33, 19, 67, 9, 83, 119, 71, 102, 52, 38, 88, 94, 73, 107, 109, 3, 89, 69, 101, 127, 117, 108, 24, 75, 43, 45, 5, 26, 97, 111, 35, 99, 100, 37, 92, 72, 32, 96], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 104, 121, 23, 87, 103, 118, 18, 0, 82, 64, 40, 58, 41, 70, 91, 50, 53, 39, 29, 105, 55, 12, 76, 95, 98, 112, 21, 27, 80, 74, 10, 84, 20, 85, 30, 6, 78, 77, 113, 4, 81, 62, 115, 15, 122, 60, 124, 68, 66, 54, 48, 14, 2, 57, 44, 59, 90, 86, 116, 16, 47, 79, 65, 17, 8, 28, 63, 93, 49, 13, 51, 127, 1, 25, 31, 11, 34, 19, 22, 7, 107, 117, 9, 109, 24, 52, 108, 102, 83, 36, 73, 119, 67, 88, 3, 38, 71, 94, 111, 75, 5, 26, 69, 89, 33, 101, 43, 45, 97, 100, 35, 99, 92, 72, 37, 96, 32], [123, 125, 56, 61, 120, 114, 110, 46, 126, 106, 42, 104, 121, 23, 103, 87, 18, 82, 118, 64, 0, 50, 29, 58, 40, 91, 95, 21, 41, 74, 27, 44, 112, 62, 20, 30, 105, 98, 85, 81, 10, 55, 53, 68, 113, 84, 12, 76, 6, 115, 70, 34, 54, 39, 49, 48, 4, 15, 77, 31, 80, 124, 2, 65, 66, 93, 47, 14, 13, 78, 60, 122, 79, 1, 8, 22, 63, 17, 16, 116, 19, 86, 28, 57, 127, 109, 24, 90, 119, 25, 108, 5, 7, 59, 51, 71, 11, 94, 83, 43, 117, 52, 75, 89, 33, 9, 102, 3, 88, 97, 73, 38, 100, 35, 111, 67, 69, 99, 45, 101, 107, 26, 37, 36, 96, 92, 32, 72], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 23, 104, 87, 121, 103, 41, 118, 82, 40, 50, 64, 58, 18, 91, 0, 29, 98, 105, 95, 27, 6, 39, 76, 44, 30, 21, 113, 85, 53, 112, 55, 20, 12, 81, 90, 10, 115, 15, 84, 4, 93, 13, 80, 74, 16, 109, 34, 47, 77, 49, 14, 68, 17, 124, 60, 54, 79, 48, 127, 51, 63, 66, 62, 59, 70, 78, 1, 57, 22, 122, 86, 25, 31, 116, 2, 24, 19, 36, 65, 8, 28, 7, 89, 111, 119, 9, 43, 83, 108, 73, 94, 88, 52, 117, 71, 11, 38, 67, 45, 97, 102, 26, 69, 3, 101, 99, 100, 75, 107, 37, 35, 72, 33, 5, 32, 92, 96], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 23, 104, 87, 121, 103, 118, 18, 82, 91, 0, 29, 64, 39, 41, 58, 40, 105, 6, 95, 44, 27, 98, 21, 50, 112, 55, 30, 76, 34, 54, 12, 85, 47, 20, 74, 122, 4, 53, 81, 84, 77, 79, 115, 51, 10, 113, 48, 93, 90, 31, 80, 16, 49, 14, 15, 102, 17, 116, 68, 65, 124, 59, 2, 22, 60, 13, 1, 57, 66, 43, 127, 63, 109, 62, 119, 89, 78, 38, 25, 70, 28, 108, 24, 88, 9, 86, 73, 83, 45, 52, 107, 36, 11, 19, 94, 71, 67, 111, 7, 117, 33, 26, 100, 37, 5, 72, 97, 99, 69, 75, 3, 8, 101, 92, 32, 35, 96], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 87, 23, 104, 121, 103, 18, 82, 29, 118, 64, 91, 27, 0, 44, 58, 95, 6, 40, 41, 39, 21, 50, 76, 20, 112, 30, 74, 113, 55, 47, 10, 105, 84, 13, 63, 85, 98, 68, 124, 77, 122, 80, 12, 93, 51, 16, 34, 78, 109, 54, 48, 90, 115, 4, 116, 79, 62, 81, 17, 53, 15, 66, 49, 14, 102, 65, 43, 119, 22, 31, 28, 60, 59, 57, 127, 83, 88, 2, 1, 24, 86, 89, 72, 70, 52, 73, 19, 7, 108, 25, 107, 9, 117, 38, 94, 67, 71, 33, 111, 36, 45, 75, 99, 26, 5, 100, 11, 3, 101, 35, 97, 8, 32, 69, 92, 37, 96], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 104, 121, 87, 23, 64, 58, 118, 103, 0, 82, 18, 91, 6, 29, 41, 40, 30, 27, 76, 48, 4, 39, 50, 98, 85, 63, 55, 12, 74, 113, 10, 122, 44, 20, 95, 112, 93, 21, 51, 124, 105, 47, 68, 2, 62, 34, 22, 53, 84, 54, 70, 66, 16, 115, 81, 13, 108, 49, 60, 57, 43, 79, 77, 90, 14, 80, 17, 116, 65, 15, 102, 78, 1, 59, 86, 109, 127, 119, 72, 24, 28, 31, 11, 73, 7, 9, 38, 25, 71, 111, 19, 33, 83, 94, 3, 107, 36, 88, 75, 52, 89, 26, 5, 67, 8, 45, 117, 69, 101, 100, 97, 35, 99, 32, 37, 92, 96], [123, 56, 61, 125, 120, 114, 110, 46, 126, 106, 42, 104, 121, 23, 87, 118, 82, 18, 103, 0, 64, 91, 10, 40, 58, 70, 27, 74, 29, 95, 50, 76, 98, 6, 85, 39, 41, 30, 21, 4, 20, 55, 77, 84, 66, 53, 13, 16, 12, 90, 79, 81, 54, 105, 68, 78, 72, 93, 15, 1, 80, 124, 122, 2, 62, 48, 112, 57, 17, 47, 60, 51, 115, 34, 113, 71, 59, 94, 63, 49, 7, 31, 65, 28, 14, 24, 19, 86, 22, 25, 73, 88, 109, 83, 11, 44, 89, 116, 3, 102, 52, 9, 127, 108, 119, 38, 5, 67, 33, 43, 107, 75, 36, 26, 117, 45, 111, 69, 35, 97, 100, 101, 99, 92, 8, 37, 32, 96], [123, 56, 125, 61, 120, 114, 110, 46, 126, 106, 104, 42, 87, 23, 121, 82, 18, 103, 118, 64, 29, 40, 91, 0, 98, 95, 70, 76, 27, 50, 84, 21, 39, 10, 20, 74, 44, 58, 30, 4, 79, 41, 12, 85, 55, 105, 34, 13, 16, 93, 78, 77, 80, 122, 68, 66, 90, 59, 113, 54, 17, 51, 47, 15, 62, 115, 112, 6, 31, 81, 2, 49, 14, 22, 60, 53, 65, 72, 57, 1, 25, 102, 24, 124, 86, 116, 28, 48, 75, 83, 63, 73, 109, 127, 7, 71, 36, 9, 3, 19, 119, 94, 89, 67, 52, 43, 11, 107, 88, 33, 45, 38, 69, 108, 111, 35, 26, 5, 101, 99, 117, 100, 32, 37, 97, 8, 96, 92], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 104, 87, 42, 103, 121, 23, 118, 58, 18, 91, 40, 82, 64, 0, 50, 29, 39, 95, 44, 70, 51, 47, 27, 41, 20, 98, 12, 105, 54, 21, 113, 30, 115, 55, 85, 93, 60, 116, 63, 112, 84, 102, 76, 74, 34, 57, 68, 59, 62, 53, 28, 17, 4, 31, 48, 81, 90, 72, 49, 109, 124, 122, 65, 86, 80, 16, 14, 77, 78, 22, 13, 10, 79, 108, 2, 66, 24, 15, 127, 6, 43, 1, 9, 88, 119, 75, 73, 45, 52, 19, 89, 38, 7, 111, 11, 36, 71, 94, 107, 83, 25, 26, 117, 3, 97, 35, 5, 67, 99, 37, 69, 100, 101, 33, 92, 32, 96, 8], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 121, 104, 87, 23, 64, 103, 0, 118, 91, 18, 82, 70, 58, 39, 54, 95, 76, 29, 40, 21, 41, 74, 50, 12, 27, 93, 30, 122, 68, 10, 20, 124, 85, 63, 57, 55, 105, 115, 84, 4, 98, 2, 44, 22, 34, 13, 81, 77, 116, 113, 1, 60, 112, 16, 78, 90, 79, 109, 49, 51, 65, 66, 47, 15, 14, 62, 17, 6, 48, 59, 80, 53, 43, 102, 119, 31, 127, 72, 24, 108, 111, 28, 88, 86, 71, 11, 38, 19, 9, 107, 52, 7, 36, 75, 73, 83, 25, 33, 94, 97, 99, 117, 5, 89, 3, 35, 67, 45, 69, 100, 101, 26, 37, 8, 92, 32, 96], [123, 61, 56, 125, 120, 110, 114, 46, 126, 106, 42, 104, 23, 121, 87, 18, 64, 0, 91, 103, 118, 82, 40, 70, 41, 74, 58, 76, 29, 98, 85, 95, 12, 50, 10, 105, 84, 21, 39, 30, 4, 2, 81, 20, 68, 27, 63, 90, 16, 55, 66, 124, 6, 115, 13, 15, 54, 77, 112, 53, 47, 44, 79, 22, 113, 93, 14, 122, 34, 78, 80, 17, 62, 116, 109, 65, 51, 57, 48, 59, 60, 1, 31, 9, 7, 86, 119, 75, 72, 127, 49, 28, 33, 108, 11, 88, 52, 24, 94, 67, 38, 89, 3, 25, 19, 5, 111, 73, 71, 36, 83, 107, 69, 43, 97, 35, 117, 45, 100, 26, 99, 102, 8, 101, 37, 96, 92, 32], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 121, 104, 23, 87, 103, 18, 118, 91, 82, 64, 29, 50, 58, 0, 95, 10, 70, 40, 27, 74, 105, 55, 93, 6, 30, 76, 84, 39, 85, 13, 12, 20, 98, 68, 21, 41, 81, 90, 122, 115, 77, 47, 16, 112, 80, 78, 15, 4, 44, 34, 66, 54, 1, 62, 113, 109, 14, 2, 49, 60, 22, 79, 17, 59, 65, 119, 31, 63, 19, 28, 53, 48, 124, 86, 7, 75, 71, 9, 57, 24, 116, 73, 127, 102, 25, 3, 88, 33, 89, 43, 108, 83, 52, 72, 11, 51, 67, 8, 94, 111, 107, 38, 100, 45, 69, 5, 97, 26, 37, 35, 117, 36, 32, 99, 101, 96, 92], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 104, 23, 87, 121, 103, 64, 18, 118, 0, 82, 58, 95, 29, 40, 91, 39, 30, 41, 27, 50, 98, 20, 93, 21, 74, 85, 112, 12, 105, 6, 84, 115, 90, 55, 10, 16, 76, 44, 81, 54, 68, 108, 122, 4, 14, 60, 1, 53, 34, 80, 62, 22, 13, 113, 59, 77, 17, 66, 109, 31, 86, 28, 47, 57, 48, 116, 49, 78, 15, 2, 70, 63, 51, 127, 102, 79, 124, 9, 88, 38, 43, 119, 19, 24, 71, 45, 52, 65, 73, 75, 25, 89, 107, 7, 26, 83, 36, 8, 94, 33, 35, 11, 99, 111, 100, 67, 3, 97, 72, 117, 5, 101, 69, 92, 37, 32, 96], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 104, 87, 23, 121, 64, 103, 82, 18, 118, 0, 91, 6, 39, 29, 40, 27, 95, 74, 76, 30, 50, 85, 58, 10, 112, 1, 12, 34, 20, 41, 54, 44, 55, 21, 84, 13, 93, 105, 81, 98, 2, 68, 14, 47, 113, 80, 122, 63, 16, 53, 4, 77, 90, 51, 115, 119, 60, 78, 62, 49, 102, 15, 17, 79, 22, 124, 66, 48, 70, 31, 43, 116, 86, 59, 65, 9, 109, 71, 57, 75, 19, 108, 8, 33, 94, 88, 28, 25, 26, 127, 24, 38, 36, 89, 3, 52, 67, 83, 111, 107, 7, 69, 99, 117, 73, 101, 97, 11, 5, 100, 35, 45, 37, 32, 72, 96, 92], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 104, 23, 121, 103, 87, 64, 0, 118, 91, 40, 82, 6, 18, 58, 41, 29, 27, 112, 76, 39, 50, 74, 98, 49, 10, 95, 93, 85, 12, 2, 122, 113, 20, 60, 84, 4, 30, 105, 124, 44, 63, 21, 34, 81, 62, 48, 115, 14, 68, 13, 77, 55, 80, 109, 116, 119, 53, 47, 54, 17, 51, 1, 8, 15, 59, 127, 43, 16, 66, 78, 70, 22, 102, 108, 31, 90, 79, 75, 57, 33, 65, 86, 36, 19, 38, 28, 45, 88, 52, 24, 9, 89, 71, 94, 111, 73, 3, 83, 11, 25, 99, 7, 107, 69, 67, 101, 97, 35, 100, 26, 5, 117, 37, 96, 92, 32, 72], [123, 61, 56, 125, 120, 114, 110, 46, 126, 106, 42, 104, 103, 121, 23, 87, 118, 0, 91, 82, 40, 6, 64, 18, 58, 29, 95, 10, 27, 93, 30, 41, 74, 39, 4, 55, 50, 98, 122, 81, 124, 12, 76, 66, 105, 85, 20, 112, 21, 62, 34, 54, 8, 68, 84, 48, 44, 16, 53, 115, 116, 90, 77, 113, 17, 14, 70, 63, 13, 78, 2, 1, 60, 80, 79, 15, 47, 28, 59, 109, 49, 102, 57, 75, 108, 119, 22, 127, 83, 71, 51, 31, 86, 73, 65, 33, 52, 9, 45, 7, 38, 35, 3, 19, 25, 88, 43, 111, 94, 69, 36, 97, 89, 26, 107, 24, 117, 11, 100, 99, 67, 5, 101, 92, 72, 37, 96, 32], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 104, 23, 121, 87, 118, 103, 18, 82, 40, 29, 64, 0, 91, 27, 44, 39, 58, 95, 21, 105, 50, 55, 30, 41, 6, 112, 74, 12, 10, 76, 113, 85, 122, 115, 20, 51, 98, 116, 47, 93, 54, 62, 34, 63, 70, 84, 1, 68, 109, 60, 17, 13, 8, 16, 80, 79, 78, 4, 90, 119, 124, 81, 2, 49, 108, 127, 102, 48, 15, 77, 53, 31, 86, 22, 43, 66, 59, 107, 52, 57, 14, 65, 28, 25, 75, 88, 7, 24, 19, 9, 94, 3, 71, 38, 73, 33, 67, 36, 83, 100, 5, 89, 97, 26, 11, 117, 111, 101, 69, 35, 37, 32, 99, 96, 45, 92, 72], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 42, 104, 121, 87, 64, 23, 0, 118, 103, 91, 18, 40, 82, 70, 58, 41, 27, 29, 39, 50, 12, 76, 112, 105, 95, 68, 20, 21, 10, 34, 30, 124, 54, 44, 85, 122, 74, 62, 78, 93, 55, 115, 2, 6, 81, 98, 113, 63, 84, 116, 13, 1, 51, 53, 60, 15, 119, 65, 66, 43, 17, 49, 57, 16, 4, 86, 79, 102, 77, 80, 47, 22, 8, 48, 59, 14, 52, 109, 31, 127, 90, 38, 75, 7, 108, 28, 71, 19, 88, 67, 25, 107, 33, 94, 9, 24, 5, 36, 89, 73, 97, 11, 26, 3, 45, 83, 92, 69, 100, 117, 37, 111, 101, 99, 35, 72, 32, 96], [123, 56, 61, 125, 120, 114, 110, 46, 126, 106, 42, 121, 104, 23, 87, 103, 64, 118, 18, 0, 91, 82, 39, 58, 70, 40, 95, 98, 29, 27, 50, 62, 30, 105, 93, 112, 116, 12, 44, 41, 55, 21, 10, 115, 122, 81, 74, 20, 63, 85, 68, 113, 76, 57, 78, 84, 54, 34, 2, 16, 80, 102, 13, 4, 6, 31, 66, 15, 79, 124, 109, 86, 1, 51, 90, 17, 60, 119, 59, 53, 48, 49, 47, 14, 22, 28, 65, 88, 108, 77, 127, 38, 7, 36, 73, 9, 75, 107, 25, 3, 8, 52, 24, 43, 71, 89, 26, 33, 67, 11, 83, 111, 94, 97, 45, 69, 19, 117, 5, 100, 72, 99, 35, 37, 92, 101, 32, 96], [123, 56, 61, 125, 120, 114, 110, 46, 126, 106, 42, 23, 104, 87, 121, 18, 103, 91, 118, 82, 58, 0, 64, 95, 70, 29, 74, 40, 122, 105, 21, 55, 27, 85, 39, 10, 30, 12, 50, 115, 34, 98, 20, 93, 41, 76, 13, 4, 54, 81, 80, 68, 15, 84, 62, 78, 113, 90, 63, 44, 51, 59, 16, 112, 57, 14, 124, 28, 48, 116, 22, 86, 79, 1, 77, 17, 47, 6, 66, 108, 60, 102, 119, 49, 2, 71, 25, 109, 31, 94, 53, 127, 24, 9, 19, 75, 111, 11, 43, 52, 33, 7, 83, 8, 36, 65, 88, 38, 107, 72, 97, 73, 3, 5, 89, 100, 69, 35, 26, 67, 99, 45, 37, 117, 32, 101, 96, 92], [123, 61, 56, 125, 120, 114, 110, 46, 126, 106, 42, 23, 104, 87, 121, 118, 103, 18, 82, 91, 64, 70, 58, 0, 29, 40, 122, 98, 39, 41, 50, 10, 30, 21, 55, 12, 27, 105, 20, 54, 124, 74, 113, 81, 95, 57, 85, 76, 116, 112, 109, 84, 47, 62, 93, 44, 15, 79, 127, 22, 13, 115, 78, 4, 17, 16, 34, 68, 49, 65, 80, 90, 77, 2, 48, 86, 119, 59, 63, 6, 51, 66, 60, 14, 53, 24, 31, 28, 1, 108, 52, 43, 89, 9, 94, 102, 19, 71, 73, 111, 88, 25, 75, 83, 72, 38, 7, 107, 67, 11, 36, 101, 117, 26, 33, 45, 3, 35, 5, 100, 97, 69, 37, 8, 99, 96, 32, 92], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 87, 104, 23, 121, 118, 91, 103, 0, 18, 64, 82, 58, 29, 39, 95, 30, 44, 51, 50, 27, 41, 40, 70, 116, 54, 122, 10, 47, 98, 34, 20, 12, 113, 84, 119, 112, 105, 21, 124, 93, 76, 115, 81, 109, 102, 60, 74, 85, 68, 53, 49, 63, 62, 57, 16, 4, 59, 31, 14, 66, 48, 77, 55, 86, 90, 13, 43, 79, 6, 1, 65, 127, 15, 78, 80, 17, 108, 33, 22, 28, 36, 2, 19, 24, 38, 9, 52, 72, 107, 73, 88, 71, 26, 45, 11, 89, 3, 97, 25, 94, 111, 67, 7, 100, 75, 117, 69, 83, 35, 92, 99, 101, 37, 32, 5, 8, 96], [123, 125, 56, 61, 120, 114, 110, 46, 126, 106, 42, 104, 23, 121, 87, 103, 118, 64, 0, 18, 40, 82, 41, 50, 29, 58, 91, 47, 95, 55, 39, 30, 27, 98, 60, 122, 124, 62, 57, 21, 20, 105, 93, 10, 76, 34, 49, 54, 12, 51, 115, 74, 85, 116, 81, 90, 6, 112, 53, 63, 84, 113, 1, 70, 4, 48, 13, 17, 59, 43, 16, 14, 77, 80, 15, 31, 2, 68, 78, 127, 102, 44, 28, 109, 72, 86, 79, 22, 119, 65, 25, 52, 108, 24, 36, 107, 7, 66, 88, 19, 38, 89, 99, 33, 11, 83, 9, 73, 71, 94, 75, 111, 101, 67, 35, 97, 3, 117, 100, 26, 37, 45, 69, 5, 96, 8, 32, 92], [123, 125, 56, 61, 120, 114, 110, 46, 126, 106, 42, 23, 104, 121, 87, 118, 64, 82, 0, 103, 18, 91, 58, 40, 6, 41, 10, 29, 74, 122, 12, 55, 62, 50, 95, 27, 85, 124, 17, 76, 39, 13, 105, 93, 30, 21, 20, 34, 68, 81, 98, 4, 57, 115, 54, 60, 78, 59, 90, 70, 80, 63, 48, 84, 77, 2, 79, 66, 112, 15, 116, 16, 72, 1, 19, 44, 49, 47, 119, 127, 113, 22, 11, 14, 51, 86, 28, 31, 102, 65, 7, 52, 53, 36, 73, 109, 33, 71, 43, 67, 38, 89, 24, 108, 75, 9, 111, 83, 25, 88, 101, 3, 69, 94, 26, 107, 117, 35, 5, 99, 45, 97, 100, 8, 37, 92, 32, 96], [123, 61, 56, 125, 120, 114, 110, 46, 126, 106, 42, 104, 121, 23, 87, 0, 103, 64, 118, 18, 91, 82, 6, 40, 58, 115, 41, 98, 62, 116, 29, 12, 50, 76, 39, 30, 95, 10, 55, 93, 85, 74, 105, 21, 4, 122, 47, 68, 84, 124, 78, 60, 49, 27, 51, 13, 66, 20, 44, 34, 48, 59, 77, 2, 112, 72, 63, 1, 17, 53, 70, 113, 57, 79, 65, 81, 14, 54, 86, 80, 16, 22, 90, 15, 31, 7, 52, 119, 11, 127, 28, 109, 111, 108, 25, 38, 19, 33, 83, 3, 9, 89, 94, 73, 102, 67, 36, 71, 100, 43, 24, 117, 97, 88, 75, 107, 45, 69, 5, 35, 26, 101, 37, 99, 96, 32, 92, 8], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 104, 23, 87, 121, 103, 18, 82, 58, 118, 6, 91, 29, 64, 40, 50, 0, 95, 44, 85, 39, 10, 12, 30, 62, 74, 21, 47, 27, 122, 51, 20, 41, 84, 124, 115, 105, 63, 76, 116, 98, 54, 60, 55, 112, 81, 68, 78, 17, 77, 80, 93, 4, 14, 13, 34, 16, 48, 113, 65, 15, 53, 119, 59, 66, 57, 31, 109, 90, 2, 49, 70, 79, 83, 72, 86, 1, 11, 7, 28, 22, 52, 127, 71, 24, 9, 111, 19, 73, 108, 25, 38, 89, 88, 75, 3, 67, 45, 43, 94, 33, 102, 97, 117, 69, 107, 26, 36, 37, 100, 5, 101, 99, 8, 35, 32, 92, 96], [123, 125, 61, 56, 120, 114, 110, 46, 126, 106, 104, 42, 23, 121, 87, 0, 103, 64, 118, 18, 91, 6, 29, 82, 58, 40, 95, 10, 105, 50, 39, 41, 27, 74, 68, 12, 62, 55, 76, 81, 63, 30, 122, 85, 112, 84, 20, 124, 21, 115, 34, 4, 70, 93, 98, 54, 60, 66, 59, 80, 65, 2, 90, 78, 31, 51, 15, 44, 49, 14, 16, 116, 77, 13, 22, 57, 17, 113, 86, 79, 28, 53, 47, 48, 72, 71, 109, 7, 127, 119, 1, 108, 24, 25, 11, 111, 9, 36, 73, 88, 83, 67, 75, 117, 52, 102, 43, 19, 99, 69, 94, 38, 33, 89, 45, 5, 3, 97, 100, 35, 26, 107, 8, 37, 101, 92, 96, 32], [123, 61, 125, 56, 120, 114, 110, 46, 126, 106, 42, 104, 87, 23, 121, 103, 118, 18, 82, 29, 50, 91, 53, 95, 12, 58, 64, 30, 85, 41, 40, 51, 6, 27, 39, 21, 10, 124, 0, 34, 76, 62, 105, 54, 112, 93, 63, 98, 55, 48, 74, 60, 47, 15, 115, 90, 20, 84, 17, 81, 16, 44, 122, 113, 57, 14, 68, 22, 4, 80, 59, 79, 13, 70, 77, 116, 31, 127, 78, 28, 108, 86, 102, 49, 66, 109, 83, 9, 1, 2, 71, 11, 43, 65, 119, 24, 7, 25, 33, 73, 88, 52, 111, 89, 72, 107, 19, 94, 38, 45, 8, 35, 100, 37, 36, 99, 26, 75, 67, 3, 117, 5, 97, 69, 101, 92, 32, 96]], "model.layers.29.self_attn.q_proj": [[45, 35, 109, 93, 22, 25, 80, 18, 83, 11, 14, 91, 67, 31, 1, 115, 124, 10, 88, 127, 43, 55, 13, 119, 8, 64, 81, 90, 6, 21, 7, 102, 48, 95, 5, 9, 69, 27, 112, 70, 40, 113, 78, 114, 126, 4, 120, 53, 50, 111, 61, 121, 2, 101, 24, 12, 103, 54, 37, 63, 57, 108, 38, 122, 92, 16, 44, 59, 97, 33, 52, 123, 28, 3, 96, 26, 116, 79, 104, 47, 49, 86, 100, 15, 17, 32, 60, 117, 62, 106, 36, 34, 118, 41, 58, 84, 51, 39, 89, 94, 46, 110, 19, 107, 29, 42, 77, 72, 56, 105, 125, 76, 30, 85, 0, 87, 23, 98, 20, 99, 82, 66, 73, 75, 65, 71, 68, 74], [45, 109, 35, 112, 93, 25, 22, 115, 18, 83, 14, 61, 13, 10, 80, 119, 31, 91, 55, 11, 121, 124, 54, 126, 7, 114, 50, 103, 60, 90, 102, 101, 21, 113, 62, 116, 123, 1, 8, 84, 15, 127, 110, 33, 88, 9, 20, 79, 48, 108, 27, 47, 122, 92, 125, 23, 95, 49, 28, 17, 69, 32, 118, 100, 75, 117, 85, 120, 52, 24, 51, 97, 107, 111, 37, 98, 63, 41, 46, 34, 59, 81, 4, 56, 43, 39, 104, 40, 106, 26, 53, 87, 57, 58, 42, 86, 94, 82, 38, 6, 36, 19, 105, 44, 76, 89, 30, 5, 29, 12, 64, 73, 67, 96, 16, 77, 71, 68, 74, 78, 72, 99, 70, 2, 66, 65, 3, 0], [45, 109, 55, 35, 118, 93, 22, 112, 111, 121, 113, 115, 43, 48, 25, 31, 91, 53, 62, 126, 40, 122, 52, 80, 119, 123, 50, 61, 49, 117, 51, 56, 27, 92, 58, 103, 104, 102, 18, 83, 60, 39, 124, 46, 108, 59, 44, 94, 54, 41, 127, 106, 90, 114, 76, 47, 57, 107, 99, 42, 20, 21, 116, 101, 120, 110, 32, 125, 88, 84, 63, 14, 16, 23, 38, 33, 28, 74, 64, 105, 95, 37, 97, 36, 7, 15, 98, 96, 1, 100, 2, 30, 79, 19, 89, 11, 86, 34, 85, 69, 10, 0, 87, 66, 29, 8, 72, 67, 12, 68, 65, 26, 24, 17, 70, 75, 78, 4, 81, 5, 6, 82, 3, 71, 13, 9, 77, 73], [45, 35, 109, 93, 22, 18, 25, 55, 83, 13, 124, 31, 115, 14, 88, 80, 61, 112, 91, 81, 10, 118, 21, 50, 11, 15, 121, 28, 90, 63, 77, 9, 73, 48, 64, 56, 33, 7, 103, 85, 23, 44, 126, 120, 127, 27, 41, 24, 43, 92, 119, 111, 114, 113, 29, 26, 12, 62, 99, 30, 58, 67, 1, 8, 122, 52, 117, 76, 97, 95, 57, 123, 60, 69, 116, 108, 84, 89, 102, 71, 20, 51, 39, 70, 74, 40, 53, 2, 17, 37, 86, 59, 107, 6, 104, 106, 79, 5, 38, 19, 105, 54, 100, 110, 82, 49, 42, 36, 101, 34, 47, 32, 98, 46, 96, 125, 72, 87, 16, 94, 3, 66, 78, 75, 0, 68, 4, 65], [41, 98, 57, 117, 25, 51, 86, 127, 95, 114, 125, 120, 121, 18, 93, 16, 78, 50, 115, 61, 110, 105, 59, 53, 48, 55, 119, 46, 15, 49, 31, 58, 38, 87, 40, 12, 47, 39, 28, 19, 88, 118, 62, 108, 63, 45, 74, 85, 11, 116, 72, 75, 54, 67, 123, 68, 9, 76, 56, 106, 124, 35, 122, 10, 126, 80, 101, 90, 7, 69, 107, 13, 65, 111, 91, 66, 60, 43, 42, 109, 64, 92, 1, 113, 71, 112, 100, 33, 70, 23, 103, 32, 44, 82, 21, 89, 52, 20, 27, 22, 36, 104, 17, 29, 102, 99, 81, 30, 37, 24, 26, 79, 77, 14, 2, 96, 84, 97, 94, 4, 83, 8, 73, 0, 3, 34, 5, 6], [41, 98, 117, 57, 127, 95, 25, 125, 86, 114, 51, 93, 53, 54, 50, 18, 48, 120, 60, 61, 58, 105, 119, 55, 111, 38, 78, 115, 31, 16, 87, 59, 62, 121, 40, 108, 88, 21, 106, 43, 126, 56, 35, 63, 19, 15, 116, 85, 110, 12, 28, 10, 45, 52, 122, 47, 92, 7, 90, 101, 124, 123, 104, 109, 36, 113, 49, 72, 29, 80, 100, 33, 81, 76, 112, 103, 44, 30, 74, 107, 32, 46, 91, 118, 9, 77, 42, 23, 24, 82, 97, 22, 1, 75, 2, 39, 89, 67, 37, 20, 94, 27, 83, 13, 5, 102, 11, 26, 99, 70, 96, 79, 17, 3, 4, 84, 14, 68, 71, 69, 73, 64, 34, 6, 8, 0, 66, 65], [41, 98, 57, 117, 25, 86, 95, 51, 61, 127, 114, 18, 78, 59, 125, 119, 121, 48, 16, 93, 53, 110, 50, 105, 120, 116, 62, 58, 55, 38, 49, 31, 7, 76, 67, 65, 39, 10, 15, 87, 115, 12, 28, 108, 9, 74, 88, 45, 71, 56, 75, 21, 118, 109, 44, 72, 47, 19, 46, 60, 113, 124, 111, 68, 99, 40, 122, 106, 54, 103, 92, 35, 64, 81, 85, 70, 100, 89, 90, 107, 29, 43, 23, 63, 42, 69, 66, 112, 82, 1, 52, 91, 20, 32, 126, 123, 104, 11, 36, 80, 27, 13, 102, 14, 2, 33, 37, 26, 84, 22, 4, 8, 5, 83, 30, 101, 77, 17, 79, 97, 73, 96, 24, 94, 0, 3, 34, 6], [41, 98, 57, 117, 95, 120, 86, 25, 114, 51, 127, 125, 18, 110, 93, 121, 78, 16, 105, 59, 48, 108, 53, 58, 62, 119, 50, 45, 61, 38, 47, 49, 15, 124, 115, 39, 87, 44, 72, 85, 31, 46, 55, 111, 118, 12, 76, 21, 101, 54, 109, 7, 65, 116, 77, 67, 92, 35, 107, 43, 40, 69, 32, 90, 9, 11, 10, 112, 19, 88, 52, 75, 80, 97, 106, 28, 42, 82, 68, 113, 123, 102, 83, 81, 66, 20, 74, 70, 84, 24, 56, 64, 60, 22, 89, 99, 29, 63, 94, 126, 104, 36, 14, 13, 122, 91, 26, 103, 27, 73, 33, 5, 30, 100, 4, 96, 17, 37, 79, 0, 23, 8, 71, 1, 2, 34, 6, 3], [39, 63, 52, 97, 18, 113, 13, 87, 22, 28, 100, 120, 10, 4, 8, 45, 92, 11, 66, 0, 89, 126, 6, 81, 94, 27, 29, 68, 69, 42, 72, 65, 15, 12, 1, 14, 30, 59, 127, 108, 25, 70, 86, 74, 82, 110, 80, 119, 20, 79, 77, 44, 111, 90, 21, 7, 78, 62, 50, 85, 43, 16, 17, 121, 5, 19, 102, 23, 101, 40, 35, 106, 116, 9, 64, 24, 67, 96, 122, 75, 26, 84, 55, 83, 47, 38, 118, 125, 95, 54, 105, 91, 99, 76, 58, 88, 37, 104, 93, 3, 98, 32, 61, 34, 57, 117, 49, 73, 123, 115, 56, 36, 109, 103, 46, 41, 51, 31, 60, 48, 53, 124, 112, 71, 107, 114, 2, 33], [63, 52, 39, 113, 120, 37, 122, 125, 126, 45, 106, 119, 58, 61, 127, 111, 55, 110, 57, 47, 116, 51, 62, 103, 123, 56, 48, 115, 117, 59, 49, 54, 118, 108, 60, 97, 109, 53, 50, 121, 44, 114, 46, 124, 112, 100, 105, 107, 43, 36, 22, 42, 28, 41, 40, 38, 104, 21, 87, 20, 101, 102, 27, 35, 96, 2, 91, 94, 98, 11, 32, 34, 19, 99, 18, 5, 25, 92, 24, 33, 90, 29, 26, 31, 85, 8, 89, 93, 30, 95, 88, 84, 78, 81, 14, 86, 15, 13, 73, 6, 83, 23, 16, 64, 75, 79, 17, 69, 66, 9, 10, 80, 82, 12, 76, 77, 3, 67, 72, 0, 65, 70, 71, 4, 74, 7, 1, 68], [63, 39, 52, 97, 24, 87, 50, 66, 54, 110, 64, 83, 18, 69, 11, 7, 92, 58, 120, 79, 3, 36, 73, 13, 20, 114, 28, 0, 57, 2, 67, 6, 9, 22, 119, 106, 80, 44, 45, 10, 113, 8, 26, 1, 68, 108, 48, 37, 111, 4, 30, 65, 122, 85, 29, 62, 47, 72, 71, 107, 126, 25, 88, 125, 49, 61, 89, 76, 103, 70, 51, 96, 34, 12, 123, 40, 16, 104, 35, 81, 5, 38, 78, 116, 124, 117, 115, 31, 90, 84, 127, 32, 46, 56, 59, 55, 101, 98, 42, 60, 100, 118, 14, 75, 121, 94, 91, 105, 53, 112, 43, 27, 15, 17, 23, 109, 19, 95, 99, 93, 77, 102, 86, 41, 21, 82, 74, 33], [63, 39, 113, 52, 97, 120, 55, 28, 122, 87, 126, 119, 62, 92, 50, 114, 59, 22, 111, 58, 109, 36, 118, 106, 51, 47, 18, 110, 121, 61, 125, 54, 56, 116, 105, 115, 48, 57, 42, 45, 60, 49, 15, 102, 107, 21, 101, 127, 123, 124, 46, 100, 41, 37, 103, 117, 40, 44, 112, 13, 19, 35, 24, 53, 108, 43, 89, 81, 38, 84, 93, 99, 20, 69, 91, 104, 33, 8, 96, 74, 85, 34, 94, 30, 11, 98, 27, 31, 32, 86, 25, 23, 29, 26, 95, 90, 10, 17, 9, 88, 79, 83, 82, 76, 12, 4, 14, 5, 78, 6, 80, 64, 16, 71, 72, 73, 66, 3, 1, 77, 70, 2, 65, 68, 7, 75, 0, 67], [111, 47, 95, 28, 25, 84, 97, 18, 86, 33, 39, 122, 16, 123, 74, 29, 69, 15, 53, 117, 78, 120, 107, 49, 58, 7, 4, 106, 124, 35, 75, 125, 64, 17, 2, 110, 62, 55, 61, 113, 63, 91, 13, 32, 46, 42, 126, 99, 23, 12, 54, 121, 3, 108, 112, 72, 115, 1, 37, 57, 38, 96, 73, 59, 85, 44, 36, 43, 41, 60, 45, 50, 119, 77, 19, 114, 27, 48, 109, 101, 67, 116, 118, 104, 79, 52, 87, 127, 56, 102, 22, 76, 26, 103, 70, 90, 40, 24, 20, 83, 31, 89, 51, 6, 82, 92, 105, 30, 88, 94, 0, 65, 93, 100, 98, 9, 34, 21, 11, 80, 71, 81, 5, 14, 8, 10, 66, 68], [111, 47, 28, 95, 25, 84, 39, 97, 86, 33, 18, 16, 60, 59, 115, 112, 29, 54, 15, 74, 7, 106, 78, 52, 110, 118, 123, 1, 114, 69, 64, 17, 46, 62, 120, 57, 109, 2, 13, 4, 75, 12, 122, 101, 116, 107, 126, 49, 99, 27, 119, 9, 113, 121, 50, 45, 48, 41, 124, 42, 70, 63, 72, 58, 81, 36, 55, 85, 90, 56, 23, 73, 87, 117, 125, 76, 94, 61, 26, 67, 20, 35, 3, 96, 30, 32, 38, 43, 100, 53, 44, 103, 77, 24, 51, 40, 127, 89, 91, 37, 108, 31, 88, 22, 79, 19, 82, 83, 102, 6, 34, 105, 65, 98, 104, 92, 93, 80, 21, 66, 5, 0, 14, 68, 71, 11, 10, 8], [111, 47, 28, 95, 25, 84, 126, 33, 18, 97, 57, 86, 39, 120, 16, 60, 110, 124, 61, 29, 63, 116, 74, 54, 78, 56, 17, 64, 7, 46, 15, 125, 99, 50, 4, 12, 107, 62, 127, 75, 48, 53, 27, 35, 96, 106, 112, 109, 92, 59, 38, 113, 67, 2, 49, 55, 101, 72, 45, 42, 36, 69, 1, 41, 119, 20, 31, 94, 108, 52, 58, 123, 13, 89, 85, 122, 121, 87, 23, 22, 40, 114, 117, 73, 37, 19, 44, 32, 115, 70, 76, 105, 100, 90, 118, 9, 24, 51, 103, 43, 104, 91, 102, 34, 26, 88, 82, 81, 30, 98, 6, 3, 83, 0, 68, 21, 14, 80, 77, 93, 71, 10, 79, 8, 5, 65, 11, 66], [111, 47, 28, 25, 95, 84, 57, 86, 39, 110, 62, 33, 18, 16, 74, 116, 78, 29, 97, 2, 49, 41, 17, 118, 96, 15, 115, 1, 7, 64, 69, 4, 126, 13, 59, 120, 3, 127, 113, 63, 51, 65, 58, 56, 125, 36, 44, 50, 85, 123, 35, 60, 121, 75, 99, 43, 54, 37, 42, 76, 117, 119, 98, 114, 106, 46, 6, 55, 67, 70, 108, 45, 79, 124, 40, 89, 109, 52, 48, 38, 32, 53, 0, 26, 24, 105, 101, 72, 73, 27, 9, 20, 82, 112, 103, 22, 5, 122, 30, 81, 94, 61, 12, 102, 104, 107, 83, 90, 34, 77, 23, 11, 80, 92, 87, 88, 93, 21, 10, 91, 31, 14, 19, 100, 71, 66, 68, 8], [49, 117, 102, 113, 123, 52, 116, 53, 38, 23, 41, 91, 33, 25, 121, 85, 118, 119, 54, 122, 115, 109, 59, 93, 104, 120, 30, 110, 62, 61, 51, 108, 60, 124, 82, 112, 45, 111, 98, 114, 57, 48, 50, 96, 42, 63, 36, 94, 56, 47, 126, 46, 106, 37, 125, 21, 35, 43, 107, 127, 58, 44, 77, 79, 27, 55, 100, 103, 99, 95, 89, 40, 105, 39, 97, 24, 84, 18, 34, 101, 32, 20, 16, 17, 22, 31, 19, 29, 76, 87, 11, 78, 90, 83, 28, 88, 86, 26, 72, 92, 15, 5, 12, 4, 14, 70, 71, 13, 74, 80, 73, 9, 75, 3, 7, 81, 66, 1, 10, 8, 6, 0, 65, 68, 69, 67, 2, 64], [49, 102, 113, 117, 33, 91, 25, 38, 116, 123, 54, 30, 82, 23, 110, 55, 85, 35, 53, 121, 109, 39, 115, 95, 58, 62, 122, 100, 43, 93, 98, 51, 89, 52, 59, 60, 27, 21, 41, 125, 78, 48, 119, 57, 120, 42, 77, 40, 37, 92, 50, 118, 108, 80, 90, 96, 63, 28, 114, 112, 24, 111, 61, 103, 106, 124, 104, 71, 56, 34, 47, 101, 5, 14, 11, 127, 99, 22, 20, 36, 46, 94, 29, 12, 126, 32, 9, 65, 19, 88, 68, 86, 31, 97, 44, 15, 107, 105, 26, 45, 87, 2, 18, 17, 83, 67, 70, 72, 6, 64, 84, 8, 10, 76, 1, 3, 0, 16, 13, 75, 81, 74, 79, 7, 73, 69, 66, 4], [49, 102, 113, 117, 53, 33, 30, 25, 38, 91, 85, 120, 23, 125, 122, 93, 60, 82, 119, 54, 61, 41, 48, 118, 96, 107, 59, 27, 94, 109, 50, 108, 121, 110, 114, 106, 52, 35, 98, 42, 103, 101, 20, 40, 95, 43, 45, 123, 58, 57, 105, 99, 14, 116, 62, 89, 46, 36, 44, 56, 100, 124, 39, 47, 80, 77, 26, 127, 51, 111, 55, 97, 78, 72, 126, 112, 92, 3, 63, 115, 31, 34, 104, 22, 11, 24, 90, 29, 84, 32, 28, 75, 37, 10, 88, 73, 71, 64, 83, 4, 87, 18, 21, 15, 17, 12, 19, 2, 81, 65, 74, 9, 5, 86, 70, 16, 79, 76, 7, 13, 6, 68, 1, 0, 69, 8, 66, 67], [49, 102, 113, 117, 123, 33, 52, 25, 53, 91, 122, 61, 23, 38, 63, 20, 41, 48, 82, 85, 100, 118, 121, 93, 112, 27, 89, 116, 15, 96, 114, 42, 108, 11, 35, 30, 97, 50, 39, 101, 62, 9, 120, 99, 51, 87, 46, 45, 6, 115, 47, 60, 57, 59, 44, 124, 68, 126, 111, 127, 105, 109, 58, 43, 94, 54, 90, 34, 110, 80, 32, 103, 40, 55, 56, 28, 29, 26, 107, 119, 77, 125, 106, 104, 14, 98, 4, 1, 37, 95, 84, 22, 73, 36, 81, 21, 18, 16, 92, 83, 88, 31, 8, 86, 66, 79, 70, 19, 24, 10, 12, 0, 75, 2, 17, 76, 69, 5, 71, 78, 13, 7, 67, 3, 74, 64, 65, 72], [117, 120, 60, 38, 58, 127, 122, 46, 52, 62, 102, 126, 55, 125, 118, 43, 110, 116, 123, 121, 124, 115, 113, 63, 61, 119, 51, 57, 53, 114, 112, 56, 25, 54, 48, 50, 45, 59, 47, 108, 111, 39, 107, 44, 97, 104, 35, 37, 49, 26, 109, 94, 106, 41, 105, 86, 42, 98, 83, 103, 17, 24, 101, 23, 28, 36, 40, 85, 88, 92, 100, 91, 78, 19, 22, 21, 75, 32, 30, 34, 99, 11, 96, 16, 95, 27, 77, 79, 29, 82, 84, 31, 33, 14, 90, 93, 20, 81, 89, 80, 72, 18, 9, 3, 12, 87, 7, 67, 76, 15, 73, 71, 69, 70, 8, 2, 6, 1, 5, 0, 13, 65, 64, 74, 66, 68, 10, 4], [117, 60, 58, 127, 38, 110, 122, 46, 52, 116, 62, 115, 118, 126, 120, 125, 123, 43, 102, 51, 61, 124, 63, 55, 113, 109, 53, 94, 56, 121, 119, 57, 112, 59, 97, 54, 50, 114, 48, 45, 107, 47, 108, 111, 39, 104, 28, 44, 106, 25, 103, 23, 49, 86, 26, 105, 42, 35, 83, 41, 37, 36, 98, 101, 17, 91, 40, 74, 30, 100, 88, 84, 33, 24, 32, 99, 19, 22, 95, 90, 85, 81, 34, 96, 92, 75, 93, 31, 11, 69, 79, 27, 77, 78, 29, 66, 15, 80, 72, 16, 0, 14, 3, 64, 5, 89, 12, 87, 71, 20, 67, 18, 2, 82, 21, 65, 1, 8, 68, 6, 13, 10, 70, 76, 7, 9, 73, 4], [117, 60, 46, 120, 38, 110, 127, 122, 58, 52, 115, 62, 57, 125, 116, 118, 43, 126, 119, 48, 63, 123, 61, 56, 35, 49, 25, 109, 94, 121, 55, 124, 47, 51, 111, 50, 112, 54, 114, 83, 53, 113, 24, 26, 108, 45, 59, 105, 40, 107, 104, 102, 44, 86, 41, 97, 106, 28, 103, 37, 42, 88, 30, 23, 36, 39, 101, 100, 22, 75, 34, 85, 99, 11, 84, 17, 31, 19, 32, 16, 98, 79, 27, 81, 91, 69, 92, 33, 78, 93, 96, 95, 3, 14, 72, 82, 90, 29, 8, 89, 77, 20, 5, 13, 18, 66, 80, 74, 15, 87, 2, 64, 0, 9, 21, 67, 70, 65, 71, 6, 1, 76, 7, 12, 10, 73, 68, 4], [46, 117, 38, 120, 110, 85, 94, 25, 97, 16, 9, 90, 12, 126, 122, 81, 23, 116, 58, 100, 15, 62, 118, 98, 52, 68, 14, 76, 28, 89, 7, 125, 87, 82, 61, 71, 121, 37, 18, 123, 48, 86, 1, 42, 60, 30, 21, 99, 57, 96, 31, 91, 44, 107, 32, 119, 112, 36, 53, 47, 56, 127, 51, 101, 45, 55, 115, 10, 113, 43, 70, 114, 124, 88, 104, 29, 26, 73, 92, 54, 79, 109, 111, 75, 49, 50, 63, 24, 19, 13, 20, 93, 59, 83, 4, 108, 95, 74, 80, 34, 84, 35, 17, 106, 72, 27, 6, 11, 103, 2, 77, 22, 105, 78, 41, 8, 40, 39, 5, 65, 33, 69, 64, 3, 66, 0, 67, 102], [41, 99, 67, 74, 80, 0, 86, 44, 4, 13, 71, 73, 19, 115, 118, 69, 124, 105, 52, 119, 113, 114, 51, 65, 122, 54, 12, 116, 112, 111, 35, 117, 110, 126, 57, 2, 62, 63, 64, 60, 48, 107, 61, 22, 6, 89, 121, 76, 84, 123, 109, 1, 29, 90, 3, 45, 30, 88, 14, 106, 83, 79, 10, 33, 92, 9, 108, 120, 20, 55, 23, 68, 39, 15, 40, 101, 102, 103, 98, 104, 66, 75, 100, 56, 27, 94, 11, 38, 53, 17, 58, 24, 31, 32, 8, 77, 70, 7, 5, 26, 43, 16, 85, 95, 81, 46, 34, 37, 49, 125, 96, 21, 28, 36, 72, 50, 127, 91, 93, 59, 97, 42, 82, 78, 87, 18, 47, 25], [41, 99, 80, 74, 13, 4, 86, 71, 19, 67, 73, 113, 52, 69, 61, 51, 12, 44, 118, 115, 112, 122, 64, 60, 114, 105, 111, 66, 2, 120, 57, 116, 103, 63, 6, 54, 119, 62, 124, 35, 75, 117, 43, 1, 55, 20, 15, 88, 48, 108, 126, 22, 101, 27, 125, 107, 21, 82, 29, 85, 31, 102, 76, 17, 50, 78, 34, 3, 90, 106, 37, 79, 26, 24, 8, 109, 83, 127, 59, 39, 23, 91, 45, 46, 87, 49, 98, 123, 100, 81, 96, 92, 110, 32, 30, 68, 93, 89, 38, 56, 97, 42, 18, 53, 84, 36, 33, 5, 58, 94, 72, 25, 28, 104, 14, 95, 121, 10, 65, 47, 70, 40, 11, 7, 16, 77, 9, 0], [41, 74, 0, 99, 65, 86, 2, 44, 80, 13, 67, 69, 4, 118, 119, 116, 115, 113, 71, 112, 54, 19, 111, 110, 114, 105, 73, 124, 12, 117, 121, 52, 122, 51, 126, 57, 35, 22, 107, 62, 48, 63, 29, 61, 15, 94, 3, 59, 68, 89, 83, 60, 6, 123, 10, 27, 90, 33, 45, 20, 108, 5, 24, 40, 88, 16, 106, 1, 49, 92, 64, 9, 76, 23, 53, 8, 32, 38, 75, 120, 100, 66, 7, 56, 21, 97, 79, 39, 104, 85, 26, 91, 18, 109, 93, 11, 82, 70, 55, 101, 87, 30, 58, 125, 98, 17, 78, 127, 31, 81, 72, 96, 84, 14, 77, 42, 28, 95, 34, 43, 25, 103, 102, 37, 47, 46, 36, 50], [41, 99, 0, 2, 44, 67, 86, 80, 13, 4, 65, 115, 118, 69, 71, 119, 116, 113, 73, 114, 111, 112, 122, 54, 19, 52, 51, 105, 124, 110, 126, 22, 121, 117, 57, 35, 74, 62, 29, 12, 107, 3, 48, 68, 61, 60, 63, 89, 33, 123, 64, 15, 6, 108, 90, 40, 5, 97, 94, 88, 16, 70, 27, 106, 20, 83, 100, 98, 17, 75, 79, 38, 49, 81, 45, 84, 11, 1, 39, 24, 76, 92, 56, 9, 91, 120, 104, 55, 96, 28, 18, 10, 26, 8, 14, 78, 82, 125, 109, 42, 25, 30, 85, 53, 59, 93, 7, 23, 37, 95, 101, 36, 58, 72, 46, 21, 127, 87, 31, 47, 34, 32, 102, 43, 50, 77, 66, 103], [106, 99, 59, 26, 118, 124, 94, 49, 42, 21, 62, 19, 115, 86, 17, 113, 60, 121, 40, 109, 55, 30, 78, 32, 80, 56, 117, 107, 73, 122, 28, 123, 47, 12, 24, 96, 98, 29, 120, 74, 23, 43, 50, 53, 11, 69, 51, 37, 125, 61, 63, 114, 52, 27, 79, 108, 116, 103, 57, 126, 110, 58, 127, 112, 119, 48, 84, 89, 105, 92, 39, 54, 46, 111, 36, 104, 4, 22, 8, 71, 101, 45, 3, 72, 67, 7, 90, 44, 41, 34, 38, 15, 77, 33, 102, 93, 97, 31, 100, 70, 14, 66, 64, 95, 88, 85, 76, 0, 1, 82, 91, 25, 81, 16, 87, 18, 5, 83, 20, 13, 6, 35, 65, 9, 68, 10, 75, 2], [106, 61, 99, 118, 115, 59, 121, 26, 94, 42, 107, 62, 113, 21, 19, 86, 125, 29, 11, 109, 28, 79, 50, 22, 55, 60, 25, 117, 40, 17, 126, 47, 90, 56, 123, 23, 119, 124, 24, 58, 52, 93, 80, 57, 78, 114, 39, 98, 13, 74, 66, 32, 16, 73, 116, 101, 85, 34, 65, 104, 53, 63, 103, 49, 12, 70, 122, 35, 36, 82, 120, 108, 45, 46, 4, 51, 71, 95, 30, 44, 20, 91, 37, 38, 105, 87, 72, 48, 112, 43, 27, 100, 77, 6, 7, 111, 97, 31, 41, 3, 89, 64, 110, 54, 127, 88, 96, 102, 92, 18, 84, 83, 33, 5, 69, 81, 76, 15, 14, 0, 67, 10, 75, 8, 9, 2, 1, 68], [106, 59, 115, 99, 118, 62, 50, 112, 26, 55, 94, 125, 86, 19, 117, 121, 124, 42, 24, 116, 21, 53, 80, 17, 29, 52, 122, 107, 44, 78, 49, 123, 30, 32, 40, 28, 45, 11, 57, 74, 61, 98, 63, 58, 120, 101, 113, 23, 46, 12, 110, 60, 104, 27, 48, 96, 51, 108, 109, 114, 47, 72, 126, 119, 69, 4, 127, 111, 54, 97, 73, 56, 71, 43, 36, 39, 95, 37, 103, 38, 102, 105, 41, 84, 33, 100, 7, 22, 34, 91, 93, 90, 35, 31, 79, 8, 66, 92, 87, 67, 88, 82, 20, 16, 25, 18, 68, 85, 76, 70, 15, 81, 64, 89, 3, 83, 1, 0, 14, 77, 6, 5, 9, 10, 65, 13, 75, 2], [106, 99, 107, 115, 59, 26, 94, 113, 86, 17, 19, 42, 21, 12, 121, 80, 30, 62, 40, 119, 117, 55, 96, 49, 78, 74, 60, 69, 61, 125, 112, 24, 72, 11, 118, 53, 114, 56, 124, 70, 120, 4, 32, 123, 116, 109, 108, 58, 5, 44, 104, 73, 110, 68, 52, 126, 93, 54, 89, 122, 67, 48, 45, 57, 1, 101, 63, 51, 37, 29, 47, 28, 71, 36, 127, 16, 43, 41, 111, 25, 105, 46, 22, 102, 13, 88, 9, 98, 83, 50, 27, 0, 84, 39, 8, 20, 95, 90, 14, 87, 7, 38, 77, 33, 64, 100, 6, 82, 92, 34, 79, 23, 76, 35, 3, 103, 31, 65, 18, 91, 85, 66, 97, 81, 10, 15, 2, 75]], "model.layers.29.self_attn.k_proj": [[109, 45, 99, 93, 22, 64, 31, 83, 80, 14, 25, 18, 11, 91, 67, 8, 2, 7, 126, 119, 1, 124, 6, 127, 70, 122, 112, 77, 13, 115, 60, 16, 21, 55, 9, 43, 114, 23, 10, 51, 4, 29, 90, 12, 52, 110, 63, 116, 41, 42, 36, 111, 57, 69, 118, 28, 59, 49, 120, 84, 103, 72, 123, 56, 61, 39, 121, 88, 113, 62, 54, 105, 44, 47, 53, 27, 40, 82, 58, 50, 117, 85, 102, 96, 48, 38, 92, 33, 100, 5, 24, 87, 108, 107, 106, 94, 101, 125, 98, 32, 46, 26, 34, 104, 15, 37, 30, 76, 97, 89, 79, 20, 95, 78, 75, 17, 35, 73, 81, 74, 66, 19, 86, 68, 71, 3, 0, 65], [105, 34, 57, 18, 53, 46, 51, 125, 78, 86, 117, 25, 58, 120, 16, 45, 64, 31, 127, 67, 95, 109, 98, 50, 12, 119, 29, 69, 59, 111, 108, 61, 66, 10, 112, 9, 19, 74, 87, 7, 70, 113, 41, 60, 92, 80, 13, 49, 15, 55, 91, 52, 114, 88, 21, 121, 62, 76, 2, 107, 65, 75, 116, 103, 38, 83, 63, 68, 100, 93, 104, 39, 0, 40, 72, 56, 20, 35, 106, 90, 110, 115, 73, 54, 118, 27, 36, 48, 6, 42, 44, 126, 81, 33, 122, 124, 47, 102, 43, 26, 123, 84, 77, 37, 1, 101, 28, 30, 96, 94, 23, 24, 99, 79, 85, 97, 32, 17, 71, 89, 11, 8, 22, 82, 5, 4, 14, 3], [63, 103, 22, 33, 92, 87, 13, 65, 18, 120, 10, 50, 4, 119, 126, 111, 54, 0, 61, 109, 49, 6, 62, 58, 81, 51, 117, 48, 127, 47, 125, 123, 122, 118, 59, 124, 46, 8, 45, 60, 56, 121, 115, 44, 30, 110, 113, 53, 116, 114, 112, 29, 2, 57, 71, 40, 55, 42, 43, 102, 52, 108, 37, 107, 25, 105, 73, 41, 98, 36, 20, 106, 104, 38, 11, 3, 34, 5, 39, 101, 91, 14, 100, 12, 7, 16, 78, 89, 67, 99, 83, 79, 31, 80, 24, 90, 76, 15, 32, 70, 95, 27, 93, 96, 35, 88, 21, 74, 94, 23, 17, 9, 85, 84, 26, 28, 86, 69, 75, 82, 19, 64, 1, 68, 97, 72, 66, 77], [47, 111, 64, 86, 95, 16, 84, 28, 25, 18, 74, 4, 1, 2, 72, 7, 97, 78, 67, 65, 6, 69, 15, 17, 29, 123, 12, 3, 124, 23, 76, 103, 75, 54, 31, 14, 0, 39, 77, 120, 55, 53, 27, 113, 85, 79, 61, 110, 70, 35, 81, 56, 13, 63, 122, 36, 105, 50, 126, 9, 19, 33, 101, 87, 91, 121, 57, 71, 125, 114, 32, 117, 62, 116, 24, 73, 48, 99, 109, 38, 127, 100, 108, 106, 118, 43, 115, 46, 52, 37, 44, 41, 119, 21, 42, 51, 26, 102, 60, 104, 30, 45, 107, 59, 34, 58, 90, 40, 98, 80, 94, 88, 49, 92, 82, 112, 89, 8, 96, 93, 20, 83, 11, 22, 10, 5, 66, 68], [49, 38, 97, 91, 25, 93, 117, 23, 80, 20, 92, 82, 15, 9, 64, 77, 11, 85, 2, 30, 68, 65, 87, 3, 45, 14, 108, 21, 5, 22, 17, 95, 96, 94, 50, 71, 120, 112, 36, 35, 18, 61, 37, 119, 8, 6, 118, 53, 70, 48, 46, 126, 106, 109, 55, 104, 59, 57, 34, 58, 122, 63, 111, 107, 33, 54, 123, 40, 62, 103, 52, 78, 105, 116, 124, 121, 39, 110, 99, 47, 28, 44, 24, 31, 101, 115, 42, 56, 41, 43, 89, 51, 127, 114, 125, 10, 60, 90, 83, 32, 19, 26, 98, 29, 1, 13, 67, 7, 100, 72, 113, 88, 84, 86, 0, 102, 12, 75, 81, 79, 16, 27, 76, 69, 74, 66, 73, 4], [110, 117, 102, 86, 33, 30, 52, 92, 25, 99, 58, 62, 119, 22, 125, 123, 54, 122, 114, 126, 53, 112, 55, 14, 81, 49, 111, 116, 51, 124, 47, 121, 48, 59, 61, 44, 103, 56, 63, 120, 75, 41, 108, 113, 115, 57, 118, 50, 83, 109, 16, 127, 39, 38, 107, 105, 35, 43, 31, 104, 46, 106, 28, 42, 45, 94, 91, 34, 32, 85, 23, 40, 37, 72, 100, 24, 101, 93, 26, 36, 18, 12, 29, 90, 5, 97, 9, 27, 98, 60, 84, 67, 0, 96, 95, 21, 87, 6, 20, 19, 82, 89, 80, 8, 78, 15, 2, 1, 7, 10, 88, 76, 68, 13, 77, 11, 79, 3, 17, 74, 65, 71, 64, 70, 69, 66, 73, 4], [105, 64, 35, 1, 118, 0, 116, 108, 67, 51, 119, 71, 19, 13, 86, 69, 73, 66, 4, 80, 65, 49, 2, 124, 48, 57, 12, 61, 122, 113, 47, 46, 114, 115, 43, 111, 44, 54, 3, 63, 99, 117, 126, 50, 74, 53, 58, 62, 121, 123, 52, 107, 60, 5, 56, 70, 106, 89, 6, 93, 55, 15, 103, 110, 59, 29, 94, 20, 88, 26, 112, 104, 17, 7, 75, 85, 72, 41, 33, 109, 76, 79, 97, 68, 45, 120, 90, 92, 9, 18, 101, 11, 125, 102, 84, 91, 30, 96, 32, 8, 77, 82, 87, 27, 23, 100, 38, 34, 78, 24, 14, 40, 21, 31, 37, 39, 127, 25, 98, 95, 42, 28, 81, 10, 83, 36, 22, 16], [42, 30, 99, 118, 35, 86, 26, 51, 21, 19, 78, 113, 61, 17, 64, 45, 11, 121, 66, 63, 111, 117, 106, 125, 28, 73, 124, 114, 123, 102, 71, 126, 53, 80, 108, 49, 122, 62, 12, 105, 41, 4, 32, 43, 74, 116, 16, 7, 38, 23, 56, 40, 119, 44, 55, 52, 3, 46, 104, 54, 65, 101, 115, 24, 47, 127, 110, 57, 109, 79, 69, 95, 70, 89, 36, 103, 58, 120, 48, 39, 29, 112, 33, 107, 98, 100, 25, 1, 59, 60, 50, 97, 37, 34, 94, 87, 15, 27, 91, 92, 20, 6, 72, 82, 31, 96, 85, 88, 93, 8, 10, 84, 67, 77, 13, 68, 18, 5, 75, 76, 90, 0, 22, 83, 81, 9, 2, 14]], "model.layers.29.self_attn.qk_proj": [[49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 99, 22, 57, 106, 118, 41, 113, 86, 64, 0, 89, 51, 35, 119, 38, 16, 80, 53, 93, 61, 116, 95, 52, 28, 115, 82, 120, 124, 121, 94, 114, 62, 122, 25, 27, 46, 58, 125, 19, 44, 77, 102, 98, 18, 103, 83, 13, 54, 126, 108, 65, 50, 55, 33, 1, 39, 48, 10, 127, 3, 92, 71, 68, 78, 43, 112, 123, 7, 4, 97, 59, 14, 67, 107, 31, 56, 2, 66, 9, 90, 87, 5, 69, 74, 84, 73, 6, 21, 60, 26, 85, 29, 30, 11, 12, 23, 20, 75, 76, 79, 104, 81, 15, 8, 34, 72, 37, 17, 40, 101, 91, 36, 32, 24, 96, 88, 100, 70], [49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 57, 41, 106, 99, 22, 0, 118, 64, 113, 86, 51, 89, 52, 35, 38, 119, 61, 16, 115, 95, 80, 121, 120, 124, 28, 116, 94, 62, 93, 114, 55, 53, 122, 59, 65, 82, 25, 18, 98, 127, 46, 44, 58, 77, 50, 27, 123, 13, 126, 83, 102, 54, 2, 43, 68, 1, 108, 33, 125, 103, 19, 48, 39, 4, 7, 3, 74, 71, 87, 67, 14, 97, 92, 112, 5, 78, 10, 31, 73, 66, 56, 60, 9, 90, 107, 21, 11, 6, 29, 76, 84, 69, 75, 12, 23, 20, 30, 104, 17, 85, 34, 40, 8, 72, 79, 81, 15, 26, 91, 36, 101, 88, 37, 24, 70, 32, 96, 100], [49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 57, 99, 22, 113, 106, 64, 41, 86, 0, 118, 89, 35, 51, 119, 38, 115, 52, 124, 116, 16, 61, 80, 93, 120, 95, 62, 94, 122, 18, 28, 98, 102, 123, 25, 59, 53, 82, 65, 127, 121, 19, 50, 27, 67, 48, 46, 54, 83, 126, 125, 114, 77, 1, 44, 7, 58, 33, 43, 56, 108, 103, 2, 3, 13, 68, 112, 14, 39, 66, 10, 92, 97, 4, 55, 78, 90, 71, 9, 74, 84, 69, 87, 60, 73, 5, 21, 75, 31, 29, 107, 15, 20, 30, 12, 23, 11, 76, 85, 104, 6, 26, 70, 79, 81, 34, 40, 37, 101, 72, 17, 8, 91, 36, 32, 96, 88, 100, 24], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 57, 106, 41, 99, 22, 113, 86, 64, 0, 118, 35, 89, 122, 51, 119, 115, 52, 16, 124, 62, 116, 80, 38, 120, 61, 25, 94, 93, 95, 53, 28, 123, 46, 18, 121, 114, 82, 55, 54, 126, 102, 19, 1, 44, 50, 77, 39, 66, 127, 13, 3, 98, 125, 43, 33, 56, 48, 59, 58, 4, 27, 78, 83, 71, 7, 65, 68, 67, 14, 103, 97, 74, 69, 60, 108, 112, 11, 9, 31, 73, 2, 107, 87, 92, 70, 21, 10, 5, 76, 90, 85, 26, 29, 20, 79, 104, 84, 23, 12, 75, 15, 37, 81, 32, 72, 30, 36, 40, 101, 8, 6, 91, 34, 17, 88, 100, 24, 96], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 41, 106, 57, 99, 86, 22, 64, 118, 0, 113, 89, 51, 119, 124, 61, 35, 122, 16, 115, 52, 80, 62, 121, 120, 38, 54, 28, 46, 116, 25, 93, 18, 82, 95, 94, 53, 125, 114, 48, 44, 58, 126, 98, 1, 102, 83, 19, 27, 13, 59, 123, 50, 77, 43, 55, 78, 127, 7, 112, 39, 56, 103, 4, 68, 108, 67, 71, 73, 33, 65, 74, 66, 87, 14, 70, 2, 107, 10, 92, 69, 31, 97, 84, 3, 9, 60, 90, 30, 5, 21, 20, 11, 26, 76, 81, 23, 29, 15, 12, 85, 79, 91, 75, 104, 40, 8, 34, 37, 101, 100, 32, 17, 72, 88, 96, 24, 36, 6], [49, 63, 117, 105, 111, 47, 45, 109, 42, 41, 110, 106, 57, 22, 86, 118, 0, 99, 89, 64, 51, 113, 119, 35, 80, 52, 16, 116, 38, 115, 122, 124, 95, 61, 18, 44, 62, 28, 82, 94, 46, 53, 121, 93, 25, 125, 120, 27, 13, 126, 58, 19, 54, 114, 98, 50, 55, 48, 77, 65, 83, 71, 112, 102, 4, 123, 108, 107, 33, 59, 39, 68, 1, 56, 127, 103, 66, 78, 97, 92, 3, 87, 67, 74, 70, 14, 73, 10, 2, 43, 7, 60, 11, 90, 5, 31, 84, 21, 20, 9, 30, 69, 76, 23, 15, 34, 79, 29, 37, 75, 12, 8, 85, 40, 17, 26, 91, 81, 72, 104, 36, 32, 88, 101, 6, 24, 100, 96], [49, 63, 105, 117, 111, 47, 45, 109, 42, 41, 106, 22, 57, 99, 110, 86, 118, 64, 89, 0, 51, 35, 113, 38, 80, 119, 16, 95, 52, 115, 61, 116, 120, 94, 28, 46, 124, 122, 121, 126, 18, 125, 82, 53, 77, 93, 62, 25, 65, 13, 50, 27, 83, 1, 114, 112, 44, 127, 54, 123, 59, 43, 58, 48, 19, 103, 2, 102, 3, 67, 55, 98, 14, 108, 31, 71, 39, 4, 68, 74, 78, 10, 87, 90, 92, 5, 107, 7, 33, 97, 66, 29, 56, 60, 21, 9, 30, 84, 23, 70, 11, 73, 76, 20, 26, 85, 69, 12, 15, 34, 37, 81, 72, 75, 79, 17, 40, 91, 104, 101, 8, 24, 88, 100, 96, 32, 36, 6], [49, 63, 117, 105, 111, 47, 45, 109, 42, 41, 110, 99, 22, 106, 86, 57, 0, 113, 118, 64, 51, 38, 35, 89, 119, 52, 95, 28, 61, 115, 120, 93, 124, 80, 122, 62, 116, 16, 94, 25, 53, 121, 125, 82, 58, 123, 77, 102, 27, 103, 126, 44, 98, 54, 1, 83, 18, 114, 65, 112, 13, 127, 39, 48, 67, 59, 50, 19, 46, 56, 71, 108, 33, 68, 2, 66, 3, 97, 43, 92, 7, 55, 31, 14, 87, 90, 74, 10, 78, 4, 5, 23, 9, 107, 73, 20, 29, 85, 60, 21, 30, 69, 11, 84, 70, 26, 12, 40, 75, 79, 76, 15, 81, 17, 34, 37, 72, 101, 36, 91, 88, 104, 32, 6, 96, 24, 100, 8], [49, 63, 117, 105, 111, 47, 45, 109, 42, 99, 41, 110, 57, 22, 106, 118, 86, 64, 113, 89, 51, 0, 35, 38, 95, 119, 120, 52, 93, 16, 61, 116, 115, 123, 28, 80, 62, 124, 121, 94, 127, 126, 44, 59, 114, 82, 122, 58, 83, 48, 1, 98, 18, 25, 102, 103, 125, 65, 108, 13, 53, 19, 56, 67, 46, 112, 39, 54, 55, 3, 68, 33, 77, 27, 4, 43, 71, 66, 10, 31, 60, 50, 92, 97, 107, 90, 7, 74, 78, 30, 87, 69, 29, 14, 2, 85, 20, 23, 5, 26, 73, 21, 84, 76, 11, 75, 9, 12, 17, 79, 37, 81, 40, 15, 91, 104, 36, 101, 6, 70, 72, 34, 88, 96, 8, 32, 100, 24], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 57, 99, 118, 106, 41, 113, 22, 119, 64, 0, 86, 51, 35, 89, 38, 61, 115, 62, 52, 124, 116, 16, 55, 28, 80, 121, 120, 123, 25, 58, 46, 95, 65, 122, 126, 127, 82, 94, 53, 93, 18, 39, 83, 13, 98, 112, 114, 59, 44, 67, 125, 102, 19, 33, 77, 48, 56, 50, 92, 27, 4, 54, 31, 7, 68, 9, 2, 60, 43, 1, 71, 78, 3, 103, 108, 66, 74, 69, 97, 87, 14, 84, 10, 90, 11, 5, 107, 73, 29, 30, 76, 6, 20, 21, 23, 79, 26, 104, 40, 75, 85, 12, 91, 15, 34, 17, 81, 101, 8, 72, 36, 37, 70, 88, 100, 32, 96, 24], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 41, 99, 64, 22, 118, 0, 106, 57, 86, 51, 113, 89, 119, 35, 52, 38, 16, 122, 80, 61, 116, 25, 82, 28, 95, 121, 120, 93, 115, 46, 124, 18, 94, 126, 62, 125, 127, 53, 13, 19, 48, 65, 50, 1, 44, 83, 123, 59, 71, 4, 27, 3, 78, 114, 54, 77, 2, 55, 39, 108, 7, 58, 9, 102, 67, 14, 97, 10, 66, 103, 74, 68, 6, 73, 5, 33, 60, 56, 92, 98, 90, 69, 20, 23, 11, 87, 43, 112, 29, 84, 21, 30, 75, 31, 15, 85, 107, 12, 76, 79, 26, 81, 72, 104, 91, 17, 34, 32, 8, 37, 40, 101, 70, 36, 96, 24, 88, 100], [49, 63, 105, 117, 111, 47, 45, 109, 42, 41, 110, 22, 57, 106, 99, 86, 64, 118, 0, 89, 113, 35, 122, 52, 51, 38, 119, 16, 80, 61, 120, 28, 116, 95, 115, 93, 62, 82, 25, 124, 94, 18, 121, 19, 126, 53, 13, 83, 125, 103, 58, 1, 59, 3, 77, 48, 44, 55, 46, 4, 27, 123, 54, 102, 33, 114, 50, 2, 71, 43, 65, 78, 98, 68, 74, 67, 127, 108, 39, 14, 112, 60, 10, 66, 92, 97, 7, 107, 31, 56, 20, 90, 75, 9, 21, 6, 85, 23, 69, 87, 5, 29, 12, 26, 73, 76, 11, 81, 84, 30, 40, 17, 79, 34, 91, 15, 8, 37, 72, 36, 88, 24, 104, 100, 32, 101, 70, 96], [49, 63, 105, 117, 111, 47, 45, 109, 42, 110, 57, 99, 106, 41, 118, 113, 22, 86, 51, 64, 89, 120, 119, 0, 35, 52, 38, 61, 115, 116, 80, 95, 94, 124, 25, 122, 28, 16, 62, 46, 125, 121, 93, 43, 44, 53, 59, 58, 126, 123, 98, 102, 83, 18, 114, 33, 39, 82, 55, 65, 27, 48, 103, 77, 127, 19, 56, 54, 50, 13, 108, 112, 3, 67, 92, 1, 97, 71, 4, 90, 31, 60, 23, 2, 66, 68, 10, 78, 87, 73, 74, 107, 7, 26, 21, 40, 5, 29, 69, 85, 30, 20, 84, 14, 76, 9, 75, 6, 91, 17, 15, 12, 79, 34, 81, 11, 104, 101, 88, 100, 8, 37, 24, 96, 72, 36, 70, 32], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 99, 41, 106, 22, 57, 64, 118, 0, 86, 113, 51, 89, 35, 119, 80, 52, 61, 120, 38, 121, 95, 115, 16, 124, 122, 28, 116, 62, 25, 93, 53, 94, 59, 58, 123, 1, 48, 46, 65, 125, 50, 83, 13, 127, 114, 18, 92, 19, 98, 82, 71, 33, 4, 103, 66, 3, 126, 67, 77, 108, 54, 102, 27, 55, 60, 39, 14, 112, 2, 44, 43, 73, 10, 7, 107, 78, 56, 74, 68, 97, 31, 69, 84, 5, 87, 90, 20, 29, 9, 23, 76, 12, 75, 11, 26, 21, 30, 85, 40, 6, 34, 70, 79, 15, 81, 17, 37, 72, 104, 8, 100, 91, 101, 24, 88, 36, 32, 96], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 41, 106, 57, 118, 22, 99, 0, 64, 86, 89, 51, 113, 119, 38, 35, 52, 80, 95, 120, 16, 122, 116, 61, 28, 121, 126, 25, 94, 124, 82, 46, 53, 93, 123, 115, 13, 48, 114, 125, 65, 55, 83, 19, 62, 18, 39, 102, 3, 2, 1, 127, 71, 9, 98, 108, 54, 44, 50, 59, 73, 67, 4, 78, 43, 77, 7, 68, 92, 27, 14, 84, 60, 58, 74, 103, 66, 70, 10, 112, 33, 97, 21, 5, 20, 69, 75, 31, 107, 56, 90, 11, 23, 29, 76, 87, 30, 15, 26, 12, 79, 34, 91, 17, 85, 8, 40, 81, 104, 6, 101, 72, 37, 88, 100, 32, 24, 36, 96], [49, 63, 117, 105, 111, 47, 45, 109, 42, 41, 110, 57, 22, 99, 118, 106, 0, 86, 64, 113, 89, 51, 38, 35, 116, 119, 16, 80, 95, 52, 121, 124, 122, 46, 61, 123, 28, 120, 62, 25, 93, 126, 115, 18, 94, 82, 53, 83, 114, 13, 125, 102, 50, 33, 127, 1, 44, 48, 19, 4, 98, 108, 59, 103, 3, 68, 7, 65, 27, 78, 67, 77, 74, 39, 56, 58, 54, 55, 2, 71, 66, 73, 60, 97, 14, 43, 9, 70, 21, 90, 92, 69, 10, 87, 107, 5, 75, 85, 29, 84, 31, 30, 76, 20, 23, 12, 112, 104, 11, 79, 26, 81, 8, 37, 17, 15, 40, 34, 91, 72, 101, 32, 36, 100, 88, 96, 6, 24], [49, 63, 105, 117, 111, 47, 45, 109, 42, 41, 22, 110, 106, 99, 57, 118, 86, 51, 113, 64, 0, 35, 89, 38, 52, 80, 95, 16, 120, 28, 119, 61, 94, 115, 93, 25, 124, 116, 125, 123, 53, 121, 122, 46, 82, 102, 18, 114, 62, 83, 39, 126, 19, 27, 58, 13, 55, 1, 59, 48, 92, 33, 103, 127, 50, 98, 43, 44, 77, 87, 108, 54, 56, 4, 14, 65, 74, 90, 67, 31, 78, 66, 97, 3, 71, 73, 7, 10, 85, 68, 112, 2, 60, 29, 21, 23, 69, 9, 30, 76, 11, 107, 84, 26, 20, 5, 70, 81, 75, 12, 34, 104, 17, 91, 37, 15, 40, 32, 36, 24, 101, 79, 8, 72, 100, 96, 88, 6], [49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 57, 86, 41, 106, 99, 22, 113, 118, 64, 0, 51, 89, 35, 95, 16, 115, 38, 119, 80, 122, 94, 28, 120, 116, 18, 62, 121, 93, 52, 124, 61, 123, 25, 53, 98, 83, 102, 125, 19, 82, 1, 27, 114, 3, 13, 48, 65, 50, 126, 67, 103, 39, 58, 59, 55, 127, 92, 54, 66, 68, 33, 77, 2, 71, 7, 74, 56, 46, 44, 14, 112, 78, 4, 108, 73, 10, 97, 60, 43, 5, 29, 20, 31, 9, 69, 90, 85, 30, 70, 87, 84, 23, 76, 107, 21, 26, 12, 75, 11, 15, 34, 81, 79, 91, 40, 17, 104, 101, 88, 8, 36, 72, 37, 32, 96, 6, 24, 100], [49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 41, 106, 57, 118, 99, 22, 113, 64, 86, 0, 51, 89, 122, 38, 123, 119, 115, 120, 35, 121, 116, 61, 16, 28, 94, 80, 58, 55, 95, 25, 62, 124, 102, 125, 52, 127, 93, 53, 82, 19, 46, 39, 114, 13, 126, 43, 59, 18, 44, 83, 1, 54, 65, 66, 98, 33, 50, 92, 56, 7, 77, 48, 3, 27, 112, 108, 71, 14, 67, 78, 9, 73, 4, 103, 10, 31, 2, 23, 68, 29, 74, 60, 97, 107, 104, 5, 21, 69, 84, 30, 90, 87, 85, 12, 76, 20, 75, 11, 15, 26, 70, 91, 34, 72, 81, 6, 79, 36, 17, 40, 37, 8, 96, 88, 101, 32, 100, 24], [49, 63, 117, 105, 111, 47, 45, 109, 42, 41, 110, 106, 57, 99, 22, 118, 86, 64, 113, 51, 0, 89, 35, 119, 38, 61, 52, 16, 28, 94, 120, 124, 116, 122, 80, 121, 62, 115, 82, 48, 125, 55, 25, 95, 93, 102, 127, 123, 54, 53, 39, 58, 46, 44, 83, 98, 114, 27, 59, 19, 18, 65, 1, 13, 112, 126, 33, 14, 2, 50, 77, 74, 3, 108, 68, 103, 73, 92, 43, 97, 71, 4, 7, 66, 78, 31, 60, 9, 10, 21, 56, 67, 6, 5, 90, 75, 23, 87, 76, 84, 69, 11, 29, 107, 20, 30, 85, 12, 79, 104, 26, 17, 72, 81, 36, 34, 37, 15, 101, 8, 70, 88, 40, 91, 96, 24, 32, 100], [49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 57, 106, 22, 41, 99, 86, 118, 113, 0, 51, 89, 64, 35, 124, 61, 120, 62, 38, 119, 116, 16, 125, 93, 28, 121, 115, 94, 52, 80, 95, 122, 53, 25, 102, 18, 50, 44, 48, 83, 123, 82, 27, 98, 67, 114, 59, 1, 55, 54, 66, 13, 33, 77, 46, 108, 19, 103, 58, 4, 126, 112, 2, 71, 43, 14, 39, 3, 127, 7, 92, 65, 10, 97, 74, 9, 78, 69, 73, 60, 31, 68, 107, 56, 20, 87, 75, 23, 6, 84, 21, 90, 30, 29, 11, 5, 15, 76, 40, 26, 37, 104, 79, 12, 91, 34, 72, 81, 17, 85, 36, 24, 8, 32, 101, 96, 70, 100, 88], [49, 63, 117, 111, 105, 47, 45, 109, 42, 57, 41, 64, 110, 106, 22, 0, 99, 113, 118, 86, 51, 119, 124, 89, 62, 38, 80, 116, 35, 52, 121, 122, 95, 120, 16, 93, 115, 61, 28, 123, 46, 25, 102, 94, 48, 98, 127, 50, 1, 18, 58, 125, 83, 65, 53, 33, 2, 13, 59, 19, 3, 114, 77, 82, 54, 66, 4, 71, 27, 44, 112, 55, 103, 39, 67, 108, 7, 97, 56, 78, 126, 60, 92, 31, 6, 9, 68, 43, 14, 73, 10, 74, 107, 5, 29, 87, 30, 90, 69, 84, 75, 23, 72, 21, 20, 11, 85, 76, 104, 37, 26, 91, 79, 40, 12, 15, 17, 34, 81, 101, 36, 96, 8, 32, 88, 24, 70, 100], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 41, 106, 57, 99, 22, 118, 113, 64, 86, 51, 0, 52, 89, 35, 119, 121, 62, 120, 16, 38, 80, 95, 61, 28, 124, 93, 58, 25, 122, 94, 83, 115, 125, 116, 98, 48, 82, 123, 102, 39, 46, 55, 27, 53, 103, 18, 126, 127, 108, 114, 44, 13, 19, 77, 43, 54, 50, 33, 1, 59, 7, 65, 3, 71, 56, 4, 92, 31, 66, 78, 97, 10, 112, 29, 2, 67, 68, 9, 74, 69, 73, 87, 90, 14, 107, 5, 30, 76, 60, 84, 6, 20, 26, 23, 21, 11, 79, 75, 85, 40, 12, 104, 34, 91, 37, 72, 15, 17, 81, 101, 8, 70, 88, 32, 36, 96, 100, 24], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 41, 106, 57, 22, 99, 118, 0, 86, 51, 89, 113, 52, 35, 64, 16, 119, 62, 38, 28, 121, 95, 116, 115, 80, 122, 124, 82, 25, 120, 94, 61, 93, 125, 53, 58, 27, 46, 18, 98, 55, 126, 48, 50, 102, 83, 114, 19, 33, 39, 3, 77, 123, 71, 13, 108, 127, 54, 1, 103, 68, 44, 78, 74, 65, 59, 7, 97, 112, 67, 43, 66, 14, 4, 92, 60, 56, 10, 73, 69, 9, 21, 84, 76, 31, 2, 30, 11, 87, 107, 29, 5, 6, 85, 23, 90, 75, 20, 26, 12, 34, 72, 37, 79, 15, 40, 101, 8, 36, 104, 81, 17, 91, 70, 88, 32, 96, 24, 100], [49, 63, 117, 111, 105, 47, 45, 109, 42, 41, 106, 57, 118, 22, 110, 99, 86, 113, 64, 0, 89, 119, 51, 35, 124, 62, 52, 80, 28, 38, 61, 16, 121, 116, 120, 115, 95, 93, 122, 82, 53, 25, 94, 83, 125, 18, 54, 114, 48, 27, 98, 77, 46, 55, 127, 108, 102, 59, 58, 39, 50, 65, 126, 13, 123, 44, 66, 19, 1, 43, 74, 92, 33, 78, 2, 3, 67, 103, 97, 71, 68, 7, 4, 112, 73, 14, 31, 84, 21, 10, 87, 60, 20, 9, 29, 23, 11, 90, 5, 75, 69, 107, 56, 30, 76, 26, 40, 12, 34, 70, 85, 6, 104, 91, 15, 37, 101, 81, 79, 8, 72, 17, 32, 96, 88, 24, 100, 36], [49, 63, 117, 105, 111, 47, 45, 109, 42, 41, 99, 110, 57, 118, 22, 106, 86, 0, 120, 113, 64, 89, 51, 35, 119, 124, 62, 61, 52, 95, 16, 38, 80, 28, 115, 94, 116, 93, 122, 126, 121, 25, 102, 123, 58, 48, 83, 18, 53, 125, 114, 82, 55, 27, 127, 54, 98, 46, 33, 65, 59, 44, 77, 13, 103, 108, 2, 39, 19, 97, 1, 60, 92, 10, 43, 7, 50, 112, 67, 3, 71, 74, 78, 87, 31, 14, 4, 73, 68, 90, 66, 69, 56, 29, 9, 107, 11, 21, 30, 20, 23, 5, 85, 70, 76, 40, 84, 12, 26, 37, 75, 91, 17, 104, 15, 81, 36, 34, 8, 101, 100, 72, 79, 88, 32, 96, 6, 24], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 118, 41, 22, 99, 57, 106, 86, 64, 0, 89, 51, 113, 35, 119, 38, 52, 124, 61, 120, 116, 80, 115, 16, 95, 126, 25, 102, 121, 93, 28, 122, 82, 114, 53, 62, 94, 54, 125, 83, 108, 58, 1, 123, 18, 39, 103, 19, 127, 27, 44, 50, 3, 55, 33, 66, 46, 67, 7, 92, 59, 13, 77, 65, 48, 43, 98, 4, 74, 112, 107, 97, 56, 68, 87, 78, 71, 9, 60, 90, 31, 10, 5, 14, 73, 2, 29, 26, 84, 85, 69, 20, 21, 11, 30, 70, 23, 12, 37, 17, 76, 75, 15, 34, 81, 104, 91, 40, 36, 79, 72, 101, 32, 8, 88, 96, 100, 24, 6], [49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 118, 57, 41, 106, 22, 99, 64, 86, 113, 0, 51, 89, 119, 124, 52, 61, 120, 38, 62, 115, 122, 28, 35, 95, 16, 116, 82, 94, 80, 53, 25, 121, 93, 58, 98, 18, 114, 46, 19, 44, 126, 83, 125, 33, 123, 77, 1, 13, 127, 27, 59, 54, 55, 102, 9, 108, 39, 50, 3, 60, 7, 65, 78, 68, 71, 43, 4, 92, 66, 48, 10, 67, 73, 2, 14, 103, 31, 56, 97, 74, 5, 84, 70, 112, 87, 29, 23, 21, 69, 90, 107, 11, 15, 30, 85, 20, 76, 75, 79, 26, 37, 8, 12, 104, 17, 40, 101, 91, 34, 72, 36, 81, 32, 88, 6, 24, 96, 100], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 41, 57, 106, 99, 118, 22, 64, 0, 113, 86, 51, 119, 35, 89, 61, 38, 16, 120, 116, 46, 52, 80, 124, 53, 115, 25, 95, 94, 122, 28, 121, 62, 18, 125, 55, 126, 93, 114, 82, 44, 58, 65, 33, 77, 13, 108, 54, 98, 4, 48, 102, 27, 59, 1, 19, 83, 127, 60, 39, 123, 2, 78, 103, 43, 9, 56, 50, 92, 73, 67, 14, 71, 68, 70, 3, 66, 74, 7, 10, 112, 97, 29, 11, 87, 31, 5, 69, 84, 90, 20, 21, 107, 30, 75, 26, 76, 85, 79, 23, 8, 40, 34, 17, 91, 104, 15, 12, 81, 6, 36, 72, 101, 37, 100, 32, 88, 96, 24], [49, 63, 117, 111, 105, 47, 45, 109, 42, 41, 57, 110, 22, 99, 106, 86, 118, 51, 64, 113, 89, 0, 80, 35, 16, 120, 38, 119, 61, 95, 122, 52, 115, 124, 94, 25, 93, 121, 116, 62, 46, 28, 125, 82, 18, 126, 53, 27, 102, 58, 83, 44, 114, 103, 65, 13, 55, 54, 77, 108, 98, 127, 19, 123, 50, 39, 33, 3, 48, 59, 60, 7, 92, 10, 71, 67, 68, 4, 1, 56, 90, 9, 43, 78, 74, 14, 66, 87, 69, 97, 31, 112, 21, 29, 73, 84, 2, 23, 85, 107, 5, 11, 20, 70, 30, 26, 79, 34, 12, 37, 6, 15, 40, 76, 81, 75, 104, 91, 72, 17, 8, 101, 88, 36, 24, 96, 32, 100], [49, 63, 117, 111, 105, 47, 45, 109, 42, 110, 57, 99, 41, 22, 0, 118, 106, 86, 64, 51, 113, 89, 35, 80, 119, 62, 52, 38, 124, 28, 16, 120, 95, 116, 121, 115, 61, 93, 122, 53, 18, 123, 94, 46, 25, 83, 98, 125, 77, 102, 48, 58, 54, 108, 82, 4, 55, 126, 1, 44, 27, 7, 59, 114, 127, 68, 39, 66, 103, 65, 50, 92, 71, 43, 10, 13, 33, 73, 2, 3, 74, 112, 90, 9, 67, 19, 60, 14, 78, 11, 87, 20, 29, 31, 97, 12, 21, 56, 6, 107, 5, 104, 69, 84, 40, 30, 75, 23, 85, 34, 76, 79, 26, 91, 70, 8, 37, 101, 81, 72, 17, 24, 88, 15, 32, 100, 36, 96], [49, 63, 117, 105, 111, 47, 45, 109, 42, 110, 41, 57, 22, 118, 106, 99, 86, 113, 51, 35, 119, 89, 64, 0, 61, 115, 38, 16, 52, 80, 62, 28, 53, 120, 95, 124, 121, 116, 122, 94, 25, 125, 93, 77, 114, 18, 55, 82, 102, 44, 27, 59, 98, 83, 48, 50, 126, 33, 19, 54, 123, 13, 7, 58, 65, 108, 10, 46, 3, 39, 127, 97, 71, 92, 103, 112, 1, 68, 67, 43, 78, 74, 14, 31, 73, 2, 87, 66, 4, 5, 90, 60, 11, 21, 85, 56, 23, 84, 107, 6, 26, 20, 12, 9, 69, 75, 29, 76, 34, 30, 104, 40, 79, 81, 15, 91, 17, 72, 8, 101, 37, 36, 32, 24, 96, 88, 100, 70]], "model.layers.30.self_attn.q_proj": [[61, 38, 48, 50, 97, 49, 111, 123, 83, 64, 89, 1, 127, 108, 79, 41, 86, 28, 94, 114, 14, 116, 102, 106, 17, 119, 67, 57, 54, 4, 40, 13, 59, 65, 85, 73, 109, 39, 52, 37, 58, 126, 12, 68, 60, 92, 84, 45, 120, 11, 125, 76, 8, 72, 44, 55, 87, 105, 80, 121, 43, 10, 2, 62, 110, 63, 74, 46, 118, 51, 81, 42, 101, 23, 3, 24, 9, 56, 71, 6, 112, 124, 5, 31, 15, 53, 35, 107, 0, 34, 117, 115, 21, 18, 88, 36, 113, 100, 7, 27, 47, 104, 16, 69, 93, 26, 90, 19, 95, 77, 103, 122, 20, 82, 32, 96, 66, 91, 99, 30, 29, 75, 22, 98, 33, 70, 78, 25], [61, 38, 50, 48, 97, 89, 112, 83, 55, 108, 17, 11, 60, 114, 72, 57, 85, 70, 28, 14, 111, 15, 62, 101, 3, 92, 123, 66, 25, 127, 34, 6, 126, 113, 36, 106, 86, 59, 52, 87, 69, 124, 39, 102, 45, 58, 0, 120, 94, 125, 63, 37, 29, 118, 104, 35, 43, 109, 47, 115, 10, 64, 119, 121, 56, 122, 51, 49, 40, 22, 27, 95, 53, 4, 46, 116, 90, 103, 21, 24, 32, 107, 81, 110, 23, 117, 26, 44, 19, 99, 16, 76, 42, 105, 41, 98, 88, 77, 30, 67, 20, 54, 73, 31, 93, 100, 65, 96, 18, 8, 84, 82, 12, 5, 80, 1, 78, 2, 91, 75, 7, 33, 71, 79, 13, 68, 74, 9], [50, 61, 38, 111, 48, 55, 97, 89, 106, 45, 86, 127, 60, 41, 125, 123, 124, 120, 17, 85, 102, 108, 70, 121, 62, 112, 92, 59, 52, 58, 51, 43, 119, 114, 49, 126, 66, 87, 11, 57, 83, 14, 53, 46, 39, 28, 40, 101, 115, 113, 104, 118, 117, 47, 56, 116, 122, 105, 63, 54, 44, 110, 103, 91, 3, 25, 72, 107, 109, 32, 36, 42, 0, 64, 34, 15, 6, 100, 30, 35, 94, 9, 33, 22, 37, 81, 90, 10, 31, 21, 95, 27, 96, 13, 12, 24, 99, 88, 26, 78, 84, 93, 19, 69, 67, 4, 98, 8, 16, 23, 18, 76, 29, 82, 77, 65, 73, 20, 80, 2, 7, 5, 79, 75, 1, 71, 68, 74], [61, 50, 111, 38, 112, 55, 60, 114, 124, 48, 110, 125, 97, 123, 109, 92, 53, 127, 89, 58, 52, 51, 62, 121, 44, 28, 118, 59, 41, 108, 119, 120, 45, 102, 46, 63, 126, 122, 116, 47, 104, 113, 56, 117, 54, 43, 101, 105, 57, 107, 37, 42, 29, 115, 88, 86, 106, 103, 40, 80, 100, 39, 34, 91, 49, 35, 30, 84, 36, 85, 83, 93, 25, 98, 32, 95, 99, 24, 16, 94, 17, 31, 23, 18, 22, 15, 69, 96, 21, 76, 82, 11, 90, 33, 87, 77, 20, 27, 81, 26, 4, 13, 19, 12, 68, 7, 73, 2, 71, 9, 5, 79, 75, 14, 10, 72, 74, 66, 70, 6, 0, 78, 1, 64, 67, 8, 65, 3], [46, 110, 34, 92, 119, 86, 24, 90, 83, 113, 61, 49, 54, 95, 82, 94, 31, 38, 70, 85, 74, 0, 121, 13, 7, 120, 127, 114, 12, 8, 101, 9, 59, 29, 79, 17, 47, 112, 23, 1, 48, 98, 53, 80, 51, 52, 4, 63, 2, 109, 67, 97, 55, 26, 42, 125, 58, 116, 36, 117, 60, 40, 41, 126, 108, 18, 62, 56, 124, 111, 122, 43, 115, 123, 73, 57, 105, 87, 118, 91, 81, 45, 25, 102, 28, 11, 19, 39, 37, 106, 64, 107, 100, 44, 104, 66, 93, 16, 27, 103, 20, 33, 88, 50, 96, 32, 68, 89, 35, 22, 84, 14, 5, 99, 21, 69, 30, 65, 78, 3, 10, 15, 6, 77, 71, 75, 76, 72], [46, 110, 34, 90, 86, 92, 24, 79, 18, 113, 31, 95, 83, 17, 119, 9, 70, 54, 4, 85, 42, 74, 55, 56, 61, 12, 120, 59, 58, 107, 98, 117, 7, 63, 67, 115, 13, 44, 50, 49, 116, 88, 114, 38, 16, 36, 40, 96, 29, 73, 8, 94, 91, 20, 48, 111, 52, 23, 97, 118, 37, 81, 2, 0, 109, 100, 57, 122, 125, 43, 45, 1, 51, 39, 127, 103, 93, 104, 47, 41, 27, 106, 5, 123, 33, 126, 101, 112, 124, 80, 26, 62, 28, 14, 102, 15, 32, 60, 87, 77, 108, 11, 78, 75, 105, 99, 84, 53, 76, 25, 69, 121, 35, 89, 30, 82, 21, 19, 72, 10, 68, 64, 6, 22, 66, 3, 65, 71], [46, 34, 110, 113, 120, 92, 24, 83, 90, 42, 31, 86, 17, 94, 8, 79, 14, 54, 118, 112, 117, 15, 52, 9, 66, 100, 45, 85, 36, 55, 38, 70, 13, 76, 84, 58, 11, 60, 10, 51, 98, 3, 56, 4, 63, 0, 29, 95, 49, 65, 32, 12, 48, 127, 119, 124, 122, 39, 59, 116, 47, 104, 41, 23, 43, 53, 126, 19, 44, 107, 61, 75, 7, 62, 1, 111, 67, 78, 106, 22, 114, 68, 109, 105, 35, 108, 18, 80, 82, 37, 123, 115, 96, 91, 102, 88, 97, 28, 93, 26, 6, 103, 50, 33, 40, 27, 121, 69, 87, 5, 20, 25, 57, 71, 101, 81, 30, 2, 89, 21, 125, 16, 77, 74, 73, 99, 72, 64], [46, 110, 34, 92, 90, 24, 86, 49, 83, 69, 13, 113, 114, 95, 61, 119, 31, 23, 85, 56, 94, 53, 80, 100, 17, 98, 54, 108, 9, 125, 57, 52, 103, 79, 27, 115, 74, 63, 50, 11, 18, 126, 111, 7, 41, 99, 122, 25, 28, 120, 1, 67, 55, 88, 29, 4, 26, 87, 62, 101, 45, 39, 33, 117, 104, 36, 43, 60, 102, 58, 38, 116, 118, 127, 59, 40, 51, 96, 44, 124, 123, 109, 73, 42, 121, 15, 70, 107, 5, 97, 106, 30, 112, 91, 78, 48, 47, 37, 89, 35, 32, 76, 105, 12, 0, 20, 21, 81, 75, 93, 16, 71, 19, 14, 8, 84, 64, 68, 10, 82, 72, 2, 65, 3, 22, 77, 6, 66], [42, 63, 117, 97, 27, 86, 106, 89, 16, 78, 31, 84, 115, 52, 11, 99, 81, 76, 53, 119, 96, 56, 49, 13, 10, 88, 100, 26, 73, 92, 121, 55, 118, 7, 24, 51, 50, 61, 114, 57, 8, 102, 127, 90, 20, 122, 41, 23, 46, 19, 116, 60, 28, 95, 82, 38, 6, 25, 36, 58, 94, 33, 120, 91, 62, 22, 35, 21, 54, 110, 83, 48, 9, 59, 15, 18, 66, 5, 125, 111, 126, 109, 17, 124, 72, 47, 39, 103, 112, 107, 108, 85, 113, 44, 123, 87, 40, 93, 14, 69, 80, 29, 45, 4, 77, 70, 30, 105, 43, 32, 74, 34, 101, 104, 98, 3, 12, 79, 37, 75, 68, 0, 71, 65, 67, 2, 1, 64], [42, 63, 117, 97, 35, 16, 56, 84, 27, 83, 86, 102, 101, 94, 92, 18, 19, 57, 11, 64, 106, 121, 49, 116, 93, 52, 100, 4, 37, 103, 24, 29, 98, 81, 72, 85, 65, 77, 99, 127, 9, 50, 76, 88, 70, 54, 55, 122, 61, 31, 73, 32, 96, 51, 119, 118, 58, 120, 68, 115, 78, 79, 95, 114, 82, 26, 74, 90, 109, 111, 28, 20, 125, 124, 23, 25, 41, 60, 7, 59, 48, 34, 3, 10, 80, 47, 46, 43, 36, 113, 67, 53, 5, 108, 126, 66, 1, 22, 44, 110, 105, 89, 15, 39, 21, 112, 38, 12, 30, 91, 8, 45, 62, 40, 104, 33, 123, 14, 6, 69, 13, 0, 107, 2, 17, 87, 71, 75], [42, 63, 11, 81, 78, 7, 97, 117, 84, 1, 86, 106, 27, 102, 94, 4, 55, 0, 65, 114, 6, 116, 98, 16, 9, 21, 13, 2, 100, 66, 58, 50, 92, 119, 89, 14, 30, 19, 101, 115, 93, 8, 64, 67, 52, 35, 18, 77, 76, 71, 82, 73, 75, 88, 31, 108, 12, 38, 70, 122, 103, 68, 72, 36, 20, 83, 10, 3, 91, 17, 23, 74, 80, 24, 79, 15, 32, 26, 5, 22, 29, 28, 90, 85, 120, 46, 96, 53, 39, 61, 69, 25, 34, 99, 49, 95, 118, 59, 87, 104, 121, 123, 57, 111, 47, 109, 126, 56, 37, 54, 124, 62, 45, 110, 125, 127, 48, 33, 60, 51, 105, 44, 41, 40, 43, 113, 107, 112], [63, 42, 55, 57, 49, 56, 51, 121, 112, 52, 122, 54, 116, 118, 127, 61, 120, 50, 48, 111, 59, 125, 47, 124, 108, 113, 115, 126, 60, 117, 114, 110, 97, 62, 44, 53, 85, 58, 123, 94, 46, 109, 38, 45, 24, 43, 86, 107, 41, 119, 27, 33, 102, 29, 106, 40, 39, 100, 28, 92, 104, 105, 18, 36, 26, 103, 22, 31, 90, 99, 101, 35, 88, 91, 21, 81, 37, 96, 84, 77, 30, 87, 93, 32, 15, 98, 95, 34, 20, 17, 82, 83, 19, 79, 25, 23, 80, 13, 16, 89, 12, 73, 75, 76, 11, 70, 78, 68, 74, 1, 10, 14, 0, 4, 8, 5, 67, 72, 9, 7, 6, 69, 71, 65, 66, 64, 2, 3], [111, 44, 61, 125, 116, 84, 108, 20, 124, 97, 52, 75, 100, 60, 11, 120, 117, 48, 115, 46, 47, 106, 56, 50, 49, 123, 121, 119, 28, 112, 87, 70, 63, 62, 55, 53, 59, 122, 127, 118, 110, 57, 113, 42, 90, 126, 114, 39, 51, 92, 45, 88, 54, 58, 93, 107, 109, 105, 32, 41, 98, 104, 91, 40, 79, 102, 25, 43, 15, 17, 86, 30, 37, 103, 27, 38, 31, 26, 101, 19, 35, 81, 33, 34, 36, 94, 99, 74, 23, 95, 83, 96, 22, 29, 89, 10, 2, 82, 21, 6, 16, 13, 85, 66, 24, 72, 64, 67, 18, 0, 76, 14, 65, 77, 68, 80, 5, 71, 9, 78, 4, 1, 12, 8, 73, 3, 7, 69], [44, 111, 116, 88, 18, 97, 61, 16, 78, 9, 115, 90, 51, 108, 89, 30, 109, 100, 71, 47, 4, 125, 102, 85, 26, 76, 2, 20, 120, 83, 10, 86, 79, 80, 73, 56, 68, 92, 23, 82, 94, 93, 112, 27, 55, 95, 11, 58, 66, 6, 40, 64, 118, 14, 87, 70, 52, 63, 53, 42, 127, 122, 126, 65, 7, 124, 67, 41, 50, 49, 12, 19, 24, 119, 91, 113, 28, 29, 74, 117, 81, 101, 59, 25, 1, 114, 103, 13, 38, 43, 21, 31, 17, 32, 107, 57, 123, 48, 96, 98, 0, 121, 72, 62, 34, 60, 84, 105, 106, 77, 46, 22, 8, 35, 54, 104, 5, 3, 45, 39, 36, 99, 33, 110, 37, 15, 75, 69], [44, 61, 108, 125, 97, 106, 115, 85, 88, 100, 116, 47, 51, 49, 120, 117, 126, 46, 124, 52, 87, 28, 93, 112, 59, 111, 50, 48, 119, 55, 17, 122, 92, 76, 57, 127, 62, 90, 123, 113, 118, 114, 56, 63, 60, 121, 21, 26, 53, 32, 54, 107, 58, 105, 45, 91, 40, 30, 42, 12, 104, 109, 86, 41, 39, 79, 27, 74, 110, 94, 103, 5, 34, 102, 43, 19, 23, 69, 38, 98, 83, 36, 37, 33, 31, 15, 101, 81, 95, 82, 99, 22, 35, 67, 72, 25, 89, 84, 96, 13, 2, 10, 29, 64, 18, 24, 16, 71, 20, 7, 77, 80, 3, 0, 66, 8, 78, 14, 65, 70, 4, 9, 11, 1, 68, 6, 73, 75], [44, 125, 111, 108, 61, 116, 115, 97, 51, 117, 87, 52, 122, 88, 106, 112, 49, 55, 126, 60, 100, 120, 93, 47, 59, 124, 58, 50, 113, 57, 48, 119, 118, 62, 127, 28, 46, 45, 92, 56, 63, 121, 114, 123, 90, 42, 17, 53, 74, 39, 41, 107, 104, 109, 19, 54, 86, 32, 79, 30, 91, 102, 40, 105, 13, 98, 110, 103, 26, 94, 15, 43, 95, 38, 36, 83, 35, 25, 101, 27, 37, 34, 89, 81, 96, 22, 23, 31, 33, 99, 29, 77, 67, 21, 85, 72, 84, 82, 10, 16, 2, 64, 24, 0, 9, 18, 70, 20, 80, 65, 8, 76, 5, 71, 78, 3, 14, 66, 12, 7, 11, 69, 75, 1, 73, 4, 68, 6], [113, 103, 122, 124, 52, 33, 55, 93, 119, 107, 47, 24, 42, 26, 48, 31, 112, 43, 54, 104, 90, 117, 123, 56, 45, 85, 41, 57, 108, 118, 116, 53, 87, 83, 115, 121, 37, 62, 120, 63, 35, 61, 99, 22, 29, 125, 114, 91, 59, 23, 39, 40, 21, 28, 60, 105, 49, 44, 102, 111, 101, 98, 51, 92, 127, 126, 106, 110, 58, 32, 36, 50, 38, 88, 46, 34, 97, 95, 100, 27, 11, 109, 96, 84, 30, 89, 18, 94, 82, 8, 10, 16, 72, 19, 14, 20, 80, 13, 71, 25, 75, 86, 77, 74, 81, 15, 78, 70, 5, 7, 68, 65, 2, 76, 6, 69, 0, 66, 64, 79, 12, 67, 17, 4, 3, 73, 1, 9], [122, 103, 33, 52, 93, 31, 117, 119, 91, 83, 22, 24, 54, 112, 111, 124, 90, 85, 26, 60, 105, 43, 27, 125, 42, 107, 45, 39, 97, 127, 29, 113, 20, 114, 96, 123, 73, 92, 118, 53, 78, 46, 61, 120, 110, 44, 58, 86, 116, 40, 115, 102, 48, 51, 21, 49, 47, 38, 62, 50, 55, 109, 101, 108, 25, 121, 126, 95, 35, 81, 56, 99, 37, 36, 63, 19, 30, 41, 28, 15, 57, 13, 34, 104, 89, 84, 87, 94, 32, 100, 106, 9, 82, 80, 23, 75, 98, 18, 14, 16, 59, 72, 88, 77, 5, 6, 64, 70, 67, 8, 1, 11, 0, 69, 10, 65, 68, 2, 17, 71, 4, 3, 79, 66, 7, 76, 12, 74], [57, 122, 103, 52, 113, 110, 105, 112, 22, 33, 54, 38, 101, 59, 116, 125, 43, 111, 60, 26, 109, 118, 50, 48, 97, 55, 3, 34, 39, 47, 126, 92, 78, 45, 127, 24, 40, 124, 31, 114, 61, 64, 120, 90, 75, 65, 119, 49, 123, 108, 83, 115, 18, 67, 20, 68, 71, 99, 36, 107, 106, 104, 44, 62, 63, 46, 100, 53, 96, 102, 25, 41, 42, 117, 19, 95, 51, 58, 74, 84, 121, 93, 56, 37, 21, 89, 87, 27, 94, 30, 32, 13, 35, 73, 86, 88, 82, 85, 98, 72, 28, 91, 10, 14, 23, 4, 77, 29, 9, 6, 8, 2, 1, 7, 0, 69, 66, 80, 70, 11, 16, 12, 15, 5, 79, 81, 17, 76], [103, 113, 93, 122, 24, 33, 15, 81, 76, 29, 83, 17, 57, 85, 10, 70, 104, 88, 21, 36, 34, 48, 19, 7, 22, 84, 97, 26, 66, 107, 5, 35, 12, 27, 124, 87, 42, 54, 30, 38, 55, 90, 23, 43, 16, 82, 74, 102, 75, 8, 20, 121, 63, 86, 49, 119, 94, 77, 45, 91, 68, 127, 14, 71, 62, 117, 73, 0, 79, 61, 13, 25, 32, 28, 2, 106, 80, 111, 11, 69, 116, 18, 47, 98, 59, 52, 31, 58, 89, 3, 72, 67, 92, 4, 78, 37, 96, 40, 99, 123, 100, 95, 118, 110, 39, 120, 112, 6, 46, 101, 65, 9, 64, 105, 60, 53, 126, 108, 44, 125, 114, 1, 41, 50, 115, 51, 56, 109], [56, 39, 60, 53, 117, 122, 94, 127, 109, 123, 116, 92, 58, 50, 124, 61, 103, 105, 59, 51, 55, 52, 113, 33, 111, 90, 114, 119, 115, 62, 97, 110, 54, 42, 121, 63, 47, 96, 57, 98, 34, 43, 95, 25, 36, 44, 45, 26, 108, 46, 112, 19, 126, 28, 49, 91, 85, 125, 30, 107, 118, 40, 38, 120, 102, 37, 106, 104, 88, 48, 41, 100, 23, 35, 101, 99, 27, 31, 29, 81, 20, 32, 83, 86, 87, 84, 89, 22, 93, 24, 76, 21, 78, 14, 17, 15, 75, 16, 66, 18, 79, 71, 74, 9, 0, 5, 4, 12, 67, 11, 80, 65, 82, 73, 7, 69, 70, 1, 2, 72, 77, 8, 68, 64, 3, 10, 6, 13], [53, 39, 60, 123, 61, 124, 55, 56, 50, 109, 51, 116, 54, 127, 120, 57, 122, 63, 97, 115, 47, 52, 59, 121, 90, 112, 119, 26, 58, 113, 117, 62, 114, 108, 126, 28, 125, 44, 49, 118, 107, 43, 94, 48, 110, 46, 19, 45, 105, 106, 111, 23, 38, 42, 41, 103, 101, 29, 100, 40, 85, 33, 104, 87, 102, 34, 25, 78, 35, 99, 22, 92, 95, 37, 36, 86, 30, 96, 98, 31, 91, 32, 83, 93, 21, 24, 17, 18, 27, 81, 20, 15, 14, 84, 75, 88, 89, 11, 79, 72, 76, 82, 16, 80, 12, 5, 77, 9, 74, 70, 66, 13, 67, 7, 6, 71, 8, 2, 69, 3, 10, 4, 65, 73, 0, 68, 1, 64], [39, 53, 56, 25, 18, 77, 16, 23, 74, 71, 4, 60, 97, 70, 64, 9, 0, 91, 52, 29, 31, 76, 34, 92, 67, 65, 122, 1, 100, 117, 30, 36, 15, 84, 13, 124, 40, 68, 2, 12, 82, 125, 99, 20, 10, 5, 17, 69, 98, 8, 94, 55, 6, 104, 79, 19, 81, 73, 14, 85, 42, 61, 88, 32, 7, 50, 86, 11, 90, 26, 3, 72, 24, 66, 21, 83, 75, 27, 80, 37, 107, 93, 38, 118, 87, 22, 114, 89, 113, 35, 78, 28, 41, 95, 63, 46, 126, 109, 96, 127, 51, 101, 58, 119, 33, 116, 123, 120, 59, 105, 45, 108, 102, 47, 112, 106, 54, 115, 62, 43, 57, 121, 44, 111, 110, 49, 48, 103], [39, 56, 53, 25, 18, 16, 23, 34, 97, 30, 83, 77, 52, 60, 122, 76, 29, 100, 74, 86, 93, 124, 61, 11, 94, 104, 10, 9, 71, 15, 22, 21, 13, 36, 55, 127, 90, 20, 26, 108, 91, 27, 119, 19, 80, 89, 98, 35, 40, 117, 79, 88, 92, 82, 59, 70, 107, 31, 12, 65, 51, 85, 84, 28, 41, 17, 4, 78, 102, 14, 7, 64, 87, 67, 75, 114, 72, 99, 8, 69, 81, 0, 37, 46, 32, 57, 24, 1, 112, 2, 125, 96, 73, 95, 54, 58, 115, 33, 126, 50, 118, 3, 101, 113, 38, 116, 42, 105, 6, 111, 63, 45, 120, 62, 44, 123, 49, 106, 68, 66, 103, 48, 110, 109, 5, 47, 43, 121], [123, 41, 34, 120, 115, 107, 58, 60, 12, 48, 51, 90, 127, 125, 82, 70, 23, 109, 73, 116, 32, 47, 77, 121, 62, 29, 106, 84, 126, 14, 59, 55, 86, 24, 108, 80, 46, 26, 45, 57, 63, 52, 54, 122, 37, 110, 4, 36, 92, 38, 50, 61, 105, 56, 117, 79, 113, 111, 112, 119, 104, 103, 118, 42, 69, 124, 15, 43, 94, 49, 53, 67, 66, 101, 85, 44, 93, 99, 114, 7, 89, 33, 88, 39, 97, 1, 95, 31, 28, 30, 91, 100, 35, 40, 64, 102, 21, 22, 74, 10, 13, 19, 83, 78, 8, 98, 9, 20, 76, 6, 16, 18, 87, 17, 25, 96, 27, 2, 81, 0, 71, 65, 11, 75, 5, 3, 72, 68], [41, 123, 34, 107, 65, 66, 0, 29, 90, 121, 14, 37, 1, 74, 2, 11, 59, 93, 70, 80, 124, 72, 81, 69, 68, 105, 26, 47, 4, 10, 60, 75, 58, 12, 84, 3, 24, 86, 32, 109, 17, 82, 103, 7, 115, 77, 111, 127, 125, 35, 13, 21, 5, 15, 120, 73, 46, 110, 51, 71, 45, 8, 67, 94, 52, 97, 126, 118, 55, 87, 48, 56, 106, 116, 42, 62, 61, 57, 108, 101, 63, 112, 38, 33, 98, 28, 50, 122, 31, 44, 117, 113, 43, 53, 6, 18, 64, 79, 100, 119, 88, 92, 104, 54, 114, 49, 16, 99, 30, 27, 85, 20, 39, 83, 78, 91, 36, 76, 40, 95, 19, 25, 89, 9, 96, 23, 102, 22], [123, 41, 109, 34, 59, 29, 115, 90, 58, 23, 127, 61, 120, 51, 125, 43, 126, 111, 103, 108, 46, 54, 113, 122, 47, 121, 57, 110, 116, 56, 86, 107, 48, 50, 124, 52, 118, 63, 38, 117, 12, 55, 112, 80, 53, 119, 84, 42, 89, 93, 60, 21, 62, 49, 45, 114, 102, 106, 104, 99, 70, 44, 101, 26, 77, 6, 7, 4, 82, 25, 14, 36, 39, 32, 40, 37, 31, 94, 88, 35, 65, 33, 100, 97, 73, 92, 10, 105, 22, 74, 3, 27, 83, 96, 87, 0, 67, 20, 30, 19, 17, 91, 15, 64, 85, 98, 95, 28, 24, 72, 66, 79, 68, 81, 13, 18, 75, 11, 69, 9, 71, 78, 16, 76, 1, 8, 2, 5], [123, 41, 109, 34, 107, 59, 58, 121, 23, 47, 60, 127, 29, 115, 84, 55, 90, 120, 125, 106, 56, 104, 43, 112, 12, 80, 126, 88, 61, 46, 50, 62, 103, 53, 122, 48, 116, 124, 57, 51, 32, 54, 45, 52, 111, 110, 63, 93, 113, 26, 38, 73, 114, 117, 42, 118, 82, 99, 108, 36, 101, 119, 86, 14, 37, 35, 89, 49, 24, 102, 105, 70, 100, 74, 17, 77, 65, 25, 44, 39, 21, 85, 97, 83, 31, 4, 94, 40, 67, 95, 7, 3, 92, 30, 76, 91, 79, 33, 0, 16, 9, 19, 28, 98, 20, 27, 96, 18, 22, 6, 87, 11, 78, 72, 15, 69, 81, 64, 13, 75, 66, 1, 71, 10, 68, 8, 5, 2], [127, 126, 63, 38, 120, 82, 20, 97, 91, 80, 9, 11, 13, 21, 56, 25, 69, 14, 30, 15, 29, 24, 90, 26, 75, 8, 94, 86, 102, 99, 124, 2, 88, 37, 93, 39, 95, 76, 103, 84, 78, 3, 7, 74, 43, 98, 71, 46, 68, 114, 17, 54, 121, 18, 47, 67, 125, 108, 19, 105, 0, 101, 85, 119, 58, 77, 44, 5, 49, 10, 55, 111, 42, 34, 51, 89, 96, 83, 31, 27, 62, 92, 107, 123, 52, 57, 35, 53, 48, 28, 110, 61, 87, 6, 72, 12, 60, 116, 16, 33, 64, 32, 59, 22, 118, 73, 70, 23, 66, 115, 79, 122, 100, 117, 112, 50, 41, 109, 36, 113, 104, 65, 81, 4, 40, 45, 106, 1], [63, 38, 126, 123, 119, 54, 56, 120, 62, 110, 47, 124, 61, 59, 125, 118, 60, 122, 51, 121, 115, 117, 111, 58, 48, 53, 50, 46, 49, 55, 23, 127, 116, 102, 109, 112, 97, 113, 114, 57, 52, 44, 87, 29, 45, 81, 93, 43, 26, 105, 92, 86, 108, 94, 88, 107, 41, 106, 17, 42, 104, 99, 103, 22, 40, 33, 21, 90, 39, 84, 101, 37, 95, 15, 83, 18, 35, 98, 79, 30, 74, 36, 31, 96, 100, 89, 70, 34, 27, 25, 28, 76, 0, 32, 80, 20, 6, 85, 19, 91, 65, 10, 24, 82, 78, 16, 12, 13, 1, 75, 14, 8, 67, 66, 64, 4, 69, 73, 68, 3, 9, 11, 7, 71, 2, 5, 77, 72], [126, 127, 63, 38, 123, 54, 119, 56, 62, 110, 125, 120, 61, 47, 60, 124, 59, 118, 46, 122, 49, 121, 51, 111, 115, 53, 55, 48, 58, 50, 116, 117, 113, 112, 97, 52, 44, 23, 114, 102, 29, 57, 43, 109, 45, 105, 93, 81, 86, 94, 26, 108, 104, 92, 88, 87, 107, 41, 42, 40, 99, 90, 103, 17, 22, 106, 33, 39, 31, 37, 74, 98, 101, 30, 15, 21, 95, 83, 36, 100, 35, 70, 96, 6, 34, 84, 27, 32, 20, 28, 79, 18, 25, 89, 0, 65, 16, 76, 19, 85, 80, 14, 91, 12, 82, 10, 24, 67, 78, 1, 4, 13, 64, 8, 75, 66, 68, 9, 3, 7, 73, 77, 69, 72, 5, 71, 2, 11], [126, 63, 38, 127, 16, 120, 20, 88, 78, 97, 82, 91, 75, 56, 30, 47, 73, 99, 21, 29, 54, 32, 13, 125, 39, 7, 15, 86, 119, 95, 62, 5, 90, 124, 25, 10, 8, 66, 80, 51, 69, 76, 0, 123, 9, 67, 84, 102, 111, 24, 77, 49, 58, 98, 53, 103, 109, 43, 114, 28, 26, 61, 44, 27, 79, 60, 50, 34, 18, 104, 14, 117, 118, 71, 59, 107, 19, 115, 70, 122, 57, 65, 121, 48, 17, 12, 85, 87, 89, 31, 113, 55, 83, 101, 112, 92, 100, 72, 52, 116, 23, 4, 110, 94, 22, 74, 35, 11, 46, 6, 45, 96, 33, 37, 105, 41, 108, 93, 68, 106, 40, 36, 81, 3, 1, 42, 2, 64]], "model.layers.30.self_attn.k_proj": [[61, 102, 50, 33, 48, 89, 86, 112, 92, 85, 14, 83, 60, 0, 58, 52, 113, 2, 108, 94, 11, 17, 114, 40, 120, 121, 47, 62, 65, 59, 55, 45, 123, 126, 72, 57, 53, 43, 116, 127, 56, 119, 124, 125, 63, 118, 51, 69, 98, 42, 38, 122, 117, 67, 46, 115, 44, 77, 49, 109, 41, 106, 80, 110, 54, 68, 107, 103, 100, 104, 29, 37, 105, 101, 9, 39, 10, 87, 91, 7, 6, 35, 64, 15, 31, 99, 111, 12, 36, 71, 4, 5, 84, 93, 70, 30, 90, 96, 32, 88, 27, 95, 23, 26, 34, 73, 82, 18, 28, 13, 66, 22, 76, 24, 16, 20, 21, 3, 74, 75, 1, 78, 25, 8, 19, 79, 97, 81], [110, 46, 98, 90, 0, 92, 24, 13, 31, 67, 18, 86, 2, 70, 113, 74, 1, 15, 83, 52, 17, 8, 4, 12, 16, 80, 51, 105, 9, 49, 84, 59, 21, 79, 41, 123, 116, 11, 64, 68, 94, 85, 7, 23, 54, 127, 119, 109, 29, 96, 124, 58, 115, 22, 122, 34, 100, 97, 101, 30, 38, 48, 14, 53, 45, 60, 126, 106, 118, 104, 39, 40, 35, 117, 56, 108, 120, 62, 63, 107, 47, 93, 37, 114, 5, 44, 87, 50, 102, 33, 43, 78, 111, 91, 27, 125, 42, 10, 89, 103, 112, 55, 36, 99, 32, 73, 69, 121, 61, 76, 26, 57, 95, 66, 71, 25, 28, 65, 75, 19, 20, 77, 81, 72, 6, 3, 88, 82], [106, 63, 86, 33, 117, 27, 11, 77, 78, 9, 4, 81, 7, 0, 84, 1, 55, 18, 49, 57, 31, 122, 16, 121, 95, 30, 72, 42, 116, 85, 56, 50, 59, 114, 119, 52, 51, 87, 58, 110, 54, 53, 43, 120, 34, 111, 44, 48, 118, 61, 124, 112, 115, 90, 113, 125, 103, 46, 36, 47, 62, 24, 29, 66, 41, 107, 105, 127, 37, 64, 45, 65, 109, 123, 3, 28, 126, 60, 104, 6, 67, 102, 10, 100, 108, 68, 32, 38, 99, 39, 35, 92, 40, 80, 94, 12, 5, 101, 73, 26, 23, 69, 93, 25, 76, 15, 8, 88, 20, 89, 70, 98, 91, 96, 83, 82, 14, 19, 74, 79, 13, 71, 2, 21, 17, 22, 75, 97], [108, 36, 44, 33, 86, 115, 47, 94, 111, 127, 17, 61, 52, 59, 26, 28, 41, 57, 96, 114, 126, 119, 49, 112, 104, 125, 117, 122, 118, 37, 48, 56, 120, 42, 62, 107, 60, 13, 55, 63, 50, 21, 123, 58, 124, 121, 88, 46, 43, 113, 39, 51, 106, 53, 102, 45, 40, 110, 109, 116, 54, 79, 101, 100, 105, 90, 31, 103, 72, 38, 5, 95, 98, 85, 93, 99, 76, 19, 14, 34, 24, 35, 27, 84, 87, 30, 91, 16, 12, 92, 89, 80, 32, 67, 97, 11, 3, 78, 74, 25, 65, 29, 23, 9, 75, 22, 81, 82, 18, 0, 20, 77, 2, 70, 83, 8, 73, 71, 6, 15, 7, 66, 1, 10, 68, 64, 4, 69], [39, 122, 113, 97, 49, 29, 57, 112, 26, 22, 88, 27, 73, 124, 24, 19, 93, 78, 48, 102, 59, 30, 127, 81, 123, 21, 10, 17, 47, 105, 118, 117, 85, 1, 83, 40, 54, 15, 111, 61, 107, 119, 63, 76, 43, 115, 95, 100, 60, 58, 109, 114, 52, 106, 126, 31, 12, 62, 55, 5, 53, 87, 41, 110, 37, 0, 121, 66, 120, 46, 7, 108, 4, 92, 104, 116, 51, 44, 103, 36, 125, 89, 79, 84, 101, 99, 96, 50, 42, 34, 25, 3, 32, 45, 13, 94, 56, 98, 14, 35, 38, 82, 77, 90, 75, 91, 80, 8, 69, 28, 20, 9, 18, 23, 16, 11, 33, 72, 71, 68, 86, 67, 6, 64, 70, 65, 2, 74], [103, 56, 53, 25, 33, 18, 16, 94, 77, 74, 60, 90, 23, 4, 70, 71, 87, 76, 22, 119, 0, 65, 109, 124, 55, 89, 122, 50, 120, 63, 52, 75, 123, 51, 86, 85, 67, 91, 93, 113, 44, 54, 35, 125, 127, 117, 114, 100, 47, 61, 116, 29, 9, 118, 36, 28, 19, 101, 59, 107, 106, 57, 98, 43, 62, 32, 72, 111, 49, 121, 42, 48, 58, 110, 8, 115, 45, 78, 108, 126, 11, 81, 41, 104, 46, 96, 105, 112, 79, 102, 2, 34, 5, 95, 92, 17, 31, 7, 37, 40, 38, 84, 66, 99, 30, 24, 12, 20, 6, 73, 27, 68, 88, 13, 14, 15, 39, 3, 80, 83, 1, 82, 64, 21, 97, 69, 26, 10], [105, 123, 98, 86, 59, 93, 64, 127, 50, 125, 60, 90, 67, 116, 121, 46, 80, 117, 124, 126, 51, 119, 73, 96, 56, 122, 101, 58, 57, 62, 48, 63, 14, 115, 45, 61, 11, 53, 112, 118, 52, 120, 82, 110, 55, 42, 43, 111, 113, 47, 106, 12, 114, 54, 44, 69, 1, 40, 109, 84, 49, 70, 108, 66, 39, 102, 24, 81, 37, 68, 3, 23, 100, 95, 19, 104, 36, 91, 71, 28, 8, 15, 77, 32, 21, 94, 35, 74, 17, 72, 103, 97, 85, 10, 5, 89, 99, 30, 38, 29, 25, 107, 33, 20, 87, 27, 83, 31, 92, 79, 41, 34, 6, 7, 88, 75, 65, 22, 26, 2, 13, 76, 18, 78, 4, 0, 9, 16], [126, 102, 63, 33, 86, 93, 127, 56, 95, 26, 120, 47, 124, 111, 112, 121, 54, 114, 46, 119, 53, 23, 61, 117, 51, 118, 57, 107, 48, 15, 122, 60, 58, 45, 55, 44, 59, 105, 52, 17, 104, 62, 20, 115, 49, 82, 94, 103, 13, 123, 50, 106, 108, 40, 125, 113, 116, 30, 41, 43, 42, 109, 110, 88, 39, 91, 79, 31, 28, 37, 38, 27, 87, 101, 18, 77, 92, 81, 89, 99, 29, 36, 96, 100, 34, 35, 83, 21, 11, 74, 32, 7, 10, 98, 75, 19, 25, 90, 84, 71, 65, 9, 6, 85, 16, 8, 14, 12, 76, 5, 97, 22, 78, 80, 24, 4, 73, 67, 1, 72, 66, 0, 69, 68, 70, 3, 2, 64]], "model.layers.30.self_attn.qk_proj": [[63, 61, 123, 56, 110, 126, 53, 46, 122, 113, 44, 50, 106, 127, 111, 117, 108, 60, 48, 22, 42, 57, 102, 59, 103, 52, 124, 125, 39, 105, 33, 49, 120, 119, 116, 38, 51, 90, 112, 86, 115, 55, 54, 26, 93, 29, 58, 97, 118, 114, 41, 109, 121, 47, 34, 24, 88, 30, 94, 92, 43, 62, 82, 25, 31, 45, 98, 28, 81, 89, 36, 23, 18, 107, 77, 83, 91, 101, 87, 13, 40, 80, 14, 17, 21, 85, 100, 78, 11, 16, 19, 27, 37, 79, 9, 84, 104, 75, 95, 15, 73, 76, 64, 0, 20, 96, 12, 71, 10, 7, 8, 35, 74, 67, 6, 4, 70, 68, 5, 99, 32, 65, 1, 72, 3, 69, 2, 66], [63, 61, 123, 126, 56, 110, 46, 53, 122, 113, 50, 44, 111, 106, 42, 117, 127, 108, 60, 48, 22, 57, 102, 120, 125, 103, 105, 124, 59, 52, 38, 112, 39, 86, 51, 33, 29, 115, 58, 54, 49, 26, 47, 55, 118, 97, 41, 109, 93, 116, 90, 119, 114, 30, 121, 45, 24, 94, 62, 88, 28, 25, 107, 92, 43, 31, 34, 98, 82, 91, 23, 89, 13, 17, 100, 18, 40, 104, 36, 81, 77, 14, 11, 19, 16, 78, 83, 80, 85, 87, 21, 37, 95, 9, 79, 99, 101, 76, 27, 0, 12, 75, 15, 70, 73, 68, 74, 64, 10, 20, 71, 35, 32, 84, 8, 7, 4, 3, 96, 1, 65, 67, 69, 6, 5, 72, 66, 2], [63, 61, 123, 126, 56, 110, 53, 46, 122, 113, 50, 44, 111, 106, 127, 117, 42, 48, 108, 22, 57, 102, 60, 103, 124, 105, 120, 59, 52, 51, 86, 33, 38, 39, 115, 54, 116, 49, 125, 55, 112, 58, 93, 119, 26, 47, 118, 121, 114, 29, 90, 97, 107, 24, 94, 109, 30, 62, 28, 88, 41, 34, 92, 25, 43, 40, 89, 45, 98, 18, 83, 31, 36, 81, 23, 101, 82, 100, 13, 16, 77, 80, 14, 91, 21, 19, 0, 11, 70, 17, 87, 85, 104, 95, 78, 20, 9, 37, 99, 75, 79, 15, 27, 74, 76, 71, 10, 84, 12, 32, 73, 64, 7, 4, 35, 67, 1, 96, 72, 68, 65, 3, 8, 5, 69, 2, 66, 6], [63, 61, 123, 126, 56, 110, 53, 46, 122, 113, 50, 44, 111, 106, 117, 108, 42, 127, 48, 52, 57, 22, 120, 124, 103, 102, 60, 58, 125, 39, 105, 38, 116, 121, 47, 49, 112, 51, 54, 86, 26, 33, 59, 93, 118, 114, 97, 119, 55, 43, 29, 109, 90, 115, 30, 24, 62, 41, 94, 45, 107, 34, 28, 92, 40, 25, 31, 88, 82, 104, 13, 98, 81, 89, 14, 80, 23, 36, 83, 78, 19, 18, 101, 100, 21, 91, 17, 77, 64, 11, 37, 87, 16, 76, 85, 95, 70, 99, 75, 79, 84, 15, 9, 73, 0, 27, 10, 71, 12, 20, 68, 32, 7, 3, 35, 1, 74, 96, 72, 65, 4, 8, 67, 69, 5, 66, 2, 6], [63, 61, 123, 56, 126, 53, 110, 46, 122, 113, 44, 50, 111, 127, 106, 108, 117, 42, 22, 48, 102, 52, 60, 57, 124, 120, 103, 125, 59, 51, 86, 116, 105, 39, 119, 58, 54, 47, 49, 55, 33, 112, 90, 26, 115, 118, 38, 93, 114, 29, 97, 41, 121, 109, 24, 62, 43, 107, 34, 31, 83, 92, 25, 89, 88, 94, 36, 30, 82, 81, 23, 45, 28, 18, 80, 21, 13, 98, 77, 14, 11, 17, 78, 40, 87, 101, 85, 27, 16, 100, 0, 19, 95, 79, 91, 104, 76, 15, 73, 12, 9, 75, 70, 64, 20, 84, 74, 37, 10, 1, 7, 72, 3, 71, 4, 65, 68, 35, 67, 96, 32, 8, 6, 5, 99, 69, 66, 2], [63, 61, 126, 56, 123, 110, 53, 46, 122, 113, 44, 50, 108, 111, 106, 42, 127, 22, 57, 48, 117, 105, 103, 102, 120, 86, 60, 52, 59, 38, 47, 112, 125, 124, 39, 54, 58, 116, 118, 51, 93, 33, 114, 29, 26, 45, 49, 90, 97, 55, 119, 41, 24, 109, 121, 115, 92, 25, 28, 43, 62, 88, 94, 81, 82, 23, 30, 21, 98, 36, 34, 14, 95, 31, 18, 13, 83, 107, 87, 89, 91, 80, 78, 17, 77, 27, 85, 40, 11, 19, 15, 104, 16, 101, 100, 12, 64, 9, 76, 79, 75, 37, 73, 70, 0, 4, 72, 74, 84, 20, 10, 7, 35, 6, 68, 65, 71, 32, 96, 3, 99, 67, 66, 69, 5, 8, 1, 2], [63, 61, 110, 123, 126, 56, 53, 122, 46, 44, 113, 50, 42, 111, 106, 108, 48, 22, 127, 102, 117, 59, 105, 125, 103, 124, 60, 57, 52, 120, 86, 112, 39, 47, 90, 116, 26, 38, 29, 93, 119, 115, 41, 109, 45, 55, 33, 121, 97, 114, 58, 51, 118, 43, 92, 54, 88, 62, 25, 24, 28, 49, 94, 23, 31, 81, 18, 107, 21, 82, 87, 85, 30, 80, 36, 89, 83, 98, 91, 17, 27, 77, 40, 95, 100, 11, 34, 14, 13, 78, 19, 16, 20, 15, 101, 12, 76, 6, 79, 73, 9, 75, 74, 104, 84, 10, 35, 37, 64, 0, 32, 7, 67, 71, 4, 72, 99, 65, 68, 70, 8, 1, 96, 3, 2, 5, 69, 66], [63, 61, 123, 56, 110, 126, 53, 122, 46, 113, 44, 50, 111, 106, 108, 42, 127, 48, 22, 117, 57, 102, 59, 60, 103, 124, 120, 52, 125, 115, 55, 105, 119, 112, 33, 90, 93, 38, 58, 116, 39, 49, 86, 51, 26, 47, 29, 97, 41, 121, 109, 114, 118, 92, 45, 54, 88, 25, 62, 24, 43, 30, 31, 89, 28, 107, 91, 94, 34, 18, 21, 36, 81, 23, 82, 17, 98, 40, 83, 100, 87, 13, 27, 14, 101, 16, 80, 78, 11, 85, 77, 20, 37, 95, 64, 75, 79, 0, 76, 19, 104, 15, 6, 9, 73, 84, 35, 10, 12, 96, 67, 7, 1, 99, 74, 72, 32, 65, 4, 71, 68, 3, 8, 69, 2, 66, 5, 70], [63, 61, 56, 110, 123, 53, 126, 46, 122, 113, 44, 50, 106, 111, 127, 42, 48, 108, 52, 60, 102, 57, 124, 117, 103, 22, 120, 58, 59, 116, 86, 125, 105, 39, 38, 51, 93, 119, 55, 112, 109, 97, 33, 29, 90, 26, 115, 49, 45, 47, 118, 43, 25, 41, 94, 92, 114, 88, 24, 54, 31, 107, 121, 30, 62, 23, 28, 18, 89, 34, 98, 91, 13, 40, 82, 27, 87, 83, 80, 21, 78, 100, 17, 36, 19, 77, 81, 95, 85, 11, 104, 20, 16, 14, 76, 84, 101, 79, 75, 37, 73, 6, 15, 12, 68, 99, 0, 32, 7, 4, 72, 9, 10, 74, 64, 71, 35, 1, 3, 96, 65, 5, 70, 8, 67, 2, 69, 66], [63, 61, 123, 126, 56, 110, 53, 122, 46, 113, 44, 50, 111, 106, 108, 127, 48, 60, 117, 124, 125, 120, 52, 102, 57, 22, 42, 103, 55, 38, 59, 49, 51, 58, 105, 33, 39, 47, 119, 121, 97, 86, 116, 115, 90, 93, 26, 43, 109, 54, 112, 29, 114, 34, 62, 118, 92, 24, 94, 88, 45, 30, 98, 107, 31, 41, 13, 40, 23, 28, 25, 83, 100, 82, 18, 91, 81, 37, 89, 104, 16, 77, 17, 85, 87, 80, 11, 21, 19, 64, 78, 6, 14, 36, 101, 75, 95, 27, 99, 74, 84, 73, 7, 68, 10, 4, 0, 20, 9, 71, 79, 12, 15, 76, 35, 72, 3, 65, 67, 96, 32, 1, 8, 70, 5, 66, 69, 2], [63, 61, 123, 126, 56, 110, 46, 122, 53, 113, 44, 50, 111, 106, 127, 108, 117, 48, 42, 22, 60, 52, 125, 57, 102, 103, 59, 124, 105, 112, 33, 39, 93, 49, 55, 120, 119, 86, 114, 121, 51, 116, 47, 38, 26, 118, 90, 54, 97, 29, 62, 115, 41, 109, 58, 24, 88, 34, 43, 92, 94, 30, 25, 31, 13, 89, 82, 14, 107, 17, 45, 28, 40, 98, 85, 36, 18, 77, 78, 19, 23, 91, 101, 81, 80, 83, 21, 104, 16, 11, 95, 73, 74, 9, 79, 87, 76, 12, 15, 0, 75, 64, 100, 27, 20, 6, 84, 7, 71, 4, 10, 72, 35, 8, 65, 67, 70, 3, 37, 68, 99, 32, 69, 96, 1, 66, 5, 2], [63, 61, 110, 123, 126, 56, 53, 46, 122, 113, 44, 50, 106, 108, 127, 111, 42, 22, 102, 48, 60, 124, 125, 52, 105, 103, 86, 117, 57, 112, 120, 59, 116, 33, 39, 29, 58, 93, 55, 119, 26, 41, 38, 115, 97, 47, 90, 114, 49, 51, 109, 118, 25, 121, 92, 24, 43, 54, 31, 62, 88, 30, 82, 23, 17, 94, 13, 81, 18, 98, 40, 28, 45, 14, 91, 77, 78, 101, 85, 16, 107, 34, 83, 89, 19, 21, 87, 80, 36, 27, 11, 95, 73, 100, 79, 75, 76, 9, 12, 84, 104, 37, 20, 71, 64, 35, 15, 4, 74, 0, 10, 68, 70, 69, 32, 72, 7, 8, 3, 96, 67, 99, 1, 6, 65, 5, 66, 2], [63, 61, 56, 110, 123, 126, 53, 122, 46, 113, 44, 50, 108, 127, 111, 106, 22, 42, 48, 124, 60, 102, 103, 38, 57, 120, 125, 55, 52, 58, 86, 119, 59, 33, 49, 105, 117, 47, 90, 29, 26, 97, 112, 39, 115, 116, 93, 109, 92, 118, 51, 54, 45, 114, 41, 24, 25, 43, 94, 88, 23, 121, 28, 91, 31, 62, 30, 18, 34, 89, 107, 17, 100, 82, 85, 83, 27, 87, 98, 95, 81, 19, 40, 13, 80, 78, 36, 21, 77, 37, 75, 14, 15, 16, 101, 104, 11, 0, 79, 70, 35, 76, 84, 73, 20, 99, 10, 9, 64, 12, 4, 96, 32, 68, 7, 74, 71, 72, 3, 1, 67, 8, 65, 66, 2, 69, 6, 5], [63, 61, 123, 56, 110, 126, 46, 53, 122, 113, 44, 50, 108, 111, 42, 106, 48, 127, 22, 124, 120, 117, 60, 57, 55, 102, 105, 103, 59, 39, 52, 58, 125, 38, 119, 33, 49, 51, 121, 47, 86, 116, 93, 112, 90, 26, 115, 41, 118, 109, 114, 29, 97, 24, 54, 62, 94, 43, 45, 92, 31, 25, 28, 88, 98, 107, 23, 82, 30, 21, 91, 89, 18, 34, 40, 83, 87, 17, 13, 78, 80, 36, 14, 81, 85, 75, 77, 104, 101, 16, 100, 64, 0, 15, 27, 70, 11, 95, 19, 37, 76, 12, 73, 79, 84, 7, 68, 74, 35, 8, 9, 71, 32, 67, 20, 99, 10, 4, 65, 69, 1, 3, 5, 72, 96, 66, 6, 2], [63, 61, 126, 123, 56, 110, 46, 53, 122, 44, 113, 50, 111, 106, 108, 42, 22, 127, 48, 117, 105, 57, 124, 102, 52, 103, 125, 55, 60, 120, 38, 115, 49, 39, 33, 121, 26, 97, 59, 116, 119, 86, 93, 54, 41, 47, 109, 90, 43, 51, 29, 114, 112, 34, 118, 58, 24, 45, 98, 62, 88, 92, 94, 25, 30, 31, 82, 28, 13, 23, 81, 89, 107, 40, 14, 78, 0, 80, 21, 87, 77, 18, 91, 83, 16, 17, 19, 100, 101, 36, 85, 76, 95, 75, 11, 70, 64, 104, 84, 9, 74, 73, 79, 15, 20, 12, 8, 37, 27, 35, 71, 10, 99, 7, 1, 68, 67, 4, 69, 65, 32, 3, 72, 96, 66, 2, 5, 6], [63, 61, 126, 123, 56, 110, 53, 46, 122, 113, 44, 50, 111, 106, 127, 108, 42, 22, 48, 117, 102, 52, 124, 57, 103, 105, 38, 60, 33, 86, 125, 116, 58, 59, 120, 118, 115, 114, 49, 39, 55, 112, 119, 93, 29, 97, 51, 26, 47, 41, 30, 94, 90, 54, 43, 121, 24, 109, 92, 25, 31, 89, 45, 34, 88, 40, 28, 107, 82, 98, 14, 91, 23, 81, 77, 62, 18, 80, 78, 100, 16, 85, 83, 87, 95, 13, 17, 21, 19, 101, 11, 64, 36, 76, 27, 75, 104, 79, 9, 73, 20, 12, 37, 84, 8, 3, 10, 70, 74, 0, 68, 4, 15, 7, 71, 1, 35, 96, 67, 99, 65, 5, 32, 69, 72, 2, 6, 66], [63, 61, 110, 126, 53, 56, 123, 46, 122, 44, 113, 50, 111, 106, 42, 108, 127, 22, 124, 57, 102, 48, 117, 60, 52, 105, 86, 103, 38, 116, 59, 125, 58, 33, 55, 47, 39, 112, 26, 29, 119, 120, 118, 115, 97, 90, 93, 49, 51, 121, 94, 109, 114, 43, 41, 62, 92, 31, 88, 45, 24, 25, 30, 34, 54, 23, 28, 40, 81, 83, 18, 89, 82, 17, 91, 98, 13, 100, 77, 16, 107, 87, 36, 21, 101, 95, 78, 14, 80, 27, 19, 84, 11, 85, 37, 20, 75, 104, 79, 15, 10, 12, 99, 9, 64, 76, 0, 4, 73, 96, 35, 7, 74, 8, 67, 68, 6, 3, 65, 71, 70, 32, 1, 5, 72, 66, 69, 2], [63, 61, 110, 56, 126, 123, 53, 46, 122, 113, 44, 50, 111, 106, 127, 42, 108, 48, 117, 57, 22, 102, 52, 105, 103, 60, 124, 59, 86, 116, 115, 33, 55, 38, 120, 39, 47, 125, 109, 112, 49, 97, 26, 51, 93, 62, 41, 29, 90, 119, 114, 118, 121, 58, 54, 94, 24, 92, 30, 107, 34, 88, 28, 45, 89, 43, 25, 18, 98, 31, 91, 23, 81, 83, 40, 13, 36, 16, 82, 19, 17, 78, 75, 77, 37, 14, 87, 100, 21, 85, 27, 101, 80, 104, 95, 9, 0, 64, 79, 20, 6, 15, 76, 73, 84, 35, 74, 11, 99, 12, 7, 96, 8, 32, 10, 1, 67, 71, 3, 4, 69, 72, 70, 66, 68, 65, 5, 2], [63, 61, 123, 126, 110, 56, 46, 53, 122, 113, 50, 44, 111, 106, 42, 127, 52, 108, 48, 57, 102, 117, 124, 120, 22, 103, 60, 51, 105, 39, 125, 33, 38, 47, 116, 59, 55, 86, 115, 49, 121, 118, 26, 112, 29, 58, 119, 97, 93, 109, 45, 43, 90, 54, 62, 41, 114, 30, 31, 107, 94, 24, 34, 40, 92, 98, 88, 23, 28, 36, 25, 77, 100, 81, 101, 89, 82, 17, 18, 91, 13, 21, 75, 78, 83, 95, 19, 14, 80, 16, 87, 37, 104, 9, 85, 11, 99, 6, 27, 12, 35, 76, 7, 84, 73, 71, 20, 64, 15, 79, 74, 10, 0, 68, 1, 32, 96, 8, 69, 3, 4, 65, 2, 72, 67, 5, 70, 66], [63, 61, 126, 123, 110, 56, 53, 46, 122, 113, 44, 50, 111, 108, 106, 42, 48, 127, 117, 22, 38, 124, 57, 52, 102, 105, 103, 120, 60, 33, 125, 59, 47, 39, 86, 26, 112, 97, 119, 51, 49, 58, 114, 116, 118, 29, 55, 115, 93, 90, 109, 121, 41, 54, 92, 30, 24, 45, 94, 34, 88, 31, 28, 62, 43, 107, 25, 23, 91, 98, 77, 36, 18, 81, 104, 40, 89, 87, 82, 17, 13, 19, 78, 101, 21, 85, 80, 83, 14, 79, 95, 16, 9, 10, 75, 6, 100, 76, 20, 11, 0, 15, 74, 37, 12, 71, 73, 68, 84, 64, 8, 27, 7, 32, 99, 4, 3, 72, 65, 35, 1, 96, 67, 66, 5, 69, 2, 70], [63, 61, 126, 123, 110, 56, 53, 46, 122, 113, 50, 44, 111, 106, 42, 108, 117, 57, 48, 127, 22, 60, 124, 52, 105, 102, 103, 59, 109, 33, 120, 125, 115, 114, 112, 116, 55, 86, 49, 38, 51, 118, 47, 39, 62, 90, 58, 93, 119, 26, 29, 97, 41, 121, 54, 107, 94, 45, 24, 28, 88, 92, 43, 98, 34, 30, 89, 36, 25, 31, 82, 95, 91, 83, 13, 80, 40, 77, 18, 104, 21, 87, 17, 81, 101, 23, 14, 78, 16, 75, 19, 11, 85, 64, 27, 37, 100, 15, 9, 76, 79, 6, 73, 0, 10, 35, 20, 99, 84, 4, 74, 65, 67, 32, 7, 71, 68, 12, 8, 1, 72, 69, 3, 96, 70, 2, 5, 66], [63, 61, 123, 126, 56, 110, 53, 46, 122, 113, 50, 44, 111, 117, 42, 106, 108, 127, 124, 52, 48, 102, 57, 60, 120, 103, 125, 22, 105, 112, 55, 58, 39, 59, 114, 38, 33, 115, 121, 119, 51, 118, 47, 116, 86, 29, 49, 26, 90, 109, 93, 54, 41, 107, 97, 94, 88, 62, 45, 30, 43, 31, 28, 23, 34, 92, 24, 91, 25, 83, 87, 36, 98, 82, 100, 17, 81, 89, 18, 77, 104, 64, 40, 37, 14, 19, 13, 85, 21, 95, 27, 101, 75, 80, 16, 78, 0, 76, 15, 20, 79, 11, 4, 9, 3, 73, 74, 99, 35, 10, 12, 96, 84, 6, 1, 70, 71, 7, 67, 8, 32, 65, 72, 69, 5, 68, 2, 66], [63, 61, 123, 110, 126, 56, 53, 46, 122, 113, 44, 50, 111, 127, 106, 108, 42, 124, 117, 48, 102, 22, 120, 60, 52, 125, 58, 38, 116, 33, 57, 103, 59, 119, 105, 115, 86, 118, 112, 49, 39, 55, 26, 47, 29, 93, 90, 121, 51, 97, 114, 41, 54, 94, 109, 88, 43, 28, 34, 45, 24, 62, 25, 30, 31, 92, 107, 89, 36, 98, 91, 82, 18, 23, 83, 87, 81, 100, 77, 17, 13, 40, 21, 101, 16, 14, 95, 85, 80, 78, 19, 0, 11, 27, 75, 104, 64, 9, 15, 84, 76, 71, 20, 73, 4, 74, 37, 79, 72, 65, 70, 12, 35, 99, 3, 1, 96, 7, 10, 68, 67, 2, 32, 8, 5, 66, 6, 69], [63, 61, 126, 123, 110, 56, 53, 46, 122, 113, 50, 44, 106, 111, 108, 42, 48, 22, 127, 120, 117, 124, 105, 52, 102, 57, 103, 38, 125, 33, 60, 59, 116, 118, 55, 112, 86, 97, 26, 93, 49, 90, 39, 121, 47, 119, 29, 51, 58, 114, 41, 115, 54, 109, 24, 92, 43, 62, 94, 28, 30, 88, 34, 25, 45, 36, 89, 107, 40, 82, 17, 98, 31, 23, 78, 91, 18, 81, 13, 104, 77, 83, 80, 19, 14, 95, 85, 87, 21, 16, 11, 75, 15, 79, 27, 76, 100, 70, 10, 9, 101, 37, 12, 73, 35, 4, 99, 20, 64, 72, 84, 32, 74, 71, 68, 7, 0, 67, 5, 1, 65, 3, 8, 96, 69, 6, 66, 2], [63, 61, 123, 126, 56, 110, 53, 46, 122, 113, 44, 50, 111, 108, 106, 42, 22, 48, 117, 57, 124, 52, 127, 60, 105, 120, 102, 59, 103, 116, 112, 55, 114, 115, 38, 125, 119, 86, 33, 121, 49, 39, 26, 109, 90, 47, 51, 118, 93, 29, 54, 43, 97, 41, 107, 58, 94, 28, 92, 25, 24, 30, 62, 23, 45, 88, 31, 36, 17, 34, 77, 85, 83, 98, 89, 40, 81, 87, 82, 101, 21, 18, 78, 16, 19, 91, 100, 27, 95, 13, 11, 104, 80, 75, 15, 70, 79, 9, 14, 73, 64, 84, 76, 12, 37, 7, 35, 74, 20, 0, 10, 68, 71, 32, 72, 67, 3, 99, 65, 8, 1, 96, 4, 5, 66, 69, 6, 2], [63, 61, 110, 126, 123, 56, 53, 46, 122, 113, 44, 50, 111, 108, 106, 42, 22, 48, 127, 124, 120, 60, 52, 102, 117, 57, 103, 105, 55, 125, 116, 33, 59, 115, 38, 119, 86, 49, 29, 39, 112, 118, 47, 90, 26, 58, 93, 121, 109, 97, 114, 94, 54, 51, 41, 24, 28, 62, 30, 92, 34, 25, 88, 43, 107, 31, 89, 98, 23, 91, 45, 82, 18, 100, 40, 36, 37, 83, 77, 19, 87, 17, 81, 85, 101, 27, 80, 95, 16, 13, 104, 21, 11, 75, 14, 64, 78, 35, 12, 79, 84, 9, 20, 15, 73, 99, 76, 10, 70, 0, 71, 74, 96, 68, 65, 32, 72, 3, 1, 7, 67, 5, 4, 69, 8, 2, 66, 6], [63, 61, 110, 123, 126, 56, 53, 46, 122, 113, 50, 44, 106, 111, 42, 108, 48, 117, 127, 22, 57, 52, 60, 102, 103, 105, 124, 59, 116, 38, 112, 125, 120, 109, 115, 121, 39, 55, 33, 119, 49, 86, 51, 93, 97, 118, 114, 54, 58, 26, 90, 41, 29, 62, 47, 88, 24, 94, 30, 91, 89, 34, 43, 31, 92, 25, 107, 28, 45, 101, 36, 98, 40, 18, 81, 82, 87, 83, 17, 85, 23, 19, 80, 11, 21, 13, 78, 100, 16, 27, 14, 77, 95, 104, 84, 37, 79, 35, 75, 12, 73, 0, 99, 76, 15, 9, 8, 64, 20, 74, 70, 32, 68, 10, 71, 7, 65, 67, 96, 4, 72, 3, 69, 1, 5, 6, 2, 66], [63, 61, 123, 126, 56, 110, 53, 46, 122, 113, 50, 44, 111, 106, 108, 117, 48, 127, 42, 22, 124, 120, 57, 52, 102, 103, 60, 125, 38, 105, 116, 33, 59, 39, 121, 49, 55, 112, 51, 47, 119, 26, 114, 115, 86, 93, 41, 43, 54, 90, 97, 58, 29, 109, 118, 62, 45, 34, 88, 92, 24, 94, 31, 107, 25, 89, 28, 30, 36, 91, 82, 81, 23, 98, 18, 77, 21, 11, 83, 14, 17, 40, 80, 0, 87, 78, 37, 16, 13, 19, 85, 100, 79, 9, 104, 84, 73, 95, 75, 12, 10, 64, 27, 15, 101, 76, 35, 71, 74, 99, 20, 6, 67, 68, 7, 1, 4, 65, 96, 70, 3, 32, 2, 72, 5, 8, 69, 66], [63, 61, 123, 126, 110, 53, 56, 46, 122, 113, 50, 44, 111, 106, 127, 117, 108, 42, 48, 125, 52, 124, 59, 22, 102, 57, 103, 38, 112, 60, 105, 33, 120, 116, 51, 119, 39, 115, 55, 97, 54, 43, 86, 62, 93, 49, 29, 26, 114, 41, 47, 58, 90, 118, 121, 34, 94, 24, 92, 109, 45, 89, 30, 88, 28, 31, 40, 95, 18, 101, 98, 25, 23, 14, 91, 107, 80, 82, 17, 16, 11, 85, 36, 77, 13, 78, 19, 100, 81, 104, 0, 87, 15, 83, 64, 73, 71, 6, 21, 79, 9, 75, 37, 76, 12, 10, 74, 27, 35, 20, 84, 4, 65, 68, 7, 8, 3, 67, 1, 96, 99, 70, 32, 72, 69, 66, 5, 2], [63, 61, 126, 123, 56, 110, 46, 53, 122, 113, 50, 44, 111, 42, 106, 57, 108, 117, 124, 22, 127, 48, 59, 52, 102, 125, 60, 120, 103, 105, 116, 38, 86, 39, 55, 112, 119, 58, 33, 26, 49, 114, 97, 29, 93, 51, 47, 41, 118, 90, 43, 115, 54, 45, 24, 109, 62, 88, 28, 92, 25, 94, 121, 18, 30, 34, 91, 31, 23, 100, 82, 81, 98, 89, 77, 21, 36, 85, 78, 87, 17, 19, 40, 14, 107, 80, 101, 13, 83, 11, 16, 104, 95, 73, 27, 15, 6, 9, 76, 79, 84, 75, 12, 37, 0, 4, 35, 74, 20, 68, 10, 71, 99, 64, 8, 7, 3, 67, 32, 1, 72, 69, 65, 5, 70, 96, 66, 2], [63, 61, 126, 123, 110, 56, 53, 46, 122, 113, 50, 44, 111, 124, 108, 117, 106, 42, 57, 60, 127, 22, 48, 103, 52, 102, 38, 116, 120, 59, 105, 49, 86, 55, 125, 39, 33, 47, 121, 114, 112, 51, 90, 115, 97, 29, 119, 26, 54, 118, 41, 93, 109, 58, 43, 25, 28, 62, 94, 30, 34, 24, 92, 31, 23, 88, 45, 36, 95, 78, 81, 82, 98, 19, 18, 83, 107, 91, 89, 13, 77, 21, 87, 6, 100, 14, 80, 85, 11, 40, 64, 73, 17, 104, 16, 15, 101, 9, 0, 71, 37, 4, 79, 7, 27, 75, 10, 74, 12, 84, 68, 67, 76, 8, 35, 3, 20, 1, 99, 65, 96, 66, 32, 72, 2, 69, 5, 70], [63, 61, 123, 110, 56, 126, 53, 122, 46, 113, 44, 50, 111, 106, 42, 108, 127, 22, 48, 60, 117, 102, 103, 124, 57, 125, 120, 52, 59, 38, 33, 105, 86, 55, 39, 29, 97, 49, 51, 121, 119, 26, 47, 90, 112, 115, 116, 93, 54, 114, 30, 109, 118, 41, 62, 43, 94, 25, 92, 31, 24, 88, 58, 34, 28, 91, 18, 81, 45, 100, 98, 82, 23, 107, 85, 11, 14, 13, 89, 37, 17, 36, 21, 80, 19, 83, 16, 40, 87, 77, 95, 27, 79, 78, 104, 73, 101, 6, 15, 9, 75, 35, 12, 10, 76, 99, 74, 8, 20, 71, 0, 4, 84, 96, 7, 32, 68, 72, 67, 69, 1, 3, 70, 65, 5, 64, 2, 66]], "model.layers.31.self_attn.q_proj": [[40, 121, 98, 46, 18, 25, 111, 9, 30, 21, 13, 51, 15, 11, 48, 105, 49, 67, 23, 60, 55, 53, 4, 6, 58, 116, 114, 124, 72, 107, 31, 88, 115, 95, 41, 109, 22, 108, 61, 17, 37, 59, 92, 50, 45, 19, 87, 43, 127, 20, 5, 97, 1, 14, 123, 27, 126, 69, 75, 78, 81, 90, 16, 120, 102, 101, 8, 24, 112, 93, 85, 52, 26, 63, 118, 0, 7, 33, 117, 62, 122, 96, 76, 94, 74, 113, 35, 36, 54, 28, 12, 77, 79, 57, 39, 56, 82, 70, 73, 42, 89, 84, 3, 80, 32, 106, 110, 103, 83, 99, 100, 2, 125, 65, 119, 68, 38, 29, 86, 71, 44, 47, 10, 104, 91, 64, 66, 34], [40, 121, 98, 46, 9, 13, 111, 21, 25, 18, 51, 67, 15, 49, 4, 95, 11, 108, 60, 50, 48, 117, 45, 68, 107, 88, 61, 116, 37, 64, 122, 41, 30, 58, 6, 92, 1, 72, 2, 105, 43, 123, 65, 109, 55, 0, 28, 87, 42, 3, 53, 5, 126, 23, 27, 39, 127, 54, 74, 19, 101, 114, 73, 85, 14, 62, 70, 113, 22, 115, 66, 31, 63, 34, 20, 93, 16, 78, 120, 33, 118, 69, 110, 124, 36, 90, 7, 75, 52, 104, 82, 57, 80, 77, 94, 32, 29, 24, 81, 10, 103, 12, 83, 17, 79, 76, 89, 26, 59, 8, 35, 112, 38, 44, 99, 119, 84, 125, 56, 71, 86, 100, 96, 102, 106, 91, 97, 47], [40, 111, 46, 121, 98, 37, 49, 21, 25, 117, 54, 114, 18, 95, 23, 60, 11, 15, 107, 51, 61, 53, 13, 116, 105, 4, 78, 62, 16, 48, 67, 30, 9, 113, 124, 80, 24, 72, 39, 41, 123, 17, 19, 6, 1, 120, 108, 88, 42, 90, 92, 70, 83, 52, 63, 8, 109, 0, 50, 58, 20, 85, 27, 87, 55, 122, 56, 84, 26, 65, 127, 115, 45, 38, 22, 103, 59, 112, 43, 96, 125, 29, 71, 126, 75, 118, 91, 57, 100, 7, 119, 36, 102, 101, 10, 68, 33, 79, 76, 2, 14, 34, 35, 81, 44, 74, 86, 94, 12, 97, 82, 110, 32, 47, 89, 73, 104, 5, 106, 93, 31, 69, 28, 99, 77, 66, 64, 3], [40, 121, 98, 46, 117, 111, 13, 21, 18, 61, 37, 9, 25, 15, 4, 51, 48, 30, 105, 11, 122, 107, 72, 49, 58, 87, 108, 24, 6, 23, 123, 67, 116, 63, 1, 110, 90, 88, 60, 0, 124, 14, 31, 127, 101, 39, 126, 45, 32, 95, 43, 41, 27, 109, 85, 52, 26, 92, 93, 64, 80, 50, 75, 5, 97, 22, 36, 78, 55, 100, 74, 16, 57, 70, 53, 103, 89, 19, 82, 73, 102, 28, 115, 113, 120, 83, 59, 35, 38, 42, 62, 12, 118, 17, 106, 20, 2, 114, 119, 79, 76, 71, 33, 125, 81, 66, 54, 112, 8, 68, 44, 91, 7, 77, 56, 96, 86, 94, 69, 65, 99, 29, 3, 84, 10, 104, 47, 34], [117, 50, 105, 108, 123, 49, 99, 44, 97, 25, 31, 87, 120, 21, 112, 16, 93, 82, 58, 78, 34, 29, 62, 118, 48, 114, 51, 38, 30, 43, 109, 32, 90, 42, 55, 7, 106, 10, 59, 103, 36, 53, 107, 52, 12, 61, 96, 45, 33, 124, 47, 46, 60, 26, 56, 23, 35, 63, 88, 39, 95, 113, 17, 125, 77, 28, 115, 121, 54, 19, 83, 57, 24, 122, 98, 94, 84, 68, 119, 116, 104, 15, 110, 92, 11, 69, 111, 40, 102, 126, 3, 100, 20, 127, 14, 22, 91, 86, 79, 27, 101, 8, 75, 37, 71, 89, 65, 74, 81, 85, 2, 72, 18, 64, 6, 76, 66, 73, 9, 80, 70, 67, 4, 13, 1, 5, 41, 0], [105, 123, 49, 117, 31, 99, 118, 2, 114, 109, 10, 16, 7, 82, 25, 78, 64, 69, 3, 68, 65, 50, 12, 8, 21, 77, 44, 97, 87, 67, 89, 115, 58, 41, 0, 66, 4, 71, 1, 111, 23, 98, 59, 61, 93, 104, 120, 127, 72, 63, 5, 39, 56, 28, 51, 70, 42, 91, 108, 112, 24, 57, 95, 122, 20, 107, 74, 15, 36, 75, 121, 106, 14, 34, 83, 52, 86, 29, 9, 54, 6, 37, 79, 92, 84, 88, 30, 18, 27, 101, 22, 13, 73, 110, 103, 46, 80, 11, 17, 96, 62, 26, 19, 90, 119, 33, 76, 113, 43, 85, 60, 102, 45, 38, 48, 53, 81, 116, 47, 40, 55, 125, 32, 124, 94, 35, 126, 100], [105, 49, 123, 108, 31, 58, 118, 10, 25, 16, 117, 82, 99, 3, 69, 21, 8, 87, 12, 65, 78, 77, 68, 109, 94, 7, 29, 122, 114, 1, 120, 42, 5, 55, 97, 64, 4, 44, 50, 111, 43, 6, 36, 98, 115, 23, 73, 95, 33, 13, 125, 38, 2, 93, 11, 79, 48, 76, 63, 62, 66, 88, 61, 72, 89, 30, 67, 91, 47, 54, 46, 51, 27, 18, 56, 39, 32, 9, 92, 110, 90, 101, 100, 45, 112, 75, 86, 70, 113, 103, 116, 0, 127, 17, 34, 106, 74, 80, 71, 81, 14, 52, 40, 20, 104, 22, 121, 53, 60, 41, 84, 19, 83, 59, 96, 24, 107, 26, 37, 119, 15, 28, 57, 126, 85, 102, 124, 35], [118, 49, 108, 105, 58, 44, 42, 99, 21, 117, 87, 97, 94, 31, 12, 39, 25, 93, 29, 82, 104, 46, 51, 56, 52, 33, 38, 95, 43, 103, 113, 119, 107, 26, 109, 45, 73, 50, 16, 59, 115, 91, 125, 123, 111, 47, 60, 57, 112, 40, 127, 114, 20, 121, 54, 110, 77, 53, 101, 63, 37, 122, 126, 23, 61, 48, 1, 120, 124, 116, 32, 19, 102, 36, 55, 100, 96, 81, 78, 75, 4, 98, 62, 8, 85, 106, 92, 79, 34, 5, 17, 30, 70, 72, 10, 89, 9, 7, 90, 69, 35, 6, 22, 66, 76, 68, 24, 65, 28, 27, 18, 15, 3, 88, 83, 84, 13, 86, 2, 0, 41, 14, 67, 11, 80, 74, 71, 64], [40, 109, 121, 34, 59, 89, 53, 120, 17, 45, 55, 15, 86, 83, 114, 12, 29, 113, 71, 95, 28, 93, 4, 73, 94, 31, 69, 125, 67, 46, 122, 56, 106, 42, 33, 11, 92, 2, 77, 107, 61, 44, 88, 65, 108, 23, 38, 6, 58, 115, 8, 30, 50, 110, 26, 116, 0, 118, 126, 16, 99, 90, 21, 119, 87, 80, 62, 117, 20, 48, 101, 37, 49, 3, 68, 32, 85, 24, 41, 81, 84, 13, 52, 76, 19, 36, 82, 18, 1, 43, 91, 100, 112, 97, 74, 103, 78, 14, 96, 79, 35, 5, 64, 22, 75, 27, 39, 54, 57, 25, 47, 123, 63, 10, 124, 7, 127, 70, 104, 111, 51, 105, 9, 72, 102, 60, 66, 98], [40, 109, 121, 34, 15, 12, 86, 17, 73, 83, 53, 4, 89, 55, 71, 67, 59, 120, 45, 42, 92, 114, 2, 106, 95, 69, 122, 113, 0, 77, 6, 46, 29, 68, 28, 103, 56, 118, 11, 8, 33, 116, 72, 31, 18, 125, 65, 111, 37, 38, 88, 1, 61, 100, 85, 23, 108, 30, 101, 115, 64, 107, 98, 119, 20, 112, 82, 48, 3, 74, 126, 66, 70, 16, 13, 14, 81, 5, 93, 21, 84, 110, 19, 27, 94, 79, 80, 87, 96, 22, 102, 76, 58, 35, 7, 97, 25, 10, 91, 24, 36, 26, 62, 104, 75, 63, 117, 57, 47, 39, 78, 49, 41, 32, 90, 44, 9, 51, 43, 52, 127, 60, 99, 123, 105, 50, 124, 54], [40, 109, 121, 34, 89, 17, 86, 15, 55, 83, 12, 53, 45, 95, 59, 67, 122, 28, 92, 2, 29, 0, 65, 73, 69, 71, 114, 56, 4, 21, 42, 116, 6, 62, 120, 61, 125, 119, 31, 77, 46, 49, 10, 11, 93, 16, 1, 19, 26, 96, 88, 98, 36, 126, 106, 113, 118, 64, 30, 23, 57, 108, 44, 48, 101, 13, 103, 18, 123, 58, 22, 66, 37, 14, 27, 72, 76, 20, 87, 70, 80, 112, 127, 8, 115, 78, 94, 54, 68, 33, 84, 32, 81, 82, 24, 25, 7, 91, 39, 79, 90, 107, 74, 3, 85, 104, 52, 35, 5, 60, 51, 110, 75, 99, 9, 43, 100, 38, 47, 111, 97, 124, 102, 63, 117, 41, 50, 105], [121, 109, 40, 59, 34, 53, 111, 44, 114, 39, 124, 118, 116, 115, 88, 20, 61, 58, 55, 93, 117, 60, 126, 56, 62, 49, 108, 50, 72, 122, 48, 54, 63, 43, 46, 41, 57, 51, 119, 92, 127, 123, 47, 86, 125, 42, 31, 110, 105, 120, 98, 38, 52, 107, 113, 112, 45, 106, 83, 97, 89, 103, 104, 27, 36, 102, 28, 96, 26, 100, 14, 101, 32, 35, 37, 99, 10, 17, 95, 29, 33, 30, 94, 18, 85, 91, 87, 68, 23, 78, 25, 90, 13, 84, 73, 15, 4, 77, 70, 21, 74, 82, 8, 24, 6, 16, 75, 12, 80, 2, 71, 11, 19, 22, 66, 1, 64, 9, 67, 3, 81, 7, 69, 5, 76, 79, 0, 65], [107, 100, 58, 32, 21, 112, 78, 19, 61, 127, 87, 93, 115, 12, 43, 9, 114, 120, 116, 119, 16, 56, 82, 55, 42, 96, 111, 113, 25, 72, 38, 125, 54, 92, 79, 48, 29, 5, 66, 47, 88, 97, 110, 105, 57, 123, 4, 124, 35, 50, 49, 2, 18, 27, 0, 24, 63, 36, 62, 28, 53, 71, 91, 39, 77, 75, 68, 1, 86, 11, 3, 26, 69, 46, 20, 7, 15, 104, 126, 14, 95, 117, 106, 83, 73, 74, 98, 13, 23, 6, 70, 64, 41, 84, 45, 89, 67, 80, 30, 44, 99, 90, 85, 81, 10, 108, 40, 76, 59, 51, 94, 37, 31, 34, 103, 22, 65, 101, 102, 122, 8, 60, 52, 17, 109, 33, 121, 118], [107, 58, 100, 63, 127, 32, 112, 25, 56, 61, 47, 39, 115, 114, 116, 79, 21, 105, 113, 54, 108, 104, 96, 19, 119, 87, 42, 55, 120, 126, 93, 45, 43, 50, 41, 110, 125, 35, 106, 17, 122, 82, 111, 57, 97, 20, 44, 118, 60, 52, 48, 88, 123, 30, 124, 117, 49, 59, 16, 53, 38, 27, 46, 109, 62, 36, 74, 91, 78, 33, 51, 5, 121, 102, 101, 28, 24, 103, 40, 71, 75, 92, 99, 89, 31, 11, 37, 86, 94, 90, 72, 95, 12, 26, 98, 34, 15, 22, 9, 23, 84, 10, 14, 13, 77, 81, 18, 65, 29, 85, 70, 6, 3, 67, 66, 7, 83, 80, 69, 0, 68, 1, 8, 76, 4, 64, 2, 73], [43, 93, 58, 2, 68, 65, 107, 67, 0, 39, 87, 12, 70, 9, 69, 78, 64, 1, 21, 25, 63, 72, 96, 10, 16, 66, 110, 19, 112, 103, 100, 8, 61, 114, 97, 56, 7, 32, 37, 73, 29, 79, 5, 75, 55, 82, 120, 116, 14, 38, 20, 17, 49, 44, 91, 42, 3, 80, 57, 115, 113, 51, 109, 119, 101, 45, 74, 4, 104, 23, 40, 85, 28, 125, 98, 33, 127, 124, 47, 46, 31, 6, 53, 105, 59, 99, 48, 71, 35, 83, 11, 24, 121, 15, 26, 84, 106, 81, 60, 77, 86, 22, 89, 108, 123, 34, 95, 18, 30, 13, 76, 27, 92, 126, 118, 88, 50, 90, 41, 54, 94, 122, 36, 52, 111, 102, 62, 117], [107, 100, 58, 127, 112, 32, 12, 19, 78, 61, 25, 16, 9, 82, 119, 114, 21, 87, 43, 93, 113, 115, 54, 120, 66, 56, 0, 55, 125, 39, 49, 4, 38, 42, 105, 29, 116, 72, 97, 111, 68, 27, 63, 5, 81, 110, 96, 47, 69, 48, 124, 62, 51, 6, 71, 57, 35, 10, 92, 123, 3, 83, 28, 91, 44, 84, 126, 95, 79, 76, 7, 89, 23, 98, 75, 20, 18, 64, 88, 80, 90, 37, 101, 106, 22, 8, 52, 15, 85, 30, 73, 67, 50, 103, 59, 117, 11, 74, 70, 53, 118, 2, 13, 31, 14, 34, 1, 45, 109, 24, 46, 65, 77, 94, 33, 86, 108, 26, 104, 99, 41, 40, 60, 17, 122, 121, 102, 36], [55, 58, 115, 50, 52, 39, 122, 62, 63, 59, 121, 106, 34, 54, 103, 53, 47, 48, 95, 120, 119, 118, 117, 124, 56, 126, 113, 107, 127, 123, 49, 105, 96, 111, 116, 51, 112, 93, 42, 61, 84, 46, 109, 60, 114, 57, 44, 125, 43, 110, 45, 102, 91, 27, 22, 108, 38, 29, 99, 86, 41, 104, 76, 97, 24, 89, 30, 35, 40, 37, 82, 33, 32, 101, 100, 25, 36, 92, 16, 18, 10, 8, 20, 88, 31, 98, 94, 90, 28, 65, 21, 85, 23, 81, 15, 74, 14, 87, 80, 26, 4, 12, 78, 3, 17, 11, 1, 72, 5, 19, 2, 77, 68, 71, 79, 83, 6, 69, 73, 70, 64, 67, 66, 0, 75, 13, 9, 7], [50, 58, 39, 62, 52, 34, 122, 55, 45, 120, 118, 115, 106, 24, 108, 105, 81, 47, 64, 51, 126, 63, 71, 124, 93, 84, 103, 56, 109, 95, 113, 59, 96, 44, 53, 125, 117, 57, 27, 119, 54, 25, 48, 5, 112, 116, 111, 38, 42, 99, 127, 60, 91, 49, 22, 121, 114, 3, 86, 30, 2, 32, 61, 0, 20, 82, 123, 104, 35, 4, 46, 73, 98, 107, 43, 41, 110, 77, 66, 65, 101, 6, 15, 31, 76, 102, 14, 97, 28, 40, 88, 29, 67, 10, 37, 92, 21, 83, 18, 36, 26, 87, 8, 80, 89, 33, 100, 23, 78, 16, 85, 11, 94, 90, 12, 7, 69, 70, 79, 68, 74, 17, 72, 13, 9, 1, 19, 75], [58, 50, 115, 39, 34, 83, 77, 15, 2, 73, 11, 3, 71, 109, 24, 27, 6, 0, 81, 64, 122, 93, 79, 1, 51, 52, 85, 44, 68, 4, 22, 20, 67, 45, 113, 82, 9, 54, 99, 5, 25, 26, 66, 72, 8, 80, 106, 111, 65, 56, 19, 88, 114, 7, 12, 95, 116, 13, 78, 62, 104, 59, 63, 107, 21, 35, 89, 75, 10, 17, 119, 105, 70, 98, 74, 69, 28, 30, 32, 18, 61, 29, 76, 33, 14, 120, 91, 117, 118, 86, 41, 57, 92, 46, 127, 87, 16, 96, 126, 108, 31, 43, 23, 97, 94, 90, 38, 36, 103, 100, 102, 101, 37, 53, 110, 84, 124, 121, 47, 49, 125, 55, 40, 48, 123, 60, 112, 42], [58, 115, 62, 39, 55, 52, 122, 106, 34, 108, 107, 53, 120, 61, 63, 103, 105, 95, 118, 47, 54, 124, 59, 109, 49, 91, 117, 60, 57, 119, 48, 114, 112, 111, 93, 96, 99, 121, 84, 126, 46, 45, 116, 127, 123, 125, 41, 104, 51, 110, 42, 43, 19, 56, 22, 113, 44, 24, 33, 86, 7, 40, 38, 71, 32, 102, 101, 76, 25, 82, 97, 87, 37, 27, 16, 29, 35, 10, 100, 36, 83, 94, 50, 31, 98, 21, 30, 18, 90, 92, 20, 81, 11, 23, 78, 28, 89, 26, 75, 88, 74, 85, 15, 5, 14, 64, 8, 4, 80, 67, 0, 3, 69, 1, 12, 65, 66, 17, 68, 2, 79, 73, 77, 72, 6, 13, 9, 70], [122, 104, 105, 126, 53, 112, 51, 110, 118, 123, 55, 119, 57, 111, 124, 120, 114, 117, 59, 121, 125, 63, 116, 115, 61, 49, 34, 54, 113, 106, 60, 89, 32, 86, 50, 31, 62, 48, 46, 58, 56, 127, 44, 52, 45, 109, 91, 33, 98, 47, 107, 43, 25, 108, 95, 73, 41, 40, 20, 82, 93, 42, 75, 22, 83, 81, 103, 23, 19, 88, 39, 96, 101, 37, 87, 102, 92, 99, 36, 11, 90, 38, 29, 100, 27, 26, 35, 21, 72, 97, 84, 17, 94, 13, 30, 18, 14, 24, 16, 28, 78, 76, 85, 6, 80, 8, 77, 9, 68, 4, 10, 70, 79, 5, 12, 71, 67, 3, 74, 69, 15, 7, 65, 2, 64, 0, 66, 1], [55, 122, 104, 105, 127, 119, 121, 114, 62, 60, 118, 117, 112, 110, 57, 51, 63, 111, 49, 124, 34, 56, 116, 123, 61, 31, 125, 113, 59, 44, 86, 47, 120, 48, 115, 108, 54, 58, 33, 89, 106, 45, 46, 50, 52, 53, 126, 109, 98, 107, 91, 41, 43, 81, 42, 95, 88, 40, 87, 23, 103, 83, 27, 101, 32, 37, 25, 29, 92, 19, 93, 90, 39, 20, 30, 96, 22, 99, 38, 102, 36, 18, 82, 100, 94, 80, 26, 35, 97, 75, 15, 17, 28, 24, 85, 69, 12, 77, 76, 78, 84, 21, 13, 6, 14, 3, 10, 16, 71, 79, 8, 5, 70, 4, 72, 11, 9, 65, 67, 74, 68, 73, 2, 7, 64, 66, 0, 1], [55, 122, 104, 105, 126, 41, 88, 18, 20, 80, 79, 10, 31, 60, 98, 15, 85, 78, 114, 76, 121, 29, 86, 75, 34, 69, 12, 33, 93, 91, 118, 8, 47, 53, 123, 89, 27, 62, 110, 13, 120, 71, 59, 119, 63, 3, 116, 108, 57, 113, 51, 124, 56, 28, 44, 96, 97, 127, 74, 49, 43, 112, 87, 16, 106, 83, 115, 111, 54, 50, 117, 5, 26, 61, 25, 58, 94, 24, 125, 95, 46, 21, 48, 52, 72, 45, 42, 90, 82, 84, 109, 107, 14, 92, 23, 9, 39, 30, 100, 19, 32, 77, 101, 22, 73, 102, 36, 37, 81, 99, 35, 11, 17, 38, 70, 103, 7, 2, 6, 4, 40, 68, 67, 66, 65, 64, 1, 0], [122, 55, 104, 64, 71, 2, 126, 79, 4, 10, 1, 3, 73, 105, 78, 18, 8, 76, 20, 80, 77, 75, 9, 31, 69, 86, 7, 93, 51, 98, 68, 81, 88, 6, 27, 85, 121, 5, 66, 65, 107, 119, 70, 16, 23, 67, 114, 0, 84, 37, 74, 63, 11, 59, 97, 82, 106, 17, 15, 33, 120, 91, 118, 127, 21, 14, 57, 52, 41, 49, 12, 53, 87, 34, 95, 39, 72, 13, 110, 100, 89, 24, 47, 83, 29, 101, 46, 32, 25, 117, 112, 19, 56, 44, 28, 60, 111, 103, 99, 90, 26, 22, 58, 50, 124, 45, 125, 30, 96, 102, 92, 123, 94, 109, 36, 43, 113, 35, 40, 108, 38, 42, 48, 115, 116, 54, 61, 62], [103, 126, 98, 9, 14, 4, 21, 17, 71, 95, 64, 75, 67, 115, 27, 24, 88, 1, 109, 63, 48, 44, 62, 61, 16, 70, 118, 104, 114, 116, 31, 124, 45, 90, 122, 106, 3, 19, 5, 99, 105, 127, 6, 65, 56, 12, 46, 32, 37, 91, 53, 50, 113, 13, 66, 47, 100, 93, 22, 57, 81, 23, 20, 78, 15, 87, 0, 74, 92, 51, 69, 123, 72, 7, 41, 76, 73, 34, 10, 83, 11, 85, 112, 68, 49, 2, 79, 94, 125, 55, 58, 40, 89, 8, 77, 117, 26, 35, 121, 33, 80, 107, 119, 25, 82, 38, 52, 86, 42, 84, 54, 101, 97, 30, 36, 29, 28, 18, 110, 59, 39, 60, 96, 43, 102, 108, 111, 120], [103, 126, 98, 17, 65, 75, 14, 9, 71, 21, 4, 3, 61, 48, 88, 95, 109, 69, 0, 115, 44, 41, 5, 24, 27, 106, 124, 63, 118, 70, 62, 105, 114, 112, 12, 16, 122, 1, 31, 64, 45, 13, 50, 127, 91, 113, 19, 76, 56, 99, 46, 47, 90, 68, 81, 66, 84, 85, 86, 57, 116, 110, 78, 123, 55, 104, 23, 80, 77, 117, 96, 92, 18, 15, 100, 37, 20, 29, 42, 73, 67, 6, 82, 87, 72, 33, 53, 74, 107, 120, 34, 7, 119, 43, 54, 11, 32, 58, 26, 121, 89, 30, 83, 94, 93, 49, 51, 36, 102, 97, 38, 25, 79, 40, 35, 2, 22, 125, 10, 8, 28, 108, 52, 101, 111, 59, 60, 39], [103, 126, 98, 61, 95, 21, 88, 17, 27, 124, 44, 14, 109, 63, 91, 48, 113, 75, 115, 9, 104, 31, 122, 105, 77, 71, 90, 39, 50, 24, 83, 100, 127, 12, 4, 47, 37, 41, 114, 116, 53, 45, 19, 99, 36, 42, 57, 107, 87, 56, 33, 121, 93, 62, 94, 74, 51, 72, 13, 32, 70, 35, 29, 106, 119, 38, 112, 10, 123, 67, 1, 108, 40, 118, 84, 96, 97, 79, 20, 52, 110, 85, 26, 28, 5, 58, 54, 80, 76, 86, 46, 25, 55, 111, 73, 15, 69, 18, 22, 59, 125, 102, 64, 43, 117, 16, 101, 30, 81, 78, 23, 82, 8, 49, 89, 120, 92, 60, 6, 34, 11, 2, 3, 7, 68, 0, 66, 65], [126, 103, 41, 98, 61, 95, 118, 51, 112, 127, 119, 109, 54, 110, 91, 124, 27, 116, 62, 57, 88, 21, 53, 48, 29, 44, 39, 106, 122, 113, 105, 104, 46, 31, 114, 55, 63, 37, 42, 45, 47, 58, 50, 56, 92, 108, 22, 17, 115, 121, 14, 16, 100, 59, 60, 111, 117, 90, 123, 19, 43, 49, 120, 125, 52, 107, 94, 9, 84, 15, 75, 36, 102, 33, 38, 99, 30, 101, 40, 89, 35, 93, 24, 18, 32, 8, 83, 82, 96, 71, 85, 3, 23, 97, 5, 72, 4, 86, 13, 77, 20, 87, 25, 76, 28, 10, 26, 12, 80, 2, 65, 34, 74, 79, 69, 78, 66, 70, 1, 6, 67, 81, 0, 64, 68, 11, 73, 7], [58, 104, 56, 34, 80, 52, 84, 26, 42, 109, 12, 9, 82, 31, 28, 24, 35, 64, 70, 37, 103, 96, 11, 6, 94, 36, 66, 95, 49, 60, 114, 2, 87, 126, 76, 92, 7, 20, 73, 47, 115, 23, 14, 86, 113, 21, 51, 16, 97, 53, 22, 85, 10, 48, 41, 107, 38, 127, 27, 91, 93, 4, 106, 67, 111, 77, 30, 74, 0, 122, 54, 68, 121, 108, 59, 17, 18, 19, 45, 15, 13, 29, 75, 90, 124, 125, 102, 120, 83, 110, 118, 8, 88, 63, 116, 40, 46, 55, 50, 123, 43, 61, 101, 3, 65, 39, 69, 98, 89, 105, 79, 72, 32, 71, 119, 81, 57, 99, 33, 112, 62, 1, 44, 78, 100, 25, 5, 117], [56, 52, 104, 58, 107, 37, 34, 31, 109, 48, 21, 85, 55, 120, 95, 87, 112, 114, 126, 63, 40, 26, 111, 41, 113, 121, 47, 36, 54, 116, 60, 125, 28, 42, 124, 59, 50, 110, 92, 123, 61, 118, 62, 83, 108, 91, 49, 14, 119, 24, 115, 43, 45, 44, 53, 127, 74, 51, 46, 84, 94, 106, 105, 66, 7, 80, 0, 38, 39, 99, 67, 11, 103, 57, 64, 23, 35, 101, 25, 19, 102, 122, 88, 17, 6, 98, 81, 96, 27, 79, 32, 86, 13, 10, 15, 117, 30, 90, 100, 8, 93, 33, 78, 4, 97, 82, 29, 3, 72, 12, 68, 89, 2, 5, 1, 22, 77, 75, 65, 70, 9, 16, 69, 18, 71, 20, 73, 76], [58, 104, 37, 109, 107, 53, 127, 34, 126, 57, 50, 31, 120, 60, 121, 41, 63, 59, 114, 115, 118, 54, 48, 123, 113, 116, 108, 111, 110, 52, 49, 21, 61, 119, 51, 112, 95, 45, 55, 47, 44, 103, 46, 106, 40, 43, 124, 105, 87, 26, 39, 36, 62, 125, 92, 42, 117, 77, 17, 96, 28, 122, 56, 82, 97, 102, 38, 24, 18, 35, 15, 30, 85, 32, 83, 90, 86, 99, 69, 94, 33, 101, 25, 100, 65, 98, 27, 79, 74, 89, 84, 23, 13, 29, 80, 88, 93, 4, 72, 7, 67, 14, 91, 1, 5, 19, 81, 6, 64, 66, 12, 3, 10, 22, 11, 8, 71, 9, 20, 78, 68, 75, 70, 0, 2, 16, 76, 73], [104, 58, 34, 56, 31, 126, 26, 28, 47, 52, 23, 127, 84, 92, 41, 82, 83, 122, 38, 87, 103, 80, 115, 94, 35, 114, 48, 109, 14, 53, 54, 95, 90, 12, 20, 50, 125, 36, 71, 42, 86, 27, 101, 113, 7, 44, 74, 32, 102, 118, 119, 37, 96, 121, 60, 97, 30, 88, 111, 106, 62, 55, 24, 59, 29, 1, 124, 18, 57, 78, 99, 112, 45, 22, 105, 93, 9, 63, 117, 116, 120, 25, 33, 51, 123, 6, 49, 3, 110, 89, 19, 100, 85, 43, 21, 17, 81, 98, 108, 39, 16, 5, 46, 61, 91, 72, 76, 11, 77, 10, 69, 8, 15, 40, 107, 13, 68, 79, 73, 65, 4, 67, 0, 75, 66, 70, 2, 64]], "model.layers.31.self_attn.k_proj": [[104, 121, 34, 25, 47, 1, 72, 6, 21, 110, 51, 0, 15, 11, 13, 18, 112, 113, 4, 9, 64, 60, 44, 41, 111, 31, 87, 54, 30, 116, 66, 45, 107, 114, 67, 120, 123, 19, 10, 105, 109, 117, 53, 80, 58, 71, 92, 61, 98, 57, 17, 62, 23, 124, 55, 27, 42, 14, 22, 103, 63, 56, 3, 7, 65, 102, 69, 5, 84, 115, 16, 78, 50, 106, 49, 46, 43, 90, 122, 28, 100, 101, 119, 94, 99, 91, 125, 24, 79, 93, 52, 59, 26, 36, 33, 118, 81, 76, 97, 108, 96, 88, 68, 74, 83, 75, 12, 39, 95, 126, 8, 32, 89, 38, 70, 48, 29, 127, 37, 40, 35, 86, 20, 82, 2, 73, 77, 85], [41, 49, 123, 117, 64, 50, 113, 25, 87, 95, 44, 53, 93, 7, 65, 33, 35, 82, 69, 16, 21, 3, 45, 108, 10, 118, 8, 77, 54, 78, 12, 121, 68, 66, 127, 63, 115, 43, 120, 58, 47, 83, 106, 91, 42, 57, 111, 2, 0, 99, 110, 36, 124, 11, 55, 109, 94, 97, 125, 112, 102, 116, 61, 107, 101, 59, 37, 31, 32, 51, 119, 19, 126, 62, 6, 60, 52, 100, 84, 40, 34, 46, 39, 114, 98, 26, 9, 15, 122, 24, 38, 105, 103, 70, 28, 88, 22, 56, 73, 48, 27, 29, 5, 89, 86, 104, 75, 17, 30, 13, 96, 79, 90, 85, 76, 92, 20, 1, 81, 14, 4, 67, 80, 23, 18, 72, 71, 74], [104, 121, 45, 98, 109, 0, 86, 65, 83, 89, 15, 67, 73, 56, 6, 17, 69, 50, 12, 53, 120, 93, 114, 55, 92, 61, 125, 110, 59, 31, 49, 71, 118, 2, 62, 58, 116, 126, 124, 11, 26, 57, 52, 88, 37, 123, 119, 47, 122, 48, 42, 108, 106, 30, 111, 54, 100, 51, 43, 115, 113, 77, 105, 102, 4, 127, 63, 60, 112, 97, 44, 29, 117, 103, 66, 28, 85, 7, 107, 64, 46, 20, 27, 33, 16, 90, 23, 41, 13, 101, 99, 96, 39, 38, 21, 95, 32, 18, 36, 74, 94, 91, 76, 68, 10, 35, 75, 70, 25, 82, 87, 72, 3, 80, 81, 24, 84, 14, 22, 19, 78, 79, 5, 9, 1, 8, 34, 40], [43, 93, 96, 48, 87, 16, 12, 127, 61, 78, 9, 36, 49, 19, 21, 66, 51, 0, 25, 119, 68, 58, 82, 65, 39, 5, 114, 54, 72, 50, 111, 10, 79, 3, 47, 56, 8, 106, 110, 120, 75, 116, 70, 105, 53, 107, 4, 33, 6, 63, 20, 55, 115, 108, 125, 44, 71, 32, 35, 91, 46, 17, 62, 27, 76, 126, 37, 123, 31, 102, 24, 97, 42, 57, 118, 121, 117, 98, 40, 7, 26, 18, 52, 113, 1, 59, 73, 11, 99, 88, 109, 122, 34, 41, 92, 60, 64, 74, 112, 124, 2, 45, 101, 95, 67, 103, 38, 94, 104, 13, 29, 89, 30, 77, 80, 86, 22, 83, 90, 28, 81, 84, 23, 15, 14, 100, 85, 69], [103, 58, 98, 115, 50, 45, 77, 83, 73, 81, 91, 114, 6, 11, 15, 24, 108, 29, 25, 99, 27, 4, 64, 106, 2, 31, 65, 30, 71, 127, 3, 95, 110, 28, 82, 54, 117, 57, 113, 35, 63, 107, 93, 96, 105, 49, 42, 90, 88, 84, 125, 22, 111, 119, 120, 43, 48, 126, 53, 59, 60, 118, 124, 102, 51, 46, 47, 39, 21, 68, 123, 26, 112, 122, 61, 14, 109, 5, 44, 41, 40, 52, 97, 80, 23, 69, 87, 86, 85, 37, 56, 33, 100, 116, 38, 8, 36, 101, 34, 92, 104, 94, 89, 62, 79, 55, 74, 121, 12, 16, 32, 70, 17, 10, 76, 66, 19, 20, 78, 18, 72, 67, 1, 13, 75, 7, 9, 0], [40, 126, 122, 95, 55, 34, 91, 86, 20, 29, 65, 88, 64, 33, 23, 2, 119, 41, 82, 89, 107, 114, 113, 98, 63, 127, 115, 4, 110, 121, 1, 61, 71, 118, 106, 75, 18, 78, 42, 73, 0, 49, 125, 80, 79, 103, 51, 117, 116, 59, 62, 124, 97, 47, 56, 45, 36, 85, 123, 52, 43, 48, 108, 46, 50, 68, 60, 76, 54, 111, 10, 57, 101, 109, 22, 44, 58, 81, 112, 25, 120, 7, 93, 13, 77, 8, 92, 53, 37, 90, 83, 102, 30, 39, 32, 38, 67, 17, 96, 19, 74, 3, 21, 100, 14, 99, 69, 105, 35, 28, 94, 72, 5, 24, 6, 27, 66, 26, 12, 70, 15, 87, 16, 84, 9, 31, 11, 104], [126, 34, 39, 64, 17, 88, 21, 75, 9, 71, 14, 31, 4, 112, 3, 63, 108, 45, 114, 1, 27, 124, 70, 105, 115, 65, 103, 12, 109, 111, 49, 61, 5, 106, 2, 36, 51, 91, 58, 16, 53, 46, 18, 116, 118, 122, 95, 127, 69, 24, 94, 104, 55, 41, 54, 76, 42, 50, 113, 35, 90, 107, 13, 19, 96, 62, 22, 32, 20, 40, 48, 79, 125, 57, 110, 83, 87, 59, 99, 102, 60, 119, 89, 52, 92, 26, 100, 123, 117, 47, 67, 77, 29, 93, 86, 25, 23, 38, 97, 30, 10, 28, 101, 56, 72, 66, 84, 43, 15, 37, 120, 82, 33, 121, 8, 80, 44, 98, 74, 68, 0, 78, 7, 81, 11, 73, 6, 85], [40, 58, 56, 98, 95, 87, 92, 52, 90, 26, 30, 101, 84, 32, 6, 66, 45, 80, 11, 64, 21, 41, 106, 18, 51, 28, 9, 54, 82, 60, 48, 59, 126, 112, 118, 100, 61, 12, 25, 79, 116, 88, 125, 122, 63, 110, 111, 113, 78, 39, 3, 8, 62, 127, 20, 43, 74, 119, 49, 114, 13, 72, 29, 47, 24, 50, 121, 108, 46, 97, 16, 76, 17, 124, 53, 123, 44, 104, 120, 109, 89, 115, 55, 102, 105, 27, 96, 57, 14, 1, 38, 4, 91, 73, 7, 117, 85, 65, 10, 94, 77, 22, 33, 83, 15, 42, 0, 93, 36, 103, 35, 19, 99, 37, 75, 5, 86, 81, 23, 107, 67, 70, 69, 68, 71, 2, 31, 34]], "model.layers.31.self_attn.qk_proj": [[121, 126, 58, 43, 122, 49, 109, 50, 55, 115, 123, 103, 56, 104, 117, 45, 93, 89, 40, 61, 98, 105, 85, 21, 107, 25, 127, 23, 34, 41, 51, 114, 64, 118, 39, 95, 111, 0, 31, 53, 73, 9, 48, 108, 52, 4, 29, 87, 78, 27, 82, 12, 14, 44, 120, 24, 113, 28, 54, 16, 65, 79, 15, 76, 18, 81, 68, 66, 88, 80, 112, 63, 116, 17, 124, 83, 71, 2, 75, 7, 67, 119, 3, 19, 1, 110, 11, 60, 22, 5, 106, 62, 59, 46, 47, 6, 96, 13, 8, 99, 32, 72, 77, 125, 69, 42, 70, 91, 57, 26, 74, 86, 30, 36, 100, 20, 10, 37, 84, 97, 33, 90, 35, 101, 102, 94, 38, 92], [121, 58, 126, 43, 122, 49, 109, 50, 55, 115, 56, 103, 123, 117, 104, 45, 93, 89, 61, 98, 85, 107, 21, 105, 40, 118, 25, 127, 51, 23, 34, 41, 114, 111, 64, 0, 31, 39, 9, 29, 95, 52, 73, 27, 68, 108, 87, 76, 124, 48, 112, 12, 54, 53, 4, 113, 28, 78, 80, 88, 15, 18, 82, 81, 116, 1, 65, 66, 120, 24, 16, 2, 14, 67, 7, 83, 6, 71, 79, 11, 110, 119, 75, 44, 63, 17, 19, 72, 106, 3, 47, 46, 60, 22, 99, 59, 13, 5, 77, 62, 32, 69, 125, 96, 42, 57, 8, 74, 91, 86, 84, 70, 26, 37, 30, 20, 97, 10, 36, 100, 90, 94, 33, 35, 38, 102, 101, 92], [121, 58, 126, 43, 122, 109, 49, 50, 55, 115, 56, 123, 103, 117, 45, 104, 98, 21, 89, 93, 105, 107, 40, 61, 85, 51, 114, 25, 34, 41, 0, 64, 127, 118, 108, 111, 23, 95, 39, 31, 53, 9, 68, 87, 73, 52, 12, 27, 48, 82, 63, 15, 29, 14, 28, 124, 78, 4, 116, 24, 88, 120, 112, 44, 54, 76, 113, 18, 2, 80, 16, 1, 81, 66, 6, 65, 7, 79, 110, 72, 17, 83, 67, 3, 60, 19, 75, 119, 11, 71, 46, 59, 69, 106, 47, 99, 22, 5, 62, 77, 32, 13, 96, 125, 42, 91, 86, 100, 57, 20, 84, 37, 70, 26, 33, 10, 36, 97, 30, 74, 8, 35, 94, 90, 102, 101, 92, 38], [121, 126, 58, 43, 122, 49, 109, 55, 50, 115, 56, 123, 103, 104, 117, 45, 105, 89, 21, 107, 98, 93, 61, 40, 118, 85, 51, 41, 34, 25, 23, 127, 31, 0, 64, 52, 114, 108, 73, 39, 111, 95, 68, 29, 9, 113, 53, 116, 16, 54, 27, 124, 28, 78, 87, 12, 4, 76, 14, 24, 80, 120, 66, 79, 15, 88, 1, 112, 81, 72, 82, 65, 44, 48, 18, 63, 71, 83, 7, 2, 67, 6, 46, 3, 110, 106, 59, 17, 60, 19, 75, 11, 42, 69, 13, 32, 47, 99, 5, 125, 77, 26, 22, 57, 119, 62, 84, 74, 91, 96, 33, 20, 86, 10, 100, 94, 90, 37, 70, 36, 8, 30, 97, 102, 101, 35, 38, 92], [121, 126, 58, 43, 122, 49, 109, 50, 55, 115, 56, 123, 104, 103, 117, 45, 21, 93, 98, 105, 40, 107, 25, 89, 41, 61, 23, 118, 85, 127, 51, 34, 0, 73, 111, 114, 31, 9, 64, 39, 53, 108, 52, 95, 68, 12, 78, 24, 120, 15, 4, 113, 65, 82, 16, 79, 116, 124, 14, 48, 29, 27, 54, 87, 80, 44, 81, 72, 67, 2, 3, 76, 63, 83, 28, 71, 1, 7, 66, 18, 112, 47, 75, 17, 11, 46, 88, 119, 106, 19, 5, 60, 42, 13, 6, 69, 110, 77, 26, 59, 99, 22, 62, 57, 70, 125, 96, 32, 33, 86, 84, 10, 30, 20, 100, 91, 37, 8, 74, 97, 36, 102, 94, 90, 92, 38, 35, 101], [121, 126, 58, 43, 49, 122, 109, 50, 55, 115, 56, 123, 104, 103, 45, 117, 98, 93, 21, 61, 41, 89, 105, 40, 107, 23, 25, 0, 85, 127, 64, 114, 31, 39, 51, 118, 34, 73, 111, 9, 108, 95, 27, 53, 15, 16, 79, 80, 4, 87, 113, 24, 52, 120, 82, 78, 76, 68, 1, 12, 14, 119, 83, 48, 54, 66, 18, 65, 116, 17, 7, 112, 29, 75, 88, 44, 2, 28, 71, 72, 81, 67, 63, 11, 106, 5, 99, 3, 59, 124, 46, 19, 60, 110, 13, 47, 69, 77, 22, 70, 42, 26, 96, 125, 6, 32, 62, 20, 84, 86, 57, 10, 91, 33, 30, 37, 36, 100, 74, 8, 101, 97, 94, 102, 90, 35, 38, 92], [121, 126, 58, 43, 122, 49, 55, 109, 50, 115, 123, 56, 93, 104, 103, 117, 45, 21, 89, 98, 61, 107, 105, 40, 25, 41, 85, 23, 127, 114, 34, 108, 118, 39, 51, 0, 9, 31, 64, 73, 53, 95, 27, 111, 113, 79, 14, 29, 4, 24, 15, 82, 78, 52, 12, 76, 16, 44, 88, 68, 48, 18, 54, 87, 17, 80, 116, 81, 11, 65, 75, 63, 112, 120, 19, 28, 1, 119, 7, 66, 124, 71, 110, 83, 3, 72, 70, 106, 67, 2, 13, 96, 22, 125, 42, 59, 77, 30, 5, 47, 62, 99, 57, 69, 60, 26, 46, 32, 86, 20, 33, 8, 84, 97, 6, 37, 10, 36, 100, 91, 74, 90, 101, 38, 94, 102, 35, 92], [121, 58, 126, 43, 122, 49, 109, 50, 55, 115, 123, 56, 117, 103, 104, 45, 93, 98, 21, 89, 61, 40, 85, 105, 41, 107, 25, 114, 108, 23, 34, 118, 51, 127, 0, 64, 39, 111, 9, 95, 53, 31, 73, 29, 27, 24, 113, 124, 82, 4, 14, 79, 44, 48, 76, 68, 52, 18, 112, 12, 16, 28, 87, 116, 120, 66, 15, 54, 80, 63, 78, 81, 1, 7, 119, 75, 65, 3, 83, 71, 88, 70, 17, 2, 19, 11, 60, 67, 59, 106, 47, 110, 22, 13, 72, 77, 99, 125, 46, 5, 96, 57, 42, 32, 26, 30, 69, 6, 91, 8, 36, 86, 62, 37, 84, 10, 74, 97, 20, 33, 101, 90, 100, 94, 38, 102, 35, 92], [121, 126, 58, 43, 122, 49, 50, 109, 55, 115, 56, 103, 123, 45, 117, 93, 104, 40, 85, 21, 89, 98, 105, 41, 61, 107, 25, 23, 114, 34, 51, 108, 111, 39, 127, 9, 95, 29, 31, 118, 73, 27, 113, 53, 52, 4, 64, 87, 0, 12, 24, 124, 44, 16, 48, 82, 76, 63, 78, 116, 18, 17, 88, 112, 120, 28, 68, 119, 60, 54, 15, 79, 14, 80, 81, 83, 65, 110, 19, 11, 7, 1, 46, 59, 75, 70, 106, 71, 3, 32, 42, 2, 66, 67, 125, 77, 47, 26, 13, 96, 99, 57, 91, 72, 22, 8, 62, 5, 86, 30, 97, 6, 74, 20, 69, 84, 37, 10, 36, 33, 94, 90, 100, 101, 35, 102, 92, 38], [121, 58, 126, 43, 122, 49, 109, 50, 55, 115, 56, 123, 103, 45, 117, 93, 104, 98, 89, 105, 61, 40, 21, 107, 41, 25, 118, 64, 0, 34, 51, 114, 85, 95, 39, 127, 53, 23, 9, 108, 31, 73, 27, 111, 112, 87, 68, 29, 113, 52, 120, 15, 78, 12, 24, 4, 44, 80, 48, 88, 16, 2, 14, 110, 82, 3, 67, 7, 116, 71, 63, 1, 65, 76, 28, 54, 66, 81, 18, 11, 106, 79, 83, 60, 75, 124, 17, 119, 19, 70, 46, 59, 8, 99, 13, 32, 42, 6, 47, 5, 77, 69, 22, 62, 125, 57, 72, 30, 20, 26, 91, 74, 86, 96, 84, 10, 97, 37, 36, 94, 90, 100, 33, 101, 35, 38, 102, 92], [121, 58, 126, 43, 122, 55, 49, 109, 50, 115, 56, 123, 103, 104, 117, 45, 93, 98, 21, 105, 61, 41, 89, 0, 40, 107, 127, 23, 118, 64, 34, 25, 51, 114, 9, 73, 85, 39, 31, 108, 52, 95, 111, 4, 16, 78, 15, 12, 113, 87, 14, 76, 80, 24, 124, 68, 27, 44, 53, 79, 1, 82, 120, 29, 65, 67, 83, 8, 116, 54, 88, 48, 18, 71, 7, 63, 11, 28, 2, 66, 3, 75, 60, 81, 5, 17, 110, 46, 19, 106, 112, 13, 70, 119, 59, 42, 6, 32, 77, 99, 69, 62, 22, 57, 86, 26, 47, 91, 125, 84, 74, 10, 20, 97, 96, 33, 72, 37, 100, 30, 36, 94, 90, 35, 102, 101, 92, 38], [121, 126, 58, 43, 122, 49, 109, 55, 50, 115, 56, 123, 103, 104, 117, 45, 93, 89, 21, 98, 85, 105, 61, 107, 40, 41, 25, 51, 118, 23, 114, 127, 34, 95, 0, 73, 64, 39, 31, 9, 111, 108, 53, 113, 27, 29, 12, 52, 28, 24, 44, 78, 124, 14, 87, 16, 4, 82, 15, 79, 80, 68, 65, 76, 54, 48, 1, 11, 81, 18, 120, 88, 116, 8, 63, 2, 17, 71, 75, 7, 83, 110, 6, 112, 119, 66, 46, 67, 13, 19, 60, 3, 42, 77, 57, 59, 32, 47, 99, 106, 5, 26, 69, 22, 125, 91, 86, 20, 96, 62, 70, 100, 84, 97, 74, 37, 30, 72, 33, 10, 36, 102, 90, 101, 94, 38, 35, 92], [121, 126, 58, 43, 122, 49, 109, 50, 55, 115, 56, 123, 103, 117, 45, 104, 93, 89, 98, 40, 21, 107, 25, 61, 105, 85, 95, 51, 41, 127, 114, 118, 39, 34, 108, 29, 23, 111, 27, 31, 53, 64, 112, 73, 113, 0, 87, 9, 44, 119, 76, 28, 4, 78, 82, 54, 24, 79, 120, 18, 80, 14, 71, 16, 52, 68, 15, 12, 48, 17, 124, 2, 88, 8, 1, 65, 63, 19, 6, 11, 7, 81, 110, 67, 83, 75, 116, 3, 60, 59, 106, 66, 47, 46, 32, 22, 125, 13, 42, 77, 57, 99, 30, 69, 96, 5, 26, 84, 97, 70, 91, 10, 86, 74, 36, 62, 20, 100, 37, 33, 90, 72, 94, 38, 35, 101, 102, 92], [121, 126, 58, 43, 122, 49, 109, 50, 55, 115, 56, 123, 104, 103, 117, 45, 93, 21, 89, 98, 61, 64, 40, 105, 107, 51, 127, 34, 0, 25, 41, 118, 85, 31, 73, 114, 95, 23, 39, 9, 52, 111, 4, 53, 108, 29, 113, 76, 87, 27, 78, 16, 1, 28, 82, 44, 71, 48, 8, 12, 24, 68, 18, 67, 112, 110, 81, 3, 120, 88, 65, 14, 116, 80, 15, 2, 11, 6, 60, 79, 66, 83, 54, 63, 19, 7, 106, 124, 75, 17, 125, 119, 47, 13, 46, 99, 5, 69, 59, 77, 42, 91, 62, 32, 96, 86, 10, 57, 97, 22, 26, 74, 84, 37, 20, 70, 72, 36, 30, 100, 33, 101, 94, 90, 102, 35, 38, 92], [121, 126, 58, 43, 122, 49, 109, 50, 55, 115, 123, 56, 117, 103, 104, 45, 21, 93, 98, 0, 105, 64, 40, 107, 25, 34, 41, 89, 51, 73, 23, 61, 114, 118, 53, 85, 9, 127, 31, 27, 95, 111, 76, 68, 113, 4, 39, 108, 52, 29, 16, 78, 82, 67, 24, 65, 18, 66, 80, 54, 7, 44, 87, 15, 79, 14, 71, 116, 1, 63, 6, 83, 28, 12, 8, 48, 81, 11, 112, 88, 2, 120, 17, 3, 106, 75, 119, 124, 5, 60, 19, 22, 110, 47, 99, 46, 59, 13, 69, 77, 42, 125, 84, 26, 32, 70, 57, 10, 91, 62, 74, 96, 37, 97, 20, 72, 30, 86, 100, 90, 36, 33, 94, 38, 102, 101, 92, 35], [121, 126, 58, 43, 122, 49, 50, 109, 55, 115, 56, 103, 123, 45, 117, 104, 93, 98, 21, 89, 85, 118, 107, 114, 61, 25, 23, 34, 105, 41, 40, 127, 73, 51, 31, 108, 95, 9, 0, 111, 64, 39, 29, 52, 4, 53, 87, 27, 76, 14, 79, 44, 24, 1, 16, 78, 120, 68, 28, 82, 112, 12, 15, 124, 48, 116, 65, 113, 80, 81, 71, 63, 18, 54, 7, 88, 83, 67, 11, 66, 106, 119, 110, 60, 75, 19, 17, 8, 2, 47, 32, 22, 59, 57, 13, 3, 26, 77, 6, 99, 70, 46, 125, 5, 42, 69, 62, 72, 91, 96, 33, 20, 84, 74, 10, 30, 86, 97, 37, 100, 36, 101, 90, 102, 35, 38, 94, 92], [121, 58, 126, 43, 122, 49, 109, 50, 55, 115, 56, 123, 117, 103, 104, 45, 93, 89, 85, 21, 40, 107, 25, 105, 23, 98, 61, 31, 34, 127, 39, 51, 114, 95, 118, 64, 41, 111, 0, 108, 29, 9, 27, 73, 87, 53, 28, 82, 52, 112, 68, 88, 76, 113, 24, 15, 48, 1, 81, 14, 120, 44, 18, 79, 4, 71, 78, 16, 106, 12, 80, 54, 2, 110, 66, 67, 17, 63, 19, 65, 116, 11, 119, 83, 75, 7, 60, 70, 32, 125, 13, 124, 3, 59, 47, 77, 91, 42, 96, 8, 57, 62, 99, 97, 72, 26, 46, 22, 69, 37, 5, 20, 74, 30, 100, 6, 86, 10, 33, 84, 90, 35, 36, 38, 101, 94, 102, 92], [121, 58, 126, 43, 122, 109, 49, 50, 55, 115, 56, 123, 103, 117, 45, 104, 93, 105, 98, 89, 40, 21, 107, 61, 85, 51, 0, 34, 64, 41, 118, 114, 25, 95, 23, 39, 73, 111, 31, 9, 127, 108, 53, 76, 52, 28, 27, 29, 87, 4, 116, 82, 44, 112, 78, 15, 113, 120, 48, 80, 24, 54, 106, 16, 124, 110, 18, 1, 2, 63, 68, 71, 88, 14, 12, 11, 83, 19, 17, 81, 66, 119, 65, 79, 67, 7, 46, 60, 3, 70, 77, 72, 47, 59, 99, 75, 32, 5, 22, 69, 91, 125, 96, 62, 42, 13, 57, 6, 84, 86, 26, 8, 30, 10, 36, 37, 74, 97, 33, 20, 94, 90, 100, 102, 35, 38, 101, 92], [121, 126, 58, 43, 49, 55, 122, 50, 109, 115, 123, 56, 103, 117, 104, 45, 105, 98, 93, 40, 21, 25, 107, 89, 61, 51, 41, 85, 0, 95, 111, 23, 34, 31, 39, 9, 64, 114, 118, 73, 29, 127, 53, 27, 4, 87, 113, 52, 44, 14, 78, 71, 116, 1, 76, 28, 24, 108, 16, 80, 48, 54, 66, 124, 82, 112, 68, 15, 18, 88, 3, 12, 2, 83, 81, 46, 79, 120, 17, 70, 47, 67, 11, 106, 7, 119, 63, 65, 72, 110, 59, 75, 42, 60, 125, 19, 32, 13, 77, 99, 26, 5, 62, 57, 91, 69, 20, 22, 84, 96, 86, 74, 6, 8, 37, 10, 97, 100, 36, 30, 90, 33, 101, 94, 102, 38, 35, 92], [121, 58, 126, 43, 122, 49, 109, 55, 50, 115, 56, 123, 104, 117, 103, 45, 93, 98, 21, 105, 25, 89, 40, 61, 107, 85, 41, 23, 34, 64, 51, 127, 31, 118, 0, 9, 39, 114, 95, 73, 111, 27, 68, 16, 79, 108, 4, 53, 82, 29, 52, 76, 87, 48, 44, 24, 12, 65, 78, 88, 28, 116, 14, 66, 46, 71, 18, 80, 113, 3, 11, 15, 112, 7, 2, 72, 1, 54, 124, 70, 106, 119, 81, 17, 120, 83, 67, 75, 63, 47, 60, 19, 99, 110, 5, 32, 77, 13, 22, 69, 59, 42, 26, 125, 57, 62, 84, 74, 10, 96, 91, 86, 30, 6, 33, 20, 37, 97, 8, 90, 100, 36, 38, 102, 94, 35, 101, 92], [121, 126, 58, 43, 109, 49, 122, 50, 55, 115, 56, 123, 103, 117, 45, 104, 93, 105, 61, 98, 40, 21, 89, 41, 107, 114, 51, 25, 127, 85, 118, 34, 23, 0, 95, 64, 31, 111, 53, 52, 9, 39, 108, 44, 73, 48, 27, 87, 76, 78, 82, 79, 16, 24, 29, 113, 4, 14, 1, 68, 17, 120, 28, 12, 110, 81, 88, 15, 71, 72, 124, 2, 54, 80, 11, 112, 66, 60, 63, 116, 65, 18, 46, 7, 106, 3, 83, 67, 47, 99, 62, 19, 59, 119, 70, 22, 75, 5, 125, 13, 69, 77, 100, 91, 6, 57, 96, 42, 84, 32, 26, 37, 20, 86, 33, 74, 36, 30, 10, 8, 102, 97, 94, 90, 101, 35, 38, 92], [121, 126, 58, 43, 122, 55, 49, 109, 50, 115, 56, 123, 117, 45, 103, 104, 93, 105, 98, 61, 64, 51, 89, 95, 21, 40, 41, 107, 85, 0, 25, 34, 118, 23, 31, 29, 114, 53, 108, 39, 9, 111, 52, 73, 127, 44, 87, 4, 27, 124, 16, 113, 116, 14, 68, 79, 76, 18, 67, 82, 120, 66, 78, 2, 28, 112, 63, 12, 48, 1, 106, 80, 72, 15, 24, 17, 88, 54, 65, 7, 47, 83, 110, 81, 71, 119, 3, 19, 11, 75, 6, 46, 125, 60, 57, 69, 99, 59, 5, 13, 77, 32, 70, 42, 26, 22, 62, 84, 96, 86, 91, 10, 97, 20, 8, 74, 33, 37, 30, 90, 101, 100, 36, 94, 102, 38, 35, 92], [121, 58, 126, 43, 122, 109, 49, 50, 55, 115, 56, 123, 117, 103, 45, 104, 93, 105, 98, 21, 40, 89, 61, 25, 107, 51, 41, 34, 85, 114, 118, 127, 95, 23, 31, 39, 111, 0, 108, 73, 64, 53, 29, 9, 27, 52, 87, 4, 113, 44, 28, 112, 76, 78, 14, 48, 79, 82, 88, 3, 80, 54, 71, 116, 16, 15, 110, 17, 18, 120, 1, 24, 7, 12, 124, 47, 63, 68, 67, 83, 2, 65, 81, 59, 60, 119, 19, 106, 11, 72, 6, 75, 66, 5, 125, 46, 77, 13, 69, 32, 99, 57, 22, 26, 96, 42, 62, 91, 20, 86, 74, 97, 84, 8, 10, 30, 70, 37, 33, 36, 100, 90, 38, 35, 94, 92, 101, 102], [121, 58, 126, 43, 122, 49, 109, 55, 50, 115, 123, 56, 103, 117, 104, 45, 93, 21, 98, 61, 40, 105, 23, 89, 25, 41, 107, 118, 34, 85, 127, 31, 51, 0, 111, 108, 114, 64, 95, 73, 39, 9, 52, 27, 53, 24, 113, 4, 76, 18, 15, 12, 54, 80, 79, 87, 120, 44, 16, 68, 78, 82, 48, 29, 28, 112, 6, 1, 14, 81, 71, 63, 88, 2, 65, 7, 17, 75, 66, 11, 124, 83, 116, 119, 72, 106, 47, 19, 46, 3, 67, 110, 69, 22, 59, 60, 99, 42, 125, 77, 32, 13, 26, 62, 5, 91, 74, 96, 8, 57, 86, 84, 100, 20, 33, 97, 10, 36, 30, 37, 70, 90, 94, 101, 102, 35, 38, 92], [121, 58, 126, 43, 122, 49, 109, 55, 50, 115, 56, 123, 103, 117, 104, 45, 93, 98, 40, 89, 21, 105, 41, 61, 85, 107, 51, 25, 127, 23, 34, 95, 118, 114, 0, 39, 9, 73, 31, 64, 53, 108, 111, 27, 52, 4, 113, 79, 87, 44, 48, 78, 12, 80, 28, 68, 24, 14, 76, 54, 3, 120, 29, 82, 15, 81, 110, 66, 65, 116, 18, 75, 63, 1, 112, 124, 7, 6, 88, 16, 11, 71, 17, 2, 83, 119, 106, 67, 19, 42, 60, 46, 22, 99, 47, 77, 62, 57, 96, 72, 8, 13, 59, 32, 5, 69, 125, 91, 70, 30, 20, 26, 97, 74, 86, 84, 37, 10, 33, 36, 100, 90, 101, 38, 102, 35, 94, 92], [121, 126, 58, 43, 122, 49, 109, 50, 55, 115, 56, 123, 103, 45, 117, 104, 89, 93, 98, 40, 105, 21, 85, 34, 107, 25, 114, 41, 23, 118, 95, 127, 51, 64, 31, 61, 39, 111, 29, 108, 0, 27, 112, 53, 73, 28, 87, 44, 9, 12, 113, 52, 68, 18, 78, 88, 24, 15, 82, 124, 116, 120, 54, 48, 14, 76, 60, 79, 16, 66, 1, 7, 75, 81, 65, 80, 4, 17, 63, 110, 11, 19, 2, 71, 106, 6, 119, 47, 59, 83, 99, 67, 3, 13, 46, 77, 42, 8, 125, 96, 22, 32, 57, 91, 5, 62, 70, 69, 20, 86, 37, 30, 74, 72, 84, 26, 36, 97, 100, 94, 38, 10, 33, 101, 90, 35, 102, 92], [121, 58, 126, 43, 122, 49, 109, 50, 55, 115, 56, 123, 103, 117, 104, 45, 93, 40, 98, 21, 89, 105, 127, 61, 25, 107, 23, 85, 34, 41, 111, 114, 51, 108, 73, 118, 39, 31, 95, 0, 29, 9, 64, 48, 28, 52, 68, 53, 27, 54, 112, 87, 24, 82, 12, 120, 113, 4, 78, 76, 44, 88, 16, 14, 79, 17, 80, 1, 15, 18, 116, 110, 81, 83, 63, 65, 2, 75, 7, 47, 59, 46, 66, 19, 71, 60, 8, 67, 119, 11, 106, 3, 13, 6, 42, 32, 99, 62, 125, 124, 5, 77, 26, 70, 69, 57, 86, 96, 37, 30, 91, 33, 72, 22, 97, 20, 84, 74, 100, 10, 36, 102, 94, 35, 90, 38, 101, 92], [121, 58, 126, 43, 122, 49, 109, 50, 55, 115, 56, 123, 103, 117, 104, 45, 93, 98, 21, 61, 89, 40, 105, 41, 85, 107, 23, 0, 25, 51, 34, 114, 64, 95, 31, 127, 111, 73, 118, 39, 9, 113, 108, 29, 24, 53, 52, 27, 68, 4, 120, 76, 79, 16, 18, 82, 28, 87, 12, 78, 44, 54, 116, 1, 124, 14, 15, 80, 7, 71, 48, 46, 3, 8, 66, 2, 75, 88, 67, 83, 112, 81, 110, 65, 63, 11, 17, 60, 70, 106, 99, 125, 47, 19, 22, 5, 119, 13, 59, 42, 77, 57, 32, 69, 62, 26, 74, 6, 37, 10, 97, 91, 96, 84, 100, 72, 20, 86, 33, 30, 36, 90, 94, 101, 102, 35, 38, 92], [121, 126, 58, 43, 122, 49, 109, 55, 50, 115, 56, 123, 103, 104, 117, 45, 93, 98, 105, 89, 21, 64, 40, 61, 85, 0, 107, 51, 25, 114, 118, 23, 41, 127, 9, 53, 34, 111, 95, 31, 73, 39, 108, 48, 113, 1, 68, 12, 52, 27, 44, 54, 87, 4, 29, 120, 14, 80, 18, 78, 70, 24, 66, 7, 8, 79, 28, 82, 15, 76, 63, 16, 112, 67, 71, 81, 124, 75, 46, 65, 88, 2, 3, 17, 110, 116, 69, 83, 11, 47, 106, 119, 19, 125, 59, 60, 77, 57, 32, 13, 99, 42, 62, 5, 26, 96, 10, 22, 33, 84, 86, 74, 97, 100, 20, 37, 91, 30, 36, 6, 72, 90, 102, 101, 94, 35, 92, 38], [121, 58, 126, 43, 122, 49, 50, 55, 109, 115, 56, 123, 103, 104, 117, 45, 93, 21, 40, 105, 98, 89, 61, 85, 127, 23, 41, 25, 107, 118, 114, 51, 34, 31, 39, 95, 73, 111, 9, 108, 64, 53, 0, 27, 29, 48, 12, 44, 79, 52, 82, 68, 87, 24, 54, 15, 18, 14, 113, 80, 78, 28, 16, 76, 120, 81, 70, 65, 112, 17, 116, 4, 88, 124, 75, 8, 83, 7, 110, 3, 1, 71, 11, 60, 19, 66, 63, 59, 106, 125, 47, 119, 2, 67, 13, 57, 69, 46, 77, 99, 5, 32, 96, 22, 42, 62, 20, 30, 26, 86, 10, 74, 91, 84, 97, 100, 33, 37, 36, 72, 94, 6, 90, 101, 102, 38, 35, 92], [121, 126, 58, 43, 122, 49, 109, 55, 50, 115, 56, 123, 104, 103, 117, 45, 98, 93, 89, 105, 61, 21, 40, 107, 41, 23, 25, 51, 127, 64, 34, 85, 0, 114, 39, 118, 31, 108, 73, 111, 9, 95, 4, 12, 53, 15, 87, 44, 29, 27, 113, 28, 79, 68, 52, 1, 78, 48, 14, 112, 80, 24, 16, 120, 116, 63, 76, 18, 65, 70, 2, 7, 54, 71, 67, 66, 81, 8, 82, 110, 11, 83, 124, 17, 3, 19, 47, 88, 75, 106, 46, 119, 125, 13, 60, 69, 59, 32, 5, 77, 99, 57, 62, 22, 96, 91, 6, 42, 86, 26, 30, 97, 72, 37, 20, 74, 84, 10, 90, 36, 94, 33, 100, 101, 102, 38, 35, 92], [121, 126, 58, 43, 122, 49, 50, 109, 55, 115, 123, 56, 103, 117, 104, 45, 93, 40, 89, 85, 105, 107, 98, 61, 21, 127, 23, 25, 114, 34, 118, 41, 95, 111, 51, 39, 9, 108, 29, 73, 31, 52, 53, 27, 12, 0, 24, 64, 18, 16, 120, 14, 113, 78, 87, 79, 54, 44, 112, 48, 4, 76, 15, 82, 80, 88, 68, 63, 28, 17, 1, 75, 81, 83, 119, 116, 8, 71, 11, 19, 7, 65, 106, 124, 2, 67, 3, 110, 66, 60, 70, 59, 5, 77, 13, 47, 96, 22, 62, 32, 99, 69, 46, 6, 42, 125, 57, 91, 86, 20, 26, 30, 72, 84, 74, 97, 10, 37, 36, 33, 94, 90, 100, 101, 35, 38, 102, 92]]}
diff --git a/test/srt/entrypoints/http_server/test_abort_request.py b/test/srt/entrypoints/http_server/test_abort_request.py
new file mode 100644
index 000000000..7a366f244
--- /dev/null
+++ b/test/srt/entrypoints/http_server/test_abort_request.py
@@ -0,0 +1,206 @@
+"""
+Integration test for abort_request functionality with a SGLang server.
+
+Run with:
+    python -m unittest sglang.test.srt.entrypoints.http_server.test_abort_request -v
+"""
+
+import threading
+import time
+import unittest
+from typing import Optional
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestAbortRequest(CustomTestCase):
+    """Integration test class for abort request functionality."""
+
+    model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+    base_url = DEFAULT_URL_FOR_TEST
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server."""
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--disable-cuda-graph"],
+        )
+
+        cls.completion_url = f"{cls.base_url}/generate"
+        cls.abort_url = f"{cls.base_url}/abort_request"
+        cls.health_url = f"{cls.base_url}/health"
+
+        print(f"Server started at {cls.base_url}")
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up the server."""
+        kill_process_tree(cls.process.pid)
+
+    def _send_completion_request(
+        self,
+        text: str,
+        request_id: str,
+        max_tokens: int = 50,
+        temperature: float = 0.8,
+        stream: bool = True,
+    ) -> requests.Response:
+        """Send a completion request to the server."""
+        payload = {
+            "text": text,
+            "sampling_params": {
+                "max_new_tokens": max_tokens,
+                "temperature": temperature,
+            },
+            "stream": stream,
+            "rid": request_id,
+        }
+
+        response = requests.post(
+            self.completion_url,
+            json=payload,
+            headers={"Content-Type": "application/json"},
+            timeout=30,
+            stream=stream,
+        )
+
+        return response
+
+    def _send_abort_request(self, request_id: str) -> requests.Response:
+        """Send an abort request."""
+        payload = {"rid": request_id}
+        return requests.post(self.abort_url, json=payload, timeout=10)
+
+    def _check_server_health(self) -> bool:
+        """Check if server is healthy."""
+        try:
+            response = requests.get(self.health_url, timeout=5)
+            return response.status_code == 200
+        except:
+            return False
+
+    def test_abort_during_non_streaming_generation(self):
+        """Test aborting a non-streaming request during generation."""
+        self.assertTrue(self._check_server_health(), "Server should be healthy")
+
+        request_id = "test_abort_non_streaming"
+        completion_result = {}
+
+        def run_completion():
+            response = self._send_completion_request(
+                "Write a detailed essay about artificial intelligence",
+                max_tokens=500,
+                temperature=1,
+                request_id=request_id,
+                stream=False,
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                completion_result["text"] = result.get("text", "")
+                completion_result["finish_reason"] = result.get("meta_info", {}).get(
+                    "finish_reason"
+                )
+
+        completion_thread = threading.Thread(target=run_completion)
+        completion_thread.start()
+        time.sleep(0.1)
+
+        abort_response = self._send_abort_request(request_id)
+        completion_thread.join()
+
+        self.assertEqual(abort_response.status_code, 200)
+        self.assertIsNotNone(completion_result, "Should have completion result")
+        if completion_result:
+            finish_reason_obj = completion_result.get("finish_reason")
+            self.assertIsNotNone(finish_reason_obj, "Should have finish_reason")
+            if finish_reason_obj:
+                self.assertEqual(
+                    finish_reason_obj.get("type"), "abort", "Should be aborted"
+                )
+
+    def test_batch_requests_with_selective_abort(self):
+        """Test multiple concurrent requests with selective abort of one request."""
+        self.assertTrue(self._check_server_health(), "Server should be healthy")
+
+        request_ids = ["batch_test_0", "batch_test_1", "batch_test_2"]
+        abort_target_id = "batch_test_1"
+        completion_results = {}
+        threads = []
+
+        def run_completion(req_id, prompt):
+            response = self._send_completion_request(
+                f"Write a story about {prompt}",
+                max_tokens=100,
+                temperature=0.8,
+                request_id=req_id,
+                stream=False,
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                completion_results[req_id] = {
+                    "text": result.get("text", ""),
+                    "finish_reason": result.get("meta_info", {}).get("finish_reason"),
+                }
+
+        # Start all requests
+        prompts = ["a knight's adventure", "a space discovery", "a chef's restaurant"]
+        for i, req_id in enumerate(request_ids):
+            thread = threading.Thread(target=run_completion, args=(req_id, prompts[i]))
+            threads.append(thread)
+            thread.start()
+
+        # Abort one request
+        time.sleep(0.1)
+        abort_response = self._send_abort_request(abort_target_id)
+
+        # Wait for completion
+        for thread in threads:
+            thread.join(timeout=30)
+
+        # Verify results
+        self.assertEqual(abort_response.status_code, 200)
+
+        # Check aborted request
+        aborted_result = completion_results.get(abort_target_id)
+        self.assertIsNotNone(
+            aborted_result, f"Aborted request {abort_target_id} should have result"
+        )
+        if aborted_result:
+            aborted_finish_reason = aborted_result.get("finish_reason")
+            self.assertIsNotNone(
+                aborted_finish_reason, "Aborted request should have finish_reason"
+            )
+            if aborted_finish_reason:
+                self.assertEqual(aborted_finish_reason.get("type"), "abort")
+
+        # Check other requests completed normally
+        normal_completions = 0
+        for req_id in request_ids:
+            if req_id != abort_target_id and req_id in completion_results:
+                result = completion_results[req_id]
+                if result:
+                    finish_reason = result.get("finish_reason")
+                    if finish_reason and finish_reason.get("type") == "length":
+                        normal_completions += 1
+
+        self.assertEqual(
+            normal_completions, 2, "Other 2 requests should complete normally"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2, warnings="ignore")
diff --git a/test/srt/ep/test_deepep_internode.py b/test/srt/ep/test_deepep_internode.py
new file mode 100644
index 000000000..1e4239606
--- /dev/null
+++ b/test/srt/ep/test_deepep_internode.py
@@ -0,0 +1,445 @@
+# Copy from deepseek-ai/DeepEP/tests/test_internode.py
+
+import os
+import time
+
+# noinspection PyUnresolvedReferences
+import deep_ep
+
+# Test compatibility with low latency functions
+import test_deepep_low_latency
+import torch
+import torch.distributed as dist
+
+from sglang.test.test_deepep_utils import (
+    bench,
+    calc_diff,
+    create_grouped_scores,
+    init_dist,
+    inplace_unique,
+    per_token_cast_back,
+    per_token_cast_to_fp8,
+)
+
+
+def test_main(
+    num_sms: int,
+    local_rank: int,
+    num_local_ranks: int,
+    num_ranks: int,
+    num_nodes: int,
+    rank: int,
+    buffer: deep_ep.Buffer,
+    group: dist.ProcessGroup,
+):
+    # Settings
+    num_tokens, hidden, num_topk_groups, num_topk, num_experts = (
+        4096,
+        7168,
+        min(num_nodes, 4),
+        8,
+        (256 // num_ranks) * num_ranks,
+    )
+    assert num_experts % num_ranks == 0 and num_local_ranks == 8
+    if local_rank == 0:
+        print(
+            f"[config] num_tokens={num_tokens}, hidden={hidden}, num_topk_groups={num_topk_groups}, num_topk={num_topk}",
+            flush=True,
+        )
+
+    # Random data
+    x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device="cuda") * rank
+    x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device="cuda")
+    x_e4m3 = per_token_cast_to_fp8(x)
+    scores = (
+        torch.randn((num_tokens, num_experts), dtype=torch.float32, device="cuda").abs()
+        + 1
+    )
+    group_scores = scores.view(num_tokens, num_nodes, -1).amax(dim=-1)
+    group_idx = torch.topk(
+        group_scores, k=num_topk_groups, dim=-1, sorted=False
+    ).indices
+    masked_scores = create_grouped_scores(scores, group_idx, num_nodes)
+    topk_idx = torch.topk(masked_scores, num_topk, dim=-1, largest=True, sorted=False)[
+        1
+    ]
+    topk_weights = (
+        torch.ones((num_tokens, num_topk), dtype=torch.float32, device="cuda") * rank
+    )
+    topk_weights_pure_rand = torch.randn(
+        (num_tokens, num_topk), dtype=torch.float32, device="cuda"
+    )
+    rank_idx = topk_idx // (num_experts // num_ranks)
+    rank_idx.masked_fill_(topk_idx == -1, -1)
+    inplace_unique(rank_idx, num_ranks)
+    rdma_rank_idx = rank_idx // num_local_ranks
+    rdma_rank_idx.masked_fill_(rank_idx == -1, -1)
+    inplace_unique(rdma_rank_idx, num_nodes)
+
+    # RDMA dispatch counts
+    rdma_idx = topk_idx // (num_experts // num_nodes)
+    rdma_idx.masked_fill_(topk_idx == -1, -1)
+    inplace_unique(rdma_idx, num_nodes)
+    num_rdma_token_sent = rdma_idx.ne(-1).sum().item()
+
+    # Expert meta
+    num_tokens_per_expert = torch.zeros((num_experts,), dtype=torch.int, device="cuda")
+    for i in range(num_experts):
+        num_tokens_per_expert[i] = (topk_idx == i).sum()
+    gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
+    dist.all_reduce(gbl_num_tokens_per_expert, group=group)
+
+    # Rank layout meta
+    num_tokens_per_rank = torch.empty((num_ranks,), dtype=torch.int, device="cuda")
+    num_tokens_per_rdma_rank = torch.empty((num_nodes,), dtype=torch.int, device="cuda")
+    token_idx_in_rank = torch.full(
+        (num_ranks, num_tokens), -1, dtype=torch.long, device="cuda"
+    )
+    for i in range(num_ranks):
+        num_tokens_per_rank[i] = (rank_idx == i).sum()
+        token_sel = (rank_idx == i).max(dim=-1)[0]
+        count = token_sel.sum().item()
+        tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
+        tokens[:count] = torch.sort(tokens[:count])[0]
+        token_idx_in_rank[i][tokens[:count]] = torch.arange(
+            count, dtype=torch.long, device="cuda"
+        )
+    for i in range(num_nodes):
+        num_tokens_per_rdma_rank[i] = (rdma_rank_idx == i).sum()
+    token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
+    is_token_in_rank = token_idx_in_rank >= 0
+    gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
+    dist.all_reduce(gbl_num_tokens_per_rank, group=group)
+
+    (
+        ref_num_tokens_per_rank,
+        ref_num_tokens_per_rdma_rank,
+        ref_num_tokens_per_expert,
+        ref_is_token_in_rank,
+        _,
+    ) = buffer.get_dispatch_layout(topk_idx, num_experts)
+    assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
+    assert torch.allclose(ref_num_tokens_per_rdma_rank, num_tokens_per_rdma_rank)
+    assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
+    assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
+    t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
+    if local_rank == 0:
+        print(f"[layout] Kernel performance: {t * 1000:.3f} ms", flush=True)
+        print("", flush=True)
+    group.barrier()
+    time.sleep(1)
+
+    # Config
+    rdma_buffer_size, nvl_buffer_size = 128, (720 if num_ranks in (144, 160) else 512)
+    config = deep_ep.Config(num_sms, 8, nvl_buffer_size, 16, rdma_buffer_size)
+
+    # Test dispatch
+    # noinspection PyShadowingNames
+    def check_data(check_x, recv_gbl_rank_prefix_sum):
+        assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
+        check_start = 0
+        for i in range(num_ranks):
+            check_end = recv_gbl_rank_prefix_sum[i].item()
+            assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
+            check_start = check_end
+
+    for previous_mode in (False, True):
+        for async_mode in (False, True):
+            for current_x in (x_pure_rand, x, x_e4m3):
+                for with_topk in (False, True):
+                    if local_rank == 0:
+                        print(
+                            f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...',
+                            flush=True,
+                            end="",
+                        )
+                    dispatch_args = {
+                        "x": current_x,
+                        "num_tokens_per_rank": num_tokens_per_rank,
+                        "num_tokens_per_rdma_rank": num_tokens_per_rdma_rank,
+                        "is_token_in_rank": is_token_in_rank,
+                        "num_tokens_per_expert": num_tokens_per_expert,
+                        "config": config,
+                        "async_finish": async_mode,
+                    }
+                    if with_topk:
+                        dispatch_args.update(
+                            {
+                                "topk_idx": topk_idx,
+                                "topk_weights": (
+                                    topk_weights_pure_rand
+                                    if current_x is x_pure_rand
+                                    else topk_weights
+                                ),
+                            }
+                        )
+                    if previous_mode:
+                        dispatch_args.update({"previous_event": buffer.capture()})
+                    (
+                        recv_x,
+                        recv_topk_idx,
+                        recv_topk_weights,
+                        recv_num_tokens_per_expert_list,
+                        handle,
+                        event,
+                    ) = buffer.dispatch(**dispatch_args)
+                    event.current_stream_wait() if async_mode else ()
+                    recv_x = (
+                        per_token_cast_back(*recv_x)
+                        if isinstance(recv_x, tuple)
+                        else recv_x
+                    )
+
+                    # Checks
+                    recv_gbl_rank_prefix_sum = handle[-4]
+                    assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(
+                        0
+                    ), f"{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}"
+                    assert (
+                        gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist()
+                        == recv_num_tokens_per_expert_list
+                    )
+                    if current_x is not x_pure_rand:
+                        check_data(recv_x, recv_gbl_rank_prefix_sum)
+                    if with_topk:
+                        # Check `topk_idx`
+                        assert (
+                            recv_topk_idx.eq(-1)
+                            | (
+                                (recv_topk_idx >= 0)
+                                & (recv_topk_idx < (num_experts // num_ranks))
+                            )
+                        ).sum().item() == recv_topk_idx.numel()
+                        for i, count in enumerate(recv_num_tokens_per_expert_list):
+                            assert recv_topk_idx.eq(i).sum().item() == count
+
+                        # Check `topk_weights`
+                        if current_x is not x_pure_rand:
+                            recv_topk_weights[recv_topk_idx.eq(-1)] = (
+                                recv_topk_weights.amax(dim=1, keepdim=True).expand_as(
+                                    recv_topk_weights
+                                )[recv_topk_idx.eq(-1)]
+                            )
+                            check_data(recv_topk_weights, recv_gbl_rank_prefix_sum)
+
+                    # Test cached dispatch (must without top-k staffs)
+                    if not with_topk:
+                        dispatch_args = {
+                            "x": current_x,
+                            "handle": handle,
+                            "config": config,
+                            "async_finish": async_mode,
+                        }
+                        if previous_mode:
+                            dispatch_args.update({"previous_event": buffer.capture()})
+                        recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
+                        event.current_stream_wait() if async_mode else ()
+                        recv_x = (
+                            per_token_cast_back(*recv_x)
+                            if isinstance(recv_x, tuple)
+                            else recv_x
+                        )
+                        if current_x is not x_pure_rand:
+                            check_data(recv_x, recv_gbl_rank_prefix_sum)
+
+                    # Test combine
+                    combine_args = {
+                        "x": recv_x,
+                        "handle": handle,
+                        "config": config,
+                        "async_finish": async_mode,
+                    }
+                    if with_topk:
+                        combine_args.update({"topk_weights": recv_topk_weights})
+                    if previous_mode:
+                        dispatch_args.update({"previous_event": buffer.capture()})
+                    combined_x, combined_topk_weights, event = buffer.combine(
+                        **combine_args
+                    )
+                    event.current_stream_wait() if async_mode else ()
+                    check_x = combined_x.float() / is_token_in_rank.sum(
+                        dim=1
+                    ).unsqueeze(1)
+                    ref_x = x_pure_rand if current_x is x_pure_rand else x
+                    assert calc_diff(check_x, ref_x) < 5e-6
+                    if with_topk:
+                        check_topk_weights = (
+                            combined_topk_weights
+                            if (current_x is x_pure_rand)
+                            else (
+                                combined_topk_weights
+                                / is_token_in_rank.sum(dim=1).unsqueeze(1)
+                            )
+                        )
+                        ref_topk_weights = (
+                            topk_weights_pure_rand
+                            if current_x is x_pure_rand
+                            else topk_weights
+                        )
+                        assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
+
+                    # For later tuning
+                    dispatch_bf16_rdma_send_bytes = num_rdma_token_sent * hidden * 2
+                    dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
+                    combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
+                    combine_bf16_rdma_recv_bytes = dispatch_bf16_rdma_send_bytes
+
+                    if local_rank == 0:
+                        print(" passed", flush=True)
+    if local_rank == 0:
+        print("", flush=True)
+
+    # Tune dispatch performance
+    best_dispatch_results = None
+    fp8_factor = (1 + 4 / 128) / 2
+    for current_x in (x_e4m3, x):
+        best_time, best_results = 1e10, None
+        rdma_send_bytes = (
+            (dispatch_bf16_rdma_send_bytes * fp8_factor)
+            if isinstance(current_x, tuple)
+            else dispatch_bf16_rdma_send_bytes
+        )
+        nvl_recv_bytes = (
+            (dispatch_bf16_nvl_recv_bytes * fp8_factor)
+            if isinstance(current_x, tuple)
+            else dispatch_bf16_nvl_recv_bytes
+        )
+        for nvl_chunk_size in range(4, 33, 4):
+            for rdma_chunk_size in range(4, 33, 4):
+                config = deep_ep.Config(
+                    num_sms,
+                    nvl_chunk_size,
+                    nvl_buffer_size,
+                    rdma_chunk_size,
+                    rdma_buffer_size,
+                )
+                tune_args = {"x": current_x, "handle": handle, "config": config}
+                t = bench(lambda: buffer.dispatch(**tune_args))[0]
+                if t < best_time:
+                    best_time, best_results = t, (
+                        num_sms,
+                        nvl_chunk_size,
+                        rdma_chunk_size,
+                    )
+                if local_rank == 0:
+                    print(
+                        f"[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {rdma_send_bytes / 1e9 / t:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) ",
+                        flush=True,
+                    )
+        if local_rank == 0:
+            print(
+                f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {rdma_send_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL)',
+                flush=True,
+            )
+            print("", flush=True)
+
+        if isinstance(current_x, tuple):
+            # Gather FP8 the best config from rank 0
+            best_dispatch_results = torch.tensor(
+                [best_results[0], best_results[1], best_results[2]],
+                dtype=torch.int32,
+                device="cuda",
+            )
+            all_best_fp8_results_list = [
+                torch.zeros_like(best_dispatch_results)
+                for _ in range(torch.distributed.get_world_size())
+            ]
+            dist.all_gather(
+                all_best_fp8_results_list, best_dispatch_results, group=group
+            )
+            best_dispatch_results = all_best_fp8_results_list[0].tolist()
+    dispatch_config = deep_ep.Config(
+        best_dispatch_results[0],
+        best_dispatch_results[1],
+        nvl_buffer_size,
+        best_dispatch_results[2],
+        rdma_buffer_size,
+    )
+
+    dispatch_args = {
+        "x": x,
+        "num_tokens_per_rank": num_tokens_per_rank,
+        "num_tokens_per_rdma_rank": num_tokens_per_rdma_rank,
+        "is_token_in_rank": is_token_in_rank,
+        "num_tokens_per_expert": num_tokens_per_expert,
+        "config": dispatch_config if dispatch_config is not None else config,
+    }
+    recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
+
+    # Tune combine performance
+    best_time, best_results = 1e10, None
+    for nvl_chunk_size in range(1, 5, 1):
+        for rdma_chunk_size in range(8, 33, 4):
+            config = deep_ep.Config(
+                num_sms,
+                nvl_chunk_size,
+                nvl_buffer_size,
+                rdma_chunk_size,
+                rdma_buffer_size,
+            )
+            tune_args = {"x": recv_x, "handle": handle, "config": config}
+            t = bench(lambda: buffer.combine(**tune_args))[0]
+            if local_rank == 0:
+                print(
+                    f"[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}, RDMA chunk {rdma_chunk_size}: {combine_bf16_rdma_recv_bytes / 1e9 / t:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) ",
+                    flush=True,
+                )
+                if t < best_time:
+                    best_time, best_results = t, (
+                        num_sms,
+                        nvl_chunk_size,
+                        rdma_chunk_size,
+                    )
+
+    if local_rank == 0:
+        print(
+            f"[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}, RDMA chunk {best_results[2]}: {combine_bf16_rdma_recv_bytes / 1e9 / best_time:.2f} GB/s (RDMA), {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL)",
+            flush=True,
+        )
+        print("", flush=True)
+
+
+# noinspection PyUnboundLocalVariable
+def test_loop(local_rank: int, num_local_ranks: int):
+    num_nodes = int(os.getenv("WORLD_SIZE", 1))
+    rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
+    test_ll_compatibility = False
+    if test_ll_compatibility:
+        ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
+
+    buffer = deep_ep.Buffer(
+        group,
+        int(1e9),
+        int(1e9),
+        low_latency_mode=test_ll_compatibility,
+        num_qps_per_rank=(ll_num_experts // num_ranks if test_ll_compatibility else 1),
+    )
+    assert num_local_ranks == 8 and num_ranks > 8
+    torch.manual_seed(rank)
+
+    for i in (24,):
+        test_main(
+            i, local_rank, num_local_ranks, num_ranks, num_nodes, rank, buffer, group
+        )
+        if local_rank == 0:
+            print("", flush=True)
+
+    # Test compatibility with low latency functions
+    if test_ll_compatibility:
+        buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
+        test_deepep_low_latency.test_main(
+            ll_num_tokens,
+            ll_hidden,
+            ll_num_experts,
+            ll_num_topk,
+            rank,
+            num_ranks,
+            group,
+            buffer,
+            seed=1,
+        )
+
+
+if __name__ == "__main__":
+    num_processes = 8
+    torch.multiprocessing.spawn(test_loop, args=(num_processes,), nprocs=num_processes)
diff --git a/test/srt/ep/test_deepep_intranode.py b/test/srt/ep/test_deepep_intranode.py
new file mode 100644
index 000000000..97acd000c
--- /dev/null
+++ b/test/srt/ep/test_deepep_intranode.py
@@ -0,0 +1,379 @@
+# Copy from deepseek-ai/DeepEP/tests/test_intranode.py
+
+import os
+import time
+
+# noinspection PyUnresolvedReferences
+import deep_ep
+
+# Test compatibility with low latency functions
+import test_deepep_low_latency
+import torch
+import torch.distributed as dist
+
+from sglang.test.test_deepep_utils import (
+    bench,
+    calc_diff,
+    init_dist,
+    inplace_unique,
+    per_token_cast_back,
+    per_token_cast_to_fp8,
+)
+
+
+def test_main(
+    num_sms: int,
+    local_rank: int,
+    num_ranks: int,
+    rank: int,
+    buffer: deep_ep.Buffer,
+    group: dist.ProcessGroup,
+):
+    # Settings
+    num_tokens, hidden, num_topk, num_experts = (
+        4096,
+        7168,
+        8,
+        (256 // num_ranks) * num_ranks,
+    )
+    assert num_experts % num_ranks == 0
+    if local_rank == 0:
+        print(
+            f"[config] num_tokens={num_tokens}, hidden={hidden}, num_topk={num_topk}",
+            flush=True,
+        )
+
+    # Random data
+    x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device="cuda") * rank
+    x_pure_rand = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device="cuda")
+    x_e4m3 = per_token_cast_to_fp8(x)
+    scores = (
+        torch.randn((num_tokens, num_experts), dtype=torch.float32, device="cuda").abs()
+        + 1
+    )
+    topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1]
+    topk_weights = (
+        torch.ones((num_tokens, num_topk), dtype=torch.float32, device="cuda") * rank
+    )
+    topk_weights_pure_rand = torch.randn(
+        (num_tokens, num_topk), dtype=torch.float32, device="cuda"
+    )
+    rank_idx = topk_idx // (num_experts // num_ranks)
+    rank_idx.masked_fill_(topk_idx == -1, -1)
+    inplace_unique(rank_idx, num_ranks)
+
+    # Expert meta
+    num_tokens_per_expert = torch.zeros((num_experts,), dtype=torch.int, device="cuda")
+    for i in range(num_experts):
+        num_tokens_per_expert[i] = (topk_idx == i).sum()
+    gbl_num_tokens_per_expert = num_tokens_per_expert.clone()
+    dist.all_reduce(gbl_num_tokens_per_expert, group=group)
+
+    # Rank layout meta
+    num_tokens_per_rank = torch.empty((num_ranks,), dtype=torch.int, device="cuda")
+    token_idx_in_rank = torch.full(
+        (num_ranks, num_tokens), -1, dtype=torch.long, device="cuda"
+    )
+    for i in range(num_ranks):
+        num_tokens_per_rank[i] = (rank_idx == i).sum()
+        token_sel = (rank_idx == i).max(dim=-1)[0]
+        count = token_sel.sum().item()
+        tokens = torch.sort(token_sel.to(torch.int), descending=True)[1]
+        tokens[:count] = torch.sort(tokens[:count])[0]
+        token_idx_in_rank[i][tokens[:count]] = torch.arange(
+            count, dtype=torch.long, device="cuda"
+        )
+    token_idx_in_rank = token_idx_in_rank.T.contiguous().to(torch.int)
+    is_token_in_rank = token_idx_in_rank >= 0
+    gbl_num_tokens_per_rank = num_tokens_per_rank.clone()
+    dist.all_reduce(gbl_num_tokens_per_rank, group=group)
+
+    ref_num_tokens_per_rank, _, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = (
+        buffer.get_dispatch_layout(topk_idx, num_experts)
+    )
+    assert torch.allclose(ref_num_tokens_per_rank, num_tokens_per_rank)
+    assert torch.allclose(ref_num_tokens_per_expert, num_tokens_per_expert)
+    assert torch.allclose(ref_is_token_in_rank, is_token_in_rank)
+    t = bench(lambda: buffer.get_dispatch_layout(topk_idx, num_experts))[0]
+    if local_rank == 0:
+        print(f"[layout] Kernel performance: {t * 1000:.3f} ms", flush=True)
+        print("", flush=True)
+    group.barrier()
+    time.sleep(1)
+
+    # Config
+    nvl_buffer_size = 256
+    config = deep_ep.Config(num_sms, 8, nvl_buffer_size)
+
+    # Test dispatch
+    # noinspection PyShadowingNames
+    def check_data(check_x, rank_prefix_matrix):
+        assert torch.allclose(check_x.amin(dim=1), check_x.amax(dim=1))
+        check_start = 0
+        for i in range(num_ranks):
+            check_end = rank_prefix_matrix[i][rank].item()
+            assert (check_x[check_start:check_end, :].int() - i).sum().item() == 0
+            check_start = check_end
+
+    for previous_mode in (False, True):
+        for async_mode in (False, True):
+            for current_x in (x_pure_rand, x, x_e4m3):
+                for with_topk in (False, True):
+                    if local_rank == 0:
+                        print(
+                            f'[testing] Running with {"FP8" if isinstance(current_x, tuple) else "BF16"}, {"with" if with_topk else "without"} top-k (async={async_mode}, previous={previous_mode}) ...',
+                            flush=True,
+                            end="",
+                        )
+                    dispatch_args = {
+                        "x": current_x,
+                        "num_tokens_per_rank": num_tokens_per_rank,
+                        "is_token_in_rank": is_token_in_rank,
+                        "num_tokens_per_expert": num_tokens_per_expert,
+                        "config": config,
+                        "async_finish": async_mode,
+                    }
+                    if with_topk:
+                        dispatch_args.update(
+                            {
+                                "topk_idx": topk_idx,
+                                "topk_weights": (
+                                    topk_weights_pure_rand
+                                    if current_x is x_pure_rand
+                                    else topk_weights
+                                ),
+                            }
+                        )
+                    if previous_mode:
+                        dispatch_args.update({"previous_event": buffer.capture()})
+                    (
+                        recv_x,
+                        recv_topk_idx,
+                        recv_topk_weights,
+                        recv_num_tokens_per_expert_list,
+                        handle,
+                        event,
+                    ) = buffer.dispatch(**dispatch_args)
+                    event.current_stream_wait() if async_mode else ()
+                    recv_x = (
+                        per_token_cast_back(*recv_x)
+                        if isinstance(recv_x, tuple)
+                        else recv_x
+                    )
+
+                    # Checks
+                    rank_prefix_matrix = handle[0]
+                    assert gbl_num_tokens_per_rank[rank].item() == recv_x.size(
+                        0
+                    ), f"{gbl_num_tokens_per_rank[rank].item()} != {recv_x.size(0)}"
+                    assert (
+                        gbl_num_tokens_per_expert.view(num_ranks, -1)[rank].tolist()
+                        == recv_num_tokens_per_expert_list
+                    )
+                    if current_x is not x_pure_rand:
+                        check_data(recv_x, rank_prefix_matrix)
+                    if with_topk:
+                        # Check `topk_idx`
+                        assert (
+                            recv_topk_idx.eq(-1)
+                            | (
+                                (recv_topk_idx >= 0)
+                                & (recv_topk_idx < (num_experts // num_ranks))
+                            )
+                        ).sum().item() == recv_topk_idx.numel()
+                        for i, count in enumerate(recv_num_tokens_per_expert_list):
+                            assert recv_topk_idx.eq(i).sum().item() == count
+
+                        # Check `topk_weights`
+                        if current_x is not x_pure_rand:
+                            recv_topk_weights[recv_topk_idx.eq(-1)] = (
+                                recv_topk_weights.amax(dim=1, keepdim=True).expand_as(
+                                    recv_topk_weights
+                                )[recv_topk_idx.eq(-1)]
+                            )
+                            check_data(recv_topk_weights, rank_prefix_matrix)
+
+                    # Test cached dispatch (must without top-k staffs)
+                    if not with_topk:
+                        dispatch_args = {
+                            "x": current_x,
+                            "handle": handle,
+                            "config": config,
+                            "async_finish": async_mode,
+                        }
+                        if previous_mode:
+                            dispatch_args.update({"previous_event": buffer.capture()})
+                        recv_x, _, _, _, _, event = buffer.dispatch(**dispatch_args)
+                        event.current_stream_wait() if async_mode else ()
+                        recv_x = (
+                            per_token_cast_back(*recv_x)
+                            if isinstance(recv_x, tuple)
+                            else recv_x
+                        )
+                        if current_x is not x_pure_rand:
+                            check_data(recv_x, rank_prefix_matrix)
+
+                    # Test combine
+                    combine_args = {
+                        "x": recv_x,
+                        "handle": handle,
+                        "config": config,
+                        "async_finish": async_mode,
+                    }
+                    if with_topk:
+                        combine_args.update({"topk_weights": recv_topk_weights})
+                    if previous_mode:
+                        dispatch_args.update({"previous_event": buffer.capture()})
+                    combined_x, combined_topk_weights, event = buffer.combine(
+                        **combine_args
+                    )
+                    event.current_stream_wait() if async_mode else ()
+                    check_x = combined_x.float() / is_token_in_rank.sum(
+                        dim=1
+                    ).unsqueeze(1)
+                    ref_x = x_pure_rand if current_x is x_pure_rand else x
+                    assert calc_diff(check_x, ref_x) < 5e-6
+                    if with_topk:
+                        check_topk_weights = (
+                            combined_topk_weights
+                            if (current_x is x_pure_rand)
+                            else (
+                                combined_topk_weights
+                                / is_token_in_rank.sum(dim=1).unsqueeze(1)
+                            )
+                        )
+                        ref_topk_weights = (
+                            topk_weights_pure_rand
+                            if current_x is x_pure_rand
+                            else topk_weights
+                        )
+                        assert calc_diff(check_topk_weights, ref_topk_weights) < 1e-9
+
+                    # For later tuning
+                    dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2
+                    combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes
+
+                    if local_rank == 0:
+                        print(" passed", flush=True)
+    if local_rank == 0:
+        print("", flush=True)
+
+    # Tune dispatch performance
+    best_dispatch_results = None
+    fp8_factor = (1 + 4 / 128) / 2
+    for current_x in (x_e4m3, x):
+        best_time, best_results = 1e10, None
+        nvl_recv_bytes = (
+            (dispatch_bf16_nvl_recv_bytes * fp8_factor)
+            if isinstance(current_x, tuple)
+            else dispatch_bf16_nvl_recv_bytes
+        )
+        for nvl_chunk_size in range(4, 33, 4):
+            config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
+            tune_args = {"x": current_x, "handle": handle, "config": config}
+            t = bench(lambda: buffer.dispatch(**tune_args))[0]
+            if t < best_time:
+                best_time, best_results = t, (num_sms, nvl_chunk_size)
+            if local_rank == 0:
+                print(
+                    f"[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}: {nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) ",
+                    flush=True,
+                )
+        if local_rank == 0:
+            print(
+                f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL)',
+                flush=True,
+            )
+            print("", flush=True)
+
+        if isinstance(current_x, tuple):
+            # Gather FP8 the best config from rank 0
+            best_dispatch_results = torch.tensor(
+                [best_results[0], best_results[1]], dtype=torch.int32, device="cuda"
+            )
+            all_best_fp8_results_list = [
+                torch.zeros_like(best_dispatch_results)
+                for _ in range(torch.distributed.get_world_size())
+            ]
+            dist.all_gather(
+                all_best_fp8_results_list, best_dispatch_results, group=group
+            )
+            best_dispatch_results = all_best_fp8_results_list[0].tolist()
+    dispatch_config = deep_ep.Config(
+        best_dispatch_results[0], best_dispatch_results[1], nvl_buffer_size
+    )
+
+    dispatch_args = {
+        "x": x,
+        "num_tokens_per_rank": num_tokens_per_rank,
+        "is_token_in_rank": is_token_in_rank,
+        "num_tokens_per_expert": num_tokens_per_expert,
+        "config": dispatch_config if dispatch_config is not None else config,
+    }
+    recv_x, _, _, _, handle, _ = buffer.dispatch(**dispatch_args)
+
+    # Tune combine performance
+    best_time, best_results = 1e10, None
+    for nvl_chunk_size in range(1, 7, 1):
+        config = deep_ep.Config(num_sms, nvl_chunk_size, nvl_buffer_size)
+        tune_args = {"x": recv_x, "handle": handle, "config": config}
+        t = bench(lambda: buffer.combine(**tune_args))[0]
+        if local_rank == 0:
+            print(
+                f"[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size}: {combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) ",
+                flush=True,
+            )
+            if t < best_time:
+                best_time, best_results = t, (num_sms, nvl_chunk_size)
+
+    if local_rank == 0:
+        print(
+            f"[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}: {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL)",
+            flush=True,
+        )
+        print("", flush=True)
+
+
+# noinspection PyUnboundLocalVariable
+def test_loop(local_rank: int, num_local_ranks: int):
+    rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
+    test_ll_compatibility, num_rdma_bytes = False, 0
+    if test_ll_compatibility:
+        ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
+        num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+            ll_num_tokens, ll_hidden, num_ranks, ll_num_experts
+        )
+
+    buffer = deep_ep.Buffer(
+        group,
+        int(1e9),
+        num_rdma_bytes,
+        low_latency_mode=test_ll_compatibility,
+        num_qps_per_rank=(ll_num_experts // num_ranks if test_ll_compatibility else 1),
+    )
+    torch.manual_seed(rank)
+
+    for i in (24,):
+        test_main(i, local_rank, num_ranks, rank, buffer, group)
+        if local_rank == 0:
+            print("", flush=True)
+
+    # Test compatibility with low latency functions
+    if test_ll_compatibility:
+        buffer.clean_low_latency_buffer(ll_num_tokens, ll_hidden, ll_num_experts)
+        test_deepep_low_latency.test_main(
+            ll_num_tokens,
+            ll_hidden,
+            ll_num_experts,
+            ll_num_topk,
+            rank,
+            num_ranks,
+            group,
+            buffer,
+            seed=1,
+        )
+
+
+if __name__ == "__main__":
+    num_processes = 8
+    torch.multiprocessing.spawn(test_loop, args=(num_processes,), nprocs=num_processes)
diff --git a/test/srt/ep/test_deepep_large.py b/test/srt/ep/test_deepep_large.py
new file mode 100644
index 000000000..94fff566c
--- /dev/null
+++ b/test/srt/ep/test_deepep_large.py
@@ -0,0 +1,149 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestDeepseek(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--enable-two-batch-overlap",
+                "--ep-num-redundant-experts",
+                "32",
+                "--ep-dispatch-algorithm",
+                "dynamic",
+                "--eplb-algorithm",
+                "deepseek",
+                "--cuda-graph-bs",
+                "256",
+                "--max-running-requests",
+                "2048",
+                "--disable-radix-cache",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=1200,
+            parallel=1200,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Eval accuracy of GSM8K: {metrics=}")
+
+        self.assertGreater(metrics["accuracy"], 0.92)
+
+
+class TestDeepseekMTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--enable-two-batch-overlap",
+                "--ep-num-redundant-experts",
+                "32",
+                "--ep-dispatch-algorithm",
+                "dynamic",
+                "--eplb-algorithm",
+                "deepseek",
+                "--cuda-graph-bs",
+                "64",  # TODO: increase it to 128 when TBO is supported in draft_extend
+                "--max-running-requests",
+                "512",
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-num-steps",
+                "1",
+                "--speculative-eagle-topk",
+                "1",
+                "--speculative-num-draft-tokens",
+                "2",
+                "--disable-radix-cache",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=1200,
+            parallel=1200,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Eval accuracy of GSM8K: {metrics=}")
+
+        self.assertGreater(metrics["accuracy"], 0.92)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(
+            f"###test_gsm8k:\n"
+            f"accuracy={metrics['accuracy']=:.3f}\n"
+            f"{avg_spec_accept_length=:.3f}\n"
+        )
+        self.assertGreater(avg_spec_accept_length, 1.85)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ep/test_deepep_low_latency.py b/test/srt/ep/test_deepep_low_latency.py
new file mode 100644
index 000000000..1b3f34da9
--- /dev/null
+++ b/test/srt/ep/test_deepep_low_latency.py
@@ -0,0 +1,325 @@
+# Copy from deepseek-ai/DeepEP/tests/test_low_latency.py
+
+import random
+from functools import partial
+
+import deep_ep
+import torch
+import torch.distributed as dist
+
+from sglang.test.test_deepep_utils import (
+    bench,
+    bench_kineto,
+    calc_diff,
+    hash_tensor,
+    init_dist,
+    per_token_cast_back,
+)
+
+
+def test_main(
+    num_tokens: int,
+    hidden: int,
+    num_experts: int,
+    num_topk: int,
+    rank: int,
+    num_ranks: int,
+    group: dist.ProcessGroup,
+    buffer: deep_ep.Buffer,
+    seed: int = 0,
+):
+    torch.manual_seed(seed + rank)
+    random.seed(seed + rank)
+
+    assert num_experts % num_ranks == 0
+    num_local_experts = num_experts // num_ranks
+
+    # NOTES: the integers greater than 256 exceeds the BF16 precision limit
+    rank_offset = 128
+    assert (
+        num_ranks - rank_offset < 257
+    ), "Too many ranks (exceeding test precision limit)"
+
+    x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device="cuda") * (
+        rank - rank_offset
+    )
+    x[:, -128:] = torch.arange(num_tokens, device="cuda").to(torch.bfloat16).view(-1, 1)
+    scores = (
+        torch.randn((num_tokens, num_experts), dtype=torch.float32, device="cuda").abs()
+        + 1
+    )
+    topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1]
+    topk_weights = torch.randn(
+        (num_tokens, num_topk), dtype=torch.float32, device="cuda"
+    ).abs()
+
+    # Randomly mask some positions
+    for i in range(10):
+        topk_idx[random.randint(0, num_tokens - 1), random.randint(0, num_topk - 1)] = (
+            -1
+        )
+
+    # Check dispatch correctness
+    do_check = True
+    hash_value, num_times = 0, 0
+    for return_recv_hook in (False, True):
+        for dispatch_use_fp8 in (False, True):
+            num_times += 1
+            for i in range((num_times % 2) + 1):
+                packed_recv_x, packed_recv_count, handle, event, hook = (
+                    buffer.low_latency_dispatch(
+                        x,
+                        topk_idx,
+                        num_tokens,
+                        num_experts,
+                        use_fp8=dispatch_use_fp8,
+                        async_finish=not return_recv_hook,
+                        return_recv_hook=return_recv_hook,
+                    )
+                )
+                hook() if return_recv_hook else event.current_stream_wait()
+            packed_recv_x = (
+                (packed_recv_x[0], packed_recv_x[1].contiguous())
+                if dispatch_use_fp8
+                else packed_recv_x
+            )
+            simulated_gemm_x = (
+                per_token_cast_back(
+                    packed_recv_x[0].view(-1, hidden),
+                    packed_recv_x[1].view(-1, hidden // 128),
+                ).view(packed_recv_x[0].shape)
+                if dispatch_use_fp8
+                else packed_recv_x.clone()
+            )
+            all_topk_idx = torch.empty(
+                (num_ranks, num_tokens, num_topk), dtype=topk_idx.dtype, device="cuda"
+            )
+            dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)
+            for i in range(num_local_experts if do_check else 0):
+                expert_id = rank * num_local_experts + i
+                recv_x = (
+                    per_token_cast_back(packed_recv_x[0][i], packed_recv_x[1][i])
+                    if dispatch_use_fp8
+                    else packed_recv_x[i]
+                )
+                recv_count, recv_src_info, recv_layout_range = (
+                    packed_recv_count[i],
+                    handle[0][i],
+                    handle[1][i],
+                )
+
+                # Check expert indices
+                int_mask = (2**32) - 1
+                num_valid_tokens = recv_count.item()
+                assert (
+                    num_valid_tokens == (recv_layout_range & int_mask).sum().item()
+                ), f"{num_valid_tokens} != {recv_layout_range & int_mask}.sum().item()"
+                assert (
+                    num_valid_tokens == (all_topk_idx == expert_id).sum().item()
+                ), f"{num_valid_tokens} != {(all_topk_idx == expert_id).sum().item()}"
+
+                # Check received data
+                recv_x = recv_x[:num_valid_tokens]
+                recv_x_amin = recv_x[:, :-128].amin(dim=-1)
+                recv_src_info = recv_src_info[:num_valid_tokens]
+                assert torch.equal(recv_x_amin, recv_x[:, :-128].amax(dim=-1))
+                assert (
+                    recv_x[:, -128:] - recv_src_info.view(-1, 1) % num_tokens
+                ).sum().item() == 0
+                for j in range(num_ranks):
+                    begin_idx, count = (recv_layout_range[j] >> 32).item(), (
+                        recv_layout_range[j] & int_mask
+                    ).item()
+                    assert (recv_x_amin == j - rank_offset).sum().item() == (
+                        all_topk_idx[j] == expert_id
+                    ).sum().item()
+                    assert (
+                        recv_x[begin_idx : begin_idx + count][:-128] - j
+                    ).sum().item() == 0
+                if dispatch_use_fp8:
+                    hash_value ^= hash_tensor(packed_recv_x[0][i, :num_valid_tokens])
+                    hash_value ^= hash_tensor(packed_recv_x[1][i, :num_valid_tokens])
+                else:
+                    hash_value ^= hash_tensor(packed_recv_x[i, :num_valid_tokens])
+
+            # Check combine correctness
+            for zero_copy in (False, True):
+                if zero_copy:
+                    buffer.get_next_low_latency_combine_buffer(handle)[
+                        :, :, :
+                    ] = simulated_gemm_x
+                out = torch.empty(
+                    (num_tokens, hidden), dtype=torch.bfloat16, device="cuda"
+                )
+                combined_x, event, hook = buffer.low_latency_combine(
+                    simulated_gemm_x,
+                    topk_idx,
+                    topk_weights,
+                    handle,
+                    async_finish=not return_recv_hook,
+                    zero_copy=zero_copy,
+                    return_recv_hook=return_recv_hook,
+                    out=out,
+                )
+                hook() if return_recv_hook else event.current_stream_wait()
+                if do_check:
+                    diff = calc_diff(
+                        x
+                        * topk_weights.masked_fill(topk_idx == -1, 0)
+                        .sum(dim=1)
+                        .view(-1, 1),
+                        combined_x,
+                    )
+                    assert torch.isnan(combined_x).sum().item() == 0
+                    assert diff < 1e-5, f"Error: {diff=}, {zero_copy=}"
+                    hash_value ^= hash_tensor(combined_x)
+
+    def create_test_cast_with_outliers(num_outliers):
+        tmp = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device="cuda")
+        tmp /= tmp.abs().amax(dim=1).view(-1, 1)
+        assert tmp.abs().amax().item() <= 1
+
+        # Create some amax outliers
+        for i in range(num_outliers):
+            tmp[random.randint(0, num_tokens - 1)] *= 1e3
+        return tmp
+
+    # noinspection PyShadowingNames
+    def large_gemm_with_hook(hook):
+        mat_0 = torch.randn((8192, 8192), dtype=torch.float)
+        mat_1 = torch.randn((8192, 8192), dtype=torch.float)
+        mat_0 @ mat_1
+        hook()
+
+    # noinspection PyShadowingNames
+    def test_func(zero_copy: bool, return_recv_hook: bool):
+        recv_x, recv_count, handle, event, hook = buffer.low_latency_dispatch(
+            x,
+            topk_idx,
+            num_tokens,
+            num_experts,
+            async_finish=False,
+            return_recv_hook=return_recv_hook,
+        )
+        large_gemm_with_hook(hook) if return_recv_hook else None
+        if zero_copy:
+            buffer.get_next_low_latency_combine_buffer(handle)[
+                :, :, :
+            ] = simulated_gemm_x
+        combined_x, event, hook = buffer.low_latency_combine(
+            simulated_gemm_x,
+            topk_idx,
+            topk_weights,
+            handle,
+            zero_copy=zero_copy,
+            return_recv_hook=return_recv_hook,
+        )
+        large_gemm_with_hook(hook) if return_recv_hook else None
+
+    # Calculate bandwidth
+    num_fp8_bytes, num_bf16_bytes = (hidden + hidden / 128 * 4 + 16), hidden * 2
+    num_dispatch_comm_bytes, num_combine_comm_bytes = 0, 0
+    for i in range(num_tokens):
+        num_selections = (topk_idx[i] != -1).sum().item()
+        num_dispatch_comm_bytes += num_fp8_bytes * num_selections
+        num_combine_comm_bytes += num_bf16_bytes * num_selections
+
+    # Dispatch + combine testing
+    avg_t, min_t, max_t = bench(
+        partial(test_func, zero_copy=False, return_recv_hook=False)
+    )
+    print(
+        f"[rank {rank}] Dispatch + combine bandwidth: {(num_dispatch_comm_bytes + num_combine_comm_bytes) / 1e9 / avg_t:.2f} GB/s, "
+        f"avg_t={avg_t * 1e6:.2f} us, min_t={min_t * 1e6:.2f} us, max_t={max_t * 1e6:.2f} us",
+        flush=True,
+    )
+
+    # Separate profiling
+    for return_recv_hook in (False, True):
+        group.barrier()
+        dispatch_t, combine_t = bench_kineto(
+            partial(test_func, zero_copy=True, return_recv_hook=return_recv_hook),
+            kernel_names=("dispatch", "combine"),
+            barrier_comm_profiling=True,
+            suppress_kineto_output=True,
+        )
+        if not return_recv_hook:
+            print(
+                f"[rank {rank}] Dispatch bandwidth: {num_dispatch_comm_bytes / 1e9 / dispatch_t:.2f} GB/s, avg_t={dispatch_t * 1e6:.2f} us | "
+                f"Combine bandwidth: {num_combine_comm_bytes / 1e9 / combine_t:.2f} GB/s, avg_t={combine_t * 1e6:.2f} us",
+                flush=True,
+            )
+        else:
+            print(
+                f"[rank {rank}] Dispatch send/recv time: {dispatch_t * 2 * 1e6:.2f} us | "
+                f"Combine send/recv time: {combine_t * 2 * 1e6:.2f} us",
+                flush=True,
+            )
+
+    return hash_value
+
+
+# noinspection PyUnboundLocalVariable
+def test_loop(local_rank: int, num_local_ranks: int):
+    rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
+    num_tokens, hidden, num_topk, num_experts = 128, 7168, 8, 288
+
+    num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+        num_tokens, hidden, num_ranks, num_experts
+    )
+    if local_rank == 0:
+        print(f"Allocating buffer size: {num_rdma_bytes / 1e6} MB ...", flush=True)
+    buffer = deep_ep.Buffer(
+        group,
+        num_rdma_bytes=num_rdma_bytes,
+        low_latency_mode=True,
+        num_qps_per_rank=num_experts // num_ranks,
+    )
+    test_main(
+        num_tokens,
+        hidden,
+        num_experts,
+        num_topk,
+        rank,
+        num_ranks,
+        group,
+        buffer,
+        seed=1,
+    )
+
+    do_pressure_test = False
+    for seed in range(int(1e9) if do_pressure_test else 0):
+        if local_rank == 0:
+            print(f"Testing with seed {seed} ...", flush=True)
+        ref_hash = test_main(
+            num_tokens,
+            hidden,
+            num_experts,
+            num_topk,
+            rank,
+            num_ranks,
+            group,
+            buffer,
+            seed=seed,
+        )
+        for i in range(20):
+            assert (
+                test_main(
+                    num_tokens,
+                    hidden,
+                    num_experts,
+                    num_topk,
+                    rank,
+                    num_ranks,
+                    group,
+                    buffer,
+                    seed=seed,
+                )
+                == ref_hash
+            ), f"Error: seed={seed}"
+
+
+if __name__ == "__main__":
+    # TODO: you may modify NUMA binding for less CPU overhead
+    num_processes = 8
+    torch.multiprocessing.spawn(test_loop, args=(num_processes,), nprocs=num_processes)
diff --git a/test/srt/ep/test_deepep_small.py b/test/srt/ep/test_deepep_small.py
new file mode 100644
index 000000000..05aefe79a
--- /dev/null
+++ b/test/srt/ep/test_deepep_small.py
@@ -0,0 +1,389 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestPureDP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "4",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+                "--max-running-requests",
+                "512",
+                "--mem-fraction-static",
+                "0.5",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+class TestHybridDPTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "4",
+                "--enable-dp-attention",
+                "--dp",
+                "2",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+                "--max-running-requests",
+                "256",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+class TestTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "4",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+                "--max-running-requests",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+@unittest.skip("covered in test_deepep_large.py")
+class TestNoGatherdBuffer(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "4",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "512",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+class TestTBO(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "4",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--moe-a2a-backend",
+                "deepep",
+                "--enable-two-batch-overlap",
+                "--cuda-graph-max-bs",
+                "128",
+                "--max-running-requests",
+                "512",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+@unittest.skip("covered in TestMTPWithTBO")
+class TestMTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "4",
+                "--enable-dp-attention",
+                "--dp",
+                "2",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "3",
+                "--speculative-num-draft-tokens",
+                "3",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "64",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(
+            f"###test_gsm8k (deepseek-v3 mtp + dp + tbo):\n"
+            f"accuracy={metrics['accuracy']=:.3f}\n"
+            f"{avg_spec_accept_length=:.3f}\n"
+        )
+        self.assertGreater(avg_spec_accept_length, 2.1)
+
+
+class TestMTPWithTBO(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        import os
+
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--tp-size",
+                "4",
+                "--enable-dp-attention",
+                "--dp-size",
+                "4",
+                "--enable-two-batch-overlap",
+                "--moe-a2a-backend",
+                "deepep",
+                "--trust-remote-code",
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "3",
+                "--speculative-num-draft-tokens",
+                "3",
+                "--speculative-draft-model-path",
+                DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+                "--chunked-prefill-size",
+                "256",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(
+            f"###test_gsm8k (deepseek-v3 mtp + dp + tbo):\n"
+            f"accuracy={metrics['accuracy']=:.3f}\n"
+            f"{avg_spec_accept_length=:.3f}\n"
+        )
+        self.assertGreater(avg_spec_accept_length, 2.1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ep/test_eplb.py b/test/srt/ep/test_eplb.py
new file mode 100755
index 000000000..c2acc07bb
--- /dev/null
+++ b/test/srt/ep/test_eplb.py
@@ -0,0 +1,155 @@
+import os
+import tempfile
+import unittest
+from pathlib import Path
+from types import SimpleNamespace
+
+import sglang as sgl
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class _BaseTestDynamicEPLB(CustomTestCase):
+    extra_args = []
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--dp",
+                "2",
+                "--enable-dp-attention",
+                "--moe-a2a-backend",
+                "deepep",
+                "--deepep-mode",
+                "normal",
+                "--disable-cuda-graph",
+                "--enable-eplb",
+                "--ep-num-redundant-experts",
+                "4",
+                "--eplb-rebalance-num-iterations",
+                "50",
+                "--expert-distribution-recorder-buffer-size",
+                "50",
+                # TODO pr-chain: enable later
+                # "--enable-expert-distribution-metrics",
+                # TODO auto determine these flags
+                "--expert-distribution-recorder-mode",
+                "stat",
+                "--ep-dispatch-algorithm",
+                "static",
+                *cls.extra_args,
+            ],
+            env={
+                "SGL_ENABLE_JIT_DEEPGEMM": "0",
+                "SGLANG_EXPERT_LOCATION_UPDATER_CANARY": "1",
+                **os.environ,
+            },
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.5)
+
+
+class TestDynamicEPLBSimple(_BaseTestDynamicEPLB):
+    pass
+
+
+class TestDynamicEPLBMultiChunk(_BaseTestDynamicEPLB):
+    extra_args = ["--eplb-rebalance-layers-per-chunk", "1"]
+
+
+class TestStaticEPLB(CustomTestCase):
+    def test_save_expert_distribution_and_init_expert_location(self):
+        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "0"
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            engine_kwargs = dict(
+                model_path=DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+                trust_remote_code=True,
+                ep_num_redundant_experts=4,
+                enable_dp_attention=True,
+                moe_a2a_backend="deepep",
+                disable_cuda_graph=True,
+                expert_distribution_recorder_mode="stat",
+                tp_size=2,
+                dp_size=2,
+                log_level="info",
+                # TODO pr-chain: enable later
+                # enable_expert_distribution_metrics=True,
+            )
+
+            print(f"Action: start engine")
+            os.environ["SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR"] = tmp_dir
+            engine = sgl.Engine(
+                **engine_kwargs,
+                disable_overlap_schedule=True,
+            )
+            engine.start_expert_distribution_record()
+            self._assert_engine_generate_correct(engine)
+
+            print(f"Action: dump_expert_distribution_record")
+            engine.dump_expert_distribution_record()
+            snapshot_path = list(Path(tmp_dir).glob("*.pt"))[0]
+            assert snapshot_path is not None
+            print(f"{snapshot_path=}")
+
+            print(f"Action: shutdown engine")
+            engine.shutdown()
+            del engine
+
+            print(f"Action: start engine with init_expert_location")
+            engine = sgl.Engine(
+                **engine_kwargs,
+                init_expert_location=str(snapshot_path),
+                port=21000,
+                # TODO auto determine these flags
+                ep_dispatch_algorithm="static",
+            )
+            self._assert_engine_generate_correct(engine)
+            print(f"Action: shutdown engine")
+            engine.shutdown()
+            del engine
+
+    def _assert_engine_generate_correct(self, engine: sgl.Engine):
+        output = engine.generate(
+            prompt=["1+1=2, 2+2=4", "One plus one is two, two plus two is four"],
+            sampling_params=dict(max_new_tokens=8, temperature=0.0),
+        )
+        print(f"engine.generate {output=}")
+        self.assertEqual(
+            [x["text"] for x in output],
+            [", 4+4=8,", ", four plus four is eight, eight"],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py b/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py
new file mode 100644
index 000000000..65fbad428
--- /dev/null
+++ b/test/srt/ep/test_hybrid_dp_ep_tp_mtp.py
@@ -0,0 +1,2728 @@
+# Comprehensive test for hybrid parallelism (DP/TP attention, DP/TP Dense FFN, TP/EP Sparse FFN, DP/VP LM head) plus speculative decoding.
+# These tests are not run by default but can be launched on demand.
+
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST,
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class Test00(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test01(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test02(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test03(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test04(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test05(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test06(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--enable-dp-lm-head",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test07(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--enable-dp-lm-head",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test08(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test09(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test10(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test11(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test12(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test13(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test14(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test15(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test16(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test17(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test18(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test19(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test20(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test21(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test22(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test23(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test24(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test25(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test26(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test27(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test28(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test29(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test30(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test31(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test32(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test33(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test34(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test35(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test36(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--enable-dp-lm-head",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test37(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--enable-dp-lm-head",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test38(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test39(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test40(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test41(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test42(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test43(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test44(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test45(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test46(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test47(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test48(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test49(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "32",
+                "--max-running-requests",
+                "32",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test50(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test51(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test52(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test53(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test54(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test55(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test56(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test57(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test58(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "4",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+class Test59(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--enable-dp-attention",
+                "--dp",
+                "8",
+                "--moe-dense-tp-size",
+                "1",
+                "--enable-dp-lm-head",
+                "--ep",
+                "8",
+                "--speculative-algo",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                "lmsys/DeepSeek-V3-0324-NextN",
+                "--speculative-num-steps",
+                "2",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "4",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.48)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ep/test_moe_deepep.py b/test/srt/ep/test_moe_deepep.py
new file mode 100644
index 000000000..aa9d7a1f8
--- /dev/null
+++ b/test/srt/ep/test_moe_deepep.py
@@ -0,0 +1,119 @@
+import json
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestPureTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--moe-a2a-backend",
+                "deepep",
+                "--disable-cuda-graph",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.5)
+
+
+class TestDPAttn(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--dp",
+                "2",
+                "--enable-dp-attention",
+                "--moe-a2a-backend",
+                "deepep",
+                "--deepep-mode",
+                "normal",
+                "--disable-cuda-graph",
+                # Test custom config
+                "--deepep-config",
+                json.dumps(
+                    {
+                        "normal_dispatch": {
+                            "num_sms": 20,
+                            "num_max_nvl_chunked_send_tokens": 16,
+                            "num_max_nvl_chunked_recv_tokens": 256,
+                            "num_max_rdma_chunked_send_tokens": 6,
+                            "num_max_rdma_chunked_recv_tokens": 128,
+                        },
+                        "normal_combine": {
+                            "num_sms": 20,
+                            "num_max_nvl_chunked_send_tokens": 6,
+                            "num_max_nvl_chunked_recv_tokens": 256,
+                            "num_max_rdma_chunked_send_tokens": 6,
+                            "num_max_rdma_chunked_recv_tokens": 128,
+                        },
+                    }
+                ),
+            ],
+            env={
+                "SGL_ENABLE_JIT_DEEPGEMM": "0",
+                **os.environ,
+            },
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ep/test_moe_deepep_eval_accuracy_large.py b/test/srt/ep/test_moe_deepep_eval_accuracy_large.py
new file mode 100644
index 000000000..66797ffa1
--- /dev/null
+++ b/test/srt/ep/test_moe_deepep_eval_accuracy_large.py
@@ -0,0 +1,75 @@
+"""
+Usage:
+python -m unittest test_moe_deepep_eval_accuracy_large.TestMoEDeepEPEvalAccuracyLarge.test_mmlu
+"""
+
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "8",
+                "--moe-a2a-backend",
+                "deepep",
+                "--cuda-graph-max-bs",
+                "128",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=8,
+            data_path=None,
+            num_questions=200,
+            parallel=64,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Eval accuracy of GSM8K: {metrics=}")
+
+        self.assertGreater(metrics["accuracy"], 0.93)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"Eval accuracy of MMLU: {metrics=}")
+        self.assertGreater(metrics["score"], 0.87)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/ep/test_moe_ep.py b/test/srt/ep/test_moe_ep.py
new file mode 100644
index 000000000..7456c9329
--- /dev/null
+++ b/test/srt/ep/test_moe_ep.py
@@ -0,0 +1,112 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestEpMoE(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--ep-size",
+                "2",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.5)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.8)
+
+
+class TestEpMoEFP8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--ep-size",
+                "2",
+                "--quantization",
+                "fp8",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.5)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.8)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/experiment_runner.py b/test/srt/experiment_runner.py
new file mode 100644
index 000000000..f32f61d3b
--- /dev/null
+++ b/test/srt/experiment_runner.py
@@ -0,0 +1,365 @@
+import argparse
+import logging
+import os
+import queue
+import re
+import subprocess
+import threading
+import time
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import psutil
+import requests
+import yaml
+
+
+@dataclass
+class ServerConfig:
+    command: str
+    process_names: List[str]
+    default_port: int
+
+
+@dataclass
+class TaskConfig:
+    server_cmd: str
+    client_cmd: str
+    name: Optional[str] = None
+    server_type: Optional[str] = None
+
+
+@dataclass
+class TaskResult:
+    name: str
+    success: bool
+    output: str
+    runtime: float
+    timestamp: str
+
+
+SERVER_DEFAULTS = {
+    "sglang": ServerConfig(
+        command="sglang.launch_server",
+        process_names=["sglang.launch_server"],
+        default_port=30000,
+    ),
+    "vllm": ServerConfig(
+        command="vllm.entrypoints.openai.api_server",
+        process_names=["vllm.entrypoints.openai.api_server"],
+        default_port=8000,
+    ),
+}
+
+
+def parse_key_info(output: str) -> str:
+    """Extract and format key information from the output"""
+    key_info = []
+
+    # Extract Args namespace
+    args_match = re.search(r"Namespace\(.*?\)", output, re.DOTALL)
+    if args_match:
+        key_info.append(args_match.group(0))
+
+    # Extract input/output token counts
+    token_matches = re.findall(r"#(Input|Output) tokens: \d+", output)
+    key_info.extend(token_matches)
+
+    # Extract benchmark result section
+    result_match = re.search(
+        r"============ Serving Benchmark Result ============.*?={50,}",
+        output,
+        re.DOTALL,
+    )
+    if result_match:
+        key_info.append(result_match.group(0))
+
+    return "\n\n".join(key_info)
+
+
+def extract_port_from_command(cmd: str, server_type: str) -> int:
+    port_match = re.search(r"--port[= ](\d+)", cmd)
+    if port_match:
+        return int(port_match.group(1))
+    return SERVER_DEFAULTS.get(server_type, ServerConfig("", [], 8000)).default_port
+
+
+def detect_server_type(cmd: str) -> str:
+    for server_type, config in SERVER_DEFAULTS.items():
+        if config.command in cmd:
+            return server_type
+    return "unknown"
+
+
+def stream_output(
+    process: subprocess.Popen, prefix: str, logger: logging.Logger
+) -> queue.Queue:
+    output_queue = queue.Queue()
+
+    def stream_pipe(pipe, prefix):
+        for line in iter(pipe.readline, ""):
+            if prefix == "CLIENT":
+                output_queue.put(line.rstrip())
+            logger.debug(f"{prefix} | {line.rstrip()}")
+
+    stdout_thread = threading.Thread(
+        target=stream_pipe, args=(process.stdout, prefix), daemon=True
+    )
+    stderr_thread = threading.Thread(
+        target=stream_pipe, args=(process.stderr, prefix), daemon=True
+    )
+
+    stdout_thread.start()
+    stderr_thread.start()
+    return output_queue, (stdout_thread, stderr_thread)
+
+
+class ProcessManager:
+    def __init__(self):
+        self.server_process: Optional[subprocess.Popen] = None
+        self.client_process: Optional[subprocess.Popen] = None
+        self.logger = logging.getLogger(__name__)
+
+    def start_process(
+        self, command: str, prefix: str
+    ) -> Tuple[subprocess.Popen, queue.Queue]:
+        process = subprocess.Popen(
+            command,
+            shell=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            bufsize=1,
+        )
+
+        output_queue, threads = stream_output(process, prefix, self.logger)
+        return process, output_queue, threads
+
+    def kill_process_tree(self, process: subprocess.Popen):
+        try:
+            parent = psutil.Process(process.pid)
+            children = parent.children(recursive=True)
+
+            for child in children:
+                try:
+                    child.kill()
+                except psutil.NoSuchProcess:
+                    pass
+
+            parent.kill()
+            gone, alive = psutil.wait_procs(children + [parent], timeout=3)
+
+            for p in alive:
+                try:
+                    p.kill()
+                except psutil.NoSuchProcess:
+                    pass
+
+        except psutil.NoSuchProcess:
+            pass
+
+    def cleanup(self, process_names: List[str]):
+        if self.client_process:
+            self.kill_process_tree(self.client_process)
+            self.client_process = None
+
+        if self.server_process:
+            self.kill_process_tree(self.server_process)
+            self.server_process = None
+
+        for proc in psutil.process_iter(["pid", "name", "cmdline"]):
+            try:
+                cmdline = " ".join(proc.cmdline())
+                if any(name in cmdline for name in process_names):
+                    proc.kill()
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                continue
+
+
+class ExperimentRunner:
+    def __init__(self):
+        self.process_manager = ProcessManager()
+        self.logger = logging.getLogger(__name__)
+
+    def wait_for_server(self, port: int, timeout: int = 300) -> bool:
+        start_time = time.perf_counter()
+
+        while time.perf_counter() - start_time < timeout:
+            try:
+                response = requests.get(f"http://localhost:{port}/health")
+                if response.status_code == 200:
+                    self.logger.debug(f"Server ready on port {port}")
+                    return True
+            except requests.RequestException:
+                time.sleep(2)
+        return False
+
+    def run_task(self, config: TaskConfig) -> TaskResult:
+        start_time = time.perf_counter()
+        client_output = []
+
+        try:
+            if not config.server_type:
+                config.server_type = detect_server_type(config.server_cmd)
+
+            server_config = SERVER_DEFAULTS.get(config.server_type)
+            if not server_config:
+                raise ValueError(f"Unknown server type: {config.server_type}")
+
+            port = extract_port_from_command(config.server_cmd, config.server_type)
+
+            self.process_manager.cleanup(server_config.process_names)
+
+            self.logger.debug(f"Starting server: {config.name}")
+            self.process_manager.server_process, _, server_threads = (
+                self.process_manager.start_process(config.server_cmd, "SERVER")
+            )
+
+            if not self.wait_for_server(port):
+                raise TimeoutError("Server startup timeout")
+
+            time.sleep(10)
+
+            self.logger.debug("Starting client")
+            self.process_manager.client_process, output_queue, client_threads = (
+                self.process_manager.start_process(config.client_cmd, "CLIENT")
+            )
+
+            returncode = self.process_manager.client_process.wait()
+
+            while True:
+                try:
+                    line = output_queue.get_nowait()
+                    client_output.append(line)
+                except queue.Empty:
+                    break
+
+            if returncode != 0:
+                raise RuntimeError(f"Client failed with code {returncode}")
+
+            # Parse and format the output
+            full_output = "\n".join(client_output)
+            formatted_output = parse_key_info(full_output)
+
+            return TaskResult(
+                name=config.name,
+                success=True,
+                output=formatted_output,
+                runtime=time.perf_counter() - start_time,
+                timestamp=datetime.now().isoformat(),
+            )
+
+        except Exception as e:
+            return TaskResult(
+                name=config.name,
+                success=False,
+                output=str(e),
+                runtime=time.perf_counter() - start_time,
+                timestamp=datetime.now().isoformat(),
+            )
+
+        finally:
+            if config.server_type in SERVER_DEFAULTS:
+                self.process_manager.cleanup(
+                    SERVER_DEFAULTS[config.server_type].process_names
+                )
+            time.sleep(10)
+
+
+def load_config(config_path: str) -> List[TaskConfig]:
+    with open(config_path, "r") as f:
+        config_data = yaml.safe_load(f)
+
+    configs = []
+    for idx, entry in enumerate(config_data.get("tasks", [])):
+        if not isinstance(entry, dict):
+            raise ValueError(f"Invalid entry at index {idx}")
+
+        config = TaskConfig(
+            server_cmd=entry.get("server_cmd"),
+            client_cmd=entry.get("client_cmd"),
+            name=entry.get("name", f"task-{idx+1}"),
+            server_type=entry.get("server_type"),
+        )
+
+        if not config.server_cmd or not config.client_cmd:
+            raise ValueError(f"Missing commands in {config.name}")
+
+        configs.append(config)
+
+    return configs
+
+
+def setup_logging(debug: bool = False):
+    level = logging.DEBUG if debug else logging.INFO
+    logging.basicConfig(
+        level=level,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler(), logging.FileHandler("experiment.log")],
+    )
+
+
+def format_results(results: List[TaskResult]) -> str:
+    """Format experiment results in Markdown for GitHub step summary."""
+    output = ["# Experiment Results\n"]
+
+    for result in results:
+        output.append(f"## {result.name}")
+        output.append(f"**Status**: {'✅ Success' if result.success else '❌ Failed'}")
+        output.append(f"**Runtime**: {result.runtime:.2f} seconds")
+        output.append(f"**Timestamp**: {result.timestamp}")
+        output.append("\n**Output**:\n```")
+        output.append(result.output)
+        output.append("```\n")
+
+    return "\n".join(output)
+
+
+def get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
+
+
+def write_in_github_step_summary(results: List[TaskResult]):
+    """Write formatted results to GitHub step summary."""
+    if not os.environ.get("GITHUB_STEP_SUMMARY"):
+        logging.warning("GITHUB_STEP_SUMMARY environment variable not set")
+        return
+
+    formatted_content = format_results(results)
+    with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as f:
+        f.write(formatted_content)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Experiment Runner")
+    parser.add_argument(
+        "--config", type=str, required=True, help="Path to YAML config file"
+    )
+    parser.add_argument("--debug", action="store_true", help="Enable debug output")
+    args = parser.parse_args()
+
+    setup_logging(args.debug)
+    logger = logging.getLogger(__name__)
+    results = []
+
+    try:
+        configs = load_config(args.config)
+        runner = ExperimentRunner()
+
+        for config in configs:
+            logger.info(f"Running {config.name}")
+            result = runner.run_task(config)
+            results.append(result)
+
+        if get_bool_env_var("SGLANG_IS_IN_CI"):
+            write_in_github_step_summary(results)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/srt/hicache/test_hicache.py b/test/srt/hicache/test_hicache.py
new file mode 100644
index 000000000..f7616d098
--- /dev/null
+++ b/test/srt/hicache/test_hicache.py
@@ -0,0 +1,53 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import is_hip, kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+_is_hip = is_hip()
+
+
+class TestHiCache(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--enable-hierarchical-cache",
+                "--mem-fraction-static",
+                0.7,
+                "--hicache-size",
+                100 if not _is_hip else 200,
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/hicache/test_hicache_mla.py b/test/srt/hicache/test_hicache_mla.py
new file mode 100644
index 000000000..c5db0f74a
--- /dev/null
+++ b/test/srt/hicache/test_hicache_mla.py
@@ -0,0 +1,67 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import is_hip, kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+_is_hip = is_hip()
+if _is_hip:
+    hicache_args = ["--hicache-size", 200]
+else:
+    hicache_args = ["--hicache-ratio", 2]
+
+
+class TestHierarchicalMLA(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--enable-hierarchical-cache",
+            ]
+            + hicache_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.5)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.8)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/hicache/test_hicache_page.py b/test/srt/hicache/test_hicache_page.py
new file mode 100644
index 000000000..b1e1459c2
--- /dev/null
+++ b/test/srt/hicache/test_hicache_page.py
@@ -0,0 +1,51 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestHiCachePage(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--enable-hierarchical-cache",
+                "--page-size",
+                32,
+                "--hicache-write-policy",
+                "write_back",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/hicache/test_hicache_storage.py b/test/srt/hicache/test_hicache_storage.py
new file mode 100644
index 000000000..7bc947b8c
--- /dev/null
+++ b/test/srt/hicache/test_hicache_storage.py
@@ -0,0 +1,57 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import is_hip, kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+_is_hip = is_hip()
+
+
+class TestHiCache(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--enable-hierarchical-cache",
+                "--mem-fraction-static",
+                0.7,
+                "--hicache-size",
+                100 if not _is_hip else 200,
+                "--page-size",
+                "64",
+                "--hicache-storage-backend",
+                "file",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/hicache/test_hicache_storage_3fs_backend.py b/test/srt/hicache/test_hicache_storage_3fs_backend.py
new file mode 100644
index 000000000..d0f519075
--- /dev/null
+++ b/test/srt/hicache/test_hicache_storage_3fs_backend.py
@@ -0,0 +1,135 @@
+"""
+Benchmark tests for HiCache Storage with 3FS backend.
+Usage:
+    python3 -m pytest test/srt/hicache/test_hicache_storage_3fs_backend.py -v
+"""
+
+import json
+import os
+import time
+import unittest
+from types import SimpleNamespace
+
+from test_hicache_storage_file_backend import HiCacheStorageBaseMixin
+
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import CustomTestCase
+
+
+class HiCacheStorage3FSBackendBaseMixin(HiCacheStorageBaseMixin):
+    """Base mixin class with common setup and utilities"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        # Create a temporary JSON config file for HF3FS
+        hf3fs_config = {
+            "file_path_prefix": os.path.join(cls.temp_dir, "hicache"),
+            "file_size": 1024 * 1024 * 1024 * 2,
+            "numjobs": 2,
+            "entries": 8,
+            "use_mock_hf3fs_client": True,
+        }
+
+        # Write config to temporary file
+        config_file = os.path.join(cls.temp_dir, "hf3fs_config.json")
+        with open(config_file, "w") as f:
+            json.dump(hf3fs_config, f, indent=2)
+
+        server_args = {
+            "--tp-size": 1,
+            "--hicache-ratio": 1.2,
+            "--hicache-storage-backend": "hf3fs",
+            "--hicache-storage-backend-extra-config": json.dumps(hf3fs_config),
+        }
+
+        # Set the environment variable to point to our config file
+        env_vars = {
+            "SGLANG_HICACHE_HF3FS_CONFIG_PATH": config_file,
+        }
+
+        return server_args, env_vars
+
+
+class TestHf3fsBackendLayerFirstLayout(
+    HiCacheStorage3FSBackendBaseMixin, CustomTestCase
+):
+    """Layer first layout tests for HiCache-Hf3fs backend"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args, env_vars = super()._get_additional_server_args_and_env()
+        server_args["--hicache-mem-layout"] = "layer_first"
+        server_args["--hicache-io-backend"] = "direct"
+        return server_args, env_vars
+
+
+class TestHf3fsBackendPageFirstLayout(
+    HiCacheStorage3FSBackendBaseMixin, CustomTestCase
+):
+    """Page first layout tests for HiCache-Hf3fs backend"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args, env_vars = super()._get_additional_server_args_and_env()
+        server_args["--hicache-mem-layout"] = "page_first"
+        return server_args, env_vars
+
+
+class TestHf3fsBackendAccuracy(HiCacheStorage3FSBackendBaseMixin, CustomTestCase):
+    """Accuracy tests for HiCache-Hf3fs backend"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args, env_vars = super()._get_additional_server_args_and_env()
+        server_args["--hicache-ratio"] = 1.5
+        server_args["--tp-size"] = 2
+        return server_args, env_vars
+
+    def test_eval_accuracy(self):
+        """Test eval accuracy with cache persistence across cache flushes"""
+        print("\n=== Testing Eval Accuracy with Cache Persistence ===")
+
+        # First evaluation - populate cache
+        print("Phase 1: Running initial GSM8K evaluation to populate cache...")
+        args_initial = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=50,
+            max_new_tokens=512,
+            parallel=10,
+            host=f"http://{self.base_host}",
+            port=int(self.base_port),
+        )
+        metrics_initial = run_eval_few_shot_gsm8k(args_initial)
+
+        # Flush cache to force remote storage access
+        print("Phase 2: Flushing device cache...")
+        self.assertTrue(self.flush_cache(), "Cache flush should succeed")
+        time.sleep(2)
+
+        # Second evaluation - should use remote cache
+        print("Phase 3: Running second GSM8K evaluation using remote cache...")
+        metrics_cached = run_eval_few_shot_gsm8k(args_initial)
+
+        # Verify accuracy consistency
+        accuracy_diff = abs(metrics_initial["accuracy"] - metrics_cached["accuracy"])
+        print(f"Accuracy difference: {accuracy_diff:.4f}")
+
+        # Assertions
+        self.assertGreater(
+            metrics_initial["accuracy"], 0.6, "Initial accuracy should be reasonable"
+        )
+        self.assertGreater(
+            metrics_cached["accuracy"], 0.6, "Cached accuracy should be reasonable"
+        )
+        self.assertLess(
+            accuracy_diff, 0.05, "Accuracy should be consistent between cache states"
+        )
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/hicache/test_hicache_storage_file_backend.py b/test/srt/hicache/test_hicache_storage_file_backend.py
new file mode 100644
index 000000000..fc8a0e25d
--- /dev/null
+++ b/test/srt/hicache/test_hicache_storage_file_backend.py
@@ -0,0 +1,334 @@
+"""
+E2E tests for HiCache Storage functionality.
+Usage:
+    python3 -m pytest test/srt/hicache/test_hicache_storage_e2e.py -v
+"""
+
+import os
+import random
+import tempfile
+import time
+import unittest
+from types import SimpleNamespace
+from typing import Dict
+from urllib.parse import urlparse
+
+import requests
+
+from sglang.bench_serving import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class HiCacheStorageBaseMixin:
+    """Base mixin class with common setup and utilities"""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up test environment and launch server once for all tests"""
+        cls.temp_dir = tempfile.mkdtemp()
+        cls.model = cls._get_model_name()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+        parsed_url = urlparse(cls.base_url)
+        cls.base_host = parsed_url.hostname
+        cls.base_port = str(parsed_url.port)
+
+        # Prepare tokenizer for prompt generation
+        cls.tokenizer = get_tokenizer(cls.model)
+
+        # Launch server with HiCache enabled and cache report
+        cls.process = cls._launch_server_with_hicache()
+        cls._wait_for_server_ready()
+
+        print(f"Test server launched successfully at {cls.base_url}")
+        print(f"Cache directory: {cls.temp_dir}")
+
+    @classmethod
+    def tearDownClass(cls):
+        """Clean up test environment"""
+        kill_process_tree(cls.process.pid)
+
+        import shutil
+
+        shutil.rmtree(cls.temp_dir, ignore_errors=True)
+
+    @classmethod
+    def _get_model_name(cls):
+        """Get model name for the test configuration - override in subclasses"""
+        return DEFAULT_MODEL_NAME_FOR_TEST
+
+    @classmethod
+    def _get_base_server_args(cls):
+        """Get base server arguments - can be extended in subclasses"""
+        return {
+            "--enable-hierarchical-cache": True,
+            "--mem-fraction-static": 0.6,
+            "--hicache-ratio": 1.2,
+            "--page-size": 64,
+            "--enable-cache-report": True,
+            "--hicache-storage-prefetch-policy": "wait_complete",
+            "--hicache-storage-backend": "file",
+        }
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        return {}, {"SGLANG_HICACHE_FILE_BACKEND_STORAGE_DIR": cls.temp_dir}
+
+    @classmethod
+    def _launch_server_with_hicache(cls):
+        """Launch server with HiCache enabled"""
+
+        additional_server_args, env_vars = cls._get_additional_server_args_and_env()
+        server_args = cls._get_base_server_args()
+        if additional_server_args:
+            server_args.update(additional_server_args)
+
+        final_server_args = []
+        for k, v in server_args.items():
+            if isinstance(v, bool):
+                final_server_args.append(str(k))
+            else:
+                final_server_args.append(str(k))
+                final_server_args.append(str(v))
+
+        print(f"final_server_args: {final_server_args}")
+
+        env_vars = {
+            **os.environ,
+            **env_vars,
+        }
+
+        return popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=final_server_args,
+            env=env_vars,
+        )
+
+    @classmethod
+    def _wait_for_server_ready(cls, timeout: int = 60) -> bool:
+        """Wait for server to be ready"""
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                response = requests.get(f"{cls.base_url}/health", timeout=5)
+                if response.status_code == 200:
+                    return True
+            except requests.RequestException:
+                pass
+            time.sleep(2)
+        raise TimeoutError("Server failed to start within timeout")
+
+    def send_request(
+        self, prompt: str, max_tokens: int = 100, temperature: float = 0.0
+    ) -> Dict:
+        """Send a generate request and return response"""
+        response = requests.post(
+            f"{self.base_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": temperature,
+                    "max_new_tokens": max_tokens,
+                    "ignore_eos": True,
+                },
+            },
+            timeout=60,
+        )
+
+        self.assertEqual(
+            response.status_code,
+            200,
+            f"Request failed: {response.status_code} - {response.text}",
+        )
+        return response.json()
+
+    def get_cached_tokens(self, response_json: Dict) -> int:
+        """Extract cached tokens count from /generate response"""
+        meta = response_json.get("meta_info", {})
+        return int(meta.get("cached_tokens", 0))
+
+    def flush_cache(self) -> bool:
+        """Flush device cache to force remote storage access"""
+        try:
+            response = requests.post(f"{self.base_url}/flush_cache", timeout=10)
+            return response.status_code == 200
+        except requests.RequestException:
+            return False
+
+    def gen_prompt(self, token_num: int) -> str:
+        """Generate a random prompt of specified token length using tokenizer vocabulary."""
+        all_available_tokens = list(self.tokenizer.get_vocab().values())
+        selected_tokens = random.choices(all_available_tokens, k=token_num)
+        return self.tokenizer.decode(selected_tokens)
+
+    def trigger_offloading_and_flush(self):
+        """Helper method to trigger offloading and flush cache"""
+        # Trigger offloading
+        self.send_request(self.gen_prompt(1), max_tokens=150)
+
+        # Flush device cache to force remote storage access
+        time.sleep(2)
+        self.assertTrue(self.flush_cache(), "Cache flush should succeed")
+
+    def test_basic_backup_and_prefetch(self):
+        """Test storage and retrieval of large context through remote cache"""
+        print("\n=== Testing Large Context Cache Storage & Retrieval ===")
+
+        # Generate substantial context that will be cached
+        base_prompt = self.gen_prompt(768)
+
+        # First request - populate cache
+        print("Step 1: Populating cache with large context...")
+        response1 = self.send_request(base_prompt, max_tokens=150)
+        self.assertIsNotNone(response1)
+
+        # Flush device cache to force remote storage access
+        self.trigger_offloading_and_flush()
+
+        # Second request with extended prompt - should hit remote cache
+        print("Step 2: Testing cache hit from remote storage...")
+
+        start_time = time.time()
+        response2 = self.send_request(base_prompt, max_tokens=150)
+        retrieval_time = time.time() - start_time
+
+        cached_tokens = self.get_cached_tokens(response2)
+        print(
+            f"Remote cache retrieval time: {retrieval_time:.3f}s, cached_tokens={cached_tokens}"
+        )
+
+        # Assert cached tokens indicate a remote hit
+        self.assertGreater(
+            cached_tokens, 700, "Expected significant cached tokens for remote hit"
+        )
+
+
+class TestHiCacheStorageTP(HiCacheStorageBaseMixin, CustomTestCase):
+    """Multi-TP tests for HiCache Storage functionality"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args = {"--tp-size": 2}
+        return server_args, {}
+
+
+class TestHiCacheStorageLayerFirstDirectIO(HiCacheStorageBaseMixin, CustomTestCase):
+    """Layer first direct tests for HiCache Storage functionality"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args = {
+            "--hicache-mem-layout": "layer_first",
+            "--hicache-io-backend": "direct",
+        }
+        return server_args, {}
+
+
+class TestHiCacheStoragePageFirstLayout(HiCacheStorageBaseMixin, CustomTestCase):
+    """Page first layout tests for HiCache Storage functionality"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args = {"--hicache-mem-layout": "page_first"}
+        return server_args, {}
+
+
+class TestHiCacheStorageMLA(HiCacheStorageBaseMixin, CustomTestCase):
+    """MLA Model tests for HiCache Storage functionality"""
+
+    @classmethod
+    def _get_model_name(cls):
+        """Use MLA model for testing"""
+        return DEFAULT_MLA_MODEL_NAME_FOR_TEST
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args = {"--tp-size": 2}
+        return server_args, {}
+
+
+class TestHiCacheStorageAccuracy(HiCacheStorageBaseMixin, CustomTestCase):
+    """Accuracy tests for HiCache Storage functionality"""
+
+    @classmethod
+    def _get_additional_server_args_and_env(cls):
+        """Get additional server arguments specific to configuration - override in subclasses"""
+        server_args = {"--tp-size": 2, "--hicache-ratio": 1.5}
+        return server_args, {}
+
+    def test_eval_accuracy(self):
+        """Test eval accuracy with cache persistence across cache flushes"""
+        print("\n=== Testing Eval Accuracy with Cache Persistence ===")
+
+        # First evaluation - populate cache
+        print("Phase 1: Running initial GSM8K evaluation to populate cache...")
+        args_initial = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=50,
+            max_new_tokens=512,
+            parallel=10,
+            host=f"http://{self.base_host}",
+            port=int(self.base_port),
+        )
+        metrics_initial = run_eval_few_shot_gsm8k(args_initial)
+
+        # Flush cache to force remote storage access
+        print("Phase 2: Flushing device cache...")
+        self.assertTrue(self.flush_cache(), "Cache flush should succeed")
+        time.sleep(2)
+
+        # Second evaluation - should use remote cache
+        print("Phase 3: Running second GSM8K evaluation using remote cache...")
+        metrics_cached = run_eval_few_shot_gsm8k(args_initial)
+
+        # Verify accuracy consistency
+        accuracy_diff = abs(metrics_initial["accuracy"] - metrics_cached["accuracy"])
+        print(f"Accuracy difference: {accuracy_diff:.4f}")
+
+        # Assertions
+        self.assertGreater(
+            metrics_initial["accuracy"], 0.6, "Initial accuracy should be reasonable"
+        )
+        self.assertGreater(
+            metrics_cached["accuracy"], 0.6, "Cached accuracy should be reasonable"
+        )
+        self.assertLess(
+            accuracy_diff, 0.05, "Accuracy should be consistent between cache states"
+        )
+
+
+# TODO: Add other backends tests（3fs/mooncake）
+# class TestHiCacheStorageMooncakeBackend(HiCacheStorageBaseTest):
+#     """Mooncake backend tests for HiCache Storage functionality"""
+
+#     @classmethod
+#     def _get_additional_server_args_and_env(cls):
+#         """Get additional server arguments specific to configuration - override in subclasses"""
+#         server_args = ["--hicache-storage-backend", "mooncake"]
+#         env = {
+#             "MOONCAKE_TE_META_DATA_SERVER": "http://127.0.0.1:8080/metadata",
+#             "MOONCAKE_MASTER": "127.0.0.1:50051"
+#             xxxxx
+#         }
+#         return server_args, {}
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/kv_cache_scales_llama3_1_8b.json b/test/srt/kv_cache_scales_llama3_1_8b.json
new file mode 100644
index 000000000..3e890e50e
--- /dev/null
+++ b/test/srt/kv_cache_scales_llama3_1_8b.json
@@ -0,0 +1,42 @@
+{
+    "model_type": "llama",
+    "kv_cache": {
+        "dtype": "float8_e4m3fn",
+        "scaling_factor": {
+            "0": {
+                "0": 1,
+                "1": 1,
+                "2": 1,
+                "3": 1,
+                "4": 1,
+                "5": 1,
+                "6": 1,
+                "7": 1,
+                "8": 1,
+                "9": 1,
+                "10": 1,
+                "11": 1,
+                "12": 1,
+                "13": 1,
+                "14": 1,
+                "15": 1,
+                "16": 1,
+                "17": 1,
+                "18": 1,
+                "19": 1,
+                "20": 1,
+                "21": 1,
+                "22": 1,
+                "23": 1,
+                "24": 1,
+                "25": 1,
+                "26": 1,
+                "27": 1,
+                "28": 1,
+                "29": 1,
+                "30": 1,
+                "31": 1
+            }
+        }
+    }
+}
diff --git a/test/srt/kv_cache_scales_llama3_8b.json b/test/srt/kv_cache_scales_llama3_8b.json
new file mode 100644
index 000000000..466b0d01a
--- /dev/null
+++ b/test/srt/kv_cache_scales_llama3_8b.json
@@ -0,0 +1,42 @@
+{
+    "model_type": "llama",
+    "kv_cache": {
+        "dtype": "float8_e4m3fn",
+        "scaling_factor": {
+            "0": {
+                "0": 0.0408,
+                "1": 0.0503,
+                "2": 0.0667,
+                "3": 0.0909,
+                "4": 0.1135,
+                "5": 0.127,
+                "6": 0.1768,
+                "7": 0.1488,
+                "8": 0.1135,
+                "9": 0.1203,
+                "10": 0.1013,
+                "11": 0.0842,
+                "12": 0.1231,
+                "13": 0.1096,
+                "14": 0.1221,
+                "15": 0.1013,
+                "16": 0.1067,
+                "17": 0.0952,
+                "18": 0.0899,
+                "19": 0.097,
+                "20": 0.087,
+                "21": 0.0994,
+                "22": 0.0904,
+                "23": 0.1013,
+                "24": 0.1019,
+                "25": 0.1053,
+                "26": 0.1,
+                "27": 0.0894,
+                "28": 0.1013,
+                "29": 0.1488,
+                "30": 0.0766,
+                "31": 0.0821
+            }
+        }
+    }
+}
diff --git a/test/srt/kv_cache_scales_qwen2_1_5b.json b/test/srt/kv_cache_scales_qwen2_1_5b.json
new file mode 100644
index 000000000..984747509
--- /dev/null
+++ b/test/srt/kv_cache_scales_qwen2_1_5b.json
@@ -0,0 +1,38 @@
+{
+    "model_type": "qwen",
+    "kv_cache": {
+        "dtype": "float8_e4m3fn",
+        "scaling_factor": {
+            "0": {
+                "0": 0.9846,
+                 "1": 0.0645,
+                 "2": 0.0731,
+                 "3": 0.0800,
+                 "4": 0.0748,
+                 "5": 0.0780,
+                 "6": 0.0702,
+                 "7": 0.0894,
+                 "8": 0.0410,
+                 "9": 0.0758,
+                 "10": 0.0556,
+                 "11": 0.0731,
+                 "12": 0.0899,
+                 "13": 0.0780,
+                 "14": 0.1441,
+                 "15": 0.0914,
+                 "16": 0.5614,
+                 "17": 0.1067,
+                 "18": 0.0537,
+                 "19": 0.0658,
+                 "20": 0.0523,
+                 "21": 0.0533,
+                 "22": 0.0699,
+                 "23": 0.0635,
+                 "24": 0.0588,
+                 "25": 0.0884,
+                 "26": 0.0947,
+                 "27": 0.1032
+            }
+        }
+    }
+}
diff --git a/test/srt/lora/test_lora.py b/test/srt/lora/test_lora.py
new file mode 100644
index 000000000..ab1c630fc
--- /dev/null
+++ b/test/srt/lora/test_lora.py
@@ -0,0 +1,172 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import os
+import random
+import unittest
+from typing import List
+
+import torch
+from utils import (
+    ALL_OTHER_MULTI_LORA_MODELS,
+    CI_MULTI_LORA_MODELS,
+    TORCH_DTYPES,
+    LoRAModelCase,
+    ensure_reproducibility,
+)
+
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase, calculate_rouge_l, is_in_ci
+
+TEST_MULTIPLE_BATCH_PROMPTS = [
+    """
+    ### Instruction:
+    Tell me about llamas and alpacas
+    ### Response:
+    Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids (camels, dromedaries). Llamas live in the Andean mountains of South America where they graze on grasses and shrubs. Alpaca is another name for domesticated llama. The word "alpaca" comes from an Incan language meaning "golden fleece." Alpacas look very similar to llamas but are smaller than their wild relatives. Both species were used by ancient people as pack animals and for meat. Today both llamas and alpacas are raised primarily for their fiber which can be spun into yarn or knitted into clothing.
+    ### Question 2:
+    What do you know about llamas?
+    ### Answer:
+    """,
+    """
+    ### Instruction:
+    Write a poem about the transformers Python library.
+    Mention the word "large language models" in that poem.
+    ### Response:
+    The Transformers are large language models,
+    They're used to make predictions on text.
+    """,
+    "AI is a field of computer science focused on",
+    "Computer science is the study of",
+    "Write a short story.",
+    "What are the main components of a computer?",
+]
+
+
+class TestLoRA(CustomTestCase):
+    def _create_test_samples(
+        self, lora_adapter_paths: List[str], repeated_trials: int = 3
+    ):
+        random.seed(42)  # Ensure reproducibility
+
+        patterns = [
+            [None, lora_adapter_paths[0], lora_adapter_paths[1]],
+            [lora_adapter_paths[0], None, lora_adapter_paths[1]],
+            [lora_adapter_paths[0], lora_adapter_paths[1], None],
+            [None, lora_adapter_paths[1], None],
+            [None, None, None],
+        ]
+
+        batches = [
+            [random.choice(pattern) for _ in range(3)]
+            for pattern in patterns
+            for _ in range(repeated_trials)
+        ]
+
+        return batches
+
+    def _run_lora_multiple_batch_on_model_cases(self, model_cases: List[LoRAModelCase]):
+        for model_case in model_cases:
+            for torch_dtype in TORCH_DTYPES:
+                max_new_tokens = 32
+                backend = "triton"
+                base_path = model_case.base
+                lora_adapter_paths = [a.name for a in model_case.adaptors]
+                assert len(lora_adapter_paths) >= 2
+
+                print(
+                    f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
+                )
+
+                # Initialize runners
+                srt_runner = SRTRunner(
+                    base_path,
+                    torch_dtype=torch_dtype,
+                    model_type="generation",
+                    lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
+                    max_loras_per_batch=len(lora_adapter_paths) + 1,
+                    lora_backend=backend,
+                    sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
+                    attention_backend="torch_native",
+                )
+                hf_runner = HFRunner(
+                    base_path, torch_dtype=torch_dtype, model_type="generation"
+                )
+
+                batches = self._create_test_samples(lora_adapter_paths)
+                with srt_runner, hf_runner:
+                    for i, lora_paths in enumerate(batches, start=1):
+                        prompts = [
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS) for _ in range(3)
+                        ]
+                        print(
+                            f"\n--- Running Batch {i} --- prompts: {prompts}, lora_paths: {lora_paths}"
+                        )
+
+                        ensure_reproducibility()
+                        srt_outputs = srt_runner.batch_forward(
+                            prompts,
+                            max_new_tokens=max_new_tokens,
+                            lora_paths=lora_paths,
+                        )
+
+                        ensure_reproducibility()
+                        hf_outputs = hf_runner.forward(
+                            prompts,
+                            max_new_tokens=max_new_tokens,
+                            lora_paths=lora_paths,
+                        )
+
+                        print("SRT outputs:", [s for s in srt_outputs.output_strs])
+                        print("HF outputs:", [s for s in hf_outputs.output_strs])
+
+                        for srt_out, hf_out in zip(
+                            srt_outputs.output_strs, hf_outputs.output_strs
+                        ):
+                            srt_str = srt_out.strip()
+                            hf_str = hf_out.strip()
+                            rouge_tol = model_case.rouge_l_tolerance
+                            rouge_score = calculate_rouge_l([srt_str], [hf_str])[0]
+                            if rouge_score < rouge_tol:
+                                raise AssertionError(
+                                    f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
+                                    f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
+                                )
+
+                        print(f"--- Batch {i} Comparison Passed --- ")
+
+    def test_ci_lora_models(self):
+        self._run_lora_multiple_batch_on_model_cases(CI_MULTI_LORA_MODELS)
+
+    def test_all_lora_models(self):
+        if is_in_ci():
+            return
+
+        filtered_models = []
+        for model_case in ALL_OTHER_MULTI_LORA_MODELS:
+            if "ONLY_RUN" in os.environ and os.environ["ONLY_RUN"] != model_case.base:
+                continue
+            filtered_models.append(model_case)
+
+        self._run_lora_multiple_batch_on_model_cases(filtered_models)
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_lora_backend.py b/test/srt/lora/test_lora_backend.py
new file mode 100644
index 000000000..2217e8dd6
--- /dev/null
+++ b/test/srt/lora/test_lora_backend.py
@@ -0,0 +1,76 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import os
+import unittest
+from typing import List
+
+from utils import (
+    ALL_OTHER_LORA_MODELS,
+    BACKENDS,
+    CI_LORA_MODELS,
+    DEFAULT_PROMPTS,
+    TORCH_DTYPES,
+    LoRAModelCase,
+    run_lora_test_one_by_one,
+)
+
+from sglang.test.test_utils import CustomTestCase, is_in_ci
+
+
+class TestLoRABackend(CustomTestCase):
+
+    def _run_backend_on_model_cases(self, model_cases: List[LoRAModelCase]):
+        for model_case in model_cases:
+            # If skip_long_prompt is True, filter out prompts longer than 1000 characters
+            prompts = (
+                DEFAULT_PROMPTS
+                if not model_case.skip_long_prompt
+                else [p for p in DEFAULT_PROMPTS if len(p) < 1000]
+            )
+            for torch_dtype in TORCH_DTYPES:
+                for backend in BACKENDS:
+                    run_lora_test_one_by_one(
+                        prompts,
+                        model_case,
+                        torch_dtype,
+                        max_new_tokens=32,
+                        backend=backend,
+                    )
+
+    def test_ci_lora_models(self):
+        self._run_backend_on_model_cases(CI_LORA_MODELS)
+
+    def test_all_lora_models(self):
+        if is_in_ci():
+            return
+
+        # Retain ONLY_RUN check here
+        filtered_models = []
+        for model_case in ALL_OTHER_LORA_MODELS:
+            if "ONLY_RUN" in os.environ and os.environ["ONLY_RUN"] != model_case.base:
+                continue
+            filtered_models.append(model_case)
+
+        self._run_backend_on_model_cases(filtered_models)
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_lora_cuda_graph.py b/test/srt/lora/test_lora_cuda_graph.py
new file mode 100644
index 000000000..ba68df59a
--- /dev/null
+++ b/test/srt/lora/test_lora_cuda_graph.py
@@ -0,0 +1,110 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import os
+import unittest
+from typing import List
+
+from utils import (
+    ALL_OTHER_LORA_MODELS,
+    CI_LORA_MODELS,
+    DEFAULT_PROMPTS,
+    TORCH_DTYPES,
+    LoRAModelCase,
+    run_lora_test_by_batch,
+    run_lora_test_one_by_one,
+)
+
+from sglang.test.test_utils import CustomTestCase, is_in_ci
+
+TEST_CUDA_GRAPH_PADDING_PROMPTS = [
+    "AI is a field of computer science focused on",
+    """
+    ### Instruction:
+    Tell me about llamas and alpacas
+    ### Response:
+    Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids (camels, dromedaries). Llamas live in the Andean mountains of South America where they graze on grasses and shrubs. Alpaca is another name for domesticated llama. The word "alpaca" comes from an Incan language meaning "golden fleece." Alpacas look very similar to llamas but are smaller than their wild relatives. Both species were used by ancient people as pack animals and for meat. Today both llamas and alpacas are raised primarily for their fiber which can be spun into yarn or knitted into clothing.
+    ### Question 2:
+    What do you know about llamas?
+    ### Answer:
+    """,
+    "Computer science is the study of",
+]
+
+
+class TestLoRACudaGraph(CustomTestCase):
+
+    def _run_without_cuda_graph_on_model_cases(self, model_cases: List[LoRAModelCase]):
+        # Since we have already enabled CUDA graph by default in other lora tests,
+        # we only need to run lora tests without CUDA graph here.
+        for model_case in model_cases:
+            # If skip_long_prompt is True, filter out prompts longer than 1000 characters
+            prompts = (
+                DEFAULT_PROMPTS
+                if not model_case.skip_long_prompt
+                else [p for p in DEFAULT_PROMPTS if len(p) < 1000]
+            )
+            for torch_dtype in TORCH_DTYPES:
+                run_lora_test_one_by_one(
+                    prompts,
+                    model_case,
+                    torch_dtype,
+                    max_new_tokens=32,
+                    backend="triton",
+                    disable_cuda_graph=True,
+                    test_tag="without_cuda_graph",
+                )
+
+    def _run_cuda_graph_padding_on_model_cases(self, model_cases: List[LoRAModelCase]):
+        for model_case in model_cases:
+            # Run a batch size of 3, which will not be captured by CUDA graph and need padding
+            prompts = TEST_CUDA_GRAPH_PADDING_PROMPTS
+            for torch_dtype in TORCH_DTYPES:
+                run_lora_test_by_batch(
+                    prompts,
+                    model_case,
+                    torch_dtype,
+                    max_new_tokens=32,
+                    backend="triton",
+                    disable_cuda_graph=False,
+                    test_tag="cuda_graph_padding",
+                )
+
+    def test_ci_lora_models(self):
+        self._run_without_cuda_graph_on_model_cases(CI_LORA_MODELS)
+        self._run_cuda_graph_padding_on_model_cases(CI_LORA_MODELS)
+
+    def test_all_lora_models(self):
+        if is_in_ci():
+            return
+
+        # Retain ONLY_RUN check here
+        filtered_models = []
+        for model_case in ALL_OTHER_LORA_MODELS:
+            if "ONLY_RUN" in os.environ and os.environ["ONLY_RUN"] != model_case.base:
+                continue
+            filtered_models.append(model_case)
+
+        self._run_without_cuda_graph_on_model_cases(filtered_models)
+        self._run_cuda_graph_padding_on_model_cases(filtered_models)
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_lora_eviction.py b/test/srt/lora/test_lora_eviction.py
new file mode 100644
index 000000000..d27b11906
--- /dev/null
+++ b/test/srt/lora/test_lora_eviction.py
@@ -0,0 +1,146 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import contextlib
+import multiprocessing as mp
+import unittest
+from typing import Dict, List, Tuple
+
+import torch
+
+from sglang.test.runners import SRTRunner
+from sglang.test.test_utils import CustomTestCase
+
+PROMPTS = [
+    "AI is a field of computer science focused on",
+    """
+    ### Instruction:
+    Compose a SQL query that uses the following table: users, and returns the user_id and name of all users whose name that does not have a duplicate in the table.
+    ### Response:
+    SELECT user_id, name FROM users WHERE name LIKE 'A%';
+    """,
+]
+
+ADAPTERS = [
+    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",  # target_modules = q, v
+    "philschmid/code-llama-3-1-8b-text-to-sql-lora",  # target_modules = q, k, v, o, gate, up, down
+]
+
+BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
+
+@contextlib.contextmanager
+def dynamically_loaded_adapter(runner, lora_path: str, lora_name: str):
+    """A context manager to load and automatically unload a LoRA adapter."""
+    try:
+        runner.load_lora_adapter(lora_name=lora_name, lora_path=lora_path)
+        yield
+    finally:
+        runner.unload_lora_adapter(lora_name=lora_name)
+
+
+class TestLoRAEviction(CustomTestCase):
+    def test_lora_eviction_with_different_target_modules(self):
+        """
+        Test LoRA eviction with different target modules.
+
+        This test runs inference against two LoRA adapters in different orders to force eviction behavior, and ensures
+        that the outputs of the same (adapter, prompt) pair are consistent across runs.
+        """
+        output_history = {}
+        self._run_test(ADAPTERS, output_history, reverse=False)
+        self._run_test(ADAPTERS, output_history, reverse=True)
+
+    def test_lora_eviction_with_reused_lora_name(self):
+        """
+        Test LoRA eviction with reused LoRA names.
+
+        This test runs inference against two LoRA adapters with the same name to ensure that the eviction behavior
+        works correctly when reusing LoRA names.
+        """
+        output_history = {}
+        self._run_test(ADAPTERS, output_history, reuse_lora_name=True, repeat=1)
+        self._run_test(ADAPTERS, output_history, reuse_lora_name=False, repeat=1)
+
+    def _run_test(
+        self,
+        lora_paths: List[str],
+        output_history: Dict[Tuple[str, str], str],
+        reverse: bool = False,
+        repeat: int = 2,
+        reuse_lora_name: bool = False,
+    ):
+        REUSED_LORA_NAME = "lora"
+        max_new_tokens = 256
+        backend = "triton"
+        torch_dtype = torch.float16
+        base_path = BASE_MODEL
+        assert len(lora_paths) >= 2
+
+        initial_lora_paths = lora_paths if not reuse_lora_name else None
+        # Initialize runners
+        with SRTRunner(
+            base_path,
+            torch_dtype=torch_dtype,
+            model_type="generation",
+            lora_paths=initial_lora_paths,
+            max_loras_per_batch=1,
+            lora_backend=backend,
+            enable_lora=True,
+            max_lora_rank=256,
+            lora_target_modules=["all"],
+        ) as srt_runner:
+            adapter_sequence = lora_paths if not reverse else lora_paths[::-1]
+
+            for i in range(repeat):
+                for j, lora_path in enumerate(adapter_sequence):
+                    print(
+                        f"\n========== Testing LoRA eviction with adapter '{lora_path}' (#{j + 1}/{len(adapter_sequence)}), reuse_lora_name: {reuse_lora_name}, reversed: {reverse}, repeat: {i + 1}/{repeat} ---"
+                    )
+
+                    lora_name = REUSED_LORA_NAME if reuse_lora_name else lora_path
+                    context = (
+                        dynamically_loaded_adapter(srt_runner, lora_path, lora_name)
+                        if reuse_lora_name
+                        else contextlib.nullcontext()
+                    )
+                    with context:
+                        for prompt in PROMPTS:
+                            print("\nprompt:\n", prompt)
+                            srt_outputs = srt_runner.forward(
+                                [prompt],
+                                max_new_tokens=max_new_tokens,
+                                lora_paths=[lora_name],
+                            )
+                            output = srt_outputs.output_strs[0].strip()
+                            print("\noutput:\n", output)
+
+                            prev_output = output_history.get((lora_path, prompt))
+                            if prev_output is not None:
+                                self.assertEqual(
+                                    prev_output,
+                                    output,
+                                    f"Output mismatch for adapter {lora_path} and prompt '{prompt}' on repeat {j + 1}, previous: '{prev_output}', current: '{output}'.",
+                                )
+                            else:
+                                output_history[(lora_path, prompt)] = output
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_lora_llama4.py b/test/srt/lora/test_lora_llama4.py
new file mode 100644
index 000000000..65a4b766f
--- /dev/null
+++ b/test/srt/lora/test_lora_llama4.py
@@ -0,0 +1,61 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+MODELS = [
+    SimpleNamespace(
+        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+        tp_size=8,
+    ),
+]
+
+
+class TestLlama4LoRA(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+    def test_bringup(self):
+        for model in MODELS:
+            try:
+                process = popen_launch_server(
+                    model.model,
+                    self.base_url,
+                    timeout=3 * DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        "--enable-lora",
+                        "--max-lora-rank",
+                        "64",
+                        "--lora-target-modules",
+                        "all",
+                        "--tp-size",
+                        str(model.tp_size),
+                        "--context-length",
+                        "1048576",
+                        "--attention-backend",
+                        "fa3",
+                    ],
+                )
+            except Exception as e:
+                print(f"Error testing {model.model}: {e}")
+                self.fail(f"Test failed for {model.model}: {e}")
+
+            finally:
+                # Ensure process cleanup happens regardless of success/failure
+                if process is not None and process.poll() is None:
+                    print(f"Cleaning up process {process.pid}")
+                    try:
+                        kill_process_tree(process.pid)
+                    except Exception as e:
+                        print(f"Error killing process: {e}")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/lora/test_lora_qwen3.py b/test/srt/lora/test_lora_qwen3.py
new file mode 100644
index 000000000..f77156707
--- /dev/null
+++ b/test/srt/lora/test_lora_qwen3.py
@@ -0,0 +1,212 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import os
+import random
+import unittest
+from typing import List
+
+from utils import TORCH_DTYPES, LoRAAdaptor, LoRAModelCase, ensure_reproducibility
+
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase, calculate_rouge_l, is_in_ci
+
+LORA_MODELS_QWEN3 = [
+    LoRAModelCase(
+        base="Qwen/Qwen3-4B",
+        adaptors=[
+            LoRAAdaptor(
+                name="nissenj/Qwen3-4B-lora-v2",
+                prefill_tolerance=3e-1,
+            ),
+            LoRAAdaptor(
+                name="y9760210/Qwen3-4B-lora_model",
+                prefill_tolerance=3e-1,
+            ),
+        ],
+        max_loras_per_batch=2,
+    ),
+]
+
+
+TEST_MULTIPLE_BATCH_PROMPTS = [
+    """
+    ### Instruction:
+    Tell me about llamas and alpacas
+    ### Response:
+    Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids (camels, dromedaries). Llamas live in the Andean mountains of South America where they graze on grasses and shrubs. Alpaca is another name for domesticated llama. The word "alpaca" comes from an Incan language meaning "golden fleece." Alpacas look very similar to llamas but are smaller than their wild relatives. Both species were used by ancient people as pack animals and for meat. Today both llamas and alpacas are raised primarily for their fiber which can be spun into yarn or knitted into clothing.
+    ### Question 2:
+    What do you know about llamas?
+    ### Answer:
+    """,
+    """
+    ### Instruction:
+    Write a poem about the transformers Python library.
+    Mention the word "large language models" in that poem.
+    ### Response:
+    The Transformers are large language models,
+    They're used to make predictions on text.
+    """,
+    "AI is a field of computer science focused on",
+    "Computer science is the study of",
+    "Write a short story.",
+    "What are the main components of a computer?",
+]
+
+
+class TestLoRAQwen3(CustomTestCase):
+    def _run_lora_multiple_batch_on_model_cases(self, model_cases: List[LoRAModelCase]):
+        for model_case in model_cases:
+            for torch_dtype in TORCH_DTYPES:
+                max_new_tokens = 32
+                backend = "triton"
+                base_path = model_case.base
+                lora_adapter_paths = [a.name for a in model_case.adaptors]
+                assert len(lora_adapter_paths) >= 2
+
+                batches = [
+                    (
+                        [
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                        ],
+                        [
+                            None,
+                            lora_adapter_paths[0],
+                            lora_adapter_paths[1],
+                        ],
+                    ),
+                    (
+                        [
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                        ],
+                        [
+                            lora_adapter_paths[0],
+                            None,
+                            lora_adapter_paths[1],
+                        ],
+                    ),
+                    (
+                        [
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                        ],
+                        [lora_adapter_paths[0], lora_adapter_paths[1], None],
+                    ),
+                    (
+                        [
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                        ],
+                        [None, lora_adapter_paths[1], None],
+                    ),
+                    (
+                        [
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                            random.choice(TEST_MULTIPLE_BATCH_PROMPTS),
+                        ],
+                        [None, None, None],
+                    ),
+                ]
+
+                print(
+                    f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
+                )
+
+                # Initialize runners
+                ensure_reproducibility()
+                srt_runner = SRTRunner(
+                    base_path,
+                    torch_dtype=torch_dtype,
+                    model_type="generation",
+                    lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
+                    max_loras_per_batch=len(lora_adapter_paths) + 1,
+                    lora_backend=backend,
+                    sleep_on_idle=True,  # Eliminate non-determinism by forcing all requests to be processed in one batch.
+                    attention_backend="torch_native",
+                )
+
+                ensure_reproducibility()
+                hf_runner = HFRunner(
+                    base_path,
+                    torch_dtype=torch_dtype,
+                    model_type="generation",
+                    patch_model_do_sample_false=True,
+                )
+
+                with srt_runner, hf_runner:
+                    for i, (prompts, lora_paths) in enumerate(batches):
+                        print(
+                            f"\n--- Running Batch {i+1} --- prompts: {prompts}, lora_paths: {lora_paths}"
+                        )
+
+                        srt_outputs = srt_runner.batch_forward(
+                            prompts,
+                            max_new_tokens=max_new_tokens,
+                            lora_paths=lora_paths,
+                        )
+
+                        hf_outputs = hf_runner.forward(
+                            prompts,
+                            max_new_tokens=max_new_tokens,
+                            lora_paths=lora_paths,
+                        )
+
+                        print("SRT outputs:", [s for s in srt_outputs.output_strs])
+                        print("HF outputs:", [s for s in hf_outputs.output_strs])
+
+                        for srt_out, hf_out in zip(
+                            srt_outputs.output_strs, hf_outputs.output_strs
+                        ):
+                            srt_str = srt_out.strip()
+                            hf_str = hf_out.strip()
+                            rouge_tol = model_case.rouge_l_tolerance
+                            rouge_score = calculate_rouge_l([srt_str], [hf_str])[0]
+                            if rouge_score < rouge_tol:
+                                raise AssertionError(
+                                    f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
+                                    f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
+                                )
+
+                        print(f"--- Batch {i+1} Comparison Passed --- ")
+
+    def test_ci_lora_models(self):
+        self._run_lora_multiple_batch_on_model_cases(LORA_MODELS_QWEN3)
+
+    def test_all_lora_models(self):
+        if is_in_ci():
+            return
+        qwen_filtered_models = []
+        for model_case in LORA_MODELS_QWEN3:
+            if "ONLY_RUN" in os.environ and os.environ["ONLY_RUN"] != model_case.base:
+                continue
+            qwen_filtered_models.append(model_case)
+
+        self._run_lora_multiple_batch_on_model_cases(qwen_filtered_models)
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_lora_radix_cache.py b/test/srt/lora/test_lora_radix_cache.py
new file mode 100644
index 000000000..d3ecb219c
--- /dev/null
+++ b/test/srt/lora/test_lora_radix_cache.py
@@ -0,0 +1,83 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import random
+import unittest
+
+import torch
+from utils import CI_MULTI_LORA_MODELS, DEFAULT_PROMPTS, run_lora_test_one_by_one
+
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase
+
+PROMPTS = [
+    "AI is a field of computer science focused on",
+    """
+    ### Instruction:
+    Tell me about llamas and alpacas
+    ### Response:
+    Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids.
+    ### Question:
+    What do you know about llamas?
+    ### Answer:
+    """,
+]
+
+
+class TestLoRARadixCache(CustomTestCase):
+
+    def test_lora_radix_cache(self):
+        # Here we need a model case with multiple adaptors for testing correctness of radix cache
+        model_case = CI_MULTI_LORA_MODELS[0]
+
+        torch_dtype = torch.float16
+        max_new_tokens = 32
+        backend = "triton"
+        batch_prompts = (
+            PROMPTS
+            if not model_case.skip_long_prompt
+            else [p for p in PROMPTS if len(p) < 1000]
+        )
+
+        # Test lora with radix cache
+        run_lora_test_one_by_one(
+            batch_prompts,
+            model_case,
+            torch_dtype,
+            max_new_tokens=max_new_tokens,
+            backend=backend,
+            disable_radix_cache=False,
+            test_tag="lora-with-radix-cache",
+        )
+
+        # Test lora without radix cache
+        run_lora_test_one_by_one(
+            batch_prompts,
+            model_case,
+            torch_dtype,
+            max_new_tokens=max_new_tokens,
+            backend=backend,
+            disable_radix_cache=True,
+            test_tag="lora-without-radix-cache",
+        )
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_lora_tp.py b/test/srt/lora/test_lora_tp.py
new file mode 100644
index 000000000..51a552d78
--- /dev/null
+++ b/test/srt/lora/test_lora_tp.py
@@ -0,0 +1,78 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import os
+import unittest
+from typing import List
+
+from utils import (
+    ALL_OTHER_LORA_MODELS,
+    CI_LORA_MODELS,
+    DEFAULT_PROMPTS,
+    TORCH_DTYPES,
+    LoRAModelCase,
+    run_lora_test_one_by_one,
+)
+
+from sglang.test.test_utils import CustomTestCase, is_in_ci
+
+
+class TestLoRATP(CustomTestCase):
+
+    def _run_tp_on_model_cases(self, model_cases: List[LoRAModelCase]):
+        tp_list = [2]  # Define TP sizes to iterate over
+        for model_case in model_cases:
+            # If skip_long_prompt is True, filter out prompts longer than 1000 characters
+            prompts = (
+                DEFAULT_PROMPTS
+                if not model_case.skip_long_prompt
+                else [p for p in DEFAULT_PROMPTS if len(p) < 1000]
+            )
+            for tp_size in tp_list:
+                model_case.tp_size = tp_size
+                for torch_dtype in TORCH_DTYPES:
+                    run_lora_test_one_by_one(
+                        prompts,
+                        model_case,
+                        torch_dtype,
+                        max_new_tokens=32,
+                        backend="triton",
+                        test_tag=f"tp={tp_size}",
+                    )
+
+    def test_ci_lora_models(self):
+        self._run_tp_on_model_cases(CI_LORA_MODELS)
+
+    def test_all_lora_models(self):
+        if is_in_ci():
+            return
+
+        # Retain ONLY_RUN check here
+        filtered_models = []
+        for model_case in ALL_OTHER_LORA_MODELS:
+            if "ONLY_RUN" in os.environ and os.environ["ONLY_RUN"] != model_case.base:
+                continue
+            filtered_models.append(model_case)
+
+        self._run_tp_on_model_cases(filtered_models)
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_lora_update.py b/test/srt/lora/test_lora_update.py
new file mode 100644
index 000000000..f2058f466
--- /dev/null
+++ b/test/srt/lora/test_lora_update.py
@@ -0,0 +1,1306 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import json
+import multiprocessing as mp
+import unittest
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Iterable, List, Optional, Union
+
+import requests
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.runners import SRTRunner
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+PROMPTS = [
+    "SGL is a",
+    "AI is a field of computer science focused on",
+    "Computer science is the study of",
+    "Write a short story.",
+    "What are the main components of a computer?",
+]
+
+MEM_FRACTION_STATIC = 0.8
+
+
+class OperationType(Enum):
+    LOAD = "load"
+    UNLOAD = "unload"
+    FORWARD = "forward"
+
+
+@dataclass
+class Operation:
+    # Operation type, can be LOAD, UNLOAD, FORWARD
+    type: OperationType
+    # Data associated with the operation. Exact type varies depending on the operation
+    data: Optional[Any]
+    # If the operation is expected to fail, this is the error message to expect
+    expected_error: Optional[str] = None
+
+
+@dataclass
+class TestCase:
+    description: str
+    base: str
+    max_loras_per_batch: int
+    all_adapters: List[str]
+    op_sequence: List[Operation]
+    initial_adapters: Optional[List[str]] = None
+    enable_lora: Optional[bool] = None
+    max_lora_rank: Optional[int] = None
+    lora_target_modules: Optional[List] = None
+    max_new_tokens: int = 32
+    max_loaded_loras: Optional[int] = None
+
+
+def create_batch_data(adapters: Union[str, list]) -> List[tuple[str, str]]:
+    if not isinstance(adapters, list):
+        adapters = [adapters]
+    return [(prompt, adapter) for prompt in PROMPTS for adapter in adapters]
+
+
+BASIC_TESTS = [
+    TestCase(
+        description="dynamic lora update with initial lora_paths",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=3,
+        all_adapters=[
+            "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            "pbevan11/llama-3.1-8b-ocr-correction",
+        ],
+        initial_adapters=[
+            # Testing 3 supported lora-path formats.
+            "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            {
+                "lora_name": "pbevan11/llama-3.1-8b-ocr-correction",
+                "lora_path": "pbevan11/llama-3.1-8b-ocr-correction",
+                "pinned": False,
+            },
+        ],
+        op_sequence=[
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"
+                ),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("pbevan11/llama-3.1-8b-ocr-correction"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"
+                ),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("pbevan11/llama-3.1-8b-ocr-correction"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    None,
+                ),
+            ),
+        ],
+    ),
+    TestCase(
+        description="dynamic lora update without initial lora_paths",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        enable_lora=True,
+        max_lora_rank=256,
+        lora_target_modules=["all"],
+        max_loras_per_batch=4,
+        all_adapters=[
+            "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            "pbevan11/llama-3.1-8b-ocr-correction",
+        ],
+        op_sequence=[
+            Operation(
+                type=OperationType.LOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        None,
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        None,
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        None,
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"
+                ),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("pbevan11/llama-3.1-8b-ocr-correction"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(None),
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
+]
+TARGET_MODULE_TESTS = [
+    TestCase(
+        description="Test explicitly specified lora-target-modules.",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=3,
+        lora_target_modules=[
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        max_lora_rank=64,
+        all_adapters=[
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",  # target_modules = q, k, v, o, gate, up, down
+            "algoprog/fact-generation-llama-3.1-8b-instruct-lora",  # target_modules = q, k, v, o, gate
+        ],
+        initial_adapters=["algoprog/fact-generation-llama-3.1-8b-instruct-lora"],
+        op_sequence=[
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "algoprog/fact-generation-llama-3.1-8b-instruct-lora"
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"
+                ),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "algoprog/fact-generation-llama-3.1-8b-instruct-lora",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
+    TestCase(
+        description="Test inferred lora-target-modules - start with larger adapter",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=3,
+        max_lora_rank=64,
+        all_adapters=[
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",  # target_modules = q, k, v, o, gate, up, down
+            "algoprog/fact-generation-llama-3.1-8b-instruct-lora",  # target_modules = q, k, v, o, gate
+        ],
+        initial_adapters=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+        op_sequence=[
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "algoprog/fact-generation-llama-3.1-8b-instruct-lora"
+                ),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="algoprog/fact-generation-llama-3.1-8b-instruct-lora",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "algoprog/fact-generation-llama-3.1-8b-instruct-lora",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
+    TestCase(
+        description="Test inferred lora-target-modules - start with smaller adapter",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=3,
+        max_lora_rank=64,
+        all_adapters=[
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",  # target_modules = q, k, v, o, gate, up, down
+            "algoprog/fact-generation-llama-3.1-8b-instruct-lora",  # target_modules = q, k, v, o, gate
+        ],
+        initial_adapters=["algoprog/fact-generation-llama-3.1-8b-instruct-lora"],
+        op_sequence=[
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "algoprog/fact-generation-llama-3.1-8b-instruct-lora"
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"
+                ),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                expected_error="incompatible",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "algoprog/fact-generation-llama-3.1-8b-instruct-lora",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
+]
+MAX_LORA_RANK_TESTS = [
+    TestCase(
+        description="Test explicitly specified max-lora-rank.",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=3,
+        max_lora_rank=32,
+        all_adapters=[
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",  # r = 4
+            "pbevan11/llama-3.1-8b-ocr-correction",  # r = 32
+            "philschmid/code-llama-3-1-8b-text-to-sql-lora",  # r = 256
+        ],
+        initial_adapters=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+        op_sequence=[
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("pbevan11/llama-3.1-8b-ocr-correction"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        None,
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                expected_error="incompatible",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                ),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
+    TestCase(
+        description="test implicitly inferred max-lora-rank",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=3,
+        all_adapters=[
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",  # r = 4
+            "pbevan11/llama-3.1-8b-ocr-correction",  # r = 32
+            "philschmid/code-llama-3-1-8b-text-to-sql-lora",  # r = 256
+        ],
+        initial_adapters=["pbevan11/llama-3.1-8b-ocr-correction"],
+        op_sequence=[
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("pbevan11/llama-3.1-8b-ocr-correction"),
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                expected_error="incompatible",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data("philschmid/code-llama-3-1-8b-text-to-sql-lora"),
+                expected_error="not loaded",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                        "pbevan11/llama-3.1-8b-ocr-correction",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
+]
+MAX_LOADED_LORAS_TESTS = [
+    TestCase(
+        description="Test max_loaded_loras limit",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=2,
+        max_loaded_loras=2,
+        all_adapters=[
+            "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            "pbevan11/llama-3.1-8b-ocr-correction",
+        ],
+        initial_adapters=["philschmid/code-llama-3-1-8b-text-to-sql-lora"],
+        op_sequence=[
+            Operation(
+                type=OperationType.LOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+                expected_error="Maximum number of loaded LoRA adapters",
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data="pbevan11/llama-3.1-8b-ocr-correction",
+            ),
+        ],
+    ),
+]
+EVICTION_TESTS = [
+    TestCase(
+        description="dynamic lora update with evictions",
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        max_loras_per_batch=2,
+        all_adapters=[
+            "lora1=philschmid/code-llama-3-1-8b-text-to-sql-lora",
+            "lora2=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+            "lora3=pbevan11/llama-3.1-8b-ocr-correction",
+        ],
+        enable_lora=True,
+        max_lora_rank=256,
+        lora_target_modules=["all"],
+        op_sequence=[
+            Operation(
+                type=OperationType.LOAD,
+                data={
+                    "lora_name": "lora1",
+                    "lora_path": "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                    "pinned": True,
+                },
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data={
+                    "lora_name": "lora2",
+                    "lora_path": "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                    "pinned": True,
+                },
+                expected_error="starvation",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data={
+                    "lora_name": "lora2",
+                    "lora_path": "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                    "pinned": False,
+                },
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data={
+                    "lora_name": "lora3",
+                    "lora_path": "pbevan11/llama-3.1-8b-ocr-correction",
+                    "pinned": False,
+                },
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="lora1",
+            ),
+            Operation(
+                type=OperationType.UNLOAD,
+                data="lora3",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data={
+                    "lora_name": "lora3",
+                    "lora_path": "pbevan11/llama-3.1-8b-ocr-correction",
+                    "pinned": True,
+                },
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data={
+                    "lora_name": "lora1",
+                    "lora_path": "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                    "pinned": True,
+                },
+                expected_error="starvation",
+            ),
+            Operation(
+                type=OperationType.LOAD,
+                data={
+                    "lora_name": "lora1",
+                    "lora_path": "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                    "pinned": False,
+                },
+            ),
+            # pinned: lora3
+            # unpinned: lora1, lora2
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "lora1",
+                        "lora2",
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "lora1",
+                        "lora3",
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "lora1",
+                        "lora2",
+                    ]
+                ),
+            ),
+            Operation(
+                type=OperationType.FORWARD,
+                data=create_batch_data(
+                    [
+                        "lora1",
+                        "lora2",
+                        None,
+                    ]
+                ),
+            ),
+        ],
+    ),
+]
+
+ALL_TESTS = (
+    BASIC_TESTS
+    + TARGET_MODULE_TESTS
+    + MAX_LORA_RANK_TESTS
+    + MAX_LOADED_LORAS_TESTS
+    + EVICTION_TESTS
+)
+
+
+class LoRAUpdateTestSessionMode(Enum):
+    ENGINE = "engine"
+    SERVER = "server"
+
+
+class LoRAUpdateTestSessionBase:
+    """
+    Base context manager for testing LoRA adapters.
+    """
+
+    def __init__(
+        self,
+        *,
+        testcase: Optional[TestCase],
+        model_path: str,
+        lora_paths: List[Union[str, dict]],
+        max_loras_per_batch: int,
+        max_loaded_loras: Optional[int] = None,
+        max_lora_rank: Optional[int],
+        enable_lora: Optional[bool] = None,
+        lora_target_modules: Optional[List[str]] = None,
+        lora_backend: str = "triton",
+        disable_cuda_graph: bool = False,
+        cuda_graph_max_bs: int = 4,
+    ):
+        self.testcase = testcase
+        self.model_path = model_path
+        self.lora_paths = lora_paths
+        self.max_lora_rank = max_lora_rank
+        self.lora_target_modules = lora_target_modules
+        self.max_loras_per_batch = max_loras_per_batch
+        self.max_loaded_loras = max_loaded_loras
+        self.lora_backend = lora_backend
+        self.disable_cuda_graph = disable_cuda_graph
+        self.cuda_graph_max_bs = cuda_graph_max_bs
+        self.enable_lora = enable_lora
+
+        self.expected_adapters = set()
+        if self.lora_paths:
+            for adapter in self.lora_paths:
+                if isinstance(adapter, dict):
+                    lora_name = adapter["lora_name"]
+                elif "=" in adapter:
+                    lora_name = adapter.split("=")[0]
+                else:
+                    lora_name = adapter
+                self.expected_adapters.add(lora_name)
+
+        self.handle = None  # Will be set in __enter__
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Don't suppress exceptions by default
+        return False
+
+    def load_lora_adapter(
+        self,
+        lora_name: str,
+        lora_path: Optional[str] = None,
+        expected_error: Optional[str] = None,
+    ):
+        """
+        Load a LoRA adapter by name and path.
+        """
+        raise NotImplementedError("Subclasses must implement load_lora_adapter")
+
+    def unload_lora_adapter(self, lora_name: str):
+        """
+        Unload a LoRA adapter by name.
+        """
+        raise NotImplementedError("Subclasses must implement unload_lora_adapter")
+
+    def forward(
+        self,
+        prompts: List[str],
+        lora_paths: List[str],
+        max_new_tokens: int = 32,
+        expected_error: Optional[str] = None,
+    ):
+        """
+        Perform a batch forward pass with the current set of loaded LoRA adapters.
+        """
+        raise NotImplementedError("Subclasses must implement forward")
+
+
+class LoRAUpdateEngineTestSession(LoRAUpdateTestSessionBase):
+    """
+    Context manager for testing LoRA adapters with in-process engine.
+    """
+
+    def __enter__(self):
+        # in-process runner
+        self.handle = SRTRunner(
+            model_path=self.model_path,
+            model_type="generation",
+            lora_paths=self.lora_paths,
+            max_lora_rank=self.max_lora_rank,
+            lora_target_modules=self.lora_target_modules,
+            lora_backend=self.lora_backend,
+            torch_dtype=torch.float16,
+            mem_fraction_static=MEM_FRACTION_STATIC,
+            max_loras_per_batch=self.max_loras_per_batch,
+            max_loaded_loras=self.max_loaded_loras,
+            disable_cuda_graph=self.disable_cuda_graph,
+            cuda_graph_max_bs=self.cuda_graph_max_bs,
+            enable_lora=self.enable_lora,
+            disable_radix_cache=True,
+        )
+        self.handle.__enter__()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.handle is not None:
+            # delegate cleanup to SRTRunner
+            return self.handle.__exit__(exc_type, exc_val, exc_tb)
+        # don't suppress exceptions
+        return False
+
+    def load_lora_adapter(
+        self,
+        lora_name: str,
+        lora_path: Optional[str] = None,
+        expected_error: Optional[str] = None,
+        pinned: bool = False,
+    ):
+        """
+        Load a LoRA adapter by name and path.
+        """
+        if lora_path is None:
+            lora_path = lora_name
+
+        response = self.handle.load_lora_adapter(
+            lora_name=lora_name,
+            lora_path=lora_path,
+            pinned=pinned,
+        )
+        if expected_error:
+            self.testcase.assertFalse(
+                response.success, f"Expected failure for {lora_name}, but got success."
+            )
+            self.testcase.assertIn(
+                expected_error,
+                response.error_message,
+                f"Expected error message to contain '{expected_error}', but got '{response.error_message}'",
+            )
+            print(f"Received error as expected: {response.error_message}")
+        else:
+            self.expected_adapters.add(lora_name)
+            self.testcase.assertTrue(
+                response.success,
+                f"Failed to load LoRA adapter {lora_name}: {response.error_message}",
+            )
+            loaded_adapters = set(response.loaded_adapters)
+            print(f"loaded_adapters: {loaded_adapters}")
+            self.testcase.assertEqual(
+                loaded_adapters,
+                self.expected_adapters,
+                f"Expected loaded adapters to be {self.expected_adapters}, but got {loaded_adapters}",
+            )
+
+    def unload_lora_adapter(self, lora_name: str):
+        """
+        Unload a LoRA adapter by name.
+        """
+        self.expected_adapters.remove(lora_name)
+
+        response = self.handle.unload_lora_adapter(
+            lora_name=lora_name,
+        )
+        self.testcase.assertTrue(
+            response.success,
+            f"Failed to unload LoRA adapter {lora_name}: {response.error_message}",
+        )
+        loaded_adapters = set(response.loaded_adapters)
+
+        print(f"loaded_adapters: {loaded_adapters}")
+        self.testcase.assertEqual(
+            loaded_adapters,
+            self.expected_adapters,
+            f"Expected loaded adapters to be {self.expected_adapters}, but got {loaded_adapters}",
+        )
+
+    def forward(
+        self,
+        prompts: List[str],
+        lora_paths: List[str],
+        max_new_tokens: int = 32,
+        expected_error: Optional[str] = None,
+    ):
+        """
+        Perform a batch forward pass with the current set of loaded LoRA adapters.
+        """
+        try:
+            response = self.handle.batch_forward(
+                prompts=prompts,
+                lora_paths=lora_paths,
+                max_new_tokens=max_new_tokens,
+            )
+        except ValueError as e:
+            if expected_error:
+                error_message = str(e)
+                self.testcase.assertIn(
+                    expected_error,
+                    error_message,
+                    f"Expected error message to contain '{expected_error}', but got '{error_message}'",
+                )
+                print(f"Received error as expected: {error_message}")
+                return error_message
+
+            raise e
+
+        self.testcase.assertEqual(
+            len(response.output_strs),
+            len(prompts),
+            f"Expected {len(prompts)} outputs, but got {len(response.output_strs)}",
+        )
+        output = response.output_strs
+        print(f"output_strs: {output}")
+
+        return output
+
+
+class LoRAUpdateServerTestSession(LoRAUpdateTestSessionBase):
+    """
+    Context manager for testing LoRA adapters with standalone server.
+    """
+
+    def __enter__(self):
+        other_args = [
+            "--cuda-graph-max-bs",
+            str(self.cuda_graph_max_bs),
+            "--max-loras-per-batch",
+            str(self.max_loras_per_batch),
+            "--lora-backend",
+            self.lora_backend,
+            "--random-seed",
+            "42",
+            "--max-running-request",
+            "1",
+            "--mem-fraction-static",
+            str(MEM_FRACTION_STATIC),
+            "--disable-radix-cache",
+        ]
+        if self.enable_lora:
+            other_args.append("--enable-lora")
+        if self.lora_paths:
+            other_args.append("--lora-paths")
+            for lora_path in self.lora_paths:
+                if isinstance(lora_path, dict):
+                    lora_path = json.dumps(lora_path)
+                other_args.append(lora_path)
+        if self.disable_cuda_graph:
+            other_args.append("--disable-cuda-graph")
+        if self.max_lora_rank is not None:
+            other_args.extend(["--max-lora-rank", str(self.max_lora_rank)])
+        if self.lora_target_modules is not None:
+            other_args.extend(["--lora-target-modules"] + self.lora_target_modules)
+        if self.max_loaded_loras is not None:
+            other_args.extend(["--max-loaded-loras", str(self.max_loaded_loras)])
+
+        # launch external server
+        self.handle = popen_launch_server(
+            self.model_path,
+            DEFAULT_URL_FOR_TEST,
+            DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.handle is not None:
+            kill_process_tree(self.handle.pid)
+        # don't suppress exceptions
+        return False
+
+    def load_lora_adapter(
+        self,
+        lora_name: str,
+        lora_path: Optional[str] = None,
+        expected_error: Optional[str] = None,
+        pinned: bool = False,
+    ):
+        """
+        Load a LoRA adapter by name and path.
+        """
+        if lora_path is None:
+            lora_path = lora_name
+
+        response = requests.post(
+            DEFAULT_URL_FOR_TEST + "/load_lora_adapter",
+            json={"lora_name": lora_name, "lora_path": lora_path, "pinned": pinned},
+        )
+        if expected_error:
+            self.testcase.assertEqual(
+                response.status_code,
+                400,
+                f"Expected error for {lora_name}, but got success.",
+            )
+            self.testcase.assertIn(
+                expected_error,
+                response.text,
+                f"Expected error message to contain '{expected_error}', but got '{response.text}'",
+            )
+            print(f"Received error as expected: {response.text}")
+        else:
+            self.expected_adapters.add(lora_name)
+            self.testcase.assertTrue(
+                response.ok, f"Failed to load LoRA adapter {lora_name}: {response.text}"
+            )
+            loaded_adapters = set(response.json()["loaded_adapters"])
+            print(f"loaded_adapters: {loaded_adapters}")
+            self.testcase.assertEqual(
+                loaded_adapters,
+                self.expected_adapters,
+                f"Expected loaded adapters to be {self.expected_adapters}, but got {loaded_adapters}",
+            )
+
+    def unload_lora_adapter(self, lora_name: str):
+        """
+        Unload a LoRA adapter by name.
+        """
+        self.expected_adapters.remove(lora_name)
+
+        response = requests.post(
+            DEFAULT_URL_FOR_TEST + "/unload_lora_adapter",
+            json={"lora_name": lora_name},
+        )
+        self.testcase.assertTrue(
+            response.ok, f"Failed to unload LoRA adapter {lora_name}: {response.text}"
+        )
+        loaded_adapters = set(response.json()["loaded_adapters"])
+
+        print(f"loaded_adapters: {loaded_adapters}")
+        self.testcase.assertEqual(
+            loaded_adapters,
+            self.expected_adapters,
+            f"Expected loaded adapters to be {self.expected_adapters}, but got {loaded_adapters}",
+        )
+
+    def forward(
+        self,
+        prompts: List[str],
+        lora_paths: List[str],
+        max_new_tokens: int = 32,
+        expected_error: Optional[str] = None,
+    ):
+        """
+        Perform a batch forward pass with the current set of loaded LoRA adapters.
+        """
+        response = requests.post(
+            DEFAULT_URL_FOR_TEST + "/generate",
+            json={
+                "text": prompts,
+                "lora_path": lora_paths,
+                "sampling_params": {
+                    "temperature": 0,
+                    "top_k": 1,
+                    "max_new_tokens": max_new_tokens,
+                },
+            },
+        )
+        if expected_error:
+            self.testcase.assertEqual(
+                response.status_code,
+                400,
+                f"Expected error for forward pass, but got success: {response.text}",
+            )
+            self.testcase.assertIn(
+                expected_error,
+                response.text,
+                f"Expected error message to contain '{expected_error}', but got '{response.text}'",
+            )
+            output = response.text
+            print(f"Received error as expected: {response.text}")
+            return output
+        else:
+            self.testcase.assertTrue(
+                response.ok, f"Failed to generate text: {response.text}"
+            )
+            output = [r["text"] for r in response.json()]
+            self.testcase.assertEqual(
+                len(output),
+                len(prompts),
+                f"Expected {len(prompts)} outputs, but got {len(output)}",
+            )
+            print(f"output_strs: {output}")
+            return output
+
+
+# Factory function to create the appropriate LoRA test session based on mode
+def LoRAUpdateTestSession(
+    testcase: Optional[TestCase],
+    mode: LoRAUpdateTestSessionMode,
+    **kwargs: Any,
+):
+    if mode == LoRAUpdateTestSessionMode.ENGINE:
+        return LoRAUpdateEngineTestSession(testcase=testcase, **kwargs)
+    elif mode == LoRAUpdateTestSessionMode.SERVER:
+        return LoRAUpdateServerTestSession(testcase=testcase, **kwargs)
+    else:
+        raise ValueError(f"Unrecognized mode: {mode!r}")
+
+
+class TestLoRADynamicUpdate(CustomTestCase):
+    """
+    This test case verifies that the SRT runner can dynamically load and unload LoRA adapters
+    during a sequence of operations, and that the outputs of forward passes with dynamically loaded
+    adapters match the outputs of forward passes with statically loaded adapters.
+    """
+
+    def _repeat_each(lst, n):
+        return [x for x in lst for _ in range(n)]
+
+    def _run_operation_sequence(
+        self,
+        mode: LoRAUpdateTestSessionMode,
+        base: str,
+        initial_adapters: List[Union[str, dict]],
+        op_sequence: List[Operation],
+        max_loras_per_batch: int,
+        max_loaded_loras: Optional[int] = None,
+        enable_lora: Optional[bool] = None,
+        max_lora_rank: Optional[int] = None,
+        lora_target_modules: Optional[List[str]] = None,
+        max_new_tokens: int = 32,
+    ) -> List[tuple]:
+        """
+        Runs a sequence of operations on the SRT runner, including loading and unloading LoRA adapters,
+        and performing forward passes with the current set of loaded adapters.
+        """
+
+        forward_outputs = []
+        with LoRAUpdateTestSession(
+            testcase=self,
+            mode=mode,
+            model_path=base,
+            lora_paths=initial_adapters,
+            max_loras_per_batch=max_loras_per_batch,
+            max_loaded_loras=max_loaded_loras,
+            max_lora_rank=max_lora_rank,
+            lora_target_modules=lora_target_modules,
+            enable_lora=enable_lora,
+        ) as session:
+            for op in op_sequence:
+                op_type = op.type
+                data = op.data
+                expected_error = op.expected_error
+                print("-" * 100)
+                print(
+                    f"Running operation: {op_type} --- data: {data} --- mode: {mode} ---"
+                )
+                if op_type == OperationType.LOAD:
+                    if isinstance(data, str):
+                        adapter_info = {
+                            "lora_name": data,
+                            "lora_path": data,
+                            "pinned": False,
+                        }
+                    else:
+                        adapter_info = data
+
+                    result = session.load_lora_adapter(
+                        expected_error=expected_error,
+                        **adapter_info,
+                    )
+                elif op_type == OperationType.UNLOAD:
+                    result = session.unload_lora_adapter(
+                        lora_name=data,
+                    )
+                elif op_type == OperationType.FORWARD:
+                    prompts, adapters = zip(*data)
+                    result = session.forward(
+                        prompts=list(prompts),
+                        lora_paths=list(adapters),
+                        max_new_tokens=max_new_tokens,
+                        expected_error=expected_error,
+                    )
+                    if not expected_error:
+                        forward_outputs.append(result)
+
+            return forward_outputs
+
+    def _run_dynamic_adapter_updates(
+        self, mode: LoRAUpdateTestSessionMode, test_cases: Iterable[TestCase]
+    ):
+        for case_idx, test_case in enumerate(test_cases, start=1):
+            print("=" * 100)
+            print(
+                f"Starting test case {case_idx} in {mode.value} mode. Test description: {test_case.description}"
+            )
+            print("=" * 100)
+
+            print(
+                f"--- Running dynamic update pass with {len(test_case.op_sequence)} operations ---"
+            )
+            # Test dynamic loading of adapters
+            dynamic_output = self._run_operation_sequence(
+                mode=mode,
+                initial_adapters=test_case.initial_adapters,
+                enable_lora=test_case.enable_lora,
+                base=test_case.base,
+                max_loras_per_batch=test_case.max_loras_per_batch,
+                max_loaded_loras=test_case.max_loaded_loras,
+                op_sequence=test_case.op_sequence,
+                max_new_tokens=test_case.max_new_tokens,
+                max_lora_rank=test_case.max_lora_rank,
+                lora_target_modules=test_case.lora_target_modules,
+            )
+
+            # static loading
+            forward_ops = [
+                x
+                for x in test_case.op_sequence
+                if x.type == OperationType.FORWARD and x.expected_error is None
+            ]
+
+            if not forward_ops:
+                print(
+                    f"No forward operations found in test case {case_idx}. Skipping static pass."
+                )
+                continue
+
+            print("=" * 100)
+            print(f"\n--- Running static pass with {len(forward_ops)} operations ---")
+            static_output = self._run_operation_sequence(
+                mode=mode,
+                initial_adapters=test_case.all_adapters,
+                enable_lora=test_case.enable_lora,
+                base=test_case.base,
+                max_loras_per_batch=test_case.max_loras_per_batch,
+                op_sequence=forward_ops,
+                max_new_tokens=test_case.max_new_tokens,
+            )
+
+            print(f"Dynamic output: {dynamic_output}")
+            print(f"Static output: {static_output}")
+            print("=" * 100)
+            self.assertEqual(
+                len(dynamic_output),
+                len(static_output),
+                f"Dynamic output length {len(dynamic_output)} does not match static output length {len(static_output)}",
+            )
+            for i, (dynamic, static) in enumerate(
+                zip(dynamic_output, static_output), start=1
+            ):
+                self.assertEqual(
+                    len(dynamic),
+                    len(static),
+                    f"Output length mismatch at batch {i}:\n- Dynamic={len(dynamic)}\n- Static={len(static)}",
+                )
+                for j, (d_out, s_out) in enumerate(zip(dynamic, static), start=1):
+                    d_out = d_out.strip()
+                    s_out = s_out.strip()
+                    self.assertEqual(
+                        d_out,
+                        s_out,
+                        f"Output mismatch at batch {i}, prompt {j}:\n- Dynamic: '{d_out}'\n- Static: '{s_out}'",
+                    )
+
+    def test_dynamic_lora_update_engine(self):
+        """
+        Test dynamic LoRA updates in engine mode.
+        """
+        test_cases = BASIC_TESTS if is_in_ci() else ALL_TESTS
+        self._run_dynamic_adapter_updates(
+            mode=LoRAUpdateTestSessionMode.ENGINE,
+            test_cases=test_cases,
+        )
+
+    def test_dynamic_lora_update_server(self):
+        """
+        Test dynamic LoRA updates in server mode.
+        """
+        test_cases = BASIC_TESTS if is_in_ci() else ALL_TESTS
+        self._run_dynamic_adapter_updates(
+            mode=LoRAUpdateTestSessionMode.SERVER, test_cases=test_cases
+        )
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/test_multi_lora_backend.py b/test/srt/lora/test_multi_lora_backend.py
new file mode 100644
index 000000000..310ce72e4
--- /dev/null
+++ b/test/srt/lora/test_multi_lora_backend.py
@@ -0,0 +1,90 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import os
+import unittest
+from typing import List
+
+from utils import (
+    ALL_OTHER_MULTI_LORA_MODELS,
+    BACKENDS,
+    CI_MULTI_LORA_MODELS,
+    TORCH_DTYPES,
+    LoRAModelCase,
+    run_lora_test_one_by_one,
+)
+
+from sglang.test.test_utils import CustomTestCase, is_in_ci
+
+# All prompts are used at once in a batch.
+PROMPTS = [
+    "AI is a field of computer science focused on",
+    """
+    ### Instruction:
+    Tell me about llamas and alpacas
+    ### Response:
+    Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids.
+    ### Question:
+    What do you know about llamas?
+    ### Answer:
+    """,
+]
+
+
+class TestMultiLoRABackend(CustomTestCase):
+
+    def _run_multi_lora_test_on_model_cases(self, model_cases: List[LoRAModelCase]):
+        for model_case in model_cases:
+            # If skip_long_prompt is True, filter out prompts longer than 1000 characters.
+            batch_prompts = (
+                PROMPTS
+                if not model_case.skip_long_prompt
+                else [p for p in PROMPTS if len(p) < 1000]
+            )
+            for torch_dtype in TORCH_DTYPES:
+                for backend in BACKENDS:
+                    run_lora_test_one_by_one(
+                        batch_prompts,
+                        model_case,
+                        torch_dtype,
+                        max_new_tokens=32,
+                        backend=backend,
+                        test_tag="multi-lora-backend",
+                    )
+
+    def test_ci_lora_models(self):
+        self._run_multi_lora_test_on_model_cases(CI_MULTI_LORA_MODELS)
+
+    def test_all_lora_models(self):
+        if is_in_ci():
+            return
+
+        # Retain ONLY_RUN check here
+        filtered_models = []
+        for model_case in ALL_OTHER_MULTI_LORA_MODELS:
+            if "ONLY_RUN" in os.environ and os.environ["ONLY_RUN"] != model_case.base:
+                continue
+            filtered_models.append(model_case)
+
+        self._run_multi_lora_test_on_model_cases(filtered_models)
+
+
+if __name__ == "__main__":
+    try:
+        mp.set_start_method("spawn")
+    except RuntimeError:
+        pass
+
+    unittest.main(warnings="ignore")
diff --git a/test/srt/lora/utils.py b/test/srt/lora/utils.py
new file mode 100644
index 000000000..94ce8ab60
--- /dev/null
+++ b/test/srt/lora/utils.py
@@ -0,0 +1,397 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import dataclasses
+import random
+from typing import List
+
+import torch
+
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import calculate_rouge_l
+
+
+@dataclasses.dataclass
+class LoRAAdaptor:
+    name: str
+    prefill_tolerance: float = None
+    decode_tolerance: float = None
+    rouge_l_tolerance: float = None
+
+
+@dataclasses.dataclass
+class LoRAModelCase:
+    base: str
+    adaptors: List[LoRAAdaptor]
+    tp_size: int = 1
+    prefill_tolerance: float = 1e-1
+    decode_tolerance: float = 1e-1
+    rouge_l_tolerance: float = 1.0
+    max_loras_per_batch: int = 1
+    skip_long_prompt: bool = False
+
+    def __post_init__(self):
+        if len(self.adaptors) > self.max_loras_per_batch:
+            raise ValueError(
+                f"For base '{self.base}', number of adaptors ({len(self.adaptors)}) "
+                f"must be <= max_loras_per_batch ({self.max_loras_per_batch})"
+            )
+
+
+TORCH_DTYPES = [torch.float16]
+BACKENDS = ["triton"]
+DEFAULT_PROMPTS = [
+    "AI is a field of computer science focused on",
+    """
+    ### Instruction:
+    Tell me about llamas and alpacas
+    ### Response:
+    Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids (camels, dromedaries). Llamas live in the Andean mountains of South America where they graze on grasses and shrubs. Alpaca is another name for domesticated llama. The word "alpaca" comes from an Incan language meaning "golden fleece." Alpacas look very similar to llamas but are smaller than their wild relatives. Both species were used by ancient people as pack animals and for meat. Today both llamas and alpacas are raised primarily for their fiber which can be spun into yarn or knitted into clothing.
+    ### Question 2:
+    What do you know about llamas?
+    ### Answer:
+    """,
+]
+
+CI_LORA_MODELS = [
+    LoRAModelCase(
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        adaptors=[
+            LoRAAdaptor(
+                name="algoprog/fact-generation-llama-3.1-8b-instruct-lora",
+            ),
+        ],
+        max_loras_per_batch=1,
+    ),
+]
+
+ALL_OTHER_LORA_MODELS = [
+    LoRAModelCase(
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        adaptors=[
+            LoRAAdaptor(
+                name="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                prefill_tolerance=1e-1,
+            ),
+        ],
+        max_loras_per_batch=1,
+    ),
+    LoRAModelCase(
+        base="meta-llama/Llama-2-7b-hf",
+        adaptors=[LoRAAdaptor(name="winddude/wizardLM-LlaMA-LoRA-7B")],
+        max_loras_per_batch=2,
+    ),
+]
+
+CI_MULTI_LORA_MODELS = [
+    # multi-rank case
+    LoRAModelCase(
+        base="meta-llama/Llama-2-7b-hf",
+        adaptors=[
+            LoRAAdaptor(
+                name="winddude/wizardLM-LlaMA-LoRA-7B",
+                prefill_tolerance=1e-1,
+            ),
+            LoRAAdaptor(
+                name="RuterNorway/Llama-2-7b-chat-norwegian-LoRa",
+                prefill_tolerance=3e-1,
+            ),
+        ],
+        max_loras_per_batch=2,
+    ),
+]
+
+ALL_OTHER_MULTI_LORA_MODELS = [
+    LoRAModelCase(
+        base="meta-llama/Llama-3.1-8B-Instruct",
+        adaptors=[
+            LoRAAdaptor(
+                name="algoprog/fact-generation-llama-3.1-8b-instruct-lora",
+                prefill_tolerance=1e-1,
+            ),
+            LoRAAdaptor(
+                name="Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                prefill_tolerance=1e-1,
+            ),
+        ],
+        max_loras_per_batch=2,
+    ),
+]
+
+
+def run_lora_test_one_by_one(
+    prompts: List[str],
+    model_case: LoRAModelCase,
+    torch_dtype: torch.dtype,
+    max_new_tokens: int,
+    backend: str,
+    disable_cuda_graph: bool = False,
+    disable_radix_cache: bool = False,
+    mem_fraction_static: float = 0.88,
+    test_tag: str = "",
+):
+    """
+    Input a batch of prompts, and run lora tests one by one with several generate requests
+    (each request will have bs=1).
+    For prompt0, prompt1, ..., promptN,
+    we will use adaptor0, adaptor1, ..., adaptorN included in model case,
+    We will then compare the outputs of HF and SRT with and without LoRA.
+    If number of prompts is larger than number of adaptors,
+    the prompt i will use adaptor i % (number of adaptors).
+
+    Args:
+        prompts (List[str]): The batch of prompts to test.
+        model_case (LoRAModelCase): The model case to test.
+        torch_dtype (torch.dtype): The torch dtype to use.
+        max_new_tokens (int): The maximum number of new tokens to generate.
+        backend (str): The lora backend to use.
+        disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
+        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
+        mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
+        test_tag (str, optional): The tag to use for the test. Defaults to "".
+    """
+    base_path = model_case.base
+
+    # Create used adaptors for each prompt in batch
+    i, adaptors = 0, []
+    for _ in range(len(prompts)):
+        adaptors.append(model_case.adaptors[i])
+        i = (i + 1) % len(model_case.adaptors)
+    adaptor_names = [adaptor.name for adaptor in adaptors]
+
+    print(
+        f"\n========== Testing {test_tag} on base '{model_case.base}' with backend={backend}, dtype={torch_dtype} --- "
+        f"Using prompts {[p[:50] for p in prompts]} with adaptors: {adaptor_names} ---"
+    )
+    with SRTRunner(
+        base_path,
+        torch_dtype=torch_dtype,
+        model_type="generation",
+        tp_size=model_case.tp_size,
+        lora_paths=[
+            adaptor.name for adaptor in model_case.adaptors if adaptor.name is not None
+        ],
+        max_loras_per_batch=model_case.max_loras_per_batch,
+        lora_backend=backend,
+        disable_cuda_graph=disable_cuda_graph,
+        disable_radix_cache=disable_radix_cache,
+        mem_fraction_static=mem_fraction_static,
+    ) as srt_runner:
+        srt_outputs = srt_runner.forward(
+            prompts, max_new_tokens=max_new_tokens, lora_paths=adaptor_names
+        )
+
+    with SRTRunner(
+        base_path,
+        torch_dtype=torch_dtype,
+        model_type="generation",
+        tp_size=model_case.tp_size,
+        mem_fraction_static=mem_fraction_static,
+    ) as srt_runner:
+        srt_no_lora_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)
+
+    with HFRunner(
+        base_path, torch_dtype=torch_dtype, model_type="generation"
+    ) as hf_runner:
+        hf_outputs = hf_runner.forward(
+            prompts, max_new_tokens=max_new_tokens, lora_paths=adaptor_names
+        )
+        hf_no_lora_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)
+
+    # Compare prefill stage logprobs (HF vs SRTRunner with LoRA)
+    for i in range(len(prompts)):
+        adaptor = adaptors[i]
+        # Use individual adaptor tolerances if set, otherwise use model defaults
+        prefill_tol = (
+            adaptor.prefill_tolerance
+            if adaptor.prefill_tolerance is not None
+            else model_case.prefill_tolerance
+        )
+        decode_tol = (
+            adaptor.decode_tolerance
+            if adaptor.decode_tolerance is not None
+            else model_case.decode_tolerance
+        )
+        rouge_tol = (
+            adaptor.rouge_l_tolerance
+            if adaptor.rouge_l_tolerance is not None
+            else model_case.rouge_l_tolerance
+        )
+        # Compare prefill stage logprobs (HF vs SRTRunner with LoRA)
+        hf_prefill = torch.tensor(hf_outputs.top_input_logprobs[i])
+        srt_prefill = torch.tensor(srt_outputs.top_input_logprobs[i])
+        max_prefill_diff = torch.max(torch.abs(hf_prefill - srt_prefill))
+        print("Max prefill diff (HF vs SRT):", max_prefill_diff)
+
+        # Compare decode stage logprobs
+        hf_decode = torch.tensor(hf_outputs.top_output_logprobs[i])
+        srt_decode = torch.tensor(srt_outputs.top_output_logprobs[i])
+        max_decode_diff = torch.max(torch.abs(hf_decode - srt_decode))
+        print("Max decode diff (HF vs SRT):", max_decode_diff)
+
+        srt_output_str = srt_outputs.output_strs[i].strip()
+        hf_output_str = hf_outputs.output_strs[i].strip()
+        rouge_score = calculate_rouge_l([srt_output_str], [hf_output_str])[0]
+        print("ROUGE-L score:", rouge_score)
+        print("SRT output:", srt_output_str)
+        print("HF output:", hf_output_str)
+
+        # Additional: compare prefill outputs between base model (no LoRA) and LoRA model for reference
+        hf_no_lora_prefill = torch.tensor(hf_no_lora_outputs.top_input_logprobs[i])
+        srt_no_lora_prefill = torch.tensor(srt_no_lora_outputs.top_input_logprobs[i])
+        print(
+            "Max diff (SRT base vs SRT LoRA prefill):",
+            torch.max(torch.abs(srt_no_lora_prefill - srt_prefill)),
+        )
+        print(
+            "Max diff (HF base vs HF LoRA prefill):",
+            torch.max(torch.abs(hf_no_lora_prefill - hf_prefill)),
+        )
+
+        if hf_prefill.shape[0] <= 100:
+            assert torch.all(torch.abs(hf_prefill - srt_prefill) < prefill_tol), (
+                f"Prefill logprobs mismatch for base '{base_path}', adaptor '{adaptor_names}', "
+                f"backend '{backend}', prompt: '{prompts[0][:50]}...'"
+            )
+
+        if hf_decode.shape[0] <= 100:
+            assert torch.all(torch.abs(hf_decode - srt_decode) < decode_tol), (
+                f"Decode logprobs mismatch for base '{base_path}', adaptor '{adaptor_names}', "
+                f"backend '{backend}', prompt: '{prompts[0][:50]}...'"
+            )
+
+        if rouge_score < rouge_tol:
+            raise AssertionError(
+                f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
+                f"for base '{base_path}', adaptor '{adaptor_names}', backend '{backend}', prompt: '{prompts[0][:50]}...'"
+            )
+
+
+def run_lora_test_by_batch(
+    prompts: List[str],
+    model_case: LoRAModelCase,
+    torch_dtype: torch.dtype,
+    max_new_tokens: int,
+    backend: str,
+    disable_cuda_graph: bool = False,
+    disable_radix_cache: bool = False,
+    mem_fraction_static: float = 0.88,
+    test_tag: str = "",
+):
+    """
+    Run lora tests as a batch.
+    For prompt0, prompt1, ..., promptN,
+    we will use adaptor0, adaptor1, ..., adaptorN included in model case,
+    We will then compare the outputs of HF and SRT with LoRA.
+    If number of prompts is larger than number of adaptors,
+    the prompt i will use adaptor i % (number of adaptors).
+
+    Args:
+        prompts (List[str]): The batch of prompts to test.
+        model_case (LoRAModelCase): The model case to test.
+        torch_dtype (torch.dtype): The torch dtype to use.
+        max_new_tokens (int): The maximum number of new tokens to generate.
+        backend (str): The lora backend to use.
+        disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
+        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
+        mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
+        test_tag (str, optional): The tag to use for the test. Defaults to "".
+    """
+    base_path = model_case.base
+
+    # Create used adaptors for each prompt in batch
+    i, adaptors = 0, []
+    for _ in range(len(prompts)):
+        adaptors.append(model_case.adaptors[i])
+        i = (i + 1) % len(model_case.adaptors)
+    adaptor_names = [adaptor.name for adaptor in adaptors]
+
+    print(
+        f"\n========== Testing {test_tag} on base '{model_case.base}' with backend={backend}, dtype={torch_dtype} --- "
+        f"Using prompts {[p[:50] for p in prompts]} with adaptors: {adaptor_names} ---"
+    )
+    with SRTRunner(
+        base_path,
+        torch_dtype=torch_dtype,
+        model_type="generation",
+        tp_size=model_case.tp_size,
+        lora_paths=[
+            adaptor.name for adaptor in model_case.adaptors if adaptor.name is not None
+        ],
+        max_loras_per_batch=model_case.max_loras_per_batch,
+        lora_backend=backend,
+        disable_cuda_graph=disable_cuda_graph,
+        disable_radix_cache=disable_radix_cache,
+        mem_fraction_static=mem_fraction_static,
+    ) as srt_runner:
+        srt_outputs = srt_runner.batch_forward(
+            prompts, max_new_tokens=max_new_tokens, lora_paths=adaptor_names
+        )
+
+    with SRTRunner(
+        base_path,
+        torch_dtype=torch_dtype,
+        model_type="generation",
+        tp_size=model_case.tp_size,
+        mem_fraction_static=mem_fraction_static,
+    ) as srt_runner:
+        srt_no_lora_outputs = srt_runner.batch_forward(
+            prompts, max_new_tokens=max_new_tokens
+        )
+
+    with HFRunner(
+        base_path, torch_dtype=torch_dtype, model_type="generation"
+    ) as hf_runner:
+        hf_outputs = hf_runner.forward(
+            prompts, max_new_tokens=max_new_tokens, lora_paths=adaptor_names
+        )
+
+    with HFRunner(
+        base_path, torch_dtype=torch_dtype, model_type="generation"
+    ) as hf_runner:
+        hf_no_lora_outputs = hf_runner.forward(
+            prompts,
+            max_new_tokens=max_new_tokens,
+        )
+
+    for i in range(len(prompts)):
+
+        srt_output_str = srt_outputs.output_strs[i].strip()
+        hf_output_str = hf_outputs.output_strs[i].strip()
+        rouge_score = calculate_rouge_l([srt_output_str], [hf_output_str])[0]
+        print("ROUGE-L score:", rouge_score)
+        print("SRT output:", srt_output_str)
+        print("HF output:", hf_output_str)
+        print("SRT no lora output:", srt_no_lora_outputs.output_strs[i].strip())
+        print("HF no lora output:", hf_no_lora_outputs.output_strs[i].strip())
+        assert srt_outputs.output_strs[i].strip(" ") == hf_outputs.output_strs[i].strip(
+            " "
+        ), (
+            srt_outputs.output_strs[i].strip(" "),
+            hf_outputs.output_strs[i].strip(" "),
+        )
+        assert srt_no_lora_outputs.output_strs[i].strip(
+            " "
+        ) == hf_no_lora_outputs.output_strs[i].strip(" "), (
+            srt_no_lora_outputs.output_strs[i].strip(" "),
+            hf_no_lora_outputs.output_strs[i].strip(" "),
+        )
+
+
+def ensure_reproducibility():
+    seed = 42
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.use_deterministic_algorithms(True)
diff --git a/test/srt/models/compare.py b/test/srt/models/compare.py
new file mode 100644
index 000000000..2fe35357c
--- /dev/null
+++ b/test/srt/models/compare.py
@@ -0,0 +1,52 @@
+"""
+used for debug using tensor comparison
+dump {name: tensor} into "log_hf.jsonl" and "log_srt.jsonl"
+use the same name for two tensors that supposed to be close
+recommend name like: "layer 2 after mlp"
+"""
+
+import json
+import sys
+
+import torch
+
+if len(sys.argv) > 1:
+    assert sys.argv[1] == "base"
+    hf_log = "base_log_hf.jsonl"
+    srt_log = "base_log_srt.jsonl"
+else:
+    hf_log = "log_hf.jsonl"
+    srt_log = "log_srt.jsonl"
+
+
+def load_data(filepath):
+    tensors = {}
+    with open(filepath, "r") as f:
+        lines = f.readlines()
+        for line in lines:
+            data = json.loads(line)
+            for k, v in data.items():
+                tensors[k] = torch.tensor(v)
+    return tensors
+
+
+hf_tensors = load_data(hf_log)
+srt_tensors = load_data(srt_log)
+
+
+def get_diff(t1, t2):
+    t1 = t1.reshape(t2.shape)
+    max_diff = torch.max(abs(t1.reshape(t2.shape) - t2))
+    l2_dis = torch.dist(t1, t2, p=2)
+    return l2_dis, max_diff
+
+
+for k, _ in srt_tensors.items():
+    l2_dis, max_diff = get_diff(hf_tensors[k], srt_tensors[k])
+    print(f"{k} {l2_dis=} {max_diff=}")
+    if k == "layer 1 attn":
+        print(hf_tensors[k])
+        print(srt_tensors[k])
+    if k == "layer 0 prefill k":
+        print(srt_tensors[k].shape)
+        print(hf_tensors[k].shape)
diff --git a/test/srt/models/test_clip_models.py b/test/srt/models/test_clip_models.py
new file mode 100644
index 000000000..8a79656d0
--- /dev/null
+++ b/test/srt/models/test_clip_models.py
@@ -0,0 +1,80 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import unittest
+
+import torch
+from transformers import AutoProcessor
+
+from sglang.srt.utils import load_image
+from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
+from sglang.test.test_utils import get_similarities
+
+TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
+IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
+MODELS = [
+    ("openai/clip-vit-large-patch14-336", 1e-5),
+]
+TORCH_DTYPES = [torch.float16]
+
+
+class TestClipModels(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
+
+        with HFRunner(
+            model,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+        ) as hf_runner:
+            hf_text_embeds = hf_runner.forward(prompts=TEXTS)
+            hf_image_embeds = hf_runner.forward(image_data=IMAGES)
+
+        with SRTRunner(
+            model,
+            tp_size=1,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+        ) as srt_runner:
+            text_embeds = srt_runner.forward(prompts=TEXTS)
+            image_embeds = srt_runner.forward(prompts="padding", image_data=IMAGES)
+
+        text_similarity = get_similarities(
+            text_embeds.embed_logits[0], hf_text_embeds.embed_logits[0]
+        )
+        image_similarity = get_similarities(
+            image_embeds.embed_logits[0], hf_image_embeds.embed_logits[0]
+        )
+        print("text similarity diff", abs(text_similarity - 1))
+        print("image similarity diff", abs(image_similarity - 1))
+        assert torch.all(
+            abs(text_similarity - 1) < prefill_tolerance
+        ), "embeddings are not all close"
+        assert torch.all(
+            abs(image_similarity - 1) < prefill_tolerance
+        ), "embeddings are not all close"
+
+    def test_accuracy(self):
+        for model, prefill_tolerance in MODELS:
+            for torch_dtype in TORCH_DTYPES:
+                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_compressed_tensors_models.py b/test/srt/models/test_compressed_tensors_models.py
new file mode 100644
index 000000000..34f699de4
--- /dev/null
+++ b/test/srt/models/test_compressed_tensors_models.py
@@ -0,0 +1,46 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestCompressedTensorsLlama3FP8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "RedHatAI/Meta-Llama-3.1-8B-FP8"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreaterEqual(metrics["accuracy"], 0.45)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_cross_encoder_models.py b/test/srt/models/test_cross_encoder_models.py
new file mode 100644
index 000000000..93edc3fa1
--- /dev/null
+++ b/test/srt/models/test_cross_encoder_models.py
@@ -0,0 +1,91 @@
+import multiprocessing as mp
+import random
+import unittest
+
+import torch
+from transformers import AutoConfig, AutoTokenizer
+
+from sglang.test.runners import TEST_RERANK_QUERY_DOCS, HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase, is_in_ci
+
+MODELS = [
+    ("cross-encoder/ms-marco-MiniLM-L6-v2", 1, 1e-2),
+    ("BAAI/bge-reranker-v2-m3", 1, 1e-2),
+]
+ATTENTION_BACKEND = ["torch_native", "triton"]
+
+TORCH_DTYPES = [torch.float32]
+
+
+class TestCrossEncoderModels(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_prefill_logits(
+        self,
+        prompts,
+        model_path,
+        tp_size,
+        torch_dtype,
+        score_tolerance,
+        attention_backend,
+    ) -> None:
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="cross_encoder",
+        ) as hf_runner:
+            hf_scores = hf_runner.forward(prompts).scores
+
+        with SRTRunner(
+            model_path,
+            tp_size=tp_size,
+            torch_dtype=torch_dtype,
+            model_type="cross_encoder",
+            attention_backend=attention_backend,
+            chunked_prefill_size=-1,
+            disable_radix_cache=True,
+        ) as srt_runner:
+            srt_scores = srt_runner.forward(prompts).scores
+
+        for i in range(len(srt_scores)):
+            score_difference = abs(hf_scores[i] - srt_scores[i])
+
+            assert (
+                score_difference < score_tolerance
+            ), "cross encoder scores are not all close"
+
+    def preprocess_prompts(self, prompt):
+        processed_prompts = []
+        query = prompt["query"]
+        documents = prompt["documents"]
+        for document in documents:
+            processed_prompts.append([query, document])
+
+        return processed_prompts
+
+    def test_prefill_logits(self):
+        models_to_test = MODELS
+
+        if is_in_ci():
+            models_to_test = [random.choice(MODELS)]
+
+        for model, tp_size, prefill_tolerance in models_to_test:
+            for attention_backend in ATTENTION_BACKEND:
+                for queryDocs in TEST_RERANK_QUERY_DOCS:
+                    prompts = self.preprocess_prompts(queryDocs)
+                    for torch_dtype in TORCH_DTYPES:
+                        self.assert_close_prefill_logits(
+                            prompts,
+                            model,
+                            tp_size,
+                            torch_dtype,
+                            prefill_tolerance,
+                            attention_backend,
+                        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_dummy_grok_models.py b/test/srt/models/test_dummy_grok_models.py
new file mode 100644
index 000000000..bebf9949c
--- /dev/null
+++ b/test/srt/models/test_dummy_grok_models.py
@@ -0,0 +1,34 @@
+import unittest
+
+from sglang.test.test_utils import CustomTestCase, is_in_ci, run_bench_one_batch
+
+
+class TestDummyGrok1(CustomTestCase):
+
+    def test_dummy_grok_1(self):
+        _, output_throughput, _ = run_bench_one_batch(
+            None,
+            [
+                "--model",
+                "/dummy-grok",
+                "--tokenizer-path",
+                "Xenova/grok-1-tokenizer",
+                "--batch-size",
+                "2",
+                "--tp",
+                "2",
+                "--quantization",
+                "fp8",
+                "--load-format",
+                "dummy",
+                "--json-model-override-args",
+                '{"num_hidden_layers": 2}',
+            ],
+        )
+
+        if is_in_ci():
+            self.assertGreater(output_throughput, 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_embedding_models.py b/test/srt/models/test_embedding_models.py
new file mode 100644
index 000000000..b56e952d7
--- /dev/null
+++ b/test/srt/models/test_embedding_models.py
@@ -0,0 +1,111 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import random
+import unittest
+
+import torch
+from transformers import AutoConfig, AutoTokenizer
+
+from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
+
+MODELS = [
+    ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, 1e-5),
+    ("intfloat/e5-mistral-7b-instruct", 1, 1e-5),
+    ("marco/mcdse-2b-v1", 1, 1e-5),
+    ("Qwen/Qwen3-Embedding-8B", 1, 1e-5),
+    # Temporarily disable before this model is fixed
+    # ("jason9693/Qwen2.5-1.5B-apeach", 1, 1e-5),
+]
+TORCH_DTYPES = [torch.float16]
+
+
+class TestEmbeddingModels(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def _truncate_prompts(self, prompts, model_path):
+        config = AutoConfig.from_pretrained(model_path)
+        max_length = getattr(config, "max_position_embeddings", 2048)
+
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        truncated_prompts = []
+        for prompt in prompts:
+            tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
+            if len(tokens.input_ids[0]) > max_length:
+                truncated_text = tokenizer.decode(
+                    tokens.input_ids[0][: max_length - 1], skip_special_tokens=True
+                )
+                truncated_prompts.append(truncated_text)
+            else:
+                truncated_prompts.append(prompt)
+        return truncated_prompts
+
+    def assert_close_prefill_logits(
+        self,
+        prompts,
+        model_path,
+        tp_size,
+        torch_dtype,
+        prefill_tolerance,
+    ) -> None:
+        truncated_prompts = self._truncate_prompts(prompts, model_path)
+
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+        ) as hf_runner:
+            hf_outputs = hf_runner.forward(truncated_prompts)
+
+        with SRTRunner(
+            model_path,
+            tp_size=tp_size,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+        ) as srt_runner:
+            srt_outputs = srt_runner.forward(truncated_prompts)
+
+        for i in range(len(prompts)):
+            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
+            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])
+
+            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
+            print("similarity diff", abs(similarity - 1))
+
+            if len(prompts[i]) <= 1000:
+                assert torch.all(
+                    abs(similarity - 1) < prefill_tolerance
+                ), "embeddings are not all close"
+
+    def test_prefill_logits(self):
+        models_to_test = MODELS
+
+        if is_in_ci():
+            models_to_test = [random.choice(MODELS)]
+
+        for model, tp_size, prefill_tolerance in models_to_test:
+            for torch_dtype in TORCH_DTYPES:
+                self.assert_close_prefill_logits(
+                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_encoder_embedding_models.py b/test/srt/models/test_encoder_embedding_models.py
new file mode 100644
index 000000000..dafaa72db
--- /dev/null
+++ b/test/srt/models/test_encoder_embedding_models.py
@@ -0,0 +1,162 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# python -m unittest test_encoder_embedding_models.TestEncoderEmbeddingModels.test_prefill_logits
+
+import multiprocessing as mp
+import random
+import time
+import unittest
+
+import torch
+from transformers import AutoConfig, AutoTokenizer
+
+from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
+
+MODELS = [("BAAI/bge-small-en", 1, 1e-5), ("BAAI/bge-m3", 1, 1e-5)]
+
+ATTENTION_BACKEND = ["torch_native", "triton", "flashinfer"]
+BATCH_SIZE = [1, 2]
+TORCH_DTYPES = [torch.float32, torch.float16]
+sgl_to_st_ratio = []
+
+
+class TestEncoderEmbeddingModels(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def _truncate_prompts(self, prompts, model_path):
+        config = AutoConfig.from_pretrained(model_path)
+        max_length = getattr(config, "max_position_embeddings", 512) - 20
+
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        truncated_prompts = []
+        for prompt in prompts:
+            tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
+            if len(tokens.input_ids[0]) > max_length:
+                truncated_text = tokenizer.decode(
+                    tokens.input_ids[0][: max_length - 1], skip_special_tokens=True
+                )
+                truncated_prompts.append(truncated_text)
+            else:
+                truncated_prompts.append(prompt)
+
+        return truncated_prompts
+
+    def assert_close_prefill_logits(
+        self,
+        prompts,
+        model_path,
+        tp_size,
+        torch_dtype,
+        prefill_tolerance,
+        attention_backend,
+        batch_size,
+    ) -> None:
+        truncated_prompts = self._truncate_prompts(prompts, model_path)
+        truncated_prompts = truncated_prompts * batch_size
+
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+        ) as hf_runner:
+            # warm up
+            hf_outputs = hf_runner.forward(truncated_prompts)
+
+            st_start_time = time.perf_counter()
+            hf_outputs = hf_runner.forward(truncated_prompts)
+            st_end_time = time.perf_counter()
+
+        with SRTRunner(
+            model_path,
+            tp_size=tp_size,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+            attention_backend=attention_backend,
+            chunked_prefill_size=-1,
+            disable_radix_cache=True,
+        ) as srt_runner:
+            # warm up
+            srt_outputs = srt_runner.forward(truncated_prompts)
+
+            sgl_start_time = time.perf_counter()
+            srt_outputs = srt_runner.forward(truncated_prompts)
+            sgl_end_time = time.perf_counter()
+
+        transformer_time = st_end_time - st_start_time
+        sgl_time = sgl_end_time - sgl_start_time
+        sgl_to_st_ratio.append(sgl_time / transformer_time)
+
+        for i in range(len(truncated_prompts)):
+            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
+            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])
+
+            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
+            # If something is wrong, uncomment this to observe similarity.
+            # print("similarity diff", abs(similarity - 1))
+
+            if len(truncated_prompts[i]) <= 1000:
+                assert torch.all(
+                    abs(similarity - 1) < prefill_tolerance
+                ), "embeddings are not all close"
+
+    def test_prefill_logits(self):
+        models_to_test = MODELS
+
+        if is_in_ci():
+            models_to_test = [random.choice(MODELS)]
+
+        for model, tp_size, prefill_tolerance in models_to_test:
+            for attention_backend in ATTENTION_BACKEND:
+                for batch_size in BATCH_SIZE:
+                    for torch_dtype in TORCH_DTYPES:
+                        # NOTE: FlashInfer currently has limitations with head_dim = 32 or
+                        # other dimensions.
+                        # The FlashInfer head_dim limitation itself is tracked here:
+                        # https://github.com/flashinfer-ai/flashinfer/issues/1048
+                        #
+                        # Flashinfer does not support torch.float32 for dtype_q, so skip it
+                        if attention_backend == "flashinfer":
+                            if (
+                                model == "BAAI/bge-small-en"
+                                or torch_dtype == torch.float32
+                            ):
+                                continue
+
+                        self.assert_close_prefill_logits(
+                            DEFAULT_PROMPTS,
+                            model,
+                            tp_size,
+                            torch_dtype,
+                            prefill_tolerance,
+                            attention_backend,
+                            batch_size,
+                        )
+
+        for i in range(len(BATCH_SIZE)):
+            print(
+                "bacth size: ",
+                BATCH_SIZE[i] * 5,
+                "sgl_time/st_time",
+                round(sgl_to_st_ratio[i], 3),
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_generation_models.py b/test/srt/models/test_generation_models.py
new file mode 100644
index 000000000..a9d8fe0df
--- /dev/null
+++ b/test/srt/models/test_generation_models.py
@@ -0,0 +1,181 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Usage:
+
+To test a specific model locally:
+1. Add it to ALL_MODELS, for example, `ModelCase("Qwen/Qwen2-1.5B")`
+2. Run `ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels`
+"""
+
+import dataclasses
+import multiprocessing as mp
+import os
+import random
+import unittest
+from typing import List
+
+import torch
+
+from sglang.test.runners import (
+    DEFAULT_PROMPTS,
+    HFRunner,
+    SRTRunner,
+    check_close_model_outputs,
+)
+from sglang.test.test_utils import CustomTestCase, is_in_ci
+
+
+@dataclasses.dataclass
+class ModelCase:
+    model_path: str
+    tp_size: int = 1
+    prefill_tolerance: float = 5e-2
+    decode_tolerance: float = 6e-2  # Increased to fix numerical error in issue #8614.
+    rouge_l_tolerance: float = 1
+    skip_long_prompt: bool = False
+    trust_remote_code: bool = False
+
+
+# Popular models that run on the CI
+CI_MODELS = [
+    ModelCase("meta-llama/Llama-3.1-8B-Instruct"),
+    ModelCase("google/gemma-2-2b"),
+]
+
+# the complete set of models to test sglang's generation model
+ALL_MODELS = [
+    *CI_MODELS,
+    ModelCase("Qwen/Qwen2-1.5B"),
+    ModelCase("Qwen/Qwen2.5-14B-Instruct"),
+    ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True),
+    ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True),
+    ModelCase(
+        "THUDM/glm-4-9b-chat", tp_size=2, trust_remote_code=True, skip_long_prompt=True
+    ),
+    ModelCase("openai-community/gpt2"),
+    ModelCase("microsoft/phi-1_5", trust_remote_code=True),
+    ModelCase("adept/persimmon-8b-chat"),
+    ModelCase("inclusionAI/Ling-lite", trust_remote_code=True),
+    ModelCase("microsoft/Phi-3-small-8k-instruct", trust_remote_code=True),
+    ModelCase("allenai/OLMo-2-1124-7B-Instruct", skip_long_prompt=True),
+    ModelCase("ibm-granite/granite-3.0-2b-instruct", skip_long_prompt=True),
+    ModelCase(
+        "microsoft/Phi-3.5-MoE-instruct",
+        tp_size=2,
+        trust_remote_code=True,
+        skip_long_prompt=True,
+    ),
+    ModelCase("facebook/opt-125m", skip_long_prompt=True),
+    ModelCase(
+        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
+        tp_size=2,
+        trust_remote_code=True,
+        skip_long_prompt=True,
+    ),
+    ModelCase(
+        "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
+        tp_size=8,
+        trust_remote_code=True,
+        skip_long_prompt=True,
+    ),
+]
+
+TORCH_DTYPES = [torch.float16]
+
+
+class TestGenerationModels(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_logits_and_output_strs(
+        self,
+        prompts: List[str],
+        model_case: ModelCase,
+        torch_dtype: torch.dtype,
+    ) -> None:
+        model_path = model_case.model_path
+        prefill_tolerance, decode_tolerance, rouge_l_tolerance = (
+            model_case.prefill_tolerance,
+            model_case.decode_tolerance,
+            model_case.rouge_l_tolerance,
+        )
+        max_new_tokens = 32
+
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="generation",
+            trust_remote_code=model_case.trust_remote_code,
+        ) as hf_runner:
+            hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)
+
+        with SRTRunner(
+            model_path,
+            tp_size=model_case.tp_size,
+            torch_dtype=torch_dtype,
+            model_type="generation",
+            trust_remote_code=model_case.trust_remote_code,
+        ) as srt_runner:
+            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)
+
+        check_close_model_outputs(
+            hf_outputs=hf_outputs,
+            srt_outputs=srt_outputs,
+            prefill_tolerance=model_case.prefill_tolerance,
+            decode_tolerance=model_case.decode_tolerance,
+            rouge_l_tolerance=model_case.rouge_l_tolerance,
+            debug_text=f"model_path={model_path} prompts={prompts}",
+        )
+
+    @unittest.skipIf(not is_in_ci(), "Local test should run all models")
+    def test_ci_models(self):
+        for model_case in CI_MODELS:
+            for torch_dtype in TORCH_DTYPES:
+                prompts = DEFAULT_PROMPTS
+
+                # Skip long prompts for models that do not have a long context
+                if model_case.skip_long_prompt:
+                    prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
+
+                # Assert the logits and output strs are close
+                self.assert_close_logits_and_output_strs(
+                    prompts, model_case, torch_dtype
+                )
+
+    @unittest.skipIf(is_in_ci(), "CI only runs selected models for simplicity")
+    def test_all_models(self):
+        for model_case in ALL_MODELS:
+            for torch_dtype in TORCH_DTYPES:
+                if (
+                    "ONLY_RUN" in os.environ
+                    and os.environ["ONLY_RUN"] != model_case.model_path
+                ):
+                    continue
+
+                # Skip long prompts for models that do not have a long context
+                prompts = DEFAULT_PROMPTS
+                if model_case.skip_long_prompt:
+                    prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
+
+                # Assert the logits and output strs are close
+                self.assert_close_logits_and_output_strs(
+                    prompts, model_case, torch_dtype
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_gme_qwen_models.py b/test/srt/models/test_gme_qwen_models.py
new file mode 100644
index 000000000..265bf8a39
--- /dev/null
+++ b/test/srt/models/test_gme_qwen_models.py
@@ -0,0 +1,85 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+import multiprocessing as mp
+import unittest
+
+import torch
+
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase, get_similarities
+
+TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
+IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
+
+
+MODELS = [
+    ("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", 1e-3),
+]
+TORCH_DTYPES = [torch.float16]
+
+
+class TestQmeQwenModels(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
+
+        prompts_no_image = f"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{TEXTS}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
+        prompts_with_image = f"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
+        with HFRunner(
+            model,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+        ) as hf_runner:
+            hf_text_embeddings = hf_runner.forward(prompts=[prompts_no_image])
+            hf_image_embeddings = hf_runner.forward(
+                prompts=[prompts_with_image], image_data=[IMAGES]
+            )
+        with SRTRunner(
+            model,
+            tp_size=1,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+        ) as srt_runner:
+            srt_text_embeddings = srt_runner.forward(prompts=prompts_no_image)
+            srt_image_embeddings = srt_runner.forward(
+                prompts=prompts_with_image, image_data=IMAGES
+            )
+
+        similarity = get_similarities(
+            hf_text_embeddings.embed_logits[0], srt_text_embeddings.embed_logits[0]
+        )
+        print("texts similarity diff", abs(similarity - 1))
+        assert torch.all(
+            abs(similarity - 1) < prefill_tolerance
+        ), "embeddings are not all close"
+        similarity = get_similarities(
+            hf_image_embeddings.embed_logits[0], srt_image_embeddings.embed_logits[0]
+        )
+        print("images similarity diff", abs(similarity - 1))
+        assert torch.all(
+            abs(similarity - 1) < prefill_tolerance
+        ), "embeddings are not all close"
+
+    def test_accuracy(self):
+        for model, prefill_tolerance in MODELS:
+            for torch_dtype in TORCH_DTYPES:
+                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_grok_models.py b/test/srt/models/test_grok_models.py
new file mode 100644
index 000000000..625fa1a65
--- /dev/null
+++ b/test/srt/models/test_grok_models.py
@@ -0,0 +1,53 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestGrok(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmzheng/grok-1"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--load-format",
+                "dummy",
+                "--json-model-override-args",
+                '{"num_hidden_layers": 2}',
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=64,
+            max_new_tokens=256,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+
+        # It is dummy weights so we only assert the output throughput instead of accuracy.
+        self.assertGreater(metrics["output_throughput"], 1000)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_llama4_models.py b/test/srt/models/test_llama4_models.py
new file mode 100644
index 000000000..3835ca8d7
--- /dev/null
+++ b/test/srt/models/test_llama4_models.py
@@ -0,0 +1,74 @@
+import random
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+MODELS = [
+    SimpleNamespace(
+        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        accuracy=0.9,
+        tp_size=4,
+    ),
+]
+
+
+class TestLlama4(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+    def test_gsm8k(self):
+
+        for model in MODELS:
+            try:
+                process = popen_launch_server(
+                    model.model,
+                    self.base_url,
+                    timeout=3 * DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        "--chat-template",
+                        "llama-4",
+                        "--tp-size",
+                        str(model.tp_size),
+                        "--mem-fraction-static",
+                        "0.8",
+                        "--context-length",
+                        "8192",
+                    ],
+                )
+                args = SimpleNamespace(
+                    num_shots=5,
+                    data_path=None,
+                    num_questions=200,
+                    max_new_tokens=512,
+                    parallel=128,
+                    host="http://127.0.0.1",
+                    port=int(self.base_url.split(":")[-1]),
+                )
+                metrics = run_eval(args)
+                print(f"{metrics=}")
+                self.assertGreaterEqual(metrics["accuracy"], model.accuracy)
+            except Exception as e:
+                print(f"Error testing {model.model}: {e}")
+                self.fail(f"Test failed for {model.model}: {e}")
+
+            finally:
+                # Ensure process cleanup happens regardless of success/failure
+                if process is not None and process.poll() is None:
+                    print(f"Cleaning up process {process.pid}")
+                    try:
+                        kill_process_tree(process.pid)
+                    except Exception as e:
+                        print(f"Error killing process: {e}")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_mtp_models.py b/test/srt/models/test_mtp_models.py
new file mode 100644
index 000000000..49b53c1e4
--- /dev/null
+++ b/test/srt/models/test_mtp_models.py
@@ -0,0 +1,58 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestMiMoMTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "XiaomiMiMo/MiMo-7B-RL"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-num-steps",
+                "1",
+                "--speculative-eagle-topk",
+                "1",
+                "--speculative-num-draft-tokens",
+                "2",
+                "--mem-fraction-static",
+                "0.5",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.7)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_qwen_models.py b/test/srt/models/test_qwen_models.py
new file mode 100644
index 000000000..567e19f08
--- /dev/null
+++ b/test/srt/models/test_qwen_models.py
@@ -0,0 +1,77 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestQwen2(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.78)
+
+
+class TestQwen2FP8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "neuralmagic/Qwen2-7B-Instruct-FP8"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.78)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_reward_models.py b/test/srt/models/test_reward_models.py
new file mode 100644
index 000000000..5592ce223
--- /dev/null
+++ b/test/srt/models/test_reward_models.py
@@ -0,0 +1,92 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import multiprocessing as mp
+import unittest
+
+import torch
+
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase
+
+MODELS = [
+    ("LxzGordon/URM-LLaMa-3.1-8B", 1, 4e-2),
+    ("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", 1, 4e-2),
+]
+TORCH_DTYPES = [torch.float16]
+
+# PROMPT = "Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits all her apples equally among herself and her 2 siblings. How many apples does each person get?"
+# RESPONSE1 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among herself and her 2 siblings (3 people in total). 9 ÷ 3 = 3 apples each. Each person gets 3 apples."
+# RESPONSE2 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among her 2 siblings (2 people in total). 9 ÷ 2 = 4.5 apples each. Each person gets 4 apples."
+
+PROMPT = (
+    "What is the range of the numeric output of a sigmoid node in a neural network?"
+)
+RESPONSE1 = "The output of a sigmoid node is bounded between -1 and 1."
+RESPONSE2 = "The output of a sigmoid node is bounded between 0 and 1."
+
+CONVS = [
+    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE1}],
+    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE2}],
+]
+
+
+class TestRewardModels(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_reward_scores(
+        self,
+        convs,
+        model_path,
+        tp_size,
+        torch_dtype,
+        tolerance,
+    ) -> None:
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="reward",
+        ) as hf_runner:
+            hf_outputs = hf_runner.forward(convs)
+
+        with SRTRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="reward",
+        ) as srt_runner:
+            prompts = srt_runner.tokenizer.apply_chat_template(convs, tokenize=False)
+            srt_outputs = srt_runner.forward(prompts)
+
+        hf_scores = torch.tensor(hf_outputs.scores)
+        srt_scores = torch.tensor(srt_outputs.scores)
+        print(f"{hf_scores=}")
+        print(f"{srt_scores=}")
+
+        assert torch.all(
+            abs(hf_scores - srt_scores) < tolerance
+        ), "reward scores are not all close"
+
+    def test_reward_scores(self):
+        for model, tp_size, tolerance in MODELS:
+            for torch_dtype in TORCH_DTYPES:
+                self.assert_close_reward_scores(
+                    CONVS, model, tp_size, torch_dtype, tolerance
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_transformers_models.py b/test/srt/models/test_transformers_models.py
new file mode 100644
index 000000000..95592453f
--- /dev/null
+++ b/test/srt/models/test_transformers_models.py
@@ -0,0 +1,181 @@
+import dataclasses
+import multiprocessing as mp
+import unittest
+from types import SimpleNamespace
+from typing import List
+
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner, check_close_model_outputs
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+
+class TestTransformersFallbackEndpoint(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--model-impl", "transformers"],
+        )
+        cls.mmlu_lower_bound = 0.65
+        cls.gsm8k_lower_bound = 0.65
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+        from sglang.test.run_eval import run_eval
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], self.mmlu_lower_bound)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        from sglang.test.few_shot_gsm8k import run_eval
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], self.gsm8k_lower_bound)
+
+
+class TestTransformersFallbackTorchAO(TestTransformersFallbackEndpoint):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--model-impl",
+                "transformers",
+                "--torchao-config",
+                "int4wo-128",
+            ],
+        )
+        cls.mmlu_lower_bound = 0.65
+        cls.gsm8k_lower_bound = 0.65
+
+
+@dataclasses.dataclass
+class ModelCase:
+    model_path: str
+    tp_size: int = 1
+    prefill_tolerance: float = 5e-2
+    decode_tolerance: float = 5e-2
+    rouge_l_tolerance: float = 1
+    skip_long_prompt: bool = False
+    trust_remote_code: bool = False
+    torchao_config: str = None
+    torch_dtype: torch.dtype = torch.float16
+
+
+# Popular models that run on the CI
+CI_MODELS = [
+    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST),
+]
+
+ALL_OTHER_MODELS = [
+    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST, tp_size=2),
+]
+
+
+class TestTransformersFallbackEngine(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_logits_and_output_strs(
+        self,
+        prompts: List[str],
+        model_case: ModelCase,
+    ) -> None:
+        model_path = model_case.model_path
+        max_new_tokens = 32
+        # force to use transformers impl
+        with SRTRunner(
+            model_path,
+            tp_size=model_case.tp_size,
+            torch_dtype=model_case.torch_dtype,
+            model_type="generation",
+            model_impl="transformers",
+            trust_remote_code=model_case.trust_remote_code,
+            torchao_config=model_case.torchao_config,
+        ) as srt_runner:
+            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)
+
+        with SRTRunner(
+            model_path,
+            tp_size=model_case.tp_size,
+            torch_dtype=model_case.torch_dtype,
+            model_type="generation",
+            trust_remote_code=model_case.trust_remote_code,
+            torchao_config=model_case.torchao_config,
+        ) as srt_runner:
+            srt_transformers_outputs = srt_runner.forward(
+                prompts, max_new_tokens=max_new_tokens
+            )
+
+        check_close_model_outputs(
+            hf_outputs=srt_transformers_outputs,
+            srt_outputs=srt_outputs,
+            prefill_tolerance=model_case.prefill_tolerance,
+            decode_tolerance=model_case.decode_tolerance,
+            rouge_l_tolerance=model_case.rouge_l_tolerance,
+            debug_text=f"model_path={model_path} prompts={prompts}",
+        )
+
+    def test_ci_models(self):
+        for model_case in CI_MODELS:
+            # Skip long prompts for models that do not have a long context
+            prompts = DEFAULT_PROMPTS
+            if model_case.skip_long_prompt:
+                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
+            # Assert the logits and output strs are close
+            self.assert_close_logits_and_output_strs(prompts, model_case)
+
+    def test_others(self):
+        if is_in_ci():
+            return
+
+        # Skip long prompts for models that do not have a long context
+        prompts = DEFAULT_PROMPTS
+        for model_case in ALL_OTHER_MODELS:
+            if model_case.skip_long_prompt:
+                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
+
+            # Assert the logits and output strs are close
+            self.assert_close_logits_and_output_strs(prompts, model_case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_unsloth_models.py b/test/srt/models/test_unsloth_models.py
new file mode 100644
index 000000000..24660ea34
--- /dev/null
+++ b/test/srt/models/test_unsloth_models.py
@@ -0,0 +1,213 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestUnslothPhi4(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/phi-4"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.78)
+
+
+class TestUnslothPhi4Bnb4bit(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/phi-4-bnb-4bit"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--load-format",
+                "bitsandbytes",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.75)
+
+
+class TestUnslothPhi4UnslothBnb4bit(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/phi-4-unsloth-bnb-4bit"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--load-format",
+                "bitsandbytes",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.75)
+
+
+class TestUnslothPhi4MiniInstruct(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/Phi-4-mini-instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.65)
+
+
+class TestUnslothPhi4MiniBnb4bit(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/Phi-4-mini-instruct-bnb-4bit"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--load-format",
+                "bitsandbytes",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.6)
+
+
+class TestUnslothPhi4MiniUnslothBnb4bit(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--load-format",
+                "bitsandbytes",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.6)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/models/test_vlm_models.py b/test/srt/models/test_vlm_models.py
new file mode 100644
index 000000000..0748f1ee0
--- /dev/null
+++ b/test/srt/models/test_vlm_models.py
@@ -0,0 +1,315 @@
+import argparse
+import glob
+import json
+import os
+import random
+import subprocess
+import sys
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+# VLM models for testing
+MODELS = [
+    SimpleNamespace(model="google/gemma-3-27b-it", mmmu_accuracy=0.45),
+    SimpleNamespace(
+        model="Qwen/Qwen2.5-VL-3B-Instruct",
+        mmmu_accuracy=0.4,
+    ),
+    SimpleNamespace(model="openbmb/MiniCPM-V-2_6", mmmu_accuracy=0.4),
+]
+
+
+class TestVLMModels(CustomTestCase):
+    parsed_args = None  # Class variable to store args
+
+    @classmethod
+    def setUpClass(cls):
+        # Removed argument parsing from here
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.time_out = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+
+        # Set OpenAI API key and base URL environment variables. Needed for lmm-evals to work.
+        os.environ["OPENAI_API_KEY"] = cls.api_key
+        os.environ["OPENAI_API_BASE"] = f"{cls.base_url}/v1"
+
+    def _detect_eviction_in_logs(self, log_output):
+        """Detect if eviction events occurred in the log output."""
+        eviction_keywords = ["Cache eviction: evicted"]
+
+        eviction_detected = False
+        eviction_count = 0
+
+        for line in log_output.split("\n"):
+            if any(keyword in line for keyword in eviction_keywords):
+                eviction_detected = True
+                eviction_count += 1
+                print(f"Eviction detected: {line.strip()}")
+
+        return eviction_detected, eviction_count
+
+    def run_mmmu_eval(
+        self,
+        model_version: str,
+        output_path: str,
+        *,
+        env: dict | None = None,
+    ):
+        """
+        Evaluate a VLM on the MMMU validation set with lmms‑eval.
+        Only `model_version` (checkpoint) and `chat_template` vary;
+        We are focusing only on the validation set due to resource constraints.
+        """
+        # -------- fixed settings --------
+        model = "openai_compatible"
+        tp = 1
+        tasks = "mmmu_val"
+        batch_size = 2
+        log_suffix = "openai_compatible"
+        os.makedirs(output_path, exist_ok=True)
+
+        # -------- compose --model_args --------
+        model_args = f'model_version="{model_version}",' f"tp={tp}"
+
+        # -------- build command list --------
+        cmd = [
+            "python3",
+            "-m",
+            "lmms_eval",
+            "--model",
+            model,
+            "--model_args",
+            model_args,
+            "--tasks",
+            tasks,
+            "--batch_size",
+            str(batch_size),
+            "--log_samples",
+            "--log_samples_suffix",
+            log_suffix,
+            "--output_path",
+            str(output_path),
+        ]
+
+        subprocess.run(
+            cmd,
+            check=True,
+            timeout=3600,
+        )
+
+    def _run_vlm_mmmu_test(
+        self,
+        model,
+        output_path,
+        test_name="",
+        custom_env=None,
+        log_level="info",
+        capture_output=False,
+    ):
+        """
+        Common method to run VLM MMMU benchmark test.
+
+        Args:
+            model: Model to test
+            output_path: Path for output logs
+            test_name: Optional test name for logging
+            custom_env: Optional custom environment variables
+            log_level: Log level for server (default: "info")
+            capture_output: Whether to capture server stdout/stderr
+        """
+        print(f"\nTesting model: {model.model}{test_name}")
+
+        process = None
+        mmmu_accuracy = 0  # Initialize to handle potential exceptions
+        server_output = ""
+
+        try:
+            # Prepare environment variables
+            process_env = os.environ.copy()
+            if custom_env:
+                process_env.update(custom_env)
+
+            # Prepare stdout/stderr redirection if needed
+            stdout_file = None
+            stderr_file = None
+            if capture_output:
+                stdout_file = open("/tmp/server_stdout.log", "w")
+                stderr_file = open("/tmp/server_stderr.log", "w")
+
+            # Launch server for testing
+            process = popen_launch_server(
+                model.model,
+                base_url=self.base_url,
+                timeout=self.time_out,
+                api_key=self.api_key,
+                other_args=[
+                    "--trust-remote-code",
+                    "--cuda-graph-max-bs",
+                    "32",
+                    "--enable-multimodal",
+                    "--mem-fraction-static",
+                    str(self.parsed_args.mem_fraction_static),  # Use class variable
+                    "--log-level",
+                    log_level,
+                ],
+                env=process_env,
+                return_stdout_stderr=(
+                    (stdout_file, stderr_file) if capture_output else None
+                ),
+            )
+
+            # Run evaluation
+            self.run_mmmu_eval(model.model, output_path)
+
+            # Get the result file
+            result_file_path = glob.glob(f"{output_path}/*.json")[0]
+
+            with open(result_file_path, "r") as f:
+                result = json.load(f)
+                print(f"Result{test_name}\n: {result}")
+
+            # Process the result
+            mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
+            print(
+                f"Model {model.model} achieved accuracy{test_name}: {mmmu_accuracy:.4f}"
+            )
+
+            # Capture server output if requested
+            if capture_output and process:
+                server_output = self._read_output_from_files()
+
+            # Assert performance meets expected threshold
+            self.assertGreaterEqual(
+                mmmu_accuracy,
+                model.mmmu_accuracy,
+                f"Model {model.model} accuracy ({mmmu_accuracy:.4f}) below expected threshold ({model.mmmu_accuracy:.4f}){test_name}",
+            )
+
+            return server_output
+
+        except Exception as e:
+            print(f"Error testing {model.model}{test_name}: {e}")
+            self.fail(f"Test failed for {model.model}{test_name}: {e}")
+
+        finally:
+            # Ensure process cleanup happens regardless of success/failure
+            if process is not None and process.poll() is None:
+                print(f"Cleaning up process {process.pid}")
+                try:
+                    kill_process_tree(process.pid)
+                except Exception as e:
+                    print(f"Error killing process: {e}")
+
+            # clean up temporary files
+            if capture_output:
+                if stdout_file:
+                    stdout_file.close()
+                if stderr_file:
+                    stderr_file.close()
+                for filename in ["/tmp/server_stdout.log", "/tmp/server_stderr.log"]:
+                    try:
+                        if os.path.exists(filename):
+                            os.remove(filename)
+                    except Exception as e:
+                        print(f"Error removing {filename}: {e}")
+
+    def _read_output_from_files(self):
+        output_lines = []
+
+        log_files = [
+            ("/tmp/server_stdout.log", "[STDOUT]"),
+            ("/tmp/server_stderr.log", "[STDERR]"),
+        ]
+        for filename, tag in log_files:
+            try:
+                if os.path.exists(filename):
+                    with open(filename, "r") as f:
+                        for line in f:
+                            output_lines.append(f"{tag} {line.rstrip()}")
+            except Exception as e:
+                print(f"Error reading {tag.lower()} file: {e}")
+
+        return "\n".join(output_lines)
+
+    def test_vlm_mmmu_benchmark(self):
+        """Test VLM models against MMMU benchmark."""
+        models_to_test = MODELS
+
+        if is_in_ci():
+            models_to_test = [random.choice(MODELS)]
+
+        for model in models_to_test:
+            self._run_vlm_mmmu_test(model, "./logs")
+
+    def test_vlm_mmmu_benchmark_with_small_cache(self):
+        """Test VLM models against MMMU benchmark with a small embedding cache to force eviction."""
+        models_to_test = MODELS
+
+        if is_in_ci():
+            models_to_test = [random.choice(MODELS)]
+
+        for model in models_to_test:
+            custom_env = {"SGLANG_VLM_CACHE_SIZE_MB": "5"}
+
+            # Run the test with output capture
+            server_output = self._run_vlm_mmmu_test(
+                model,
+                "./logs_small_cache",
+                test_name=" with small embedding cache (evict test)",
+                custom_env=custom_env,
+                log_level="debug",  # Enable debug logging for eviction detection
+                capture_output=True,  # Capture server output
+            )
+
+            # Print server output for debugging
+            print("Server output:\n", server_output)
+
+            # Analyze server output for eviction events
+            eviction_detected, eviction_count = self._detect_eviction_in_logs(
+                server_output
+            )
+
+            # Assert that eviction was detected (since we're using small cache)
+            self.assertTrue(
+                eviction_detected,
+                f"Expected eviction events to be detected with small cache (5MB), but none found. "
+                f"Cache size may be too large for the workload or eviction logic may not be working. "
+                f"Total log content length: {len(server_output)} characters",
+            )
+
+            print(
+                f"Eviction detection summary: {eviction_count} eviction events detected"
+            )
+
+            # Additional assertion: if eviction was detected, the test passed
+            if eviction_detected:
+                print("✅ Eviction logic successfully triggered and detected!")
+
+
+if __name__ == "__main__":
+    # Define and parse arguments here, before unittest.main
+    parser = argparse.ArgumentParser(description="Test VLM models")
+    parser.add_argument(
+        "--mem-fraction-static",
+        type=float,
+        help="Static memory fraction for the model",
+        default=0.8,
+    )
+
+    # Parse args intended for unittest
+    args = parser.parse_args()
+
+    # Store the parsed args object on the class
+    TestVLMModels.parsed_args = args
+
+    # Pass args to unittest
+    unittest.main(argv=[sys.argv[0]])
diff --git a/test/srt/openai_server/__init__.py b/test/srt/openai_server/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/srt/openai_server/basic/__init__.py b/test/srt/openai_server/basic/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/srt/openai_server/basic/test_openai_embedding.py b/test/srt/openai_server/basic/test_openai_embedding.py
new file mode 100644
index 000000000..60eb8f764
--- /dev/null
+++ b/test/srt/openai_server/basic/test_openai_embedding.py
@@ -0,0 +1,97 @@
+import unittest
+
+import openai
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestOpenAIEmbedding(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        # Configure embedding-specific args
+        other_args = ["--is-embedding", "--enable-metrics"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=other_args,
+        )
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_embedding_single(self):
+        """Test single embedding request"""
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.embeddings.create(model=self.model, input="Hello world")
+        self.assertEqual(len(response.data), 1)
+        self.assertTrue(len(response.data[0].embedding) > 0)
+
+    def test_embedding_batch(self):
+        """Test batch embedding request"""
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.embeddings.create(
+            model=self.model, input=["Hello world", "Test text"]
+        )
+        self.assertEqual(len(response.data), 2)
+        self.assertTrue(len(response.data[0].embedding) > 0)
+        self.assertTrue(len(response.data[1].embedding) > 0)
+
+    def test_embedding_single_batch_str(self):
+        """Test embedding with a List[str] and length equals to 1"""
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.embeddings.create(model=self.model, input=["Hello world"])
+        self.assertEqual(len(response.data), 1)
+        self.assertTrue(len(response.data[0].embedding) > 0)
+
+    def test_embedding_single_int_list(self):
+        """Test embedding with a List[int] or List[List[int]]]"""
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.embeddings.create(
+            model=self.model,
+            input=[[15339, 314, 703, 284, 612, 262, 10658, 10188, 286, 2061]],
+        )
+        self.assertEqual(len(response.data), 1)
+        self.assertTrue(len(response.data[0].embedding) > 0)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.embeddings.create(
+            model=self.model,
+            input=[15339, 314, 703, 284, 612, 262, 10658, 10188, 286, 2061],
+        )
+        self.assertEqual(len(response.data), 1)
+        self.assertTrue(len(response.data[0].embedding) > 0)
+
+    def test_empty_string_embedding(self):
+        """Test embedding an empty string."""
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        # Text embedding example with empty string
+        text = ""
+        # Expect a BadRequestError for empty input
+        with self.assertRaises(openai.BadRequestError) as cm:
+            client.embeddings.create(
+                model=self.model,
+                input=text,
+            )
+        # check the status code
+        self.assertEqual(cm.exception.status_code, 400)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/basic/test_openai_server.py b/test/srt/openai_server/basic/test_openai_server.py
new file mode 100644
index 000000000..f42039bff
--- /dev/null
+++ b/test/srt/openai_server/basic/test_openai_server.py
@@ -0,0 +1,669 @@
+"""
+python3 -m unittest openai_server.basic.test_openai_server.TestOpenAIServer.test_completion
+python3 -m unittest openai_server.basic.test_openai_server.TestOpenAIServer.test_completion_stream
+python3 -m unittest openai_server.basic.test_openai_server.TestOpenAIServer.test_chat_completion
+python3 -m unittest openai_server.basic.test_openai_server.TestOpenAIServer.test_chat_completion_stream
+"""
+
+import json
+import re
+import unittest
+
+import numpy as np
+import openai
+import requests
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.runners import TEST_RERANK_QUERY_DOCS
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestOpenAIServer(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_completion(
+        self, echo, logprobs, use_list_input, parallel_sample_num, token_input
+    ):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        prompt = "The capital of France is"
+        if token_input:
+            prompt_input = self.tokenizer.encode(prompt)
+            num_prompt_tokens = len(prompt_input)
+        else:
+            prompt_input = prompt
+            num_prompt_tokens = len(self.tokenizer.encode(prompt))
+
+        if use_list_input:
+            prompt_arg = [prompt_input, prompt_input]
+            num_choices = len(prompt_arg)
+            num_prompt_tokens *= 2
+        else:
+            prompt_arg = prompt_input
+            num_choices = 1
+
+        response = client.completions.create(
+            model=self.model,
+            prompt=prompt_arg,
+            temperature=0,
+            max_tokens=32,
+            echo=echo,
+            logprobs=logprobs,
+            n=parallel_sample_num,
+        )
+
+        assert len(response.choices) == num_choices * parallel_sample_num
+
+        if echo:
+            text = response.choices[0].text
+            assert text.startswith(prompt)
+
+        if logprobs:
+            assert response.choices[0].logprobs
+            assert isinstance(response.choices[0].logprobs.tokens[0], str)
+            assert isinstance(response.choices[0].logprobs.top_logprobs[1], dict)
+            ret_num_top_logprobs = len(response.choices[0].logprobs.top_logprobs[1])
+
+            # FIXME: Sometimes, some top_logprobs are missing in the return value. The reason is that some output id maps to the same output token and duplicate in the map
+            # assert ret_num_top_logprobs == logprobs, f"{ret_num_top_logprobs} vs {logprobs}"
+            assert ret_num_top_logprobs > 0
+
+            # when echo=True and request.logprobs>0, logprob_start_len is 0, so the first token's logprob would be None.
+            if not echo:
+                assert response.choices[0].logprobs.token_logprobs[0]
+
+        assert response.id
+        assert response.created
+        assert (
+            response.usage.prompt_tokens == num_prompt_tokens
+        ), f"{response.usage.prompt_tokens} vs {num_prompt_tokens}"
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def run_completion_stream(
+        self, echo, logprobs, use_list_input, parallel_sample_num, token_input
+    ):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        prompt = "The capital of France is"
+        if token_input:
+            prompt_input = self.tokenizer.encode(prompt)
+            num_prompt_tokens = len(prompt_input)
+        else:
+            prompt_input = prompt
+            num_prompt_tokens = len(self.tokenizer.encode(prompt))
+
+        if use_list_input:
+            prompt_arg = [prompt_input, prompt_input]
+            num_choices = len(prompt_arg)
+            num_prompt_tokens *= 2
+        else:
+            prompt_arg = prompt_input
+            num_choices = 1
+
+        generator = client.completions.create(
+            model=self.model,
+            prompt=prompt_arg,
+            temperature=0,
+            max_tokens=32,
+            echo=echo,
+            logprobs=logprobs,
+            stream=True,
+            stream_options={"include_usage": True},
+            n=parallel_sample_num,
+        )
+
+        is_firsts = {}
+        for response in generator:
+            usage = response.usage
+            if usage is not None:
+                assert usage.prompt_tokens > 0, f"usage.prompt_tokens was zero"
+                assert usage.completion_tokens > 0, f"usage.completion_tokens was zero"
+                assert usage.total_tokens > 0, f"usage.total_tokens was zero"
+                continue
+
+            index = response.choices[0].index
+            is_first = is_firsts.get(index, True)
+
+            if logprobs:
+                assert response.choices[0].logprobs, f"no logprobs in response"
+                assert isinstance(
+                    response.choices[0].logprobs.tokens[0], str
+                ), f"{response.choices[0].logprobs.tokens[0]} is not a string"
+                if not (is_first and echo):
+                    assert isinstance(
+                        response.choices[0].logprobs.top_logprobs[0], dict
+                    ), f"top_logprobs was not a dictionary"
+                    ret_num_top_logprobs = len(
+                        response.choices[0].logprobs.top_logprobs[0]
+                    )
+                    # FIXME: Sometimes, some top_logprobs are missing in the return value. The reason is that some output id maps to the same output token and duplicate in the map
+                    # assert ret_num_top_logprobs == logprobs, f"{ret_num_top_logprobs} vs {logprobs}"
+                    assert ret_num_top_logprobs > 0, f"ret_num_top_logprobs was 0"
+
+            if is_first:
+                if echo:
+                    assert response.choices[0].text.startswith(
+                        prompt
+                    ), f"{response.choices[0].text} and all args {echo} {logprobs} {token_input} {is_first}"
+                is_firsts[index] = False
+            assert response.id, f"no id in response"
+            assert response.created, f"no created in response"
+
+        for index in [i for i in range(parallel_sample_num * num_choices)]:
+            assert not is_firsts.get(
+                index, True
+            ), f"index {index} is not found in the response"
+
+    def run_chat_completion(self, logprobs, parallel_sample_num):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {
+                    "role": "user",
+                    "content": "What is the capital of France? Answer in a few words.",
+                },
+            ],
+            temperature=0,
+            logprobs=logprobs is not None and logprobs > 0,
+            top_logprobs=logprobs,
+            n=parallel_sample_num,
+        )
+
+        if logprobs:
+            assert isinstance(
+                response.choices[0].logprobs.content[0].top_logprobs[0].token, str
+            )
+
+            ret_num_top_logprobs = len(
+                response.choices[0].logprobs.content[0].top_logprobs
+            )
+            assert (
+                ret_num_top_logprobs == logprobs
+            ), f"{ret_num_top_logprobs} vs {logprobs}"
+
+        assert len(response.choices) == parallel_sample_num
+        assert response.choices[0].message.role == "assistant"
+        assert isinstance(response.choices[0].message.content, str)
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def run_chat_completion_stream(self, logprobs, parallel_sample_num=1):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        generator = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": "What is the capital of France?"},
+            ],
+            temperature=0,
+            logprobs=logprobs is not None and logprobs > 0,
+            top_logprobs=logprobs,
+            stream=True,
+            stream_options={"include_usage": True},
+            n=parallel_sample_num,
+        )
+
+        is_firsts = {}
+        is_finished = {}
+        finish_reason_counts = {}
+        for response in generator:
+            usage = response.usage
+            if usage is not None:
+                assert usage.prompt_tokens > 0, f"usage.prompt_tokens was zero"
+                assert usage.completion_tokens > 0, f"usage.completion_tokens was zero"
+                assert usage.total_tokens > 0, f"usage.total_tokens was zero"
+                continue
+
+            index = response.choices[0].index
+            finish_reason = response.choices[0].finish_reason
+            if finish_reason is not None:
+                is_finished[index] = True
+                finish_reason_counts[index] = finish_reason_counts.get(index, 0) + 1
+
+            data = response.choices[0].delta
+
+            if is_firsts.get(index, True):
+                assert (
+                    data.role == "assistant"
+                ), f"data.role was not 'assistant' for first chunk"
+                is_firsts[index] = False
+                continue
+
+            if logprobs and not is_finished.get(index, False):
+                assert response.choices[0].logprobs, f"logprobs was not returned"
+                assert isinstance(
+                    response.choices[0].logprobs.content[0].top_logprobs[0].token, str
+                ), f"top_logprobs token was not a string"
+                assert isinstance(
+                    response.choices[0].logprobs.content[0].top_logprobs, list
+                ), f"top_logprobs was not a list"
+                ret_num_top_logprobs = len(
+                    response.choices[0].logprobs.content[0].top_logprobs
+                )
+                assert (
+                    ret_num_top_logprobs == logprobs
+                ), f"{ret_num_top_logprobs} vs {logprobs}"
+
+            assert (
+                isinstance(data.content, str)
+                or isinstance(data.reasoning_content, str)
+                or (isinstance(data.tool_calls, list) and len(data.tool_calls) > 0)
+                or response.choices[0].finish_reason
+            )
+            assert response.id
+            assert response.created
+
+        for index in [i for i in range(parallel_sample_num)]:
+            assert not is_firsts.get(
+                index, True
+            ), f"index {index} is not found in the response"
+
+        # Verify that each choice gets exactly one finish_reason chunk
+        for index in range(parallel_sample_num):
+            assert (
+                index in finish_reason_counts
+            ), f"No finish_reason found for index {index}"
+            assert (
+                finish_reason_counts[index] == 1
+            ), f"Expected 1 finish_reason chunk for index {index}, got {finish_reason_counts[index]}"
+
+    def test_completion(self):
+        for echo in [False, True]:
+            for logprobs in [None, 5]:
+                for use_list_input in [True, False]:
+                    for parallel_sample_num in [1, 2]:
+                        for token_input in [False, True]:
+                            self.run_completion(
+                                echo,
+                                logprobs,
+                                use_list_input,
+                                parallel_sample_num,
+                                token_input,
+                            )
+
+    def test_completion_stream(self):
+        # parallel sampling and list input are not supported in streaming mode
+        for echo in [False, True]:
+            for logprobs in [None, 5]:
+                for use_list_input in [True, False]:
+                    for parallel_sample_num in [1, 2]:
+                        for token_input in [False, True]:
+                            self.run_completion_stream(
+                                echo,
+                                logprobs,
+                                use_list_input,
+                                parallel_sample_num,
+                                token_input,
+                            )
+
+    def test_chat_completion(self):
+        for logprobs in [None, 5]:
+            for parallel_sample_num in [1, 2]:
+                self.run_chat_completion(logprobs, parallel_sample_num)
+
+    def test_chat_completion_stream(self):
+        for logprobs in [None, 5]:
+            for parallel_sample_num in [1, 2]:
+                self.run_chat_completion_stream(logprobs, parallel_sample_num)
+
+    def test_regex(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        regex = (
+            r"""\{\n"""
+            + r"""   "name": "[\w]+",\n"""
+            + r"""   "population": [\d]+\n"""
+            + r"""\}"""
+        )
+
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": "Introduce the capital of France."},
+            ],
+            temperature=0,
+            max_tokens=128,
+            extra_body={"regex": regex},
+        )
+        text = response.choices[0].message.content
+
+        try:
+            js_obj = json.loads(text)
+        except (TypeError, json.decoder.JSONDecodeError):
+            print("JSONDecodeError", text)
+            raise
+        assert isinstance(js_obj["name"], str)
+        assert isinstance(js_obj["population"], int)
+
+    def test_penalty(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": "Introduce the capital of France."},
+            ],
+            temperature=0,
+            max_tokens=32,
+            frequency_penalty=1.0,
+        )
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+
+    def test_response_prefill(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="meta-llama/Llama-3.1-8B-Instruct",
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {
+                    "role": "user",
+                    "content": """
+Extract the name, size, price, and color from this product description as a JSON object:
+
+<description>
+The SmartHome Mini is a compact smart home assistant available in black or white for only $49.99. At just 5 inches wide, it lets you control lights, thermostats, and other connected devices via voice or app—no matter where you place it in your home. This affordable little hub brings convenient hands-free control to your smart devices.
+</description>
+""",
+                },
+                {
+                    "role": "assistant",
+                    "content": "{\n",
+                },
+            ],
+            temperature=0,
+            extra_body={"continue_final_message": True},
+        )
+
+        assert (
+            response.choices[0]
+            .message.content.strip()
+            .startswith('"name": "SmartHome Mini",')
+        )
+
+    def test_model_list(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        models = list(client.models.list())
+        assert len(models) == 1
+        assert isinstance(getattr(models[0], "max_model_len", None), int)
+
+    def test_retrieve_model(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        # Test retrieving an existing model
+        retrieved_model = client.models.retrieve(self.model)
+        self.assertEqual(retrieved_model.id, self.model)
+        self.assertEqual(retrieved_model.root, self.model)
+
+        # Test retrieving a non-existent model
+        with self.assertRaises(openai.NotFoundError):
+            client.models.retrieve("non-existent-model")
+
+
+class TestOpenAIV1Rerank(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.score_tolerance = 1e-2
+
+        # Configure embedding-specific args
+        other_args = [
+            "--is-embedding",
+            "--enable-metrics",
+            "--disable-radix-cache",
+            "--chunked-prefill-size",
+            "-1",
+            "--attention-backend",
+            "torch_native",
+        ]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=other_args,
+        )
+        cls.base_url += "/v1/rerank"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_rerank(self, query, docs):
+        response = requests.post(
+            self.base_url,
+            headers={
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            },
+            json={"query": query, "documents": docs},
+        )
+
+        return response.json()
+
+    def test_rerank_single(self):
+        """Test single rerank request"""
+        query = TEST_RERANK_QUERY_DOCS[0]["query"]
+        docs = TEST_RERANK_QUERY_DOCS[0]["documents"]
+
+        response = self.run_rerank(query, docs)
+
+        self.assertEqual(len(response), 1)
+        self.assertTrue(isinstance(response[0]["score"], float))
+        self.assertTrue(isinstance(response[0]["document"], str))
+        self.assertTrue(isinstance(response[0]["index"], int))
+
+    def test_rerank_batch(self):
+        """Test batch rerank request"""
+        query = TEST_RERANK_QUERY_DOCS[1]["query"]
+        docs = TEST_RERANK_QUERY_DOCS[1]["documents"]
+
+        response = self.run_rerank(query, docs)
+
+        self.assertEqual(len(response), 2)
+        self.assertTrue(isinstance(response[0]["score"], float))
+        self.assertTrue(isinstance(response[1]["score"], float))
+        self.assertTrue(isinstance(response[0]["document"], str))
+        self.assertTrue(isinstance(response[1]["document"], str))
+        self.assertTrue(isinstance(response[0]["index"], int))
+        self.assertTrue(isinstance(response[1]["index"], int))
+
+
+class TestOpenAIV1Score(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+        )
+        cls.base_url += "/v1/score"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_score(
+        self, query, items, label_token_ids, apply_softmax=False, item_first=False
+    ):
+        response = requests.post(
+            self.base_url,
+            headers={
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            },
+            json={
+                "model": self.model,
+                "query": query,
+                "items": items,
+                "label_token_ids": label_token_ids,
+                "apply_softmax": apply_softmax,
+                "item_first": item_first,
+            },
+        )
+        return response.json()
+
+    def test_score_text_input(self):
+        """Test scoring with text input"""
+        query = "The capital of France is"
+        items = ["Paris", "London", "Berlin"]
+
+        # Get valid token IDs from the tokenizer
+        label_token_ids = []
+        for item in items:
+            token_ids = self.tokenizer.encode(item, add_special_tokens=False)
+            if not token_ids:
+                self.fail(f"Failed to encode item: {item}")
+            label_token_ids.append(token_ids[0])
+
+        response = self.run_score(query, items, label_token_ids, apply_softmax=True)
+
+        # Handle error responses
+        if response.get("type") == "BadRequestError":
+            self.fail(f"Score request failed with error: {response['message']}")
+
+        # Verify response structure
+        self.assertIn("scores", response, "Response should have a 'scores' field")
+        self.assertIsInstance(response["scores"], list, "scores should be a list")
+        self.assertEqual(
+            len(response["scores"]),
+            len(items),
+            "Number of scores should match number of items",
+        )
+
+        # Each score should be a list of floats in the order of label_token_ids
+        for i, score_list in enumerate(response["scores"]):
+            self.assertIsInstance(score_list, list, f"Score {i} should be a list")
+            self.assertEqual(
+                len(score_list),
+                len(label_token_ids),
+                f"Score {i} length should match label_token_ids",
+            )
+            self.assertTrue(
+                all(isinstance(v, float) for v in score_list),
+                f"Score {i} values should be floats",
+            )
+            self.assertAlmostEqual(
+                sum(score_list),
+                1.0,
+                places=6,
+                msg=f"Score {i} probabilities should sum to 1",
+            )
+
+    def test_score_token_input(self):
+        """Test scoring with token IDs input"""
+        query = "The capital of France is"
+        items = ["Paris", "London", "Berlin"]
+
+        # Get valid token IDs
+        query_ids = self.tokenizer.encode(query, add_special_tokens=False)
+        item_ids = [
+            self.tokenizer.encode(item, add_special_tokens=False) for item in items
+        ]
+        label_token_ids = [
+            ids[0] for ids in item_ids if ids
+        ]  # Get first token ID of each item
+
+        response = self.run_score(
+            query_ids, item_ids, label_token_ids, apply_softmax=True
+        )
+
+        # Handle error responses
+        if response.get("type") == "BadRequestError":
+            self.fail(f"Score request failed with error: {response['message']}")
+
+        # Verify response structure
+        self.assertIn("scores", response, "Response should have a 'scores' field")
+        self.assertIsInstance(response["scores"], list, "scores should be a list")
+        self.assertEqual(
+            len(response["scores"]),
+            len(items),
+            "Number of scores should match number of items",
+        )
+
+        # Each score should be a list of floats in the order of label_token_ids
+        for i, score_list in enumerate(response["scores"]):
+            self.assertIsInstance(score_list, list, f"Score {i} should be a list")
+            self.assertEqual(
+                len(score_list),
+                len(label_token_ids),
+                f"Score {i} length should match label_token_ids",
+            )
+            self.assertTrue(
+                all(isinstance(v, float) for v in score_list),
+                f"Score {i} values should be floats",
+            )
+            self.assertAlmostEqual(
+                sum(score_list),
+                1.0,
+                places=6,
+                msg=f"Score {i} probabilities should sum to 1",
+            )
+
+    def test_score_error_handling(self):
+        """Test error handling for invalid inputs"""
+        query = "The capital of France is"
+        items = ["Paris", "London", "Berlin"]
+
+        # Test with invalid token ID
+        response = requests.post(
+            self.base_url,
+            headers={
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            },
+            json={
+                "model": self.model,
+                "query": query,
+                "items": items,
+                "label_token_ids": [999999],  # Invalid token ID
+                "apply_softmax": True,
+            },
+        )
+        self.assertEqual(response.status_code, 400)
+        error_response = response.json()
+        self.assertEqual(error_response["type"], "BadRequestError")
+        self.assertIn("Token ID 999999 is out of vocabulary", error_response["message"])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/basic/test_protocol.py b/test/srt/openai_server/basic/test_protocol.py
new file mode 100644
index 000000000..fcaa9770b
--- /dev/null
+++ b/test/srt/openai_server/basic/test_protocol.py
@@ -0,0 +1,368 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for OpenAI API protocol models"""
+
+import json
+import time
+import unittest
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel, Field, ValidationError
+
+from sglang.srt.entrypoints.openai.protocol import (
+    BatchRequest,
+    BatchResponse,
+    ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentTextPart,
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
+    ChatMessage,
+    ChoiceLogprobs,
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    DeltaMessage,
+    EmbeddingObject,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    ErrorResponse,
+    FileDeleteResponse,
+    FileRequest,
+    FileResponse,
+    Function,
+    FunctionResponse,
+    JsonSchemaResponseFormat,
+    LogProbs,
+    ModelCard,
+    ModelList,
+    MultimodalEmbeddingInput,
+    ResponseFormat,
+    ScoringRequest,
+    ScoringResponse,
+    StreamOptions,
+    StructuralTagResponseFormat,
+    Tool,
+    ToolCall,
+    ToolChoice,
+    TopLogprob,
+    UsageInfo,
+)
+
+
+class TestModelCard(unittest.TestCase):
+    """Test ModelCard protocol model"""
+
+    def test_model_card_serialization(self):
+        """Test model card JSON serialization"""
+        card = ModelCard(id="test-model", max_model_len=4096)
+        data = card.model_dump()
+        self.assertEqual(data["id"], "test-model")
+        self.assertEqual(data["object"], "model")
+        self.assertEqual(data["max_model_len"], 4096)
+
+
+class TestModelList(unittest.TestCase):
+    """Test ModelList protocol model"""
+
+    def test_empty_model_list(self):
+        """Test empty model list creation"""
+        model_list = ModelList()
+        self.assertEqual(model_list.object, "list")
+        self.assertEqual(len(model_list.data), 0)
+
+    def test_model_list_with_cards(self):
+        """Test model list with model cards"""
+        cards = [
+            ModelCard(id="model-1"),
+            ModelCard(id="model-2", max_model_len=2048),
+        ]
+        model_list = ModelList(data=cards)
+        self.assertEqual(len(model_list.data), 2)
+        self.assertEqual(model_list.data[0].id, "model-1")
+        self.assertEqual(model_list.data[1].id, "model-2")
+
+
+class TestCompletionRequest(unittest.TestCase):
+    """Test CompletionRequest protocol model"""
+
+    def test_basic_completion_request(self):
+        """Test basic completion request"""
+        request = CompletionRequest(model="test-model", prompt="Hello world")
+        self.assertEqual(request.model, "test-model")
+        self.assertEqual(request.prompt, "Hello world")
+        self.assertEqual(request.max_tokens, 16)  # default
+        self.assertEqual(request.temperature, 1.0)  # default
+        self.assertEqual(request.n, 1)  # default
+        self.assertFalse(request.stream)  # default
+        self.assertFalse(request.echo)  # default
+
+    def test_completion_request_sglang_extensions(self):
+        """Test completion request with SGLang-specific extensions"""
+        request = CompletionRequest(
+            model="test-model",
+            prompt="Hello",
+            top_k=50,
+            min_p=0.1,
+            repetition_penalty=1.1,
+            regex=r"\d+",
+            json_schema='{"type": "object"}',
+            lora_path="/path/to/lora",
+        )
+        self.assertEqual(request.top_k, 50)
+        self.assertEqual(request.min_p, 0.1)
+        self.assertEqual(request.repetition_penalty, 1.1)
+        self.assertEqual(request.regex, r"\d+")
+        self.assertEqual(request.json_schema, '{"type": "object"}')
+        self.assertEqual(request.lora_path, "/path/to/lora")
+
+    def test_completion_request_validation_errors(self):
+        """Test completion request validation errors"""
+        with self.assertRaises(ValidationError):
+            CompletionRequest()  # missing required fields
+
+        with self.assertRaises(ValidationError):
+            CompletionRequest(model="test-model")  # missing prompt
+
+
+class TestChatCompletionRequest(unittest.TestCase):
+    """Test ChatCompletionRequest protocol model"""
+
+    def test_basic_chat_completion_request(self):
+        """Test basic chat completion request"""
+        messages = [{"role": "user", "content": "Hello"}]
+        request = ChatCompletionRequest(model="test-model", messages=messages)
+        self.assertEqual(request.model, "test-model")
+        self.assertEqual(len(request.messages), 1)
+        self.assertEqual(request.messages[0].role, "user")
+        self.assertEqual(request.messages[0].content, "Hello")
+        self.assertEqual(request.temperature, 0.7)  # default
+        self.assertFalse(request.stream)  # default
+        self.assertEqual(request.tool_choice, "none")  # default when no tools
+
+    def test_chat_completion_tool_choice_validation(self):
+        """Test tool choice validation logic"""
+        messages = [{"role": "user", "content": "Hello"}]
+
+        # No tools, tool_choice should default to "none"
+        request1 = ChatCompletionRequest(model="test-model", messages=messages)
+        self.assertEqual(request1.tool_choice, "none")
+
+        # With tools, tool_choice should default to "auto"
+        tools = [
+            {
+                "type": "function",
+                "function": {"name": "test_func", "description": "Test function"},
+            }
+        ]
+        request2 = ChatCompletionRequest(
+            model="test-model", messages=messages, tools=tools
+        )
+        self.assertEqual(request2.tool_choice, "auto")
+
+    def test_chat_completion_sglang_extensions(self):
+        """Test chat completion with SGLang extensions"""
+        messages = [{"role": "user", "content": "Hello"}]
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=messages,
+            top_k=40,
+            min_p=0.05,
+            separate_reasoning=False,
+            stream_reasoning=False,
+            chat_template_kwargs={"custom_param": "value"},
+        )
+        self.assertEqual(request.top_k, 40)
+        self.assertEqual(request.min_p, 0.05)
+        self.assertFalse(request.separate_reasoning)
+        self.assertFalse(request.stream_reasoning)
+        self.assertEqual(request.chat_template_kwargs, {"custom_param": "value"})
+
+    def test_chat_completion_reasoning_effort(self):
+        """Test chat completion with reasoning effort"""
+        messages = [{"role": "user", "content": "Hello"}]
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=messages,
+            reasoning={
+                "enabled": True,
+                "reasoning_effort": "high",
+            },
+        )
+        self.assertEqual(request.reasoning_effort, "high")
+        self.assertEqual(request.chat_template_kwargs, {"thinking": True})
+
+    def test_chat_completion_json_format(self):
+        """Test chat completion json format"""
+        transcript = "Good morning! It's 7:00 AM, and I'm just waking up. Today is going to be a busy day, "
+        "so let's get started. First, I need to make a quick breakfast. I think I'll have some "
+        "scrambled eggs and toast with a cup of coffee. While I'm cooking, I'll also check my "
+        "emails to see if there's anything urgent."
+
+        messages = [
+            {
+                "role": "system",
+                "content": "The following is a voice message transcript. Only answer in JSON.",
+            },
+            {
+                "role": "user",
+                "content": transcript,
+            },
+        ]
+
+        class VoiceNote(BaseModel):
+            title: str = Field(description="A title for the voice note")
+            summary: str = Field(
+                description="A short one sentence summary of the voice note."
+            )
+            strict: Optional[bool] = True
+            actionItems: List[str] = Field(
+                description="A list of action items from the voice note"
+            )
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=messages,
+            top_k=40,
+            min_p=0.05,
+            separate_reasoning=False,
+            stream_reasoning=False,
+            chat_template_kwargs={"custom_param": "value"},
+            response_format={
+                "type": "json_schema",
+                "schema": VoiceNote.model_json_schema(),
+            },
+        )
+        res_format = request.response_format
+        json_format = res_format.json_schema
+        name = json_format.name
+        schema = json_format.schema_
+        strict = json_format.strict
+        self.assertEqual(name, "VoiceNote")
+        self.assertEqual(strict, True)
+        self.assertNotIn("strict", schema["properties"])
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=messages,
+            top_k=40,
+            min_p=0.05,
+            separate_reasoning=False,
+            stream_reasoning=False,
+            chat_template_kwargs={"custom_param": "value"},
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "VoiceNote",
+                    "schema": VoiceNote.model_json_schema(),
+                    "strict": True,
+                },
+            },
+        )
+        res_format = request.response_format
+        json_format = res_format.json_schema
+        name = json_format.name
+        schema = json_format.schema_
+        strict = json_format.strict
+        self.assertEqual(name, "VoiceNote")
+        self.assertEqual(strict, True)
+
+
+class TestModelSerialization(unittest.TestCase):
+    """Test model serialization with hidden states"""
+
+    def test_hidden_states_excluded_when_none(self):
+        """Test that None hidden_states are excluded with exclude_none=True"""
+        choice = ChatCompletionResponseChoice(
+            index=0,
+            message=ChatMessage(role="assistant", content="Hello"),
+            finish_reason="stop",
+            hidden_states=None,
+        )
+
+        response = ChatCompletionResponse(
+            id="test-id",
+            model="test-model",
+            choices=[choice],
+            usage=UsageInfo(prompt_tokens=5, completion_tokens=1, total_tokens=6),
+        )
+
+        # Test exclude_none serialization (should exclude None hidden_states)
+        data = response.model_dump(exclude_none=True)
+        self.assertNotIn("hidden_states", data["choices"][0])
+
+    def test_hidden_states_included_when_not_none(self):
+        """Test that non-None hidden_states are included"""
+        choice = ChatCompletionResponseChoice(
+            index=0,
+            message=ChatMessage(role="assistant", content="Hello"),
+            finish_reason="stop",
+            hidden_states=[0.1, 0.2, 0.3],
+        )
+
+        response = ChatCompletionResponse(
+            id="test-id",
+            model="test-model",
+            choices=[choice],
+            usage=UsageInfo(prompt_tokens=5, completion_tokens=1, total_tokens=6),
+        )
+
+        # Test exclude_none serialization (should include non-None hidden_states)
+        data = response.model_dump(exclude_none=True)
+        self.assertIn("hidden_states", data["choices"][0])
+        self.assertEqual(data["choices"][0]["hidden_states"], [0.1, 0.2, 0.3])
+
+
+class TestValidationEdgeCases(unittest.TestCase):
+    """Test edge cases and validation scenarios"""
+
+    def test_invalid_tool_choice_type(self):
+        """Test invalid tool choice type"""
+        messages = [{"role": "user", "content": "Hello"}]
+        with self.assertRaises(ValidationError):
+            ChatCompletionRequest(
+                model="test-model", messages=messages, tool_choice=123
+            )
+
+    def test_negative_token_limits(self):
+        """Test negative token limits"""
+        with self.assertRaises(ValidationError):
+            CompletionRequest(model="test-model", prompt="Hello", max_tokens=-1)
+
+    def test_model_serialization_roundtrip(self):
+        """Test that models can be serialized and deserialized"""
+        original_request = ChatCompletionRequest(
+            model="test-model",
+            messages=[{"role": "user", "content": "Hello"}],
+            temperature=0.7,
+            max_tokens=100,
+        )
+
+        # Serialize to dict
+        data = original_request.model_dump()
+
+        # Deserialize back
+        restored_request = ChatCompletionRequest(**data)
+
+        self.assertEqual(restored_request.model, original_request.model)
+        self.assertEqual(restored_request.temperature, original_request.temperature)
+        self.assertEqual(restored_request.max_tokens, original_request.max_tokens)
+        self.assertEqual(len(restored_request.messages), len(original_request.messages))
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/openai_server/basic/test_serving_chat.py b/test/srt/openai_server/basic/test_serving_chat.py
new file mode 100644
index 000000000..9f0d48004
--- /dev/null
+++ b/test/srt/openai_server/basic/test_serving_chat.py
@@ -0,0 +1,425 @@
+"""
+Unit-tests for OpenAIServingChat — rewritten to use only the std-lib 'unittest'.
+Run with either:
+    python tests/test_serving_chat_unit.py -v
+or
+    python -m unittest discover -s tests -p "test_*unit.py" -v
+"""
+
+import asyncio
+import json
+import unittest
+import uuid
+from typing import Optional
+from unittest.mock import Mock, patch
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    MessageProcessingResult,
+)
+from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+from sglang.srt.managers.io_struct import GenerateReqInput
+
+
+class _MockTokenizerManager:
+    """Minimal mock that satisfies OpenAIServingChat."""
+
+    def __init__(self):
+        self.model_config = Mock(is_multimodal=False)
+        self.server_args = Mock(
+            enable_cache_report=False,
+            tool_call_parser="hermes",
+            reasoning_parser=None,
+        )
+        self.chat_template_name: Optional[str] = "llama-3"
+
+        # tokenizer stub
+        self.tokenizer = Mock()
+        self.tokenizer.encode.return_value = [1, 2, 3, 4, 5]
+        self.tokenizer.decode.return_value = "Test response"
+        self.tokenizer.chat_template = None
+        self.tokenizer.bos_token_id = 1
+
+        # async generator stub for generate_request
+        async def _mock_generate():
+            yield {
+                "text": "Test response",
+                "meta_info": {
+                    "id": f"chatcmpl-{uuid.uuid4()}",
+                    "prompt_tokens": 10,
+                    "completion_tokens": 5,
+                    "cached_tokens": 0,
+                    "finish_reason": {"type": "stop", "matched": None},
+                    "output_token_logprobs": [(0.1, 1, "Test"), (0.2, 2, "response")],
+                    "output_top_logprobs": None,
+                },
+                "index": 0,
+            }
+
+        self.generate_request = Mock(return_value=_mock_generate())
+        self.create_abort_task = Mock()
+
+
+class _MockTemplateManager:
+    """Minimal mock for TemplateManager."""
+
+    def __init__(self):
+        self.chat_template_name: Optional[str] = "llama-3"
+        self.jinja_template_content_format: Optional[str] = None
+        self.completion_template_name: Optional[str] = None
+
+
+class ServingChatTestCase(unittest.TestCase):
+    # ------------- common fixtures -------------
+    def setUp(self):
+        self.tm = _MockTokenizerManager()
+        self.template_manager = _MockTemplateManager()
+        self.chat = OpenAIServingChat(self.tm, self.template_manager)
+
+        # frequently reused requests
+        self.basic_req = ChatCompletionRequest(
+            model="x",
+            messages=[{"role": "user", "content": "Hi?"}],
+            temperature=0.7,
+            max_tokens=100,
+            stream=False,
+        )
+        self.stream_req = ChatCompletionRequest(
+            model="x",
+            messages=[{"role": "user", "content": "Hi?"}],
+            temperature=0.7,
+            max_tokens=100,
+            stream=True,
+        )
+
+        self.fastapi_request = Mock(spec=Request)
+        self.fastapi_request.headers = {}
+
+    # ------------- conversion tests -------------
+    def test_convert_to_internal_request_single(self):
+        with patch(
+            "sglang.srt.entrypoints.openai.serving_chat.generate_chat_conv"
+        ) as conv_mock, patch.object(self.chat, "_process_messages") as proc_mock:
+            conv_ins = Mock()
+            conv_ins.get_prompt.return_value = "Test prompt"
+            conv_ins.image_data = conv_ins.audio_data = None
+            conv_ins.modalities = []
+            conv_ins.stop_str = ["</s>"]
+            conv_mock.return_value = conv_ins
+
+            proc_mock.return_value = MessageProcessingResult(
+                "Test prompt",
+                [1, 2, 3],
+                None,
+                None,
+                [],
+                ["</s>"],
+                None,
+            )
+
+            adapted, processed = self.chat._convert_to_internal_request(self.basic_req)
+            self.assertIsInstance(adapted, GenerateReqInput)
+            self.assertFalse(adapted.stream)
+            self.assertEqual(processed, self.basic_req)
+
+    def test_stop_str_isolation_between_requests(self):
+        """Test that stop strings from one request don't affect subsequent requests.
+
+        This tests the fix for the bug where conv.stop_str was being mutated globally,
+        causing stop strings from one request to persist in subsequent requests.
+        """
+        # Mock conversation template with initial stop_str
+        initial_stop_str = ["\n"]
+
+        with patch(
+            "sglang.srt.entrypoints.openai.serving_chat.generate_chat_conv"
+        ) as conv_mock:
+            # Create a mock conversation object that will be returned by generate_chat_conv
+            conv_ins = Mock()
+            conv_ins.get_prompt.return_value = "Test prompt"
+            conv_ins.image_data = None
+            conv_ins.audio_data = None
+            conv_ins.modalities = []
+            conv_ins.stop_str = (
+                initial_stop_str.copy()
+            )  # Template's default stop strings
+            conv_mock.return_value = conv_ins
+
+            # First request with additional stop string
+            req1 = ChatCompletionRequest(
+                model="x",
+                messages=[{"role": "user", "content": "First request"}],
+                stop=["CUSTOM_STOP"],
+            )
+
+            # Call the actual _apply_conversation_template method (not mocked)
+            result1 = self.chat._apply_conversation_template(req1, is_multimodal=False)
+
+            # Verify first request has both stop strings
+            expected_stop1 = initial_stop_str + ["CUSTOM_STOP"]
+            self.assertEqual(result1.stop, expected_stop1)
+
+            # Verify the original template's stop_str wasn't mutated after first request
+            self.assertEqual(conv_ins.stop_str, initial_stop_str)
+
+            # Second request without additional stop string
+            req2 = ChatCompletionRequest(
+                model="x",
+                messages=[{"role": "user", "content": "Second request"}],
+                # No custom stop strings
+            )
+            result2 = self.chat._apply_conversation_template(req2, is_multimodal=False)
+
+            # Verify second request only has original stop strings (no CUSTOM_STOP from req1)
+            self.assertEqual(result2.stop, initial_stop_str)
+            self.assertNotIn("CUSTOM_STOP", result2.stop)
+            self.assertEqual(conv_ins.stop_str, initial_stop_str)
+
+    # ------------- sampling-params -------------
+    def test_sampling_param_build(self):
+        req = ChatCompletionRequest(
+            model="x",
+            messages=[{"role": "user", "content": "Hi"}],
+            temperature=0.8,
+            max_tokens=150,
+            min_tokens=5,
+            top_p=0.9,
+            stop=["</s>"],
+        )
+        with patch.object(
+            self.chat,
+            "_process_messages",
+            return_value=("Prompt", [1], None, None, [], ["</s>"], None),
+        ):
+            params = self.chat._build_sampling_params(req, ["</s>"], None)
+            self.assertEqual(params["temperature"], 0.8)
+            self.assertEqual(params["max_new_tokens"], 150)
+            self.assertEqual(params["min_new_tokens"], 5)
+            self.assertEqual(params["stop"], ["</s>"])
+
+    async def test_unstreamed_tool_args_completion(self):
+        """Test that remaining tool call arguments are sent when generation finishes."""
+
+        # Mock FunctionCallParser with detector that has partial tool call data
+        mock_parser = Mock()
+        mock_detector = Mock()
+
+        # Simulate a tool call that was partially streamed
+        mock_detector.prev_tool_call_arr = [
+            {
+                "name": "get_weather",
+                "arguments": {"location": "San Francisco", "unit": "celsius"},
+            }
+        ]
+        mock_detector.streamed_args_for_tool = [
+            '{"location": "San Francisco"'  # Partial arguments streamed so far
+        ]
+        mock_parser.detector = mock_detector
+
+        content = {
+            "meta_info": {
+                "id": "chatcmpl-test123",
+            }
+        }
+
+        request = ChatCompletionRequest(
+            model="test",
+            messages=[{"role": "user", "content": "What's the weather?"}],
+            tools=[{"type": "function", "function": {"name": "get_weather"}}],
+        )
+
+        # Test the completion method
+        result = self.chat._check_for_unstreamed_tool_args(
+            parser=mock_parser,
+            content=content,
+            request=request,
+            finish_reason_type="stop",
+            index=0,
+        )
+
+        # Should return a chunk with remaining arguments
+        self.assertIsNotNone(result, "Should return chunk with remaining arguments")
+        self.assertIn('"arguments":', result, "Should contain arguments field")
+        self.assertIn(
+            ', "unit": "celsius"}', result, "Should contain remaining arguments"
+        )
+        self.assertIn(
+            '"finish_reason":null',
+            result,
+            "Should not include finish_reason in completion chunk",
+        )
+
+    async def test_unstreamed_tool_args_no_completion_needed(self):
+        """Test that no completion chunk is sent when all arguments were already streamed."""
+
+        # Mock FunctionCallParser with detector that has complete tool call data
+        mock_parser = Mock()
+        mock_detector = Mock()
+
+        # Simulate a tool call that was completely streamed
+        mock_detector.prev_tool_call_arr = [
+            {"name": "get_weather", "arguments": {"location": "San Francisco"}}
+        ]
+        mock_detector.streamed_args_for_tool = [
+            '{"location": "San Francisco"}'  # All arguments already streamed
+        ]
+        mock_parser.detector = mock_detector
+
+        content = {
+            "meta_info": {
+                "id": "chatcmpl-test123",
+            }
+        }
+
+        request = ChatCompletionRequest(
+            model="test",
+            messages=[{"role": "user", "content": "What's the weather?"}],
+            tools=[{"type": "function", "function": {"name": "get_weather"}}],
+        )
+
+        # Test the completion method
+        result = self.chat._check_for_unstreamed_tool_args(
+            parser=mock_parser,
+            content=content,
+            request=request,
+            finish_reason_type="stop",
+            index=0,
+        )
+
+        # Should return None since no completion is needed
+        self.assertIsNone(result, "Should return None when no completion is needed")
+
+    async def test_unstreamed_tool_args_no_parser_data(self):
+        """Test that no completion chunk is sent when parser has no tool call data."""
+
+        # Mock FunctionCallParser with empty detector
+        mock_parser = Mock()
+        mock_detector = Mock()
+        mock_detector.prev_tool_call_arr = []
+        mock_detector.streamed_args_for_tool = []
+        mock_parser.detector = mock_detector
+
+        content = {
+            "meta_info": {
+                "id": "chatcmpl-test123",
+            }
+        }
+
+        request = ChatCompletionRequest(
+            model="test",
+            messages=[{"role": "user", "content": "What's the weather?"}],
+            tools=[{"type": "function", "function": {"name": "get_weather"}}],
+        )
+
+        # Test the completion method
+        result = self.chat._check_for_unstreamed_tool_args(
+            parser=mock_parser,
+            content=content,
+            request=request,
+            finish_reason_type="stop",
+            index=0,
+        )
+
+        # Should return None since there's no parser data
+        self.assertIsNone(
+            result, "Should return None when parser has no tool call data"
+        )
+
+    # ------------- kimi_k2 tool_call_id formatting -------------
+    def test_kimi_k2_non_streaming_tool_call_id_format(self):
+        """Ensure non-streaming tool_call.id matches functions.{name}:{index} for kimi_k2 parser."""
+
+        # Force kimi_k2 parser
+        self.chat.tool_call_parser = "kimi_k2"
+
+        # Mock FunctionCallParser.parse_non_stream to return one tool call
+        with patch(
+            "sglang.srt.entrypoints.openai.serving_chat.FunctionCallParser"
+        ) as ParserMock:
+            parser_instance = ParserMock.return_value
+
+            # Build a mock ToolCallItem-like object
+            call_info = Mock()
+            call_info.name = "get_weather"
+            call_info.parameters = '{"city":"Paris"}'
+            call_info.tool_index = 0
+
+            parser_instance.has_tool_call.return_value = True
+            parser_instance.parse_non_stream.return_value = ("", [call_info])
+
+            finish_reason = {"type": "stop", "matched": None}
+            tools = [
+                {"type": "function", "function": {"name": "get_weather"}},
+            ]
+
+            tool_calls, remaining_text, _ = self.chat._process_tool_calls(
+                text="<|tool_calls_section_begin|>...",
+                tools=tools,
+                finish_reason=finish_reason,
+            )
+
+            self.assertIsNotNone(tool_calls)
+            self.assertEqual(len(tool_calls), 1)
+            self.assertEqual(tool_calls[0].id, "functions.get_weather:0")
+            self.assertEqual(tool_calls[0].function.name, "get_weather")
+
+    def test_kimi_k2_streaming_tool_call_id_format(self):
+        """Ensure streaming first chunk tool_call.id matches functions.{name}:{index} for kimi_k2 parser."""
+
+        # Force kimi_k2 parser
+        self.chat.tool_call_parser = "kimi_k2"
+
+        # Prepare request with tools
+        req = ChatCompletionRequest(
+            model="x",
+            messages=[{"role": "user", "content": "Hi?"}],
+            tools=[{"type": "function", "function": {"name": "get_weather"}}],
+            stream=True,
+        )
+
+        # Patch FunctionCallParser used inside _process_tool_call_stream
+        with patch(
+            "sglang.srt.entrypoints.openai.serving_chat.FunctionCallParser"
+        ) as ParserMock:
+            parser_instance = ParserMock.return_value
+
+            # First call returns one ToolCallItem-like chunk (with name)
+            first_chunk_call = Mock()
+            first_chunk_call.tool_index = 0
+            first_chunk_call.name = "get_weather"
+            first_chunk_call.parameters = ""
+            parser_instance.parse_stream_chunk.side_effect = [
+                ("", [first_chunk_call]),
+                ("", []),
+            ]
+
+            async def collect_first_tool_chunk():
+                gen = self.chat._process_tool_call_stream(
+                    index=0,
+                    delta="irrelevant",
+                    parser_dict={},
+                    content={"meta_info": {"id": "chatcmpl-test"}},
+                    request=req,
+                    has_tool_calls={},
+                )
+                # Get first yielded SSE line
+                line = None
+                async for emitted in gen:
+                    line = emitted
+                    break
+                return line
+
+            loop = asyncio.get_event_loop()
+            line = loop.run_until_complete(collect_first_tool_chunk())
+            self.assertIsNotNone(line)
+            self.assertTrue(line.startswith("data: "))
+
+            payload = json.loads(line[len("data: ") :])
+            tool_calls = payload["choices"][0]["delta"]["tool_calls"]
+            self.assertEqual(tool_calls[0]["id"], "functions.get_weather:0")
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/openai_server/basic/test_serving_completions.py b/test/srt/openai_server/basic/test_serving_completions.py
new file mode 100644
index 000000000..022ba9ad1
--- /dev/null
+++ b/test/srt/openai_server/basic/test_serving_completions.py
@@ -0,0 +1,157 @@
+"""
+Unit-tests for the refactored completions-serving handler (no pytest).
+Run with:
+    python -m unittest tests.test_serving_completions_unit -v
+"""
+
+import unittest
+from typing import Optional
+from unittest.mock import AsyncMock, Mock, patch
+
+from sglang.srt.entrypoints.openai.protocol import CompletionRequest
+from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+
+class _MockTemplateManager:
+    """Minimal mock for TemplateManager."""
+
+    def __init__(self):
+        self.chat_template_name: Optional[str] = None
+        self.jinja_template_content_format: Optional[str] = None
+        self.completion_template_name: Optional[str] = (
+            None  # Set to None to avoid template processing
+        )
+
+
+class ServingCompletionTestCase(unittest.TestCase):
+    """Bundle all prompt/echo tests in one TestCase."""
+
+    # ---------- shared test fixtures ----------
+    def setUp(self):
+        # build the mock TokenizerManager once for every test
+        tm = Mock(spec=TokenizerManager)
+
+        tm.tokenizer = Mock()
+        tm.tokenizer.encode.return_value = [1, 2, 3, 4]
+        tm.tokenizer.decode.return_value = "decoded text"
+        tm.tokenizer.bos_token_id = 1
+
+        tm.model_config = Mock(is_multimodal=False)
+        tm.server_args = Mock(enable_cache_report=False)
+
+        tm.generate_request = AsyncMock()
+        tm.create_abort_task = Mock()
+
+        self.template_manager = _MockTemplateManager()
+        self.sc = OpenAIServingCompletion(tm, self.template_manager)
+
+    # ---------- prompt-handling ----------
+    def test_single_string_prompt(self):
+        req = CompletionRequest(model="x", prompt="Hello world", max_tokens=100)
+        internal, _ = self.sc._convert_to_internal_request(req)
+        self.assertEqual(internal.text, "Hello world")
+
+    def test_single_token_ids_prompt(self):
+        req = CompletionRequest(model="x", prompt=[1, 2, 3, 4], max_tokens=100)
+        internal, _ = self.sc._convert_to_internal_request(req)
+        self.assertEqual(internal.input_ids, [1, 2, 3, 4])
+
+    # ---------- echo-handling ----------
+    def test_echo_with_string_prompt_streaming(self):
+        req = CompletionRequest(model="x", prompt="Hello", max_tokens=1, echo=True)
+        self.assertEqual(self.sc._get_echo_text(req, 0), "Hello")
+
+    def test_echo_with_list_of_strings_streaming(self):
+        req = CompletionRequest(
+            model="x", prompt=["A", "B"], max_tokens=1, echo=True, n=1
+        )
+        self.assertEqual(self.sc._get_echo_text(req, 0), "A")
+        self.assertEqual(self.sc._get_echo_text(req, 1), "B")
+
+    def test_echo_with_token_ids_streaming(self):
+        req = CompletionRequest(model="x", prompt=[1, 2, 3], max_tokens=1, echo=True)
+        self.sc.tokenizer_manager.tokenizer.decode.return_value = "decoded_prompt"
+        self.assertEqual(self.sc._get_echo_text(req, 0), "decoded_prompt")
+
+    def test_echo_with_multiple_token_ids_streaming(self):
+        req = CompletionRequest(
+            model="x", prompt=[[1, 2], [3, 4]], max_tokens=1, echo=True, n=1
+        )
+        self.sc.tokenizer_manager.tokenizer.decode.return_value = "decoded"
+        self.assertEqual(self.sc._get_echo_text(req, 0), "decoded")
+
+    def test_prepare_echo_prompts_non_streaming(self):
+        # single string
+        req = CompletionRequest(model="x", prompt="Hi", echo=True)
+        self.assertEqual(self.sc._prepare_echo_prompts(req), ["Hi"])
+
+        # list of strings
+        req = CompletionRequest(model="x", prompt=["Hi", "Yo"], echo=True)
+        self.assertEqual(self.sc._prepare_echo_prompts(req), ["Hi", "Yo"])
+
+        # token IDs
+        req = CompletionRequest(model="x", prompt=[1, 2, 3], echo=True)
+        self.sc.tokenizer_manager.tokenizer.decode.return_value = "decoded"
+        self.assertEqual(self.sc._prepare_echo_prompts(req), ["decoded"])
+
+    # ---------- response_format handling ----------
+    def test_response_format_json_object(self):
+        """Test that response_format json_object is correctly processed in sampling params."""
+        req = CompletionRequest(
+            model="x",
+            prompt="Generate a JSON object:",
+            max_tokens=100,
+            response_format={"type": "json_object"},
+        )
+        sampling_params = self.sc._build_sampling_params(req)
+        self.assertEqual(sampling_params["json_schema"], '{"type": "object"}')
+
+    def test_response_format_json_schema(self):
+        """Test that response_format json_schema is correctly processed in sampling params."""
+        schema = {
+            "type": "object",
+            "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
+        }
+        req = CompletionRequest(
+            model="x",
+            prompt="Generate a JSON object:",
+            max_tokens=100,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": "person", "schema": schema},
+            },
+        )
+        sampling_params = self.sc._build_sampling_params(req)
+        # The schema should be converted to string by convert_json_schema_to_str
+        self.assertIn("json_schema", sampling_params)
+        self.assertIsInstance(sampling_params["json_schema"], str)
+
+    def test_response_format_structural_tag(self):
+        """Test that response_format structural_tag is correctly processed in sampling params."""
+        req = CompletionRequest(
+            model="x",
+            prompt="Generate structured output:",
+            max_tokens=100,
+            response_format={
+                "type": "structural_tag",
+                "structures": [{"begin": "<data>", "end": "</data>"}],
+                "triggers": ["<data>"],
+            },
+        )
+        sampling_params = self.sc._build_sampling_params(req)
+        # The structural_tag should be processed
+        self.assertIn("structural_tag", sampling_params)
+        self.assertIsInstance(sampling_params["structural_tag"], str)
+
+    def test_response_format_none(self):
+        """Test that no response_format doesn't add extra constraints."""
+        req = CompletionRequest(model="x", prompt="Generate text:", max_tokens=100)
+        sampling_params = self.sc._build_sampling_params(req)
+        # Should not have json_schema or structural_tag from response_format
+        # (but might have json_schema from the legacy json_schema field)
+        self.assertIsNone(sampling_params.get("structural_tag"))
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/openai_server/basic/test_serving_embedding.py b/test/srt/openai_server/basic/test_serving_embedding.py
new file mode 100644
index 000000000..47f8330b7
--- /dev/null
+++ b/test/srt/openai_server/basic/test_serving_embedding.py
@@ -0,0 +1,145 @@
+"""
+Unit tests for the OpenAIServingEmbedding class from serving_embedding.py.
+"""
+
+import unittest
+import uuid
+from unittest.mock import Mock
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    EmbeddingRequest,
+    EmbeddingResponse,
+    MultimodalEmbeddingInput,
+)
+from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
+from sglang.srt.managers.io_struct import EmbeddingReqInput
+
+
+# Mock TokenizerManager for embedding tests
+class _MockTokenizerManager:
+    def __init__(self):
+        self.model_config = Mock()
+        self.model_config.is_multimodal = False
+        self.server_args = Mock()
+        self.server_args.enable_cache_report = False
+        self.model_path = "test-model"
+
+        # Mock tokenizer
+        self.tokenizer = Mock()
+        self.tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5])
+        self.tokenizer.decode = Mock(return_value="Test embedding input")
+        self.tokenizer.chat_template = None
+        self.tokenizer.bos_token_id = 1
+
+        # Mock generate_request method for embeddings
+        async def mock_generate_embedding():
+            yield {
+                "embedding": [0.1, 0.2, 0.3, 0.4, 0.5] * 20,  # 100-dim embedding
+                "meta_info": {
+                    "id": f"embd-{uuid.uuid4()}",
+                    "prompt_tokens": 5,
+                },
+            }
+
+        self.generate_request = Mock(return_value=mock_generate_embedding())
+
+
+# Mock TemplateManager for embedding tests
+class _MockTemplateManager:
+    def __init__(self):
+        self.chat_template_name = None  # None for embeddings usually
+        self.jinja_template_content_format = None
+        self.completion_template_name = None
+
+
+class ServingEmbeddingTestCase(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures."""
+        self.tokenizer_manager = _MockTokenizerManager()
+        self.template_manager = _MockTemplateManager()
+        self.serving_embedding = OpenAIServingEmbedding(
+            self.tokenizer_manager, self.template_manager
+        )
+
+        self.request = Mock(spec=Request)
+        self.request.headers = {}
+
+        self.basic_req = EmbeddingRequest(
+            model="test-model",
+            input="Hello, how are you?",
+            encoding_format="float",
+        )
+        self.list_req = EmbeddingRequest(
+            model="test-model",
+            input=["Hello, how are you?", "I am fine, thank you!"],
+            encoding_format="float",
+        )
+        self.multimodal_req = EmbeddingRequest(
+            model="test-model",
+            input=[
+                MultimodalEmbeddingInput(text="Hello", image="base64_image_data"),
+                MultimodalEmbeddingInput(text="World", image=None),
+            ],
+            encoding_format="float",
+        )
+        self.token_ids_req = EmbeddingRequest(
+            model="test-model",
+            input=[1, 2, 3, 4, 5],
+            encoding_format="float",
+        )
+
+    def test_convert_single_string_request(self):
+        """Test converting single string request to internal format."""
+        adapted_request, processed_request = (
+            self.serving_embedding._convert_to_internal_request(self.basic_req)
+        )
+
+        self.assertIsInstance(adapted_request, EmbeddingReqInput)
+        self.assertEqual(adapted_request.text, "Hello, how are you?")
+        # self.assertEqual(adapted_request.rid, "test-id")
+        self.assertEqual(processed_request, self.basic_req)
+
+    def test_convert_list_string_request(self):
+        """Test converting list of strings request to internal format."""
+        adapted_request, processed_request = (
+            self.serving_embedding._convert_to_internal_request(self.list_req)
+        )
+
+        self.assertIsInstance(adapted_request, EmbeddingReqInput)
+        self.assertEqual(
+            adapted_request.text, ["Hello, how are you?", "I am fine, thank you!"]
+        )
+        # self.assertEqual(adapted_request.rid, "test-id")
+        self.assertEqual(processed_request, self.list_req)
+
+    def test_convert_token_ids_request(self):
+        """Test converting token IDs request to internal format."""
+        adapted_request, processed_request = (
+            self.serving_embedding._convert_to_internal_request(self.token_ids_req)
+        )
+
+        self.assertIsInstance(adapted_request, EmbeddingReqInput)
+        self.assertEqual(adapted_request.input_ids, [1, 2, 3, 4, 5])
+        # self.assertEqual(adapted_request.rid, "test-id")
+        self.assertEqual(processed_request, self.token_ids_req)
+
+    def test_convert_multimodal_request(self):
+        """Test converting multimodal request to internal format."""
+        adapted_request, processed_request = (
+            self.serving_embedding._convert_to_internal_request(self.multimodal_req)
+        )
+
+        self.assertIsInstance(adapted_request, EmbeddingReqInput)
+        # Should extract text and images separately
+        self.assertEqual(len(adapted_request.text), 2)
+        self.assertIn("Hello", adapted_request.text)
+        self.assertIn("World", adapted_request.text)
+        self.assertEqual(adapted_request.image_data[0], "base64_image_data")
+        self.assertIsNone(adapted_request.image_data[1])
+        # self.assertEqual(adapted_request.rid, "test-id")
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/openai_server/features/__init__.py b/test/srt/openai_server/features/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/srt/openai_server/features/test_cache_report.py b/test/srt/openai_server/features/test_cache_report.py
new file mode 100644
index 000000000..999111a2e
--- /dev/null
+++ b/test/srt/openai_server/features/test_cache_report.py
@@ -0,0 +1,212 @@
+import asyncio
+import unittest
+
+import openai
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestCacheReport(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.min_cached = 5
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=300,
+            other_args=[
+                "--chunked-prefill-size=40",
+                "--enable-cache-report",
+            ],
+        )
+        cls.client = openai.Client(api_key="EMPTY", base_url=f"{cls.base_url}/v1")
+        cls.aclient = openai.AsyncClient(api_key="EMPTY", base_url=f"{cls.base_url}/v1")
+
+        usage = cls.run_openai(cls, "1").usage
+        # we can assume that our request is of size 1, plus the total template size
+        # ideally we would like to know the begin size / end size of the template to be more precise
+        total_template_size = usage.prompt_tokens - 1
+        print(f"template size: {total_template_size}")
+        usage2 = cls.run_openai(cls, "2").usage
+        assert usage2.prompt_tokens_details.cached_tokens <= total_template_size
+        cls.min_cached = max(
+            usage2.prompt_tokens_details.cached_tokens,
+            total_template_size - usage2.prompt_tokens_details.cached_tokens,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1):
+        response = requests.post(
+            self.base_url + "/generate",
+            # we use an uncommon start to minimise the chance that the cache is hit by chance
+            json={
+                "text": "_ The capital of France is",
+                "sampling_params": {
+                    "temperature": 0 if n == 1 else 0.5,
+                    "max_new_tokens": 128,
+                    "n": n,
+                    "stop_token_ids": [119690],
+                },
+                "stream": False,
+                "return_logprob": return_logprob,
+                "top_logprobs_num": top_logprobs_num,
+                "logprob_start_len": 0,
+            },
+        )
+        return response
+
+    def run_openai(self, message):
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                # {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": message},
+            ],
+            temperature=0,
+            max_tokens=100,
+        )
+        return response
+
+    async def run_openai_async(self, message):
+        response = await self.aclient.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "user", "content": message},
+            ],
+            temperature=0,
+            max_tokens=100,
+        )
+        return response
+
+    def cache_report_openai(self, message):
+        response = self.run_openai(message)
+        print(
+            f"openai first request cached_tokens: {int(response.usage.prompt_tokens_details.cached_tokens)}"
+        )
+        first_cached_tokens = int(response.usage.prompt_tokens_details.cached_tokens)
+        # assert int(response.usage.cached_tokens) == 0
+        assert first_cached_tokens <= self.min_cached
+        response = self.run_openai(message)
+        cached_tokens = int(response.usage.prompt_tokens_details.cached_tokens)
+        print(f"openai second request cached_tokens: {cached_tokens}")
+        assert cached_tokens > 0
+        assert cached_tokens == int(response.usage.prompt_tokens) - 1
+        return first_cached_tokens
+
+    async def cache_report_openai_async(self, message):
+        response = await self.run_openai_async(message)
+        cached_tokens = int(response.usage.prompt_tokens_details.cached_tokens)
+        prompt_tokens = int(response.usage.prompt_tokens)
+        return cached_tokens, prompt_tokens
+
+    def test_generate(self):
+        print("=" * 100)
+        response = self.run_decode()
+        # print(response.json())
+        cached_tokens = int(response.json()["meta_info"]["cached_tokens"])
+        print(f"sglang first request cached_tokens: {cached_tokens}")
+        print(
+            f"sglang first request prompt_tokens: {int(response.json()['meta_info']['prompt_tokens'])}"
+        )
+        # can't assure to be 0: depends on the initialisation request / if a template is used with the model
+        assert cached_tokens < self.min_cached
+        response = self.run_decode()
+        cached_tokens = int(response.json()["meta_info"]["cached_tokens"])
+        print(f"sglang second request cached_tokens: {cached_tokens}")
+        print(
+            f"sglang second request prompt_tokens: {int(response.json()['meta_info']['prompt_tokens'])}"
+        )
+        assert cached_tokens == int(response.json()["meta_info"]["prompt_tokens"]) - 1
+
+    def test_cache_split_prefill_openai(self):
+        print("=" * 100)
+        self.cache_report_openai(
+            "€ This is a very long and unique text that should not be already cached, the twist is"
+            " that it should be longer than the chunked-prefill-size, so it should be split among"
+            " several prefill requests. Still, it shouldn't be cached"
+        )
+
+    def test_cache_report_openai(self):
+        print("=" * 100)
+        # warm up the cache, for the template
+        self.run_openai("Introduce the capital of France.")
+
+        first_cached_tokens_1 = self.run_openai(
+            "How many sparrow do you need to lift a coconut?"
+        ).usage.prompt_tokens_details.cached_tokens
+
+        usage_2 = self.run_openai("* sing something about cats").usage
+        first_cached_tokens_2 = usage_2.prompt_tokens_details.cached_tokens
+        # first request may not have 0 cached tokens, but if they only have the template in common they
+        # should be the same once the cache is warmed up
+        assert first_cached_tokens_1 == first_cached_tokens_2
+
+        resp = self.run_openai("* sing something about cats and dogs")
+        print(resp.usage)
+
+        resp = self.run_openai("* sing something about cats, please")
+        print(resp.usage)
+        assert (
+            resp.usage.prompt_tokens_details.cached_tokens
+            >= usage_2.prompt_tokens - self.min_cached
+        )
+
+    # TODO: flaky test
+    # def test_cache_report_openai_async(self):
+    #     print("=" * 100)
+
+    #     async def run_test():
+    #         task0 = asyncio.create_task(
+    #             self.cache_report_openai_async(
+    #                 "first request, to start the inference and let the next two request be started in the same batch"
+    #             )
+    #         )
+    #         await asyncio.sleep(1)  # to force the first request to be started first
+    #         task1 = asyncio.create_task(
+    #             self.cache_report_openai_async(
+    #                 "> can the same batch parallel request use the cache?"
+    #             )
+    #         )
+    #         task2 = asyncio.create_task(
+    #             self.cache_report_openai_async(
+    #                 "> can the same batch parallel request use the cache?"
+    #             )
+    #         )
+    #         result0, result1, result2 = await asyncio.gather(task0, task1, task2)
+
+    #         cached_tokens0, prompt_tokens0 = result0
+    #         cached_tokens1, prompt_tokens1 = result1
+    #         cached_tokens2, prompt_tokens2 = result2
+
+    #         print(
+    #             f"Async request 0 - Cached tokens: {cached_tokens0}, Prompt tokens: {prompt_tokens0}"
+    #         )
+    #         print(
+    #             f"Async request 1 - Cached tokens: {cached_tokens1}, Prompt tokens: {prompt_tokens1}"
+    #         )
+    #         print(
+    #             f"Async request 2 - Cached tokens: {cached_tokens2}, Prompt tokens: {prompt_tokens2}"
+    #         )
+
+    #         # Assert that no requests used the cache (because first is alone, and the next two are in the same batch)
+    #         # If a new optimisation limiting starting request with same prefix at the same time was added
+    #         # to maximise the cache hit, this would not be true
+    #         assert cached_tokens1 == cached_tokens2 == cached_tokens0
+
+    #     asyncio.run(run_test())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/features/test_enable_thinking.py b/test/srt/openai_server/features/test_enable_thinking.py
new file mode 100644
index 000000000..00ba4fc94
--- /dev/null
+++ b/test/srt/openai_server/features/test_enable_thinking.py
@@ -0,0 +1,243 @@
+"""
+Usage:
+python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
+python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
+python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
+python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
+"""
+
+import asyncio
+import json
+import os
+import sys
+import time
+import unittest
+
+import openai
+import requests
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestEnableThinking(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-1234"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--reasoning-parser",
+                "qwen3",
+            ],
+        )
+        cls.additional_chat_kwargs = {}
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_chat_completion_with_reasoning(self):
+        # Test non-streaming with "enable_thinking": True, reasoning_content should not be empty
+        client = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json={
+                "model": self.model,
+                "messages": [{"role": "user", "content": "Hello"}],
+                "temperature": 0,
+                "separate_reasoning": True,
+                "chat_template_kwargs": {"enable_thinking": True},
+                **self.additional_chat_kwargs,
+            },
+        )
+
+        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
+        data = client.json()
+
+        self.assertIn("choices", data)
+        self.assertTrue(len(data["choices"]) > 0)
+        self.assertIn("message", data["choices"][0])
+        self.assertIn("reasoning_content", data["choices"][0]["message"])
+        self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])
+
+    def test_chat_completion_without_reasoning(self):
+        # Test non-streaming with "enable_thinking": False, reasoning_content should be empty
+        client = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json={
+                "model": self.model,
+                "messages": [{"role": "user", "content": "Hello"}],
+                "temperature": 0,
+                "separate_reasoning": True,
+                "chat_template_kwargs": {"enable_thinking": False},
+                **self.additional_chat_kwargs,
+            },
+        )
+
+        self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
+        data = client.json()
+
+        self.assertIn("choices", data)
+        self.assertTrue(len(data["choices"]) > 0)
+        self.assertIn("message", data["choices"][0])
+
+        if "reasoning_content" in data["choices"][0]["message"]:
+            self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])
+
+    def test_stream_chat_completion_with_reasoning(self):
+        # Test streaming with "enable_thinking": True, reasoning_content should not be empty
+        response = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json={
+                "model": self.model,
+                "messages": [{"role": "user", "content": "Hello"}],
+                "temperature": 0,
+                "separate_reasoning": True,
+                "stream": True,
+                "chat_template_kwargs": {"enable_thinking": True},
+                **self.additional_chat_kwargs,
+            },
+            stream=True,
+        )
+
+        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
+
+        has_reasoning = False
+        has_content = False
+
+        print("\n=== Stream With Reasoning ===")
+        for line in response.iter_lines():
+            if line:
+                line = line.decode("utf-8")
+                if line.startswith("data:") and not line.startswith("data: [DONE]"):
+                    data = json.loads(line[6:])
+                    if "choices" in data and len(data["choices"]) > 0:
+                        delta = data["choices"][0].get("delta", {})
+
+                        if "reasoning_content" in delta and delta["reasoning_content"]:
+                            has_reasoning = True
+
+                        if "content" in delta and delta["content"]:
+                            has_content = True
+
+        self.assertTrue(
+            has_reasoning,
+            "The reasoning content is not included in the stream response",
+        )
+        self.assertTrue(
+            has_content, "The stream response does not contain normal content"
+        )
+
+    def test_stream_chat_completion_without_reasoning(self):
+        # Test streaming with "enable_thinking": False, reasoning_content should  be empty
+        response = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json={
+                "model": self.model,
+                "messages": [{"role": "user", "content": "Hello"}],
+                "temperature": 0,
+                "separate_reasoning": True,
+                "stream": True,
+                "chat_template_kwargs": {"enable_thinking": False},
+                **self.additional_chat_kwargs,
+            },
+            stream=True,
+        )
+
+        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
+
+        has_reasoning = False
+        has_content = False
+
+        print("\n=== Stream Without Reasoning ===")
+        for line in response.iter_lines():
+            if line:
+                line = line.decode("utf-8")
+                if line.startswith("data:") and not line.startswith("data: [DONE]"):
+                    data = json.loads(line[6:])
+                    if "choices" in data and len(data["choices"]) > 0:
+                        delta = data["choices"][0].get("delta", {})
+
+                        if "reasoning_content" in delta and delta["reasoning_content"]:
+                            has_reasoning = True
+
+                        if "content" in delta and delta["content"]:
+                            has_content = True
+
+        self.assertFalse(
+            has_reasoning,
+            "The reasoning content should not be included in the stream response",
+        )
+        self.assertTrue(
+            has_content, "The stream response does not contain normal content"
+        )
+
+
+# Skip for ci test
+# class TestGLM45EnableThinking(TestEnableThinking):
+#     @classmethod
+#     def setUpClass(cls):
+#         # Replace with the model name needed for testing; if not required, reuse DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+#         cls.model = "THUDM/GLM-4.5"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-1234"
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             api_key=cls.api_key,
+#             other_args=[
+#                 "--tool-call-parser",
+#                 "glm45",
+#                 "--reasoning-parser",
+#                 "glm45",
+#                 "--tp-size",
+#                 "8"
+#             ],
+#         )
+
+#         # Validate whether enable-thinking conflict with tool_calls
+#         cls.additional_chat_kwargs = {
+#             "tools": [
+#                 {
+#                     "type": "function",
+#                     "function": {
+#                         "name": "add",
+#                         "description": "Compute the sum of two numbers",
+#                         "parameters": {
+#                             "type": "object",
+#                             "properties": {
+#                                 "a": {
+#                                     "type": "int",
+#                                     "description": "A number",
+#                                 },
+#                                 "b": {
+#                                     "type": "int",
+#                                     "description": "A number",
+#                                 },
+#                             },
+#                             "required": ["a", "b"],
+#                         },
+#                     },
+#                 }
+#             ]
+#         }
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/features/test_json_constrained.py b/test/srt/openai_server/features/test_json_constrained.py
new file mode 100644
index 000000000..e4fdeecb5
--- /dev/null
+++ b/test/srt/openai_server/features/test_json_constrained.py
@@ -0,0 +1,153 @@
+"""
+python3 -m unittest openai_server.features.test_json_constrained.TestJSONConstrainedOutlinesBackend.test_json_generate
+python3 -m unittest openai_server.features.test_json_constrained.TestJSONConstrainedXGrammarBackend.test_json_generate
+python3 -m unittest openai_server.features.test_json_constrained.TestJSONConstrainedLLGuidanceBackend.test_json_generate
+"""
+
+import json
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+
+import openai
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+def setup_class(cls, backend: str):
+    cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+    cls.base_url = DEFAULT_URL_FOR_TEST
+    cls.json_schema = json.dumps(
+        {
+            "type": "object",
+            "properties": {
+                "name": {"type": "string", "pattern": "^[\\w]+$"},
+                "population": {"type": "integer"},
+            },
+            "required": ["name", "population"],
+            "additionalProperties": False,
+        }
+    )
+
+    other_args = [
+        "--max-running-requests",
+        "10",
+        "--grammar-backend",
+        backend,
+    ]
+
+    cls.process = popen_launch_server(
+        cls.model,
+        cls.base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+
+
+class TestJSONConstrainedOutlinesBackend(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, backend="outlines")
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(self, json_schema, return_logprob=False, top_logprobs_num=0, n=1):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0 if n == 1 else 0.5,
+                    "max_new_tokens": 128,
+                    "n": n,
+                    "stop_token_ids": [119690],
+                    "json_schema": json_schema,
+                },
+                "stream": False,
+                "return_logprob": return_logprob,
+                "top_logprobs_num": top_logprobs_num,
+                "logprob_start_len": 0,
+            },
+        )
+        ret = response.json()
+        print(json.dumps(ret))
+        print("=" * 100)
+
+        if not json_schema or json_schema == "INVALID":
+            return
+
+        # Make sure the json output is valid
+        try:
+            js_obj = json.loads(ret["text"])
+        except (TypeError, json.decoder.JSONDecodeError):
+            raise
+
+        self.assertIsInstance(js_obj["name"], str)
+        self.assertIsInstance(js_obj["population"], int)
+
+    def test_json_generate(self):
+        self.run_decode(json_schema=self.json_schema)
+
+    def test_json_invalid(self):
+        self.run_decode(json_schema="INVALID")
+
+    def test_json_openai(self):
+        client = openai.Client(api_key="EMPTY", base_url=f"{self.base_url}/v1")
+
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {
+                    "role": "user",
+                    "content": "Introduce the capital of France. Return in a JSON format.",
+                },
+            ],
+            temperature=0,
+            max_tokens=128,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {"name": "foo", "schema": json.loads(self.json_schema)},
+            },
+        )
+        text = response.choices[0].message.content
+
+        try:
+            js_obj = json.loads(text)
+        except (TypeError, json.decoder.JSONDecodeError):
+            print("JSONDecodeError", text)
+            raise
+
+        self.assertIsInstance(js_obj["name"], str)
+        self.assertIsInstance(js_obj["population"], int)
+
+    def test_mix_json_and_other(self):
+        json_schemas = [None, None, self.json_schema, self.json_schema] * 10
+
+        with ThreadPoolExecutor(len(json_schemas)) as executor:
+            list(executor.map(self.run_decode, json_schemas))
+
+
+class TestJSONConstrainedXGrammarBackend(TestJSONConstrainedOutlinesBackend):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, backend="xgrammar")
+
+
+class TestJSONConstrainedLLGuidanceBackend(TestJSONConstrainedOutlinesBackend):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, backend="llguidance")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/features/test_json_mode.py b/test/srt/openai_server/features/test_json_mode.py
new file mode 100644
index 000000000..d74eccf0f
--- /dev/null
+++ b/test/srt/openai_server/features/test_json_mode.py
@@ -0,0 +1,137 @@
+"""
+python3 -m unittest openai_server.features.test_json_mode.TestJSONModeOutlines.test_json_mode_response
+python3 -m unittest openai_server.features.test_json_mode.TestJSONModeOutlines.test_json_mode_with_streaming
+
+python3 -m unittest openai_server.features.test_json_mode.TestJSONModeXGrammar.test_json_mode_response
+python3 -m unittest openai_server.features.test_json_mode.TestJSONModeXGrammar.test_json_mode_with_streaming
+
+python3 -m unittest openai_server.features.test_json_mode.TestJSONModeLLGuidance.test_json_mode_response
+python3 -m unittest openai_server.features.test_json_mode.TestJSONModeLLGuidance.test_json_mode_with_streaming
+"""
+
+import json
+import unittest
+
+import openai
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+def setup_class(cls, backend):
+    cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+    cls.base_url = DEFAULT_URL_FOR_TEST
+
+    other_args = [
+        "--max-running-requests",
+        "10",
+        "--grammar-backend",
+        backend,
+    ]
+
+    cls.process = popen_launch_server(
+        cls.model,
+        cls.base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+    cls.client = openai.Client(api_key="EMPTY", base_url=f"{cls.base_url}/v1")
+
+
+class TestJSONModeOutlines(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, "outlines")
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_json_mode_response(self):
+        """Test that response_format json_object (also known as "json mode") produces valid JSON, even without a system prompt that mentions JSON."""
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                # We are deliberately omitting "That produces JSON" or similar phrases from the assistant prompt so that we don't have misleading test results
+                {
+                    "role": "system",
+                    "content": "You are a helpful AI assistant that gives a short answer.",
+                },
+                {"role": "user", "content": "What is the capital of Bulgaria?"},
+            ],
+            temperature=0,
+            max_tokens=128,
+            response_format={"type": "json_object"},
+        )
+        text = response.choices[0].message.content
+
+        print(f"Response ({len(text)} characters): {text}")
+
+        # Verify the response is valid JSON
+        try:
+            js_obj = json.loads(text)
+        except json.JSONDecodeError as e:
+            self.fail(f"Response is not valid JSON. Error: {e}. Response: {text}")
+
+        # Verify it's actually an object (dict)
+        self.assertIsInstance(js_obj, dict, f"Response is not a JSON object: {text}")
+
+    def test_json_mode_with_streaming(self):
+        """Test that streaming with json_object response (also known as "json mode") format works correctly, even without a system prompt that mentions JSON."""
+        stream = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                # We are deliberately omitting "That produces JSON" or similar phrases from the assistant prompt so that we don't have misleading test results
+                {
+                    "role": "system",
+                    "content": "You are a helpful AI assistant that gives a short answer.",
+                },
+                {"role": "user", "content": "What is the capital of Bulgaria?"},
+            ],
+            temperature=0,
+            max_tokens=128,
+            response_format={"type": "json_object"},
+            stream=True,
+        )
+
+        # Collect all chunks
+        chunks = []
+        for chunk in stream:
+            if chunk.choices[0].delta.content is not None:
+                chunks.append(chunk.choices[0].delta.content)
+        full_response = "".join(chunks)
+
+        print(
+            f"Concatenated Response ({len(full_response)} characters): {full_response}"
+        )
+
+        # Verify the combined response is valid JSON
+        try:
+            js_obj = json.loads(full_response)
+        except json.JSONDecodeError as e:
+            self.fail(
+                f"Streamed response is not valid JSON. Error: {e}. Response: {full_response}"
+            )
+
+        self.assertIsInstance(js_obj, dict)
+
+
+class TestJSONModeXGrammar(TestJSONModeOutlines):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, backend="xgrammar")
+
+
+class TestJSONModeLLGuidance(TestJSONModeOutlines):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, backend="llguidance")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/features/test_openai_server_ebnf.py b/test/srt/openai_server/features/test_openai_server_ebnf.py
new file mode 100644
index 000000000..126556ed7
--- /dev/null
+++ b/test/srt/openai_server/features/test_openai_server_ebnf.py
@@ -0,0 +1,98 @@
+import re
+
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+# -------------------------------------------------------------------------
+#    EBNF Test Class: TestOpenAIServerEBNF
+#    Launches the server with xgrammar, has only EBNF tests
+# -------------------------------------------------------------------------
+class TestOpenAIServerEBNF(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        # passing xgrammar specifically
+        other_args = ["--grammar-backend", "xgrammar"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=other_args,
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_ebnf(self):
+        """
+        Ensure we can pass `ebnf` to the local openai server
+        and that it enforces the grammar.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        ebnf_grammar = r"""
+        root ::= "Hello" | "Hi" | "Hey"
+        """
+        pattern = re.compile(r"^(Hello|Hi|Hey)[.!?]*\s*$")
+
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful EBNF test bot."},
+                {"role": "user", "content": "Say a greeting (Hello, Hi, or Hey)."},
+            ],
+            temperature=0,
+            max_tokens=32,
+            extra_body={"ebnf": ebnf_grammar},
+        )
+        text = response.choices[0].message.content.strip()
+        self.assertTrue(len(text) > 0, "Got empty text from EBNF generation")
+        self.assertRegex(text, pattern, f"Text '{text}' doesn't match EBNF choices")
+
+    def test_ebnf_strict_json(self):
+        """
+        A stricter EBNF that produces exactly {"name":"Alice"} format
+        with no trailing punctuation or extra fields.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        ebnf_grammar = r"""
+        root    ::= "{" pair "}"
+        pair    ::= "\"name\"" ":" string
+        string  ::= "\"" [A-Za-z]+ "\""
+        """
+        pattern = re.compile(r'^\{"name":"[A-Za-z]+"\}$')
+
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "EBNF mini-JSON generator."},
+                {
+                    "role": "user",
+                    "content": "Generate single key JSON with only letters.",
+                },
+            ],
+            temperature=0,
+            max_tokens=64,
+            extra_body={"ebnf": ebnf_grammar},
+        )
+        text = response.choices[0].message.content.strip()
+        self.assertTrue(len(text) > 0, "Got empty text from EBNF strict JSON test")
+        self.assertRegex(
+            text, pattern, f"Text '{text}' not matching the EBNF strict JSON shape"
+        )
diff --git a/test/srt/openai_server/features/test_openai_server_hidden_states.py b/test/srt/openai_server/features/test_openai_server_hidden_states.py
new file mode 100644
index 000000000..34e5ddde7
--- /dev/null
+++ b/test/srt/openai_server/features/test_openai_server_hidden_states.py
@@ -0,0 +1,356 @@
+import json
+import re
+import time
+import unittest
+from abc import ABC
+
+import numpy as np
+import openai
+import torch
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+    DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class BaseTestOpenAIServerWithHiddenStates(ABC):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.return_hidden_states = [False, True]
+        cls.use_list_input = [True, False]
+        cls.parallel_sample_nums = [1, 2]
+
+    def test_completion(self):
+        for return_hidden_states in self.return_hidden_states:
+            for use_list_input in self.use_list_input:
+                for parallel_sample_num in self.parallel_sample_nums:
+                    self.run_completion(
+                        use_list_input,
+                        parallel_sample_num,
+                        return_hidden_states,
+                    )
+
+    def test_completion_stream(self):
+        # parallel sampling and list input are not supported in streaming mode
+        for return_hidden_states in self.return_hidden_states:
+            for use_list_input in self.use_list_input:
+                for parallel_sample_num in self.parallel_sample_nums:
+                    self.run_completion_stream(
+                        use_list_input,
+                        parallel_sample_num,
+                        return_hidden_states,
+                    )
+
+    def test_chat_completion(self):
+        for return_hidden_states in self.return_hidden_states:
+            for (
+                parallel_sample_num
+            ) in (
+                self.parallel_sample_nums
+            ):  # parallel sample num 2 breaks in the adapter with a 400 for EAGLE
+                self.run_chat_completion(parallel_sample_num, return_hidden_states)
+
+    def test_chat_completion_stream(self):
+        for return_hidden_states in self.return_hidden_states:
+            for (
+                parallel_sample_num
+            ) in (
+                self.parallel_sample_nums
+            ):  # parallel sample num > 1 breaks in the adapter with a 400 for EAGLE
+                self.run_chat_completion_stream(
+                    parallel_sample_num, return_hidden_states
+                )
+
+    def run_completion(
+        self,
+        use_list_input,
+        parallel_sample_num,
+        return_hidden_states,
+    ):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        prompt = "The capital of France is"
+        prompt_input = prompt
+
+        if use_list_input:
+            prompt_arg = [prompt_input, prompt_input]
+            num_choices = len(prompt_arg)
+        else:
+            prompt_arg = prompt_input
+            num_choices = 1
+
+        response = client.completions.create(
+            model=self.model,
+            prompt=prompt_arg,
+            temperature=0,
+            max_tokens=32,
+            n=parallel_sample_num,
+            extra_body=dict(return_hidden_states=return_hidden_states),
+        )
+
+        for choice in response.choices:
+            assert hasattr(choice, "hidden_states") == return_hidden_states
+            if return_hidden_states:
+                assert choice.hidden_states is not None, "hidden_states was None"
+
+    def run_completion_stream(
+        self,
+        use_list_input,
+        parallel_sample_num,
+        return_hidden_states,
+    ):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        prompt = "The capital of France is"
+        prompt_input = prompt
+        num_prompt_tokens = len(self.tokenizer.encode(prompt))
+
+        if use_list_input:
+            prompt_arg = [prompt_input, prompt_input]
+            num_choices = len(prompt_arg)
+            num_prompt_tokens *= 2
+        else:
+            prompt_arg = prompt_input
+            num_choices = 1
+
+        generator = client.completions.create(
+            model=self.model,
+            prompt=prompt_arg,
+            temperature=0,
+            max_tokens=32,
+            stream=True,
+            stream_options={"include_usage": True},
+            n=parallel_sample_num,
+            extra_body=dict(return_hidden_states=return_hidden_states),
+        )
+
+        hidden_states_list = []
+        for response in generator:
+            usage = response.usage
+            for choice in response.choices:
+                if hasattr(choice, "hidden_states"):
+                    assert return_hidden_states
+                    assert choice.hidden_states is not None
+                    hidden_states_list.append(choice.hidden_states)
+
+        if return_hidden_states:
+            assert (
+                len(hidden_states_list) == parallel_sample_num * num_choices
+            ), f"Expected {parallel_sample_num * num_choices} hidden states, got {len(hidden_states_list)}"
+        else:
+            assert (
+                hidden_states_list == []
+            ), "hidden_states were returned and should not have been"
+
+    def run_chat_completion(self, parallel_sample_num, return_hidden_states):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {
+                    "role": "user",
+                    "content": "What is the capital of France? Answer in a few words.",
+                },
+            ],
+            temperature=0,
+            n=parallel_sample_num,
+            extra_body=dict(return_hidden_states=return_hidden_states),
+        )
+
+        for choice in response.choices:
+            assert hasattr(choice, "hidden_states") == return_hidden_states
+            if return_hidden_states:
+                assert choice.hidden_states is not None, "hidden_states was None"
+
+    def run_chat_completion_stream(
+        self, parallel_sample_num=1, return_hidden_states=False
+    ):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        generator = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": "What is the capital of France?"},
+            ],
+            temperature=0,
+            stream=True,
+            stream_options={"include_usage": True},
+            n=parallel_sample_num,
+            extra_body=dict(return_hidden_states=return_hidden_states),
+        )
+
+        is_firsts = {}
+        hidden_states_list = []
+
+        for response in generator:
+            for choice in response.choices:
+                if hasattr(choice.delta, "hidden_states"):
+                    assert return_hidden_states
+                    assert choice.delta.hidden_states is not None
+                    hidden_states_list.append(choice.delta.hidden_states)
+
+        if return_hidden_states:
+            assert (
+                len(hidden_states_list) == parallel_sample_num
+            ), f"Expected {parallel_sample_num} hidden states, got {len(hidden_states_list)}"
+        else:
+            assert (
+                hidden_states_list == []
+            ), "hidden_states were returned and should not have been"
+
+
+class TestOpenAIServerWithHiddenStatesEnabled(
+    CustomTestCase, BaseTestOpenAIServerWithHiddenStates
+):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=["--enable-return-hidden-states"],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+        cls.return_hidden_states = [False, True]
+        cls.use_list_input = [True, False]
+        cls.parallel_sample_nums = [1, 2]
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+
+class TestOpenAIServerWithHiddenStatesEnabledAndCUDAGraphDisabled(
+    CustomTestCase, BaseTestOpenAIServerWithHiddenStates
+):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=["--enable-return-hidden-states", "--disable-cuda-graph"],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+        cls.return_hidden_states = [False, True]
+        cls.use_list_input = [True, False]
+        cls.parallel_sample_nums = [1]
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+
+class TestOpenAIServerWithEAGLEAndHiddenStatesEnabled(
+    CustomTestCase, BaseTestOpenAIServerWithHiddenStates
+):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.speculative_draft_model = DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST
+        cls.speculative_algorithm = "EAGLE"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                5,
+                "--speculative-eagle-topk",
+                8,
+                "--speculative-num-draft-tokens",
+                64,
+                "--mem-fraction-static",
+                0.7,
+                "--chunked-prefill-size",
+                128,
+                "--max-running-requests",
+                8,
+                "--enable-return-hidden-states",
+            ],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
+        cls.return_hidden_states = [False, True]
+        cls.use_list_input = [True, False]
+        cls.parallel_sample_nums = [1]
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+
+class TestOpenAIServerWithEAGLE3AndHiddenStatesEnabled(
+    CustomTestCase, BaseTestOpenAIServerWithHiddenStates
+):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "meta-llama/Llama-3.1-8B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.speculative_algorithm = "EAGLE3"
+        cls.speculative_draft_model = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                cls.speculative_algorithm,
+                "--speculative-draft-model-path",
+                cls.speculative_draft_model,
+                "--speculative-num-steps",
+                5,
+                "--speculative-eagle-topk",
+                16,
+                "--speculative-num-draft-tokens",
+                64,
+                "--mem-fraction-static",
+                0.7,
+                "--chunked-prefill-size",
+                128,
+                "--max-running-requests",
+                8,
+                "--dtype",
+                "float16",
+                "--enable-return-hidden-states",
+            ],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(cls.model)
+        cls.return_hidden_states = [False, True]
+        cls.use_list_input = [True, False]
+        cls.parallel_sample_nums = [1]
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/features/test_reasoning_content.py b/test/srt/openai_server/features/test_reasoning_content.py
new file mode 100644
index 000000000..04e5160a7
--- /dev/null
+++ b/test/srt/openai_server/features/test_reasoning_content.py
@@ -0,0 +1,343 @@
+"""
+Usage:
+python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false
+python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true
+python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true_stream_reasoning_false
+python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false
+python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_true
+python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentStartup.test_nonstreaming
+python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentStartup.test_streaming
+"""
+
+import json
+import unittest
+
+import openai
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_REASONING_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestReasoningContentAPI(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-1234"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--reasoning-parser",
+                "deepseek-r1",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_streaming_separate_reasoning_false(self):
+        # Test streaming with separate_reasoning=False, reasoning_content should be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "stream": True,
+            "extra_body": {"separate_reasoning": False},
+        }
+        response = client.chat.completions.create(**payload)
+
+        reasoning_content = ""
+        content = ""
+        for chunk in response:
+            if chunk.choices[0].delta.content:
+                content += chunk.choices[0].delta.content
+            elif chunk.choices[0].delta.reasoning_content:
+                reasoning_content += chunk.choices[0].delta.reasoning_content
+
+        assert len(reasoning_content) == 0
+        assert len(content) > 0
+
+    def test_streaming_separate_reasoning_true(self):
+        # Test streaming with separate_reasoning=True, reasoning_content should not be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "stream": True,
+            "extra_body": {"separate_reasoning": True},
+        }
+        response = client.chat.completions.create(**payload)
+
+        reasoning_content = ""
+        content = ""
+        for chunk in response:
+            if chunk.choices[0].delta.content:
+                content += chunk.choices[0].delta.content
+            elif chunk.choices[0].delta.reasoning_content:
+                reasoning_content += chunk.choices[0].delta.reasoning_content
+
+        assert len(reasoning_content) > 0
+        assert len(content) > 0
+
+    def test_streaming_separate_reasoning_true_stream_reasoning_false(self):
+        # Test streaming with separate_reasoning=True, reasoning_content should not be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "stream": True,
+            "extra_body": {"separate_reasoning": True, "stream_reasoning": False},
+        }
+        response = client.chat.completions.create(**payload)
+
+        reasoning_content = ""
+        content = ""
+        first_chunk = False
+        for chunk in response:
+            if chunk.choices[0].delta.reasoning_content:
+                reasoning_content = chunk.choices[0].delta.reasoning_content
+                first_chunk = True
+            if chunk.choices[0].delta.content:
+                content += chunk.choices[0].delta.content
+                if not first_chunk:
+                    reasoning_content = chunk.choices[0].delta.reasoning_content
+                first_chunk = True
+            if not first_chunk:
+                assert (
+                    not chunk.choices[0].delta.reasoning_content
+                    or len(chunk.choices[0].delta.reasoning_content) == 0
+                )
+        assert len(reasoning_content) > 0
+        assert len(content) > 0
+
+    def test_nonstreaming_separate_reasoning_false(self):
+        # Test non-streaming with separate_reasoning=False, reasoning_content should be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "extra_body": {"separate_reasoning": False},
+        }
+        response = client.chat.completions.create(**payload)
+
+        assert (
+            not response.choices[0].message.reasoning_content
+            or len(response.choices[0].message.reasoning_content) == 0
+        )
+        assert len(response.choices[0].message.content) > 0
+
+    def test_nonstreaming_separate_reasoning_true(self):
+        # Test non-streaming with separate_reasoning=True, reasoning_content should not be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "extra_body": {"separate_reasoning": True},
+        }
+        response = client.chat.completions.create(**payload)
+
+        assert len(response.choices[0].message.reasoning_content) > 0
+        assert len(response.choices[0].message.content) > 0
+
+
+class TestReasoningContentWithoutParser(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_REASONING_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-1234"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[],  # No reasoning parser
+        )
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_streaming_separate_reasoning_false(self):
+        # Test streaming with separate_reasoning=False, reasoning_content should be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "stream": True,
+            "extra_body": {"separate_reasoning": False},
+        }
+        response = client.chat.completions.create(**payload)
+
+        reasoning_content = ""
+        content = ""
+        for chunk in response:
+            if chunk.choices[0].delta.content:
+                content += chunk.choices[0].delta.content
+            elif chunk.choices[0].delta.reasoning_content:
+                reasoning_content += chunk.choices[0].delta.reasoning_content
+
+        assert len(reasoning_content) == 0
+        assert len(content) > 0
+
+    def test_streaming_separate_reasoning_true(self):
+        # Test streaming with separate_reasoning=True, reasoning_content should not be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "stream": True,
+            "extra_body": {"separate_reasoning": True},
+        }
+        response = client.chat.completions.create(**payload)
+
+        reasoning_content = ""
+        content = ""
+        for chunk in response:
+            if chunk.choices[0].delta.content:
+                content += chunk.choices[0].delta.content
+            elif chunk.choices[0].delta.reasoning_content:
+                reasoning_content += chunk.choices[0].delta.reasoning_content
+
+        assert len(reasoning_content) == 0
+        assert len(content) > 0
+
+    def test_streaming_separate_reasoning_true_stream_reasoning_false(self):
+        # Test streaming with separate_reasoning=True, reasoning_content should not be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "stream": True,
+            "extra_body": {"separate_reasoning": True, "stream_reasoning": False},
+        }
+        response = client.chat.completions.create(**payload)
+
+        reasoning_content = ""
+        content = ""
+        first_chunk = False
+        for chunk in response:
+            if chunk.choices[0].delta.reasoning_content:
+                reasoning_content = chunk.choices[0].delta.reasoning_content
+                first_chunk = True
+            if chunk.choices[0].delta.content:
+                content += chunk.choices[0].delta.content
+                if not first_chunk:
+                    reasoning_content = chunk.choices[0].delta.reasoning_content
+                first_chunk = True
+            if not first_chunk:
+                assert (
+                    not chunk.choices[0].delta.reasoning_content
+                    or len(chunk.choices[0].delta.reasoning_content) == 0
+                )
+        assert not reasoning_content or len(reasoning_content) == 0
+        assert len(content) > 0
+
+    def test_nonstreaming_separate_reasoning_false(self):
+        # Test non-streaming with separate_reasoning=False, reasoning_content should be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "extra_body": {"separate_reasoning": False},
+        }
+        response = client.chat.completions.create(**payload)
+
+        assert (
+            not response.choices[0].message.reasoning_content
+            or len(response.choices[0].message.reasoning_content) == 0
+        )
+        assert len(response.choices[0].message.content) > 0
+
+    def test_nonstreaming_separate_reasoning_true(self):
+        # Test non-streaming with separate_reasoning=True, reasoning_content should not be empty
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        payload = {
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": "What is 1+3?",
+                }
+            ],
+            "max_tokens": 100,
+            "extra_body": {"separate_reasoning": True},
+        }
+        response = client.chat.completions.create(**payload)
+
+        assert (
+            not response.choices[0].message.reasoning_content
+            or len(response.choices[0].message.reasoning_content) == 0
+        )
+        assert len(response.choices[0].message.content) > 0
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/function_call/__init__.py b/test/srt/openai_server/function_call/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/srt/openai_server/function_call/test_openai_function_calling.py b/test/srt/openai_server/function_call/test_openai_function_calling.py
new file mode 100644
index 000000000..291ef98b7
--- /dev/null
+++ b/test/srt/openai_server/function_call/test_openai_function_calling.py
@@ -0,0 +1,953 @@
+import json
+import time
+import unittest
+
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestOpenAIServerFunctionCalling(CustomTestCase):
+    # NOTE: this system_message is for Llama3.2 system prompt. Without this,
+    # sometimes Llama3.2 gives a different tool call format such as:
+    # '<|python_tag|>{"type": "function", "function": "add", "parameters": {"a": "3", "b": "5"}}'
+    SYSTEM_MESSAGE = (
+        "You are a helpful assistant with tool calling capabilities. "
+        "Only reply with a tool call if the function exists in the library provided by the user. "
+        "If it doesn't exist, just reply directly in natural language. "
+        "When you receive a tool call response, use the output to format an answer to the original user question. "
+        "You have access to the following functions. "
+        "To call a function, please respond with JSON for a function call. "
+        'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. '
+        "Do not use variables.\n\n"
+    )
+
+    @classmethod
+    def setUpClass(cls):
+        # Replace with the model name needed for testing; if not required, reuse DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools.
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                # If your server needs extra parameters to test function calling, please add them here.
+                "--tool-call-parser",
+                "llama3",
+            ],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(cls.model)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_function_calling_format(self):
+        """
+        Test: Whether the function call format returned by the AI is correct.
+        When returning a tool call, message.content should be None, and tool_calls should be a list.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "add",
+                    "description": "Compute the sum of two numbers",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "a": {
+                                "type": "int",
+                                "description": "A number",
+                            },
+                            "b": {
+                                "type": "int",
+                                "description": "A number",
+                            },
+                        },
+                        "required": ["a", "b"],
+                    },
+                },
+            }
+        ]
+
+        messages = [
+            {"role": "system", "content": self.SYSTEM_MESSAGE},
+            {"role": "user", "content": "Compute (3+5)"},
+        ]
+        response = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=False,
+            tools=tools,
+        )
+
+        tool_calls = response.choices[0].message.tool_calls
+
+        assert (
+            isinstance(tool_calls, list) and len(tool_calls) > 0
+        ), "tool_calls should be a non-empty list"
+
+        function_name = tool_calls[0].function.name
+        assert function_name == "add", "Function name should be 'add'"
+
+    # This unit test is too difficult for default model. Mark it as optional unit tests so it won't trigger unless specified.
+    def _test_function_calling_multiturn(self):
+        """
+        Test: Whether the function call format returned by the AI is correct.
+        When returning a tool call, message.content should be None, and tool_calls should be a list.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "add",
+                    "description": "Compute the sum of two numbers",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "a": {
+                                "type": "int",
+                                "description": "A number",
+                            },
+                            "b": {
+                                "type": "int",
+                                "description": "A number",
+                            },
+                        },
+                        "required": ["a", "b"],
+                    },
+                },
+            }
+        ]
+
+        messages = [{"role": "user", "content": "Compute (3+5)"}]
+
+        response = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=False,
+            tools=tools,
+        )
+
+        tool_call = response.choices[0].message.tool_calls[0]
+        function_name = tool_call.function.name
+        assert function_name == "add", "Function name should be 'add'"
+        function_arguments = tool_call.function.arguments
+        function_arguments = json.loads(tool_call.function.arguments)
+        assert function_arguments in [
+            {"a": 3, "b": 5},
+            {"a": "3", "b": "5"},
+        ], f"Unexpected function arguments: {function_arguments}"
+
+        messages.append(response.choices[0].message)
+        messages.append(
+            {
+                "role": "tool",
+                "tool_call_id": tool_call.id,
+                "content": "8",
+                "name": function_name,
+            }
+        )
+
+        final_response = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=False,
+            tools=tools,
+        )
+
+        assert (
+            "8" in final_response.choices[0].message.content
+        ), "tool_call response should have the sum 8 in the content"
+
+    def test_function_calling_streaming_simple(self):
+        """
+        Test: Whether the function name can be correctly recognized in streaming mode.
+        - Expect a function call to be found, and the function name to be correct.
+        - Verify that streaming mode returns at least multiple chunks.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_current_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "The city to find the weather for",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "description": "Weather unit (celsius or fahrenheit)",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["city", "unit"],
+                    },
+                },
+            }
+        ]
+
+        messages = [
+            {"role": "system", "content": self.SYSTEM_MESSAGE},
+            {
+                "role": "user",
+                "content": "What is the temperature in Paris in celsius??",
+            },
+        ]
+
+        response_stream = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=True,
+            tools=tools,
+        )
+
+        chunks = list(response_stream)
+        self.assertTrue(len(chunks) > 0, "Streaming should return at least one chunk")
+
+        found_function_name = False
+        for chunk in chunks:
+            choice = chunk.choices[0]
+            # Check whether the current chunk contains tool_calls
+            if choice.delta.tool_calls:
+                tool_call = choice.delta.tool_calls[0]
+                if tool_call.function.name:
+                    self.assertEqual(
+                        tool_call.function.name,
+                        "get_current_weather",
+                        "Function name should be 'get_current_weather'",
+                    )
+                    found_function_name = True
+                    break
+
+        self.assertTrue(
+            found_function_name,
+            "Target function name 'get_current_weather' was not found in the streaming chunks",
+        )
+
+        finish_reason = chunks[-1].choices[0].finish_reason
+        self.assertEqual(
+            finish_reason,
+            "tool_calls",
+            "Final response of function calling should have finish_reason 'tool_calls'",
+        )
+
+    def test_function_calling_streaming_args_parsing(self):
+        """
+        Test: Whether the function call arguments returned in streaming mode can be correctly concatenated into valid JSON.
+        - The user request requires multiple parameters.
+        - AI may return the arguments in chunks that need to be concatenated.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "add",
+                    "description": "Compute the sum of two integers",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "a": {
+                                "type": "integer",
+                                "description": "First integer",
+                            },
+                            "b": {
+                                "type": "integer",
+                                "description": "Second integer",
+                            },
+                        },
+                        "required": ["a", "b"],
+                    },
+                    "strict": True,  # Llama-3.2-1B is flaky in tool call. It won't always respond with parameters unless we set strict.
+                },
+            }
+        ]
+
+        messages = [
+            {"role": "system", "content": self.SYSTEM_MESSAGE},
+            {"role": "user", "content": "Please sum 5 and 7, just call the function."},
+        ]
+
+        response_stream = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.9,
+            top_p=0.9,
+            stream=True,
+            tools=tools,
+        )
+
+        argument_fragments = []
+        chunks = list(response_stream)
+        function_name = None
+        for chunk in chunks:
+            choice = chunk.choices[0]
+            if choice.delta.tool_calls:
+                tool_call = choice.delta.tool_calls[0]
+                # Record the function name on first occurrence
+                function_name = tool_call.function.name or function_name
+                # In case of multiple chunks, JSON fragments may need to be concatenated
+                if tool_call.function.arguments is not None:
+                    argument_fragments.append(tool_call.function.arguments)
+
+        self.assertEqual(function_name, "add", "Function name should be 'add'")
+        joined_args = "".join(argument_fragments)
+        self.assertTrue(
+            len(joined_args) > 0,
+            "No parameter fragments were returned in the function call",
+        )
+
+        finish_reason = chunks[-1].choices[0].finish_reason
+        self.assertEqual(
+            finish_reason,
+            "tool_calls",
+            "Final response of function calling should have finish_reason 'tool_calls'",
+        )
+
+        # Check whether the concatenated JSON is valid
+        try:
+            args_obj = json.loads(joined_args)
+        except json.JSONDecodeError:
+            self.fail(
+                "The concatenated tool call arguments are not valid JSON, parsing failed"
+            )
+
+        self.assertIn("a", args_obj, "Missing parameter 'a'")
+        self.assertIn("b", args_obj, "Missing parameter 'b'")
+        self.assertEqual(str(args_obj["a"]), "5", "Parameter a should be 5")
+        self.assertEqual(str(args_obj["b"]), "7", "Parameter b should be 7")
+
+    def test_function_call_strict(self):
+        """
+        Test: Whether the strict mode of function calling works as expected.
+        - When strict mode is enabled, the AI should not return a function call if the function name is not recognized.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "sub",
+                    "description": "Compute the difference of two integers",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "int_a": {
+                                "type": "integer",
+                                "description": "First integer",
+                            },
+                            "int_b": {
+                                "type": "integer",
+                                "description": "Second integer",
+                            },
+                        },
+                        "required": ["int_a", "int_b"],
+                    },
+                    "strict": True,
+                },
+            }
+        ]
+
+        messages = [
+            {"role": "user", "content": "Please compute 5 - 7, using your tool."}
+        ]
+        response = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=False,
+            tools=tools,
+        )
+
+        tool_calls = response.choices[0].message.tool_calls
+        function_name = tool_calls[0].function.name
+        arguments = tool_calls[0].function.arguments
+        args_obj = json.loads(arguments)
+
+        self.assertEqual(function_name, "sub", "Function name should be 'sub'")
+        self.assertEqual(str(args_obj["int_a"]), "5", "Parameter int_a should be 5")
+        self.assertEqual(str(args_obj["int_b"]), "7", "Parameter int_b should be 7")
+
+    def test_function_call_required(self):
+        """
+        Test: Whether tool_choice: "required" works as expected
+        - When tool_choice == "required", the model should return one or more tool_calls.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "sub",
+                    "description": "Compute the difference of two integers",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "int_a": {
+                                "type": "integer",
+                                "description": "First integer",
+                            },
+                            "int_b": {
+                                "type": "integer",
+                                "description": "Second integer",
+                            },
+                        },
+                        "required": ["int_a", "int_b"],
+                    },
+                    "strict": True,
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "use this to get latest weather information for a city given its name",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "name of the city to get weather for",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                },
+            },
+        ]
+
+        messages = [{"role": "user", "content": "What is the capital of France?"}]
+        response = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=False,
+            tools=tools,
+            tool_choice="required",
+        )
+
+        tool_calls = response.choices[0].message.tool_calls
+        self.assertIsNotNone(tool_calls, "No tool_calls in the response")
+        function_name = tool_calls[0].function.name
+        arguments = tool_calls[0].function.arguments
+        args_obj = json.loads(arguments)
+
+        self.assertEqual(
+            function_name,
+            "get_weather",
+            f"Function name should be 'get_weather', got: {function_name}",
+        )
+        self.assertIn(
+            "city", args_obj, f"Function arguments should have 'city', got: {args_obj}"
+        )
+
+        # Make the test more robust by checking type and accepting valid responses
+        city_value = args_obj["city"]
+        self.assertIsInstance(
+            city_value,
+            str,
+            f"Parameter city should be a string, got: {type(city_value)}",
+        )
+        self.assertTrue(
+            "Paris" in city_value or "France" in city_value,
+            f"Parameter city should contain either 'Paris' or 'France', got: {city_value}",
+        )
+
+    def test_function_call_specific(self):
+        """
+        Test: Whether tool_choice: ToolChoice works as expected
+        - When tool_choice is a specific ToolChoice, the model should return one or more tool_calls.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "sub",
+                    "description": "Compute the difference of two integers",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "int_a": {
+                                "type": "integer",
+                                "description": "First integer",
+                            },
+                            "int_b": {
+                                "type": "integer",
+                                "description": "Second integer",
+                            },
+                        },
+                        "required": ["int_a", "int_b"],
+                    },
+                    "strict": True,
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "use this to get latest weather information for a city given its name",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "name of the city to get weather for",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                },
+            },
+        ]
+
+        messages = [{"role": "user", "content": "What is the capital of France?"}]
+        response = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=False,
+            tools=tools,
+            tool_choice={"type": "function", "function": {"name": "get_weather"}},
+        )
+
+        tool_calls = response.choices[0].message.tool_calls
+        self.assertIsNotNone(tool_calls, "No tool_calls in the response")
+        function_name = tool_calls[0].function.name
+        arguments = tool_calls[0].function.arguments
+        args_obj = json.loads(arguments)
+
+        self.assertEqual(
+            function_name, "get_weather", "Function name should be 'get_weather'"
+        )
+        self.assertIn("city", args_obj, "Function arguments should have 'city'")
+
+    def test_streaming_multiple_choices_finish_reason(self):
+        """
+        Test: Verify that each choice gets its own finish_reason chunk in streaming mode with n > 1.
+        This tests the fix for the bug where only the last index got a finish_reason chunk.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_current_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "location": {
+                                "type": "string",
+                                "description": "The city and state, e.g. San Francisco, CA",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["location"],
+                    },
+                },
+            }
+        ]
+
+        messages = [
+            {"role": "user", "content": "What is the weather like in Los Angeles?"}
+        ]
+
+        # Request with n=2 to get multiple choices
+        response_stream = client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            max_tokens=2048,
+            temperature=0.8,
+            stream=True,
+            tools=tools,
+            tool_choice="required",  # Force tool calls
+            n=2,  # Multiple choices
+        )
+
+        chunks = list(response_stream)
+
+        # Track finish_reason chunks for each index
+        finish_reason_chunks = {}
+        for chunk in chunks:
+            if chunk.choices:
+                for choice in chunk.choices:
+                    if choice.finish_reason is not None:
+                        index = choice.index
+                        if index not in finish_reason_chunks:
+                            finish_reason_chunks[index] = []
+                        finish_reason_chunks[index].append(choice.finish_reason)
+
+        # Verify we got finish_reason chunks for both indices
+        self.assertEqual(
+            len(finish_reason_chunks),
+            2,
+            f"Expected finish_reason chunks for 2 indices, got {len(finish_reason_chunks)}",
+        )
+
+        # Verify both index 0 and 1 have finish_reason
+        self.assertIn(
+            0, finish_reason_chunks, "Missing finish_reason chunk for index 0"
+        )
+        self.assertIn(
+            1, finish_reason_chunks, "Missing finish_reason chunk for index 1"
+        )
+
+        # Verify the finish_reason is "tool_calls" since we forced tool calls
+        for index, reasons in finish_reason_chunks.items():
+            self.assertEqual(
+                reasons[-1],  # Last finish_reason for this index
+                "tool_calls",
+                f"Expected finish_reason 'tool_calls' for index {index}, got {reasons[-1]}",
+            )
+
+    def test_function_calling_streaming_no_tool_call(self):
+        """
+        Test: Whether the finish_reason is stop in streaming mode when no tool call is given.
+        - Expect no function call to be found.
+        - Verify that finish_reason is stop
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_current_weather",
+                    "description": "Get the current weather in a given location",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "The city to find the weather for",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "description": "Weather unit (celsius or fahrenheit)",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["city", "unit"],
+                    },
+                },
+            }
+        ]
+
+        messages = [{"role": "user", "content": "Who are you?"}]
+
+        response_stream = client.chat.completions.create(
+            model=self.model,
+            max_tokens=2048,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8,
+            stream=True,
+            tools=tools,
+            tool_choice="none",
+        )
+
+        chunks = list(response_stream)
+        self.assertTrue(len(chunks) > 0, "Streaming should return at least one chunk")
+
+        found_tool_call = False
+        for chunk in chunks:
+            choice = chunk.choices[0]
+            # Check whether the current chunk contains tool_calls
+            found_tool_call = choice.delta.tool_calls is not None
+
+        self.assertFalse(
+            found_tool_call,
+            "Shouldn't have any tool_call in the streaming chunks",
+        )
+
+        finish_reason = chunks[-1].choices[0].finish_reason
+        self.assertEqual(
+            finish_reason,
+            "stop",
+            "Final response of no function calling should have finish_reason 'stop'",
+        )
+
+    def test_streaming_multiple_choices_without_tools(self):
+        """
+        Test: Verify that each choice gets its own finish_reason chunk without tool calls.
+        This tests the fix for regular content streaming with multiple choices.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = [{"role": "user", "content": "Say hello in one word."}]
+
+        # Request with n=2 to get multiple choices, no tools
+        response_stream = client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=0.8,
+            stream=True,
+            max_tokens=10,  # Keep it short
+            n=2,  # Multiple choices
+        )
+
+        chunks = list(response_stream)
+
+        # Track finish_reason chunks for each index
+        finish_reason_chunks = {}
+        for chunk in chunks:
+            if chunk.choices:
+                for choice in chunk.choices:
+                    if choice.finish_reason is not None:
+                        index = choice.index
+                        if index not in finish_reason_chunks:
+                            finish_reason_chunks[index] = []
+                        finish_reason_chunks[index].append(choice.finish_reason)
+
+        # Verify we got finish_reason chunks for both indices
+        self.assertEqual(
+            len(finish_reason_chunks),
+            2,
+            f"Expected finish_reason chunks for 2 indices, got {len(finish_reason_chunks)}",
+        )
+
+        # Verify both index 0 and 1 have finish_reason
+        self.assertIn(
+            0, finish_reason_chunks, "Missing finish_reason chunk for index 0"
+        )
+        self.assertIn(
+            1, finish_reason_chunks, "Missing finish_reason chunk for index 1"
+        )
+
+        # Verify the finish_reason is "stop" (regular completion)
+        for index, reasons in finish_reason_chunks.items():
+            self.assertIn(
+                reasons[-1],
+                ["stop", "length"],  # Could be either depending on how model responds
+                f"Expected finish_reason 'stop' or 'length' for index {index}, got {reasons[-1]}",
+            )
+
+
+class TestOpenAIPythonicFunctionCalling(CustomTestCase):
+    PYTHONIC_TOOLS = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a given location.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The name of the city or location.",
+                        }
+                    },
+                    "required": ["location"],
+                },
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "get_tourist_attractions",
+                "description": "Get a list of top tourist attractions for a given city.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description": "The name of the city to find attractions for.",
+                        }
+                    },
+                    "required": ["city"],
+                },
+            },
+        },
+    ]
+
+    PYTHONIC_MESSAGES = [
+        {
+            "role": "system",
+            "content": (
+                "You are a travel assistant. "
+                "When asked to call functions, ALWAYS respond ONLY with a python list of function calls, "
+                "using this format: [func_name1(param1=value1, param2=value2), func_name2(param=value)]. "
+                "Do NOT use JSON, do NOT use variables, do NOT use any other format. "
+                "Here is an example:\n"
+                '[get_weather(location="Paris"), get_tourist_attractions(city="Paris")]'
+            ),
+        },
+        {
+            "role": "user",
+            "content": (
+                "I'm planning a trip to Tokyo next week. What's the weather like and what are some top tourist attractions? "
+                "Propose parallel tool calls at once, using the python list of function calls format as shown above."
+            ),
+        },
+    ]
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--tool-call-parser",
+                "pythonic",
+            ],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(cls.model)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_pythonic_tool_call_prompt(self):
+        """
+        Test: Explicit prompt for pythonic tool call format without chat template.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=self.PYTHONIC_MESSAGES,
+            tools=self.PYTHONIC_TOOLS,
+            temperature=0.1,
+            stream=False,
+        )
+        tool_calls = response.choices[0].message.tool_calls
+        self.assertIsInstance(tool_calls, list, "No tool_calls found")
+        self.assertGreaterEqual(len(tool_calls), 1)
+        names = [tc.function.name for tc in tool_calls]
+        self.assertTrue(
+            "get_weather" in names or "get_tourist_attractions" in names,
+            f"Function name '{names}' should container either 'get_weather' or 'get_tourist_attractions'",
+        )
+
+    def test_pythonic_tool_call_streaming(self):
+        """
+        Test: Streaming pythonic tool call format; assert tool_call index is present.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response_stream = client.chat.completions.create(
+            model=self.model,
+            messages=self.PYTHONIC_MESSAGES,
+            tools=self.PYTHONIC_TOOLS,
+            temperature=0.1,
+            stream=True,
+        )
+        found_tool_calls = False
+        found_index = False
+        found_names = set()
+        for chunk in response_stream:
+            choice = chunk.choices[0]
+            if getattr(choice.delta, "tool_calls", None):
+                found_tool_calls = True
+                tool_call = choice.delta.tool_calls[0]
+                if hasattr(tool_call, "index") or (
+                    isinstance(tool_call, dict) and "index" in tool_call
+                ):
+                    found_index = True
+                found_names.add(str(tool_call.function.name))
+
+        self.assertTrue(found_tool_calls, "No tool_calls found in streaming response")
+        self.assertTrue(found_index, "No index field found in any streamed tool_call")
+        self.assertTrue(
+            "get_weather" in found_names or "get_tourist_attractions" in found_names,
+            f"Function name '{found_names}' should container either 'get_weather' or 'get_tourist_attractions'",
+        )
+
+
+# Skip for ci test
+# class TestGLM45ServerFunctionCalling(TestOpenAIServerFunctionCalling):
+#     @classmethod
+#     def setUpClass(cls):
+#         # Replace with the model name needed for testing; if not required, reuse DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+#         cls.model = "THUDM/GLM-4.5"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-123456"
+
+#         # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools.
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             api_key=cls.api_key,
+#             other_args=[
+#                 # If your server needs extra parameters to test function calling, please add them here.
+#                 "--tool-call-parser",
+#                 "glm45",
+#                 "--reasoning-parser",
+#                 "glm45",
+#                 "--tp-size",
+#                 "8"
+#             ],
+#         )
+#         cls.base_url += "/v1"
+#         cls.tokenizer = get_tokenizer(cls.model)
+
+#     # This test is too difficult for GLM4-moe. Skip it from the UT
+#     def test_function_call_required(self):
+#         pass
+
+#     def test_function_calling_multiturn(self):
+#         self._test_function_calling_multiturn()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/function_call/test_tool_choice.py b/test/srt/openai_server/function_call/test_tool_choice.py
new file mode 100644
index 000000000..d8094e930
--- /dev/null
+++ b/test/srt/openai_server/function_call/test_tool_choice.py
@@ -0,0 +1,550 @@
+"""
+Test script for tool_choice functionality in SGLang
+Tests: required, auto, and specific function choices in both streaming and non-streaming modes
+
+# To run the tests, use the following command:
+#
+# python3 -m unittest openai_server.function_call.test_tool_choice
+"""
+
+import json
+import unittest
+
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestToolChoiceLlama32(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        # Mark flaky tests for this model
+        cls.flaky_tests = {
+            "test_multi_tool_scenario_auto",
+            "test_multi_tool_scenario_required",
+        }
+
+        # Use a model that supports function calling
+        cls.model = "meta-llama/Llama-3.2-1B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        # Start the local OpenAI Server with tool calling support
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--tool-call-parser",
+                "llama3",  # Default parser for the test model
+            ],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(cls.model)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def setUp(self):
+        self.client = openai.Client(base_url=self.base_url, api_key=self.api_key)
+        self.model_name = self.client.models.list().data[0].id
+
+    def _is_flaky_test(self):
+        """Check if the current test is marked as flaky for this class"""
+        return (
+            hasattr(self.__class__, "flaky_tests")
+            and self._testMethodName in self.__class__.flaky_tests
+        )
+
+    def get_test_tools(self):
+        """Get the test tools for function calling"""
+        return [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "use this to get latest weather information for a city given its name",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "name of the city to get weather for",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["city"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_pokemon_info",
+                    "description": "get detailed information about a pokemon given its name",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "name": {
+                                "type": "string",
+                                "description": "name of the pokemon to get info for",
+                            }
+                        },
+                        "required": ["name"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "make_next_step_decision",
+                    "description": "You will be given a trace of thinking process in the following format.\n\nQuestion: the input question you must answer\nTOOL: think about what to do, and choose a tool to use ONLY IF there are defined tools. \n  You should never call the same tool with the same input twice in a row.\n  If the previous conversation history already contains the information that can be retrieved from the tool, you should not call the tool again.\nOBSERVATION: the result of the tool call, NEVER include this in your response, this information will be provided\n... (this TOOL/OBSERVATION can repeat N times)\nANSWER: If you know the answer to the original question, require for more information,\n  or you don't know the answer and there are no defined tools or all available tools are not helpful, respond with the answer without mentioning anything else.\n  If the previous conversation history already contains the answer, respond with the answer right away.\n\n  If no tools are configured, naturally mention this limitation while still being helpful. Briefly note that adding tools in the agent configuration would expand capabilities.\n\nYour task is to respond with the next step to take, based on the traces, \nor answer the question if you have enough information.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "decision": {
+                                "type": "string",
+                                "description": 'The next step to take, it must be either "TOOL" or "ANSWER". If the previous conversation history already contains the information that can be retrieved from the tool, you should not call the tool again. If there are no defined tools, you should not return "TOOL" in your response.',
+                            },
+                            "content": {
+                                "type": "string",
+                                "description": 'The content of the next step. If the decision is "TOOL", this should be a short and concise reasoning of why you chose the tool, MUST include the tool name. If the decision is "ANSWER", this should be the answer to the question. If no tools are available, integrate this limitation conversationally without sounding scripted.',
+                            },
+                        },
+                        "required": ["decision", "content"],
+                    },
+                },
+            },
+        ]
+
+    def get_test_messages(self):
+        """Get test messages that should trigger tool usage"""
+        return [
+            {
+                "role": "user",
+                "content": "Answer the following questions as best you can:\n\nYou will be given a trace of thinking process in the following format.\n\nQuestion: the input question you must answer\nTOOL: think about what to do, and choose a tool to use ONLY IF there are defined tools\nOBSERVATION: the result of the tool call or the observation of the current task, NEVER include this in your response, this information will be provided\n... (this TOOL/OBSERVATION can repeat N times)\nANSWER: If you know the answer to the original question, require for more information, \nif the previous conversation history already contains the answer, \nor you don't know the answer and there are no defined tools or all available tools are not helpful, respond with the answer without mentioning anything else.\nYou may use light Markdown formatting to improve clarity (e.g. lists, **bold**, *italics*), but keep it minimal and unobtrusive.\n\nYour task is to respond with the next step to take, based on the traces, \nor answer the question if you have enough information.\n\nQuestion: what is the weather in top 5 populated cities in the US in celsius?\n\nTraces:\n\n\nThese are some additional instructions that you should follow:",
+            }
+        ]
+
+    def get_travel_tools(self):
+        """Get tools for travel assistant scenario that should trigger multiple tool calls"""
+        return [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "Get the current weather for a given location.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "location": {
+                                "type": "string",
+                                "description": "The name of the city or location.",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["location"],
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_tourist_attractions",
+                    "description": "Get a list of top tourist attractions for a given city.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "The name of the city to find attractions for.",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                },
+            },
+        ]
+
+    def get_travel_messages(self):
+        """Get travel assistant messages that should trigger multiple tool calls"""
+        return [
+            {
+                "content": "You are a travel assistant providing real-time weather updates and top tourist attractions.",
+                "role": "system",
+            },
+            {
+                "content": "I'm planning a trip to Tokyo next week. What's the weather like? What are the most amazing sights?",
+                "role": "user",
+            },
+        ]
+
+    def test_tool_choice_auto_non_streaming(self):
+        """Test tool_choice='auto' in non-streaming mode"""
+        tools = self.get_test_tools()
+        messages = self.get_test_messages()
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            tools=tools,
+            tool_choice="auto",
+            stream=False,
+        )
+
+        self.assertIsNotNone(response.choices[0].message)
+        # With auto, tool calls are optional
+
+    def test_tool_choice_auto_streaming(self):
+        """Test tool_choice='auto' in streaming mode"""
+        tools = self.get_test_tools()
+        messages = self.get_test_messages()
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            tools=tools,
+            tool_choice="auto",
+            stream=True,
+        )
+
+        # Collect streaming response
+        content_chunks = []
+        tool_call_chunks = []
+
+        for chunk in response:
+            if chunk.choices[0].delta.content:
+                content_chunks.append(chunk.choices[0].delta.content)
+            elif chunk.choices[0].delta.tool_calls:
+                tool_call_chunks.extend(chunk.choices[0].delta.tool_calls)
+
+        # Should complete without errors
+        self.assertIsInstance(content_chunks, list)
+        self.assertIsInstance(tool_call_chunks, list)
+
+    def test_tool_choice_required_non_streaming(self):
+        """Test tool_choice='required' in non-streaming mode"""
+        tools = self.get_test_tools()
+        messages = self.get_test_messages()
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            temperature=0.2,
+            tools=tools,
+            tool_choice="required",
+            stream=False,
+        )
+
+        # With required, we should get tool calls
+        tool_calls = response.choices[0].message.tool_calls
+        self.assertIsNotNone(tool_calls)
+        self.assertGreater(len(tool_calls), 0)
+
+    def test_tool_choice_required_streaming(self):
+        """Test tool_choice='required' in streaming mode"""
+        tools = self.get_test_tools()
+        messages = self.get_test_messages()
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            tools=tools,
+            tool_choice="required",
+            stream=True,
+        )
+
+        # Collect streaming response
+        tool_call_chunks = []
+
+        for chunk in response:
+            if chunk.choices[0].delta.tool_calls:
+                tool_call_chunks.extend(chunk.choices[0].delta.tool_calls)
+
+        # With required, we should get tool call chunks
+        self.assertGreater(len(tool_call_chunks), 0)
+
+    def test_tool_choice_specific_function_non_streaming(self):
+        """Test tool_choice with specific function in non-streaming mode"""
+        tools = self.get_test_tools()
+        messages = self.get_test_messages()
+
+        tool_choice = {"type": "function", "function": {"name": "get_weather"}}
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            tools=tools,
+            tool_choice=tool_choice,
+            stream=False,
+        )
+
+        # Should call the specific function
+        tool_calls = response.choices[0].message.tool_calls
+        self.assertIsNotNone(tool_calls)
+        # Our messages ask the top 5 populated cities in the US, so the model could get 5 tool calls
+        self.assertGreaterEqual(len(tool_calls), 1)
+        for tool_call in tool_calls:
+            self.assertEqual(tool_call.function.name, "get_weather")
+
+    def test_tool_choice_specific_function_streaming(self):
+        """Test tool_choice with specific function in streaming mode"""
+        tools = self.get_test_tools()
+        messages = self.get_test_messages()
+
+        tool_choice = {"type": "function", "function": {"name": "get_weather"}}
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            tools=tools,
+            tool_choice=tool_choice,
+            stream=True,
+        )
+
+        # Collect streaming response
+        tool_call_chunks = []
+
+        for chunk in response:
+            if chunk.choices[0].delta.tool_calls:
+                tool_call_chunks.extend(chunk.choices[0].delta.tool_calls)
+
+        # Should get tool call chunks for the specific function
+        self.assertGreater(len(tool_call_chunks), 0)
+
+        # Find function name in chunks
+        found_name = None
+        for chunk in tool_call_chunks:
+            if chunk.function and chunk.function.name:
+                found_name = chunk.function.name
+                break
+
+        self.assertEqual(found_name, "get_weather")
+
+    def test_multi_tool_scenario_auto(self):
+        """Test multi-tool scenario with tool_choice='auto'"""
+        tools = self.get_travel_tools()
+        messages = self.get_travel_messages()
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            temperature=0.2,
+            tools=tools,
+            tool_choice="auto",
+            stream=False,
+        )
+
+        # Should complete without errors
+        self.assertIsNotNone(response.choices[0].message)
+
+        tool_calls = response.choices[0].message.tool_calls
+        expected_functions = {"get_weather", "get_tourist_attractions"}
+
+        if self._is_flaky_test():
+            # For flaky tests, just verify all called functions are available tools
+            if tool_calls:
+                available_names = [tool["function"]["name"] for tool in tools]
+                for call in tool_calls:
+                    self.assertIn(call.function.name, available_names)
+        else:
+            # For non-flaky tests, enforce strict requirements
+            self.assertIsNotNone(tool_calls, "Expected tool calls but got none")
+            self.assertEqual(
+                len(tool_calls), 2, f"Expected 2 tool calls, got {len(tool_calls)}"
+            )
+
+            called_functions = {call.function.name for call in tool_calls}
+            self.assertEqual(
+                called_functions,
+                expected_functions,
+                f"Expected functions {expected_functions}, got {called_functions}",
+            )
+
+    def test_multi_tool_scenario_required(self):
+        """Test multi-tool scenario with tool_choice='required'"""
+        tools = self.get_travel_tools()
+        messages = self.get_travel_messages()
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=2048,
+            temperature=0.2,
+            tools=tools,
+            tool_choice="required",
+            stream=False,
+        )
+
+        # With required, we should get at least one tool call
+        tool_calls = response.choices[0].message.tool_calls
+        self.assertIsNotNone(tool_calls)
+        self.assertGreater(len(tool_calls), 0)
+
+        # Verify all called functions are available tools
+        available_names = [tool["function"]["name"] for tool in tools]
+        expected_functions = {"get_weather", "get_tourist_attractions"}
+
+        if self._is_flaky_test():
+            # For flaky tests, just ensure basic functionality works
+            self.assertGreater(
+                len(tool_calls),
+                0,
+                f"Expected at least 1 tool call, got {len(tool_calls)}",
+            )
+            for call in tool_calls:
+                self.assertIn(call.function.name, available_names)
+        else:
+            # For non-flaky tests, enforce strict requirements
+            self.assertEqual(
+                len(tool_calls), 2, f"Expected 2 tool calls, got {len(tool_calls)}"
+            )
+
+            called_functions = {call.function.name for call in tool_calls}
+            self.assertEqual(
+                called_functions,
+                expected_functions,
+                f"Expected functions {expected_functions}, got {called_functions}",
+            )
+
+    def test_error_handling_invalid_tool_choice(self):
+        """Test error handling for invalid tool_choice"""
+        import logging
+        from unittest.mock import patch
+
+        tools = self.get_test_tools()
+        messages = self.get_test_messages()
+
+        # Test with invalid function name
+        tool_choice = {"type": "function", "function": {"name": "nonexistent_function"}}
+
+        # The behavior could be either:
+        # 1. Log a warning and continue (if fallback is implemented)
+        # 2. Raise an exception (if strict validation is implemented)
+
+        # First try to capture any logging that might happen
+        with patch("logging.warning") as mock_warning:
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=messages,
+                max_tokens=2048,
+                tools=tools,
+                tool_choice=tool_choice,
+                stream=False,
+            )
+
+            self.assertIsNotNone(response.choices[0].message)
+
+            if mock_warning.called:
+                warning_message = mock_warning.call_args[0][0]
+                self.assertIn("nonexistent_function", warning_message)
+
+
+class TestToolChoiceQwen25(TestToolChoiceLlama32):
+    """Test tool_choice functionality with Qwen2.5 model"""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.flaky_tests = {}
+
+        cls.model = "Qwen/Qwen2.5-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--tool-call-parser",
+                "qwen25",
+            ],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(cls.model)
+
+
+class TestToolChoiceMistral(TestToolChoiceLlama32):
+    """Test tool_choice functionality with Mistral model"""
+
+    @classmethod
+    def setUpClass(cls):
+        # Mark flaky tests for this model
+        cls.flaky_tests = {
+            "test_multi_tool_scenario_auto",
+            "test_multi_tool_scenario_required",
+        }
+
+        cls.model = "mistralai/Mistral-7B-Instruct-v0.3"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--tool-call-parser",
+                "mistral",
+            ],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(cls.model)
+
+
+# Skip for ci test
+# class TestToolChoiceGLM45(TestToolChoiceLlama32):
+#     @classmethod
+#     def setUpClass(cls):
+#         # Replace with the model name needed for testing; if not required, reuse DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+#         cls.model = "THUDM/GLM-4.5"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-123456"
+
+#         # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools.
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             api_key=cls.api_key,
+#             other_args=[
+#                 # If your server needs extra parameters to test function calling, please add them here.
+#                 "--tool-call-parser",
+#                 "glm45",
+#                 "--reasoning-parser",
+#                 "glm45",
+#                 "--tp-size",
+#                 "8"
+#             ],
+#         )
+#         cls.base_url += "/v1"
+#         cls.tokenizer = get_tokenizer(cls.model)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/validation/__init__.py b/test/srt/openai_server/validation/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/srt/openai_server/validation/test_large_max_new_tokens.py b/test/srt/openai_server/validation/test_large_max_new_tokens.py
new file mode 100644
index 000000000..49601a784
--- /dev/null
+++ b/test/srt/openai_server/validation/test_large_max_new_tokens.py
@@ -0,0 +1,103 @@
+"""
+python3 -m unittest openai_server.validation.test_large_max_new_tokens.TestLargeMaxNewTokens.test_chat_completion
+"""
+
+import os
+import time
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    STDERR_FILENAME,
+    STDOUT_FILENAME,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestLargeMaxNewTokens(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        cls.stdout = open(STDOUT_FILENAME, "w")
+        cls.stderr = open(STDERR_FILENAME, "w")
+
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=(
+                "--max-total-token",
+                "1536",
+                "--context-len",
+                "8192",
+                "--decode-log-interval",
+                "2",
+            ),
+            env={"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION": "256", **os.environ},
+            return_stdout_stderr=(cls.stdout, cls.stderr),
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+        cls.stdout.close()
+        cls.stderr.close()
+        os.remove(STDOUT_FILENAME)
+        os.remove(STDERR_FILENAME)
+
+    def run_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {
+                    "role": "user",
+                    "content": "Please repeat the world 'hello' for 10000 times.",
+                },
+            ],
+            temperature=0,
+        )
+        return response
+
+    def test_chat_completion(self):
+        num_requests = 4
+
+        futures = []
+        with ThreadPoolExecutor(num_requests) as executor:
+            # Send multiple requests
+            for i in range(num_requests):
+                futures.append(executor.submit(self.run_chat_completion))
+
+            # Ensure that they are running concurrently
+            pt = 0
+            while pt >= 0:
+                time.sleep(5)
+                lines = open(STDERR_FILENAME).readlines()
+                for line in lines[pt:]:
+                    print(line, end="", flush=True)
+                    if f"#running-req: {num_requests}" in line:
+                        all_requests_running = True
+                        pt = -1
+                        break
+                    pt += 1
+
+        assert all_requests_running
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/validation/test_matched_stop.py b/test/srt/openai_server/validation/test_matched_stop.py
new file mode 100644
index 000000000..357b07f31
--- /dev/null
+++ b/test/srt/openai_server/validation/test_matched_stop.py
@@ -0,0 +1,140 @@
+import json
+import unittest
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+MANY_NEW_TOKENS_PROMPT = """
+Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
+Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
+Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
+The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
+"""
+
+
+class TestMatchedStop(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=300,
+            other_args=["--max-running-requests", "10"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_completions_generation(
+        self,
+        prompt=MANY_NEW_TOKENS_PROMPT,
+        max_tokens=1,
+        stop=None,
+        finish_reason=None,
+        matched_stop=None,
+    ):
+        payload = {
+            "prompt": prompt,
+            "model": self.model,
+            "temperature": 0,
+            "top_p": 1,
+            "max_tokens": max_tokens,
+        }
+
+        if stop is not None:
+            payload["stop"] = stop
+
+        response_completions = requests.post(
+            self.base_url + "/v1/completions",
+            json=payload,
+        )
+        print(json.dumps(response_completions.json()))
+        print("=" * 100)
+
+        assert (
+            response_completions.json()["choices"][0]["finish_reason"] == finish_reason
+        )
+        assert response_completions.json()["choices"][0]["matched_stop"] == matched_stop
+
+    def run_chat_completions_generation(
+        self,
+        prompt=MANY_NEW_TOKENS_PROMPT,
+        max_tokens=1,
+        stop=None,
+        finish_reason=None,
+        matched_stop=None,
+    ):
+        chat_payload = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {"role": "user", "content": prompt},
+            ],
+            "temperature": 0,
+            "top_p": 1,
+            "max_tokens": max_tokens,
+        }
+
+        if stop is not None:
+            chat_payload["stop"] = stop
+
+        response_chat = requests.post(
+            self.base_url + "/v1/chat/completions",
+            json=chat_payload,
+        )
+        print(json.dumps(response_chat.json()))
+        print("=" * 100)
+
+        assert response_chat.json()["choices"][0]["finish_reason"] == finish_reason
+        assert response_chat.json()["choices"][0]["matched_stop"] == matched_stop
+
+    def test_finish_stop_str(self):
+        self.run_completions_generation(
+            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
+        )
+        self.run_chat_completions_generation(
+            max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n"
+        )
+
+    def test_finish_stop_eos(self):
+        llama_format_prompt = """
+        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+        You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+        What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+        """
+        eos_token_id = 128009
+        self.run_completions_generation(
+            prompt=llama_format_prompt,
+            max_tokens=1000,
+            finish_reason="stop",
+            matched_stop=eos_token_id,
+        )
+        self.run_chat_completions_generation(
+            prompt="What is 2 + 2?",
+            max_tokens=1000,
+            finish_reason="stop",
+            matched_stop=eos_token_id,
+        )
+
+    def test_finish_length(self):
+        self.run_completions_generation(
+            max_tokens=5, finish_reason="length", matched_stop=None
+        )
+        self.run_chat_completions_generation(
+            max_tokens=5, finish_reason="length", matched_stop=None
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/openai_server/validation/test_openai_server_ignore_eos.py b/test/srt/openai_server/validation/test_openai_server_ignore_eos.py
new file mode 100644
index 000000000..a3594dfd0
--- /dev/null
+++ b/test/srt/openai_server/validation/test_openai_server_ignore_eos.py
@@ -0,0 +1,84 @@
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestOpenAIServerIgnoreEOS(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_ignore_eos(self):
+        """
+        Test that ignore_eos=True allows generation to continue beyond EOS token
+        and reach the max_tokens limit.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        max_tokens = 200
+
+        response_default = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count from 1 to 20."},
+            ],
+            temperature=0,
+            max_tokens=max_tokens,
+            extra_body={"ignore_eos": False},
+        )
+
+        response_ignore_eos = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count from 1 to 20."},
+            ],
+            temperature=0,
+            max_tokens=max_tokens,
+            extra_body={"ignore_eos": True},
+        )
+
+        default_tokens = len(
+            self.tokenizer.encode(response_default.choices[0].message.content)
+        )
+        ignore_eos_tokens = len(
+            self.tokenizer.encode(response_ignore_eos.choices[0].message.content)
+        )
+
+        # Check if ignore_eos resulted in more tokens or exactly max_tokens
+        # The ignore_eos response should either:
+        # 1. Have more tokens than the default response (if default stopped at EOS before max_tokens)
+        # 2. Have exactly max_tokens (if it reached the max_tokens limit)
+        self.assertTrue(
+            ignore_eos_tokens > default_tokens or ignore_eos_tokens >= max_tokens,
+            f"ignore_eos did not generate more tokens: {ignore_eos_tokens} vs {default_tokens}",
+        )
+
+        self.assertEqual(
+            response_ignore_eos.choices[0].finish_reason,
+            "length",
+            f"Expected finish_reason='length' for ignore_eos=True, got {response_ignore_eos.choices[0].finish_reason}",
+        )
diff --git a/test/srt/openai_server/validation/test_request_length_validation.py b/test/srt/openai_server/validation/test_request_length_validation.py
new file mode 100644
index 000000000..7276906d4
--- /dev/null
+++ b/test/srt/openai_server/validation/test_request_length_validation.py
@@ -0,0 +1,88 @@
+import unittest
+
+import openai
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestRequestLengthValidation(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        # Start server with auto truncate disabled
+        cls.process = popen_launch_server(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=("--max-total-tokens", "1000", "--context-length", "1000"),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_input_length_longer_than_context_length(self):
+        client = openai.Client(api_key=self.api_key, base_url=f"{self.base_url}/v1")
+
+        long_text = "hello " * 1200  # Will tokenize to more than context length
+
+        with self.assertRaises(openai.BadRequestError) as cm:
+            client.chat.completions.create(
+                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                messages=[
+                    {"role": "user", "content": long_text},
+                ],
+                temperature=0,
+            )
+
+        self.assertIn("is longer than the model's context length", str(cm.exception))
+
+    def test_input_length_longer_than_maximum_allowed_length(self):
+        client = openai.Client(api_key=self.api_key, base_url=f"{self.base_url}/v1")
+
+        long_text = "hello " * 999  # the maximum allowed length is 994 tokens
+
+        with self.assertRaises(openai.BadRequestError) as cm:
+            client.chat.completions.create(
+                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                messages=[
+                    {"role": "user", "content": long_text},
+                ],
+                temperature=0,
+            )
+
+        self.assertIn("is longer than the model's context length", str(cm.exception))
+
+    def test_max_tokens_validation(self):
+        client = openai.Client(api_key=self.api_key, base_url=f"{self.base_url}/v1")
+
+        long_text = "hello "
+
+        with self.assertRaises(openai.BadRequestError) as cm:
+            client.chat.completions.create(
+                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                messages=[
+                    {"role": "user", "content": long_text},
+                ],
+                temperature=0,
+                max_tokens=1200,
+            )
+
+        self.assertIn(
+            "max_completion_tokens is too large",
+            str(cm.exception),
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/parse_results.py b/test/srt/parse_results.py
new file mode 100644
index 000000000..f552739f5
--- /dev/null
+++ b/test/srt/parse_results.py
@@ -0,0 +1,57 @@
+import argparse
+import json
+import os
+
+import pandas as pd
+from tabulate import tabulate
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description="Parse JSONL benchmark and summarize.")
+parser.add_argument("input_file", type=str, help="Path to input JSONL file")
+parser.add_argument(
+    "--md",
+    action="store_true",
+    help="If set, print the summary table in Markdown format (GitHub style)",
+)
+args = parser.parse_args()
+
+input_file = args.input_file
+base_name = os.path.splitext(os.path.basename(input_file))[0]
+output_file = f"{base_name}_summary.csv"
+
+fields = [
+    "max_concurrency",
+    "input_throughput",
+    "output_throughput",
+    "mean_ttft_ms",
+    "median_ttft_ms",
+    "p99_ttft_ms",
+    "mean_tpot_ms",
+    "median_tpot_ms",
+    "p99_tpot_ms",
+]
+
+# Read JSONL and parse
+results = []
+with open(input_file, "r") as f:
+    for line in f:
+        data = json.loads(line)
+        row = {field: data.get(field, None) for field in fields}
+        max_conc = data.get("max_concurrency")
+        out_tp = data.get("output_throughput")
+        row["per_user_throughput"] = out_tp / max_conc if max_conc else None
+        results.append(row)
+
+# Convert to DataFrame
+df = pd.DataFrame(results)
+
+# Save to CSV
+df.to_csv(output_file, index=False)
+print(f"\nSaved summary to: {output_file}\n")
+
+if args.md:
+    # Print Markdown table
+    print(tabulate(df, headers="keys", tablefmt="github", floatfmt=".3f"))
+else:
+    # Print ASCII table
+    print(tabulate(df, headers="keys", tablefmt="grid", floatfmt=".3f"))
diff --git a/test/srt/quant/test_awq.py b/test/srt/quant/test_awq.py
new file mode 100644
index 000000000..99ca3f842
--- /dev/null
+++ b/test/srt/quant/test_awq.py
@@ -0,0 +1,45 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestAWQ(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--trust-remote-code"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.64)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/quant/test_awq_dequant.py b/test/srt/quant/test_awq_dequant.py
new file mode 100644
index 000000000..ec1f2b16a
--- /dev/null
+++ b/test/srt/quant/test_awq_dequant.py
@@ -0,0 +1,175 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/tests/kernels/quantization/test_awq_triton.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+unittest version of the AWQ Triton kernel tests.
+
+Run with:
+    python -m unittest test_awq_dequant.py
+"""
+import unittest
+
+import torch
+
+from sglang.srt.layers.quantization.awq_triton import (
+    AWQ_TRITON_SUPPORTED_GROUP_SIZES,
+    awq_dequantize_triton,
+    awq_gemm_triton,
+)
+from sglang.test.test_utils import CustomTestCase
+
+device = "cuda"
+
+
+def reverse_awq_order(t: torch.Tensor) -> torch.Tensor:
+    bits = 4
+    AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
+    idx = torch.arange(t.shape[-1], dtype=torch.int32, device=t.device)
+    idx = idx.view(-1, 32 // bits)[:, AWQ_REVERSE_ORDER].view(-1)
+    return (t[:, idx] & 0xF).contiguous()
+
+
+def awq_dequantize_torch(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    qzeros: torch.Tensor,
+    group_size: int,
+) -> torch.Tensor:
+    if group_size == -1:
+        group_size = qweight.shape[0]
+
+    bits = 4
+    shifts = torch.arange(0, 32, bits, device=qzeros.device)
+
+    iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
+        torch.int8
+    )
+    iweights = reverse_awq_order(iweights.view(iweights.shape[0], -1))
+
+    zeros = torch.bitwise_right_shift(qzeros[:, :, None], shifts[None, None, :]).to(
+        torch.int8
+    )
+    zeros = reverse_awq_order(zeros.view(qzeros.shape[0], -1))
+
+    iweights = torch.bitwise_and(iweights, (2**bits) - 1)
+    zeros = torch.bitwise_and(zeros, (2**bits) - 1)
+
+    scales = scales.repeat_interleave(group_size, dim=0)
+    zeros = zeros.repeat_interleave(group_size, dim=0)
+    return (iweights - zeros) * scales
+
+
+class TestAWQTriton(CustomTestCase):
+    def test_dequantize(self):
+        rows_list = [3584, 18944, 128, 256, 512, 1024]
+        cols_list = [448, 576, 4736, 16, 32, 64, 128]
+
+        for qweight_rows in rows_list:
+            for qweight_cols in cols_list:
+                for group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES:
+                    with self.subTest(
+                        rows=qweight_rows, cols=qweight_cols, g=group_size
+                    ):
+                        self._run_dequant_case(
+                            qweight_rows=qweight_rows,
+                            qweight_cols=qweight_cols,
+                            group_size=group_size,
+                        )
+
+    def _run_dequant_case(self, qweight_rows, qweight_cols, group_size):
+        if group_size == -1:
+            group_size = qweight_rows
+
+        torch.manual_seed(0)
+
+        qweight = torch.randint(
+            0,
+            torch.iinfo(torch.int32).max,
+            (qweight_rows, qweight_cols),
+            dtype=torch.int32,
+            device=device,
+        )
+        scales = torch.rand(
+            qweight_rows // group_size,
+            qweight_cols * 8,
+            dtype=torch.float16,
+            device=device,
+        )
+        zeros = torch.randint(
+            0,
+            torch.iinfo(torch.int32).max,
+            (qweight_rows // group_size, qweight_cols),
+            dtype=torch.int32,
+            device=device,
+        )
+
+        ref = awq_dequantize_torch(qweight, scales, zeros, group_size)
+        tri = awq_dequantize_triton(qweight, scales, zeros)
+
+        # sanity
+        self.assertFalse(torch.any(torch.isinf(tri)) or torch.any(torch.isnan(tri)))
+        torch.testing.assert_close(ref, tri)
+
+    # GEMM
+    def test_gemm(self):
+        N_list = [1, 2, 4, 8, 14, 17, 23, 32]
+        K_list = [128]
+        M_list = [16, 24, 32]
+        splitK_list = [1, 8]
+
+        for N in N_list:
+            for K in K_list:
+                for M in M_list:
+                    for group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES:
+                        for splitK in splitK_list:
+                            with self.subTest(N=N, K=K, M=M, g=group_size, sk=splitK):
+                                self._run_gemm_case(
+                                    N=N,
+                                    K=K,
+                                    M=M,
+                                    group_size=group_size,
+                                    splitK=splitK,
+                                )
+
+    def _run_gemm_case(self, N, K, M, group_size, splitK):
+        if group_size == -1:
+            group_size = K
+
+        torch.manual_seed(0)
+
+        x = torch.rand((N, K), dtype=torch.float32, device=device)
+        qweight = torch.randint(
+            0,
+            torch.iinfo(torch.int32).max,
+            (K, M // 8),
+            dtype=torch.int32,
+            device=device,
+        )
+        qzeros = torch.randint(
+            0,
+            torch.iinfo(torch.int32).max,
+            (K // group_size, M // 8),
+            dtype=torch.int32,
+            device=device,
+        )
+        scales = torch.rand((K // group_size, M), dtype=torch.float32, device=device)
+
+        tri_out = awq_gemm_triton(x, qweight, scales, qzeros, splitK)
+
+        self.assertFalse(
+            torch.any(torch.isinf(tri_out)) or torch.any(torch.isnan(tri_out))
+        )
+
+        # dequantize & compare
+        w_deq = awq_dequantize_triton(qweight, scales, qzeros)
+        ref_out = torch.matmul(x, w_deq)
+
+        self.assertFalse(
+            torch.any(torch.isinf(ref_out)) or torch.any(torch.isnan(ref_out))
+        )
+
+        torch.testing.assert_close(tri_out.cpu(), ref_out.cpu(), atol=1e-1, rtol=1e-1)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/quant/test_block_int8.py b/test/srt/quant/test_block_int8.py
new file mode 100644
index 000000000..f6ceb03d0
--- /dev/null
+++ b/test/srt/quant/test_block_int8.py
@@ -0,0 +1,227 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+from sglang.test.test_utils import CustomTestCase
+
+
+# For test
+def native_per_token_group_quant_int8(x, group_size, eps=1e-10, dtype=torch.int8):
+    """Function to perform per-token-group quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    Note that only `torch.float8_e4m3fn` is supported for now.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    iinfo = torch.iinfo(dtype)
+    int8_min = iinfo.min
+    int8_max = iinfo.max
+
+    x_ = x.reshape(x.numel() // group_size, group_size)
+    amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    x_s = amax / int8_max
+    x_q = (x_ / x_s).clamp(min=int8_min, max=int8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
+
+    return x_q, x_s
+
+
+# For test
+def native_w8a8_block_int8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+# For test
+def torch_w8a8_block_int8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+    """This function performs fused moe with block-wise quantization using native torch."""
+
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    _, block_k = block_shape[0], block_shape[1]
+    a_q, a_s = native_per_token_group_quant_int8(a, block_k)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            inter_out = native_w8a8_block_int8_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+            )
+            act_out = SiluAndMul().forward_native(inter_out)
+            act_out_q, act_out_s = native_per_token_group_quant_int8(act_out, block_k)
+            act_out = act_out.to(torch.float32)
+            out[mask] = native_w8a8_block_int8_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+class TestW8A8BlockINT8FusedMoE(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    M = [1, 33, 64, 222]
+    N = [128, 1024]
+    K = [256, 4096]
+    E = [8, 24]
+    TOP_KS = [2, 6]
+    # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_int8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        int8_info = torch.iinfo(torch.int8)
+        int8_max, int8_min = int8_info.max, int8_info.min
+
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * int8_max
+        w1 = w1_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * int8_max
+        w2 = w2_fp32.clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w1 = (2 * N + block_n - 1) // block_n
+        n_tiles_w2 = (K + block_n - 1) // block_n
+        k_tiles_w1 = (K + block_k - 1) // block_k
+        k_tiles_w2 = (N + block_k - 1) // block_k
+
+        w1_s = (
+            torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+            * factor_for_scale
+        )
+        w2_s = (
+            torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+            * factor_for_scale
+        )
+
+        score = torch.randn((M, E), dtype=dtype)
+
+        topk_output = select_experts(
+            hidden_states=a,
+            router_logits=score,
+            topk_config=TopKConfig(top_k=topk, renormalize=False),
+        )
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_block_int8_moe(
+                a, w1, w2, w1_s, w2_s, score, topk, block_size
+            )
+            out = fused_moe(
+                a,
+                w1,
+                w2,
+                topk_output,
+                use_int8_w8a8=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=block_size,
+            )
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.02
+        )
+
+    def test_w8a8_block_int8_fused_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+                block_size=params[5],
+                dtype=params[6],
+                seed=params[7],
+            ):
+                self._w8a8_block_int8_fused_moe(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/quant/test_fp8_kernel.py b/test/srt/quant/test_fp8_kernel.py
new file mode 100644
index 000000000..42502277b
--- /dev/null
+++ b/test/srt/quant/test_fp8_kernel.py
@@ -0,0 +1,126 @@
+import unittest
+
+import torch
+
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_fp8,
+    w8a8_block_fp8_matmul,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestFP8Base(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.M = 256
+        # test non-aligned
+        cls.N = 1024 + 64
+        cls.K = 512
+        cls.group_size = 128
+        cls.quant_type = torch.float8_e4m3fn
+        cls.output_type = torch.bfloat16
+
+    @staticmethod
+    def _make_A(M, K, group_size, out_dtype):
+        quant_A = torch.rand(
+            M, K // group_size, group_size, dtype=torch.float32, device="cuda"
+        )
+        # -1 ~ 1
+        quant_A = quant_A * 2 - 1
+        # scaling abs max to fmax
+        finfo = torch.finfo(out_dtype)
+        fmax = finfo.max
+        scaling = fmax / quant_A.abs().amax(-1, keepdim=True)
+        quant_A *= scaling
+        quant_A = quant_A.to(out_dtype).to(torch.float32)
+
+        # create scale and A
+        scale = torch.rand(M, K // group_size, dtype=torch.float32, device="cuda")
+        scale /= fmax
+        A = quant_A * scale[..., None]
+
+        A = A.reshape(M, K)
+        quant_A = quant_A.reshape(M, K).to(out_dtype)
+        return A, quant_A, scale
+
+    @staticmethod
+    def _make_B(K, N, group_size, out_dtype):
+        def _aligned_size(a, b):
+            return (a + b - 1) // b * b
+
+        K_aligned = _aligned_size(K, group_size)
+        N_aligned = _aligned_size(N, group_size)
+
+        quant_B = torch.rand(
+            K_aligned // group_size,
+            group_size,
+            N_aligned // group_size,
+            group_size,
+            dtype=torch.float32,
+            device="cuda",
+        )
+        quant_B = quant_B * 2 - 1
+
+        # scaling abs max to fmax
+        finfo = torch.finfo(out_dtype)
+        fmax = finfo.max
+        scaling = fmax / quant_B.abs().amax((1, 3), keepdim=True)
+        quant_B *= scaling
+        quant_B = quant_B.to(out_dtype).to(torch.float32)
+
+        scale = torch.rand(
+            K_aligned // group_size,
+            1,
+            N_aligned // group_size,
+            1,
+            dtype=torch.float32,
+            device="cuda",
+        )
+        scale /= fmax
+
+        B = quant_B * scale
+
+        B = B.reshape(K_aligned, N_aligned)[:K, :N]
+        quant_B = quant_B.reshape(K_aligned, N_aligned).to(out_dtype)[:K, :N]
+        scale = scale.reshape(K_aligned // group_size, N_aligned // group_size)
+        return B, quant_B, scale
+
+
+class TestPerTokenGroupQuantFP8(TestFP8Base):
+    def test_per_token_group_quant_fp8(self):
+        if torch.cuda.get_device_capability()[0] < 9:
+            return
+        A, A_quant_gt, scale_gt = self._make_A(
+            M=self.M, K=self.K, group_size=self.group_size, out_dtype=self.quant_type
+        )
+        A_quant, scale = per_token_group_quant_fp8(x=A, group_size=self.group_size)
+        torch.testing.assert_close(scale, scale_gt)
+        diff = (A_quant.to(torch.float16) - A_quant_gt.to(torch.float16)).abs()
+        diff_count = (diff > 1e-5).count_nonzero()
+        assert diff_count / diff.numel() < 1e-4
+
+
+class TestW8A8BlockFP8Matmul(TestFP8Base):
+    def test_w8a8_block_fp8_matmul(self):
+        if torch.cuda.get_device_capability()[0] < 9:
+            return
+        A, A_quant_gt, A_scale_gt = self._make_A(
+            M=self.M, K=self.K, group_size=self.group_size, out_dtype=self.quant_type
+        )
+        B, B_quant_gt, B_scale_gt = self._make_B(
+            K=self.K, N=self.N, group_size=self.group_size, out_dtype=self.quant_type
+        )
+        C_gt = A.to(self.output_type) @ B.to(self.output_type)
+        C = w8a8_block_fp8_matmul(
+            A=A_quant_gt,
+            B=B_quant_gt.T.contiguous(),
+            As=A_scale_gt,
+            Bs=B_scale_gt.T.contiguous(),
+            block_size=[128, 128],
+            output_dtype=self.output_type,
+        )
+        torch.testing.assert_close(C, C_gt, atol=0.5, rtol=1e-4)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/quant/test_fp8_kvcache.py b/test/srt/quant/test_fp8_kvcache.py
new file mode 100644
index 000000000..669926f5d
--- /dev/null
+++ b/test/srt/quant/test_fp8_kvcache.py
@@ -0,0 +1,114 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestFp8KvcacheBase(CustomTestCase):
+    model_config = None
+
+    @classmethod
+    def setUpClass(cls):
+        if cls.model_config is None:
+            raise NotImplementedError("model_config must be specified in subclass")
+
+        cls.model = cls.model_config["model_name"]
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        dirpath = os.path.dirname(__file__)
+        config_file = os.path.join(dirpath, cls.model_config["config_filename"])
+
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--kv-cache-dtype",
+                "fp8_e4m3",
+                "--quantization-param-path",
+                config_file,
+            ],
+        )
+
+
+class TestFp8KvcacheLlama(TestFp8KvcacheBase):
+    model_config = {
+        "model_name": DEFAULT_MODEL_NAME_FOR_TEST,
+        "config_filename": "kv_cache_scales_llama3_8b.json",
+    }
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.80)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+class TestFp8KvcacheQwen(TestFp8KvcacheBase):
+    model_config = {
+        "model_name": DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
+        "config_filename": "kv_cache_scales_qwen2_1_5b.json",
+    }
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.01)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/quant/test_int8_kernel.py b/test/srt/quant/test_int8_kernel.py
new file mode 100644
index 000000000..dd75d06af
--- /dev/null
+++ b/test/srt/quant/test_int8_kernel.py
@@ -0,0 +1,169 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.test.test_utils import CustomTestCase
+
+
+def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
+    """Matrix multiplication function that supports per-token input quantization and per-column weight quantization"""
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+
+    assert A.shape[-1] == B.shape[-1], "Dimension mismatch"
+    assert B.ndim == 2 and B.is_contiguous(), "B must be a 2D contiguous tensor"
+
+    # Reshape input
+    M = A.numel() // A.shape[-1]
+    B = B.t()  # Transpose weight matrix
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (K,)
+    A = A.reshape(M, N)
+
+    # As is per-token [M, 1], Bs is per-column [1, K]
+    C = torch.matmul(A, B)  # [M, K]
+    C = As * C * Bs.view(1, -1)  # Broadcast per-column scale
+
+    return C.reshape(origin_C_shape).to(output_dtype)
+
+
+def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
+    """This function performs fused moe with per-column int8 quantization using native torch."""
+
+    B, D = a.shape
+    # Perform per-token quantization
+    a_q, a_s = per_token_quant_int8(a)
+    # Repeat tokens to match topk
+    a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    # Also repeat the scale
+    a_s = a_s.view(B, -1, 1).repeat(1, topk, 1).reshape(-1, 1)  # [B*topk, 1]
+
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+
+    # Calculate routing
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    # Process each expert
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            # First MLP layer: note that a_s is now per-token
+            inter_out = native_w8a8_per_token_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], output_dtype=a.dtype
+            )
+            # Activation function
+            act_out = SiluAndMul().forward_native(inter_out)
+            # Quantize activation output with per-token
+            act_out_q, act_out_s = per_token_quant_int8(act_out)
+
+            # Second MLP layer
+            out[mask] = native_w8a8_per_token_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], output_dtype=a.dtype
+            )
+    # Apply routing weights and sum
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+class TestW8A8Int8FusedMoE(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    M = [1, 33]
+    N = [128, 1024]
+    K = [256, 4096]
+    E = [8]
+    TOP_KS = [2, 6]
+    BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_int8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        # Initialize int8 quantization parameters
+        factor_for_scale = 1e-2
+        int8_max = 127
+        int8_min = -128
+
+        # Input tensor
+        # M * K
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        # Generate int8 weights
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2
+        w1 = (w1_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2
+        w2 = (w2_fp32 * int8_max).clamp(min=int8_min, max=int8_max).to(torch.int8)
+
+        # Generate scale for each column (per-column quantization)
+        w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale
+        w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale
+        score = torch.randn((M, E), dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
+            topk_output = select_experts(
+                hidden_states=a,
+                router_logits=score,
+                topk_config=TopKConfig(top_k=topk, renormalize=False),
+            )
+            out = fused_moe(
+                a,
+                w1,
+                w2,
+                topk_output,
+                use_fp8_w8a8=False,  # Not using fp8
+                use_int8_w8a16=False,  # Not using int8-w8a16
+                use_int8_w8a8=True,  # Using int8-w8a8
+                per_channel_quant=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=None,  # Not using block quantization
+            )
+
+        # Check results
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.05
+        )
+
+    def test_w8a8_int8_fused_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+                block_size=params[5],
+                dtype=params[6],
+                seed=params[7],
+            ):
+                self._w8a8_int8_fused_moe(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/quant/test_triton_scaled_mm.py b/test/srt/quant/test_triton_scaled_mm.py
new file mode 100644
index 000000000..dafde83be
--- /dev/null
+++ b/test/srt/quant/test_triton_scaled_mm.py
@@ -0,0 +1,94 @@
+import itertools
+import unittest
+from typing import Optional
+
+import torch
+import torch.testing
+
+from sglang.srt.layers.quantization.fp8_kernel import triton_scaled_mm
+from sglang.test.test_utils import CustomTestCase
+
+
+def torch_scaled_mm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: torch.dtype,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Reference implementation using float32 for stability"""
+    out = torch.mm(a.to(torch.float32), b.to(torch.float32))
+    out = scale_a.to(torch.float32) * out * scale_b.to(torch.float32).T
+    if bias is not None:
+        out = out + bias.to(torch.float32)
+    return out.to(out_dtype)
+
+
+class TestScaledMM(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("This test requires a CUDA device.")
+        torch.set_default_device("cuda")
+
+    def _make_inputs(self, M, K, N, in_dtype):
+        if in_dtype == torch.int8:
+            a = torch.randint(-8, 8, (M, K), dtype=in_dtype, device="cuda")
+            b = torch.randint(-8, 8, (K, N), dtype=in_dtype, device="cuda")
+        else:  # fp8
+            a = torch.clamp(
+                0.1 * torch.randn((M, K), dtype=torch.float16, device="cuda"), -0.3, 0.3
+            ).to(in_dtype)
+            b = torch.clamp(
+                0.1 * torch.randn((K, N), dtype=torch.float16, device="cuda"), -0.3, 0.3
+            ).to(in_dtype)
+        return a, b
+
+    def test_basic_cases(self):
+        """Test core functionality with reduced precision requirements"""
+        test_configs = [
+            (32, 32, 32, torch.int8, torch.float16, False),
+            (64, 64, 64, torch.int8, torch.float16, True),
+        ]
+
+        try:
+            torch.tensor([1.0], dtype=torch.float8_e4m3fn, device="cuda")
+            test_configs.append((32, 32, 32, torch.float8_e4m3fn, torch.float16, False))
+        except:
+            print("FP8 not supported, skipping")
+
+        for M, K, N, in_dtype, out_dtype, with_bias in test_configs:
+            with self.subTest(M=M, K=K, N=N, dtype=in_dtype, bias=with_bias):
+                print(f"Currently testing with in_dtype: {in_dtype}")
+                torch.manual_seed(42)
+
+                input, weight = self._make_inputs(M, K, N, in_dtype)
+                scale_a = 0.1 + 0.05 * torch.rand(
+                    (M, 1), dtype=torch.float32, device="cuda"
+                )
+                scale_b = 0.1 + 0.05 * torch.rand(
+                    (N, 1), dtype=torch.float32, device="cuda"
+                )
+                bias = (
+                    0.01 * torch.randn((M, N), dtype=out_dtype, device="cuda")
+                    if with_bias
+                    else None
+                )
+
+                triton_out = triton_scaled_mm(
+                    input, weight, scale_a, scale_b, out_dtype, bias
+                )
+                ref_out = torch_scaled_mm(
+                    input, weight, scale_a, scale_b, out_dtype, bias
+                )
+
+                # Use relaxed tolerances
+                rtol = 0.15 if in_dtype == torch.int8 else 0.25
+                atol = 0.1 if in_dtype == torch.int8 else 0.15
+
+                torch.testing.assert_close(triton_out, ref_out, rtol=rtol, atol=atol)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/quant/test_w4a8_deepseek_v3.py b/test/srt/quant/test_w4a8_deepseek_v3.py
new file mode 100644
index 000000000..eb813bd70
--- /dev/null
+++ b/test/srt/quant/test_w4a8_deepseek_v3.py
@@ -0,0 +1,122 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    popen_launch_server,
+    try_cached_model,
+    write_github_step_summary,
+)
+
+
+class TestDeepseekV3W4afp8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = try_cached_model(DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST)
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code", "--tp", "8", "--ep-size", "8"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=1200,
+            parallel=1200,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Eval accuracy of GSM8K: {metrics=}")
+
+        self.assertGreater(metrics["accuracy"], 0.92)
+
+
+class TestDeepseekV3W4Afp8Mtp(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = try_cached_model(DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST)
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [
+            "--tp",
+            "8",
+            "--trust-remote-code",
+            "--ep-size",
+            "8",
+            "--cuda-graph-bs",
+            "256",
+            "--disable-radix-cache",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "2",
+            "--speculative-num-draft-tokens",
+            "4",
+        ]
+        if not is_in_amd_ci():
+            other_args += ["--mem-frac", "0.7"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(
+        self,
+    ):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (deepseek-v3 mtp)\n"
+                f'{metrics["accuracy"]=:.3f}\n'
+                f"{avg_spec_accept_length=:.2f}\n"
+            )
+            self.assertGreater(metrics["accuracy"], 0.935)
+            self.assertGreater(avg_spec_accept_length, 2.9)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/quant/test_w8a8_quantization.py b/test/srt/quant/test_w8a8_quantization.py
new file mode 100644
index 000000000..acb7f5c7d
--- /dev/null
+++ b/test/srt/quant/test_w8a8_quantization.py
@@ -0,0 +1,75 @@
+import time
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestW8A8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "neuralmagic/Meta-Llama-3-8B-Instruct-quantized.w8a8"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--quantization", "w8a8_int8"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.69)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(res["text"])
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+        assert throughput >= 140
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/rl/test_update_weights_from_disk.py b/test/srt/rl/test_update_weights_from_disk.py
new file mode 100644
index 000000000..02b283efd
--- /dev/null
+++ b/test/srt/rl/test_update_weights_from_disk.py
@@ -0,0 +1,371 @@
+import json
+import random
+import time
+import unittest
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import requests
+
+import sglang as sgl
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+
+###############################################################################
+# Engine Mode Tests (Single-configuration)
+###############################################################################
+class TestEngineUpdateWeightsFromDisk(CustomTestCase):
+    def setUp(self):
+        self.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        # Initialize the engine in offline (direct) mode.
+        self.engine = sgl.Engine(model_path=self.model)
+
+    def tearDown(self):
+        self.engine.shutdown()
+
+    def run_decode(self):
+        prompts = ["The capital of France is"]
+        sampling_params = {"temperature": 0, "max_new_tokens": 32}
+        outputs = self.engine.generate(prompts, sampling_params)
+        print("=" * 100)
+        print(
+            f"[Engine Mode] Prompt: {prompts[0]}\nGenerated text: {outputs[0]['text']}"
+        )
+        return outputs[0]["text"]
+
+    def run_update_weights(self, model_path):
+        ret = self.engine.update_weights_from_disk(model_path)
+        print(json.dumps(ret))
+        return ret
+
+    def test_update_weights(self):
+        origin_response = self.run_decode()
+        # Update weights: use new model (remove "-Instruct")
+        new_model_path = self.model.replace("-Instruct", "")
+        ret = self.run_update_weights(new_model_path)
+        self.assertTrue(ret[0])  # ret is a tuple; index 0 holds the success flag
+
+        updated_response = self.run_decode()
+        self.assertNotEqual(origin_response[:32], updated_response[:32])
+
+        # Revert back to original weights
+        ret = self.run_update_weights(self.model)
+        self.assertTrue(ret[0])
+        reverted_response = self.run_decode()
+        self.assertEqual(origin_response[:32], reverted_response[:32])
+
+    def test_update_weights_unexist_model(self):
+        origin_response = self.run_decode()
+        new_model_path = self.model.replace("-Instruct", "wrong")
+        ret = self.run_update_weights(new_model_path)
+        self.assertFalse(ret[0])
+        updated_response = self.run_decode()
+        self.assertEqual(origin_response[:32], updated_response[:32])
+
+
+###############################################################################
+# HTTP Server Mode Tests (Single-configuration)
+###############################################################################
+class TestServerUpdateWeightsFromDisk(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(self):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {"temperature": 0, "max_new_tokens": 32},
+            },
+        )
+        print("=" * 100)
+        print(f"[Server Mode] Generated text: {response.json()['text']}")
+        return response.json()["text"]
+
+    def get_model_info(self):
+        response = requests.get(self.base_url + "/get_model_info")
+        model_path = response.json()["model_path"]
+        print(json.dumps(response.json()))
+        return model_path
+
+    def run_update_weights(self, model_path):
+        response = requests.post(
+            self.base_url + "/update_weights_from_disk",
+            json={"model_path": model_path},
+        )
+        ret = response.json()
+        print(json.dumps(ret))
+        return ret
+
+    def test_update_weights(self):
+        origin_model_path = self.get_model_info()
+        print(f"[Server Mode] origin_model_path: {origin_model_path}")
+        origin_response = self.run_decode()
+
+        new_model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST.replace("-Instruct", "")
+        ret = self.run_update_weights(new_model_path)
+        self.assertTrue(ret["success"])
+
+        updated_model_path = self.get_model_info()
+        print(f"[Server Mode] updated_model_path: {updated_model_path}")
+        self.assertEqual(updated_model_path, new_model_path)
+        self.assertNotEqual(updated_model_path, origin_model_path)
+
+        updated_response = self.run_decode()
+        self.assertNotEqual(origin_response[:32], updated_response[:32])
+
+        ret = self.run_update_weights(origin_model_path)
+        self.assertTrue(ret["success"])
+        updated_model_path = self.get_model_info()
+        self.assertEqual(updated_model_path, origin_model_path)
+
+        updated_response = self.run_decode()
+        self.assertEqual(origin_response[:32], updated_response[:32])
+
+    def test_update_weights_unexist_model(self):
+        origin_model_path = self.get_model_info()
+        print(f"[Server Mode] origin_model_path: {origin_model_path}")
+        origin_response = self.run_decode()
+
+        new_model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST.replace("-Instruct", "wrong")
+        ret = self.run_update_weights(new_model_path)
+        self.assertFalse(ret["success"])
+
+        updated_model_path = self.get_model_info()
+        print(f"[Server Mode] updated_model_path: {updated_model_path}")
+        self.assertEqual(updated_model_path, origin_model_path)
+
+        updated_response = self.run_decode()
+        self.assertEqual(origin_response[:32], updated_response[:32])
+
+
+class TestServerUpdateWeightsFromDiskAbortAllRequests(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--max-running-requests", 8],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(self, max_new_tokens=32):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                    "ignore_eos": True,
+                },
+            },
+        )
+        return response.json()
+
+    def get_model_info(self):
+        response = requests.get(self.base_url + "/get_model_info")
+        model_path = response.json()["model_path"]
+        print(json.dumps(response.json()))
+        return model_path
+
+    def run_update_weights(self, model_path, abort_all_requests=False):
+        response = requests.post(
+            self.base_url + "/update_weights_from_disk",
+            json={
+                "model_path": model_path,
+                "abort_all_requests": abort_all_requests,
+            },
+        )
+        ret = response.json()
+        print(json.dumps(ret))
+        return ret
+
+    def test_update_weights_abort_all_requests(self):
+        origin_model_path = self.get_model_info()
+        print(f"[Server Mode] origin_model_path: {origin_model_path}")
+
+        num_requests = 32
+        with ThreadPoolExecutor(num_requests) as executor:
+            futures = [
+                executor.submit(self.run_decode, 16000) for _ in range(num_requests)
+            ]
+
+            # ensure the decode has been started
+            time.sleep(2)
+
+            new_model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST.replace("-Instruct", "")
+            ret = self.run_update_weights(new_model_path, abort_all_requests=True)
+            self.assertTrue(ret["success"])
+
+            for future in as_completed(futures):
+                self.assertEqual(
+                    future.result()["meta_info"]["finish_reason"]["type"], "abort"
+                )
+
+        updated_model_path = self.get_model_info()
+        print(f"[Server Mode] updated_model_path: {updated_model_path}")
+        self.assertEqual(updated_model_path, new_model_path)
+        self.assertNotEqual(updated_model_path, origin_model_path)
+
+
+###############################################################################
+# Parameterized Tests for update_weights_from_disk
+# Test coverage is determined based on the value of is_in_ci:
+# - In a CI environment: randomly select one mode (Engine or Server) and test only with tp=1, dp=1.
+# - In a non-CI environment: test both Engine and Server modes, and enumerate all combinations
+#   with tp and dp ranging from 1 to 2.
+###############################################################################
+class TestUpdateWeightsFromDiskParameterized(CustomTestCase):
+    def run_common_test(self, mode, tp, dp):
+        """
+        Common test procedure for update_weights_from_disk.
+        For Engine mode, we instantiate the engine with tp_size=tp.
+        For Server mode, we launch the server with additional arguments for tp (dp is not used in server launch here).
+        """
+        if mode == "Engine":
+            # Instantiate engine with additional parameter tp_size.
+            print(f"[Parameterized Engine] Testing with tp={tp}, dp={dp}")
+            engine = sgl.Engine(
+                model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                random_seed=42,
+                tp_size=tp,
+                # dp parameter is not explicitly used in this API.
+            )
+            try:
+                origin_response = self._engine_update_weights_test(engine)
+            finally:
+                engine.shutdown()
+        elif mode == "Server":
+            print(f"[Parameterized Server] Testing with tp={tp}, dp={dp}")
+            # Pass additional arguments to launch the server.
+            base_args = ["--tp-size", str(tp)]
+            process = popen_launch_server(
+                DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                DEFAULT_URL_FOR_TEST,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=base_args,
+            )
+            try:
+                origin_response = self._server_update_weights_test(DEFAULT_URL_FOR_TEST)
+            finally:
+                kill_process_tree(process.pid)
+        else:
+            raise ValueError(f"Unknown mode: {mode}")
+
+    def _engine_update_weights_test(self, engine):
+        # Run the update weights test on the given engine instance.
+        def run_decode():
+            prompts = ["The capital of France is"]
+            sampling_params = {"temperature": 0, "max_new_tokens": 32}
+            outputs = engine.generate(prompts, sampling_params)
+            print("=" * 100)
+            print(
+                f"[Parameterized Engine] Prompt: {prompts[0]}\nGenerated text: {outputs[0]['text']}"
+            )
+            return outputs[0]["text"]
+
+        def run_update_weights(model_path):
+            ret = engine.update_weights_from_disk(model_path)
+            print(json.dumps(ret))
+            return ret
+
+        origin_response = run_decode()
+        new_model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST.replace("-Instruct", "")
+        ret = run_update_weights(new_model_path)
+        self.assertTrue(ret[0])
+        updated_response = run_decode()
+        self.assertNotEqual(origin_response[:32], updated_response[:32])
+        ret = run_update_weights(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+        self.assertTrue(ret[0])
+        reverted_response = run_decode()
+        self.assertEqual(origin_response[:32], reverted_response[:32])
+        return origin_response
+
+    def _server_update_weights_test(self, base_url):
+        def run_decode():
+            response = requests.post(
+                base_url + "/generate",
+                json={
+                    "text": "The capital of France is",
+                    "sampling_params": {"temperature": 0, "max_new_tokens": 32},
+                },
+            )
+            print("=" * 100)
+            print(f"[Parameterized Server] Generated text: {response.json()['text']}")
+            return response.json()["text"]
+
+        def get_model_info():
+            response = requests.get(base_url + "/get_model_info")
+            model_path = response.json()["model_path"]
+            print(json.dumps(response.json()))
+            return model_path
+
+        def run_update_weights(model_path):
+            response = requests.post(
+                base_url + "/update_weights_from_disk",
+                json={"model_path": model_path},
+            )
+            ret = response.json()
+            print(json.dumps(ret))
+            return ret
+
+        origin_model_path = get_model_info()
+        origin_response = run_decode()
+        new_model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST.replace("-Instruct", "")
+        ret = run_update_weights(new_model_path)
+        self.assertTrue(ret["success"])
+        updated_model_path = get_model_info()
+        self.assertEqual(updated_model_path, new_model_path)
+        self.assertNotEqual(updated_model_path, origin_model_path)
+        updated_response = run_decode()
+        self.assertNotEqual(origin_response[:32], updated_response[:32])
+        ret = run_update_weights(origin_model_path)
+        self.assertTrue(ret["success"])
+        updated_model_path = get_model_info()
+        self.assertEqual(updated_model_path, origin_model_path)
+        reverted_response = run_decode()
+        self.assertEqual(origin_response[:32], reverted_response[:32])
+        return origin_response
+
+    def test_parameterized_update_weights(self):
+        if is_in_ci():
+            # In CI, choose one random mode (Engine or Server) with tp=1, dp=1.
+            mode = random.choice(["Engine", "Server"])
+            test_suits = [(1, 1, mode)]
+        else:
+            # Otherwise, test both modes and enumerate tp,dp combinations from 1 to 2.
+            test_suits = []
+            for mode in ["Engine", "Server"]:
+                for tp in [1, 2]:
+                    for dp in [1, 2]:
+                        test_suits.append((tp, dp, mode))
+        for tp, dp, mode in test_suits:
+            with self.subTest(mode=mode, tp=tp, dp=dp):
+                self.run_common_test(mode, tp, dp)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/rl/test_update_weights_from_distributed.py b/test/srt/rl/test_update_weights_from_distributed.py
new file mode 100644
index 000000000..a3b938c38
--- /dev/null
+++ b/test/srt/rl/test_update_weights_from_distributed.py
@@ -0,0 +1,615 @@
+"""Test distributed weight updates.
+
+This test suite simulates a distributed training environment to ensure
+correct weight synchronization. On rank 0, the instruct model represents
+pre-training weights, and the base model represents post-training weights.
+The base model's weights are broadcasted to other ranks using the online
+weight update API.
+
+On other ranks, an engine is initialized with the instruct model, and its
+parameters are verified against the Hugging Face model. After updating
+weights from the distributed system, post-training weights are loaded
+and verified again to ensure consistency and accuracy across the
+distributed setup.
+"""
+
+import gc
+import os
+import random
+import time
+import unittest
+
+import numpy as np
+import requests
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from transformers import AutoModelForCausalLM
+
+import sglang as sgl
+from sglang.srt.utils import init_custom_process_group
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+from sglang.utils import terminate_process
+
+mp.set_start_method("spawn", force=True)
+
+
+def verify_params_close(params1, params2, error_msg):
+    """Verify if two parameter arrays are close enough."""
+    try:
+        assert np.allclose(np.array(params1), np.array(params2)), error_msg
+    except Exception as e:
+        print(f"Parameters not close for {error_msg}")
+        print("Params1:", np.array(params1))
+        print("Params2:", np.array(params2))
+        raise e
+
+
+def verify_params_not_close(params1, params2, error_msg):
+    """Verify if two parameter arrays are different enough."""
+    assert not np.allclose(np.array(params1), np.array(params2)), error_msg
+
+
+def init_process(
+    rank,
+    world_size,
+    param_queue,
+    truncate_size,
+    state_dict_key_to_shape,
+    tp_size,
+    model_name,
+    backend,
+    checking_parameters,
+    tie_word_embeddings,
+):
+    torch.cuda.set_device(rank)
+
+    if rank == 0:
+        init_process_hf(
+            rank,
+            world_size,
+            param_queue,
+            truncate_size,
+            model_name,
+            checking_parameters,
+            tie_word_embeddings,
+            state_dict_key_to_shape,
+        )
+    elif rank in [1, 2]:
+        init_process_sgl(
+            rank,
+            world_size,
+            param_queue,
+            truncate_size,
+            model_name,
+            checking_parameters,
+            tie_word_embeddings,
+            state_dict_key_to_shape,
+            backend,
+            tp_size,
+        )
+
+
+def init_process_hf(
+    rank,
+    world_size,
+    param_queue,
+    truncate_size,
+    model_name,
+    checking_parameters,
+    tie_word_embeddings,
+    state_dict_key_to_shape,
+):
+    # These two environment variables are very important
+    # to avoid unexpected behaviors of CUDA and NCCL.
+    os.environ["NCCL_CUMEM_ENABLE"] = "0"
+    os.environ["NCCL_NVLS_ENABLE"] = "0"
+
+    # Load model and get parameters
+    hf_instruct_model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype="bfloat16",
+        tie_word_embeddings=tie_word_embeddings,
+    ).to("cuda:0")
+    base_model_name = model_name.replace("-Instruct", "")
+    hf_base_model = AutoModelForCausalLM.from_pretrained(
+        base_model_name,
+        torch_dtype="bfloat16",
+        tie_word_embeddings=tie_word_embeddings,
+    ).to("cuda:0")
+
+    hf_instruct_params = []
+    hf_base_params = []
+
+    print("[hf] get parameter in hf instruct model and base model")
+    for parameter_name in checking_parameters:
+        hf_instruct_params.append(
+            hf_instruct_model.get_parameter(parameter_name)[:truncate_size]
+            .cpu()
+            .detach()
+            .float()
+            .numpy()
+            .tolist()
+        )
+        hf_base_params.append(
+            hf_base_model.get_parameter(parameter_name)[:truncate_size]
+            .cpu()
+            .detach()
+            .float()
+            .numpy()
+            .tolist()
+        )
+
+    param_queue.put(("hf_instruct_params", hf_instruct_params))
+    param_queue.put(("hf_base_params", hf_base_params))
+
+    # Init weight update group for rank 0 (the training engine in RLHF).
+    port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+    init_method = f"tcp://localhost:{port}"
+    print(f"[hf] {rank=} {world_size=} init custom process group. {init_method=}")
+    group = init_custom_process_group(
+        backend="nccl",
+        init_method=init_method,
+        world_size=world_size,
+        rank=rank,
+        group_name="test_parameter_update_group",
+    )
+    torch.cuda.synchronize()
+    time_begin_broadcast = time.perf_counter()
+
+    # The last parameter is lm_head.weight, which is tied
+    # with embed_tokens.weight. Actually, we only need
+    # to broadcast embed_tokens.weight once.
+    broadcast_parameters = list(state_dict_key_to_shape.keys())
+    if tie_word_embeddings:
+        broadcast_parameters.remove("lm_head.weight")
+
+    # Broadcast all the weights from the training
+    # engine to other ranks (inference engine).
+    for parameter_name in broadcast_parameters:
+        torch.distributed.broadcast(
+            hf_base_model.get_parameter(parameter_name),
+            src=0,
+            group=group,
+        )
+    torch.cuda.synchronize()
+    time_end_broadcast = time.perf_counter()
+
+    # Measure the latency of broadcasting/weights update.
+    broadcast_time = time_end_broadcast - time_begin_broadcast
+    print(f"[hf] {rank=} {broadcast_time=:.3f}s")
+    param_queue.put(("broadcast_time", broadcast_time))
+
+    # Delete the huggingface models to free up memory.
+    del hf_instruct_model
+    del hf_base_model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+def init_process_sgl(
+    rank,
+    world_size,
+    param_queue,
+    truncate_size,
+    model_name,
+    checking_parameters,
+    tie_word_embeddings,
+    state_dict_key_to_shape,
+    backend,
+    tp_size,
+):
+    torch.cuda.set_device(rank)
+    torch.cuda.synchronize()
+    base_gpu_id = 1 if rank == 1 else 1 + tp_size
+    if backend == "Engine":
+        print(f"[sgl] rank {rank} init engine")
+        engine = sgl.Engine(
+            model_path=model_name,
+            base_gpu_id=base_gpu_id,
+            tp_size=tp_size,
+            cuda_graph_max_bs=2,
+        )
+    else:
+        if rank == 1:
+            url = DEFAULT_URL_FOR_TEST
+        else:
+            host, _, port = DEFAULT_URL_FOR_TEST.rpartition(":")
+            url = ":".join([host, str(int(port) + 10000)])
+
+        print(f"[sgl] rank {rank} init server on url: {url}")
+        process = popen_launch_server(
+            model_name,
+            url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=(
+                "--base-gpu-id",
+                str(base_gpu_id),
+                "--tp-size",
+                str(tp_size),
+                "--cuda-graph-max-bs",
+                2,
+            ),
+        )
+    torch.cuda.synchronize()
+
+    # Get weights of instruct model, i.e. pre-training weights.
+    instruct_params = []
+    for parameter_name in checking_parameters:
+        instruct_params.append(
+            engine.get_weights_by_name(parameter_name, truncate_size)
+            if backend == "Engine"
+            else requests.get(
+                f"{url}/get_weights_by_name",
+                json={"name": parameter_name, "truncate_size": truncate_size},
+            ).json()
+        )
+
+    param_queue.put((f"sgl_dp_{rank}_instruct_params", instruct_params))
+
+    port = 60000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+
+    # Init weight update group with the training engine.
+    if backend == "Engine":
+        engine.init_weights_update_group(
+            master_address="localhost",
+            master_port=str(port),
+            rank_offset=base_gpu_id,
+            world_size=world_size,
+            group_name="test_parameter_update_group",
+            backend="nccl",
+        )
+    else:
+        requests.post(
+            f"{url}/init_weights_update_group",
+            json={
+                "master_address": "localhost",
+                "master_port": str(port),
+                "rank_offset": base_gpu_id,
+                "world_size": world_size,
+                "group_name": "test_parameter_update_group",
+                "backend": "nccl",
+            },
+        )
+
+    torch.cuda.synchronize()
+    time_begin_update = time.perf_counter()
+
+    # The last parameter is lm_head.weight, which is tied
+    # with embed_tokens.weight. Actually, we only need
+    # to update embed_tokens.weight once.
+    tie_word_embeddings = (
+        True if model_name == DEFAULT_SMALL_MODEL_NAME_FOR_TEST else False
+    )
+    update_parameters = list(state_dict_key_to_shape.keys())
+    if tie_word_embeddings:
+        update_parameters.remove("lm_head.weight")
+
+    # Get weights from the training engine and update the inference engine.
+    names = [parameter_name for parameter_name in update_parameters]
+    dtypes = [torch.bfloat16 if backend == "Engine" else "bfloat16"] * len(names)
+    shapes = [state_dict_key_to_shape[parameter_name] for parameter_name in names]
+
+    if backend == "Engine":
+        engine.update_weights_from_distributed(
+            names,
+            dtypes=dtypes,
+            shapes=shapes,
+            group_name="test_parameter_update_group",
+        )
+    else:
+        requests.post(
+            f"{url}/update_weights_from_distributed",
+            json={
+                "names": names,
+                "dtypes": dtypes,
+                "shapes": shapes,
+                "group_name": "test_parameter_update_group",
+            },
+        )
+    torch.cuda.synchronize()
+    time_end_update = time.perf_counter()
+
+    # Measure the latency of broadcast/weights update.
+    update_time = time_end_update - time_begin_update
+    print(
+        f"[sgl] fully update model_name {model_name} rank {rank} parameter from distributed time: {update_time:.3f}s"
+    )
+    param_queue.put((f"update_sgl_dp_{rank}_time", update_time))
+
+    # Get the weights of post-training model after weights update for correctness check.
+    base_params = []
+    for parameter_name in checking_parameters:
+        if backend == "Engine":
+            base_params.append(
+                engine.get_weights_by_name(parameter_name, truncate_size)
+            )
+        else:
+            base_params.append(
+                requests.get(
+                    f"{url}/get_weights_by_name",
+                    json={
+                        "name": parameter_name,
+                        "truncate_size": truncate_size,
+                    },
+                ).json()
+            )
+    param_queue.put((f"sgl_dp_{rank}_base_params", base_params))
+
+    # Shutdown the engine or terminate the server process.
+    if backend == "Engine":
+        engine.shutdown()
+    else:
+        terminate_process(process)
+
+
+def assert_tied_weights(params_list, message, should_be_tied):
+    for params in params_list:
+        if should_be_tied:
+            assert np.allclose(params[0], params[-1]), message
+        else:
+            assert not np.allclose(params[0], params[-1]), message
+
+
+def test_update_weights_from_distributed(
+    tp_size,
+    dp_size,
+    model_name,
+    backend,
+    state_dict_key_to_shape,
+    truncate_size,
+    checking_parameters,
+):
+    tie_word_embeddings = (
+        True if model_name == DEFAULT_SMALL_MODEL_NAME_FOR_TEST else False
+    )
+
+    print(
+        f"Testing model: {model_name} tp_size: {tp_size}, dp_size: {dp_size} backend: {backend}"
+    )
+    param_queue = mp.Queue()
+    results = {}
+
+    context = mp.spawn(
+        init_process,
+        args=(
+            1 + tp_size * dp_size,
+            param_queue,
+            truncate_size,
+            state_dict_key_to_shape,
+            tp_size,
+            model_name,
+            backend,
+            checking_parameters,
+            tie_word_embeddings,
+        ),
+        nprocs=1 + dp_size,
+        join=False,
+    )
+
+    while len(results) < 3 * (1 + dp_size):
+        try:
+            key, value = param_queue.get(timeout=5)
+            results[key] = value
+        except Exception as e:
+            if all(not p.is_alive() for p in context.processes):
+                break
+
+    context.join()
+
+    if len(results) != 3 * (1 + dp_size):
+        raise RuntimeError(
+            f"Expected {3 * (1 + dp_size)} parameters but got {len(results)}"
+        )
+
+    params = {
+        "hf_instruct": results.get("hf_instruct_params"),
+        "hf_base": results.get("hf_base_params"),
+        "sgl_dp_1_instruct": results.get("sgl_dp_1_instruct_params"),
+        "sgl_dp_1_base": results.get("sgl_dp_1_base_params"),
+        "broadcast_time": results.get("broadcast_time"),
+        "update_sgl_dp_1_time": results.get("update_sgl_dp_1_time"),
+    }
+
+    if dp_size == 2:
+        dp2_params = {
+            "sgl_dp_2_instruct": results.get("sgl_dp_2_instruct_params"),
+            "sgl_dp_2_base": results.get("sgl_dp_2_base_params"),
+            "update_sgl_dp_2_time": results.get("update_sgl_dp_2_time"),
+        }
+        assert all(v is not None for v in dp2_params.values())
+        params.update(dp2_params)
+
+    # Check the correctness of weights update by verifying
+    # the weights of instruct model and base model.
+    for i in range(len(params["hf_instruct"])):
+        verify_params_close(
+            params["hf_instruct"][i],
+            params["sgl_dp_1_instruct"][i],
+            f"sgl_dp_1_instruct_params rank {i}",
+        )
+
+        verify_params_close(
+            params["hf_base"][i],
+            params["sgl_dp_1_base"][i],
+            f"sgl_dp_1_base_params rank {i}",
+        )
+
+        verify_params_not_close(
+            params["hf_instruct"][i],
+            params["hf_base"][i],
+            f"hf_instruct_params rank {i}",
+        )
+
+        if dp_size == 2:
+            verify_params_close(
+                params["hf_base"][i],
+                params["sgl_dp_2_base"][i],
+                f"sgl_dp_2_base_params rank {i}",
+            )
+            verify_params_close(
+                params["hf_instruct"][i],
+                params["sgl_dp_2_instruct"][i],
+                f"sgl_dp_2_instruct_params rank {i}",
+            )
+
+    assert len(params["hf_instruct"]) == len(
+        params["hf_base"]
+    ), "hf_instruct_params and hf_base_params have different lengths"
+
+    # Check if the weights of lm_head are tied with embed_tokens.
+    params_to_check = [
+        (
+            params["hf_instruct"],
+            "lm_head.weight is not tied with embed_tokens.weight",
+        ),
+        (
+            params["hf_base"],
+            "lm_head.weight is not tied with embed_tokens.weight",
+        ),
+        (
+            params["sgl_dp_1_instruct"],
+            "lm_head.weight is not tied with embed_tokens.weight",
+        ),
+        (
+            params["sgl_dp_1_base"],
+            "lm_head.weight is not tied with embed_tokens.weight",
+        ),
+    ]
+
+    if dp_size == 2:
+        params_to_check.extend(
+            [
+                (
+                    params["sgl_dp_2_instruct"],
+                    "lm_head.weight is not tied with embed_tokens.weight",
+                ),
+                (
+                    params["sgl_dp_2_base"],
+                    "lm_head.weight is not tied with embed_tokens.weight",
+                ),
+            ]
+        )
+
+    assert_tied_weights(
+        [params for params, _ in params_to_check],
+        (
+            "lm_head.weight is not tied with embed_tokens.weight"
+            if tie_word_embeddings
+            else "lm_head.weight is tied with embed_tokens.weight"
+        ),
+        tie_word_embeddings,
+    )
+
+    # Time limit for broadcast and update on CI is 3 / 6
+    # On local H100, it's 1 / 2
+    time_limit = 3 if model_name == DEFAULT_SMALL_MODEL_NAME_FOR_TEST else 6
+
+    assert (
+        params["broadcast_time"] < time_limit
+    ), f"broadcast_time exceeds time limit {time_limit}s"
+
+    assert (
+        params["update_sgl_dp_1_time"] < time_limit
+    ), f"update_sgl_dp_one_time exceeds time limit {time_limit}s"
+
+    if dp_size == 2:
+        assert (
+            params["update_sgl_dp_2_time"] < time_limit
+        ), f"update_sgl_dp_two_time exceeds time limit {time_limit}s"
+
+    # Delete the context and close the parameter queue.
+    del context
+    param_queue.close()
+    param_queue.join_thread()
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+class TestUpdateWeightsFromDistributed(CustomTestCase):
+
+    def test_update_weights_from_distributed(self):
+
+        assert torch.cuda.device_count() >= 2, "At least 2 GPUs are required"
+        # test_suits : tp, dp, model_name, backend
+        if is_in_ci():
+            mode = random.choice(["Engine", "Server"])
+            test_suits = [
+                (1, 1, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, mode),
+            ]
+        else:
+            test_suits = [
+                (1, 1, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, "Engine"),
+                (1, 1, DEFAULT_MODEL_NAME_FOR_TEST, "Sever"),
+            ]
+
+            if torch.cuda.device_count() >= 4:
+                test_suits.extend(
+                    [
+                        (2, 1, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, "Engine"),
+                        (1, 2, DEFAULT_MODEL_NAME_FOR_TEST, "Server"),
+                    ]
+                )
+
+            if torch.cuda.device_count() >= 5:
+                test_suits.extend(
+                    [
+                        (2, 2, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, "Engine"),
+                        (2, 2, DEFAULT_MODEL_NAME_FOR_TEST, "Server"),
+                    ]
+                )
+
+        model_state_dict_shapes = {}
+        test_models = [test_suit[2] for test_suit in test_suits]
+
+        for model_name in test_models:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name, torch_dtype="bfloat16"
+            ).to("cuda:0")
+            state_dict = model.state_dict()
+            state_dict_keys = list(state_dict.keys())
+            model_state_dict_shapes[model_name] = {
+                key: state_dict[key].shape for key in state_dict_keys
+            }
+            del model
+            gc.collect()
+            torch.cuda.empty_cache()
+
+        truncate_size = 10
+        checking_parameters = [
+            "model.embed_tokens.weight",
+            "model.layers.0.input_layernorm.weight",
+            "model.layers.1.self_attn.q_proj.weight",
+            "model.layers.2.self_attn.k_proj.weight",
+            "model.layers.3.self_attn.v_proj.weight",
+            "model.layers.4.self_attn.o_proj.weight",
+            "model.layers.5.mlp.gate_proj.weight",
+            "model.layers.6.mlp.up_proj.weight",
+            "model.layers.7.mlp.down_proj.weight",
+            "model.layers.8.post_attention_layernorm.weight",
+            "model.norm.weight",
+            "lm_head.weight",
+        ]
+
+        for tp_size, dp_size, model_name, backend in test_suits:
+            test_update_weights_from_distributed(
+                tp_size,
+                dp_size,
+                model_name,
+                backend,
+                model_state_dict_shapes[model_name],
+                truncate_size,
+                checking_parameters,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/rl/test_update_weights_from_tensor.py b/test/srt/rl/test_update_weights_from_tensor.py
new file mode 100644
index 000000000..0dc947b60
--- /dev/null
+++ b/test/srt/rl/test_update_weights_from_tensor.py
@@ -0,0 +1,178 @@
+import gc
+import time
+import unittest
+
+import torch
+
+import sglang as sgl
+from sglang.srt.weight_sync.tensor_bucket import FlattenedTensorBucket
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
+
+
+def test_update_weights_from_tensor(tp_size):
+    assert torch.cuda.device_count() >= tp_size, f"At least {tp_size} GPUs are required"
+    torch.cuda.empty_cache()
+
+    engine = sgl.Engine(model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST, tp_size=tp_size)
+
+    param_names = [f"model.layers.{i}.mlp.up_proj.weight" for i in range(6, 16)]
+
+    _check_param(engine, param_names[0], [0.0087, -0.0214, -0.0004, 0.0039, 0.0110])
+
+    memory_before = torch.cuda.memory_allocated()
+    new_tensor = torch.full((16384, 2048), 1.5, device="cuda")
+
+    time_start = time.perf_counter()
+    engine.update_weights_from_tensor([(x, new_tensor) for x in param_names])
+    print(f"Time delta: {time.perf_counter() - time_start:.03f}")
+
+    for param_name in param_names[:3]:
+        _check_param(engine, param_name, [1.5] * 5)
+
+    engine.shutdown()
+
+    del new_tensor
+    gc.collect()
+    torch.cuda.ipc_collect()
+    torch.cuda.empty_cache()
+    memory_after = torch.cuda.memory_allocated()
+    assert (
+        memory_after <= memory_before + 1024
+    ), f"Memory leak detected: {memory_after - memory_before} bytes"
+
+
+class TestUpdateWeightsFromTensor(CustomTestCase):
+    def test_update_weights_from_tensor(self):
+        tp_sizes = [1, 2]
+        for tp_size in tp_sizes:
+            if torch.cuda.device_count() < tp_size:
+                continue
+
+            with self.subTest(tp_size=tp_size):
+                test_update_weights_from_tensor(tp_size)
+
+    def test_update_weights_from_tensor_load_format_direct(self):
+        engine = sgl.Engine(model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+        write_param_names = [
+            f"model.layers.{i}.self_attn.qkv_proj.weight" for i in range(6, 16)
+        ]
+        read_param_names = [
+            f"model.layers.{i}.self_attn.k_proj.weight" for i in range(6, 16)
+        ]
+
+        _check_param(
+            engine, read_param_names[0], [-0.0198, 0.0227, 0.0168, 0.0232, -0.0178]
+        )
+
+        new_tensor = torch.full((3072, 2048), 1.5)
+        engine.update_weights_from_tensor(
+            [
+                (write_param_name, new_tensor.clone())
+                for write_param_name in write_param_names
+            ],
+            load_format="direct",
+        )
+
+        for read_param_name in read_param_names[:3]:
+            _check_param(engine, read_param_name, [1.5] * 5)
+
+        engine.shutdown()
+
+    def test_update_weights_from_tensor_load_format_custom(self):
+        custom_loader_name = (
+            "sglang.srt.model_executor.model_runner._model_load_weights_direct"
+        )
+        engine = sgl.Engine(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            custom_weight_loader=[custom_loader_name],
+        )
+
+        write_param_names = [
+            f"model.layers.{i}.self_attn.qkv_proj.weight" for i in range(6, 16)
+        ]
+        read_param_names = [
+            f"model.layers.{i}.self_attn.k_proj.weight" for i in range(6, 16)
+        ]
+
+        _check_param(
+            engine, read_param_names[0], [-0.0198, 0.0227, 0.0168, 0.0232, -0.0178]
+        )
+
+        new_tensor = torch.full((3072, 2048), 1.5)
+        engine.update_weights_from_tensor(
+            [
+                (write_param_name, new_tensor.clone())
+                for write_param_name in write_param_names
+            ],
+            load_format=custom_loader_name,
+        )
+
+        for read_param_name in read_param_names[:3]:
+            _check_param(engine, read_param_name, [1.5] * 5)
+
+        engine.shutdown()
+
+    def test_update_weights_from_tensor_load_format_flattened_bucket(self):
+        """Test updating weights using flattened_bucket format"""
+        engine = sgl.Engine(model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+        # Create a small set of parameters for testing
+        param_names = [f"model.layers.{i}.mlp.up_proj.weight" for i in range(6, 10)]
+
+        # Check original values
+        _check_param(engine, param_names[0], [0.0087, -0.0214, -0.0004, 0.0039, 0.0110])
+
+        # Create new tensors with different values
+        new_tensors = []
+        for _, name in enumerate(param_names):
+            # Create tensors with different values for each parameter
+            value = 2.0  # Different value for each parameter
+            new_tensor = torch.full((16384, 2048), value, device="cuda")
+            new_tensors.append((name, new_tensor))
+
+        # Create a flattened bucket
+        flattened_bucket = FlattenedTensorBucket(named_tensors=new_tensors)
+
+        # Extract the flattened tensor and metadata in the format expected by model_runner
+        flattened_tensor = flattened_bucket.get_flattened_tensor()
+        metadata = flattened_bucket.get_metadata()
+
+        # Create the dict format expected by _update_weights_from_flattened_bucket
+        bucket_dict = {"flattened_tensor": flattened_tensor, "metadata": metadata}
+
+        # Serialize the bucket data
+        from sglang.srt.utils import MultiprocessingSerializer
+
+        serialized_bucket = MultiprocessingSerializer.serialize(
+            bucket_dict, output_str=True
+        )
+
+        # Create a list where each rank contains the same serialized data
+        # This simulates the distributed environment where each rank has the same data
+        serialized_bucket_list = [serialized_bucket]
+
+        # Update weights using flattened_bucket format
+        time_start = time.perf_counter()
+        engine.update_weights_from_tensor(
+            named_tensors=serialized_bucket_list, load_format="flattened_bucket"
+        )
+        update_time = time.perf_counter() - time_start
+        print(f"Flattened bucket update time: {update_time:.03f}")
+
+        # Verify the weights were updated correctly
+        for i, param_name in enumerate(param_names):
+            _check_param(engine, param_name, [2.0] * 5)
+
+        engine.shutdown()
+
+
+def _check_param(engine, param_name, expect_values):
+    actual_values = torch.tensor(engine.get_weights_by_name(param_name))[0, :5]
+    assert torch.allclose(
+        actual_values, torch.tensor(expect_values), atol=0.002
+    ), f"{actual_values=}"
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/rl/test_verl_engine_2_gpu.py b/test/srt/rl/test_verl_engine_2_gpu.py
new file mode 100644
index 000000000..40321ee3f
--- /dev/null
+++ b/test/srt/rl/test_verl_engine_2_gpu.py
@@ -0,0 +1,277 @@
+import multiprocessing
+import multiprocessing as mp
+import os
+import random
+import traceback
+import unittest
+from multiprocessing import Process
+
+import torch
+from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp import CPUOffload
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import MixedPrecision
+from torch.distributed.fsdp.api import (
+    ShardedStateDictConfig,
+    ShardingStrategy,
+    StateDictType,
+)
+from transformers import AutoModelForCausalLM
+
+from sglang.srt.entrypoints.verl_engine import VerlEngine
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import is_port_available
+from sglang.test.runners import (
+    HFRunner,
+    SRTRunner,
+    check_close_model_outputs,
+    get_dtype_str,
+)
+from sglang.test.test_utils import CustomTestCase, find_available_port, is_in_ci
+
+_MAX_NEW_TOKENS = 8
+_PROMPTS = ["1+1=2, 1+2=3, 1+3=4, 1+4=5, 1+5=", "1*1=1, 1*2=2, 1*3=3, 1*4=4, 1*5="]
+_TORCH_DTYPE = torch.float16
+
+# Set to false to temporarily debug issues unrelated to weight update
+_ENABLE_UPDATE_WEIGHTS = True
+# _ENABLE_UPDATE_WEIGHTS = False
+
+# TODO maybe we should add more other models? should we keep it in sync with test_generation_models.py?
+ALL_MODELS = [
+    dict(model_path="meta-llama/Llama-3.2-1B-Instruct"),
+    dict(model_path="Qwen/Qwen2-1.5B"),
+    dict(model_path="allenai/OLMo-1B-0724-hf"),
+    dict(model_path="allenai/OLMo-2-1124-7B-Instruct"),
+    dict(
+        model_path="ibm-granite/granite-3.0-2b-instruct",
+        prefill_tolerance=0.22,
+        decode_tolerance=0.22,
+    ),
+]
+
+
+class TestVerlEngine(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        multiprocessing.set_start_method("spawn")
+
+    def assert_fragment_e2e_execution(
+        self,
+        index: int,
+        model_path: str,
+        mem_fraction_static: float = 0.4,
+        dp_size: int = 1,
+        tp_size: int = 2,
+        tight_memory: bool = False,
+        prefill_tolerance: float = 0.1,
+        decode_tolerance: float = 0.1,
+    ):
+        master_port = find_available_port(23456)
+
+        print(f"assert_fragment_e2e_execution START {index=} {model_path=}")
+
+        processes = []
+        output_reader, output_writer = mp.Pipe(duplex=False)
+        world_size = dp_size * tp_size
+        for rank in range(world_size):
+            p = Process(
+                target=_run_subprocess,
+                kwargs=dict(
+                    rank=rank,
+                    dp_size=dp_size,
+                    tp_size=tp_size,
+                    master_port=master_port,
+                    output_writer=output_writer,
+                    model_path=model_path,
+                    mem_fraction_static=mem_fraction_static,
+                    tight_memory=tight_memory,
+                    prefill_tolerance=prefill_tolerance,
+                    decode_tolerance=decode_tolerance,
+                ),
+            )
+            p.start()
+            processes.append(p)
+
+        for _ in range(tp_size):
+            self.assertTrue(
+                output_reader.recv(),
+                f"Subprocess has error, please see logs above. ({index=} {model_path=})",
+            )
+
+        for p in processes:
+            p.join()
+
+    def test_ci_models(self):
+        ci_models = [random.choice(ALL_MODELS)]
+        for index, model_info in enumerate(ci_models):
+            self.assert_fragment_e2e_execution(index=index, **model_info)
+
+    def test_others(self):
+        if is_in_ci():
+            return
+
+        for index, model_info in enumerate(ALL_MODELS):
+            self.assert_fragment_e2e_execution(index=index, **model_info)
+
+    # def test_adhoc(self):
+    #     self.assert_fragment_e2e_execution(index=0, model_path="meta-llama/Llama-3.2-1B-Instruct")
+
+
+def _run_subprocess(
+    rank: int,
+    dp_size: int,
+    tp_size: int,
+    master_port: int,
+    output_writer,
+    model_path: str,
+    mem_fraction_static: float,
+    tight_memory: bool,
+    prefill_tolerance: float,
+    decode_tolerance: float,
+):
+    try:
+        print(f"subprocess[{rank=}] Start {os.environ.get('CUDA_VISIBLE_DEVICES')=}")
+
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(master_port)
+        torch.distributed.init_process_group(rank=rank, world_size=dp_size * tp_size)
+        torch.cuda.set_device(rank)
+
+        base_gpu_id = rank // tp_size * tp_size
+
+        mesh_kwargs = dict(
+            mesh_shape=(dp_size, tp_size, 1), mesh_dim_names=["dp", "tp", "pp"]
+        )
+        inference_device_mesh_device = init_device_mesh("cuda", **mesh_kwargs)
+        inference_device_mesh_cpu = init_device_mesh("cpu", **mesh_kwargs)
+        print(
+            f"subprocess[{rank=},{base_gpu_id=}] {inference_device_mesh_device=} {inference_device_mesh_cpu=}"
+        )
+
+        # hf model is used for comparison
+        hf_model = AutoModelForCausalLM.from_pretrained(
+            model_path, torch_dtype=_TORCH_DTYPE, trust_remote_code=True
+        ).cuda()
+        hf_tokenizer = get_tokenizer(model_path, trust_remote_code=True)
+
+        hf_outputs = HFRunner.forward_generation_raw(
+            base_model=hf_model,
+            prompts=_PROMPTS,
+            max_new_tokens=_MAX_NEW_TOKENS,
+            tokenizer=hf_tokenizer,
+            lora_paths=None,
+            torch_dtype=_TORCH_DTYPE,
+            output_str_only=False,
+        )
+        print(
+            f"subprocess[{rank=}] call hf.forward {hf_outputs=}",
+            flush=True,
+        )
+
+        if _ENABLE_UPDATE_WEIGHTS:
+            if tight_memory:
+                hf_model.cpu()
+                torch.cuda.empty_cache()
+
+            # test update weights
+            print(f"subprocess[{rank=}] get_fsdp_state_dict", flush=True)
+            fsdp_state_dict = _get_fsdp_state_dict(
+                hf_model=hf_model, world_size=dp_size * tp_size
+            )
+
+        engine = VerlEngine(
+            model_path=model_path,
+            load_format="dummy" if _ENABLE_UPDATE_WEIGHTS else "auto",
+            mem_fraction_static=mem_fraction_static,
+            random_seed=42,
+            base_gpu_id=base_gpu_id,
+            trust_remote_code=True,
+            dtype=get_dtype_str(_TORCH_DTYPE),
+            device_mesh_cpu=inference_device_mesh_cpu["tp"],
+        )
+        print(f"subprocess[{rank=}] {engine=}", flush=True)
+
+        if _ENABLE_UPDATE_WEIGHTS:
+            print(f"subprocess[{rank=}] call update_weights_from_tensor", flush=True)
+            engine.update_weights_from_tensor(
+                [(k, v) for k, v in fsdp_state_dict.items()]
+            )
+
+        for enable_batch in [False, True]:
+            if enable_batch:
+                fn = SRTRunner.batch_forward_generation_raw
+            else:
+                fn = SRTRunner.forward_generation_raw
+
+            srt_outputs = fn(
+                prompts=_PROMPTS,
+                max_new_tokens=_MAX_NEW_TOKENS,
+                lora_paths=None,
+                engine=engine,
+            )
+            print(
+                f"subprocess[{rank=}] call srt.forward {enable_batch=} {srt_outputs=}",
+                flush=True,
+            )
+
+            check_close_model_outputs(
+                hf_outputs=hf_outputs,
+                srt_outputs=srt_outputs,
+                prefill_tolerance=prefill_tolerance,
+                decode_tolerance=decode_tolerance,
+                rouge_l_tolerance=1,
+                check_logprobs=not enable_batch,
+                debug_text=f"{enable_batch=} {rank=}",
+            )
+
+        execution_ok = True
+
+    except Exception as e:
+        print(f"subprocess[{rank=}] has error: {e}", flush=True)
+        traceback.print_exc()
+        execution_ok = False
+
+    output_writer.send(execution_ok)
+    output_writer.close()
+
+    if "engine" in locals() and engine is not None:
+        engine.shutdown()
+    print(f"subprocess[{rank=}] end", flush=True)
+
+
+# Adapted from https://github.com/volcengine/verl/blob/main/tests/rollout/run_fsdp_vllm.py
+def _get_fsdp_state_dict(hf_model, world_size: int):
+    device_mesh = init_device_mesh(
+        "cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]
+    )
+
+    mixed_precision = MixedPrecision(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+        buffer_dtype=torch.float32,
+    )
+    fsdp_model = FSDP(
+        hf_model,
+        use_orig_params=True,
+        auto_wrap_policy=None,
+        device_id=torch.cuda.current_device(),
+        sharding_strategy=ShardingStrategy.FULL_SHARD,
+        mixed_precision=mixed_precision,
+        cpu_offload=CPUOffload(offload_params=False),
+        sync_module_states=False,
+        device_mesh=device_mesh,
+    )
+    print(f"{fsdp_model=}")
+
+    FSDP.set_state_dict_type(
+        fsdp_model,
+        state_dict_type=StateDictType.SHARDED_STATE_DICT,
+        state_dict_config=ShardedStateDictConfig(),
+    )
+
+    return fsdp_model.state_dict()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/rl/test_verl_engine_4_gpu.py b/test/srt/rl/test_verl_engine_4_gpu.py
new file mode 100644
index 000000000..014f17daf
--- /dev/null
+++ b/test/srt/rl/test_verl_engine_4_gpu.py
@@ -0,0 +1,291 @@
+import multiprocessing
+import multiprocessing as mp
+import os
+import random
+import traceback
+import unittest
+from multiprocessing import Process
+
+import torch
+from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp import CPUOffload
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import MixedPrecision
+from torch.distributed.fsdp.api import (
+    ShardedStateDictConfig,
+    ShardingStrategy,
+    StateDictType,
+)
+from transformers import AutoModelForCausalLM
+
+from sglang.srt.entrypoints.verl_engine import VerlEngine
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import is_port_available
+from sglang.test.runners import (
+    HFRunner,
+    SRTRunner,
+    check_close_model_outputs,
+    get_dtype_str,
+)
+from sglang.test.test_utils import CustomTestCase, find_available_port, is_in_ci
+
+_MAX_NEW_TOKENS = 8
+_PROMPTS = ["1+1=2, 1+2=3, 1+3=4, 1+4=5, 1+5=", "1*1=1, 1*2=2, 1*3=3, 1*4=4, 1*5="]
+_TORCH_DTYPE = torch.float16
+
+# Set to false to temporarily debug issues unrelated to weight update
+_ENABLE_UPDATE_WEIGHTS = True
+# _ENABLE_UPDATE_WEIGHTS = False
+
+# TODO maybe we should add more other models? should we keep it in sync with test_generation_models.py?
+ALL_MODELS = [
+    dict(
+        model_path="Qwen/Qwen2.5-0.5B",
+        dp_size=2,
+        tp_size=2,  # default to 2
+    ),
+    dict(
+        model_path="Qwen/Qwen2.5-14B-Instruct",
+        mem_fraction_static=0.7,
+        dp_size=2,
+        tp_size=2,
+        tight_memory=True,
+        decode_tolerance=1.3,
+    ),  # test_generation_models.py same config (qwen + tp=8) gives 1.22 decode error
+    dict(
+        model_path="THUDM/glm-4-9b-chat",
+        mem_fraction_static=0.5,
+        dp_size=2,
+        tp_size=2,
+        tight_memory=True,
+    ),
+    # Fail to run these models in test_generation_models.py, need to fix that first
+    # dict(model_path="openai-community/gpt2"),
+    # dict(model_path="microsoft/Phi-3-small-8k-instruct"),
+]
+
+
+class TestVerlEngine(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        multiprocessing.set_start_method("spawn")
+
+    def assert_fragment_e2e_execution(
+        self,
+        index: int,
+        model_path: str,
+        mem_fraction_static: float = 0.4,
+        dp_size: int = 1,
+        tp_size: int = 2,
+        tight_memory: bool = False,
+        prefill_tolerance: float = 0.1,
+        decode_tolerance: float = 0.1,
+    ):
+        master_port = find_available_port(23456)
+
+        print(f"assert_fragment_e2e_execution START {index=} {model_path=}")
+
+        processes = []
+        output_reader, output_writer = mp.Pipe(duplex=False)
+        world_size = dp_size * tp_size
+        for rank in range(world_size):
+            p = Process(
+                target=_run_subprocess,
+                kwargs=dict(
+                    rank=rank,
+                    dp_size=dp_size,
+                    tp_size=tp_size,
+                    master_port=master_port,
+                    output_writer=output_writer,
+                    model_path=model_path,
+                    mem_fraction_static=mem_fraction_static,
+                    tight_memory=tight_memory,
+                    prefill_tolerance=prefill_tolerance,
+                    decode_tolerance=decode_tolerance,
+                ),
+            )
+            p.start()
+            processes.append(p)
+
+        for _ in range(tp_size):
+            self.assertTrue(
+                output_reader.recv(),
+                f"Subprocess has error, please see logs above. ({index=} {model_path=})",
+            )
+
+        for p in processes:
+            p.join()
+
+    def test_ci_models(self):
+        ci_models = [random.choice(ALL_MODELS)]
+        for index, model_info in enumerate(ci_models):
+            self.assert_fragment_e2e_execution(index=index, **model_info)
+
+    def test_others(self):
+        if is_in_ci():
+            return
+
+        for index, model_info in enumerate(ALL_OTHER_MODELS):
+            self.assert_fragment_e2e_execution(index=index, **model_info)
+
+    # def test_adhoc(self):
+    #     self.assert_fragment_e2e_execution(index=0, model_path="meta-llama/Llama-3.2-1B-Instruct")
+
+
+def _run_subprocess(
+    rank: int,
+    dp_size: int,
+    tp_size: int,
+    master_port: int,
+    output_writer,
+    model_path: str,
+    mem_fraction_static: float,
+    tight_memory: bool,
+    prefill_tolerance: float,
+    decode_tolerance: float,
+):
+    try:
+        print(f"subprocess[{rank=}] Start {os.environ.get('CUDA_VISIBLE_DEVICES')=}")
+
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(master_port)
+        torch.distributed.init_process_group(rank=rank, world_size=dp_size * tp_size)
+        torch.cuda.set_device(rank)
+
+        base_gpu_id = rank // tp_size * tp_size
+
+        mesh_kwargs = dict(
+            mesh_shape=(dp_size, tp_size, 1), mesh_dim_names=["dp", "tp", "pp"]
+        )
+        inference_device_mesh_device = init_device_mesh("cuda", **mesh_kwargs)
+        inference_device_mesh_cpu = init_device_mesh("cpu", **mesh_kwargs)
+        print(
+            f"subprocess[{rank=},{base_gpu_id=}] {inference_device_mesh_device=} {inference_device_mesh_cpu=}"
+        )
+
+        # hf model is used for comparison
+        hf_model = AutoModelForCausalLM.from_pretrained(
+            model_path, torch_dtype=_TORCH_DTYPE, trust_remote_code=True
+        ).cuda()
+        hf_tokenizer = get_tokenizer(model_path, trust_remote_code=True)
+
+        hf_outputs = HFRunner.forward_generation_raw(
+            base_model=hf_model,
+            prompts=_PROMPTS,
+            max_new_tokens=_MAX_NEW_TOKENS,
+            tokenizer=hf_tokenizer,
+            lora_paths=None,
+            torch_dtype=_TORCH_DTYPE,
+            output_str_only=False,
+        )
+        print(
+            f"subprocess[{rank=}] call hf.forward {hf_outputs=}",
+            flush=True,
+        )
+
+        if _ENABLE_UPDATE_WEIGHTS:
+            if tight_memory:
+                hf_model.cpu()
+                torch.cuda.empty_cache()
+
+            # test update weights
+            print(f"subprocess[{rank=}] get_fsdp_state_dict", flush=True)
+            fsdp_state_dict = _get_fsdp_state_dict(
+                hf_model=hf_model, world_size=dp_size * tp_size
+            )
+
+        engine = VerlEngine(
+            model_path=model_path,
+            load_format="dummy" if _ENABLE_UPDATE_WEIGHTS else "auto",
+            mem_fraction_static=mem_fraction_static,
+            random_seed=42,
+            base_gpu_id=base_gpu_id,
+            trust_remote_code=True,
+            dtype=get_dtype_str(_TORCH_DTYPE),
+            device_mesh_cpu=inference_device_mesh_cpu["tp"],
+        )
+        print(f"subprocess[{rank=}] {engine=}", flush=True)
+
+        if _ENABLE_UPDATE_WEIGHTS:
+            print(f"subprocess[{rank=}] call update_weights_from_tensor", flush=True)
+            engine.update_weights_from_tensor(
+                [(k, v) for k, v in fsdp_state_dict.items()]
+            )
+
+        for enable_batch in [False, True]:
+            if enable_batch:
+                fn = SRTRunner.batch_forward_generation_raw
+            else:
+                fn = SRTRunner.forward_generation_raw
+
+            srt_outputs = fn(
+                prompts=_PROMPTS,
+                max_new_tokens=_MAX_NEW_TOKENS,
+                lora_paths=None,
+                engine=engine,
+            )
+            print(
+                f"subprocess[{rank=}] call srt.forward {enable_batch=} {srt_outputs=}",
+                flush=True,
+            )
+
+            check_close_model_outputs(
+                hf_outputs=hf_outputs,
+                srt_outputs=srt_outputs,
+                prefill_tolerance=prefill_tolerance,
+                decode_tolerance=decode_tolerance,
+                rouge_l_tolerance=1,
+                check_logprobs=not enable_batch,
+                debug_text=f"{enable_batch=} {rank=}",
+            )
+
+        execution_ok = True
+
+    except Exception as e:
+        print(f"subprocess[{rank=}] has error: {e}", flush=True)
+        traceback.print_exc()
+        execution_ok = False
+
+    output_writer.send(execution_ok)
+    output_writer.close()
+
+    if "engine" in locals() and engine is not None:
+        engine.shutdown()
+    print(f"subprocess[{rank=}] end", flush=True)
+
+
+# Adapted from https://github.com/volcengine/verl/blob/main/tests/rollout/run_fsdp_vllm.py
+def _get_fsdp_state_dict(hf_model, world_size: int):
+    device_mesh = init_device_mesh(
+        "cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]
+    )
+
+    mixed_precision = MixedPrecision(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+        buffer_dtype=torch.float32,
+    )
+    fsdp_model = FSDP(
+        hf_model,
+        use_orig_params=True,
+        auto_wrap_policy=None,
+        device_id=torch.cuda.current_device(),
+        sharding_strategy=ShardingStrategy.FULL_SHARD,
+        mixed_precision=mixed_precision,
+        cpu_offload=CPUOffload(offload_params=False),
+        sync_module_states=False,
+        device_mesh=device_mesh,
+    )
+    print(f"{fsdp_model=}")
+
+    FSDP.set_state_dict_type(
+        fsdp_model,
+        state_dict_type=StateDictType.SHARDED_STATE_DICT,
+        state_dict_config=ShardedStateDictConfig(),
+    )
+
+    return fsdp_model.state_dict()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
new file mode 100644
index 000000000..593920d9d
--- /dev/null
+++ b/test/srt/run_suite.py
@@ -0,0 +1,410 @@
+import argparse
+import glob
+from dataclasses import dataclass
+
+from sglang.test.test_utils import run_unittest_files
+
+
+@dataclass
+class TestFile:
+    name: str
+    estimated_time: float = 60
+
+
+suites = {
+    "per-commit": [
+        TestFile("hicache/test_hicache.py", 116),
+        TestFile("hicache/test_hicache_mla.py", 127),
+        TestFile("hicache/test_hicache_storage.py", 127),
+        TestFile("lora/test_lora.py", 200),
+        TestFile("lora/test_lora_eviction.py", 200),
+        TestFile("lora/test_lora_backend.py", 99),
+        TestFile("lora/test_multi_lora_backend.py", 60),
+        TestFile("lora/test_lora_cuda_graph.py", 250),
+        TestFile("lora/test_lora_update.py", 400),
+        TestFile("lora/test_lora_qwen3.py", 97),
+        TestFile("lora/test_lora_radix_cache.py", 100),
+        TestFile("models/test_embedding_models.py", 73),
+        # TestFile("models/test_clip_models.py", 52),
+        TestFile("models/test_encoder_embedding_models.py", 100),
+        TestFile("models/test_cross_encoder_models.py", 100),
+        TestFile("models/test_compressed_tensors_models.py", 42),
+        TestFile("models/test_generation_models.py", 103),
+        # TestFile("models/test_gme_qwen_models.py", 45),
+        # TestFile("models/test_grok_models.py", 60),  # Disabled due to illegal memory access
+        TestFile("models/test_qwen_models.py", 82),
+        TestFile("models/test_reward_models.py", 132),
+        TestFile("models/test_vlm_models.py", 437),
+        TestFile("models/test_transformers_models.py", 320),
+        TestFile("openai_server/basic/test_protocol.py", 10),
+        TestFile("openai_server/basic/test_serving_chat.py", 10),
+        TestFile("openai_server/basic/test_serving_completions.py", 10),
+        TestFile("openai_server/basic/test_serving_embedding.py", 10),
+        TestFile("openai_server/basic/test_openai_embedding.py", 141),
+        TestFile("openai_server/basic/test_openai_server.py", 149),
+        TestFile("openai_server/features/test_enable_thinking.py", 70),
+        TestFile("openai_server/features/test_json_constrained.py", 98),
+        TestFile("openai_server/features/test_json_mode.py", 90),
+        TestFile("openai_server/features/test_openai_server_ebnf.py", 95),
+        TestFile("openai_server/features/test_openai_server_hidden_states.py", 240),
+        TestFile("openai_server/features/test_reasoning_content.py", 89),
+        TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
+        TestFile("openai_server/function_call/test_tool_choice.py", 226),
+        TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
+        TestFile("openai_server/validation/test_matched_stop.py", 60),
+        TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
+        TestFile("openai_server/validation/test_request_length_validation.py", 31),
+        TestFile("quant/test_block_int8.py", 22),
+        TestFile("quant/test_fp8_kernel.py", 8),
+        TestFile("quant/test_int8_kernel.py", 8),
+        TestFile("quant/test_triton_scaled_mm.py", 8),
+        TestFile("quant/test_w8a8_quantization.py", 46),
+        TestFile("rl/test_update_weights_from_disk.py", 114),
+        TestFile("rl/test_update_weights_from_tensor.py", 48),
+        TestFile("test_abort.py", 51),
+        TestFile("test_create_kvindices.py", 2),
+        TestFile("test_chunked_prefill.py", 313),
+        TestFile("test_eagle_infer_a.py", 370),
+        TestFile("test_eagle_infer_b.py", 700),
+        TestFile("test_ebnf_constrained.py", 108),
+        TestFile("test_eval_fp8_accuracy.py", 303),
+        TestFile("test_fa3.py", 376),
+        # TestFile("test_flashmla.py", 352),
+        TestFile("test_function_call_parser.py", 10),
+        TestFile("test_fused_moe.py", 30),
+        TestFile("test_gpt_oss_1gpu.py", 600),
+        TestFile("test_harmony_parser.py", 20),
+        TestFile("test_hidden_states.py", 55),
+        TestFile("test_hybrid_attn_backend.py", 100),
+        TestFile("test_standalone_speculative_decoding.py", 250),
+        TestFile("test_input_embeddings.py", 38),
+        TestFile("test_io_struct.py", 8),
+        TestFile("test_jinja_template_utils.py", 1),
+        TestFile("test_metrics.py", 32),
+        TestFile("test_metrics_utils.py", 1),
+        TestFile("test_mla.py", 167),
+        TestFile("test_mla_deepseek_v3.py", 700),
+        TestFile("test_mla_int8_deepseek_v3.py", 429),
+        TestFile("test_mla_flashinfer.py", 302),
+        TestFile("test_mla_fp8.py", 93),
+        TestFile("test_multi_tokenizer.py", 230),
+        TestFile("test_no_chunked_prefill.py", 108),
+        TestFile("test_no_overlap_scheduler.py", 234),
+        TestFile("test_original_logprobs.py", 200),
+        TestFile("test_penalty.py", 41),
+        TestFile("test_page_size.py", 60),
+        TestFile("test_pytorch_sampling_backend.py", 66),
+        TestFile("test_radix_attention.py", 105),
+        TestFile("test_regex_constrained.py", 64),
+        TestFile("test_reasoning_parser.py", 5),
+        TestFile("test_retract_decode.py", 54),
+        TestFile("test_request_queue_validation.py", 30),
+        TestFile("test_server_args.py", 1),
+        TestFile("test_skip_tokenizer_init.py", 117),
+        TestFile("test_srt_engine.py", 261),
+        TestFile("test_srt_endpoint.py", 130),
+        TestFile("test_start_profile.py", 60),
+        TestFile("test_torch_compile.py", 76),
+        TestFile("test_torch_compile_moe.py", 172),
+        TestFile("test_torch_native_attention_backend.py", 123),
+        TestFile("test_torchao.py", 70),
+        TestFile("test_triton_attention_kernels.py", 4),
+        TestFile("test_triton_attention_backend.py", 150),
+        TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
+        TestFile("test_triton_sliding_window.py", 250),
+        TestFile("test_utils_update_weights.py", 48),
+        TestFile("test_vision_chunked_prefill.py", 175),
+        TestFile("test_vlm_input_format.py", 300),
+        TestFile("test_vision_openai_server_a.py", 403),
+        TestFile("test_vision_openai_server_b.py", 446),
+    ],
+    "per-commit-2-gpu": [
+        TestFile("lora/test_lora_tp.py", 116),
+        TestFile("rl/test_update_weights_from_distributed.py", 103),
+        TestFile("test_data_parallelism.py", 73),
+        TestFile("test_dp_attention.py", 277),
+        TestFile("test_patch_torch.py", 19),
+        TestFile("test_release_memory_occupation.py", 127),
+        TestFile("hicache/test_hicache_storage_file_backend.py", 400),
+        TestFile("hicache/test_hicache_storage_3fs_backend.py", 400),
+    ],
+    "per-commit-4-gpu": [
+        TestFile("test_gpt_oss_4gpu.py", 600),
+        TestFile("test_local_attn.py", 250),
+        TestFile("test_pp_single_node.py", 372),
+        TestFile("test_multi_instance_release_memory_occupation.py", 64),
+    ],
+    "per-commit-8-gpu": [
+        # Disabled because it hangs on the CI.
+        # TestFile("ep/test_moe_ep.py", 181),
+        TestFile("lora/test_lora_llama4.py", 600),
+        TestFile("test_disaggregation.py", 499),
+        TestFile("test_disaggregation_different_tp.py", 155),
+        TestFile("test_disaggregation_pp.py", 60),
+        TestFile("test_full_deepseek_v3.py", 333),
+    ],
+    "per-commit-8-gpu-b200": [
+        # add more here
+        TestFile("test_gpt_oss_4gpu.py", 600),
+    ],
+    "per-commit-4-gpu-deepep": [
+        TestFile("ep/test_deepep_small.py", 531),
+    ],
+    "per-commit-8-gpu-deepep": [
+        TestFile("ep/test_deepep_large.py", 338),
+    ],
+    "per-commit-8-gpu-h20": [
+        TestFile("quant/test_w4a8_deepseek_v3.py", 371),
+    ],
+    "nightly": [
+        TestFile("test_nightly_gsm8k_eval.py"),
+    ],
+    "vllm_dependency_test": [
+        TestFile("quant/test_awq.py", 163),
+        TestFile("test_bnb.py", 5),
+        TestFile("test_gptqmodel_dynamic.py", 102),
+        TestFile("test_vllm_dependency.py", 185),
+        # TestFile("test_gguf.py", 96),
+    ],
+}
+
+# Add AMD tests
+suite_amd = {
+    "per-commit-amd": [
+        TestFile("hicache/test_hicache.py", 116),
+        TestFile("hicache/test_hicache_mla.py", 127),
+        TestFile("hicache/test_hicache_storage.py", 127),
+        TestFile("lora/test_lora.py", 200),
+        TestFile("lora/test_lora_eviction.py", 200),
+        TestFile("lora/test_lora_backend.py", 99),
+        TestFile("lora/test_multi_lora_backend.py", 60),
+        TestFile("lora/test_lora_cuda_graph.py", 250),
+        TestFile("lora/test_lora_qwen3.py", 97),
+        TestFile("models/test_embedding_models.py", 73),
+        TestFile("models/test_compressed_tensors_models.py", 42),
+        TestFile("models/test_qwen_models.py", 82),
+        TestFile("models/test_reward_models.py", 132),
+        TestFile("models/test_transformers_models.py", 320),
+        TestFile("openai_server/basic/test_protocol.py", 10),
+        TestFile("openai_server/basic/test_serving_chat.py", 10),
+        TestFile("openai_server/basic/test_serving_completions.py", 10),
+        TestFile("openai_server/basic/test_serving_embedding.py", 10),
+        TestFile("openai_server/basic/test_openai_embedding.py", 141),
+        TestFile("openai_server/basic/test_openai_server.py", 149),
+        TestFile("openai_server/features/test_enable_thinking.py", 70),
+        TestFile("openai_server/features/test_json_constrained.py", 98),
+        TestFile("openai_server/features/test_json_mode.py", 90),
+        TestFile("openai_server/features/test_openai_server_ebnf.py", 95),
+        # TestFile("openai_server/features/test_openai_server_hidden_states.py", 240),
+        TestFile("openai_server/features/test_reasoning_content.py", 89),
+        TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
+        TestFile("openai_server/function_call/test_tool_choice.py", 226),
+        TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
+        TestFile("openai_server/validation/test_matched_stop.py", 60),
+        TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
+        TestFile("openai_server/validation/test_request_length_validation.py", 31),
+        TestFile("quant/test_block_int8.py", 22),
+        TestFile("quant/test_awq_dequant.py", 2),
+        TestFile("rl/test_update_weights_from_disk.py", 114),
+        # TestFile("rl/test_update_weights_from_tensor.py", 48),
+        TestFile("test_abort.py", 51),
+        TestFile("test_create_kvindices.py", 2),
+        TestFile("test_chunked_prefill.py", 313),
+        TestFile("test_ebnf_constrained.py", 108),
+        TestFile("test_eval_fp8_accuracy.py", 303),
+        TestFile("test_function_call_parser.py", 10),
+        TestFile("test_fused_moe.py", 30),
+        TestFile("test_input_embeddings.py", 38),
+        TestFile("test_io_struct.py", 8),
+        TestFile("test_jinja_template_utils.py", 1),
+        TestFile("test_metrics.py", 32),
+        TestFile("test_metrics_utils.py", 1),
+        TestFile("test_mla.py", 242),
+        TestFile("test_mla_deepseek_v3.py", 221),
+        TestFile("test_no_chunked_prefill.py", 108),
+        # TestFile("test_no_overlap_scheduler.py", 234), # Disabled temporarily and track in #7703
+        TestFile("test_penalty.py", 41),
+        TestFile("test_page_size.py", 60),
+        TestFile("test_pytorch_sampling_backend.py", 66),
+        TestFile("test_radix_attention.py", 105),
+        TestFile("test_regex_constrained.py", 64),
+        TestFile("test_retract_decode.py", 54),
+        TestFile("test_reasoning_parser.py", 5),
+        TestFile("test_rope_rocm.py", 3),
+        TestFile("test_server_args.py", 1),
+        TestFile("test_skip_tokenizer_init.py", 117),
+        TestFile("test_srt_engine.py", 261),
+        TestFile("test_srt_endpoint.py", 130),
+        TestFile("test_torch_compile.py", 76),
+        TestFile("test_torch_compile_moe.py", 172),
+        TestFile("test_torch_native_attention_backend.py", 123),
+        TestFile("test_triton_attention_backend.py", 150),
+        # TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
+        TestFile("test_wave_attention_kernels.py", 2),
+        TestFile("test_wave_attention_backend.py", 150),
+    ],
+    "per-commit-amd-mi35x": [
+        TestFile("test_mla.py", 242),
+        TestFile("test_gpt_oss_1gpu.py", 600),
+    ],
+    "per-commit-2-gpu-amd": [
+        TestFile("lora/test_lora_tp.py", 116),
+        TestFile("rl/test_update_weights_from_distributed.py", 103),
+        TestFile("test_data_parallelism.py", 73),
+        TestFile("test_patch_torch.py", 19),
+    ],
+    "per-commit-4-gpu-amd": [
+        TestFile("test_pp_single_node.py", 150),
+    ],
+    "per-commit-8-gpu-amd": [
+        TestFile("test_full_deepseek_v3.py", 250),
+    ],
+    "nightly-amd": [
+        TestFile("test_nightly_gsm8k_eval_amd.py"),
+    ],
+}
+
+# Add Intel Xeon tests
+suite_xeon = {
+    "per-commit-cpu": [
+        TestFile("cpu/test_activation.py"),
+        TestFile("cpu/test_binding.py"),
+        TestFile("cpu/test_decode.py"),
+        TestFile("cpu/test_extend.py"),
+        TestFile("cpu/test_gemm.py"),
+        TestFile("cpu/test_mla.py"),
+        TestFile("cpu/test_moe.py"),
+        TestFile("cpu/test_norm.py"),
+        TestFile("cpu/test_qkv_proj_with_rope.py"),
+        TestFile("cpu/test_rope.py"),
+        TestFile("cpu/test_shared_expert.py"),
+        TestFile("cpu/test_topk.py"),
+        TestFile("test_intel_amx_attention_backend.py"),
+        TestFile("test_cpu_graph.py"),
+    ],
+}
+
+# Add Ascend NPU tests
+suite_ascend = {
+    "per-commit-1-ascend-npu": [
+        TestFile("ascend/test_ascend_tp1_bf16.py", 400),
+        TestFile("ascend/test_ascend_graph_tp1_bf16.py", 400),
+    ],
+    "per-commit-2-ascend-npu": [
+        TestFile("ascend/test_ascend_tp2_bf16.py", 400),
+        TestFile("ascend/test_ascend_graph_tp2_bf16.py", 400),
+        TestFile("ascend/test_ascend_tp2_fia_bf16.py", 400),
+        TestFile("ascend/test_ascend_mla_fia_w8a8int8.py", 400),
+    ],
+    "per-commit-4-ascend-npu": [
+        TestFile("ascend/test_ascend_mla_w8a8int8.py", 400),
+        TestFile("ascend/test_ascend_tp4_bf16.py", 400),
+    ],
+    "per-commit-16-ascend-a3": [
+        TestFile("ascend/test_ascend_deepep.py", 400),
+    ],
+}
+
+suites.update(suite_amd)
+suites.update(suite_xeon)
+suites.update(suite_ascend)
+
+
+def auto_partition(files, rank, size):
+    """
+    Partition files into size sublists with approximately equal sums of estimated times
+    using stable sorting, and return the partition for the specified rank.
+
+    Args:
+        files (list): List of file objects with estimated_time attribute
+        rank (int): Index of the partition to return (0 to size-1)
+        size (int): Number of partitions
+
+    Returns:
+        list: List of file objects in the specified rank's partition
+    """
+    weights = [f.estimated_time for f in files]
+
+    if not weights or size <= 0 or size > len(weights):
+        return []
+
+    # Create list of (weight, original_index) tuples
+    # Using negative index as secondary key to maintain original order for equal weights
+    indexed_weights = [(w, -i) for i, w in enumerate(weights)]
+    # Stable sort in descending order by weight
+    # If weights are equal, larger (negative) index comes first (i.e., earlier original position)
+    indexed_weights = sorted(indexed_weights, reverse=True)
+
+    # Extract original indices (negate back to positive)
+    indexed_weights = [(w, -i) for w, i in indexed_weights]
+
+    # Initialize partitions and their sums
+    partitions = [[] for _ in range(size)]
+    sums = [0.0] * size
+
+    # Greedy approach: assign each weight to partition with smallest current sum
+    for weight, idx in indexed_weights:
+        # Find partition with minimum sum
+        min_sum_idx = sums.index(min(sums))
+        partitions[min_sum_idx].append(idx)
+        sums[min_sum_idx] += weight
+
+    # Return the files corresponding to the indices in the specified rank's partition
+    indices = partitions[rank]
+    return [files[i] for i in indices]
+
+
+if __name__ == "__main__":
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument(
+        "--timeout-per-file",
+        type=int,
+        default=1800,
+        help="The time limit for running one file in seconds.",
+    )
+    arg_parser.add_argument(
+        "--suite",
+        type=str,
+        default=list(suites.keys())[0],
+        choices=list(suites.keys()) + ["all"],
+        help="The suite to run",
+    )
+    arg_parser.add_argument(
+        "--range-begin",
+        type=int,
+        default=0,
+        help="The begin index of the range of the files to run.",
+    )
+    arg_parser.add_argument(
+        "--range-end",
+        type=int,
+        default=None,
+        help="The end index of the range of the files to run.",
+    )
+    arg_parser.add_argument(
+        "--auto-partition-id",
+        type=int,
+        help="Use auto load balancing. The part id.",
+    )
+    arg_parser.add_argument(
+        "--auto-partition-size",
+        type=int,
+        help="Use auto load balancing. The number of parts.",
+    )
+    args = arg_parser.parse_args()
+    print(f"{args=}")
+
+    if args.suite == "all":
+        files = glob.glob("**/test_*.py", recursive=True)
+    else:
+        files = suites[args.suite]
+
+    if args.auto_partition_size:
+        files = auto_partition(files, args.auto_partition_id, args.auto_partition_size)
+    else:
+        files = files[args.range_begin : args.range_end]
+
+    print("The running tests are ", [f.name for f in files])
+
+    exit_code = run_unittest_files(files, args.timeout_per_file)
+    exit(exit_code)
diff --git a/test/srt/test_abort.py b/test/srt/test_abort.py
new file mode 100644
index 000000000..591c21674
--- /dev/null
+++ b/test/srt/test_abort.py
@@ -0,0 +1,114 @@
+import json
+import multiprocessing
+import time
+import unittest
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+    run_and_check_memory_leak,
+)
+
+
+class TestAbort(CustomTestCase):
+    def workload_func(self, base_url, model):
+        def process_func():
+            def run_one(_):
+                prompt = """
+                System: You are a helpful assistant.
+                User: What is the capital of France?
+                Assistant: The capital of France is
+                """
+
+                response = requests.post(
+                    f"{base_url}/generate",
+                    json={
+                        "text": prompt,
+                        "sampling_params": {
+                            "temperature": 0,
+                            "max_new_tokens": 2048,
+                        },
+                    },
+                )
+                ret = response.json()
+
+            with ThreadPoolExecutor(16) as executor:
+                list(executor.map(run_one, list(range(16))))
+
+        p = multiprocessing.Process(target=process_func)
+        p.start()
+        time.sleep(0.5)
+        p.terminate()
+        time.sleep(10)
+
+    def test_memory_leak(self):
+        run_and_check_memory_leak(
+            self.workload_func,
+            disable_radix_cache=False,
+            enable_mixed_chunk=False,
+            disable_overlap=False,
+            chunked_prefill_size=8192,
+            assert_has_abort=True,
+        )
+
+
+class TestAbortAll(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--max-running-requests", 8],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def _run_decode(self):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 16000,
+                    "ignore_eos": True,
+                },
+            },
+        )
+        return response.json()
+
+    def test_abort_all(self):
+        num_requests = 32
+        with ThreadPoolExecutor(num_requests) as executor:
+            futures = [executor.submit(self._run_decode) for _ in range(num_requests)]
+
+            # ensure the decode has been started
+            time.sleep(2)
+
+            requests.post(
+                self.base_url + "/abort_request",
+                json={
+                    "abort_all": True,
+                },
+            )
+
+            for future in as_completed(futures):
+                self.assertEqual(
+                    future.result()["meta_info"]["finish_reason"]["type"], "abort"
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py
new file mode 100644
index 000000000..7ec33a559
--- /dev/null
+++ b/test/srt/test_bench_one_batch.py
@@ -0,0 +1,73 @@
+import os
+import unittest
+
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    run_bench_offline_throughput,
+    run_bench_one_batch,
+    write_github_step_summary,
+)
+
+# We use `run_bench_offline_throughput`` instead of `run_bench_one_batch` for most cases
+# because `run_bench_offline_throughput`` has overlap scheduler.
+
+
+class TestBenchOneBatch(CustomTestCase):
+
+    def test_bs1_small(self):
+        _, output_throughput, _ = run_bench_one_batch(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
+        )
+        self.assertGreater(output_throughput, 50)
+
+    def test_bs1_default(self):
+        output_throughput = run_bench_offline_throughput(
+            DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs1_default (llama-3.1-8b)\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
+            )
+            self.assertGreater(output_throughput, 135)
+
+    def test_moe_tp2_bs1(self):
+        output_throughput = run_bench_offline_throughput(
+            DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_moe_tp2_bs1 (Mixtral-8x7B)\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(output_throughput, 85)
+            else:
+                self.assertGreater(output_throughput, 125)
+
+    def test_torch_compile_tp2_bs1(self):
+        output_throughput = run_bench_offline_throughput(
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            ["--tp", "2", "--enable-torch-compile", "--cuda-graph-max-bs", "2"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_torch_compile_tp2_bs1 (Mixtral-8x7B)\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(output_throughput, 200)
+            else:
+                self.assertGreater(output_throughput, 220)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
new file mode 100644
index 000000000..608595b95
--- /dev/null
+++ b/test/srt/test_bench_serving.py
@@ -0,0 +1,445 @@
+import asyncio
+import itertools
+import unittest
+
+import requests
+
+from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    run_bench_serving,
+    write_github_step_summary,
+)
+
+
+class TestBenchServing(CustomTestCase):
+    def test_offline_throughput_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_default\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3050)
+            else:
+                self.assertGreater(res["output_throughput"], 3800)
+
+    def test_offline_throughput_non_stream_small_batch_size(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),
+            other_server_args=["--max-running-requests", "10"],
+            dataset_name="sharegpt",
+            random_input_len=None,
+            random_output_len=None,
+            disable_stream=True,
+            need_warmup=True,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_non_stream_small_batch_size\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 1000)
+            else:
+                self.assertGreater(res["output_throughput"], 1050)
+
+    def test_offline_throughput_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--disable-radix-cache"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_without_radix_cache\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3050)
+            else:
+                self.assertGreater(res["output_throughput"], 3800)
+
+    def test_offline_throughput_without_chunked_prefill(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--chunked-prefill-size", "-1"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_without_chunked_prefill\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            self.assertGreater(res["output_throughput"], 2600)
+
+    def test_offline_throughput_with_triton_attention_backend(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[
+                "--attention-backend",
+                "triton",
+                "--context-length",
+                "8192",
+            ],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_with_triton_attention_backend\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3500)
+            else:
+                self.assertGreater(res["output_throughput"], 3700)
+
+    def test_offline_throughput_default_fp8(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST_FP8,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_default_fp8\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3500)
+            else:
+                self.assertGreater(res["output_throughput"], 4300)
+
+    def test_online_latency_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=100,
+            request_rate=1,
+            other_server_args=[],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_online_latency_default\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 11000)
+            if is_in_amd_ci():
+                self.assertLess(res["median_ttft_ms"], 115)
+            else:
+                self.assertLess(res["median_ttft_ms"], 86)
+            self.assertLess(res["median_itl_ms"], 10)
+
+    def test_vlm_offline_throughput(self):
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),
+            other_server_args=[
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            dataset_name="mmmu",
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_vlm_offline_throughput\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 2000)
+                # TODO: not set yet, need AMD machine
+            else:
+                self.assertGreater(res["output_throughput"], 2500)
+
+    def test_vlm_online_latency(self):
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=250,
+            request_rate=1,
+            other_server_args=[
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            dataset_name="mmmu",
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_vlm_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 16500)
+            if is_in_amd_ci():
+                self.assertLess(res["median_ttft_ms"], 150)
+                # TODO: not set yet, need AMD machine
+            else:
+                self.assertLess(res["median_ttft_ms"], 100)
+            self.assertLess(res["median_itl_ms"], 8)
+
+    def test_lora_online_latency(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=False)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 2400)
+            self.assertLess(res["median_ttft_ms"], 58)
+
+    def test_lora_online_latency_with_concurrent_adapter_updates(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=True)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 4000)
+            self.assertLess(res["median_ttft_ms"], 80)
+
+    def _run_lora_latency_test(self, enable_background_task: bool):
+        """
+        Run a latency test for LoRA with the specified background task setting.
+        """
+
+        async def lora_loader_unloader_task(
+            base_url: str,
+            start_event: asyncio.Event,
+            stop_event: asyncio.Event,
+        ):
+            """
+            A background task that repeatedly loads and unloads a LoRA adapter.
+            """
+            await start_event.wait()
+
+            path_cycler = itertools.cycle(
+                [
+                    "pbevan11/llama-3.1-8b-ocr-correction",
+                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
+                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                ]
+            )
+            load_url = f"{base_url}/load_lora_adapter"
+            unload_url = f"{base_url}/unload_lora_adapter"
+            num_updates = 0
+
+            while not stop_event.is_set():
+                # 1. Load the LoRA adapter
+                lora_path = next(path_cycler)
+                response = await asyncio.to_thread(
+                    requests.post,
+                    load_url,
+                    json={"lora_name": lora_path, "lora_path": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to load LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                if stop_event.is_set():
+                    break
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+                # 2. Unload the LoRA adapter
+                response = await asyncio.to_thread(
+                    requests.post,
+                    unload_url,
+                    json={"lora_name": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+        background_task = lora_loader_unloader_task if enable_background_task else None
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=400,
+            request_rate=8,
+            other_server_args=[
+                "--enable-lora",
+                "--max-loras-per-batch",
+                "1",
+                "--disable-radix-cache",
+                "--random-seed",
+                "42",
+                "--mem-fraction-static",
+                "0.8",
+                "--lora-paths",
+                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                "--max-lora-rank",
+                "256",
+            ],
+            dataset_name="random",
+            random_input_len=256,
+            random_output_len=256,
+            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+            background_task=background_task,
+        )
+
+        return res
+
+    def test_online_latency_eagle(self):
+        res = run_bench_serving(
+            model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            num_prompts=300,
+            request_rate=8,
+            sharegpt_context_len=3072,
+            disable_ignore_eos=True,
+            dataset_name="sharegpt",
+            other_server_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                "5",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "16",
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            need_warmup=True,
+            seed=42,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_online_latency_eagle\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"accept_length: {res['accept_length']:.2f} \n"
+            )
+            if is_in_amd_ci():
+                self.assertLess(res["median_e2e_latency_ms"], 1800)
+            else:
+                self.assertLess(res["median_e2e_latency_ms"], 900)
+            self.assertGreater(res["accept_length"], 3.0)
+
+    def test_moe_offline_throughput_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_moe_offline_throughput_default\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 2100)
+            else:
+                self.assertGreater(res["output_throughput"], 2200)
+
+    def test_moe_offline_throughput_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2", "--disable-radix-cache"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_moe_offline_throughput_without_radix_cache\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 2100)
+            else:
+                self.assertGreater(res["output_throughput"], 2200)
+
+    def test_pp_offline_throughput_default_decode(self):
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=1000,
+            request_rate=float("inf"),
+            random_input_len=1,
+            random_output_len=1024,
+            other_server_args=["--pp", "2"],
+            need_warmup=True,
+            seed=42,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_pp_offline_throughput_default_decode\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            self.assertGreater(res["output_throughput"], 6700)
+
+    def test_pp_long_context_prefill(self):
+        res = run_bench_serving(
+            model="meta-llama/Llama-3.3-70B-Instruct",
+            num_prompts=4,
+            request_rate=float("inf"),
+            random_input_len=128000,
+            random_output_len=1,
+            dataset_name="random",
+            other_server_args=[
+                "--quantization",
+                "fp8",
+                "--pp",
+                2,
+            ],
+            need_warmup=False,
+            seed=42,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_pp_long_context_latency_prefill\n"
+                f"input_throughput: {res['input_throughput']:.2f} ms\n"
+            )
+            self.assertGreater(res["input_throughput"], 4000)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_bnb.py b/test/srt/test_bnb.py
new file mode 100644
index 000000000..1d9f0201d
--- /dev/null
+++ b/test/srt/test_bnb.py
@@ -0,0 +1,314 @@
+"""
+Usage:
+python3 -m unittest test_bnb.TestVisionModel.test_vlm
+python3 -m unittest test_bnb.TestLanguageModel.test_mmlu
+"""
+
+import base64
+import io
+import json
+import multiprocessing as mp
+import os
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+from types import SimpleNamespace
+
+import numpy as np
+import openai
+import requests
+from PIL import Image
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+VISION_MODELS = [
+    "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
+    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
+    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
+    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
+    "unsloth/gemma-3-4b-it-bnb-4bit",
+    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
+]
+LANGUAGE_MODELS = [
+    "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
+    "unsloth/Qwen2-7B-Instruct-bnb-4bit",
+    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
+    "unsloth/gemma-3-1b-it-bnb-4bit",
+]
+
+# image
+IMAGE_MAN_IRONING_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/man_ironing_on_back_of_suv.png"
+IMAGE_SGL_LOGO_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/sgl_logo.png"
+
+# video
+VIDEO_JOBS_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/videos/jobs_presenting_ipod.mp4"
+
+# audio
+AUDIO_TRUMP_SPEECH_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/Trump_WEF_2018_10s.mp3"
+AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3"
+
+
+def popen_launch_server_wrapper(base_url, model, other_args):
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+    return process
+
+
+class TestVisionModel(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url += "/v1"
+        cls.api_key = "sk-123456"
+
+    def _run_single_image_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                        },
+                        {
+                            "type": "text",
+                            "text": "Describe this image in a very short sentence.",
+                        },
+                    ],
+                },
+            ],
+            temperature=0,
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        # `driver` is for gemma-3-it
+        assert "man" in text or "person" or "driver" in text, text
+        assert "cab" in text or "taxi" in text or "SUV" in text, text
+        # MiniCPMO fails to recognize `iron`, but `hanging`
+        assert "iron" in text or "hang" in text, text
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def _run_multi_turn_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                        },
+                        {
+                            "type": "text",
+                            "text": "Describe this image in a very short sentence.",
+                        },
+                    ],
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "There is a man at the back of a yellow cab ironing his clothes.",
+                        }
+                    ],
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Repeat your previous answer."}
+                    ],
+                },
+            ],
+            temperature=0,
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        assert "man" in text or "cab" in text, text
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def _run_multi_images_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                            "modalities": "multi-images",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_SGL_LOGO_URL},
+                            "modalities": "multi-images",
+                        },
+                        {
+                            "type": "text",
+                            "text": "I have two very different images. They are not related at all. "
+                            "Please describe the first image in one sentence, and then describe the second image in another sentence.",
+                        },
+                    ],
+                },
+            ],
+            temperature=0,
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        print("-" * 30)
+        print(f"Multi images response:\n{text}")
+        print("-" * 30)
+        assert "man" in text or "cab" in text or "SUV" in text or "taxi" in text, text
+        assert "logo" in text or '"S"' in text or "SG" in text, text
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def run_decode_with_image(self, image_id):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        content = []
+        if image_id == 0:
+            content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                }
+            )
+        elif image_id == 1:
+            content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": IMAGE_SGL_LOGO_URL},
+                }
+            )
+        else:
+            pass
+
+        content.append(
+            {
+                "type": "text",
+                "text": "Describe this image in a very short sentence.",
+            }
+        )
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {"role": "user", "content": content},
+            ],
+            temperature=0,
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+
+    def _run_test_mixed_batch(self):
+        image_ids = [0, 1, 2] * 4
+        with ThreadPoolExecutor(4) as executor:
+            list(executor.map(self.run_decode_with_image, image_ids))
+
+    def test_vlm(self):
+        models_to_test = VISION_MODELS
+
+        if is_in_ci():
+            models_to_test = [random.choice(VISION_MODELS)]
+
+        for model in models_to_test:
+            with self.subTest(model=model):
+                other_args = [
+                    "--mem-fraction-static",
+                    "0.6",
+                    "--load-format",
+                    "bitsandbytes",
+                    "--enable-multimodal",
+                ]
+                try:
+                    process = popen_launch_server_wrapper(
+                        DEFAULT_URL_FOR_TEST, model, other_args
+                    )
+                    self._run_test_mixed_batch()
+                    self._run_multi_images_chat_completion()
+                    self._run_multi_turn_chat_completion()
+                    self._run_single_image_chat_completion()
+                finally:
+                    kill_process_tree(process.pid)
+
+
+class TestLanguageModel(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        # cls.base_url += "/v1"
+        cls.api_key = "sk-123456"
+
+    def test_mmlu(self):
+        models_to_test = LANGUAGE_MODELS
+
+        if is_in_ci():
+            models_to_test = [random.choice(LANGUAGE_MODELS)]
+
+        for model in models_to_test:
+            with self.subTest(model=model):
+                other_args = [
+                    "--mem-fraction-static",
+                    "0.6",
+                    "--load-format",
+                    "bitsandbytes",
+                ]
+                try:
+                    process = popen_launch_server_wrapper(
+                        DEFAULT_URL_FOR_TEST, model, other_args
+                    )
+                    args = SimpleNamespace(
+                        base_url=self.base_url,
+                        model=model,
+                        eval_name="mmlu",
+                        num_examples=32,
+                        num_threads=16,
+                    )
+
+                    metrics = run_eval(args)
+                    print(f"{metrics=}")
+                    self.assertGreater(metrics["score"], 0.3)
+                finally:
+                    kill_process_tree(process.pid)
diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py
new file mode 100644
index 000000000..fbdc87c55
--- /dev/null
+++ b/test/srt/test_chunked_prefill.py
@@ -0,0 +1,31 @@
+"""
+python3 -m unittest test_chunked_prefill.TestChunkedPrefill.test_mixed_chunked_prefill_without_radix_cache
+"""
+
+import unittest
+
+from sglang.test.test_utils import CustomTestCase, run_mmlu_test, run_mulit_request_test
+
+
+class TestChunkedPrefill(CustomTestCase):
+    def test_chunked_prefill(self):
+        run_mmlu_test(disable_radix_cache=False, enable_mixed_chunk=False)
+
+    def test_mixed_chunked_prefill(self):
+        run_mmlu_test(disable_radix_cache=False, enable_mixed_chunk=True)
+
+    def test_chunked_prefill_without_radix_cache(self):
+        run_mmlu_test(disable_radix_cache=True, enable_mixed_chunk=False)
+
+    def test_mixed_chunked_prefill_without_radix_cache(self):
+        run_mmlu_test(disable_radix_cache=True, enable_mixed_chunk=True)
+
+    def test_mixed_chunked_prefill_multi_requests(self):
+        run_mulit_request_test(
+            enable_mixed_chunk=True,
+            chunked_prefill_size=2048,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_cpp_radix_cache.py b/test/srt/test_cpp_radix_cache.py
new file mode 100644
index 000000000..cb2822b88
--- /dev/null
+++ b/test/srt/test_cpp_radix_cache.py
@@ -0,0 +1,47 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestCppRadixCache(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ["SGLANG_EXPERIMENTAL_CPP_RADIX_TREE"] = "1"
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(metrics)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_cpu_graph.py b/test/srt/test_cpu_graph.py
new file mode 100644
index 000000000..4e3c40539
--- /dev/null
+++ b/test/srt/test_cpu_graph.py
@@ -0,0 +1,87 @@
+"""
+Usage:
+python3 -m unittest test_cpu_graph.TestCPUGraph.test_mmlu_torch_compile_cpu
+"""
+
+import copy
+import os
+import unittest
+from types import SimpleNamespace
+
+from test_intel_amx_attention_backend import intel_amx_benchmark
+
+from sglang.srt.utils import get_cpu_ids_by_node, kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+
+class TestCPUGraph(CustomTestCase):
+
+    @intel_amx_benchmark(
+        extra_args=[
+            "--batch-size",
+            "1",
+            "--mem-fraction-static",
+            "0.05",
+            "--enable-torch-compile",
+            "--torch-compile-max-bs",
+            "1",
+        ],
+        min_throughput=10,
+    )
+    def test_latency_torch_compile_cpu(self):
+        return DEFAULT_MLA_MODEL_NAME_FOR_TEST
+
+    def test_mmlu_torch_compile_cpu(self):
+        model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_TEST
+        cpu_ids_by_node = get_cpu_ids_by_node()
+        n_numa_node = len(cpu_ids_by_node)
+        env = copy.deepcopy(os.environ)
+        env["SGLANG_CPU_OMP_THREADS_BIND"] = "all"
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--attention-backend",
+                "intel_amx",
+                "--mem-fraction-static",
+                "0.05",
+                "--disable-radix",
+                "--trust-remote-code",
+                "--disable-overlap-schedule",
+                "--enable-torch-compile",
+                "--torch-compile-max-bs",
+                "1",
+                "--tp",
+                f"{n_numa_node}",
+            ],
+            env=env,
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=base_url,
+                model=model,
+                eval_name="mmlu",
+                num_examples=64,
+                num_threads=32,
+            )
+
+            metrics = run_eval(args)
+            if is_in_ci():
+                self.assertGreater(metrics["score"], 0.45)
+        finally:
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_create_kvindices.py b/test/srt/test_create_kvindices.py
new file mode 100644
index 000000000..4196eb290
--- /dev/null
+++ b/test/srt/test_create_kvindices.py
@@ -0,0 +1,75 @@
+import itertools
+import unittest
+
+import numpy as np
+import torch
+
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestCreateKvIndices(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_test(self, batch, max_batch, max_context_len):
+        req_to_token = torch.arange(
+            max_batch * max_context_len, dtype=torch.int32, device="cuda"
+        ).reshape((max_batch, max_context_len))
+        req_pool_indices = torch.tensor(
+            torch.from_numpy(
+                np.random.choice(range(max_batch), size=batch, replace=False)
+            ),
+            dtype=torch.int32,
+            device="cuda",
+        )
+        paged_kernel_lens = torch.tensor(
+            torch.from_numpy(
+                np.random.choice(range(max_context_len), size=batch, replace=False)
+            ),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        kv_indptr = torch.zeros((batch + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
+
+        # ref
+        req_pool_indices_cpu = req_pool_indices.cpu().numpy()
+        paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy()
+        kv_indices_ref = torch.cat(
+            [
+                req_to_token[req_pool_indices_cpu[i], : paged_kernel_lens_cpu[i]]
+                for i in range(batch)
+            ],
+            dim=0,
+        ).contiguous()
+
+        # triton
+        kv_indices_triton = torch.empty(kv_indptr[-1], dtype=torch.int32, device="cuda")
+        create_flashinfer_kv_indices_triton[(batch,)](
+            req_to_token,
+            req_pool_indices,
+            paged_kernel_lens,
+            kv_indptr,
+            None,
+            kv_indices_triton,
+            req_to_token.size(1),
+        )
+
+        # Check
+        self.assertTrue(torch.equal(kv_indices_ref, kv_indices_triton))
+
+    def test_create_kvindices(self):
+        BATCH = [1, 37, 1786]
+        MAX_BATCH = 4096
+        MAX_CONTEXT_LEN = 4096
+        for batch in BATCH:
+            self._run_test(batch, MAX_BATCH, MAX_CONTEXT_LEN)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_custom_allreduce.py b/test/srt/test_custom_allreduce.py
new file mode 100644
index 000000000..462ac578e
--- /dev/null
+++ b/test/srt/test_custom_allreduce.py
@@ -0,0 +1,174 @@
+import os
+import random
+import socket
+import unittest
+from typing import Any
+
+import ray
+import torch
+import torch.distributed as dist
+
+from sglang.srt.distributed import init_distributed_environment
+from sglang.srt.distributed.communication_op import (  # noqa
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.parallel_state import (
+    get_tensor_model_parallel_group,
+    graph_capture,
+    initialize_model_parallel,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+def get_open_port() -> int:
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def multi_process_parallel(
+    world_size: int,
+    cls: Any,
+    test_target: Any,
+) -> None:
+
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    # NOTE: We need to set working_dir for distributed tests,
+    # otherwise we may get import errors on ray workers
+
+    ray.init(log_to_driver=True)
+
+    distributed_init_port = get_open_port()
+    refs = []
+    for rank in range(world_size):
+        refs.append(test_target.remote(cls, world_size, rank, distributed_init_port))
+    ray.get(refs)
+
+    ray.shutdown()
+
+
+class TestCustomAllReduce(CustomTestCase):
+    TEST_SIZES = [
+        512,
+        4096,
+        32768,
+        262144,
+        2097152,
+        16777216,
+        33554432,
+    ]  # 512B...32MB
+    WORLD_SIZES = [2, 4, 6, 8]
+    TEST_LOOP = 10
+
+    @classmethod
+    def setUpClass(cls):
+        random.seed(42)  # keep the deterministic seed
+
+    def test_graph_allreduce(self):
+        for world_size in self.WORLD_SIZES:
+            if world_size > torch.cuda.device_count():
+                continue
+            multi_process_parallel(world_size, self, self.graph_allreduce)
+
+    def test_eager_allreduce(self):
+        for world_size in self.WORLD_SIZES:
+            if world_size > torch.cuda.device_count():
+                continue
+            multi_process_parallel(world_size, self, self.eager_allreduce)
+
+    @ray.remote(num_gpus=1, max_calls=1)
+    def graph_allreduce(self, world_size, rank, distributed_init_port):
+        del os.environ["CUDA_VISIBLE_DEVICES"]
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+        init_distributed_environment(
+            world_size=world_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            local_rank=rank,
+        )
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        # A small all_reduce for warmup.
+        # this is needed because device communicators might be created lazily
+        # (e.g. NCCL). This will ensure that the communicator is initialized
+        # before any communication happens, so that this group can be used for
+        # graph capture immediately.
+        data = torch.zeros(1)
+        data = data.to(device=device)
+        torch.distributed.all_reduce(data, group=group)
+        torch.cuda.synchronize()
+        del data
+
+        for sz in self.TEST_SIZES:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                for _ in range(self.TEST_LOOP):
+                    with graph_capture() as graph_capture_context:
+                        # use integers so result matches NCCL exactly
+                        inp1 = torch.randint(
+                            1,
+                            16,
+                            (sz,),
+                            dtype=dtype,
+                            device=torch.cuda.current_device(),
+                        )
+                        inp2 = torch.randint(
+                            1,
+                            16,
+                            (sz,),
+                            dtype=dtype,
+                            device=torch.cuda.current_device(),
+                        )
+                        torch.cuda.synchronize()
+                        graph = torch.cuda.CUDAGraph()
+                        with torch.cuda.graph(
+                            graph, stream=graph_capture_context.stream
+                        ):
+                            out1 = tensor_model_parallel_all_reduce(inp1)
+                            # the input buffer is immediately modified to test
+                            # synchronization
+                            dist.all_reduce(inp1, group=group)
+                            out2 = tensor_model_parallel_all_reduce(inp2)
+                            dist.all_reduce(inp2, group=group)
+                    graph.replay()
+                    torch.testing.assert_close(out1, inp1)
+                    torch.testing.assert_close(out2, inp2)
+
+    @ray.remote(num_gpus=1, max_calls=1)
+    def eager_allreduce(self, world_size, rank, distributed_init_port):
+        del os.environ["CUDA_VISIBLE_DEVICES"]
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+        init_distributed_environment(
+            world_size=world_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            local_rank=rank,
+        )
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        for sz in self.TEST_SIZES:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                for _ in range(self.TEST_LOOP):
+                    inp1 = torch.randint(
+                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
+                    out1 = tensor_model_parallel_all_reduce(inp1)
+                    dist.all_reduce(inp1, group=group)
+                    torch.testing.assert_close(out1, inp1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_data_parallelism.py b/test/srt/test_data_parallelism.py
new file mode 100644
index 000000000..e54e3a5cf
--- /dev/null
+++ b/test/srt/test_data_parallelism.py
@@ -0,0 +1,78 @@
+import time
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestDataParallelism(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--dp", 2],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+    def test_update_weight(self):
+        response = requests.post(
+            self.base_url + "/update_weights_from_disk",
+            json={"model_path": DEFAULT_MODEL_NAME_FOR_TEST},
+        )
+
+        # check if the response is 200
+        assert response.status_code == 200
+
+        # pause a few seconds then send again
+        time.sleep(1)
+
+        response = requests.post(
+            self.base_url + "/update_weights_from_disk",
+            json={"model_path": DEFAULT_MODEL_NAME_FOR_TEST},
+        )
+
+        # check if the response is 200
+        assert response.status_code == 200
+
+    def test_get_memory_pool_size(self):
+        # use `get_server_info` instead since `get_memory_pool_size` is merged into `get_server_info`
+        response = requests.get(self.base_url + "/get_server_info")
+        assert response.status_code == 200
+
+        time.sleep(1)
+
+        response = requests.get(self.base_url + "/get_server_info")
+        assert response.status_code == 200
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_disaggregation.py b/test/srt/test_disaggregation.py
new file mode 100644
index 000000000..827bfc3b8
--- /dev/null
+++ b/test/srt/test_disaggregation.py
@@ -0,0 +1,434 @@
+import json
+import os
+import time
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+import requests
+
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_disaggregation_utils import TestDisaggregationBase
+from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_pd_server,
+)
+
+
+class TestDisaggregationAccuracy(TestDisaggregationBase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
+
+        # Non blocking start servers
+        cls.start_prefill()
+        cls.start_decode()
+
+        # Block until both
+        cls.wait_server_ready(cls.prefill_url + "/health")
+        cls.wait_server_ready(cls.decode_url + "/health")
+
+        cls.launch_lb()
+
+    @classmethod
+    def start_prefill(cls):
+        prefill_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "prefill",
+            "--tp",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce0",
+        ]
+        cls.process_prefill = popen_launch_pd_server(
+            cls.model,
+            cls.prefill_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=prefill_args,
+        )
+
+    @classmethod
+    def start_decode(cls):
+        decode_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "decode",
+            "--tp",
+            "1",
+            "--base-gpu-id",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce1",
+        ]
+        cls.process_decode = popen_launch_pd_server(
+            cls.model,
+            cls.decode_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=decode_args,
+        )
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{self.base_host}",
+            port=int(self.lb_port),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Evaluation metrics: {metrics}")
+
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+    def test_logprob(self):
+        prompt = "The capital of france is "
+        response = requests.post(
+            self.lb_url + "/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {"temperature": 0},
+                "return_logprob": True,
+                "return_input_logprob": True,
+                "logprob_start_len": 0,
+            },
+        )
+
+        j = response.json()
+        completion_tokens = j["meta_info"]["completion_tokens"]
+        input_logprobs = j["meta_info"]["input_token_logprobs"]
+        output_logprobs = j["meta_info"]["output_token_logprobs"]
+
+        assert (
+            len(output_logprobs) == completion_tokens
+        ), f"output_logprobs and completion_tokens should have the same length, but got {len(output_logprobs)} and {completion_tokens}"
+        assert (
+            len(input_logprobs) > 0
+        ), f"input_logprobs should have at least one token, but got {len(input_logprobs)}"
+
+    def test_structured_output(self):
+        json_schema = json.dumps(
+            {
+                "type": "object",
+                "properties": {
+                    "name": {"type": "string", "pattern": "^[\\w]+$"},
+                    "population": {"type": "integer"},
+                },
+                "required": ["name", "population"],
+            }
+        )
+
+        # JSON
+        response = requests.post(
+            f"{self.lb_url}/generate",
+            json={
+                "text": "Here is the information of the capital of France in the JSON format.\n",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 64,
+                    "json_schema": json_schema,
+                },
+            },
+        )
+        output = response.json()["text"]
+        # ensure the output is a valid JSON
+        json.loads(output)
+
+
+class TestDisaggregationMooncakeFailure(TestDisaggregationBase):
+    @classmethod
+    def setUpClass(cls):
+        # set DISAGGREGATION_TEST_FAILURE_PROB to simulate failure
+        os.environ["DISAGGREGATION_TEST_FAILURE_PROB"] = "0.05"
+
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
+
+        # Non blocking start servers
+        cls.start_prefill()
+        cls.start_decode()
+
+        # Block until both
+        cls.wait_server_ready(cls.prefill_url + "/health")
+        cls.wait_server_ready(cls.decode_url + "/health")
+
+        cls.launch_lb()
+
+    @classmethod
+    def tearDownClass(cls):
+        os.environ.pop("DISAGGREGATION_TEST_FAILURE_PROB")
+        super().tearDownClass()
+
+    @classmethod
+    def start_prefill(cls):
+        prefill_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "prefill",
+            "--tp",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce0",
+        ]
+        cls.process_prefill = popen_launch_pd_server(
+            cls.model,
+            cls.prefill_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=prefill_args,
+        )
+
+    @classmethod
+    def start_decode(cls):
+        decode_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "decode",
+            "--tp",
+            "1",
+            "--base-gpu-id",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce1",
+        ]
+        cls.process_decode = popen_launch_pd_server(
+            cls.model,
+            cls.decode_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=decode_args,
+        )
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{self.base_host}",
+            port=int(self.lb_port),
+        )
+
+        # Expect lots of failure but the server cannot crash
+        try:
+            metrics = run_eval_few_shot_gsm8k(args)
+            print(f"Evaluation metrics: {metrics}")
+        except Exception as e:
+            print(f"Test encountered expected errors: {e}")
+            # Check if servers are still healthy
+            try:
+                response = requests.get(self.prefill_url + "/health_generate")
+                assert response.status_code == 200
+                response = requests.get(self.decode_url + "/health_generate")
+                assert response.status_code == 200
+            except Exception as health_check_error:
+                # If health check fails, re-raise the original exception
+                raise e from health_check_error
+
+
+class TestDisaggregationMooncakeSpec(TestDisaggregationBase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
+        cls.draft_model = DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        cls.spec_args = [
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-draft-model-path",
+            cls.draft_model,
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "4",
+            "--speculative-num-draft-tokens",
+            "16",
+            "--cuda-graph-max-bs",
+            "8",
+        ]
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
+
+        # Non blocking start servers
+        cls.start_prefill()
+        cls.start_decode()
+
+        # Block until both
+        cls.wait_server_ready(cls.prefill_url + "/health")
+        cls.wait_server_ready(cls.decode_url + "/health")
+
+        cls.launch_lb()
+
+    @classmethod
+    def start_prefill(cls):
+        prefill_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "prefill",
+            "--tp",
+            "2",
+            "--disaggregation-ib-device",
+            "mlx5_roce0,mlx5_roce1",
+        ] + cls.spec_args
+        cls.process_prefill = popen_launch_pd_server(
+            cls.model,
+            cls.prefill_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=prefill_args,
+        )
+
+    @classmethod
+    def start_decode(cls):
+        decode_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "decode",
+            "--tp",
+            "2",
+            "--base-gpu-id",
+            "2",
+            "--disaggregation-ib-device",
+            "mlx5_roce2,mlx5_roce3",
+        ] + cls.spec_args
+        cls.process_decode = popen_launch_pd_server(
+            cls.model,
+            cls.decode_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=decode_args,
+        )
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=2,
+            host=f"http://{self.base_host}",
+            port=int(self.lb_port),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Evaluation metrics: {metrics}")
+
+        self.assertGreater(metrics["accuracy"], 0.20)
+
+
+class TestDisaggregationSimulatedRetract(TestDisaggregationBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ["SGLANG_TEST_RETRACT"] = "true"
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
+
+        # Non blocking start servers
+        cls.start_prefill()
+        cls.start_decode()
+
+        # Block until both
+        cls.wait_server_ready(cls.prefill_url + "/health")
+        cls.wait_server_ready(cls.decode_url + "/health")
+
+        cls.launch_lb()
+
+    @classmethod
+    def tearDownClass(cls):
+        os.environ.pop("SGLANG_TEST_RETRACT")
+        super().tearDownClass()
+
+    @classmethod
+    def start_prefill(cls):
+        prefill_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "prefill",
+            "--tp",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce0",
+        ]
+        cls.process_prefill = popen_launch_pd_server(
+            cls.model,
+            cls.prefill_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=prefill_args,
+        )
+
+    @classmethod
+    def start_decode(cls):
+        decode_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "decode",
+            "--tp",
+            "1",
+            "--base-gpu-id",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce1",
+        ]
+        cls.process_decode = popen_launch_pd_server(
+            cls.model,
+            cls.decode_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=decode_args,
+        )
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{self.base_host}",
+            port=int(self.lb_port),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Evaluation metrics: {metrics}")
+
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_disaggregation_different_tp.py b/test/srt/test_disaggregation_different_tp.py
new file mode 100644
index 000000000..67a3afcbe
--- /dev/null
+++ b/test/srt/test_disaggregation_different_tp.py
@@ -0,0 +1,184 @@
+import os
+import time
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_disaggregation_utils import TestDisaggregationBase
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_pd_server,
+)
+
+
+class TestDisaggregationMooncakePrefillLargerTP(TestDisaggregationBase):
+    @classmethod
+    def setUpClass(cls):
+        # Temporarily disable JIT DeepGEMM
+        cls.original_jit_deepgemm = os.environ.get("SGL_ENABLE_JIT_DEEPGEMM")
+        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
+
+        # Non blocking start servers
+        cls.start_prefill()
+        cls.start_decode()
+
+        # Block until both
+        cls.wait_server_ready(cls.prefill_url + "/health")
+        cls.wait_server_ready(cls.decode_url + "/health")
+
+        cls.launch_lb()
+
+    @classmethod
+    def start_prefill(cls):
+        prefill_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "prefill",
+            "--tp",
+            "2",
+            "--disaggregation-ib-device",
+            "mlx5_roce0,mlx5_roce1",
+        ]
+        cls.process_prefill = popen_launch_pd_server(
+            cls.model,
+            cls.prefill_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=prefill_args,
+        )
+
+    @classmethod
+    def start_decode(cls):
+        decode_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "decode",
+            "--tp",
+            "1",
+            "--base-gpu-id",
+            "2",
+            "--disaggregation-ib-device",
+            "mlx5_roce2",
+        ]
+        cls.process_decode = popen_launch_pd_server(
+            cls.model,
+            cls.decode_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=decode_args,
+        )
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{self.base_host}",
+            port=int(self.lb_port),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Evaluation metrics: {metrics}")
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+class TestDisaggregationMooncakeDecodeLargerTP(TestDisaggregationBase):
+    @classmethod
+    def setUpClass(cls):
+        # Temporarily disable JIT DeepGEMM
+        cls.original_jit_deepgemm = os.environ.get("SGL_ENABLE_JIT_DEEPGEMM")
+        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
+
+        # Non blocking start servers
+        cls.start_prefill()
+        cls.start_decode()
+
+        # Block until both
+        cls.wait_server_ready(cls.prefill_url + "/health")
+        cls.wait_server_ready(cls.decode_url + "/health")
+
+        cls.launch_lb()
+
+    @classmethod
+    def start_prefill(cls):
+        prefill_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "prefill",
+            "--tp",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce0",
+        ]
+        cls.process_prefill = popen_launch_pd_server(
+            cls.model,
+            cls.prefill_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=prefill_args,
+        )
+
+    @classmethod
+    def start_decode(cls):
+        decode_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "decode",
+            "--tp",
+            "2",
+            "--base-gpu-id",
+            "1",
+            "--disaggregation-ib-device",
+            "mlx5_roce1,mlx5_roce2",
+        ]
+        cls.process_decode = popen_launch_pd_server(
+            cls.model,
+            cls.decode_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=decode_args,
+        )
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{self.base_host}",
+            port=int(self.lb_port),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Evaluation metrics: {metrics}")
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_disaggregation_pp.py b/test/srt/test_disaggregation_pp.py
new file mode 100644
index 000000000..a8bab8f81
--- /dev/null
+++ b/test/srt/test_disaggregation_pp.py
@@ -0,0 +1,101 @@
+import time
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_disaggregation_utils import TestDisaggregationBase
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_pd_server,
+)
+
+
+class TestDisaggregationPPAccuracy(TestDisaggregationBase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        parsed_url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.base_host = parsed_url.hostname
+        base_port = str(parsed_url.port)
+        cls.lb_port = base_port
+        cls.prefill_port = f"{int(base_port) + 100}"
+        cls.decode_port = f"{int(base_port) + 200}"
+        cls.prefill_url = f"http://{cls.base_host}:{cls.prefill_port}"
+        cls.decode_url = f"http://{cls.base_host}:{cls.decode_port}"
+        cls.lb_url = f"http://{cls.base_host}:{cls.lb_port}"
+        print(f"{cls.base_host=} {cls.lb_port=} {cls.prefill_port=} {cls.decode_port=}")
+
+        # Non blocking start servers
+        cls.start_prefill()
+        cls.start_decode()
+
+        # Block until both
+        cls.wait_server_ready(cls.prefill_url + "/health")
+        cls.wait_server_ready(cls.decode_url + "/health")
+
+        cls.launch_lb()
+
+    @classmethod
+    def start_prefill(cls):
+        prefill_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "prefill",
+            "--tp-size",
+            "1",
+            "--pp-size",
+            "2",
+            "--disaggregation-ib-device",
+            "mlx5_roce0,mlx5_roce1",
+            "--disable-overlap-schedule",
+        ]
+        cls.process_prefill = popen_launch_pd_server(
+            cls.model,
+            cls.prefill_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=prefill_args,
+        )
+
+    @classmethod
+    def start_decode(cls):
+        decode_args = [
+            "--trust-remote-code",
+            "--disaggregation-mode",
+            "decode",
+            "--tp",
+            "1",
+            "--base-gpu-id",
+            "2",
+            "--disaggregation-ib-device",
+            "mlx5_roce2",
+        ]
+        cls.process_decode = popen_launch_pd_server(
+            cls.model,
+            cls.decode_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=decode_args,
+        )
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"http://{self.base_host}",
+            port=int(self.lb_port),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+
+        self.assertGreater(metrics["accuracy"], 0.24)
+        # Wait a little bit so that the memory check happens.
+        time.sleep(5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_double_sparsity.py b/test/srt/test_double_sparsity.py
new file mode 100644
index 000000000..c936e79bb
--- /dev/null
+++ b/test/srt/test_double_sparsity.py
@@ -0,0 +1,65 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestDoubleSparsity(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        dirpath = os.path.dirname(__file__)
+        config_file = os.path.join(
+            dirpath, "double-sparsity-config-Llama-3.1-8B-Instruct.json"
+        )
+        # NOTE: Generate the config file by running https://github.com/andy-yang-1/DoubleSparse/blob/main/evaluation/group_channel_config.py
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--enable-double-sparsity",
+                "--ds-channel-config-path",
+                config_file,
+                "--ds-heavy-channel-num",
+                "32",
+                "--ds-heavy-channel-type",
+                "k",
+                "--ds-heavy-token-num",
+                "512",
+                "--ds-sparse-decode-threshold",
+                "0",
+                "--max-total-tokens",
+                "200000",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_dp_attention.py b/test/srt/test_dp_attention.py
new file mode 100644
index 000000000..4486dc16e
--- /dev/null
+++ b/test/srt/test_dp_attention.py
@@ -0,0 +1,170 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    popen_launch_server,
+)
+
+
+class TestDPAttentionDP2TP2(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--enable-dp-attention",
+                "--dp",
+                "2",
+                "--enable-torch-compile",
+                "--torch-compile-max-bs",
+                "2",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.8)
+
+
+class TestDPAttentionDP2TP2DeepseekV3MTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [
+            "--trust-remote-code",
+            "--disable-radix",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-num-steps",
+            "2",
+            "--speculative-eagle-topk",
+            "4",
+            "--speculative-num-draft-tokens",
+            "4",
+            "--speculative-draft-model-path",
+            DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+            "--tp-size",
+            "2",
+            "--enable-dp-attention",
+            "--dp-size",
+            "2",
+        ]
+        if not is_in_amd_ci():
+            other_args += ["--mem-frac", "0.7"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(
+            f"###test_gsm8k (deepseek-v3 mtp + dp):\n"
+            f"accuracy={metrics['accuracy']=:.3f}\n"
+            f"{avg_spec_accept_length=:.3f}\n"
+        )
+        self.assertGreater(avg_spec_accept_length, 2.5)
+
+
+class TestDPAttentionMinimumTokenLoadBalance(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--enable-dp-attention",
+                "--dp",
+                "2",
+                "--enable-torch-compile",
+                "--torch-compile-max-bs",
+                "2",
+                "--load-balance-method",
+                "minimum_tokens",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.8)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_eagle_infer_a.py b/test/srt/test_eagle_infer_a.py
new file mode 100644
index 000000000..c19f0c22f
--- /dev/null
+++ b/test/srt/test_eagle_infer_a.py
@@ -0,0 +1,323 @@
+import unittest
+
+import requests
+import torch
+
+import sglang as sgl
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+torch_dtype = torch.float16
+prefill_tolerance = 5e-2
+decode_tolerance: float = 5e-2
+
+
+class TestEAGLEEngine(CustomTestCase):
+    BASE_CONFIG = {
+        "model_path": DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+        "speculative_draft_model_path": DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+        "speculative_algorithm": "EAGLE",
+        "speculative_num_steps": 5,
+        "speculative_eagle_topk": 4,
+        "speculative_num_draft_tokens": 8,
+        "mem_fraction_static": 0.7,
+        "cuda_graph_max_bs": 5,
+    }
+    NUM_CONFIGS = 2
+
+    def setUp(self):
+        self.prompt = "Today is a sunny day and I like"
+        self.sampling_params = {"temperature": 0, "max_new_tokens": 8}
+
+        ref_engine = sgl.Engine(
+            model_path=self.BASE_CONFIG["model_path"], cuda_graph_max_bs=1
+        )
+        self.ref_output = ref_engine.generate(self.prompt, self.sampling_params)["text"]
+        ref_engine.shutdown()
+
+    def test_correctness(self):
+        configs = [
+            # Basic config
+            self.BASE_CONFIG,
+            # Chunked prefill
+            {**self.BASE_CONFIG, "chunked_prefill_size": 4},
+        ]
+
+        for i, config in enumerate(configs[: self.NUM_CONFIGS]):
+            with self.subTest(i=i):
+                print(f"{config=}")
+                engine = sgl.Engine(**config, log_level="info", decode_log_interval=10)
+                try:
+                    self._test_single_generation(engine)
+                    self._test_batch_generation(engine)
+                    self._test_eos_token(engine)
+                    self._test_acc_length(engine)
+                finally:
+                    engine.shutdown()
+                print("=" * 100)
+
+    def _test_single_generation(self, engine):
+        output = engine.generate(self.prompt, self.sampling_params)["text"]
+        print(f"{output=}, {self.ref_output=}")
+        self.assertEqual(output, self.ref_output)
+
+    def _test_batch_generation(self, engine):
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        params = {"temperature": 0, "max_new_tokens": 50}
+
+        outputs = engine.generate(prompts, params)
+        for prompt, output in zip(prompts, outputs):
+            print(f"Prompt: {prompt}")
+            print(f"Generated: {output['text']}")
+            print("-" * 40)
+
+        print(f"{engine.get_server_info()=}")
+
+        avg_spec_accept_length = engine.get_server_info()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, 1.9)
+
+    def _test_eos_token(self, engine):
+        prompt = "[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\nToday is a sunny day and I like [/INST]"
+        params = {
+            "temperature": 0.1,
+            "max_new_tokens": 1024,
+            "skip_special_tokens": False,
+        }
+
+        tokenizer = get_tokenizer(DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
+        output = engine.generate(prompt, params)["text"]
+        print(f"{output=}")
+
+        tokens = tokenizer.encode(output, truncation=False)
+        self.assertNotIn(tokenizer.eos_token_id, tokens)
+
+    def _test_acc_length(self, engine):
+        prompt = [
+            "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:",
+        ] * 5  # test batched generation
+        sampling_params = {"temperature": 0, "max_new_tokens": 512}
+        output = engine.generate(prompt, sampling_params)
+        output = output[0]
+
+        if "spec_verify_ct" in output["meta_info"]:
+            acc_length = (
+                output["meta_info"]["completion_tokens"]
+                / output["meta_info"]["spec_verify_ct"]
+            )
+        else:
+            acc_length = 1.0
+
+        speed = (
+            output["meta_info"]["completion_tokens"]
+            / output["meta_info"]["e2e_latency"]
+        )
+        print(f"{acc_length=:.4f}, {speed=}")
+
+        if engine.server_args.model_path == DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST:
+            self.assertGreater(acc_length, 3.6)
+        else:
+            self.assertGreater(acc_length, 2.5)
+
+
+class TestEAGLEEngineTokenMap(TestEAGLEEngine):
+    BASE_CONFIG = {
+        "model_path": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "speculative_draft_model_path": "lmsys/sglang-EAGLE-LLaMA3-Instruct-8B",
+        "speculative_algorithm": "EAGLE",
+        "speculative_num_steps": 5,
+        "speculative_eagle_topk": 4,
+        "speculative_num_draft_tokens": 8,
+        "speculative_token_map": "thunlp/LLaMA3-Instruct-8B-FR-Spec/freq_32768.pt",
+        "mem_fraction_static": 0.7,
+        "cuda_graph_max_bs": 5,
+        "dtype": "float16",
+    }
+    NUM_CONFIGS = 1
+
+
+class TestEAGLE3Engine(TestEAGLEEngine):
+    BASE_CONFIG = {
+        "model_path": "meta-llama/Llama-3.1-8B-Instruct",
+        "speculative_draft_model_path": "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B",
+        "speculative_algorithm": "EAGLE3",
+        "speculative_num_steps": 5,
+        "speculative_eagle_topk": 16,
+        "speculative_num_draft_tokens": 64,
+        "mem_fraction_static": 0.7,
+        "cuda_graph_max_bs": 5,
+        "dtype": "float16",
+    }
+    NUM_CONFIGS = 1
+
+
+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
+class TestEAGLEDraftExtend(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                1,
+                "--speculative-eagle-topk",
+                1,
+                "--speculative-num-draft-tokens",
+                2,
+                "--max-running-requests",
+                4,
+                "--attention-backend",
+                "fa3",
+            ],
+        )
+        cls.accept_len_threshold = 1.50
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_one_batch_accept_length(self):
+        resp = requests.get(self.base_url + "/flush_cache")
+        self.assertEqual(resp.status_code, 200)
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        url = self.base_url + "/generate"
+        data = {
+            "text": prompts,
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": 512,
+            },
+        }
+        response = requests.post(url, json=data)
+        self.assertEqual(response.status_code, 200)
+        outputs = response.json()
+        for i in range(len(prompts)):
+            output = outputs[i]
+            if "spec_verify_ct" in output["meta_info"]:
+                acc_length = (
+                    output["meta_info"]["completion_tokens"]
+                    / output["meta_info"]["spec_verify_ct"]
+                )
+            else:
+                acc_length = 1.0
+
+            print(f"{acc_length=}")
+            self.assertGreater(acc_length, self.accept_len_threshold)
+
+
+class TestEAGLEDraftExtendFlashinfer(TestEAGLEDraftExtend):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                1,
+                "--speculative-eagle-topk",
+                1,
+                "--speculative-num-draft-tokens",
+                2,
+                "--max-running-requests",
+                4,
+                "--attention-backend",
+                "flashinfer",
+            ],
+        )
+        cls.accept_len_threshold = 1.50
+
+
+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
+class TestEAGLEDraftExtendTriton(TestEAGLEDraftExtend):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                1,
+                "--speculative-eagle-topk",
+                1,
+                "--speculative-num-draft-tokens",
+                2,
+                "--max-running-requests",
+                4,
+                "--attention-backend",
+                "triton",
+            ],
+        )
+        cls.accept_len_threshold = 1.50
+
+
+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
+class TestEAGLEDraftExtendFlashinferMLA(TestEAGLEDraftExtend):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-num-steps",
+                1,
+                "--speculative-eagle-topk",
+                1,
+                "--speculative-num-draft-tokens",
+                2,
+                "--max-running-requests",
+                4,
+                "--attention-backend",
+                "flashinfer",
+            ],
+        )
+        cls.accept_len_threshold = 1.85
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_eagle_infer_b.py b/test/srt/test_eagle_infer_b.py
new file mode 100644
index 000000000..5b9df1630
--- /dev/null
+++ b/test/srt/test_eagle_infer_b.py
@@ -0,0 +1,511 @@
+import json
+import os
+import random
+import threading
+import time
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from types import SimpleNamespace
+
+import numpy as np
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+    run_logprob_check,
+)
+
+
+class TestEAGLEServer(CustomTestCase):
+    PROMPTS = [
+        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]"
+        '[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nWhat are the mental triggers in Jeff Walker\'s Product Launch Formula and "Launch" book?[/INST]',
+        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nSummarize Russell Brunson's Perfect Webinar Script...[/INST]",
+        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nwho are you?[/INST]",
+        "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nwhere are you from?[/INST]",
+    ]
+
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                5,
+                "--speculative-eagle-topk",
+                8,
+                "--speculative-num-draft-tokens",
+                64,
+                "--mem-fraction-static",
+                0.7,
+                "--chunked-prefill-size",
+                128,
+                "--max-running-requests",
+                8,
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def send_request(self):
+        time.sleep(random.uniform(0, 2))
+        for prompt in self.PROMPTS:
+            url = self.base_url + "/generate"
+            data = {
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 1024,
+                },
+            }
+            response = requests.post(url, json=data)
+            assert response.status_code == 200
+
+    def send_requests_abort(self):
+        for prompt in self.PROMPTS:
+            try:
+                time.sleep(random.uniform(0, 2))
+                url = self.base_url + "/generate"
+                data = {
+                    "model": "base",
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 1024,
+                    },
+                }
+                # set timeout = 1s, mock disconnected
+                requests.post(url, json=data, timeout=1)
+            except Exception as e:
+                print(e)
+                pass
+
+    def test_request_abort(self):
+        concurrency = 4
+        threads = [
+            threading.Thread(target=self.send_request) for _ in range(concurrency)
+        ] + [
+            threading.Thread(target=self.send_requests_abort)
+            for _ in range(concurrency)
+        ]
+        for worker in threads:
+            worker.start()
+        for p in threads:
+            p.join()
+
+    def test_max_token_one(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=1,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+
+        # Just run and check it does not hang
+        metrics = run_eval(args)
+        self.assertGreater(metrics["output_throughput"], 50)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.20)
+
+        server_info = requests.get(self.base_url + "/get_server_info").json()
+        avg_spec_accept_length = server_info["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+
+        speculative_eagle_topk = server_info["speculative_eagle_topk"]
+
+        if speculative_eagle_topk == 1:
+            self.assertGreater(avg_spec_accept_length, 2.5)
+        else:
+            self.assertGreater(avg_spec_accept_length, 3.5)
+
+        # Wait a little bit so that the memory check happens.
+        time.sleep(4)
+
+    def test_logprob_start_len(self):
+        logprob_start_len = 4
+        new_tokens = 4
+        prompts = [
+            "I have a very good idea on",
+            "Today is a sunndy day and",
+        ]
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": prompts,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": new_tokens,
+                },
+                "return_logprob": True,
+                "top_logprobs_num": 5,
+                "logprob_start_len": logprob_start_len,
+            },
+        )
+        response_json = response.json()
+        print(json.dumps(response_json, indent=2))
+
+        for res in response_json:
+            self.assertEqual(
+                res["meta_info"]["prompt_tokens"],
+                logprob_start_len + len(res["meta_info"]["input_token_logprobs"]),
+            )
+
+            self.assertEqual(res["meta_info"]["completion_tokens"], new_tokens)
+            self.assertEqual(len(res["meta_info"]["output_token_logprobs"]), new_tokens)
+
+    def test_logprob_match(self):
+        """Test the output logprobs are close to the input logprobs if we run a prefill again."""
+
+        def run_generate(
+            prompt,
+            return_logprob=False,
+            max_new_tokens=512,
+            logprob_start_len=-1,
+            temperature=1.0,
+        ):
+
+            if isinstance(prompt, str):
+                prompt_kwargs = {"text": prompt}
+            else:
+                prompt_kwargs = {"input_ids": prompt}
+
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    **prompt_kwargs,
+                    "sampling_params": {
+                        "temperature": temperature,
+                        "max_new_tokens": max_new_tokens,
+                        "ignore_eos": True,
+                    },
+                    "return_logprob": return_logprob,
+                    "return_text_in_logprobs": True,
+                    "logprob_start_len": logprob_start_len,
+                    "temp_scaled_logprobs": True,
+                },
+            )
+            return response.json()
+
+        prompt = "I have a very good idea on how to"
+
+        for temperature in [1.0]:
+            gen = run_generate(
+                prompt,
+                return_logprob=True,
+                logprob_start_len=0,
+                temperature=temperature,
+            )
+            output_logprobs = np.array(
+                [x[0] for x in gen["meta_info"]["output_token_logprobs"]]
+            )
+            num_prompts_tokens = gen["meta_info"]["prompt_tokens"]
+
+            input_tokens = [x[1] for x in gen["meta_info"]["input_token_logprobs"]]
+            output_tokens = [x[1] for x in gen["meta_info"]["output_token_logprobs"]]
+
+            new_prompt = input_tokens + output_tokens
+            score = run_generate(
+                new_prompt,
+                return_logprob=True,
+                logprob_start_len=0,
+                max_new_tokens=0,
+                temperature=temperature,
+            )
+            output_logprobs_score = np.array(
+                [
+                    x[0]
+                    for x in score["meta_info"]["input_token_logprobs"][
+                        num_prompts_tokens:
+                    ]
+                ]
+            )
+
+            print(f"{output_logprobs[-10:]=}")
+            print(f"{output_logprobs_score[-10:]=}")
+
+            diff = np.abs(output_logprobs - output_logprobs_score)
+            max_diff = np.max(diff)
+            self.assertLess(max_diff, 0.255)
+
+    def test_logprob_mixed(self):
+        args = []
+        temperature = 0
+        # input_len, output_len, temperature, logprob_start_len, return_logprob, top_logprobs_num
+        # Llama 2 context length seems to be only 2k, so we can only test small length.
+        for input_len in [200, 500, 1000, 2000]:
+            for output_len in [4, 8]:
+                for logprob_start_len in [0, 100, 300, 800, 1998]:
+                    for return_logprob in [True, False]:
+                        for top_logprobs_num in [0, 5]:
+
+                            if logprob_start_len >= input_len:
+                                continue
+
+                            args.append(
+                                (
+                                    input_len,
+                                    output_len,
+                                    temperature,
+                                    logprob_start_len,
+                                    return_logprob,
+                                    top_logprobs_num,
+                                )
+                            )
+
+        random.shuffle(args)
+
+        func = partial(run_logprob_check, self)
+        with ThreadPoolExecutor(8) as executor:
+            list(executor.map(func, args))
+
+    def run_decode(self, sampling_params):
+        return_logprob = True
+        top_logprobs_num = 5
+        return_text = True
+        n = 1
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "Human: Write a travel blog post to Hawaii.\n\nAssistant:",
+                "sampling_params": {
+                    "max_new_tokens": 48,
+                    "n": n,
+                    "temperature": 0.7,
+                    **sampling_params,
+                },
+                "return_logprob": return_logprob,
+                "top_logprobs_num": top_logprobs_num,
+                "return_text_in_logprobs": return_text,
+                "logprob_start_len": 0,
+            },
+        )
+        self.assertEqual(response.status_code, 200)
+        print(json.dumps(response.json()))
+        print("=" * 100)
+
+    def test_penalty_mixed(self):
+        args = [
+            {},
+            {},
+            {},
+            {"frequency_penalty": 2},
+            {"presence_penalty": 1},
+            {"min_new_tokens": 16},
+            {"frequency_penalty": 0.2},
+            {"presence_penalty": 0.4},
+            {"min_new_tokens": 8},
+            {"frequency_penalty": 0.4, "presence_penalty": 0.8},
+            {"frequency_penalty": 0.4, "min_new_tokens": 12},
+            {"presence_penalty": 0.8, "min_new_tokens": 12},
+            {"presence_penalty": -0.3, "frequency_penalty": 1.3, "min_new_tokens": 32},
+            {"presence_penalty": 0.3, "frequency_penalty": -1.3, "min_new_tokens": 32},
+        ]
+        random.shuffle(args * 5)
+        with ThreadPoolExecutor(8) as executor:
+            list(executor.map(self.run_decode, args))
+
+    def test_constrained_decoding(self):
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Give me a json"},
+        ]
+
+        response = requests.post(
+            self.base_url + "/v1/chat/completions",
+            json={
+                "model": DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+                "messages": messages,
+                "temperature": 0,
+                "response_format": {"type": "json_object"},
+            },
+        )
+        self.assertEqual(response.status_code, 200)
+        res = response.json()
+
+        # Validate response structure
+        self.assertIn("choices", res)
+        self.assertEqual(len(res["choices"]), 1)
+        self.assertIn("message", res["choices"][0])
+        self.assertIn("content", res["choices"][0]["message"])
+
+        # Validate JSON content
+        content_json = res["choices"][0]["message"]["content"]
+        is_valid_json = True
+        try:
+            content = json.loads(content_json)
+            self.assertIsInstance(content, dict)
+        except Exception:
+            print(f"parse JSON failed: {content_json}")
+            is_valid_json = False
+        self.assertTrue(is_valid_json)
+
+
+class TestEAGLERetract(TestEAGLEServer):
+    @classmethod
+    def setUpClass(cls):
+        # These config helps find a leak.
+        os.environ["SGLANG_CI_SMALL_KV_SIZE"] = "4500"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                5,
+                "--speculative-eagle-topk",
+                8,
+                "--speculative-num-draft-tokens",
+                64,
+                "--mem-fraction-static",
+                0.7,
+                "--chunked-prefill-size",
+                128,
+                "--max-running-requests",
+                64,
+            ],
+        )
+
+
+class TestEAGLEServerTriton(TestEAGLEServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                5,
+                "--speculative-eagle-topk",
+                8,
+                "--speculative-num-draft-tokens",
+                64,
+                "--mem-fraction-static",
+                0.7,
+                "--attention-backend",
+                "triton",
+                "--max-running-requests",
+                8,
+            ],
+        )
+
+
+class TestEAGLEServerPageSize(TestEAGLEServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                5,
+                "--speculative-eagle-topk",
+                1,
+                "--speculative-num-draft-tokens",
+                6,
+                "--mem-fraction-static",
+                0.7,
+                "--chunked-prefill-size",
+                128,
+                "--max-running-requests",
+                8,
+                "--page-size",
+                4,
+                "--attention-backend",
+                "flashinfer",
+            ],
+        )
+
+
+class TestEAGLEServerPageSizeTopk(TestEAGLEServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                5,
+                "--speculative-eagle-topk",
+                8,
+                "--speculative-num-draft-tokens",
+                64,
+                "--mem-fraction-static",
+                0.7,
+                "--chunked-prefill-size",
+                128,
+                "--max-running-requests",
+                8,
+                "--page-size",
+                4,
+                "--attention-backend",
+                "flashinfer",
+            ],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_ebnf_constrained.py b/test/srt/test_ebnf_constrained.py
new file mode 100644
index 000000000..c029f7743
--- /dev/null
+++ b/test/srt/test_ebnf_constrained.py
@@ -0,0 +1,282 @@
+"""
+python3 -m unittest test_ebnf_constrained.TestEBNFConstrained.test_ebnf_generate_email
+python3 -m unittest test_ebnf_constrained.TestEBNFConstrained.test_ebnf_generate_greeting
+python3 -m unittest test_ebnf_constrained.TestEBNFConstrained.test_ebnf_generate_all_optional_function_params
+python3 -m unittest test_ebnf_constrained.TestEBNFConstrainedLLGuidance.test_ebnf_generate_email
+python3 -m unittest test_ebnf_constrained.TestEBNFConstrainedLLGuidance.test_ebnf_generate_greeting
+python3 -m unittest test_ebnf_constrained.TestEBNFConstrainedLLGuidance.test_ebnf_generate_all_optional_function_params
+"""
+
+import json
+import unittest
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+def setup_class(cls, backend: str, disable_overlap: bool):
+    cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+    cls.base_url = DEFAULT_URL_FOR_TEST
+    cls.ebnf_grammar = 'root ::= "test"'  # Default grammar
+
+    other_args = [
+        "--max-running-requests",
+        "10",
+        "--grammar-backend",
+        backend,
+    ]
+
+    if disable_overlap:
+        other_args += ["--disable-overlap-schedule"]
+
+    cls.process = popen_launch_server(
+        cls.model,
+        cls.base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+
+
+class TestEBNFConstrained(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, "xgrammar", disable_overlap=False)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(
+        self,
+        ebnf,
+        expected_patterns,
+        prompt,
+        return_logprob=False,
+        top_logprobs_num=0,
+        n=1,
+    ):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0 if n == 1 else 0.5,
+                    "max_new_tokens": 128,
+                    "n": n,
+                    "ebnf": ebnf,
+                },
+                "stream": False,
+                "return_logprob": return_logprob,
+                "top_logprobs_num": top_logprobs_num,
+                "logprob_start_len": 0,
+            },
+        )
+
+        ret = response.json()
+        print(json.dumps(ret, indent=2))
+        print("=" * 100)
+
+        if not isinstance(ret, list):
+            self.fail(f"Expected response to be a list, but got {type(ret)}")
+
+        for item in ret:
+            text = item.get("text", "").strip()
+            if not text:
+                self.fail("Generated text is empty.")
+
+            match = False
+            for pattern in expected_patterns:
+                if self.regex_match(text, pattern):
+                    match = True
+                    break
+            if not match:
+                self.fail(f"Text '{text}' does not match any of the allowed patterns.")
+
+    def regex_match(self, text, pattern):
+        import re
+
+        return re.match(pattern, text) is not None
+
+    def test_ebnf_generate_email(self):
+        self.__class__.ebnf_grammar = 'root ::= "user@example.com"'
+        allowed_patterns = [r"^user@example\.com$"]
+        prompt = "Generate an email address:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_greeting(self):
+        self.__class__.ebnf_grammar = 'root ::= "Hello" | "Hi" | "Hey"'
+        allowed_patterns = [r"^(Hello|Hi|Hey)$"]
+        prompt = "Generate a greeting:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_number(self):
+        self.__class__.ebnf_grammar = """
+        root ::= digit digit digit
+        digit ::= [0-9]
+        """
+        allowed_patterns = [r"^\d{3}$"]
+        prompt = "Generate a three-digit number:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_phone(self):
+        self.__class__.ebnf_grammar = """
+        root ::= "(" area ")" " " prefix "-" line
+        area ::= [0-9] [0-9] [0-9]
+        prefix ::= [0-9] [0-9] [0-9]
+        line ::= [0-9] [0-9] [0-9] [0-9]
+        """
+        allowed_patterns = [r"^\(\d{3}\) \d{3}-\d{4}$"]
+        prompt = "Generate a phone number:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_date(self):
+        self.__class__.ebnf_grammar = """
+        root ::= year "-" month "-" day
+        year ::= "2024"
+        month ::= "01" | "02" | "03" | "04" | "05" | "06" | "07" | "08" | "09" | "10" | "11" | "12"
+        day ::= "01" | "02" | "03" | "04" | "05" | "06" | "07" | "08" | "09" | "10" |
+               "11" | "12" | "13" | "14" | "15" | "16" | "17" | "18" | "19" | "20" |
+               "21" | "22" | "23" | "24" | "25" | "26" | "27" | "28" | "29" | "30" | "31"
+        """
+        allowed_patterns = [r"^2024-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$"]
+        prompt = "Generate a date in YYYY-MM-DD format:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_hex_color(self):
+        self.__class__.ebnf_grammar = """
+        root ::= "#" hex hex hex hex hex hex
+        hex ::= [0-9] | [A-F]
+        """
+        allowed_patterns = [r"^#[0-9A-F]{6}$"]
+        prompt = "Generate a hex color code:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_complex_json(self):
+        self.__class__.ebnf_grammar = """
+        root ::= object
+        object ::= "{" ws pair (ws "," ws pair)* ws "}"
+        pair ::= "\\"name\\"" ws ":" ws value |
+                 "\\"age\\"" ws ":" ws number |
+                 "\\"city\\"" ws ":" ws string
+        value ::= string | number
+        string ::= "\\"" [a-zA-Z0-9 ]+ "\\""
+        number ::= [1-9] [0-9]*
+        ws ::= [ ]*
+        """
+        allowed_patterns = [
+            r'^{\s*"name"\s*:\s*"[a-zA-Z0-9 ]+"\s*,\s*"age"\s*:\s*[1-9][0-9]*\s*,\s*"city"\s*:\s*"[a-zA-Z0-9 ]+"\s*}$',
+        ]
+        prompt = "Generate a simple JSON with name, age, and city:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_custom_log_format(self):
+        self.__class__.ebnf_grammar = """
+        root ::= logentry
+        logentry ::= "[" datetime "] " level ": System.process - " message
+        datetime ::= "2024-01-01T12:00:00Z"
+        level ::= "INFO"
+        message ::= "Operation " [a-z]+ " successfully"
+        """
+        allowed_patterns = [
+            r"^\[2024-01-01T12:00:00Z\] INFO: System\.process - Operation [a-z]+ successfully$"
+        ]
+        prompt = "Generate a log entry:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_ebnf_generate_all_optional_function_params(self):
+        """Test function call with all optional parameters - verifies flexible ordering."""
+        self.__class__.ebnf_grammar = """
+        root ::= function_call
+        function_call ::= call_config_service
+        call_config_service ::= "{" "\\"name\\"" ":" "\\"config_service\\"" ", " "\\"arguments\\"" ":" arguments_config_service "}"
+        arguments_config_service ::= "{" ( "\\"theme\\"" ":" ("\\"light\\"" | "\\"dark\\"") ( "," "\\"language\\"" ":" ("\\"en\\"" | "\\"es\\"" | "\\"fr\\"") )? ( "," "\\"notifications\\"" ":" ("true" | "false") )? | "\\"language\\"" ":" ("\\"en\\"" | "\\"es\\"" | "\\"fr\\"") ( "," "\\"notifications\\"" ":" ("true" | "false") )? | "\\"notifications\\"" ":" ("true" | "false") )? "}"
+        """
+        # Test patterns that should match - flexible ordering of optional parameters
+        allowed_patterns = [
+            # Empty arguments
+            r'^\{"name":"config_service",\s*"arguments":\{\}\}$',
+            # Single optional parameters (any can appear first)
+            r'^\{"name":"config_service",\s*"arguments":\{"theme":"(light|dark)"\}\}$',
+            r'^\{"name":"config_service",\s*"arguments":\{"language":"(en|es|fr)"\}\}$',
+            r'^\{"name":"config_service",\s*"arguments":\{"notifications":(true|false)\}\}$',
+            # Two optional parameters (in any order)
+            r'^\{"name":"config_service",\s*"arguments":\{"theme":"(light|dark)",\s*"language":"(en|es|fr)"\}\}$',
+            r'^\{"name":"config_service",\s*"arguments":\{"theme":"(light|dark)",\s*"notifications":(true|false)\}\}$',
+            r'^\{"name":"config_service",\s*"arguments":\{"language":"(en|es|fr)",\s*"notifications":(true|false)\}\}$',
+            # All three optional parameters
+            r'^\{"name":"config_service",\s*"arguments":\{"theme":"(light|dark)",\s*"language":"(en|es|fr)",\s*"notifications":(true|false)\}\}$',
+        ]
+        prompt = "Configure the service with optional settings:"
+
+        self.run_decode(
+            ebnf=self.__class__.ebnf_grammar,
+            expected_patterns=allowed_patterns,
+            prompt=prompt,
+            n=5,
+        )
+
+
+class TestEBNFConstrainedLLGuidance(TestEBNFConstrained):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, "llguidance", disable_overlap=False)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py
new file mode 100644
index 000000000..99304dfde
--- /dev/null
+++ b/test/srt/test_eval_accuracy_large.py
@@ -0,0 +1,94 @@
+"""
+Usage:
+python -m unittest test_eval_accuracy_large.TestEvalAccuracyLarge.test_mmlu
+"""
+
+import os
+import time
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+
+class TestEvalAccuracyLarge(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--log-level-http", "warning"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=5000,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+
+        if is_in_ci():
+            write_github_step_summary(f"### test_mmlu\n" f'{metrics["score"]=:.4f}\n')
+
+        self.assertGreater(metrics["score"], 0.70)
+
+    def test_human_eval(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="humaneval",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_human_eval\n" f'{metrics["score"]=:.4f}\n'
+            )
+
+        self.assertGreater(metrics["score"], 0.64)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_mgsm_en\n" f'{metrics["score"]=:.4f}\n'
+            )
+
+        self.assertGreater(metrics["score"], 0.835)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_eval_fp8_accuracy.py b/test/srt/test_eval_fp8_accuracy.py
new file mode 100644
index 000000000..329e2dad8
--- /dev/null
+++ b/test/srt/test_eval_fp8_accuracy.py
@@ -0,0 +1,114 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import is_hip, kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8,
+    DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestEvalFP8Accuracy(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+            temperature=0.1,
+        )
+
+        metrics = run_eval(args)
+        if is_hip():
+            # Another threshold for AMD because fp8 dtype is difference
+            self.assertGreaterEqual(metrics["score"], 0.60)
+        else:
+            self.assertGreaterEqual(metrics["score"], 0.60)
+
+
+class TestEvalFP8DynamicQuantAccuracy(CustomTestCase):
+
+    def _run_test(self, model, other_args, expected_score):
+        base_url = DEFAULT_URL_FOR_TEST
+        other_args = other_args or []
+
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=base_url,
+                model=model,
+                eval_name="mmlu",
+                num_examples=64,
+                num_threads=32,
+                temperature=0.1,
+            )
+
+            metrics = run_eval(args)
+            self.assertGreaterEqual(metrics["score"], expected_score)
+        finally:
+            kill_process_tree(process.pid)
+
+    def test_mmlu_offline_only(self):
+        """Test with offline quantization only."""
+        self._run_test(
+            model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
+            other_args=[],
+            expected_score=0.64,
+        )
+
+    def test_mmlu_offline_and_online_override(self):
+        """Test with both offline and online quantization."""
+        self._run_test(
+            model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
+            other_args=["--quantization", "w8a8_fp8"],
+            # inference will use sgl kernel w/ online quant override
+            # we observed that the accuracy is higher then offline only
+            expected_score=0.64,
+        )
+
+    def test_mmlu_online_only(self):
+        """Test with online quantization only."""
+        self._run_test(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            # inference will use sgl kernel w/ online quantization only
+            # we observed that the accuracy is higher then offline only
+            other_args=["--quantization", "w8a8_fp8"],
+            expected_score=0.64,
+        )
+
+    def test_mmlu_fp16_baseline(self):
+        """Test with unquantized fp16 baseline."""
+        self._run_test(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            other_args=[],
+            expected_score=0.64,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_expert_distribution.py b/test/srt/test_expert_distribution.py
new file mode 100755
index 000000000..5d4add72f
--- /dev/null
+++ b/test/srt/test_expert_distribution.py
@@ -0,0 +1,102 @@
+import os
+import tempfile
+import unittest
+from pathlib import Path
+
+import requests
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestExpertDistribution(CustomTestCase):
+    def test_expert_distribution_record(self):
+        # TODO: Add tests for DeepEP gatherer (currently our CI cannot run that)
+        for info in [
+            dict(model_path="deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"),
+            dict(model_path="Qwen/Qwen1.5-MoE-A2.7B"),
+            dict(model_path="Qwen/Qwen1.5-MoE-A2.7B", tp_size=2),
+            dict(model_path="Qwen/Qwen1.5-MoE-A2.7B", mode="per_pass"),
+            dict(model_path="Qwen/Qwen1.5-MoE-A2.7B", mode="per_token"),
+        ]:
+            with self.subTest(info=info):
+                self._execute_core(**info)
+
+    def _execute_core(self, model_path: str, mode: str = "stat", tp_size: int = 1):
+        """Test expert distribution record endpoints"""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            os.environ["SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR"] = tmp_dir
+
+            process = popen_launch_server(
+                model_path,
+                DEFAULT_URL_FOR_TEST,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=[
+                    "--trust-remote-code",
+                    "--tp-size",
+                    str(tp_size),
+                    "--expert-distribution-recorder-mode",
+                    mode,
+                    "--disable-cuda-graph",
+                    "--disable-overlap-schedule",
+                ],
+            )
+
+            try:
+                # Start recording
+                response = requests.post(
+                    f"{DEFAULT_URL_FOR_TEST}/start_expert_distribution_record"
+                )
+                self.assertEqual(response.status_code, 200)
+
+                # Make some requests to generate expert distribution data
+                response = requests.post(
+                    f"{DEFAULT_URL_FOR_TEST}/generate",
+                    json={
+                        "text": "The capital of France is",
+                        "sampling_params": {
+                            "temperature": 0,
+                            "max_new_tokens": 32,
+                        },
+                    },
+                )
+                self.assertEqual(response.status_code, 200)
+
+                # Stop recording
+                response = requests.post(
+                    f"{DEFAULT_URL_FOR_TEST}/stop_expert_distribution_record"
+                )
+                self.assertEqual(response.status_code, 200)
+
+                # Dump the recorded data
+                response = requests.post(
+                    f"{DEFAULT_URL_FOR_TEST}/dump_expert_distribution_record"
+                )
+                self.assertEqual(response.status_code, 200)
+
+                # Check data rows
+                data = torch.load(
+                    list(Path(tmp_dir).glob("*.pt"))[0], weights_only=True
+                )
+                print(f"{data=}")
+
+                if mode in ["per_pass", "per_token"]:
+                    self.assertGreater(len(data), 0, "Should contain data rows")
+                else:
+                    logical_count = data["logical_count"]
+                    print(f"{logical_count.sum()=} {logical_count=}")
+                    self.assertTrue(logical_count.sum() > 0)
+
+            finally:
+                kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_expert_location_updater.py b/test/srt/test_expert_location_updater.py
new file mode 100644
index 000000000..094540294
--- /dev/null
+++ b/test/srt/test_expert_location_updater.py
@@ -0,0 +1,255 @@
+import os
+import traceback
+import unittest
+from dataclasses import dataclass
+from typing import List
+
+import torch
+import torch.distributed
+import torch.multiprocessing as mp
+from torch.multiprocessing import Process
+
+from sglang.srt.eplb import expert_location_updater
+from sglang.test.test_utils import CustomTestCase, find_available_port
+from sglang.utils import is_in_ci
+
+
+@dataclass
+class _TestInfo:
+    nnodes: int
+    num_logical_experts: int
+    num_physical_experts: int
+    num_repeat: int = 5000
+
+
+class TestExpertLocationUpdater(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def test_cpu(self):
+        self._test_common(device="cpu")
+        self._test_core(
+            num_gpus=32,
+            device="cpu",
+            infos=[
+                _TestInfo(
+                    nnodes=4,
+                    num_logical_experts=256,
+                    num_physical_experts=288,
+                    num_repeat=10000,
+                )
+            ],
+        )
+
+    def test_cpu_slow(self):
+        if is_in_ci():
+            return
+        self._test_core(
+            num_gpus=144,
+            device="cpu",
+            infos=[
+                _TestInfo(
+                    nnodes=18,
+                    num_logical_experts=256,
+                    num_physical_experts=288,
+                    num_repeat=10000,
+                )
+            ],
+        )
+
+    def test_gpu(self):
+        if is_in_ci():
+            return
+        self._test_common(device="cuda")
+
+    def _test_common(self, device):
+        infos = []
+
+        for nnodes in [1, 2, 4]:
+            for num_logical_experts in [2, 5, 20, 256]:
+                for num_physical_experts in [8, 16, 256, 288]:
+                    if num_logical_experts > num_physical_experts:
+                        continue
+                    infos.append(
+                        _TestInfo(
+                            nnodes=nnodes,
+                            num_logical_experts=num_logical_experts,
+                            num_physical_experts=num_physical_experts,
+                        )
+                    )
+
+        self._test_core(num_gpus=8, device=device, infos=infos)
+
+    def _test_core(
+        self,
+        num_gpus: int,
+        device: str,
+        infos: List[_TestInfo],
+    ):
+        master_port = find_available_port(23456)
+
+        processes = []
+        output_reader, output_writer = mp.Pipe(duplex=False)
+        for rank in range(num_gpus):
+            p = Process(
+                target=_run_subprocess,
+                kwargs=dict(
+                    rank=rank,
+                    num_gpus=num_gpus,
+                    output_writer=output_writer,
+                    master_port=master_port,
+                    device=device,
+                    infos=infos,
+                ),
+            )
+            p.start()
+            processes.append(p)
+
+        for _ in range(num_gpus):
+            self.assertTrue(
+                output_reader.recv(), f"Subprocess has error, please see logs above."
+            )
+
+        for p in processes:
+            p.join()
+
+
+def _run_subprocess(
+    rank: int,
+    num_gpus: int,
+    master_port: int,
+    device: str,
+    infos: List[_TestInfo],
+    output_writer,
+):
+    try:
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(master_port)
+
+        torch.random.manual_seed(42)
+        torch.distributed.init_process_group(
+            rank=rank,
+            world_size=num_gpus,
+            backend={"cpu": "gloo", "cuda": None}[device],
+        )
+        if device == "cuda":
+            torch.cuda.set_device(f"cuda:{rank}")
+
+        for info in infos:
+            _execute_test(info, rank=rank, num_gpus=num_gpus, device=device)
+
+        execution_ok = True
+    except Exception as e:
+        print(f"subprocess[{rank=}] has error: {e}", flush=True)
+        traceback.print_exc()
+        execution_ok = False
+
+    output_writer.send(execution_ok)
+    output_writer.close()
+
+
+def _execute_test(info: _TestInfo, rank: int, num_gpus: int, device: str):
+    if rank == 0:
+        print(f"Test: {num_gpus=} {info=}", flush=True)
+
+    assert info.num_physical_experts % num_gpus == 0
+    num_local_physical_experts = info.num_physical_experts // num_gpus
+    assert num_gpus % info.nnodes == 0
+    num_gpu_per_node = num_gpus // info.nnodes
+
+    def _create_routed_experts_weights(physical_to_logical_map):
+        local_logical_expert_ids = physical_to_logical_map[
+            rank * num_local_physical_experts : (rank + 1) * num_local_physical_experts
+        ].cpu()
+        return [
+            local_logical_expert_ids.to(device).clone(),
+            torch.tensor(
+                [
+                    [local_logical_expert_id * 10, local_logical_expert_id * 100]
+                    for local_logical_expert_id in local_logical_expert_ids.tolist()
+                ],
+                device=device,
+            ),
+        ]
+
+    def _create_physical_to_logical_map():
+        if rank == 0:
+            ans = torch.concat(
+                [
+                    torch.arange(0, info.num_logical_experts),
+                    torch.randint(
+                        0,
+                        info.num_logical_experts,
+                        (info.num_physical_experts - info.num_logical_experts,),
+                    ),
+                ]
+            )
+            ans = ans[torch.randperm(ans.shape[0])]
+        else:
+            ans = torch.empty((info.num_physical_experts,), dtype=torch.int64)
+
+        assert ans.dtype == torch.int64 and ans.shape == (info.num_physical_experts,)
+        ans = ans.to(device)
+        torch.distributed.broadcast(ans, src=0)
+
+        return ans.cpu()
+
+    physical_to_logical_map = _create_physical_to_logical_map()
+    routed_experts_weights = _create_routed_experts_weights(physical_to_logical_map)
+
+    for i in range(info.num_repeat):
+        if rank == 0 and ((i % 500 == 0) or (i == info.num_repeat - 1)):
+            print(f"Step {i}/{info.num_repeat}", flush=True)
+
+        new_physical_to_logical_map = _create_physical_to_logical_map()
+        expect_new_weights = _create_routed_experts_weights(new_physical_to_logical_map)
+
+        output_logs = expert_location_updater.update_expert_weights_single_layer(
+            routed_experts_weights=routed_experts_weights,
+            temp_buffers=expert_location_updater.create_temp_buffers(
+                routed_experts_weights
+            ),
+            old_physical_to_logical_map=physical_to_logical_map.tolist(),
+            new_physical_to_logical_map=new_physical_to_logical_map.tolist(),
+            num_local_physical_experts=num_local_physical_experts,
+            num_gpu_per_node=num_gpu_per_node,
+            rank=rank,
+            debug=True,
+        )
+
+        local_has_error = not all(
+            torch.all(x == y)
+            for x, y in zip(routed_experts_weights, expect_new_weights, strict=True)
+        )
+        global_has_error = torch.tensor(local_has_error, device=device)
+        torch.distributed.all_reduce(
+            global_has_error, op=torch.distributed.ReduceOp.MAX
+        )
+
+        if global_has_error.cpu().item():
+            output_logs_str = "\n".join(output_logs)
+            local_message = (
+                f"===================== rank {rank} ============================\n"
+                f"{num_gpus=} {info=}\n"
+                f"{routed_experts_weights[0].tolist()=}\n"
+                f"{expect_new_weights[0].tolist()=}\n"
+                f"{physical_to_logical_map.tolist()=}\n"
+                f"{new_physical_to_logical_map.tolist()=}\n"
+                f"===logs===\n"
+                f"{output_logs_str}\n"
+                f"==============================================================\n"
+            )
+
+            global_messages = ([None] * num_gpus) if rank == 0 else None
+            torch.distributed.gather_object(local_message, global_messages, dst=0)
+
+            if rank == 0:
+                print("\n\n".join(global_messages), flush=True)
+            raise AssertionError(f"Error happens, see logs above")
+
+        physical_to_logical_map = new_physical_to_logical_map
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_fa3.py b/test/srt/test_fa3.py
new file mode 100644
index 000000000..c9f286fca
--- /dev/null
+++ b/test/srt/test_fa3.py
@@ -0,0 +1,261 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+GSM_DATASET_PATH = None
+
+# In case of some machine lack internet connection, we can set OFFLINE_MODE to True.
+OFFLINE_MODE = False
+
+# Change the path below when OFFLINE_MODE is True.
+OFFLINE_PATH_DICT = {
+    DEFAULT_MODEL_NAME_FOR_TEST: "/shared/public/elr-models/meta-llama/Meta-Llama-3.1-8B-Instruct",
+    DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3: "/shared/public/elr-models/jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B",
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA: "/shared/public/sharing/deepseek/dsv3-test/snapshots/",
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN: "/shared/public/sharing/deepseek/dsv3-test-NextN/snapshots/",
+    GSM_DATASET_PATH: "/shared/public/data/gsm8k/test.jsonl",
+}
+
+
+if OFFLINE_MODE:
+    DEFAULT_MODEL_NAME_FOR_TEST = OFFLINE_PATH_DICT[DEFAULT_MODEL_NAME_FOR_TEST]
+    DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = OFFLINE_PATH_DICT[
+        DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3
+    ]
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA = OFFLINE_PATH_DICT[DEFAULT_MODEL_NAME_FOR_TEST_MLA]
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = OFFLINE_PATH_DICT[
+        DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN
+    ]
+    GSM_DATASET_PATH = OFFLINE_PATH_DICT[GSM_DATASET_PATH]
+
+
+# Default server arguments shared across all tests
+DEFAULT_SERVER_ARGS = [
+    "--trust-remote-code",
+    "--cuda-graph-max-bs",
+    "8",
+    "--attention-backend",
+    "fa3",
+]
+
+"""
+Integration test for python/sglang/srt/layers/attention/flashattention_backend.py
+"""
+
+
+@unittest.skipIf(get_device_sm() < 90, "Test requires CUDA SM 90 or higher")
+class BaseFlashAttentionTest(CustomTestCase):
+    """Base class for testing FlashAttention3."""
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    base_url = DEFAULT_URL_FOR_TEST
+    accuracy_threshold = 0.65  # derived tests need to override this
+    speculative_decode = False
+    spec_decode_threshold = 1.0  # derived spec decoding tests need to override this
+
+    @classmethod
+    def get_server_args(cls):
+        """Return the arguments for the server launch. Override in subclasses."""
+        return DEFAULT_SERVER_ARGS
+
+    @classmethod
+    def setUpClass(cls):
+        # disable deep gemm precompile to make launch server faster
+        # please don't do this if you want to make your inference workload faster
+        os.environ["SGL_JIT_DEEPGEMM_PRECOMPILE"] = "false"
+        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=cls.get_server_args(),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=4,
+            num_questions=100,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+            data_path=GSM_DATASET_PATH,
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        # Use the appropriate metric key based on the test class
+        metric_key = "accuracy"
+        self.assertGreater(metrics[metric_key], self.accuracy_threshold)
+
+        if self.speculative_decode:
+            server_info = requests.get(self.base_url + "/get_server_info")
+            avg_spec_accept_length = server_info.json()["internal_states"][0][
+                "avg_spec_accept_length"
+            ]
+            print(f"{avg_spec_accept_length=}")
+            self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold)
+
+
+class TestFlashAttention3MLA(BaseFlashAttentionTest):
+    """Test FlashAttention3 with MLA, e.g. deepseek v3 test model"""
+
+    accuracy_threshold = 0.60
+    model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+
+    @classmethod
+    def get_server_args(cls):
+        return DEFAULT_SERVER_ARGS
+
+
+class TestFlashAttention3SpeculativeDecode(BaseFlashAttentionTest):
+    """Test FlashAttention3 with speculative decode enabled with Llama 3.1 8B and its eagle3 model"""
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    accuracy_threshold = 0.65
+    speculative_decode = True
+    spec_decode_threshold = 1.5
+
+    @classmethod
+    def get_server_args(cls):
+        args = DEFAULT_SERVER_ARGS
+        args.extend(
+            [
+                "--cuda-graph-max-bs",
+                "4",
+                "--speculative-algorithm",
+                "EAGLE3",
+                "--speculative-draft-model-path",
+                DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3,
+                "--speculative-num-steps",
+                "3",
+                "--speculative-eagle-topk",
+                "1",
+                "--speculative-num-draft-tokens",
+                "4",
+                "--dtype",
+                "float16",
+            ]
+        )
+        return args
+
+
+class TestFlashAttention3SpeculativeDecodeTopk(BaseFlashAttentionTest):
+    """Tests FlashAttention3 with enhanced speculative decoding using Llama 3.1 8B and EAGLE3.
+    This test will be using top-k value > 1 which would verify the other branches of the FA3 code
+    """
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    accuracy_threshold = 0.65
+    speculative_decode = True
+    spec_decode_threshold = 1.6
+
+    @classmethod
+    def get_server_args(cls):
+        args = DEFAULT_SERVER_ARGS
+        args.extend(
+            [
+                "--cuda-graph-max-bs",
+                "4",
+                "--speculative-algorithm",
+                "EAGLE3",
+                "--speculative-draft-model-path",
+                DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3,
+                "--speculative-num-steps",
+                "5",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "8",
+                "--dtype",
+                "float16",
+            ]
+        )
+        return args
+
+
+class TestFlashAttention3MLASpeculativeDecode(BaseFlashAttentionTest):
+    """Test FlashAttention3 with speculative decode enabled with deepseek v3 test model and its nextN model"""
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+    accuracy_threshold = 0.60
+    speculative_decode = True
+    spec_decode_threshold = 2.5
+
+    @classmethod
+    def get_server_args(cls):
+        args = DEFAULT_SERVER_ARGS
+        args.extend(
+            [
+                "--cuda-graph-max-bs",
+                "4",
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+                "--speculative-num-steps",
+                "3",
+                "--speculative-eagle-topk",
+                "1",
+                "--speculative-num-draft-tokens",
+                "4",
+            ]
+        )
+        return args
+
+
+class TestFlashAttention3MLASpeculativeDecodeTopk(BaseFlashAttentionTest):
+    """Test FlashAttention3 with speculative decode enabled with deepseek v3 test model and its nextN model
+    This test will be using top-k value > 1 which would verify the other branches of the FA3 code
+    """
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+    accuracy_threshold = 0.60
+    speculative_decode = True
+    spec_decode_threshold = 2.95
+
+    @classmethod
+    def get_server_args(cls):
+        args = DEFAULT_SERVER_ARGS
+        args.extend(
+            [
+                "--cuda-graph-max-bs",
+                "4",
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
+                "--speculative-num-steps",
+                "5",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "8",
+            ]
+        )
+        return args
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_fim_completion.py b/test/srt/test_fim_completion.py
new file mode 100644
index 000000000..09db1d4bc
--- /dev/null
+++ b/test/srt/test_fim_completion.py
@@ -0,0 +1,72 @@
+import unittest
+
+import openai
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestFimCompletion(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/deepseek-coder-1.3b-base"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        other_args = ["--completion-template", "deepseek_coder"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=other_args,
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(cls.model)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_fim_completion(self, number_of_completion):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        prompt = "function sum(a: number, b: number): number{\n"
+        suffix = "}"
+
+        prompt_input = self.tokenizer.encode(prompt) + self.tokenizer.encode(suffix)
+        num_prompt_tokens = len(prompt_input) + 2
+
+        response = client.completions.create(
+            model=self.model,
+            prompt=prompt,
+            suffix=suffix,
+            temperature=0.3,
+            max_tokens=32,
+            stream=False,
+            n=number_of_completion,
+        )
+
+        print(response)
+        print(len(response.choices))
+        assert len(response.choices) == number_of_completion
+        assert response.id
+        assert response.created
+        assert response.object == "text_completion"
+        assert (
+            response.usage.prompt_tokens == num_prompt_tokens
+        ), f"{response.usage.prompt_tokens} vs {num_prompt_tokens}"
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def test_fim_completion(self):
+        for number_of_completion in [1, 3]:
+            self.run_fim_completion(number_of_completion)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_flashmla.py b/test/srt/test_flashmla.py
new file mode 100644
index 000000000..681c9b8eb
--- /dev/null
+++ b/test/srt/test_flashmla.py
@@ -0,0 +1,153 @@
+"""
+Usage:
+python3 test/srt/test_flashmla.py
+"""
+
+import unittest
+from types import SimpleNamespace
+
+import requests
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_one_batch,
+)
+
+
+class TestFlashMLAAttnBackend(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(
+                [
+                    "--enable-torch-compile",
+                    "--cuda-graph-max-bs",
+                    "2",
+                    "--attention-backend",
+                    "flashmla",
+                ]
+            )
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+
+class TestFlashMLAAttnLatency(unittest.TestCase):
+    def test_latency(self):
+        _, output_throughput, _ = run_bench_one_batch(
+            DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+            [
+                "--attention-backend",
+                "flashmla",
+                "--enable-torch-compile",
+                "--cuda-graph-max-bs",
+                "16",
+                "--trust-remote-code",
+            ],
+        )
+
+        if is_in_ci():
+            self.assertGreater(output_throughput, 100)
+
+
+class TestFlashMLAMTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(
+                [
+                    "--cuda-graph-max-bs",
+                    "4",
+                    "--disable-radix",
+                    "--enable-torch-compile",
+                    "--torch-compile-max-bs",
+                    "1",
+                    "--speculative-algorithm",
+                    "EAGLE",
+                    "--speculative-draft-model-path",
+                    "lmsys/sglang-ci-dsv3-test-NextN",
+                    "--speculative-num-steps",
+                    "1",
+                    "--speculative-eagle-topk",
+                    "1",
+                    "--speculative-num-draft-tokens",
+                    "2",
+                    "--attention-backend",
+                    "flashmla",
+                ]
+            )
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        print(f"{server_info=}")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, 1.8)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_forward_split_prefill.py b/test/srt/test_forward_split_prefill.py
new file mode 100644
index 000000000..bbd247583
--- /dev/null
+++ b/test/srt/test_forward_split_prefill.py
@@ -0,0 +1,299 @@
+"""
+Test forward_split_prefill functionality.
+
+Usage:
+python3 -m unittest test_forward_split_prefill.TestForwardSplitPrefill
+or
+python3 test_forward_split_prefill.py
+"""
+
+import time
+import unittest
+
+import numpy as np
+import torch
+
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
+
+
+class TestForwardSplitPrefill(CustomTestCase):
+    """Test cases for forward_split_prefill functionality."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up the test environment once for all tests."""
+        cls.model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.tp_size = 1
+        cls.device = "cuda"
+
+        # Initialize server args
+        cls.server_args = ServerArgs(
+            model_path=cls.model_path,
+            tokenizer_path=cls.model_path,
+            host="127.0.0.1",
+            disable_cuda_graph=True,  # Disable CUDA graph for testing split prefill
+            disable_hybrid_swa_memory=True,
+            port=30000,
+            tp_size=cls.tp_size,
+            mem_fraction_static=0.8,
+            trust_remote_code=True,
+        )
+
+        cls.port_args = PortArgs.init_new(cls.server_args)
+
+        # Load model and tokenizer
+        cls.model_config = ModelConfig.from_server_args(cls.server_args)
+        cls.model_runner = ModelRunner(
+            model_config=cls.model_config,
+            mem_fraction_static=cls.server_args.mem_fraction_static,
+            gpu_id=0,
+            tp_rank=0,
+            tp_size=cls.tp_size,
+            pp_rank=0,
+            pp_size=1,
+            nccl_port=cls.port_args.nccl_port,
+            server_args=cls.server_args,
+        )
+
+        cls.tokenizer = get_tokenizer(
+            cls.server_args.tokenizer_path,
+            tokenizer_mode=cls.server_args.tokenizer_mode,
+            trust_remote_code=cls.server_args.trust_remote_code,
+        )
+
+        print(
+            f"Test with model: {cls.model_path}, num_hidden_layers: {cls.model_config.num_hidden_layers}"
+        )
+
+    def prepare_test_batch(self, batch_size=2, input_len=128, is_split_prefill=True):
+        """Prepare a test batch for split prefill testing."""
+        # Create synthetic input
+        input_ids = np.random.randint(10, 1000, (batch_size, input_len), dtype=np.int32)
+
+        sampling_params = SamplingParams(
+            temperature=0.0,
+            max_new_tokens=8,
+        )
+
+        reqs = []
+        for i in range(batch_size):
+            req = Req(
+                rid=i,
+                origin_input_text="",
+                origin_input_ids=list(input_ids[i]),
+                sampling_params=sampling_params,
+            )
+            req.prefix_indices = []
+            req.fill_ids = req.origin_input_ids
+            req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+            req.logprob_start_len = len(req.origin_input_ids) - 1
+            reqs.append(req)
+
+        batch = ScheduleBatch.init_new(
+            reqs=reqs,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool_allocator=self.model_runner.token_to_kv_pool_allocator,
+            tree_cache=None,
+            model_config=self.model_config,
+            enable_overlap=False,
+            spec_algorithm=SpeculativeAlgorithm.NONE,
+            enable_custom_logit_processor=False,
+        )
+        if is_split_prefill:
+            batch.prepare_for_split_prefill()
+        else:
+            batch.prepare_for_extend()
+
+        # Create forward batch
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+
+        return forward_batch
+
+    def test_split_prefill_functionality(self):
+        """Test that split prefill can complete successfully."""
+        print("\n=== Testing split prefill functionality ===")
+
+        forward_batch = self.prepare_test_batch(batch_size=2, input_len=64)
+
+        # Reset split index
+        forward_batch.split_index = 0
+
+        # Test split prefill in chunks
+        num_layers = self.model_config.num_hidden_layers
+        chunk_size = max(1, num_layers // 4)  # Split into 4 chunks
+
+        results = []
+        split_count = 0
+
+        while forward_batch.split_index < num_layers:
+            print(
+                f"Processing split {split_count}, split_index: {forward_batch.split_index}"
+            )
+
+            result = self.model_runner.forward_split_prefill(
+                forward_batch=forward_batch,
+                reinit_attn_backend=(split_count == 0),
+                forward_count=chunk_size,
+            )
+
+            results.append(result)
+            split_count += 1
+
+            # Verify split_index is updated correctly
+            expected_next_index = min(split_count * chunk_size, num_layers)
+            self.assertEqual(forward_batch.split_index, expected_next_index)
+
+        # The last result should contain logits
+        self.assertIsNotNone(results[-1], "Final split should return logits")
+        print(f"Split prefill completed in {split_count} splits")
+
+    def test_split_prefill_vs_normal_prefill(self):
+        """Test that split prefill produces the same results as normal prefill."""
+        print("\n=== Testing split prefill vs normal prefill consistency ===")
+
+        forward_batch_normal = self.prepare_test_batch(
+            batch_size=2, input_len=128, is_split_prefill=False
+        )
+        forward_batch_split = self.prepare_test_batch(
+            batch_size=2, input_len=128, is_split_prefill=True
+        )
+
+        # Ensure same input
+        forward_batch_split.input_ids = forward_batch_normal.input_ids.clone()
+        forward_batch_split.positions = forward_batch_normal.positions.clone()
+
+        # Method 1: Normal extend (prefill)
+        print("Running normal extend (prefill)...")
+        normal_result = self.model_runner.forward_extend(forward_batch_normal)
+
+        # Method 2: Split prefill
+        print("Running split prefill...")
+        num_layers = self.model_config.num_hidden_layers
+        chunk_size = max(1, num_layers // 3)  # Split into 3 chunks
+
+        split_result = None
+
+        while forward_batch_split.split_index < num_layers:
+            result = self.model_runner.forward_split_prefill(
+                forward_batch=forward_batch_split,
+                forward_count=chunk_size,
+            )
+            if result is not None:
+                split_result = result
+
+        # Compare results
+        self.assertIsNotNone(normal_result, "Normal prefill should return result")
+        self.assertIsNotNone(split_result, "Split prefill should return result")
+
+        # Compare logits shapes
+        self.assertEqual(
+            normal_result.next_token_logits.shape,
+            split_result.next_token_logits.shape,
+            "Logits shapes should match",
+        )
+
+        # Compare logits values (should be very close due to same computation)
+        # Use a larger tolerance for numerical differences in split computation
+        torch.testing.assert_close(
+            normal_result.next_token_logits,
+            split_result.next_token_logits,
+            rtol=1e-3,
+            atol=1e-3,
+            msg="Split prefill and normal prefill should produce similar logits",
+        )
+
+        print("✓ Split prefill and normal prefill produce consistent results")
+
+    def test_split_prefill_different_chunk_sizes(self):
+        """Test split prefill with different chunk sizes."""
+        print("\n=== Testing split prefill with different chunk sizes ===")
+
+        num_layers = self.model_config.num_hidden_layers
+        chunk_sizes = [1, 2, max(1, num_layers // 2), num_layers]
+
+        # Prepare identical batches for each test
+        base_batch = self.prepare_test_batch(batch_size=1, input_len=16)
+        base_input_ids = base_batch.input_ids.clone()
+        base_positions = base_batch.positions.clone()
+
+        results = []
+
+        for chunk_size in chunk_sizes:
+            if chunk_size > num_layers:
+                continue
+
+            print(f"Testing chunk size: {chunk_size}")
+
+            # Prepare fresh batch
+            forward_batch = self.prepare_test_batch(batch_size=1, input_len=16)
+            forward_batch.input_ids = base_input_ids.clone()
+            forward_batch.positions = base_positions.clone()
+            forward_batch.split_index = 0
+
+            # Run split prefill
+            split_result = None
+
+            while forward_batch.split_index < num_layers:
+                result = self.model_runner.forward_split_prefill(
+                    forward_batch=forward_batch,
+                    forward_count=chunk_size,
+                )
+                if result is not None:
+                    split_result = result
+
+            self.assertIsNotNone(
+                split_result,
+                f"Split prefill should succeed with chunk_size={chunk_size}",
+            )
+            results.append(split_result)
+
+        # Compare all results should be identical (same input, same computation)
+        if len(results) > 1:
+            for i, result in enumerate(results[1:], 1):
+                torch.testing.assert_close(
+                    results[0].next_token_logits,
+                    result.next_token_logits,
+                    rtol=1e-3,
+                    atol=1e-3,
+                    msg=f"Results with different chunk sizes should be identical (chunk_size {chunk_sizes[i]})",
+                )
+
+        print("✓ All chunk sizes produce consistent results")
+
+    def test_split_prefill_edge_cases(self):
+        """Test edge cases for split prefill."""
+        print("\n=== Testing split prefill edge cases ===")
+
+        # Test with single layer chunks
+        forward_batch = self.prepare_test_batch(batch_size=1, input_len=8)
+
+        # Process one layer at a time
+        num_layers = self.model_config.num_hidden_layers
+        for layer_idx in range(num_layers):
+            result = self.model_runner.forward_split_prefill(
+                forward_batch=forward_batch,
+                reinit_attn_backend=(layer_idx == 0),
+                forward_count=1,  # One layer at a time
+            )
+
+            if layer_idx == num_layers - 1:
+                # Last layer should return result
+                self.assertIsNotNone(result, "Last layer should return logits")
+            else:
+                # Intermediate layers should return None
+                self.assertIsNone(result, f"Layer {layer_idx} should return None")
+
+        print("✓ Single layer processing works correctly")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_full_deepseek_v3.py b/test/srt/test_full_deepseek_v3.py
new file mode 100644
index 000000000..f6a58536a
--- /dev/null
+++ b/test/srt/test_full_deepseek_v3.py
@@ -0,0 +1,162 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.send_one import BenchArgs, send_one_prompt
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+FULL_DEEPSEEK_V3_MODEL_PATH = "deepseek-ai/DeepSeek-V3-0324"
+
+
+class TestDeepseekV3(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = FULL_DEEPSEEK_V3_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code", "--tp", "8"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_a_gsm8k(
+        self,
+    ):  # Append an "a" to make this test run first (alphabetically) to warm up the server
+        args = SimpleNamespace(
+            num_shots=8,
+            data_path=None,
+            num_questions=1400,
+            parallel=1400,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (deepseek-v3)\n" f'{metrics["accuracy"]=:.3f}\n'
+            )
+            self.assertGreater(metrics["accuracy"], 0.935)
+
+    def test_bs_1_speed(self):
+        args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048)
+        acc_length, speed = send_one_prompt(args)
+
+        print(f"{speed=:.2f}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs_1_speed (deepseek-v3)\n" f"{speed=:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(speed, 12)
+            else:
+                self.assertGreater(speed, 75)
+
+
+class TestDeepseekV3MTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = FULL_DEEPSEEK_V3_MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [
+            "--tp",
+            "8",
+            "--trust-remote-code",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "1",
+            "--speculative-num-draft-tokens",
+            "4",
+        ]
+        if not is_in_amd_ci():
+            other_args += ["--mem-frac", "0.7"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_a_gsm8k(
+        self,
+    ):  # Append an "a" to make this test run first (alphabetically) to warm up the server
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (deepseek-v3 mtp)\n"
+                f'{metrics["accuracy"]=:.3f}\n'
+                f"{avg_spec_accept_length=:.2f}\n"
+            )
+            self.assertGreater(metrics["accuracy"], 0.935)
+            self.assertGreater(avg_spec_accept_length, 2.9)
+
+    def test_bs_1_speed(self):
+        args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048)
+        acc_length, speed = send_one_prompt(args)
+
+        print(f"{acc_length=:.2f} {speed=:.2f}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs_1_speed (deepseek-v3 mtp)\n"
+                f"{acc_length=:.2f}\n"
+                f"{speed=:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(acc_length, 2.8)
+            else:
+                self.assertGreater(acc_length, 2.9)
+            if is_in_amd_ci():
+                self.assertGreater(speed, 15)
+            else:
+                self.assertGreater(speed, 130)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_function_call_parser.py b/test/srt/test_function_call_parser.py
new file mode 100644
index 000000000..10003a4db
--- /dev/null
+++ b/test/srt/test_function_call_parser.py
@@ -0,0 +1,2194 @@
+import json
+import unittest
+
+from xgrammar import GrammarCompiler, TokenizerInfo
+
+from sglang.srt.entrypoints.openai.protocol import Function, Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
+from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+from sglang.srt.function_call.kimik2_detector import KimiK2Detector
+from sglang.srt.function_call.llama32_detector import Llama32Detector
+from sglang.srt.function_call.mistral_detector import MistralDetector
+from sglang.srt.function_call.pythonic_detector import PythonicDetector
+from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
+from sglang.srt.function_call.qwen25_detector import Qwen25Detector
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+
+class TestPythonicDetector(unittest.TestCase):
+    def setUp(self):
+        # Create sample tools for testing
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Get weather information",
+                    parameters={
+                        "properties": {
+                            "location": {
+                                "type": "string",
+                                "description": "Location to get weather for",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "description": "Temperature unit",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["location"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="search",
+                    description="Search for information",
+                    parameters={
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "Search query",
+                            },
+                        },
+                        "required": ["query"],
+                    },
+                ),
+            ),
+        ]
+        self.detector = PythonicDetector()
+
+    def test_parse_streaming_no_brackets(self):
+        """Test parsing text with no brackets (no tool calls)."""
+        text = "This is just normal text without any tool calls."
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, text)
+        self.assertEqual(result.calls, [])
+        self.assertEqual(self.detector._buffer, "")  # Buffer should be cleared
+
+    def test_parse_streaming_complete_tool_call(self):
+        """Test parsing a complete tool call."""
+        text = "Here's a tool call: [get_weather(location='New York', unit='celsius')]"
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "Here's a tool call: ")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(
+            self.detector._buffer, ""
+        )  # Buffer should be cleared after processing
+
+        # Check the parameters
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["location"], "New York")
+        self.assertEqual(params["unit"], "celsius")
+
+    def test_parse_streaming_text_before_tool_call(self):
+        """Test parsing text that appears before a tool call."""
+        text = "This is some text before [get_weather(location='London')]"
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "This is some text before ")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+
+        # Check the parameters
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["location"], "London")
+
+    def test_parse_streaming_partial_tool_call(self):
+        """Test parsing a partial tool call that spans multiple chunks."""
+        # First chunk with opening bracket but no closing bracket
+        text1 = "Let me check the weather: [get_weather(location="
+        result1 = self.detector.parse_streaming_increment(text1, self.tools)
+
+        self.assertEqual(result1.normal_text, "Let me check the weather: ")
+        self.assertEqual(result1.calls, [])
+        self.assertEqual(
+            self.detector._buffer, "[get_weather(location="
+        )  # Partial tool call remains in buffer
+
+        # Second chunk completing the tool call
+        text2 = "'Paris')]"
+        result2 = self.detector.parse_streaming_increment(text2, self.tools)
+
+        self.assertEqual(result2.normal_text, "")
+        self.assertEqual(len(result2.calls), 1)
+        self.assertEqual(result2.calls[0].name, "get_weather")
+
+        # Check the parameters
+        params = json.loads(result2.calls[0].parameters)
+        self.assertEqual(params["location"], "Paris")
+        self.assertEqual(
+            self.detector._buffer, ""
+        )  # Buffer should be cleared after processing
+
+    def test_parse_streaming_bracket_without_text_before(self):
+        """Test parsing a tool call that starts at the beginning of the text."""
+        text = "[search(query='python programming')]"
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "search")
+
+        # Check the parameters
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["query"], "python programming")
+
+    def test_parse_streaming_text_after_tool_call(self):
+        """Test parsing text that appears after a tool call."""
+        # First chunk with complete tool call and some text after
+        text = "[get_weather(location='Tokyo')] Here's the forecast:"
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(
+            self.detector._buffer, " Here's the forecast:"
+        )  # Text after tool call remains in buffer
+
+        # Process the remaining text in buffer
+        result2 = self.detector.parse_streaming_increment("", self.tools)
+        self.assertEqual(result2.normal_text, " Here's the forecast:")
+        self.assertEqual(result2.calls, [])
+        self.assertEqual(self.detector._buffer, "")  # Buffer should be cleared
+
+    def test_parse_streaming_multiple_tool_calls(self):
+        """Test parsing multiple tool calls in sequence."""
+        text = "[get_weather(location='Berlin')] and [search(query='restaurants')]"
+
+        # First tool call
+        result1 = self.detector.parse_streaming_increment(text, self.tools)
+        self.assertEqual(len(result1.calls), 1)
+        self.assertEqual(result1.calls[0].name, "get_weather")
+        self.assertEqual(self.detector._buffer, " and [search(query='restaurants')]")
+
+        # Second tool call
+        result2 = self.detector.parse_streaming_increment("", self.tools)
+        self.assertEqual(result2.normal_text, " and ")
+        self.assertEqual(len(result2.calls), 1)
+        self.assertEqual(result2.calls[0].name, "search")
+        self.assertEqual(self.detector._buffer, "")
+
+    def test_parse_streaming_opening_bracket_only(self):
+        """Test parsing text with only an opening bracket but no closing bracket."""
+        text = "Let's try this: ["
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "Let's try this: ")
+        self.assertEqual(result.calls, [])
+        self.assertEqual(
+            self.detector._buffer, "["
+        )  # Opening bracket remains in buffer
+
+    def test_parse_streaming_nested_brackets(self):
+        """Test parsing tool calls with nested brackets in arguments."""
+        # Test with list argument containing nested brackets
+        text = "[get_weather(location='New York', unit='celsius', data=[1, 2, 3])]"
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(self.detector._buffer, "")
+
+        # Check the parameters
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["location"], "New York")
+        self.assertEqual(params["unit"], "celsius")
+        self.assertEqual(params["data"], [1, 2, 3])
+
+    def test_parse_streaming_nested_brackets_dict(self):
+        """Test parsing tool calls with nested dictionaries and lists."""
+        # Test with nested dict and list arguments
+        text = "[search(query='test', config={'options': [1, 2], 'nested': {'key': 'value'}})]"
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "search")
+        self.assertEqual(self.detector._buffer, "")
+
+        # Check the parameters
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["query"], "test")
+        self.assertEqual(params["config"]["options"], [1, 2])
+        self.assertEqual(params["config"]["nested"]["key"], "value")
+
+    def test_parse_streaming_multiple_tools_with_nested_brackets(self):
+        """Test parsing multiple tool calls with nested brackets."""
+        text = "[get_weather(location='Paris', data=[10, 20]), search(query='test', filters=['a', 'b'])]"
+        result = self.detector.parse_streaming_increment(text, self.tools)
+
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(len(result.calls), 2)
+        self.assertEqual(self.detector._buffer, "")
+
+        # Check first tool call
+        params1 = json.loads(result.calls[0].parameters)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(params1["location"], "Paris")
+        self.assertEqual(params1["data"], [10, 20])
+
+        # Check second tool call
+        params2 = json.loads(result.calls[1].parameters)
+        self.assertEqual(result.calls[1].name, "search")
+        self.assertEqual(params2["query"], "test")
+        self.assertEqual(params2["filters"], ["a", "b"])
+
+    def test_parse_streaming_partial_nested_brackets(self):
+        """Test parsing partial tool calls with nested brackets across chunks."""
+        # First chunk with nested brackets but incomplete
+        text1 = "Here's a call: [get_weather(location='Tokyo', data=[1, 2"
+        result1 = self.detector.parse_streaming_increment(text1, self.tools)
+
+        self.assertEqual(result1.normal_text, "Here's a call: ")
+        self.assertEqual(result1.calls, [])
+        self.assertEqual(
+            self.detector._buffer, "[get_weather(location='Tokyo', data=[1, 2"
+        )
+
+        # Second chunk completing the nested brackets
+        text2 = ", 3])]"
+        result2 = self.detector.parse_streaming_increment(text2, self.tools)
+
+        self.assertEqual(result2.normal_text, "")
+        self.assertEqual(len(result2.calls), 1)
+        self.assertEqual(result2.calls[0].name, "get_weather")
+        self.assertEqual(self.detector._buffer, "")
+
+        # Check the parameters
+        params = json.loads(result2.calls[0].parameters)
+        self.assertEqual(params["location"], "Tokyo")
+        self.assertEqual(params["data"], [1, 2, 3])
+
+    def test_parse_streaming_with_python_start_and_end_token(self):
+        """Test parsing a message that starts with <|python_start|> and <|python_end|> across chunks."""
+        chunks = [
+            "Here's a call: ",
+            "<|python_",
+            "start|>[get_weather(location=",
+            "'Tokyo', data=[1, 2",
+            ", 3])]<|python_end|>",
+        ]
+
+        normal_text = ""
+        call_name = ""
+        parameters = ""
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            if result.normal_text:
+                normal_text += result.normal_text
+            if result.calls:
+                call_name += result.calls[0].name
+                parameters += result.calls[0].parameters
+
+        self.assertEqual(normal_text, "Here's a call: ")
+        self.assertEqual(call_name, "get_weather")
+        self.assertEqual(self.detector._buffer, "")
+        self.assertEqual(
+            result.normal_text, "", "Final result should have no normal text"
+        )
+
+        # Check the parameters
+        params = json.loads(parameters)
+        self.assertEqual(params["location"], "Tokyo")
+        self.assertEqual(params["data"], [1, 2, 3])
+
+        chunks = [
+            "Here's a call: <|python_start|>[get_weather(location='Tokyo', data=[1, 2, 3])]<|python_end|>"
+        ]
+
+        normal_text = ""
+        call_name = ""
+        parameters = ""
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            if result.normal_text:
+                normal_text += result.normal_text
+            if result.calls:
+                call_name += result.calls[0].name
+                parameters += result.calls[0].parameters
+
+        self.assertEqual(normal_text, "Here's a call: ")
+        self.assertEqual(call_name, "get_weather")
+        self.assertEqual(self.detector._buffer, "")
+
+        # Check the parameters
+        params = json.loads(parameters)
+        self.assertEqual(params["location"], "Tokyo")
+        self.assertEqual(params["data"], [1, 2, 3])
+
+    def test_detect_and_parse_with_python_start_and_end_token(self):
+        """Test parsing a message that starts with <|python_start|> and contains a valid tool call."""
+        text = "User wants to get the weather in Mars. <|python_start|>[get_weather(location='Mars', unit='celsius')]<|python_end|> In this way we will get the weather in Mars."
+        result = self.detector.detect_and_parse(text, self.tools)
+
+        self.assertEqual(
+            result.normal_text,
+            "User wants to get the weather in Mars.  In this way we will get the weather in Mars.",
+        )
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(self.detector._buffer, "")
+
+        # Check the parameters
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["location"], "Mars")
+        self.assertEqual(params["unit"], "celsius")
+
+
+class TestMistralDetector(unittest.TestCase):
+    def setUp(self):
+        """Set up test tools and detector for Mistral format testing."""
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="make_next_step_decision",
+                    description="Test function for decision making",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "decision": {
+                                "type": "string",
+                                "description": "The next step to take",
+                            },
+                            "content": {
+                                "type": "string",
+                                "description": "The content of the next step",
+                            },
+                        },
+                        "required": ["decision", "content"],
+                    },
+                ),
+            ),
+        ]
+        self.detector = MistralDetector()
+
+    def test_detect_and_parse_with_nested_brackets_in_content(self):
+        """Test parsing Mistral format with nested brackets in JSON content.
+
+        This test case specifically addresses the issue where the regex pattern
+        was incorrectly truncating JSON when it contained nested brackets like [City Name].
+        """
+        # This is the exact problematic text from the original test failure
+        test_text = '[TOOL_CALLS] [{"name":"make_next_step_decision", "arguments":{"decision":"","content":"```\\nTOOL: Access a weather API or service\\nOBSERVATION: Retrieve the current weather data for the top 5 populated cities in the US\\nANSWER: The weather in the top 5 populated cities in the US is as follows: [City Name] - [Weather Conditions] - [Temperature]\\n```"}}]'
+
+        result = self.detector.detect_and_parse(test_text, self.tools)
+
+        # Verify that the parsing was successful
+        self.assertEqual(len(result.calls), 1, "Should detect exactly one tool call")
+
+        call = result.calls[0]
+        self.assertEqual(
+            call.name,
+            "make_next_step_decision",
+            "Should detect the correct function name",
+        )
+
+        # Verify that the parameters are valid JSON and contain the expected content
+        params = json.loads(call.parameters)
+        self.assertEqual(
+            params["decision"], "", "Decision parameter should be empty string"
+        )
+
+        # The content should contain the full text including the nested brackets [City Name]
+        expected_content = "```\nTOOL: Access a weather API or service\nOBSERVATION: Retrieve the current weather data for the top 5 populated cities in the US\nANSWER: The weather in the top 5 populated cities in the US is as follows: [City Name] - [Weather Conditions] - [Temperature]\n```"
+        self.assertEqual(
+            params["content"],
+            expected_content,
+            "Content should include nested brackets without truncation",
+        )
+
+        # Verify that normal text is empty (since the entire input is a tool call)
+        self.assertEqual(
+            result.normal_text, "", "Normal text should be empty for pure tool call"
+        )
+
+    def test_detect_and_parse_simple_case(self):
+        """Test parsing a simple Mistral format tool call without nested brackets."""
+        test_text = '[TOOL_CALLS] [{"name":"make_next_step_decision", "arguments":{"decision":"TOOL", "content":"Use weather API"}}]'
+
+        result = self.detector.detect_and_parse(test_text, self.tools)
+
+        self.assertEqual(len(result.calls), 1)
+        call = result.calls[0]
+        self.assertEqual(call.name, "make_next_step_decision")
+
+        params = json.loads(call.parameters)
+        self.assertEqual(params["decision"], "TOOL")
+        self.assertEqual(params["content"], "Use weather API")
+
+    def test_detect_and_parse_no_tool_calls(self):
+        """Test parsing text without any tool calls."""
+        test_text = "This is just normal text without any tool calls."
+
+        result = self.detector.detect_and_parse(test_text, self.tools)
+
+        self.assertEqual(len(result.calls), 0, "Should detect no tool calls")
+        self.assertEqual(
+            result.normal_text,
+            test_text,
+            "Should return the original text as normal text",
+        )
+
+    def test_detect_and_parse_with_text_before_tool_call(self):
+        """Test parsing text that has content before the tool call."""
+        test_text = 'Here is some text before the tool call: [TOOL_CALLS] [{"name":"make_next_step_decision", "arguments":{"decision":"ANSWER", "content":"The answer is 42"}}]'
+
+        result = self.detector.detect_and_parse(test_text, self.tools)
+
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.normal_text, "Here is some text before the tool call:")
+
+        call = result.calls[0]
+        self.assertEqual(call.name, "make_next_step_decision")
+
+        params = json.loads(call.parameters)
+        self.assertEqual(params["decision"], "ANSWER")
+        self.assertEqual(params["content"], "The answer is 42")
+
+
+class TestEBNFGeneration(unittest.TestCase):
+    def setUp(self):
+        # Create sample tools for testing
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Get weather information",
+                    parameters={
+                        "properties": {
+                            "location": {
+                                "type": "string",
+                                "description": "Location to get weather for",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "description": "Temperature unit",
+                                "enum": ["celsius", "fahrenheit"],
+                            },
+                        },
+                        "required": ["location"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="search",
+                    description="Search for information",
+                    parameters={
+                        "properties": {
+                            "query": {
+                                "type": "string",
+                                "description": "Search query",
+                            },
+                        },
+                        "required": ["query"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="empty_param_func",
+                    description="Function with empty parameters",
+                    parameters={
+                        "properties": {},
+                        "required": [],
+                    },
+                ),
+            ),
+        ]
+
+        self.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+        tokenizer_info = TokenizerInfo.from_huggingface(self.tokenizer)
+        self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
+
+        # Initialize all detectors
+        self.pythonic_detector = PythonicDetector()
+        self.deepseekv3_detector = DeepSeekV3Detector()
+        self.llama32_detector = Llama32Detector()
+        self.mistral_detector = MistralDetector()
+        self.qwen25_detector = Qwen25Detector()
+        self.qwen3_coder_detector = Qwen3CoderDetector()
+        self.kimik2_detector = KimiK2Detector()
+        self.glm45_detector = Glm4MoeDetector()
+
+    def test_pythonic_detector_ebnf(self):
+        """Test that the PythonicDetector generates valid EBNF."""
+        ebnf = self.pythonic_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+
+        # Check that the EBNF contains expected patterns
+        self.assertIn('call_get_weather ::= "get_weather" "(" ', ebnf)
+        self.assertIn('"location" "=" basic_string', ebnf)
+        self.assertIn('( "unit" "=" ("\\"celsius\\"" | "\\"fahrenheit\\"") )', ebnf)
+
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_deepseekv3_detector_ebnf(self):
+        """Test that the DeepSeekV3Detector generates valid EBNF."""
+        ebnf = self.deepseekv3_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+
+        # Check that the EBNF contains expected patterns
+        self.assertIn("<｜tool▁calls▁begin｜>", ebnf)
+        self.assertIn("<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_weather", ebnf)
+        self.assertIn('\\"location\\"" ws ":" ws basic_string ', ebnf)
+
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_kimik2_detector_ebnf(self):
+        """Test that the KimiK2Detector generates valid EBNF."""
+        ebnf = self.kimik2_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+
+        # Check that the EBNF contains expected patterns for KimiK2 format
+        self.assertIn("<|tool_calls_section_begin|>", ebnf)
+        self.assertIn("<|tool_calls_section_end|>", ebnf)
+
+        # Check for KimiK2-specific function call structure
+        self.assertIn("<|tool_call_begin|>functions.get_weather:", ebnf)
+        self.assertIn("<|tool_call_begin|>functions.search:", ebnf)
+        self.assertIn("<|tool_call_argument_begin|>", ebnf)
+        self.assertIn("<|tool_call_end|>", ebnf)
+
+        # Check that it uses the correct namespace.function format with numeric index pattern
+        self.assertIn("functions.get_weather:", ebnf)
+        self.assertIn("functions.search:", ebnf)
+        self.assertIn("[0-9]+", ebnf)  # Numeric index pattern
+
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_llama32_detector_ebnf(self):
+        """Test that the Llama32Detector generates valid EBNF."""
+        ebnf = self.llama32_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+
+        # Check that the EBNF contains expected patterns
+        self.assertIn('\\"name\\"" ws ":" ws "\\"get_weather\\"', ebnf)
+        self.assertIn('"\\"arguments\\"" ws ":"', ebnf)
+
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_mistral_detector_ebnf(self):
+        """Test that the MistralDetector generates valid EBNF."""
+        ebnf = self.mistral_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+
+        # Check that the EBNF contains expected patterns
+        self.assertIn('"[TOOL_CALLS] ["', ebnf)
+        self.assertIn("call_get_weather | call_search", ebnf)
+        self.assertIn('"\\"arguments\\"" ws ":"', ebnf)
+
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_qwen25_detector_ebnf(self):
+        """Test that the Qwen25Detector generates valid EBNF."""
+        ebnf = self.qwen25_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+
+        # Check that the EBNF contains expected patterns
+        self.assertIn("<tool_call>", ebnf)
+        self.assertIn('\\"name\\"" ws ":" ws "\\"get_weather\\"', ebnf)
+        self.assertIn('"\\"arguments\\"" ws ":"', ebnf)
+
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_glm45_detector_ebnf(self):
+        """Test that the Glm4MoeDetector generates valid EBNF."""
+        ebnf = self.glm45_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+        # Check that the EBNF contains expected patterns for XML format
+        self.assertIn('"<tool_call>" function_call "</tool_call>"', ebnf)
+        self.assertIn('"get_weather" "\\n" ( arguments_get_weather "\\n" )?', ebnf)
+        self.assertIn(
+            '"<arg_key>location</arg_key>" "\\n" "<arg_value>" xml_text "</arg_value>" ( "\\n" ( "<arg_key>unit</arg_key>" "\\n" "<arg_value>" ("celsius" | "fahrenheit") "</arg_value>" ) )?',
+            ebnf,
+        )
+        self.assertIn('"search" "\\n" ( arguments_search "\\n" )?', ebnf)
+        self.assertIn(
+            '"<arg_key>query</arg_key>" "\\n" "<arg_value>" xml_text "</arg_value>"',
+            ebnf,
+        )
+        self.assertIn(
+            '"empty_param_func" "\\n" ( arguments_empty_param_func "\\n" )?', ebnf
+        )
+        self.assertIn('arguments_empty_param_func ::= ""', ebnf)
+
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_qwen3_coder_detector_ebnf(self):
+        """Test that the Qwen3CoderDetector generates valid EBNF."""
+        ebnf = self.qwen3_coder_detector.build_ebnf(self.tools)
+        self.assertIsNotNone(ebnf)
+        # Check that the EBNF contains expected patterns for XML format
+        self.assertIn("<tool_call>", ebnf)
+        self.assertIn("</tool_call>", ebnf)
+        self.assertIn('"<function=get_weather>\\n"', ebnf)
+        self.assertIn('"\\n</function>"', ebnf)
+        self.assertIn('"<parameter=location>\\n"', ebnf)
+        self.assertIn('"\\n</parameter>"', ebnf)
+        # Check that it uses xml_text for string parameters
+        self.assertIn("xml_text", ebnf)
+        # Validate that the EBNF can be compiled by GrammarCompiler
+        try:
+            ctx = self.grammar_compiler.compile_grammar(ebnf)
+            self.assertIsNotNone(ctx, "EBNF should be valid and compile successfully")
+        except RuntimeError as e:
+            self.fail(f"Failed to compile EBNF: {e}")
+
+    def test_weather_function_optional_parameter_handling(self):
+        """Test that weather function with optional unit parameter generates correct EBNF without trailing commas."""
+        # Create a weather tool with required location and optional unit
+        weather_tool = Tool(
+            type="function",
+            function=Function(
+                name="get_current_weather",
+                description="Get the current weather in a given location",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                    },
+                    "required": ["location"],
+                },
+            ),
+        )
+
+        # Test all detectors with the weather tool
+        detectors = {
+            "pythonic": self.pythonic_detector,
+            "deepseekv3": self.deepseekv3_detector,
+            "llama32": self.llama32_detector,
+            "mistral": self.mistral_detector,
+            "qwen25": self.qwen25_detector,
+        }
+
+        for name, detector in detectors.items():
+            with self.subTest(detector=name):
+                ebnf = detector.build_ebnf([weather_tool])
+                self.assertIsNotNone(ebnf, f"{name} detector should generate EBNF")
+
+                # Check that the EBNF properly handles optional parameters
+                if name == "pythonic":
+                    # Pythonic format: location="Paris" ( , ( unit=("celsius" | "fahrenheit") )?
+                    self.assertIn('"location" "=" basic_string', ebnf)
+                    # The comma should be inside the optional brackets for unit
+                    self.assertIn('( ws "," ws ( "unit" "=" ', ebnf)
+                else:
+                    # JSON format: "location": "Paris" ( , ( "unit": ("celsius" | "fahrenheit") )?
+                    self.assertIn('"location\\"" ws ":" ws basic_string', ebnf)
+                    # The comma should be part of the optional group
+                    # This pattern ensures no trailing comma when unit is omitted
+                    self.assertIn('( ws "," ws ( "\\"unit\\"" ws ":"', ebnf)
+
+                # Validate that the EBNF can be compiled
+                try:
+                    ctx = self.grammar_compiler.compile_grammar(ebnf)
+                    self.assertIsNotNone(
+                        ctx, f"{name} EBNF should compile successfully"
+                    )
+                except RuntimeError as e:
+                    self.fail(f"Failed to compile {name} EBNF: {e}")
+
+    def test_multiple_optional_parameters_flexible_ordering(self):
+        """Test that multiple optional parameters allow flexible ordering using llama.cpp approach."""
+        # Create a tool with one required and multiple optional parameters
+        test_tool = Tool(
+            type="function",
+            function=Function(
+                name="test_func",
+                description="Test function with multiple optional parameters",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "required_field": {"type": "string"},
+                        "opt1": {"type": "number"},
+                        "opt2": {"type": "boolean"},
+                        "opt3": {"type": "string"},
+                    },
+                    "required": ["required_field"],
+                },
+            ),
+        )
+
+        # Test JSON-based detectors (not pythonic)
+        json_detectors = {
+            "deepseekv3": self.deepseekv3_detector,
+            "llama32": self.llama32_detector,
+            "mistral": self.mistral_detector,
+            "qwen25": self.qwen25_detector,
+        }
+
+        for name, detector in json_detectors.items():
+            with self.subTest(detector=name):
+                ebnf = detector.build_ebnf([test_tool])
+                self.assertIsNotNone(ebnf, f"{name} detector should generate EBNF")
+
+                # Print the arguments rule for debugging
+                lines = ebnf.split("\n")
+                args_rule = None
+                for line in lines:
+                    if line.startswith("arguments_test_func ::="):
+                        args_rule = line
+                        break
+
+                self.assertIsNotNone(
+                    args_rule, f"{name} should have arguments_test_func rule"
+                )
+
+                # Check required field
+                self.assertIn('"required_field\\"" ws ":" ws basic_string', ebnf)
+
+                # Check the structure for optional parameters
+                # The pattern should be: required_field ( "," ( opt1 ... | opt2 ... | opt3 ... ) )?
+                # This allows flexible ordering where any optional can be first
+
+                # Check that optional parameters are in a group with comma
+                if args_rule:  # Only check if args_rule was found
+                    self.assertIn(
+                        '( ws "," ws (',
+                        args_rule,
+                        f"{name} should have comma grouped with optional parameters",
+                    )
+
+                    # Check for the alternation pattern that allows flexible ordering
+                    # Should contain patterns like: opt1 ... | opt2 ... | opt3
+                    self.assertIn('"opt1\\"" ws ":" ws basic_number', args_rule)
+                    self.assertIn('"opt2\\"" ws ":" ws basic_boolean', args_rule)
+                    self.assertIn('"opt3\\"" ws ":" ws basic_string', args_rule)
+
+                    # Check for alternation (|) which allows skipping optional parameters
+                    self.assertIn(
+                        "|",
+                        args_rule,
+                        f"{name} should use alternation for flexible optional ordering",
+                    )
+
+                    # Check that the pattern ends properly with closing braces
+                    self.assertTrue(
+                        args_rule.endswith('"}"'),
+                        f"{name} arguments rule should end with closing brace",
+                    )
+
+                # Validate compilation
+                try:
+                    ctx = self.grammar_compiler.compile_grammar(ebnf)
+                    self.assertIsNotNone(
+                        ctx, f"{name} EBNF should compile successfully"
+                    )
+                except RuntimeError as e:
+                    self.fail(f"Failed to compile {name} EBNF: {e}")
+
+    def test_all_optional_parameters_ordering(self):
+        """Test the behavior when ALL parameters are optional - verifies ordering constraints."""
+        # Create a tool with only optional parameters
+        all_optional_tool = Tool(
+            type="function",
+            function=Function(
+                name="optional_func",
+                description="Function with all optional parameters",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "opt1": {"type": "string"},
+                        "opt2": {"type": "number"},
+                        "opt3": {"type": "boolean"},
+                    },
+                    "required": [],  # No required parameters
+                },
+            ),
+        )
+
+        # Test JSON-based detectors
+        json_detectors = {
+            "deepseekv3": self.deepseekv3_detector,
+            "llama32": self.llama32_detector,
+            "mistral": self.mistral_detector,
+            "qwen25": self.qwen25_detector,
+        }
+
+        for name, detector in json_detectors.items():
+            with self.subTest(detector=name):
+                ebnf = detector.build_ebnf([all_optional_tool])
+                self.assertIsNotNone(ebnf, f"{name} detector should generate EBNF")
+
+                # Extract the arguments rule
+                lines = ebnf.split("\n")
+                args_rule = None
+                for line in lines:
+                    if line.startswith("arguments_optional_func ::="):
+                        args_rule = line
+                        break
+
+                self.assertIsNotNone(
+                    args_rule, f"{name} should have arguments_optional_func rule"
+                )
+
+                if args_rule:
+                    # When all parameters are optional, the pattern now uses alternation:
+                    # "{" ( opt1 ... | opt2 ... | opt3 ... )? "}"
+                    # This allows flexible ordering where any optional can appear first
+
+                    # Check the structure
+                    self.assertIn('"opt1\\"" ws ":" ws basic_string', args_rule)
+                    self.assertIn('"opt2\\"" ws ":" ws basic_number', args_rule)
+                    self.assertIn('"opt3\\"" ws ":" ws basic_boolean', args_rule)
+
+                    # The pattern SHOULD have alternation (|) for flexible ordering
+                    self.assertIn(
+                        "|",
+                        args_rule,
+                        f"{name} should use alternation for flexible ordering even when all properties are optional",
+                    )
+
+                # Validate compilation
+                try:
+                    ctx = self.grammar_compiler.compile_grammar(ebnf)
+                    self.assertIsNotNone(
+                        ctx, f"{name} EBNF should compile successfully"
+                    )
+                except RuntimeError as e:
+                    self.fail(f"Failed to compile {name} EBNF: {e}")
+
+
+class TestBaseFormatDetector(unittest.TestCase):
+    """Test buffer management and sequential tool index assignment in BaseFormatDetector."""
+
+    def setUp(self):
+        """Set up test detector and tools."""
+
+        # Create a concrete implementation of BaseFormatDetector for testing
+        class TestFormatDetector(BaseFormatDetector):
+            def __init__(self):
+                super().__init__()
+                self.bot_token = "<tool_call>"
+                self.eot_token = "</tool_call>"
+
+            def detect_and_parse(self, text, tools):
+                # Not used in streaming tests
+                pass
+
+            def has_tool_call(self, text):
+                return "<tool_call>" in text
+
+            def structure_info(self):
+                # Not used in streaming tests
+                pass
+
+            def build_ebnf(self, tools):
+                # Not used in streaming tests
+                pass
+
+        self.detector = TestFormatDetector()
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Get weather information",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_tourist_attractions",
+                    description="Get tourist attractions",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+        ]
+
+    def test_sequential_tool_index_assignment(self):
+        """Test that multiple tool calls get sequential tool_index values (0, 1, 2, ...)."""
+        # Simulate streaming chunks for two consecutive tool calls
+        chunks = [
+            "<tool_call>",
+            '{"name": "get_weather", ',
+            '"arguments": {"city": "Paris"}}',
+            ", ",
+            '{"name": "get_tourist_attractions", ',
+            '"arguments": {"city": "London"}}',
+            "</tool_call>",
+        ]
+
+        tool_indices_seen = []
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+
+            if result.calls:
+                for call in result.calls:
+                    if call.tool_index is not None:
+                        tool_indices_seen.append(call.tool_index)
+
+        # Verify we got sequential tool indices
+        unique_indices = sorted(set(tool_indices_seen))
+        self.assertEqual(
+            unique_indices,
+            [0, 1],
+            f"Expected sequential tool indices [0, 1], got {unique_indices}",
+        )
+
+    def test_buffer_content_preservation(self):
+        """Test that buffer correctly preserves unprocessed content when tool completes."""
+        # Test simpler scenario: tool completion followed by new tool start
+        chunks = [
+            "<tool_call>",
+            '{"name": "get_weather", ',
+            '"arguments": {"city": "Paris"}}',
+            ", ",
+            '{"name": "get_tourist_attractions", ',
+            '"arguments": {"city": "London"}} </tool_call>',
+        ]
+
+        tool_calls_seen = []
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            if result.calls:
+                for call in result.calls:
+                    if (
+                        call.name
+                    ):  # Only count calls with names (not just parameter updates)
+                        tool_calls_seen.append(call.name)
+
+        # Should see both tool names
+        self.assertIn("get_weather", tool_calls_seen, "Should process first tool")
+        self.assertIn(
+            "get_tourist_attractions", tool_calls_seen, "Should process second tool"
+        )
+
+    def test_current_tool_id_increment_on_completion(self):
+        """Test that current_tool_id increments when a tool completes."""
+        # Initial state
+        self.assertEqual(
+            self.detector.current_tool_id, -1, "Should start with current_tool_id=-1"
+        )
+
+        # Process first tool completely
+        chunks = [
+            "<tool_call>",
+            '{"name": "get_weather", ',
+        ]
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+
+        self.assertEqual(
+            self.detector.current_tool_id, 0, "current_tool_id should be 0"
+        )
+        self.assertEqual(
+            result.calls[0].name, "get_weather", "The first tool should be get_weather"
+        )
+        self.assertEqual(
+            result.calls[0].tool_index, 0, "The first tool index should be 0"
+        )
+
+        # Complete second tool name - this should show that current_tool_id is now 1
+        result = self.detector.parse_streaming_increment(
+            '"arguments": {"city": "Paris"}}, {"name": "get_', self.tools
+        )
+        self.assertEqual(result.calls[0].parameters, '{"city": "Paris"}')
+
+        self.assertEqual(
+            self.detector.current_tool_id,
+            1,
+            "current_tool_id should be 1 after first tool completes and second tool starts",
+        )
+
+        result = self.detector.parse_streaming_increment(
+            'tourist_attractions", ', self.tools
+        )
+
+        # Second tool should have tool_index=1
+        tourist_calls = [
+            call for call in result.calls if call.name == "get_tourist_attractions"
+        ]
+        self.assertEqual(
+            tourist_calls[0].tool_index, 1, "Second tool should have tool_index=1"
+        )
+
+    def test_tool_name_streaming_with_correct_index(self):
+        """Test that tool names are streamed with correct tool_index values."""
+        # Process first tool
+        self.detector.parse_streaming_increment("<tool_call>", self.tools)
+        result1 = self.detector.parse_streaming_increment(
+            '{"name": "get_weather", ', self.tools
+        )
+
+        # First tool name should have tool_index=0
+        weather_calls = [call for call in result1.calls if call.name == "get_weather"]
+        self.assertEqual(len(weather_calls), 1, "Should have one weather call")
+        self.assertEqual(
+            weather_calls[0].tool_index, 0, "First tool should have tool_index=0"
+        )
+
+        # Complete first tool
+        self.detector.parse_streaming_increment(
+            '"arguments": {"city": "Paris"}}', self.tools
+        )
+
+        # Start second tool
+        self.detector.parse_streaming_increment(", ", self.tools)
+        result2 = self.detector.parse_streaming_increment(
+            '{"name": "get_tourist_attractions", ', self.tools
+        )
+
+        # Second tool name should have tool_index=1
+        tourist_calls = [
+            call for call in result2.calls if call.name == "get_tourist_attractions"
+        ]
+        self.assertEqual(
+            len(tourist_calls), 1, "Should have one tourist attractions call"
+        )
+        self.assertEqual(
+            tourist_calls[0].tool_index, 1, "Second tool should have tool_index=1"
+        )
+
+    def test_buffer_reset_on_invalid_tool(self):
+        """Test that buffer and state are reset when an invalid tool name is encountered."""
+        # Start fresh with an invalid tool name from the beginning
+        result = self.detector.parse_streaming_increment(
+            '<tool_call>{"name": "invalid_tool", ', self.tools
+        )
+
+        # Should return empty result and reset state
+        self.assertEqual(result.calls, [], "Should return no calls for invalid tool")
+        self.assertEqual(
+            self.detector.current_tool_id,
+            -1,
+            "current_tool_id should remain -1 for invalid tool",
+        )
+        self.assertEqual(
+            self.detector._buffer, "", "Buffer should be cleared for invalid tool"
+        )
+
+
+class TestLlama32Detector(unittest.TestCase):
+    def setUp(self):
+        """Set up test tools and detector for Mistral format testing."""
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Get weather information",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_tourist_attractions",
+                    description="Get tourist attractions",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+        ]
+        self.detector = Llama32Detector()
+
+    def test_single_json(self):
+        text = '{"name": "get_weather", "parameters": {"city": "Paris"}}'
+        result = self.detector.detect_and_parse(text, self.tools)
+        assert len(result.calls) == 1
+        assert result.calls[0].name == "get_weather"
+        assert result.normal_text == ""
+
+    def test_multiple_json_with_separator(self):
+        text = (
+            '<|python_tag|>{"name": "get_weather", "parameters": {"city": "Paris"}};'
+            '{"name": "get_tourist_attractions", "parameters": {"city": "Paris"}}'
+        )
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 2)
+        self.assertEqual(result.calls[1].name, "get_tourist_attractions")
+        self.assertEqual(result.normal_text, "")
+
+    def test_multiple_json_with_separator_customized(self):
+        text = (
+            '<|python_tag|>{"name": "get_weather", "parameters": {}}'
+            '<|python_tag|>{"name": "get_tourist_attractions", "parameters": {}}'
+        )
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 2)
+        self.assertEqual(result.calls[1].name, "get_tourist_attractions")
+        self.assertEqual(result.normal_text, "")
+
+    def test_json_with_trailing_text(self):
+        text = '{"name": "get_weather", "parameters": {}} Some follow-up text'
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 1)
+        self.assertIn("follow-up", result.normal_text)
+
+    def test_invalid_then_valid_json(self):
+        text = (
+            '{"name": "get_weather", "parameters": {'  # malformed
+            '{"name": "get_weather", "parameters": {}}'
+        )
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+
+    def test_plain_text_only(self):
+        text = "This is just plain explanation text."
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(result.calls, [])
+        self.assertEqual(result.normal_text, text)
+
+    def test_with_python_tag_prefix(self):
+        text = 'Some intro. <|python_tag|>{"name": "get_weather", "parameters": {}}'
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 1)
+        self.assertTrue(result.normal_text.strip().startswith("Some intro."))
+
+
+class TestKimiK2Detector(unittest.TestCase):
+
+    def setUp(self):
+        """Set up test tools and detector."""
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Get weather information",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_tourist_attractions",
+                    description="Get tourist attractions",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+        ]
+        self.detector = KimiK2Detector()
+
+    def test_single_tool_call(self):
+        """Test parsing a single tool call in a complete text."""
+        text = '<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"city": "Paris"}<|tool_call_end|><|tool_calls_section_end|>'
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(result.calls[0].parameters, '{"city": "Paris"}')
+        self.assertEqual(result.normal_text, "")
+
+    def test_multiple_tool_calls(self):
+        """Test parsing multiple tool calls in a complete text."""
+        text = '<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"city": "Paris"}<|tool_call_end|><|tool_call_begin|>functions.get_tourist_attractions:1<|tool_call_argument_begin|>{"city": "London"}<|tool_call_end|><|tool_calls_section_end|>'
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 2)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(result.calls[0].parameters, '{"city": "Paris"}')
+        self.assertEqual(result.calls[1].name, "get_tourist_attractions")
+        self.assertEqual(result.calls[1].parameters, '{"city": "London"}')
+        self.assertEqual(result.normal_text, "")
+
+    def test_streaming_tool_call(self):
+        """Test streaming incremental parsing of a tool call."""
+        chunks = [
+            "<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{",
+            '"city": "Paris"',
+            "}",
+            "<|tool_call_end|><|tool_calls_section_end|>",
+        ]
+
+        tool_calls = []
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            for tool_call_chunk in result.calls:
+                if tool_call_chunk.tool_index is not None:
+
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": ""})
+
+                    tc = tool_calls[tool_call_chunk.tool_index]
+
+                    if tool_call_chunk.name:
+                        tc["name"] += tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] += tool_call_chunk.parameters
+
+        self.assertEqual(len(tool_calls), 1)
+        self.assertEqual(tool_calls[0]["name"], "get_weather")
+        self.assertEqual(tool_calls[0]["parameters"], '{"city": "Paris"}')
+
+    def test_streaming_multiple_tool_calls(self):
+        """Test streaming incremental parsing of multiple tool calls."""
+        chunks = [
+            "<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{",
+            '"city": "Paris"',
+            "}<|tool_call_end|>",
+            "<|tool_call_begin|>functions.get_tourist_attractions:1<|tool_call_argument_begin|>{",
+            '"city": "London"',
+            "}<|tool_call_end|>",
+            "<|tool_calls_section_end|>",
+        ]
+
+        tool_calls = []
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            for tool_call_chunk in result.calls:
+                if tool_call_chunk.tool_index is not None:
+
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": ""})
+
+                    tc = tool_calls[tool_call_chunk.tool_index]
+
+                    if tool_call_chunk.name:
+                        tc["name"] += tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] += tool_call_chunk.parameters
+
+        self.assertEqual(len(tool_calls), 2)
+        self.assertEqual(tool_calls[0]["name"], "get_weather")
+        self.assertEqual(tool_calls[0]["parameters"], '{"city": "Paris"}')
+        self.assertEqual(tool_calls[1]["name"], "get_tourist_attractions")
+        self.assertEqual(tool_calls[1]["parameters"], '{"city": "London"}')
+
+    def test_tool_call_completion(self):
+        """Test that the buffer and state are reset after a tool call is completed."""
+        chunks = [
+            "<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{",
+            '"city": "Paris"',
+            "}",
+            "<|tool_call_end|>",
+            "<|tool_calls_section_end|>",
+        ]
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+
+        # After processing all chunks, the buffer should be empty and current_tool_id should be reset
+        self.assertEqual(self.detector._buffer, "")
+        self.assertEqual(self.detector.current_tool_id, 1)
+
+    def test_tool_name_streaming(self):
+        """Test that tool names are streamed correctly with the right index."""
+        chunks = [
+            "<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{",
+            '"city": "Paris"',
+            "}",
+            "<|tool_call_end|>",
+            "<|tool_call_begin|>functions.get_tourist_attractions:1<|tool_call_argument_begin|>{",
+        ]
+
+        tool_calls = []
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            for tool_call_chunk in result.calls:
+                if tool_call_chunk.tool_index is not None:
+
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": ""})
+
+                    tc = tool_calls[tool_call_chunk.tool_index]
+
+                    if tool_call_chunk.name:
+                        tc["name"] += tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] += tool_call_chunk.parameters
+
+        self.assertEqual(len(tool_calls), 2)
+        self.assertEqual(tool_calls[0]["name"], "get_weather")
+        self.assertEqual(tool_calls[0]["parameters"], '{"city": "Paris"}')
+        self.assertEqual(tool_calls[1]["name"], "get_tourist_attractions")
+
+    def test_invalid_tool_call(self):
+        """Test that invalid tool calls are handled correctly."""
+        text = 'invalid_tool:0<|tool_call_argument_begin|>{"city": "Paris"}<|tool_call_end|><|tool_calls_section_end|>'
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 0)
+        self.assertEqual(result.normal_text, text)
+
+    def test_partial_tool_call(self):
+        """Test that partial tool calls are handled correctly in streaming mode."""
+        chunks = [
+            "<|tool_calls_section_begin|><|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{",
+            '"city": "Paris"',
+        ]
+
+        tool_calls = []
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            for tool_call_chunk in result.calls:
+                if tool_call_chunk.tool_index is not None:
+
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": ""})
+
+                    tc = tool_calls[tool_call_chunk.tool_index]
+
+                    if tool_call_chunk.name:
+                        tc["name"] += tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] += tool_call_chunk.parameters
+
+        self.assertEqual(len(tool_calls), 1)
+        self.assertEqual(tool_calls[0]["name"], "get_weather")
+        self.assertEqual(tool_calls[0]["parameters"], '{"city": "Paris"')
+
+
+class TestDeepSeekV3Detector(unittest.TestCase):
+    def setUp(self):
+        """Set up test tools and detector for DeepSeekV3 format testing."""
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Get weather information",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_tourist_attractions",
+                    description="Get tourist attractions",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            ),
+        ]
+        self.detector = DeepSeekV3Detector()
+
+    def test_parse_streaming_multiple_tool_calls_with_multi_token_chunk(self):
+        """Test parsing multiple tool calls when streaming chunks contains multi-tokens (e.g. DeepSeekV3 enable MTP)"""
+        # Simulate streaming chunks with multi-tokens for two consecutive tool calls
+        chunks = [
+            "<｜tool▁calls▁begin｜>",
+            "<｜tool▁call▁begin｜>function",
+            "<｜tool▁sep｜>get",
+            "_weather\n",
+            "```json\n",
+            '{"city":',
+            '"Shanghai',
+            '"}\n```<｜tool▁call▁end｜>',
+            "\n<｜tool▁call▁begin｜>",
+            "function<｜tool▁sep｜>",
+            "get_tour",
+            "ist_att",
+            "ractions\n```" 'json\n{"',
+            'city": "',
+            'Beijing"}\n',
+            "```<｜tool▁call▁end｜>",
+            "<｜tool▁calls▁end｜>",
+        ]
+
+        tool_calls_seen = []
+        tool_calls_parameters = []
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            if result.calls:
+                for call in result.calls:
+                    if call.name:
+                        tool_calls_seen.append(call.name)
+                    if call.parameters:
+                        tool_calls_parameters.append(call.parameters)
+
+        # Should see both tool names
+        self.assertIn("get_weather", tool_calls_seen, "Should process first tool")
+        self.assertIn(
+            "get_tourist_attractions", tool_calls_seen, "Should process second tool"
+        )
+
+        # Verify that the parameters are valid JSON and contain the expected content
+        params1 = json.loads(tool_calls_parameters[0])
+        params2 = json.loads(tool_calls_parameters[1])
+        self.assertEqual(params1["city"], "Shanghai")
+        self.assertEqual(params2["city"], "Beijing")
+
+
+class TestQwen3CoderDetector(unittest.TestCase):
+    def setUp(self):
+        # Create sample tools for testing
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_current_weather",
+                    description="Get the current weather",
+                    parameters={
+                        "properties": {
+                            "city": {"type": "string", "description": "The city name"},
+                            "state": {
+                                "type": "string",
+                                "description": "The state code",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "enum": ["fahrenheit", "celsius"],
+                            },
+                        },
+                        "required": ["city", "state"],
+                    },
+                ),
+            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="calculate_area",
+                    description="Calculate area of a shape",
+                    parameters={
+                        "properties": {
+                            "shape": {"type": "string"},
+                            "dimensions": {"type": "object"},
+                            "precision": {"type": "integer"},
+                        }
+                    },
+                ),
+            ),
+        ]
+        self.detector = Qwen3CoderDetector()
+
+    def test_has_tool_call(self):
+        """Test detection of tool call markers."""
+        self.assertTrue(self.detector.has_tool_call("<tool_call>test</tool_call>"))
+        self.assertFalse(self.detector.has_tool_call("No tool call here"))
+
+    def test_detect_and_parse_no_tools(self):
+        """Test parsing text without tool calls."""
+        model_output = "This is a test response without any tool calls"
+        result = self.detector.detect_and_parse(model_output, tools=[])
+        self.assertEqual(result.normal_text, model_output)
+        self.assertEqual(result.calls, [])
+
+    def test_detect_and_parse_single_tool(self):
+        """Test parsing a single tool call."""
+        model_output = """<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=self.tools)
+
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_current_weather")
+
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["city"], "Dallas")
+        self.assertEqual(params["state"], "TX")
+        self.assertEqual(params["unit"], "fahrenheit")
+
+    def test_detect_and_parse_with_content(self):
+        """Test parsing tool call with surrounding text."""
+        model_output = """Sure! Let me check the weather for you.<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=self.tools)
+
+        self.assertEqual(result.normal_text, "Sure! Let me check the weather for you.")
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_current_weather")
+
+    def test_detect_and_parse_multiline_param(self):
+        """Test parsing tool call with multiline parameter values."""
+        model_output = """<tool_call>
+<function=calculate_area>
+<parameter=shape>
+rectangle
+</parameter>
+<parameter=dimensions>
+{"width": 10,
+ "height": 20}
+</parameter>
+<parameter=precision>
+2
+</parameter>
+</function>
+</tool_call>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=self.tools)
+
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "calculate_area")
+
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["shape"], "rectangle")
+        self.assertEqual(params["dimensions"], {"width": 10, "height": 20})
+        self.assertEqual(params["precision"], 2)
+
+    def test_detect_and_parse_parallel_tools(self):
+        """Test parsing multiple tool calls."""
+        model_output = """<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>
+<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Orlando
+</parameter>
+<parameter=state>
+FL
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=self.tools)
+
+        self.assertEqual(result.normal_text, "\n")
+        self.assertEqual(len(result.calls), 2)
+
+        # First call
+        self.assertEqual(result.calls[0].name, "get_current_weather")
+        params1 = json.loads(result.calls[0].parameters)
+        self.assertEqual(params1["city"], "Dallas")
+        self.assertEqual(params1["state"], "TX")
+
+        # Second call
+        self.assertEqual(result.calls[1].name, "get_current_weather")
+        params2 = json.loads(result.calls[1].parameters)
+        self.assertEqual(params2["city"], "Orlando")
+        self.assertEqual(params2["state"], "FL")
+
+    def test_parse_streaming_simple(self):
+        """Test basic streaming parsing."""
+        chunks = [
+            "Sure! ",
+            "Let me check ",
+            "the weather.",
+            "<tool_call>",
+            "\n<function=get_current_weather>",
+            "\n<parameter=city>",
+            "\nDallas",
+            "\n</parameter>",
+            "\n<parameter=state>",
+            "\nTX",
+            "\n</parameter>",
+            "\n</function>",
+            "\n</tool_call>",
+        ]
+
+        accumulated_text = ""
+        accumulated_calls = []
+        tool_calls_by_index = {}
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, tools=self.tools)
+            accumulated_text += result.normal_text
+
+            # Track calls by tool_index to handle streaming properly
+            for call in result.calls:
+                if call.tool_index is not None:
+                    if call.tool_index not in tool_calls_by_index:
+                        tool_calls_by_index[call.tool_index] = {
+                            "name": "",
+                            "parameters": "",
+                        }
+
+                    if call.name:
+                        tool_calls_by_index[call.tool_index]["name"] = call.name
+                    if call.parameters:
+                        tool_calls_by_index[call.tool_index][
+                            "parameters"
+                        ] += call.parameters
+
+        self.assertEqual(accumulated_text, "Sure! Let me check the weather.")
+        self.assertEqual(len(tool_calls_by_index), 1)
+
+        # Get the complete tool call
+        tool_call = tool_calls_by_index[0]
+        self.assertEqual(tool_call["name"], "get_current_weather")
+
+        # Parse the accumulated parameters
+        params = json.loads(tool_call["parameters"])
+        self.assertEqual(params["city"], "Dallas")
+        self.assertEqual(params["state"], "TX")
+
+    def test_parse_streaming_incomplete(self):
+        """Test streaming with incomplete tool call."""
+        # Send incomplete tool call
+        chunks = [
+            "<tool_call>",
+            "\n<function=get_current_weather>",
+            "\n<parameter=city>",
+            "\nDallas",
+            "\n</parameter>",
+            "\n<parameter=state>",
+            "\nTX",
+            # Missing </parameter>, </function>, </tool_call>
+        ]
+
+        tool_calls_by_index = {}
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, tools=self.tools)
+
+            # Track calls by tool_index to handle streaming properly
+            for call in result.calls:
+                if call.tool_index is not None:
+                    if call.tool_index not in tool_calls_by_index:
+                        tool_calls_by_index[call.tool_index] = {
+                            "name": "",
+                            "parameters": "",
+                        }
+
+                    if call.name:
+                        tool_calls_by_index[call.tool_index]["name"] = call.name
+                    if call.parameters:
+                        tool_calls_by_index[call.tool_index][
+                            "parameters"
+                        ] += call.parameters
+
+        # Should have partial tool call with name but incomplete parameters
+        self.assertGreater(len(tool_calls_by_index), 0)
+        self.assertEqual(tool_calls_by_index[0]["name"], "get_current_weather")
+
+        # Parameters should be incomplete (no closing brace)
+        params_str = tool_calls_by_index[0]["parameters"]
+        self.assertTrue(params_str.startswith('{"city": "Dallas"'))
+        self.assertFalse(params_str.endswith("}"))
+
+        # Now complete it
+        result = self.detector.parse_streaming_increment(
+            "\n</parameter>\n</function>\n</tool_call>", tools=self.tools
+        )
+
+        # Update the accumulated parameters
+        for call in result.calls:
+            if call.tool_index is not None and call.parameters:
+                tool_calls_by_index[call.tool_index]["parameters"] += call.parameters
+
+        # Now should have complete parameters
+        final_params = json.loads(tool_calls_by_index[0]["parameters"])
+        self.assertEqual(final_params["city"], "Dallas")
+        self.assertEqual(final_params["state"], "TX")
+
+    def test_edge_case_no_parameters(self):
+        """Test tool call without parameters."""
+        model_output = """<tool_call>
+<function=get_current_weather>
+</function>
+</tool_call>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=self.tools)
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_current_weather")
+        self.assertEqual(json.loads(result.calls[0].parameters), {})
+
+    def test_edge_case_special_chars_in_value(self):
+        """Test parameter with special characters in value."""
+        model_output = """<tool_call>
+<function=get_current_weather>
+<parameter=city>
+Dallas->TX
+</parameter>
+</function>
+</tool_call>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=self.tools)
+        self.assertEqual(len(result.calls), 1)
+
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["city"], "Dallas->TX")
+
+    def test_extract_tool_calls_fallback_no_tags(self):
+        """Test fallback parsing when XML tags are missing (just function without tool_call wrapper)."""
+        model_output = """<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+</function>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=self.tools)
+
+        self.assertIsNotNone(result)
+
+    def test_extract_tool_calls_type_conversion(self):
+        """Test parameter type conversion based on tool schema."""
+        test_tool = Tool(
+            type="function",
+            function=Function(
+                name="test_types",
+                parameters={
+                    "type": "object",
+                    "properties": {
+                        "int_param": {"type": "integer"},
+                        "float_param": {"type": "float"},
+                        "bool_param": {"type": "boolean"},
+                        "str_param": {"type": "string"},
+                        "obj_param": {"type": "object"},
+                    },
+                },
+            ),
+        )
+
+        model_output = """<tool_call>
+<function=test_types>
+<parameter=int_param>
+42
+</parameter>
+<parameter=float_param>
+3.14
+</parameter>
+<parameter=bool_param>
+true
+</parameter>
+<parameter=str_param>
+hello world
+</parameter>
+<parameter=obj_param>
+{"key": "value"}
+</parameter>
+</function>
+</tool_call>"""
+
+        result = self.detector.detect_and_parse(model_output, tools=[test_tool])
+
+        self.assertEqual(len(result.calls), 1)
+        params = json.loads(result.calls[0].parameters)
+        self.assertEqual(params["int_param"], 42)
+        self.assertEqual(params["float_param"], 3.14)
+        self.assertEqual(params["bool_param"], True)
+        self.assertEqual(params["str_param"], "hello world")
+        self.assertEqual(params["obj_param"], {"key": "value"})
+
+    def test_parse_streaming_incremental(self):
+        """Test that streaming is truly incremental with very small chunks."""
+        model_output = """I'll check the weather.<tool_call>
+        <function=get_current_weather>
+        <parameter=city>
+        Dallas
+        </parameter>
+        <parameter=state>
+        TX
+        </parameter>
+        </function>
+        </tool_call>"""
+
+        # Simulate more realistic token-based chunks where <tool_call> is a single token
+        chunks = [
+            "I'll check the weather.",
+            "<tool_call>",
+            "\n<function=get_current_weather>\n",
+            "<parameter=city>\n",
+            "Dallas\n",
+            "</parameter>\n",
+            "<parameter=state>\n",
+            "TX\n",
+            "</parameter>\n",
+            "</function>\n",
+            "</tool_call>",
+        ]
+
+        accumulated_text = ""
+        tool_calls = []
+        chunks_count = 0
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            accumulated_text += result.normal_text
+            chunks_count += 1
+            for tool_call_chunk in result.calls:
+                if (
+                    hasattr(tool_call_chunk, "tool_index")
+                    and tool_call_chunk.tool_index is not None
+                ):
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": ""})
+                    tc = tool_calls[tool_call_chunk.tool_index]
+                    if tool_call_chunk.name:
+                        tc["name"] = tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] += tool_call_chunk.parameters
+
+        self.assertGreater(chunks_count, 3)
+
+        # Verify the accumulated results
+        self.assertIn("I'll check the weather.", accumulated_text)
+        self.assertEqual(len(tool_calls), 1)
+        self.assertEqual(tool_calls[0]["name"], "get_current_weather")
+
+        params = json.loads(tool_calls[0]["parameters"])
+        self.assertEqual(params, {"city": "Dallas", "state": "TX"})
+
+    def test_parse_streaming_multiple_tools(self):
+        """Test streaming with multiple tool calls."""
+        model_output = """<tool_call>
+        <function=get_current_weather>
+        <parameter=city>
+        Dallas
+        </parameter>
+        <parameter=state>
+        TX
+        </parameter>
+        </function>
+        </tool_call>
+        Some text in between.
+        <tool_call>
+        <function=calculate_area>
+        <parameter=shape>
+        circle
+        </parameter>
+        <parameter=dimensions>
+        {"radius": 5}
+        </parameter>
+        </function>
+        </tool_call>"""
+
+        # Simulate streaming by chunks
+        chunk_size = 20
+        chunks = [
+            model_output[i : i + chunk_size]
+            for i in range(0, len(model_output), chunk_size)
+        ]
+
+        accumulated_text = ""
+        tool_calls = []
+        chunks_count = 0
+
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            accumulated_text += result.normal_text
+            chunks_count += 1
+            for tool_call_chunk in result.calls:
+                if (
+                    hasattr(tool_call_chunk, "tool_index")
+                    and tool_call_chunk.tool_index is not None
+                ):
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": ""})
+                    tc = tool_calls[tool_call_chunk.tool_index]
+                    if tool_call_chunk.name:
+                        tc["name"] = tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] += tool_call_chunk.parameters
+
+        self.assertIn("Some text in between.", accumulated_text)
+        self.assertEqual(len(tool_calls), 2)
+        self.assertEqual(tool_calls[0]["name"], "get_current_weather")
+        self.assertEqual(tool_calls[1]["name"], "calculate_area")
+
+        # Verify parameters
+        params1 = json.loads(tool_calls[0]["parameters"])
+        self.assertEqual(params1, {"city": "Dallas", "state": "TX"})
+
+        params2 = json.loads(tool_calls[1]["parameters"])
+        self.assertEqual(params2, {"shape": "circle", "dimensions": {"radius": 5}})
+
+
+class TestGlm4MoeDetector(unittest.TestCase):
+    def setUp(self):
+        self.tools = [
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Get weather information",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {"type": "string", "description": "City name"},
+                            "date": {"type": "string", "description": "Date"},
+                        },
+                        "required": ["city", "date"],
+                    },
+                ),
+            ),
+        ]
+        self.detector = Glm4MoeDetector()
+
+    def test_single_tool_call(self):
+        text = (
+            "<tool_call>get_weather\n"
+            "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n"
+            "<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n"
+            "</tool_call>"
+        )
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 1)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(
+            result.calls[0].parameters, '{"city": "Beijing", "date": "2024-06-27"}'
+        )
+        self.assertEqual(result.normal_text, "")
+
+    def test_multiple_tool_calls(self):
+        text = (
+            "<tool_call>get_weather\n"
+            "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n"
+            "<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n"
+            "</tool_call>"
+            "<tool_call>get_weather\n"
+            "<arg_key>city</arg_key>\n<arg_value>Shanghai</arg_value>\n"
+            "<arg_key>date</arg_key>\n<arg_value>2024-06-28</arg_value>\n"
+            "</tool_call>"
+        )
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 2)
+        self.assertEqual(result.calls[0].name, "get_weather")
+        self.assertEqual(
+            result.calls[0].parameters, '{"city": "Beijing", "date": "2024-06-27"}'
+        )
+        self.assertEqual(result.calls[1].name, "get_weather")
+        self.assertEqual(
+            result.calls[1].parameters, '{"city": "Shanghai", "date": "2024-06-28"}'
+        )
+        self.assertEqual(result.normal_text, "")
+
+    def test_streaming_tool_call(self):
+        """Test streaming incremental parsing of a tool call."""
+        chunks = [
+            "<tool_call>get_weather\n",
+            "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n",
+            "<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n",
+            "</tool_call>",
+        ]
+        tool_calls = []
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            for tool_call_chunk in result.calls:
+                if (
+                    hasattr(tool_call_chunk, "tool_index")
+                    and tool_call_chunk.tool_index is not None
+                ):
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": {}})
+                    tc = tool_calls[tool_call_chunk.tool_index]
+                    if tool_call_chunk.name:
+                        tc["name"] = tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] = tool_call_chunk.parameters
+        self.assertEqual(len(tool_calls), 1)
+        self.assertEqual(tool_calls[0]["name"], "get_weather")
+        self.assertEqual(
+            tool_calls[0]["parameters"], '{"city": "Beijing", "date": "2024-06-27"}'
+        )
+
+    def test_streaming_multiple_tool_calls(self):
+        """Test streaming incremental parsing of multiple tool calls."""
+        chunks = [
+            "<tool_call>get_weather\n",
+            "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n",
+            "<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n",
+            "</tool_call><tool_call>get_weather\n",
+            "<arg_key>city</arg_key>\n<arg_value>Shanghai</arg_value>\n",
+            "<arg_key>date</arg_key>\n<arg_value>2024-06-28</arg_value>\n",
+            "</tool_call>",
+        ]
+        tool_calls = []
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+            for tool_call_chunk in result.calls:
+                if (
+                    hasattr(tool_call_chunk, "tool_index")
+                    and tool_call_chunk.tool_index is not None
+                ):
+                    while len(tool_calls) <= tool_call_chunk.tool_index:
+                        tool_calls.append({"name": "", "parameters": {}})
+                    tc = tool_calls[tool_call_chunk.tool_index]
+                    if tool_call_chunk.name:
+                        tc["name"] = tool_call_chunk.name
+                    if tool_call_chunk.parameters:
+                        tc["parameters"] = tool_call_chunk.parameters
+        self.assertEqual(len(tool_calls), 2)
+        self.assertEqual(tool_calls[0]["name"], "get_weather")
+        self.assertEqual(
+            tool_calls[0]["parameters"], '{"city": "Beijing", "date": "2024-06-27"}'
+        )
+        self.assertEqual(tool_calls[1]["name"], "get_weather")
+        self.assertEqual(
+            tool_calls[1]["parameters"], '{"city": "Shanghai", "date": "2024-06-28"}'
+        )
+
+    def test_tool_call_id(self):
+        """Test that the buffer and state are reset after a tool call is completed."""
+        chunks = [
+            "<tool_call>get_weather\n",
+            "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n",
+            "<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n",
+            "</tool_call>",
+        ]
+        for chunk in chunks:
+            result = self.detector.parse_streaming_increment(chunk, self.tools)
+        self.assertEqual(self.detector.current_tool_id, 1)
+
+    def test_invalid_tool_call(self):
+        """Test that invalid tool calls are handled correctly."""
+        text = "<tool_call>invalid_func\n<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n</tool_call>"
+        result = self.detector.detect_and_parse(text, self.tools)
+        self.assertEqual(len(result.calls), 0)
+
+    def test_partial_tool_call(self):
+        """Test parsing a partial tool call that spans multiple chunks."""
+        text1 = "<tool_call>get_weather\n<arg_key>city</arg_key>\n"
+        result1 = self.detector.parse_streaming_increment(text1, self.tools)
+        self.assertEqual(result1.normal_text, "")
+        self.assertEqual(result1.calls, [])
+        self.assertEqual(self.detector._buffer, text1)
+        text2 = "<arg_value>Beijing</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>"
+        result2 = self.detector.parse_streaming_increment(text2, self.tools)
+        self.assertEqual(len(result2.calls), 1)
+        self.assertEqual(result2.calls[0].name, "get_weather")
+        self.assertEqual(
+            result2.calls[0].parameters, '{"city": "Beijing", "date": "2024-06-27"}'
+        )
+        self.assertEqual(self.detector._buffer, "")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_fused_moe.py b/test/srt/test_fused_moe.py
new file mode 100644
index 000000000..9f2cc31b1
--- /dev/null
+++ b/test/srt/test_fused_moe.py
@@ -0,0 +1,236 @@
+import unittest
+
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
+from sglang.srt.utils import is_hip
+from sglang.test.test_utils import CustomTestCase
+
+_is_hip = is_hip()
+_is_fp8_fnuz = is_fp8_fnuz()
+
+
+class TestFusedMOE(CustomTestCase):
+    NUM_EXPERTS = [8, 64]
+    TOP_KS = [2, 6]
+
+    @staticmethod
+    def create_random_cuda_tensor(shape, dtype, mean=0, std=0.01):
+        """Create a random CUDA tensor
+
+        Args:
+            shape: Tensor shape
+            dtype: Data type
+            mean: Mean value
+            std: Standard deviation
+
+        Returns:
+            torch.Tensor: Randomly initialized CUDA tensor
+        """
+        return torch.empty(shape, dtype=dtype, device="cuda").normal_(mean, std)
+
+    def get_tolerance(self, dtype):
+        """Get tolerance values for different data types
+
+        Args:
+            dtype: Data type
+
+        Returns:
+            tuple: (relative tolerance, absolute tolerance)
+        """
+        if dtype == torch.float32:
+            return 1e-3, 1e-5
+        elif dtype in [torch.float16, torch.bfloat16]:
+            return 1e-1, 1e-2
+        else:
+            return 1e-2, 1e-2  # Default values for other types
+
+    def torch_naive_moe(
+        self,
+        a,
+        w1,
+        w2,
+        score,
+        topk,
+        w1_scale=None,
+        w2_scale=None,
+        a1_scale=None,
+        a2_scale=None,
+    ):
+        B, D = a.shape
+        a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+        out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+        score = torch.softmax(score, dim=-1, dtype=torch.float32)
+        topk_weight, topk_ids = torch.topk(score, topk)
+        topk_weight = topk_weight.view(-1)
+        topk_ids = topk_ids.view(-1)
+
+        if w1.dtype in [torch.float8_e4m3fn, torch.float8_e4m3fnuz]:
+            w1_compute = w1.to(a.dtype)
+            w2_compute = w2.to(a.dtype)
+
+            if w1_scale is not None:
+                w1_compute = (w1_compute * w1_scale.view(-1, 1, 1)).to(a.dtype)
+            if w2_scale is not None:
+                w2_compute = (w2_compute * w2_scale.view(-1, 1, 1)).to(a.dtype)
+            if a1_scale is not None:
+                a = (a * a1_scale).to(a.dtype)
+            if a2_scale is not None:
+                a = (a * a2_scale).to(a.dtype)
+        else:
+            w1_compute = w1
+            w2_compute = w2
+
+        for i in range(w1_compute.shape[0]):
+            mask = topk_ids == i
+            if mask.sum():
+                out[mask] = SiluAndMul()(
+                    a[mask] @ w1_compute[i].transpose(0, 1)
+                ) @ w2_compute[i].transpose(0, 1)
+
+        return (
+            out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+        ).sum(dim=1)
+
+    def _test_case(self, m, n, k, e, topk, dtype, use_fp8_w8a8=False):
+        rtol, atol = self.get_tolerance(dtype)
+
+        if use_fp8_w8a8:
+            # AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
+            capability = torch.cuda.get_device_capability()
+            if not _is_hip and not (capability[0] >= 9 or capability == (8, 9)):
+                return
+
+            a = self.create_random_cuda_tensor((m, k), dtype)
+            w1 = self.create_random_cuda_tensor((e, 2 * n, k), dtype)
+            w2 = self.create_random_cuda_tensor((e, k, n), dtype)
+            w1 = w1.to(torch.float8_e4m3fn)
+            w2 = w2.to(torch.float8_e4m3fn)
+            score = self.create_random_cuda_tensor((m, e), dtype)
+            w1_scale = self.create_random_cuda_tensor(e, torch.float32)
+            w2_scale = self.create_random_cuda_tensor(e, torch.float32)
+            a1_scale = self.create_random_cuda_tensor(1, torch.float32)
+            a2_scale = self.create_random_cuda_tensor(1, torch.float32)
+
+            # Handle HIP case: normalize float8 weights so fused kernel doesn't break
+            # on ROCm.
+            if _is_fp8_fnuz:
+                # Normalize to e4m3fnuz on HIP
+                w1, w1_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=w1,
+                    weight_scale=w1_scale,
+                    input_scale=a1_scale,
+                )
+                w2, w2_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=w2,
+                    weight_scale=w2_scale,
+                    input_scale=a2_scale,
+                )
+
+            topk_output = select_experts(
+                hidden_states=a,
+                router_logits=score,
+                topk_config=TopKConfig(top_k=topk, renormalize=False),
+            )
+
+            torch_output = self.torch_naive_moe(
+                a,
+                w1,
+                w2,
+                score,
+                topk,
+                w1_scale,
+                w2_scale,
+                a1_scale,
+                a2_scale,
+            )
+
+            sglang_output = fused_moe(
+                a,
+                w1,
+                w2,
+                topk_output,
+                use_fp8_w8a8=True,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+            )
+            torch.testing.assert_close(
+                sglang_output, torch_output, rtol=rtol, atol=atol
+            )
+        else:
+            a = self.create_random_cuda_tensor((m, k), dtype)
+            w1 = self.create_random_cuda_tensor((e, 2 * n, k), dtype)
+            w2 = self.create_random_cuda_tensor((e, k, n), dtype)
+            score = self.create_random_cuda_tensor((m, e), dtype)
+
+            topk_output = select_experts(
+                hidden_states=a,
+                router_logits=score,
+                topk_config=TopKConfig(top_k=topk, renormalize=False),
+            )
+
+            triton_output = fused_moe(a, w1, w2, topk_output)
+            torch_output = self.torch_naive_moe(a, w1, w2, score, topk)
+            torch.testing.assert_close(
+                triton_output, torch_output, rtol=rtol, atol=atol
+            )
+
+    def test_various_configurations(self):
+        m_values = [1, 33, 64, 222]
+        n_values = [128, 1024]
+        k_values = [128, 511, 1024]
+        dtypes = [torch.float16, torch.bfloat16]
+        fp8_modes = [False, True]
+
+        # Calculate total number of tests
+        total_tests = (
+            len(m_values)
+            * len(n_values)
+            * len(k_values)
+            * len(self.NUM_EXPERTS)
+            * len(self.TOP_KS)
+            * len(dtypes)
+            * len(fp8_modes)
+        )
+
+        # Create progress bar
+        with tqdm(total=total_tests, desc="Running MoE tests") as pbar:
+            for m in m_values:
+                for n in n_values:
+                    for k in k_values:
+                        for e in self.NUM_EXPERTS:
+                            for topk in self.TOP_KS:
+                                for dtype in dtypes:
+                                    for use_fp8_w8a8 in fp8_modes:
+                                        with self.subTest(
+                                            m=m,
+                                            n=n,
+                                            k=k,
+                                            e=e,
+                                            topk=topk,
+                                            dtype=dtype,
+                                            fp8=use_fp8_w8a8,
+                                        ):
+                                            self._test_case(
+                                                m,
+                                                n,
+                                                k,
+                                                e,
+                                                topk,
+                                                dtype,
+                                                use_fp8_w8a8=use_fp8_w8a8,
+                                            )
+                                            torch.cuda.empty_cache()
+                                        pbar.update(1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_get_weights_by_name.py b/test/srt/test_get_weights_by_name.py
new file mode 100644
index 000000000..3d404df10
--- /dev/null
+++ b/test/srt/test_get_weights_by_name.py
@@ -0,0 +1,183 @@
+import gc
+import unittest
+
+import numpy as np
+import requests
+import torch
+from transformers import AutoModelForCausalLM
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+from sglang.utils import terminate_process
+
+
+def _process_return(ret):
+    if isinstance(ret, list) and len(ret) == 2:
+        print(f"running assert_allclose on data parallel")
+        np.testing.assert_allclose(ret[0], ret[1])
+        return np.array(ret[0])
+    return np.array(ret)
+
+
+class TestGetWeightsByName(CustomTestCase):
+
+    def init_hf_model(self, model_name, tie_word_embeddings):
+        self.hf_model = AutoModelForCausalLM.from_pretrained(
+            model_name, torch_dtype="bfloat16", tie_word_embeddings=tie_word_embeddings
+        ).to("cuda:0")
+
+    def init_backend(self, backend, dp, tp, model_name):
+        self.backend = backend
+        self.dp = dp
+        self.tp = tp
+        if backend == "Engine":
+            self.engine = sgl.Engine(
+                model_path=model_name,
+                random_seed=42,
+                tp_size=tp,
+                dp_size=dp,
+            )
+        else:
+            self.process = popen_launch_server(
+                model_name,
+                DEFAULT_URL_FOR_TEST,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=(
+                    "--tp-size",
+                    str(tp),
+                    "--dp-size",
+                    str(dp),
+                ),
+            )
+
+    def clean_up(self):
+        del self.hf_model
+        gc.collect()
+        torch.cuda.empty_cache()
+        if self.backend == "Engine":
+            self.engine.shutdown()
+        else:
+            terminate_process(self.process)
+
+    def assert_tie_word_embeddings(self, truncate_size):
+        print("assert_tie_word_embeddings")
+        if self.backend == "Engine":
+            backend_ret = _process_return(
+                self.engine.get_weights_by_name("lm_head.weight", truncate_size)
+            )
+        else:
+            backend_ret = _process_return(
+                requests.get(
+                    f"{DEFAULT_URL_FOR_TEST}/get_weights_by_name",
+                    json={"name": "lm_head.weight", "truncate_size": truncate_size},
+                ).json()
+            )
+        print("assert_tie_word_embeddings of hf and backend")
+        assert np.allclose(
+            self.hf_model.get_parameter("model.embed_tokens.weight")
+            .cpu()
+            .detach()
+            .float()
+            .numpy()[:truncate_size],
+            backend_ret,
+        )
+        assert np.allclose(
+            self.hf_model.get_parameter("lm_head.weight")
+            .cpu()
+            .detach()
+            .float()
+            .numpy()[:truncate_size],
+            self.hf_model.get_parameter("model.embed_tokens.weight")
+            .cpu()
+            .detach()
+            .float()
+            .numpy()[:truncate_size],
+        )
+
+    def assert_weights_all_close(self, param_name, truncate_size):
+        print(
+            f"param_name: {param_name}, backend: {self.backend}, dp: {self.dp}, tp: {self.tp}"
+        )
+        param = self.hf_model.get_parameter(param_name)[:truncate_size]
+        param_np = param.cpu().detach().float().numpy()
+
+        if self.backend == "Engine":
+            engine_ret = self.engine.get_weights_by_name(param_name, truncate_size)
+            engine_ret = _process_return(engine_ret)
+            np.testing.assert_allclose(engine_ret, param_np, rtol=1e-5, atol=1e-5)
+
+        if self.backend == "Runtime":
+            runtime_ret = requests.get(
+                f"{DEFAULT_URL_FOR_TEST}/get_weights_by_name",
+                json={"name": param_name, "truncate_size": truncate_size},
+            ).json()
+            runtime_ret = _process_return(runtime_ret)
+            np.testing.assert_allclose(runtime_ret, param_np, rtol=1e-5, atol=1e-5)
+
+    def test_get_weights_by_name(self):
+        if is_in_ci():
+            test_suits = [
+                ("Engine", 1, 1, DEFAULT_SMALL_MODEL_NAME_FOR_TEST),
+            ]
+        else:
+            test_suits = [
+                ("Runtime", 1, 1, DEFAULT_SMALL_MODEL_NAME_FOR_TEST),
+                ("Engine", 1, 1, DEFAULT_MODEL_NAME_FOR_TEST),
+            ]
+            if torch.cuda.device_count() >= 2:
+                test_suits.append(("Engine", 1, 2, DEFAULT_SMALL_MODEL_NAME_FOR_TEST))
+                test_suits.append(("Runtime", 2, 1, DEFAULT_MODEL_NAME_FOR_TEST))
+
+            if torch.cuda.device_count() >= 4:
+                test_suits.extend(
+                    [
+                        ("Engine", 2, 2, DEFAULT_SMALL_MODEL_NAME_FOR_TEST),
+                        ("Runtime", 2, 2, DEFAULT_MODEL_NAME_FOR_TEST),
+                    ]
+                )
+
+        parameters = [
+            "model.embed_tokens.weight",
+            "model.layers.0.input_layernorm.weight",
+            "model.layers.1.self_attn.q_proj.weight",
+            "model.layers.2.self_attn.k_proj.weight",
+            "model.layers.3.self_attn.v_proj.weight",
+            "model.layers.4.self_attn.o_proj.weight",
+            "model.layers.5.mlp.gate_proj.weight",
+            "model.layers.6.mlp.up_proj.weight",
+            "model.layers.7.mlp.down_proj.weight",
+            "model.layers.8.post_attention_layernorm.weight",
+            "model.norm.weight",
+            "lm_head.weight",
+        ]
+
+        truncate_size = 100
+
+        for test_suit in test_suits:
+            if test_suit[-1] == DEFAULT_MODEL_NAME_FOR_TEST:
+                tie_word_embeddings = False
+            else:
+                tie_word_embeddings = True
+
+            self.init_hf_model(test_suit[-1], tie_word_embeddings)
+            self.init_backend(*test_suit)
+
+            for param_name in parameters:
+                self.assert_weights_all_close(param_name, truncate_size)
+
+            if tie_word_embeddings:
+                self.assert_tie_word_embeddings(truncate_size)
+
+            self.clean_up()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_gguf.py b/test/srt/test_gguf.py
new file mode 100644
index 000000000..e9776067c
--- /dev/null
+++ b/test/srt/test_gguf.py
@@ -0,0 +1,27 @@
+import unittest
+
+from huggingface_hub import hf_hub_download
+
+import sglang as sgl
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestGGUF(CustomTestCase):
+    def test_models(self):
+        prompt = "Today is a sunny day and I like"
+        sampling_params = {"temperature": 0, "max_new_tokens": 8}
+
+        model_path = hf_hub_download(
+            "Qwen/Qwen2-1.5B-Instruct-GGUF",
+            filename="qwen2-1_5b-instruct-q4_k_m.gguf",
+        )
+
+        engine = sgl.Engine(model_path=model_path, random_seed=42, cuda_graph_max_bs=2)
+        outputs = engine.generate(prompt, sampling_params)["text"]
+        engine.shutdown()
+
+        self.assertEqual(outputs, " it. I have a lot of work")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_gpt_oss_1gpu.py b/test/srt/test_gpt_oss_1gpu.py
new file mode 100644
index 000000000..d3fc06931
--- /dev/null
+++ b/test/srt/test_gpt_oss_1gpu.py
@@ -0,0 +1,31 @@
+import unittest
+
+from test_gpt_oss_common import BaseTestGptOss
+
+
+class TestGptOss1Gpu(BaseTestGptOss):
+    def test_mxfp4_20b(self):
+        self.run_test(
+            model_variant="20b",
+            quantization="mxfp4",
+            expected_score_of_reasoning_effort={
+                "low": 0.34,
+                "medium": 0.34,
+                "high": 0.27,  # TODO investigate
+            },
+        )
+
+    def test_bf16_20b(self):
+        self.run_test(
+            model_variant="20b",
+            quantization="bf16",
+            expected_score_of_reasoning_effort={
+                "low": 0.34,
+                "medium": 0.34,
+                "high": 0.27,  # TODO investigate
+            },
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_gpt_oss_4gpu.py b/test/srt/test_gpt_oss_4gpu.py
new file mode 100644
index 000000000..da787c6fb
--- /dev/null
+++ b/test/srt/test_gpt_oss_4gpu.py
@@ -0,0 +1,36 @@
+import unittest
+
+from test_gpt_oss_common import BaseTestGptOss
+
+
+class TestGptOss4Gpu(BaseTestGptOss):
+    def test_bf16_120b(self):
+        self.run_test(
+            model_variant="120b",
+            quantization="bf16",
+            expected_score_of_reasoning_effort={
+                "low": 0.60,
+            },
+            other_args=["--tp", "4", "--cuda-graph-max-bs", "200"],
+        )
+
+    def test_mxfp4_120b(self):
+        self.run_test(
+            model_variant="120b",
+            quantization="mxfp4",
+            expected_score_of_reasoning_effort={
+                "low": 0.60,
+            },
+            other_args=[
+                "--tp",
+                "4",
+                "--cuda-graph-max-bs",
+                "200",
+                "--mem-fraction-static",
+                "0.93",
+            ],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_gpt_oss_common.py b/test/srt/test_gpt_oss_common.py
new file mode 100644
index 000000000..6be739277
--- /dev/null
+++ b/test/srt/test_gpt_oss_common.py
@@ -0,0 +1,111 @@
+import os
+from concurrent.futures import ThreadPoolExecutor
+from types import SimpleNamespace
+from typing import Dict, List, Literal, Optional
+
+from sglang.srt.utils import is_hip, kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+_base_url = DEFAULT_URL_FOR_TEST
+_is_hip = is_hip()
+
+
+class BaseTestGptOss(CustomTestCase):
+    def run_test(
+        self,
+        model_variant: Literal["20b", "120b"],
+        quantization: Literal["mxfp4", "bf16"],
+        expected_score_of_reasoning_effort: Dict[str, float],
+        other_args: Optional[List[str]] = None,
+    ):
+        if other_args is None:
+            other_args = []
+
+        model = {
+            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
+            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
+            ("20b", "mxfp4"): "openai/gpt-oss-20b",
+            ("120b", "mxfp4"): "openai/gpt-oss-120b",
+        }[(model_variant, quantization)]
+
+        if model_variant == "20b":
+            other_args += ["--cuda-graph-max-bs", "600"]
+        if _is_hip:
+            os.environ["SGLANG_USE_AITER"] = "0"
+        self._run_test_raw(
+            model=model,
+            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
+            other_args=other_args,
+        )
+
+    def _run_test_raw(
+        self,
+        model: str,
+        expected_score_of_reasoning_effort: Dict[str, float],
+        other_args: List[str],
+    ):
+        process = popen_launch_server(
+            model,
+            _base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+        try:
+            # run multiple tests in parallel since we are mostly bound by the longest generate sequence
+            # instead of the number of questions
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                list(
+                    executor.map(
+                        lambda d: self._run_one_eval(**d),
+                        [
+                            dict(
+                                model=model,
+                                reasoning_effort=reasoning_effort,
+                                expected_score=expected_score,
+                            )
+                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
+                        ],
+                    )
+                )
+        finally:
+            kill_process_tree(process.pid)
+
+    def _run_one_eval(self, model, reasoning_effort, expected_score):
+        args = SimpleNamespace(
+            base_url=_base_url,
+            model=model,
+            eval_name="gpqa",
+            num_examples=198,
+            # use enough threads to allow parallelism
+            num_threads=198,
+            # TODO 4k is still not enough, we need e.g. 64k token, but that is super slow
+            # otherwise a lot of questions are not answered
+            max_tokens=4096,
+            # simple-evals by default use 0.5 and is better than 0.0 temperature
+            # but here for reproducibility, we use 0.1
+            temperature=0.1,
+            reasoning_effort=reasoning_effort,
+        )
+
+        setup = f"model={model} reasoning_effort={reasoning_effort} expected_score={expected_score}"
+
+        print(f"Evaluation start: {setup}")
+        metrics = run_eval(args)
+        print(f"Evaluation end: {setup} {metrics=}")
+        self.assertGreaterEqual(metrics["score"], expected_score)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gpt_oss_common\n"
+                f"Setup: {setup}\n"
+                f"Score: {metrics['score']:.2f}\n"
+            )
diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/srt/test_gptqmodel_dynamic.py
new file mode 100644
index 000000000..9be711d12
--- /dev/null
+++ b/test/srt/test_gptqmodel_dynamic.py
@@ -0,0 +1,204 @@
+import time
+import unittest
+
+import requests
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+def check_quant_method(model_path: str, use_marlin_kernel: bool):
+    from sglang.srt.configs.device_config import DeviceConfig
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import AttentionArch, ModelConfig
+    from sglang.srt.distributed import (
+        get_tp_group,
+        init_distributed_environment,
+        initialize_model_parallel,
+        set_custom_all_reduce,
+    )
+    from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
+    from sglang.srt.layers.quantization.utils import get_dynamic_override
+    from sglang.srt.model_loader import get_model
+    from sglang.srt.server_args import PortArgs, ServerArgs
+
+    try:
+        init_distributed_environment(
+            backend="nccl",
+            world_size=1,
+            rank=0,
+            local_rank=0,
+            distributed_init_method="tcp://127.0.0.1:2646",
+        )
+        initialize_model_parallel(tensor_model_parallel_size=1)
+        monkey_patch_vllm_parallel_state()
+    except AssertionError:
+        # ignore this error: tensor model parallel group is already initialized
+        pass
+
+    server_args = ServerArgs(model_path=model_path, dtype=torch.float16)
+    model_config = ModelConfig.from_server_args(server_args)
+
+    load_config = LoadConfig()
+    device_config = DeviceConfig("cuda")
+    model = get_model(
+        model_config=model_config, load_config=load_config, device_config=device_config
+    )
+
+    from sglang.srt.layers.linear import UnquantizedLinearMethod
+    from sglang.srt.layers.quantization.gptq import (
+        GPTQLinearMethod,
+        GPTQMarlinLinearMethod,
+    )
+
+    linear_method_cls = (
+        GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
+    )
+
+    for name, submodule in model.named_modules():
+        if name == "lm_head":
+            assert isinstance(submodule.quant_method, linear_method_cls)
+        elif name == "model.layers.0.self_attn.qkv_proj":
+            # The first layer is quantized using bits=4, group_size=128
+            # desc_act=True
+            assert isinstance(submodule.quant_method, linear_method_cls)
+            config = submodule.quant_method.quant_config
+            assert config.weight_bits == 4
+            assert config.group_size == 128
+            assert config.desc_act
+        elif name == "model.layers.1.self_attn.qkv_proj":
+            # The second layer is quantized using bits=8, group_size=32
+            # desc_act=False
+            assert isinstance(submodule.quant_method, linear_method_cls)
+            config = submodule.quant_method.quant_config
+            assert get_dynamic_override(config, layer_name=name, key="bits") == 8
+            assert get_dynamic_override(config, layer_name=name, key="group_size") == 32
+            assert not get_dynamic_override(config, layer_name=name, key="desc_act")
+        elif (
+            name == "model.layers.2.self_attn.qkv_proj"
+            or name == "model.layers.2.mlp.gate_up_proj"
+        ):
+            # All other layers (layer index >= 2) are not quantized
+            assert isinstance(submodule.quant_method, UnquantizedLinearMethod)
+
+    del model
+
+
+# GPTQ with Dynamic Per/Module Quantization Control
+# Leverages GPTQModel (pypi) to produce the `dynamic` models
+# Test GPTQ fallback kernel that is not Marlin
+class TestGPTQModelDynamic(CustomTestCase):
+    MODEL_PATH = (
+        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"
+    )
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = cls.MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--dtype", "float16"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "max_new_tokens": max_new_tokens,
+                    "temperature": 0.001,
+                },
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        result = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+
+        print(f"result = `{result}`")
+
+        self.assertIn("paris", result["text"].lower())
+
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+        self.assertGreaterEqual(throughput, 140)
+
+    def test_gptq_module(self):
+        check_quant_method(self.MODEL_PATH, use_marlin_kernel=False)
+
+
+# GPTQ with Dynamic Per/Module Quantization Control
+# Leverages GPTQModel (pypi) to produce the `dynamic` models
+# Test Marlin kernel
+class TestGPTQModelDynamicWithMarlin(CustomTestCase):
+    MODEL_PATH = (
+        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
+    )
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = cls.MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--dtype", "bfloat16"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "max_new_tokens": max_new_tokens,
+                    "temperature": 0.001,
+                },
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        result = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+
+        print(f"result = `{result}`")
+
+        assert "paris" in result["text"].lower()
+
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+        assert throughput >= 140
+
+    def test_gptq_marlin_module(self):
+        check_quant_method(self.MODEL_PATH, use_marlin_kernel=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_harmony_parser.py b/test/srt/test_harmony_parser.py
new file mode 100644
index 000000000..20cc02e5c
--- /dev/null
+++ b/test/srt/test_harmony_parser.py
@@ -0,0 +1,876 @@
+import unittest
+
+from sglang.srt.parser.harmony_parser import (
+    CanonicalStrategy,
+    Event,
+    HarmonyParser,
+    TextStrategy,
+    Token,
+    iter_tokens,
+    prefix_hold,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestEvent(CustomTestCase):
+    def test_init(self):
+        """Test Event dataclass initialization."""
+        event = Event("reasoning", "content")
+        self.assertEqual(event.event_type, "reasoning")
+        self.assertEqual(event.content, "content")
+
+
+class TestToken(CustomTestCase):
+    def test_init(self):
+        """Test Token dataclass initialization."""
+        token = Token("START", 0, 7)
+        self.assertEqual(token.type, "START")
+        self.assertEqual(token.start, 0)
+        self.assertEqual(token.end, 7)
+
+
+class TestPrefixHold(CustomTestCase):
+    def test_empty_text(self):
+        """Test prefix_hold with empty text."""
+        emit, hold = prefix_hold("", ["<|start|>"])
+        self.assertEqual(emit, "")
+        self.assertEqual(hold, "")
+
+    def test_no_matching_prefixes(self):
+        """Test prefix_hold with no matching prefixes."""
+        emit, hold = prefix_hold("hello world", ["<|start|>", "<|end|>"])
+        self.assertEqual(emit, "hello world")
+        self.assertEqual(hold, "")
+
+    def test_partial_token_suffix(self):
+        """Test prefix_hold with partial token at end."""
+        emit, hold = prefix_hold("hello <|ret", ["<|return|>"])
+        self.assertEqual(emit, "hello ")
+        self.assertEqual(hold, "<|ret")
+
+    def test_multiple_potential_matches(self):
+        """Test prefix_hold with multiple potential matches."""
+        emit, hold = prefix_hold("text <|", ["<|start|>", "<|end|>"])
+        self.assertEqual(emit, "text ")
+        self.assertEqual(hold, "<|")
+
+    def test_exact_token_match(self):
+        """Test prefix_hold with exact token match."""
+        emit, hold = prefix_hold("text <|start|>", ["<|start|>"])
+        self.assertEqual(emit, "text <|start|>")
+        self.assertEqual(hold, "")
+
+
+class TestIterTokens(CustomTestCase):
+    def test_empty_text(self):
+        """Test iter_tokens with empty text."""
+        tokens = list(iter_tokens(""))
+        self.assertEqual(tokens, [])
+
+    def test_plain_text(self):
+        """Test iter_tokens with plain text."""
+        tokens = list(iter_tokens("hello world"))
+        self.assertEqual(len(tokens), 1)
+        self.assertEqual(tokens[0].type, "TEXT")
+        self.assertEqual(tokens[0].start, 0)
+        self.assertEqual(tokens[0].end, 11)
+
+    def test_single_token(self):
+        """Test iter_tokens with single structural token."""
+        tokens = list(iter_tokens("<|start|>"))
+        self.assertEqual(len(tokens), 1)
+        self.assertEqual(tokens[0].type, "START")
+        self.assertEqual(tokens[0].start, 0)
+        self.assertEqual(tokens[0].end, 9)
+
+    def test_mixed_content(self):
+        """Test iter_tokens with mixed text and tokens."""
+        tokens = list(iter_tokens("text<|start|>more text"))
+        self.assertEqual(len(tokens), 3)
+
+        self.assertEqual(tokens[0].type, "TEXT")
+        self.assertEqual(tokens[0].start, 0)
+        self.assertEqual(tokens[0].end, 4)
+
+        self.assertEqual(tokens[1].type, "START")
+        self.assertEqual(tokens[1].start, 4)
+        self.assertEqual(tokens[1].end, 13)
+
+        self.assertEqual(tokens[2].type, "TEXT")
+        self.assertEqual(tokens[2].start, 13)
+        self.assertEqual(tokens[2].end, 22)
+
+    def test_unknown_token_partial_suffix(self):
+        """Test iter_tokens with unknown token that could be partial."""
+        tokens = list(iter_tokens("text <|ret"))
+        self.assertEqual(len(tokens), 2)
+
+        self.assertEqual(tokens[0].type, "TEXT")
+        self.assertEqual(tokens[0].start, 0)
+        self.assertEqual(tokens[0].end, 5)
+
+        self.assertEqual(tokens[1].type, "TEXT")
+        self.assertEqual(tokens[1].start, 5)
+        self.assertEqual(tokens[1].end, 10)
+
+    def test_unknown_token_middle(self):
+        """Test iter_tokens with unknown token in middle."""
+        tokens = list(iter_tokens("text <|weird|> more <|start|>"))
+        self.assertEqual(len(tokens), 5)
+
+        self.assertEqual(tokens[0].type, "TEXT")
+        self.assertEqual(tokens[1].type, "TEXT")  # "<|"
+        self.assertEqual(tokens[2].type, "TEXT")  # "weird|> more "
+        self.assertEqual(tokens[3].type, "START")
+        # No trailing text token since it ends with a known token
+
+    def test_all_structural_tokens(self):
+        """Test iter_tokens recognizes all structural tokens."""
+        text = "<|start|><|channel|><|message|><|constrain|><|end|><|call|><|return|>"
+        tokens = list(iter_tokens(text))
+
+        expected_types = [
+            "START",
+            "CHANNEL",
+            "MESSAGE",
+            "CONSTRAIN",
+            "END",
+            "CALL",
+            "RETURN",
+        ]
+        self.assertEqual(len(tokens), len(expected_types))
+
+        for token, expected_type in zip(tokens, expected_types):
+            self.assertEqual(token.type, expected_type)
+
+
+class TestCanonicalStrategy(CustomTestCase):
+    def setUp(self):
+        self.strategy = CanonicalStrategy()
+
+    def test_init(self):
+        """Test CanonicalStrategy initialization."""
+        self.assertIn("<|start|>", self.strategy.guard_tokens)
+        self.assertIn("<|constrain|>", self.strategy.guard_tokens)
+
+    def test_extract_channel_type(self):
+        """Test _extract_channel_type method."""
+        self.assertEqual(self.strategy._extract_channel_type("analysis"), "analysis")
+        self.assertEqual(
+            self.strategy._extract_channel_type("commentary to=functions.tool"),
+            "commentary",
+        )
+        self.assertEqual(self.strategy._extract_channel_type("final to=user"), "final")
+        self.assertEqual(self.strategy._extract_channel_type("ANALYSIS"), "analysis")
+        self.assertIsNone(self.strategy._extract_channel_type("unknown"))
+
+    def test_parse_single_analysis_block(self):
+        """Test parsing single analysis block."""
+        text = "<|channel|>analysis<|message|>Let me think about this<|end|>"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, "Let me think about this")
+        self.assertEqual(remaining, "")
+
+    def test_parse_single_commentary_block(self):
+        """Test parsing single commentary block."""
+        text = "<|channel|>commentary<|message|>User-visible message<|end|>"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, "User-visible message")
+        self.assertEqual(remaining, "")
+
+    def test_parse_single_final_block(self):
+        """Test parsing single final block."""
+        text = "<|start|>assistant<|channel|>final<|message|>The answer is 42<|return|>"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, "The answer is 42")
+        self.assertEqual(remaining, "")
+
+    def test_parse_tool_call_commentary(self):
+        """Test parsing tool call on commentary channel."""
+        text = '<|channel|>commentary to=functions.get_weather<|message|>{"location": "SF"}<|call|>'
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "tool_call")
+        self.assertEqual(events[0].content, '{"location": "SF"}')
+        self.assertEqual(remaining, "")
+
+    def test_parse_tool_call_analysis(self):
+        """Test parsing built-in tool call on analysis channel."""
+        text = '<|channel|>analysis to=browser.search<|message|>{"query": "SGLang"}<|call|>'
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "tool_call")
+        self.assertEqual(events[0].content, '{"query": "SGLang"}')
+        self.assertEqual(remaining, "")
+
+    def test_parse_complex_sequence(self):
+        """Test parsing complex sequence with multiple blocks."""
+        text = (
+            "<|channel|>analysis<|message|>Need to use function get_weather.<|end|>"
+            "<|start|>assistant<|channel|>commentary to=functions.get_weather<|message|>"
+            '{"location":"San Francisco"}<|call|>'
+        )
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, "Need to use function get_weather.")
+        self.assertEqual(events[1].event_type, "tool_call")
+        self.assertEqual(events[1].content, '{"location":"San Francisco"}')
+        self.assertEqual(remaining, "")
+
+    def test_parse_with_interspersed_text(self):
+        """Test parsing with plain text between blocks."""
+        text = (
+            "Some text "
+            "<|channel|>analysis<|message|>reasoning<|end|>"
+            " more text "
+            "<|start|>assistant<|channel|>final<|message|>answer<|return|>"
+            " trailing text"
+        )
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 4)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, "Some text ")
+        self.assertEqual(events[1].event_type, "reasoning")
+        self.assertEqual(events[1].content, "reasoning")
+        self.assertEqual(events[2].event_type, "normal")
+        self.assertEqual(events[2].content, " more text ")
+        self.assertEqual(events[3].event_type, "normal")
+        self.assertEqual(events[3].content, "answer trailing text")
+        self.assertEqual(remaining, "")
+
+    def test_parse_incomplete_block(self):
+        """Test parsing incomplete block (streaming scenario)."""
+        text = "<|channel|>analysis<|message|>partial content"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, "partial content")
+        self.assertEqual(remaining, "<|channel|>analysis<|message|>")
+
+    def test_parse_partial_token_suffix(self):
+        """Test parsing with partial token at end."""
+        text = "complete text <|ret"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, "complete text ")
+        self.assertEqual(remaining, "<|ret")
+
+    def test_parse_tool_response_message(self):
+        """Test parsing tool response message (no channel)."""
+        text = '<|start|>functions.get_weather to=assistant<|message|>{"sunny": true}<|end|>'
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, '{"sunny": true}')
+        self.assertEqual(remaining, "")
+
+    def test_parse_empty_content_blocks(self):
+        """Test parsing blocks with empty content."""
+        text = "<|channel|>analysis<|message|><|end|>"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, "")
+        self.assertEqual(remaining, "")
+
+    def test_parse_commentary_filler_between_blocks(self):
+        """Test that 'commentary' filler between <|call|> and <|channel|> is filtered out."""
+        # This pattern occurs when the model generates malformed output
+        text = (
+            '<|channel|>commentary to=functions.get_weather<|message|>{"location":"SF"}<|call|>'
+            "commentary"  # This should be filtered out
+            '<|channel|>commentary to=functions.get_temp<|message|>{"location":"NYC"}<|call|>'
+        )
+        events, remaining = self.strategy.parse(text)
+
+        # Should have 2 tool calls, no "commentary" normal text
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "tool_call")
+        self.assertEqual(events[0].content, '{"location":"SF"}')
+        self.assertEqual(events[1].event_type, "tool_call")
+        self.assertEqual(events[1].content, '{"location":"NYC"}')
+        self.assertEqual(remaining, "")
+
+        # Verify no "commentary" text was emitted as normal content
+        normal_events = [e for e in events if e.event_type == "normal"]
+        commentary_events = [
+            e for e in normal_events if "commentary" in e.content.lower()
+        ]
+        self.assertEqual(
+            len(commentary_events), 0, "Commentary filler should be filtered out"
+        )
+
+
+class TestTextStrategy(CustomTestCase):
+    def setUp(self):
+        self.strategy = TextStrategy()
+
+    def test_init(self):
+        """Test TextStrategy initialization."""
+        self.assertIn("analysis_then_final", self.strategy.patterns)
+
+    def test_parse_analysis_then_final(self):
+        """Test parsing analysis then final format."""
+        text = "analysis I need to think about this. assistantfinal The answer is 42."
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, "I need to think about this.")
+        self.assertEqual(events[1].event_type, "normal")
+        self.assertEqual(events[1].content, "The answer is 42.")
+        self.assertEqual(remaining, "")
+
+    def test_parse_commentary_then_final(self):
+        """Test parsing commentary then final format."""
+        text = "commentary User-visible preamble. assistantfinal The answer is 42."
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, "User-visible preamble.")
+        self.assertEqual(events[1].event_type, "normal")
+        self.assertEqual(events[1].content, "The answer is 42.")
+        self.assertEqual(remaining, "")
+
+    def test_parse_final_only(self):
+        """Test parsing final-only format."""
+        text = "assistantfinal The direct answer."
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, "The direct answer.")
+        self.assertEqual(remaining, "")
+
+    def test_parse_analysis_only(self):
+        """Test parsing analysis-only format."""
+        text = "analysis This is reasoning content."
+        events, remaining = self.strategy.parse(text)
+
+        # For analysis-only, streaming parse should keep header and emit with leading space
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, " This is reasoning content.")
+        self.assertEqual(remaining, "analysis")
+
+    def test_parse_incomplete_assistantfinal(self):
+        """Test parsing with incomplete assistantfinal."""
+        text = "analysis reasoning content assistantfin"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 0)
+        self.assertEqual(remaining, text)  # Hold entire buffer
+
+    def test_parse_partial_analysis_streaming(self):
+        """Test streaming partial analysis content."""
+        text = "analysis partial content"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, " partial content")  # Space preserved
+        self.assertEqual(remaining, "analysis")  # Hold header
+
+    def test_parse_case_insensitive(self):
+        """Test case insensitive parsing."""
+        text = "ANALYSIS reasoning ASSISTANTFINAL answer"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[1].event_type, "normal")
+
+    def test_parse_plain_text_fallback(self):
+        """Test parsing plain text without harmony markers."""
+        text = "Just plain text without any markers."
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, "Just plain text without any markers.")
+        self.assertEqual(remaining, "")
+
+    def test_parse_analysis_no_space_after_header(self):
+        """Test parsing analysis format without space after header (real gpt-oss output)."""
+        text = "analysisThe user typed random strings. We should respond politely.assistantfinalIt looks like you're testing. How can I help?"
+        events, remaining = self.strategy.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(
+            events[0].content,
+            "The user typed random strings. We should respond politely.",
+        )
+        self.assertEqual(events[1].event_type, "normal")
+        self.assertEqual(
+            events[1].content, "It looks like you're testing. How can I help?"
+        )
+
+
+class TestHarmonyParser(CustomTestCase):
+    def setUp(self):
+        self.parser = HarmonyParser()
+
+    def test_init(self):
+        """Test HarmonyParser initialization."""
+        self.assertIsNone(self.parser.strategy)
+        self.assertEqual(self.parser._buffer, "")
+
+    def test_strategy_selection_canonical(self):
+        """Test automatic strategy selection for canonical format."""
+        events = self.parser.parse("<|channel|>analysis<|message|>test<|end|>")
+
+        self.assertIsInstance(self.parser.strategy, CanonicalStrategy)
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "reasoning")
+
+    def test_strategy_selection_text(self):
+        """Test automatic strategy selection for text format."""
+        events = self.parser.parse("analysis test content")
+
+        self.assertIsInstance(self.parser.strategy, TextStrategy)
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "reasoning")
+
+    def test_strategy_selection_delayed(self):
+        """Test strategy selection with insufficient initial content."""
+        # First chunk doesn't have enough info
+        events1 = self.parser.parse("some")
+        self.assertEqual(len(events1), 0)
+        self.assertIsNone(self.parser.strategy)
+
+        # Second chunk triggers strategy selection
+        events2 = self.parser.parse(" analysis content")
+        self.assertIsInstance(self.parser.strategy, TextStrategy)
+        self.assertEqual(len(events2), 1)
+
+    def test_streaming_canonical_format(self):
+        """Test streaming with canonical format."""
+        chunks = [
+            "<|channel|>analysis<|message|>",
+            "reasoning content",
+            "<|end|>",
+            "<|start|>assistant<|channel|>final<|message|>",
+            "final answer",
+            "<|return|>",
+        ]
+
+        all_events = []
+        for chunk in chunks:
+            events = self.parser.parse(chunk)
+            all_events.extend(events)
+
+        self.assertEqual(len(all_events), 5)
+
+        # Verify we get reasoning events
+        reasoning_events = [e for e in all_events if e.event_type == "reasoning"]
+        self.assertTrue(len(reasoning_events) > 0)
+
+        # Verify we get normal events
+        normal_events = [e for e in all_events if e.event_type == "normal"]
+        self.assertTrue(len(normal_events) > 0)
+
+        # Verify content is eventually parsed correctly
+        combined_reasoning = "".join(e.content for e in reasoning_events)
+        combined_normal = "".join(
+            e.content
+            for e in normal_events
+            if e.content and "<|return|>" not in e.content
+        )
+
+        self.assertIn("reasoning content", combined_reasoning)
+        self.assertIn("final answer", combined_normal)
+
+    def test_streaming_text_format(self):
+        """Test streaming with text format."""
+        chunks = ["analysis reasoning", " content assistantfinal", " the answer"]
+
+        all_events = []
+        for chunk in chunks:
+            events = self.parser.parse(chunk)
+            all_events.extend(events)
+
+        # Should have reasoning and normal events
+        reasoning_events = [e for e in all_events if e.event_type == "reasoning"]
+        normal_events = [e for e in all_events if e.event_type == "normal"]
+
+        self.assertGreater(len(reasoning_events), 0)
+        self.assertGreater(len(normal_events), 0)
+
+    def test_streaming_commentary_filler(self):
+        """Test that 'commentary' filler is filtered in streaming case."""
+        # Test when commentary arrives as a separate chunk after <|call|>
+        chunks = [
+            "<|channel|>commentary to=functions.get_weather",
+            "<|message|>",
+            '{"location":"SF"}',
+            "<|call|>",
+            "comment",  # This arrives as separate chunk - should be filtered
+            "ary",  # Continuation of the filler - should be filtered
+            "<|channel|>commentary to=functions.get_temp",
+            "<|message|>",
+            '{"location":"NYC"}',
+            "<|call|>",
+            "comment",  # Another separate chunk - should be filtered
+            "ary",  # Continuation of the filler - should be filtered
+            "<|start|>assistant<|channel|>final",
+            "<|message|>Done<|return|>",
+        ]
+
+        all_events = []
+        for chunk in chunks:
+            events = self.parser.parse(chunk)
+            all_events.extend(events)
+
+        # Count event types
+        tool_events = [e for e in all_events if e.event_type == "tool_call"]
+        normal_events = [e for e in all_events if e.event_type == "normal"]
+
+        # Should have 2 tool calls and 1 final message
+        self.assertEqual(len(tool_events), 2, "Should have 2 tool calls")
+        self.assertEqual(
+            len(normal_events), 1, "Should have 1 normal event (final message)"
+        )
+
+        # Verify no "commentary" in normal events
+        for event in normal_events:
+            self.assertNotEqual(
+                event.content.strip().lower(),
+                "commentary",
+                "Commentary filler should not appear as normal content in streaming",
+            )
+
+        # Verify content
+        self.assertEqual(tool_events[0].content, '{"location":"SF"}')
+        self.assertEqual(tool_events[1].content, '{"location":"NYC"}')
+        self.assertEqual(normal_events[0].content, "Done")
+
+    def test_repetitive_tool_calls_with_commentary_filler(self):
+        """Test handling of repetitive tool calls with 'commentary' filler text."""
+        # This simulates malformed output with repeated tool calls and commentary filler
+        text = (
+            "<|channel|>analysis<|message|>Need to get weather<|end|>"
+            '<|start|>assistant<|channel|>commentary to=functions.get_weather<|message|>{"city":"Boston"}<|call|>'
+            "commentary"  # Filler that should be filtered
+            '<|channel|>commentary to=functions.get_weather<|message|>{"city":"Boston"}<|call|>'
+            "commentary"  # Another filler
+            '<|channel|>commentary to=functions.get_weather<|message|>{"city":"Boston"}<|call|>'
+            "<|channel|>analysis<|message|>Tool not responding<|end|>"
+            "<|start|>assistant<|channel|>final<|message|>Unable to fetch weather data<|return|>"
+        )
+
+        events = self.parser.parse(text)
+
+        # Count event types
+        reasoning_events = [e for e in events if e.event_type == "reasoning"]
+        tool_events = [e for e in events if e.event_type == "tool_call"]
+        normal_events = [e for e in events if e.event_type == "normal"]
+
+        # Verify correct number of each type
+        self.assertEqual(len(reasoning_events), 2, "Should have 2 reasoning events")
+        self.assertEqual(len(tool_events), 3, "Should have 3 tool calls")
+        self.assertEqual(
+            len(normal_events), 1, "Should have 1 normal event (final message)"
+        )
+
+        # Verify no "commentary" filler in normal events
+        for event in normal_events:
+            self.assertNotEqual(
+                event.content.strip().lower(),
+                "commentary",
+                "Commentary filler should not appear as normal content",
+            )
+
+        # Verify content is correct
+        self.assertEqual(reasoning_events[0].content, "Need to get weather")
+        self.assertEqual(reasoning_events[1].content, "Tool not responding")
+        self.assertEqual(normal_events[0].content, "Unable to fetch weather data")
+
+
+class TestIntegrationScenarios(CustomTestCase):
+    """Integration tests for realistic Harmony parsing scenarios."""
+
+    def test_complete_reasoning_flow(self):
+        """Test complete reasoning flow from HARMONY_DOCS.md examples."""
+        parser = HarmonyParser()
+
+        text = (
+            '<|channel|>analysis<|message|>User asks: "What is 2 + 2?" Simple arithmetic. Provide answer.<|end|>'
+            "<|start|>assistant<|channel|>final<|message|>2 + 2 = 4.<|return|>"
+        )
+
+        events = parser.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertIn("Simple arithmetic", events[0].content)
+        self.assertEqual(events[1].event_type, "normal")
+        self.assertEqual(events[1].content, "2 + 2 = 4.")
+
+    def test_tool_call_sequence(self):
+        """Test tool call sequence from HARMONY_DOCS.md examples."""
+        parser = HarmonyParser()
+
+        text = (
+            "<|channel|>analysis<|message|>Need to use function get_weather.<|end|>"
+            "<|start|>assistant<|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>"
+            '{"location":"San Francisco"}<|call|>'
+        )
+
+        events = parser.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[0].content, "Need to use function get_weather.")
+        self.assertEqual(events[1].event_type, "tool_call")
+        self.assertEqual(events[1].content, '{"location":"San Francisco"}')
+
+    def test_preamble_sequence(self):
+        """Test preamble sequence with multiple commentary blocks."""
+        parser = HarmonyParser()
+
+        text = (
+            "<|channel|>analysis<|message|>Long chain of thought<|end|>"
+            "<|start|>assistant<|channel|>commentary<|message|>**Action plan**: 1. Generate file 2. Start server<|end|>"
+            "<|start|>assistant<|channel|>commentary to=functions.generate_file<|message|>"
+            '{"template": "basic_html"}<|call|>'
+        )
+
+        events = parser.parse(text)
+
+        self.assertEqual(len(events), 3)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[1].event_type, "normal")
+        self.assertIn("Action plan", events[1].content)
+        self.assertEqual(events[2].event_type, "tool_call")
+
+    def test_built_in_tool_call(self):
+        """Test built-in tool call on analysis channel."""
+        parser = HarmonyParser()
+
+        text = '<|channel|>analysis to=browser.search<|message|>{"query": "SGLang"}<|call|>'
+
+        events = parser.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "tool_call")
+        self.assertEqual(events[0].content, '{"query": "SGLang"}')
+
+    def test_tool_response_handling(self):
+        """Test tool response message handling."""
+        parser = HarmonyParser()
+
+        text = '<|start|>functions.get_weather to=assistant<|channel|>commentary<|message|>{"sunny": true, "temperature": 20}<|end|>'
+
+        events = parser.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "normal")
+        self.assertEqual(events[0].content, '{"sunny": true, "temperature": 20}')
+
+    def test_text_fallback_formats(self):
+        """Test various text fallback formats."""
+        parser = HarmonyParser()
+
+        # Test analysis then final
+        events1 = parser.parse("analysis thinking assistantfinal answer")
+        self.assertEqual(len([e for e in events1 if e.event_type == "reasoning"]), 1)
+        self.assertEqual(len([e for e in events1 if e.event_type == "normal"]), 1)
+
+        # Reset parser for next test
+        parser = HarmonyParser()
+
+        # Test final only
+        events2 = parser.parse("assistantfinal direct answer")
+        self.assertEqual(len(events2), 1)
+        self.assertEqual(events2[0].event_type, "normal")
+
+    def test_streaming_property_canonical(self):
+        """Test streaming property: chunked parsing produces same semantic content as one-shot parsing."""
+        full_text = (
+            "<|channel|>analysis<|message|>reasoning content<|end|>"
+            "<|start|>assistant<|channel|>final<|message|>final content"
+        )
+
+        # One-shot parsing
+        parser1 = HarmonyParser()
+        events_oneshot = parser1.parse(full_text)
+        events_oneshot += parser1.parse("")
+
+        # Chunked parsing
+        parser2 = HarmonyParser()
+        chunks = [
+            "<|channel|>",
+            "analysis",
+            "<|message|>",
+            "reasoning content",
+            "<|end|>",
+            "<|start|>assistant",
+            "<|channel|>final",
+            "<|message|>",
+            "final ",
+            "content",
+        ]
+        events_chunked = []
+        for chunk in chunks:
+            events_chunked.extend(parser2.parse(chunk))
+
+        # Compare semantic content rather than exact event structure
+        reasoning_oneshot = "".join(
+            e.content for e in events_oneshot if e.event_type == "reasoning"
+        )
+        normal_oneshot = "".join(
+            e.content for e in events_oneshot if e.event_type == "normal"
+        )
+
+        reasoning_chunked = "".join(
+            e.content for e in events_chunked if e.event_type == "reasoning"
+        )
+        normal_chunked = "".join(
+            e.content for e in events_chunked if e.event_type == "normal"
+        )
+
+        self.assertEqual(reasoning_chunked, reasoning_oneshot)
+        self.assertEqual(normal_chunked, normal_oneshot)
+
+    def test_streaming_property_text(self):
+        """Test streaming property for text format."""
+        full_text = "analysis reasoning content assistantfinal final answer"
+
+        # One-shot parsing
+        parser1 = HarmonyParser()
+        events_oneshot = parser1.parse(full_text)
+
+        # Chunked parsing
+        parser2 = HarmonyParser()
+        chunks = ["analysis reason", "ing content assistant", "final final answer"]
+        events_chunked = []
+        for chunk in chunks:
+            events_chunked.extend(parser2.parse(chunk))
+
+        # Combine content by type for comparison
+        reasoning_oneshot = "".join(
+            e.content for e in events_oneshot if e.event_type == "reasoning"
+        )
+        normal_oneshot = "".join(
+            e.content for e in events_oneshot if e.event_type == "normal"
+        )
+
+        reasoning_chunked = "".join(
+            e.content for e in events_chunked if e.event_type == "reasoning"
+        )
+        normal_chunked = "".join(
+            e.content for e in events_chunked if e.event_type == "normal"
+        )
+
+        # Account for whitespace differences due to streaming - compare trimmed content
+        self.assertEqual(reasoning_oneshot.strip(), reasoning_chunked.strip())
+        self.assertEqual(normal_oneshot.strip(), normal_chunked.strip())
+
+
+class TestEdgeCases(CustomTestCase):
+    """Test edge cases and error conditions."""
+
+    def test_malformed_channel_headers(self):
+        """Test handling of malformed channel headers."""
+        parser = HarmonyParser()
+
+        # Unknown channel type
+        text = "<|channel|>unknown<|message|>content<|end|>"
+        events = parser.parse(text)
+
+        # Should be held as incomplete since channel is unknown
+        self.assertEqual(len(events), 0)
+
+    def test_mixed_unknown_tokens(self):
+        """Test handling of mixed unknown tokens."""
+        parser = HarmonyParser()
+
+        text = "text <|weird|> more text <|channel|>analysis<|message|>content<|end|>"
+        events = parser.parse(text)
+
+        # Should parse the valid parts
+        reasoning_events = [e for e in events if e.event_type == "reasoning"]
+        normal_events = [e for e in events if e.event_type == "normal"]
+
+        self.assertEqual(len(reasoning_events), 1)
+        self.assertGreater(len(normal_events), 0)
+
+    def test_empty_input(self):
+        """Test handling of empty input."""
+        parser = HarmonyParser()
+        events = parser.parse("")
+        self.assertEqual(len(events), 0)
+
+    def test_whitespace_preservation(self):
+        """Test that whitespace is preserved correctly."""
+        parser = HarmonyParser()
+
+        text = "<|channel|>analysis<|message|>  content with spaces  <|end|>"
+        events = parser.parse(text)
+
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].content, "  content with spaces  ")
+
+    def test_streaming_whitespace_preservation(self):
+        """Test that streaming preserves whitespace between chunks."""
+        parser = HarmonyParser()
+
+        # Simulate streaming where space is at chunk boundary
+        chunks = ["analysis The user typed ", '"wapppa". Not a question.']
+
+        all_events = []
+        for chunk in chunks:
+            events = parser.parse(chunk)
+            all_events.extend(events)
+
+        # Combine all reasoning content
+        reasoning_content = "".join(
+            e.content for e in all_events if e.event_type == "reasoning"
+        )
+
+        # Should preserve the space before the quote
+        self.assertIn('typed "wapppa"', reasoning_content)
+        self.assertNotIn(
+            'typed"wapppa"', reasoning_content
+        )  # Should not be mashed together
+
+    def test_consecutive_blocks_same_type(self):
+        """Test consecutive blocks of the same type."""
+        parser = HarmonyParser()
+
+        text = (
+            "<|channel|>analysis<|message|>first reasoning<|end|>"
+            "<|channel|>analysis<|message|>second reasoning<|end|>"
+        )
+        events = parser.parse(text)
+
+        self.assertEqual(len(events), 2)
+        self.assertEqual(events[0].event_type, "reasoning")
+        self.assertEqual(events[1].event_type, "reasoning")
+        self.assertEqual(events[0].content, "first reasoning")
+        self.assertEqual(events[1].content, "second reasoning")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_health_check.py b/test/srt/test_health_check.py
new file mode 100644
index 000000000..1f101e43b
--- /dev/null
+++ b/test/srt/test_health_check.py
@@ -0,0 +1,28 @@
+import unittest
+
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestHealthCheck(CustomTestCase):
+    def test_health_check(self):
+        """Test that metrics endpoint returns data when enabled"""
+        with self.assertRaises(TimeoutError):
+            popen_launch_server(
+                DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                DEFAULT_URL_FOR_TEST,
+                timeout=60,
+                other_args=[
+                    "--disable-cuda-graph",
+                    "--json-model-override-args",
+                    '{"architectures": ["LlamaForCausalLMForHealthTest"]}',
+                ],
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_hidden_states.py b/test/srt/test_hidden_states.py
new file mode 100644
index 000000000..2046ce529
--- /dev/null
+++ b/test/srt/test_hidden_states.py
@@ -0,0 +1,139 @@
+import unittest
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import sglang as sgl
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
+
+
+class TestHiddenState(CustomTestCase):
+    def test_return_hidden_states(self):
+        prompts = ["Today is", "Today is a sunny day and I like"]
+        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        input_ids = tokenizer(prompts).input_ids
+
+        sampling_params = {
+            "temperature": 0,
+            "max_new_tokens": 8,
+        }
+
+        engine = sgl.Engine(
+            model_path=model_path,
+            random_seed=42,
+            skip_tokenizer_init=True,
+            enable_return_hidden_states=True,
+        )
+        outputs = engine.generate(
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            return_hidden_states=True,
+        )
+        engine.shutdown()
+
+        for output in outputs:
+            self.assertEqual(len(output["meta_info"]["hidden_states"]), 8)
+            for i in range(len(output["meta_info"]["hidden_states"])):
+                assert isinstance(output["meta_info"]["hidden_states"][i], list)
+                output["meta_info"]["hidden_states"][i] = torch.tensor(
+                    output["meta_info"]["hidden_states"][i], dtype=torch.bfloat16
+                )
+        # Checks that splicing of the batch was done correctly
+        self.assertGreater(
+            outputs[1]["meta_info"]["hidden_states"][0].shape[0],
+            outputs[0]["meta_info"]["hidden_states"][0].shape[0],
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, torch_dtype=torch.bfloat16, device_map="cuda"
+        )
+
+        for input_id, output in zip(input_ids, outputs):
+            with torch.inference_mode():
+                hf_out = model(
+                    torch.tensor(
+                        [input_id + output["output_ids"][:-1]], device=model.device
+                    ),
+                    output_hidden_states=True,
+                )
+            print("=== HF Hiddens ===")
+            print(hf_out["hidden_states"][-1][0])
+            sg_hidden_states = torch.cat(
+                [
+                    i.unsqueeze(0) if len(i.shape) == 1 else i
+                    for i in output["meta_info"]["hidden_states"]
+                ]
+            ).to("cuda")
+            print("=== SRT Hiddens ===")
+            print(sg_hidden_states)
+
+            print(
+                f"Max diff: {torch.max(torch.abs(hf_out['hidden_states'][-1][0] - sg_hidden_states))}"
+            )
+
+            atol = 0.8
+            self.assertTrue(
+                torch.allclose(
+                    hf_out["hidden_states"][-1][0],
+                    sg_hidden_states,
+                    atol=atol,
+                    rtol=0,
+                )
+            )
+
+    def test_repeatedly_changes_hidden_states(self):
+        prompts = ["Today is", "Today is a sunny day and I like"]
+        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        input_ids = tokenizer(prompts).input_ids
+
+        sampling_params = {
+            "temperature": 0,
+            "max_new_tokens": 8,
+        }
+
+        engine = sgl.Engine(
+            model_path=model_path,
+            random_seed=42,
+            skip_tokenizer_init=True,
+            enable_return_hidden_states=True,
+        )
+        outputs_completion_first_round = engine.generate(
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            return_hidden_states=True,
+        )
+        outputs_hidden_state = engine.generate(
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            return_hidden_states=False,
+        )
+
+        outputs_completion_last_round = engine.generate(
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            return_hidden_states=True,
+        )
+        engine.shutdown()
+
+        for (
+            output_completion_first_round,
+            output_hidden_state,
+            output_completion_last_round,
+        ) in zip(
+            outputs_completion_first_round,
+            outputs_hidden_state,
+            outputs_completion_last_round,
+        ):
+            self.assertEqual(
+                len(output_completion_first_round["meta_info"]["hidden_states"]), 8
+            )
+            self.assertNotIn("hidden_states", output_hidden_state["meta_info"])
+            self.assertEqual(
+                len(output_completion_last_round["meta_info"]["hidden_states"]), 8
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_hybrid_attn_backend.py b/test/srt/test_hybrid_attn_backend.py
new file mode 100644
index 000000000..1574ff873
--- /dev/null
+++ b/test/srt/test_hybrid_attn_backend.py
@@ -0,0 +1,161 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_MLA,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+GSM_DATASET_PATH = None
+
+# Default server arguments shared across all tests
+DEFAULT_SERVER_ARGS = [
+    "--trust-remote-code",
+    "--cuda-graph-max-bs",
+    "8",
+    "--prefill-attention-backend",
+    "fa3",
+    "--decode-attention-backend",
+    "flashinfer",
+]
+
+
+@unittest.skipIf(get_device_sm() < 90, "Test requires CUDA SM 90 or higher")
+class TestHybridAttnBackendBase(CustomTestCase):
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    base_url = DEFAULT_URL_FOR_TEST
+    accuracy_threshold = 0.65  # derived tests need to override this
+    speculative_decode = False
+    spec_decode_threshold = 2.2  # derived spec decoding tests need to override this
+
+    @classmethod
+    def get_server_args(cls):
+        """Return the arguments for the server launch. Override in subclasses."""
+        return DEFAULT_SERVER_ARGS
+
+    @classmethod
+    def setUpClass(cls):
+        # disable deep gemm precompile to make launch server faster
+        # please don't do this if you want to make your inference workload faster
+        os.environ["SGL_JIT_DEEPGEMM_PRECOMPILE"] = "false"
+        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        if cls.speculative_decode:
+            model = DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST
+        else:
+            model = cls.model
+        cls.process = popen_launch_server(
+            model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=cls.get_server_args(),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=4,
+            num_questions=100,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+            data_path=GSM_DATASET_PATH,
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        # Use the appropriate metric key based on the test class
+        metric_key = "accuracy"
+        self.assertGreater(metrics[metric_key], self.accuracy_threshold)
+
+        if self.speculative_decode:
+            server_info = requests.get(self.base_url + "/get_server_info")
+            avg_spec_accept_length = server_info.json()["internal_states"][0][
+                "avg_spec_accept_length"
+            ]
+            print(f"{avg_spec_accept_length=}")
+            self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold)
+
+
+class TestHybridAttnBackendMLA(TestHybridAttnBackendBase):
+    accuracy_threshold = 0.60
+    model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
+
+    @classmethod
+    def get_server_args(cls):
+        return DEFAULT_SERVER_ARGS
+
+
+class TestHybridAttnBackendTorchCompile(TestHybridAttnBackendBase):
+    accuracy_threshold = 0.65
+
+    @classmethod
+    def get_server_args(cls):
+        return DEFAULT_SERVER_ARGS + ["--enable-torch-compile"]
+
+
+class TestHybridAttnBackendSpeculativeDecodingPrefillBackend(TestHybridAttnBackendBase):
+    speculative_decode = True
+    # This eagle test uses a very small model, so the accuracy is low.
+    accuracy_threshold = 0.2
+
+    @classmethod
+    def get_server_args(cls):
+        return DEFAULT_SERVER_ARGS + [
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-draft-model-path",
+            DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "2",
+            "--speculative-num-draft-tokens",
+            "4",
+            "--speculative-attention-mode",
+            "prefill",
+        ]
+
+
+class TestHybridAttnBackendSpeculativeDecodingDecodeBackend(TestHybridAttnBackendBase):
+    speculative_decode = True
+    # This eagle test uses a very small model, so the accuracy is low.
+    accuracy_threshold = 0.2
+
+    @classmethod
+    def get_server_args(cls):
+        return DEFAULT_SERVER_ARGS + [
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-draft-model-path",
+            DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "2",
+            "--speculative-num-draft-tokens",
+            "4",
+            "--speculative-attention-mode",
+            "decode",
+        ]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_input_embeddings.py b/test/srt/test_input_embeddings.py
new file mode 100644
index 000000000..7efd66565
--- /dev/null
+++ b/test/srt/test_input_embeddings.py
@@ -0,0 +1,151 @@
+import json
+import os
+import tempfile
+import unittest
+
+import requests
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestInputEmbeds(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model)
+        cls.ref_model = AutoModelForCausalLM.from_pretrained(cls.model)
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--disable-radix", "--cuda-graph-max-bs", 4],
+        )
+        cls.texts = [
+            "The capital of France is",
+            "What is the best time of year to visit Japan for cherry blossoms?",
+        ]
+
+    def generate_input_embeddings(self, text):
+        """Generate input embeddings for a given text."""
+        input_ids = self.tokenizer(text, return_tensors="pt")["input_ids"]
+        embeddings = self.ref_model.get_input_embeddings()(input_ids)
+        return embeddings.squeeze().tolist()  # Convert tensor to a list for API use
+
+    def send_request(self, payload):
+        """Send a POST request to the /generate endpoint and return the response."""
+        response = requests.post(
+            self.base_url + "/generate",
+            json=payload,
+            timeout=30,  # Set a reasonable timeout for the API request
+        )
+        if response.status_code == 200:
+            return response.json()
+        return {
+            "error": f"Request failed with status {response.status_code}: {response.text}"
+        }
+
+    def send_file_request(self, file_path):
+        """Send a POST request to the /generate_from_file endpoint with a file."""
+        with open(file_path, "rb") as f:
+            response = requests.post(
+                self.base_url + "/generate_from_file",
+                files={"file": f},
+                timeout=30,  # Set a reasonable timeout for the API request
+            )
+        if response.status_code == 200:
+            return response.json()
+        return {
+            "error": f"Request failed with status {response.status_code}: {response.text}"
+        }
+
+    def test_text_based_response(self):
+        """Test and print API responses using text-based input."""
+        for text in self.texts:
+            payload = {
+                "model": self.model,
+                "text": text,
+                "sampling_params": {"temperature": 0, "max_new_tokens": 50},
+            }
+            response = self.send_request(payload)
+            print(
+                f"Text Input: {text}\nResponse: {json.dumps(response, indent=2)}\n{'-' * 80}"
+            )
+
+    def test_embedding_based_response(self):
+        """Test and print API responses using input embeddings."""
+        for text in self.texts:
+            embeddings = self.generate_input_embeddings(text)
+            payload = {
+                "model": self.model,
+                "input_embeds": embeddings,
+                "sampling_params": {"temperature": 0, "max_new_tokens": 50},
+            }
+            response = self.send_request(payload)
+            print(
+                f"Embeddings Input (for text '{text}'):\nResponse: {json.dumps(response, indent=2)}\n{'-' * 80}"
+            )
+
+    def test_compare_text_vs_embedding(self):
+        """Test and compare responses for text-based and embedding-based inputs."""
+        for text in self.texts:
+            # Text-based payload
+            text_payload = {
+                "model": self.model,
+                "text": text,
+                "sampling_params": {"temperature": 0, "max_new_tokens": 50},
+            }
+            # Embedding-based payload
+            embeddings = self.generate_input_embeddings(text)
+            embed_payload = {
+                "model": self.model,
+                "input_embeds": embeddings,
+                "sampling_params": {"temperature": 0, "max_new_tokens": 50},
+            }
+            # Get responses
+            text_response = self.send_request(text_payload)
+            embed_response = self.send_request(embed_payload)
+            # Print responses
+            print(
+                f"Text Input: {text}\nText-Based Response: {json.dumps(text_response, indent=2)}\n"
+            )
+            print(
+                f"Embeddings Input (for text '{text}'):\nEmbedding-Based Response: {json.dumps(embed_response, indent=2)}\n{'-' * 80}"
+            )
+            # This is flaky, so we skip this temporarily
+            # self.assertEqual(text_response["text"], embed_response["text"])
+
+    def test_generate_from_file(self):
+        """Test the /generate_from_file endpoint using tokenized embeddings."""
+        for text in self.texts:
+            embeddings = self.generate_input_embeddings(text)
+            with tempfile.NamedTemporaryFile(
+                mode="w", suffix=".json", delete=False
+            ) as tmp_file:
+                json.dump(embeddings, tmp_file)
+                tmp_file_path = tmp_file.name
+
+            try:
+                response = self.send_file_request(tmp_file_path)
+                print(
+                    f"Text Input: {text}\nResponse from /generate_from_file: {json.dumps(response, indent=2)}\n{'-' * 80}"
+                )
+            finally:
+                # Ensure the temporary file is deleted
+                os.remove(tmp_file_path)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_intel_amx_attention_backend.py b/test/srt/test_intel_amx_attention_backend.py
new file mode 100644
index 000000000..5534c57f9
--- /dev/null
+++ b/test/srt/test_intel_amx_attention_backend.py
@@ -0,0 +1,135 @@
+"""
+Usage:
+python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu
+"""
+
+import unittest
+from functools import wraps
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
+    DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8,
+    DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
+    DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_one_batch,
+)
+
+
+def intel_amx_benchmark(extra_args=None, min_throughput=None):
+    def decorator(test_func):
+        @wraps(test_func)
+        def wrapper(self):
+            common_args = [
+                "--attention-backend",
+                "intel_amx",
+                "--disable-radix",
+                "--trust-remote-code",
+            ]
+            full_args = common_args + (extra_args or [])
+
+            model = test_func(self)
+            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
+                model, full_args
+            )
+
+            print(f"{model=}")
+            print(f"{prefill_latency=}")
+            print(f"{decode_throughput=}")
+            print(f"{decode_latency=}")
+
+            if is_in_ci() and min_throughput is not None:
+                self.assertGreater(decode_throughput, min_throughput)
+
+        return wrapper
+
+    return decorator
+
+
+class TestIntelAMXAttnBackend(CustomTestCase):
+
+    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=10)
+    def test_latency_mla_model(self):
+        return DEFAULT_MLA_MODEL_NAME_FOR_TEST
+
+    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=40)
+    def test_latency_default_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST
+
+    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=150)
+    def test_latency_fp8_qwen(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
+
+    @intel_amx_benchmark(extra_args=["--batch-size", "4"], min_throughput=50)
+    def test_latency_fp8_moe_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
+
+    @intel_amx_benchmark(
+        extra_args=["--batch-size", "4", "--quantization", "w8a8_int8"],
+        min_throughput=100,
+    )
+    def test_latency_w8a8_default_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8
+
+    @intel_amx_benchmark(
+        extra_args=[
+            "--batch-size",
+            "4",
+            "--quantization",
+            "w8a8_int8",
+            "--mem-fraction-static",
+            "0.9",
+            "--max-total-tokens",
+            "65536",
+            "--tp",
+            "6",
+        ],
+        min_throughput=100,
+    )
+    def test_latency_w8a8_moe_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
+
+    def test_mmlu(self):
+        model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_TEST
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--attention-backend",
+                "intel_amx",
+                "--mem-fraction-static",
+                "0.3",
+                "--disable-radix",
+                "--trust-remote-code",
+                "--disable-overlap-schedule",
+            ],
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=base_url,
+                model=model,
+                eval_name="mmlu",
+                num_examples=64,
+                num_threads=32,
+            )
+            metrics = run_eval(args)
+            if is_in_ci():
+                self.assertGreater(metrics["score"], 0.45)
+        finally:
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_io_struct.py b/test/srt/test_io_struct.py
new file mode 100644
index 000000000..a8efebb23
--- /dev/null
+++ b/test/srt/test_io_struct.py
@@ -0,0 +1,577 @@
+import copy
+import unittest
+
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+)
+
+
+class TestGenerateReqInputNormalization(CustomTestCase):
+    """Test the normalization of GenerateReqInput for batch processing and different input formats."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+    def setUp(self):
+        # Common setup for all tests
+        self.base_req = GenerateReqInput(
+            text=["Hello", "World"],
+            sampling_params=[{}, {}],
+            rid=["id1", "id2"],
+        )
+
+    def test_single_image_to_list_of_lists(self):
+        """Test that a single image is converted to a list of single-image lists."""
+        req = copy.deepcopy(self.base_req)
+        req.image_data = "single_image.jpg"  # A single image (non-list)
+
+        req.normalize_batch_and_arguments()
+
+        # Should be converted to [[image], [image]]
+        self.assertEqual(len(req.image_data), 2)
+        self.assertEqual(len(req.image_data[0]), 1)
+        self.assertEqual(len(req.image_data[1]), 1)
+        self.assertEqual(req.image_data[0][0], "single_image.jpg")
+        self.assertEqual(req.image_data[1][0], "single_image.jpg")
+
+        # Check modalities
+        self.assertEqual(req.modalities, ["image", "image"])
+
+    def test_list_of_images_to_list_of_lists(self):
+        """Test that a list of images is converted to a list of single-image lists."""
+        req = copy.deepcopy(self.base_req)
+        req.image_data = ["image1.jpg", "image2.jpg"]  # List of images
+
+        req.normalize_batch_and_arguments()
+
+        # Should be converted to [[image1], [image2]]
+        self.assertEqual(len(req.image_data), 2)
+        self.assertEqual(len(req.image_data[0]), 1)
+        self.assertEqual(len(req.image_data[1]), 1)
+        self.assertEqual(req.image_data[0][0], "image1.jpg")
+        self.assertEqual(req.image_data[1][0], "image2.jpg")
+
+        # Check modalities
+        self.assertEqual(req.modalities, ["image", "image"])
+
+    def test_list_of_lists_with_different_modalities(self):
+        """Test handling of list of lists of images with different modalities."""
+        req = copy.deepcopy(self.base_req)
+        req.image_data = [
+            ["image1.jpg"],  # Single image (image modality)
+            ["image2.jpg", "image3.jpg"],  # Multiple images (multi-images modality)
+        ]
+
+        req.normalize_batch_and_arguments()
+
+        # Structure should remain the same
+        self.assertEqual(len(req.image_data), 2)
+        self.assertEqual(len(req.image_data[0]), 1)
+        self.assertEqual(len(req.image_data[1]), 2)
+
+        # Check modalities
+        self.assertEqual(req.modalities, ["image", "multi-images"])
+
+    def test_list_of_lists_with_none_values(self):
+        """Test handling of list of lists with None values."""
+        req = copy.deepcopy(self.base_req)
+        req.image_data = [
+            [None],  # None value
+            ["image.jpg"],  # Single image
+        ]
+
+        req.normalize_batch_and_arguments()
+
+        # Structure should remain the same
+        self.assertEqual(len(req.image_data), 2)
+        self.assertEqual(len(req.image_data[0]), 1)
+        self.assertEqual(len(req.image_data[1]), 1)
+
+        # Check modalities
+        self.assertEqual(req.modalities, [None, "image"])
+
+    def test_expanding_parallel_sample_correlation(self):
+        """Test that when expanding with parallel samples, prompts, images and modalities are properly correlated."""
+        req = copy.deepcopy(self.base_req)
+        req.text = ["Prompt 1", "Prompt 2"]
+        req.image_data = [
+            ["image1.jpg"],
+            ["image2.jpg", "image3.jpg"],
+        ]
+        req.sampling_params = {"n": 3}  # All prompts get 3 samples
+
+        # Define expected values before normalization
+        expected_text = req.text * 3
+        expected_images = req.image_data * 3
+        expected_modalities = ["image", "multi-images"] * 3
+
+        req.normalize_batch_and_arguments()
+
+        # Should be expanded to 6 items (2 original * 3 parallel)
+        self.assertEqual(len(req.image_data), 6)
+
+        # Check that images are properly expanded
+        self.assertEqual(req.image_data, expected_images)
+
+        # Check modalities
+        self.assertEqual(req.modalities, expected_modalities)
+
+        # Ensure that text items are properly duplicated too
+        self.assertEqual(req.text, expected_text)
+
+    def test_specific_parallel_n_per_sample(self):
+        """Test parallel expansion when different samples have different n values."""
+        req = copy.deepcopy(self.base_req)
+        req.text = ["Prompt 1", "Prompt 2"]
+        req.image_data = [
+            ["image1.jpg"],
+            ["image2.jpg", "image3.jpg"],
+        ]
+        req.sampling_params = [
+            {"n": 2},
+            {"n": 2},
+        ]  # First prompt gets 2 samples, second prompt gets 2 samples
+
+        expected_images = req.image_data * 2
+        expected_modalities = ["image", "multi-images"] * 2
+        expected_text = req.text * 2
+
+        req.normalize_batch_and_arguments()
+
+        # Should be expanded to 4 items (2 original * 2 parallel)
+        self.assertEqual(len(req.image_data), 4)
+
+        # Check that the first 2 are copies for the first prompt
+        self.assertEqual(req.image_data, expected_images)
+
+        # Check modalities
+        self.assertEqual(req.modalities, expected_modalities)
+
+        # Check text expansion
+        self.assertEqual(req.text, expected_text)
+
+    def test_mixed_none_and_images_with_parallel_samples(self):
+        """Test that when some batch items have images and others None, parallel expansion works correctly."""
+        req = copy.deepcopy(self.base_req)
+        req.text = ["Prompt 1", "Prompt 2", "Prompt 3"]
+        req.rid = ["id1", "id2", "id3"]
+        req.image_data = [
+            ["image1.jpg"],
+            None,
+            ["image3_1.jpg", "image3_2.jpg"],
+        ]
+        req.sampling_params = {"n": 2}  # All prompts get 2 samples
+
+        expected_images = req.image_data * 2
+        expected_modalities = ["image", None, "multi-images"] * 2
+        expected_text = req.text * 2
+
+        req.normalize_batch_and_arguments()
+
+        # Should be expanded to 6 items (3 original * 2 parallel)
+        self.assertEqual(len(req.image_data), 6)
+
+        # Check image data
+        self.assertEqual(req.image_data, expected_images)
+
+        # Check modalities
+        self.assertEqual(req.modalities, expected_modalities)
+
+        # Check text expansion
+        self.assertEqual(req.text, expected_text)
+
+    def test_correlation_with_sampling_params(self):
+        """Test that sampling parameters are correctly correlated with prompts during expansion."""
+        req = copy.deepcopy(self.base_req)
+        req.text = ["Prompt 1", "Prompt 2"]
+        req.image_data = [
+            ["image1.jpg"],
+            ["image2.jpg"],
+        ]
+        req.sampling_params = [
+            {"temperature": 0.7, "n": 2},
+            {"temperature": 0.9, "n": 2},
+        ]
+
+        req.normalize_batch_and_arguments()
+
+        # Check sampling params expansion
+        self.assertEqual(len(req.sampling_params), 4)
+        self.assertEqual(req.sampling_params[0]["temperature"], 0.7)
+        self.assertEqual(req.sampling_params[1]["temperature"], 0.9)
+        self.assertEqual(req.sampling_params[2]["temperature"], 0.7)
+        self.assertEqual(req.sampling_params[3]["temperature"], 0.9)
+
+        # Should be expanded to 4 items (2 original * 2 parallel)
+        self.assertEqual(len(req.image_data), 4)
+
+        # Check correlation with images
+        self.assertEqual(req.image_data[0], ["image1.jpg"])
+        self.assertEqual(req.image_data[1], ["image2.jpg"])
+        self.assertEqual(req.image_data[2], ["image1.jpg"])
+        self.assertEqual(req.image_data[3], ["image2.jpg"])
+
+    def test_single_example_with_image(self):
+        """Test handling of single example with image."""
+        req = GenerateReqInput(
+            text="Hello",
+            image_data="single_image.jpg",
+        )
+
+        req.normalize_batch_and_arguments()
+
+        # For single examples, image_data doesn't get processed into lists
+        self.assertEqual(req.image_data, "single_image.jpg")
+        self.assertIsNone(req.modalities)  # Modalities isn't set for single examples
+
+    def test_single_to_batch_with_parallel_sampling(self):
+        """Test single example converted to batch with parallel sampling."""
+        req = GenerateReqInput(
+            text="Hello",
+            image_data="single_image.jpg",
+            sampling_params={"n": 3},  # parallel_sample_num = 3
+        )
+
+        # Define expected values before normalization
+        expected_text = ["Hello"] * 3
+
+        req.normalize_batch_and_arguments()
+
+        # Should be converted to batch with text=["Hello"]
+        self.assertEqual(req.text, expected_text)
+
+        # Image should be automatically wrapped to list of lists with length 1*3=3
+        self.assertEqual(len(req.image_data), 3)
+        self.assertEqual(req.image_data[0][0], "single_image.jpg")
+        self.assertEqual(req.image_data[1][0], "single_image.jpg")
+        self.assertEqual(req.image_data[2][0], "single_image.jpg")
+
+        # Modalities should be set for all 3 examples
+        self.assertEqual(req.modalities, ["image", "image", "image"])
+
+    def test_audio_data_handling(self):
+        """Test handling of audio_data."""
+        req = copy.deepcopy(self.base_req)
+        req.audio_data = "audio.mp3"  # Single audio
+
+        req.normalize_batch_and_arguments()
+
+        # Should be converted to ["audio.mp3", "audio.mp3"]
+        self.assertEqual(len(req.audio_data), 2)
+        self.assertEqual(req.audio_data[0], "audio.mp3")
+        self.assertEqual(req.audio_data[1], "audio.mp3")
+
+        # Test with list
+        req = copy.deepcopy(self.base_req)
+        req.audio_data = ["audio1.mp3", "audio2.mp3"]
+
+        req.normalize_batch_and_arguments()
+
+        # Should remain the same
+        self.assertEqual(len(req.audio_data), 2)
+        self.assertEqual(req.audio_data[0], "audio1.mp3")
+        self.assertEqual(req.audio_data[1], "audio2.mp3")
+
+    def test_input_ids_normalization(self):
+        """Test normalization of input_ids instead of text."""
+        # Test single input_ids
+        req = GenerateReqInput(input_ids=[1, 2, 3])
+        req.normalize_batch_and_arguments()
+        self.assertTrue(req.is_single)
+        self.assertEqual(req.batch_size, 1)
+
+        # Test batch input_ids
+        req = GenerateReqInput(input_ids=[[1, 2, 3], [4, 5, 6]])
+        req.normalize_batch_and_arguments()
+        self.assertFalse(req.is_single)
+        self.assertEqual(req.batch_size, 2)
+
+        # Test with parallel sampling
+        req = GenerateReqInput(
+            input_ids=[[1, 2, 3], [4, 5, 6]], sampling_params={"n": 2}
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(len(req.input_ids), 4)  # 2 original * 2 parallel
+
+    def test_input_embeds_normalization(self):
+        """Test normalization of input_embeds."""
+        # Test single input_embeds
+        req = GenerateReqInput(input_embeds=[[0.1, 0.2], [0.3, 0.4]])
+        req.normalize_batch_and_arguments()
+        self.assertTrue(req.is_single)
+        self.assertEqual(req.batch_size, 1)
+
+        # Test batch input_embeds
+        req = GenerateReqInput(input_embeds=[[[0.1, 0.2]], [[0.3, 0.4]]])
+        req.normalize_batch_and_arguments()
+        self.assertFalse(req.is_single)
+        self.assertEqual(req.batch_size, 2)
+
+    def test_input_embeds_with_parallel_sampling(self):
+        """Test input_embeds normalization with parallel sampling (n > 1)."""
+        # Test single input_embeds with parallel sampling
+        req = GenerateReqInput(
+            input_embeds=[[0.1, 0.2]],  # single embedding vector
+            sampling_params={"n": 2},
+        )
+        req.normalize_batch_and_arguments()
+
+        # Should be converted from single to batch and then expanded
+        self.assertFalse(req.is_single)
+        self.assertEqual(len(req.input_embeds), 2)
+        # Both should be the same input_embeds
+        self.assertEqual(req.input_embeds[0], [[0.1, 0.2]])
+        self.assertEqual(req.input_embeds[1], [[0.1, 0.2]])
+
+        # Test batch input_embeds with parallel sampling
+        req = GenerateReqInput(
+            input_embeds=[[[0.1, 0.2]], [[0.3, 0.4]]], sampling_params={"n": 3}
+        )
+        req.normalize_batch_and_arguments()
+
+        # Should be expanded
+        self.assertFalse(req.is_single)
+        self.assertEqual(len(req.input_embeds), 6)
+
+        # Check that the expansion is correct
+        expected_embeds = [[[0.1, 0.2]], [[0.3, 0.4]]] * 3
+        self.assertEqual(req.input_embeds, expected_embeds)
+
+        # Test with different n values per sample (should raise error)
+        req = GenerateReqInput(
+            input_embeds=[[[0.1, 0.2]], [[0.3, 0.4]]],
+            sampling_params=[{"n": 2}, {"n": 3}],
+        )
+        with self.assertRaises(ValueError):
+            req.normalize_batch_and_arguments()
+
+    def test_input_embeds_single_to_batch_conversion(self):
+        """Test that single input_embeds are properly converted to batch when using parallel sampling."""
+        # Test the specific case that was fixed: single input_embeds with n > 1
+        req = GenerateReqInput(
+            input_embeds=[[0.1, 0.2, 0.3]], sampling_params={"n": 2}  # Single embedding
+        )
+        req.normalize_batch_and_arguments()
+
+        # Should convert single to batch and then expand
+        self.assertFalse(req.is_single)
+        self.assertEqual(len(req.input_embeds), 2)
+
+        # Both should be the same single embedding
+        self.assertEqual(req.input_embeds[0], [[0.1, 0.2, 0.3]])
+        self.assertEqual(req.input_embeds[1], [[0.1, 0.2, 0.3]])
+
+        # Test with higher n value
+        req = GenerateReqInput(input_embeds=[[0.1, 0.2, 0.3]], sampling_params={"n": 5})
+        req.normalize_batch_and_arguments()
+
+        self.assertFalse(req.is_single)
+        self.assertEqual(len(req.input_embeds), 5)
+
+        # All should be the same
+        for i in range(5):
+            self.assertEqual(req.input_embeds[i], [[0.1, 0.2, 0.3]])
+
+    def test_lora_path_normalization(self):
+        """Test normalization of lora_path."""
+        # Test single lora_path with batch input
+        req = GenerateReqInput(text=["Hello", "World"], lora_path="path/to/lora")
+
+        # Define expected lora_paths before normalization
+        expected_lora_paths = ["path/to/lora", "path/to/lora"]
+
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.lora_path, expected_lora_paths)
+
+        # Test list of lora_paths
+        req = GenerateReqInput(text=["Hello", "World"], lora_path=["path1", "path2"])
+
+        # Define expected lora_paths before normalization
+        expected_lora_paths = ["path1", "path2"]
+
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.lora_path, expected_lora_paths)
+
+        # Test with parallel sampling
+        req = GenerateReqInput(
+            text=["Hello", "World"],
+            lora_path=["path1", "path2"],
+            sampling_params={"n": 2},
+        )
+
+        # Define expected lora_paths before normalization
+        expected_lora_paths = ["path1", "path2"] * 2
+
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.lora_path, expected_lora_paths)
+
+    def test_logprob_parameters_normalization(self):
+        """Test normalization of logprob-related parameters."""
+        # Test single example
+        req = GenerateReqInput(
+            text="Hello",
+            return_logprob=True,
+            logprob_start_len=10,
+            top_logprobs_num=5,
+            token_ids_logprob=[7, 8, 9],
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.return_logprob, True)
+        self.assertEqual(req.logprob_start_len, 10)
+        self.assertEqual(req.top_logprobs_num, 5)
+        self.assertEqual(req.token_ids_logprob, [7, 8, 9])
+
+        # Test batch with scalar values
+        req = GenerateReqInput(
+            text=["Hello", "World"],
+            return_logprob=True,
+            logprob_start_len=10,
+            top_logprobs_num=5,
+            token_ids_logprob=[7, 8, 9],
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.return_logprob, [True, True])
+        self.assertEqual(req.logprob_start_len, [10, 10])
+        self.assertEqual(req.top_logprobs_num, [5, 5])
+        self.assertEqual(req.token_ids_logprob, [[7, 8, 9], [7, 8, 9]])
+
+        # Test batch with list values
+        req = GenerateReqInput(
+            text=["Hello", "World"],
+            return_logprob=[True, False],
+            logprob_start_len=[10, 5],
+            top_logprobs_num=[5, 3],
+            token_ids_logprob=[[7, 8, 9], [4, 5, 6]],
+            return_hidden_states=[False, False, True],
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.return_logprob, [True, False])
+        self.assertEqual(req.logprob_start_len, [10, 5])
+        self.assertEqual(req.top_logprobs_num, [5, 3])
+        self.assertEqual(req.token_ids_logprob, [[7, 8, 9], [4, 5, 6]])
+        self.assertEqual(req.return_hidden_states, [False, False, True])
+
+    def test_custom_logit_processor_normalization(self):
+        """Test normalization of custom_logit_processor."""
+        # Test single processor
+        req = GenerateReqInput(
+            text=["Hello", "World"], custom_logit_processor="serialized_processor"
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(
+            req.custom_logit_processor, ["serialized_processor", "serialized_processor"]
+        )
+
+        # Test list of processors
+        req = GenerateReqInput(
+            text=["Hello", "World"], custom_logit_processor=["processor1", "processor2"]
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.custom_logit_processor, ["processor1", "processor2"])
+
+    def test_session_params_handling(self):
+        """Test handling of session_params."""
+        # Test with dict
+        req = GenerateReqInput(
+            text=["Hello", "World"], session_params={"id": "session1", "offset": 10}
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.session_params, {"id": "session1", "offset": 10})
+
+        # Test with list of dicts
+        req = GenerateReqInput(
+            text=["Hello", "World"],
+            session_params=[{"id": "session1"}, {"id": "session2"}],
+        )
+        req.normalize_batch_and_arguments()
+        self.assertEqual(req.session_params, [{"id": "session1"}, {"id": "session2"}])
+
+    def test_getitem_method(self):
+        """Test the __getitem__ method."""
+        req = GenerateReqInput(
+            text=["Hello", "World"],
+            image_data=[["img1.jpg"], ["img2.jpg"]],
+            audio_data=["audio1.mp3", "audio2.mp3"],
+            sampling_params=[{"temp": 0.7}, {"temp": 0.8}],
+            rid=["id1", "id2"],
+            return_logprob=[True, False],
+            logprob_start_len=[10, 5],
+            top_logprobs_num=[5, 3],
+            token_ids_logprob=[[7, 8, 9], [4, 5, 6]],
+            stream=True,
+            log_metrics=True,
+            modalities=["image", "image"],
+            lora_path=["path1", "path2"],
+            custom_logit_processor=["processor1", "processor2"],
+            return_hidden_states=True,
+        )
+        req.normalize_batch_and_arguments()
+
+        # Get the first item
+        item0 = req[0]
+        self.assertEqual(item0.text, "Hello")
+        self.assertEqual(item0.image_data, ["img1.jpg"])
+        self.assertEqual(item0.audio_data, "audio1.mp3")
+        self.assertEqual(item0.sampling_params, {"temp": 0.7})
+        self.assertEqual(item0.rid, "id1")
+        self.assertEqual(item0.return_logprob, True)
+        self.assertEqual(item0.logprob_start_len, 10)
+        self.assertEqual(item0.top_logprobs_num, 5)
+        self.assertEqual(item0.token_ids_logprob, [7, 8, 9])
+        self.assertEqual(item0.stream, True)
+        self.assertEqual(item0.log_metrics, True)
+        self.assertEqual(item0.modalities, "image")
+        self.assertEqual(item0.lora_path, "path1")
+        self.assertEqual(item0.custom_logit_processor, "processor1")
+        self.assertEqual(item0.return_hidden_states, True)
+
+    def test_regenerate_rid(self):
+        """Test the regenerate_rid method."""
+        req = GenerateReqInput(text="Hello")
+        req.normalize_batch_and_arguments()
+
+        original_rid = req.rid
+        new_rid = req.regenerate_rid()
+
+        self.assertNotEqual(original_rid, new_rid)
+        self.assertEqual(req.rid, new_rid)
+
+    def test_error_cases(self):
+        """Test various error cases."""
+        # Test when neither text, input_ids, nor input_embeds is provided
+        with self.assertRaises(ValueError):
+            req = GenerateReqInput()
+            req.normalize_batch_and_arguments()
+
+        # Test when all of text, input_ids, and input_embeds are provided
+        with self.assertRaises(ValueError):
+            req = GenerateReqInput(
+                text="Hello", input_ids=[1, 2, 3], input_embeds=[[0.1, 0.2]]
+            )
+            req.normalize_batch_and_arguments()
+
+    def test_multiple_input_formats(self):
+        """Test different combinations of input formats."""
+        # Test with text only
+        req = GenerateReqInput(text="Hello")
+        req.normalize_batch_and_arguments()
+        self.assertTrue(req.is_single)
+
+        # Test with input_ids only
+        req = GenerateReqInput(input_ids=[1, 2, 3])
+        req.normalize_batch_and_arguments()
+        self.assertTrue(req.is_single)
+
+        # Test with input_embeds only
+        req = GenerateReqInput(input_embeds=[[0.1, 0.2]])
+        req.normalize_batch_and_arguments()
+        self.assertTrue(req.is_single)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_jinja_template_utils.py b/test/srt/test_jinja_template_utils.py
new file mode 100644
index 000000000..46e634006
--- /dev/null
+++ b/test/srt/test_jinja_template_utils.py
@@ -0,0 +1,310 @@
+"""
+Unit tests for Jinja chat template utils.
+"""
+
+import unittest
+
+from sglang.srt.parser.jinja_template_utils import (
+    detect_jinja_template_content_format,
+    process_content_for_template_format,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestTemplateContentFormatDetection(CustomTestCase):
+    """Test template content format detection functionality."""
+
+    def test_detect_llama4_openai_format(self):
+        """Test detection of llama4-style template (should be 'openai' format)."""
+        llama4_pattern = """
+{%- for message in messages %}
+    {%- if message['content'] is string %}
+        {{- message['content'] }}
+    {%- else %}
+        {%- for content in message['content'] %}
+            {%- if content['type'] == 'image' %}
+                {{- '<|image|>' }}
+            {%- elif content['type'] == 'text' %}
+                {{- content['text'] | trim }}
+            {%- endif %}
+        {%- endfor %}
+    {%- endif %}
+{%- endfor %}
+        """
+
+        result = detect_jinja_template_content_format(llama4_pattern)
+        self.assertEqual(result, "openai")
+
+    def test_detect_deepseek_string_format(self):
+        """Test detection of deepseek-style template (should be 'string' format)."""
+        deepseek_pattern = """
+{%- for message in messages %}
+    {%- if message['role'] == 'user' %}
+        {{- '<|User|>' + message['content'] + '<|Assistant|>' }}
+    {%- endif %}
+{%- endfor %}
+        """
+
+        result = detect_jinja_template_content_format(deepseek_pattern)
+        self.assertEqual(result, "string")
+
+    def test_detect_invalid_template(self):
+        """Test handling of invalid template (should default to 'string')."""
+        invalid_pattern = "{{{{ invalid jinja syntax }}}}"
+
+        result = detect_jinja_template_content_format(invalid_pattern)
+        self.assertEqual(result, "string")
+
+    def test_detect_empty_template(self):
+        """Test handling of empty template (should default to 'string')."""
+        result = detect_jinja_template_content_format("")
+        self.assertEqual(result, "string")
+
+    def test_detect_msg_content_pattern(self):
+        """Test detection of template with msg.content pattern (should be 'openai' format)."""
+        msg_content_pattern = """
+[gMASK]<sop>
+{%- for msg in messages %}
+    {%- if msg.role == 'system' %}
+<|system|>
+{{ msg.content }}
+    {%- elif msg.role == 'user' %}
+<|user|>{{ '\n' }}
+        {%- if msg.content is string %}
+{{ msg.content }}
+        {%- else %}
+            {%- for item in msg.content %}
+                {%- if item.type == 'video' or 'video' in item %}
+<|begin_of_video|><|video|><|end_of_video|>
+                {%- elif item.type == 'image' or 'image' in item %}
+<|begin_of_image|><|image|><|end_of_image|>
+                {%- elif item.type == 'text' %}
+{{ item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+    {%- elif msg.role == 'assistant' %}
+        {%- if msg.metadata %}
+<|assistant|>{{ msg.metadata }}
+{{ msg.content }}
+        {%- else %}
+<|assistant|>
+{{ msg.content }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{% if add_generation_prompt %}<|assistant|>
+{% endif %}
+        """
+
+        result = detect_jinja_template_content_format(msg_content_pattern)
+        self.assertEqual(result, "openai")
+
+    def test_detect_m_content_pattern(self):
+        """Test detection of template with m.content pattern (should be 'openai' format)."""
+        msg_content_pattern = """
+[gMASK]<sop>
+{%- for m in messages %}
+    {%- if m.role == 'system' %}
+<|system|>
+{{ m.content }}
+    {%- elif m.role == 'user' %}
+<|user|>{{ '\n' }}
+        {%- if m.content is string %}
+{{ m.content }}
+        {%- else %}
+            {%- for item in m.content %}
+                {%- if item.type == 'video' or 'video' in item %}
+<|begin_of_video|><|video|><|end_of_video|>
+                {%- elif item.type == 'image' or 'image' in item %}
+<|begin_of_image|><|image|><|end_of_image|>
+                {%- elif item.type == 'text' %}
+{{ item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+    {%- elif m.role == 'assistant' %}
+        {%- if m.metadata %}
+<|assistant|>{{ m.metadata }}
+{{ m.content }}
+        {%- else %}
+<|assistant|>
+{{ m.content }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{% if add_generation_prompt %}<|assistant|>
+{% endif %}
+        """
+
+        result = detect_jinja_template_content_format(msg_content_pattern)
+        self.assertEqual(result, "openai")
+
+    def test_process_content_openai_format(self):
+        """Test content processing for openai format."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Look at this image:"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "http://example.com/image.jpg"},
+                },
+                {"type": "text", "text": "What do you see?"},
+            ],
+        }
+
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
+        )
+
+        # Check that image_data was extracted
+        self.assertEqual(len(image_data), 1)
+        self.assertEqual(image_data[0].url, "http://example.com/image.jpg")
+
+        # Check that content was normalized
+        expected_content = [
+            {"type": "text", "text": "Look at this image:"},
+            {"type": "image"},  # normalized from image_url
+            {"type": "text", "text": "What do you see?"},
+        ]
+        self.assertEqual(result["content"], expected_content)
+        self.assertEqual(result["role"], "user")
+
+    def test_process_content_string_format(self):
+        """Test content processing for string format."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Hello"},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "http://example.com/image.jpg"},
+                },
+                {"type": "text", "text": "world"},
+            ],
+        }
+
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "string", image_data, video_data, audio_data, modalities
+        )
+
+        # For string format, should flatten to text only
+        self.assertEqual(result["content"], "Hello world")
+        self.assertEqual(result["role"], "user")
+
+        # Image data should not be extracted for string format
+        self.assertEqual(len(image_data), 0)
+
+    def test_process_content_with_audio(self):
+        """Test content processing with audio content."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Listen to this:"},
+                {
+                    "type": "audio_url",
+                    "audio_url": {"url": "http://example.com/audio.mp3"},
+                },
+            ],
+        }
+
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
+        )
+
+        # Check that audio_data was extracted
+        self.assertEqual(len(audio_data), 1)
+        self.assertEqual(audio_data[0], "http://example.com/audio.mp3")
+
+        # Check that content was normalized
+        expected_content = [
+            {"type": "text", "text": "Listen to this:"},
+            {"type": "audio"},  # normalized from audio_url
+        ]
+        self.assertEqual(result["content"], expected_content)
+
+    def test_process_content_already_string(self):
+        """Test processing content that's already a string."""
+        msg_dict = {"role": "user", "content": "Hello world"}
+
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
+        )
+
+        # Should pass through unchanged
+        self.assertEqual(result["content"], "Hello world")
+        self.assertEqual(result["role"], "user")
+        self.assertEqual(len(image_data), 0)
+
+    def test_process_content_with_modalities(self):
+        """Test content processing with modalities field."""
+        msg_dict = {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": "http://example.com/image.jpg"},
+                    "modalities": ["vision"],
+                }
+            ],
+        }
+
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
+        )
+
+        # Check that modalities was extracted
+        self.assertEqual(len(modalities), 1)
+        self.assertEqual(modalities[0], ["vision"])
+
+    def test_process_content_filter_none_values(self):
+        """Test that None values are filtered out of processed messages."""
+        msg_dict = {
+            "role": "user",
+            "content": "Hello",
+            "name": None,
+            "tool_call_id": None,
+        }
+
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        result = process_content_for_template_format(
+            msg_dict, "string", image_data, video_data, audio_data, modalities
+        )
+
+        # None values should be filtered out
+        expected_keys = {"role", "content"}
+        self.assertEqual(set(result.keys()), expected_keys)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_kv_events.py b/test/srt/test_kv_events.py
new file mode 100644
index 000000000..d333738c7
--- /dev/null
+++ b/test/srt/test_kv_events.py
@@ -0,0 +1,377 @@
+import time
+import unittest
+
+import msgspec
+import requests
+import zmq
+from msgspec.msgpack import Decoder
+
+from sglang.srt.disaggregation.kv_events import (
+    AllBlocksCleared,
+    BlockRemoved,
+    BlockStored,
+    EventBatch,
+    KVCacheEvent,
+    KVEventBatch,
+)
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestKvEvents(CustomTestCase):
+    def test_kv_events_enabled(self):
+        """Test that kv events are sent and received by subscriber data when enabled"""
+
+        # Launch kv events subscriber
+        decoder = Decoder(type=KVEventBatch)
+        context = zmq.Context()
+        sub = context.socket(zmq.SUB)
+        sub.connect("tcp://localhost:5557")
+        topic = "kv-events"
+        sub.setsockopt_string(zmq.SUBSCRIBE, topic)
+
+        # Launch sglang server
+        process = popen_launch_server(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            DEFAULT_URL_FOR_TEST,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--kv-events-config",
+                '{"publisher": "zmq", "topic": "kv-events"}',
+                "--max-total-tokens",
+                32,
+                "--cuda-graph-max-bs",
+                2,
+                "--enable-dp-attention",
+                "--dp-size",
+                1,
+            ],
+        )
+
+        try:
+            # Make some requests to generate some metrics
+            response = requests.get(f"{DEFAULT_URL_FOR_TEST}/health_generate")
+            self.assertEqual(response.status_code, 200)
+
+            response = requests.post(
+                f"{DEFAULT_URL_FOR_TEST}/generate",
+                json={
+                    "text": "The capital of France is",
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 32,
+                    },
+                },
+            )
+            response = requests.post(
+                f"{DEFAULT_URL_FOR_TEST}/generate",
+                json={
+                    "text": "The capital of Spain is",
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 32,
+                    },
+                },
+            )
+
+            # Expected events. These may be dependent on model used (meta-llama/Llama-3.2-1B-Instruct)
+            expected_events = [
+                # <begin> The capital city of France is
+                BlockStored(
+                    block_hashes=[-6650323075460941099],
+                    parent_block_hash=5740354900026072187,
+                    token_ids=[128000, 791, 6864, 3363, 315, 9822, 374],
+                    block_size=7,
+                    lora_id=None,
+                ),
+                # Paris. The Eiffel Tower
+                BlockStored(
+                    block_hashes=[-7584018293207282755],
+                    parent_block_hash=-6650323075460941099,
+                    token_ids=[12366, 13, 578, 469, 3168, 301, 22703],
+                    block_size=7,
+                    lora_id=None,
+                ),
+                BlockStored(
+                    block_hashes=[-8753497827991233192],
+                    parent_block_hash=5740354900026072187,
+                    token_ids=[0],
+                    block_size=1,
+                    lora_id=None,
+                ),
+                BlockRemoved(block_hashes=[-6650323075460941099]),
+                # <begin> The capital
+                BlockStored(
+                    block_hashes=[-2697055055087824455],
+                    parent_block_hash=5740354900026072187,
+                    token_ids=[128000, 791, 6864],
+                    block_size=3,
+                    lora_id=None,
+                ),
+                # city of France is
+                BlockStored(
+                    block_hashes=[-7505627135785778022],
+                    parent_block_hash=-2697055055087824455,
+                    token_ids=[3363, 315, 9822, 374],
+                    block_size=4,
+                    lora_id=None,
+                ),
+                # of France is
+                BlockStored(
+                    block_hashes=[-3861108700662737012],
+                    parent_block_hash=-2697055055087824455,
+                    token_ids=[315, 9822, 374],
+                    block_size=3,
+                    lora_id=None,
+                ),
+                BlockRemoved(block_hashes=[-7584018293207282755]),
+                BlockRemoved(block_hashes=[-8753497827991233192]),
+                BlockRemoved(block_hashes=[-7505627135785778022]),
+                # Paris. The Eiffel Tower is located in Paris. The Eiffel Tower is a famous landmark in Paris
+                BlockStored(
+                    block_hashes=[-3064341286825792715],
+                    parent_block_hash=-3861108700662737012,
+                    token_ids=[
+                        12366,
+                        13,
+                        578,
+                        469,
+                        3168,
+                        301,
+                        22703,
+                        374,
+                        7559,
+                        304,
+                        12366,
+                        13,
+                        578,
+                        469,
+                        3168,
+                        301,
+                        22703,
+                        374,
+                        264,
+                        11495,
+                        38350,
+                        304,
+                        12366,
+                    ],
+                    block_size=23,
+                    lora_id=None,
+                ),
+                BlockRemoved(block_hashes=[-3861108700662737012]),
+                # of
+                BlockStored(
+                    block_hashes=[6115672085296369592],
+                    parent_block_hash=-2697055055087824455,
+                    token_ids=[315],
+                    block_size=1,
+                    lora_id=None,
+                ),
+                # France is
+                BlockStored(
+                    block_hashes=[4208810872343132234],
+                    parent_block_hash=6115672085296369592,
+                    token_ids=[9822, 374],
+                    block_size=2,
+                    lora_id=None,
+                ),
+                # Spain is
+                BlockStored(
+                    block_hashes=[1675819893649989955],
+                    parent_block_hash=6115672085296369592,
+                    token_ids=[18157, 374],
+                    block_size=2,
+                    lora_id=None,
+                ),
+                BlockRemoved(block_hashes=[-3064341286825792715]),
+                # Madrid. The capital of France is Paris. The capital of Italy is Rome. The capital of Spain is Madrid.
+                BlockStored(
+                    block_hashes=[-8505834929190027295],
+                    parent_block_hash=1675819893649989955,
+                    token_ids=[
+                        25048,
+                        13,
+                        578,
+                        6864,
+                        315,
+                        9822,
+                        374,
+                        12366,
+                        13,
+                        578,
+                        6864,
+                        315,
+                        15704,
+                        374,
+                        22463,
+                        13,
+                        578,
+                        6864,
+                        315,
+                        18157,
+                        374,
+                        25048,
+                        13,
+                    ],
+                    block_size=23,
+                    lora_id=None,
+                ),
+            ]
+
+            # Get events
+            events = []
+            start = time.time()
+            max_wait_s = 5
+            while (
+                len(events) < len(expected_events)
+                and (time.time() - start) < max_wait_s
+            ):
+                _, seq_bytes, payload = sub.recv_multipart()
+                event_batch = decoder.decode(payload)
+                for event in event_batch.events:
+                    events.append(event)
+
+            for expected in expected_events:
+                self.assertIn(expected, events)
+
+        finally:
+            kill_process_tree(process.pid)
+
+    def test_kv_events_attn_dp(self):
+        """Test that kv events are properly tagged with DP rank in attention DP mode"""
+
+        # Launch multiple subscribers for different DP ranks
+        decoder = Decoder(type=KVEventBatch)
+        context = zmq.Context()
+
+        # Subscribe to both DP rank endpoints
+        sub_dp0 = context.socket(zmq.SUB)
+        sub_dp0.connect("tcp://localhost:5557")  # DP rank 0
+        topic = "kv-events"
+        sub_dp0.setsockopt_string(zmq.SUBSCRIBE, topic)
+
+        sub_dp1 = context.socket(zmq.SUB)
+        sub_dp1.connect("tcp://localhost:5558")  # DP rank 1 (offset by rank)
+        sub_dp1.setsockopt_string(zmq.SUBSCRIBE, topic)
+
+        # Launch sglang server with DP attention enabled
+        process = popen_launch_server(
+            "silence09/DeepSeek-R1-Small-2layers",
+            DEFAULT_URL_FOR_TEST,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--kv-events-config",
+                '{"publisher": "zmq", "topic": "kv-events"}',
+                "--max-total-tokens",
+                64,
+                "--cuda-graph-max-bs",
+                4,
+                "--enable-dp-attention",
+                "--dp-size",
+                2,
+                "--tp-size",
+                2,
+            ],
+        )
+
+        try:
+            # Make requests to generate events
+            response = requests.get(f"{DEFAULT_URL_FOR_TEST}/health_generate")
+            self.assertEqual(response.status_code, 200)
+
+            # Send multiple requests to trigger events from both DP ranks
+            for i in range(4):
+                response = requests.post(
+                    f"{DEFAULT_URL_FOR_TEST}/generate",
+                    json={
+                        "text": f"Request {i}: The capital of country {i} is",
+                        "sampling_params": {
+                            "temperature": 0,
+                            "max_new_tokens": 16,
+                        },
+                    },
+                )
+
+            # Collect events from both DP ranks
+            events_dp0 = []
+            events_dp1 = []
+            start = time.time()
+            max_wait_s = 10
+            min_events_per_rank = 3  # Expect at least a few events from each rank
+
+            while (time.time() - start) < max_wait_s and (
+                len(events_dp0) < min_events_per_rank
+                or len(events_dp1) < min_events_per_rank
+            ):
+                # Check DP rank 0
+                if sub_dp0.poll(timeout=100):  # 100ms timeout
+                    _, seq_bytes, payload = sub_dp0.recv_multipart()
+                    event_batch = decoder.decode(payload)
+                    print(
+                        f"DP Rank 0 - EventBatch: ts={event_batch.ts}, attn_dp_rank={event_batch.attn_dp_rank}"
+                    )
+                    self.assertEqual(
+                        event_batch.attn_dp_rank,
+                        0,
+                        "DP rank 0 events should have attn_dp_rank=0",
+                    )
+                    for event in event_batch.events:
+                        print(f"  DP0 - {event}")
+                        events_dp0.append(event)
+
+                # Check DP rank 1
+                if sub_dp1.poll(timeout=100):  # 100ms timeout
+                    _, seq_bytes, payload = sub_dp1.recv_multipart()
+                    event_batch = decoder.decode(payload)
+                    print(
+                        f"DP Rank 1 - EventBatch: ts={event_batch.ts}, attn_dp_rank={event_batch.attn_dp_rank}"
+                    )
+                    self.assertEqual(
+                        event_batch.attn_dp_rank,
+                        1,
+                        "DP rank 1 events should have attn_dp_rank=1",
+                    )
+                    for event in event_batch.events:
+                        print(f"  DP1 - {event}")
+                        events_dp1.append(event)
+
+            # Verify we got events from both DP ranks
+            print(f"Collected {len(events_dp0)} events from DP rank 0")
+            print(f"Collected {len(events_dp1)} events from DP rank 1")
+
+            self.assertGreaterEqual(
+                len(events_dp0),
+                min_events_per_rank,
+                f"Expected at least {min_events_per_rank} events from DP rank 0",
+            )
+            self.assertGreaterEqual(
+                len(events_dp1),
+                min_events_per_rank,
+                f"Expected at least {min_events_per_rank} events from DP rank 1",
+            )
+
+            # Verify event types are as expected
+            for events in [events_dp0, events_dp1]:
+                for event in events:
+                    self.assertIsInstance(
+                        event,
+                        (BlockStored, BlockRemoved, AllBlocksCleared),
+                        f"Event should be a KV cache event, got {type(event)}",
+                    )
+
+        finally:
+            sub_dp0.close()
+            sub_dp1.close()
+            context.term()
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_local_attn.py b/test/srt/test_local_attn.py
new file mode 100644
index 000000000..923ffdd5e
--- /dev/null
+++ b/test/srt/test_local_attn.py
@@ -0,0 +1,72 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+@unittest.skipIf(get_device_sm() < 90, "Test requires CUDA SM 90 or higher")
+class TestFlashAttention3LocalAttn(CustomTestCase):
+    model = DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION
+    base_url = DEFAULT_URL_FOR_TEST
+    accuracy_threshold = 0.90
+
+    @classmethod
+    def get_server_args(cls):
+        return [
+            "--cuda-graph-max-bs",
+            "2",
+            "--attention-backend",
+            "fa3",
+            "--tp",
+            "4",
+            "--context-length",
+            "1000000",
+        ]
+
+    @classmethod
+    def setUpClass(cls):
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=cls.get_server_args(),
+            env=os.environ,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=4,
+            num_questions=100,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+            data_path=None,
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        # Use the appropriate metric key based on the test class
+        metric_key = "accuracy"
+        self.assertGreater(metrics[metric_key], self.accuracy_threshold)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_metrics.py b/test/srt/test_metrics.py
new file mode 100644
index 000000000..ce85e0d8a
--- /dev/null
+++ b/test/srt/test_metrics.py
@@ -0,0 +1,88 @@
+import unittest
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestEnableMetrics(CustomTestCase):
+    def test_metrics_enabled(self):
+        """Test that metrics endpoint returns data when enabled"""
+        process = popen_launch_server(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            DEFAULT_URL_FOR_TEST,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--enable-metrics", "--cuda-graph-max-bs", 2],
+        )
+
+        try:
+            # Make some requests to generate some metrics
+            response = requests.get(f"{DEFAULT_URL_FOR_TEST}/health_generate")
+            self.assertEqual(response.status_code, 200)
+
+            response = requests.post(
+                f"{DEFAULT_URL_FOR_TEST}/generate",
+                json={
+                    "text": "The capital of France is",
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 32,
+                    },
+                    "stream": True,
+                },
+                stream=True,
+            )
+            for _ in response.iter_lines(decode_unicode=False):
+                pass
+
+            # Get metrics
+            metrics_response = requests.get(f"{DEFAULT_URL_FOR_TEST}/metrics")
+            self.assertEqual(metrics_response.status_code, 200)
+            metrics_content = metrics_response.text
+
+            print(f"metrics_content=\n{metrics_content}")
+
+            # Verify essential metrics are present
+            essential_metrics = [
+                "sglang:num_running_reqs",
+                "sglang:num_used_tokens",
+                "sglang:token_usage",
+                "sglang:gen_throughput",
+                "sglang:num_queue_reqs",
+                "sglang:num_grammar_queue_reqs",
+                "sglang:cache_hit_rate",
+                "sglang:spec_accept_length",
+                "sglang:prompt_tokens_total",
+                "sglang:generation_tokens_total",
+                "sglang:cached_tokens_total",
+                "sglang:num_requests_total",
+                "sglang:time_to_first_token_seconds",
+                "sglang:inter_token_latency_seconds",
+                "sglang:e2e_request_latency_seconds",
+            ]
+
+            for metric in essential_metrics:
+                self.assertIn(metric, metrics_content, f"Missing metric: {metric}")
+
+            # Verify model name label is present and correct
+            expected_model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+            self.assertIn(f'model_name="{expected_model_name}"', metrics_content)
+
+            # Verify metrics have values (not empty)
+            self.assertIn("_sum{", metrics_content)
+            self.assertIn("_count{", metrics_content)
+            self.assertIn("_bucket{", metrics_content)
+
+        finally:
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_metrics_utils.py b/test/srt/test_metrics_utils.py
new file mode 100644
index 000000000..4d33ad950
--- /dev/null
+++ b/test/srt/test_metrics_utils.py
@@ -0,0 +1,137 @@
+import unittest
+
+from sglang.srt.metrics.utils import generate_buckets, two_sides_exponential_buckets
+
+
+class TestMetricsUtils(unittest.TestCase):
+    """Test cases for metrics utility functions."""
+
+    def test_two_sides_exponential_buckets_basic(self):
+        """Test basic functionality of two_sides_exponential_buckets."""
+        # Test with simple parameters
+        count = 5
+        buckets = two_sides_exponential_buckets(middle=10.0, base=2.0, count=count)
+
+        # Should contain the middle value
+        self.assertIn(10.0, buckets)
+
+        # Should be sorted
+        self.assertEqual(buckets, sorted(buckets))
+
+        # Should have unique values (no duplicates)
+        self.assertEqual(len(buckets), len(set(buckets)))
+
+        # Should have reasonable number of buckets (not exactly count due to ceiling and deduplication)
+        self.assertGreaterEqual(len(buckets), 3)
+        self.assertLessEqual(len(buckets), count + 2)
+
+    def test_two_sides_exponential_buckets_specific_values(self):
+        """Test specific values for two_sides_exponential_buckets."""
+        buckets = two_sides_exponential_buckets(middle=100.0, base=2.0, count=4)
+        expected_values = [96.0, 98.0, 100.0, 102.0, 104.0]
+        self.assertEqual(buckets, expected_values)
+
+    def test_two_sides_exponential_buckets_negative_values(self):
+        """Test two_sides_exponential_buckets with values that could go negative."""
+        buckets = two_sides_exponential_buckets(middle=5.0, base=3.0, count=4)
+
+        # Should not contain negative values (max(0, middle - distance))
+        for bucket in buckets:
+            self.assertGreaterEqual(bucket, 0.0)
+
+        # Should contain the middle value
+        self.assertIn(5.0, buckets)
+
+    def test_two_sides_exponential_buckets_edge_cases(self):
+        """Test edge cases for two_sides_exponential_buckets."""
+        # Count = 1
+        buckets = two_sides_exponential_buckets(middle=10.0, base=2.0, count=1)
+        self.assertIn(10.0, buckets)
+
+        # Very small middle value
+        buckets = two_sides_exponential_buckets(middle=0.1, base=2.0, count=2)
+        self.assertIn(0.1, buckets)
+        for bucket in buckets:
+            self.assertGreaterEqual(bucket, 0.0)
+
+    def test_generate_buckets_default(self):
+        """Test generate_buckets with default rule."""
+        default_buckets = [1.0, 5.0, 10.0, 50.0, 100.0]
+
+        # Test with "default" rule
+        result = generate_buckets(["default"], default_buckets)
+        self.assertEqual(result, default_buckets)
+
+        # Test with None (should default to "default")
+        result = generate_buckets(None, default_buckets)
+        self.assertEqual(result, default_buckets)
+
+        # Test with empty (should default to "default")
+        result = generate_buckets(None, default_buckets)
+        self.assertEqual(result, default_buckets)
+
+    def test_generate_buckets_tse(self):
+        """Test generate_buckets with tse (two sides exponential) rule."""
+        default_buckets = [1.0, 5.0, 10.0]
+
+        # Test with "tse" rule
+        result = generate_buckets(["tse", "10", "2.0", "4"], default_buckets)
+
+        # Should return the same as calling two_sides_exponential_buckets directly
+        expected = two_sides_exponential_buckets(10.0, 2.0, 4)
+        self.assertEqual(result, expected)
+
+    def test_generate_buckets_customer(self):
+        """Test generate_buckets with customer rule."""
+        default_buckets = [1.0, 5.0, 10.0]
+
+        # Test with "customer" rule
+        result = generate_buckets(
+            ["customer", "1.5", "3.2", "7.8", "15.6"], default_buckets
+        )
+        expected = [1.5, 3.2, 7.8, 15.6]
+        self.assertEqual(result, expected)
+
+    def test_generate_buckets_customer_with_integers(self):
+        """Test generate_buckets with customer rule using integer strings."""
+        default_buckets = [1.0, 5.0, 10.0]
+
+        # Test with integer strings
+        result = generate_buckets(["customer", "1", "5", "10", "50"], default_buckets)
+        expected = [1.0, 5.0, 10.0, 50.0]
+        self.assertEqual(result, expected)
+
+    def test_generate_buckets_preserves_order_and_type(self):
+        """Test that generate_buckets preserves order and returns floats."""
+        default_buckets = [1, 5, 10, 50, 100]  # integers
+
+        # Test default rule
+        result = generate_buckets(["default"], default_buckets)
+        self.assertEqual(result, default_buckets)
+        self.assertIsInstance(result, list)
+
+        # Test customer rule with proper float conversion
+        result = generate_buckets(
+            ["customer", "100", "50", "10", "5", "1"], default_buckets
+        )
+        expected = [1.0, 5.0, 10.0, 50.0, 100.0]
+        self.assertEqual(result, expected)
+
+        # All values should be floats
+        for value in result:
+            self.assertIsInstance(value, float)
+
+    def test_integration_tse_through_generate_buckets(self):
+        """Test integration of TSE buckets through generate_buckets function."""
+        default_buckets = [1.0, 10.0, 100.0]
+
+        # Generate buckets using both methods
+        direct_result = two_sides_exponential_buckets(50.0, 1.5, 6)
+        indirect_result = generate_buckets(["tse", "50.0", "1.5", "6"], default_buckets)
+
+        # Results should be identical
+        self.assertEqual(direct_result, indirect_result)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py
new file mode 100644
index 000000000..af867797c
--- /dev/null
+++ b/test/srt/test_mla.py
@@ -0,0 +1,52 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestMLA(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--enable-torch-compile",
+                "--torch-compile-max-bs",
+                "4",
+                "--chunked-prefill-size",
+                "256",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.8)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_mla_deepseek_v3.py b/test/srt/test_mla_deepseek_v3.py
new file mode 100644
index 000000000..634100fdb
--- /dev/null
+++ b/test/srt/test_mla_deepseek_v3.py
@@ -0,0 +1,190 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+import requests
+import torch
+
+from sglang.srt.utils import is_cuda, is_hip, kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestMLADeepseekV3(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code", "--chunked-prefill-size", "256"]
+        if is_cuda():
+            other_args.extend(["--enable-torch-compile", "--cuda-graph-max-bs", "2"])
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+
+class TestMLADeepseekV3DisableFusedFunc(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ["SGLANG_CI_DISABLE_MOE_FUSED_FUNC"] = "1"
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code", "--chunked-prefill-size", "256"]
+        if is_cuda():
+            other_args.extend(["--cuda-graph-max-bs", "2"])
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+
+@unittest.skipIf(is_hip(), "FA is not available.")
+class TestMLADeepseekV3Fa3Fp8Kvcache(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [
+            "--trust-remote-code",
+            "--chunked-prefill-size",
+            "256",
+            "--kv-cache-dtype",
+            "fp8_e4m3",
+        ]
+        if is_cuda():
+            other_args.extend(["--attention-backend", "fa3"])
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+
+class TestDeepseekV3MTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [
+            "--trust-remote-code",
+            "--cuda-graph-max-bs",
+            "2",
+            "--disable-radix",
+            "--enable-torch-compile",
+            "--torch-compile-max-bs",
+            "1",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-num-steps",
+            "2",
+            "--speculative-eagle-topk",
+            "4",
+            "--speculative-num-draft-tokens",
+            "4",
+        ]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, 2.5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_mla_flashinfer.py b/test/srt/test_mla_flashinfer.py
new file mode 100644
index 000000000..f72aef5a5
--- /dev/null
+++ b/test/srt/test_mla_flashinfer.py
@@ -0,0 +1,124 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestFlashinferMLA(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(
+                [
+                    "--enable-torch-compile",
+                    "--cuda-graph-max-bs",
+                    "4",
+                    "--attention-backend",
+                    "flashinfer",
+                ]
+            )
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.615)
+
+
+class TestFlashinferMLAMTP(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(
+                [
+                    "--cuda-graph-max-bs",
+                    "4",
+                    "--enable-torch-compile",
+                    "--torch-compile-max-bs",
+                    "1",
+                    "--speculative-algorithm",
+                    "EAGLE",
+                    "--speculative-num-steps",
+                    "3",
+                    "--speculative-eagle-topk",
+                    "1",
+                    "--speculative-num-draft-tokens",
+                    "4",
+                    "--attention-backend",
+                    "flashinfer",
+                ]
+            )
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        print(f"{server_info=}")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, 2.5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_mla_fp8.py b/test/srt/test_mla_fp8.py
new file mode 100644
index 000000000..a2fac9883
--- /dev/null
+++ b/test/srt/test_mla_fp8.py
@@ -0,0 +1,49 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestMLA(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--kv-cache-dtype",
+                "fp8_e5m2",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.8
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py
new file mode 100644
index 000000000..519cb0554
--- /dev/null
+++ b/test/srt/test_mla_int8_deepseek_v3.py
@@ -0,0 +1,214 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+
+class TestMLADeepseekV3ChannelInt8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "sgl-project/sglang-ci-dsv3-channel-int8-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(["--enable-torch-compile", "--cuda-graph-max-bs", "2"])
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreaterEqual(metrics["accuracy"], 0.61)
+
+
+class TestDeepseekV3MTPChannelInt8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "sgl-project/sglang-ci-dsv3-channel-int8-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(
+                [
+                    "--cuda-graph-max-bs",
+                    "2",
+                    "--disable-radix",
+                    "--enable-torch-compile",
+                    "--torch-compile-max-bs",
+                    "1",
+                    "--speculative-algorithm",
+                    "EAGLE",
+                    "--speculative-draft-model-path",
+                    "sgl-project/sglang-ci-dsv3-channel-int8-test-NextN",
+                    "--speculative-num-steps",
+                    "2",
+                    "--speculative-eagle-topk",
+                    "4",
+                    "--speculative-num-draft-tokens",
+                    "4",
+                ]
+            )
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, 2.5)
+
+
+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
+class TestMLADeepseekV3BlockInt8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "sgl-project/sglang-ci-dsv3-block-int8-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(["--enable-torch-compile", "--cuda-graph-max-bs", "2"])
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+
+class TestDeepseekV3MTPBlockInt8(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "sgl-project/sglang-ci-dsv3-block-int8-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(
+                [
+                    "--cuda-graph-max-bs",
+                    "2",
+                    "--disable-radix",
+                    "--enable-torch-compile",
+                    "--torch-compile-max-bs",
+                    "1",
+                    "--speculative-algorithm",
+                    "EAGLE",
+                    "--speculative-num-steps",
+                    "2",
+                    "--speculative-eagle-topk",
+                    "4",
+                    "--speculative-num-draft-tokens",
+                    "4",
+                ]
+            )
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(metrics)
+
+        self.assertGreater(metrics["accuracy"], 0.60)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, 2.5)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_mla_tp.py b/test/srt/test_mla_tp.py
new file mode 100644
index 000000000..e957cf2de
--- /dev/null
+++ b/test/srt/test_mla_tp.py
@@ -0,0 +1,66 @@
+import unittest
+from types import SimpleNamespace
+
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestDeepseekTP2(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmsys/sglang-ci-dsv3-test"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code"]
+        if torch.cuda.is_available() and torch.version.cuda:
+            other_args.extend(
+                ["--tp", "2", "--enable-torch-compile", "--cuda-graph-max-bs", "2"]
+            )
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+    def test_gsm8k_bs1(self):
+        # test torch compile accuracy for bs=1
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=10,
+            max_new_tokens=512,
+            parallel=1,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        self.assertGreater(metrics["accuracy"], 0.62)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_modelopt.py b/test/srt/test_modelopt.py
new file mode 100644
index 000000000..ef6a959ec
--- /dev/null
+++ b/test/srt/test_modelopt.py
@@ -0,0 +1,58 @@
+import unittest
+from types import SimpleNamespace
+
+import torch
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8,
+    DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestEvalFP8ModelOptQuantAccuracy(CustomTestCase):
+
+    def _run_test(self, model, other_args, expected_score):
+        base_url = DEFAULT_URL_FOR_TEST
+        other_args = other_args or []
+
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=base_url,
+                model=model,
+                eval_name="mmlu",
+                num_examples=64,
+                num_threads=32,
+                temperature=0.1,
+            )
+
+            metrics = run_eval(args)
+            self.assertGreaterEqual(metrics["score"], expected_score)
+        finally:
+            kill_process_tree(process.pid)
+
+    @unittest.skipIf(
+        torch.version.hip is not None, "modelopt quantization unsupported on ROCm"
+    )
+    def test_mmlu_offline_only(self):
+        """Test with offline quantization only."""
+        self._run_test(
+            model=DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8,
+            other_args=[
+                "--revision",
+                DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION,
+            ],
+            expected_score=0.64,
+        )
diff --git a/test/srt/test_modelopt_fp8kvcache.py b/test/srt/test_modelopt_fp8kvcache.py
new file mode 100644
index 000000000..a4704c239
--- /dev/null
+++ b/test/srt/test_modelopt_fp8kvcache.py
@@ -0,0 +1,30 @@
+import unittest
+
+from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+
+from sglang.srt.layers.quantization.modelopt_quant import (
+    ModelOptFp8Config,
+    ModelOptFp8KVCacheMethod,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestModelOptFp8KVCacheMethod(CustomTestCase):
+    def test_kv_cache_method_initialization(self):
+        """Test that ModelOptFp8KVCacheMethod can be instantiated and
+        inherits from BaseKVCacheMethod."""
+        # Create a ModelOptFp8Config object
+        quant_config = ModelOptFp8Config(is_checkpoint_fp8_serialized=True)
+
+        # Instantiate the KV cache method
+        kv_cache_method = ModelOptFp8KVCacheMethod(quant_config)
+
+        # Check inheritance
+        self.assertIsInstance(kv_cache_method, BaseKVCacheMethod)
+
+        # Check that the quant_config is stored
+        self.assertEqual(kv_cache_method.quant_config, quant_config)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_models_from_modelscope.py b/test/srt/test_models_from_modelscope.py
new file mode 100644
index 000000000..dacaae30a
--- /dev/null
+++ b/test/srt/test_models_from_modelscope.py
@@ -0,0 +1,40 @@
+import os
+import shutil
+import subprocess
+import unittest
+from unittest import mock
+
+from sglang.srt.utils import prepare_model_and_tokenizer
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestDownloadFromModelScope(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "iic/nlp_lstmcrf_word-segmentation_chinese-news"
+        stat, output = subprocess.getstatusoutput("pip install modelscope")
+
+        cls.with_modelscope_environ = {k: v for k, v in os.environ.items()}
+        cls.with_modelscope_environ["SGLANG_USE_MODELSCOPE"] = "True"
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    def test_prepare_model_and_tokenizer(self):
+        from modelscope.utils.file_utils import get_model_cache_root
+
+        model_cache_root = get_model_cache_root()
+        if os.path.exists(model_cache_root):
+            shutil.rmtree(model_cache_root)
+        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
+            model_path, tokenizer_path = prepare_model_and_tokenizer(
+                self.model, self.model
+            )
+            assert os.path.exists(os.path.join(model_path, "pytorch_model.bin"))
+            assert os.path.exists(os.path.join(tokenizer_path, "config.json"))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/srt/test_moe_eval_accuracy_large.py
new file mode 100644
index 000000000..26bbd247e
--- /dev/null
+++ b/test/srt/test_moe_eval_accuracy_large.py
@@ -0,0 +1,94 @@
+"""
+Usage:
+python -m unittest test_moe_eval_accuracy_large.TestMoEEvalAccuracyLarge.test_mmlu
+"""
+
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+
+class TestMoEEvalAccuracyLarge(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--log-level-http",
+                "warning",
+                "--tp",
+                "2",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=5000,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.62)
+
+        if is_in_ci():
+            write_github_step_summary(f"### test_mmlu\n" f'{metrics["score"]=:.4f}\n')
+
+    def test_human_eval(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="humaneval",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.40)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_human_eval\n" f'{metrics["score"]=:.4f}\n'
+            )
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.61)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_mgsm_en\n" f'{metrics["score"]=:.4f}\n'
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_mscclpp.py b/test/srt/test_mscclpp.py
new file mode 100644
index 000000000..894598b3d
--- /dev/null
+++ b/test/srt/test_mscclpp.py
@@ -0,0 +1,205 @@
+"""For Now, MSCCL is only supported on TP16 and TP8 case
+
+if [[ $RANK -eq 0  ]]; then
+    ray start --block --head --port=6379 &
+    python3 test_mscclpp.py;
+else
+    ray start --block --address=${MASTER_ADDR}:6379;
+fi
+"""
+
+import itertools
+import os
+import random
+import socket
+import unittest
+from contextlib import contextmanager, nullcontext
+from typing import Any, List, Optional, Union
+
+import ray
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup, ReduceOp
+
+from sglang.srt.distributed import init_distributed_environment
+from sglang.srt.distributed.communication_op import (  # noqa
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.custom_all_reduce import (
+    CustomAllreduce,
+)
+from sglang.srt.distributed.device_communicators.pymscclpp import PyMscclppCommunicator
+from sglang.srt.distributed.device_communicators.pynccl import PyNcclCommunicator
+from sglang.srt.distributed.parallel_state import (
+    get_tensor_model_parallel_group,
+    graph_capture,
+    initialize_model_parallel,
+    set_custom_all_reduce,
+    set_mscclpp_all_reduce,
+)
+from sglang.srt.distributed.utils import StatelessProcessGroup
+from sglang.test.test_utils import CustomTestCase
+
+
+def get_open_port() -> int:
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def multi_process_parallel(
+    world_size: int,
+    master_addr: str,
+    cls: Any,
+    test_target: Any,
+) -> None:
+
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    # NOTE: We need to set working_dir for distributed tests,
+    # otherwise we may get import errors on ray workers
+
+    ray.init(log_to_driver=True)
+
+    distributed_init_port = get_open_port()
+    refs = []
+    for rank in range(world_size):
+        refs.append(
+            test_target.remote(
+                cls, world_size, master_addr, rank, distributed_init_port
+            )
+        )
+    ray.get(refs)
+
+    ray.shutdown()
+
+
+class TestMSCCLAllReduce(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        random.seed(42)
+        # 1KB to 1MB
+        cls.test_sizes = [512, 4096, 32768, 262144, 524288]
+        cls.world_sizes = [8]
+        TEST_TP16 = int(os.getenv("SGL_MSCCLPP_TEST_TP16", "0"))
+        if TEST_TP16:
+            cls.world_sizes = [16]
+        cls.test_loop = 10
+
+    def test_graph_allreduce(self):
+        TEST_MASTER_ADDR = os.getenv("SGL_MSCCLPP_TEST_MASTER_ADDR", "localhost")
+        for world_size in self.world_sizes:
+            if world_size not in [8, 16]:
+                continue
+            multi_process_parallel(
+                world_size, TEST_MASTER_ADDR, self, self.graph_allreduce
+            )
+
+    def test_eager_allreduce(self):
+        TEST_MASTER_ADDR = os.getenv("SGL_MSCCLPP_TEST_MASTER_ADDR", "localhost")
+        for world_size in self.world_sizes:
+            if world_size not in [8, 16]:
+                continue
+            multi_process_parallel(
+                world_size, TEST_MASTER_ADDR, self, self.eager_allreduce
+            )
+
+    @ray.remote(num_gpus=1, max_calls=1)
+    def graph_allreduce(self, world_size, master_addr, rank, distributed_init_port):
+        del os.environ["CUDA_VISIBLE_DEVICES"]
+        device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
+        torch.cuda.set_device(device)
+        distributed_init_method = f"tcp://{master_addr}:{distributed_init_port}"
+        set_mscclpp_all_reduce(True)
+        set_custom_all_reduce(False)
+        init_distributed_environment(
+            world_size=world_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            local_rank=rank % torch.cuda.device_count(),
+        )
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        # A small all_reduce for warmup.
+        # this is needed because device communicators might be created lazily
+        # (e.g. NCCL). This will ensure that the communicator is initialized
+        # before any communication happens, so that this group can be used for
+        # graph capture immediately.
+        data = torch.zeros(1)
+        data = data.to(device=device)
+        torch.distributed.all_reduce(data, group=group)
+        torch.cuda.synchronize()
+        del data
+
+        for sz in self.test_sizes:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                for _ in range(self.test_loop):
+                    with graph_capture() as graph_capture_context:
+                        # use integers so result matches NCCL exactly
+                        inp1 = torch.randint(
+                            1,
+                            16,
+                            (sz,),
+                            dtype=dtype,
+                            device=torch.cuda.current_device(),
+                        )
+                        inp2 = torch.randint(
+                            1,
+                            16,
+                            (sz,),
+                            dtype=dtype,
+                            device=torch.cuda.current_device(),
+                        )
+                        torch.cuda.synchronize()
+                        graph = torch.cuda.CUDAGraph()
+                        with torch.cuda.graph(
+                            graph, stream=graph_capture_context.stream
+                        ):
+                            out1 = tensor_model_parallel_all_reduce(inp1)
+                            # the input buffer is immediately modified to test
+                            # synchronization
+                            dist.all_reduce(inp1, group=group)
+                            out2 = tensor_model_parallel_all_reduce(inp2)
+                            dist.all_reduce(inp2, group=group)
+                    graph.replay()
+                    torch.testing.assert_close(out1, inp1)
+                    torch.testing.assert_close(out2, inp2)
+
+    @ray.remote(num_gpus=1, max_calls=1)
+    def eager_allreduce(self, world_size, master_addr, rank, distributed_init_port):
+        del os.environ["CUDA_VISIBLE_DEVICES"]
+        device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
+        torch.cuda.set_device(device)
+        distributed_init_method = f"tcp://{master_addr}:{distributed_init_port}"
+        set_mscclpp_all_reduce(True)
+        set_custom_all_reduce(False)
+        init_distributed_environment(
+            world_size=world_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            local_rank=rank,
+        )
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        for sz in self.test_sizes:
+            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+                for _ in range(self.test_loop):
+                    inp1 = torch.randint(
+                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
+                    out1 = tensor_model_parallel_all_reduce(inp1)
+                    dist.all_reduce(inp1, group=group)
+                    torch.testing.assert_close(out1, inp1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_multi_instance_release_memory_occupation.py b/test/srt/test_multi_instance_release_memory_occupation.py
new file mode 100644
index 000000000..e4e8d9081
--- /dev/null
+++ b/test/srt/test_multi_instance_release_memory_occupation.py
@@ -0,0 +1,250 @@
+import multiprocessing
+import os
+import subprocess
+import traceback
+import unittest
+from multiprocessing import Process
+from typing import Iterable, Tuple
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import init_device_mesh
+from transformers import AutoModelForCausalLM
+
+from sglang.srt.entrypoints.engine import Engine as SglangEngine
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
+    CustomTestCase,
+    find_available_port,
+)
+
+TEST_SUITE = dict(
+    model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    mem_fraction_static=0.85,
+    dp_size=2,
+    tp_size=2,
+)
+
+
+class EngineWrapper:
+    """
+    A wrapper around Sglang engine to mock multi instance cases such as RL traing.
+
+    """
+
+    def __init__(
+        self, model_path, random_seed, mem_fraction_static, device_mesh_cpu, base_gpu_id
+    ):
+        self._device_mesh_cpu = device_mesh_cpu
+        self._tp_rank = device_mesh_cpu.get_local_rank()
+        self._rank = device_mesh_cpu.get_rank()
+        self._tp_size = device_mesh_cpu.size()
+        tp_size_per_node = self._tp_size
+        node_rank = self._tp_rank // tp_size_per_node
+        first_rank_in_node = self._tp_rank % tp_size_per_node == 0
+        engine_kwargs = dict(
+            model_path=model_path,
+            random_seed=random_seed,
+            mem_fraction_static=mem_fraction_static,
+            base_gpu_id=base_gpu_id,
+            enable_memory_saver=True,
+            tp_size=self._tp_size,
+            node_rank=node_rank,
+            nnodes=1,
+        )
+        self._engine = None
+        if first_rank_in_node:
+            os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
+            self._engine = SglangEngine(**engine_kwargs)
+
+        dist.barrier(group=self._device_mesh_cpu.get_group())
+
+    def update_weights_from_tensor(
+        self, named_tensors: Iterable[Tuple[str, torch.Tensor]]
+    ):
+        if self._tp_rank == 0:
+            self._engine.update_weights_from_tensor(list(named_tensors))
+            self._engine.flush_cache()
+        dist.barrier(group=self._device_mesh_cpu.get_group())
+
+    def release_memory_occupation(self, tags):
+        if self._tp_rank == 0:
+            self._engine.release_memory_occupation(tags)
+        dist.barrier(group=self._device_mesh_cpu.get_group())
+
+    def resume_memory_occupation(self, tags):
+        if self._tp_rank == 0:
+            self._engine.resume_memory_occupation(tags)
+        dist.barrier(group=self._device_mesh_cpu.get_group())
+
+    def shutdown(self):
+        if self._tp_rank == 0:
+            self._engine.shutdown()
+        dist.barrier(group=self._device_mesh_cpu.get_group())
+
+
+def get_gpu_memory_gb(gpu_id=0):
+    return torch.cuda.device_memory_used() / 1024**3
+
+
+class TestMultiInstanceReleaseMemoryOccupation(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        multiprocessing.set_start_method("spawn")
+
+    def test_multi_instance_release_memory_occupation(self):
+        master_port = find_available_port(23456)
+
+        dp_size = TEST_SUITE["dp_size"]
+        tp_size = TEST_SUITE["tp_size"]
+        world_size = dp_size * tp_size
+        processes = []
+        output_reader, output_writer = multiprocessing.Pipe(duplex=False)
+        for rank in range(world_size):
+            p = Process(
+                target=_run_sglang_subprocess,
+                kwargs=dict(
+                    rank=rank,
+                    dp_size=dp_size,
+                    tp_size=tp_size,
+                    model_path=TEST_SUITE["model_path"],
+                    master_port=master_port,
+                    output_writer=output_writer,
+                    mem_fraction_static=TEST_SUITE["mem_fraction_static"],
+                ),
+            )
+            p.start()
+            processes.append(p)
+
+        for _ in range(world_size):
+            self.assertTrue(
+                output_reader.recv(), f"Subprocess fail. Check the logs above."
+            )
+        for p in processes:
+            p.join()
+
+
+def _run_sglang_subprocess(
+    rank: int,
+    dp_size: int,
+    tp_size: int,
+    model_path: str,
+    master_port: int,
+    output_writer,
+    mem_fraction_static: float,
+):
+    engine = None
+    try:
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(master_port)
+        dist.init_process_group(
+            rank=rank,
+            device_id=torch.device(f"cuda:{rank}"),
+            world_size=dp_size * tp_size,
+        )
+        torch.cuda.set_device(rank)
+
+        base_gpu_id = rank // tp_size * tp_size
+        mesh_kwargs = dict(
+            mesh_shape=(dp_size, tp_size, 1), mesh_dim_names=["dp", "tp", "pp"]
+        )
+        inference_device_mesh_device = init_device_mesh("cuda", **mesh_kwargs)
+        inference_device_mesh_cpu = init_device_mesh("cpu", **mesh_kwargs)
+        print(
+            f"subprocess[{rank=},{base_gpu_id=},{rank=},{tp_size=}] {inference_device_mesh_device=} {inference_device_mesh_cpu=}"
+        )
+
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(f"GPU{rank} Memory usage before starting Engine: {_mem_usage}")
+
+        engine = EngineWrapper(
+            model_path=model_path,
+            random_seed=42,
+            mem_fraction_static=mem_fraction_static,
+            device_mesh_cpu=inference_device_mesh_cpu["tp"],
+            base_gpu_id=base_gpu_id,
+        )
+        print(f"subprocess[{rank=}] {engine=}", flush=True)
+
+        # 1 - release kv cache
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(f"GPU{rank} Memory usage before releasing Sgl KV cache: {_mem_usage}")
+        engine.release_memory_occupation(tags=["kv_cache"])
+        _curr_usage = get_gpu_memory_gb(rank)
+        assert (
+            _curr_usage < _mem_usage
+        ), f"Memory usage after releasing KV cache must be reduced! before: {_mem_usage} vs after: {_curr_usage}"
+
+        # 2 - release sglang weights
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(f"GPU{rank} Memory usage before releasing Sgl weights: {_mem_usage}")
+        engine.release_memory_occupation(tags=["weights"])
+
+        _curr_usage = get_gpu_memory_gb(rank)
+        assert (
+            _curr_usage < _mem_usage
+        ), f"Memory usage after releasing weights must be reduced! before: {_mem_usage} vs after: {_curr_usage}"
+
+        # 3 - load hf model
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(
+            f"GPU{rank} Memory usage after releasing Sgl weights and kv cache: {_mem_usage}"
+        )
+        hf_model = AutoModelForCausalLM.from_pretrained(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
+            torch_dtype="bfloat16",
+            device_map=f"cuda:{rank}",
+            trust_remote_code=True,
+        ).cuda()
+        _curr_usage = get_gpu_memory_gb(rank)
+        assert (
+            _curr_usage > _mem_usage
+        ), f"Memory usage after loading hf model must be increased! before: {_mem_usage} vs after: {_curr_usage}"
+
+        # 4 - resume sglang weights and update the weights
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(f"GPU{rank} Memory usage after loading hf model: {_mem_usage}")
+        engine.resume_memory_occupation(tags=["weights"])
+        engine.update_weights_from_tensor(
+            named_tensors=list(hf_model.named_parameters())
+        )
+
+        # 5 - release hf model
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}")
+        del hf_model
+        torch.cuda.empty_cache()
+        _curr_usage = get_gpu_memory_gb(rank)
+        assert (
+            _curr_usage < _mem_usage
+        ), f"Memory usage after releasing hf model must be reduced! before: {_mem_usage} vs after: {_curr_usage}"
+
+        # 6 - resume slgang kv cache
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(f"GPU{rank} Memory usage after releasing hf model: {_mem_usage}")
+        engine.resume_memory_occupation(tags=["kv_cache"])
+        _curr_usage = get_gpu_memory_gb(rank)
+        assert (
+            _curr_usage > _mem_usage
+        ), f"Memory usage after resuming kv cache must be increased! before: {_mem_usage} vs after: {_curr_usage}"
+
+        # 7 - Final checking!
+        _mem_usage = get_gpu_memory_gb(rank)
+        print(f"GPU{rank} Memory usage after resuming Sgl KV cache: {_mem_usage}")
+
+        execution_ok = True
+    except Exception as e:
+        print(f"subprocess[{rank=}] has error: {e}", flush=True)
+        traceback.print_exc()
+        execution_ok = False
+
+    output_writer.send(execution_ok)
+    output_writer.close()
+
+    if engine:
+        engine.shutdown()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_multi_tokenizer.py b/test/srt/test_multi_tokenizer.py
new file mode 100644
index 000000000..182454e5e
--- /dev/null
+++ b/test/srt/test_multi_tokenizer.py
@@ -0,0 +1,84 @@
+import unittest
+from types import SimpleNamespace
+
+import sglang.srt.managers.io_struct as io_struct
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    auto_config_device,
+    get_benchmark_args,
+    is_in_ci,
+    popen_launch_server,
+    run_benchmark,
+    write_github_step_summary,
+)
+
+
+class TestMultiTokenizer(CustomTestCase):
+    # from test_hicache.py
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--tokenizer-worker-num",
+                8,
+                "--mem-fraction-static",
+                0.7,
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+    def test_multi_tokenizer_ttft(self):
+        # from test_bench_serving.py run_bench_serving
+        args = get_benchmark_args(
+            base_url=self.base_url,
+            dataset_name="random",
+            dataset_path="",
+            tokenizer=None,
+            num_prompts=100,
+            random_input_len=4096,
+            random_output_len=2048,
+            sharegpt_context_len=None,
+            request_rate=1,
+            disable_stream=False,
+            disable_ignore_eos=False,
+            seed=0,
+            device=auto_config_device(),
+            lora_name=None,
+        )
+        res = run_benchmark(args)
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_multi_tokenizer_ttft\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 11000)
+            self.assertLess(res["median_ttft_ms"], 86)
+            self.assertLess(res["median_itl_ms"], 10)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py
new file mode 100644
index 000000000..a6b3070e4
--- /dev/null
+++ b/test/srt/test_nightly_gsm8k_eval.py
@@ -0,0 +1,171 @@
+import json
+import os
+import unittest
+import warnings
+from datetime import datetime
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+MODEL_SCORE_THRESHOLDS = {
+    "meta-llama/Llama-3.1-8B-Instruct": 0.82,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
+    "google/gemma-2-27b-it": 0.91,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.95,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.86,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
+    "zai-org/GLM-4.5-Air-FP8": 0.75,
+    # The threshold of neuralmagic/gemma-2-2b-it-FP8 should be 0.6, but this model has some accuracy regression.
+    # The fix is tracked at https://github.com/sgl-project/sglang/issues/4324, we set it to 0.50, for now, to make CI green.
+    "neuralmagic/gemma-2-2b-it-FP8": 0.50,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.65,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
+}
+
+
+def parse_models(model_string):
+    return [model.strip() for model in model_string.split(",") if model.strip()]
+
+
+def popen_launch_server_wrapper(base_url, model, is_tp2):
+    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
+    if is_tp2:
+        other_args.extend(["--tp", "2"])
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+    return process
+
+
+def write_results_to_json(model, metrics, mode="a"):
+    result = {
+        "timestamp": datetime.now().isoformat(),
+        "model": model,
+        "metrics": metrics,
+        "score": metrics["score"],
+    }
+
+    existing_results = []
+    if mode == "a" and os.path.exists("results.json"):
+        try:
+            with open("results.json", "r") as f:
+                existing_results = json.load(f)
+        except json.JSONDecodeError:
+            existing_results = []
+
+    if isinstance(existing_results, list):
+        existing_results.append(result)
+    else:
+        existing_results = [result]
+
+    with open("results.json", "w") as f:
+        json.dump(existing_results, f, indent=2)
+
+
+def check_model_scores(results):
+    failed_models = []
+    summary = " | model | score | threshold |\n"
+    summary += "| ----- | ----- | --------- |\n"
+
+    for model, score in results:
+        threshold = MODEL_SCORE_THRESHOLDS.get(model)
+        if threshold is None:
+            print(f"Warning: No threshold defined for model {model}")
+            continue
+
+        if score < threshold:
+            failed_models.append(
+                f"\nScore Check Failed: {model}\n"
+                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
+            )
+
+        line = f"| {model} | {score} | {threshold} |\n"
+        summary += line
+
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
+
+    if failed_models:
+        raise AssertionError("\n".join(failed_models))
+
+
+# Do not use `CustomTestCase` since `test_mgsm_en_all_models` does not want retry
+class TestNightlyGsm8KEval(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_groups = [
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
+        ]
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+    def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
+        is_first = True
+        all_results = []
+
+        for model_group, is_fp8, is_tp2 in self.model_groups:
+            for model in model_group:
+                with self.subTest(model=model):
+                    process = popen_launch_server_wrapper(self.base_url, model, is_tp2)
+
+                    args = SimpleNamespace(
+                        base_url=self.base_url,
+                        model=model,
+                        eval_name="mgsm_en",
+                        num_examples=None,
+                        num_threads=1024,
+                    )
+
+                    metrics = run_eval(args)
+                    print(
+                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                    )
+
+                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    is_first = False
+
+                    all_results.append((model, metrics["score"]))
+                    kill_process_tree(process.pid)
+
+        try:
+            with open("results.json", "r") as f:
+                print("\nFinal Results from results.json:")
+                print(json.dumps(json.load(f), indent=2))
+        except Exception as e:
+            print(f"Error reading results.json: {e}")
+
+        # Check all scores after collecting all results
+        check_model_scores(all_results)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_nightly_gsm8k_eval_amd.py b/test/srt/test_nightly_gsm8k_eval_amd.py
new file mode 100644
index 000000000..d03684b99
--- /dev/null
+++ b/test/srt/test_nightly_gsm8k_eval_amd.py
@@ -0,0 +1,221 @@
+import json
+import os
+import unittest
+import warnings
+from datetime import datetime
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+MODEL_SCORE_THRESHOLDS = {
+    "meta-llama/Llama-3.1-8B-Instruct": 0.82,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.95,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.86,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.86,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.65,
+    "google/gemma-2-27b-it": 0.91,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
+}
+
+failing_models = {
+    "neuralmagic/gemma-2-2b-it-FP8",
+}
+
+
+def remove_failing_models(model_str):
+    models = model_str.split(",")
+    filtered = [m for m in models if m not in failing_models]
+    return ",".join(filtered)
+
+
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
+)
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
+)
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
+)
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
+)
+
+NO_MOE_PADDING_MODELS = {"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"}
+DISABLE_HF_XET_MODELS = {
+    "Qwen/Qwen2-57B-A14B-Instruct",
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8",
+}
+TRITON_MOE_MODELS = {
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8",
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "mistralai/Mistral-7B-Instruct-v0.3",
+}
+
+
+def parse_models(model_string):
+    return [model.strip() for model in model_string.split(",") if model.strip()]
+
+
+def popen_launch_server_wrapper(base_url, model, is_tp2):
+    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
+    if is_tp2:
+        other_args.extend(["--tp", "2"])
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+    return process
+
+
+def write_results_to_json(model, metrics, mode="a"):
+    result = {
+        "timestamp": datetime.now().isoformat(),
+        "model": model,
+        "metrics": metrics,
+        "score": metrics["score"],
+    }
+
+    existing_results = []
+    if mode == "a" and os.path.exists("results.json"):
+        try:
+            with open("results.json", "r") as f:
+                existing_results = json.load(f)
+        except json.JSONDecodeError:
+            existing_results = []
+
+    if isinstance(existing_results, list):
+        existing_results.append(result)
+    else:
+        existing_results = [result]
+
+    with open("results.json", "w") as f:
+        json.dump(existing_results, f, indent=2)
+
+
+def check_model_scores(results):
+    failed_models = []
+    summary = " | model | score | threshold |\n"
+    summary += "| ----- | ----- | --------- |\n"
+
+    for model, score in results:
+        threshold = MODEL_SCORE_THRESHOLDS.get(model)
+        if threshold is None:
+            print(f"Warning: No threshold defined for model {model}")
+            continue
+
+        if score < threshold:
+            failed_models.append(
+                f"\nScore Check Failed: {model}\n"
+                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
+            )
+
+        line = f"| {model} | {score} | {threshold} |\n"
+        summary += line
+
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
+
+    if failed_models:
+        raise AssertionError("\n".join(failed_models))
+
+
+# Do not use `CustomTestCase` since `test_mgsm_en_all_models` does not want retry
+class TestNightlyGsm8KEval(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_groups = [
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
+        ]
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+    def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
+        is_first = True
+        all_results = []
+
+        for model_group, is_fp8, is_tp2 in self.model_groups:
+            for model in model_group:
+                with self.subTest(model=model):
+                    os.environ["SGLANG_MOE_PADDING"] = (
+                        "0" if model in NO_MOE_PADDING_MODELS else "1"
+                    )
+                    os.environ["HF_HUB_DISABLE_XET"] = (
+                        "1" if model in DISABLE_HF_XET_MODELS else "0"
+                    )
+                    os.environ["SGLANG_USE_AITER"] = (
+                        "0" if model in TRITON_MOE_MODELS else "1"
+                    )
+
+                    process = popen_launch_server_wrapper(self.base_url, model, is_tp2)
+
+                    args = SimpleNamespace(
+                        base_url=self.base_url,
+                        model=model,
+                        eval_name="mgsm_en",
+                        num_examples=None,
+                        num_threads=1024,
+                    )
+                    # Allow retries, so flaky errors are avoided.
+                    threshold = MODEL_SCORE_THRESHOLDS.get(model)
+                    for attempt in range(3):
+                        try:
+                            metrics = run_eval(args)
+                            score = metrics["score"]
+                            if score >= threshold:
+                                break
+                        except Exception as e:
+                            print(f"Attempt {attempt + 1} failed with error: {e}")
+                    print(
+                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                    )
+
+                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    is_first = False
+
+                    all_results.append((model, metrics["score"]))
+                    kill_process_tree(process.pid)
+
+        try:
+            with open("results.json", "r") as f:
+                print("\nFinal Results from results.json:")
+                print(json.dumps(json.load(f), indent=2))
+        except Exception as e:
+            print(f"Error reading results.json: {e}")
+
+        # Check all scores after collecting all results
+        check_model_scores(all_results)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_no_chunked_prefill.py b/test/srt/test_no_chunked_prefill.py
new file mode 100644
index 000000000..59869ff59
--- /dev/null
+++ b/test/srt/test_no_chunked_prefill.py
@@ -0,0 +1,30 @@
+import unittest
+
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    run_bench_serving,
+    run_mmlu_test,
+)
+
+
+class TestNoChunkedPrefill(CustomTestCase):
+
+    def test_no_chunked_prefill(self):
+        run_mmlu_test(
+            disable_radix_cache=False, enable_mixed_chunk=False, chunked_prefill_size=-1
+        )
+
+    def test_no_chunked_prefill_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=10,
+            request_rate=float("inf"),
+            other_server_args=["--disable-radix-cache", "--chunked-prefill-size", "-1"],
+        )
+
+        assert res["completed"] == 10
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_no_overlap_scheduler.py b/test/srt/test_no_overlap_scheduler.py
new file mode 100644
index 000000000..d236819bc
--- /dev/null
+++ b/test/srt/test_no_overlap_scheduler.py
@@ -0,0 +1,35 @@
+"""
+Usage:
+python3 -m unittest test_overlap_schedule.TestOverlapSchedule.test_radix_attention_chunked_prefill
+python3 test_overlap_schedule.py
+"""
+
+import unittest
+
+from sglang.test.test_utils import CustomTestCase, run_mmlu_test
+
+
+class TestOverlapSchedule(CustomTestCase):
+    def test_no_radix_attention_chunked_prefill(self):
+        run_mmlu_test(
+            disable_radix_cache=True, chunked_prefill_size=32, disable_overlap=True
+        )
+
+    def test_no_radix_attention_no_chunked_prefill(self):
+        run_mmlu_test(
+            disable_radix_cache=True, chunked_prefill_size=-1, disable_overlap=True
+        )
+
+    def test_radix_attention_chunked_prefill(self):
+        run_mmlu_test(
+            disable_radix_cache=False, chunked_prefill_size=32, disable_overlap=True
+        )
+
+    def test_radix_attention_no_chunked_prefill(self):
+        run_mmlu_test(
+            disable_radix_cache=False, chunked_prefill_size=-1, disable_overlap=True
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_original_logprobs.py b/test/srt/test_original_logprobs.py
new file mode 100644
index 000000000..ddcfe3d8e
--- /dev/null
+++ b/test/srt/test_original_logprobs.py
@@ -0,0 +1,196 @@
+"""Test original log probability alignment between SGLang and Hugging Face.
+
+This test suite verifies the correctness of the `origin_logprobs` output (temperature=1)
+and the `logprobs` output (temperature=0.5) in SGLang by comparing it against
+raw logit-based probabilities computed directly from a reference Hugging Face model.
+
+The test covers the following scenarios:
+- Next-token prediction: Verifies that the log probability of the next token from
+  SGLang matches the Hugging Face model.
+- Top-k logprobs: Ensures that the top-k original logprobs returned by SGLang are
+  consistent with Hugging Face outputs.
+- Specified token IDs: Confirms that the original logprobs for specific token IDs
+  match the values computed from Hugging Face logits.
+"""
+
+import os
+import random
+import unittest
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import sglang as sgl
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+# ------------------------- Configurable via env ------------------------- #
+MODEL_ID = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+PROMPTS = [
+    "Hello, my name is",
+    "The future of AI is",
+    "The president of the United States is",
+    "The capital of France is ",
+]
+TOP_LOGPROBS_NUM = 50
+NUM_RANDOM_TOKEN_IDS = 10
+RTOL = 0.20
+ATOL = 0.00
+# ------------------------------------------------
+
+torch.manual_seed(1234)
+if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(1234)
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
+
+
+class TestOriginalLogprob(unittest.TestCase):
+    def setUp(self):
+        # ----- HF side (float32 weights) -----
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="right")
+        self.hf_model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID, torch_dtype=torch.float32, device_map="auto"
+        )
+
+        # Shared sampling parameters
+        self.sampling_params = {
+            "temperature": 0.5,  # SGLang uses 0.5, but original logprobs are used 1.0
+            "top_p": 1.0,
+            "top_k": 10,
+            "max_new_tokens": 1,
+        }
+
+    # ---------------------------------------------------------------------
+    # Helper: compare one SGLang block (token_logprobs / top_logprobs / ids_logprobs)
+    #         against a reference HF log‑prob vector.
+    # ---------------------------------------------------------------------
+    def assert_logprobs_block_equal(
+        self,
+        hf_log_probs: torch.Tensor,  # [V]
+        token_log_probs: list,
+        top_log_probs: list,
+        ids_log_probs: list,
+        random_token_ids: list,
+        tag: str = "",
+    ):
+        vals, idxs, _ = zip(*token_log_probs)
+        sgl_vals = torch.tensor(vals, device=self.hf_model.device, dtype=torch.float32)
+        sgl_idxs = torch.tensor(idxs, device=self.hf_model.device, dtype=torch.long)
+        hf_vals = hf_log_probs[sgl_idxs]
+
+        self.assertTrue(
+            torch.allclose(hf_vals, sgl_vals, rtol=RTOL, atol=ATOL),
+            msg=f"[{tag}] token‑level mismatch at indices {sgl_idxs.tolist()}",
+        )
+
+        hf_topk, _ = torch.topk(hf_log_probs, k=TOP_LOGPROBS_NUM, dim=-1)
+
+        sgl_topk = torch.tensor(
+            [float(t[0]) for t in top_log_probs[0] if t and t[0] is not None][
+                :TOP_LOGPROBS_NUM
+            ],
+            dtype=torch.float32,
+            device=self.hf_model.device,
+        )
+
+        k = min(hf_topk.numel(), sgl_topk.numel())
+        self.assertTrue(
+            torch.allclose(hf_topk[:k], sgl_topk[:k], rtol=RTOL, atol=ATOL),
+            msg=f"[{tag}] top‑k mismatch",
+        )
+
+        indices = torch.tensor(
+            random_token_ids, dtype=torch.long, device=hf_log_probs.device
+        )
+
+        hf_token_ids = hf_log_probs[indices]
+
+        sgl_token_ids = torch.tensor(
+            [v for v, _, _ in ids_log_probs[0]],
+            device=self.hf_model.device,
+            dtype=torch.float32,
+        )
+        self.assertTrue(
+            torch.allclose(hf_token_ids, sgl_token_ids, rtol=RTOL, atol=ATOL),
+            msg=f"[{tag}] token‑IDs mismatch",
+        )
+
+        # Optional: print max abs diff for quick diagnostics
+        max_diff = torch.max(torch.abs(hf_vals - sgl_vals)).item()
+        print(f"[{tag}] max|diff| token‑level = {max_diff:.4f}")
+
+    def test_logprob_match(self):
+        vocab_size = self.tokenizer.vocab_size
+
+        for env_val in ["True", "False"]:
+            with self.subTest(return_original_logprob=env_val):
+                os.environ["RETURN_ORIGINAL_LOGPROB"] = env_val
+
+                # ----- SGLang side -----
+                sgl_engine = sgl.Engine(
+                    model_path=MODEL_ID,
+                    skip_tokenizer_init=True,
+                    trust_remote_code=True,
+                    mem_fraction_static=0.60,
+                )
+
+                for prompt in PROMPTS:
+                    random_token_ids = sorted(
+                        random.sample(range(vocab_size), NUM_RANDOM_TOKEN_IDS)
+                    )
+
+                    enc = self.tokenizer(prompt, return_tensors="pt")
+                    input_ids = enc["input_ids"].to(self.hf_model.device)
+                    attn_mask = enc["attention_mask"].to(self.hf_model.device)
+
+                    with torch.inference_mode():
+                        hf_out = self.hf_model(
+                            input_ids=input_ids,
+                            attention_mask=attn_mask,
+                            return_dict=True,
+                        )
+                    logits = hf_out.logits[:, -1, :]  # [1, V]
+                    hf_log_probs = F.log_softmax(
+                        logits.float() / self.sampling_params["temperature"], dim=-1
+                    )[0]
+                    hf_original_log_probs = F.log_softmax(logits.float(), dim=-1)[0]
+
+                    outputs = sgl_engine.generate(
+                        input_ids=input_ids[0].tolist(),
+                        sampling_params=self.sampling_params,
+                        return_logprob=True,
+                        top_logprobs_num=TOP_LOGPROBS_NUM,
+                        token_ids_logprob=random_token_ids,
+                    )
+
+                    if isinstance(outputs, list):
+                        outputs = outputs[0]
+                    meta = outputs["meta_info"]
+
+                    # Check original logprobs only if enabled
+                    if env_val.lower() == "true":
+                        self.assert_logprobs_block_equal(
+                            hf_log_probs=hf_original_log_probs,
+                            token_log_probs=meta["output_token_logprobs"],
+                            top_log_probs=meta["output_top_logprobs"],
+                            ids_log_probs=meta["output_token_ids_logprobs"],
+                            random_token_ids=random_token_ids,
+                            tag=f"Original logprobs SGLang vs HF: {prompt} ({env_val})",
+                        )
+                    else:
+                        # Always check regular logprobs
+                        self.assert_logprobs_block_equal(
+                            hf_log_probs=hf_log_probs,
+                            token_log_probs=meta["output_token_logprobs"],
+                            top_log_probs=meta["output_top_logprobs"],
+                            ids_log_probs=meta["output_token_ids_logprobs"],
+                            random_token_ids=random_token_ids,
+                            tag=f"logprobs SGLang vs HF: {prompt} ({env_val})",
+                        )
+                sgl_engine.shutdown()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_page_size.py b/test/srt/test_page_size.py
new file mode 100644
index 000000000..f56e1ed6b
--- /dev/null
+++ b/test/srt/test_page_size.py
@@ -0,0 +1,47 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestPageSize(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ["SGLANG_DEBUG_MEMORY_POOL"] = "1"
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--page-size", 4, "--chunked-prefill-size", 128],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_patch_torch.py b/test/srt/test_patch_torch.py
new file mode 100644
index 000000000..a2c04509e
--- /dev/null
+++ b/test/srt/test_patch_torch.py
@@ -0,0 +1,133 @@
+import os
+import traceback
+import unittest
+from typing import Dict, List
+
+import torch
+import torch.multiprocessing as mp
+
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
+
+
+class TestReleaseMemoryOccupation(unittest.TestCase):
+    def test_monkey_patch_torch_reductions(self):
+        mp.set_start_method("spawn", force=True)
+
+        for enable_patch in [False, True]:
+            for params in [
+                # Same visible devices
+                dict(
+                    sender_info=dict(
+                        visible_devices=[0, 1],
+                        tensor_device=1,
+                    ),
+                    receiver_info=dict(
+                        visible_devices=[0, 1],
+                        tensor_device=1,
+                    ),
+                ),
+                # Different visible devices
+                dict(
+                    sender_info=dict(
+                        visible_devices=[0, 1],
+                        tensor_device=1,
+                    ),
+                    receiver_info=dict(
+                        visible_devices=[1, 0],
+                        # If enable patch, this should be fixed, and cuda:1 becomes cuda:0
+                        tensor_device=0 if enable_patch else 1,
+                    ),
+                ),
+            ]:
+                with self.subTest(f"{enable_patch=} {params=}"):
+                    self._test_monkey_patch_torch_reductions_core(
+                        enable_patch=enable_patch, **params
+                    )
+
+    def _test_monkey_patch_torch_reductions_core(
+        self,
+        sender_info: Dict,
+        receiver_info: Dict,
+        enable_patch: bool,
+    ):
+        print(
+            f'test_monkey_patch_torch_reductions_core {os.environ.get("CUDA_VISIBLE_DEVICES")=}'
+        )
+        cuda_visible_devices_list: List[int] = [
+            int(x)
+            for x in os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(
+                ","
+            )
+        ]
+
+        processes = []
+        output_reader, output_writer = mp.Pipe(duplex=False)
+        queue = mp.Queue()
+        for role, info in [
+            ("sender", sender_info),
+            ("receiver", receiver_info),
+        ]:
+            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
+                str(cuda_visible_devices_list[device])
+                for device in info["visible_devices"]
+            )
+            p = mp.Process(
+                target=_run_subprocess,
+                kwargs=dict(
+                    role=role,
+                    queue=queue,
+                    output_writer=output_writer,
+                    tensor_device=info["tensor_device"],
+                    enable_patch=enable_patch,
+                ),
+            )
+            p.start()
+            processes.append(p)
+
+        for _ in range(len(processes)):
+            self.assertTrue(
+                output_reader.recv(), f"Subprocess has error, please see logs above."
+            )
+
+        for p in processes:
+            p.join()
+
+
+def _run_subprocess(
+    role: str, queue: mp.Queue, output_writer, tensor_device: int, enable_patch: bool
+):
+    print(
+        f'subprocess[{role}] start {os.environ.get("CUDA_VISIBLE_DEVICES")=}',
+        flush=True,
+    )
+
+    if enable_patch:
+        print(f"subprocess[{role}] execute monkey_patch_torch_reductions", flush=True)
+        monkey_patch_torch_reductions()
+
+    try:
+        if role == "sender":
+            tensor = torch.tensor([1.0, 2.0], device=f"cuda:{tensor_device}")
+            print(f"sender queue.put {tensor=} {tensor.device=}")
+            queue.put(tensor)
+            assert queue.get() == "done"
+        elif role == "receiver":
+            tensor = queue.get()
+            print(f"receiver queue.get {tensor=} {tensor.device=}")
+            assert str(tensor.device) == f"cuda:{tensor_device}"
+            queue.put("done")
+        else:
+            raise NotImplementedError
+
+        execution_ok = True
+    except Exception as e:
+        print(f"subprocess[{role}] has error: {e}", flush=True)
+        traceback.print_exc()
+        execution_ok = False
+
+    output_writer.send(execution_ok)
+    output_writer.close()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_penalty.py b/test/srt/test_penalty.py
new file mode 100644
index 000000000..bfbd2777a
--- /dev/null
+++ b/test/srt/test_penalty.py
@@ -0,0 +1,95 @@
+import json
+import random
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestPenalty(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(self, sampling_params):
+        return_logprob = True
+        top_logprobs_num = 5
+        return_text = True
+        n = 1
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                # prompt that is supposed to generate < 32 tokens
+                "text": "<|start_header_id|>user<|end_header_id|>\n\nWhat is the answer for 1 + 1 = ?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+                "sampling_params": {
+                    "max_new_tokens": 48,
+                    "n": n,
+                    **sampling_params,
+                },
+                "return_logprob": return_logprob,
+                "top_logprobs_num": top_logprobs_num,
+                "return_text_in_logprobs": return_text,
+                "logprob_start_len": 0,
+            },
+        )
+        self.assertEqual(response.status_code, 200)
+        print(json.dumps(response.json()))
+        print("=" * 100)
+
+    def test_default_values(self):
+        self.run_decode({})
+
+    def test_frequency_penalty(self):
+        self.run_decode({"frequency_penalty": 2})
+
+    def test_min_new_tokens(self):
+        self.run_decode({"min_new_tokens": 16})
+
+    def test_presence_penalty(self):
+        self.run_decode({"presence_penalty": 2})
+
+    def test_penalty_mixed(self):
+        args = [
+            {},
+            {},
+            {},
+            {"frequency_penalty": 2},
+            {"presence_penalty": 1},
+            {"min_new_tokens": 16},
+            {"frequency_penalty": 0.2},
+            {"presence_penalty": 0.4},
+            {"min_new_tokens": 8},
+            {"frequency_penalty": 0.4, "presence_penalty": 0.8},
+            {"frequency_penalty": 0.4, "min_new_tokens": 12},
+            {"presence_penalty": 0.8, "min_new_tokens": 12},
+            {"presence_penalty": -0.3, "frequency_penalty": 1.3, "min_new_tokens": 32},
+            {"presence_penalty": 0.3, "frequency_penalty": -1.3, "min_new_tokens": 32},
+        ]
+        random.shuffle(args * 5)
+        with ThreadPoolExecutor(8) as executor:
+            list(executor.map(self.run_decode, args))
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=3)
diff --git a/test/srt/test_pp_single_node.py b/test/srt/test_pp_single_node.py
new file mode 100644
index 000000000..e333c2d4c
--- /dev/null
+++ b/test/srt/test_pp_single_node.py
@@ -0,0 +1,322 @@
+"""
+Usage:
+python3 -m unittest test_pp_single_node.TestPPAccuracy.test_gsm8k
+python3 -m unittest test_pp_single_node.TestQwenPPAccuracy.test_pp_consistency
+python3 -m unittest test_pp_single_node.TestFixedBugs.test_chunked_prefill_with_small_bs
+"""
+
+import time
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_one_batch_server,
+)
+
+
+class TestPPAccuracy(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = "http://127.0.0.1:23333"
+        cls.process = popen_launch_server(
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--tp-size",
+                2,
+                "--pp-size",
+                2,
+                "--chunked-prefill-size",
+                256,
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        self.assertGreater(metrics["accuracy"], 0.74)
+        # Wait a little bit so that the memory check happens.
+        time.sleep(4)
+
+    def test_logprob(self):
+        response = requests.post(
+            f"{self.base_url}/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 16,
+                },
+                "return_logprob": True,
+                "top_logprobs_num": 5,
+                "logprob_start_len": 0,
+            },
+        )
+        response_json = response.json()
+        input_token_logprobs = response_json["meta_info"]["input_token_logprobs"]
+        output_token_logprobs = response_json["meta_info"]["output_token_logprobs"]
+        output_top_logprobs = response_json["meta_info"]["output_top_logprobs"]
+
+        assert len(input_token_logprobs) == 6
+        assert len(output_token_logprobs) == 16
+        assert len(output_top_logprobs) == 16
+
+
+class TestDPAttentionDP2PP2(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--pp-size",
+                "2",
+                "--enable-dp-attention",
+                "--dp",
+                "2",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mgsm_en(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mgsm_en",
+            num_examples=None,
+            num_threads=1024,
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["score"], 0.8)
+
+
+class TestQwenPPAccuracy(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = "http://127.0.0.1:23334"  # different ports to avoid conflicts
+        cls.model_name = "Qwen/Qwen3-8B"  # replace with your Qwen Model if needed
+
+    def run_gsm8k_test(self, pp_size):
+        process = popen_launch_server(
+            self.model_name,
+            self.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--pp-size",
+                pp_size,
+                "--chunked-prefill-size",
+                256,
+            ],
+        )
+
+        try:
+            args = SimpleNamespace(
+                num_shots=5,
+                data_path=None,
+                num_questions=200,
+                max_new_tokens=512,
+                parallel=128,
+                host="http://127.0.0.1",
+                port=int(self.base_url.split(":")[-1]),
+            )
+            metrics = run_eval_few_shot_gsm8k(args)
+            time.sleep(5)
+            return metrics
+        finally:
+            kill_process_tree(process.pid)
+
+    @unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
+    def test_pp_consistency(self):
+        baseline = self.run_gsm8k_test(pp_size=1)
+        pp_metrics = self.run_gsm8k_test(pp_size=2)
+
+        print(f"[Qwen PP Comparison] Baseline: {baseline} | PP: {pp_metrics}")
+
+        self.assertGreaterEqual(baseline["accuracy"], 0.74)
+        self.assertGreaterEqual(
+            pp_metrics["accuracy"],
+            baseline["accuracy"] - 0.02,
+            msg=(
+                f"PP accuracy dropped more than 1% compared to baseline. "
+                f"Baseline: {baseline['accuracy']:.2%}, PP: {pp_metrics['accuracy']:.2%}"
+            ),
+        )
+
+
+class TestQwenPPTieWeightsAccuracy(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = "http://127.0.0.1:23335"  # different ports to avoid conflicts
+        cls.model_name = (
+            "Qwen/Qwen3-0.6B"  # qwen3 < 8B all have tie_word_embeddings = True
+        )
+
+    def run_gsm8k_test(self, pp_size):
+        process = popen_launch_server(
+            self.model_name,
+            self.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--pp-size",
+                pp_size,
+                "--chunked-prefill-size",
+                256,
+            ],
+        )
+
+        try:
+            args = SimpleNamespace(
+                num_shots=5,
+                data_path=None,
+                num_questions=200,
+                max_new_tokens=512,
+                parallel=128,
+                host="http://127.0.0.1",
+                port=int(self.base_url.split(":")[-1]),
+            )
+            metrics = run_eval_few_shot_gsm8k(args)
+            time.sleep(5)
+            return metrics
+        finally:
+            kill_process_tree(process.pid)
+
+    def test_pp_consistency(self):
+        baseline = self.run_gsm8k_test(pp_size=1)
+        pp_metrics = self.run_gsm8k_test(pp_size=2)
+
+        print(f"[Qwen PP Comparison] Baseline: {baseline} | PP: {pp_metrics}")
+
+        self.assertGreaterEqual(baseline["accuracy"], 0.38)
+        self.assertGreaterEqual(
+            pp_metrics["accuracy"],
+            baseline["accuracy"] - 0.02,
+            msg=(
+                f"PP accuracy dropped more than 1% compared to baseline. "
+                f"Baseline: {baseline['accuracy']:.2%}, PP: {pp_metrics['accuracy']:.2%}"
+            ),
+        )
+
+
+class TestQwenMoePPAccuracy(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.base_url = "http://127.0.0.1:23336"  # different ports to avoid conflicts
+        cls.model_name = "Qwen/Qwen3-30B-A3B"  # replace with your Qwen Model if needed
+
+    def run_gsm8k_test(self, pp_size):
+        process = popen_launch_server(
+            self.model_name,
+            self.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--pp-size",
+                pp_size,
+                "--chunked-prefill-size",
+                256,
+            ],
+        )
+
+        try:
+            args = SimpleNamespace(
+                num_shots=5,
+                data_path=None,
+                num_questions=200,
+                max_new_tokens=512,
+                parallel=128,
+                host="http://127.0.0.1",
+                port=int(self.base_url.split(":")[-1]),
+            )
+            metrics = run_eval_few_shot_gsm8k(args)
+            time.sleep(5)
+            return metrics
+        finally:
+            kill_process_tree(process.pid)
+
+    def test_pp_consistency(self):
+        baseline = self.run_gsm8k_test(pp_size=1)
+        pp_metrics = self.run_gsm8k_test(pp_size=2)
+
+        print(f"[Qwen PP Comparison] Baseline: {baseline} | PP: {pp_metrics}")
+
+        self.assertGreaterEqual(baseline["accuracy"], 0.74)
+        self.assertGreaterEqual(
+            pp_metrics["accuracy"],
+            baseline["accuracy"] - 0.02,
+            msg=(
+                f"PP accuracy dropped more than 1% compared to baseline. "
+                f"Baseline: {baseline['accuracy']:.2%}, PP: {pp_metrics['accuracy']:.2%}"
+            ),
+        )
+
+
+class TestFixedBugs(unittest.TestCase):
+    def test_chunked_prefill_with_small_bs(self):
+        model = DEFAULT_MODEL_NAME_FOR_TEST
+        server_args = ServerArgs(model_path=model)
+        bench_args = OneBatchBenchArgs(
+            batch_size=(1,),
+            input_len=(1,),
+            output_len=(1,),
+            base_url=DEFAULT_URL_FOR_TEST,
+        )
+        other_server_args = [
+            "--tp-size",
+            2,
+            "--pp-size",
+            2,
+            "--chunked-prefill",
+            256,
+            "--max-running-requests",
+            2,
+        ]
+        run_bench_one_batch_server(
+            model,
+            DEFAULT_URL_FOR_TEST,
+            server_args,
+            bench_args,
+            other_server_args,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_pytorch_sampling_backend.py b/test/srt/test_pytorch_sampling_backend.py
new file mode 100644
index 000000000..8a368310e
--- /dev/null
+++ b/test/srt/test_pytorch_sampling_backend.py
@@ -0,0 +1,90 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestPyTorchSamplingBackend(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--sampling-backend", "pytorch", "--disable-radix-cache"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+            temperature=0.1,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+    def test_greedy(self):
+
+        first_text = None
+
+        # ensure the answer is identical across single response
+        for _ in range(5):
+            response_single = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "text": "The capital of Germany is",
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 32,
+                    },
+                },
+            ).json()
+            text = response_single["text"]
+            if first_text is None:
+                first_text = text
+
+            self.assertEqual(text, first_text)
+
+        first_text = None
+
+        response_batch = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": ["The capital of Germany is"] * 10,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 32,
+                },
+            },
+        ).json()
+
+        # ensure the answer is identical among the batch
+        for i in range(10):
+            text = response_batch[i]["text"]
+            if first_text is None:
+                first_text = text
+            self.assertEqual(text, first_text)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_quick_allreduce.py b/test/srt/test_quick_allreduce.py
new file mode 100644
index 000000000..ed081255f
--- /dev/null
+++ b/test/srt/test_quick_allreduce.py
@@ -0,0 +1,212 @@
+import os
+import random
+import socket
+import unittest
+from typing import Any
+
+import ray
+import torch
+import torch.distributed as dist
+
+from sglang.srt.distributed import init_distributed_environment
+from sglang.srt.distributed.communication_op import (  # noqa
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.quick_all_reduce import (
+    qr_rocm_arch_available,
+)
+from sglang.srt.distributed.parallel_state import (
+    get_tensor_model_parallel_group,
+    graph_capture,
+    initialize_model_parallel,
+)
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(42)
+random.seed(44)  # keep the deterministic seed
+
+
+def get_open_port() -> int:
+    # try ipv4
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # try ipv6
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def multi_process_parallel(
+    world_size: int, cls: Any, test_target: Any, quant_mode: str
+) -> None:
+
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    # NOTE: We need to set working_dir for distributed tests,
+    # otherwise we may get import errors on ray workers
+
+    ray.init(log_to_driver=True)
+
+    distributed_init_port = get_open_port()
+    refs = []
+    for rank in range(world_size):
+        refs.append(
+            test_target.remote(cls, world_size, rank, distributed_init_port, quant_mode)
+        )
+    ray.get(refs)
+
+    ray.shutdown()
+
+
+class TestQuickAllReduce(CustomTestCase):
+    TEST_SIZES = [
+        2 * 1024 * 1024,
+        4 * 1024 * 1024,
+        8 * 1024 * 1024,
+        16 * 1024 * 1024,
+        32 * 1024 * 1024,
+    ]
+    TEST_LOOP = 5
+    # Too many configurations can lead to a test grid that is too large
+    # The tp takes too long to boot,let's just choose 4 out of 12 configurations
+    # WORLD_SIZES = [2, 4, 8]
+    # QUANT_MODE = ["FP", "INT8", "INT6", "INT4"]
+    QUANT_MODE_WORLD_SIZE_PART = [["FP", 8], ["INT4", 4], ["INT8", 2], ["INT6", 2]]
+
+    @unittest.skipIf(
+        not qr_rocm_arch_available(),
+        "Only test Quick AllReduce on ROCm architectures >= gfx94*",
+    )
+    def test_graph_allreduce(self):
+        for quant_mode_world_size_part in self.QUANT_MODE_WORLD_SIZE_PART:
+            quant_mode = quant_mode_world_size_part[0]
+            world_size = quant_mode_world_size_part[1]
+            if world_size > torch.cuda.device_count():
+                continue
+            multi_process_parallel(world_size, self, self.graph_allreduce, quant_mode)
+
+    @unittest.skipIf(
+        not qr_rocm_arch_available(),
+        "Only test Quick AllReduce on ROCm architectures >= gfx94*",
+    )
+    def test_eager_allreduce(self):
+        for quant_mode_world_size_part in self.QUANT_MODE_WORLD_SIZE_PART:
+            quant_mode = quant_mode_world_size_part[0]
+            world_size = quant_mode_world_size_part[1]
+            if world_size > torch.cuda.device_count():
+                continue
+            multi_process_parallel(world_size, self, self.eager_allreduce, quant_mode)
+
+    @ray.remote(num_gpus=1, max_calls=1)
+    def graph_allreduce(self, world_size, rank, distributed_init_port, quant_mode):
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+        os.environ["ROCM_QUICK_REDUCE_QUANTIZATION"] = quant_mode
+        os.environ["ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16"] = "0"
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+        init_distributed_environment(
+            world_size=world_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            local_rank=rank,
+        )
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        # A small all_reduce for warmup.
+        # this is needed because device communicators might be created lazily
+        # (e.g. NCCL). This will ensure that the communicator is initialized
+        # before any communication happens, so that this group can be used for
+        # graph capture immediately.
+        data = torch.zeros(1)
+        data = data.to(device=device)
+        torch.distributed.all_reduce(data, group=group)
+        torch.cuda.synchronize()
+        del data
+
+        for sz in self.TEST_SIZES:
+            for dtype in [torch.float16, torch.bfloat16]:
+                for _ in range(self.TEST_LOOP):
+                    with graph_capture() as graph_capture_context:
+                        # use integers so result matches NCCL exactly
+                        inp1 = torch.randint(
+                            1,
+                            23,
+                            (sz,),
+                            dtype=dtype,
+                            device=torch.cuda.current_device(),
+                        )
+                        inp2 = torch.randint(
+                            -23,
+                            1,
+                            (sz,),
+                            dtype=dtype,
+                            device=torch.cuda.current_device(),
+                        )
+                        torch.cuda.synchronize()
+                        graph = torch.cuda.CUDAGraph()
+                        with torch.cuda.graph(
+                            graph, stream=graph_capture_context.stream
+                        ):
+                            out1 = tensor_model_parallel_all_reduce(inp1)
+                            # the input buffer is immediately modified to test
+                            # synchronization
+                            dist.all_reduce(inp1, group=group)
+                            out2 = tensor_model_parallel_all_reduce(inp2)
+                            dist.all_reduce(inp2, group=group)
+                    graph.replay()
+                    atol = 1.25 * world_size
+                    rtol = 0.5 * world_size
+                    for inp, out in [[inp1, out1], [inp2, out2]]:
+                        torch.testing.assert_close(out, inp, atol=atol, rtol=rtol)
+                        # try:
+                        #     torch.testing.assert_close(out, inp, atol=atol, rtol=rtol)
+                        # except AssertionError as e:
+                        #     print("Max abs diff:", (out - inp).abs().max())
+                        #     print("Max rel diff:", ((out - inp).abs() / inp.abs().clamp(min=1e-5)).max())
+
+    @ray.remote(num_gpus=1, max_calls=1)
+    def eager_allreduce(self, world_size, rank, distributed_init_port, quant_mode):
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+        os.environ["ROCM_QUICK_REDUCE_QUANTIZATION"] = quant_mode
+        os.environ["ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16"] = "0"
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        distributed_init_method = f"tcp://localhost:{distributed_init_port}"
+        init_distributed_environment(
+            world_size=world_size,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            local_rank=rank,
+        )
+        initialize_model_parallel(tensor_model_parallel_size=world_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        for sz in self.TEST_SIZES:
+            for dtype in [torch.float16, torch.bfloat16]:
+                for _ in range(self.TEST_LOOP):
+                    inp1 = torch.randint(
+                        1,
+                        23,
+                        (sz,),
+                        dtype=dtype,
+                        device=torch.cuda.current_device(),
+                    )
+                    out1 = tensor_model_parallel_all_reduce(inp1)
+                    dist.all_reduce(inp1, group=group)
+                    atol = 1.25 * world_size
+                    rtol = 0.5 * world_size
+                    torch.testing.assert_close(out1, inp1, atol=atol, rtol=rtol)
+                    # try:
+                    #     torch.testing.assert_close(out1, inp1, atol=atol, rtol=rtol)
+                    # except AssertionError as e:
+                    #     print("Max abs diff:", (out1 - inp1).abs().max())
+                    #     print("Max rel diff:", ((out1 - inp1).abs() / inp1.abs().clamp(min=1e-5)).max())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_radix_attention.py b/test/srt/test_radix_attention.py
new file mode 100644
index 000000000..66f948621
--- /dev/null
+++ b/test/srt/test_radix_attention.py
@@ -0,0 +1,136 @@
+import os
+import random
+import unittest
+
+import requests
+
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    kill_process_tree,
+    popen_launch_server,
+)
+
+
+def gen_radix_tree(num_nodes=400, chunk_len=256):
+    num0 = num_nodes // 2
+    num1 = num_nodes - num0
+    nodes = [{"input_ids": [37] * 117, "decode_len": 217}]
+    for _ in range(num0):
+        parent = random.choice(nodes)
+        unique_len = random.randint(0, chunk_len)
+        decode_len = random.randint(0, chunk_len)
+        token_id = random.randint(0, 32000)
+        child = {
+            "input_ids": parent["input_ids"] + [token_id] * unique_len,
+            "decode_len": decode_len,
+        }
+        nodes.append(child)
+
+    while num1 > 0:
+        num_branch = random.randint(1, min(num1, 10))
+        parent = random.choice(nodes)
+        for _ in range(num_branch):
+            unique_len = random.randint(0, chunk_len)
+            decode_len = random.randint(0, chunk_len)
+            token_id = random.randint(0, 32000)
+            child = {
+                "input_ids": parent["input_ids"] + [token_id] * unique_len,
+                "decode_len": decode_len,
+            }
+            nodes.append(child)
+
+        num1 -= num_branch
+
+    random.shuffle(nodes)
+    return nodes
+
+
+def run_test(base_url, nodes):
+    data = {
+        "input_ids": [node["input_ids"] for node in nodes],
+        "sampling_params": [
+            {"max_new_tokens": node["decode_len"], "temperature": 0} for node in nodes
+        ],
+    }
+
+    res = requests.post(base_url + "/generate", json=data)
+    assert res.status_code == 200
+
+
+class TestRadixCacheFCFS(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--chunked-prefill-size",
+                "128",
+                "--max-total-tokens",
+                "20000",
+                "--schedule-policy",
+                "fcfs",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_radix_attention(self):
+        nodes = gen_radix_tree()
+        run_test(self.base_url, nodes)
+
+
+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
+class TestRadixCacheLPM(TestRadixCacheFCFS):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--chunked-prefill-size",
+                "128",
+                "--max-total-tokens",
+                "20000",
+                "--schedule-policy",
+                "lpm",
+            ],
+        )
+
+
+class TestRadixCacheNonOverlapLPM(TestRadixCacheFCFS):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--disable-overlap-schedule",
+                "--chunked-prefill-size",
+                "128",
+                "--max-total-tokens",
+                "20000",
+                "--schedule-policy",
+                "lpm",
+            ],
+        )
+
+
+if __name__ == "__main__":
+    os.environ["SGLANG_TEST_RETRACT"] = "true"
+    unittest.main()
diff --git a/test/srt/test_reasoning_parser.py b/test/srt/test_reasoning_parser.py
new file mode 100644
index 000000000..7d3f2a139
--- /dev/null
+++ b/test/srt/test_reasoning_parser.py
@@ -0,0 +1,603 @@
+import unittest
+
+from sglang.srt.parser.reasoning_parser import (
+    BaseReasoningFormatDetector,
+    DeepSeekR1Detector,
+    KimiDetector,
+    Qwen3Detector,
+    ReasoningParser,
+    StreamingParseResult,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestStreamingParseResult(CustomTestCase):
+    def test_init_default(self):
+        """Test default initialization of StreamingParseResult."""
+        result = StreamingParseResult()
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(result.reasoning_text, "")
+
+    def test_init_with_values(self):
+        """Test initialization with specific values."""
+        result = StreamingParseResult("normal", "reasoning")
+        self.assertEqual(result.normal_text, "normal")
+        self.assertEqual(result.reasoning_text, "reasoning")
+
+
+class TestBaseReasoningFormatDetector(CustomTestCase):
+    def setUp(self):
+        self.detector = BaseReasoningFormatDetector(
+            think_start_token="<think>",
+            think_end_token="</think>",
+            force_reasoning=False,
+            stream_reasoning=True,
+        )
+
+    def test_init(self):
+        """Test initialization of BaseReasoningFormatDetector."""
+        self.assertEqual(self.detector.think_start_token, "<think>")
+        self.assertEqual(self.detector.think_end_token, "</think>")
+        self.assertFalse(self.detector._in_reasoning)
+        self.assertTrue(self.detector.stream_reasoning)
+        self.assertEqual(self.detector._buffer, "")
+        self.assertFalse(self.detector.stripped_think_start)
+
+    def test_detect_and_parse_normal_text(self):
+        """Test parsing normal text without reasoning."""
+        text = "This is normal text"
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.normal_text, text)
+        self.assertEqual(result.reasoning_text, "")
+
+    def test_detect_and_parse_with_start_token(self):
+        """Test parsing text starting with think token."""
+        text = "<think>This is reasoning"
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.reasoning_text, "This is reasoning")
+        self.assertEqual(result.normal_text, "")
+
+    def test_detect_and_parse_complete_reasoning(self):
+        """Test parsing complete reasoning block."""
+        text = "<think>This is reasoning</think>This is normal"
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.reasoning_text, "This is reasoning")
+        self.assertEqual(result.normal_text, "This is normal")
+
+    def test_detect_and_parse_force_reasoning(self):
+        """Test forced reasoning mode."""
+        detector = BaseReasoningFormatDetector(
+            "<think>", "</think>", force_reasoning=True
+        )
+        text = "This should be reasoning"
+        result = detector.detect_and_parse(text)
+        self.assertEqual(result.reasoning_text, "This should be reasoning")
+        self.assertEqual(result.normal_text, "")
+
+    def test_parse_streaming_increment_normal(self):
+        """Test streaming parse of normal text."""
+        result = self.detector.parse_streaming_increment("Hello world")
+        self.assertEqual(result.normal_text, "Hello world")
+        self.assertEqual(result.reasoning_text, "")
+
+    def test_parse_streaming_increment_partial_token(self):
+        """Test streaming parse with partial token."""
+        # Test partial start token
+        result = self.detector.parse_streaming_increment("<thi")
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(result.reasoning_text, "")
+
+        # Reset detector and test partial end token when in reasoning mode
+        detector = BaseReasoningFormatDetector("<think>", "</think>")
+        detector._in_reasoning = True
+        result = detector.parse_streaming_increment("</thi")
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(result.reasoning_text, "")
+
+    def test_parse_streaming_increment_complete_start(self):
+        """Test streaming parse with complete start token."""
+        result = self.detector.parse_streaming_increment("<think>")
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(result.reasoning_text, "")
+        self.assertTrue(self.detector._in_reasoning)
+        self.assertTrue(self.detector.stripped_think_start)
+
+    def test_parse_streaming_increment_reasoning_content(self):
+        """Test streaming parse of reasoning content."""
+        # First add start token
+        self.detector.parse_streaming_increment("<think>")
+
+        # Then add reasoning content
+        result = self.detector.parse_streaming_increment("reasoning content")
+        self.assertEqual(result.reasoning_text, "reasoning content")
+        self.assertEqual(result.normal_text, "")
+
+    def test_parse_streaming_increment_end_token(self):
+        """Test streaming parse with end token."""
+        # Start reasoning mode
+        self.detector.parse_streaming_increment("<think>")
+        self.detector.parse_streaming_increment("reasoning")
+
+        # End reasoning - the reasoning content accumulated in previous calls is cleared when end token is found
+        result = self.detector.parse_streaming_increment("</think>normal text")
+        self.assertEqual(result.reasoning_text, "")  # Buffer cleared, returns empty
+        self.assertEqual(result.normal_text, "normal text")
+        self.assertFalse(self.detector._in_reasoning)
+
+    def test_parse_streaming_increment_no_stream_reasoning(self):
+        """Test streaming parse without streaming reasoning."""
+        detector = BaseReasoningFormatDetector(
+            "<think>", "</think>", stream_reasoning=False
+        )
+
+        # Start reasoning mode
+        detector.parse_streaming_increment("<think>")
+
+        # Add reasoning content - should not return content
+        result = detector.parse_streaming_increment("reasoning content")
+        self.assertEqual(result.reasoning_text, "")
+        self.assertEqual(result.normal_text, "")
+
+    def test_parse_streaming_increment_mixed_content(self):
+        """Test streaming parse with mixed content in one chunk."""
+        result = self.detector.parse_streaming_increment(
+            "<think>reasoning</think>normal"
+        )
+        self.assertEqual(result.reasoning_text, "reasoning")
+        self.assertEqual(result.normal_text, "normal")
+
+
+class TestDeepSeekR1Detector(CustomTestCase):
+    def setUp(self):
+        self.detector = DeepSeekR1Detector()
+
+    def test_init(self):
+        """Test DeepSeekR1Detector initialization."""
+        self.assertEqual(self.detector.think_start_token, "<think>")
+        self.assertEqual(self.detector.think_end_token, "</think>")
+        self.assertTrue(self.detector._in_reasoning)  # force_reasoning=True
+        self.assertTrue(self.detector.stream_reasoning)
+
+    def test_init_no_stream_reasoning(self):
+        """Test DeepSeekR1Detector with stream_reasoning=False."""
+        detector = DeepSeekR1Detector(stream_reasoning=False)
+        self.assertFalse(detector.stream_reasoning)
+
+    def test_detect_and_parse_r1_format(self):
+        """Test parsing DeepSeek-R1 format."""
+        text = "I need to think about this. The answer is 42."
+        result = self.detector.detect_and_parse(text)
+        # Should be treated as reasoning because force_reasoning=True
+        self.assertEqual(
+            result.reasoning_text, "I need to think about this. The answer is 42."
+        )
+        self.assertEqual(result.normal_text, "")
+
+    def test_detect_and_parse_with_end_token(self):
+        """Test parsing with end token."""
+        text = "I think this is the answer</think>The final answer is 42."
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.reasoning_text, "I think this is the answer")
+        self.assertEqual(result.normal_text, "The final answer is 42.")
+
+    def test_detect_and_parse_with_start_token(self):
+        """Test parsing deepseek-ai/DeepSeek-R1-0528 format, which generates the <think> token."""
+        text = "<think>I need to think about this.</think>The answer is 42."
+        result = self.detector.detect_and_parse(text)
+        # Should be treated as reasoning because force_reasoning=True
+        self.assertEqual(result.reasoning_text, "I need to think about this.")
+        self.assertEqual(result.normal_text, "The answer is 42.")
+
+
+class TestQwen3Detector(CustomTestCase):
+    def setUp(self):
+        self.detector = Qwen3Detector()
+
+    def test_init(self):
+        """Test Qwen3Detector initialization."""
+        self.assertEqual(self.detector.think_start_token, "<think>")
+        self.assertEqual(self.detector.think_end_token, "</think>")
+        self.assertFalse(self.detector._in_reasoning)  # force_reasoning=False
+        self.assertTrue(self.detector.stream_reasoning)
+
+    def test_detect_and_parse_qwen3_format(self):
+        """Test parsing Qwen3 format."""
+        text = "<think>Let me think about this problem</think>The answer is 42."
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.reasoning_text, "Let me think about this problem")
+        self.assertEqual(result.normal_text, "The answer is 42.")
+
+    def test_detect_and_parse_without_thinking(self):
+        """Test parsing without thinking (enable_thinking=False case)."""
+        text = "Direct answer without thinking."
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.normal_text, text)
+        self.assertEqual(result.reasoning_text, "")
+
+
+class TestQwen3ForcedReasoningDetector(CustomTestCase):
+    def setUp(self):
+        self.detector = Qwen3Detector(force_reasoning=True)
+
+    def test_init(self):
+        """Test Qwen3ForcedReasoningDetector initialization."""
+        self.assertEqual(self.detector.think_start_token, "<think>")
+        self.assertEqual(self.detector.think_end_token, "</think>")
+        self.assertTrue(self.detector._in_reasoning)  # force_reasoning=True
+        self.assertTrue(self.detector.stream_reasoning)
+
+    def test_detect_and_parse_qwen3_forced_reasoning_format(self):
+        """Test parsing Qwen3-ForcedReasoning format (no <think> start tag)."""
+        text = "I need to think about this step by step.</think>The answer is 42."
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(
+            result.reasoning_text, "I need to think about this step by step."
+        )
+        self.assertEqual(result.normal_text, "The answer is 42.")
+
+    def test_detect_and_parse_with_start_token(self):
+        """Test parsing Qwen3-ForcedReasoning with optional <think> start tag."""
+        text = "<think>I need to think about this.</think>The answer is 42."
+        result = self.detector.detect_and_parse(text)
+        # Should work because base class logic handles both force_reasoning=True OR start token
+        self.assertEqual(result.reasoning_text, "I need to think about this.")
+        self.assertEqual(result.normal_text, "The answer is 42.")
+
+    def test_streaming_qwen3_forced_reasoning_format(self):
+        """Test streaming parse of Qwen3-ForcedReasoning format."""
+        # First chunk without <think> start
+        result = self.detector.parse_streaming_increment("I need to")
+        self.assertEqual(result.reasoning_text, "I need to")
+        self.assertEqual(result.normal_text, "")
+
+        # More reasoning content
+        result = self.detector.parse_streaming_increment(" think about this.")
+        self.assertEqual(result.reasoning_text, " think about this.")
+        self.assertEqual(result.normal_text, "")
+
+        # End token with normal text
+        result = self.detector.parse_streaming_increment("</think>The answer is 42.")
+        self.assertEqual(result.reasoning_text, "")  # Buffer cleared
+        self.assertEqual(result.normal_text, "The answer is 42.")
+
+
+class TestKimiDetector(CustomTestCase):
+    def setUp(self):
+        self.detector = KimiDetector()
+
+    def test_init(self):
+        """Test KimiDetector initialization."""
+        self.assertEqual(self.detector.think_start_token, "◁think▷")
+        self.assertEqual(self.detector.think_end_token, "◁/think▷")
+        self.assertFalse(self.detector._in_reasoning)
+        self.assertTrue(self.detector.stream_reasoning)
+
+    def test_detect_and_parse_kimi_format(self):
+        """Test parsing Kimi format."""
+        text = "◁think▷Let me consider this carefully◁/think▷The answer is 42."
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.reasoning_text, "Let me consider this carefully")
+        self.assertEqual(result.normal_text, "The answer is 42.")
+
+    def test_detect_and_parse_kimi_no_thinking(self):
+        """Test parsing Kimi format without thinking."""
+        text = "Direct answer without thinking tokens."
+        result = self.detector.detect_and_parse(text)
+        self.assertEqual(result.normal_text, text)
+        self.assertEqual(result.reasoning_text, "")
+
+    def test_streaming_kimi_format(self):
+        """Test streaming parse of Kimi format."""
+        # Test partial token
+        result = self.detector.parse_streaming_increment("◁thi")
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(result.reasoning_text, "")
+
+        # Complete start token
+        result = self.detector.parse_streaming_increment("nk▷Start")
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(result.reasoning_text, "Start")
+        self.assertTrue(self.detector._in_reasoning)
+
+        # Add reasoning content
+        result = self.detector.parse_streaming_increment("thinking...")
+        self.assertEqual(result.reasoning_text, "thinking...")
+        self.assertEqual(result.normal_text, "")
+
+        # End token - reasoning content is cleared when end token is processed
+        result = self.detector.parse_streaming_increment("◁/think▷answer")
+        self.assertEqual(result.reasoning_text, "")  # Buffer cleared
+        self.assertEqual(result.normal_text, "answer")
+
+
+class TestReasoningParser(CustomTestCase):
+    def test_init_valid_model(self):
+        """Test initialization with valid model types."""
+        parser = ReasoningParser("deepseek-r1")
+        self.assertIsInstance(parser.detector, DeepSeekR1Detector)
+
+        parser = ReasoningParser("qwen3")
+        self.assertIsInstance(parser.detector, Qwen3Detector)
+
+        parser = ReasoningParser("kimi")
+        self.assertIsInstance(parser.detector, KimiDetector)
+
+    def test_init_invalid_model(self):
+        """Test initialization with invalid model type."""
+        with self.assertRaises(ValueError) as context:
+            ReasoningParser("invalid-model")
+        self.assertIn("Unsupported model type", str(context.exception))
+
+    def test_init_no_model(self):
+        """Test initialization without model type."""
+        with self.assertRaises(ValueError) as context:
+            ReasoningParser(None)
+        self.assertEqual(str(context.exception), "Model type must be specified")
+
+    def test_parse_non_stream(self):
+        """Test non-streaming parsing."""
+        parser = ReasoningParser("qwen3")
+        reasoning, normal = parser.parse_non_stream(
+            "<think>Let me think</think>The answer is 42."
+        )
+        self.assertEqual(reasoning, "Let me think")
+        self.assertEqual(normal, "The answer is 42.")
+
+    def test_parse_stream_chunk(self):
+        """Test streaming chunk parsing."""
+        parser = ReasoningParser("qwen3")
+
+        # First chunk with start token
+        reasoning, normal = parser.parse_stream_chunk("<think>")
+        self.assertEqual(reasoning, "")
+        self.assertEqual(normal, "")
+
+        # Second chunk with reasoning content
+        reasoning, normal = parser.parse_stream_chunk("thinking...")
+        self.assertEqual(reasoning, "thinking...")
+        self.assertEqual(normal, "")
+
+        # Third chunk with end token and normal text
+        reasoning, normal = parser.parse_stream_chunk("</think>answer")
+        self.assertEqual(reasoning, "")  # Buffer cleared when end token processed
+        self.assertEqual(normal, "answer")
+
+    def test_case_insensitive_model_type(self):
+        """Test case insensitive model type matching."""
+        parser1 = ReasoningParser("DeepSeek-R1")
+        parser2 = ReasoningParser("QWEN3")
+        parser3 = ReasoningParser("Kimi")
+
+        self.assertIsInstance(parser1.detector, DeepSeekR1Detector)
+        self.assertIsInstance(parser2.detector, Qwen3Detector)
+        self.assertIsInstance(parser3.detector, KimiDetector)
+
+    def test_stream_reasoning_parameter(self):
+        """Test stream_reasoning parameter is passed correctly."""
+        parser = ReasoningParser("qwen3", stream_reasoning=False)
+        self.assertFalse(parser.detector.stream_reasoning)
+
+        parser = ReasoningParser("qwen3", stream_reasoning=True)
+        self.assertTrue(parser.detector.stream_reasoning)
+
+
+class TestIntegrationScenarios(CustomTestCase):
+    """Integration tests for realistic usage scenarios."""
+
+    def test_deepseek_r1_complete_response(self):
+        """Test complete DeepSeek-R1 response parsing."""
+        parser = ReasoningParser("deepseek-r1")
+        text = "I need to solve this step by step. First, I'll analyze the problem. The given equation is x + 2 = 5. To solve for x, I subtract 2 from both sides: x = 5 - 2 = 3.</think>The answer is x = 3."
+
+        reasoning, normal = parser.parse_non_stream(text)
+        self.assertIn("step by step", reasoning)
+        self.assertIn(
+            "= 3", reasoning
+        )  # The reasoning contains "x = 5 - 2 = 3" which has "= 3"
+        self.assertEqual(normal, "The answer is x = 3.")
+
+    def test_qwen3_streaming_scenario(self):
+        """Test Qwen3 streaming scenario."""
+        parser = ReasoningParser("qwen3")
+
+        chunks = [
+            "<think>",
+            "Let me analyze this problem.",
+            " I need to consider multiple factors.",
+            "</think>",
+            "Based on my analysis, the solution is to use a different approach.",
+        ]
+
+        all_reasoning = ""
+        all_normal = ""
+
+        for chunk in chunks:
+            reasoning, normal = parser.parse_stream_chunk(chunk)
+            all_reasoning += reasoning
+            all_normal += normal
+
+        self.assertIn("analyze", all_reasoning)
+        self.assertIn("multiple factors", all_reasoning)
+        self.assertIn("different approach", all_normal)
+
+    def test_kimi_streaming_scenario(self):
+        """Test Kimi streaming scenario."""
+        parser = ReasoningParser("kimi")
+        chunks = [
+            "◁thi",
+            "nk▷",
+            "Let me analyze this problem.",
+            " I need to consider multiple factors.",
+            "◁/th",
+            "ink▷",
+            "The answer is 42.",
+        ]
+        all_reasoning = ""
+        all_normal = ""
+        for chunk in chunks:
+            reasoning, normal = parser.parse_stream_chunk(chunk)
+            all_reasoning += reasoning
+            all_normal += normal
+
+        self.assertIn("analyze", all_reasoning)
+        self.assertIn("multiple factors", all_reasoning)
+        self.assertIn("42", all_normal)
+
+    def test_empty_reasoning_blocks(self):
+        """Test handling of empty reasoning blocks."""
+        parser = ReasoningParser("qwen3")
+        text = "<think></think>Just the answer."
+
+        reasoning, normal = parser.parse_non_stream(text)
+        self.assertEqual(reasoning, "")
+        self.assertEqual(normal, "Just the answer.")
+
+    def test_qwen3_forced_reasoning_complete_response(self):
+        """Test complete Qwen3-ForcedReasoning response parsing."""
+        parser = ReasoningParser("qwen3", force_reasoning=True)
+        text = "Let me solve this step by step. The equation is x + 2 = 5. Subtracting 2 from both sides gives x = 3.</think>The solution is x = 3."
+
+        reasoning, normal = parser.parse_non_stream(text)
+        self.assertIn("step by step", reasoning)
+        self.assertIn("x = 3", reasoning)
+        self.assertEqual(normal, "The solution is x = 3.")
+
+    def test_qwen3_forced_reasoning_streaming_scenario(self):
+        """Test Qwen3-ForcedReasoning streaming scenario."""
+        parser = ReasoningParser("qwen3", force_reasoning=True)
+
+        chunks = [
+            "I need to analyze",
+            " this problem carefully.",
+            " Let me break it down.",
+            "</think>",
+            "The final answer is 42.",
+        ]
+
+        all_reasoning = ""
+        all_normal = ""
+
+        for chunk in chunks:
+            reasoning, normal = parser.parse_stream_chunk(chunk)
+            all_reasoning += reasoning
+            all_normal += normal
+
+        self.assertIn("analyze", all_reasoning)
+        self.assertIn("break it down", all_reasoning)
+        self.assertIn("final answer", all_normal)
+
+
+class TestBufferLossBugFix(CustomTestCase):
+    """Test cases for the buffer loss bug fix in parse_streaming_increment."""
+
+    def test_partial_end_tag_buffer_loss_bug(self):
+        """
+        Test the bug where partial end tag fragments are lost when followed by normal text.
+
+        Bug scenario:
+        1. _in_reasoning is False
+        2. new_text is "</" (part of closing thinking tag)
+        3. Fragment is stored in buffer and empty string is returned
+        4. Next step: new_text is "answer", _in_reasoning still False
+        5. Buffer is cleared and "answer" is returned directly
+        6. The "</" from previous step is lost
+
+        This test verifies the fix where line 108 was changed from:
+        return StreamingParseResult(normal_text=new_text)
+        to:
+        return StreamingParseResult(normal_text=current_text)
+        """
+        detector = BaseReasoningFormatDetector("<think>", "</think>")
+
+        # Step 1: Send partial end tag when not in reasoning mode
+        # This should be buffered since it could be start of "</think>"
+        result1 = detector.parse_streaming_increment("</")
+        self.assertEqual(result1.normal_text, "")
+        self.assertEqual(result1.reasoning_text, "")
+
+        # Step 2: Send normal text that doesn't complete the end tag
+        # Before fix: would return only "answer", losing the "</"
+        # After fix: should return the complete buffered content "</answer"
+        result2 = detector.parse_streaming_increment("answer")
+        self.assertEqual(result2.normal_text, "</answer")
+        self.assertEqual(result2.reasoning_text, "")
+
+    def test_partial_start_tag_buffer_preservation(self):
+        """
+        Test that partial start tag fragments are properly preserved.
+        """
+        detector = BaseReasoningFormatDetector("<think>", "</think>")
+
+        # Send partial start tag
+        result1 = detector.parse_streaming_increment("<th")
+        self.assertEqual(result1.normal_text, "")
+        self.assertEqual(result1.reasoning_text, "")
+
+        # Complete with non-matching text
+        result2 = detector.parse_streaming_increment("is is text")
+        self.assertEqual(result2.normal_text, "<this is text")
+        self.assertEqual(result2.reasoning_text, "")
+
+    def test_partial_end_tag_in_reasoning_mode(self):
+        """
+        Test partial end tag handling when already in reasoning mode.
+        """
+        detector = BaseReasoningFormatDetector("<think>", "</think>")
+
+        # Enter reasoning mode
+        detector.parse_streaming_increment("<think>")
+        detector.parse_streaming_increment("some reasoning")
+
+        # Send partial end tag
+        result1 = detector.parse_streaming_increment("</")
+        self.assertEqual(result1.normal_text, "")
+        self.assertEqual(result1.reasoning_text, "")
+
+        # Complete the end tag with normal text
+        result2 = detector.parse_streaming_increment("think>normal text")
+        self.assertEqual(result2.normal_text, "normal text")
+        # The reasoning text should be empty since buffer was cleared when end tag was processed
+        self.assertEqual(result2.reasoning_text, "")
+
+    def test_multiple_partial_fragments(self):
+        """
+        Test handling of multiple partial fragments that don't match any tokens.
+        """
+        detector = BaseReasoningFormatDetector("<think>", "</think>")
+
+        # Send multiple partial fragments
+        result1 = detector.parse_streaming_increment("<")
+        self.assertEqual(result1.normal_text, "")
+        self.assertEqual(result1.reasoning_text, "")
+
+        result2 = detector.parse_streaming_increment("/")
+        self.assertEqual(result2.normal_text, "")
+        self.assertEqual(result2.reasoning_text, "")
+
+        result3 = detector.parse_streaming_increment("random>")
+        self.assertEqual(result3.normal_text, "</random>")
+        self.assertEqual(result3.reasoning_text, "")
+
+    def test_edge_case_exact_token_match(self):
+        """
+        Test edge case where buffer content exactly matches a token.
+        """
+        detector = BaseReasoningFormatDetector("<think>", "</think>")
+
+        # Build up the exact start token character by character
+        detector.parse_streaming_increment("<")
+        detector.parse_streaming_increment("t")
+        detector.parse_streaming_increment("h")
+        detector.parse_streaming_increment("i")
+        detector.parse_streaming_increment("n")
+        result = detector.parse_streaming_increment("k>")
+
+        # Should enter reasoning mode
+        self.assertEqual(result.normal_text, "")
+        self.assertEqual(result.reasoning_text, "")
+        self.assertTrue(detector._in_reasoning)
+        self.assertTrue(detector.stripped_think_start)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_regex_constrained.py b/test/srt/test_regex_constrained.py
new file mode 100644
index 000000000..b0fb49796
--- /dev/null
+++ b/test/srt/test_regex_constrained.py
@@ -0,0 +1,187 @@
+"""
+python3 -m unittest test_regex_constrained.TestRegexConstrained.test_regex_generate_email
+python3 -m unittest test_regex_constrained.TestRegexConstrained.test_regex_generate_greeting
+python3 -m unittest test_regex_constrained.TestRegexConstrainedLLGuidance.test_regex_generate_email
+python3 -m unittest test_regex_constrained.TestRegexConstrainedLLGuidance.test_regex_generate_greeting
+"""
+
+import json
+import unittest
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+def setup_class(cls, backend: str, disable_overlap: bool):
+    cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+    cls.base_url = DEFAULT_URL_FOR_TEST
+
+    other_args = [
+        "--max-running-requests",
+        "10",
+        "--grammar-backend",
+        backend,
+    ]
+
+    if disable_overlap:
+        other_args += ["--disable-overlap-schedule"]
+
+    cls.process = popen_launch_server(
+        cls.model,
+        cls.base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+
+
+class TestRegexConstrained(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, "xgrammar", disable_overlap=False)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(
+        self,
+        regex,
+        prompt,
+        return_logprob=False,
+        top_logprobs_num=0,
+        n=1,
+    ):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0 if n == 1 else 0.5,
+                    "max_new_tokens": 128,
+                    "n": n,
+                    "regex": regex,
+                },
+                "stream": False,
+                "return_logprob": return_logprob,
+                "top_logprobs_num": top_logprobs_num,
+                "logprob_start_len": 0,
+            },
+        )
+
+        ret = response.json()
+        print(json.dumps(ret, indent=2))
+        print("=" * 100)
+
+        if not isinstance(ret, list):
+            self.fail(f"Expected response to be a list, but got {type(ret)}")
+
+        for item in ret:
+            text = item.get("text", "").strip()
+            if not text:
+                self.fail("Generated text is empty.")
+
+            if not self.regex_match(text, regex):
+                self.fail(f"Text '{text}' does not match regex pattern.")
+
+    def regex_match(self, text, pattern):
+        import re
+
+        return re.match(pattern, text) is not None
+
+    def test_regex_generate_email(self):
+        pattern = r"^user@example\.com$"
+        prompt = "Generate an email address:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_regex_generate_greeting(self):
+        pattern = r"^(Hello|Hi|Hey)$"
+        prompt = "Generate a greeting:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_regex_generate_number(self):
+        pattern = r"^\d{3}$"
+        prompt = "Generate a three-digit number:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_regex_generate_phone(self):
+        pattern = r"^\(\d{3}\) \d{3}-\d{4}$"
+        prompt = "Generate a phone number:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_regex_generate_date(self):
+        pattern = r"^2024-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$"
+        prompt = "Generate a date in YYYY-MM-DD format:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_regex_generate_hex_color(self):
+        pattern = r"^#[0-9A-F]{6}$"
+        prompt = "Generate a hex color code:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_regex_generate_complex_json(self):
+        pattern = r'^\{\s*"name"\s*:\s*"[a-zA-Z0-9 ]+"\s*,\s*"age"\s*:\s*[1-9][0-9]*\s*,\s*"city"\s*:\s*"[a-zA-Z0-9 ]+"\s*\}$'
+        prompt = "Generate a simple JSON with name, age, and city:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+    def test_regex_generate_custom_log_format(self):
+        pattern = r"^\[2024-01-01T12:00:00Z\] INFO: System\.process - Operation [a-z]+ successfully$"
+        prompt = "Generate a log entry:"
+
+        self.run_decode(
+            regex=pattern,
+            prompt=prompt,
+            n=3,
+        )
+
+
+class TestRegexConstrainedLLGuidance(TestRegexConstrained):
+    @classmethod
+    def setUpClass(cls):
+        setup_class(cls, "llguidance", disable_overlap=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_release_memory_occupation.py b/test/srt/test_release_memory_occupation.py
new file mode 100644
index 000000000..35be029df
--- /dev/null
+++ b/test/srt/test_release_memory_occupation.py
@@ -0,0 +1,328 @@
+"""Test memory release and resume operations for SGLang engine in hybrid RL training.
+
+This test suite evaluates the SGLang engine's memory management capabilities, focusing
+on releasing and resuming memory occupation for KV cache and model weights. It simulates
+an RL workflow where the SGLang engine acts as a rollout engine for experience collection.
+The process involves initializing the engine, sending a small number of requests to simulate
+rollout, releasing memory to mimic offloading during RL training, resuming memory occupation,
+updating weights with a trained HuggingFace model, and verifying the updated weights.
+
+Detailed in our proposal (https://github.com/sgl-project/sglang/pull/7099), two test cases
+are included:
+
+1. Basic Release and Resume: Uses a lower mem_fraction_static (0.6) to control memory allocation
+and avoid OOM errors carefully. This test simulates a scenario without multi-stage memory management,
+ensuring the engine can release and resume memory occupation while maintaining functionality after
+weight updates.
+
+2. Multi-Stage Release and Resume: Employs a higher mem_fraction_static (0.85) to simulate higher
+memory pressure, leveraging multi-stage memory management. It sequentially releases and resumes
+KV cache and model weights, verifying memory deallocation and reallocation at each stage, and
+ensuring correct weight updates and text generation.
+
+3. Tensor Parallel Tests: Tests memory release and resume operations with different tensor parallel
+configurations (tp=1, tp=2) to ensure proper memory management in distributed settings. For different
+data parallel size, we test it in verl.
+"""
+
+import gc
+import os
+import time
+import unittest
+
+import torch
+from transformers import AutoModelForCausalLM
+
+import sglang as sgl
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGHTS
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT,
+    CustomTestCase,
+)
+
+# (temporarily) set to true to observe memory usage in nvidia-smi more clearly
+_DEBUG_EXTRA = False
+
+
+def get_gpu_memory_gb():
+    return torch.cuda.device_memory_used() / 1024**3
+
+
+class TestReleaseMemoryOccupation(CustomTestCase):
+    def _setup_engine(self, model_name, mem_fraction_static=0.8, tp_size=1, ep_size=1):
+        """Common setup for engine and HF model."""
+        engine = sgl.Engine(
+            model_path=model_name,
+            random_seed=42,
+            enable_memory_saver=True,
+            mem_fraction_static=mem_fraction_static,
+            tp_size=tp_size,
+            ep_size=ep_size,
+            # disable_cuda_graph=True,  # for debugging only
+        )
+
+        return engine
+
+    def _common_test_params(self):
+        """Common test parameters."""
+        return {
+            "prompt": "Today is a sunny day and I like",
+            "sampling_params": {"temperature": 0, "max_new_tokens": 8},
+            "expect_output_before_update_weights": " to spend it outdoors. I decided to",
+            "expect_output_after_update_weights": " to go for a walk. I like",
+            "prompt_moe": "The weather is nice today, and I want to",
+            "sampling_params_moe": {"temperature": 0, "max_new_tokens": 16},
+            "expect_output_before_update_weights_moe": " go to the park. I have a picnic basket, a book, and a",
+            "expect_output_after_update_weights_moe": " go to the park. I have a lot of things to do, but I",
+        }
+
+    def _test_initial_generation(
+        self, engine, prompt, sampling_params, expect_output_before_update_weights
+    ):
+        """Test initial generation and memory allocation."""
+        print("generate (#1)")
+        outputs = engine.generate(prompt, sampling_params)["text"]
+        self.assertEqual(outputs, expect_output_before_update_weights)
+
+        if _DEBUG_EXTRA:
+            time.sleep(3)
+
+    def test_release_and_resume_occupation(self):
+        # Without multi-stage release and resume, we need to carefully control the memory fraction to avoid OOM
+        model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        assert (
+            torch.cuda.device_count() >= 2
+        ), "Need at least 2 GPUs for tensor parallel tests"
+
+        for tp_size in [1, 2]:
+
+            print(f"Testing tp_size={tp_size} for test_release_and_resume_occupation")
+            engine = self._setup_engine(
+                model_name=model_name, mem_fraction_static=0.6, tp_size=tp_size
+            )
+            params = self._common_test_params()
+
+            self._test_initial_generation(
+                engine,
+                params["prompt"],
+                params["sampling_params"],
+                params["expect_output_before_update_weights"],
+            )
+
+            t = time.perf_counter()
+            gpu_memory_usage_before_release = get_gpu_memory_gb()
+            engine.release_memory_occupation()
+            gpu_memory_usage_after_release = get_gpu_memory_gb()
+
+            self.assertLess(
+                gpu_memory_usage_after_release,
+                gpu_memory_usage_before_release,
+            )
+
+            print(
+                f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
+            )
+
+            if _DEBUG_EXTRA:
+                time.sleep(3)
+
+            t = time.perf_counter()
+            engine.resume_memory_occupation()
+            print(
+                f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
+            )
+
+            hf_model_new = AutoModelForCausalLM.from_pretrained(
+                DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
+                torch_dtype="bfloat16",
+                device_map="cuda",
+            )
+            engine.update_weights_from_tensor(list(hf_model_new.named_parameters()))
+
+            # destroy the hf model
+            del hf_model_new
+            torch.cuda.empty_cache()
+
+            print("generate (#2)")
+            outputs = engine.generate(params["prompt"], params["sampling_params"])[
+                "text"
+            ]
+            self.assertEqual(outputs, params["expect_output_after_update_weights"])
+            engine.shutdown()
+
+    def test_multi_stage_release_and_resume(self):
+        # With multi-stage release and resume, we can set the memory fraction to 0.85 without concern of OOM
+        model_name = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+        for tp_size in [1, 2]:
+            if tp_size == 2 and torch.cuda.device_count() < 2:
+                continue
+
+            print(f"Testing tp_size={tp_size} for test_multi_stage_release_and_resume")
+            engine = sgl.Engine(
+                model_path=model_name,
+                random_seed=42,
+                enable_memory_saver=True,
+                mem_fraction_static=0.85,  # Higher memory pressure
+                tp_size=tp_size,
+            )
+            params = self._common_test_params()
+
+            self._test_initial_generation(
+                engine,
+                params["prompt"],
+                params["sampling_params"],
+                params["expect_output_before_update_weights"],
+            )
+
+            t = time.perf_counter()
+            gpu_memory_usage_before_release_kv_cache = get_gpu_memory_gb()
+            engine.release_memory_occupation(tags=[GPU_MEMORY_TYPE_KV_CACHE])
+
+            gpu_memory_usage_after_release_kv_cache = get_gpu_memory_gb()
+
+            self.assertLess(
+                gpu_memory_usage_after_release_kv_cache,
+                gpu_memory_usage_before_release_kv_cache,
+            )
+            engine.release_memory_occupation(tags=[GPU_MEMORY_TYPE_WEIGHTS])
+
+            gpu_memory_usage_after_release_weights = get_gpu_memory_gb()
+
+            self.assertLess(
+                gpu_memory_usage_after_release_weights,
+                gpu_memory_usage_after_release_kv_cache,
+            )
+
+            print(f"Release took {time.perf_counter() - t:.2f}s")
+            print(
+                f"Memory: {gpu_memory_usage_before_release_kv_cache:.1f} → {gpu_memory_usage_after_release_kv_cache:.1f} → {gpu_memory_usage_after_release_weights:.1f} GB"
+            )
+
+            if _DEBUG_EXTRA:
+                time.sleep(3)
+
+            t = time.perf_counter()
+            gpu_memory_usage_before_resume_weights = get_gpu_memory_gb()
+
+            # gpu_memory_usage_after_release_weights and gpu_memory_usage_before_resume_weights should be close
+
+            self.assertAlmostEqual(
+                gpu_memory_usage_after_release_weights,
+                gpu_memory_usage_before_resume_weights,
+                delta=3.0,
+            )
+            print(f"Resume weights took {time.perf_counter() - t:.2f}s")
+
+            engine.resume_memory_occupation(tags=[GPU_MEMORY_TYPE_WEIGHTS])
+            gpu_memory_usage_after_resume_weights = get_gpu_memory_gb()
+
+            self.assertGreater(
+                gpu_memory_usage_after_resume_weights,
+                gpu_memory_usage_before_resume_weights,
+            )
+
+            # Update weights from a trained model to serving engine, and then destroy the trained model
+            hf_model_new = AutoModelForCausalLM.from_pretrained(
+                DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE,
+                torch_dtype="bfloat16",
+                device_map="cuda",
+            )
+            gpu_memory_usage_after_loaded_hf_model = get_gpu_memory_gb()
+            engine.update_weights_from_tensor(list(hf_model_new.named_parameters()))
+
+            # destroy the hf model
+            del hf_model_new
+            torch.cuda.empty_cache()
+            engine.resume_memory_occupation(tags=[GPU_MEMORY_TYPE_KV_CACHE])
+
+            gpu_memory_usage_after_resume_kv_cache = get_gpu_memory_gb()
+            self.assertGreater(
+                gpu_memory_usage_after_resume_kv_cache,
+                gpu_memory_usage_after_resume_weights,
+            )
+
+            print(f"Resume + update took {time.perf_counter() - t:.2f}s")
+            print(
+                f"Memory: {gpu_memory_usage_before_resume_weights:.1f} → {gpu_memory_usage_after_resume_weights:.1f} → {gpu_memory_usage_after_loaded_hf_model:.1f} → {gpu_memory_usage_after_resume_kv_cache:.1f} GB"
+            )
+
+            print("generate (#2)")
+            outputs = engine.generate(params["prompt"], params["sampling_params"])[
+                "text"
+            ]
+            self.assertEqual(outputs, params["expect_output_after_update_weights"])
+            engine.shutdown()
+
+    def test_moe_model_release_and_resume(self):
+        # Test with MoE model
+        model_name = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT
+
+        tp_size = ep_size = 2
+
+        print(
+            f"Testing tp_size={tp_size} and ep_size={ep_size} for test_moe_model_release_and_resume"
+        )
+        engine = sgl.Engine(
+            model_path=model_name,
+            random_seed=42,
+            enable_memory_saver=True,
+            mem_fraction_static=0.5,
+            tp_size=tp_size,
+            ep_size=ep_size,
+        )
+        params = self._common_test_params()
+
+        self._test_initial_generation(
+            engine,
+            params["prompt_moe"],
+            params["sampling_params_moe"],
+            params["expect_output_before_update_weights_moe"],
+        )
+
+        t = time.perf_counter()
+        gpu_memory_usage_before_release = get_gpu_memory_gb()
+        engine.release_memory_occupation()
+        gpu_memory_usage_after_release = get_gpu_memory_gb()
+        self.assertLess(
+            gpu_memory_usage_after_release,
+            gpu_memory_usage_before_release,
+        )
+
+        print(
+            f"Release took {time.perf_counter() - t:.2f}s, memory: {gpu_memory_usage_before_release:.1f} GB → {gpu_memory_usage_after_release:.1f} GB"
+        )
+
+        if _DEBUG_EXTRA:
+            time.sleep(3)
+
+        t = time.perf_counter()
+        engine.resume_memory_occupation()
+        print(
+            f"Resume took {time.perf_counter() - t:.2f}s, memory: {get_gpu_memory_gb():.1f} GB"
+        )
+
+        hf_model_new = AutoModelForCausalLM.from_pretrained(
+            DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
+            torch_dtype="bfloat16",
+            device_map="cuda",
+        )
+        engine.update_weights_from_tensor(list(hf_model_new.named_parameters()))
+
+        # destroy the hf model
+        del hf_model_new
+        torch.cuda.empty_cache()
+
+        print("generate (#2)")
+        outputs = engine.generate(params["prompt_moe"], params["sampling_params_moe"])[
+            "text"
+        ]
+        self.assertEqual(outputs, params["expect_output_after_update_weights_moe"])
+        engine.shutdown()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_request_queue_validation.py b/test/srt/test_request_queue_validation.py
new file mode 100644
index 000000000..2a9739a1c
--- /dev/null
+++ b/test/srt/test_request_queue_validation.py
@@ -0,0 +1,87 @@
+import asyncio
+import os
+import re
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    STDERR_FILENAME,
+    STDOUT_FILENAME,
+    CustomTestCase,
+    popen_launch_server,
+    send_concurrent_generate_requests,
+    send_generate_requests,
+)
+
+
+class TestMaxQueuedRequests(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+        cls.stdout = open(STDOUT_FILENAME, "w")
+        cls.stderr = open(STDERR_FILENAME, "w")
+
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=(
+                "--max-running-requests",  # Enforce max request concurrency is 1
+                "1",
+                "--max-queued-requests",  # Enforce max queued request number is 1
+                "1",
+            ),
+            return_stdout_stderr=(cls.stdout, cls.stderr),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+        cls.stdout.close()
+        cls.stderr.close()
+        os.remove(STDOUT_FILENAME)
+        os.remove(STDERR_FILENAME)
+
+    def test_max_queued_requests_validation_with_serial_requests(self):
+        """Verify request is not throttled when the max concurrency is 1."""
+        status_codes = send_generate_requests(
+            self.base_url,
+            num_requests=10,
+        )
+
+        for status_code in status_codes:
+            assert status_code == 200  # request shouldn't be throttled
+
+    def test_max_queued_requests_validation_with_concurrent_requests(self):
+        """Verify request throttling with concurrent requests."""
+        status_codes = asyncio.run(
+            send_concurrent_generate_requests(self.base_url, num_requests=10)
+        )
+
+        assert 200 in status_codes
+        assert 503 in status_codes
+        assert all(status_code in [200, 503] for status_code in status_codes)
+
+    def test_max_running_requests_and_max_queued_request_validation(self):
+        """Verify running request and queued request numbers based on server logs."""
+        rr_pattern = re.compile(r"#running-req:\s*(\d+)")
+        qr_pattern = re.compile(r"#queue-req:\s*(\d+)")
+
+        with open(STDERR_FILENAME) as lines:
+            for line in lines:
+                rr_match, qr_match = rr_pattern.search(line), qr_pattern.search(line)
+                if rr_match:
+                    assert int(rr_match.group(1)) <= 1
+                if qr_match:
+                    assert int(qr_match.group(1)) <= 1
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_retract_decode.py b/test/srt/test_retract_decode.py
new file mode 100644
index 000000000..92f5ab915
--- /dev/null
+++ b/test/srt/test_retract_decode.py
@@ -0,0 +1,60 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestRetractDecode(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ["SGLANG_TEST_RETRACT"] = "1"
+
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+
+class TestRetractDecodeChunkCache(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ["SGLANG_TEST_RETRACT"] = "1"
+
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--disable-radix-cache", "--chunked-prefill-size", 128],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_rope_rocm.py b/test/srt/test_rope_rocm.py
new file mode 100644
index 000000000..5850e7061
--- /dev/null
+++ b/test/srt/test_rope_rocm.py
@@ -0,0 +1,116 @@
+import unittest
+
+import torch
+
+from sglang.srt.layers.rotary_embedding import RotaryEmbedding
+from sglang.srt.utils import get_bool_env_var, is_hip
+from sglang.test.test_utils import CustomTestCase
+
+torch.manual_seed(0)
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+
+_CASES = [
+    (64, 64, 32, 8000, True, torch.bfloat16, "cuda", 32, 32, 1, 1),
+    (256, 128, 4096, 10000, True, torch.bfloat16, "cuda", 2, 512, 4, 2),
+    (512, 128, 311, 10000, True, torch.bfloat16, "cuda", 3, 39, 4, 2),
+    (128, 128, 2048, 10000, False, torch.bfloat16, "cuda", 2, 512, 32, 8),
+    (128, 128, 2048, 10000, False, torch.bfloat16, "cuda", 2, 512, 16, 4),
+    (512, 128, 311, 10000, False, torch.bfloat16, "cuda", 3, 39, 4, 2),
+]
+
+
+@unittest.skipIf(_use_aiter, reason="SGLANG_USE_AITER=1 will not use vllm path.")
+class TestRotaryEmbeddingNative(CustomTestCase):
+    # Compare RotaryEmbedding.forward_hip() to forward_native().
+    def _run_case(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_pos: int,
+        base: int,
+        is_neox: bool,
+        dtype: torch.dtype,
+        device: str,
+        batch_size: int,
+        seq_len: int,
+        num_q: int,
+        num_kv: int,
+    ) -> None:
+        rope_ref = RotaryEmbedding(
+            head_size, rotary_dim, max_pos, base, is_neox, dtype
+        ).to(device)
+        rope_hip = RotaryEmbedding(
+            head_size, rotary_dim, max_pos, base, is_neox, dtype
+        ).to(device)
+
+        pos_ids = torch.arange(seq_len, device=device).repeat(batch_size)
+        query = torch.randn(
+            batch_size * seq_len, num_q * head_size, dtype=dtype, device=device
+        )
+        key = torch.randn(
+            batch_size * seq_len, num_kv * head_size, dtype=dtype, device=device
+        )
+
+        q_ref, k_ref = rope_ref.forward_native(pos_ids, query.clone(), key.clone())
+        q_hip, k_hip = rope_hip.forward_hip(pos_ids, query.clone(), key.clone())
+
+        torch.testing.assert_close(q_ref, q_hip, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(k_ref, k_hip, atol=1e-2, rtol=1e-2)
+
+    def test_all_cases(self) -> None:
+        """Drive over the full parameter matrix using subTest()."""
+        for case in _CASES:
+            with self.subTest(case=case):
+                self._run_case(*case)
+
+
+@unittest.skipIf(not _use_aiter, reason="Requires AMD GPU plus SGLANG_USE_AITER=1")
+class TestRotaryEmbeddingAITer(CustomTestCase):
+    @staticmethod
+    def _run_case_aiter(
+        head_size: int,
+        rotary_dim: int,
+        max_pos: int,
+        base: int,
+        is_neox: bool,
+        dtype: torch.dtype,
+        device: str,
+        batch_size: int,
+        seq_len: int,
+        num_q: int,
+        num_kv: int,
+    ) -> None:
+        from aiter.rotary_embedding import RotaryEmbedding as AiterRotaryEmbedding
+
+        rope_ref = AiterRotaryEmbedding(
+            head_size, rotary_dim, max_pos, base, is_neox, dtype
+        ).to(device)
+        rope_hip = AiterRotaryEmbedding(
+            head_size, rotary_dim, max_pos, base, is_neox, dtype
+        ).to(device)
+
+        pos_ids = torch.arange(seq_len, device=device).repeat(batch_size)
+        query = torch.randn(
+            batch_size * seq_len, num_q * head_size, dtype=dtype, device=device
+        )
+        key = torch.randn(
+            batch_size * seq_len, num_kv * head_size, dtype=dtype, device=device
+        )
+
+        q_ref, k_ref = rope_ref.forward_native(pos_ids, query.clone(), key.clone())
+        q_hip, k_hip = rope_hip.forward_hip(pos_ids, query.clone(), key.clone())
+
+        torch.testing.assert_close(q_ref, q_hip, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(k_ref, k_hip, atol=1e-2, rtol=1e-2)
+
+    def test_all_cases(self) -> None:
+        for case in _CASES:
+            with self.subTest(case=case):
+                self._run_case_aiter(*case)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_sagemaker_server.py b/test/srt/test_sagemaker_server.py
new file mode 100644
index 000000000..68688c112
--- /dev/null
+++ b/test/srt/test_sagemaker_server.py
@@ -0,0 +1,179 @@
+"""
+python3 -m unittest test_sagemaker_server.TestSageMakerServer.test_chat_completion
+"""
+
+import json
+import unittest
+
+import requests
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestSageMakerServer(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+        )
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_chat_completion(self, logprobs, parallel_sample_num):
+        data = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {
+                    "role": "user",
+                    "content": "What is the capital of France? Answer in a few words.",
+                },
+            ],
+            "temperature": 0,
+            "logprobs": logprobs is not None and logprobs > 0,
+            "top_logprobs": logprobs,
+            "n": parallel_sample_num,
+        }
+
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+
+        response = requests.post(
+            f"{self.base_url}/invocations", json=data, headers=headers
+        ).json()
+
+        if logprobs:
+            assert isinstance(
+                response["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0][
+                    "token"
+                ],
+                str,
+            )
+
+            ret_num_top_logprobs = len(
+                response["choices"][0]["logprobs"]["content"][0]["top_logprobs"]
+            )
+            assert (
+                ret_num_top_logprobs == logprobs
+            ), f"{ret_num_top_logprobs} vs {logprobs}"
+
+        assert len(response["choices"]) == parallel_sample_num
+        assert response["choices"][0]["message"]["role"] == "assistant"
+        assert isinstance(response["choices"][0]["message"]["content"], str)
+        assert response["id"]
+        assert response["created"]
+        assert response["usage"]["prompt_tokens"] > 0
+        assert response["usage"]["completion_tokens"] > 0
+        assert response["usage"]["total_tokens"] > 0
+
+    def run_chat_completion_stream(self, logprobs, parallel_sample_num=1):
+        data = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": "You are a helpful AI assistant"},
+                {
+                    "role": "user",
+                    "content": "What is the capital of France? Answer in a few words.",
+                },
+            ],
+            "temperature": 0,
+            "logprobs": logprobs is not None and logprobs > 0,
+            "top_logprobs": logprobs,
+            "stream": True,
+            "stream_options": {"include_usage": True},
+            "n": parallel_sample_num,
+        }
+
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+
+        response = requests.post(
+            f"{self.base_url}/invocations", json=data, stream=True, headers=headers
+        )
+
+        is_firsts = {}
+        for line in response.iter_lines():
+            line = line.decode("utf-8").replace("data: ", "")
+            if len(line) < 1 or line == "[DONE]":
+                continue
+            print(f"value: {line}")
+            line = json.loads(line)
+            usage = line.get("usage")
+            if usage is not None:
+                assert usage["prompt_tokens"] > 0
+                assert usage["completion_tokens"] > 0
+                assert usage["total_tokens"] > 0
+                continue
+
+            index = line.get("choices")[0].get("index")
+            data = line.get("choices")[0].get("delta")
+
+            if is_firsts.get(index, True):
+                assert data["role"] == "assistant"
+                is_firsts[index] = False
+                continue
+
+            if logprobs:
+                assert line.get("choices")[0].get("logprobs")
+                assert isinstance(
+                    line.get("choices")[0]
+                    .get("logprobs")
+                    .get("content")[0]
+                    .get("top_logprobs")[0]
+                    .get("token"),
+                    str,
+                )
+                assert isinstance(
+                    line.get("choices")[0]
+                    .get("logprobs")
+                    .get("content")[0]
+                    .get("top_logprobs"),
+                    list,
+                )
+                ret_num_top_logprobs = len(
+                    line.get("choices")[0]
+                    .get("logprobs")
+                    .get("content")[0]
+                    .get("top_logprobs")
+                )
+                assert (
+                    ret_num_top_logprobs == logprobs
+                ), f"{ret_num_top_logprobs} vs {logprobs}"
+
+            assert isinstance(data["content"], str)
+            assert line["id"]
+            assert line["created"]
+
+        for index in [i for i in range(parallel_sample_num)]:
+            assert not is_firsts.get(
+                index, True
+            ), f"index {index} is not found in the response"
+
+    def test_chat_completion(self):
+        for logprobs in [None, 5]:
+            for parallel_sample_num in [1, 2]:
+                self.run_chat_completion(logprobs, parallel_sample_num)
+
+    def test_chat_completion_stream(self):
+        for logprobs in [None, 5]:
+            for parallel_sample_num in [1, 2]:
+                self.run_chat_completion_stream(logprobs, parallel_sample_num)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_schedule_policy.py b/test/srt/test_schedule_policy.py
new file mode 100644
index 000000000..4a4f57b35
--- /dev/null
+++ b/test/srt/test_schedule_policy.py
@@ -0,0 +1,65 @@
+import unittest
+
+from sglang.srt.managers.schedule_batch import Req
+from sglang.srt.managers.schedule_policy import (
+    CacheAgnosticPolicy,
+    CacheAwarePolicy,
+    SchedulePolicy,
+)
+from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestSchedulePolicy(CustomTestCase):
+
+    def setUp(self):
+        self.tree_cache = RadixCache(None, None, False)
+
+    def test_init_with_cache_aware_policy(self):
+        policy = SchedulePolicy(
+            policy="lpm", tree_cache=self.tree_cache, enable_hierarchical_cache=True
+        )
+        self.assertEqual(policy.policy, CacheAwarePolicy.LPM)
+
+    def test_init_with_cache_agnostic_policy(self):
+        policy = SchedulePolicy(
+            policy="fcfs", tree_cache=self.tree_cache, enable_hierarchical_cache=True
+        )
+        self.assertEqual(policy.policy, CacheAgnosticPolicy.FCFS)
+
+    def test_init_with_unknown_policy(self):
+        with self.assertRaises(ValueError):
+            SchedulePolicy(
+                policy="invalid",
+                tree_cache=self.tree_cache,
+                enable_hierarchical_cache=True,
+            )
+
+    def test_init_with_disabled_cache(self):
+        disabled_tree_cache = RadixCache(None, None, disable=True, page_size=1)
+        policy = SchedulePolicy(
+            policy="lpm", tree_cache=disabled_tree_cache, enable_hierarchical_cache=True
+        )
+        self.assertEqual(policy.policy, CacheAgnosticPolicy.FCFS)
+
+    def test_calc_priority_fcfs(self):
+        tree_cache = RadixCache(None, None, False)
+        waiting_queue = [
+            Req(1, "a b", [1, 2], SamplingParams()),
+            Req(3, "a b c", [1, 2, 3], SamplingParams()),
+            Req(2, "a", [1], SamplingParams()),
+        ]
+
+        policy = SchedulePolicy(
+            policy="fcfs", tree_cache=tree_cache, enable_hierarchical_cache=True
+        )
+        policy.calc_priority(waiting_queue)
+        # Check if FCFS keeps the original order
+        self.assertEqual(waiting_queue[0].rid, 1)
+        self.assertEqual(waiting_queue[1].rid, 3)
+        self.assertEqual(waiting_queue[2].rid, 2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_score_api.py b/test/srt/test_score_api.py
new file mode 100644
index 000000000..d08ae9df7
--- /dev/null
+++ b/test/srt/test_score_api.py
@@ -0,0 +1,300 @@
+import unittest
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from sglang.srt.entrypoints.engine import Engine
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
+
+TEST_MODEL_NAME = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+
+class TestScoreAPI(CustomTestCase):
+    """Test the scoring API functionality."""
+
+    def setUp(self):
+        """Set up each test case."""
+        self.engine = Engine(model_path=TEST_MODEL_NAME)
+
+    def tearDown(self):
+        """Clean up after each test case."""
+        if self.engine is not None:
+            self.engine.shutdown()
+            torch.cuda.empty_cache()
+
+    def compute_hf_scores(
+        self, query, items, label_token_ids, apply_softmax=False, item_first=False
+    ):
+        """Compute scores using direct HuggingFace model inference.
+        Returns probabilities for each token ID, optionally normalized with softmax.
+
+        Args:
+            query: The query text
+            items: List of item texts
+            label_token_ids: List of token IDs to compute probabilities for
+            apply_softmax: Whether to normalize probabilities using softmax
+            item_first: If True, prepend items to query. Otherwise append items to query.
+        """
+        # Initialize HF model and tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            TEST_MODEL_NAME, trust_remote_code=True
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            TEST_MODEL_NAME, trust_remote_code=True
+        )
+
+        try:
+            scores = []
+            for item in items:
+                # Construct full text based on item_first parameter
+                full_text = f"{item}{query}" if item_first else f"{query}{item}"
+                inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
+
+                # Get logits for the last token
+                with torch.no_grad():
+                    outputs = model(**inputs)
+                    last_token_logits = outputs.logits[0, -1]
+
+                # Get logits for just our target tokens
+                target_logits = last_token_logits[label_token_ids]
+
+                # Apply softmax over just the target tokens
+                target_probs = torch.softmax(target_logits, dim=-1)
+
+                # Convert to list of probabilities in order of label_token_ids
+                probs = [target_probs[i].item() for i in range(len(label_token_ids))]
+
+                scores.append(probs)
+
+            return scores
+        finally:
+            # Clean up HF resources
+            model.cpu()
+            del model
+            del tokenizer
+            torch.cuda.empty_cache()
+
+    def _get_token_ids(self, tokens):
+        """Helper method to get token IDs for a list of tokens."""
+        tokenizer = AutoTokenizer.from_pretrained(
+            TEST_MODEL_NAME, trust_remote_code=True
+        )
+        try:
+            label_token_ids = []
+            for token in tokens:
+                encoding = tokenizer.encode_plus(token, add_special_tokens=False)
+                token_ids = encoding["input_ids"]
+                label_token_ids.append(token_ids[0])
+            return label_token_ids
+        finally:
+            del tokenizer
+
+    def _compare_scores(self, hf_scores, sglang_scores, label_token_ids, case_name=""):
+        """Helper method to compare scores between HF and SGLang using relative tolerance."""
+        self.assertEqual(
+            len(hf_scores),
+            len(sglang_scores),
+            f"Score lengths don't match for {case_name}",
+        )
+
+        # Use a relative tolerance of 1%
+        TOLERANCE = 0.01
+
+        for hf_score_list, sglang_score_list in zip(hf_scores, sglang_scores):
+            self.assertEqual(
+                len(hf_score_list),
+                len(sglang_score_list),
+                f"Score list lengths don't match for {case_name}",
+            )
+
+            for hf_score, sglang_score in zip(hf_score_list, sglang_score_list):
+                diff = abs(hf_score - sglang_score)
+                self.assertLessEqual(
+                    diff,
+                    TOLERANCE,
+                    msg=f"Scores differ by {diff:.2%} ({case_name}): "
+                    f"HF={hf_score:.6f}, SGLang={sglang_score:.6f}",
+                )
+
+                self.assertGreaterEqual(
+                    sglang_score, 0, f"SGLang score {sglang_score:.6f} not in [0,1]"
+                )
+                self.assertLessEqual(
+                    sglang_score, 1, f"SGLang score {sglang_score:.6f} not in [0,1]"
+                )
+
+            self.assertAlmostEqual(
+                sum(sglang_score_list),
+                1.0,
+                places=6,
+                msg=f"SGLang scores don't sum to 1 ({case_name}): {sum(sglang_score_list):.6f}",
+            )
+
+    def test_score_consistency(self):
+        """Test that SGLang scoring matches direct HuggingFace model scoring."""
+        # Define test cases
+        test_cases = [
+            {
+                "name": "default case",
+                "query": "I pledge allegiance",
+                "items": ["", " to"],
+                "item_first": False,
+            },
+            {
+                "name": "item_first case",
+                "query": " is a city",
+                "items": ["Tokyo", "Japan"],
+                "item_first": True,
+            },
+        ]
+
+        # Common tokens to test for all cases
+        tokens = [" to", " the"]
+        label_token_ids = self._get_token_ids(tokens)
+
+        # Run each test case
+        for case in test_cases:
+            # Get scores from SGLang
+            sglang_scores = self.engine.score(
+                query=case["query"],
+                items=case["items"],
+                label_token_ids=label_token_ids,
+                apply_softmax=True,
+                item_first=case["item_first"],
+            )
+
+            # Get scores from HuggingFace using the same parameters
+            hf_scores = self.compute_hf_scores(
+                query=case["query"],
+                items=case["items"],
+                label_token_ids=label_token_ids,
+                apply_softmax=True,
+                item_first=case["item_first"],
+            )
+
+            # Compare scores
+            self._compare_scores(
+                hf_scores, sglang_scores, label_token_ids, case["name"]
+            )
+
+    def test_score_batch_handling(self):
+        """Test that batch scoring works correctly."""
+        # Test with different batch sizes
+        batch_sizes = [1, 2, 4, 8]
+        label_token_ids = [1, 2, 3]
+
+        for batch_size in batch_sizes:
+            texts = [f"test {i}" for i in range(batch_size)]
+            scores = self.engine.score(
+                query="The test was",
+                items=texts,
+                label_token_ids=label_token_ids,
+                apply_softmax=True,
+            )
+
+            self.assertEqual(
+                len(scores),
+                batch_size,
+                f"Expected {batch_size} scores, got {len(scores)}",
+            )
+
+            # Verify each score list has the correct length
+            for score_list in scores:
+                self.assertEqual(
+                    len(score_list),
+                    len(label_token_ids),
+                    f"Score list length {len(score_list)} doesn't match label_token_ids length {len(label_token_ids)}",
+                )
+                self.assertTrue(
+                    all(isinstance(v, float) for v in score_list),
+                    "All scores should be floats",
+                )
+                self.assertAlmostEqual(
+                    1.0, sum(score_list), 6, "Scores should sum to 1"
+                )
+
+    def test_score_request_construction(self):
+        """Test that scoring requests are constructed to avoid decode phase."""
+        from unittest.mock import patch
+
+        # Capture the internal request to verify optimization
+        captured_requests = []
+        original_gen = self.engine.tokenizer_manager.generate_request
+
+        async def mock_generate_request(req, request=None):
+            captured_requests.append(req)
+            async for result in original_gen(req, request):
+                yield result
+
+        # Patch the generate_request method
+        with patch.object(
+            self.engine.tokenizer_manager,
+            "generate_request",
+            side_effect=mock_generate_request,
+        ):
+            # Run a scoring request
+            query = "What is the capital of"
+            items = ["France", "Germany"]
+            label_token_ids = [1, 2, 3]
+
+            scores = self.engine.score(
+                query=query,
+                items=items,
+                label_token_ids=label_token_ids,
+                apply_softmax=True,
+            )
+
+            # Verify we got results
+            self.assertEqual(len(scores), len(items))
+
+            # Verify the captured request has decode-avoiding properties
+            self.assertEqual(len(captured_requests), 1)
+            request = captured_requests[0]
+
+            # Key assertions for decode phase avoidance:
+            # 1. max_new_tokens should be 0 (prevents token generation)
+            # Handle both single and batch request cases
+            if isinstance(request.sampling_params, dict):
+                max_new_tokens = request.sampling_params.get("max_new_tokens", 0)
+            elif isinstance(request.sampling_params, list):
+                # For batch requests, check the first item
+                max_new_tokens = request.sampling_params[0].get("max_new_tokens", 0)
+            else:
+                max_new_tokens = getattr(request.sampling_params, "max_new_tokens", 0)
+
+            self.assertEqual(
+                max_new_tokens, 0, "max_new_tokens should be 0 to avoid decode phase"
+            )
+
+            # 2. Should have token_ids_logprob for scoring
+            # Handle both single and batch request cases
+            if (
+                isinstance(request.token_ids_logprob, list)
+                and len(request.token_ids_logprob) > 0
+                and isinstance(request.token_ids_logprob[0], list)
+            ):
+                # Batch case: token_ids_logprob is a list of lists
+                # Each item in the batch should have the same label_token_ids
+                for item_token_ids in request.token_ids_logprob:
+                    self.assertEqual(
+                        item_token_ids,
+                        label_token_ids,
+                        "Each batch item should have label_token_ids for scoring",
+                    )
+            else:
+                # Single request case
+                self.assertEqual(
+                    request.token_ids_logprob,
+                    label_token_ids,
+                    "Should have label_token_ids for scoring",
+                )
+
+            # 3. Should request logprobs but not stream
+            self.assertTrue(
+                request.return_logprob, "Should request logprobs for scoring"
+            )
+            self.assertFalse(request.stream, "Scoring requests should not stream")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_server_args.py b/test/srt/test_server_args.py
new file mode 100644
index 000000000..6096bc13b
--- /dev/null
+++ b/test/srt/test_server_args.py
@@ -0,0 +1,273 @@
+import json
+import unittest
+from unittest.mock import MagicMock, patch
+
+from sglang.srt.server_args import PortArgs, prepare_server_args
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestPrepareServerArgs(CustomTestCase):
+    def test_prepare_server_args(self):
+        server_args = prepare_server_args(
+            [
+                "--model-path",
+                "meta-llama/Meta-Llama-3.1-8B-Instruct",
+                "--json-model-override-args",
+                '{"rope_scaling": {"factor": 2.0, "rope_type": "linear"}}',
+            ]
+        )
+        self.assertEqual(
+            server_args.model_path, "meta-llama/Meta-Llama-3.1-8B-Instruct"
+        )
+        self.assertEqual(
+            json.loads(server_args.json_model_override_args),
+            {"rope_scaling": {"factor": 2.0, "rope_type": "linear"}},
+        )
+
+
+class TestPortArgs(unittest.TestCase):
+    @patch("sglang.srt.server_args.is_port_available")
+    @patch("sglang.srt.server_args.tempfile.NamedTemporaryFile")
+    def test_init_new_standard_case(self, mock_temp_file, mock_is_port_available):
+        mock_is_port_available.return_value = True
+        mock_temp_file.return_value.name = "temp_file"
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+        server_args.enable_dp_attention = False
+
+        port_args = PortArgs.init_new(server_args)
+
+        self.assertTrue(port_args.tokenizer_ipc_name.startswith("ipc://"))
+        self.assertTrue(port_args.scheduler_input_ipc_name.startswith("ipc://"))
+        self.assertTrue(port_args.detokenizer_ipc_name.startswith("ipc://"))
+        self.assertIsInstance(port_args.nccl_port, int)
+
+    @patch("sglang.srt.server_args.is_port_available")
+    def test_init_new_with_single_node_dp_attention(self, mock_is_port_available):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 1
+        server_args.dist_init_addr = None
+
+        port_args = PortArgs.init_new(server_args)
+
+        self.assertTrue(port_args.tokenizer_ipc_name.startswith("tcp://127.0.0.1:"))
+        self.assertTrue(
+            port_args.scheduler_input_ipc_name.startswith("tcp://127.0.0.1:")
+        )
+        self.assertTrue(port_args.detokenizer_ipc_name.startswith("tcp://127.0.0.1:"))
+        self.assertIsInstance(port_args.nccl_port, int)
+
+    @patch("sglang.srt.server_args.is_port_available")
+    def test_init_new_with_dp_rank(self, mock_is_port_available):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 1
+        server_args.dist_init_addr = "192.168.1.1:25000"
+
+        port_args = PortArgs.init_new(server_args, dp_rank=2)
+
+        self.assertTrue(port_args.scheduler_input_ipc_name.endswith(":25008"))
+
+        self.assertTrue(port_args.tokenizer_ipc_name.startswith("tcp://192.168.1.1:"))
+        self.assertTrue(port_args.detokenizer_ipc_name.startswith("tcp://192.168.1.1:"))
+        self.assertIsInstance(port_args.nccl_port, int)
+
+    @patch("sglang.srt.server_args.is_port_available")
+    def test_init_new_with_ipv4_address(self, mock_is_port_available):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "192.168.1.1:25000"
+
+        port_args = PortArgs.init_new(server_args)
+
+        self.assertTrue(port_args.tokenizer_ipc_name.startswith("tcp://192.168.1.1:"))
+        self.assertTrue(
+            port_args.scheduler_input_ipc_name.startswith("tcp://192.168.1.1:")
+        )
+        self.assertTrue(port_args.detokenizer_ipc_name.startswith("tcp://192.168.1.1:"))
+        self.assertIsInstance(port_args.nccl_port, int)
+
+    @patch("sglang.srt.server_args.is_port_available")
+    def test_init_new_with_malformed_ipv4_address(self, mock_is_port_available):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "192.168.1.1"
+
+        with self.assertRaises(AssertionError) as context:
+            PortArgs.init_new(server_args)
+
+        self.assertIn(
+            "please provide --dist-init-addr as host:port", str(context.exception)
+        )
+
+    @patch("sglang.srt.server_args.is_port_available")
+    def test_init_new_with_malformed_ipv4_address_invalid_port(
+        self, mock_is_port_available
+    ):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "192.168.1.1:abc"
+
+        with self.assertRaises(ValueError) as context:
+            PortArgs.init_new(server_args)
+
+    @patch("sglang.srt.server_args.is_port_available")
+    @patch("sglang.srt.server_args.is_valid_ipv6_address", return_value=True)
+    def test_init_new_with_ipv6_address(
+        self, mock_is_valid_ipv6, mock_is_port_available
+    ):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "[2001:db8::1]:25000"
+
+        port_args = PortArgs.init_new(server_args)
+
+        self.assertTrue(port_args.tokenizer_ipc_name.startswith("tcp://[2001:db8::1]:"))
+        self.assertTrue(
+            port_args.scheduler_input_ipc_name.startswith("tcp://[2001:db8::1]:")
+        )
+        self.assertTrue(
+            port_args.detokenizer_ipc_name.startswith("tcp://[2001:db8::1]:")
+        )
+        self.assertIsInstance(port_args.nccl_port, int)
+
+    @patch("sglang.srt.server_args.is_port_available")
+    @patch("sglang.srt.server_args.is_valid_ipv6_address", return_value=False)
+    def test_init_new_with_invalid_ipv6_address(
+        self, mock_is_valid_ipv6, mock_is_port_available
+    ):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "[invalid-ipv6]:25000"
+
+        with self.assertRaises(ValueError) as context:
+            PortArgs.init_new(server_args)
+
+        self.assertIn("invalid IPv6 address", str(context.exception))
+
+    @patch("sglang.srt.server_args.is_port_available")
+    def test_init_new_with_malformed_ipv6_address_missing_bracket(
+        self, mock_is_port_available
+    ):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "[2001:db8::1:25000"
+
+        with self.assertRaises(ValueError) as context:
+            PortArgs.init_new(server_args)
+
+        self.assertIn("invalid IPv6 address format", str(context.exception))
+
+    @patch("sglang.srt.server_args.is_port_available")
+    @patch("sglang.srt.server_args.is_valid_ipv6_address", return_value=True)
+    def test_init_new_with_malformed_ipv6_address_missing_port(
+        self, mock_is_valid_ipv6, mock_is_port_available
+    ):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "[2001:db8::1]"
+
+        with self.assertRaises(ValueError) as context:
+            PortArgs.init_new(server_args)
+
+        self.assertIn(
+            "a port must be specified in IPv6 address", str(context.exception)
+        )
+
+    @patch("sglang.srt.server_args.is_port_available")
+    @patch("sglang.srt.server_args.is_valid_ipv6_address", return_value=True)
+    def test_init_new_with_malformed_ipv6_address_invalid_port(
+        self, mock_is_valid_ipv6, mock_is_port_available
+    ):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "[2001:db8::1]:abcde"
+
+        with self.assertRaises(ValueError) as context:
+            PortArgs.init_new(server_args)
+
+        self.assertIn("invalid port in IPv6 address", str(context.exception))
+
+    @patch("sglang.srt.server_args.is_port_available")
+    @patch("sglang.srt.server_args.is_valid_ipv6_address", return_value=True)
+    def test_init_new_with_malformed_ipv6_address_wrong_separator(
+        self, mock_is_valid_ipv6, mock_is_port_available
+    ):
+        mock_is_port_available.return_value = True
+
+        server_args = MagicMock()
+        server_args.port = 30000
+        server_args.nccl_port = None
+
+        server_args.enable_dp_attention = True
+        server_args.nnodes = 2
+        server_args.dist_init_addr = "[2001:db8::1]#25000"
+
+        with self.assertRaises(ValueError) as context:
+            PortArgs.init_new(server_args)
+
+        self.assertIn("expected ':' after ']'", str(context.exception))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_session_control.py b/test/srt/test_session_control.py
new file mode 100644
index 000000000..4b0da75dc
--- /dev/null
+++ b/test/srt/test_session_control.py
@@ -0,0 +1,786 @@
+"""
+Usage:
+python3 -m unittest test_session_control.TestSessionControl.test_session_control
+python3 -m unittest test_session_control.TestSessionControl.test_session_control_with_branching
+python3 -m unittest test_session_control.TestSessionControl.test_session_control_backtrack_with_abort
+python3 -m unittest test_session_control.TestSessionControlVision.test_session_control
+"""
+
+import asyncio
+import json
+import unittest
+
+import aiohttp
+import requests
+
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+def remove_prefix(text: str, prefix: str) -> str:
+    return text[len(prefix) :] if text.startswith(prefix) else text
+
+
+class TestSessionControl(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--attention-backend",
+                "flashinfer",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_session_control(self, gen_len=12):
+        chunks = [
+            "Let me tell you something about France.",
+            "The capital of France is",
+            "The population of the city is",
+            "A brief history about that city is",
+        ]
+        tokenizer = get_tokenizer(self.model)
+        chunks_ids = [tokenizer.encode(x) for x in chunks]
+        for i in range(1, len(chunks_ids)):
+            if chunks_ids[i][0] == tokenizer.bos_token_id:
+                chunks_ids[i] = chunks_ids[i][1:]
+
+        # 1. using session control
+        requests.post(self.base_url + "/flush_cache")
+        session_id = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000},
+        ).json()
+        rid = None
+
+        # open an existing session, should get session_id as None
+        ret = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000, "session_id": session_id},
+        )
+        self.assertNotEqual(ret.status_code, 200)
+
+        first_rid = None
+        outputs_from_session = []
+        logprobs_from_session = []
+        cur_logprob_start_len = 0
+        for i, chunk_ids in enumerate(chunks_ids):
+            max_new_tokens = gen_len if i > 0 else 1  # prefill only for the first chunk
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "input_ids": chunk_ids,
+                    "session_params": {
+                        "id": session_id,
+                        "rid": rid,
+                        "offset": -1,
+                        "replace": True,
+                    },
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": max_new_tokens,
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                    "return_logprob": True,
+                    "logprob_start_len": cur_logprob_start_len - 1,
+                },
+            ).json()
+            rid = response["meta_info"]["id"]
+            if i == 0:
+                first_rid = rid
+            if i > 0:
+                outputs_from_session.append(response["text"])
+                logprobs_from_session.extend(
+                    [
+                        round(sublist[0], 2)
+                        for sublist in response["meta_info"]["output_token_logprobs"]
+                    ]
+                )
+            cur_logprob_start_len += len(chunk_ids) + max_new_tokens
+
+        # query with a logprob_start_len longer than the request, should see error
+        ret = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": chunk_ids,
+                "session_params": {
+                    "id": session_id,
+                    "rid": rid,
+                    "offset": -1,
+                    "replace": True,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "return_logprob": True,
+                "logprob_start_len": cur_logprob_start_len + len(chunk_ids),
+            },
+        )
+        self.assertNotEqual(ret.status_code, 200)
+
+        # backtrack to the first request and regenerate
+        cur_logprob_start_len = 0
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": chunks_ids[-1],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": -1,
+                    "replace": True,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "return_logprob": True,
+                "logprob_start_len": cur_logprob_start_len,
+            },
+        ).json()
+        outputs_from_session.append(response["text"])
+        logprobs_from_session.extend(
+            [
+                round(sublist[0], 2)
+                for sublist in response["meta_info"]["output_token_logprobs"]
+            ]
+        )
+
+        # query with a non-existing rid (the last one should be disappeared because of backtrack), should see abort
+        ret = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": chunks_ids[-1],
+                "session_params": {
+                    "id": session_id,
+                    "rid": rid,
+                    "offset": -1,
+                    "replace": True,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "return_logprob": True,
+            },
+        )
+        self.assertNotEqual(ret.status_code, 200)
+
+        ret = requests.post(
+            self.base_url + "/close_session",
+            json={"session_id": session_id},
+        )
+        self.assertEqual(ret.status_code, 200)
+
+        # send a request to a closed session, should see abort
+        ret = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": chunks_ids[-1],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": -1,
+                    "replace": True,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "return_logprob": True,
+            },
+        )
+        self.assertNotEqual(ret.status_code, 200)
+
+        # 2. not use session control
+        requests.post(self.base_url + "/flush_cache")
+
+        input_ids_first_req = None
+        input_ids = []
+        outputs_normal = []
+        logprobs_normal = []
+        for i, chunk_ids in enumerate(chunks_ids):
+            input_ids += chunk_ids
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "input_ids": input_ids,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": (
+                            gen_len if i > 0 else 1
+                        ),  # prefill only for the first chunk
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                    "return_logprob": True,
+                },
+            ).json()
+            if i > 0:
+                output_ids = tokenizer.encode(response["text"])
+                if output_ids[0] == tokenizer.bos_token_id:
+                    output_ids = output_ids[1:]
+                input_ids += output_ids[:-1]
+                outputs_normal.append(response["text"])
+                logprobs_normal.extend(
+                    [
+                        round(sublist[0], 2)
+                        for sublist in response["meta_info"]["output_token_logprobs"]
+                    ]
+                )
+            if i == 0:
+                input_ids_first_req = input_ids.copy()
+
+        input_ids_first_req += chunks_ids[-1]
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": input_ids_first_req,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "return_logprob": True,
+            },
+        ).json()
+        outputs_normal.append(response["text"])
+        logprobs_normal.extend(
+            [
+                round(sublist[0], 2)
+                for sublist in response["meta_info"]["output_token_logprobs"]
+            ]
+        )
+
+        print("outputs from chunked queries with session control:")
+        print(outputs_from_session)
+        print("outputs from normal queries:")
+        print(outputs_normal)
+        self.assertEqual(outputs_from_session, outputs_normal)
+        print("logprobs from chunked queries with session control:")
+        print(logprobs_from_session)
+        print("logprobs from normal queries:")
+        print(logprobs_normal)
+        assert len(logprobs_from_session) == len(
+            logprobs_normal
+        ), "logprobs must have equal length"
+        for a, b in zip(logprobs_from_session, logprobs_normal):
+            assert abs(a - b) <= 0.15, f"logprobs {a} and {b} differ by more than 0.15"
+
+    async def async_generate(self, payload):
+        url = self.base_url + "/generate"
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url=url, json=payload) as response:
+                assert response.status == 200
+                async for chunk_bytes in response.content:
+                    chunk_bytes = chunk_bytes.strip()
+                    if not chunk_bytes:
+                        continue
+                    chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                    if chunk == "[DONE]":
+                        yield "", None, ""
+                    else:
+                        data = json.loads(chunk)
+                        finish_reason = (
+                            data["meta_info"]["finish_reason"]["type"]
+                            if data["meta_info"]["finish_reason"]
+                            else ""
+                        )
+                        yield data["text"], data["meta_info"]["id"], finish_reason
+
+    async def run_session_control_backtrack_with_abort(self, replace):
+        chunks = [
+            "Let me tell you something about France.",
+            "The capital of France is",
+        ]
+        tokenizer = get_tokenizer(self.model)
+        chunks_ids = [tokenizer.encode(x) for x in chunks]
+        for i in range(1, len(chunks_ids)):
+            if chunks_ids[i][0] == tokenizer.bos_token_id:
+                chunks_ids[i] = chunks_ids[i][1:]
+
+        # 1. using session control
+        requests.post(self.base_url + "/flush_cache")
+        session_id = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000},
+        ).json()
+        rid = None
+
+        payload = {
+            "input_ids": chunks_ids[0],
+            "session_params": {
+                "id": session_id,
+                "rid": rid,
+                "offset": -1,
+                "replace": True,
+            },
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": 100,
+                "no_stop_trim": True,
+                "skip_special_tokens": False,
+                "ignore_eos": True,
+            },
+            "stream": True,
+        }
+        gen_so_far = ""
+        finish_reason = ""
+        second_output = ""
+        async for chunk, rid, finish_reason_chunk in self.async_generate(payload):
+            gen_so_far += chunk
+            if finish_reason == "":
+                finish_reason = finish_reason_chunk
+            if len(gen_so_far) > 50 and second_output == "":
+                payload2 = {
+                    "input_ids": chunks_ids[1],
+                    "session_params": {
+                        "id": session_id,
+                        "rid": rid,
+                        "offset": 50,
+                        "replace": replace,
+                    },
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 32,
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                    "stream": False,
+                    "stream_output": True,
+                }
+                response = requests.post(
+                    url=self.base_url + "/generate", json=payload2
+                ).json()
+                second_output = response["text"]
+        if replace:
+            assert finish_reason == "abort"
+        print("first request output:")
+        print(gen_so_far)
+        print("second request output:")
+        print(second_output)
+
+        # close the session
+        ret = requests.post(
+            self.base_url + "/close_session",
+            json={"session_id": session_id},
+        )
+        assert ret.status_code == 200
+
+        if not replace:
+            assert response["meta_info"]["finish_reason"]["type"] == "abort"
+        else:
+            # 2. not using session control
+            requests.post(self.base_url + "/flush_cache")
+            output_ids = tokenizer.encode(gen_so_far)
+            if output_ids[0] == tokenizer.bos_token_id:
+                output_ids = output_ids[1:]
+            input_ids = chunks_ids[0] + output_ids
+            input_ids = input_ids[:50] + chunks_ids[1]
+            payload = {
+                "input_ids": input_ids,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 32,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "stream": False,
+                "stream_output": True,
+            }
+            response = requests.post(
+                url=self.base_url + "/generate", json=payload
+            ).json()
+            output_no_session = response["text"]
+            print("second request output without session:")
+            print(output_no_session)
+            assert (
+                second_output == output_no_session
+            ), f"second_output: {second_output}, output_no_session: {output_no_session}"
+
+    @unittest.skip("broken")
+    def test_session_control_backtrack_with_abort(self):
+        asyncio.run(self.run_session_control_backtrack_with_abort(replace=True))
+        asyncio.run(self.run_session_control_backtrack_with_abort(replace=False))
+
+    def run_session_control_with_branching(
+        self, root_prompt, chunks_per_step, gen_len=16
+    ):
+        for x in chunks_per_step:
+            assert len(x) == len(chunks_per_step[0])
+
+        # 1. using session control
+        requests.post(self.base_url + "/flush_cache")
+        session_id = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000},
+        ).json()
+
+        outputs_from_session = []
+        # send the root prompt
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": root_prompt,
+                "session_params": {
+                    "id": session_id,
+                    "rid": None,
+                    "offset": 0,
+                    "replace": False,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        rid_per_branch = [response["meta_info"]["id"]] * len(chunks_per_step[0])
+        outputs_from_session.append(response["text"])
+
+        # send the prompts in branches
+        for chunks_for_branches in chunks_per_step:
+            for j, chunk in enumerate(chunks_for_branches):
+                response = requests.post(
+                    self.base_url + "/generate",
+                    json={
+                        "text": chunk,
+                        "session_params": {
+                            "id": session_id,
+                            "rid": rid_per_branch[j],
+                            "offset": 0,
+                            "replace": False,
+                        },
+                        "sampling_params": {
+                            "temperature": 0,
+                            "max_new_tokens": gen_len,
+                            "no_stop_trim": True,
+                            "skip_special_tokens": False,
+                        },
+                    },
+                ).json()
+                rid = response["meta_info"]["id"]
+                rid_per_branch[j] = rid
+                outputs_from_session.append(response["text"])
+
+        # close the session
+        ret = requests.post(
+            self.base_url + "/close_session",
+            json={"session_id": session_id},
+        )
+        assert ret.status_code == 200
+
+        # 2. not use session control
+        requests.post(self.base_url + "/flush_cache")
+
+        outputs_normal = []
+        input_texts = [root_prompt] * len(chunks_per_step[0])
+        # send the root prompt
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": root_prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        outputs_normal.append(response["text"])
+        input_texts = [x + response["text"] for x in input_texts]
+
+        # send the prompts in branches
+        for chunks_for_branches in chunks_per_step:
+            for j, chunk in enumerate(chunks_for_branches):
+                input_texts[j] += chunk
+                response = requests.post(
+                    self.base_url + "/generate",
+                    json={
+                        "text": input_texts[j],
+                        "sampling_params": {
+                            "temperature": 0,
+                            "max_new_tokens": gen_len,
+                            "no_stop_trim": True,
+                            "skip_special_tokens": False,
+                        },
+                    },
+                ).json()
+                outputs_normal.append(response["text"])
+                input_texts[j] += response["text"]
+
+        print("====== outputs from chunked queries with session control: =======")
+        print(outputs_from_session)
+        print("====== outputs from normal queries: =======")
+        print(outputs_normal)
+        assert (
+            outputs_from_session == outputs_normal
+        ), f"outputs_from_session: {outputs_from_session}, outputs_normal: {outputs_normal}"
+
+    def test_session_control_with_branching(self):
+        root_prompt = "First, let me explain in one sentence about AI"
+        chunks_per_step = [
+            [
+                "Then, briefly, the positive side of AI is",
+                "But, briefly, AI could be harmful to human",
+            ],
+            ["For example", "For example"],
+        ]
+        self.run_session_control_with_branching(
+            root_prompt=root_prompt, chunks_per_step=chunks_per_step, gen_len=8
+        )
+
+        root_prompt = "I have three apples."
+        chunks_per_step = [
+            ["I then give one apple to my friend", "My friend give me another apple."],
+            ["I still have", "I now have"],
+        ]
+        self.run_session_control_with_branching(
+            root_prompt=root_prompt, chunks_per_step=chunks_per_step, gen_len=8
+        )
+
+
+@unittest.skip("broken")
+class TestSessionControlVision(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmms-lab/llava-onevision-qwen2-7b-ov"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            # other_args={"--disable-radix"},
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_session_control(self):
+        text_chunks = [
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n",
+            "<|im_start|>user\n<image>\nDescribe this image in a very short sentence.<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\n<image>\nIs this image same with one of the previous images?<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\n<image>\nIs this image same with one of the previous images?<|im_end|>\n<|im_start|>assistant\n",
+            "<|im_start|>user\nDescribe this image in a very short sentence.<|im_end|>\nassistant:",
+        ]
+        image_chunks = [
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/test/lang/example_image.png",
+            "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png",
+        ]
+
+        self.assertEqual(
+            len(text_chunks), len(image_chunks) + 2
+        )  # the first and the last prompt does not contain images
+        tokenizer = get_tokenizer(self.model)
+        text_input_ids = [tokenizer.encode(x) for x in text_chunks]
+        for i in range(1, len(text_input_ids)):
+            if text_input_ids[i][0] == tokenizer.bos_token_id:
+                text_input_ids[i] = text_input_ids[i][1:]
+        gen_len = 32
+
+        # 1. using session control
+        requests.post(self.base_url + "/flush_cache")
+        session_id = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000},
+        ).json()
+        rid = None
+
+        # open an existing session, should get session_id as None
+        ret = requests.post(
+            self.base_url + "/open_session",
+            json={"capacity_of_str_len": 1000, "session_id": session_id},
+        )
+        self.assertNotEqual(ret.status_code, 200)
+
+        first_rid = None
+        outputs_from_session = []
+        for i in range(len(text_input_ids[:-1])):
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "input_ids": text_input_ids[i],
+                    "image_data": image_chunks[i - 1] if i > 0 else None,
+                    "modalities": ["multi-images"],
+                    "session_params": {
+                        "id": session_id,
+                        "rid": rid,
+                        "offset": 0,
+                        "replace": True,
+                    },
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": (
+                            gen_len if i > 0 else 0
+                        ),  # prefill only for the first chunk
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                },
+            ).json()
+            rid = response["meta_info"]["id"]
+            if i == 0:
+                first_rid = rid
+            if i > 0:
+                outputs_from_session.append(response["text"])
+
+        # backtrack to the first request and regenerate
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": text_input_ids[-1],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": 0,
+                    "replace": True,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        outputs_from_session.append(response["text"])
+
+        # query with a non-existing rid (the last one should be disappeared because of backtrack), should see abort
+        ret = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": text_input_ids[-1],
+                "session_params": {
+                    "id": session_id,
+                    "rid": rid,
+                    "offset": 0,
+                    "replace": True,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        )
+        self.assertNotEqual(ret.status_code, 200)
+
+        ret = requests.post(
+            self.base_url + "/close_session",
+            json={"session_id": session_id},
+        )
+        self.assertEqual(ret.status_code, 200)
+
+        # send a request to a closed session, should see abort
+        ret = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": text_input_ids[-1],
+                "session_params": {
+                    "id": session_id,
+                    "rid": first_rid,
+                    "offset": 0,
+                    "replace": True,
+                },
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        )
+        self.assertNotEqual(ret.status_code, 200)
+
+        # 2. not use session control
+        requests.post(self.base_url + "/flush_cache")
+
+        input_ids_first_req = None
+        input_ids = []
+        outputs_normal = []
+        for i in range(len(text_input_ids[:-1])):
+            input_ids += text_input_ids[i]
+            image_data = image_chunks[:i] if i > 0 else None
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "input_ids": input_ids,
+                    "image_data": image_data,
+                    "modalities": ["multi-images"],
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": (
+                            gen_len if i > 0 else 0
+                        ),  # prefill only for the first chunk
+                        "no_stop_trim": True,
+                        "skip_special_tokens": False,
+                    },
+                },
+            ).json()
+            if i > 0:
+                output_ids = tokenizer.encode(response["text"])
+                if output_ids[0] == tokenizer.bos_token_id:
+                    output_ids = output_ids[1:]
+                input_ids += output_ids
+                outputs_normal.append(response["text"])
+            if i == 0:
+                input_ids_first_req = input_ids.copy()
+
+        input_ids_first_req += text_input_ids[-1]
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "input_ids": input_ids_first_req,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": gen_len,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+            },
+        ).json()
+        outputs_normal.append(response["text"])
+
+        print("outputs from chunked queries with session control:")
+        print(outputs_from_session)
+        print("outputs from normal queries:")
+        print(outputs_normal)
+        assert (
+            outputs_from_session == outputs_normal
+        ), f"outputs_from_session: {outputs_from_session}, outputs_normal: {outputs_normal}"
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py
new file mode 100644
index 000000000..02b1b40c6
--- /dev/null
+++ b/test/srt/test_skip_tokenizer_init.py
@@ -0,0 +1,246 @@
+"""
+python3 -m unittest test_skip_tokenizer_init.TestSkipTokenizerInit.test_parallel_sample
+python3 -m unittest test_skip_tokenizer_init.TestSkipTokenizerInit.run_decode_stream
+"""
+
+import json
+import unittest
+from io import BytesIO
+
+import requests
+from PIL import Image
+from transformers import AutoProcessor, AutoTokenizer
+
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_IMAGE_URL,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestSkipTokenizerInit(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--skip-tokenizer-init", "--stream-output"],
+        )
+        cls.eos_token_id = [119690]
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST, use_fast=False
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(
+        self,
+        prompt_text="The capital of France is",
+        max_new_tokens=32,
+        return_logprob=False,
+        top_logprobs_num=0,
+        n=1,
+    ):
+        input_ids = self.get_input_ids(prompt_text)
+
+        request = self.get_request_json(
+            input_ids=input_ids,
+            return_logprob=return_logprob,
+            top_logprobs_num=top_logprobs_num,
+            max_new_tokens=max_new_tokens,
+            stream=False,
+            n=n,
+        )
+        response = requests.post(
+            self.base_url + "/generate",
+            json=request,
+        )
+        ret = response.json()
+        print(json.dumps(ret, indent=2))
+
+        def assert_one_item(item):
+            if item["meta_info"]["finish_reason"]["type"] == "stop":
+                self.assertEqual(
+                    item["meta_info"]["finish_reason"]["matched"],
+                    self.tokenizer.eos_token_id,
+                )
+            elif item["meta_info"]["finish_reason"]["type"] == "length":
+                self.assertEqual(
+                    len(item["output_ids"]), item["meta_info"]["completion_tokens"]
+                )
+                self.assertEqual(len(item["output_ids"]), max_new_tokens)
+                self.assertEqual(item["meta_info"]["prompt_tokens"], len(input_ids))
+
+                if return_logprob:
+                    num_input_logprobs = len(input_ids) - request["logprob_start_len"]
+                    if num_input_logprobs > len(input_ids):
+                        num_input_logprobs -= len(input_ids)
+                    self.assertEqual(
+                        len(item["meta_info"]["input_token_logprobs"]),
+                        num_input_logprobs,
+                        f'{len(item["meta_info"]["input_token_logprobs"])} mismatch with {len(input_ids)}',
+                    )
+                    self.assertEqual(
+                        len(item["meta_info"]["output_token_logprobs"]),
+                        max_new_tokens,
+                    )
+
+        # Determine whether to assert a single item or multiple items based on n
+        if n == 1:
+            assert_one_item(ret)
+        else:
+            self.assertEqual(len(ret), n)
+            for i in range(n):
+                assert_one_item(ret[i])
+
+        print("=" * 100)
+
+    def run_decode_stream(self, return_logprob=False, top_logprobs_num=0, n=1):
+        max_new_tokens = 32
+        input_ids = self.get_input_ids("The capital of France is")
+        requests.post(self.base_url + "/flush_cache")
+        response = requests.post(
+            self.base_url + "/generate",
+            json=self.get_request_json(
+                input_ids=input_ids,
+                max_new_tokens=max_new_tokens,
+                return_logprob=return_logprob,
+                top_logprobs_num=top_logprobs_num,
+                stream=False,
+                n=n,
+            ),
+        )
+        ret = response.json()
+        print(json.dumps(ret))
+        output_ids = ret["output_ids"]
+        print("output from non-streaming request:")
+        print(output_ids)
+        print(self.tokenizer.decode(output_ids, skip_special_tokens=True))
+
+        requests.post(self.base_url + "/flush_cache")
+        response_stream = requests.post(
+            self.base_url + "/generate",
+            json=self.get_request_json(
+                input_ids=input_ids,
+                return_logprob=return_logprob,
+                top_logprobs_num=top_logprobs_num,
+                stream=True,
+                n=n,
+            ),
+        )
+
+        response_stream_json = []
+        for line in response_stream.iter_lines():
+            print(line)
+            if line.startswith(b"data: ") and line[6:] != b"[DONE]":
+                response_stream_json.append(json.loads(line[6:]))
+        out_stream_ids = []
+        for x in response_stream_json:
+            out_stream_ids += x["output_ids"]
+        print("output from streaming request:")
+        print(out_stream_ids)
+        print(self.tokenizer.decode(out_stream_ids, skip_special_tokens=True))
+
+        assert output_ids == out_stream_ids
+
+    def test_simple_decode(self):
+        self.run_decode()
+
+    def test_parallel_sample(self):
+        self.run_decode(n=3)
+
+    def test_logprob(self):
+        for top_logprobs_num in [0, 3]:
+            self.run_decode(return_logprob=True, top_logprobs_num=top_logprobs_num)
+
+    def test_eos_behavior(self):
+        self.run_decode(max_new_tokens=256)
+
+    def test_simple_decode_stream(self):
+        self.run_decode_stream()
+
+    def get_input_ids(self, prompt_text) -> list[int]:
+        input_ids = self.tokenizer(prompt_text, return_tensors="pt")["input_ids"][
+            0
+        ].tolist()
+        return input_ids
+
+    def get_request_json(
+        self,
+        input_ids,
+        max_new_tokens=32,
+        return_logprob=False,
+        top_logprobs_num=0,
+        stream=False,
+        n=1,
+    ):
+        return {
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": 0 if n == 1 else 0.5,
+                "max_new_tokens": max_new_tokens,
+                "n": n,
+                "stop_token_ids": self.eos_token_id,
+            },
+            "stream": stream,
+            "return_logprob": return_logprob,
+            "top_logprobs_num": top_logprobs_num,
+            "logprob_start_len": 0,
+        }
+
+
+class TestSkipTokenizerInitVLM(TestSkipTokenizerInit):
+    @classmethod
+    def setUpClass(cls):
+        cls.image_url = DEFAULT_IMAGE_URL
+        response = requests.get(cls.image_url)
+        cls.image = Image.open(BytesIO(response.content))
+        cls.model = DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model, use_fast=False)
+        cls.processor = AutoProcessor.from_pretrained(cls.model, trust_remote_code=True)
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--skip-tokenizer-init"],
+        )
+        cls.eos_token_id = [cls.tokenizer.eos_token_id]
+
+    def get_input_ids(self, _prompt_text) -> list[int]:
+        chat_template = get_chat_template_by_model_path(self.model)
+        text = f"{chat_template.image_token}What is in this picture?"
+        inputs = self.processor(
+            text=[text],
+            images=[self.image],
+            return_tensors="pt",
+        )
+
+        return inputs.input_ids[0].tolist()
+
+    def get_request_json(self, *args, **kwargs):
+        ret = super().get_request_json(*args, **kwargs)
+        ret["image_data"] = [self.image_url]
+        ret["logprob_start_len"] = (
+            -1
+        )  # Do not try to calculate logprobs of image embeddings.
+        return ret
+
+    def test_simple_decode_stream(self):
+        # TODO mick
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py
new file mode 100644
index 000000000..089da355d
--- /dev/null
+++ b/test/srt/test_srt_endpoint.py
@@ -0,0 +1,640 @@
+"""
+python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_simple_decode
+python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_logprob_with_chunked_prefill
+"""
+
+import json
+import random
+import time
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Optional
+
+import numpy as np
+import requests
+
+from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+    run_logprob_check,
+)
+
+
+class TestSRTEndpoint(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=(
+                "--enable-custom-logit-processor",
+                "--mem-fraction-static",
+                "0.7",
+                "--cuda-graph-max-bs",
+                "8",
+            ),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_decode(
+        self,
+        return_logprob=False,
+        top_logprobs_num=0,
+        return_text=False,
+        n=1,
+        stream=False,
+        batch=False,
+    ):
+        if batch:
+            text = ["The capital of France is"]
+        else:
+            text = "The capital of France is"
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": text,
+                "sampling_params": {
+                    "temperature": 0 if n == 1 else 0.5,
+                    "max_new_tokens": 16,
+                    "n": n,
+                },
+                "stream": stream,
+                "return_logprob": return_logprob,
+                "top_logprobs_num": top_logprobs_num,
+                "return_text_in_logprobs": return_text,
+                "logprob_start_len": 0,
+            },
+        )
+        if not stream:
+            response_json = response.json()
+        else:
+            response_json = []
+            for line in response.iter_lines():
+                if line.startswith(b"data: ") and line[6:] != b"[DONE]":
+                    response_json.append(json.loads(line[6:]))
+
+        print(json.dumps(response_json, indent=2))
+        print("=" * 100)
+
+    def test_simple_decode(self):
+        self.run_decode()
+
+    def test_simple_decode_batch(self):
+        self.run_decode(batch=True)
+
+    def test_parallel_sample(self):
+        self.run_decode(n=3)
+
+    def test_parallel_sample_stream(self):
+        self.run_decode(n=3, stream=True)
+
+    def test_logprob(self):
+        self.run_decode(
+            return_logprob=True,
+            top_logprobs_num=5,
+            return_text=True,
+        )
+
+    def test_logprob_start_len(self):
+        logprob_start_len = 4
+        new_tokens = 4
+        prompts = [
+            "I have a very good idea on",
+            "Today is a sunndy day and",
+        ]
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": prompts,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": new_tokens,
+                },
+                "return_logprob": True,
+                "top_logprobs_num": 5,
+                "return_text_in_logprobs": True,
+                "logprob_start_len": logprob_start_len,
+            },
+        )
+        response_json = response.json()
+        print(json.dumps(response_json, indent=2))
+
+        for i, res in enumerate(response_json):
+            self.assertEqual(
+                res["meta_info"]["prompt_tokens"],
+                logprob_start_len + len(res["meta_info"]["input_token_logprobs"]),
+            )
+            assert prompts[i].endswith(
+                "".join([x[-1] for x in res["meta_info"]["input_token_logprobs"]])
+            )
+
+            self.assertEqual(res["meta_info"]["completion_tokens"], new_tokens)
+            self.assertEqual(len(res["meta_info"]["output_token_logprobs"]), new_tokens)
+            self.assertEqual(
+                res["text"],
+                "".join([x[-1] for x in res["meta_info"]["output_token_logprobs"]]),
+            )
+
+    def test_logprob_with_chunked_prefill(self):
+        """Test a long prompt that requests output logprobs will not hit OOM."""
+        new_tokens = 4
+        prompts = "I have a very good idea on this. " * 8000
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": prompts,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": new_tokens,
+                },
+                "return_logprob": True,
+                "logprob_start_len": -1,
+                "top_logprobs_num": 5,
+            },
+        )
+        response_json = response.json()
+        # print(json.dumps(response_json, indent=2))
+
+        res = response_json
+        self.assertEqual(res["meta_info"]["completion_tokens"], new_tokens)
+
+        # Test the number of tokens are correct
+        self.assertEqual(len(res["meta_info"]["output_token_logprobs"]), new_tokens)
+        self.assertEqual(len(res["meta_info"]["output_top_logprobs"]), new_tokens)
+
+        # Test the top-1 tokens are the same as output tokens (because temp = 0.0)
+        for i in range(new_tokens):
+            self.assertListEqual(
+                res["meta_info"]["output_token_logprobs"][i],
+                res["meta_info"]["output_top_logprobs"][i][0],
+            )
+            self.assertEqual(len(res["meta_info"]["output_top_logprobs"][i]), 5)
+
+    def test_logprob_match(self):
+        """Test the output logprobs are close to the input logprobs if we run a prefill again."""
+
+        def run_generate(
+            prompt, return_logprob=False, max_new_tokens=512, logprob_start_len=-1
+        ):
+
+            if isinstance(prompt, str):
+                prompt_kwargs = {"text": prompt}
+            else:
+                prompt_kwargs = {"input_ids": prompt}
+
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    **prompt_kwargs,
+                    "sampling_params": {
+                        "temperature": 1.0,
+                        "max_new_tokens": max_new_tokens,
+                        "ignore_eos": True,
+                    },
+                    "return_logprob": return_logprob,
+                    "return_text_in_logprobs": True,
+                    "logprob_start_len": logprob_start_len,
+                },
+            )
+            return response.json()
+
+        prompt = "I have a very good idea on how to"
+
+        gen = run_generate(prompt, return_logprob=True, logprob_start_len=0)
+        output_logprobs = np.array(
+            [x[0] for x in gen["meta_info"]["output_token_logprobs"]]
+        )
+        num_prompts_tokens = gen["meta_info"]["prompt_tokens"]
+
+        input_tokens = [x[1] for x in gen["meta_info"]["input_token_logprobs"]]
+        output_tokens = [x[1] for x in gen["meta_info"]["output_token_logprobs"]]
+
+        new_prompt = input_tokens + output_tokens
+        score = run_generate(
+            new_prompt, return_logprob=True, logprob_start_len=0, max_new_tokens=0
+        )
+        output_logprobs_score = np.array(
+            [
+                x[0]
+                for x in score["meta_info"]["input_token_logprobs"][num_prompts_tokens:]
+            ]
+        )
+
+        print(f"{output_logprobs[-10:]=}")
+        print(f"{output_logprobs_score[-10:]=}")
+
+        diff = np.abs(output_logprobs - output_logprobs_score)
+        max_diff = np.max(diff)
+        self.assertLess(max_diff, 0.35)
+
+    def test_logprob_mixed(self):
+        args = []
+        temperature = 0
+        # input_len, output_len, temperature, logprob_start_len, return_logprob, top_logprobs_num
+        for input_len in [1000, 5000, 10000, 50000]:
+            for output_len in [4, 8]:
+                for logprob_start_len in [0, 500, 2500, 5000, 25000]:
+                    for return_logprob in [True, False]:
+                        for top_logprobs_num in [0, 5]:
+
+                            if logprob_start_len >= input_len:
+                                continue
+
+                            args.append(
+                                (
+                                    input_len,
+                                    output_len,
+                                    temperature,
+                                    logprob_start_len,
+                                    return_logprob,
+                                    top_logprobs_num,
+                                )
+                            )
+
+        random.shuffle(args)
+
+        func = partial(run_logprob_check, self)
+        with ThreadPoolExecutor(8) as executor:
+            list(executor.map(func, args))
+
+    def test_logprob_grammar(self):
+        prompts = "Question: Is Paris the Capital of France? Answer:"
+        allowed_tokens = [" Yes", " No"]
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": prompts,
+                "sampling_params": {
+                    "temperature": 1.0,
+                    "max_new_tokens": 1,
+                    "regex": "( Yes| No)",
+                },
+                "return_logprob": True,
+                "top_logprobs_num": 5,  # The grammar constraint allows all prefix tokens so we need to use a larger top_k.
+                "return_text_in_logprobs": True,
+            },
+        )
+        response_json = response.json()
+        output_top_logprobs = response_json["meta_info"]["output_top_logprobs"][0]
+        print(f"{output_top_logprobs=}")
+
+        # Parse results
+        # This is because the grammar constraint allows all prefix tokens
+        logprobs = [None] * 2
+        for i in range(len(output_top_logprobs)):
+            try:
+                idx = allowed_tokens.index(output_top_logprobs[i][2])
+            except ValueError:
+                # Not found
+                continue
+            logprobs[idx] = output_top_logprobs[i][0]
+
+        self.assertTrue(all(x is not None for x in logprobs))
+
+    def run_custom_logit_processor(self, target_token_id: Optional[int] = None):
+        """Test custom logit processor with custom params.
+
+        If target_token_id is None, the custom logit processor won't be passed in.
+        """
+
+        custom_params = {"token_id": target_token_id}
+
+        class DeterministicLogitProcessor(CustomLogitProcessor):
+            """A dummy logit processor that changes the logits to always
+            sample the given token id.
+            """
+
+            def __call__(self, logits, custom_param_list):
+                assert logits.shape[0] == len(custom_param_list)
+                key = "token_id"
+
+                for i, param_dict in enumerate(custom_param_list):
+                    # Mask all other tokens
+                    logits[i, :] = -float("inf")
+                    # Assign highest probability to the specified token
+                    logits[i, param_dict[key]] = 0.0
+                return logits
+
+        prompts = "Question: Is Paris the Capital of France? Answer:"
+
+        # Base case json data to be posted to the server.
+        base_json = {
+            "text": prompts,
+            "sampling_params": {"temperature": 0.0},
+            "return_logprob": True,
+        }
+
+        # Custom json data with custom logit processor and params.
+        custom_json = base_json.copy()
+        # Only set the custom logit processor if target_token_id is not None.
+        if target_token_id is not None:
+            custom_json["custom_logit_processor"] = DeterministicLogitProcessor.to_str()
+            custom_json["sampling_params"]["custom_params"] = custom_params
+
+        custom_response = requests.post(
+            self.base_url + "/generate",
+            json=custom_json,
+        ).json()
+
+        output_token_logprobs = custom_response["meta_info"]["output_token_logprobs"]
+        sampled_tokens = [x[1] for x in output_token_logprobs]
+
+        # The logit processor should always sample the given token as the logits is deterministic.
+        if target_token_id is not None:
+            self.assertTrue(
+                all(x == custom_params["token_id"] for x in sampled_tokens),
+                # Print the detailed test case info if the test fails.
+                f"{target_token_id=}\n{sampled_tokens=}\n{custom_response=}",
+            )
+
+    def run_stateful_custom_logit_processor(
+        self, first_token_id: int | None, delay: int = 2
+    ):
+        """Test custom logit processor with custom params and state.
+
+        Should sample the first `delay` tokens normally, then output first_token_id and consecutive tokens after that.
+        If first_token_id is None, the custom logit processor won't be passed in.
+        """
+        custom_params = {"token_id": first_token_id, "delay": 2}
+
+        class DeterministicStatefulLogitProcessor(CustomLogitProcessor):
+            """A dummy logit processor that changes the logits to always
+            sample the given token id.
+            """
+
+            def __call__(self, logits, custom_param_list):
+                assert logits.shape[0] == len(custom_param_list)
+
+                for i, param_dict in enumerate(custom_param_list):
+                    if param_dict["delay"] > 0:
+                        param_dict["delay"] -= 1
+                        continue
+                    if param_dict["delay"] == 0:
+                        param_dict["delay"] -= 1
+                        force_token = param_dict["token_id"]
+                    else:
+                        output_ids = param_dict["__req__"].output_ids
+                        force_token = output_ids[-1] + 1
+                    # Mask all other tokens
+                    logits[i, :] = -float("inf")
+                    # Assign highest probability to the specified token
+                    logits[i, force_token] = 0.0
+                return logits
+
+        prompts = "Question: Is Paris the Capital of France? Answer:"
+
+        # Base case json data to be posted to the server.
+        base_json = {
+            "text": prompts,
+            "sampling_params": {"temperature": 0.0},
+            "return_logprob": True,
+        }
+
+        # Custom json data with custom logit processor and params.
+        custom_json = base_json.copy()
+        # Only set the custom logit processor if target_token_id is not None.
+        if first_token_id is not None:
+            custom_json["custom_logit_processor"] = (
+                DeterministicStatefulLogitProcessor().to_str()
+            )
+            custom_json["sampling_params"]["custom_params"] = custom_params
+
+        custom_response = requests.post(
+            self.base_url + "/generate",
+            json=custom_json,
+        ).json()
+
+        output_token_logprobs = custom_response["meta_info"]["output_token_logprobs"]
+        sampled_tokens = [x[1] for x in output_token_logprobs]
+        # The logit processor should always sample the given token as the logits is deterministic.
+        if first_token_id is not None:
+            self.assertTrue(
+                all(
+                    x == custom_params["token_id"] + k
+                    for k, x in enumerate(sampled_tokens[custom_params["delay"] :])
+                ),
+                # Print the detailed test case info if the test fails.
+                f"{first_token_id=}\n{sampled_tokens=}\n{custom_response=}",
+            )
+
+    def test_custom_logit_processor(self):
+        """Test custom logit processor with a single request."""
+        self.run_custom_logit_processor(target_token_id=5)
+
+    def test_custom_logit_processor_batch_mixed(self):
+        """Test a batch of requests mixed of requests with and without custom logit processor."""
+        target_token_ids = list(range(32)) + [None] * 16
+        random.shuffle(target_token_ids)
+        with ThreadPoolExecutor(len(target_token_ids)) as executor:
+            list(executor.map(self.run_custom_logit_processor, target_token_ids))
+
+    @unittest.skip("Skip this test because this feature has a bug. See comments below.")
+    def test_stateful_custom_logit_processor(self):
+        """Test custom logit processor with a single request."""
+
+        """
+        NOTE: This feature has a race condition bug.
+        This line https://github.com/sgl-project/sglang/blob/ef8ec07b2ce4c70c2a33ec5acda4ce529bc3cda4/test/srt/test_srt_endpoint.py#L395-L396 can be accessed by two concurrent threads at the same time. The access order is not guaranteed.
+        In sglang, we use two python threads to overlap the GPU computation and CPU scheduling.
+        Thread 1 (the CPU scheduling thread) will update the `param_dict["__req__"].output_ids`.
+        Thread 2 (the GPU computation thread) will call `DeterministicStatefulLogitProcessor` because sampling is considered as GPU computation.
+        We can fix this by moving the call of DeterministicStatefulLogitProcessor to the CPU scheduling thread.
+        """
+
+        self.run_stateful_custom_logit_processor(first_token_id=5)
+
+    @unittest.skip("Skip this test because this feature has a bug. See comments above.")
+    def test_stateful_custom_logit_processor_batch_mixed(self):
+        """Test a batch of requests mixed of requests with and without custom logit processor."""
+        target_token_ids = list(range(32)) + [None] * 16
+        random.shuffle(target_token_ids)
+        with ThreadPoolExecutor(len(target_token_ids)) as executor:
+            list(
+                executor.map(self.run_stateful_custom_logit_processor, target_token_ids)
+            )
+
+    def test_cache_tokens(self):
+        for _ in range(2):
+            time.sleep(1)
+            response = requests.post(self.base_url + "/flush_cache")
+            assert response.status_code == 200
+
+        def send_and_check_cached_tokens(input_ids):
+            response = requests.post(
+                self.base_url + "/generate",
+                json={
+                    "input_ids": list(input_ids),
+                    "sampling_params": {
+                        "max_new_tokens": 1,
+                    },
+                },
+            )
+            response_json = response.json()
+            return response_json["meta_info"]["cached_tokens"]
+
+        self.assertEqual(send_and_check_cached_tokens(range(0, 100)), 0)
+        self.assertEqual(send_and_check_cached_tokens(range(0, 10000)), 100)
+        self.assertEqual(send_and_check_cached_tokens(range(0, 10000)), 9999)
+        self.assertEqual(send_and_check_cached_tokens(range(0, 1000)), 999)
+        self.assertEqual(send_and_check_cached_tokens(range(0, 11000)), 10000)
+
+    def test_get_server_info(self):
+        response = requests.get(self.base_url + "/get_server_info")
+        response_json = response.json()
+
+        max_total_num_tokens = response_json["max_total_num_tokens"]
+        self.assertIsInstance(max_total_num_tokens, int)
+
+        version = response_json["version"]
+        self.assertIsInstance(version, str)
+
+    def test_logit_bias(self):
+        """Test that a very high logit bias forces sampling of a specific token."""
+        # Choose a token ID to bias (using 5 as an example)
+        target_token_id = 60704  # Paris for meta-llama/Llama-3.2-1B-Instruct, DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        logit_bias = {str(target_token_id): 100.0}  # Very high positive bias
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 1.0,  # Use high temperature to encourage exploration
+                    "max_new_tokens": 4,
+                    "logit_bias": logit_bias,
+                },
+                "return_logprob": True,
+            },
+        )
+        response_json = response.json()
+
+        # Extract the sampled token IDs from the output
+        output_token_logprobs = response_json["meta_info"]["output_token_logprobs"]
+        sampled_tokens = [x[1] for x in output_token_logprobs]
+
+        # Verify that all sampled tokens are the target token
+        self.assertTrue(
+            all(x == target_token_id for x in sampled_tokens),
+            f"Expected all tokens to be {target_token_id}, but got {sampled_tokens}",
+        )
+
+    def test_forbidden_token(self):
+        """Test that a forbidden token (very negative logit bias) doesn't appear in the output."""
+        # Choose a token ID to forbid (using 10 as an example)
+        forbidden_token_id = 23994  # rice for meta-llama/Llama-3.2-1B-Instruct, DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        logit_bias = {
+            str(forbidden_token_id): -100.0
+        }  # Very negative bias to forbid the token
+
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "Only output 'rice' exactly like this, in lowercase ONLY: rice",
+                "sampling_params": {
+                    "temperature": 1.0,  # Use high temperature to encourage diverse output
+                    "max_new_tokens": 50,  # Generate enough tokens to likely include numbers
+                    "logit_bias": logit_bias,
+                },
+                "return_logprob": True,
+            },
+        )
+        response_json = response.json()
+
+        # Extract the sampled token IDs from the output
+        output_token_logprobs = response_json["meta_info"]["output_token_logprobs"]
+        sampled_tokens = [x[1] for x in output_token_logprobs]
+
+        # Verify that the forbidden token doesn't appear in the output
+        self.assertNotIn(
+            forbidden_token_id,
+            sampled_tokens,
+            f"Expected forbidden token {forbidden_token_id} not to be present, but it was found",
+        )
+
+    def test_logit_bias_isolation(self):
+        """Test that logit_bias applied to one request doesn't affect other requests in batch."""
+        # Choose a token ID to bias in first request only
+        biased_token_id = 60704  # Paris for meta-llama/Llama-3.2-1B-Instruct, DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+        # Prepare batch requests - one with logit_bias and one without
+        requests_data = [
+            {
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 1.0,
+                    "max_new_tokens": 4,
+                    "logit_bias": {str(biased_token_id): 100.0},  # Strong bias
+                },
+                "return_logprob": True,
+            },
+            {
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 1.0,
+                    "max_new_tokens": 4,
+                },
+                "return_logprob": True,
+            },
+        ]
+
+        # Send both requests
+        responses = []
+        for req in requests_data:
+            response = requests.post(self.base_url + "/generate", json=req)
+            responses.append(response.json())
+
+        # Extract token IDs from each response
+        biased_tokens = [
+            x[1] for x in responses[0]["meta_info"]["output_token_logprobs"]
+        ]
+        unbiased_tokens = [
+            x[1] for x in responses[1]["meta_info"]["output_token_logprobs"]
+        ]
+
+        # Verify first response contains only biased tokens
+        self.assertTrue(
+            all(x == biased_token_id for x in biased_tokens),
+            f"Expected all tokens to be {biased_token_id} in first response, but got {biased_tokens}",
+        )
+
+        # Verify second response contains at least some different tokens
+        # (We can't guarantee exactly what tokens will be generated, but they shouldn't all be the biased token)
+        self.assertTrue(
+            any(x != biased_token_id for x in unbiased_tokens),
+            f"Expected some tokens to be different from {biased_token_id} in second response, but got {unbiased_tokens}",
+        )
+
+    def test_get_server_info_concurrent(self):
+        """Make sure the concurrent get_server_info doesn't crash the server."""
+        tp = ThreadPoolExecutor(max_workers=30)
+
+        def s():
+            server_info = requests.get(self.base_url + "/get_server_info")
+            server_info.json()
+
+        futures = []
+        for _ in range(4):
+            futures.append(tp.submit(s))
+
+        for f in futures:
+            f.result()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py
new file mode 100644
index 000000000..a50669d48
--- /dev/null
+++ b/test/srt/test_srt_engine.py
@@ -0,0 +1,219 @@
+"""
+Usage:
+python3 -m unittest test_srt_engine.TestSRTEngine.test_4_sync_async_stream_combination
+"""
+
+import asyncio
+import json
+import unittest
+from types import SimpleNamespace
+
+import torch
+
+import sglang as sgl
+from sglang.bench_offline_throughput import BenchArgs, throughput_test
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.server_args import ServerArgs
+from sglang.test.few_shot_gsm8k_engine import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+)
+
+
+class TestSRTEngine(CustomTestCase):
+
+    def test_1_engine_runtime_consistency(self):
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+        sampling_params = {"temperature": 0, "max_new_tokens": 8}
+
+        engine = sgl.Engine(model_path=model_path, random_seed=42)
+        out1 = engine.generate(prompt, sampling_params)["text"]
+        engine.shutdown()
+
+        runtime = sgl.Runtime(model_path=model_path, random_seed=42)
+        out2 = json.loads(runtime.generate(prompt, sampling_params))["text"]
+        runtime.shutdown()
+
+        print("==== Answer 1 ====")
+        print(out1)
+
+        print("==== Answer 2 ====")
+        print(out2)
+        self.assertEqual(out1, out2)
+
+    def test_2_engine_runtime_encode_consistency(self):
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST
+
+        engine = sgl.Engine(model_path=model_path, is_embedding=True, random_seed=42)
+        out1 = torch.tensor(engine.encode(prompt)["embedding"])
+        engine.shutdown()
+
+        runtime = sgl.Runtime(model_path=model_path, is_embedding=True, random_seed=42)
+        out2 = torch.tensor(json.loads(runtime.encode(prompt))["embedding"])
+        runtime.shutdown()
+
+        self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3))
+
+    def test_3_engine_token_ids_consistency(self):
+        # just to ensure there is no issue running multiple generate calls
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        sampling_params = {"temperature": 0, "max_new_tokens": 8}
+
+        engine = sgl.Engine(
+            model_path=model_path, random_seed=42, disable_radix_cache=True
+        )
+        out1 = engine.generate(prompt, sampling_params)["text"]
+
+        tokenizer = get_tokenizer(model_path)
+        token_ids = tokenizer.encode(prompt)
+        out2 = engine.generate(input_ids=token_ids, sampling_params=sampling_params)[
+            "text"
+        ]
+
+        engine.shutdown()
+
+        print("==== Answer 1 ====")
+        print(out1)
+
+        print("==== Answer 2 ====")
+        print(out2)
+        self.assertEqual(out1, out2)
+
+    def test_4_sync_async_stream_combination(self):
+        prompt = "AI safety is"
+        sampling_params = {"temperature": 0.8, "top_p": 0.95}
+
+        # Create an LLM.
+        llm = sgl.Engine(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+        )
+
+        if True:
+            # 1. sync + non streaming
+            print("\n\n==== 1. sync + non streaming ====")
+            output = llm.generate(prompt, sampling_params)
+            print(output["text"])
+
+            # 2. sync + streaming
+            print("\n\n==== 2. sync + streaming ====")
+            output_generator = llm.generate(prompt, sampling_params, stream=True)
+            offset = 0
+            for output in output_generator:
+                print(output["text"][offset:], end="", flush=True)
+                offset = len(output["text"])
+            print()
+
+        if True:
+            loop = asyncio.get_event_loop()
+            # 3. async + non_streaming
+            print("\n\n==== 3. async + non streaming ====")
+            output = loop.run_until_complete(
+                llm.async_generate(prompt, sampling_params)
+            )
+            print(output["text"])
+
+            # 4. async + streaming
+            async def async_streaming(engine):
+                generator = await engine.async_generate(
+                    prompt, sampling_params, stream=True
+                )
+
+                offset = 0
+                async for output in generator:
+                    print(output["text"][offset:], end="", flush=True)
+                    offset = len(output["text"])
+                print()
+
+            print("\n\n==== 4. async + streaming ====")
+            loop.run_until_complete(async_streaming(llm))
+
+        llm.shutdown()
+
+    def test_5_gsm8k(self):
+
+        args = SimpleNamespace(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            local_data_path=None,
+            num_shots=5,
+            num_questions=1400,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["accuracy"], 0.33)
+
+    def test_6_engine_cpu_offload(self):
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+        sampling_params = {"temperature": 0, "max_new_tokens": 8}
+
+        engine = sgl.Engine(
+            model_path=model_path,
+            random_seed=42,
+            max_total_tokens=128,
+        )
+        out1 = engine.generate(prompt, sampling_params)["text"]
+        engine.shutdown()
+
+        engine = sgl.Engine(
+            model_path=model_path,
+            random_seed=42,
+            max_total_tokens=128,
+            cpu_offload_gb=3,
+        )
+        out2 = engine.generate(prompt, sampling_params)["text"]
+        engine.shutdown()
+
+        print("==== Answer 1 ====")
+        print(out1)
+
+        print("==== Answer 2 ====")
+        print(out2)
+        self.assertEqual(out1, out2)
+
+    def test_7_engine_offline_throughput(self):
+        server_args = ServerArgs(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+        )
+        bench_args = BenchArgs(num_prompts=10)
+        result = throughput_test(server_args=server_args, bench_args=bench_args)
+        self.assertGreater(result["total_throughput"], 3000)
+
+    def test_8_engine_async_encode_consistency(self):
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST
+
+        engine = sgl.Engine(
+            model_path=model_path,
+            is_embedding=True,
+            random_seed=42,
+            disable_radix_cache=True,
+        )
+
+        # Get sync and async embeddings
+        out1 = torch.tensor(engine.encode(prompt)["embedding"])
+        loop = asyncio.get_event_loop()
+        out2 = torch.tensor(
+            loop.run_until_complete(engine.async_encode(prompt))["embedding"]
+        )
+
+        engine.shutdown()
+
+        print("\n==== Shapes ====")
+        print(f"sync shape: {out1.shape}")
+        print(f"async shape: {out2.shape}")
+
+        self.assertTrue(
+            torch.allclose(out1, out2, atol=1e-5, rtol=1e-3),
+            "Sync and async embeddings are not equal within tolerance",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_srt_engine_with_quant_args.py b/test/srt/test_srt_engine_with_quant_args.py
new file mode 100644
index 000000000..47baf5688
--- /dev/null
+++ b/test/srt/test_srt_engine_with_quant_args.py
@@ -0,0 +1,60 @@
+import unittest
+
+import sglang as sgl
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
+
+
+class TestSRTEngineWithQuantArgs(CustomTestCase):
+
+    def test_1_quantization_args(self):
+
+        # we only test fp8 because other methods are currently dependent on vllm. We can add other methods back to test after vllm dependency is resolved.
+        quantization_args_list = [
+            # "awq",
+            "fp8",
+            # "gptq",
+            # "marlin",
+            # "gptq_marlin",
+            # "awq_marlin",
+            # "bitsandbytes",
+            # "gguf",
+        ]
+
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+        sampling_params = {"temperature": 0, "max_new_tokens": 8}
+
+        for quantization_args in quantization_args_list:
+            engine = sgl.Engine(
+                model_path=model_path, random_seed=42, quantization=quantization_args
+            )
+            engine.generate(prompt, sampling_params)
+            engine.shutdown()
+
+    def test_2_torchao_args(self):
+
+        # we don't test int8dq because currently there is conflict between int8dq and capture cuda graph
+        torchao_args_list = [
+            # "int8dq",
+            "int8wo",
+            "fp8wo",
+            "fp8dq-per_tensor",
+            "fp8dq-per_row",
+        ] + [f"int4wo-{group_size}" for group_size in [32, 64, 128, 256]]
+
+        prompt = "Today is a sunny day and I like"
+        model_path = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+        sampling_params = {"temperature": 0, "max_new_tokens": 8}
+
+        for torchao_config in torchao_args_list:
+            engine = sgl.Engine(
+                model_path=model_path, random_seed=42, torchao_config=torchao_config
+            )
+            engine.generate(prompt, sampling_params)
+            engine.shutdown()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_standalone_speculative_decoding.py b/test/srt/test_standalone_speculative_decoding.py
new file mode 100644
index 000000000..e2962b716
--- /dev/null
+++ b/test/srt/test_standalone_speculative_decoding.py
@@ -0,0 +1,115 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+GSM_DATASET_PATH = None
+
+
+# Default server arguments shared across all tests
+DEFAULT_SERVER_ARGS = [
+    "--trust-remote-code",
+    "--cuda-graph-max-bs",
+    "8",
+    "--speculative-algorithm",
+    "STANDALONE",
+    "--speculative-draft-model-path",
+    DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST,
+    "--speculative-num-steps",
+    "4",
+    "--speculative-eagle-topk",
+    "2",
+    "--speculative-num-draft-tokens",
+    "7",
+    "--mem-fraction-static",
+    0.7,
+]
+
+
+class TestStandaloneSpeculativeDecodingBase(CustomTestCase):
+
+    model = DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST
+    draft_model = DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST
+    base_url = DEFAULT_URL_FOR_TEST
+    accuracy_threshold = 0.7  # derived tests need to override this
+    spec_decode_threshold = 3.6  # derived spec decoding tests need to override this
+
+    @classmethod
+    def get_server_args(cls):
+        """Return the arguments for the server launch. Override in subclasses."""
+        return DEFAULT_SERVER_ARGS + ["--attention-backend", "fa3"]
+
+    @classmethod
+    def setUpClass(cls):
+        # disable deep gemm precompile to make launch server faster
+        # please don't do this if you want to make your inference workload faster
+        os.environ["SGL_JIT_DEEPGEMM_PRECOMPILE"] = "false"
+        os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "false"
+        model = cls.model
+        cls.process = popen_launch_server(
+            model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=cls.get_server_args(),
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        requests.get(self.base_url + "/flush_cache")
+
+        args = SimpleNamespace(
+            num_shots=4,
+            num_questions=100,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+            data_path=GSM_DATASET_PATH,
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        # Use the appropriate metric key based on the test class
+        metric_key = "accuracy"
+        self.assertGreater(metrics[metric_key], self.accuracy_threshold)
+
+        server_info = requests.get(self.base_url + "/get_server_info")
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+        self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold)
+
+
+class TestStandaloneSpeculativeDecodingTriton(TestStandaloneSpeculativeDecodingBase):
+
+    @classmethod
+    def get_server_args(cls):
+        return DEFAULT_SERVER_ARGS + ["--attention-backend", "triton"]
+
+
+class TestStandaloneSpeculativeDecodingFlashinfer(
+    TestStandaloneSpeculativeDecodingBase
+):
+    @classmethod
+    def get_server_args(cls):
+        return DEFAULT_SERVER_ARGS + ["--attention-backend", "flashinfer"]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_start_profile.py b/test/srt/test_start_profile.py
new file mode 100644
index 000000000..60f5f7960
--- /dev/null
+++ b/test/srt/test_start_profile.py
@@ -0,0 +1,115 @@
+"""
+Usage:
+python3 -m unittest test_srt_engine.TestSRTEngine.test_4_sync_async_stream_combination
+"""
+
+import os
+import shutil
+import unittest
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+OUTPUT_DIR = "./profiler_dir"
+
+
+class TestStartProfile(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def setUp(self):
+        self._clear_profile_dir()
+
+    def test_start_profile_1(self):
+        """Test /start_profile with start_step and num_steps argument. This have to be the first test for start_step to work"""
+        response = self._start_profile(start_step="15", num_steps=5)
+
+        self._post_request()
+
+        self._check_non_empty_profile_dir()
+
+    def test_start_profile_2(self):
+        """Test /start_profile with no argument"""
+        response = self._start_profile()
+
+        self._post_request()
+
+        # Before /stop_profile, the profile directory should be empty
+        self._check_empty_profile_dir()
+
+        # Post /stop_profile and check the profile directory is non-empty
+        response = requests.post(
+            f"{DEFAULT_URL_FOR_TEST}/stop_profile",
+        )
+        self._check_non_empty_profile_dir()
+
+    def test_start_profile_3(self):
+        """Test /start_profile with num_steps argument"""
+        response = self._start_profile(num_steps=5)
+
+        self._post_request()
+
+        self._check_non_empty_profile_dir()
+
+    def _start_profile(self, **kwargs):
+        """Start profiling with optional parameters."""
+        response = requests.post(
+            f"{DEFAULT_URL_FOR_TEST}/start_profile",
+            json=kwargs if kwargs else None,
+        )
+        self.assertEqual(response.status_code, 200)
+
+    def _post_request(self):
+        response = requests.post(
+            f"{DEFAULT_URL_FOR_TEST}/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 32,
+                },
+            },
+        )
+        self.assertEqual(response.status_code, 200)
+
+    def _clear_profile_dir(self):
+        if os.path.isdir(OUTPUT_DIR):
+            # Remove the directory and all its contents
+            shutil.rmtree(OUTPUT_DIR)
+
+    def _check_non_empty_profile_dir(self):
+        self.assertTrue(os.path.isdir(OUTPUT_DIR), "Output directory does not exist.")
+        self.assertNotEqual(
+            len(os.listdir(OUTPUT_DIR)), 0, "Output directory is empty!"
+        )
+
+    def _check_empty_profile_dir(self):
+        if os.path.isdir(OUTPUT_DIR):
+            self.assertEqual(
+                len(os.listdir(OUTPUT_DIR)), 0, "Output directory is non-empty!"
+            )
+
+
+if __name__ == "__main__":
+    os.environ["SGLANG_TORCH_PROFILER_DIR"] = OUTPUT_DIR
+    unittest.main()
diff --git a/test/srt/test_swa_unittest.py b/test/srt/test_swa_unittest.py
new file mode 100644
index 000000000..128462029
--- /dev/null
+++ b/test/srt/test_swa_unittest.py
@@ -0,0 +1,182 @@
+import unittest
+
+import torch
+
+from sglang.srt.mem_cache.allocator import SWAKVPool, SWATokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.mem_cache.radix_cache import SWARadixCache
+
+
+class TestSWA(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        pass
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    def test_swa_memory_pool(self):
+        size = 16
+        size_swa = 16
+        num_head = 8
+        head_dim = 128
+        num_layers = 48
+        global_interval = 4
+        dtype = torch.bfloat16
+        device = "cuda"
+        full_attention_layer_ids = [i for i in range(0, num_layers, global_interval)]
+        full_attention_layer_ids_set = set(full_attention_layer_ids)
+        swa_attention_layer_ids = [
+            i for i in range(num_layers) if i not in full_attention_layer_ids_set
+        ]
+        pool = SWAKVPool(
+            size=size,
+            size_swa=size_swa,
+            dtype=dtype,
+            num_head=num_head,
+            head_dim=head_dim,
+            swa_attention_layer_ids=swa_attention_layer_ids,
+            full_attention_layer_ids=full_attention_layer_ids,
+            device=device,
+        )
+        alloc = SWATokenToKVPoolAllocator(
+            size=size, size_swa=size_swa, dtype=dtype, device=device, kvcache=pool
+        )
+        assert alloc.available_size() == size + size_swa
+        index = alloc.alloc(1)
+        assert alloc.available_size() == size_swa + size_swa - 2
+        alloc.free_swa(index)
+        result = alloc.translate_loc_from_full_to_swa(index)
+        print(result)
+
+    def test_swa_radix_cache_1(self):
+        # args
+        req_size = 10
+        max_context_len = 128
+        kv_size = 128
+        kv_size_swa = 64
+        sliding_window_size = 4
+        num_head = 8
+        head_dim = 128
+        num_layers = 48
+        global_interval = 4
+        dtype = torch.bfloat16
+        device = "cuda"
+        full_attention_layer_ids = [i for i in range(0, num_layers, global_interval)]
+        full_attention_layer_ids_set = set(full_attention_layer_ids)
+        swa_attention_layer_ids = [
+            i for i in range(num_layers) if i not in full_attention_layer_ids_set
+        ]
+        # setup req to token pool
+        req_to_token_pool = ReqToTokenPool(
+            size=req_size,
+            max_context_len=max_context_len,
+            device=device,
+            enable_memory_saver=False,
+        )
+        # setup kv pool
+        kv_pool = SWAKVPool(
+            size=kv_size,
+            size_swa=kv_size_swa,
+            dtype=dtype,
+            num_head=num_head,
+            head_dim=head_dim,
+            swa_attention_layer_ids=swa_attention_layer_ids,
+            full_attention_layer_ids=full_attention_layer_ids,
+            device=device,
+        )
+        # setup token to kv pool allocator
+        allocator = SWATokenToKVPoolAllocator(
+            size=kv_size,
+            size_swa=kv_size_swa,
+            dtype=dtype,
+            device=device,
+            kvcache=kv_pool,
+        )
+        # setup radix cache
+        tree = SWARadixCache(
+            req_to_token_pool=req_to_token_pool,
+            token_to_kv_pool_allocator=allocator,
+            sliding_window_size=sliding_window_size,
+            page_size=1,
+            disable=False,
+        )
+
+        # test
+        print(
+            f"[Start] allocator swa available size: {allocator.swa_available_size()}, full available size: {allocator.full_available_size()}"
+        )
+        req1_token_ids, req1_kv_indices = [1, 2, 3], allocator.alloc(3)
+        assert len(req1_token_ids) == len(req1_kv_indices)
+        print(
+            f"req1: inserting, req1_token_ids: {req1_token_ids}, req1_kv_indices: {req1_kv_indices}"
+        )
+        prefix_len = tree.insert(req1_token_ids, req1_kv_indices)
+        print(
+            f"req1: prefix_len: {prefix_len}, allocator swa available size: {allocator.swa_available_size()}, full available size: {allocator.full_available_size()}"
+        )
+        req2_token_ids, req2_kv_indices = [1, 2, 3, 4, 5, 6, 7], allocator.alloc(7)
+        assert len(req2_token_ids) == len(req2_kv_indices)
+        print(
+            f"req2: inserting, req2_token_ids: {req2_token_ids}, req2_kv_indices: {req2_kv_indices}"
+        )
+        prefix_len = tree.insert(req2_token_ids, req2_kv_indices)
+        print(
+            f"req2: prefix_len: {prefix_len}, allocator swa available size: {allocator.swa_available_size()}, full available size: {allocator.full_available_size()}"
+        )
+        req3_token_ids, req3_kv_indices = [10, 11, 12], allocator.alloc(3)
+        assert len(req3_token_ids) == len(req3_kv_indices)
+        print(
+            f"req3: inserting, req3_token_ids: {req3_token_ids}, req3_kv_indices: {req3_kv_indices}"
+        )
+        prefix_len = tree.insert(req3_token_ids, req3_kv_indices)
+        print(
+            f"req3: prefix_len: {prefix_len}, allocator swa available size: {allocator.swa_available_size()}, full available size: {allocator.full_available_size()}"
+        )
+        req4_token_ids, req4_kv_indices = [1, 2, 3, 4, 5, 60, 70], allocator.alloc(7)
+        assert len(req4_token_ids) == len(req4_kv_indices)
+        print(
+            f"req4: inserting, req4_token_ids: {req4_token_ids}, req4_kv_indices: {req4_kv_indices}"
+        )
+        prefix_len = tree.insert(req4_token_ids, req4_kv_indices)
+        print(
+            f"req4: prefix_len: {prefix_len}, allocator swa available size: {allocator.swa_available_size()}, full available size: {allocator.full_available_size()}"
+        )
+
+        tree.pretty_print()
+        full_num_tokens, swa_num_tokens = 1, 0
+        print(f"evicting {full_num_tokens} full token and {swa_num_tokens} swa token")
+        tree.evict(full_num_tokens=full_num_tokens, swa_num_tokens=swa_num_tokens)
+        tree.pretty_print()
+
+        full_num_tokens, swa_num_tokens = 0, 1
+        print(f"evicting {full_num_tokens} full token and {swa_num_tokens} swa token")
+        tree.evict(full_num_tokens=full_num_tokens, swa_num_tokens=swa_num_tokens)
+        tree.pretty_print()
+
+        full_num_tokens, swa_num_tokens = 1, 2
+        print(f"evicting {full_num_tokens} full token and {swa_num_tokens} swa token")
+        tree.evict(full_num_tokens=full_num_tokens, swa_num_tokens=swa_num_tokens)
+        tree.pretty_print()
+
+        req5_token_ids = [1, 2, 3, 4, 5]
+        kv_indices, last_node = tree.match_prefix(req5_token_ids)
+        print(
+            f"req5: token_ids: {req5_token_ids}, matched kv_indices: {kv_indices}, last_node.key: {last_node.key}"
+        )
+        assert len(kv_indices) == 0
+
+        req6_token_ids = [1, 2, 3, 4, 5, 60, 70]
+        kv_indices, last_node = tree.match_prefix(req6_token_ids)
+        print(
+            f"req6: token_ids: {req6_token_ids}, matched kv_indices: {kv_indices}, last_node.key: {last_node.key}"
+        )
+        assert len(kv_indices) == 7
+        assert len(last_node.key) == 2
+        assert last_node.key[0] == 60
+        assert last_node.key[1] == 70
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_tokenizer_batch_encode.py b/test/srt/test_tokenizer_batch_encode.py
new file mode 100644
index 000000000..f3294c049
--- /dev/null
+++ b/test/srt/test_tokenizer_batch_encode.py
@@ -0,0 +1,121 @@
+"""
+Unit tests for enable_tokenizer_batch_encode feature.
+
+This tests the batch tokenization functionality which allows processing
+multiple text inputs in a single batch for improved performance.
+
+Usage:
+python3 -m unittest test_tokenizer_batch_encode.TestTokenizerBatchEncode.test_batch_validation_constraints
+python3 -m unittest test_tokenizer_batch_encode.TestTokenizerBatchEncodeUnit.test_batch_tokenize_and_process_logic
+python3 -m unittest test_tokenizer_batch_encode.TestTokenizerBatchEncodeLogic.test_batch_processing_path
+"""
+
+import asyncio
+import unittest
+from typing import List
+from unittest.mock import AsyncMock, Mock, call, patch
+
+from sglang.srt.managers.io_struct import GenerateReqInput, TokenizedGenerateReqInput
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+
+class TestTokenizerBatchEncode(unittest.TestCase):
+    """Test cases for tokenizer batch encoding validation and setup."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.server_args = ServerArgs(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            enable_tokenizer_batch_encode=True,
+        )
+        self.port_args = PortArgs.init_new(self.server_args)
+
+        with patch("zmq.asyncio.Context"), patch(
+            "sglang.srt.utils.get_zmq_socket"
+        ), patch("sglang.srt.hf_transformers_utils.get_tokenizer") as mock_tokenizer:
+
+            mock_tokenizer.return_value = Mock(vocab_size=32000)
+            self.tokenizer_manager = TokenizerManager(self.server_args, self.port_args)
+
+    def test_batch_encode_enabled(self):
+        """Test that batch encoding is enabled when configured."""
+        self.assertTrue(self.server_args.enable_tokenizer_batch_encode)
+
+    def test_batch_encode_disabled(self):
+        """Test that batch encoding can be disabled."""
+        server_args_disabled = ServerArgs(
+            model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            enable_tokenizer_batch_encode=False,
+        )
+        self.assertFalse(server_args_disabled.enable_tokenizer_batch_encode)
+
+    def test_multimodal_input_validation(self):
+        """Test that multimodal inputs are rejected in batch mode."""
+        req = GenerateReqInput(text="test", image_data=["dummy"])
+        req.contains_mm_input = Mock(return_value=True)
+
+        batch_obj = Mock()
+        batch_obj.__getitem__ = lambda self, i: req
+
+        self.tokenizer_manager.is_generation = True
+
+        with self.assertRaises(ValueError) as cm:
+            self.tokenizer_manager._validate_batch_tokenization_constraints(
+                1, batch_obj
+            )
+
+        self.assertIn("multimodal", str(cm.exception))
+
+    def test_pretokenized_input_validation(self):
+        """Test that pre-tokenized inputs are rejected in batch mode."""
+        req = GenerateReqInput(input_ids=[1, 2, 3])
+
+        batch_obj = Mock()
+        batch_obj.__getitem__ = lambda self, i: req
+
+        with self.assertRaises(ValueError) as cm:
+            self.tokenizer_manager._validate_batch_tokenization_constraints(
+                1, batch_obj
+            )
+
+        self.assertIn("pre-tokenized", str(cm.exception))
+
+    def test_input_embeds_validation(self):
+        """Test that input embeds are rejected in batch mode."""
+        req = GenerateReqInput(input_embeds=[0.1, 0.2])
+
+        batch_obj = Mock()
+        batch_obj.__getitem__ = lambda self, i: req
+
+        with self.assertRaises(ValueError) as cm:
+            self.tokenizer_manager._validate_batch_tokenization_constraints(
+                1, batch_obj
+            )
+
+        self.assertIn("input_embeds", str(cm.exception))
+
+    def test_valid_text_only_requests_pass_validation(self):
+        """Test that valid text-only requests pass validation."""
+        # Create valid requests (text-only)
+        requests = []
+        for i in range(3):
+            req = GenerateReqInput(text=f"test text {i}")
+            req.contains_mm_input = Mock(return_value=False)
+            requests.append(req)
+
+        batch_obj = Mock()
+        batch_obj.__getitem__ = Mock(side_effect=lambda i: requests[i])
+
+        # Should not raise any exception
+        try:
+            self.tokenizer_manager._validate_batch_tokenization_constraints(
+                3, batch_obj
+            )
+        except Exception as e:
+            self.fail(f"Validation failed for valid text-only requests: {e}")
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py
new file mode 100644
index 000000000..36815718a
--- /dev/null
+++ b/test/srt/test_torch_compile.py
@@ -0,0 +1,80 @@
+import time
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    popen_launch_server,
+)
+
+
+class TestTorchCompile(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--enable-torch-compile", "--cuda-graph-max-bs", "4"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.65)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                    "ignore_eos": True,
+                },
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        # Warmup
+        res = self.run_decode(16)
+
+        max_tokens = 256
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(f"{res=}")
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+
+        if is_in_amd_ci():
+            self.assertGreaterEqual(throughput, 145)
+        else:
+            self.assertGreaterEqual(throughput, 152)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_torch_compile_moe.py b/test/srt/test_torch_compile_moe.py
new file mode 100644
index 000000000..8bc7b45d3
--- /dev/null
+++ b/test/srt/test_torch_compile_moe.py
@@ -0,0 +1,77 @@
+import time
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import is_cuda, kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestTorchCompileMoe(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--enable-torch-compile", "--torch-compile-max-bs", "4"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.50)
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                    "ignore_eos": True,
+                },
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        # Warmup
+        res = self.run_decode(16)
+
+        max_tokens = 256
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(f"{res=}")
+        throughput = max_tokens / (tok - tic)
+        if is_cuda():
+            self.assertGreaterEqual(throughput, 285)
+        else:
+            self.assertGreaterEqual(throughput, 270)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_torch_native_attention_backend.py b/test/srt/test_torch_native_attention_backend.py
new file mode 100644
index 000000000..5bf012cca
--- /dev/null
+++ b/test/srt/test_torch_native_attention_backend.py
@@ -0,0 +1,47 @@
+"""
+Usage:
+python3 -m unittest test_triton_attention_backend.TestTritonAttnBackend.test_mmlu
+"""
+
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestTorchNativeAttnBackend(CustomTestCase):
+    def test_mmlu(self):
+        model = DEFAULT_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_TEST
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--attention-backend", "torch_native"],
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=base_url,
+                model=model,
+                eval_name="mmlu",
+                num_examples=64,
+                num_threads=32,
+            )
+
+            metrics = run_eval(args)
+            self.assertGreaterEqual(metrics["score"], 0.65)
+        finally:
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_torch_tp.py b/test/srt/test_torch_tp.py
new file mode 100644
index 000000000..dca2612d4
--- /dev/null
+++ b/test/srt/test_torch_tp.py
@@ -0,0 +1,30 @@
+import unittest
+
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    run_bench_offline_throughput,
+)
+
+
+class TestTorchTP(CustomTestCase):
+    def test_torch_native_llama(self):
+        output_throughput = run_bench_offline_throughput(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+            [
+                "--tp",
+                "2",
+                # This cannot run anymore with the new torch version.
+                # "--json-model-override-args",
+                # '{"architectures": ["TorchNativeLlamaForCausalLM"]}',
+                "--disable-cuda-graph",
+            ],
+        )
+
+        if is_in_ci():
+            self.assertGreater(output_throughput, 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_torchao.py b/test/srt/test_torchao.py
new file mode 100644
index 000000000..13c7b60b5
--- /dev/null
+++ b/test/srt/test_torchao.py
@@ -0,0 +1,74 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestTorchAO(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--torchao-config", "int4wo-128"],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        assert metrics["score"] >= 0.60
+
+    def run_decode(self, max_new_tokens):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": "The capital of France is",
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": max_new_tokens,
+                },
+                "ignore_eos": True,
+            },
+        )
+        return response.json()
+
+    def test_throughput(self):
+        import time
+
+        max_tokens = 256
+
+        tic = time.perf_counter()
+        res = self.run_decode(max_tokens)
+        tok = time.perf_counter()
+        print(res["text"])
+        throughput = max_tokens / (tok - tic)
+        print(f"Throughput: {throughput} tokens/s")
+        assert throughput >= 210
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_triton_attention_backend.py b/test/srt/test_triton_attention_backend.py
new file mode 100644
index 000000000..05725301a
--- /dev/null
+++ b/test/srt/test_triton_attention_backend.py
@@ -0,0 +1,66 @@
+"""
+Usage:
+python3 -m unittest test_triton_attention_backend.TestTritonAttnBackend.test_mmlu
+"""
+
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+
+class TestTritonAttnBackend(CustomTestCase):
+    def test_latency(self):
+        output_throughput = run_bench_offline_throughput(
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            [
+                "--attention-backend",
+                "triton",
+                "--enable-torch-compile",
+                "--cuda-graph-max-bs",
+                4,
+            ],
+        )
+
+        print(f"{output_throughput=}")
+
+        if is_in_ci():
+            self.assertGreater(output_throughput, 153)
+
+    def test_mmlu(self):
+        model = DEFAULT_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_TEST
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--attention-backend", "triton"],
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=base_url,
+                model=model,
+                eval_name="mmlu",
+                num_examples=64,
+                num_threads=32,
+            )
+
+            metrics = run_eval(args)
+            self.assertGreaterEqual(metrics["score"], 0.65)
+        finally:
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_triton_attention_kernels.py b/test/srt/test_triton_attention_kernels.py
new file mode 100644
index 000000000..b15684f9a
--- /dev/null
+++ b/test/srt/test_triton_attention_kernels.py
@@ -0,0 +1,576 @@
+import random
+import unittest
+
+import torch
+import torch.nn.functional as F
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import (
+    decode_attention_fwd,
+    decode_attention_fwd_grouped,
+    decode_attention_fwd_normal,
+)
+from sglang.srt.layers.attention.triton_ops.extend_attention import (
+    extend_attention_fwd,
+    redundant_attention,
+)
+from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+    context_attention_fwd,
+)
+from sglang.test.test_utils import CustomTestCase
+
+
+def extend_attention_fwd_torch(
+    q: torch.Tensor,  # [extend_tokens, H_Q, D]
+    k: torch.Tensor,  # [extend_tokens, H_KV, D]
+    v: torch.Tensor,  # [extend_tokens, H_KV, D]
+    o: torch.Tensor,  # [extend_tokens, H_Q, D]
+    k_cache: torch.Tensor,  # [total_tokens, H_KV, D]
+    v_cache: torch.Tensor,  # [total_tokens, H_KV, D]
+    qo_indptr: torch.Tensor,  # [B+1]
+    kv_indptr: torch.Tensor,  # [B+1]
+    kv_indices: torch.Tensor,  # [prefix_tokens]
+    sliding_window_size: int,
+):
+    B = qo_indptr.size(0) - 1
+    _, H_Q, D = q.shape
+    _, H_KV, _ = k.shape
+
+    group_size = H_Q // H_KV
+    scale = 1.0 / D**0.5
+
+    for i in range(B):
+        q_start = int(qo_indptr[i].item())
+        q_end = int(qo_indptr[i + 1].item())
+        kv_start = int(kv_indptr[i].item())
+        kv_end = int(kv_indptr[i + 1].item())
+
+        prefix_indices = kv_indices[kv_start:kv_end]
+        k_prefix = k_cache[prefix_indices]  # [prefix_len, H_KV, D]
+        v_prefix = v_cache[prefix_indices]  # [prefix_len, H_KV, D]
+
+        k_extend = k[q_start:q_end]  # [extend_len, H_KV, D]
+        v_extend = v[q_start:q_end]  # [extend_len, H_KV, D]
+        q_extend = q[q_start:q_end]  # [extend_len, H_Q,  D]
+
+        k_full = torch.cat([k_prefix, k_extend], dim=0)  # [total_len, H_KV, D]
+        v_full = torch.cat([v_prefix, v_extend], dim=0)  # [total_len, H_KV, D]
+
+        if group_size != 1:
+            k_full_hq = k_full.repeat_interleave(
+                group_size, dim=1
+            )  # [total_len, H_Q, D]
+            v_full_hq = v_full.repeat_interleave(
+                group_size, dim=1
+            )  # [total_len, H_Q, D]
+        else:
+            k_full_hq = k_full
+            v_full_hq = v_full
+
+        prefix_len = k_prefix.size(0)
+        extend_len = k_extend.size(0)
+        total_len = prefix_len + extend_len
+
+        # causal
+        pos_keys = torch.arange(total_len, device=q.device)
+        t = prefix_len + torch.arange(extend_len, device=q.device)  # [extend_len]
+        causal_mask = pos_keys.unsqueeze(0) <= t.unsqueeze(1)
+
+        # sliding window
+        if sliding_window_size is not None and sliding_window_size > 0:
+            start = (t - (sliding_window_size)).clamp_min(0)  # [extend_len]
+        else:
+            start = torch.zeros_like(t)
+        window_mask = pos_keys.unsqueeze(0) >= start.unsqueeze(1)
+
+        final_mask = causal_mask & window_mask
+
+        attn_scores = (
+            torch.einsum("qhd,khd->qhk", q_extend, k_full_hq) * scale
+        )  # [extend_len, H_Q, total_len]
+        attn_scores = attn_scores.masked_fill(~final_mask.unsqueeze(1), float("-inf"))
+
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        o[q_start:q_end] = torch.einsum("qhk,khd->qhd", attn_weights, v_full_hq)
+
+
+class TestTritonAttention(CustomTestCase):
+
+    def _set_all_seeds(self, seed):
+        """Set all random seeds for reproducibility."""
+        random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+    def setUp(self):
+        # Set seeds before each test method
+        self._set_all_seeds(42)
+
+    def _test_extend_attention_once(self, B, N_CTX, H_Q, H_KV, D):
+        dtype = torch.bfloat16
+
+        b_seq_len_prefix = torch.randint(
+            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+        )
+        b_seq_len_extend = torch.randint(
+            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+        )
+        b_seq_len = b_seq_len_prefix + b_seq_len_extend
+        max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
+
+        b_req_idx = torch.arange(B, dtype=torch.int32, device="cuda")
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+        kv_indices = torch.zeros(
+            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
+        )
+
+        for i in range(B):
+            kv_indices[kv_indptr[i] : kv_indptr[i + 1]] = torch.arange(
+                b_start_loc[i], b_start_loc[i] + b_seq_len_prefix[i]
+            )
+
+        total_token_num = torch.sum(b_seq_len).item()
+        extend_token_num = torch.sum(b_seq_len_extend).item()
+        k_buffer = torch.empty(
+            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+        ).normal_(mean=0.1, std=0.2)
+        v_buffer = torch.empty(
+            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+        ).normal_(mean=0.1, std=0.2)
+
+        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        for i in range(B):
+            extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+            extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+            extend_start = b_start_loc_extend[i]
+            extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
+            k_extend[extend_start:extend_end] = k_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            v_extend[extend_start:extend_end] = v_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            q_extend[extend_start:extend_end] = torch.empty(
+                (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
+            ).normal_(mean=0.1, std=0.2)
+
+        o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        o_extend_mask = torch.empty(
+            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+        )
+        o_redundant = torch.empty(
+            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+        )
+
+        b_seq_len_extend = b_seq_len - b_seq_len_prefix
+        max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+
+        custom_mask = None
+        mask_indptr = None
+
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask,
+            True,
+            mask_indptr,
+            max_len_extend,
+        )
+
+        b_seq_mask_len = b_seq_len_extend * b_seq_len
+        custom_mask = torch.ones(
+            (b_seq_mask_len.sum().item(),), dtype=torch.bool, device="cuda"
+        )
+        mask_indptr = torch.zeros((B + 1,), dtype=torch.int64, device="cuda")
+        mask_indptr[1 : B + 1] = torch.cumsum(b_seq_mask_len[:B], dim=0)
+        for i in range(B):
+            causal_mask = (
+                torch.tril(
+                    torch.ones(b_seq_len_extend[i], b_seq_len_extend[i]), diagonal=0
+                )
+                == 1
+            )
+            prefix_mask = torch.ones(
+                b_seq_len_extend[i], b_seq_len_prefix[i], dtype=torch.bool
+            )
+            mask_flatten = torch.cat([prefix_mask, causal_mask], dim=1).flatten()
+            custom_mask[mask_indptr[i] : mask_indptr[i + 1]] = mask_flatten
+
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend_mask,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask,
+            True,
+            mask_indptr,
+            max_len_extend,
+        )
+
+        redundant_attention(
+            q_extend,
+            o_redundant,
+            k_buffer,
+            v_buffer,
+            b_req_idx,
+            b_start_loc,
+            b_seq_len,
+            b_seq_len_prefix,
+            max_len_in_batch,
+        )
+
+        self.assertTrue(torch.allclose(o_extend, o_redundant, rtol=1e-2))
+        self.assertTrue(torch.allclose(o_extend_mask, o_redundant, rtol=1e-2))
+
+    def test_extend_attention(self):
+
+        # Define the varying parameter values
+        attention_values = [128, 96, 80, 13]
+
+        # Loop through the values and call the method
+        for value in attention_values:
+            self._test_extend_attention_once(19, 12331, 12, 4, value)
+
+    def _test_extend_attention_sliding_window_once(
+        self, B, N_CTX, H_Q, H_KV, D, WINDOW_SIZE
+    ):
+        dtype = torch.bfloat16
+
+        b_seq_len_prefix = torch.randint(
+            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+        )
+        b_seq_len_extend = torch.randint(
+            1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
+        )
+        b_seq_len = b_seq_len_prefix + b_seq_len_extend
+
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+        kv_indices = torch.zeros(
+            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
+        )
+
+        for i in range(B):
+            kv_indices[kv_indptr[i] : kv_indptr[i + 1]] = torch.arange(
+                b_start_loc[i], b_start_loc[i] + b_seq_len_prefix[i]
+            )
+
+        total_token_num = torch.sum(b_seq_len).item()
+        extend_token_num = torch.sum(b_seq_len_extend).item()
+        k_buffer = torch.empty(
+            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+        ).normal_(mean=0.1, std=0.2)
+        v_buffer = torch.empty(
+            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+        ).normal_(mean=0.1, std=0.2)
+
+        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        for i in range(B):
+            extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+            extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+            extend_start = b_start_loc_extend[i]
+            extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
+            k_extend[extend_start:extend_end] = k_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            v_extend[extend_start:extend_end] = v_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            q_extend[extend_start:extend_end] = torch.empty(
+                (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
+            ).normal_(mean=0.1, std=0.2)
+
+        o_extend_triton = torch.empty(
+            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+        )
+        o_extend_torch = torch.empty(
+            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+        )
+
+        b_seq_len_extend = b_seq_len - b_seq_len_prefix
+        max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend_triton,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask=None,
+            is_causal=True,
+            mask_indptr=None,
+            max_len_extend=max_len_extend,
+            sliding_window_size=WINDOW_SIZE,
+        )
+
+        extend_attention_fwd_torch(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend_torch,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            WINDOW_SIZE,
+        )
+
+        self.assertTrue(
+            torch.allclose(o_extend_triton, o_extend_torch, rtol=1e-3, atol=1e-3)
+        )
+
+    def test_extend_attention_sliding_window(self):
+        window_sizes = [-1, 127]
+        for window_size in window_sizes:
+            self._test_extend_attention_sliding_window_once(
+                19, 12331, 64, 8, 128, window_size
+            )
+
+    def _test_context_attention_once(self, head_dim, is_causal):
+        # Set up a simple test case
+        num_heads = 4
+        seq_lens = [8, 12]
+        max_seq_len = max(seq_lens)
+
+        # Create random input tensors
+        q = torch.randn(sum(seq_lens), num_heads, head_dim, device="cuda")
+        k = torch.randn(sum(seq_lens), num_heads, head_dim, device="cuda")
+        v = torch.randn(sum(seq_lens), num_heads, head_dim, device="cuda")
+        o = torch.zeros(sum(seq_lens), num_heads, head_dim, device="cuda")
+
+        # Create b_start_loc and b_seq_len tensors
+        b_start_loc = torch.tensor([0, seq_lens[0]], device="cuda")
+        b_seq_len = torch.tensor(seq_lens, device="cuda")
+
+        context_attention_fwd(
+            q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=is_causal
+        )
+
+        cu_seq_lens = [0] * (len(seq_lens) + 1)
+        for i, seq_len in enumerate(seq_lens):
+            cu_seq_lens[i + 1] = cu_seq_lens[i] + seq_len
+
+        for i in range(len(seq_lens)):
+            start, end = cu_seq_lens[i], cu_seq_lens[i + 1]
+            o_torch = torch.nn.functional.scaled_dot_product_attention(
+                q[start:end].permute(1, 0, 2),
+                k[start:end].permute(1, 0, 2),
+                v[start:end].permute(1, 0, 2),
+                is_causal=is_causal,
+            ).permute(1, 0, 2)
+
+            cos_sim = torch.nn.functional.cosine_similarity(
+                o[start:end].flatten(), o_torch.flatten(), dim=0
+            )
+            self.assertTrue(cos_sim.item() > 1 - (1e-5))
+            self.assertTrue(torch.allclose(o[start:end], o_torch, atol=1e-2))
+
+    def test_context_attention(self):
+        head_dim = [128, 96, 80, 13]
+
+        for dim in head_dim:
+            for is_causal in [True, False]:
+                self._test_context_attention_once(dim, is_causal)
+
+    def _test_decode_attention_once(self, B, H_Q, H_KV, D):
+        dtype = torch.bfloat16
+        seq_len = 10  # This represents the number of tokens already in the sequence
+        total_tokens = B * seq_len
+        sm_scale = 1.0 / (D**0.5)
+        max_kv_splits = 8
+        num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
+
+        # q represents the new token being generated, one per batch
+        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
+
+        # k_buffer and v_buffer represent all previous tokens
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+        v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+
+        # o will have the same shape as q
+        o = torch.zeros(B, H_Q, D, dtype=dtype, device="cuda")
+
+        b_seq_len = torch.full((B,), seq_len, device="cuda")
+
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len[:B], dim=0)
+        kv_indices = torch.arange(total_tokens, device="cuda")
+
+        attn_logits = torch.empty(
+            (B, H_Q, max_kv_splits, D),
+            dtype=torch.float32,
+            device="cuda",
+        )
+        attn_lse = torch.empty(
+            (B, H_Q, max_kv_splits),
+            dtype=torch.float32,
+            device="cuda",
+        )
+
+        decode_attention_fwd(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits,
+            attn_lse,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+        )
+
+    def test_decode_attention(self):
+        # Here we just to ensure there is no error
+        # TODO: correctnesss test
+
+        # Test configurations
+        configs = [
+            (2, 4, 4, 64),  # MHA
+            (2, 4, 2, 64),  # GQA
+            (2, 4, 4, 80),  # Non-standard head dim
+            (2, 4, 4, 13),  # Prime number head dim
+        ]
+
+        for B, H_Q, H_KV, D in configs:
+            self._test_decode_attention_once(B, H_Q, H_KV, D)
+
+    def _test_grouped_decode_attention_once(self, B, S, H_Q, H_KV, D, D_V):
+        dtype = torch.bfloat16
+        seq_len = S  # This represents the number of tokens already in the sequence
+        total_tokens = B * seq_len
+        sm_scale = 1.0 / (D**0.5)
+        max_kv_splits = 8
+        num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
+
+        # q represents the new token being generated, one per batch
+        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
+
+        # k_buffer and v_buffer represent all previous tokens
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+        v_buffer = torch.randn(total_tokens, H_KV, D_V, dtype=dtype, device="cuda")
+
+        # o will have the same shape as q
+        o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+        o_grouped = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+
+        b_seq_len = torch.full((B,), seq_len, device="cuda")
+
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len[:B], dim=0)
+        kv_indices = torch.arange(total_tokens, device="cuda")
+
+        attn_logits = torch.empty(
+            (B, H_Q, max_kv_splits, D_V),
+            dtype=torch.float32,
+            device="cuda",
+        )
+        attn_lse = torch.empty(
+            (B, H_Q, max_kv_splits),
+            dtype=torch.float32,
+            device="cuda",
+        )
+
+        decode_attention_fwd_normal(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits,
+            attn_lse,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+        )
+
+        attn_logits1 = torch.empty(
+            (B, H_Q, max_kv_splits, D_V),
+            dtype=torch.float32,
+            device="cuda",
+        )
+        attn_lse1 = torch.empty(
+            (B, H_Q, max_kv_splits, D_V),
+            dtype=torch.float32,
+            device="cuda",
+        )
+
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o_grouped,
+            kv_indptr,
+            kv_indices,
+            attn_logits1,
+            attn_lse1,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+        )
+
+        cos_sim = torch.nn.functional.cosine_similarity(
+            o.flatten(), o_grouped.flatten(), dim=0
+        )
+        print(cos_sim.item())
+        self.assertTrue(cos_sim.item() > 0.99)
+        self.assertTrue(torch.allclose(o, o_grouped, atol=3e-2))
+
+    def test_grouped_decode_attention(self):
+        seq_lens = [5, 100, 128, 500]
+        configs = [
+            (2, 16, 16, 64, 64),
+            (2, 16, 1, 64, 64),
+            (2, 64, 1, 13, 13),
+            (2, 128, 1, 80, 80),
+            (2, 128, 2, 512, 512),
+            (2, 128, 1, 576, 512),
+        ]
+
+        for S in seq_lens:
+            for B, H_Q, H_KV, D, D_V in configs:
+                self._test_grouped_decode_attention_once(B, S, H_Q, H_KV, D, D_V)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_triton_attention_rocm_mla.py b/test/srt/test_triton_attention_rocm_mla.py
new file mode 100644
index 000000000..f3074aeeb
--- /dev/null
+++ b/test/srt/test_triton_attention_rocm_mla.py
@@ -0,0 +1,259 @@
+import random
+import unittest
+
+import torch
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import (
+    decode_attention_fwd_grouped,
+)
+from sglang.srt.layers.attention.triton_ops.rocm_mla_decode_rope import (
+    decode_attention_fwd_grouped_rope,
+)
+from sglang.srt.layers.rotary_embedding import DeepseekScalingRotaryEmbedding
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestTritonAttentionMLA(CustomTestCase):
+
+    def _set_all_seeds(self, seed):
+        """Set all random seeds for reproducibility."""
+        random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+    def setUp(self):
+        # Set seeds before each test method
+        self._set_all_seeds(42)
+
+    def preprocess_kv_cache(self, kv_cache, kv_lora_rank):
+        latent_cache = kv_cache
+        v_input = latent_cache[..., :kv_lora_rank]
+        v_input = v_input.contiguous().unsqueeze(1)
+        k_input = latent_cache.unsqueeze(1)
+        k_input[..., :kv_lora_rank] = v_input
+
+        return k_input, v_input
+
+    def input_helper(
+        self,
+        B,
+        H,
+        S,
+        kv_lora_rank,
+        rotary_dim,
+        qk_rope_head_dim,
+        num_kv_splits,
+        dtype,
+        device,
+        rope_base=10,
+        rope_max_seq_len=16384,
+        rope_scaling=1.0,
+        is_neox_style=False,
+    ):
+        q = torch.randn(
+            B, H, kv_lora_rank + qk_rope_head_dim, device=device, dtype=dtype
+        )
+        kv_cache = torch.randn(
+            B * S, kv_lora_rank + qk_rope_head_dim, dtype=dtype, device=device
+        )
+        kv_indptr = torch.arange(B + 1, device=device) * S
+        kv_indices = torch.arange(B * S, device=device)
+        attn_logits = torch.empty(
+            B, H, num_kv_splits, kv_lora_rank + 1, dtype=dtype, device=device
+        )
+        rotary_emb = DeepseekScalingRotaryEmbedding(
+            qk_rope_head_dim,
+            rotary_dim,
+            rope_max_seq_len,
+            rope_base,
+            is_neox_style,
+            rope_scaling,
+            q.dtype,
+            device="cpu",
+        ).cuda()
+        positions = torch.tensor([S], device=device).unsqueeze(0).repeat(B, 1)
+
+        return kv_indptr, kv_indices, q, kv_cache, attn_logits, rotary_emb, positions
+
+    def ref_compute_full_fwd(
+        self,
+        q,
+        k_input,
+        v_input,
+        kv_lora_rank,
+        kv_indptr,
+        kv_indices,
+        num_kv_splits,
+        sm_scale,
+        logit_cap,
+        rotary_emb,
+        positions,
+        use_rope,
+        device="cuda",
+    ):
+
+        B, H = q.shape[0], q.shape[1]
+        S = kv_indptr[1].item()
+        qk_rope_head_dim = k_input.shape[-1] - kv_lora_rank
+
+        q_input = torch.empty(B, H, kv_lora_rank + qk_rope_head_dim, dtype=q.dtype).to(
+            device
+        )
+        q_nope_out, q_pe = q.split([kv_lora_rank, qk_rope_head_dim], dim=-1)
+        k_pe_t = k_input.view(B, 1, S, -1)[:, :, -1:, kv_lora_rank:]
+
+        if use_rope:
+            q_pe, k_pe_t = rotary_emb(positions, q_pe.unsqueeze(2), k_pe_t)
+            q_pe = q_pe.squeeze()
+
+        k_input.view(B, 1, S, -1)[:, :, -1:, kv_lora_rank:] = k_pe_t
+
+        q_input[..., :kv_lora_rank] = q_nope_out
+        q_input[..., kv_lora_rank:] = q_pe
+
+        B, H = q_input.shape[0], q_input.shape[1]
+        kv_lora_rank = v_input.shape[-1]
+        device = q_input.device
+
+        attn_logits = torch.empty(
+            B, H, num_kv_splits, kv_lora_rank + 1, dtype=q_input.dtype, device=device
+        )
+        o = torch.empty(B, H, kv_lora_rank, dtype=q_input.dtype, device=device)
+
+        decode_attention_fwd_grouped(
+            q_input,
+            k_input,
+            v_input,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits,
+            num_kv_splits,
+            sm_scale,
+            logit_cap,
+        )
+
+        return attn_logits, o, k_pe_t.squeeze()
+
+    def _test_rocm_fused_mla_kernel(
+        self,
+        B,
+        H,
+        S,
+        kv_lora_rank,
+        qk_rope_head_dim,
+        rotary_dim,
+        dtype,
+        use_rope,
+        is_neox_style,
+        num_kv_splits=2,
+        sm_scale=1.0,
+        logit_cap=0.0,
+        device="cuda",
+    ):
+        kv_indptr, kv_indices, q, kv_cache, attn_logits, rotary_emb, positions = (
+            self.input_helper(
+                B,
+                H,
+                S,
+                kv_lora_rank,
+                rotary_dim,
+                qk_rope_head_dim,
+                num_kv_splits,
+                dtype,
+                device=device,
+                is_neox_style=is_neox_style,
+            )
+        )
+
+        k_input, v_input = self.preprocess_kv_cache(kv_cache, kv_lora_rank)
+        k_pe_tokens = torch.empty(
+            B, qk_rope_head_dim, dtype=kv_cache.dtype, device=device
+        )
+        tri_o = torch.empty(B, H, kv_lora_rank, dtype=kv_cache.dtype, device=device)
+
+        decode_attention_fwd_grouped_rope(
+            q,
+            k_input,
+            v_input,
+            tri_o,
+            kv_indptr,
+            kv_indices,
+            k_pe_tokens if use_rope else None,
+            kv_lora_rank,
+            rotary_dim if use_rope else None,
+            rotary_emb.cos_sin_cache if use_rope else None,
+            positions if use_rope else None,
+            attn_logits,
+            num_kv_splits,
+            sm_scale,
+            logit_cap,
+            use_rope,
+            is_neox_style,
+        )
+
+        tri_logits = attn_logits
+
+        # reference
+        ref_logits, ref_o, ref_k_pe_tokens = self.ref_compute_full_fwd(
+            q,
+            k_input,
+            v_input,
+            kv_lora_rank,
+            kv_indptr,
+            kv_indices,
+            num_kv_splits,
+            sm_scale,
+            logit_cap,
+            rotary_emb,
+            positions,
+            use_rope,
+            device="cuda",
+        )
+
+        if use_rope:
+            torch.testing.assert_close(
+                ref_k_pe_tokens, k_pe_tokens.squeeze(), atol=1e-2, rtol=1e-2
+            )
+        torch.testing.assert_close(ref_logits, tri_logits, atol=1e-2, rtol=1e-2)
+        torch.testing.assert_close(ref_o, tri_o, atol=1e-2, rtol=1e-2)
+
+    def test_grouped_rocm_fused_mla(self):
+        configs = [
+            (1, 128, 2048, 512, 64, 64),
+            (1, 128, 2048, 512, 128, 64),
+            (1, 128, 2048, 512, 127, 64),
+            (1, 128, 2050, 512, 127, 64),
+            (1, 128, 2050, 512, 128, 64),
+            (8, 128, 2048, 512, 64, 64),
+            (8, 128, 2048, 512, 128, 64),
+            (8, 128, 2048, 512, 127, 64),
+            (8, 128, 2050, 512, 127, 64),
+            (8, 128, 2050, 512, 128, 64),
+        ]
+        dtypes = [torch.bfloat16, torch.float32]
+        use_rope_list = [True, False]
+        is_neox_style_list = [True, False]
+
+        for B, H, S, kv_lora_rank, qk_rope_head_dim, rotary_dim in configs:
+            for dtype in dtypes:
+                for use_rope in use_rope_list:
+                    for is_neox_style in is_neox_style_list:
+                        self._test_rocm_fused_mla_kernel(
+                            B,
+                            H,
+                            S,
+                            kv_lora_rank,
+                            qk_rope_head_dim,
+                            rotary_dim,
+                            dtype,
+                            use_rope,
+                            is_neox_style,
+                        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_triton_fused_moe.py b/test/srt/test_triton_fused_moe.py
new file mode 100644
index 000000000..88d33b5f7
--- /dev/null
+++ b/test/srt/test_triton_fused_moe.py
@@ -0,0 +1,162 @@
+import unittest
+
+import torch
+import torch.nn.functional as F
+from tqdm import tqdm
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
+    triton_kernel_moe_forward,
+)
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.test.test_utils import CustomTestCase
+
+
+class TestFusedMOE(CustomTestCase):
+    NUM_EXPERTS = [8, 64]
+    TOP_KS = [2, 4]
+
+    @staticmethod
+    def create_random_cuda_tensor(shape, dtype, mean=0, std=0.01):
+        """Create a random CUDA tensor
+
+        Args:
+            shape: Tensor shape
+            dtype: Data type
+            mean: Mean value
+            std: Standard deviation
+
+        Returns:
+            torch.Tensor: Randomly initialized CUDA tensor
+        """
+        return torch.empty(shape, dtype=dtype, device="cuda").normal_(mean, std)
+
+    def get_tolerance(self, dtype):
+        """Get tolerance values for different data types
+
+        Args:
+            dtype: Data type
+
+        Returns:
+            tuple: (relative tolerance, absolute tolerance)
+        """
+        if dtype == torch.float32:
+            return 1e-5, 1e-5
+        elif dtype in [torch.float16, torch.bfloat16]:
+            return 1e-5, 1e-5
+        else:
+            return 1e-2, 1e-2  # Default values for other types
+
+    def torch_naive_moe(
+        self,
+        a,
+        w1,
+        w2,
+        score,
+        topk,
+    ):
+        B, D = a.shape
+        a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+        out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+        score = torch.softmax(score, dim=-1, dtype=torch.float32)
+        topk_weight, topk_ids = torch.topk(score, topk)
+        topk_weight = topk_weight.view(-1)
+        topk_ids = topk_ids.view(-1)
+
+        if w1.dtype == torch.float8_e4m3fn:
+            w1_compute = w1.to(a.dtype)
+            w2_compute = w2.to(a.dtype)
+        else:
+            w1_compute = w1
+            w2_compute = w2
+
+        for i in range(w1_compute.shape[0]):
+            mask = topk_ids == i
+            if mask.sum():
+                out[mask] = SiluAndMul()(
+                    a[mask] @ w1_compute[i].transpose(0, 1)
+                ) @ w2_compute[i].transpose(0, 1)
+
+        return (
+            out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+        ).sum(dim=1)
+
+    def _test_case(self, m, n, k, e, topk, dtype):
+        rtol, atol = self.get_tolerance(dtype)
+
+        a = self.create_random_cuda_tensor((m, k), dtype)
+        w1 = self.create_random_cuda_tensor((e, 2 * n, k), dtype)
+        w2 = self.create_random_cuda_tensor((e, k, n), dtype)
+        w1_tri = w1.clone()
+        w2_tri = w2.clone()
+        w1_tri = w1_tri.transpose(-2, -1).contiguous()
+        w2_tri = w2_tri.transpose(-2, -1).contiguous()
+        score = self.create_random_cuda_tensor((m, e), dtype)
+
+        topk_op = TopK(
+            top_k=topk,
+            renormalize=False,
+            use_grouped_topk=False,
+        )
+        topk_op.use_triton_kernels = True
+        triton_topk_output = topk_op.forward_cuda(
+            hidden_states=a,
+            router_logits=score,
+        )
+
+        moe_runner_config = MoeRunnerConfig(
+            inplace=False,
+        )
+        triton_output = triton_kernel_moe_forward(
+            a, w1_tri, w2_tri, triton_topk_output, moe_runner_config
+        )
+        torch_output = self.torch_naive_moe(a, w1, w2, score, topk)
+        torch.testing.assert_close(triton_output, torch_output, rtol=rtol, atol=atol)
+
+    def test_various_configurations(self):
+        m_values = [1, 32, 64, 256]
+        n_values = [128, 1024]
+        k_values = [128, 512, 1024]
+        dtypes = [torch.bfloat16]
+
+        # Calculate total number of tests
+        total_tests = (
+            len(m_values)
+            * len(n_values)
+            * len(k_values)
+            * len(self.NUM_EXPERTS)
+            * len(self.TOP_KS)
+            * len(dtypes)
+        )
+
+        # Create progress bar
+        with tqdm(total=total_tests, desc="Running MoE tests") as pbar:
+            for m in m_values:
+                for n in n_values:
+                    for k in k_values:
+                        for e in self.NUM_EXPERTS:
+                            for topk in self.TOP_KS:
+                                for dtype in dtypes:
+                                    with self.subTest(
+                                        m=m,
+                                        n=n,
+                                        k=k,
+                                        e=e,
+                                        topk=topk,
+                                        dtype=dtype,
+                                    ):
+                                        self._test_case(
+                                            m,
+                                            n,
+                                            k,
+                                            e,
+                                            topk,
+                                            dtype,
+                                        )
+                                        torch.cuda.empty_cache()
+                                    pbar.update(1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_triton_moe_channel_fp8_kernel.py b/test/srt/test_triton_moe_channel_fp8_kernel.py
new file mode 100644
index 000000000..bbe44308f
--- /dev/null
+++ b/test/srt/test_triton_moe_channel_fp8_kernel.py
@@ -0,0 +1,181 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from sglang.test.test_utils import CustomTestCase
+
+
+def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
+    """Matrix multiplication function that supports per-token input quantization and per-column weight quantization"""
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+
+    assert A.shape[-1] == B.shape[-1], "Dimension mismatch"
+    assert B.ndim == 2 and B.is_contiguous(), "B must be a 2D contiguous tensor"
+
+    # Reshape input
+    M = A.numel() // A.shape[-1]
+    B = B.t()  # Transpose weight matrix
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (K,)
+    A = A.reshape(M, N)
+
+    # As is per-token [M, 1], Bs is per-column [1, K]
+    C = torch.matmul(A, B)  # [M, K]
+    C = As * C * Bs.view(1, -1)  # Broadcast per-column scale
+
+    return C.reshape(origin_C_shape).to(output_dtype)
+
+
+def fp8_mask(a, mask):
+    dtype = a.dtype
+    return a.view(torch.int8)[mask].view(dtype)
+
+
+def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
+    """This function performs fused moe with per-column int8 quantization using native torch."""
+
+    B, D = a.shape
+    # Perform per-token quantization
+    a_q, a_s = scaled_fp8_quant(a, use_per_token_if_dynamic=True)
+    # Repeat tokens to match topk
+    a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    # Also repeat the scale
+    a_s = a_s.view(B, -1, 1).repeat(1, topk, 1).reshape(-1, 1)  # [B*topk, 1]
+
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+
+    # Calculate routing
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    # Process each expert
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            # First MLP layer: note that a_s is now per-token
+            inter_out = native_w8a8_per_token_matmul(
+                fp8_mask(a_q, mask),
+                w1[i],
+                fp8_mask(a_s, mask),
+                w1_s[i],
+                output_dtype=a.dtype,
+            )
+            # Activation function
+            act_out = SiluAndMul().forward_native(inter_out)
+            # Quantize activation output with per-token
+            act_out_q, act_out_s = scaled_fp8_quant(
+                act_out, use_per_token_if_dynamic=True
+            )
+
+            # Second MLP layer
+            out[mask] = native_w8a8_per_token_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], output_dtype=a.dtype
+            )
+    # Apply routing weights and sum
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+class TestW8A8FP8FusedMoE(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    M = [1, 33]
+    N = [128, 1024]
+    K = [256, 4096]
+    E = [8]
+    TOP_KS = [2, 6]
+    BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_fp8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        # Initialize int8 quantization parameters
+        factor_for_scale = 1e-2
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        fp8_max = finfo.max
+        fp8_min = finfo.min
+
+        # Input tensor
+        # M * K
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        # Generate int8 weights
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2
+        w1 = (w1_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2
+        w2 = (w2_fp32 * fp8_max).clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        # Generate scale for each column (per-column quantization)
+        w1_s = torch.rand(E, 2 * N, device=w1_fp32.device) * factor_for_scale
+        w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale
+        score = torch.randn((M, E), dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk)
+            topk_output = select_experts(
+                hidden_states=a,
+                router_logits=score,
+                topk_config=TopKConfig(top_k=topk, renormalize=False),
+            )
+            out = fused_moe(
+                a,
+                w1,
+                w2,
+                topk_output,
+                use_fp8_w8a8=True,  # using fp8
+                use_int8_w8a16=False,
+                use_int8_w8a8=False,
+                per_channel_quant=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=None,  # Not using block quantization
+            )
+
+        # Check results
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.05
+        )
+
+    def test_w8a8_fp8_fused_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+                block_size=params[5],
+                dtype=params[6],
+                seed=params[7],
+            ):
+                self._w8a8_fp8_fused_moe(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/srt/test_triton_moe_wna16.py b/test/srt/test_triton_moe_wna16.py
new file mode 100644
index 000000000..b447b532f
--- /dev/null
+++ b/test/srt/test_triton_moe_wna16.py
@@ -0,0 +1,243 @@
+from typing import Optional
+
+import pytest
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.topk import TopKConfig, select_experts
+
+NUM_EXPERTS = [8, 64]
+TOP_KS = [2, 6]
+
+
+def quantize_weights(
+    w: torch.Tensor,
+    quant_type: str,
+    group_size: Optional[int],
+    zero_points: bool = False,
+    ref_zero_points_after_scales: bool = False,
+):
+    assert quant_type in ["w4a16", "w4a16b8", "w8a16", "w8a16b128"]
+    assert not zero_points or group_size is not None, (
+        "to have group zero points, group_size must be provided "
+        "(-1 group_size is channelwise)"
+    )
+
+    orig_device = w.device
+    orig_type = w.dtype
+    size_k, size_n = w.shape
+
+    assert w.is_floating_point(), "w must be float"
+
+    if group_size == -1:
+        group_size = size_k
+
+    # Reshape to [groupsize, -1]
+    if group_size is not None and group_size < size_k:
+        w = w.reshape((-1, group_size, size_n))
+        w = w.permute(1, 0, 2)
+        w = w.reshape((group_size, -1))
+
+    # Compute scale for each group
+    max_val = torch.max(w, 0, keepdim=True).values
+    min_val = torch.min(w, 0, keepdim=True).values
+
+    if quant_type == "w4a16":
+        max_q_val = 15
+        min_q_val = 0
+    elif quant_type == "w4a16b8":
+        max_q_val = 7
+        min_q_val = -1
+    elif quant_type == "w8a16":
+        max_q_val = 255
+        min_q_val = 0
+    elif quant_type == "w8a16b128":
+        max_q_val = 127
+        min_q_val = -128
+
+    w_s = torch.Tensor([1.0]).to(w.device)  # unscaled case
+    maybe_w_zp = None
+    if group_size is not None:
+        if zero_points:
+            w_s = (max_val - min_val).clamp(min=1e-5) / max_q_val
+            maybe_w_zp = (
+                torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int()
+            )
+        else:
+            # If the bias is such that there are no possible negative/positive
+            #  values, set the max value to inf to avoid divide by 0
+            w_s = torch.max(
+                abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)),
+                abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)),
+            )
+
+    # Quantize
+    w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0)
+    w_q = torch.clamp(w_q, min_q_val, max_q_val)
+
+    # Compute ref (dequantized)
+    # For some kernels (namely Machete) the zero-points are applied after the
+    # scales are applied, for this case computing the reference in similar way
+    # allows us to use tighter error tolerances in our unit tests.
+    if ref_zero_points_after_scales and maybe_w_zp is not None:
+        w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s
+    else:
+        w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s
+
+    if quant_type == "w4a16b8":
+        w_q += 8
+    elif quant_type == "w8a16b128":
+        w_q += 128
+
+    # Restore original shapes
+    if group_size is not None and group_size < size_k:
+
+        def reshape_w(w):
+            w = w.reshape((group_size, -1, size_n))
+            w = w.permute(1, 0, 2)
+            w = w.reshape((size_k, size_n)).contiguous()
+            return w
+
+        w_q = reshape_w(w_q)
+        w_ref = reshape_w(w_ref)
+        w_s = w_s.reshape((-1, size_n)).contiguous()
+
+    if maybe_w_zp is not None:
+        maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous()
+        maybe_w_zp = maybe_w_zp.to(device=orig_device)
+
+    return (
+        w_ref.to(device=orig_device),
+        w_q.to(device=orig_device),
+        w_s if group_size is not None else None,
+        maybe_w_zp,
+    )
+
+
+def torch_moe(a, w1, w2, score, topk):
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(
+                0, 1
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+# fork from https://github.com/vllm-project/vllm/blob/main/tests/kernels/test_moe.py
+@pytest.mark.parametrize("m", [1, 32, 222])
+@pytest.mark.parametrize("n", [128, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 1024])
+@pytest.mark.parametrize("e", NUM_EXPERTS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("group_size", [64, 128])
+@pytest.mark.parametrize("has_zp", [True, False])
+@pytest.mark.parametrize("weight_bits", [8])  # [4, 8])
+def test_fused_moe_wn16(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    group_size: int,
+    has_zp: bool,
+    weight_bits: int,
+):
+    print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits)
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+
+    if weight_bits == 4:
+        pack_factor = 2
+        quant_type = "w4a16" if has_zp else "w4a16b8"
+    elif weight_bits == 8:
+        pack_factor = 1
+        quant_type = "w8a16" if has_zp else "w8a16b128"
+
+    w1_ref = w1.clone()
+    w2_ref = w2.clone()
+    w1_qweight = torch.empty(
+        (e, 2 * n, k // pack_factor), device="cuda", dtype=torch.uint8
+    )
+    w2_qweight = torch.empty((e, k, n // pack_factor), device="cuda", dtype=torch.uint8)
+    w1_scales = torch.empty((e, 2 * n, k // group_size), device="cuda", dtype=dtype)
+    w2_scales = torch.empty((e, k, n // group_size), device="cuda", dtype=dtype)
+    w1_qzeros = torch.empty(
+        (e, 2 * n // pack_factor, k // group_size), device="cuda", dtype=torch.uint8
+    )
+    w2_qzeros = torch.empty(
+        (e, k // pack_factor, n // group_size), device="cuda", dtype=torch.uint8
+    )
+
+    for i in range(e * 2):
+        expert_id = i % e
+        if i // e == 0:
+            w, w_ref, w_qweight, w_scales, w_qzeros = (
+                w1,
+                w1_ref,
+                w1_qweight,
+                w1_scales,
+                w1_qzeros,
+            )
+        else:
+            w, w_ref, w_qweight, w_scales, w_qzeros = (
+                w2,
+                w2_ref,
+                w2_qweight,
+                w2_scales,
+                w2_qzeros,
+            )
+        weight, qweight, scales, qzeros = quantize_weights(
+            w[expert_id].T, quant_type, group_size, has_zp, False
+        )
+        weight = weight.T
+        qweight = qweight.T.contiguous().to(torch.uint8)
+        scales = scales.T
+        if has_zp:
+            qzeros = qzeros.T.contiguous().to(torch.uint8)
+        if weight_bits == 4:
+            qweight = qweight[:, 1::2] * 16 + qweight[:, ::2]
+            if has_zp:
+                qzeros = qzeros[1::2, :] * 16 + qzeros[::2, :]
+
+        w_ref[expert_id] = weight
+        w_qweight[expert_id] = qweight
+        w_scales[expert_id] = scales
+        if has_zp:
+            w_qzeros[expert_id] = qzeros
+
+    topk_output = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        topk_config=TopKConfig(top_k=topk),
+    )
+
+    triton_output = fused_moe(
+        a,
+        w1_qweight,
+        w2_qweight,
+        topk_output,
+        use_int4_w4a16=weight_bits == 4,
+        use_int8_w8a16=weight_bits == 8,
+        w1_scale=w1_scales,
+        w2_scale=w2_scales,
+        w1_zp=w1_qzeros if has_zp else None,
+        w2_zp=w2_qzeros if has_zp else None,
+        block_shape=[0, group_size],
+    )
+    torch_output = torch_moe(a, w1_ref, w2_ref, score, topk)
+    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
diff --git a/test/srt/test_triton_sliding_window.py b/test/srt/test_triton_sliding_window.py
new file mode 100644
index 000000000..f2b422c5e
--- /dev/null
+++ b/test/srt/test_triton_sliding_window.py
@@ -0,0 +1,129 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+)
+
+
+class TestSlidingWindowAttentionTriton(CustomTestCase):
+    """Test sliding window attention functionality with triton backend."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Set up the test server with Gemma3 model and triton backend."""
+        # Gemma3 model supports sliding window attention
+        cls.model = "google/gemma-3-4b-it"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+        cls.common_args = [
+            "--trust-remote-code",
+            "--attention-backend",
+            "triton",
+            "--context-length",
+            "8192",
+            "--random-seed",
+            "42",
+        ]
+
+        cls.short_context_prompt = "The capital of France is"
+
+        # Test prompt longer than window size
+        cls.long_context_prompt = (
+            """
+        Once upon a time, there was a mountain. In the mountain, there was a temple. In the temple, there was an old monk telling a story. The story was:
+        """
+            * 100
+        )
+        cls.long_context_prompt += "\nNow, summarize the story in one sentence:"
+
+    def _test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=200,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        print(f"MMLU metrics with sliding window: {metrics}")
+
+        self.assertGreaterEqual(metrics["score"], 0.60)
+
+    def _test_short_context_generation(self):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": self.short_context_prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 256,
+                },
+            },
+        )
+
+        self.assertEqual(response.status_code, 200)
+        result = response.json()
+        self.assertIn("paris", result["text"].lower())
+        print(f"Short context generation result: {result['text']}")
+
+    def _test_long_context_generation(self):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": self.long_context_prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 256,
+                },
+            },
+        )
+
+        self.assertEqual(response.status_code, 200)
+        result = response.json()
+        self.assertGreater(len(result["text"].strip()), 0)
+        print(f"Long context generation result: {result['text'][:100]}...")
+
+    @unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
+    def test_no_cuda_graph(self):
+        self.no_cuda_graph_process = popen_launch_server(
+            self.model,
+            self.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=self.common_args + ["--disable-cuda-graph"],
+        )
+
+        try:
+            self._test_short_context_generation()
+            self._test_long_context_generation()
+            self._test_mmlu()
+        finally:
+            kill_process_tree(self.no_cuda_graph_process.pid)
+
+    def test_cuda_graph(self):
+        self.cuda_graph_process = popen_launch_server(
+            self.model,
+            self.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=self.common_args,
+        )
+
+        try:
+            self._test_short_context_generation()
+            self._test_long_context_generation()
+            self._test_mmlu()
+        finally:
+            kill_process_tree(self.cuda_graph_process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_two_batch_overlap.py b/test/srt/test_two_batch_overlap.py
new file mode 100644
index 000000000..6aa550c46
--- /dev/null
+++ b/test/srt/test_two_batch_overlap.py
@@ -0,0 +1,152 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.two_batch_overlap import (
+    compute_split_seq_index,
+    compute_split_token_index,
+)
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
+    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestTwoBatchOverlap(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--dp",
+                "2",
+                "--enable-dp-attention",
+                "--moe-a2a-backend",
+                "deepep",
+                "--deepep-mode",
+                "normal",
+                "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
+                "--enable-two-batch-overlap",
+            ],
+            env={"SGL_ENABLE_JIT_DEEPGEMM": "0", **os.environ},
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_generate_single_prompt(self):
+        response = requests.post(
+            self.base_url + "/generate",
+            # we use an uncommon start to minimise the chance that the cache is hit by chance
+            json={
+                "text": "_ 1+1=2, 1+2=3, 1+3=4, 1+4=",
+                "sampling_params": {"temperature": 0, "max_new_tokens": 8},
+            },
+        )
+        print(f"{response.json()=}")
+        self.assertEqual(response.json()["text"], "5, 1+5=6")
+
+    def test_mmlu(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreater(metrics["score"], 0.5)
+
+
+class TestTwoBatchOverlapUnitTest(unittest.TestCase):
+    def test_compute_split_seq_and_token_index(self):
+        for num_tokens, expect in [
+            (0, 0),
+            (100, 50),
+            (99, 49),
+        ]:
+            actual = compute_split_seq_index(
+                forward_mode=ForwardMode.DECODE,
+                num_tokens=num_tokens,
+                extend_lens=None,
+                token_num_per_seq=1,
+            )
+            self.assertEqual(actual, expect)
+
+        for extend_lens, expect in [
+            ([], (0, 0)),
+            ([42], (0, 21)),
+            ([42, 999], (1, 520)),
+            ([999, 42], (0, 520)),
+            ([498, 502], (1, 498)),
+            ([4096, 4096, 4096, 4096], (2, 8192)),
+            ([4095, 4096, 4096, 4096, 1], (2, 8191)),
+            ([1, 4095, 4096, 4096, 4096], (3, 8192)),
+            ([4097, 4096, 4096, 4095, 1], (2, 8193)),
+            ([1, 1, 1, 1, 99999], (4, 50001)),
+            ([99999, 1, 1, 1, 1], (0, 50001)),
+        ]:
+            actual_seq_idx = compute_split_seq_index(
+                forward_mode=ForwardMode.EXTEND,
+                num_tokens=None,
+                extend_lens=extend_lens,
+                token_num_per_seq=None,
+            )
+            actual_token_idx = compute_split_token_index(
+                split_seq_index=actual_seq_idx,
+                forward_mode=ForwardMode.EXTEND,
+                extend_seq_lens=extend_lens,
+                token_num_per_seq=None,
+            )
+            actual = (actual_seq_idx, actual_token_idx)
+            print(f"{extend_lens=} {expect=} {actual=}")
+            self.assertEqual(actual, expect)
+
+
+class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-1234"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--tp",
+                "2",
+                "--dp",
+                "2",
+                "--enable-dp-attention",
+                "--moe-a2a-backend",
+                "deepep",
+                "--deepep-mode",
+                "normal",
+                "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
+                "--enable-two-batch-overlap",
+            ],
+            env={"SGL_ENABLE_JIT_DEEPGEMM": "0", **os.environ},
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_utils_update_weights.py b/test/srt/test_utils_update_weights.py
new file mode 100644
index 000000000..8c138f0ab
--- /dev/null
+++ b/test/srt/test_utils_update_weights.py
@@ -0,0 +1,170 @@
+import asyncio
+import os
+import unittest
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import init_device_mesh
+from transformers import AutoModelForCausalLM
+
+from sglang.srt.entrypoints.engine import Engine
+from sglang.srt.weight_sync.utils import update_weights
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+
+
+class AsyncEngine(Engine):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    async def update_weights_from_tensor(self, update_weights_request):
+        return await self.tokenizer_manager.update_weights_from_tensor(
+            update_weights_request, None
+        )
+
+
+def is_distributed_available():
+    """Check if distributed training environment is available"""
+    required_vars = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT"]
+    return all(var in os.environ for var in required_vars)
+
+
+def setup_single_process_distributed():
+    """Setup distributed environment for single process testing"""
+    if not is_distributed_available():
+        os.environ["RANK"] = "0"
+        os.environ["WORLD_SIZE"] = "1"
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12356"
+        os.environ["LOCAL_RANK"] = "0"
+
+
+class TestUtilsUpdateWeights(unittest.TestCase):
+    """Test class for utils.update_weights function"""
+
+    @classmethod
+    def setUpClass(cls):
+        """Setup distributed environment and test fixtures for the entire test class"""
+        cls.setup_distributed()
+        cls.setup_test_engine()
+        cls.setup_test_model()
+        cls.setup_device_mesh()
+
+    @classmethod
+    def tearDownClass(cls):
+        """Cleanup after all tests"""
+        if hasattr(cls, "engine") and cls.engine:
+            cls.engine.shutdown()
+
+        # Cleanup distributed
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+    @classmethod
+    def setup_distributed(cls):
+        """Setup distributed environment for testing"""
+        setup_single_process_distributed()
+
+        if not dist.is_initialized():
+            try:
+                dist.init_process_group(
+                    backend="nccl" if torch.cuda.is_available() else "gloo"
+                )
+            except Exception as e:
+                raise unittest.SkipTest(
+                    f"Could not initialize distributed backend: {e}"
+                )
+
+        cls.rank = dist.get_rank()
+        cls.world_size = dist.get_world_size()
+
+        if torch.cuda.is_available():
+            torch.cuda.set_device(cls.rank % torch.cuda.device_count())
+
+        # Set up environment variables
+        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+        os.environ["NCCL_CUMEM_ENABLE"] = "0"
+        os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
+        os.environ["CUDA_MODULE_LOADING"] = "AUTO"
+
+    @classmethod
+    def setup_test_engine(cls):
+        """Setup test engine"""
+        if cls.rank == 0:
+            cls.engine = AsyncEngine(
+                model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                dtype="bfloat16",
+                mem_fraction_static=0.3,
+                enable_memory_saver=True,
+                tp_size=cls.world_size,
+                disable_cuda_graph=False,
+            )
+        else:
+            cls.engine = None
+
+    @classmethod
+    def setup_test_model(cls):
+        """Load test model"""
+        try:
+            cls.model = AutoModelForCausalLM.from_pretrained(
+                DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+                device_map="cpu",
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+                torch_dtype=(
+                    torch.float16 if torch.cuda.is_available() else torch.float32
+                ),
+            )
+        except Exception as e:
+            raise unittest.SkipTest(f"Could not load test model: {e}")
+
+    @classmethod
+    def setup_device_mesh(cls):
+        """Create device mesh for testing"""
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA not available for device mesh")
+
+        cls.device_mesh_key = "tp"
+        cls.mesh = init_device_mesh(
+            "cuda", (cls.world_size,), mesh_dim_names=(cls.device_mesh_key,)
+        )
+
+    def create_test_params_batch(self, model, num_params=64):
+        """Create a batch of test parameters from the model"""
+        param_names = []
+        test_tensors = []
+
+        # Get first few parameters from the model for testing
+        for i, (name, tensor) in enumerate(model.named_parameters()):
+            if i >= num_params:
+                break
+            param_names.append(name)
+            # Create test tensor with known values, matching original shape and dtype
+            test_tensor = torch.full_like(tensor, 1.5, dtype=tensor.dtype).cuda()
+            test_tensors.append(test_tensor)
+
+        return list(zip(param_names, test_tensors))
+
+    def test_utils_update_weights(self):
+        """Test basic functionality of utils.update_weights"""
+
+        async def async_test():
+            # Create test parameters batch
+            params_batch = self.create_test_params_batch(self.model, num_params=2)
+
+            # Test the utils.update_weights function
+            result = await update_weights(
+                engine=self.engine,
+                params_batch=params_batch,
+                device_mesh_key=self.device_mesh_key,
+                device_mesh=self.mesh,
+                load_format=None,
+            )
+
+            self.assertIn("Success", result)
+
+        # Run the async test
+        asyncio.run(async_test())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_vertex_endpoint.py b/test/srt/test_vertex_endpoint.py
new file mode 100644
index 000000000..42e48cb1b
--- /dev/null
+++ b/test/srt/test_vertex_endpoint.py
@@ -0,0 +1,64 @@
+"""
+python3 -m unittest test_vertex_endpoint.TestVertexEndpoint.test_vertex_generate
+"""
+
+import unittest
+from http import HTTPStatus
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestVertexEndpoint(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--cuda-graph-max-bs", 2],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def run_generate(self, parameters):
+        data = {
+            "instances": [
+                {"text": "The capital of France is"},
+                {"text": "The capital of China is"},
+            ],
+            "parameters": parameters,
+        }
+        response = requests.post(self.base_url + "/vertex_generate", json=data)
+        response_json = response.json()
+        assert len(response_json["predictions"]) == len(data["instances"])
+        return response_json
+
+    def test_vertex_generate(self):
+        for parameters in [None, {"sampling_params": {"max_new_tokens": 4}}]:
+            self.run_generate(parameters)
+
+    def test_vertex_generate_fail(self):
+        data = {
+            "instances": [
+                {"prompt": "The capital of France is"},
+            ],
+        }
+        response = requests.post(self.base_url + "/vertex_generate", json=data)
+        assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_vision_chunked_prefill.py b/test/srt/test_vision_chunked_prefill.py
new file mode 100644
index 000000000..3876e915b
--- /dev/null
+++ b/test/srt/test_vision_chunked_prefill.py
@@ -0,0 +1,188 @@
+"""
+Usage:
+python3 -m unittest test_vision_chunked_prefill.TestVisionChunkedPrefill.test_chunked_prefill
+"""
+
+import base64
+import io
+import os
+import unittest
+from concurrent.futures import ThreadPoolExecutor
+from typing import Union
+
+import numpy as np
+import requests
+from PIL import Image
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestVisionChunkedPrefill(CustomTestCase):
+
+    def prepare_video_messages(self, video_path, max_frames_num=8):
+        # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
+        # The following import order will cause Segmentation fault.
+        # import decord
+        # from transformers import AutoTokenizer
+        from decord import VideoReader, cpu
+
+        vr = VideoReader(video_path, ctx=cpu(0))
+        total_frame_num = len(vr)
+        uniform_sampled_frames = np.linspace(
+            0, total_frame_num - 1, max_frames_num, dtype=int
+        )
+        frame_idx = uniform_sampled_frames.tolist()
+        frames = vr.get_batch(frame_idx).asnumpy()
+
+        base64_frames = []
+        for frame in frames:
+            pil_img = Image.fromarray(frame)
+            buff = io.BytesIO()
+            pil_img.save(buff, format="JPEG")
+            base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
+            base64_frames.append(base64_str)
+
+        messages = [{"role": "user", "content": []}]
+        frame_format = {
+            "type": "image_url",
+            "image_url": {"url": "data:image/jpeg;base64,{}"},
+            "modalities": "video",
+        }
+
+        for base64_frame in base64_frames:
+            frame_format["image_url"]["url"] = "data:image/jpeg;base64,{}".format(
+                base64_frame
+            )
+            messages[0]["content"].append(frame_format.copy())
+
+        prompt = {"type": "text", "text": "Please describe the video briefly."}
+        messages[0]["content"].append(prompt)
+
+        return messages
+
+    def get_prompt_from_messages(self, messages):
+        text = (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            "<|im_start|>user\n"
+        )
+        image_data = []
+        for content in messages[0]["content"]:
+            if content["type"] == "image_url":
+                text += "<image>\n"
+                image_data.append(content["image_url"]["url"])
+        text += "Please describe the video briefly.<|im_end|>\n<|im_start|>assistant\n"
+        return text, image_data
+
+    def generate(self, text, image_data):
+        response = requests.post(
+            self.base_url + "/generate",
+            json={
+                "text": text,
+                "image_data": image_data,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 32,
+                    "no_stop_trim": True,
+                    "skip_special_tokens": False,
+                },
+                "modalities": ["multi-images"],
+            },
+        ).json()
+        return response["text"]
+
+    def generate_for_video(self, batch, num_frame) -> Union[str, list[str]]:
+        # prepare the video input about Steven introducing ipod nano
+        url = "https://raw.githubusercontent.com/evolvinglmms-lab/sglang/dev/onevision_local/assets/jobs.mp4"
+        cache_dir = os.path.expanduser("~/.cache")
+        file_path = os.path.join(cache_dir, "jobs.mp4")
+        os.makedirs(cache_dir, exist_ok=True)
+        if not os.path.exists(file_path):
+            response = requests.get(url)
+            response.raise_for_status()
+            with open(file_path, "wb") as f:
+                f.write(response.content)
+
+        if not batch:
+            assert isinstance(num_frame, int)
+            messages = self.prepare_video_messages(file_path, max_frames_num=num_frame)
+            text, image_data = self.get_prompt_from_messages(messages)
+            return self.generate(text, image_data)
+        else:
+            assert isinstance(num_frame, list)
+            func_args = []
+            for max_frames_num in num_frame:
+                messages = self.prepare_video_messages(
+                    file_path,
+                    max_frames_num=max_frames_num,
+                )
+                text, image_data = self.get_prompt_from_messages(messages)
+                func_args.append((text, image_data))
+
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                responses = list(executor.map(lambda p: self.generate(*p), func_args))
+
+            return responses
+
+    def launch_server(self, chunked_prefill_size) -> int:
+        # launch server
+        model = "lmms-lab/llava-onevision-qwen2-7b-ov"
+        # model = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+        self.base_url = DEFAULT_URL_FOR_TEST
+        process = popen_launch_server(
+            model,
+            self.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--chunked-prefill-size",
+                f"{chunked_prefill_size}",
+            ],
+        )
+        return process.pid
+
+    def _test_chunked_prefill(self, batches, num_frames):
+        # Chunked
+        chunked_server_pid = self.launch_server(chunked_prefill_size=1024)
+        try:
+            outputs_chunked = []
+            for batch, num_frame in zip(batches, num_frames):
+                output_chunked = self.generate_for_video(
+                    batch=batch, num_frame=num_frame
+                )
+                outputs_chunked += [output_chunked]
+        finally:
+            kill_process_tree(chunked_server_pid)
+
+        # None-chunked
+        try:
+            no_chunked_server_pid = self.launch_server(chunked_prefill_size=-1)
+            outputs_no_chunked = []
+            for batch, num_frame in zip(batches, num_frames):
+                output_no_chunked = self.generate_for_video(
+                    batch=batch, num_frame=num_frame
+                )
+                outputs_no_chunked += [output_no_chunked]
+
+        finally:
+            kill_process_tree(no_chunked_server_pid)
+
+        for output_chunked, output_no_chunked in zip(
+            outputs_chunked, outputs_no_chunked
+        ):
+            print("output with chunked prefill:")
+            print(output_chunked)
+            print("output without chunked prefill:")
+            print(output_no_chunked)
+            self.assertEqual(output_chunked, output_no_chunked)
+
+    def test_chunked_prefill(self):
+        self._test_chunked_prefill(batches=[False, True], num_frames=[1, [2, 6, 8, 10]])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py
new file mode 100644
index 000000000..e8e0d62e9
--- /dev/null
+++ b/test/srt/test_vision_openai_server_a.py
@@ -0,0 +1,342 @@
+"""
+Usage:
+python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_mixed_batch
+python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_multi_images_chat_completion
+"""
+
+import unittest
+
+from test_vision_openai_server_common import *
+
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    popen_launch_server,
+)
+
+
+class TestLlava(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+        )
+        cls.base_url += "/v1"
+
+
+class TestQwen2VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--mem-fraction-static",
+                "0.35",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestQwen2_5_VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--mem-fraction-static",
+                "0.35",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestVLMContextLengthIssue(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--context-length",
+                "300",
+                "--mem-fraction-static=0.75",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_single_image_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        with self.assertRaises(openai.BadRequestError) as cm:
+            client.chat.completions.create(
+                model="default",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                            },
+                            {
+                                "type": "text",
+                                "text": "Give a lengthy description of this picture",
+                            },
+                        ],
+                    },
+                ],
+                temperature=0,
+            )
+
+        # context length is checked first, then max_req_input_len, which is calculated from the former
+        assert (
+            "Multimodal prompt is too long after expanding multimodal tokens."
+            in str(cm.exception)
+            or "is longer than the model's context length" in str(cm.exception)
+        )
+
+
+# Note(Xinyuan): mllama is not stable for now, skip for CI
+# class TestMllamaServer(TestOpenAIVisionServer):
+#     @classmethod
+#     def setUpClass(cls):
+#         cls.model = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-123456"
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             api_key=cls.api_key,
+#         )
+#         cls.base_url += "/v1"
+
+
+class TestMinicpmvServer(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "openbmb/MiniCPM-V-2_6"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.35",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestMinicpmv4Server(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "openbmb/MiniCPM-V-4"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.35",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestInternVL2_5Server(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "OpenGVLab/InternVL2_5-2B"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestMinicpmo2_6Server(ImageOpenAITestMixin, AudioOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "openbmb/MiniCPM-o-2_6"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.65",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestMimoVLServer(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "XiaomiMiMo/MiMo-VL-7B-RL"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.6",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestVILAServer(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.revision = "6bde1de5964b40e61c802b375fff419edc867506"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length=65536",
+                f"--revision={cls.revision}",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestPhi4MMServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        # Manually download LoRA adapter_config.json as it's not downloaded by the model loader by default.
+        from huggingface_hub import constants, snapshot_download
+
+        snapshot_download(
+            "microsoft/Phi-4-multimodal-instruct",
+            allow_patterns=["**/adapter_config.json"],
+        )
+
+        cls.model = "microsoft/Phi-4-multimodal-instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+
+        revision = "33e62acdd07cd7d6635badd529aa0a3467bb9c6a"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.70",
+                "--disable-radix-cache",
+                "--max-loras-per-batch",
+                "2",
+                "--revision",
+                revision,
+                "--lora-paths",
+                f"vision={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/vision-lora",
+                f"speech={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/speech-lora",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    def get_vision_request_kwargs(self):
+        return {
+            "extra_body": {
+                "lora_path": "vision",
+                "top_k": 1,
+                "top_p": 1.0,
+            }
+        }
+
+    def get_audio_request_kwargs(self):
+        return {
+            "extra_body": {
+                "lora_path": "speech",
+                "top_k": 1,
+                "top_p": 1.0,
+            }
+        }
+
+    # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
+    def test_audio_ambient_completion(self):
+        pass
+
+
+if __name__ == "__main__":
+    del (
+        TestOpenAIOmniServerBase,
+        ImageOpenAITestMixin,
+        VideoOpenAITestMixin,
+        AudioOpenAITestMixin,
+    )
+    unittest.main()
diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py
new file mode 100644
index 000000000..6c2fa86d5
--- /dev/null
+++ b/test/srt/test_vision_openai_server_b.py
@@ -0,0 +1,250 @@
+import unittest
+
+from test_vision_openai_server_common import *
+
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    popen_launch_server,
+)
+
+
+class TestPixtralServer(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "mistral-community/pixtral-12b"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.70",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestMistral3_1Server(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.75",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestDeepseekVL2Server(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/deepseek-vl2-small"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length",
+                "4096",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestJanusProServer(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/Janus-Pro-7B"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.35",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    def test_video_images_chat_completion(self):
+        pass
+
+
+## Skip for ci test
+# class TestLlama4Server(TestOpenAIVisionServer):
+#     @classmethod
+#     def setUpClass(cls):
+#         cls.model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-123456"
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             other_args=[
+#                 "--chat-template",
+#                 "llama-4",
+#                 "--mem-fraction-static",
+#                 "0.8",
+#                 "--tp-size=8",
+#                 "--context-length=8192",
+#                 "--mm-attention-backend",
+#                 "fa3",
+#                 "--cuda-graph-max-bs",
+#                 "4",
+#             ],
+#         )
+#         cls.base_url += "/v1"
+
+
+class TestGemma3itServer(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "google/gemma-3-4b-it"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.70",
+                "--enable-multimodal",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestGemma3nServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "google/gemma-3n-E4B-it"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.70",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
+    def test_audio_ambient_completion(self):
+        pass
+
+    def _test_mixed_image_audio_chat_completion(self):
+        self._test_mixed_image_audio_chat_completion()
+
+
+class TestQwen2AudioServer(AudioOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2-Audio-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.70",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+class TestKimiVLServer(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length",
+                "4096",
+                "--dtype",
+                "bfloat16",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    def test_video_images_chat_completion(self):
+        pass
+
+
+class TestGLM41VServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "zai-org/GLM-4.1V-9B-Thinking"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.68",
+                "--cuda-graph-max-bs",
+                "4",
+                "--reasoning-parser",
+                "glm45",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
+if __name__ == "__main__":
+    del (
+        TestOpenAIOmniServerBase,
+        ImageOpenAITestMixin,
+        VideoOpenAITestMixin,
+        AudioOpenAITestMixin,
+    )
+    unittest.main()
diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py
new file mode 100644
index 000000000..792636060
--- /dev/null
+++ b/test/srt/test_vision_openai_server_common.py
@@ -0,0 +1,527 @@
+import base64
+import io
+import os
+
+import numpy as np
+import openai
+import requests
+from PIL import Image
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, CustomTestCase
+
+# image
+IMAGE_MAN_IRONING_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/man_ironing_on_back_of_suv.png"
+IMAGE_SGL_LOGO_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/sgl_logo.png"
+
+# video
+VIDEO_JOBS_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/videos/jobs_presenting_ipod.mp4"
+
+# audio
+AUDIO_TRUMP_SPEECH_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/Trump_WEF_2018_10s.mp3"
+AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3"
+
+
+class TestOpenAIOmniServerBase(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = ""
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = None
+        cls.base_url += "/v1"
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def get_vision_request_kwargs(self):
+        return self.get_request_kwargs()
+
+    def get_request_kwargs(self):
+        return {}
+
+    def get_or_download_file(self, url: str) -> str:
+        cache_dir = os.path.expanduser("~/.cache")
+        if url is None:
+            raise ValueError()
+        file_name = url.split("/")[-1]
+        file_path = os.path.join(cache_dir, file_name)
+        os.makedirs(cache_dir, exist_ok=True)
+
+        if not os.path.exists(file_path):
+            response = requests.get(url)
+            response.raise_for_status()
+
+            with open(file_path, "wb") as f:
+                f.write(response.content)
+        return file_path
+
+
+class AudioOpenAITestMixin(TestOpenAIOmniServerBase):
+    def prepare_audio_messages(self, prompt, audio_file_name):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "audio_url",
+                        "audio_url": {"url": f"{audio_file_name}"},
+                    },
+                    {
+                        "type": "text",
+                        "text": prompt,
+                    },
+                ],
+            }
+        ]
+
+        return messages
+
+    def get_audio_request_kwargs(self):
+        return self.get_request_kwargs()
+
+    def get_audio_response(self, url: str, prompt, category):
+        audio_file_path = self.get_or_download_file(url)
+        client = openai.Client(api_key="sk-123456", base_url=self.base_url)
+
+        messages = self.prepare_audio_messages(prompt, audio_file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=128,
+            stream=False,
+            **(self.get_audio_request_kwargs()),
+        )
+
+        audio_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"audio {category} response:\n{audio_response}")
+        print("-" * 30)
+
+        audio_response = audio_response.lower()
+
+        self.assertIsNotNone(audio_response)
+        self.assertGreater(len(audio_response), 0)
+
+        return audio_response.lower()
+
+    def test_audio_speech_completion(self):
+        # a fragment of Trump's speech
+        audio_response = self.get_audio_response(
+            AUDIO_TRUMP_SPEECH_URL,
+            "Listen to this audio and write down the audio transcription in English.",
+            category="speech",
+        )
+        check_list = [
+            "thank you",
+            "it's a privilege to be here",
+            "leader",
+            "science",
+            "art",
+        ]
+        for check_word in check_list:
+            assert (
+                check_word in audio_response
+            ), f"audio_response: ｜{audio_response}｜ should contain ｜{check_word}｜"
+
+    def test_audio_ambient_completion(self):
+        # bird song
+        audio_response = self.get_audio_response(
+            AUDIO_BIRD_SONG_URL,
+            "Please listen to the audio snippet carefully and transcribe the content in English.",
+            "ambient",
+        )
+        assert "bird" in audio_response
+
+
+class ImageOpenAITestMixin(TestOpenAIOmniServerBase):
+    def test_single_image_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                        },
+                        {
+                            "type": "text",
+                            "text": "Describe this image in a sentence.",
+                        },
+                    ],
+                },
+            ],
+            temperature=0,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        # `driver` is for gemma-3-it
+        assert (
+            "man" in text or "person" or "driver" in text
+        ), f"text: {text}, should contain man, person or driver"
+        assert (
+            "cab" in text
+            or "taxi" in text
+            or "SUV" in text
+            or "vehicle" in text
+            or "car" in text
+        ), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
+        # MiniCPMO fails to recognize `iron`, but `hanging`
+        assert (
+            "iron" in text
+            or "hang" in text
+            or "cloth" in text
+            or "coat" in text
+            or "holding" in text
+            or "outfit" in text
+        ), f"text: {text}, should contain iron, hang, cloth, coat or holding or outfit"
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def test_multi_turn_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                        },
+                        {
+                            "type": "text",
+                            "text": "Describe this image in a sentence.",
+                        },
+                    ],
+                },
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "There is a man at the back of a yellow cab ironing his clothes.",
+                        }
+                    ],
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Repeat your previous answer."}
+                    ],
+                },
+            ],
+            temperature=0,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        assert (
+            "man" in text or "cab" in text
+        ), f"text: {text}, should contain man or cab"
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def test_multi_images_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                            "modalities": "multi-images",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_SGL_LOGO_URL},
+                            "modalities": "multi-images",
+                        },
+                        {
+                            "type": "text",
+                            "text": "I have two very different images. They are not related at all. "
+                            "Please describe the first image in one sentence, and then describe the second image in another sentence.",
+                        },
+                    ],
+                },
+            ],
+            temperature=0,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        print("-" * 30)
+        print(f"Multi images response:\n{text}")
+        print("-" * 30)
+        assert (
+            "man" in text
+            or "cab" in text
+            or "SUV" in text
+            or "taxi" in text
+            or "car" in text
+        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
+        assert (
+            "logo" in text or '"S"' in text or "SG" in text or "graphic" in text
+        ), f"text: {text}, should contain logo, S or SG or graphic"
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def _test_mixed_image_audio_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                        },
+                        {
+                            "type": "audio_url",
+                            "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL},
+                        },
+                        {
+                            "type": "text",
+                            "text": "Please describe the image in one sentence, and then write down the audio transcription in English.",
+                        },
+                    ],
+                },
+            ],
+            temperature=0,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        assert response.choices[0].message.role == "assistant"
+        text = response.choices[0].message.content
+        assert isinstance(text, str)
+        print("-" * 30)
+        print(f"Mixed image & audio response:\n{text}")
+        print("-" * 30)
+        assert (
+            "man" in text
+            or "cab" in text
+            or "SUV" in text
+            or "taxi" in text
+            or "car" in text
+        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
+        check_list = [
+            "thank you",
+            "it's a privilege to be here",
+            "leader",
+            "science",
+            "art",
+        ]
+        for check_word in check_list:
+            assert (
+                check_word in text
+            ), f"text: ｜{text}｜ should contain ｜{check_word}｜"
+        assert response.id
+        assert response.created
+        assert response.usage.prompt_tokens > 0
+        assert response.usage.completion_tokens > 0
+        assert response.usage.total_tokens > 0
+
+    def prepare_video_images_messages(self, video_path):
+        # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
+        # the size of the video embeds differs from the `modality` argument when preprocessed
+
+        # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
+        # The following import order will cause Segmentation fault.
+        # import decord
+        # from transformers import AutoTokenizer
+        from decord import VideoReader, cpu
+
+        max_frames_num = 10
+        vr = VideoReader(video_path, ctx=cpu(0))
+        total_frame_num = len(vr)
+        uniform_sampled_frames = np.linspace(
+            0, total_frame_num - 1, max_frames_num, dtype=int
+        )
+        frame_idx = uniform_sampled_frames.tolist()
+        frames = vr.get_batch(frame_idx).asnumpy()
+
+        base64_frames = []
+        for frame in frames:
+            pil_img = Image.fromarray(frame)
+            buff = io.BytesIO()
+            pil_img.save(buff, format="JPEG")
+            base64_str = base64.b64encode(buff.getvalue()).decode("utf-8")
+            base64_frames.append(base64_str)
+
+        messages = [{"role": "user", "content": []}]
+        frame_format = {
+            "type": "image_url",
+            "image_url": {"url": "data:image/jpeg;base64,{}"},
+            "modalities": "image",
+        }
+
+        for base64_frame in base64_frames:
+            frame_format["image_url"]["url"] = "data:image/jpeg;base64,{}".format(
+                base64_frame
+            )
+            messages[0]["content"].append(frame_format.copy())
+
+        prompt = {"type": "text", "text": "Please describe the video in detail."}
+        messages[0]["content"].append(prompt)
+
+        return messages
+
+    def test_video_images_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = self.prepare_video_images_messages(file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), f"""
+        ====================== video_response =====================
+        {video_response}
+        ===========================================================
+        should contain 'iPod' or 'device' or 'microphone'
+        """
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+            or "presenter" in video_response
+            or "Steve" in video_response
+            or "hand" in video_response
+        ), f"""
+        ====================== video_response =====================
+        {video_response}
+        ===========================================================
+        should contain 'man' or 'person' or 'individual' or 'speaker' or 'presenter' or 'Steve' or 'hand'
+        """
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        ), f"""
+        ====================== video_response =====================
+        {video_response}
+        ===========================================================
+        should contain 'present' or 'examine' or 'display' or 'hold'
+        """
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+
+class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
+    def prepare_video_messages(self, video_path):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
+                    },
+                    {"type": "text", "text": "Please describe the video in detail."},
+                ],
+            },
+        ]
+        return messages
+
+    def test_video_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = self.prepare_video_messages(file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+            **(self.get_vision_request_kwargs()),
+        )
+
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+            or "presenter" in video_response
+            or "hand" in video_response
+        ), f"video_response: {video_response}, should either have 'man' in video_response, or 'person' in video_response, or 'individual' in video_response or 'speaker' in video_response or 'presenter' or 'hand' in video_response"
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        ), f"video_response: {video_response}, should contain 'present', 'examine', 'display', or 'hold'"
+        assert (
+            "black" in video_response or "dark" in video_response
+        ), f"video_response: {video_response}, should contain 'black' or 'dark'"
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
diff --git a/test/srt/test_vllm_dependency.py b/test/srt/test_vllm_dependency.py
new file mode 100644
index 000000000..b4451f369
--- /dev/null
+++ b/test/srt/test_vllm_dependency.py
@@ -0,0 +1,164 @@
+import json
+import os
+import unittest
+import warnings
+from datetime import datetime
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+MODEL_SCORE_THRESHOLDS = {
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.825,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.825,
+    "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4": 0.62,
+}
+
+
+def parse_models(model_string):
+    return [model.strip() for model in model_string.split(",") if model.strip()]
+
+
+def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2):
+    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
+    if is_fp8:
+        if "Llama-3" in model or "gemma-2" in model:
+            other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
+        elif "Qwen2-72B-Instruct-FP8" in model:
+            other_args.extend(["--quantization", "fp8"])
+        elif "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" in model:
+            other_args.extend([])
+        else:
+            other_args.extend(["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"])
+    if is_tp2:
+        other_args.extend(["--tp", "2"])
+    if "DeepSeek" in model:
+        other_args.extend(["--mem-frac", "0.85"])
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+    return process
+
+
+def write_results_to_json(model, metrics, mode="a"):
+    result = {
+        "timestamp": datetime.now().isoformat(),
+        "model": model,
+        "metrics": metrics,
+        "score": metrics["score"],
+    }
+
+    existing_results = []
+    if mode == "a" and os.path.exists("results.json"):
+        try:
+            with open("results.json", "r") as f:
+                existing_results = json.load(f)
+        except json.JSONDecodeError:
+            existing_results = []
+
+    if isinstance(existing_results, list):
+        existing_results.append(result)
+    else:
+        existing_results = [result]
+
+    with open("results.json", "w") as f:
+        json.dump(existing_results, f, indent=2)
+
+
+def check_model_scores(results):
+    failed_models = []
+    summary = " | model | score | threshold |\n"
+    summary += "| ----- | ----- | --------- |\n"
+
+    for model, score in results:
+        threshold = MODEL_SCORE_THRESHOLDS.get(model)
+        if threshold is None:
+            print(f"Warning: No threshold defined for model {model}")
+            continue
+
+        if score < threshold:
+            failed_models.append(
+                f"\nScore Check Failed: {model}\n"
+                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
+            )
+
+        line = f"| {model} | {score} | {threshold} |\n"
+        summary += line
+
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(
+            f"### TestNightlyGsm8KEval for vLLM awq, gptq, gguf\n{summary}"
+        )
+
+    if failed_models:
+        raise AssertionError("\n".join(failed_models))
+
+
+class TestNightlyGsm8KEval(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_groups = [
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1), False, False),
+        ]
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+    def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
+        is_first = True
+        all_results = []
+
+        for model_group, is_fp8, is_tp2 in self.model_groups:
+            for model in model_group:
+                with self.subTest(model=model):
+                    process = popen_launch_server_wrapper(
+                        self.base_url, model, is_fp8, is_tp2
+                    )
+
+                    args = SimpleNamespace(
+                        base_url=self.base_url,
+                        model=model,
+                        eval_name="mgsm_en",
+                        num_examples=None,
+                        num_threads=1024,
+                    )
+
+                    metrics = run_eval(args)
+                    print(
+                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                    )
+
+                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    is_first = False
+
+                    all_results.append((model, metrics["score"]))
+                    kill_process_tree(process.pid)
+
+        try:
+            with open("results.json", "r") as f:
+                print("\nFinal Results from results.json:")
+                print(json.dumps(json.load(f), indent=2))
+        except Exception as e:
+            print(f"Error reading results.json: {e}")
+
+        # Check all scores after collecting all results
+        check_model_scores(all_results)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_vlm_accuracy.py b/test/srt/test_vlm_accuracy.py
new file mode 100644
index 000000000..ef9a2ad51
--- /dev/null
+++ b/test/srt/test_vlm_accuracy.py
@@ -0,0 +1,324 @@
+"""
+"""
+
+import unittest
+from io import BytesIO
+from typing import List, Optional
+
+import numpy as np
+import requests
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from transformers import AutoModel, AutoProcessor, AutoTokenizer
+
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
+from sglang.srt.managers.mm_utils import embed_mm_inputs, init_embedding_cache
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.server_args import ServerArgs
+
+
+# Test the logits output between HF and SGLang
+class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.image_url = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.model_path = ""
+        cls.chat_template = ""
+        cls.processor = ""
+        response = requests.get(cls.image_url)
+        cls.main_image = Image.open(BytesIO(response.content))
+
+    def compare_outputs(self, sglang_output: torch.Tensor, hf_output: torch.Tensor):
+        # Convert to float32 for numerical stability if needed
+        hf = hf_output.float()
+        sg = sglang_output.float()
+
+        # Basic shape and dtype comparison
+        print("\n=== Basic Properties ===")
+        print(f"Shapes match: {hf.shape == sg.shape}")
+        print(f"HF shape: {hf.shape}, SGLang shape: {sg.shape}")
+        print(f"HF dtype: {hf.dtype}, SGLang dtype: {sg.dtype}")
+
+        # Move tensors to CPU for numpy operations
+        hf_np = hf.cpu().numpy()
+        sg_np = sg.cpu().numpy()
+
+        # Statistical metrics
+        print("\n=== Statistical Metrics ===")
+        print(f"Mean absolute difference: {torch.mean(torch.abs(hf - sg)).item():.6f}")
+        print(f"Max absolute difference: {torch.max(torch.abs(hf - sg)).item():.6f}")
+        print(f"Mean squared error: {torch.mean((hf - sg) ** 2).item():.6f}")
+        print(
+            f"Root mean squared error: {torch.sqrt(torch.mean((hf - sg) ** 2)).item():.6f}"
+        )
+
+        # Cosine similarity (across feature dimension)
+        cos_sim = F.cosine_similarity(hf, sg)
+        print(f"Mean cosine similarity: {torch.mean(cos_sim).item():.6f}")
+        print(f"Min cosine similarity: {torch.min(cos_sim).item():.6f}")
+
+        # Find largest absolute differences
+        print("\n=== Largest Absolute Differences ===")
+        diffs = torch.abs(hf - sg)
+        flat_diffs = diffs.flatten()
+
+        # Get indices of top 10 differences
+        top_k = 10
+        top_values, top_flat_indices = torch.topk(flat_diffs, top_k)
+
+        # Convert flat indices to multidimensional indices
+        top_indices = np.unravel_index(top_flat_indices.cpu().numpy(), diffs.shape)
+
+        print(f"\nTop {top_k} largest absolute differences:")
+        print(
+            "Index".ljust(30)
+            + "Difference".ljust(15)
+            + "HF Value".ljust(15)
+            + "SGLang Value"
+        )
+        print("-" * 75)
+
+        for i in range(top_k):
+            # Get the index tuple for this difference
+            idx = tuple(dim[i] for dim in top_indices)
+        diff_val = top_values[i].item()
+        hf_val = hf[idx].item()
+        sg_val = sg[idx].item()
+
+        # Format the index tuple and values
+        idx_str = str(idx)
+        print(f"{idx_str:<30}{diff_val:<15.6f}{hf_val:<15.6f}{sg_val:.6f}")
+
+        np.testing.assert_allclose(hf_np, sg_np)
+
+    def get_completion_request(self) -> ChatCompletionRequest:
+        json_str = f"""
+        {{
+  "model": "{self.model_path}",
+  "messages": [
+    {{
+      "role": "user",
+      "content": [
+        {{
+          "type": "image_url",
+          "image_url": {{
+            "url": "{self.image_url}"
+          }}
+        }},
+        {{
+          "type": "text",
+          "text": "What's in this picture?"
+        }}
+      ]
+    }}
+  ]
+}}
+        """
+
+        return ChatCompletionRequest.model_validate_json(json_str)
+
+    def get_processor_output(self, req: Optional[ChatCompletionRequest] = None):
+        if req is None:
+            req = self.get_completion_request()
+        conv = generate_chat_conv(req, template_name=self.chat_template)
+        text = conv.get_prompt()
+
+        # Process inputs using processor
+        # FIXME: the formal arguments may differ
+        inputs = self.processor(
+            text=[text],
+            images=[self.main_image],
+            return_tensors="pt",
+        ).to(self.device)
+
+        return inputs
+
+    def get_sglang_model(self):
+        self.model_runner = ModelRunner(
+            model_config=ModelConfig(self.model_path, model_override_args="{}"),
+            mem_fraction_static=0.8,
+            gpu_id=0,
+            tp_rank=0,
+            tp_size=1,
+            pp_rank=0,
+            pp_size=1,
+            nccl_port=12435,
+            server_args=ServerArgs(
+                model_path=self.model_path,
+                disable_cuda_graph=True,
+            ),
+        )
+        return self.model_runner.model
+
+
+class TestMiniCPMV2_6Logits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-2_6"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            (hf_output, _) = self.hf_model.get_vllm_embedding(
+                model_inputs,
+            )
+            hf_output = hf_output.squeeze(0)
+
+            # sglang
+            model = self.get_sglang_model()
+            input_ids = inputs["input_ids"].to(self.device).flatten()
+
+            pixel_values = inputs["pixel_values"]
+            tgt_sizes = inputs["tgt_sizes"]
+            pixel_values_flat: List[torch.Tensor] = []
+            tgt_sizes_flat: List[torch.Tensor] = []
+            for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+                # per image
+                if len(pixel_b) != len(tgt_b):
+                    raise ValueError(
+                        "Inconsistent N lengths, found: "
+                        f"{len(pixel_b)} vs {len(tgt_b)}"
+                    )
+                for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                    pixel_values_flat += [pixel_n]
+                    tgt_sizes_flat += [tgt_n]
+
+            im_start_id, im_end_id = (
+                self.tokenizer.im_start_id,
+                self.tokenizer.im_end_id,
+            )
+            slice_start_id, slice_end_id = (
+                self.tokenizer.slice_start_id,
+                self.tokenizer.slice_end_id,
+            )
+
+            image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            )
+            slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            )
+            image_offsets.extend(slice_offsets)
+            image_offsets = sorted(image_offsets)
+
+            sglang_output = embed_mm_inputs(
+                mm_inputs_list=[
+                    MultimodalInputs(
+                        mm_items=[
+                            MultimodalDataItem(
+                                feature=pixel_values_flat,
+                                offsets=image_offsets,
+                                tgt_size=tgt_sizes_flat,
+                                modality=Modality.IMAGE,
+                                pad_value=self.processor.tokenizer.unk_token_id,
+                            )
+                        ]
+                    ),
+                ],
+                extend_prefix_lens=[0],
+                extend_seq_lens=[input_ids.shape[0]],
+                input_ids=input_ids,
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+        self.compare_outputs(sglang_output, hf_output)
+
+
+class TestMiniCPMV4Logits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-4"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            hf_output = self.hf_model.get_input_embeddings()(inputs.input_ids)
+
+            # sglang
+            model = self.get_model()
+            sglang_output = self.vlm_func(
+                model,
+                input_ids=inputs.input_ids.to(self.device),
+                pixel_values=inputs.pixel_values,
+                image_bound=inputs.image_bound.to(self.device),
+                tgt_sizes=inputs.tgt_sizes.to(self.device),
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+        self.compare_outputs(sglang_output, hf_output)
diff --git a/test/srt/test_vlm_input_format.py b/test/srt/test_vlm_input_format.py
new file mode 100644
index 000000000..261700da5
--- /dev/null
+++ b/test/srt/test_vlm_input_format.py
@@ -0,0 +1,258 @@
+import json
+import unittest
+from io import BytesIO
+from typing import Optional
+
+import requests
+import torch
+from PIL import Image
+from transformers import (
+    AutoModel,
+    AutoProcessor,
+    Gemma3ForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+)
+
+from sglang import Engine
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
+from sglang.srt.parser.conversation import generate_chat_conv
+
+TEST_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
+
+
+class VLMInputTestBase:
+    model_path = None
+    chat_template = None
+    processor = None
+    visual = None  # Should be a callable for precomputed embeddings
+
+    @classmethod
+    def setUpClass(cls):
+        assert cls.model_path is not None, "Set model_path in subclass"
+        assert cls.chat_template is not None, "Set chat_template in subclass"
+        cls.image_url = TEST_IMAGE_URL
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        response = requests.get(cls.image_url)
+        cls.main_image = Image.open(BytesIO(response.content))
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True, use_fast=True
+        )
+        cls._init_visual()
+
+    @classmethod
+    def _init_visual(cls):
+        """Override in subclass to set up cls.visual as a callable for precomputed embeddings."""
+        raise NotImplementedError
+
+    def setUp(self):
+        self.engine = Engine(
+            model_path=self.model_path,
+            chat_template=self.chat_template,
+            device=self.device.type,
+            mem_fraction_static=0.8,
+            enable_multimodal=True,
+            disable_cuda_graph=True,
+            trust_remote_code=True,
+        )
+
+    def tearDown(self):
+        self.engine.shutdown()
+
+    def verify_response(self, output):
+        out_text = output["text"].lower()
+        assert "taxi" in out_text or "cab" in out_text or "car" in out_text, out_text
+
+    def get_completion_request(self) -> ChatCompletionRequest:
+        json_structure = {
+            "model": self.model_path,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": self.image_url}},
+                        {"type": "text", "text": "What's in this picture?"},
+                    ],
+                }
+            ],
+        }
+        json_str = json.dumps(json_structure)
+        return ChatCompletionRequest.model_validate_json(json_str)
+
+    def get_processor_output(self, req: Optional[ChatCompletionRequest] = None):
+        if req is None:
+            req = self.get_completion_request()
+        conv = generate_chat_conv(req, template_name=self.chat_template)
+        text = conv.get_prompt()
+
+        # Process inputs using processor
+        inputs = self.processor(
+            text=[text],
+            images=[self.main_image],
+            return_tensors="pt",
+        ).to(self.device)
+
+        return inputs
+
+    async def test_understands_image(self):
+        req = self.get_completion_request()
+        conv = generate_chat_conv(req, template_name=self.chat_template)
+        text = conv.get_prompt()
+        output = await self.engine.async_generate(
+            prompt=text,
+            image_data=[self.main_image],
+            sampling_params=dict(temperature=0.0),
+        )
+        self.verify_response(output)
+
+    async def test_understands_precomputed_embeddings(self):
+        req = self.get_completion_request()
+        processor_output = self.get_processor_output(req=req)
+        with torch.inference_mode():
+            precomputed_embeddings = self.__class__.visual(processor_output)
+        output = await self.engine.async_generate(
+            input_ids=processor_output["input_ids"][0].detach().cpu().tolist(),
+            image_data=[
+                self._precomputed_image_data(processor_output, precomputed_embeddings)
+            ],
+            sampling_params=dict(temperature=0.0),
+        )
+        self.verify_response(output)
+
+    async def test_understands_pixel_values(self):
+        req = self.get_completion_request()
+        processor_output = self.get_processor_output(req=req)
+        output = await self.engine.async_generate(
+            input_ids=processor_output["input_ids"][0].detach().cpu().tolist(),
+            image_data=[self._pixel_values_image_data(processor_output)],
+            sampling_params=dict(temperature=0.0),
+        )
+        self.verify_response(output)
+
+    def _precomputed_image_data(self, processor_output, precomputed_embeddings):
+        """This should not be overridden."""
+        return dict(
+            modality="IMAGE",
+            precomputed_embeddings=precomputed_embeddings,
+        )
+
+    def _pixel_values_image_data(self, processor_output):
+        """Override in subclass to pass the correct set of arguments."""
+        raise NotImplementedError
+
+
+class TestQwenVLUnderstandsImage(VLMInputTestBase, unittest.IsolatedAsyncioTestCase):
+    model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+    chat_template = "qwen2-vl"
+
+    @classmethod
+    def _init_visual(cls):
+        cls.visual_model = (
+            Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16
+            )
+            .eval()
+            .visual.to(cls.device)
+        )
+        cls.visual = lambda processor_output: cls.visual_model(
+            processor_output["pixel_values"], processor_output["image_grid_thw"]
+        )
+
+    def _pixel_values_image_data(self, processor_output):
+        return dict(
+            modality="IMAGE",
+            image_grid_thw=processor_output["image_grid_thw"],
+            pixel_values=processor_output["pixel_values"],
+        )
+
+
+class TestGemmaUnderstandsImage(VLMInputTestBase, unittest.IsolatedAsyncioTestCase):
+    model_path = "google/gemma-3-4b-it"
+    chat_template = "gemma-it"
+
+    @classmethod
+    def _init_visual(cls):
+        model = Gemma3ForConditionalGeneration.from_pretrained(
+            cls.model_path, torch_dtype=torch.bfloat16
+        )
+        cls.vision_tower = model.vision_tower.eval().to(cls.device)
+        cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
+        cls.visual = lambda processor_output: cls.mm_projector(
+            cls.vision_tower(
+                pixel_values=processor_output["pixel_values"]
+            ).last_hidden_state
+        )
+
+    def _pixel_values_image_data(self, processor_output):
+        return dict(
+            modality="IMAGE",
+            pixel_values=processor_output["pixel_values"][0],
+        )
+
+
+class TestKimiVLImageUnderstandsImage(
+    VLMInputTestBase, unittest.IsolatedAsyncioTestCase
+):
+    model_path = "moonshotai/Kimi-VL-A3B-Instruct"
+    chat_template = "kimi-vl"
+
+    @classmethod
+    def _init_visual(cls):
+        model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True)
+        cls.vision_tower = model.vision_tower.eval().to(cls.device)
+        cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
+
+        cls.visual = lambda tokenizer_output: cls.mm_projector(
+            cls.vision_tower(
+                pixel_values=tokenizer_output["pixel_values"],
+                grid_hws=tokenizer_output["image_grid_hws"],
+            )
+        )
+
+    def _pixel_values_image_data(self, processor_output):
+        return dict(
+            modality="IMAGE",
+            pixel_values=processor_output["pixel_values"],
+            image_grid_hws=processor_output["image_grid_hws"],
+        )
+
+
+# not for CI: too large
+# class TestLlama4ImageUnderstandsImage(
+#     VLMInputTestBase, unittest.IsolatedAsyncioTestCase
+# ):
+#     model_path = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+#     chat_template = "llama_4_vision"
+
+#     def setUp(self):
+#         self.engine = Engine(
+#             model_path=self.model_path,
+#             trust_remote_code=True,
+#             chat_template=self.chat_template,
+#             enable_multimodal=True,
+#             mem_fraction_static=0.8,
+#             tp_size=4,
+#             attention_backend="fa3",
+#             context_length=65536,
+#         )
+
+#     @classmethod
+#     def _init_visual(cls):
+#         model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True, torch_dtype="auto")
+#         cls.vision_tower = model.vision_model.eval().to(cls.device)
+#         cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
+
+#         cls.visual = lambda tokenizer_output: cls.mm_projector(
+#             cls.vision_tower(
+#                 pixel_values=tokenizer_output["pixel_values"],
+#             ).last_hidden_state.flatten(0, -2)
+#         )
+
+#     def _pixel_values_image_data(self, processor_output):
+#         return dict(
+#             modality="IMAGE",
+#             pixel_values=processor_output["pixel_values"],
+#         )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_wave_attention_backend.py b/test/srt/test_wave_attention_backend.py
new file mode 100644
index 000000000..5feab4595
--- /dev/null
+++ b/test/srt/test_wave_attention_backend.py
@@ -0,0 +1,61 @@
+"""
+Usage:
+python3 -m unittest test_wave_attention_backend.TestWaveAttnBackend.test_mmlu
+"""
+
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_one_batch,
+)
+
+
+class TestWaveAttnBackend(unittest.TestCase):
+    def test_latency(self):
+        _, output_throughput, _ = run_bench_one_batch(
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            [
+                "--attention-backend",
+                "wave",
+                "--enable-torch-compile",
+            ],
+        )
+
+        if is_in_ci():
+            self.assertGreater(output_throughput, 153)
+
+    def _test_mmlu(self):
+        model = DEFAULT_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_TEST
+        process = popen_launch_server(
+            model,
+            base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--attention-backend", "wave"],
+        )
+
+        try:
+            args = SimpleNamespace(
+                base_url=base_url,
+                model=model,
+                eval_name="mmlu",
+                num_examples=64,
+                num_threads=32,
+            )
+
+            metrics = run_eval(args)
+            self.assertGreaterEqual(metrics["score"], 0.65)
+        finally:
+            kill_process_tree(process.pid)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_wave_attention_kernels.py b/test/srt/test_wave_attention_kernels.py
new file mode 100644
index 000000000..d4c2ff8e5
--- /dev/null
+++ b/test/srt/test_wave_attention_kernels.py
@@ -0,0 +1,322 @@
+import random
+import unittest
+
+import torch
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import (
+    decode_attention_fwd_grouped as triton_decode_attention_fwd_grouped,
+)
+from sglang.srt.layers.attention.triton_ops.extend_attention import (
+    extend_attention_fwd,
+    redundant_attention,
+)
+from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+    context_attention_fwd,
+)
+from sglang.srt.layers.attention.wave_ops.decode_attention import (
+    decode_attention_intermediate_arrays_shapes,
+    decode_attention_wave,
+)
+from sglang.srt.layers.attention.wave_ops.extend_attention import extend_attention_wave
+from sglang.srt.layers.attention.wave_ops.prefill_attention import (
+    prefill_attention_wave,
+)
+
+
+class TestWaveAttention(unittest.TestCase):
+
+    def _set_all_seeds(self, seed):
+        """Set all random seeds for reproducibility."""
+        random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+    def setUp(self):
+        # Set seeds before each test method
+        self._set_all_seeds(42)
+
+    def _test_extend_attention_once(self, B, N_CTX, H_Q, H_KV, D):
+        dtype = torch.float16
+        extend_seq_len = 1024
+
+        b_seq_len_prefix = torch.full(
+            (B,), N_CTX // B, dtype=torch.int32, device="cuda"
+        )
+        b_seq_len_extend = torch.full(
+            (B,), extend_seq_len, dtype=torch.int32, device="cuda"
+        )
+        b_seq_len = b_seq_len_prefix + b_seq_len_extend
+        max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
+
+        b_req_idx = torch.arange(B, dtype=torch.int32, device="cuda")
+        b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
+        b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
+        b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
+
+        kv_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        kv_indptr[1 : B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
+        kv_indices = torch.zeros(
+            (b_seq_len_prefix.sum().item(),), dtype=torch.int32, device="cuda"
+        )
+
+        for i in range(B):
+            kv_indices[kv_indptr[i] : kv_indptr[i + 1]] = torch.arange(
+                b_start_loc[i], b_start_loc[i] + b_seq_len_prefix[i]
+            )
+
+        total_token_num = torch.sum(b_seq_len).item()
+        extend_token_num = torch.sum(b_seq_len_extend).item()
+        k_buffer = torch.empty(
+            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+        ).normal_(mean=0.1, std=0.2)
+        v_buffer = torch.empty(
+            (total_token_num, H_KV, D), dtype=dtype, device="cuda"
+        ).normal_(mean=0.1, std=0.2)
+
+        k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+        v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
+        q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        for i in range(B):
+            extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
+            extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
+            extend_start = b_start_loc_extend[i]
+            extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
+            k_extend[extend_start:extend_end] = k_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            v_extend[extend_start:extend_end] = v_buffer[
+                extend_start_in_buffer:extend_end_in_buffer
+            ]
+            q_extend[extend_start:extend_end] = torch.empty(
+                (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
+            ).normal_(mean=0.1, std=0.2)
+
+        o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        o_extend_mask = torch.empty(
+            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+        )
+        o_redundant = torch.empty(
+            (extend_token_num, H_Q, D), dtype=dtype, device="cuda"
+        )
+
+        b_seq_len_extend = b_seq_len - b_seq_len_prefix
+        max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
+        qo_indptr = torch.zeros((B + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr[1 : B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
+
+        custom_mask = None
+        mask_indptr = None
+
+        redundant_attention(
+            q_extend,
+            o_redundant,
+            k_buffer,
+            v_buffer,
+            b_req_idx,
+            b_start_loc,
+            b_seq_len,
+            b_seq_len_prefix,
+            max_len_in_batch,
+        )
+
+        is_causal = True
+
+        o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        extend_attention_fwd(
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask,
+            is_causal,
+            mask_indptr,
+            max_len_extend,
+        )
+
+        o_wave = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
+        extend_attention_wave(
+            q_extend,
+            k_extend,
+            v_extend,
+            k_buffer,
+            v_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            custom_mask,
+            mask_indptr,
+            max_len_extend,
+            o_wave,
+            is_causal=is_causal,
+        )
+
+        self.assertTrue(torch.allclose(o_extend, o_redundant, rtol=1e-2))
+        self.assertTrue(torch.allclose(o_wave, o_redundant, rtol=1e-2))
+
+    def test_extend_attention(self):
+
+        # Define the varying parameter values
+        attention_values = [128]
+
+        # Loop through the values and call the method
+        for value in attention_values:
+            self._test_extend_attention_once(32, 16384, 6, 1, value)
+
+    def _test_grouped_decode_attention_once(self, B, S, H_Q, H_KV, D, D_V):
+        dtype = torch.float16
+        seq_len = S  # This represents the number of tokens already in the sequence
+        total_tokens = B * seq_len
+        sm_scale = 1.0 / (D**0.5)
+        max_kv_splits = 8
+        num_kv_splits = torch.full((B,), 4, dtype=torch.int32, device="cuda")
+
+        # q represents the new token being generated, one per batch
+        q = torch.randn(B, H_Q, D, dtype=dtype, device="cuda")
+
+        # k_buffer and v_buffer represent all previous tokens
+        k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda")
+        v_buffer = torch.randn(total_tokens, H_KV, D_V, dtype=dtype, device="cuda")
+
+        # o will have the same shape as q
+        o_triton = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+        o = torch.zeros(B, H_Q, D_V, dtype=dtype, device="cuda")
+
+        req_to_token = torch.arange(total_tokens, device="cuda", dtype=torch.int32)
+        b_req_idx = torch.zeros(B + 1, device="cuda", dtype=torch.int32)
+        b_seq_len = torch.full((B,), seq_len, device="cuda", dtype=torch.int32)
+        b_req_idx[1 : B + 1] = torch.cumsum(b_seq_len, dim=0)
+
+        attn_logits = torch.empty(
+            (B, H_Q, max_kv_splits, D_V + 1),
+            dtype=torch.float32,
+            device="cuda",
+        )
+        attn_lse = torch.empty(
+            (B, H_Q, max_kv_splits),
+            dtype=torch.float32,
+            device="cuda",
+        )
+
+        logit_cap = 0.0
+        triton_decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o_triton,
+            b_req_idx,
+            req_to_token,
+            attn_logits,
+            attn_lse,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap,
+        )
+
+        attn_logits_shape, attn_logits_max_shape = (
+            decode_attention_intermediate_arrays_shapes(B, D_V, H_Q, max_kv_splits)
+        )
+
+        attn_logits = torch.empty(
+            attn_logits_shape,
+            dtype=torch.float32,
+            device="cuda",
+        )
+
+        attn_logits_max = torch.empty(
+            attn_logits_max_shape,
+            dtype=torch.float32,
+            device="cuda",
+        )
+
+        decode_attention_wave(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            b_req_idx,
+            req_to_token,
+            attn_logits,
+            attn_logits_max,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale,
+            logit_cap,
+        )
+
+        cos_sim = torch.nn.functional.cosine_similarity(
+            o.flatten(), o_triton.flatten(), dim=0
+        )
+        print(cos_sim.item())
+        self.assertTrue(cos_sim.item() > 0.99)
+        self.assertTrue(torch.allclose(o, o_triton, atol=3e-2))
+
+    def test_grouped_decode_attention(self):
+        seq_lens = [5, 100, 128, 500]
+        configs = [
+            (2, 16, 16, 64, 64),
+            (2, 16, 1, 64, 64),
+            (2, 128, 1, 80, 80),
+            (32, 128, 2, 512, 512),
+            (2, 128, 2, 512, 512),
+            (2, 128, 1, 576, 512),
+        ]
+
+        for S in seq_lens:
+            for B, H_Q, H_KV, D, D_V in configs:
+                self._test_grouped_decode_attention_once(B, S, H_Q, H_KV, D, D_V)
+
+    def _test_context_attention_once(self, head_dim, is_causal):
+        # Set up a simple test case
+        dtype = torch.float16
+        num_heads = 4
+        kv_heads = 1
+        seq_lens = [128, 256]
+        max_seq_len = max(seq_lens)
+
+        # Create random input tensors
+        q = torch.randn(sum(seq_lens), num_heads, head_dim, dtype=dtype, device="cuda")
+        k = torch.randn(sum(seq_lens), kv_heads, head_dim, dtype=dtype, device="cuda")
+        v = torch.randn(sum(seq_lens), kv_heads, head_dim, dtype=dtype, device="cuda")
+        o_triton = torch.zeros(
+            sum(seq_lens), num_heads, head_dim, dtype=dtype, device="cuda"
+        )
+        o = torch.zeros(sum(seq_lens), num_heads, head_dim, dtype=dtype, device="cuda")
+
+        # Create b_start_loc and b_seq_len tensors
+        b_start_loc = torch.tensor([0, seq_lens[0]], device="cuda")
+        b_seq_len = torch.tensor(seq_lens, device="cuda")
+
+        context_attention_fwd(
+            q, k, v, o_triton, b_start_loc, b_seq_len, max_seq_len, is_causal=is_causal
+        )
+        prefill_attention_wave(
+            q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=is_causal
+        )
+        cos_sim = torch.nn.functional.cosine_similarity(
+            o.flatten(), o_triton.flatten(), dim=0
+        )
+
+        print(cos_sim.item())
+        self.assertTrue(torch.allclose(o, o_triton, atol=3e-2))
+        self.assertTrue(cos_sim.item() > 1 - (1e-5))
+
+    def test_context_attention(self):
+        head_dim = [128, 96]
+
+        for dim in head_dim:
+            for is_causal in [False]:
+                self._test_context_attention_once(dim, is_causal)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_weight_version.py b/test/srt/test_weight_version.py
new file mode 100644
index 000000000..5011ee701
--- /dev/null
+++ b/test/srt/test_weight_version.py
@@ -0,0 +1,227 @@
+"""
+Test weight version functionality.
+
+This test suite verifies the weight_version feature implementation including:
+1. Default weight_version setting
+2. /get_weight_version endpoint
+3. /update_weight_version endpoint
+4. /generate request meta_info contains weight_version
+5. OpenAI API response metadata contains weight_version
+"""
+
+import unittest
+
+import requests
+
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestWeightVersion(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Start server once for all tests with custom weight version."""
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = "http://127.0.0.1:30000"
+        cls.process = popen_launch_server(
+            cls.model,
+            base_url=cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--weight-version",
+                "test_version_1.0",
+                "--attention-backend",
+                "flashinfer",
+            ],
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Terminate server after all tests complete."""
+        if cls.process:
+            cls.process.terminate()
+
+    def test_weight_version_comprehensive(self):
+        """Comprehensive test for all weight_version functionality."""
+
+        response = requests.get(f"{self.base_url}/get_model_info")
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertIn("weight_version", data)
+        self.assertEqual(data["weight_version"], "test_version_1.0")
+
+        response = requests.get(f"{self.base_url}/get_weight_version")
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertIn("weight_version", data)
+        self.assertEqual(data["weight_version"], "test_version_1.0")
+
+        request_data = {
+            "text": "Hello, how are you?",
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": 5,
+            },
+        }
+        response = requests.post(f"{self.base_url}/generate", json=request_data)
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertIn("meta_info", data)
+        self.assertIn("weight_version", data["meta_info"])
+        self.assertEqual(data["meta_info"]["weight_version"], "test_version_1.0")
+
+        request_data = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": "Hello"}],
+            "max_tokens": 5,
+            "temperature": 0.0,
+        }
+        response = requests.post(
+            f"{self.base_url}/v1/chat/completions", json=request_data
+        )
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertIn("metadata", data)
+        self.assertIn("weight_version", data["metadata"])
+        self.assertEqual(data["metadata"]["weight_version"], "test_version_1.0")
+
+        request_data = {
+            "model": self.model,
+            "prompt": "Hello",
+            "max_tokens": 5,
+            "temperature": 0.0,
+        }
+        response = requests.post(f"{self.base_url}/v1/completions", json=request_data)
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertIn("metadata", data)
+        self.assertIn("weight_version", data["metadata"])
+        self.assertEqual(data["metadata"]["weight_version"], "test_version_1.0")
+
+        update_data = {
+            "new_version": "updated_version_2.0",
+            "abort_all_requests": False,
+        }
+        response = requests.post(
+            f"{self.base_url}/update_weight_version", json=update_data
+        )
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertTrue(data["success"])
+        self.assertEqual(data["new_version"], "updated_version_2.0")
+
+        response = requests.get(f"{self.base_url}/get_weight_version")
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertEqual(data["weight_version"], "updated_version_2.0")
+
+        gen_data = {
+            "text": "Test persistence",
+            "sampling_params": {"temperature": 0.0, "max_new_tokens": 3},
+        }
+        response = requests.post(f"{self.base_url}/generate", json=gen_data)
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertEqual(data["meta_info"]["weight_version"], "updated_version_2.0")
+
+        chat_data = {
+            "model": self.model,
+            "messages": [{"role": "user", "content": "Test"}],
+            "max_tokens": 3,
+            "temperature": 0.0,
+        }
+        response = requests.post(f"{self.base_url}/v1/chat/completions", json=chat_data)
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertEqual(data["metadata"]["weight_version"], "updated_version_2.0")
+
+        update_data = {"new_version": "final_version_3.0", "abort_all_requests": True}
+        response = requests.post(
+            f"{self.base_url}/update_weight_version", json=update_data
+        )
+        self.assertEqual(response.status_code, 200)
+        data = response.json()
+        self.assertTrue(data["success"])
+        self.assertEqual(data["new_version"], "final_version_3.0")
+
+        # Check /get_weight_version
+        response = requests.get(f"{self.base_url}/get_weight_version")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.json()["weight_version"], "final_version_3.0")
+
+        # Check /get_model_info
+        response = requests.get(f"{self.base_url}/get_model_info")
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(response.json()["weight_version"], "final_version_3.0")
+
+        # Check /generate meta_info
+        response = requests.post(
+            f"{self.base_url}/generate",
+            json={
+                "text": "Final test",
+                "sampling_params": {"temperature": 0.0, "max_new_tokens": 2},
+            },
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(
+            response.json()["meta_info"]["weight_version"], "final_version_3.0"
+        )
+
+        # Check OpenAI chat metadata
+        response = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            json={
+                "model": self.model,
+                "messages": [{"role": "user", "content": "Final"}],
+                "max_tokens": 2,
+                "temperature": 0.0,
+            },
+        )
+        self.assertEqual(response.status_code, 200)
+        self.assertEqual(
+            response.json()["metadata"]["weight_version"], "final_version_3.0"
+        )
+
+        print("All weight_version functionality tests passed!")
+
+    def test_update_weight_version_with_weight_updates(self):
+        """Test that weight_version can be updated along with weight updates using real model data."""
+        print("Testing weight_version update with real weight operations...")
+
+        # Get current model info for reference
+        model_info_response = requests.get(f"{self.base_url}/get_model_info")
+        self.assertEqual(model_info_response.status_code, 200)
+        current_model_path = model_info_response.json()["model_path"]
+
+        update_data = {
+            "model_path": current_model_path,
+            "load_format": "auto",
+            "abort_all_requests": False,
+            "weight_version": "disk_update_v2.0.0",
+        }
+
+        response = requests.post(
+            f"{self.base_url}/update_weights_from_disk", json=update_data
+        )
+        self.assertEqual(
+            response.status_code,
+            200,
+            f"update_weights_from_disk failed with status {response.status_code}",
+        )
+
+        # Verify version was updated
+        version_response = requests.get(f"{self.base_url}/get_weight_version")
+        self.assertEqual(version_response.status_code, 200)
+        self.assertEqual(
+            version_response.json()["weight_version"], "disk_update_v2.0.0"
+        )
+
+        print("Weight update with weight_version test completed!")
+
+
+if __name__ == "__main__":
+    unittest.main()